diff options
Diffstat (limited to 'tools/testing')
632 files changed, 44627 insertions, 7258 deletions
diff --git a/tools/testing/crypto/chacha20-s390/test-cipher.c b/tools/testing/crypto/chacha20-s390/test-cipher.c index 827507844e8f..9f61454ed077 100644 --- a/tools/testing/crypto/chacha20-s390/test-cipher.c +++ b/tools/testing/crypto/chacha20-s390/test-cipher.c @@ -11,7 +11,6 @@ #include <crypto/akcipher.h> #include <crypto/acompress.h> #include <crypto/rng.h> -#include <crypto/drbg.h> #include <crypto/kpp.h> #include <crypto/internal/simd.h> #include <crypto/chacha.h> diff --git a/tools/testing/cxl/test/cxl.c b/tools/testing/cxl/test/cxl.c index 418669927fb0..ef92dd35e030 100644 --- a/tools/testing/cxl/test/cxl.c +++ b/tools/testing/cxl/test/cxl.c @@ -318,7 +318,7 @@ static struct { .restrictions = ACPI_CEDT_CFMWS_RESTRICT_HOSTONLYMEM | ACPI_CEDT_CFMWS_RESTRICT_VOLATILE, .qtg_id = FAKE_QTG_ID, - .window_size = SZ_256M, + .window_size = SZ_256M > PMD_SIZE ? SZ_256M : PMD_SIZE, }, .target = { 3 }, }, @@ -433,12 +433,16 @@ static void depopulate_all_mock_resources(void) static struct cxl_mock_res *alloc_mock_res(resource_size_t size, int align) { - struct cxl_mock_res *res = kzalloc(sizeof(*res), GFP_KERNEL); struct genpool_data_align data = { .align = align, }; unsigned long phys; + struct cxl_mock_res *res __free(kfree) = kzalloc(sizeof(*res), + GFP_KERNEL); + if (!res) + return NULL; + INIT_LIST_HEAD(&res->list); phys = gen_pool_alloc_algo(cxl_mock_pool, size, gen_pool_first_fit_align, &data); @@ -453,7 +457,7 @@ static struct cxl_mock_res *alloc_mock_res(resource_size_t size, int align) list_add(&res->list, &mock_res); mutex_unlock(&mock_res_lock); - return res; + return no_free_ptr(res); } /* Only update CFMWS0 as this is used by the auto region. 
*/ @@ -495,9 +499,12 @@ static int populate_cedt(void) for (i = cfmws_start; i <= cfmws_end; i++) { struct acpi_cedt_cfmws *window = mock_cfmws[i]; + int align = SZ_256M; cfmws_elc_update(window, i); - res = alloc_mock_res(window->window_size, SZ_256M); + if (window->restrictions & ACPI_CEDT_CFMWS_RESTRICT_VOLATILE) + align = max_t(int, SZ_256M, PMD_SIZE); + res = alloc_mock_res(window->window_size, align); if (!res) return -ENOMEM; window->base_hpa = res->range.start; @@ -1181,15 +1188,11 @@ static bool mock_init_hdm_decoder(struct cxl_decoder *cxld) cxlsd = to_cxl_switch_decoder(dev); if (i == 0) { /* put cxl_mem.4 second in the decode order */ - if (pdev->id == 4) { - cxlsd->target[1] = dport; + if (pdev->id == 4) cxlsd->cxld.target_map[1] = dport->port_id; - } else { - cxlsd->target[0] = dport; + else cxlsd->cxld.target_map[0] = dport->port_id; - } } else { - cxlsd->target[0] = dport; cxlsd->cxld.target_map[0] = dport->port_id; } cxld = &cxlsd->cxld; @@ -1212,6 +1215,16 @@ static bool mock_init_hdm_decoder(struct cxl_decoder *cxld) cxld->commit = mock_decoder_commit; cxld->reset = mock_decoder_reset; + /* + * Only target_map[] is programmed above, mimicking + * firmware. On real hardware target[] is populated as + * dports enumerate, via update_decoder_targets(). The + * mock's dports are already bound by now, so fire that + * resolution explicitly here rather than stamping + * target[] directly. 
+ */ + cxl_port_update_decoder_targets(iter, dport); + cxld_registry_update(cxld); put_device(dev); } @@ -1523,6 +1536,23 @@ static void mock_companion(struct acpi_device *adev, struct device *dev) #define SZ_64G (SZ_32G * 2) #endif +static int cxl_mock_platform_device_add(struct platform_device *pdev, + struct platform_device **ppdev) +{ + int rc; + + if (ppdev) + *ppdev = pdev; + rc = platform_device_add(pdev); + if (rc) { + platform_device_put(pdev); + if (ppdev) + *ppdev = NULL; + } + + return rc; +} + static __init int cxl_rch_topo_init(void) { int rc, i; @@ -1537,13 +1567,10 @@ static __init int cxl_rch_topo_init(void) goto err_bridge; mock_companion(adev, &pdev->dev); - rc = platform_device_add(pdev); - if (rc) { - platform_device_put(pdev); + rc = cxl_mock_platform_device_add(pdev, &cxl_rch[i]); + if (rc) goto err_bridge; - } - cxl_rch[i] = pdev; mock_pci_bus[idx].bridge = &pdev->dev; rc = sysfs_create_link(&pdev->dev.kobj, &pdev->dev.kobj, "firmware_node"); @@ -1595,13 +1622,10 @@ static __init int cxl_single_topo_init(void) goto err_bridge; mock_companion(adev, &pdev->dev); - rc = platform_device_add(pdev); - if (rc) { - platform_device_put(pdev); + rc = cxl_mock_platform_device_add(pdev, &cxl_hb_single[i]); + if (rc) goto err_bridge; - } - cxl_hb_single[i] = pdev; mock_pci_bus[i + NR_CXL_HOST_BRIDGES].bridge = &pdev->dev; rc = sysfs_create_link(&pdev->dev.kobj, &pdev->dev.kobj, "physical_node"); @@ -1620,12 +1644,9 @@ static __init int cxl_single_topo_init(void) goto err_port; pdev->dev.parent = &bridge->dev; - rc = platform_device_add(pdev); - if (rc) { - platform_device_put(pdev); + rc = cxl_mock_platform_device_add(pdev, &cxl_root_single[i]); + if (rc) goto err_port; - } - cxl_root_single[i] = pdev; } for (i = 0; i < ARRAY_SIZE(cxl_swu_single); i++) { @@ -1638,12 +1659,9 @@ static __init int cxl_single_topo_init(void) goto err_uport; pdev->dev.parent = &root_port->dev; - rc = platform_device_add(pdev); - if (rc) { - platform_device_put(pdev); + rc = 
cxl_mock_platform_device_add(pdev, &cxl_swu_single[i]); + if (rc) goto err_uport; - } - cxl_swu_single[i] = pdev; } for (i = 0; i < ARRAY_SIZE(cxl_swd_single); i++) { @@ -1657,12 +1675,9 @@ static __init int cxl_single_topo_init(void) goto err_dport; pdev->dev.parent = &uport->dev; - rc = platform_device_add(pdev); - if (rc) { - platform_device_put(pdev); + rc = cxl_mock_platform_device_add(pdev, &cxl_swd_single[i]); + if (rc) goto err_dport; - } - cxl_swd_single[i] = pdev; } return 0; @@ -1735,12 +1750,9 @@ static int cxl_mem_init(void) pdev->dev.parent = &dport->dev; set_dev_node(&pdev->dev, i % 2); - rc = platform_device_add(pdev); - if (rc) { - platform_device_put(pdev); + rc = cxl_mock_platform_device_add(pdev, &cxl_mem[i]); + if (rc) goto err_mem; - } - cxl_mem[i] = pdev; } for (i = 0; i < ARRAY_SIZE(cxl_mem_single); i++) { @@ -1753,12 +1765,9 @@ static int cxl_mem_init(void) pdev->dev.parent = &dport->dev; set_dev_node(&pdev->dev, i % 2); - rc = platform_device_add(pdev); - if (rc) { - platform_device_put(pdev); + rc = cxl_mock_platform_device_add(pdev, &cxl_mem_single[i]); + if (rc) goto err_single; - } - cxl_mem_single[i] = pdev; } for (i = 0; i < ARRAY_SIZE(cxl_rcd); i++) { @@ -1772,12 +1781,9 @@ static int cxl_mem_init(void) pdev->dev.parent = &rch->dev; set_dev_node(&pdev->dev, i % 2); - rc = platform_device_add(pdev); - if (rc) { - platform_device_put(pdev); + rc = cxl_mock_platform_device_add(pdev, &cxl_rcd[i]); + if (rc) goto err_rcd; - } - cxl_rcd[i] = pdev; } return 0; @@ -1826,6 +1832,12 @@ static __init int cxl_test_init(void) int rc, i; struct range mappable; + if (!IS_ALIGNED(mock_auto_region_size, PMD_SIZE)) { + pr_err_once("mock_auto_region_size %d must be PMD-aligned\n", + mock_auto_region_size); + return -EINVAL; + } + cxl_acpi_test(); cxl_core_test(); cxl_mem_test(); @@ -1869,13 +1881,10 @@ static __init int cxl_test_init(void) goto err_bridge; mock_companion(adev, &pdev->dev); - rc = platform_device_add(pdev); - if (rc) { - 
platform_device_put(pdev); + rc = cxl_mock_platform_device_add(pdev, &cxl_host_bridge[i]); + if (rc) goto err_bridge; - } - cxl_host_bridge[i] = pdev; mock_pci_bus[i].bridge = &pdev->dev; rc = sysfs_create_link(&pdev->dev.kobj, &pdev->dev.kobj, "physical_node"); @@ -1893,12 +1902,9 @@ static __init int cxl_test_init(void) goto err_port; pdev->dev.parent = &bridge->dev; - rc = platform_device_add(pdev); - if (rc) { - platform_device_put(pdev); + rc = cxl_mock_platform_device_add(pdev, &cxl_root_port[i]); + if (rc) goto err_port; - } - cxl_root_port[i] = pdev; } BUILD_BUG_ON(ARRAY_SIZE(cxl_switch_uport) != ARRAY_SIZE(cxl_root_port)); @@ -1911,12 +1917,9 @@ static __init int cxl_test_init(void) goto err_uport; pdev->dev.parent = &root_port->dev; - rc = platform_device_add(pdev); - if (rc) { - platform_device_put(pdev); + rc = cxl_mock_platform_device_add(pdev, &cxl_switch_uport[i]); + if (rc) goto err_uport; - } - cxl_switch_uport[i] = pdev; } for (i = 0; i < ARRAY_SIZE(cxl_switch_dport); i++) { @@ -1929,12 +1932,9 @@ static __init int cxl_test_init(void) goto err_dport; pdev->dev.parent = &uport->dev; - rc = platform_device_add(pdev); - if (rc) { - platform_device_put(pdev); + rc = cxl_mock_platform_device_add(pdev, &cxl_switch_dport[i]); + if (rc) goto err_dport; - } - cxl_switch_dport[i] = pdev; } rc = cxl_single_topo_init(); @@ -1953,9 +1953,9 @@ static __init int cxl_test_init(void) acpi0017_mock.dev.bus = &platform_bus_type; cxl_acpi->dev.groups = cxl_acpi_groups; - rc = platform_device_add(cxl_acpi); + rc = cxl_mock_platform_device_add(cxl_acpi, NULL); if (rc) - goto err_root; + goto err_rch; rc = cxl_mem_init(); if (rc) @@ -1970,7 +1970,7 @@ static __init int cxl_test_init(void) err_mem: cxl_mem_exit(); err_root: - platform_device_put(cxl_acpi); + platform_device_unregister(cxl_acpi); err_rch: cxl_rch_topo_exit(); err_single: diff --git a/tools/testing/cxl/test/cxl_translate.c b/tools/testing/cxl/test/cxl_translate.c index 16328b2112b2..25a27e01ac21 100644 --- 
a/tools/testing/cxl/test/cxl_translate.c +++ b/tools/testing/cxl/test/cxl_translate.c @@ -236,8 +236,8 @@ static int setup_xor_mapping(void) if (!cximsd) return -ENOMEM; - memcpy(cximsd->xormaps, xormaps, nr_maps * sizeof(*cximsd->xormaps)); cximsd->nr_maps = nr_maps; + memcpy(cximsd->xormaps, xormaps, nr_maps * sizeof(*cximsd->xormaps)); return 0; } diff --git a/tools/testing/cxl/test/mem.c b/tools/testing/cxl/test/mem.c index 271c7ad8cc32..a7da279aa3ef 100644 --- a/tools/testing/cxl/test/mem.c +++ b/tools/testing/cxl/test/mem.c @@ -2,7 +2,6 @@ // Copyright(c) 2021 Intel Corporation. All rights reserved. #include <linux/platform_device.h> -#include <linux/mod_devicetable.h> #include <linux/vmalloc.h> #include <linux/module.h> #include <linux/delay.h> @@ -312,12 +311,17 @@ static int mock_get_event(struct device *dev, struct cxl_mbox_cmd *cmd) static int mock_clear_event(struct device *dev, struct cxl_mbox_cmd *cmd) { - struct cxl_mbox_clear_event_payload *pl = cmd->payload_in; + struct cxl_mbox_clear_event_payload *pl; struct mock_event_log *log; - u8 log_type = pl->event_log; + u8 log_type; u16 handle; int nr; + if (cmd->size_in < sizeof(*pl)) + return -EINVAL; + + pl = cmd->payload_in; + log_type = pl->event_log; if (log_type >= CXL_EVENT_TYPE_MAX) return -EINVAL; @@ -574,14 +578,19 @@ static int mock_gsl(struct cxl_mbox_cmd *cmd) static int mock_get_log(struct cxl_memdev_state *mds, struct cxl_mbox_cmd *cmd) { struct cxl_mailbox *cxl_mbox = &mds->cxlds.cxl_mbox; - struct cxl_mbox_get_log *gl = cmd->payload_in; - u32 offset = le32_to_cpu(gl->offset); - u32 length = le32_to_cpu(gl->length); uuid_t uuid = DEFINE_CXL_CEL_UUID; + struct cxl_mbox_get_log *gl; void *data = &mock_cel; + u32 offset; + u32 length; if (cmd->size_in < sizeof(*gl)) return -EINVAL; + + gl = cmd->payload_in; + offset = le32_to_cpu(gl->offset); + length = le32_to_cpu(gl->length); + if (length > cxl_mbox->payload_size) return -EINVAL; if (offset + length > sizeof(mock_cel)) @@ -1053,7 +1062,7 
@@ static int mock_get_lsa(struct cxl_mockmem_data *mdata, return -EINVAL; offset = le32_to_cpu(get_lsa->offset); length = le32_to_cpu(get_lsa->length); - if (offset + length > LSA_SIZE) + if (offset > LSA_SIZE || length > LSA_SIZE - offset) return -EINVAL; if (length > cmd->size_out) return -EINVAL; @@ -1073,7 +1082,7 @@ static int mock_set_lsa(struct cxl_mockmem_data *mdata, return -EINVAL; offset = le32_to_cpu(set_lsa->offset); length = cmd->size_in - sizeof(*set_lsa); - if (offset + length > LSA_SIZE) + if (offset > LSA_SIZE || length > LSA_SIZE - offset) return -EINVAL; memcpy(lsa + offset, &set_lsa->data[0], length); @@ -1336,10 +1345,14 @@ static int mock_fw_info(struct cxl_mockmem_data *mdata, static int mock_transfer_fw(struct cxl_mockmem_data *mdata, struct cxl_mbox_cmd *cmd) { - struct cxl_mbox_transfer_fw *transfer = cmd->payload_in; + struct cxl_mbox_transfer_fw *transfer; void *fw = mdata->fw; size_t offset, length; + if (cmd->size_in < sizeof(*transfer)) + return -EINVAL; + + transfer = cmd->payload_in; offset = le32_to_cpu(transfer->offset) * CXL_FW_TRANSFER_ALIGNMENT; length = cmd->size_in - sizeof(*transfer); if (offset + length > FW_SIZE) @@ -1415,11 +1428,18 @@ static int mock_get_test_feature(struct cxl_mockmem_data *mdata, struct cxl_mbox_cmd *cmd) { struct vendor_test_feat *output = cmd->payload_out; - struct cxl_mbox_get_feat_in *input = cmd->payload_in; - u16 offset = le16_to_cpu(input->offset); - u16 count = le16_to_cpu(input->count); + struct cxl_mbox_get_feat_in *input; + u16 offset; + u16 count; u8 *ptr; + if (cmd->size_in < sizeof(*input)) + return -EINVAL; + + input = cmd->payload_in; + offset = le16_to_cpu(input->offset); + count = le16_to_cpu(input->count); + if (offset > sizeof(*output)) { cmd->return_code = CXL_MBOX_CMD_RC_INPUT; return -EINVAL; @@ -1703,7 +1723,7 @@ static int cxl_mock_mem_probe(struct platform_device *pdev) return -ENOMEM; dev_set_drvdata(dev, mdata); - mdata->lsa = vmalloc(LSA_SIZE); + mdata->lsa = 
vzalloc(LSA_SIZE); if (!mdata->lsa) return -ENOMEM; mdata->fw = vmalloc(FW_SIZE); @@ -1769,7 +1789,7 @@ static int cxl_mock_mem_probe(struct platform_device *pdev) cxl_mock_add_event_logs(&mdata->mes); - cxlmd = devm_cxl_add_memdev(cxlds, NULL); + cxlmd = devm_cxl_add_classdev(cxlds); if (IS_ERR(cxlmd)) return PTR_ERR(cxlmd); diff --git a/tools/testing/kunit/kunit.py b/tools/testing/kunit/kunit.py index 742f5c555666..ac3f7159e67f 100755 --- a/tools/testing/kunit/kunit.py +++ b/tools/testing/kunit/kunit.py @@ -21,6 +21,7 @@ from enum import Enum, auto from typing import Iterable, List, Optional, Sequence, Tuple import kunit_json +import kunit_junit import kunit_kernel import kunit_parser from kunit_printer import stdout, null_printer @@ -49,6 +50,7 @@ class KunitBuildRequest(KunitConfigRequest): class KunitParseRequest: raw_output: Optional[str] json: Optional[str] + junit: Optional[str] summary: bool failed: bool @@ -268,6 +270,13 @@ def parse_tests(request: KunitParseRequest, metadata: kunit_json.Metadata, input stdout.print_with_timestamp("Test results stored in %s" % os.path.abspath(request.json)) + if request.junit: + if request.junit == 'stdout': + kunit_junit.print_junit_result(test=test) + else: + kunit_junit.write_junit_result(test=test,filename=request.junit) + stdout.print_with_timestamp(f"Test results stored in {os.path.abspath(request.junit)}") + if test.status != kunit_parser.TestStatus.SUCCESS: return KunitResult(KunitStatus.TEST_FAILURE, parse_time), test @@ -309,6 +318,7 @@ def run_tests(linux: kunit_kernel.LinuxSourceTree, # So we hackily automatically rewrite --json => --json=stdout pseudo_bool_flag_defaults = { '--json': 'stdout', + '--junit': 'stdout', '--raw_output': 'kunit', } def massage_argv(argv: Sequence[str]) -> Sequence[str]: @@ -459,6 +469,11 @@ def add_parse_opts(parser: argparse.ArgumentParser) -> None: help='Prints parsed test results as JSON to stdout or a file if ' 'a filename is specified. 
Does nothing if --raw_output is set.', type=str, const='stdout', default=None, metavar='FILE') + parser.add_argument('--junit', + nargs='?', + help='Prints parsed test results as JUnit XML to stdout or a file if ' + 'a filename is specified. Does nothing if --raw_output is set.', + type=str, const='stdout', default=None, metavar='FILE') parser.add_argument('--summary', help='Prints only the summary line for parsed test results.' 'Does nothing if --raw_output is set.', @@ -502,6 +517,7 @@ def run_handler(cli_args: argparse.Namespace) -> None: jobs=cli_args.jobs, raw_output=cli_args.raw_output, json=cli_args.json, + junit=cli_args.junit, summary=cli_args.summary, failed=cli_args.failed, timeout=cli_args.timeout, @@ -552,6 +568,7 @@ def exec_handler(cli_args: argparse.Namespace) -> None: exec_request = KunitExecRequest(raw_output=cli_args.raw_output, build_dir=cli_args.build_dir, json=cli_args.json, + junit=cli_args.junit, summary=cli_args.summary, failed=cli_args.failed, timeout=cli_args.timeout, @@ -580,7 +597,9 @@ def parse_handler(cli_args: argparse.Namespace) -> None: # We know nothing about how the result was created! metadata = kunit_json.Metadata() request = KunitParseRequest(raw_output=cli_args.raw_output, - json=cli_args.json, summary=cli_args.summary, + json=cli_args.json, + junit=cli_args.junit, + summary=cli_args.summary, failed=cli_args.failed) result, _ = parse_tests(request, metadata, kunit_output) if result.status != KunitStatus.SUCCESS: diff --git a/tools/testing/kunit/kunit_junit.py b/tools/testing/kunit/kunit_junit.py new file mode 100644 index 000000000000..3622070358e7 --- /dev/null +++ b/tools/testing/kunit/kunit_junit.py @@ -0,0 +1,61 @@ +# SPDX-License-Identifier: GPL-2.0 +# +# Generates JUnit XML files from KUnit test results +# +# Copyright (C) 2026, Google LLC and David Gow. 
+ +from xml.sax.saxutils import quoteattr, XMLGenerator +import xml.etree.ElementTree as ET +from kunit_parser import Test, TestStatus +from typing import Optional + +# Get a string representing a test suite (including subtests) in JUnit XML +def get_test_suite(test: Test, parent: Optional[ET.Element]) -> ET.Element: + suite_attrs = { + 'name': test.name, + 'tests': str(test.counts.total()), + 'failures': str(test.counts.failed), + 'skipped': str(test.counts.skipped), + 'errors': str(test.counts.crashed + test.counts.errors), + } + + if parent is not None: + test_suite_element = ET.SubElement(parent, 'testsuite', suite_attrs) + else: + test_suite_element = ET.Element('testsuite', suite_attrs) + + for subtest in test.subtests: + if subtest.subtests: + get_test_suite(subtest, test_suite_element) + continue + test_case_element = ET.SubElement(test_suite_element, 'testcase', {'name': subtest.name}) + if subtest.status == TestStatus.FAILURE: + ET.SubElement(test_case_element, 'failure', {}).text = 'Test Failed' + elif subtest.status == TestStatus.SKIPPED: + ET.SubElement(test_case_element, 'skipped', {}).text = subtest.skip_reason + elif subtest.status == TestStatus.TEST_CRASHED: + ET.SubElement(test_case_element, 'error', {}).text = 'Test Crashed' + + if subtest.log: + ET.SubElement(test_case_element, 'system-out', {}).text = "\n".join(subtest.log) + + return test_suite_element + +# Get a string for an entire XML file for the test structure starting at test +def get_junit_result(test: Test) -> str: + root_element = get_test_suite(test, None) + ET.indent(root_element) + return ET.tostring(root_element, encoding="unicode", xml_declaration=True) + +# Print a JUnit result to stdout. 
+def print_junit_result(test: Test) -> None: + root_element = get_test_suite(test, None) + ET.indent(root_element) + ET.dump(root_element) + +# Write an entire XML file for the test structure starting at test +def write_junit_result(test: Test, filename: str) -> None: + root_element = get_test_suite(test, None) + ET.indent(root_element) + root_et = ET.ElementTree(root_element) + root_et.write(filename, encoding='utf-8', xml_declaration=True) diff --git a/tools/testing/kunit/kunit_kernel.py b/tools/testing/kunit/kunit_kernel.py index 2869fcb199ff..58557c47d85f 100644 --- a/tools/testing/kunit/kunit_kernel.py +++ b/tools/testing/kunit/kunit_kernel.py @@ -218,7 +218,7 @@ def _get_qemu_ops(config_path: str, # exists (I learned this through experimentation and could not find it # anywhere in the Python documentation). # - # Bascially, we completely ignore the actual file location of the config + # Basically, we completely ignore the actual file location of the config # we are loading and just tell Python that the module lives in the # QEMU_CONFIGS_DIR for import purposes regardless of where it actually # exists as a file. 
diff --git a/tools/testing/kunit/kunit_parser.py b/tools/testing/kunit/kunit_parser.py index 1c61a0ed740d..d722874bc660 100644 --- a/tools/testing/kunit/kunit_parser.py +++ b/tools/testing/kunit/kunit_parser.py @@ -17,7 +17,7 @@ import textwrap from enum import Enum, auto from typing import Iterable, Iterator, List, Optional, Tuple -from kunit_printer import Printer, stdout +from kunit_printer import Printer class Test: """ @@ -44,11 +44,12 @@ class Test: self.subtests = [] # type: List[Test] self.log = [] # type: List[str] self.counts = TestCounts() + self.skip_reason = '' def __str__(self) -> str: """Returns string representation of a Test class object.""" return (f'Test({self.status}, {self.name}, {self.expected_count}, ' - f'{self.subtests}, {self.log}, {self.counts})') + f'{self.subtests}, {self.log}, {self.counts}, {self.skip_reason})') def __repr__(self) -> str: """Returns string representation of a Test class object.""" @@ -57,7 +58,7 @@ class Test: def add_error(self, printer: Printer, error_message: str) -> None: """Records an error that occurred while parsing this test.""" self.counts.errors += 1 - printer.print_with_timestamp(stdout.red('[ERROR]') + f' Test: {self.name}: {error_message}') + printer.print_with_timestamp(printer.red('[ERROR]') + f' Test: {self.name}: {error_message}') def ok_status(self) -> bool: """Returns true if the status was ok, i.e. 
passed or skipped.""" @@ -268,7 +269,7 @@ def check_version(version_num: int, accepted_versions: List[int], if version_num < min(accepted_versions): test.add_error(printer, f'{version_type} version lower than expected!') elif version_num > max(accepted_versions): - test.add_error(printer, f'{version_type} version higer than expected!') + test.add_error(printer, f'{version_type} version higher than expected!') def parse_ktap_header(lines: LineStream, test: Test, printer: Printer) -> bool: """ @@ -352,9 +353,9 @@ def parse_test_plan(lines: LineStream, test: Test) -> bool: lines.pop() return True -TEST_RESULT = re.compile(r'^\s*(ok|not ok) ([0-9]+) ?(- )?([^#]*)( # .*)?$') +TEST_RESULT = re.compile(r'^\s*(ok|not ok) ([0-9]+) ?(:?- )?([^#]*)( # .*)?$') -TEST_RESULT_SKIP = re.compile(r'^\s*(ok|not ok) ([0-9]+) ?(- )?(.*) # SKIP ?(.*)$') +TEST_RESULT_SKIP = re.compile(r'^\s*(ok|not ok) ([0-9]+) ?(:?- )?(.*) # SKIP ?(.*)$') def peek_test_name_match(lines: LineStream, test: Test) -> bool: """ @@ -418,7 +419,7 @@ def parse_test_result(lines: LineStream, test: Test, # Set name of test object if skip_match: - test.name = skip_match.group(4) or skip_match.group(5) + test.name = skip_match.group(4) else: test.name = match.group(4) @@ -431,6 +432,7 @@ def parse_test_result(lines: LineStream, test: Test, status = match.group(1) if skip_match: test.status = TestStatus.SKIPPED + test.skip_reason = skip_match.group(5) or '' elif status == 'ok': test.status = TestStatus.SUCCESS else: @@ -539,12 +541,15 @@ def format_test_result(test: Test, printer: Printer) -> str: if test.status == TestStatus.SUCCESS: return printer.green('[PASSED] ') + test.name if test.status == TestStatus.SKIPPED: - return printer.yellow('[SKIPPED] ') + test.name + skip_message = printer.yellow('[SKIPPED] ') + test.name + if test.skip_reason != '': + skip_message += printer.yellow(' (' + test.skip_reason + ')') + return skip_message if test.status == TestStatus.NO_TESTS: return printer.yellow('[NO TESTS RUN] ') + 
test.name if test.status == TestStatus.TEST_CRASHED: print_log(test.log, printer) - return stdout.red('[CRASHED] ') + test.name + return printer.red('[CRASHED] ') + test.name print_log(test.log, printer) return printer.red('[FAILED] ') + test.name @@ -651,11 +656,11 @@ def print_summary_line(test: Test, printer: Printer) -> None: printer - Printer object to output results """ if test.status == TestStatus.SUCCESS: - color = stdout.green + color = printer.green elif test.status in (TestStatus.SKIPPED, TestStatus.NO_TESTS): - color = stdout.yellow + color = printer.yellow else: - color = stdout.red + color = printer.red printer.print_with_timestamp(color(f'Testing complete. {test.counts}')) # Summarize failures that might have gone off-screen since we had a lot diff --git a/tools/testing/kunit/kunit_tool_test.py b/tools/testing/kunit/kunit_tool_test.py index 267c33cecf87..da88c3a1651d 100755 --- a/tools/testing/kunit/kunit_tool_test.py +++ b/tools/testing/kunit/kunit_tool_test.py @@ -24,6 +24,7 @@ import kunit_config import kunit_parser import kunit_kernel import kunit_json +import kunit_junit import kunit from kunit_printer import stdout @@ -235,10 +236,27 @@ class KUnitParserTest(unittest.TestCase): with open(skipped_log) as file: result = kunit_parser.parse_run_tests(file.readlines(), stdout) + # The test result is skipped, and the skip reason is valid + self.assertEqual(kunit_parser.TestStatus.SKIPPED, result.subtests[1].subtests[1].status) + self.assertEqual("this test should be skipped", result.subtests[1].subtests[1].skip_reason) + # A skipped test does not fail the whole suite. 
self.assertEqual(kunit_parser.TestStatus.SUCCESS, result.status) self.assertEqual(result.counts, kunit_parser.TestCounts(passed=4, skipped=1)) + def test_skipped_reason_parse(self): + skipped_log = _test_data_path('test_skip_all_tests.log') + with open(skipped_log) as file: + result = kunit_parser.parse_run_tests(file.readlines(), stdout) + + # The first test is skipped, with the correct reason + self.assertEqual(kunit_parser.TestStatus.SKIPPED, result.subtests[0].subtests[0].status) + self.assertEqual("all tests skipped", result.subtests[0].subtests[0].skip_reason) + + # The first suite is skipped, with no reason + self.assertEqual(kunit_parser.TestStatus.SKIPPED, result.subtests[0].status) + self.assertEqual("", result.subtests[0].skip_reason) + def test_skipped_all_tests(self): skipped_log = _test_data_path('test_skip_all_tests.log') with open(skipped_log) as file: @@ -676,6 +694,38 @@ class StrContains(str): def __eq__(self, other): return self in other +class KUnitJUnitTest(unittest.TestCase): + def setUp(self): + self.print_mock = mock.patch('kunit_printer.Printer.print').start() + self.addCleanup(mock.patch.stopall) + + def _junit_string(self, log_file): + with open(_test_data_path(log_file)) as file: + test_result = kunit_parser.parse_run_tests(file, stdout) + junit_string = kunit_junit.get_junit_result( + test=test_result) + print(junit_string) + return junit_string + + def test_failed_test_junit(self): + result = self._junit_string('test_is_test_passed-failure.log') + self.assertTrue("<failure>" in result) + + def test_skipped_test_junit(self): + result = self._junit_string('test_skip_tests.log') + self.assertTrue("<skipped>" in result) + self.assertTrue("skipped=\"1\"" in result) + + def test_crashed_test_junit(self): + result = self._junit_string('test_kernel_panic_interrupt.log') + self.assertTrue("<error>" in result); + + def test_no_tests_junit(self): + result = self._junit_string('test_is_test_passed-no_tests_run_with_header.log') + 
self.assertTrue("tests=\"0\"" in result) + self.assertFalse("testcase" in result) + + class KUnitMainTest(unittest.TestCase): def setUp(self): path = _test_data_path('test_is_test_passed-all_passed.log') @@ -923,7 +973,7 @@ class KUnitMainTest(unittest.TestCase): self.linux_source_mock.run_kernel.return_value = ['TAP version 14', 'init: random output'] + want got = kunit._list_tests(self.linux_source_mock, - kunit.KunitExecRequest(None, None, False, False, '.kunit', 300, 'suite*', '', None, None, 'suite', False, False, False)) + kunit.KunitExecRequest(None, None, None, False, False, '.kunit', 300, 'suite*', '', None, None, 'suite', False, False, False)) self.assertEqual(got, want) # Should respect the user's filter glob when listing tests. self.linux_source_mock.run_kernel.assert_called_once_with( @@ -936,7 +986,7 @@ class KUnitMainTest(unittest.TestCase): # Should respect the user's filter glob when listing tests. mock_tests.assert_called_once_with(mock.ANY, - kunit.KunitExecRequest(None, None, False, False, '.kunit', 300, 'suite*.test*', '', None, None, 'suite', False, False, False)) + kunit.KunitExecRequest(None, None, None, False, False, '.kunit', 300, 'suite*.test*', '', None, None, 'suite', False, False, False)) self.linux_source_mock.run_kernel.assert_has_calls([ mock.call(args=None, build_dir='.kunit', filter_glob='suite.test*', filter='', filter_action=None, timeout=300), mock.call(args=None, build_dir='.kunit', filter_glob='suite2.test*', filter='', filter_action=None, timeout=300), @@ -949,7 +999,7 @@ class KUnitMainTest(unittest.TestCase): # Should respect the user's filter glob when listing tests. 
mock_tests.assert_called_once_with(mock.ANY, - kunit.KunitExecRequest(None, None, False, False, '.kunit', 300, 'suite*', '', None, None, 'test', False, False, False)) + kunit.KunitExecRequest(None, None, None, False, False, '.kunit', 300, 'suite*', '', None, None, 'test', False, False, False)) self.linux_source_mock.run_kernel.assert_has_calls([ mock.call(args=None, build_dir='.kunit', filter_glob='suite.test1', filter='', filter_action=None, timeout=300), mock.call(args=None, build_dir='.kunit', filter_glob='suite.test2', filter='', filter_action=None, timeout=300), diff --git a/tools/testing/kunit/qemu_configs/or1k.py b/tools/testing/kunit/qemu_configs/or1k.py new file mode 100644 index 000000000000..dfbbad0f9076 --- /dev/null +++ b/tools/testing/kunit/qemu_configs/or1k.py @@ -0,0 +1,18 @@ +# SPDX-License-Identifier: GPL-2.0-only +from ..qemu_config import QemuArchParams + +QEMU_ARCH = QemuArchParams(linux_arch='openrisc', + kconfig=''' +CONFIG_SERIAL_8250=y +CONFIG_SERIAL_8250_CONSOLE=y +CONFIG_SERIAL_OF_PLATFORM=y +CONFIG_POWER_RESET=y +CONFIG_POWER_RESET_SYSCON=y +''', + qemu_arch='or1k', + kernel_path='vmlinux', + kernel_command_line='console=ttyS0', + extra_qemu_params=[ + '-machine', 'virt', + '-m', '512', + ]) diff --git a/tools/testing/memblock/README b/tools/testing/memblock/README index 7ca437d81806..b435f48d8a70 100644 --- a/tools/testing/memblock/README +++ b/tools/testing/memblock/README @@ -104,10 +104,7 @@ called at the beginning of each test. Known issues ============ -1. Requesting a specific NUMA node via memblock_alloc_node() does not work as - intended. Once the fix is in place, tests for this function can be added. - -2. Tests for memblock_alloc_low() can't be easily implemented. The function uses +1. Tests for memblock_alloc_low() can't be easily implemented. The function uses ARCH_LOW_ADDRESS_LIMIT marco, which can't be changed to point at the low memory of the memory_block. 
diff --git a/tools/testing/memblock/TODO b/tools/testing/memblock/TODO index e306c90c535f..c13ad0dae776 100644 --- a/tools/testing/memblock/TODO +++ b/tools/testing/memblock/TODO @@ -1,5 +1,5 @@ TODO ===== -1. Add tests for memblock_alloc_node() to check if the correct NUMA node is set - for the new region +1. Add tests for memblock_alloc_low() once the simulator can model + ARCH_LOW_ADDRESS_LIMIT against the low memory in memory_block diff --git a/tools/testing/memblock/linux/mmzone.h b/tools/testing/memblock/linux/mmzone.h index bb682659a12d..8d934ff5b080 100644 --- a/tools/testing/memblock/linux/mmzone.h +++ b/tools/testing/memblock/linux/mmzone.h @@ -35,4 +35,8 @@ typedef struct pglist_data { } pg_data_t; +enum migratetype { + MIGRATE_CMA, +}; + #endif diff --git a/tools/testing/memblock/tests/alloc_exact_nid_api.c b/tools/testing/memblock/tests/alloc_exact_nid_api.c index 6e14447da6e1..0c46c73b5e04 100644 --- a/tools/testing/memblock/tests/alloc_exact_nid_api.c +++ b/tools/testing/memblock/tests/alloc_exact_nid_api.c @@ -368,7 +368,7 @@ static int alloc_exact_nid_bottom_up_numa_part_reserved_check(void) max_addr = memblock_end_of_DRAM(); total_size = size + r1.size; - memblock_reserve(r1.base, r1.size); + __memblock_reserve(r1.base, r1.size, nid_req, MEMBLOCK_RSRV_KERN); allocated_ptr = memblock_alloc_exact_nid_raw(size, SMP_CACHE_BYTES, min_addr, max_addr, nid_req); @@ -861,8 +861,8 @@ static int alloc_exact_nid_numa_reserved_full_merge_generic_check(void) min_addr = r2.base + r2.size; max_addr = r1.base; - memblock_reserve(r1.base, r1.size); - memblock_reserve(r2.base, r2.size); + __memblock_reserve(r1.base, r1.size, nid_req, MEMBLOCK_RSRV_KERN); + __memblock_reserve(r2.base, r2.size, nid_req, MEMBLOCK_RSRV_KERN); allocated_ptr = memblock_alloc_exact_nid_raw(size, SMP_CACHE_BYTES, min_addr, max_addr, diff --git a/tools/testing/memblock/tests/alloc_nid_api.c b/tools/testing/memblock/tests/alloc_nid_api.c index 562e4701b0e0..c04923532159 100644 --- 
a/tools/testing/memblock/tests/alloc_nid_api.c +++ b/tools/testing/memblock/tests/alloc_nid_api.c @@ -1965,7 +1965,7 @@ static int alloc_nid_bottom_up_numa_part_reserved_check(void) max_addr = memblock_end_of_DRAM(); total_size = size + r1.size; - memblock_reserve(r1.base, r1.size); + __memblock_reserve(r1.base, r1.size, nid_req, MEMBLOCK_RSRV_KERN); allocated_ptr = run_memblock_alloc_nid(size, SMP_CACHE_BYTES, min_addr, max_addr, nid_req); @@ -2412,8 +2412,8 @@ static int alloc_nid_numa_reserved_full_merge_generic_check(void) min_addr = r2.base + r2.size; max_addr = r1.base; - memblock_reserve(r1.base, r1.size); - memblock_reserve(r2.base, r2.size); + __memblock_reserve(r1.base, r1.size, nid_req, MEMBLOCK_RSRV_KERN); + __memblock_reserve(r2.base, r2.size, nid_req, MEMBLOCK_RSRV_KERN); allocated_ptr = run_memblock_alloc_nid(size, SMP_CACHE_BYTES, min_addr, max_addr, nid_req); diff --git a/tools/testing/selftests/Makefile b/tools/testing/selftests/Makefile index 6e59b8f63e41..8d4db2241cc2 100644 --- a/tools/testing/selftests/Makefile +++ b/tools/testing/selftests/Makefile @@ -37,6 +37,7 @@ TARGETS += filesystems/fat TARGETS += filesystems/overlayfs TARGETS += filesystems/statmount TARGETS += filesystems/mount-notify +TARGETS += filesystems/nsfs TARGETS += filesystems/fuse TARGETS += filesystems/move_mount TARGETS += filesystems/empty_mntns @@ -85,12 +86,12 @@ TARGETS += net/ppp TARGETS += net/rds TARGETS += net/tcp_ao TARGETS += nolibc -TARGETS += nsfs TARGETS += pci_endpoint TARGETS += pcie_bwctrl TARGETS += perf_events TARGETS += pidfd TARGETS += pid_namespace +TARGETS += pipe TARGETS += power_supply TARGETS += powerpc TARGETS += prctl diff --git a/tools/testing/selftests/acct/.gitignore b/tools/testing/selftests/acct/.gitignore index 7e78aac19038..9e9c61c5bfd6 100644 --- a/tools/testing/selftests/acct/.gitignore +++ b/tools/testing/selftests/acct/.gitignore @@ -1,3 +1,4 @@ acct_syscall +taskstats_fill_stats_tgid config -process_log
\ No newline at end of file +process_log diff --git a/tools/testing/selftests/acct/Makefile b/tools/testing/selftests/acct/Makefile index 7e025099cf65..083cab5ddb72 100644 --- a/tools/testing/selftests/acct/Makefile +++ b/tools/testing/selftests/acct/Makefile @@ -1,5 +1,8 @@ # SPDX-License-Identifier: GPL-2.0 TEST_GEN_PROGS := acct_syscall +TEST_GEN_PROGS += taskstats_fill_stats_tgid + CFLAGS += -Wall +LDLIBS += -lpthread -include ../lib.mk
\ No newline at end of file +include ../lib.mk diff --git a/tools/testing/selftests/acct/taskstats_fill_stats_tgid.c b/tools/testing/selftests/acct/taskstats_fill_stats_tgid.c new file mode 100644 index 000000000000..d6cab4ae26f2 --- /dev/null +++ b/tools/testing/selftests/acct/taskstats_fill_stats_tgid.c @@ -0,0 +1,375 @@ +// SPDX-License-Identifier: GPL-2.0 +#define _GNU_SOURCE + +#include <errno.h> +#include <linux/genetlink.h> +#include <linux/netlink.h> +#include <linux/taskstats.h> +#include <pthread.h> +#include <stdbool.h> +#include <stdint.h> +#include <stdio.h> +#include <stdlib.h> +#include <string.h> +#include <sys/socket.h> +#include <sys/types.h> +#include <time.h> +#include <unistd.h> + +#include "kselftest.h" + +#ifndef NLA_ALIGN +#define NLA_ALIGNTO 4 +#define NLA_ALIGN(len) (((len) + NLA_ALIGNTO - 1) & ~(NLA_ALIGNTO - 1)) +#define NLA_HDRLEN ((int)NLA_ALIGN(sizeof(struct nlattr))) +#endif + +#define BUSY_NS (200ULL * 1000 * 1000) + +struct worker_ctx { + pthread_mutex_t lock; + pthread_cond_t cond; + bool ready; + bool release; +}; + +static unsigned long busy_sink; + +static void *taskstats_nla_data(const struct nlattr *na) +{ + return (void *)((char *)na + NLA_HDRLEN); +} + +static bool taskstats_nla_ok(const struct nlattr *na, int remaining) +{ + return remaining >= (int)sizeof(*na) && + na->nla_len >= sizeof(*na) && + na->nla_len <= remaining; +} + +static struct nlattr *taskstats_nla_next(const struct nlattr *na, int *remaining) +{ + int aligned_len = NLA_ALIGN(na->nla_len); + + *remaining -= aligned_len; + return (struct nlattr *)((char *)na + aligned_len); +} + +static uint64_t timespec_diff_ns(const struct timespec *start, + const struct timespec *end) +{ + return (uint64_t)(end->tv_sec - start->tv_sec) * 1000000000ULL + + (uint64_t)(end->tv_nsec - start->tv_nsec); +} + +static void burn_cpu_for_ns(uint64_t runtime_ns) +{ + struct timespec start, now; + unsigned long acc = 0; + + if (clock_gettime(CLOCK_MONOTONIC, &start)) { + 
perror("clock_gettime"); + exit(EXIT_FAILURE); + } + + do { + for (int i = 0; i < 100000; i++) + acc += i; + if (clock_gettime(CLOCK_MONOTONIC, &now)) { + perror("clock_gettime"); + exit(EXIT_FAILURE); + } + } while (timespec_diff_ns(&start, &now) < runtime_ns); + + busy_sink = acc; +} + +static int netlink_open(void) +{ + struct sockaddr_nl addr = { + .nl_family = AF_NETLINK, + .nl_pid = getpid(), + }; + int fd; + + fd = socket(AF_NETLINK, SOCK_RAW, NETLINK_GENERIC); + if (fd < 0) + return -errno; + + if (bind(fd, (struct sockaddr *)&addr, sizeof(addr)) < 0) { + int err = -errno; + + close(fd); + return err; + } + + return fd; +} + +static int send_request(int fd, void *buf, size_t len) +{ + struct sockaddr_nl addr = { + .nl_family = AF_NETLINK, + }; + + if (sendto(fd, buf, len, 0, (struct sockaddr *)&addr, sizeof(addr)) < 0) + return -errno; + + return 0; +} + +static int get_family_id(int fd, const char *name) +{ + struct { + struct nlmsghdr nlh; + struct genlmsghdr genl; + char buf[256]; + } req = { 0 }; + char resp[8192]; + struct nlmsghdr *nlh; + struct genlmsghdr *genl; + struct nlattr *na; + int len; + int rem; + int ret; + + req.nlh.nlmsg_len = NLMSG_LENGTH(GENL_HDRLEN); + req.nlh.nlmsg_type = GENL_ID_CTRL; + req.nlh.nlmsg_flags = NLM_F_REQUEST; + req.nlh.nlmsg_seq = 1; + req.nlh.nlmsg_pid = getpid(); + + req.genl.cmd = CTRL_CMD_GETFAMILY; + req.genl.version = 1; + + na = (struct nlattr *)((char *)&req + NLMSG_ALIGN(req.nlh.nlmsg_len)); + na->nla_type = CTRL_ATTR_FAMILY_NAME; + na->nla_len = NLA_HDRLEN + strlen(name) + 1; + memcpy(taskstats_nla_data(na), name, strlen(name) + 1); + req.nlh.nlmsg_len = NLMSG_ALIGN(req.nlh.nlmsg_len) + NLA_ALIGN(na->nla_len); + + ret = send_request(fd, &req, req.nlh.nlmsg_len); + if (ret) + return ret; + + len = recv(fd, resp, sizeof(resp), 0); + if (len < 0) + return -errno; + + for (nlh = (struct nlmsghdr *)resp; NLMSG_OK(nlh, len); + nlh = NLMSG_NEXT(nlh, len)) { + if (nlh->nlmsg_type == NLMSG_ERROR) { + struct nlmsgerr 
*err = NLMSG_DATA(nlh); + + return err->error ? err->error : -ENOENT; + } + + genl = (struct genlmsghdr *)NLMSG_DATA(nlh); + rem = nlh->nlmsg_len - NLMSG_HDRLEN - GENL_HDRLEN; + na = (struct nlattr *)((char *)genl + GENL_HDRLEN); + while (taskstats_nla_ok(na, rem)) { + if (na->nla_type == CTRL_ATTR_FAMILY_ID) + return *(uint16_t *)taskstats_nla_data(na); + na = taskstats_nla_next(na, &rem); + } + } + + return -ENOENT; +} + +static int get_taskstats(int fd, int family_id, uint16_t attr_type, uint32_t id, + struct taskstats *stats) +{ + struct { + struct nlmsghdr nlh; + struct genlmsghdr genl; + char buf[256]; + } req = { 0 }; + char resp[16384]; + struct nlmsghdr *nlh; + struct genlmsghdr *genl; + struct nlattr *na; + struct nlattr *nested; + int len; + int rem; + int nrem; + int ret; + + memset(stats, 0, sizeof(*stats)); + + req.nlh.nlmsg_len = NLMSG_LENGTH(GENL_HDRLEN); + req.nlh.nlmsg_type = family_id; + req.nlh.nlmsg_flags = NLM_F_REQUEST; + req.nlh.nlmsg_seq = 2; + req.nlh.nlmsg_pid = getpid(); + + req.genl.cmd = TASKSTATS_CMD_GET; + req.genl.version = 1; + + na = (struct nlattr *)((char *)&req + NLMSG_ALIGN(req.nlh.nlmsg_len)); + na->nla_type = attr_type; + na->nla_len = NLA_HDRLEN + sizeof(id); + memcpy(taskstats_nla_data(na), &id, sizeof(id)); + req.nlh.nlmsg_len = NLMSG_ALIGN(req.nlh.nlmsg_len) + NLA_ALIGN(na->nla_len); + + ret = send_request(fd, &req, req.nlh.nlmsg_len); + if (ret) + return ret; + + len = recv(fd, resp, sizeof(resp), 0); + if (len < 0) + return -errno; + + for (nlh = (struct nlmsghdr *)resp; NLMSG_OK(nlh, len); + nlh = NLMSG_NEXT(nlh, len)) { + if (nlh->nlmsg_type == NLMSG_ERROR) { + struct nlmsgerr *err = NLMSG_DATA(nlh); + + return err->error ? 
err->error : -ENOENT; + } + + genl = (struct genlmsghdr *)NLMSG_DATA(nlh); + rem = nlh->nlmsg_len - NLMSG_HDRLEN - GENL_HDRLEN; + na = (struct nlattr *)((char *)genl + GENL_HDRLEN); + while (taskstats_nla_ok(na, rem)) { + if (na->nla_type == TASKSTATS_TYPE_AGGR_PID || + na->nla_type == TASKSTATS_TYPE_AGGR_TGID) { + nested = (struct nlattr *)taskstats_nla_data(na); + nrem = na->nla_len - NLA_HDRLEN; + while (taskstats_nla_ok(nested, nrem)) { + if (nested->nla_type == TASKSTATS_TYPE_STATS) { + memcpy(stats, taskstats_nla_data(nested), + sizeof(*stats)); + return 0; + } + nested = taskstats_nla_next(nested, &nrem); + } + } + na = taskstats_nla_next(na, &rem); + } + } + + return -ENOENT; +} + +static uint64_t cpu_total(const struct taskstats *stats) +{ + return (uint64_t)stats->ac_utime + (uint64_t)stats->ac_stime; +} + +static void print_stats(const char *label, const struct taskstats *stats) +{ + ksft_print_msg("%s: cpu_total=%llu nvcsw=%llu nivcsw=%llu\n", + label, (unsigned long long)cpu_total(stats), + (unsigned long long)stats->nvcsw, + (unsigned long long)stats->nivcsw); +} + +static void *worker_thread(void *arg) +{ + struct worker_ctx *ctx = arg; + + burn_cpu_for_ns(BUSY_NS); + + pthread_mutex_lock(&ctx->lock); + ctx->ready = true; + pthread_cond_broadcast(&ctx->cond); + while (!ctx->release) + pthread_cond_wait(&ctx->cond, &ctx->lock); + pthread_mutex_unlock(&ctx->lock); + + return NULL; +} + +int main(void) +{ + struct worker_ctx ctx = { + .lock = PTHREAD_MUTEX_INITIALIZER, + .cond = PTHREAD_COND_INITIALIZER, + }; + struct taskstats before, after; + pthread_t thread; + pid_t tgid = getpid(); + int family_id; + int fd; + int ret; + + ksft_print_header(); + ksft_set_plan(1); + + if (geteuid()) + ksft_exit_skip("taskstats_fill_stats_tgid needs root\n"); + + fd = netlink_open(); + if (fd < 0) + ksft_exit_skip("failed to open generic netlink socket: %s\n", + strerror(-fd)); + + family_id = get_family_id(fd, TASKSTATS_GENL_NAME); + if (family_id < 0) + 
ksft_exit_skip("taskstats generic netlink family unavailable: %s\n", + strerror(-family_id)); + + /* Create worker thread that burns 200ms of CPU */ + if (pthread_create(&thread, NULL, worker_thread, &ctx) != 0) + ksft_exit_fail_msg("pthread_create failed: %s\n", strerror(errno)); + + /* Wait for worker to finish generating activity */ + pthread_mutex_lock(&ctx.lock); + while (!ctx.ready) + pthread_cond_wait(&ctx.cond, &ctx.lock); + pthread_mutex_unlock(&ctx.lock); + + /* + * Snapshot A: TGID stats while worker is alive and sleeping. + * Contains main thread + worker contributions. + */ + ret = get_taskstats(fd, family_id, TASKSTATS_CMD_ATTR_TGID, tgid, &before); + if (ret) + ksft_exit_fail_msg("TGID query before exit failed: %s\n", + strerror(-ret)); + + /* Release worker so it can exit, then join (deterministic wait). + * + * Kernel exit path ordering guarantees: + * do_exit() + * taskstats_exit() -> fill_tgid_exit() (accumulates worker into signal->stats) + * exit_notify() (releases the thread) + * do_task_dead() -> __schedule() (wakes joiner) + * + * So pthread_join() returns only after fill_tgid_exit() has completed. + */ + pthread_mutex_lock(&ctx.lock); + ctx.release = true; + pthread_cond_broadcast(&ctx.cond); + pthread_mutex_unlock(&ctx.lock); + + pthread_join(thread, NULL); + + /* + * Snapshot B: TGID stats after worker has exited. + * fill_stats_for_tgid() does: + * memcpy(signal->stats) <- includes fill_tgid_exit accumulation + * + scan live threads <- only main thread now + */ + ret = get_taskstats(fd, family_id, TASKSTATS_CMD_ATTR_TGID, tgid, &after); + if (ret) + ksft_exit_fail_msg("TGID query after exit failed: %s\n", + strerror(-ret)); + + print_stats("TGID before worker exit", &before); + print_stats("TGID after worker exit", &after); + + /* + * The worker burned 200ms of CPU before the first snapshot. 
+ * If the kernel correctly retained its contribution via + * fill_tgid_exit(), then the TGID CPU total after exit must be at + * least as large as the TGID CPU total before exit. + */ + ksft_test_result(cpu_total(&after) >= cpu_total(&before), + "TGID CPU stats should not regress after thread exit\n"); + + close(fd); + ksft_finished(); + return ksft_get_fail_cnt() ? KSFT_FAIL : KSFT_PASS; +} diff --git a/tools/testing/selftests/arm64/abi/hwcap.c b/tools/testing/selftests/arm64/abi/hwcap.c index e22703d6b97c..19fca95f7c22 100644 --- a/tools/testing/selftests/arm64/abi/hwcap.c +++ b/tools/testing/selftests/arm64/abi/hwcap.c @@ -108,6 +108,24 @@ static void f8mm8_sigill(void) asm volatile(".inst 0x6e80ec00"); } +static void f16f32dot_sigill(void) +{ + /* FDOT V0.2S, V0.4H, V0.2H[0] */ + asm volatile(".inst 0xf409000"); +} + +static void f16f32mm_sigill(void) +{ + /* FMMLA V0.4S, V0.8H, V0.8H */ + asm volatile(".inst 0x4e40ec00"); +} + +static void f16mm_sigill(void) +{ + /* FMMLA V0.8H, V0.8H, V0.8H */ + asm volatile(".inst 0x4ec0ec00"); +} + static void faminmax_sigill(void) { /* FAMIN V0.4H, V0.4H, V0.4H */ @@ -191,6 +209,12 @@ static void lut_sigill(void) asm volatile(".inst 0x4e801000"); } +static void sve_lut6_sigill(void) +{ + /* LUTI6 Z0.H, { Z0.H, Z1.H }, Z0[0] */ + asm volatile(".inst 0x4560ac00"); +} + static void mops_sigill(void) { char dst[1], src[1]; @@ -282,6 +306,18 @@ static void sme2p2_sigill(void) asm volatile("msr S0_3_C4_C6_3, xzr" : : : ); } +static void sme2p3_sigill(void) +{ + /* SMSTART SM */ + asm volatile("msr S0_3_C4_C3_3, xzr" : : : ); + + /* ADDQP Z0.B, Z0.B, Z0.B */ + asm volatile(".inst 0x4207800" : : : "z0"); + + /* SMSTOP */ + asm volatile("msr S0_3_C4_C6_3, xzr" : : : ); +} + static void sme_aes_sigill(void) { /* SMSTART SM */ @@ -378,6 +414,18 @@ static void smef8f32_sigill(void) asm volatile("msr S0_3_C4_C6_3, xzr" : : : ); } +static void smelut6_sigill(void) +{ + /* SMSTART */ + asm volatile("msr S0_3_C4_C7_3, xzr" : : : ); + + 
/* LUTI6 { Z0.B-Z3.B }, ZT0, { Z0-Z2 } */ + asm volatile(".inst 0xc08a0000" : : : ); + + /* SMSTOP */ + asm volatile("msr S0_3_C4_C6_3, xzr" : : : ); +} + static void smelutv2_sigill(void) { /* SMSTART */ @@ -486,6 +534,12 @@ static void sve2p2_sigill(void) asm volatile(".inst 0x4cea000" : : : "z0"); } +static void sve2p3_sigill(void) +{ + /* ADDQP Z0.B, Z0.B, Z0.B */ + asm volatile(".inst 0x4207800" : : : "z0"); +} + static void sveaes_sigill(void) { /* AESD z0.b, z0.b, z0.b */ @@ -504,6 +558,12 @@ static void sveb16b16_sigill(void) asm volatile(".inst 0x65000000" : : : ); } +static void sveb16mm_sigill(void) +{ + /* BFMMLA Z0.H, Z0.H, Z0.H */ + asm volatile(".inst 0x64e0e000" : : : ); +} + static void svebfscale_sigill(void) { /* BFSCALE Z0.H, P0/M, Z0.H, Z0.H */ @@ -730,6 +790,27 @@ static const struct hwcap_data { .sigill_fn = f8mm4_sigill, }, { + .name = "F16MM", + .at_hwcap = AT_HWCAP3, + .hwcap_bit = HWCAP3_F16MM, + .cpuinfo = "f16mm", + .sigill_fn = f16mm_sigill, + }, + { + .name = "F16F32DOT", + .at_hwcap = AT_HWCAP3, + .hwcap_bit = HWCAP3_F16F32DOT, + .cpuinfo = "f16f32dot", + .sigill_fn = f16f32dot_sigill, + }, + { + .name = "F16F32MM", + .at_hwcap = AT_HWCAP3, + .hwcap_bit = HWCAP3_F16F32MM, + .cpuinfo = "f16f32mm", + .sigill_fn = f16f32mm_sigill, + }, + { .name = "FAMINMAX", .at_hwcap = AT_HWCAP2, .hwcap_bit = HWCAP2_FAMINMAX, @@ -919,6 +1000,13 @@ static const struct hwcap_data { .sigill_fn = sme2p2_sigill, }, { + .name = "SME 2.3", + .at_hwcap = AT_HWCAP3, + .hwcap_bit = HWCAP3_SME2P3, + .cpuinfo = "sme2p3", + .sigill_fn = sme2p3_sigill, + }, + { .name = "SME AES", .at_hwcap = AT_HWCAP, .hwcap_bit = HWCAP_SME_AES, @@ -968,6 +1056,13 @@ static const struct hwcap_data { .sigill_fn = smef8f32_sigill, }, { + .name = "SME LUT6", + .at_hwcap = AT_HWCAP3, + .hwcap_bit = HWCAP3_SME_LUT6, + .cpuinfo = "smelut6", + .sigill_fn = smelut6_sigill, + }, + { .name = "SME LUTV2", .at_hwcap = AT_HWCAP2, .hwcap_bit = HWCAP2_SME_LUTV2, @@ -1053,6 +1148,13 @@ static 
const struct hwcap_data { .sigill_fn = sve2p2_sigill, }, { + .name = "SVE 2.3", + .at_hwcap = AT_HWCAP3, + .hwcap_bit = HWCAP3_SVE2P3, + .cpuinfo = "sve2p3", + .sigill_fn = sve2p3_sigill, + }, + { .name = "SVE AES", .at_hwcap = AT_HWCAP2, .hwcap_bit = HWCAP2_SVEAES, @@ -1067,6 +1169,13 @@ static const struct hwcap_data { .sigill_fn = sveaes2_sigill, }, { + .name = "SVE B16MM", + .at_hwcap = AT_HWCAP3, + .hwcap_bit = HWCAP3_SVE_B16MM, + .cpuinfo = "sveb16mm", + .sigill_fn = sveb16mm_sigill, + }, + { .name = "SVE BFSCALE", .at_hwcap = AT_HWCAP, .hwcap_bit = HWCAP_SVE_BFSCALE, @@ -1088,6 +1197,13 @@ static const struct hwcap_data { .sigill_fn = svef16mm_sigill, }, { + .name = "SVE_LUT6", + .at_hwcap = AT_HWCAP3, + .hwcap_bit = HWCAP3_SVE_LUT6, + .cpuinfo = "svelut6", + .sigill_fn = sve_lut6_sigill, + }, + { .name = "SVE2 B16B16", .at_hwcap = AT_HWCAP2, .hwcap_bit = HWCAP2_SVE_B16B16, diff --git a/tools/testing/selftests/arm64/signal/test_signals.h b/tools/testing/selftests/arm64/signal/test_signals.h index ee75a2c25ce7..c7c343494cb8 100644 --- a/tools/testing/selftests/arm64/signal/test_signals.h +++ b/tools/testing/selftests/arm64/signal/test_signals.h @@ -36,6 +36,7 @@ enum { FSME_FA64_BIT, FSME2_BIT, FGCS_BIT, + FPOE_BIT, FMAX_END }; @@ -45,6 +46,7 @@ enum { #define FEAT_SME_FA64 (1UL << FSME_FA64_BIT) #define FEAT_SME2 (1UL << FSME2_BIT) #define FEAT_GCS (1UL << FGCS_BIT) +#define FEAT_POE (1UL << FPOE_BIT) /* * A descriptor used to describe and configure a test case. 
diff --git a/tools/testing/selftests/arm64/signal/test_signals_utils.c b/tools/testing/selftests/arm64/signal/test_signals_utils.c index 5d3621921cfe..4b12dbd7669d 100644 --- a/tools/testing/selftests/arm64/signal/test_signals_utils.c +++ b/tools/testing/selftests/arm64/signal/test_signals_utils.c @@ -31,6 +31,7 @@ static char const *const feats_names[FMAX_END] = { " FA64 ", " SME2 ", " GCS ", + " POE ", }; #define MAX_FEATS_SZ 128 @@ -341,6 +342,8 @@ int test_init(struct tdescr *td) td->feats_supported |= FEAT_SME2; if (getauxval(AT_HWCAP) & HWCAP_GCS) td->feats_supported |= FEAT_GCS; + if (getauxval(AT_HWCAP2) & HWCAP2_POE) + td->feats_supported |= FEAT_POE; if (feats_ok(td)) { if (td->feats_required & td->feats_supported) fprintf(stderr, diff --git a/tools/testing/selftests/arm64/signal/test_signals_utils.h b/tools/testing/selftests/arm64/signal/test_signals_utils.h index 36fc12b3cd60..2c7b8c64a35a 100644 --- a/tools/testing/selftests/arm64/signal/test_signals_utils.h +++ b/tools/testing/selftests/arm64/signal/test_signals_utils.h @@ -57,6 +57,22 @@ static inline __attribute__((always_inline)) uint64_t get_gcspr_el0(void) return val; } +#define SYS_POR_EL0 "S3_3_C10_C2_4" + +static inline uint64_t get_por_el0(void) +{ + uint64_t val; + + asm volatile("mrs %0, " SYS_POR_EL0 "\n" : "=r"(val)); + + return val; +} + +static inline void set_por_el0(uint64_t val) +{ + asm volatile("msr " SYS_POR_EL0 ", %0\n" :: "r"(val)); +} + static inline bool feats_ok(struct tdescr *td) { if (td->feats_incompatible & td->feats_supported) diff --git a/tools/testing/selftests/arm64/signal/testcases/poe_missing_poe_context.c b/tools/testing/selftests/arm64/signal/testcases/poe_missing_poe_context.c new file mode 100644 index 000000000000..3f22d8cf6106 --- /dev/null +++ b/tools/testing/selftests/arm64/signal/testcases/poe_missing_poe_context.c @@ -0,0 +1,73 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Copyright (C) 2026 Arm Ltd + * + * Verify that the POR_EL0 register is left 
untouched on sigreturn if the + * POE frame record is missing. + */ + +#include <asm/sigcontext.h> + +#include "test_signals_utils.h" +#include "testcases.h" + +#define POR_EL0_INIT 0x07ul +#define POR_EL0_CUSTOM 0x77ul + +static bool failed_check; + +static bool modify_por_el0(struct tdescr *td) +{ + set_por_el0(POR_EL0_CUSTOM); + + return true; +} + +static int signal_remove_poe_context(struct tdescr *td, siginfo_t *si, + ucontext_t *uc) +{ + struct _aarch64_ctx *ctx = GET_UC_RESV_HEAD(uc); + size_t resv_size = GET_UCP_RESV_SIZE(uc); + struct _aarch64_ctx *poe_ctx_head; + + poe_ctx_head = get_header(ctx, POE_MAGIC, resv_size, NULL); + if (!poe_ctx_head) { + fprintf(stderr, "Missing poe_context record\n"); + failed_check = true; + return 0; + } + + /* + * Actually removing the record would require moving down the next + * records. An easier option is to turn it into an ESR record, which is + * ignored by sigreturn(). + */ + poe_ctx_head->magic = ESR_MAGIC; + + return 0; +} + +static void check_por_el0_preserved(struct tdescr *td) +{ + uint64_t por_el0 = get_por_el0(); + + if (por_el0 == POR_EL0_INIT) { + fprintf(stderr, "POR_EL0 preserved\n"); + } else { + fprintf(stderr, "POR_EL0 unexpectedly set to %lx\n", por_el0); + failed_check = true; + } + + td->pass = !failed_check; +} + +struct tdescr tde = { + .name = "POR_EL0 missing poe_context", + .descr = "Remove poe_context record and check POR_EL0 is preserved", + .feats_required = FEAT_POE, + .timeout = 3, + .sig_trig = SIGUSR1, + .init = modify_por_el0, + .run = signal_remove_poe_context, + .check_result = check_por_el0_preserved, +}; diff --git a/tools/testing/selftests/arm64/signal/testcases/poe_restore.c b/tools/testing/selftests/arm64/signal/testcases/poe_restore.c new file mode 100644 index 000000000000..9f9a61a4214d --- /dev/null +++ b/tools/testing/selftests/arm64/signal/testcases/poe_restore.c @@ -0,0 +1,64 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Copyright (C) 2026 Arm Ltd + * + * Verify that the 
POR_EL0 register is saved and restored as expected on signal + * entry/return. + */ + +#include <asm/sigcontext.h> + +#include "test_signals_utils.h" +#include "testcases.h" + +#define POR_EL0_INIT 0x07ul +#define POR_EL0_CUSTOM 0x77ul + +static bool failed_check; + +static bool modify_por_el0(struct tdescr *td) +{ + set_por_el0(POR_EL0_CUSTOM); + + return true; +} + +static int signal_check_por_el0_reset(struct tdescr *td, siginfo_t *si, + ucontext_t *uc) +{ + uint64_t signal_por_el0 = get_por_el0(); + + if (signal_por_el0 != POR_EL0_INIT) { + fprintf(stderr, "POR_EL0 is %lx in signal handler (expected %lx)\n", + signal_por_el0, POR_EL0_INIT); + failed_check = true; + } + + return 0; +} + +static void check_por_el0_restored(struct tdescr *td) +{ + uint64_t por_el0 = get_por_el0(); + + if (por_el0 == POR_EL0_CUSTOM) { + fprintf(stderr, "POR_EL0 restored\n"); + } else { + fprintf(stderr, "POR_EL0 was %lx but is now %lx\n", + POR_EL0_CUSTOM, por_el0); + failed_check = true; + } + + td->pass = !failed_check; +} + +struct tdescr tde = { + .name = "POR_EL0 restore", + .descr = "Validate that POR_EL0 is saved/restored on signal entry/return", + .feats_required = FEAT_POE, + .timeout = 3, + .sig_trig = SIGUSR1, + .init = modify_por_el0, + .run = signal_check_por_el0_reset, + .check_result = check_por_el0_restored, +}; diff --git a/tools/testing/selftests/arm64/signal/testcases/poe_siginfo.c b/tools/testing/selftests/arm64/signal/testcases/poe_siginfo.c index 36bd9940ee05..e15fedf4da6e 100644 --- a/tools/testing/selftests/arm64/signal/testcases/poe_siginfo.c +++ b/tools/testing/selftests/arm64/signal/testcases/poe_siginfo.c @@ -21,21 +21,6 @@ static union { char buf[1024 * 128]; } context; -#define SYS_POR_EL0 "S3_3_C10_C2_4" - -static uint64_t get_por_el0(void) -{ - uint64_t val; - - asm volatile( - "mrs %0, " SYS_POR_EL0 "\n" - : "=r"(val) - : - : ); - - return val; -} - int poe_present(struct tdescr *td, siginfo_t *si, ucontext_t *uc) { struct _aarch64_ctx *head = 
GET_BUF_RESV_HEAD(context); diff --git a/tools/testing/selftests/bpf/.gitignore b/tools/testing/selftests/bpf/.gitignore index bfdc5518ecc8..986a6389186b 100644 --- a/tools/testing/selftests/bpf/.gitignore +++ b/tools/testing/selftests/bpf/.gitignore @@ -21,7 +21,6 @@ test_lirc_mode2_user flow_dissector_load test_tcpnotify_user test_libbpf -xdping test_cpp *.d *.subskel.h diff --git a/tools/testing/selftests/bpf/Makefile b/tools/testing/selftests/bpf/Makefile index 6ef6872adbc3..b642ee489ea6 100644 --- a/tools/testing/selftests/bpf/Makefile +++ b/tools/testing/selftests/bpf/Makefile @@ -44,6 +44,12 @@ SKIP_LLVM ?= SKIP_LIBBFD ?= SKIP_CRYPTO ?= +# When BPF_STRICT_BUILD is 1, any BPF object, skeleton, test object, or +# benchmark compilation failure is fatal. Set to 0 to tolerate failures +# and continue building the remaining tests. +BPF_STRICT_BUILD ?= 1 +PERMISSIVE := $(filter 0,$(BPF_STRICT_BUILD)) + ifeq ($(srctree),) srctree := $(patsubst %/,%,$(dir $(CURDIR))) srctree := $(patsubst %/,%,$(dir $(srctree))) @@ -51,19 +57,20 @@ srctree := $(patsubst %/,%,$(dir $(srctree))) srctree := $(patsubst %/,%,$(dir $(srctree))) endif -CFLAGS += -g $(OPT_FLAGS) -rdynamic -std=gnu11 \ +COMMON_CFLAGS = -g $(OPT_FLAGS) -rdynamic -std=gnu11 \ -Wall -Werror -fno-omit-frame-pointer \ -Wno-unused-but-set-variable \ $(GENFLAGS) $(SAN_CFLAGS) $(LIBELF_CFLAGS) \ -I$(CURDIR) -I$(INCLUDE_DIR) -I$(GENDIR) -I$(LIBDIR) \ - -I$(TOOLSINCDIR) -I$(TOOLSARCHINCDIR) -I$(APIDIR) -I$(OUTPUT) + -I$(TOOLSINCDIR) -I$(TOOLSARCHINCDIR) -I$(APIDIR) -I$(OUTPUT) \ + -I$(CURDIR)/libarena/include LDFLAGS += $(SAN_LDFLAGS) LDLIBS += $(LIBELF_LIBS) -lz -lrt -lpthread PCAP_CFLAGS := $(shell $(PKG_CONFIG) --cflags libpcap 2>/dev/null && echo "-DTRAFFIC_MONITOR=1") PCAP_LIBS := $(shell $(PKG_CONFIG) --libs libpcap 2>/dev/null) LDLIBS += $(PCAP_LIBS) -CFLAGS += $(PCAP_CFLAGS) +CFLAGS += $(COMMON_CFLAGS) $(PCAP_CFLAGS) # Some utility functions use LLVM libraries jit_disasm_helpers.c-CFLAGS = $(LLVM_CFLAGS) @@ 
-78,6 +85,12 @@ ifneq ($(shell $(CLANG) --target=bpf -mcpu=help 2>&1 | grep 'v4'),) CLANG_CPUV4 := 1 endif +# Check whether clang supports BPF address sanitizer (requires LLVM 22+) +CLANG_HAS_ARENA_ASAN := $(shell echo 'int x;' | \ + $(CLANG) --target=bpf -fsanitize=kernel-address \ + -mllvm -asan-shadow-addr-space=1 \ + -x c -c - -o /dev/null 2>/dev/null && echo 1) + # Order correspond to 'make run_tests' order TEST_GEN_PROGS = test_verifier test_tag test_maps test_lru_map test_progs \ test_sockmap \ @@ -111,7 +124,6 @@ TEST_FILES = xsk_prereqs.sh $(wildcard progs/btf_dump_test_case_*.c) # Order correspond to 'make run_tests' order TEST_PROGS := test_kmod.sh \ test_lirc_mode2.sh \ - test_xdping.sh \ test_bpftool_build.sh \ test_doc_build.sh \ test_xsk.sh \ @@ -134,7 +146,6 @@ TEST_GEN_PROGS_EXTENDED = \ xdp_features \ xdp_hw_metadata \ xdp_synproxy \ - xdping \ xskxceiver TEST_GEN_FILES += $(TEST_KMODS) liburandom_read.so urandom_read sign-file uprobe_multi @@ -153,12 +164,13 @@ override define CLEAN $(Q)$(RM) -r $(TEST_KMODS) $(Q)$(RM) -r $(EXTRA_CLEAN) $(Q)$(MAKE) -C test_kmods clean + $(Q)$(MAKE) -C libarena clean $(Q)$(MAKE) docs-clean endef include ../lib.mk -NON_CHECK_FEAT_TARGETS := clean docs-clean +NON_CHECK_FEAT_TARGETS := clean docs-clean emit_tests CHECK_FEAT := $(filter-out $(NON_CHECK_FEAT_TARGETS),$(or $(MAKECMDGOALS), "none")) ifneq ($(CHECK_FEAT),) FEATURE_USER := .selftests @@ -182,8 +194,15 @@ ifeq ($(feature-llvm),1) LLVM_CONFIG_LIB_COMPONENTS := mcdisassembler all-targets # both llvm-config and lib.mk add -D_GNU_SOURCE, which ends up as conflict LLVM_CFLAGS += $(filter-out -D_GNU_SOURCE,$(shell $(LLVM_CONFIG) --cflags)) - # Prefer linking statically if it's available, otherwise fallback to shared - ifeq ($(shell $(LLVM_CONFIG) --link-static --libs >/dev/null 2>&1 && echo static),static) + # Cross compilation must use dynamic linking to avoid unresolved library + # dependencies. 
For native build, prefer linking statically if it's + # available, otherwise fallback to shared. + ifneq ($(ARCH), $(HOSTARCH)) + LLVM_LINK_STATIC := + else + LLVM_LINK_STATIC := $(shell $(LLVM_CONFIG) --link-static --libs >/dev/null 2>&1 && echo y) + endif + ifeq ($(LLVM_LINK_STATIC),y) LLVM_LDLIBS += $(shell $(LLVM_CONFIG) --link-static --libs $(LLVM_CONFIG_LIB_COMPONENTS)) LLVM_LDLIBS += $(filter-out -lxml2,$(shell $(LLVM_CONFIG) --link-static --system-libs $(LLVM_CONFIG_LIB_COMPONENTS))) LLVM_LDLIBS += -lstdc++ @@ -255,7 +274,7 @@ endif $(OUTPUT)/liburandom_read.so: urandom_read_lib1.c urandom_read_lib2.c liburandom_read.map $(call msg,LIB,,$@) $(Q)$(CLANG) $(CLANG_TARGET_ARCH) \ - $(filter-out -static,$(CFLAGS) $(LDFLAGS)) \ + $(filter-out -static,$(COMMON_CFLAGS) $(LDFLAGS)) \ $(filter %.c,$^) $(filter-out -static,$(LDLIBS)) \ -Wno-unused-command-line-argument \ -fuse-ld=$(LLD) -Wl,-znoseparate-code -Wl,--build-id=sha1 \ @@ -265,7 +284,7 @@ $(OUTPUT)/liburandom_read.so: urandom_read_lib1.c urandom_read_lib2.c liburandom $(OUTPUT)/urandom_read: urandom_read.c urandom_read_aux.c $(OUTPUT)/liburandom_read.so $(call msg,BINARY,,$@) $(Q)$(CLANG) $(CLANG_TARGET_ARCH) \ - $(filter-out -static,$(CFLAGS) $(LDFLAGS)) $(filter %.c,$^) \ + $(filter-out -static,$(COMMON_CFLAGS) $(LDFLAGS)) $(filter %.c,$^) \ -Wno-unused-command-line-argument \ -lurandom_read $(filter-out -static,$(LDLIBS)) -L$(OUTPUT) \ -fuse-ld=$(LLD) -Wl,-znoseparate-code -Wl,--build-id=sha1 \ @@ -284,13 +303,15 @@ $(OUTPUT)/sign-file: ../../../../scripts/sign-file.c # subst() turns the rule into a pattern matching rule $(addprefix test_kmods/,$(subst .ko,%ko,$(TEST_KMODS))): $(VMLINUX_BTF) $(RESOLVE_BTFIDS) $(wildcard test_kmods/Makefile test_kmods/*.[ch]) $(Q)$(RM) test_kmods/*.ko test_kmods/*.mod.o # force re-compilation - $(Q)$(MAKE) $(submake_extras) -C test_kmods \ - RESOLVE_BTFIDS=$(RESOLVE_BTFIDS) \ + $(Q)$(MAKE) $(submake_extras) -C test_kmods \ + $(if $(O),O=$(abspath $(O))) \ + $(if 
$(KBUILD_OUTPUT),KBUILD_OUTPUT=$(abspath $(KBUILD_OUTPUT)))\ + RESOLVE_BTFIDS=$(RESOLVE_BTFIDS) \ EXTRA_CFLAGS='' EXTRA_LDFLAGS='' $(TEST_KMOD_TARGETS): $(addprefix test_kmods/,$(TEST_KMODS)) $(call msg,MOD,,$@) - $(Q)cp test_kmods/$(@F) $@ + $(Q)$(if $(PERMISSIVE),if [ -f test_kmods/$(@F) ]; then )cp test_kmods/$(@F) $@$(if $(PERMISSIVE),; fi) DEFAULT_BPFTOOL := $(HOST_SCRATCH_DIR)/sbin/bpftool @@ -320,7 +341,6 @@ $(OUTPUT)/test_tcpnotify_user: $(CGROUP_HELPERS) $(TESTING_HELPERS) $(TRACE_HELP $(OUTPUT)/test_sock_fields: $(CGROUP_HELPERS) $(TESTING_HELPERS) $(OUTPUT)/test_tag: $(TESTING_HELPERS) $(OUTPUT)/test_lirc_mode2_user: $(TESTING_HELPERS) -$(OUTPUT)/xdping: $(TESTING_HELPERS) $(OUTPUT)/flow_dissector_load: $(TESTING_HELPERS) $(OUTPUT)/test_maps: $(TESTING_HELPERS) $(OUTPUT)/test_verifier: $(TESTING_HELPERS) $(CAP_HELPERS) $(UNPRIV_HELPERS) @@ -446,6 +466,7 @@ endif CLANG_SYS_INCLUDES = $(call get_sys_includes,$(CLANG),$(CLANG_TARGET_ARCH)) BPF_CFLAGS = -g -Wall -Werror -D__TARGET_ARCH_$(SRCARCH) $(MENDIAN) \ -I$(INCLUDE_DIR) -I$(CURDIR) -I$(APIDIR) \ + -I$(CURDIR)/libarena/include \ -I$(abspath $(OUTPUT)/../usr/include) \ -std=gnu11 \ -fno-strict-aliasing \ @@ -471,22 +492,26 @@ $(OUTPUT)/cgroup_getset_retval_hooks.o: cgroup_getset_retval_hooks.h # $4 - binary name define CLANG_BPF_BUILD_RULE $(call msg,CLNG-BPF,$4,$2) - $(Q)$(CLANG) $3 -O2 $(BPF_TARGET_ENDIAN) -c $1 -mcpu=v3 -o $2 + $(Q)$(CLANG) $3 -O2 $(BPF_TARGET_ENDIAN) -c $1 -mcpu=v3 -o $2 $(if $(PERMISSIVE),|| \ + ($(RM) $2; printf ' %-12s %s\n' 'SKIP-BPF' '$(notdir $2)' 1>&2)) endef # Similar to CLANG_BPF_BUILD_RULE, but with disabled alu32 define CLANG_NOALU32_BPF_BUILD_RULE $(call msg,CLNG-BPF,$4,$2) - $(Q)$(CLANG) $3 -O2 $(BPF_TARGET_ENDIAN) -c $1 -mcpu=v2 -o $2 + $(Q)$(CLANG) $3 -O2 $(BPF_TARGET_ENDIAN) -c $1 -mcpu=v2 -o $2 $(if $(PERMISSIVE),|| \ + ($(RM) $2; printf ' %-12s %s\n' 'SKIP-BPF' '$(notdir $2)' 1>&2)) endef # Similar to CLANG_BPF_BUILD_RULE, but with cpu-v4 define 
CLANG_CPUV4_BPF_BUILD_RULE $(call msg,CLNG-BPF,$4,$2) - $(Q)$(CLANG) $3 -O2 $(BPF_TARGET_ENDIAN) -c $1 -mcpu=v4 -o $2 + $(Q)$(CLANG) $3 -O2 $(BPF_TARGET_ENDIAN) -c $1 -mcpu=v4 -o $2 $(if $(PERMISSIVE),|| \ + ($(RM) $2; printf ' %-12s %s\n' 'SKIP-BPF' '$(notdir $2)' 1>&2)) endef # Build BPF object using GCC define GCC_BPF_BUILD_RULE $(call msg,GCC-BPF,$4,$2) - $(Q)$(BPF_GCC) $3 -DBPF_NO_PRESERVE_ACCESS_INDEX -Wno-attributes -O2 -c $1 -o $2 + $(Q)$(BPF_GCC) $3 -DBPF_NO_PRESERVE_ACCESS_INDEX -Wno-attributes -O2 -c $1 -o $2 $(if $(PERMISSIVE),|| \ + ($(RM) $2; printf ' %-12s %s\n' 'SKIP-BPF' '$(notdir $2)' 1>&2)) endef SKEL_BLACKLIST := btf__% test_pinning_invalid.c test_sk_assign.c @@ -494,7 +519,10 @@ SKEL_BLACKLIST := btf__% test_pinning_invalid.c test_sk_assign.c LINKED_SKELS := test_static_linked.skel.h linked_funcs.skel.h \ linked_vars.skel.h linked_maps.skel.h \ test_subskeleton.skel.h test_subskeleton_lib.skel.h \ - test_usdt.skel.h + test_usdt.skel.h tracing_multi.skel.h \ + tracing_multi_module.skel.h \ + tracing_multi_intersect.skel.h \ + tracing_multi_session.skel.h LSKELS := fexit_sleep.c trace_printk.c trace_vprintk.c map_ptr_kern.c \ core_kern.c core_kern_overflow.c test_ringbuf.c \ @@ -520,11 +548,16 @@ test_usdt.skel.h-deps := test_usdt.bpf.o test_usdt_multispec.bpf.o xsk_xdp_progs.skel.h-deps := xsk_xdp_progs.bpf.o xdp_hw_metadata.skel.h-deps := xdp_hw_metadata.bpf.o xdp_features.skel.h-deps := xdp_features.bpf.o +tracing_multi.skel.h-deps := tracing_multi_attach.bpf.o tracing_multi_check.bpf.o +tracing_multi_module.skel.h-deps := tracing_multi_attach_module.bpf.o tracing_multi_check.bpf.o +tracing_multi_intersect.skel.h-deps := tracing_multi_intersect_attach.bpf.o tracing_multi_check.bpf.o +tracing_multi_session.skel.h-deps := tracing_multi_session_attach.bpf.o tracing_multi_check.bpf.o LINKED_BPF_OBJS := $(foreach skel,$(LINKED_SKELS),$($(skel)-deps)) LINKED_BPF_SRCS := $(patsubst %.bpf.o,%.c,$(LINKED_BPF_OBJS)) HEADERS_FOR_BPF_OBJS := $(wildcard 
$(BPFDIR)/*.bpf.h) \ + $(wildcard $(CURDIR)/libarena/include/*.[ch]) \ $(addprefix $(BPFDIR)/, bpf_core_read.h \ bpf_endian.h \ bpf_helpers.h \ @@ -569,6 +602,12 @@ endef # $2 - test runner extra "flavor" (e.g., no_alu32, cpuv4, bpf_gcc, etc) define DEFINE_TEST_RUNNER_RULES +# Permissive build behaviour (skip-on-failure compile, partial-link) only +# applies to test_progs and its flavors; runners that use strong cross-object +# references (e.g. test_maps) keep strict semantics even when permissive. +# The check is inlined per-runner so $1 is substituted at $(call) time and +# the result is baked into each rule's recipe. + ifeq ($($(TRUNNER_OUTPUT)-dir),) $(TRUNNER_OUTPUT)-dir := y $(TRUNNER_OUTPUT): @@ -592,47 +631,81 @@ $(TRUNNER_BPF_OBJS): $(TRUNNER_OUTPUT)/%.bpf.o: \ $$($$<-$2-CFLAGS),$(TRUNNER_BINARY)) $(TRUNNER_BPF_SKELS): %.skel.h: %.bpf.o $(BPFTOOL) | $(TRUNNER_OUTPUT) - $$(call msg,GEN-SKEL,$(TRUNNER_BINARY),$$@) - $(Q)$$(BPFTOOL) gen object $$(<:.o=.linked1.o) $$< - $(Q)$$(BPFTOOL) gen object $$(<:.o=.linked2.o) $$(<:.o=.linked1.o) - $(Q)$$(BPFTOOL) gen object $$(<:.o=.linked3.o) $$(<:.o=.linked2.o) - $(Q)diff $$(<:.o=.linked2.o) $$(<:.o=.linked3.o) - $(Q)$$(BPFTOOL) gen skeleton $$(<:.o=.linked3.o) name $$(notdir $$(<:.bpf.o=)) > $$@ - $(Q)$$(BPFTOOL) gen subskeleton $$(<:.o=.linked3.o) name $$(notdir $$(<:.bpf.o=)) > $$(@:.skel.h=.subskel.h) - $(Q)rm -f $$(<:.o=.linked1.o) $$(<:.o=.linked2.o) $$(<:.o=.linked3.o) + $(Q)$(if $(PERMISSIVE),if [ ! 
-f $$< ]; then \ + $$(RM) $$@ $$(@:.skel.h=.subskel.h); \ + printf ' %-12s %s\n' 'SKIP-SKEL' '$$(notdir $$@)' 1>&2; \ + exit 0; \ + fi;) \ + printf ' %-12s %s\n' 'GEN-SKEL' '[$(TRUNNER_BINARY)] $$(notdir $$@)' 1>&2; \ + $$(BPFTOOL) gen object $$(<:.o=.linked1.o) $$< && \ + $$(BPFTOOL) gen object $$(<:.o=.linked2.o) $$(<:.o=.linked1.o) && \ + $$(BPFTOOL) gen object $$(<:.o=.linked3.o) $$(<:.o=.linked2.o) && \ + diff $$(<:.o=.linked2.o) $$(<:.o=.linked3.o) && \ + $$(BPFTOOL) gen skeleton $$(<:.o=.linked3.o) name $$(notdir $$(<:.bpf.o=)) > $$@ && \ + $$(BPFTOOL) gen subskeleton $$(<:.o=.linked3.o) name $$(notdir $$(<:.bpf.o=)) > $$(@:.skel.h=.subskel.h) $(if $(PERMISSIVE),|| { \ + $$(RM) $$@ $$(@:.skel.h=.subskel.h); \ + printf ' %-12s %s\n' 'SKIP-SKEL' '$$(notdir $$@)' 1>&2; \ + }) && \ + rm -f $$(<:.o=.linked1.o) $$(<:.o=.linked2.o) $$(<:.o=.linked3.o) $(TRUNNER_BPF_LSKELS): %.lskel.h: %.bpf.o $(BPFTOOL) | $(TRUNNER_OUTPUT) - $$(call msg,GEN-SKEL,$(TRUNNER_BINARY),$$@) - $(Q)$$(BPFTOOL) gen object $$(<:.o=.llinked1.o) $$< - $(Q)$$(BPFTOOL) gen object $$(<:.o=.llinked2.o) $$(<:.o=.llinked1.o) - $(Q)$$(BPFTOOL) gen object $$(<:.o=.llinked3.o) $$(<:.o=.llinked2.o) - $(Q)diff $$(<:.o=.llinked2.o) $$(<:.o=.llinked3.o) - $(Q)$$(BPFTOOL) gen skeleton -L $$(<:.o=.llinked3.o) name $$(notdir $$(<:.bpf.o=_lskel)) > $$@ - $(Q)rm -f $$(<:.o=.llinked1.o) $$(<:.o=.llinked2.o) $$(<:.o=.llinked3.o) + $(Q)$(if $(PERMISSIVE),if [ ! 
-f $$< ]; then \ + $$(RM) $$@; \ + printf ' %-12s %s\n' 'SKIP-SKEL' '$$(notdir $$@)' 1>&2; \ + exit 0; \ + fi;) \ + printf ' %-12s %s\n' 'GEN-SKEL' '[$(TRUNNER_BINARY)] $$(notdir $$@)' 1>&2; \ + $$(BPFTOOL) gen object $$(<:.o=.llinked1.o) $$< && \ + $$(BPFTOOL) gen object $$(<:.o=.llinked2.o) $$(<:.o=.llinked1.o) && \ + $$(BPFTOOL) gen object $$(<:.o=.llinked3.o) $$(<:.o=.llinked2.o) && \ + diff $$(<:.o=.llinked2.o) $$(<:.o=.llinked3.o) && \ + $$(BPFTOOL) gen skeleton -L $$(<:.o=.llinked3.o) name $$(notdir $$(<:.bpf.o=_lskel)) > $$@ $(if $(PERMISSIVE),|| { \ + $$(RM) $$@; \ + printf ' %-12s %s\n' 'SKIP-SKEL' '$$(notdir $$@)' 1>&2; \ + }) && \ + rm -f $$(<:.o=.llinked1.o) $$(<:.o=.llinked2.o) $$(<:.o=.llinked3.o) $(TRUNNER_BPF_LSKELS_SIGNED): %.lskel.h: %.bpf.o $(BPFTOOL) | $(TRUNNER_OUTPUT) - $$(call msg,GEN-SKEL,$(TRUNNER_BINARY) (signed),$$@) - $(Q)$$(BPFTOOL) gen object $$(<:.o=.llinked1.o) $$< - $(Q)$$(BPFTOOL) gen object $$(<:.o=.llinked2.o) $$(<:.o=.llinked1.o) - $(Q)$$(BPFTOOL) gen object $$(<:.o=.llinked3.o) $$(<:.o=.llinked2.o) - $(Q)diff $$(<:.o=.llinked2.o) $$(<:.o=.llinked3.o) - $(Q)$$(BPFTOOL) gen skeleton $(LSKEL_SIGN) $$(<:.o=.llinked3.o) name $$(notdir $$(<:.bpf.o=_lskel)) > $$@ - $(Q)rm -f $$(<:.o=.llinked1.o) $$(<:.o=.llinked2.o) $$(<:.o=.llinked3.o) + $(Q)$(if $(PERMISSIVE),if [ ! 
-f $$< ]; then \ + $$(RM) $$@; \ + printf ' %-12s %s\n' 'SKIP-SKEL' '$$(notdir $$@)' 1>&2; \ + exit 0; \ + fi;) \ + printf ' %-12s %s\n' 'GEN-SKEL' '[$(TRUNNER_BINARY) (signed)] $$(notdir $$@)' 1>&2; \ + $$(BPFTOOL) gen object $$(<:.o=.llinked1.o) $$< && \ + $$(BPFTOOL) gen object $$(<:.o=.llinked2.o) $$(<:.o=.llinked1.o) && \ + $$(BPFTOOL) gen object $$(<:.o=.llinked3.o) $$(<:.o=.llinked2.o) && \ + diff $$(<:.o=.llinked2.o) $$(<:.o=.llinked3.o) && \ + $$(BPFTOOL) gen skeleton $(LSKEL_SIGN) $$(<:.o=.llinked3.o) name $$(notdir $$(<:.bpf.o=_lskel)) > $$@ $(if $(PERMISSIVE),|| { \ + $$(RM) $$@; \ + printf ' %-12s %s\n' 'SKIP-SKEL' '$$(notdir $$@)' 1>&2; \ + }) && \ + rm -f $$(<:.o=.llinked1.o) $$(<:.o=.llinked2.o) $$(<:.o=.llinked3.o) $(LINKED_BPF_OBJS): %: $(TRUNNER_OUTPUT)/% # .SECONDEXPANSION here allows to correctly expand %-deps variables as prerequisites .SECONDEXPANSION: $(TRUNNER_BPF_SKELS_LINKED): $(TRUNNER_OUTPUT)/%: $$$$(%-deps) $(BPFTOOL) | $(TRUNNER_OUTPUT) - $$(call msg,LINK-BPF,$(TRUNNER_BINARY),$$(@:.skel.h=.bpf.o)) - $(Q)$$(BPFTOOL) gen object $$(@:.skel.h=.linked1.o) $$(addprefix $(TRUNNER_OUTPUT)/,$$($$(@F)-deps)) - $(Q)$$(BPFTOOL) gen object $$(@:.skel.h=.linked2.o) $$(@:.skel.h=.linked1.o) - $(Q)$$(BPFTOOL) gen object $$(@:.skel.h=.linked3.o) $$(@:.skel.h=.linked2.o) - $(Q)diff $$(@:.skel.h=.linked2.o) $$(@:.skel.h=.linked3.o) - $$(call msg,GEN-SKEL,$(TRUNNER_BINARY),$$@) - $(Q)$$(BPFTOOL) gen skeleton $$(@:.skel.h=.linked3.o) name $$(notdir $$(@:.skel.h=)) > $$@ - $(Q)$$(BPFTOOL) gen subskeleton $$(@:.skel.h=.linked3.o) name $$(notdir $$(@:.skel.h=)) > $$(@:.skel.h=.subskel.h) - $(Q)rm -f $$(@:.skel.h=.linked1.o) $$(@:.skel.h=.linked2.o) $$(@:.skel.h=.linked3.o) + $(Q)$(if $(PERMISSIVE),for f in $$(addprefix $(TRUNNER_OUTPUT)/,$$($$(@F)-deps)); do \ + if [ ! 
-f $$$$f ]; then \ + $$(RM) $$@ $$(@:.skel.h=.subskel.h); \ + printf ' %-12s %s\n' 'SKIP-SKEL' '$$(notdir $$@)' 1>&2; \ + exit 0; \ + fi; \ + done;) \ + printf ' %-12s %s\n' 'LINK-BPF' '[$(TRUNNER_BINARY)] $$(notdir $$(@:.skel.h=.bpf.o))' 1>&2; \ + $$(BPFTOOL) gen object $$(@:.skel.h=.linked1.o) $$(addprefix $(TRUNNER_OUTPUT)/,$$($$(@F)-deps)) && \ + $$(BPFTOOL) gen object $$(@:.skel.h=.linked2.o) $$(@:.skel.h=.linked1.o) && \ + $$(BPFTOOL) gen object $$(@:.skel.h=.linked3.o) $$(@:.skel.h=.linked2.o) && \ + diff $$(@:.skel.h=.linked2.o) $$(@:.skel.h=.linked3.o) && \ + printf ' %-12s %s\n' 'GEN-SKEL' '[$(TRUNNER_BINARY)] $$(notdir $$@)' 1>&2 && \ + $$(BPFTOOL) gen skeleton $$(@:.skel.h=.linked3.o) name $$(notdir $$(@:.skel.h=)) > $$@ && \ + $$(BPFTOOL) gen subskeleton $$(@:.skel.h=.linked3.o) name $$(notdir $$(@:.skel.h=)) > $$(@:.skel.h=.subskel.h) $(if $(PERMISSIVE),|| { \ + $$(RM) $$@ $$(@:.skel.h=.subskel.h); \ + printf ' %-12s %s\n' 'SKIP-SKEL' '$$(notdir $$@)' 1>&2; \ + }) && \ + rm -f $$(@:.skel.h=.linked1.o) $$(@:.skel.h=.linked2.o) $$(@:.skel.h=.linked3.o) # When the compiler generates a %.d file, only skel basenames (not # full paths) are specified as prerequisites for corresponding %.o @@ -664,22 +737,25 @@ $(TRUNNER_TEST_OBJS): $(TRUNNER_OUTPUT)/%.test.o: \ $(TRUNNER_TESTS_DIR)/%.c \ | $(TRUNNER_OUTPUT)/%.test.d $$(call msg,TEST-OBJ,$(TRUNNER_BINARY),$$@) - $(Q)cd $$(@D) && $$(CC) -I. $$(CFLAGS) -MMD -MT $$@ -c $(CURDIR)/$$< $$(LDLIBS) -o $$(@F) + $(Q)(cd $$(@D) && $$(CC) -I. 
$$(CFLAGS) -MMD -MT $$@ -c $(CURDIR)/$$< $$(LDLIBS) -o $$(@F)) $(if $(filter test_progs%,$1),$(if $(PERMISSIVE),|| \ + ($(RM) $$@; printf ' %-12s %s\n' 'SKIP-TEST' '$$(notdir $$@)' 1>&2))) $$(if $$(TEST_NEEDS_BTFIDS), \ - $$(call msg,BTFIDS,$(TRUNNER_BINARY),$$@) \ + $(Q)if [ -f $$@ ]; then \ + $(if $(filter 1,$(V)),true,printf ' %-8s%s %s\n' "BTFIDS" " [$(TRUNNER_BINARY)]" "$$(notdir $$@)"); \ $(RESOLVE_BTFIDS) --btf $(TRUNNER_OUTPUT)/btf_data.bpf.o $$@; \ - $(RESOLVE_BTFIDS) --patch_btfids $$@.BTF_ids $$@) + $(RESOLVE_BTFIDS) --patch_btfids $$@.BTF_ids $$@; \ + fi) $(TRUNNER_TEST_OBJS:.o=.d): $(TRUNNER_OUTPUT)/%.test.d: \ $(TRUNNER_TESTS_DIR)/%.c \ $(TRUNNER_EXTRA_HDRS) \ + $$(BPFOBJ) | $(TRUNNER_OUTPUT) \ $(TRUNNER_BPF_SKELS) \ $(TRUNNER_BPF_LSKELS) \ $(TRUNNER_BPF_LSKELS_SIGNED) \ - $(TRUNNER_BPF_SKELS_LINKED) \ - $$(BPFOBJ) | $(TRUNNER_OUTPUT) + $(TRUNNER_BPF_SKELS_LINKED) -ifeq ($(filter clean docs-clean,$(MAKECMDGOALS)),) +ifeq ($(filter clean docs-clean emit_tests,$(MAKECMDGOALS)),) include $(wildcard $(TRUNNER_TEST_OBJS:.o=.d)) endif @@ -705,20 +781,21 @@ $(TRUNNER_LIB_OBJS): $(TRUNNER_OUTPUT)/%.o:$(TOOLSDIR)/lib/%.c $(TRUNNER_BINARY)-extras: $(TRUNNER_EXTRA_FILES) | $(TRUNNER_OUTPUT) ifneq ($2:$(OUTPUT),:$(shell pwd)) $$(call msg,EXT-COPY,$(TRUNNER_BINARY),$(TRUNNER_EXTRA_FILES)) - $(Q)rsync -aq $$^ $(TRUNNER_OUTPUT)/ + $(Q)rsync -aq $(if $(PERMISSIVE),--ignore-missing-args) $$^ $(TRUNNER_OUTPUT)/ endif # some X.test.o files have runtime dependencies on Y.bpf.o files $(OUTPUT)/$(TRUNNER_BINARY): | $(TRUNNER_BPF_OBJS) -$(OUTPUT)/$(TRUNNER_BINARY): $(TRUNNER_TEST_OBJS) \ +$(OUTPUT)/$(TRUNNER_BINARY): $(if $(filter test_progs%,$1),$(if $(PERMISSIVE),$$(wildcard $(TRUNNER_TEST_OBJS)),$(TRUNNER_TEST_OBJS)),$(TRUNNER_TEST_OBJS)) \ $(TRUNNER_EXTRA_OBJS) $$(BPFOBJ) \ $(TRUNNER_LIB_OBJS) \ $(TRUNNER_BPFTOOL) \ $(OUTPUT)/veristat \ - | $(TRUNNER_BINARY)-extras + | $(TRUNNER_BINARY)-extras \ + $(if $(filter test_progs%,$1),$(if $(PERMISSIVE),$(TRUNNER_TEST_OBJS))) 
$$(call msg,BINARY,,$$@) - $(Q)$$(CC) $$(CFLAGS) $$(filter %.a %.o,$$^) $$(LDLIBS) $$(LLVM_LDLIBS) $$(LDFLAGS) $$(LLVM_LDFLAGS) -o $$@ + $(Q)$$(CC) $$(CFLAGS) $(if $(filter test_progs%,$1),$(if $(PERMISSIVE),$$(filter %.a %.o,$$(wildcard $(TRUNNER_TEST_OBJS)) $$(filter-out $(TRUNNER_TEST_OBJS),$$^)),$$(filter %.a %.o,$$^)),$$(filter %.a %.o,$$^)) $$(LDLIBS) $$(LLVM_LDLIBS) $$(LDFLAGS) $$(LLVM_LDFLAGS) -o $$@ $(Q)ln -sf $(if $2,..,.)/tools/build/bpftool/$(USE_BOOTSTRAP)bpftool \ $(OUTPUT)/$(if $2,$2/)bpftool @@ -740,6 +817,37 @@ $(VERIFY_SIG_HDR): $(VERIFICATION_CERT) echo "};"; \ echo "unsigned int test_progs_verification_cert_len = $$(wc -c < $<);") > $@ +LIBARENA_MAKE_ARGS = \ + BPFTOOL="$(BPFTOOL)" \ + INCLUDE_DIR="$(INCLUDE_DIR)" \ + LIBBPF_INCLUDE="$(HOST_INCLUDE_DIR)" \ + BPFOBJ="$(BPFOBJ)" \ + LDLIBS="$(LDLIBS) -lzstd" \ + CLANG="$(CLANG)" \ + BPF_CFLAGS="$(BPF_CFLAGS) $(CLANG_CFLAGS)" \ + BPF_TARGET_ENDIAN="$(BPF_TARGET_ENDIAN)" \ + Q="$(Q)" + +LIBARENA_BPF_DEPS := $(wildcard libarena/Makefile \ + libarena/include/* \ + libarena/include/libarena/* \ + libarena/src/* \ + libarena/selftests/* \ + libarena/*.bpf.o) + +LIBARENA_SKEL := libarena/libarena.skel.h + +$(LIBARENA_SKEL): $(INCLUDE_DIR)/vmlinux.h $(BPFOBJ) $(LIBARENA_BPF_DEPS) + +$(MAKE) -C libarena libarena.skel.h $(LIBARENA_MAKE_ARGS) + +ifneq ($(CLANG_HAS_ARENA_ASAN),) +LIBARENA_ASAN_SKEL := libarena/libarena_asan.skel.h +CFLAGS += -DHAS_BPF_ARENA_ASAN + +$(LIBARENA_ASAN_SKEL): $(INCLUDE_DIR)/vmlinux.h $(BPFOBJ) $(LIBARENA_BPF_DEPS) + +$(MAKE) -C libarena libarena_asan.skel.h $(LIBARENA_MAKE_ARGS) +endif + # Define test_progs test runner. 
TRUNNER_TESTS_DIR := prog_tests TRUNNER_BPF_PROGS_DIR := progs @@ -764,7 +872,9 @@ TRUNNER_EXTRA_SOURCES := test_progs.c \ flow_dissector_load.h \ ip_check_defrag_frags.h \ bpftool_helpers.c \ - usdt_1.c usdt_2.c + usdt_1.c usdt_2.c \ + $(LIBARENA_SKEL) \ + $(LIBARENA_ASAN_SKEL) TRUNNER_LIB_SOURCES := find_bit.c TRUNNER_EXTRA_FILES := $(OUTPUT)/urandom_read \ $(OUTPUT)/liburandom_read.so \ @@ -849,7 +959,8 @@ $(OUTPUT)/test_cpp: test_cpp.cpp $(OUTPUT)/test_core_extern.skel.h $(BPFOBJ) # Benchmark runner $(OUTPUT)/bench_%.o: benchs/bench_%.c bench.h $(BPFOBJ) $(call msg,CC,,$@) - $(Q)$(CC) $(CFLAGS) -O2 -c $(filter %.c,$^) $(LDLIBS) -o $@ + $(Q)$(CC) $(CFLAGS) -O2 -c $(filter %.c,$^) $(LDLIBS) -o $@ $(if $(PERMISSIVE),|| \ + ($(RM) $@; printf ' %-12s %s\n' 'SKIP-BENCH' '$(notdir $@)' 1>&2)) $(OUTPUT)/bench_rename.o: $(OUTPUT)/test_overhead.skel.h $(OUTPUT)/bench_trigger.o: $(OUTPUT)/trigger_bench.skel.h $(OUTPUT)/bench_ringbufs.o: $(OUTPUT)/ringbuf_bench.skel.h \ @@ -866,6 +977,9 @@ $(OUTPUT)/bench_htab_mem.o: $(OUTPUT)/htab_mem_bench.skel.h $(OUTPUT)/bench_bpf_crypto.o: $(OUTPUT)/crypto_bench.skel.h $(OUTPUT)/bench_sockmap.o: $(OUTPUT)/bench_sockmap_prog.skel.h $(OUTPUT)/bench_lpm_trie_map.o: $(OUTPUT)/lpm_trie_bench.skel.h $(OUTPUT)/lpm_trie_map.skel.h +$(OUTPUT)/bench_bpf_nop.o: $(OUTPUT)/bpf_nop_bench.skel.h bench_bpf_timing.h +$(OUTPUT)/bench_xdp_lb.o: $(OUTPUT)/xdp_lb_bench.skel.h bench_bpf_timing.h +$(OUTPUT)/bench_bpf_timing.o: bench_bpf_timing.h $(OUTPUT)/bench.o: bench.h testing_helpers.h $(BPFOBJ) $(OUTPUT)/bench: LDLIBS += -lm $(OUTPUT)/bench: $(OUTPUT)/bench.o \ @@ -888,11 +1002,15 @@ $(OUTPUT)/bench: $(OUTPUT)/bench.o \ $(OUTPUT)/bench_bpf_crypto.o \ $(OUTPUT)/bench_sockmap.o \ $(OUTPUT)/bench_lpm_trie_map.o \ + $(OUTPUT)/bench_bpf_timing.o \ + $(OUTPUT)/bench_bpf_nop.o \ + $(OUTPUT)/bench_xdp_lb.o \ $(OUTPUT)/usdt_1.o \ $(OUTPUT)/usdt_2.o \ # $(call msg,BINARY,,$@) - $(Q)$(CC) $(CFLAGS) $(LDFLAGS) $(filter %.a %.o,$^) $(LDLIBS) -o $@ + $(Q)$(CC) 
$(CFLAGS) $(LDFLAGS) $(filter %.a %.o,$^) $(LDLIBS) -o $@ $(if $(PERMISSIVE),|| \ + ($(RM) $@; printf ' %-12s %s\n' 'SKIP-LINK' '$(notdir $@) (some benchmarks may have been skipped)' 1>&2)) # This works around GCC warning about snprintf truncating strings like: # @@ -925,11 +1043,28 @@ EXTRA_CLEAN := $(SCRATCH_DIR) $(HOST_SCRATCH_DIR) \ # Delete partially updated (corrupted) files on error .DELETE_ON_ERROR: +# When permissive, tell rsync to ignore missing source arguments so that +# partial builds do not abort installation. +ifneq ($(PERMISSIVE),) +override define INSTALL_SINGLE_RULE + $(if $(INSTALL_LIST),@mkdir -p $(INSTALL_PATH)) + $(if $(INSTALL_LIST),rsync -a --copy-unsafe-links --ignore-missing-args $(INSTALL_LIST) $(INSTALL_PATH)/) +endef +endif + DEFAULT_INSTALL_RULE := $(INSTALL_RULE) override define INSTALL_RULE $(DEFAULT_INSTALL_RULE) - @for DIR in $(TEST_INST_SUBDIRS); do \ - mkdir -p $(INSTALL_PATH)/$$DIR; \ - rsync -a $(OUTPUT)/$$DIR/*.bpf.o $(INSTALL_PATH)/$$DIR;\ + @for DIR in $(TEST_INST_SUBDIRS); do \ + mkdir -p $(INSTALL_PATH)/$$DIR; \ + rsync -a $(if $(PERMISSIVE),--ignore-missing-args) \ + $(OUTPUT)/$$DIR/*.bpf.o \ + $(INSTALL_PATH)/$$DIR; \ done endef + +libarena: $(LIBARENA_SKEL) + +ifneq ($(CLANG_HAS_ARENA_ASAN),) +libarena_asan: $(LIBARENA_ASAN_SKEL) +endif diff --git a/tools/testing/selftests/bpf/README.rst b/tools/testing/selftests/bpf/README.rst index 776fbe3cb8f9..37164322a102 100644 --- a/tools/testing/selftests/bpf/README.rst +++ b/tools/testing/selftests/bpf/README.rst @@ -77,7 +77,7 @@ In case of linker errors when running selftests, try using static linking: .. code-block:: console - $ LDLIBS=-static PKG_CONFIG='pkg-config --static' vmtest.sh + $ LDLIBS=-static EXTRA_LDFLAGS=-static PKG_CONFIG='pkg-config --static' vmtest.sh .. note:: Some distros may not support static linking. 
diff --git a/tools/testing/selftests/bpf/bench.c b/tools/testing/selftests/bpf/bench.c index 029b3e21f438..3d9d2cd7764b 100644 --- a/tools/testing/selftests/bpf/bench.c +++ b/tools/testing/selftests/bpf/bench.c @@ -286,6 +286,7 @@ extern struct argp bench_trigger_batch_argp; extern struct argp bench_crypto_argp; extern struct argp bench_sockmap_argp; extern struct argp bench_lpm_trie_map_argp; +extern struct argp bench_xdp_lb_argp; static const struct argp_child bench_parsers[] = { { &bench_ringbufs_argp, 0, "Ring buffers benchmark", 0 }, @@ -302,6 +303,7 @@ static const struct argp_child bench_parsers[] = { { &bench_crypto_argp, 0, "bpf crypto benchmark", 0 }, { &bench_sockmap_argp, 0, "bpf sockmap benchmark", 0 }, { &bench_lpm_trie_map_argp, 0, "LPM trie map benchmark", 0 }, + { &bench_xdp_lb_argp, 0, "XDP load-balancer benchmark", 0 }, {}, }; @@ -558,13 +560,16 @@ extern const struct bench bench_bpf_loop; extern const struct bench bench_strncmp_no_helper; extern const struct bench bench_strncmp_helper; extern const struct bench bench_bpf_hashmap_full_update; +extern const struct bench bench_bpf_rhashmap_full_update; extern const struct bench bench_local_storage_cache_seq_get; extern const struct bench bench_local_storage_cache_interleaved_get; extern const struct bench bench_local_storage_cache_hashmap_control; extern const struct bench bench_local_storage_tasks_trace; extern const struct bench bench_bpf_hashmap_lookup; +extern const struct bench bench_bpf_rhashmap_lookup; extern const struct bench bench_local_storage_create; extern const struct bench bench_htab_mem; +extern const struct bench bench_rhtab_mem; extern const struct bench bench_crypto_encrypt; extern const struct bench bench_crypto_decrypt; extern const struct bench bench_sockmap; @@ -575,6 +580,8 @@ extern const struct bench bench_lpm_trie_insert; extern const struct bench bench_lpm_trie_update; extern const struct bench bench_lpm_trie_delete; extern const struct bench bench_lpm_trie_free; +extern 
const struct bench bench_bpf_nop; +extern const struct bench bench_xdp_lb; static const struct bench *benchs[] = { &bench_count_global, @@ -636,13 +643,16 @@ static const struct bench *benchs[] = { &bench_strncmp_no_helper, &bench_strncmp_helper, &bench_bpf_hashmap_full_update, + &bench_bpf_rhashmap_full_update, &bench_local_storage_cache_seq_get, &bench_local_storage_cache_interleaved_get, &bench_local_storage_cache_hashmap_control, &bench_local_storage_tasks_trace, &bench_bpf_hashmap_lookup, + &bench_bpf_rhashmap_lookup, &bench_local_storage_create, &bench_htab_mem, + &bench_rhtab_mem, &bench_crypto_encrypt, &bench_crypto_decrypt, &bench_sockmap, @@ -653,6 +663,8 @@ static const struct bench *benchs[] = { &bench_lpm_trie_update, &bench_lpm_trie_delete, &bench_lpm_trie_free, + &bench_bpf_nop, + &bench_xdp_lb, }; static void find_benchmark(void) @@ -741,6 +753,13 @@ static void setup_benchmark(void) static pthread_mutex_t bench_done_mtx = PTHREAD_MUTEX_INITIALIZER; static pthread_cond_t bench_done = PTHREAD_COND_INITIALIZER; +void bench_force_done(void) +{ + pthread_mutex_lock(&bench_done_mtx); + pthread_cond_signal(&bench_done); + pthread_mutex_unlock(&bench_done_mtx); +} + static void collect_measurements(long delta_ns) { int iter = state.res_cnt++; struct bench_res *res = &state.results[iter]; @@ -750,11 +769,8 @@ static void collect_measurements(long delta_ns) { if (bench->report_progress) bench->report_progress(iter, res, delta_ns); - if (iter == env.duration_sec + env.warmup_sec) { - pthread_mutex_lock(&bench_done_mtx); - pthread_cond_signal(&bench_done); - pthread_mutex_unlock(&bench_done_mtx); - } + if (iter == env.duration_sec + env.warmup_sec) + bench_force_done(); } int main(int argc, char **argv) diff --git a/tools/testing/selftests/bpf/bench.h b/tools/testing/selftests/bpf/bench.h index 7cf21936e7ed..89a3fc72f70e 100644 --- a/tools/testing/selftests/bpf/bench.h +++ b/tools/testing/selftests/bpf/bench.h @@ -70,6 +70,7 @@ extern struct env env; extern 
const struct bench *bench; void setup_libbpf(void); +void bench_force_done(void); void hits_drops_report_progress(int iter, struct bench_res *res, long delta_ns); void hits_drops_report_final(struct bench_res res[], int res_cnt); void false_hits_report_progress(int iter, struct bench_res *res, long delta_ns); diff --git a/tools/testing/selftests/bpf/bench_bpf_timing.h b/tools/testing/selftests/bpf/bench_bpf_timing.h new file mode 100644 index 000000000000..6ef23b6d6639 --- /dev/null +++ b/tools/testing/selftests/bpf/bench_bpf_timing.h @@ -0,0 +1,50 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* Copyright (c) 2026 Meta Platforms, Inc. and affiliates. */ + +#ifndef __BENCH_BPF_TIMING_H__ +#define __BENCH_BPF_TIMING_H__ + +#include <stdbool.h> +#include <linux/types.h> +#include "bench.h" + +#ifndef BENCH_NR_SAMPLES +#define BENCH_NR_SAMPLES 4096 +#endif +#ifndef BENCH_NR_CPUS +#define BENCH_NR_CPUS 256 +#endif + +typedef void (*bpf_bench_run_fn)(void *ctx); + +struct bpf_bench_timing { + __u64 (*samples)[BENCH_NR_SAMPLES]; /* skel->bss->timing_samples */ + __u32 *idx; /* skel->bss->timing_idx */ + volatile __u32 *timing_enabled; /* &skel->bss->timing_enabled */ + volatile __u32 *batch_iters_bss; /* &skel->bss->batch_iters */ + __u32 batch_iters; + __u32 target_samples; + __u32 nr_cpus; + int warmup_ticks; + bool done; + bool machine_readable; +}; + +#define BENCH_TIMING_INIT(t, skel, iters) do { \ + (t)->samples = (skel)->bss->timing_samples; \ + (t)->idx = (skel)->bss->timing_idx; \ + (t)->timing_enabled = &(skel)->bss->timing_enabled; \ + (t)->batch_iters_bss = &(skel)->bss->batch_iters; \ + (t)->batch_iters = (iters); \ + (t)->target_samples = 200; \ + (t)->nr_cpus = env.nr_cpus; \ + (t)->warmup_ticks = 0; \ + (t)->done = false; \ + (t)->machine_readable = false; \ +} while (0) + +void bpf_bench_timing_measure(struct bpf_bench_timing *t, struct bench_res *res); +void bpf_bench_timing_report(struct bpf_bench_timing *t, const char *name, const char *desc); +void 
bpf_bench_calibrate(struct bpf_bench_timing *t, bpf_bench_run_fn run_fn, void *ctx); + +#endif /* __BENCH_BPF_TIMING_H__ */ diff --git a/tools/testing/selftests/bpf/benchs/bench_bpf_hashmap_full_update.c b/tools/testing/selftests/bpf/benchs/bench_bpf_hashmap_full_update.c index ee1dc12c5e5e..7278fa860397 100644 --- a/tools/testing/selftests/bpf/benchs/bench_bpf_hashmap_full_update.c +++ b/tools/testing/selftests/bpf/benchs/bench_bpf_hashmap_full_update.c @@ -34,19 +34,29 @@ static void measure(struct bench_res *res) { } -static void setup(void) +static void hashmap_full_update_setup(enum bpf_map_type map_type) { struct bpf_link *link; int map_fd, i, max_entries; setup_libbpf(); - ctx.skel = bpf_hashmap_full_update_bench__open_and_load(); + ctx.skel = bpf_hashmap_full_update_bench__open(); if (!ctx.skel) { fprintf(stderr, "failed to open skeleton\n"); exit(1); } + bpf_map__set_type(ctx.skel->maps.hash_map_bench, map_type); + if (map_type == BPF_MAP_TYPE_RHASH) + bpf_map__set_map_flags(ctx.skel->maps.hash_map_bench, + BPF_F_NO_PREALLOC); + + if (bpf_hashmap_full_update_bench__load(ctx.skel)) { + fprintf(stderr, "failed to load skeleton\n"); + exit(1); + } + ctx.skel->bss->nr_loops = MAX_LOOP_NUM; link = bpf_program__attach(ctx.skel->progs.benchmark); @@ -62,6 +72,16 @@ static void setup(void) bpf_map_update_elem(map_fd, &i, &i, BPF_ANY); } +static void setup(void) +{ + hashmap_full_update_setup(BPF_MAP_TYPE_HASH); +} + +static void rhash_setup(void) +{ + hashmap_full_update_setup(BPF_MAP_TYPE_RHASH); +} + static void hashmap_report_final(struct bench_res res[], int res_cnt) { unsigned int nr_cpus = bpf_num_possible_cpus(); @@ -87,3 +107,13 @@ const struct bench bench_bpf_hashmap_full_update = { .report_progress = NULL, .report_final = hashmap_report_final, }; + +const struct bench bench_bpf_rhashmap_full_update = { + .name = "bpf-rhashmap-full-update", + .validate = validate, + .setup = rhash_setup, + .producer_thread = producer, + .measure = measure, + 
.report_progress = NULL, + .report_final = hashmap_report_final, +}; diff --git a/tools/testing/selftests/bpf/benchs/bench_bpf_hashmap_lookup.c b/tools/testing/selftests/bpf/benchs/bench_bpf_hashmap_lookup.c index 279ff1b8b5b2..5264b7b20e39 100644 --- a/tools/testing/selftests/bpf/benchs/bench_bpf_hashmap_lookup.c +++ b/tools/testing/selftests/bpf/benchs/bench_bpf_hashmap_lookup.c @@ -148,9 +148,10 @@ static inline void patch_key(u32 i, u32 *key) /* the rest of key is random */ } -static void setup(void) +static void hashmap_lookup_setup(enum bpf_map_type map_type) { struct bpf_link *link; + __u32 map_flags; int map_fd; int ret; int i; @@ -163,10 +164,15 @@ static void setup(void) exit(1); } + map_flags = args.map_flags; + if (map_type == BPF_MAP_TYPE_RHASH) + map_flags |= BPF_F_NO_PREALLOC; + + bpf_map__set_type(ctx.skel->maps.hash_map_bench, map_type); bpf_map__set_max_entries(ctx.skel->maps.hash_map_bench, args.max_entries); bpf_map__set_key_size(ctx.skel->maps.hash_map_bench, args.key_size); bpf_map__set_value_size(ctx.skel->maps.hash_map_bench, 8); - bpf_map__set_map_flags(ctx.skel->maps.hash_map_bench, args.map_flags); + bpf_map__set_map_flags(ctx.skel->maps.hash_map_bench, map_flags); ctx.skel->bss->nr_entries = args.nr_entries; ctx.skel->bss->nr_loops = args.nr_loops / args.nr_entries; @@ -197,6 +203,16 @@ static void setup(void) } } +static void setup(void) +{ + hashmap_lookup_setup(BPF_MAP_TYPE_HASH); +} + +static void rhash_setup(void) +{ + hashmap_lookup_setup(BPF_MAP_TYPE_RHASH); +} + static inline double events_from_time(u64 time) { if (time) @@ -275,3 +291,14 @@ const struct bench bench_bpf_hashmap_lookup = { .report_progress = NULL, .report_final = hashmap_report_final, }; + +const struct bench bench_bpf_rhashmap_lookup = { + .name = "bpf-rhashmap-lookup", + .argp = &bench_hashmap_lookup_argp, + .validate = validate, + .setup = rhash_setup, + .producer_thread = producer, + .measure = measure, + .report_progress = NULL, + .report_final = 
hashmap_report_final, +}; diff --git a/tools/testing/selftests/bpf/benchs/bench_bpf_nop.c b/tools/testing/selftests/bpf/benchs/bench_bpf_nop.c new file mode 100644 index 000000000000..e2d8c2ccf384 --- /dev/null +++ b/tools/testing/selftests/bpf/benchs/bench_bpf_nop.c @@ -0,0 +1,84 @@ +// SPDX-License-Identifier: GPL-2.0 +/* Copyright (c) 2026 Meta Platforms, Inc. and affiliates. */ + +#include "bench.h" +#include "bench_bpf_timing.h" +#include "bpf_nop_bench.skel.h" +#include "bpf_util.h" + +static struct ctx { + struct bpf_nop_bench *skel; + struct bpf_bench_timing timing; + int prog_fd; +} ctx; + +static void nop_validate(void) +{ + if (env.consumer_cnt != 0) { + fprintf(stderr, "benchmark doesn't support consumers\n"); + exit(1); + } +} + +static void nop_run_once(void *unused __always_unused) +{ + LIBBPF_OPTS(bpf_test_run_opts, topts); + + bpf_prog_test_run_opts(ctx.prog_fd, &topts); +} + +static void nop_setup(void) +{ + struct bpf_nop_bench *skel; + int err; + + setup_libbpf(); + + skel = bpf_nop_bench__open(); + if (!skel) { + fprintf(stderr, "failed to open skeleton\n"); + exit(1); + } + + err = bpf_nop_bench__load(skel); + if (err) { + fprintf(stderr, "failed to load skeleton: %s\n", strerror(-err)); + bpf_nop_bench__destroy(skel); + exit(1); + } + + ctx.skel = skel; + ctx.prog_fd = bpf_program__fd(skel->progs.bench_nop); + + BENCH_TIMING_INIT(&ctx.timing, skel, 0); + bpf_bench_calibrate(&ctx.timing, nop_run_once, NULL); + + env.duration_sec = 600; +} + +static void *nop_producer(void *input) +{ + while (true) + nop_run_once(NULL); + + return NULL; +} + +static void nop_measure(struct bench_res *res) +{ + bpf_bench_timing_measure(&ctx.timing, res); +} + +static void nop_report_final(struct bench_res res[], int res_cnt) +{ + bpf_bench_timing_report(&ctx.timing, "bpf-nop", NULL); +} + +const struct bench bench_bpf_nop = { + .name = "bpf-nop", + .validate = nop_validate, + .setup = nop_setup, + .producer_thread = nop_producer, + .measure = nop_measure, + 
.report_final = nop_report_final, +}; diff --git a/tools/testing/selftests/bpf/benchs/bench_bpf_timing.c b/tools/testing/selftests/bpf/benchs/bench_bpf_timing.c new file mode 100644 index 000000000000..e02ad324f7bc --- /dev/null +++ b/tools/testing/selftests/bpf/benchs/bench_bpf_timing.c @@ -0,0 +1,298 @@ +// SPDX-License-Identifier: GPL-2.0 +/* Copyright (c) 2026 Meta Platforms, Inc. and affiliates. */ + +#include <stdio.h> +#include <stdlib.h> +#include <string.h> +#include <math.h> +#include "bench_bpf_timing.h" +#include "bpf_util.h" + +struct timing_stats { + double min, max; + double median, p99; + double mean, stddev; + int count; +}; + +static int cmp_double(const void *a, const void *b) +{ + double da = *(const double *)a; + double db = *(const double *)b; + + if (da < db) + return -1; + if (da > db) + return 1; + return 0; +} + +static double percentile(const double *sorted, int n, double pct) +{ + int idx = (int)(n * pct / 100.0); + + if (idx >= n) + idx = n - 1; + return sorted[idx]; +} + +static int collect_samples(struct bpf_bench_timing *t, + double *out, int max_out) +{ + unsigned int nr_cpus = bpf_num_possible_cpus(); + __u32 timed_iters = t->batch_iters; + int total = 0; + + if (nr_cpus > BENCH_NR_CPUS) + nr_cpus = BENCH_NR_CPUS; + + for (unsigned int cpu = 0; cpu < nr_cpus; cpu++) { + __u32 count = t->idx[cpu]; + + if (count > BENCH_NR_SAMPLES) + count = BENCH_NR_SAMPLES; + + for (__u32 i = 0; i < count && total < max_out; i++) { + __u64 sample = t->samples[cpu][i]; + + if (sample == 0) + continue; + out[total++] = (double)sample / timed_iters; + } + } + + qsort(out, total, sizeof(double), cmp_double); + return total; +} + +static int filter_outliers_iqr(double *sorted, int n) +{ + double q1, q3, iqr, lo, hi; + int start = 0, end = n; + + if (n < 8) + return n; + + q1 = sorted[n / 4]; + q3 = sorted[3 * n / 4]; + iqr = q3 - q1; + lo = q1 - 1.5 * iqr; + hi = q3 + 1.5 * iqr; + + while (start < end && sorted[start] < lo) + start++; + while (end > 
start && sorted[end - 1] > hi) + end--; + + if (start > 0) + memmove(sorted, sorted + start, (end - start) * sizeof(double)); + + return end - start; +} + +static void compute_stats(const double *sorted, int n, + struct timing_stats *s) +{ + double sum = 0, var_sum = 0; + + memset(s, 0, sizeof(*s)); + s->count = n; + + if (n == 0) + return; + + s->min = sorted[0]; + s->max = sorted[n - 1]; + s->median = sorted[n / 2]; + s->p99 = percentile(sorted, n, 99); + + for (int i = 0; i < n; i++) + sum += sorted[i]; + s->mean = sum / n; + + for (int i = 0; i < n; i++) { + double d = sorted[i] - s->mean; + + var_sum += d * d; + } + s->stddev = n > 1 ? sqrt(var_sum / (n - 1)) : 0; +} + +void bpf_bench_timing_measure(struct bpf_bench_timing *t, struct bench_res *res) +{ + unsigned int nr_cpus; + __u32 total_samples; + int i; + + t->warmup_ticks++; + + if (t->warmup_ticks < env.warmup_sec) + return; + + if (t->warmup_ticks == env.warmup_sec) { + *t->timing_enabled = 1; + return; + } + + nr_cpus = bpf_num_possible_cpus(); + if (nr_cpus > BENCH_NR_CPUS) + nr_cpus = BENCH_NR_CPUS; + + total_samples = 0; + for (i = 0; i < (int)nr_cpus; i++) { + __u32 cnt = t->idx[i]; + + if (cnt > BENCH_NR_SAMPLES) + cnt = BENCH_NR_SAMPLES; + total_samples += cnt; + } + + if (total_samples >= (__u32)env.producer_cnt * t->target_samples && !t->done) { + t->done = true; + *t->timing_enabled = 0; + bench_force_done(); + } +} + +void bpf_bench_timing_report(struct bpf_bench_timing *t, const char *name, const char *description) +{ + int max_out = BENCH_NR_CPUS * BENCH_NR_SAMPLES; + struct timing_stats s; + double *all; + int total; + + all = calloc(max_out, sizeof(*all)); + if (!all) { + fprintf(stderr, "failed to allocate timing buffer\n"); + return; + } + + total = collect_samples(t, all, max_out); + + if (total == 0) { + printf("No timing samples collected.\n"); + free(all); + return; + } + + total = filter_outliers_iqr(all, total); + compute_stats(all, total, &s); + + if (t->machine_readable) { + 
printf("RESULT scenario=%s samples=%d median=%.2f stddev=%.2f cv=%.2f min=%.2f " + "p99=%.2f max=%.2f\n", name, total, s.median, s.stddev, + s.mean > 0 ? s.stddev / s.mean * 100.0 : 0.0, s.min, s.p99, s.max); + } else { + printf("%s: median %.2f ns/op, stddev %.2f, p99 %.2f (%d samples)\n", name, + s.median, s.stddev, s.p99, total); + } + + free(all); +} + +#define CALIBRATE_SEED_BATCH 100 +#define CALIBRATE_MIN_BATCH 100 +#define CALIBRATE_MAX_BATCH 10000000 +#define CALIBRATE_TARGET_MS 10 +#define CALIBRATE_RUNS 5 +#define PROPORTIONALITY_TOL 0.05 /* 5% */ + +static void reset_timing(struct bpf_bench_timing *t) +{ + *t->timing_enabled = 0; + memset(t->samples, 0, sizeof(__u64) * BENCH_NR_CPUS * BENCH_NR_SAMPLES); + memset(t->idx, 0, sizeof(__u32) * BENCH_NR_CPUS); +} + +static __u64 measure_elapsed(struct bpf_bench_timing *t, bpf_bench_run_fn run_fn, void *run_ctx, + __u32 iters, int runs) +{ + __u64 buf[CALIBRATE_RUNS]; + int n = 0, i, j; + + reset_timing(t); + *t->batch_iters_bss = iters; + *t->timing_enabled = 1; + + for (i = 0; i < runs; i++) + run_fn(run_ctx); + + *t->timing_enabled = 0; + + for (i = 0; i < BENCH_NR_CPUS && n < runs; i++) { + __u32 cnt = t->idx[i]; + + for (j = 0; j < (int)cnt && n < runs; j++) + buf[n++] = t->samples[i][j]; + } + + if (n == 0) + return 0; + + for (i = 1; i < n; i++) { + __u64 key = buf[i]; + + j = i - 1; + while (j >= 0 && buf[j] > key) { + buf[j + 1] = buf[j]; + j--; + } + buf[j + 1] = key; + } + + return buf[n / 2]; +} + +static __u32 compute_batch_iters(__u64 per_op_ns) +{ + __u64 target_ns = (__u64)CALIBRATE_TARGET_MS * 1000000ULL; + __u32 iters; + + if (per_op_ns == 0) + return CALIBRATE_MIN_BATCH; + + iters = target_ns / per_op_ns; + + if (iters < CALIBRATE_MIN_BATCH) + iters = CALIBRATE_MIN_BATCH; + if (iters > CALIBRATE_MAX_BATCH) + iters = CALIBRATE_MAX_BATCH; + + return iters; +} + +void bpf_bench_calibrate(struct bpf_bench_timing *t, bpf_bench_run_fn run_fn, void *run_ctx) +{ + __u64 elapsed, per_op_ns; + __u64 
time_n, time_2n; + double ratio; + + elapsed = measure_elapsed(t, run_fn, run_ctx, CALIBRATE_SEED_BATCH, CALIBRATE_RUNS); + if (elapsed == 0) { + fprintf(stderr, "calibration: no timing samples, using default\n"); + t->batch_iters = 10000; + *t->batch_iters_bss = t->batch_iters; + reset_timing(t); + return; + } + + per_op_ns = elapsed / CALIBRATE_SEED_BATCH; + t->batch_iters = compute_batch_iters(per_op_ns); + + time_n = measure_elapsed(t, run_fn, run_ctx, t->batch_iters, CALIBRATE_RUNS); + time_2n = measure_elapsed(t, run_fn, run_ctx, t->batch_iters * 2, CALIBRATE_RUNS); + + if (time_n > 0 && time_2n > 0) { + ratio = (double)time_2n / (double)time_n; + + if (fabs(ratio - 2.0) / 2.0 > PROPORTIONALITY_TOL) + fprintf(stderr, + "WARNING: proportionality check failed (2N/N ratio=%.3f, " + "expected=2.000, error=%.1f%%)\n System noise may be affecting " + "results.\n", + ratio, fabs(ratio - 2.0) / 2.0 * 100.0); + } + + *t->batch_iters_bss = t->batch_iters; + reset_timing(t); +} diff --git a/tools/testing/selftests/bpf/benchs/bench_htab_mem.c b/tools/testing/selftests/bpf/benchs/bench_htab_mem.c index 297e32390cd1..1ee217d97434 100644 --- a/tools/testing/selftests/bpf/benchs/bench_htab_mem.c +++ b/tools/testing/selftests/bpf/benchs/bench_htab_mem.c @@ -152,7 +152,7 @@ static const struct htab_mem_use_case *htab_mem_find_use_case_or_exit(const char exit(1); } -static void htab_mem_setup(void) +static void htab_mem_setup_impl(enum bpf_map_type map_type) { struct bpf_map *map; const char **names; @@ -178,10 +178,11 @@ static void htab_mem_setup(void) } map = ctx.skel->maps.htab; + bpf_map__set_type(map, map_type); bpf_map__set_value_size(map, args.value_size); /* Ensure that different CPUs can operate on different subset */ bpf_map__set_max_entries(map, MAX(8192, 64 * env.nr_cpus)); - if (args.preallocated) + if (map_type != BPF_MAP_TYPE_RHASH && args.preallocated) bpf_map__set_map_flags(map, bpf_map__map_flags(map) & ~BPF_F_NO_PREALLOC); names = ctx.uc->progs; @@ -220,6 
+221,16 @@ cleanup: exit(1); } +static void htab_mem_setup(void) +{ + htab_mem_setup_impl(BPF_MAP_TYPE_HASH); +} + +static void rhtab_mem_setup(void) +{ + htab_mem_setup_impl(BPF_MAP_TYPE_RHASH); +} + static void htab_mem_add_fn(pthread_barrier_t *notify) { while (true) { @@ -338,6 +349,15 @@ static void htab_mem_report_final(struct bench_res res[], int res_cnt) cleanup_cgroup_environment(); } +static void rhtab_mem_validate(void) +{ + if (args.preallocated) { + fprintf(stderr, "rhash map does not support preallocation\n"); + exit(1); + } + htab_mem_validate(); +} + const struct bench bench_htab_mem = { .name = "htab-mem", .argp = &bench_htab_mem_argp, @@ -348,3 +368,14 @@ const struct bench bench_htab_mem = { .report_progress = htab_mem_report_progress, .report_final = htab_mem_report_final, }; + +const struct bench bench_rhtab_mem = { + .name = "rhtab-mem", + .argp = &bench_htab_mem_argp, + .validate = rhtab_mem_validate, + .setup = rhtab_mem_setup, + .producer_thread = htab_mem_producer, + .measure = htab_mem_measure, + .report_progress = htab_mem_report_progress, + .report_final = htab_mem_report_final, +}; diff --git a/tools/testing/selftests/bpf/benchs/bench_xdp_lb.c b/tools/testing/selftests/bpf/benchs/bench_xdp_lb.c new file mode 100644 index 000000000000..8e25bccbde92 --- /dev/null +++ b/tools/testing/selftests/bpf/benchs/bench_xdp_lb.c @@ -0,0 +1,1124 @@ +// SPDX-License-Identifier: GPL-2.0 +/* Copyright (c) 2026 Meta Platforms, Inc. and affiliates. 
*/ + +#include <argp.h> +#include <string.h> +#include <arpa/inet.h> +#include <linux/if_ether.h> +#include <linux/ip.h> +#include <linux/ipv6.h> +#include <linux/in.h> +#include <linux/tcp.h> +#include <linux/udp.h> +#include "bench.h" +#include "bench_bpf_timing.h" +#include "xdp_lb_bench.skel.h" +#include "xdp_lb_bench_common.h" +#include "bpf_util.h" + +#define IP4(a, b, c, d) (((__u32)(a) << 24) | ((__u32)(b) << 16) | ((__u32)(c) << 8) | (__u32)(d)) + +#define IP6(a, b, c, d) { (__u32)(a), (__u32)(b), (__u32)(c), (__u32)(d) } + +#define TNL_DST IP4(192, 168, 1, 2) +#define REAL_INDEX 1 +#define REAL_INDEX_V6 2 +#define MAX_PKT_SIZE 256 +#define IP_MF 0x2000 + +static const __u32 tnl_dst_v6[4] = { 0xfd000000, 0, 0, 2 }; + +static const __u8 lb_mac[ETH_ALEN] = {0xaa, 0xbb, 0xcc, 0xdd, 0xee, 0xff}; +static const __u8 client_mac[ETH_ALEN] = {0x11, 0x22, 0x33, 0x44, 0x55, 0x66}; +static const __u8 router_mac[ETH_ALEN] = {0xde, 0xad, 0xbe, 0xef, 0x00, 0x01}; + +enum scenario_id { + S_TCP_V4_LRU_HIT, + S_TCP_V4_CH, + S_TCP_V6_LRU_HIT, + S_TCP_V6_CH, + S_UDP_V4_LRU_HIT, + S_UDP_V6_LRU_HIT, + S_TCP_V4V6_LRU_HIT, + S_TCP_V4_LRU_DIVERSE, + S_TCP_V4_CH_DIVERSE, + S_TCP_V6_LRU_DIVERSE, + S_TCP_V6_CH_DIVERSE, + S_UDP_V4_LRU_DIVERSE, + S_TCP_V4_LRU_MISS, + S_UDP_V4_LRU_MISS, + S_TCP_V4_LRU_WARMUP, + S_TCP_V4_SYN, + S_TCP_V4_RST_MISS, + S_PASS_V4_NO_VIP, + S_PASS_V6_NO_VIP, + S_PASS_V4_ICMP, + S_PASS_NON_IP, + S_DROP_V4_FRAG, + S_DROP_V4_OPTIONS, + S_DROP_V6_FRAG, + NUM_SCENARIOS, +}; + +enum lru_miss_type { + LRU_MISS_AUTO = 0, /* compute from scenario flags (default) */ + LRU_MISS_NONE, /* 0 misses (all LRU hits) */ + LRU_MISS_ALL, /* batch_iters+1 misses (every op misses) */ + LRU_MISS_FIRST, /* 1 miss (first miss, then hits) */ +}; + +#define S_BASE_ENCAP_V4 \ + .expected_retval = XDP_TX, .expect_encap = true, \ + .tunnel_dst = TNL_DST + +#define S_BASE_ENCAP_V6 \ + .expected_retval = XDP_TX, .expect_encap = true, \ + .is_v6 = true, .encap_v6_outer = true, \ + 
.tunnel_dst_v6 = { 0xfd000000, 0, 0, 2 } + +#define S_BASE_ENCAP_V4V6 \ + .expected_retval = XDP_TX, .expect_encap = true, \ + .encap_v6_outer = true, \ + .tunnel_dst_v6 = { 0xfd000000, 0, 0, 2 } + +struct test_scenario { + const char *name; + const char *description; + int expected_retval; + bool expect_encap; + bool is_v6; + __u32 vip_addr; + __u32 src_addr; + __u32 tunnel_dst; + __u32 vip_addr_v6[4]; + __u32 src_addr_v6[4]; + __u32 tunnel_dst_v6[4]; + __u16 dst_port; + __u16 src_port; + __u8 ip_proto; + __u32 vip_flags; + __u32 vip_num; + bool prepopulate_lru; + bool set_frag; + __u16 eth_proto; + bool encap_v6_outer; + __u32 flow_mask; + bool cold_lru; + bool set_syn; + bool set_rst; + bool set_ip_options; + __u32 fixed_batch_iters; /* 0 = auto-calibrate, >0 = use this value */ + enum lru_miss_type lru_miss; /* expected LRU miss pattern */ +}; + +static const struct test_scenario scenarios[NUM_SCENARIOS] = { + /* Single-flow baseline */ + [S_TCP_V4_LRU_HIT] = { + S_BASE_ENCAP_V4, .ip_proto = IPPROTO_TCP, + .name = "tcp-v4-lru-hit", + .description = "IPv4 TCP, LRU hit, IPIP encap", + .vip_addr = IP4(10, 10, 1, 1), .dst_port = 80, + .src_addr = IP4(10, 10, 2, 1), .src_port = 12345, + .prepopulate_lru = true, .lru_miss = LRU_MISS_NONE, + }, + [S_TCP_V4_CH] = { + S_BASE_ENCAP_V4, .ip_proto = IPPROTO_TCP, + .name = "tcp-v4-ch", + .description = "IPv4 TCP, CH (LRU bypass), IPIP encap", + .vip_addr = IP4(10, 10, 1, 2), .dst_port = 80, + .src_addr = IP4(10, 10, 2, 2), .src_port = 54321, + .vip_flags = F_LRU_BYPASS, .vip_num = 1, + .lru_miss = LRU_MISS_ALL, + }, + [S_TCP_V6_LRU_HIT] = { + S_BASE_ENCAP_V6, .ip_proto = IPPROTO_TCP, + .name = "tcp-v6-lru-hit", + .description = "IPv6 TCP, LRU hit, IP6IP6 encap", + .vip_addr_v6 = IP6(0xfd000100, 0, 0, 1), .dst_port = 80, + .src_addr_v6 = IP6(0xfd000200, 0, 0, 1), .src_port = 12345, + .vip_num = 10, + .prepopulate_lru = true, .lru_miss = LRU_MISS_NONE, + }, + [S_TCP_V6_CH] = { + S_BASE_ENCAP_V6, .ip_proto = IPPROTO_TCP, + 
.name = "tcp-v6-ch", + .description = "IPv6 TCP, CH (LRU bypass), IP6IP6 encap", + .vip_addr_v6 = IP6(0xfd000100, 0, 0, 2), .dst_port = 80, + .src_addr_v6 = IP6(0xfd000200, 0, 0, 2), .src_port = 54321, + .vip_flags = F_LRU_BYPASS, .vip_num = 12, + .lru_miss = LRU_MISS_ALL, + }, + [S_UDP_V4_LRU_HIT] = { + S_BASE_ENCAP_V4, .ip_proto = IPPROTO_UDP, + .name = "udp-v4-lru-hit", + .description = "IPv4 UDP, LRU hit, IPIP encap", + .vip_addr = IP4(10, 10, 1, 1), .dst_port = 443, + .src_addr = IP4(10, 10, 3, 1), .src_port = 11111, + .vip_num = 2, + .prepopulate_lru = true, .lru_miss = LRU_MISS_NONE, + }, + [S_UDP_V6_LRU_HIT] = { + S_BASE_ENCAP_V6, .ip_proto = IPPROTO_UDP, + .name = "udp-v6-lru-hit", + .description = "IPv6 UDP, LRU hit, IP6IP6 encap", + .vip_addr_v6 = IP6(0xfd000100, 0, 0, 1), .dst_port = 443, + .src_addr_v6 = IP6(0xfd000200, 0, 0, 3), .src_port = 22222, + .vip_num = 14, + .prepopulate_lru = true, .lru_miss = LRU_MISS_NONE, + }, + [S_TCP_V4V6_LRU_HIT] = { + S_BASE_ENCAP_V4V6, .ip_proto = IPPROTO_TCP, + .name = "tcp-v4v6-lru-hit", + .description = "IPv4 TCP, LRU hit, IPv4-in-IPv6 encap", + .vip_addr = IP4(10, 10, 1, 4), .dst_port = 80, + .src_addr = IP4(10, 10, 2, 4), .src_port = 12347, + .vip_num = 13, + .prepopulate_lru = true, .lru_miss = LRU_MISS_NONE, + }, + + /* Diverse flows (4K src addrs) */ + [S_TCP_V4_LRU_DIVERSE] = { + S_BASE_ENCAP_V4, .ip_proto = IPPROTO_TCP, + .name = "tcp-v4-lru-diverse", + .description = "IPv4 TCP, diverse flows, warm LRU", + .vip_addr = IP4(10, 10, 1, 1), .dst_port = 80, + .src_addr = IP4(10, 10, 2, 1), .src_port = 12345, + .prepopulate_lru = true, .flow_mask = 0xFFF, + .lru_miss = LRU_MISS_NONE, + }, + [S_TCP_V4_CH_DIVERSE] = { + S_BASE_ENCAP_V4, .ip_proto = IPPROTO_TCP, + .name = "tcp-v4-ch-diverse", + .description = "IPv4 TCP, diverse flows, CH (LRU bypass)", + .vip_addr = IP4(10, 10, 1, 2), .dst_port = 80, + .src_addr = IP4(10, 10, 2, 2), .src_port = 54321, + .vip_flags = F_LRU_BYPASS, .vip_num = 1, + .flow_mask = 0xFFF, 
.lru_miss = LRU_MISS_ALL, + }, + [S_TCP_V6_LRU_DIVERSE] = { + S_BASE_ENCAP_V6, .ip_proto = IPPROTO_TCP, + .name = "tcp-v6-lru-diverse", + .description = "IPv6 TCP, diverse flows, warm LRU", + .vip_addr_v6 = IP6(0xfd000100, 0, 0, 1), .dst_port = 80, + .src_addr_v6 = IP6(0xfd000200, 0, 0, 1), .src_port = 12345, + .vip_num = 10, + .prepopulate_lru = true, .flow_mask = 0xFFF, + .lru_miss = LRU_MISS_NONE, + }, + [S_TCP_V6_CH_DIVERSE] = { + S_BASE_ENCAP_V6, .ip_proto = IPPROTO_TCP, + .name = "tcp-v6-ch-diverse", + .description = "IPv6 TCP, diverse flows, CH (LRU bypass)", + .vip_addr_v6 = IP6(0xfd000100, 0, 0, 2), .dst_port = 80, + .src_addr_v6 = IP6(0xfd000200, 0, 0, 2), .src_port = 54321, + .vip_flags = F_LRU_BYPASS, .vip_num = 12, + .flow_mask = 0xFFF, .lru_miss = LRU_MISS_ALL, + }, + [S_UDP_V4_LRU_DIVERSE] = { + S_BASE_ENCAP_V4, .ip_proto = IPPROTO_UDP, + .name = "udp-v4-lru-diverse", + .description = "IPv4 UDP, diverse flows, warm LRU", + .vip_addr = IP4(10, 10, 1, 1), .dst_port = 443, + .src_addr = IP4(10, 10, 3, 1), .src_port = 11111, + .vip_num = 2, + .prepopulate_lru = true, .flow_mask = 0xFFF, + .lru_miss = LRU_MISS_NONE, + }, + + /* LRU stress */ + [S_TCP_V4_LRU_MISS] = { + S_BASE_ENCAP_V4, .ip_proto = IPPROTO_TCP, + .name = "tcp-v4-lru-miss", + .description = "IPv4 TCP, LRU miss (16M flow space), CH lookup", + .vip_addr = IP4(10, 10, 1, 1), .dst_port = 80, + .src_addr = IP4(10, 10, 2, 1), .src_port = 12345, + .flow_mask = 0xFFFFFF, .cold_lru = true, + .lru_miss = LRU_MISS_FIRST, + }, + [S_UDP_V4_LRU_MISS] = { + S_BASE_ENCAP_V4, .ip_proto = IPPROTO_UDP, + .name = "udp-v4-lru-miss", + .description = "IPv4 UDP, LRU miss (16M flow space), CH lookup", + .vip_addr = IP4(10, 10, 1, 1), .dst_port = 443, + .src_addr = IP4(10, 10, 3, 1), .src_port = 11111, + .vip_num = 2, + .flow_mask = 0xFFFFFF, .cold_lru = true, + .lru_miss = LRU_MISS_FIRST, + }, + [S_TCP_V4_LRU_WARMUP] = { + S_BASE_ENCAP_V4, .ip_proto = IPPROTO_TCP, + .name = "tcp-v4-lru-warmup", + .description = 
"IPv4 TCP, 4K flows, ~50% LRU miss", + .vip_addr = IP4(10, 10, 1, 1), .dst_port = 80, + .src_addr = IP4(10, 10, 2, 1), .src_port = 12345, + .flow_mask = 0xFFF, .cold_lru = true, + .fixed_batch_iters = 6500, + .lru_miss = LRU_MISS_FIRST, + }, + + /* TCP flags */ + [S_TCP_V4_SYN] = { + S_BASE_ENCAP_V4, .ip_proto = IPPROTO_TCP, + .name = "tcp-v4-syn", + .description = "IPv4 TCP SYN, skip LRU, CH + LRU insert", + .vip_addr = IP4(10, 10, 1, 1), .dst_port = 80, + .src_addr = IP4(10, 10, 8, 2), .src_port = 60001, + .set_syn = true, .lru_miss = LRU_MISS_ALL, + }, + [S_TCP_V4_RST_MISS] = { + S_BASE_ENCAP_V4, .ip_proto = IPPROTO_TCP, + .name = "tcp-v4-rst-miss", + .description = "IPv4 TCP RST, CH lookup, no LRU insert", + .vip_addr = IP4(10, 10, 1, 1), .dst_port = 80, + .src_addr = IP4(10, 10, 8, 1), .src_port = 60000, + .flow_mask = 0xFFFFFF, .cold_lru = true, + .set_rst = true, .lru_miss = LRU_MISS_ALL, + }, + + /* Early exits */ + [S_PASS_V4_NO_VIP] = { + .name = "pass-v4-no-vip", + .description = "IPv4 TCP, unknown VIP, XDP_PASS", + .expected_retval = XDP_PASS, + .ip_proto = IPPROTO_TCP, + .vip_addr = IP4(10, 10, 9, 9), .dst_port = 80, + .src_addr = IP4(10, 10, 4, 1), .src_port = 33333, + }, + [S_PASS_V6_NO_VIP] = { + .name = "pass-v6-no-vip", + .description = "IPv6 TCP, unknown VIP, XDP_PASS", + .expected_retval = XDP_PASS, .is_v6 = true, + .ip_proto = IPPROTO_TCP, + .vip_addr_v6 = IP6(0xfd009900, 0, 0, 1), .dst_port = 80, + .src_addr_v6 = IP6(0xfd000400, 0, 0, 1), .src_port = 33333, + }, + [S_PASS_V4_ICMP] = { + .name = "pass-v4-icmp", + .description = "IPv4 ICMP, non-TCP/UDP protocol, XDP_PASS", + .expected_retval = XDP_PASS, + .ip_proto = IPPROTO_ICMP, + .vip_addr = IP4(10, 10, 1, 1), + .src_addr = IP4(10, 10, 6, 1), + }, + [S_PASS_NON_IP] = { + .name = "pass-non-ip", + .description = "Non-IP (ARP), earliest XDP_PASS exit", + .expected_retval = XDP_PASS, + .eth_proto = ETH_P_ARP, + }, + [S_DROP_V4_FRAG] = { + .name = "drop-v4-frag", + .description = "IPv4 fragmented, 
XDP_DROP", + .expected_retval = XDP_DROP, .ip_proto = IPPROTO_TCP, + .vip_addr = IP4(10, 10, 1, 1), .dst_port = 80, + .src_addr = IP4(10, 10, 5, 1), .src_port = 44444, + .set_frag = true, + }, + [S_DROP_V4_OPTIONS] = { + .name = "drop-v4-options", + .description = "IPv4 with IP options (ihl>5), XDP_DROP", + .expected_retval = XDP_DROP, .ip_proto = IPPROTO_TCP, + .vip_addr = IP4(10, 10, 1, 1), .dst_port = 80, + .src_addr = IP4(10, 10, 7, 1), .src_port = 55555, + .set_ip_options = true, + }, + [S_DROP_V6_FRAG] = { + .name = "drop-v6-frag", + .description = "IPv6 fragment extension header, XDP_DROP", + .expected_retval = XDP_DROP, .is_v6 = true, + .ip_proto = IPPROTO_TCP, + .vip_addr_v6 = IP6(0xfd000100, 0, 0, 1), .dst_port = 80, + .src_addr_v6 = IP6(0xfd000500, 0, 0, 1), .src_port = 44444, + .set_frag = true, + }, +}; + +#define MAX_ENCAP_SIZE (MAX_PKT_SIZE + sizeof(struct ipv6hdr)) + +static __u8 pkt_buf[NUM_SCENARIOS][MAX_PKT_SIZE]; +static __u32 pkt_len[NUM_SCENARIOS]; +static __u8 expected_buf[NUM_SCENARIOS][MAX_ENCAP_SIZE]; +static __u32 expected_len[NUM_SCENARIOS]; + +static int lru_inner_fds[BENCH_NR_CPUS]; +static int nr_inner_maps; + +static struct ctx { + struct xdp_lb_bench *skel; + struct bpf_bench_timing timing; + int prog_fd; +} ctx; + +static struct { + int scenario; + bool machine_readable; +} args = { + .scenario = -1, +}; + +static __u16 ip_checksum(const void *hdr, int len) +{ + const __u16 *p = hdr; + __u32 csum = 0; + int i; + + for (i = 0; i < len / 2; i++) + csum += p[i]; + + while (csum >> 16) + csum = (csum & 0xffff) + (csum >> 16); + + return ~csum; +} + +static void htonl_v6(__be32 dst[4], const __u32 src[4]) +{ + int i; + + for (i = 0; i < 4; i++) + dst[i] = htonl(src[i]); +} + +static void build_flow_key(struct flow_key *fk, const struct test_scenario *sc) +{ + memset(fk, 0, sizeof(*fk)); + if (sc->is_v6) { + htonl_v6(fk->srcv6, sc->src_addr_v6); + htonl_v6(fk->dstv6, sc->vip_addr_v6); + } else { + fk->src = htonl(sc->src_addr); + fk->dst 
= htonl(sc->vip_addr); + } + fk->proto = sc->ip_proto; + fk->port16[0] = htons(sc->src_port); + fk->port16[1] = htons(sc->dst_port); +} + +static void build_l4(const struct test_scenario *sc, __u8 *p, __u32 *off) +{ + if (sc->ip_proto == IPPROTO_TCP) { + struct tcphdr tcp = {}; + + tcp.source = htons(sc->src_port); + tcp.dest = htons(sc->dst_port); + tcp.doff = 5; + tcp.syn = sc->set_syn ? 1 : 0; + tcp.rst = sc->set_rst ? 1 : 0; + tcp.window = htons(8192); + memcpy(p + *off, &tcp, sizeof(tcp)); + *off += sizeof(tcp); + } else if (sc->ip_proto == IPPROTO_UDP) { + struct udphdr udp = {}; + + udp.source = htons(sc->src_port); + udp.dest = htons(sc->dst_port); + udp.len = htons(sizeof(udp) + 16); + memcpy(p + *off, &udp, sizeof(udp)); + *off += sizeof(udp); + } +} + +static void build_packet(int idx) +{ + const struct test_scenario *sc = &scenarios[idx]; + __u8 *p = pkt_buf[idx]; + struct ethhdr eth = {}; + __u16 proto; + __u32 off = 0; + + memcpy(eth.h_dest, lb_mac, ETH_ALEN); + memcpy(eth.h_source, client_mac, ETH_ALEN); + + if (sc->eth_proto) + proto = sc->eth_proto; + else if (sc->is_v6) + proto = ETH_P_IPV6; + else + proto = ETH_P_IP; + + eth.h_proto = htons(proto); + memcpy(p, ð, sizeof(eth)); + off += sizeof(eth); + + if (proto != ETH_P_IP && proto != ETH_P_IPV6) { + memcpy(p + off, "bench___payload!", 16); + off += 16; + pkt_len[idx] = off; + return; + } + + if (sc->is_v6) { + struct ipv6hdr ip6h = {}; + __u32 ip6_off = off; + + ip6h.version = 6; + ip6h.nexthdr = sc->set_frag ? 
44 : sc->ip_proto; + ip6h.hop_limit = 64; + htonl_v6((__be32 *)&ip6h.saddr, sc->src_addr_v6); + htonl_v6((__be32 *)&ip6h.daddr, sc->vip_addr_v6); + off += sizeof(ip6h); + + if (sc->set_frag) { + memset(p + off, 0, 8); + p[off] = sc->ip_proto; + off += 8; + } + + build_l4(sc, p, &off); + + memcpy(p + off, "bench___payload!", 16); + off += 16; + + ip6h.payload_len = htons(off - ip6_off - sizeof(ip6h)); + memcpy(p + ip6_off, &ip6h, sizeof(ip6h)); + } else { + struct iphdr iph = {}; + __u32 ip_off = off; + + iph.version = 4; + iph.ihl = sc->set_ip_options ? 6 : 5; + iph.ttl = 64; + iph.protocol = sc->ip_proto; + iph.saddr = htonl(sc->src_addr); + iph.daddr = htonl(sc->vip_addr); + iph.frag_off = sc->set_frag ? htons(IP_MF) : 0; + off += sizeof(iph); + + if (sc->set_ip_options) { + /* NOP option padding (4 bytes = 1 word) */ + __u32 nop = htonl(0x01010101); + + memcpy(p + off, &nop, sizeof(nop)); + off += sizeof(nop); + } + + build_l4(sc, p, &off); + + memcpy(p + off, "bench___payload!", 16); + off += 16; + + iph.tot_len = htons(off - ip_off); + iph.check = ip_checksum(&iph, sizeof(iph)); + memcpy(p + ip_off, &iph, sizeof(iph)); + } + + pkt_len[idx] = off; +} + +static void populate_vip(struct xdp_lb_bench *skel, const struct test_scenario *sc) +{ + struct vip_definition key = {}; + struct vip_meta val = {}; + int err; + + if (sc->is_v6) + htonl_v6(key.vipv6, sc->vip_addr_v6); + else + key.vip = htonl(sc->vip_addr); + key.port = htons(sc->dst_port); + key.proto = sc->ip_proto; + val.flags = sc->vip_flags; + val.vip_num = sc->vip_num; + + err = bpf_map_update_elem(bpf_map__fd(skel->maps.vip_map), &key, &val, BPF_ANY); + if (err) { + fprintf(stderr, "vip_map [%s]: %s\n", sc->name, strerror(errno)); + exit(1); + } +} + +static void create_per_cpu_lru_maps(struct xdp_lb_bench *skel) +{ + int outer_fd = bpf_map__fd(skel->maps.lru_mapping); + unsigned int nr_cpus = bpf_num_possible_cpus(); + int i, inner_fd, err; + __u32 cpu; + + if (nr_cpus > BENCH_NR_CPUS) + nr_cpus = 
BENCH_NR_CPUS; + + for (i = 0; i < (int)nr_cpus; i++) { + LIBBPF_OPTS(bpf_map_create_opts, opts); + + inner_fd = bpf_map_create(BPF_MAP_TYPE_LRU_HASH, "lru_inner", + sizeof(struct flow_key), + sizeof(struct real_pos_lru), + DEFAULT_LRU_SIZE, &opts); + if (inner_fd < 0) { + fprintf(stderr, "lru_inner[%d]: %s\n", i, strerror(errno)); + exit(1); + } + + cpu = i; + err = bpf_map_update_elem(outer_fd, &cpu, &inner_fd, BPF_ANY); + if (err) { + fprintf(stderr, "lru_mapping[%d]: %s\n", i, strerror(errno)); + close(inner_fd); + exit(1); + } + + lru_inner_fds[i] = inner_fd; + } + + nr_inner_maps = nr_cpus; +} + +static __u64 ktime_get_ns(void) +{ + struct timespec ts; + + clock_gettime(CLOCK_MONOTONIC, &ts); + return (__u64)ts.tv_sec * 1000000000ULL + ts.tv_nsec; +} + +static void populate_lru(const struct test_scenario *sc, __u32 real_idx) +{ + struct real_pos_lru lru = { .pos = real_idx }; + struct flow_key fk; + int i, err; + + if (sc->ip_proto == IPPROTO_UDP) + lru.atime = ktime_get_ns(); + + build_flow_key(&fk, sc); + + /* Insert into every per-CPU inner LRU so the entry is found + * regardless of which CPU runs the BPF program. 
+ */ + for (i = 0; i < nr_inner_maps; i++) { + err = bpf_map_update_elem(lru_inner_fds[i], &fk, &lru, BPF_ANY); + if (err) { + fprintf(stderr, "lru_inner[%d] [%s]: %s\n", i, sc->name, + strerror(errno)); + exit(1); + } + } +} + +static void populate_maps(struct xdp_lb_bench *skel) +{ + struct real_definition real_v4 = {}; + struct real_definition real_v6 = {}; + struct ctl_value cval = {}; + __u32 key, real_idx = REAL_INDEX; + int ch_fd, err, i; + + if (scenarios[args.scenario].expect_encap) + populate_vip(skel, &scenarios[args.scenario]); + + ch_fd = bpf_map__fd(skel->maps.ch_rings); + for (i = 0; i < CH_RINGS_SIZE; i++) { + __u32 k = i; + + err = bpf_map_update_elem(ch_fd, &k, &real_idx, BPF_ANY); + if (err) { + fprintf(stderr, "ch_rings[%d]: %s\n", i, strerror(errno)); + exit(1); + } + } + + memcpy(cval.mac, router_mac, ETH_ALEN); + key = 0; + err = bpf_map_update_elem(bpf_map__fd(skel->maps.ctl_array), &key, &cval, BPF_ANY); + if (err) { + fprintf(stderr, "ctl_array: %s\n", strerror(errno)); + exit(1); + } + + key = REAL_INDEX; + real_v4.dst = htonl(TNL_DST); + htonl_v6(real_v4.dstv6, tnl_dst_v6); + err = bpf_map_update_elem(bpf_map__fd(skel->maps.reals), &key, &real_v4, BPF_ANY); + if (err) { + fprintf(stderr, "reals[%d]: %s\n", REAL_INDEX, strerror(errno)); + exit(1); + } + + key = REAL_INDEX_V6; + htonl_v6(real_v6.dstv6, tnl_dst_v6); + real_v6.flags = F_IPV6; + err = bpf_map_update_elem(bpf_map__fd(skel->maps.reals), &key, &real_v6, BPF_ANY); + if (err) { + fprintf(stderr, "reals[%d]: %s\n", REAL_INDEX_V6, strerror(errno)); + exit(1); + } + + create_per_cpu_lru_maps(skel); + + if (scenarios[args.scenario].prepopulate_lru) { + const struct test_scenario *sc = &scenarios[args.scenario]; + __u32 ridx = sc->encap_v6_outer ? 
REAL_INDEX_V6 : REAL_INDEX; + + populate_lru(sc, ridx); + } + + if (scenarios[args.scenario].expect_encap) { + const struct test_scenario *sc = &scenarios[args.scenario]; + struct vip_definition miss_vip = {}; + + if (sc->is_v6) + htonl_v6(miss_vip.vipv6, sc->vip_addr_v6); + else + miss_vip.vip = htonl(sc->vip_addr); + miss_vip.port = htons(sc->dst_port); + miss_vip.proto = sc->ip_proto; + + key = 0; + err = bpf_map_update_elem(bpf_map__fd(skel->maps.vip_miss_stats), + &key, &miss_vip, BPF_ANY); + if (err) { + fprintf(stderr, "vip_miss_stats: %s\n", strerror(errno)); + exit(1); + } + } +} + +static void build_expected_packet(int idx) +{ + const struct test_scenario *sc = &scenarios[idx]; + __u8 *p = expected_buf[idx]; + struct ethhdr eth = {}; + const __u8 *in = pkt_buf[idx]; + __u32 in_len = pkt_len[idx]; + __u32 off = 0; + __u32 inner_len = in_len - sizeof(struct ethhdr); + + if (sc->expected_retval == XDP_DROP) { + expected_len[idx] = 0; + return; + } + + if (sc->expected_retval == XDP_PASS) { + memcpy(p, in, in_len); + expected_len[idx] = in_len; + return; + } + + memcpy(eth.h_dest, router_mac, ETH_ALEN); + memcpy(eth.h_source, lb_mac, ETH_ALEN); + eth.h_proto = htons(sc->encap_v6_outer ? ETH_P_IPV6 : ETH_P_IP); + memcpy(p, ð, sizeof(eth)); + off += sizeof(eth); + + if (sc->encap_v6_outer) { + struct ipv6hdr ip6h = {}; + __u8 nexthdr = sc->is_v6 ? IPPROTO_IPV6 : IPPROTO_IPIP; + + ip6h.version = 6; + ip6h.nexthdr = nexthdr; + ip6h.payload_len = htons(inner_len); + ip6h.hop_limit = 64; + + create_encap_ipv6_src(htons(sc->src_port), + sc->is_v6 ? 
htonl(sc->src_addr_v6[0]) + : htonl(sc->src_addr), + (__be32 *)&ip6h.saddr); + htonl_v6((__be32 *)&ip6h.daddr, sc->tunnel_dst_v6); + + memcpy(p + off, &ip6h, sizeof(ip6h)); + off += sizeof(ip6h); + } else { + struct iphdr iph = {}; + + iph.version = 4; + iph.ihl = sizeof(iph) >> 2; + iph.protocol = IPPROTO_IPIP; + iph.tot_len = htons(inner_len + sizeof(iph)); + iph.ttl = 64; + iph.saddr = create_encap_ipv4_src(htons(sc->src_port), + htonl(sc->src_addr)); + iph.daddr = htonl(sc->tunnel_dst); + iph.check = ip_checksum(&iph, sizeof(iph)); + + memcpy(p + off, &iph, sizeof(iph)); + off += sizeof(iph); + } + + memcpy(p + off, in + sizeof(struct ethhdr), inner_len); + off += inner_len; + + expected_len[idx] = off; +} + +static void print_hex_diff(const char *name, const __u8 *got, __u32 got_len, const __u8 *exp, + __u32 exp_len) +{ + __u32 max_len = got_len > exp_len ? got_len : exp_len; + __u32 i, ndiffs = 0; + + fprintf(stderr, " [%s] got %u bytes, expected %u bytes\n", + name, got_len, exp_len); + + for (i = 0; i < max_len && ndiffs < 8; i++) { + __u8 g = i < got_len ? got[i] : 0; + __u8 e = i < exp_len ? exp[i] : 0; + + if (g != e || i >= got_len || i >= exp_len) { + fprintf(stderr, " offset 0x%03x: got 0x%02x expected 0x%02x\n", + i, g, e); + ndiffs++; + } + } + + if (ndiffs >= 8 && i < max_len) + fprintf(stderr, " ... 
(more differences)\n"); +} + +static void read_stat(int stats_fd, __u32 key, __u64 *v1_out, __u64 *v2_out) +{ + struct lb_stats values[BENCH_NR_CPUS]; + unsigned int nr_cpus = bpf_num_possible_cpus(); + __u64 v1 = 0, v2 = 0; + unsigned int i; + + if (nr_cpus > BENCH_NR_CPUS) + nr_cpus = BENCH_NR_CPUS; + + if (bpf_map_lookup_elem(stats_fd, &key, values) == 0) { + for (i = 0; i < nr_cpus; i++) { + v1 += values[i].v1; + v2 += values[i].v2; + } + } + + *v1_out = v1; + *v2_out = v2; +} + +static void reset_stats(int stats_fd) +{ + struct lb_stats zeros[BENCH_NR_CPUS]; + __u32 key; + + memset(zeros, 0, sizeof(zeros)); + for (key = 0; key < STATS_SIZE; key++) + bpf_map_update_elem(stats_fd, &key, zeros, BPF_ANY); +} + +static bool validate_counters(int idx) +{ + const struct test_scenario *sc = &scenarios[idx]; + int stats_fd = bpf_map__fd(ctx.skel->maps.stats); + __u64 xdp_tx, xdp_pass, xdp_drop, lru_pkts, lru_misses, tcp_misses; + __u64 expected_misses; + __u64 dummy; + /* + * BENCH_BPF_LOOP runs batch_iters timed + 1 untimed iteration. + * Each iteration calls process_packet -> count_action, so all + * counters are incremented (batch_iters + 1) times. 
+ */ + __u64 n = ctx.timing.batch_iters + 1; + bool pass = true; + + read_stat(stats_fd, STATS_XDP_TX, &xdp_tx, &dummy); + read_stat(stats_fd, STATS_XDP_PASS, &xdp_pass, &dummy); + read_stat(stats_fd, STATS_XDP_DROP, &xdp_drop, &dummy); + read_stat(stats_fd, STATS_LRU, &lru_pkts, &lru_misses); + read_stat(stats_fd, STATS_LRU_MISS, &tcp_misses, &dummy); + + if (sc->expected_retval == XDP_TX && xdp_tx != n) { + fprintf(stderr, " [%s] COUNTER FAIL: STATS_XDP_TX=%llu, expected %llu\n", sc->name, + (unsigned long long)xdp_tx, (unsigned long long)n); + pass = false; + } + if (sc->expected_retval == XDP_PASS && xdp_pass != n) { + fprintf(stderr, " [%s] COUNTER FAIL: STATS_XDP_PASS=%llu, expected %llu\n", + sc->name, (unsigned long long)xdp_pass, (unsigned long long)n); + pass = false; + } + if (sc->expected_retval == XDP_DROP && xdp_drop != n) { + fprintf(stderr, " [%s] COUNTER FAIL: STATS_XDP_DROP=%llu, expected %llu\n", + sc->name, (unsigned long long)xdp_drop, (unsigned long long)n); + pass = false; + } + + if (!sc->expect_encap) + goto out; + + if (lru_pkts != n) { + fprintf(stderr, " [%s] COUNTER FAIL: STATS_LRU.v1=%llu, expected %llu\n", + sc->name, (unsigned long long)lru_pkts, (unsigned long long)n); + pass = false; + } + + switch (sc->lru_miss) { + case LRU_MISS_NONE: + expected_misses = 0; + break; + case LRU_MISS_ALL: + expected_misses = n; + break; + case LRU_MISS_FIRST: + expected_misses = 1; + break; + default: + /* LRU_MISS_AUTO: compute from scenario flags */ + if (sc->prepopulate_lru && !sc->set_syn) + expected_misses = 0; + else if (sc->set_syn || sc->set_rst || + (sc->vip_flags & F_LRU_BYPASS)) + expected_misses = n; + else if (sc->cold_lru) + expected_misses = 1; + else + expected_misses = n; + break; + } + + if (lru_misses != expected_misses) { + fprintf(stderr, " [%s] COUNTER FAIL: LRU misses=%llu, expected %llu\n", + sc->name, (unsigned long long)lru_misses, + (unsigned long long)expected_misses); + pass = false; + } + + if (sc->ip_proto == 
IPPROTO_TCP && lru_misses > 0) { + if (tcp_misses != lru_misses) { + fprintf(stderr, " [%s] COUNTER FAIL: TCP LRU misses=%llu, expected %llu\n", + sc->name, (unsigned long long)tcp_misses, + (unsigned long long)lru_misses); + pass = false; + } + } + +out: + reset_stats(stats_fd); + return pass; +} + +static const char *xdp_action_str(int action) +{ + switch (action) { + case XDP_DROP: return "XDP_DROP"; + case XDP_PASS: return "XDP_PASS"; + case XDP_TX: return "XDP_TX"; + default: return "UNKNOWN"; + } +} + +static bool validate_scenario(int idx) +{ + LIBBPF_OPTS(bpf_test_run_opts, topts); + const struct test_scenario *sc = &scenarios[idx]; + __u8 out[MAX_ENCAP_SIZE]; + int err; + + topts.data_in = pkt_buf[idx]; + topts.data_size_in = pkt_len[idx]; + topts.data_out = out; + topts.data_size_out = sizeof(out); + topts.repeat = 1; + + err = bpf_prog_test_run_opts(ctx.prog_fd, &topts); + if (err) { + fprintf(stderr, " [%s] FAIL: test_run: %s\n", sc->name, strerror(errno)); + return false; + } + + if ((int)topts.retval != sc->expected_retval) { + fprintf(stderr, " [%s] FAIL: retval %s, expected %s\n", sc->name, + xdp_action_str(topts.retval), xdp_action_str(sc->expected_retval)); + return false; + } + + /* + * Compare output packet when it's deterministic. + * Skip for XDP_DROP (no output) and cold_lru (source IP poisoned). 
+ */ + if (sc->expected_retval != XDP_DROP && !sc->cold_lru) { + if (topts.data_size_out != expected_len[idx] || + memcmp(out, expected_buf[idx], expected_len[idx]) != 0) { + fprintf(stderr, " [%s] FAIL: output packet mismatch\n", sc->name); + print_hex_diff(sc->name, out, topts.data_size_out, expected_buf[idx], + expected_len[idx]); + return false; + } + } + + if (!validate_counters(idx)) + return false; + return true; +} + +static int find_scenario(const char *name) +{ + int i; + + for (i = 0; i < NUM_SCENARIOS; i++) { + if (strcmp(scenarios[i].name, name) == 0) + return i; + } + return -1; +} + +static void xdp_lb_validate(void) +{ + if (env.consumer_cnt != 0) { + fprintf(stderr, "benchmark doesn't support consumers\n"); + exit(1); + } + if (bpf_num_possible_cpus() > BENCH_NR_CPUS) { + fprintf(stderr, "too many CPUs (%d > %d), increase BENCH_NR_CPUS\n", + bpf_num_possible_cpus(), BENCH_NR_CPUS); + exit(1); + } +} + +static void xdp_lb_run_once(void *unused __always_unused) +{ + int idx = args.scenario; + + LIBBPF_OPTS(bpf_test_run_opts, topts, + .data_in = pkt_buf[idx], + .data_size_in = pkt_len[idx], + .repeat = 1, + ); + + bpf_prog_test_run_opts(ctx.prog_fd, &topts); +} + +static void xdp_lb_setup(void) +{ + struct xdp_lb_bench *skel; + int err; + + if (args.scenario < 0) { + fprintf(stderr, "--scenario is required. 
Use --list-scenarios to see options.\n"); + exit(1); + } + + setup_libbpf(); + + skel = xdp_lb_bench__open(); + if (!skel) { + fprintf(stderr, "failed to open skeleton\n"); + exit(1); + } + + err = xdp_lb_bench__load(skel); + if (err) { + fprintf(stderr, "failed to load skeleton: %s\n", strerror(-err)); + xdp_lb_bench__destroy(skel); + exit(1); + } + + ctx.skel = skel; + ctx.prog_fd = bpf_program__fd(skel->progs.xdp_lb_bench); + + build_packet(args.scenario); + build_expected_packet(args.scenario); + + populate_maps(skel); + + BENCH_TIMING_INIT(&ctx.timing, skel, 0); + ctx.timing.machine_readable = args.machine_readable; + + if (scenarios[args.scenario].fixed_batch_iters) { + ctx.timing.batch_iters = scenarios[args.scenario].fixed_batch_iters; + skel->bss->batch_iters = ctx.timing.batch_iters; + } else { + bpf_bench_calibrate(&ctx.timing, xdp_lb_run_once, NULL); + } + + env.duration_sec = 600; + + /* + * Enable cold_lru before validation so LRU miss counters are + * correct. Seed the LRU with one run so the original flow is + * present; validation then sees exactly 1 miss (the poisoned + * flow) regardless of whether calibration ran. 
+ */ + if (scenarios[args.scenario].cold_lru) { + skel->bss->cold_lru = 1; + xdp_lb_run_once(NULL); + } + + reset_stats(bpf_map__fd(skel->maps.stats)); + + if (!validate_scenario(args.scenario)) { + fprintf(stderr, "Validation FAILED - aborting benchmark\n"); + exit(1); + } + + if (scenarios[args.scenario].flow_mask) + skel->bss->flow_mask = scenarios[args.scenario].flow_mask; +} + +static void *xdp_lb_producer(void *input) +{ + while (true) + xdp_lb_run_once(NULL); + + return NULL; +} + +static void xdp_lb_measure(struct bench_res *res) +{ + bpf_bench_timing_measure(&ctx.timing, res); +} + +static void xdp_lb_report_final(struct bench_res res[], int res_cnt) +{ + bpf_bench_timing_report(&ctx.timing, scenarios[args.scenario].name, + scenarios[args.scenario].description); +} + +enum { + ARG_SCENARIO = 9001, + ARG_LIST_SCENARIOS = 9002, + ARG_MACHINE_READABLE = 9003, +}; + +static const struct argp_option opts[] = { + { "scenario", ARG_SCENARIO, "NAME", 0, + "Scenario to benchmark (required)" }, + { "list-scenarios", ARG_LIST_SCENARIOS, NULL, 0, + "List available scenarios and exit" }, + { "machine-readable", ARG_MACHINE_READABLE, NULL, 0, + "Print only a machine-readable RESULT line" }, + {}, +}; + +static error_t parse_arg(int key, char *arg, struct argp_state *state) +{ + int i; + + switch (key) { + case ARG_SCENARIO: + args.scenario = find_scenario(arg); + if (args.scenario < 0) { + fprintf(stderr, "unknown scenario: '%s'\n", arg); + fprintf(stderr, "use --list-scenarios to see options\n"); + argp_usage(state); + } + break; + case ARG_LIST_SCENARIOS: + printf("Available scenarios:\n"); + for (i = 0; i < NUM_SCENARIOS; i++) + printf(" %-20s %s\n", scenarios[i].name, scenarios[i].description); + exit(0); + case ARG_MACHINE_READABLE: + args.machine_readable = true; + env.quiet = true; + break; + default: + return ARGP_ERR_UNKNOWN; + } + + return 0; +} + +const struct argp bench_xdp_lb_argp = { + .options = opts, + .parser = parse_arg, +}; + +const struct bench 
bench_xdp_lb = { + .name = "xdp-lb", + .argp = &bench_xdp_lb_argp, + .validate = xdp_lb_validate, + .setup = xdp_lb_setup, + .producer_thread = xdp_lb_producer, + .measure = xdp_lb_measure, + .report_final = xdp_lb_report_final, +}; diff --git a/tools/testing/selftests/bpf/benchs/run_bench_xdp_lb.sh b/tools/testing/selftests/bpf/benchs/run_bench_xdp_lb.sh new file mode 100755 index 000000000000..f65cf46214a3 --- /dev/null +++ b/tools/testing/selftests/bpf/benchs/run_bench_xdp_lb.sh @@ -0,0 +1,79 @@ +#!/bin/bash +# SPDX-License-Identifier: GPL-2.0 + +source ./benchs/run_common.sh + +set -eufo pipefail + +WARMUP=${WARMUP:-3} + +RUN="sudo ./bench -q -w${WARMUP} -a xdp-lb --machine-readable" + +SEP=" +----------------------------------+----------+---------+----------+" +HDR=" | %-32s | %8s | %7s | %8s |\n" +ROW=" | %-32s | %8s | %7s | %8s |\n" + +function group_header() +{ + printf "%s\n" "$SEP" + printf "$HDR" "$1" "p50" "stddev" "p99" + printf "%s\n" "$SEP" +} + +function rval() +{ + echo "$1" | sed -nE "s/.*$2=([^ ]+).*/\1/p" +} + +function run_scenario() +{ + local sc="$1" + shift + local output rline + + output=$($RUN --scenario "$sc" "$@" 2>&1) || true + rline=$(echo "$output" | grep '^RESULT ' || true) + + if [ -z "$rline" ]; then + printf "$ROW" "$sc" "ERR" "-" "-" + return + fi + + printf "$ROW" "$sc" \ + "$(rval "$rline" median)" \ + "$(rval "$rline" stddev)" \ + "$(rval "$rline" p99)" +} + +header "XDP load-balancer benchmark" + +group_header "Single-flow baseline" +for sc in tcp-v4-lru-hit tcp-v4-ch \ + tcp-v6-lru-hit tcp-v6-ch \ + udp-v4-lru-hit udp-v6-lru-hit \ + tcp-v4v6-lru-hit; do + run_scenario "$sc" +done + +group_header "Diverse flows (4K src addrs)" +for sc in tcp-v4-lru-diverse tcp-v4-ch-diverse \ + tcp-v6-lru-diverse tcp-v6-ch-diverse \ + udp-v4-lru-diverse; do + run_scenario "$sc" +done + +group_header "TCP flags" +run_scenario tcp-v4-syn +run_scenario tcp-v4-rst-miss + +group_header "LRU stress" +run_scenario tcp-v4-lru-miss +run_scenario 
udp-v4-lru-miss +run_scenario tcp-v4-lru-warmup + +group_header "Early exits" +for sc in pass-v4-no-vip pass-v6-no-vip pass-v4-icmp pass-non-ip drop-v4-frag drop-v4-options \ + drop-v6-frag; do + run_scenario "$sc" +done +printf "%s\n" "$SEP" diff --git a/tools/testing/selftests/bpf/bpf_arena_alloc.h b/tools/testing/selftests/bpf/bpf_arena_alloc.h index c27678299e0c..cda147fd9d25 100644 --- a/tools/testing/selftests/bpf/bpf_arena_alloc.h +++ b/tools/testing/selftests/bpf/bpf_arena_alloc.h @@ -1,7 +1,7 @@ /* SPDX-License-Identifier: (GPL-2.0-only OR BSD-2-Clause) */ /* Copyright (c) 2024 Meta Platforms, Inc. and affiliates. */ #pragma once -#include "bpf_arena_common.h" +#include <bpf_arena_common.h> #ifndef __round_mask #define __round_mask(x, y) ((__typeof__(x))((y)-1)) diff --git a/tools/testing/selftests/bpf/bpf_arena_htab.h b/tools/testing/selftests/bpf/bpf_arena_htab.h index acc01a876668..d7ba86362d86 100644 --- a/tools/testing/selftests/bpf/bpf_arena_htab.h +++ b/tools/testing/selftests/bpf/bpf_arena_htab.h @@ -14,9 +14,8 @@ struct htab { htab_bucket_t *buckets; int n_buckets; }; -typedef struct htab __arena htab_t; -static inline htab_bucket_t *__select_bucket(htab_t *htab, __u32 hash) +static inline htab_bucket_t *__select_bucket(struct htab __arena *htab, __u32 hash) { htab_bucket_t *b = htab->buckets; @@ -24,7 +23,7 @@ static inline htab_bucket_t *__select_bucket(htab_t *htab, __u32 hash) return &b[hash & (htab->n_buckets - 1)]; } -static inline arena_list_head_t *select_bucket(htab_t *htab, __u32 hash) +static inline arena_list_head_t *select_bucket(struct htab __arena *htab, __u32 hash) { return &__select_bucket(htab, hash)->head; } @@ -53,7 +52,7 @@ static int htab_hash(int key) return key; } -__weak int htab_lookup_elem(htab_t *htab __arg_arena, int key) +__weak int htab_lookup_elem(struct htab __arena *htab, int key) { hashtab_elem_t *l_old; arena_list_head_t *head; @@ -66,7 +65,7 @@ __weak int htab_lookup_elem(htab_t *htab __arg_arena, int key) 
return 0; } -__weak int htab_update_elem(htab_t *htab __arg_arena, int key, int value) +__weak int htab_update_elem(struct htab __arena *htab, int key, int value) { hashtab_elem_t *l_new = NULL, *l_old; arena_list_head_t *head; @@ -90,7 +89,7 @@ __weak int htab_update_elem(htab_t *htab __arg_arena, int key, int value) return 0; } -void htab_init(htab_t *htab) +void htab_init(struct htab __arena *htab) { void __arena *buckets = bpf_arena_alloc_pages(&arena, NULL, 2, NUMA_NO_NODE, 0); diff --git a/tools/testing/selftests/bpf/bpf_arena_list.h b/tools/testing/selftests/bpf/bpf_arena_list.h index e16fa7d95fcf..1af2ffc27d9c 100644 --- a/tools/testing/selftests/bpf/bpf_arena_list.h +++ b/tools/testing/selftests/bpf/bpf_arena_list.h @@ -1,7 +1,7 @@ /* SPDX-License-Identifier: (GPL-2.0-only OR BSD-2-Clause) */ /* Copyright (c) 2024 Meta Platforms, Inc. and affiliates. */ #pragma once -#include "bpf_arena_common.h" +#include <bpf_arena_common.h> struct arena_list_node; diff --git a/tools/testing/selftests/bpf/bpf_arena_strsearch.h b/tools/testing/selftests/bpf/bpf_arena_strsearch.h index c1b6eaa905bb..10a70667c8bf 100644 --- a/tools/testing/selftests/bpf/bpf_arena_strsearch.h +++ b/tools/testing/selftests/bpf/bpf_arena_strsearch.h @@ -1,9 +1,9 @@ /* SPDX-License-Identifier: (GPL-2.0-only OR BSD-2-Clause) */ /* Copyright (c) 2025 Meta Platforms, Inc. and affiliates. */ #pragma once -#include "bpf_arena_common.h" +#include <bpf_arena_common.h> -__noinline int bpf_arena_strlen(const char __arena *s __arg_arena) +__noinline int bpf_arena_strlen(const char __arena *s) { const char __arena *sc; @@ -40,7 +40,7 @@ __noinline int bpf_arena_strlen(const char __arena *s __arg_arena) * * An opening bracket without a matching close is matched literally. 
*/ -__noinline bool glob_match(char const __arena *pat __arg_arena, char const __arena *str __arg_arena) +__noinline bool glob_match(char const __arena *pat, char const __arena *str) { /* * Backtrack to previous * on mismatch and retry starting one diff --git a/tools/testing/selftests/bpf/bpf_experimental.h b/tools/testing/selftests/bpf/bpf_experimental.h index 2234bd6bc9d3..67ff7882299e 100644 --- a/tools/testing/selftests/bpf/bpf_experimental.h +++ b/tools/testing/selftests/bpf/bpf_experimental.h @@ -5,6 +5,7 @@ #include <bpf/bpf_tracing.h> #include <bpf/bpf_helpers.h> #include <bpf/bpf_core_read.h> +#include <bpf_may_goto.h> #define __contains(name, node) __attribute__((btf_decl_tag("contains:" #name ":" #node))) @@ -204,89 +205,6 @@ l_true: \ }) #endif -/* - * Note that cond_break can only be portably used in the body of a breakable - * construct, whereas can_loop can be used anywhere. - */ -#ifdef __BPF_FEATURE_MAY_GOTO -#define can_loop \ - ({ __label__ l_break, l_continue; \ - bool ret = true; \ - asm volatile goto("may_goto %l[l_break]" \ - :::: l_break); \ - goto l_continue; \ - l_break: ret = false; \ - l_continue:; \ - ret; \ - }) - -#define __cond_break(expr) \ - ({ __label__ l_break, l_continue; \ - asm volatile goto("may_goto %l[l_break]" \ - :::: l_break); \ - goto l_continue; \ - l_break: expr; \ - l_continue:; \ - }) -#else -#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__ -#define can_loop \ - ({ __label__ l_break, l_continue; \ - bool ret = true; \ - asm volatile goto("1:.byte 0xe5; \ - .byte 0; \ - .long ((%l[l_break] - 1b - 8) / 8) & 0xffff; \ - .short 0" \ - :::: l_break); \ - goto l_continue; \ - l_break: ret = false; \ - l_continue:; \ - ret; \ - }) - -#define __cond_break(expr) \ - ({ __label__ l_break, l_continue; \ - asm volatile goto("1:.byte 0xe5; \ - .byte 0; \ - .long ((%l[l_break] - 1b - 8) / 8) & 0xffff; \ - .short 0" \ - :::: l_break); \ - goto l_continue; \ - l_break: expr; \ - l_continue:; \ - }) -#else -#define can_loop \ - ({ 
__label__ l_break, l_continue; \ - bool ret = true; \ - asm volatile goto("1:.byte 0xe5; \ - .byte 0; \ - .long (((%l[l_break] - 1b - 8) / 8) & 0xffff) << 16; \ - .short 0" \ - :::: l_break); \ - goto l_continue; \ - l_break: ret = false; \ - l_continue:; \ - ret; \ - }) - -#define __cond_break(expr) \ - ({ __label__ l_break, l_continue; \ - asm volatile goto("1:.byte 0xe5; \ - .byte 0; \ - .long (((%l[l_break] - 1b - 8) / 8) & 0xffff) << 16; \ - .short 0" \ - :::: l_break); \ - goto l_continue; \ - l_break: expr; \ - l_continue:; \ - }) -#endif -#endif - -#define cond_break __cond_break(break) -#define cond_break_label(label) __cond_break(goto label) - #ifndef bpf_nop_mov #define bpf_nop_mov(var) \ asm volatile("%[reg]=%[reg]"::[reg]"r"((short)var)) @@ -505,6 +423,8 @@ static inline int get_preempt_count(void) return bpf_get_current_task_btf()->thread_info.preempt_count; #elif defined(bpf_target_s390) return bpf_get_lowcore()->preempt_count; +#elif defined(bpf_target_loongarch) + return bpf_get_current_task_btf()->thread_info.preempt_count; #endif return 0; } @@ -515,6 +435,7 @@ static inline int get_preempt_count(void) * * arm64 * * powerpc64 * * s390x + * * loongarch */ static inline int bpf_in_interrupt(void) { @@ -536,6 +457,7 @@ static inline int bpf_in_interrupt(void) * * arm64 * * powerpc64 * * s390x + * * loongarch */ static inline int bpf_in_nmi(void) { @@ -548,6 +470,7 @@ static inline int bpf_in_nmi(void) * * arm64 * * powerpc64 * * s390x + * * loongarch */ static inline int bpf_in_hardirq(void) { @@ -560,6 +483,7 @@ static inline int bpf_in_hardirq(void) * * arm64 * * powerpc64 * * s390x + * * loongarch */ static inline int bpf_in_serving_softirq(void) { @@ -580,6 +504,7 @@ static inline int bpf_in_serving_softirq(void) * * arm64 * * powerpc64 * * s390x + * * loongarch */ static inline int bpf_in_task(void) { diff --git a/tools/testing/selftests/bpf/bpf_kfuncs.h b/tools/testing/selftests/bpf/bpf_kfuncs.h index 7dad01439391..ae71e9b69051 100644 --- 
a/tools/testing/selftests/bpf/bpf_kfuncs.h +++ b/tools/testing/selftests/bpf/bpf_kfuncs.h @@ -40,7 +40,7 @@ extern void *bpf_dynptr_slice(const struct bpf_dynptr *ptr, __u64 offset, extern void *bpf_dynptr_slice_rdwr(const struct bpf_dynptr *ptr, __u64 offset, void *buffer, __u64 buffer__szk) __ksym __weak; -extern int bpf_dynptr_adjust(const struct bpf_dynptr *ptr, __u64 start, __u64 end) __ksym __weak; +extern int bpf_dynptr_adjust(struct bpf_dynptr *ptr, __u64 start, __u64 end) __ksym __weak; extern bool bpf_dynptr_is_null(const struct bpf_dynptr *ptr) __ksym __weak; extern bool bpf_dynptr_is_rdonly(const struct bpf_dynptr *ptr) __ksym __weak; extern __u64 bpf_dynptr_size(const struct bpf_dynptr *ptr) __ksym __weak; @@ -70,13 +70,13 @@ extern void *bpf_rdonly_cast(const void *obj, __u32 btf_id) __ksym __weak; extern int bpf_get_file_xattr(struct file *file, const char *name, struct bpf_dynptr *value_ptr) __ksym; -extern int bpf_get_fsverity_digest(struct file *file, struct bpf_dynptr *digest_ptr) __ksym; +extern int bpf_get_fsverity_digest(struct file *file, const struct bpf_dynptr *digest_ptr) __ksym; extern struct bpf_key *bpf_lookup_user_key(__s32 serial, __u64 flags) __ksym; extern struct bpf_key *bpf_lookup_system_key(__u64 id) __ksym; extern void bpf_key_put(struct bpf_key *key) __ksym; -extern int bpf_verify_pkcs7_signature(struct bpf_dynptr *data_ptr, - struct bpf_dynptr *sig_ptr, +extern int bpf_verify_pkcs7_signature(const struct bpf_dynptr *data_ptr, + const struct bpf_dynptr *sig_ptr, struct bpf_key *trusted_keyring) __ksym; struct dentry; diff --git a/tools/testing/selftests/bpf/config b/tools/testing/selftests/bpf/config index 24855381290d..adb25146e88c 100644 --- a/tools/testing/selftests/bpf/config +++ b/tools/testing/selftests/bpf/config @@ -45,13 +45,16 @@ CONFIG_IPV6=y CONFIG_IPV6_FOU=y CONFIG_IPV6_FOU_TUNNEL=y CONFIG_IPV6_GRE=y +CONFIG_IPV6_IOAM6_LWTUNNEL=y CONFIG_IPV6_SEG6_BPF=y +CONFIG_IPV6_SEG6_LWTUNNEL=y CONFIG_IPV6_SIT=y 
CONFIG_IPV6_TUNNEL=y CONFIG_KEYS=y CONFIG_LIRC=y CONFIG_LIVEPATCH=y CONFIG_LWTUNNEL=y +CONFIG_LWTUNNEL_BPF=y CONFIG_MODULE_SIG=y CONFIG_MODULE_SRCVERSION_ALL=y CONFIG_MODULE_UNLOAD=y @@ -130,4 +133,5 @@ CONFIG_INFINIBAND=y CONFIG_SMC=y CONFIG_SMC_HS_CTRL_BPF=y CONFIG_DIBS=y -CONFIG_DIBS_LO=y
\ No newline at end of file +CONFIG_DIBS_LO=y +CONFIG_PM_WAKELOCKS=y diff --git a/tools/testing/selftests/bpf/default.profraw b/tools/testing/selftests/bpf/default.profraw Binary files differ new file mode 100644 index 000000000000..e865e87829f8 --- /dev/null +++ b/tools/testing/selftests/bpf/default.profraw diff --git a/tools/testing/selftests/bpf/jit_disasm_helpers.c b/tools/testing/selftests/bpf/jit_disasm_helpers.c index 364c557c5115..3558fe10e28c 100644 --- a/tools/testing/selftests/bpf/jit_disasm_helpers.c +++ b/tools/testing/selftests/bpf/jit_disasm_helpers.c @@ -96,10 +96,19 @@ static int disasm_one_func(FILE *text_out, uint8_t *image, __u32 len) __u32 *label_pc, pc; int i, cnt, err = 0; char buf[64]; + char *cpu, *features; triple = LLVMGetDefaultTargetTriple(); - ctx = LLVMCreateDisasm(triple, &labels, 0, NULL, lookup_symbol); - if (!ASSERT_OK_PTR(ctx, "LLVMCreateDisasm")) { + + cpu = LLVMGetHostCPUName(); + features = LLVMGetHostCPUFeatures(); + + ctx = LLVMCreateDisasmCPUFeatures(triple, cpu, features, &labels, 0, NULL, lookup_symbol); + + LLVMDisposeMessage(cpu); + LLVMDisposeMessage(features); + + if (!ASSERT_OK_PTR(ctx, "LLVMCreateDisasmCPUFeatures")) { err = -EINVAL; goto out; } diff --git a/tools/testing/selftests/bpf/libarena/Makefile b/tools/testing/selftests/bpf/libarena/Makefile new file mode 100644 index 000000000000..5e2ab514805e --- /dev/null +++ b/tools/testing/selftests/bpf/libarena/Makefile @@ -0,0 +1,92 @@ +# SPDX-License-Identifier: LGPL-2.1 OR BSD-2-Clause +# Copyright (c) 2026 Meta Platforms, Inc. and affiliates.
+ +.PHONY: clean + +# Defaults for standalone builds + +CLANG ?= clang +BPFTOOL ?= bpftool +LDLIBS ?= -lbpf -lelf -lz -lrt -lpthread -lzstd + +ifeq ($(V),1) +Q = +msg = +else +Q ?= @ +msg = @printf ' %-8s%s %s%s\n' "$(1)" "$(if $(2), [$(2)])" "$(notdir $(3))" "$(if $(4), $(4))"; +endif + +IS_LITTLE_ENDIAN = $(shell $(CC) -dM -E - </dev/null | \ + grep 'define __BYTE_ORDER__ __ORDER_LITTLE_ENDIAN__') +BPF_TARGET_ENDIAN ?= $(if $(IS_LITTLE_ENDIAN),--target=bpfel,--target=bpfeb) + +LIBARENA=$(abspath .) +BPFDIR=$(abspath $(LIBARENA)/..) + +INCLUDE_DIR ?= $(BPFDIR)/tools/include +LIBBPF_INCLUDE ?= $(INCLUDE_DIR) + +# Scan src/ and selftests/ to generate the final binaries +LIBARENA_SOURCES = $(wildcard $(LIBARENA)/src/*.bpf.c) $(wildcard $(LIBARENA)/selftests/*.bpf.c) +LIBARENA_OBJECTS = $(notdir $(LIBARENA_SOURCES:.bpf.c=.bpf.o)) +LIBARENA_OBJECTS_ASAN = $(notdir $(LIBARENA_SOURCES:.bpf.c=_asan.bpf.o)) + +INCLUDES = -I$(LIBARENA)/include -I$(BPFDIR) +ifneq ($(INCLUDE_DIR),) +INCLUDES += -I$(INCLUDE_DIR) +endif +ifneq ($(LIBBPF_INCLUDE),) +INCLUDES += -I$(LIBBPF_INCLUDE) +endif + +ASAN_FLAGS = -fsanitize=kernel-address -fno-stack-protector -fno-builtin +ASAN_FLAGS += -mllvm -asan-instrument-address-spaces=1 -mllvm -asan-shadow-addr-space=1 +ASAN_FLAGS += -mllvm -asan-use-stack-safety=0 -mllvm -asan-stack=0 +ASAN_FLAGS += -mllvm -asan-kernel=1 +ASAN_FLAGS += -mllvm -asan-constructor-kind=none +ASAN_FLAGS += -mllvm -asan-destructor-kind=none + +# ENABLE_ATOMICS_TESTS required because we use arena spinlocks +override BPF_CFLAGS += -DENABLE_ATOMICS_TESTS +override BPF_CFLAGS += -O2 -g +override BPF_CFLAGS += -Wno-incompatible-pointer-types-discards-qualifiers +# Required for suppressing harmless vmlinux.h-related warnings. 
+override BPF_CFLAGS += -Wno-missing-declarations +override BPF_CFLAGS += $(INCLUDES) + +CFLAGS = -O2 -no-pie +CFLAGS += $(INCLUDES) + +vpath %.bpf.c $(LIBARENA)/src $(LIBARENA)/selftests +vpath %.c $(LIBARENA)/src $(LIBARENA)/selftests + +skeletons: libarena.skel.h libarena_asan.skel.h +.PHONY: skeletons + +libarena_asan.skel.h: libarena_asan.bpf.o + $(call msg,GEN-SKEL,libarena,$@) + $(Q)$(BPFTOOL) gen skeleton $< name "libarena_asan" > $@ + +libarena.skel.h: libarena.bpf.o + $(call msg,GEN-SKEL,libarena,$@) + $(Q)$(BPFTOOL) gen skeleton $< name "libarena" > $@ + +libarena_asan.bpf.o: $(LIBARENA_OBJECTS_ASAN) + $(call msg,GEN-OBJ,libarena,$@) + $(Q)$(BPFTOOL) gen object $@ $^ + +libarena.bpf.o: $(LIBARENA_OBJECTS) + $(call msg,GEN-OBJ,libarena,$@) + $(Q)$(BPFTOOL) gen object $@ $^ + +%_asan.bpf.o: %.bpf.c + $(call msg,CLNG-BPF,libarena,$@) + $(Q)$(CLANG) $(BPF_CFLAGS) $(ASAN_FLAGS) -DBPF_ARENA_ASAN $(BPF_TARGET_ENDIAN) -c $< -o $@ + +%.bpf.o: %.bpf.c + $(call msg,CLNG-BPF,libarena,$@) + $(Q)$(CLANG) $(BPF_CFLAGS) $(BPF_TARGET_ENDIAN) -c $< -o $@ + +clean: + $(Q)rm -f *.skel.h *.bpf.o *.linked*.o diff --git a/tools/testing/selftests/bpf/bpf_arena_common.h b/tools/testing/selftests/bpf/libarena/include/bpf_arena_common.h index 16f8ce832004..82aafe879fae 100644 --- a/tools/testing/selftests/bpf/bpf_arena_common.h +++ b/tools/testing/selftests/bpf/libarena/include/bpf_arena_common.h @@ -33,12 +33,12 @@ #endif #if defined(__BPF_FEATURE_ADDR_SPACE_CAST) && !defined(BPF_ARENA_FORCE_ASM) -#define __arena __attribute__((address_space(1))) +#define __arena __attribute__((address_space(1))) __attribute__((btf_type_tag("arena"))) #define __arena_global __attribute__((address_space(1))) #define cast_kern(ptr) /* nop for bpf prog. emitted by LLVM */ #define cast_user(ptr) /* nop for bpf prog. 
emitted by LLVM */ #else -#define __arena +#define __arena __attribute__((btf_type_tag("arena"))) #define __arena_global SEC(".addr_space.1") #define cast_kern(ptr) bpf_addr_space_cast(ptr, 0, 1) #define cast_user(ptr) bpf_addr_space_cast(ptr, 1, 0) @@ -54,7 +54,6 @@ void bpf_arena_free_pages(void *map, void __arena *ptr, __u32 page_cnt) __ksym _ #else /* when compiled as user space code */ #define __arena -#define __arg_arena #define cast_kern(ptr) /* nop for user space */ #define cast_user(ptr) /* nop for user space */ __weak char arena[1]; diff --git a/tools/testing/selftests/bpf/progs/bpf_arena_spin_lock.h b/tools/testing/selftests/bpf/libarena/include/bpf_arena_spin_lock.h index f90531cf3ee5..ae6b72d15bb6 100644 --- a/tools/testing/selftests/bpf/progs/bpf_arena_spin_lock.h +++ b/tools/testing/selftests/bpf/libarena/include/bpf_arena_spin_lock.h @@ -5,7 +5,7 @@ #include <vmlinux.h> #include <bpf/bpf_helpers.h> -#include "bpf_atomic.h" +#include <bpf_atomic.h> #define arch_mcs_spin_lock_contended_label(l, label) smp_cond_load_acquire_label(l, VAL, label) #define arch_mcs_spin_unlock_contended(l) smp_store_release((l), 1) @@ -16,10 +16,6 @@ #define EOPNOTSUPP 95 #define ETIMEDOUT 110 -#ifndef __arena -#define __arena __attribute__((address_space(1))) -#endif - extern unsigned long CONFIG_NR_CPUS __kconfig; /* @@ -107,7 +103,12 @@ struct arena_qnode { #define _Q_LOCKED_VAL (1U << _Q_LOCKED_OFFSET) #define _Q_PENDING_VAL (1U << _Q_PENDING_OFFSET) -struct arena_qnode __arena qnodes[_Q_MAX_CPUS][_Q_MAX_NODES]; +/* + * The qnodes are marked __weak so we can define them in the header + * while still ensuring all compilation units use the same struct + * instance. 
+ */ +struct arena_qnode __weak __arena __hidden qnodes[_Q_MAX_CPUS][_Q_MAX_NODES]; static inline u32 encode_tail(int cpu, int idx) { @@ -240,8 +241,8 @@ static __always_inline int arena_spin_trylock(arena_spinlock_t __arena *lock) return likely(atomic_try_cmpxchg_acquire(&lock->val, &val, _Q_LOCKED_VAL)); } -__noinline -int arena_spin_lock_slowpath(arena_spinlock_t __arena __arg_arena *lock, u32 val) +__noinline __weak +int arena_spin_lock_slowpath(arena_spinlock_t __arena *lock, u32 val) { struct arena_mcs_spinlock __arena *prev, *next, *node0, *node; int ret = -ETIMEDOUT; diff --git a/tools/testing/selftests/bpf/bpf_atomic.h b/tools/testing/selftests/bpf/libarena/include/bpf_atomic.h index c550e5711967..b7b230431929 100644 --- a/tools/testing/selftests/bpf/bpf_atomic.h +++ b/tools/testing/selftests/bpf/libarena/include/bpf_atomic.h @@ -5,7 +5,7 @@ #include <vmlinux.h> #include <bpf/bpf_helpers.h> -#include "bpf_experimental.h" +#include <bpf_may_goto.h> extern bool CONFIG_X86_64 __kconfig __weak; @@ -42,7 +42,9 @@ extern bool CONFIG_X86_64 __kconfig __weak; #define READ_ONCE(x) (*(volatile typeof(x) *)&(x)) +#ifndef WRITE_ONCE #define WRITE_ONCE(x, val) ((*(volatile typeof(x) *)&(x)) = (val)) +#endif #define cmpxchg(p, old, new) __sync_val_compare_and_swap((p), old, new) diff --git a/tools/testing/selftests/bpf/libarena/include/bpf_may_goto.h b/tools/testing/selftests/bpf/libarena/include/bpf_may_goto.h new file mode 100644 index 000000000000..9ba90689d6ba --- /dev/null +++ b/tools/testing/selftests/bpf/libarena/include/bpf_may_goto.h @@ -0,0 +1,84 @@ +#pragma once + +/* + * Note that cond_break can only be portably used in the body of a breakable + * construct, whereas can_loop can be used anywhere. 
+ */ +#ifdef __BPF_FEATURE_MAY_GOTO +#define can_loop \ + ({ __label__ l_break, l_continue; \ + bool ret = true; \ + asm volatile goto("may_goto %l[l_break]" \ + :::: l_break); \ + goto l_continue; \ + l_break: ret = false; \ + l_continue:; \ + ret; \ + }) + +#define __cond_break(expr) \ + ({ __label__ l_break, l_continue; \ + asm volatile goto("may_goto %l[l_break]" \ + :::: l_break); \ + goto l_continue; \ + l_break: expr; \ + l_continue:; \ + }) +#else +#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__ +#define can_loop \ + ({ __label__ l_break, l_continue; \ + bool ret = true; \ + asm volatile goto("1:.byte 0xe5; \ + .byte 0; \ + .long ((%l[l_break] - 1b - 8) / 8) & 0xffff; \ + .short 0" \ + :::: l_break); \ + goto l_continue; \ + l_break: ret = false; \ + l_continue:; \ + ret; \ + }) + +#define __cond_break(expr) \ + ({ __label__ l_break, l_continue; \ + asm volatile goto("1:.byte 0xe5; \ + .byte 0; \ + .long ((%l[l_break] - 1b - 8) / 8) & 0xffff; \ + .short 0" \ + :::: l_break); \ + goto l_continue; \ + l_break: expr; \ + l_continue:; \ + }) +#else +#define can_loop \ + ({ __label__ l_break, l_continue; \ + bool ret = true; \ + asm volatile goto("1:.byte 0xe5; \ + .byte 0; \ + .long (((%l[l_break] - 1b - 8) / 8) & 0xffff) << 16; \ + .short 0" \ + :::: l_break); \ + goto l_continue; \ + l_break: ret = false; \ + l_continue:; \ + ret; \ + }) + +#define __cond_break(expr) \ + ({ __label__ l_break, l_continue; \ + asm volatile goto("1:.byte 0xe5; \ + .byte 0; \ + .long (((%l[l_break] - 1b - 8) / 8) & 0xffff) << 16; \ + .short 0" \ + :::: l_break); \ + goto l_continue; \ + l_break: expr; \ + l_continue:; \ + }) +#endif +#endif + +#define cond_break __cond_break(break) +#define cond_break_label(label) __cond_break(goto label) diff --git a/tools/testing/selftests/bpf/libarena/include/libarena/asan.h b/tools/testing/selftests/bpf/libarena/include/libarena/asan.h new file mode 100644 index 000000000000..900267159292 --- /dev/null +++ 
b/tools/testing/selftests/bpf/libarena/include/libarena/asan.h @@ -0,0 +1,101 @@ +// SPDX-License-Identifier: LGPL-2.1 OR BSD-2-Clause +/* Copyright (c) 2026 Meta Platforms, Inc. and affiliates. */ +#pragma once + +struct asan_init_args { + u64 arena_all_pages; + u64 arena_globals_pages; +}; + +int asan_init(struct asan_init_args *args); + +extern volatile u64 __asan_shadow_memory_dynamic_address; +extern volatile u32 asan_reported; +extern volatile bool asan_inited; +extern volatile bool asan_report_once; + +#ifdef __BPF__ + +#define ASAN_SHADOW_SHIFT 3 +#define ASAN_SHADOW_SCALE (1ULL << ASAN_SHADOW_SHIFT) +#define ASAN_GRANULE_MASK ((1ULL << ASAN_SHADOW_SHIFT) - 1) +#define ASAN_GRANULE(addr) ((s8)((u32)(u64)((addr)) & ASAN_GRANULE_MASK)) + +#define __noasan __attribute__((no_sanitize("address"))) + +#ifdef BPF_ARENA_ASAN + +static inline +s8 __arena *mem_to_shadow(void __arena *addr) +{ + return (s8 __arena *)(((u32)(u64)addr >> ASAN_SHADOW_SHIFT) + + __asan_shadow_memory_dynamic_address); +} + +__weak __noasan +bool asan_ready(void) +{ + return __asan_shadow_memory_dynamic_address; +} + +int asan_poison(void __arena *addr, s8 val, size_t size); +int asan_unpoison(void __arena *addr, size_t size); +bool asan_shadow_set(void __arena *addr); + +/* + * Dummy calls to ensure the ASAN runtime's BTF information is present + * in every object file when compiling the runtime and local BPF code + * separately. 
The runtime calls are injected into the LLVM IR file + */ +#define DECLARE_ASAN_LOAD_STORE_SIZE(size) \ + void __asan_store##size(intptr_t addr); \ + void __asan_store##size##_noabort(intptr_t addr); \ + void __asan_load##size(intptr_t addr); \ + void __asan_load##size##_noabort(intptr_t addr); \ + void __asan_report_store##size(intptr_t addr); \ + void __asan_report_store##size##_noabort(intptr_t addr); \ + void __asan_report_load##size(intptr_t addr); \ + void __asan_report_load##size##_noabort(intptr_t addr); + +DECLARE_ASAN_LOAD_STORE_SIZE(1); +DECLARE_ASAN_LOAD_STORE_SIZE(2); +DECLARE_ASAN_LOAD_STORE_SIZE(4); +DECLARE_ASAN_LOAD_STORE_SIZE(8); + +void __asan_storeN(intptr_t addr, ssize_t size); +void __asan_storeN_noabort(intptr_t addr, ssize_t size); +void __asan_loadN(intptr_t addr, ssize_t size); +void __asan_loadN_noabort(intptr_t addr, ssize_t size); + +/* + * Force LLVM to emit BTF information for the stubs, + * because the ASAN pass in LLVM by itself doesn't. + */ +#define ASAN_LOAD_STORE_SIZE(size) \ + __asan_store##size, \ + __asan_store##size##_noabort, \ + __asan_load##size, \ + __asan_load##size##_noabort, \ + __asan_report_store##size, \ + __asan_report_store##size##_noabort, \ + __asan_report_load##size, \ + __asan_report_load##size##_noabort + +__attribute__((used)) +static void (*__asan_btf_anchors[])(intptr_t) = { + ASAN_LOAD_STORE_SIZE(1), + ASAN_LOAD_STORE_SIZE(2), + ASAN_LOAD_STORE_SIZE(4), + ASAN_LOAD_STORE_SIZE(8), +}; + +#else /* BPF_ARENA_ASAN */ + +static inline int asan_poison(void __arena *addr, s8 val, size_t size) { return 0; } +static inline int asan_unpoison(void __arena *addr, size_t size) { return 0; } +static inline bool asan_shadow_set(void __arena *addr) { return 0; } +__weak bool asan_ready(void) { return true; } + +#endif /* BPF_ARENA_ASAN */ + +#endif /* __BPF__ */ diff --git a/tools/testing/selftests/bpf/libarena/include/libarena/buddy.h b/tools/testing/selftests/bpf/libarena/include/libarena/buddy.h new file mode 100644 
index 000000000000..528c69a1f38e --- /dev/null +++ b/tools/testing/selftests/bpf/libarena/include/libarena/buddy.h @@ -0,0 +1,81 @@ +// SPDX-License-Identifier: LGPL-2.1 OR BSD-2-Clause +/* Copyright (c) 2026 Meta Platforms, Inc. and affiliates. */ +#pragma once + +enum buddy_consts { + /* + * Minimum allocation is 1 << BUDDY_MIN_ALLOC_SHIFT. + * Larger sizes increase internal fragmentation, but smaller + * sizes increase the space overhead of the block metadata. + */ + BUDDY_MIN_ALLOC_SHIFT = 4, + BUDDY_MIN_ALLOC_BYTES = 1 << BUDDY_MIN_ALLOC_SHIFT, + + /* + * How many orders the buddy allocator can serve. Minimum block + * size is 1 << BUDDY_MIN_ALLOC_SHIFT, maximum block size is + * 1 << (BUDDY_MIN_ALLOC_SHIFT + BUDDY_CHUNK_NUM_ORDERS - 1): + * Each block has size 1 << BUDDY_MIN_ALLOC_SHIFT, and the + * allocation orders are in [0, BUDDY_CHUNK_NUM_ORDERS). + * We keep two blocks of the maximum size to retain the + * property in the code that all blocks have a buddy. + * Higher values increase the maximum allocation size, + * but also the size of the metadata for each block. + */ + BUDDY_CHUNK_NUM_ORDERS = 1 << 4, + BUDDY_CHUNK_BYTES = BUDDY_MIN_ALLOC_BYTES << (BUDDY_CHUNK_NUM_ORDERS), + + /* Offset of the buddy header within a free block, see buddy.bpf.c for details */ + BUDDY_HEADER_OFF = 8, + + /* The maximum number of blocks a chunk may have to track. */ + BUDDY_CHUNK_ITEMS = 1 << (BUDDY_CHUNK_NUM_ORDERS), + BUDDY_CHUNK_OFFSET_MASK = BUDDY_CHUNK_BYTES - 1, + + /* + * Alignment for chunk allocations based on bpf_arena_alloc_pages. + * The arena allocation kfunc does not have an alignment argument, + * but that is required for all block calculations in the chunk to + * work. + */ + BUDDY_VADDR_OFFSET = BUDDY_CHUNK_BYTES, + + /* Total arena virtual address space the allocator can consume. */ + BUDDY_VADDR_SIZE = BUDDY_CHUNK_BYTES << 10 +}; + +struct buddy_header { + u32 prev_index; /* "Pointer" to the previous available allocation of the same size. 
*/ + u32 next_index; /* Same for the next allocation. */ +}; + +/* + * We bring memory into the allocator 1 MiB at a time. + */ +struct buddy_chunk { + /* The order of the current allocation for an item. 4 bits per order. */ + u8 orders[BUDDY_CHUNK_ITEMS / 2]; + /* + * Bit to denote whether chunk is allocated. Size of the allocated/free + * chunk is found from the orders array. + */ + u8 allocated[BUDDY_CHUNK_ITEMS / 8]; + /* Freelists for O(1) allocation. */ + u64 freelists[BUDDY_CHUNK_NUM_ORDERS]; + struct buddy_chunk __arena *next; +}; + +struct buddy { + struct buddy_chunk __arena *first_chunk; /* Pointer to the chunk linked list. */ + arena_spinlock_t lock; /* Allocator lock */ + u64 vaddr; /* Allocation into reserved vaddr */ +}; + +#ifdef __BPF__ + +int buddy_init(struct buddy __arena *buddy); +int buddy_destroy(struct buddy __arena *buddy); +int buddy_free(struct buddy __arena *buddy, void __arena *free); +void __arena *buddy_alloc(struct buddy __arena *buddy, size_t size); + +#endif /* __BPF__ */ diff --git a/tools/testing/selftests/bpf/libarena/include/libarena/common.h b/tools/testing/selftests/bpf/libarena/include/libarena/common.h new file mode 100644 index 000000000000..a3eb1641ac36 --- /dev/null +++ b/tools/testing/selftests/bpf/libarena/include/libarena/common.h @@ -0,0 +1,93 @@ +// SPDX-License-Identifier: LGPL-2.1 OR BSD-2-Clause +/* Copyright (c) 2026 Meta Platforms, Inc. and affiliates. */ +#pragma once + +#ifdef __BPF__ + +#include <vmlinux.h> + +#include <bpf_arena_common.h> +#include <bpf_arena_spin_lock.h> + +#include <asm-generic/errno.h> + +#ifndef __BPF_FEATURE_ADDR_SPACE_CAST +#error "Arena allocators require bpf_addr_space_cast feature" +#endif + +#define arena_stdout(fmt, ...) bpf_stream_printk(1, (fmt), ##__VA_ARGS__) +#define arena_stderr(fmt, ...) bpf_stream_printk(2, (fmt), ##__VA_ARGS__) + +#ifndef __maybe_unused +#define __maybe_unused __attribute__((__unused__)) +#endif + +#define private(name) SEC(".data."
#name) __hidden __attribute__((aligned(8))) + +#define ARENA_PAGES (1UL << (32 - __builtin_ffs(__PAGE_SIZE) + 1)) + +struct { + __uint(type, BPF_MAP_TYPE_ARENA); + __uint(map_flags, BPF_F_MMAPABLE); + __uint(max_entries, ARENA_PAGES); /* number of pages */ +#if defined(__TARGET_ARCH_arm64) || defined(__aarch64__) + __ulong(map_extra, (1ull << 32)); /* start of mmap() region */ +#else + __ulong(map_extra, (1ull << 44)); /* start of mmap() region */ +#endif +} arena __weak SEC(".maps"); + +/* + * This is a variable used to aid verification. The may_goto directive + * permits open-coded for loops, but requires that the index variable is + * imprecise. To force the variable to be imprecise, initialize it with + * the opaque volatile variable 0 instead of the constant 0. + */ +extern const volatile u32 zero; +extern volatile u64 asan_violated; + +int arena_fls(__u64 word); + +void __arena *arena_malloc(size_t size); +void arena_free(void __arena *ptr); + +/* + * The verifier associates arenas with programs by checking LD.IMM + * instruction operands for an arena and populating the program state + * with the first instance it finds. This requires accessing our global + * arena variable, but subprogs do not necessarily do so while still + * using pointers from that arena. Insert an LD.IMM instruction to + * access the arena and help the verifier. + */ +#define arena_subprog_init() do { asm volatile ("" :: "r"(&arena)); } while (0) + +#else /* ! __BPF__ */ + +#include <stdint.h> + +#define __arena + +typedef uint8_t u8; +typedef uint16_t u16; +typedef uint32_t u32; +typedef uint64_t u64; +typedef int8_t s8; +typedef int16_t s16; +typedef int32_t s32; +typedef int64_t s64; + +/* Dummy "definition" for userspace. */ +#define arena_spinlock_t int + +#endif /* __BPF__ */ + +struct arena_get_info_args { + void __arena *arena_base; +}; + +struct arena_alloc_reserve_args { + u64 nr_pages; +}; + +/* Reasonable default number of pages reserved by arena_alloc_reserve. 
*/ +#define ARENA_RESERVE_PAGES_DFL (8) diff --git a/tools/testing/selftests/bpf/libarena/include/libarena/rbtree.h b/tools/testing/selftests/bpf/libarena/include/libarena/rbtree.h new file mode 100644 index 000000000000..486428911d96 --- /dev/null +++ b/tools/testing/selftests/bpf/libarena/include/libarena/rbtree.h @@ -0,0 +1,83 @@ +/* SPDX-License-Identifier: LGPL-2.1 OR BSD-2-Clause */ + +#pragma once + +#define RB_MAXLVL_PRINT (16) + +struct rbnode; + +struct rbnode { + struct rbnode __arena *parent; + union { + struct { + struct rbnode __arena *left; + struct rbnode __arena *right; + }; + + struct rbnode __arena *child[2]; + }; + uint64_t key; + /* Used as a linked list or to store KV pairs. */ + union { + struct rbnode __arena *next; + uint64_t value; + }; + bool is_red; +}; + +/* + * Does the rbtree allocate its own nodes, or do they get + * allocated by the caller? + */ +enum rbtree_alloc { + RB_ALLOC, + RB_NOALLOC, +}; + +/* + * Specify the behavior of rbtree insertions when the key is + * already present in the tree. + * + * RB_DEFAULT: Default behavior, reject the new insert. + * + * RB_UPDATE: Update the existing value in the rbtree. + * This updates the node itself, not just the value in + * the existing node. + * + * RB_DUPLICATE: Allow nodes with identical keys in the rbtree. + * Finding/popping/removing a key acts on any of the nodes + * with the appropriate key - there is no ordering by time + * of insertion. 
+ */ +enum rbtree_insert_mode { + RB_DEFAULT, + RB_UPDATE, + RB_DUPLICATE, +}; + +struct rbtree { + struct rbnode __arena *root; + enum rbtree_alloc alloc; + enum rbtree_insert_mode insert; +}; + +#ifdef __BPF__ +struct rbtree __arena *rb_create(enum rbtree_alloc alloc, enum rbtree_insert_mode insert); + +int rb_destroy(struct rbtree __arena *rbtree); +int rb_insert(struct rbtree __arena *rbtree, u64 key, u64 value); +int rb_remove(struct rbtree __arena *rbtree, u64 key); +int rb_find(struct rbtree __arena *rbtree, u64 key, u64 *value); +int rb_print(struct rbtree __arena *rbtree); +int rb_least(struct rbtree __arena *rbtree, u64 *key, u64 *value); +int rb_pop(struct rbtree __arena *rbtree, u64 *key, u64 *value); + +int rb_insert_node(struct rbtree __arena *rbtree, struct rbnode __arena *node); +int rb_remove_node(struct rbtree __arena *rbtree, struct rbnode __arena *node); + +struct rbnode __arena *rb_node_alloc(u64 key, u64 value); +void rb_node_free(struct rbnode __arena *rbnode); + +int rb_integrity_check(struct rbtree __arena *rbtree); + +#endif /* __BPF__ */ diff --git a/tools/testing/selftests/bpf/libarena/include/libarena/spmc.h b/tools/testing/selftests/bpf/libarena/include/libarena/spmc.h new file mode 100644 index 000000000000..75611276ce13 --- /dev/null +++ b/tools/testing/selftests/bpf/libarena/include/libarena/spmc.h @@ -0,0 +1,27 @@ +/* SPDX-License-Identifier: LGPL-2.1 OR BSD-2-Clause */ + +#pragma once + +struct spmc_arr; + +#define SPMC_ARR_BASESZ 128 +#define SPMC_ARR_ORDERS 10 + +struct spmc_arr { + u64 __arena *data; + u64 order; +}; + +struct spmc { + volatile struct spmc_arr __arena *cur; + volatile u64 top; + volatile u64 bottom; + struct spmc_arr arr[SPMC_ARR_ORDERS]; +}; + +int spmc_owned_add(struct spmc __arena *spmc, u64 val); +int spmc_owned_remove(struct spmc __arena *spmc, u64 *val); +int spmc_steal(struct spmc __arena *spmc, u64 *val); + +struct spmc __arena *spmc_create(void); +int spmc_destroy(struct spmc __arena *spmc); diff --git 
a/tools/testing/selftests/bpf/libarena/include/libarena/userspace.h b/tools/testing/selftests/bpf/libarena/include/libarena/userspace.h new file mode 100644 index 000000000000..fc27a4bcf5d7 --- /dev/null +++ b/tools/testing/selftests/bpf/libarena/include/libarena/userspace.h @@ -0,0 +1,138 @@ +// SPDX-License-Identifier: LGPL-2.1 OR BSD-2-Clause +/* Copyright (c) 2026 Meta Platforms, Inc. and affiliates. */ +#pragma once + +#include <errno.h> +#include <stdlib.h> +#include <unistd.h> +#include <sys/mman.h> + +#include <bpf/libbpf.h> +#include <bpf/bpf.h> + +static inline int libarena_run_prog(int prog_fd) +{ + LIBBPF_OPTS(bpf_test_run_opts, opts); + int ret; + + ret = bpf_prog_test_run_opts(prog_fd, &opts); + if (ret) + return ret; + + return opts.retval; +} + +static inline bool libarena_is_test_prog(const char *name) +{ + return strstr(name, "test_") == name; +} + +static inline bool libarena_is_asan_test_prog(const char *name) +{ + return strstr(name, "asan_test") == name; +} + +static inline bool libarena_is_parallel_test_prog(const char *name) +{ + return strstr(name, "parallel_test") == name; +} + + +static inline int libarena_run_prog_args(int prog_fd, void *args, size_t argsize) +{ + LIBBPF_OPTS(bpf_test_run_opts, opts); + int ret; + + opts.ctx_in = args; + opts.ctx_size_in = argsize; + + ret = bpf_prog_test_run_opts(prog_fd, &opts); + + return ret ?: opts.retval; +} + +static inline int libarena_get_arena_base(int arena_get_info_fd, + void **arena_base) +{ + LIBBPF_OPTS(bpf_test_run_opts, opts); + struct arena_get_info_args args = { .arena_base = NULL }; + int ret; + + opts.ctx_in = &args; + opts.ctx_size_in = sizeof(args); + + ret = bpf_prog_test_run_opts(arena_get_info_fd, &opts); + if (ret) + return ret; + if (opts.retval) + return opts.retval; + + *arena_base = args.arena_base; + return 0; +} + +static inline int libarena_get_globals_pages(int arena_get_globals_fd, + size_t arena_all_pages, + u64 *globals_pages) +{ + size_t pgsize = 
sysconf(_SC_PAGESIZE); + void *arena_base; + ssize_t i; + u8 *vec; + int ret; + + ret = libarena_get_arena_base(arena_get_globals_fd, &arena_base); + if (ret) + return ret; + + if (!arena_base) + return -EINVAL; + + vec = calloc(arena_all_pages, sizeof(*vec)); + if (!vec) + return -ENOMEM; + + if (mincore(arena_base, arena_all_pages * pgsize, vec) < 0) { + ret = -errno; + free(vec); + return ret; + } + + *globals_pages = 0; + for (i = arena_all_pages - 1; i >= 0; i--) { + if (!(vec[i] & 0x1)) + break; + *globals_pages += 1; + } + + free(vec); + return 0; +} + +static inline int libarena_asan_init(int arena_asan_init_fd, + int asan_init_fd, + size_t arena_all_pages) +{ + LIBBPF_OPTS(bpf_test_run_opts, opts); + struct asan_init_args args; + u64 globals_pages; + int ret; + + ret = libarena_get_globals_pages(arena_asan_init_fd, + arena_all_pages, &globals_pages); + if (ret) + return ret; + + args = (struct asan_init_args){ + .arena_all_pages = arena_all_pages, + .arena_globals_pages = globals_pages, + }; + + opts.ctx_in = &args; + opts.ctx_size_in = sizeof(args); + + ret = bpf_prog_test_run_opts(asan_init_fd, &opts); + if (ret) + return ret; + return opts.retval; +} diff --git a/tools/testing/selftests/bpf/libarena/selftests/st_asan_buddy.bpf.c b/tools/testing/selftests/bpf/libarena/selftests/st_asan_buddy.bpf.c new file mode 100644 index 000000000000..686caba2c643 --- /dev/null +++ b/tools/testing/selftests/bpf/libarena/selftests/st_asan_buddy.bpf.c @@ -0,0 +1,258 @@ +// SPDX-License-Identifier: LGPL-2.1 OR BSD-2-Clause +/* Copyright (c) 2026 Meta Platforms, Inc. and affiliates. */ + +#include <libarena/common.h> +#include <libarena/asan.h> +#include <libarena/buddy.h> + +/* Required for parsing the ASAN call stacks. 
*/ +#include "test_progs_compat.h" + +extern struct buddy __arena buddy; + +#ifdef BPF_ARENA_ASAN + +#include "st_asan_common.h" + +static __always_inline int asan_test_buddy_oob_single(size_t alloc_size) +{ + u8 __arena *mem; + int ret, i; + + ret = asan_validate(); + if (ret < 0) + return ret; + + mem = buddy_alloc(&buddy, alloc_size); + if (!mem) { + arena_stdout("buddy_alloc failed for size %lu", alloc_size); + return -ENOMEM; + } + + ret = asan_validate(); + if (ret < 0) + return ret; + + for (i = zero; i < alloc_size && can_loop; i++) { + mem[i] = 0xba; + ret = asan_validate_addr(false, &mem[i]); + if (ret < 0) + return ret; + } + + mem[alloc_size] = 0xba; + ret = asan_validate_addr(true, &mem[alloc_size]); + if (ret < 0) + return ret; + + buddy_free(&buddy, mem); + + return 0; +} + +/* + * Factored out because asan_validate_addr is complex enough to cause + * verification failures if verified with the rest of asan_test_buddy_uaf_single. + */ +__weak int asan_test_buddy_byte(u8 __arena *mem, int i, bool freed) +{ + int ret; + + /* The header in freed blocks doesn't get poisoned. 
*/ + if (freed && BUDDY_HEADER_OFF <= i && + i < BUDDY_HEADER_OFF + sizeof(struct buddy_header)) + return 0; + + mem[i] = 0xba; + ret = asan_validate_addr(freed, &mem[i]); + if (ret < 0) + return ret; + + return 0; +} + +__weak int asan_test_buddy_uaf_single(size_t alloc_size) +{ + u8 __arena *mem; + int ret; + int i; + + mem = buddy_alloc(&buddy, alloc_size); + if (!mem) { + arena_stdout("buddy_alloc failed for size %lu", alloc_size); + return -ENOMEM; + } + + ret = asan_validate(); + if (ret < 0) + return ret; + + for (i = zero; i < alloc_size && can_loop; i++) { + ret = asan_test_buddy_byte(mem, i, false); + if (ret) + return ret; + } + + ret = asan_validate(); + if (ret < 0) + return ret; + + buddy_free(&buddy, mem); + + for (i = zero; i < alloc_size && can_loop; i++) { + ret = asan_test_buddy_byte(mem, i, true); + if (ret) + return ret; + } + + return 0; +} + +struct buddy_blob { + volatile u8 mem[48]; + u8 oob; +}; + +static __always_inline int asan_test_buddy_blob_single(void) +{ + volatile struct buddy_blob __arena *blob; + const size_t alloc_size = sizeof(struct buddy_blob) - 1; + int ret; + + blob = buddy_alloc(&buddy, alloc_size); + if (!blob) + return -ENOMEM; + + blob->mem[0] = 0xba; + ret = asan_validate_addr(false, &blob->mem[0]); + if (ret < 0) + return ret; + + blob->mem[47] = 0xba; + ret = asan_validate_addr(false, &blob->mem[47]); + if (ret < 0) + return ret; + + blob->oob = 0; + ret = asan_validate_addr(true, &blob->oob); + if (ret < 0) + return ret; + + buddy_free(&buddy, (void __arena *)blob); + + return 0; +} + +SEC("syscall") +__stderr("Memory violation for address {{.*}} for write of size 1") +__stderr("CPU: {{[0-9]+}} UID: 0 PID: {{[0-9]+}} Comm: {{.*}}") +__stderr("Call trace:\n" +"{{([a-zA-Z_][a-zA-Z0-9_]*\\+0x[0-9a-fA-F]+/0x[0-9a-fA-F]+\n" +"|[ \t]+[^\n]+\n)*}}") +__weak int asan_test_buddy_oob(void) +{ + size_t sizes[] = { + 7, 8, 17, 18, 64, 256, 317, 512, 1024, + }; + int ret, i; + + ret = buddy_init(&buddy); + if (ret) { + 
arena_stdout("buddy_init failed with %d", ret); + return ret; + } + + for (i = zero; i < sizeof(sizes) / sizeof(sizes[0]) && can_loop; i++) { + ret = asan_test_buddy_oob_single(sizes[i]); + if (ret) { + arena_stdout("%s:%d Failed for size %lu", __func__, + __LINE__, sizes[i]); + buddy_destroy(&buddy); + return ret; + } + } + + buddy_destroy(&buddy); + + ret = asan_validate(); + if (ret < 0) + return ret; + + return 0; +} + +SEC("syscall") +__stderr("Memory violation for address {{.*}} for write of size 1") +__stderr("CPU: {{[0-9]+}} UID: 0 PID: {{[0-9]+}} Comm: {{.*}}") +__stderr("Call trace:\n" +"{{([a-zA-Z_][a-zA-Z0-9_]*\\+0x[0-9a-fA-F]+/0x[0-9a-fA-F]+\n" +"|[ \t]+[^\n]+\n)*}}") +__weak int asan_test_buddy_uaf(void) +{ + size_t sizes[] = { 16, 32, 64, 128, 256, 512, 1024, 16384 }; + int ret, i; + + ret = buddy_init(&buddy); + if (ret) { + arena_stdout("buddy_init failed with %d", ret); + return ret; + } + + for (i = zero; i < sizeof(sizes) / sizeof(sizes[0]) && can_loop; i++) { + ret = asan_test_buddy_uaf_single(sizes[i]); + if (ret) { + arena_stdout("%s:%d Failed for size %lu", __func__, + __LINE__, sizes[i]); + buddy_destroy(&buddy); + return ret; + } + } + + buddy_destroy(&buddy); + + ret = asan_validate(); + if (ret < 0) + return ret; + + return 0; +} + +SEC("syscall") +__stderr("Memory violation for address {{.*}} for write of size 1") +__stderr("CPU: {{[0-9]+}} UID: 0 PID: {{[0-9]+}} Comm: {{.*}}") +__stderr("Call trace:\n" +"{{([a-zA-Z_][a-zA-Z0-9_]*\\+0x[0-9a-fA-F]+/0x[0-9a-fA-F]+\n" +"|[ \t]+[^\n]+\n)*}}") +__weak int asan_test_buddy_blob(void) +{ + const int iters = 10; + int ret, i; + + ret = buddy_init(&buddy); + if (ret) { + arena_stdout("buddy_init failed with %d", ret); + return ret; + } + + for (i = zero; i < iters && can_loop; i++) { + ret = asan_test_buddy_blob_single(); + if (ret) { + arena_stdout("%s:%d Failed on iteration %d", __func__, + __LINE__, i); + buddy_destroy(&buddy); + return ret; + } + } + + buddy_destroy(&buddy); + + ret = 
asan_validate(); + if (ret < 0) + return ret; + + return 0; +} + +#endif + +__weak char _license[] SEC("license") = "GPL"; diff --git a/tools/testing/selftests/bpf/libarena/selftests/st_asan_common.h b/tools/testing/selftests/bpf/libarena/selftests/st_asan_common.h new file mode 100644 index 000000000000..34a7918cb4cf --- /dev/null +++ b/tools/testing/selftests/bpf/libarena/selftests/st_asan_common.h @@ -0,0 +1,52 @@ +// SPDX-License-Identifier: LGPL-2.1 OR BSD-2-Clause +/* Copyright (c) 2026 Meta Platforms, Inc. and affiliates. */ + +#pragma once + +#define ST_PAGES 64 + +static inline void print_asan_map_state(void __arena *addr) +{ + arena_stdout("%s:%d ASAN %p -> (val: %x gran: %x set: [%s])", + __func__, __LINE__, addr, + *(s8 __arena *)(addr), ASAN_GRANULE(addr), + asan_shadow_set(addr) ? "yes" : "no"); +} + +/* + * Emit an error and force the current function to exit if the ASAN + * violation state is unexpected. Reset the violation state after. + */ +static inline int asan_validate_addr(bool cond, void __arena *addr) +{ + if ((asan_violated != 0) == cond) { + asan_violated = 0; + return 0; + } + + arena_stdout("%s:%d ASAN asan_violated %lx", __func__, __LINE__, + (u64)asan_violated); + print_asan_map_state(addr); + + asan_violated = 0; + + return -EINVAL; +} + +static inline int asan_validate(void) +{ + if (!asan_violated) + return 0; + + arena_stdout("%s:%d Found ASAN violation at %lx", __func__, __LINE__, + asan_violated); + + asan_violated = 0; + + return -EINVAL; +} + +struct blob { + volatile u8 mem[59]; + u8 oob; +}; diff --git a/tools/testing/selftests/bpf/libarena/selftests/st_buddy.bpf.c b/tools/testing/selftests/bpf/libarena/selftests/st_buddy.bpf.c new file mode 100644 index 000000000000..b45a306816c0 --- /dev/null +++ b/tools/testing/selftests/bpf/libarena/selftests/st_buddy.bpf.c @@ -0,0 +1,209 @@ +// SPDX-License-Identifier: LGPL-2.1 OR BSD-2-Clause +/* Copyright (c) 2026 Meta Platforms, Inc. and affiliates. 
*/ + +#include <libarena/common.h> + +#include <libarena/asan.h> +#include <libarena/buddy.h> + +extern struct buddy __arena buddy; + +struct segarr_entry { + u8 __arena *block; + size_t sz; + u8 poison; +}; + +#define SEGARRLEN (512) +static struct segarr_entry __arena segarr[SEGARRLEN]; +static void __arena *ptrs[17]; +size_t __arena alloc_sizes[] = { 3, 17, 1025, 129, 16350, 333, 9, 517 }; +size_t __arena alloc_multiple_sizes[] = { 3, 17, 1025, 129, 16350, 333, 9, 517, 2099 }; +size_t __arena alloc_free_sizes[] = { 3, 17, 64, 129, 256, 333, 512, 517 }; +size_t __arena alignment_sizes[] = { 1, 3, 7, 8, 9, 15, 16, 17, 31, + 32, 64, 100, 128, 255, 256, 512, 1000 }; + +SEC("syscall") +__weak int test_buddy_create(void) +{ + const int iters = 10; + int ret, i; + + for (i = zero; i < iters && can_loop; i++) { + ret = buddy_init(&buddy); + if (ret) + return ret; + + ret = buddy_destroy(&buddy); + if (ret) + return ret; + } + + return 0; +} + +SEC("syscall") +__weak int test_buddy_alloc(void) +{ + void __arena *mem; + int ret, i; + + for (i = zero; i < 8 && can_loop; i++) { + ret = buddy_init(&buddy); + if (ret) + return ret; + + mem = buddy_alloc(&buddy, alloc_sizes[i]); + if (!mem) { + buddy_destroy(&buddy); + return -ENOMEM; + } + + buddy_destroy(&buddy); + } + + return 0; +} + +SEC("syscall") +__weak int test_buddy_alloc_free(void) +{ + const int iters = 800; + void __arena *mem; + int ret, i; + + ret = buddy_init(&buddy); + if (ret) + return ret; + + for (i = zero; i < iters && can_loop; i++) { + mem = buddy_alloc(&buddy, alloc_free_sizes[(i * 5) % 8]); + if (!mem) { + buddy_destroy(&buddy); + return -ENOMEM; + } + + buddy_free(&buddy, mem); + } + + buddy_destroy(&buddy); + + return 0; +} + +SEC("syscall") +__weak int test_buddy_alloc_multiple(void) +{ + int ret, j; + u32 i, idx; + u8 __arena *mem; + size_t sz; + u8 poison; + + ret = buddy_init(&buddy); + if (ret) + return ret; + + /* + * Cycle through each size, allocating an entry in the + * segarr. 
Continue for SEGARRLEN iterations. For every + * allocation write down the size, use the current index + * as a poison value, and log it with the pointer in the + * segarr entry. Use the poison value to poison the entire + * allocated memory according to the size given. + */ + for (i = zero; i < SEGARRLEN && can_loop; i++) { + sz = alloc_multiple_sizes[i % 9]; + poison = (u8)i; + + mem = buddy_alloc(&buddy, sz); + if (!mem) { + buddy_destroy(&buddy); + arena_stdout("%s:%d", __func__, __LINE__); + return -ENOMEM; + } + + segarr[i].block = mem; + segarr[i].sz = sz; + segarr[i].poison = poison; + + for (j = zero; j < sz && can_loop; j++) { + mem[j] = poison; + if (mem[j] != poison) { + buddy_destroy(&buddy); + return -EINVAL; + } + } + } + + /* + * Go to (i * 17) % SEGARRLEN, and free the block pointed to. + * Before freeing, check all bytes have the poisoned value + * corresponding to the element. If any values are unexpected, + * return an error. Skip some elements to test destroying the + * buddy allocator while data is still allocated. 
+ */ + for (i = 10; i < SEGARRLEN && can_loop; i++) { + idx = (i * 17) % SEGARRLEN; + + mem = segarr[idx].block; + sz = segarr[idx].sz; + poison = segarr[idx].poison; + + for (j = zero; j < sz && can_loop; j++) { + if (mem[j] != poison) { + buddy_destroy(&buddy); + arena_stdout("%s:%d %lx %u vs %u", __func__, + __LINE__, (uintptr_t)&mem[j], + mem[j], poison); + return -EINVAL; + } + } + + buddy_free(&buddy, mem); + } + + buddy_destroy(&buddy); + + return 0; +} + +SEC("syscall") +__weak int test_buddy_alignment(void) +{ + int ret, i; + + ret = buddy_init(&buddy); + if (ret) + return ret; + + /* Allocate various sizes and check alignment */ + for (i = zero; i < 17 && can_loop; i++) { + ptrs[i] = buddy_alloc(&buddy, alignment_sizes[i]); + if (!ptrs[i]) { + arena_stdout("alignment test: alloc failed for size %lu", + alignment_sizes[i]); + buddy_destroy(&buddy); + return -ENOMEM; + } + + /* Check 8-byte alignment */ + if ((u64)ptrs[i] & 0x7) { + arena_stdout( + "alignment test: ptr %llx not 8-byte aligned (size %lu)", + (u64)ptrs[i], alignment_sizes[i]); + buddy_destroy(&buddy); + return -EINVAL; + } + } + + /* Free all allocations */ + for (i = zero; i < 17 && can_loop; i++) + buddy_free(&buddy, ptrs[i]); + + buddy_destroy(&buddy); + + return 0; +} + +__weak char _license[] SEC("license") = "GPL"; diff --git a/tools/testing/selftests/bpf/libarena/selftests/test_parallel_spmc.bpf.c b/tools/testing/selftests/bpf/libarena/selftests/test_parallel_spmc.bpf.c new file mode 100644 index 000000000000..f08f2a92e194 --- /dev/null +++ b/tools/testing/selftests/bpf/libarena/selftests/test_parallel_spmc.bpf.c @@ -0,0 +1,669 @@ +// SPDX-License-Identifier: LGPL-2.1 OR BSD-2-Clause + +#include <bpf_atomic.h> + +#include <libarena/common.h> + +#include <libarena/asan.h> +#include <libarena/spmc.h> + +#define TEST_SPMC_THREADS 3 +#define TEST_SPMC_STEALERS (TEST_SPMC_THREADS - 1) + +/* + * The test requires the stealers/owners to sometimes quiesce + * before continuing the benchmark. 
Normally we'd use something + * like a condition variable, but since the benchmark is short-lived + * and operations are wait-free we just spin around the quiescence + * point instead. If we time out, we just fail the benchmark. + */ +#define TEST_SPMC_SYNC_SPINS BPF_MAX_LOOPS + +/* + * We track all the values we retrieve from the queue + * to get some guarantee we're not corrupting data, + * e.g., accidentally reusing a past value from a slot. + */ +#define TEST_SPMC_MAX_VALUES (1024) +static u64 __arena seen[TEST_SPMC_MAX_VALUES]; + +/* The single spmc queue for the benchmark. */ +static struct spmc __arena *spmc; + +/* Owner and stealer epochs. Each side bumps its own epoch and spins on the other's to quiesce. */ +static volatile u64 owner_epoch; +static volatile u64 stealer_epoch; + +/* Map owner epochs to stealer epochs (simply scale by # of stealers). */ +#define STEALER_EPOCH(owner_epoch) ((owner_epoch) * TEST_SPMC_STEALERS) + +/* Global abort switch. If any thread fails, all others exit ASAP. */ +static volatile bool test_abort; + +/* + * Counters useful for ensuring conservation of pushes/pops of unique values + * (we're not stealing/popping more/fewer items than were pushed). + */ +static volatile u64 expected_total; +static volatile u64 total_seen; + +/* Measure how many pops and steals we've made (irrespective of retrieved value). */ +static volatile u64 pops; +static volatile u64 steals; + +/* Used for the resize selftest, see below. */ +static volatile u64 stealers_started; + +/* Used for the burst selftest, see below. */ +static volatile u64 round_steals; + +/* + * We have multiple stealers and a single owner. We sometimes want the owner + * to successfully outproduce the stealers, so we add a busy loop to the stealers. + */ +#define TEST_SPMC_WASTE_ROUNDS (1UL << 12) + +/* + * The spmc data structure depends on the runtime fully + * supporting acquire/release semantics, which is not + * the case for all architectures. 
+ */ +#if defined(ENABLE_ATOMICS_TESTS) && \ + (defined(__TARGET_ARCH_arm64) || defined(__TARGET_ARCH_x86) || \ + (defined(__TARGET_ARCH_riscv) && __riscv_xlen == 64)) +static bool spmc_tests_enabled(void) +{ + return true; +} +#else +static bool spmc_tests_enabled(void) +{ + return false; +} +#endif + +/* + * Scaffolding for each parallel test. Each test has setup/teardown, + * a single owner thread that owns the queue, and TEST_SPMC_STEALERS + * threads that try to steal. + */ +#define DEFINE_PARALLEL_SPMC_TEST(prefix, expected_total) \ + SEC("syscall") int parallel_test_spmc_##prefix##__enabled(void) \ + { \ + return spmc_tests_enabled() ? 0 : -EOPNOTSUPP; \ + } \ + SEC("syscall") int parallel_test_spmc_##prefix##__init(void) \ + { \ + return spmc_common_init(expected_total); \ + } \ + SEC("syscall") int parallel_test_spmc_##prefix##__fini(void) \ + { \ + return spmc_common_fini(); \ + } \ + SEC("syscall") int parallel_test_spmc_##prefix##__0(void) \ + { \ + return spmc_##prefix##_owner(); \ + } \ + SEC("syscall") int parallel_test_spmc_##prefix##__1(void) \ + { \ + return spmc_##prefix##_stealer(); \ + } \ + SEC("syscall") int parallel_test_spmc_##prefix##__2(void) \ + { \ + return spmc_##prefix##_stealer(); \ + } \ + +static int spmc_common_init(u64 total) +{ + u64 i; + + if (total > TEST_SPMC_MAX_VALUES) + return -E2BIG; + + owner_epoch = 0; + stealer_epoch = 0; + test_abort = false; + expected_total = total; + total_seen = 0; + pops = 0; + steals = 0; + stealers_started = 0; + round_steals = 0; + + for (i = zero; i < TEST_SPMC_MAX_VALUES && can_loop; i++) + seen[i] = 0; + + spmc = spmc_create(); + if (!spmc) + return -ENOMEM; + + return 0; +} + +static int spmc_common_fini(void) +{ + int ret; + + ret = spmc_destroy(spmc); + spmc = NULL; + + return ret; +} + +__weak +int spmc_quiesce_on_owner(u64 epoch) +{ + u64 i; + + bpf_for(i, 0, TEST_SPMC_SYNC_SPINS) { + if (test_abort) + return -EINTR; + if (smp_load_acquire(&owner_epoch) >= epoch) + return 0; + } + + 
test_abort = true; + + return -ETIMEDOUT; +} + +__weak +int spmc_quiesce_on_stealer(u64 epoch) +{ + u64 target, cur; + unsigned int i; + int err = -ETIMEDOUT; + + target = STEALER_EPOCH(epoch); + bpf_for(i, 0, TEST_SPMC_SYNC_SPINS) { + + if (test_abort) { + err = -EINTR; + break; + } + + cur = smp_load_acquire(&stealer_epoch); + if (cur > target) { + err = -EINVAL; + test_abort = true; + break; + } + + if (cur == target) + return 0; + } + + test_abort = true; + + return err; +} + +static int spmc_update_stats(u64 val, bool owner) +{ + u64 total; + + total = expected_total; + if (val >= total || val >= TEST_SPMC_MAX_VALUES) { + test_abort = true; + return -EINVAL; + } + + if (__sync_fetch_and_add(&seen[val], 1) != 0) { + test_abort = true; + return -EINVAL; + } + + __sync_fetch_and_add(&total_seen, 1); + if (owner) + __sync_fetch_and_add(&pops, 1); + else + __sync_fetch_and_add(&steals, 1); + + return 0; +} + +static int spmc_validate_owner_empty(void) +{ + u64 val; + int ret; + + ret = spmc_owned_remove(spmc, &val); + if (ret != -ENOENT) { + test_abort = true; + /* Change a 0 return value into -EINVAL. */ + return ret ?: -EINVAL; + } + + return 0; +} + +__weak +int spmc_validate_all_seen(void) +{ + u64 i, total; + + total = expected_total; + if (total_seen != total) + goto err; + + if (pops + steals != total) + goto err; + + for (i = zero; i < total && can_loop; i++) { + if (seen[i % TEST_SPMC_MAX_VALUES] != 1) + goto err; + } + + return 0; + +err: + test_abort = true; + + return -EINVAL; +} + +/* + * Single value benchmark. The owner adds an item then races with + * the stealers for it. This way the owner and the stealers race + * directly on the same slot. + */ + + +#define TEST_SPMC_SINGLEVAL_ITERS (64) + +__weak +int spmc_singleval_tryconsume(u64 expected, bool steal) +{ + u64 val; + int ret; + + while (can_loop) { + if (steal) + ret = spmc_steal(spmc, &val); + else + ret = spmc_owned_remove(spmc, &val); + + /* Success. Update and validate. 
*/ + if (!ret) { + if (val != expected) + return -EINVAL; + + ret = spmc_update_stats(val, !steal); + if (ret) + return ret; + + return 0; + } + + /* + * If we got -ENOENT, the queue is empty + * and we're good to go. + */ + if (ret != -EAGAIN) + return (ret == -ENOENT) ? 0 : ret; + } + + /* Impossible. */ + return -EINVAL; +} + +static int spmc_singleval_owner(void) +{ + int ret; + u64 i; + + for (i = zero; i < TEST_SPMC_SINGLEVAL_ITERS && can_loop; i++) { + ret = spmc_quiesce_on_stealer(i); + if (ret) + goto err; + + ret = spmc_owned_add(spmc, i); + if (ret) + goto err; + + __sync_fetch_and_add(&owner_epoch, 1); + + ret = spmc_singleval_tryconsume(i, false); + if (ret) + goto err; + + ret = spmc_quiesce_on_stealer(i + 1); + if (ret) + goto err; + } + + ret = spmc_validate_owner_empty(); + if (ret) + return ret; + + return spmc_validate_all_seen(); + +err: + test_abort = true; + return -EINVAL; +} + +static int spmc_singleval_stealer(void) +{ + int ret; + u64 i; + + for (i = zero; i < TEST_SPMC_SINGLEVAL_ITERS && can_loop; i++) { + ret = spmc_quiesce_on_owner(i + 1); + if (ret) + goto err; + + ret = spmc_singleval_tryconsume(i, true); + if (ret) + goto err; + + __sync_fetch_and_add(&stealer_epoch, 1); + } + + return 0; + +err: + test_abort = true; + return -EINVAL; +} + +DEFINE_PARALLEL_SPMC_TEST(singleval, TEST_SPMC_SINGLEVAL_ITERS) + +/* + * The resize test. Force a resize from the owner even while the stealers + * are trying to consume. Then make sure the queue is still consistent + * after the resize. + * + * The owner _doesn't_ consume from the queue. The test makes sure that + * switching the array from underneath the stealers works. + */ + +/* Force 2 resizes (since the rate of resize is logarithmic). 
*/ +#define TEST_SPMC_RESIZE_ORDER (2) +#define TEST_SPMC_RESIZE_PREFILL ((SPMC_ARR_BASESZ << TEST_SPMC_RESIZE_ORDER) - 1) + +/* Items the owner adds after the prefill, while racing with the stealers. */ +#define TEST_SPMC_RESIZE_TAIL (SPMC_ARR_BASESZ << TEST_SPMC_RESIZE_ORDER) +#define TEST_SPMC_RESIZE_TOTAL (TEST_SPMC_RESIZE_PREFILL + TEST_SPMC_RESIZE_TAIL) + +__weak +int spmc_wait_for_stealers_to_start(u64 target) +{ + u64 i; + + bpf_for(i, 0, TEST_SPMC_SYNC_SPINS) { + if (test_abort) + return -EINTR; + if (READ_ONCE(stealers_started) >= target) + return 0; + } + + test_abort = true; + + return -ETIMEDOUT; +} + +__weak +void spmc_waste_time(void) +{ + int i; + int j; + + for (i = zero; i < TEST_SPMC_WASTE_ROUNDS && can_loop; i++) { + /* Random computation. */ + WRITE_ONCE(j, i * 17 + 23); + } +} + +static int spmc_resize_owner(void) +{ + bool resized = false; + u64 i; + int ret; + + /* Get a head start vs the consumers. */ + for (i = zero; i < TEST_SPMC_RESIZE_PREFILL && can_loop; i++) { + ret = spmc_owned_add(spmc, i); + if (ret) { + test_abort = true; + return ret; + } + } + + __sync_fetch_and_add(&owner_epoch, 1); + + /* Wait for stealers to start then start racing. */ + ret = spmc_wait_for_stealers_to_start(TEST_SPMC_STEALERS); + if (ret) + return ret; + + for (i = TEST_SPMC_RESIZE_PREFILL; i < TEST_SPMC_RESIZE_TOTAL && can_loop; i++) { + ret = spmc_owned_add(spmc, i); + if (ret) { + test_abort = true; + return ret; + } + + if (spmc->cur->order > TEST_SPMC_RESIZE_ORDER) + resized = true; + } + + /* Did we get to resize while racing? */ + if (!resized) { + test_abort = true; + return -EINVAL; + } + + /* + * Wait for the stealers to drain and make sure + * we didn't lose any items along the way. 
+ */ + __sync_fetch_and_add(&owner_epoch, 1); + + ret = spmc_quiesce_on_stealer(1); + if (ret) + return ret; + + ret = spmc_validate_owner_empty(); + if (ret) + return ret; + + return spmc_validate_all_seen(); +} + +static int spmc_resize_stealer(void) +{ + bool owner_done = false; + u64 val; + int ret; + + arena_subprog_init(); + + ret = spmc_quiesce_on_owner(1); + if (ret) + return ret; + + __sync_fetch_and_add(&stealers_started, 1); + + while (can_loop) { + spmc_waste_time(); + if (test_abort) + return -EINTR; + + ret = spmc_steal(spmc, &val); + if (!ret) { + ret = spmc_update_stats(val, false); + if (ret) + return ret; + continue; + } + + if (ret == -EAGAIN) + continue; + + if (ret == -ENOENT) { + if (owner_done) + break; + owner_done = owner_epoch >= 2; + continue; + } + + test_abort = true; + return ret; + } + + __sync_fetch_and_add(&stealer_epoch, 1); + + return 0; +} + +DEFINE_PARALLEL_SPMC_TEST(resize, TEST_SPMC_RESIZE_TOTAL) + +/* + * The burst benchmark. The owner generates data all at once, + * then waits for the stealers to steal half then starts removing + * items until the queue empties. The owner also makes sure the + * item order is not jumbled. 
+ */ + +/* + * Burst shape: TEST_SPMC_BURST_ROUNDS rounds of TEST_SPMC_BURST_BURST items + * each. In every round the owner waits for TEST_SPMC_BURST_STEAL_TARGET + * steals before it starts removing items itself. + */ +#define TEST_SPMC_BURST_ROUNDS (4) +#define TEST_SPMC_BURST_BURST (64) +#define TEST_SPMC_BURST_TOTAL (TEST_SPMC_BURST_ROUNDS * TEST_SPMC_BURST_BURST) +#define TEST_SPMC_BURST_STEAL_TARGET (TEST_SPMC_BURST_BURST / 2) + +/* + * Spin for at most TEST_SPMC_SYNC_SPINS iterations until round_steals reaches + * @target. Returns 0 once the target is met, -EINTR if test_abort is seen, + * and -ETIMEDOUT (after setting test_abort) when the spin budget runs out. + */ +static int spmc_wait_for_round_steals(u64 target) +{ + u64 i; + + arena_subprog_init(); + + bpf_for(i, 0, TEST_SPMC_SYNC_SPINS) { + if (test_abort) + return -EINTR; + if (round_steals >= target) + return 0; + } + + test_abort = true; + + return -ETIMEDOUT; +} + +/* + * Run one owner round: add TEST_SPMC_BURST_BURST consecutive values starting + * at round * TEST_SPMC_BURST_BURST, let the stealers take at least + * TEST_SPMC_BURST_STEAL_TARGET of them, then remove whatever is left and + * check its order. Returns 0 on success or a negative error code. + */ +__weak int +spmc_burst_owner_round(u64 round) +{ + u64 i, base, stolen, expected, val; + int ret; + + base = round * TEST_SPMC_BURST_BURST; + round_steals = 0; + + /* Prefill the queue with this round's values. */ + for (i = zero; i < TEST_SPMC_BURST_BURST && can_loop; i++) { + ret = spmc_owned_add(spmc, base + i); + if (ret) + return ret; + } + + /* + * Bump owner_epoch: spmc_burst_stealer() steals for as long as + * owner_epoch equals its per-round active epoch. + */ + __sync_fetch_and_add(&owner_epoch, 1); + + ret = spmc_wait_for_round_steals(TEST_SPMC_BURST_STEAL_TARGET); + if (ret == -EINTR || ret == -ETIMEDOUT) + return ret; + + /* Bump owner_epoch again so the stealers leave their steal loop. */ + __sync_fetch_and_add(&owner_epoch, 1); + + ret = spmc_quiesce_on_stealer(round + 1); + if (ret) + return ret; + + /* More steals than items added in this round is inconsistent. */ + stolen = round_steals; + if (stolen > TEST_SPMC_BURST_BURST) + return -EINVAL; + + for (i = zero; i < TEST_SPMC_BURST_BURST - stolen && can_loop; i++) { + ret = spmc_owned_remove(spmc, &val); + if (ret) + return ret; + + /* Expect descending values, starting from the last one added. */ + expected = base + TEST_SPMC_BURST_BURST - 1 - i; + if (val != expected) + return -EINVAL; + + ret = spmc_update_stats(val, true); + if (ret) { + test_abort = true; + return -EINVAL; + } + } + + ret = spmc_validate_owner_empty(); + if (ret) + return ret; + + return 0; +} + +/* + * Owner side of the burst test: run every round, then return the result of + * spmc_validate_all_seen(). Any failing round sets test_abort and is reported + * as -EINVAL, whatever error code the round itself returned. + */ +static int spmc_burst_owner(void) +{ + u64 round; + int ret; + + arena_subprog_init(); + + for (round = zero; round < TEST_SPMC_BURST_ROUNDS && can_loop; round++) { + ret = spmc_burst_owner_round(round); + if (ret) + goto err; + } + + return spmc_validate_all_seen(); + +err: + test_abort = true; + return -EINVAL; +} + +/* + * Stealer side of the burst test. For each round it steals while owner_epoch + * equals round * 2 + 1, counts every successful steal in round_steals, and + * bumps stealer_epoch when the round is over. Returns 0 or a negative error. + */ +static int spmc_burst_stealer(void) +{ + u64 round, val, active_epoch; + int ret; + + arena_subprog_init(); + + 
for (round = zero; round < TEST_SPMC_BURST_ROUNDS && can_loop; round++) { + active_epoch = round * 2 + 1; + + /* + * Wait till the owner prefills the queue then + * start stealing. + */ + ret = spmc_quiesce_on_owner(active_epoch); + if (ret) + return ret; + + /* Steal until the owner moves owner_epoch past active_epoch. */ + while (owner_epoch == active_epoch && can_loop) { + if (test_abort) + return -EINTR; + + ret = spmc_steal(spmc, &val); + if (!ret) { + ret = spmc_update_stats(val, false); + if (ret) + return ret; + __sync_fetch_and_add(&round_steals, 1); + continue; + } + /* -EAGAIN and -ENOENT are retried; anything else aborts. */ + if (ret == -EAGAIN || ret == -ENOENT) + continue; + + test_abort = true; + return ret; + } + + /* + * NOTE(review): presumably the counter spmc_quiesce_on_stealer() + * waits on in the owner - confirm against its definition. + */ + __sync_fetch_and_add(&stealer_epoch, 1); + } + + return 0; +} + +DEFINE_PARALLEL_SPMC_TEST(burst, TEST_SPMC_BURST_TOTAL) diff --git a/tools/testing/selftests/bpf/libarena/selftests/test_progs_compat.h b/tools/testing/selftests/bpf/libarena/selftests/test_progs_compat.h new file mode 100644 index 000000000000..9d431376c42f --- /dev/null +++ b/tools/testing/selftests/bpf/libarena/selftests/test_progs_compat.h @@ -0,0 +1,15 @@ +// SPDX-License-Identifier: LGPL-2.1 OR BSD-2-Clause +/* Copyright (c) 2026 Meta Platforms, Inc. and affiliates. */ +#pragma once + +#ifdef __BPF__ + +/* Selftests use these tags for compatibility with test_progs. 
*/ +#define __test_tag(tag) __attribute__((btf_decl_tag("comment:" XSTR(__COUNTER__) ":" tag))) +#define __stderr(msg) __test_tag("test_expect_stderr=" msg) +#define __stderr_unpriv(msg) __test_tag("test_expect_stderr_unpriv=" msg) + +#define XSTR(s) STR(s) +#define STR(s) #s + +#endif diff --git a/tools/testing/selftests/bpf/libarena/selftests/test_rbtree.bpf.c b/tools/testing/selftests/bpf/libarena/selftests/test_rbtree.bpf.c new file mode 100644 index 000000000000..856c484a009a --- /dev/null +++ b/tools/testing/selftests/bpf/libarena/selftests/test_rbtree.bpf.c @@ -0,0 +1,968 @@ +// SPDX-License-Identifier: LGPL-2.1 OR BSD-2-Clause + +#include <libarena/common.h> + +#include <libarena/asan.h> +#include <libarena/rbtree.h> + +typedef struct node_ctx __arena *node_ctx; + +struct node_ctx { + struct rbnode rbnode; + node_ctx next; +}; + +static const u64 keys[] = { 51, 43, 37, 3, 301, 46, 383, 990, 776, 729, 871, 96, 189, 213, + 376, 167, 131, 939, 626, 119, 374, 700, 772, 154, 883, 620, 641, 5, + 428, 516, 105, 622, 988, 811, 931, 973, 246, 690, 934, 744, 210, 311, + 32, 255, 960, 830, 523, 429, 541, 738, 705, 774, 715, 446, 98, 578, + 777, 191, 279, 91, 767 }; + +static const u64 morekeys[] = { 173, 636, 1201, 8642, 5957, 3617, 4586, 8053, 6551, 7592, 1748, 1589, 8644, 9918, 6977, + 4448, 5852, 4640, 9717, 2303, 7424, 7695, 2334, 8876, 8618, 5745, 7134, 2178, 5280, 2140, 1138, + 5083, 8922, 1516, 2437, 2488, 4307, 4329, 5088, 8456, 5938, 1441, 1684, 5750, 721, 1107, 2089, + 9737, 4687, 5016, 4849, 8193, 9603, 9147, 5992, 166, 6721, 812, 4144, 6237, 6509, 3466, 9255, + 7767, 3960, 6759, 2968, 6046, 9784, 8395, 2619, 1711, 528, 6424, 9084, 3179, 1342, 5676, 9445, + 5691, 6678, 8487, 1627, 998, 6178, 2229, 1987, 3319, 572, 169, 2161, 3018, 5439, 7287, 7265, 5995, + 5003, 5857, 2836, 5634, 4735, 9261, 8287, 5359, 533, 1406, 9573, 4026, 714, 3956, 1722, 6395, + 9648, 3887, 7185, 470, 4482, 4997, 841, 8913, 9946, 3999, 9357, 9847, 277, 8184, 8704, 6766, 3323, + 5468, 
8638, 7905, 8858, 6142, 3685, 3452, 4689, 8878, 8836, 158, 831, 7914, 3031, 8374, 4921, + 4207, 3460, 5547, 3358, 1083, 4619, 7818, 2962, 4879, 4583, 2172, 8819, 9830, 1194, 2666, 9812, + 5704, 8432, 5916, 6007, 6609, 4791, 1985, 3226, 2478, 9605, 5236, 8079, 3042, 1965, 3539, 9704, + 4267, 6416, 760, 9968, 2983, 1190, 1964, 3211, 2870, 3106, 2794, 1542, 6916, 5986, 9096, 441, + 5894, 8353, 7765, 3757, 5732, 88, 3091, 5637, 6042, 8447, 4073, 6923, 5491, 7010, 3663, 5029, + 6162, 822, 4874, 7491, 5100, 3461, 6983, 2170, 1458, 1856, 648, 6272, 4887, 976, 2369, 5909, 4274, + 3324, 6968, 2312, 2271, 8891, 6268, 6581, 1610, 8880, 6194, 6144, 9764, 6915, 829, 3774, 2265, + 1752, 1314, 6377, 8760, 8004, 501, 4912, 9278, 1425, 9578, 7337, 307, 1885, 3151, 9617, 1647, + 2458, 3702, 6091, 8902, 5663, 9378, 7640, 3336, 557, 1644, 6848, 1559, 8821, 266, 4330, 9790, + 5920, 4222, 1143, 6248, 5792, 4847, 9726, 6303, 821, 6839, 6062, 7133, 3649, 9888, 2528, 1966, + 5456, 4914, 3615, 1543, 3206, 3353, 6097, 2800, 1424, 9094, 7920, 7243, 1394, 5464, 1707, 576, + 6524, 4261, 4187, 7889, 5336, 3377, 2921, 7244, 2766, 6584, 5514, 1387, 2957, 2258, 1077, 9979, + 1128, 876, 4056, 4668, 4532, 1982, 7093, 4184, 5460, 7588, 4704, 6717, 61, 3959, 1826, 2294, 18, + 8170, 9394, 8796, 7288, 7285, 7143, 148, 6676, 6603, 1051, 8225, 4169, 3230, 7697, 6971, 3454, + 7501, 9514, 394, 2339, 4993, 5606, 6060, 1297, 8273, 3012, 157, 8181, 6765, 7207, 1005, 8833, 1914, + 7456, 1846, 8375, 2741, 2074, 1712, 5286 }; + +SEC("syscall") +__weak int test_rbtree_find_nonexistent(void) +{ + u64 key = 0xdeadbeef; + u64 value = 0; + int ret; + + struct rbtree __arena *rbtree; + + rbtree = rb_create(RB_ALLOC, RB_DEFAULT); + if (!rbtree) + return 1; + + /* Should return -EINVAL */ + ret = rb_find(rbtree, key, &value); + if (!ret) + return 2; + + return rb_destroy(rbtree); +} + +SEC("syscall") +__weak int test_rbtree_insert_existing(void) +{ + u64 key = 525252; + u64 value = 24; + int ret; + + struct rbtree __arena 
*rbtree; + + rbtree = rb_create(RB_ALLOC, RB_DEFAULT); + if (!rbtree) + return 1; + + ret = rb_insert(rbtree, key, value); + if (ret) + return 2; + + /* Should return -EALREADY. */ + ret = rb_insert(rbtree, key, value); + if (ret != -EALREADY) { + return 3; + } + + return rb_destroy(rbtree); +} + +SEC("syscall") +__weak int test_rbtree_update_existing(void) +{ + u64 key = 33333; + u64 value; + int ret; + + struct rbtree __arena *rbtree; + + rbtree = rb_create(RB_ALLOC, RB_UPDATE); + if (!rbtree) + return 1; + + value = 52; + ret = rb_insert(rbtree, key, value); + if (ret) + return 2; + + ret = rb_find(rbtree, key, &value); + if (ret) + return 3; + + if (value != 52) + return 4; + + value = 65; + + /* Should succeed. */ + ret = rb_insert(rbtree, key, value); + if (ret) + return 5; + + /* Should be updated. */ + ret = rb_find(rbtree, key, &value); + if (ret) + return 6; + + if (value != 65) + return 7; + + return rb_destroy(rbtree); +} + +SEC("syscall") +__weak int test_rbtree_insert_one(void) +{ + u64 key = 202020; + u64 value = 0xbadcafe; + int ret; + + struct rbtree __arena *rbtree; + + rbtree = rb_create(RB_ALLOC, RB_UPDATE); + if (!rbtree) + return 1; + + ret = rb_insert(rbtree, key, value); + if (ret) + return 2; + + ret = rb_find(rbtree, key, &value); + if (ret) + return 3; + + if (value != 0xbadcafe) + return 4; + + return rb_destroy(rbtree); +} + +SEC("syscall") +__weak int test_rbtree_insert_ten(void) +{ + u64 key, value; + int ret, i; + + struct rbtree __arena *rbtree; + + rbtree = rb_create(RB_ALLOC, RB_UPDATE); + if (!rbtree) + return 1; + + for (i = 0; i < 10 && can_loop; i++) { + key = keys[i]; + ret = rb_insert(rbtree, key, 2 * key); + if (ret) + return 2 + 3 * i; + + /* Read it back. */ + ret = rb_find(rbtree, key, &value); + if (ret) + return 2 + 3 * i + 1; + + if (value != 2 * key) + return 2 + 3 * i + 2; + } + + /* Go find all inserted pairs. 
*/ + for (i = 0; i < 10 && can_loop; i++) { + key = keys[i]; + + ret = rb_find(rbtree, key, &value); + if (ret) + return 35 + 2 * i; + + if (value != 2 * key) + return 35 + 2 * i + 1; + } + + return rb_destroy(rbtree); +} + +SEC("syscall") +__weak int test_rbtree_duplicate(void) +{ + u64 key = 0x121212; + u64 value; + int ret, i; + + struct rbtree __arena *rbtree; + + rbtree = rb_create(RB_ALLOC, RB_DUPLICATE); + if (!rbtree) + return 1; + + for (i = 0; i < 10 && can_loop; i++) { + ret = rb_insert(rbtree, key, 2 * key); + if (ret) + return 2 + 3 * i; + + /* Read it back. */ + ret = rb_find(rbtree, key, &value); + if (ret) + return 2 + 3 * i + 1; + + if (value != 2 * key) + return 2 + 3 * i + 2; + } + + /* Go find all inserted copies and remove them. */ + for (i = 0; i < 10 && can_loop; i++) { + ret = rb_find(rbtree, key, &value); + if (ret) { + rb_print(rbtree); + return 35 + 3 * i; + } + + if (value != 2 * key) + return 35 + 3 * i + 1; + + ret = rb_remove(rbtree, key); + if (ret) + return 35 + 3 * i + 2; + } + + return rb_destroy(rbtree); +} + +static inline int +clean_up_noalloc_tree(struct rbtree __arena *rbtree) +{ + node_ctx nodec; + int ret; + + if (rbtree->alloc != RB_NOALLOC) + return -EINVAL; + + /* Can't destroy an RB_NOALLOC tree that still has nodes. 
*/ + if (rb_destroy(rbtree) != -EBUSY) + return -EINVAL; + + while (rbtree->root && can_loop) { + nodec = (node_ctx)arena_container_of(rbtree->root, struct node_ctx, rbnode); + ret = rb_remove_node(rbtree, &nodec->rbnode); + if (ret) + return ret; + + arena_free(nodec); + } + + return 0; +} + +int insert_many(enum rbtree_alloc alloc, enum rbtree_insert_mode insert) +{ + const size_t numkeys = sizeof(keys) / sizeof(keys[0]); + node_ctx nodec; + u64 key, value; + int ret; + int i; + + struct rbtree __arena *rbtree; + + rbtree = rb_create(alloc, insert); + if (!rbtree) + return 1; + + for (i = 0; i < numkeys && can_loop; i++) { + key = keys[i]; + if (rbtree->alloc != RB_ALLOC) { + nodec = arena_malloc(sizeof(*nodec)); + if (!nodec) { + arena_stderr("out of memory\n"); + return -ENOMEM; + } + nodec->rbnode.key = key; + nodec->rbnode.value = 2 * key; + ret = rb_insert_node(rbtree, &nodec->rbnode); + } else { + ret = rb_insert(rbtree, key, 2 * key); + } + if (ret) + return 2 + 3 * i; + + /* Read it back. */ + ret = rb_find(rbtree, key, &value); + if (ret) + return 2 + 3 * i + 1; + + if (value != 2 * key) + return 2 + 3 * i + 2; + } + + /* Go find all inserted pairs. */ + for (i = 0; i < numkeys && can_loop; i++) { + key = keys[i]; + + ret = rb_find(rbtree, key, &value); + if (ret) + return 302 + 2 * i; + + if (value != 2 * key) + return 302 + 2 * i + 1; + } + + /* RB_ALLOC trees are destroyed while still having elements. */ + if (rbtree->alloc == RB_ALLOC) + return rb_destroy(rbtree); + + /* Otherwise manually clean up the tree. 
*/ + if (clean_up_noalloc_tree(rbtree)) + return 5; + + return rb_destroy(rbtree); +} + +SEC("syscall") +__weak int test_rbtree_remove_one(void) +{ + u64 key = 20, value = 5, newvalue; + int ret; + + struct rbtree __arena *rbtree; + + rbtree = rb_create(RB_ALLOC, RB_DEFAULT); + if (!rbtree) + return 1; + + ret = rb_find(rbtree, key, &newvalue); + if (!ret) + return 2; + + ret = rb_insert(rbtree, key, value); + if (ret) + return 3; + + ret = rb_find(rbtree, key, &newvalue); + if (ret || value != newvalue) + return 4; + + ret = rb_remove(rbtree, key); + if (ret) + return 5; + + ret = rb_find(rbtree, key, &newvalue); + if (!ret) + return 6; + + return rb_destroy(rbtree); +} + +static __always_inline int remove_many_verify_all_present(struct rbtree __arena *rbtree) +{ + const size_t numkeys = sizeof(morekeys) / sizeof(morekeys[0]); + u64 value; + int ret; + int i; + + for (i = 0; i < numkeys && can_loop; i++) { + u64 key = morekeys[i]; + + ret = rb_find(rbtree, key, &value); + if (ret) + return -1; + + if (value != 2 * key) + return -1; + } + + return 0; +} + +static __always_inline int remove_many_verify_remaining(struct rbtree __arena *rbtree) +{ + const size_t numkeys = sizeof(morekeys) / sizeof(morekeys[0]); + u64 value; + int ret; + int i; + + for (i = 0; i < numkeys && can_loop; i += 2) { + u64 key = morekeys[i]; + + ret = rb_find(rbtree, key, &value); + if (!ret) + return -1; + + if (i + 1 >= numkeys) + break; + + key = morekeys[i + 1]; + ret = rb_find(rbtree, key, &value); + if (ret) + return -1; + + if (value != 2 * key) + return -1; + } + + for (i = 1; i < numkeys && can_loop; i += 2) { + u64 key = morekeys[i]; + + ret = rb_find(rbtree, key, &value); + if (ret) + return -1; + + if (value != 2 * key) + return -1; + } + + return 0; +} + +static __noinline int remove_many_alloc(struct rbtree __arena *rbtree) +{ + const size_t numkeys = sizeof(morekeys) / sizeof(morekeys[0]); + u64 value; + int ret; + int i; + + for (i = 0; i < numkeys && can_loop; i++) { + u64 
key = morekeys[i]; + + ret = rb_insert(rbtree, key, 2 * key); + if (ret) + return -1; + + if (rb_integrity_check(rbtree)) { + arena_stderr("iteration %d\n", i); + return -EINVAL; + } + + ret = rb_find(rbtree, key, &value); + if (ret) + return -1; + + if (value != 2 * key) + return -1; + } + + ret = remove_many_verify_all_present(rbtree); + if (ret) + return ret; + + for (i = 0; i < numkeys && can_loop; i += 2) { + u64 key = morekeys[i]; + + ret = rb_remove(rbtree, key); + if (ret) { + arena_stderr("Failed to remove %ld\n", key); + return -1; + } + + ret = rb_find(rbtree, key, &value); + if (!ret) + return -1; + } + + return remove_many_verify_remaining(rbtree); +} + +static __noinline int remove_many_noalloc(struct rbtree __arena *rbtree) +{ + const size_t numkeys = sizeof(morekeys) / sizeof(morekeys[0]); + node_ctx first = NULL, last = NULL; + u64 value; + int ret; + int i; + + for (i = 0; i < numkeys && can_loop; i++) { + u64 key = morekeys[i]; + node_ctx nodec = arena_malloc(sizeof(*nodec)); + + if (!nodec) { + arena_stderr("out of memory\n"); + return -ENOMEM; + } + nodec->rbnode.key = key; + nodec->rbnode.value = 2 * key; + nodec->next = NULL; + + if (!first) + first = nodec; + + if (last) + last->next = nodec; + last = nodec; + + ret = rb_insert_node(rbtree, &nodec->rbnode); + if (ret) + return -1; + + if (rb_integrity_check(rbtree)) { + arena_stderr("iteration %d\n", i); + return -EINVAL; + } + + ret = rb_find(rbtree, key, &value); + if (ret) + return -1; + + if (value != 2 * key) + return -1; + } + + ret = remove_many_verify_all_present(rbtree); + if (ret) + return ret; + + for (i = 0; i < numkeys && can_loop; i += 2) { + u64 key = morekeys[i]; + node_ctx nodec = first; + + if (!nodec || key != nodec->rbnode.key) + return -1; + + first = nodec->next ? 
nodec->next->next : NULL; + ret = rb_remove_node(rbtree, &nodec->rbnode); + if (ret) { + arena_stderr("Failed to remove %ld\n", key); + return -1; + } + + ret = rb_find(rbtree, key, &value); + if (!ret) + return -1; + } + + return remove_many_verify_remaining(rbtree); +} + +static inline int remove_many(enum rbtree_alloc alloc, + enum rbtree_insert_mode insert) +{ + int ret; + struct rbtree __arena *rbtree; + + rbtree = rb_create(alloc, insert); + if (!rbtree) + return -ENOMEM; + + ret = (alloc == RB_ALLOC) ? remove_many_alloc(rbtree) + : remove_many_noalloc(rbtree); + if (ret) + return ret; + + if (alloc == RB_ALLOC) + return rb_destroy(rbtree); + + ret = clean_up_noalloc_tree(rbtree); + if (ret) + return ret; + + return rb_destroy(rbtree); +} + +SEC("syscall") +__weak int test_rbtree_insert_many_update(void) +{ + return insert_many(RB_ALLOC, RB_UPDATE); +} + +SEC("syscall") +__weak int test_rbtree_insert_many_noalloc(void) +{ + return insert_many(RB_NOALLOC, RB_DUPLICATE); +} + +SEC("syscall") +__weak int test_rbtree_remove_many_update(void) +{ + return remove_many(RB_ALLOC, RB_UPDATE); +} + +SEC("syscall") +__weak int test_rbtree_remove_many_noalloc(void) +{ + return remove_many(RB_NOALLOC, RB_DUPLICATE); +} + +SEC("syscall") +__weak int test_rbtree_add_remove_circular(void) +{ + const size_t iters = 60; + const size_t prefill = 10; + const size_t numkeys = 50; + const size_t prefix = 400000; + u64 value, rmval; + int errval = 1; + u64 key; + int ret; + int i; + + struct rbtree __arena *rbtree; + + rbtree = rb_create(RB_ALLOC, RB_UPDATE); + if (!rbtree) + return 1; + + for (i = 0; i < prefill && can_loop; i++) { + ret = rb_insert(rbtree, prefix + (i % numkeys), i); + if (ret) + return errval; + + errval += 1; + } + + errval = 2 * 1000 * 1000; + + for (i = 0; i < prefill && can_loop; i++) { + /* Read it back. 
*/ + ret = rb_find(rbtree, prefix + (i % numkeys), &value); + if (ret) + return errval; + + if (value != i) + return errval; + } + + errval = 3 * 1000 * 1000; + + for (i = prefill; i < iters && can_loop; i++) { + key = prefix + (i % numkeys); + + ret = rb_find(rbtree, key, &value); + if (!ret) { + arena_stderr("Key %d already present\n", key); + return errval; + } + + errval += 1; + + ret = rb_insert(rbtree, key, i); + if (ret) { + arena_stderr("ITERATION %d\n", i); + rb_print(rbtree); + return errval; + } + + rmval = i - prefill; + + errval += 1; + + ret = rb_find(rbtree, prefix + (rmval % numkeys), &value); + if (ret) + return errval; + + errval += 1; + + if (value != rmval) + return errval; + + errval += 1; + + ret = rb_remove(rbtree, prefix + (rmval % numkeys)); + if (ret) { + arena_stderr("ITERATION %d\n", i); + return errval; + } + + errval += 1; + } + + for (i = 0; i < numkeys && can_loop; i++) { + rb_remove(rbtree, prefix + i); + } + + return rb_destroy(rbtree); +} + +SEC("syscall") +__weak int test_rbtree_add_remove_circular_reverse(void) +{ + const size_t iters = 110; + const size_t prefill = 10; + const size_t numkeys = 50; + const size_t prefix = 500000; + u64 value, rmval; + int errval = 1; + u64 key; + int ret; + int i; + + struct rbtree __arena *rbtree; + + rbtree = rb_create(RB_ALLOC, RB_UPDATE); + if (!rbtree) + return 1; + + for (i = 0; i < prefill && can_loop; i++) { + ret = rb_insert(rbtree, prefix - (i % numkeys), i); + if (ret) + return errval; + + errval += 1; + } + + errval = 2 * 1000 * 1000; + + for (i = 0; i < prefill && can_loop; i++) { + /* Read it back. 
*/ + ret = rb_find(rbtree, prefix - (i % numkeys), &value); + if (ret) + return errval; + + if (value != i) + return errval; + } + + errval = 3 * 1000 * 1000; + + for (i = prefill; i < iters && can_loop; i++) { + key = prefix - (i % numkeys); + + ret = rb_find(rbtree, key, &value); + if (!ret) { + arena_stderr("Key %d already present\n", key); + return errval; + } + + errval += 1; + + ret = rb_insert(rbtree, key, i); + if (ret) { + arena_stderr("error %d on insert\n", ret); + rb_print(rbtree); + return errval; + } + + rmval = i - prefill; + + errval += 1; + + ret = rb_find(rbtree, prefix - (rmval % numkeys), &value); + if (ret) + return errval; + + errval += 1; + + if (value != rmval) + return errval; + + errval += 1; + + ret = rb_remove(rbtree, prefix - (rmval % numkeys)); + if (ret) + return errval; + + errval += 1; + } + + + errval = 4 * 1000 * 1000; + for (i = 0; i < prefill && can_loop; i++) { + ret = rb_remove(rbtree, prefix - i); + if (ret) { + arena_stderr("Did not remove %d, error %d\n", prefix - i, ret); + return errval + i; + } + } + + return rb_destroy(rbtree); +} + +SEC("syscall") +__weak int test_rbtree_least_pop(void) +{ + const size_t keys = 10; + u64 key, value; + int errval = 1; + int ret, i; + + struct rbtree __arena *rbtree; + + rbtree = rb_create(RB_ALLOC, RB_DEFAULT); + if (!rbtree) + return errval; + + errval += 1; + + for (i = 0; i < keys / 2 && can_loop; i++) { + ret = rb_insert(rbtree, i, i); + if (ret) + return errval; + + errval += 1; + + ret = rb_insert(rbtree, keys - 1 - i, keys - 1 - i); + if (ret) + return errval; + + errval += 1; + + ret = rb_least(rbtree, &key, &value); + if (ret) + return errval; + + errval += 1; + + if (key != 0 || value != 0) + return errval; + + errval += 1; + } + + errval = 1000; + + for (i = 0; i < keys && can_loop; i++) { + ret = rb_least(rbtree, &key, &value); + if (ret) { + arena_stderr("rb_least failed with %d\n", ret); + return errval; + } + + errval += 1; + + if (key != i || value != i) { + 
arena_stderr("Got KV %ld/%ld expected %d\n", key, value, i); + return errval; + } + + errval += 1; + + ret = rb_pop(rbtree, &key, &value); + if (ret) { + arena_stderr("Error %d during pop on iter %d\n", ret, i); + return errval; + } + + errval += 1; + + if (key != i || value != i) + return errval; + } + + return rb_destroy(rbtree); +} + +/* Reject rb_pop() for RB_NOALLOC trees. */ +SEC("syscall") +__weak int test_rbtree_noalloc_pop(void) +{ + const u64 expect_value = 1; + const u64 expect_key = 0; + struct rbtree __arena *rbtree; + struct rbnode __arena *node; + u64 value = 0; + int ret; + + rbtree = rb_create(RB_NOALLOC, RB_DEFAULT); + if (!rbtree) + return 1; + + node = rb_node_alloc(expect_key, expect_value); + if (!node) { + rb_destroy(rbtree); + return 2; + } + + ret = rb_insert_node(rbtree, node); + if (ret) { + rb_node_free(node); + rb_destroy(rbtree); + return 3; + } + + ret = rb_pop(rbtree, NULL, &value); + if (ret != -EINVAL) + return 4; + + ret = rb_find(rbtree, expect_key, &value); + if (ret) + return 5; + + if (value != expect_value) + return 6; + + ret = rb_remove_node(rbtree, node); + if (ret) + return 7; + + rb_node_free(node); + + return rb_destroy(rbtree); +} + +SEC("syscall") +__weak int test_rbtree_alloc_check(void) +{ + struct rbtree __arena *alloc, *noalloc; + struct rbnode __arena *node; + int ret; + + alloc = rb_create(RB_ALLOC, RB_DEFAULT); + if (!alloc) + return 1; + + noalloc = rb_create(RB_NOALLOC, RB_DEFAULT); + if (!noalloc) + return 2; + + + node = rb_node_alloc(0, 0); + if (!node) + return 3; + + /* + * RB_ALLOC trees can use rb_insert, RB_NOALLOC trees can + * use rb_insert_node. RB_ALLOC and RB_NOALLOC trees cannot + * use each other's APIs. + * + * NOTE: This begs the question, why not different types? We + * want to partially share the API and that would require us + * to duplicate it. 
+ */ + if (rb_insert(alloc, 0, 0)) + return 4; + + if (!rb_insert_node(alloc, node)) + return 5; + + if (!rb_remove_node(alloc, node)) + return 6; + + if (rb_remove(alloc, 0)) + return 7; + + if (rb_insert_node(noalloc, node)) + return 8; + + if (!rb_insert(noalloc, 0, 0)) + return 9; + + if (!rb_remove(noalloc, 0)) + return 10; + + if (rb_remove_node(noalloc, node)) + return 11; + + rb_node_free(node); + + ret = rb_destroy(alloc); + if (ret) + return ret; + + return rb_destroy(noalloc); +} diff --git a/tools/testing/selftests/bpf/libarena/selftests/test_spmc.bpf.c b/tools/testing/selftests/bpf/libarena/selftests/test_spmc.bpf.c new file mode 100644 index 000000000000..4d7a520115d1 --- /dev/null +++ b/tools/testing/selftests/bpf/libarena/selftests/test_spmc.bpf.c @@ -0,0 +1,194 @@ +// SPDX-License-Identifier: LGPL-2.1 OR BSD-2-Clause + +#include <libarena/common.h> + +#include <libarena/asan.h> +#include <libarena/spmc.h> + +/* + * NOTE: These selftests only test for the single-threaded use case, which for + * Chase-Lev queues is obviously the simplest one. Still, it is important to + * exercise the API to ensure it passes verification and basic checks. 
+ */ + +SEC("syscall") +int test_spmc_remove_empty(void) +{ + u64 val; + int ret; + + struct spmc __arena *spmc = spmc_create(); + + if (!spmc) + return 1; + + ret = spmc_owned_remove(spmc, &val); + if (ret != -ENOENT) + return 1; + + spmc_destroy(spmc); + + return 0; +} + +SEC("syscall") +int test_spmc_steal_empty(void) +{ + u64 val; + int ret; + + struct spmc __arena *spmc = spmc_create(); + + if (!spmc) + return 1; + + ret = spmc_steal(spmc, &val); + if (ret != -ENOENT) + return 1; + + spmc_destroy(spmc); + + return 0; +} + +SEC("syscall") +int test_spmc_steal_one(void) +{ + u64 val, newval; + int ret, i; + + struct spmc __arena *spmc = spmc_create(); + + if (!spmc) + return 1; + + for (i = 0; i < 10 && can_loop; i++) { + val = i; + + ret = spmc_owned_add(spmc, val); + if (ret) + return 1; + + ret = spmc_steal(spmc, &newval); + if (ret) + return 2; + + if (val != newval) + return 3; + } + + spmc_destroy(spmc); + + return 0; +} + +SEC("syscall") +int test_spmc_remove_one(void) +{ + u64 val, newval; + int ret, i; + + struct spmc __arena *spmc = spmc_create(); + + if (!spmc) + return 1; + + for (i = 0; i < 10 && can_loop; i++) { + val = i; + + ret = spmc_owned_add(spmc, val); + if (ret) + return 1; + + ret = spmc_owned_remove(spmc, &newval); + if (ret) + return 2; + + if (val != newval) + return 3; + } + + spmc_destroy(spmc); + + return 0; +} + +SEC("syscall") +int test_spmc_remove_many(void) +{ + u64 val, newval; + int ret, i; + u64 expected; + + struct spmc __arena *spmc = spmc_create(); + + if (!spmc) + return 1; + + for (i = 0; i < 500 && can_loop; i++) { + val = i; + + ret = spmc_owned_add(spmc, val); + if (ret) { + arena_stderr("%s:%d error %d\n", __func__, __LINE__, ret); + return 1; + } + } + + for (i = 0; i < 500 && can_loop; i++) { + ret = spmc_owned_remove(spmc, &newval); + if (ret) { + arena_stderr("%s:%d error %d\n", __func__, __LINE__, ret); + return 1; + } + + expected = 500 - 1 - i; + if (newval != expected) { + arena_stderr("%s:%d expected %llu 
found %llu\n", __func__, __LINE__, expected, newval); + return 1; + } + } + + spmc_destroy(spmc); + + return 0; +} + +SEC("syscall") +int test_spmc_steal_many(void) +{ + u64 val, newval; + int ret, i; + + struct spmc __arena *spmc = spmc_create(); + + if (!spmc) + return 1; + + for (i = 0; i < 500 && can_loop; i++) { + val = i; + + ret = spmc_owned_add(spmc, val); + if (ret) { + arena_stderr("%s:%d error %d\n", __func__, __LINE__, ret); + return 1; + } + } + + for (i = 0; i < 500 && can_loop; i++) { + ret = spmc_steal(spmc, &newval); + if (ret) { + arena_stderr("%s:%d error %d\n", __func__, __LINE__, ret); + return 1; + } + + if (newval != i) { + arena_stderr("%s:%d expected %d found %llu\n", __func__, __LINE__, i, newval); + return 1; + } + } + + spmc_destroy(spmc); + + return 0; +} diff --git a/tools/testing/selftests/bpf/libarena/src/asan.bpf.c b/tools/testing/selftests/bpf/libarena/src/asan.bpf.c new file mode 100644 index 000000000000..5135d5c72a46 --- /dev/null +++ b/tools/testing/selftests/bpf/libarena/src/asan.bpf.c @@ -0,0 +1,553 @@ +// SPDX-License-Identifier: LGPL-2.1 OR BSD-2-Clause +/* Copyright (c) 2026 Meta Platforms, Inc. and affiliates. */ +#include <vmlinux.h> +#include <libarena/common.h> +#include <libarena/asan.h> + + +enum { + /* + * Is the access checked by check_region_inline + * a read or a write? + */ + ASAN_READ = 0x0U, + ASAN_WRITE = 0x1U, +}; + +/* + * Address sanitizer (ASAN) for arena-based BPF programs, inspired + * by KASAN. + * + * The API + * ------- + * + * The implementation includes two kinds of components: Implementation + * of ASAN hooks injected by LLVM into the program, and API calls that + * allocators use to mark memory as valid or invalid. The full list is: + * + * LLVM stubs: + * + * void __asan_{load, store}<size>(intptr_t addr) + * Checks whether an access is valid. All variations covered + * by check_region_inline(). 
+ * + * void __asan_{store, load}N(intptr_t addr, ssize_t size) + * Same check for an access whose size is passed at run time. + * + * void __asan_report_{load, store}<size>(intptr_t addr) + * Report an access violation for the program. Used when LLVM + * uses direct code generation for shadow map checks. + * + * void *__asan_memcpy(void *d, const void *s, size_t n) + * void *__asan_memmove(void *d, const void *s, size_t n) + * void *__asan_memset(void *p, int c, size_t n) + * Hooks for ASAN instrumentation of the LLVM mem* builtins. + * Currently unimplemented just like the builtins themselves. + * + * API methods: + * + * asan_init() + * Initialize the ASAN map for the arena. + * + * asan_poison() + * Mark a region of memory as poisoned. Accessing poisoned memory + * causes asan_report() to fire. Invoked during free(). + * + * asan_unpoison() + * Mark a region as unpoisoned after alloc(). + * + * asan_shadow_set() + * Check directly whether a single byte is poisoned. + * + * The Algorithm In Brief + * ---------------------- + * Each group of 8 bytes is mapped to a "granule" in the shadow map. Each + * granule is one byte in size and describes which bytes are valid. + * Possible values are: + * + * 0: All bytes are valid. Makes checks in the middle of an allocated region + * (most of them) fast. + * (0, 7]: How many consecutive bytes are valid, starting from the lowest one. + * The tradeoff is that we can't poison individual bytes in the middle of a + * valid region. + * [0x80, 0xff]: Special poison values, can be used to denote specific error + * modes (e.g., recently freed vs uninitialized memory). + * + * The mapping between a memory location and its shadow is: + * shadow_addr = shadow_base + (addr >> 3). We retain the 8:1 data:shadow + * ratio of existing ASAN implementations as a compromise between tracking + * granularity and space usage/scan overhead. 
+ */ + +#ifdef BPF_ARENA_ASAN + +#pragma clang attribute push(__attribute__((no_sanitize("address"))), \ + apply_to = function) + +#define SHADOW_ALL_ZEROES ((u64)-1) + +/* + * Canary variable for ASAN violations. Set to the offending address. + */ +volatile u64 asan_violated = 0; + +/* + * Shadow map occupancy map. + */ +volatile u64 __asan_shadow_memory_dynamic_address; + +volatile u32 asan_reported = false; +volatile bool asan_inited = false; + +/* + * Set during program load. + */ +volatile bool asan_report_once = false; + +/* + * BPF does not currently support the memset/memcpy/memcmp intrinsics. + * For large sequential copies, or assignments of large data structures, + * the frontend will generate an intrinsic that causes the BPF backend + * to exit due to a missing implementation. Provide a simple implementation + * just for memset to use it for poisoning/unpoisoning the map. + * + * Stops early, without reporting it, if can_loop trips; always returns 0. + */ +__weak int asan_memset(s8 __arena *dst, s8 val, size_t size) +{ + size_t i; + + for (i = zero; i < size && can_loop; i++) + dst[i] = val; + + return 0; +} + +/* Validate a 1-byte access, which always falls within a single granule. */ +static __always_inline bool memory_is_poisoned_1(s8 __arena *addr) +{ + s8 shadow_value = *(s8 __arena *)mem_to_shadow(addr); + + /* Shadow byte is 0, access is valid. */ + if (likely(!shadow_value)) + return false; + + /* + * Shadow byte is non-zero. The access is valid if the address's offset + * within its granule lies in [0, shadow_value), so the memory is poisoned + * if shadow_value is negative or not greater than that offset. + */ + + return ASAN_GRANULE(addr) >= shadow_value; +} + +/* Validate a 2-, 4- or 8-byte access; its shadow spans at most 2 bytes. */ +static __always_inline bool memory_is_poisoned_2_4_8(s8 __arena *addr, u64 size) +{ + u64 end = (u64)addr + size - 1; + + /* + * Region fully within a single granule (the addition did not + * carry out of the granule offset). 
+ */ + if (likely(ASAN_GRANULE(end) >= size - 1)) + return memory_is_poisoned_1((s8 __arena *)end); + + /* + * Otherwise first byte must be fully unpoisoned, and second byte + * must be unpoisoned up to the end of the accessed region. + */ + + return *(s8 __arena *)mem_to_shadow(addr) || memory_is_poisoned_1((s8 __arena *)end); +} + +/* Returns true if the byte at @addr is poisoned (see memory_is_poisoned_1()). */ +__weak bool asan_shadow_set(void __arena *addr) +{ + return memory_is_poisoned_1(addr); +} + +/* + * Scan @size bytes starting at @addr and return the address of the first + * non-zero one, or SHADOW_ALL_ZEROES if none is found. + */ +static __always_inline u64 first_nonzero_byte(u64 addr, size_t size) +{ + while (size && can_loop) { + if (unlikely(*(s8 __arena *)addr)) + return addr; + addr += 1; + size -= 1; + } + + return SHADOW_ALL_ZEROES; +} + +/* + * Validate an access of arbitrary size: every shadow byte before the last one + * must be 0, and the last one must still cover the final accessed byte. + */ +static __always_inline bool memory_is_poisoned_n(s8 __arena *addr, u64 size) +{ + u64 ret; + u64 start; + u64 end; + + /* Size of [start, end] is end - start + 1. */ + start = (u64)mem_to_shadow(addr); + end = (u64)mem_to_shadow(addr + size - 1); + + ret = first_nonzero_byte(start, (end - start) + 1); + if (likely(ret == SHADOW_ALL_ZEROES)) + return false; + + return unlikely(ret != end || ASAN_GRANULE(addr + size - 1) >= *(s8 __arena *)end); +} + +/* + * Record the offending address in asan_violated and print a report followed + * by a stack trace to the BPF stderr stream. Always returns 0. + */ +__weak int asan_report(s8 __arena *addr, size_t sz, u32 flags) +{ + u32 reported = __sync_val_compare_and_swap(&asan_reported, false, true); + + /* If asan_report_once is set, only report the first ASAN violation. */ + if (reported && asan_report_once) + return 0; + + asan_violated = (u64)addr; + + arena_stderr("Memory violation for address %p (0x%lx) for %s of size %ld\n", + addr, (u64)addr, + (flags & ASAN_WRITE) ? "write" : "read", + sz); + bpf_stream_print_stack(BPF_STDERR); + + return 0; +} + +/* + * Try to decide an access from its arguments alone. Returns true if the + * verdict was stored in *result (size 0: valid, address wraparound: invalid) + * and false if the shadow map still has to be consulted. + */ +static __always_inline bool check_asan_args(s8 __arena *addr, size_t size, + bool *result) +{ + bool valid = true; + + /* Size 0 accesses are valid even if the address is invalid. */ + if (unlikely(size == 0)) + goto confirmed_valid; + + /* + * Wraparound is possible for values close to the edge of the + * 4GiB boundary of the arena (last valid address is (1UL << 32) - 1). 
+ * + * + * The wraparound detection below works for small sizes. check_asan_args is + * always called from the builtin ASAN checks, so 1 <= size <= 64. Even + * for storeN/loadN, which we do not expect to encounter, the intrinsics will + * not have a large enough size that: + * + * - addr + size > MAX_U32 + * - (u32)(addr + size) > (u32) addr + * + * which would defeat wraparound detection. + */ + if (unlikely((u32)(u64)(addr + size) < (u32)(u64)addr)) + goto confirmed_invalid; + + return false; + +confirmed_invalid: + valid = false; + + /* FALLTHROUGH */ +confirmed_valid: + *result = valid; + + return true; +} + +/* + * Check an access of @size bytes at @ptr and report it through asan_report() + * if it is invalid. Returns true if the access is valid, false otherwise. + */ +static __always_inline bool check_region_inline(intptr_t ptr, size_t size, + u32 flags) +{ + s8 __arena *addr = (s8 __arena *)(u64)ptr; + bool is_poisoned, is_valid; + + /* Verdicts that do not need the shadow map (size 0, wraparound). */ + if (check_asan_args(addr, size, &is_valid)) { + if (!is_valid) + asan_report(addr, size, flags); + return is_valid; + } + + switch (size) { + case 1: + is_poisoned = memory_is_poisoned_1(addr); + break; + case 2: + case 4: + case 8: + is_poisoned = memory_is_poisoned_2_4_8(addr, size); + break; + default: + is_poisoned = memory_is_poisoned_n(addr, size); + } + + if (is_poisoned) { + asan_report(addr, size, flags); + return false; + } + + return true; +} + +/* + * __alias is not supported for BPF so define *_noabort() variants as wrappers. 
+ */ +#define DEFINE_ASAN_LOAD_STORE(size) \ + __hidden void __asan_store##size(intptr_t addr) \ + { \ + check_region_inline(addr, size, ASAN_WRITE); \ + } \ + __hidden void __asan_store##size##_noabort(intptr_t addr) \ + { \ + check_region_inline(addr, size, ASAN_WRITE); \ + } \ + __hidden void __asan_load##size(intptr_t addr) \ + { \ + check_region_inline(addr, size, ASAN_READ); \ + } \ + __hidden void __asan_load##size##_noabort(intptr_t addr) \ + { \ + check_region_inline(addr, size, ASAN_READ); \ + } \ + __hidden void __asan_report_store##size(intptr_t addr) \ + { \ + asan_report((s8 __arena *)addr, size, ASAN_WRITE); \ + } \ + __hidden void __asan_report_store##size##_noabort(intptr_t addr) \ + { \ + asan_report((s8 __arena *)addr, size, ASAN_WRITE); \ + } \ + __hidden void __asan_report_load##size(intptr_t addr) \ + { \ + asan_report((s8 __arena *)addr, size, ASAN_READ); \ + } \ + __hidden void __asan_report_load##size##_noabort(intptr_t addr) \ + { \ + asan_report((s8 __arena *)addr, size, ASAN_READ); \ + } + +DEFINE_ASAN_LOAD_STORE(1); +DEFINE_ASAN_LOAD_STORE(2); +DEFINE_ASAN_LOAD_STORE(4); +DEFINE_ASAN_LOAD_STORE(8); + +void __asan_storeN(intptr_t addr, ssize_t size) +{ + check_region_inline(addr, size, ASAN_WRITE); +} + +void __asan_storeN_noabort(intptr_t addr, ssize_t size) +{ + check_region_inline(addr, size, ASAN_WRITE); +} + +void __asan_loadN(intptr_t addr, ssize_t size) +{ + check_region_inline(addr, size, ASAN_READ); +} + +void __asan_loadN_noabort(intptr_t addr, ssize_t size) +{ + check_region_inline(addr, size, ASAN_READ); +} + +/* + * We currently do not sanitize globals. + */ +void __asan_register_globals(intptr_t globals, size_t n) +{ +} + +void __asan_unregister_globals(intptr_t globals, size_t n) +{ +} + +/* + * We do not currently have memcpy/memmove/memset intrinsics + * in LLVM. Do not implement sanitization. 
+ */ +void *__asan_memcpy(void *d, const void *s, size_t n) +{ + arena_stderr("ASAN: Unexpected %s call", __func__); + return NULL; +} + +void *__asan_memmove(void *d, const void *s, size_t n) +{ + arena_stderr("ASAN: Unexpected %s call", __func__); + return NULL; +} + +void *__asan_memset(void *p, int c, size_t n) +{ + arena_stderr("ASAN: Unexpected %s call", __func__); + return NULL; +} + +/* + * Poisoning code, used when we add more freed memory to the allocator by: + * a) pulling memory from the arena segment using bpf_arena_alloc_pages() + * b) freeing memory from application code + */ +__hidden __noasan int asan_poison(void __arena *addr, s8 val, size_t size) +{ + s8 __arena *shadow; + size_t len; + + /* + * Poisoning from a non-granule address makes no sense: We can only allocate + * memory to the application that has a granule-aligned starting address, + * and bpf_arena_alloc_pages returns page-aligned memory. A non-aligned + * addr then implies we're freeing a different address than the one we + * allocated. + */ + if (unlikely((u64)addr & ASAN_GRANULE_MASK)) + return -EINVAL; + + /* + * We cannot free an unaligned region because it'd be possible that we + * cannot describe the resulting poisoning state of the granule in + * the ASAN encoding. + * + * Every granule represents a region of memory that looks like the + * following (P for poisoned bytes, C for clear): + * + * <Clear> <Poisoned> + * [ C C C ... P P ] + * + * The value of the granule's shadow map is the number of clear bytes in + * it. We cannot represent granules with the following state: + * + * [ P P ... C C ... P P ] + * + * That would be possible if we could free unaligned regions, so prevent that. + */ + if (unlikely(size & ASAN_GRANULE_MASK)) + return -EINVAL; + + shadow = mem_to_shadow(addr); + len = size >> ASAN_SHADOW_SHIFT; + + asan_memset(shadow, val, len); + + return 0; +} + +/* + * Unpoisoning code for marking memory as valid during allocation calls. 
+ * + * Very similar to asan_poison, except we need to round up instead of + * down, then partially poison the last granule if necessary. + * + * Partial poisoning is useful for keeping the padding poisoned. Allocations + * are granule-aligned, so we are reserving granule-aligned sizes for the + * allocation. However, we want to still treat accesses to the padding as + * invalid. Partial poisoning takes care of that. Freeing and poisoning the + * memory is still done in granule-aligned sizes and repoisons the already + * poisoned padding. + * + * Returns 0 on success or -EINVAL if addr is not granule-aligned. + */ +__hidden __noasan int asan_unpoison(void __arena *addr, size_t size) +{ + size_t partial = size & ASAN_GRANULE_MASK; + s8 __arena *shadow; + size_t len; + + /* + * We cannot allocate in the middle of the granule. The ASAN shadow + * map encoding only describes regions of memory where every granule + * follows this format (P for poisoned, C for clear): + * + * <Clear> <Poisoned> + * [ C C C ... P P ] + * + * This is so we can use a single number in [0, ASAN_SHADOW_SCALE) + * to represent the poison state of the granule. + */ + if (unlikely((u64)addr & ASAN_GRANULE_MASK)) + return -EINVAL; + + shadow = mem_to_shadow(addr); + len = size >> ASAN_SHADOW_SHIFT; + + /* Clear the shadow bytes of every fully covered granule. */ + asan_memset(shadow, 0, len); + + /* + * If we are allocating a non-granule aligned region, we need to adjust + * the last byte of the shadow map to list how many bytes in the granule + * are unpoisoned. If the region is aligned, then the memset call above + * was enough. + */ + if (partial) + shadow[len] = partial; + + return 0; +} + +/* + * Initialize ASAN state when necessary. Triggered from userspace before + * allocator startup. + */ +SEC("syscall") +__weak __noasan int asan_init(struct asan_init_args *args) +{ + u64 globals_pages = args->arena_globals_pages; + u64 all_pages = args->arena_all_pages; + u64 shadow_map, shadow_pgoff; + u64 shadow_pages; + + if (asan_inited) + return 0; + + /* + * Round up the shadow map size to the nearest page. 
+ */ + shadow_pages = all_pages >> ASAN_SHADOW_SHIFT; + if ((all_pages & ((1 << ASAN_SHADOW_SHIFT) - 1))) + shadow_pages += 1; + + if (all_pages > (1ULL << 32) / __PAGE_SIZE) { + arena_stderr("error: arena size %lx too large", all_pages); + return -EINVAL; + } + + if (globals_pages > all_pages) { + arena_stderr("error: globals %lx do not fit in arena %lx", + globals_pages, all_pages); + return -EINVAL; + } + + if (globals_pages + shadow_pages >= all_pages) { + arena_stderr("error: globals %lx do not leave room for shadow map %lx " + "(arena pages %lx)", + globals_pages, shadow_pages, all_pages); + return -EINVAL; + } + + shadow_pgoff = all_pages - shadow_pages - globals_pages; + __asan_shadow_memory_dynamic_address = shadow_pgoff * __PAGE_SIZE; + + /* + * Allocate the last (1/ASAN_SHADOW_SCALE)th of an arena's pages for the map + * We find the offset and size from the arena map. + * + * The allocated map pages are zeroed out, meaning all memory is marked as valid + * even if it's not allocated already. This is expected: Since the actual memory + * pages are not allocated, accesses to it will trigger page faults and will be + * reported through BPF streams. Any pages allocated through bpf_arena_alloc_pages + * should be poisoned by the allocator right after the call succeeds. 
+ */ + shadow_map = (u64)bpf_arena_alloc_pages( + &arena, (void __arena *)__asan_shadow_memory_dynamic_address, + shadow_pages, NUMA_NO_NODE, 0); + if (!shadow_map) { + arena_stderr("Could not allocate shadow map\n"); + + __asan_shadow_memory_dynamic_address = 0; + + return -ENOMEM; + } + + asan_inited = true; + + return 0; +} + +#pragma clang attribute pop + +#endif /* BPF_ARENA_ASAN */ + +__weak char _license[] SEC("license") = "GPL"; diff --git a/tools/testing/selftests/bpf/libarena/src/buddy.bpf.c b/tools/testing/selftests/bpf/libarena/src/buddy.bpf.c new file mode 100644 index 000000000000..c674ee5cfcc1 --- /dev/null +++ b/tools/testing/selftests/bpf/libarena/src/buddy.bpf.c @@ -0,0 +1,903 @@ +// SPDX-License-Identifier: LGPL-2.1 OR BSD-2-Clause +/* Copyright (c) 2026 Meta Platforms, Inc. and affiliates. */ + +#include <libarena/common.h> +#include <libarena/asan.h> +#include <libarena/buddy.h> + +/* + * Buddy allocator arena-based implementation. + * + * Memory is organized into chunks. These chunks + * cannot be coalesced or split. Allocating + * chunks allocates their memory eagerly. + * + * Internally, each chunk is organized into blocks. + * Blocks _can_ be coalesced/split, but only inside + * the chunk. Each block can be allocated or + * unallocated. If allocated, the entire block holds + * user data. If unallocated, the block is mostly + * invalid memory, with the exception of a header + * used for freelist tracking. + * + * The header is placed at an offset inside the block + * to prevent off-by-one errors from the previous block + * from trivially overwriting the header. Such an error + * is also not catchable by ASAN, since the header remains + * valid memory even after the block is freed. It is still + * theoretically possible for the header to be corrupted + * without being caught by ASAN, but harder. 
+ * + * Since the allocator needs to track order information for + * both allocated and free blocks, and allocated blocks cannot + * store a header, the allocator also stores per-chunk order + * information in a reserved region at the beginning of the + * chunk. The header includes a bitmap with the order of blocks + * and their allocation state. It also includes the freelist + * heads for the allocation itself. + */ + + +enum { + BUDDY_POISONED = (s8)0xef, + + /* Number of pages to be allocated per chunk. */ + BUDDY_CHUNK_PAGES = BUDDY_CHUNK_BYTES / __PAGE_SIZE +}; + +static inline int buddy_lock(struct buddy __arena *buddy) +{ + return arena_spin_lock(&buddy->lock); +} + +static inline void buddy_unlock(struct buddy __arena *buddy) +{ + arena_spin_unlock(&buddy->lock); +} + +/* + * Reserve part of the arena address space for the allocator. We use + * this to get aligned addresses for the chunks, since the arena + * page alloc kfuncs do not support aligning to a boundary (in this + * case 1 MiB, see buddy.h on how this is derived). + */ +static int buddy_reserve_arena_vaddr(struct buddy __arena *buddy) +{ + buddy->vaddr = 0; + + return bpf_arena_reserve_pages(&arena, + (void __arena *)BUDDY_VADDR_OFFSET, + BUDDY_VADDR_SIZE / __PAGE_SIZE); +} + +/* + * Free up any unused address space. Used only during teardown. + */ +static void buddy_unreserve_arena_vaddr(struct buddy __arena *buddy) +{ + bpf_arena_free_pages( + &arena, (void __arena *)(BUDDY_VADDR_OFFSET + buddy->vaddr), + (BUDDY_VADDR_SIZE - buddy->vaddr) / __PAGE_SIZE); + + buddy->vaddr = 0; +} + +/* + * Carve out part of the reserved address space and hand it over + * to the buddy allocator. + * + * We are assuming the buddy allocator is the only allocator in the + * system, so there is no race between this function reserving a + * page range and some other allocator actually making the BPF call + * to really create and reserve it. 
+ * + * However, bump allocation must still be atomic because this function + * is called without the buddy lock from multiple threads concurrently. + */ +__weak int buddy_alloc_arena_vaddr(struct buddy __arena *buddy, u64 *vaddrp) +{ + u64 vaddr, old, new; + + if (!buddy || !vaddrp) + return -EINVAL; + + do { + vaddr = buddy->vaddr; + new = vaddr + BUDDY_CHUNK_BYTES; + + if (new > BUDDY_VADDR_SIZE) + return -EINVAL; + + old = __sync_val_compare_and_swap(&buddy->vaddr, vaddr, new); + } while (old != vaddr && can_loop); + + if (old != vaddr) + return -EINVAL; + + *vaddrp = BUDDY_VADDR_OFFSET + vaddr; + + return 0; +} + +/* + * Round n up to the nearest power of two; a power of two is returned + * unchanged and n == 0 yields 0. + */ +static u64 arena_next_pow2(__u64 n) +{ + n--; + n |= n >> 1; + n |= n >> 2; + n |= n >> 4; + n |= n >> 8; + n |= n >> 16; + n |= n >> 32; + n++; + + return n; +} + +/* + * Set or clear the allocation bit of block idx. Fails with -EINVAL if idx is + * out of range or the bit already has the requested value (double alloc/free). + */ +__weak +int idx_set_allocated(struct buddy_chunk __arena *chunk, u64 idx, bool allocated) +{ + bool already_allocated; + + if (unlikely(idx >= BUDDY_CHUNK_ITEMS)) { + /* idx is a u64: print it with %llu like idx_is_allocated() does. */ + arena_stderr("setting state of invalid idx (%llu, max %d)\n", idx, + BUDDY_CHUNK_ITEMS); + return -EINVAL; + } + + already_allocated = chunk->allocated[idx / 8] & (1 << (idx % 8)); + if (unlikely(already_allocated == allocated)) { + arena_stderr("Double %s of idx %llu for chunk %p", + allocated ? 
"alloc" : "free", + idx, chunk); + return -EINVAL; + } + + if (allocated) + chunk->allocated[idx / 8] |= 1 << (idx % 8); + else + chunk->allocated[idx / 8] &= ~(1 << (idx % 8)); + + return 0; +} + +/* + * Read the allocation bit of block idx into *allocated. Returns -EINVAL and + * leaves *allocated untouched if idx is out of range, 0 otherwise. + */ +static int idx_is_allocated(struct buddy_chunk __arena *chunk, u64 idx, bool *allocated) +{ + if (unlikely(idx >= BUDDY_CHUNK_ITEMS)) { + arena_stderr("getting state of invalid idx (%llu, max %d)\n", idx, + BUDDY_CHUNK_ITEMS); + return -EINVAL; + } + + *allocated = chunk->allocated[idx / 8] & (1 << (idx % 8)); + return 0; +} + +/* + * Record the order of block idx. Returns -EINVAL if either the order or the + * index is out of range, 0 otherwise. + */ +__weak +int idx_set_order(struct buddy_chunk __arena *chunk, u64 idx, u8 order) +{ + u8 prev_order; + + if (unlikely(order >= BUDDY_CHUNK_NUM_ORDERS)) { + arena_stderr("setting invalid order %u\n", order); + return -EINVAL; + } + + if (unlikely(idx >= BUDDY_CHUNK_ITEMS)) { + /* %d would only print the low 32 bits of the u64 index. */ + arena_stderr("setting order of invalid idx (%llu, max %d)\n", idx, + BUDDY_CHUNK_ITEMS); + return -EINVAL; + } + + /* + * We store two order instances per byte, one per nibble. + * Retain the existing nibble. + */ + prev_order = chunk->orders[idx / 2]; + if (idx & 0x1) { + order &= 0xf; + order |= (prev_order & 0xf0); + } else { + order <<= 4; + order |= (prev_order & 0xf); + } + + chunk->orders[idx / 2] = order; + + return 0; +} + +/* + * Return the recorded order of block idx, or BUDDY_CHUNK_NUM_ORDERS (an + * invalid order) if idx is out of range. + */ +static u8 idx_get_order(struct buddy_chunk __arena *chunk, u64 idx) +{ + u8 result; + + _Static_assert(BUDDY_CHUNK_NUM_ORDERS <= 16, + "order must fit in 4 bits"); + + if (unlikely(idx >= BUDDY_CHUNK_ITEMS)) { + arena_stderr("getting order of invalid idx %llu\n", idx); + return BUDDY_CHUNK_NUM_ORDERS; + } + + result = chunk->orders[idx / 2]; + + return (idx & 0x1) ? (result & 0xf) : (result >> 4); +} + +static void __arena *idx_to_addr(struct buddy_chunk __arena *chunk, size_t idx) +{ + u64 address; + + if (unlikely(idx >= BUDDY_CHUNK_ITEMS)) { + arena_stderr("translating invalid idx %lu\n", idx); + return NULL; + } + + /* + * The data blocks start in the chunk after the metadata block. 
+ * We find the actual address by indexing into the region at an + * BUDDY_MIN_ALLOC_BYTES granularity, the minimum allowed. + * The index number already accounts for the fact that the first + * blocks in the chunk are occupied by the metadata, so we do + * not need to offset it. + */ + + address = (u64)chunk + (idx * BUDDY_MIN_ALLOC_BYTES); + + return (void __arena *)address; +} + +static struct buddy_header __arena *idx_to_header(struct buddy_chunk __arena *chunk, size_t idx) +{ + bool allocated; + u64 address; + + if (unlikely(idx_is_allocated(chunk, idx, &allocated))) { + arena_stderr("accessing invalid idx 0x%lx\n", idx); + return NULL; + } + + if (unlikely(allocated)) { + arena_stderr("accessing allocated idx 0x%lx as header\n", idx); + return NULL; + } + + address = (u64)idx_to_addr(chunk, idx); + if (!address) + return NULL; + + /* + * Offset the header within the block. This avoids accidental overwrites + * to the header because of off-by-one errors when using adjacent blocks. + * + * The offset has been chosen as a compromise between ASAN effectiveness + * and allocator granularity: + * 1) ASAN dictates valid data runs are 8-byte aligned. + * 2) We want to keep a low minimum allocation size (currently 16). + * + * As a result, we have only two possible positions for the header: Bytes + * 0 and 8. Keeping the header in byte 0 means off-by-ones from the previous + * block touch the header, and, since the header must be accessible, ASAN + * will not trigger. Keeping the header on byte 8 means off-by-one errors from + * the previous block are caught by ASAN. Negative offsets are rarer, so + * while accesses into the block from the next block are possible, they are + * less probable. 
+ */ + + return (struct buddy_header __arena *)(address + BUDDY_HEADER_OFF); +} + +static void header_add_freelist(struct buddy_chunk __arena *chunk, struct buddy_header __arena *header, + u64 idx, u8 order) +{ + struct buddy_header __arena *tmp_header; + + idx_set_order(chunk, idx, order); + + header->next_index = chunk->freelists[order]; + header->prev_index = BUDDY_CHUNK_ITEMS; + + if (header->next_index != BUDDY_CHUNK_ITEMS) { + tmp_header = idx_to_header(chunk, header->next_index); + tmp_header->prev_index = idx; + } + + chunk->freelists[order] = idx; +} + +static void header_remove_freelist(struct buddy_chunk __arena *chunk, + struct buddy_header __arena *header, u8 order) +{ + struct buddy_header __arena *tmp_header; + + if (header->prev_index != BUDDY_CHUNK_ITEMS) { + tmp_header = idx_to_header(chunk, header->prev_index); + tmp_header->next_index = header->next_index; + } + + if (header->next_index != BUDDY_CHUNK_ITEMS) { + tmp_header = idx_to_header(chunk, header->next_index); + tmp_header->prev_index = header->prev_index; + } + + /* Pop off the list head if necessary. */ + if (idx_to_header(chunk, chunk->freelists[order]) == header) + chunk->freelists[order] = header->next_index; + + header->prev_index = BUDDY_CHUNK_ITEMS; + header->next_index = BUDDY_CHUNK_ITEMS; +} + +static u64 size_to_order(size_t size) +{ + u64 order; + + /* + * Legal sizes are [1, 4GiB] (the biggest possible arena). + * Of course, sizes close to GiB are practically impossible + * to fulfill and allocation will fail, but that's taken care + * of by the caller. + */ + + if (unlikely(size == 0 || size > (1UL << 32))) { + arena_stderr("illegal size request %lu\n", size); + return 64; + } + /* + * To find the order of the allocation we find the first power of two + * >= the requested size, take the log2, then adjust it for the minimum + * allocation size by removing the minimum shift from it. Requests + * smaller than the minimum allocation size are rounded up. 
+ */ + order = arena_fls(arena_next_pow2(size)) - 1; + if (order < BUDDY_MIN_ALLOC_SHIFT) + return 0; + + return order - BUDDY_MIN_ALLOC_SHIFT; +} + +__weak +int add_leftovers_to_freelist(struct buddy_chunk __arena *chunk, u32 cur_idx, + u64 min_order, u64 max_order) +{ + struct buddy_header __arena *header; + u64 ord; + u32 idx; + + for (ord = min_order; ord < max_order && can_loop; ord++) { + /* Mark the buddy as free and add it to the freelists. */ + idx = cur_idx + (1 << ord); + + header = idx_to_header(chunk, idx); + if (unlikely(!header)) { + arena_stderr("idx %u has no header", idx); + return -EINVAL; + } + + asan_unpoison(header, sizeof(*header)); + + header_add_freelist(chunk, header, idx, ord); + } + + return 0; +} + +static struct buddy_chunk __arena *buddy_chunk_get(struct buddy __arena *buddy) +{ + u64 order, ord, min_order, max_order; + struct buddy_chunk __arena *chunk; + size_t left; + int power2; + u64 vaddr; + u32 idx; + int ret; + + /* + * Step 1: Allocate a properly aligned chunk, and + * prep it for insertion into the buddy allocator. + * We don't need the allocator lock until step 2. + */ + + ret = buddy_alloc_arena_vaddr(buddy, &vaddr); + if (ret) + return NULL; + + /* Addresses must be aligned to the chunk boundary. */ + if (vaddr % BUDDY_CHUNK_BYTES) + return NULL; + + /* Unreserve the address space. */ + bpf_arena_free_pages(&arena, (void __arena *)vaddr, + BUDDY_CHUNK_PAGES); + + chunk = bpf_arena_alloc_pages(&arena, (void __arena *)vaddr, + BUDDY_CHUNK_PAGES, NUMA_NO_NODE, 0); + if (!chunk) { + arena_stderr("[ALLOC FAILED]"); + return NULL; + } + + if (buddy_lock(buddy)) { + /* + * We cannot reclaim the vaddr space, but that is ok - this + * operation should always succeed. The error path is to catch + * accidental deadlocks that will cause -ENOMEMs to the program as + * the allocator fails to refill itself, in which case vaddr usage + * is the least of our worries. 
+ */ + bpf_arena_free_pages(&arena, (void __arena *)vaddr, BUDDY_CHUNK_PAGES); + return NULL; + } + + asan_poison(chunk, BUDDY_POISONED, BUDDY_CHUNK_PAGES * __PAGE_SIZE); + + /* Unpoison the chunk metadata itself. */ + asan_unpoison(chunk, sizeof(*chunk)); + + /* Mark all freelists as empty. */ + for (ord = zero; ord < BUDDY_CHUNK_NUM_ORDERS && can_loop; ord++) + chunk->freelists[ord] = BUDDY_CHUNK_ITEMS; + + /* + * Initialize the chunk by carving out a page range to hold the metadata + * struct above, then dumping the rest of the pages into the allocator. + */ + + _Static_assert(BUDDY_CHUNK_PAGES * __PAGE_SIZE >= + BUDDY_MIN_ALLOC_BYTES * + BUDDY_CHUNK_ITEMS, + "chunk must fit within the allocation"); + + /* + * Step 2: Reserve a chunk for the chunk metadata, then break + * the rest of the full allocation into the different buckets. + * We allocate the memory by grabbing blocks of progressively + * smaller sizes from the allocator, which are guaranteed to be + * contiguous. + * + * This operation also populates the allocator. + * + * Algorithm: + * + * - max_order: The last order allocation we made + * - left: How many bytes are left to allocate + * - cur_index: Current index into the top-level block we are + * allocating from (idx in the code below). + * + * Step 3: + * - Find the largest power-of-2 allocation not larger than left (infimum) + * - Reserve a chunk of that size, along with its buddy + * - For every order from [infimum + 1, last order), carve out a block + * and put it into the allocator. + * + * Example: Chunk size 0b1010000 (80 bytes) + * + * Step 1: + * + * idx infimum 1 << max_order + * 0 64 128 1 << 20 + * |________|_________|______________________| + * + * Blocks set aside: + * [0, 64) - Completely allocated + * [64, 128) - Will be further split in the next iteration + * + * Blocks added to the allocator: + * [128, 256) + * [256, 512) + * ... 
+ * [1 << 18, 1 << 19) + * [1 << 19, 1 << 20) + * + * Step 2: + * + * idx infimum idx + 1 << max_order + * 64 80 96 64 + 1 << 6 = 128 + * |________|_________|______________________| + * + * Blocks set aside: + * [64, 80) - Completely allocated + * + * Blocks added to the allocator: + * [80, 96) - left == 0 so the buddy is unused and marked as freed + * [96, 128) + */ + max_order = BUDDY_CHUNK_NUM_ORDERS; + left = sizeof(*chunk); + idx = 0; + while (left && can_loop) { + power2 = arena_fls(left) - 1; + /* + * Note: The condition below only triggers to catch serious bugs + * early. There is no sane way to undo any block insertions from + * the allocated chunk, so just leak any leftover allocations, + * emit a diagnostic, unlock and exit. + * + */ + if (unlikely(power2 >= BUDDY_CHUNK_NUM_ORDERS)) { + arena_stderr( + "buddy chunk metadata require allocation of order %d\n", + power2); + arena_stderr( + "chunk has size of 0x%lx bytes (left %lx bytes)\n", + sizeof(*chunk), left); + buddy_unlock(buddy); + + return NULL; + } + + /* Round up allocations that are too small. */ + + left -= (power2 >= BUDDY_MIN_ALLOC_SHIFT) ? 1 << power2 : left; + order = (power2 >= BUDDY_MIN_ALLOC_SHIFT) ? power2 - BUDDY_MIN_ALLOC_SHIFT : 0; + + if (idx_set_allocated(chunk, idx, true)) { + buddy_unlock(buddy); + return NULL; + } + + /* + * Starting an order above the one we allocated, populate + * the allocator with free blocks. If this is the last + * allocation (left == 0), also mark the buddy as free. + * + * See comment above about error handling: The error path + * is only there as a way to mitigate deeply buggy allocator + * states by emitting a diagnostic in add_leftovers_to_freelist() + * and leaking any memory not added in the freelists. + */ + min_order = left ? order + 1 : order; + if (add_leftovers_to_freelist(chunk, idx, min_order, max_order)) { + buddy_unlock(buddy); + return NULL; + } + + /* Adjust the index. 
*/ + idx += 1 << order; + max_order = order; + } + + buddy_unlock(buddy); + + return chunk; +} + +__weak int buddy_init(struct buddy __arena *buddy) +{ + struct buddy_chunk __arena *chunk; + int ret; + + if (!asan_ready()) + return -EINVAL; + + /* Reserve enough address space to ensure allocations are aligned. */ + ret = buddy_reserve_arena_vaddr(buddy); + if (ret) + return ret; + + _Static_assert(BUDDY_CHUNK_PAGES > 0, + "chunk must use one or more pages"); + + chunk = buddy_chunk_get(buddy); + + if (buddy_lock(buddy)) { + bpf_arena_free_pages(&arena, chunk, BUDDY_CHUNK_PAGES); + return -EINVAL; + } + + /* Chunk is already properly unpoisoned if allocated. */ + if (chunk) + chunk->next = buddy->first_chunk; + + /* Put the chunk at the beginning of the list. */ + buddy->first_chunk = chunk; + + buddy_unlock(buddy); + + return chunk ? 0 : -ENOMEM; +} + +/* + * Destroy the allocator. This does not check whether there are any allocations + * currently in use, so any pages being accessed will start taking arena faults. + * We do not take a lock because we are freeing arena pages, and nobody should + * be using the allocator at that point in the execution. + */ +__weak int buddy_destroy(struct buddy __arena *buddy) +{ + struct buddy_chunk __arena *chunk, *next; + + if (!buddy) + return -EINVAL; + + /* + * Traverse all buddy chunks and free them back to the arena + * with the same granularity they were allocated with. + */ + for (chunk = buddy->first_chunk; chunk && can_loop; chunk = next) { + next = chunk->next; + + /* Wholesale poison the entire block. */ + asan_poison(chunk, BUDDY_POISONED, + BUDDY_CHUNK_PAGES * __PAGE_SIZE); + bpf_arena_free_pages(&arena, chunk, BUDDY_CHUNK_PAGES); + } + + /* Free up any part of the address space that did not get used. */ + buddy_unreserve_arena_vaddr(buddy); + + /* Clear all fields. 
*/ + buddy->first_chunk = NULL; + + return 0; +} + +__weak u64 buddy_chunk_alloc(struct buddy_chunk __arena *chunk, int order_req) +{ + struct buddy_header __arena *header, *tmp_header, *next_header; + u32 idx, tmpidx, retidx; + u64 address; + u64 order = 0; + u64 i; + + for (order = order_req; order < BUDDY_CHUNK_NUM_ORDERS && can_loop; order++) { + if (chunk->freelists[order] != BUDDY_CHUNK_ITEMS) + break; + } + + if (order >= BUDDY_CHUNK_NUM_ORDERS) + return (u64)NULL; + + retidx = chunk->freelists[order]; + header = idx_to_header(chunk, retidx); + if (unlikely(!header)) + return (u64) NULL; + + chunk->freelists[order] = header->next_index; + + if (header->next_index != BUDDY_CHUNK_ITEMS) { + next_header = idx_to_header(chunk, header->next_index); + next_header->prev_index = BUDDY_CHUNK_ITEMS; + } + + header->prev_index = BUDDY_CHUNK_ITEMS; + header->next_index = BUDDY_CHUNK_ITEMS; + if (idx_set_order(chunk, retidx, order_req)) + return (u64)NULL; + + if (idx_set_allocated(chunk, retidx, true)) + return (u64)NULL; + + /* + * Do not unpoison the address yet, will be done by the caller + * because the caller has the exact allocation size requested. + */ + address = (u64)idx_to_addr(chunk, retidx); + if (!address) + return (u64)NULL; + + /* If we allocated from a larger-order chunk, split the buddies. */ + for (i = order_req; i < order && can_loop; i++) { + /* + * Flip the bit for the current order (the bit is guaranteed + * to be 0, so just add 1 << i). + */ + idx = retidx + (1 << i); + + /* Add the buddy of the allocation to the free list. */ + header = idx_to_header(chunk, idx); + /* Unpoison the buddy header */ + asan_unpoison(header, sizeof(*header)); + + if (idx_set_order(chunk, idx, i)) + return (u64)NULL; + + /* Push the header to the beginning of the freelists list. 
*/ + tmpidx = chunk->freelists[i]; + + header->prev_index = BUDDY_CHUNK_ITEMS; + header->next_index = tmpidx; + + if (tmpidx != BUDDY_CHUNK_ITEMS) { + tmp_header = idx_to_header(chunk, tmpidx); + tmp_header->prev_index = idx; + } + + chunk->freelists[i] = idx; + } + + return address; +} + +/* Scan the existing chunks for available memory. */ +static u64 buddy_alloc_from_existing_chunks(struct buddy __arena *buddy, int order) +{ + struct buddy_chunk __arena *chunk; + u64 address; + + for (chunk = buddy->first_chunk; chunk != NULL && can_loop; + chunk = chunk->next) { + address = buddy_chunk_alloc(chunk, order); + if (address) + return address; + } + + return (u64)NULL; +} + +/* + * Try an allocation from a newly allocated chunk. Also + * incorporate the chunk into the linked list. + */ +static u64 buddy_alloc_from_new_chunk(struct buddy __arena *buddy, struct buddy_chunk __arena *chunk, int order) +{ + u64 address; + + if (buddy_lock(buddy)) + return (u64)NULL; + + + /* + * Add the chunk into the allocator and try + * to allocate specifically from that chunk. + */ + chunk->next = buddy->first_chunk; + buddy->first_chunk = chunk; + + address = buddy_chunk_alloc(buddy->first_chunk, order); + + buddy_unlock(buddy); + + return (u64)address; +} +__weak +void __arena *buddy_alloc(struct buddy __arena *buddy, size_t size) +{ + void __arena *address = NULL; + struct buddy_chunk __arena *chunk; + int order; + + if (!buddy) + return NULL; + + order = size_to_order(size); + if (order >= BUDDY_CHUNK_NUM_ORDERS || order < 0) { + arena_stderr("invalid order %d (sz %lu)\n", order, size); + return NULL; + } + + if (buddy_lock(buddy)) + return NULL; + + address = (u8 __arena *)buddy_alloc_from_existing_chunks(buddy, order); + buddy_unlock(buddy); + if (address) + goto done; + + /* Get a new chunk. */ + chunk = buddy_chunk_get(buddy); + if (chunk) + address = (u8 __arena *)buddy_alloc_from_new_chunk(buddy, chunk, order); + +done: + /* If we failed to allocate memory, return NULL. 
*/ + if (!address) + return NULL; + + /* + * Unpoison exactly the amount of bytes requested. If the + * data is smaller than the header, we must poison any + * unused bytes that were part of the header. + */ + if (size < BUDDY_HEADER_OFF + sizeof(struct buddy_header __arena)) + asan_poison(address + BUDDY_HEADER_OFF, BUDDY_POISONED, + sizeof(struct buddy_header __arena)); + + asan_unpoison(address, size); + + return address; +} + +static __always_inline int buddy_free_unlocked(struct buddy __arena *buddy, u64 addr) +{ + struct buddy_header __arena *header, *buddy_header; + u64 idx, buddy_idx, tmp_idx; + struct buddy_chunk __arena *chunk; + bool allocated; + u8 order; + int ret; + + if (!buddy) + return -EINVAL; + + if (addr & (BUDDY_MIN_ALLOC_BYTES - 1)) { + arena_stderr("Freeing unaligned address %llx\n", addr); + return -EINVAL; + } + + /* Get (chunk, idx) out of the address. */ + chunk = (void __arena *)(addr & ~BUDDY_CHUNK_OFFSET_MASK); + idx = (addr & BUDDY_CHUNK_OFFSET_MASK) / BUDDY_MIN_ALLOC_BYTES; + + /* Mark the block as unallocated so we can access the header. */ + ret = idx_set_allocated(chunk, idx, false); + if (ret) + return ret; + + order = idx_get_order(chunk, idx); + header = idx_to_header(chunk, idx); + + /* The header is in the block itself, keep it unpoisoned. */ + asan_poison((u8 __arena *)addr, BUDDY_POISONED, + BUDDY_MIN_ALLOC_BYTES << order); + asan_unpoison(header, sizeof(*header)); + + /* + * Coalescing loop. Merge with free buddies of equal order. + * For every coalescing step, keep the left buddy and + * drop the right buddy's header. + */ + for (; order < BUDDY_CHUNK_NUM_ORDERS && can_loop; order++) { + buddy_idx = idx ^ (1 << order); + + /* Check if the buddy is actually free. */ + idx_is_allocated(chunk, buddy_idx, &allocated); + if (allocated) + break; + + /* + * If buddy is not the same order as the chunk + * being freed, then we're done coalescing. 
+ */ + if (idx_get_order(chunk, buddy_idx) != order) + break; + + buddy_header = idx_to_header(chunk, buddy_idx); + header_remove_freelist(chunk, buddy_header, order); + + /* Keep the left header out of the two buddies, drop the other one. */ + if (buddy_idx < idx) { + tmp_idx = idx; + idx = buddy_idx; + buddy_idx = tmp_idx; + } + + /* Remove the buddy from the freelists so that we can merge it. */ + idx_set_order(chunk, buddy_idx, order); + + buddy_header = idx_to_header(chunk, buddy_idx); + asan_poison(buddy_header, BUDDY_POISONED, + sizeof(*buddy_header)); + } + + /* Header properly freed but not in any freelists yet .*/ + idx_set_order(chunk, idx, order); + + header = idx_to_header(chunk, idx); + header_add_freelist(chunk, header, idx, order); + + return 0; +} + +__weak int buddy_free(struct buddy __arena *buddy, void __arena *addr) +{ + int ret; + + if (!buddy) + return -EINVAL; + + /* Freeing NULL is a valid no-op. */ + if (!addr) + return 0; + + ret = buddy_lock(buddy); + if (ret) + return ret; + + ret = buddy_free_unlocked(buddy, (u64)addr); + + buddy_unlock(buddy); + + return ret; +} + +__weak char _license[] SEC("license") = "GPL"; diff --git a/tools/testing/selftests/bpf/libarena/src/common.bpf.c b/tools/testing/selftests/bpf/libarena/src/common.bpf.c new file mode 100644 index 000000000000..50be57213dfb --- /dev/null +++ b/tools/testing/selftests/bpf/libarena/src/common.bpf.c @@ -0,0 +1,52 @@ +// SPDX-License-Identifier: LGPL-2.1 OR BSD-2-Clause +/* Copyright (c) 2026 Meta Platforms, Inc. and affiliates. 
*/ +#include <libarena/common.h> +#include <libarena/asan.h> +#include <libarena/buddy.h> + +const volatile u32 zero = 0; + +struct buddy __arena buddy; + +int arena_fls(__u64 word) +{ + if (!word) + return 0; + + return 64 - __builtin_clzll(word); +} + +SEC("syscall") +__weak int arena_get_info(struct arena_get_info_args *args) +{ + args->arena_base = arena_base(&arena); + + return 0; +} + +SEC("syscall") +__weak int arena_alloc_reserve(struct arena_alloc_reserve_args *args) +{ + return bpf_arena_reserve_pages(&arena, NULL, args->nr_pages); +} + +SEC("syscall") +__weak int arena_buddy_reset(void) +{ + buddy_destroy(&buddy); + + return buddy_init(&buddy); +} + +__weak void __arena *arena_malloc(size_t size) +{ + return buddy_alloc(&buddy, size); +} + +__weak void arena_free(void __arena *ptr) +{ + buddy_free(&buddy, ptr); +} + + +char _license[] SEC("license") = "GPL"; diff --git a/tools/testing/selftests/bpf/libarena/src/rbtree.bpf.c b/tools/testing/selftests/bpf/libarena/src/rbtree.bpf.c new file mode 100644 index 000000000000..7f0f6dc3e17d --- /dev/null +++ b/tools/testing/selftests/bpf/libarena/src/rbtree.bpf.c @@ -0,0 +1,1047 @@ +// SPDX-License-Identifier: LGPL-2.1 OR BSD-2-Clause +/* + * Copyright (c) 2025-2026 Meta Platforms, Inc. and affiliates. 
+ * Copyright (c) 2025-2026 Emil Tsalapatis <emil@etsalapatis.com> + */ + +#include <libarena/common.h> + +#include <libarena/asan.h> +#include <libarena/rbtree.h> + +int rb_integrity_check(struct rbtree __arena *rbtree); +void rbnode_print(size_t depth, struct rbnode __arena *rbn); +static int rbnode_replace(struct rbtree __arena *rbtree, + struct rbnode __arena *existing, + struct rbnode __arena *replacement); + +struct rbtree __arena *rb_create(enum rbtree_alloc alloc, + enum rbtree_insert_mode insert) +{ + struct rbtree __arena *rbtree; + + rbtree = arena_malloc(sizeof(*rbtree)); + if (unlikely(!rbtree)) + return NULL; + + /* + * RB_UPDATE overwrites existing values in the nodes, but RB_NOALLOC + * trees manage the tree nodes directly (including holding pointers + * to them). Disallow mixing the two modes to avoid dealing with + * unintuitive semantics. + */ + if (alloc == RB_NOALLOC && insert == RB_UPDATE) { + arena_stderr("WARNING: Cannot combine RB_NOALLOC and RB_UPDATE"); + arena_free(rbtree); + return NULL; + } + + rbtree->alloc = alloc; + rbtree->insert = insert; + rbtree->root = NULL; + + return rbtree; +} + +__weak +int rb_destroy(struct rbtree __arena *rbtree) +{ + int ret = 0; + + arena_subprog_init(); + + if (unlikely(!rbtree)) + return -EINVAL; + + if (rbtree->alloc == RB_NOALLOC) { + /* + * We cannot do anything about RB_NOALLOC nodes. The whole + * point of RB_NOALLOC is that the nodes are directly owned + * by the caller that allocates and inserts them. We could + * unilaterally grab all nodes and free them anyway, but that + * would almost certainly cause UAF as the callers keep accessing + * the now freed nodes. Throw an error instead. 
+ */ + if (rbtree->root) { + arena_stderr("WARNING: Destroying RB_NOALLOC tree with > 0 nodes"); + return -EBUSY; + } + + goto out; + } + + while (rbtree->root && can_loop) { + ret = rb_remove(rbtree, rbtree->root->key); + if (ret) + break; + } + +out: + arena_free(rbtree); + return ret; +} + +static inline int rbnode_dir(struct rbnode __arena *node) +{ + /* Arbitrarily choose a direction for the root. */ + if (unlikely(!node->parent)) + return 0; + + return (node->parent->left == node) ? 0 : 1; +} + +/* + * The __noinline is to prevent inlining from bloating the add and + * remove calls, in turn causing register spills and increasing + * stack usage above what is permitted. + */ +__noinline +int rbnode_rotate(struct rbtree __arena *rbtree, + struct rbnode __arena *node, int dir) +{ + struct rbnode __arena *tmp, *parent; + int parentdir; + + parent = node->parent; + if (parent) + parentdir = rbnode_dir(node); + + /* If we're doing a root change, are we the root? */ + if (unlikely(!parent && rbtree->root != node)) + return -EINVAL; + + /* + * Does the node we're turning into the new root exist? + * Note that the new root is on the opposite side of the + * rotation's direction. + */ + tmp = node->child[1 - dir]; + if (unlikely(!tmp)) + return -EINVAL; + + /* Steal the closest child of the new root. */ + node->child[1 - dir] = tmp->child[dir]; + if (node->child[1 - dir]) + node->child[1 - dir]->parent = node; + + /* Put the node below the new root. */ + tmp->child[dir] = node; + node->parent = tmp; + + tmp->parent = parent; + if (parent) + parent->child[parentdir] = tmp; + else + rbtree->root = tmp; + + return 0; +} + +static +struct rbnode __arena *rbnode_find(struct rbnode __arena *subtree, u64 key) +{ + struct rbnode __arena *node = subtree; + int dir; + + if (!subtree) + return NULL; + + while (can_loop) { + if (node->key == key) + break; + + dir = (key < node->key) ? 
0 : 1; + + if (!node->child[dir]) + break; + + node = node->child[dir]; + } + + return node; +} + +static +struct rbnode __arena *rbnode_least_upper_bound(struct rbnode __arena *subtree, uint64_t key) +{ + struct rbnode __arena *node = subtree; + int dir; + + if (!subtree) + return NULL; + + while (can_loop) { + dir = (key <= node->key) ? 0 : 1; + + if (!node->child[dir]) + break; + + node = node->child[dir]; + } + + return node; +} + +__weak +int rb_find(struct rbtree __arena *rbtree, u64 key, u64 *value) +{ + struct rbnode __arena *node; + + if (unlikely(!rbtree)) + return -EINVAL; + + if (unlikely(!value)) + return -EINVAL; + + node = rbnode_find(rbtree->root, key); + if (!node || node->key != key) + return -ENOENT; + + *value = node->value; + + return 0; +} + +__weak +struct rbnode __arena *rb_node_alloc(u64 key, u64 value) +{ + struct rbnode __arena *rbnode = NULL; + + rbnode = (struct rbnode __arena *)arena_malloc(sizeof(*rbnode)); + if (!rbnode) + return NULL; + + /* + * WARNING: The order of assignments is weird on purpose. + * See comment in rb_insert_node() for more context. + * TL;DR: Prevent consecutive 0 assignments from being + * promoted into an unverifiable memset by the compiler. 
+ */ + + rbnode->key = key; + rbnode->parent = NULL; + rbnode->value = value; + rbnode->left = NULL; + rbnode->is_red = true; + rbnode->right = NULL; + + return rbnode; +} + +__weak +void rb_node_free(struct rbnode __arena *rbnode) +{ + arena_free(rbnode); +} + +static +int rb_node_insert(struct rbtree __arena *rbtree, + struct rbnode __arena *node) +{ + struct rbnode __arena *grandparent, *parent = rbtree->root; + u64 key = node->key; + struct rbnode __arena *uncle; + int dir; + int ret; + + if (unlikely(!rbtree)) + return -EINVAL; + + if (!parent) { + rbtree->root = node; + return 0; + } + + if (rbtree->insert != RB_DUPLICATE) + parent = rbnode_find(parent, key); + else + parent = rbnode_least_upper_bound(parent, key); + + if (key == parent->key && rbtree->insert != RB_DUPLICATE) { + if (rbtree->insert == RB_UPDATE) { + /* + * Replace the old node with the new one. + * Free up the old node. + */ + ret = rbnode_replace(rbtree, parent, node); + if (ret) + return ret; + + if (rbtree->alloc == RB_ALLOC) + rb_node_free(parent); + + return 0; + } + + /* Otherwise it's RB_DEFAULT. */ + return -EALREADY; + } + + node->parent = parent; + /* Also works if key == parent->key. */ + if (key <= parent->key) + parent->left = node; + else + parent->right = node; + + while (can_loop) { + parent = node->parent; + if (!parent) + return 0; + + if (!parent->is_red) + return 0; + + grandparent = parent->parent; + if (!grandparent) { + parent->is_red = false; + return 0; + } + + dir = rbnode_dir(parent); + uncle = grandparent->child[1 - dir]; + + if (!uncle || !uncle->is_red) { + if (node == parent->child[1 - dir]) { + rbnode_rotate(rbtree, parent, dir); + node = parent; + parent = grandparent->child[dir]; + } + + rbnode_rotate(rbtree, grandparent, 1 - dir); + parent->is_red = false; + grandparent->is_red = true; + + return 0; + } + + /* Uncle is red. 
*/ + + parent->is_red = false; + uncle->is_red = false; + grandparent->is_red = true; + + node = grandparent; + } + + return 0; +} + +int rb_insert_node(struct rbtree __arena *rbtree, + struct rbnode __arena *node) +{ + if (unlikely(!rbtree)) + return -EINVAL; + + if (unlikely(rbtree->alloc == RB_ALLOC)) + return -EINVAL; + + node->left = NULL; + + /* + * Workaround to break an optimization that causes + * verification failures on some compilers. Assignments + * of the kind + * + * *(r0 + 0) = 0; + * *(r0 + 8) = 0; + * *(r0 + 16) = 0; + * + * get promoted into a memset, and that in turn is not + * handled properly for arena memory by LLVM 21 and GCC 15. + * Add a barrier for now to prevent the assignments from being fused. + */ + barrier(); + + node->parent = NULL; + node->right = NULL; + + node->is_red = true; + + return rb_node_insert(rbtree, node); +} + +__weak +int rb_insert(struct rbtree __arena *rbtree, u64 key, u64 value) +{ + struct rbnode __arena *node; + int ret; + + if (unlikely(!rbtree)) + return -EINVAL; + + if (unlikely(rbtree->alloc != RB_ALLOC)) + return -EINVAL; + + node = rb_node_alloc(key, value); + if (!node) + return -ENOMEM; + + ret = rb_node_insert(rbtree, node); + if (ret) { + rb_node_free(node); + return ret; + } + + return 0; +} + +static inline struct rbnode __arena *rbnode_least(struct rbnode __arena *subtree) +{ + while (subtree->left && can_loop) + subtree = subtree->left; + + return subtree; +} + +__weak int rb_least(struct rbtree __arena *rbtree, u64 *key, u64 *value) +{ + struct rbnode __arena *least; + + if (unlikely(!rbtree)) + return -EINVAL; + + if (!rbtree->root) + return -ENOENT; + + least = rbnode_least(rbtree->root); + if (key) + *key = least->key; + if (value) + *value = least->value; + + return 0; +} + + +/* + * If we are referencing ourselves, a and b have a parent-child relation, + * and we should be pointing at the other node instead. 
+ */ +static inline void rbnode_fixup_pointers(struct rbnode __arena *a, + struct rbnode __arena *b) +{ +#define fixup(n1, n2, member) do { if (n1->member == n1) n1->member = n2; } while (0) + fixup(a, b, left); + fixup(a, b, right); + fixup(a, b, parent); +#undef fixup +} + +static inline void rbnode_swap_values(struct rbnode __arena *a, + struct rbnode __arena *b) +{ +#define swap(n1, n2, tmp) do { (tmp) = (n1); (n1) = (n2); (n2) = (tmp); } while (0) + struct rbnode __arena *tmpnode; + u64 tmp; + + /* Swap the pointers. */ + swap(a->is_red, b->is_red, tmp); + + swap(a->left, b->left, tmpnode); + swap(a->right, b->right, tmpnode); + swap(a->parent, b->parent, tmpnode); +#undef swap + + /* Account for the nodes being parent and child. */ + rbnode_fixup_pointers(b, a); + rbnode_fixup_pointers(a, b); +} + +static inline void rbnode_adjust_neighbors(struct rbtree __arena *rbtree, + struct rbnode __arena *node, int dir) +{ + if (node->left) + node->left->parent = node; + if (node->right) + node->right->parent = node; + + if (node->parent) { + node->parent->child[dir] = node; + return; + } + + rbtree->root = node; +} + +/* + * Directly replace an existing node with a replacement. The replacement node + * should not already be in the tree. + */ +static int rbnode_replace(struct rbtree __arena *rbtree, + struct rbnode __arena *existing, + struct rbnode __arena *replacement) +{ + int dir = 0; + + if (unlikely(replacement->parent || replacement->left || replacement->right)) + return -EINVAL; + + if (existing->parent) + dir = rbnode_dir(existing); + + replacement->is_red = existing->is_red; + replacement->left = existing->left; + replacement->right = existing->right; + replacement->parent = existing->parent; + + /* Fix up the new node's neighbors. */ + rbnode_adjust_neighbors(rbtree, replacement, dir); + + return 0; +} + +/* + * Switch two nodes in the tree in place. This is useful during node deletion. 
+ * This is more involved than switching the values of the two nodes because we + * must update all tree pointers. + */ +static void rbnode_switch(struct rbtree __arena *rbtree, + struct rbnode __arena *a, + struct rbnode __arena *b) +{ + int adir = 0, bdir = 0; + + /* + * Store the direction in the parent because we will not + * be able to recompute it once we start swapping values. + */ + if (a->parent) + adir = rbnode_dir(a); + + if (b->parent) + bdir = rbnode_dir(b); + + rbnode_swap_values(a, b); + + /* + * Fix up the pointers from the children/parent to the + * new nodes. + */ + rbnode_adjust_neighbors(rbtree, a, bdir); + rbnode_adjust_neighbors(rbtree, b, adir); +} + +static inline int rbnode_remove_node_single_child(struct rbtree __arena *rbtree, + struct rbnode __arena *node, + bool free) +{ + struct rbnode __arena *child; + int dir; + + if (unlikely(node->is_red)) { + arena_stderr("Node unexpectedly red\n"); + return -EINVAL; + } + + child = node->left ? node->left : node->right; + if (unlikely(!child->is_red)) { + arena_stderr("Only child is black\n"); + return -EINVAL; + } + + /* + * Since it's the immediate child, we can just + * remove the parent. + */ + child->parent = node->parent; + + if (node->parent) { + dir = rbnode_dir(node); + node->parent->child[dir] = child; + } else { + rbtree->root = child; + } + + /* Color the child black. */ + child->is_red = false; + + /* Only free if called from rb_remove. 
*/ + if (free) + rb_node_free(node); + + return 0; +} + +static inline bool rbnode_has_red_children(struct rbnode __arena *node) +{ + if (node->left && node->left->is_red) + return true; + + return node->right && node->right->is_red; +} + +static +int rb_node_remove(struct rbtree __arena *rbtree, + struct rbnode __arena *node) +{ + struct rbnode __arena *parent, *sibling, *close_nephew, *distant_nephew; + bool free = (rbtree->alloc == RB_ALLOC); + struct rbnode __arena *replace, *initial; + bool is_red; + int dir; + + /* Both children present, replace with next largest key. */ + if (node->left && node->right) { + /* + * Swap the node itself instead of just the + * key/value pair to account for nodes embedded + * in other structs. + */ + + replace = rbnode_least(node->right); + rbnode_switch(rbtree, replace, node); + + /* + * FALLTHROUGH: We moved the node we are removing to + * the leftmost position of the subtree. We can now + * remove it as if it was always where we moved it to. + */ + } + + initial = node; + + /* Only one child present, replace with child and paint it black. */ + if (!node->left != !node->right) + return rbnode_remove_node_single_child(rbtree, node, free); + + /* (!node->left && !node->right) */ + + parent = node->parent; + if (!parent) { + /* Check that we're _actually_ the root. */ + if (rbtree->root == node) + rbtree->root = NULL; + else + arena_stderr("WARNING: Attempting to remove detached node from rbtree\n"); + + if (free) + rb_node_free(node); + return 0; + } + + dir = rbnode_dir(node); + parent->child[dir] = NULL; + is_red = node->is_red; + + if (free) + rb_node_free(node); + + /* If we removed a red node, we did not unbalance the tree.*/ + if (is_red) + return 0; + + sibling = parent->child[1 - dir]; + if (unlikely(!sibling)) { + arena_stderr("rbtree: removed black node has no sibling\n"); + return -EINVAL; + } + + /* + * We removed a black node, causing a change in path + * weight. Start rebalancing. 
The invariant is that + * all paths going through the node are shortened + * by one, and the current node is black. + */ + while (can_loop) { + + /* Balancing reached the root, there can be no imbalance. */ + if (!parent) + return 0; + + /* + * We already determined the dir, either above or + * at the end of the loop. + */ + + /* + * If we have no sibling, the tree was + * already unbalanced. + */ + sibling = parent->child[1 - dir]; + if (unlikely(!sibling)) { + arena_stderr("rbtree: removed black node has no sibling\n"); + return -EINVAL; + } + + /* Sibling is red, turn it into the grandparent. */ + if (sibling->is_red) { + /* + * Sibling is red. Transform the tree to turn + * the sibling into the parent's position, and + * repaint them. This does not balance the tree + * but makes it so we know the sibling is black + * and so can use the transformations to balance. + */ + rbnode_rotate(rbtree, parent, dir); + parent->is_red = true; + sibling->is_red = false; + + /* Our new sibling is now the close nephew. */ + sibling = parent->child[1 - dir]; + /* If the new sibling has any red children, break out. */ + if (rbnode_has_red_children(sibling)) + break; + + /* We can repaint the sibling and parent, we're done. */ + sibling->is_red = true; + parent->is_red = false; + + return 0; + } + + /* Sibling guaranteed to be black. If it has red children, break out. */ + if (rbnode_has_red_children(sibling)) + break; + + /* + * Both sibling and children are black. If parent is red, swap + * colors with the sibling. Otherwise, fall through to the all-black case below. + */ + if (parent->is_red) { + parent->is_red = false; + sibling->is_red = true; + return 0; + } + + /* + * Parent, sibling, and all its children are black. Repaint the sibling. + * This shortens the paths through it, so pop up a level in the + * tree and repeat the balancing. 
+ */ + sibling->is_red = true; + node = parent; + parent = node->parent; + dir = rbnode_dir(node); + } + + if (node != initial) { + dir = rbnode_dir(node); + parent = node->parent; + sibling = parent->child[1-dir]; + } + /* + * Almost there. We know between the parent, sibling, + * and nephews only one or two of the nephews are red. If + * it is the close one, rotate it to the sibling position, + * paint it black, and paint the previous sibling red. + */ + + close_nephew = sibling->child[dir]; + distant_nephew = sibling->child[1 - dir]; + + /* + * If the distant nephew is not red, rotate + * and repaint. We need the distant nephew + * to be red. We know the close nephew is red + * because at least one of them is, so the + * distant one is black if it exists. + */ + if (!distant_nephew || !distant_nephew->is_red) { + rbnode_rotate(rbtree, sibling, 1 - dir); + sibling->is_red = true; + close_nephew->is_red = false; + distant_nephew = sibling; + sibling = close_nephew; + } + + /* + * We now know it's the distant nephew that's red. + * Rotate the sibling into our parent's position + * and paint both black. 
+ */ + + rbnode_rotate(rbtree, parent, dir); + sibling->is_red = parent->is_red; + parent->is_red = false; + distant_nephew->is_red = false; + + return 0; +} + +__weak +int rb_remove_node(struct rbtree __arena *rbtree, + struct rbnode __arena *node) +{ + if (unlikely(!rbtree)) + return -EINVAL; + + if (unlikely(rbtree->alloc == RB_ALLOC)) + return -EINVAL; + + return rb_node_remove(rbtree, node); +} + +__weak +int rb_remove(struct rbtree __arena *rbtree, u64 key) +{ + struct rbnode __arena *node; + + if (unlikely(!rbtree)) + return -EINVAL; + + if (unlikely(rbtree->alloc != RB_ALLOC)) + return -EINVAL; + + if (!rbtree->root) + return -ENOENT; + + node = rbnode_find(rbtree->root, key); + if (!node || node->key != key) + return -ENOENT; + + return rb_node_remove(rbtree, node); +} + +__weak +int rb_pop(struct rbtree __arena *rbtree, u64 *key, u64 *value) +{ + struct rbnode __arena *node; + + if (unlikely(!rbtree)) + return -EINVAL; + + if (!rbtree->root) + return -ENOENT; + + if (rbtree->alloc != RB_ALLOC) + return -EINVAL; + + node = rbnode_least(rbtree->root); + if (unlikely(!node)) + return -ENOENT; + + if (key) + *key = node->key; + if (value) + *value = node->value; + + return rb_node_remove(rbtree, node); +} + +inline void rbnode_print(size_t depth, struct rbnode __arena *rbn) +{ + arena_stderr("[DEPTH %d] %p (%s)\n PARENT %p", depth, rbn, rbn->is_red ? 
"red" : "black", rbn->parent); + arena_stderr("\tKV (%ld, %ld)\n LEFT %p RIGHT %p]\n", rbn->key, rbn->value, rbn->left, rbn->right); +} + +enum rb_print_state { + RB_NONE_VISITED, + RB_LEFT_VISITED, + RB_RIGHT_VISITED, +}; + +__weak +enum rb_print_state rb_print_next_state(struct rbnode __arena *rbnode, + enum rb_print_state state, u64 *next) +{ + if (unlikely(!next)) + return RB_NONE_VISITED; + + switch (state) { + case RB_NONE_VISITED: + if (rbnode->left) { + *next = (u64)rbnode->left; + state = RB_LEFT_VISITED; + break; + } + + /* FALLTHROUGH */ + + case RB_LEFT_VISITED: + if (rbnode->right) { + *next = (u64)rbnode->right; + state = RB_RIGHT_VISITED; + break; + } + + /* FALLTHROUGH */ + + default: + *next = 0; + state = RB_RIGHT_VISITED; + } + + return state; +} + +__weak +int rb_print_pop_up(struct rbnode __arena **rbnodep, u8 *depthp, enum rb_print_state (*stack)[RB_MAXLVL_PRINT], enum rb_print_state *state) +{ + struct rbnode __arena *rbnode; + volatile u8 depth; + int j; + + if (unlikely(!rbnodep || !depthp || !stack || !state)) + return -EINVAL; + + rbnode = *rbnodep; + depth = *depthp; + + for (j = 0; j < RB_MAXLVL_PRINT && can_loop; j++) { + if (*state != RB_RIGHT_VISITED) + break; + + depth -= 1; + if (depth < 0 || depth >= RB_MAXLVL_PRINT) + break; + + *state = (*stack)[depth % RB_MAXLVL_PRINT]; + rbnode = rbnode->parent; + } + + *rbnodep = rbnode; + *depthp = depth; + + return 0; +} + +__weak +int rb_print(struct rbtree __arena *rbtree) +{ + enum rb_print_state stack[RB_MAXLVL_PRINT]; + struct rbnode __arena *rbnode = rbtree->root; + enum rb_print_state state; + struct rbnode __arena *next; + u64 next_addr; + u8 depth; + int ret; + + if (unlikely(!rbtree)) + return -EINVAL; + + depth = 0; + state = RB_NONE_VISITED; + + arena_stderr("=== RB TREE START ===\n"); + + if (!rbtree->root) + goto out; + + /* Even with can_loop, the verifier doesn't like infinite loops. 
*/ + while (can_loop) { + if (state == RB_NONE_VISITED) + rbnode_print(depth, rbnode); + + /* Find which child to traverse next. */ + state = rb_print_next_state(rbnode, state, &next_addr); + next = (struct rbnode __arena *)next_addr; + + /* Child found. Store the node state and go on. */ + if (next) { + if (depth < 0 || depth >= RB_MAXLVL_PRINT) + return 0; + + stack[depth++] = state; + + rbnode = next; + state = RB_NONE_VISITED; + + continue; + } + + /* Otherwise, go as far up as possible. */ + ret = rb_print_pop_up(&rbnode, &depth, &stack, &state); + if (ret) + return -EINVAL; + + if (depth < 0 || depth >= RB_MAXLVL_PRINT) { + arena_stderr("=== RB TREE END (depth %d\n)===", depth); + return 0; + } + + } + +out: + arena_stderr("=== RB TREE END ===\n"); + + return 0; +} + +__weak +int rb_integrity_check(struct rbtree __arena *rbtree) +{ + enum rb_print_state stack[RB_MAXLVL_PRINT]; + struct rbnode __arena *rbnode = rbtree->root; + enum rb_print_state state; + struct rbnode __arena *next; + u64 next_addr; + u8 depth; + int ret; + + if (unlikely(!rbtree)) + return -EINVAL; + + if (!rbtree->root) + return 0; + + depth = 0; + state = RB_NONE_VISITED; + + /* Even with can_loop, the verifier doesn't like infinite loops. */ + while (can_loop) { + if (rbnode->parent && rbnode->parent->left != rbnode + && rbnode->parent->right != rbnode) { + arena_stderr("WARNING: Inconsistent tree. 
Parent %p has no child %p\n", rbnode->parent, rbnode); + return -EINVAL; + } + + if (rbnode->parent == rbnode) { + arena_stderr("WARNING: Inconsistent tree, node %p is its own parent\n", rbnode); + return -EINVAL; + } + + if (rbnode->left == rbnode) { + arena_stderr("WARNING: Inconsistent tree, node %p is its own left child\n", rbnode); + return -EINVAL; + } + + if (rbnode->right == rbnode) { + arena_stderr("WARNING: Inconsistent tree, node %p is its own right child\n", rbnode); + return -EINVAL; + } + + if (rbnode->is_red) { + if (rbnode->left && rbnode->left->is_red) { + arena_stderr("WARNING: Inconsistent tree. Parent has %p has red child %p\n", rbnode, rbnode->left); + return -EINVAL; + } + if (rbnode->right && rbnode->right->is_red) { + arena_stderr("WARNING: Inconsistent tree. Parent has %p has red child %p\n", rbnode, rbnode->right); + return -EINVAL; + } + } else if (rbnode->parent && rbnode->parent->child[1 - rbnode_dir(rbnode)] == NULL) { + arena_stderr("WARNING: Inconsistent tree. Black node %p has no sibling\n", rbnode); + return -EINVAL; + } + + /* Find which child to traverse next. */ + state = rb_print_next_state(rbnode, state, &next_addr); + next = (struct rbnode __arena *)next_addr; + + /* Child found. Store the node state and go on. */ + if (next) { + if (depth < 0 || depth >= RB_MAXLVL_PRINT) + return 0; + + stack[depth++] = state; + + rbnode = next; + state = RB_NONE_VISITED; + + continue; + } + + /* Otherwise, go as far up as possible. 
*/ + ret = rb_print_pop_up(&rbnode, &depth, &stack, &state); + if (ret) + return -EINVAL; + + if (depth < 0 || depth >= RB_MAXLVL_PRINT) { + return 0; + } + + } + + return 0; +} diff --git a/tools/testing/selftests/bpf/libarena/src/spmc.bpf.c b/tools/testing/selftests/bpf/libarena/src/spmc.bpf.c new file mode 100644 index 000000000000..42732b7d29a6 --- /dev/null +++ b/tools/testing/selftests/bpf/libarena/src/spmc.bpf.c @@ -0,0 +1,234 @@ +// SPDX-License-Identifier: LGPL-2.1 OR BSD-2-Clause +/* + * Copyright (c) 2025-2026 Meta Platforms, Inc. and affiliates. + * Copyright (c) 2025-2026 Emil Tsalapatis <etsal@meta.com> + */ + +#include <bpf_atomic.h> + +#include <libarena/common.h> + +#include <libarena/asan.h> +#include <libarena/spmc.h> + +static inline +u64 spmc_arr_size(volatile struct spmc_arr __arena *spmc_arr) +{ + return SPMC_ARR_BASESZ << spmc_arr->order; +} + +static inline +u64 spmc_arr_get(volatile struct spmc_arr __arena *spmc_arr, u64 ind) +{ + u64 ret = READ_ONCE(spmc_arr->data[ind % spmc_arr_size(spmc_arr)]); + + return ret; +} + +static inline +void spmc_arr_put(volatile struct spmc_arr __arena *spmc_arr, u64 ind, u64 value) +{ + WRITE_ONCE(spmc_arr->data[ind % spmc_arr_size(spmc_arr)], value); +} + +static inline +void spmc_arr_copy(volatile struct spmc_arr __arena *dst, + volatile struct spmc_arr __arena *src, u64 b, u64 t) +{ + u64 i; + + for (i = t; i < b && can_loop; i++) + spmc_arr_put(dst, i, spmc_arr_get(src, i)); +} + +static inline +int spmc_order_init(struct spmc __arena *spmc, int order) +{ + volatile struct spmc_arr __arena *arr = &spmc->arr[order]; + + if (unlikely(!spmc)) + return -EINVAL; + + if (order >= SPMC_ARR_ORDERS) + return -E2BIG; + + /* Already allocated? 
*/ + if (arr->data) + return 0; + + arr->data = arena_malloc((SPMC_ARR_BASESZ << order) * sizeof(*arr->data)); + if (!arr->data) + return -ENOMEM; + + return 0; +} + +__weak +int spmc_owned_add(struct spmc __arena *spmc, u64 val) +{ + volatile struct spmc_arr __arena *newarr; + volatile struct spmc_arr __arena *arr; + ssize_t sz; + u64 b, t; + int ret; + + if (unlikely(!spmc)) + return -EINVAL; + + /* + * Bottom must always be read first, also + * see spmc_steal(). + */ + b = smp_load_acquire(&spmc->bottom); + t = READ_ONCE(spmc->top); + arr = READ_ONCE(spmc->cur); + + sz = b - t; + if (sz >= spmc_arr_size(arr) - 1) { + ret = spmc_order_init(spmc, arr->order + 1); + if (ret) + return ret; + + newarr = &spmc->arr[arr->order + 1]; + + spmc_arr_copy(newarr, arr, b, t); + smp_store_release(&spmc->cur, newarr); + arr = newarr; + } + + spmc_arr_put(arr, b, val); + smp_store_release(&spmc->bottom, b + 1); + + return 0; +} + + +__weak +int spmc_owned_remove(struct spmc __arena *spmc, u64 *val) +{ + volatile struct spmc_arr __arena *arr; + int ret = 0; + ssize_t sz; + u64 value; + u64 b, t; + + if (unlikely(!spmc || !val)) + return -EINVAL; + + b = READ_ONCE(spmc->bottom) - 1; + WRITE_ONCE(spmc->bottom, b); + smp_mb(); + + t = READ_ONCE(spmc->top); + arr = READ_ONCE(spmc->cur); + + sz = b - t; + if (sz < 0) { + WRITE_ONCE(spmc->bottom, t); + return -ENOENT; + } + + value = spmc_arr_get(arr, b); + if (sz > 0) { + *val = value; + return 0; + } + + if (cmpxchg(&spmc->top, t, t + 1) != t) + ret = -EAGAIN; + + WRITE_ONCE(spmc->bottom, t + 1); + + if (ret) + return ret; + + *val = value; + + return 0; +} + +__weak +int spmc_steal(struct spmc __arena *spmc, u64 *val) +{ + volatile struct spmc_arr __arena *arr; + ssize_t sz; + u64 value; + u64 b, t; + + if (unlikely(!spmc || !val)) + return -EINVAL; + + /* + * It is important that t is read before b for + * stealers to avoid racing with the owner. 
+ * Races between stealers are dealt with using + * CAS to increment the top value below. + */ + t = smp_load_acquire(&spmc->top); + b = smp_load_acquire(&spmc->bottom); + + sz = b - t; + if (sz <= 0) + return -ENOENT; + + arr = smp_load_acquire(&spmc->cur); + value = spmc_arr_get(arr, t); + + if (cmpxchg(&spmc->top, t, t + 1) != t) + return -EAGAIN; + + *val = value; + + return 0; +} + + +__weak +struct spmc __arena *spmc_create(void) +{ + /* + * Marked as volatile because otherwise the array + * reference in the internal loop gets demoted to + * scalar and the program fails verification. + */ + struct spmc __arena *volatile spmc; + int ret, i; + + spmc = arena_malloc(sizeof(*spmc)); + if (!spmc) + return NULL; + + spmc->bottom = 0; + spmc->top = 0; + + for (i = 0; i < SPMC_ARR_ORDERS && can_loop; i++) { + spmc->arr[i].data = NULL; + spmc->arr[i].order = i; + } + + ret = spmc_order_init((struct spmc __arena *)spmc, 0); + if (ret) { + arena_free(spmc); + return NULL; + } + + spmc->cur = &spmc->arr[0]; + + return (struct spmc __arena *)spmc; +} + +__weak +int spmc_destroy(struct spmc __arena *spmc) +{ + int i; + + if (unlikely(!spmc)) + return -EINVAL; + + for (i = 0; i < SPMC_ARR_ORDERS && can_loop; i++) + arena_free(spmc->arr[i].data); + + arena_free(spmc); + + return 0; +} diff --git a/tools/testing/selftests/bpf/prog_tests/arena_direct_value.c b/tools/testing/selftests/bpf/prog_tests/arena_direct_value.c new file mode 100644 index 000000000000..4b4adb3f4b71 --- /dev/null +++ b/tools/testing/selftests/bpf/prog_tests/arena_direct_value.c @@ -0,0 +1,73 @@ +// SPDX-License-Identifier: GPL-2.0 + +#include <test_progs.h> +#include <bpf/bpf.h> +#include <errno.h> +#include <sys/mman.h> +#include <unistd.h> + +#define ARENA_PAGES 32 + +static char log_buf[16384]; + +static void test_arena_direct_value_one_past_end(void) +{ + char expected[128]; + __u32 arena_sz = ARENA_PAGES * getpagesize(); + struct bpf_insn insns[] = { + BPF_LD_IMM64_RAW(BPF_REG_1, 
BPF_PSEUDO_MAP_VALUE, 0), + BPF_MOV64_IMM(BPF_REG_0, 0), + BPF_EXIT_INSN(), + }; + LIBBPF_OPTS(bpf_map_create_opts, map_opts); + LIBBPF_OPTS(bpf_prog_load_opts, prog_opts); + void *arena; + int map_fd, prog_fd; + + map_opts.map_flags = BPF_F_MMAPABLE; + prog_opts.log_buf = log_buf; + prog_opts.log_size = sizeof(log_buf); + prog_opts.log_level = 1; + + map_fd = bpf_map_create(BPF_MAP_TYPE_ARENA, "arena_direct_value", + 0, 0, ARENA_PAGES, &map_opts); + if (map_fd < 0) { + if (errno == EOPNOTSUPP) { + test__skip(); + return; + } + ASSERT_GE(map_fd, 0, "bpf_map_create"); + return; + } + + arena = mmap(NULL, arena_sz, PROT_READ | PROT_WRITE, MAP_SHARED, map_fd, 0); + if (!ASSERT_NEQ(arena, MAP_FAILED, "arena_mmap")) + goto cleanup; + + insns[0].imm = map_fd; + insns[1].imm = arena_sz; + + prog_fd = bpf_prog_load(BPF_PROG_TYPE_RAW_TRACEPOINT, + "arena_direct_value", "GPL", insns, + ARRAY_SIZE(insns), &prog_opts); + if (!ASSERT_LT(prog_fd, 0, "prog_load")) { + close(prog_fd); + goto cleanup; + } + + snprintf(expected, sizeof(expected), + "invalid access to map value pointer, value_size=0 off=%u", + arena_sz); + ASSERT_HAS_SUBSTR(log_buf, expected, "verifier_log"); + +cleanup: + if (arena != MAP_FAILED) + munmap(arena, arena_sz); + close(map_fd); +} + +void test_arena_direct_value(void) +{ + if (test__start_subtest("one_past_end")) + test_arena_direct_value_one_past_end(); +} diff --git a/tools/testing/selftests/bpf/prog_tests/arena_spin_lock.c b/tools/testing/selftests/bpf/prog_tests/arena_spin_lock.c index 693fd86fbde6..acb9d53b5973 100644 --- a/tools/testing/selftests/bpf/prog_tests/arena_spin_lock.c +++ b/tools/testing/selftests/bpf/prog_tests/arena_spin_lock.c @@ -5,13 +5,6 @@ #include <sys/sysinfo.h> struct __qspinlock { int val; }; -typedef struct __qspinlock arena_spinlock_t; - -struct arena_qnode { - unsigned long next; - int count; - int locked; -}; #include "arena_spin_lock.skel.h" diff --git a/tools/testing/selftests/bpf/prog_tests/bpf_attr_size.c 
b/tools/testing/selftests/bpf/prog_tests/bpf_attr_size.c new file mode 100644 index 000000000000..87842c4347a6 --- /dev/null +++ b/tools/testing/selftests/bpf/prog_tests/bpf_attr_size.c @@ -0,0 +1,124 @@ +// SPDX-License-Identifier: GPL-2.0 +/* Copyright (c) 2026 Google LLC */ +#include <linux/bpf.h> +#include <unistd.h> +#include <sys/syscall.h> +#include <test_progs.h> +#include <cgroup_helpers.h> +#include "cgroup_skb_direct_packet_access.skel.h" + +#define OLD_QUERY_SIZE offsetofend(union bpf_attr, query.prog_cnt) +#define FULL_QUERY_SIZE offsetofend(union bpf_attr, query.revision) + +static void test_query_size_boundaries(void) +{ + struct cgroup_skb_direct_packet_access *skel; + struct bpf_link *link = NULL; + union bpf_attr attr; + int cg_fd = -1; + int err; + + skel = cgroup_skb_direct_packet_access__open_and_load(); + if (!ASSERT_OK_PTR(skel, "skel_load")) + return; + + cg_fd = test__join_cgroup("/attr_size_cg"); + if (!ASSERT_GE(cg_fd, 0, "join_cgroup")) + goto cleanup; + + link = bpf_program__attach_cgroup(skel->progs.direct_packet_access, + cg_fd); + if (!ASSERT_OK_PTR(link, "cg_attach")) + goto cleanup; + + memset(&attr, 0, sizeof(attr)); + attr.query.target_fd = cg_fd; + attr.query.attach_type = BPF_CGROUP_INET_INGRESS; + attr.query.revision = 0xdeadbeefdeadbeefULL; + + err = syscall(__NR_bpf, BPF_PROG_QUERY, &attr, OLD_QUERY_SIZE); + if (ASSERT_OK(err, "query_old_size")) { + ASSERT_EQ(attr.query.prog_cnt, 1, "prog_cnt_written_old"); + ASSERT_EQ(attr.query.revision, 0xdeadbeefdeadbeefULL, + "revision_not_written_old"); + } + + memset(&attr, 0, sizeof(attr)); + attr.query.target_fd = cg_fd; + attr.query.attach_type = BPF_CGROUP_INET_INGRESS; + + err = syscall(__NR_bpf, BPF_PROG_QUERY, &attr, FULL_QUERY_SIZE); + if (!ASSERT_OK(err, "query_full_size")) + goto cleanup; + + ASSERT_EQ(attr.query.prog_cnt, 1, "prog_cnt_written"); + ASSERT_GT(attr.query.revision, 0, "revision_written"); + +cleanup: + if (link) + bpf_link__destroy(link); + if (cg_fd >= 0) + 
close(cg_fd); + cgroup_skb_direct_packet_access__destroy(skel); +} + +static void test_map_info_tail_zero(void) +{ + LIBBPF_OPTS(bpf_map_create_opts, map_opts); + struct bpf_map_info_fake { + __u8 info[offsetofend(struct bpf_map_info, hash_size)]; + __u32 pad; + } info = { + .pad = 1, + }; + int map_fd, err; + __u32 info_len; + + map_fd = bpf_map_create(BPF_MAP_TYPE_ARRAY, "arr", sizeof(int), 1, 1, &map_opts); + if (!ASSERT_GE(map_fd, 0, "bpf_map_create")) + return; + + info_len = sizeof(info); + err = bpf_obj_get_info_by_fd(map_fd, &info, &info_len); + ASSERT_EQ(err, -E2BIG, "bpf_obj_get_info_by_fd"); + + close(map_fd); +} + +static void test_prog_info_tail_zero(void) +{ + LIBBPF_OPTS(bpf_prog_load_opts, prog_opts); + struct bpf_insn insns[] = { + BPF_MOV64_IMM(BPF_REG_0, 0), + BPF_EXIT_INSN(), + }; + struct bpf_prog_info_fake { + __u8 info[offsetofend(struct bpf_prog_info, attach_btf_id)]; + __u32 pad; + } info = { + .pad = 1, + }; + int prog_fd, err; + __u32 info_len; + + prog_fd = bpf_prog_load(BPF_PROG_TYPE_SOCKET_FILTER, "test_prog", "GPL", insns, + ARRAY_SIZE(insns), &prog_opts); + if (!ASSERT_GE(prog_fd, 0, "bpf_prog_load")) + return; + + info_len = sizeof(info); + err = bpf_obj_get_info_by_fd(prog_fd, &info, &info_len); + ASSERT_EQ(err, -E2BIG, "bpf_obj_get_info_by_fd"); + + close(prog_fd); +} + +void test_bpf_attr_size(void) +{ + if (test__start_subtest("query_size_boundaries")) + test_query_size_boundaries(); + if (test__start_subtest("map_info_tail_zero")) + test_map_info_tail_zero(); + if (test__start_subtest("prog_info_tail_zero")) + test_prog_info_tail_zero(); +} diff --git a/tools/testing/selftests/bpf/prog_tests/bpf_cookie.c b/tools/testing/selftests/bpf/prog_tests/bpf_cookie.c index 35adc3f6d443..fa484d00a7a5 100644 --- a/tools/testing/selftests/bpf/prog_tests/bpf_cookie.c +++ b/tools/testing/selftests/bpf/prog_tests/bpf_cookie.c @@ -252,10 +252,17 @@ cleanup: kprobe_multi__destroy(skel); } -/* defined in prog_tests/uprobe_multi_test.c */ -void 
uprobe_multi_func_1(void); -void uprobe_multi_func_2(void); -void uprobe_multi_func_3(void); +/* + * Weak uprobe target stubs. noinline is required because + * uprobe_multi_test_run() takes their addresses to configure the BPF + * program's attachment points; an inlined function has no stable + * address in the binary to probe. The strong definitions in + * uprobe_multi_test.c take precedence when that translation unit is + * linked. + */ +noinline __weak void uprobe_multi_func_1(void) { asm volatile (""); } +noinline __weak void uprobe_multi_func_2(void) { asm volatile (""); } +noinline __weak void uprobe_multi_func_3(void) { asm volatile (""); } static void uprobe_multi_test_run(struct uprobe_multi *skel) { @@ -574,8 +581,6 @@ cleanup: close(fmod_ret_fd); } -int stack_mprotect(void); - static void lsm_subtest(struct test_bpf_cookie *skel) { __u64 cookie; diff --git a/tools/testing/selftests/bpf/prog_tests/bpf_nf.c b/tools/testing/selftests/bpf/prog_tests/bpf_nf.c index 215878ea04de..14d4c1793aed 100644 --- a/tools/testing/selftests/bpf/prog_tests/bpf_nf.c +++ b/tools/testing/selftests/bpf/prog_tests/bpf_nf.c @@ -5,24 +5,26 @@ #include "test_bpf_nf.skel.h" #include "test_bpf_nf_fail.skel.h" +#define CT_OPTS_ERROR_GUARD 0x12345678 + static char log_buf[1024 * 1024]; struct { const char *prog_name; const char *err_msg; } test_bpf_nf_fail_tests[] = { - { "alloc_release", "kernel function bpf_ct_release args#0 expected pointer to STRUCT nf_conn but" }, - { "insert_insert", "kernel function bpf_ct_insert_entry args#0 expected pointer to STRUCT nf_conn___init but" }, - { "lookup_insert", "kernel function bpf_ct_insert_entry args#0 expected pointer to STRUCT nf_conn___init but" }, - { "set_timeout_after_insert", "kernel function bpf_ct_set_timeout args#0 expected pointer to STRUCT nf_conn___init but" }, - { "set_status_after_insert", "kernel function bpf_ct_set_status args#0 expected pointer to STRUCT nf_conn___init but" }, - { "change_timeout_after_alloc", "kernel 
function bpf_ct_change_timeout args#0 expected pointer to STRUCT nf_conn but" }, - { "change_status_after_alloc", "kernel function bpf_ct_change_status args#0 expected pointer to STRUCT nf_conn but" }, + { "alloc_release", "kernel function bpf_ct_release R1 expected pointer to STRUCT nf_conn but" }, + { "insert_insert", "kernel function bpf_ct_insert_entry R1 expected pointer to STRUCT nf_conn___init but" }, + { "lookup_insert", "kernel function bpf_ct_insert_entry R1 expected pointer to STRUCT nf_conn___init but" }, + { "set_timeout_after_insert", "kernel function bpf_ct_set_timeout R1 expected pointer to STRUCT nf_conn___init but" }, + { "set_status_after_insert", "kernel function bpf_ct_set_status R1 expected pointer to STRUCT nf_conn___init but" }, + { "change_timeout_after_alloc", "kernel function bpf_ct_change_timeout R1 expected pointer to STRUCT nf_conn but" }, + { "change_status_after_alloc", "kernel function bpf_ct_change_status R1 expected pointer to STRUCT nf_conn but" }, { "write_not_allowlisted_field", "no write support to nf_conn at off" }, - { "lookup_null_bpf_tuple", "Possibly NULL pointer passed to trusted arg1" }, - { "lookup_null_bpf_opts", "Possibly NULL pointer passed to trusted arg3" }, - { "xdp_lookup_null_bpf_tuple", "Possibly NULL pointer passed to trusted arg1" }, - { "xdp_lookup_null_bpf_opts", "Possibly NULL pointer passed to trusted arg3" }, + { "lookup_null_bpf_tuple", "Possibly NULL pointer passed to trusted R2" }, + { "lookup_null_bpf_opts", "Possibly NULL pointer passed to trusted R4" }, + { "xdp_lookup_null_bpf_tuple", "Possibly NULL pointer passed to trusted R2" }, + { "xdp_lookup_null_bpf_opts", "Possibly NULL pointer passed to trusted R4" }, }; enum { @@ -119,6 +121,10 @@ static void test_bpf_nf_ct(int mode) ASSERT_EQ(skel->bss->test_einval_reserved_new, -EINVAL, "Test EINVAL for reserved in new struct not set to 0"); ASSERT_EQ(skel->bss->test_einval_netns_id, -EINVAL, "Test EINVAL for netns_id < -1"); 
ASSERT_EQ(skel->bss->test_einval_len_opts, -EINVAL, "Test EINVAL for len__opts != NF_BPF_CT_OPTS_SZ"); + ASSERT_EQ(skel->bss->test_einval_len_opts_small_lookup, CT_OPTS_ERROR_GUARD, + "Test no error write for lookup opts__sz before error field"); + ASSERT_EQ(skel->bss->test_einval_len_opts_small_alloc, CT_OPTS_ERROR_GUARD, + "Test no error write for alloc opts__sz before error field"); ASSERT_EQ(skel->bss->test_eproto_l4proto, -EPROTO, "Test EPROTO for l4proto != TCP or UDP"); ASSERT_EQ(skel->bss->test_enonet_netns_id, -ENONET, "Test ENONET for bad but valid netns_id"); ASSERT_EQ(skel->bss->test_enoent_lookup, -ENOENT, "Test ENOENT for failed lookup"); diff --git a/tools/testing/selftests/bpf/prog_tests/bpf_qdisc.c b/tools/testing/selftests/bpf/prog_tests/bpf_qdisc.c index 730357cd0c9a..77f1c0550c9b 100644 --- a/tools/testing/selftests/bpf/prog_tests/bpf_qdisc.c +++ b/tools/testing/selftests/bpf/prog_tests/bpf_qdisc.c @@ -8,6 +8,10 @@ #include "bpf_qdisc_fifo.skel.h" #include "bpf_qdisc_fq.skel.h" #include "bpf_qdisc_fail__incompl_ops.skel.h" +#include "bpf_qdisc_fail__invalid_dynptr.skel.h" +#include "bpf_qdisc_fail__invalid_dynptr_slice.skel.h" +#include "bpf_qdisc_fail__invalid_dynptr_cross_frame.skel.h" +#include "bpf_qdisc_dynptr_use_after_invalidate_clone.skel.h" #define LO_IFINDEX 1 @@ -223,6 +227,10 @@ void test_ns_bpf_qdisc(void) test_qdisc_attach_to_non_root(); if (test__start_subtest("incompl_ops")) test_incompl_ops(); + RUN_TESTS(bpf_qdisc_fail__invalid_dynptr); + RUN_TESTS(bpf_qdisc_fail__invalid_dynptr_cross_frame); + RUN_TESTS(bpf_qdisc_fail__invalid_dynptr_slice); + RUN_TESTS(bpf_qdisc_dynptr_use_after_invalidate_clone); } void serial_test_bpf_qdisc_default(void) diff --git a/tools/testing/selftests/bpf/prog_tests/bpf_tcp_ca.c b/tools/testing/selftests/bpf/prog_tests/bpf_tcp_ca.c index f829b6f09bc9..fe30181e6336 100644 --- a/tools/testing/selftests/bpf/prog_tests/bpf_tcp_ca.c +++ b/tools/testing/selftests/bpf/prog_tests/bpf_tcp_ca.c @@ -112,6 
+112,10 @@ static void test_cubic(void) ASSERT_EQ(cubic_skel->bss->bpf_cubic_acked_called, 1, "pkts_acked called"); + ASSERT_TRUE(cubic_skel->bss->nodelay_init_reject, "init reject nodelay option"); + ASSERT_TRUE(cubic_skel->bss->nodelay_cwnd_event_tx_start_reject, + "cwnd_event_tx_start reject nodelay option"); + bpf_link__destroy(link); bpf_cubic__destroy(cubic_skel); } diff --git a/tools/testing/selftests/bpf/prog_tests/btf.c b/tools/testing/selftests/bpf/prog_tests/btf.c index 054ecb6b1e9f..66855cbd6b73 100644 --- a/tools/testing/selftests/bpf/prog_tests/btf.c +++ b/tools/testing/selftests/bpf/prog_tests/btf.c @@ -1924,11 +1924,11 @@ static struct btf_raw_test raw_tests[] = { }, { - .descr = "invalid BTF_INFO", + .descr = "invalid BTF kind", .raw_types = { /* int */ /* [1] */ BTF_TYPE_INT_ENC(0, BTF_INT_SIGNED, 0, 32, 4), - BTF_TYPE_ENC(0, 0x20000000, 4), + BTF_TYPE_ENC(0, 0x7f000000, 4), BTF_END_RAW, }, .str_sec = "", @@ -1941,7 +1941,7 @@ static struct btf_raw_test raw_tests[] = { .value_type_id = 1, .max_entries = 4, .btf_load_err = true, - .err_str = "Invalid btf_info", + .err_str = "Invalid kind", }, { @@ -4121,8 +4121,6 @@ static struct btf_raw_test raw_tests[] = { .key_type_id = 1, .value_type_id = 1, .max_entries = 1, - .btf_load_err = true, - .err_str = "Type tags don't precede modifiers", }, { .descr = "type_tag test #3, type tag order", @@ -4141,8 +4139,6 @@ static struct btf_raw_test raw_tests[] = { .key_type_id = 1, .value_type_id = 1, .max_entries = 1, - .btf_load_err = true, - .err_str = "Type tags don't precede modifiers", }, { .descr = "type_tag test #4, type tag order", @@ -4161,8 +4157,6 @@ static struct btf_raw_test raw_tests[] = { .key_type_id = 1, .value_type_id = 1, .max_entries = 1, - .btf_load_err = true, - .err_str = "Type tags don't precede modifiers", }, { .descr = "type_tag test #5, type tag order", @@ -4198,11 +4192,9 @@ static struct btf_raw_test raw_tests[] = { .map_name = "tag_type_check_btf", .key_size = sizeof(int), 
.value_size = 4, - .key_type_id = 1, - .value_type_id = 1, + .key_type_id = 4, + .value_type_id = 4, .max_entries = 1, - .btf_load_err = true, - .err_str = "Type tags don't precede modifiers", }, { .descr = "type_tag test #7, tag with kflag", @@ -4258,6 +4250,43 @@ static struct btf_raw_test raw_tests[] = { .max_entries = 1, }, +{ + .descr = "struct test repeated fields count overflow", + .raw_types = { + BTF_TYPE_INT_ENC(NAME_TBD, BTF_INT_SIGNED, 0, 32, 4), /* [1] */ + BTF_STRUCT_ENC(NAME_TBD, 0, 0), /* [2] */ + BTF_TYPE_TAG_ENC(NAME_TBD, 2), /* [3] */ + BTF_PTR_ENC(3), /* [4] */ + BTF_TYPE_ARRAY_ENC(4, 1, 1), /* [5] */ + BTF_STRUCT_ENC(NAME_TBD, 10, 8), /* [6] */ + BTF_MEMBER_ENC(NAME_TBD, 5, 0), + BTF_MEMBER_ENC(NAME_TBD, 5, 0), + BTF_MEMBER_ENC(NAME_TBD, 5, 0), + BTF_MEMBER_ENC(NAME_TBD, 5, 0), + BTF_MEMBER_ENC(NAME_TBD, 5, 0), + BTF_MEMBER_ENC(NAME_TBD, 5, 0), + BTF_MEMBER_ENC(NAME_TBD, 5, 0), + BTF_MEMBER_ENC(NAME_TBD, 5, 0), + BTF_MEMBER_ENC(NAME_TBD, 5, 0), + BTF_MEMBER_ENC(NAME_TBD, 5, 0), + BTF_TYPE_ARRAY_ENC(6, 1, 0x1999999aU), /* [7] */ + BTF_STRUCT_ENC(NAME_TBD, 2, 8 + 8 * 0x1999999aU), /* [8] */ + BTF_MEMBER_ENC(NAME_TBD, 4, 0), + BTF_MEMBER_ENC(NAME_TBD, 7, 64), + BTF_END_RAW, + }, + BTF_STR_SEC("\0int\0prog_test_ref_kfunc\0kptr_untrusted\0elem" + "\0p0\0p1\0p2\0p3\0p4\0p5\0p6\0p7\0p8\0p9" + "\0outer\0trigger\0elems"), + .map_type = BPF_MAP_TYPE_ARRAY, + .map_name = "repeat_fields", + .key_size = sizeof(int), + .value_size = 8 + 8 * 0x1999999aU, + .key_type_id = 1, + .value_type_id = 8, + .max_entries = 1, + .btf_load_err = true, +}, }; /* struct btf_raw_test raw_tests[] */ static const char *get_next_str(const char *start, const char *end) @@ -8092,7 +8121,7 @@ static struct btf_dedup_test dedup_tests[] = { static int btf_type_size(const struct btf_type *t) { int base_size = sizeof(struct btf_type); - __u16 vlen = BTF_INFO_VLEN(t->info); + __u32 vlen = BTF_INFO_VLEN(t->info); __u16 kind = BTF_INFO_KIND(t->info); switch (kind) { diff --git 
a/tools/testing/selftests/bpf/prog_tests/btf_dedup_split.c b/tools/testing/selftests/bpf/prog_tests/btf_dedup_split.c index 5bc15bb6b7ce..9d6161151593 100644 --- a/tools/testing/selftests/bpf/prog_tests/btf_dedup_split.c +++ b/tools/testing/selftests/bpf/prog_tests/btf_dedup_split.c @@ -20,18 +20,22 @@ static void test_split_simple() { btf__add_struct(btf1, "s1", 4); /* [3] struct s1 { */ btf__add_field(btf1, "f1", 1, 0, 0); /* int f1; */ /* } */ + btf__add_typedef(btf1, "t1", 1); /* [4] typedef int */ VALIDATE_RAW_BTF( btf1, "[1] INT 'int' size=4 bits_offset=0 nr_bits=32 encoding=SIGNED", "[2] PTR '(anon)' type_id=1", "[3] STRUCT 's1' size=4 vlen=1\n" - "\t'f1' type_id=1 bits_offset=0"); + "\t'f1' type_id=1 bits_offset=0", + "[4] TYPEDEF 't1' type_id=1"); ASSERT_STREQ(btf_type_c_dump(btf1), "\ struct s1 {\n\ int f1;\n\ -};\n\n", "c_dump"); +};\n\ +\n\ +typedef int t1;\n\n", "c_dump"); btf2 = btf__new_empty_split(btf1); if (!ASSERT_OK_PTR(btf2, "empty_split_btf")) @@ -49,39 +53,46 @@ struct s1 {\n\ ASSERT_EQ(btf_is_int(t), true, "int_kind"); ASSERT_STREQ(btf__str_by_offset(btf2, t->name_off), "int", "int_name"); - btf__add_struct(btf2, "s2", 16); /* [4] struct s2 { */ - btf__add_field(btf2, "f1", 6, 0, 0); /* struct s1 f1; */ - btf__add_field(btf2, "f2", 5, 32, 0); /* int f2; */ + btf__add_struct(btf2, "s2", 16); /* [5] struct s2 { */ + btf__add_field(btf2, "f1", 7, 0, 0); /* struct s1 f1; */ + btf__add_field(btf2, "f2", 6, 32, 0); /* int f2; */ btf__add_field(btf2, "f3", 2, 64, 0); /* int *f3; */ /* } */ /* duplicated int */ - btf__add_int(btf2, "int", 4, BTF_INT_SIGNED); /* [5] int */ + btf__add_int(btf2, "int", 4, BTF_INT_SIGNED); /* [6] int */ /* duplicated struct s1 */ - btf__add_struct(btf2, "s1", 4); /* [6] struct s1 { */ - btf__add_field(btf2, "f1", 5, 0, 0); /* int f1; */ + btf__add_struct(btf2, "s1", 4); /* [7] struct s1 { */ + btf__add_field(btf2, "f1", 6, 0, 0); /* int f1; */ /* } */ + /* duplicated typedef t1 */ + btf__add_typedef(btf2, "t1", 6); /* 
[8] typedef int */ + VALIDATE_RAW_BTF( btf2, "[1] INT 'int' size=4 bits_offset=0 nr_bits=32 encoding=SIGNED", "[2] PTR '(anon)' type_id=1", "[3] STRUCT 's1' size=4 vlen=1\n" "\t'f1' type_id=1 bits_offset=0", - "[4] STRUCT 's2' size=16 vlen=3\n" - "\t'f1' type_id=6 bits_offset=0\n" - "\t'f2' type_id=5 bits_offset=32\n" + "[4] TYPEDEF 't1' type_id=1", + "[5] STRUCT 's2' size=16 vlen=3\n" + "\t'f1' type_id=7 bits_offset=0\n" + "\t'f2' type_id=6 bits_offset=32\n" "\t'f3' type_id=2 bits_offset=64", - "[5] INT 'int' size=4 bits_offset=0 nr_bits=32 encoding=SIGNED", - "[6] STRUCT 's1' size=4 vlen=1\n" - "\t'f1' type_id=5 bits_offset=0"); + "[6] INT 'int' size=4 bits_offset=0 nr_bits=32 encoding=SIGNED", + "[7] STRUCT 's1' size=4 vlen=1\n" + "\t'f1' type_id=6 bits_offset=0", + "[8] TYPEDEF 't1' type_id=6"); ASSERT_STREQ(btf_type_c_dump(btf2), "\ struct s1 {\n\ int f1;\n\ };\n\ \n\ +typedef int t1;\n\ +\n\ struct s1___2 {\n\ int f1;\n\ };\n\ @@ -90,7 +101,9 @@ struct s2 {\n\ struct s1___2 f1;\n\ int f2;\n\ int *f3;\n\ -};\n\n", "c_dump"); +};\n\ +\n\ +typedef int t1___2;\n\n", "c_dump"); err = btf__dedup(btf2, NULL); if (!ASSERT_OK(err, "btf_dedup")) @@ -102,7 +115,8 @@ struct s2 {\n\ "[2] PTR '(anon)' type_id=1", "[3] STRUCT 's1' size=4 vlen=1\n" "\t'f1' type_id=1 bits_offset=0", - "[4] STRUCT 's2' size=16 vlen=3\n" + "[4] TYPEDEF 't1' type_id=1", + "[5] STRUCT 's2' size=16 vlen=3\n" "\t'f1' type_id=3 bits_offset=0\n" "\t'f2' type_id=1 bits_offset=32\n" "\t'f3' type_id=2 bits_offset=64"); @@ -112,6 +126,8 @@ struct s1 {\n\ int f1;\n\ };\n\ \n\ +typedef int t1;\n\ +\n\ struct s2 {\n\ struct s1 f1;\n\ int f2;\n\ @@ -487,9 +503,8 @@ static void test_split_module(void) for (i = 0; i < ARRAY_SIZE(mod_funcs); i++) { const struct btf_param *p; const struct btf_type *t; - __u16 vlen; + __u32 vlen, j; __u32 id; - int j; id = btf__find_by_name_kind(btf1, mod_funcs[i], BTF_KIND_FUNC); if (!ASSERT_GE(id, nr_base_types, "func_id")) diff --git 
a/tools/testing/selftests/bpf/prog_tests/btf_dump.c b/tools/testing/selftests/bpf/prog_tests/btf_dump.c index f1642794f70e..9f1b50e07a29 100644 --- a/tools/testing/selftests/bpf/prog_tests/btf_dump.c +++ b/tools/testing/selftests/bpf/prog_tests/btf_dump.c @@ -1027,8 +1027,8 @@ static void test_btf_dump_datasec_data(char *str) char license[4] = "GPL"; struct btf_dump *d; - btf = btf__parse("xdping_kern.bpf.o", NULL); - if (!ASSERT_OK_PTR(btf, "xdping_kern.bpf.o BTF not found")) + btf = btf__parse("xdp_dummy.bpf.o", NULL); + if (!ASSERT_OK_PTR(btf, "xdp_dummy.bpf.o BTF not found")) return; d = btf_dump__new(btf, btf_dump_snprintf, str, NULL); diff --git a/tools/testing/selftests/bpf/prog_tests/cb_refs.c b/tools/testing/selftests/bpf/prog_tests/cb_refs.c index c40df623a8f7..78566b817fd7 100644 --- a/tools/testing/selftests/bpf/prog_tests/cb_refs.c +++ b/tools/testing/selftests/bpf/prog_tests/cb_refs.c @@ -11,8 +11,8 @@ struct { const char *prog_name; const char *err_msg; } cb_refs_tests[] = { - { "underflow_prog", "must point to scalar, or struct with scalar" }, - { "leak_prog", "Possibly NULL pointer passed to helper arg2" }, + { "underflow_prog", "release kfunc bpf_kfunc_call_test_release expects referenced PTR_TO_BTF_ID passed to R1" }, + { "leak_prog", "Possibly NULL pointer passed to helper R2" }, { "nested_cb", "Unreleased reference id=4 alloc_insn=2" }, /* alloc_insn=2{4,5} */ { "non_cb_transfer_ref", "Unreleased reference id=4 alloc_insn=1" }, /* alloc_insn=1{1,2} */ }; diff --git a/tools/testing/selftests/bpf/prog_tests/cgroup_preorder.c b/tools/testing/selftests/bpf/prog_tests/cgroup_preorder.c index d4d583872fa2..d2ccf409dfe3 100644 --- a/tools/testing/selftests/bpf/prog_tests/cgroup_preorder.c +++ b/tools/testing/selftests/bpf/prog_tests/cgroup_preorder.c @@ -102,6 +102,82 @@ close_skel: return err; } +/* + * Replacing a link's program (bpf_link_update) must target the correct slot in + * the effective array even when a BPF_F_PREORDER program is attached 
to the + * same cgroup. All programs here are attached to a single cgroup; "parent" is + * reused only as a third distinct program. + * + * Attach child(1) normally and child_2(2) with BPF_F_PREORDER, so the effective + * order is [2, 1]. Then replace child(1)'s program with parent(3): only the + * non-preorder slot changes, giving [2, 3]. + */ +static int run_link_replace_test(int cgroup_fd, int sock_fd) +{ + LIBBPF_OPTS(bpf_link_create_opts, create_opts); + int err = 0, normal_link = -1, preorder_link = -1; + struct cgroup_preorder *skel = NULL; + enum bpf_attach_type atype; + __u8 *result, buf = 0x00; + socklen_t optlen = 1; + + skel = cgroup_preorder__open_and_load(); + if (!ASSERT_OK_PTR(skel, "cgroup_preorder__open_and_load")) + return -1; + + err = setsockopt(sock_fd, SOL_IP, IP_TOS, &buf, 1); + if (!ASSERT_OK(err, "setsockopt")) + goto close_skel; + + atype = bpf_program__expected_attach_type(skel->progs.child); + + create_opts.flags = 0; + normal_link = bpf_link_create(bpf_program__fd(skel->progs.child), + cgroup_fd, atype, &create_opts); + if (!ASSERT_GE(normal_link, 0, "create_normal_link")) { + err = normal_link; + goto close_skel; + } + + create_opts.flags = BPF_F_PREORDER; + preorder_link = bpf_link_create(bpf_program__fd(skel->progs.child_2), + cgroup_fd, atype, &create_opts); + if (!ASSERT_GE(preorder_link, 0, "create_preorder_link")) { + err = preorder_link; + goto close_links; + } + + result = skel->bss->result; + skel->bss->idx = 0; + memset(result, 0, 4); + + err = getsockopt(sock_fd, SOL_IP, IP_TOS, &buf, &optlen); + if (!ASSERT_OK(err, "getsockopt-before")) + goto close_links; + ASSERT_TRUE(result[0] == 2 && result[1] == 1, "order before update"); + + /* Replace the normal link's program child(1) -> parent(3). 
*/ + err = bpf_link_update(normal_link, bpf_program__fd(skel->progs.parent), NULL); + if (!ASSERT_OK(err, "bpf_link_update")) + goto close_links; + + skel->bss->idx = 0; + memset(result, 0, 4); + + err = getsockopt(sock_fd, SOL_IP, IP_TOS, &buf, &optlen); + if (!ASSERT_OK(err, "getsockopt-after")) + goto close_links; + ASSERT_TRUE(result[0] == 2 && result[1] == 3, "order after update"); + +close_links: + if (preorder_link >= 0) + close(preorder_link); + close(normal_link); +close_skel: + cgroup_preorder__destroy(skel); + return err; +} + void test_cgroup_preorder(void) { int cg_parent = -1, cg_child = -1, sock_fd = -1; @@ -120,6 +196,7 @@ void test_cgroup_preorder(void) ASSERT_OK(run_getsockopt_test(cg_parent, cg_child, sock_fd, false), "getsockopt_test_1"); ASSERT_OK(run_getsockopt_test(cg_parent, cg_child, sock_fd, true), "getsockopt_test_2"); + ASSERT_OK(run_link_replace_test(cg_child, sock_fd), "link_replace_test"); out: close(sock_fd); diff --git a/tools/testing/selftests/bpf/prog_tests/cgrp_local_storage.c b/tools/testing/selftests/bpf/prog_tests/cgrp_local_storage.c index 478a77cb67e6..c4398ccf3493 100644 --- a/tools/testing/selftests/bpf/prog_tests/cgrp_local_storage.c +++ b/tools/testing/selftests/bpf/prog_tests/cgrp_local_storage.c @@ -176,7 +176,7 @@ static void test_cgroup_iter_sleepable(int cgroup_fd, __u64 cgroup_id) DECLARE_LIBBPF_OPTS(bpf_iter_attach_opts, opts); union bpf_iter_link_info linfo; struct cgrp_ls_sleepable *skel; - struct bpf_link *link; + struct bpf_link *link, *fexit_link; int err, iter_fd; char buf[16]; @@ -200,16 +200,27 @@ static void test_cgroup_iter_sleepable(int cgroup_fd, __u64 cgroup_id) if (!ASSERT_OK_PTR(link, "attach_iter")) goto out; + fexit_link = bpf_program__attach(skel->progs.fexit_update); + if (!ASSERT_OK_PTR(fexit_link, "attach_fexit")) + goto out_link; + iter_fd = bpf_iter_create(bpf_link__fd(link)); if (!ASSERT_GE(iter_fd, 0, "iter_create")) - goto out_link; + goto out_fexit_link; + + skel->bss->target_pid = 
sys_gettid(); /* trigger the program run */ (void)read(iter_fd, buf, sizeof(buf)); + skel->bss->target_pid = 0; + + ASSERT_EQ(skel->bss->update_err, 0, "update_err"); ASSERT_EQ(skel->bss->cgroup_id, cgroup_id, "cgroup_id"); close(iter_fd); +out_fexit_link: + bpf_link__destroy(fexit_link); out_link: bpf_link__destroy(link); out: diff --git a/tools/testing/selftests/bpf/prog_tests/ctx_rewrite.c b/tools/testing/selftests/bpf/prog_tests/ctx_rewrite.c index 469e92869523..2c3124092b73 100644 --- a/tools/testing/selftests/bpf/prog_tests/ctx_rewrite.c +++ b/tools/testing/selftests/bpf/prog_tests/ctx_rewrite.c @@ -69,19 +69,19 @@ static struct test_case test_cases[] = { #if defined(__x86_64__) || defined(__aarch64__) { N(SCHED_CLS, struct __sk_buff, tstamp), - .read = "r11 = *(u8 *)($ctx + sk_buff::__mono_tc_offset);" - "if w11 & 0x4 goto pc+1;" + .read = "r12 = *(u8 *)($ctx + sk_buff::__mono_tc_offset);" + "if w12 & 0x4 goto pc+1;" "goto pc+4;" - "if w11 & 0x3 goto pc+1;" + "if w12 & 0x3 goto pc+1;" "goto pc+2;" "$dst = 0;" "goto pc+1;" "$dst = *(u64 *)($ctx + sk_buff::tstamp);", - .write = "r11 = *(u8 *)($ctx + sk_buff::__mono_tc_offset);" - "if w11 & 0x4 goto pc+1;" + .write = "r12 = *(u8 *)($ctx + sk_buff::__mono_tc_offset);" + "if w12 & 0x4 goto pc+1;" "goto pc+2;" - "w11 &= -4;" - "*(u8 *)($ctx + sk_buff::__mono_tc_offset) = r11;" + "w12 &= -4;" + "*(u8 *)($ctx + sk_buff::__mono_tc_offset) = r12;" "*(u64 *)($ctx + sk_buff::tstamp) = $src;", }, #endif @@ -253,8 +253,7 @@ static int find_field_offset_aux(struct btf *btf, int btf_id, char *field_name, { const struct btf_type *type = btf__type_by_id(btf, btf_id); const struct btf_member *m; - __u16 mnum; - int i; + __u32 mnum, i; if (!type) { PRINT_FAIL("Can't find btf_type for id %d\n", btf_id); diff --git a/tools/testing/selftests/bpf/prog_tests/exceptions.c b/tools/testing/selftests/bpf/prog_tests/exceptions.c index e8cbaf2a3e82..3588d6f97fd4 100644 --- a/tools/testing/selftests/bpf/prog_tests/exceptions.c +++ 
b/tools/testing/selftests/bpf/prog_tests/exceptions.c @@ -85,6 +85,13 @@ static void test_exceptions_success(void) RUN_SUCCESS(exception_bad_assert_range_with, 10); RUN_SUCCESS(exception_throw_from_void_global, 11); + if (skel->rodata->has_stack_arg) { + RUN_SUCCESS(exception_throw_stack_arg, 56); + RUN_SUCCESS(exception_throw_after_stack_arg, 56); + RUN_SUCCESS(exception_throw_subprog_stack_arg, 56); + RUN_SUCCESS(exception_throw_subprog_after_stack_arg, 56); + } + #define RUN_EXT(load_ret, attach_err, expr, msg, after_link) \ { \ LIBBPF_OPTS(bpf_object_open_opts, o, .kernel_log_buf = log_buf, \ diff --git a/tools/testing/selftests/bpf/prog_tests/file_reader.c b/tools/testing/selftests/bpf/prog_tests/file_reader.c index 5cde32b35da4..48aae7ea0e4b 100644 --- a/tools/testing/selftests/bpf/prog_tests/file_reader.c +++ b/tools/testing/selftests/bpf/prog_tests/file_reader.c @@ -10,6 +10,7 @@ const char *user_ptr = "hello world"; char file_contents[256000]; +void *addr; void *get_executable_base_addr(void) { @@ -26,8 +27,7 @@ void *get_executable_base_addr(void) static int initialize_file_contents(void) { int fd, page_sz = sysconf(_SC_PAGESIZE); - ssize_t n = 0, cur, off; - void *addr; + ssize_t n = 0, cur; fd = open("/proc/self/exe", O_RDONLY); if (!ASSERT_OK_FD(fd, "Open /proc/self/exe\n")) @@ -52,16 +52,6 @@ static int initialize_file_contents(void) /* page-align base file address */ addr = (void *)((unsigned long)addr & ~(page_sz - 1)); - /* - * Page out range 0..512K, use 0..256K for positive tests and - * 256K..512K for negative tests expecting page faults - */ - for (off = 0; off < sizeof(file_contents) * 2; off += page_sz) { - if (!ASSERT_OK(madvise(addr + off, page_sz, MADV_PAGEOUT), - "madvise pageout")) - return errno; - } - return 0; } @@ -90,6 +80,14 @@ static void run_test(const char *prog_name) if (!ASSERT_OK(err, "file_reader__load")) goto cleanup; + /* + * Page out range 0..512K, use 0..256K for positive tests and + * 256K..512K for negative tests 
expecting page faults + */ + if (!ASSERT_OK(madvise(addr, sizeof(file_contents) * 2, MADV_PAGEOUT), + "madvise pageout")) + goto cleanup; + err = file_reader__attach(skel); if (!ASSERT_OK(err, "file_reader__attach")) goto cleanup; diff --git a/tools/testing/selftests/bpf/prog_tests/fill_link_info.c b/tools/testing/selftests/bpf/prog_tests/fill_link_info.c index e40114620751..f589eefbf9fb 100644 --- a/tools/testing/selftests/bpf/prog_tests/fill_link_info.c +++ b/tools/testing/selftests/bpf/prog_tests/fill_link_info.c @@ -469,7 +469,7 @@ verify_umulti_link_info(int fd, bool retprobe, __u64 *offsets, ASSERT_EQ(info.uprobe_multi.pid, getpid(), "info.uprobe_multi.pid"); ASSERT_EQ(info.uprobe_multi.count, 3, "info.uprobe_multi.count"); - ASSERT_EQ(info.uprobe_multi.flags & BPF_F_KPROBE_MULTI_RETURN, + ASSERT_EQ(info.uprobe_multi.flags & BPF_F_UPROBE_MULTI_RETURN, retprobe, "info.uprobe_multi.flags.retprobe"); ASSERT_EQ(info.uprobe_multi.path_size, strlen(path) + 1, "info.uprobe_multi.path_size"); ASSERT_STREQ(path_buf, path, "info.uprobe_multi.path"); diff --git a/tools/testing/selftests/bpf/prog_tests/htab_update.c b/tools/testing/selftests/bpf/prog_tests/htab_update.c index ea1a6766fbe9..0a28d4346924 100644 --- a/tools/testing/selftests/bpf/prog_tests/htab_update.c +++ b/tools/testing/selftests/bpf/prog_tests/htab_update.c @@ -23,7 +23,7 @@ static void test_reenter_update(void) if (!ASSERT_OK_PTR(skel, "htab_update__open")) return; - bpf_program__set_autoload(skel->progs.bpf_obj_free_fields, true); + bpf_program__set_autoload(skel->progs.bpf_obj_cancel_fields, true); err = htab_update__load(skel); if (!ASSERT_TRUE(!err, "htab_update__load") || err) goto out; @@ -50,7 +50,7 @@ static void test_reenter_update(void) /* * Second update: replace existing element with same key and trigger * the reentrancy of bpf_map_update_elem(). 
- * check_and_free_fields() calls bpf_obj_free_fields() on the old + * check_and_cancel_fields() calls bpf_obj_cancel_fields() on the old * value, which is where fentry program runs and performs a nested * bpf_map_update_elem(), triggering -EDEADLK. */ diff --git a/tools/testing/selftests/bpf/prog_tests/iters.c b/tools/testing/selftests/bpf/prog_tests/iters.c index a539980a2fbe..c0b6082f345a 100644 --- a/tools/testing/selftests/bpf/prog_tests/iters.c +++ b/tools/testing/selftests/bpf/prog_tests/iters.c @@ -202,8 +202,6 @@ cleanup: iters_task__destroy(skel); } -extern int stack_mprotect(void); - static void subtest_css_task_iters(void) { struct iters_css_task *skel = NULL; diff --git a/tools/testing/selftests/bpf/prog_tests/kfunc_call.c b/tools/testing/selftests/bpf/prog_tests/kfunc_call.c index 62f3fb79f5d1..3df07680f9e0 100644 --- a/tools/testing/selftests/bpf/prog_tests/kfunc_call.c +++ b/tools/testing/selftests/bpf/prog_tests/kfunc_call.c @@ -68,7 +68,7 @@ static struct kfunc_test_params kfunc_tests[] = { TC_FAIL(kfunc_call_test_get_mem_fail_oob, 0, "min value is outside of the allowed memory range"), TC_FAIL(kfunc_call_test_get_mem_fail_not_const, 0, "is not a const"), TC_FAIL(kfunc_call_test_mem_acquire_fail, 0, "acquire kernel function does not return PTR_TO_BTF_ID"), - TC_FAIL(kfunc_call_test_pointer_arg_type_mismatch, 0, "arg#0 expected pointer to ctx, but got scalar"), + TC_FAIL(kfunc_call_test_pointer_arg_type_mismatch, 0, "R1 expected pointer to ctx, but got scalar"), /* success cases */ TC_TEST(kfunc_call_test1, 12), diff --git a/tools/testing/selftests/bpf/prog_tests/kfunc_dynptr_param.c b/tools/testing/selftests/bpf/prog_tests/kfunc_dynptr_param.c index 8cd298b78e44..04aaf4c9cf5e 100644 --- a/tools/testing/selftests/bpf/prog_tests/kfunc_dynptr_param.c +++ b/tools/testing/selftests/bpf/prog_tests/kfunc_dynptr_param.c @@ -14,7 +14,7 @@ static struct { const char *prog_name; int expected_runtime_err; } kfunc_dynptr_tests[] = { - {"dynptr_data_null", 
-EBADMSG}, + {"dynptr_data_null", -EINVAL}, }; static bool kfunc_not_supported; diff --git a/tools/testing/selftests/bpf/prog_tests/libarena.c b/tools/testing/selftests/bpf/prog_tests/libarena.c new file mode 100644 index 000000000000..61ea68dce410 --- /dev/null +++ b/tools/testing/selftests/bpf/prog_tests/libarena.c @@ -0,0 +1,253 @@ +// SPDX-License-Identifier: LGPL-2.1 OR BSD-2-Clause +/* Copyright (c) 2026 Meta Platforms, Inc. and affiliates. */ +#include <test_progs.h> +#include <unistd.h> + +#include <libarena/common.h> +#include <libarena/asan.h> +#include <libarena/buddy.h> +#include <libarena/userspace.h> + +#include "libarena/libarena.skel.h" + +static void run_libarena_test(struct libarena *skel, struct bpf_program *prog, + const char *name) +{ + int ret; + + if (!strstr(name, "test_buddy")) { + ret = libarena_run_prog(bpf_program__fd(skel->progs.arena_buddy_reset)); + if (!ASSERT_OK(ret, "arena_buddy_reset")) + return; + } + + ret = libarena_run_prog(bpf_program__fd(prog)); + + ASSERT_OK(ret, name); + +} + +static void *run_libarena_parallel_prog(void *arg) +{ + struct bpf_program *prog = arg; + + return (void *)(long)libarena_run_prog(bpf_program__fd(prog)); +} + +/* Max suffix is ceil((lg 2^32) / (lg 10)) + sizeof("__") = 10 + 2 = 12. 
*/ +#define MAX_PARTEST_SUFFIX (12) +#define MAX_PARTEST_NAME (1024) +#define MAX_PARTEST_PREFIX (MAX_PARTEST_NAME - MAX_PARTEST_SUFFIX) + +static int run_libarena_parallel_fini(struct libarena *skel, const char *name, + size_t prefixlen) +{ + char tdname[MAX_PARTEST_NAME]; + struct bpf_program *fini_prog; + int ret; + + ret = snprintf(tdname, sizeof(tdname), "%.*s__fini", (int)prefixlen, name); + if (!ASSERT_LT(ret, sizeof(tdname), "partest fini name")) + return -ENAMETOOLONG; + + fini_prog = bpf_object__find_program_by_name(skel->obj, tdname); + if (!ASSERT_TRUE(fini_prog, "partest fini prog")) + return -ENOENT; + + ret = libarena_run_prog(bpf_program__fd(fini_prog)); + ASSERT_OK(ret, tdname); + + return ret; +} + +static int run_libarena_parallel_test_workers(struct libarena *skel, + const char *name, size_t prefixlen) +{ + pthread_t *threads = NULL, *tmp_threads; + char tdname[MAX_PARTEST_NAME]; + struct bpf_program *tdprog; + uint32_t nthreads; + void *thread_ret; + int ret, err = 0; + int i; + + for (nthreads = 0; nthreads < UINT_MAX; nthreads++) { + ret = snprintf(tdname, sizeof(tdname), "%.*s__%u", (int)prefixlen, + name, nthreads); + if (!ASSERT_LT(ret, sizeof(tdname), "test worker name")) { + err = -ENAMETOOLONG; + break; + } + + /* + * We enumerate the worker threads for a given test with __0, __1, + * and so on. The suffixes always start from 0 and are contiguous, + * so if we don't find a program with the requested name we have + * discovered all available worker programs. + */ + tdprog = bpf_object__find_program_by_name(skel->obj, tdname); + if (!tdprog) + break; + + /* Bump the alloc array to accommodate the new thread. 
*/ + tmp_threads = realloc(threads, (nthreads + 1) * sizeof(*threads)); + if (!ASSERT_TRUE(tmp_threads, "realloc")) { + err = -ENOMEM; + break; + } + threads = tmp_threads; + + ret = pthread_create(&threads[nthreads], NULL, + run_libarena_parallel_prog, + tdprog); + if (!ASSERT_OK(ret, "pthread_create")) { + err = ret; + break; + } + } + + + for (i = 0; i < nthreads; i++) { + ret = pthread_join(threads[i], &thread_ret); + if (!ASSERT_OK(ret, "pthread_join")) { + err = err ?: ret; + continue; + } + + err = err ?: (long)thread_ret; + } + + free(threads); + + return err; +} + +static bool libarena_parallel_test_enabled(struct libarena *skel, + const char *prefix, + size_t prefixlen) +{ + struct bpf_program *prog; + char progname[MAX_PARTEST_NAME]; + int ret; + + ret = snprintf(progname, sizeof(progname), "%.*s__enabled", (int)prefixlen, + prefix); + if (!ASSERT_LT(ret, sizeof(progname), "partest enabled name")) + return false; + + prog = bpf_object__find_program_by_name(skel->obj, progname); + if (!prog) + return true; + + ret = libarena_run_prog(bpf_program__fd(prog)); + if (ret == -EOPNOTSUPP) + return false; + if (!ASSERT_OK(ret, progname)) + return false; + return true; +} + +static void run_libarena_parallel_test(struct libarena *skel, struct bpf_program *prog, + const char *name) +{ + char testname[MAX_PARTEST_NAME]; + size_t prefixlen; + const char *pos; + int ret; + + /* + * We annotate the initialization prog with __init. If the current prog does + * not match, it is one of the parallel threads instead and is ignored. + * + * We assume the test writer knows what they are doing and do not add __init + * randomly in the middle of a test name. + */ + pos = strstr(name, "__init"); + if (!pos) + return; + + prefixlen = pos - name; + if (!ASSERT_LT(prefixlen, MAX_PARTEST_PREFIX, "partest prefix too long")) + return; + + /* The name of the test without the __init suffix. Looks nicer in the test log. 
*/ + ret = snprintf(testname, sizeof(testname), "%.*s", (int)prefixlen, name); + if (!ASSERT_LT(ret, sizeof(testname), "partest test name")) + return; + + if (!test__start_subtest(testname)) + return; + + if (!libarena_parallel_test_enabled(skel, testname, prefixlen)) { + test__skip(); + return; + } + + ret = libarena_run_prog(bpf_program__fd(skel->progs.arena_buddy_reset)); + if (!ASSERT_OK(ret, "arena_buddy_reset")) + return; + + ret = libarena_run_prog(bpf_program__fd(prog)); + if (!ASSERT_OK(ret, testname)) + return; + + ret = run_libarena_parallel_test_workers(skel, name, prefixlen); + + ASSERT_OK(ret, testname); + + run_libarena_parallel_fini(skel, name, prefixlen); +} + +void test_libarena(void) +{ + struct arena_alloc_reserve_args args; + struct libarena *skel; + struct bpf_program *prog; + int ret; + + skel = libarena__open_and_load(); + if (!ASSERT_OK_PTR(skel, "open_and_load")) + return; + + ret = libarena__attach(skel); + if (!ASSERT_OK(ret, "attach")) + goto out; + + args.nr_pages = ARENA_RESERVE_PAGES_DFL; + + ret = libarena_run_prog_args(bpf_program__fd(skel->progs.arena_alloc_reserve), + &args, sizeof(args)); + if (!ASSERT_OK(ret, "arena_alloc_reserve")) + goto out; + + bpf_object__for_each_program(prog, skel->obj) { + const char *name = bpf_program__name(prog); + + /* + * Handle parallel test progs separately. For those + * progs it's not a matter of test/skip, because each + * parallel test prog includes an initialization prog + * and a set of progs to be run in parallel. For the + * latter we do not record them as skipped or run, + * because we run them all at once when we come across + * the initialization prog. For more details on how we + * discover the progs see the comment on + * run_libarena_parallel_test. 
+ */ + if (libarena_is_parallel_test_prog(name)) { + run_libarena_parallel_test(skel, prog, name); + continue; + } + + if (!libarena_is_test_prog(name)) + continue; + + if (!test__start_subtest(name)) + continue; + + run_libarena_test(skel, prog, name); + } + +out: + libarena__destroy(skel); +} diff --git a/tools/testing/selftests/bpf/prog_tests/libarena_asan.c b/tools/testing/selftests/bpf/prog_tests/libarena_asan.c new file mode 100644 index 000000000000..d59d9dd12ef2 --- /dev/null +++ b/tools/testing/selftests/bpf/prog_tests/libarena_asan.c @@ -0,0 +1,93 @@ +// SPDX-License-Identifier: LGPL-2.1 OR BSD-2-Clause +/* Copyright (c) 2026 Meta Platforms, Inc. and affiliates. */ +#include <test_progs.h> + +#ifdef HAS_BPF_ARENA_ASAN +#include <unistd.h> + +#include <libarena/common.h> +#include <libarena/asan.h> +#include <libarena/buddy.h> +#include <libarena/userspace.h> + +#include "libarena/libarena_asan.skel.h" + +static void run_libarena_asan_test(struct libarena_asan *skel, + struct bpf_program *prog, const char *name) +{ + int ret; + + if (!strstr(name, "test_buddy")) { + ret = libarena_run_prog(bpf_program__fd(skel->progs.arena_buddy_reset)); + if (!ASSERT_OK(ret, "arena_buddy_reset")) + return; + } + + ret = libarena_run_prog(bpf_program__fd(prog)); + ASSERT_OK(ret, name); + + verify_test_stderr(skel->obj, prog); +} + +static void run_test(void) +{ + struct arena_alloc_reserve_args args; + struct libarena_asan *skel; + struct bpf_program *prog; + int ret; + + skel = libarena_asan__open_and_load(); + if (!ASSERT_OK_PTR(skel, "open_and_load")) + return; + + ret = libarena_asan__attach(skel); + if (!ASSERT_OK(ret, "attach")) + goto out; + + args.nr_pages = ARENA_RESERVE_PAGES_DFL; + + ret = libarena_run_prog_args(bpf_program__fd(skel->progs.arena_alloc_reserve), + &args, sizeof(args)); + if (!ASSERT_OK(ret, "arena_alloc_reserve")) + goto out; + + ret = libarena_asan_init( + bpf_program__fd(skel->progs.arena_get_info), + bpf_program__fd(skel->progs.asan_init), + 
(1ULL << 32) / sysconf(_SC_PAGESIZE)); + if (!ASSERT_OK(ret, "libarena_asan_init")) + goto out; + + bpf_object__for_each_program(prog, skel->obj) { + const char *name = bpf_program__name(prog); + + if (!libarena_is_asan_test_prog(name)) + continue; + + if (!test__start_subtest(name)) + continue; + + run_libarena_asan_test(skel, prog, name); + } + +out: + libarena_asan__destroy(skel); +} + +#endif /* HAS_BPF_ARENA_ASAN */ + +/* + * Run the test depending on whether LLVM can compile arena ASAN + * programs. + */ +void test_libarena_asan(void) +{ +#ifdef HAS_BPF_ARENA_ASAN + run_test(); +#else + test__skip(); +#endif + + return; +} + diff --git a/tools/testing/selftests/bpf/prog_tests/linked_list.c b/tools/testing/selftests/bpf/prog_tests/linked_list.c index 6f25b5f39a79..8defea0253ed 100644 --- a/tools/testing/selftests/bpf/prog_tests/linked_list.c +++ b/tools/testing/selftests/bpf/prog_tests/linked_list.c @@ -81,8 +81,8 @@ static struct { { "direct_write_node", "direct access to bpf_list_node is disallowed" }, { "use_after_unlock_push_front", "invalid mem access 'scalar'" }, { "use_after_unlock_push_back", "invalid mem access 'scalar'" }, - { "double_push_front", "arg#1 expected pointer to allocated object" }, - { "double_push_back", "arg#1 expected pointer to allocated object" }, + { "double_push_front", "R2 expected pointer to allocated object" }, + { "double_push_back", "R2 expected pointer to allocated object" }, { "no_node_value_type", "bpf_list_node not found at offset=0" }, { "incorrect_value_type", "operation on bpf_list_head expects arg#1 bpf_list_node at offset=48 in struct foo, " @@ -131,13 +131,14 @@ end: linked_list_fail__destroy(skel); } -static void clear_fields(struct bpf_map *map) +static void clear_fields(struct bpf_program *prog) { - char buf[24]; - int key = 0; + LIBBPF_OPTS(bpf_test_run_opts, opts); + int ret; - memset(buf, 0xff, sizeof(buf)); - ASSERT_OK(bpf_map__update_elem(map, &key, sizeof(key), buf, sizeof(buf), 0), 
"check_and_free_fields"); + ret = bpf_prog_test_run_opts(bpf_program__fd(prog), &opts); + ASSERT_OK(ret, "clear_fields"); + ASSERT_OK(opts.retval, "clear_fields retval"); } enum { @@ -170,31 +171,31 @@ static void test_linked_list_success(int mode, bool leave_in_map) ASSERT_OK(ret, "map_list_push_pop"); ASSERT_OK(opts.retval, "map_list_push_pop retval"); if (!leave_in_map) - clear_fields(skel->maps.array_map); + clear_fields(skel->progs.clear_map_list); ret = bpf_prog_test_run_opts(bpf_program__fd(skel->progs.inner_map_list_push_pop), &opts); ASSERT_OK(ret, "inner_map_list_push_pop"); ASSERT_OK(opts.retval, "inner_map_list_push_pop retval"); if (!leave_in_map) - clear_fields(skel->maps.inner_map); + clear_fields(skel->progs.clear_inner_map_list); ret = bpf_prog_test_run_opts(bpf_program__fd(skel->progs.global_list_push_pop), &opts); ASSERT_OK(ret, "global_list_push_pop"); ASSERT_OK(opts.retval, "global_list_push_pop retval"); if (!leave_in_map) - clear_fields(skel->maps.bss_A); + clear_fields(skel->progs.clear_global_list); ret = bpf_prog_test_run_opts(bpf_program__fd(skel->progs.global_list_push_pop_nested), &opts); ASSERT_OK(ret, "global_list_push_pop_nested"); ASSERT_OK(opts.retval, "global_list_push_pop_nested retval"); if (!leave_in_map) - clear_fields(skel->maps.bss_A); + clear_fields(skel->progs.clear_global_nested_list); ret = bpf_prog_test_run_opts(bpf_program__fd(skel->progs.global_list_array_push_pop), &opts); ASSERT_OK(ret, "global_list_array_push_pop"); ASSERT_OK(opts.retval, "global_list_array_push_pop retval"); if (!leave_in_map) - clear_fields(skel->maps.bss_A); + clear_fields(skel->progs.clear_global_array_list); if (mode == PUSH_POP) goto end; @@ -204,19 +205,19 @@ ppm: ASSERT_OK(ret, "map_list_push_pop_multiple"); ASSERT_OK(opts.retval, "map_list_push_pop_multiple retval"); if (!leave_in_map) - clear_fields(skel->maps.array_map); + clear_fields(skel->progs.clear_map_list); ret = 
bpf_prog_test_run_opts(bpf_program__fd(skel->progs.inner_map_list_push_pop_multiple), &opts); ASSERT_OK(ret, "inner_map_list_push_pop_multiple"); ASSERT_OK(opts.retval, "inner_map_list_push_pop_multiple retval"); if (!leave_in_map) - clear_fields(skel->maps.inner_map); + clear_fields(skel->progs.clear_inner_map_list); ret = bpf_prog_test_run_opts(bpf_program__fd(skel->progs.global_list_push_pop_multiple), &opts); ASSERT_OK(ret, "global_list_push_pop_multiple"); ASSERT_OK(opts.retval, "global_list_push_pop_multiple retval"); if (!leave_in_map) - clear_fields(skel->maps.bss_A); + clear_fields(skel->progs.clear_global_list); if (mode == PUSH_POP_MULT) goto end; @@ -226,19 +227,19 @@ lil: ASSERT_OK(ret, "map_list_in_list"); ASSERT_OK(opts.retval, "map_list_in_list retval"); if (!leave_in_map) - clear_fields(skel->maps.array_map); + clear_fields(skel->progs.clear_map_list); ret = bpf_prog_test_run_opts(bpf_program__fd(skel->progs.inner_map_list_in_list), &opts); ASSERT_OK(ret, "inner_map_list_in_list"); ASSERT_OK(opts.retval, "inner_map_list_in_list retval"); if (!leave_in_map) - clear_fields(skel->maps.inner_map); + clear_fields(skel->progs.clear_inner_map_list); ret = bpf_prog_test_run_opts(bpf_program__fd(skel->progs.global_list_in_list), &opts); ASSERT_OK(ret, "global_list_in_list"); ASSERT_OK(opts.retval, "global_list_in_list retval"); if (!leave_in_map) - clear_fields(skel->maps.bss_A); + clear_fields(skel->progs.clear_global_list); end: linked_list__destroy(skel); } diff --git a/tools/testing/selftests/bpf/prog_tests/lru_lock_nmi.c b/tools/testing/selftests/bpf/prog_tests/lru_lock_nmi.c new file mode 100644 index 000000000000..60666a9ba41f --- /dev/null +++ b/tools/testing/selftests/bpf/prog_tests/lru_lock_nmi.c @@ -0,0 +1,243 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Stress every LRU lock-failure and orphan-recovery. 
+ * perf_event NMI BPF on every online CPU does + * update+delete on a small LRU map; userspace threads on every CPU do + * the same from syscall context. + */ +#define _GNU_SOURCE +#include <pthread.h> +#include <sched.h> +#include <sys/syscall.h> +#include <linux/perf_event.h> +#include <test_progs.h> +#include "testing_helpers.h" +#include "lru_lock_nmi.skel.h" + +#define MAP_ENTRIES 64 +#define KEY_RANGE (MAP_ENTRIES * 2) +#define STRESS_NS (500 * 1000 * 1000ULL) + +struct hammer_arg { + int map_fd; + int cpu; + __u64 deadline_ns; +}; + +struct refill_arg { + int map_fd; + int cpu; + int per_cpu_quota; + int update_errors; +}; + +/* + * Pin the calling thread to @cpu. Uses dynamically-allocated CPU sets so + * we stay correct on hosts with @cpu >= CPU_SETSIZE (default 1024). + */ +static int pin_to_cpu(int cpu) +{ + cpu_set_t *cs; + size_t cs_size; + int err; + + cs = CPU_ALLOC(cpu + 1); + if (!cs) + return -ENOMEM; + cs_size = CPU_ALLOC_SIZE(cpu + 1); + + CPU_ZERO_S(cs_size, cs); + CPU_SET_S(cpu, cs_size, cs); + err = pthread_setaffinity_np(pthread_self(), cs_size, cs); + CPU_FREE(cs); + return err; +} + +static void *hammer_thread(void *p) +{ + struct hammer_arg *a = p; + int nr_possible_cpus = libbpf_num_possible_cpus(); + __u64 val[nr_possible_cpus]; + unsigned int seed; + __u32 key; + + memset(val, 0, sizeof(val)); + pin_to_cpu(a->cpu); + + seed = (unsigned int)a->cpu ^ (unsigned int)(uintptr_t)pthread_self(); + + while (get_time_ns() < a->deadline_ns) { + bool do_update = rand_r(&seed) & 1; + + key = rand_r(&seed) % KEY_RANGE; + if (do_update) + bpf_map_update_elem(a->map_fd, &key, val, BPF_ANY); + else + bpf_map_delete_elem(a->map_fd, &key); + } + return NULL; +} + +static void *refill_thread(void *p) +{ + struct refill_arg *a = p; + int nr_possible_cpus = libbpf_num_possible_cpus(); + __u64 val[nr_possible_cpus]; + __u32 start, end, key; + + memset(val, 0, sizeof(val)); + pin_to_cpu(a->cpu); + + start = (__u32)a->cpu * (__u32)a->per_cpu_quota; + end = 
start + (__u32)a->per_cpu_quota; + for (key = start; key < end; key++) + if (bpf_map_update_elem(a->map_fd, &key, val, BPF_ANY)) + a->update_errors++; + return NULL; +} + +/* + * Drain the map, then refill it with each CPU inserting only its own + * quota of keys. + * After refill, lookup every key we inserted - a stranded node on any + * CPU's pool would have forced eviction. + */ +static int drain_then_verify_capacity(int map_fd, int nr_cpus) +{ + int per_cpu_quota = MAP_ENTRIES / nr_cpus; + int total = per_cpu_quota * nr_cpus; + int nr_possible_cpus = libbpf_num_possible_cpus(); + pthread_t threads[nr_cpus]; + struct refill_arg args[nr_cpus]; + __u64 val[nr_possible_cpus]; + int i, hits = 0, nthreads = 0; + __u32 key; + + memset(val, 0, sizeof(val)); + + for (key = 0; key < KEY_RANGE; key++) + bpf_map_delete_elem(map_fd, &key); + + for (i = 0; i < nr_cpus; i++) { + args[i] = (struct refill_arg){ + .map_fd = map_fd, + .cpu = i, + .per_cpu_quota = per_cpu_quota, + }; + if (pthread_create(&threads[nthreads], NULL, refill_thread, &args[i]) == 0) + nthreads++; + } + for (i = 0; i < nthreads; i++) + pthread_join(threads[i], NULL); + + for (i = 0; i < nr_cpus; i++) + if (args[i].update_errors) + return -ENOMEM; + + for (key = 0; key < (__u32)total; key++) + if (bpf_map_lookup_elem(map_fd, &key, val) == 0) + hits++; + + return hits == total ? 
0 : -EIO; +} + +static void run_variant(enum bpf_map_type type, __u32 map_flags, const char *name) +{ + struct perf_event_attr attr = { + .size = sizeof(attr), + .type = PERF_TYPE_HARDWARE, + .config = PERF_COUNT_HW_CPU_CYCLES, + .freq = 1, + }; + int nr_cpus, max_cpus = 64; + struct bpf_link *links[max_cpus]; + pthread_t threads[max_cpus]; + struct hammer_arg args[max_cpus]; + struct lru_lock_nmi *skel = NULL; + int map_fd, i, err, nr_threads = 0, pmu_fd = -1; + __u64 deadline; + + nr_cpus = libbpf_num_possible_cpus(); + if (!ASSERT_GT(nr_cpus, 0, "num_cpus")) + return; + + if (nr_cpus > max_cpus) + nr_cpus = max_cpus; + + if (!test__start_subtest(name)) + return; + + memset(links, 0, sizeof(links)); + skel = lru_lock_nmi__open(); + if (!ASSERT_OK_PTR(skel, "skel_open")) + goto cleanup; + + err = bpf_map__set_type(skel->maps.lru_map, type); + if (!ASSERT_OK(err, "set_type")) + goto cleanup; + err = bpf_map__set_map_flags(skel->maps.lru_map, map_flags); + if (!ASSERT_OK(err, "set_flags")) + goto cleanup; + err = bpf_map__set_max_entries(skel->maps.lru_map, MAP_ENTRIES); + if (!ASSERT_OK(err, "set_max_entries")) + goto cleanup; + + err = lru_lock_nmi__load(skel); + if (!ASSERT_OK(err, "skel_load")) + goto cleanup; + + skel->bss->hits = 0; + map_fd = bpf_map__fd(skel->maps.lru_map); + attr.sample_freq = read_perf_max_sample_freq(); + + for (i = 0; i < nr_cpus; i++) { + pmu_fd = syscall(__NR_perf_event_open, &attr, -1, i, -1, 0); + if (pmu_fd < 0) { + if (i == 0 && + (errno == ENOENT || errno == EOPNOTSUPP)) { + test__skip(); + goto cleanup; + } + continue; + } + /* libbpf takes ownership of pfd on success */ + links[i] = bpf_program__attach_perf_event(skel->progs.oncpu, pmu_fd); + if (!links[i]) + close(pmu_fd); + } + + deadline = get_time_ns() + STRESS_NS; + for (i = 0; i < nr_cpus; i++) { + args[i].map_fd = map_fd; + args[i].cpu = i; + args[i].deadline_ns = deadline; + if (pthread_create(&threads[nr_threads], NULL, hammer_thread, &args[i]) == 0) + nr_threads++; + } 
+ for (i = 0; i < nr_threads; i++) + pthread_join(threads[i], NULL); + + for (i = 0; i < nr_cpus; i++) { + if (links[i]) { + bpf_link__destroy(links[i]); + links[i] = NULL; + } + } + + ASSERT_GT(skel->bss->hits, 0, "nmi_bpf_ran"); + ASSERT_OK(drain_then_verify_capacity(map_fd, nr_cpus), "drain_then_verify_capacity"); + +cleanup: + for (i = 0; i < nr_cpus; i++) { + if (links[i]) + bpf_link__destroy(links[i]); + } + lru_lock_nmi__destroy(skel); +} + +void serial_test_lru_lock_nmi(void) +{ + run_variant(BPF_MAP_TYPE_LRU_HASH, 0, "common_lru"); + run_variant(BPF_MAP_TYPE_LRU_HASH, BPF_F_NO_COMMON_LRU, "no_common_lru"); + run_variant(BPF_MAP_TYPE_LRU_PERCPU_HASH, 0, "percpu_lru"); +} diff --git a/tools/testing/selftests/bpf/prog_tests/lsm_cgroup.c b/tools/testing/selftests/bpf/prog_tests/lsm_cgroup.c index 6df25de8f080..41e867467f6c 100644 --- a/tools/testing/selftests/bpf/prog_tests/lsm_cgroup.c +++ b/tools/testing/selftests/bpf/prog_tests/lsm_cgroup.c @@ -2,6 +2,7 @@ #include <sys/types.h> #include <sys/socket.h> +#include <sys/xattr.h> #include <test_progs.h> #include <bpf/btf.h> @@ -309,11 +310,89 @@ static void test_lsm_cgroup_nonvoid(void) lsm_cgroup_nonvoid__destroy(skel); } +static void test_lsm_cgroup_retval(void) +{ + struct lsm_cgroup *skel = NULL; + int skipcap_prog_fd1, skipcap_prog_fd2, socket_prog_fd1, socket_prog_fd2; + int cgroup_fd = -1; + int err, fd; + char tmpfile[] = "/tmp/test_lsm_cgroup_retval.XXXXXX"; + + fd = mkstemp(tmpfile); + if (!ASSERT_OK_FD(fd, "mkstemp")) + return; + close(fd); + + cgroup_fd = test__join_cgroup("/default_retval"); + if (!ASSERT_OK_FD(cgroup_fd, "join_cgroup")) + goto cleanup_tmpfile; + + skel = lsm_cgroup__open_and_load(); + if (!ASSERT_OK_PTR(skel, "open_and_load")) + goto cleanup_cgroup; + + skipcap_prog_fd1 = bpf_program__fd(skel->progs.skipcap_first); + skipcap_prog_fd2 = bpf_program__fd(skel->progs.skipcap_second); + socket_prog_fd1 = bpf_program__fd(skel->progs.socket_first); + socket_prog_fd2 = 
bpf_program__fd(skel->progs.socket_second); + + err = bpf_prog_attach(skipcap_prog_fd1, cgroup_fd, BPF_LSM_CGROUP, BPF_F_ALLOW_MULTI); + if (err == -ENOTSUPP) { + test__skip(); + goto cleanup_skeleton; + } + if (!ASSERT_OK(err, "attach first skipcap prog")) + goto cleanup_skeleton; + + err = bpf_prog_attach(skipcap_prog_fd2, cgroup_fd, BPF_LSM_CGROUP, BPF_F_ALLOW_MULTI); + if (!ASSERT_OK(err, "attach second skipcap prog")) + goto cleanup_skipcap1; + + err = bpf_prog_attach(socket_prog_fd1, cgroup_fd, BPF_LSM_CGROUP, BPF_F_ALLOW_MULTI); + if (!ASSERT_OK(err, "attach first sock_create prog")) + goto cleanup_skipcap2; + + err = bpf_prog_attach(socket_prog_fd2, cgroup_fd, BPF_LSM_CGROUP, BPF_F_ALLOW_MULTI); + if (!ASSERT_OK(err, "attach second sock_create prog")) + goto cleanup_sock_create1; + + /* trigger the bool hook by setxattr */ + err = setxattr(tmpfile, "user.test", "value", 5, 0); + if (!ASSERT_OK(err, "setxattr")) + goto cleanup_sock_create2; + + /* trigger the errno hook by creating a socket */ + fd = socket(AF_INET, SOCK_STREAM, 0); + if (!ASSERT_OK_FD(fd, "socket")) + goto cleanup_sock_create2; + close(fd); + + ASSERT_EQ(skel->data->skipcap_retval, 0, "bool_hook_retval_should_be_0"); + ASSERT_EQ(skel->data->socket_retval, -EPERM, "errno_hook_retval_should_be_EPERM"); + +cleanup_sock_create2: + bpf_prog_detach2(socket_prog_fd2, cgroup_fd, BPF_LSM_CGROUP); +cleanup_sock_create1: + bpf_prog_detach2(socket_prog_fd1, cgroup_fd, BPF_LSM_CGROUP); +cleanup_skipcap2: + bpf_prog_detach2(skipcap_prog_fd2, cgroup_fd, BPF_LSM_CGROUP); +cleanup_skipcap1: + bpf_prog_detach2(skipcap_prog_fd1, cgroup_fd, BPF_LSM_CGROUP); +cleanup_skeleton: + lsm_cgroup__destroy(skel); +cleanup_cgroup: + close(cgroup_fd); +cleanup_tmpfile: + unlink(tmpfile); +} + void test_lsm_cgroup(void) { if (test__start_subtest("functional")) test_lsm_cgroup_functional(); if (test__start_subtest("nonvoid")) test_lsm_cgroup_nonvoid(); + if (test__start_subtest("retval")) + test_lsm_cgroup_retval(); 
btf__free(btf); } diff --git a/tools/testing/selftests/bpf/prog_tests/lwt_ip_encap.c b/tools/testing/selftests/bpf/prog_tests/lwt_ip_encap.c index b6391af5f6f9..6606f0ed9a9a 100644 --- a/tools/testing/selftests/bpf/prog_tests/lwt_ip_encap.c +++ b/tools/testing/selftests/bpf/prog_tests/lwt_ip_encap.c @@ -3,6 +3,7 @@ #include "network_helpers.h" #include "test_progs.h" +#include "test_lwt_ip_encap.skel.h" #define BPF_FILE "test_lwt_ip_encap.bpf.o" @@ -32,6 +33,9 @@ #define IP6_ADDR_8 "fb08::1" #define IP6_ADDR_GRE "fb10::1" +#define IP4_ADDR_VXLAN "172.16.17.100" +#define IP6_ADDR_VXLAN "fb11::1" + #define IP6_ADDR_SRC IP6_ADDR_1 #define IP6_ADDR_DST IP6_ADDR_4 @@ -538,3 +542,144 @@ void test_lwt_ip_encap_ipv4(void) if (test__start_subtest("ingress")) lwt_ip_encap(IPV4_ENCAP, INGRESS, ""); } + +/* + * VxLAN Setup/topology: + * + * NS1 (IP*_ADDR_1) NS2 NS3 (IP*_ADDR_4) + * [ping src] + * | top route + * veth1 (LWT encap) <<-- veth2 veth3 <<-- veth4 (ping dst) + * | ^ + * (bottom route) | (inner pkt) + * v bottom route | + * veth5 -->> veth6 veth7 -->> veth8 (vxlan decap) + * (IP*_ADDR_VXLAN) + * + * Add the VxLAN endpoint addresses to NS3's veth8, create standard + * VxLAN decap devices bound to those addresses, and install routes so + * NS1/NS2 can reach the endpoints via the bottom route. NS2 here is to + * make sure the LWT-encap VxLAN packets are routed to NS3 correctly. + */ +static int setup_vxlan_routes(const char *ns3, const char *ns1, const char *ns2) +{ + struct nstoken *nstoken; + + nstoken = open_netns(ns3); + if (!ASSERT_OK_PTR(nstoken, "open ns3 for vxlan")) + return -1; + + SYS(fail_close, "ip a add %s/32 dev veth8", IP4_ADDR_VXLAN); + SYS(fail_close, "ip -6 a add %s/128 dev veth8", IP6_ADDR_VXLAN); + /* + * Standard VxLAN devices to decap the encapsulated packets. The inner + * Ethernet frame uses a broadcast dst MAC so the IP stack accepts it + * without ARP or FDB configuration. 
+ */ + SYS(fail_close, "ip link add vxlan4 type vxlan id 1 dstport 4789 local %s dev veth8 nolearning noudpcsum", + IP4_ADDR_VXLAN); + SYS(fail_close, "ip link set vxlan4 up"); + SYS(fail_close, "ip link add vxlan6 type vxlan id 1 dstport 4789 local %s dev veth8 nolearning udp6zerocsumrx", + IP6_ADDR_VXLAN); + SYS(fail_close, "ip link set vxlan6 up"); + close_netns(nstoken); + + SYS(fail, "ip -n %s route add %s/32 dev veth5 via %s", + ns1, IP4_ADDR_VXLAN, IP4_ADDR_6); + SYS(fail, "ip -n %s route add %s/32 dev veth7 via %s", + ns2, IP4_ADDR_VXLAN, IP4_ADDR_8); + SYS(fail, "ip -n %s -6 route add %s/128 dev veth5 via %s", + ns1, IP6_ADDR_VXLAN, IP6_ADDR_6); + SYS(fail, "ip -n %s -6 route add %s/128 dev veth7 via %s", + ns2, IP6_ADDR_VXLAN, IP6_ADDR_8); + return 0; + +fail_close: + close_netns(nstoken); +fail: + return -1; +} + +static void lwt_ip_encap_vxlan(bool ipv4_encap) +{ + char ns1[NETNS_NAME_SIZE] = NETNS_BASE "-1-"; + char ns2[NETNS_NAME_SIZE] = NETNS_BASE "-2-"; + char ns3[NETNS_NAME_SIZE] = NETNS_BASE "-3-"; + const char *sec = ipv4_encap ? "encap_vxlan" : "encap_vxlan6"; + int expected_offset = ipv4_encap ? 
(int)sizeof(struct iphdr) + : (int)sizeof(struct ipv6hdr); + struct test_lwt_ip_encap *skel = NULL; + int thdr_offset, err; + + if (!ASSERT_OK(create_ns(ns1, NETNS_NAME_SIZE), "create ns1")) + goto out; + if (!ASSERT_OK(create_ns(ns2, NETNS_NAME_SIZE), "create ns2")) + goto out; + if (!ASSERT_OK(create_ns(ns3, NETNS_NAME_SIZE), "create ns3")) + goto out; + + if (!ASSERT_OK(setup_network(ns1, ns2, ns3, ""), "setup network")) + goto out; + + if (!ASSERT_OK(setup_vxlan_routes(ns3, ns1, ns2), "setup vxlan routes")) + goto out; + + skel = test_lwt_ip_encap__open(); + if (!ASSERT_OK_PTR(skel, "test_lwt_ip_encap__open")) + goto out; + + bpf_program__set_autoload(skel->progs.bpf_lwt_encap_gre, false); + bpf_program__set_autoload(skel->progs.bpf_lwt_encap_gre6, false); + bpf_program__set_autoload(skel->progs.bpf_lwt_encap_vxlan, false); + bpf_program__set_autoload(skel->progs.bpf_lwt_encap_vxlan6, false); + bpf_program__set_autoload(skel->progs.fexit_lwt_push_ip_encap, true); + skel->rodata->tgt_ip_version = ipv4_encap ? 4 : 6; + + err = test_lwt_ip_encap__load(skel); + if (!ASSERT_OK(err, "test_lwt_ip_encap__load")) + goto out; + + err = test_lwt_ip_encap__attach(skel); + if (!ASSERT_OK(err, "test_lwt_ip_encap__attach")) + goto out; + + /* Remove the direct NS2->DST route so packets must go via LWT encap. 
*/ + SYS(out, "ip -n %s route del %s/32 dev veth3", ns2, IP4_ADDR_DST); + SYS(out, "ip -n %s -6 route del %s/128 dev veth3", ns2, IP6_ADDR_DST); + + if (ipv4_encap) + SYS(out, "ip -n %s route add %s encap bpf xmit obj %s sec %s dev veth1", + ns1, IP4_ADDR_DST, BPF_FILE, sec); + else + SYS(out, "ip -n %s -6 route add %s encap bpf xmit obj %s sec %s dev veth1", + ns1, IP6_ADDR_DST, BPF_FILE, sec); + + skel->bss->fexit_triggered = false; + + if (ipv4_encap) + SYS(out, "ip netns exec %s ping -c 1 -W1 %s", ns1, IP4_ADDR_DST); + else + SYS(out, "ip netns exec %s ping6 -c 1 -W1 %s", ns1, IP6_ADDR_DST); + + if (!ASSERT_TRUE(skel->bss->fexit_triggered, "fexit_triggered")) + goto out; + + thdr_offset = (int)skel->bss->transport_hdr - (int)skel->bss->network_hdr; + ASSERT_EQ(thdr_offset, expected_offset, "transport_hdr offset"); + +out: + test_lwt_ip_encap__destroy(skel); + SYS_NOFAIL("ip netns del %s", ns1); + SYS_NOFAIL("ip netns del %s", ns2); + SYS_NOFAIL("ip netns del %s", ns3); +} + +void test_lwt_ip_encap_vxlan_ipv4(void) +{ + lwt_ip_encap_vxlan(IPV4_ENCAP); +} + +void test_lwt_ip_encap_vxlan_ipv6(void) +{ + lwt_ip_encap_vxlan(IPV6_ENCAP); +} diff --git a/tools/testing/selftests/bpf/prog_tests/map_excl.c b/tools/testing/selftests/bpf/prog_tests/map_excl.c index 6bdc6d6de0da..3f4422b9ffa6 100644 --- a/tools/testing/selftests/bpf/prog_tests/map_excl.c +++ b/tools/testing/selftests/bpf/prog_tests/map_excl.c @@ -7,6 +7,11 @@ #include <bpf/btf.h> #include "map_excl.skel.h" +#include "bpf_iter_bpf_array_map.skel.h" + +#ifndef SHA256_DIGEST_SIZE +#define SHA256_DIGEST_SIZE 32 +#endif static void test_map_excl_allowed(void) { @@ -45,10 +50,127 @@ out: } +static void test_map_excl_no_map_in_map(void) +{ + __u8 hash[SHA256_DIGEST_SIZE] = {}; + LIBBPF_OPTS(bpf_map_create_opts, excl_opts, + .excl_prog_hash = hash, + .excl_prog_hash_size = sizeof(hash)); + LIBBPF_OPTS(bpf_map_create_opts, outer_opts); + int excl_fd, tmpl_fd = -1, outer_fd = -1, err; + __u32 key = 0; + + excl_fd = 
bpf_map_create(BPF_MAP_TYPE_ARRAY, "excl_inner", 4, 4, 1, &excl_opts); + if (!ASSERT_OK_FD(excl_fd, "create exclusive map")) + return; + + outer_opts.inner_map_fd = excl_fd; + err = bpf_map_create(BPF_MAP_TYPE_ARRAY_OF_MAPS, "outer_from_excl", + 4, 4, 1, &outer_opts); + if (err >= 0) + close(err); + ASSERT_EQ(err, -ENOTSUPP, "reject exclusive map as map-in-map template"); + + tmpl_fd = bpf_map_create(BPF_MAP_TYPE_ARRAY, "tmpl", 4, 4, 1, NULL); + if (!ASSERT_OK_FD(tmpl_fd, "create inner template")) + goto out; + + outer_opts.inner_map_fd = tmpl_fd; + outer_fd = bpf_map_create(BPF_MAP_TYPE_ARRAY_OF_MAPS, "outer", 4, 4, 1, &outer_opts); + if (!ASSERT_OK_FD(outer_fd, "create map-of-maps")) + goto out; + + err = bpf_map_update_elem(outer_fd, &key, &excl_fd, 0); + ASSERT_EQ(err, -ENOTSUPP, "reject exclusive map as map-in-map element"); +out: + if (outer_fd >= 0) + close(outer_fd); + if (tmpl_fd >= 0) + close(tmpl_fd); + close(excl_fd); +} + +static void test_map_excl_no_map_iter(void) +{ + __u8 hash[SHA256_DIGEST_SIZE] = {}; + LIBBPF_OPTS(bpf_map_create_opts, excl_opts, + .excl_prog_hash = hash, + .excl_prog_hash_size = sizeof(hash)); + DECLARE_LIBBPF_OPTS(bpf_iter_attach_opts, opts); + struct bpf_iter_bpf_array_map *skel = NULL; + union bpf_iter_link_info linfo; + struct bpf_link *link; + int excl_fd; + + excl_fd = bpf_map_create(BPF_MAP_TYPE_ARRAY, "excl_iter", 4, 8, 3, &excl_opts); + if (!ASSERT_OK_FD(excl_fd, "create exclusive map")) + return; + + skel = bpf_iter_bpf_array_map__open_and_load(); + if (!ASSERT_OK_PTR(skel, "bpf_iter_bpf_array_map__open_and_load")) + goto out; + + memset(&linfo, 0, sizeof(linfo)); + linfo.map.map_fd = excl_fd; + opts.link_info = &linfo; + opts.link_info_len = sizeof(linfo); + + link = bpf_program__attach_iter(skel->progs.dump_bpf_array_map, &opts); + if (!ASSERT_ERR_PTR(link, "reject exclusive map as iter target")) { + bpf_link__destroy(link); + goto out; + } + ASSERT_EQ(libbpf_get_error(link), -EPERM, "iter attach errno"); +out: + 
bpf_iter_bpf_array_map__destroy(skel); + close(excl_fd); +} + +static void test_map_excl_create_validation(void) +{ + LIBBPF_OPTS(bpf_map_create_opts, o); + __u8 hash[SHA256_DIGEST_SIZE] = {}; + int fd; + + o.excl_prog_hash = hash; + o.excl_prog_hash_size = SHA256_DIGEST_SIZE / 2; + fd = bpf_map_create(BPF_MAP_TYPE_ARRAY, "excl", 4, 4, 1, &o); + if (fd >= 0) + close(fd); + ASSERT_EQ(fd, -EINVAL, "reject short excl_prog_hash_size"); + + o.excl_prog_hash = hash; + o.excl_prog_hash_size = SHA256_DIGEST_SIZE * 2; + fd = bpf_map_create(BPF_MAP_TYPE_ARRAY, "excl", 4, 4, 1, &o); + if (fd >= 0) + close(fd); + ASSERT_EQ(fd, -EINVAL, "reject long excl_prog_hash_size"); + + o.excl_prog_hash = hash; + o.excl_prog_hash_size = 0; + fd = bpf_map_create(BPF_MAP_TYPE_ARRAY, "excl", 4, 4, 1, &o); + if (fd >= 0) + close(fd); + ASSERT_EQ(fd, -EINVAL, "reject hash pointer with zero size"); + + o.excl_prog_hash = NULL; + o.excl_prog_hash_size = SHA256_DIGEST_SIZE; + fd = bpf_map_create(BPF_MAP_TYPE_ARRAY, "excl", 4, 4, 1, &o); + if (fd >= 0) + close(fd); + ASSERT_EQ(fd, -EINVAL, "reject size with NULL hash pointer"); +} + void test_map_excl(void) { if (test__start_subtest("map_excl_allowed")) test_map_excl_allowed(); if (test__start_subtest("map_excl_denied")) test_map_excl_denied(); + if (test__start_subtest("map_excl_no_map_in_map")) + test_map_excl_no_map_in_map(); + if (test__start_subtest("map_excl_no_map_iter")) + test_map_excl_no_map_iter(); + if (test__start_subtest("map_excl_create_validation")) + test_map_excl_create_validation(); } diff --git a/tools/testing/selftests/bpf/prog_tests/map_init.c b/tools/testing/selftests/bpf/prog_tests/map_init.c index 14a31109dd0e..c804c3ce9be9 100644 --- a/tools/testing/selftests/bpf/prog_tests/map_init.c +++ b/tools/testing/selftests/bpf/prog_tests/map_init.c @@ -212,3 +212,195 @@ void test_map_init(void) if (test__start_subtest("pcpu_lru_map_init")) test_pcpu_lru_map_init(); } + +static void test_map_create(enum bpf_map_type map_type, const 
char *map_name, + struct bpf_map_create_opts *opts, const char *exp_msg) +{ + const int key_size = 4, value_size = 4, max_entries = 1; + char log_buf[128]; + int fd; + LIBBPF_OPTS(bpf_log_opts, log_opts); + + log_buf[0] = '\0'; + log_opts.buf = log_buf; + log_opts.size = sizeof(log_buf); + log_opts.level = 1; + opts->log_opts = &log_opts; + fd = bpf_map_create(map_type, map_name, key_size, value_size, max_entries, opts); + if (!ASSERT_LT(fd, 0, "bpf_map_create")) { + close(fd); + return; + } + + ASSERT_STREQ(log_buf, exp_msg, "log_buf"); + ASSERT_EQ(log_opts.true_size, strlen(exp_msg) + 1, "true_size"); +} + +static void test_map_create_array(struct bpf_map_create_opts *opts, const char *exp_msg) +{ + test_map_create(BPF_MAP_TYPE_ARRAY, "test_map_create", opts, exp_msg); +} + +static void test_invalid_vmlinux_value_type_id_struct_ops(void) +{ + const char *msg = "btf_vmlinux_value_type_id can only be used with struct_ops maps.\n"; + LIBBPF_OPTS(bpf_map_create_opts, opts, + .btf_vmlinux_value_type_id = 1, + ); + + test_map_create_array(&opts, msg); +} + +static void test_invalid_vmlinux_value_type_id_kv_type_id(void) +{ + const char *msg = "btf_vmlinux_value_type_id is mutually exclusive with btf_key_type_id and btf_value_type_id.\n"; + LIBBPF_OPTS(bpf_map_create_opts, opts, + .btf_vmlinux_value_type_id = 1, + .btf_key_type_id = 1, + ); + + test_map_create(BPF_MAP_TYPE_STRUCT_OPS, "test_map_create", &opts, msg); +} + +static void test_invalid_value_type_id(void) +{ + const char *msg = "Invalid btf_value_type_id.\n"; + LIBBPF_OPTS(bpf_map_create_opts, opts, + .btf_key_type_id = 1, + ); + + test_map_create_array(&opts, msg); +} + +static void test_invalid_map_extra(void) +{ + const char *msg = "Invalid map_extra.\n"; + LIBBPF_OPTS(bpf_map_create_opts, opts, + .map_extra = 1, + ); + + test_map_create_array(&opts, msg); +} + +static void test_invalid_numa_node(void) +{ + const char *msg = "Invalid numa_node.\n"; + LIBBPF_OPTS(bpf_map_create_opts, opts, + .map_flags = 
BPF_F_NUMA_NODE, + .numa_node = 0xFF, + ); + + test_map_create_array(&opts, msg); +} + +static void test_invalid_map_type(void) +{ + const char *msg = "Invalid map_type.\n"; + LIBBPF_OPTS(bpf_map_create_opts, opts); + + test_map_create(__MAX_BPF_MAP_TYPE, "test_map_create", &opts, msg); +} + +static void test_invalid_token_fd(void) +{ + const char *msg = "Invalid map_token_fd.\n"; + LIBBPF_OPTS(bpf_map_create_opts, opts, + .map_flags = BPF_F_TOKEN_FD, + .token_fd = -1, + ); + + test_map_create_array(&opts, msg); +} + +static void test_invalid_map_name(void) +{ + const char *msg = "Invalid map_name.\n"; + LIBBPF_OPTS(bpf_map_create_opts, opts); + + test_map_create(BPF_MAP_TYPE_ARRAY, "test-!@#", &opts, msg); +} + +static void test_invalid_btf_fd(void) +{ + const char *msg = "Invalid btf_fd.\n"; + LIBBPF_OPTS(bpf_map_create_opts, opts, + .btf_fd = -1, + .btf_key_type_id = 1, + .btf_value_type_id = 1, + ); + + test_map_create_array(&opts, msg); +} + +static void test_excl_prog_hash_size_1(void) +{ + const char *msg = "Invalid excl_prog_hash_size.\n"; + const char *hash = "DEADCODE"; + LIBBPF_OPTS(bpf_map_create_opts, opts, + .excl_prog_hash = hash, + ); + + test_map_create_array(&opts, msg); +} + +static void test_excl_prog_hash_size_2(void) +{ + const char *msg = "Invalid excl_prog_hash_size.\n"; + LIBBPF_OPTS(bpf_map_create_opts, opts, + .excl_prog_hash_size = 1, + ); + + test_map_create_array(&opts, msg); +} + +static void test_common_attr_padding(void) +{ + struct bpf_common_attr_fake { + __u8 attrs[offsetofend(struct bpf_common_attr, log_true_size)]; + __u32 pad; + } attr_common = { + .pad = 1, + }; + union bpf_attr attr = { + .map_type = BPF_MAP_TYPE_ARRAY, + .key_size = 4, + .value_size = 4, + .max_entries = 1, + }; + int fd; + + fd = syscall(__NR_bpf, BPF_MAP_CREATE | BPF_COMMON_ATTRS, &attr, sizeof(attr), &attr_common, + sizeof(attr_common)); + if (!ASSERT_LT(fd, 0, "syscall")) + close(fd); + else + ASSERT_EQ(errno, E2BIG, "errno"); +} + +void 
test_map_create_failure(void) +{ + if (test__start_subtest("invalid_vmlinux_value_type_id_struct_ops")) + test_invalid_vmlinux_value_type_id_struct_ops(); + if (test__start_subtest("invalid_vmlinux_value_type_id_kv_type_id")) + test_invalid_vmlinux_value_type_id_kv_type_id(); + if (test__start_subtest("invalid_value_type_id")) + test_invalid_value_type_id(); + if (test__start_subtest("invalid_map_extra")) + test_invalid_map_extra(); + if (test__start_subtest("invalid_numa_node")) + test_invalid_numa_node(); + if (test__start_subtest("invalid_map_type")) + test_invalid_map_type(); + if (test__start_subtest("invalid_token_fd")) + test_invalid_token_fd(); + if (test__start_subtest("invalid_map_name")) + test_invalid_map_name(); + if (test__start_subtest("invalid_btf_fd")) + test_invalid_btf_fd(); + if (test__start_subtest("invalid_excl_prog_hash_size_1")) + test_excl_prog_hash_size_1(); + if (test__start_subtest("invalid_excl_prog_hash_size_2")) + test_excl_prog_hash_size_2(); + if (test__start_subtest("common_attr_padding")) + test_common_attr_padding(); +} diff --git a/tools/testing/selftests/bpf/prog_tests/map_kptr.c b/tools/testing/selftests/bpf/prog_tests/map_kptr.c index 03b46f17cf53..17e707dddda8 100644 --- a/tools/testing/selftests/bpf/prog_tests/map_kptr.c +++ b/tools/testing/selftests/bpf/prog_tests/map_kptr.c @@ -51,7 +51,6 @@ static void test_map_kptr_success(bool test_run) ret = bpf_map__update_elem(skel->maps.array_map, &key, sizeof(key), buf, sizeof(buf), 0); ASSERT_OK(ret, "array_map update"); - skel->data->ref--; ret = bpf_prog_test_run_opts(bpf_program__fd(skel->progs.test_map_kptr_ref3), &opts); ASSERT_OK(ret, "test_map_kptr_ref3 refcount"); ASSERT_OK(opts.retval, "test_map_kptr_ref3 retval"); @@ -59,49 +58,42 @@ static void test_map_kptr_success(bool test_run) ret = bpf_map__update_elem(skel->maps.pcpu_array_map, &key, sizeof(key), pbuf, cpu * sizeof(buf), 0); ASSERT_OK(ret, "pcpu_array_map update"); - skel->data->ref--; ret = 
bpf_prog_test_run_opts(bpf_program__fd(skel->progs.test_map_kptr_ref3), &opts); ASSERT_OK(ret, "test_map_kptr_ref3 refcount"); ASSERT_OK(opts.retval, "test_map_kptr_ref3 retval"); ret = bpf_map__delete_elem(skel->maps.hash_map, &key, sizeof(key), 0); ASSERT_OK(ret, "hash_map delete"); - skel->data->ref--; ret = bpf_prog_test_run_opts(bpf_program__fd(skel->progs.test_map_kptr_ref3), &opts); ASSERT_OK(ret, "test_map_kptr_ref3 refcount"); ASSERT_OK(opts.retval, "test_map_kptr_ref3 retval"); ret = bpf_map__delete_elem(skel->maps.pcpu_hash_map, &key, sizeof(key), 0); ASSERT_OK(ret, "pcpu_hash_map delete"); - skel->data->ref--; ret = bpf_prog_test_run_opts(bpf_program__fd(skel->progs.test_map_kptr_ref3), &opts); ASSERT_OK(ret, "test_map_kptr_ref3 refcount"); ASSERT_OK(opts.retval, "test_map_kptr_ref3 retval"); ret = bpf_map__delete_elem(skel->maps.hash_malloc_map, &key, sizeof(key), 0); ASSERT_OK(ret, "hash_malloc_map delete"); - skel->data->ref--; ret = bpf_prog_test_run_opts(bpf_program__fd(skel->progs.test_map_kptr_ref3), &opts); ASSERT_OK(ret, "test_map_kptr_ref3 refcount"); ASSERT_OK(opts.retval, "test_map_kptr_ref3 retval"); ret = bpf_map__delete_elem(skel->maps.pcpu_hash_malloc_map, &key, sizeof(key), 0); ASSERT_OK(ret, "pcpu_hash_malloc_map delete"); - skel->data->ref--; ret = bpf_prog_test_run_opts(bpf_program__fd(skel->progs.test_map_kptr_ref3), &opts); ASSERT_OK(ret, "test_map_kptr_ref3 refcount"); ASSERT_OK(opts.retval, "test_map_kptr_ref3 retval"); ret = bpf_map__delete_elem(skel->maps.lru_hash_map, &key, sizeof(key), 0); ASSERT_OK(ret, "lru_hash_map delete"); - skel->data->ref--; ret = bpf_prog_test_run_opts(bpf_program__fd(skel->progs.test_map_kptr_ref3), &opts); ASSERT_OK(ret, "test_map_kptr_ref3 refcount"); ASSERT_OK(opts.retval, "test_map_kptr_ref3 retval"); ret = bpf_map__delete_elem(skel->maps.lru_pcpu_hash_map, &key, sizeof(key), 0); ASSERT_OK(ret, "lru_pcpu_hash_map delete"); - skel->data->ref--; ret = 
bpf_prog_test_run_opts(bpf_program__fd(skel->progs.test_map_kptr_ref3), &opts); ASSERT_OK(ret, "test_map_kptr_ref3 refcount"); ASSERT_OK(opts.retval, "test_map_kptr_ref3 retval"); @@ -151,12 +143,68 @@ static void wait_for_map_release(void) map_kptr__destroy(skel); } +enum map_update_kptr_case { + MAP_UPDATE_KPTR_ARRAY, + MAP_UPDATE_KPTR_HASH, + MAP_UPDATE_KPTR_HASH_MALLOC, +}; + +static struct bpf_program *map_update_kptr_prog(struct map_kptr *skel, + enum map_update_kptr_case test) +{ + switch (test) { + case MAP_UPDATE_KPTR_ARRAY: + return skel->progs.test_array_map_update_kptr; + case MAP_UPDATE_KPTR_HASH: + return skel->progs.test_hash_map_update_kptr; + case MAP_UPDATE_KPTR_HASH_MALLOC: + return skel->progs.test_hash_malloc_map_update_kptr; + } + + return NULL; +} + +static void test_map_update_kptr(enum map_update_kptr_case test) +{ + LIBBPF_OPTS(bpf_test_run_opts, opts); + struct map_kptr *skel; + struct bpf_program *prog; + int ret; + + skel = map_kptr__open_and_load(); + if (!ASSERT_OK_PTR(skel, "map_kptr__open_and_load")) + return; + + prog = map_update_kptr_prog(skel, test); + if (!ASSERT_OK_PTR(prog, "map_update_kptr_prog")) + goto out; + + ret = bpf_prog_test_run_opts(bpf_program__fd(prog), &opts); + if (!ASSERT_OK(ret, "map_update_kptr")) + goto out; + if (!ASSERT_OK(opts.retval, "map_update_kptr retval")) + goto out; + + ASSERT_EQ(skel->bss->num_of_refs, 3, "refs_after_update"); + +out: + map_kptr__destroy(skel); + wait_for_map_release(); +} + void serial_test_map_kptr(void) { struct rcu_tasks_trace_gp *skel; RUN_TESTS(map_kptr_fail); + if (test__start_subtest("update_array_map_kptr")) + test_map_update_kptr(MAP_UPDATE_KPTR_ARRAY); + if (test__start_subtest("update_hash_map_kptr")) + test_map_update_kptr(MAP_UPDATE_KPTR_HASH); + if (test__start_subtest("update_hash_malloc_map_kptr")) + test_map_update_kptr(MAP_UPDATE_KPTR_HASH_MALLOC); + skel = rcu_tasks_trace_gp__open_and_load(); if (!ASSERT_OK_PTR(skel, "rcu_tasks_trace_gp__open_and_load")) 
return; @@ -175,7 +223,7 @@ void serial_test_map_kptr(void) ASSERT_OK(kern_sync_rcu(), "sync rcu"); wait_for_map_release(); - /* Observe refcount dropping to 1 on synchronous delete elem */ + /* Observe refcount dropping to 1 on map release. */ test_map_kptr_success(true); } diff --git a/tools/testing/selftests/bpf/prog_tests/percpu_array_inner_map.c b/tools/testing/selftests/bpf/prog_tests/percpu_array_inner_map.c new file mode 100644 index 000000000000..2a8b2381306b --- /dev/null +++ b/tools/testing/selftests/bpf/prog_tests/percpu_array_inner_map.c @@ -0,0 +1,57 @@ +// SPDX-License-Identifier: GPL-2.0 +#include <test_progs.h> + +/* + * Test that replacing an inner percpu array map with one that has different + * max_entries is rejected. percpu_array_map_gen_lookup() inlines the + * template's index_mask, so allowing a smaller replacement would cause OOB. + */ +void test_percpu_array_inner_map(void) +{ + LIBBPF_OPTS(bpf_map_create_opts, opts); + int outer_fd, tmpl_fd, good_fd, bad_fd, err; + int zero = 0; + + /* Create template: percpu array with 8 entries */ + tmpl_fd = bpf_map_create(BPF_MAP_TYPE_PERCPU_ARRAY, "tmpl", + sizeof(int), sizeof(long), 8, NULL); + if (!ASSERT_OK_FD(tmpl_fd, "create_tmpl")) + return; + + /* Create outer array-of-maps using template */ + opts.inner_map_fd = tmpl_fd; + outer_fd = bpf_map_create(BPF_MAP_TYPE_ARRAY_OF_MAPS, "outer", + sizeof(int), sizeof(int), 1, &opts); + if (!ASSERT_OK_FD(outer_fd, "create_outer")) + goto close_tmpl; + + /* Insert template as initial inner map */ + err = bpf_map_update_elem(outer_fd, &zero, &tmpl_fd, 0); + if (!ASSERT_OK(err, "insert_tmpl")) + goto close_outer; + + /* Replacement with same max_entries should succeed */ + good_fd = bpf_map_create(BPF_MAP_TYPE_PERCPU_ARRAY, "good", + sizeof(int), sizeof(long), 8, NULL); + if (!ASSERT_OK_FD(good_fd, "create_good")) + goto close_outer; + + err = bpf_map_update_elem(outer_fd, &zero, &good_fd, 0); + ASSERT_OK(err, "replace_same_max_entries"); + close(good_fd); 
+ + /* Replacement with fewer max_entries must fail */ + bad_fd = bpf_map_create(BPF_MAP_TYPE_PERCPU_ARRAY, "bad", + sizeof(int), sizeof(long), 2, NULL); + if (!ASSERT_OK_FD(bad_fd, "create_bad")) + goto close_outer; + + err = bpf_map_update_elem(outer_fd, &zero, &bad_fd, 0); + ASSERT_ERR(err, "replace_smaller_max_entries"); + close(bad_fd); + +close_outer: + close(outer_fd); +close_tmpl: + close(tmpl_fd); +} diff --git a/tools/testing/selftests/bpf/prog_tests/refcounted_kptr.c b/tools/testing/selftests/bpf/prog_tests/refcounted_kptr.c index d2c0542716a8..1737eba34323 100644 --- a/tools/testing/selftests/bpf/prog_tests/refcounted_kptr.c +++ b/tools/testing/selftests/bpf/prog_tests/refcounted_kptr.c @@ -57,6 +57,7 @@ void test_percpu_hash_refcounted_kptr_refcount_leak(void) .data_size_in = sizeof(pkt_v4), .repeat = 1, ); + LIBBPF_OPTS(bpf_test_run_opts, syscall_opts); cpu_nr = libbpf_num_possible_cpus(); if (!ASSERT_GT(cpu_nr, 0, "libbpf_num_possible_cpus")) @@ -87,8 +88,11 @@ void test_percpu_hash_refcounted_kptr_refcount_leak(void) if (!ASSERT_EQ(opts.retval, 2, "opts.retval")) goto out; - err = bpf_map__update_elem(map, &key, sizeof(key), values, values_sz, 0); - if (!ASSERT_OK(err, "bpf_map__update_elem")) + fd = bpf_program__fd(skel->progs.clear_percpu_hash_kptr); + err = bpf_prog_test_run_opts(fd, &syscall_opts); + if (!ASSERT_OK(err, "bpf_prog_test_run_opts")) + goto out; + if (!ASSERT_EQ(syscall_opts.retval, 1, "syscall_opts.retval")) goto out; fd = bpf_program__fd(skel->progs.check_percpu_hash_refcount); diff --git a/tools/testing/selftests/bpf/prog_tests/reg_bounds.c b/tools/testing/selftests/bpf/prog_tests/reg_bounds.c index 71f5240cc5b7..7f170a69d1d8 100644 --- a/tools/testing/selftests/bpf/prog_tests/reg_bounds.c +++ b/tools/testing/selftests/bpf/prog_tests/reg_bounds.c @@ -478,6 +478,52 @@ static struct range range_refine_in_halves(enum num_t x_t, struct range x, } +static __always_inline u64 next_u32_block(u64 x) { return x + (1ULL << 32); } +static 
__always_inline u64 prev_u32_block(u64 x) { return x - (1ULL << 32); } + +/* Is v within the circular u64 range [base, base + len]? */ +static __always_inline bool u64_range_contains(u64 v, u64 base, u64 len) +{ + return v - base <= len; +} + +/* Is v within the circular u32 range [base, base + len]? */ +static __always_inline bool u32_range_contains(u32 v, u32 base, u32 len) +{ + return v - base <= len; +} + +static bool range64_range32_intersect(enum num_t a_t, + struct range a /* 64 */, + struct range b /* 32 */, + struct range *out /* 64 */) +{ + u64 b_len = (u32)(b.b - b.a); + u64 a_len = a.b - a.a; + u64 lo, hi; + + if (u32_range_contains((u32)a.a, (u32)b.a, b_len)) { + lo = a.a; + } else { + lo = swap_low32(a.a, (u32)b.a); + if (!u64_range_contains(lo, a.a, a_len)) + lo = next_u32_block(lo); + if (!u64_range_contains(lo, a.a, a_len)) + return false; + } + if (u32_range_contains(a.b, (u32)b.a, b_len)) { + hi = a.b; + } else { + hi = swap_low32(a.b, (u32)b.b); + if (!u64_range_contains(hi, a.a, a_len)) + hi = prev_u32_block(hi); + if (!u64_range_contains(hi, a.a, a_len)) + return false; + } + *out = range(a_t, lo, hi); + return true; +} + static struct range range_refine(enum num_t x_t, struct range x, enum num_t y_t, struct range y) { struct range y_cast; @@ -533,23 +579,12 @@ static struct range range_refine(enum num_t x_t, struct range x, enum num_t y_t, } } - /* the case when new range knowledge, *y*, is a 32-bit subregister - * range, while previous range knowledge, *x*, is a full register - * 64-bit range, needs special treatment to take into account upper 32 - * bits of full register range - */ if (t_is_32(y_t) && !t_is_32(x_t)) { - struct range x_swap; + struct range x1; - /* some combinations of upper 32 bits and sign bit can lead to - * invalid ranges, in such cases it's easier to detect them - * after cast/swap than try to enumerate all the conditions - * under which transformation and knowledge transfer is valid - */ - x_swap = range(x_t, 
swap_low32(x.a, y_cast.a), swap_low32(x.b, y_cast.b)); - if (!is_valid_range(x_t, x_swap)) - return x; - return range_intersection(x_t, x, x_swap); + if (range64_range32_intersect(x_t, x, y, &x1)) + return x1; + return x; } /* otherwise, plain range cast and intersection works */ @@ -1300,6 +1335,26 @@ static bool assert_range_eq(enum num_t t, struct range x, struct range y, return false; } +/* For a pair of signed/unsigned t1/t2 checks if r1/r2 intersect in two intervals. */ +static bool needs_two_arcs(enum num_t t1, struct range r1, + enum num_t t2, struct range r2) +{ + u64 lo = cast_t(t1, r2.a); + u64 hi = cast_t(t1, r2.b); + + /* does r2 wrap in t1's domain: [0, hi] ∪ [lo, MAX]? */ + return lo > hi && r1.a <= hi && r1.b >= lo; +} + +static bool reg_state_needs_two_arcs(struct reg_state *s) +{ + if (!s->valid) + return false; + + return needs_two_arcs(U64, s->r[U64], S64, s->r[S64]) || + needs_two_arcs(U32, s->r[U32], S32, s->r[S32]); +} + /* Validate that register states match, and print details if they don't */ static bool assert_reg_state_eq(struct reg_state *r, struct reg_state *e, const char *ctx) { @@ -1524,6 +1579,11 @@ static int verify_case_op(enum num_t init_t, enum num_t cond_t, !assert_reg_state_eq(&fr2, &fe2, "false_reg2") || !assert_reg_state_eq(&tr1, &te1, "true_reg1") || !assert_reg_state_eq(&tr2, &te2, "true_reg2")) { + if (reg_state_needs_two_arcs(&fe1) || reg_state_needs_two_arcs(&fe2) || + reg_state_needs_two_arcs(&te1) || reg_state_needs_two_arcs(&te2)) { + test__skip(); + return 0; + } failed = true; } diff --git a/tools/testing/selftests/bpf/prog_tests/rhash.c b/tools/testing/selftests/bpf/prog_tests/rhash.c new file mode 100644 index 000000000000..98bb66907b7f --- /dev/null +++ b/tools/testing/selftests/bpf/prog_tests/rhash.c @@ -0,0 +1,183 @@ +// SPDX-License-Identifier: GPL-2.0 +/* Copyright (c) 2026 Meta Platforms, Inc. and affiliates. 
*/ +#include <test_progs.h> +#include <string.h> +#include <stdio.h> +#include "rhash.skel.h" +#include "bpf_iter_bpf_rhash_map.skel.h" +#include <linux/bpf.h> +#include <linux/perf_event.h> +#include <sys/syscall.h> + +static void rhash_run(const char *prog_name) +{ + struct rhash *skel; + struct bpf_program *prog; + LIBBPF_OPTS(bpf_test_run_opts, opts); + int err; + + skel = rhash__open(); + if (!ASSERT_OK_PTR(skel, "rhash__open")) + return; + + prog = bpf_object__find_program_by_name(skel->obj, prog_name); + if (!ASSERT_OK_PTR(prog, "bpf_object__find_program_by_name")) + goto cleanup; + bpf_program__set_autoload(prog, true); + + err = rhash__load(skel); + if (!ASSERT_OK(err, "skel_load")) + goto cleanup; + + err = bpf_prog_test_run_opts(bpf_program__fd(prog), &opts); + if (!ASSERT_OK(err, "prog run")) + goto cleanup; + + if (!ASSERT_OK(opts.retval, "prog retval")) + goto cleanup; + + if (!ASSERT_OK(skel->bss->err, "bss->err")) + goto cleanup; + +cleanup: + rhash__destroy(skel); +} + +static int rhash_map_create(__u32 max_entries, __u64 map_extra) +{ + LIBBPF_OPTS(bpf_map_create_opts, opts, + .map_flags = BPF_F_NO_PREALLOC, + .map_extra = map_extra); + + return bpf_map_create(BPF_MAP_TYPE_RHASH, "rhash_extra", + sizeof(__u32), sizeof(__u64), max_entries, &opts); +} + +static void rhash_map_extra_presize(void) +{ + const __u32 max_entries = 1024; + const __u32 nelem_hint = 256; + struct bpf_map_info info = {}; + __u32 info_len = sizeof(info); + __u64 val = 0; + __u32 key; + int fd, i; + + fd = rhash_map_create(max_entries, nelem_hint); + if (!ASSERT_GE(fd, 0, "rhash_map_create presize")) + return; + + if (!ASSERT_OK(bpf_map_get_info_by_fd(fd, &info, &info_len), "info")) + goto close; + ASSERT_EQ(info.map_extra, nelem_hint, "info.map_extra"); + + for (i = 0; i < (int)nelem_hint; i++) { + key = i; + if (!ASSERT_OK(bpf_map_update_elem(fd, &key, &val, BPF_NOEXIST), + "update")) + goto close; + } +close: + close(fd); +} + +static void rhash_map_extra_too_big(void) +{ + 
int fd; + + fd = rhash_map_create(1U << 20, 0x10000); + if (!ASSERT_LT(fd, 0, "rhash_map_create hint > U16_MAX")) + close(fd); +} + +static void rhash_iter_test(void) +{ + DECLARE_LIBBPF_OPTS(bpf_iter_attach_opts, opts); + struct bpf_iter_bpf_rhash_map *skel; + int err, i, len, map_fd, iter_fd; + union bpf_iter_link_info linfo; + u32 expected_key_sum = 0, key; + struct bpf_link *link; + u64 val = 0; + char buf[64]; + + skel = bpf_iter_bpf_rhash_map__open(); + if (!ASSERT_OK_PTR(skel, "bpf_iter_bpf_rhash_map__open")) + return; + + err = bpf_iter_bpf_rhash_map__load(skel); + if (!ASSERT_OK(err, "bpf_iter_bpf_rhash_map__load")) + goto out; + + map_fd = bpf_map__fd(skel->maps.rhashmap); + + /* Populate map with test data */ + for (i = 0; i < 64; i++) { + key = i + 1; + expected_key_sum += key; + + err = bpf_map_update_elem(map_fd, &key, &val, BPF_NOEXIST); + if (!ASSERT_OK(err, "map_update")) + goto out; + } + + memset(&linfo, 0, sizeof(linfo)); + linfo.map.map_fd = map_fd; + opts.link_info = &linfo; + opts.link_info_len = sizeof(linfo); + + link = bpf_program__attach_iter(skel->progs.dump_bpf_rhash_map, &opts); + if (!ASSERT_OK_PTR(link, "attach_iter")) + goto out; + + iter_fd = bpf_iter_create(bpf_link__fd(link)); + if (!ASSERT_GE(iter_fd, 0, "create_iter")) + goto free_link; + + do { + len = read(iter_fd, buf, sizeof(buf)); + } while (len > 0); + + ASSERT_EQ(skel->bss->key_sum, expected_key_sum, "key_sum"); + ASSERT_EQ(skel->bss->elem_count, 64, "elem_count"); + + close(iter_fd); + +free_link: + bpf_link__destroy(link); +out: + bpf_iter_bpf_rhash_map__destroy(skel); +} + +void test_rhash(void) +{ + if (test__start_subtest("test_rhash_lookup_update")) + rhash_run("test_rhash_lookup_update"); + + if (test__start_subtest("test_rhash_update_delete")) + rhash_run("test_rhash_update_delete"); + + if (test__start_subtest("test_rhash_update_elements")) + rhash_run("test_rhash_update_elements"); + + if (test__start_subtest("test_rhash_update_exist")) + 
rhash_run("test_rhash_update_exist"); + + if (test__start_subtest("test_rhash_update_any")) + rhash_run("test_rhash_update_any"); + + if (test__start_subtest("test_rhash_noexist_duplicate")) + rhash_run("test_rhash_noexist_duplicate"); + + if (test__start_subtest("test_rhash_delete_nonexistent")) + rhash_run("test_rhash_delete_nonexistent"); + + if (test__start_subtest("test_rhash_map_extra_presize")) + rhash_map_extra_presize(); + + if (test__start_subtest("test_rhash_map_extra_too_big")) + rhash_map_extra_too_big(); + + if (test__start_subtest("test_rhash_iter")) + rhash_iter_test(); +} diff --git a/tools/testing/selftests/bpf/prog_tests/setget_sockopt.c b/tools/testing/selftests/bpf/prog_tests/setget_sockopt.c index 77fe1bfb7504..4e91d9b615ce 100644 --- a/tools/testing/selftests/bpf/prog_tests/setget_sockopt.c +++ b/tools/testing/selftests/bpf/prog_tests/setget_sockopt.c @@ -199,6 +199,83 @@ err_out: bpf_link__destroy(getsockopt_link); } +static int connect_to_v4mapped_v6_fd(int server_fd) +{ + struct sockaddr_storage addr; + struct sockaddr_in *addr4 = (void *)&addr; + socklen_t addrlen = sizeof(addr); + struct sockaddr_in6 addr6 = {}; + int fd = -1, v6only = 0, err; + + err = getsockname(server_fd, (struct sockaddr *)&addr, &addrlen); + if (!ASSERT_OK(err, "getsockname")) + return -1; + + fd = socket(AF_INET6, SOCK_STREAM, 0); + if (!ASSERT_GE(fd, 0, "socket")) + return -1; + + err = settimeo(fd, 0); + if (!ASSERT_OK(err, "settimeo")) + goto err_out; + + err = setsockopt(fd, IPPROTO_IPV6, IPV6_V6ONLY, &v6only, sizeof(v6only)); + if (!ASSERT_OK(err, "clear_v6only")) + goto err_out; + + addr6.sin6_family = AF_INET6; + addr6.sin6_port = addr4->sin_port; + addr6.sin6_addr.s6_addr[10] = 0xff; + addr6.sin6_addr.s6_addr[11] = 0xff; + memcpy(&addr6.sin6_addr.s6_addr[12], &addr4->sin_addr, sizeof(addr4->sin_addr)); + + err = connect(fd, (struct sockaddr *)&addr6, sizeof(addr6)); + if (!ASSERT_OK(err, "connect")) + goto err_out; + + return fd; + +err_out: + close(fd); + 
return -1; +} + +static void test_v4mapped_v6_ip_tos(void) +{ + struct setget_sockopt__bss *bss = skel->bss; + int sfd = -1, fd = -1, got = 0, exp = 0x1c; + socklen_t optlen; + + memset(bss, 0, sizeof(*bss)); + bss->v4mapped_v6_ip_tos_enable = 1; + bss->v4mapped_v6_ip_tos_ret = -1; + bss->v4mapped_v6_ip_tos_val = exp; + + sfd = start_server(AF_INET, SOCK_STREAM, addr4_str, 0, 0); + if (!ASSERT_GE(sfd, 0, "start_server")) + goto err_out; + + fd = connect_to_v4mapped_v6_fd(sfd); + if (!ASSERT_GE(fd, 0, "connect_to_v4mapped_v6_fd")) + goto err_out; + + ASSERT_GT(bss->v4mapped_v6_ip_tos_cnt, 0, "v4mapped_v6_ip_tos_cnt"); + ASSERT_EQ(bss->v4mapped_v6_ip_tos_ret, 0, "v4mapped_v6_ip_tos_ret"); + + optlen = sizeof(got); + if (!ASSERT_OK(getsockopt(fd, SOL_IP, IP_TOS, &got, &optlen), "getsockopt_ip_tos")) + goto err_out; + + ASSERT_EQ(got, exp, "ip_tos"); + +err_out: + bss->v4mapped_v6_ip_tos_enable = 0; + if (fd >= 0) + close(fd); + if (sfd >= 0) + close(sfd); +} + void test_setget_sockopt(void) { cg_fd = test__join_cgroup(CG_NAME); @@ -238,6 +315,7 @@ void test_setget_sockopt(void) test_ktls(AF_INET); test_nonstandard_opt(AF_INET); test_nonstandard_opt(AF_INET6); + test_v4mapped_v6_ip_tos(); done: setget_sockopt__destroy(skel); diff --git a/tools/testing/selftests/bpf/prog_tests/signed_loader.c b/tools/testing/selftests/bpf/prog_tests/signed_loader.c new file mode 100644 index 000000000000..5fc417e31fc6 --- /dev/null +++ b/tools/testing/selftests/bpf/prog_tests/signed_loader.c @@ -0,0 +1,1135 @@ +// SPDX-License-Identifier: GPL-2.0 +/* Copyright (c) 2026 Isovalent */ + +#include <test_progs.h> +#include <sys/syscall.h> +#include <sys/mman.h> +#include <sys/wait.h> +#include <sys/stat.h> +#include <fcntl.h> +#include <limits.h> +#include <linux/keyctl.h> +#include <linux/bpf.h> + +#include "bpf/libbpf_internal.h" /* for libbpf_sha256() */ +#include "bpf/skel_internal.h" /* for loader ctx layout (bpf_loader_ctx etc) */ + +#include "test_signed_loader.skel.h" +#include 
"test_signed_loader_map.skel.h" +#include "test_signed_loader_data.skel.h" +#include "test_signed_loader_lsm.skel.h" + +#define SIG_MATCH_INSNS 33 /* excl (5) + 4 * sha-dword (7) */ + +enum { + BPF_SIG_UNSIGNED = 0, + BPF_SIG_VERIFIED, +}; + +enum { + BPF_SIG_KEYRING_NONE = 0, + BPF_SIG_KEYRING_BUILTIN, + BPF_SIG_KEYRING_SECONDARY, + BPF_SIG_KEYRING_PLATFORM, + BPF_SIG_KEYRING_USER, +}; + +static int load_loader(const void *insns, __u32 insns_sz, int map_fd, + const void *sig, __u32 sig_sz, __s32 keyring_id) +{ + union bpf_attr attr; + int fd; + + memset(&attr, 0, sizeof(attr)); + attr.prog_type = BPF_PROG_TYPE_SYSCALL; + attr.insns = ptr_to_u64(insns); + attr.insn_cnt = insns_sz / sizeof(struct bpf_insn); + attr.license = ptr_to_u64("Dual BSD/GPL"); + attr.prog_flags = BPF_F_SLEEPABLE; + attr.fd_array = ptr_to_u64(&map_fd); + if (sig) { + attr.signature = ptr_to_u64(sig); + attr.signature_size = sig_sz; + attr.keyring_id = keyring_id; + } + memcpy(attr.prog_name, "__loader.prog", sizeof("__loader.prog")); + fd = syscall(__NR_bpf, BPF_PROG_LOAD, &attr, + offsetofend(union bpf_attr, keyring_id)); + return fd < 0 ? 
-errno : fd; +} + +static int run_gen_loader(const void *insns, __u32 insns_sz, + const void *data, __u32 data_sz, + const void *excl, __u32 excl_sz, + const void *sig, __u32 sig_sz, + bool get_hash, void *ctx, __u32 ctx_sz, bool *loader_ran) +{ + LIBBPF_OPTS(bpf_map_create_opts, mopts, + .excl_prog_hash = excl, + .excl_prog_hash_size = excl_sz); + __u8 hbuf[SHA256_DIGEST_LENGTH]; + struct bpf_map_info info; + __u32 ilen = sizeof(info), key = 0; + union bpf_attr attr; + int map_fd, prog_fd, ret; + + *loader_ran = false; + + map_fd = bpf_map_create(BPF_MAP_TYPE_ARRAY, "__loader.map", + 4, data_sz, 1, &mopts); + if (map_fd < 0) + return -errno; + if (bpf_map_update_elem(map_fd, &key, data, 0)) { + ret = -errno; + goto out_map; + } + if (bpf_map_freeze(map_fd)) { + ret = -errno; + goto out_map; + } + if (get_hash) { + memset(&info, 0, sizeof(info)); + info.hash = ptr_to_u64(hbuf); + info.hash_size = sizeof(hbuf); + if (bpf_map_get_info_by_fd(map_fd, &info, &ilen)) { + ret = -errno; + goto out_map; + } + } + + memset(&attr, 0, sizeof(attr)); + attr.prog_type = BPF_PROG_TYPE_SYSCALL; + attr.insns = ptr_to_u64(insns); + attr.insn_cnt = insns_sz / sizeof(struct bpf_insn); + attr.license = ptr_to_u64("Dual BSD/GPL"); + attr.prog_flags = BPF_F_SLEEPABLE; + attr.fd_array = ptr_to_u64(&map_fd); + if (sig) { + attr.signature = ptr_to_u64(sig); + attr.signature_size = sig_sz; + attr.keyring_id = KEY_SPEC_SESSION_KEYRING; + } + memcpy(attr.prog_name, "__loader.prog", sizeof("__loader.prog")); + prog_fd = syscall(__NR_bpf, BPF_PROG_LOAD, &attr, + offsetofend(union bpf_attr, keyring_id)); + if (prog_fd < 0) { + ret = -errno; + goto out_map; + } + + memset(&attr, 0, sizeof(attr)); + attr.test.prog_fd = prog_fd; + attr.test.ctx_in = ptr_to_u64(ctx); + attr.test.ctx_size_in = ctx_sz; + if (syscall(__NR_bpf, BPF_PROG_RUN, &attr, + offsetofend(union bpf_attr, test)) < 0) { + ret = -errno; + goto out_prog; + } + *loader_ran = true; + ret = (int)attr.test.retval; +out_prog: + 
close(prog_fd); +out_map: + close(map_fd); + return ret; +} + +static void close_loader_ctx_fds(void *ctx, int nr_maps, int nr_progs) +{ + struct bpf_map_desc *md = (struct bpf_map_desc *)((char *)ctx + + sizeof(struct bpf_loader_ctx)); + struct bpf_prog_desc *pd = (struct bpf_prog_desc *)(md + nr_maps); + int i; + + for (i = 0; i < nr_maps; i++) + if (md[i].map_fd > 0) + close(md[i].map_fd); + for (i = 0; i < nr_progs; i++) + if (pd[i].prog_fd > 0) + close(pd[i].prog_fd); +} + +static int run_setup(const char *cmd, const char *dir) +{ + int pid, status; + + pid = fork(); + if (pid < 0) + return -errno; + if (pid == 0) { + execlp("./verify_sig_setup.sh", "./verify_sig_setup.sh", + cmd, dir, NULL); + exit(1); + } + if (waitpid(pid, &status, 0) < 0) + return -errno; + return (WIFEXITED(status) && + WEXITSTATUS(status) == 0) ? 0 : -EINVAL; +} + +static int sign_buf(const char *dir, const void *buf, __u32 len, + void *sig, __u32 *sig_sz) +{ + char data_tmpl[PATH_MAX], key[PATH_MAX]; + char sigpath[PATH_MAX + sizeof(".p7s")]; + int fd, pid, status, ret; + struct stat st; + + ret = snprintf(data_tmpl, sizeof(data_tmpl), "%s/dataXXXXXX", dir); + if (ret < 0 || ret >= (int)sizeof(data_tmpl)) + return -ENAMETOOLONG; + ret = 0; + + fd = mkstemp(data_tmpl); + if (fd < 0) + return -errno; + if (write(fd, buf, len) != (ssize_t)len) { + close(fd); + ret = -EIO; + goto out; + } + close(fd); + + pid = fork(); + if (pid < 0) { + ret = -errno; + goto out; + } + if (pid == 0) { + snprintf(key, sizeof(key), "%s/signing_key.pem", dir); + execlp("./sign-file", "./sign-file", "-d", "sha256", + key, key, data_tmpl, NULL); + exit(1); + } + if (waitpid(pid, &status, 0) < 0 || + !WIFEXITED(status) || WEXITSTATUS(status)) { + ret = -EINVAL; + goto out; + } + + snprintf(sigpath, sizeof(sigpath), "%s.p7s", data_tmpl); + if (stat(sigpath, &st) < 0) { + ret = -errno; + goto out; + } + if (st.st_size > (off_t)*sig_sz) { + ret = -E2BIG; + goto out_sig; + } + fd = open(sigpath, O_RDONLY); + if (fd < 
0) { + ret = -errno; + goto out_sig; + } + if (read(fd, sig, st.st_size) != st.st_size) { + close(fd); + ret = -EIO; + goto out_sig; + } + close(fd); + *sig_sz = st.st_size; +out_sig: + unlink(sigpath); +out: + unlink(data_tmpl); + return ret; +} + +static void check_sig_match_shape(const struct bpf_insn *in, int n) +{ + int a = -1, cleanup = -1, i, base, t, br[5], nb = 0; + + /* BPF_PSEUDO_MAP_IDX (the struct bpf_map * form) is used only here. */ + for (i = 0; i + 1 < n; i++) { + if (in[i].code == (BPF_LD | BPF_IMM | BPF_DW) && + in[i].src_reg == BPF_PSEUDO_MAP_IDX) { + a = i; + break; + } + } + if (!ASSERT_GE(a, 0, "emit_signature_match present")) + return; + if (!ASSERT_LE(a + SIG_MATCH_INSNS, n, "block fits in program")) + return; + + /* excl check: r2 = *(u32 *)(map + 32); if r2 != 1 goto cleanup */ + ASSERT_EQ(in[a + 2].code, (BPF_LDX | BPF_MEM | BPF_W), "excl load width"); + ASSERT_EQ(in[a + 2].off, SHA256_DIGEST_LENGTH, "excl field offset"); + ASSERT_EQ(in[a + 4].code, (BPF_JMP | BPF_JNE | BPF_K), "excl branch op"); + ASSERT_EQ(in[a + 4].imm, 1, "excl compared to 1"); + br[nb++] = a + 4; + + /* 4 sha-dword checks: r2 = *(u64 *)(map + i*8); if r2 != r3 goto cleanup */ + for (i = 0; i < 4; i++) { + base = a + 5 + i * 7; + ASSERT_EQ(in[base + 2].code, (BPF_LDX | BPF_MEM | BPF_DW), "sha load width"); + ASSERT_EQ(in[base + 2].off, i * 8, "sha dword offset"); + ASSERT_EQ(in[base + 3].code, (BPF_LD | BPF_IMM | BPF_DW), "sha imm64 (H_meta)"); + ASSERT_EQ(in[base + 6].code, (BPF_JMP | BPF_JNE | BPF_X), "sha branch op"); + br[nb++] = base + 6; + } + + /* + * Locate the real cleanup label so we can pin the exact jump target, + * not just "some backward label". bpf_gen__init() emits the cleanup + * block as a prog-fd close loop whose first instruction is the label + * every error branch jumps to. 
+ */ + for (i = 0; i + 2 < a; i++) { + if (in[i].code == (BPF_LDX | BPF_MEM | BPF_W) && + in[i].dst_reg == BPF_REG_1 && in[i].src_reg == BPF_REG_10 && + in[i + 1].code == (BPF_JMP | BPF_JSLE | BPF_K) && + in[i + 1].dst_reg == BPF_REG_1 && in[i + 1].imm == 0 && + in[i + 1].off == 1 && + in[i + 2].code == (BPF_JMP | BPF_CALL) && + in[i + 2].imm == BPF_FUNC_sys_close) { + cleanup = i; + break; + } + } + if (!ASSERT_GE(cleanup, 0, "cleanup label located")) + return; + for (i = 0; i < nb; i++) { + t = br[i] + 1 + in[br[i]].off; + ASSERT_EQ(t, cleanup, "sig-match lands on cleanup"); + } + /* + * Same invariant for every other cleanup-bound jump in the program: + * emit_check_err() is the only source of "if (r7 < 0) goto cleanup", + * so each of those must also resolve exactly to cleanup. + */ + for (i = 0, t = 0; i < n; i++) { + if (in[i].code != (BPF_JMP | BPF_JSLT | BPF_K) || + in[i].dst_reg != BPF_REG_7 || in[i].imm != 0 || in[i].off >= 0) + continue; + ASSERT_EQ(i + 1 + in[i].off, cleanup, "err-check lands on cleanup"); + t++; + } + ASSERT_GT(t, 0, "found emit_check_err jumps"); +} + +struct gen_loader_fixture { + struct test_signed_loader *skel; + struct gen_loader_opts gopts; + unsigned char *blob; + void *ctx; + __u32 data_sz; + __u32 ctx_sz; + int nr_maps; + int nr_progs; + __u8 excl[SHA256_DIGEST_LENGTH]; +}; + +static int gen_loader_fixture_init(struct gen_loader_fixture *f) +{ + LIBBPF_OPTS(gen_loader_opts, gopts, .gen_hash = true); + int nr_maps = 0, nr_progs = 0; + struct bpf_program *p; + struct bpf_map *m; + + memset(f, 0, sizeof(*f)); + f->skel = test_signed_loader__open(); + if (!ASSERT_OK_PTR(f->skel, "skel_open")) + return -1; + if (!ASSERT_OK(bpf_object__gen_loader(f->skel->obj, &gopts), "gen_loader")) + return -1; + if (!ASSERT_OK(bpf_object__load(f->skel->obj), "gen_load")) + return -1; + f->gopts = gopts; + + bpf_object__for_each_program(p, f->skel->obj) + nr_progs++; + bpf_object__for_each_map(m, f->skel->obj) + nr_maps++; + f->nr_maps = nr_maps; 
+ f->nr_progs = nr_progs; + f->ctx_sz = sizeof(struct bpf_loader_ctx) + + nr_maps * sizeof(struct bpf_map_desc) + + nr_progs * sizeof(struct bpf_prog_desc); + f->ctx = calloc(1, f->ctx_sz); + if (!ASSERT_OK_PTR(f->ctx, "ctx_alloc")) + return -1; + ((struct bpf_loader_ctx *)f->ctx)->sz = f->ctx_sz; + + f->data_sz = gopts.data_sz; + f->blob = malloc(f->data_sz); + if (!ASSERT_OK_PTR(f->blob, "blob_alloc")) + return -1; + memcpy(f->blob, gopts.data, f->data_sz); + + /* excl_prog_hash = SHA256(loader insns) == the loader's prog->digest. */ + libbpf_sha256(gopts.insns, gopts.insns_sz, f->excl); + return 0; +} + +static void gen_loader_fixture_fini(struct gen_loader_fixture *f) +{ + if (f->ctx) + close_loader_ctx_fds(f->ctx, f->nr_maps, f->nr_progs); + free(f->blob); + free(f->ctx); + test_signed_loader__destroy(f->skel); +} + +static void metadata_check_shape(void) +{ + struct gen_loader_fixture f; + + if (gen_loader_fixture_init(&f) == 0) + check_sig_match_shape((const struct bpf_insn *)f.gopts.insns, + f.gopts.insns_sz / sizeof(struct bpf_insn)); + gen_loader_fixture_fini(&f); +} + +static void metadata_match(void) +{ + struct gen_loader_fixture f; + bool ran; + int r; + + if (gen_loader_fixture_init(&f) == 0) { + r = run_gen_loader(f.gopts.insns, f.gopts.insns_sz, f.blob, + f.data_sz, f.excl, sizeof(f.excl), NULL, 0, + true, f.ctx, f.ctx_sz, &ran); + ASSERT_TRUE(ran, "loader ran"); + ASSERT_EQ(r, 0, "honest loader retval"); + } + gen_loader_fixture_fini(&f); +} + +static void metadata_sha_mismatch(void) +{ + struct gen_loader_fixture f; + bool ran; + int r; + + if (gen_loader_fixture_init(&f) == 0) { + /* + * blob[0] lives in the loader's fd_array scratch (first add_data in + * bpf_gen__init); a 0-map program never reads it, so flipping it + * changes only map->sha. The metadata check is the only thing that + * can notice -> isolates emit_signature_match. 
+ */ + f.blob[0] ^= 0xff; + r = run_gen_loader(f.gopts.insns, f.gopts.insns_sz, f.blob, + f.data_sz, f.excl, sizeof(f.excl), NULL, 0, + true, f.ctx, f.ctx_sz, &ran); + ASSERT_TRUE(ran, "loader ran"); + ASSERT_EQ(r, -EINVAL, "tampered blob rejected by emit_signature_match"); + } + gen_loader_fixture_fini(&f); +} + +static void metadata_not_exclusive(void) +{ + struct gen_loader_fixture f; + bool ran; + int r; + + if (gen_loader_fixture_init(&f) == 0) { + /* + * Correct blob but a non-exclusive metadata map: the verifier does + * not reject (excl_prog_sha unset), so the runtime map->excl == 1 + * check in the loader must. + */ + r = run_gen_loader(f.gopts.insns, f.gopts.insns_sz, f.blob, + f.data_sz, NULL, 0, NULL, 0, true, f.ctx, + f.ctx_sz, &ran); + ASSERT_TRUE(ran, "loader ran"); + ASSERT_EQ(r, -EINVAL, "non-exclusive metadata map rejected"); + } + gen_loader_fixture_fini(&f); +} + +static void metadata_hash_not_computed(void) +{ + struct gen_loader_fixture f; + bool ran; + int r; + + if (gen_loader_fixture_init(&f) == 0) { + /* + * Correct, exclusive, frozen map, but its hash was never computed + * (no OBJ_GET_INFO_BY_FD), so map->sha stays zero. The loader must + * fail closed rather than treat an unset hash as a match. + */ + r = run_gen_loader(f.gopts.insns, f.gopts.insns_sz, f.blob, + f.data_sz, f.excl, sizeof(f.excl), NULL, 0, + false, f.ctx, f.ctx_sz, &ran); + ASSERT_TRUE(ran, "loader ran"); + ASSERT_EQ(r, -EINVAL, "uncomputed metadata hash rejected"); + } + gen_loader_fixture_fini(&f); +} + +static void signature_enforced(void) +{ + static const __u8 junk[64] = { 0x30, 0x42, 0x13, 0x37, }; + struct gen_loader_fixture f; + int fd; + + if (gen_loader_fixture_init(&f) == 0) { + /* + * A present-but-invalid signature (the cert bytes are not a + * PKCS#7 signature) must be rejected at load: the signature + * path is honored, not ignored. (The valid path is covered by + * the signed lskels.) 
+ */ + fd = load_loader(f.gopts.insns, f.gopts.insns_sz, -1, junk, + sizeof(junk), KEY_SPEC_SESSION_KEYRING); + ASSERT_LT(fd, 0, "invalid signature rejected at load"); + } + gen_loader_fixture_fini(&f); +} + +static void signature_too_large(void) +{ + static const __u8 junk[64] = {}; + struct gen_loader_fixture f; + int fd; + + if (gen_loader_fixture_init(&f) == 0) { + /* + * signature_size beyond the kernel's bound (KMALLOC_MAX_CACHE_SIZE) + * is rejected before the buffer is read. + */ + fd = load_loader(f.gopts.insns, f.gopts.insns_sz, -1, junk, + 64 << 20, KEY_SPEC_SESSION_KEYRING); + ASSERT_EQ(fd, -EINVAL, "oversized signature rejected"); + } + gen_loader_fixture_fini(&f); +} + +static void signature_bad_keyring(void) +{ + static const __u8 junk[64] = {}; + struct gen_loader_fixture f; + int fd; + + if (gen_loader_fixture_init(&f) == 0) { + /* + * A present signature with a keyring_id that resolves to no key is + * rejected up front: bpf_prog_verify_signature() fails the keyring + * lookup (-EINVAL) before it ever looks at the signature bytes. A + * large positive serial takes the user-keyring path and won't exist. + */ + fd = load_loader(f.gopts.insns, f.gopts.insns_sz, -1, junk, + sizeof(junk), INT_MAX); + ASSERT_EQ(fd, -EINVAL, "signature with bad keyring_id rejected"); + } + gen_loader_fixture_fini(&f); +} + +/* + * A signed loader must ignore ctx-supplied map dimensions: the host cannot + * resize a signed program's maps via the loader ctx. Drive a one-map program + * through gen_loader, ask (via ctx) for every map to be resized to a bogus + * value, and confirm the created maps keep their attested size. 
+ */ +#define GATING_BOGUS_MAX 0x4000 + +static void metadata_ctx_max_entries_ignored(void) +{ + LIBBPF_OPTS(gen_loader_opts, gopts, .gen_hash = true); + struct test_signed_loader_map *skel; + __u8 excl[SHA256_DIGEST_LENGTH]; + int nr_maps = 0, nr_progs = 0, i, checked = 0, r; + struct bpf_program *p; + struct bpf_map *m; + struct bpf_map_desc *md; + unsigned char *blob; + __u32 ctx_sz, data_sz; + void *ctx; + bool ran; + + skel = test_signed_loader_map__open(); + if (!ASSERT_OK_PTR(skel, "skel_open")) + return; + if (!ASSERT_OK(bpf_object__gen_loader(skel->obj, &gopts), "gen_loader")) + goto destroy; + if (!ASSERT_OK(bpf_object__load(skel->obj), "gen_load")) + goto destroy; + + bpf_object__for_each_program(p, skel->obj) + nr_progs++; + bpf_object__for_each_map(m, skel->obj) + nr_maps++; + ctx_sz = sizeof(struct bpf_loader_ctx) + + nr_maps * sizeof(struct bpf_map_desc) + + nr_progs * sizeof(struct bpf_prog_desc); + ctx = calloc(1, ctx_sz); + if (!ASSERT_OK_PTR(ctx, "ctx_alloc")) + goto destroy; + ((struct bpf_loader_ctx *)ctx)->sz = ctx_sz; + + md = (struct bpf_map_desc *)((char *)ctx + sizeof(struct bpf_loader_ctx)); + for (i = 0; i < nr_maps; i++) + md[i].max_entries = GATING_BOGUS_MAX; + + libbpf_sha256(gopts.insns, gopts.insns_sz, excl); + data_sz = gopts.data_sz; + blob = malloc(data_sz); + if (!ASSERT_OK_PTR(blob, "blob_alloc")) + goto free_ctx; + memcpy(blob, gopts.data, data_sz); + + r = run_gen_loader(gopts.insns, gopts.insns_sz, blob, data_sz, + excl, sizeof(excl), NULL, 0, true, ctx, ctx_sz, &ran); + if (!ASSERT_TRUE(ran, "loader ran") || + !ASSERT_EQ(r, 0, "loader retval")) + goto free_blob; + + for (i = 0; i < nr_maps; i++) { + struct bpf_map_info info; + __u32 ilen = sizeof(info); + int fd = md[i].map_fd; + + if (fd <= 0) + continue; + memset(&info, 0, sizeof(info)); + if (ASSERT_OK(bpf_map_get_info_by_fd(fd, &info, &ilen), "map_info")) { + ASSERT_NEQ(info.max_entries, GATING_BOGUS_MAX, + "ctx max_entries ignored for signed loader"); + checked++; + } 
+ } + ASSERT_GT(checked, 0, "inspected a created map"); + +free_blob: + free(blob); +free_ctx: + close_loader_ctx_fds(ctx, nr_maps, nr_progs); + free(ctx); +destroy: + test_signed_loader_map__destroy(skel); +} + +/* + * A signed loader must also ignore ctx-supplied initial_value: the host cannot + * re-seed a signed program's map contents through the loader ctx. Drive a + * program with one initialized global (a .data map) through gen_loader, point + * every map's ctx initial_value at an adversarial buffer, and confirm the + * created map still holds the attested value, never the ctx bytes. + */ +#define DATA_MAGIC 0x5eed1234abad1deaULL + +static void metadata_ctx_initial_value_ignored(void) +{ + LIBBPF_OPTS(gen_loader_opts, gopts, .gen_hash = true); + struct test_signed_loader_data *skel; + __u8 excl[SHA256_DIGEST_LENGTH], evil[64]; + int nr_maps = 0, nr_progs = 0, i, found = 0, r; + struct bpf_program *p; + struct bpf_map *m; + struct bpf_map_desc *md; + unsigned char *blob; + __u32 ctx_sz, data_sz; + void *ctx; + bool ran; + + skel = test_signed_loader_data__open(); + if (!ASSERT_OK_PTR(skel, "skel_open")) + return; + if (!ASSERT_OK(bpf_object__gen_loader(skel->obj, &gopts), "gen_loader")) + goto destroy; + if (!ASSERT_OK(bpf_object__load(skel->obj), "gen_load")) + goto destroy; + + bpf_object__for_each_program(p, skel->obj) + nr_progs++; + bpf_object__for_each_map(m, skel->obj) + nr_maps++; + ctx_sz = sizeof(struct bpf_loader_ctx) + + nr_maps * sizeof(struct bpf_map_desc) + + nr_progs * sizeof(struct bpf_prog_desc); + ctx = calloc(1, ctx_sz); + if (!ASSERT_OK_PTR(ctx, "ctx_alloc")) + goto destroy; + ((struct bpf_loader_ctx *)ctx)->sz = ctx_sz; + + memset(evil, 0xAA, sizeof(evil)); + md = (struct bpf_map_desc *)((char *)ctx + sizeof(struct bpf_loader_ctx)); + for (i = 0; i < nr_maps; i++) + md[i].initial_value = ptr_to_u64(evil); + + libbpf_sha256(gopts.insns, gopts.insns_sz, excl); + data_sz = gopts.data_sz; + blob = malloc(data_sz); + if (!ASSERT_OK_PTR(blob, 
"blob_alloc")) + goto free_ctx; + memcpy(blob, gopts.data, data_sz); + + r = run_gen_loader(gopts.insns, gopts.insns_sz, blob, data_sz, + excl, sizeof(excl), NULL, 0, true, ctx, ctx_sz, &ran); + if (!ASSERT_TRUE(ran, "loader ran") || + !ASSERT_EQ(r, 0, "loader retval")) + goto free_blob; + + for (i = 0; i < nr_maps; i++) { + struct bpf_map_info info; + __u32 ilen = sizeof(info), key = 0; + __u8 value[64] = {}; + __u64 got; + int fd = md[i].map_fd; + + if (fd <= 0) + continue; + memset(&info, 0, sizeof(info)); + if (!ASSERT_OK(bpf_map_get_info_by_fd(fd, &info, &ilen), "map_info")) + continue; + if (info.value_size <= sizeof(value) && + bpf_map_lookup_elem(fd, &key, value) == 0) { + memcpy(&got, value, sizeof(got)); + /* attested .data survives; ctx bytes (0xAA..) ignored */ + if (got == DATA_MAGIC) + found = 1; + ASSERT_NEQ(got, 0xAAAAAAAAAAAAAAAAULL, + "ctx initial_value ignored for signed loader"); + } + } + ASSERT_EQ(found, 1, "attested .data value preserved"); + +free_blob: + free(blob); +free_ctx: + close_loader_ctx_fds(ctx, nr_maps, nr_progs); + free(ctx); +destroy: + test_signed_loader_data__destroy(skel); +} + +/* + * The load-time signature must authenticate the loader instructions: a valid + * signature loads, and the very same signature over one-byte-tampered insns is + * rejected. Uses ./verify_sig_setup.sh + ./sign-file at runtime, like + * verify_pkcs7_sig, and verifies against the session keyring the key was added + * to. (signature_enforced/_too_large only cover a malformed signature.) 
+ */ +static void signature_authenticates_insns(void) +{ + LIBBPF_OPTS(gen_loader_opts, gopts, .gen_hash = true); + char dir_tmpl[] = "/tmp/signed_loaderXXXXXX", *dir; + struct test_signed_loader *skel = NULL; + __u8 excl[SHA256_DIGEST_LENGTH], sig[8192]; + __u32 sig_sz = sizeof(sig), insns_sz, data_sz, ctx_sz; + unsigned char *insns = NULL, *tampered = NULL, *blob = NULL; + int nr_maps = 0, nr_progs = 0, r; + struct bpf_program *p; + struct bpf_map *m; + void *ctx = NULL; + bool ran; + + syscall(__NR_request_key, "keyring", "_uid.0", NULL, + KEY_SPEC_SESSION_KEYRING); + dir = mkdtemp(dir_tmpl); + if (!ASSERT_OK_PTR(dir, "mkdtemp")) + return; + if (!ASSERT_OK(run_setup("setup", dir), "verify_sig_setup")) { + rmdir(dir); + return; + } + + skel = test_signed_loader__open(); + if (!ASSERT_OK_PTR(skel, "skel_open")) + goto cleanup; + if (!ASSERT_OK(bpf_object__gen_loader(skel->obj, &gopts), "gen_loader")) + goto cleanup; + if (!ASSERT_OK(bpf_object__load(skel->obj), "gen_load")) + goto cleanup; + + bpf_object__for_each_program(p, skel->obj) + nr_progs++; + bpf_object__for_each_map(m, skel->obj) + nr_maps++; + ctx_sz = sizeof(struct bpf_loader_ctx) + + nr_maps * sizeof(struct bpf_map_desc) + + nr_progs * sizeof(struct bpf_prog_desc); + insns_sz = gopts.insns_sz; + data_sz = gopts.data_sz; + ctx = calloc(1, ctx_sz); + insns = malloc(insns_sz); + tampered = malloc(insns_sz); + blob = malloc(data_sz); + if (!ASSERT_OK_PTR(ctx, "ctx") || + !ASSERT_OK_PTR(insns, "insns") || + !ASSERT_OK_PTR(tampered, "tampered") || + !ASSERT_OK_PTR(blob, "blob")) + goto cleanup; + memcpy(insns, gopts.insns, insns_sz); + memcpy(blob, gopts.data, data_sz); + libbpf_sha256(insns, insns_sz, excl); + + if (!ASSERT_OK(sign_buf(dir, insns, insns_sz, sig, &sig_sz), "sign-file")) + goto cleanup; + + memset(ctx, 0, ctx_sz); + ((struct bpf_loader_ctx *)ctx)->sz = ctx_sz; + r = run_gen_loader(insns, insns_sz, blob, data_sz, excl, sizeof(excl), + sig, sig_sz, true, ctx, ctx_sz, &ran); + ASSERT_TRUE(ran, 
"valid signature: loader loaded and ran"); + ASSERT_EQ(r, 0, "valid signature accepted"); + close_loader_ctx_fds(ctx, nr_maps, nr_progs); + + memcpy(tampered, insns, insns_sz); + tampered[insns_sz / 2] ^= 0xff; + memset(ctx, 0, ctx_sz); + ((struct bpf_loader_ctx *)ctx)->sz = ctx_sz; + r = run_gen_loader(tampered, insns_sz, blob, data_sz, excl, sizeof(excl), + sig, sig_sz, true, ctx, ctx_sz, &ran); + ASSERT_FALSE(ran, "tampered loader rejected before run"); + ASSERT_EQ(r, -EKEYREJECTED, "signature is bound to the instructions"); +cleanup: + free(insns); + free(tampered); + free(blob); + free(ctx); + test_signed_loader__destroy(skel); + run_setup("cleanup", dir); +} + +static int make_excl_map(__u32 flags, __u32 value_size) +{ + LIBBPF_OPTS(bpf_map_create_opts, opts); + __u8 hash[SHA256_DIGEST_LENGTH] = { 1 }; /* any 32-byte value */ + + opts.excl_prog_hash = hash; + opts.excl_prog_hash_size = sizeof(hash); + opts.map_flags = flags; + return bpf_map_create(BPF_MAP_TYPE_ARRAY, "md", 4, value_size, 1, &opts); +} + +static void hash_requires_frozen(void) +{ + __u8 hbuf[SHA256_DIGEST_LENGTH], val[64] = {}; + struct bpf_map_info info; + __u32 ilen, key = 0; + int fd; + + fd = make_excl_map(0, sizeof(val)); + if (!ASSERT_OK_FD(fd, "excl_map")) + return; + ASSERT_OK(bpf_map_update_elem(fd, &key, val, 0), "update"); + + memset(&info, 0, sizeof(info)); + info.hash = ptr_to_u64(hbuf); + info.hash_size = sizeof(hbuf); + ilen = sizeof(info); + ASSERT_EQ(bpf_map_get_info_by_fd(fd, &info, &ilen), -EPERM, + "hash of unfrozen map rejected"); + close(fd); +} + +static void no_update_after_freeze(void) +{ + __u8 val[64] = {}; + __u32 key = 0; + int fd; + + fd = make_excl_map(0, sizeof(val)); + if (!ASSERT_OK_FD(fd, "excl_map")) + return; + ASSERT_OK(bpf_map_update_elem(fd, &key, val, 0), "update"); + ASSERT_OK(bpf_map_freeze(fd), "freeze"); + ASSERT_EQ(bpf_map_update_elem(fd, &key, val, 0), -EPERM, + "update after freeze rejected"); + close(fd); +} + +static void 
freeze_writable_mmap(void) +{ + void *w; + int fd; + + fd = make_excl_map(BPF_F_MMAPABLE, 4096); + if (!ASSERT_OK_FD(fd, "excl_mmapable_map")) + return; + w = mmap(NULL, 4096, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0); + if (ASSERT_OK_PTR(w, "writable_mmap")) { + ASSERT_EQ(bpf_map_freeze(fd), -EBUSY, + "freeze rejected while writable mmap held"); + munmap(w, 4096); + } + close(fd); +} + +static void no_writable_mmap_frozen(void) +{ + void *w; + int fd; + + fd = make_excl_map(BPF_F_MMAPABLE, 4096); + if (!ASSERT_OK_FD(fd, "excl_mmapable_map")) + return; + ASSERT_OK(bpf_map_freeze(fd), "freeze"); + w = mmap(NULL, 4096, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0); + ASSERT_EQ(w, MAP_FAILED, "writable mmap of frozen map rejected"); + if (w != MAP_FAILED) + munmap(w, 4096); + close(fd); +} + +static void map_hash_matches_libbpf(void) +{ + __u8 kbuf[SHA256_DIGEST_LENGTH], lbuf[SHA256_DIGEST_LENGTH], val[64] = {}; + struct bpf_map_info info; + __u32 ilen, key = 0; + int fd, i; + + /* + * The signing scheme assumes the kernel's map hash equals what libbpf + * computes over the same bytes (gen_loader bakes libbpf_sha256(blob); + * the kernel recomputes via array_map_get_hash). Pin that they agree. 
+ */ + for (i = 0; i < (int)sizeof(val); i++) + val[i] = i * 7 + 1; + fd = bpf_map_create(BPF_MAP_TYPE_ARRAY, "h", 4, sizeof(val), 1, NULL); + if (!ASSERT_OK_FD(fd, "array_map")) + return; + ASSERT_OK(bpf_map_update_elem(fd, &key, val, 0), "update"); + ASSERT_OK(bpf_map_freeze(fd), "freeze"); + memset(&info, 0, sizeof(info)); + info.hash = ptr_to_u64(kbuf); + info.hash_size = sizeof(kbuf); + ilen = sizeof(info); + if (ASSERT_OK(bpf_map_get_info_by_fd(fd, &info, &ilen), "get_hash")) { + libbpf_sha256(val, sizeof(val), lbuf); + ASSERT_EQ(memcmp(kbuf, lbuf, sizeof(kbuf)), 0, + "kernel map hash matches libbpf_sha256"); + } + close(fd); +} + +static void map_hash_multi_element(void) +{ + const __u32 nr = 8, value_size = 64; + __u8 kbuf[SHA256_DIGEST_LENGTH], lbuf[SHA256_DIGEST_LENGTH]; + struct bpf_map_info info; + __u32 ilen, i, j; + __u8 *full; + int fd; + + /* + * array_map_get_hash() hashes elem_size * max_entries (the whole value + * area), not just element 0. With an 8-aligned value_size elem_size has + * no padding, so pin that a >1-entry array's kernel hash equals + * libbpf_sha256() over the full, concatenated element contents. 
+ */ + fd = bpf_map_create(BPF_MAP_TYPE_ARRAY, "h", 4, value_size, nr, NULL); + if (!ASSERT_OK_FD(fd, "array_map")) + return; + full = calloc(nr, value_size); + if (!ASSERT_OK_PTR(full, "buf")) + goto close_fd; + for (i = 0; i < nr; i++) { + __u8 *v = full + i * value_size; + + for (j = 0; j < value_size; j++) + v[j] = i * 31 + j * 7 + 1; + ASSERT_OK(bpf_map_update_elem(fd, &i, v, 0), "update"); + } + ASSERT_OK(bpf_map_freeze(fd), "freeze"); + memset(&info, 0, sizeof(info)); + info.hash = ptr_to_u64(kbuf); + info.hash_size = sizeof(kbuf); + ilen = sizeof(info); + if (ASSERT_OK(bpf_map_get_info_by_fd(fd, &info, &ilen), "get_hash")) { + libbpf_sha256(full, (size_t)nr * value_size, lbuf); + ASSERT_EQ(memcmp(kbuf, lbuf, sizeof(kbuf)), 0, + "kernel hash covers full multi-element value area"); + } + free(full); +close_fd: + close(fd); +} + +static void map_hash_bad_size(void) +{ + __u8 kbuf[SHA256_DIGEST_LENGTH], val[64] = {}; + struct bpf_map_info info; + __u32 ilen, key = 0; + int fd; + + fd = bpf_map_create(BPF_MAP_TYPE_ARRAY, "h", 4, sizeof(val), 1, NULL); + if (!ASSERT_OK_FD(fd, "array_map")) + return; + ASSERT_OK(bpf_map_update_elem(fd, &key, val, 0), "update"); + ASSERT_OK(bpf_map_freeze(fd), "freeze"); + memset(&info, 0, sizeof(info)); + info.hash = ptr_to_u64(kbuf); + info.hash_size = sizeof(kbuf) / 2; + ilen = sizeof(info); + ASSERT_EQ(bpf_map_get_info_by_fd(fd, &info, &ilen), -EINVAL, + "wrong hash_size rejected"); + close(fd); +} + +static void map_hash_unsupported_type(void) +{ + __u8 kbuf[SHA256_DIGEST_LENGTH]; + struct bpf_map_info info; + __u32 ilen; + int fd; + + /* Only arrays implement map_get_hash; a hash map must be refused. 
*/ + fd = bpf_map_create(BPF_MAP_TYPE_HASH, "h", 4, 8, 4, NULL); + if (!ASSERT_OK_FD(fd, "hash_map")) + return; + memset(&info, 0, sizeof(info)); + info.hash = ptr_to_u64(kbuf); + info.hash_size = sizeof(kbuf); + ilen = sizeof(info); + ASSERT_EQ(bpf_map_get_info_by_fd(fd, &info, &ilen), -EINVAL, + "hash unsupported for non-array map"); + close(fd); +} + +/* + * Create the one-element "__loader.map" array with f->excl as its exclusive + * program hash, store f->blob in element 0 and freeze the map. Returns the + * map fd on success or a negative errno on failure; the caller owns the fd. + */ +static int setup_meta_map(const struct gen_loader_fixture *f) +{ + LIBBPF_OPTS(bpf_map_create_opts, mopts, + .excl_prog_hash = f->excl, + .excl_prog_hash_size = sizeof(f->excl)); + __u32 key = 0; + int fd, err; + + fd = bpf_map_create(BPF_MAP_TYPE_ARRAY, "__loader.map", 4, + f->data_sz, 1, &mopts); + if (fd < 0) + return -errno; + if (bpf_map_update_elem(fd, &key, f->blob, 0) || bpf_map_freeze(fd)) { + /* save errno before close(), which is allowed to overwrite it */ + err = -errno; + close(fd); + return err; + } + return fd; +} + +static void lsm_signature_verdict(void) +{ + char dir_tmpl[] = "/tmp/signed_loader_lsmXXXXXX", *dir = NULL; + struct test_signed_loader_lsm *lsm = NULL; + int map_fd = -1, prog_fd = -1; + bool have_fixture = false; + struct gen_loader_fixture f; + __u32 sig_sz = 8192; + __s32 ses_serial; + __u8 sig[8192]; + + lsm = test_signed_loader_lsm__open_and_load(); + if (!ASSERT_OK_PTR(lsm, "lsm_skel_load")) + return; + lsm->bss->monitored_tid = sys_gettid(); + if (!ASSERT_OK(test_signed_loader_lsm__attach(lsm), "lsm_attach")) + goto out; + + have_fixture = true; + if (gen_loader_fixture_init(&f) != 0) + goto out; + + map_fd = setup_meta_map(&f); + if (!ASSERT_OK_FD(map_fd, "meta_map_unsigned")) + goto out; + lsm->bss->seen = 0; + prog_fd = load_loader(f.gopts.insns, f.gopts.insns_sz, map_fd, NULL, 0, 0); + close(map_fd); + map_fd = -1; + if (!ASSERT_OK_FD(prog_fd, "unsigned loader load")) + goto out; + close(prog_fd); + prog_fd = -1; + if (!ASSERT_NEQ(lsm->bss->seen, 0, "bpf LSM in the active LSM set")) + goto out; + ASSERT_EQ(lsm->bss->seen, 1, "unsigned: one observed load"); + ASSERT_EQ(lsm->bss->sig_verdict, BPF_SIG_UNSIGNED, "unsigned verdict"); +
ASSERT_EQ(lsm->bss->sig_keyring_type, BPF_SIG_KEYRING_NONE, "unsigned keyring type"); + ASSERT_EQ(lsm->bss->sig_keyring_serial, 0, "unsigned: no keyring serial"); + + syscall(__NR_request_key, "keyring", "_uid.0", NULL, + KEY_SPEC_SESSION_KEYRING); + dir = mkdtemp(dir_tmpl); + if (!ASSERT_OK_PTR(dir, "mkdtemp")) + goto out; + if (!ASSERT_OK(run_setup("setup", dir), "verify_sig_setup")) { + rmdir(dir); + dir = NULL; + goto out; + } + if (!ASSERT_OK(sign_buf(dir, f.gopts.insns, f.gopts.insns_sz, sig, + &sig_sz), "sign-file")) + goto out; + + map_fd = setup_meta_map(&f); + if (!ASSERT_OK_FD(map_fd, "meta_map_signed")) + goto out; + lsm->bss->seen = 0; + prog_fd = load_loader(f.gopts.insns, f.gopts.insns_sz, map_fd, sig, + sig_sz, KEY_SPEC_SESSION_KEYRING); + close(map_fd); + map_fd = -1; + if (!ASSERT_OK_FD(prog_fd, "signed loader load")) + goto out; + close(prog_fd); + prog_fd = -1; + + ses_serial = syscall(__NR_keyctl, KEYCTL_GET_KEYRING_ID, + KEY_SPEC_SESSION_KEYRING, 0); + ASSERT_EQ(lsm->bss->seen, 1, "signed: one observed load"); + ASSERT_EQ(lsm->bss->sig_verdict, BPF_SIG_VERIFIED, "signed verdict"); + ASSERT_EQ(lsm->bss->sig_keyring_type, BPF_SIG_KEYRING_USER, "signed keyring type"); + ASSERT_GT(ses_serial, 0, "session keyring serial resolved"); + ASSERT_EQ(lsm->bss->sig_keyring_serial, ses_serial, + "signed: validated against session keyring"); +out: + if (map_fd >= 0) + close(map_fd); + if (prog_fd >= 0) + close(prog_fd); + if (have_fixture) + gen_loader_fixture_fini(&f); + if (dir) + run_setup("cleanup", dir); + test_signed_loader_lsm__destroy(lsm); +} + +void test_signed_loader(void) +{ + if (test__start_subtest("metadata_check_shape")) + metadata_check_shape(); + if (test__start_subtest("metadata_match")) + metadata_match(); + if (test__start_subtest("metadata_sha_mismatch")) + metadata_sha_mismatch(); + if (test__start_subtest("metadata_not_exclusive")) + metadata_not_exclusive(); + if (test__start_subtest("metadata_hash_not_computed")) + 
metadata_hash_not_computed(); + if (test__start_subtest("signature_enforced")) + signature_enforced(); + if (test__start_subtest("signature_too_large")) + signature_too_large(); + if (test__start_subtest("signature_bad_keyring")) + signature_bad_keyring(); + if (test__start_subtest("metadata_ctx_max_entries_ignored")) + metadata_ctx_max_entries_ignored(); + if (test__start_subtest("metadata_ctx_initial_value_ignored")) + metadata_ctx_initial_value_ignored(); + if (test__start_subtest("signature_authenticates_insns")) + signature_authenticates_insns(); + if (test__start_subtest("hash_requires_frozen")) + hash_requires_frozen(); + if (test__start_subtest("no_update_after_freeze")) + no_update_after_freeze(); + if (test__start_subtest("freeze_writable_mmap")) + freeze_writable_mmap(); + if (test__start_subtest("no_writable_mmap_frozen")) + no_writable_mmap_frozen(); + if (test__start_subtest("map_hash_matches_libbpf")) + map_hash_matches_libbpf(); + if (test__start_subtest("map_hash_multi_element")) + map_hash_multi_element(); + if (test__start_subtest("map_hash_bad_size")) + map_hash_bad_size(); + if (test__start_subtest("map_hash_unsupported_type")) + map_hash_unsupported_type(); + if (test__start_subtest("lsm_signature_verdict")) + lsm_signature_verdict(); +} diff --git a/tools/testing/selftests/bpf/prog_tests/sleepable_tracepoints.c b/tools/testing/selftests/bpf/prog_tests/sleepable_tracepoints.c new file mode 100644 index 000000000000..19500b785ee3 --- /dev/null +++ b/tools/testing/selftests/bpf/prog_tests/sleepable_tracepoints.c @@ -0,0 +1,142 @@ +// SPDX-License-Identifier: GPL-2.0 +/* Copyright (c) 2025 Meta Platforms, Inc. and affiliates. 
*/ + +#include <test_progs.h> +#include <unistd.h> +#include "test_sleepable_tracepoints.skel.h" +#include "test_sleepable_tracepoints_fail.skel.h" + +static void run_test(struct test_sleepable_tracepoints *skel) +{ + char buf[PATH_MAX] = "/"; + + skel->bss->target_pid = getpid(); + skel->bss->prog_triggered = 0; + skel->bss->err = 0; + skel->bss->copied_byte = 0; + + syscall(__NR_getcwd, buf, sizeof(buf)); + + ASSERT_EQ(skel->bss->prog_triggered, 1, "prog_triggered"); + ASSERT_EQ(skel->bss->err, 0, "err"); + ASSERT_EQ(skel->bss->copied_byte, '/', "copied_byte"); +} + +static void run_auto_attach_test(struct bpf_program *prog, + struct test_sleepable_tracepoints *skel) +{ + struct bpf_link *link; + + link = bpf_program__attach(prog); + if (!ASSERT_OK_PTR(link, "prog_attach")) + return; + + run_test(skel); + bpf_link__destroy(link); +} + +static void test_attach_only(struct bpf_program *prog) +{ + struct bpf_link *link; + + link = bpf_program__attach(prog); + if (ASSERT_OK_PTR(link, "attach")) + bpf_link__destroy(link); +} + +static void test_attach_reject(struct bpf_program *prog) +{ + struct bpf_link *link; + + link = bpf_program__attach(prog); + if (!ASSERT_ERR_PTR(link, "attach_should_fail")) + bpf_link__destroy(link); +} + +static void test_raw_tp_bare(struct test_sleepable_tracepoints *skel) +{ + struct bpf_link *link; + + link = bpf_program__attach_raw_tracepoint(skel->progs.handle_raw_tp_bare, + "sys_enter"); + if (ASSERT_OK_PTR(link, "attach")) + bpf_link__destroy(link); +} + +static void test_tp_bare(struct test_sleepable_tracepoints *skel) +{ + struct bpf_link *link; + + link = bpf_program__attach_tracepoint(skel->progs.handle_tp_bare, + "syscalls", "sys_enter_getcwd"); + if (ASSERT_OK_PTR(link, "attach")) + bpf_link__destroy(link); +} + +static void test_test_run(struct test_sleepable_tracepoints *skel) +{ + __u64 args[2] = {0x1234ULL, 0x5678ULL}; + LIBBPF_OPTS(bpf_test_run_opts, topts, + .ctx_in = args, + .ctx_size_in = sizeof(args), + ); + int fd, err; 
+ + fd = bpf_program__fd(skel->progs.handle_test_run); + err = bpf_prog_test_run_opts(fd, &topts); + ASSERT_OK(err, "test_run"); + ASSERT_EQ(topts.retval, args[0] + args[1], "test_run_retval"); +} + +static void test_test_run_on_cpu_reject(struct test_sleepable_tracepoints *skel) +{ + __u64 args[2] = {}; + LIBBPF_OPTS(bpf_test_run_opts, topts, + .ctx_in = args, + .ctx_size_in = sizeof(args), + .flags = BPF_F_TEST_RUN_ON_CPU, + ); + int fd, err; + + fd = bpf_program__fd(skel->progs.handle_test_run); + err = bpf_prog_test_run_opts(fd, &topts); + ASSERT_ERR(err, "test_run_on_cpu_reject"); +} + +void test_sleepable_tracepoints(void) +{ + struct test_sleepable_tracepoints *skel; + + skel = test_sleepable_tracepoints__open_and_load(); + if (!ASSERT_OK_PTR(skel, "open_and_load")) + return; + + if (test__start_subtest("tp_btf")) + run_auto_attach_test(skel->progs.handle_sys_enter_tp_btf, skel); + if (test__start_subtest("raw_tp")) + run_auto_attach_test(skel->progs.handle_sys_enter_raw_tp, skel); + if (test__start_subtest("tracepoint")) + run_auto_attach_test(skel->progs.handle_sys_enter_tp, skel); + if (test__start_subtest("sys_exit")) + run_auto_attach_test(skel->progs.handle_sys_exit_tp, skel); + if (test__start_subtest("tracepoint_alias")) + test_attach_only(skel->progs.handle_sys_enter_tp_alias); + if (test__start_subtest("raw_tracepoint_alias")) + test_attach_only(skel->progs.handle_sys_enter_raw_tp_alias); + if (test__start_subtest("raw_tp_bare")) + test_raw_tp_bare(skel); + if (test__start_subtest("tp_bare")) + test_tp_bare(skel); + if (test__start_subtest("test_run")) + test_test_run(skel); + if (test__start_subtest("test_run_on_cpu_reject")) + test_test_run_on_cpu_reject(skel); + if (test__start_subtest("raw_tp_non_faultable")) + test_attach_reject(skel->progs.handle_raw_tp_non_faultable); + if (test__start_subtest("tp_non_syscall")) + test_attach_reject(skel->progs.handle_tp_non_syscall); + if (test__start_subtest("tp_btf_non_faultable_reject")) + 
RUN_TESTS(test_sleepable_tracepoints_fail); + + test_sleepable_tracepoints__destroy(skel); +} diff --git a/tools/testing/selftests/bpf/prog_tests/sockmap_basic.c b/tools/testing/selftests/bpf/prog_tests/sockmap_basic.c index d2846579285f..cb3229711f93 100644 --- a/tools/testing/selftests/bpf/prog_tests/sockmap_basic.c +++ b/tools/testing/selftests/bpf/prog_tests/sockmap_basic.c @@ -14,6 +14,7 @@ #include "test_sockmap_pass_prog.skel.h" #include "test_sockmap_drop_prog.skel.h" #include "test_sockmap_change_tail.skel.h" +#include "test_sockmap_msg_pop_data.skel.h" #include "bpf_iter_sockmap.skel.h" #include "sockmap_helpers.h" @@ -666,6 +667,51 @@ out: test_sockmap_change_tail__destroy(skel); } +static void test_sockmap_msg_verdict_pop_data(void) +{ + struct test_sockmap_msg_pop_data *skel; + int err, map, verdict; + int c1 = -1, p1 = -1, sent; + int zero = 0; + char *buf; + const size_t len = 32 * 1024; + + skel = test_sockmap_msg_pop_data__open_and_load(); + if (!ASSERT_OK_PTR(skel, "open_and_load")) + return; + + verdict = bpf_program__fd(skel->progs.prog_msg_pop_data); + map = bpf_map__fd(skel->maps.sock_map); + + err = bpf_prog_attach(verdict, map, BPF_SK_MSG_VERDICT, 0); + if (!ASSERT_OK(err, "bpf_prog_attach")) + goto out; + + err = create_pair(AF_INET, SOCK_STREAM, &c1, &p1); + if (!ASSERT_OK(err, "create_pair")) + goto out; + + err = bpf_map_update_elem(map, &zero, &c1, BPF_NOEXIST); + if (!ASSERT_OK(err, "bpf_map_update_elem")) + goto out_close; + + buf = calloc(len, 1); + if (!ASSERT_OK_PTR(buf, "calloc")) + goto out_close; + + sent = xsend(c1, buf, len, 0); + ASSERT_EQ(sent, (ssize_t)len, "xsend"); + ASSERT_EQ(skel->data->pop_data_ret, -EINVAL, "pop_data_rejects overflow"); + + free(buf); + +out_close: + close(c1); + close(p1); +out: + test_sockmap_msg_pop_data__destroy(skel); +} + static void test_sockmap_skb_verdict_peek_helper(int map) { int err, c1, p1, zero = 0, sent, recvd, avail; @@ -1373,6 +1419,8 @@ void test_sockmap_basic(void) 
test_sockmap_skb_verdict_fionread(false); if (test__start_subtest("sockmap skb_verdict change tail")) test_sockmap_skb_verdict_change_tail(); + if (test__start_subtest("sockmap msg_verdict pop_data overflow")) + test_sockmap_msg_verdict_pop_data(); if (test__start_subtest("sockmap skb_verdict msg_f_peek")) test_sockmap_skb_verdict_peek(); if (test__start_subtest("sockmap skb_verdict msg_f_peek with link")) diff --git a/tools/testing/selftests/bpf/prog_tests/sockmap_ktls.c b/tools/testing/selftests/bpf/prog_tests/sockmap_ktls.c index b87e7f39e15a..34737e8df6ea 100644 --- a/tools/testing/selftests/bpf/prog_tests/sockmap_ktls.c +++ b/tools/testing/selftests/bpf/prog_tests/sockmap_ktls.c @@ -9,7 +9,6 @@ #include "test_progs.h" #include "sockmap_helpers.h" #include "test_skmsg_load_helpers.skel.h" -#include "test_sockmap_ktls.skel.h" #define MAX_TEST_NAME 80 #define TCP_ULP 31 @@ -117,6 +116,68 @@ close: close(s); } +static void test_sockmap_ktls_enable_fails_when_in_sockmap(int family, int map) +{ + struct tls12_crypto_info_aes_gcm_128 crypto = { + .info = { + .version = TLS_1_2_VERSION, + .cipher_type = TLS_CIPHER_AES_GCM_128, + }, + }; + struct sockaddr_storage addr = {}; + socklen_t len = sizeof(addr); + struct sockaddr_in6 *v6; + struct sockaddr_in *v4; + int err, s, zero = 0; + + switch (family) { + case AF_INET: + v4 = (struct sockaddr_in *)&addr; + v4->sin_family = AF_INET; + break; + case AF_INET6: + v6 = (struct sockaddr_in6 *)&addr; + v6->sin6_family = AF_INET6; + break; + default: + PRINT_FAIL("unsupported socket family %d", family); + return; + } + + s = socket(family, SOCK_STREAM, 0); + if (!ASSERT_GE(s, 0, "socket")) + return; + + err = bind(s, (struct sockaddr *)&addr, len); + if (!ASSERT_OK(err, "bind")) + goto close; + + err = getsockname(s, (struct sockaddr *)&addr, &len); + if (!ASSERT_OK(err, "getsockname")) + goto close; + + err = connect(s, (struct sockaddr *)&addr, len); + if (!ASSERT_OK(err, "connect")) + goto close; + + /* Add the socket to the 
sockmap, attaching a psock. */ + err = bpf_map_update_elem(map, &zero, &s, BPF_ANY); + if (!ASSERT_OK(err, "sockmap update elem")) + goto close; + + /* Installing the TLS ULP is allowed, it does not touch the datapath. */ + err = setsockopt(s, IPPROTO_TCP, TCP_ULP, "tls", strlen("tls")); + if (!ASSERT_OK(err, "setsockopt(TCP_ULP)")) + goto close; + + /* Enabling the TLS crypto datapath must be rejected. */ + err = setsockopt(s, SOL_TLS, TLS_TX, &crypto, sizeof(crypto)); + ASSERT_ERR(err, "setsockopt(TLS_TX)"); + +close: + close(s); +} + static const char *fmt_test_name(const char *subtest_name, int family, enum bpf_map_type map_type) { @@ -160,249 +221,6 @@ out: close(p); } -static void test_sockmap_ktls_tx_cork(int family, int sotype, bool push) -{ - int err, off; - int i, j; - int start_push = 0, push_len = 0; - int c = 0, p = 0, one = 1, sent, recvd; - int prog_fd, map_fd; - char msg[12] = "hello world\0"; - char rcv[20] = {0}; - struct test_sockmap_ktls *skel; - - skel = test_sockmap_ktls__open_and_load(); - if (!ASSERT_TRUE(skel, "open ktls skel")) - return; - - err = create_pair(family, sotype, &c, &p); - if (!ASSERT_OK(err, "create_pair()")) - goto out; - - prog_fd = bpf_program__fd(skel->progs.prog_sk_policy); - map_fd = bpf_map__fd(skel->maps.sock_map); - - err = bpf_prog_attach(prog_fd, map_fd, BPF_SK_MSG_VERDICT, 0); - if (!ASSERT_OK(err, "bpf_prog_attach sk msg")) - goto out; - - err = bpf_map_update_elem(map_fd, &one, &c, BPF_NOEXIST); - if (!ASSERT_OK(err, "bpf_map_update_elem(c)")) - goto out; - - err = init_ktls_pairs(c, p); - if (!ASSERT_OK(err, "init_ktls_pairs(c, p)")) - goto out; - - skel->bss->cork_byte = sizeof(msg); - if (push) { - start_push = 1; - push_len = 2; - } - skel->bss->push_start = start_push; - skel->bss->push_end = push_len; - - off = sizeof(msg) / 2; - sent = send(c, msg, off, 0); - if (!ASSERT_EQ(sent, off, "send(msg)")) - goto out; - - recvd = recv_timeout(p, rcv, sizeof(rcv), MSG_DONTWAIT, 1); - if (!ASSERT_EQ(-1, recvd, 
"expected no data")) - goto out; - - /* send remaining msg */ - sent = send(c, msg + off, sizeof(msg) - off, 0); - if (!ASSERT_EQ(sent, sizeof(msg) - off, "send remaining data")) - goto out; - - recvd = recv_timeout(p, rcv, sizeof(rcv), MSG_DONTWAIT, 1); - if (!ASSERT_OK(err, "recv(msg)") || - !ASSERT_EQ(recvd, sizeof(msg) + push_len, "check length mismatch")) - goto out; - - for (i = 0, j = 0; i < recvd;) { - /* skip checking the data that has been pushed in */ - if (i >= start_push && i <= start_push + push_len - 1) { - i++; - continue; - } - if (!ASSERT_EQ(rcv[i], msg[j], "data mismatch")) - goto out; - i++; - j++; - } -out: - if (c) - close(c); - if (p) - close(p); - test_sockmap_ktls__destroy(skel); -} - -static void test_sockmap_ktls_tx_no_buf(int family, int sotype, bool push) -{ - int c = -1, p = -1, one = 1, two = 2; - struct test_sockmap_ktls *skel; - unsigned char *data = NULL; - struct msghdr msg = {0}; - struct iovec iov[2]; - int prog_fd, map_fd; - int txrx_buf = 1024; - int iov_length = 8192; - int err; - - skel = test_sockmap_ktls__open_and_load(); - if (!ASSERT_TRUE(skel, "open ktls skel")) - return; - - err = create_pair(family, sotype, &c, &p); - if (!ASSERT_OK(err, "create_pair()")) - goto out; - - err = setsockopt(c, SOL_SOCKET, SO_RCVBUFFORCE, &txrx_buf, sizeof(int)); - err |= setsockopt(p, SOL_SOCKET, SO_SNDBUFFORCE, &txrx_buf, sizeof(int)); - if (!ASSERT_OK(err, "set buf limit")) - goto out; - - prog_fd = bpf_program__fd(skel->progs.prog_sk_policy_redir); - map_fd = bpf_map__fd(skel->maps.sock_map); - - err = bpf_prog_attach(prog_fd, map_fd, BPF_SK_MSG_VERDICT, 0); - if (!ASSERT_OK(err, "bpf_prog_attach sk msg")) - goto out; - - err = bpf_map_update_elem(map_fd, &one, &c, BPF_NOEXIST); - if (!ASSERT_OK(err, "bpf_map_update_elem(c)")) - goto out; - - err = bpf_map_update_elem(map_fd, &two, &p, BPF_NOEXIST); - if (!ASSERT_OK(err, "bpf_map_update_elem(p)")) - goto out; - - skel->bss->apply_bytes = 1024; - - err = init_ktls_pairs(c, p); - if 
(!ASSERT_OK(err, "init_ktls_pairs(c, p)")) - goto out; - - data = calloc(iov_length, sizeof(char)); - if (!data) - goto out; - - iov[0].iov_base = data; - iov[0].iov_len = iov_length; - iov[1].iov_base = data; - iov[1].iov_len = iov_length; - msg.msg_iov = iov; - msg.msg_iovlen = 2; - - for (;;) { - err = sendmsg(c, &msg, MSG_DONTWAIT); - if (err <= 0) - break; - } - -out: - if (data) - free(data); - if (c != -1) - close(c); - if (p != -1) - close(p); - - test_sockmap_ktls__destroy(skel); -} - -static void test_sockmap_ktls_tx_pop(int family, int sotype) -{ - char msg[37] = "0123456789abcdefghijklmnopqrstuvwxyz\0"; - int c = 0, p = 0, one = 1, sent, recvd; - struct test_sockmap_ktls *skel; - int prog_fd, map_fd; - char rcv[50] = {0}; - int err; - int i, m, r; - - skel = test_sockmap_ktls__open_and_load(); - if (!ASSERT_TRUE(skel, "open ktls skel")) - return; - - err = create_pair(family, sotype, &c, &p); - if (!ASSERT_OK(err, "create_pair()")) - goto out; - - prog_fd = bpf_program__fd(skel->progs.prog_sk_policy); - map_fd = bpf_map__fd(skel->maps.sock_map); - - err = bpf_prog_attach(prog_fd, map_fd, BPF_SK_MSG_VERDICT, 0); - if (!ASSERT_OK(err, "bpf_prog_attach sk msg")) - goto out; - - err = bpf_map_update_elem(map_fd, &one, &c, BPF_NOEXIST); - if (!ASSERT_OK(err, "bpf_map_update_elem(c)")) - goto out; - - err = init_ktls_pairs(c, p); - if (!ASSERT_OK(err, "init_ktls_pairs(c, p)")) - goto out; - - struct { - int pop_start; - int pop_len; - } pop_policy[] = { - /* trim the start */ - {0, 2}, - {0, 10}, - {1, 2}, - {1, 10}, - /* trim the end */ - {35, 2}, - /* New entries should be added before this line */ - {-1, -1}, - }; - - i = 0; - while (pop_policy[i].pop_start >= 0) { - skel->bss->pop_start = pop_policy[i].pop_start; - skel->bss->pop_end = pop_policy[i].pop_len; - - sent = send(c, msg, sizeof(msg), 0); - if (!ASSERT_EQ(sent, sizeof(msg), "send(msg)")) - goto out; - - recvd = recv_timeout(p, rcv, sizeof(rcv), MSG_DONTWAIT, 1); - if (!ASSERT_EQ(recvd, 
sizeof(msg) - pop_policy[i].pop_len, "pop len mismatch")) - goto out; - - /* verify the data - * msg: 0123456789a bcdefghij klmnopqrstuvwxyz - * | | - * popped data - */ - for (m = 0, r = 0; m < sizeof(msg);) { - /* skip checking the data that has been popped */ - if (m >= pop_policy[i].pop_start && - m <= pop_policy[i].pop_start + pop_policy[i].pop_len - 1) { - m++; - continue; - } - - if (!ASSERT_EQ(msg[m], rcv[r], "data mismatch")) - goto out; - m++; - r++; - } - i++; - } -out: - if (c) - close(c); - if (p) - close(p); - test_sockmap_ktls__destroy(skel); -} - static void run_tests(int family, enum bpf_map_type map_type) { int map; @@ -414,6 +232,9 @@ static void run_tests(int family, enum bpf_map_type map_type) if (test__start_subtest(fmt_test_name("update_fails_when_sock_has_ulp", family, map_type))) test_sockmap_ktls_update_fails_when_sock_has_ulp(family, map); + if (test__start_subtest(fmt_test_name("enable_fails_when_in_sockmap", family, map_type))) + test_sockmap_ktls_enable_fails_when_in_sockmap(family, map); + close(map); } @@ -421,14 +242,6 @@ static void run_ktls_test(int family, int sotype) { if (test__start_subtest("tls simple offload")) test_sockmap_ktls_offload(family, sotype); - if (test__start_subtest("tls tx cork")) - test_sockmap_ktls_tx_cork(family, sotype, false); - if (test__start_subtest("tls tx cork with push")) - test_sockmap_ktls_tx_cork(family, sotype, true); - if (test__start_subtest("tls tx egress with no buf")) - test_sockmap_ktls_tx_no_buf(family, sotype, true); - if (test__start_subtest("tls tx with pop")) - test_sockmap_ktls_tx_pop(family, sotype); } void test_sockmap_ktls(void) diff --git a/tools/testing/selftests/bpf/prog_tests/sockmap_strp.c b/tools/testing/selftests/bpf/prog_tests/sockmap_strp.c index 621b3b71888e..1d7231728eaf 100644 --- a/tools/testing/selftests/bpf/prog_tests/sockmap_strp.c +++ b/tools/testing/selftests/bpf/prog_tests/sockmap_strp.c @@ -431,6 +431,35 @@ out: test_sockmap_strp__destroy(strp); } +static void 
test_sockmap_strp_parser_reject(void) +{ + struct test_sockmap_strp *strp = NULL; + int parser_mod, parser_ro, link; + int err, map; + + strp = test_sockmap_strp__open_and_load(); + if (!ASSERT_OK_PTR(strp, "test_sockmap_strp__open_and_load")) + return; + + map = bpf_map__fd(strp->maps.sock_map); + parser_mod = bpf_program__fd(strp->progs.prog_skb_parser_resize); + parser_ro = bpf_program__fd(strp->progs.prog_skb_parser); + + err = bpf_prog_attach(parser_mod, map, BPF_SK_SKB_STREAM_PARSER, 0); + ASSERT_ERR(err, "bpf_prog_attach parser_mod"); + + link = bpf_link_create(parser_ro, map, BPF_SK_SKB_STREAM_PARSER, NULL); + if (!ASSERT_GE(link, 0, "bpf_link_create parser_ro")) + goto out; + + err = bpf_link_update(link, parser_mod, NULL); + ASSERT_ERR(err, "bpf_link_update parser_mod"); +out: + if (link >= 0) + close(link); + test_sockmap_strp__destroy(strp); +} + void test_sockmap_strp(void) { if (test__start_subtest("sockmap strp tcp pass")) @@ -451,4 +480,6 @@ void test_sockmap_strp(void) test_sockmap_strp_multiple_pkt(AF_INET, SOCK_STREAM); if (test__start_subtest("sockmap strp tcp dispatch")) test_sockmap_strp_dispatch_pkt(AF_INET, SOCK_STREAM); + if (test__start_subtest("sockmap strp parser reject pkt mod")) + test_sockmap_strp_parser_reject(); } diff --git a/tools/testing/selftests/bpf/prog_tests/sockopt_sk.c b/tools/testing/selftests/bpf/prog_tests/sockopt_sk.c index 53637431ec5d..3a41c517b918 100644 --- a/tools/testing/selftests/bpf/prog_tests/sockopt_sk.c +++ b/tools/testing/selftests/bpf/prog_tests/sockopt_sk.c @@ -190,7 +190,7 @@ static int getsetsockopt(void) fd = socket(AF_NETLINK, SOCK_RAW, 0); if (fd < 0) { log_err("Failed to create AF_NETLINK socket"); - return -1; + goto err; } buf.u32 = 1; @@ -211,6 +211,21 @@ static int getsetsockopt(void) } ASSERT_EQ(optlen, 8, "Unexpected NETLINK_LIST_MEMBERSHIPS value"); + /* Trick bpf_tcp_sock() with IPPROTO_TCP */ + close(fd); + fd = socket(AF_INET, SOCK_RAW, IPPROTO_TCP); + if (!ASSERT_OK_FD(fd, "socket")) + 
goto err; + + /* The BPF prog intercepts this before the kernel sees it, any + * optlen works. Go with 4 bytes for simplicity. + */ + buf.u32 = 1; + optlen = sizeof(buf.u32); + err = setsockopt(fd, SOL_TCP, TCP_SAVED_SYN, &buf, optlen); + if (!ASSERT_ERR(err, "setsockopt(TCP_SAVED_SYN)")) + goto err; + free(big_buf); close(fd); return 0; diff --git a/tools/testing/selftests/bpf/prog_tests/spin_lock.c b/tools/testing/selftests/bpf/prog_tests/spin_lock.c index bbe476f4c47d..5c3579438427 100644 --- a/tools/testing/selftests/bpf/prog_tests/spin_lock.c +++ b/tools/testing/selftests/bpf/prog_tests/spin_lock.c @@ -13,8 +13,8 @@ static struct { const char *err_msg; } spin_lock_fail_tests[] = { { "lock_id_kptr_preserve", - "[0-9]\\+: (bf) r1 = r0 ; R0=ptr_foo(id=2,ref_obj_id=2)" - " R1=ptr_foo(id=2,ref_obj_id=2) refs=2\n" + "[0-9]\\+: (bf) r1 = r0 ; R0=ptr_foo(id=2)" + " R1=ptr_foo(id=2) refs=2\n" "[0-9]\\+: (85) call bpf_this_cpu_ptr#154\n" "R1 type=ptr_ expected=percpu_ptr_" }, { "lock_id_global_zero", diff --git a/tools/testing/selftests/bpf/prog_tests/stack_arg.c b/tools/testing/selftests/bpf/prog_tests/stack_arg.c new file mode 100644 index 000000000000..57193543f260 --- /dev/null +++ b/tools/testing/selftests/bpf/prog_tests/stack_arg.c @@ -0,0 +1,139 @@ +// SPDX-License-Identifier: GPL-2.0 +/* Copyright (c) 2026 Meta Platforms, Inc. and affiliates. 
*/ + +#include <test_progs.h> +#include <network_helpers.h> +#include "stack_arg.skel.h" +#include "stack_arg_kfunc.skel.h" + +static void run_subtest(struct bpf_program *prog, int expected) +{ + int err, prog_fd; + LIBBPF_OPTS(bpf_test_run_opts, topts, + .data_in = &pkt_v4, + .data_size_in = sizeof(pkt_v4), + .repeat = 1, + ); + + prog_fd = bpf_program__fd(prog); + err = bpf_prog_test_run_opts(prog_fd, &topts); + ASSERT_OK(err, "test_run"); + ASSERT_EQ(topts.retval, expected, "retval"); +} + +static void test_global_many(void) +{ + struct stack_arg *skel; + + skel = stack_arg__open(); + if (!ASSERT_OK_PTR(skel, "open")) + return; + + if (!skel->rodata->has_stack_arg) { + test__skip(); + goto out; + } + + if (!ASSERT_OK(stack_arg__load(skel), "load")) + goto out; + + run_subtest(skel->progs.test_global_many_args, 55); + +out: + stack_arg__destroy(skel); +} + +static void test_async_cb_many(void) +{ + struct stack_arg *skel; + + skel = stack_arg__open(); + if (!ASSERT_OK_PTR(skel, "open")) + return; + + if (!skel->rodata->has_stack_arg) { + test__skip(); + goto out; + } + + if (!ASSERT_OK(stack_arg__load(skel), "load")) + goto out; + + run_subtest(skel->progs.test_async_cb_many_args, 0); + + /* Wait for the timer callback to fire and verify the result. 
+ * 10+20+30+40+50+60+70+80+90+100 = 550 + */ + usleep(50); + ASSERT_EQ(skel->bss->timer_result, 550, "timer_result"); + +out: + stack_arg__destroy(skel); +} + +static void test_bpf2bpf(void) +{ + struct stack_arg *skel; + + skel = stack_arg__open(); + if (!ASSERT_OK_PTR(skel, "open")) + return; + + if (!skel->rodata->has_stack_arg) { + test__skip(); + goto out; + } + + if (!ASSERT_OK(stack_arg__load(skel), "load")) + goto out; + + run_subtest(skel->progs.test_bpf2bpf_ptr_stack_arg, 75); + run_subtest(skel->progs.test_bpf2bpf_mix_stack_args, 66); + run_subtest(skel->progs.test_bpf2bpf_nesting_stack_arg, 84); + run_subtest(skel->progs.test_bpf2bpf_dynptr_stack_arg, 99); + run_subtest(skel->progs.test_two_callees, 133); + +out: + stack_arg__destroy(skel); +} + +static void test_kfunc(void) +{ + struct stack_arg_kfunc *skel; + + skel = stack_arg_kfunc__open(); + if (!ASSERT_OK_PTR(skel, "open")) + return; + + if (!skel->rodata->has_stack_arg) { + test__skip(); + goto out; + } + + if (!ASSERT_OK(stack_arg_kfunc__load(skel), "load")) + goto out; + + run_subtest(skel->progs.test_stack_arg_scalar, 55); + run_subtest(skel->progs.test_stack_arg_ptr, 75); + run_subtest(skel->progs.test_stack_arg_mix, 66); + run_subtest(skel->progs.test_stack_arg_dynptr, 99); + run_subtest(skel->progs.test_stack_arg_mem, 151); + run_subtest(skel->progs.test_stack_arg_iter, 145); + run_subtest(skel->progs.test_stack_arg_const_str, 45); + run_subtest(skel->progs.test_stack_arg_timer, 45); + +out: + stack_arg_kfunc__destroy(skel); +} + +void test_stack_arg(void) +{ + if (test__start_subtest("global_many_args")) + test_global_many(); + if (test__start_subtest("async_cb_many_args")) + test_async_cb_many(); + if (test__start_subtest("bpf2bpf")) + test_bpf2bpf(); + if (test__start_subtest("kfunc")) + test_kfunc(); +} diff --git a/tools/testing/selftests/bpf/prog_tests/stack_arg_fail.c b/tools/testing/selftests/bpf/prog_tests/stack_arg_fail.c new file mode 100644 index 000000000000..090af1330953 --- 
/dev/null +++ b/tools/testing/selftests/bpf/prog_tests/stack_arg_fail.c @@ -0,0 +1,10 @@ +// SPDX-License-Identifier: GPL-2.0 +/* Copyright (c) 2026 Meta Platforms, Inc. and affiliates. */ + +#include <test_progs.h> +#include "stack_arg_fail.skel.h" + +void test_stack_arg_fail(void) +{ + RUN_TESTS(stack_arg_fail); +} diff --git a/tools/testing/selftests/bpf/prog_tests/stack_arg_precision.c b/tools/testing/selftests/bpf/prog_tests/stack_arg_precision.c new file mode 100644 index 000000000000..1ab041d66de3 --- /dev/null +++ b/tools/testing/selftests/bpf/prog_tests/stack_arg_precision.c @@ -0,0 +1,10 @@ +// SPDX-License-Identifier: GPL-2.0 +/* Copyright (c) 2026 Meta Platforms, Inc. and affiliates. */ + +#include <test_progs.h> +#include "stack_arg_precision.skel.h" + +void test_stack_arg_precision(void) +{ + RUN_TESTS(stack_arg_precision); +} diff --git a/tools/testing/selftests/bpf/prog_tests/tailcalls.c b/tools/testing/selftests/bpf/prog_tests/tailcalls.c index 7d534fde0af9..a5a226d0104c 100644 --- a/tools/testing/selftests/bpf/prog_tests/tailcalls.c +++ b/tools/testing/selftests/bpf/prog_tests/tailcalls.c @@ -8,6 +8,9 @@ #include "tailcall_freplace.skel.h" #include "tc_bpf2bpf.skel.h" #include "tailcall_fail.skel.h" +#include "tailcall_cgrp_storage_owner.skel.h" +#include "tailcall_cgrp_storage_no_storage.skel.h" +#include "tailcall_cgrp_storage.skel.h" #include "tailcall_sleepable.skel.h" /* test_tailcall_1 checks basic functionality by patching multiple locations @@ -1654,6 +1657,179 @@ static void test_tailcall_failure() RUN_TESTS(tailcall_fail); } +static void test_tailcall_cgrp_storage(void) +{ + struct tailcall_cgrp_storage_owner *owner_skel = NULL; + struct tailcall_cgrp_storage *skel = NULL; + int err, key = 0, prog_array_fd, prog_fd, storage_map_fd; + + owner_skel = tailcall_cgrp_storage_owner__open_and_load(); + if (!ASSERT_OK_PTR(owner_skel, "owner_open_and_load")) + return; + + prog_array_fd = bpf_map__fd(owner_skel->maps.prog_array); + storage_map_fd 
= bpf_map__fd(owner_skel->maps.storage_map); + + skel = tailcall_cgrp_storage__open(); + if (!ASSERT_OK_PTR(skel, "tailcall_cgrp_storage__open")) + goto out; + + err = bpf_map__reuse_fd(skel->maps.prog_array, prog_array_fd); + if (!ASSERT_OK(err, "reuse_prog_array")) + goto out; + + err = bpf_map__reuse_fd(skel->maps.storage_map, storage_map_fd); + if (!ASSERT_OK(err, "reuse_storage_map")) + goto out; + + err = bpf_object__load(skel->obj); + if (!ASSERT_OK(err, "tailcall_cgrp_storage__load")) + goto out; + + prog_fd = bpf_program__fd(skel->progs.callee_prog); + err = bpf_map_update_elem(prog_array_fd, &key, &prog_fd, BPF_ANY); + ASSERT_OK(err, "update_prog_array"); +out: + tailcall_cgrp_storage__destroy(skel); + tailcall_cgrp_storage_owner__destroy(owner_skel); +} + +static void test_tailcall_cgrp_storage_diff_storage(void) +{ + struct tailcall_cgrp_storage_owner *owner_skel = NULL; + struct tailcall_cgrp_storage *skel = NULL; + int err, prog_array_fd; + + owner_skel = tailcall_cgrp_storage_owner__open_and_load(); + if (!ASSERT_OK_PTR(owner_skel, "owner_open_and_load")) + return; + + prog_array_fd = bpf_map__fd(owner_skel->maps.prog_array); + + skel = tailcall_cgrp_storage__open(); + if (!ASSERT_OK_PTR(skel, "tailcall_cgrp_storage__open")) + goto out; + + err = bpf_map__reuse_fd(skel->maps.prog_array, prog_array_fd); + if (!ASSERT_OK(err, "reuse_prog_array")) + goto out; + + err = bpf_object__load(skel->obj); + ASSERT_ERR(err, "tailcall_cgrp_storage__load"); +out: + tailcall_cgrp_storage__destroy(skel); + tailcall_cgrp_storage_owner__destroy(owner_skel); +} + +static void test_tailcall_cgrp_storage_no_storage(void) +{ + struct tailcall_cgrp_storage_owner *owner_skel = NULL; + struct tailcall_cgrp_storage_no_storage *skel = NULL; + int err, prog_array_fd; + + owner_skel = tailcall_cgrp_storage_owner__open_and_load(); + if (!ASSERT_OK_PTR(owner_skel, "owner_open_and_load")) + return; + + prog_array_fd = bpf_map__fd(owner_skel->maps.prog_array); + + skel = 
tailcall_cgrp_storage_no_storage__open(); + if (!ASSERT_OK_PTR(skel, "tailcall_cgrp_storage_no_storage__open")) + goto out; + + err = bpf_map__reuse_fd(skel->maps.prog_array, prog_array_fd); + if (!ASSERT_OK(err, "reuse_prog_array")) + goto out; + + err = bpf_object__load(skel->obj); + ASSERT_ERR(err, "tailcall_cgrp_storage_no_storage__load"); +out: + tailcall_cgrp_storage_no_storage__destroy(skel); + tailcall_cgrp_storage_owner__destroy(owner_skel); +} + +static void test_tailcall_cgrp_storage_no_storage_leaf(void) +{ + struct tailcall_cgrp_storage_owner *owner_skel = NULL; + struct tailcall_cgrp_storage_no_storage *skel = NULL; + int err, key = 0, prog_array_fd, prog_fd; + + owner_skel = tailcall_cgrp_storage_owner__open_and_load(); + if (!ASSERT_OK_PTR(owner_skel, "owner_open_and_load")) + return; + + prog_array_fd = bpf_map__fd(owner_skel->maps.prog_array); + + skel = tailcall_cgrp_storage_no_storage__open_and_load(); + if (!ASSERT_OK_PTR(skel, "tailcall_cgrp_storage_no_storage__open_and_load")) + goto out; + + prog_fd = bpf_program__fd(skel->progs.leaf_prog); + err = bpf_map_update_elem(prog_array_fd, &key, &prog_fd, BPF_ANY); + if (!ASSERT_OK(err, "update_prog_array_leaf")) + goto out; + + prog_fd = bpf_program__fd(skel->progs.caller_prog); + err = bpf_map_update_elem(prog_array_fd, &key, &prog_fd, BPF_ANY); + ASSERT_ERR(err, "update_prog_array_bridge"); +out: + tailcall_cgrp_storage_no_storage__destroy(skel); + tailcall_cgrp_storage_owner__destroy(owner_skel); +} + +static void test_tailcall_cgrp_storage_no_storage_bridge(void) +{ + struct tailcall_cgrp_storage_owner *owner_skel = NULL; + struct tailcall_cgrp_storage_no_storage *bridge_skel = NULL; + struct tailcall_cgrp_storage *callee_skel = NULL; + int err, key = 0, prog_array_fd, prog_fd, storage_map_fd; + + owner_skel = tailcall_cgrp_storage_owner__open_and_load(); + if (!ASSERT_OK_PTR(owner_skel, "owner_open_and_load")) + return; + + prog_array_fd = bpf_map__fd(owner_skel->maps.prog_array); + 
storage_map_fd = bpf_map__fd(owner_skel->maps.storage_map); + + callee_skel = tailcall_cgrp_storage__open(); + if (!ASSERT_OK_PTR(callee_skel, "tailcall_cgrp_storage__open")) + goto out; + + bpf_program__set_autoload(callee_skel->progs.caller_prog, false); + + err = bpf_map__reuse_fd(callee_skel->maps.prog_array, prog_array_fd); + if (!ASSERT_OK(err, "reuse_prog_array")) + goto out; + + err = bpf_map__reuse_fd(callee_skel->maps.storage_map, storage_map_fd); + if (!ASSERT_OK(err, "reuse_storage_map")) + goto out; + + err = bpf_object__load(callee_skel->obj); + if (!ASSERT_OK(err, "tailcall_cgrp_storage__load")) + goto out; + + prog_fd = bpf_program__fd(callee_skel->progs.callee_prog); + err = bpf_map_update_elem(prog_array_fd, &key, &prog_fd, BPF_ANY); + if (!ASSERT_OK(err, "update_prog_array")) + goto out; + + bridge_skel = tailcall_cgrp_storage_no_storage__open(); + if (!ASSERT_OK_PTR(bridge_skel, "tailcall_cgrp_storage_no_storage__open")) + goto out; + + err = bpf_map__reuse_fd(bridge_skel->maps.prog_array, prog_array_fd); + if (!ASSERT_OK(err, "reuse_prog_array")) + goto out; + + err = bpf_object__load(bridge_skel->obj); + ASSERT_ERR(err, "tailcall_cgrp_storage_no_storage_bridge__load"); +out: + tailcall_cgrp_storage_no_storage__destroy(bridge_skel); + tailcall_cgrp_storage__destroy(callee_skel); + tailcall_cgrp_storage_owner__destroy(owner_skel); +} + noinline void uprobe_sleepable_trigger(void) { asm volatile (""); @@ -1781,4 +1957,14 @@ void test_tailcalls(void) test_tailcall_failure(); if (test__start_subtest("tailcall_sleepable")) test_tailcall_sleepable(); + if (test__start_subtest("tailcall_cgrp_storage")) + test_tailcall_cgrp_storage(); + if (test__start_subtest("tailcall_cgrp_storage_diff_storage")) + test_tailcall_cgrp_storage_diff_storage(); + if (test__start_subtest("tailcall_cgrp_storage_no_storage")) + test_tailcall_cgrp_storage_no_storage(); + if (test__start_subtest("tailcall_cgrp_storage_no_storage_leaf")) + 
test_tailcall_cgrp_storage_no_storage_leaf(); + if (test__start_subtest("tailcall_cgrp_storage_no_storage_bridge")) + test_tailcall_cgrp_storage_no_storage_bridge(); } diff --git a/tools/testing/selftests/bpf/prog_tests/task_kfunc.c b/tools/testing/selftests/bpf/prog_tests/task_kfunc.c index 83b90335967a..e6e95c1416e6 100644 --- a/tools/testing/selftests/bpf/prog_tests/task_kfunc.c +++ b/tools/testing/selftests/bpf/prog_tests/task_kfunc.c @@ -68,6 +68,36 @@ cleanup: task_kfunc_success__destroy(skel); } +static void run_syscall_success_test(const char *prog_name) +{ + LIBBPF_OPTS(bpf_test_run_opts, opts); + struct task_kfunc_success *skel; + struct bpf_program *prog; + int err; + + skel = open_load_task_kfunc_skel(); + if (!ASSERT_OK_PTR(skel, "open_load_skel")) + return; + + if (!ASSERT_OK(skel->bss->err, "pre_run_err")) + goto cleanup; + + prog = bpf_object__find_program_by_name(skel->obj, prog_name); + if (!ASSERT_OK_PTR(prog, "bpf_object__find_program_by_name")) + goto cleanup; + + err = bpf_prog_test_run_opts(bpf_program__fd(prog), &opts); + if (!ASSERT_OK(err, "bpf_prog_test_run_opts")) + goto cleanup; + if (!ASSERT_EQ(opts.retval, 0, "retval")) + goto cleanup; + + ASSERT_OK(skel->bss->err, "post_run_err"); + +cleanup: + task_kfunc_success__destroy(skel); +} + static int run_vpid_test(void *prog_name) { struct task_kfunc_success *skel; @@ -140,7 +170,6 @@ static const char * const success_tests[] = { "test_task_acquire_release_argument", "test_task_acquire_release_current", "test_task_acquire_leave_in_map", - "test_task_xchg_release", "test_task_map_acquire_release", "test_task_current_acquire_release", "test_task_from_pid_arg", @@ -151,6 +180,10 @@ static const char * const success_tests[] = { "test_task_kfunc_flavor_relo_not_found", }; +static const char * const syscall_success_tests[] = { + "test_task_xchg_release", +}; + static const char * const vpid_success_tests[] = { "test_task_from_vpid_current", "test_task_from_vpid_invalid", @@ -167,6 +200,13 @@ 
void test_task_kfunc(void) run_success_test(success_tests[i]); } + for (i = 0; i < ARRAY_SIZE(syscall_success_tests); i++) { + if (!test__start_subtest(syscall_success_tests[i])) + continue; + + run_syscall_success_test(syscall_success_tests[i]); + } + for (i = 0; i < ARRAY_SIZE(vpid_success_tests); i++) { if (!test__start_subtest(vpid_success_tests[i])) continue; diff --git a/tools/testing/selftests/bpf/prog_tests/task_local_storage.c b/tools/testing/selftests/bpf/prog_tests/task_local_storage.c index 1b26c12f255a..5b2b56cc3a4f 100644 --- a/tools/testing/selftests/bpf/prog_tests/task_local_storage.c +++ b/tools/testing/selftests/bpf/prog_tests/task_local_storage.c @@ -47,6 +47,7 @@ static void test_sys_enter_exit(void) skel->bss->target_pid = 0; /* 2x gettid syscalls */ + ASSERT_EQ(skel->bss->update_err, 0, "update_err"); ASSERT_EQ(skel->bss->enter_cnt, 2, "enter_cnt"); ASSERT_EQ(skel->bss->exit_cnt, 2, "exit_cnt"); ASSERT_EQ(skel->bss->mismatch_cnt, 0, "mismatch_cnt"); diff --git a/tools/testing/selftests/bpf/prog_tests/tcp_hdr_options.c b/tools/testing/selftests/bpf/prog_tests/tcp_hdr_options.c index 56685fc03c7e..80e6315da2a5 100644 --- a/tools/testing/selftests/bpf/prog_tests/tcp_hdr_options.c +++ b/tools/testing/selftests/bpf/prog_tests/tcp_hdr_options.c @@ -507,6 +507,10 @@ static void misc(void) ASSERT_EQ(misc_skel->bss->nr_hwtstamp, 0, "nr_hwtstamp"); + ASSERT_TRUE(misc_skel->bss->nodelay_est_ok, "nodelay_est_ok"); + ASSERT_TRUE(misc_skel->bss->nodelay_hdr_len_reject, "nodelay_hdr_len_reject"); + ASSERT_TRUE(misc_skel->bss->nodelay_write_hdr_reject, "nodelay_write_hdr_reject"); + check_linum: ASSERT_FALSE(check_error_linum(&sk_fds), "check_error_linum"); sk_fds_close(&sk_fds); diff --git a/tools/testing/selftests/bpf/prog_tests/test_lsm.c b/tools/testing/selftests/bpf/prog_tests/test_lsm.c index bdc4fc06bc5a..d7495efd4a56 100644 --- a/tools/testing/selftests/bpf/prog_tests/test_lsm.c +++ b/tools/testing/selftests/bpf/prog_tests/test_lsm.c @@ -5,36 +5,14 @@ 
*/ #include <test_progs.h> -#include <sys/mman.h> #include <sys/wait.h> #include <unistd.h> -#include <malloc.h> -#include <stdlib.h> #include "lsm.skel.h" #include "lsm_tailcall.skel.h" char *CMD_ARGS[] = {"true", NULL}; -#define GET_PAGE_ADDR(ADDR, PAGE_SIZE) \ - (char *)(((unsigned long) (ADDR + PAGE_SIZE)) & ~(PAGE_SIZE-1)) - -int stack_mprotect(void) -{ - void *buf; - long sz; - int ret; - - sz = sysconf(_SC_PAGESIZE); - if (sz < 0) - return sz; - - buf = alloca(sz * 3); - ret = mprotect(GET_PAGE_ADDR(buf, sz), sz, - PROT_READ | PROT_WRITE | PROT_EXEC); - return ret; -} - int exec_cmd(int *monitored_pid) { int child_pid, child_status; diff --git a/tools/testing/selftests/bpf/prog_tests/test_xdp_veth.c b/tools/testing/selftests/bpf/prog_tests/test_xdp_veth.c index 3e98a1665936..1675b32753a8 100644 --- a/tools/testing/selftests/bpf/prog_tests/test_xdp_veth.c +++ b/tools/testing/selftests/bpf/prog_tests/test_xdp_veth.c @@ -456,7 +456,11 @@ static void xdp_veth_egress(u32 flags) .remote_flags = flags, } }; - const char magic_mac[6] = { 0xAA, 0xBB, 0xCC, 0xDD, 0xEE, 0xFF}; + const unsigned char egress_macs[VETH_PAIRS_COUNT][ETH_ALEN] = { + { 0xAA, 0xBB, 0xCC, 0xDD, 0xEE, 0x01 }, + { 0xAA, 0xBB, 0xCC, 0xDD, 0xEE, 0x02 }, + { 0xAA, 0xBB, 0xCC, 0xDD, 0xEE, 0x03 }, + }; struct xdp_redirect_multi_kern *xdp_redirect_multi_kern; struct bpf_object *bpf_objs[VETH_EGRESS_SKEL_NB]; struct xdp_redirect_map *xdp_redirect_map; @@ -512,7 +516,13 @@ static void xdp_veth_egress(u32 flags) &net_config, prog_cfg, i)) goto destroy_xdp_redirect_map; - err = bpf_map_update_elem(mac_map, &ifindex, magic_mac, 0); + { + __be64 mac = 0; + + memcpy(&mac, egress_macs[i], ETH_ALEN); + err = bpf_map_update_elem(mac_map, &ifindex, &mac, 0); + } + if (!ASSERT_OK(err, "bpf_map_update_elem")) goto destroy_xdp_redirect_map; @@ -531,15 +541,162 @@ static void xdp_veth_egress(u32 flags) for (i = 0; i < 2; i++) { u32 key = i; + __be64 expected = 0; u64 res; err = bpf_map_lookup_elem(res_map, &key, 
&res); if (!ASSERT_OK(err, "get MAC res")) goto destroy_xdp_redirect_map; - ASSERT_STRNEQ((const char *)&res, magic_mac, ETH_ALEN, "compare mac"); + /* store_mac_1/2 run on the second/third remote veths. */ + memcpy(&expected, egress_macs[i + 1], ETH_ALEN); + ASSERT_EQ(res, expected, "compare mac"); + } + +destroy_xdp_redirect_map: + close_netns(nstoken); + xdp_redirect_map__destroy(xdp_redirect_map); +destroy_xdp_redirect_multi_kern: + xdp_redirect_multi_kern__destroy(xdp_redirect_multi_kern); +destroy_xdp_dummy: + xdp_dummy__destroy(xdp_dummy); + + cleanup_network(&net_config); +} + +static void xdp_veth_egress_last_dst(u32 flags) +{ + struct prog_configuration prog_cfg[VETH_PAIRS_COUNT] = { + { + .local_name = "xdp_redirect_map_all_prog", + .remote_name = "xdp_dummy_prog", + .local_flags = flags, + .remote_flags = flags, + }, + { + .local_name = "xdp_redirect_map_all_prog", + .remote_name = "store_mac_1", + .local_flags = flags, + .remote_flags = flags, + }, + { + .local_name = "xdp_redirect_map_all_prog", + .remote_name = "xdp_dummy_prog", + .local_flags = flags, + .remote_flags = flags, + } + }; + const unsigned char egress_macs[VETH_PAIRS_COUNT][ETH_ALEN] = { + { 0xAA, 0xBB, 0xCC, 0xDD, 0xEE, 0x01 }, + { 0xAA, 0xBB, 0xCC, 0xDD, 0xEE, 0x02 }, + { 0xAA, 0xBB, 0xCC, 0xDD, 0xEE, 0x03 }, + }; + struct xdp_redirect_multi_kern *xdp_redirect_multi_kern; + struct bpf_object *bpf_objs[VETH_EGRESS_SKEL_NB]; + struct xdp_redirect_map *xdp_redirect_map; + struct net_configuration net_config = {}; + int mac_map, egress_map, res_map; + struct nstoken *nstoken = NULL; + struct xdp_dummy *xdp_dummy; + __be64 sentinel_mac = 0; + __be64 last_mac = 0; + __be64 res; + u32 key; + int err; + int i; + + xdp_dummy = xdp_dummy__open_and_load(); + if (!ASSERT_OK_PTR(xdp_dummy, "xdp_dummy__open_and_load")) + return; + + xdp_redirect_multi_kern = xdp_redirect_multi_kern__open_and_load(); + if (!ASSERT_OK_PTR(xdp_redirect_multi_kern, "xdp_redirect_multi_kern__open_and_load")) + goto 
destroy_xdp_dummy; + + xdp_redirect_map = xdp_redirect_map__open_and_load(); + if (!ASSERT_OK_PTR(xdp_redirect_map, "xdp_redirect_map__open_and_load")) + goto destroy_xdp_redirect_multi_kern; + + if (!ASSERT_OK(create_network(&net_config), "create network")) + goto destroy_xdp_redirect_map; + + mac_map = bpf_map__fd(xdp_redirect_multi_kern->maps.mac_map); + if (!ASSERT_OK_FD(mac_map, "open mac_map")) + goto destroy_xdp_redirect_map; + + egress_map = bpf_map__fd(xdp_redirect_multi_kern->maps.map_egress); + if (!ASSERT_OK_FD(egress_map, "open map_egress")) + goto destroy_xdp_redirect_map; + + bpf_objs[0] = xdp_dummy->obj; + bpf_objs[1] = xdp_redirect_multi_kern->obj; + bpf_objs[2] = xdp_redirect_map->obj; + + nstoken = open_netns(net_config.ns0_name); + if (!ASSERT_OK_PTR(nstoken, "open NS0")) + goto destroy_xdp_redirect_map; + + for (i = 0; i < VETH_PAIRS_COUNT; i++) { + struct bpf_devmap_val devmap_val = {}; + int ifindex = if_nametoindex(net_config.veth_cfg[i].local_veth); + u32 key = i; + + SYS(destroy_xdp_redirect_map, + "ip -n %s neigh add %s lladdr 00:00:00:00:00:01 dev %s", + net_config.veth_cfg[i].namespace, IP_NEIGH, + net_config.veth_cfg[i].remote_veth); + + if (attach_programs_to_veth_pair(bpf_objs, VETH_EGRESS_SKEL_NB, + &net_config, prog_cfg, i)) + goto destroy_xdp_redirect_map; + + { + __be64 mac = 0; + + memcpy(&mac, egress_macs[i], ETH_ALEN); + err = bpf_map_update_elem(mac_map, &ifindex, &mac, 0); + } + + if (!ASSERT_OK(err, "bpf_map_update_elem")) + goto destroy_xdp_redirect_map; + + devmap_val.ifindex = ifindex; + devmap_val.bpf_prog.fd = -1; + + if (i == VETH_PAIRS_COUNT - 1) + devmap_val.bpf_prog.fd = + bpf_program__fd(xdp_redirect_multi_kern->progs.xdp_devmap_prog); + + err = bpf_map_update_elem(egress_map, &key, &devmap_val, 0); + if (!ASSERT_OK(err, "bpf_map_update_elem")) + goto destroy_xdp_redirect_map; } + res_map = bpf_map__fd(xdp_redirect_map->maps.rx_mac); + if (!ASSERT_OK_FD(res_map, "open rx_map")) + goto destroy_xdp_redirect_map; + + 
memcpy(&sentinel_mac, egress_macs[VETH_PAIRS_COUNT - 1], ETH_ALEN); + memcpy(&last_mac, egress_macs[VETH_PAIRS_COUNT - 1], ETH_ALEN); + + key = 0; + err = bpf_map_update_elem(res_map, &key, &sentinel_mac, 0); + if (!ASSERT_OK(err, "init rx mac")) + goto destroy_xdp_redirect_map; + + SYS_NOFAIL("ip netns exec %s ping %s -i 0.1 -c 4 -W1 > /dev/null ", + net_config.veth_cfg[0].namespace, IP_NEIGH); + + err = bpf_map_lookup_elem(res_map, &key, &res); + if (!ASSERT_OK(err, "get MAC res")) + goto destroy_xdp_redirect_map; + + if (!ASSERT_NEQ(res, sentinel_mac, "rx_mac overwritten by store_mac_1")) + goto destroy_xdp_redirect_map; + + if (!ASSERT_NEQ(res, last_mac, "earlier dst not rewritten by last dst")) + goto destroy_xdp_redirect_map; + destroy_xdp_redirect_map: close_netns(nstoken); xdp_redirect_map__destroy(xdp_redirect_map); @@ -596,4 +753,7 @@ void test_xdp_veth_egress(void) if (test__start_subtest("SKB_MODE/egress")) xdp_veth_egress(XDP_FLAGS_SKB_MODE); + + if (test__start_subtest("SKB_MODE/egress_last_dst")) + xdp_veth_egress_last_dst(XDP_FLAGS_SKB_MODE); } diff --git a/tools/testing/selftests/bpf/prog_tests/test_xsk.c b/tools/testing/selftests/bpf/prog_tests/test_xsk.c index 7950c504ed28..6eb9096d084c 100644 --- a/tools/testing/selftests/bpf/prog_tests/test_xsk.c +++ b/tools/testing/selftests/bpf/prog_tests/test_xsk.c @@ -7,7 +7,6 @@ #include <linux/netdev.h> #include <poll.h> #include <pthread.h> -#include <signal.h> #include <string.h> #include <sys/mman.h> #include <sys/socket.h> @@ -65,11 +64,6 @@ static void gen_eth_hdr(struct xsk_socket_info *xsk, struct ethhdr *eth_hdr) eth_hdr->h_proto = htons(ETH_P_LOOPBACK); } -static bool is_umem_valid(struct ifobject *ifobj) -{ - return !!ifobj->umem->umem; -} - static u32 mode_to_xdp_flags(enum test_mode mode) { return (mode == TEST_MODE_SKB) ? 
XDP_FLAGS_SKB_MODE : XDP_FLAGS_DRV_MODE; @@ -213,6 +207,7 @@ static void __test_spec_init(struct test_spec *test, struct ifobject *ifobj_tx, for (i = 0; i < MAX_INTERFACES; i++) { struct ifobject *ifobj = i ? ifobj_rx : ifobj_tx; + struct xsk_umem_info *umem_real; ifobj->xsk = &ifobj->xsk_arr[0]; ifobj->use_poll = false; @@ -229,24 +224,30 @@ static void __test_spec_init(struct test_spec *test, struct ifobject *ifobj_tx, ifobj->tx_on = false; } - memset(ifobj->umem, 0, sizeof(*ifobj->umem)); - ifobj->umem->num_frames = DEFAULT_UMEM_BUFFERS; - ifobj->umem->frame_size = XSK_UMEM__DEFAULT_FRAME_SIZE; - + umem_real = ifobj->xsk_arr[0].umem_real; + memset(umem_real, 0, sizeof(*umem_real)); for (j = 0; j < MAX_SOCKETS; j++) { - memset(&ifobj->xsk_arr[j], 0, sizeof(ifobj->xsk_arr[j])); - ifobj->xsk_arr[j].rxqsize = XSK_RING_CONS__DEFAULT_NUM_DESCS; - ifobj->xsk_arr[j].batch_size = DEFAULT_BATCH_SIZE; + struct xsk_socket_info *xsk = &ifobj->xsk_arr[j]; + + memset(xsk, 0, sizeof(*xsk)); + xsk->rxqsize = XSK_RING_CONS__DEFAULT_NUM_DESCS; + if (j == 0) + xsk->umem_real = umem_real; + xsk->umem = umem_real; + xsk->batch_size = DEFAULT_BATCH_SIZE; if (i == 0) - ifobj->xsk_arr[j].pkt_stream = test->tx_pkt_stream_default; + xsk->pkt_stream = test->tx_pkt_stream_default; else - ifobj->xsk_arr[j].pkt_stream = test->rx_pkt_stream_default; + xsk->pkt_stream = test->rx_pkt_stream_default; - memcpy(ifobj->xsk_arr[j].src_mac, g_mac, ETH_ALEN); - memcpy(ifobj->xsk_arr[j].dst_mac, g_mac, ETH_ALEN); - ifobj->xsk_arr[j].src_mac[5] += ((j * 2) + 0); - ifobj->xsk_arr[j].dst_mac[5] += ((j * 2) + 1); + memcpy(xsk->src_mac, g_mac, ETH_ALEN); + memcpy(xsk->dst_mac, g_mac, ETH_ALEN); + xsk->src_mac[5] += ((j * 2) + 0); + xsk->dst_mac[5] += ((j * 2) + 1); } + + ifobj->xsk->umem->num_frames = DEFAULT_UMEM_BUFFERS; + ifobj->xsk->umem->frame_size = XSK_UMEM__DEFAULT_FRAME_SIZE; } if (ifobj_tx->hw_ring_size_supp) @@ -303,6 +304,18 @@ static void test_spec_reset(struct test_spec *test) 
__test_spec_init(test, test->ifobj_tx, test->ifobj_rx); } +static void test_spec_set_unaligned(struct test_spec *test) +{ + test->ifobj_tx->xsk->umem->unaligned_mode = true; + test->ifobj_rx->xsk->umem->unaligned_mode = true; +} + +static void test_spec_set_frame_size(struct test_spec *test, u32 size) +{ + test->ifobj_tx->xsk->umem->frame_size = size; + test->ifobj_rx->xsk->umem->frame_size = size; +} + static void test_spec_set_xdp_prog(struct test_spec *test, struct bpf_program *xdp_prog_rx, struct bpf_program *xdp_prog_tx, struct bpf_map *xskmap_rx, struct bpf_map *xskmap_tx) @@ -810,11 +823,11 @@ static bool is_frag_valid(struct xsk_umem_info *umem, u64 addr, u32 len, u32 exp { u32 seqnum, pkt_nb, *pkt_data, words_to_end, expected_seqnum; void *data = xsk_umem__get_data(umem->buffer, addr); + u64 umem_sz = umem_size(umem); addr -= umem->base_addr; - if (addr >= umem->num_frames * umem->frame_size || - addr + len > umem->num_frames * umem->frame_size) { + if (addr >= umem_sz || addr + len > umem_sz) { ksft_print_msg("Frag invalid addr: %llx len: %u\n", (unsigned long long)addr, len); return false; @@ -991,7 +1004,7 @@ static int __receive_pkts(struct test_spec *test, struct xsk_socket_info *xsk) return TEST_FAILURE; if (!ret) { - if (!is_umem_valid(test->ifobj_tx)) + if (test->poll_tmout) return TEST_PASS; ksft_print_msg("ERROR: [%s] Poll timed out\n", __func__); @@ -1130,7 +1143,7 @@ static int receive_pkts(struct test_spec *test) break; res = __receive_pkts(test, xsk); - if (!(res == TEST_PASS || res == TEST_CONTINUE)) + if (res != TEST_CONTINUE) return res; ret = gettimeofday(&tv_now, NULL); @@ -1147,11 +1160,12 @@ static int receive_pkts(struct test_spec *test) return TEST_PASS; } -static int __send_pkts(struct ifobject *ifobject, struct xsk_socket_info *xsk, bool timeout) +static int __send_pkts(struct ifobject *ifobject, struct xsk_socket_info *xsk, + bool test_timeout) { u32 i, idx = 0, valid_pkts = 0, valid_frags = 0, buffer_len; struct pkt_stream 
*pkt_stream = xsk->pkt_stream; - struct xsk_umem_info *umem = ifobject->umem; + struct xsk_umem_info *umem = xsk->umem; bool use_poll = ifobject->use_poll; struct pollfd fds = { }; int ret; @@ -1159,7 +1173,7 @@ static int __send_pkts(struct ifobject *ifobject, struct xsk_socket_info *xsk, b buffer_len = pkt_get_buffer_len(umem, pkt_stream->max_pkt_len); /* pkts_in_flight might be negative if many invalid packets are sent */ if (pkts_in_flight >= (int)((umem_size(umem) - xsk->batch_size * buffer_len) / - buffer_len)) { + buffer_len) && !test_timeout) { ret = kick_tx(xsk); if (ret) return TEST_FAILURE; @@ -1172,7 +1186,7 @@ static int __send_pkts(struct ifobject *ifobject, struct xsk_socket_info *xsk, b while (xsk_ring_prod__reserve(&xsk->tx, xsk->batch_size, &idx) < xsk->batch_size) { if (use_poll) { ret = poll(&fds, 1, POLL_TMOUT); - if (timeout) { + if (test_timeout) { if (ret < 0) { ksft_print_msg("ERROR: [%s] Poll error %d\n", __func__, errno); @@ -1210,7 +1224,7 @@ static int __send_pkts(struct ifobject *ifobject, struct xsk_socket_info *xsk, b while (nb_frags_left--) { struct xdp_desc *tx_desc = xsk_ring_prod__tx_desc(&xsk->tx, idx + i); - tx_desc->addr = pkt_get_addr(pkt, ifobject->umem); + tx_desc->addr = pkt_get_addr(pkt, umem); if (pkt_stream->verbatim) { tx_desc->len = pkt->len; tx_desc->options = pkt->options; @@ -1252,7 +1266,7 @@ static int __send_pkts(struct ifobject *ifobject, struct xsk_socket_info *xsk, b if (use_poll) { ret = poll(&fds, 1, POLL_TMOUT); if (ret <= 0) { - if (ret == 0 && timeout) + if (ret == 0 && test_timeout) return TEST_PASS; ksft_print_msg("ERROR: [%s] Poll error %d\n", __func__, ret); @@ -1260,14 +1274,14 @@ static int __send_pkts(struct ifobject *ifobject, struct xsk_socket_info *xsk, b } } - if (!timeout) { + if (!test_timeout) { if (complete_pkts(xsk, i)) return TEST_FAILURE; usleep(10); - return TEST_PASS; } + /* Loop completion is driven by send_pkts() stream progress checks. 
*/ return TEST_CONTINUE; } @@ -1303,7 +1317,6 @@ bool all_packets_sent(struct test_spec *test, unsigned long *bitmap) static int send_pkts(struct test_spec *test, struct ifobject *ifobject) { - bool timeout = !is_umem_valid(test->ifobj_rx); DECLARE_BITMAP(bitmap, test->nb_sockets); u32 i, ret; @@ -1318,19 +1331,18 @@ static int send_pkts(struct test_spec *test, struct ifobject *ifobject) __set_bit(i, bitmap); continue; } - ret = __send_pkts(ifobject, &ifobject->xsk_arr[i], timeout); - if (ret == TEST_CONTINUE && !test->fail) - continue; - - if ((ret || test->fail) && !timeout) - return TEST_FAILURE; - - if (ret == TEST_PASS && timeout) + ret = __send_pkts(ifobject, &ifobject->xsk_arr[i], test->poll_tmout); + if (ret != TEST_CONTINUE) return ret; - ret = wait_for_tx_completion(&ifobject->xsk_arr[i]); - if (ret) + if (test->fail) return TEST_FAILURE; + + if (!test->poll_tmout) { + ret = wait_for_tx_completion(&ifobject->xsk_arr[i]); + if (ret) + return TEST_FAILURE; + } } } @@ -1488,14 +1500,25 @@ static int xsk_configure(struct test_spec *test, struct ifobject *ifobject, static int thread_common_ops_tx(struct test_spec *test, struct ifobject *ifobject) { - int ret = xsk_configure(test, ifobject, test->ifobj_rx->umem, true); + struct xsk_umem_info *umem_rx, *umem_tx; + int ret; + + if (!test->ifobj_rx || !test->ifobj_rx->xsk_arr[0].umem->umem) { + ksft_print_msg("Error: RX UMEM is not initialized before shared-UMEM TX setup\n"); + return -EINVAL; + } + umem_rx = test->ifobj_rx->xsk_arr[0].umem; + umem_tx = ifobject->xsk_arr[0].umem_real; + memcpy(umem_tx, umem_rx, sizeof(*umem_tx)); + umem_tx->base_addr = 0; + umem_tx->next_buffer = 0; + + ret = xsk_configure(test, ifobject, umem_tx, true); if (ret) return ret; ifobject->xsk = &ifobject->xsk_arr[0]; ifobject->xskmap = test->ifobj_rx->xskmap; - memcpy(ifobject->umem, test->ifobj_rx->umem, sizeof(struct xsk_umem_info)); - ifobject->umem->base_addr = 0; return 0; } @@ -1548,31 +1571,37 @@ static int 
xsk_populate_fill_ring(struct xsk_umem_info *umem, struct pkt_stream static int thread_common_ops(struct test_spec *test, struct ifobject *ifobject) { + struct xsk_umem_info *umem = ifobject->xsk->umem; LIBBPF_OPTS(bpf_xdp_query_opts, opts); int mmap_flags; - u64 umem_sz; + u64 umem_sz, mmap_sz; void *bufs; int ret; u32 i; - umem_sz = ifobject->umem->num_frames * ifobject->umem->frame_size; + umem_sz = umem_size(umem); mmap_flags = MAP_PRIVATE | MAP_ANONYMOUS | MAP_NORESERVE; - if (ifobject->umem->unaligned_mode) + if (umem->unaligned_mode) mmap_flags |= MAP_HUGETLB | MAP_HUGE_2MB; if (ifobject->shared_umem) umem_sz *= 2; - bufs = mmap(NULL, umem_sz, PROT_READ | PROT_WRITE, mmap_flags, -1, 0); + mmap_sz = umem->unaligned_mode ? + ceil_u64(umem_sz, HUGEPAGE_SIZE) * HUGEPAGE_SIZE : umem_sz; + + bufs = mmap(NULL, mmap_sz, PROT_READ | PROT_WRITE, mmap_flags, -1, 0); if (bufs == MAP_FAILED) return -errno; - ret = xsk_configure_umem(ifobject, ifobject->umem, bufs, umem_sz); + umem->mmap_size = mmap_sz; + + ret = xsk_configure_umem(ifobject, umem, bufs, umem_sz); if (ret) return ret; - ret = xsk_configure(test, ifobject, ifobject->umem, false); + ret = xsk_configure(test, ifobject, umem, false); if (ret) return ret; @@ -1581,14 +1610,13 @@ static int thread_common_ops(struct test_spec *test, struct ifobject *ifobject) if (!ifobject->rx_on) return 0; - ret = xsk_populate_fill_ring(ifobject->umem, ifobject->xsk->pkt_stream, + ret = xsk_populate_fill_ring(umem, ifobject->xsk->pkt_stream, ifobject->use_fill_ring); if (ret) return ret; for (i = 0; i < test->nb_sockets; i++) { - ifobject->xsk = &ifobject->xsk_arr[i]; - ret = xsk_update_xskmap(ifobject->xskmap, ifobject->xsk->xsk, i); + ret = xsk_update_xskmap(ifobject->xskmap, ifobject->xsk_arr[i].xsk, i); if (ret) return ret; } @@ -1642,7 +1670,8 @@ void *worker_testapp_validate_rx(void *arg) strerror(-err)); } - pthread_barrier_wait(&barr); + if (test->use_barrier) + pthread_barrier_wait(&barr); /* We leave only now in case 
of error to avoid getting stuck in the barrier */ if (err) { @@ -1675,19 +1704,10 @@ void *worker_testapp_validate_rx(void *arg) static void testapp_clean_xsk_umem(struct ifobject *ifobj) { - u64 umem_sz = ifobj->umem->num_frames * ifobj->umem->frame_size; - - if (ifobj->shared_umem) - umem_sz *= 2; + struct xsk_umem_info *umem = ifobj->xsk->umem; - umem_sz = ceil_u64(umem_sz, HUGEPAGE_SIZE) * HUGEPAGE_SIZE; - xsk_umem__delete(ifobj->umem->umem); - munmap(ifobj->umem->buffer, umem_sz); -} - -static void handler(int signum) -{ - pthread_exit(NULL); + xsk_umem__delete(umem->umem); + munmap(umem->buffer, umem->mmap_size); } static bool xdp_prog_changed_rx(struct test_spec *test) @@ -1794,9 +1814,18 @@ static int __testapp_validate_traffic(struct test_spec *test, struct ifobject *i return TEST_FAILURE; } - if (ifobj2) { + err = xsk_attach_xdp_progs(test, ifobj1, ifobj2); + if (err) { + ksft_print_msg("Error: failed to attach XDP programs: %d (%s)\n", + err, strerror(-err)); + return TEST_FAILURE; + } + test->use_barrier = !!ifobj2; + + if (test->use_barrier) { if (pthread_barrier_init(&barr, NULL, 2)) return TEST_FAILURE; + pkt_stream_reset(ifobj2->xsk->pkt_stream); } @@ -1804,29 +1833,27 @@ static int __testapp_validate_traffic(struct test_spec *test, struct ifobject *i pkt_stream_reset(ifobj1->xsk->pkt_stream); pkts_in_flight = 0; - signal(SIGUSR1, handler); /*Spawn RX thread */ pthread_create(&t0, NULL, ifobj1->func_ptr, test); - if (ifobj2) { + if (test->use_barrier) { pthread_barrier_wait(&barr); if (pthread_barrier_destroy(&barr)) { - pthread_kill(t0, SIGUSR1); + test->use_barrier = false; + pthread_join(t0, NULL); clean_sockets(test, ifobj1); clean_umem(test, ifobj1, NULL); return TEST_FAILURE; } + } + if (ifobj2) { /*Spawn TX thread */ pthread_create(&t1, NULL, ifobj2->func_ptr, test); - pthread_join(t1, NULL); } - if (!ifobj2) - pthread_kill(t0, SIGUSR1); - else - pthread_join(t0, NULL); + pthread_join(t0, NULL); if (test->total_steps == test->current_step || 
test->fail) { clean_sockets(test, ifobj1); @@ -1845,8 +1872,8 @@ static int testapp_validate_traffic(struct test_spec *test) struct ifobject *ifobj_rx = test->ifobj_rx; struct ifobject *ifobj_tx = test->ifobj_tx; - if ((ifobj_rx->umem->unaligned_mode && !ifobj_rx->unaligned_supp) || - (ifobj_tx->umem->unaligned_mode && !ifobj_tx->unaligned_supp)) { + if ((ifobj_rx->xsk->umem->unaligned_mode && !ifobj_rx->unaligned_supp) || + (ifobj_tx->xsk->umem->unaligned_mode && !ifobj_tx->unaligned_supp)) { ksft_print_msg("No huge pages present.\n"); return TEST_SKIP; } @@ -1863,8 +1890,6 @@ static int testapp_validate_traffic(struct test_spec *test) } } - if (xsk_attach_xdp_progs(test, ifobj_rx, ifobj_tx)) - return TEST_FAILURE; return __testapp_validate_traffic(test, ifobj_rx, ifobj_tx); } @@ -1953,12 +1978,13 @@ int testapp_xdp_prog_cleanup(struct test_spec *test) int testapp_headroom(struct test_spec *test) { - test->ifobj_rx->umem->frame_headroom = UMEM_HEADROOM_TEST_SIZE; + test->ifobj_rx->xsk->umem->frame_headroom = UMEM_HEADROOM_TEST_SIZE; return testapp_validate_traffic(test); } int testapp_stats_rx_dropped(struct test_spec *test) { + struct xsk_umem_info *umem = test->ifobj_rx->xsk->umem; u32 umem_tr = test->ifobj_tx->umem_tailroom; if (test->mode == TEST_MODE_ZC) { @@ -1968,7 +1994,7 @@ int testapp_stats_rx_dropped(struct test_spec *test) if (pkt_stream_replace_half(test, (MIN_PKT_SIZE * 3) + umem_tr, 0)) return TEST_FAILURE; - test->ifobj_rx->umem->frame_headroom = test->ifobj_rx->umem->frame_size - + umem->frame_headroom = umem->frame_size - XDP_PACKET_HEADROOM - (MIN_PKT_SIZE * 2) - umem_tr; if (pkt_stream_receive_half(test)) return TEST_FAILURE; @@ -2025,8 +2051,7 @@ int testapp_stats_fill_empty(struct test_spec *test) int testapp_send_receive_unaligned(struct test_spec *test) { - test->ifobj_tx->umem->unaligned_mode = true; - test->ifobj_rx->umem->unaligned_mode = true; + test_spec_set_unaligned(test); /* Let half of the packets straddle a 4K buffer boundary */ 
if (pkt_stream_replace_half(test, MIN_PKT_SIZE, -MIN_PKT_SIZE / 2)) return TEST_FAILURE; @@ -2037,8 +2062,7 @@ int testapp_send_receive_unaligned(struct test_spec *test) int testapp_send_receive_unaligned_mb(struct test_spec *test) { test->mtu = MAX_ETH_JUMBO_SIZE; - test->ifobj_tx->umem->unaligned_mode = true; - test->ifobj_rx->umem->unaligned_mode = true; + test_spec_set_unaligned(test); if (pkt_stream_replace(test, DEFAULT_PKT_CNT, MAX_ETH_JUMBO_SIZE)) return TEST_FAILURE; return testapp_validate_traffic(test); @@ -2064,8 +2088,8 @@ int testapp_send_receive_mb(struct test_spec *test) int testapp_invalid_desc_mb(struct test_spec *test) { - struct xsk_umem_info *umem = test->ifobj_tx->umem; - u64 umem_size = umem->num_frames * umem->frame_size; + struct xsk_umem_info *umem = test->ifobj_tx->xsk->umem; + u64 umem_sz = umem_size(umem); struct pkt pkts[] = { /* Valid packet for synch to start with */ {0, MIN_PKT_SIZE, 0, true, 0}, @@ -2075,7 +2099,7 @@ int testapp_invalid_desc_mb(struct test_spec *test) {0, 0, 0, false, 0}, /* Invalid address in the second frame */ {0, XSK_UMEM__LARGE_FRAME_SIZE, 0, false, XDP_PKT_CONTD}, - {umem_size, XSK_UMEM__LARGE_FRAME_SIZE, 0, false, XDP_PKT_CONTD}, + {umem_sz, XSK_UMEM__LARGE_FRAME_SIZE, 0, false, XDP_PKT_CONTD}, /* Invalid len in the middle */ {0, XSK_UMEM__LARGE_FRAME_SIZE, 0, false, XDP_PKT_CONTD}, {0, XSK_UMEM__INVALID_FRAME_SIZE, 0, false, XDP_PKT_CONTD}, @@ -2105,8 +2129,8 @@ int testapp_invalid_desc_mb(struct test_spec *test) int testapp_invalid_desc(struct test_spec *test) { - struct xsk_umem_info *umem = test->ifobj_tx->umem; - u64 umem_size = umem->num_frames * umem->frame_size; + struct xsk_umem_info *umem = test->ifobj_tx->xsk->umem; + u64 umem_sz = umem_size(umem); struct pkt pkts[] = { /* Zero packet address allowed */ {0, MIN_PKT_SIZE, 0, true}, @@ -2117,11 +2141,11 @@ int testapp_invalid_desc(struct test_spec *test) /* Packet too large */ {0, XSK_UMEM__INVALID_FRAME_SIZE, 0, false}, /* Up to end of umem allowed 
*/ - {umem_size - MIN_PKT_SIZE - 2 * umem->frame_size, MIN_PKT_SIZE, 0, true}, + {umem_sz - MIN_PKT_SIZE - 2 * umem->frame_size, MIN_PKT_SIZE, 0, true}, /* After umem ends */ - {umem_size, MIN_PKT_SIZE, 0, false}, + {umem_sz, MIN_PKT_SIZE, 0, false}, /* Straddle the end of umem */ - {umem_size - MIN_PKT_SIZE / 2, MIN_PKT_SIZE, 0, false}, + {umem_sz - MIN_PKT_SIZE / 2, MIN_PKT_SIZE, 0, false}, /* Straddle a 4K boundary */ {0x1000 - MIN_PKT_SIZE / 2, MIN_PKT_SIZE, 0, false}, /* Straddle a 2K boundary */ @@ -2139,9 +2163,9 @@ int testapp_invalid_desc(struct test_spec *test) } if (test->ifobj_tx->shared_umem) { - pkts[4].offset += umem_size; - pkts[5].offset += umem_size; - pkts[6].offset += umem_size; + pkts[4].offset += umem_sz; + pkts[5].offset += umem_sz; + pkts[6].offset += umem_sz; } if (pkt_stream_generate_custom(test, pkts, ARRAY_SIZE(pkts))) @@ -2202,16 +2226,33 @@ int testapp_xdp_shared_umem(struct test_spec *test) int testapp_poll_txq_tmout(struct test_spec *test) { + bool shared_umem = test->ifobj_tx->shared_umem; + int ret; + + test->poll_tmout = true; + /* + * POLL_TXQ_FULL exercises TX timeout setup in isolation. + * Keep TX out of shared-UMEM mode here so TX setup does not require + * RX UMEM to be initialized first. 
+ */ + test->ifobj_tx->shared_umem = false; test->ifobj_tx->use_poll = true; /* create invalid frame by set umem frame_size and pkt length equal to 2048 */ - test->ifobj_tx->umem->frame_size = 2048; - if (pkt_stream_replace(test, 2 * DEFAULT_PKT_CNT, 2048)) + test->ifobj_tx->xsk->umem->frame_size = 2048; + if (pkt_stream_replace(test, 2 * DEFAULT_PKT_CNT, 2048)) { + test->ifobj_tx->shared_umem = shared_umem; return TEST_FAILURE; - return testapp_validate_traffic_single_thread(test, test->ifobj_tx); + } + + ret = testapp_validate_traffic_single_thread(test, test->ifobj_tx); + test->ifobj_tx->shared_umem = shared_umem; + + return ret; } int testapp_poll_rxq_tmout(struct test_spec *test) { + test->poll_tmout = true; test->ifobj_rx->use_poll = true; return testapp_validate_traffic_single_thread(test, test->ifobj_rx); } @@ -2337,8 +2378,7 @@ int testapp_send_receive(struct test_spec *test) int testapp_send_receive_2k_frame(struct test_spec *test) { - test->ifobj_tx->umem->frame_size = 2048; - test->ifobj_rx->umem->frame_size = 2048; + test_spec_set_frame_size(test, 2048); if (pkt_stream_replace(test, DEFAULT_PKT_CNT, MIN_PKT_SIZE)) return TEST_FAILURE; return testapp_validate_traffic(test); @@ -2363,34 +2403,30 @@ int testapp_aligned_inv_desc(struct test_spec *test) int testapp_aligned_inv_desc_2k_frame(struct test_spec *test) { - test->ifobj_tx->umem->frame_size = 2048; - test->ifobj_rx->umem->frame_size = 2048; + test_spec_set_frame_size(test, 2048); return testapp_invalid_desc(test); } int testapp_unaligned_inv_desc(struct test_spec *test) { - test->ifobj_tx->umem->unaligned_mode = true; - test->ifobj_rx->umem->unaligned_mode = true; + test_spec_set_unaligned(test); return testapp_invalid_desc(test); } int testapp_unaligned_inv_desc_4001_frame(struct test_spec *test) { - u64 page_size, umem_size; + u64 page_size, umem_sz; /* Odd frame size so the UMEM doesn't end near a page boundary. 
*/ - test->ifobj_tx->umem->frame_size = 4001; - test->ifobj_rx->umem->frame_size = 4001; - test->ifobj_tx->umem->unaligned_mode = true; - test->ifobj_rx->umem->unaligned_mode = true; + test_spec_set_frame_size(test, 4001); + test_spec_set_unaligned(test); /* This test exists to test descriptors that staddle the end of * the UMEM but not a page. */ page_size = sysconf(_SC_PAGESIZE); - umem_size = test->ifobj_tx->umem->num_frames * test->ifobj_tx->umem->frame_size; - assert(umem_size % page_size > MIN_PKT_SIZE); - assert(umem_size % page_size < page_size - MIN_PKT_SIZE); + umem_sz = umem_size(test->ifobj_tx->xsk->umem); + assert(umem_sz % page_size > MIN_PKT_SIZE); + assert(umem_sz % page_size < page_size - MIN_PKT_SIZE); return testapp_invalid_desc(test); } @@ -2402,8 +2438,7 @@ int testapp_aligned_inv_desc_mb(struct test_spec *test) int testapp_unaligned_inv_desc_mb(struct test_spec *test) { - test->ifobj_tx->umem->unaligned_mode = true; - test->ifobj_rx->umem->unaligned_mode = true; + test_spec_set_unaligned(test); return testapp_invalid_desc_mb(test); } @@ -2447,9 +2482,9 @@ int testapp_hw_sw_max_ring_size(struct test_spec *test) test->total_steps = 2; test->ifobj_tx->ring.tx_pending = test->ifobj_tx->ring.tx_max_pending; test->ifobj_tx->ring.rx_pending = test->ifobj_tx->ring.rx_max_pending; - test->ifobj_rx->umem->num_frames = max_descs; - test->ifobj_rx->umem->fill_size = max_descs; - test->ifobj_rx->umem->comp_size = max_descs; + test->ifobj_rx->xsk->umem->num_frames = max_descs; + test->ifobj_rx->xsk->umem->fill_size = max_descs; + test->ifobj_rx->xsk->umem->comp_size = max_descs; test->ifobj_tx->xsk->batch_size = XSK_RING_PROD__DEFAULT_NUM_DESCS; test->ifobj_rx->xsk->batch_size = XSK_RING_PROD__DEFAULT_NUM_DESCS; @@ -2590,8 +2625,8 @@ struct ifobject *ifobject_create(void) if (!ifobj->xsk_arr) goto out_xsk_arr; - ifobj->umem = calloc(1, sizeof(*ifobj->umem)); - if (!ifobj->umem) + ifobj->xsk_arr[0].umem_real = calloc(1, sizeof(struct xsk_umem_info)); + if 
(!ifobj->xsk_arr[0].umem_real) goto out_umem; return ifobj; @@ -2605,7 +2640,9 @@ out_xsk_arr: void ifobject_delete(struct ifobject *ifobj) { - free(ifobj->umem); + if (ifobj->xsk_arr) + free(ifobj->xsk_arr[0].umem_real); + free(ifobj->xsk_arr); free(ifobj); } diff --git a/tools/testing/selftests/bpf/prog_tests/test_xsk.h b/tools/testing/selftests/bpf/prog_tests/test_xsk.h index 1ab8aee4ce56..03753ddc5dcd 100644 --- a/tools/testing/selftests/bpf/prog_tests/test_xsk.h +++ b/tools/testing/selftests/bpf/prog_tests/test_xsk.h @@ -83,6 +83,7 @@ typedef int (*test_func_t)(struct test_spec *test); struct xsk_socket_info { struct xsk_ring_cons rx; struct xsk_ring_prod tx; + struct xsk_umem_info *umem_real; struct xsk_umem_info *umem; struct xsk_socket *xsk; struct pkt_stream *pkt_stream; @@ -102,6 +103,7 @@ struct xsk_umem_info { struct xsk_ring_cons cq; struct xsk_umem *umem; u64 next_buffer; + u64 mmap_size; u32 num_frames; u32 frame_headroom; void *buffer; @@ -123,7 +125,6 @@ struct ifobject { char ifname[MAX_INTERFACE_NAME_CHARS]; struct xsk_socket_info *xsk; struct xsk_socket_info *xsk_arr; - struct xsk_umem_info *umem; thread_func_t func_ptr; validation_func_t validation_func; struct xsk_xdp_progs *xdp_progs; @@ -206,6 +207,8 @@ struct test_spec { bool set_ring; bool adjust_tail; bool adjust_tail_support; + bool poll_tmout; + bool use_barrier; enum test_mode mode; char name[MAX_TEST_NAME_SIZE]; }; diff --git a/tools/testing/selftests/bpf/prog_tests/tracing_multi.c b/tools/testing/selftests/bpf/prog_tests/tracing_multi.c new file mode 100644 index 000000000000..f02ffc7f41d7 --- /dev/null +++ b/tools/testing/selftests/bpf/prog_tests/tracing_multi.c @@ -0,0 +1,960 @@ +// SPDX-License-Identifier: GPL-2.0 + +#include <test_progs.h> +#include <bpf/btf.h> +#include <search.h> +#include "bpf/libbpf_internal.h" +#include "tracing_multi.skel.h" +#include "tracing_multi_module.skel.h" +#include "tracing_multi_intersect.skel.h" +#include "tracing_multi_session.skel.h" +#include 
"tracing_multi_fail.skel.h" +#include "tracing_multi_verifier.skel.h" +#include "tracing_multi_bench.skel.h" +#include "tracing_multi_rollback.skel.h" +#include "trace_helpers.h" + +static __u64 bpf_fentry_test_cookies[] = { + 8, /* bpf_fentry_test1 */ + 9, /* bpf_fentry_test2 */ + 7, /* bpf_fentry_test3 */ + 5, /* bpf_fentry_test4 */ + 4, /* bpf_fentry_test5 */ + 2, /* bpf_fentry_test6 */ + 3, /* bpf_fentry_test7 */ + 1, /* bpf_fentry_test8 */ + 10, /* bpf_fentry_test9 */ + 6, /* bpf_fentry_test10 */ +}; + +static const char * const bpf_fentry_test[] = { + "bpf_fentry_test1", + "bpf_fentry_test2", + "bpf_fentry_test3", + "bpf_fentry_test4", + "bpf_fentry_test5", + "bpf_fentry_test6", + "bpf_fentry_test7", + "bpf_fentry_test8", + "bpf_fentry_test9", + "bpf_fentry_test10", +}; + +static const char * const bpf_testmod_fentry_test[] = { + "bpf_testmod_fentry_test1", + "bpf_testmod_fentry_test2", + "bpf_testmod_fentry_test3", + "bpf_testmod_fentry_test7", + "bpf_testmod_fentry_test11", +}; + +#define FUNCS_CNT (ARRAY_SIZE(bpf_fentry_test)) + +static int get_random_funcs(const char **funcs) +{ + int i, cnt = 0; + + for (i = 0; i < FUNCS_CNT; i++) { + if (rand() % 2) + funcs[cnt++] = bpf_fentry_test[i]; + } + /* we always need at least one.. 
*/ + if (!cnt) + funcs[cnt++] = bpf_fentry_test[rand() % FUNCS_CNT]; + return cnt; +} + +static int compare(const void *ppa, const void *ppb) +{ + const char *pa = *(const char **) ppa; + const char *pb = *(const char **) ppb; + + return strcmp(pa, pb); +} + +static void tdestroy_free_nop(void *ptr) +{ +} + +static __u32 *get_ids(const char * const funcs[], int funcs_cnt, const char *mod) +{ + struct btf *btf, *vmlinux_btf = NULL; + __u32 nr, type_id, cnt = 0; + void *root = NULL; + __u32 *ids = NULL; + int i, err = 0; + + btf = btf__load_vmlinux_btf(); + if (!ASSERT_OK_PTR(btf, "btf__load_vmlinux_btf")) + return NULL; + + if (mod) { + vmlinux_btf = btf; + btf = btf__load_module_btf(mod, vmlinux_btf); + if (!ASSERT_OK_PTR(btf, "btf__load_module_btf")) { + btf__free(vmlinux_btf); + return NULL; + } + } + + ids = calloc(funcs_cnt, sizeof(ids[0])); + if (!ids) + goto out; + + /* + * We sort function names by name and search them + * below for each function. + */ + for (i = 0; i < funcs_cnt; i++) { + if (!tsearch(&funcs[i], &root, compare)) { + ASSERT_FAIL("tsearch failed"); + err = -1; + goto error; + } + } + + nr = btf__type_cnt(btf); + for (type_id = 1; type_id < nr && cnt < funcs_cnt; type_id++) { + const struct btf_type *type; + const char *str, ***val; + unsigned int idx; + + type = btf__type_by_id(btf, type_id); + if (!type) { + err = -1; + break; + } + + if (BTF_INFO_KIND(type->info) != BTF_KIND_FUNC) + continue; + + str = btf__name_by_offset(btf, type->name_off); + if (!str) { + err = -1; + break; + } + + val = tfind(&str, &root, compare); + if (!val) + continue; + + /* + * We keep a pointer for each function name so we can get the original + * array index and have the resulting ids array matching the original + * function array. + * + * Doing it this way allows us to easily test the cookies support, + * because each cookie is attached to a particular function/id. 
+ */ + idx = *val - funcs; + ids[idx] = type_id; + cnt++; + } + +error: + if (err) { + free(ids); + ids = NULL; + } + +out: + tdestroy(root, tdestroy_free_nop); + btf__free(vmlinux_btf); + btf__free(btf); + return ids; +} + +static void tracing_multi_test_run(struct tracing_multi *skel) +{ + LIBBPF_OPTS(bpf_test_run_opts, topts); + int err, prog_fd; + + prog_fd = bpf_program__fd(skel->progs.test_fentry); + err = bpf_prog_test_run_opts(prog_fd, &topts); + ASSERT_OK(err, "test_run"); + + /* extra +1 count for sleepable programs */ + ASSERT_EQ(skel->bss->test_result_fentry, FUNCS_CNT + 1, "test_result_fentry"); + ASSERT_EQ(skel->bss->test_result_fexit, FUNCS_CNT + 1, "test_result_fexit"); +} + +static void test_skel_api(void) +{ + struct tracing_multi *skel; + int err; + + skel = tracing_multi__open_and_load(); + if (!ASSERT_OK_PTR(skel, "tracing_multi__open_and_load")) + return; + + skel->bss->pid = getpid(); + + err = tracing_multi__attach(skel); + if (!ASSERT_OK(err, "tracing_multi__attach")) + goto cleanup; + + tracing_multi_test_run(skel); + +cleanup: + tracing_multi__destroy(skel); +} + +static void test_link_api_pattern(void) +{ + struct tracing_multi *skel; + + skel = tracing_multi__open_and_load(); + if (!ASSERT_OK_PTR(skel, "tracing_multi__open_and_load")) + return; + + skel->bss->pid = getpid(); + + skel->links.test_fentry = bpf_program__attach_tracing_multi(skel->progs.test_fentry, + "bpf_fentry_test*", NULL); + if (!ASSERT_OK_PTR(skel->links.test_fentry, "bpf_program__attach_tracing_multi")) + goto cleanup; + + skel->links.test_fexit = bpf_program__attach_tracing_multi(skel->progs.test_fexit, + "bpf_fentry_test*", NULL); + if (!ASSERT_OK_PTR(skel->links.test_fexit, "bpf_program__attach_tracing_multi")) + goto cleanup; + + skel->links.test_fentry_s = bpf_program__attach_tracing_multi(skel->progs.test_fentry_s, + "bpf_fentry_test1", NULL); + if (!ASSERT_OK_PTR(skel->links.test_fentry_s, "bpf_program__attach_tracing_multi")) + goto cleanup; + + 
skel->links.test_fexit_s = bpf_program__attach_tracing_multi(skel->progs.test_fexit_s, + "bpf_fentry_test1", NULL); + if (!ASSERT_OK_PTR(skel->links.test_fexit_s, "bpf_program__attach_tracing_multi")) + goto cleanup; + + tracing_multi_test_run(skel); + +cleanup: + tracing_multi__destroy(skel); +} + +static void test_link_api_ids(bool test_cookies) +{ + LIBBPF_OPTS(bpf_tracing_multi_opts, opts); + struct tracing_multi *skel; + size_t cnt = FUNCS_CNT; + __u32 *ids; + + skel = tracing_multi__open_and_load(); + if (!ASSERT_OK_PTR(skel, "tracing_multi__open_and_load")) + return; + + skel->bss->pid = getpid(); + skel->bss->test_cookies = test_cookies; + + ids = get_ids(bpf_fentry_test, cnt, NULL); + if (!ASSERT_OK_PTR(ids, "get_ids")) + goto cleanup; + + opts.ids = ids; + opts.cnt = cnt; + + if (test_cookies) + opts.cookies = bpf_fentry_test_cookies; + + skel->links.test_fentry = bpf_program__attach_tracing_multi(skel->progs.test_fentry, + NULL, &opts); + if (!ASSERT_OK_PTR(skel->links.test_fentry, "bpf_program__attach_tracing_multi")) + goto cleanup; + + skel->links.test_fexit = bpf_program__attach_tracing_multi(skel->progs.test_fexit, + NULL, &opts); + if (!ASSERT_OK_PTR(skel->links.test_fexit, "bpf_program__attach_tracing_multi")) + goto cleanup; + + /* Only bpf_fentry_test1 is allowed for sleepable programs. 
*/ + opts.cnt = 1; + skel->links.test_fentry_s = bpf_program__attach_tracing_multi(skel->progs.test_fentry_s, + NULL, &opts); + if (!ASSERT_OK_PTR(skel->links.test_fentry_s, "bpf_program__attach_tracing_multi")) + goto cleanup; + + skel->links.test_fexit_s = bpf_program__attach_tracing_multi(skel->progs.test_fexit_s, + NULL, &opts); + if (!ASSERT_OK_PTR(skel->links.test_fexit_s, "bpf_program__attach_tracing_multi")) + goto cleanup; + + tracing_multi_test_run(skel); + +cleanup: + tracing_multi__destroy(skel); + free(ids); +} + +static void test_module_skel_api(void) +{ + struct tracing_multi_module *skel = NULL; + int err; + + skel = tracing_multi_module__open_and_load(); + if (!ASSERT_OK_PTR(skel, "tracing_multi__open_and_load")) + return; + + skel->bss->pid = getpid(); + + err = tracing_multi_module__attach(skel); + if (!ASSERT_OK(err, "tracing_multi__attach")) + goto cleanup; + + ASSERT_OK(trigger_module_test_read(1), "trigger_read"); + ASSERT_EQ(skel->bss->test_result_fentry, 5, "test_result_fentry"); + ASSERT_EQ(skel->bss->test_result_fexit, 5, "test_result_fexit"); + +cleanup: + tracing_multi_module__destroy(skel); +} + +static void test_module_link_api_pattern(void) +{ + struct tracing_multi_module *skel = NULL; + + skel = tracing_multi_module__open_and_load(); + if (!ASSERT_OK_PTR(skel, "tracing_multi_module__open_and_load")) + return; + + skel->bss->pid = getpid(); + + skel->links.test_fentry = bpf_program__attach_tracing_multi(skel->progs.test_fentry, + "bpf_testmod:bpf_testmod_fentry_test*", NULL); + if (!ASSERT_OK_PTR(skel->links.test_fentry, "bpf_program__attach_tracing_multi")) + goto cleanup; + + skel->links.test_fexit = bpf_program__attach_tracing_multi(skel->progs.test_fexit, + "bpf_testmod:bpf_testmod_fentry_test*", NULL); + if (!ASSERT_OK_PTR(skel->links.test_fexit, "bpf_program__attach_tracing_multi")) + goto cleanup; + + ASSERT_OK(trigger_module_test_read(1), "trigger_read"); + ASSERT_EQ(skel->bss->test_result_fentry, 5, "test_result_fentry"); + 
ASSERT_EQ(skel->bss->test_result_fexit, 5, "test_result_fexit"); + +cleanup: + tracing_multi_module__destroy(skel); +} + +static void test_module_link_api_ids(void) +{ + size_t cnt = ARRAY_SIZE(bpf_testmod_fentry_test); + LIBBPF_OPTS(bpf_tracing_multi_opts, opts); + struct tracing_multi_module *skel = NULL; + __u32 *ids; + + skel = tracing_multi_module__open_and_load(); + if (!ASSERT_OK_PTR(skel, "tracing_multi_module__open_and_load")) + return; + + skel->bss->pid = getpid(); + + ids = get_ids(bpf_testmod_fentry_test, cnt, "bpf_testmod"); + if (!ASSERT_OK_PTR(ids, "get_ids")) + goto cleanup; + + opts.ids = ids; + opts.cnt = cnt; + + skel->links.test_fentry = bpf_program__attach_tracing_multi(skel->progs.test_fentry, + NULL, &opts); + if (!ASSERT_OK_PTR(skel->links.test_fentry, "bpf_program__attach_tracing_multi")) + goto cleanup; + + skel->links.test_fexit = bpf_program__attach_tracing_multi(skel->progs.test_fexit, + NULL, &opts); + if (!ASSERT_OK_PTR(skel->links.test_fexit, "bpf_program__attach_tracing_multi")) + goto cleanup; + + ASSERT_OK(trigger_module_test_read(1), "trigger_read"); + ASSERT_EQ(skel->bss->test_result_fentry, 5, "test_result_fentry"); + ASSERT_EQ(skel->bss->test_result_fexit, 5, "test_result_fexit"); + +cleanup: + tracing_multi_module__destroy(skel); + free(ids); +} + +static bool is_set(__u32 mask, __u32 bit) +{ + return (1 << bit) & mask; +} + +static void __test_intersect(__u32 mask, const struct bpf_program *progs[4], __u64 *test_results[4]) +{ + LIBBPF_OPTS(bpf_tracing_multi_opts, opts); + LIBBPF_OPTS(bpf_test_run_opts, topts); + struct bpf_link *links[4] = { NULL }; + const char *funcs[FUNCS_CNT]; + __u64 expected[4]; + __u32 *ids, i; + int err, cnt; + + /* + * We have 4 programs in progs and the mask bits pick which + * of them gets attached to randomly chosen functions. 
+ */ + for (i = 0; i < 4; i++) { + if (!is_set(mask, i)) + continue; + + cnt = get_random_funcs(funcs); + ids = get_ids(funcs, cnt, NULL); + if (!ASSERT_OK_PTR(ids, "get_ids")) + goto cleanup; + + opts.ids = ids; + opts.cnt = cnt; + links[i] = bpf_program__attach_tracing_multi(progs[i], NULL, &opts); + free(ids); + + if (!ASSERT_OK_PTR(links[i], "bpf_program__attach_tracing_multi")) + goto cleanup; + + expected[i] = *test_results[i] + cnt; + } + + err = bpf_prog_test_run_opts(bpf_program__fd(progs[0]), &topts); + ASSERT_OK(err, "test_run"); + + for (i = 0; i < 4; i++) { + if (!is_set(mask, i)) + continue; + ASSERT_EQ(*test_results[i], expected[i], "test_results"); + } + +cleanup: + for (i = 0; i < 4; i++) + bpf_link__destroy(links[i]); +} + +static void test_intersect(void) +{ + struct tracing_multi_intersect *skel; + const struct bpf_program *progs[4]; + __u64 *test_results[4]; + __u32 i; + + skel = tracing_multi_intersect__open_and_load(); + if (!ASSERT_OK_PTR(skel, "tracing_multi_intersect__open_and_load")) + return; + + skel->bss->pid = getpid(); + + progs[0] = skel->progs.fentry_1; + progs[1] = skel->progs.fexit_1; + progs[2] = skel->progs.fentry_2; + progs[3] = skel->progs.fexit_2; + + test_results[0] = &skel->bss->test_result_fentry_1; + test_results[1] = &skel->bss->test_result_fexit_1; + test_results[2] = &skel->bss->test_result_fentry_2; + test_results[3] = &skel->bss->test_result_fexit_2; + + for (i = 1; i < 16; i++) + __test_intersect(i, progs, test_results); + + tracing_multi_intersect__destroy(skel); +} + +static void test_session(void) +{ + LIBBPF_OPTS(bpf_test_run_opts, topts); + struct tracing_multi_session *skel; + int err, prog_fd; + + skel = tracing_multi_session__open_and_load(); + if (!ASSERT_OK_PTR(skel, "tracing_multi_session__open_and_load")) + return; + + skel->bss->pid = getpid(); + + err = tracing_multi_session__attach(skel); + if (!ASSERT_OK(err, "tracing_multi_session__attach")) + goto cleanup; + + /* execute kernel session */ + 
prog_fd = bpf_program__fd(skel->progs.test_session_1); + err = bpf_prog_test_run_opts(prog_fd, &topts); + ASSERT_OK(err, "test_run"); + + /* 10 for test_session_1, 1 for test_fsession_s */ + ASSERT_EQ(skel->bss->test_result_fentry, 11, "test_result_fentry"); + /* extra count (+1 for each fexit execution) for test_result_fexit cookie check/inc */ + ASSERT_EQ(skel->bss->test_result_fexit, 22, "test_result_fexit"); + + skel->bss->test_result_fentry = 0; + skel->bss->test_result_fexit = 0; + + /* execute bpf_testmod.ko session */ + ASSERT_OK(trigger_module_test_read(1), "trigger_read"); + + /* 5 for test_session_2 */ + ASSERT_EQ(skel->bss->test_result_fentry, 5, "test_result_fentry"); + /* extra count (+1 for each fexit execution) for test_result_fexit cookie */ + ASSERT_EQ(skel->bss->test_result_fexit, 10, "test_result_fexit"); + + +cleanup: + tracing_multi_session__destroy(skel); +} + +static void test_attach_api_fails(void) +{ + LIBBPF_OPTS(bpf_tracing_multi_opts, opts); + static const char * const func[] = { + "bpf_fentry_test2", + }; + struct tracing_multi_fail *skel = NULL; + __u32 ids[2] = {}, *ids2 = NULL; + __u64 cookies[2]; + + skel = tracing_multi_fail__open_and_load(); + if (!ASSERT_OK_PTR(skel, "tracing_multi_fail__open_and_load")) + return; + + /* fail#1 (libbpf) pattern and opts NULL */ + skel->links.test_fentry = bpf_program__attach_tracing_multi(skel->progs.test_fentry, + NULL, NULL); + if (!ASSERT_EQ(libbpf_get_error(skel->links.test_fentry), -EINVAL, "fail_1")) + goto cleanup; + + /* fail#2 (libbpf) pattern and ids */ + LIBBPF_OPTS_RESET(opts, + .ids = ids, + .cnt = 2, + ); + + skel->links.test_fentry = bpf_program__attach_tracing_multi(skel->progs.test_fentry, + "bpf_fentry_test*", &opts); + if (!ASSERT_EQ(libbpf_get_error(skel->links.test_fentry), -EINVAL, "fail_2")) + goto cleanup; + + /* fail#3 (libbpf) pattern and cookies */ + LIBBPF_OPTS_RESET(opts, + .ids = NULL, + .cnt = 2, + .cookies = cookies, + ); + + skel->links.test_fentry = 
bpf_program__attach_tracing_multi(skel->progs.test_fentry, + "bpf_fentry_test*", &opts); + if (!ASSERT_EQ(libbpf_get_error(skel->links.test_fentry), -EINVAL, "fail_3")) + goto cleanup; + + /* fail#4 (libbpf) bogus pattern */ + skel->links.test_fentry = bpf_program__attach_tracing_multi(skel->progs.test_fentry, + "bpf_not_really_a_function*", NULL); + if (!ASSERT_EQ(libbpf_get_error(skel->links.test_fentry), -EINVAL, "fail_4")) + goto cleanup; + + /* fail#5 (kernel) abnormal cnt */ + LIBBPF_OPTS_RESET(opts, + .ids = ids, + .cnt = INT_MAX, + ); + + skel->links.test_fentry = bpf_program__attach_tracing_multi(skel->progs.test_fentry, + NULL, &opts); + if (!ASSERT_EQ(libbpf_get_error(skel->links.test_fentry), -E2BIG, "fail_5")) + goto cleanup; + + /* fail#6 (kernel) attach sleepable program to not-allowed function */ + ids2 = get_ids(func, 1, NULL); + if (!ASSERT_OK_PTR(ids2, "get_ids")) + goto cleanup; + + LIBBPF_OPTS_RESET(opts, + .ids = ids2, + .cnt = 1, + ); + + skel->links.test_fentry_s = bpf_program__attach_tracing_multi(skel->progs.test_fentry_s, + NULL, &opts); + if (!ASSERT_EQ(libbpf_get_error(skel->links.test_fentry_s), -EINVAL, "fail_6")) + goto cleanup; + + /* fail#7 (kernel) attach with duplicate id */ + ids[0] = ids2[0]; + ids[1] = ids2[0]; + + LIBBPF_OPTS_RESET(opts, + .ids = ids, + .cnt = 2, + ); + + skel->links.test_fentry = bpf_program__attach_tracing_multi(skel->progs.test_fentry, + NULL, &opts); + ASSERT_EQ(libbpf_get_error(skel->links.test_fentry), -EINVAL, "fail_7"); + +cleanup: + tracing_multi_fail__destroy(skel); + free(ids2); +} + +void serial_test_tracing_multi_bench_attach(void) +{ + LIBBPF_OPTS(bpf_tracing_multi_opts, opts); + struct tracing_multi_bench *skel = NULL; + long attach_start_ns, attach_end_ns; + long detach_start_ns, detach_end_ns; + double attach_delta, detach_delta; + struct bpf_link *link = NULL; + size_t i, cap = 0, cnt = 0; + struct ksyms *ksyms = NULL; + void *root = NULL; + void *dups = NULL; + __u32 *ids = NULL; + __u32 
nr, type_id; + struct btf *btf; + int err; + +#ifndef __x86_64__ + test__skip(); + return; +#endif + + btf = btf__load_vmlinux_btf(); + if (!ASSERT_OK_PTR(btf, "btf__load_vmlinux_btf")) + return; + + skel = tracing_multi_bench__open_and_load(); + if (!ASSERT_OK_PTR(skel, "tracing_multi_bench__open_and_load")) + goto cleanup; + + if (!ASSERT_OK(bpf_get_ksyms(&ksyms, true), "get_syms")) + goto cleanup; + + /* Get all ftrace 'safe' symbols.. */ + for (i = 0; i < ksyms->filtered_cnt; i++) { + if (!tsearch(&ksyms->filtered_syms[i], &root, compare)) { + ASSERT_FAIL("tsearch failed"); + goto cleanup; + } + } + + /* + * Collect names that are not unique in kallsyms. The kernel resolves a + * tracing-multi BTF id to an address with kallsyms_lookup_name(), which + * returns the first symbol of that name. For a duplicate name that may + * be a different (non-ftrace-able) instance than the ftrace-able one in + * available_filter_functions, so attaching to it by BTF id fails with + * -ENOENT (e.g. t_start/t_next/t_stop). ksyms->syms is sorted by name, + * so equal names are adjacent. + */ + for (i = 1; i < ksyms->sym_cnt; i++) { + if (strcmp(ksyms->syms[i].name, ksyms->syms[i - 1].name)) + continue; + if (!tsearch(&ksyms->syms[i].name, &dups, compare)) { + ASSERT_FAIL("tsearch failed"); + goto cleanup; + } + } + + /* ..and filter them through BTF and btf_type_is_traceable_func. */ + nr = btf__type_cnt(btf); + for (type_id = 1; type_id < nr; type_id++) { + const struct btf_type *type; + const char *str; + + type = btf__type_by_id(btf, type_id); + if (!type) + break; + + if (BTF_INFO_KIND(type->info) != BTF_KIND_FUNC) + continue; + + str = btf__name_by_offset(btf, type->name_off); + if (!str) + break; + + if (!tfind(&str, &root, compare)) + continue; + + /* Skip names that are not unique in kallsyms, see above. 
*/ + if (tfind(&str, &dups, compare)) + continue; + + if (!btf_type_is_traceable_func(btf, type)) + continue; + + err = libbpf_ensure_mem((void **) &ids, &cap, sizeof(*ids), cnt + 1); + if (err) + goto cleanup; + + ids[cnt++] = type_id; + } + + opts.ids = ids; + opts.cnt = cnt; + + attach_start_ns = get_time_ns(); + link = bpf_program__attach_tracing_multi(skel->progs.bench, NULL, &opts); + attach_end_ns = get_time_ns(); + + if (!ASSERT_OK_PTR(link, "bpf_program__attach_tracing_multi")) + goto cleanup; + + detach_start_ns = get_time_ns(); + bpf_link__destroy(link); + detach_end_ns = get_time_ns(); + + attach_delta = (attach_end_ns - attach_start_ns) / 1000000000.0; + detach_delta = (detach_end_ns - detach_start_ns) / 1000000000.0; + + printf("%s: found %lu functions\n", __func__, cnt); + printf("%s: attached in %7.3lfs\n", __func__, attach_delta); + printf("%s: detached in %7.3lfs\n", __func__, detach_delta); + +cleanup: + tracing_multi_bench__destroy(skel); + tdestroy(root, tdestroy_free_nop); + tdestroy(dups, tdestroy_free_nop); + free_kallsyms_local(ksyms); + free(ids); + btf__free(btf); +} + +static void tracing_multi_rollback_run(struct tracing_multi_rollback *skel) +{ + LIBBPF_OPTS(bpf_test_run_opts, topts); + int err, prog_fd; + + prog_fd = bpf_program__fd(skel->progs.test_fentry); + err = bpf_prog_test_run_opts(prog_fd, &topts); + ASSERT_OK(err, "test_run"); + + /* make sure the rollback code did not leave any program attached */ + ASSERT_EQ(skel->bss->test_result_fentry, 0, "test_result_fentry"); + ASSERT_EQ(skel->bss->test_result_fexit, 0, "test_result_fexit"); +} + +static void test_rollback_put(void) +{ + LIBBPF_OPTS(bpf_tracing_multi_opts, opts); + struct tracing_multi_rollback *skel = NULL; + size_t cnt = FUNCS_CNT; + __u32 *ids = NULL; + int err; + + skel = tracing_multi_rollback__open(); + if (!ASSERT_OK_PTR(skel, "tracing_multi_rollback__open")) + return; + + bpf_program__set_autoload(skel->progs.test_fentry, true); + 
bpf_program__set_autoload(skel->progs.test_fexit, true); + + err = tracing_multi_rollback__load(skel); + if (!ASSERT_OK(err, "tracing_multi_rollback__load")) + goto cleanup; + + ids = get_ids(bpf_fentry_test, cnt, NULL); + if (!ASSERT_OK_PTR(ids, "get_ids")) + goto cleanup; + + /* + * Mangle last id to trigger rollback, which needs to do put + * on get-ed trampolines. + */ + ids[9] = 0; + + opts.ids = ids; + opts.cnt = cnt; + + skel->bss->pid = getpid(); + + skel->links.test_fentry = bpf_program__attach_tracing_multi(skel->progs.test_fentry, + NULL, &opts); + if (!ASSERT_ERR_PTR(skel->links.test_fentry, "bpf_program__attach_tracing_multi")) + goto cleanup; + + skel->links.test_fexit = bpf_program__attach_tracing_multi(skel->progs.test_fexit, + NULL, &opts); + if (!ASSERT_ERR_PTR(skel->links.test_fexit, "bpf_program__attach_tracing_multi")) + goto cleanup; + + /* We don't really attach any program, but let's make sure. */ + tracing_multi_rollback_run(skel); + +cleanup: + tracing_multi_rollback__destroy(skel); + free(ids); +} + +static void fillers_cleanup(struct tracing_multi_rollback **skels, int cnt) +{ + int i; + + for (i = 0; i < cnt; i++) + tracing_multi_rollback__destroy(skels[i]); + + free(skels); +} + +static struct tracing_multi_rollback *extra_load_and_link(void) +{ + struct tracing_multi_rollback *skel; + int err; + + skel = tracing_multi_rollback__open(); + if (!ASSERT_OK_PTR(skel, "tracing_multi_rollback__open")) + goto cleanup; + + bpf_program__set_autoload(skel->progs.extra, true); + + err = tracing_multi_rollback__load(skel); + if (!ASSERT_OK(err, "tracing_multi_rollback__load")) + goto cleanup; + + skel->links.extra = bpf_program__attach_trace(skel->progs.extra); + if (!ASSERT_OK_PTR(skel->links.extra, "bpf_program__attach_trace")) + goto cleanup; + + return skel; + +cleanup: + tracing_multi_rollback__destroy(skel); + return NULL; +} + +static struct tracing_multi_rollback **fillers_load_and_link(int max) +{ + struct tracing_multi_rollback **skels, 
*skel; + int i, err; + + skels = calloc(max + 1, sizeof(*skels)); + if (!ASSERT_OK_PTR(skels, "calloc")) + return NULL; + + for (i = 0; i < max; i++) { + skel = skels[i] = tracing_multi_rollback__open(); + if (!ASSERT_OK_PTR(skels[i], "tracing_multi_rollback__open")) + goto cleanup; + + bpf_program__set_autoload(skel->progs.filler, true); + + err = tracing_multi_rollback__load(skel); + if (!ASSERT_OK(err, "tracing_multi_rollback__load")) + goto cleanup; + + skel->links.filler = bpf_program__attach_trace(skel->progs.filler); + if (!ASSERT_OK_PTR(skels[i]->links.filler, "bpf_program__attach_trace")) + goto cleanup; + } + + return skels; + +cleanup: + fillers_cleanup(skels, i + 1); + return NULL; +} + +static void test_rollback_unlink(void) +{ + struct tracing_multi_rollback *skel = NULL, *extra; + LIBBPF_OPTS(bpf_tracing_multi_opts, opts); + struct tracing_multi_rollback **fillers; + size_t cnt = FUNCS_CNT; + __u32 *ids = NULL; + int err, max; + + max = get_bpf_max_tramp_links(); + if (!ASSERT_GE(max, 1, "bpf_max_tramp_links")) + return; + + /* Attach maximum allowed programs to bpf_fentry_test10 */ + fillers = fillers_load_and_link(max); + if (!ASSERT_OK_PTR(fillers, "fillers_load_and_link")) + return; + + extra = extra_load_and_link(); + if (!ASSERT_OK_PTR(extra, "extra_load_and_link")) + goto cleanup; + + skel = tracing_multi_rollback__open(); + if (!ASSERT_OK_PTR(skel, "tracing_multi_rollback__open")) + goto cleanup; + + bpf_program__set_autoload(skel->progs.test_fentry, true); + bpf_program__set_autoload(skel->progs.test_fexit, true); + + /* + * Attach tracing_multi link on bpf_fentry_test1-10, which will + * fail on bpf_fentry_test10 function, because it already has + * maximum allowed programs attached. + * + * The rollback needs to unlink already link-ed trampolines and + * put all of them. 
+ */ + err = tracing_multi_rollback__load(skel); + if (!ASSERT_OK(err, "tracing_multi_rollback__load")) + goto cleanup; + + ids = get_ids(bpf_fentry_test, cnt, NULL); + if (!ASSERT_OK_PTR(ids, "get_ids")) + goto cleanup; + + opts.ids = ids; + opts.cnt = cnt; + + skel->bss->pid = getpid(); + + skel->links.test_fentry = bpf_program__attach_tracing_multi(skel->progs.test_fentry, + NULL, &opts); + if (!ASSERT_ERR_PTR(skel->links.test_fentry, "bpf_program__attach_tracing_multi")) + goto cleanup; + + skel->links.test_fexit = bpf_program__attach_tracing_multi(skel->progs.test_fexit, + NULL, &opts); + if (!ASSERT_ERR_PTR(skel->links.test_fexit, "bpf_program__attach_tracing_multi")) + goto cleanup; + + tracing_multi_rollback_run(skel); + +cleanup: + fillers_cleanup(fillers, max); + tracing_multi_rollback__destroy(extra); + tracing_multi_rollback__destroy(skel); + free(ids); +} + +void serial_test_tracing_multi_attach_rollback(void) +{ + if (test__start_subtest("put")) + test_rollback_put(); + if (test__start_subtest("unlink")) + test_rollback_unlink(); +} + +void test_tracing_multi_test(void) +{ +#ifndef __x86_64__ + test__skip(); + return; +#endif + + if (test__start_subtest("skel_api")) + test_skel_api(); + if (test__start_subtest("link_api_pattern")) + test_link_api_pattern(); + if (test__start_subtest("link_api_ids")) + test_link_api_ids(false); + if (test__start_subtest("module_skel_api")) + test_module_skel_api(); + if (test__start_subtest("module_link_api_pattern")) + test_module_link_api_pattern(); + if (test__start_subtest("module_link_api_ids")) + test_module_link_api_ids(); + if (test__start_subtest("intersect")) + test_intersect(); + if (test__start_subtest("cookies")) + test_link_api_ids(true); + if (test__start_subtest("session")) + test_session(); + if (test__start_subtest("attach_api_fails")) + test_attach_api_fails(); + RUN_TESTS(tracing_multi_verifier); +} diff --git a/tools/testing/selftests/bpf/prog_tests/uprobe_multi_test.c 
b/tools/testing/selftests/bpf/prog_tests/uprobe_multi_test.c index 56cbea280fbd..f0baf5738b75 100644 --- a/tools/testing/selftests/bpf/prog_tests/uprobe_multi_test.c +++ b/tools/testing/selftests/bpf/prog_tests/uprobe_multi_test.c @@ -2,6 +2,7 @@ #include <unistd.h> #include <pthread.h> +#include <fcntl.h> #include <test_progs.h> #include "uprobe_multi.skel.h" #include "uprobe_multi_bench.skel.h" @@ -536,7 +537,37 @@ static void test_attach_api_fails(void) link_fd = bpf_link_create(prog_fd, 0, BPF_TRACE_UPROBE_MULTI, &opts); if (!ASSERT_ERR(link_fd, "link_fd")) goto cleanup; - ASSERT_EQ(link_fd, -EINVAL, "pid_is_wrong"); + if (!ASSERT_EQ(link_fd, -EINVAL, "pid_is_wrong")) + goto cleanup; + + /* wrong path_fd */ + LIBBPF_OPTS_RESET(opts, + .uprobe_multi.path = NULL, + .uprobe_multi.path_fd = -1, + .uprobe_multi.flags = BPF_F_UPROBE_MULTI_PATH_FD, + .uprobe_multi.offsets = (unsigned long *)&offset, + .uprobe_multi.cnt = 1, + ); + + link_fd = bpf_link_create(prog_fd, 0, BPF_TRACE_UPROBE_MULTI, &opts); + if (!ASSERT_ERR(link_fd, "link_fd")) + goto cleanup; + if (!ASSERT_EQ(link_fd, -EBADF, "path_fd_is_wrong")) + goto cleanup; + + /* path and path_fd both set with BPF_F_UPROBE_MULTI_PATH_FD flag */ + LIBBPF_OPTS_RESET(opts, + .uprobe_multi.path = path, + .uprobe_multi.path_fd = 1, + .uprobe_multi.flags = BPF_F_UPROBE_MULTI_PATH_FD, + .uprobe_multi.offsets = (unsigned long *)&offset, + .uprobe_multi.cnt = 1, + ); + + link_fd = bpf_link_create(prog_fd, 0, BPF_TRACE_UPROBE_MULTI, &opts); + if (!ASSERT_ERR(link_fd, "link_fd")) + goto cleanup; + ASSERT_EQ(link_fd, -EINVAL, "path_and_path_fd_together"); cleanup: if (link_fd >= 0) @@ -757,6 +788,65 @@ static void test_link_api(void) __test_link_api(&child); } +static void test_link_api_path_fd(void) +{ + LIBBPF_OPTS(bpf_link_create_opts, opts); + const char *resolve_path = "/proc/self/exe"; + int prog_fd, link_fd = -1, path_fd = -1; + struct uprobe_multi *skel = NULL; + unsigned long *offsets = NULL; + const char *syms[3] = { 
+ "uprobe_multi_func_1", + "uprobe_multi_func_2", + "uprobe_multi_func_3", + }; + int err; + + err = elf_resolve_syms_offsets(resolve_path, ARRAY_SIZE(syms), syms, + &offsets, STT_FUNC); + if (!ASSERT_OK(err, "elf_resolve_syms_offsets")) + return; + + path_fd = open(resolve_path, O_RDONLY); + if (!ASSERT_GE(path_fd, 0, "path_fd")) + goto cleanup; + + opts.uprobe_multi.path_fd = path_fd; + opts.uprobe_multi.offsets = offsets; + opts.uprobe_multi.cnt = ARRAY_SIZE(syms); + opts.uprobe_multi.flags = BPF_F_UPROBE_MULTI_PATH_FD; + + skel = uprobe_multi__open_and_load(); + if (!ASSERT_OK_PTR(skel, "uprobe_multi__open_and_load")) + goto cleanup; + + prog_fd = bpf_program__fd(skel->progs.uprobe); + link_fd = bpf_link_create(prog_fd, 0, BPF_TRACE_UPROBE_MULTI, &opts); + if (!ASSERT_GE(link_fd, 0, "bpf_link_create")) + goto cleanup; + + skel->bss->uprobe_multi_func_1_addr = (__u64)uprobe_multi_func_1; + skel->bss->uprobe_multi_func_2_addr = (__u64)uprobe_multi_func_2; + skel->bss->uprobe_multi_func_3_addr = (__u64)uprobe_multi_func_3; + skel->bss->pid = getpid(); + + uprobe_multi_func_1(); + uprobe_multi_func_2(); + uprobe_multi_func_3(); + + ASSERT_EQ(skel->bss->uprobe_multi_func_1_result, 1, "uprobe_multi_func_1_result"); + ASSERT_EQ(skel->bss->uprobe_multi_func_2_result, 1, "uprobe_multi_func_2_result"); + ASSERT_EQ(skel->bss->uprobe_multi_func_3_result, 1, "uprobe_multi_func_3_result"); + +cleanup: + if (link_fd >= 0) + close(link_fd); + if (path_fd >= 0) + close(path_fd); + uprobe_multi__destroy(skel); + free(offsets); +} + static struct bpf_program * get_program(struct uprobe_multi_consumers *skel, int prog) { @@ -1354,6 +1444,8 @@ void test_uprobe_multi_test(void) test_attach_api_syms(); if (test__start_subtest("link_api")) test_link_api(); + if (test__start_subtest("link_api_path_fd")) + test_link_api_path_fd(); if (test__start_subtest("bench_uprobe")) test_bench_attach_uprobe(); if (test__start_subtest("bench_usdt")) diff --git 
a/tools/testing/selftests/bpf/prog_tests/verifier.c b/tools/testing/selftests/bpf/prog_tests/verifier.c index a96b25ebff23..8a3d69e2453c 100644 --- a/tools/testing/selftests/bpf/prog_tests/verifier.c +++ b/tools/testing/selftests/bpf/prog_tests/verifier.c @@ -22,6 +22,7 @@ #include "verifier_bswap.skel.h" #include "verifier_btf_ctx_access.skel.h" #include "verifier_btf_unreliable_prog.skel.h" +#include "verifier_call_large_imm.skel.h" #include "verifier_cfg.skel.h" #include "verifier_cgroup_inv_retcode.skel.h" #include "verifier_cgroup_skb.skel.h" @@ -37,6 +38,7 @@ #include "verifier_div0.skel.h" #include "verifier_div_mod_bounds.skel.h" #include "verifier_div_overflow.skel.h" +#include "verifier_flow_keys.skel.h" #include "verifier_global_subprogs.skel.h" #include "verifier_global_ptr_args.skel.h" #include "verifier_gotol.skel.h" @@ -91,6 +93,8 @@ #include "verifier_sockmap_mutate.skel.h" #include "verifier_spill_fill.skel.h" #include "verifier_spin_lock.skel.h" +#include "verifier_stack_arg.skel.h" +#include "verifier_stack_arg_order.skel.h" #include "verifier_stack_ptr.skel.h" #include "verifier_store_release.skel.h" #include "verifier_subprog_precision.skel.h" @@ -114,6 +118,7 @@ #include "verifier_xdp.skel.h" #include "verifier_xdp_direct_packet_access.skel.h" #include "verifier_bits_iter.skel.h" +#include "verifier_set_retval.skel.h" #include "verifier_lsm.skel.h" #include "verifier_jit_inline.skel.h" #include "irq.skel.h" @@ -170,6 +175,7 @@ void test_verifier_bpf_trap(void) { RUN(verifier_bpf_trap); } void test_verifier_bswap(void) { RUN(verifier_bswap); } void test_verifier_btf_ctx_access(void) { RUN(verifier_btf_ctx_access); } void test_verifier_btf_unreliable_prog(void) { RUN(verifier_btf_unreliable_prog); } +void test_verifier_call_large_imm(void) { RUN(verifier_call_large_imm); } void test_verifier_cfg(void) { RUN(verifier_cfg); } void test_verifier_cgroup_inv_retcode(void) { RUN(verifier_cgroup_inv_retcode); } void test_verifier_cgroup_skb(void) { 
RUN(verifier_cgroup_skb); } @@ -185,6 +191,7 @@ void test_verifier_direct_stack_access_wraparound(void) { RUN(verifier_direct_st void test_verifier_div0(void) { RUN(verifier_div0); } void test_verifier_div_mod_bounds(void) { RUN(verifier_div_mod_bounds); } void test_verifier_div_overflow(void) { RUN(verifier_div_overflow); } +void test_verifier_flow_keys(void) { RUN(verifier_flow_keys); } void test_verifier_global_subprogs(void) { RUN(verifier_global_subprogs); } void test_verifier_global_ptr_args(void) { RUN(verifier_global_ptr_args); } void test_verifier_gotol(void) { RUN(verifier_gotol); } @@ -238,6 +245,8 @@ void test_verifier_sock_addr(void) { RUN(verifier_sock_addr); } void test_verifier_sockmap_mutate(void) { RUN(verifier_sockmap_mutate); } void test_verifier_spill_fill(void) { RUN(verifier_spill_fill); } void test_verifier_spin_lock(void) { RUN(verifier_spin_lock); } +void test_verifier_stack_arg(void) { RUN(verifier_stack_arg); } +void test_verifier_stack_arg_order(void) { RUN(verifier_stack_arg_order); } void test_verifier_stack_ptr(void) { RUN(verifier_stack_ptr); } void test_verifier_store_release(void) { RUN(verifier_store_release); } void test_verifier_subprog_precision(void) { RUN(verifier_subprog_precision); } @@ -260,6 +269,7 @@ void test_verifier_xadd(void) { RUN(verifier_xadd); } void test_verifier_xdp(void) { RUN(verifier_xdp); } void test_verifier_xdp_direct_packet_access(void) { RUN(verifier_xdp_direct_packet_access); } void test_verifier_bits_iter(void) { RUN(verifier_bits_iter); } +void test_verifier_set_retval(void) { RUN(verifier_set_retval); } void test_verifier_lsm(void) { RUN(verifier_lsm); } void test_irq(void) { RUN(irq); } void test_verifier_mtu(void) { RUN(verifier_mtu); } diff --git a/tools/testing/selftests/bpf/prog_tests/verifier_log.c b/tools/testing/selftests/bpf/prog_tests/verifier_log.c index c01c0114af1b..4542bb586d72 100644 --- a/tools/testing/selftests/bpf/prog_tests/verifier_log.c +++ 
b/tools/testing/selftests/bpf/prog_tests/verifier_log.c @@ -317,6 +317,7 @@ static void verif_btf_log_subtest(bool bad_btf) res = load_btf(&opts, true); ASSERT_EQ(res, -ENOSPC, "half_log_fd"); ASSERT_EQ(strlen(logs.buf), 24, "log_fixed_25"); + strscpy(op_name, "log_fixed", sizeof(op_name)); ASSERT_STRNEQ(logs.buf, logs.reference, 24, op_name); /* validate rolling verifier log logic: try all variations of log buf diff --git a/tools/testing/selftests/bpf/prog_tests/vmlinux.c b/tools/testing/selftests/bpf/prog_tests/vmlinux.c index 6fb2217d940b..b5fdd593910d 100644 --- a/tools/testing/selftests/bpf/prog_tests/vmlinux.c +++ b/tools/testing/selftests/bpf/prog_tests/vmlinux.c @@ -14,21 +14,61 @@ static void nsleep() (void)syscall(__NR_nanosleep, &ts, NULL); } +static const char *hrtimer_func = "hrtimer_start_range_ns"; + +static int setup_hrtimer_progs(struct test_vmlinux *skel) +{ + int err; + + if (libbpf_find_vmlinux_btf_id("hrtimer_start_range_ns_user", BPF_TRACE_FENTRY) > 0) + hrtimer_func = "hrtimer_start_range_ns_user"; + + err = bpf_program__set_attach_target(skel->progs.handle__fentry, 0, hrtimer_func); + if (err) + return err; + + /* + * Bare SEC("kprobe") has no target function, so attach it manually + * later after selecting the hrtimer function to probe. 
+ */ + bpf_program__set_autoattach(skel->progs.handle__kprobe, false); + + return 0; +} + void test_vmlinux(void) { int err; struct test_vmlinux* skel; struct test_vmlinux__bss *bss; + struct bpf_link *kprobe_link = NULL; - skel = test_vmlinux__open_and_load(); - if (!ASSERT_OK_PTR(skel, "test_vmlinux__open_and_load")) + skel = test_vmlinux__open(); + if (!ASSERT_OK_PTR(skel, "test_vmlinux__open")) return; + + err = setup_hrtimer_progs(skel); + if (!ASSERT_OK(err, "setup_hrtimer_progs")) + goto cleanup; + + err = test_vmlinux__load(skel); + if (!ASSERT_OK(err, "test_vmlinux__load")) + goto cleanup; + bss = skel->bss; err = test_vmlinux__attach(skel); if (!ASSERT_OK(err, "test_vmlinux__attach")) goto cleanup; + /* manually attach kprobe with the selected function */ + if (hrtimer_func) { + kprobe_link = bpf_program__attach_kprobe(skel->progs.handle__kprobe, + false /* retprobe */, hrtimer_func); + if (!ASSERT_OK_PTR(kprobe_link, "bpf_program__attach_kprobe")) + goto cleanup; + } + /* trigger everything */ nsleep(); @@ -39,5 +79,6 @@ void test_vmlinux(void) ASSERT_TRUE(bss->fentry_called, "fentry"); cleanup: + bpf_link__destroy(kprobe_link); test_vmlinux__destroy(skel); } diff --git a/tools/testing/selftests/bpf/prog_tests/wakeup_source.c b/tools/testing/selftests/bpf/prog_tests/wakeup_source.c new file mode 100644 index 000000000000..ebfdc03271b9 --- /dev/null +++ b/tools/testing/selftests/bpf/prog_tests/wakeup_source.c @@ -0,0 +1,118 @@ +// SPDX-License-Identifier: GPL-2.0 +/* Copyright 2026 Google LLC */ + +#include <test_progs.h> +#include <bpf/btf.h> +#include <fcntl.h> +#include "test_wakeup_source.skel.h" +#include "wakeup_source_fail.skel.h" +#include "progs/wakeup_source.h" + +static int lock_ws(const char *name) +{ + int fd; + ssize_t bytes; + + fd = open("/sys/power/wake_lock", O_WRONLY); + if (!ASSERT_OK_FD(fd, "open /sys/power/wake_lock")) + return -1; + + bytes = write(fd, name, strlen(name)); + close(fd); + if (!ASSERT_EQ(bytes, strlen(name), "write to 
wake_lock")) + return -1; + + return 0; +} + +static void unlock_ws(const char *name) +{ + int fd; + + fd = open("/sys/power/wake_unlock", O_WRONLY); + if (fd < 0) + return; + + write(fd, name, strlen(name)); + close(fd); +} + +struct rb_ctx { + const char *name; + bool found; + long long active_time_ns; + long long total_time_ns; +}; + +static int process_sample(void *ctx, void *data, size_t len) +{ + struct rb_ctx *rb_ctx = ctx; + struct wakeup_event_t *e = data; + + if (strcmp(e->name, rb_ctx->name) == 0) { + rb_ctx->found = true; + rb_ctx->active_time_ns = e->active_time_ns; + rb_ctx->total_time_ns = e->total_time_ns; + } + return 0; +} + +void test_wakeup_source(void) +{ + struct btf *btf; + int id; + + btf = btf__load_vmlinux_btf(); + if (!ASSERT_OK_PTR(btf, "btf_vmlinux")) + return; + + id = btf__find_by_name_kind(btf, "bpf_wakeup_sources_get_head", BTF_KIND_FUNC); + btf__free(btf); + + if (id < 0) { + printf("%s:SKIP:bpf_wakeup_sources_get_head kfunc not found in BTF\n", __func__); + test__skip(); + return; + } + + if (test__start_subtest("iterate_and_verify_times")) { + struct test_wakeup_source *skel; + struct ring_buffer *rb = NULL; + struct rb_ctx rb_ctx = { + .name = "bpf_selftest_ws_times", + .found = false, + }; + int err; + + skel = test_wakeup_source__open_and_load(); + if (!ASSERT_OK_PTR(skel, "skel_open_and_load")) + return; + + rb = ring_buffer__new(bpf_map__fd(skel->maps.rb), process_sample, &rb_ctx, NULL); + if (!ASSERT_OK_PTR(rb, "ring_buffer__new")) + goto destroy; + + /* Create a temporary wakeup source */ + if (!ASSERT_OK(lock_ws(rb_ctx.name), "lock_ws")) + goto unlock; + + err = bpf_prog_test_run_opts(bpf_program__fd( + skel->progs.iterate_wakeupsources), NULL); + ASSERT_OK(err, "bpf_prog_test_run"); + + ring_buffer__consume(rb); + + ASSERT_TRUE(rb_ctx.found, "found_test_ws_in_rb"); + ASSERT_GT(rb_ctx.active_time_ns, 0, "active_time_gt_0"); + ASSERT_GT(rb_ctx.total_time_ns, 0, "total_time_gt_0"); + +unlock: + unlock_ws(rb_ctx.name); 
+destroy: + if (rb) + ring_buffer__free(rb); + test_wakeup_source__destroy(skel); + } + + RUN_TESTS(wakeup_source_fail); +} diff --git a/tools/testing/selftests/bpf/prog_tests/xdp_context_test_run.c b/tools/testing/selftests/bpf/prog_tests/xdp_context_test_run.c index 26159e0499c7..448807676176 100644 --- a/tools/testing/selftests/bpf/prog_tests/xdp_context_test_run.c +++ b/tools/testing/selftests/bpf/prog_tests/xdp_context_test_run.c @@ -1,6 +1,8 @@ // SPDX-License-Identifier: GPL-2.0 #include <test_progs.h> #include <network_helpers.h> +#include <linux/ipv6.h> +#include <arpa/inet.h> #include "test_xdp_context_test_run.skel.h" #include "test_xdp_meta.skel.h" @@ -8,9 +10,12 @@ #define TX_NAME "veth1" #define TX_NETNS "xdp_context_tx" #define RX_NETNS "xdp_context_rx" +#define RX_MAC "02:00:00:00:00:01" +#define TX_MAC "02:00:00:00:00:02" #define TAP_NAME "tap0" #define DUMMY_NAME "dum0" #define TAP_NETNS "xdp_context_tuntap" +#define LWT_NETNS "xdp_context_lwt" #define TEST_PAYLOAD_LEN 32 static const __u8 test_payload[TEST_PAYLOAD_LEN] = { @@ -187,6 +192,42 @@ static int write_test_packet(int tap_fd) return 0; } +/* Inject Ethernet+IPv6+UDP frame into TAP */ +static int write_test_packet_udp(int tap_fd) +{ + __u8 pkt[sizeof(struct ethhdr) + sizeof(struct ipv6hdr) + + sizeof(struct udphdr) + TEST_PAYLOAD_LEN] = {}; + struct ethhdr *eth = (void *)pkt; + struct ipv6hdr *ip6 = (void *)(eth + 1); + struct udphdr *udp = (void *)(ip6 + 1); + __u8 *payload = (void *)(udp + 1); + const __u8 tap_mac[ETH_ALEN] = { 0x02, 0, 0, 0, 0, 0x01 }; + int n; + + memcpy(eth->h_dest, tap_mac, ETH_ALEN); + eth->h_proto = htons(ETH_P_IPV6); + + ip6->version = 6; + ip6->hop_limit = 64; + ip6->nexthdr = IPPROTO_UDP; + ip6->payload_len = htons(sizeof(*udp) + TEST_PAYLOAD_LEN); + inet_pton(AF_INET6, "fd00::2", &ip6->saddr); + inet_pton(AF_INET6, "fd00:1::1", &ip6->daddr); + + udp->source = htons(42); + udp->dest = htons(42); + udp->len = htons(sizeof(*udp) + TEST_PAYLOAD_LEN); + /* UDP 
checksum is not validated on the forwarding path. */ + + memcpy(payload, test_payload, TEST_PAYLOAD_LEN); + + n = write(tap_fd, pkt, sizeof(pkt)); + if (!ASSERT_EQ(n, sizeof(pkt), "write frame")) + return -1; + + return 0; +} + static void dump_err_stream(const struct bpf_program *prog) { char buf[512]; @@ -518,3 +559,137 @@ void test_xdp_context_tuntap(void) test_xdp_meta__destroy(skel); } + +/* + * Test topology: + * + * tap0 fd00::1 + * RX: injected IPv6 UDP frame, XDP ingress sets metadata + * fwd: encap route prepends outer header(s) + * TX: TC egress validates metadata + * + * A routable IPv6 UDP frame is written into the tap fd, so it enters the RX + * path where XDP stores metadata. Routing then forwards it back out the same + * tap through an encapsulating route that prepends outer header(s). The TC + * egress program checks that the pushed header did not silently corrupt + * metadata. + */ +#define LWT_PIN_PATH "/sys/fs/bpf/xdp_context_lwt_xmit" + +enum lwt_encap_type { + LWT_ENCAP_BPF, + LWT_ENCAP_MPLS, + LWT_ENCAP_SEG6, + LWT_ENCAP_IOAM6, +}; + +static void test_lwt_encap(struct test_xdp_meta *skel, + enum lwt_encap_type type) +{ + LIBBPF_OPTS(bpf_tc_hook, tc_hook, .attach_point = BPF_TC_EGRESS); + LIBBPF_OPTS(bpf_tc_opts, tc_opts, .handle = 1, .priority = 1); + struct bpf_program *lwt_prog = NULL; + struct netns_obj *ns = NULL; + const char *encap; + bool pinned = false; + int tap_ifindex; + int tap_fd = -1; + int ret; + + skel->bss->test_pass = false; + + switch (type) { + case LWT_ENCAP_BPF: + encap = "encap bpf xmit pinned " LWT_PIN_PATH " via fd00::2"; + lwt_prog = skel->progs.dummy_lwt_xmit; + break; + case LWT_ENCAP_MPLS: + encap = "encap mpls 100 via inet6 fd00::2"; + break; + case LWT_ENCAP_SEG6: + encap = "encap seg6 mode encap segs fd00::2"; + break; + case LWT_ENCAP_IOAM6: + encap = "encap ioam6 mode encap tundst fd00::2 " + "trace prealloc type 0x800000 ns 0 size 4 via fd00::2"; + break; + default: + return; + } + + if (lwt_prog) { + 
unlink(LWT_PIN_PATH); + ret = bpf_program__pin(lwt_prog, LWT_PIN_PATH); + if (!ASSERT_OK(ret, "pin lwt prog")) + return; + pinned = true; + } + + ns = netns_new(LWT_NETNS, true); + if (!ASSERT_OK_PTR(ns, "netns_new")) + goto close; + + tap_fd = open_tuntap(TAP_NAME, true); + if (!ASSERT_GE(tap_fd, 0, "open_tuntap")) + goto close; + + SYS(close, "ip link set dev " TAP_NAME " address " RX_MAC); + SYS(close, "sysctl -wq net.ipv6.conf.all.forwarding=1"); + SYS(close, "ip addr add fd00::1/64 dev " TAP_NAME " nodad"); + SYS(close, "ip link set dev " TAP_NAME " up"); + SYS(close, "ip neigh add fd00::2 lladdr " TX_MAC " nud permanent dev " TAP_NAME); + SYS(close, "ip -6 route add fd00:1::/64 %s dev %s", encap, TAP_NAME); + + tap_ifindex = if_nametoindex(TAP_NAME); + if (!ASSERT_GE(tap_ifindex, 0, "if_nametoindex")) + goto close; + + ret = bpf_xdp_attach(tap_ifindex, bpf_program__fd(skel->progs.ing_xdp), + 0, NULL); + if (!ASSERT_GE(ret, 0, "bpf_xdp_attach")) + goto close; + + tc_hook.ifindex = tap_ifindex; + ret = bpf_tc_hook_create(&tc_hook); + if (!ASSERT_OK(ret, "bpf_tc_hook_create")) + goto close; + + tc_opts.prog_fd = bpf_program__fd(skel->progs.tc_is_meta_empty); + ret = bpf_tc_attach(&tc_hook, &tc_opts); + if (!ASSERT_OK(ret, "bpf_tc_attach")) + goto close; + + ret = write_test_packet_udp(tap_fd); + if (!ASSERT_OK(ret, "write_test_packet_udp")) + goto close; + + if (!ASSERT_TRUE(skel->bss->test_pass, "test_pass")) + dump_err_stream(skel->progs.tc_is_meta_empty); + +close: + if (tap_fd >= 0) + close(tap_fd); + netns_free(ns); + if (pinned) + unlink(LWT_PIN_PATH); +} + +void test_xdp_context_lwt_encap(void) +{ + struct test_xdp_meta *skel; + + skel = test_xdp_meta__open_and_load(); + if (!ASSERT_OK_PTR(skel, "open and load skeleton")) + return; + + if (test__start_subtest("bpf_encap")) + test_lwt_encap(skel, LWT_ENCAP_BPF); + if (test__start_subtest("mpls_encap")) + test_lwt_encap(skel, LWT_ENCAP_MPLS); + if (test__start_subtest("seg6_encap")) + test_lwt_encap(skel, 
LWT_ENCAP_SEG6); + if (test__start_subtest("ioam6_encap")) + test_lwt_encap(skel, LWT_ENCAP_IOAM6); + + test_xdp_meta__destroy(skel); +} diff --git a/tools/testing/selftests/bpf/progs/arena_atomics.c b/tools/testing/selftests/bpf/progs/arena_atomics.c index d1841aac94a2..2e7751a85399 100644 --- a/tools/testing/selftests/bpf/progs/arena_atomics.c +++ b/tools/testing/selftests/bpf/progs/arena_atomics.c @@ -5,7 +5,7 @@ #include <bpf/bpf_tracing.h> #include <stdbool.h> #include <stdatomic.h> -#include "bpf_arena_common.h" +#include <bpf_arena_common.h> #include "../../../include/linux/filter.h" #include "bpf_misc.h" diff --git a/tools/testing/selftests/bpf/progs/arena_spin_lock.c b/tools/testing/selftests/bpf/progs/arena_spin_lock.c index 086b57a426cf..cf7cda79c16c 100644 --- a/tools/testing/selftests/bpf/progs/arena_spin_lock.c +++ b/tools/testing/selftests/bpf/progs/arena_spin_lock.c @@ -4,7 +4,8 @@ #include <bpf/bpf_tracing.h> #include <bpf/bpf_helpers.h> #include "bpf_misc.h" -#include "bpf_arena_spin_lock.h" +#include <bpf_arena_common.h> +#include <bpf_arena_spin_lock.h> struct { __uint(type, BPF_MAP_TYPE_ARENA); diff --git a/tools/testing/selftests/bpf/progs/bench_bpf_timing.bpf.h b/tools/testing/selftests/bpf/progs/bench_bpf_timing.bpf.h new file mode 100644 index 000000000000..6a1ad75f1fd7 --- /dev/null +++ b/tools/testing/selftests/bpf/progs/bench_bpf_timing.bpf.h @@ -0,0 +1,69 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* Copyright (c) 2026 Meta Platforms, Inc. and affiliates. 
*/ + +#ifndef __BENCH_BPF_TIMING_BPF_H__ +#define __BENCH_BPF_TIMING_BPF_H__ + +#include <stdbool.h> +#include <linux/bpf.h> +#include <bpf/bpf_helpers.h> +#include <bpf_may_goto.h> + +#ifndef BENCH_NR_SAMPLES +#define BENCH_NR_SAMPLES 4096 +#endif +#ifndef BENCH_NR_CPUS +#define BENCH_NR_CPUS 256 +#endif +#define BENCH_CPU_MASK (BENCH_NR_CPUS - 1) + +__u64 timing_samples[BENCH_NR_CPUS][BENCH_NR_SAMPLES]; +__u32 timing_idx[BENCH_NR_CPUS]; + +volatile __u32 batch_iters; +volatile __u32 timing_enabled; + +static __always_inline void bench_record_sample(__u64 elapsed_ns) +{ + __u32 cpu, idx; + + if (!timing_enabled) + return; + + cpu = bpf_get_smp_processor_id() & BENCH_CPU_MASK; + idx = timing_idx[cpu]; + + if (idx >= BENCH_NR_SAMPLES) + return; + + timing_samples[cpu][idx] = elapsed_ns; + timing_idx[cpu] = idx + 1; +} + +/* + * @body: expression to time; return value (int) stored in __bench_result. + * @reset: undo body's side-effects so each iteration starts identically. + * May reference __bench_result. Use ({}) for empty reset. + * + * Runs batch_iters timed iterations, then one untimed iteration whose + * return value the macro evaluates to (for validation). 
+ */ +#define BENCH_BPF_LOOP(body, reset) ({ \ + __u64 __bench_start = bpf_ktime_get_ns(); \ + __u32 __bench_i; \ + int __bench_result; \ + \ + for (__bench_i = 0; \ + __bench_i < batch_iters && can_loop; \ + __bench_i++) { \ + __bench_result = (body); \ + reset; \ + } \ + \ + bench_record_sample(bpf_ktime_get_ns() - __bench_start); \ + \ + __bench_result = (body); \ + __bench_result; \ +}) + +#endif /* __BENCH_BPF_TIMING_BPF_H__ */ diff --git a/tools/testing/selftests/bpf/progs/bpf_cubic.c b/tools/testing/selftests/bpf/progs/bpf_cubic.c index ce18a4db813f..ebd5a1e69f56 100644 --- a/tools/testing/selftests/bpf/progs/bpf_cubic.c +++ b/tools/testing/selftests/bpf/progs/bpf_cubic.c @@ -16,6 +16,7 @@ #include "bpf_tracing_net.h" #include <bpf/bpf_tracing.h> +#include <errno.h> char _license[] SEC("license") = "GPL"; @@ -170,10 +171,18 @@ static void bictcp_hystart_reset(struct sock *sk) ca->sample_cnt = 0; } +bool nodelay_init_reject = false; +bool nodelay_cwnd_event_tx_start_reject = false; + SEC("struct_ops") void BPF_PROG(bpf_cubic_init, struct sock *sk) { struct bpf_bictcp *ca = inet_csk_ca(sk); + int true_val = 1, ret; + + ret = bpf_setsockopt(sk, SOL_TCP, TCP_NODELAY, &true_val, sizeof(true_val)); + if (ret == -EOPNOTSUPP) + nodelay_init_reject = true; bictcp_reset(ca); @@ -189,8 +198,13 @@ void BPF_PROG(bpf_cubic_cwnd_event_tx_start, struct sock *sk) { struct bpf_bictcp *ca = inet_csk_ca(sk); __u32 now = tcp_jiffies32; + int true_val = 1, ret; __s32 delta; + ret = bpf_setsockopt(sk, SOL_TCP, TCP_NODELAY, &true_val, sizeof(true_val)); + if (ret == -EOPNOTSUPP) + nodelay_cwnd_event_tx_start_reject = true; + delta = now - tcp_sk(sk)->lsndtime; /* We were application limited (idle) for a while. 
diff --git a/tools/testing/selftests/bpf/progs/bpf_iter_bpf_rhash_map.c b/tools/testing/selftests/bpf/progs/bpf_iter_bpf_rhash_map.c new file mode 100644 index 000000000000..86f6c0d5eadb --- /dev/null +++ b/tools/testing/selftests/bpf/progs/bpf_iter_bpf_rhash_map.c @@ -0,0 +1,34 @@ +// SPDX-License-Identifier: GPL-2.0 +/* Copyright (c) 2026 Meta Platforms, Inc. and affiliates. */ +#include <vmlinux.h> +#include <bpf/bpf_helpers.h> + +char _license[] SEC("license") = "GPL"; + +struct { + __uint(type, BPF_MAP_TYPE_RHASH); + __uint(map_flags, BPF_F_NO_PREALLOC); + __uint(max_entries, 64); + __type(key, __u32); + __type(value, __u64); +} rhashmap SEC(".maps"); + +__u32 key_sum = 0; +__u64 val_sum = 0; +__u32 elem_count = 0; +__u32 err = 0; + +SEC("iter/bpf_map_elem") +int dump_bpf_rhash_map(struct bpf_iter__bpf_map_elem *ctx) +{ + __u32 *key = ctx->key; + __u64 *val = ctx->value; + + if (!key || !val) + return 0; + + key_sum += *key; + val_sum += *val; + elem_count++; + return 0; +} diff --git a/tools/testing/selftests/bpf/progs/bpf_iter_task_vmas.c b/tools/testing/selftests/bpf/progs/bpf_iter_task_vmas.c index d64ba7ddaed5..d7fb561ed4fb 100644 --- a/tools/testing/selftests/bpf/progs/bpf_iter_task_vmas.c +++ b/tools/testing/selftests/bpf/progs/bpf_iter_task_vmas.c @@ -52,7 +52,7 @@ SEC("iter/task_vma") int proc_maps(struct bpf_iter__task_vma *ctx) bpf_d_path(&file->f_path, d_path_buf, D_PATH_BUF_SIZE); BPF_SEQ_PRINTF(seq, "%08llx ", vma->vm_pgoff << 12); - BPF_SEQ_PRINTF(seq, "%02x:%02x %u", MAJOR(dev), MINOR(dev), + BPF_SEQ_PRINTF(seq, "%02x:%02x %llu", MAJOR(dev), MINOR(dev), file->f_inode->i_ino); BPF_SEQ_PRINTF(seq, "\t%s\n", d_path_buf); } else { diff --git a/tools/testing/selftests/bpf/progs/bpf_misc.h b/tools/testing/selftests/bpf/progs/bpf_misc.h index a0d7b15a24b1..b0c441384f20 100644 --- a/tools/testing/selftests/bpf/progs/bpf_misc.h +++ b/tools/testing/selftests/bpf/progs/bpf_misc.h @@ -152,11 +152,13 @@ #define __auxiliary __test_tag("test_auxiliary") 
#define __auxiliary_unpriv __test_tag("test_auxiliary_unpriv") #define __btf_path(path) __test_tag("test_btf_path=" path) +#define __btf_func_path(path) __test_tag("test_btf_func_path=" path) #define __arch(arch) __test_tag("test_arch=" arch) #define __arch_x86_64 __arch("X86_64") #define __arch_arm64 __arch("ARM64") #define __arch_riscv64 __arch("RISCV64") #define __arch_s390x __arch("s390x") +#define __arch_loongarch __arch("LOONGARCH") #define __caps_unpriv(caps) __test_tag("test_caps_unpriv=" EXPAND_QUOTE(caps)) #define __load_if_JITed() __test_tag("load_mode=jited") #define __load_if_no_JITed() __test_tag("load_mode=no_jited") diff --git a/tools/testing/selftests/bpf/progs/bpf_nop_bench.c b/tools/testing/selftests/bpf/progs/bpf_nop_bench.c new file mode 100644 index 000000000000..01ed284c1bb3 --- /dev/null +++ b/tools/testing/selftests/bpf/progs/bpf_nop_bench.c @@ -0,0 +1,14 @@ +// SPDX-License-Identifier: GPL-2.0 +/* Copyright (c) 2026 Meta Platforms, Inc. and affiliates. */ + +#include <linux/bpf.h> +#include <bpf/bpf_helpers.h> +#include "bench_bpf_timing.bpf.h" + +SEC("syscall") +int bench_nop(void *ctx) +{ + return BENCH_BPF_LOOP(0, ({})); +} + +char _license[] SEC("license") = "GPL"; diff --git a/tools/testing/selftests/bpf/progs/bpf_qdisc_dynptr_use_after_invalidate_clone.c b/tools/testing/selftests/bpf/progs/bpf_qdisc_dynptr_use_after_invalidate_clone.c new file mode 100644 index 000000000000..ac626cfa2a98 --- /dev/null +++ b/tools/testing/selftests/bpf/progs/bpf_qdisc_dynptr_use_after_invalidate_clone.c @@ -0,0 +1,74 @@ +// SPDX-License-Identifier: GPL-2.0 + +#include <vmlinux.h> +#include "bpf_experimental.h" +#include "bpf_qdisc_common.h" +#include "bpf_misc.h" + +char _license[] SEC("license") = "GPL"; + +int proto; + +SEC("struct_ops") +__success +int BPF_PROG(dynptr_use_after_invalidate_clone, struct sk_buff *skb, struct Qdisc *sch, + struct bpf_sk_buff_ptr *to_free) +{ + struct bpf_dynptr ptr, ptr_clone; + struct ethhdr *hdr; + + 
bpf_dynptr_from_skb((struct __sk_buff *)skb, 0, &ptr); + + bpf_dynptr_clone(&ptr, &ptr_clone); + + hdr = bpf_dynptr_slice(&ptr_clone, 0, NULL, sizeof(*hdr)); + if (!hdr) { + bpf_qdisc_skb_drop(skb, to_free); + return NET_XMIT_DROP; + } + + *(int *)&ptr = 0; + + proto = hdr->h_proto; + + bpf_qdisc_skb_drop(skb, to_free); + + return NET_XMIT_DROP; +} + +SEC("struct_ops") +__auxiliary +struct sk_buff *BPF_PROG(bpf_qdisc_test_dequeue, struct Qdisc *sch) +{ + return NULL; +} + +SEC("struct_ops") +__auxiliary +int BPF_PROG(bpf_qdisc_test_init, struct Qdisc *sch, struct nlattr *opt, + struct netlink_ext_ack *extack) +{ + return 0; +} + +SEC("struct_ops") +__auxiliary +void BPF_PROG(bpf_qdisc_test_reset, struct Qdisc *sch) +{ +} + +SEC("struct_ops") +__auxiliary +void BPF_PROG(bpf_qdisc_test_destroy, struct Qdisc *sch) +{ +} + +SEC(".struct_ops") +struct Qdisc_ops test = { + .enqueue = (void *)dynptr_use_after_invalidate_clone, + .dequeue = (void *)bpf_qdisc_test_dequeue, + .init = (void *)bpf_qdisc_test_init, + .reset = (void *)bpf_qdisc_test_reset, + .destroy = (void *)bpf_qdisc_test_destroy, + .id = "bpf_qdisc_test", +}; diff --git a/tools/testing/selftests/bpf/progs/bpf_qdisc_fail__invalid_dynptr.c b/tools/testing/selftests/bpf/progs/bpf_qdisc_fail__invalid_dynptr.c new file mode 100644 index 000000000000..1d96f7987a3f --- /dev/null +++ b/tools/testing/selftests/bpf/progs/bpf_qdisc_fail__invalid_dynptr.c @@ -0,0 +1,68 @@ +// SPDX-License-Identifier: GPL-2.0 + +#include <vmlinux.h> +#include "bpf_experimental.h" +#include "bpf_qdisc_common.h" +#include "bpf_misc.h" + +char _license[] SEC("license") = "GPL"; + +int proto; + +SEC("struct_ops") +__failure __msg("Expected an initialized dynptr as R1") +int BPF_PROG(invalid_dynptr, struct sk_buff *skb, struct Qdisc *sch, + struct bpf_sk_buff_ptr *to_free) +{ + struct bpf_dynptr ptr; + struct ethhdr *hdr; + + bpf_dynptr_from_skb((struct __sk_buff *)skb, 0, &ptr); + + bpf_qdisc_skb_drop(skb, to_free); + + hdr = 
bpf_dynptr_slice(&ptr, 0, NULL, sizeof(*hdr)); + if (!hdr) + return NET_XMIT_DROP; + + proto = hdr->h_proto; + + return NET_XMIT_DROP; +} + +SEC("struct_ops") +__auxiliary +struct sk_buff *BPF_PROG(bpf_qdisc_test_dequeue, struct Qdisc *sch) +{ + return NULL; +} + +SEC("struct_ops") +__auxiliary +int BPF_PROG(bpf_qdisc_test_init, struct Qdisc *sch, struct nlattr *opt, + struct netlink_ext_ack *extack) +{ + return 0; +} + +SEC("struct_ops") +__auxiliary +void BPF_PROG(bpf_qdisc_test_reset, struct Qdisc *sch) +{ +} + +SEC("struct_ops") +__auxiliary +void BPF_PROG(bpf_qdisc_test_destroy, struct Qdisc *sch) +{ +} + +SEC(".struct_ops") +struct Qdisc_ops test = { + .enqueue = (void *)invalid_dynptr, + .dequeue = (void *)bpf_qdisc_test_dequeue, + .init = (void *)bpf_qdisc_test_init, + .reset = (void *)bpf_qdisc_test_reset, + .destroy = (void *)bpf_qdisc_test_destroy, + .id = "bpf_qdisc_test", +}; diff --git a/tools/testing/selftests/bpf/progs/bpf_qdisc_fail__invalid_dynptr_cross_frame.c b/tools/testing/selftests/bpf/progs/bpf_qdisc_fail__invalid_dynptr_cross_frame.c new file mode 100644 index 000000000000..2e23b8593af9 --- /dev/null +++ b/tools/testing/selftests/bpf/progs/bpf_qdisc_fail__invalid_dynptr_cross_frame.c @@ -0,0 +1,74 @@ +// SPDX-License-Identifier: GPL-2.0 + +#include <vmlinux.h> +#include "bpf_experimental.h" +#include "bpf_qdisc_common.h" +#include "bpf_misc.h" + +char _license[] SEC("license") = "GPL"; + +int proto; + +static __noinline int free_skb(struct sk_buff *skb) +{ + bpf_kfree_skb(skb); + return 0; +} + +SEC("struct_ops") +__failure __msg("invalid mem access 'scalar'") +int BPF_PROG(invalid_dynptr_cross_frame, struct sk_buff *skb, struct Qdisc *sch, + struct bpf_sk_buff_ptr *to_free) +{ + struct bpf_dynptr ptr; + struct ethhdr *hdr; + + bpf_dynptr_from_skb((struct __sk_buff *)skb, 0, &ptr); + + hdr = bpf_dynptr_slice(&ptr, 0, NULL, sizeof(*hdr)); + if (!hdr) + return NET_XMIT_DROP; + + free_skb(skb); + + proto = hdr->h_proto; + + return 
NET_XMIT_DROP; +} + +SEC("struct_ops") +__auxiliary +struct sk_buff *BPF_PROG(bpf_qdisc_test_dequeue, struct Qdisc *sch) +{ + return NULL; +} + +SEC("struct_ops") +__auxiliary +int BPF_PROG(bpf_qdisc_test_init, struct Qdisc *sch, struct nlattr *opt, + struct netlink_ext_ack *extack) +{ + return 0; +} + +SEC("struct_ops") +__auxiliary +void BPF_PROG(bpf_qdisc_test_reset, struct Qdisc *sch) +{ +} + +SEC("struct_ops") +__auxiliary +void BPF_PROG(bpf_qdisc_test_destroy, struct Qdisc *sch) +{ +} + +SEC(".struct_ops") +struct Qdisc_ops test = { + .enqueue = (void *)invalid_dynptr_cross_frame, + .dequeue = (void *)bpf_qdisc_test_dequeue, + .init = (void *)bpf_qdisc_test_init, + .reset = (void *)bpf_qdisc_test_reset, + .destroy = (void *)bpf_qdisc_test_destroy, + .id = "bpf_qdisc_test", +}; diff --git a/tools/testing/selftests/bpf/progs/bpf_qdisc_fail__invalid_dynptr_slice.c b/tools/testing/selftests/bpf/progs/bpf_qdisc_fail__invalid_dynptr_slice.c new file mode 100644 index 000000000000..731216c4e45a --- /dev/null +++ b/tools/testing/selftests/bpf/progs/bpf_qdisc_fail__invalid_dynptr_slice.c @@ -0,0 +1,70 @@ +// SPDX-License-Identifier: GPL-2.0 + +#include <vmlinux.h> +#include "bpf_experimental.h" +#include "bpf_qdisc_common.h" +#include "bpf_misc.h" + +char _license[] SEC("license") = "GPL"; + +int proto; + +SEC("struct_ops") +__failure __msg("invalid mem access 'scalar'") +int BPF_PROG(invalid_dynptr_slice, struct sk_buff *skb, struct Qdisc *sch, + struct bpf_sk_buff_ptr *to_free) +{ + struct bpf_dynptr ptr; + struct ethhdr *hdr; + + bpf_dynptr_from_skb((struct __sk_buff *)skb, 0, &ptr); + + hdr = bpf_dynptr_slice(&ptr, 0, NULL, sizeof(*hdr)); + if (!hdr) { + bpf_qdisc_skb_drop(skb, to_free); + return NET_XMIT_DROP; + } + + bpf_qdisc_skb_drop(skb, to_free); + + proto = hdr->h_proto; + + return NET_XMIT_DROP; +} + +SEC("struct_ops") +__auxiliary +struct sk_buff *BPF_PROG(bpf_qdisc_test_dequeue, struct Qdisc *sch) +{ + return NULL; +} + +SEC("struct_ops") +__auxiliary 
+int BPF_PROG(bpf_qdisc_test_init, struct Qdisc *sch, struct nlattr *opt, + struct netlink_ext_ack *extack) +{ + return 0; +} + +SEC("struct_ops") +__auxiliary +void BPF_PROG(bpf_qdisc_test_reset, struct Qdisc *sch) +{ +} + +SEC("struct_ops") +__auxiliary +void BPF_PROG(bpf_qdisc_test_destroy, struct Qdisc *sch) +{ +} + +SEC(".struct_ops") +struct Qdisc_ops test = { + .enqueue = (void *)invalid_dynptr_slice, + .dequeue = (void *)bpf_qdisc_test_dequeue, + .init = (void *)bpf_qdisc_test_init, + .reset = (void *)bpf_qdisc_test_reset, + .destroy = (void *)bpf_qdisc_test_destroy, + .id = "bpf_qdisc_test", +}; diff --git a/tools/testing/selftests/bpf/progs/bpf_qdisc_fq.c b/tools/testing/selftests/bpf/progs/bpf_qdisc_fq.c index 1a3233a275c7..8107f5934d2d 100644 --- a/tools/testing/selftests/bpf/progs/bpf_qdisc_fq.c +++ b/tools/testing/selftests/bpf/progs/bpf_qdisc_fq.c @@ -196,18 +196,13 @@ fq_flows_remove_front(struct bpf_list_head *head, struct bpf_spin_lock *lock, static bool fq_flows_is_empty(struct bpf_list_head *head, struct bpf_spin_lock *lock) { - struct bpf_list_node *node; + bool empty; bpf_spin_lock(lock); - node = bpf_list_pop_front(head); - if (node) { - bpf_list_push_front(head, node); - bpf_spin_unlock(lock); - return false; - } + empty = bpf_list_empty(head); bpf_spin_unlock(lock); - return true; + return empty; } /* flow->age is used to denote the state of the flow (not-detached, detached, throttled) diff --git a/tools/testing/selftests/bpf/progs/btf__stack_arg_precision.c b/tools/testing/selftests/bpf/progs/btf__stack_arg_precision.c new file mode 100644 index 000000000000..8d38aafe66a2 --- /dev/null +++ b/tools/testing/selftests/bpf/progs/btf__stack_arg_precision.c @@ -0,0 +1,24 @@ +// SPDX-License-Identifier: GPL-2.0 +/* Copyright (c) 2026 Meta Platforms, Inc. and affiliates. 
*/ +#include <vmlinux.h> +#include <bpf/bpf_helpers.h> +#include "../test_kmods/bpf_testmod_kfunc.h" + +#if (defined(__TARGET_ARCH_x86) || defined(__TARGET_ARCH_arm64)) && \ + defined(__BPF_FEATURE_STACK_ARGUMENT) + +long subprog_call_mem_kfunc(long a, long b, long c, long d, long e, long size) +{ + char buf[8] = {}; + + return bpf_kfunc_call_stack_arg_mem(a, b, c, d, e, buf, size); +} + +#else + +long subprog_call_mem_kfunc(void) +{ + return 0; +} + +#endif diff --git a/tools/testing/selftests/bpf/progs/btf__verifier_stack_arg_order.c b/tools/testing/selftests/bpf/progs/btf__verifier_stack_arg_order.c new file mode 100644 index 000000000000..99bc115f8380 --- /dev/null +++ b/tools/testing/selftests/bpf/progs/btf__verifier_stack_arg_order.c @@ -0,0 +1,49 @@ +// SPDX-License-Identifier: GPL-2.0 +/* Copyright (c) 2026 Meta Platforms, Inc. and affiliates. */ +#include <vmlinux.h> +#include <bpf/bpf_helpers.h> + +#if (defined(__TARGET_ARCH_x86) || defined(__TARGET_ARCH_arm64)) && \ + defined(__BPF_FEATURE_STACK_ARGUMENT) + +int subprog_bad_order_6args(int a, int b, int c, int d, int e, int f) +{ + return a + b + c + d + e + f; +} + +int subprog_call_before_load_6args(int a, int b, int c, int d, int e, int f) +{ + return a + b + c + d + e + f; +} + +int subprog_pruning_call_before_load_6args(int a, int b, int c, int d, int e, int f) +{ + return a + b + c + d + e + f; +} + +void subprog_bad_ptr_7args(long *a, int b, int c, int d, int e, int f, int g) +{ +} + +#else + +int subprog_bad_order_6args(void) +{ + return 0; +} + +int subprog_call_before_load_6args(void) +{ + return 0; +} + +int subprog_pruning_call_before_load_6args(void) +{ + return 0; +} + +void subprog_bad_ptr_7args(void) +{ +} + +#endif diff --git a/tools/testing/selftests/bpf/progs/cgrp_kfunc_failure.c b/tools/testing/selftests/bpf/progs/cgrp_kfunc_failure.c index 9fe9c4a4e8f6..d0d65d6d450c 100644 --- a/tools/testing/selftests/bpf/progs/cgrp_kfunc_failure.c +++ 
b/tools/testing/selftests/bpf/progs/cgrp_kfunc_failure.c @@ -29,7 +29,7 @@ static struct __cgrps_kfunc_map_value *insert_lookup_cgrp(struct cgroup *cgrp) } SEC("tp_btf/cgroup_mkdir") -__failure __msg("Possibly NULL pointer passed to trusted arg0") +__failure __msg("Possibly NULL pointer passed to trusted R1") int BPF_PROG(cgrp_kfunc_acquire_untrusted, struct cgroup *cgrp, const char *path) { struct cgroup *acquired; @@ -48,7 +48,7 @@ int BPF_PROG(cgrp_kfunc_acquire_untrusted, struct cgroup *cgrp, const char *path } SEC("tp_btf/cgroup_mkdir") -__failure __msg("Possibly NULL pointer passed to trusted arg0") +__failure __msg("Possibly NULL pointer passed to trusted R1") int BPF_PROG(cgrp_kfunc_acquire_no_null_check, struct cgroup *cgrp, const char *path) { struct cgroup *acquired; @@ -64,7 +64,7 @@ int BPF_PROG(cgrp_kfunc_acquire_no_null_check, struct cgroup *cgrp, const char * } SEC("tp_btf/cgroup_mkdir") -__failure __msg("arg#0 pointer type STRUCT cgroup must point") +__failure __msg("R1 pointer type STRUCT cgroup must point") int BPF_PROG(cgrp_kfunc_acquire_fp, struct cgroup *cgrp, const char *path) { struct cgroup *acquired, *stack_cgrp = (struct cgroup *)&path; @@ -106,7 +106,7 @@ int BPF_PROG(cgrp_kfunc_acquire_trusted_walked, struct cgroup *cgrp, const char } SEC("tp_btf/cgroup_mkdir") -__failure __msg("Possibly NULL pointer passed to trusted arg0") +__failure __msg("Possibly NULL pointer passed to trusted R1") int BPF_PROG(cgrp_kfunc_acquire_null, struct cgroup *cgrp, const char *path) { struct cgroup *acquired; @@ -154,7 +154,7 @@ int BPF_PROG(cgrp_kfunc_xchg_unreleased, struct cgroup *cgrp, const char *path) } SEC("tp_btf/cgroup_mkdir") -__failure __msg("must be referenced or trusted") +__failure __msg("release kfunc bpf_cgroup_release expects referenced PTR_TO_BTF_ID passed to R1") int BPF_PROG(cgrp_kfunc_rcu_get_release, struct cgroup *cgrp, const char *path) { struct cgroup *kptr; @@ -175,7 +175,7 @@ int BPF_PROG(cgrp_kfunc_rcu_get_release, struct cgroup 
*cgrp, const char *path) } SEC("tp_btf/cgroup_mkdir") -__failure __msg("Possibly NULL pointer passed to trusted arg0") +__failure __msg("Possibly NULL pointer passed to trusted R1") int BPF_PROG(cgrp_kfunc_release_untrusted, struct cgroup *cgrp, const char *path) { struct __cgrps_kfunc_map_value *v; @@ -191,7 +191,7 @@ int BPF_PROG(cgrp_kfunc_release_untrusted, struct cgroup *cgrp, const char *path } SEC("tp_btf/cgroup_mkdir") -__failure __msg("arg#0 pointer type STRUCT cgroup must point") +__failure __msg("release kfunc bpf_cgroup_release expects referenced PTR_TO_BTF_ID passed to R1") int BPF_PROG(cgrp_kfunc_release_fp, struct cgroup *cgrp, const char *path) { struct cgroup *acquired = (struct cgroup *)&path; @@ -203,7 +203,7 @@ int BPF_PROG(cgrp_kfunc_release_fp, struct cgroup *cgrp, const char *path) } SEC("tp_btf/cgroup_mkdir") -__failure __msg("Possibly NULL pointer passed to trusted arg0") +__failure __msg("Possibly NULL pointer passed to trusted R1") int BPF_PROG(cgrp_kfunc_release_null, struct cgroup *cgrp, const char *path) { struct __cgrps_kfunc_map_value local, *v; @@ -237,7 +237,7 @@ int BPF_PROG(cgrp_kfunc_release_null, struct cgroup *cgrp, const char *path) } SEC("tp_btf/cgroup_mkdir") -__failure __msg("release kernel function bpf_cgroup_release expects") +__failure __msg("release kfunc bpf_cgroup_release expects referenced PTR_TO_BTF_ID passed to R1") int BPF_PROG(cgrp_kfunc_release_unacquired, struct cgroup *cgrp, const char *path) { /* Cannot release trusted cgroup pointer which was not acquired. 
*/ diff --git a/tools/testing/selftests/bpf/progs/cgrp_ls_sleepable.c b/tools/testing/selftests/bpf/progs/cgrp_ls_sleepable.c index a2de95f85648..37bd6b03ba01 100644 --- a/tools/testing/selftests/bpf/progs/cgrp_ls_sleepable.c +++ b/tools/testing/selftests/bpf/progs/cgrp_ls_sleepable.c @@ -4,6 +4,7 @@ #include <bpf/bpf_helpers.h> #include <bpf/bpf_tracing.h> #include "bpf_misc.h" +#include "err.h" char _license[] SEC("license") = "GPL"; @@ -16,6 +17,7 @@ struct { __s32 target_pid; __u64 cgroup_id; +long update_err; int target_hid; bool is_cgroup1; @@ -123,3 +125,19 @@ int yes_rcu_lock(void *ctx) bpf_rcu_read_unlock(); return 0; } + +SEC("fexit/bpf_local_storage_update") +int BPF_PROG(fexit_update, void *owner, struct bpf_local_storage_map *smap, + void *value, u64 map_flags, bool swap_uptrs, + struct bpf_local_storage_data *ret) +{ + struct task_struct *task = bpf_get_current_task_btf(); + + if (task->pid != target_pid) + return 0; + + if (IS_ERR_VALUE(ret)) + update_err = PTR_ERR(ret); + + return 0; +} diff --git a/tools/testing/selftests/bpf/progs/compute_live_registers.c b/tools/testing/selftests/bpf/progs/compute_live_registers.c index f05e120f3450..d055fc7b3b95 100644 --- a/tools/testing/selftests/bpf/progs/compute_live_registers.c +++ b/tools/testing/selftests/bpf/progs/compute_live_registers.c @@ -3,7 +3,7 @@ #include <linux/bpf.h> #include <bpf/bpf_helpers.h> #include "../../../include/linux/filter.h" -#include "bpf_arena_common.h" +#include <bpf_arena_common.h> #include "bpf_misc.h" struct { diff --git a/tools/testing/selftests/bpf/progs/cpumask_failure.c b/tools/testing/selftests/bpf/progs/cpumask_failure.c index 61c32e91e8c3..4c45346fe6f7 100644 --- a/tools/testing/selftests/bpf/progs/cpumask_failure.c +++ b/tools/testing/selftests/bpf/progs/cpumask_failure.c @@ -45,7 +45,7 @@ int BPF_PROG(test_alloc_no_release, struct task_struct *task, u64 clone_flags) } SEC("tp_btf/task_newtask") -__failure __msg("NULL pointer passed to trusted arg0") +__failure 
__msg("NULL pointer passed to trusted R1") int BPF_PROG(test_alloc_double_release, struct task_struct *task, u64 clone_flags) { struct bpf_cpumask *cpumask; @@ -73,7 +73,7 @@ int BPF_PROG(test_acquire_wrong_cpumask, struct task_struct *task, u64 clone_fla } SEC("tp_btf/task_newtask") -__failure __msg("bpf_cpumask_set_cpu args#1 expected pointer to STRUCT bpf_cpumask") +__failure __msg("bpf_cpumask_set_cpu R2 expected pointer to STRUCT bpf_cpumask") int BPF_PROG(test_mutate_cpumask, struct task_struct *task, u64 clone_flags) { /* Can't set the CPU of a non-struct bpf_cpumask. */ @@ -107,7 +107,7 @@ int BPF_PROG(test_insert_remove_no_release, struct task_struct *task, u64 clone_ } SEC("tp_btf/task_newtask") -__failure __msg("NULL pointer passed to trusted arg0") +__failure __msg("NULL pointer passed to trusted R1") int BPF_PROG(test_cpumask_null, struct task_struct *task, u64 clone_flags) { /* NULL passed to kfunc. */ @@ -151,7 +151,7 @@ int BPF_PROG(test_global_mask_out_of_rcu, struct task_struct *task, u64 clone_fl } SEC("tp_btf/task_newtask") -__failure __msg("NULL pointer passed to trusted arg1") +__failure __msg("NULL pointer passed to trusted R2") int BPF_PROG(test_global_mask_no_null_check, struct task_struct *task, u64 clone_flags) { struct bpf_cpumask *local, *prev; @@ -179,7 +179,7 @@ int BPF_PROG(test_global_mask_no_null_check, struct task_struct *task, u64 clone } SEC("tp_btf/task_newtask") -__failure __msg("Possibly NULL pointer passed to helper arg2") +__failure __msg("Possibly NULL pointer passed to helper R2") int BPF_PROG(test_global_mask_rcu_no_null_check, struct task_struct *task, u64 clone_flags) { struct bpf_cpumask *prev, *curr; diff --git a/tools/testing/selftests/bpf/progs/cpumask_success.c b/tools/testing/selftests/bpf/progs/cpumask_success.c index 0e04c31b91c0..774706e7b058 100644 --- a/tools/testing/selftests/bpf/progs/cpumask_success.c +++ b/tools/testing/selftests/bpf/progs/cpumask_success.c @@ -866,7 +866,7 @@ int BPF_PROG(test_populate, 
struct task_struct *task, u64 clone_flags) * access NR_CPUS, the upper bound for nr_cpus, so we infer * it from the size of cpumask_t. */ - if (nr_cpus < 0 || nr_cpus >= CPUMASK_TEST_MASKLEN * 8) { + if (nr_cpus < 0 || nr_cpus > CPUMASK_TEST_MASKLEN * 8) { err = 3; goto out; } diff --git a/tools/testing/selftests/bpf/progs/crypto_bench.c b/tools/testing/selftests/bpf/progs/crypto_bench.c index 4ac956b26240..4c0a09aa1e6c 100644 --- a/tools/testing/selftests/bpf/progs/crypto_bench.c +++ b/tools/testing/selftests/bpf/progs/crypto_bench.c @@ -11,10 +11,19 @@ #include "crypto_common.h" const volatile unsigned int len = 16; -char cipher[128] = {}; +/* + * cipher[] and key[] are 8-byte aligned and 'params' is kept off the stack to + * work around an LLVM code generation bug. clang lowers the memcpy() of these + * byte-aligned globals into a per-byte load/store sequence staged on the stack, + * and additionally materializes the on-stack 'struct bpf_crypto_params' twice. + * Both blow the 512-byte BPF stack limit. Aligning the sources lets clang copy + * word-wise, and a global 'params' removes the large object from the stack. 
+ */ +char cipher[128] __attribute__((aligned(8))) = {}; u32 key_len, authsize; char dst[256] = {}; -u8 key[256] = {}; +u8 key[256] __attribute__((aligned(8))) = {}; +static struct bpf_crypto_params params; long hits = 0; int status; @@ -22,11 +31,6 @@ SEC("syscall") int crypto_setup(void *args) { struct bpf_crypto_ctx *cctx; - struct bpf_crypto_params params = { - .type = "skcipher", - .key_len = key_len, - .authsize = authsize, - }; int err = 0; status = 0; @@ -36,6 +40,9 @@ int crypto_setup(void *args) return 0; } + __builtin_memcpy(&params.type, "skcipher", sizeof("skcipher")); + params.key_len = key_len; + params.authsize = authsize; __builtin_memcpy(&params.algo, cipher, sizeof(cipher)); __builtin_memcpy(&params.key, key, sizeof(key)); cctx = bpf_crypto_ctx_create(&params, sizeof(params), &err); diff --git a/tools/testing/selftests/bpf/progs/crypto_sanity.c b/tools/testing/selftests/bpf/progs/crypto_sanity.c index dfd8a258f14a..e81f5ac3b1ae 100644 --- a/tools/testing/selftests/bpf/progs/crypto_sanity.c +++ b/tools/testing/selftests/bpf/progs/crypto_sanity.c @@ -10,11 +10,20 @@ #include "bpf_kfuncs.h" #include "crypto_common.h" -unsigned char key[256] = {}; +/* + * key[] and algo[] are 8-byte aligned and 'params' is kept off the stack to + * work around an LLVM code generation bug. clang lowers the memcpy() of these + * byte-aligned globals into a per-byte load/store sequence staged on the stack, + * and additionally materializes the on-stack 'struct bpf_crypto_params' twice. + * Both blow the 512-byte BPF stack limit. Aligning the sources lets clang copy + * word-wise, and a global 'params' removes the large object from the stack. 
+ */ +unsigned char key[256] __attribute__((aligned(8))) = {}; u16 udp_test_port = 7777; u32 authsize, key_len; -char algo[128] = {}; +char algo[128] __attribute__((aligned(8))) = {}; char dst[16] = {}, dst_bad[8] = {}; +static struct bpf_crypto_params params; int status; static int skb_dynptr_validate(struct __sk_buff *skb, struct bpf_dynptr *psrc) @@ -53,11 +62,6 @@ static int skb_dynptr_validate(struct __sk_buff *skb, struct bpf_dynptr *psrc) SEC("syscall") int skb_crypto_setup(void *ctx) { - struct bpf_crypto_params params = { - .type = "skcipher", - .key_len = key_len, - .authsize = authsize, - }; struct bpf_crypto_ctx *cctx; int err; @@ -67,6 +71,9 @@ int skb_crypto_setup(void *ctx) return 0; } + __builtin_memcpy(&params.type, "skcipher", sizeof("skcipher")); + params.key_len = key_len; + params.authsize = authsize; __builtin_memcpy(&params.algo, algo, sizeof(algo)); __builtin_memcpy(&params.key, key, sizeof(key)); diff --git a/tools/testing/selftests/bpf/progs/dynptr_fail.c b/tools/testing/selftests/bpf/progs/dynptr_fail.c index b62773ce5219..344fb2aa0813 100644 --- a/tools/testing/selftests/bpf/progs/dynptr_fail.c +++ b/tools/testing/selftests/bpf/progs/dynptr_fail.c @@ -78,7 +78,7 @@ static int get_map_val_dynptr(struct bpf_dynptr *ptr) * bpf_ringbuf_submit/discard_dynptr call */ SEC("?raw_tp") -__failure __msg("Unreleased reference id=2") +__failure __msg("Unreleased reference id=1") int ringbuf_missing_release1(void *ctx) { struct bpf_dynptr ptr = {}; @@ -91,7 +91,7 @@ int ringbuf_missing_release1(void *ctx) } SEC("?raw_tp") -__failure __msg("Unreleased reference id=4") +__failure __msg("Unreleased reference id=3") int ringbuf_missing_release2(void *ctx) { struct bpf_dynptr ptr1, ptr2; @@ -136,7 +136,7 @@ int ringbuf_missing_release_callback(void *ctx) /* Can't call bpf_ringbuf_submit/discard_dynptr on a non-initialized dynptr */ SEC("?raw_tp") -__failure __msg("arg 1 is an unacquired reference") +__failure __msg("Expected an initialized dynptr as R1") int 
ringbuf_release_uninit_dynptr(void *ctx) { struct bpf_dynptr ptr; @@ -149,7 +149,7 @@ int ringbuf_release_uninit_dynptr(void *ctx) /* A dynptr can't be used after it has been invalidated */ SEC("?raw_tp") -__failure __msg("Expected an initialized dynptr as arg #2") +__failure __msg("Expected an initialized dynptr as R3") int use_after_invalid(void *ctx) { struct bpf_dynptr ptr; @@ -448,7 +448,7 @@ int invalid_helper2(void *ctx) /* A bpf_dynptr is invalidated if it's been written into */ SEC("?raw_tp") -__failure __msg("Expected an initialized dynptr as arg #0") +__failure __msg("Expected an initialized dynptr as R1") int invalid_write1(void *ctx) { struct bpf_dynptr ptr; @@ -650,7 +650,7 @@ int invalid_offset(void *ctx) /* Can't release a dynptr twice */ SEC("?raw_tp") -__failure __msg("arg 1 is an unacquired reference") +__failure __msg("Expected an initialized dynptr as R1") int release_twice(void *ctx) { struct bpf_dynptr ptr; @@ -677,7 +677,7 @@ static int release_twice_callback_fn(__u32 index, void *data) * within a callback function, fails */ SEC("?raw_tp") -__failure __msg("arg 1 is an unacquired reference") +__failure __msg("Expected an initialized dynptr as R1") int release_twice_callback(void *ctx) { struct bpf_dynptr ptr; @@ -705,6 +705,48 @@ int dynptr_from_mem_invalid_api(void *ctx) return 0; } +/* Cannot create dynptr from dynptr data */ +SEC("?raw_tp") +__failure __msg("Unsupported reg type mem for bpf_dynptr_from_mem data") +int dynptr_from_dynptr_data(void *ctx) +{ + struct bpf_dynptr ptr, ptr2; + __u8 *data; + + if (get_map_val_dynptr(&ptr)) + return 0; + + data = bpf_dynptr_data(&ptr, 0, sizeof(__u32)); + if (!data) + return 0; + + /* this should fail */ + bpf_dynptr_from_mem(data, sizeof(__u32), 0, &ptr2); + + return 0; +} + +/* Cannot create dynptr from dynptr slice */ +SEC("?tc") +__failure __msg("Unsupported reg type mem for bpf_dynptr_from_mem data") +int dynptr_from_dynptr_slice(struct __sk_buff *skb) +{ + struct bpf_dynptr ptr, ptr2; + 
struct ethhdr *hdr; + char buffer[sizeof(*hdr)] = {}; + + bpf_dynptr_from_skb(skb, 0, &ptr); + + hdr = bpf_dynptr_slice_rdwr(&ptr, 0, buffer, sizeof(buffer)); + if (!hdr) + return SK_DROP; + + /* this should fail */ + bpf_dynptr_from_mem(hdr, sizeof(*hdr), 0, &ptr2); + + return SK_PASS; +} + SEC("?tc") __failure __msg("cannot overwrite referenced dynptr") __log_level(2) int dynptr_pruning_overwrite(struct __sk_buff *ctx) @@ -1642,7 +1684,7 @@ int invalid_slice_rdwr_rdonly(struct __sk_buff *skb) /* bpf_dynptr_adjust can only be called on initialized dynptrs */ SEC("?raw_tp") -__failure __msg("Expected an initialized dynptr as arg #0") +__failure __msg("Expected an initialized dynptr as R1") int dynptr_adjust_invalid(void *ctx) { struct bpf_dynptr ptr = {}; @@ -1655,7 +1697,7 @@ int dynptr_adjust_invalid(void *ctx) /* bpf_dynptr_is_null can only be called on initialized dynptrs */ SEC("?raw_tp") -__failure __msg("Expected an initialized dynptr as arg #0") +__failure __msg("Expected an initialized dynptr as R1") int dynptr_is_null_invalid(void *ctx) { struct bpf_dynptr ptr = {}; @@ -1668,7 +1710,7 @@ int dynptr_is_null_invalid(void *ctx) /* bpf_dynptr_is_rdonly can only be called on initialized dynptrs */ SEC("?raw_tp") -__failure __msg("Expected an initialized dynptr as arg #0") +__failure __msg("Expected an initialized dynptr as R1") int dynptr_is_rdonly_invalid(void *ctx) { struct bpf_dynptr ptr = {}; @@ -1681,7 +1723,7 @@ int dynptr_is_rdonly_invalid(void *ctx) /* bpf_dynptr_size can only be called on initialized dynptrs */ SEC("?raw_tp") -__failure __msg("Expected an initialized dynptr as arg #0") +__failure __msg("Expected an initialized dynptr as R1") int dynptr_size_invalid(void *ctx) { struct bpf_dynptr ptr = {}; @@ -1694,7 +1736,7 @@ int dynptr_size_invalid(void *ctx) /* Only initialized dynptrs can be cloned */ SEC("?raw_tp") -__failure __msg("Expected an initialized dynptr as arg #0") +__failure __msg("Expected an initialized dynptr as R1") int 
clone_invalid1(void *ctx) { struct bpf_dynptr ptr1 = {}; @@ -1728,7 +1770,7 @@ int clone_invalid2(struct xdp_md *xdp) /* Invalidating a dynptr should invalidate its clones */ SEC("?raw_tp") -__failure __msg("Expected an initialized dynptr as arg #2") +__failure __msg("Expected an initialized dynptr as R3") int clone_invalidate1(void *ctx) { struct bpf_dynptr clone; @@ -1749,7 +1791,7 @@ int clone_invalidate1(void *ctx) /* Invalidating a dynptr should invalidate its parent */ SEC("?raw_tp") -__failure __msg("Expected an initialized dynptr as arg #2") +__failure __msg("Expected an initialized dynptr as R3") int clone_invalidate2(void *ctx) { struct bpf_dynptr ptr; @@ -1770,7 +1812,7 @@ int clone_invalidate2(void *ctx) /* Invalidating a dynptr should invalidate its siblings */ SEC("?raw_tp") -__failure __msg("Expected an initialized dynptr as arg #2") +__failure __msg("Expected an initialized dynptr as R3") int clone_invalidate3(void *ctx) { struct bpf_dynptr ptr; @@ -1981,7 +2023,7 @@ __noinline long global_call_bpf_dynptr(const struct bpf_dynptr *dynptr) } SEC("?raw_tp") -__failure __msg("arg#0 expected pointer to stack or const struct bpf_dynptr") +__failure __msg("R1 expected pointer to stack or const struct bpf_dynptr") int test_dynptr_reg_type(void *ctx) { struct task_struct *current = NULL; diff --git a/tools/testing/selftests/bpf/progs/dynptr_success.c b/tools/testing/selftests/bpf/progs/dynptr_success.c index e0d672d93adf..e0745b6e467e 100644 --- a/tools/testing/selftests/bpf/progs/dynptr_success.c +++ b/tools/testing/selftests/bpf/progs/dynptr_success.c @@ -914,7 +914,7 @@ void *user_ptr; char expected_str[384]; __u32 test_len[7] = {0/* placeholder */, 0, 1, 2, 255, 256, 257}; -typedef int (*bpf_read_dynptr_fn_t)(struct bpf_dynptr *dptr, u64 off, +typedef int (*bpf_read_dynptr_fn_t)(const struct bpf_dynptr *dptr, u64 off, u64 size, const void *unsafe_ptr); /* Returns the offset just before the end of the maximum sized xdp fragment. 
@@ -1106,7 +1106,7 @@ int test_copy_from_user_str_dynptr(void *ctx) return 0; } -static int bpf_copy_data_from_user_task(struct bpf_dynptr *dptr, u64 off, +static int bpf_copy_data_from_user_task(const struct bpf_dynptr *dptr, u64 off, u64 size, const void *unsafe_ptr) { struct task_struct *task = bpf_get_current_task_btf(); @@ -1114,7 +1114,7 @@ static int bpf_copy_data_from_user_task(struct bpf_dynptr *dptr, u64 off, return bpf_copy_from_user_task_dynptr(dptr, off, size, unsafe_ptr, task); } -static int bpf_copy_data_from_user_task_str(struct bpf_dynptr *dptr, u64 off, +static int bpf_copy_data_from_user_task_str(const struct bpf_dynptr *dptr, u64 off, u64 size, const void *unsafe_ptr) { struct task_struct *task = bpf_get_current_task_btf(); diff --git a/tools/testing/selftests/bpf/progs/exceptions.c b/tools/testing/selftests/bpf/progs/exceptions.c index 4206f59d7b86..c8d716fbd419 100644 --- a/tools/testing/selftests/bpf/progs/exceptions.c +++ b/tools/testing/selftests/bpf/progs/exceptions.c @@ -379,4 +379,118 @@ int exception_bad_assert_range_with(struct __sk_buff *ctx) return 1; } +#if (defined(__TARGET_ARCH_x86) || defined(__TARGET_ARCH_arm64)) \ + && defined(__BPF_FEATURE_STACK_ARGUMENT) + +const volatile bool has_stack_arg = true; + +long arg1 = 1, arg2 = 2, arg3 = 3, arg4 = 4, arg5 = 5; +long arg6 = 6, arg7 = 7, arg8 = 8, arg9 = 9, arg10 = 10; + +__noinline static long throwing_many_args(long a, long b, long c, long d, + long e, long f, long g, long h, + long i, long j) +{ + bpf_throw(a + b + c + d + e + f + g + h + i + j); + return 0; +} + +__noinline int exception_cb_sa(u64 cookie) +{ + return cookie + 1; +} + +SEC("tc") +__exception_cb(exception_cb_sa) +int exception_throw_stack_arg(struct __sk_buff *ctx) +{ + throwing_many_args(arg1, arg2, arg3, arg4, arg5, + arg6, arg7, arg8, arg9, arg10); + return 0; +} + +__noinline static long no_throw_many_args(long a, long b, long c, long d, + long e, long f, long g, long h, + long i, long j) +{ + return a + b + c 
+ d + e + f + g + h + i + j; +} + +SEC("tc") +__exception_cb(exception_cb_sa) +int exception_throw_after_stack_arg(struct __sk_buff *ctx) +{ + long ret; + + ret = no_throw_many_args(arg1, arg2, arg3, arg4, arg5, + arg6, arg7, arg8, arg9, arg10); + if (ret > 0) + bpf_throw(ret); + return 0; +} + +__noinline static long subprog_throw_sa(long val) +{ + throwing_many_args(val, val + 1, val + 2, val + 3, val + 4, + val + 5, val + 6, val + 7, val + 8, val + 9); + return 0; +} + +SEC("tc") +__exception_cb(exception_cb_sa) +int exception_throw_subprog_stack_arg(struct __sk_buff *ctx) +{ + subprog_throw_sa(arg1); + return 0; +} + +__noinline static long subprog_throw_after_sa(long val) +{ + long ret; + + ret = no_throw_many_args(val, val + 1, val + 2, val + 3, val + 4, + val + 5, val + 6, val + 7, val + 8, val + 9); + if (ret > 0) + bpf_throw(ret); + return 0; +} + +SEC("tc") +__exception_cb(exception_cb_sa) +int exception_throw_subprog_after_stack_arg(struct __sk_buff *ctx) +{ + subprog_throw_after_sa(arg1); + return 0; +} + +#else + +const volatile bool has_stack_arg = false; + +SEC("tc") +int exception_throw_stack_arg(struct __sk_buff *ctx) +{ + return 0; +} + +SEC("tc") +int exception_throw_after_stack_arg(struct __sk_buff *ctx) +{ + return 0; +} + +SEC("tc") +int exception_throw_subprog_stack_arg(struct __sk_buff *ctx) +{ + return 0; +} + +SEC("tc") +int exception_throw_subprog_after_stack_arg(struct __sk_buff *ctx) +{ + return 0; +} + +#endif + char _license[] SEC("license") = "GPL"; diff --git a/tools/testing/selftests/bpf/progs/exceptions_fail.c b/tools/testing/selftests/bpf/progs/exceptions_fail.c index 051e2b6f2694..ac44d60e5066 100644 --- a/tools/testing/selftests/bpf/progs/exceptions_fail.c +++ b/tools/testing/selftests/bpf/progs/exceptions_fail.c @@ -208,6 +208,28 @@ int reject_with_reference(void *ctx) return 0; } +__noinline int global_subprog_may_throw(struct __sk_buff *ctx) +{ + if (ctx->len) + bpf_throw(0); + return 0; +} + +SEC("?tc") +__failure 
__msg("Unreleased reference") +int reject_global_subprog_throw_with_reference(struct __sk_buff *ctx) +{ + struct foo *f; + + f = bpf_obj_new(typeof(*f)); + if (!f) + return 0; + if (ctx->protocol) + global_subprog_may_throw(ctx); + bpf_obj_drop(f); + return 0; +} + __noinline static int subprog_ref(struct __sk_buff *ctx) { struct foo *f; diff --git a/tools/testing/selftests/bpf/progs/fexit_bpf2bpf.c b/tools/testing/selftests/bpf/progs/fexit_bpf2bpf.c index 983b7c233382..f4bbf87b82dd 100644 --- a/tools/testing/selftests/bpf/progs/fexit_bpf2bpf.c +++ b/tools/testing/selftests/bpf/progs/fexit_bpf2bpf.c @@ -53,14 +53,23 @@ int BPF_PROG(test_subprog1, struct sk_buff *skb, int ret) * r0 = *(u32 *)(r1 + 0) * w0 <<= 1 * exit - * In such case the verifier falls back to conservative and + * Before llvm23, in such case the verifier falls back to conservative and * tracing program can access arguments and return value as u64 - * instead of accurate types. + * instead of accurate types. With llvm23, the true signature + * int test_pkt_access_subprog2(volatile struct __sk_buff *skb) + * is available in btf. 
*/ +#if __clang_major__ >= 23 +struct args_subprog2 { + __u64 args[1]; + __u64 ret; +}; +#else struct args_subprog2 { __u64 args[5]; __u64 ret; }; +#endif __u64 test_result_subprog2 = 0; SEC("fexit/test_pkt_access_subprog2") int test_subprog2(struct args_subprog2 *ctx) diff --git a/tools/testing/selftests/bpf/progs/file_reader.c b/tools/testing/selftests/bpf/progs/file_reader.c index 462712ff3b8a..aa2c05cce2b3 100644 --- a/tools/testing/selftests/bpf/progs/file_reader.c +++ b/tools/testing/selftests/bpf/progs/file_reader.c @@ -50,7 +50,7 @@ int on_open_expect_fault(void *c) goto out; local_err = bpf_dynptr_read(tmp_buf, user_buf_sz, &dynptr, user_buf_sz, 0); - if (local_err == -EFAULT) { /* Expect page fault */ + if (local_err == -EFAULT || local_err == 0) { /* Expect page fault or success */ local_err = 0; run_success = 1; } diff --git a/tools/testing/selftests/bpf/progs/file_reader_fail.c b/tools/testing/selftests/bpf/progs/file_reader_fail.c index 32fe28ed2439..3bb9e2612f8f 100644 --- a/tools/testing/selftests/bpf/progs/file_reader_fail.c +++ b/tools/testing/selftests/bpf/progs/file_reader_fail.c @@ -30,7 +30,7 @@ int on_nanosleep_unreleased_ref(void *ctx) SEC("xdp") __failure -__msg("Expected a dynptr of type file as arg #0") +__msg("Expected a dynptr of type file as R1") int xdp_wrong_dynptr_type(struct xdp_md *xdp) { struct bpf_dynptr dynptr; @@ -42,7 +42,7 @@ int xdp_wrong_dynptr_type(struct xdp_md *xdp) SEC("xdp") __failure -__msg("Expected an initialized dynptr as arg #0") +__msg("Expected an initialized dynptr as R1") int xdp_no_dynptr_type(struct xdp_md *xdp) { struct bpf_dynptr dynptr; @@ -50,3 +50,63 @@ int xdp_no_dynptr_type(struct xdp_md *xdp) bpf_dynptr_file_discard(&dynptr); return 0; } + +SEC("lsm/file_open") +__failure +__msg("Leaking reference id={{[0-9]+}} alloc_insn={{[0-9]+}}. 
Release it first.") +int use_file_dynptr_after_put_file(void *ctx) +{ + struct task_struct *task = bpf_get_current_task_btf(); + struct file *file = bpf_get_task_exe_file(task); + struct bpf_dynptr dynptr; + char buf[64]; + + if (!file) + return 0; + + if (bpf_dynptr_from_file(file, 0, &dynptr)) + goto out; + + /* this should fail - file dynptr should be discarded first to prevent resource leak */ + bpf_put_file(file); + + bpf_dynptr_read(buf, sizeof(buf), &dynptr, 0, 0); + return 0; + +out: + bpf_dynptr_file_discard(&dynptr); + bpf_put_file(file); + return 0; +} + +SEC("lsm/file_open") +__failure +__msg("Leaking reference id={{[0-9]+}} alloc_insn={{[0-9]+}}. Release it first.") +int use_file_dynptr_slice_after_put_file(void *ctx) +{ + struct task_struct *task = bpf_get_current_task_btf(); + struct file *file = bpf_get_task_exe_file(task); + struct bpf_dynptr dynptr; + char buf[1]; + const char *data; + + if (!file) + return 0; + + if (bpf_dynptr_from_file(file, 0, &dynptr)) + goto out; + + data = bpf_dynptr_slice(&dynptr, 0, buf, sizeof(buf)); + if (!data) + goto out; + + /* this should fail - file dynptr should be discarded first to prevent resource leak */ + bpf_put_file(file); + + return data[0]; + +out: + bpf_dynptr_file_discard(&dynptr); + bpf_put_file(file); + return 0; +} diff --git a/tools/testing/selftests/bpf/progs/htab_update.c b/tools/testing/selftests/bpf/progs/htab_update.c index 195d3b2fba00..62c1b1325ec2 100644 --- a/tools/testing/selftests/bpf/progs/htab_update.c +++ b/tools/testing/selftests/bpf/progs/htab_update.c @@ -22,8 +22,8 @@ struct { int pid = 0; int update_err = 0; -SEC("?fentry/bpf_obj_free_fields") -int bpf_obj_free_fields(void *ctx) +SEC("?fentry/bpf_obj_cancel_fields") +int bpf_obj_cancel_fields(void *ctx) { __u32 key = 0; struct val value = { .payload = 1 }; diff --git a/tools/testing/selftests/bpf/progs/irq.c b/tools/testing/selftests/bpf/progs/irq.c index e11e82d98904..a4a007866a33 100644 --- 
a/tools/testing/selftests/bpf/progs/irq.c +++ b/tools/testing/selftests/bpf/progs/irq.c @@ -15,7 +15,7 @@ struct bpf_res_spin_lock lockA __hidden SEC(".data.A"); struct bpf_res_spin_lock lockB __hidden SEC(".data.B"); SEC("?tc") -__failure __msg("arg#0 doesn't point to an irq flag on stack") +__failure __msg("R1 doesn't point to an irq flag on stack") int irq_save_bad_arg(struct __sk_buff *ctx) { bpf_local_irq_save(&global_flags); @@ -23,7 +23,7 @@ int irq_save_bad_arg(struct __sk_buff *ctx) } SEC("?tc") -__failure __msg("arg#0 doesn't point to an irq flag on stack") +__failure __msg("R1 doesn't point to an irq flag on stack") int irq_restore_bad_arg(struct __sk_buff *ctx) { bpf_local_irq_restore(&global_flags); diff --git a/tools/testing/selftests/bpf/progs/iters.c b/tools/testing/selftests/bpf/progs/iters.c index 86b74e3579d9..0fa70b133d93 100644 --- a/tools/testing/selftests/bpf/progs/iters.c +++ b/tools/testing/selftests/bpf/progs/iters.c @@ -1605,7 +1605,7 @@ int iter_subprog_check_stacksafe(const void *ctx) struct bpf_iter_num global_it; SEC("raw_tp") -__failure __msg("arg#0 expected pointer to an iterator on stack") +__failure __msg("R1 expected pointer to an iterator on stack") int iter_new_bad_arg(const void *ctx) { bpf_iter_num_new(&global_it, 0, 1); @@ -1613,7 +1613,7 @@ int iter_new_bad_arg(const void *ctx) } SEC("raw_tp") -__failure __msg("arg#0 expected pointer to an iterator on stack") +__failure __msg("R1 expected pointer to an iterator on stack") int iter_next_bad_arg(const void *ctx) { bpf_iter_num_next(&global_it); @@ -1621,7 +1621,7 @@ int iter_next_bad_arg(const void *ctx) } SEC("raw_tp") -__failure __msg("arg#0 expected pointer to an iterator on stack") +__failure __msg("R1 expected pointer to an iterator on stack") int iter_destroy_bad_arg(const void *ctx) { bpf_iter_num_destroy(&global_it); diff --git a/tools/testing/selftests/bpf/progs/iters_state_safety.c b/tools/testing/selftests/bpf/progs/iters_state_safety.c index 
d273b46dfc7c..646026430e9b 100644 --- a/tools/testing/selftests/bpf/progs/iters_state_safety.c +++ b/tools/testing/selftests/bpf/progs/iters_state_safety.c @@ -30,7 +30,7 @@ int force_clang_to_emit_btf_for_externs(void *ctx) SEC("?raw_tp") __success __log_level(2) -__msg("fp-8=iter_num(ref_id=1,state=active,depth=0)") +__msg("fp-8=iter_num(id=1,state=active,depth=0)") int create_and_destroy(void *ctx) { struct bpf_iter_num iter; @@ -73,7 +73,7 @@ int create_and_forget_to_destroy_fail(void *ctx) } SEC("?raw_tp") -__failure __msg("expected an initialized iter_num as arg #0") +__failure __msg("expected an initialized iter_num as R1") int destroy_without_creating_fail(void *ctx) { /* init with zeros to stop verifier complaining about uninit stack */ @@ -91,7 +91,7 @@ int destroy_without_creating_fail(void *ctx) } SEC("?raw_tp") -__failure __msg("expected an initialized iter_num as arg #0") +__failure __msg("expected an initialized iter_num as R1") int compromise_iter_w_direct_write_fail(void *ctx) { struct bpf_iter_num iter; @@ -143,7 +143,7 @@ int compromise_iter_w_direct_write_and_skip_destroy_fail(void *ctx) } SEC("?raw_tp") -__failure __msg("expected an initialized iter_num as arg #0") +__failure __msg("expected an initialized iter_num as R1") int compromise_iter_w_helper_write_fail(void *ctx) { struct bpf_iter_num iter; @@ -196,7 +196,7 @@ int leak_iter_from_subprog_fail(void *ctx) SEC("?raw_tp") __success __log_level(2) -__msg("fp-8=iter_num(ref_id=1,state=active,depth=0)") +__msg("fp-8=iter_num(id=1,state=active,depth=0)") int valid_stack_reuse(void *ctx) { struct bpf_iter_num iter; @@ -230,7 +230,7 @@ int valid_stack_reuse(void *ctx) } SEC("?raw_tp") -__failure __msg("expected uninitialized iter_num as arg #0") +__failure __msg("expected uninitialized iter_num as R1") int double_create_fail(void *ctx) { struct bpf_iter_num iter; @@ -258,7 +258,7 @@ int double_create_fail(void *ctx) } SEC("?raw_tp") -__failure __msg("expected an initialized iter_num as arg #0") 
+__failure __msg("expected an initialized iter_num as R1") int double_destroy_fail(void *ctx) { struct bpf_iter_num iter; @@ -284,7 +284,7 @@ int double_destroy_fail(void *ctx) } SEC("?raw_tp") -__failure __msg("expected an initialized iter_num as arg #0") +__failure __msg("expected an initialized iter_num as R1") int next_without_new_fail(void *ctx) { struct bpf_iter_num iter; @@ -305,7 +305,7 @@ int next_without_new_fail(void *ctx) } SEC("?raw_tp") -__failure __msg("expected an initialized iter_num as arg #0") +__failure __msg("expected an initialized iter_num as R1") int next_after_destroy_fail(void *ctx) { struct bpf_iter_num iter; diff --git a/tools/testing/selftests/bpf/progs/iters_testmod.c b/tools/testing/selftests/bpf/progs/iters_testmod.c index 5379e9960ffd..76012dbbdb41 100644 --- a/tools/testing/selftests/bpf/progs/iters_testmod.c +++ b/tools/testing/selftests/bpf/progs/iters_testmod.c @@ -29,7 +29,7 @@ out: } SEC("raw_tp/sys_enter") -__failure __msg("Possibly NULL pointer passed to trusted arg0") +__failure __msg("Possibly NULL pointer passed to trusted R1") int iter_next_trusted_or_null(const void *ctx) { struct task_struct *cur_task = bpf_get_current_task_btf(); @@ -67,7 +67,7 @@ out: } SEC("raw_tp/sys_enter") -__failure __msg("Possibly NULL pointer passed to trusted arg0") +__failure __msg("Possibly NULL pointer passed to trusted R1") int iter_next_rcu_or_null(const void *ctx) { struct task_struct *cur_task = bpf_get_current_task_btf(); diff --git a/tools/testing/selftests/bpf/progs/iters_testmod_seq.c b/tools/testing/selftests/bpf/progs/iters_testmod_seq.c index 83791348bed5..d00888f6687a 100644 --- a/tools/testing/selftests/bpf/progs/iters_testmod_seq.c +++ b/tools/testing/selftests/bpf/progs/iters_testmod_seq.c @@ -20,8 +20,8 @@ __s64 res_empty; SEC("raw_tp/sys_enter") __success __log_level(2) -__msg("fp-16=iter_testmod_seq(ref_id=1,state=active,depth=0)") -__msg("fp-16=iter_testmod_seq(ref_id=1,state=drained,depth=0)") 
+__msg("fp-16=iter_testmod_seq(id=1,state=active,depth=0)") +__msg("fp-16=iter_testmod_seq(id=1,state=drained,depth=0)") __msg("call bpf_iter_testmod_seq_destroy") int testmod_seq_empty(const void *ctx) { @@ -38,8 +38,8 @@ __s64 res_full; SEC("raw_tp/sys_enter") __success __log_level(2) -__msg("fp-16=iter_testmod_seq(ref_id=1,state=active,depth=0)") -__msg("fp-16=iter_testmod_seq(ref_id=1,state=drained,depth=0)") +__msg("fp-16=iter_testmod_seq(id=1,state=active,depth=0)") +__msg("fp-16=iter_testmod_seq(id=1,state=drained,depth=0)") __msg("call bpf_iter_testmod_seq_destroy") int testmod_seq_full(const void *ctx) { @@ -58,8 +58,8 @@ static volatile int zero = 0; SEC("raw_tp/sys_enter") __success __log_level(2) -__msg("fp-16=iter_testmod_seq(ref_id=1,state=active,depth=0)") -__msg("fp-16=iter_testmod_seq(ref_id=1,state=drained,depth=0)") +__msg("fp-16=iter_testmod_seq(id=1,state=active,depth=0)") +__msg("fp-16=iter_testmod_seq(id=1,state=drained,depth=0)") __msg("call bpf_iter_testmod_seq_destroy") int testmod_seq_truncated(const void *ctx) { @@ -79,7 +79,7 @@ int testmod_seq_truncated(const void *ctx) SEC("?raw_tp") __failure -__msg("expected an initialized iter_testmod_seq as arg #1") +__msg("expected an initialized iter_testmod_seq as R2") int testmod_seq_getter_before_bad(const void *ctx) { struct bpf_iter_testmod_seq it; @@ -89,7 +89,7 @@ int testmod_seq_getter_before_bad(const void *ctx) SEC("?raw_tp") __failure -__msg("expected an initialized iter_testmod_seq as arg #1") +__msg("expected an initialized iter_testmod_seq as R2") int testmod_seq_getter_after_bad(const void *ctx) { struct bpf_iter_testmod_seq it; diff --git a/tools/testing/selftests/bpf/progs/linked_list.c b/tools/testing/selftests/bpf/progs/linked_list.c index 421f40835acd..fa97faa5358b 100644 --- a/tools/testing/selftests/bpf/progs/linked_list.c +++ b/tools/testing/selftests/bpf/progs/linked_list.c @@ -290,6 +290,77 @@ int test_list_in_list(struct bpf_spin_lock *lock, struct bpf_list_head *head) 
return list_in_list(lock, head, true); } +#define MAX_LIST_CLEAR_NODES 256 + +static __always_inline +int clear_list(struct bpf_spin_lock *lock, struct bpf_list_head *head) +{ + struct bpf_list_node *n; + int i; + + for (i = 0; i < MAX_LIST_CLEAR_NODES; i++) { + bpf_spin_lock(lock); + n = bpf_list_pop_front(head); + bpf_spin_unlock(lock); + if (!n) + return 0; + bpf_obj_drop(container_of(n, struct foo, node2)); + } + return 1; +} + +SEC("syscall") +int clear_map_list(void *ctx) +{ + struct map_value *v; + + v = bpf_map_lookup_elem(&array_map, &(int){0}); + if (!v) + return 1; + return clear_list(&v->lock, &v->head); +} + +SEC("syscall") +int clear_inner_map_list(void *ctx) +{ + struct map_value *v; + void *map; + + map = bpf_map_lookup_elem(&map_of_maps, &(int){0}); + if (!map) + return 1; + v = bpf_map_lookup_elem(map, &(int){0}); + if (!v) + return 1; + return clear_list(&v->lock, &v->head); +} + +SEC("syscall") +int clear_global_list(void *ctx) +{ + return clear_list(&glock, &ghead); +} + +SEC("syscall") +int clear_global_nested_list(void *ctx) +{ + return clear_list(&ghead_nested.inner.lock, &ghead_nested.inner.head); +} + +SEC("syscall") +int clear_global_array_list(void *ctx) +{ + int ret; + + ret = clear_list(&glock_c, &ghead_array[0]); + if (ret) + return ret; + ret = clear_list(&glock_c, &ghead_array[1]); + if (ret) + return ret; + return clear_list(&glock_c, &ghead_array_one[0]); +} + SEC("tc") int map_list_push_pop(void *ctx) { diff --git a/tools/testing/selftests/bpf/progs/lpm_trie_bench.c b/tools/testing/selftests/bpf/progs/lpm_trie_bench.c index a0e6ebd5507a..2831cf4445e8 100644 --- a/tools/testing/selftests/bpf/progs/lpm_trie_bench.c +++ b/tools/testing/selftests/bpf/progs/lpm_trie_bench.c @@ -7,7 +7,7 @@ #include <bpf/bpf_helpers.h> #include <bpf/bpf_core_read.h> #include "bpf_misc.h" -#include "bpf_atomic.h" +#include <bpf_atomic.h> #include "progs/lpm_trie.h" #define BPF_OBJ_NAME_LEN 16U diff --git 
a/tools/testing/selftests/bpf/progs/lru_lock_nmi.c b/tools/testing/selftests/bpf/progs/lru_lock_nmi.c new file mode 100644 index 000000000000..c0692cd54237 --- /dev/null +++ b/tools/testing/selftests/bpf/progs/lru_lock_nmi.c @@ -0,0 +1,33 @@ +// SPDX-License-Identifier: GPL-2.0 +#include <vmlinux.h> +#include <bpf/bpf_helpers.h> + +struct { + __uint(type, BPF_MAP_TYPE_LRU_HASH); + __uint(max_entries, 64); + __type(key, __u32); + __type(value, __u64); +} lru_map SEC(".maps"); + +int hits; + +SEC("perf_event") +int oncpu(void *ctx) +{ + /* + * Key range deliberately wider than max_entries to force LRU + * eviction on every other update. + */ + __u32 key = bpf_get_prandom_u32() % 128; + bool do_update = bpf_get_prandom_u32() & 1; + __u64 val = 1; + + if (do_update) + bpf_map_update_elem(&lru_map, &key, &val, BPF_ANY); + else + bpf_map_delete_elem(&lru_map, &key); + __sync_fetch_and_add(&hits, 1); + return 0; +} + +char _license[] SEC("license") = "GPL"; diff --git a/tools/testing/selftests/bpf/progs/lsm_cgroup.c b/tools/testing/selftests/bpf/progs/lsm_cgroup.c index d7598538aa2d..3bfa479104be 100644 --- a/tools/testing/selftests/bpf/progs/lsm_cgroup.c +++ b/tools/testing/selftests/bpf/progs/lsm_cgroup.c @@ -35,6 +35,8 @@ int called_socket_bind; int called_socket_bind2; int called_socket_alloc; int called_socket_clone; +int skipcap_retval = -4095; +int socket_retval = -4095; static __always_inline int test_local_storage(void) { @@ -190,3 +192,31 @@ int BPF_PROG(socket_clone, struct sock *newsk, const struct request_sock *req) return 1; } + +SEC("lsm_cgroup/inode_xattr_skipcap") +int BPF_PROG(skipcap_first, const char *name) +{ + return 0; +} + +SEC("lsm_cgroup/inode_xattr_skipcap") +int BPF_PROG(skipcap_second, const char *name) +{ + skipcap_retval = bpf_get_retval(); + bpf_set_retval(0); + return 1; +} + +SEC("lsm_cgroup/socket_create") +int BPF_PROG(socket_first, int family, int type, int protocol, int kern) +{ + return 0; +} + +SEC("lsm_cgroup/socket_create") +int 
BPF_PROG(socket_second, int family, int type, int protocol, int kern) +{ + socket_retval = bpf_get_retval(); + bpf_set_retval(0); + return 1; +} diff --git a/tools/testing/selftests/bpf/progs/map_kptr.c b/tools/testing/selftests/bpf/progs/map_kptr.c index e708ffbe1f61..3fbefc568e0a 100644 --- a/tools/testing/selftests/bpf/progs/map_kptr.c +++ b/tools/testing/selftests/bpf/progs/map_kptr.c @@ -489,8 +489,7 @@ int test_map_kptr_ref3(struct __sk_buff *ctx) int num_of_refs; -SEC("syscall") -int count_ref(void *ctx) +static __always_inline int read_ref_count(void) { struct prog_test_ref_kfunc *p; unsigned long arg = 0; @@ -500,12 +499,96 @@ int count_ref(void *ctx) return 1; num_of_refs = p->cnt.refs.counter; - bpf_kfunc_call_test_release(p); return 0; } SEC("syscall") +int count_ref(void *ctx) +{ + return read_ref_count(); +} + +static __always_inline int stash_ref_ptr(struct map_value *v) +{ + struct prog_test_ref_kfunc *p, *old; + unsigned long arg = 0; + + p = bpf_kfunc_call_test_acquire(&arg); + if (!p) + return 1; + + old = bpf_kptr_xchg(&v->ref_ptr, p); + if (old) { + bpf_kfunc_call_test_release(old); + old = bpf_kptr_xchg(&v->ref_ptr, NULL); + if (old) + bpf_kfunc_call_test_release(old); + return 2; + } + return 0; +} + +static __always_inline int check_refs(int expected) +{ + int ret; + + ret = read_ref_count(); + if (ret) + return ret; + return num_of_refs == expected ? 
0 : 3; +} + +SEC("syscall") +int test_array_map_update_kptr(void *ctx) +{ + struct map_value init = {}, *v; + int key = 0, ret; + + v = bpf_map_lookup_elem(&array_map, &key); + if (!v) + return 1; + ret = stash_ref_ptr(v); + if (ret) + return ret; + ret = check_refs(3); + if (ret) + return ret; + ret = bpf_map_update_elem(&array_map, &key, &init, BPF_EXIST); + if (ret) + return 4; + return check_refs(3); +} + +#define DEFINE_HASH_UPDATE_KPTR_TEST(name, map) \ +SEC("syscall") \ +int name(void *ctx) \ +{ \ + struct map_value init = {}, *v; \ + int key = 0, ret; \ + \ + ret = bpf_map_update_elem(&map, &key, &init, BPF_NOEXIST); \ + if (ret) \ + return 1; \ + v = bpf_map_lookup_elem(&map, &key); \ + if (!v) \ + return 2; \ + ret = stash_ref_ptr(v); \ + if (ret) \ + return ret; \ + ret = check_refs(3); \ + if (ret) \ + return ret; \ + ret = bpf_map_update_elem(&map, &key, &init, BPF_EXIST); \ + if (ret) \ + return 4; \ + return check_refs(3); \ +} + +DEFINE_HASH_UPDATE_KPTR_TEST(test_hash_map_update_kptr, hash_map) +DEFINE_HASH_UPDATE_KPTR_TEST(test_hash_malloc_map_update_kptr, hash_malloc_map) + +SEC("syscall") int test_ls_map_kptr_ref1(void *ctx) { struct task_struct *current; diff --git a/tools/testing/selftests/bpf/progs/map_kptr_fail.c b/tools/testing/selftests/bpf/progs/map_kptr_fail.c index ee053b24e6ca..f11848dfa78f 100644 --- a/tools/testing/selftests/bpf/progs/map_kptr_fail.c +++ b/tools/testing/selftests/bpf/progs/map_kptr_fail.c @@ -252,7 +252,7 @@ int reject_untrusted_store_to_ref(struct __sk_buff *ctx) } SEC("?tc") -__failure __msg("R2 must be referenced") +__failure __msg("release helper bpf_kptr_xchg expects referenced PTR_TO_BTF_ID passed to R2") int reject_untrusted_xchg(struct __sk_buff *ctx) { struct prog_test_ref_kfunc *p; @@ -364,7 +364,7 @@ int kptr_xchg_ref_state(struct __sk_buff *ctx) } SEC("?tc") -__failure __msg("Possibly NULL pointer passed to helper arg2") +__failure __msg("Possibly NULL pointer passed to helper R2") int 
kptr_xchg_possibly_null(struct __sk_buff *ctx) { struct prog_test_ref_kfunc *p; diff --git a/tools/testing/selftests/bpf/progs/percpu_alloc_fail.c b/tools/testing/selftests/bpf/progs/percpu_alloc_fail.c index 81813c724fa9..08379c3b6a03 100644 --- a/tools/testing/selftests/bpf/progs/percpu_alloc_fail.c +++ b/tools/testing/selftests/bpf/progs/percpu_alloc_fail.c @@ -110,7 +110,7 @@ int BPF_PROG(test_array_map_3) } SEC("?fentry.s/bpf_fentry_test1") -__failure __msg("arg#0 expected for bpf_percpu_obj_drop()") +__failure __msg("R1 expected for bpf_percpu_obj_drop()") int BPF_PROG(test_array_map_4) { struct val_t __percpu_kptr *p; @@ -124,7 +124,7 @@ int BPF_PROG(test_array_map_4) } SEC("?fentry.s/bpf_fentry_test1") -__failure __msg("arg#0 expected for bpf_obj_drop()") +__failure __msg("R1 expected for bpf_obj_drop()") int BPF_PROG(test_array_map_5) { struct val_t *p; diff --git a/tools/testing/selftests/bpf/progs/rbtree_fail.c b/tools/testing/selftests/bpf/progs/rbtree_fail.c index 70b7baf9304b..555379952dcc 100644 --- a/tools/testing/selftests/bpf/progs/rbtree_fail.c +++ b/tools/testing/selftests/bpf/progs/rbtree_fail.c @@ -134,7 +134,7 @@ unlock_err: } SEC("?tc") -__failure __msg("arg#1 expected pointer to allocated object") +__failure __msg("R2 expected pointer to allocated object") long rbtree_api_add_to_multiple_trees(void *ctx) { struct node_data *n; @@ -153,7 +153,7 @@ long rbtree_api_add_to_multiple_trees(void *ctx) } SEC("?tc") -__failure __msg("Possibly NULL pointer passed to trusted arg1") +__failure __msg("Possibly NULL pointer passed to trusted R2") long rbtree_api_use_unchecked_remove_retval(void *ctx) { struct bpf_rb_node *res; @@ -281,7 +281,7 @@ long add_with_cb(bool (cb)(struct bpf_rb_node *a, const struct bpf_rb_node *b)) } SEC("?tc") -__failure __msg("arg#1 expected pointer to allocated object") +__failure __msg("R2 expected pointer to allocated object") long rbtree_api_add_bad_cb_bad_fn_call_add(void *ctx) { return 
add_with_cb(less__bad_fn_call_add); diff --git a/tools/testing/selftests/bpf/progs/refcounted_kptr.c b/tools/testing/selftests/bpf/progs/refcounted_kptr.c index c847398837cc..61906f48025c 100644 --- a/tools/testing/selftests/bpf/progs/refcounted_kptr.c +++ b/tools/testing/selftests/bpf/progs/refcounted_kptr.c @@ -368,6 +368,427 @@ INSERT_STASH_READ(true, "insert_stash_read: remove from tree"); INSERT_STASH_READ(false, "insert_stash_read: don't remove from tree"); SEC("tc") +__description("list_empty_test: list empty before add, non-empty after add") +__success __retval(0) +int list_empty_test(void *ctx) +{ + struct node_data *node_new; + + bpf_spin_lock(&lock); + if (!bpf_list_empty(&head)) { + bpf_spin_unlock(&lock); + return -1; + } + bpf_spin_unlock(&lock); + + node_new = bpf_obj_new(typeof(*node_new)); + if (!node_new) + return -2; + + bpf_spin_lock(&lock); + bpf_list_push_front(&head, &node_new->l); + + if (bpf_list_empty(&head)) { + bpf_spin_unlock(&lock); + return -3; + } + bpf_spin_unlock(&lock); + return 0; +} + +static struct node_data *__add_in_list(struct bpf_list_head *head, + struct bpf_spin_lock *lock) +{ + struct node_data *node_new, *node_ref; + + node_new = bpf_obj_new(typeof(*node_new)); + if (!node_new) + return NULL; + + node_ref = bpf_refcount_acquire(node_new); + + bpf_spin_lock(lock); + bpf_list_push_front(head, &node_new->l); + bpf_spin_unlock(lock); + return node_ref; +} + +SEC("tc") +__description("list_is_edge_test1: is_first on first node, is_last on last node") +__success __retval(0) +int list_is_edge_test1(void *ctx) +{ + struct node_data *node_first, *node_last; + int err = 0; + + node_last = __add_in_list(&head, &lock); + if (!node_last) + return -1; + + node_first = __add_in_list(&head, &lock); + if (!node_first) { + bpf_obj_drop(node_last); + return -2; + } + + bpf_spin_lock(&lock); + if (!bpf_list_is_first(&head, &node_first->l)) { + err = -3; + goto fail; + } + if (!bpf_list_is_last(&head, &node_last->l)) + err = -4; + +fail: + 
bpf_spin_unlock(&lock); + bpf_obj_drop(node_first); + bpf_obj_drop(node_last); + return err; +} + +SEC("tc") +__description("list_is_edge_test2: accept list_front/list_back return value") +__success __retval(0) +int list_is_edge_test2(void *ctx) +{ + struct bpf_list_node *front, *back; + struct node_data *a, *b; + long err = 0; + + a = __add_in_list(&head, &lock); + if (!a) + return -1; + + b = __add_in_list(&head, &lock); + if (!b) { + bpf_obj_drop(a); + return -2; + } + + bpf_spin_lock(&lock); + front = bpf_list_front(&head); + back = bpf_list_back(&head); + if (!front || !back) { + err = -3; + goto out_unlock; + } + + if (!bpf_list_is_first(&head, front) || bpf_list_is_last(&head, front)) { + err = -4; + goto out_unlock; + } + + if (!bpf_list_is_last(&head, back) || bpf_list_is_first(&head, back)) { + err = -5; + goto out_unlock; + } + +out_unlock: + bpf_spin_unlock(&lock); + bpf_obj_drop(a); + bpf_obj_drop(b); + return err; +} + +SEC("tc") +__description("list_is_edge_test3: single node is both first and last") +__success __retval(0) +int list_is_edge_test3(void *ctx) +{ + struct node_data *tmp; + struct bpf_list_node *node; + long err = 0; + + tmp = __add_in_list(&head, &lock); + if (!tmp) + return -1; + + bpf_spin_lock(&lock); + node = bpf_list_front(&head); + if (!node) { + bpf_spin_unlock(&lock); + bpf_obj_drop(tmp); + return -2; + } + + if (!bpf_list_is_first(&head, node) || !bpf_list_is_last(&head, node)) + err = -3; + bpf_spin_unlock(&lock); + + bpf_obj_drop(tmp); + return err; +} + +SEC("tc") +__description("list_del_test1: del returns removed nodes") +__success __retval(0) +int list_del_test1(void *ctx) +{ + struct node_data *node_first, *node_last; + struct bpf_list_node *bpf_node_first, *bpf_node_last; + int err = 0; + + node_last = __add_in_list(&head, &lock); + if (!node_last) + return -1; + + node_first = __add_in_list(&head, &lock); + if (!node_first) { + bpf_obj_drop(node_last); + return -2; + } + + bpf_spin_lock(&lock); + bpf_node_last = 
bpf_list_del(&head, &node_last->l); + bpf_node_first = bpf_list_del(&head, &node_first->l); + bpf_spin_unlock(&lock); + + if (bpf_node_first) + bpf_obj_drop(container_of(bpf_node_first, struct node_data, l)); + else + err = -3; + + if (bpf_node_last) + bpf_obj_drop(container_of(bpf_node_last, struct node_data, l)); + else + err = -4; + + bpf_obj_drop(node_first); + bpf_obj_drop(node_last); + return err; +} + +SEC("tc") +__description("list_del_test2: remove an arbitrary node from the list") +__success __retval(0) +int list_del_test2(void *ctx) +{ + struct bpf_rb_node *rb; + struct bpf_list_node *l; + struct node_data *n; + long err; + + err = __insert_in_tree_and_list(&head, &root, &lock); + if (err) + return err; + + bpf_spin_lock(&lock); + rb = bpf_rbtree_first(&root); + if (!rb) { + bpf_spin_unlock(&lock); + return -4; + } + + rb = bpf_rbtree_remove(&root, rb); + if (!rb) { + bpf_spin_unlock(&lock); + return -5; + } + + n = container_of(rb, struct node_data, r); + l = bpf_list_del(&head, &n->l); + bpf_spin_unlock(&lock); + bpf_obj_drop(n); + if (!l) + return -6; + + bpf_obj_drop(container_of(l, struct node_data, l)); + return 0; +} + +SEC("tc") +__description("list_del_test3: list_del accepts list_front return value as node") +__success __retval(0) +int list_del_test3(void *ctx) +{ + struct node_data *tmp; + struct bpf_list_node *bpf_node, *l; + long err = 0; + + tmp = __add_in_list(&head, &lock); + if (!tmp) + return -1; + + bpf_spin_lock(&lock); + bpf_node = bpf_list_front(&head); + if (!bpf_node) { + bpf_spin_unlock(&lock); + err = -2; + goto fail; + } + + l = bpf_list_del(&head, bpf_node); + bpf_spin_unlock(&lock); + if (!l) { + err = -3; + goto fail; + } + + bpf_obj_drop(container_of(l, struct node_data, l)); + bpf_obj_drop(tmp); + return 0; + +fail: + bpf_obj_drop(tmp); + return err; +} + +SEC("tc") +__description("list_add_test1: insert new node after prev") +__success __retval(0) +int list_add_test1(void *ctx) +{ + struct node_data *node_first; + struct 
node_data *new_node; + long err = 0; + + node_first = __add_in_list(&head, &lock); + if (!node_first) + return -1; + + new_node = bpf_obj_new(typeof(*new_node)); + if (!new_node) { + err = -2; + goto fail; + } + + bpf_spin_lock(&lock); + err = bpf_list_add(&head, &new_node->l, &node_first->l); + bpf_spin_unlock(&lock); + if (err) { + err = -3; + goto fail; + } + +fail: + bpf_obj_drop(node_first); + return err; +} + +SEC("tc") +__description("list_add_test2: list_add accepts list_front return value as prev") +__success __retval(0) +int list_add_test2(void *ctx) +{ + struct node_data *new_node, *tmp; + struct bpf_list_node *bpf_node; + long err = 0; + + tmp = __add_in_list(&head, &lock); + if (!tmp) + return -1; + + new_node = bpf_obj_new(typeof(*new_node)); + if (!new_node) { + err = -2; + goto fail; + } + + bpf_spin_lock(&lock); + bpf_node = bpf_list_front(&head); + if (!bpf_node) { + bpf_spin_unlock(&lock); + bpf_obj_drop(new_node); + err = -3; + goto fail; + } + + err = bpf_list_add(&head, &new_node->l, bpf_node); + bpf_spin_unlock(&lock); + if (err) { + err = -4; + goto fail; + } + +fail: + bpf_obj_drop(tmp); + return err; +} + +struct uninit_head_val { + struct bpf_spin_lock lock; + struct bpf_list_head head __contains(node_data, l); +}; + +struct { + __uint(type, BPF_MAP_TYPE_ARRAY); + __type(key, int); + __type(value, struct uninit_head_val); + __uint(max_entries, 1); +} uninit_head_map SEC(".maps"); + +SEC("tc") +__description("list_push_back_uninit_head: push_back on 0-initialized list head") +__success __retval(0) +int list_push_back_uninit_head(void *ctx) +{ + struct uninit_head_val *st; + struct node_data *node; + int ret = -1, key = 0; + + st = bpf_map_lookup_elem(&uninit_head_map, &key); + if (!st) + return -1; + + node = bpf_obj_new(typeof(*node)); + if (!node) + return -1; + + bpf_spin_lock(&st->lock); + ret = bpf_list_push_back(&st->head, &node->l); + bpf_spin_unlock(&st->lock); + + return ret; +} + +SEC("?tc") +__failure __msg("bpf_spin_lock at 
off=32 must be held for bpf_list_head") +long list_del_without_lock_fail(void *ctx) +{ + struct node_data *n; + struct bpf_list_node *l; + + n = bpf_obj_new(typeof(*n)); + if (!n) + return -1; + + /* Error case: delete list node without holding lock */ + l = bpf_list_del(&head, &n->l); + bpf_obj_drop(n); + if (!l) + return -2; + bpf_obj_drop(container_of(l, struct node_data, l)); + + return 0; +} + +SEC("?tc") +__failure __msg("bpf_spin_lock at off=32 must be held for bpf_list_head") +long list_add_without_lock_fail(void *ctx) +{ + struct node_data *n, *prev; + long err; + + n = bpf_obj_new(typeof(*n)); + if (!n) + return -1; + + prev = bpf_obj_new(typeof(*prev)); + if (!prev) { + bpf_obj_drop(n); + return -1; + } + + /* Error case: add list node without holding lock */ + err = bpf_list_add(&head, &n->l, &prev->l); + bpf_obj_drop(prev); + if (err) + return -2; + + return 0; +} + +SEC("tc") __success long rbtree_refcounted_node_ref_escapes(void *ctx) { @@ -615,13 +1036,31 @@ int percpu_hash_refcount_leak(void *ctx) struct map_value *v; int key = 0; - v = bpf_map_lookup_elem(&percpu_hash, &key); + v = bpf_map_lookup_percpu_elem(&percpu_hash, &key, 0); if (!v) return 0; return __insert_in_list(&head, &lock, &v->node); } +SEC("syscall") +int clear_percpu_hash_kptr(void *ctx) +{ + struct node_data *n; + struct map_value *v; + int key = 0; + + v = bpf_map_lookup_percpu_elem(&percpu_hash, &key, 0); + if (!v) + return 0; + + n = bpf_kptr_xchg(&v->node, NULL); + if (!n) + return 0; + bpf_obj_drop(n); + return probe_read_refcount(); +} + SEC("tc") int check_percpu_hash_refcount(void *ctx) { diff --git a/tools/testing/selftests/bpf/progs/refcounted_kptr_fail.c b/tools/testing/selftests/bpf/progs/refcounted_kptr_fail.c index b2808bfcec29..024ef2aae200 100644 --- a/tools/testing/selftests/bpf/progs/refcounted_kptr_fail.c +++ b/tools/testing/selftests/bpf/progs/refcounted_kptr_fail.c @@ -13,12 +13,20 @@ struct node_acquire { struct bpf_refcount refcount; }; +struct 
node_refcounted { + long key; + struct bpf_list_node list; + struct bpf_refcount refcount; +}; + extern void bpf_rcu_read_lock(void) __ksym; extern void bpf_rcu_read_unlock(void) __ksym; #define private(name) SEC(".data." #name) __hidden __attribute__((aligned(8))) private(A) struct bpf_spin_lock glock; private(A) struct bpf_rb_root groot __contains(node_acquire, node); +private(B) struct bpf_spin_lock lock; +private(B) struct bpf_list_head head __contains(node_refcounted, list); static bool less(struct bpf_rb_node *a, const struct bpf_rb_node *b) { @@ -54,7 +62,7 @@ long rbtree_refcounted_node_ref_escapes(void *ctx) } SEC("?tc") -__failure __msg("Possibly NULL pointer passed to trusted arg0") +__failure __msg("Possibly NULL pointer passed to trusted R1") long refcount_acquire_maybe_null(void *ctx) { struct node_acquire *n, *m; @@ -93,6 +101,32 @@ long rbtree_refcounted_node_ref_escapes_owning_input(void *ctx) return 0; } +SEC("?tc") +__failure __msg("dereference of modified ptr_ ptr R1") +long refcount_acquire_list_node_offset(void *ctx) +{ + struct node_refcounted *node, *base, *ref; + struct bpf_list_node *list_node; + + node = bpf_obj_new(typeof(*node)); + if (!node) + return 1; + + bpf_spin_lock(&lock); + bpf_list_push_front(&head, &node->list); + list_node = bpf_list_pop_front(&head); + bpf_spin_unlock(&lock); + if (!list_node) + return 2; + + base = container_of(list_node, struct node_refcounted, list); + ref = bpf_refcount_acquire(list_node); + if (ref) + bpf_obj_drop(ref); + bpf_obj_drop(base); + return 0; +} + SEC("?fentry.s/" SYS_PREFIX "sys_getpgid") __failure __msg("function calls are not allowed while holding a lock") int BPF_PROG(rbtree_fail_sleepable_lock_across_rcu, diff --git a/tools/testing/selftests/bpf/progs/rhash.c b/tools/testing/selftests/bpf/progs/rhash.c new file mode 100644 index 000000000000..fc2dac3a719e --- /dev/null +++ b/tools/testing/selftests/bpf/progs/rhash.c @@ -0,0 +1,248 @@ +// SPDX-License-Identifier: GPL-2.0 +/* Copyright (c) 
2026 Meta Platforms, Inc. and affiliates. */ + +#include <vmlinux.h> +#include <stdbool.h> +#include <bpf/bpf_helpers.h> +#include <bpf/bpf_tracing.h> +#include "bpf_misc.h" + +#define ENOENT 2 +#define EEXIST 17 + +char _license[] SEC("license") = "GPL"; + +int err; + +struct elem { + char arr[128]; + int val; +}; + +struct { + __uint(type, BPF_MAP_TYPE_RHASH); + __uint(map_flags, BPF_F_NO_PREALLOC); + __uint(max_entries, 128); + __type(key, int); + __type(value, struct elem); +} rhmap SEC(".maps"); + +SEC("syscall") +int test_rhash_lookup_update(void *ctx) +{ + int key = 5; + struct elem empty = {.val = 3, .arr = {0}}; + struct elem *e; + + err = 1; + e = bpf_map_lookup_elem(&rhmap, &key); + if (e) + return 1; + + err = bpf_map_update_elem(&rhmap, &key, &empty, BPF_NOEXIST); + if (err) + return 1; + + e = bpf_map_lookup_elem(&rhmap, &key); + if (!e || e->val != empty.val) { + err = 2; + return 2; + } + + err = 0; + return 0; +} + +SEC("syscall") +int test_rhash_update_delete(void *ctx) +{ + int key = 6; + struct elem empty = {.val = 4, .arr = {0}}; + struct elem *e; + + err = 1; + e = bpf_map_lookup_elem(&rhmap, &key); + if (e) + return 1; + + err = bpf_map_update_elem(&rhmap, &key, &empty, BPF_NOEXIST); + if (err) + return 2; + + err = bpf_map_delete_elem(&rhmap, &key); + if (err) + return 3; + + e = bpf_map_lookup_elem(&rhmap, &key); + if (e) { + err = 4; + return 4; + } + + err = 0; + return 0; +} + +SEC("syscall") +int test_rhash_update_elements(void *ctx) +{ + int key = 0; + struct elem empty = {.val = 4, .arr = {0}}; + struct elem *e; + int i; + + err = 1; + + for (i = 0; i < 128; ++i) { + key = i; + e = bpf_map_lookup_elem(&rhmap, &key); + if (e) + return 1; + + empty.val = key; + err = bpf_map_update_elem(&rhmap, &key, &empty, BPF_NOEXIST); + if (err) + return 2; + + e = bpf_map_lookup_elem(&rhmap, &key); + if (!e || e->val != key) { + err = 4; + return 4; + } + } + + for (i = 0; i < 128; ++i) { + key = i; + err = bpf_map_delete_elem(&rhmap, &key); + if 
(err) + return 3; + + e = bpf_map_lookup_elem(&rhmap, &key); + if (e) { + err = 5; + return 5; + } + } + + err = 0; + return 0; +} + +SEC("syscall") +int test_rhash_update_exist(void *ctx) +{ + int key = 10; + struct elem val1 = {.val = 100, .arr = {0}}; + struct elem val2 = {.val = 200, .arr = {0}}; + struct elem *e; + int ret; + + err = 1; + + /* BPF_EXIST on non-existent key should fail with -ENOENT */ + ret = bpf_map_update_elem(&rhmap, &key, &val1, BPF_EXIST); + if (ret != -ENOENT) + return 1; + + /* Insert element first */ + ret = bpf_map_update_elem(&rhmap, &key, &val1, BPF_NOEXIST); + if (ret) + return 2; + + /* Verify initial value */ + e = bpf_map_lookup_elem(&rhmap, &key); + if (!e || e->val != 100) + return 3; + + /* BPF_EXIST on existing key should succeed and update value */ + ret = bpf_map_update_elem(&rhmap, &key, &val2, BPF_EXIST); + if (ret) + return 4; + + /* Verify value was updated */ + e = bpf_map_lookup_elem(&rhmap, &key); + if (!e || e->val != 200) + return 5; + + /* Cleanup */ + bpf_map_delete_elem(&rhmap, &key); + err = 0; + return 0; +} + +SEC("syscall") +int test_rhash_update_any(void *ctx) +{ + int key = 11; + struct elem val1 = {.val = 111, .arr = {0}}; + struct elem val2 = {.val = 222, .arr = {0}}; + struct elem *e; + int ret; + + err = 1; + + /* BPF_ANY on non-existent key should insert */ + ret = bpf_map_update_elem(&rhmap, &key, &val1, BPF_ANY); + if (ret) + return 1; + + e = bpf_map_lookup_elem(&rhmap, &key); + if (!e || e->val != 111) + return 2; + + /* BPF_ANY on existing key should update */ + ret = bpf_map_update_elem(&rhmap, &key, &val2, BPF_ANY); + if (ret) + return 3; + + e = bpf_map_lookup_elem(&rhmap, &key); + if (!e || e->val != 222) + return 4; + + /* Cleanup */ + bpf_map_delete_elem(&rhmap, &key); + err = 0; + return 0; +} + +SEC("syscall") +int test_rhash_noexist_duplicate(void *ctx) +{ + int key = 12; + struct elem val = {.val = 600, .arr = {0}}; + int ret; + + err = 1; + + /* Insert element */ + ret = 
bpf_map_update_elem(&rhmap, &key, &val, BPF_NOEXIST); + if (ret) + return 1; + + /* Try to insert again with BPF_NOEXIST - should fail with -EEXIST */ + ret = bpf_map_update_elem(&rhmap, &key, &val, BPF_NOEXIST); + if (ret != -EEXIST) + return 2; + + /* Cleanup */ + bpf_map_delete_elem(&rhmap, &key); + err = 0; + return 0; +} + +SEC("syscall") +int test_rhash_delete_nonexistent(void *ctx) +{ + int key = 99999; + int ret; + + err = 1; + + /* Delete non-existent key should return -ENOENT */ + ret = bpf_map_delete_elem(&rhmap, &key); + if (ret != -ENOENT) + return 1; + + err = 0; + return 0; +} diff --git a/tools/testing/selftests/bpf/progs/setget_sockopt.c b/tools/testing/selftests/bpf/progs/setget_sockopt.c index d330b1511979..636a7cd8e2fa 100644 --- a/tools/testing/selftests/bpf/progs/setget_sockopt.c +++ b/tools/testing/selftests/bpf/progs/setget_sockopt.c @@ -387,6 +387,24 @@ int _getsockopt(struct bpf_sockopt *ctx) return 1; } +int v4mapped_v6_ip_tos_enable; +int v4mapped_v6_ip_tos_ret; +int v4mapped_v6_ip_tos_cnt; +int v4mapped_v6_ip_tos_val; + +static void test_v4mapped_v6_ip_tos(struct bpf_sock_ops *skops) +{ + int tos = v4mapped_v6_ip_tos_val; + + if (!v4mapped_v6_ip_tos_enable || skops->op != BPF_SOCK_OPS_TCP_CONNECT_CB) + return; + if (skops->family != AF_INET6) + return; + + v4mapped_v6_ip_tos_cnt++; + v4mapped_v6_ip_tos_ret = bpf_setsockopt(skops, IPPROTO_IP, IP_TOS, &tos, sizeof(tos)); +} + SEC("sockops") int skops_sockopt(struct bpf_sock_ops *skops) { @@ -401,6 +419,11 @@ int skops_sockopt(struct bpf_sock_ops *skops) if (!sk) return 1; + if (v4mapped_v6_ip_tos_enable) { + test_v4mapped_v6_ip_tos(skops); + return 1; + } + switch (skops->op) { case BPF_SOCK_OPS_TCP_LISTEN_CB: nr_listen += !(bpf_test_sockopt(skops, sk) || diff --git a/tools/testing/selftests/bpf/progs/sk_bypass_prot_mem.c b/tools/testing/selftests/bpf/progs/sk_bypass_prot_mem.c index 09a00d11ffcc..bae5283fca6b 100644 --- a/tools/testing/selftests/bpf/progs/sk_bypass_prot_mem.c +++ 
b/tools/testing/selftests/bpf/progs/sk_bypass_prot_mem.c @@ -5,6 +5,7 @@ #include <bpf/bpf_helpers.h> #include <bpf/bpf_tracing.h> #include <errno.h> +#include "err.h" extern int tcp_memory_per_cpu_fw_alloc __ksym; extern int udp_memory_per_cpu_fw_alloc __ksym; @@ -97,6 +98,7 @@ int sock_create(struct bpf_sock *ctx) return 1; err: + set_if_not_errno_or_zero(err, -EFAULT); bpf_set_retval(err); return 0; } diff --git a/tools/testing/selftests/bpf/progs/sockmap_parse_prog.c b/tools/testing/selftests/bpf/progs/sockmap_parse_prog.c index c9abfe3a11af..56e9aebf05f2 100644 --- a/tools/testing/selftests/bpf/progs/sockmap_parse_prog.c +++ b/tools/testing/selftests/bpf/progs/sockmap_parse_prog.c @@ -5,28 +5,6 @@ SEC("sk_skb1") int bpf_prog1(struct __sk_buff *skb) { - void *data_end = (void *)(long) skb->data_end; - void *data = (void *)(long) skb->data; - __u8 *d = data; - int err; - - if (data + 10 > data_end) { - err = bpf_skb_pull_data(skb, 10); - if (err) - return SK_DROP; - - data_end = (void *)(long)skb->data_end; - data = (void *)(long)skb->data; - if (data + 10 > data_end) - return SK_DROP; - } - - /* This write/read is a bit pointless but tests the verifier and - * strparser handler for read/write pkt data and access into sk - * fields. - */ - d = data; - d[7] = 1; return skb->len; } diff --git a/tools/testing/selftests/bpf/progs/sockopt_sk.c b/tools/testing/selftests/bpf/progs/sockopt_sk.c index cb990a7d3d45..5e0b27e7855c 100644 --- a/tools/testing/selftests/bpf/progs/sockopt_sk.c +++ b/tools/testing/selftests/bpf/progs/sockopt_sk.c @@ -149,6 +149,20 @@ int _setsockopt(struct bpf_sockopt *ctx) if (sk && sk->family == AF_NETLINK) goto out; + if (sk && sk->family == AF_INET && sk->type == SOCK_RAW) { + struct bpf_tcp_sock *tp = bpf_tcp_sock(sk); + + if (tp) { + char saved_syn[60]; + + bpf_getsockopt(sk, SOL_TCP, TCP_SAVED_SYN, + &saved_syn, sizeof(saved_syn)); + goto consumed; + } + + goto out; + } + /* Make sure bpf_get_netns_cookie is callable. 
*/ if (bpf_get_netns_cookie(NULL) == 0) @@ -224,6 +238,8 @@ int _setsockopt(struct bpf_sockopt *ctx) return 0; /* couldn't get sk storage */ storage->val = optval[0]; + +consumed: ctx->optlen = -1; /* BPF has consumed this option, don't call kernel * setsockopt handler. */ diff --git a/tools/testing/selftests/bpf/progs/stack_arg.c b/tools/testing/selftests/bpf/progs/stack_arg.c new file mode 100644 index 000000000000..944e3bb603e7 --- /dev/null +++ b/tools/testing/selftests/bpf/progs/stack_arg.c @@ -0,0 +1,273 @@ +// SPDX-License-Identifier: GPL-2.0 +/* Copyright (c) 2026 Meta Platforms, Inc. and affiliates. */ + +#include <vmlinux.h> +#include <stdbool.h> +#include <bpf/bpf_helpers.h> +#include "bpf_kfuncs.h" + +#define CLOCK_MONOTONIC 1 + +struct timer_elem { + struct bpf_timer timer; +}; + +struct { + __uint(type, BPF_MAP_TYPE_ARRAY); + __uint(max_entries, 1); + __type(key, int); + __type(value, struct timer_elem); +} timer_map SEC(".maps"); + +int timer_result; + +#if (defined(__TARGET_ARCH_x86) || defined(__TARGET_ARCH_arm64)) && \ + defined(__BPF_FEATURE_STACK_ARGUMENT) + +const volatile bool has_stack_arg = true; + +__noinline static int static_func_many_args(int a, int b, int c, int d, + int e, int f, int g, int h, + int i, int j) +{ + return a + b + c + d + e + f + g + h + i + j; +} + +__noinline int global_calls_many_args(int a, int b, int c) +{ + return static_func_many_args(a, b, c, a + 3, a + 4, a + 5, a + 6, + a + 7, a + 8, a + 9); +} + +SEC("tc") +int test_global_many_args(void) +{ + return global_calls_many_args(1, 2, 3); +} + +struct test_data { + long x; + long y; +}; + +/* 1+2+3+4+5+6+7+8+9+10+20 = 75 */ +__noinline static long func_with_ptr_stack_arg(long a, long b, long c, long d, + long e, long f, long g, long h, + long i, struct test_data *p) +{ + return a + b + c + d + e + f + g + h + i + p->x + p->y; +} + +__noinline long global_ptr_stack_arg(long a, long b, long c, long d, long e) +{ + struct test_data data = { .x = 10, .y = 20 }; + + 
return func_with_ptr_stack_arg(a, b, c, d, e, a + 5, a + 6, a + 7, + a + 8, &data); +} + +SEC("tc") +int test_bpf2bpf_ptr_stack_arg(void) +{ + return global_ptr_stack_arg(1, 2, 3, 4, 5); +} + +/* 1+2+3+4+5+6+7+10+8+20 = 66 */ +__noinline static long func_with_mix_stack_args(long a, long b, long c, long d, + long e, long f, long g, + struct test_data *p, + long h, struct test_data *q) +{ + return a + b + c + d + e + f + g + p->x + h + q->y; +} + +__noinline long global_mix_stack_args(long a, long b, long c, long d, long e) +{ + struct test_data p = { .x = 10 }; + struct test_data q = { .y = 20 }; + + return func_with_mix_stack_args(a, b, c, d, e, e + 1, e + 2, &p, + e + 3, &q); +} + +SEC("tc") +int test_bpf2bpf_mix_stack_args(void) +{ + return global_mix_stack_args(1, 2, 3, 4, 5); +} + +/* + * Nesting test: func_outer calls func_inner, both with struct pointer + * as stack arg. + * + * func_inner: (a+1)+...+(i+1) + p->x + p->y + * = 2+3+4+5+6+7+8+9+10+10+20 = 84 + */ +__noinline static long func_inner_ptr(long a, long b, long c, long d, + long e, long f, long g, long h, + long i, struct test_data *p) +{ + return a + b + c + d + e + f + g + h + i + p->x + p->y; +} + +__noinline static long func_outer_ptr(long a, long b, long c, long d, + long e, long f, long g, long h, + long i, struct test_data *p) +{ + return func_inner_ptr(a + 1, b + 1, c + 1, d + 1, e + 1, + f + 1, g + 1, h + 1, i + 1, p); +} + +__noinline long global_nesting_ptr(long a, long b, long c, long d, long e) +{ + struct test_data data = { .x = 10, .y = 20 }; + + return func_outer_ptr(a, b, c, d, e, a + 5, a + 6, a + 7, a + 8, + &data); +} + +SEC("tc") +int test_bpf2bpf_nesting_stack_arg(void) +{ + return global_nesting_ptr(1, 2, 3, 4, 5); +} + +/* 1+2+3+4+5+6+7+8+9+sizeof(pkt_v4) = 45+54 = 99 */ +__noinline static long func_with_dynptr(long a, long b, long c, long d, + long e, long f, long g, long h, + long i, struct bpf_dynptr *ptr) +{ + return a + b + c + d + e + f + g + h + i + bpf_dynptr_size(ptr); 
+} + +__noinline long global_dynptr_stack_arg(void *ctx __arg_ctx, long a, long b, + long c, long d) +{ + struct bpf_dynptr ptr; + + bpf_dynptr_from_skb(ctx, 0, &ptr); + return func_with_dynptr(a, b, c, d, d + 1, d + 2, d + 3, d + 4, + d + 5, &ptr); +} + +SEC("tc") +int test_bpf2bpf_dynptr_stack_arg(struct __sk_buff *skb) +{ + return global_dynptr_stack_arg(skb, 1, 2, 3, 4); +} + +/* foo1: a+b+c+d+e+f+g+h+i+j */ +__noinline static int foo1(int a, int b, int c, int d, int e, + int f, int g, int h, int i, int j) +{ + return a + b + c + d + e + f + g + h + i + j; +} + +/* foo2: a+b+c+d+e+f+g+h+i+j+k+l */ +__noinline static int foo2(int a, int b, int c, int d, int e, + int f, int g, int h, int i, int j, + int k, int l) +{ + return a + b + c + d + e + f + g + h + i + j + k + l; +} + +/* global_two_callees calls foo1 (5 stack args) and foo2 (7 stack args). + * The outgoing stack arg area is sized for foo2 (the larger callee). + * Stores for foo1 are a subset of the area used by foo2. + * Result: foo1(1..10) + foo2(1..12) = 55 + 78 = 133 + * + * Pass a-e through so the compiler can't constant-fold the stack args away. 
+ */ +__noinline int global_two_callees(int a, int b, int c, int d, int e) +{ + int ret; + + ret = foo1(a, b, c, d, e, a + 5, a + 6, a + 7, a + 8, a + 9); + ret += foo2(a, b, c, d, e, a + 5, a + 6, a + 7, a + 8, a + 9, + a + 10, a + 11); + return ret; +} + +SEC("tc") +int test_two_callees(void) +{ + return global_two_callees(1, 2, 3, 4, 5); +} + +const volatile int timer_base = 10; + +static int timer_cb_many_args(void *map, int *key, struct bpf_timer *timer) +{ + int v = timer_base; + + timer_result = static_func_many_args(v, v * 2, v * 3, v * 4, v * 5, + v * 6, v * 7, v * 8, v * 9, + v * 10); + return 0; +} + +SEC("tc") +int test_async_cb_many_args(void) +{ + struct timer_elem *elem; + int key = 0; + + elem = bpf_map_lookup_elem(&timer_map, &key); + if (!elem) + return -1; + + bpf_timer_init(&elem->timer, &timer_map, CLOCK_MONOTONIC); + bpf_timer_set_callback(&elem->timer, timer_cb_many_args); + bpf_timer_start(&elem->timer, 1, 0); + return 0; +} + +#else + +const volatile bool has_stack_arg = false; + +SEC("tc") +int test_global_many_args(void) +{ + return 0; +} + +SEC("tc") +int test_bpf2bpf_ptr_stack_arg(void) +{ + return 0; +} + +SEC("tc") +int test_bpf2bpf_mix_stack_args(void) +{ + return 0; +} + +SEC("tc") +int test_bpf2bpf_nesting_stack_arg(void) +{ + return 0; +} + +SEC("tc") +int test_bpf2bpf_dynptr_stack_arg(struct __sk_buff *skb) +{ + return 0; +} + +SEC("tc") +int test_two_callees(void) +{ + return 0; +} + +SEC("tc") +int test_async_cb_many_args(void) +{ + return 0; +} + +#endif + +char _license[] SEC("license") = "GPL"; diff --git a/tools/testing/selftests/bpf/progs/stack_arg_fail.c b/tools/testing/selftests/bpf/progs/stack_arg_fail.c new file mode 100644 index 000000000000..ad9d4bfe15dc --- /dev/null +++ b/tools/testing/selftests/bpf/progs/stack_arg_fail.c @@ -0,0 +1,114 @@ +// SPDX-License-Identifier: GPL-2.0 +/* Copyright (c) 2026 Meta Platforms, Inc. and affiliates. 
*/ + +#include <vmlinux.h> +#include <bpf/bpf_helpers.h> +#include "../test_kmods/bpf_testmod_kfunc.h" +#include "bpf_misc.h" + +#if defined(__BPF_FEATURE_STACK_ARGUMENT) + +SEC("tc") +__failure __msg("Unrecognized *(R11-8) type STRUCT") +int test_stack_arg_big(struct __sk_buff *skb) +{ + struct prog_test_big_arg s = { .a = 1, .b = 2 }; + + return bpf_kfunc_call_stack_arg_big(1, 2, 3, 4, 5, s); +} + +SEC("socket") +__description("r11 in ALU instruction") +__failure __msg("R11 is invalid") +__naked void r11_alu_reject(void) +{ + asm volatile ( + "r11 += 1;" + "r0 = 0;" + "exit;" + ::: __clobber_all); +} + +SEC("socket") +__description("r11 store with non-DW size") +__failure __msg("R11 is invalid") +__naked void r11_store_non_dw(void) +{ + asm volatile ( + "*(u32 *)(r11 - 8) = r1;" + "r0 = 0;" + "exit;" + ::: __clobber_all); +} + +SEC("socket") +__description("r11 store with unaligned offset") +__failure __msg("R11 is invalid") +__naked void r11_store_unaligned(void) +{ + asm volatile ( + "*(u64 *)(r11 - 4) = r1;" + "r0 = 0;" + "exit;" + ::: __clobber_all); +} + +SEC("socket") +__description("r11 store with positive offset") +__failure __msg("R11 is invalid") +__naked void r11_store_positive_off(void) +{ + asm volatile ( + "*(u64 *)(r11 + 8) = r1;" + "r0 = 0;" + "exit;" + ::: __clobber_all); +} + +SEC("socket") +__description("r11 load with negative offset") +__failure __msg("R11 is invalid") +__naked void r11_load_negative_off(void) +{ + asm volatile ( + "r0 = *(u64 *)(r11 - 8);" + "exit;" + ::: __clobber_all); +} + +SEC("socket") +__description("r11 load with non-DW size") +__failure __msg("R11 is invalid") +__naked void r11_load_non_dw(void) +{ + asm volatile ( + "r0 = *(u32 *)(r11 + 8);" + "exit;" + ::: __clobber_all); +} + +SEC("socket") +__description("r11 store with zero offset") +__failure __msg("R11 is invalid") +__naked void r11_store_zero_off(void) +{ + asm volatile ( + "*(u64 *)(r11 + 0) = r1;" + "r0 = 0;" + "exit;" + ::: __clobber_all); +} + +#else + 
+SEC("tc") +__description("stack_arg_fail: not supported, dummy test") +__success +int test_stack_arg_big(struct __sk_buff *skb) +{ + return 0; +} + +#endif + +char _license[] SEC("license") = "GPL"; diff --git a/tools/testing/selftests/bpf/progs/stack_arg_kfunc.c b/tools/testing/selftests/bpf/progs/stack_arg_kfunc.c new file mode 100644 index 000000000000..345f2da2e361 --- /dev/null +++ b/tools/testing/selftests/bpf/progs/stack_arg_kfunc.c @@ -0,0 +1,166 @@ +// SPDX-License-Identifier: GPL-2.0 +/* Copyright (c) 2026 Meta Platforms, Inc. and affiliates. */ + +#include <vmlinux.h> +#include <bpf/bpf_helpers.h> +#include "bpf_kfuncs.h" +#include "../test_kmods/bpf_testmod_kfunc.h" + +#if (defined(__TARGET_ARCH_x86) || defined(__TARGET_ARCH_arm64)) && \ + defined(__BPF_FEATURE_STACK_ARGUMENT) + +const volatile bool has_stack_arg = true; + +struct bpf_iter_testmod_seq { + u64 :64; + u64 :64; +}; + +extern int bpf_iter_testmod_seq_new(struct bpf_iter_testmod_seq *it, s64 value, int cnt) __ksym; +extern void bpf_iter_testmod_seq_destroy(struct bpf_iter_testmod_seq *it) __ksym; + +struct timer_map_value { + struct bpf_timer timer; +}; + +struct { + __uint(type, BPF_MAP_TYPE_ARRAY); + __uint(max_entries, 1); + __type(key, int); + __type(value, struct timer_map_value); +} kfunc_timer_map SEC(".maps"); + +SEC("tc") +int test_stack_arg_scalar(struct __sk_buff *skb) +{ + return bpf_kfunc_call_stack_arg(1, 2, 3, 4, 5, 6, 7, 8, 9, 10); +} + +SEC("tc") +int test_stack_arg_ptr(struct __sk_buff *skb) +{ + struct prog_test_pass1 p = { .x0 = 10, .x1 = 20 }; + + return bpf_kfunc_call_stack_arg_ptr(1, 2, 3, 4, 5, 6, 7, 8, 9, &p); +} + +SEC("tc") +int test_stack_arg_mix(struct __sk_buff *skb) +{ + struct prog_test_pass1 p = { .x0 = 10 }; + struct prog_test_pass1 q = { .x1 = 20 }; + + return bpf_kfunc_call_stack_arg_mix(1, 2, 3, 4, 5, 6, 7, &p, 8, &q); +} + +/* 1+2+3+4+5+6+7+8+9+sizeof(pkt_v4) = 45+54 = 99 */ +SEC("tc") +int test_stack_arg_dynptr(struct __sk_buff *skb) +{ + struct 
bpf_dynptr ptr; + + bpf_dynptr_from_skb(skb, 0, &ptr); + return bpf_kfunc_call_stack_arg_dynptr(1, 2, 3, 4, 5, 6, 7, 8, 9, &ptr); +} + +/* 1 + 2 + 3 + 4 + 5 + (1 + 2 + ... + 16) = 15 + 136 = 151 */ +SEC("tc") +int test_stack_arg_mem(struct __sk_buff *skb) +{ + char buf[16] = {1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16}; + + return bpf_kfunc_call_stack_arg_mem(1, 2, 3, 4, 5, buf, sizeof(buf)); +} + +/* 1+2+3+4+5+6+7+8+9+100 = 145 */ +SEC("tc") +int test_stack_arg_iter(struct __sk_buff *skb) +{ + struct bpf_iter_testmod_seq it; + u64 ret; + + bpf_iter_testmod_seq_new(&it, 100, 10); + ret = bpf_kfunc_call_stack_arg_iter(1, 2, 3, 4, 5, 6, 7, 8, 9, &it); + bpf_iter_testmod_seq_destroy(&it); + return ret; +} + +const char cstr[] = "hello"; + +/* 1+2+3+4+5+6+7+8+9 = 45 */ +SEC("tc") +int test_stack_arg_const_str(struct __sk_buff *skb) +{ + return bpf_kfunc_call_stack_arg_const_str(1, 2, 3, 4, 5, 6, 7, 8, 9, + cstr); +} + +/* 1+2+3+4+5+6+7+8+9 = 45 */ +SEC("tc") +int test_stack_arg_timer(struct __sk_buff *skb) +{ + struct timer_map_value *val; + int key = 0; + + val = bpf_map_lookup_elem(&kfunc_timer_map, &key); + if (!val) + return 0; + return bpf_kfunc_call_stack_arg_timer(1, 2, 3, 4, 5, 6, 7, 8, 9, + &val->timer); +} + +#else + +const volatile bool has_stack_arg = false; + +SEC("tc") +int test_stack_arg_scalar(struct __sk_buff *skb) +{ + return 0; +} + +SEC("tc") +int test_stack_arg_ptr(struct __sk_buff *skb) +{ + return 0; +} + +SEC("tc") +int test_stack_arg_mix(struct __sk_buff *skb) +{ + return 0; +} + +SEC("tc") +int test_stack_arg_dynptr(struct __sk_buff *skb) +{ + return 0; +} + +SEC("tc") +int test_stack_arg_mem(struct __sk_buff *skb) +{ + return 0; +} + +SEC("tc") +int test_stack_arg_iter(struct __sk_buff *skb) +{ + return 0; +} + +SEC("tc") +int test_stack_arg_const_str(struct __sk_buff *skb) +{ + return 0; +} + +SEC("tc") +int test_stack_arg_timer(struct __sk_buff *skb) +{ + return 0; +} + +#endif + +char _license[] SEC("license") = "GPL"; diff 
--git a/tools/testing/selftests/bpf/progs/stack_arg_precision.c b/tools/testing/selftests/bpf/progs/stack_arg_precision.c new file mode 100644 index 000000000000..bee2eeec021d --- /dev/null +++ b/tools/testing/selftests/bpf/progs/stack_arg_precision.c @@ -0,0 +1,135 @@ +// SPDX-License-Identifier: GPL-2.0 +/* Copyright (c) 2026 Meta Platforms, Inc. and affiliates. */ + +#include <vmlinux.h> +#include <bpf/bpf_helpers.h> +#include "../test_kmods/bpf_testmod_kfunc.h" +#include "bpf_misc.h" + +#if (defined(__TARGET_ARCH_x86) || defined(__TARGET_ARCH_arm64)) && \ + defined(__BPF_FEATURE_STACK_ARGUMENT) + +/* Force kfunc extern BTF generation for inline asm call below. + * Uses its own SEC so it's not included as a .text subprog. + * The '?' prefix sets autoload=false so libbpf won't load it. + */ +SEC("?tc") +int __btf_kfunc_gen(struct __sk_buff *ctx) +{ + char buf[8] = {}; + + return bpf_kfunc_call_stack_arg_mem(0, 0, 0, 0, 0, buf, sizeof(buf)); +} + +/* + * Test precision backtracking across bpf-to-bpf call for kfunc stack arg. + * subprog_call_mem_kfunc receives a size as incoming stack arg (arg6) + * and forwards it as mem__sz (arg7) to bpf_kfunc_call_stack_arg_mem. 
+ */ +__naked __noinline __used +static long subprog_call_mem_kfunc(long a, long b, long c, long d, long e, long size) +{ + asm volatile ( + "r1 = *(u64 *)(r11 + 8);" /* r1 = incoming arg6 (size) */ + "r2 = 0x0807060504030201 ll;" /* r2 = buf contents */ + "*(u64 *)(r10 - 8) = r2;" /* store buf to stack */ + "r2 = r10;" + "r2 += -8;" /* r2 = &buf */ + "*(u64 *)(r11 - 8) = r2;" /* outgoing arg6 = buf */ + "*(u64 *)(r11 - 16) = r1;" /* outgoing arg7 = size */ + "r1 = 1;" + "r2 = 2;" + "r3 = 3;" + "r4 = 4;" + "r5 = 5;" + "call %[bpf_kfunc_call_stack_arg_mem];" + "exit;" + : + : __imm(bpf_kfunc_call_stack_arg_mem) + : __clobber_all + ); +} + +SEC("tc") +__description("stack_arg: precision backtracking across bpf2bpf call for kfunc") +__success +__log_level(2) +__flag(BPF_F_TEST_STATE_FREQ) +__btf_func_path("btf__stack_arg_precision.bpf.o") +__msg("mark_precise: frame1: last_idx 26 first_idx 13 subseq_idx -1") +__msg("mark_precise: frame1: regs= stack= before 25: (b7) r5 = 5") +__msg("mark_precise: frame1: regs= stack= before 24: (b7) r4 = 4") +__msg("mark_precise: frame1: regs= stack= before 23: (b7) r3 = 3") +__msg("mark_precise: frame1: regs= stack= before 22: (b7) r2 = 2") +__msg("mark_precise: frame1: regs= stack= before 21: (b7) r1 = 1") +__msg("mark_precise: frame1: regs= stack= before 20: (7b) *(u64 *)(r11 -16) = r1") +__msg("mark_precise: frame1: regs=r1 stack= before 19: (7b) *(u64 *)(r11 -8) = r2") +__msg("mark_precise: frame1: regs=r1 stack= before 18: (07) r2 += -8") +__msg("mark_precise: frame1: regs=r1 stack= before 17: (bf) r2 = r10") +__msg("mark_precise: frame1: regs=r1 stack= before 16: (7b) *(u64 *)(r10 -8) = r2") +__msg("mark_precise: frame1: regs=r1 stack= before 14: (18) r2 = 0x807060504030201") +__msg("mark_precise: frame1: regs=r1 stack= before 13: (79) r1 = *(u64 *)(r11 +8)") +__msg("mark_precise: frame1: parent state regs= stack=: frame1: R10=fp0") +__msg("mark_precise: frame0: parent state regs= stack=: R10=fp0") +__msg("mark_precise: frame1: 
last_idx 11 first_idx 11 subseq_idx 13") +__msg("mark_precise: frame1: regs= stack= before 11: (85) call pc+1") +__msg("mark_precise: frame0: parent state regs= stack=: R1=1 R2=2 R3=3 R4=4 R5=5 R10=fp0") +__msg("mark_precise: frame0: last_idx 9 first_idx 7 subseq_idx 11") +__msg("mark_precise: frame0: regs= stack= before 9: (05) goto pc+1") +__msg("mark_precise: frame0: regs= stack= before 8: (7a) *(u64 *)(r11 -8) = 4") +__msg("mark_precise: frame1: last_idx 26 first_idx 13 subseq_idx -1 ") +__msg("mark_precise: frame1: regs= stack= before 25: (b7) r5 = 5") +__msg("mark_precise: frame1: regs= stack= before 24: (b7) r4 = 4") +__msg("mark_precise: frame1: regs= stack= before 23: (b7) r3 = 3") +__msg("mark_precise: frame1: regs= stack= before 22: (b7) r2 = 2") +__msg("mark_precise: frame1: regs= stack= before 21: (b7) r1 = 1") +__msg("mark_precise: frame1: regs= stack= before 20: (7b) *(u64 *)(r11 -16) = r1") +__msg("mark_precise: frame1: regs=r1 stack= before 19: (7b) *(u64 *)(r11 -8) = r2") +__msg("mark_precise: frame1: regs=r1 stack= before 18: (07) r2 += -8") +__msg("mark_precise: frame1: regs=r1 stack= before 17: (bf) r2 = r10") +__msg("mark_precise: frame1: regs=r1 stack= before 16: (7b) *(u64 *)(r10 -8) = r2") +__msg("mark_precise: frame1: regs=r1 stack= before 14: (18) r2 = 0x807060504030201") +__msg("mark_precise: frame1: regs=r1 stack= before 13: (79) r1 = *(u64 *)(r11 +8)") +__msg("mark_precise: frame1: parent state regs= stack=: frame1: R10=fp0") +__msg("mark_precise: frame0: parent state regs= stack=: R10=fp0") +__msg("mark_precise: frame1: last_idx 11 first_idx 11 subseq_idx 13 ") +__msg("mark_precise: frame1: regs= stack= before 11: (85) call pc+1") +__msg("mark_precise: frame0: parent state regs= stack=: R1=1 R2=2 R3=3 R4=4 R5=5 R10=fp0") +__msg("mark_precise: frame0: last_idx 10 first_idx 10 subseq_idx 11 ") +__msg("mark_precise: frame0: regs= stack= before 10: (7a) *(u64 *)(r11 -8) = 6") +__naked void stack_arg_precision_bpf2bpf(void) +{ + asm 
volatile ( + "call %[bpf_get_prandom_u32];" + "r6 = r0;" + "r1 = 1;" + "r2 = 2;" + "r3 = 3;" + "r4 = 4;" + "r5 = 5;" + "if r6 < 2 goto l0_%=;" + "*(u64 *)(r11 - 8) = 4;" + "goto l1_%=;" + "l0_%=:" + "*(u64 *)(r11 - 8) = 6;" + "l1_%=:" + "call subprog_call_mem_kfunc;" + "exit;" + :: __imm(bpf_get_prandom_u32) + : __clobber_all + ); +} + +#else + +SEC("socket") +__description("stack_arg_precision: not supported, dummy test") +__success +int dummy_test(void) +{ + return 0; +} + +#endif + +char _license[] SEC("license") = "GPL"; diff --git a/tools/testing/selftests/bpf/progs/stream.c b/tools/testing/selftests/bpf/progs/stream.c index 6f999ba951a3..92ba1d72e0ec 100644 --- a/tools/testing/selftests/bpf/progs/stream.c +++ b/tools/testing/selftests/bpf/progs/stream.c @@ -5,7 +5,7 @@ #include <bpf/bpf_helpers.h> #include "bpf_misc.h" #include "bpf_experimental.h" -#include "bpf_arena_common.h" +#include <bpf_arena_common.h> struct arr_elem { struct bpf_res_spin_lock lock; diff --git a/tools/testing/selftests/bpf/progs/stream_fail.c b/tools/testing/selftests/bpf/progs/stream_fail.c index 8e8249f3521c..21428bb1ee59 100644 --- a/tools/testing/selftests/bpf/progs/stream_fail.c +++ b/tools/testing/selftests/bpf/progs/stream_fail.c @@ -23,7 +23,7 @@ int stream_vprintk_scalar_arg(void *ctx) } SEC("syscall") -__failure __msg("arg#1 doesn't point to a const string") +__failure __msg("R2 doesn't point to a const string") int stream_vprintk_string_arg(void *ctx) { bpf_stream_vprintk(BPF_STDOUT, ctx, NULL, 0); diff --git a/tools/testing/selftests/bpf/progs/tailcall_bpf2bpf2.c b/tools/testing/selftests/bpf/progs/tailcall_bpf2bpf2.c index ce97d141daee..c4fadee5aadc 100644 --- a/tools/testing/selftests/bpf/progs/tailcall_bpf2bpf2.c +++ b/tools/testing/selftests/bpf/progs/tailcall_bpf2bpf2.c @@ -13,11 +13,14 @@ struct { static __noinline int subprog_tail(struct __sk_buff *skb) { + int ret = 1; + if (load_byte(skb, 0)) bpf_tail_call_static(skb, &jmp_table, 1); else bpf_tail_call_static(skb, 
&jmp_table, 0); - return 1; + barrier_var(ret); + return ret; } int count = 0; diff --git a/tools/testing/selftests/bpf/progs/tailcall_bpf2bpf_hierarchy1.c b/tools/testing/selftests/bpf/progs/tailcall_bpf2bpf_hierarchy1.c index d556b19413d7..1fd07824d88a 100644 --- a/tools/testing/selftests/bpf/progs/tailcall_bpf2bpf_hierarchy1.c +++ b/tools/testing/selftests/bpf/progs/tailcall_bpf2bpf_hierarchy1.c @@ -16,20 +16,25 @@ int count = 0; static __noinline int subprog_tail(struct __sk_buff *skb) { + int ret = 0; + bpf_tail_call_static(skb, &jmp_table, 0); - return 0; + barrier_var(ret); + return ret; } SEC("tc") int entry(struct __sk_buff *skb) { - int ret = 1; + int ret = 1, ret1, ret2; clobber_regs_stack(); count++; - subprog_tail(skb); - subprog_tail(skb); + ret1 = subprog_tail(skb); + ret2 = subprog_tail(skb); + __sink(ret1); + __sink(ret2); return ret; } diff --git a/tools/testing/selftests/bpf/progs/tailcall_bpf2bpf_hierarchy2.c b/tools/testing/selftests/bpf/progs/tailcall_bpf2bpf_hierarchy2.c index ae94c9c70ab7..6fde0ab92148 100644 --- a/tools/testing/selftests/bpf/progs/tailcall_bpf2bpf_hierarchy2.c +++ b/tools/testing/selftests/bpf/progs/tailcall_bpf2bpf_hierarchy2.c @@ -25,8 +25,11 @@ int count1 = 0; static __noinline int subprog_tail0(struct __sk_buff *skb) { + int ret = 0; + bpf_tail_call_static(skb, &jmp_table, 0); - return 0; + barrier_var(ret); + return ret; } __auxiliary @@ -41,16 +44,22 @@ int classifier_0(struct __sk_buff *skb) static __noinline int subprog_tail1(struct __sk_buff *skb) { + int ret = 0; + bpf_tail_call_static(skb, &jmp_table, 1); - return 0; + barrier_var(ret); + return ret; } __auxiliary SEC("tc") int classifier_1(struct __sk_buff *skb) { + int ret; + count1++; - subprog_tail1(skb); + ret = subprog_tail1(skb); + __sink(ret); return 0; } @@ -59,13 +68,14 @@ __retval(33) SEC("tc") int tailcall_bpf2bpf_hierarchy_2(struct __sk_buff *skb) { - int ret = 0; + int ret = 0, ret1, ret2; clobber_regs_stack(); - subprog_tail0(skb); - 
subprog_tail1(skb); - + ret1 = subprog_tail0(skb); + ret2 = subprog_tail1(skb); + __sink(ret1); + __sink(ret2); __sink(ret); return (count1 << 16) | count0; } diff --git a/tools/testing/selftests/bpf/progs/tailcall_bpf2bpf_hierarchy3.c b/tools/testing/selftests/bpf/progs/tailcall_bpf2bpf_hierarchy3.c index 56b6b0099840..0ef9cfb2da8d 100644 --- a/tools/testing/selftests/bpf/progs/tailcall_bpf2bpf_hierarchy3.c +++ b/tools/testing/selftests/bpf/progs/tailcall_bpf2bpf_hierarchy3.c @@ -33,17 +33,24 @@ int count = 0; static __noinline int subprog_tail(struct __sk_buff *skb, void *jmp_table) { + int ret = 0; + bpf_tail_call_static(skb, jmp_table, 0); - return 0; + barrier_var(ret); + return ret; } __auxiliary SEC("tc") int classifier_0(struct __sk_buff *skb) { + int ret1, ret2; + count++; - subprog_tail(skb, &jmp_table0); - subprog_tail(skb, &jmp_table1); + ret1 = subprog_tail(skb, &jmp_table0); + ret2 = subprog_tail(skb, &jmp_table1); + __sink(ret1); + __sink(ret2); return count; } diff --git a/tools/testing/selftests/bpf/progs/tailcall_bpf2bpf_hierarchy_fentry.c b/tools/testing/selftests/bpf/progs/tailcall_bpf2bpf_hierarchy_fentry.c index 5261395713cd..6db9afee2095 100644 --- a/tools/testing/selftests/bpf/progs/tailcall_bpf2bpf_hierarchy_fentry.c +++ b/tools/testing/selftests/bpf/progs/tailcall_bpf2bpf_hierarchy_fentry.c @@ -18,18 +18,25 @@ int count = 0; static __noinline int subprog_tail(void *ctx) { + int ret = 0; + bpf_tail_call_static(ctx, &jmp_table, 0); - return 0; + barrier_var(ret); + return ret; } SEC("fentry/dummy") int BPF_PROG(fentry, struct sk_buff *skb) { + int ret1, ret2; + clobber_regs_stack(); count++; - subprog_tail(ctx); - subprog_tail(ctx); + ret1 = subprog_tail(ctx); + ret2 = subprog_tail(ctx); + __sink(ret1); + __sink(ret2); return 0; } diff --git a/tools/testing/selftests/bpf/progs/tailcall_cgrp_storage.c b/tools/testing/selftests/bpf/progs/tailcall_cgrp_storage.c new file mode 100644 index 000000000000..4dd3a0033d75 --- /dev/null +++ 
b/tools/testing/selftests/bpf/progs/tailcall_cgrp_storage.c @@ -0,0 +1,44 @@ +// SPDX-License-Identifier: GPL-2.0 + +#include <vmlinux.h> +#include <bpf/bpf_helpers.h> + +struct { + __uint(type, BPF_MAP_TYPE_PERCPU_CGROUP_STORAGE); + __type(key, struct bpf_cgroup_storage_key); + __type(value, __u64); +} storage_map SEC(".maps"); + +struct { + __uint(type, BPF_MAP_TYPE_PROG_ARRAY); + __uint(max_entries, 1); + __uint(key_size, sizeof(__u32)); + __uint(value_size, sizeof(__u32)); +} prog_array SEC(".maps"); + +SEC("cgroup_skb/egress") +int caller_prog(struct __sk_buff *skb) +{ + __u64 *storage; + + storage = bpf_get_local_storage(&storage_map, 0); + if (storage) + *storage = 1; + + bpf_tail_call(skb, &prog_array, 0); + return 1; +} + +SEC("cgroup_skb/egress") +int callee_prog(struct __sk_buff *skb) +{ + __u64 *storage; + + storage = bpf_get_local_storage(&storage_map, 0); + if (storage) + *storage = 1; + + return 1; +} + +char _license[] SEC("license") = "GPL"; diff --git a/tools/testing/selftests/bpf/progs/tailcall_cgrp_storage_no_storage.c b/tools/testing/selftests/bpf/progs/tailcall_cgrp_storage_no_storage.c new file mode 100644 index 000000000000..5c69b0af6ff9 --- /dev/null +++ b/tools/testing/selftests/bpf/progs/tailcall_cgrp_storage_no_storage.c @@ -0,0 +1,26 @@ +// SPDX-License-Identifier: GPL-2.0 + +#include <vmlinux.h> +#include <bpf/bpf_helpers.h> + +struct { + __uint(type, BPF_MAP_TYPE_PROG_ARRAY); + __uint(max_entries, 1); + __uint(key_size, sizeof(__u32)); + __uint(value_size, sizeof(__u32)); +} prog_array SEC(".maps"); + +SEC("cgroup_skb/egress") +int caller_prog(struct __sk_buff *skb) +{ + bpf_tail_call(skb, &prog_array, 0); + return 1; +} + +SEC("cgroup_skb/egress") +int leaf_prog(struct __sk_buff *skb) +{ + return 1; +} + +char _license[] SEC("license") = "GPL"; diff --git a/tools/testing/selftests/bpf/progs/tailcall_cgrp_storage_owner.c b/tools/testing/selftests/bpf/progs/tailcall_cgrp_storage_owner.c new file mode 100644 index 
000000000000..d7e8ec9855c5 --- /dev/null +++ b/tools/testing/selftests/bpf/progs/tailcall_cgrp_storage_owner.c @@ -0,0 +1,32 @@ +// SPDX-License-Identifier: GPL-2.0 + +#include <vmlinux.h> +#include <bpf/bpf_helpers.h> + +struct { + __uint(type, BPF_MAP_TYPE_PERCPU_CGROUP_STORAGE); + __type(key, struct bpf_cgroup_storage_key); + __type(value, __u64); +} storage_map SEC(".maps"); + +struct { + __uint(type, BPF_MAP_TYPE_PROG_ARRAY); + __uint(max_entries, 1); + __uint(key_size, sizeof(__u32)); + __uint(value_size, sizeof(__u32)); +} prog_array SEC(".maps"); + +SEC("cgroup_skb/egress") +int prog_array_owner(struct __sk_buff *skb) +{ + __u64 *storage; + + storage = bpf_get_local_storage(&storage_map, 0); + if (storage) + *storage = 1; + + bpf_tail_call(skb, &prog_array, 0); + return 1; +} + +char _license[] SEC("license") = "GPL"; diff --git a/tools/testing/selftests/bpf/progs/task_kfunc_failure.c b/tools/testing/selftests/bpf/progs/task_kfunc_failure.c index 4c07ea193f72..8942b5478129 100644 --- a/tools/testing/selftests/bpf/progs/task_kfunc_failure.c +++ b/tools/testing/selftests/bpf/progs/task_kfunc_failure.c @@ -5,6 +5,7 @@ #include <bpf/bpf_tracing.h> #include <bpf/bpf_helpers.h> +#include "../bpf_experimental.h" #include "bpf_misc.h" #include "task_kfunc_common.h" @@ -28,7 +29,7 @@ static struct __tasks_kfunc_map_value *insert_lookup_task(struct task_struct *ta } SEC("tp_btf/task_newtask") -__failure __msg("Possibly NULL pointer passed to trusted arg0") +__failure __msg("Possibly NULL pointer passed to trusted R1") int BPF_PROG(task_kfunc_acquire_untrusted, struct task_struct *task, u64 clone_flags) { struct task_struct *acquired; @@ -49,7 +50,7 @@ int BPF_PROG(task_kfunc_acquire_untrusted, struct task_struct *task, u64 clone_f } SEC("tp_btf/task_newtask") -__failure __msg("arg#0 pointer type STRUCT task_struct must point") +__failure __msg("R1 pointer type STRUCT task_struct must point") int BPF_PROG(task_kfunc_acquire_fp, struct task_struct *task, u64 
clone_flags) { struct task_struct *acquired, *stack_task = (struct task_struct *)&clone_flags; @@ -100,7 +101,7 @@ int BPF_PROG(task_kfunc_acquire_unsafe_kretprobe_rcu, struct task_struct *task, } SEC("tp_btf/task_newtask") -__failure __msg("Possibly NULL pointer passed to trusted arg0") +__failure __msg("Possibly NULL pointer passed to trusted R1") int BPF_PROG(task_kfunc_acquire_null, struct task_struct *task, u64 clone_flags) { struct task_struct *acquired; @@ -149,7 +150,7 @@ int BPF_PROG(task_kfunc_xchg_unreleased, struct task_struct *task, u64 clone_fla } SEC("tp_btf/task_newtask") -__failure __msg("Possibly NULL pointer passed to trusted arg0") +__failure __msg("Possibly NULL pointer passed to trusted R1") int BPF_PROG(task_kfunc_acquire_release_no_null_check, struct task_struct *task, u64 clone_flags) { struct task_struct *acquired; @@ -162,7 +163,7 @@ int BPF_PROG(task_kfunc_acquire_release_no_null_check, struct task_struct *task, } SEC("tp_btf/task_newtask") -__failure __msg("Possibly NULL pointer passed to trusted arg0") +__failure __msg("Possibly NULL pointer passed to trusted R1") int BPF_PROG(task_kfunc_release_untrusted, struct task_struct *task, u64 clone_flags) { struct __tasks_kfunc_map_value *v; @@ -178,7 +179,7 @@ int BPF_PROG(task_kfunc_release_untrusted, struct task_struct *task, u64 clone_f } SEC("tp_btf/task_newtask") -__failure __msg("arg#0 pointer type STRUCT task_struct must point") +__failure __msg("release kfunc bpf_task_release expects referenced PTR_TO_BTF_ID passed to R1") int BPF_PROG(task_kfunc_release_fp, struct task_struct *task, u64 clone_flags) { struct task_struct *acquired = (struct task_struct *)&clone_flags; @@ -190,7 +191,7 @@ int BPF_PROG(task_kfunc_release_fp, struct task_struct *task, u64 clone_flags) } SEC("tp_btf/task_newtask") -__failure __msg("Possibly NULL pointer passed to trusted arg0") +__failure __msg("Possibly NULL pointer passed to trusted R1") int BPF_PROG(task_kfunc_release_null, struct task_struct *task, 
u64 clone_flags) { struct __tasks_kfunc_map_value local, *v; @@ -224,7 +225,7 @@ int BPF_PROG(task_kfunc_release_null, struct task_struct *task, u64 clone_flags) } SEC("tp_btf/task_newtask") -__failure __msg("release kernel function bpf_task_release expects") +__failure __msg("release kfunc bpf_task_release expects referenced PTR_TO_BTF_ID passed to R1") int BPF_PROG(task_kfunc_release_unacquired, struct task_struct *task, u64 clone_flags) { /* Cannot release trusted task pointer which was not acquired. */ @@ -234,7 +235,46 @@ int BPF_PROG(task_kfunc_release_unacquired, struct task_struct *task, u64 clone_ } SEC("tp_btf/task_newtask") -__failure __msg("Possibly NULL pointer passed to trusted arg0") +__failure __msg("bpf_obj_drop cannot be used in tracing programs on types with NMI unsafe fields") +int BPF_PROG(task_kfunc_obj_drop_with_kptr, struct task_struct *task, u64 clone_flags) +{ + struct __tasks_kfunc_map_value *local; + + local = bpf_obj_new(typeof(*local)); + if (!local) + return 0; + + bpf_obj_drop(local); + return 0; +} + +SEC("tp_btf/task_newtask") +__failure __msg("bpf_obj_drop cannot be used in tracing programs on types with NMI unsafe fields") +int BPF_PROG(task_kfunc_obj_drop_nmi_with_kptr, struct task_struct *task, + u64 clone_flags) +{ + struct __tasks_kfunc_map_value *local; + struct task_struct *acquired, *old; + + (void)clone_flags; + + local = bpf_obj_new(typeof(*local)); + if (!local) + return 0; + + acquired = bpf_task_acquire(task); + if (acquired) { + old = bpf_kptr_xchg(&local->task, acquired); + if (old) + bpf_task_release(old); + } + + bpf_obj_drop(local); + return 0; +} + +SEC("tp_btf/task_newtask") +__failure __msg("Possibly NULL pointer passed to trusted R1") int BPF_PROG(task_kfunc_from_pid_no_null_check, struct task_struct *task, u64 clone_flags) { struct task_struct *acquired; @@ -248,7 +288,7 @@ int BPF_PROG(task_kfunc_from_pid_no_null_check, struct task_struct *task, u64 cl } SEC("tp_btf/task_newtask") -__failure __msg("Possibly 
NULL pointer passed to trusted arg0") +__failure __msg("Possibly NULL pointer passed to trusted R1") int BPF_PROG(task_kfunc_from_vpid_no_null_check, struct task_struct *task, u64 clone_flags) { struct task_struct *acquired; @@ -313,7 +353,7 @@ int BPF_PROG(task_access_comm4, struct task_struct *task, const char *buf, bool } SEC("tp_btf/task_newtask") -__failure __msg("R1 must be referenced or trusted") +__failure __msg("release kfunc bpf_task_release expects referenced PTR_TO_BTF_ID passed to R1") int BPF_PROG(task_kfunc_release_in_map, struct task_struct *task, u64 clone_flags) { struct task_struct *local; diff --git a/tools/testing/selftests/bpf/progs/task_kfunc_success.c b/tools/testing/selftests/bpf/progs/task_kfunc_success.c index 5fb4fc19d26a..d63a79ee33dc 100644 --- a/tools/testing/selftests/bpf/progs/task_kfunc_success.c +++ b/tools/testing/selftests/bpf/progs/task_kfunc_success.c @@ -140,17 +140,17 @@ int BPF_PROG(test_task_acquire_leave_in_map, struct task_struct *task, u64 clone return 0; } -SEC("tp_btf/task_newtask") -int BPF_PROG(test_task_xchg_release, struct task_struct *task, u64 clone_flags) +SEC("syscall") +int test_task_xchg_release(const void *ctx) { - struct task_struct *kptr, *acquired; + struct task_struct *task, *kptr, *acquired; struct __tasks_kfunc_map_value *v, *local; int refcnt, refcnt_after_drop; long status; - if (!is_test_kfunc_task()) - return 0; + (void)ctx; + task = bpf_get_current_task_btf(); status = tasks_kfunc_map_insert(task); if (status) { err = 1; @@ -191,7 +191,7 @@ int BPF_PROG(test_task_xchg_release, struct task_struct *task, u64 clone_flags) return 0; } - /* Stash a copy into local kptr and check if it is released recursively */ + /* Stash a copy into local kptr and check if it is released recursively. 
*/ acquired = bpf_task_acquire(kptr); if (!acquired) { err = 7; @@ -220,7 +220,6 @@ int BPF_PROG(test_task_xchg_release, struct task_struct *task, u64 clone_flags) } bpf_task_release(kptr); - return 0; } diff --git a/tools/testing/selftests/bpf/progs/task_local_storage.c b/tools/testing/selftests/bpf/progs/task_local_storage.c index 80a0a20db88d..34fa3d6451d2 100644 --- a/tools/testing/selftests/bpf/progs/task_local_storage.c +++ b/tools/testing/selftests/bpf/progs/task_local_storage.c @@ -14,12 +14,15 @@ struct { __type(value, long); } enter_id SEC(".maps"); +#include "err.h" + #define MAGIC_VALUE 0xabcd1234 pid_t target_pid = 0; int mismatch_cnt = 0; int enter_cnt = 0; int exit_cnt = 0; +long update_err = 0; SEC("tp_btf/sys_enter") int BPF_PROG(on_enter, struct pt_regs *regs, long id) @@ -62,3 +65,19 @@ int BPF_PROG(on_exit, struct pt_regs *regs, long id) __sync_fetch_and_add(&mismatch_cnt, 1); return 0; } + +SEC("fexit/bpf_local_storage_update") +int BPF_PROG(fexit_update, void *owner, struct bpf_local_storage_map *smap, + void *value, u64 map_flags, bool swap_uptrs, + struct bpf_local_storage_data *ret) +{ + struct task_struct *task = bpf_get_current_task_btf(); + + if (task->pid != target_pid) + return 0; + + if (IS_ERR_VALUE(ret)) + update_err = PTR_ERR(ret); + + return 0; +} diff --git a/tools/testing/selftests/bpf/progs/task_work_fail.c b/tools/testing/selftests/bpf/progs/task_work_fail.c index 82e4b8913333..3186e7b4b24e 100644 --- a/tools/testing/selftests/bpf/progs/task_work_fail.c +++ b/tools/testing/selftests/bpf/progs/task_work_fail.c @@ -58,7 +58,7 @@ int mismatch_map(struct pt_regs *args) } SEC("perf_event") -__failure __msg("arg#1 doesn't point to a map value") +__failure __msg("R2 doesn't point to a map value") int no_map_task_work(struct pt_regs *args) { struct task_struct *task; @@ -70,7 +70,7 @@ int no_map_task_work(struct pt_regs *args) } SEC("perf_event") -__failure __msg("Possibly NULL pointer passed to trusted arg1") +__failure 
__msg("Possibly NULL pointer passed to trusted R2") int task_work_null(struct pt_regs *args) { struct task_struct *task; @@ -81,7 +81,7 @@ int task_work_null(struct pt_regs *args) } SEC("perf_event") -__failure __msg("Possibly NULL pointer passed to trusted arg2") +__failure __msg("Possibly NULL pointer passed to trusted R3") int map_null(struct pt_regs *args) { struct elem *work; diff --git a/tools/testing/selftests/bpf/progs/test_bpf_nf.c b/tools/testing/selftests/bpf/progs/test_bpf_nf.c index 076fbf03a126..df43649ecb78 100644 --- a/tools/testing/selftests/bpf/progs/test_bpf_nf.c +++ b/tools/testing/selftests/bpf/progs/test_bpf_nf.c @@ -10,6 +10,8 @@ #define EINVAL 22 #define ENOENT 2 +#define CT_OPTS_ERROR_GUARD 0x12345678 + #define NF_CT_ZONE_DIR_ORIG (1 << IP_CT_DIR_ORIGINAL) #define NF_CT_ZONE_DIR_REPL (1 << IP_CT_DIR_REPLY) @@ -19,6 +21,8 @@ int test_einval_reserved = 0; int test_einval_reserved_new = 0; int test_einval_netns_id = 0; int test_einval_len_opts = 0; +int test_einval_len_opts_small_lookup = 0; +int test_einval_len_opts_small_alloc = 0; int test_eproto_l4proto = 0; int test_enonet_netns_id = 0; int test_enoent_lookup = 0; @@ -124,6 +128,28 @@ nf_ct_test(struct nf_conn *(*lookup_fn)(void *, struct bpf_sock_tuple *, u32, else test_einval_len_opts = opts_def.error; + opts_def.error = CT_OPTS_ERROR_GUARD; + ct = lookup_fn(ctx, &bpf_tuple, sizeof(bpf_tuple.ipv4), &opts_def, + sizeof(opts_def.netns_id)); + if (ct) { + bpf_ct_release(ct); + test_einval_len_opts_small_lookup = -EINVAL; + } else { + test_einval_len_opts_small_lookup = opts_def.error; + } + + opts_def.error = CT_OPTS_ERROR_GUARD; + ct = alloc_fn(ctx, &bpf_tuple, sizeof(bpf_tuple.ipv4), &opts_def, + sizeof(opts_def.netns_id)); + if (ct) { + ct = bpf_ct_insert_entry(ct); + if (ct) + bpf_ct_release(ct); + test_einval_len_opts_small_alloc = -EINVAL; + } else { + test_einval_len_opts_small_alloc = opts_def.error; + } + opts_def.l4proto = IPPROTO_ICMP; ct = lookup_fn(ctx, &bpf_tuple, 
sizeof(bpf_tuple.ipv4), &opts_def, sizeof(opts_def)); diff --git a/tools/testing/selftests/bpf/progs/test_bpf_nf_fail.c b/tools/testing/selftests/bpf/progs/test_bpf_nf_fail.c index 2c156cd166af..332cda89caba 100644 --- a/tools/testing/selftests/bpf/progs/test_bpf_nf_fail.c +++ b/tools/testing/selftests/bpf/progs/test_bpf_nf_fail.c @@ -152,7 +152,7 @@ int change_status_after_alloc(struct __sk_buff *ctx) } SEC("?tc") -__failure __msg("Possibly NULL pointer passed to trusted arg1") +__failure __msg("Possibly NULL pointer passed to trusted R2") int lookup_null_bpf_tuple(struct __sk_buff *ctx) { struct bpf_ct_opts___local opts = {}; @@ -165,7 +165,7 @@ int lookup_null_bpf_tuple(struct __sk_buff *ctx) } SEC("?tc") -__failure __msg("Possibly NULL pointer passed to trusted arg3") +__failure __msg("Possibly NULL pointer passed to trusted R4") int lookup_null_bpf_opts(struct __sk_buff *ctx) { struct bpf_sock_tuple tup = {}; @@ -178,7 +178,7 @@ int lookup_null_bpf_opts(struct __sk_buff *ctx) } SEC("?xdp") -__failure __msg("Possibly NULL pointer passed to trusted arg1") +__failure __msg("Possibly NULL pointer passed to trusted R2") int xdp_lookup_null_bpf_tuple(struct xdp_md *ctx) { struct bpf_ct_opts___local opts = {}; @@ -191,7 +191,7 @@ int xdp_lookup_null_bpf_tuple(struct xdp_md *ctx) } SEC("?xdp") -__failure __msg("Possibly NULL pointer passed to trusted arg3") +__failure __msg("Possibly NULL pointer passed to trusted R4") int xdp_lookup_null_bpf_opts(struct xdp_md *ctx) { struct bpf_sock_tuple tup = {}; diff --git a/tools/testing/selftests/bpf/progs/test_fill_link_info.c b/tools/testing/selftests/bpf/progs/test_fill_link_info.c index fac33a14f200..137bd6292163 100644 --- a/tools/testing/selftests/bpf/progs/test_fill_link_info.c +++ b/tools/testing/selftests/bpf/progs/test_fill_link_info.c @@ -12,7 +12,7 @@ extern bool CONFIG_PPC64 __kconfig __weak; /* This function is here to have CONFIG_X86_KERNEL_IBT, * CONFIG_PPC_FTRACE_OUT_OF_LINE, CONFIG_KPROBES_ON_FTRACE, - * 
CONFIG_PPC6 used and added to object BTF. + * CONFIG_PPC64 used and added to object BTF. */ int unused(void) { diff --git a/tools/testing/selftests/bpf/progs/test_global_func3.c b/tools/testing/selftests/bpf/progs/test_global_func3.c index 974fd8c19561..b66abb350fb0 100644 --- a/tools/testing/selftests/bpf/progs/test_global_func3.c +++ b/tools/testing/selftests/bpf/progs/test_global_func3.c @@ -53,9 +53,57 @@ int f8(struct __sk_buff *skb) return f7(skb); } +static __attribute__ ((noinline)) +int f9(struct __sk_buff *skb) +{ + return f8(skb); +} + +static __attribute__ ((noinline)) +int f10(struct __sk_buff *skb) +{ + return f9(skb); +} + +static __attribute__ ((noinline)) +int f11(struct __sk_buff *skb) +{ + return f10(skb); +} + +static __attribute__ ((noinline)) +int f12(struct __sk_buff *skb) +{ + return f11(skb); +} + +static __attribute__ ((noinline)) +int f13(struct __sk_buff *skb) +{ + return f12(skb); +} + +static __attribute__ ((noinline)) +int f14(struct __sk_buff *skb) +{ + return f13(skb); +} + +static __attribute__ ((noinline)) +int f15(struct __sk_buff *skb) +{ + return f14(skb); +} + +static __attribute__ ((noinline)) +int f16(struct __sk_buff *skb) +{ + return f15(skb); +} + SEC("tc") -__failure __msg("the call stack of 9 frames") +__failure __msg("the call stack of 17 frames") int global_func3(struct __sk_buff *skb) { - return f8(skb); + return f16(skb); } diff --git a/tools/testing/selftests/bpf/progs/test_kfunc_dynptr_param.c b/tools/testing/selftests/bpf/progs/test_kfunc_dynptr_param.c index d249113ed657..bf48fc43c7ab 100644 --- a/tools/testing/selftests/bpf/progs/test_kfunc_dynptr_param.c +++ b/tools/testing/selftests/bpf/progs/test_kfunc_dynptr_param.c @@ -11,12 +11,7 @@ #include <bpf/bpf_helpers.h> #include <bpf/bpf_tracing.h> #include "bpf_misc.h" - -extern struct bpf_key *bpf_lookup_system_key(__u64 id) __ksym; -extern void bpf_key_put(struct bpf_key *key) __ksym; -extern int bpf_verify_pkcs7_signature(struct bpf_dynptr *data_ptr, - struct 
bpf_dynptr *sig_ptr, - struct bpf_key *trusted_keyring) __ksym; +#include "bpf_kfuncs.h" struct { __uint(type, BPF_MAP_TYPE_RINGBUF); @@ -38,14 +33,14 @@ SEC("?lsm.s/bpf") __failure __msg("cannot pass in dynptr at an offset=-8") int BPF_PROG(not_valid_dynptr, int cmd, union bpf_attr *attr, unsigned int size, bool kernel) { - unsigned long val; + unsigned long val = 0; return bpf_verify_pkcs7_signature((struct bpf_dynptr *)&val, (struct bpf_dynptr *)&val, NULL); } SEC("?lsm.s/bpf") -__failure __msg("arg#0 expected pointer to stack or const struct bpf_dynptr") +__failure __msg("R1 expected pointer to stack or const struct bpf_dynptr") int BPF_PROG(not_ptr_to_stack, int cmd, union bpf_attr *attr, unsigned int size, bool kernel) { static struct bpf_dynptr val; diff --git a/tools/testing/selftests/bpf/progs/test_kfunc_param_nullable.c b/tools/testing/selftests/bpf/progs/test_kfunc_param_nullable.c index 967081bbcfe1..ca35b92ea095 100644 --- a/tools/testing/selftests/bpf/progs/test_kfunc_param_nullable.c +++ b/tools/testing/selftests/bpf/progs/test_kfunc_param_nullable.c @@ -29,7 +29,7 @@ int kfunc_dynptr_nullable_test2(struct __sk_buff *skb) } SEC("tc") -__failure __msg("Possibly NULL pointer passed to trusted arg0") +__failure __msg("Possibly NULL pointer passed to trusted R1") int kfunc_dynptr_nullable_test3(struct __sk_buff *skb) { struct bpf_dynptr data; diff --git a/tools/testing/selftests/bpf/progs/test_lirc_mode2_kern.c b/tools/testing/selftests/bpf/progs/test_lirc_mode2_kern.c index 7a6620671a83..cbe4284c032f 100644 --- a/tools/testing/selftests/bpf/progs/test_lirc_mode2_kern.c +++ b/tools/testing/selftests/bpf/progs/test_lirc_mode2_kern.c @@ -13,9 +13,9 @@ int bpf_decoder(unsigned int *sample) if (LIRC_IS_PULSE(*sample)) { unsigned int duration = LIRC_VALUE(*sample); - if (duration & 0x10000) + if (duration & 0x1000) bpf_rc_keydown(sample, 0x40, duration & 0xffff, 0); - if (duration & 0x20000) + if (duration & 0x2000) bpf_rc_pointer_rel(sample, (duration >> 8) 
& 0xff, duration & 0xff); } diff --git a/tools/testing/selftests/bpf/progs/test_lwt_ip_encap.c b/tools/testing/selftests/bpf/progs/test_lwt_ip_encap.c index d6cb986e7533..4a934fccf8f5 100644 --- a/tools/testing/selftests/bpf/progs/test_lwt_ip_encap.c +++ b/tools/testing/selftests/bpf/progs/test_lwt_ip_encap.c @@ -1,11 +1,9 @@ // SPDX-License-Identifier: GPL-2.0 -#include <stddef.h> +#include "vmlinux.h" #include <string.h> -#include <linux/bpf.h> -#include <linux/ip.h> -#include <linux/ipv6.h> #include <bpf/bpf_helpers.h> #include <bpf/bpf_endian.h> +#include <bpf/bpf_tracing.h> struct grehdr { __be16 flags; @@ -64,13 +62,13 @@ int bpf_lwt_encap_gre6(struct __sk_buff *skb) hdr.ip6hdr.nexthdr = 47; /* IPPROTO_GRE */ hdr.ip6hdr.hop_limit = 0x40; /* fb01::1 */ - hdr.ip6hdr.saddr.s6_addr[0] = 0xfb; - hdr.ip6hdr.saddr.s6_addr[1] = 1; - hdr.ip6hdr.saddr.s6_addr[15] = 1; + hdr.ip6hdr.saddr.in6_u.u6_addr8[0] = 0xfb; + hdr.ip6hdr.saddr.in6_u.u6_addr8[1] = 1; + hdr.ip6hdr.saddr.in6_u.u6_addr8[15] = 1; /* fb10::1 */ - hdr.ip6hdr.daddr.s6_addr[0] = 0xfb; - hdr.ip6hdr.daddr.s6_addr[1] = 0x10; - hdr.ip6hdr.daddr.s6_addr[15] = 1; + hdr.ip6hdr.daddr.in6_u.u6_addr8[0] = 0xfb; + hdr.ip6hdr.daddr.in6_u.u6_addr8[1] = 0x10; + hdr.ip6hdr.daddr.in6_u.u6_addr8[15] = 1; hdr.greh.protocol = skb->protocol; @@ -82,4 +80,141 @@ int bpf_lwt_encap_gre6(struct __sk_buff *skb) return BPF_LWT_REROUTE; } +#define VXLAN_PORT 4789 +#define VXLAN_FLAGS 0x08000000 +#define VXLAN_VNI 1 + +#define ETH_ALEN 6 /* Octets in one ethernet addr */ +#define ETH_P_IP 0x0800 /* Internet Protocol packet */ +#define ETH_P_IPV6 0x86DD /* IPv6 over bluebook */ + +static const __u8 bcast[ETH_ALEN] = { + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, +}; + +static const __u8 srcmac[ETH_ALEN] = { + 0x02, 0x00, 0x00, 0x00, 0x00, 0x01, +}; + +SEC("encap_vxlan") +int bpf_lwt_encap_vxlan(struct __sk_buff *skb) +{ + struct encap_hdr { + struct iphdr iph; + struct udphdr udph; + struct vxlanhdr vxh; + struct ethhdr eth; + } 
__attribute__((__packed__)) hdr; + int err; + + memset(&hdr, 0, sizeof(hdr)); + + hdr.iph.ihl = 5; + hdr.iph.version = 4; + hdr.iph.ttl = 0x40; + hdr.iph.protocol = 17; /* IPPROTO_UDP */ + hdr.iph.tot_len = bpf_htons(skb->len + sizeof(hdr)); +#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__ + hdr.iph.saddr = 0x640510ac; /* 172.16.5.100 */ + hdr.iph.daddr = 0x641110ac; /* 172.16.17.100 */ +#elif __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__ + hdr.iph.saddr = 0xac100564; /* 172.16.5.100 */ + hdr.iph.daddr = 0xac101164; /* 172.16.17.100 */ +#else +#error "Fix your compiler's __BYTE_ORDER__?!" +#endif + + hdr.udph.source = bpf_htons(VXLAN_PORT); + hdr.udph.dest = bpf_htons(VXLAN_PORT); + hdr.udph.len = bpf_htons(skb->len + sizeof(hdr.udph) + sizeof(hdr.vxh) + + sizeof(hdr.eth)); + + hdr.vxh.vx_flags = bpf_htonl(VXLAN_FLAGS); + hdr.vxh.vx_vni = bpf_htonl(VXLAN_VNI << 8); + + __builtin_memcpy(hdr.eth.h_dest, bcast, ETH_ALEN); + __builtin_memcpy(hdr.eth.h_source, srcmac, ETH_ALEN); + hdr.eth.h_proto = bpf_htons(ETH_P_IP); + + err = bpf_lwt_push_encap(skb, BPF_LWT_ENCAP_IP, &hdr, sizeof(hdr)); + if (err) + return BPF_DROP; + + return BPF_LWT_REROUTE; +} + +SEC("encap_vxlan6") +int bpf_lwt_encap_vxlan6(struct __sk_buff *skb) +{ + struct encap_hdr { + struct ipv6hdr ip6hdr; + struct udphdr udph; + struct vxlanhdr vxh; + struct ethhdr eth; + } __attribute__((__packed__)) hdr; + int err; + + memset(&hdr, 0, sizeof(hdr)); + + hdr.ip6hdr.version = 6; + hdr.ip6hdr.nexthdr = 17; /* IPPROTO_UDP */ + hdr.ip6hdr.hop_limit = 0x40; + hdr.ip6hdr.payload_len = bpf_htons(skb->len + sizeof(hdr.udph) + sizeof(hdr.vxh) + + sizeof(hdr.eth)); + /* fb05::1 */ + hdr.ip6hdr.saddr.in6_u.u6_addr8[0] = 0xfb; + hdr.ip6hdr.saddr.in6_u.u6_addr8[1] = 0x05; + hdr.ip6hdr.saddr.in6_u.u6_addr8[15] = 1; + /* fb11::1 */ + hdr.ip6hdr.daddr.in6_u.u6_addr8[0] = 0xfb; + hdr.ip6hdr.daddr.in6_u.u6_addr8[1] = 0x11; + hdr.ip6hdr.daddr.in6_u.u6_addr8[15] = 1; + + hdr.udph.source = bpf_htons(VXLAN_PORT); + hdr.udph.dest = 
bpf_htons(VXLAN_PORT); + hdr.udph.len = bpf_htons(skb->len + sizeof(hdr.udph) + sizeof(hdr.vxh) + + sizeof(hdr.eth)); + + hdr.vxh.vx_flags = bpf_htonl(VXLAN_FLAGS); + hdr.vxh.vx_vni = bpf_htonl(VXLAN_VNI << 8); + + __builtin_memcpy(hdr.eth.h_dest, bcast, ETH_ALEN); + __builtin_memcpy(hdr.eth.h_source, srcmac, ETH_ALEN); + hdr.eth.h_proto = bpf_htons(ETH_P_IPV6); + + err = bpf_lwt_push_encap(skb, BPF_LWT_ENCAP_IP, &hdr, sizeof(hdr)); + if (err) + return BPF_DROP; + + return BPF_LWT_REROUTE; +} + +volatile const int tgt_ip_version; + +__u16 transport_hdr = 0; +__u16 network_hdr = 0; +bool fexit_triggered = false; + +SEC("?fexit/bpf_lwt_push_ip_encap") +int BPF_PROG(fexit_lwt_push_ip_encap, struct sk_buff *skb, void *hdr, u32 len, bool ingress, + int retval) +{ + struct iphdr *iph; + + if (retval || fexit_triggered) + return 0; + + iph = (typeof(iph)) (skb->head + skb->network_header); + if (iph->version != tgt_ip_version) + return 0; + + if ((iph->version == 4 && iph->protocol == 17 /* IPPROTO_UDP */) || + (iph->version == 6 && ((struct ipv6hdr *)iph)->nexthdr == 17 /* IPPROTO_UDP */)) { + fexit_triggered = true; + transport_hdr = skb->transport_header; + network_hdr = skb->network_header; + } + return 0; +} + char _license[] SEC("license") = "GPL"; diff --git a/tools/testing/selftests/bpf/progs/test_misc_tcp_hdr_options.c b/tools/testing/selftests/bpf/progs/test_misc_tcp_hdr_options.c index d487153a839d..ed5a0011b863 100644 --- a/tools/testing/selftests/bpf/progs/test_misc_tcp_hdr_options.c +++ b/tools/testing/selftests/bpf/progs/test_misc_tcp_hdr_options.c @@ -29,6 +29,10 @@ unsigned int nr_syn = 0; unsigned int nr_fin = 0; unsigned int nr_hwtstamp = 0; +bool nodelay_est_ok = false; +bool nodelay_hdr_len_reject = false; +bool nodelay_write_hdr_reject = false; + /* Check the header received from the active side */ static int __check_active_hdr_in(struct bpf_sock_ops *skops, bool check_syn) { @@ -300,7 +304,7 @@ static int handle_passive_estab(struct bpf_sock_ops 
*skops) SEC("sockops") int misc_estab(struct bpf_sock_ops *skops) { - int true_val = 1; + int true_val = 1, false_val = 0, ret; switch (skops->op) { case BPF_SOCK_OPS_TCP_LISTEN_CB: @@ -316,10 +320,19 @@ int misc_estab(struct bpf_sock_ops *skops) case BPF_SOCK_OPS_PARSE_HDR_OPT_CB: return handle_parse_hdr(skops); case BPF_SOCK_OPS_HDR_OPT_LEN_CB: + ret = bpf_setsockopt(skops, SOL_TCP, TCP_NODELAY, &true_val, sizeof(true_val)); + if (ret == -EOPNOTSUPP) + nodelay_hdr_len_reject = true; return handle_hdr_opt_len(skops); case BPF_SOCK_OPS_WRITE_HDR_OPT_CB: + ret = bpf_setsockopt(skops, SOL_TCP, TCP_NODELAY, &true_val, sizeof(true_val)); + if (ret == -EOPNOTSUPP) + nodelay_write_hdr_reject = true; return handle_write_hdr_opt(skops); case BPF_SOCK_OPS_PASSIVE_ESTABLISHED_CB: + ret = bpf_setsockopt(skops, SOL_TCP, TCP_NODELAY, &false_val, sizeof(false_val)); + if (!ret) + nodelay_est_ok = true; return handle_passive_estab(skops); } diff --git a/tools/testing/selftests/bpf/progs/test_ringbuf_map_key.c b/tools/testing/selftests/bpf/progs/test_ringbuf_map_key.c index 21bb7da90ea5..0efafa927a3d 100644 --- a/tools/testing/selftests/bpf/progs/test_ringbuf_map_key.c +++ b/tools/testing/selftests/bpf/progs/test_ringbuf_map_key.c @@ -35,7 +35,7 @@ SEC("fentry/" SYS_PREFIX "sys_getpgid") int test_ringbuf_mem_map_key(void *ctx) { int cur_pid = bpf_get_current_pid_tgid() >> 32; - struct sample *sample, sample_copy; + struct sample *sample; int *lookup_val; if (cur_pid != pid) @@ -55,16 +55,11 @@ int test_ringbuf_mem_map_key(void *ctx) lookup_val = (int *)bpf_map_lookup_elem(&hash_map, sample); __sink(lookup_val); - /* workaround - memcpy is necessary so that verifier doesn't - * complain with: - * verifier internal error: more than one arg with ref_obj_id R3 - * when trying to do bpf_map_update_elem(&hash_map, sample, &sample->seq, BPF_ANY); - * + /* * Since bpf_map_lookup_elem above uses 'sample' as key, test using * sample field as value below */ - __builtin_memcpy(&sample_copy, 
sample, sizeof(struct sample)); - bpf_map_update_elem(&hash_map, &sample_copy, &sample->seq, BPF_ANY); + bpf_map_update_elem(&hash_map, sample, &sample->seq, BPF_ANY); bpf_ringbuf_submit(sample, 0); return 0; diff --git a/tools/testing/selftests/bpf/progs/test_signed_loader.c b/tools/testing/selftests/bpf/progs/test_signed_loader.c new file mode 100644 index 000000000000..d9a4b85f9391 --- /dev/null +++ b/tools/testing/selftests/bpf/progs/test_signed_loader.c @@ -0,0 +1,18 @@ +// SPDX-License-Identifier: GPL-2.0 +#include "vmlinux.h" +#include <bpf/bpf_helpers.h> + +/* + * Minimal, map-less program. Driven through libbpf's gen_loader (gen_hash) + * by prog_tests/signed_loader.c so the generated light-skeleton loader (with + * the emit_signature_match metadata check) can be exercised against good + * and tampered metadata. A socket filter needs no load-time attach resolution, + * and having no maps keeps the generated loader's ctx trivial (0 maps, 1 prog). + */ +SEC("socket") +int probe(void *ctx) +{ + return 0; +} + +char _license[] SEC("license") = "GPL"; diff --git a/tools/testing/selftests/bpf/progs/test_signed_loader_data.c b/tools/testing/selftests/bpf/progs/test_signed_loader_data.c new file mode 100644 index 000000000000..43e2074d0042 --- /dev/null +++ b/tools/testing/selftests/bpf/progs/test_signed_loader_data.c @@ -0,0 +1,20 @@ +// SPDX-License-Identifier: GPL-2.0 +#include "vmlinux.h" +#include <bpf/bpf_helpers.h> + +/* + * A single initialized global, so the generated loader has one internal + * (.data) map that it seeds with an initial value while loading. + * prog_tests/signed_loader.c uses this to check that a signed loader + * keeps the attested contents and ignores a ctx-supplied initial_value: + * the host cannot re-seed a signed program's maps through the loader ctx. 
+ */ +__u64 magic = 0x5eed1234abad1deaULL; + +SEC("socket") +int probe(void *ctx) +{ + return (int)magic; +} + +char _license[] SEC("license") = "GPL"; diff --git a/tools/testing/selftests/bpf/progs/test_signed_loader_lsm.c b/tools/testing/selftests/bpf/progs/test_signed_loader_lsm.c new file mode 100644 index 000000000000..575a9b7910c8 --- /dev/null +++ b/tools/testing/selftests/bpf/progs/test_signed_loader_lsm.c @@ -0,0 +1,30 @@ +// SPDX-License-Identifier: GPL-2.0 + +#include "vmlinux.h" +#include <bpf/bpf_helpers.h> +#include <bpf/bpf_tracing.h> + +char _license[] SEC("license") = "GPL"; + +__u32 monitored_tid; + +int sig_keyring_serial; +int sig_keyring_type; +int sig_verdict; +int seen; + +SEC("lsm/bpf_prog_load") +int BPF_PROG(inspect_prog_load, struct bpf_prog *prog, union bpf_attr *attr, + struct bpf_token *token, bool kernel) +{ + __u32 tid = bpf_get_current_pid_tgid() & 0xffffffff; + + if (!monitored_tid || tid != monitored_tid) + return 0; + + seen++; + sig_keyring_serial = prog->aux->sig.keyring_serial; + sig_keyring_type = prog->aux->sig.keyring_type; + sig_verdict = prog->aux->sig.verdict; + return 0; +} diff --git a/tools/testing/selftests/bpf/progs/test_signed_loader_map.c b/tools/testing/selftests/bpf/progs/test_signed_loader_map.c new file mode 100644 index 000000000000..4478ce6f1fd9 --- /dev/null +++ b/tools/testing/selftests/bpf/progs/test_signed_loader_map.c @@ -0,0 +1,28 @@ +// SPDX-License-Identifier: GPL-2.0 +#include "vmlinux.h" +#include <bpf/bpf_helpers.h> + +/* + * One explicit array map and no global variables, so the generated loader + * has exactly one map to create (no .rodata/.bss). prog_tests/signed_loader.c + * uses this to check that a signed loader ignores ctx-supplied max_entries: + * the map must keep its attested size (4), not whatever the host puts in + * the loader ctx. 
+ */ +struct { + __uint(type, BPF_MAP_TYPE_ARRAY); + __uint(max_entries, 4); + __type(key, __u32); + __type(value, __u64); +} amap SEC(".maps"); + +SEC("socket") +int probe(void *ctx) +{ + __u32 key = 0; + __u64 *val = bpf_map_lookup_elem(&amap, &key); + + return val ? (int)*val : 0; +} + +char _license[] SEC("license") = "GPL"; diff --git a/tools/testing/selftests/bpf/progs/test_sleepable_tracepoints.c b/tools/testing/selftests/bpf/progs/test_sleepable_tracepoints.c new file mode 100644 index 000000000000..254f7fd895d9 --- /dev/null +++ b/tools/testing/selftests/bpf/progs/test_sleepable_tracepoints.c @@ -0,0 +1,112 @@ +// SPDX-License-Identifier: GPL-2.0 +/* Copyright (c) 2025 Meta Platforms, Inc. and affiliates. */ + +#include <vmlinux.h> +#include <asm/unistd.h> +#include <bpf/bpf_tracing.h> +#include <bpf/bpf_core_read.h> +#include <bpf/bpf_helpers.h> + +char _license[] SEC("license") = "GPL"; + +int target_pid; +int prog_triggered; +long err; +char copied_byte; + +static int copy_getcwd_arg(char *ubuf) +{ + err = bpf_copy_from_user(&copied_byte, sizeof(copied_byte), ubuf); + if (err) + return err; + + prog_triggered = 1; + return 0; +} + +SEC("tp_btf.s/sys_enter") +int BPF_PROG(handle_sys_enter_tp_btf, struct pt_regs *regs, long id) +{ + if ((bpf_get_current_pid_tgid() >> 32) != target_pid || + id != __NR_getcwd) + return 0; + + return copy_getcwd_arg((void *)PT_REGS_PARM1_SYSCALL(regs)); +} + +SEC("raw_tp.s/sys_enter") +int BPF_PROG(handle_sys_enter_raw_tp, struct pt_regs *regs, long id) +{ + if ((bpf_get_current_pid_tgid() >> 32) != target_pid || + id != __NR_getcwd) + return 0; + + return copy_getcwd_arg((void *)PT_REGS_PARM1_CORE_SYSCALL(regs)); +} + +SEC("tp.s/syscalls/sys_enter_getcwd") +int handle_sys_enter_tp(struct syscall_trace_enter *args) +{ + if ((bpf_get_current_pid_tgid() >> 32) != target_pid) + return 0; + + return copy_getcwd_arg((void *)args->args[0]); +} + +SEC("tp.s/syscalls/sys_exit_getcwd") +int handle_sys_exit_tp(struct 
syscall_trace_exit *args) +{ + struct pt_regs *regs; + + if ((bpf_get_current_pid_tgid() >> 32) != target_pid) + return 0; + + regs = (struct pt_regs *)bpf_task_pt_regs(bpf_get_current_task_btf()); + return copy_getcwd_arg((void *)PT_REGS_PARM1_CORE_SYSCALL(regs)); +} + +SEC("raw_tp.s") +int BPF_PROG(handle_raw_tp_bare, struct pt_regs *regs, long id) +{ + return 0; +} + +SEC("tp.s") +int handle_tp_bare(void *ctx) +{ + return 0; +} + +SEC("tracepoint.s/syscalls/sys_enter_getcwd") +int handle_sys_enter_tp_alias(struct syscall_trace_enter *args) +{ + return 0; +} + +SEC("raw_tracepoint.s/sys_enter") +int BPF_PROG(handle_sys_enter_raw_tp_alias, struct pt_regs *regs, long id) +{ + return 0; +} + +SEC("raw_tp.s/sys_enter") +int BPF_PROG(handle_test_run, struct pt_regs *regs, long id) +{ + if ((__u64)regs == 0x1234ULL && (__u64)id == 0x5678ULL) + return (__u64)regs + (__u64)id; + + return 0; +} + +SEC("raw_tp.s/sched_switch") +int BPF_PROG(handle_raw_tp_non_faultable, bool preempt, + struct task_struct *prev, struct task_struct *next) +{ + return 0; +} + +SEC("tp.s/sched/sched_switch") +int handle_tp_non_syscall(void *ctx) +{ + return 0; +} diff --git a/tools/testing/selftests/bpf/progs/test_sleepable_tracepoints_fail.c b/tools/testing/selftests/bpf/progs/test_sleepable_tracepoints_fail.c new file mode 100644 index 000000000000..1a0748a9520b --- /dev/null +++ b/tools/testing/selftests/bpf/progs/test_sleepable_tracepoints_fail.c @@ -0,0 +1,18 @@ +// SPDX-License-Identifier: GPL-2.0 +/* Copyright (c) 2025 Meta Platforms, Inc. and affiliates. 
*/ + +#include <vmlinux.h> +#include <bpf/bpf_tracing.h> +#include <bpf/bpf_helpers.h> +#include "bpf_misc.h" + +char _license[] SEC("license") = "GPL"; + +/* Sleepable program on a non-faultable tracepoint should fail to load */ +SEC("tp_btf.s/sched_switch") +__failure __msg("Sleepable program cannot attach to non-faultable tracepoint") +int BPF_PROG(handle_sched_switch, bool preempt, + struct task_struct *prev, struct task_struct *next) +{ + return 0; +} diff --git a/tools/testing/selftests/bpf/progs/test_sockmap_kern.h b/tools/testing/selftests/bpf/progs/test_sockmap_kern.h index f48f85f1bd70..284a2f2e50cf 100644 --- a/tools/testing/selftests/bpf/progs/test_sockmap_kern.h +++ b/tools/testing/selftests/bpf/progs/test_sockmap_kern.h @@ -85,13 +85,6 @@ struct { __type(value, int); } sock_skb_opts SEC(".maps"); -struct { - __uint(type, TEST_MAP_TYPE); - __uint(max_entries, 20); - __uint(key_size, sizeof(int)); - __uint(value_size, sizeof(int)); -} tls_sock_map SEC(".maps"); - SEC("sk_skb/stream_parser") int bpf_prog1(struct __sk_buff *skb) { @@ -135,55 +128,6 @@ int bpf_prog2(struct __sk_buff *skb) } -static inline void bpf_write_pass(struct __sk_buff *skb, int offset) -{ - int err = bpf_skb_pull_data(skb, 6 + offset); - void *data_end; - char *c; - - if (err) - return; - - c = (char *)(long)skb->data; - data_end = (void *)(long)skb->data_end; - - if (c + 5 + offset < data_end) - memcpy(c + offset, "PASS", 4); -} - -SEC("sk_skb/stream_verdict") -int bpf_prog3(struct __sk_buff *skb) -{ - int err, *f, ret = SK_PASS; - const int one = 1; - - f = bpf_map_lookup_elem(&sock_skb_opts, &one); - if (f && *f) { - __u64 flags = 0; - - ret = 0; - flags = *f; - - err = bpf_skb_adjust_room(skb, -13, 0, 0); - if (err) - return SK_DROP; - err = bpf_skb_adjust_room(skb, 4, 0, 0); - if (err) - return SK_DROP; - bpf_write_pass(skb, 0); -#ifdef SOCKMAP - return bpf_sk_redirect_map(skb, &tls_sock_map, ret, flags); -#else - return bpf_sk_redirect_hash(skb, &tls_sock_map, &ret, flags); 
-#endif - } - err = bpf_skb_adjust_room(skb, 4, 0, 0); - if (err) - return SK_DROP; - bpf_write_pass(skb, 13); - return ret; -} - SEC("sockops") int bpf_sockmap(struct bpf_sock_ops *skops) { diff --git a/tools/testing/selftests/bpf/progs/test_sockmap_ktls.c b/tools/testing/selftests/bpf/progs/test_sockmap_ktls.c deleted file mode 100644 index 83df4919c224..000000000000 --- a/tools/testing/selftests/bpf/progs/test_sockmap_ktls.c +++ /dev/null @@ -1,40 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -#include <linux/bpf.h> -#include <bpf/bpf_helpers.h> -#include <bpf/bpf_endian.h> - -int cork_byte; -int push_start; -int push_end; -int apply_bytes; -int pop_start; -int pop_end; - -struct { - __uint(type, BPF_MAP_TYPE_SOCKMAP); - __uint(max_entries, 20); - __type(key, int); - __type(value, int); -} sock_map SEC(".maps"); - -SEC("sk_msg") -int prog_sk_policy(struct sk_msg_md *msg) -{ - if (cork_byte > 0) - bpf_msg_cork_bytes(msg, cork_byte); - if (push_start > 0 && push_end > 0) - bpf_msg_push_data(msg, push_start, push_end, 0); - if (pop_start >= 0 && pop_end > 0) - bpf_msg_pop_data(msg, pop_start, pop_end, 0); - - return SK_PASS; -} - -SEC("sk_msg") -int prog_sk_policy_redir(struct sk_msg_md *msg) -{ - int two = 2; - - bpf_msg_apply_bytes(msg, apply_bytes); - return bpf_msg_redirect_map(msg, &sock_map, two, 0); -} diff --git a/tools/testing/selftests/bpf/progs/test_sockmap_msg_pop_data.c b/tools/testing/selftests/bpf/progs/test_sockmap_msg_pop_data.c new file mode 100644 index 000000000000..301e65b95256 --- /dev/null +++ b/tools/testing/selftests/bpf/progs/test_sockmap_msg_pop_data.c @@ -0,0 +1,27 @@ +// SPDX-License-Identifier: GPL-2.0 +#include "vmlinux.h" +#include <bpf/bpf_helpers.h> + +struct { + __uint(type, BPF_MAP_TYPE_SOCKMAP); + __uint(max_entries, 1); + __type(key, int); + __type(value, int); +} sock_map SEC(".maps"); + +#define POP_START 0x48a3 +#define POP_LEN 0xfffffffd + +long pop_data_ret = 1; + +SEC("sk_msg") +int prog_msg_pop_data(struct sk_msg_md *msg) 
+{ + if (msg->size <= POP_START) + return SK_PASS; + + pop_data_ret = bpf_msg_pop_data(msg, POP_START, POP_LEN, 0); + return SK_PASS; +} + +char _license[] SEC("license") = "GPL"; diff --git a/tools/testing/selftests/bpf/progs/test_sockmap_strp.c b/tools/testing/selftests/bpf/progs/test_sockmap_strp.c index dde3d5bec515..fe88fa6d40bc 100644 --- a/tools/testing/selftests/bpf/progs/test_sockmap_strp.c +++ b/tools/testing/selftests/bpf/progs/test_sockmap_strp.c @@ -50,4 +50,11 @@ int prog_skb_parser_partial(struct __sk_buff *skb) return 10; } +SEC("sk_skb/stream_parser") +int prog_skb_parser_resize(struct __sk_buff *skb) +{ + bpf_skb_change_tail(skb, skb->len, 0); + return skb->len; +} + char _license[] SEC("license") = "GPL"; diff --git a/tools/testing/selftests/bpf/progs/test_tunnel_kern.c b/tools/testing/selftests/bpf/progs/test_tunnel_kern.c index 32127f1cd687..30f1de458669 100644 --- a/tools/testing/selftests/bpf/progs/test_tunnel_kern.c +++ b/tools/testing/selftests/bpf/progs/test_tunnel_kern.c @@ -6,6 +6,7 @@ * modify it under the terms of version 2 of the GNU General Public * License as published by the Free Software Foundation. 
*/ +#define BPF_NO_KFUNC_PROTOTYPES #include "vmlinux.h" #include <bpf/bpf_core_read.h> #include <bpf/bpf_helpers.h> @@ -36,12 +37,10 @@ enum bpf_fou_encap_type___local { FOU_BPF_ENCAP_GUE___local, }; -struct bpf_fou_encap; - int bpf_skb_set_fou_encap(struct __sk_buff *skb_ctx, - struct bpf_fou_encap *encap, int type) __ksym; + struct bpf_fou_encap___local *encap, int type) __ksym; int bpf_skb_get_fou_encap(struct __sk_buff *skb_ctx, - struct bpf_fou_encap *encap) __ksym; + struct bpf_fou_encap___local *encap) __ksym; struct xfrm_state * bpf_xdp_get_xfrm_state(struct xdp_md *ctx, struct bpf_xfrm_state_opts *opts, u32 opts__sz) __ksym; @@ -781,7 +780,7 @@ int ipip_gue_set_tunnel(struct __sk_buff *skb) encap.sport = 0; encap.dport = bpf_htons(5555); - ret = bpf_skb_set_fou_encap(skb, (struct bpf_fou_encap *)&encap, + ret = bpf_skb_set_fou_encap(skb, &encap, bpf_core_enum_value(enum bpf_fou_encap_type___local, FOU_BPF_ENCAP_GUE___local)); if (ret < 0) { @@ -820,7 +819,7 @@ int ipip_fou_set_tunnel(struct __sk_buff *skb) encap.sport = 0; encap.dport = bpf_htons(5555); - ret = bpf_skb_set_fou_encap(skb, (struct bpf_fou_encap *)&encap, + ret = bpf_skb_set_fou_encap(skb, &encap, FOU_BPF_ENCAP_FOU___local); if (ret < 0) { log_err(ret); @@ -843,7 +842,7 @@ int ipip_encap_get_tunnel(struct __sk_buff *skb) return TC_ACT_SHOT; } - ret = bpf_skb_get_fou_encap(skb, (struct bpf_fou_encap *)&encap); + ret = bpf_skb_get_fou_encap(skb, &encap); if (ret < 0) { log_err(ret); return TC_ACT_SHOT; diff --git a/tools/testing/selftests/bpf/progs/test_vmlinux.c b/tools/testing/selftests/bpf/progs/test_vmlinux.c index 78b23934d9f8..eea556940df6 100644 --- a/tools/testing/selftests/bpf/progs/test_vmlinux.c +++ b/tools/testing/selftests/bpf/progs/test_vmlinux.c @@ -69,7 +69,7 @@ int BPF_PROG(handle__tp_btf, struct pt_regs *regs, long id) return 0; } -SEC("kprobe/hrtimer_start_range_ns") +SEC("kprobe") int BPF_KPROBE(handle__kprobe, struct hrtimer *timer, ktime_t tim, u64 delta_ns, const enum 
hrtimer_mode mode) { @@ -78,7 +78,7 @@ int BPF_KPROBE(handle__kprobe, struct hrtimer *timer, ktime_t tim, u64 delta_ns, return 0; } -SEC("fentry/hrtimer_start_range_ns") +SEC("fentry") int BPF_PROG(handle__fentry, struct hrtimer *timer, ktime_t tim, u64 delta_ns, const enum hrtimer_mode mode) { diff --git a/tools/testing/selftests/bpf/progs/test_wakeup_source.c b/tools/testing/selftests/bpf/progs/test_wakeup_source.c new file mode 100644 index 000000000000..fd2fb6aebd82 --- /dev/null +++ b/tools/testing/selftests/bpf/progs/test_wakeup_source.c @@ -0,0 +1,92 @@ +// SPDX-License-Identifier: GPL-2.0 +/* Copyright 2026 Google LLC */ + +#include "vmlinux.h" +#include <bpf/bpf_helpers.h> +#include <bpf/bpf_core_read.h> +#include "bpf_experimental.h" +#include "bpf_misc.h" +#include "wakeup_source.h" + +#define MAX_LOOP_ITER 1000 +#define RB_SIZE (16384 * 4) + +struct { + __uint(type, BPF_MAP_TYPE_RINGBUF); + __uint(max_entries, RB_SIZE); +} rb SEC(".maps"); + +struct bpf_ws_lock; +struct bpf_ws_lock *bpf_wakeup_sources_read_lock(void) __ksym; +void bpf_wakeup_sources_read_unlock(struct bpf_ws_lock *lock) __ksym; +void *bpf_wakeup_sources_get_head(void) __ksym; + +SEC("syscall") +__success __retval(0) +int iterate_wakeupsources(void *ctx) +{ + struct list_head *head = bpf_wakeup_sources_get_head(); + struct list_head *pos = head; + struct bpf_ws_lock *lock; + int i; + + lock = bpf_wakeup_sources_read_lock(); + if (!lock) + return 0; + + bpf_for(i, 0, MAX_LOOP_ITER) { + if (bpf_core_read(&pos, sizeof(pos), &pos->next) || !pos || pos == head) + break; + + struct wakeup_event_t *e = bpf_ringbuf_reserve(&rb, sizeof(*e), 0); + + if (!e) + break; + + struct wakeup_source *ws = bpf_core_cast( + (void *)pos - bpf_core_field_offset(struct wakeup_source, entry), + struct wakeup_source); + s64 active_time = 0; + bool active = BPF_CORE_READ_BITFIELD(ws, active); + bool autosleep_enable = BPF_CORE_READ_BITFIELD(ws, autosleep_enabled); + s64 last_time = ws->last_time; + s64 max_time = 
ws->max_time; + s64 prevent_sleep_time = ws->prevent_sleep_time; + s64 total_time = ws->total_time; + + if (active) { + s64 curr_time = bpf_ktime_get_ns(); + s64 prevent_time = ws->start_prevent_time; + + if (curr_time > last_time) + active_time = curr_time - last_time; + + total_time += active_time; + if (active_time > max_time) + max_time = active_time; + if (autosleep_enable && curr_time > prevent_time) + prevent_sleep_time += curr_time - prevent_time; + } + + e->active_count = ws->active_count; + e->active_time_ns = active_time; + e->event_count = ws->event_count; + e->expire_count = ws->expire_count; + e->last_time_ns = last_time; + e->max_time_ns = max_time; + e->prevent_sleep_time_ns = prevent_sleep_time; + e->total_time_ns = total_time; + e->wakeup_count = ws->wakeup_count; + + if (bpf_probe_read_kernel_str( + e->name, WAKEUP_NAME_LEN, ws->name) < 0) + e->name[0] = '\0'; + + bpf_ringbuf_submit(e, 0); + } + + bpf_wakeup_sources_read_unlock(lock); + return 0; +} + +char _license[] SEC("license") = "GPL"; diff --git a/tools/testing/selftests/bpf/progs/test_xdp_meta.c b/tools/testing/selftests/bpf/progs/test_xdp_meta.c index fa73b17cb999..08b03be0b891 100644 --- a/tools/testing/selftests/bpf/progs/test_xdp_meta.c +++ b/tools/testing/selftests/bpf/progs/test_xdp_meta.c @@ -21,10 +21,6 @@ bool test_pass; -static const __u8 smac_want[ETH_ALEN] = { - 0x12, 0x34, 0xDE, 0xAD, 0xBE, 0xEF, -}; - static const __u8 meta_want[META_SIZE] = { 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, 0x18, @@ -32,11 +28,6 @@ static const __u8 meta_want[META_SIZE] = { 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37, 0x38, }; -static bool check_smac(const struct ethhdr *eth) -{ - return !__builtin_memcmp(eth->h_source, smac_want, ETH_ALEN); -} - static bool check_metadata(const char *file, int line, __u8 *meta_have) { if (!__builtin_memcmp(meta_have, meta_want, META_SIZE)) @@ -280,18 +271,47 @@ fail: return TC_ACT_SHOT; } +/* Test packets carry test 
metadata pattern as payload. */ +static bool is_test_packet_xdp(struct xdp_md *ctx) +{ + __u8 meta_have[META_SIZE]; + __u32 len; + + len = bpf_xdp_get_buff_len(ctx); + if (len < META_SIZE) + return false; + if (bpf_xdp_load_bytes(ctx, len - META_SIZE, meta_have, META_SIZE)) + return false; + if (__builtin_memcmp(meta_have, meta_want, META_SIZE)) + return false; + + return true; +} + +/* Test packets carry test metadata pattern as payload. */ +static bool is_test_packet_tc(struct __sk_buff *ctx) +{ + __u8 meta_have[META_SIZE]; + + if (ctx->len < META_SIZE) + return false; + if (bpf_skb_load_bytes(ctx, ctx->len - META_SIZE, meta_have, META_SIZE)) + return false; + if (__builtin_memcmp(meta_have, meta_want, META_SIZE)) + return false; + + return true; +} + /* Reserve and clear space for metadata but don't populate it */ SEC("xdp") int ing_xdp_zalloc_meta(struct xdp_md *ctx) { - struct ethhdr *eth = ctx_ptr(ctx, data); __u8 *meta; int ret; /* Drop any non-test packets */ - if (eth + 1 > ctx_ptr(ctx, data_end)) - return XDP_DROP; - if (!check_smac(eth)) + if (!is_test_packet_xdp(ctx)) return XDP_DROP; ret = bpf_xdp_adjust_meta(ctx, -META_SIZE); @@ -310,33 +330,24 @@ int ing_xdp_zalloc_meta(struct xdp_md *ctx) SEC("xdp") int ing_xdp(struct xdp_md *ctx) { - __u8 *data, *data_meta, *data_end, *payload; - struct ethhdr *eth; + __u8 *data, *data_meta; int ret; + /* Drop any non-test packets */ + if (!is_test_packet_xdp(ctx)) + return XDP_DROP; + ret = bpf_xdp_adjust_meta(ctx, -META_SIZE); if (ret < 0) return XDP_DROP; data_meta = ctx_ptr(ctx, data_meta); - data_end = ctx_ptr(ctx, data_end); data = ctx_ptr(ctx, data); - eth = (struct ethhdr *)data; - payload = data + sizeof(struct ethhdr); - - if (payload + META_SIZE > data_end || - data_meta + META_SIZE > data) + if (data_meta + META_SIZE > data) return XDP_DROP; - /* The Linux networking stack may send other packets on the test - * interface that interfere with the test. Just drop them. 
- * The test packets can be recognized by their source MAC address. - */ - if (!check_smac(eth)) - return XDP_DROP; - - __builtin_memcpy(data_meta, payload, META_SIZE); + __builtin_memcpy(data_meta, meta_want, META_SIZE); return XDP_PASS; } @@ -353,7 +364,7 @@ int clone_data_meta_survives_data_write(struct __sk_buff *ctx) if (eth + 1 > ctx_ptr(ctx, data_end)) goto out; /* Ignore non-test packets */ - if (!check_smac(eth)) + if (!is_test_packet_tc(ctx)) goto out; if (meta_have + META_SIZE > eth) @@ -383,7 +394,7 @@ int clone_data_meta_survives_meta_write(struct __sk_buff *ctx) if (eth + 1 > ctx_ptr(ctx, data_end)) goto out; /* Ignore non-test packets */ - if (!check_smac(eth)) + if (!is_test_packet_tc(ctx)) goto out; if (meta_have + META_SIZE > eth) @@ -416,7 +427,7 @@ int clone_meta_dynptr_survives_data_slice_write(struct __sk_buff *ctx) if (!eth) goto out; /* Ignore non-test packets */ - if (!check_smac(eth)) + if (!is_test_packet_tc(ctx)) goto out; bpf_dynptr_from_skb_meta(ctx, 0, &meta); @@ -436,16 +447,11 @@ out: SEC("tc") int clone_meta_dynptr_survives_meta_slice_write(struct __sk_buff *ctx) { - struct bpf_dynptr data, meta; - const struct ethhdr *eth; + struct bpf_dynptr meta; __u8 *meta_have; - bpf_dynptr_from_skb(ctx, 0, &data); - eth = bpf_dynptr_slice(&data, 0, NULL, sizeof(*eth)); - if (!eth) - goto out; /* Ignore non-test packets */ - if (!check_smac(eth)) + if (!is_test_packet_tc(ctx)) goto out; bpf_dynptr_from_skb_meta(ctx, 0, &meta); @@ -471,15 +477,10 @@ int clone_meta_dynptr_rw_before_data_dynptr_write(struct __sk_buff *ctx) { struct bpf_dynptr data, meta; __u8 meta_have[META_SIZE]; - const struct ethhdr *eth; int err; - bpf_dynptr_from_skb(ctx, 0, &data); - eth = bpf_dynptr_slice(&data, 0, NULL, sizeof(*eth)); - if (!eth) - goto out; /* Ignore non-test packets */ - if (!check_smac(eth)) + if (!is_test_packet_tc(ctx)) goto out; /* Expect read-write metadata before unclone */ @@ -492,6 +493,7 @@ int 
clone_meta_dynptr_rw_before_data_dynptr_write(struct __sk_buff *ctx) goto out; /* Helper write to payload will unclone the packet */ + bpf_dynptr_from_skb(ctx, 0, &data); bpf_dynptr_write(&data, offsetof(struct ethhdr, h_proto), "x", 1, 0); err = bpf_dynptr_read(meta_have, META_SIZE, &meta, 0, 0); @@ -511,17 +513,12 @@ out: SEC("tc") int clone_meta_dynptr_rw_before_meta_dynptr_write(struct __sk_buff *ctx) { - struct bpf_dynptr data, meta; + struct bpf_dynptr meta; __u8 meta_have[META_SIZE]; - const struct ethhdr *eth; int err; - bpf_dynptr_from_skb(ctx, 0, &data); - eth = bpf_dynptr_slice(&data, 0, NULL, sizeof(*eth)); - if (!eth) - goto out; /* Ignore non-test packets */ - if (!check_smac(eth)) + if (!is_test_packet_tc(ctx)) goto out; /* Expect read-write metadata before unclone */ @@ -545,6 +542,28 @@ out: return TC_ACT_SHOT; } +SEC("lwt_xmit") +int dummy_lwt_xmit(struct __sk_buff *ctx) +{ + if (bpf_skb_change_head(ctx, sizeof(struct ipv6hdr), 0)) + return BPF_DROP; + + return BPF_OK; +} + +SEC("tc") +int tc_is_meta_empty(struct __sk_buff *ctx) +{ + if (!is_test_packet_tc(ctx)) + return TC_ACT_OK; + + if (ctx->data_meta != ctx->data) + return TC_ACT_OK; + + test_pass = true; + return TC_ACT_OK; +} + SEC("tc") int helper_skb_vlan_push_pop(struct __sk_buff *ctx) { diff --git a/tools/testing/selftests/bpf/progs/tracing_multi_attach.c b/tools/testing/selftests/bpf/progs/tracing_multi_attach.c new file mode 100644 index 000000000000..332d0a423a43 --- /dev/null +++ b/tools/testing/selftests/bpf/progs/tracing_multi_attach.c @@ -0,0 +1,39 @@ +// SPDX-License-Identifier: GPL-2.0 +#include <vmlinux.h> +#include <bpf/bpf_helpers.h> +#include <bpf/bpf_tracing.h> + +char _license[] SEC("license") = "GPL"; + +__hidden extern int tracing_multi_arg_check(__u64 *ctx, __u64 *test_result, bool is_return); + +__u64 test_result_fentry = 0; +__u64 test_result_fexit = 0; + +SEC("fentry.multi/bpf_fentry_test*") +int BPF_PROG(test_fentry) +{ + tracing_multi_arg_check(ctx, 
&test_result_fentry, false); + return 0; +} + +SEC("fexit.multi/bpf_fentry_test*") +int BPF_PROG(test_fexit) +{ + tracing_multi_arg_check(ctx, &test_result_fexit, true); + return 0; +} + +SEC("fentry.multi.s/bpf_fentry_test1") +int BPF_PROG(test_fentry_s) +{ + tracing_multi_arg_check(ctx, &test_result_fentry, false); + return 0; +} + +SEC("fexit.multi.s/bpf_fentry_test1") +int BPF_PROG(test_fexit_s) +{ + tracing_multi_arg_check(ctx, &test_result_fexit, true); + return 0; +} diff --git a/tools/testing/selftests/bpf/progs/tracing_multi_attach_module.c b/tools/testing/selftests/bpf/progs/tracing_multi_attach_module.c new file mode 100644 index 000000000000..b3374f2db450 --- /dev/null +++ b/tools/testing/selftests/bpf/progs/tracing_multi_attach_module.c @@ -0,0 +1,25 @@ +// SPDX-License-Identifier: GPL-2.0 +#include <vmlinux.h> +#include <bpf/bpf_helpers.h> +#include <bpf/bpf_tracing.h> + +char _license[] SEC("license") = "GPL"; + +__hidden extern int tracing_multi_arg_check(__u64 *ctx, __u64 *test_result, bool is_return); + +__u64 test_result_fentry = 0; +__u64 test_result_fexit = 0; + +SEC("fentry.multi/bpf_testmod:bpf_testmod_fentry_test*") +int BPF_PROG(test_fentry) +{ + tracing_multi_arg_check(ctx, &test_result_fentry, false); + return 0; +} + +SEC("fexit.multi/bpf_testmod:bpf_testmod_fentry_test*") +int BPF_PROG(test_fexit) +{ + tracing_multi_arg_check(ctx, &test_result_fexit, true); + return 0; +} diff --git a/tools/testing/selftests/bpf/progs/tracing_multi_bench.c b/tools/testing/selftests/bpf/progs/tracing_multi_bench.c new file mode 100644 index 000000000000..beae946cb8c4 --- /dev/null +++ b/tools/testing/selftests/bpf/progs/tracing_multi_bench.c @@ -0,0 +1,12 @@ +// SPDX-License-Identifier: GPL-2.0 +#include <vmlinux.h> +#include <bpf/bpf_helpers.h> +#include <bpf/bpf_tracing.h> + +char _license[] SEC("license") = "GPL"; + +SEC("fentry.multi") +int BPF_PROG(bench) +{ + return 0; +} diff --git a/tools/testing/selftests/bpf/progs/tracing_multi_check.c 
b/tools/testing/selftests/bpf/progs/tracing_multi_check.c new file mode 100644 index 000000000000..b2959ba71179 --- /dev/null +++ b/tools/testing/selftests/bpf/progs/tracing_multi_check.c @@ -0,0 +1,214 @@ +// SPDX-License-Identifier: GPL-2.0 +#include <vmlinux.h> +#include <bpf/bpf_helpers.h> +#include <bpf/bpf_tracing.h> + +char _license[] SEC("license") = "GPL"; + +int pid = 0; +bool test_cookies = false; + +/* bpf_fentry_test1 is exported as kfunc via vmlinux.h */ +extern const void bpf_fentry_test2 __ksym; +extern const void bpf_fentry_test3 __ksym; +extern const void bpf_fentry_test4 __ksym; +extern const void bpf_fentry_test5 __ksym; +extern const void bpf_fentry_test6 __ksym; +extern const void bpf_fentry_test7 __ksym; +extern const void bpf_fentry_test8 __ksym; +extern const void bpf_fentry_test9 __ksym; +extern const void bpf_fentry_test10 __ksym; + +extern const void bpf_testmod_fentry_test1 __ksym; +extern const void bpf_testmod_fentry_test2 __ksym; +extern const void bpf_testmod_fentry_test3 __ksym; +extern const void bpf_testmod_fentry_test7 __ksym; +extern const void bpf_testmod_fentry_test11 __ksym; + +int tracing_multi_arg_check(__u64 *ctx, __u64 *test_result, bool is_return) +{ + void *ip = (void *) bpf_get_func_ip(ctx); + __u64 value = 0, ret = 0, cookie = 0; + long err = 0; + + if (bpf_get_current_pid_tgid() >> 32 != pid) + return 1; + + if (is_return) + err |= bpf_get_func_ret(ctx, &ret); + if (test_cookies) + cookie = bpf_get_attach_cookie(ctx); + + if (ip == &bpf_fentry_test1) { + int a; + + err |= bpf_get_func_arg(ctx, 0, &value); + a = (int) value; + + err |= is_return ? ret != 2 : 0; + err |= test_cookies ? cookie != 8 : 0; + + *test_result += err == 0 && a == 1; + } else if (ip == &bpf_fentry_test2) { + __u64 b; + int a; + + err |= bpf_get_func_arg(ctx, 0, &value); + a = (int) value; + err |= bpf_get_func_arg(ctx, 1, &value); + b = value; + + err |= is_return ? ret != 5 : 0; + err |= test_cookies ? 
cookie != 9 : 0; + + *test_result += err == 0 && a == 2 && b == 3; + } else if (ip == &bpf_fentry_test3) { + __u64 c; + char a; + int b; + + err |= bpf_get_func_arg(ctx, 0, &value); + a = (char) value; + err |= bpf_get_func_arg(ctx, 1, &value); + b = (int) value; + err |= bpf_get_func_arg(ctx, 2, &value); + c = value; + + err |= is_return ? ret != 15 : 0; + err |= test_cookies ? cookie != 7 : 0; + + *test_result += err == 0 && a == 4 && b == 5 && c == 6; + } else if (ip == &bpf_fentry_test4) { + void *a; + char b; + int c; + __u64 d; + + err |= bpf_get_func_arg(ctx, 0, &value); + a = (void *) value; + err |= bpf_get_func_arg(ctx, 1, &value); + b = (char) value; + err |= bpf_get_func_arg(ctx, 2, &value); + c = (int) value; + err |= bpf_get_func_arg(ctx, 3, &value); + d = value; + + err |= is_return ? ret != 34 : 0; + err |= test_cookies ? cookie != 5 : 0; + + *test_result += err == 0 && a == (void *) 7 && b == 8 && c == 9 && d == 10; + } else if (ip == &bpf_fentry_test5) { + __u64 a; + void *b; + short c; + int d; + __u64 e; + + err |= bpf_get_func_arg(ctx, 0, &value); + a = value; + err |= bpf_get_func_arg(ctx, 1, &value); + b = (void *) value; + err |= bpf_get_func_arg(ctx, 2, &value); + c = (short) value; + err |= bpf_get_func_arg(ctx, 3, &value); + d = (int) value; + err |= bpf_get_func_arg(ctx, 4, &value); + e = value; + + err |= is_return ? ret != 65 : 0; + err |= test_cookies ? 
cookie != 4 : 0; + + *test_result += err == 0 && a == 11 && b == (void *) 12 && c == 13 && d == 14 && e == 15; + } else if (ip == &bpf_fentry_test6) { + __u64 a; + void *b; + short c; + int d; + void *e; + __u64 f; + + err |= bpf_get_func_arg(ctx, 0, &value); + a = value; + err |= bpf_get_func_arg(ctx, 1, &value); + b = (void *) value; + err |= bpf_get_func_arg(ctx, 2, &value); + c = (short) value; + err |= bpf_get_func_arg(ctx, 3, &value); + d = (int) value; + err |= bpf_get_func_arg(ctx, 4, &value); + e = (void *) value; + err |= bpf_get_func_arg(ctx, 5, &value); + f = value; + + err |= is_return ? ret != 111 : 0; + err |= test_cookies ? cookie != 2 : 0; + + *test_result += err == 0 && a == 16 && b == (void *) 17 && c == 18 && d == 19 && e == (void *) 20 && f == 21; + } else if (ip == &bpf_fentry_test7) { + err |= is_return ? ret != 0 : 0; + err |= test_cookies ? cookie != 3 : 0; + + *test_result += err == 0 ? 1 : 0; + } else if (ip == &bpf_fentry_test8) { + err |= is_return ? ret != 0 : 0; + err |= test_cookies ? cookie != 1 : 0; + + *test_result += err == 0 ? 1 : 0; + } else if (ip == &bpf_fentry_test9) { + err |= is_return ? ret != 0 : 0; + err |= test_cookies ? cookie != 10 : 0; + + *test_result += err == 0 ? 1 : 0; + } else if (ip == &bpf_fentry_test10) { + err |= is_return ? ret != 0 : 0; + err |= test_cookies ? cookie != 6 : 0; + + *test_result += err == 0 ? 1 : 0; + } else if (ip == &bpf_testmod_fentry_test1) { + int a; + + err |= bpf_get_func_arg(ctx, 0, &value); + a = (int) value; + + err |= is_return ? ret != 2 : 0; + + *test_result += err == 0 && a == 1; + } else if (ip == &bpf_testmod_fentry_test2) { + int a; + __u64 b; + + err |= bpf_get_func_arg(ctx, 0, &value); + a = (int) value; + err |= bpf_get_func_arg(ctx, 1, &value); + b = (__u64) value; + + err |= is_return ? 
ret != 5 : 0; + + *test_result += err == 0 && a == 2 && b == 3; + } else if (ip == &bpf_testmod_fentry_test3) { + char a; + int b; + __u64 c; + + err |= bpf_get_func_arg(ctx, 0, &value); + a = (char) value; + err |= bpf_get_func_arg(ctx, 1, &value); + b = (int) value; + err |= bpf_get_func_arg(ctx, 2, &value); + c = (__u64) value; + + err |= is_return ? ret != 15 : 0; + + *test_result += err == 0 && a == 4 && b == 5 && c == 6; + } else if (ip == &bpf_testmod_fentry_test7) { + err |= is_return ? ret != 133 : 0; + + *test_result += err == 0; + } else if (ip == &bpf_testmod_fentry_test11) { + err |= is_return ? ret != 231 : 0; + + *test_result += err == 0; + } + + return 0; +} diff --git a/tools/testing/selftests/bpf/progs/tracing_multi_fail.c b/tools/testing/selftests/bpf/progs/tracing_multi_fail.c new file mode 100644 index 000000000000..7f0375f4213d --- /dev/null +++ b/tools/testing/selftests/bpf/progs/tracing_multi_fail.c @@ -0,0 +1,18 @@ +// SPDX-License-Identifier: GPL-2.0 +#include <vmlinux.h> +#include <bpf/bpf_helpers.h> +#include <bpf/bpf_tracing.h> + +char _license[] SEC("license") = "GPL"; + +SEC("fentry.multi") +int BPF_PROG(test_fentry) +{ + return 0; +} + +SEC("fentry.multi.s") +int BPF_PROG(test_fentry_s) +{ + return 0; +} diff --git a/tools/testing/selftests/bpf/progs/tracing_multi_intersect_attach.c b/tools/testing/selftests/bpf/progs/tracing_multi_intersect_attach.c new file mode 100644 index 000000000000..cd5be0bb6ffd --- /dev/null +++ b/tools/testing/selftests/bpf/progs/tracing_multi_intersect_attach.c @@ -0,0 +1,41 @@ +// SPDX-License-Identifier: GPL-2.0 +#include <vmlinux.h> +#include <bpf/bpf_helpers.h> +#include <bpf/bpf_tracing.h> + +char _license[] SEC("license") = "GPL"; + +__hidden extern int tracing_multi_arg_check(__u64 *ctx, __u64 *test_result, bool is_return); + +__u64 test_result_fentry_1 = 0; +__u64 test_result_fentry_2 = 0; +__u64 test_result_fexit_1 = 0; +__u64 test_result_fexit_2 = 0; + +SEC("fentry.multi") +int BPF_PROG(fentry_1) 
+{ + tracing_multi_arg_check(ctx, &test_result_fentry_1, false); + return 0; +} + +SEC("fentry.multi") +int BPF_PROG(fentry_2) +{ + tracing_multi_arg_check(ctx, &test_result_fentry_2, false); + return 0; +} + +SEC("fexit.multi") +int BPF_PROG(fexit_1) +{ + tracing_multi_arg_check(ctx, &test_result_fexit_1, true); + return 0; +} + +SEC("fexit.multi") +int BPF_PROG(fexit_2) +{ + tracing_multi_arg_check(ctx, &test_result_fexit_2, true); + return 0; +} diff --git a/tools/testing/selftests/bpf/progs/tracing_multi_rollback.c b/tools/testing/selftests/bpf/progs/tracing_multi_rollback.c new file mode 100644 index 000000000000..a49d1d841f3a --- /dev/null +++ b/tools/testing/selftests/bpf/progs/tracing_multi_rollback.c @@ -0,0 +1,43 @@ +// SPDX-License-Identifier: GPL-2.0 +#include <vmlinux.h> +#include <bpf/bpf_helpers.h> +#include <bpf/bpf_tracing.h> + +char _license[] SEC("license") = "GPL"; + +int pid = 0; + +__u64 test_result_fentry = 0; +__u64 test_result_fexit = 0; + +SEC("?fentry.multi") +int BPF_PROG(test_fentry) +{ + if (bpf_get_current_pid_tgid() >> 32 != pid) + return 0; + + test_result_fentry++; + return 0; +} + +SEC("?fexit.multi") +int BPF_PROG(test_fexit) +{ + if (bpf_get_current_pid_tgid() >> 32 != pid) + return 0; + + test_result_fexit++; + return 0; +} + +SEC("?fentry/bpf_fentry_test1") +int BPF_PROG(extra) +{ + return 0; +} + +SEC("?fentry/bpf_fentry_test10") +int BPF_PROG(filler) +{ + return 0; +} diff --git a/tools/testing/selftests/bpf/progs/tracing_multi_session_attach.c b/tools/testing/selftests/bpf/progs/tracing_multi_session_attach.c new file mode 100644 index 000000000000..7c9a46016ccd --- /dev/null +++ b/tools/testing/selftests/bpf/progs/tracing_multi_session_attach.c @@ -0,0 +1,65 @@ +// SPDX-License-Identifier: GPL-2.0 +#include <vmlinux.h> +#include <bpf/bpf_helpers.h> +#include <bpf/bpf_tracing.h> + +char _license[] SEC("license") = "GPL"; + +__hidden extern int tracing_multi_arg_check(__u64 *ctx, __u64 *test_result, bool is_return); + +__u64 
test_result_fentry = 0; +__u64 test_result_fexit = 0; + +SEC("fsession.multi/bpf_fentry_test*") +int BPF_PROG(test_session_1) +{ + volatile __u64 *cookie = bpf_session_cookie(ctx); + + if (bpf_session_is_return(ctx)) { + if (tracing_multi_arg_check(ctx, &test_result_fexit, true)) + return 0; + /* extra count for test_result_fexit cookie */ + test_result_fexit += *cookie == 0xbeafbeafbeafbeaf; + } else { + if (tracing_multi_arg_check(ctx, &test_result_fentry, false)) + return 0; + *cookie = 0xbeafbeafbeafbeaf; + } + return 0; +} + +SEC("fsession.multi.s/bpf_fentry_test1") +int BPF_PROG(test_fsession_s) +{ + volatile __u64 *cookie = bpf_session_cookie(ctx); + + if (bpf_session_is_return(ctx)) { + if (tracing_multi_arg_check(ctx, &test_result_fexit, true)) + return 0; + /* extra count for test_result_fexit cookie */ + test_result_fexit += *cookie == 0xbeafbeafbeafbeaf; + } else { + if (tracing_multi_arg_check(ctx, &test_result_fentry, false)) + return 0; + *cookie = 0xbeafbeafbeafbeaf; + } + return 0; +} + +SEC("fsession.multi/bpf_testmod:bpf_testmod_fentry_test*") +int BPF_PROG(test_session_2) +{ + volatile __u64 *cookie = bpf_session_cookie(ctx); + + if (bpf_session_is_return(ctx)) { + if (tracing_multi_arg_check(ctx, &test_result_fexit, true)) + return 0; + /* extra count for test_result_fexit cookie */ + test_result_fexit += *cookie == 0xbeafbeafbeafbeaf; + } else { + if (tracing_multi_arg_check(ctx, &test_result_fentry, false)) + return 0; + *cookie = 0xbeafbeafbeafbeaf; + } + return 0; +} diff --git a/tools/testing/selftests/bpf/progs/tracing_multi_verifier.c b/tools/testing/selftests/bpf/progs/tracing_multi_verifier.c new file mode 100644 index 000000000000..7b6ed41bf452 --- /dev/null +++ b/tools/testing/selftests/bpf/progs/tracing_multi_verifier.c @@ -0,0 +1,31 @@ +// SPDX-License-Identifier: GPL-2.0 +#include "vmlinux.h" +#include <bpf/bpf_helpers.h> +#include <bpf/bpf_tracing.h> +#include "bpf_misc.h" + +char _license[] SEC("license") = "GPL"; + 
+SEC("fentry.multi/bpf_fentry_test1") +__failure +__msg("func 'bpf_multi_func' doesn't have 1-th argument") +int BPF_PROG(fentry_direct_access, int a) +{ + return a; +} + +SEC("fexit.multi/bpf_fentry_test3") +__failure +__msg("invalid bpf_context access off=24 size=8") +int BPF_PROG(fexit_direct_access, char a, int b, __u64 c, int ret) +{ + return ret; +} + +SEC("fsession.multi/bpf_fentry_test4") +__failure +__msg("invalid bpf_context access off=16 size=8") +int BPF_PROG(fsession_direct_access, void *a, char b, int c, __u64 d, int ret) +{ + return c; +} diff --git a/tools/testing/selftests/bpf/progs/user_ringbuf_fail.c b/tools/testing/selftests/bpf/progs/user_ringbuf_fail.c index 54de0389f878..c0d0422b8030 100644 --- a/tools/testing/selftests/bpf/progs/user_ringbuf_fail.c +++ b/tools/testing/selftests/bpf/progs/user_ringbuf_fail.c @@ -146,7 +146,7 @@ try_discard_dynptr(struct bpf_dynptr *dynptr, void *context) * not be able to read past the end of the pointer. */ SEC("?raw_tp") -__failure __msg("cannot release unowned const bpf_dynptr") +__failure __msg("CONST_PTR_TO_DYNPTR cannot be released") int user_ringbuf_callback_discard_dynptr(void *ctx) { bpf_user_ringbuf_drain(&user_ringbuf, try_discard_dynptr, NULL, 0); @@ -166,7 +166,7 @@ try_submit_dynptr(struct bpf_dynptr *dynptr, void *context) * not be able to read past the end of the pointer. 
*/ SEC("?raw_tp") -__failure __msg("cannot release unowned const bpf_dynptr") +__failure __msg("CONST_PTR_TO_DYNPTR cannot be released") int user_ringbuf_callback_submit_dynptr(void *ctx) { bpf_user_ringbuf_drain(&user_ringbuf, try_submit_dynptr, NULL, 0); diff --git a/tools/testing/selftests/bpf/progs/verifier_arena.c b/tools/testing/selftests/bpf/progs/verifier_arena.c index 62e282f4448a..df0e22d1a29b 100644 --- a/tools/testing/selftests/bpf/progs/verifier_arena.c +++ b/tools/testing/selftests/bpf/progs/verifier_arena.c @@ -8,7 +8,7 @@ #include <bpf/bpf_tracing.h> #include "bpf_misc.h" #include "bpf_experimental.h" -#include "bpf_arena_common.h" +#include <bpf_arena_common.h> #define private(name) SEC(".bss." #name) __hidden __attribute__((aligned(8))) @@ -607,4 +607,71 @@ int non_arena_ptr_add_to_arena_ptr(void *ctx) #endif +static __noinline +u32 __arena *check_arena_arg_nonglobal(u32 __arena *arg) +{ + volatile u32 val = *arg; + + *arg = val + 1; + + return arg; +} + +__weak +u32 __arena *check_arena_arg_global(u32 __arena *arg) +{ + volatile u32 val = *arg; + + *arg = val + 1; + + return arg; +} + +__weak +u32 volatile __arena *check_arena_arg_quals1(u32 volatile __arena *arg1, u32 __arena volatile *arg2) +{ + *arg1 = *arg1 + 1; + *arg2 = *arg1 + 1; + + return arg2; +} + +__weak +u32 __arena volatile *check_arena_arg_quals2(u32 volatile __arena *arg1, u32 __arena volatile *arg2) +{ + *arg1 = *arg1 + 1; + *arg2 = *arg2 + 1; + + return arg2; +} + +SEC("syscall") +__success __retval(0) +int check_arena_arg_ret(void *ctx) +{ + u32 __arena *page = bpf_arena_alloc_pages(&arena, NULL, 1, NUMA_NO_NODE, 0); + u32 __arena *arg = page; + u32 __arena volatile *arg1; + u32 __arena volatile *ret1; + u32 volatile __arena *arg2; + u32 volatile __arena *ret2; + + if (!arg) + return 1; + + /* Make sure we use {arg, ret}{1, 2}. 
*/ + + arg = check_arena_arg_nonglobal(page); + arg = check_arena_arg_global(arg); + + arg1 = arg2 = page; + ret1 = check_arena_arg_quals1(arg1, arg2); + ret2 = check_arena_arg_quals2(arg1, arg2); + + if (!(*ret1 ||*ret2)) + return -EINVAL; + + return 0; +} + char _license[] SEC("license") = "GPL"; diff --git a/tools/testing/selftests/bpf/progs/verifier_arena_globals1.c b/tools/testing/selftests/bpf/progs/verifier_arena_globals1.c index 83182ddbfb95..45d364b0bc85 100644 --- a/tools/testing/selftests/bpf/progs/verifier_arena_globals1.c +++ b/tools/testing/selftests/bpf/progs/verifier_arena_globals1.c @@ -6,7 +6,7 @@ #include <bpf/bpf_helpers.h> #include <bpf/bpf_tracing.h> #include "bpf_experimental.h" -#include "bpf_arena_common.h" +#include <bpf_arena_common.h> #include "bpf_misc.h" #define ARENA_PAGES (1UL<< (32 - __builtin_ffs(__PAGE_SIZE) + 1)) diff --git a/tools/testing/selftests/bpf/progs/verifier_arena_globals2.c b/tools/testing/selftests/bpf/progs/verifier_arena_globals2.c index e6bd7b61f9f1..b51594dbc005 100644 --- a/tools/testing/selftests/bpf/progs/verifier_arena_globals2.c +++ b/tools/testing/selftests/bpf/progs/verifier_arena_globals2.c @@ -7,7 +7,7 @@ #include <bpf/bpf_tracing.h> #include "bpf_misc.h" #include "bpf_experimental.h" -#include "bpf_arena_common.h" +#include <bpf_arena_common.h> #define ARENA_PAGES (32) diff --git a/tools/testing/selftests/bpf/progs/verifier_arena_large.c b/tools/testing/selftests/bpf/progs/verifier_arena_large.c index 5f7e7afee169..6ab8730d4878 100644 --- a/tools/testing/selftests/bpf/progs/verifier_arena_large.c +++ b/tools/testing/selftests/bpf/progs/verifier_arena_large.c @@ -7,7 +7,7 @@ #include <bpf/bpf_tracing.h> #include "bpf_misc.h" #include "bpf_experimental.h" -#include "bpf_arena_common.h" +#include <bpf_arena_common.h> #define ARENA_SIZE (1ull << 32) diff --git a/tools/testing/selftests/bpf/progs/verifier_bits_iter.c b/tools/testing/selftests/bpf/progs/verifier_bits_iter.c index 8bcddadfc4da..dd97f2027505 
100644 --- a/tools/testing/selftests/bpf/progs/verifier_bits_iter.c +++ b/tools/testing/selftests/bpf/progs/verifier_bits_iter.c @@ -32,7 +32,7 @@ int BPF_PROG(no_destroy, struct bpf_iter_meta *meta, struct cgroup *cgrp) SEC("iter/cgroup") __description("uninitialized iter in ->next()") -__failure __msg("expected an initialized iter_bits as arg #0") +__failure __msg("expected an initialized iter_bits as R1") int BPF_PROG(next_uninit, struct bpf_iter_meta *meta, struct cgroup *cgrp) { struct bpf_iter_bits it = {}; @@ -43,7 +43,7 @@ int BPF_PROG(next_uninit, struct bpf_iter_meta *meta, struct cgroup *cgrp) SEC("iter/cgroup") __description("uninitialized iter in ->destroy()") -__failure __msg("expected an initialized iter_bits as arg #0") +__failure __msg("expected an initialized iter_bits as R1") int BPF_PROG(destroy_uninit, struct bpf_iter_meta *meta, struct cgroup *cgrp) { struct bpf_iter_bits it = {}; diff --git a/tools/testing/selftests/bpf/progs/verifier_bounds.c b/tools/testing/selftests/bpf/progs/verifier_bounds.c index c1ae013dee29..bc038ac2df98 100644 --- a/tools/testing/selftests/bpf/progs/verifier_bounds.c +++ b/tools/testing/selftests/bpf/progs/verifier_bounds.c @@ -1239,7 +1239,8 @@ l0_%=: r0 = 0; \ SEC("tc") __description("multiply mixed sign bounds. 
test 1") __success __log_level(2) -__msg("r6 *= r7 {{.*}}; R6=scalar(smin=umin=0x1bc16d5cd4927ee1,smax=umax=0x1bc16d674ec80000,smax32=0x7ffffeff,umax32=0xfffffeff,var_off=(0x1bc16d4000000000; 0x3ffffffeff))") +__msg("r6 *= r7 {{.*}}; R6=scalar(smin=umin=0x1bc16d5cd4927ee1,smax=umax=0x1bc16d674ec80000,smax32=0x7ffffeff,var_off=(0x1bc16d4000000000; 0x3ffffffeff))") +/* cnum can't represent both [0, 0xffff_feff] and [0x8000_0000, 0x7fff_feff], so it picks one */ __naked void mult_mixed0_sign(void) { asm volatile ( @@ -1648,7 +1649,8 @@ l0_%=: r0 = 0; \ SEC("socket") __description("bounds deduction cross sign boundary, two overlaps") __failure -__msg("3: (2d) if r0 > r1 {{.*}} R0=scalar(smin=smin32=-128,smax=smax32=127,umax=0xffffffffffffff80)") +__msg("3: (2d) if r0 > r1 {{.*}} R0=scalar(smin=smin32=-128,smax=smax32=127)") +/* smin=-128 includes point 0xffffffffffffff80 */ __msg("frame pointer is read only") __naked void bounds_deduct_two_overlaps(void) { @@ -1890,25 +1892,25 @@ __naked void bounds_refinement_tnum_umax(void *ctx) /* This test covers the bounds deduction when the u64 range and the tnum * overlap only at umin. After instruction 3, the ranges look as follows: * - * 0 umin=0xe00 umax=0xeff U64_MAX + * 0 umin=0xe1 umax=0xf0 U64_MAX * | [xxxxxxxxxxxxxx] | * |----------------------------|------------------------------| * | x x | tnum values * - * The verifier can therefore deduce that the R0=0xe0=224. + * The verifier can therefore deduce that the R0=0xe1=225. 
*/ SEC("socket") __description("bounds refinement with single-value tnum on umin") -__msg("3: (15) if r0 == 0xf0 {{.*}} R0=224") +__msg("3: (15) if r0 == 0xf1 {{.*}} R0=225") __success __log_level(2) __naked void bounds_refinement_tnum_umin(void *ctx) { asm volatile(" \ call %[bpf_get_prandom_u32]; \ - r0 |= 0xe0; \ - r0 &= 0xf0; \ - if r0 == 0xf0 goto +2; \ - if r0 == 0xe0 goto +1; \ + r0 |= 0xe1; \ + r0 &= 0xf1; \ + if r0 == 0xf1 goto +2; \ + if r0 == 0xe1 goto +1; \ r10 = 0; \ exit; \ " : @@ -2043,7 +2045,8 @@ __naked void signed_unsigned_intersection32_case2(void *ctx) */ SEC("socket") __description("bounds refinement: 64bits ranges not overwritten by 32bits ranges") -__msg("3: (65) if r0 s> 0x2 {{.*}} R0=scalar(smin=0x8000000000000002,smax=2,umin=smin32=umin32=2,umax=0xffffffff00000003,smax32=umax32=3") +__msg("3: (65) if r0 s> 0x2 {{.*}} R0=scalar(smin=0x8000000000000002,smax=2,smin32=umin32=2,smax32=umax32=3,var_off{{.*}}))") +/* Can't represent both [S64_MIN+2, 2] and [2, U64_MAX - U32_MAX + 2] at the same time, picks shorter interval */ __msg("4: (25) if r0 > 0x13 {{.*}} R0=2") __success __log_level(2) __naked void refinement_32bounds_not_overwriting_64bounds(void *ctx) @@ -2184,4 +2187,111 @@ __naked void tnums_equal_impossible_constant(void *ctx) : __clobber_all); } +/* + * 32-bit range starts before 64-bit range low bits in each 2^32 block. 
+ * + * N*2^32 (N+1)*2^32 (N+2)*2^32 (N+3)*2^32 + * ||----|=====|--|----------||----|=====|-------------||--|-|=====|-------------|| + * |< b >| | |< b >| | |< b >| + * | | | | + * |<---------------+- a -+---------------->| + * | | + * |< t >| refined r0 range + * + * a = u64 [0x1'00000008, 0x3'00000001] + * b = u32 [2, 5] + * t = u64 [0x2'00000002, 0x2'00000005] + */ +SEC("socket") +__success +__flag(BPF_F_TEST_REG_INVARIANTS) +__naked void deduce64_from_32_before_block_start(void) +{ + asm volatile (" \ + call %[bpf_get_prandom_u32]; \ + r1 = 0x100000008 ll; \ + if r0 < r1 goto 2f; \ + r1 = 0x300000001 ll; \ + if r0 > r1 goto 2f; /* u64: [0x1'00000008, 0x3'00000001] */ \ + if w0 < 2 goto 2f; \ + if w0 > 5 goto 2f; /* u32: [2, 5] */ \ + r2 = 0x200000002 ll; \ + r3 = 0x200000005 ll; \ + if r0 >= r2 goto 1f; /* should be always true */ \ + r10 = 0; /* dead code */ \ +1: if r0 <= r3 goto 2f; /* should be always true */ \ + r10 = 0; /* dead code */ \ +2: exit; \ + " + :: __imm(bpf_get_prandom_u32) + : __clobber_all); +} + +/* + * 32-bit range crossing U32_MAX / 0 boundary. 
+ * + * N*2^32 (N+1)*2^32 (N+2)*2^32 (N+3)*2^32 + * ||===|---------|------|===||===|----------------|===||===|---------|------|===|| + * |b >| | |< b||b >| |< b||b >| | |< b| + * | | | | + * |<-----+----------------- a --------------+-------->| + * | | + * |<---------------- t ------------->| refined r0 range + * + * a = u64 [0x1'00000006, 0x2'FFFFFFEF] + * b = s32 [-16, 5] (u32 wrapping [0xFFFFFFF0, 0x00000005]) + * t = u64 [0x1'FFFFFFF0, 0x2'00000005] + */ +SEC("socket") +__success +__flag(BPF_F_TEST_REG_INVARIANTS) +__naked void deduce64_from_32_wrapping_32bit(void) +{ + asm volatile (" \ + call %[bpf_get_prandom_u32]; \ + r1 = 0x100000006 ll; \ + if r0 < r1 goto 2f; \ + r1 = 0x2ffffffef ll; \ + if r0 > r1 goto 2f; /* u64: [0x1'00000006, 0x2'FFFFFFEF] */ \ + if w0 s< -16 goto 2f; \ + if w0 s> 5 goto 2f; /* s32: [-16, 5] */ \ + r1 = 0x1fffffff0 ll; \ + r2 = 0x200000005 ll; \ + if r0 >= r1 goto 1f; /* should be always true */ \ + r10 = 0; /* dead code */ \ +1: if r0 <= r2 goto 2f; /* should be always true */ \ + r10 = 0; /* dead code */ \ +2: exit; \ + " + :: __imm(bpf_get_prandom_u32) + : __clobber_all); +} + +/* Check that range_within() compares cnum ranges, not min/max projections. 
*/ +SEC("socket") +__failure __msg("div by zero") +__flag(BPF_F_TEST_STATE_FREQ) +__naked void range_within_cnum_cross_both_boundaries(void) +{ + asm volatile (" \ + call %[bpf_get_prandom_u32]; \ + r1 = 0x80000020; \ + if r0 > r1 goto 1f; \ + r0 += 0x7FFFFFF0; /* PATH 1 */ \ + goto 2f; \ +1: call %[bpf_get_prandom_u32]; /* PATH 2 */ \ + if r0 < 0x100 goto 3f; \ + if r0 > 0x200 goto 3f; \ +2: /* PATH 1: r0 ∈ [0x7FFFFFF0, U32_MAX] ∪ [0, 0x10] */ \ + /* PATH 2: r0 ∈ [0x100, 0x200] */ \ + if r0 != 0x100 goto 3f; /* True only on PATH 2 */ \ + r0 /= 0; \ +3: exit; \ + " + :: __imm(bpf_map_lookup_elem), + __imm_addr(map_hash_8b), + __imm(bpf_get_prandom_u32) + : __clobber_all); +} + char _license[] SEC("license") = "GPL"; diff --git a/tools/testing/selftests/bpf/progs/verifier_bpf_fastcall.c b/tools/testing/selftests/bpf/progs/verifier_bpf_fastcall.c index fb4fa465d67c..8d7ff38e4c06 100644 --- a/tools/testing/selftests/bpf/progs/verifier_bpf_fastcall.c +++ b/tools/testing/selftests/bpf/progs/verifier_bpf_fastcall.c @@ -630,13 +630,13 @@ __xlated("...") __xlated("4: r0 = &(void __percpu *)(r0)") __xlated("...") /* may_goto expansion starts */ -__xlated("6: r11 = *(u64 *)(r10 -24)") -__xlated("7: if r11 == 0x0 goto pc+6") -__xlated("8: r11 -= 1") -__xlated("9: if r11 != 0x0 goto pc+2") -__xlated("10: r11 = -24") +__xlated("6: r12 = *(u64 *)(r10 -24)") +__xlated("7: if r12 == 0x0 goto pc+6") +__xlated("8: r12 -= 1") +__xlated("9: if r12 != 0x0 goto pc+2") +__xlated("10: r12 = -24") __xlated("11: call unknown") -__xlated("12: *(u64 *)(r10 -24) = r11") +__xlated("12: *(u64 *)(r10 -24) = r12") /* may_goto expansion ends */ __xlated("13: *(u64 *)(r10 -8) = r1") __xlated("14: exit") @@ -668,13 +668,13 @@ __xlated("1: *(u64 *)(r10 -16) =") __xlated("2: r1 = 1") __xlated("3: call bpf_get_smp_processor_id") /* may_goto expansion starts */ -__xlated("4: r11 = *(u64 *)(r10 -24)") -__xlated("5: if r11 == 0x0 goto pc+6") -__xlated("6: r11 -= 1") -__xlated("7: if r11 != 0x0 goto pc+2") 
-__xlated("8: r11 = -24") +__xlated("4: r12 = *(u64 *)(r10 -24)") +__xlated("5: if r12 == 0x0 goto pc+6") +__xlated("6: r12 -= 1") +__xlated("7: if r12 != 0x0 goto pc+2") +__xlated("8: r12 = -24") __xlated("9: call unknown") -__xlated("10: *(u64 *)(r10 -24) = r11") +__xlated("10: *(u64 *)(r10 -24) = r12") /* may_goto expansion ends */ __xlated("11: *(u64 *)(r10 -8) = r1") __xlated("12: exit") @@ -799,8 +799,7 @@ __naked int bpf_loop_interaction2(void) SEC("raw_tp") __arch_x86_64 -__log_level(4) -__msg("stack depth 512+0") +__log_level(4) __msg("stack depth 512+0 max 512") /* just to print xlated version when debugging */ __xlated("r0 = &(void __percpu *)(r0)") __success diff --git a/tools/testing/selftests/bpf/progs/verifier_call_large_imm.c b/tools/testing/selftests/bpf/progs/verifier_call_large_imm.c new file mode 100644 index 000000000000..7998df07f6a6 --- /dev/null +++ b/tools/testing/selftests/bpf/progs/verifier_call_large_imm.c @@ -0,0 +1,66 @@ +// SPDX-License-Identifier: GPL-2.0 +#include <linux/bpf.h> +#include <bpf/bpf_helpers.h> +#include "bpf_misc.h" + +int call_happened = 0; + +/* + * 32765 is the exact minimum number of padding instructions needed to + * trigger the verifier failure, because: + * 1. Counting the wrapper instructions around the padding block (one + * "r0=0" and two "exit" instructions), the actual jump distance + * evaluates to N + 3. + * 2. To overflow the s16 max bound (32767), we need N + 3 > 32767. + * Thus, N = 32765 is the exact minimum padding size required. + */ +static __attribute__((noinline)) void padding_subprog(void) +{ + asm volatile ( + "r0 = 0;" + ".rept 32765;" + "r0 += 0;" + ".endr;" + ::: __clobber_all); +} + +static __attribute__((noinline)) int target_subprog(void) +{ + /* Use volatile variable here to prevent optimization. 
*/ + volatile int magic_ret = 3; + return magic_ret; +} + +SEC("syscall") +__success __retval(3) +int call_large_imm_test(void *ctx) +{ + /* + * Landing pad to handle call error on kernel without the fix, + * preventing kernel panic. + */ + asm volatile ( + "r0 = 0;" + ".rept 32768;" + "r0 += 0;" + ".endr;" + ::: __clobber_all); + + /* + * The call_happened variable is 1 only when the call insn wrongly + * go back to the landing pad above. + */ + if (call_happened == 1) { + /* Use volatile variable here to prevent optimization. */ + volatile int flag = -1; + return flag; + } + + call_happened = 1; + + padding_subprog(); + + return target_subprog(); +} + +char LICENSE[] SEC("license") = "GPL"; diff --git a/tools/testing/selftests/bpf/progs/verifier_flow_keys.c b/tools/testing/selftests/bpf/progs/verifier_flow_keys.c new file mode 100644 index 000000000000..d780a36a6e9a --- /dev/null +++ b/tools/testing/selftests/bpf/progs/verifier_flow_keys.c @@ -0,0 +1,97 @@ +// SPDX-License-Identifier: GPL-2.0 +/* Bounds checks for PTR_TO_FLOW_KEYS pointer arithmetic. */ + +#include "vmlinux.h" +#include <bpf/bpf_helpers.h> +#include "bpf_misc.h" + +/* sizeof(struct bpf_flow_keys) is well under 4096, so +0x1000 is OOB. 
*/ + +SEC("flow_dissector") +__description("flow_keys: in-bounds constant pointer arithmetic accepted") +__success +__naked void flow_keys_const_inbounds(void) +{ + asm volatile (" \ + r1 = *(u64 *)(r1 + %[flow_keys]); \ + r1 += 8; \ + r0 = *(u64 *)(r1 + 0); \ + r0 = 0; \ + exit; \ +" : + : __imm_const(flow_keys, offsetof(struct __sk_buff, flow_keys)) + : __clobber_all); +} + +SEC("flow_dissector") +__description("flow_keys: OOB via constant pointer arithmetic rejected") +__failure __msg("invalid access to flow keys off=4096 size=8") +__naked void flow_keys_const_oob_read(void) +{ + asm volatile (" \ + r1 = *(u64 *)(r1 + %[flow_keys]); \ + r1 += 4096; \ + r0 = *(u64 *)(r1 + 0); \ + r0 = 0; \ + exit; \ +" : + : __imm_const(flow_keys, offsetof(struct __sk_buff, flow_keys)) + : __clobber_all); +} + +SEC("flow_dissector") +__description("flow_keys: OOB write via constant pointer arithmetic rejected") +__failure __msg("invalid access to flow keys off=4096 size=8") +__naked void flow_keys_const_oob_write(void) +{ + asm volatile (" \ + r1 = *(u64 *)(r1 + %[flow_keys]); \ + r1 += 4096; \ + r2 = 0; \ + *(u64 *)(r1 + 0) = r2; \ + r0 = 0; \ + exit; \ +" : + : __imm_const(flow_keys, offsetof(struct __sk_buff, flow_keys)) + : __clobber_all); +} + +/* Equivalent OOB expressed directly in insn->off; this form was always + * rejected and is kept to show both forms now share one diagnostic. 
+ */ +SEC("flow_dissector") +__description("flow_keys: OOB via insn->off rejected") +__failure __msg("invalid access to flow keys off=4096 size=8") +__naked void flow_keys_insn_off_oob(void) +{ + asm volatile (" \ + r1 = *(u64 *)(r1 + %[flow_keys]); \ + r0 = *(u64 *)(r1 + 4096); \ + r0 = 0; \ + exit; \ +" : + : __imm_const(flow_keys, offsetof(struct __sk_buff, flow_keys)) + : __clobber_all); +} + +SEC("flow_dissector") +__description("flow_keys: variable pointer arithmetic rejected") +__failure __msg("R1 pointer arithmetic on flow_keys prohibited") +__naked void flow_keys_var_read(void) +{ + asm volatile (" \ + r6 = r1; \ + call %[bpf_get_prandom_u32]; \ + r0 &= 0xFFFF; \ + r1 = *(u64 *)(r6 + %[flow_keys]); \ + r1 += r0; \ + r0 = *(u64 *)(r1 + 0); \ + r0 = 0; \ + exit; \ +" : + : __imm_const(flow_keys, offsetof(struct __sk_buff, flow_keys)), + __imm(bpf_get_prandom_u32) + : __clobber_all); +} + +char _license[] SEC("license") = "GPL"; diff --git a/tools/testing/selftests/bpf/progs/verifier_global_ptr_args.c b/tools/testing/selftests/bpf/progs/verifier_global_ptr_args.c index e7dae0cf9c17..0bdeb7bc4687 100644 --- a/tools/testing/selftests/bpf/progs/verifier_global_ptr_args.c +++ b/tools/testing/selftests/bpf/progs/verifier_global_ptr_args.c @@ -153,7 +153,7 @@ __weak int subprog_trusted_destroy(struct task_struct *task __arg_trusted) SEC("?tp_btf/task_newtask") __failure __log_level(2) -__msg("release kernel function bpf_task_release expects refcounted PTR_TO_BTF_ID") +__msg("release kfunc bpf_task_release expects referenced PTR_TO_BTF_ID passed to R1") int BPF_PROG(trusted_destroy_fail, struct task_struct *task, u64 clone_flags) { return subprog_trusted_destroy(task); @@ -287,6 +287,25 @@ int trusted_to_untrusted_mem(void *ctx) return subprog_void_untrusted(bpf_get_current_task_btf()); } +__weak int subprog_write_mem_arg(int *p) +{ + if (!p) + return 0; + + *p = 42; + return 0; +} + +SEC("?tp_btf/task_newtask") +__failure +__msg("only read is supported") +int 
trusted_btf_field_to_writable_mem(void *ctx) +{ + struct task_struct *task = bpf_get_current_task_btf(); + + return subprog_write_mem_arg(&task->prio); +} + SEC("tp_btf/sys_enter") __success int anything_to_untrusted_mem(void *ctx) diff --git a/tools/testing/selftests/bpf/progs/verifier_global_subprogs.c b/tools/testing/selftests/bpf/progs/verifier_global_subprogs.c index 1e08aff7532e..75a2e3f48d0f 100644 --- a/tools/testing/selftests/bpf/progs/verifier_global_subprogs.c +++ b/tools/testing/selftests/bpf/progs/verifier_global_subprogs.c @@ -46,12 +46,13 @@ __noinline long global_dead(void) } SEC("?raw_tp") -__success __log_level(2) +__success __log_level(6) /* main prog is validated completely first */ __msg("('global_calls_good_only') is global and assumed valid.") /* eventually global_good() is transitively validated as well */ __msg("Validating global_good() func") __msg("('global_good') is safe for any args that match its prototype") +__msg("insns processed {{[0-9]+\\+[0-9]+\\+[0-9]+$}}") int chained_global_func_calls_success(void) { int sum = 0; @@ -151,6 +152,23 @@ int anon_user_mem_valid(void *ctx) return subprog_user_anon_mem(&t); } +__noinline __weak int subprog_user_anon_mem_huge(int (*p)[0x3fffffff]) +{ + return p ? 
(*p)[1] : 0; +} + +SEC("?tracepoint") +__failure __log_level(2) +__msg("R1 memory size 4294967292 is too large") +int anon_user_mem_huge_size_invalid(void *ctx) +{ + int (*p)[0x3fffffff]; + int tiny = 42; + + p = (void *)&tiny; + return subprog_user_anon_mem_huge(p) + tiny; +} + __noinline __weak int subprog_nonnull_ptr_good(int *p1 __arg_nonnull, int *p2 __arg_nonnull) { return (*p1) * (*p2); /* good, no need for NULL checks */ diff --git a/tools/testing/selftests/bpf/progs/verifier_jit_inline.c b/tools/testing/selftests/bpf/progs/verifier_jit_inline.c index 4ea254063646..02e562f56f9d 100644 --- a/tools/testing/selftests/bpf/progs/verifier_jit_inline.c +++ b/tools/testing/selftests/bpf/progs/verifier_jit_inline.c @@ -9,7 +9,11 @@ __success __retval(0) __arch_x86_64 __jited(" addq %gs:{{.*}}, %rax") __arch_arm64 -__jited(" mrs x7, SP_EL0") +__jited(" mrs x8, SP_EL0") +__arch_riscv64 +__jited(" mv a5, tp") +__arch_loongarch +__jited(" move $a5, $tp") int inline_bpf_get_current_task(void) { bpf_get_current_task(); @@ -17,4 +21,15 @@ int inline_bpf_get_current_task(void) return 0; } +SEC("fentry/bpf_fentry_test2") +__success __retval(0) +__arch_loongarch +__jited(" ld.wu $a5, $tp, 16") +int inline_bpf_get_smp_processor_id(void) +{ + bpf_get_smp_processor_id(); + + return 0; +} + char _license[] SEC("license") = "GPL"; diff --git a/tools/testing/selftests/bpf/progs/verifier_ldsx.c b/tools/testing/selftests/bpf/progs/verifier_ldsx.c index c8494b682c31..41340877dc9d 100644 --- a/tools/testing/selftests/bpf/progs/verifier_ldsx.c +++ b/tools/testing/selftests/bpf/progs/verifier_ldsx.c @@ -3,7 +3,7 @@ #include <linux/bpf.h> #include <bpf/bpf_helpers.h> #include "bpf_misc.h" -#include "bpf_arena_common.h" +#include <bpf_arena_common.h> #if (defined(__TARGET_ARCH_arm64) || defined(__TARGET_ARCH_x86) || \ (defined(__TARGET_ARCH_riscv) && __riscv_xlen == 64) || \ @@ -274,11 +274,11 @@ __jited("movslq 0x10(%rdi,%r12), %r15") __jited("movswq 0x18(%rdi,%r12), %r15") 
__jited("movsbq 0x20(%rdi,%r12), %r15") __arch_arm64 -__jited("add x11, x7, x28") +__jited("add x11, x8, x28") __jited("ldrsw x21, [x11, #0x10]") -__jited("add x11, x7, x28") +__jited("add x11, x8, x28") __jited("ldrsh x21, [x11, #0x18]") -__jited("add x11, x7, x28") +__jited("add x11, x8, x28") __jited("ldrsb x21, [x11, #0x20]") __jited("add x11, x0, x28") __jited("ldrsw x22, [x11, #0x10]") diff --git a/tools/testing/selftests/bpf/progs/verifier_liveness_exp.c b/tools/testing/selftests/bpf/progs/verifier_liveness_exp.c index b058de623200..72646fa2745e 100644 --- a/tools/testing/selftests/bpf/progs/verifier_liveness_exp.c +++ b/tools/testing/selftests/bpf/progs/verifier_liveness_exp.c @@ -15,7 +15,7 @@ * FP offset at each call site. arg_track keys on (frame, off[]), so * r1=fp-8, r1=fp-16, ... r1=fp-400 produce 50 unique cache keys per level. * - * This test chains 8 subprograms (the MAX_CALL_FRAMES limit). Each + * This test chains 8 subprograms (within the MAX_CALL_FRAMES limit). Each * intermediate function calls the next one 50 times, each time with a * different FP-relative offset in r1. 
* diff --git a/tools/testing/selftests/bpf/progs/verifier_lsm.c b/tools/testing/selftests/bpf/progs/verifier_lsm.c index 38e8e9176862..c724bf389f5c 100644 --- a/tools/testing/selftests/bpf/progs/verifier_lsm.c +++ b/tools/testing/selftests/bpf/progs/verifier_lsm.c @@ -188,4 +188,28 @@ int BPF_PROG(null_check, struct file *file) return 0; } +SEC("lsm_cgroup/file_open") +__description("sleepable lsm_cgroup program is rejected") +__failure __msg("Program of this type cannot be sleepable") +__flag(BPF_F_SLEEPABLE) +int BPF_PROG(sleepable_lsm_cgroup) +{ + return 0; +} + +SEC("lsm/file_mprotect") +__description("lsm retval load must reset stale register bounds") +__failure __msg("div by zero") +__naked int retval_load_resets_bounds(void *ctx) +{ + asm volatile ( + "r6 = 0;" + "r6 = *(u64 *)(r1 + 24);" + "if r6 == 0 goto +1;" + "r6 /= 0;" + "r0 = 0;" + "exit;" + ::: __clobber_all); +} + char _license[] SEC("license") = "GPL"; diff --git a/tools/testing/selftests/bpf/progs/verifier_map_in_map.c b/tools/testing/selftests/bpf/progs/verifier_map_in_map.c index 16b761e510f0..b606b5dca734 100644 --- a/tools/testing/selftests/bpf/progs/verifier_map_in_map.c +++ b/tools/testing/selftests/bpf/progs/verifier_map_in_map.c @@ -18,6 +18,20 @@ struct { }); } map_in_map SEC(".maps"); +struct { + __uint(type, BPF_MAP_TYPE_ARRAY_OF_MAPS); + __uint(max_entries, 1); + __type(key, int); + __type(value, int); + __array(values, struct { + __uint(type, BPF_MAP_TYPE_ARRAY); + __uint(map_flags, BPF_F_INNER_MAP); + __uint(max_entries, 8); + __type(key, int); + __type(value, long); + }); +} map_in_map_dyn SEC(".maps"); + SEC("socket") __description("map in map access") __success __success_unpriv __retval(0) @@ -45,6 +59,32 @@ l0_%=: r0 = 0; \ : __clobber_all); } +SEC("socket") +__description("map in map dynamic inner array lookup is nullable") +__failure __msg("invalid mem access 'map_value_or_null'") +__naked void map_in_map_dynamic_inner_array_lookup_is_nullable(void) +{ + asm volatile (" \ + r1 
= 0; \ + *(u32*)(r10 - 4) = r1; \ + r2 = r10; \ + r2 += -4; \ + r1 = %[map_in_map_dyn] ll; \ + call %[bpf_map_lookup_elem]; \ + if r0 == 0 goto l0_%=; \ + *(u32*)(r10 - 8) = 4; \ + r2 = r10; \ + r2 += -8; \ + r1 = r0; \ + call %[bpf_map_lookup_elem]; \ + r0 = *(u64 *)(r0 + 0); \ +l0_%=: exit; \ +" : + : __imm(bpf_map_lookup_elem), + __imm_addr(map_in_map_dyn) + : __clobber_all); +} + SEC("xdp") __description("map in map state pruning") __success __msg("processed 15 insns") diff --git a/tools/testing/selftests/bpf/progs/verifier_map_ptr.c b/tools/testing/selftests/bpf/progs/verifier_map_ptr.c index e2767d27d8aa..166193659870 100644 --- a/tools/testing/selftests/bpf/progs/verifier_map_ptr.c +++ b/tools/testing/selftests/bpf/progs/verifier_map_ptr.c @@ -70,13 +70,16 @@ __naked void bpf_map_ptr_write_rejected(void) : __clobber_all); } -/* The first element of struct bpf_map is a SHA256 hash of 32 bytes, accessing - * into this array is valid. The opts field is now at offset 33. +/* + * struct bpf_map starts with the SHA256 hash sha[32] at offset 0 (a readable + * byte array), the u32 excl field at offset 32, and the ops pointer at offset + * 40. Reading a u32 at offset 41 reaches into the middle of the ops pointer, + * i.e. a partial pointer access, which is rejected. */ SEC("socket") __description("bpf_map_ptr: read non-existent field rejected") __failure -__msg("cannot access ptr member ops with moff 32 in struct bpf_map with off 33 size 4") +__msg("cannot access ptr member ops with moff 40 in struct bpf_map with off 41 size 4") __failure_unpriv __msg_unpriv("access is allowed only to CAP_PERFMON and CAP_SYS_ADMIN") __flag(BPF_F_ANY_ALIGNMENT) @@ -85,6 +88,31 @@ __naked void read_non_existent_field_rejected(void) asm volatile (" \ r6 = 0; \ r1 = %[map_array_48b] ll; \ + r6 = *(u32*)(r1 + 41); \ + r0 = 1; \ + exit; \ +" : + : __imm_addr(map_array_48b) + : __clobber_all); +} + +/* + * The u32 excl field spans offsets 32..35 (mend 36). 
Reading a u32 at offset + * 33 starts inside excl but extends past its end, which the verifier rejects + * as an out-of-bounds scalar access. + */ +SEC("socket") +__description("bpf_map_ptr: read beyond excl field rejected") +__failure +__msg("access beyond the end of member excl (mend:36) in struct bpf_map with off 33 size 4") +__failure_unpriv +__msg_unpriv("access is allowed only to CAP_PERFMON and CAP_SYS_ADMIN") +__flag(BPF_F_ANY_ALIGNMENT) +__naked void read_beyond_excl_field_rejected(void) +{ + asm volatile (" \ + r6 = 0; \ + r1 = %[map_array_48b] ll; \ r6 = *(u32*)(r1 + 33); \ r0 = 1; \ exit; \ @@ -103,7 +131,7 @@ __naked void ptr_read_ops_field_accepted(void) asm volatile (" \ r6 = 0; \ r1 = %[map_array_48b] ll; \ - r6 = *(u64*)(r1 + 0); \ + r6 = *(u64*)(r1 + 40); \ r0 = 1; \ exit; \ " : diff --git a/tools/testing/selftests/bpf/progs/verifier_may_goto_1.c b/tools/testing/selftests/bpf/progs/verifier_may_goto_1.c index 6d1edaef9213..4bdf4256a41e 100644 --- a/tools/testing/selftests/bpf/progs/verifier_may_goto_1.c +++ b/tools/testing/selftests/bpf/progs/verifier_may_goto_1.c @@ -81,13 +81,13 @@ __arch_s390x __arch_arm64 __xlated("0: *(u64 *)(r10 -16) = 65535") __xlated("1: *(u64 *)(r10 -8) = 0") -__xlated("2: r11 = *(u64 *)(r10 -16)") -__xlated("3: if r11 == 0x0 goto pc+6") -__xlated("4: r11 -= 1") -__xlated("5: if r11 != 0x0 goto pc+2") -__xlated("6: r11 = -16") +__xlated("2: r12 = *(u64 *)(r10 -16)") +__xlated("3: if r12 == 0x0 goto pc+6") +__xlated("4: r12 -= 1") +__xlated("5: if r12 != 0x0 goto pc+2") +__xlated("6: r12 = -16") __xlated("7: call unknown") -__xlated("8: *(u64 *)(r10 -16) = r11") +__xlated("8: *(u64 *)(r10 -16) = r12") __xlated("9: r0 = 1") __xlated("10: r0 = 2") __xlated("11: exit") diff --git a/tools/testing/selftests/bpf/progs/verifier_private_stack.c b/tools/testing/selftests/bpf/progs/verifier_private_stack.c index 646e8ef82051..bb8206e10880 100644 --- a/tools/testing/selftests/bpf/progs/verifier_private_stack.c +++ 
b/tools/testing/selftests/bpf/progs/verifier_private_stack.c @@ -86,6 +86,7 @@ __naked static void cumulative_stack_depth_subprog(void) SEC("kprobe") __description("Private stack, subtree > MAX_BPF_STACK") __success +__log_level(4) __msg("stack depth 512+32 max 512") __arch_x86_64 /* private stack fp for the main prog */ __jited(" movabsq $0x{{.*}}, %r9") @@ -93,6 +94,7 @@ __jited(" addq %gs:{{.*}}, %r9") __jited(" movl $0x2a, %edi") __jited(" movq %rdi, -0x200(%r9)") __jited(" pushq %r9") +__jited("...") __jited(" callq 0x{{.*}}") __jited(" popq %r9") __jited(" xorl %eax, %eax") @@ -152,11 +154,13 @@ __jited(" endbr64") __jited(" movabsq $0x{{.*}}, %r9") __jited(" addq %gs:{{.*}}, %r9") __jited(" pushq %r9") +__jited("...") __jited(" callq") __jited(" popq %r9") __jited(" movl $0x2a, %edi") __jited(" movq %rdi, -0x200(%r9)") __jited(" pushq %r9") +__jited("...") __jited(" callq") __jited(" popq %r9") __arch_arm64 @@ -170,12 +174,12 @@ __jited(" mrs x10, TPIDR_EL{{[0-1]}}") __jited(" add x27, x27, x10") __jited(" add x25, x27, {{.*}}") __jited(" bl 0x{{.*}}") -__jited(" mov x7, x0") +__jited(" mov x8, x0") __jited(" mov x0, #0x2a") __jited(" str x0, [x27]") __jited(" bl 0x{{.*}}") -__jited(" mov x7, x0") -__jited(" mov x7, #0x0") +__jited(" mov x8, x0") +__jited(" mov x8, #0x0") __jited(" ldp x25, x27, [sp], {{.*}}") __naked void private_stack_callback(void) { @@ -198,6 +202,7 @@ __description("Private stack, exception in main prog") __success __retval(0) __arch_x86_64 __jited(" pushq %r9") +__jited("...") __jited(" callq") __jited(" popq %r9") __arch_arm64 @@ -220,7 +225,7 @@ __jited(" mov x0, #0x2a") __jited(" str x0, [x27]") __jited(" mov x0, #0x0") __jited(" bl 0x{{.*}}") -__jited(" mov x7, x0") +__jited(" mov x8, x0") __jited(" ldp x27, x28, [sp], #0x10") int private_stack_exception_main_prog(void) { @@ -245,6 +250,7 @@ __success __retval(0) __arch_x86_64 __jited(" movq %rdi, -0x200(%r9)") __jited(" pushq %r9") +__jited("...") __jited(" callq") __jited(" popq 
%r9") __arch_arm64 @@ -258,7 +264,7 @@ __jited(" add x25, x27, {{.*}}") __jited(" mov x0, #0x2a") __jited(" str x0, [x27]") __jited(" bl 0x{{.*}}") -__jited(" mov x7, x0") +__jited(" mov x8, x0") __jited(" ldp x27, x28, [sp], #0x10") int private_stack_exception_sub_prog(void) { @@ -324,6 +330,8 @@ int private_stack_async_callback_1(void) SEC("fentry/bpf_fentry_test9") __description("Private stack, async callback, potential nesting") __success __retval(0) +__load_if_JITed() +__log_level(4) __msg("stack depth 8+0+256+0 max 272") __arch_x86_64 __jited(" subq $0x100, %rsp") __arch_arm64 @@ -344,6 +352,18 @@ int private_stack_async_callback_2(void) return 0; } +SEC("fentry/bpf_fentry_test9") +__description("private stack, max stack depth is private stack") +__success +__log_level(4) __msg("stack depth 8+256+0 max 256") +int private_stack_max_depth(void) +{ + int x = 0; + + subprog1(&x); + return 0; +} + #else SEC("kprobe") diff --git a/tools/testing/selftests/bpf/progs/verifier_ref_tracking.c b/tools/testing/selftests/bpf/progs/verifier_ref_tracking.c index 910365201f68..199ad18f8eb5 100644 --- a/tools/testing/selftests/bpf/progs/verifier_ref_tracking.c +++ b/tools/testing/selftests/bpf/progs/verifier_ref_tracking.c @@ -263,7 +263,7 @@ l0_%=: r0 = 0; \ SEC("lsm.s/bpf") __description("reference tracking: release user key reference without check") -__failure __msg("Possibly NULL pointer passed to trusted arg0") +__failure __msg("Possibly NULL pointer passed to trusted R1") __naked void user_key_reference_without_check(void) { asm volatile (" \ @@ -282,7 +282,7 @@ __naked void user_key_reference_without_check(void) SEC("lsm.s/bpf") __description("reference tracking: release system key reference without check") -__failure __msg("Possibly NULL pointer passed to trusted arg0") +__failure __msg("Possibly NULL pointer passed to trusted R1") __naked void system_key_reference_without_check(void) { asm volatile (" \ @@ -300,7 +300,7 @@ __naked void 
system_key_reference_without_check(void) SEC("lsm.s/bpf") __description("reference tracking: release with NULL key pointer") -__failure __msg("Possibly NULL pointer passed to trusted arg0") +__failure __msg("Possibly NULL pointer passed to trusted R1") __naked void release_with_null_key_pointer(void) { asm volatile (" \ @@ -1288,7 +1288,7 @@ l1_%=: r1 = r6; \ SEC("tc") __description("reference tracking: bpf_sk_release(listen_sk)") -__failure __msg("R1 must be referenced when passed to release function") +__failure __msg("release helper bpf_sk_release expects referenced PTR_TO_BTF_ID passed to R1") __naked void bpf_sk_release_listen_sk(void) { asm volatile ( diff --git a/tools/testing/selftests/bpf/progs/verifier_scalar_ids.c b/tools/testing/selftests/bpf/progs/verifier_scalar_ids.c index 70ae14d6084f..e38f102da45f 100644 --- a/tools/testing/selftests/bpf/progs/verifier_scalar_ids.c +++ b/tools/testing/selftests/bpf/progs/verifier_scalar_ids.c @@ -372,37 +372,36 @@ __naked void precision_two_ids(void) SEC("socket") __success __log_level(2) __flag(BPF_F_TEST_STATE_FREQ) -/* check that r0 and r6 have different IDs after 'if', - * collect_linked_regs() can't tie more than 6 registers for a single insn. +/* + * check that r0 and r5 have different IDs after 'if', + * collect_linked_regs() can't tie more than 5 registers for a single insn. 
*/ -__msg("8: (25) if r0 > 0x7 goto pc+0 ; R0=scalar(id=1") -__msg("14: (bf) r6 = r6 ; R6=scalar(id=2") -/* check that r{0-5} are marked precise after 'if' */ -__msg("frame0: regs=r0 stack= before 8: (25) if r0 > 0x7 goto pc+0") -__msg("frame0: parent state regs=r0,r1,r2,r3,r4,r5 stack=:") +__msg("7: (25) if r0 > 0x7 goto pc+0 ; R0=scalar(id=1") +__msg("12: (bf) r5 = r5 ; R5=scalar(id=2") +/* check that r{0-4} are marked precise after 'if' */ +__msg("frame0: regs=r0 stack= before 7: (25) if r0 > 0x7 goto pc+0") +__msg("frame0: parent state regs=r0,r1,r2,r3,r4 stack=:") __naked void linked_regs_too_many_regs(void) { asm volatile ( /* r0 = random number up to 0xff */ "call %[bpf_ktime_get_ns];" "r0 &= 0xff;" - /* tie r{0-6} IDs */ + /* tie r{0-5} IDs */ "r1 = r0;" "r2 = r0;" "r3 = r0;" "r4 = r0;" "r5 = r0;" - "r6 = r0;" - /* propagate range for r{0-6} */ + /* propagate range for r{0-5} */ "if r0 > 7 goto +0;" - /* keep r{1-5} live */ + /* keep r{1-4} live */ "r1 = r1;" "r2 = r2;" "r3 = r3;" "r4 = r4;" + /* make r5 appear in the log */ "r5 = r5;" - /* make r6 appear in the log */ - "r6 = r6;" /* force r0 to be precise, * this would cause r{0-4} to be precise because of shared IDs */ diff --git a/tools/testing/selftests/bpf/progs/verifier_sdiv.c b/tools/testing/selftests/bpf/progs/verifier_sdiv.c index fd59d57e8e37..95f3239ce228 100644 --- a/tools/testing/selftests/bpf/progs/verifier_sdiv.c +++ b/tools/testing/selftests/bpf/progs/verifier_sdiv.c @@ -778,10 +778,10 @@ __arch_x86_64 __xlated("0: r2 = 0x8000000000000000") __xlated("2: r3 = -1") __xlated("3: r4 = r2") -__xlated("4: r11 = r3") -__xlated("5: r11 += 1") -__xlated("6: if r11 > 0x1 goto pc+4") -__xlated("7: if r11 == 0x0 goto pc+1") +__xlated("4: r12 = r3") +__xlated("5: r12 += 1") +__xlated("6: if r12 > 0x1 goto pc+4") +__xlated("7: if r12 == 0x0 goto pc+1") __xlated("8: r2 = 0") __xlated("9: r2 = -r2") __xlated("10: goto pc+1") @@ -812,10 +812,10 @@ __success __retval(-5) __arch_x86_64 __xlated("0: r2 = 5") 
__xlated("1: r3 = -1") -__xlated("2: r11 = r3") -__xlated("3: r11 += 1") -__xlated("4: if r11 > 0x1 goto pc+4") -__xlated("5: if r11 == 0x0 goto pc+1") +__xlated("2: r12 = r3") +__xlated("3: r12 += 1") +__xlated("4: if r12 > 0x1 goto pc+4") +__xlated("5: if r12 == 0x0 goto pc+1") __xlated("6: r2 = 0") __xlated("7: r2 = -r2") __xlated("8: goto pc+1") @@ -890,10 +890,10 @@ __arch_x86_64 __xlated("0: w2 = -2147483648") __xlated("1: w3 = -1") __xlated("2: w4 = w2") -__xlated("3: r11 = r3") -__xlated("4: w11 += 1") -__xlated("5: if w11 > 0x1 goto pc+4") -__xlated("6: if w11 == 0x0 goto pc+1") +__xlated("3: r12 = r3") +__xlated("4: w12 += 1") +__xlated("5: if w12 > 0x1 goto pc+4") +__xlated("6: if w12 == 0x0 goto pc+1") __xlated("7: w2 = 0") __xlated("8: w2 = -w2") __xlated("9: goto pc+1") @@ -925,10 +925,10 @@ __arch_x86_64 __xlated("0: w2 = -5") __xlated("1: w3 = -1") __xlated("2: w4 = w2") -__xlated("3: r11 = r3") -__xlated("4: w11 += 1") -__xlated("5: if w11 > 0x1 goto pc+4") -__xlated("6: if w11 == 0x0 goto pc+1") +__xlated("3: r12 = r3") +__xlated("4: w12 += 1") +__xlated("5: if w12 > 0x1 goto pc+4") +__xlated("6: if w12 == 0x0 goto pc+1") __xlated("7: w2 = 0") __xlated("8: w2 = -w2") __xlated("9: goto pc+1") @@ -1004,10 +1004,10 @@ __arch_x86_64 __xlated("0: r2 = 0x8000000000000000") __xlated("2: r3 = -1") __xlated("3: r4 = r2") -__xlated("4: r11 = r3") -__xlated("5: r11 += 1") -__xlated("6: if r11 > 0x1 goto pc+3") -__xlated("7: if r11 == 0x1 goto pc+3") +__xlated("4: r12 = r3") +__xlated("5: r12 += 1") +__xlated("6: if r12 > 0x1 goto pc+3") +__xlated("7: if r12 == 0x1 goto pc+3") __xlated("8: w2 = 0") __xlated("9: goto pc+1") __xlated("10: r2 s%= r3") @@ -1034,10 +1034,10 @@ __arch_x86_64 __xlated("0: r2 = 5") __xlated("1: r3 = -1") __xlated("2: r4 = r2") -__xlated("3: r11 = r3") -__xlated("4: r11 += 1") -__xlated("5: if r11 > 0x1 goto pc+3") -__xlated("6: if r11 == 0x1 goto pc+3") +__xlated("3: r12 = r3") +__xlated("4: r12 += 1") +__xlated("5: if r12 > 0x1 goto 
pc+3") +__xlated("6: if r12 == 0x1 goto pc+3") __xlated("7: w2 = 0") __xlated("8: goto pc+1") __xlated("9: r2 s%= r3") @@ -1108,10 +1108,10 @@ __arch_x86_64 __xlated("0: w2 = -2147483648") __xlated("1: w3 = -1") __xlated("2: w4 = w2") -__xlated("3: r11 = r3") -__xlated("4: w11 += 1") -__xlated("5: if w11 > 0x1 goto pc+3") -__xlated("6: if w11 == 0x1 goto pc+4") +__xlated("3: r12 = r3") +__xlated("4: w12 += 1") +__xlated("5: if w12 > 0x1 goto pc+3") +__xlated("6: if w12 == 0x1 goto pc+4") __xlated("7: w2 = 0") __xlated("8: goto pc+1") __xlated("9: w2 s%= w3") @@ -1140,10 +1140,10 @@ __arch_x86_64 __xlated("0: w2 = -5") __xlated("1: w3 = -1") __xlated("2: w4 = w2") -__xlated("3: r11 = r3") -__xlated("4: w11 += 1") -__xlated("5: if w11 > 0x1 goto pc+3") -__xlated("6: if w11 == 0x1 goto pc+4") +__xlated("3: r12 = r3") +__xlated("4: w12 += 1") +__xlated("5: if w12 > 0x1 goto pc+3") +__xlated("6: if w12 == 0x1 goto pc+4") __xlated("7: w2 = 0") __xlated("8: goto pc+1") __xlated("9: w2 s%= w3") diff --git a/tools/testing/selftests/bpf/progs/verifier_set_retval.c b/tools/testing/selftests/bpf/progs/verifier_set_retval.c new file mode 100644 index 000000000000..1415cd15cede --- /dev/null +++ b/tools/testing/selftests/bpf/progs/verifier_set_retval.c @@ -0,0 +1,107 @@ +// SPDX-License-Identifier: GPL-2.0 + +#include <linux/bpf.h> +#include <bpf/bpf_helpers.h> +#include <bpf/bpf_tracing.h> +#include "bpf_misc.h" + +SEC("lsm_cgroup/socket_create") +__description("lsm_cgroup bpf_set_retval success") +__success +int BPF_PROG(lsm_cgroup_set_retval_zero_valid, int family, int type, int protocol, int kern) +{ + bpf_set_retval(0); + return 0; +} + +SEC("lsm_cgroup/socket_create") +__description("lsm_cgroup bpf_set_retval valid errno") +__success +int BPF_PROG(lsm_cgroup_set_retval_negative_valid, int family, int type, int protocol, int kern) +{ + bpf_set_retval(-12); + return 0; +} + +SEC("lsm_cgroup/socket_create") +__description("lsm_cgroup bpf_set_retval invalid negative value") 
+__failure __msg("should have been in [-4095, 0]") +int BPF_PROG(lsm_cgroup_set_retval_negative_invalid, int family, int type, int protocol, int kern) +{ + bpf_set_retval(-4096); + return 0; +} + +SEC("lsm_cgroup/socket_create") +__description("lsm_cgroup bpf_set_retval invalid positive value") +__failure __msg("should have been in [-4095, 0]") +int BPF_PROG(lsm_cgroup_set_retval_positive_invalid, int family, int type, int protocol, int kern) +{ + bpf_set_retval(1); + return 0; +} + +SEC("cgroup/dev") +__description("cgroup_device bpf_set_retval success") +__success +int cgroup_dev_set_retval_0(struct bpf_cgroup_dev_ctx *ctx) +{ + bpf_set_retval(0); + return 1; +} + +SEC("cgroup/dev") +__description("cgroup_device bpf_set_retval valid errno") +__success +int cgroup_dev_set_retval_neg_maxerrno(struct bpf_cgroup_dev_ctx *ctx) +{ + bpf_set_retval(-4095); + return 1; +} + +SEC("cgroup/dev") +__description("cgroup_device bpf_set_retval invalid positive value") +__failure __msg("should have been in [-4095, 0]") +int cgroup_dev_set_retval_1(struct bpf_cgroup_dev_ctx *ctx) +{ + bpf_set_retval(1); + return 1; +} + +SEC("cgroup/dev") +__description("cgroup_device bpf_set_retval invalid negative value") +__failure __msg("should have been in [-4095, 0]") +int cgroup_dev_set_retval_neg_4096(struct bpf_cgroup_dev_ctx *ctx) +{ + bpf_set_retval(-4096); + return 1; +} + +SEC("cgroup/dev") +__description("bpf_set_retval bounds check survives state pruning") +__failure __msg("should have been in [-4095, 0]") +__naked int cgroup_dev_set_retval_pruning_bypass(struct bpf_cgroup_dev_ctx *ctx) +{ + asm volatile ( + "call %[bpf_get_prandom_u32];" + "if r0 != 0 goto 1f;" + "r0 = r0;" + "r0 = r0;" + "r0 = r0;" + "r0 = r0;" + "goto 2f;" + "1:" + "call %[bpf_get_prandom_u32];" + "2:" + "r1 = r0;" + "call %[bpf_set_retval];" + "r0 = 1;" + "exit;" + : + : __imm(bpf_get_prandom_u32), + __imm(bpf_set_retval) + : __clobber_common + ); +} + +char _license[] SEC("license") = "GPL"; diff --git 
a/tools/testing/selftests/bpf/progs/verifier_sock.c b/tools/testing/selftests/bpf/progs/verifier_sock.c index a2132c72d3b8..4f2f3209eec8 100644 --- a/tools/testing/selftests/bpf/progs/verifier_sock.c +++ b/tools/testing/selftests/bpf/progs/verifier_sock.c @@ -603,7 +603,7 @@ l2_%=: r0 = *(u32*)(r0 + %[bpf_tcp_sock_snd_cwnd]); \ SEC("tc") __description("bpf_sk_release(skb->sk)") -__failure __msg("R1 must be referenced when passed to release function") +__failure __msg("release helper bpf_sk_release expects referenced PTR_TO_BTF_ID passed to R1") __naked void bpf_sk_release_skb_sk(void) { asm volatile (" \ @@ -620,7 +620,7 @@ l0_%=: r0 = 0; \ SEC("tc") __description("bpf_sk_release(bpf_sk_fullsock(skb->sk))") -__failure __msg("R1 must be referenced when passed to release function") +__failure __msg("release helper bpf_sk_release expects referenced PTR_TO_BTF_ID passed to R1") __naked void bpf_sk_fullsock_skb_sk(void) { asm volatile (" \ @@ -644,7 +644,7 @@ l1_%=: r1 = r0; \ SEC("tc") __description("bpf_sk_release(bpf_tcp_sock(skb->sk))") -__failure __msg("R1 must be referenced when passed to release function") +__failure __msg("release helper bpf_sk_release expects referenced PTR_TO_BTF_ID passed to R1") __naked void bpf_tcp_sock_skb_sk(void) { asm volatile (" \ @@ -1120,8 +1120,11 @@ int tail_call(struct __sk_buff *sk) static __noinline int static_tail_call(struct __sk_buff *sk) { + int ret = 0; + bpf_tail_call_static(sk, &jmp_table, 0); - return 0; + barrier_var(ret); + return ret; } /* Tail calls in sub-programs invalidate packet pointers. 
*/ @@ -1144,10 +1147,12 @@ __failure __msg("invalid mem access") int invalidate_pkt_pointers_by_static_tail_call(struct __sk_buff *sk) { int *p = (void *)(long)sk->data; + int ret; if ((void *)(p + 1) > (void *)(long)sk->data_end) return TCX_DROP; - static_tail_call(sk); + ret = static_tail_call(sk); + __sink(ret); *p = 42; /* this is unsafe */ return TCX_PASS; } diff --git a/tools/testing/selftests/bpf/progs/verifier_spill_fill.c b/tools/testing/selftests/bpf/progs/verifier_spill_fill.c index 6bc721accbae..0174887e28f5 100644 --- a/tools/testing/selftests/bpf/progs/verifier_spill_fill.c +++ b/tools/testing/selftests/bpf/progs/verifier_spill_fill.c @@ -1359,4 +1359,22 @@ __naked void var_off_write_over_scalar_spill(void) : __clobber_all); } +SEC("socket") +__description("partial fill from cleaned pointer spill") +__failure +__log_level(2) +__msg("1: (05) goto pc+0") +__msg("2: (61) r0 = *(u32 *)(r10 -4)") +__msg("invalid size of register fill") +__flag(BPF_F_TEST_STATE_FREQ) +__naked void partial_fill_from_cleaned_pointer_spill(void) +{ + /* Spill R1(ctx), then force a checkpoint and half-slot cleanup. */ + asm volatile ("*(u64 *)(r10 - 8) = r1;" + "goto +0;" + "r0 = *(u32 *)(r10 - 4);" + "exit;" + ::: __clobber_all); +} + char _license[] SEC("license") = "GPL"; diff --git a/tools/testing/selftests/bpf/progs/verifier_stack_arg.c b/tools/testing/selftests/bpf/progs/verifier_stack_arg.c new file mode 100644 index 000000000000..7e0ce5db28a0 --- /dev/null +++ b/tools/testing/selftests/bpf/progs/verifier_stack_arg.c @@ -0,0 +1,447 @@ +// SPDX-License-Identifier: GPL-2.0 +/* Copyright (c) 2026 Meta Platforms, Inc. and affiliates. 
*/ + +#include <linux/bpf.h> +#include <bpf/bpf_helpers.h> +#include "bpf_misc.h" + +struct { + __uint(type, BPF_MAP_TYPE_HASH); + __uint(max_entries, 1); + __type(key, long long); + __type(value, long long); +} map_hash_8b SEC(".maps"); + +#if (defined(__TARGET_ARCH_x86) || defined(__TARGET_ARCH_arm64)) && \ + defined(__BPF_FEATURE_STACK_ARGUMENT) + +__noinline __used +static int subprog_6args(int a, int b, int c, int d, int e, int f) +{ + return a + b + c + d + e + f; +} + +__noinline __used +static int subprog_7args(int a, int b, int c, int d, int e, int f, int g) +{ + return a + b + c + d + e + f + g; +} + +__noinline __used +static long subprog_deref_arg6(long a, long b, long c, long d, long e, long *f) +{ + return *f; +} + +SEC("tc") +__description("stack_arg: subprog with 6 args") +__success __retval(21) +__naked void stack_arg_6args(void) +{ + asm volatile ( + "r1 = 1;" + "r2 = 2;" + "r3 = 3;" + "r4 = 4;" + "r5 = 5;" + "*(u64 *)(r11 - 8) = 6;" + "call subprog_6args;" + "exit;" + ::: __clobber_all + ); +} + +SEC("tc") +__description("stack_arg: two subprogs with >5 args") +__success __retval(90) +__naked void stack_arg_two_subprogs(void) +{ + asm volatile ( + "r1 = 1;" + "r2 = 2;" + "r3 = 3;" + "r4 = 4;" + "r5 = 5;" + "*(u64 *)(r11 - 8) = 10;" + "call subprog_6args;" + "r6 = r0;" + "r1 = 1;" + "r2 = 2;" + "r3 = 3;" + "r4 = 4;" + "r5 = 5;" + "*(u64 *)(r11 - 16) = 30;" + "*(u64 *)(r11 - 8) = 20;" + "call subprog_7args;" + "r0 += r6;" + "exit;" + ::: __clobber_all + ); +} + +SEC("tc") +__description("stack_arg: read from uninitialized stack arg slot") +__failure +__msg("invalid read from stack arg off 8 depth 0") +__naked void stack_arg_read_uninitialized(void) +{ + asm volatile ( + "r0 = *(u64 *)(r11 + 8);" + "r0 = 0;" + "exit;" + ::: __clobber_all + ); +} + +SEC("tc") +__description("stack_arg: gap at offset -8, only wrote -16") +__failure +__msg("callee expects 7 args, stack arg1 is not initialized") +__naked void stack_arg_gap_at_minus8(void) +{ + asm 
volatile ( + "r1 = 1;" + "r2 = 2;" + "r3 = 3;" + "r4 = 4;" + "r5 = 5;" + "*(u64 *)(r11 - 16) = 30;" + "call subprog_7args;" + "exit;" + ::: __clobber_all + ); +} + +SEC("tc") +__description("stack_arg: pruning with different stack arg types") +__failure __log_level(2) +__flag(BPF_F_TEST_STATE_FREQ) +__msg("arg JOIN insn 9 -> 10 r1: fp0-8 + _ => fp0-8|fp0+0") +__msg("arg JOIN insn 9 -> 10 sa0: fp0-8 + _ => fp0-8|fp0+0") +__msg("R{{[0-9]}} invalid mem access 'scalar'") +__naked void stack_arg_pruning_type_mismatch(void) +{ + asm volatile ( + "call %[bpf_get_prandom_u32];" + "r6 = r0;" + /* local = 0 on program stack */ + "r7 = 0;" + "*(u64 *)(r10 - 8) = r7;" + /* Branch based on random value */ + "if r6 s> 3 goto l0_%=;" + /* Path 1: store stack pointer to outgoing arg6 */ + "r1 = r10;" + "r1 += -8;" + "*(u64 *)(r11 - 8) = r1;" + "goto l1_%=;" + "l0_%=:" + /* Path 2: store scalar to outgoing arg6 */ + "*(u64 *)(r11 - 8) = 42;" + "l1_%=:" + /* Call subprog that dereferences arg6 */ + "r1 = r6;" + "r2 = 0;" + "r3 = 0;" + "r4 = 0;" + "r5 = 0;" + "call subprog_deref_arg6;" + "exit;" + :: __imm(bpf_get_prandom_u32) + : __clobber_all + ); +} + +SEC("tc") +__description("stack_arg: release_reference invalidates stack arg slot") +__failure +__msg("callee expects 6 args, stack arg1 is not initialized") +__naked void stack_arg_release_ref(void) +{ + asm volatile ( + "r6 = r1;" + /* struct bpf_sock_tuple tuple = {} */ + "r2 = 0;" + "*(u32 *)(r10 - 8) = r2;" + "*(u64 *)(r10 - 16) = r2;" + "*(u64 *)(r10 - 24) = r2;" + "*(u64 *)(r10 - 32) = r2;" + "*(u64 *)(r10 - 40) = r2;" + "*(u64 *)(r10 - 48) = r2;" + /* sk = bpf_sk_lookup_tcp(ctx, &tuple, sizeof(tuple), 0, 0) */ + "r1 = r6;" + "r2 = r10;" + "r2 += -48;" + "r3 = %[sizeof_bpf_sock_tuple];" + "r4 = 0;" + "r5 = 0;" + "call %[bpf_sk_lookup_tcp];" + /* r0 = sk (PTR_TO_SOCK_OR_NULL) */ + "if r0 == 0 goto l0_%=;" + /* Store sock ref to outgoing arg6 slot */ + "*(u64 *)(r11 - 8) = r0;" + /* Release the reference — invalidates the stack 
arg slot */ + "r1 = r0;" + "call %[bpf_sk_release];" + /* Call subprog that dereferences arg6 — should fail */ + "r1 = 1;" + "r2 = 2;" + "r3 = 3;" + "r4 = 4;" + "r5 = 5;" + "call subprog_deref_arg6;" + "l0_%=:" + "r0 = 0;" + "exit;" + : + : __imm(bpf_sk_lookup_tcp), + __imm(bpf_sk_release), + __imm_const(sizeof_bpf_sock_tuple, sizeof(struct bpf_sock_tuple)) + : __clobber_all + ); +} + +SEC("tc") +__description("stack_arg: pkt pointer in stack arg slot invalidated after pull_data") +__failure +__msg("callee expects 6 args, stack arg1 is not initialized") +__naked void stack_arg_stale_pkt_ptr(void) +{ + asm volatile ( + "r6 = r1;" + "r7 = *(u32 *)(r6 + %[__sk_buff_data]);" + "r8 = *(u32 *)(r6 + %[__sk_buff_data_end]);" + /* check pkt has at least 8 bytes (data + 8 <= data_end) */ + "r0 = r7;" + "r0 += 8;" + "if r0 > r8 goto l0_%=;" + /* Store valid pkt pointer to outgoing arg6 slot */ + "*(u64 *)(r11 - 8) = r7;" + /* bpf_skb_pull_data invalidates all pkt pointers */ + "r1 = r6;" + "r2 = 0;" + "call %[bpf_skb_pull_data];" + /* Call subprog that dereferences arg6 — should fail */ + "r1 = 1;" + "r2 = 2;" + "r3 = 3;" + "r4 = 4;" + "r5 = 5;" + "call subprog_deref_arg6;" + "l0_%=:" + "r0 = 0;" + "exit;" + : + : __imm(bpf_skb_pull_data), + __imm_const(__sk_buff_data, offsetof(struct __sk_buff, data)), + __imm_const(__sk_buff_data_end, offsetof(struct __sk_buff, data_end)) + : __clobber_all + ); +} + +SEC("tc") +__description("stack_arg: null propagation rejects deref on null branch") +__failure +__msg("R{{[0-9]}} invalid mem access 'scalar'") +__naked void stack_arg_null_propagation_fail(void) +{ + asm volatile ( + "r1 = 0;" + "*(u64 *)(r10 - 8) = r1;" + /* r0 = bpf_map_lookup_elem(&map_hash_8b, &key) */ + "r2 = r10;" + "r2 += -8;" + "r1 = %[map_hash_8b] ll;" + "call %[bpf_map_lookup_elem];" + /* Store PTR_TO_MAP_VALUE_OR_NULL to outgoing arg6 slot */ + "*(u64 *)(r11 - 8) = r0;" + /* null check on r0 */ + "if r0 != 0 goto l0_%=;" + /* + * On null branch, outgoing slot is SCALAR(0).
+ * Call subprog that dereferences arg6 — should fail. + */ + "r1 = 0;" + "r2 = 0;" + "r3 = 0;" + "r4 = 0;" + "r5 = 0;" + "call subprog_deref_arg6;" + "l0_%=:" + "r0 = 0;" + "exit;" + : + : __imm(bpf_map_lookup_elem), + __imm_addr(map_hash_8b) + : __clobber_all + ); +} + +SEC("tc") +__description("stack_arg: missing store on one branch") +__failure +__msg("callee expects 7 args, stack arg1 is not initialized") +__naked void stack_arg_missing_store_one_branch(void) +{ + asm volatile ( + "call %[bpf_get_prandom_u32];" + "r1 = 1;" + "r2 = 2;" + "r3 = 3;" + "r4 = 4;" + "r5 = 5;" + /* Write arg7 (r11-16) before branch */ + "*(u64 *)(r11 - 16) = 20;" + "if r0 > 0 goto l0_%=;" + /* Path 1: write arg6 and call */ + "*(u64 *)(r11 - 8) = 10;" + "r1 = 1;" + "r2 = 2;" + "r3 = 3;" + "r4 = 4;" + "r5 = 5;" + "call subprog_7args;" + "goto l1_%=;" + "l0_%=:" + /* Path 2: missing arg6 store, call should fail */ + "r1 = 1;" + "r2 = 2;" + "r3 = 3;" + "r4 = 4;" + "r5 = 5;" + "call subprog_7args;" + "l1_%=:" + "r0 = 0;" + "exit;" + :: __imm(bpf_get_prandom_u32) + : __clobber_all + ); +} + +SEC("tc") +__description("stack_arg: share a store for both branches") +__success __retval(0) +__naked void stack_arg_shared_store(void) +{ + asm volatile ( + "call %[bpf_get_prandom_u32];" + "r1 = 1;" + "r2 = 2;" + "r3 = 3;" + "r4 = 4;" + "r5 = 5;" + /* Write arg7 (r11-16) before branch */ + "*(u64 *)(r11 - 16) = 20;" + "if r0 > 0 goto l0_%=;" + /* Path 1: write arg6 and call */ + "*(u64 *)(r11 - 8) = 10;" + "r1 = 1;" + "r2 = 2;" + "r3 = 3;" + "r4 = 4;" + "r5 = 5;" + "call subprog_7args;" + "goto l1_%=;" + "l0_%=:" + /* Path 2: also write arg6 and call */ + "*(u64 *)(r11 - 8) = 30;" + "r1 = 1;" + "r2 = 2;" + "r3 = 3;" + "r4 = 4;" + "r5 = 5;" + "call subprog_7args;" + "l1_%=:" + "r0 = 0;" + "exit;" + :: __imm(bpf_get_prandom_u32) + : __clobber_all + ); +} + +SEC("tc") +__description("stack_arg: write beyond max outgoing depth") +__failure +__msg("stack arg write offset -80 exceeds max 7 stack args") 
+__naked void stack_arg_write_beyond_max(void) +{ + asm volatile ( + "r1 = 1;" + "r2 = 2;" + "r3 = 3;" + "r4 = 4;" + "r5 = 5;" + /* Write to offset -80, way beyond any callee's needs */ + "*(u64 *)(r11 - 80) = 99;" + "*(u64 *)(r11 - 16) = 20;" + "*(u64 *)(r11 - 8) = 10;" + "call subprog_7args;" + "r0 = 0;" + "exit;" + ::: __clobber_all + ); +} + +SEC("tc") +__description("stack_arg: write unused stack arg slot") +__failure +__msg("func#0 writes 5 stack arg slots, but calls only require 2") +__naked void stack_arg_write_unused_slot(void) +{ + asm volatile ( + "r1 = 1;" + "r2 = 2;" + "r3 = 3;" + "r4 = 4;" + "r5 = 5;" + /* Write to offset -40, unused for the callee */ + "*(u64 *)(r11 - 40) = 99;" + "*(u64 *)(r11 - 16) = 20;" + "*(u64 *)(r11 - 8) = 10;" + "call subprog_7args;" + "r0 = 0;" + "exit;" + ::: __clobber_all + ); +} + +SEC("tc") +__description("stack_arg: sequential calls reuse slots") +__failure +__msg("callee expects 7 args, stack arg1 is not initialized") +__naked void stack_arg_sequential_calls(void) +{ + asm volatile ( + "r1 = 1;" + "r2 = 2;" + "r3 = 3;" + "r4 = 4;" + "r5 = 5;" + "*(u64 *)(r11 - 8) = 6;" + "*(u64 *)(r11 - 16) = 7;" + "call subprog_7args;" + "r6 = r0;" + "r1 = 1;" + "r2 = 2;" + "r3 = 3;" + "r4 = 4;" + "r5 = 5;" + "call subprog_7args;" + "r0 += r6;" + "exit;" + ::: __clobber_all + ); +} + +#else + +SEC("socket") +__description("stack_arg is not supported by compiler or jit, use a dummy test") +__success +int dummy_test(void) +{ + return 0; +} + +#endif + +char _license[] SEC("license") = "GPL"; diff --git a/tools/testing/selftests/bpf/progs/verifier_stack_arg_order.c b/tools/testing/selftests/bpf/progs/verifier_stack_arg_order.c new file mode 100644 index 000000000000..c9fe4857da3f --- /dev/null +++ b/tools/testing/selftests/bpf/progs/verifier_stack_arg_order.c @@ -0,0 +1,185 @@ +// SPDX-License-Identifier: GPL-2.0 +/* Copyright (c) 2026 Meta Platforms, Inc. and affiliates. 
*/ + +#include <linux/bpf.h> +#include <bpf/bpf_helpers.h> +#include "bpf_misc.h" + +#if (defined(__TARGET_ARCH_x86) || defined(__TARGET_ARCH_arm64)) && \ + defined(__BPF_FEATURE_STACK_ARGUMENT) + +__noinline __used __naked +static int subprog_bad_order_6args(int a, int b, int c, int d, int e, int f) +{ + asm volatile ( + "*(u64 *)(r11 - 8) = r1;" + "r0 = *(u64 *)(r11 + 8);" + "exit;" + ::: __clobber_all + ); +} + +SEC("tc") +__description("stack_arg: r11 load after r11 store") +__failure +__msg("r11 load must be before any r11 store or call insn") +__btf_func_path("btf__verifier_stack_arg_order.bpf.o") +__naked void stack_arg_load_after_store(void) +{ + asm volatile ( + "r1 = 1;" + "r2 = 2;" + "r3 = 3;" + "r4 = 4;" + "r5 = 5;" + "*(u64 *)(r11 - 8) = 6;" + "call subprog_bad_order_6args;" + "exit;" + ::: __clobber_all + ); +} + +__noinline __used __naked +static int subprog_call_before_load_6args(int a, int b, int c, int d, int e, + int f) +{ + asm volatile ( + "call %[bpf_get_prandom_u32];" + "r0 = *(u64 *)(r11 + 8);" + "exit;" + :: __imm(bpf_get_prandom_u32) + : __clobber_all + ); +} + +SEC("tc") +__description("stack_arg: r11 load after a call") +__failure +__msg("r11 load must be before any r11 store or call insn") +__btf_func_path("btf__verifier_stack_arg_order.bpf.o") +__naked void stack_arg_load_after_call(void) +{ + asm volatile ( + "r1 = 1;" + "r2 = 2;" + "r3 = 3;" + "r4 = 4;" + "r5 = 5;" + "*(u64 *)(r11 - 8) = 6;" + "call subprog_call_before_load_6args;" + "exit;" + ::: __clobber_all + ); +} + +__noinline __used __naked +static int subprog_pruning_call_before_load_6args(int a, int b, int c, int d, + int e, int f) +{ + asm volatile ( + "if r1 s> 0 goto l0_%=;" + "goto l1_%=;" + "l0_%=:" + "call %[bpf_get_prandom_u32];" + "l1_%=:" + "r0 = *(u64 *)(r11 + 8);" + "exit;" + :: __imm(bpf_get_prandom_u32) + : __clobber_all + ); +} + +SEC("tc") +__description("stack_arg: pruning keeps r11 load ordering") +__failure +__flag(BPF_F_TEST_STATE_FREQ) +__msg("r11 load 
must be before any r11 store or call insn") +__btf_func_path("btf__verifier_stack_arg_order.bpf.o") +__naked void stack_arg_pruning_load_after_call(void) +{ + asm volatile ( + "call %[bpf_get_prandom_u32];" + "r1 = r0;" + "r2 = 2;" + "r3 = 3;" + "r4 = 4;" + "r5 = 5;" + "*(u64 *)(r11 - 8) = 6;" + "call subprog_pruning_call_before_load_6args;" + "exit;" + :: __imm(bpf_get_prandom_u32) + : __clobber_all + ); +} + +/* + * "bad_ptr": the first arg is 'long *', which is not a recognized pointer + * type for static subprogs (not ctx, dynptr, or tagged). btf_prepare_func_args() + * sets arg_cnt = 7 / stack_arg_cnt = 2, then fails with -EINVAL. The subprog + * is marked unreliable but the call still proceeds for static subprogs. + */ +__noinline __used __naked +static void subprog_bad_ptr_7args(long *a, int b, int c, int d, int e, int f, int g) +{ + asm volatile ( + "r0 = *(u64 *)(r11 + 8);" + "r1 = *(u64 *)(r11 + 16);" + "exit;" + ::: __clobber_all + ); +} + +SEC("tc") +__description("stack_arg: read without caller write") +__failure +__msg("callee expects 7 args, stack arg1 is not initialized") +__btf_func_path("btf__verifier_stack_arg_order.bpf.o") +__naked void stack_arg_read_without_write_1(void) +{ + asm volatile ( + "r1 = 0;" + "r2 = 0;" + "r3 = 0;" + "r4 = 0;" + "r5 = 0;" + "call subprog_bad_ptr_7args;" + "exit;" + ::: __clobber_all + ); +} + +SEC("tc") +__description("stack_arg: read with not-initialized caller write") +__failure +__msg("R0 !read_ok") +__btf_func_path("btf__verifier_stack_arg_order.bpf.o") +__naked void stack_arg_read_without_write_2(void) +{ + asm volatile ( + "r1 = 0;" + "r2 = 0;" + "r3 = 0;" + "r4 = 0;" + "r5 = 0;" + "*(u64 *)(r11 - 8) = 0;" + "*(u64 *)(r11 - 16) = 0;" + "call subprog_bad_ptr_7args;" + "call subprog_bad_ptr_7args;" + "exit;" + ::: __clobber_all + ); +} + +#else + +SEC("socket") +__description("stack_arg order is not supported by compiler or jit, use a dummy test") +__success +int dummy_test(void) +{ + return 0; +} + +#endif + 
+char _license[] SEC("license") = "GPL"; diff --git a/tools/testing/selftests/bpf/progs/verifier_subreg.c b/tools/testing/selftests/bpf/progs/verifier_subreg.c index 31832a306f91..73b5b0cf6706 100644 --- a/tools/testing/selftests/bpf/progs/verifier_subreg.c +++ b/tools/testing/selftests/bpf/progs/verifier_subreg.c @@ -558,7 +558,8 @@ __description("arsh32 imm sign negative extend check") __success __retval(0) __log_level(2) __msg("3: (17) r6 -= 4095 ; R6=scalar(smin=smin32=-4095,smax=smax32=0)") -__msg("4: (67) r6 <<= 32 ; R6=scalar(smin=0xfffff00100000000,smax=smax32=umax32=0,umax=0xffffffff00000000,smin32=0,var_off=(0x0; 0xffffffff00000000))") +__msg("4: (67) r6 <<= 32 ; R6=scalar(smin=0xfffff00100000000,smax=smax32=umax32=0,smin32=0,var_off=(0x0; 0xffffffff00000000))") +/* represents shorter of signed / unsigned 64-bit ranges */ __msg("5: (c7) r6 s>>= 32 ; R6=scalar(smin=smin32=-4095,smax=smax32=0)") __naked void arsh32_imm_sign_extend_negative_check(void) { @@ -581,7 +582,8 @@ __description("arsh32 imm sign extend check") __success __retval(0) __log_level(2) __msg("3: (17) r6 -= 2047 ; R6=scalar(smin=smin32=-2047,smax=smax32=2048)") -__msg("4: (67) r6 <<= 32 ; R6=scalar(smin=0xfffff80100000000,smax=0x80000000000,umax=0xffffffff00000000,smin32=0,smax32=umax32=0,var_off=(0x0; 0xffffffff00000000))") +__msg("4: (67) r6 <<= 32 ; R6=scalar(smin=0xfffff80100000000,smax=0x80000000000,smin32=0,smax32=umax32=0,var_off=(0x0; 0xffffffff00000000))") +/* represents shorter of signed / unsigned 64-bit ranges */ __msg("5: (c7) r6 s>>= 32 ; R6=scalar(smin=smin32=-2047,smax=smax32=2048)") __naked void arsh32_imm_sign_extend_check(void) { diff --git a/tools/testing/selftests/bpf/progs/verifier_tailcall_jit.c b/tools/testing/selftests/bpf/progs/verifier_tailcall_jit.c index 8d60c634a114..48fa34d2959f 100644 --- a/tools/testing/selftests/bpf/progs/verifier_tailcall_jit.c +++ b/tools/testing/selftests/bpf/progs/verifier_tailcall_jit.c @@ -56,6 +56,7 @@ __jited("L1: pushq %rax") /* 
rbp[-16] = rax */ * (cause original rax might be clobbered by this point) */ __jited(" movq -0x10(%rbp), %rax") +__jited("...") __jited(" callq 0x{{.*}}") /* call to sub() */ __jited(" xorl %eax, %eax") __jited(" leave") diff --git a/tools/testing/selftests/bpf/progs/verifier_unpriv.c b/tools/testing/selftests/bpf/progs/verifier_unpriv.c index c16f8382cf17..42de5cff7e52 100644 --- a/tools/testing/selftests/bpf/progs/verifier_unpriv.c +++ b/tools/testing/selftests/bpf/progs/verifier_unpriv.c @@ -6,6 +6,8 @@ #include "../../../include/linux/filter.h" #include "bpf_misc.h" +extern const int bpf_prog_active __ksym; + #define BPF_SK_LOOKUP(func) \ /* struct bpf_sock_tuple tuple = {} */ \ "r2 = 0;" \ @@ -78,6 +80,23 @@ __naked void dummy_prog_loop1_socket(void) } SEC("socket") +__description("unpriv: pseudo btf id log masks address") +__success_unpriv +__msg_unpriv("0: (18) r1 = 0x0") +__not_msg_unpriv("0: (18) r1 = 0x{{[1-9a-f][0-9a-f]*}}") +__retval_unpriv(0) +__log_level(2) +__naked void pseudo_btf_id_log_masks_address(void) +{ + asm volatile ("r1 = %[bpf_prog_active] ll;" + "r0 = 0;" + "exit;" + : + : __imm_addr(bpf_prog_active) + : __clobber_all); +} + +SEC("socket") __description("unpriv: return pointer") __success __failure_unpriv __msg_unpriv("R0 leaks addr") __retval(POINTER_VALUE) @@ -976,4 +995,26 @@ l0_%=: exit; \ : __clobber_all); } +SEC("socket") +__description("unpriv: Spectre v4 stack write slot index") +__success __success_unpriv +__retval(0) +#ifdef SPEC_V4 +__xlated_unpriv("r0 = 0") +__xlated_unpriv("*(u32 *)(r10 -4) = r0") +__xlated_unpriv("nospec") +__xlated_unpriv("*(u32 *)(r10 -8) = r0") +__xlated_unpriv("nospec") +__xlated_unpriv("exit") +#endif +__naked void stack_write_nospec_slot_index(void) +{ + asm volatile (" \ + r0 = 0; \ + *(u32 *)(r10 - 4) = r0; \ + *(u32 *)(r10 - 8) = r0; \ + exit; \ +" ::: __clobber_all); +} + char _license[] SEC("license") = "GPL"; diff --git a/tools/testing/selftests/bpf/progs/verifier_vfs_reject.c 
b/tools/testing/selftests/bpf/progs/verifier_vfs_reject.c index 4b392c6c8fc4..2870738d93f7 100644 --- a/tools/testing/selftests/bpf/progs/verifier_vfs_reject.c +++ b/tools/testing/selftests/bpf/progs/verifier_vfs_reject.c @@ -13,7 +13,7 @@ static char buf[PATH_MAX]; SEC("lsm.s/file_open") -__failure __msg("Possibly NULL pointer passed to trusted arg0") +__failure __msg("Possibly NULL pointer passed to trusted R1") int BPF_PROG(get_task_exe_file_kfunc_null) { struct file *acquired; @@ -28,7 +28,7 @@ int BPF_PROG(get_task_exe_file_kfunc_null) } SEC("lsm.s/inode_getxattr") -__failure __msg("arg#0 pointer type STRUCT task_struct must point to scalar, or struct with scalar") +__failure __msg("R1 pointer type STRUCT task_struct must point to scalar, or struct with scalar") int BPF_PROG(get_task_exe_file_kfunc_fp) { u64 x; @@ -80,7 +80,7 @@ int BPF_PROG(get_task_exe_file_kfunc_unreleased) } SEC("lsm.s/file_open") -__failure __msg("release kernel function bpf_put_file expects") +__failure __msg("release kfunc bpf_put_file expects referenced PTR_TO_BTF_ID passed to R1") int BPF_PROG(put_file_kfunc_unacquired, struct file *file) { /* Can't release an unacquired pointer. */ @@ -89,7 +89,7 @@ int BPF_PROG(put_file_kfunc_unacquired, struct file *file) } SEC("lsm.s/file_open") -__failure __msg("Possibly NULL pointer passed to trusted arg0") +__failure __msg("Possibly NULL pointer passed to trusted R1") int BPF_PROG(path_d_path_kfunc_null) { /* Can't pass NULL value to bpf_path_d_path() kfunc. 
*/ @@ -128,7 +128,7 @@ int BPF_PROG(path_d_path_kfunc_untrusted_from_current) } SEC("lsm.s/file_open") -__failure __msg("kernel function bpf_path_d_path args#0 expected pointer to STRUCT path but R1 has a pointer to STRUCT file") +__failure __msg("kernel function bpf_path_d_path R1 expected pointer to STRUCT path but R1 has a pointer to STRUCT file") int BPF_PROG(path_d_path_kfunc_type_mismatch, struct file *file) { bpf_path_d_path((struct path *)&file->f_task_work, buf, sizeof(buf)); diff --git a/tools/testing/selftests/bpf/progs/wakeup_source.h b/tools/testing/selftests/bpf/progs/wakeup_source.h new file mode 100644 index 000000000000..cd74de92c82f --- /dev/null +++ b/tools/testing/selftests/bpf/progs/wakeup_source.h @@ -0,0 +1,22 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* Copyright 2026 Google LLC */ + +#ifndef __WAKEUP_SOURCE_H__ +#define __WAKEUP_SOURCE_H__ + +#define WAKEUP_NAME_LEN 128 + +struct wakeup_event_t { + unsigned long active_count; + long long active_time_ns; + unsigned long event_count; + unsigned long expire_count; + long long last_time_ns; + long long max_time_ns; + long long prevent_sleep_time_ns; + long long total_time_ns; + unsigned long wakeup_count; + char name[WAKEUP_NAME_LEN]; +}; + +#endif /* __WAKEUP_SOURCE_H__ */ diff --git a/tools/testing/selftests/bpf/progs/wakeup_source_fail.c b/tools/testing/selftests/bpf/progs/wakeup_source_fail.c new file mode 100644 index 000000000000..d4d0f1610853 --- /dev/null +++ b/tools/testing/selftests/bpf/progs/wakeup_source_fail.c @@ -0,0 +1,76 @@ +// SPDX-License-Identifier: GPL-2.0 +/* Copyright 2026 Google LLC */ + +#include <vmlinux.h> +#include <bpf/bpf_helpers.h> +#include "bpf_misc.h" + +struct bpf_ws_lock; + +struct bpf_ws_lock *bpf_wakeup_sources_read_lock(void) __ksym; +void bpf_wakeup_sources_read_unlock(struct bpf_ws_lock *lock) __ksym; +void *bpf_wakeup_sources_get_head(void) __ksym; + +SEC("syscall") +__failure __msg("BPF_EXIT instruction in main prog would lead to reference leak") +int 
wakeup_source_lock_no_unlock(void *ctx) +{ + struct bpf_ws_lock *lock; + + lock = bpf_wakeup_sources_read_lock(); + if (!lock) + return 0; + + return 0; +} + +SEC("syscall") +__failure __msg("access beyond struct") +int wakeup_source_access_lock_fields(void *ctx) +{ + struct bpf_ws_lock *lock; + int val; + + lock = bpf_wakeup_sources_read_lock(); + if (!lock) + return 0; + + val = *(int *)lock; + + bpf_wakeup_sources_read_unlock(lock); + return val; +} + +SEC("syscall") +__failure __msg("release kfunc bpf_wakeup_sources_read_unlock expects referenced PTR_TO_BTF_ID passed to R1") +int wakeup_source_unlock_no_lock(void *ctx) +{ + struct bpf_ws_lock *lock = (void *)0x1; + + bpf_wakeup_sources_read_unlock(lock); + + return 0; +} + +SEC("syscall") +__failure __msg("Possibly NULL pointer passed to trusted") +int wakeup_source_unlock_null(void *ctx) +{ + bpf_wakeup_sources_read_unlock(NULL); + + return 0; +} + +SEC("syscall") +__failure __msg("R0 invalid mem access 'scalar'") +int wakeup_source_unsafe_dereference(void *ctx) +{ + struct list_head *head = bpf_wakeup_sources_get_head(); + + if (head->next) + return 1; + + return 0; +} + +char _license[] SEC("license") = "GPL"; diff --git a/tools/testing/selftests/bpf/progs/wq_failures.c b/tools/testing/selftests/bpf/progs/wq_failures.c index 3767f5595bbc..32dc8827e128 100644 --- a/tools/testing/selftests/bpf/progs/wq_failures.c +++ b/tools/testing/selftests/bpf/progs/wq_failures.c @@ -98,7 +98,7 @@ __failure * is a correct bpf_wq pointer. 
*/ __msg(": (85) call bpf_wq_set_callback#") /* anchor message */ -__msg("arg#0 doesn't point to a map value") +__msg("R1 doesn't point to a map value") long test_wrong_wq_pointer(void *ctx) { int key = 0; diff --git a/tools/testing/selftests/bpf/progs/xdp_flowtable.c b/tools/testing/selftests/bpf/progs/xdp_flowtable.c index 7fdc7b23ee74..e67daa02749d 100644 --- a/tools/testing/selftests/bpf/progs/xdp_flowtable.c +++ b/tools/testing/selftests/bpf/progs/xdp_flowtable.c @@ -15,7 +15,10 @@ struct bpf_flowtable_opts___local { s32 error; }; -struct flow_offload_tuple_rhash * +struct flow_offload_tuple_rhash___local { +}; + +struct flow_offload_tuple_rhash___local * bpf_xdp_flow_lookup(struct xdp_md *, struct bpf_fib_lookup *, struct bpf_flowtable_opts___local *, u32) __ksym; @@ -67,7 +70,7 @@ int xdp_flowtable_do_lookup(struct xdp_md *ctx) { void *data_end = (void *)(long)ctx->data_end; struct bpf_flowtable_opts___local opts = {}; - struct flow_offload_tuple_rhash *tuplehash; + struct flow_offload_tuple_rhash___local *tuplehash; struct bpf_fib_lookup tuple = { .ifindex = ctx->ingress_ifindex, }; diff --git a/tools/testing/selftests/bpf/progs/xdp_lb_bench.c b/tools/testing/selftests/bpf/progs/xdp_lb_bench.c new file mode 100644 index 000000000000..13777b3dcac8 --- /dev/null +++ b/tools/testing/selftests/bpf/progs/xdp_lb_bench.c @@ -0,0 +1,647 @@ +// SPDX-License-Identifier: GPL-2.0 +/* Copyright (c) 2026 Meta Platforms, Inc. and affiliates. 
*/ + +#include <stddef.h> +#include <stdbool.h> +#include <linux/bpf.h> +#include <linux/if_ether.h> +#include <linux/ip.h> +#include <linux/ipv6.h> +#include <linux/in.h> +#include <linux/tcp.h> +#include <linux/udp.h> +#include <bpf/bpf_helpers.h> +#include <bpf/bpf_endian.h> +#include "bpf_compiler.h" +#include "xdp_lb_bench_common.h" +#include "bench_bpf_timing.bpf.h" + +#ifndef IPPROTO_FRAGMENT +#define IPPROTO_FRAGMENT 44 +#endif + +/* jhash helpers */ +/* Rotate the 32-bit @word left by @shift bits. */ +static inline __u32 rol32(__u32 word, unsigned int shift) +{ + return (word << shift) | (word >> ((-shift) & 31)); +} + +#define __jhash_mix(a, b, c) \ +{ \ + a -= c; a ^= rol32(c, 4); c += b; \ + b -= a; b ^= rol32(a, 6); a += c; \ + c -= b; c ^= rol32(b, 8); b += a; \ + a -= c; a ^= rol32(c, 16); c += b; \ + b -= a; b ^= rol32(a, 19); a += c; \ + c -= b; c ^= rol32(b, 4); b += a; \ +} + +#define __jhash_final(a, b, c) \ +{ \ + c ^= b; c -= rol32(b, 14); \ + a ^= c; a -= rol32(c, 11); \ + b ^= a; b -= rol32(a, 25); \ + c ^= b; c -= rol32(b, 16); \ + a ^= c; a -= rol32(c, 4); \ + b ^= a; b -= rol32(a, 14); \ + c ^= b; c -= rol32(b, 24); \ +} + +#define JHASH_INITVAL 0xdeadbeef +/* Add @initval to each of @a, @b and @c, run the __jhash_final() rounds on them and return the resulting c. */ +static inline __u32 __jhash_nwords(__u32 a, __u32 b, __u32 c, __u32 initval) +{ + a += initval; + b += initval; + c += initval; + __jhash_final(a, b, c); + return c; +} +/* Hash the two words @a and @b with seed @initval; the third input word is fixed to 0. */ +static inline __u32 jhash_2words(__u32 a, __u32 b, __u32 initval) +{ + return __jhash_nwords(a, b, 0, initval + JHASH_INITVAL + (2 << 2)); +} +/* Hash the four 32-bit words k[0..3] with seed @initval: k[0..2] go through __jhash_mix(), k[3] is added before __jhash_final(). */ +static inline __u32 jhash2_4words(const __u32 *k, __u32 initval) +{ + __u32 a, b, c; + + a = b = c = JHASH_INITVAL + (4 << 2) + initval; + + a += k[0]; b += k[1]; c += k[2]; + __jhash_mix(a, b, c); + + a += k[3]; + __jhash_final(a, b, c); + + return c; +} +/* Compute the IPv4 header checksum: add up every 16-bit word of *iph (the check field included), fold the carries and store the complement in iph->check. */ +static __always_inline void ipv4_csum(struct iphdr *iph) +{ + __u16 *next_iph = (__u16 *)iph; + __u32 csum = 0; + int i; + + __pragma_loop_unroll_full + for (i = 0; i < (int)(sizeof(*iph) >> 1); i++) + csum += *next_iph++; + + csum = (csum & 0xffff) + (csum >> 16); +
csum = (csum & 0xffff) + (csum >> 16); + iph->check = ~csum; +} + +struct { + __uint(type, BPF_MAP_TYPE_HASH); + __uint(max_entries, 64); + __type(key, struct vip_definition); + __type(value, struct vip_meta); +} vip_map SEC(".maps"); + +struct lru_inner_map { + __uint(type, BPF_MAP_TYPE_LRU_HASH); + __type(key, struct flow_key); + __type(value, struct real_pos_lru); + __uint(max_entries, DEFAULT_LRU_SIZE); +} lru_inner SEC(".maps"); + +struct { + __uint(type, BPF_MAP_TYPE_ARRAY_OF_MAPS); + __type(key, __u32); + __type(value, __u32); + __uint(max_entries, BENCH_NR_CPUS); + __array(values, struct lru_inner_map); +} lru_mapping SEC(".maps"); + +struct { + __uint(type, BPF_MAP_TYPE_ARRAY); + __uint(max_entries, CH_RINGS_SIZE); + __type(key, __u32); + __type(value, __u32); +} ch_rings SEC(".maps"); + +struct { + __uint(type, BPF_MAP_TYPE_ARRAY); + __uint(max_entries, MAX_REALS); + __type(key, __u32); + __type(value, struct real_definition); +} reals SEC(".maps"); + +struct { + __uint(type, BPF_MAP_TYPE_PERCPU_ARRAY); + __uint(max_entries, STATS_SIZE); + __type(key, __u32); + __type(value, struct lb_stats); +} stats SEC(".maps"); + +struct { + __uint(type, BPF_MAP_TYPE_PERCPU_ARRAY); + __uint(max_entries, MAX_REALS); + __type(key, __u32); + __type(value, struct lb_stats); +} reals_stats SEC(".maps"); + +struct { + __uint(type, BPF_MAP_TYPE_ARRAY); + __uint(max_entries, 1); + __type(key, __u32); + __type(value, struct ctl_value); +} ctl_array SEC(".maps"); + +struct { + __uint(type, BPF_MAP_TYPE_ARRAY); + __uint(max_entries, 1); + __type(key, __u32); + __type(value, struct vip_definition); +} vip_miss_stats SEC(".maps"); + +struct { + __uint(type, BPF_MAP_TYPE_PERCPU_ARRAY); + __uint(max_entries, MAX_REALS); + __type(key, __u32); + __type(value, __u32); +} lru_miss_stats SEC(".maps"); + +volatile __u32 flow_mask; +volatile __u32 cold_lru; +__u32 batch_gen; + +/* + * old_eth MUST be read BEFORE writing the outer header because + * bpf_xdp_adjust_head makes them overlap. 
+ */ +static __always_inline int encap_v4(struct xdp_md *xdp, __be32 saddr, __be32 daddr, + __u16 payload_len, const __u8 *dst_mac) +{ + struct ethhdr *new_eth, *old_eth; + void *data, *data_end; + struct iphdr *iph; + /* prepend an outer IPv4 (IPIP) header: first grow the headroom by sizeof(struct iphdr); a non-zero return is a failure */ + if (bpf_xdp_adjust_head(xdp, -(int)sizeof(struct iphdr))) + return -1; + + data = (void *)(long)xdp->data; + data_end = (void *)(long)xdp->data_end; + /* new Ethernet header at the new packet start, outer IPv4 header right after it, original Ethernet header sizeof(struct iphdr) bytes in */ + new_eth = data; + iph = data + sizeof(struct ethhdr); + old_eth = data + sizeof(struct iphdr); + + if (new_eth + 1 > data_end || old_eth + 1 > data_end || iph + 1 > data_end) + return -1; + /* take the MAC from old_eth before the memset of iph below: the old destination MAC becomes the new source MAC */ + __builtin_memcpy(new_eth->h_source, old_eth->h_dest, sizeof(new_eth->h_source)); + __builtin_memcpy(new_eth->h_dest, dst_mac, sizeof(new_eth->h_dest)); + new_eth->h_proto = bpf_htons(ETH_P_IP); + /* build the outer header from a zeroed struct: no options, TTL 64, checksum computed last */ + __builtin_memset(iph, 0, sizeof(*iph)); + iph->version = 4; + iph->ihl = sizeof(*iph) >> 2; + iph->protocol = IPPROTO_IPIP; + iph->tot_len = bpf_htons(payload_len + sizeof(*iph)); + iph->ttl = 64; + iph->saddr = saddr; + iph->daddr = daddr; + ipv4_csum(iph); + + return 0; +} +/* IPv6 counterpart of encap_v4(): prepend an outer IPv6 header whose next-header field is @nexthdr; returns 0 on success, -1 on failure. */ +static __always_inline int encap_v6(struct xdp_md *xdp, const __be32 saddr[4], + const __be32 daddr[4], __u8 nexthdr, __u16 payload_len, + const __u8 *dst_mac) +{ + struct ethhdr *new_eth, *old_eth; + void *data, *data_end; + struct ipv6hdr *ip6h; + /* grow the headroom by sizeof(struct ipv6hdr); a non-zero return is a failure */ + if (bpf_xdp_adjust_head(xdp, -(int)sizeof(struct ipv6hdr))) + return -1; + + data = (void *)(long)xdp->data; + data_end = (void *)(long)xdp->data_end; + /* same layout as encap_v4(), with the original Ethernet header sizeof(struct ipv6hdr) bytes in */ + new_eth = data; + ip6h = data + sizeof(struct ethhdr); + old_eth = data + sizeof(struct ipv6hdr); + + if (new_eth + 1 > data_end || old_eth + 1 > data_end || ip6h + 1 > data_end) + return -1; + /* take the MAC from old_eth before the memset of ip6h below */ + __builtin_memcpy(new_eth->h_source, old_eth->h_dest, sizeof(new_eth->h_source)); + __builtin_memcpy(new_eth->h_dest, dst_mac, sizeof(new_eth->h_dest)); + new_eth->h_proto = bpf_htons(ETH_P_IPV6); + /* every field not set below stays zero */ + __builtin_memset(ip6h, 0, sizeof(*ip6h)); + ip6h->version = 6; + ip6h->nexthdr = nexthdr; + ip6h->payload_len = bpf_htons(payload_len); + ip6h->hop_limit
= 64; + __builtin_memcpy(&ip6h->saddr, saddr, sizeof(ip6h->saddr)); + __builtin_memcpy(&ip6h->daddr, daddr, sizeof(ip6h->daddr)); + + return 0; +} +/* Add 1 to v1 and @bytes to v2 of the lb_stats entry at @key in @map; does nothing if the lookup fails. */ +static __always_inline void update_stats(void *map, __u32 key, __u16 bytes) +{ + struct lb_stats *st = bpf_map_lookup_elem(map, &key); + + if (st) { + st->v1 += 1; + st->v2 += bytes; + } +} +/* Increment v1 of the stats entry for @action: XDP_TX and XDP_PASS have their own slots, any other value is counted under STATS_XDP_DROP. */ +static __always_inline void count_action(int action) +{ + struct lb_stats *st; + __u32 key; + + if (action == XDP_TX) + key = STATS_XDP_TX; + else if (action == XDP_PASS) + key = STATS_XDP_PASS; + else + key = STATS_XDP_DROP; + + st = bpf_map_lookup_elem(&stats, &key); + if (st) + st->v1 += 1; +} +/* Rate check on the STATS_NEW_CONN entry: v1 counts calls in the current window and v2 holds the window start time. Returns true once v1 exceeds MAX_CONN_RATE inside a ONE_SEC window, or when the entry cannot be looked up. */ +static __always_inline bool is_under_flood(void) +{ + __u32 key = STATS_NEW_CONN; + struct lb_stats *conn_st = bpf_map_lookup_elem(&stats, &key); + __u64 cur_time; + + if (!conn_st) + return true; + + cur_time = bpf_ktime_get_ns(); + if ((cur_time - conn_st->v2) > ONE_SEC) { + conn_st->v1 = 1; + conn_st->v2 = cur_time; + } else { + conn_st->v1 += 1; + if (conn_st->v1 > MAX_CONN_RATE) + return true; + } + return false; +} +/* Look up @flow in @lru_map. On a hit, store the cached position in *out_pos and return the reals entry at that position (NULL if that lookup fails). Returns NULL without touching *out_pos on a miss or for a UDP entry older than LRU_UDP_TIMEOUT; a live UDP entry gets its atime refreshed. */ +static __always_inline struct real_definition *connection_table_lookup(void *lru_map, + struct flow_key *flow, + __u32 *out_pos) +{ + struct real_pos_lru *dst_lru; + struct real_definition *real; + __u32 key; + + dst_lru = bpf_map_lookup_elem(lru_map, flow); + if (!dst_lru) + return NULL; + + /* UDP connections use atime-based timeout instead of FIN/RST */ + if (flow->proto == IPPROTO_UDP) { + __u64 cur_time = bpf_ktime_get_ns(); + + if (cur_time - dst_lru->atime > LRU_UDP_TIMEOUT) + return NULL; + dst_lru->atime = cur_time; + } + + key = dst_lru->pos; + *out_pos = key; + real = bpf_map_lookup_elem(&reals, &key); + return real; +} +/* Pick the backend for @flow from the ch_rings slot selected by the flow hash within @vip_info's ring. Returns false if the ch_rings or reals lookup fails; on success sets *real and *out_pos and returns true. The choice is cached in @lru_map unless F_LRU_BYPASS is set, is_under_flood() reports true, or @is_rst is set. */ +static __always_inline bool get_packet_dst(struct real_definition **real, struct flow_key *flow, + struct vip_meta *vip_info, bool is_v6, void *lru_map, + bool is_rst, __u32 *out_pos) +{ + bool under_flood; + __u32 hash, ch_key; + __u32 *ch_val; + __u32 real_pos; + + under_flood =
is_under_flood(); + /* hash the source address (first folded to one word for IPv6) together with the ports */ + if (is_v6) { + __u32 src_hash = jhash2_4words((__u32 *)flow->srcv6, MAX_VIPS); + + hash = jhash_2words(src_hash, flow->ports, CH_RING_SIZE); + } else { + hash = jhash_2words(flow->src, flow->ports, CH_RING_SIZE); + } + /* each VIP owns CH_RING_SIZE consecutive ch_rings slots, starting at CH_RING_SIZE * vip_num */ + ch_key = CH_RING_SIZE * vip_info->vip_num + hash % CH_RING_SIZE; + ch_val = bpf_map_lookup_elem(&ch_rings, &ch_key); + if (!ch_val) + return false; + real_pos = *ch_val; + /* the ring slot holds an index into reals */ + *real = bpf_map_lookup_elem(&reals, &real_pos); + if (!(*real)) + return false; + /* remember the choice for this flow; UDP entries also record the current time in atime */ + if (!(vip_info->flags & F_LRU_BYPASS) && !under_flood && !is_rst) { + struct real_pos_lru new_lru = { .pos = real_pos }; + + if (flow->proto == IPPROTO_UDP) + new_lru.atime = bpf_ktime_get_ns(); + bpf_map_update_elem(lru_map, flow, &new_lru, BPF_ANY); + } + + *out_pos = real_pos; + return true; +} +/* Increment lru_miss_stats[@real_idx] when @vip has the same address (vipv6 if @is_v6, else vip), port and proto as the entry at key 0 of vip_miss_stats; otherwise do nothing. */ +static __always_inline void update_vip_lru_miss_stats(struct vip_definition *vip, bool is_v6, + __u32 real_idx) +{ + struct vip_definition *miss_vip; + __u32 key = 0; + __u32 *cnt; + + miss_vip = bpf_map_lookup_elem(&vip_miss_stats, &key); + if (!miss_vip) + return; + + if (is_v6) { + if (miss_vip->vipv6[0] != vip->vipv6[0] || miss_vip->vipv6[1] != vip->vipv6[1] || + miss_vip->vipv6[2] != vip->vipv6[2] || miss_vip->vipv6[3] != vip->vipv6[3]) + return; + } else { + if (miss_vip->vip != vip->vip) + return; + } + + if (miss_vip->port != vip->port || miss_vip->proto != vip->proto) + return; + + cnt = bpf_map_lookup_elem(&lru_miss_stats, &real_idx); + if (cnt) + *cnt += 1; +} + +static __noinline int process_packet(struct xdp_md *xdp) +{ + void *data = (void *)(long)xdp->data; + void *data_end = (void *)(long)xdp->data_end; + struct ethhdr *eth = data; + struct real_definition *dst = NULL; + struct vip_definition vip_def = {}; + struct ctl_value *cval; + struct flow_key flow = {}; + struct vip_meta *vip_info; + struct lb_stats *data_stats; + struct udphdr *uh; + __be32 tnl_src[4]; + void *lru_map; + void *l4; + __u16 payload_len; + __u32 real_pos = 0, cpu_num, key; + __u8 proto; +
int action = XDP_DROP; + bool is_v6, is_syn = false, is_rst = false; + + if (eth + 1 > data_end) + goto out; + + if (eth->h_proto == bpf_htons(ETH_P_IPV6)) { + is_v6 = true; + } else if (eth->h_proto == bpf_htons(ETH_P_IP)) { + is_v6 = false; + } else { + action = XDP_PASS; + goto out; + } + + if (is_v6) { + struct ipv6hdr *ip6h = (void *)(eth + 1); + + if (ip6h + 1 > data_end) + goto out; + if (ip6h->nexthdr == IPPROTO_FRAGMENT) + goto out; + + payload_len = sizeof(struct ipv6hdr) + bpf_ntohs(ip6h->payload_len); + proto = ip6h->nexthdr; + + __builtin_memcpy(flow.srcv6, &ip6h->saddr, sizeof(flow.srcv6)); + __builtin_memcpy(flow.dstv6, &ip6h->daddr, sizeof(flow.dstv6)); + __builtin_memcpy(vip_def.vipv6, &ip6h->daddr, sizeof(vip_def.vipv6)); + l4 = (void *)(ip6h + 1); + } else { + struct iphdr *iph = (void *)(eth + 1); + + if (iph + 1 > data_end) + goto out; + if (iph->ihl != 5) + goto out; + if (iph->frag_off & bpf_htons(PCKT_FRAGMENTED)) + goto out; + + payload_len = bpf_ntohs(iph->tot_len); + proto = iph->protocol; + + flow.src = iph->saddr; + flow.dst = iph->daddr; + vip_def.vip = iph->daddr; + l4 = (void *)(iph + 1); + } + + /* TCP and UDP share the same port layout at offset 0 */ + if (proto != IPPROTO_TCP && proto != IPPROTO_UDP) { + action = XDP_PASS; + goto out; + } + + uh = l4; + if ((void *)(uh + 1) > data_end) + goto out; + flow.port16[0] = uh->source; + flow.port16[1] = uh->dest; + + if (proto == IPPROTO_TCP) { + struct tcphdr *th = l4; + + if ((void *)(th + 1) > data_end) + goto out; + is_syn = th->syn; + is_rst = th->rst; + } + + flow.proto = proto; + vip_def.port = flow.port16[1]; + vip_def.proto = proto; + + vip_info = bpf_map_lookup_elem(&vip_map, &vip_def); + if (!vip_info) { + action = XDP_PASS; + goto out; + } + + key = STATS_LRU; + data_stats = bpf_map_lookup_elem(&stats, &key); + if (!data_stats) + goto out; + data_stats->v1 += 1; + + cpu_num = bpf_get_smp_processor_id(); + lru_map = bpf_map_lookup_elem(&lru_mapping, &cpu_num); + if (!lru_map) 
+ goto out; + + if (!(vip_info->flags & F_LRU_BYPASS) && !is_syn) + dst = connection_table_lookup(lru_map, &flow, &real_pos); + + if (!dst) { + if (flow.proto == IPPROTO_TCP) { + struct lb_stats *miss_st; + + key = STATS_LRU_MISS; + miss_st = bpf_map_lookup_elem(&stats, &key); + if (miss_st) + miss_st->v1 += 1; + } + + if (!get_packet_dst(&dst, &flow, vip_info, is_v6, lru_map, is_rst, &real_pos)) + goto out; + + update_vip_lru_miss_stats(&vip_def, is_v6, real_pos); + data_stats->v2 += 1; + } + + key = 0; + cval = bpf_map_lookup_elem(&ctl_array, &key); + if (!cval) + goto out; + + update_stats(&stats, vip_info->vip_num, payload_len); + update_stats(&reals_stats, real_pos, payload_len); + + if (is_v6) { + create_encap_ipv6_src(flow.port16[0], flow.srcv6[0], tnl_src); + if (encap_v6(xdp, tnl_src, dst->dstv6, IPPROTO_IPV6, payload_len, cval->mac)) + goto out; + } else if (dst->flags & F_IPV6) { + create_encap_ipv6_src(flow.port16[0], flow.src, tnl_src); + if (encap_v6(xdp, tnl_src, dst->dstv6, IPPROTO_IPIP, payload_len, cval->mac)) + goto out; + } else { + if (encap_v4(xdp, create_encap_ipv4_src(flow.port16[0], flow.src), dst->dst, + payload_len, cval->mac)) + goto out; + } + + action = XDP_TX; + +out: + count_action(action); + return action; +} + +static __always_inline int strip_encap(struct xdp_md *xdp, const struct ethhdr *saved_eth) +{ + void *data = (void *)(long)xdp->data; + void *data_end = (void *)(long)xdp->data_end; + struct ethhdr *eth = data; + int hdr_sz; + + if (eth + 1 > data_end) + return -1; + + hdr_sz = (eth->h_proto == bpf_htons(ETH_P_IPV6)) ? 
(int)sizeof(struct ipv6hdr) + : (int)sizeof(struct iphdr); + + if (bpf_xdp_adjust_head(xdp, hdr_sz)) + return -1; + + data = (void *)(long)xdp->data; + data_end = (void *)(long)xdp->data_end; + eth = data; + + if (eth + 1 > data_end) + return -1; + + __builtin_memcpy(eth, saved_eth, sizeof(*saved_eth)); + return 0; +} + +static __always_inline void randomize_src(struct xdp_md *xdp, int saddr_off, __u32 *rand_state) +{ + void *data = (void *)(long)xdp->data; + void *data_end = (void *)(long)xdp->data_end; + __u32 *saddr = data + saddr_off; + + *rand_state ^= *rand_state << 13; + *rand_state ^= *rand_state >> 17; + *rand_state ^= *rand_state << 5; + + if ((void *)(saddr + 1) <= data_end) + *saddr = *rand_state & flow_mask; +} + +SEC("xdp") +int xdp_lb_bench(struct xdp_md *xdp) +{ + void *data = (void *)(long)xdp->data; + void *data_end = (void *)(long)xdp->data_end; + struct ethhdr *eth = data; + struct ethhdr saved_eth; + __u32 rand_state = 0; + __u32 batch_hash = 0; + int saddr_off = 0; + bool is_v6; + + if (eth + 1 > data_end) + return XDP_DROP; + + __builtin_memcpy(&saved_eth, eth, sizeof(saved_eth)); + + is_v6 = (saved_eth.h_proto == bpf_htons(ETH_P_IPV6)); + + saddr_off = sizeof(struct ethhdr) + (is_v6 ? 
offsetof(struct ipv6hdr, saddr) : + offsetof(struct iphdr, saddr)); + + if (flow_mask) + rand_state = bpf_get_prandom_u32() | 1; + + if (cold_lru) { + __u32 *saddr = data + saddr_off; + + batch_gen++; + batch_hash = (batch_gen + bpf_get_smp_processor_id()) * KNUTH_HASH_MULT; + if ((void *)(saddr + 1) <= data_end) + *saddr ^= batch_hash; + } + + return BENCH_BPF_LOOP( + process_packet(xdp), + ({ + if (__bench_result == XDP_TX) { + if (strip_encap(xdp, &saved_eth)) + return XDP_DROP; + if (rand_state) + randomize_src(xdp, saddr_off, &rand_state); + } + if (cold_lru) { + void *d = (void *)(long)xdp->data; + void *de = (void *)(long)xdp->data_end; + __u32 *__sa = d + saddr_off; + + if ((void *)(__sa + 1) <= de) + *__sa ^= batch_hash; + } + }) + ); +} + +char _license[] SEC("license") = "GPL"; diff --git a/tools/testing/selftests/bpf/progs/xdping_kern.c b/tools/testing/selftests/bpf/progs/xdping_kern.c deleted file mode 100644 index 44e2b0ef23ae..000000000000 --- a/tools/testing/selftests/bpf/progs/xdping_kern.c +++ /dev/null @@ -1,183 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -/* Copyright (c) 2019, Oracle and/or its affiliates. All rights reserved. 
*/ - -#define KBUILD_MODNAME "foo" -#include <stddef.h> -#include <string.h> -#include <linux/bpf.h> -#include <linux/icmp.h> -#include <linux/in.h> -#include <linux/if_ether.h> -#include <linux/if_packet.h> -#include <linux/if_vlan.h> -#include <linux/ip.h> - -#include <bpf/bpf_helpers.h> -#include <bpf/bpf_endian.h> - -#include "bpf_compiler.h" -#include "xdping.h" - -struct { - __uint(type, BPF_MAP_TYPE_HASH); - __uint(max_entries, 256); - __type(key, __u32); - __type(value, struct pinginfo); -} ping_map SEC(".maps"); - -static __always_inline void swap_src_dst_mac(void *data) -{ - unsigned short *p = data; - unsigned short dst[3]; - - dst[0] = p[0]; - dst[1] = p[1]; - dst[2] = p[2]; - p[0] = p[3]; - p[1] = p[4]; - p[2] = p[5]; - p[3] = dst[0]; - p[4] = dst[1]; - p[5] = dst[2]; -} - -static __always_inline __u16 csum_fold_helper(__wsum sum) -{ - sum = (sum & 0xffff) + (sum >> 16); - return ~((sum & 0xffff) + (sum >> 16)); -} - -static __always_inline __u16 ipv4_csum(void *data_start, int data_size) -{ - __wsum sum; - - sum = bpf_csum_diff(0, 0, data_start, data_size, 0); - return csum_fold_helper(sum); -} - -#define ICMP_ECHO_LEN 64 - -static __always_inline int icmp_check(struct xdp_md *ctx, int type) -{ - void *data_end = (void *)(long)ctx->data_end; - void *data = (void *)(long)ctx->data; - struct ethhdr *eth = data; - struct icmphdr *icmph; - struct iphdr *iph; - - if (data + sizeof(*eth) + sizeof(*iph) + ICMP_ECHO_LEN > data_end) - return XDP_PASS; - - if (eth->h_proto != bpf_htons(ETH_P_IP)) - return XDP_PASS; - - iph = data + sizeof(*eth); - - if (iph->protocol != IPPROTO_ICMP) - return XDP_PASS; - - if (bpf_ntohs(iph->tot_len) - sizeof(*iph) != ICMP_ECHO_LEN) - return XDP_PASS; - - icmph = data + sizeof(*eth) + sizeof(*iph); - - if (icmph->type != type) - return XDP_PASS; - - return XDP_TX; -} - -SEC("xdp") -int xdping_client(struct xdp_md *ctx) -{ - void *data = (void *)(long)ctx->data; - struct pinginfo *pinginfo = NULL; - struct ethhdr *eth = data; - 
struct icmphdr *icmph; - struct iphdr *iph; - __u64 recvtime; - __be32 raddr; - __be16 seq; - int ret; - __u8 i; - - ret = icmp_check(ctx, ICMP_ECHOREPLY); - - if (ret != XDP_TX) - return ret; - - iph = data + sizeof(*eth); - icmph = data + sizeof(*eth) + sizeof(*iph); - raddr = iph->saddr; - - /* Record time reply received. */ - recvtime = bpf_ktime_get_ns(); - pinginfo = bpf_map_lookup_elem(&ping_map, &raddr); - if (!pinginfo || pinginfo->seq != icmph->un.echo.sequence) - return XDP_PASS; - - if (pinginfo->start) { - __pragma_loop_unroll_full - for (i = 0; i < XDPING_MAX_COUNT; i++) { - if (pinginfo->times[i] == 0) - break; - } - /* verifier is fussy here... */ - if (i < XDPING_MAX_COUNT) { - pinginfo->times[i] = recvtime - - pinginfo->start; - pinginfo->start = 0; - i++; - } - /* No more space for values? */ - if (i == pinginfo->count || i == XDPING_MAX_COUNT) - return XDP_PASS; - } - - /* Now convert reply back into echo request. */ - swap_src_dst_mac(data); - iph->saddr = iph->daddr; - iph->daddr = raddr; - icmph->type = ICMP_ECHO; - seq = bpf_htons(bpf_ntohs(icmph->un.echo.sequence) + 1); - icmph->un.echo.sequence = seq; - icmph->checksum = 0; - icmph->checksum = ipv4_csum(icmph, ICMP_ECHO_LEN); - - pinginfo->seq = seq; - pinginfo->start = bpf_ktime_get_ns(); - - return XDP_TX; -} - -SEC("xdp") -int xdping_server(struct xdp_md *ctx) -{ - void *data = (void *)(long)ctx->data; - struct ethhdr *eth = data; - struct icmphdr *icmph; - struct iphdr *iph; - __be32 raddr; - int ret; - - ret = icmp_check(ctx, ICMP_ECHO); - - if (ret != XDP_TX) - return ret; - - iph = data + sizeof(*eth); - icmph = data + sizeof(*eth) + sizeof(*iph); - raddr = iph->saddr; - - /* Now convert request into echo reply. 
*/ - swap_src_dst_mac(data); - iph->saddr = iph->daddr; - iph->daddr = raddr; - icmph->type = ICMP_ECHOREPLY; - icmph->checksum = 0; - icmph->checksum = ipv4_csum(icmph, ICMP_ECHO_LEN); - - return XDP_TX; -} - -char _license[] SEC("license") = "GPL"; diff --git a/tools/testing/selftests/bpf/test_kmods/Makefile b/tools/testing/selftests/bpf/test_kmods/Makefile index 63c4d3f6a12f..031c7454ce65 100644 --- a/tools/testing/selftests/bpf/test_kmods/Makefile +++ b/tools/testing/selftests/bpf/test_kmods/Makefile @@ -1,5 +1,16 @@ TEST_KMOD_DIR := $(realpath $(dir $(abspath $(lastword $(MAKEFILE_LIST))))) -KDIR ?= $(abspath $(TEST_KMOD_DIR)/../../../../..) +SRCTREE_KDIR := $(abspath $(TEST_KMOD_DIR)/../../../../..) +# Honor O=/KBUILD_OUTPUT only if they point at a prepared kernel build +# directory (one containing Module.symvers); otherwise treat the value as a +# selftests-only output directory and fall back to in-tree or distro headers. +# The parent bpf/Makefile resolves O=/KBUILD_OUTPUT to absolute paths before +# invoking this sub-make so relative paths still anchor to the user's +# invocation directory. +KMOD_O := $(or $(O),$(KBUILD_OUTPUT)) +KMOD_O_VALID := $(if $(KMOD_O),$(if $(wildcard $(KMOD_O)/Module.symvers),$(KMOD_O))) +KDIR ?= $(if $(KMOD_O_VALID),$(SRCTREE_KDIR), \ + $(if $(wildcard $(SRCTREE_KDIR)/Module.symvers),$(SRCTREE_KDIR), \ + /lib/modules/$(shell uname -r)/build)) ifeq ($(V),1) Q = @@ -14,8 +25,21 @@ $(foreach m,$(MODULES),$(eval obj-m += $(m:.ko=.o))) CFLAGS_bpf_testmod.o = -I$(src) +# When BPF_STRICT_BUILD != 0, a missing KDIR is fatal (the default). +# When permissive, skip silently. 
+PERMISSIVE := $(filter 0,$(BPF_STRICT_BUILD)) + all: - $(Q)$(MAKE) -C $(KDIR) M=$(TEST_KMOD_DIR) modules +ifeq ($(PERMISSIVE),) + $(Q)$(MAKE) -C $(KDIR) $(if $(KMOD_O_VALID),O=$(KMOD_O_VALID) KBUILD_OUTPUT=$(KMOD_O_VALID),KBUILD_OUTPUT=) \ + M=$(TEST_KMOD_DIR) modules +else ifneq ("$(wildcard $(KDIR))", "") + $(Q)$(MAKE) -C $(KDIR) $(if $(KMOD_O_VALID),O=$(KMOD_O_VALID) KBUILD_OUTPUT=$(KMOD_O_VALID),KBUILD_OUTPUT=) \ + M=$(TEST_KMOD_DIR) modules +endif clean: - $(Q)$(MAKE) -C $(KDIR) M=$(TEST_KMOD_DIR) clean +ifneq ("$(wildcard $(KDIR))", "") + $(Q)$(MAKE) -C $(KDIR) $(if $(KMOD_O_VALID),O=$(KMOD_O_VALID) KBUILD_OUTPUT=$(KMOD_O_VALID),KBUILD_OUTPUT=) \ + M=$(TEST_KMOD_DIR) clean +endif diff --git a/tools/testing/selftests/bpf/test_kmods/bpf_testmod.c b/tools/testing/selftests/bpf/test_kmods/bpf_testmod.c index d876314a4d67..30f1cd23093c 100644 --- a/tools/testing/selftests/bpf/test_kmods/bpf_testmod.c +++ b/tools/testing/selftests/bpf/test_kmods/bpf_testmod.c @@ -825,6 +825,76 @@ __bpf_kfunc int bpf_kfunc_call_test5(u8 a, u16 b, u32 c) return 0; } +__bpf_kfunc u64 bpf_kfunc_call_stack_arg(u64 a, u64 b, u64 c, u64 d, + u64 e, u64 f, u64 g, u64 h, + u64 i, u64 j) +{ + return a + b + c + d + e + f + g + h + i + j; +} + +__bpf_kfunc u64 bpf_kfunc_call_stack_arg_ptr(u64 a, u64 b, u64 c, u64 d, u64 e, + u64 f, u64 g, u64 h, u64 i, + struct prog_test_pass1 *p) +{ + return a + b + c + d + e + f + g + h + i + p->x0 + p->x1; +} + +__bpf_kfunc u64 bpf_kfunc_call_stack_arg_mix(u64 a, u64 b, u64 c, u64 d, u64 e, + u64 f, u64 g, + struct prog_test_pass1 *p, u64 h, + struct prog_test_pass1 *q) +{ + return a + b + c + d + e + f + g + p->x0 + h + q->x1; +} + +__bpf_kfunc u64 bpf_kfunc_call_stack_arg_dynptr(u64 a, u64 b, u64 c, u64 d, u64 e, + u64 f, u64 g, u64 h, u64 i, + struct bpf_dynptr *ptr) +{ + const struct bpf_dynptr_kern *kern_ptr = (void *)ptr; + + return a + b + c + d + e + f + g + h + i + (kern_ptr->size & 0xFFFFFF); +} + +__bpf_kfunc u64 
bpf_kfunc_call_stack_arg_mem(u64 a, u64 b, u64 c, u64 d, u64 e, + void *mem, int mem__sz) +{ + const unsigned char *p = mem; + u64 sum = a + b + c + d + e; + int i; + + for (i = 0; i < mem__sz; i++) + sum += p[i]; + return sum; +} + +__bpf_kfunc u64 bpf_kfunc_call_stack_arg_iter(u64 a, u64 b, u64 c, u64 d, u64 e, + u64 f, u64 g, u64 h, u64 i, + struct bpf_iter_testmod_seq *it__iter) +{ + return a + b + c + d + e + f + g + h + i + it__iter->value; +} + +__bpf_kfunc u64 bpf_kfunc_call_stack_arg_const_str(u64 a, u64 b, u64 c, u64 d, u64 e, + u64 f, u64 g, u64 h, u64 i, + const char *str__str) +{ + return a + b + c + d + e + f + g + h + i; +} + +__bpf_kfunc u64 bpf_kfunc_call_stack_arg_timer(u64 a, u64 b, u64 c, u64 d, u64 e, + u64 f, u64 g, u64 h, u64 i, + struct bpf_timer *timer) +{ + return a + b + c + d + e + f + g + h + i; +} + +__bpf_kfunc u64 bpf_kfunc_call_stack_arg_big(u64 a, u64 b, u64 c, u64 d, u64 e, + struct prog_test_big_arg s) +{ + return a + b + c + d + e + s.a + s.b; +} + static struct prog_test_ref_kfunc prog_test_struct = { .a = 42, .b = 108, @@ -1288,6 +1358,15 @@ BTF_ID_FLAGS(func, bpf_kfunc_call_test2) BTF_ID_FLAGS(func, bpf_kfunc_call_test3) BTF_ID_FLAGS(func, bpf_kfunc_call_test4) BTF_ID_FLAGS(func, bpf_kfunc_call_test5) +BTF_ID_FLAGS(func, bpf_kfunc_call_stack_arg) +BTF_ID_FLAGS(func, bpf_kfunc_call_stack_arg_ptr) +BTF_ID_FLAGS(func, bpf_kfunc_call_stack_arg_mix) +BTF_ID_FLAGS(func, bpf_kfunc_call_stack_arg_dynptr) +BTF_ID_FLAGS(func, bpf_kfunc_call_stack_arg_mem) +BTF_ID_FLAGS(func, bpf_kfunc_call_stack_arg_iter) +BTF_ID_FLAGS(func, bpf_kfunc_call_stack_arg_const_str) +BTF_ID_FLAGS(func, bpf_kfunc_call_stack_arg_timer) +BTF_ID_FLAGS(func, bpf_kfunc_call_stack_arg_big) BTF_ID_FLAGS(func, bpf_kfunc_call_test_mem_len_fail1) BTF_ID_FLAGS(func, bpf_kfunc_call_test_mem_len_fail2) BTF_ID_FLAGS(func, bpf_kfunc_call_test_acquire, KF_ACQUIRE | KF_RET_NULL) diff --git a/tools/testing/selftests/bpf/test_kmods/bpf_testmod_kfunc.h 
b/tools/testing/selftests/bpf/test_kmods/bpf_testmod_kfunc.h index aa0b8d41e71b..c36bb911defa 100644 --- a/tools/testing/selftests/bpf/test_kmods/bpf_testmod_kfunc.h +++ b/tools/testing/selftests/bpf/test_kmods/bpf_testmod_kfunc.h @@ -26,6 +26,8 @@ struct prog_test_ref_kfunc { }; #endif +struct bpf_iter_testmod_seq; + struct prog_test_pass1 { int x0; struct { @@ -48,6 +50,11 @@ struct prog_test_pass2 { } x; }; +struct prog_test_big_arg { + __u64 a; + __u64 b; +}; + struct prog_test_fail1 { void *p; int x; @@ -111,6 +118,32 @@ int bpf_kfunc_call_test2(struct sock *sk, __u32 a, __u32 b) __ksym; struct sock *bpf_kfunc_call_test3(struct sock *sk) __ksym; long bpf_kfunc_call_test4(signed char a, short b, int c, long d) __ksym; int bpf_kfunc_call_test5(__u8 a, __u16 b, __u32 c) __ksym; +__u64 bpf_kfunc_call_stack_arg(__u64 a, __u64 b, __u64 c, __u64 d, + __u64 e, __u64 f, __u64 g, __u64 h, + __u64 i, __u64 j) __ksym; +__u64 bpf_kfunc_call_stack_arg_ptr(__u64 a, __u64 b, __u64 c, __u64 d, __u64 e, + __u64 f, __u64 g, __u64 h, __u64 i, + struct prog_test_pass1 *p) __ksym; +__u64 bpf_kfunc_call_stack_arg_mix(__u64 a, __u64 b, __u64 c, __u64 d, __u64 e, + __u64 f, __u64 g, + struct prog_test_pass1 *p, __u64 h, + struct prog_test_pass1 *q) __ksym; +__u64 bpf_kfunc_call_stack_arg_dynptr(__u64 a, __u64 b, __u64 c, __u64 d, __u64 e, + __u64 f, __u64 g, __u64 h, __u64 i, + struct bpf_dynptr *ptr) __ksym; +__u64 bpf_kfunc_call_stack_arg_mem(__u64 a, __u64 b, __u64 c, __u64 d, __u64 e, + void *mem, int mem__sz) __ksym; +__u64 bpf_kfunc_call_stack_arg_iter(__u64 a, __u64 b, __u64 c, __u64 d, __u64 e, + __u64 f, __u64 g, __u64 h, __u64 i, + struct bpf_iter_testmod_seq *it__iter) __ksym; +__u64 bpf_kfunc_call_stack_arg_const_str(__u64 a, __u64 b, __u64 c, __u64 d, __u64 e, + __u64 f, __u64 g, __u64 h, __u64 i, + const char *str__str) __ksym; +__u64 bpf_kfunc_call_stack_arg_timer(__u64 a, __u64 b, __u64 c, __u64 d, __u64 e, + __u64 f, __u64 g, __u64 h, __u64 i, + struct bpf_timer 
*timer) __ksym; +__u64 bpf_kfunc_call_stack_arg_big(__u64 a, __u64 b, __u64 c, __u64 d, __u64 e, + struct prog_test_big_arg s) __ksym; void bpf_kfunc_call_test_pass_ctx(struct __sk_buff *skb) __ksym; void bpf_kfunc_call_test_pass1(struct prog_test_pass1 *p) __ksym; diff --git a/tools/testing/selftests/bpf/test_lirc_mode2_user.c b/tools/testing/selftests/bpf/test_lirc_mode2_user.c index 88e4aeab21b7..cd191da20d14 100644 --- a/tools/testing/selftests/bpf/test_lirc_mode2_user.c +++ b/tools/testing/selftests/bpf/test_lirc_mode2_user.c @@ -50,8 +50,8 @@ int main(int argc, char **argv) { struct bpf_object *obj; int ret, lircfd, progfd, inputfd; - int testir1 = 0x1dead; - int testir2 = 0x20101; + int testir1 = 0x1ead; + int testir2 = 0x2101; u32 prog_ids[10], prog_flags[10], prog_cnt; if (argc != 3) { @@ -125,7 +125,7 @@ int main(int argc, char **argv) } if (event.type == EV_MSC && event.code == MSC_SCAN && - event.value == 0xdead) { + event.value == 0x1ead) { break; } } diff --git a/tools/testing/selftests/bpf/test_loader.c b/tools/testing/selftests/bpf/test_loader.c index c4c34cae6102..3ce32d134e2c 100644 --- a/tools/testing/selftests/bpf/test_loader.c +++ b/tools/testing/selftests/bpf/test_loader.c @@ -63,6 +63,7 @@ struct test_spec { struct test_subspec priv; struct test_subspec unpriv; const char *btf_custom_path; + const char *btf_custom_func_path; int log_level; int prog_flags; int mode_mask; @@ -93,7 +94,7 @@ void test_loader_fini(struct test_loader *tester) free(tester->log_buf); } -static void free_msgs(struct expected_msgs *msgs) +void free_msgs(struct expected_msgs *msgs) { int i; @@ -376,6 +377,7 @@ enum arch { ARCH_ARM64 = 0x4, ARCH_RISCV64 = 0x8, ARCH_S390X = 0x10, + ARCH_LOONGARCH = 0x20, }; static int get_current_arch(void) @@ -388,6 +390,8 @@ static int get_current_arch(void) return ARCH_RISCV64; #elif defined(__s390x__) return ARCH_S390X; +#elif defined(__loongarch__) + return ARCH_LOONGARCH; #endif return ARCH_UNKNOWN; } @@ -579,6 +583,8 @@ static int 
parse_test_spec(struct test_loader *tester, arch = ARCH_RISCV64; } else if (strcmp(val, "s390x") == 0) { arch = ARCH_S390X; + } else if (strcmp(val, "LOONGARCH") == 0) { + arch = ARCH_LOONGARCH; } else { PRINT_FAIL("bad arch spec: '%s'\n", val); err = -EINVAL; @@ -590,6 +596,8 @@ static int parse_test_spec(struct test_loader *tester, jit_on_next_line = true; } else if ((val = str_has_pfx(s, "test_btf_path="))) { spec->btf_custom_path = val; + } else if ((val = str_has_pfx(s, "test_btf_func_path="))) { + spec->btf_custom_func_path = val; } else if ((val = str_has_pfx(s, "test_caps_unpriv="))) { err = parse_caps(val, &spec->unpriv.caps, "test caps"); if (err) @@ -789,6 +797,43 @@ static void emit_stderr(const char *stderr, bool force) fprintf(stdout, "STDERR:\n=============\n%s=============\n", stderr); } +static void verify_stderr(int prog_fd, struct expected_msgs *msgs) +{ + LIBBPF_OPTS(bpf_prog_stream_read_opts, ropts); + char *buf; + int ret; + + if (!msgs->cnt) + return; + + buf = malloc(TEST_LOADER_LOG_BUF_SZ); + if (!ASSERT_OK_PTR(buf, "malloc")) + return; + + ret = bpf_prog_stream_read(prog_fd, 2, buf, TEST_LOADER_LOG_BUF_SZ - 1, + &ropts); + if (ret > 0) { + buf[ret] = '\0'; + emit_stderr(buf, false); + validate_msgs(buf, msgs, emit_stderr); + } else { + ASSERT_GT(ret, 0, "stderr stream read"); + } + + free(buf); +} + +void verify_test_stderr(struct bpf_object *obj, struct bpf_program *prog) +{ + struct test_spec spec = {}; + + if (parse_test_spec(NULL, obj, prog, &spec)) + return; + + verify_stderr(bpf_program__fd(prog), &spec.priv.stderr); + free_test_spec(&spec); +} + static void emit_stdout(const char *bpf_stdout, bool force) { if (!force && env.verbosity == VERBOSE_NONE) @@ -1138,6 +1183,123 @@ static int get_stream(int stream_id, int prog_fd, char *text, size_t text_sz) return ret; } +/* + * Fix up the program's BTF using BTF from a separate file. + * + * For __naked subprogs, clang drops parameter names from BTF. 
Find FUNC + * entries with anonymous parameters and replace their FUNC_PROTO with the + * properly-named version from the custom file. + */ +static int fixup_btf_from_path(struct bpf_object *obj, const char *path) +{ + struct btf *prog_btf, *custom_btf; + __u32 i, j, cnt, custom_cnt; + int err = 0; + + prog_btf = bpf_object__btf(obj); + if (!prog_btf) + return 0; + + custom_btf = btf__parse(path, NULL); + if (!ASSERT_OK_PTR(custom_btf, "parse_custom_btf")) + return -EINVAL; + + cnt = btf__type_cnt(prog_btf); + custom_cnt = btf__type_cnt(custom_btf); + + /* Fix up FUNC entries with anonymous params. + * Save all data from prog_btf BEFORE calling btf__add_*, + * since those calls may reallocate the BTF data buffer + * and invalidate any pointers obtained from btf__type_by_id. + */ + for (i = 1; i < cnt; i++) { + const struct btf_type *t = btf__type_by_id(prog_btf, i); + const struct btf_type *fp, *custom_t, *custom_fp; + const struct btf_param *params, *custom_params; + __u32 ret_type_id, vlen; + __u32 *prog_param_types = NULL; + const char *name; + int new_proto_id; + + if (!btf_is_func(t)) + continue; + + fp = btf__type_by_id(prog_btf, t->type); + if (!fp || !btf_is_func_proto(fp) || btf_vlen(fp) == 0) + continue; + + /* Check if any param is anonymous */ + params = btf_params(fp); + if (params[0].name_off != 0) + continue; + + /* Find matching FUNC by name in custom BTF */ + name = btf__name_by_offset(prog_btf, t->name_off); + if (!name) + continue; + + for (j = 1; j < custom_cnt; j++) { + const char *cname; + + custom_t = btf__type_by_id(custom_btf, j); + if (!btf_is_func(custom_t)) + continue; + cname = btf__name_by_offset(custom_btf, custom_t->name_off); + if (cname && strcmp(name, cname) == 0) + break; + } + if (j >= custom_cnt) + continue; + + custom_fp = btf__type_by_id(custom_btf, custom_t->type); + if (!custom_fp || !btf_is_func_proto(custom_fp)) + continue; + + vlen = btf_vlen(fp); + if (vlen != btf_vlen(custom_fp)) + continue; + + /* Save data before 
btf__add_* calls invalidate pointers */ + ret_type_id = fp->type; + prog_param_types = malloc(vlen * sizeof(*prog_param_types)); + if (!prog_param_types) { + err = -ENOMEM; + break; + } + for (j = 0; j < vlen; j++) + prog_param_types[j] = params[j].type; + + /* Add a new FUNC_PROTO: param names from custom, types from prog */ + new_proto_id = btf__add_func_proto(prog_btf, ret_type_id); + if (new_proto_id < 0) { + err = new_proto_id; + free(prog_param_types); + break; + } + + custom_params = btf_params(custom_fp); + for (j = 0; j < vlen; j++) { + const char *pname; + + pname = btf__name_by_offset(custom_btf, custom_params[j].name_off); + err = btf__add_func_param(prog_btf, pname ?: "", prog_param_types[j]); + if (err) + break; + } + free(prog_param_types); + if (err) + break; + + /* Update the FUNC to point to the new FUNC_PROTO (re-fetch + * since btf__add_* may have reallocated the data buffer). + */ + ((struct btf_type *)btf__type_by_id(prog_btf, i))->type = new_proto_id; + } + + btf__free(custom_btf); + return err; +} + /* this function is forced noinline and has short generic name to look better * in test_progs output (in case of a failure) */ @@ -1194,13 +1356,27 @@ void run_subtest(struct test_loader *tester, } } - /* Implicitly reset to NULL if next test case doesn't specify */ + /* Implicitly reset to NULL if next test case doesn't specify. + * btf_custom_func_path also serves as btf_custom_path for kfunc resolution. 
+ */ open_opts->btf_custom_path = spec->btf_custom_path; + if (!open_opts->btf_custom_path) + open_opts->btf_custom_path = spec->btf_custom_func_path; tobj = bpf_object__open_mem(obj_bytes, obj_byte_cnt, open_opts); if (!ASSERT_OK_PTR(tobj, "obj_open_mem")) /* shouldn't happen */ goto subtest_cleanup; + /* Fix up __naked subprog BTF using a separate file with named params */ + if (spec->btf_custom_func_path) { + err = fixup_btf_from_path(tobj, spec->btf_custom_func_path); + if (err) { + PRINT_FAIL("failed to fixup BTF from %s: %d\n", + spec->btf_custom_func_path, err); + goto tobj_cleanup; + } + } + i = 0; bpf_object__for_each_program(tprog_iter, tobj) { spec_iter = &specs[i++]; @@ -1314,17 +1490,7 @@ void run_subtest(struct test_loader *tester, goto tobj_cleanup; } - if (subspec->stderr.cnt) { - err = get_stream(2, bpf_program__fd(tprog), - tester->log_buf, tester->log_buf_sz); - if (err <= 0) { - PRINT_FAIL("Unexpected retval from get_stream(): %d, errno = %d\n", - err, errno); - goto tobj_cleanup; - } - emit_stderr(tester->log_buf, false /*force*/); - validate_msgs(tester->log_buf, &subspec->stderr, emit_stderr); - } + verify_stderr(bpf_program__fd(tprog), &subspec->stderr); if (subspec->stdout.cnt) { err = get_stream(1, bpf_program__fd(tprog), diff --git a/tools/testing/selftests/bpf/test_maps.c b/tools/testing/selftests/bpf/test_maps.c index ccc5acd55ff9..c32da7bd8be2 100644 --- a/tools/testing/selftests/bpf/test_maps.c +++ b/tools/testing/selftests/bpf/test_maps.c @@ -260,6 +260,16 @@ static void test_hashmap_percpu(unsigned int task, void *data) close(fd); } +#define MAP_RETRIES 20 + +static bool can_retry(int err) +{ + return (err == EAGAIN || err == EBUSY || + ((err == ENOMEM || err == E2BIG) && + map_opts.map_flags == BPF_F_NO_PREALLOC)); +} + + #define VALUE_SIZE 3 static int helper_fill_hashmap(int max_entries) { @@ -274,10 +284,11 @@ static int helper_fill_hashmap(int max_entries) for (i = 0; i < max_entries; i++) { key = i; value[0] = key; - ret = 
bpf_map_update_elem(fd, &key, value, BPF_NOEXIST); + ret = map_update_retriable(fd, &key, value, BPF_NOEXIST, + MAP_RETRIES, can_retry); CHECK(ret != 0, "can't update hashmap", - "err: %s\n", strerror(ret)); + "err: %s\n", strerror(-ret)); } return fd; @@ -1392,17 +1403,9 @@ static void test_map_stress(void) #define DO_UPDATE 1 #define DO_DELETE 0 -#define MAP_RETRIES 20 #define MAX_DELAY_US 50000 #define MIN_DELAY_RANGE_US 5000 -static bool can_retry(int err) -{ - return (err == EAGAIN || err == EBUSY || - ((err == ENOMEM || err == E2BIG) && - map_opts.map_flags == BPF_F_NO_PREALLOC)); -} - int map_update_retriable(int map_fd, const void *key, const void *value, int flags, int attempts, retry_for_error_fn need_retry) { diff --git a/tools/testing/selftests/bpf/test_progs.c b/tools/testing/selftests/bpf/test_progs.c index 7fe16b5131b1..7ba82974ee78 100644 --- a/tools/testing/selftests/bpf/test_progs.c +++ b/tools/testing/selftests/bpf/test_progs.c @@ -165,6 +165,8 @@ struct prog_test_def { void (*run_test)(void); void (*run_serial_test)(void); bool should_run; + bool not_built; + bool selected; bool need_cgroup_cleanup; bool should_tmon; }; @@ -372,6 +374,8 @@ static void print_test_result(const struct prog_test_def *test, const struct tes fprintf(env.stdout_saved, "#%-*d %s:", TEST_NUM_WIDTH, test->test_num, test->test_name); if (test_state->error_cnt) fprintf(env.stdout_saved, "FAIL"); + else if (test->not_built) + fprintf(env.stdout_saved, "SKIP (not built)"); else if (!skipped_cnt) fprintf(env.stdout_saved, "OK"); else if (skipped_cnt == subtests_cnt || !subtests_cnt) @@ -1257,7 +1261,7 @@ int get_bpf_max_tramp_links_from(struct btf *btf) const struct btf_type *t; __u32 i, type_cnt; const char *name; - __u16 j, vlen; + __u32 j, vlen; for (i = 1, type_cnt = btf__type_cnt(btf); i < type_cnt; i++) { t = btf__type_by_id(btf, i); @@ -1641,6 +1645,7 @@ static void calculate_summary_and_print_errors(struct test_env *env) json_writer_t *w = NULL; for (i = 0; i < 
prog_test_cnt; i++) { + struct prog_test_def *test = &prog_test_defs[i]; struct test_state *state = &test_states[i]; if (!state->tested) @@ -1651,7 +1656,7 @@ static void calculate_summary_and_print_errors(struct test_env *env) if (state->error_cnt) fail_cnt++; - else + else if (!test->not_built) succ_cnt++; } @@ -1700,8 +1705,13 @@ static void calculate_summary_and_print_errors(struct test_env *env) if (env->json) fclose(env->json); - printf("Summary: %d/%d PASSED, %d SKIPPED, %d FAILED\n", - succ_cnt, sub_succ_cnt, skip_cnt, fail_cnt); + if (env->not_built_cnt) + printf("Summary: %d/%d PASSED, %d SKIPPED (%d not built), %d FAILED\n", + succ_cnt, sub_succ_cnt, skip_cnt, env->not_built_cnt, + fail_cnt); + else + printf("Summary: %d/%d PASSED, %d SKIPPED, %d FAILED\n", + succ_cnt, sub_succ_cnt, skip_cnt, fail_cnt); env->succ_cnt = succ_cnt; env->sub_succ_cnt = sub_succ_cnt; @@ -1772,6 +1782,19 @@ static void server_main(void) run_one_test(i); } + /* mark not-built tests as skipped */ + for (int i = 0; i < prog_test_cnt; i++) { + struct prog_test_def *test = &prog_test_defs[i]; + struct test_state *state = &test_states[i]; + + if (test->not_built && test->selected) { + state->tested = true; + state->skip_cnt = 1; + env.not_built_cnt++; + print_test_result(test, state); + } + } + /* generate summary */ fflush(stderr); fflush(stdout); @@ -2046,15 +2069,20 @@ int main(int argc, char **argv) struct prog_test_def *test = &prog_test_defs[i]; test->test_num = i + 1; - test->should_run = should_run(&env.test_selector, - test->test_num, test->test_name); + test->selected = should_run(&env.test_selector, + test->test_num, test->test_name); + test->should_run = test->selected; - if ((test->run_test == NULL && test->run_serial_test == NULL) || - (test->run_test != NULL && test->run_serial_test != NULL)) { + if (test->run_test && test->run_serial_test) { fprintf(stderr, "Test %d:%s must have either test_%s() or serial_test_%sl() defined.\n", test->test_num, test->test_name, 
test->test_name, test->test_name); exit(EXIT_ERR_SETUP_INFRA); } + if (!test->run_test && !test->run_serial_test) { + test->not_built = true; + test->should_run = false; + continue; + } if (test->should_run) test->should_tmon = should_tmon(&env.tmon_selector, test->test_name); } @@ -2106,9 +2134,18 @@ int main(int argc, char **argv) for (i = 0; i < prog_test_cnt; i++) { struct prog_test_def *test = &prog_test_defs[i]; + struct test_state *state = &test_states[i]; - if (!test->should_run) + if (!test->should_run) { + if (test->not_built && test->selected && + !env.get_test_cnt && !env.list_test_names) { + state->tested = true; + state->skip_cnt = 1; + env.not_built_cnt++; + print_test_result(test, state); + } continue; + } if (env.get_test_cnt) { env.succ_cnt++; diff --git a/tools/testing/selftests/bpf/test_progs.h b/tools/testing/selftests/bpf/test_progs.h index 1a44467f4310..2cf950afcd85 100644 --- a/tools/testing/selftests/bpf/test_progs.h +++ b/tools/testing/selftests/bpf/test_progs.h @@ -125,6 +125,7 @@ struct test_env { int sub_succ_cnt; /* successful sub-tests */ int fail_cnt; /* total failed tests + sub-tests */ int skip_cnt; /* skipped tests */ + int not_built_cnt; /* tests not built */ int saved_netns_fd; int workers; /* number of worker process */ @@ -563,5 +564,7 @@ struct expected_msgs { void validate_msgs(const char *log_buf, struct expected_msgs *msgs, void (*emit_fn)(const char *buf, bool force)); +void free_msgs(struct expected_msgs *msgs); +void verify_test_stderr(struct bpf_object *obj, struct bpf_program *prog); #endif /* __TEST_PROGS_H */ diff --git a/tools/testing/selftests/bpf/test_sockmap.c b/tools/testing/selftests/bpf/test_sockmap.c index 76568db7a664..ac814eb63edb 100644 --- a/tools/testing/selftests/bpf/test_sockmap.c +++ b/tools/testing/selftests/bpf/test_sockmap.c @@ -26,7 +26,6 @@ #include <linux/sock_diag.h> #include <linux/bpf.h> #include <linux/if_link.h> -#include <linux/tls.h> #include <assert.h> #include <libgen.h> @@ -41,13 
+40,6 @@ int running; static void running_handler(int a); -#ifndef TCP_ULP -# define TCP_ULP 31 -#endif -#ifndef SOL_TLS -# define SOL_TLS 282 -#endif - /* randomly selected ports for testing on lo */ #define S1_PORT 10000 #define S2_PORT 10001 @@ -63,10 +55,10 @@ int s1, s2, c1, c2, p1, p2; int test_cnt; int passed; int failed; -int map_fd[9]; -struct bpf_map *maps[9]; -struct bpf_program *progs[9]; -struct bpf_link *links[9]; +int map_fd[8]; +struct bpf_map *maps[8]; +struct bpf_program *progs[8]; +struct bpf_link *links[8]; int txmsg_pass; int txmsg_redir; @@ -81,10 +73,6 @@ int txmsg_start_pop; int txmsg_pop; int txmsg_ingress; int txmsg_redir_skb; -int txmsg_ktls_skb; -int txmsg_ktls_skb_drop; -int txmsg_ktls_skb_redir; -int ktls; int peek_flag; int skb_use_parser; int txmsg_omit_skb_parser; @@ -115,7 +103,6 @@ static const struct option long_options[] = { {"txmsg_pop", required_argument, NULL, 'x'}, {"txmsg_ingress", no_argument, &txmsg_ingress, 1 }, {"txmsg_redir_skb", no_argument, &txmsg_redir_skb, 1 }, - {"ktls", no_argument, &ktls, 1 }, {"peek", no_argument, &peek_flag, 1 }, {"txmsg_omit_skb_parser", no_argument, &txmsg_omit_skb_parser, 1}, {"whitelist", required_argument, NULL, 'n' }, @@ -183,7 +170,6 @@ static void test_reset(void) txmsg_pass = txmsg_drop = txmsg_redir = 0; txmsg_apply = txmsg_cork = 0; txmsg_ingress = txmsg_redir_skb = 0; - txmsg_ktls_skb = txmsg_ktls_skb_drop = txmsg_ktls_skb_redir = 0; txmsg_omit_skb_parser = 0; skb_use_parser = 0; } @@ -238,71 +224,6 @@ static void usage(char *argv[]) printf("\n"); } -char *sock_to_string(int s) -{ - if (s == c1) - return "client1"; - else if (s == c2) - return "client2"; - else if (s == s1) - return "server1"; - else if (s == s2) - return "server2"; - else if (s == p1) - return "peer1"; - else if (s == p2) - return "peer2"; - else - return "unknown"; -} - -static int sockmap_init_ktls(int verbose, int s) -{ - struct tls12_crypto_info_aes_gcm_128 tls_tx = { - .info = { - .version = TLS_1_2_VERSION, 
- .cipher_type = TLS_CIPHER_AES_GCM_128, - }, - }; - struct tls12_crypto_info_aes_gcm_128 tls_rx = { - .info = { - .version = TLS_1_2_VERSION, - .cipher_type = TLS_CIPHER_AES_GCM_128, - }, - }; - int so_buf = 6553500; - int err; - - err = setsockopt(s, 6, TCP_ULP, "tls", sizeof("tls")); - if (err) { - fprintf(stderr, "setsockopt: TCP_ULP(%s) failed with error %i\n", sock_to_string(s), err); - return -EINVAL; - } - err = setsockopt(s, SOL_TLS, TLS_TX, (void *)&tls_tx, sizeof(tls_tx)); - if (err) { - fprintf(stderr, "setsockopt: TLS_TX(%s) failed with error %i\n", sock_to_string(s), err); - return -EINVAL; - } - err = setsockopt(s, SOL_TLS, TLS_RX, (void *)&tls_rx, sizeof(tls_rx)); - if (err) { - fprintf(stderr, "setsockopt: TLS_RX(%s) failed with error %i\n", sock_to_string(s), err); - return -EINVAL; - } - err = setsockopt(s, SOL_SOCKET, SO_SNDBUF, &so_buf, sizeof(so_buf)); - if (err) { - fprintf(stderr, "setsockopt: (%s) failed sndbuf with error %i\n", sock_to_string(s), err); - return -EINVAL; - } - err = setsockopt(s, SOL_SOCKET, SO_RCVBUF, &so_buf, sizeof(so_buf)); - if (err) { - fprintf(stderr, "setsockopt: (%s) failed rcvbuf with error %i\n", sock_to_string(s), err); - return -EINVAL; - } - - if (verbose) - fprintf(stdout, "socket(%s) kTLS enabled\n", sock_to_string(s)); - return 0; -} static int sockmap_init_sockets(int verbose) { int i, err, one = 1; @@ -557,19 +478,6 @@ static int msg_verify_data(struct msghdr *msg, int size, int chunk_sz, for (i = 0, j = 0; i < msg->msg_iovlen && size; i++, j = 0) { unsigned char *d = msg->msg_iov[i].iov_base; - /* Special case test for skb ingress + ktls */ - if (i == 0 && txmsg_ktls_skb) { - if (msg->msg_iov[i].iov_len < 4) - return -EDATAINTEGRITY; - if (memcmp(d, "PASS", 4) != 0) { - fprintf(stderr, - "detected skb data error with skb ingress update @iov[%i]:%i \"%02x %02x %02x %02x\" != \"PASS\"\n", - i, 0, d[0], d[1], d[2], d[3]); - return -EDATAINTEGRITY; - } - j = 4; /* advance index past PASS header */ - } - for 
(; j < msg->msg_iov[i].iov_len && size; j++) { if (push > 0 && check_cnt == verify_push_start + verify_push_len - push) { @@ -849,21 +757,6 @@ static int sendmsg_test(struct sockmap_options *opt) else rx_fd = p2; - if (ktls) { - /* Redirecting into non-TLS socket which sends into a TLS - * socket is not a valid test. So in this case lets not - * enable kTLS but still run the test. - */ - if (!txmsg_redir || txmsg_ingress) { - err = sockmap_init_ktls(opt->verbose, rx_fd); - if (err) - return err; - } - err = sockmap_init_ktls(opt->verbose, c1); - if (err) - return err; - } - if (opt->tx_wait_mem) { struct timeval timeout; int rxtx_buf_len = 1024; @@ -882,7 +775,7 @@ static int sendmsg_test(struct sockmap_options *opt) rxpid = fork(); if (rxpid == 0) { - if (opt->drop_expected || txmsg_ktls_skb_drop) + if (opt->drop_expected) _exit(0); if (!iov_buf) /* zero bytes sent case */ @@ -1073,28 +966,8 @@ static int run_options(struct sockmap_options *options, int cg_fd, int test) return -1; } - /* Attach programs to TLS sockmap */ - if (txmsg_ktls_skb) { - if (!txmsg_omit_skb_parser) { - links[2] = bpf_program__attach_sockmap(progs[0], map_fd[8]); - if (!links[2]) { - fprintf(stderr, - "ERROR: bpf_program__attach_sockmap (TLS sockmap %i->%i): (%s)\n", - bpf_program__fd(progs[0]), map_fd[8], strerror(errno)); - return -1; - } - } - - links[3] = bpf_program__attach_sockmap(progs[2], map_fd[8]); - if (!links[3]) { - fprintf(stderr, "ERROR: bpf_program__attach_sockmap (TLS sockmap): (%s)\n", - strerror(errno)); - return -1; - } - } - /* Attach to cgroups */ - err = bpf_prog_attach(bpf_program__fd(progs[3]), cg_fd, BPF_CGROUP_SOCK_OPS, 0); + err = bpf_prog_attach(bpf_program__fd(progs[2]), cg_fd, BPF_CGROUP_SOCK_OPS, 0); if (err) { fprintf(stderr, "ERROR: bpf_prog_attach (groups): %d (%s)\n", err, strerror(errno)); @@ -1110,15 +983,15 @@ run: /* Attach txmsg program to sockmap */ if (txmsg_pass) - tx_prog = progs[4]; + tx_prog = progs[3]; else if (txmsg_redir) - tx_prog = 
progs[5]; + tx_prog = progs[4]; else if (txmsg_apply) - tx_prog = progs[6]; + tx_prog = progs[5]; else if (txmsg_cork) - tx_prog = progs[7]; + tx_prog = progs[6]; else if (txmsg_drop) - tx_prog = progs[8]; + tx_prog = progs[7]; else tx_prog = NULL; @@ -1291,34 +1164,6 @@ run: } } - if (txmsg_ktls_skb) { - int ingress = BPF_F_INGRESS; - - i = 0; - err = bpf_map_update_elem(map_fd[8], &i, &p2, BPF_ANY); - if (err) { - fprintf(stderr, - "ERROR: bpf_map_update_elem (c1 sockmap): %d (%s)\n", - err, strerror(errno)); - } - - if (txmsg_ktls_skb_redir) { - i = 1; - err = bpf_map_update_elem(map_fd[7], - &i, &ingress, BPF_ANY); - if (err) { - fprintf(stderr, - "ERROR: bpf_map_update_elem (txmsg_ingress): %d (%s)\n", - err, strerror(errno)); - } - } - - if (txmsg_ktls_skb_drop) { - i = 1; - err = bpf_map_update_elem(map_fd[7], &i, &i, BPF_ANY); - } - } - if (txmsg_redir_skb) { int skb_fd = (test == SENDMSG || test == SENDPAGE) ? p2 : p1; @@ -1373,7 +1218,7 @@ run: fprintf(stderr, "unknown test\n"); out: /* Detach and zero all the maps */ - bpf_prog_detach2(bpf_program__fd(progs[3]), cg_fd, BPF_CGROUP_SOCK_OPS); + bpf_prog_detach2(bpf_program__fd(progs[2]), cg_fd, BPF_CGROUP_SOCK_OPS); for (i = 0; i < ARRAY_SIZE(links); i++) { if (links[i]) @@ -1457,10 +1302,6 @@ static void test_options(char *options) append_str(options, "ingress,", OPTSTRING); if (txmsg_redir_skb) append_str(options, "redir_skb,", OPTSTRING); - if (txmsg_ktls_skb) - append_str(options, "ktls_skb,", OPTSTRING); - if (ktls) - append_str(options, "ktls,", OPTSTRING); if (peek_flag) append_str(options, "peek,", OPTSTRING); } @@ -1602,57 +1443,6 @@ static void test_txmsg_ingress_redir(int cgrp, struct sockmap_options *opt) test_send(opt, cgrp); } -static void test_txmsg_skb(int cgrp, struct sockmap_options *opt) -{ - bool data = opt->data_test; - int k = ktls; - - opt->data_test = true; - ktls = 1; - - txmsg_pass = txmsg_drop = 0; - txmsg_ingress = txmsg_redir = 0; - txmsg_ktls_skb = 1; - txmsg_pass = 1; - - /* 
Using data verification so ensure iov layout is - * expected from test receiver side. e.g. has enough - * bytes to write test code. - */ - opt->iov_length = 100; - opt->iov_count = 1; - opt->rate = 1; - test_exec(cgrp, opt); - - txmsg_ktls_skb_drop = 1; - test_exec(cgrp, opt); - - txmsg_ktls_skb_drop = 0; - txmsg_ktls_skb_redir = 1; - test_exec(cgrp, opt); - txmsg_ktls_skb_redir = 0; - - /* Tests that omit skb_parser */ - txmsg_omit_skb_parser = 1; - ktls = 0; - txmsg_ktls_skb = 0; - test_exec(cgrp, opt); - - txmsg_ktls_skb_drop = 1; - test_exec(cgrp, opt); - txmsg_ktls_skb_drop = 0; - - txmsg_ktls_skb_redir = 1; - test_exec(cgrp, opt); - - ktls = 1; - test_exec(cgrp, opt); - txmsg_omit_skb_parser = 0; - - opt->data_test = data; - ktls = k; -} - /* Test cork with hung data. This tests poor usage patterns where * cork can leave data on the ring if user program is buggy and * doesn't flush them somehow. They do take some time however @@ -1908,8 +1698,6 @@ static void test_txmsg_ingress_parser(int cgrp, struct sockmap_options *opt) { txmsg_pass = 1; skb_use_parser = 512; - if (ktls == 1) - skb_use_parser = 570; opt->iov_length = 256; opt->iov_count = 1; opt->rate = 2; @@ -1918,8 +1706,6 @@ static void test_txmsg_ingress_parser(int cgrp, struct sockmap_options *opt) static void test_txmsg_ingress_parser2(int cgrp, struct sockmap_options *opt) { - if (ktls == 1) - return; skb_use_parser = 10; opt->iov_length = 20; opt->iov_count = 1; @@ -1938,7 +1724,6 @@ char *map_names[] = { "sock_bytes", "sock_redir_flags", "sock_skb_opts", - "tls_sock_map", }; static int populate_progs(char *bpf_file) @@ -1988,7 +1773,6 @@ struct _test test[] = { {"txmsg test redirect wait send mem", test_txmsg_redir_wait_sndmem}, {"txmsg test drop", test_txmsg_drop}, {"txmsg test ingress redirect", test_txmsg_ingress_redir}, - {"txmsg test skb", test_txmsg_skb}, {"txmsg test apply", test_txmsg_apply}, {"txmsg test cork", test_txmsg_cork}, {"txmsg test hanging corks", test_txmsg_cork_hangs}, @@ 
-2085,20 +1869,10 @@ static void test_selftests_sockhash(int cg_fd, struct sockmap_options *opt) __test_selftests(cg_fd, opt); } -static void test_selftests_ktls(int cg_fd, struct sockmap_options *opt) -{ - opt->map = BPF_SOCKHASH_FILENAME; - opt->prepend = "ktls"; - ktls = 1; - __test_selftests(cg_fd, opt); - ktls = 0; -} - static int test_selftest(int cg_fd, struct sockmap_options *opt) { test_selftests_sockmap(cg_fd, opt); test_selftests_sockhash(cg_fd, opt); - test_selftests_ktls(cg_fd, opt); test_print_results(); return 0; } diff --git a/tools/testing/selftests/bpf/test_xdping.sh b/tools/testing/selftests/bpf/test_xdping.sh deleted file mode 100755 index c3d82e0a7378..000000000000 --- a/tools/testing/selftests/bpf/test_xdping.sh +++ /dev/null @@ -1,103 +0,0 @@ -#!/bin/bash -# SPDX-License-Identifier: GPL-2.0 - -# xdping tests -# Here we setup and teardown configuration required to run -# xdping, exercising its options. -# -# Setup is similar to test_tunnel tests but without the tunnel. -# -# Topology: -# --------- -# root namespace | tc_ns0 namespace -# | -# ---------- | ---------- -# | veth1 | --------- | veth0 | -# ---------- peer ---------- -# -# Device Configuration -# -------------------- -# Root namespace with BPF -# Device names and addresses: -# veth1 IP: 10.1.1.200 -# xdp added to veth1, xdpings originate from here. -# -# Namespace tc_ns0 with BPF -# Device names and addresses: -# veth0 IPv4: 10.1.1.100 -# For some tests xdping run in server mode here. 
-# - -readonly TARGET_IP="10.1.1.100" -readonly TARGET_NS="xdp_ns0" - -readonly LOCAL_IP="10.1.1.200" - -setup() -{ - ip netns add $TARGET_NS - ip link add veth0 type veth peer name veth1 - ip link set veth0 netns $TARGET_NS - ip netns exec $TARGET_NS ip addr add ${TARGET_IP}/24 dev veth0 - ip addr add ${LOCAL_IP}/24 dev veth1 - ip netns exec $TARGET_NS ip link set veth0 up - ip link set veth1 up -} - -cleanup() -{ - set +e - ip netns delete $TARGET_NS 2>/dev/null - ip link del veth1 2>/dev/null - if [[ $server_pid -ne 0 ]]; then - kill -TERM $server_pid - fi -} - -test() -{ - client_args="$1" - server_args="$2" - - echo "Test client args '$client_args'; server args '$server_args'" - - server_pid=0 - if [[ -n "$server_args" ]]; then - ip netns exec $TARGET_NS ./xdping $server_args & - server_pid=$! - sleep 10 - fi - ./xdping $client_args $TARGET_IP - - if [[ $server_pid -ne 0 ]]; then - kill -TERM $server_pid - server_pid=0 - fi - - echo "Test client args '$client_args'; server args '$server_args': PASS" -} - -set -e - -server_pid=0 - -trap cleanup EXIT - -setup - -for server_args in "" "-I veth0 -s -S" ; do - # client in skb mode - client_args="-I veth1 -S" - test "$client_args" "$server_args" - - # client with count of 10 RTT measurements. - client_args="-I veth1 -S -c 10" - test "$client_args" "$server_args" -done - -# Test drv mode -test "-I veth1 -N" "-I veth0 -s -N" -test "-I veth1 -N -c 10" "-I veth0 -s -N" - -echo "OK. 
All tests passed" -exit 0 diff --git a/tools/testing/selftests/bpf/testing_helpers.c b/tools/testing/selftests/bpf/testing_helpers.c index 6fbe1e995660..c970e7793dfc 100644 --- a/tools/testing/selftests/bpf/testing_helpers.c +++ b/tools/testing/selftests/bpf/testing_helpers.c @@ -5,6 +5,8 @@ #include <stdlib.h> #include <string.h> #include <errno.h> +#include <sys/mman.h> +#include <alloca.h> #include <bpf/bpf.h> #include <bpf/libbpf.h> #include "disasm.h" @@ -516,3 +518,19 @@ bool is_jit_enabled(void) return enabled; } + +int stack_mprotect(void) +{ + void *buf; + long sz; + int ret; + + sz = sysconf(_SC_PAGESIZE); + if (sz < 0) + return sz; + + buf = alloca(sz * 3); + ret = mprotect((void *)(((unsigned long)(buf + sz)) & ~(sz - 1)), sz, + PROT_READ | PROT_WRITE | PROT_EXEC); + return ret; +} diff --git a/tools/testing/selftests/bpf/testing_helpers.h b/tools/testing/selftests/bpf/testing_helpers.h index 2ca2356a0b58..2edc6fb7fc52 100644 --- a/tools/testing/selftests/bpf/testing_helpers.h +++ b/tools/testing/selftests/bpf/testing_helpers.h @@ -59,5 +59,6 @@ struct bpf_insn; int get_xlated_program(int fd_prog, struct bpf_insn **buf, __u32 *cnt); int testing_prog_flags(void); bool is_jit_enabled(void); +int stack_mprotect(void); #endif /* __TESTING_HELPERS_H */ diff --git a/tools/testing/selftests/bpf/trace_helpers.c b/tools/testing/selftests/bpf/trace_helpers.c index 0e63daf83ed5..679008b310d9 100644 --- a/tools/testing/selftests/bpf/trace_helpers.c +++ b/tools/testing/selftests/bpf/trace_helpers.c @@ -546,9 +546,10 @@ static const char * const trace_blacklist[] = { "__rcu_read_lock", "__rcu_read_unlock", "bpf_get_numa_node_id", + "___migrate_enable", }; -static bool skip_entry(char *name) +bool is_unsafe_function(const char *name) { int i; @@ -651,7 +652,7 @@ int bpf_get_ksyms(struct ksyms **ksymsp, bool kernel) free(name); if (sscanf(buf, "%ms$*[^\n]\n", &name) != 1) continue; - if (skip_entry(name)) + if (is_unsafe_function(name)) continue; ks = 
search_kallsyms_custom_local(ksyms, name, search_kallsyms_compare); @@ -728,7 +729,7 @@ int bpf_get_addrs(unsigned long **addrsp, size_t *cntp, bool kernel) free(name); if (sscanf(buf, "%p %ms$*[^\n]\n", &addr, &name) != 2) continue; - if (skip_entry(name)) + if (is_unsafe_function(name)) continue; if (cnt == max_cnt) { diff --git a/tools/testing/selftests/bpf/trace_helpers.h b/tools/testing/selftests/bpf/trace_helpers.h index d5bf1433675d..01c8ecc45627 100644 --- a/tools/testing/selftests/bpf/trace_helpers.h +++ b/tools/testing/selftests/bpf/trace_helpers.h @@ -63,4 +63,5 @@ int read_build_id(const char *path, char *build_id, size_t size); int bpf_get_ksyms(struct ksyms **ksymsp, bool kernel); int bpf_get_addrs(unsigned long **addrsp, size_t *cntp, bool kernel); +bool is_unsafe_function(const char *name); #endif diff --git a/tools/testing/selftests/bpf/uprobe_multi.c b/tools/testing/selftests/bpf/uprobe_multi.c index 3e58a86b8e25..0af330b6c364 100644 --- a/tools/testing/selftests/bpf/uprobe_multi.c +++ b/tools/testing/selftests/bpf/uprobe_multi.c @@ -144,6 +144,8 @@ int main(int argc, char **argv) return trigger_uprobe(true /* page-in build ID */); error: - fprintf(stderr, "usage: %s <bench|usdt>\n", argv[0]); + fprintf(stderr, + "usage: %s <bench|usdt|uprobe-paged-out|uprobe-paged-in>\n", + argv[0]); return -1; } diff --git a/tools/testing/selftests/bpf/verifier/calls.c b/tools/testing/selftests/bpf/verifier/calls.c index c3164b9b2be5..302d712e0d7e 100644 --- a/tools/testing/selftests/bpf/verifier/calls.c +++ b/tools/testing/selftests/bpf/verifier/calls.c @@ -31,7 +31,7 @@ }, .prog_type = BPF_PROG_TYPE_SCHED_CLS, .result = REJECT, - .errstr = "arg#0 pointer type STRUCT prog_test_fail1 must point to scalar", + .errstr = "R1 pointer type STRUCT prog_test_fail1 must point to scalar", .fixup_kfunc_btf_id = { { "bpf_kfunc_call_test_fail1", 2 }, }, @@ -46,7 +46,7 @@ }, .prog_type = BPF_PROG_TYPE_SCHED_CLS, .result = REJECT, - .errstr = "max struct nesting depth 
exceeded\narg#0 pointer type STRUCT prog_test_fail2", + .errstr = "max struct nesting depth exceeded\nR1 pointer type STRUCT prog_test_fail2", .fixup_kfunc_btf_id = { { "bpf_kfunc_call_test_fail2", 2 }, }, @@ -61,7 +61,7 @@ }, .prog_type = BPF_PROG_TYPE_SCHED_CLS, .result = REJECT, - .errstr = "arg#0 pointer type STRUCT prog_test_fail3 must point to scalar", + .errstr = "R1 pointer type STRUCT prog_test_fail3 must point to scalar", .fixup_kfunc_btf_id = { { "bpf_kfunc_call_test_fail3", 2 }, }, @@ -76,7 +76,7 @@ }, .prog_type = BPF_PROG_TYPE_SCHED_CLS, .result = REJECT, - .errstr = "arg#0 expected pointer to ctx, but got fp", + .errstr = "R1 expected pointer to ctx, but got fp", .fixup_kfunc_btf_id = { { "bpf_kfunc_call_test_pass_ctx", 2 }, }, @@ -91,7 +91,7 @@ }, .prog_type = BPF_PROG_TYPE_SCHED_CLS, .result = REJECT, - .errstr = "arg#0 pointer type UNKNOWN must point to scalar", + .errstr = "R1 pointer type UNKNOWN must point to scalar", .fixup_kfunc_btf_id = { { "bpf_kfunc_call_test_mem_len_fail1", 2 }, }, @@ -109,7 +109,7 @@ }, .prog_type = BPF_PROG_TYPE_SCHED_CLS, .result = REJECT, - .errstr = "Possibly NULL pointer passed to trusted arg0", + .errstr = "Possibly NULL pointer passed to trusted R1", .fixup_kfunc_btf_id = { { "bpf_kfunc_call_test_acquire", 3 }, { "bpf_kfunc_call_test_release", 5 }, @@ -152,7 +152,7 @@ }, .prog_type = BPF_PROG_TYPE_SCHED_CLS, .result = REJECT, - .errstr = "kernel function bpf_kfunc_call_memb1_release args#0 expected pointer", + .errstr = "kernel function bpf_kfunc_call_memb1_release R1 expected pointer", .fixup_kfunc_btf_id = { { "bpf_kfunc_call_memb_acquire", 1 }, { "bpf_kfunc_call_memb1_release", 5 }, @@ -1219,6 +1219,30 @@ BPF_RAW_INSN(BPF_JMP|BPF_CALL, 0, 1, 0, 1), /* call H */ BPF_EXIT_INSN(), /* H */ + BPF_RAW_INSN(BPF_JMP|BPF_CALL, 0, 1, 0, 1), /* call I */ + BPF_EXIT_INSN(), + /* I */ + BPF_RAW_INSN(BPF_JMP|BPF_CALL, 0, 1, 0, 1), /* call J */ + BPF_EXIT_INSN(), + /* J */ + BPF_RAW_INSN(BPF_JMP|BPF_CALL, 0, 1, 0, 1), /* call 
K */ + BPF_EXIT_INSN(), + /* K */ + BPF_RAW_INSN(BPF_JMP|BPF_CALL, 0, 1, 0, 1), /* call L */ + BPF_EXIT_INSN(), + /* L */ + BPF_RAW_INSN(BPF_JMP|BPF_CALL, 0, 1, 0, 1), /* call M */ + BPF_EXIT_INSN(), + /* M */ + BPF_RAW_INSN(BPF_JMP|BPF_CALL, 0, 1, 0, 1), /* call N */ + BPF_EXIT_INSN(), + /* N */ + BPF_RAW_INSN(BPF_JMP|BPF_CALL, 0, 1, 0, 1), /* call O */ + BPF_EXIT_INSN(), + /* O */ + BPF_RAW_INSN(BPF_JMP|BPF_CALL, 0, 1, 0, 1), /* call P */ + BPF_EXIT_INSN(), + /* P */ BPF_MOV64_IMM(BPF_REG_0, 0), BPF_EXIT_INSN(), }, @@ -1257,6 +1281,30 @@ BPF_RAW_INSN(BPF_JMP|BPF_CALL, 0, 1, 0, 1), /* call H */ BPF_EXIT_INSN(), /* H */ + BPF_RAW_INSN(BPF_JMP|BPF_CALL, 0, 1, 0, 1), /* call I */ + BPF_EXIT_INSN(), + /* I */ + BPF_RAW_INSN(BPF_JMP|BPF_CALL, 0, 1, 0, 1), /* call J */ + BPF_EXIT_INSN(), + /* J */ + BPF_RAW_INSN(BPF_JMP|BPF_CALL, 0, 1, 0, 1), /* call K */ + BPF_EXIT_INSN(), + /* K */ + BPF_RAW_INSN(BPF_JMP|BPF_CALL, 0, 1, 0, 1), /* call L */ + BPF_EXIT_INSN(), + /* L */ + BPF_RAW_INSN(BPF_JMP|BPF_CALL, 0, 1, 0, 1), /* call M */ + BPF_EXIT_INSN(), + /* M */ + BPF_RAW_INSN(BPF_JMP|BPF_CALL, 0, 1, 0, 1), /* call N */ + BPF_EXIT_INSN(), + /* N */ + BPF_RAW_INSN(BPF_JMP|BPF_CALL, 0, 1, 0, 1), /* call O */ + BPF_EXIT_INSN(), + /* O */ + BPF_RAW_INSN(BPF_JMP|BPF_CALL, 0, 1, 0, 1), /* call P */ + BPF_EXIT_INSN(), + /* P */ BPF_MOV64_IMM(BPF_REG_0, 0), BPF_EXIT_INSN(), }, @@ -2410,27 +2458,3 @@ .errstr_unpriv = "", .prog_type = BPF_PROG_TYPE_CGROUP_SKB, }, -{ - "calls: several args with ref_obj_id", - .insns = { - /* Reserve at least sizeof(struct iphdr) bytes in the ring buffer. - * With a smaller size, the verifier would reject the call to - * bpf_tcp_raw_gen_syncookie_ipv4 before we can reach the - * ref_obj_id error. 
- */ - BPF_MOV64_IMM(BPF_REG_2, 20), - BPF_MOV64_IMM(BPF_REG_3, 0), - BPF_LD_MAP_FD(BPF_REG_1, 0), - BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, BPF_FUNC_ringbuf_reserve), - /* if r0 == 0 goto <exit> */ - BPF_JMP_IMM(BPF_JEQ, BPF_REG_0, 0, 3), - BPF_MOV64_REG(BPF_REG_1, BPF_REG_0), - BPF_MOV64_REG(BPF_REG_2, BPF_REG_0), - BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, BPF_FUNC_tcp_raw_gen_syncookie_ipv4), - BPF_EXIT_INSN(), - }, - .fixup_map_ringbuf = { 2 }, - .result = REJECT, - .errstr = "more than one arg with ref_obj_id", - .prog_type = BPF_PROG_TYPE_SCHED_CLS, -}, diff --git a/tools/testing/selftests/bpf/verifier/sleepable.c b/tools/testing/selftests/bpf/verifier/sleepable.c index c2b7f5ebf168..6dabc5522945 100644 --- a/tools/testing/selftests/bpf/verifier/sleepable.c +++ b/tools/testing/selftests/bpf/verifier/sleepable.c @@ -76,7 +76,20 @@ .runs = -1, }, { - "sleepable raw tracepoint reject", + "sleepable raw tracepoint accept", + .insns = { + BPF_MOV64_IMM(BPF_REG_0, 0), + BPF_EXIT_INSN(), + }, + .prog_type = BPF_PROG_TYPE_TRACING, + .expected_attach_type = BPF_TRACE_RAW_TP, + .kfunc = "sys_enter", + .result = ACCEPT, + .flags = BPF_F_SLEEPABLE, + .runs = -1, +}, +{ + "sleepable raw tracepoint reject non-faultable", .insns = { BPF_MOV64_IMM(BPF_REG_0, 0), BPF_EXIT_INSN(), @@ -85,7 +98,7 @@ .expected_attach_type = BPF_TRACE_RAW_TP, .kfunc = "sched_switch", .result = REJECT, - .errstr = "Only fentry/fexit/fsession/fmod_ret, lsm, iter, uprobe, and struct_ops programs can be sleepable", + .errstr = "Sleepable program cannot attach to non-faultable tracepoint", .flags = BPF_F_SLEEPABLE, .runs = -1, }, diff --git a/tools/testing/selftests/bpf/veristat.c b/tools/testing/selftests/bpf/veristat.c index 5c82950e6978..a7db6f04f7e1 100644 --- a/tools/testing/selftests/bpf/veristat.c +++ b/tools/testing/selftests/bpf/veristat.c @@ -48,6 +48,7 @@ enum stat_id { SIZE, JITED_SIZE, STACK, + MAX_STACK, PROG_TYPE, ATTACH_TYPE, MEMORY_PEAK, @@ -789,13 +790,13 @@ cleanup: } static 
const struct stat_specs default_csv_output_spec = { - .spec_cnt = 15, + .spec_cnt = 16, .ids = { FILE_NAME, PROG_NAME, VERDICT, DURATION, TOTAL_INSNS, TOTAL_STATES, PEAK_STATES, MAX_STATES_PER_INSN, MARK_READ_MAX_LEN, SIZE, JITED_SIZE, PROG_TYPE, ATTACH_TYPE, - STACK, MEMORY_PEAK, + STACK, MAX_STACK, MEMORY_PEAK, }, }; @@ -834,6 +835,7 @@ static struct stat_def { [SIZE] = { "Program size", {"prog_size"}, }, [JITED_SIZE] = { "Jited size", {"prog_size_jited"}, }, [STACK] = {"Stack depth", {"stack_depth", "stack"}, }, + [MAX_STACK] = {"Max stack depth", {"max_stack_depth"}, }, [PROG_TYPE] = { "Program type", {"prog_type"}, }, [ATTACH_TYPE] = { "Attach type", {"attach_type", }, }, [MEMORY_PEAK] = { "Peak memory (MiB)", {"mem_peak", }, }, @@ -1023,7 +1025,7 @@ static int parse_verif_log(char * const buf, size_t buf_sz, struct verif_stats * &s->stats[MARK_READ_MAX_LEN])) continue; - if (1 == sscanf(cur, "stack depth %511s", stack)) + if (2 == sscanf(cur, "stack depth %511s max %ld", stack, &s->stats[MAX_STACK])) continue; } while ((token = strtok_r(cnt++ ? 
NULL : stack, "+", &state))) { @@ -2278,6 +2280,7 @@ static int cmp_stat(const struct verif_stats *s1, const struct verif_stats *s2, case SIZE: case JITED_SIZE: case STACK: + case MAX_STACK: case VERDICT: case DURATION: case TOTAL_INSNS: @@ -2512,6 +2515,7 @@ static void prepare_value(const struct verif_stats *s, enum stat_id id, case MAX_STATES_PER_INSN: case MARK_READ_MAX_LEN: case STACK: + case MAX_STACK: case SIZE: case JITED_SIZE: case MEMORY_PEAK: @@ -2602,7 +2606,8 @@ static int parse_stat_value(const char *str, enum stat_id id, struct verif_stats case SIZE: case JITED_SIZE: case MEMORY_PEAK: - case STACK: { + case STACK: + case MAX_STACK: { long val; int err, n; diff --git a/tools/testing/selftests/bpf/vmtest.sh b/tools/testing/selftests/bpf/vmtest.sh index 2f869daf8a06..9ca802285393 100755 --- a/tools/testing/selftests/bpf/vmtest.sh +++ b/tools/testing/selftests/bpf/vmtest.sh @@ -382,7 +382,7 @@ main() local exit_command="poweroff -f" local debug_shell="no" - while getopts ':hskl:id:j:' opt; do + while getopts ':hsl:id:j:' opt; do case ${opt} in l) LOCAL_ROOTFS_IMAGE="$OPTARG" diff --git a/tools/testing/selftests/bpf/xdp_lb_bench_common.h b/tools/testing/selftests/bpf/xdp_lb_bench_common.h new file mode 100644 index 000000000000..aed20a963701 --- /dev/null +++ b/tools/testing/selftests/bpf/xdp_lb_bench_common.h @@ -0,0 +1,112 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* Copyright (c) 2026 Meta Platforms, Inc. and affiliates. 
*/ + +#ifndef XDP_LB_BENCH_COMMON_H +#define XDP_LB_BENCH_COMMON_H + +#define F_IPV6 (1 << 0) +#define F_LRU_BYPASS (1 << 1) + +#define CH_RING_SIZE 65537 /* per-VIP consistent hash ring slots */ +#define MAX_VIPS 16 +#define CH_RINGS_SIZE (MAX_VIPS * CH_RING_SIZE) +#define MAX_REALS 512 +#define DEFAULT_LRU_SIZE 100000 /* connection tracking cache size */ +#define ONE_SEC 1000000000U /* 1 sec in nanosec */ +#define MAX_CONN_RATE 100000000 /* high enough to never trigger in bench */ +#define LRU_UDP_TIMEOUT 30000000000ULL /* 30 sec in nanosec */ +#define PCKT_FRAGMENTED 0x3FFF +#define KNUTH_HASH_MULT 2654435761U +#define IPIP_V4_PREFIX 4268 /* 172.16/12 in network order */ +#define IPIP_V6_PREFIX1 1 /* 0100::/64 (RFC 6666 discard) */ +#define IPIP_V6_PREFIX2 0 +#define IPIP_V6_PREFIX3 0 + +/* Stats indices (0..MAX_VIPS-1 are per-VIP packet/byte counters) */ +#define STATS_LRU (MAX_VIPS + 0) /* v1: total VIP packets, v2: LRU misses */ +#define STATS_XDP_TX (MAX_VIPS + 1) +#define STATS_XDP_PASS (MAX_VIPS + 2) +#define STATS_XDP_DROP (MAX_VIPS + 3) +#define STATS_NEW_CONN (MAX_VIPS + 4) /* v1: conn count, v2: last reset ts */ +#define STATS_LRU_MISS (MAX_VIPS + 5) /* v1: TCP LRU misses */ +#define STATS_SIZE (MAX_VIPS + 6) + +#ifdef __BPF__ +#define lb_htons(x) bpf_htons(x) +#define LB_INLINE static __always_inline +#else +#define lb_htons(x) htons(x) +#define LB_INLINE static inline +#endif + +LB_INLINE __be32 create_encap_ipv4_src(__u16 port, __be32 src) +{ + __u32 ip_suffix = lb_htons(port); + + ip_suffix <<= 16; + ip_suffix ^= src; + return (0xFFFF0000 & ip_suffix) | IPIP_V4_PREFIX; +} + +LB_INLINE void create_encap_ipv6_src(__u16 port, __be32 src, __be32 *saddr) +{ + saddr[0] = IPIP_V6_PREFIX1; + saddr[1] = IPIP_V6_PREFIX2; + saddr[2] = IPIP_V6_PREFIX3; + saddr[3] = src ^ port; +} + +struct flow_key { + union { + __be32 src; + __be32 srcv6[4]; + }; + union { + __be32 dst; + __be32 dstv6[4]; + }; + union { + __u32 ports; + __u16 port16[2]; + }; + __u8 proto; + 
__u8 pad[3]; +}; + +struct vip_definition { + union { + __be32 vip; + __be32 vipv6[4]; + }; + __u16 port; + __u8 proto; + __u8 pad; +}; + +struct vip_meta { + __u32 flags; + __u32 vip_num; +}; + +struct real_pos_lru { + __u32 pos; + __u64 atime; +}; + +struct real_definition { + __be32 dst; + __be32 dstv6[4]; + __u8 flags; +}; + +struct lb_stats { + __u64 v1; + __u64 v2; +}; + +struct ctl_value { + __u8 mac[6]; + __u8 pad[2]; +}; + +#endif /* XDP_LB_BENCH_COMMON_H */ diff --git a/tools/testing/selftests/bpf/xdping.c b/tools/testing/selftests/bpf/xdping.c deleted file mode 100644 index 9ed8c796645d..000000000000 --- a/tools/testing/selftests/bpf/xdping.c +++ /dev/null @@ -1,254 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -/* Copyright (c) 2019, Oracle and/or its affiliates. All rights reserved. */ - -#include <linux/bpf.h> -#include <linux/if_link.h> -#include <arpa/inet.h> -#include <assert.h> -#include <errno.h> -#include <signal.h> -#include <stdio.h> -#include <stdlib.h> -#include <string.h> -#include <unistd.h> -#include <libgen.h> -#include <net/if.h> -#include <sys/types.h> -#include <sys/socket.h> -#include <netdb.h> - -#include "bpf/bpf.h" -#include "bpf/libbpf.h" - -#include "xdping.h" -#include "testing_helpers.h" - -static int ifindex; -static __u32 xdp_flags = XDP_FLAGS_UPDATE_IF_NOEXIST; - -static void cleanup(int sig) -{ - bpf_xdp_detach(ifindex, xdp_flags, NULL); - if (sig) - exit(1); -} - -static int get_stats(int fd, __u16 count, __u32 raddr) -{ - struct pinginfo pinginfo = { 0 }; - char inaddrbuf[INET_ADDRSTRLEN]; - struct in_addr inaddr; - __u16 i; - - inaddr.s_addr = raddr; - - printf("\nXDP RTT data:\n"); - - if (bpf_map_lookup_elem(fd, &raddr, &pinginfo)) { - perror("bpf_map_lookup elem"); - return 1; - } - - for (i = 0; i < count; i++) { - if (pinginfo.times[i] == 0) - break; - - printf("64 bytes from %s: icmp_seq=%d ttl=64 time=%#.5f ms\n", - inet_ntop(AF_INET, &inaddr, inaddrbuf, - sizeof(inaddrbuf)), - count + i + 1, - 
(double)pinginfo.times[i]/1000000); - } - - if (i < count) { - fprintf(stderr, "Expected %d samples, got %d.\n", count, i); - return 1; - } - - bpf_map_delete_elem(fd, &raddr); - - return 0; -} - -static void show_usage(const char *prog) -{ - fprintf(stderr, - "usage: %s [OPTS] -I interface destination\n\n" - "OPTS:\n" - " -c count Stop after sending count requests\n" - " (default %d, max %d)\n" - " -I interface interface name\n" - " -N Run in driver mode\n" - " -s Server mode\n" - " -S Run in skb mode\n", - prog, XDPING_DEFAULT_COUNT, XDPING_MAX_COUNT); -} - -int main(int argc, char **argv) -{ - __u32 mode_flags = XDP_FLAGS_DRV_MODE | XDP_FLAGS_SKB_MODE; - struct addrinfo *a, hints = { .ai_family = AF_INET }; - __u16 count = XDPING_DEFAULT_COUNT; - struct pinginfo pinginfo = { 0 }; - const char *optstr = "c:I:NsS"; - struct bpf_program *main_prog; - int prog_fd = -1, map_fd = -1; - struct sockaddr_in rin; - struct bpf_object *obj; - struct bpf_map *map; - char *ifname = NULL; - char filename[256]; - int opt, ret = 1; - __u32 raddr = 0; - int server = 0; - char cmd[256]; - - while ((opt = getopt(argc, argv, optstr)) != -1) { - switch (opt) { - case 'c': - count = atoi(optarg); - if (count < 1 || count > XDPING_MAX_COUNT) { - fprintf(stderr, - "min count is 1, max count is %d\n", - XDPING_MAX_COUNT); - return 1; - } - break; - case 'I': - ifname = optarg; - ifindex = if_nametoindex(ifname); - if (!ifindex) { - fprintf(stderr, "Could not get interface %s\n", - ifname); - return 1; - } - break; - case 'N': - xdp_flags |= XDP_FLAGS_DRV_MODE; - break; - case 's': - /* use server program */ - server = 1; - break; - case 'S': - xdp_flags |= XDP_FLAGS_SKB_MODE; - break; - default: - show_usage(basename(argv[0])); - return 1; - } - } - - if (!ifname) { - show_usage(basename(argv[0])); - return 1; - } - if (!server && optind == argc) { - show_usage(basename(argv[0])); - return 1; - } - - if ((xdp_flags & mode_flags) == mode_flags) { - fprintf(stderr, "-N or -S can be 
specified, not both.\n"); - show_usage(basename(argv[0])); - return 1; - } - - if (!server) { - /* Only supports IPv4; see hints initialization above. */ - if (getaddrinfo(argv[optind], NULL, &hints, &a) || !a) { - fprintf(stderr, "Could not resolve %s\n", argv[optind]); - return 1; - } - memcpy(&rin, a->ai_addr, sizeof(rin)); - raddr = rin.sin_addr.s_addr; - freeaddrinfo(a); - } - - /* Use libbpf 1.0 API mode */ - libbpf_set_strict_mode(LIBBPF_STRICT_ALL); - - snprintf(filename, sizeof(filename), "%s_kern.bpf.o", argv[0]); - - if (bpf_prog_test_load(filename, BPF_PROG_TYPE_XDP, &obj, &prog_fd)) { - fprintf(stderr, "load of %s failed\n", filename); - return 1; - } - - main_prog = bpf_object__find_program_by_name(obj, - server ? "xdping_server" : "xdping_client"); - if (main_prog) - prog_fd = bpf_program__fd(main_prog); - if (!main_prog || prog_fd < 0) { - fprintf(stderr, "could not find xdping program"); - return 1; - } - - map = bpf_object__next_map(obj, NULL); - if (map) - map_fd = bpf_map__fd(map); - if (!map || map_fd < 0) { - fprintf(stderr, "Could not find ping map"); - goto done; - } - - signal(SIGINT, cleanup); - signal(SIGTERM, cleanup); - - printf("Setting up XDP for %s, please wait...\n", ifname); - - printf("XDP setup disrupts network connectivity, hit Ctrl+C to quit\n"); - - if (bpf_xdp_attach(ifindex, prog_fd, xdp_flags, NULL) < 0) { - fprintf(stderr, "Link set xdp fd failed for %s\n", ifname); - goto done; - } - - if (server) { - close(prog_fd); - close(map_fd); - printf("Running server on %s; press Ctrl+C to exit...\n", - ifname); - do { } while (1); - } - - /* Start xdping-ing from last regular ping reply, e.g. for a count - * of 10 ICMP requests, we start xdping-ing using reply with seq number - * 10. The reason the last "real" ping RTT is much higher is that - * the ping program sees the ICMP reply associated with the last - * XDP-generated packet, so ping doesn't get a reply until XDP is done. 
- */ - pinginfo.seq = htons(count); - pinginfo.count = count; - - if (bpf_map_update_elem(map_fd, &raddr, &pinginfo, BPF_ANY)) { - fprintf(stderr, "could not communicate with BPF map: %s\n", - strerror(errno)); - cleanup(0); - goto done; - } - - /* We need to wait for XDP setup to complete. */ - sleep(10); - - snprintf(cmd, sizeof(cmd), "ping -c %d -I %s %s", - count, ifname, argv[optind]); - - printf("\nNormal ping RTT data\n"); - printf("[Ignore final RTT; it is distorted by XDP using the reply]\n"); - - ret = system(cmd); - - if (!ret) - ret = get_stats(map_fd, count, raddr); - - cleanup(0); - -done: - if (prog_fd > 0) - close(prog_fd); - if (map_fd > 0) - close(map_fd); - - return ret; -} diff --git a/tools/testing/selftests/bpf/xdping.h b/tools/testing/selftests/bpf/xdping.h deleted file mode 100644 index afc578df77be..000000000000 --- a/tools/testing/selftests/bpf/xdping.h +++ /dev/null @@ -1,13 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -/* Copyright (c) 2019, Oracle and/or its affiliates. All rights reserved. 
*/ - -#define XDPING_MAX_COUNT 10 -#define XDPING_DEFAULT_COUNT 4 - -struct pinginfo { - __u64 start; - __be16 seq; - __u16 count; - __u32 pad; - __u64 times[XDPING_MAX_COUNT]; -}; diff --git a/tools/testing/selftests/cgroup/lib/cgroup_util.c b/tools/testing/selftests/cgroup/lib/cgroup_util.c index 6a7295347e90..2596c12cd864 100644 --- a/tools/testing/selftests/cgroup/lib/cgroup_util.c +++ b/tools/testing/selftests/cgroup/lib/cgroup_util.c @@ -59,7 +59,8 @@ char *cg_name(const char *root, const char *name) size_t len = strlen(root) + strlen(name) + 2; char *ret = malloc(len); - snprintf(ret, len, "%s/%s", root, name); + if (ret) + snprintf(ret, len, "%s/%s", root, name); return ret; } @@ -69,7 +70,8 @@ char *cg_name_indexed(const char *root, const char *name, int index) size_t len = strlen(root) + strlen(name) + 10; char *ret = malloc(len); - snprintf(ret, len, "%s/%s_%d", root, name, index); + if (ret) + snprintf(ret, len, "%s/%s_%d", root, name, index); return ret; } @@ -79,7 +81,8 @@ char *cg_control(const char *cgroup, const char *control) size_t len = strlen(cgroup) + strlen(control) + 2; char *ret = malloc(len); - snprintf(ret, len, "%s/%s", cgroup, control); + if (ret) + snprintf(ret, len, "%s/%s", cgroup, control); return ret; } @@ -106,8 +109,9 @@ int cg_read_strcmp(const char *cgroup, const char *control, /* Handle the case of comparing against empty string */ if (!expected) return -1; - else - size = strlen(expected) + 1; + + /* needs size > 1, otherwise cg_read() reads 0 bytes */ + size = (expected[0] == '\0') ? 
2 : strlen(expected) + 1; buf = malloc(size); if (!buf) @@ -140,7 +144,7 @@ int cg_read_strcmp_wait(const char *cgroup, const char *control, int cg_read_strstr(const char *cgroup, const char *control, const char *needle) { - char buf[PAGE_SIZE]; + char buf[BUF_SIZE]; if (cg_read(cgroup, control, buf, sizeof(buf))) return -1; @@ -170,7 +174,7 @@ long cg_read_long_fd(int fd) long cg_read_key_long(const char *cgroup, const char *control, const char *key) { - char buf[PAGE_SIZE]; + char buf[BUF_SIZE]; char *ptr; if (cg_read(cgroup, control, buf, sizeof(buf))) @@ -206,7 +210,7 @@ long cg_read_key_long_poll(const char *cgroup, const char *control, long cg_read_lc(const char *cgroup, const char *control) { - char buf[PAGE_SIZE]; + char buf[BUF_SIZE]; const char delim[] = "\n"; char *line; long cnt = 0; @@ -258,7 +262,7 @@ int cg_write_numeric(const char *cgroup, const char *control, long value) static int cg_find_root(char *root, size_t len, const char *controller, bool *nsdelegate) { - char buf[10 * PAGE_SIZE]; + char buf[10 * BUF_SIZE]; char *fs, *mount, *type, *options; const char delim[] = "\n\t "; @@ -313,7 +317,7 @@ int cg_create(const char *cgroup) int cg_wait_for_proc_count(const char *cgroup, int count) { - char buf[10 * PAGE_SIZE] = {0}; + char buf[10 * BUF_SIZE] = {0}; int attempts; char *ptr; @@ -338,7 +342,7 @@ int cg_wait_for_proc_count(const char *cgroup, int count) int cg_killall(const char *cgroup) { - char buf[PAGE_SIZE]; + char buf[BUF_SIZE]; char *ptr = buf; /* If cgroup.kill exists use it. 
*/ @@ -548,7 +552,7 @@ int cg_run_nowait(const char *cgroup, int proc_mount_contains(const char *option) { - char buf[4 * PAGE_SIZE]; + char buf[4 * BUF_SIZE]; ssize_t read; read = read_text("/proc/mounts", buf, sizeof(buf)); @@ -560,7 +564,7 @@ int proc_mount_contains(const char *option) int cgroup_feature(const char *feature) { - char buf[PAGE_SIZE]; + char buf[BUF_SIZE]; ssize_t read; read = read_text("/sys/kernel/cgroup/features", buf, sizeof(buf)); @@ -587,7 +591,7 @@ ssize_t proc_read_text(int pid, bool thread, const char *item, char *buf, size_t int proc_read_strstr(int pid, bool thread, const char *item, const char *needle) { - char buf[PAGE_SIZE]; + char buf[BUF_SIZE]; if (proc_read_text(pid, thread, item, buf, sizeof(buf)) < 0) return -1; diff --git a/tools/testing/selftests/cgroup/lib/include/cgroup_util.h b/tools/testing/selftests/cgroup/lib/include/cgroup_util.h index 567b1082974c..febc1723d090 100644 --- a/tools/testing/selftests/cgroup/lib/include/cgroup_util.h +++ b/tools/testing/selftests/cgroup/lib/include/cgroup_util.h @@ -2,8 +2,8 @@ #include <stdbool.h> #include <stdlib.h> -#ifndef PAGE_SIZE -#define PAGE_SIZE 4096 +#ifndef BUF_SIZE +#define BUF_SIZE 4096 #endif #define MB(x) (x << 20) diff --git a/tools/testing/selftests/cgroup/test_core.c b/tools/testing/selftests/cgroup/test_core.c index 7b83c7e7c9d4..88ca832d4fc1 100644 --- a/tools/testing/selftests/cgroup/test_core.c +++ b/tools/testing/selftests/cgroup/test_core.c @@ -87,7 +87,7 @@ static int test_cgcore_destroy(const char *root) int ret = KSFT_FAIL; char *cg_test = NULL; int child_pid; - char buf[PAGE_SIZE]; + char buf[BUF_SIZE]; cg_test = cg_name(root, "cg_test"); diff --git a/tools/testing/selftests/cgroup/test_cpu.c b/tools/testing/selftests/cgroup/test_cpu.c index c83f05438d7c..7a40d76b9548 100644 --- a/tools/testing/selftests/cgroup/test_cpu.c +++ b/tools/testing/selftests/cgroup/test_cpu.c @@ -278,7 +278,7 @@ static int test_cpucg_nice(const char *root) char buf[64]; snprintf(buf, 
sizeof(buf), "%d", getpid()); if (cg_write(cpucg, "cgroup.procs", buf)) - goto cleanup; + exit(EXIT_FAILURE); /* Try to keep niced CPU usage as constrained to hog_cpu as possible */ nice(1); diff --git a/tools/testing/selftests/cgroup/test_cpuset_prs.sh b/tools/testing/selftests/cgroup/test_cpuset_prs.sh index a56f4153c64d..0d41aa0d343d 100755 --- a/tools/testing/selftests/cgroup/test_cpuset_prs.sh +++ b/tools/testing/selftests/cgroup/test_cpuset_prs.sh @@ -492,6 +492,16 @@ REMOTE_TEST_MATRIX=( " C1-5:P1 . C1-4:P1 C2-3 . . \ . . . P1 . . p1:5|c11:1-4|c12:5 \ p1:P1|c11:P1|c12:P-1" + # Narrowing cpuset.cpus to previously sibling-excluded CPUs should + # not return CPUs that were never actually owned. + " C1-4:P1 . C1-2:P1 C1-3:P2 . . \ + . . . C3 . . p1:4|c11:1-2|c12:3 \ + p1:P1|c11:P1|c12:P2 3" + # Expanding cpuset.cpus to include a previously sibling-excluded CPU + # after the sibling has become a member should correctly request it. + " C1-4:P1 . C1-2:P1 C1-3:P2 . . \ + . . P0 C2-3 . . p1:1,4|c11:1|c12:2-3 \ + p1:P1|c11:P0|c12:P2 2-3" ) # @@ -617,7 +627,7 @@ set_ctrl_state_noerr() online_cpus() { - [[ -n "OFFLINE_CPUS" ]] && { + [[ -n "$OFFLINE_CPUS" ]] && { for C in $OFFLINE_CPUS do write_cpu_online ${C}=1 diff --git a/tools/testing/selftests/cgroup/test_cpuset_v1_base.sh b/tools/testing/selftests/cgroup/test_cpuset_v1_base.sh index 42a6628fb8bc..1c0444729e70 100755 --- a/tools/testing/selftests/cgroup/test_cpuset_v1_base.sh +++ b/tools/testing/selftests/cgroup/test_cpuset_v1_base.sh @@ -18,7 +18,7 @@ write_test() { echo "testing $interface $value" echo $value > $dir/$interface new=$(cat $dir/$interface) - [[ $value -ne $(cat $dir/$interface) ]] && { + [[ "$value" != "$new" ]] && { echo "$interface write $value failed: new:$new" exit 1 } diff --git a/tools/testing/selftests/cgroup/test_freezer.c b/tools/testing/selftests/cgroup/test_freezer.c index 97fae92c8387..0569e93fa6b0 100644 --- a/tools/testing/selftests/cgroup/test_freezer.c +++ 
b/tools/testing/selftests/cgroup/test_freezer.c @@ -642,7 +642,7 @@ cleanup: */ static int proc_check_stopped(int pid) { - char buf[PAGE_SIZE]; + char buf[BUF_SIZE]; int len; len = proc_read_text(pid, 0, "stat", buf, sizeof(buf)); @@ -1353,7 +1353,7 @@ static int test_cgfreezer_time_child(const char *root) } if (ctime <= ptime) { - debug("Expect ctime (%ld) <= ptime (%ld)\n", ctime, ptime); + debug("Expect ctime (%ld) > ptime (%ld)\n", ctime, ptime); goto cleanup; } diff --git a/tools/testing/selftests/cgroup/test_hugetlb_memcg.c b/tools/testing/selftests/cgroup/test_hugetlb_memcg.c index f451aa449be6..b627d84358b1 100644 --- a/tools/testing/selftests/cgroup/test_hugetlb_memcg.c +++ b/tools/testing/selftests/cgroup/test_hugetlb_memcg.c @@ -217,6 +217,14 @@ int main(int argc, char **argv) if (cg_find_unified_root(root, sizeof(root), NULL)) ksft_exit_skip("cgroup v2 isn't mounted\n"); + if (cg_read_strstr(root, "cgroup.controllers", "memory")) + ksft_exit_skip("memory controller isn't available\n"); + + if (cg_read_strstr(root, "cgroup.subtree_control", "memory")) { + if (cg_write(root, "cgroup.subtree_control", "+memory")) + ksft_exit_skip("Failed to set memory controller\n"); + } + switch (test_hugetlb_memcg(root)) { case KSFT_PASS: ksft_test_result_pass("test_hugetlb_memcg\n"); diff --git a/tools/testing/selftests/cgroup/test_kmem.c b/tools/testing/selftests/cgroup/test_kmem.c index eeabd34bf083..1db0ba1226b9 100644 --- a/tools/testing/selftests/cgroup/test_kmem.c +++ b/tools/testing/selftests/cgroup/test_kmem.c @@ -24,7 +24,7 @@ * the maximum discrepancy between charge and vmstat entries is number * of cpus multiplied by 64 pages. 
*/ -#define MAX_VMSTAT_ERROR (4096 * 64 * get_nprocs()) +#define MAX_VMSTAT_ERROR (sysconf(_SC_PAGESIZE) * 64 * get_nprocs()) #define KMEM_DEAD_WAIT_RETRIES 80 @@ -353,7 +353,7 @@ static int test_percpu_basic(const char *root) { int ret = KSFT_FAIL; char *parent, *child; - long current, percpu; + long current, percpu, slab; int i; parent = cg_name(root, "percpu_basic_test"); @@ -368,24 +368,29 @@ static int test_percpu_basic(const char *root) for (i = 0; i < 1000; i++) { child = cg_name_indexed(parent, "child", i); - if (!child) - return -1; + if (!child) { + ret = -1; + goto cleanup_children; + } - if (cg_create(child)) + if (cg_create(child)) { + free(child); goto cleanup_children; + } free(child); } current = cg_read_long(parent, "memory.current"); percpu = cg_read_key_long(parent, "memory.stat", "percpu "); + slab = cg_read_key_long(parent, "memory.stat", "slab "); - if (current > 0 && percpu > 0 && labs(current - percpu) < - MAX_VMSTAT_ERROR) + if (current > 0 && percpu > 0 && slab >= 0 && + labs(current - (percpu + slab)) < MAX_VMSTAT_ERROR) ret = KSFT_PASS; else - printf("memory.current %ld\npercpu %ld\n", - current, percpu); + printf("memory.current %ld\npercpu %ld\nslab %ld\ndelta %ld\n", + current, percpu, slab, current - (percpu + slab)); cleanup_children: for (i = 0; i < 1000; i++) { diff --git a/tools/testing/selftests/cgroup/test_memcontrol.c b/tools/testing/selftests/cgroup/test_memcontrol.c index b43da9bc20c4..0ebf796f3cff 100644 --- a/tools/testing/selftests/cgroup/test_memcontrol.c +++ b/tools/testing/selftests/cgroup/test_memcontrol.c @@ -26,6 +26,7 @@ static bool has_localevents; static bool has_recursiveprot; +static int page_size; int get_temp_fd(void) { @@ -34,7 +35,7 @@ int get_temp_fd(void) int alloc_pagecache(int fd, size_t size) { - char buf[PAGE_SIZE]; + char buf[BUF_SIZE]; struct stat st; int i; @@ -55,22 +56,38 @@ cleanup: return -1; } -int alloc_anon(const char *cgroup, void *arg) +static char *alloc_and_populate_anon(size_t size) { - 
size_t size = (unsigned long)arg; char *buf, *ptr; buf = malloc(size); - for (ptr = buf; ptr < buf + size; ptr += PAGE_SIZE) + if (buf == NULL) { + fprintf(stderr, "malloc() failed\n"); + return NULL; + } + + for (ptr = buf; ptr < buf + size; ptr += page_size) *ptr = 0; + return buf; +} + +int alloc_anon(const char *cgroup, void *arg) +{ + size_t size = (unsigned long)arg; + char *buf; + + buf = alloc_and_populate_anon(size); + if (!buf) + return -1; + free(buf); return 0; } int is_swap_enabled(void) { - char buf[PAGE_SIZE]; + char buf[BUF_SIZE]; const char delim[] = "\n"; int cnt = 0; char *line; @@ -113,7 +130,7 @@ static int test_memcg_subtree_control(const char *root) { char *parent, *child, *parent2 = NULL, *child2 = NULL; int ret = KSFT_FAIL; - char buf[PAGE_SIZE]; + char buf[BUF_SIZE]; /* Create two nested cgroups with the memory controller enabled */ parent = cg_name(root, "memcg_test_0"); @@ -174,18 +191,13 @@ cleanup_free: static int alloc_anon_50M_check(const char *cgroup, void *arg) { size_t size = MB(50); - char *buf, *ptr; + char *buf; long anon, current; int ret = -1; - buf = malloc(size); - if (buf == NULL) { - fprintf(stderr, "malloc() failed\n"); + buf = alloc_and_populate_anon(size); + if (!buf) return -1; - } - - for (ptr = buf; ptr < buf + size; ptr += PAGE_SIZE) - *ptr = 0; current = cg_read_long(cgroup, "memory.current"); if (current < size) @@ -406,16 +418,11 @@ static int alloc_anon_noexit(const char *cgroup, void *arg) { int ppid = getppid(); size_t size = (unsigned long)arg; - char *buf, *ptr; + char *buf; - buf = malloc(size); - if (buf == NULL) { - fprintf(stderr, "malloc() failed\n"); + buf = alloc_and_populate_anon(size); + if (!buf) return -1; - } - - for (ptr = buf; ptr < buf + size; ptr += PAGE_SIZE) - *ptr = 0; while (getppid() == ppid) sleep(1); @@ -990,18 +997,13 @@ static int alloc_anon_50M_check_swap(const char *cgroup, void *arg) { long mem_max = (long)arg; size_t size = MB(50); - char *buf, *ptr; + char *buf; long 
mem_current, swap_current; int ret = -1; - buf = malloc(size); - if (buf == NULL) { - fprintf(stderr, "malloc() failed\n"); + buf = alloc_and_populate_anon(size); + if (!buf) return -1; - } - - for (ptr = buf; ptr < buf + size; ptr += PAGE_SIZE) - *ptr = 0; mem_current = cg_read_long(cgroup, "memory.current"); if (!mem_current || !values_close(mem_current, mem_max, 3)) @@ -1791,6 +1793,10 @@ int main(int argc, char **argv) char root[PATH_MAX]; int i, proc_status; + page_size = sysconf(_SC_PAGE_SIZE); + if (page_size <= 0) + page_size = BUF_SIZE; + ksft_print_header(); ksft_set_plan(ARRAY_SIZE(tests)); if (cg_find_unified_root(root, sizeof(root), NULL)) diff --git a/tools/testing/selftests/cgroup/test_zswap.c b/tools/testing/selftests/cgroup/test_zswap.c index a7bdcdd09d62..49b36ee79160 100644 --- a/tools/testing/selftests/cgroup/test_zswap.c +++ b/tools/testing/selftests/cgroup/test_zswap.c @@ -11,10 +11,16 @@ #include <string.h> #include <sys/wait.h> #include <sys/mman.h> +#include <sys/random.h> #include "kselftest.h" #include "cgroup_util.h" +static int page_size; + +#define PATH_ZSWAP "/sys/module/zswap" +#define PATH_ZSWAP_ENABLED "/sys/module/zswap/parameters/enabled" + static int read_int(const char *path, size_t *value) { FILE *file; @@ -70,11 +76,11 @@ static int allocate_and_read_bytes(const char *cgroup, void *arg) if (!mem) return -1; - for (int i = 0; i < size; i += 4095) + for (int i = 0; i < size; i += page_size) mem[i] = 'a'; /* Go through the allocated memory to (z)swap in and out pages */ - for (int i = 0; i < size; i += 4095) { + for (int i = 0; i < size; i += page_size) { if (mem[i] != 'a') ret = -1; } @@ -90,7 +96,7 @@ static int allocate_bytes(const char *cgroup, void *arg) if (!mem) return -1; - for (int i = 0; i < size; i += 4095) + for (int i = 0; i < size; i += page_size) mem[i] = 'a'; free(mem); return 0; @@ -115,6 +121,27 @@ fail: } /* + * Writeback is asynchronous; poll until at least one writeback has + * been recorded for @cg, or 
until @timeout_ms has elapsed. + */ +static long wait_for_writeback(const char *cg, int timeout_ms) +{ + long elapsed, count; + for (elapsed = 0; elapsed < timeout_ms; elapsed += 100) { + count = get_cg_wb_count(cg); + + if (count < 0) + return -1; + if (count > 0) + return count; + + usleep(100000); + } + + return 0; +} + +/* * Sanity test to check that pages are written into zswap. */ static int test_zswap_usage(const char *root) @@ -162,21 +189,25 @@ out: static int test_swapin_nozswap(const char *root) { int ret = KSFT_FAIL; - char *test_group; - long swap_peak, zswpout; + char *test_group, mem_max_buf[32]; + long swap_peak, zswpout, min_swap; + size_t allocation_size = page_size * 512; + + min_swap = allocation_size / 4; + snprintf(mem_max_buf, sizeof(mem_max_buf), "%zu", allocation_size * 3/4); test_group = cg_name(root, "no_zswap_test"); if (!test_group) goto out; if (cg_create(test_group)) goto out; - if (cg_write(test_group, "memory.max", "8M")) + if (cg_write(test_group, "memory.max", mem_max_buf)) goto out; if (cg_write(test_group, "memory.zswap.max", "0")) goto out; /* Allocate and read more than memory.max to trigger swapin */ - if (cg_run(test_group, allocate_and_read_bytes, (void *)MB(32))) + if (cg_run(test_group, allocate_and_read_bytes, (void *)allocation_size)) goto out; /* Verify that pages are swapped out, but no zswap happened */ @@ -186,8 +217,9 @@ static int test_swapin_nozswap(const char *root) goto out; } - if (swap_peak < MB(24)) { - ksft_print_msg("at least 24MB of memory should be swapped out\n"); + if (swap_peak < min_swap) { + ksft_print_msg("at least %ldKB of memory should be swapped out\n", + min_swap / 1024); goto out; } @@ -237,7 +269,7 @@ static int test_zswapin(const char *root) goto out; } - if (zswpin < MB(24) / PAGE_SIZE) { + if (zswpin < MB(24) / page_size) { ksft_print_msg("at least 24MB should be brought back from zswap\n"); goto out; } @@ -257,16 +289,15 @@ out: This will move it into zswap. * 3. Save current zswap usage. 
* 4. Move the memory allocated in step 1 back in from zswap. - * 5. Set zswap.max to half the amount that was recorded in step 3. + * 5. Set zswap.max to 1/4 of the amount that was recorded in step 3. * 6. Attempt to reclaim memory equal to the amount that was allocated, this will either trigger writeback if it's enabled, or reclamation will fail if writeback is disabled as there isn't enough zswap space. */ static int attempt_writeback(const char *cgroup, void *arg) { - long pagesize = sysconf(_SC_PAGESIZE); - size_t memsize = MB(4); - char buf[pagesize]; + size_t memsize = page_size * 1024; + char buf[page_size]; long zswap_usage; bool wb_enabled = *(bool *) arg; int ret = -1; @@ -281,11 +312,11 @@ static int attempt_writeback(const char *cgroup, void *arg) * half empty, this will result in data that is still compressible * and ends up in zswap, with material zswap usage. */ - for (int i = 0; i < pagesize; i++) - buf[i] = i < pagesize/2 ? (char) i : 0; + for (int i = 0; i < page_size; i++) + buf[i] = i < page_size/2 ? (char) i : 0; - for (int i = 0; i < memsize; i += pagesize) - memcpy(&mem[i], buf, pagesize); + for (int i = 0; i < memsize; i += page_size) + memcpy(&mem[i], buf, page_size); /* Try and reclaim allocated memory */ if (cg_write_numeric(cgroup, "memory.reclaim", memsize)) { @@ -296,19 +327,19 @@ static int attempt_writeback(const char *cgroup, void *arg) zswap_usage = cg_read_long(cgroup, "memory.zswap.current"); /* zswpin */ - for (int i = 0; i < memsize; i += pagesize) { - if (memcmp(&mem[i], buf, pagesize)) { + for (int i = 0; i < memsize; i += page_size) { + if (memcmp(&mem[i], buf, page_size)) { ksft_print_msg("invalid memory\n"); goto out; } } - if (cg_write_numeric(cgroup, "memory.zswap.max", zswap_usage/2)) + if (cg_write_numeric(cgroup, "memory.zswap.max", zswap_usage/4)) goto out; /* * If writeback is enabled, trying to reclaim memory now will trigger a - * writeback as zswap.max is half of what was needed when reclaim ran the first time. 
+ * writeback as zswap.max is 1/4 of what was needed when reclaim ran the first time. * If writeback is disabled, memory reclaim will fail as zswap is limited and * it can't writeback to swap. */ @@ -335,7 +366,10 @@ static int test_zswap_writeback_one(const char *cgroup, bool wb) return -1; /* Verify that zswap writeback occurred only if writeback was enabled */ - zswpwb_after = get_cg_wb_count(cgroup); + if (wb) + zswpwb_after = wait_for_writeback(cgroup, 5000); + else + zswpwb_after = get_cg_wb_count(cgroup); if (zswpwb_after < 0) return -1; @@ -417,44 +451,71 @@ static int test_zswap_writeback_disabled(const char *root) static int test_no_invasive_cgroup_shrink(const char *root) { int ret = KSFT_FAIL; - size_t control_allocation_size = MB(10); - char *control_allocation = NULL, *wb_group = NULL, *control_group = NULL; + unsigned int off; + size_t allocation_size = page_size * 1024; + unsigned int nr_pages = allocation_size / page_size; + char zswap_max_buf[32], mem_max_buf[32]; + char *zw_allocation = NULL, *wb_allocation = NULL; + char *zw_group = NULL, *wb_group = NULL; + + snprintf(zswap_max_buf, sizeof(zswap_max_buf), "%d", page_size); + snprintf(mem_max_buf, sizeof(mem_max_buf), "%zu", allocation_size / 2); wb_group = setup_test_group_1M(root, "per_memcg_wb_test1"); if (!wb_group) return KSFT_FAIL; - if (cg_write(wb_group, "memory.zswap.max", "10K")) + if (cg_write(wb_group, "memory.zswap.max", zswap_max_buf)) + goto out; + if (cg_write(wb_group, "memory.max", mem_max_buf)) + goto out; + + zw_group = setup_test_group_1M(root, "per_memcg_wb_test2"); + if (!zw_group) goto out; - control_group = setup_test_group_1M(root, "per_memcg_wb_test2"); - if (!control_group) + if (cg_write(zw_group, "memory.max", mem_max_buf)) goto out; - /* Push some test_group2 memory into zswap */ - if (cg_enter_current(control_group)) + /* Push some zw_group memory into zswap (simple data, easy to compress) */ + if (cg_enter_current(zw_group)) goto out; - control_allocation = 
malloc(control_allocation_size); - for (int i = 0; i < control_allocation_size; i += 4095) - control_allocation[i] = 'a'; - if (cg_read_key_long(control_group, "memory.stat", "zswapped") < 1) + zw_allocation = malloc(allocation_size); + for (int i = 0; i < nr_pages; i++) { + off = (unsigned long)i * page_size; + memset(&zw_allocation[off], 0, page_size); + memset(&zw_allocation[off], 'a', page_size/4); + } + if (cg_read_key_long(zw_group, "memory.stat", "zswapped") < 1) goto out; - /* Allocate 10x memory.max to push wb_group memory into zswap and trigger wb */ - if (cg_run(wb_group, allocate_bytes, (void *)MB(10))) + /* Push wb_group memory into zswap with hard-to-compress data to trigger wb */ + if (cg_enter_current(wb_group)) goto out; + wb_allocation = malloc(allocation_size); + if (!wb_allocation) + goto out; + for (int i = 0; i < nr_pages; i++) { + off = (unsigned long)i * page_size; + memset(&wb_allocation[off], 0, page_size); + getrandom(&wb_allocation[off], page_size/4, 0); + } /* Verify that only zswapped memory from gwb_group has been written back */ - if (get_cg_wb_count(wb_group) > 0 && get_cg_wb_count(control_group) == 0) + if (wait_for_writeback(wb_group, 5000) > 0 && get_cg_wb_count(zw_group) == 0) ret = KSFT_PASS; out: cg_enter_current(root); - if (control_group) { - cg_destroy(control_group); - free(control_group); + if (zw_group) { + cg_destroy(zw_group); + free(zw_group); } - cg_destroy(wb_group); - free(wb_group); - if (control_allocation) - free(control_allocation); + if (wb_group) { + cg_destroy(wb_group); + free(wb_group); + } + if (zw_allocation) + free(zw_allocation); + if (wb_allocation) + free(wb_allocation); return ret; } @@ -473,7 +534,7 @@ static int no_kmem_bypass_child(const char *cgroup, void *arg) values->child_allocated = true; return -1; } - for (long i = 0; i < values->target_alloc_bytes; i += 4095) + for (long i = 0; i < values->target_alloc_bytes; i += page_size) ((char *)allocation)[i] = 'a'; values->child_allocated = true; 
pause(); @@ -521,7 +582,7 @@ static int test_no_kmem_bypass(const char *root) min_free_kb_low = sys_info.totalram / 500000; values->target_alloc_bytes = (sys_info.totalram - min_free_kb_high * 1000) + sys_info.totalram * 5 / 100; - stored_pages_threshold = sys_info.totalram / 5 / 4096; + stored_pages_threshold = sys_info.totalram / 5 / page_size; trigger_allocation_size = sys_info.totalram / 20; /* Set up test memcg */ @@ -548,7 +609,7 @@ static int test_no_kmem_bypass(const char *root) if (!trigger_allocation) break; - for (int i = 0; i < trigger_allocation_size; i += 4095) + for (int i = 0; i < trigger_allocation_size; i += page_size) trigger_allocation[i] = 'b'; usleep(100000); free(trigger_allocation); @@ -559,8 +620,8 @@ static int test_no_kmem_bypass(const char *root) /* If memory was pushed to zswap, verify it belongs to memcg */ if (stored_pages > stored_pages_threshold) { int zswapped = cg_read_key_long(test_group, "memory.stat", "zswapped "); - int delta = stored_pages * 4096 - zswapped; - int result_ok = delta < stored_pages * 4096 / 4; + int delta = stored_pages * page_size - zswapped; + int result_ok = delta < stored_pages * page_size / 4; ret = result_ok ? 
KSFT_PASS : KSFT_FAIL; break; @@ -614,7 +675,7 @@ static int allocate_random_and_wait(const char *cgroup, void *arg) close(fd); /* Touch all pages to ensure they're faulted in */ - for (size_t i = 0; i < size; i += PAGE_SIZE) + for (size_t i = 0; i < size; i += page_size) mem[i] = mem[i]; /* Use MADV_PAGEOUT to push pages into zswap */ @@ -725,9 +786,18 @@ struct zswap_test { }; #undef T -static bool zswap_configured(void) +static void check_zswap_enabled(void) { - return access("/sys/module/zswap", F_OK) == 0; + char value[2]; + + if (access(PATH_ZSWAP, F_OK)) + ksft_exit_skip("zswap isn't configured\n"); + + if (read_text(PATH_ZSWAP_ENABLED, value, sizeof(value)) <= 0) + ksft_exit_fail_msg("Failed to read " PATH_ZSWAP_ENABLED "\n"); + + if (value[0] == 'N') + ksft_exit_skip("zswap is disabled (hint: echo 1 > " PATH_ZSWAP_ENABLED ")\n"); } int main(int argc, char **argv) @@ -735,13 +805,16 @@ int main(int argc, char **argv) char root[PATH_MAX]; int i; + page_size = sysconf(_SC_PAGE_SIZE); + if (page_size <= 0) + page_size = BUF_SIZE; + ksft_print_header(); ksft_set_plan(ARRAY_SIZE(tests)); if (cg_find_unified_root(root, sizeof(root), NULL)) ksft_exit_skip("cgroup v2 isn't mounted\n"); - if (!zswap_configured()) - ksft_exit_skip("zswap isn't configured\n"); + check_zswap_enabled(); /* * Check that memory controller is available: diff --git a/tools/testing/selftests/clone3/clone3_cap_checkpoint_restore.c b/tools/testing/selftests/clone3/clone3_cap_checkpoint_restore.c index e82281efa273..ab62bcf4107d 100644 --- a/tools/testing/selftests/clone3/clone3_cap_checkpoint_restore.c +++ b/tools/testing/selftests/clone3/clone3_cap_checkpoint_restore.c @@ -53,9 +53,6 @@ static int call_clone3_set_tid(struct __test_metadata *_metadata, } if (pid == 0) { - int ret; - char tmp = 0; - TH_LOG("I am the child, my PID is %d (expected %d)", getpid(), set_tid[0]); if (set_tid[0] != getpid()) @@ -87,15 +84,11 @@ static int test_clone3_set_tid(struct __test_metadata *_metadata, return 
ret; } -struct libcap { - struct __user_cap_header_struct hdr; - struct __user_cap_data_struct data[2]; -}; - static int set_capability(void) { - cap_value_t cap_values[] = { CAP_SETUID, CAP_SETGID }; - struct libcap *cap; + cap_value_t cap_values[] = { + CAP_SETUID, CAP_SETGID, CAP_CHECKPOINT_RESTORE + }; int ret = -1; cap_t caps; @@ -111,14 +104,8 @@ static int set_capability(void) goto out; } - cap_set_flag(caps, CAP_EFFECTIVE, 2, cap_values, CAP_SET); - cap_set_flag(caps, CAP_PERMITTED, 2, cap_values, CAP_SET); - - cap = (struct libcap *) caps; - - /* 40 -> CAP_CHECKPOINT_RESTORE */ - cap->data[1].effective |= 1 << (40 - 32); - cap->data[1].permitted |= 1 << (40 - 32); + cap_set_flag(caps, CAP_EFFECTIVE, 3, cap_values, CAP_SET); + cap_set_flag(caps, CAP_PERMITTED, 3, cap_values, CAP_SET); if (cap_set_proc(caps)) { perror("cap_set_proc"); @@ -135,7 +122,6 @@ TEST(clone3_cap_checkpoint_restore) { pid_t pid; int status; - int ret = 0; pid_t set_tid[1]; test_clone3_supported(); diff --git a/tools/testing/selftests/damon/_damon_sysfs.py b/tools/testing/selftests/damon/_damon_sysfs.py index 2b4df655d9fd..8b12cc048440 100644 --- a/tools/testing/selftests/damon/_damon_sysfs.py +++ b/tools/testing/selftests/damon/_damon_sysfs.py @@ -132,14 +132,17 @@ class DamosQuota: goals = None # quota goals goal_tuner = None # quota goal tuner reset_interval_ms = None # quota reset interval + fail_charge_num = None + fail_charge_denom = None weight_sz_permil = None weight_nr_accesses_permil = None weight_age_permil = None scheme = None # owner scheme def __init__(self, sz=0, ms=0, goals=None, goal_tuner='consist', - reset_interval_ms=0, weight_sz_permil=0, - weight_nr_accesses_permil=0, weight_age_permil=0): + reset_interval_ms=0, fail_charge_num=0, fail_charge_denom=0, + weight_sz_permil=0, weight_nr_accesses_permil=0, + weight_age_permil=0): self.sz = sz self.ms = ms self.reset_interval_ms = reset_interval_ms @@ -151,6 +154,8 @@ class DamosQuota: for idx, goal in 
enumerate(self.goals): goal.idx = idx goal.quota = self + self.fail_charge_num = fail_charge_num + self.fail_charge_denom = fail_charge_denom def sysfs_dir(self): return os.path.join(self.scheme.sysfs_dir(), 'quotas') @@ -197,6 +202,18 @@ class DamosQuota: os.path.join(self.sysfs_dir(), 'goal_tuner'), self.goal_tuner) if err is not None: return err + + err = write_file( + os.path.join(self.sysfs_dir(), 'fail_charge_num'), + self.fail_charge_num) + if err is not None: + return err + err = write_file( + os.path.join(self.sysfs_dir(), 'fail_charge_denom'), + self.fail_charge_denom) + if err is not None: + return err + return None class DamosWatermarks: @@ -604,10 +621,11 @@ class DamonCtx: targets = None schemes = None kdamond = None + pause = None idx = None def __init__(self, ops='paddr', monitoring_attrs=DamonAttrs(), targets=[], - schemes=[]): + schemes=[], pause=False): self.ops = ops self.monitoring_attrs = monitoring_attrs self.monitoring_attrs.context = self @@ -622,6 +640,8 @@ class DamonCtx: scheme.idx = idx scheme.context = self + self.pause=pause + def sysfs_dir(self): return os.path.join(self.kdamond.sysfs_dir(), 'contexts', '%d' % self.idx) @@ -662,6 +682,11 @@ class DamonCtx: err = scheme.stage() if err is not None: return err + + err = write_file(os.path.join(self.sysfs_dir(), 'pause'), self.pause) + if err is not None: + return err + return None class Kdamond: diff --git a/tools/testing/selftests/damon/drgn_dump_damon_status.py b/tools/testing/selftests/damon/drgn_dump_damon_status.py index af99b07a4f56..972948e6215f 100755 --- a/tools/testing/selftests/damon/drgn_dump_damon_status.py +++ b/tools/testing/selftests/damon/drgn_dump_damon_status.py @@ -112,6 +112,8 @@ def damos_quota_to_dict(quota): ['goals', damos_quota_goals_to_list], ['goal_tuner', int], ['esz', int], + ['fail_charge_num', int], + ['fail_charge_denom', int], ['weight_sz', int], ['weight_nr_accesses', int], ['weight_age', int], @@ -200,6 +202,7 @@ def damon_ctx_to_dict(ctx): ['attrs', 
attrs_to_dict], ['adaptive_targets', targets_to_list], ['schemes', schemes_to_list], + ['pause', bool], ]) def main(): diff --git a/tools/testing/selftests/damon/sysfs.py b/tools/testing/selftests/damon/sysfs.py index 3aa5c91548a5..aa03a1187489 100755 --- a/tools/testing/selftests/damon/sysfs.py +++ b/tools/testing/selftests/damon/sysfs.py @@ -24,9 +24,12 @@ def dump_damon_status_dict(pid): except Exception as e: return None, 'json.load fail (%s)' % e +kdamonds = None def fail(expectation, status): print('unexpected %s' % expectation) print(json.dumps(status, indent=4)) + if kdamonds is not None: + kdamonds.stop() exit(1) def assert_true(condition, expectation, status): @@ -73,6 +76,10 @@ def assert_quota_committed(quota, dump): } assert_true(dump['goal_tuner'] == tuner_val[quota.goal_tuner], 'goal_tuner', dump) + assert_true(dump['fail_charge_num'] == quota.fail_charge_num, + 'fail_charge_num', dump) + assert_true(dump['fail_charge_denom'] == quota.fail_charge_denom, + 'fail_charge_denom', dump) assert_true(dump['weight_sz'] == quota.weight_sz_permil, 'weight_sz', dump) assert_true(dump['weight_nr_accesses'] == quota.weight_nr_accesses_permil, 'weight_nr_accesses', dump) @@ -123,11 +130,12 @@ def assert_scheme_committed(scheme, dump): 'pageout': 2, 'hugepage': 3, 'nohugeapge': 4, - 'lru_prio': 5, - 'lru_deprio': 6, - 'migrate_hot': 7, - 'migrate_cold': 8, - 'stat': 9, + 'collapse': 5, + 'lru_prio': 6, + 'lru_deprio': 7, + 'migrate_hot': 8, + 'migrate_cold': 9, + 'stat': 10, } assert_true(dump['action'] == action_val[scheme.action], 'action', dump) assert_true(dump['apply_interval_us'] == scheme. 
apply_interval_us, @@ -190,21 +198,60 @@ def assert_ctx_committed(ctx, dump): assert_monitoring_attrs_committed(ctx.monitoring_attrs, dump['attrs']) assert_monitoring_targets_committed(ctx.targets, dump['adaptive_targets']) assert_schemes_committed(ctx.schemes, dump['schemes']) + assert_true(dump['pause'] == ctx.pause, 'pause', dump) def assert_ctxs_committed(kdamonds): + ctxs_paused_for_dump = [] + kdamonds_paused_for_dump = [] + # pause for safe state dumping + for kd in kdamonds.kdamonds: + for ctx in kd.contexts: + if ctx.pause is False: + ctx.pause = True + ctxs_paused_for_dump.append(ctx) + if not kd in kdamonds_paused_for_dump: + kdamonds_paused_for_dump.append(kd) + if kd in kdamonds_paused_for_dump: + err = kd.commit() + if err is not None: + print('pause fail (%s)' % err) + kdamonds.stop() + exit(1) + status, err = dump_damon_status_dict(kdamonds.kdamonds[0].pid) if err is not None: print(err) kdamonds.stop() exit(1) + # resume contexts paused for safe state dumping + for ctx in ctxs_paused_for_dump: + ctx.pause = False + for kd in kdamonds_paused_for_dump: + err = kd.commit() + if err is not None: + print('resume fail (%s)' % err) + kdamonds.stop() + exit(1) + + # restore for comparison + for ctx in ctxs_paused_for_dump: + ctx.pause = True + ctxs = kdamonds.kdamonds[0].contexts dump = status['contexts'] assert_true(len(ctxs) == len(dump), 'ctxs length', dump) for idx, ctx in enumerate(ctxs): assert_ctx_committed(ctx, dump[idx]) + # restore for the caller + for kd in kdamonds.kdamonds: + for ctx in kd.contexts: + if ctx in ctxs_paused_for_dump: + ctx.pause = False + def main(): + global kdamonds kdamonds = _damon_sysfs.Kdamonds( [_damon_sysfs.Kdamond( contexts=[_damon_sysfs.DamonCtx( @@ -239,6 +286,8 @@ def main(): nid=1)], goal_tuner='temporal', reset_interval_ms=1500, + fail_charge_num=1, + fail_charge_denom=4096, weight_sz_permil=20, weight_nr_accesses_permil=200, weight_age_permil=1000), @@ -301,6 +350,7 @@ def main(): print('kdamond start failed: %s' 
% err) exit(1) kdamonds.kdamonds[0].contexts[0].targets[1].obsolete = True + kdamonds.kdamonds[0].contexts[0].pause = True kdamonds.kdamonds[0].commit() del kdamonds.kdamonds[0].contexts[0].targets[1] assert_ctxs_committed(kdamonds) diff --git a/tools/testing/selftests/damon/sysfs.sh b/tools/testing/selftests/damon/sysfs.sh index 83e3b7f63d81..78f4badb5beb 100755 --- a/tools/testing/selftests/damon/sysfs.sh +++ b/tools/testing/selftests/damon/sysfs.sh @@ -282,6 +282,17 @@ test_targets() ensure_dir "$targets_dir/1" "not_exist" } + +test_intervals_goal() +{ + goal_dir=$1 + ensure_dir "$goal_dir" "exist" + ensure_file "$goal_dir/access_bp" "exist" "600" + ensure_file "$goal_dir/aggrs" "exist" "600" + ensure_file "$goal_dir/min_sample_us" "exist" "600" + ensure_file "$goal_dir/max_sample_us" "exist" "600" +} + test_intervals() { intervals_dir=$1 @@ -289,6 +300,54 @@ test_intervals() ensure_file "$intervals_dir/aggr_us" "exist" "600" ensure_file "$intervals_dir/sample_us" "exist" "600" ensure_file "$intervals_dir/update_us" "exist" "600" + test_intervals_goal "$intervals_dir/intervals_goal" +} + +test_damon_filter() +{ + damon_filter_dir=$1 + ensure_file "$damon_filter_dir/type" "exist" "600" + ensure_write_succ "$damon_filter_dir/type" "anon" "valid input" + ensure_write_fail "$damon_filter_dir/type" "foo" "invalid input" + ensure_file "$damon_filter_dir/matching" "exist" "600" + ensure_file "$damon_filter_dir/allow" "exist" "600" +} + +test_damon_filters() +{ + filters_dir=$1 + ensure_dir "$filters_dir" "exist" + ensure_file "$filters_dir/nr_filters" "exist" "600" + ensure_write_succ "$filters_dir/nr_filters" "1" "valid input" + test_damon_filter "$filters_dir/0" + + ensure_write_succ "$filters_dir/nr_filters" "2" "valid input" + test_damon_filter "$filters_dir/0" + test_damon_filter "$filters_dir/1" + + ensure_write_succ "$filters_dir/nr_filters" "0" "valid input" + ensure_dir "$filters_dir/0" "not_exist" + ensure_dir "$filters_dir/1" "not_exist" +} + +test_probe() 
+{ + probe_dir=$1 + ensure_dir "$probe_dir" "exist" + test_damon_filters "$probe_dir/filters" +} + +test_probes() +{ + probes_dir=$1 + ensure_dir "$probes_dir" "exist" + ensure_file "$probes_dir/nr_probes" "exist" "600" + + ensure_write_succ "$probes_dir/nr_probes" "1" "valid input" + test_probe "$probes_dir/0" + + ensure_write_succ "$probes_dir/nr_probes" "0" "valid input" + ensure_dir "$probes_dir/0" "not_exist" } test_monitoring_attrs() @@ -296,6 +355,7 @@ test_monitoring_attrs() monitoring_attrs_dir=$1 ensure_dir "$monitoring_attrs_dir" "exist" test_intervals "$monitoring_attrs_dir/intervals" + test_probes "$monitoring_attrs_dir/probes" test_range "$monitoring_attrs_dir/nr_regions" } @@ -305,6 +365,8 @@ test_context() ensure_dir "$context_dir" "exist" ensure_file "$context_dir/avail_operations" "exit" 400 ensure_file "$context_dir/operations" "exist" 600 + ensure_file "$context_dir/addr_unit" "exist" 600 + ensure_file "$context_dir/pause" "exist" 600 test_monitoring_attrs "$context_dir/monitoring_attrs" test_targets "$context_dir/targets" test_schemes "$context_dir/schemes" diff --git a/tools/testing/selftests/drivers/net/.gitignore b/tools/testing/selftests/drivers/net/.gitignore index 585ecb4d5dc4..e5314ce4bb2d 100644 --- a/tools/testing/selftests/drivers/net/.gitignore +++ b/tools/testing/selftests/drivers/net/.gitignore @@ -1,3 +1,4 @@ # SPDX-License-Identifier: GPL-2.0-only napi_id_helper psp_responder +so_txtime diff --git a/tools/testing/selftests/drivers/net/Makefile b/tools/testing/selftests/drivers/net/Makefile index b72080c6d06b..d5bf4cb638a8 100644 --- a/tools/testing/selftests/drivers/net/Makefile +++ b/tools/testing/selftests/drivers/net/Makefile @@ -7,6 +7,7 @@ TEST_INCLUDES := $(wildcard lib/py/*.py) \ TEST_GEN_FILES := \ napi_id_helper \ + so_txtime \ # end of TEST_GEN_FILES TEST_PROGS := \ @@ -21,6 +22,7 @@ TEST_PROGS := \ queues.py \ ring_reconfig.py \ shaper.py \ + so_txtime.py \ stats.py \ xdp.py \ # end of TEST_PROGS diff --git 
a/tools/testing/selftests/drivers/net/bonding/Makefile b/tools/testing/selftests/drivers/net/bonding/Makefile index 9af5f84edd37..6364ca02642d 100644 --- a/tools/testing/selftests/drivers/net/bonding/Makefile +++ b/tools/testing/selftests/drivers/net/bonding/Makefile @@ -8,10 +8,12 @@ TEST_PROGS := \ bond-lladdr-target.sh \ bond_ipsec_offload.sh \ bond_lacp_prio.sh \ + bond_lacp_strict.sh \ bond_macvlan_ipvlan.sh \ bond_options.sh \ bond_passive_lacp.sh \ bond_stacked_header_parse.sh \ + bond_vlan_real_dev.sh \ dev_addr_lists.sh \ mode-1-recovery-updelay.sh \ mode-2-recovery-updelay.sh \ diff --git a/tools/testing/selftests/drivers/net/bonding/bond_lacp_strict.sh b/tools/testing/selftests/drivers/net/bonding/bond_lacp_strict.sh new file mode 100755 index 000000000000..f1a93a1d952f --- /dev/null +++ b/tools/testing/selftests/drivers/net/bonding/bond_lacp_strict.sh @@ -0,0 +1,347 @@ +#!/bin/bash +# SPDX-License-Identifier: GPL-2.0 +# +# Testing if bond lacp_strict works +# +# Partner (p_ns) +# +--------------------------+ +# | bond0 | +# | + | +# | eth0 | eth1 | +# | +---+---+ | +# | | | | +# +--------------------------+ +# | | eth0 | eth1 | +# | | | | +# |(br_ns) | br0 | br1 | +# | | | | +# | | eth2 | eth3 | +# +--------------------------+ +# | | | | +# | +---+---+ | +# | eth0 | eth1 | +# | + | +# | bond0 | +# +--------------------------+ +# Dut (d_ns) + +lib_dir=$(dirname "$0") +# shellcheck disable=SC1090 +source "$lib_dir"/../../../net/lib.sh + +COLLECTING_DISTRIBUTING_MASK=48 +COLLECTING_DISTRIBUTING=48 +FAILED=0 + +setup_links() +{ + # shellcheck disable=SC2154 + ip -n "${p_ns}" link add eth0 type veth peer name eth0 netns "${br_ns}" + ip -n "${p_ns}" link add eth1 type veth peer name eth1 netns "${br_ns}" + ip -n "${d_ns}" link add eth0 type veth peer name eth2 netns "${br_ns}" + ip -n "${d_ns}" link add eth1 type veth peer name eth3 netns "${br_ns}" + + ip -n "${br_ns}" link add br0 type bridge + ip -n "${br_ns}" link add br1 type bridge + + ip -n "${br_ns}" 
link set dev br0 type bridge stp_state 0 + ip -n "${br_ns}" link set dev br1 type bridge stp_state 0 + + ip -n "${br_ns}" link set eth0 master br0 + ip -n "${br_ns}" link set eth2 master br0 + ip -n "${br_ns}" link set eth1 master br1 + ip -n "${br_ns}" link set eth3 master br1 + + # Allow LACP trames forwarding on bridge ports + ip netns exec "${br_ns}" sh -c "echo 4 > /sys/class/net/br0/brif/eth0/group_fwd_mask" + ip netns exec "${br_ns}" sh -c "echo 4 > /sys/class/net/br1/brif/eth1/group_fwd_mask" + ip netns exec "${br_ns}" sh -c "echo 4 > /sys/class/net/br0/brif/eth2/group_fwd_mask" + ip netns exec "${br_ns}" sh -c "echo 4 > /sys/class/net/br1/brif/eth3/group_fwd_mask" + + ip -n "${br_ns}" link set eth0 up + ip -n "${br_ns}" link set eth2 up + ip -n "${br_ns}" link set eth1 up + ip -n "${br_ns}" link set eth3 up + + ip -n "${br_ns}" link set br0 up + ip -n "${br_ns}" link set br1 up + + ip -n "${d_ns}" link add bond0 type bond mode 802.3ad miimon 100 \ + lacp_rate fast min_links 1 + ip -n "${p_ns}" link add bond0 type bond mode 802.3ad miimon 100 \ + lacp_rate fast min_links 1 + + ip -n "${d_ns}" link set eth0 master bond0 + ip -n "${d_ns}" link set eth1 master bond0 + ip -n "${p_ns}" link set eth0 master bond0 + ip -n "${p_ns}" link set eth1 master bond0 + + ip -n "${d_ns}" link set bond0 up + ip -n "${p_ns}" link set bond0 up +} + +test_master_carrier() { + local expected=$1 + local mode_name=$2 + local carrier + + carrier=$(ip netns exec "${d_ns}" cat /sys/class/net/bond0/carrier) + [ "$carrier" == "1" ] && carrier="up" || carrier="down" + + [ "$carrier" == "$expected" ] && return + + echo "FAIL: Expected carrier $expected in lacp_strict $mode_name mode, got $carrier" + + RET=1 + +} + +compare_state() { + local actual_state=$1 + local expected_state=$2 + local iface=$3 + local last_attempt=$4 + + [ $((actual_state & COLLECTING_DISTRIBUTING_MASK)) -eq "$expected_state" ] \ + && return 0 + + [ "$last_attempt" -ne 1 ] && return 1 + + printf "FAIL: Expected LACP 
%s actor state to " "$iface" + if [ "$expected_state" -eq $COLLECTING_DISTRIBUTING ]; then + echo "be in Collecting/Distributing state" + else + echo "have neither Collecting nor Distributing set." + fi + + return 1 +} + +_test_lacp_port_state() { + local interface=$1 + local expected=$2 + local last_attempt=$3 + local eth0_actor_state eth1_actor_state + local ret=0 + + # shellcheck disable=SC2016 + while IFS='=' read -r k v; do + printf -v "$k" '%s' "$v" + done < <( + ip netns exec "${d_ns}" awk ' + /^Slave Interface: / { iface=$3 } + /details actor lacp pdu:/ { ctx="actor" } + /details partner lacp pdu:/ { ctx="partner" } + /^[[:space:]]+port state: / { + if (ctx == "actor") { + gsub(":", "", iface) + printf "%s_%s_state=%s\n", iface, ctx, $3 + } + } + ' /proc/net/bonding/bond0 + ) + + if [ "$interface" == "eth0" ] || [ "$interface" == "both" ]; then + compare_state "$eth0_actor_state" "$expected" eth0 "$last_attempt" || ret=1 + fi + + if [ "$interface" == "eth1" ] || [ "$interface" == "both" ]; then + compare_state "$eth1_actor_state" "$expected" eth1 "$last_attempt" || ret=1 + fi + + return $ret +} + +test_lacp_port_state() { + local interface=$1 + local expected=$2 + local retry=$3 + local last_attempt=0 + local attempt=1 + local ret=1 + + while [ $attempt -le $((retry + 1)) ]; do + [ $attempt -eq $((retry + 1)) ] && last_attempt=1 + _test_lacp_port_state "$interface" "$expected" "$last_attempt" && return + ((attempt++)) + sleep 1 + done + + RET=1 +} + + +trap cleanup_all_ns EXIT +setup_ns d_ns p_ns br_ns +setup_links + +# Initial state +RET=0 +mode=off +test_lacp_port_state both $COLLECTING_DISTRIBUTING 3 +test_master_carrier up $mode +log_test "bond LACP" "lacp_strict $mode - eth0 and eth1 up" + +# partner eth0 down, eth1 up +# (replicate eth0 state to dut eth0 by shutting a bridge port) +RET=0 +ip -n "${p_ns}" link set eth0 down +ip -n "${br_ns}" link set eth2 down +test_lacp_port_state eth0 $FAILED 5 +test_lacp_port_state eth1 $COLLECTING_DISTRIBUTING 1 
+test_master_carrier up $mode +log_test "bond LACP" "lacp_strict $mode - eth0 down" + +# partner eth0 and eth1 down +RET=0 +ip -n "${p_ns}" link set eth1 down +ip -n "${br_ns}" link set eth3 down +test_lacp_port_state both $FAILED 5 +test_master_carrier down $mode # down because of min_links +log_test "bond LACP" "lacp_strict $mode - eth0 and eth1 down" + +# partner eth0 up, eth1 down +RET=0 +ip -n "${p_ns}" link set eth0 up +ip -n "${br_ns}" link set eth2 up +test_lacp_port_state eth0 $COLLECTING_DISTRIBUTING 60 +test_lacp_port_state eth1 $FAILED 1 +test_master_carrier up $mode +log_test "bond LACP" "lacp_strict $mode - eth0 up, eth1 down" + +# partner eth0 and eth1 up +RET=0 +ip -n "${p_ns}" link set eth1 up +ip -n "${br_ns}" link set eth3 up +test_lacp_port_state both $COLLECTING_DISTRIBUTING 60 +test_master_carrier up $mode +log_test "bond LACP" "lacp_strict $mode - eth0 and eth1 up" + +# partner eth0 stops LACP and eth1 up +RET=0 +ip netns exec "${br_ns}" sh -c "echo 0 > /sys/class/net/br0/brif/eth0/group_fwd_mask" +ip netns exec "${br_ns}" sh -c "echo 0 > /sys/class/net/br0/brif/eth2/group_fwd_mask" +test_lacp_port_state eth0 $FAILED 5 +test_lacp_port_state eth1 $COLLECTING_DISTRIBUTING 1 +test_master_carrier up $mode +log_test "bond LACP" "lacp_strict $mode - eth0 stopped sending LACP" + +# partner eth0 and eth1 stop LACP +RET=0 +ip netns exec "${br_ns}" sh -c "echo 0 > /sys/class/net/br1/brif/eth1/group_fwd_mask" +ip netns exec "${br_ns}" sh -c "echo 0 > /sys/class/net/br1/brif/eth3/group_fwd_mask" +test_lacp_port_state both $FAILED 5 +test_master_carrier up $mode +log_test "bond LACP" "lacp_strict $mode - eth0 and eth1 stopped sending LACP" + +# switch to lacp_strict on +RET=0 +mode=on +ip -n "${d_ns}" link set dev bond0 type bond lacp_strict $mode +test_lacp_port_state both $FAILED 1 +test_master_carrier down $mode +log_test "bond LACP" "lacp_strict $mode - eth0 and eth1 stopped sending LACP" + +# switch back to lacp_strict off mode +RET=0 +mode=off +ip 
-n "${d_ns}" link set dev bond0 type bond lacp_strict $mode +test_lacp_port_state both $FAILED 1 +test_master_carrier up $mode +log_test "bond LACP" "lacp_strict $mode - eth0 and eth1 stopped sending LACP" + +# eth0 recovers LACP +RET=0 +ip netns exec "${br_ns}" sh -c "echo 4 > /sys/class/net/br0/brif/eth0/group_fwd_mask" +ip netns exec "${br_ns}" sh -c "echo 4 > /sys/class/net/br0/brif/eth2/group_fwd_mask" +test_lacp_port_state eth0 $COLLECTING_DISTRIBUTING 60 +test_lacp_port_state eth1 $FAILED 1 +test_master_carrier up $mode +log_test "bond LACP" "lacp_strict $mode - eth0 recovered and eth1 stopped sending LACP" + +# eth1 recovers LACP +RET=0 +ip netns exec "${br_ns}" sh -c "echo 4 > /sys/class/net/br1/brif/eth1/group_fwd_mask" +ip netns exec "${br_ns}" sh -c "echo 4 > /sys/class/net/br1/brif/eth3/group_fwd_mask" +test_lacp_port_state both $COLLECTING_DISTRIBUTING 60 +test_master_carrier up $mode +log_test "bond LACP" "lacp_strict $mode - eth0 and eth1 recovered LACP" + +# switch to lacp_strict on +RET=0 +mode=on +ip -n "${d_ns}" link set dev bond0 type bond lacp_strict $mode +test_lacp_port_state both $COLLECTING_DISTRIBUTING 1 +test_master_carrier up $mode +log_test "bond LACP" "lacp_strict $mode - eth0 and eth1 up" + +# partner eth0 down, eth1 up +RET=0 +ip -n "${p_ns}" link set eth0 down +ip -n "${br_ns}" link set eth2 down +test_lacp_port_state eth0 $FAILED 5 +test_lacp_port_state eth1 $COLLECTING_DISTRIBUTING 1 +test_master_carrier up $mode +log_test "bond LACP" "lacp_strict $mode - eth0 down" + +# partner eth0 and eth1 down +RET=0 +ip -n "${p_ns}" link set eth1 down +ip -n "${br_ns}" link set eth3 down +test_lacp_port_state both $FAILED 5 +test_master_carrier down $mode # down because of min_links +log_test "bond LACP" "lacp_strict $mode - eth0 and eth1 down" + +# partner eth0 up, eth1 down +RET=0 +ip -n "${p_ns}" link set eth0 up +ip -n "${br_ns}" link set eth2 up +test_lacp_port_state eth0 $COLLECTING_DISTRIBUTING 60 +test_lacp_port_state eth1 $FAILED 1 
+test_master_carrier up $mode +log_test "bond LACP" "lacp_strict $mode - eth0 up, eth1 down" + +# partner eth0 and eth1 up +RET=0 +ip -n "${p_ns}" link set eth1 up +ip -n "${br_ns}" link set eth3 up +test_lacp_port_state both $COLLECTING_DISTRIBUTING 60 +test_master_carrier up $mode +log_test "bond LACP" "lacp_strict $mode - eth0 and eth1 up" + +# partner eth0 stops LACP and eth1 up +RET=0 +ip netns exec "${br_ns}" sh -c "echo 0 > /sys/class/net/br0/brif/eth0/group_fwd_mask" +ip netns exec "${br_ns}" sh -c "echo 0 > /sys/class/net/br0/brif/eth2/group_fwd_mask" +test_lacp_port_state eth0 $FAILED 5 +test_lacp_port_state eth1 $COLLECTING_DISTRIBUTING 1 +test_master_carrier up $mode +log_test "bond LACP" "lacp_strict $mode - eth0 stopped sending LACP" + +# partner eth0 and eth1 stop LACP +RET=0 +ip netns exec "${br_ns}" sh -c "echo 0 > /sys/class/net/br1/brif/eth1/group_fwd_mask" +ip netns exec "${br_ns}" sh -c "echo 0 > /sys/class/net/br1/brif/eth3/group_fwd_mask" +test_lacp_port_state both $FAILED 5 +test_master_carrier down $mode +log_test "bond LACP" "lacp_strict $mode - eth0 and eth1 stopped sending LACP" + +# eth0 recovers LACP +RET=0 +ip netns exec "${br_ns}" sh -c "echo 4 > /sys/class/net/br0/brif/eth0/group_fwd_mask" +ip netns exec "${br_ns}" sh -c "echo 4 > /sys/class/net/br0/brif/eth2/group_fwd_mask" +test_lacp_port_state eth0 $COLLECTING_DISTRIBUTING 60 +test_lacp_port_state eth1 $FAILED 1 +test_master_carrier up $mode +log_test "bond LACP" "lacp_strict $mode - eth0 recovered and eth1 stopped sending LACP" + +# eth1 recovers LACP +# shellcheck disable=SC2034 +RET=0 +ip netns exec "${br_ns}" sh -c "echo 4 > /sys/class/net/br1/brif/eth1/group_fwd_mask" +ip netns exec "${br_ns}" sh -c "echo 4 > /sys/class/net/br1/brif/eth3/group_fwd_mask" +test_lacp_port_state both $COLLECTING_DISTRIBUTING 60 +test_master_carrier up $mode +log_test "bond LACP" "lacp_strict $mode - eth0 and eth1 recovered LACP" + +exit "${EXIT_STATUS}" diff --git 
a/tools/testing/selftests/drivers/net/bonding/bond_vlan_real_dev.sh b/tools/testing/selftests/drivers/net/bonding/bond_vlan_real_dev.sh new file mode 100755 index 000000000000..542d9ffc4819 --- /dev/null +++ b/tools/testing/selftests/drivers/net/bonding/bond_vlan_real_dev.sh @@ -0,0 +1,180 @@ +#!/bin/bash +# SPDX-License-Identifier: GPL-2.0 +# +# Test propagation of a real device's state to the VLANs stacked on top of it +# when the real device is (or becomes) a bond member. +# +# The kernel mirrors a real device's UP/DOWN, MTU and feature changes onto its +# VLANs. This is done asynchronously (netdev_work): doing it synchronously from +# the real device's notifier could deadlock. If the real device is brought up +# while enslaved to a bond - so its instance lock is held across NETDEV_UP - and +# a VLAN on top of it is itself a bond member, the synchronous propagation +# re-entered the stack and tried to take the same instance lock again. +# +# Cover both halves: +# - the deferred UP/DOWN, MTU and feature propagation actually lands on the +# VLAN (link state and MTU use an ops-locked dummy, i.e. the deferral path), +# - the deadlock-prone topology - a VLAN on a dummy, with the VLAN and the +# dummy each enslaved to a different bond - can be built without hanging. + +ALL_TESTS=" + vlan_link_state + vlan_mtu + vlan_features + vlan_real_dev_enslave +" + +REQUIRE_MZ=no +NUM_NETIFS=0 +lib_dir=$(dirname "$0") +source "$lib_dir"/../../../net/forwarding/lib.sh + +# Return 0 if $dev in netns $ns has flag $flag set (e.g. UP) in its <...> flags. +link_has_flag() +{ + local ns=$1 dev=$2 flag=$3 + + ip -n "$ns" link show dev "$dev" 2>/dev/null | grep -q "[<,]${flag}[,>]" +} + +link_lacks_flag() +{ + ! 
link_has_flag "$@" +} + +link_mtu_is() +{ + local ns=$1 dev=$2 want=$3 cur + + cur=$(ip -n "$ns" link show dev "$dev" 2>/dev/null | \ + sed -n 's/.* mtu \([0-9]\+\).*/\1/p') + [ "$cur" = "$want" ] +} + +vlan_feature_is() +{ + local ns=$1 dev=$2 feature=$3 value=$4 + + ip netns exec "$ns" ethtool -k "$dev" 2>/dev/null | \ + grep -q "^$feature: $value" +} + +link_has_master() +{ + local ns=$1 dev=$2 master=$3 + + ip -n "$ns" -o link show dev "$dev" 2>/dev/null | grep -q "master $master" +} + +vlan_link_state() +{ + RET=0 + + ip -n "$NS" link add ls_dummy type dummy + ip -n "$NS" link add link ls_dummy name ls_vlan type vlan id 100 + + # Bringing the real device up must propagate UP to the VLAN. + ip -n "$NS" link set ls_dummy up + busywait "$BUSYWAIT_TIMEOUT" link_has_flag "$NS" ls_vlan UP + check_err $? "VLAN did not go UP after the real device went UP" + + # ... and likewise for DOWN. + ip -n "$NS" link set ls_dummy down + busywait "$BUSYWAIT_TIMEOUT" link_lacks_flag "$NS" ls_vlan UP + check_err $? "VLAN did not go DOWN after the real device went DOWN" + + ip -n "$NS" link del ls_vlan + ip -n "$NS" link del ls_dummy + + log_test "VLAN link state follows the real device" +} + +vlan_mtu() +{ + RET=0 + + # The VLAN inherits the real device's MTU (2000) at creation time. + ip -n "$NS" link add mtu_dummy mtu 2000 type dummy + ip -n "$NS" link add link mtu_dummy name mtu_vlan type vlan id 100 + + # Shrinking the real device's MTU must clamp the VLAN's MTU. + ip -n "$NS" link set mtu_dummy mtu 1500 + busywait "$BUSYWAIT_TIMEOUT" link_mtu_is "$NS" mtu_vlan 1500 + check_err $? "VLAN MTU not clamped after the real device's MTU shrank" + + ip -n "$NS" link del mtu_vlan + ip -n "$NS" link del mtu_dummy + + log_test "VLAN MTU clamped to the real device" +} + +vlan_features() +{ + RET=0 + + # Use veth as the real device: unlike dummy it exports vlan_features, so + # the VLAN actually inherits a toggleable offload to assert on. 
+ ip -n "$NS" link add ft_veth type veth peer name ft_veth_pr + ip -n "$NS" link add link ft_veth name ft_vlan type vlan id 100 + + vlan_feature_is "$NS" ft_vlan scatter-gather on + check_err $? "VLAN did not inherit scatter-gather from the real device" + + # Toggling the offload on the real device must propagate to the VLAN. + ip netns exec "$NS" ethtool -K ft_veth sg off + busywait "$BUSYWAIT_TIMEOUT" \ + vlan_feature_is "$NS" ft_vlan scatter-gather off + check_err $? "VLAN scatter-gather still on after disabling it on real dev" + + ip netns exec "$NS" ethtool -K ft_veth sg on + busywait "$BUSYWAIT_TIMEOUT" \ + vlan_feature_is "$NS" ft_vlan scatter-gather on + check_err $? "VLAN scatter-gather still off after enabling it on real dev" + + ip -n "$NS" link del ft_vlan + ip -n "$NS" link del ft_veth + + log_test "VLAN features follow the real device" +} + +vlan_real_dev_enslave() +{ + RET=0 + + # dummy <- VLAN -> bond0, then enslave the dummy itself to bond1. The + # last step brings the dummy up under bond1's instance lock, which used + # to deadlock while synchronously propagating UP to the (bond-enslaved) + # VLAN on top. + ip -n "$NS" link add dl_dummy type dummy + ip -n "$NS" link set dl_dummy up + ip -n "$NS" link add link dl_dummy name dl_vlan type vlan id 100 + + ip -n "$NS" link add dl_bond0 type bond mode active-backup + ip -n "$NS" link set dl_vlan down + ip -n "$NS" link set dl_vlan master dl_bond0 + check_err $? "could not enslave the VLAN to bond0" + + ip -n "$NS" link add dl_bond1 type bond mode active-backup + ip -n "$NS" link set dl_dummy down + ip -n "$NS" link set dl_dummy master dl_bond1 + check_err $? "could not enslave the real device to bond1" + + # If we got here the kernel did not deadlock; make sure it is still + # responsive and the enslave really took effect. + link_has_master "$NS" dl_dummy dl_bond1 + check_err $? 
"real device not enslaved to bond1" + + ip -n "$NS" link del dl_bond1 + ip -n "$NS" link del dl_bond0 + ip -n "$NS" link del dl_vlan + ip -n "$NS" link del dl_dummy + + log_test "VLAN real device enslaved to a second bond" +} + +setup_ns NS +trap 'cleanup_ns $NS' EXIT + +tests_run + +exit "$EXIT_STATUS" diff --git a/tools/testing/selftests/drivers/net/config b/tools/testing/selftests/drivers/net/config index fd16994366f4..91d4fd410914 100644 --- a/tools/testing/selftests/drivers/net/config +++ b/tools/testing/selftests/drivers/net/config @@ -8,5 +8,10 @@ CONFIG_NETCONSOLE=m CONFIG_NETCONSOLE_DYNAMIC=y CONFIG_NETCONSOLE_EXTENDED_LOG=y CONFIG_NETDEVSIM=m +CONFIG_NETKIT=y +CONFIG_NET_SCH_ETF=m +CONFIG_NET_SCH_FQ=m +CONFIG_PPP=y +CONFIG_PPPOE=y CONFIG_VLAN_8021Q=m CONFIG_XDP_SOCKETS=y diff --git a/tools/testing/selftests/drivers/net/gro.py b/tools/testing/selftests/drivers/net/gro.py index 221f27e57147..6ab8c97880d1 100755 --- a/tools/testing/selftests/drivers/net/gro.py +++ b/tools/testing/selftests/drivers/net/gro.py @@ -40,7 +40,7 @@ import glob import os import re from lib.py import ksft_run, ksft_exit, ksft_pr -from lib.py import NetDrvEpEnv, KsftXfailEx +from lib.py import NetDrvEpEnv, KsftFailEx, KsftXfailEx from lib.py import NetdevFamily, EthtoolFamily from lib.py import bkg, cmd, defer, ethtool, ip from lib.py import ksft_variants, KsftNamedVariant @@ -132,11 +132,21 @@ def _get_queue_stats(cfg, queue_id): return {} +def _require_ntuple(cfg): + features = ethtool(f"-k {cfg.ifname}", json=True)[0] + if not features["ntuple-filters"]["active"]: + if features["ntuple-filters"]["fixed"]: + raise KsftXfailEx("Device does not support ntuple-filters") + ethtool(f"-K {cfg.ifname} ntuple-filters on") + defer(ethtool, f"-K {cfg.ifname} ntuple-filters off") + + def _setup_isolated_queue(cfg): """Set up an isolated queue for testing using ntuple filter. Remove queue 1 from the default RSS context and steer test traffic to it. 
""" + _require_ntuple(cfg) test_queue = 1 qcnt = len(glob.glob(f"/sys/class/net/{cfg.ifname}/queues/rx-*")) @@ -313,6 +323,12 @@ def _gro_variants(): "ip_frag6", "ip_v6ext_same", "ip_v6ext_diff", ] + # Tests specific to PPPoE + pppoe_tests = [ + "data_same", "data_lrg_sml", "data_sml_lrg", "data_lrg_1byte", + "data_burst", "pppoe_sid", + ] + for mode in ["sw", "hw", "lro"]: for protocol in ["ipv4", "ipv6", "ipip", "ip6ip6"]: for test_name in common_tests: @@ -325,6 +341,11 @@ def _gro_variants(): for test_name in ipv6_tests: yield mode, protocol, test_name + for mode in ["sw"]: + for protocol in ["pppoev4", "pppoev6"]: + for test_name in pppoe_tests: + yield mode, protocol, test_name + @ksft_variants(_gro_variants()) def test(cfg, mode, protocol, test_name): @@ -349,6 +370,12 @@ def test(cfg, mode, protocol, test_name): ksft_pr(rx_proc) + # ret==42 means the receiver detected over-coalescing. + # This is unambiguous proof of a bug, retries can only cause + # false negatives. + if rx_proc.ret == 42: + raise KsftFailEx(f"GRO over-coalesced in {protocol}/{test_name}") + if test_name.startswith("large_") and os.environ.get("KSFT_MACHINE_SLOW"): ksft_pr(f"Ignoring {protocol}/{test_name} failure due to slow environment") return diff --git a/tools/testing/selftests/drivers/net/hw/Makefile b/tools/testing/selftests/drivers/net/hw/Makefile index 85ca4d1ecf9e..fd0535a96d84 100644 --- a/tools/testing/selftests/drivers/net/hw/Makefile +++ b/tools/testing/selftests/drivers/net/hw/Makefile @@ -31,9 +31,11 @@ TEST_PROGS = \ hw_stats_l3.sh \ hw_stats_l3_gre.sh \ iou-zcrx.py \ + ipsec_vxlan.py \ irq.py \ loopback.sh \ nic_timestamp.py \ + nk_devmem.py \ nk_netns.py \ nk_qlease.py \ ntuple.py \ @@ -45,12 +47,14 @@ TEST_PROGS = \ rss_input_xfrm.py \ toeplitz.py \ tso.py \ + userns_devmem.py \ uso.py \ xdp_metadata.py \ xsk_reconfig.py \ # TEST_FILES := \ + devmem_lib.py \ ethtool_lib.sh \ # diff --git a/tools/testing/selftests/drivers/net/hw/config 
b/tools/testing/selftests/drivers/net/hw/config index dd50cb8a7911..ed8642b68094 100644 --- a/tools/testing/selftests/drivers/net/hw/config +++ b/tools/testing/selftests/drivers/net/hw/config @@ -3,14 +3,24 @@ CONFIG_FAIL_FUNCTION=y CONFIG_FAULT_INJECTION=y CONFIG_FAULT_INJECTION_DEBUG_FS=y CONFIG_FUNCTION_ERROR_INJECTION=y +CONFIG_HUGETLBFS=y +CONFIG_INET6_ESP=y +CONFIG_INET6_ESP_OFFLOAD=y +CONFIG_INET_ESP=y +CONFIG_INET_ESP_OFFLOAD=y CONFIG_IO_URING=y CONFIG_IPV6=y CONFIG_IPV6_GRE=y +CONFIG_IPV6_SIT=y +CONFIG_IPV6_TUNNEL=y CONFIG_NET_CLS_ACT=y CONFIG_NET_CLS_BPF=y CONFIG_NET_IPGRE=y CONFIG_NET_IPGRE_DEMUX=y +CONFIG_NET_IPIP=y CONFIG_NETKIT=y CONFIG_NET_SCH_INGRESS=y CONFIG_UDMABUF=y +CONFIG_USER_NS=y CONFIG_VXLAN=y +CONFIG_XFRM_USER=y diff --git a/tools/testing/selftests/drivers/net/hw/devmem.py b/tools/testing/selftests/drivers/net/hw/devmem.py index ee863e90d1e0..031cf9905f65 100755 --- a/tools/testing/selftests/drivers/net/hw/devmem.py +++ b/tools/testing/selftests/drivers/net/hw/devmem.py @@ -2,91 +2,40 @@ # SPDX-License-Identifier: GPL-2.0 from os import path -from lib.py import ksft_run, ksft_exit -from lib.py import ksft_eq, KsftSkipEx +from devmem_lib import setup_test, run_rx, run_tx, run_tx_chunks, run_rx_hds +from lib.py import ksft_run, ksft_exit, ksft_disruptive from lib.py import NetDrvEpEnv -from lib.py import bkg, cmd, rand_port, wait_port_listen -from lib.py import ksft_disruptive - - -def require_devmem(cfg): - if not hasattr(cfg, "_devmem_probed"): - probe_command = f"{cfg.bin_local} -f {cfg.ifname}" - cfg._devmem_supported = cmd(probe_command, fail=False, shell=True).ret == 0 - cfg._devmem_probed = True - - if not cfg._devmem_supported: - raise KsftSkipEx("Test requires devmem support") @ksft_disruptive def check_rx(cfg) -> None: - require_devmem(cfg) - - port = rand_port() - socat = f"socat -u - TCP{cfg.addr_ipver}:{cfg.baddr}:{port},bind={cfg.remote_baddr}:{port}" - listen_cmd = f"{cfg.bin_local} -l -f {cfg.ifname} -s {cfg.addr} -p {port} -c 
{cfg.remote_addr} -v 7" - - with bkg(listen_cmd, exit_wait=True) as ncdevmem: - wait_port_listen(port) - cmd(f"yes $(echo -e \x01\x02\x03\x04\x05\x06) | \ - head -c 1K | {socat}", host=cfg.remote, shell=True) - - ksft_eq(ncdevmem.ret, 0) + """Run the devmem RX test.""" + run_rx(cfg) @ksft_disruptive def check_tx(cfg) -> None: - require_devmem(cfg) - - port = rand_port() - listen_cmd = f"socat -U - TCP{cfg.addr_ipver}-LISTEN:{port}" - - with bkg(listen_cmd, host=cfg.remote, exit_wait=True) as socat: - wait_port_listen(port, host=cfg.remote) - cmd(f"echo -e \"hello\\nworld\"| {cfg.bin_local} -f {cfg.ifname} -s {cfg.remote_addr} -p {port}", shell=True) - - ksft_eq(socat.stdout.strip(), "hello\nworld") + """Run the devmem TX test.""" + run_tx(cfg) @ksft_disruptive def check_tx_chunks(cfg) -> None: - require_devmem(cfg) - - port = rand_port() - listen_cmd = f"socat -U - TCP{cfg.addr_ipver}-LISTEN:{port}" - - with bkg(listen_cmd, host=cfg.remote, exit_wait=True) as socat: - wait_port_listen(port, host=cfg.remote) - cmd(f"echo -e \"hello\\nworld\"| {cfg.bin_local} -f {cfg.ifname} -s {cfg.remote_addr} -p {port} -z 3", shell=True) - - ksft_eq(socat.stdout.strip(), "hello\nworld") + """Run the devmem TX chunking test.""" + run_tx_chunks(cfg) def check_rx_hds(cfg) -> None: - """Test HDS splitting across payload sizes.""" - require_devmem(cfg) - - for size in [1, 8, 16, 32, 64, 128, 256, 512, 1024, 2048, 4096, 8192]: - port = rand_port() - listen_cmd = f"{cfg.bin_local} -L -l -f {cfg.ifname} -s {cfg.addr} -p {port}" - - with bkg(listen_cmd, exit_wait=True) as ncdevmem: - wait_port_listen(port) - cmd(f"dd if=/dev/zero bs={size} count=1 2>/dev/null | " + - f"socat -b {size} -u - TCP{cfg.addr_ipver}:{cfg.baddr}:{port},nodelay", - host=cfg.remote, shell=True) - - ksft_eq(ncdevmem.ret, 0, f"HDS failed for payload size {size}") + """Run the HDS test.""" + run_rx_hds(cfg) def main() -> None: + """Run the devmem test cases.""" with NetDrvEpEnv(__file__) as cfg: - cfg.bin_local = 
path.abspath(path.dirname(__file__) + "/ncdevmem") - cfg.bin_remote = cfg.remote.deploy(cfg.bin_local) - + setup_test(cfg, path.abspath(path.dirname(__file__) + "/ncdevmem")) ksft_run([check_rx, check_tx, check_tx_chunks, check_rx_hds], - args=(cfg, )) + args=(cfg,)) ksft_exit() diff --git a/tools/testing/selftests/drivers/net/hw/devmem_lib.py b/tools/testing/selftests/drivers/net/hw/devmem_lib.py new file mode 100644 index 000000000000..0921ff03eb81 --- /dev/null +++ b/tools/testing/selftests/drivers/net/hw/devmem_lib.py @@ -0,0 +1,222 @@ +# SPDX-License-Identifier: GPL-2.0 +"""Shared helpers for devmem TCP selftests.""" + +import re + +from lib.py import (bkg, cmd, defer, ethtool, rand_port, wait_port_listen, + ksft_eq, KsftSkipEx, NetNSEnter, EthtoolFamily, + NetdevFamily) + + +def require_devmem(cfg): + """Probe ncdevmem on cfg.ifname and SKIP the test if devmem isn't supported.""" + if not hasattr(cfg, "devmem_probed"): + probe_command = f"{cfg.bin_local} -f {cfg.ifname}" + cfg.devmem_supported = cmd(probe_command, fail=False, shell=True).ret == 0 + cfg.devmem_probed = True + + if not cfg.devmem_supported: + raise KsftSkipEx("Test requires devmem support") + + +def configure_nic(cfg): + """Channels, rings, RSS, queue lease for netkit devmem.""" + if not hasattr(cfg, 'netns'): + return + + cfg.require_ipver('6') + ethnl = EthtoolFamily() + + channels = ethnl.channels_get({'header': {'dev-index': cfg.ifindex}}) + channels = channels['combined-count'] + if channels < 2: + raise KsftSkipEx( + 'Test requires NETIF with at least 2 combined channels' + ) + + rings = ethnl.rings_get({'header': {'dev-index': cfg.ifindex}}) + orig_rx_rings = rings['rx'] + orig_hds_thresh = rings.get('hds-thresh', 0) + orig_data_split = rings.get('tcp-data-split', 'unknown') + + ethnl.rings_set({'header': {'dev-index': cfg.ifindex}, + 'tcp-data-split': 'enabled', + 'hds-thresh': 0, + 'rx': min(64, orig_rx_rings)}) + defer(ethnl.rings_set, {'header': {'dev-index': cfg.ifindex}, + 
'tcp-data-split': orig_data_split, + 'hds-thresh': orig_hds_thresh, + 'rx': orig_rx_rings}) + + cfg.src_queue = channels - 1 + ethtool(f"-X {cfg.ifname} equal {cfg.src_queue}") + defer(ethtool, f"-X {cfg.ifname} default") + + if not hasattr(cfg, 'nk_queue'): + with NetNSEnter(str(cfg.netns)): + netdevnl = NetdevFamily() + lease_result = netdevnl.queue_create({ + "ifindex": cfg.nk_guest_ifindex, + "type": "rx", + "lease": { + "ifindex": cfg.ifindex, + "queue": {"id": cfg.src_queue, "type": "rx"}, + "netns-id": 0, + }, + }) + cfg.nk_queue = lease_result['id'] + + +def set_flow_rule(cfg, port): + """Install a flow rule steering to src_queue and return the flow rule ID.""" + output = ethtool( + f"-N {cfg.ifname} flow-type tcp6 dst-port {port}" + f" action {cfg.src_queue}" + ).stdout + return int(re.search(r'ID (\d+)', output).group(1)) + + +def ncdevmem_rx(cfg, port, verify=True, fail_on_linear=False, flow_steer=False): + """Build the ncdevmem RX listener command.""" + if hasattr(cfg, 'netns'): + flow_rule_id = set_flow_rule(cfg, port) + defer(ethtool, f"-N {cfg.ifname} delete {flow_rule_id}") + + ifname = cfg.nk_guest_ifname + addr = cfg.nk_guest_ipv6 + extras = [f"-t {cfg.nk_queue}", "-q 1", "-n"] + else: + ifname = cfg.ifname + addr = cfg.addr + extras = [] + if flow_steer: + extras.append(f"-c {cfg.remote_addr}") + + if verify: + extras.append("-v 7") + if fail_on_linear: + extras.append("-L") + + parts = [cfg.bin_local, "-l", f"-f {ifname}", f"-s {addr}", + f"-p {port}", *extras] + return " ".join(parts) + + +def ncdevmem_tx(cfg, port, chunk_size=0): + """Build the ncdevmem TX send command.""" + if hasattr(cfg, 'netns'): + ifname = cfg.nk_guest_ifname + addr = cfg.remote_addr_v['6'] + extras = ["-t 0", "-q 1", "-n"] + else: + ifname = cfg.ifname + addr = cfg.remote_addr + extras = [] + + if chunk_size: + extras.append(f"-z {chunk_size}") + + parts = [cfg.bin_local, f"-f {ifname}", f"-s {addr}", + f"-p {port}", *extras] + return " ".join(parts) + + +def 
socat_send(cfg, port, buf_size=0): + """Socat command for sending to the devmem listener. + + When buf_size > 0, force one TCP segment per write of exactly that size by + setting socat's buffer (-b) and disabling Nagle (TCP_NODELAY). + """ + proto = f"TCP{cfg.addr_ipver}" + + if hasattr(cfg, 'netns'): + addr = f"[{cfg.nk_guest_ipv6}]" + else: + addr = cfg.baddr + + suffix = f",bind={cfg.remote_baddr}:{port}" + + buf = "" + if buf_size: + buf = f"-b {buf_size}" + suffix += ",nodelay" + + return f"socat {buf} -u - {proto}:{addr}:{port}{suffix}" + + +def socat_listen(cfg, port): + """Socat listen command for TX tests.""" + return f"socat -U - TCP{cfg.addr_ipver}-LISTEN:{port}" + + +def setup_test(cfg, bin_local): + """Stash the local ncdevmem path on cfg and deploy it to the remote.""" + cfg.bin_local = bin_local + cfg.bin_remote = cfg.remote.deploy(cfg.bin_local) + + +def run_rx(cfg): + """Run the devmem RX test.""" + require_devmem(cfg) + configure_nic(cfg) + port = rand_port() + socat = socat_send(cfg, port) + data_pipe = (f"yes $(echo -e \x01\x02\x03\x04\x05\x06) | head -c 1K" + f" | {socat}") + netns = getattr(cfg, "netns", None) + + listen_cmd = ncdevmem_rx(cfg, port, flow_steer=not hasattr(cfg, 'netns')) + with bkg(listen_cmd, exit_wait=True, ns=netns) as ncdevmem: + wait_port_listen(port, proto="tcp", ns=netns) + cmd(data_pipe, host=cfg.remote, shell=True) + ksft_eq(ncdevmem.ret, 0) + + +def run_tx(cfg): + """Run the devmem TX test.""" + require_devmem(cfg) + configure_nic(cfg) + netns = getattr(cfg, "netns", None) + port = rand_port() + tx_cmd = ncdevmem_tx(cfg, port) + listen_cmd = socat_listen(cfg, port) + + with bkg(listen_cmd, host=cfg.remote, exit_wait=True) as socat: + wait_port_listen(port, host=cfg.remote) + cmd(f"bash -c 'echo -e \"hello\\nworld\" | {tx_cmd}'", ns=netns, shell=True) + ksft_eq(socat.stdout.strip(), "hello\nworld") + + +def run_tx_chunks(cfg): + """Run the devmem TX chunking test.""" + require_devmem(cfg) + configure_nic(cfg) + netns = 
getattr(cfg, "netns", None) + port = rand_port() + tx_cmd = ncdevmem_tx(cfg, port, chunk_size=3) + listen_cmd = socat_listen(cfg, port) + + with bkg(listen_cmd, host=cfg.remote, exit_wait=True) as socat: + wait_port_listen(port, host=cfg.remote) + cmd(f"bash -c 'echo -e \"hello\\nworld\" | {tx_cmd}'", ns=netns, shell=True) + ksft_eq(socat.stdout.strip(), "hello\nworld") + + +def run_rx_hds(cfg): + """Run the HDS test by running devmem RX across a segment size sweep.""" + require_devmem(cfg) + configure_nic(cfg) + netns = getattr(cfg, "netns", None) + + for size in [1, 8, 16, 32, 64, 128, 256, 512, 1024, 2048, 4096, 8192]: + port = rand_port() + + listen_cmd = ncdevmem_rx(cfg, port, verify=False, + fail_on_linear=True) + socat = socat_send(cfg, port, buf_size=size) + + with bkg(listen_cmd, exit_wait=True, ns=netns) as ncdevmem: + wait_port_listen(port, proto="tcp", ns=netns) + cmd(f"dd if=/dev/zero bs={size} count=1 2>/dev/null | " + f"{socat}", host=cfg.remote, shell=True) + ksft_eq(ncdevmem.ret, 0, f"HDS failed for payload size {size}") diff --git a/tools/testing/selftests/drivers/net/hw/gro_hw.py b/tools/testing/selftests/drivers/net/hw/gro_hw.py index 10e08b22ee0e..70e76e3888bd 100755 --- a/tools/testing/selftests/drivers/net/hw/gro_hw.py +++ b/tools/testing/selftests/drivers/net/hw/gro_hw.py @@ -51,11 +51,21 @@ def _resolve_dmac(cfg, ipver): return getattr(cfg, attr) +def _require_ntuple(cfg): + features = ethtool(f"-k {cfg.ifname}", json=True)[0] + if not features["ntuple-filters"]["active"]: + if features["ntuple-filters"]["fixed"]: + raise KsftSkipEx("Device does not support ntuple-filters") + ethtool(f"-K {cfg.ifname} ntuple-filters on") + defer(ethtool, f"-K {cfg.ifname} ntuple-filters off") + + def _setup_isolated_queue(cfg): """Set up an isolated queue for testing using ntuple filter. Remove queue 1 from the default RSS context and steer test traffic to it. 
""" + _require_ntuple(cfg) test_queue = 1 qcnt = len(glob.glob(f"/sys/class/net/{cfg.ifname}/queues/rx-*")) diff --git a/tools/testing/selftests/drivers/net/hw/iou-zcrx.c b/tools/testing/selftests/drivers/net/hw/iou-zcrx.c index 240d13dbc54e..f6a8fc5fac24 100644 --- a/tools/testing/selftests/drivers/net/hw/iou-zcrx.c +++ b/tools/testing/selftests/drivers/net/hw/iou-zcrx.c @@ -351,9 +351,6 @@ static void run_server(void) if (ret < 0) error(1, 0, "bind()"); - if (listen(fd, 1024) < 0) - error(1, 0, "listen()"); - flags |= IORING_SETUP_COOP_TASKRUN; flags |= IORING_SETUP_SINGLE_ISSUER; flags |= IORING_SETUP_DEFER_TASKRUN; @@ -366,6 +363,9 @@ static void run_server(void) if (cfg_dry_run) return; + if (listen(fd, 1024) < 0) + error(1, 0, "listen()"); + add_accept(&ring, fd); tstop = gettimeofday_ms() + 5000; diff --git a/tools/testing/selftests/drivers/net/hw/iou-zcrx.py b/tools/testing/selftests/drivers/net/hw/iou-zcrx.py index e81724cb5542..b7a225fe4bea 100755 --- a/tools/testing/selftests/drivers/net/hw/iou-zcrx.py +++ b/tools/testing/selftests/drivers/net/hw/iou-zcrx.py @@ -2,6 +2,7 @@ # SPDX-License-Identifier: GPL-2.0 import re +import resource import time from os import path from lib.py import ksft_run, ksft_exit, KsftSkipEx, ksft_variants, KsftNamedVariant @@ -41,6 +42,23 @@ def set_flow_rule_rss(cfg, rss_ctx_id): return int(values) +def check_iou_rx_buf_len(cfg, expected_rx_buf_len): + """Check the io-uring memory provider exposes the expected rx_buf_len.""" + q = cfg.netnl.queue_get({'ifindex': cfg.ifindex, 'type': 'rx', 'id': cfg.target}) + napi_id = q['napi-id'] + pools = cfg.netnl.page_pool_get({}, dump=True) + pools = [p for p in pools if p.get('napi-id') == napi_id + and 'io-uring' in p] + if len(pools) != 1: + raise Exception(f"Expected 1 io-uring page pool, found {len(pools)}") + rx_buf_len = pools[0]['io-uring'].get('rx-buf-len') + if rx_buf_len is None: + raise KsftSkipEx("io-uring 'rx-buf-len' attribute not supported") + if rx_buf_len != 
expected_rx_buf_len: + raise Exception(f'Expected io-uring rx-buf-len {expected_rx_buf_len}, ' + f'got {rx_buf_len}') + + def single(cfg): channels = cfg.ethnl.channels_get({'header': {'dev-index': cfg.ifindex}}) channels = channels['combined-count'] @@ -100,12 +118,22 @@ def rss(cfg): defer(ethtool, f"-N {cfg.ifname} delete {flow_rule_id}") +def _require_ntuple(cfg): + features = ethtool(f"-k {cfg.ifname}", json=True)[0] + if not features["ntuple-filters"]["active"]: + if features["ntuple-filters"]["fixed"]: + raise KsftSkipEx("Device does not support ntuple-filters") + ethtool(f"-K {cfg.ifname} ntuple-filters on") + defer(ethtool, f"-K {cfg.ifname} ntuple-filters off") + + @ksft_variants([ KsftNamedVariant("single", single), KsftNamedVariant("rss", rss), ]) def test_zcrx(cfg, setup) -> None: cfg.require_ipver('6') + _require_ntuple(cfg) setup(cfg) rx_cmd = f"{cfg.bin_local} -s -p {cfg.port} -i {cfg.ifname} -q {cfg.target}" @@ -121,6 +149,7 @@ def test_zcrx(cfg, setup) -> None: ]) def test_zcrx_oneshot(cfg, setup) -> None: cfg.require_ipver('6') + _require_ntuple(cfg) setup(cfg) rx_cmd = f"{cfg.bin_local} -s -p {cfg.port} -i {cfg.ifname} -q {cfg.target} -o 4" @@ -134,6 +163,7 @@ def test_zcrx_large_chunks(cfg) -> None: """Test zcrx with large buffer chunks.""" cfg.require_ipver('6') + _require_ntuple(cfg) hp_file = "/proc/sys/vm/nr_hugepages" with open(hp_file, 'r+', encoding='utf-8') as f: @@ -144,7 +174,10 @@ def test_zcrx_large_chunks(cfg) -> None: defer(lambda: open(hp_file, 'w', encoding='utf-8').write(str(nr_hugepages))) single(cfg) - rx_cmd = f"{cfg.bin_local} -s -p {cfg.port} -i {cfg.ifname} -q {cfg.target} -x 2" + page_size = resource.getpagesize() + nr_pages = 2 + rx_buf_len = nr_pages * page_size + rx_cmd = f"{cfg.bin_local} -s -p {cfg.port} -i {cfg.ifname} -q {cfg.target} -x {nr_pages}" tx_cmd = f"{cfg.bin_remote} -c -h {cfg.addr_v['6']} -p {cfg.port} -l 12840" probe = cmd(rx_cmd + " -d", fail=False) @@ -154,6 +187,9 @@ def test_zcrx_large_chunks(cfg) 
-> None: mp_clear_wait(cfg) with bkg(rx_cmd, exit_wait=True): wait_port_listen(cfg.port, proto="tcp") + + check_iou_rx_buf_len(cfg, rx_buf_len) + cmd(tx_cmd, host=cfg.remote) diff --git a/tools/testing/selftests/drivers/net/hw/ipsec_vxlan.py b/tools/testing/selftests/drivers/net/hw/ipsec_vxlan.py new file mode 100755 index 000000000000..0740a4d85240 --- /dev/null +++ b/tools/testing/selftests/drivers/net/hw/ipsec_vxlan.py @@ -0,0 +1,204 @@ +#!/usr/bin/env python3 +# SPDX-License-Identifier: GPL-2.0 +"""Traffic test for VXLAN + IPsec crypto-offload.""" + +import os + +from lib.py import ksft_run, ksft_exit, ksft_eq, ksft_ge +from lib.py import ksft_variants, KsftNamedVariant, KsftSkipEx +from lib.py import CmdExitFailure, NetDrvEpEnv, cmd, defer, ethtool, ip +from lib.py import Iperf3Runner + +# Inner tunnel addresses - TEST-NET-2 (RFC 5737) / doc prefix (RFC 3849) +INNER_V4_LOCAL = "198.51.100.1" +INNER_V4_REMOTE = "198.51.100.2" +INNER_V6_LOCAL = "2001:db8:100::1" +INNER_V6_REMOTE = "2001:db8:100::2" + +# ESP parameters +SPI_OUT = "0x1000" +SPI_IN = "0x1001" +# 128-bit key + 32-bit salt = 20 bytes hex, 128-bit ICV +ESP_AEAD = "aead 'rfc4106(gcm(aes))' 0x" + "01" * 20 + " 128" + + +def xfrm(args, host=None): + """Runs 'ip xfrm' via shell to preserve parentheses in algo names.""" + cmd(f"ip xfrm {args}", shell=True, host=host) + + +def check_xfrm_offload_support(): + """Skips if iproute2 lacks xfrm offload support.""" + out = cmd("ip xfrm state help", fail=False) + if "offload" not in out.stdout + out.stderr: + raise KsftSkipEx("iproute2 too old, missing xfrm offload") + + +def check_esp_hw_offload(cfg): + """Skips if device lacks esp-hw-offload support.""" + check_xfrm_offload_support() + try: + feat = ethtool(f"-k {cfg.ifname}", json=True)[0] + except (CmdExitFailure, IndexError) as e: + raise KsftSkipEx(f"can't query features: {e}") from e + if not feat.get("esp-hw-offload", {}).get("active"): + raise KsftSkipEx("Device does not support esp-hw-offload") + + +def 
get_tx_drops(cfg): + """Returns TX dropped counter from the physical device.""" + stats = ip("-s -s link show dev " + cfg.ifname, json=True)[0] + return stats["stats64"]["tx"]["dropped"] + + +def setup_vxlan_ipsec(cfg, outer_ipver, inner_ipver): + """Sets up VXLAN tunnel with IPsec transport-mode crypto-offload.""" + vxlan_name = f"vx{os.getpid()}" + local_addr = cfg.addr_v[outer_ipver] + remote_addr = cfg.remote_addr_v[outer_ipver] + + if inner_ipver == "4": + inner_local = f"{INNER_V4_LOCAL}/24" + inner_remote = f"{INNER_V4_REMOTE}/24" + addr_extra = "" + else: + inner_local = f"{INNER_V6_LOCAL}/64" + inner_remote = f"{INNER_V6_REMOTE}/64" + addr_extra = " nodad" + + if outer_ipver == "6": + vxlan_opts = "udp6zerocsumtx udp6zerocsumrx" + else: + vxlan_opts = "noudpcsum" + + # VXLAN tunnel - local side + ip(f"link add {vxlan_name} type vxlan id 100 dstport 4789 {vxlan_opts} " + f"local {local_addr} remote {remote_addr} dev {cfg.ifname}") + defer(ip, f"link del {vxlan_name}") + ip(f"addr add {inner_local} dev {vxlan_name}{addr_extra}") + ip(f"link set {vxlan_name} up") + + # VXLAN tunnel - remote side + ip(f"link add {vxlan_name} type vxlan id 100 dstport 4789 {vxlan_opts} " + f"local {remote_addr} remote {local_addr} dev {cfg.remote_ifname}", + host=cfg.remote) + defer(ip, f"link del {vxlan_name}", host=cfg.remote) + ip(f"addr add {inner_remote} dev {vxlan_name}{addr_extra}", + host=cfg.remote) + ip(f"link set {vxlan_name} up", host=cfg.remote) + + # xfrm state - local outbound SA + xfrm(f"state add src {local_addr} dst {remote_addr} " + f"proto esp spi {SPI_OUT} " + f"{ESP_AEAD} " + f"mode transport offload crypto dev {cfg.ifname} dir out") + defer(xfrm, f"state del src {local_addr} dst {remote_addr} " + f"proto esp spi {SPI_OUT}") + + # xfrm state - local inbound SA + xfrm(f"state add src {remote_addr} dst {local_addr} " + f"proto esp spi {SPI_IN} " + f"{ESP_AEAD} " + f"mode transport offload crypto dev {cfg.ifname} dir in") + defer(xfrm, f"state del src 
{remote_addr} dst {local_addr} " + f"proto esp spi {SPI_IN}") + + # xfrm state - remote outbound SA (mirror, software crypto) + xfrm(f"state add src {remote_addr} dst {local_addr} " + f"proto esp spi {SPI_IN} " + f"{ESP_AEAD} " + f"mode transport", + host=cfg.remote) + defer(xfrm, f"state del src {remote_addr} dst {local_addr} " + f"proto esp spi {SPI_IN}", host=cfg.remote) + + # xfrm state - remote inbound SA (mirror, software crypto) + xfrm(f"state add src {local_addr} dst {remote_addr} " + f"proto esp spi {SPI_OUT} " + f"{ESP_AEAD} " + f"mode transport", + host=cfg.remote) + defer(xfrm, f"state del src {local_addr} dst {remote_addr} " + f"proto esp spi {SPI_OUT}", host=cfg.remote) + + # xfrm policy - local out + xfrm(f"policy add src {local_addr} dst {remote_addr} " + f"proto udp dport 4789 dir out " + f"tmpl src {local_addr} dst {remote_addr} proto esp mode transport") + defer(xfrm, f"policy del src {local_addr} dst {remote_addr} " + f"proto udp dport 4789 dir out") + + # xfrm policy - local in + xfrm(f"policy add src {remote_addr} dst {local_addr} " + f"proto udp dport 4789 dir in " + f"tmpl src {remote_addr} dst {local_addr} proto esp mode transport") + defer(xfrm, f"policy del src {remote_addr} dst {local_addr} " + f"proto udp dport 4789 dir in") + + # xfrm policy - remote out + xfrm(f"policy add src {remote_addr} dst {local_addr} " + f"proto udp dport 4789 dir out " + f"tmpl src {remote_addr} dst {local_addr} proto esp mode transport", + host=cfg.remote) + defer(xfrm, f"policy del src {remote_addr} dst {local_addr} " + f"proto udp dport 4789 dir out", host=cfg.remote) + + # xfrm policy - remote in + xfrm(f"policy add src {local_addr} dst {remote_addr} " + f"proto udp dport 4789 dir in " + f"tmpl src {local_addr} dst {remote_addr} proto esp mode transport", + host=cfg.remote) + defer(xfrm, f"policy del src {local_addr} dst {remote_addr} " + f"proto udp dport 4789 dir in", host=cfg.remote) + + +def _vxlan_ipsec_variants(): + """Generates outer/inner IP 
version variants.""" + for outer in ["4", "6"]: + for inner in ["4", "6"]: + yield KsftNamedVariant(f"outer_v{outer}_inner_v{inner}", outer, inner) + + +@ksft_variants(_vxlan_ipsec_variants()) +def test_vxlan_ipsec_crypto_offload(cfg, outer_ipver, inner_ipver): + """Tests VXLAN+IPsec crypto-offload has no TX drops.""" + cfg.require_ipver(outer_ipver) + check_esp_hw_offload(cfg) + + setup_vxlan_ipsec(cfg, outer_ipver, inner_ipver) + + if inner_ipver == "4": + inner_local = INNER_V4_LOCAL + inner_remote = INNER_V4_REMOTE + ping = "ping" + else: + inner_local = INNER_V6_LOCAL + inner_remote = INNER_V6_REMOTE + ping = "ping -6" + + cmd(f"{ping} -c 1 -W 2 {inner_remote}") + + drops_before = get_tx_drops(cfg) + + runner = Iperf3Runner(cfg, server_ip=inner_local, + client_ip=inner_remote) + bw_gbps = runner.measure_bandwidth(reverse=True) + + cfg.wait_hw_stats_settle() + drops_after = get_tx_drops(cfg) + + ksft_eq(drops_after - drops_before, 0, + comment="TX drops during VXLAN+IPsec") + ksft_ge(bw_gbps, 0.1, + comment="Minimum 100Mbps over VXLAN+IPsec") + + +def main(): + """Runs VXLAN+IPsec crypto-offload GSO selftest.""" + with NetDrvEpEnv(__file__, nsim_test=False) as cfg: + ksft_run([test_vxlan_ipsec_crypto_offload], args=(cfg,)) + ksft_exit() + + +if __name__ == "__main__": + main() diff --git a/tools/testing/selftests/drivers/net/hw/lib/py/__init__.py b/tools/testing/selftests/drivers/net/hw/lib/py/__init__.py index 84a4dab6c649..8a58cb17cc06 100644 --- a/tools/testing/selftests/drivers/net/hw/lib/py/__init__.py +++ b/tools/testing/selftests/drivers/net/hw/lib/py/__init__.py @@ -18,7 +18,7 @@ try: sys.path.append(KSFT_DIR.as_posix()) # Import one by one to avoid pylint false positives - from net.lib.py import NetNS, NetNSEnter, NetdevSimDev + from net.lib.py import NetNS, NetNSEnter, NetdevSimDev, UserNetNS from net.lib.py import EthtoolFamily, NetdevFamily, NetshaperFamily, \ NlError, RtnlFamily, DevlinkFamily, PSPFamily, Netlink from net.lib.py import 
CmdExitFailure @@ -34,7 +34,7 @@ try: from drivers.net.lib.py import GenerateTraffic, Remote, Iperf3Runner from drivers.net.lib.py import NetDrvEnv, NetDrvEpEnv, NetDrvContEnv - __all__ = ["NetNS", "NetNSEnter", "NetdevSimDev", + __all__ = ["NetNS", "NetNSEnter", "NetdevSimDev", "UserNetNS", "EthtoolFamily", "NetdevFamily", "NetshaperFamily", "NlError", "RtnlFamily", "DevlinkFamily", "PSPFamily", "Netlink", "CmdExitFailure", diff --git a/tools/testing/selftests/drivers/net/hw/ncdevmem.c b/tools/testing/selftests/drivers/net/hw/ncdevmem.c index e098d6534c3c..d96e8a3b5a65 100644 --- a/tools/testing/selftests/drivers/net/hw/ncdevmem.c +++ b/tools/testing/selftests/drivers/net/hw/ncdevmem.c @@ -93,6 +93,7 @@ static char *port; static size_t do_validation; static int start_queue = -1; static int num_queues = -1; +static int skip_config; static char *ifname; static unsigned int ifindex; static unsigned int dmabuf_id; @@ -828,7 +829,7 @@ static struct netdev_queue_id *create_queues(void) static int do_server(struct memory_buffer *mem) { - struct ethtool_rings_get_rsp *ring_config; + struct ethtool_rings_get_rsp *ring_config = NULL; char ctrl_data[sizeof(int) * 20000]; size_t non_page_aligned_frags = 0; struct sockaddr_in6 client_addr; @@ -851,27 +852,29 @@ static int do_server(struct memory_buffer *mem) return -1; } - ring_config = get_ring_config(); - if (!ring_config) { - pr_err("Failed to get current ring configuration"); - return -1; - } + if (!skip_config) { + ring_config = get_ring_config(); + if (!ring_config) { + pr_err("Failed to get current ring configuration"); + return -1; + } - if (configure_headersplit(ring_config, 1)) { - pr_err("Failed to enable TCP header split"); - goto err_free_ring_config; - } + if (configure_headersplit(ring_config, 1)) { + pr_err("Failed to enable TCP header split"); + goto err_free_ring_config; + } - /* Configure RSS to divert all traffic from our devmem queues */ - if (configure_rss()) { - pr_err("Failed to configure rss"); - goto 
err_reset_headersplit; - } + /* Configure RSS to divert all traffic from our devmem queues */ + if (configure_rss()) { + pr_err("Failed to configure rss"); + goto err_reset_headersplit; + } - /* Flow steer our devmem flows to start_queue */ - if (configure_flow_steering(&server_sin)) { - pr_err("Failed to configure flow steering"); - goto err_reset_rss; + /* Flow steer our devmem flows to start_queue */ + if (configure_flow_steering(&server_sin)) { + pr_err("Failed to configure flow steering"); + goto err_reset_rss; + } } if (bind_rx_queue(ifindex, mem->fd, create_queues(), num_queues, &ys)) { @@ -1052,13 +1055,17 @@ err_free_tmp: err_unbind: ynl_sock_destroy(ys); err_reset_flow_steering: - reset_flow_steering(); + if (!skip_config) + reset_flow_steering(); err_reset_rss: - reset_rss(); + if (!skip_config) + reset_rss(); err_reset_headersplit: - restore_ring_config(ring_config); + if (!skip_config) + restore_ring_config(ring_config); err_free_ring_config: - ethtool_rings_get_rsp_free(ring_config); + if (!skip_config) + ethtool_rings_get_rsp_free(ring_config); return err; } @@ -1404,7 +1411,7 @@ int main(int argc, char *argv[]) int is_server = 0, opt; int ret, err = 1; - while ((opt = getopt(argc, argv, "Lls:c:p:v:q:t:f:z:")) != -1) { + while ((opt = getopt(argc, argv, "Lls:c:p:v:q:t:f:z:n")) != -1) { switch (opt) { case 'L': fail_on_linear = true; @@ -1436,6 +1443,9 @@ int main(int argc, char *argv[]) case 'z': max_chunk = atoi(optarg); break; + case 'n': + skip_config = 1; + break; case '?': fprintf(stderr, "unknown option: %c\n", optopt); break; diff --git a/tools/testing/selftests/drivers/net/hw/nk_devmem.py b/tools/testing/selftests/drivers/net/hw/nk_devmem.py new file mode 100755 index 000000000000..300ed2a70ab4 --- /dev/null +++ b/tools/testing/selftests/drivers/net/hw/nk_devmem.py @@ -0,0 +1,46 @@ +#!/usr/bin/env python3 +# SPDX-License-Identifier: GPL-2.0 +"""Test devmem TCP with netkit.""" + +import os +from devmem_lib import setup_test, run_rx, run_tx, 
run_tx_chunks, run_rx_hds +from lib.py import ksft_run, ksft_exit, ksft_disruptive +from lib.py import NetDrvContEnv + + +@ksft_disruptive +def check_nk_rx(cfg) -> None: + """Run the devmem RX test through netkit.""" + run_rx(cfg) + + +@ksft_disruptive +def check_nk_tx(cfg) -> None: + """Run the devmem TX test through netkit.""" + run_tx(cfg) + + +@ksft_disruptive +def check_nk_tx_chunks(cfg) -> None: + """Run the devmem TX chunking test through netkit.""" + run_tx_chunks(cfg) + + +def check_nk_rx_hds(cfg) -> None: + """Run the HDS test through netkit.""" + run_rx_hds(cfg) + + +def main() -> None: + """Run the netkit devmem test cases.""" + with NetDrvContEnv(__file__, rxqueues=2, primary_rx_redirect=True) as cfg: + setup_test(cfg, + os.path.join(os.path.dirname(os.path.abspath(__file__)), + "ncdevmem")) + ksft_run([check_nk_rx, check_nk_tx, check_nk_tx_chunks, + check_nk_rx_hds], args=(cfg,)) + ksft_exit() + + +if __name__ == "__main__": + main() diff --git a/tools/testing/selftests/drivers/net/hw/nk_primary_rx_redirect.bpf.c b/tools/testing/selftests/drivers/net/hw/nk_primary_rx_redirect.bpf.c new file mode 100644 index 000000000000..46ff494b23de --- /dev/null +++ b/tools/testing/selftests/drivers/net/hw/nk_primary_rx_redirect.bpf.c @@ -0,0 +1,39 @@ +// SPDX-License-Identifier: GPL-2.0 +#include <linux/bpf.h> +#include <linux/pkt_cls.h> +#include <linux/if_ether.h> +#include <linux/in.h> +#include <linux/ipv6.h> +#include <bpf/bpf_helpers.h> +#include <bpf/bpf_endian.h> + +#define ctx_ptr(field) ((void *)(long)(field)) + +volatile __u32 phys_ifindex; + +SEC("tc/ingress") +int nk_primary_rx_redirect(struct __sk_buff *skb) +{ + void *data_end = ctx_ptr(skb->data_end); + void *data = ctx_ptr(skb->data); + struct ethhdr *eth; + struct ipv6hdr *ip6h; + + eth = data; + if ((void *)(eth + 1) > data_end) + return TC_ACT_OK; + + if (eth->h_proto != bpf_htons(ETH_P_IPV6)) + return TC_ACT_OK; + + ip6h = data + sizeof(struct ethhdr); + if ((void *)(ip6h + 1) > data_end) + 
return TC_ACT_OK; + + if (ip6h->nexthdr == IPPROTO_ICMPV6) + return TC_ACT_OK; + + return bpf_redirect_neigh(phys_ifindex, NULL, 0, 0); +} + +char __license[] SEC("license") = "GPL"; diff --git a/tools/testing/selftests/drivers/net/hw/nk_qlease.py b/tools/testing/selftests/drivers/net/hw/nk_qlease.py index aa83dc321328..4f53034c9a50 100755 --- a/tools/testing/selftests/drivers/net/hw/nk_qlease.py +++ b/tools/testing/selftests/drivers/net/hw/nk_qlease.py @@ -18,8 +18,10 @@ from lib.py import ( NetNSEnter, EthtoolFamily, NetdevFamily, + RtnlFamily, ) from lib.py import ( + Netlink, bkg, cmd, defer, @@ -30,10 +32,138 @@ from lib.py import ( ) from lib.py import KsftSkipEx, CmdExitFailure +# iou-zcrx exits with 42 from setup_zcrx() when the NIC does not advertise +# QCFG_RX_PAGE_SIZE (or otherwise rejects the requested rx_buf_len). +SKIP_CODE = 42 -def set_flow_rule(cfg): + +def _restore_hugepages(count): + with open("/proc/sys/vm/nr_hugepages", "w", encoding="utf-8") as f: + f.write(str(count)) + + +def _mp_clear_wait(cfg, src_queue): + """Wait for the io_uring memory provider to clear from the leased + physical queue; io_uring tears it down asynchronously after the + process holding the ifq exits.""" + netdevnl = NetdevFamily() + deadline = time.time() + 5 + while time.time() < deadline: + queue_info = netdevnl.queue_get( + {"ifindex": cfg.ifindex, "id": src_queue, "type": "rx"} + ) + if "io-uring" not in queue_info: + return + time.sleep(0.1) + raise TimeoutError("Timed out waiting for memory provider to clear") + + +def _create_netkit_pair(cfg, rxqueues=2): + if cfg.nk_host_ifname: + cmd(f"ip link del dev {cfg.nk_host_ifname}", fail=False) + cfg.nk_host_ifname = None + cfg.nk_guest_ifname = None + cfg.detach_bpf() + + all_links = ip("-d link show", json=True) + old_idxs = { + link["ifindex"] + for link in all_links + if link.get("linkinfo", {}).get("info_kind") == "netkit" + } + + rtnl = RtnlFamily() + rtnl.newlink( + { + "linkinfo": { + "kind": "netkit", + "data": 
{ + "mode": "l2", + "policy": "forward", + "peer-policy": "forward", + }, + }, + "num-rx-queues": rxqueues, + }, + flags=[Netlink.NLM_F_CREATE, Netlink.NLM_F_EXCL], + ) + + all_links = ip("-d link show", json=True) + nk_links = [ + link + for link in all_links + if link.get("linkinfo", {}).get("info_kind") == "netkit" + and link["ifindex"] not in old_idxs + ] + if len(nk_links) != 2: + raise KsftSkipEx("Failed to create netkit pair") + + nk_links.sort(key=lambda x: x["ifindex"]) + cfg.nk_host_ifname = nk_links[1]["ifname"] + cfg.nk_guest_ifname = nk_links[0]["ifname"] + cfg.nk_host_ifindex = nk_links[1]["ifindex"] + cfg.nk_guest_ifindex = nk_links[0]["ifindex"] + + ip(f"link set dev {cfg.nk_guest_ifname} netns {cfg.netns.name}") + ip(f"link set dev {cfg.nk_host_ifname} up") + ip(f"-6 addr add fe80::1/64 dev {cfg.nk_host_ifname} nodad") + ip( + f"-6 route add {cfg.nk_guest_ipv6}/128 via fe80::2 " + f"dev {cfg.nk_host_ifname}" + ) + ip(f"link set dev {cfg.nk_guest_ifname} up", ns=cfg.netns) + ip(f"-6 addr add fe80::2/64 dev {cfg.nk_guest_ifname}", ns=cfg.netns) + ip( + f"-6 addr add {cfg.nk_guest_ipv6}/64 dev {cfg.nk_guest_ifname} nodad", + ns=cfg.netns, + ) + ip( + f"-6 route add default via fe80::1 dev {cfg.nk_guest_ifname}", + ns=cfg.netns, + ) + + cfg.attach_bpf() + + +def _setup_lease(cfg, rxqueues=2): + _create_netkit_pair(cfg, rxqueues=rxqueues) + + ethnl = EthtoolFamily() + channels = ethnl.channels_get({"header": {"dev-index": cfg.ifindex}})[ + "combined-count" + ] + if channels < 2: + raise KsftSkipEx( + "Test requires NETIF with at least 2 combined channels" + ) + src_queue = channels - 1 + + with NetNSEnter(str(cfg.netns)): + netdevnl = NetdevFamily() + bind_result = netdevnl.queue_create( + { + "ifindex": cfg.nk_guest_ifindex, + "type": "rx", + "lease": { + "ifindex": cfg.ifindex, + "queue": {"id": src_queue, "type": "rx"}, + "netns-id": 0, + }, + } + ) + return src_queue, bind_result["id"] + + +def _teardown_netkit(cfg): + if cfg.nk_host_ifname: + 
cmd(f"ip link del dev {cfg.nk_host_ifname}", fail=False) + cfg.nk_host_ifname = None + cfg.nk_guest_ifname = None + + +def set_flow_rule(cfg, src_queue): output = ethtool( - f"-N {cfg.ifname} flow-type tcp6 dst-port {cfg.port} action {cfg.src_queue}" + f"-N {cfg.ifname} flow-type tcp6 dst-port {cfg.port} action {src_queue}" ).stdout values = re.search(r"ID (\d+)", output).group(1) return int(values) @@ -41,6 +171,8 @@ def set_flow_rule(cfg): def test_iou_zcrx(cfg) -> None: cfg.require_ipver("6") + src_queue, nk_queue = _setup_lease(cfg) + defer(_teardown_netkit, cfg) ethnl = EthtoolFamily() rings = ethnl.rings_get({"header": {"dev-index": cfg.ifindex}}) @@ -65,40 +197,121 @@ def test_iou_zcrx(cfg) -> None: }, ) - ethtool(f"-X {cfg.ifname} equal {cfg.src_queue}") + ethtool(f"-X {cfg.ifname} equal {src_queue}") defer(ethtool, f"-X {cfg.ifname} default") - flow_rule_id = set_flow_rule(cfg) + flow_rule_id = set_flow_rule(cfg, src_queue) defer(ethtool, f"-N {cfg.ifname} delete {flow_rule_id}") - rx_cmd = f"ip netns exec {cfg.netns.name} {cfg.bin_local} -s -p {cfg.port} -i {cfg._nk_guest_ifname} -q {cfg.nk_queue}" + rx_cmd = ( + f"{cfg.bin_local} -s -p {cfg.port} " + f"-i {cfg.nk_guest_ifname} -q {nk_queue}" + ) tx_cmd = f"{cfg.bin_remote} -c -h {cfg.nk_guest_ipv6} -p {cfg.port} -l 12840" - with bkg(rx_cmd, exit_wait=True): + with bkg(rx_cmd, exit_wait=True, ns=cfg.netns): + wait_port_listen(cfg.port, proto="tcp", ns=cfg.netns) + cmd(tx_cmd, host=cfg.remote) + + +def test_iou_zcrx_large_buf(cfg) -> None: + """iou-zcrx with rx_buf_len > page size, going through a netkit-leased + queue. Exercises the queue rx-buf-len path via netif_mp_open_rxq()'s + lease redirect: the netkit ifindex is opaque to io_uring, but + rx_page_size is honoured by the *physical* qops because the lease + pointer rewrites the request from netkit onto the leased physical + rxq before supported_params/validate_qcfg are consulted. 
+ """ + cfg.require_ipver("6") + src_queue, nk_queue = _setup_lease(cfg) + defer(_teardown_netkit, cfg) + ethnl = EthtoolFamily() + + with open("/proc/sys/vm/nr_hugepages", "r+", encoding="utf-8") as f: + nr_hugepages = int(f.read().strip()) + if nr_hugepages < 64: + f.seek(0) + f.write("64") + defer(_restore_hugepages, nr_hugepages) + + rings = ethnl.rings_get({"header": {"dev-index": cfg.ifindex}}) + rx_rings = rings["rx"] + hds_thresh = rings.get("hds-thresh", 0) + + ethnl.rings_set( + { + "header": {"dev-index": cfg.ifindex}, + "tcp-data-split": "enabled", + "hds-thresh": 0, + "rx": 64, + } + ) + defer( + ethnl.rings_set, + { + "header": {"dev-index": cfg.ifindex}, + "tcp-data-split": "unknown", + "hds-thresh": hds_thresh, + "rx": rx_rings, + }, + ) + + ethtool(f"-X {cfg.ifname} equal {src_queue}") + defer(ethtool, f"-X {cfg.ifname} default") + + flow_rule_id = set_flow_rule(cfg, src_queue) + defer(ethtool, f"-N {cfg.ifname} delete {flow_rule_id}") + + # -x 2 asks iou-zcrx for rx_buf_len = 2 * page_size (8 KiB on x86_64), + # backed by a 2 MiB hugepage area so the chunks are physically + # contiguous, which is what zcrx requires for non-default rx_buf_len. + rx_cmd = ( + f"{cfg.bin_local} -s -p {cfg.port} " + f"-i {cfg.nk_guest_ifname} -q {nk_queue} -x 2" + ) + tx_cmd = f"{cfg.bin_remote} -c -h {cfg.nk_guest_ipv6} -p {cfg.port} -l 12840" + + # Probe via -d (dry run): exits with SKIP_CODE if the leased physical + # qops doesn't advertise QCFG_RX_PAGE_SIZE (e.g. older bnxt FW/HW). + probe = cmd(rx_cmd + " -d", fail=False, ns=cfg.netns) + if probe.ret == SKIP_CODE: + msg = probe.stdout.strip() or "rx_buf_len not supported by leased NIC" + raise KsftSkipEx(msg) + + # A successful dry run still registered the zcrx ifq on the leased + # physical queue; wait for its async teardown before the real server + # binds the same queue. 
+ _mp_clear_wait(cfg, src_queue) + + with bkg(rx_cmd, exit_wait=True, ns=cfg.netns): wait_port_listen(cfg.port, proto="tcp", ns=cfg.netns) cmd(tx_cmd, host=cfg.remote) def test_attrs(cfg) -> None: cfg.require_ipver("6") + src_queue, nk_queue = _setup_lease(cfg) + defer(_teardown_netkit, cfg) netdevnl = NetdevFamily() queue_info = netdevnl.queue_get( - {"ifindex": cfg.ifindex, "id": cfg.src_queue, "type": "rx"} + {"ifindex": cfg.ifindex, "id": src_queue, "type": "rx"} ) - ksft_eq(queue_info["id"], cfg.src_queue) + ksft_eq(queue_info["id"], src_queue) ksft_eq(queue_info["type"], "rx") ksft_eq(queue_info["ifindex"], cfg.ifindex) ksft_in("lease", queue_info) lease = queue_info["lease"] ksft_eq(lease["ifindex"], cfg.nk_guest_ifindex) - ksft_eq(lease["queue"]["id"], cfg.nk_queue) + ksft_eq(lease["queue"]["id"], nk_queue) ksft_eq(lease["queue"]["type"], "rx") ksft_in("netns-id", lease) def test_attach_xdp_with_mp(cfg) -> None: cfg.require_ipver("6") + src_queue, nk_queue = _setup_lease(cfg) + defer(_teardown_netkit, cfg) ethnl = EthtoolFamily() rings = ethnl.rings_get({"header": {"dev-index": cfg.ifindex}}) @@ -123,18 +336,21 @@ def test_attach_xdp_with_mp(cfg) -> None: }, ) - ethtool(f"-X {cfg.ifname} equal {cfg.src_queue}") + ethtool(f"-X {cfg.ifname} equal {src_queue}") defer(ethtool, f"-X {cfg.ifname} default") netdevnl = NetdevFamily() - rx_cmd = f"ip netns exec {cfg.netns.name} {cfg.bin_local} -s -p {cfg.port} -i {cfg._nk_guest_ifname} -q {cfg.nk_queue}" - with bkg(rx_cmd): + rx_cmd = ( + f"{cfg.bin_local} -s -p {cfg.port} " + f"-i {cfg.nk_guest_ifname} -q {nk_queue}" + ) + with bkg(rx_cmd, ns=cfg.netns): wait_port_listen(cfg.port, proto="tcp", ns=cfg.netns) time.sleep(0.1) queue_info = netdevnl.queue_get( - {"ifindex": cfg.ifindex, "id": cfg.src_queue, "type": "rx"} + {"ifindex": cfg.ifindex, "id": src_queue, "type": "rx"} ) ksft_in("io-uring", queue_info) @@ -144,13 +360,15 @@ def test_attach_xdp_with_mp(cfg) -> None: time.sleep(0.1) queue_info = 
netdevnl.queue_get( - {"ifindex": cfg.ifindex, "id": cfg.src_queue, "type": "rx"} + {"ifindex": cfg.ifindex, "id": src_queue, "type": "rx"} ) ksft_not_in("io-uring", queue_info) def test_destroy(cfg) -> None: cfg.require_ipver("6") + src_queue, nk_queue = _setup_lease(cfg) + defer(_teardown_netkit, cfg) ethnl = EthtoolFamily() rings = ethnl.rings_get({"header": {"dev-index": cfg.ifindex}}) @@ -175,16 +393,19 @@ def test_destroy(cfg) -> None: }, ) - ethtool(f"-X {cfg.ifname} equal {cfg.src_queue}") + ethtool(f"-X {cfg.ifname} equal {src_queue}") defer(ethtool, f"-X {cfg.ifname} default") - rx_cmd = f"ip netns exec {cfg.netns.name} {cfg.bin_local} -s -p {cfg.port} -i {cfg._nk_guest_ifname} -q {cfg.nk_queue}" - rx_proc = cmd(rx_cmd, background=True) + rx_cmd = ( + f"{cfg.bin_local} -s -p {cfg.port} " + f"-i {cfg.nk_guest_ifname} -q {nk_queue}" + ) + rx_proc = cmd(rx_cmd, background=True, ns=cfg.netns) wait_port_listen(cfg.port, proto="tcp", ns=cfg.netns) netdevnl = NetdevFamily() queue_info = netdevnl.queue_get( - {"ifindex": cfg.ifindex, "id": cfg.src_queue, "type": "rx"} + {"ifindex": cfg.ifindex, "id": src_queue, "type": "rx"} ) ksft_in("io-uring", queue_info) @@ -193,23 +414,20 @@ def test_destroy(cfg) -> None: kill_timer = threading.Timer(1, rx_proc.proc.terminate) kill_timer.start() - ip(f"link del dev {cfg._nk_host_ifname}") + ip(f"link del dev {cfg.nk_host_ifname}") kill_timer.join() - cfg._nk_host_ifname = None - cfg._nk_guest_ifname = None + cfg.nk_host_ifname = None + cfg.nk_guest_ifname = None queue_info = netdevnl.queue_get( - {"ifindex": cfg.ifindex, "id": cfg.src_queue, "type": "rx"} + {"ifindex": cfg.ifindex, "id": src_queue, "type": "rx"} ) ksft_not_in("io-uring", queue_info) - cmd(f"tc filter del dev {cfg.ifname} ingress pref {cfg._bpf_prog_pref}") - cfg._tc_attached = False - - flow_rule_id = set_flow_rule(cfg) + flow_rule_id = set_flow_rule(cfg, src_queue) defer(ethtool, f"-N {cfg.ifname} delete {flow_rule_id}") - rx_cmd = f"{cfg.bin_local} -s -p 
{cfg.port} -i {cfg.ifname} -q {cfg.src_queue}" + rx_cmd = f"{cfg.bin_local} -s -p {cfg.port} -i {cfg.ifname} -q {src_queue}" tx_cmd = f"{cfg.bin_remote} -c -h {cfg.addr_v['6']} -p {cfg.port} -l 12840" with bkg(rx_cmd, exit_wait=True): wait_port_listen(cfg.port, proto="tcp") @@ -217,7 +435,7 @@ def test_destroy(cfg) -> None: # Short delay since iou cleanup is async and takes a bit of time. time.sleep(0.1) queue_info = netdevnl.queue_get( - {"ifindex": cfg.ifindex, "id": cfg.src_queue, "type": "rx"} + {"ifindex": cfg.ifindex, "id": src_queue, "type": "rx"} ) ksft_not_in("io-uring", queue_info) @@ -230,32 +448,14 @@ def main() -> None: cfg.bin_remote = cfg.remote.deploy(cfg.bin_local) cfg.port = rand_port() - ethnl = EthtoolFamily() - channels = ethnl.channels_get({"header": {"dev-index": cfg.ifindex}}) - channels = channels["combined-count"] - if channels < 2: - raise KsftSkipEx("Test requires NETIF with at least 2 combined channels") - - cfg.src_queue = channels - 1 - - with NetNSEnter(str(cfg.netns)): - netdevnl = NetdevFamily() - bind_result = netdevnl.queue_create( - { - "ifindex": cfg.nk_guest_ifindex, - "type": "rx", - "lease": { - "ifindex": cfg.ifindex, - "queue": {"id": cfg.src_queue, "type": "rx"}, - "netns-id": 0, - }, - } - ) - cfg.nk_queue = bind_result["id"] - - # test_destroy must be last because it destroys the netkit devices ksft_run( - [test_iou_zcrx, test_attrs, test_attach_xdp_with_mp, test_destroy], + [ + test_iou_zcrx, + test_iou_zcrx_large_buf, + test_attrs, + test_attach_xdp_with_mp, + test_destroy, + ], args=(cfg,), ) ksft_exit() diff --git a/tools/testing/selftests/drivers/net/hw/ntuple.py b/tools/testing/selftests/drivers/net/hw/ntuple.py index 232733142c02..ef4604bfa8ef 100755 --- a/tools/testing/selftests/drivers/net/hw/ntuple.py +++ b/tools/testing/selftests/drivers/net/hw/ntuple.py @@ -22,7 +22,10 @@ class NtupleField(Enum): def _require_ntuple(cfg): features = ethtool(f"-k {cfg.ifname}", json=True)[0] if not 
features["ntuple-filters"]["active"]: - raise KsftSkipEx("Ntuple filters not enabled on the device: " + str(features["ntuple-filters"])) + if features["ntuple-filters"]["fixed"]: + raise KsftSkipEx("Device does not support ntuple-filters") + ethtool(f"-K {cfg.ifname} ntuple-filters on") + defer(ethtool, f"-K {cfg.ifname} ntuple-filters off") def _get_rx_cnts(cfg, prev=None): diff --git a/tools/testing/selftests/drivers/net/hw/rss_ctx.py b/tools/testing/selftests/drivers/net/hw/rss_ctx.py index 51f4e7bc3e5d..f36f76d6ca59 100755 --- a/tools/testing/selftests/drivers/net/hw/rss_ctx.py +++ b/tools/testing/selftests/drivers/net/hw/rss_ctx.py @@ -9,7 +9,7 @@ from lib.py import ksft_disruptive from lib.py import ksft_run, ksft_pr, ksft_exit from lib.py import ksft_eq, ksft_ne, ksft_ge, ksft_in, ksft_lt, ksft_true, ksft_raises from lib.py import NetDrvEpEnv -from lib.py import EthtoolFamily, NetdevFamily +from lib.py import EthtoolFamily, NetdevFamily, NlError from lib.py import KsftSkipEx, KsftFailEx from lib.py import rand_port, rand_ports from lib.py import cmd, ethtool, ip, defer, CmdExitFailure, wait_file @@ -57,9 +57,10 @@ def ethtool_create(cfg, act, opts): def require_ntuple(cfg): features = ethtool(f"-k {cfg.ifname}", json=True)[0] if not features["ntuple-filters"]["active"]: - # ntuple is more of a capability than a config knob, don't bother - # trying to enable it (until some driver actually needs it). 
- raise KsftSkipEx("Ntuple filters not enabled on the device: " + str(features["ntuple-filters"])) + if features["ntuple-filters"]["fixed"]: + raise KsftSkipEx("Device does not support ntuple-filters") + ethtool(f"-K {cfg.ifname} ntuple-filters on") + defer(ethtool, f"-K {cfg.ifname} ntuple-filters off") def require_context_cnt(cfg, need_cnt): @@ -828,6 +829,94 @@ def test_rss_default_context_rule(cfg): 'noise' : (0, 1) }) +def _set_flow_hash(cfg, fl_type, fields, context=0): + req = {"header": {"dev-index": cfg.ifindex}, + "flow-hash": {fl_type: fields}} + if context: + req["context"] = context + cfg.ethnl.rss_set(req) + + +def _get_flow_hash(cfg, fl_type, context=0): + req = {"header": {"dev-index": cfg.ifindex}} + if context: + req["context"] = context + rss = cfg.ethnl.rss_get(req) + return rss.get("flow-hash", {}).get(fl_type, set()) + + +def test_rss_context_flow_hash(cfg): + """ + Validate, with traffic, that an additional RSS context honors the + flow-hash field selection. If the driver lacks per-context field + configuration ("ops->rxfh_per_ctx_fields") fall back to setting the + fields on the main context, which the kernel applies device-wide. 
+ """ + + require_ntuple(cfg) + + queue_cnt = len(_get_rx_cnts(cfg)) + if queue_cnt < 6: + try: + ksft_pr(f"Increasing queue count {queue_cnt} -> 6") + ethtool(f"-L {cfg.ifname} combined 6") + defer(ethtool, f"-L {cfg.ifname} combined {queue_cnt}") + except CmdExitFailure as exc: + raise KsftSkipEx("Not enough queues for the test") from exc + + fl_type = f"tcp{cfg.addr_ipver}" + if not _get_flow_hash(cfg, fl_type): + raise KsftSkipEx(f"Device does not report flow-hash for {fl_type}") + + # Reserve queues 0/1 for main, build a new context spanning 2..5 + ethtool(f"-X {cfg.ifname} equal 2") + defer(ethtool, f"-X {cfg.ifname} default") + ctx_id = ethtool_create(cfg, "-X", "context new start 2 equal 4") + defer(ethtool, f"-X {cfg.ifname} context {ctx_id} delete") + + port = rand_port() + flow = f"flow-type {fl_type} dst-ip {cfg.addr} dst-port {port} context {ctx_id}" + ntuple = ethtool_create(cfg, "-N", flow) + defer(ethtool, f"-N {cfg.ifname} delete {ntuple}") + + ip_only = {"ip-src", "ip-dst"} + ip_l4 = ip_only | {"l4-b-0-1", "l4-b-2-3"} + + # Try per-context flow-hash; fall back to main context if unsupported. + cfg_ctx = ctx_id + try: + orig = _get_flow_hash(cfg, fl_type, context=ctx_id) + _set_flow_hash(cfg, fl_type, ip_only, context=ctx_id) + except NlError: + ksft_pr("Per-context flow-hash not supported, using device-wide") + cfg_ctx = 0 + orig = _get_flow_hash(cfg, fl_type) + _set_flow_hash(cfg, fl_type, ip_only) + defer(_set_flow_hash, cfg, fl_type, orig, context=cfg_ctx) + + def measure(): + cnts = _get_rx_cnts(cfg) + GenerateTraffic(cfg, port=port).wait_pkts_and_stop(20000) + cnts = _get_rx_cnts(cfg, prev=cnts) + ctx_cnts = cnts[2:6] + directed = sum(ctx_cnts) + used = sum(1 for c in ctx_cnts if c > directed / 200) + return cnts, directed, used + + # IP-only hash: iperf3 streams share src/dst IP, all should land on the + # same queue inside the context's range. 
+ cnts, directed, used = measure() + ksft_ge(directed, 20000, f"traffic on context {ctx_id} (IP-only): {cnts}") + ksft_eq(used, 1, f"IP-only hash should use one queue in context {ctx_id}, got: {cnts}") + + # IP+L4 hash: streams have distinct src ports, traffic should spread. + _set_flow_hash(cfg, fl_type, ip_l4, context=cfg_ctx) + + cnts, directed, used = measure() + ksft_ge(directed, 20000, f"traffic on context {ctx_id} (IP+L4): {cnts}") + ksft_ge(used, 2, f"IP+L4 hash should spread across context {ctx_id} queues, got: {cnts}") + + @ksft_disruptive def test_rss_context_persist_ifupdown(cfg, pre_down=False): """ @@ -935,6 +1024,7 @@ def main() -> None: test_flow_add_context_missing, test_delete_rss_context_busy, test_rss_ntuple_addition, test_rss_default_context_rule, + test_rss_context_flow_hash, test_rss_context_persist_create_and_ifdown, test_rss_context_persist_ifdown_and_create], args=(cfg, )) diff --git a/tools/testing/selftests/drivers/net/hw/tso.py b/tools/testing/selftests/drivers/net/hw/tso.py index bb675e3dac88..802bb4868046 100755 --- a/tools/testing/selftests/drivers/net/hw/tso.py +++ b/tools/testing/selftests/drivers/net/hw/tso.py @@ -187,28 +187,24 @@ def query_nic_features(cfg) -> None: cfg.wanted_features.add(f["name"]) cfg.hw_features = set() - hw_all_features_cmd = "" for f in features["hw"]["bits"]["bit"]: if f.get("value", False): - feature = f["name"] - cfg.hw_features.add(feature) - hw_all_features_cmd += f" {feature} on" - try: - ethtool(f"-K {cfg.ifname} {hw_all_features_cmd}") - except Exception as e: - ksft_pr(f"WARNING: failure enabling all hw features: {e}") - ksft_pr("partial gso feature detection may be impacted") + cfg.hw_features.add(f["name"]) # Check which features are supported via GSO partial cfg.partial_features = set() if 'tx-gso-partial' in cfg.hw_features: + seg_features = {f for f in cfg.hw_features if "segmentation" in f} + ethtool(f"-K {cfg.ifname} " + + " ".join(f"{f} on" for f in seg_features)) + ethtool(f"-K 
{cfg.ifname} tx-gso-partial off") no_partial = set() features = cfg.ethnl.features_get({"header": {"dev-index": cfg.ifindex}}) for f in features["active"]["bits"]["bit"]: no_partial.add(f["name"]) - cfg.partial_features = cfg.hw_features - no_partial + cfg.partial_features = seg_features - no_partial ethtool(f"-K {cfg.ifname} tx-gso-partial on") restore_wanted_features(cfg) @@ -239,6 +235,9 @@ def main() -> None: ("vxlan_csum", "", "tx-udp_tnl-csum-segmentation", ("vxlan", "id 100 dstport 4789 udpcsum", ("4", "6"))), ("gre", "4", "tx-gre-segmentation", ("gre", "", ("4", "6"))), ("gre", "6", "tx-gre-segmentation", ("ip6gre","", ("4", "6"))), + ("ip", "6", "tx-ipxip6-segmentation", ("ip6tnl","mode any", ("4", "6"))), + ("ip", "4", "tx-ipxip4-segmentation", ("sit","", ("6", ))), + ("ip", "4", "tx-ipxip4-segmentation", ("ipip","", ("4", ))), ) cases = [] diff --git a/tools/testing/selftests/drivers/net/hw/userns_devmem.py b/tools/testing/selftests/drivers/net/hw/userns_devmem.py new file mode 100755 index 000000000000..2aaf6ea81715 --- /dev/null +++ b/tools/testing/selftests/drivers/net/hw/userns_devmem.py @@ -0,0 +1,49 @@ +#!/usr/bin/env python3 +# SPDX-License-Identifier: GPL-2.0 +""" +Devmem tests for non-init userns. 
+""" + +import os + +from devmem_lib import run_rx, run_rx_hds, run_tx, run_tx_chunks, setup_test +from lib.py import NetDrvContEnv, ksft_disruptive, ksft_exit, ksft_run + + +@ksft_disruptive +def check_userns_rx(cfg) -> None: + """Run the devmem RX test through non-init userns netkit.""" + run_rx(cfg) + + +@ksft_disruptive +def check_userns_tx(cfg) -> None: + """Run the devmem TX test through non-init userns netkit.""" + run_tx(cfg) + + +@ksft_disruptive +def check_userns_tx_chunks(cfg) -> None: + """Run the devmem TX chunking test through non-init userns netkit.""" + run_tx_chunks(cfg) + + +def check_userns_rx_hds(cfg) -> None: + """Run the HDS test through non-init userns netkit.""" + run_rx_hds(cfg) + + +def main() -> None: + """Run userns devmem RX selftests against the test environment.""" + with NetDrvContEnv(__file__, userns=True, rxqueues=2, + primary_rx_redirect=True) as cfg: + setup_test(cfg, + os.path.join(os.path.dirname(os.path.abspath(__file__)), + "ncdevmem")) + ksft_run([check_userns_rx, check_userns_tx, check_userns_tx_chunks, + check_userns_rx_hds], args=(cfg,)) + ksft_exit() + + +if __name__ == "__main__": + main() diff --git a/tools/testing/selftests/drivers/net/lib/py/__init__.py b/tools/testing/selftests/drivers/net/lib/py/__init__.py index 2b5ec0505672..ee903bcf3207 100644 --- a/tools/testing/selftests/drivers/net/lib/py/__init__.py +++ b/tools/testing/selftests/drivers/net/lib/py/__init__.py @@ -18,12 +18,13 @@ try: sys.path.append(KSFT_DIR.as_posix()) # Import one by one to avoid pylint false positives - from net.lib.py import NetNS, NetNSEnter, NetdevSimDev + from net.lib.py import NetNS, NetNSEnter, NetdevSimDev, UserNetNS from net.lib.py import EthtoolFamily, NetdevFamily, NetshaperFamily, \ NlError, RtnlFamily, DevlinkFamily, PSPFamily, Netlink from net.lib.py import CmdExitFailure from net.lib.py import bkg, cmd, bpftool, bpftrace, defer, ethtool, \ - fd_read_timeout, ip, rand_port, rand_ports, wait_port_listen, wait_file + 
fd_read_timeout, ip, rand_port, rand_ports, tc, wait_port_listen, \ + wait_file from net.lib.py import bpf_map_set, bpf_map_dump, bpf_prog_map_ids from net.lib.py import KsftSkipEx, KsftFailEx, KsftXfailEx from net.lib.py import ksft_disruptive, ksft_exit, ksft_pr, ksft_run, \ @@ -31,12 +32,12 @@ try: from net.lib.py import ksft_eq, ksft_ge, ksft_in, ksft_is, ksft_lt, \ ksft_ne, ksft_not_in, ksft_raises, ksft_true, ksft_gt, ksft_not_none - __all__ = ["NetNS", "NetNSEnter", "NetdevSimDev", + __all__ = ["NetNS", "NetNSEnter", "NetdevSimDev", "UserNetNS", "EthtoolFamily", "NetdevFamily", "NetshaperFamily", "NlError", "RtnlFamily", "DevlinkFamily", "PSPFamily", "Netlink", "CmdExitFailure", "bkg", "cmd", "bpftool", "bpftrace", "defer", "ethtool", - "fd_read_timeout", "ip", "rand_port", "rand_ports", + "fd_read_timeout", "ip", "rand_port", "rand_ports", "tc", "wait_port_listen", "wait_file", "bpf_map_set", "bpf_map_dump", "bpf_prog_map_ids", "KsftSkipEx", "KsftFailEx", "KsftXfailEx", diff --git a/tools/testing/selftests/drivers/net/lib/py/env.py b/tools/testing/selftests/drivers/net/lib/py/env.py index 24ce122abd9c..e4ab99b905b1 100644 --- a/tools/testing/selftests/drivers/net/lib/py/env.py +++ b/tools/testing/selftests/drivers/net/lib/py/env.py @@ -2,13 +2,14 @@ import ipaddress import os +import sys import time import json from pathlib import Path from lib.py import KsftSkipEx, KsftXfailEx from lib.py import ksft_setup, wait_file from lib.py import cmd, ethtool, ip, CmdExitFailure -from lib.py import NetNS, NetdevSimDev +from lib.py import NetNS, NetdevSimDev, UserNetNS from .remote import Remote from . 
import bpftool, RtnlFamily, Netlink @@ -336,15 +337,20 @@ class NetDrvContEnv(NetDrvEpEnv): +---------------+ """ - def __init__(self, src_path, rxqueues=1, **kwargs): + def __init__(self, src_path, rxqueues=1, primary_rx_redirect=False, + userns=False, **kwargs): self.netns = None - self._nk_host_ifname = None - self._nk_guest_ifname = None + self._userns = userns + self.nk_host_ifname = None + self.nk_guest_ifname = None self._tc_clsact_added = False self._tc_attached = False + self._primary_rx_redirect_attached = False + self._primary_rx_redirect_clsact_added = False self._bpf_prog_pref = None self._bpf_prog_id = None self._init_ns_attached = False + self._remote_route_added = False self._old_fwd = None self._old_accept_ra = None @@ -389,15 +395,25 @@ class NetDrvContEnv(NetDrvEpEnv): raise KsftSkipEx("Failed to create netkit pair") netkit_links.sort(key=lambda x: x['ifindex']) - self._nk_host_ifname = netkit_links[1]['ifname'] - self._nk_guest_ifname = netkit_links[0]['ifname'] + self.nk_host_ifname = netkit_links[1]['ifname'] + self.nk_guest_ifname = netkit_links[0]['ifname'] self.nk_host_ifindex = netkit_links[1]['ifindex'] self.nk_guest_ifindex = netkit_links[0]['ifindex'] self._setup_ns() - self._attach_bpf() + self.attach_bpf() + if primary_rx_redirect: + self._attach_primary_rx_redirect_bpf() def __del__(self): + if self._primary_rx_redirect_attached: + cmd(f"tc filter del dev {self.nk_host_ifname} ingress", fail=False) + self._primary_rx_redirect_attached = False + + if self._primary_rx_redirect_clsact_added: + cmd(f"tc qdisc del dev {self.nk_host_ifname} clsact", fail=False) + self._primary_rx_redirect_clsact_added = False + if self._tc_attached: cmd(f"tc filter del dev {self.ifname} ingress pref {self._bpf_prog_pref}") self._tc_attached = False @@ -406,10 +422,15 @@ class NetDrvContEnv(NetDrvEpEnv): cmd(f"tc qdisc del dev {self.ifname} clsact") self._tc_clsact_added = False - if self._nk_host_ifname: - cmd(f"ip link del dev {self._nk_host_ifname}") - 
self._nk_host_ifname = None - self._nk_guest_ifname = None + if self._remote_route_added: + cmd(f"ip -6 route del {self.nk_guest_ipv6}/128", + host=self.remote, fail=False) + self._remote_route_added = False + + if self.nk_host_ifname: + cmd(f"ip link del dev {self.nk_host_ifname}") + self.nk_host_ifname = None + self.nk_guest_ifname = None if self._init_ns_attached: cmd("ip netns del init", fail=False) @@ -444,28 +465,37 @@ class NetDrvContEnv(NetDrvEpEnv): with open(ra_path, "w", encoding="utf-8") as f: f.write("2") - self.netns = NetNS() + self.netns = UserNetNS() if self._userns else NetNS() cmd("ip netns attach init 1") self._init_ns_attached = True ip("netns set init 0", ns=self.netns) - ip(f"link set dev {self._nk_guest_ifname} netns {self.netns.name}") - ip(f"link set dev {self._nk_host_ifname} up") - ip(f"-6 addr add fe80::1/64 dev {self._nk_host_ifname} nodad") - ip(f"-6 route add {self.nk_guest_ipv6}/128 via fe80::2 dev {self._nk_host_ifname}") + ip(f"link set dev {self.nk_guest_ifname} netns {self.netns.name}") + nk_guest_dev = ip(f"link show dev {self.nk_guest_ifname}", + json=True, ns=self.netns)[0] + self.nk_guest_ifindex = nk_guest_dev['ifindex'] + ip(f"link set dev {self.nk_host_ifname} up") + ip(f"-6 addr add fe80::1/64 dev {self.nk_host_ifname} nodad") + ip(f"-6 route add {self.nk_guest_ipv6}/128 via fe80::2 dev {self.nk_host_ifname}") ip("link set lo up", ns=self.netns) - ip(f"link set dev {self._nk_guest_ifname} up", ns=self.netns) - ip(f"-6 addr add fe80::2/64 dev {self._nk_guest_ifname}", ns=self.netns) - ip(f"-6 addr add {self.nk_guest_ipv6}/64 dev {self._nk_guest_ifname} nodad", ns=self.netns) - ip(f"-6 route add default via fe80::1 dev {self._nk_guest_ifname}", ns=self.netns) + ip(f"link set dev {self.nk_guest_ifname} up", ns=self.netns) + ip(f"-6 addr add fe80::2/64 dev {self.nk_guest_ifname}", ns=self.netns) + ip(f"-6 addr add {self.nk_guest_ipv6}/64 dev {self.nk_guest_ifname} nodad", ns=self.netns) + ip(f"-6 route add default via 
fe80::1 dev {self.nk_guest_ifname}", ns=self.netns) - def _tc_ensure_clsact(self): - qdisc = json.loads(cmd(f"tc -j qdisc show dev {self.ifname}").stdout) + def _tc_ensure_clsact(self, ifname=None): + """Ensure a clsact qdisc exists on @ifname. + + Returns True if this call added the qdisc, otherwise returns False. + """ + if ifname is None: + ifname = self.ifname + qdisc = json.loads(cmd(f"tc -j qdisc show dev {ifname}").stdout) for q in qdisc: if q['kind'] == 'clsact': - return - cmd(f"tc qdisc add dev {self.ifname} clsact") - self._tc_clsact_added = True + return False + cmd(f"tc qdisc add dev {ifname} clsact") + return True def _get_bpf_prog_ids(self): filters = json.loads(cmd(f"tc -j filter show dev {self.ifname} ingress").stdout) @@ -476,28 +506,43 @@ class NetDrvContEnv(NetDrvEpEnv): return (bpf['pref'], bpf['options']['prog']['id']) raise Exception("Failed to get BPF prog ID") - def _attach_bpf(self): - bpf_obj = self.test_dir / "nk_forward.bpf.o" - if not bpf_obj.exists(): - raise KsftSkipEx("BPF prog not found") + def _find_bss_map_id(self, prog_id): + """Find the .bss map ID for a loaded BPF program.""" + prog_info = bpftool(f"prog show id {prog_id}", json=True) + for map_id in prog_info.get("map_ids", []): + map_info = bpftool(f"map show id {map_id}", json=True) + if map_info.get("name", "").endswith("bss"): + return map_id + raise Exception(f"Failed to find .bss map for prog {prog_id}") + + def _find_bpf_obj(self, name): + bpf_obj = self.test_dir / name + if bpf_obj.exists(): + return bpf_obj + bpf_obj = self.test_dir / "hw" / name + if bpf_obj.exists(): + return bpf_obj + return None + + def detach_bpf(self): + if self._tc_attached: + cmd(f"tc filter del dev {self.ifname} ingress pref " + f"{self._bpf_prog_pref}", fail=False) + self._tc_attached = False + + def attach_bpf(self): + bpf_obj = self._find_bpf_obj("nk_forward.bpf.o") + if not bpf_obj: + raise KsftSkipEx("BPF prog nk_forward.bpf.o not found") - self._tc_ensure_clsact() + if 
self._tc_ensure_clsact(): + self._tc_clsact_added = True cmd(f"tc filter add dev {self.ifname} ingress bpf obj {bpf_obj}" " sec tc/ingress direct-action") self._tc_attached = True (self._bpf_prog_pref, self._bpf_prog_id) = self._get_bpf_prog_ids() - prog_info = bpftool(f"prog show id {self._bpf_prog_id}", json=True) - map_ids = prog_info.get("map_ids", []) - - bss_map_id = None - for map_id in map_ids: - map_info = bpftool(f"map show id {map_id}", json=True) - if map_info.get("name").endswith("bss"): - bss_map_id = map_id - - if bss_map_id is None: - raise Exception("Failed to find .bss map") + bss_map_id = self._find_bss_map_id(self._bpf_prog_id) ipv6_addr = ipaddress.IPv6Address(self.ipv6_prefix) ipv6_bytes = ipv6_addr.packed @@ -505,3 +550,36 @@ class NetDrvContEnv(NetDrvEpEnv): value = ipv6_bytes + ifindex_bytes value_hex = ' '.join(f'{b:02x}' for b in value) bpftool(f"map update id {bss_map_id} key hex 00 00 00 00 value hex {value_hex}") + + def _attach_primary_rx_redirect_bpf(self): + """Attach BPF redirect program on the primary netkit ingress.""" + bpf_obj = self._find_bpf_obj("nk_primary_rx_redirect.bpf.o") + if not bpf_obj: + raise KsftSkipEx("nk_primary_rx_redirect.bpf.o not found") + + if self._tc_ensure_clsact(self.nk_host_ifname): + self._primary_rx_redirect_clsact_added = True + cmd(f"tc filter add dev {self.nk_host_ifname} ingress" + f" bpf obj {bpf_obj} sec tc/ingress direct-action") + self._primary_rx_redirect_attached = True + + ip(f"-6 route add {self.nk_guest_ipv6}/128 via {self.addr_v['6']}", + host=self.remote) + self._remote_route_added = True + + filters = json.loads( + cmd(f"tc -j filter show dev {self.nk_host_ifname} ingress").stdout) + redirect_prog_id = None + for bpf in filters: + if 'options' not in bpf: + continue + if bpf['options']['bpf_name'].startswith('nk_primary_rx_redirect'): + redirect_prog_id = bpf['options']['prog']['id'] + break + if redirect_prog_id is None: + raise Exception("Failed to get primary RX redirect BPF prog 
ID") + + bss_map_id = self._find_bss_map_id(redirect_prog_id) + phys_ifindex_bytes = self.ifindex.to_bytes(4, byteorder=sys.byteorder) + value_hex = ' '.join(f'{b:02x}' for b in phys_ifindex_bytes) + bpftool(f"map update id {bss_map_id} key hex 00 00 00 00 value hex {value_hex}") diff --git a/tools/testing/selftests/drivers/net/lib/py/load.py b/tools/testing/selftests/drivers/net/lib/py/load.py index f181fa2d38fc..e24660e5c27f 100644 --- a/tools/testing/selftests/drivers/net/lib/py/load.py +++ b/tools/testing/selftests/drivers/net/lib/py/load.py @@ -48,7 +48,10 @@ class Iperf3Runner: Starts the iperf3 client with the configured options. """ cmdline = self._build_client(streams, duration, reverse) - return cmd(cmdline, background=background, host=self.env.remote) + kwargs = {"background": background, "host": self.env.remote} + if not background: + kwargs["timeout"] = duration + 5 + return cmd(cmdline, **kwargs) def measure_bandwidth(self, reverse=False): """ diff --git a/tools/testing/selftests/drivers/net/psp.py b/tools/testing/selftests/drivers/net/psp.py index 864d9fce1094..315648a770d0 100755 --- a/tools/testing/selftests/drivers/net/psp.py +++ b/tools/testing/selftests/drivers/net/psp.py @@ -5,6 +5,7 @@ import errno import fcntl +import os import socket import struct import termios @@ -14,9 +15,13 @@ from lib.py import defer from lib.py import ksft_run, ksft_exit, ksft_pr from lib.py import ksft_true, ksft_eq, ksft_ne, ksft_gt, ksft_raises from lib.py import ksft_not_none -from lib.py import KsftSkipEx -from lib.py import NetDrvEpEnv, PSPFamily, NlError +from lib.py import ksft_variants, KsftNamedVariant +from lib.py import KsftSkipEx, KsftFailEx +from lib.py import NetDrvEpEnv, NetDrvContEnv +from lib.py import Netlink, NlError, PSPFamily, RtnlFamily +from lib.py import NetNSEnter from lib.py import bkg, rand_port, wait_port_listen +from lib.py import ip def _get_outq(s): @@ -117,11 +122,13 @@ def _get_stat(cfg, key): # Test case boiler plate # -def 
_init_psp_dev(cfg): +def _init_psp_dev(cfg, use_psp_ifindex=False): if not hasattr(cfg, 'psp_dev_id'): # Figure out which local device we are testing against + # For NetDrvContEnv: use psp_ifindex instead of ifindex + target_ifindex = cfg.psp_ifindex if use_psp_ifindex else cfg.ifindex for dev in cfg.pspnl.dev_get({}, dump=True): - if dev['ifindex'] == cfg.ifindex: + if dev['ifindex'] == target_ifindex: cfg.psp_info = dev cfg.psp_dev_id = cfg.psp_info['id'] break @@ -571,33 +578,388 @@ def removal_device_bi(cfg): _close_conn(cfg, s) -def psp_ip_ver_test_builder(name, test_func, psp_ver, ipver): - """Build test cases for each combo of PSP version and IP version""" - def test_case(cfg): - cfg.require_ipver(ipver) - test_func(cfg, psp_ver, ipver) +def _get_psp_ver_ip_variants(): + for ver in range(4): + for ipv in ("4", "6"): + yield KsftNamedVariant(f"v{ver}_ip{ipv}", ver, ipv) - test_case.__name__ = f"{name}_v{psp_ver}_ip{ipver}" - return test_case +def _get_ip_variants(): + for ipv in ("4", "6"): + yield KsftNamedVariant(f"ip{ipv}", ipv) -def ipver_test_builder(name, test_func, ipver): - """Build test cases for each IP version""" - def test_case(cfg): - cfg.require_ipver(ipver) - test_func(cfg, ipver) - test_case.__name__ = f"{name}_ip{ipver}" - return test_case +@ksft_variants(_get_psp_ver_ip_variants()) +def data_basic_send(cfg, version, ipver): + """Test basic PSP data send.""" + cfg.require_ipver(ipver) + _data_basic_send(cfg, version, ipver) + + +@ksft_variants(_get_ip_variants()) +def data_mss_adjust(cfg, ipver): + """Test MSS adjustment with PSP.""" + cfg.require_ipver(ipver) + _data_mss_adjust(cfg, ipver) + + +def _check_assoc_list(cfg, psp_dev_id, ifindex, nsid=None): + """Verify assoc-list contains device with given ifindex, no duplicates.""" + dev_info = cfg.pspnl.dev_get({'id': psp_dev_id}) + + ksft_true('assoc-list' in dev_info, + "No assoc-list in dev_get() response after association") + found = False + for assoc in dev_info['assoc-list']: + if 
assoc['ifindex'] != ifindex: + continue + if nsid is not None and assoc['nsid'] != nsid: + continue + ksft_eq(found, False, "Duplicate assoc entry found") + found = True + ksft_eq(found, True, + "Associated device not found in dev_get() response") + + +def _data_basic_send_netkit_psp_assoc(cfg, version, ipver): + """ + Test basic data send with netkit interface associated with PSP dev. + """ + _assoc_nk_guest(cfg) + + # Enter guest namespace (netns) to run PSP test + with NetNSEnter(cfg.netns.name): + cfg.pspnl = PSPFamily() + + sock = _make_psp_conn(cfg, version, ipver) + + rx_assoc = cfg.pspnl.rx_assoc({"version": version, + "dev-id": cfg.psp_dev_id, + "sock-fd": sock.fileno()}) + rx_key = rx_assoc['rx-key'] + tx_key = _spi_xchg(sock, rx_key) + + cfg.pspnl.tx_assoc({"dev-id": cfg.psp_dev_id, + "version": version, + "tx-key": tx_key, + "sock-fd": sock.fileno()}) + + data_len = _send_careful(cfg, sock, 100) + _check_data_rx(cfg, data_len) + _close_psp_conn(cfg, sock) + + +def _assoc_check_list(cfg): + """Test that assoc-list is correctly populated after dev-assoc.""" + _assoc_nk_guest(cfg) + _check_assoc_list(cfg, cfg.psp_dev_id, cfg.nk_guest_ifindex, + cfg.psp_dev_peer_nsid) + + +def _get_psp_ver_ip6_variants(): + for ver in range(4): + yield KsftNamedVariant(f"v{ver}_ip6", ver, "6") + + +@ksft_variants(_get_psp_ver_ip6_variants()) +def data_basic_send_netkit_psp_assoc(cfg, version, ipver): + """Test PSP data send via netkit with dev-assoc.""" + cfg.require_ipver(ipver) + _data_basic_send_netkit_psp_assoc(cfg, version, ipver) + + +def _key_rotation_notify_multi_ns_netkit(cfg): + """ Test key rotation notifications across multiple namespaces using netkit """ + _assoc_nk_guest(cfg) + + # Create listener in guest namespace; socket stays bound to that ns + with NetNSEnter(cfg.netns.name): + peer_pspnl = PSPFamily() + peer_pspnl.ntf_subscribe('use') + + # Create listener in main namespace + main_pspnl = PSPFamily() + main_pspnl.ntf_subscribe('use') + + # Trigger key 
rotation on the PSP device + cfg.pspnl.key_rotate({"id": cfg.psp_dev_id}) + + # Poll both sockets from main thread + for pspnl, label in [(main_pspnl, "main"), (peer_pspnl, "guest")]: + for ntf in pspnl.poll_ntf(duration=10): + if ntf['msg'].get('id') == cfg.psp_dev_id: + break + else: + raise KsftFailEx( + f"No key rotation notification received" + f" in {label} namespace") + + +def _dev_change_notify_multi_ns_netkit(cfg): + """ Test dev_change notifications across multiple namespaces using netkit """ + _assoc_nk_guest(cfg) + + # Create listener in guest namespace; socket stays bound to that ns + with NetNSEnter(cfg.netns.name): + peer_pspnl = PSPFamily() + peer_pspnl.ntf_subscribe('mgmt') + + # Create listener in main namespace + main_pspnl = PSPFamily() + main_pspnl.ntf_subscribe('mgmt') + + # Trigger dev_change by calling dev_set (notification is always sent) + cfg.pspnl.dev_set({'id': cfg.psp_dev_id, + 'psp-versions-ena': cfg.psp_info['psp-versions-cap']}) + + # Poll both sockets from main thread + for pspnl, label in [(main_pspnl, "main"), (peer_pspnl, "guest")]: + for ntf in pspnl.poll_ntf(duration=10): + if ntf['msg'].get('id') == cfg.psp_dev_id: + break + else: + raise KsftFailEx( + f"No dev_change notification received" + f" in {label} namespace") + + +def _psp_dev_get_check_netkit_psp_assoc(cfg): + """ Check psp dev-get output with netkit interface associated with PSP dev """ + _assoc_nk_guest(cfg) + + # Check 1: In default netns, verify dev-get has correct ifindex and assoc-list + dev_info = cfg.pspnl.dev_get({'id': cfg.psp_dev_id}) + ksft_eq(dev_info['ifindex'], cfg.psp_ifindex) + _check_assoc_list(cfg, cfg.psp_dev_id, cfg.nk_guest_ifindex, + cfg.psp_dev_peer_nsid) + + # Check 2: In guest netns, verify dev-get has assoc-list with nk_guest device + with NetNSEnter(cfg.netns.name): + peer_pspnl = PSPFamily() + + # Dump all devices in the guest namespace + peer_devices = peer_pspnl.dev_get({}, dump=True) + + # Find the device with by-association flag + 
peer_dev = None + for dev in peer_devices: + if dev.get('by-association'): + peer_dev = dev + break + + ksft_not_none(peer_dev, "No PSP device found with by-association flag in guest netns") + + # Verify assoc-list contains the nk_guest device + ksft_true('assoc-list' in peer_dev and len(peer_dev['assoc-list']) > 0, + "Guest device should have assoc-list with local devices") + + # Verify the assoc-list contains nk_guest ifindex with nsid=-1 (same namespace) + found = False + for assoc in peer_dev['assoc-list']: + if assoc['ifindex'] == cfg.nk_guest_ifindex: + ksft_eq(assoc['nsid'], -1, + "nsid should be -1 (NETNSA_NSID_NOT_ASSIGNED) for same-namespace device") + found = True + break + ksft_true(found, "nk_guest ifindex not found in assoc-list") + + +def _dev_assoc_no_nsid(cfg): + """ Test dev-assoc and dev-disassoc without nsid attribute """ + _init_psp_dev(cfg, True) + + # Associate without nsid - should look up ifindex in caller's netns + cfg.pspnl.dev_assoc({'id': cfg.psp_dev_id, + 'ifindex': cfg.nk_host_ifindex}) + defer(_try_disassoc, cfg, + cfg.psp_dev_id, cfg.nk_host_ifindex) + defer(delattr, cfg, 'psp_dev_id') + defer(delattr, cfg, 'psp_info') + + # Verify assoc-list contains the device (match by ifindex only) + _check_assoc_list(cfg, cfg.psp_dev_id, cfg.nk_host_ifindex) + + # Disassociate without nsid - should also use caller's netns + cfg.pspnl.dev_disassoc({'id': cfg.psp_dev_id, + 'ifindex': cfg.nk_host_ifindex}) + + # Verify assoc-list no longer contains the device + dev_info = cfg.pspnl.dev_get({'id': cfg.psp_dev_id}) + found = False + if 'assoc-list' in dev_info: + for assoc in dev_info['assoc-list']: + if assoc['ifindex'] == cfg.nk_host_ifindex: + found = True + break + ksft_true(not found, "Device should not be in assoc-list after disassociation") + + +def _psp_dev_assoc_cleanup_on_netkit_del(cfg): + """Test that assoc-list is cleared when associated netkit is deleted. 
+ + Creates a disposable netkit pair for this test to avoid destroying + the shared environment. + """ + _init_psp_dev(cfg, True) + defer(delattr, cfg, 'psp_dev_id') + defer(delattr, cfg, 'psp_info') + + existing = {cfg.nk_host_ifindex, cfg.nk_guest_ifindex} + + # Create a temporary netkit pair + tmp_host_name = "tmp_nk_host" + tmp_guest_name = "tmp_nk_guest" + rtnl = RtnlFamily() + rtnl.newlink( + { + "ifname": tmp_host_name, + "linkinfo": { + "kind": "netkit", + "data": { + "mode": "l2", + "policy": "forward", + "peer-policy": "forward", + }, + }, + }, + flags=[Netlink.NLM_F_CREATE, Netlink.NLM_F_EXCL], + ) + cleanup_netkit = defer(ip, f"link del {tmp_host_name}") + + # Find the peer by diffing against existing netkit ifindexes + all_links = ip("-d link show", json=True) + tmp_peer = [link for link in all_links + if link.get('linkinfo', {}).get('info_kind') == 'netkit' + and link['ifindex'] not in existing + and link['ifname'] != tmp_host_name] + ksft_eq(len(tmp_peer), 1, + "Failed to find temporary netkit peer") + guest_name = tmp_peer[0]['ifname'] + + # Rename and move guest end into the test namespace + ip(f"link set dev {guest_name} name {tmp_guest_name}") + ip(f"link set dev {tmp_guest_name} netns {cfg.netns.name}") + tmp_guest_dev = ip(f"link show dev {tmp_guest_name}", + json=True, ns=cfg.netns)[0] + tmp_guest_ifindex = tmp_guest_dev['ifindex'] + ip(f"link set dev {tmp_guest_name} up", ns=cfg.netns) + + # Associate PSP device with the temporary guest interface + cfg.pspnl.dev_assoc({'id': cfg.psp_dev_id, + 'ifindex': tmp_guest_ifindex, + 'nsid': cfg.psp_dev_peer_nsid}) + + # Verify assoc-list contains the temporary device + _check_assoc_list(cfg, cfg.psp_dev_id, tmp_guest_ifindex, + cfg.psp_dev_peer_nsid) + + # Delete the temporary netkit pair (deleting one end removes both) + ip(f"link del {tmp_host_name}") + cleanup_netkit.cancel() + + # Verify assoc-list is cleared after netkit deletion + dev_info = cfg.pspnl.dev_get({'id': cfg.psp_dev_id}) + 
ksft_true('assoc-list' not in dev_info + or len(dev_info['assoc-list']) == 0, + "assoc-list should be empty after netkit deletion") + + +def _try_disassoc(cfg, psp_dev_id, ifindex, nsid=None): + """Best-effort disassociate, ignoring errors if already removed.""" + try: + params = {'id': psp_dev_id, 'ifindex': ifindex} + if nsid is not None: + params['nsid'] = nsid + cfg.pspnl.dev_disassoc(params) + except NlError: + pass + + +def _assoc_nk_guest(cfg): + """Associate nk_guest with PSP device and register cleanup via defer().""" + _init_psp_dev(cfg, True) + + cfg.pspnl.dev_assoc({'id': cfg.psp_dev_id, + 'ifindex': cfg.nk_guest_ifindex, + 'nsid': cfg.psp_dev_peer_nsid}) + defer(_disassoc_nk_guest, cfg, + cfg.psp_dev_id, cfg.nk_guest_ifindex) + + +def _disassoc_nk_guest(cfg, psp_dev_id, nk_guest_ifindex): + """Disassociate nk_guest and reset cfg PSP state.""" + pspnl = PSPFamily() + pspnl.dev_disassoc({'id': psp_dev_id, 'ifindex': nk_guest_ifindex, + 'nsid': cfg.psp_dev_peer_nsid}) + cfg.pspnl = pspnl + del cfg.psp_dev_id + del cfg.psp_info + + +def _get_nsid(ns_name): + """Get the nsid for a namespace.""" + for entry in ip("netns list-id", json=True): + if entry.get("name") == str(ns_name): + return entry["nsid"] + raise KsftSkipEx(f"nsid not found for namespace {ns_name}") + + +def _setup_psp_attributes(cfg): + # pylint: disable=protected-access + """ + Set up PSP-specific attributes on the environment. + + This sets attributes needed for PSP tests based on whether we're using + netdevsim or a real NIC. 
+ """ + if cfg._ns is not None: + # netdevsim case: PSP device is the local dev (in host namespace) + cfg.psp_dev = cfg._ns.nsims[0].dev + cfg.psp_ifname = cfg.psp_dev['ifname'] + cfg.psp_ifindex = cfg.psp_dev['ifindex'] + + # PSP peer device is the remote dev (in _netns, where psp_responder runs) + cfg.psp_dev_peer = cfg._ns_peer.nsims[0].dev + cfg.psp_dev_peer_ifname = cfg.psp_dev_peer['ifname'] + cfg.psp_dev_peer_ifindex = cfg.psp_dev_peer['ifindex'] + else: + # Real NIC case: PSP device is the local interface + cfg.psp_dev = cfg.dev + cfg.psp_ifname = cfg.ifname + cfg.psp_ifindex = cfg.ifindex + + # PSP peer device is the remote interface + cfg.psp_dev_peer = cfg.remote_dev + cfg.psp_dev_peer_ifname = cfg.remote_ifname + cfg.psp_dev_peer_ifindex = cfg.remote_ifindex + + # Get nsid for the guest namespace (netns) where nk_guest is + cfg.psp_dev_peer_nsid = _get_nsid(cfg.netns.name) + def main() -> None: """ Ksft boiler plate main """ - with NetDrvEpEnv(__file__) as cfg: + # Make sure LOCAL_PREFIX_V6 is set + if "LOCAL_PREFIX_V6" not in os.environ: + os.environ["LOCAL_PREFIX_V6"] = "2001:db8:2::" + + try: + env = NetDrvContEnv(__file__, primary_rx_redirect=True) + has_cont = True + except KsftSkipEx: + env = NetDrvEpEnv(__file__) + has_cont = False + + with env as cfg: cfg.pspnl = PSPFamily() + if has_cont: + _setup_psp_attributes(cfg) + # Set up responder and communication sock + # psp_responder runs in _netns (remote namespace with psp_dev_peer) responder = cfg.remote.deploy("psp_responder") cfg.comm_port = rand_port() @@ -611,17 +973,18 @@ def main() -> None: cfg.comm_port), timeout=1) - cases = [ - psp_ip_ver_test_builder( - "data_basic_send", _data_basic_send, version, ipver - ) - for version in range(0, 4) - for ipver in ("4", "6") - ] - cases += [ - ipver_test_builder("data_mss_adjust", _data_mss_adjust, ipver) - for ipver in ("4", "6") - ] + cases = [data_basic_send, data_mss_adjust] + + if has_cont: + cases += [ + _assoc_check_list, + 
data_basic_send_netkit_psp_assoc, + _key_rotation_notify_multi_ns_netkit, + _dev_change_notify_multi_ns_netkit, + _psp_dev_get_check_netkit_psp_assoc, + _dev_assoc_no_nsid, + _psp_dev_assoc_cleanup_on_netkit_del, + ] ksft_run(cases=cases, globs=globals(), case_pfx={"dev_", "data_", "assoc_", "removal_"}, diff --git a/tools/testing/selftests/drivers/net/shaper.py b/tools/testing/selftests/drivers/net/shaper.py index 11310f19bfa0..e39d270e688d 100755 --- a/tools/testing/selftests/drivers/net/shaper.py +++ b/tools/testing/selftests/drivers/net/shaper.py @@ -1,7 +1,10 @@ #!/usr/bin/env python3 # SPDX-License-Identifier: GPL-2.0 -from lib.py import ksft_run, ksft_exit, ksft_eq, ksft_true, KsftSkipEx +import errno + +from lib.py import ksft_run, ksft_exit +from lib.py import ksft_eq, ksft_raises, ksft_true, KsftSkipEx from lib.py import EthtoolFamily, NetshaperFamily from lib.py import NetDrvEnv from lib.py import NlError @@ -438,6 +441,21 @@ def queue_update(cfg, nl_shaper) -> None: nl_shaper.delete({'ifindex': cfg.ifindex, 'handle': {'scope': 'queue', 'id': i}}) +def dup_leaves(cfg, nl_shaper) -> None: + """ Ensure that the kernel rejects duplicate leaves. 
""" + if not cfg.groups: + raise KsftSkipEx("device does not support node scope") + + with ksft_raises(NlError) as cm: + nl_shaper.group({ + 'ifindex': cfg.ifindex, + 'leaves':[{'handle': {'scope': 'queue', 'id': 0}}, + {'handle': {'scope': 'queue', 'id': 0}}], + 'handle': {'scope':'node'}, + 'metric': 'bps', + 'bw-max': 10000}) + ksft_eq(cm.exception.error, errno.EINVAL) + def main() -> None: with NetDrvEnv(__file__, queue_count=4) as cfg: cfg.queues = False @@ -453,7 +471,9 @@ def main() -> None: basic_groups, qgroups, delegation, - queue_update], args=(cfg, NetshaperFamily())) + dup_leaves, + queue_update], + args=(cfg, NetshaperFamily())) ksft_exit() diff --git a/tools/testing/selftests/net/so_txtime.c b/tools/testing/selftests/drivers/net/so_txtime.c index b76df1efc2ef..55a386f3d1b9 100644 --- a/tools/testing/selftests/net/so_txtime.c +++ b/tools/testing/selftests/drivers/net/so_txtime.c @@ -33,9 +33,12 @@ #include <unistd.h> #include <poll.h> +#include "kselftest.h" + static int cfg_clockid = CLOCK_TAI; static uint16_t cfg_port = 8000; -static int cfg_variance_us = 4000; +static int cfg_variance_us = 8000; +static bool cfg_machine_slow; static uint64_t cfg_start_time_ns; static int cfg_mark; static bool cfg_rx; @@ -43,6 +46,8 @@ static bool cfg_rx; static uint64_t glob_tstart; static uint64_t tdeliver_max; +static int errors; + /* encode one timed transmission (of a 1B payload) */ struct timed_send { char data; @@ -131,13 +136,15 @@ static void do_recv_one(int fdr, struct timed_send *ts) fprintf(stderr, "payload:%c delay:%lld expected:%lld (us)\n", rbuf[0], (long long)tstop, (long long)texpect); - if (rbuf[0] != ts->data) - error(1, 0, "payload mismatch. expected %c", ts->data); + if (rbuf[0] != ts->data) { + fprintf(stderr, "payload mismatch. 
expected %c\n", ts->data); + errors++; + } if (llabs(tstop - texpect) > cfg_variance_us) { fprintf(stderr, "exceeds variance (%d us)\n", cfg_variance_us); - if (!getenv("KSFT_MACHINE_SLOW")) - exit(1); + if (!cfg_machine_slow) + errors++; } } @@ -255,8 +262,12 @@ static void start_time_wait(void) return; now = gettime_ns(CLOCK_REALTIME); - if (cfg_start_time_ns < now) + if (cfg_start_time_ns < now) { + fprintf(stderr, "FAIL: start time already passed\n"); + if (!cfg_machine_slow) + errors++; return; + } err = usleep((cfg_start_time_ns - now) / 1000); if (err) @@ -316,6 +327,9 @@ static int setup_rx(struct sockaddr *addr, socklen_t alen) if (bind(fd, addr, alen)) error(1, errno, "bind"); + if (cfg_machine_slow) + tv.tv_sec = 2; + if (setsockopt(fd, SOL_SOCKET, SO_RCVTIMEO, &tv, sizeof(tv))) error(1, errno, "setsockopt rcv timeout"); @@ -502,6 +516,8 @@ static void parse_opts(int argc, char **argv) setup_sockaddr(domain, saddr, &cfg_src_addr); cfg_num_pkt = parse_io(argv[optind], cfg_buf); + + cfg_machine_slow = getenv("KSFT_MACHINE_SLOW"); } int main(int argc, char **argv) @@ -513,5 +529,10 @@ int main(int argc, char **argv) else do_test_tx((void *)&cfg_src_addr, cfg_alen); - return 0; + if (errors) { + fprintf(stderr, "FAIL: %d errors\n", errors); + return KSFT_FAIL; + } + + return KSFT_PASS; } diff --git a/tools/testing/selftests/drivers/net/so_txtime.py b/tools/testing/selftests/drivers/net/so_txtime.py new file mode 100755 index 000000000000..adf6c848d6d8 --- /dev/null +++ b/tools/testing/selftests/drivers/net/so_txtime.py @@ -0,0 +1,118 @@ +#!/usr/bin/env python3 +# SPDX-License-Identifier: GPL-2.0 + +"""Regression tests for the SO_TXTIME interface. + +Test delivery time in FQ and ETF qdiscs. 
+""" + +import os +import time + +from lib.py import ksft_exit, ksft_run, ksft_variants +from lib.py import KsftNamedVariant, KsftSkipEx +from lib.py import NetDrvEpEnv, bkg, cmd, defer, tc + + +def test_so_txtime(cfg, clockid, ipver, args_tx, args_rx, expect_success): + """Main function. Run so_txtime as sender and receiver.""" + slow_machine = os.environ.get('KSFT_MACHINE_SLOW') + + if not hasattr(cfg, "bin_remote"): + cfg.bin_local = cfg.test_dir / "so_txtime" + cfg.bin_remote = cfg.remote.deploy(cfg.bin_local) + + tstart = time.time_ns() + (2000_000_000 if slow_machine else 200_000_000) + + cmd_addr = f"-S {cfg.addr_v[ipver]} -D {cfg.remote_addr_v[ipver]}" + cmd_args = f"-{ipver} -c {clockid} -t {tstart} {cmd_addr}" + cmd_rx = f"{cfg.bin_remote} {cmd_args} {args_rx} -r" + cmd_tx = f"{cfg.bin_local} {cmd_args} {args_tx}" + + expect_fail = not expect_success + if slow_machine: + expect_success = False + + with bkg(cmd_rx, host=cfg.remote, fail=expect_success, + expect_fail=expect_fail, exit_wait=True): + cmd(cmd_tx) + + +def _qdisc_setup(ifname, qdisc, optargs=""): + """Replace root qdisc. Restore the original after the test. + + If the original is mq, children will be of type default_qdisc. 
+ """ + orig = tc(f"qdisc show dev {ifname} root", json=True)[0].get("kind", None) + defer(tc, f"qdisc replace dev {ifname} root {orig}") + tc(f"qdisc replace dev {ifname} root {qdisc} {optargs}") + + +def _test_variants_fq(): + for ipver in ["4", "6"]: + for testcase in [ + ["no_delay", "a,-1", "a,-1"], + ["zero_delay", "a,0", "a,0"], + ["one_pkt", "a,10", "a,10"], + ["in_order", "a,10,b,20", "a,10,b,20"], + ["reverse_order", "a,20,b,10", "b,10,a,20"], + ]: + name = f"v{ipver}_{testcase[0]}" + yield KsftNamedVariant(name, ipver, testcase[1], testcase[2]) + + +@ksft_variants(_test_variants_fq()) +def test_so_txtime_fq_mono(cfg, ipver, args_tx, args_rx): + """Run all variants of monotonic (fq) tests.""" + cfg.require_ipver(ipver) + _qdisc_setup(cfg.ifname, "fq") + test_so_txtime(cfg, "mono", ipver, args_tx, args_rx, True) + + +@ksft_variants(_test_variants_fq()) +def test_so_txtime_fq_tai(cfg, ipver, args_tx, args_rx): + """Run all variants of fq tests, but pass CLOCK_TAI to test conversion.""" + cfg.require_ipver(ipver) + _qdisc_setup(cfg.ifname, "fq") + test_so_txtime(cfg, "tai", ipver, args_tx, args_rx, True) + + +def _test_variants_etf(): + for ipver in ["4", "6"]: + for testcase in [ + ["no_delay", "a,-1", "a,-1", False], + ["zero_delay", "a,0", "a,0", False], + ["one_pkt", "a,10", "a,10", True], + ["in_order", "a,10,b,20", "a,10,b,20", True], + ["reverse_order", "a,20,b,10", "b,10,a,20", True], + ]: + name = f"v{ipver}_{testcase[0]}" + yield KsftNamedVariant( + name, ipver, testcase[1], testcase[2], testcase[3] + ) + + +@ksft_variants(_test_variants_etf()) +def test_so_txtime_etf(cfg, ipver, args_tx, args_rx, expect_fail): + """Run all variants of etf tests.""" + cfg.require_ipver(ipver) + try: + _qdisc_setup(cfg.ifname, "etf", "clockid CLOCK_TAI delta 400000") + except Exception as e: + raise KsftSkipEx("tc does not support qdisc etf. 
skipping") from e + + test_so_txtime(cfg, "tai", ipver, args_tx, args_rx, expect_fail) + + +def main() -> None: + """Boilerplate ksft main.""" + with NetDrvEpEnv(__file__) as cfg: + ksft_run( + [test_so_txtime_fq_mono, test_so_txtime_fq_tai, test_so_txtime_etf], + args=(cfg,), + ) + ksft_exit() + + +if __name__ == "__main__": + main() diff --git a/tools/testing/selftests/filelock/.gitignore b/tools/testing/selftests/filelock/.gitignore new file mode 100644 index 000000000000..825e899a121b --- /dev/null +++ b/tools/testing/selftests/filelock/.gitignore @@ -0,0 +1 @@ +ofdlocks diff --git a/tools/testing/selftests/filelock/ofdlocks.c b/tools/testing/selftests/filelock/ofdlocks.c index ff8d47fc373a..68bac28b234b 100644 --- a/tools/testing/selftests/filelock/ofdlocks.c +++ b/tools/testing/selftests/filelock/ofdlocks.c @@ -16,7 +16,7 @@ static int lock_set(int fd, struct flock *fl) fl->l_whence = SEEK_SET; ret = fcntl(fd, F_OFD_SETLK, fl); if (ret) - perror("fcntl()"); + ksft_perror("fcntl()"); return ret; } @@ -28,7 +28,7 @@ static int lock_get(int fd, struct flock *fl) fl->l_whence = SEEK_SET; ret = fcntl(fd, F_OFD_GETLK, fl); if (ret) - perror("fcntl()"); + ksft_perror("fcntl()"); return ret; } @@ -39,94 +39,82 @@ int main(void) int fd = open("/tmp/aa", O_RDWR | O_CREAT | O_EXCL, 0600); int fd2 = open("/tmp/aa", O_RDONLY); + ksft_print_header(); + ksft_set_plan(4); + unlink("/tmp/aa"); assert(fd != -1); assert(fd2 != -1); - ksft_print_msg("[INFO] opened fds %i %i\n", fd, fd2); + ksft_print_msg("opened fds %i %i\n", fd, fd2); /* Set some read lock */ fl.l_type = F_RDLCK; fl.l_start = 5; fl.l_len = 3; rc = lock_set(fd, &fl); - if (rc == 0) { - ksft_print_msg - ("[SUCCESS] set OFD read lock on first fd\n"); - } else { - ksft_print_msg("[FAIL] to set OFD read lock on first fd\n"); - return -1; - } + ksft_test_result(rc == 0, "set OFD read lock on first fd\n"); + if (rc != 0) + ksft_finished(); + /* Make sure read locks do not conflict on different fds. 
*/ fl.l_type = F_RDLCK; fl.l_start = 5; fl.l_len = 1; rc = lock_get(fd2, &fl); if (rc != 0) - return -1; - if (fl.l_type != F_UNLCK) { - ksft_print_msg("[FAIL] read locks conflicted\n"); - return -1; - } + ksft_finished(); + if (fl.l_type != F_UNLCK) + ksft_exit_fail_msg("read locks conflicted\n"); + /* Make sure read/write locks do conflict on different fds. */ fl.l_type = F_WRLCK; fl.l_start = 5; fl.l_len = 1; rc = lock_get(fd2, &fl); if (rc != 0) - return -1; - if (fl.l_type != F_UNLCK) { - ksft_print_msg - ("[SUCCESS] read and write locks conflicted\n"); - } else { - ksft_print_msg - ("[SUCCESS] read and write locks not conflicted\n"); - return -1; - } + ksft_finished(); + ksft_test_result(fl.l_type != F_UNLCK, + "read and write locks conflicted\n"); + if (fl.l_type == F_UNLCK) + ksft_finished(); + /* Get info about the lock on first fd. */ fl.l_type = F_UNLCK; fl.l_start = 5; fl.l_len = 1; rc = lock_get(fd, &fl); - if (rc != 0) { - ksft_print_msg - ("[FAIL] F_OFD_GETLK with F_UNLCK not supported\n"); - return -1; - } - if (fl.l_type != F_UNLCK) { - ksft_print_msg - ("[SUCCESS] F_UNLCK test returns: locked, type %i pid %i len %zi\n", - fl.l_type, fl.l_pid, fl.l_len); - } else { - ksft_print_msg - ("[FAIL] F_OFD_GETLK with F_UNLCK did not return lock info\n"); - return -1; - } + if (rc != 0) + ksft_exit_fail_msg("F_OFD_GETLK with F_UNLCK not supported\n"); + ksft_test_result(fl.l_type != F_UNLCK, + "F_OFD_GETLK with F_UNLCK returned lock info\n"); + if (fl.l_type == F_UNLCK) + ksft_exit_fail(); + ksft_print_msg("F_UNLCK test returns: locked, type %i pid %i len %zi\n", + fl.l_type, fl.l_pid, fl.l_len); + /* Try the same but by locking everything by len==0. 
*/ fl2.l_type = F_UNLCK; fl2.l_start = 0; fl2.l_len = 0; rc = lock_get(fd, &fl2); - if (rc != 0) { - ksft_print_msg - ("[FAIL] F_OFD_GETLK with F_UNLCK not supported\n"); - return -1; - } + if (rc != 0) + ksft_exit_fail_msg + ("F_OFD_GETLK with F_UNLCK not supported\n"); + ksft_test_result(memcmp(&fl, &fl2, sizeof(fl)) == 0, + "F_UNLCK with len==0 returned the same\n"); if (memcmp(&fl, &fl2, sizeof(fl))) { - ksft_print_msg - ("[FAIL] F_UNLCK test returns: locked, type %i pid %i len %zi\n", + ksft_exit_fail_msg + ("F_UNLCK test returns: locked, type %i pid %i len %zi\n", fl.l_type, fl.l_pid, fl.l_len); - return -1; } - ksft_print_msg("[SUCCESS] F_UNLCK with len==0 returned the same\n"); + /* Get info about the lock on second fd - no locks on it. */ fl.l_type = F_UNLCK; fl.l_start = 0; fl.l_len = 0; lock_get(fd2, &fl); - if (fl.l_type != F_UNLCK) { - ksft_print_msg - ("[FAIL] F_OFD_GETLK with F_UNLCK return lock info from another fd\n"); - return -1; - } - return 0; + ksft_test_result(fl.l_type == F_UNLCK, + "F_OFD_GETLK with F_UNLCK return lock info from another fd\n"); + + ksft_finished(); } diff --git a/tools/testing/selftests/filesystems/.gitignore b/tools/testing/selftests/filesystems/.gitignore index 64ac0dfa46b7..a78f894157de 100644 --- a/tools/testing/selftests/filesystems/.gitignore +++ b/tools/testing/selftests/filesystems/.gitignore @@ -5,3 +5,4 @@ fclog file_stressor anon_inode_test kernfs_test +idmapped_tmpfile diff --git a/tools/testing/selftests/filesystems/Makefile b/tools/testing/selftests/filesystems/Makefile index 85427d7f19b9..a7ec2ba2dd83 100644 --- a/tools/testing/selftests/filesystems/Makefile +++ b/tools/testing/selftests/filesystems/Makefile @@ -2,6 +2,10 @@ CFLAGS += $(KHDR_INCLUDES) TEST_GEN_PROGS := devpts_pts file_stressor anon_inode_test kernfs_test fclog +TEST_GEN_PROGS += idmapped_tmpfile TEST_GEN_PROGS_EXTENDED := dnotify_test include ../lib.mk + +$(OUTPUT)/idmapped_tmpfile: LDLIBS += -lcap +$(OUTPUT)/idmapped_tmpfile: utils.c diff 
--git a/tools/testing/selftests/filesystems/epoll/epoll_wakeup_test.c b/tools/testing/selftests/filesystems/epoll/epoll_wakeup_test.c index 8bc57a2ef966..f6f1a7ff01b0 100644 --- a/tools/testing/selftests/filesystems/epoll/epoll_wakeup_test.c +++ b/tools/testing/selftests/filesystems/epoll/epoll_wakeup_test.c @@ -3493,4 +3493,49 @@ TEST(epoll64) close(ctx.sfd[1]); } +static void *epoll65_wait(void *ctx_) +{ + struct epoll_mtcontext *ctx = ctx_; + struct epoll_event event; + + for (int i = 0; i < 100000; ++i) { + if (!epoll_wait(ctx->efd[0], &event, 1, 0)) + return (void *)ENODATA; + } + + return (void *)0; +} + +TEST(epoll65) +{ + struct epoll_mtcontext ctx; + struct epoll_event event; + int64_t dummy_data = 99; + pthread_t threads[64]; + uintptr_t ret; + int i, err; + + ctx.efd[0] = epoll_create(1); + ASSERT_GE(ctx.efd[0], 0); + ctx.efd[1] = eventfd(0, 0); + ASSERT_GE(ctx.efd[1], 0); + + event.events = EPOLLIN; + err = epoll_ctl(ctx.efd[0], EPOLL_CTL_ADD, ctx.efd[1], &event); + ASSERT_EQ(err, 0); + + write(ctx.efd[1], &dummy_data, sizeof(dummy_data)); + + for (i = 0; i < ARRAY_SIZE(threads); ++i) + ASSERT_EQ(pthread_create(&threads[i], NULL, epoll65_wait, &ctx), 0); + + for (i = 0; i < ARRAY_SIZE(threads); ++i) { + ASSERT_EQ(pthread_join(threads[i], (void **)&ret), 0); + ASSERT_EQ(ret, 0); + } + + close(ctx.efd[0]); + close(ctx.efd[1]); +} + TEST_HARNESS_MAIN diff --git a/tools/testing/selftests/filesystems/idmapped_tmpfile.c b/tools/testing/selftests/filesystems/idmapped_tmpfile.c new file mode 100644 index 000000000000..bc411ab8281e --- /dev/null +++ b/tools/testing/selftests/filesystems/idmapped_tmpfile.c @@ -0,0 +1,168 @@ +// SPDX-License-Identifier: GPL-2.0 +#define _GNU_SOURCE + +#include <errno.h> +#include <fcntl.h> +#include <limits.h> +#include <sched.h> +#include <stdio.h> +#include <unistd.h> +#include <sys/fsuid.h> +#include <sys/stat.h> +#include <sys/syscall.h> + +#include <linux/mount.h> +#include <linux/types.h> + +#include "kselftest_harness.h" 
+#include "wrappers.h" +#include "utils.h" + +/* + * The test mount maps caller-visible ids [0, MAP_RANGE) onto the on-disk range + * [MAP_HOST, MAP_HOST + MAP_RANGE). An id outside [0, MAP_RANGE) therefore has + * no mapping in the mount and is not representable in the filesystem. + */ +#define MAP_HOST 10000 +#define MAP_RANGE 10000 +#define UNMAPPED 50000 + +#ifndef MOUNT_ATTR_IDMAP +#define MOUNT_ATTR_IDMAP 0x00100000 +#endif + +#ifndef __NR_mount_setattr +#define __NR_mount_setattr 442 +#endif + +static inline int sys_mount_setattr(int dfd, const char *path, + unsigned int flags, + struct mount_attr *attr, size_t size) +{ + return syscall(__NR_mount_setattr, dfd, path, flags, attr, size); +} + +/* + * Clone @path into a detached mount idmapped so that caller-visible ids + * [0, MAP_RANGE) map onto the on-disk ids [MAP_HOST, MAP_HOST + MAP_RANGE). + * Returns the mount fd, or -1 if idmapped mounts are not available. + */ +static int idmapped_clone(const char *path) +{ + struct mount_attr attr = { + .attr_set = MOUNT_ATTR_IDMAP, + }; + int fd_tree, userns_fd, ret; + + fd_tree = sys_open_tree(AT_FDCWD, path, + OPEN_TREE_CLONE | OPEN_TREE_CLOEXEC); + if (fd_tree < 0) + return -1; + + userns_fd = get_userns_fd(MAP_HOST, 0, MAP_RANGE); + if (userns_fd < 0) { + close(fd_tree); + return -1; + } + + attr.userns_fd = userns_fd; + ret = sys_mount_setattr(fd_tree, "", AT_EMPTY_PATH, &attr, sizeof(attr)); + close(userns_fd); + if (ret) { + close(fd_tree); + return -1; + } + + return fd_tree; +} + +FIXTURE(idmapped_tmpfile) { + char dir[64]; /* non-idmapped path to the layer directory */ +}; + +FIXTURE_SETUP(idmapped_tmpfile) +{ + /* Private mount namespace so test mounts need no cleanup. 
*/ + ASSERT_EQ(unshare(CLONE_NEWNS), 0); + ASSERT_EQ(sys_mount(NULL, "/", NULL, MS_SLAVE | MS_REC, NULL), 0); + ASSERT_EQ(sys_mount("tmpfs", "/tmp", "tmpfs", 0, NULL), 0); + + snprintf(self->dir, sizeof(self->dir), "/tmp/d"); + ASSERT_EQ(mkdir(self->dir, 0777), 0); + /* World-writable so an unmapped caller still passes permission(). */ + ASSERT_EQ(chmod(self->dir, 0777), 0); +} + +FIXTURE_TEARDOWN(idmapped_tmpfile) +{ +} + +/* + * A caller whose fsuid/fsgid have no mapping in the idmapped mount must not be + * able to create an O_TMPFILE. Without the check in vfs_tmpfile() the inode + * would be created owned by (uid_t)-1 and could then be linked into the + * namespace. + */ +TEST_F(idmapped_tmpfile, unmapped_caller_is_refused) +{ + int mfd, fd; + + mfd = idmapped_clone(self->dir); + if (mfd < 0) + SKIP(return, "idmapped mounts not supported"); + + /* Become a caller outside the mount's [0, MAP_RANGE) range. */ + setfsgid(UNMAPPED); + setfsuid(UNMAPPED); + ASSERT_EQ(setfsuid(-1), UNMAPPED); + + fd = openat(mfd, ".", O_TMPFILE | O_WRONLY, 0644); + ASSERT_LT(fd, 0); + EXPECT_EQ(errno, EOVERFLOW); + if (fd >= 0) + close(fd); + + EXPECT_EQ(close(mfd), 0); +} + +/* + * A mapped caller can create an O_TMPFILE and link it into the namespace; the + * ownership round-trips through the mount idmap. This is what makes refusing + * the unmapped case above necessary in the first place. + */ +TEST_F(idmapped_tmpfile, mapped_caller_creates_and_links) +{ + char path[PATH_MAX]; + struct stat st; + int mfd, fd; + + mfd = idmapped_clone(self->dir); + if (mfd < 0) + SKIP(return, "idmapped mounts not supported"); + + /* Caller is uid/gid 0, which maps to MAP_HOST through the mount. */ + fd = openat(mfd, ".", O_TMPFILE | O_RDWR, 0600); + ASSERT_GE(fd, 0); + + ASSERT_EQ(fstat(fd, &st), 0); + EXPECT_EQ(st.st_uid, 0); + EXPECT_EQ(st.st_gid, 0); + + /* The tmpfile is linkable: splice it into the directory. 
*/ + ASSERT_EQ(linkat(fd, "", mfd, "linked", AT_EMPTY_PATH), 0); + EXPECT_EQ(close(fd), 0); + + ASSERT_EQ(fstatat(mfd, "linked", &st, 0), 0); + EXPECT_EQ(st.st_uid, 0); + EXPECT_EQ(st.st_gid, 0); + + /* On the underlying, non-idmapped tmpfs it is stored as MAP_HOST. */ + snprintf(path, sizeof(path), "%s/linked", self->dir); + ASSERT_EQ(stat(path, &st), 0); + EXPECT_EQ(st.st_uid, MAP_HOST); + EXPECT_EQ(st.st_gid, MAP_HOST); + + EXPECT_EQ(close(mfd), 0); +} + +TEST_HARNESS_MAIN diff --git a/tools/testing/selftests/openat2/.gitignore b/tools/testing/selftests/filesystems/openat2/.gitignore index 82a4846cbc4b..82a4846cbc4b 100644 --- a/tools/testing/selftests/openat2/.gitignore +++ b/tools/testing/selftests/filesystems/openat2/.gitignore diff --git a/tools/testing/selftests/openat2/Makefile b/tools/testing/selftests/filesystems/openat2/Makefile index 185dc76ebb5f..d848aac96bde 100644 --- a/tools/testing/selftests/openat2/Makefile +++ b/tools/testing/selftests/filesystems/openat2/Makefile @@ -1,7 +1,8 @@ # SPDX-License-Identifier: GPL-2.0-or-later -CFLAGS += -Wall -O2 -g -fsanitize=address -fsanitize=undefined -TEST_GEN_PROGS := openat2_test resolve_test rename_attack_test +CFLAGS += $(KHDR_INCLUDES) +CFLAGS += -Wall -O2 -g -fsanitize=address -fsanitize=undefined $(TOOLS_INCLUDES) +TEST_GEN_PROGS := openat2_test resolve_test rename_attack_test emptypath_test # gcc requires -static-libasan in order to ensure that Address Sanitizer's # library is the first one loaded. 
However, clang already statically links the @@ -13,6 +14,4 @@ endif LOCAL_HDRS += helpers.h -include ../lib.mk - -$(TEST_GEN_PROGS): helpers.c +include ../../lib.mk diff --git a/tools/testing/selftests/filesystems/openat2/emptypath_test.c b/tools/testing/selftests/filesystems/openat2/emptypath_test.c new file mode 100644 index 000000000000..be37ccba57ec --- /dev/null +++ b/tools/testing/selftests/filesystems/openat2/emptypath_test.c @@ -0,0 +1,77 @@ +// SPDX-License-Identifier: GPL-2.0-or-later + +#define _GNU_SOURCE +#define __SANE_USERSPACE_TYPES__ +#include <fcntl.h> +#include <unistd.h> +#include <errno.h> +#include <string.h> +#include <sys/stat.h> + +#include "kselftest_harness.h" + +#ifndef O_EMPTYPATH +#define O_EMPTYPATH (1 << 26) +#endif + +#define EMPTYPATH_TEST_FILE "/tmp/emptypath_test" + +FIXTURE(emptypath) { + int opath_fd; +}; + +FIXTURE_SETUP(emptypath) +{ + int fd; + + self->opath_fd = -1; + + fd = open(EMPTYPATH_TEST_FILE, O_CREAT | O_WRONLY, S_IRWXU); + ASSERT_GE(fd, 0) { + TH_LOG("create %s: %s", EMPTYPATH_TEST_FILE, strerror(errno)); + } + close(fd); + + self->opath_fd = open(EMPTYPATH_TEST_FILE, O_PATH); + ASSERT_GE(self->opath_fd, 0) { + TH_LOG("open %s O_PATH: %s", EMPTYPATH_TEST_FILE, strerror(errno)); + } +} + +FIXTURE_TEARDOWN(emptypath) +{ + if (self->opath_fd >= 0) + close(self->opath_fd); + unlink(EMPTYPATH_TEST_FILE); +} + +/* An empty path is rejected with ENOENT unless O_EMPTYPATH is set. */ +TEST_F(emptypath, without_flag_returns_enoent) +{ + int fd = openat(self->opath_fd, "", O_RDONLY); + + if (fd >= 0) + close(fd); + ASSERT_LT(fd, 0) { + TH_LOG("empty path without O_EMPTYPATH unexpectedly succeeded"); + } + EXPECT_EQ(errno, ENOENT) { + TH_LOG("expected ENOENT, got %s", strerror(errno)); + } +} + +/* O_EMPTYPATH reopens the O_PATH fd through an empty path. 
*/ +TEST_F(emptypath, reopens_opath_fd) +{ + int fd = openat(self->opath_fd, "", O_RDONLY | O_EMPTYPATH); + + if (fd < 0 && errno == EINVAL) + SKIP(return, "O_EMPTYPATH not supported"); + + ASSERT_GE(fd, 0) { + TH_LOG("O_EMPTYPATH failed: %s", strerror(errno)); + } + close(fd); +} + +TEST_HARNESS_MAIN diff --git a/tools/testing/selftests/filesystems/openat2/helpers.h b/tools/testing/selftests/filesystems/openat2/helpers.h new file mode 100644 index 000000000000..3f01fb68c5a6 --- /dev/null +++ b/tools/testing/selftests/filesystems/openat2/helpers.h @@ -0,0 +1,135 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* + * Author: Aleksa Sarai <cyphar@cyphar.com> + * Copyright (C) 2018-2019 SUSE LLC. + * Copyright (C) 2026 Amutable GmbH + */ + +#ifndef __RESOLVEAT_H__ +#define __RESOLVEAT_H__ + +#define _GNU_SOURCE +#include <stdint.h> +#include <stdbool.h> +#include <errno.h> +#include <limits.h> +#include <linux/types.h> +#include <linux/unistd.h> +#include <linux/openat2.h> +#include "kselftest_harness.h" + +#define BUILD_BUG_ON(e) ((void)(sizeof(struct { int:(-!!(e)); }))) + +#define OPEN_HOW_SIZE_VER0 24 /* sizeof first published struct */ +#define OPEN_HOW_SIZE_LATEST OPEN_HOW_SIZE_VER0 + +__maybe_unused +static bool needs_openat2(const struct open_how *how) +{ + return how->resolve != 0; +} + +__maybe_unused +static int raw_openat2(int dfd, const char *path, void *how, size_t size) +{ + int ret = syscall(__NR_openat2, dfd, path, how, size); + + return ret >= 0 ? ret : -errno; +} + +__maybe_unused +static int sys_openat2(int dfd, const char *path, struct open_how *how) +{ + return raw_openat2(dfd, path, how, sizeof(*how)); +} + +__maybe_unused +static int sys_openat(int dfd, const char *path, struct open_how *how) +{ + int ret = openat(dfd, path, how->flags, how->mode); + + return ret >= 0 ? 
ret : -errno; +} + +__maybe_unused +static int sys_renameat2(int olddirfd, const char *oldpath, + int newdirfd, const char *newpath, unsigned int flags) +{ + int ret = syscall(__NR_renameat2, olddirfd, oldpath, + newdirfd, newpath, flags); + + return ret >= 0 ? ret : -errno; +} + +__maybe_unused +static int touchat(int dfd, const char *path) +{ + int fd = openat(dfd, path, O_CREAT, 0700); + + if (fd >= 0) + close(fd); + return fd; +} + +__maybe_unused +static char *fdreadlink(struct __test_metadata *_metadata, int fd) +{ + char *target, *tmp; + + ASSERT_GT(asprintf(&tmp, "/proc/self/fd/%d", fd), 0); + + target = malloc(PATH_MAX); + ASSERT_NE(target, NULL); + memset(target, 0, PATH_MAX); + + ASSERT_GT(readlink(tmp, target, PATH_MAX), 0); + + free(tmp); + return target; +} + +__maybe_unused +static bool fdequal(struct __test_metadata *_metadata, int fd, + int dfd, const char *path) +{ + char *fdpath, *dfdpath, *other; + bool cmp; + + fdpath = fdreadlink(_metadata, fd); + dfdpath = fdreadlink(_metadata, dfd); + + if (!path) { + ASSERT_GT(asprintf(&other, "%s", dfdpath), 0); + } else if (*path == '/') { + ASSERT_GT(asprintf(&other, "%s", path), 0); + } else { + ASSERT_GT(asprintf(&other, "%s/%s", dfdpath, path), 0); + } + + cmp = !strcmp(fdpath, other); + + free(fdpath); + free(dfdpath); + free(other); + return cmp; +} + +static bool openat2_supported = false; + +__attribute__((constructor)) +static void __detect_openat2_supported(void) +{ + struct open_how how = {}; + int fd; + + BUILD_BUG_ON(sizeof(struct open_how) != OPEN_HOW_SIZE_VER0); + + /* Check openat2(2) support. 
*/ + fd = sys_openat2(AT_FDCWD, ".", &how); + openat2_supported = (fd >= 0); + + if (fd >= 0) + close(fd); +} + +#endif /* __RESOLVEAT_H__ */ diff --git a/tools/testing/selftests/openat2/openat2_test.c b/tools/testing/selftests/filesystems/openat2/openat2_test.c index 0e161ef9e9e4..6f5afbe2d8d3 100644 --- a/tools/testing/selftests/openat2/openat2_test.c +++ b/tools/testing/selftests/filesystems/openat2/openat2_test.c @@ -15,8 +15,8 @@ #include <stdbool.h> #include <string.h> -#include "kselftest.h" #include "helpers.h" +#include "kselftest_harness.h" /* * O_LARGEFILE is set to 0 by glibc. @@ -45,13 +45,29 @@ struct struct_test { int err; }; -#define NUM_OPENAT2_STRUCT_TESTS 7 -#define NUM_OPENAT2_STRUCT_VARIATIONS 13 +struct flag_test { + const char *name; + struct open_how how; + int err; +}; + +FIXTURE(openat2) {}; -void test_openat2_struct(void) +FIXTURE_SETUP(openat2) { - int misalignments[] = { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 11, 17, 87 }; + if (!openat2_supported) + SKIP(return, "openat2(2) not supported"); +} + +FIXTURE_TEARDOWN(openat2) {} +/* + * Verify that the struct size and misalignment handling for openat2(2) is + * correct, including that is_zeroed_user() works. + */ +TEST_F(openat2, struct_argument_sizes) +{ + int misalignments[] = { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 11, 17, 87 }; struct struct_test tests[] = { /* Normal struct. 
*/ { .name = "normal struct", @@ -83,26 +99,14 @@ void test_openat2_struct(void) .size = sizeof(struct open_how_ext), .err = -E2BIG }, }; - BUILD_BUG_ON(ARRAY_LEN(misalignments) != NUM_OPENAT2_STRUCT_VARIATIONS); - BUILD_BUG_ON(ARRAY_LEN(tests) != NUM_OPENAT2_STRUCT_TESTS); - - for (int i = 0; i < ARRAY_LEN(tests); i++) { + for (int i = 0; i < ARRAY_SIZE(tests); i++) { struct struct_test *test = &tests[i]; struct open_how_ext how_ext = test->arg; - for (int j = 0; j < ARRAY_LEN(misalignments); j++) { + for (int j = 0; j < ARRAY_SIZE(misalignments); j++) { int fd, misalign = misalignments[j]; - char *fdpath = NULL; - bool failed; - void (*resultfn)(const char *msg, ...) = ksft_test_result_pass; - void *copy = NULL, *how_copy = &how_ext; - - if (!openat2_supported) { - ksft_print_msg("openat2(2) unsupported\n"); - resultfn = ksft_test_result_skip; - goto skip; - } + char *fdpath = NULL; if (misalign) { /* @@ -119,50 +123,42 @@ void test_openat2_struct(void) } fd = raw_openat2(AT_FDCWD, ".", how_copy, test->size); - if (test->err >= 0) - failed = (fd < 0); - else - failed = (fd != test->err); if (fd >= 0) { - fdpath = fdreadlink(fd); + fdpath = fdreadlink(_metadata, fd); close(fd); } - if (failed) { - resultfn = ksft_test_result_fail; - - ksft_print_msg("openat2 unexpectedly returned "); - if (fdpath) - ksft_print_msg("%d['%s']\n", fd, fdpath); - else - ksft_print_msg("%d (%s)\n", fd, strerror(-fd)); + if (test->err >= 0) { + EXPECT_GE(fd, 0) { + TH_LOG("openat2 with %s [misalign=%d] should succeed, got %d (%s)", + test->name, misalign, + fd, strerror(-fd)); + } + } else { + EXPECT_EQ(test->err, fd) { + if (fdpath) + TH_LOG("openat2 with %s [misalign=%d] should fail with %d (%s), got %d['%s']", + test->name, misalign, + test->err, + strerror(-test->err), + fd, fdpath); + else + TH_LOG("openat2 with %s [misalign=%d] should fail with %d (%s), got %d (%s)", + test->name, misalign, + test->err, + strerror(-test->err), + fd, strerror(-fd)); + } } -skip: - if (test->err >= 
0) - resultfn("openat2 with %s argument [misalign=%d] succeeds\n", - test->name, misalign); - else - resultfn("openat2 with %s argument [misalign=%d] fails with %d (%s)\n", - test->name, misalign, test->err, - strerror(-test->err)); - free(copy); free(fdpath); - fflush(stdout); } } } -struct flag_test { - const char *name; - struct open_how how; - int err; -}; - -#define NUM_OPENAT2_FLAG_TESTS 25 - -void test_openat2_flags(void) +/* Verify openat2(2) flag and mode validation. */ +TEST_F(openat2, flag_validation) { struct flag_test tests[] = { /* O_TMPFILE is incompatible with O_PATH and O_CREAT. */ @@ -241,20 +237,10 @@ void test_openat2_flags(void) .how.resolve = 0, .err = -EINVAL }, }; - BUILD_BUG_ON(ARRAY_LEN(tests) != NUM_OPENAT2_FLAG_TESTS); - - for (int i = 0; i < ARRAY_LEN(tests); i++) { + for (int i = 0; i < ARRAY_SIZE(tests); i++) { int fd, fdflags = -1; char *path, *fdpath = NULL; - bool failed = false; struct flag_test *test = &tests[i]; - void (*resultfn)(const char *msg, ...) = ksft_test_result_pass; - - if (!openat2_supported) { - ksft_print_msg("openat2(2) unsupported\n"); - resultfn = ksft_test_result_skip; - goto skip; - } path = (test->how.flags & O_CREAT) ? "/tmp/ksft.openat2_tmpfile" : "."; unlink(path); @@ -265,74 +251,112 @@ void test_openat2_flags(void) * Skip the testcase if it failed because not supported * by FS. (e.g. a valid O_TMPFILE combination on NFS) */ - ksft_test_result_skip("openat2 with %s fails with %d (%s)\n", - test->name, fd, strerror(-fd)); - goto next; + TH_LOG("openat2 with %s not supported by FS -- skipping", + test->name); + continue; } - if (test->err >= 0) - failed = (fd < 0); - else - failed = (fd != test->err); - if (fd >= 0) { - int otherflags; - - fdpath = fdreadlink(fd); - fdflags = fcntl(fd, F_GETFL); - otherflags = fcntl(fd, F_GETFD); - close(fd); - - E_assert(fdflags >= 0, "fcntl F_GETFL of new fd"); - E_assert(otherflags >= 0, "fcntl F_GETFD of new fd"); - - /* O_CLOEXEC isn't shown in F_GETFL. 
*/ - if (otherflags & FD_CLOEXEC) - fdflags |= O_CLOEXEC; - /* O_CREAT is hidden from F_GETFL. */ - if (test->how.flags & O_CREAT) - fdflags |= O_CREAT; - if (!(test->how.flags & O_LARGEFILE)) - fdflags &= ~O_LARGEFILE; - failed |= (fdflags != test->how.flags); - } + if (test->err >= 0) { + EXPECT_GE(fd, 0) { + TH_LOG("openat2 with %s should succeed, got %d (%s)", + test->name, fd, strerror(-fd)); + } + if (fd >= 0) { + int otherflags; - if (failed) { - resultfn = ksft_test_result_fail; + fdpath = fdreadlink(_metadata, fd); + fdflags = fcntl(fd, F_GETFL); + otherflags = fcntl(fd, F_GETFD); + close(fd); - ksft_print_msg("openat2 unexpectedly returned "); - if (fdpath) - ksft_print_msg("%d['%s'] with %X (!= %llX)\n", - fd, fdpath, fdflags, - test->how.flags); - else - ksft_print_msg("%d (%s)\n", fd, strerror(-fd)); + ASSERT_GE(fdflags, 0); + ASSERT_GE(otherflags, 0); + + /* O_CLOEXEC isn't shown in F_GETFL. */ + if (otherflags & FD_CLOEXEC) + fdflags |= O_CLOEXEC; + /* O_CREAT is hidden from F_GETFL. 
*/ + if (test->how.flags & O_CREAT) + fdflags |= O_CREAT; + if (!(test->how.flags & O_LARGEFILE)) + fdflags &= ~O_LARGEFILE; + + EXPECT_EQ(fdflags, (int)test->how.flags) { + TH_LOG("openat2 with %s: flags mismatch %X != %llX", + test->name, fdflags, + (unsigned long long)test->how.flags); + } + } + } else { + EXPECT_EQ(test->err, fd) { + if (fd >= 0) { + fdpath = fdreadlink(_metadata, fd); + TH_LOG("openat2 with %s should fail with %d (%s), got %d['%s']", + test->name, test->err, + strerror(-test->err), + fd, fdpath); + } else { + TH_LOG("openat2 with %s should fail with %d (%s), got %d (%s)", + test->name, test->err, + strerror(-test->err), + fd, strerror(-fd)); + } + } + if (fd >= 0) + close(fd); } -skip: - if (test->err >= 0) - resultfn("openat2 with %s succeeds\n", test->name); - else - resultfn("openat2 with %s fails with %d (%s)\n", - test->name, test->err, strerror(-test->err)); -next: free(fdpath); - fflush(stdout); } } -#define NUM_TESTS (NUM_OPENAT2_STRUCT_VARIATIONS * NUM_OPENAT2_STRUCT_TESTS + \ - NUM_OPENAT2_FLAG_TESTS) +#ifndef OPENAT2_REGULAR +#define OPENAT2_REGULAR ((__u64)1 << 32) +#endif + +#ifndef EFTYPE +#define EFTYPE 134 +#endif + +/* Kernel-internal carrier for OPENAT2_REGULAR (see __O_REGULAR in fcntl.h). */ +#ifndef __O_REGULAR +#define __O_REGULAR (1 << 30) +#endif -int main(int argc, char **argv) +/* Verify that OPENAT2_REGULAR rejects non-regular files with EFTYPE. 
*/ +TEST_F(openat2, regular_flag) { - ksft_print_header(); - ksft_set_plan(NUM_TESTS); + struct open_how how = { + .flags = OPENAT2_REGULAR | O_RDONLY, + }; + int fd; - test_openat2_struct(); - test_openat2_flags(); + fd = sys_openat2(AT_FDCWD, "/dev/null", &how); + if (fd == -ENOENT) + SKIP(return, "/dev/null does not exist"); - if (ksft_get_fail_cnt() + ksft_get_error_cnt() > 0) - ksft_exit_fail(); - else - ksft_exit_pass(); + EXPECT_EQ(-EFTYPE, fd) { + TH_LOG("openat2 with OPENAT2_REGULAR should fail with %d (%s), got %d (%s)", + -EFTYPE, strerror(EFTYPE), fd, strerror(-fd)); + } + if (fd >= 0) + close(fd); } + +/* open()/openat() must keep ignoring the internal __O_REGULAR bit. */ +TEST(legacy_openat_ignores_o_regular) +{ + int fd; + + fd = openat(AT_FDCWD, "/dev/null", O_RDONLY | __O_REGULAR); + if (fd < 0 && errno == ENOENT) + SKIP(return, "/dev/null does not exist"); + + ASSERT_GE(fd, 0) { + TH_LOG("legacy openat() must ignore the __O_REGULAR carrier bit, got errno %d (%s)", + errno, strerror(errno)); + } + close(fd); +} + +TEST_HARNESS_MAIN diff --git a/tools/testing/selftests/filesystems/openat2/rename_attack_test.c b/tools/testing/selftests/filesystems/openat2/rename_attack_test.c new file mode 100644 index 000000000000..1f33c34f56be --- /dev/null +++ b/tools/testing/selftests/filesystems/openat2/rename_attack_test.c @@ -0,0 +1,159 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* + * Author: Aleksa Sarai <cyphar@cyphar.com> + * Copyright (C) 2018-2019 SUSE LLC. 
+ */ + +#define _GNU_SOURCE +#include <errno.h> +#include <fcntl.h> +#include <sched.h> +#include <sys/stat.h> +#include <sys/types.h> +#include <sys/mount.h> +#include <sys/mman.h> +#include <sys/prctl.h> +#include <signal.h> +#include <stdio.h> +#include <stdlib.h> +#include <stdbool.h> +#include <string.h> +#include <syscall.h> +#include <limits.h> +#include <unistd.h> + +#include "helpers.h" +#include "kselftest_harness.h" + +#define ROUNDS 400000 + +/* Swap @dirfd/@a and @dirfd/@b constantly. Parent must kill this process. */ +pid_t spawn_attack(struct __test_metadata *_metadata, + int dirfd, char *a, char *b) +{ + pid_t child = fork(); + if (child != 0) + return child; + + /* If the parent (the test process) dies, kill ourselves too. */ + ASSERT_EQ(prctl(PR_SET_PDEATHSIG, SIGKILL), 0); + + /* Swap @a and @b. */ + for (;;) + renameat2(dirfd, a, dirfd, b, RENAME_EXCHANGE); + exit(1); +} + +/* + * Construct a test directory with the following structure: + * + * root/ + * |-- a/ + * | `-- c/ + * `-- b/ + */ +FIXTURE(rename_attack) { + int dfd; + int afd; + pid_t child; +}; + +FIXTURE_SETUP(rename_attack) +{ + char dirname[] = "/tmp/ksft-openat2-rename-attack.XXXXXX"; + + self->dfd = -1; + self->afd = -1; + self->child = 0; + + /* Make the top-level directory. 
*/ + ASSERT_NE(mkdtemp(dirname), NULL); + self->dfd = open(dirname, O_PATH | O_DIRECTORY); + ASSERT_GE(self->dfd, 0); + + ASSERT_EQ(mkdirat(self->dfd, "a", 0755), 0); + ASSERT_EQ(mkdirat(self->dfd, "b", 0755), 0); + ASSERT_EQ(mkdirat(self->dfd, "a/c", 0755), 0); + + self->afd = openat(self->dfd, "a", O_PATH); + ASSERT_GE(self->afd, 0); + + self->child = spawn_attack(_metadata, self->dfd, "a/c", "b"); + ASSERT_GT(self->child, 0); +} + +FIXTURE_TEARDOWN(rename_attack) +{ + if (self->child > 0) + kill(self->child, SIGKILL); + if (self->afd >= 0) + close(self->afd); + if (self->dfd >= 0) + close(self->dfd); +} + +FIXTURE_VARIANT(rename_attack) { + int resolve; + const char *name; +}; + +FIXTURE_VARIANT_ADD(rename_attack, resolve_beneath) { + .resolve = RESOLVE_BENEATH, + .name = "RESOLVE_BENEATH", +}; + +FIXTURE_VARIANT_ADD(rename_attack, resolve_in_root) { + .resolve = RESOLVE_IN_ROOT, + .name = "RESOLVE_IN_ROOT", +}; + +TEST_F_TIMEOUT(rename_attack, test, 120) +{ + int escapes = 0, successes = 0, other_errs = 0, exdevs = 0, eagains = 0; + char *victim_path = "c/../../c/../../c/../../c/../../c/../../c/../../c/../../c/../../c/../../c/../../c/../../c/../../c/../../c/../../c/../../c/../../c/../../c/../../c/../.."; + struct open_how how = { + .flags = O_PATH, + .resolve = variant->resolve, + }; + + if (!openat2_supported) { + how.resolve = 0; + TH_LOG("openat2(2) unsupported -- using openat(2) instead"); + } + + for (int i = 0; i < ROUNDS; i++) { + int fd; + + if (openat2_supported) + fd = sys_openat2(self->afd, victim_path, &how); + else + fd = sys_openat(self->afd, victim_path, &how); + + if (fd < 0) { + if (fd == -EAGAIN) + eagains++; + else if (fd == -EXDEV) + exdevs++; + else if (fd == -ENOENT) + escapes++; /* escaped outside and got ENOENT... 
*/ + else + other_errs++; /* unexpected error */ + } else { + if (fdequal(_metadata, fd, self->afd, NULL)) + successes++; + else + escapes++; /* we got an unexpected fd */ + } + if (fd >= 0) + close(fd); + } + + TH_LOG("non-escapes: EAGAIN=%d EXDEV=%d E<other>=%d success=%d", + eagains, exdevs, other_errs, successes); + ASSERT_EQ(escapes, 0) { + TH_LOG("rename attack with %s (%d runs, got %d escapes)", + variant->name, ROUNDS, escapes); + } +} + +TEST_HARNESS_MAIN diff --git a/tools/testing/selftests/openat2/resolve_test.c b/tools/testing/selftests/filesystems/openat2/resolve_test.c index a76ef15ceb90..eacde59ce158 100644 --- a/tools/testing/selftests/openat2/resolve_test.c +++ b/tools/testing/selftests/filesystems/openat2/resolve_test.c @@ -14,8 +14,81 @@ #include <stdbool.h> #include <string.h> -#include "kselftest.h" #include "helpers.h" +#include "kselftest_harness.h" + +struct resolve_test { + const char *name; + const char *dir; + const char *path; + struct open_how how; + bool pass; + union { + int err; + const char *path; + } out; +}; + +/* + * Verify a single resolve test case. This must be called from within a TEST_F + * function with _metadata in scope. + */ +static void verify_resolve_test(struct __test_metadata *_metadata, + int rootfd, int hardcoded_fd, + const struct resolve_test *test) +{ + struct open_how how = test->how; + int dfd, fd; + char *fdpath = NULL; + + /* Auto-set O_PATH. 
*/ + if (!(how.flags & O_CREAT)) + how.flags |= O_PATH; + + if (test->dir) + dfd = openat(rootfd, test->dir, O_PATH | O_DIRECTORY); + else + dfd = dup(rootfd); + ASSERT_GE(dfd, 0) TH_LOG("failed to open dir '%s': %m", test->dir ?: "."); + ASSERT_EQ(dup2(dfd, hardcoded_fd), hardcoded_fd); + + fd = sys_openat2(dfd, test->path, &how); + + if (test->pass) { + EXPECT_GE(fd, 0) { + TH_LOG("%s: expected success, got %d (%s)", + test->name, fd, strerror(-fd)); + } + if (fd >= 0) { + EXPECT_TRUE(fdequal(_metadata, fd, rootfd, test->out.path)) { + fdpath = fdreadlink(_metadata, fd); + TH_LOG("%s: wrong path '%s', expected '%s'", + test->name, fdpath, + test->out.path ?: "."); + free(fdpath); + } + } + } else { + EXPECT_EQ(test->out.err, fd) { + if (fd >= 0) { + fdpath = fdreadlink(_metadata, fd); + TH_LOG("%s: expected %d (%s), got %d['%s']", + test->name, test->out.err, + strerror(-test->out.err), fd, fdpath); + free(fdpath); + } else { + TH_LOG("%s: expected %d (%s), got %d (%s)", + test->name, test->out.err, + strerror(-test->out.err), + fd, strerror(-fd)); + } + } + } + + if (fd >= 0) + close(fd); + close(dfd); +} /* * Construct a test directory with the following structure: @@ -39,101 +112,110 @@ * |-- absself -> / * |-- self -> ../../root/ * |-- garbageself -> /../../root/ - * |-- passwd -> ../cheeky/../cheeky/../etc/../etc/passwd - * |-- abspasswd -> /../cheeky/../cheeky/../etc/../etc/passwd + * |-- passwd -> ../cheeky/../etc/../etc/passwd + * |-- abspasswd -> /../cheeky/../etc/../etc/passwd * |-- dotdotlink -> ../../../../../../../../../../../../../../etc/passwd * `-- garbagelink -> /../../../../../../../../../../../../../../etc/passwd */ -int setup_testdir(void) +FIXTURE(openat2_resolve) { + int rootfd; + int hardcoded_fd; + char *hardcoded_fdpath; + char *procselfexe; +}; + +FIXTURE_SETUP(openat2_resolve) { - int dfd, tmpfd; char dirname[] = "/tmp/ksft-openat2-testdir.XXXXXX"; + int dfd, tmpfd; + + self->rootfd = -1; + self->hardcoded_fd = -1; + 
self->hardcoded_fdpath = NULL; + self->procselfexe = NULL; + + /* NOTE: We should be checking for CAP_SYS_ADMIN here... */ + if (geteuid() != 0) + SKIP(return, "all tests require euid == 0"); + if (!openat2_supported) + SKIP(return, "openat2(2) not supported"); /* Unshare and make /tmp a new directory. */ - E_unshare(CLONE_NEWNS); - E_mount("", "/tmp", "", MS_PRIVATE, ""); + ASSERT_EQ(unshare(CLONE_NEWNS), 0); + ASSERT_EQ(mount("", "/tmp", "", MS_PRIVATE, ""), 0); /* Make the top-level directory. */ - if (!mkdtemp(dirname)) - ksft_exit_fail_msg("setup_testdir: failed to create tmpdir\n"); + ASSERT_NE(mkdtemp(dirname), NULL); dfd = open(dirname, O_PATH | O_DIRECTORY); - if (dfd < 0) - ksft_exit_fail_msg("setup_testdir: failed to open tmpdir\n"); + ASSERT_GE(dfd, 0); /* A sub-directory which is actually used for tests. */ - E_mkdirat(dfd, "root", 0755); + ASSERT_EQ(mkdirat(dfd, "root", 0755), 0); tmpfd = openat(dfd, "root", O_PATH | O_DIRECTORY); - if (tmpfd < 0) - ksft_exit_fail_msg("setup_testdir: failed to open tmpdir\n"); + ASSERT_GE(tmpfd, 0); close(dfd); dfd = tmpfd; - E_symlinkat("/proc/self/exe", dfd, "procexe"); - E_symlinkat("/proc/self/root", dfd, "procroot"); - E_mkdirat(dfd, "root", 0755); + ASSERT_EQ(symlinkat("/proc/self/exe", dfd, "procexe"), 0); + ASSERT_EQ(symlinkat("/proc/self/root", dfd, "procroot"), 0); + ASSERT_EQ(mkdirat(dfd, "root", 0755), 0); /* There is no mountat(2), so use chdir. 
*/ - E_mkdirat(dfd, "mnt", 0755); - E_fchdir(dfd); - E_mount("tmpfs", "./mnt", "tmpfs", MS_NOSUID | MS_NODEV, ""); - E_symlinkat("../mnt/", dfd, "mnt/self"); - E_symlinkat("/mnt/", dfd, "mnt/absself"); - - E_mkdirat(dfd, "etc", 0755); - E_touchat(dfd, "etc/passwd"); - - E_symlinkat("/newfile3", dfd, "creatlink"); - E_symlinkat("etc/", dfd, "reletc"); - E_symlinkat("etc/passwd", dfd, "relsym"); - E_symlinkat("/etc/", dfd, "absetc"); - E_symlinkat("/etc/passwd", dfd, "abssym"); - E_symlinkat("/cheeky", dfd, "abscheeky"); - - E_mkdirat(dfd, "cheeky", 0755); - - E_symlinkat("/", dfd, "cheeky/absself"); - E_symlinkat("../../root/", dfd, "cheeky/self"); - E_symlinkat("/../../root/", dfd, "cheeky/garbageself"); - - E_symlinkat("../cheeky/../etc/../etc/passwd", dfd, "cheeky/passwd"); - E_symlinkat("/../cheeky/../etc/../etc/passwd", dfd, "cheeky/abspasswd"); - - E_symlinkat("../../../../../../../../../../../../../../etc/passwd", - dfd, "cheeky/dotdotlink"); - E_symlinkat("/../../../../../../../../../../../../../../etc/passwd", - dfd, "cheeky/garbagelink"); - - return dfd; + ASSERT_EQ(mkdirat(dfd, "mnt", 0755), 0); + ASSERT_EQ(fchdir(dfd), 0); + ASSERT_EQ(mount("tmpfs", "./mnt", "tmpfs", MS_NOSUID | MS_NODEV, ""), 0); + ASSERT_EQ(symlinkat("../mnt/", dfd, "mnt/self"), 0); + ASSERT_EQ(symlinkat("/mnt/", dfd, "mnt/absself"), 0); + + ASSERT_EQ(mkdirat(dfd, "etc", 0755), 0); + ASSERT_GE(touchat(dfd, "etc/passwd"), 0); + + ASSERT_EQ(symlinkat("/newfile3", dfd, "creatlink"), 0); + ASSERT_EQ(symlinkat("etc/", dfd, "reletc"), 0); + ASSERT_EQ(symlinkat("etc/passwd", dfd, "relsym"), 0); + ASSERT_EQ(symlinkat("/etc/", dfd, "absetc"), 0); + ASSERT_EQ(symlinkat("/etc/passwd", dfd, "abssym"), 0); + ASSERT_EQ(symlinkat("/cheeky", dfd, "abscheeky"), 0); + + ASSERT_EQ(mkdirat(dfd, "cheeky", 0755), 0); + + ASSERT_EQ(symlinkat("/", dfd, "cheeky/absself"), 0); + ASSERT_EQ(symlinkat("../../root/", dfd, "cheeky/self"), 0); + ASSERT_EQ(symlinkat("/../../root/", dfd, "cheeky/garbageself"), 0); + + 
ASSERT_EQ(symlinkat("../cheeky/../etc/../etc/passwd", + dfd, "cheeky/passwd"), 0); + ASSERT_EQ(symlinkat("/../cheeky/../etc/../etc/passwd", + dfd, "cheeky/abspasswd"), 0); + + ASSERT_EQ(symlinkat("../../../../../../../../../../../../../../etc/passwd", + dfd, "cheeky/dotdotlink"), 0); + ASSERT_EQ(symlinkat("/../../../../../../../../../../../../../../etc/passwd", + dfd, "cheeky/garbagelink"), 0); + + self->rootfd = dfd; + + self->hardcoded_fd = open("/dev/null", O_RDONLY); + ASSERT_GE(self->hardcoded_fd, 0); + ASSERT_GE(asprintf(&self->hardcoded_fdpath, "self/fd/%d", + self->hardcoded_fd), 0); + ASSERT_GE(asprintf(&self->procselfexe, "/proc/%d/exe", getpid()), 0); } -struct basic_test { - const char *name; - const char *dir; - const char *path; - struct open_how how; - bool pass; - union { - int err; - const char *path; - } out; -}; - -#define NUM_OPENAT2_OPATH_TESTS 88 - -void test_openat2_opath_tests(void) +FIXTURE_TEARDOWN(openat2_resolve) { - int rootfd, hardcoded_fd; - char *procselfexe, *hardcoded_fdpath; - - E_asprintf(&procselfexe, "/proc/%d/exe", getpid()); - rootfd = setup_testdir(); - - hardcoded_fd = open("/dev/null", O_RDONLY); - E_assert(hardcoded_fd >= 0, "open fd to hardcode"); - E_asprintf(&hardcoded_fdpath, "self/fd/%d", hardcoded_fd); + free(self->procselfexe); + free(self->hardcoded_fdpath); + if (self->hardcoded_fd >= 0) + close(self->hardcoded_fd); + if (self->rootfd >= 0) + close(self->rootfd); +} - struct basic_test tests[] = { - /** RESOLVE_BENEATH **/ +/* Attempts to cross the dirfd should be blocked with -EXDEV. */ +TEST_F(openat2_resolve, resolve_beneath) +{ + struct resolve_test tests[] = { /* Attempts to cross dirfd should be blocked. 
*/ { .name = "[beneath] jump to /", .path = "/", .how.resolve = RESOLVE_BENEATH, @@ -206,9 +288,17 @@ void test_openat2_opath_tests(void) { .name = "[beneath] tricky absolute + garbage link outside $root", .path = "abscheeky/garbagelink", .how.resolve = RESOLVE_BENEATH, .out.err = -EXDEV, .pass = false }, + }; + + for (int i = 0; i < ARRAY_SIZE(tests); i++) + verify_resolve_test(_metadata, self->rootfd, + self->hardcoded_fd, &tests[i]); +} - /** RESOLVE_IN_ROOT **/ - /* All attempts to cross the dirfd will be scoped-to-root. */ +/* All attempts to cross the dirfd will be scoped-to-root. */ +TEST_F(openat2_resolve, resolve_in_root) +{ + struct resolve_test tests[] = { { .name = "[in_root] jump to /", .path = "/", .how.resolve = RESOLVE_IN_ROOT, .out.path = NULL, .pass = true }, @@ -297,8 +387,17 @@ void test_openat2_opath_tests(void) .how.mode = 0700, .how.resolve = RESOLVE_IN_ROOT, .out.path = "newfile3", .pass = true }, + }; - /** RESOLVE_NO_XDEV **/ + for (int i = 0; i < ARRAY_SIZE(tests); i++) + verify_resolve_test(_metadata, self->rootfd, + self->hardcoded_fd, &tests[i]); +} + +/* Crossing mount boundaries should be blocked. */ +TEST_F(openat2_resolve, resolve_no_xdev) +{ + struct resolve_test tests[] = { /* Crossing *down* into a mountpoint is disallowed. */ { .name = "[no_xdev] cross into $mnt", .path = "mnt", .how.resolve = RESOLVE_NO_XDEV, @@ -347,10 +446,19 @@ void test_openat2_opath_tests(void) .out.err = -EXDEV, .pass = false }, /* Except magic-link jumps inside the same vfsmount. 
*/ { .name = "[no_xdev] jump through magic-link to same procfs", - .dir = "/proc", .path = hardcoded_fdpath, .how.resolve = RESOLVE_NO_XDEV, - .out.path = "/proc", .pass = true, }, + .dir = "/proc", .path = self->hardcoded_fdpath, .how.resolve = RESOLVE_NO_XDEV, + .out.path = "/proc", .pass = true, }, + }; + + for (int i = 0; i < ARRAY_SIZE(tests); i++) + verify_resolve_test(_metadata, self->rootfd, + self->hardcoded_fd, &tests[i]); +} - /** RESOLVE_NO_MAGICLINKS **/ +/* Procfs-style magic-link resolution should be blocked. */ +TEST_F(openat2_resolve, resolve_no_magiclinks) +{ + struct resolve_test tests[] = { /* Regular symlinks should work. */ { .name = "[no_magiclinks] ordinary relative symlink", .path = "relsym", .how.resolve = RESOLVE_NO_MAGICLINKS, @@ -365,7 +473,7 @@ void test_openat2_opath_tests(void) { .name = "[no_magiclinks] normal path to magic-link with O_NOFOLLOW", .path = "/proc/self/exe", .how.flags = O_NOFOLLOW, .how.resolve = RESOLVE_NO_MAGICLINKS, - .out.path = procselfexe, .pass = true }, + .out.path = self->procselfexe, .pass = true }, { .name = "[no_magiclinks] symlink to magic-link path component", .path = "procroot/etc", .how.resolve = RESOLVE_NO_MAGICLINKS, .out.err = -ELOOP, .pass = false }, @@ -376,8 +484,17 @@ void test_openat2_opath_tests(void) .path = "/proc/self/root/etc", .how.flags = O_NOFOLLOW, .how.resolve = RESOLVE_NO_MAGICLINKS, .out.err = -ELOOP, .pass = false }, + }; + + for (int i = 0; i < ARRAY_SIZE(tests); i++) + verify_resolve_test(_metadata, self->rootfd, + self->hardcoded_fd, &tests[i]); +} - /** RESOLVE_NO_SYMLINKS **/ +/* All symlink resolution should be blocked. */ +TEST_F(openat2_resolve, resolve_no_symlinks) +{ + struct resolve_test tests[] = { /* Normal paths should work. 
*/ { .name = "[no_symlinks] ordinary path to '.'", .path = ".", .how.resolve = RESOLVE_NO_SYMLINKS, @@ -436,88 +553,9 @@ void test_openat2_opath_tests(void) .out.err = -ELOOP, .pass = false }, }; - BUILD_BUG_ON(ARRAY_LEN(tests) != NUM_OPENAT2_OPATH_TESTS); - - for (int i = 0; i < ARRAY_LEN(tests); i++) { - int dfd, fd; - char *fdpath = NULL; - bool failed; - void (*resultfn)(const char *msg, ...) = ksft_test_result_pass; - struct basic_test *test = &tests[i]; - - if (!openat2_supported) { - ksft_print_msg("openat2(2) unsupported\n"); - resultfn = ksft_test_result_skip; - goto skip; - } - - /* Auto-set O_PATH. */ - if (!(test->how.flags & O_CREAT)) - test->how.flags |= O_PATH; - - if (test->dir) - dfd = openat(rootfd, test->dir, O_PATH | O_DIRECTORY); - else - dfd = dup(rootfd); - E_assert(dfd, "failed to openat root '%s': %m", test->dir); - - E_dup2(dfd, hardcoded_fd); - - fd = sys_openat2(dfd, test->path, &test->how); - if (test->pass) - failed = (fd < 0 || !fdequal(fd, rootfd, test->out.path)); - else - failed = (fd != test->out.err); - if (fd >= 0) { - fdpath = fdreadlink(fd); - close(fd); - } - close(dfd); - - if (failed) { - resultfn = ksft_test_result_fail; - - ksft_print_msg("openat2 unexpectedly returned "); - if (fdpath) - ksft_print_msg("%d['%s']\n", fd, fdpath); - else - ksft_print_msg("%d (%s)\n", fd, strerror(-fd)); - } - -skip: - if (test->pass) - resultfn("%s gives path '%s'\n", test->name, - test->out.path ?: "."); - else - resultfn("%s fails with %d (%s)\n", test->name, - test->out.err, strerror(-test->out.err)); - - fflush(stdout); - free(fdpath); - } - - free(procselfexe); - close(rootfd); - - free(hardcoded_fdpath); - close(hardcoded_fd); + for (int i = 0; i < ARRAY_SIZE(tests); i++) + verify_resolve_test(_metadata, self->rootfd, + self->hardcoded_fd, &tests[i]); } -#define NUM_TESTS NUM_OPENAT2_OPATH_TESTS - -int main(int argc, char **argv) -{ - ksft_print_header(); - ksft_set_plan(NUM_TESTS); - - /* NOTE: We should be checking for 
CAP_SYS_ADMIN here... */ - if (geteuid() != 0) - ksft_exit_skip("all tests require euid == 0\n"); - - test_openat2_opath_tests(); - - if (ksft_get_fail_cnt() + ksft_get_error_cnt() > 0) - ksft_exit_fail(); - else - ksft_exit_pass(); -} +TEST_HARNESS_MAIN diff --git a/tools/testing/selftests/ftrace/test.d/00basic/test_ownership.tc b/tools/testing/selftests/ftrace/test.d/00basic/test_ownership.tc index e71cc3ad0bdf..6d00d3c0f493 100644 --- a/tools/testing/selftests/ftrace/test.d/00basic/test_ownership.tc +++ b/tools/testing/selftests/ftrace/test.d/00basic/test_ownership.tc @@ -6,7 +6,7 @@ original_group=`stat -c "%g" .` original_owner=`stat -c "%u" .` -local mount_point=$(get_mount_point) +mount_point=$(get_mount_point) mount_options=$(get_mnt_options "$mount_point") diff --git a/tools/testing/selftests/ftrace/test.d/00basic/trace_marker_raw.tc b/tools/testing/selftests/ftrace/test.d/00basic/trace_marker_raw.tc index 8e905d4fe6dd..f985ff391463 100644 --- a/tools/testing/selftests/ftrace/test.d/00basic/trace_marker_raw.tc +++ b/tools/testing/selftests/ftrace/test.d/00basic/trace_marker_raw.tc @@ -36,15 +36,23 @@ make_str() { data=`printf -- 'X%.0s' $(seq $cnt)` - printf "${val}${data}" + # Return escape-sequence text (e.g. "\003\000..."); the caller + # converts to binary. Shell command substitution strips NUL bytes, + # so the binary form cannot survive being captured into a variable. + printf '%s' "${val}${data}" } write_buffer() { id=$1 size=$2 - # write the string into the raw marker - make_str $id $size > trace_marker_raw + str=`make_str $id $size` + len=`printf "$str" | wc -c` + # Pipe through dd to ensure a single atomic write() syscall + # on architectures with 64K pages, where shell's printf builtin + # uses stdio buffering which may split the output into multiple + # writes. 
+ printf "$str" | dd of=trace_marker_raw bs=$len iflag=fullblock } diff --git a/tools/testing/selftests/ftrace/test.d/dynevent/eprobes_syntax_errors.tc b/tools/testing/selftests/ftrace/test.d/dynevent/eprobes_syntax_errors.tc index 4f5e8c665156..2a680c086047 100644 --- a/tools/testing/selftests/ftrace/test.d/dynevent/eprobes_syntax_errors.tc +++ b/tools/testing/selftests/ftrace/test.d/dynevent/eprobes_syntax_errors.tc @@ -20,7 +20,7 @@ check_error 'e:foo/^123456789012345678901234567890123456789012345678901234567890 check_error 'e:foo/^bar.1 syscalls/sys_enter_openat' # BAD_EVENT_NAME check_error 'e:foo/bar syscalls/sys_enter_openat arg=^dfd' # BAD_FETCH_ARG -check_error 'e:foo/bar syscalls/sys_enter_openat ^arg=$foo' # BAD_ATTACH_ARG +check_error 'e:foo/bar syscalls/sys_enter_openat arg=^$foo' # BAD_ATTACH_ARG if grep -q '<attached-group>\.<attached-event>.*\[if <filter>\]' README; then check_error 'e:foo/bar syscalls/sys_enter_openat if ^' # NO_EP_FILTER diff --git a/tools/testing/selftests/futex/functional/robust_list.c b/tools/testing/selftests/futex/functional/robust_list.c index e7d1254e18ca..b3fab60181d5 100644 --- a/tools/testing/selftests/futex/functional/robust_list.c +++ b/tools/testing/selftests/futex/functional/robust_list.c @@ -27,12 +27,15 @@ #include "futextest.h" #include "../../kselftest_harness.h" +#include <dlfcn.h> #include <errno.h> #include <pthread.h> #include <signal.h> +#include <stdint.h> #include <stdatomic.h> #include <stdbool.h> #include <stddef.h> +#include <sys/auxv.h> #include <sys/mman.h> #include <sys/wait.h> @@ -42,6 +45,10 @@ #define SLEEP_US 100 +#if __SIZEOF_LONG__ == 8 +# define BUILD_64 +#endif + static pthread_barrier_t barrier, barrier2; static int set_robust_list(struct robust_list_head *head, size_t len) @@ -54,6 +61,12 @@ static int get_robust_list(int pid, struct robust_list_head **head, size_t *len_ return syscall(SYS_get_robust_list, pid, head, len_ptr); } +static int sys_futex_robust_unlock(_Atomic(uint32_t) *uaddr, 
unsigned int op, int val, + void *list_op_pending, unsigned int val3) +{ + return syscall(SYS_futex, uaddr, op, val, NULL, list_op_pending, val3, 0); +} + /* * Basic lock struct, contains just the futex word and the robust list element * Real implementations have also a *prev to easily walk in the list @@ -549,4 +562,230 @@ TEST(test_circular_list) ksft_test_result_pass("%s\n", __func__); } +/* + * Below are tests for the fix of robust release race condition. Please read the following + * thread to learn more about the issue in the first place and why the following functions fix it: + * https://lore.kernel.org/lkml/20260316162316.356674433@kernel.org/ + */ + +/* + * Auxiliary code for binding the vDSO functions + */ +static void *get_vdso_func_addr(const char *function) +{ + const char *vdso_names[] = { + "linux-vdso.so.1", "linux-gate.so.1", "linux-vdso32.so.1", "linux-vdso64.so.1", + }; + + for (int i = 0; i < ARRAY_SIZE(vdso_names); i++) { + void *vdso = dlopen(vdso_names[i], RTLD_LAZY | RTLD_LOCAL | RTLD_NOLOAD); + + if (vdso) + return dlsym(vdso, function); + } + return NULL; +} + +/* + * These are the real vDSO function signatures: + * + * __vdso_futex_robust_list64_try_unlock(__u32 *lock, __u32 tid, __u64 *pop) + * __vdso_futex_robust_list32_try_unlock(__u32 *lock, __u32 tid, __u32 *pop) + * + * So for the generic entry point we need to use a void pointer as the last argument + */ +FIXTURE(vdso_unlock) +{ + uint32_t (*vdso)(_Atomic(uint32_t) *lock, uint32_t tid, void *pop); +}; + +FIXTURE_VARIANT(vdso_unlock) +{ + bool is_32; + char func_name[]; +}; + +FIXTURE_SETUP(vdso_unlock) +{ + self->vdso = get_vdso_func_addr(variant->func_name); +} + +FIXTURE_TEARDOWN(vdso_unlock) {} + +FIXTURE_VARIANT_ADD(vdso_unlock, 32) +{ + .func_name = "__vdso_futex_robust_list32_try_unlock", + .is_32 = true, +}; + +FIXTURE_VARIANT_ADD(vdso_unlock, 64) +{ + .func_name = "__vdso_futex_robust_list64_try_unlock", + .is_32 = false, +}; + +/* + * Test the vDSO 
robust_listXX_try_unlock() for the uncontended case. The virtual syscall should + * return the thread ID of the lock owner, the lock word must be 0 and the list_op_pending should + * be NULL. + */ +TEST_F(vdso_unlock, test_robust_try_unlock_uncontended) +{ + struct lock_struct lock = { .futex = 0 }; + _Atomic(unsigned int) *futex = &lock.futex; + struct robust_list_head head; + uintptr_t exp = (uintptr_t) NULL; + pid_t tid = gettid(); + int ret; + + if (!self->vdso) { + ksft_test_result_skip("%s not found\n", variant->func_name); + return; + } + + *futex = tid; + + ret = set_list(&head); + if (ret) + ksft_test_result_fail("set_robust_list error\n"); + + head.list_op_pending = &lock.list; + + ret = self->vdso(futex, tid, &head.list_op_pending); + + ASSERT_EQ(ret, tid); + ASSERT_EQ(*futex, 0); + + /* Check only the lower 32 bits for the 32-bit entry point */ + if (variant->is_32) { + exp = (uintptr_t)(unsigned long)&lock.list; + exp &= ~0xFFFFFFFFULL; + } + + ASSERT_EQ((uintptr_t)(unsigned long)head.list_op_pending, exp); +} + +/* + * If the lock is contended, the operation fails. The return value is the value found at the + * futex word (tid | FUTEX_WAITERS), the futex word is not modified and the list_op_pending is + * not cleared. 
+ */ +TEST_F(vdso_unlock, test_robust_try_unlock_contended) +{ + struct lock_struct lock = { .futex = 0 }; + _Atomic(unsigned int) *futex = &lock.futex; + struct robust_list_head head; + pid_t tid = gettid(); + int ret; + + if (!self->vdso) { + ksft_test_result_skip("%s not found\n", variant->func_name); + return; + } + + *futex = tid | FUTEX_WAITERS; + + ret = set_list(&head); + if (ret) + ksft_test_result_fail("set_robust_list error\n"); + + head.list_op_pending = &lock.list; + + ret = self->vdso(futex, tid, &head.list_op_pending); + + ASSERT_EQ(ret, tid | FUTEX_WAITERS); + ASSERT_EQ(*futex, tid | FUTEX_WAITERS); + ASSERT_EQ(head.list_op_pending, &lock.list); +} + +FIXTURE(futex_op) {}; + +FIXTURE_VARIANT(futex_op) +{ + unsigned int op; + unsigned int val3; +}; + +FIXTURE_SETUP(futex_op) {} + +FIXTURE_TEARDOWN(futex_op) {} + +FIXTURE_VARIANT_ADD(futex_op, wake) +{ + .op = FUTEX_WAKE, + .val3 = 0, +}; + +FIXTURE_VARIANT_ADD(futex_op, wake_bitset) +{ + .op = FUTEX_WAKE_BITSET, + .val3 = FUTEX_BITSET_MATCH_ANY, +}; + +FIXTURE_VARIANT_ADD(futex_op, unlock_pi) +{ + .op = FUTEX_UNLOCK_PI, + .val3 = 0, +}; + +FIXTURE_VARIANT_ADD(futex_op, wake32) +{ + .op = FUTEX_WAKE | FUTEX_ROBUST_LIST32, + .val3 = 0, +}; + +FIXTURE_VARIANT_ADD(futex_op, wake_bitset32) +{ + .op = FUTEX_WAKE_BITSET | FUTEX_ROBUST_LIST32, + .val3 = FUTEX_BITSET_MATCH_ANY, +}; + +FIXTURE_VARIANT_ADD(futex_op, unlock_pi32) +{ + .op = FUTEX_UNLOCK_PI | FUTEX_ROBUST_LIST32, + .val3 = 0, +}; + +/* + * The syscall should return the number of tasks woken (for this test, 0), clear the futex word and + * clear list_op_pending + */ +TEST_F(futex_op, test_futex_robust_unlock) +{ + struct lock_struct lock = { .futex = 0 }; + _Atomic(unsigned int) *futex = &lock.futex; + uintptr_t exp = (uintptr_t) NULL; + struct robust_list_head head; + pid_t tid = gettid(); + int ret; + +#ifndef BUILD_64 + if (!(variant->op & FUTEX_ROBUST_LIST32)) { + ksft_test_result_skip("Not supported for 32 bit build\n"); + return; + } +#endif 
+ + *futex = tid | FUTEX_WAITERS; + + ret = set_list(&head); + if (ret) + ksft_test_result_fail("set_robust_list error\n"); + + head.list_op_pending = &lock.list; + + ret = sys_futex_robust_unlock(futex, FUTEX_ROBUST_UNLOCK | variant->op, tid, + &head.list_op_pending, variant->val3); + + ASSERT_EQ(ret, 0); + ASSERT_EQ(*futex, 0); + + if (variant->op & FUTEX_ROBUST_LIST32) { + exp = (uint64_t)(unsigned long)&lock.list; + exp &= ~0xFFFFFFFFULL; + } + + ASSERT_EQ((uintptr_t)(unsigned long)head.list_op_pending, exp); +} + TEST_HARNESS_MAIN diff --git a/tools/testing/selftests/futex/include/futextest.h b/tools/testing/selftests/futex/include/futextest.h index 3d48e9789d9f..df33f31d6994 100644 --- a/tools/testing/selftests/futex/include/futextest.h +++ b/tools/testing/selftests/futex/include/futextest.h @@ -38,6 +38,12 @@ typedef volatile u_int32_t futex_t; #ifndef FUTEX_CMP_REQUEUE_PI #define FUTEX_CMP_REQUEUE_PI 12 #endif +#ifndef FUTEX_ROBUST_UNLOCK +#define FUTEX_ROBUST_UNLOCK 512 +#endif +#ifndef FUTEX_ROBUST_LIST32 +#define FUTEX_ROBUST_LIST32 1024 +#endif #ifndef FUTEX_WAIT_REQUEUE_PI_PRIVATE #define FUTEX_WAIT_REQUEUE_PI_PRIVATE (FUTEX_WAIT_REQUEUE_PI | \ FUTEX_PRIVATE_FLAG) diff --git a/tools/testing/selftests/hid/Makefile b/tools/testing/selftests/hid/Makefile index 50ec9e0406ab..96071b4800e8 100644 --- a/tools/testing/selftests/hid/Makefile +++ b/tools/testing/selftests/hid/Makefile @@ -105,13 +105,6 @@ $(MAKE_DIRS): $(call msg,MKDIR,,$@) $(Q)mkdir -p $@ -# LLVM's ld.lld doesn't support all the architectures, so use it only on x86 -ifeq ($(SRCARCH),x86) -LLD := lld -else -LLD := ld -endif - DEFAULT_BPFTOOL := $(HOST_SCRATCH_DIR)/sbin/bpftool TEST_GEN_PROGS_EXTENDED += $(DEFAULT_BPFTOOL) diff --git a/tools/testing/selftests/iommu/iommufd.c b/tools/testing/selftests/iommu/iommufd.c index d1fe5dbc2813..d44b34b05757 100644 --- a/tools/testing/selftests/iommu/iommufd.c +++ b/tools/testing/selftests/iommu/iommufd.c @@ -556,6 +556,21 @@ TEST_F(iommufd_ioas, 
alloc_hwpt_nested) 1, &num_inv); assert(!num_inv); + /* Negative test: entry_len is bounded by PAGE_SIZE */ + num_inv = 1; + test_err_hwpt_invalidate(EINVAL, nested_hwpt_id[0], inv_reqs, + IOMMU_HWPT_INVALIDATE_DATA_SELFTEST, + PAGE_SIZE + 1, &num_inv); + assert(!num_inv); + + /* Negative test: entry_num is bounded */ +#define IOMMU_HWPT_INVALIDATE_ENTRY_NUM_MAX (1U << 19) + num_inv = IOMMU_HWPT_INVALIDATE_ENTRY_NUM_MAX + 1; + test_err_hwpt_invalidate(EINVAL, nested_hwpt_id[0], inv_reqs, + IOMMU_HWPT_INVALIDATE_DATA_SELFTEST, + sizeof(*inv_reqs), &num_inv); + assert(!num_inv); + /* Negative test: invalid flag is passed */ num_inv = 1; inv_reqs[0].flags = 0xffffffff; @@ -2980,22 +2995,54 @@ TEST_F(iommufd_viommu, vdevice_alloc) uint32_t veventq_id; uint32_t veventq_fd; int prev_seq = -1; + size_t hdr_size = sizeof(struct iommufd_vevent_header); + char vbuf[64]; if (dev_id) { /* Must allocate vdevice before attaching to a nested hwpt */ test_err_mock_domain_replace(ENOENT, self->stdev_id, self->nested_hwpt_id); + /* Test depth lower and upper bounds (mirrors kernel cap) */ +#define VEVENTQ_MAX_DEPTH (1U << 19) + test_err_veventq_alloc(EINVAL, viommu_id, + IOMMU_VEVENTQ_TYPE_SELFTEST, 0, NULL, + NULL); + test_err_veventq_alloc(EINVAL, viommu_id, + IOMMU_VEVENTQ_TYPE_SELFTEST, + VEVENTQ_MAX_DEPTH + 1, NULL, NULL); + test_cmd_veventq_alloc(viommu_id, IOMMU_VEVENTQ_TYPE_SELFTEST, + VEVENTQ_MAX_DEPTH, &veventq_id, + &veventq_fd); + close(veventq_fd); + test_ioctl_destroy(veventq_id); + /* Allocate a vEVENTQ with veventq_depth=2 */ test_cmd_veventq_alloc(viommu_id, IOMMU_VEVENTQ_TYPE_SELFTEST, - &veventq_id, &veventq_fd); + 2, &veventq_id, &veventq_fd); test_err_veventq_alloc(EEXIST, viommu_id, - IOMMU_VEVENTQ_TYPE_SELFTEST, NULL, NULL); + IOMMU_VEVENTQ_TYPE_SELFTEST, 2, NULL, + NULL); + + /* Invalid read counts on an empty vEVENTQ */ + ASSERT_EQ(-1, read(veventq_fd, vbuf, 0)); + ASSERT_EQ(EINVAL, errno); + ASSERT_EQ(-1, read(veventq_fd, vbuf, hdr_size - 1)); + 
ASSERT_EQ(EINVAL, errno); + /* Set vdev_id to 0x99, unset it, and set to 0x88 */ test_cmd_vdevice_alloc(viommu_id, dev_id, 0x99, &vdev_id); test_cmd_mock_domain_replace(self->stdev_id, self->nested_hwpt_id); test_cmd_trigger_vevents(dev_id, 1); + + /* Invalid read counts on a non-empty vEVENTQ */ + ASSERT_EQ(-1, read(veventq_fd, vbuf, 0)); + ASSERT_EQ(EINVAL, errno); + /* header fits but the event's payload doesn't */ + ASSERT_EQ(-1, read(veventq_fd, vbuf, hdr_size)); + ASSERT_EQ(EINVAL, errno); + test_cmd_read_vevents(veventq_fd, 1, 0x99, &prev_seq); test_err_vdevice_alloc(EEXIST, viommu_id, dev_id, 0x99, &vdev_id); diff --git a/tools/testing/selftests/iommu/iommufd_fail_nth.c b/tools/testing/selftests/iommu/iommufd_fail_nth.c index 45c14323a618..25495d8dceb3 100644 --- a/tools/testing/selftests/iommu/iommufd_fail_nth.c +++ b/tools/testing/selftests/iommu/iommufd_fail_nth.c @@ -712,7 +712,7 @@ TEST_FAIL_NTH(basic_fail_nth, device) return -1; if (_test_cmd_veventq_alloc(self->fd, viommu_id, - IOMMU_VEVENTQ_TYPE_SELFTEST, &veventq_id, + IOMMU_VEVENTQ_TYPE_SELFTEST, 2, &veventq_id, &veventq_fd)) return -1; close(veventq_fd); diff --git a/tools/testing/selftests/iommu/iommufd_utils.h b/tools/testing/selftests/iommu/iommufd_utils.h index 5502751d500c..b4928cbd4d9c 100644 --- a/tools/testing/selftests/iommu/iommufd_utils.h +++ b/tools/testing/selftests/iommu/iommufd_utils.h @@ -1060,12 +1060,13 @@ static int _test_cmd_hw_queue_alloc(int fd, __u32 viommu_id, __u32 type, base_addr, len, out_qid)) static int _test_cmd_veventq_alloc(int fd, __u32 viommu_id, __u32 type, - __u32 *veventq_id, __u32 *veventq_fd) + __u32 depth, __u32 *veventq_id, + __u32 *veventq_fd) { struct iommu_veventq_alloc cmd = { .size = sizeof(cmd), .type = type, - .veventq_depth = 2, + .veventq_depth = depth, .viommu_id = viommu_id, }; int ret; @@ -1080,13 +1081,13 @@ static int _test_cmd_veventq_alloc(int fd, __u32 viommu_id, __u32 type, return 0; } -#define test_cmd_veventq_alloc(viommu_id, type, 
veventq_id, veventq_fd) \ - ASSERT_EQ(0, _test_cmd_veventq_alloc(self->fd, viommu_id, type, \ +#define test_cmd_veventq_alloc(viommu_id, type, depth, veventq_id, veventq_fd) \ + ASSERT_EQ(0, _test_cmd_veventq_alloc(self->fd, viommu_id, type, depth, \ veventq_id, veventq_fd)) -#define test_err_veventq_alloc(_errno, viommu_id, type, veventq_id, \ - veventq_fd) \ - EXPECT_ERRNO(_errno, \ - _test_cmd_veventq_alloc(self->fd, viommu_id, type, \ +#define test_err_veventq_alloc(_errno, viommu_id, type, depth, veventq_id, \ + veventq_fd) \ + EXPECT_ERRNO(_errno, \ + _test_cmd_veventq_alloc(self->fd, viommu_id, type, depth, \ veventq_id, veventq_fd)) static int _test_cmd_trigger_vevents(int fd, __u32 dev_id, __u32 nvevents) diff --git a/tools/testing/selftests/kho/vmtest.sh b/tools/testing/selftests/kho/vmtest.sh index 49fdac8e8b15..0014bd76e88d 100755 --- a/tools/testing/selftests/kho/vmtest.sh +++ b/tools/testing/selftests/kho/vmtest.sh @@ -59,10 +59,14 @@ function build_kernel() { tee "$kconfig" > "$kho_config" <<EOF CONFIG_BLK_DEV_INITRD=y CONFIG_KEXEC_HANDOVER=y +CONFIG_KEXEC_HANDOVER_DEBUG=y CONFIG_KEXEC_HANDOVER_DEBUGFS=y CONFIG_TEST_KEXEC_HANDOVER=y CONFIG_DEBUG_KERNEL=y CONFIG_DEBUG_VM=y +CONFIG_DEBUG_VM_PGFLAGS=y +CONFIG_SMP=y +CONFIG_DEFERRED_STRUCT_PAGE_INIT=y $arch_kconfig EOF diff --git a/tools/testing/selftests/kselftest.h b/tools/testing/selftests/kselftest.h index 6d809f08ab7b..ae18c491ae53 100644 --- a/tools/testing/selftests/kselftest.h +++ b/tools/testing/selftests/kselftest.h @@ -346,9 +346,9 @@ void ksft_test_result_code(int exit_code, const char *test_name, } /** - * ksft_test_result() - Report test success based on truth of condition + * ksft_test_result_report() - Report test result based on a kselftest exit code * - * @condition: if true, report test success, otherwise failure. + * @result: a kselftest exit code */ #define ksft_test_result_report(result, fmt, ...) 
do { \ switch (result) { \ @@ -450,7 +450,7 @@ static inline __noreturn __printf(1, 2) void ksft_exit_skip(const char *msg, ... */ if (ksft_plan || ksft_test_num()) { ksft_cnt.ksft_xskip++; - printf("ok %u # SKIP ", 1 + ksft_test_num()); + printf("ok %u # SKIP ", ksft_test_num()); } else { printf("1..0 # SKIP "); } diff --git a/tools/testing/selftests/kselftest_harness.h b/tools/testing/selftests/kselftest_harness.h index 75fb016cd190..261e4df94d9d 100644 --- a/tools/testing/selftests/kselftest_harness.h +++ b/tools/testing/selftests/kselftest_harness.h @@ -76,7 +76,7 @@ static inline void __kselftest_memset_safe(void *s, int c, size_t n) memset(s, c, n); } -#define KSELFTEST_PRIO_TEST_F 20000 +#define KSELFTEST_PRIO_TEST 20000 #define KSELFTEST_PRIO_XFAIL 20001 #define TEST_TIMEOUT_DEFAULT 30 @@ -194,7 +194,7 @@ static inline void __kselftest_memset_safe(void *s, int c, size_t n) .fixture = &_fixture_global, \ .termsig = _signal, \ .timeout = TEST_TIMEOUT_DEFAULT, }; \ - static void __attribute__((constructor)) _register_##test_name(void) \ + static void __attribute__((constructor(KSELFTEST_PRIO_TEST))) _register_##test_name(void) \ { \ __register_test(&_##test_name##_object); \ } \ @@ -238,7 +238,7 @@ static inline void __kselftest_memset_safe(void *s, int c, size_t n) FIXTURE_VARIANT(fixture_name); \ static struct __fixture_metadata _##fixture_name##_fixture_object = \ { .name = #fixture_name, }; \ - static void __attribute__((constructor)) \ + static void __attribute__((constructor(KSELFTEST_PRIO_TEST))) \ _register_##fixture_name##_data(void) \ { \ __register_fixture(&_##fixture_name##_fixture_object); \ @@ -364,7 +364,7 @@ static inline void __kselftest_memset_safe(void *s, int c, size_t n) _##fixture_name##_##variant_name##_object = \ { .name = #variant_name, \ .data = &_##fixture_name##_##variant_name##_variant}; \ - static void __attribute__((constructor)) \ + static void __attribute__((constructor(KSELFTEST_PRIO_TEST))) \ 
_register_##fixture_name##_##variant_name(void) \ { \ __register_fixture_variant(&_##fixture_name##_fixture_object, \ @@ -468,7 +468,7 @@ static inline void __kselftest_memset_safe(void *s, int c, size_t n) fixture_name##_teardown(_metadata, self, variant); \ } \ static struct __test_metadata *_##fixture_name##_##test_name##_object; \ - static void __attribute__((constructor(KSELFTEST_PRIO_TEST_F))) \ + static void __attribute__((constructor(KSELFTEST_PRIO_TEST))) \ _register_##fixture_name##_##test_name(void) \ { \ struct __test_metadata *object = mmap(NULL, sizeof(*object), \ @@ -996,6 +996,7 @@ static void __wait_for_test(struct __test_metadata *t) poll_child.fd = childfd; poll_child.events = POLLIN; ret = poll(&poll_child, 1, t->timeout * 1000); + close(childfd); if (ret == -1) { t->exit_code = KSFT_FAIL; fprintf(TH_LOG_STREAM, @@ -1323,7 +1324,7 @@ static int test_harness_run(int argc, char **argv) return KSFT_FAIL; } -static void __attribute__((constructor)) __constructor_order_first(void) +static void __attribute__((constructor(KSELFTEST_PRIO_TEST))) __constructor_order_first(void) { __constructor_order_forward = true; } diff --git a/tools/testing/selftests/kvm/Makefile.kvm b/tools/testing/selftests/kvm/Makefile.kvm index 9118a5a51b89..d28a057fa6c2 100644 --- a/tools/testing/selftests/kvm/Makefile.kvm +++ b/tools/testing/selftests/kvm/Makefile.kvm @@ -97,6 +97,7 @@ TEST_GEN_PROGS_x86 += x86/nested_emulation_test TEST_GEN_PROGS_x86 += x86/nested_exceptions_test TEST_GEN_PROGS_x86 += x86/nested_invalid_cr3_test TEST_GEN_PROGS_x86 += x86/nested_set_state_test +TEST_GEN_PROGS_x86 += x86/nested_tdp_fault_test TEST_GEN_PROGS_x86 += x86/nested_tsc_adjust_test TEST_GEN_PROGS_x86 += x86/nested_tsc_scaling_test TEST_GEN_PROGS_x86 += x86/nested_vmsave_vmload_test @@ -117,6 +118,7 @@ TEST_GEN_PROGS_x86 += x86/svm_nested_clear_efer_svme TEST_GEN_PROGS_x86 += x86/svm_nested_shutdown_test TEST_GEN_PROGS_x86 += x86/svm_nested_soft_inject_test TEST_GEN_PROGS_x86 += 
x86/svm_nested_vmcb12_gpa +TEST_GEN_PROGS_x86 += x86/svm_nested_pat_test TEST_GEN_PROGS_x86 += x86/svm_lbr_nested_state TEST_GEN_PROGS_x86 += x86/tsc_scaling_sync TEST_GEN_PROGS_x86 += x86/sync_regs_test @@ -140,6 +142,7 @@ TEST_GEN_PROGS_x86 += x86/tsc_msrs_test TEST_GEN_PROGS_x86 += x86/vmx_pmu_caps_test TEST_GEN_PROGS_x86 += x86/xen_shinfo_test TEST_GEN_PROGS_x86 += x86/xen_vmcall_test +TEST_GEN_PROGS_x86 += x86/sev_dbg_test TEST_GEN_PROGS_x86 += x86/sev_init2_tests TEST_GEN_PROGS_x86 += x86/sev_migrate_tests TEST_GEN_PROGS_x86 += x86/sev_smoke_test @@ -210,6 +213,7 @@ TEST_GEN_PROGS_s390 += s390/keyop TEST_GEN_PROGS_s390 += rseq_test TEST_GEN_PROGS_s390 += s390/irq_routing TEST_GEN_PROGS_s390 += mmu_stress_test +TEST_GEN_PROGS_s390 += pre_fault_memory_test TEST_GEN_PROGS_riscv = $(TEST_GEN_PROGS_COMMON) TEST_GEN_PROGS_riscv += riscv/sbi_pmu_test diff --git a/tools/testing/selftests/kvm/access_tracking_perf_test.c b/tools/testing/selftests/kvm/access_tracking_perf_test.c index e5bbdb5bbdc3..4415c94b2866 100644 --- a/tools/testing/selftests/kvm/access_tracking_perf_test.c +++ b/tools/testing/selftests/kvm/access_tracking_perf_test.c @@ -41,10 +41,10 @@ #include <inttypes.h> #include <limits.h> #include <pthread.h> -#include <sys/mman.h> #include <sys/types.h> #include <sys/stat.h> +#include "kvm_syscalls.h" #include "kvm_util.h" #include "test_util.h" #include "memstress.h" diff --git a/tools/testing/selftests/kvm/arm64/no-vgic.c b/tools/testing/selftests/kvm/arm64/no-vgic.c index 25b2e3222f68..ab57902ce429 100644 --- a/tools/testing/selftests/kvm/arm64/no-vgic.c +++ b/tools/testing/selftests/kvm/arm64/no-vgic.c @@ -159,6 +159,7 @@ static void guest_code_gicv5(void) check_gicv5_gic_op(CDAFF); check_gicv5_gic_op(CDDI); check_gicv5_gic_op(CDDIS); + check_gicv5_gic_op(CDEN); check_gicv5_gic_op(CDEOI); check_gicv5_gic_op(CDHM); check_gicv5_gic_op(CDPEND); diff --git a/tools/testing/selftests/kvm/arm64/vgic_v5.c b/tools/testing/selftests/kvm/arm64/vgic_v5.c index 
d785b660d847..96cfd6bb32f6 100644 --- a/tools/testing/selftests/kvm/arm64/vgic_v5.c +++ b/tools/testing/selftests/kvm/arm64/vgic_v5.c @@ -20,8 +20,6 @@ struct vm_gic { u32 gic_dev_type; }; -static u64 max_phys_size; - #define GUEST_CMD_IRQ_CDIA 10 #define GUEST_CMD_IRQ_DIEOI 11 #define GUEST_CMD_IS_AWAKE 12 @@ -131,6 +129,8 @@ static void test_vgic_v5_ppis(u32 gic_dev_type) while (1) { ret = run_vcpu(vcpus[0]); + if (ret) + break; switch (get_ucall(vcpus[0], &uc)) { case UCALL_SYNC: @@ -146,7 +146,7 @@ static void test_vgic_v5_ppis(u32 gic_dev_type) irq = FIELD_PREP(KVM_ARM_IRQ_NUM_MASK, 3); irq |= KVM_ARM_IRQ_TYPE_PPI << KVM_ARM_IRQ_TYPE_SHIFT; - _kvm_irq_line(v.vm, irq, level); + kvm_irq_line(v.vm, irq, level); } else if (uc.args[1] == GUEST_CMD_IS_AWAKE) { pr_info("Guest skipping WFI due to pending IRQ\n"); } else if (uc.args[1] == GUEST_CMD_IRQ_CDIA) { @@ -208,13 +208,9 @@ void run_tests(u32 gic_dev_type) int main(int ac, char **av) { int ret; - int pa_bits; test_disable_default_vgic(); - pa_bits = vm_guest_mode_params[VM_MODE_DEFAULT].pa_bits; - max_phys_size = 1ULL << pa_bits; - ret = test_kvm_device(KVM_DEV_TYPE_ARM_VGIC_V5); if (ret) { pr_info("No GICv5 support; Not running GIC_v5 tests.\n"); diff --git a/tools/testing/selftests/kvm/dirty_log_test.c b/tools/testing/selftests/kvm/dirty_log_test.c index 12446a4b6e8d..74ca096bf976 100644 --- a/tools/testing/selftests/kvm/dirty_log_test.c +++ b/tools/testing/selftests/kvm/dirty_log_test.c @@ -694,7 +694,17 @@ static void run_test(enum vm_guest_mode mode, void *arg) pthread_create(&vcpu_thread, NULL, vcpu_worker, vcpu); for (iteration = 1; iteration <= p->iterations; iteration++) { - unsigned long i; + unsigned long i, reap_i; + + /* + * Select a random point in the time interval to reap the dirty + * bitmap/ring while the guest is running, i.e. randomize how + * long the guest gets to initially run and thus how many pages + * it can dirty, before collecting the dirty bitmap/ring. 
See + * the loop below for details. + */ + reap_i = random() % p->interval; + printf("Reaping after a %lu ms delay\n", reap_i); sync_global_to_guest(vm, iteration); @@ -729,13 +739,17 @@ static void run_test(enum vm_guest_mode mode, void *arg) * that's effectively blocked. Collecting while the * guest is running also verifies KVM doesn't lose any * state. - * + */ + if (i < reap_i) + continue; + + /* * For bitmap modes, KVM overwrites the entire bitmap, * i.e. collecting the bitmaps is destructive. Collect - * the bitmap only on the first pass, otherwise this - * test would lose track of dirty pages. + * the bitmap while the guest is running only once, + * otherwise this test would lose track of dirty pages. */ - if (i && host_log_mode != LOG_MODE_DIRTY_RING) + if (i > reap_i && host_log_mode != LOG_MODE_DIRTY_RING) continue; /* @@ -745,7 +759,7 @@ static void run_test(enum vm_guest_mode mode, void *arg) * the ring on every pass would make it unlikely the * vCPU would ever fill the fing). 
*/ - if (i && !READ_ONCE(dirty_ring_vcpu_ring_full)) + if (i > reap_i && !READ_ONCE(dirty_ring_vcpu_ring_full)) continue; log_mode_collect_dirty_pages(vcpu, TEST_MEM_SLOT_INDEX, diff --git a/tools/testing/selftests/kvm/guest_memfd_test.c b/tools/testing/selftests/kvm/guest_memfd_test.c index d6528c6f5e03..2233d871a38f 100644 --- a/tools/testing/selftests/kvm/guest_memfd_test.c +++ b/tools/testing/selftests/kvm/guest_memfd_test.c @@ -14,10 +14,10 @@ #include <linux/bitmap.h> #include <linux/falloc.h> #include <linux/sizes.h> -#include <sys/mman.h> #include <sys/types.h> #include <sys/stat.h> +#include "kvm_syscalls.h" #include "kvm_util.h" #include "numaif.h" #include "test_util.h" @@ -345,6 +345,16 @@ static void test_invalid_punch_hole(int fd, size_t total_size) } } +static void test_invalid_binding(struct kvm_vm *vm, int fd, size_t size) +{ + int r; + + r = __vm_set_user_memory_region2(vm, 0, KVM_MEM_GUEST_MEMFD, 0, size, 0, + fd, ALIGN_DOWN(INT64_MAX, page_size)); + TEST_ASSERT(r && errno == EINVAL, + "Memslot with out-of-range offset+size should fail"); +} + static void test_create_guest_memfd_invalid_sizes(struct kvm_vm *vm, u64 guest_memfd_flags) { @@ -408,17 +418,26 @@ static void test_guest_memfd_flags(struct kvm_vm *vm) } } -#define __gmem_test(__test, __vm, __flags, __gmem_size) \ +#define ____gmem_test(__test, __vm, __flags, __gmem_size, args...) 
\ do { \ int fd = vm_create_guest_memfd(__vm, __gmem_size, __flags); \ \ - test_##__test(fd, __gmem_size); \ + test_##__test(args); \ close(fd); \ } while (0) +#define __gmem_test(__test, __vm, __flags, __gmem_size) \ + ____gmem_test(__test, __vm, __flags, __gmem_size, fd, __gmem_size) + #define gmem_test(__test, __vm, __flags) \ __gmem_test(__test, __vm, __flags, page_size * 4) +#define __gmem_test_vm(__test, __vm, __flags, __gmem_size) \ + ____gmem_test(__test, __vm, __flags, __gmem_size, __vm, fd, __gmem_size) + +#define gmem_test_vm(__test, __vm, __flags) \ + __gmem_test_vm(__test, __vm, __flags, page_size * 4) + static void __test_guest_memfd(struct kvm_vm *vm, u64 flags) { test_create_guest_memfd_multiple(vm); @@ -447,6 +466,7 @@ static void __test_guest_memfd(struct kvm_vm *vm, u64 flags) gmem_test(file_size, vm, flags); gmem_test(fallocate, vm, flags); gmem_test(invalid_punch_hole, vm, flags); + gmem_test_vm(invalid_binding, vm, flags); } static void test_guest_memfd(unsigned long vm_type) @@ -510,7 +530,12 @@ static void test_guest_memfd_guest(void) "Default VM type should support INIT_SHARED, supported flags = 0x%x", vm_check_cap(vm, KVM_CAP_GUEST_MEMFD_FLAGS)); - size = vm->page_size; + /* + * Use the max of the host or guest page size for all operations, as + * KVM requires guest_memfd files and memslots to be sized to multiples + * of the host page size. 
+ */ + size = max_t(size_t, vm->page_size, page_size); fd = vm_create_guest_memfd(vm, size, GUEST_MEMFD_FLAG_MMAP | GUEST_MEMFD_FLAG_INIT_SHARED); vm_set_user_memory_region2(vm, slot, KVM_MEM_GUEST_MEMFD, gpa, size, NULL, fd, 0); @@ -519,7 +544,7 @@ static void test_guest_memfd_guest(void) memset(mem, 0xaa, size); kvm_munmap(mem, size); - virt_pg_map(vm, gpa, gpa); + virt_map(vm, gpa, gpa, size / vm->page_size); vcpu_args_set(vcpu, 2, gpa, size); vcpu_run(vcpu); diff --git a/tools/testing/selftests/kvm/include/kvm_syscalls.h b/tools/testing/selftests/kvm/include/kvm_syscalls.h index 843c9904c46f..6cb3bed29b81 100644 --- a/tools/testing/selftests/kvm/include/kvm_syscalls.h +++ b/tools/testing/selftests/kvm/include/kvm_syscalls.h @@ -2,8 +2,18 @@ #ifndef SELFTEST_KVM_SYSCALLS_H #define SELFTEST_KVM_SYSCALLS_H +/* + * Include both the kernel and libc versions of mman.h. The kernel provides + * the most up-to-date flags and definitions, while libc provides the syscall + * wrappers tests expect. + */ +#include <linux/mman.h> + +#include <sys/mman.h> #include <sys/syscall.h> +#include <test_util.h> + #define MAP_ARGS0(m,...) #define MAP_ARGS1(m,t,a,...) m(t,a) #define MAP_ARGS2(m,t,a,...) 
m(t,a), MAP_ARGS1(m,__VA_ARGS__) @@ -79,4 +89,10 @@ __KVM_SYSCALL_DEFINE(fallocate, 4, int, fd, int, mode, loff_t, offset, loff_t, l __KVM_SYSCALL_DEFINE(ftruncate, 2, unsigned int, fd, off_t, length); __KVM_SYSCALL_DEFINE(madvise, 3, void *, addr, size_t, length, int, advice); +#define kvm_free_fd(fd) \ +do { \ + kvm_close(fd); \ + (fd) = -1; \ +} while (0) + #endif /* SELFTEST_KVM_SYSCALLS_H */ diff --git a/tools/testing/selftests/kvm/include/kvm_util.h b/tools/testing/selftests/kvm/include/kvm_util.h index 2ecaaa0e9965..04a910164a29 100644 --- a/tools/testing/selftests/kvm/include/kvm_util.h +++ b/tools/testing/selftests/kvm/include/kvm_util.h @@ -876,7 +876,7 @@ static inline int vcpu_get_stats_fd(struct kvm_vcpu *vcpu) { int fd = __vcpu_ioctl(vcpu, KVM_GET_STATS_FD, NULL); - TEST_ASSERT_VM_VCPU_IOCTL(fd >= 0, KVM_CHECK_EXTENSION, fd, vcpu->vm); + TEST_ASSERT_VM_VCPU_IOCTL(fd >= 0, KVM_GET_STATS_FD, fd, vcpu->vm); return fd; } diff --git a/tools/testing/selftests/kvm/include/lru_gen_util.h b/tools/testing/selftests/kvm/include/lru_gen_util.h index d32ff5d8ffd0..49c8139d398c 100644 --- a/tools/testing/selftests/kvm/include/lru_gen_util.h +++ b/tools/testing/selftests/kvm/include/lru_gen_util.h @@ -14,7 +14,7 @@ #include "test_util.h" #define MAX_NR_GENS 16 /* MAX_NR_GENS in include/linux/mmzone.h */ -#define MAX_NR_NODES 4 /* Maximum number of nodes supported by the test */ +#define MAX_NR_NODES 32 /* Maximum number of nodes supported by the test */ #define LRU_GEN_DEBUGFS "/sys/kernel/debug/lru_gen" #define LRU_GEN_ENABLED_PATH "/sys/kernel/mm/lru_gen/enabled" diff --git a/tools/testing/selftests/kvm/include/s390/facility.h b/tools/testing/selftests/kvm/include/s390/facility.h index 41a265742666..e5259f63be22 100644 --- a/tools/testing/selftests/kvm/include/s390/facility.h +++ b/tools/testing/selftests/kvm/include/s390/facility.h @@ -11,6 +11,7 @@ #ifndef SELFTEST_KVM_FACILITY_H #define SELFTEST_KVM_FACILITY_H +#include <linux/atomic.h> #include 
<linux/bitops.h> /* alt_stfle_fac_list[16] + stfle_fac_list[16] */ @@ -19,6 +20,11 @@ extern u64 stfl_doublewords[NB_STFL_DOUBLEWORDS]; extern bool stfle_flag; +static inline bool clear_bit_inv(unsigned long nr, unsigned long *ptr) +{ + return clear_bit(nr ^ (BITS_PER_LONG - 1), ptr); +} + static inline bool test_bit_inv(unsigned long nr, const unsigned long *ptr) { return test_bit(nr ^ (BITS_PER_LONG - 1), ptr); diff --git a/tools/testing/selftests/kvm/include/test_util.h b/tools/testing/selftests/kvm/include/test_util.h index d9b433b834f1..a56271c237ae 100644 --- a/tools/testing/selftests/kvm/include/test_util.h +++ b/tools/testing/selftests/kvm/include/test_util.h @@ -19,9 +19,9 @@ #include <errno.h> #include <unistd.h> #include <fcntl.h> -#include <sys/mman.h> #include "kselftest.h" +#include <linux/mman.h> #include <linux/types.h> #define msecs_to_usecs(msec) ((msec) * 1000ULL) diff --git a/tools/testing/selftests/kvm/include/x86/processor.h b/tools/testing/selftests/kvm/include/x86/processor.h index 77f576ee7789..513e4a1075fa 100644 --- a/tools/testing/selftests/kvm/include/x86/processor.h +++ b/tools/testing/selftests/kvm/include/x86/processor.h @@ -38,7 +38,24 @@ extern u64 guest_tsc_khz; const char *ex_str(int vector); -#define X86_EFLAGS_FIXED (1u << 1) +#define X86_EFLAGS_CF BIT(0) /* Carry Flag */ +#define X86_EFLAGS_FIXED BIT(1) /* Bit 1 - always on */ +#define X86_EFLAGS_PF BIT(2) /* Parity Flag */ +#define X86_EFLAGS_AF BIT(4) /* Auxiliary carry Flag */ +#define X86_EFLAGS_ZF BIT(6) /* Zero Flag */ +#define X86_EFLAGS_SF BIT(7) /* Sign Flag */ +#define X86_EFLAGS_TF BIT(8) /* Trap Flag */ +#define X86_EFLAGS_IF BIT(9) /* Interrupt Flag */ +#define X86_EFLAGS_DF BIT(10) /* Direction Flag */ +#define X86_EFLAGS_OF BIT(11) /* Overflow Flag */ +#define X86_EFLAGS_IOPL BIT(12) /* I/O Privilege Level (2 bits) */ +#define X86_EFLAGS_NT BIT(14) /* Nested Task */ +#define X86_EFLAGS_RF BIT(16) /* Resume Flag */ +#define X86_EFLAGS_VM BIT(17) /* Virtual Mode 
*/ +#define X86_EFLAGS_AC BIT(18) /* Alignment Check/Access Control */ +#define X86_EFLAGS_VIF BIT(19) /* Virtual Interrupt Flag */ +#define X86_EFLAGS_VIP BIT(20) /* Virtual Interrupt Pending */ +#define X86_EFLAGS_ID BIT(21) /* CPUID detection */ #define X86_CR4_VME (1ul << 0) #define X86_CR4_PVI (1ul << 1) @@ -209,6 +226,7 @@ struct kvm_x86_cpu_feature { #define X86_FEATURE_SEV KVM_X86_CPU_FEATURE(0x8000001F, 0, EAX, 1) #define X86_FEATURE_SEV_ES KVM_X86_CPU_FEATURE(0x8000001F, 0, EAX, 3) #define X86_FEATURE_SEV_SNP KVM_X86_CPU_FEATURE(0x8000001F, 0, EAX, 4) +#define X86_FEATURE_GP_ON_USER_CPUID KVM_X86_CPU_FEATURE(0x80000021, 0, EAX, 17) #define X86_FEATURE_PERFMON_V2 KVM_X86_CPU_FEATURE(0x80000022, 0, EAX, 0) #define X86_FEATURE_LBR_PMC_FREEZE KVM_X86_CPU_FEATURE(0x80000022, 0, EAX, 2) @@ -1556,6 +1574,15 @@ u64 *tdp_get_pte(struct kvm_vm *vm, u64 l2_gpa); #define PFERR_GUEST_PAGE_MASK BIT_ULL(PFERR_GUEST_PAGE_BIT) #define PFERR_IMPLICIT_ACCESS BIT_ULL(PFERR_IMPLICIT_ACCESS_BIT) +#define EPT_VIOLATION_ACC_READ BIT(0) +#define EPT_VIOLATION_ACC_WRITE BIT(1) +#define EPT_VIOLATION_ACC_INSTR BIT(2) +#define EPT_VIOLATION_PROT_READ BIT(3) +#define EPT_VIOLATION_PROT_WRITE BIT(4) +#define EPT_VIOLATION_PROT_EXEC BIT(5) +#define EPT_VIOLATION_GVA_IS_VALID BIT(7) +#define EPT_VIOLATION_GVA_TRANSLATED BIT(8) + bool sys_clocksource_is_based_on_tsc(void); #endif /* SELFTEST_KVM_PROCESSOR_H */ diff --git a/tools/testing/selftests/kvm/include/x86/sev.h b/tools/testing/selftests/kvm/include/x86/sev.h index 1af44c151d60..dec383e59a47 100644 --- a/tools/testing/selftests/kvm/include/x86/sev.h +++ b/tools/testing/selftests/kvm/include/x86/sev.h @@ -144,4 +144,28 @@ static inline void snp_launch_update_data(struct kvm_vm *vm, gpa_t gpa, vm_sev_ioctl(vm, KVM_SEV_SNP_LAUNCH_UPDATE, &update_data); } +static inline void sev_dbg_crypt_memory(struct kvm_vm *vm, unsigned int cmd, + void *dst, void *src, unsigned int len) +{ + struct kvm_sev_dbg dbg = { + .src_uaddr = (unsigned 
long)src, + .dst_uaddr = (unsigned long)dst, + .len = len, + }; + + vm_sev_ioctl(vm, cmd, &dbg); +} + +static inline void sev_decrypt_memory(struct kvm_vm *vm, void *dst, void *src, + unsigned int len) +{ + sev_dbg_crypt_memory(vm, KVM_SEV_DBG_DECRYPT, dst, src, len); +} + +static inline void sev_encrypt_memory(struct kvm_vm *vm, void *dst, void *src, + unsigned int len) +{ + sev_dbg_crypt_memory(vm, KVM_SEV_DBG_ENCRYPT, dst, src, len); +} + #endif /* SELFTEST_KVM_SEV_H */ diff --git a/tools/testing/selftests/kvm/kvm_page_table_test.c b/tools/testing/selftests/kvm/kvm_page_table_test.c index fc5242fb956f..a910e3abb8c7 100644 --- a/tools/testing/selftests/kvm/kvm_page_table_test.c +++ b/tools/testing/selftests/kvm/kvm_page_table_test.c @@ -230,6 +230,7 @@ struct test_params { u64 phys_offset; u64 test_mem_size; enum vm_mem_backing_src_type src_type; + bool misalign_slot_gpa; }; static struct kvm_vm *pre_init_before_test(enum vm_guest_mode mode, void *arg) @@ -244,6 +245,7 @@ static struct kvm_vm *pre_init_before_test(enum vm_guest_mode mode, void *arg) u64 guest_num_pages; u64 alignment; void *host_test_mem; + struct userspace_mem_region *region; struct kvm_vm *vm; /* Align up the test memory size */ @@ -276,13 +278,22 @@ static struct kvm_vm *pre_init_before_test(enum vm_guest_mode mode, void *arg) /* Add an extra memory slot with specified backing src type */ vm_userspace_mem_region_add(vm, src_type, guest_test_phys_mem, TEST_MEM_SLOT_INDEX, guest_num_pages, 0); + region = memslot2region(vm, TEST_MEM_SLOT_INDEX); + host_test_mem = region->host_mem; + + if (p->misalign_slot_gpa) { + TEST_ASSERT(is_backing_src_hugetlb(src_type), + "Memslot GPA misalignment requires hugetlb backing"); + TEST_ASSERT(guest_num_pages > 1, + "Need at least two guest pages to misalign memslot GPA"); + + guest_test_phys_mem += guest_page_size; + vm_mem_region_move(vm, TEST_MEM_SLOT_INDEX, guest_test_phys_mem); + } /* Do mapping(GVA->GPA) for the testing memory slot */ virt_map(vm, 
guest_test_virt_mem, guest_test_phys_mem, guest_num_pages); - /* Cache the HVA pointer of the region */ - host_test_mem = addr_gpa2hva(vm, (gpa_t)guest_test_phys_mem); - /* Export shared structure test_args to guest */ sync_global_to_guest(vm, test_args); @@ -417,8 +428,8 @@ static void run_test(enum vm_guest_mode mode, void *arg) static void help(char *name) { puts(""); - printf("usage: %s [-h] [-p offset] [-m mode] " - "[-b mem-size] [-v vcpus] [-s mem-type]\n", name); + printf("usage: %s [-h] [-p offset] [-m mode] [-b mem-size]\n", name); + printf(" [-v vcpus] [-s mem-type] [-u]\n"); puts(""); printf(" -p: specify guest physical test memory offset\n" " Warning: a low offset can conflict with the loaded test code.\n"); @@ -428,6 +439,8 @@ static void help(char *name) printf(" -v: specify the number of vCPUs to run\n" " (default: 1)\n"); backing_src_help("-s"); + printf(" -u: move the test memslot GPA by one guest page after creating\n" + " the memslot, forcing a hugetlb HVA/GPA offset mismatch\n"); puts(""); } @@ -442,7 +455,7 @@ int main(int argc, char *argv[]) guest_modes_append_default(); - while ((opt = getopt(argc, argv, "hp:m:b:v:s:")) != -1) { + while ((opt = getopt(argc, argv, "hp:m:b:v:s:u")) != -1) { switch (opt) { case 'p': p.phys_offset = strtoull(optarg, NULL, 0); @@ -461,6 +474,9 @@ int main(int argc, char *argv[]) case 's': p.src_type = parse_backing_src_type(optarg); break; + case 'u': + p.misalign_slot_gpa = true; + break; case 'h': default: help(argv[0]); diff --git a/tools/testing/selftests/kvm/lib/assert.c b/tools/testing/selftests/kvm/lib/assert.c index b49690658c60..8be0d09ecf0f 100644 --- a/tools/testing/selftests/kvm/lib/assert.c +++ b/tools/testing/selftests/kvm/lib/assert.c @@ -6,11 +6,14 @@ */ #include "test_util.h" -#include <execinfo.h> + #include <sys/syscall.h> #include "kselftest.h" +#ifdef __GLIBC__ +#include <execinfo.h> + /* Dumps the current stack trace to stderr. 
*/ static void __attribute__((noinline)) test_dump_stack(void); static void test_dump_stack(void) @@ -57,6 +60,9 @@ static void test_dump_stack(void) system(cmd); #pragma GCC diagnostic pop } +#else +static void test_dump_stack(void) {} +#endif static pid_t _gettid(void) { diff --git a/tools/testing/selftests/kvm/lib/elf.c b/tools/testing/selftests/kvm/lib/elf.c index b689c4df4a01..1924a9895834 100644 --- a/tools/testing/selftests/kvm/lib/elf.c +++ b/tools/testing/selftests/kvm/lib/elf.c @@ -7,7 +7,7 @@ #include "test_util.h" -#include <bits/endian.h> +#include <endian.h> #include <linux/elf.h> #include "kvm_util.h" diff --git a/tools/testing/selftests/kvm/lib/kvm_util.c b/tools/testing/selftests/kvm/lib/kvm_util.c index 2a76eca7029d..195f3fdae1e3 100644 --- a/tools/testing/selftests/kvm/lib/kvm_util.c +++ b/tools/testing/selftests/kvm/lib/kvm_util.c @@ -5,13 +5,13 @@ * Copyright (C) 2018, Google LLC. */ #include "test_util.h" +#include "kvm_syscalls.h" #include "kvm_util.h" #include "processor.h" #include "ucall_common.h" #include <assert.h> #include <sched.h> -#include <sys/mman.h> #include <sys/resource.h> #include <sys/types.h> #include <sys/stat.h> @@ -77,7 +77,8 @@ static ssize_t get_module_param(const char *module_name, const char *param, int fd, r; /* Verify KVM is loaded, to provide a more helpful SKIP message. 
*/ - close(open_kvm_dev_path_or_exit()); + fd = open_kvm_dev_path_or_exit(); + kvm_free_fd(fd); r = snprintf(path, path_size, "/sys/module/%s/parameters/%s", module_name, param); @@ -90,8 +91,7 @@ static ssize_t get_module_param(const char *module_name, const char *param, TEST_ASSERT(bytes_read > 0, "read(%s) returned %ld, wanted %ld bytes", path, bytes_read, buffer_size); - r = close(fd); - TEST_ASSERT(!r, "close(%s) failed", path); + kvm_free_fd(fd); return bytes_read; } @@ -160,7 +160,7 @@ unsigned int kvm_check_cap(long cap) ret = __kvm_ioctl(kvm_fd, KVM_CHECK_EXTENSION, (void *)cap); TEST_ASSERT(ret >= 0, KVM_IOCTL_ERROR(KVM_CHECK_EXTENSION, ret)); - close(kvm_fd); + kvm_free_fd(kvm_fd); return (unsigned int)ret; } @@ -747,8 +747,7 @@ static void kvm_stats_release(struct kvm_binary_stats *stats) stats->desc = NULL; } - kvm_close(stats->fd); - stats->fd = -1; + kvm_free_fd(stats->fd); } __weak void vcpu_arch_free(struct kvm_vcpu *vcpu) @@ -777,7 +776,7 @@ static void vm_vcpu_rm(struct kvm_vm *vm, struct kvm_vcpu *vcpu) kvm_munmap(vcpu->run, vcpu_mmap_sz()); - kvm_close(vcpu->fd); + kvm_free_fd(vcpu->fd); kvm_stats_release(&vcpu->stats); list_del(&vcpu->list); @@ -793,8 +792,8 @@ void kvm_vm_release(struct kvm_vm *vmp) list_for_each_entry_safe(vcpu, tmp, &vmp->vcpus, list) vm_vcpu_rm(vmp, vcpu); - kvm_close(vmp->fd); - kvm_close(vmp->kvm_fd); + kvm_free_fd(vmp->fd); + kvm_free_fd(vmp->kvm_fd); /* Free cached stats metadata and close FD */ kvm_stats_release(&vmp->stats); @@ -815,10 +814,10 @@ static void __vm_mem_region_delete(struct kvm_vm *vm, if (region->fd >= 0) { /* There's an extra map when using shared memory. 
*/ kvm_munmap(region->mmap_alias, region->mmap_size); - close(region->fd); + kvm_free_fd(region->fd); } - if (region->region.guest_memfd >= 0) - close(region->region.guest_memfd); + if ((int)region->region.guest_memfd >= 0) + kvm_free_fd(region->region.guest_memfd); free(region); } @@ -1311,7 +1310,7 @@ static size_t vcpu_mmap_sz(void) TEST_ASSERT(ret >= 0 && ret >= sizeof(struct kvm_run), KVM_IOCTL_ERROR(KVM_GET_VCPU_MMAP_SIZE, ret)); - close(dev_fd); + kvm_free_fd(dev_fd); return ret; } diff --git a/tools/testing/selftests/kvm/lib/x86/processor.c b/tools/testing/selftests/kvm/lib/x86/processor.c index b51467d70f6e..4ca48de7a926 100644 --- a/tools/testing/selftests/kvm/lib/x86/processor.c +++ b/tools/testing/selftests/kvm/lib/x86/processor.c @@ -848,7 +848,7 @@ struct kvm_vcpu *vm_arch_vcpu_add(struct kvm_vm *vm, u32 vcpu_id) /* Setup guest general purpose registers */ vcpu_regs_get(vcpu, ®s); - regs.rflags = regs.rflags | 0x2; + regs.rflags = regs.rflags | X86_EFLAGS_FIXED; regs.rsp = stack_gva; vcpu_regs_set(vcpu, ®s); diff --git a/tools/testing/selftests/kvm/lib/x86/vmx.c b/tools/testing/selftests/kvm/lib/x86/vmx.c index 67642759e4a0..7c10ba6e6fb4 100644 --- a/tools/testing/selftests/kvm/lib/x86/vmx.c +++ b/tools/testing/selftests/kvm/lib/x86/vmx.c @@ -360,7 +360,7 @@ static inline void init_vmcs_guest_state(void *rip, void *rsp) vmwrite(GUEST_DR7, 0x400); vmwrite(GUEST_RSP, (u64)rsp); vmwrite(GUEST_RIP, (u64)rip); - vmwrite(GUEST_RFLAGS, 2); + vmwrite(GUEST_RFLAGS, X86_EFLAGS_FIXED); vmwrite(GUEST_PENDING_DBG_EXCEPTIONS, 0); vmwrite(GUEST_SYSENTER_ESP, vmreadz(HOST_IA32_SYSENTER_ESP)); vmwrite(GUEST_SYSENTER_EIP, vmreadz(HOST_IA32_SYSENTER_EIP)); diff --git a/tools/testing/selftests/kvm/memslot_perf_test.c b/tools/testing/selftests/kvm/memslot_perf_test.c index 3d02db371422..4d9ad6104a6e 100644 --- a/tools/testing/selftests/kvm/memslot_perf_test.c +++ b/tools/testing/selftests/kvm/memslot_perf_test.c @@ -15,7 +15,6 @@ #include <stdio.h> #include <stdlib.h> 
#include <string.h> -#include <sys/mman.h> #include <time.h> #include <unistd.h> @@ -23,6 +22,7 @@ #include <linux/sizes.h> #include <test_util.h> +#include <kvm_syscalls.h> #include <kvm_util.h> #include <processor.h> #include <ucall_common.h> @@ -111,6 +111,7 @@ struct sync_area { */ static_assert(ATOMIC_BOOL_LOCK_FREE == 2, "atomic bool is not lockless"); +static int wait_timeout = 10; static sem_t vcpu_ready; static bool map_unmap_verify; @@ -418,7 +419,7 @@ static bool _guest_should_exit(void) */ static noinline void host_perform_sync(struct sync_area *sync) { - alarm(10); + alarm(wait_timeout); atomic_store_explicit(&sync->sync_flag, true, memory_order_release); while (atomic_load_explicit(&sync->sync_flag, memory_order_acquire)) @@ -900,7 +901,7 @@ static void help(char *name, struct test_args *targs) { int ctr; - pr_info("usage: %s [-h] [-v] [-d] [-s slots] [-f first_test] [-e last_test] [-l test_length] [-r run_count]\n", + pr_info("usage: %s [-h] [-v] [-d] [-s slots] [-f first_test] [-e last_test] [-l test_length] [-r run_count] [-t wait_timeout]\n", name); pr_info(" -h: print this help screen.\n"); pr_info(" -v: enable verbose mode (not for benchmarking).\n"); @@ -916,6 +917,8 @@ static void help(char *name, struct test_args *targs) targs->seconds); pr_info(" -r: specify the number of runs per test (currently: %i)\n", targs->runs); + pr_info(" -t: specify the number of seconds for host wait timeout (currently: %i)\n", + wait_timeout); pr_info("\nAvailable tests:\n"); for (ctr = 0; ctr < NTESTS; ctr++) @@ -964,7 +967,7 @@ static bool parse_args(int argc, char *argv[], u32 max_mem_slots; int opt; - while ((opt = getopt(argc, argv, "hvdqs:f:e:l:r:")) != -1) { + while ((opt = getopt(argc, argv, "hvdqs:f:e:l:r:t:")) != -1) { switch (opt) { case 'h': default: @@ -1007,6 +1010,9 @@ static bool parse_args(int argc, char *argv[], case 'r': targs->runs = atoi_positive("Runs per test", optarg); break; + case 't': + wait_timeout = atoi_positive("Host wait timeout", 
optarg); + break; } } diff --git a/tools/testing/selftests/kvm/pre_fault_memory_test.c b/tools/testing/selftests/kvm/pre_fault_memory_test.c index fcb57fd034e6..a0fcae3cb7a8 100644 --- a/tools/testing/selftests/kvm/pre_fault_memory_test.c +++ b/tools/testing/selftests/kvm/pre_fault_memory_test.c @@ -11,6 +11,7 @@ #include <kvm_util.h> #include <processor.h> #include <pthread.h> +#include <ucall_common.h> /* Arbitrarily chosen values */ #define TEST_SIZE (SZ_2M + PAGE_SIZE) @@ -167,7 +168,6 @@ static void __test_pre_fault_memory(unsigned long vm_type, bool private) .type = vm_type, }; struct kvm_vcpu *vcpu; - struct kvm_run *run; struct kvm_vm *vm; struct ucall uc; @@ -193,11 +193,6 @@ static void __test_pre_fault_memory(unsigned long vm_type, bool private) vcpu_args_set(vcpu, 1, gva); vcpu_run(vcpu); - run = vcpu->run; - TEST_ASSERT(run->exit_reason == KVM_EXIT_IO, - "Wanted KVM_EXIT_IO, got exit reason: %u (%s)", - run->exit_reason, exit_reason_str(run->exit_reason)); - switch (get_ucall(vcpu, &uc)) { case UCALL_ABORT: REPORT_GUEST_ASSERT(uc); diff --git a/tools/testing/selftests/kvm/riscv/get-reg-list.c b/tools/testing/selftests/kvm/riscv/get-reg-list.c index 8d6fdb5d38b8..cb86cb6b3635 100644 --- a/tools/testing/selftests/kvm/riscv/get-reg-list.c +++ b/tools/testing/selftests/kvm/riscv/get-reg-list.c @@ -27,6 +27,7 @@ enum { }; static bool isa_ext_cant_disable[KVM_RISCV_ISA_EXT_MAX]; +static bool sbi_ext_enabled[KVM_RISCV_SBI_EXT_MAX]; bool filter_reg(__u64 reg) { @@ -149,6 +150,14 @@ bool filter_reg(__u64 reg) case KVM_REG_RISCV_CSR | KVM_REG_RISCV_CSR_AIA | KVM_REG_RISCV_CSR_AIA_REG(iprio1h): case KVM_REG_RISCV_CSR | KVM_REG_RISCV_CSR_AIA | KVM_REG_RISCV_CSR_AIA_REG(iprio2h): return isa_ext_cant_disable[KVM_RISCV_ISA_EXT_SSAIA]; + /* + * FWFT misaligned delegation registers are always visible when the SBI FWFT + * extension is enable and the host supports the misaligned delegation. 
+ */ + case KVM_REG_RISCV_SBI_STATE | KVM_REG_RISCV_SBI_FWFT | KVM_REG_RISCV_SBI_FWFT_REG(misaligned_deleg.enable): + case KVM_REG_RISCV_SBI_STATE | KVM_REG_RISCV_SBI_FWFT | KVM_REG_RISCV_SBI_FWFT_REG(misaligned_deleg.flags): + case KVM_REG_RISCV_SBI_STATE | KVM_REG_RISCV_SBI_FWFT | KVM_REG_RISCV_SBI_FWFT_REG(misaligned_deleg.value): + return sbi_ext_enabled[KVM_RISCV_SBI_EXT_FWFT]; default: break; } @@ -193,6 +202,27 @@ static int override_vector_reg_size(struct kvm_vcpu *vcpu, struct vcpu_reg_subli return 0; } +void check_fwft_feature(struct kvm_vcpu *vcpu, struct vcpu_reg_sublist *s, u64 feature) +{ + unsigned long value; + int rc; + + /* Enable SBI FWFT extension so that we can check the supported register */ + rc = __vcpu_set_reg(vcpu, feature, 1); + if (rc) + return; + + for (int i = 0; i < s->regs_n; i++) { + if ((s->regs[i] & KVM_REG_RISCV_TYPE_MASK) == KVM_REG_RISCV_SBI_STATE) { + rc = __vcpu_get_reg(vcpu, s->regs[i], &value); + __TEST_REQUIRE(!rc, "%s not available, skipping tests", s->name); + } + } + + /* We should assert if disabling failed here while enabling succeeded before */ + vcpu_set_reg(vcpu, feature, 0); +} + void finalize_vcpu(struct kvm_vcpu *vcpu, struct vcpu_reg_list *c) { unsigned long isa_ext_state[KVM_RISCV_ISA_EXT_MAX] = { 0 }; @@ -235,6 +265,9 @@ void finalize_vcpu(struct kvm_vcpu *vcpu, struct vcpu_reg_list *c) break; case VCPU_FEATURE_SBI_EXT: feature = RISCV_SBI_EXT_REG(s->feature); + if (s->feature == KVM_RISCV_SBI_EXT_FWFT) + check_fwft_feature(vcpu, s, feature); + sbi_ext_enabled[s->feature] = true; break; default: TEST_FAIL("Unknown feature type"); @@ -897,11 +930,15 @@ static __u64 sbi_sta_regs[] = { KVM_REG_RISCV | KVM_REG_SIZE_ULONG | KVM_REG_RISCV_SBI_STATE | KVM_REG_RISCV_SBI_STA | KVM_REG_RISCV_SBI_STA_REG(shmem_hi), }; -static __u64 sbi_fwft_regs[] = { +static __u64 sbi_fwft_misaligned_deleg_regs[] = { KVM_REG_RISCV | KVM_REG_SIZE_ULONG | KVM_REG_RISCV_SBI_EXT | KVM_REG_RISCV_SBI_SINGLE | KVM_RISCV_SBI_EXT_FWFT, 
KVM_REG_RISCV | KVM_REG_SIZE_ULONG | KVM_REG_RISCV_SBI_STATE | KVM_REG_RISCV_SBI_FWFT | KVM_REG_RISCV_SBI_FWFT_REG(misaligned_deleg.enable), KVM_REG_RISCV | KVM_REG_SIZE_ULONG | KVM_REG_RISCV_SBI_STATE | KVM_REG_RISCV_SBI_FWFT | KVM_REG_RISCV_SBI_FWFT_REG(misaligned_deleg.flags), KVM_REG_RISCV | KVM_REG_SIZE_ULONG | KVM_REG_RISCV_SBI_STATE | KVM_REG_RISCV_SBI_FWFT | KVM_REG_RISCV_SBI_FWFT_REG(misaligned_deleg.value), +}; + +static __u64 sbi_fwft_pointer_masking_regs[] = { + KVM_REG_RISCV | KVM_REG_SIZE_ULONG | KVM_REG_RISCV_SBI_EXT | KVM_REG_RISCV_SBI_SINGLE | KVM_RISCV_SBI_EXT_FWFT, KVM_REG_RISCV | KVM_REG_SIZE_ULONG | KVM_REG_RISCV_SBI_STATE | KVM_REG_RISCV_SBI_FWFT | KVM_REG_RISCV_SBI_FWFT_REG(pointer_masking.enable), KVM_REG_RISCV | KVM_REG_SIZE_ULONG | KVM_REG_RISCV_SBI_STATE | KVM_REG_RISCV_SBI_FWFT | KVM_REG_RISCV_SBI_FWFT_REG(pointer_masking.flags), KVM_REG_RISCV | KVM_REG_SIZE_ULONG | KVM_REG_RISCV_SBI_STATE | KVM_REG_RISCV_SBI_FWFT | KVM_REG_RISCV_SBI_FWFT_REG(pointer_masking.value), @@ -1013,7 +1050,7 @@ static __u64 fp_d_regs[] = { }; /* Define a default vector registers with length. 
This will be overwritten at runtime */ -static __u64 vector_regs[] = { +static __u64 v_regs[] = { KVM_REG_RISCV | KVM_REG_SIZE_ULONG | KVM_REG_RISCV_VECTOR | KVM_REG_RISCV_VECTOR_CSR_REG(vstart), KVM_REG_RISCV | KVM_REG_SIZE_ULONG | KVM_REG_RISCV_VECTOR | KVM_REG_RISCV_VECTOR_CSR_REG(vl), KVM_REG_RISCV | KVM_REG_SIZE_ULONG | KVM_REG_RISCV_VECTOR | KVM_REG_RISCV_VECTOR_CSR_REG(vtype), @@ -1057,37 +1094,17 @@ static __u64 vector_regs[] = { #define SUBLIST_BASE \ {"base", .regs = base_regs, .regs_n = ARRAY_SIZE(base_regs), \ .skips_set = base_skips_set, .skips_set_n = ARRAY_SIZE(base_skips_set),} -#define SUBLIST_SBI_BASE \ - {"sbi-base", .feature_type = VCPU_FEATURE_SBI_EXT, .feature = KVM_RISCV_SBI_EXT_V01, \ - .regs = sbi_base_regs, .regs_n = ARRAY_SIZE(sbi_base_regs),} -#define SUBLIST_SBI_STA \ - {"sbi-sta", .feature_type = VCPU_FEATURE_SBI_EXT, .feature = KVM_RISCV_SBI_EXT_STA, \ - .regs = sbi_sta_regs, .regs_n = ARRAY_SIZE(sbi_sta_regs),} -#define SUBLIST_SBI_FWFT \ - {"sbi-fwft", .feature_type = VCPU_FEATURE_SBI_EXT, .feature = KVM_RISCV_SBI_EXT_FWFT, \ - .regs = sbi_fwft_regs, .regs_n = ARRAY_SIZE(sbi_fwft_regs),} -#define SUBLIST_ZICBOM \ - {"zicbom", .feature = KVM_RISCV_ISA_EXT_ZICBOM, .regs = zicbom_regs, .regs_n = ARRAY_SIZE(zicbom_regs),} -#define SUBLIST_ZICBOP \ - {"zicbop", .feature = KVM_RISCV_ISA_EXT_ZICBOP, .regs = zicbop_regs, .regs_n = ARRAY_SIZE(zicbop_regs),} -#define SUBLIST_ZICBOZ \ - {"zicboz", .feature = KVM_RISCV_ISA_EXT_ZICBOZ, .regs = zicboz_regs, .regs_n = ARRAY_SIZE(zicboz_regs),} -#define SUBLIST_AIA \ - {"aia", .feature = KVM_RISCV_ISA_EXT_SSAIA, .regs = aia_regs, .regs_n = ARRAY_SIZE(aia_regs),} -#define SUBLIST_SMSTATEEN \ - {"smstateen", .feature = KVM_RISCV_ISA_EXT_SMSTATEEN, .regs = smstateen_regs, .regs_n = ARRAY_SIZE(smstateen_regs),} -#define SUBLIST_FP_F \ - {"fp_f", .feature = KVM_RISCV_ISA_EXT_F, .regs = fp_f_regs, \ - .regs_n = ARRAY_SIZE(fp_f_regs),} -#define SUBLIST_FP_D \ - {"fp_d", .feature = KVM_RISCV_ISA_EXT_D, 
.regs = fp_d_regs, \ - .regs_n = ARRAY_SIZE(fp_d_regs),} - -#define SUBLIST_V \ - {"v", .feature = KVM_RISCV_ISA_EXT_V, .regs = vector_regs, .regs_n = ARRAY_SIZE(vector_regs),} + +#define SUBLIST_ISA(ext, extu) \ + { \ + .name = #ext, \ + .feature = KVM_RISCV_ISA_EXT_##extu, \ + .regs = ext##_regs, \ + .regs_n = ARRAY_SIZE(ext##_regs), \ + } #define KVM_ISA_EXT_SIMPLE_CONFIG(ext, extu) \ -static __u64 regs_##ext[] = { \ +static __u64 ext##_regs[] = { \ KVM_REG_RISCV | KVM_REG_SIZE_ULONG | \ KVM_REG_RISCV_ISA_EXT | KVM_REG_RISCV_ISA_SINGLE | \ KVM_RISCV_ISA_EXT_##extu, \ @@ -1095,18 +1112,22 @@ static __u64 regs_##ext[] = { \ static struct vcpu_reg_list config_##ext = { \ .sublists = { \ SUBLIST_BASE, \ - { \ - .name = #ext, \ - .feature = KVM_RISCV_ISA_EXT_##extu, \ - .regs = regs_##ext, \ - .regs_n = ARRAY_SIZE(regs_##ext), \ - }, \ + SUBLIST_ISA(ext, extu), \ {0}, \ }, \ } \ +#define SUBLIST_SBI(ext, extu) \ + { \ + .name = "sbi-"#ext, \ + .feature_type = VCPU_FEATURE_SBI_EXT, \ + .feature = KVM_RISCV_SBI_EXT_##extu, \ + .regs = sbi_##ext##_regs, \ + .regs_n = ARRAY_SIZE(sbi_##ext##_regs), \ + } + #define KVM_SBI_EXT_SIMPLE_CONFIG(ext, extu) \ -static __u64 regs_sbi_##ext[] = { \ +static __u64 sbi_##ext##_regs[] = { \ KVM_REG_RISCV | KVM_REG_SIZE_ULONG | \ KVM_REG_RISCV_SBI_EXT | KVM_REG_RISCV_SBI_SINGLE | \ KVM_RISCV_SBI_EXT_##extu, \ @@ -1114,13 +1135,7 @@ static __u64 regs_sbi_##ext[] = { \ static struct vcpu_reg_list config_sbi_##ext = { \ .sublists = { \ SUBLIST_BASE, \ - { \ - .name = "sbi-"#ext, \ - .feature_type = VCPU_FEATURE_SBI_EXT, \ - .feature = KVM_RISCV_SBI_EXT_##extu, \ - .regs = regs_sbi_##ext, \ - .regs_n = ARRAY_SIZE(regs_sbi_##ext), \ - }, \ + SUBLIST_SBI(ext, extu), \ {0}, \ }, \ } \ @@ -1129,7 +1144,7 @@ static struct vcpu_reg_list config_sbi_##ext = { \ static struct vcpu_reg_list config_##ext = { \ .sublists = { \ SUBLIST_BASE, \ - SUBLIST_##extu, \ + SUBLIST_ISA(ext, extu), \ {0}, \ }, \ } \ @@ -1138,24 +1153,23 @@ static struct 
vcpu_reg_list config_##ext = { \ static struct vcpu_reg_list config_sbi_##ext = { \ .sublists = { \ SUBLIST_BASE, \ - SUBLIST_SBI_##extu, \ + SUBLIST_SBI(ext, extu), \ {0}, \ }, \ } \ /* Note: The below list is alphabetically sorted. */ -KVM_SBI_EXT_SUBLIST_CONFIG(base, BASE); +KVM_SBI_EXT_SUBLIST_CONFIG(base, V01); KVM_SBI_EXT_SUBLIST_CONFIG(sta, STA); KVM_SBI_EXT_SIMPLE_CONFIG(pmu, PMU); KVM_SBI_EXT_SIMPLE_CONFIG(dbcn, DBCN); KVM_SBI_EXT_SIMPLE_CONFIG(susp, SUSP); KVM_SBI_EXT_SIMPLE_CONFIG(mpxy, MPXY); -KVM_SBI_EXT_SUBLIST_CONFIG(fwft, FWFT); -KVM_ISA_EXT_SUBLIST_CONFIG(aia, AIA); -KVM_ISA_EXT_SUBLIST_CONFIG(fp_f, FP_F); -KVM_ISA_EXT_SUBLIST_CONFIG(fp_d, FP_D); +KVM_ISA_EXT_SUBLIST_CONFIG(aia, SSAIA); +KVM_ISA_EXT_SUBLIST_CONFIG(fp_f, F); +KVM_ISA_EXT_SUBLIST_CONFIG(fp_d, D); KVM_ISA_EXT_SUBLIST_CONFIG(v, V); KVM_ISA_EXT_SIMPLE_CONFIG(h, H); KVM_ISA_EXT_SIMPLE_CONFIG(smnpm, SMNPM); @@ -1228,6 +1242,23 @@ KVM_ISA_EXT_SIMPLE_CONFIG(zvksed, ZVKSED); KVM_ISA_EXT_SIMPLE_CONFIG(zvksh, ZVKSH); KVM_ISA_EXT_SIMPLE_CONFIG(zvkt, ZVKT); +static struct vcpu_reg_list config_sbi_fwft_misaligned_deleg = { + .sublists = { + SUBLIST_BASE, + SUBLIST_SBI(fwft_misaligned_deleg, FWFT), + {0}, + }, +}; + +static struct vcpu_reg_list config_sbi_fwft_pointer_masking = { + .sublists = { + SUBLIST_BASE, + SUBLIST_ISA(smnpm, SMNPM), + SUBLIST_SBI(fwft_pointer_masking, FWFT), + {0}, + }, +}; + struct vcpu_reg_list *vcpu_configs[] = { &config_sbi_base, &config_sbi_sta, @@ -1235,7 +1266,8 @@ struct vcpu_reg_list *vcpu_configs[] = { &config_sbi_dbcn, &config_sbi_susp, &config_sbi_mpxy, - &config_sbi_fwft, + &config_sbi_fwft_misaligned_deleg, + &config_sbi_fwft_pointer_masking, &config_aia, &config_fp_f, &config_fp_d, diff --git a/tools/testing/selftests/kvm/s390/cmma_test.c b/tools/testing/selftests/kvm/s390/cmma_test.c index e39a724fe860..15d81b2ed7ad 100644 --- a/tools/testing/selftests/kvm/s390/cmma_test.c +++ b/tools/testing/selftests/kvm/s390/cmma_test.c @@ -34,16 +34,22 @@ static char 
cmma_value_buf[MAIN_PAGE_COUNT + TEST_DATA_PAGE_COUNT]; /** * Dirty CMMA attributes of exactly one page in the TEST_DATA memslot, * so use_cmma goes on and the CMMA related ioctls do something. + * Touch the page at offset 1M inside TEST_DATA to make sure its page + * tables are allocated in the host. */ static void guest_do_one_essa(void) { asm volatile( /* load TEST_DATA_START_GFN into r1 */ + " xgr 1,1\n" " llilf 1,%[start_gfn]\n" /* calculate the address from the gfn */ " sllg 1,1,12(0)\n" /* set the first page in TEST_DATA memslot to STABLE */ " .insn rrf,0xb9ab0000,2,1,1,0\n" + " agfi 1,0x100000\n" + /* also touch the first page of the second MB of TEST_DATA */ + " .insn rrf,0xb9ab0000,2,1,1,0\n" /* hypercall */ " diag 0,0,0x501\n" "0: j 0b" diff --git a/tools/testing/selftests/kvm/s390/shared_zeropage_test.c b/tools/testing/selftests/kvm/s390/shared_zeropage_test.c index a9e5a01200b8..478381e6f84e 100644 --- a/tools/testing/selftests/kvm/s390/shared_zeropage_test.c +++ b/tools/testing/selftests/kvm/s390/shared_zeropage_test.c @@ -4,11 +4,10 @@ * * Copyright (C) 2024, Red Hat, Inc. */ -#include <sys/mman.h> - #include <linux/fs.h> #include "test_util.h" +#include "kvm_syscalls.h" #include "kvm_util.h" #include "kselftest.h" #include "ucall_common.h" diff --git a/tools/testing/selftests/kvm/s390/tprot.c b/tools/testing/selftests/kvm/s390/tprot.c index 8054d2b178f0..d86179827a18 100644 --- a/tools/testing/selftests/kvm/s390/tprot.c +++ b/tools/testing/selftests/kvm/s390/tprot.c @@ -4,8 +4,8 @@ * * Copyright IBM Corp. 
2021 */ -#include <sys/mman.h> #include "test_util.h" +#include "kvm_syscalls.h" #include "kvm_util.h" #include "kselftest.h" #include "ucall_common.h" diff --git a/tools/testing/selftests/kvm/s390/user_operexec.c b/tools/testing/selftests/kvm/s390/user_operexec.c index 714906c1d12a..b24c1f9dbbe8 100644 --- a/tools/testing/selftests/kvm/s390/user_operexec.c +++ b/tools/testing/selftests/kvm/s390/user_operexec.c @@ -6,6 +6,7 @@ * Authors: * Janosch Frank <frankja@linux.ibm.com> */ +#include "facility.h" #include "kselftest.h" #include "kvm_util.h" #include "test_util.h" @@ -109,6 +110,111 @@ static void test_user_operexec_combined(void) kvm_vm_free(vm); } +static struct kvm_vm *create_vm_without_sthyi(void) +{ + struct kvm_s390_vm_cpu_processor info; + struct kvm_vm *vm; + + vm = vm_create(1); + + kvm_device_attr_get(vm->fd, KVM_S390_VM_CPU_MODEL, + KVM_S390_VM_CPU_PROCESSOR, &info); + + clear_bit_inv(74, (unsigned long *)&info.fac_list); + kvm_device_attr_set(vm->fd, KVM_S390_VM_CPU_MODEL, + KVM_S390_VM_CPU_PROCESSOR, &info); + + return vm; +} + +static void test_user_instr0_no_stfle_74(void) +{ + struct kvm_vcpu *vcpu; + struct kvm_vm *vm; + int rc; + + vm = create_vm_without_sthyi(); + + rc = __vm_enable_cap(vm, KVM_CAP_S390_USER_INSTR0, 0); + TEST_ASSERT_EQ(0, rc); + + vcpu = vm_vcpu_add(vm, 0, guest_code_instr0); + + vcpu_run(vcpu); + TEST_ASSERT_KVM_EXIT_REASON(vcpu, KVM_EXIT_S390_SIEIC); + TEST_ASSERT_EQ(vcpu->run->s390_sieic.icptcode, ICPT_OPEREXC); + TEST_ASSERT_EQ(vcpu->run->s390_sieic.ipa, 0x0000); + + kvm_vm_free(vm); +} + +static void test_user_operexec_no_stfle_74(void) +{ + struct kvm_vcpu *vcpu; + struct kvm_vm *vm; + int rc; + + vm = create_vm_without_sthyi(); + + rc = __vm_enable_cap(vm, KVM_CAP_S390_USER_OPEREXEC, 0); + TEST_ASSERT_EQ(0, rc); + + vcpu = vm_vcpu_add(vm, 0, guest_code_user_operexec); + + vcpu_run(vcpu); + TEST_ASSERT_KVM_EXIT_REASON(vcpu, KVM_EXIT_S390_SIEIC); + TEST_ASSERT_EQ(vcpu->run->s390_sieic.icptcode, ICPT_OPEREXC); + 
TEST_ASSERT_EQ(vcpu->run->s390_sieic.ipa, 0x0807); + + kvm_vm_free(vm); +} + +static void test_instr0_combined_no_stfle_74(void) +{ + struct kvm_vcpu *vcpu; + struct kvm_vm *vm; + int rc; + + vm = create_vm_without_sthyi(); + + rc = __vm_enable_cap(vm, KVM_CAP_S390_USER_INSTR0, 0); + TEST_ASSERT_EQ(0, rc); + rc = __vm_enable_cap(vm, KVM_CAP_S390_USER_OPEREXEC, 0); + TEST_ASSERT_EQ(0, rc); + + vcpu = vm_vcpu_add(vm, 0, guest_code_instr0); + + vcpu_run(vcpu); + TEST_ASSERT_KVM_EXIT_REASON(vcpu, KVM_EXIT_S390_SIEIC); + TEST_ASSERT_EQ(vcpu->run->s390_sieic.icptcode, ICPT_OPEREXC); + TEST_ASSERT_EQ(vcpu->run->s390_sieic.ipa, 0x0000); + + kvm_vm_free(vm); +} + +static void test_operexec_combined_no_stfle_74(void) +{ + struct kvm_vcpu *vcpu; + struct kvm_vm *vm; + int rc; + + vm = create_vm_without_sthyi(); + + rc = __vm_enable_cap(vm, KVM_CAP_S390_USER_INSTR0, 0); + TEST_ASSERT_EQ(0, rc); + rc = __vm_enable_cap(vm, KVM_CAP_S390_USER_OPEREXEC, 0); + TEST_ASSERT_EQ(0, rc); + + vcpu = vm_vcpu_add(vm, 0, guest_code_user_operexec); + + vcpu_run(vcpu); + TEST_ASSERT_KVM_EXIT_REASON(vcpu, KVM_EXIT_S390_SIEIC); + TEST_ASSERT_EQ(vcpu->run->s390_sieic.icptcode, ICPT_OPEREXC); + TEST_ASSERT_EQ(vcpu->run->s390_sieic.ipa, 0x0807); + + kvm_vm_free(vm); +} + /* * Run all tests above. 
* @@ -122,6 +228,10 @@ static struct testdef { { "instr0", test_user_instr0 }, { "operexec", test_user_operexec }, { "operexec_combined", test_user_operexec_combined}, + { "instr0_no_stfle_74", test_user_instr0_no_stfle_74 }, + { "instr0_combined_no_stfle_74", test_instr0_combined_no_stfle_74 }, + { "operexec_combined_no_stfle_74", test_operexec_combined_no_stfle_74 }, + { "operexec_no_stfle_74", test_user_operexec_no_stfle_74 }, }; int main(int argc, char *argv[]) diff --git a/tools/testing/selftests/kvm/set_memory_region_test.c b/tools/testing/selftests/kvm/set_memory_region_test.c index 9b919a231c93..a152ab65c657 100644 --- a/tools/testing/selftests/kvm/set_memory_region_test.c +++ b/tools/testing/selftests/kvm/set_memory_region_test.c @@ -8,11 +8,11 @@ #include <stdlib.h> #include <string.h> #include <sys/ioctl.h> -#include <sys/mman.h> #include <linux/compiler.h> #include <test_util.h> +#include <kvm_syscalls.h> #include <kvm_util.h> #include <processor.h> @@ -510,7 +510,7 @@ static void test_add_overlapping_private_memory_regions(void) vm = vm_create_barebones_type(KVM_X86_SW_PROTECTED_VM); - memfd = vm_create_guest_memfd(vm, MEM_REGION_SIZE * 4, 0); + memfd = vm_create_guest_memfd(vm, MEM_REGION_SIZE * 5, 0); vm_set_user_memory_region2(vm, MEM_REGION_SLOT, KVM_MEM_GUEST_MEMFD, MEM_REGION_GPA, MEM_REGION_SIZE * 2, 0, memfd, 0); @@ -526,12 +526,35 @@ static void test_add_overlapping_private_memory_regions(void) vm_set_user_memory_region2(vm, MEM_REGION_SLOT, KVM_MEM_GUEST_MEMFD, MEM_REGION_GPA, 0, NULL, -1, 0); - /* Overlap the front half of the other slot. */ + /* + * Verify that overlap in the guest_memfd bindings (i.e. in guest_memfd + * file offsets), but _not_ in the GPA space, fails with -EEXIST. 
+ */ + r = __vm_set_user_memory_region2(vm, MEM_REGION_SLOT, KVM_MEM_GUEST_MEMFD, + MEM_REGION_GPA, + MEM_REGION_SIZE * 2, + 0, memfd, MEM_REGION_SIZE); + TEST_ASSERT(r == -1 && errno == EEXIST, + "Overlapping guest_memfd() bindings should fail with EEXIST"); + + /* And now the back half of the other slot's guest_memfd binding. */ + r = __vm_set_user_memory_region2(vm, MEM_REGION_SLOT, KVM_MEM_GUEST_MEMFD, + MEM_REGION_GPA, + MEM_REGION_SIZE * 2, + 0, memfd, MEM_REGION_SIZE * 3); + TEST_ASSERT(r == -1 && errno == EEXIST, + "Overlapping guest_memfd() bindings should fail with EEXIST"); + + /* + * Repeat the overlap tests, but this time with overlap in the memslots + * GPA space. Regardless of where there is overlap, KVM should return + * -EEXIST. + */ r = __vm_set_user_memory_region2(vm, MEM_REGION_SLOT, KVM_MEM_GUEST_MEMFD, MEM_REGION_GPA * 2 - MEM_REGION_SIZE, MEM_REGION_SIZE * 2, 0, memfd, 0); - TEST_ASSERT(r == -1 && errno == EEXIST, "%s", + TEST_ASSERT(r == -1 && errno == EEXIST, "Overlapping guest_memfd() bindings should fail with EEXIST"); /* And now the back half of the other slot. 
*/ @@ -539,7 +562,7 @@ static void test_add_overlapping_private_memory_regions(void) MEM_REGION_GPA * 2 + MEM_REGION_SIZE, MEM_REGION_SIZE * 2, 0, memfd, 0); - TEST_ASSERT(r == -1 && errno == EEXIST, "%s", + TEST_ASSERT(r == -1 && errno == EEXIST, "Overlapping guest_memfd() bindings should fail with EEXIST"); close(memfd); diff --git a/tools/testing/selftests/kvm/steal_time.c b/tools/testing/selftests/kvm/steal_time.c index 7df2bc8eec02..76fcdd1fd3cb 100644 --- a/tools/testing/selftests/kvm/steal_time.c +++ b/tools/testing/selftests/kvm/steal_time.c @@ -220,6 +220,8 @@ static void check_steal_time_uapi(void) }; vcpu_ioctl(vcpu, KVM_HAS_DEVICE_ATTR, &dev); + vm_userspace_mem_region_add(vm, VM_MEM_SRC_ANONYMOUS, ST_GPA_BASE, 1, 1, 0); + virt_map(vm, ST_GPA_BASE, ST_GPA_BASE, 1); st_ipa = (ulong)ST_GPA_BASE | 1; ret = __vcpu_ioctl(vcpu, KVM_SET_DEVICE_ATTR, &dev); diff --git a/tools/testing/selftests/kvm/x86/apic_bus_clock_test.c b/tools/testing/selftests/kvm/x86/apic_bus_clock_test.c index 404f0028e110..0c84c27ea584 100644 --- a/tools/testing/selftests/kvm/x86/apic_bus_clock_test.c +++ b/tools/testing/selftests/kvm/x86/apic_bus_clock_test.c @@ -137,6 +137,10 @@ static void run_apic_bus_clock_test(u64 apic_hz, u64 delay_ms, vm_enable_cap(vm, KVM_CAP_X86_APIC_BUS_CYCLES_NS, NSEC_PER_SEC / apic_hz); + TEST_ASSERT_EQ(kvm_check_cap(KVM_CAP_X86_APIC_BUS_CYCLES_NS), 1); + TEST_ASSERT_EQ(vm_check_cap(vm, KVM_CAP_X86_APIC_BUS_CYCLES_NS), + NSEC_PER_SEC / apic_hz); + vcpu = vm_vcpu_add(vm, 0, apic_guest_code); vcpu_args_set(vcpu, 2, apic_hz, delay_ms); diff --git a/tools/testing/selftests/kvm/x86/debug_regs.c b/tools/testing/selftests/kvm/x86/debug_regs.c index 0dfaf03cd0a0..2a2ef3e179ff 100644 --- a/tools/testing/selftests/kvm/x86/debug_regs.c +++ b/tools/testing/selftests/kvm/x86/debug_regs.c @@ -15,10 +15,51 @@ #define IRQ_VECTOR 0xAA +#define CAST_TO_RIP(v) ((unsigned long long)&(v)) + /* For testing data access debug BP */ u32 guest_value; extern unsigned char sw_bp, 
hw_bp, write_data, ss_start, bd_start; +extern unsigned char fep_bd_start, fep_sti_start, fep_sti_end; + +static int irqs_received; + +static void guest_db_handler(struct ex_regs *regs) +{ + static int count; + unsigned long target_rips[2] = { + CAST_TO_RIP(fep_sti_start), + CAST_TO_RIP(fep_sti_end), + }; + + __GUEST_ASSERT(regs->rip == target_rips[count], + "STI[%u]: unexpected rip 0x%lx (should be 0x%lx)", + count, regs->rip, target_rips[count]); + regs->rflags &= ~X86_EFLAGS_TF; + count++; +} + +static void guest_irq_handler(struct ex_regs *regs) +{ + /* + * The pending IRQ should finally be take when KVM_GUESTDBG_BLOCKIRQ is + * cleared and IRQs are enabled. Note, the IRQ is expected to arrive + * on the instruction immediately after STI, even though its in an STI + * shadow. Because the next instruction has a coincident #DB, and #DBs + * are not subject to STI-blocking, the #DB will push RFLAGS.IF=1 on + * the stack, and the eventual IRET will unmask IRQs and obliterate the + * STI shadow in the process. + */ + unsigned long target_rip = CAST_TO_RIP(fep_sti_start); + + __GUEST_ASSERT(regs->rip == target_rip, + "IRQ: unexpected rip 0x%lx (should be 0x%lx)", + regs->rip, target_rip); + + irqs_received++; + x2apic_write_reg(APIC_EOI, 0); +} static void guest_code(void) { @@ -64,11 +105,33 @@ static void guest_code(void) /* DR6.BD test */ asm volatile("bd_start: mov %%dr0, %%rax" : : : "rax"); + + /* + * Note, the IRET from the #DB that occurs in the below STI-shadow will + * unmask IRQs, i.e. the pending interrupt will be delivered after #DB + * handling, on the CLI! 
+ */ + if (is_forced_emulation_enabled) { + asm volatile(KVM_FEP "fep_bd_start: mov %%dr0, %%rax" : : : "rax"); + + /* pending debug exceptions for emulation */ + asm volatile("pushf\n\t" + "orq $" __stringify(X86_EFLAGS_TF) ", (%rsp)\n\t" + "popf\n\t" + "sti\n\t" + "fep_sti_start:" + "cli\n\t" + "pushf\n\t" + "orq $" __stringify(X86_EFLAGS_TF) ", (%rsp)\n\t" + "popf\n\t" + KVM_FEP "sti\n\t" + "fep_sti_end:" + "cli\n\t"); + GUEST_ASSERT(irqs_received == 1); + } GUEST_DONE(); } -#define CAST_TO_RIP(v) ((unsigned long long)&(v)) - static void vcpu_skip_insn(struct kvm_vcpu *vcpu, int insn_len) { struct kvm_regs regs; @@ -185,7 +248,7 @@ int main(void) target_dr6); } - /* Finally test global disable */ + /* test global disable */ memset(&debug, 0, sizeof(debug)); debug.control = KVM_GUESTDBG_ENABLE | KVM_GUESTDBG_USE_HW_BP; debug.arch.debugreg[7] = 0x400 | DR7_GD; @@ -202,10 +265,29 @@ int main(void) run->debug.arch.pc, target_rip, run->debug.arch.dr6, target_dr6); + /* test global disable in emulation */ + if (is_forced_emulation_enabled) { + /* Skip the 3-bytes "mov dr0" */ + vcpu_skip_insn(vcpu, 3); + vcpu_run(vcpu); + TEST_ASSERT(run->exit_reason == KVM_EXIT_DEBUG && + run->debug.arch.exception == DB_VECTOR && + run->debug.arch.pc == CAST_TO_RIP(fep_bd_start) && + run->debug.arch.dr6 == target_dr6, + "DR7.GD: exit %d exception %d rip 0x%llx " + "(should be 0x%llx) dr6 0x%llx (should be 0x%llx)", + run->exit_reason, run->debug.arch.exception, + run->debug.arch.pc, CAST_TO_RIP(fep_bd_start), + run->debug.arch.dr6, target_dr6); + } + /* Disable all debug controls, run to the end */ memset(&debug, 0, sizeof(debug)); vcpu_guest_debug_set(vcpu, &debug); + vm_install_exception_handler(vm, DB_VECTOR, guest_db_handler); + vm_install_exception_handler(vm, IRQ_VECTOR, guest_irq_handler); + vcpu_run(vcpu); TEST_ASSERT_KVM_EXIT_REASON(vcpu, KVM_EXIT_IO); cmd = get_ucall(vcpu, &uc); diff --git a/tools/testing/selftests/kvm/x86/hwcr_msr_test.c 
b/tools/testing/selftests/kvm/x86/hwcr_msr_test.c index 8e20a03b3329..53b7971aa072 100644 --- a/tools/testing/selftests/kvm/x86/hwcr_msr_test.c +++ b/tools/testing/selftests/kvm/x86/hwcr_msr_test.c @@ -11,12 +11,17 @@ void test_hwcr_bit(struct kvm_vcpu *vcpu, unsigned int bit) { const u64 ignored = BIT_ULL(3) | BIT_ULL(6) | BIT_ULL(8); - const u64 valid = BIT_ULL(18) | BIT_ULL(24); - const u64 legal = ignored | valid; + u64 valid = BIT_ULL(18) | BIT_ULL(24); u64 val = BIT_ULL(bit); u64 actual; + u64 legal; int r; + if (kvm_cpu_has(X86_FEATURE_GP_ON_USER_CPUID)) + valid |= BIT_ULL(35); + + legal = ignored | valid; + r = _vcpu_set_msr(vcpu, MSR_K7_HWCR, val); TEST_ASSERT(val & ~legal ? !r : r == 1, "Expected KVM_SET_MSRS(MSR_K7_HWCR) = 0x%lx to %s", diff --git a/tools/testing/selftests/kvm/x86/hyperv_features.c b/tools/testing/selftests/kvm/x86/hyperv_features.c index 7347f1fe5157..2effde85c4c8 100644 --- a/tools/testing/selftests/kvm/x86/hyperv_features.c +++ b/tools/testing/selftests/kvm/x86/hyperv_features.c @@ -26,6 +26,7 @@ struct msr_data { bool fault_expected; bool write; u64 write_val; + bool reset_expected; }; struct hcall_data { @@ -267,14 +268,9 @@ static void guest_test_msrs_access(void) case 16: msr->idx = HV_X64_MSR_RESET; msr->write = true; - /* - * TODO: the test only writes '0' to HV_X64_MSR_RESET - * at the moment, writing some other value there will - * trigger real vCPU reset and the code is not prepared - * to handle it yet. - */ - msr->write_val = 0; + msr->write_val = 1; msr->fault_expected = false; + msr->reset_expected = true; break; case 17: @@ -457,7 +453,7 @@ static void guest_test_msrs_access(void) msr->fault_expected = true; break; case 45: - /* MSR is vailable when CPUID feature bit is set */ + /* MSR is available when CPUID feature bit is set */ if (!has_invtsc) goto next_stage; vcpu_set_cpuid_feature(vcpu, HV_ACCESS_TSC_INVARIANT); @@ -497,6 +493,15 @@ static void guest_test_msrs_access(void) msr->idx, msr->write ? 
"write" : "read"); vcpu_run(vcpu); + + if (msr->reset_expected) { + TEST_ASSERT_KVM_EXIT_REASON(vcpu, KVM_EXIT_SYSTEM_EVENT); + TEST_ASSERT(vcpu->run->system_event.type == KVM_SYSTEM_EVENT_RESET, + "Expected reset system event, got type %u", + vcpu->run->system_event.type); + goto next_stage; + } + TEST_ASSERT_KVM_EXIT_REASON(vcpu, KVM_EXIT_IO); switch (get_ucall(vcpu, &uc)) { diff --git a/tools/testing/selftests/kvm/x86/hyperv_tlb_flush.c b/tools/testing/selftests/kvm/x86/hyperv_tlb_flush.c index 15ee8b7bfc11..b4be9a175379 100644 --- a/tools/testing/selftests/kvm/x86/hyperv_tlb_flush.c +++ b/tools/testing/selftests/kvm/x86/hyperv_tlb_flush.c @@ -142,17 +142,6 @@ static void swap_two_test_pages(gpa_t pte_gva1, gpa_t pte_gva2) } /* - * TODO: replace the silly NOP loop with a proper udelay() implementation. - */ -static inline void do_delay(void) -{ - int i; - - for (i = 0; i < 1000000; i++) - asm volatile("nop"); -} - -/* * Prepare to test: 'disable' workers by setting the expectation to '0', * clear hypercall input page and then swap two test pages. */ @@ -169,7 +158,7 @@ static inline void prepare_to_test(struct test_data *data) wmb(); /* Make sure workers have enough time to notice */ - do_delay(); + udelay(100); /* Swap test page mappings */ swap_two_test_pages(data->test_pages_pte[0], data->test_pages_pte[1]); @@ -189,7 +178,7 @@ static inline void post_test(struct test_data *data, u64 exp1, u64 exp2) set_expected_val((void *)data->test_pages, exp2, WORKER_VCPU_ID_2); /* Make sure workers have enough time to test */ - do_delay(); + udelay(100); } #define TESTVAL1 0x0101010101010101 diff --git a/tools/testing/selftests/kvm/x86/nested_tdp_fault_test.c b/tools/testing/selftests/kvm/x86/nested_tdp_fault_test.c new file mode 100644 index 000000000000..fa95568f55ff --- /dev/null +++ b/tools/testing/selftests/kvm/x86/nested_tdp_fault_test.c @@ -0,0 +1,313 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Copyright (C) 2025, Google, Inc. 
+ */ + +#include "test_util.h" +#include "kvm_util.h" +#include "processor.h" +#include "svm_util.h" +#include "vmx.h" + +#define L2_GUEST_STACK_SIZE 64 + +enum test_type { + TEST_FINAL_PAGE_UNMAPPED, /* Final data page not present */ + TEST_PT_PAGE_UNMAPPED, /* Page table page not present */ + TEST_FINAL_PAGE_WRITE_PROTECTED, /* Final data page read-only */ + TEST_PT_PAGE_WRITE_PROTECTED, /* Page table page read-only */ +}; + +static gva_t l2_test_page; +static void (*l2_entry)(void); + +#define TEST_IO_PORT 0x80 +#define TEST1_VADDR 0x8000000ULL +#define TEST2_VADDR 0x10000000ULL +#define TEST3_VADDR 0x18000000ULL +#define TEST4_VADDR 0x20000000ULL + +/* + * L2 executes OUTS reading from l2_test_page, triggering a nested page + * fault on the read access. + */ +static void l2_guest_code_outs(void) +{ + asm volatile("outsb" ::"S"(l2_test_page), "d"(TEST_IO_PORT) : "memory"); + GUEST_FAIL("L2 should not reach here"); +} + +/* + * L2 executes INS writing to l2_test_page, triggering a nested page + * fault on the write access. 
+ */ +static void l2_guest_code_ins(void) +{ + asm volatile("insb" ::"D"(l2_test_page), "d"(TEST_IO_PORT) : "memory"); + GUEST_FAIL("L2 should not reach here"); +} + +#define GUEST_ASSERT_EXIT_QUAL(ac_eq, ex_eq) \ + __GUEST_ASSERT((ac_eq) == (ex_eq), \ + "Wanted EXIT_QUAL '0x%lx', got '0x%lx'", ex_eq, ac_eq) + +static void l1_vmx_code(struct vmx_pages *vmx, u64 expected_fault_gpa, + u64 test_type) +{ + unsigned long l2_guest_stack[L2_GUEST_STACK_SIZE]; + u64 exit_qual; + + GUEST_ASSERT(vmx->vmcs_gpa); + GUEST_ASSERT(prepare_for_vmx_operation(vmx)); + GUEST_ASSERT(load_vmcs(vmx)); + + prepare_vmcs(vmx, l2_entry, &l2_guest_stack[L2_GUEST_STACK_SIZE]); + + GUEST_ASSERT(!vmlaunch()); + + /* Verify we got an EPT violation exit */ + __GUEST_ASSERT(vmreadz(VM_EXIT_REASON) == EXIT_REASON_EPT_VIOLATION, + "Expected EPT violation (0x%x), got 0x%lx", + EXIT_REASON_EPT_VIOLATION, + vmreadz(VM_EXIT_REASON)); + + __GUEST_ASSERT(vmreadz(GUEST_PHYSICAL_ADDRESS) == expected_fault_gpa, + "Expected guest_physical_address = 0x%lx, got 0x%lx", + expected_fault_gpa, + vmreadz(GUEST_PHYSICAL_ADDRESS)); + + exit_qual = vmreadz(EXIT_QUALIFICATION); + + /* + * Note, EPT page table accesses are always read+write, e.g. so that + * the CPU can do A/D updates at-will. 
+ */ + switch (test_type) { + case TEST_FINAL_PAGE_UNMAPPED: + GUEST_ASSERT_EXIT_QUAL(exit_qual, EPT_VIOLATION_ACC_READ | + EPT_VIOLATION_GVA_IS_VALID | + EPT_VIOLATION_GVA_TRANSLATED); + break; + case TEST_PT_PAGE_UNMAPPED: + GUEST_ASSERT_EXIT_QUAL(exit_qual, EPT_VIOLATION_ACC_READ | + EPT_VIOLATION_ACC_WRITE | + EPT_VIOLATION_GVA_IS_VALID); + break; + case TEST_FINAL_PAGE_WRITE_PROTECTED: + GUEST_ASSERT_EXIT_QUAL(exit_qual, EPT_VIOLATION_ACC_WRITE | + EPT_VIOLATION_PROT_READ | + EPT_VIOLATION_PROT_EXEC | + EPT_VIOLATION_GVA_IS_VALID | + EPT_VIOLATION_GVA_TRANSLATED); + break; + case TEST_PT_PAGE_WRITE_PROTECTED: + GUEST_ASSERT_EXIT_QUAL(exit_qual, EPT_VIOLATION_ACC_READ | + EPT_VIOLATION_ACC_WRITE | + EPT_VIOLATION_PROT_READ | + EPT_VIOLATION_PROT_EXEC | + EPT_VIOLATION_GVA_IS_VALID); + break; + } + + GUEST_DONE(); +} + +#define GUEST_ASSERT_NPF_EC(ac_ec, ex_ec) \ + __GUEST_ASSERT((ac_ec) == (ex_ec), \ + "Wanted NPF error code '0x%lx', got '0x%lx'", (u64)(ex_ec), ac_ec) + + +static void l1_svm_code(struct svm_test_data *svm, u64 expected_fault_gpa, + u64 test_type) +{ + unsigned long l2_guest_stack[L2_GUEST_STACK_SIZE]; + struct vmcb *vmcb = svm->vmcb; + u64 exit_info_1; + + generic_svm_setup(svm, l2_entry, + &l2_guest_stack[L2_GUEST_STACK_SIZE]); + + run_guest(vmcb, svm->vmcb_gpa); + + /* Verify we got an NPF exit */ + __GUEST_ASSERT(vmcb->control.exit_code == SVM_EXIT_NPF, + "Expected NPF exit (0x%x), got 0x%lx", SVM_EXIT_NPF, + vmcb->control.exit_code); + + __GUEST_ASSERT(vmcb->control.exit_info_2 == expected_fault_gpa, + "Expected exit_info_2 = 0x%lx, got 0x%lx", + expected_fault_gpa, + vmcb->control.exit_info_2); + + exit_info_1 = vmcb->control.exit_info_1; + + /* + * Note, without GMET enabled, NPT walks are always user accesses. And + * like EPT, page table accesses are always read+write. 
+ */ + switch (test_type) { + case TEST_FINAL_PAGE_UNMAPPED: + GUEST_ASSERT_NPF_EC(exit_info_1, PFERR_USER_MASK | + PFERR_GUEST_FINAL_MASK); + break; + case TEST_PT_PAGE_UNMAPPED: + GUEST_ASSERT_NPF_EC(exit_info_1, PFERR_WRITE_MASK | + PFERR_USER_MASK | + PFERR_GUEST_PAGE_MASK); + break; + case TEST_FINAL_PAGE_WRITE_PROTECTED: + GUEST_ASSERT_NPF_EC(exit_info_1, PFERR_PRESENT_MASK | + PFERR_WRITE_MASK | + PFERR_USER_MASK | + PFERR_GUEST_FINAL_MASK); + break; + case TEST_PT_PAGE_WRITE_PROTECTED: + GUEST_ASSERT_NPF_EC(exit_info_1, PFERR_PRESENT_MASK | + PFERR_WRITE_MASK | + PFERR_USER_MASK | + PFERR_GUEST_PAGE_MASK); + break; + } + + GUEST_DONE(); +} + +static void l1_guest_code(void *data, u64 expected_fault_gpa, + u64 test_type) +{ + if (this_cpu_has(X86_FEATURE_VMX)) + l1_vmx_code(data, expected_fault_gpa, test_type); + else + l1_svm_code(data, expected_fault_gpa, test_type); +} + +/* Returns the GPA of the PT page that maps @vaddr. */ +static u64 get_pt_gpa_for_vaddr(struct kvm_vm *vm, u64 vaddr) +{ + u64 *pte; + + pte = vm_get_pte(vm, vaddr); + TEST_ASSERT(pte && (*pte & 0x1), "PTE not present for vaddr 0x%lx", + (unsigned long)vaddr); + + return addr_hva2gpa(vm, (void *)((u64)pte & ~0xFFFULL)); +} + +static void run_test(enum test_type type) +{ + gpa_t expected_fault_gpa; + gva_t nested_gva; + + struct kvm_vcpu *vcpu; + struct kvm_vm *vm; + struct ucall uc; + + vm = vm_create_with_one_vcpu(&vcpu, l1_guest_code); + vm_enable_tdp(vm); + + if (kvm_cpu_has(X86_FEATURE_VMX)) + vcpu_alloc_vmx(vm, &nested_gva); + else + vcpu_alloc_svm(vm, &nested_gva); + + switch (type) { + case TEST_FINAL_PAGE_UNMAPPED: + /* + * Unmap the final data page from NPT/EPT. The guest page + * table walk succeeds, but the final GPA->HPA translation + * fails. L2 reads from the page via OUTS. 
+ */ + l2_entry = l2_guest_code_outs; + l2_test_page = vm_alloc(vm, vm->page_size, TEST1_VADDR); + expected_fault_gpa = addr_gva2gpa(vm, l2_test_page); + break; + case TEST_PT_PAGE_UNMAPPED: + /* + * Unmap a page table page from NPT/EPT. The hardware page + * table walk fails when translating the PT page's GPA + * through NPT/EPT. L2 reads from the page via OUTS. + */ + l2_entry = l2_guest_code_outs; + l2_test_page = vm_alloc(vm, vm->page_size, TEST2_VADDR); + expected_fault_gpa = get_pt_gpa_for_vaddr(vm, l2_test_page); + break; + case TEST_FINAL_PAGE_WRITE_PROTECTED: + /* + * Write-protect the final data page in NPT/EPT. The page + * is present and readable, but not writable. L2 writes to + * the page via INS, triggering a protection violation. + */ + l2_entry = l2_guest_code_ins; + l2_test_page = vm_alloc(vm, vm->page_size, TEST3_VADDR); + expected_fault_gpa = addr_gva2gpa(vm, l2_test_page); + break; + case TEST_PT_PAGE_WRITE_PROTECTED: + /* + * Write-protect a page table page in NPT/EPT. The page is + * present and readable, but not writable. The guest page + * table walk needs write access to set A/D bits, so it + * triggers a protection violation on the PT page. + * L2 reads from the page via OUTS. 
+ */ + l2_entry = l2_guest_code_outs; + l2_test_page = vm_alloc(vm, vm->page_size, TEST4_VADDR); + expected_fault_gpa = get_pt_gpa_for_vaddr(vm, l2_test_page); + break; + } + + tdp_identity_map_default_memslots(vm); + + if (type == TEST_FINAL_PAGE_WRITE_PROTECTED || + type == TEST_PT_PAGE_WRITE_PROTECTED) + *tdp_get_pte(vm, expected_fault_gpa) &= ~PTE_WRITABLE_MASK(&vm->stage2_mmu); + else + *tdp_get_pte(vm, expected_fault_gpa) &= ~(PTE_PRESENT_MASK(&vm->stage2_mmu) | + PTE_READABLE_MASK(&vm->stage2_mmu) | + PTE_WRITABLE_MASK(&vm->stage2_mmu) | + PTE_EXECUTABLE_MASK(&vm->stage2_mmu)); + + sync_global_to_guest(vm, l2_entry); + sync_global_to_guest(vm, l2_test_page); + vcpu_args_set(vcpu, 3, nested_gva, expected_fault_gpa, (u64)type); + + /* + * For the INS-based write test, KVM emulates the instruction and + * first reads from the I/O port, which exits to userspace. + * Re-enter the guest so emulation can proceed to the memory + * write, where the nested page fault is triggered. + */ + for (;;) { + vcpu_run(vcpu); + + if (vcpu->run->exit_reason == KVM_EXIT_IO && + vcpu->run->io.port == TEST_IO_PORT && + vcpu->run->io.direction == KVM_EXIT_IO_IN) { + continue; + } + break; + } + + switch (get_ucall(vcpu, &uc)) { + case UCALL_DONE: + break; + case UCALL_ABORT: + REPORT_GUEST_ASSERT(uc); + default: + TEST_FAIL("Unexpected exit reason: %d", vcpu->run->exit_reason); + } + + kvm_vm_free(vm); +} + +int main(int argc, char *argv[]) +{ + TEST_REQUIRE(kvm_cpu_has(X86_FEATURE_VMX) || kvm_cpu_has(X86_FEATURE_SVM)); + TEST_REQUIRE(kvm_cpu_has_tdp()); + + run_test(TEST_FINAL_PAGE_UNMAPPED); + run_test(TEST_PT_PAGE_UNMAPPED); + run_test(TEST_FINAL_PAGE_WRITE_PROTECTED); + run_test(TEST_PT_PAGE_WRITE_PROTECTED); + + return 0; +} diff --git a/tools/testing/selftests/kvm/x86/pmu_event_filter_test.c b/tools/testing/selftests/kvm/x86/pmu_event_filter_test.c index c1232344fda8..84e4c6ca67a3 100644 --- a/tools/testing/selftests/kvm/x86/pmu_event_filter_test.c +++ 
b/tools/testing/selftests/kvm/x86/pmu_event_filter_test.c @@ -731,6 +731,8 @@ static void test_filter_ioctl(struct kvm_vcpu *vcpu) static void intel_run_fixed_counter_guest_code(u8 idx) { + u8 nr_fixed_counters = this_cpu_property(X86_PROPERTY_PMU_NR_FIXED_COUNTERS); + for (;;) { wrmsr(MSR_CORE_PERF_GLOBAL_CTRL, 0); wrmsr(MSR_CORE_PERF_FIXED_CTR0 + idx, 0); @@ -738,6 +740,10 @@ static void intel_run_fixed_counter_guest_code(u8 idx) /* Only OS_EN bit is enabled for fixed counter[idx]. */ wrmsr(MSR_CORE_PERF_FIXED_CTR_CTRL, FIXED_PMC_CTRL(idx, FIXED_PMC_KERNEL)); wrmsr(MSR_CORE_PERF_GLOBAL_CTRL, FIXED_PMC_GLOBAL_CTRL_ENABLE(idx)); + if (nr_fixed_counters > 1) + wrmsr(MSR_CORE_PERF_FIXED_CTR_CTRL, + FIXED_PMC_CTRL(idx, FIXED_PMC_KERNEL) | + FIXED_PMC_CTRL((idx + 1) % nr_fixed_counters, FIXED_PMC_KERNEL)); __asm__ __volatile__("loop ." : "+c"((int){NUM_BRANCHES})); wrmsr(MSR_CORE_PERF_GLOBAL_CTRL, 0); diff --git a/tools/testing/selftests/kvm/x86/sev_dbg_test.c b/tools/testing/selftests/kvm/x86/sev_dbg_test.c new file mode 100644 index 000000000000..a9d8e4c059f9 --- /dev/null +++ b/tools/testing/selftests/kvm/x86/sev_dbg_test.c @@ -0,0 +1,118 @@ +// SPDX-License-Identifier: GPL-2.0-only +#include <fcntl.h> +#include <string.h> +#include <sys/ioctl.h> + +#include "test_util.h" +#include "kvm_util.h" +#include "processor.h" +#include "sev.h" + +#define BUFFER_SIZE (PAGE_SIZE * 2) + +static u8 *data; +static u8 src[BUFFER_SIZE] __aligned(PAGE_SIZE); +static u8 dst[BUFFER_SIZE] __aligned(PAGE_SIZE); + +static void validate_dst(int i, int nr_bytes, u8 pattern) +{ + for ( ; i < nr_bytes; i++) + TEST_ASSERT(dst[i] == pattern, + "Expected 0x%x at byte %u, got 0x%x", + pattern, i, dst[i]); +} + +static void validate_buffers(void) +{ + int i; + + for (i = 0; i < BUFFER_SIZE; i++) + TEST_ASSERT(src[i] == dst[i], + "Expected src[%u] (0x%x) == dst[%u] (0x%x)", + i, src[i], i, dst[i]); +} + +static void ____test_sev_dbg(struct kvm_vm *vm, int i, int j, int nr_bytes) +{ + u8 pattern = 
guest_random_u32(&guest_rng); + + if (i + nr_bytes > BUFFER_SIZE || j + nr_bytes > BUFFER_SIZE) + return; + + memset(&src[i], pattern, nr_bytes); + sev_encrypt_memory(vm, &data[j], &src[i], nr_bytes); + sev_decrypt_memory(vm, &dst[i], &data[j], nr_bytes); + validate_buffers(); + validate_dst(i, nr_bytes, pattern); +} + +static void __test_sev_dbg(struct kvm_vm *vm, int nr_bytes) +{ + /* + * In a perfect world, all sizes at all combinations within the buffers + * would be tested. In reality, even this much testing is quite slow. + * Target sizes and offsets around the chunk (16 bytes) and page (4096 + * bytes) sizes. + */ + int x[] = { 1, 8, 15, 16, 23 }; + int p = PAGE_SIZE - 24; + int i, j; + + ____test_sev_dbg(vm, 0, 0, nr_bytes); + + for (i = 0; i < ARRAY_SIZE(x); i++) { + for (j = 0; j < ARRAY_SIZE(x); j++) { + ____test_sev_dbg(vm, x[i], x[j], nr_bytes); + ____test_sev_dbg(vm, x[i], p + x[j], nr_bytes); + ____test_sev_dbg(vm, p + x[i], x[j], nr_bytes); + ____test_sev_dbg(vm, p + x[i], p + x[j], nr_bytes); + } + } +} + +static void test_sev_dbg(u32 type, u64 policy) +{ + int sizes[] = { 1, 8, 15, 16, 17, 32, 33 }; + struct kvm_vcpu *vcpu; + struct kvm_vm *vm; + int i; + + if (!(kvm_check_cap(KVM_CAP_VM_TYPES) & BIT(type))) + return; + + vm = vm_sev_create_with_one_vcpu(type, NULL, &vcpu); + + data = addr_gva2hva(vm, vm_alloc(vm, BUFFER_SIZE, KVM_UTIL_MIN_VADDR)); + memset(data, 0xaa, BUFFER_SIZE); + + vm_sev_launch(vm, policy, NULL); + + sev_decrypt_memory(vm, dst, data, BUFFER_SIZE); + validate_dst(0, BUFFER_SIZE, 0xaa); + + memset(src, 0x55, BUFFER_SIZE); + sev_encrypt_memory(vm, data, src, BUFFER_SIZE); + sev_decrypt_memory(vm, dst, data, BUFFER_SIZE); + validate_dst(0, BUFFER_SIZE, 0x55); + + __test_sev_dbg(vm, PAGE_SIZE); + + for (i = 0; i < ARRAY_SIZE(sizes); i++) { + __test_sev_dbg(vm, sizes[i]); + __test_sev_dbg(vm, PAGE_SIZE - sizes[i]); + __test_sev_dbg(vm, PAGE_SIZE + sizes[i]); + __test_sev_dbg(vm, BUFFER_SIZE - sizes[i]); + } + + kvm_vm_free(vm); +} 
+ +int main(int argc, char *argv[]) +{ + TEST_REQUIRE(kvm_cpu_has(X86_FEATURE_SEV)); + + /* Note, KVM doesn't support {de,en}crypt commands for SNP. */ + test_sev_dbg(KVM_X86_SEV_VM, 0); + test_sev_dbg(KVM_X86_SEV_ES_VM, SEV_POLICY_ES); + return 0; +} diff --git a/tools/testing/selftests/kvm/x86/sev_init2_tests.c b/tools/testing/selftests/kvm/x86/sev_init2_tests.c index 8eeba2327c7c..8db88c355f16 100644 --- a/tools/testing/selftests/kvm/x86/sev_init2_tests.c +++ b/tools/testing/selftests/kvm/x86/sev_init2_tests.c @@ -136,16 +136,14 @@ int main(int argc, char *argv[]) kvm_check_cap(KVM_CAP_VM_TYPES), 1 << KVM_X86_SEV_VM); TEST_REQUIRE(kvm_check_cap(KVM_CAP_VM_TYPES) & BIT(KVM_X86_SEV_VM)); - have_sev_es = kvm_cpu_has(X86_FEATURE_SEV_ES); + have_sev_es = kvm_check_cap(KVM_CAP_VM_TYPES) & BIT(KVM_X86_SEV_ES_VM); - TEST_ASSERT(have_sev_es == !!(kvm_check_cap(KVM_CAP_VM_TYPES) & BIT(KVM_X86_SEV_ES_VM)), - "sev-es: KVM_CAP_VM_TYPES (%x) does not match cpuid (checking %x)", - kvm_check_cap(KVM_CAP_VM_TYPES), 1 << KVM_X86_SEV_ES_VM); + TEST_ASSERT(!have_sev_es || kvm_cpu_has(X86_FEATURE_SEV_ES), + "sev-es: SEV_ES_VM supported without SEV_ES in CPUID"); - have_snp = kvm_cpu_has(X86_FEATURE_SEV_SNP); - TEST_ASSERT(have_snp == !!(kvm_check_cap(KVM_CAP_VM_TYPES) & BIT(KVM_X86_SNP_VM)), - "sev-snp: KVM_CAP_VM_TYPES (%x) indicates SNP support (bit %d), but CPUID does not", - kvm_check_cap(KVM_CAP_VM_TYPES), KVM_X86_SNP_VM); + have_snp = kvm_check_cap(KVM_CAP_VM_TYPES) & BIT(KVM_X86_SNP_VM); + TEST_ASSERT(!have_snp || kvm_cpu_has(X86_FEATURE_SEV_SNP), + "sev-snp: SNP_VM supported without SEV_SNP in CPUID"); test_vm_types(); diff --git a/tools/testing/selftests/kvm/x86/sev_migrate_tests.c b/tools/testing/selftests/kvm/x86/sev_migrate_tests.c index 6b0928e69051..42bc023d5193 100644 --- a/tools/testing/selftests/kvm/x86/sev_migrate_tests.c +++ b/tools/testing/selftests/kvm/x86/sev_migrate_tests.c @@ -374,7 +374,7 @@ int main(int argc, char *argv[]) 
TEST_REQUIRE(kvm_cpu_has(X86_FEATURE_SEV)); - have_sev_es = kvm_cpu_has(X86_FEATURE_SEV_ES); + have_sev_es = kvm_check_cap(KVM_CAP_VM_TYPES) & BIT(KVM_X86_SEV_ES_VM); if (kvm_has_cap(KVM_CAP_VM_MOVE_ENC_CONTEXT_FROM)) { test_sev_migrate_from(/* es= */ false); diff --git a/tools/testing/selftests/kvm/x86/sev_smoke_test.c b/tools/testing/selftests/kvm/x86/sev_smoke_test.c index 1a49ee391586..6b2cbe2a90b7 100644 --- a/tools/testing/selftests/kvm/x86/sev_smoke_test.c +++ b/tools/testing/selftests/kvm/x86/sev_smoke_test.c @@ -249,10 +249,10 @@ int main(int argc, char *argv[]) test_sev_smoke(guest_sev_code, KVM_X86_SEV_VM, 0); - if (kvm_cpu_has(X86_FEATURE_SEV_ES)) + if (kvm_check_cap(KVM_CAP_VM_TYPES) & BIT(KVM_X86_SEV_ES_VM)) test_sev_smoke(guest_sev_es_code, KVM_X86_SEV_ES_VM, SEV_POLICY_ES); - if (kvm_cpu_has(X86_FEATURE_SEV_SNP)) + if (kvm_check_cap(KVM_CAP_VM_TYPES) & BIT(KVM_X86_SNP_VM)) test_sev_smoke(guest_snp_code, KVM_X86_SNP_VM, snp_default_policy()); return 0; diff --git a/tools/testing/selftests/kvm/x86/svm_nested_pat_test.c b/tools/testing/selftests/kvm/x86/svm_nested_pat_test.c new file mode 100644 index 000000000000..92da8ff34da1 --- /dev/null +++ b/tools/testing/selftests/kvm/x86/svm_nested_pat_test.c @@ -0,0 +1,196 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Copyright (C) 2026, Google LLC. 
+ * + * Test that KVM correctly virtualizes the PAT MSR and VMCB g_pat field + * for nested SVM guests: + * + * o With nested NPT disabled: + * - L1 and L2 share the same PAT + * - The vmcb12.g_pat is ignored + * o With nested NPT enabled: + * - Invalid g_pat in vmcb12 should cause VMEXIT_INVALID + * - L2 should see vmcb12.g_pat via RDMSR, not L1's PAT + * - L2's writes to PAT should be saved to vmcb12 on exit + * - L1's PAT should be restored after #VMEXIT from L2 + * - State save/restore should preserve both L1's and L2's PAT values + */ +#include <fcntl.h> +#include <stdio.h> +#include <stdlib.h> +#include <string.h> + +#include "test_util.h" +#include "kvm_util.h" +#include "processor.h" +#include "svm_util.h" + +#define L2_GUEST_STACK_SIZE 256 + +#define PAT_DEFAULT 0x0007040600070406ULL +#define L1_PAT_VALUE 0x0007040600070404ULL /* Change PA0 to WT */ +#define L2_VMCB12_PAT 0x0606060606060606ULL /* All WB */ +#define L2_PAT_MODIFIED 0x0606060606060604ULL /* Change PA0 to WT */ +#define INVALID_PAT_VALUE 0x0808080808080808ULL /* 8 is reserved */ + +bool npt_enabled; +int nr_iterations; + +static void l2_guest_code(void) +{ + u64 expected_pat = npt_enabled ? 
L2_VMCB12_PAT : L1_PAT_VALUE; + int i; + + for (i = 0; i < nr_iterations; i++) { + GUEST_ASSERT_EQ(rdmsr(MSR_IA32_CR_PAT), expected_pat); + GUEST_SYNC(1); + GUEST_ASSERT_EQ(rdmsr(MSR_IA32_CR_PAT), expected_pat); + + wrmsr(MSR_IA32_CR_PAT, L2_PAT_MODIFIED); + expected_pat = L2_PAT_MODIFIED; + + GUEST_ASSERT_EQ(rdmsr(MSR_IA32_CR_PAT), L2_PAT_MODIFIED); + GUEST_SYNC(2); + GUEST_ASSERT_EQ(rdmsr(MSR_IA32_CR_PAT), L2_PAT_MODIFIED); + + vmmcall(); + } +} + +static void l1_guest_code(struct svm_test_data *svm) +{ + unsigned long l2_guest_stack[L2_GUEST_STACK_SIZE]; + struct vmcb *vmcb = svm->vmcb; + int i; + + wrmsr(MSR_IA32_CR_PAT, L1_PAT_VALUE); + GUEST_ASSERT_EQ(rdmsr(MSR_IA32_CR_PAT), L1_PAT_VALUE); + + generic_svm_setup(svm, l2_guest_code, &l2_guest_stack[L2_GUEST_STACK_SIZE]); + + vmcb->save.g_pat = L2_VMCB12_PAT; + vmcb->control.intercept &= ~(1ULL << INTERCEPT_MSR_PROT); + + for (i = 0; i < nr_iterations; i++) { + run_guest(vmcb, svm->vmcb_gpa); + + GUEST_ASSERT_EQ(vmcb->control.exit_code, SVM_EXIT_VMMCALL); + + /* + * If NPT is enabled by L1, L2 has a unique PAT and L1's PAT is + * unchanged. Otherwise, PAT is shared between L1 and L2. 
+ */ + if (npt_enabled) { + GUEST_ASSERT_EQ(vmcb->save.g_pat, L2_PAT_MODIFIED); + GUEST_ASSERT_EQ(rdmsr(MSR_IA32_CR_PAT), L1_PAT_VALUE); + } else { + GUEST_ASSERT_EQ(rdmsr(MSR_IA32_CR_PAT), L2_PAT_MODIFIED); + } + vmcb->save.rip += 3; /* skip over VMMCALL */ + } + + GUEST_DONE(); +} + +static void l1_guest_code_invalid_gpat(struct svm_test_data *svm) +{ + unsigned long l2_guest_stack[L2_GUEST_STACK_SIZE]; + struct vmcb *vmcb = svm->vmcb; + + /* VMRUN should fail without running L2 */ + generic_svm_setup(svm, NULL, &l2_guest_stack[L2_GUEST_STACK_SIZE]); + + vmcb->save.g_pat = INVALID_PAT_VALUE; + run_guest(vmcb, svm->vmcb_gpa); + + GUEST_ASSERT_EQ(vmcb->control.exit_code, SVM_EXIT_ERR); + GUEST_DONE(); +} + +static void run_test(void *guest_code, bool do_save_restore, int nr_iters) +{ + struct kvm_x86_state *state; + struct kvm_vcpu *vcpu; + struct kvm_vm *vm; + struct ucall uc; + gva_t svm_gva; + + vm = vm_create_with_one_vcpu(&vcpu, guest_code); + vm_enable_cap(vm, KVM_CAP_DISABLE_QUIRKS2, + KVM_X86_QUIRK_NESTED_SVM_SHARED_PAT); + + if (npt_enabled) + vm_enable_npt(vm); + + vcpu_alloc_svm(vm, &svm_gva); + + if (npt_enabled) + tdp_identity_map_default_memslots(vm); + + vcpu_args_set(vcpu, 1, svm_gva); + + nr_iterations = nr_iters; + sync_global_to_guest(vm, npt_enabled); + sync_global_to_guest(vm, nr_iterations); + + for (;;) { + vcpu_run(vcpu); + TEST_ASSERT_KVM_EXIT_REASON(vcpu, KVM_EXIT_IO); + + switch (get_ucall(vcpu, &uc)) { + case UCALL_ABORT: + REPORT_GUEST_ASSERT(uc); + /* NOT REACHED */ + case UCALL_SYNC: + if (do_save_restore) { + state = vcpu_save_state(vcpu); + kvm_vm_release(vm); + vcpu = vm_recreate_with_one_vcpu(vm); + vm_enable_cap(vm, KVM_CAP_DISABLE_QUIRKS2, + KVM_X86_QUIRK_NESTED_SVM_SHARED_PAT); + vcpu_load_state(vcpu, state); + kvm_x86_state_cleanup(state); + } + break; + case UCALL_DONE: + kvm_vm_free(vm); + return; + default: + TEST_FAIL("Unknown ucall %lu", uc.cmd); + } + } +} + +#define gpat_test(test_name, guest_code, npt_setting) \ +do { 
\ + npt_setting; \ + \ + if (npt_enabled && !kvm_cpu_has(X86_FEATURE_NPT)) { \ + pr_info("Skipping: " test_name " (no NPT support)\n"); \ + break; \ + } \ + \ + pr_info("Testing: " test_name "\n"); \ + run_test(guest_code, false, 1); \ + \ + if (guest_code == l1_guest_code) { \ + pr_info("Testing: " test_name " Save/Restore\n"); \ + run_test(guest_code, true, 1); \ + \ + pr_info("Testing: " test_name " Multiple VMRUNs\n"); \ + run_test(guest_code, false, 10); \ + } \ +} while (0) + +int main(int argc, char *argv[]) +{ + TEST_REQUIRE(kvm_cpu_has(X86_FEATURE_SVM)); + TEST_REQUIRE(kvm_has_cap(KVM_CAP_NESTED_STATE)); + TEST_REQUIRE(kvm_check_cap(KVM_CAP_DISABLE_QUIRKS2) & + KVM_X86_QUIRK_NESTED_SVM_SHARED_PAT); + + gpat_test("Invalid gPAT", l1_guest_code_invalid_gpat, npt_enabled = true); + gpat_test("Nested NPT enabled", l1_guest_code, npt_enabled = true); + gpat_test("Nested NPT disabled", l1_guest_code, npt_enabled = false); + return 0; +} diff --git a/tools/testing/selftests/kvm/x86/sync_regs_test.c b/tools/testing/selftests/kvm/x86/sync_regs_test.c index e0c52321f87c..5b0c2359bbb4 100644 --- a/tools/testing/selftests/kvm/x86/sync_regs_test.c +++ b/tools/testing/selftests/kvm/x86/sync_regs_test.c @@ -255,7 +255,6 @@ KVM_ONE_VCPU_TEST(sync_regs_test, req_and_verify_all_valid, guest_code) struct kvm_regs regs; /* Request and verify all valid register sets. 
*/ - /* TODO: BUILD TIME CHECK: TEST_ASSERT(KVM_SYNC_X86_NUM_FIELDS != 3); */ run->kvm_valid_regs = TEST_SYNC_FIELDS; vcpu_run(vcpu); TEST_ASSERT_KVM_EXIT_REASON(vcpu, KVM_EXIT_IO); diff --git a/tools/testing/selftests/landlock/audit.h b/tools/testing/selftests/landlock/audit.h index 834005b2b0f0..f45fdef35681 100644 --- a/tools/testing/selftests/landlock/audit.h +++ b/tools/testing/selftests/landlock/audit.h @@ -45,17 +45,25 @@ struct audit_message { }; }; -static const struct timeval audit_tv_dom_drop = { +static const struct timeval audit_tv_default = { /* - * Because domain deallocation is tied to asynchronous credential - * freeing, receiving such event may take some time. In practice, - * on a small VM, it should not exceed 100k usec, but let's wait up - * to 1 second to be safe. + * Default socket timeout for audit_match_record() callers that expect a + * record to arrive. Asynchronous kauditd delivery can exceed 1 usec + * under heavy debug configs (KASAN, lockdep), where kauditd_thread + * scheduling between audit_log_end() and netlink_unicast() takes longer + * than the previous 1 usec timeout. 1 second is a generous ceiling: on + * the happy path, kauditd delivers within dozens of usec. */ .tv_sec = 1, }; -static const struct timeval audit_tv_default = { +static const struct timeval audit_tv_fast = { + /* + * Fast timeout for paths that expect no record (audit_init() drain, + * audit_count_records(), probes). Causes audit_recv() to return + * -EAGAIN once the socket buffer is empty, naturally terminating the + * read loop. + */ .tv_usec = 1, }; @@ -334,8 +342,13 @@ static int __maybe_unused matches_log_domain_allocated(int audit_fd, pid_t pid, * Matches a domain deallocation record. 
When expected_domain_id is non-zero, * the pattern includes the specific domain ID so that stale deallocation * records from a previous test (with a different domain ID) are skipped by - * audit_match_record(), and the socket timeout is temporarily increased to - * audit_tv_dom_drop to wait for the asynchronous kworker deallocation. + * audit_match_record(), waiting for the asynchronous kworker deallocation with + * the default patient timeout. + * + * When expected_domain_id is zero, the caller is probing for any dealloc record + * that may or may not arrive. Temporarily lowers the socket timeout to + * audit_tv_fast for this probe so it returns promptly when no record is + * pending; restores audit_tv_default after. */ static int __maybe_unused matches_log_domain_deallocated(int audit_fd, unsigned int num_denials, @@ -361,16 +374,21 @@ matches_log_domain_deallocated(int audit_fd, unsigned int num_denials, if (log_match_len >= sizeof(log_match)) return -E2BIG; - if (expected_domain_id) - setsockopt(audit_fd, SOL_SOCKET, SO_RCVTIMEO, - &audit_tv_dom_drop, sizeof(audit_tv_dom_drop)); + if (!expected_domain_id) { + if (setsockopt(audit_fd, SOL_SOCKET, SO_RCVTIMEO, + &audit_tv_fast, sizeof(audit_tv_fast))) + return -errno; + } err = audit_match_record(audit_fd, AUDIT_LANDLOCK_DOMAIN, log_match, domain_id); - if (expected_domain_id) - setsockopt(audit_fd, SOL_SOCKET, SO_RCVTIMEO, &audit_tv_default, - sizeof(audit_tv_default)); + if (!expected_domain_id) { + if (setsockopt(audit_fd, SOL_SOCKET, SO_RCVTIMEO, + &audit_tv_default, sizeof(audit_tv_default)) && + !err) + err = -errno; + } return err; } @@ -381,30 +399,46 @@ struct audit_records { }; /* - * WARNING: Do not assert records.domain == 0 without a preceding - * audit_match_record() call. Domain deallocation records are emitted - * asynchronously from kworker threads and can arrive after the drain in - * audit_init(), corrupting the domain count. 
A preceding audit_match_record() - * call consumes stale records while scanning, making the assertion safe in - * practice because stale deallocation records arrive before the expected access - * records. + * Counts remaining audit records by type, skipping domain deallocation records. + * Deallocation records are emitted asynchronously from kworker threads after a + * previous test's child has exited, so they can arrive after the drain in + * audit_init() and after the preceding audit_match_record() call. Allocation + * records are emitted synchronously during landlock_log_denial() in the current + * test's syscall context, so only those are counted in records->domain. + * + * Temporarily lowers SO_RCVTIMEO to audit_tv_fast for the read loop: this is a + * "no record expected" path that should terminate on the first -EAGAIN. The + * default patient timeout is restored on exit for subsequent + * audit_match_record() callers. */ static int audit_count_records(int audit_fd, struct audit_records *records) { + static const char dealloc_pattern[] = REGEX_LANDLOCK_PREFIX + " status=deallocated "; struct audit_message msg; - int err; + regex_t dealloc_re; + int ret, err = 0; + + ret = regcomp(&dealloc_re, dealloc_pattern, 0); + if (ret) + return -ENOMEM; records->access = 0; records->domain = 0; + if (setsockopt(audit_fd, SOL_SOCKET, SO_RCVTIMEO, &audit_tv_fast, + sizeof(audit_tv_fast))) { + err = -errno; + goto out; + } + do { memset(&msg, 0, sizeof(msg)); err = audit_recv(audit_fd, &msg); if (err) { if (err == -EAGAIN) - return 0; - else - return err; + err = 0; + break; } switch (msg.header.nlmsg_type) { @@ -412,12 +446,24 @@ static int audit_count_records(int audit_fd, struct audit_records *records) records->access++; break; case AUDIT_LANDLOCK_DOMAIN: - records->domain++; + ret = regexec(&dealloc_re, msg.data, 0, NULL, 0); + if (ret == REG_NOMATCH) { + records->domain++; + } else if (ret != 0) { + err = -EIO; + goto out; + } break; } } while (true); - return 0; +out: 
+ if (setsockopt(audit_fd, SOL_SOCKET, SO_RCVTIMEO, &audit_tv_default, + sizeof(audit_tv_default)) && + !err) + err = -errno; + regfree(&dealloc_re); + return err; } static int audit_init(void) @@ -436,9 +482,9 @@ static int audit_init(void) if (err) goto err_close; - /* Sets a timeout for negative tests. */ - err = setsockopt(fd, SOL_SOCKET, SO_RCVTIMEO, &audit_tv_default, - sizeof(audit_tv_default)); + /* Uses the fast timeout to drain stale records below. */ + err = setsockopt(fd, SOL_SOCKET, SO_RCVTIMEO, &audit_tv_fast, + sizeof(audit_tv_fast)); if (err) { err = -errno; goto err_close; @@ -454,6 +500,19 @@ static int audit_init(void) while (audit_recv(fd, NULL) == 0) ; + /* + * Restores the default timeout for audit_match_record() callers that + * expect a record to arrive. Paths that expect no record restore the + * fast timeout locally (audit_count_records(), the expected_domain_id + * == 0 probe in matches_log_domain_deallocated()). + */ + err = setsockopt(fd, SOL_SOCKET, SO_RCVTIMEO, &audit_tv_default, + sizeof(audit_tv_default)); + if (err) { + err = -errno; + goto err_close; + } + return fd; err_close: @@ -494,10 +553,9 @@ static int audit_init_filter_exe(struct audit_filter *filter, const char *path) static int audit_cleanup(int audit_fd, struct audit_filter *filter) { struct audit_filter new_filter; + int err = 0; if (audit_fd < 0 || !filter) { - int err; - /* * Simulates audit_init_with_exe_filter() when called from * FIXTURE_TEARDOWN_PARENT(). @@ -508,23 +566,19 @@ static int audit_cleanup(int audit_fd, struct audit_filter *filter) filter = &new_filter; err = audit_init_filter_exe(filter, NULL); - if (err) { - close(audit_fd); - return err; - } + if (err) + goto err_close; } /* Filters might not be in place. */ audit_filter_exe(audit_fd, filter, AUDIT_DEL_RULE); audit_filter_drop(audit_fd, AUDIT_DEL_RULE); - /* - * Because audit_cleanup() might not be called by the test auditd - * process, it might not be possible to explicitly set it. 
Anyway, - * AUDIT_STATUS_ENABLED will implicitly be set to 0 when the auditd - * process will exit. - */ - return close(audit_fd); + err = audit_set_status(audit_fd, AUDIT_STATUS_ENABLED, 0); + +err_close: + close(audit_fd); + return err; } static int audit_init_with_exe_filter(struct audit_filter *filter) diff --git a/tools/testing/selftests/landlock/audit_test.c b/tools/testing/selftests/landlock/audit_test.c index 93ae5bd0dcce..72b5612375dd 100644 --- a/tools/testing/selftests/landlock/audit_test.c +++ b/tools/testing/selftests/landlock/audit_test.c @@ -76,7 +76,7 @@ TEST_F(audit, layers) .scoped = LANDLOCK_SCOPE_SIGNAL, }; int status, ruleset_fd, i; - __u64(*domain_stack)[16]; + __u64(*domain_stack)[LANDLOCK_MAX_NUM_LAYERS]; __u64 prev_dom = 3; pid_t child; @@ -607,30 +607,42 @@ FIXTURE(audit_flags) FIXTURE_VARIANT(audit_flags) { const int restrict_flags; + const __u64 quiet_scoped; }; /* clang-format off */ FIXTURE_VARIANT_ADD(audit_flags, default) { /* clang-format on */ .restrict_flags = 0, + .quiet_scoped = 0, }; /* clang-format off */ FIXTURE_VARIANT_ADD(audit_flags, same_exec_off) { /* clang-format on */ .restrict_flags = LANDLOCK_RESTRICT_SELF_LOG_SAME_EXEC_OFF, + .quiet_scoped = 0, }; /* clang-format off */ FIXTURE_VARIANT_ADD(audit_flags, subdomains_off) { /* clang-format on */ .restrict_flags = LANDLOCK_RESTRICT_SELF_LOG_SUBDOMAINS_OFF, + .quiet_scoped = 0, }; /* clang-format off */ FIXTURE_VARIANT_ADD(audit_flags, cross_exec_on) { /* clang-format on */ .restrict_flags = LANDLOCK_RESTRICT_SELF_LOG_NEW_EXEC_ON, + .quiet_scoped = 0, +}; + +/* clang-format off */ +FIXTURE_VARIANT_ADD(audit_flags, signal_quieted) { + /* clang-format on */ + .restrict_flags = 0, + .quiet_scoped = LANDLOCK_SCOPE_SIGNAL, }; FIXTURE_SETUP(audit_flags) @@ -674,12 +686,16 @@ TEST_F(audit_flags, signal) pid_t child; struct audit_records records; __u64 deallocated_dom = 2; + bool expect_audit = !(variant->restrict_flags & + LANDLOCK_RESTRICT_SELF_LOG_SAME_EXEC_OFF) && + 
!(variant->quiet_scoped & LANDLOCK_SCOPE_SIGNAL); child = fork(); ASSERT_LE(0, child); if (child == 0) { const struct landlock_ruleset_attr ruleset_attr = { .scoped = LANDLOCK_SCOPE_SIGNAL, + .quiet_scoped = variant->quiet_scoped, }; int ruleset_fd; @@ -696,8 +712,7 @@ TEST_F(audit_flags, signal) EXPECT_EQ(-1, kill(getppid(), 0)); EXPECT_EQ(EPERM, errno); - if (variant->restrict_flags & - LANDLOCK_RESTRICT_SELF_LOG_SAME_EXEC_OFF) { + if (!expect_audit) { EXPECT_EQ(-EAGAIN, matches_log_signal( _metadata, self->audit_fd, getppid(), self->domain_id)); @@ -724,12 +739,12 @@ TEST_F(audit_flags, signal) /* Makes sure there is no superfluous logged records. */ EXPECT_EQ(0, audit_count_records(self->audit_fd, &records)); - if (variant->restrict_flags & - LANDLOCK_RESTRICT_SELF_LOG_SAME_EXEC_OFF) { + if (!expect_audit) { EXPECT_EQ(0, records.access); } else { EXPECT_EQ(1, records.access); } + EXPECT_EQ(0, records.domain); /* Updates filter rules to match the drop record. */ set_cap(_metadata, CAP_AUDIT_CONTROL); @@ -748,8 +763,7 @@ TEST_F(audit_flags, signal) WEXITSTATUS(status) != EXIT_SUCCESS) _metadata->exit_code = KSFT_FAIL; - if (variant->restrict_flags & - LANDLOCK_RESTRICT_SELF_LOG_SAME_EXEC_OFF) { + if (!expect_audit) { /* * No deallocation record: denials=0 never matches a real * record. @@ -849,10 +863,8 @@ FIXTURE_SETUP(audit_exec) FIXTURE_TEARDOWN(audit_exec) { set_cap(_metadata, CAP_AUDIT_CONTROL); - EXPECT_EQ(0, audit_filter_exe(self->audit_fd, &self->audit_filter, - AUDIT_DEL_RULE)); + EXPECT_EQ(0, audit_cleanup(self->audit_fd, &self->audit_filter)); clear_cap(_metadata, CAP_AUDIT_CONTROL); - EXPECT_EQ(0, close(self->audit_fd)); } TEST_F(audit_exec, signal_and_open) @@ -917,6 +929,7 @@ TEST_F(audit_exec, signal_and_open) /* Tests that there was no denial until now. 
*/ EXPECT_EQ(0, audit_count_records(self->audit_fd, &records)); EXPECT_EQ(0, records.access); + EXPECT_EQ(0, records.domain); /* * Wait for the child to do a first denied action by layer1 and diff --git a/tools/testing/selftests/landlock/base_test.c b/tools/testing/selftests/landlock/base_test.c index 30d37234086c..cbd3c1669951 100644 --- a/tools/testing/selftests/landlock/base_test.c +++ b/tools/testing/selftests/landlock/base_test.c @@ -76,8 +76,8 @@ TEST(abi_version) const struct landlock_ruleset_attr ruleset_attr = { .handled_access_fs = LANDLOCK_ACCESS_FS_READ_FILE, }; - ASSERT_EQ(9, landlock_create_ruleset(NULL, 0, - LANDLOCK_CREATE_RULESET_VERSION)); + ASSERT_EQ(10, landlock_create_ruleset(NULL, 0, + LANDLOCK_CREATE_RULESET_VERSION)); ASSERT_EQ(-1, landlock_create_ruleset(&ruleset_attr, 0, LANDLOCK_CREATE_RULESET_VERSION)); @@ -201,7 +201,7 @@ TEST(add_rule_checks_ordering) ASSERT_LE(0, ruleset_fd); /* Checks invalid flags. */ - ASSERT_EQ(-1, landlock_add_rule(-1, 0, NULL, 1)); + ASSERT_EQ(-1, landlock_add_rule(-1, 0, NULL, 100)); ASSERT_EQ(EINVAL, errno); /* Checks invalid ruleset FD. */ @@ -526,4 +526,120 @@ TEST(cred_transfer) EXPECT_EQ(EACCES, errno); } +TEST(useless_quiet_rule_fs) +{ + struct landlock_ruleset_attr ruleset_attr = { + .handled_access_fs = LANDLOCK_ACCESS_FS_READ_DIR, + .quiet_access_fs = 0, + }; + struct landlock_path_beneath_attr path_beneath_attr = { + .allowed_access = LANDLOCK_ACCESS_FS_READ_DIR, + }; + int ruleset_fd, root_fd; + + drop_caps(_metadata); + ruleset_fd = + landlock_create_ruleset(&ruleset_attr, sizeof(ruleset_attr), 0); + ASSERT_LE(0, ruleset_fd); + + root_fd = open("/", O_PATH | O_CLOEXEC); + ASSERT_LE(0, root_fd); + path_beneath_attr.parent_fd = root_fd; + ASSERT_EQ(-1, landlock_add_rule(ruleset_fd, LANDLOCK_RULE_PATH_BENEATH, + &path_beneath_attr, + LANDLOCK_ADD_RULE_QUIET)); + ASSERT_EQ(EINVAL, errno); + + /* Check that the rule had not been added. 
*/ + ASSERT_EQ(0, close(root_fd)); + enforce_ruleset(_metadata, ruleset_fd); + ASSERT_EQ(0, close(ruleset_fd)); + + ASSERT_EQ(-1, open("/", O_RDONLY | O_DIRECTORY | O_CLOEXEC)); + ASSERT_EQ(EACCES, errno); +} + +TEST(useless_quiet_rule_net) +{ + struct landlock_ruleset_attr ruleset_attr = { + .handled_access_net = LANDLOCK_ACCESS_NET_BIND_TCP, + .quiet_access_net = 0, + }; + struct landlock_net_port_attr net_port_attr = { + .allowed_access = LANDLOCK_ACCESS_NET_BIND_TCP, + .port = 1024, + }; + int ruleset_fd; + + drop_caps(_metadata); + ruleset_fd = + landlock_create_ruleset(&ruleset_attr, sizeof(ruleset_attr), 0); + ASSERT_LE(0, ruleset_fd); + + ASSERT_EQ(-1, + landlock_add_rule(ruleset_fd, LANDLOCK_RULE_NET_PORT, + &net_port_attr, LANDLOCK_ADD_RULE_QUIET)); + ASSERT_EQ(EINVAL, errno); + + ASSERT_EQ(0, close(ruleset_fd)); +} + +TEST(invalid_quiet_bits_1) +{ + const struct landlock_ruleset_attr ruleset_attr_fs = { + .handled_access_fs = LANDLOCK_ACCESS_FS_READ_DIR, + .quiet_access_fs = LANDLOCK_ACCESS_FS_WRITE_FILE, + }; + const struct landlock_ruleset_attr ruleset_attr_net = { + .handled_access_net = LANDLOCK_ACCESS_NET_BIND_TCP, + .quiet_access_net = LANDLOCK_ACCESS_NET_CONNECT_TCP, + }; + const struct landlock_ruleset_attr ruleset_attr_scoped = { + .scoped = LANDLOCK_SCOPE_ABSTRACT_UNIX_SOCKET, + .quiet_scoped = LANDLOCK_SCOPE_SIGNAL, + }; + + /* Quiet bit set but not part of the handled mask. 
*/ + ASSERT_EQ(-1, landlock_create_ruleset(&ruleset_attr_fs, + sizeof(ruleset_attr_fs), 0)); + ASSERT_EQ(EINVAL, errno); + + ASSERT_EQ(-1, landlock_create_ruleset(&ruleset_attr_net, + sizeof(ruleset_attr_net), 0)); + ASSERT_EQ(EINVAL, errno); + + ASSERT_EQ(-1, landlock_create_ruleset(&ruleset_attr_scoped, + sizeof(ruleset_attr_scoped), 0)); + ASSERT_EQ(EINVAL, errno); +} + +TEST(invalid_quiet_bits_2) +{ + const struct landlock_ruleset_attr ruleset_attr_fs = { + .handled_access_fs = LANDLOCK_ACCESS_FS_READ_DIR, + .quiet_access_fs = 1ULL << 63, + }; + const struct landlock_ruleset_attr ruleset_attr_net = { + .handled_access_net = LANDLOCK_ACCESS_NET_BIND_TCP, + .quiet_access_net = 1ULL << 63, + }; + const struct landlock_ruleset_attr ruleset_attr_scoped = { + .scoped = LANDLOCK_SCOPE_ABSTRACT_UNIX_SOCKET, + .quiet_scoped = 1ULL << 63, + }; + + /* Quiet bit outside of the valid access range. */ + ASSERT_EQ(-1, landlock_create_ruleset(&ruleset_attr_fs, + sizeof(ruleset_attr_fs), 0)); + ASSERT_EQ(EINVAL, errno); + + ASSERT_EQ(-1, landlock_create_ruleset(&ruleset_attr_net, + sizeof(ruleset_attr_net), 0)); + ASSERT_EQ(EINVAL, errno); + + ASSERT_EQ(-1, landlock_create_ruleset(&ruleset_attr_scoped, + sizeof(ruleset_attr_scoped), 0)); + ASSERT_EQ(EINVAL, errno); +} + TEST_HARNESS_MAIN diff --git a/tools/testing/selftests/landlock/common.h b/tools/testing/selftests/landlock/common.h index 90551650299c..7206d5105d66 100644 --- a/tools/testing/selftests/landlock/common.h +++ b/tools/testing/selftests/landlock/common.h @@ -25,6 +25,8 @@ /* TEST_F_FORK() should not be used for new tests. 
*/ #define TEST_F_FORK(fixture_name, test_name) TEST_F(fixture_name, test_name) +#define LANDLOCK_MAX_NUM_LAYERS 16 + static const char bin_sandbox_and_launch[] = "./sandbox-and-launch"; static const char bin_wait_pipe[] = "./wait-pipe"; static const char bin_wait_pipe_sandbox[] = "./wait-pipe-sandbox"; diff --git a/tools/testing/selftests/landlock/fs_test.c b/tools/testing/selftests/landlock/fs_test.c index cdb47fc1fc0a..86e08aa6e0a7 100644 --- a/tools/testing/selftests/landlock/fs_test.c +++ b/tools/testing/selftests/landlock/fs_test.c @@ -720,7 +720,7 @@ TEST_F_FORK(layout1, rule_with_unhandled_access) static void add_path_beneath(struct __test_metadata *const _metadata, const int ruleset_fd, const __u64 allowed_access, - const char *const path) + const char *const path, __u32 flags) { struct landlock_path_beneath_attr path_beneath = { .allowed_access = allowed_access, @@ -733,7 +733,7 @@ static void add_path_beneath(struct __test_metadata *const _metadata, strerror(errno)); } ASSERT_EQ(0, landlock_add_rule(ruleset_fd, LANDLOCK_RULE_PATH_BENEATH, - &path_beneath, 0)) + &path_beneath, flags)) { TH_LOG("Failed to update the ruleset with \"%s\": %s", path, strerror(errno)); @@ -780,7 +780,7 @@ static int create_ruleset(struct __test_metadata *const _metadata, continue; add_path_beneath(_metadata, ruleset_fd, rules[i].access, - rules[i].path); + rules[i].path, 0); } return ruleset_fd; } @@ -1310,7 +1310,7 @@ TEST_F_FORK(layout1, inherit_subset) * ANDed with the previous ones. */ add_path_beneath(_metadata, ruleset_fd, LANDLOCK_ACCESS_FS_WRITE_FILE, - dir_s1d2); + dir_s1d2, 0); /* * According to ruleset_fd, dir_s1d2 should now have the * LANDLOCK_ACCESS_FS_READ_FILE and LANDLOCK_ACCESS_FS_WRITE_FILE @@ -1342,7 +1342,7 @@ TEST_F_FORK(layout1, inherit_subset) * Try to get more privileges by adding new access rights to the parent * directory: dir_s1d1. 
*/ - add_path_beneath(_metadata, ruleset_fd, ACCESS_RW, dir_s1d1); + add_path_beneath(_metadata, ruleset_fd, ACCESS_RW, dir_s1d1, 0); enforce_ruleset(_metadata, ruleset_fd); /* Same tests and results as above. */ @@ -1365,7 +1365,7 @@ TEST_F_FORK(layout1, inherit_subset) * that there was no rule tied to it before. */ add_path_beneath(_metadata, ruleset_fd, LANDLOCK_ACCESS_FS_WRITE_FILE, - dir_s1d3); + dir_s1d3, 0); enforce_ruleset(_metadata, ruleset_fd); ASSERT_EQ(0, close(ruleset_fd)); @@ -1417,7 +1417,7 @@ TEST_F_FORK(layout1, inherit_superset) add_path_beneath(_metadata, ruleset_fd, LANDLOCK_ACCESS_FS_READ_FILE | LANDLOCK_ACCESS_FS_READ_DIR, - dir_s1d2); + dir_s1d2, 0); enforce_ruleset(_metadata, ruleset_fd); EXPECT_EQ(0, close(ruleset_fd)); @@ -1441,7 +1441,7 @@ TEST_F_FORK(layout0, max_layers) }; const int ruleset_fd = create_ruleset(_metadata, ACCESS_RW, rules); - for (i = 0; i < 16; i++) + for (i = 0; i < LANDLOCK_MAX_NUM_LAYERS; i++) enforce_ruleset(_metadata, ruleset_fd); for (i = 0; i < 2; i++) { @@ -3970,7 +3970,7 @@ static int ioctl_error(struct __test_metadata *const _metadata, int fd, unsigned int cmd) { char buf[128]; /* sufficiently large */ - int res, stdinbak_fd; + int res, stdinbak_fd, err; /* * Depending on the IOCTL command, parts of the zeroed-out buffer might @@ -3985,13 +3985,14 @@ static int ioctl_error(struct __test_metadata *const _metadata, int fd, /* Invokes the IOCTL with a zeroed-out buffer. */ bzero(&buf, sizeof(buf)); res = ioctl(fd, cmd, &buf); + err = errno; /* Restores the old FD 0 and closes the backup FD. */ ASSERT_EQ(0, dup2(stdinbak_fd, 0)); ASSERT_EQ(0, close(stdinbak_fd)); if (res < 0) - return errno; + return err; return 0; } @@ -4789,6 +4790,7 @@ FIXTURE(layout1_bind) {}; static const char bind_dir_s1d3[] = TMP_DIR "/s2d1/s2d2/s1d3"; static const char bind_file1_s1d3[] = TMP_DIR "/s2d1/s2d2/s1d3/f1"; +static const char bind_file2_s1d3[] = TMP_DIR "/s2d1/s2d2/s1d3/f2"; /* Move targets for disconnected path tests. 
*/ static const char dir_s4d1[] = TMP_DIR "/s4d1"; @@ -7764,4 +7766,2427 @@ TEST_F(audit_layout1, mount) EXPECT_EQ(1, records.domain); } +static bool debug_quiet_tests; + +FIXTURE(audit_quiet_layout1) +{ + struct audit_filter audit_filter; + int audit_fd; +}; + +FIXTURE_SETUP(audit_quiet_layout1) +{ + prepare_layout(_metadata); + create_layout1(_metadata); + + set_cap(_metadata, CAP_AUDIT_CONTROL); + self->audit_fd = audit_init_with_exe_filter(&self->audit_filter); + EXPECT_LE(0, self->audit_fd); + clear_cap(_metadata, CAP_AUDIT_CONTROL); + + if (getenv("DEBUG_QUIET_TESTS")) + debug_quiet_tests = true; +} + +FIXTURE_TEARDOWN_PARENT(audit_quiet_layout1) +{ + remove_layout1(_metadata); + cleanup_layout(_metadata); + + set_cap(_metadata, CAP_AUDIT_CONTROL); + EXPECT_EQ(0, audit_cleanup(-1, NULL)); + clear_cap(_metadata, CAP_AUDIT_CONTROL); +} + +struct a_rule { + const char *path; + __u64 access; + bool quiet; +}; + +struct a_layer { + __u64 handled_access_fs; + __u64 quiet_access_fs; + struct a_rule rules[6]; + __u64 restrict_flags; +}; + +struct a_target { + /* File/dir to try open. */ + const char *target; + /* Open mode (one of O_RDONLY, O_WRONLY, or O_RDWR). */ + int open_mode; + /* Should open succeed? */ + bool expect_open_success; + /* If open fails, whether to expect an audit log for read. */ + bool audit_read_blocked; + /* If open fails, whether to expect an audit log for write. */ + bool audit_write_blocked; + /* If ftruncate() is expected to be allowed. */ + bool expect_truncate_success; + /* If ftruncate fails, whether to expect an audit log. */ + bool audit_truncate; + /* + * If ioctl() is expected to be allowed (ioctl not attempted if neither + * this nor expect_ioctl_denied is set). + */ + bool expect_ioctl_allowed; + /* If ioctl() is expected to be denied. */ + bool expect_ioctl_denied; + /* If ioctl fails, whether to expect an audit log. 
*/ + bool audit_ioctl; +}; + +#define AUDIT_QUIET_MAX_TARGETS 10 + +FIXTURE_VARIANT(audit_quiet_layout1) +{ + struct a_layer layers[3]; + struct a_target targets[AUDIT_QUIET_MAX_TARGETS]; +}; + +#define FS_R LANDLOCK_ACCESS_FS_READ_FILE +#define FS_W LANDLOCK_ACCESS_FS_WRITE_FILE +#define FS_TRUNC LANDLOCK_ACCESS_FS_TRUNCATE +#define FS_IOCTL LANDLOCK_ACCESS_FS_IOCTL_DEV + +static int sprint_access_bits(char *buf, size_t buflen, __u64 access) +{ + size_t offset = 0; + + if (buflen < strlen("rwti make_reg remove_file refer") + 1) + abort(); + + buf[0] = '\0'; + if (access & FS_R) + offset += snprintf(buf + offset, buflen - offset, "r"); + if (access & FS_W) + offset += snprintf(buf + offset, buflen - offset, "w"); + if (access & FS_TRUNC) + offset += snprintf(buf + offset, buflen - offset, "t"); + if (access & FS_IOCTL) + offset += snprintf(buf + offset, buflen - offset, "i"); + if (access & LANDLOCK_ACCESS_FS_MAKE_REG) + offset += snprintf(buf + offset, buflen - offset, ",make_reg"); + if (access & LANDLOCK_ACCESS_FS_REMOVE_FILE) + offset += + snprintf(buf + offset, buflen - offset, ",remove_file"); + if (access & LANDLOCK_ACCESS_FS_REFER) + offset += snprintf(buf + offset, buflen - offset, ",refer"); + + if (buf[0] == ',') { + offset--; + memmove(buf, buf + 1, offset); + buf[offset] = '\0'; + } + + return offset; +} + +static int apply_a_layer(struct __test_metadata *const _metadata, + const struct a_layer *l) +{ + struct landlock_ruleset_attr rs_attr = { + .handled_access_fs = l->handled_access_fs, + .quiet_access_fs = l->quiet_access_fs, + }; + int rs_fd; + int i; + const struct a_rule *r; + char handled_access_s[33], quiet_access_s[33], rule_access_s[33]; + + if (!l->handled_access_fs) + return 0; + + rs_fd = landlock_create_ruleset(&rs_attr, sizeof(rs_attr), 0); + ASSERT_LE(0, rs_fd); + + for (i = 0; i < ARRAY_SIZE(l->rules); i++) { + r = &l->rules[i]; + if (!r->path) + continue; + + add_path_beneath(_metadata, rs_fd, r->access, r->path, + r->quiet ? 
LANDLOCK_ADD_RULE_QUIET : 0); + } + + ASSERT_EQ(0, prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0)); + ASSERT_EQ(0, landlock_restrict_self(rs_fd, l->restrict_flags)) + { + TH_LOG("Failed to enforce ruleset: %s", strerror(errno)); + } + ASSERT_EQ(0, close(rs_fd)); + + if (debug_quiet_tests) { + sprint_access_bits(handled_access_s, sizeof(handled_access_s), + l->handled_access_fs); + sprint_access_bits(quiet_access_s, sizeof(quiet_access_s), + l->quiet_access_fs); + TH_LOG("applied layer: handled=%s quiet=%s restrict_flags=0x%llx", + handled_access_s, quiet_access_s, + (unsigned long long)l->restrict_flags); + for (i = 0; i < ARRAY_SIZE(l->rules); i++) { + r = &l->rules[i]; + if (!r->path) + continue; + + sprint_access_bits(rule_access_s, sizeof(rule_access_s), + r->access); + TH_LOG(" rule[%d]: path=%s access=%s quiet=%d", i, + r->path, rule_access_s, r->quiet); + } + } + return 0; +} + +void audit_quiet_layout1_test_body(struct __test_metadata *const _metadata, + FIXTURE_DATA(audit_quiet_layout1) * self, + const struct a_target *targets) +{ + struct audit_records records = {}; + int i; + const struct a_target *target; + int fd = -1; + int open_mode; + int ret; + bool expect_audit; + const char *blocker; + + for (i = 0; i < AUDIT_QUIET_MAX_TARGETS; i++) { + target = &targets[i]; + if (!target->target) + continue; + + open_mode = target->open_mode & (O_RDONLY | O_WRONLY | O_RDWR); + + EXPECT_TRUE(open_mode == O_RDONLY || open_mode == O_WRONLY || + open_mode == O_RDWR); + + if (target->expect_open_success) { + EXPECT_FALSE(target->audit_read_blocked); + EXPECT_FALSE(target->audit_write_blocked); + } + if (target->expect_truncate_success) + EXPECT_TRUE(target->expect_open_success && + !target->audit_truncate); + + if (debug_quiet_tests) + TH_LOG("Try open \"%s\" with %s%s", target->target, + open_mode != O_WRONLY ? "r" : "", + open_mode != O_RDONLY ? 
"w" : ""); + + fd = openat(AT_FDCWD, target->target, open_mode | O_CLOEXEC); + if (target->expect_open_success) { + ASSERT_LE(0, fd) + { + TH_LOG("Failed to open \"%s\": %s", + target->target, strerror(errno)); + }; + } else { + ASSERT_EQ(-1, fd); + ASSERT_EQ(EACCES, errno); + } + + expect_audit = true; + + if (target->audit_read_blocked && target->audit_write_blocked) + blocker = "fs\\.write_file,fs\\.read_file"; + else if (target->audit_read_blocked) + blocker = "fs\\.read_file"; + else if (target->audit_write_blocked) + blocker = "fs\\.write_file"; + else + expect_audit = false; + + if (expect_audit) + ASSERT_EQ(0, matches_log_fs(_metadata, self->audit_fd, + blocker, target->target)); + + /* Check that we see no (other) logs. */ + EXPECT_EQ(0, audit_count_records(self->audit_fd, &records)); + ASSERT_EQ(0, records.access); + + if (target->expect_open_success && fd >= 0) { + if (debug_quiet_tests) + TH_LOG("Try ftruncate \"%s\"", target->target); + + ret = ftruncate(fd, 0); + if (target->expect_truncate_success) { + ASSERT_EQ(0, ret); + } else { + ASSERT_EQ(-1, ret); + if (open_mode != O_RDONLY) + ASSERT_EQ(EACCES, errno); + } + + if (target->audit_truncate) + ASSERT_EQ(0, matches_log_fs(_metadata, + self->audit_fd, + "fs\\.truncate", + target->target)); + + if (target->expect_ioctl_allowed || + target->expect_ioctl_denied) { + if (debug_quiet_tests) + TH_LOG("Try ioctl FIONREAD on \"%s\"", + target->target); + + ret = ioctl_error(_metadata, fd, FIONREAD); + if (target->expect_ioctl_allowed) { + ASSERT_NE(EACCES, ret); + } else { + ASSERT_EQ(EACCES, ret); + } + } + + if (target->audit_ioctl) + ASSERT_EQ(0, matches_log_fs_extra( + _metadata, self->audit_fd, + "fs\\.ioctl_dev", + target->target, + " ioctlcmd=0x541b\\+")); + + /* Check that we see no other logs. 
*/ + EXPECT_EQ(0, audit_count_records(self->audit_fd, + &records)); + ASSERT_EQ(0, records.access); + ASSERT_EQ(0, close(fd)); + } + } +} + +TEST_F(audit_quiet_layout1, base) +{ + int i; + + for (i = 0; i < ARRAY_SIZE(variant->layers); i++) + ASSERT_EQ(0, apply_a_layer(_metadata, &variant->layers[i])); + + audit_quiet_layout1_test_body(_metadata, self, variant->targets); +} + +FIXTURE_VARIANT_ADD(audit_quiet_layout1, quiet_simple) { + .layers = { + { + .handled_access_fs = FS_R | FS_W | FS_TRUNC, + .quiet_access_fs = FS_R, + .rules = { + { .path = dir_s1d1, .access = 0, .quiet = true }, + }, + }, + }, + .targets = { + { + .target = file1_s1d1, + .open_mode = O_RDONLY, + }, + /* Not covered by quiet */ + { + .target = file1_s2d1, + .open_mode = O_RDONLY, + .audit_read_blocked = true, + }, + /* Access not quieted */ + { + .target = file1_s1d1, + .open_mode = O_WRONLY, + .audit_write_blocked = true, + }, + /* + * Quiet flag only takes effect if all blocked access bits are + * quieted, otherwise audit log emitted as normal (with all + * blockers) + */ + { + .target = file1_s1d1, + .open_mode = O_RDWR, + .audit_read_blocked = true, + .audit_write_blocked = true, + }, + }, +}; + +FIXTURE_VARIANT_ADD(audit_quiet_layout1, quiet_allow_read) { + .layers = { + { + .handled_access_fs = FS_R | FS_W | FS_TRUNC, + .quiet_access_fs = FS_W, + .rules = { + { .path = dir_s1d1, .access = FS_R, .quiet = true }, + /* Quiet flags inherit down and are not overridden */ + { .path = file1_s1d1, .access = FS_R, .quiet = false }, + { .path = file1_s2d3, .access = 0, .quiet = true }, + }, + }, + }, + .targets = { + /* Read ok */ + { + .target = file1_s1d1, + .open_mode = O_RDONLY, + .expect_open_success = true, + }, + /* Write quieted */ + { + .target = file1_s1d1, + .open_mode = O_WRONLY, + }, + /* Read allowed, write quieted so no audit */ + { + .target = file1_s1d1, + .open_mode = O_RDWR, + }, + /* Not covered by quiet */ + { + .target = file1_s2d2, + .open_mode = O_WRONLY, + 
.audit_write_blocked = true, + }, + { + .target = file1_s2d2, + .open_mode = O_RDWR, + .audit_read_blocked = true, + .audit_write_blocked = true, + }, + /* Single file quiet */ + { + .target = file1_s2d3, + .open_mode = O_WRONLY, + }, + /* Wrong file */ + { + .target = file2_s2d3, + .open_mode = O_WRONLY, + .audit_write_blocked = true, + }, + /* Access not quieted */ + { + .target = file1_s2d3, + .open_mode = O_RDONLY, + .audit_read_blocked = true, + }, + /* Some access not quieted */ + { + .target = file1_s2d3, + .open_mode = O_RDWR, + .audit_read_blocked = true, + .audit_write_blocked = true, + }, + }, +}; + +FIXTURE_VARIANT_ADD(audit_quiet_layout1, quiet_allow_write) { + .layers = { + { + .handled_access_fs = FS_R | FS_W | FS_TRUNC, + .quiet_access_fs = FS_R, + .rules = { + { .path = dir_s1d1, .access = FS_W, .quiet = true }, + }, + }, + }, + .targets = { + /* Read quieted */ + { + .target = file1_s1d1, + .open_mode = O_RDONLY, + }, + /* Truncate not quieted */ + { + .target = file1_s1d1, + .open_mode = O_WRONLY, + .expect_open_success = true, + .audit_truncate = true, + }, + /* Not covered by quiet */ + { + .target = file1_s2d1, + .open_mode = O_RDONLY, + .audit_read_blocked = true, + }, + /* Write allowed, read quieted so no audit */ + { + .target = file1_s1d1, + .open_mode = O_RDWR, + }, + }, +}; + +FIXTURE_VARIANT_ADD(audit_quiet_layout1, allow_write_quiet_trunc) { + .layers = { + { + .handled_access_fs = FS_R | FS_W | FS_TRUNC, + .quiet_access_fs = FS_TRUNC, + .rules = { + { .path = dir_s1d1, .access = FS_W, .quiet = true }, + { .path = dir_s2d1, .access = FS_W, .quiet = false }, + }, + }, + }, + .targets = { + /* Read not allowed and not quieted */ + { + .target = file1_s1d1, + .open_mode = O_RDONLY, + .audit_read_blocked = true, + }, + /* Truncate quieted */ + { + .target = file1_s1d1, + .open_mode = O_WRONLY, + .expect_open_success = true, + }, + /* Not covered by quiet (truncate) */ + { + .target = file1_s2d1, + .open_mode = O_WRONLY, + 
.expect_open_success = true, + .audit_truncate = true, + }, + /* Not covered by quiet (read/write) */ + { + .target = file1_s3d1, + .open_mode = O_RDWR, + .audit_read_blocked = true, + .audit_write_blocked = true, + }, + }, +}; + +FIXTURE_VARIANT_ADD(audit_quiet_layout1, allow_rw_quiet_trunc) { + .layers = { + { + .handled_access_fs = FS_R | FS_W | FS_TRUNC, + .quiet_access_fs = FS_TRUNC, + .rules = { + { .path = dir_s1d1, .access = FS_R | FS_W, .quiet = true }, + { .path = dir_s2d1, .access = FS_R | FS_W, .quiet = false }, + }, + }, + }, + .targets = { + { + .target = file1_s1d1, + .open_mode = O_RDWR, + .expect_open_success = true, + }, + { + .target = file1_s2d1, + .open_mode = O_RDWR, + .expect_open_success = true, + .audit_truncate = true, + }, + }, +}; + +FIXTURE_VARIANT_ADD(audit_quiet_layout1, quiet_all) { + .layers = { + { + .handled_access_fs = FS_R | FS_W | FS_TRUNC | FS_IOCTL, + .quiet_access_fs = FS_R | FS_W | FS_TRUNC | FS_IOCTL, + .rules = { + { .path = dir_s1d1, .access = 0, .quiet = true }, + { .path = file1_s2d1, .access = FS_R | FS_W, .quiet = true }, + { .path = file1_s2d3, .access = 0, .quiet = true }, + { .path = dir_s3d1, .access = FS_W, .quiet = false }, + { .path = "/dev/zero", .access = FS_R, .quiet = false }, + { .path = "/dev/null", .access = FS_R, .quiet = true }, + }, + }, + }, + .targets = { + /* No logs */ + { + .target = file1_s1d1, + .open_mode = O_RDONLY, + }, + { + .target = file1_s1d1, + .open_mode = O_WRONLY, + }, + { + .target = file1_s1d1, + .open_mode = O_RDWR, + }, + /* Truncate quieted - no log */ + { + .target = file1_s2d1, + .open_mode = O_RDWR, + .expect_open_success = true, + }, + /* Truncate not covered by quiet */ + { + .target = file1_s3d1, + .open_mode = O_WRONLY, + .expect_open_success = true, + .audit_truncate = true, + }, + /* Not covered by quiet */ + { + .target = file1_s3d1, + .open_mode = O_RDONLY, + .audit_read_blocked = true, + }, + /* Single file quiet */ + { + .target = file1_s2d3, + .open_mode = O_RDWR, 
+ }, + /* Wrong file */ + { + .target = file2_s2d3, + .open_mode = O_RDWR, + .audit_read_blocked = true, + .audit_write_blocked = true, + }, + /* Ioctl quieted */ + { + .target = "/dev/null", + .open_mode = O_RDONLY, + .expect_open_success = true, + .expect_ioctl_denied = true, + }, + /* Ioctl not quieted */ + { + .target = "/dev/zero", + .open_mode = O_RDONLY, + .expect_open_success = true, + .expect_ioctl_denied = true, + .audit_ioctl = true, + }, + }, +}; + +FIXTURE_VARIANT_ADD(audit_quiet_layout1, quiet_across_mountpoint) { + .layers = { + { + .handled_access_fs = FS_R | FS_W | FS_TRUNC, + .quiet_access_fs = FS_R, + .rules = { + { .path = dir_s3d1, .access = 0, .quiet = true }, + }, + }, + }, + .targets = { + { + .target = file1_s3d3, + .open_mode = O_RDONLY, + }, + /* Not covered by quiet */ + { + .target = file1_s1d1, + .open_mode = O_RDONLY, + .audit_read_blocked = true, + }, + { + .target = file1_s1d1, + .open_mode = O_RDWR, + .audit_read_blocked = true, + .audit_write_blocked = true, + }, + /* Access not quieted */ + { + .target = file1_s3d3, + .open_mode = O_WRONLY, + .audit_write_blocked = true, + }, + }, +}; + +FIXTURE_VARIANT_ADD(audit_quiet_layout1, allow_all_quiet) { + .layers = { + { + .handled_access_fs = FS_R | FS_W | FS_TRUNC | FS_IOCTL, + .quiet_access_fs = FS_R | FS_W | FS_TRUNC | FS_IOCTL, + .rules = { + { + .path = dir_s1d1, + .access = FS_R | FS_W | FS_TRUNC, + .quiet = true + }, + { + .path = "/dev/null", + .access = FS_R | FS_W | FS_IOCTL, + .quiet = true + }, + }, + }, + }, + .targets = { + { + .target = file1_s1d1, + .open_mode = O_RDWR, + .expect_open_success = true, + .expect_truncate_success = true, + }, + { + .target = "/dev/null", + .open_mode = O_RDONLY, + .expect_open_success = true, + .expect_ioctl_allowed = true, + }, + }, +}; + +/* + * With LANDLOCK_RESTRICT_SELF_LOG_SUBDOMAINS_OFF, it doesn't matter what the + * quiet flags below the layer say. 
+ */ +FIXTURE_VARIANT_ADD(audit_quiet_layout1, subdomains_off) { + .layers = { + { + .handled_access_fs = FS_R, + .restrict_flags = LANDLOCK_RESTRICT_SELF_LOG_SUBDOMAINS_OFF, + .rules = { + { .path = "/", .access = FS_R, .quiet = false }, + } + }, + { + .handled_access_fs = FS_R | FS_W | FS_TRUNC | FS_IOCTL, + .quiet_access_fs = FS_R, + .rules = { + { .path = dir_s1d1, .access = 0, .quiet = true }, + { .path = file1_s2d2, .access = FS_R | FS_W, .quiet = true }, + { .path = file1_s2d3, .access = FS_R | FS_W, .quiet = false }, + { .path = "/dev/null", .access = FS_R | FS_W, .quiet = true }, + { .path = "/dev/zero", .access = FS_R | FS_W, .quiet = false }, + }, + }, + }, + .targets = { + { + .target = file1_s1d1, + .open_mode = O_RDWR, + }, + { + .target = file1_s2d1, + .open_mode = O_RDWR, + }, + { + .target = file1_s2d2, + .open_mode = O_RDWR, + .expect_open_success = true, + /* No audit_truncate */ + }, + { + .target = file1_s2d3, + .open_mode = O_RDWR, + .expect_open_success = true, + /* No audit_truncate */ + }, + { + .target = "/dev/null", + .open_mode = O_RDONLY, + .expect_open_success = true, + .expect_ioctl_denied = true, + /* No audit_ioctl */ + }, + { + .target = "/dev/zero", + .open_mode = O_RDONLY, + .expect_open_success = true, + .expect_ioctl_denied = true, + /* No audit_ioctl */ + }, + }, +}; + +/* + * With LANDLOCK_RESTRICT_SELF_LOG_SAME_EXEC_OFF, it doesn't matter what the + * quiet flags on the layer say. 
+ */ +FIXTURE_VARIANT_ADD(audit_quiet_layout1, same_exec_off) { + .layers = { + { + .handled_access_fs = FS_R | FS_W | FS_TRUNC | FS_IOCTL, + .quiet_access_fs = FS_R, + .restrict_flags = LANDLOCK_RESTRICT_SELF_LOG_SAME_EXEC_OFF, + .rules = { + { .path = dir_s1d1, .access = 0, .quiet = true }, + { .path = file1_s2d2, .access = FS_R | FS_W, .quiet = true }, + { .path = file1_s2d3, .access = FS_R | FS_W, .quiet = false }, + { .path = "/dev/null", .access = FS_R | FS_W, .quiet = true }, + { .path = "/dev/zero", .access = FS_R | FS_W, .quiet = false }, + }, + }, + }, + .targets = { + { + .target = file1_s1d1, + .open_mode = O_RDWR, + }, + { + .target = file1_s2d1, + .open_mode = O_RDWR, + }, + { + .target = file1_s2d2, + .open_mode = O_RDWR, + .expect_open_success = true, + /* No audit_truncate */ + }, + { + .target = file1_s2d3, + .open_mode = O_RDWR, + .expect_open_success = true, + /* No audit_truncate */ + }, + { + .target = "/dev/null", + .open_mode = O_RDONLY, + .expect_open_success = true, + .expect_ioctl_denied = true, + /* No audit_ioctl */ + }, + { + .target = "/dev/zero", + .open_mode = O_RDONLY, + .expect_open_success = true, + .expect_ioctl_denied = true, + /* No audit_ioctl */ + }, + }, +}; + +FIXTURE_VARIANT_ADD(audit_quiet_layout1, quiet_two_layers_1) { + /* Here, rules that deny access are always quiet. 
*/ + .layers = { + { + .handled_access_fs = FS_R | FS_W | FS_TRUNC | FS_IOCTL, + .quiet_access_fs = FS_R | FS_W | FS_TRUNC | FS_IOCTL, + .rules = { + { + .path = dir_s1d1, + .access = FS_W, + .quiet = true, + }, + { + .path = dir_s2d1, + .access = FS_R | FS_W | FS_TRUNC, + .quiet = false, + }, + { + .path = "/dev/null", + .access = FS_R, + .quiet = true, + }, + { + .path = "/dev/zero", + .access = FS_R | FS_W | FS_IOCTL, + .quiet = false, + }, + }, + }, + { + .handled_access_fs = FS_R | FS_W | FS_TRUNC | FS_IOCTL, + .quiet_access_fs = FS_R | FS_W | FS_TRUNC | FS_IOCTL, + .rules = { + { + .path = dir_s1d1, + .access = FS_R | FS_W | FS_TRUNC, + .quiet = false, + }, + { + .path = dir_s2d1, + .access = FS_W, + .quiet = true, + }, + { + .path = "/dev/null", + .access = FS_R | FS_W | FS_IOCTL, + .quiet = false, + }, + { + .path = "/dev/zero", + .access = FS_R, + .quiet = true, + }, + }, + }, + }, + .targets = { + { + .target = file1_s1d1, + .open_mode = O_RDONLY, + }, + { + .target = file1_s1d1, + .open_mode = O_WRONLY, + .expect_open_success = true, + }, + { + .target = file1_s2d1, + .open_mode = O_RDONLY, + }, + { + .target = file1_s2d1, + .open_mode = O_WRONLY, + .expect_open_success = true, + }, + { + .target = "/dev/null", + .open_mode = O_RDONLY, + .expect_open_success = true, + .expect_ioctl_denied = true, + }, + { + .target = "/dev/zero", + .open_mode = O_RDONLY, + .expect_open_success = true, + .expect_ioctl_denied = true, + }, + }, +}; + +FIXTURE_VARIANT_ADD(audit_quiet_layout1, quiet_two_layers_2) { + /* Here, rules that deny access are never quiet. 
*/ + .layers = { + { + .handled_access_fs = FS_R | FS_W | FS_TRUNC | FS_IOCTL, + .quiet_access_fs = FS_R | FS_W | FS_TRUNC | FS_IOCTL, + .rules = { + { + .path = dir_s1d1, + .access = FS_W, + .quiet = false + }, + { + .path = dir_s2d1, + .access = FS_R | FS_W | FS_TRUNC, + .quiet = true + }, + { + .path = "/dev/null", + .access = FS_R, + .quiet = false + }, + { + .path = "/dev/zero", + .access = FS_R | FS_W | FS_IOCTL, + .quiet = true + }, + }, + }, + { + .handled_access_fs = FS_R | FS_W | FS_TRUNC | FS_IOCTL, + .quiet_access_fs = FS_R | FS_W | FS_TRUNC | FS_IOCTL, + .rules = { + { + .path = dir_s1d1, + .access = FS_R | FS_W | FS_TRUNC, + .quiet = true + }, + { + .path = dir_s2d1, + .access = FS_W, + .quiet = false + }, + { + .path = "/dev/null", + .access = FS_R | FS_W | FS_IOCTL, + .quiet = true + }, + { + .path = "/dev/zero", + .access = FS_R, + .quiet = false + }, + }, + }, + }, + .targets = { + { + .target = file1_s1d1, + .open_mode = O_RDONLY, + .audit_read_blocked = true, + }, + { + .target = file1_s1d1, + .open_mode = O_WRONLY, + .expect_open_success = true, + .audit_truncate = true, + }, + { + .target = file1_s2d1, + .open_mode = O_RDONLY, + .audit_read_blocked = true, + }, + { + .target = file1_s2d1, + .open_mode = O_WRONLY, + .expect_open_success = true, + .audit_truncate = true, + }, + { + .target = "/dev/null", + .open_mode = O_RDONLY, + .expect_open_success = true, + .expect_ioctl_denied = true, + .audit_ioctl = true, + }, + { + .target = "/dev/zero", + .open_mode = O_RDONLY, + .expect_open_success = true, + .expect_ioctl_denied = true, + .audit_ioctl = true, + }, + }, +}; + +FIXTURE_VARIANT_ADD(audit_quiet_layout1, quiet_two_layers_3) { + /* This time only the second layer quiets things. 
*/ + .layers = { + { + .handled_access_fs = FS_R | FS_W | FS_TRUNC | FS_IOCTL, + .quiet_access_fs = FS_R | FS_W | FS_TRUNC | FS_IOCTL, + .rules = { + { + .path = dir_s1d1, + .access = FS_W, + .quiet = false, + }, + { + .path = dir_s2d1, + .access = FS_R | FS_W | FS_TRUNC, + .quiet = false, + }, + { + .path = "/dev/null", + .access = FS_R, + .quiet = false, + }, + { + .path = "/dev/zero", + .access = FS_R | FS_W | FS_IOCTL, + .quiet = false, + }, + }, + }, + { + .handled_access_fs = FS_R | FS_W | FS_TRUNC | FS_IOCTL, + .quiet_access_fs = FS_R | FS_W | FS_TRUNC | FS_IOCTL, + .rules = { + { + .path = dir_s1d1, + .access = FS_R | FS_W | FS_TRUNC, + .quiet = false, + }, + { + .path = dir_s2d1, + .access = FS_W, + .quiet = true, + }, + { + .path = "/dev/null", + .access = FS_R | FS_W | FS_IOCTL, + .quiet = false, + }, + { + .path = "/dev/zero", + .access = FS_R, + .quiet = true, + }, + }, + }, + }, + .targets = { + { + .target = file1_s1d1, + .open_mode = O_RDONLY, + .audit_read_blocked = true, + }, + { + .target = file1_s1d1, + .open_mode = O_WRONLY, + .expect_open_success = true, + .audit_truncate = true, + }, + { + .target = file1_s2d1, + .open_mode = O_RDONLY, + }, + { + .target = file1_s2d1, + .open_mode = O_WRONLY, + .expect_open_success = true, + }, + { + .target = "/dev/null", + .open_mode = O_RDONLY, + .expect_open_success = true, + .expect_ioctl_denied = true, + .audit_ioctl = true, + }, + { + .target = "/dev/zero", + .open_mode = O_RDONLY, + .expect_open_success = true, + .expect_ioctl_denied = true, + }, + }, +}; + +FIXTURE_VARIANT_ADD(audit_quiet_layout1, quiet_two_layers_different_quiet_access) { + /* Here, rules that deny access are always quiet. 
*/ + .layers = { + { + .handled_access_fs = FS_R | FS_W | FS_TRUNC | FS_IOCTL, + .quiet_access_fs = FS_R | FS_W | FS_TRUNC | FS_IOCTL, + .rules = { + { + .path = dir_s1d1, + .access = FS_W, + .quiet = true, + }, + { + .path = dir_s2d1, + .access = FS_R | FS_W | FS_TRUNC, + .quiet = false, + }, + { + .path = "/dev/null", + .access = FS_R, + .quiet = true, + }, + { + .path = "/dev/zero", + .access = FS_R | FS_W | FS_IOCTL, + .quiet = false, + }, + }, + }, + { + .handled_access_fs = FS_R | FS_W | FS_TRUNC | FS_IOCTL, + .quiet_access_fs = FS_IOCTL, + .rules = { + { + .path = dir_s1d1, + .access = FS_R | FS_W | FS_TRUNC, + .quiet = false, + }, + { + .path = dir_s2d1, + .access = FS_W, + .quiet = true, + }, + { + .path = "/dev/null", + .access = FS_R | FS_W | FS_IOCTL, + .quiet = false, + }, + { + .path = "/dev/zero", + .access = FS_R, + .quiet = true, + }, + }, + }, + }, + .targets = { + { + .target = file1_s1d1, + .open_mode = O_RDONLY, + }, + { + .target = file1_s1d1, + .open_mode = O_WRONLY, + .expect_open_success = true, + }, + { + .target = file1_s2d1, + .open_mode = O_RDONLY, + .audit_read_blocked = true, + }, + { + .target = file1_s2d1, + .open_mode = O_WRONLY, + .expect_open_success = true, + .audit_truncate = true, + }, + { + .target = "/dev/null", + .open_mode = O_RDONLY, + .expect_open_success = true, + .expect_ioctl_denied = true, + }, + { + .target = "/dev/zero", + .open_mode = O_RDONLY, + .expect_open_success = true, + .expect_ioctl_denied = true, + }, + }, +}; + +FIXTURE_VARIANT_ADD(audit_quiet_layout1, quiet_two_layers_different_handled_1) { + /* Quiet from layer 1 */ + .layers = { + { + .handled_access_fs = FS_R, + .quiet_access_fs = FS_R, + .rules = { + { + .path = file1_s1d1, + .access = FS_R, + .quiet = true, + }, + { + .path = file2_s1d1, + .access = 0, + .quiet = true, + }, + { + .path = file1_s1d2, + .access = 0, + .quiet = true, + }, + { + .path = file2_s1d2, + .access = FS_R, + .quiet = true, + }, + }, + }, + { + .handled_access_fs = FS_W, + 
.quiet_access_fs = FS_W, + .rules = { + { + .path = file1_s1d1, + .access = FS_W, + .quiet = false, + }, + /* Nothing for file2_s1d1 */ + { + .path = file1_s1d2, + .access = FS_W, + .quiet = false, + }, + /* Nothing for file2_s1d2 */ + }, + }, + }, + .targets = { + { + .target = file1_s1d1, + .open_mode = O_RDWR, + .expect_open_success = true, + .expect_truncate_success = true, + }, + /* Missing both, youngest layer denies write, not quiet */ + { + .target = file2_s1d1, + .open_mode = O_RDWR, + .audit_write_blocked = true, + }, + /* Missing read, denied and quieted by layer 1 */ + { + .target = file1_s1d2, + .open_mode = O_RDWR, + }, + /* Missing write, denied and not quieted by layer 2 */ + { + .target = file2_s1d2, + .open_mode = O_RDWR, + .audit_write_blocked = true, + }, + }, +}; + +FIXTURE_VARIANT_ADD(audit_quiet_layout1, quiet_two_layers_different_handled_2) { + /* Quiet from layer 2 */ + .layers = { + { + .handled_access_fs = FS_R, + .quiet_access_fs = FS_R, + .rules = { + { + .path = file1_s1d1, + .access = FS_R, + .quiet = false, + }, + /* Nothing for file2_s1d1 and file1_s1d2 */ + { + .path = file2_s1d2, + .access = FS_R, + .quiet = false, + }, + }, + }, + { + .handled_access_fs = FS_W, + .quiet_access_fs = FS_W, + .rules = { + { + .path = file1_s1d1, + .access = FS_W, + .quiet = true, + }, + { + .path = file2_s1d1, + .access = 0, + .quiet = true, + }, + { + .path = file1_s1d2, + .access = FS_W, + .quiet = true, + }, + { + .path = file2_s1d2, + .access = 0, + .quiet = true, + }, + }, + }, + }, + .targets = { + { + .target = file1_s1d1, + .open_mode = O_RDWR, + .expect_open_success = true, + .expect_truncate_success = true, + }, + /* Missing both, youngest layer denies write, quiet */ + { + .target = file2_s1d1, + .open_mode = O_RDWR, + }, + /* Missing read, denied and not quieted by layer 1 */ + { + .target = file1_s1d2, + .open_mode = O_RDWR, + .audit_read_blocked = true, + }, + /* Missing write, denied and quieted by layer 2 */ + { + .target = 
file2_s1d2, + .open_mode = O_RDWR, + }, + }, +}; + +FIXTURE_VARIANT_ADD(audit_quiet_layout1, quiet_two_layers_different_handled_3) { + /* Quiet from both layers */ + .layers = { + { + .handled_access_fs = FS_R, + .quiet_access_fs = FS_R, + .rules = { + { + .path = file1_s1d1, + .access = FS_R, + .quiet = true, + }, + { + .path = file2_s1d1, + .access = 0, + .quiet = true, + }, + { + .path = file1_s1d2, + .access = 0, + .quiet = true, + }, + { + .path = file2_s1d2, + .access = FS_R, + .quiet = true, + }, + }, + }, + { + .handled_access_fs = FS_W, + .quiet_access_fs = FS_W, + .rules = { + { + .path = file1_s1d1, + .access = FS_W, + .quiet = true, + }, + { + .path = file2_s1d1, + .access = 0, + .quiet = true, + }, + { + .path = file1_s1d2, + .access = FS_W, + .quiet = true, + }, + { + .path = file2_s1d2, + .access = 0, + .quiet = true, + }, + }, + }, + }, + .targets = { + { + .target = file1_s1d1, + .open_mode = O_RDWR, + .expect_open_success = true, + .expect_truncate_success = true, + }, + { + .target = file2_s1d1, + .open_mode = O_RDWR, + }, + { + .target = file1_s1d2, + .open_mode = O_RDWR, + }, + { + .target = file2_s1d2, + .open_mode = O_RDWR, + }, + }, +}; + +FIXTURE_VARIANT_ADD(audit_quiet_layout1, without_quiet_then_with_quiet) { + .layers = { + { + .handled_access_fs = FS_R | FS_W, + .quiet_access_fs = FS_R, + .rules = { + { .path = dir_s1d1, .access = FS_W, .quiet = false }, + { .path = dir_s1d1, .access = 0, .quiet = true }, + }, + }, + }, + .targets = { + /* Read denied and quieted */ + { + .target = file1_s1d1, + .open_mode = O_RDONLY, + }, + /* Write ok */ + { + .target = file1_s1d1, + .open_mode = O_WRONLY, + .expect_open_success = true, + .expect_truncate_success = true, + }, + /* Write ok, read denied and quieted */ + { + .target = file1_s1d1, + .open_mode = O_RDWR, + }, + /* Not covered by quiet */ + { + .target = file1_s2d1, + .open_mode = O_RDONLY, + .audit_read_blocked = true, + }, + }, +}; + +/* + * The following TEST_F extend the above test 
cases to test more layers, with + * the inserted layers having varying configurations. + */ + +/* Extra allow all layers, quiet or not, does not change any behaviour. */ +TEST_F(audit_quiet_layout1, allow_all_layer) +{ + struct a_layer allow_all_layer = { + .handled_access_fs = FS_R | FS_W | FS_TRUNC | FS_IOCTL, + .quiet_access_fs = 0, + .rules = { + { + .path = "/", + .access = FS_R | FS_W | FS_TRUNC | FS_IOCTL, + .quiet = false, + }, + }, + }; + int i; + + ASSERT_EQ(0, apply_a_layer(_metadata, &allow_all_layer)); + for (i = 0; i < ARRAY_SIZE(variant->layers); i++) + ASSERT_EQ(0, apply_a_layer(_metadata, &variant->layers[i])); + + audit_quiet_layout1_test_body(_metadata, self, variant->targets); + + ASSERT_EQ(0, apply_a_layer(_metadata, &allow_all_layer)); + + audit_quiet_layout1_test_body(_metadata, self, variant->targets); + + /* + * SELF_LOG flags or quiet bits from inner allowing layers should not + * affect behaviour. + */ + allow_all_layer.quiet_access_fs = FS_R | FS_W | FS_TRUNC | FS_IOCTL; + allow_all_layer.rules[0].quiet = true; + /* + * Note: this only works because we're not checking counts of domain + * alloc/dealloc logs + */ + allow_all_layer.restrict_flags = + LANDLOCK_RESTRICT_SELF_LOG_SAME_EXEC_OFF | + LANDLOCK_RESTRICT_SELF_LOG_SUBDOMAINS_OFF; + ASSERT_EQ(0, apply_a_layer(_metadata, &allow_all_layer)); + + audit_quiet_layout1_test_body(_metadata, self, variant->targets); +} + +/* + * Add useless outer layers until we reach the layer limit. Should not change + * anything. 
+ */ +TEST_F(audit_quiet_layout1, many_outer_layers) +{ + struct a_layer useless_layer = { + .handled_access_fs = FS_R | FS_W | FS_TRUNC, + .quiet_access_fs = FS_R | FS_W | FS_TRUNC, + .rules = { + { .path = "/", .access = FS_R | FS_W | FS_TRUNC, .quiet = true }, + }, + }; + int i; + + for (i = 0; i < ARRAY_SIZE(variant->layers); i++) { + if (variant->layers[i].handled_access_fs == 0) + break; + } + + for (; i < LANDLOCK_MAX_NUM_LAYERS; i++) + ASSERT_EQ(0, apply_a_layer(_metadata, &useless_layer)); + + for (i = 0; i < ARRAY_SIZE(variant->layers); i++) + ASSERT_EQ(0, apply_a_layer(_metadata, &variant->layers[i])); + + audit_quiet_layout1_test_body(_metadata, self, variant->targets); +} + +/* An inner layer that denies and quiets everything should result in no logs. */ +TEST_F(audit_quiet_layout1, deny_all_quiet_layer) +{ + struct a_layer deny_all_layer = { + .handled_access_fs = FS_R | FS_W | FS_TRUNC | FS_IOCTL, + .quiet_access_fs = FS_R | FS_W | FS_TRUNC | FS_IOCTL, + .rules = { + { .path = "/", .access = 0, .quiet = true }, + }, + }; + int i; + FIXTURE_VARIANT(audit_quiet_layout1) variant_2 = {}; + + /* Any open should fail with no logs. */ + for (i = 0; i < ARRAY_SIZE(variant->targets); i++) { + const struct a_target *target = &variant->targets[i]; + + variant_2.targets[i] = (struct a_target){ + .target = target->target, + .open_mode = target->open_mode, + /* We denied everything, open should always fail. */ + .expect_open_success = false, + }; + } + + for (i = 0; i < ARRAY_SIZE(variant->layers); i++) + ASSERT_EQ(0, apply_a_layer(_metadata, &variant->layers[i])); + ASSERT_EQ(0, apply_a_layer(_metadata, &deny_all_layer)); + + audit_quiet_layout1_test_body(_metadata, self, variant_2.targets); +} + +/* + * An inner layer that denies everything without quiet should produce logs for + * all access. 
+ */ +TEST_F(audit_quiet_layout1, deny_all_layer) +{ + struct a_layer deny_all_layer = { + .handled_access_fs = FS_R | FS_W, + .quiet_access_fs = FS_R | FS_W, + }; + int i; + FIXTURE_VARIANT(audit_quiet_layout1) variant_2 = {}; + bool test_has_subdomains_off = false; + + for (i = 0; i < ARRAY_SIZE(variant->layers); i++) { + if (variant->layers[i].restrict_flags & + LANDLOCK_RESTRICT_SELF_LOG_SUBDOMAINS_OFF) { + test_has_subdomains_off = true; + break; + } + } + + for (i = 0; i < ARRAY_SIZE(variant->targets); i++) { + const struct a_target *target = &variant->targets[i]; + + variant_2.targets[i] = (struct a_target){ + .target = target->target, + .open_mode = target->open_mode, + + /* We denied everything, open should always fail. */ + .expect_open_success = false, + /* Audit should always happen as long as open request contains read. */ + .audit_read_blocked = !test_has_subdomains_off && + target->open_mode != O_WRONLY, + /* Audit should always happen as long as open request contains write. 
*/ + .audit_write_blocked = !test_has_subdomains_off && + target->open_mode != O_RDONLY, + }; + } + + for (i = 0; i < ARRAY_SIZE(variant->layers); i++) + ASSERT_EQ(0, apply_a_layer(_metadata, &variant->layers[i])); + ASSERT_EQ(0, apply_a_layer(_metadata, &deny_all_layer)); + + audit_quiet_layout1_test_body(_metadata, self, variant_2.targets); +} + +/* Uses layout1_bind hierarchy */ +FIXTURE(audit_quiet_rename) +{ + struct audit_filter audit_filter; + int audit_fd; +}; + +FIXTURE_SETUP(audit_quiet_rename) +{ + prepare_layout(_metadata); + create_layout1(_metadata); + + set_cap(_metadata, CAP_SYS_ADMIN); + ASSERT_EQ(0, mount(dir_s1d2, dir_s2d2, NULL, MS_BIND, NULL)); + clear_cap(_metadata, CAP_SYS_ADMIN); + + set_cap(_metadata, CAP_AUDIT_CONTROL); + self->audit_fd = audit_init_with_exe_filter(&self->audit_filter); + EXPECT_LE(0, self->audit_fd); + clear_cap(_metadata, CAP_AUDIT_CONTROL); + + if (getenv("DEBUG_QUIET_TESTS")) + debug_quiet_tests = true; +} + +FIXTURE_TEARDOWN_PARENT(audit_quiet_rename) +{ + remove_layout1(_metadata); + cleanup_layout(_metadata); + + /* umount(dir_s2d2)) is handled by namespace lifetime. */ + + remove_path(file1_s4d1); + remove_path(file2_s4d1); + + set_cap(_metadata, CAP_AUDIT_CONTROL); + EXPECT_EQ(0, audit_cleanup(-1, NULL)); + clear_cap(_metadata, CAP_AUDIT_CONTROL); +} + +static void simple_quiet_rename(struct __test_metadata *const _metadata, + FIXTURE_DATA(audit_quiet_rename) *const self, + __u64 handled_access, __u64 quiet_access, + bool source_allow, bool dest_allow, + bool source_quiet, bool dest_quiet, + const char *source_blockers, + const char *dest_blockers) +{ + /* We will move file1_s1d1 to file1_s2d1 */ + struct a_layer layer = { + .handled_access_fs = handled_access, + .quiet_access_fs = quiet_access, + .rules = { + { + .path = dir_s1d1, + .access = source_allow ? handled_access : 0, + .quiet = source_quiet, + }, + { + .path = dir_s2d1, + .access = dest_allow ? 
handled_access : 0, + .quiet = dest_quiet, + }, + }, + }; + struct audit_records records = {}; + int ret, err; + + /* Skip landlock_add_rule for useless rules. */ + if (!source_allow && !source_quiet) + layer.rules[0].path = NULL; + if (!dest_allow && !dest_quiet) + layer.rules[1].path = NULL; + + EXPECT_EQ(0, unlink(file1_s2d1)); + EXPECT_EQ(0, apply_a_layer(_metadata, &layer)); + + if (debug_quiet_tests) + TH_LOG("Try renameat \"%s\" to \"%s\"", file1_s1d1, file1_s2d1); + ret = renameat(AT_FDCWD, file1_s1d1, AT_FDCWD, file1_s2d1); + err = errno; + if (ret != 0 && debug_quiet_tests) { + TH_LOG("renameat error: %s", err == EXDEV ? "EXDEV" : + err == EACCES ? "EACCES" : + strerror(err)); + } + if (source_allow && dest_allow) { + ASSERT_EQ(0, ret); + } else { + ASSERT_EQ(-1, ret); + if (handled_access & (LANDLOCK_ACCESS_FS_MAKE_REG | + LANDLOCK_ACCESS_FS_REMOVE_FILE)) { + ASSERT_EQ(EACCES, err); + } else { + ASSERT_EQ(EXDEV, err); + } + + if (source_blockers) + ASSERT_EQ(0, matches_log_fs(_metadata, self->audit_fd, + source_blockers, dir_s1d1)); + if (dest_blockers) + ASSERT_EQ(0, matches_log_fs(_metadata, self->audit_fd, + dest_blockers, dir_s2d1)); + } + /* + * No other logs. records.domain not checked per reasoning in + * audit_quiet_layout1_test_body. 
+ */ + EXPECT_EQ(0, audit_count_records(self->audit_fd, &records)); + ASSERT_EQ(0, records.access); +} + +TEST_F(audit_quiet_rename, rename_ok) +{ + __u64 access = LANDLOCK_ACCESS_FS_MAKE_REG | + LANDLOCK_ACCESS_FS_REMOVE_FILE | + LANDLOCK_ACCESS_FS_REFER; + + simple_quiet_rename(_metadata, self, access, access, true, true, false, + false, NULL, NULL); +} + +TEST_F(audit_quiet_rename, no_quiet) +{ + __u64 access = LANDLOCK_ACCESS_FS_MAKE_REG | + LANDLOCK_ACCESS_FS_REMOVE_FILE | + LANDLOCK_ACCESS_FS_REFER; + + simple_quiet_rename(_metadata, self, access, access, false, false, + false, false, "fs\\.remove_file,fs\\.refer", + "fs\\.make_reg,fs\\.refer"); +} + +TEST_F(audit_quiet_rename, quiet) +{ + __u64 access = LANDLOCK_ACCESS_FS_MAKE_REG | + LANDLOCK_ACCESS_FS_REMOVE_FILE | + LANDLOCK_ACCESS_FS_REFER; + + simple_quiet_rename(_metadata, self, access, access, false, false, true, + true, NULL, NULL); +} + +TEST_F(audit_quiet_rename, source_no_quiet_dest_quiet) +{ + __u64 access = LANDLOCK_ACCESS_FS_MAKE_REG | + LANDLOCK_ACCESS_FS_REMOVE_FILE | + LANDLOCK_ACCESS_FS_REFER; + + simple_quiet_rename(_metadata, self, access, access, false, false, + false, true, "fs\\.remove_file,fs\\.refer", NULL); +} + +TEST_F(audit_quiet_rename, source_quiet_dest_no_quiet) +{ + __u64 access = LANDLOCK_ACCESS_FS_MAKE_REG | + LANDLOCK_ACCESS_FS_REMOVE_FILE | + LANDLOCK_ACCESS_FS_REFER; + + simple_quiet_rename(_metadata, self, access, access, false, false, true, + false, NULL, "fs\\.make_reg,fs\\.refer"); +} + +TEST_F(audit_quiet_rename, only_quiet_refer) +{ + __u64 access = LANDLOCK_ACCESS_FS_MAKE_REG | + LANDLOCK_ACCESS_FS_REMOVE_FILE | + LANDLOCK_ACCESS_FS_REFER; + + simple_quiet_rename(_metadata, self, access, LANDLOCK_ACCESS_FS_REFER, + false, false, true, true, + "fs\\.remove_file,fs\\.refer", + "fs\\.make_reg,fs\\.refer"); +} + +TEST_F(audit_quiet_rename, source_allow_dest_quiet) +{ + __u64 access = LANDLOCK_ACCESS_FS_MAKE_REG | + LANDLOCK_ACCESS_FS_REMOVE_FILE | + 
LANDLOCK_ACCESS_FS_REFER; + + simple_quiet_rename(_metadata, self, access, access, true, false, false, + true, NULL, NULL); +} + +TEST_F(audit_quiet_rename, source_quiet_dest_allow) +{ + __u64 access = LANDLOCK_ACCESS_FS_MAKE_REG | + LANDLOCK_ACCESS_FS_REMOVE_FILE | + LANDLOCK_ACCESS_FS_REFER; + + simple_quiet_rename(_metadata, self, access, access, false, true, true, + false, NULL, NULL); +} + +TEST_F(audit_quiet_rename, handle_all_deny_quiet_refer) +{ + __u64 access = LANDLOCK_ACCESS_FS_MAKE_REG | + LANDLOCK_ACCESS_FS_REMOVE_FILE | + LANDLOCK_ACCESS_FS_REFER; + struct a_layer layer = { + .handled_access_fs = access, + .quiet_access_fs = LANDLOCK_ACCESS_FS_REFER, + .rules = { + { + .path = dir_s1d1, + .access = LANDLOCK_ACCESS_FS_MAKE_REG | + LANDLOCK_ACCESS_FS_REMOVE_FILE, + .quiet = true, + }, + { + .path = dir_s2d1, + .access = LANDLOCK_ACCESS_FS_MAKE_REG | + LANDLOCK_ACCESS_FS_REMOVE_FILE, + .quiet = true, + }, + }, + }; + struct audit_records records = {}; + + EXPECT_EQ(0, unlink(file1_s2d1)); + ASSERT_EQ(0, apply_a_layer(_metadata, &layer)); + + ASSERT_EQ(-1, renameat(AT_FDCWD, file1_s1d1, AT_FDCWD, file1_s2d1)); + ASSERT_EQ(EXDEV, errno); + + /* No logs */ + EXPECT_EQ(0, audit_count_records(self->audit_fd, &records)); + ASSERT_EQ(0, records.access); +} + +TEST_F(audit_quiet_rename, handle_all_deny_not_quiet_refer) +{ + __u64 access = LANDLOCK_ACCESS_FS_MAKE_REG | + LANDLOCK_ACCESS_FS_REMOVE_FILE | + LANDLOCK_ACCESS_FS_REFER; + struct a_layer layer = { + .handled_access_fs = access, + .quiet_access_fs = 0, + .rules = { + { + .path = dir_s1d1, + .access = LANDLOCK_ACCESS_FS_MAKE_REG | + LANDLOCK_ACCESS_FS_REMOVE_FILE, + .quiet = false, + }, + { + .path = dir_s2d1, + .access = LANDLOCK_ACCESS_FS_MAKE_REG | + LANDLOCK_ACCESS_FS_REMOVE_FILE, + .quiet = false, + }, + }, + }; + struct audit_records records = {}; + + EXPECT_EQ(0, unlink(file1_s2d1)); + ASSERT_EQ(0, apply_a_layer(_metadata, &layer)); + + ASSERT_EQ(-1, renameat(AT_FDCWD, file1_s1d1, AT_FDCWD, 
file1_s2d1)); + ASSERT_EQ(EXDEV, errno); + + ASSERT_EQ(0, matches_log_fs(_metadata, self->audit_fd, "fs\\.refer", + dir_s1d1)); + ASSERT_EQ(0, matches_log_fs(_metadata, self->audit_fd, "fs\\.refer", + dir_s2d1)); + + /* No other logs */ + EXPECT_EQ(0, audit_count_records(self->audit_fd, &records)); + ASSERT_EQ(0, records.access); +} + +TEST_F(audit_quiet_rename, handle_all_deny_refer_quiet_source_not_quiet_dest) +{ + __u64 access = LANDLOCK_ACCESS_FS_MAKE_REG | + LANDLOCK_ACCESS_FS_REMOVE_FILE | + LANDLOCK_ACCESS_FS_REFER; + struct a_layer layer = { + .handled_access_fs = access, + .quiet_access_fs = LANDLOCK_ACCESS_FS_REFER, + .rules = { + { + .path = dir_s1d1, + .access = LANDLOCK_ACCESS_FS_MAKE_REG | + LANDLOCK_ACCESS_FS_REMOVE_FILE, + .quiet = true, + }, + { + .path = dir_s2d1, + .access = LANDLOCK_ACCESS_FS_MAKE_REG | + LANDLOCK_ACCESS_FS_REMOVE_FILE, + .quiet = false, + }, + }, + }; + struct audit_records records = {}; + + EXPECT_EQ(0, unlink(file1_s2d1)); + ASSERT_EQ(0, apply_a_layer(_metadata, &layer)); + + ASSERT_EQ(-1, renameat(AT_FDCWD, file1_s1d1, AT_FDCWD, file1_s2d1)); + ASSERT_EQ(EXDEV, errno); + + ASSERT_EQ(0, matches_log_fs(_metadata, self->audit_fd, "fs\\.refer", + dir_s2d1)); + + /* No other logs */ + EXPECT_EQ(0, audit_count_records(self->audit_fd, &records)); + ASSERT_EQ(0, records.access); +} + +TEST_F(audit_quiet_rename, quiet_same_dir) +{ + __u64 access = LANDLOCK_ACCESS_FS_MAKE_REG | + LANDLOCK_ACCESS_FS_REMOVE_FILE | + LANDLOCK_ACCESS_FS_REFER; + struct a_layer layer = { + .handled_access_fs = access, + .quiet_access_fs = access, + .rules = { + { + .path = dir_s1d1, + .access = 0, + .quiet = true, + }, + }, + }; + struct audit_records records = {}; + + ASSERT_EQ(0, apply_a_layer(_metadata, &layer)); + + ASSERT_EQ(-1, renameat(AT_FDCWD, file1_s1d1, AT_FDCWD, file2_s1d1)); + ASSERT_EQ(EACCES, errno); + + EXPECT_EQ(0, audit_count_records(self->audit_fd, &records)); + ASSERT_EQ(0, records.access); +} + +TEST_F(audit_quiet_rename, 
quiet_flag_on_file_ignored) +{ + __u64 access = LANDLOCK_ACCESS_FS_MAKE_REG | + LANDLOCK_ACCESS_FS_REMOVE_FILE | + LANDLOCK_ACCESS_FS_REFER; + struct a_layer layer = { + .handled_access_fs = access, + .quiet_access_fs = access, + .rules = { + { + .path = file1_s1d1, + .access = 0, + .quiet = true, + }, + { + .path = file1_s2d1, + .access = 0, + .quiet = true, + }, + }, + }; + struct audit_records records = {}; + + ASSERT_EQ(0, apply_a_layer(_metadata, &layer)); + + ASSERT_EQ(-1, renameat(AT_FDCWD, file1_s1d1, AT_FDCWD, file1_s2d1)); + ASSERT_EQ(EACCES, errno); + + ASSERT_EQ(0, matches_log_fs(_metadata, self->audit_fd, + "fs\\.remove_file,fs\\.refer", dir_s1d1)); + /* We didn't unlink destination file */ + ASSERT_EQ(0, matches_log_fs(_metadata, self->audit_fd, + "fs\\.remove_file,fs\\.make_reg,fs\\.refer", + dir_s2d1)); + + /* No other logs */ + EXPECT_EQ(0, audit_count_records(self->audit_fd, &records)); + ASSERT_EQ(0, records.access); +} + +TEST_F(audit_quiet_rename, quiet_flag_on_file_ignored_same_dir) +{ + __u64 access = LANDLOCK_ACCESS_FS_MAKE_REG | + LANDLOCK_ACCESS_FS_REMOVE_FILE | + LANDLOCK_ACCESS_FS_REFER; + struct a_layer layer = { + .handled_access_fs = access, + .quiet_access_fs = access, + .rules = { + { + .path = file1_s1d1, + .access = 0, + .quiet = true, + }, + { + .path = file2_s1d1, + .access = 0, + .quiet = true, + }, + }, + }; + struct audit_records records = {}; + + ASSERT_EQ(0, apply_a_layer(_metadata, &layer)); + + ASSERT_EQ(-1, renameat(AT_FDCWD, file1_s1d1, AT_FDCWD, file2_s1d1)); + ASSERT_EQ(EACCES, errno); + + ASSERT_EQ(0, + matches_log_fs(_metadata, self->audit_fd, + "fs\\.remove_file,fs\\.make_reg", dir_s1d1)); + + /* No other logs */ + EXPECT_EQ(0, audit_count_records(self->audit_fd, &records)); + ASSERT_EQ(0, records.access); +} + +TEST_F(audit_quiet_rename, two_layers_different_quiet1) +{ + __u64 access = LANDLOCK_ACCESS_FS_MAKE_REG | + LANDLOCK_ACCESS_FS_REMOVE_FILE | + LANDLOCK_ACCESS_FS_REFER; + struct a_layer layer1 = { + 
.handled_access_fs = access, + .quiet_access_fs = access, + .rules = { + { + .path = dir_s1d1, + .access = access, + .quiet = false, + }, + { + .path = dir_s2d1, + .access = 0, + .quiet = true, + }, + }, + }; + struct a_layer layer2 = { + .handled_access_fs = access, + .quiet_access_fs = LANDLOCK_ACCESS_FS_REFER, + .rules = { + { + .path = dir_s1d1, + .access = 0, + .quiet = true, + }, + { + .path = dir_s2d1, + .access = access, + .quiet = false, + }, + }, + }; + struct audit_records records = {}; + + EXPECT_EQ(0, unlink(file1_s2d1)); + + ASSERT_EQ(0, apply_a_layer(_metadata, &layer1)); + ASSERT_EQ(0, apply_a_layer(_metadata, &layer2)); + + ASSERT_EQ(-1, renameat(AT_FDCWD, file1_s1d1, AT_FDCWD, file1_s2d1)); + ASSERT_EQ(EACCES, errno); + + /* + * The youngest denial will be layer 2. Refer is quieted but we are + * also missing remove_file on source. + */ + ASSERT_EQ(0, matches_log_fs(_metadata, self->audit_fd, + "fs\\.remove_file,fs\\.refer", dir_s1d1)); + /* No other logs */ + EXPECT_EQ(0, audit_count_records(self->audit_fd, &records)); + ASSERT_EQ(0, records.access); +} + +TEST_F(audit_quiet_rename, two_layers_different_quiet2) +{ + __u64 access = LANDLOCK_ACCESS_FS_MAKE_REG | + LANDLOCK_ACCESS_FS_REMOVE_FILE | + LANDLOCK_ACCESS_FS_REFER; + struct a_layer layer1 = { + .handled_access_fs = access, + .quiet_access_fs = access, + .rules = { + { + .path = dir_s1d1, + .access = access, + .quiet = false, + }, + { + .path = dir_s2d1, + .access = 0, + .quiet = true, + }, + }, + }; + struct a_layer layer2 = { + .handled_access_fs = LANDLOCK_ACCESS_FS_REFER, + .quiet_access_fs = LANDLOCK_ACCESS_FS_REFER, + .rules = { + { + .path = dir_s1d1, + .access = 0, + .quiet = true, + }, + { + .path = dir_s2d1, + .access = LANDLOCK_ACCESS_FS_REFER, + .quiet = false, + }, + }, + }; + struct audit_records records = {}; + + EXPECT_EQ(0, unlink(file1_s2d1)); + + ASSERT_EQ(0, apply_a_layer(_metadata, &layer1)); + ASSERT_EQ(0, apply_a_layer(_metadata, &layer2)); + + ASSERT_EQ(-1, 
renameat(AT_FDCWD, file1_s1d1, AT_FDCWD, file1_s2d1)); + ASSERT_EQ(EACCES, errno); + + /* + * The youngest denial will be layer 2, but refer is quieted (and that + * layer does not handle any other accesses). + */ + EXPECT_EQ(0, audit_count_records(self->audit_fd, &records)); + ASSERT_EQ(0, records.access); +} + +TEST_F(audit_quiet_rename, two_layers_different_quiet3) +{ + __u64 access = LANDLOCK_ACCESS_FS_MAKE_REG | + LANDLOCK_ACCESS_FS_REMOVE_FILE | + LANDLOCK_ACCESS_FS_REFER; + struct a_layer layer1 = { + .handled_access_fs = access, + .quiet_access_fs = access, + .rules = { + { + .path = dir_s1d1, + .access = access, + .quiet = false, + }, + { + .path = dir_s2d1, + .access = 0, + .quiet = true, + }, + }, + }; + struct a_layer layer2 = { + .handled_access_fs = access, + .quiet_access_fs = access, + .rules = { + { + .path = dir_s1d1, + .access = 0, + .quiet = true, + }, + { + .path = dir_s2d1, + .access = access, + .quiet = false, + }, + }, + }; + struct audit_records records = {}; + + EXPECT_EQ(0, unlink(file1_s2d1)); + + ASSERT_EQ(0, apply_a_layer(_metadata, &layer1)); + ASSERT_EQ(0, apply_a_layer(_metadata, &layer2)); + + ASSERT_EQ(-1, renameat(AT_FDCWD, file1_s1d1, AT_FDCWD, file1_s2d1)); + ASSERT_EQ(EACCES, errno); + + /* + * The youngest denial will be layer 2, in which everything is quieted. 
+ */ + EXPECT_EQ(0, audit_count_records(self->audit_fd, &records)); + ASSERT_EQ(0, records.access); +} + +TEST_F(audit_quiet_rename, + first_layer_quiet_deny_all_second_layer_not_quiet_deny_all) +{ + __u64 access = LANDLOCK_ACCESS_FS_MAKE_REG | + LANDLOCK_ACCESS_FS_REMOVE_FILE | + LANDLOCK_ACCESS_FS_REFER; + struct a_layer layer1 = { + .handled_access_fs = access, + .quiet_access_fs = access, + .rules = { + { + .path = dir_s1d1, + .access = 0, + .quiet = true, + }, + { + .path = dir_s2d1, + .access = 0, + .quiet = true, + }, + }, + }; + struct a_layer layer2 = { + .handled_access_fs = access, + .quiet_access_fs = access, + .rules = {}, + }; + struct audit_records records = {}; + + EXPECT_EQ(0, unlink(file1_s2d1)); + + ASSERT_EQ(0, apply_a_layer(_metadata, &layer1)); + ASSERT_EQ(0, apply_a_layer(_metadata, &layer2)); + + ASSERT_EQ(-1, renameat(AT_FDCWD, file1_s1d1, AT_FDCWD, file1_s2d1)); + ASSERT_EQ(EACCES, errno); + + ASSERT_EQ(0, matches_log_fs(_metadata, self->audit_fd, + "fs\\.remove_file,fs\\.refer", dir_s1d1)); + ASSERT_EQ(0, matches_log_fs(_metadata, self->audit_fd, + "fs\\.make_reg,fs\\.refer", dir_s2d1)); + /* No other logs. 
*/ + EXPECT_EQ(0, audit_count_records(self->audit_fd, &records)); + ASSERT_EQ(0, records.access); +} + +TEST_F(audit_quiet_rename, + first_layer_quiet_deny_all_second_layer_dest_not_quiet) +{ + __u64 access = LANDLOCK_ACCESS_FS_MAKE_REG | + LANDLOCK_ACCESS_FS_REMOVE_FILE | + LANDLOCK_ACCESS_FS_REFER; + struct a_layer layer1 = { + .handled_access_fs = access, + .quiet_access_fs = access, + .rules = { + { + .path = dir_s1d1, + .access = 0, + .quiet = true, + }, + { + .path = dir_s2d1, + .access = 0, + .quiet = true, + }, + }, + }; + struct a_layer layer2 = { + .handled_access_fs = access, + .quiet_access_fs = access, + .rules = { + { + .path = dir_s1d1, + .access = 0, + .quiet = true, + }, + }, + }; + struct audit_records records = {}; + + EXPECT_EQ(0, unlink(file1_s2d1)); + + ASSERT_EQ(0, apply_a_layer(_metadata, &layer1)); + ASSERT_EQ(0, apply_a_layer(_metadata, &layer2)); + + ASSERT_EQ(-1, renameat(AT_FDCWD, file1_s1d1, AT_FDCWD, file1_s2d1)); + ASSERT_EQ(EACCES, errno); + + /* Source is quieted but destination is not. */ + ASSERT_EQ(0, matches_log_fs(_metadata, self->audit_fd, + "fs\\.make_reg,fs\\.refer", dir_s2d1)); + /* No other logs. 
*/ + EXPECT_EQ(0, audit_count_records(self->audit_fd, &records)); + ASSERT_EQ(0, records.access); +} + +TEST_F(audit_quiet_rename, rename_xchg) +{ + struct a_layer layer = { + .handled_access_fs = LANDLOCK_ACCESS_FS_MAKE_REG | + LANDLOCK_ACCESS_FS_REMOVE_FILE | + LANDLOCK_ACCESS_FS_REFER, + .quiet_access_fs = LANDLOCK_ACCESS_FS_MAKE_REG, + .rules = { { + .path = dir_s1d1, + .access = LANDLOCK_ACCESS_FS_REMOVE_FILE | + LANDLOCK_ACCESS_FS_REFER, + .quiet = true, + }, + { + .path = dir_s2d1, + .access = LANDLOCK_ACCESS_FS_MAKE_REG | + LANDLOCK_ACCESS_FS_REMOVE_FILE | + LANDLOCK_ACCESS_FS_REFER, + .quiet = false, + } }, + }; + struct audit_records records = {}; + + ASSERT_EQ(0, apply_a_layer(_metadata, &layer)); + + ASSERT_EQ(-1, renameat2(AT_FDCWD, file1_s1d1, AT_FDCWD, file1_s2d1, + RENAME_EXCHANGE)); + ASSERT_EQ(EACCES, errno); + + EXPECT_EQ(0, audit_count_records(self->audit_fd, &records)); + ASSERT_EQ(0, records.access); +} + +TEST_F(audit_quiet_rename, quiet_on_parent_mount) +{ + __u64 access = LANDLOCK_ACCESS_FS_MAKE_REG | + LANDLOCK_ACCESS_FS_REMOVE_FILE | + LANDLOCK_ACCESS_FS_REFER; + struct a_layer layer = { + .handled_access_fs = access, + .quiet_access_fs = access, + .rules = { + { + .path = dir_s2d1, + .access = 0, + .quiet = true, + }, + }, + }; + struct audit_records records = {}; + + EXPECT_EQ(0, unlink(file2_s1d3)); + ASSERT_EQ(0, apply_a_layer(_metadata, &layer)); + + ASSERT_EQ(-1, renameat(AT_FDCWD, bind_file1_s1d3, AT_FDCWD, + bind_file2_s1d3)); + ASSERT_EQ(EACCES, errno); + + EXPECT_EQ(0, audit_count_records(self->audit_fd, &records)); + ASSERT_EQ(0, records.access); +} + +TEST_F(audit_quiet_rename, quiet_behind_mountpoint_ignored) +{ + __u64 access = LANDLOCK_ACCESS_FS_MAKE_REG | + LANDLOCK_ACCESS_FS_REMOVE_FILE | + LANDLOCK_ACCESS_FS_REFER; + struct a_layer layer = { + .handled_access_fs = access, + .quiet_access_fs = access, + .rules = { + { + .path = dir_s1d1, + .access = 0, + .quiet = true, + }, + }, + }; + struct audit_records records = {}; + 
+ EXPECT_EQ(0, unlink(file2_s1d3)); + ASSERT_EQ(0, apply_a_layer(_metadata, &layer)); + + ASSERT_EQ(-1, renameat(AT_FDCWD, bind_file1_s1d3, AT_FDCWD, + bind_file2_s1d3)); + ASSERT_EQ(EACCES, errno); + ASSERT_EQ(0, matches_log_fs(_metadata, self->audit_fd, + "fs\\.remove_file,fs\\.make_reg", + bind_dir_s1d3)); + + EXPECT_EQ(0, audit_count_records(self->audit_fd, &records)); + ASSERT_EQ(0, records.access); +} + +TEST_F(audit_quiet_rename, quiet_on_parent_mount_disconnected) +{ + __u64 access = LANDLOCK_ACCESS_FS_MAKE_REG | + LANDLOCK_ACCESS_FS_REMOVE_FILE | + LANDLOCK_ACCESS_FS_REFER; + struct a_layer layer = { + .handled_access_fs = access, + .quiet_access_fs = access, + .rules = { + { + .path = dir_s2d1, + .access = 0, + .quiet = true, + }, + }, + }; + struct audit_records records = {}; + int bind_s1d3_fd; + + EXPECT_EQ(0, unlink(file2_s1d3)); + + bind_s1d3_fd = open(bind_dir_s1d3, O_PATH | O_DIRECTORY); + ASSERT_GE(bind_s1d3_fd, 0); + + /* Make s1d3 disconnected. */ + create_directory(_metadata, dir_s4d1); + ASSERT_EQ(0, renameat(AT_FDCWD, dir_s1d3, AT_FDCWD, dir_s4d2)); + + ASSERT_EQ(0, apply_a_layer(_metadata, &layer)); + + ASSERT_EQ(-1, + renameat(bind_s1d3_fd, file1_name, bind_s1d3_fd, file2_name)); + ASSERT_EQ(EACCES, errno); + + EXPECT_EQ(0, audit_count_records(self->audit_fd, &records)); + ASSERT_EQ(0, records.access); +} + +TEST_F(audit_quiet_rename, quiet_behind_mountpoint_disconnected) +{ + __u64 access = LANDLOCK_ACCESS_FS_MAKE_REG | + LANDLOCK_ACCESS_FS_REMOVE_FILE | + LANDLOCK_ACCESS_FS_REFER; + struct a_layer layer = { + .handled_access_fs = access, + .quiet_access_fs = access, + .rules = { + { + .path = dir_s4d1, + .access = 0, + .quiet = true, + }, + }, + }; + struct audit_records records = {}; + int bind_s1d3_fd; + + EXPECT_EQ(0, unlink(file2_s1d3)); + + bind_s1d3_fd = open(bind_dir_s1d3, O_PATH | O_DIRECTORY); + ASSERT_GE(bind_s1d3_fd, 0); + + /* Make s1d3 disconnected. 
*/ + create_directory(_metadata, dir_s4d1); + ASSERT_EQ(0, renameat(AT_FDCWD, dir_s1d3, AT_FDCWD, dir_s4d2)); + + ASSERT_EQ(0, apply_a_layer(_metadata, &layer)); + + ASSERT_EQ(-1, + renameat(bind_s1d3_fd, file1_name, bind_s1d3_fd, file2_name)); + ASSERT_EQ(EACCES, errno); + + EXPECT_EQ(0, audit_count_records(self->audit_fd, &records)); + ASSERT_EQ(0, records.access); +} + TEST_HARNESS_MAIN diff --git a/tools/testing/selftests/landlock/net_test.c b/tools/testing/selftests/landlock/net_test.c index 4c528154ea92..2ed1f76b7a8b 100644 --- a/tools/testing/selftests/landlock/net_test.c +++ b/tools/testing/selftests/landlock/net_test.c @@ -35,6 +35,7 @@ enum sandbox_type { NO_SANDBOX, /* This may be used to test rules that allow *and* deny accesses. */ TCP_SANDBOX, + UDP_SANDBOX, }; static int set_service(struct service_fixture *const srv, @@ -93,23 +94,53 @@ static bool prot_is_tcp(const struct protocol_variant *const prot) (prot->protocol == IPPROTO_TCP || prot->protocol == IPPROTO_IP); } +static bool prot_is_udp(const struct protocol_variant *const prot) +{ + return (prot->domain == AF_INET || prot->domain == AF_INET6) && + prot->type == SOCK_DGRAM && + (prot->protocol == IPPROTO_UDP || prot->protocol == IPPROTO_IP); +} + static bool is_restricted(const struct protocol_variant *const prot, const enum sandbox_type sandbox) { if (sandbox == TCP_SANDBOX) return prot_is_tcp(prot); + else if (sandbox == UDP_SANDBOX) + return prot_is_udp(prot); return false; } static int socket_variant(const struct service_fixture *const srv) { + /* Arbitrary value just to not block other tests indefinitely. 
*/ + const struct timeval timeout = { + .tv_sec = 0, + .tv_usec = 100000, + }; + int sockfd; int ret; - ret = socket(srv->protocol.domain, srv->protocol.type | SOCK_CLOEXEC, - srv->protocol.protocol); - if (ret < 0) + sockfd = socket(srv->protocol.domain, srv->protocol.type | SOCK_CLOEXEC, + srv->protocol.protocol); + if (sockfd < 0) return -errno; - return ret; + + ret = setsockopt(sockfd, SOL_SOCKET, SO_RCVTIMEO, &timeout, + sizeof(timeout)); + if (ret != 0) { + ret = -errno; + close(sockfd); + return ret; + } + ret = setsockopt(sockfd, SOL_SOCKET, SO_SNDTIMEO, &timeout, + sizeof(timeout)); + if (ret != 0) { + ret = -errno; + close(sockfd); + return ret; + } + return sockfd; } #ifndef SIN6_LEN_RFC2133 @@ -258,9 +289,163 @@ static int connect_variant(const int sock_fd, return connect_variant_addrlen(sock_fd, srv, get_addrlen(srv, false)); } +static int sendto_variant_addrlen(const int sock_fd, + const struct service_fixture *const srv, + const socklen_t addrlen, void *buf, + size_t len, size_t flags) +{ + const struct sockaddr *dst = NULL; + ssize_t ret; + + /* + * We never want our processes to be killed by SIGPIPE: we check return + * codes and errno, so that we have actual error messages. + */ + flags |= MSG_NOSIGNAL; + + if (srv != NULL) { + switch (srv->protocol.domain) { + case AF_UNSPEC: + case AF_INET: + dst = (const struct sockaddr *)&srv->ipv4_addr; + break; + + case AF_INET6: + dst = (const struct sockaddr *)&srv->ipv6_addr; + break; + + case AF_UNIX: + dst = (const struct sockaddr *)&srv->unix_addr; + break; + + default: + errno = EAFNOSUPPORT; + return -errno; + } + } + + ret = sendto(sock_fd, buf, len, flags, dst, addrlen); + if (ret < 0) + return -errno; + + /* errno is not set in cases of partial writes. 
*/ + if (ret != len) + return -EINTR; + + return 0; +} + +static int sendto_variant(const int sock_fd, + const struct service_fixture *const srv, void *buf, + size_t len, size_t flags) +{ + socklen_t addrlen = 0; + + if (srv != NULL) + addrlen = get_addrlen(srv, false); + + return sendto_variant_addrlen(sock_fd, srv, addrlen, buf, len, flags); +} + +static int test_sendmsg(struct __test_metadata *const _metadata, + const struct protocol_variant *prot, int client_fd, + int server_fd, const struct service_fixture *srv, + bool bind_denied, bool send_denied) +{ + int ret; + socklen_t opt_len; + int sock_type; + int addr_family; + struct sockaddr_storage peer_addr = { 0 }; + bool has_remote_port; + bool needs_autobind; + char read_buf[1] = { 0 }; + + /* + * Prepare the test by inspecting the socket type and whether it has a + * local/remote address set (all of which determine the expected + * outcomes). + */ + opt_len = sizeof(sock_type); + ASSERT_EQ(0, getsockopt(client_fd, SOL_SOCKET, SO_TYPE, &sock_type, + &opt_len)); + opt_len = sizeof(addr_family); + ASSERT_EQ(0, getsockopt(client_fd, SOL_SOCKET, SO_DOMAIN, &addr_family, + &opt_len)); + opt_len = sizeof(peer_addr); + has_remote_port = (getpeername(client_fd, (struct sockaddr *)&peer_addr, + &opt_len) == 0); + needs_autobind = (addr_family == AF_INET || addr_family == AF_INET6) && + get_binded_port(client_fd, prot) == 0; + + /* First, check error code with truncated explicit address. */ + if (srv != NULL) { + ret = sendto_variant_addrlen( + client_fd, srv, get_addrlen(srv, true) - 1, "A", 1, 0); + if (sock_type == SOCK_STREAM && !has_remote_port) { + EXPECT_EQ(-EPIPE, ret) + { + return -1; + } + } else if (bind_denied && needs_autobind) { + EXPECT_EQ(-EACCES, ret) + { + return -1; + } + } else { + EXPECT_EQ(-EINVAL, ret) + { + return -1; + } + } + } + + /* With or without explicit destination address (srv can be NULL). 
*/ + ret = sendto_variant(client_fd, srv, "B", 1, 0); + if (sock_type == SOCK_STREAM && !has_remote_port) { + EXPECT_EQ(-EPIPE, ret) + { + return -1; + } + } else if ((send_denied && srv != NULL) || + (bind_denied && needs_autobind)) { + ASSERT_EQ(-EACCES, ret) + { + return -1; + } + } else if (srv == NULL && !has_remote_port) { + if (addr_family == AF_UNIX) { + ASSERT_EQ(-ENOTCONN, ret) + { + return -1; + } + } else if (sock_type == SOCK_STREAM) { + ASSERT_EQ(-EPIPE, ret) + { + return -1; + } + } else { + ASSERT_EQ(-EDESTADDRREQ, ret) + { + return -1; + } + } + } else { + ASSERT_EQ(0, ret); + ASSERT_EQ(1, recv(server_fd, read_buf, 1, 0)); + ASSERT_EQ(read_buf[0], 'B') + { + return -1; + } + } + + return 0; +} + FIXTURE(protocol) { - struct service_fixture srv0, srv1, srv2, unspec_any0, unspec_srv0; + struct service_fixture srv0, srv1, srv2; + struct service_fixture unspec_any0, unspec_srv0, unspec_srv1; }; FIXTURE_VARIANT(protocol) @@ -271,10 +456,9 @@ FIXTURE_VARIANT(protocol) FIXTURE_SETUP(protocol) { - const struct protocol_variant prot_unspec = { - .domain = AF_UNSPEC, - .type = SOCK_STREAM, - }; + struct protocol_variant prot_unspec = variant->prot; + + prot_unspec.domain = AF_UNSPEC; disable_caps(_metadata); @@ -283,6 +467,7 @@ FIXTURE_SETUP(protocol) ASSERT_EQ(0, set_service(&self->srv2, variant->prot, 2)); ASSERT_EQ(0, set_service(&self->unspec_srv0, prot_unspec, 0)); + ASSERT_EQ(0, set_service(&self->unspec_srv1, prot_unspec, 1)); ASSERT_EQ(0, set_service(&self->unspec_any0, prot_unspec, 0)); self->unspec_any0.ipv4_addr.sin_addr.s_addr = htonl(INADDR_ANY); @@ -510,6 +695,92 @@ FIXTURE_VARIANT_ADD(protocol, tcp_sandbox_with_unix_datagram) { }, }; +/* clang-format off */ +FIXTURE_VARIANT_ADD(protocol, udp_sandbox_with_ipv4_udp1) { + /* clang-format on */ + .sandbox = UDP_SANDBOX, + .prot = { + .domain = AF_INET, + .type = SOCK_DGRAM, + .protocol = IPPROTO_UDP, + }, +}; + +/* clang-format off */ +FIXTURE_VARIANT_ADD(protocol, udp_sandbox_with_ipv4_udp2) { + 
/* clang-format on */ + .sandbox = UDP_SANDBOX, + .prot = { + .domain = AF_INET, + .type = SOCK_DGRAM, + /* IPPROTO_IP == 0 */ + .protocol = IPPROTO_IP, + }, +}; + +/* clang-format off */ +FIXTURE_VARIANT_ADD(protocol, udp_sandbox_with_ipv6_udp1) { + /* clang-format on */ + .sandbox = UDP_SANDBOX, + .prot = { + .domain = AF_INET6, + .type = SOCK_DGRAM, + .protocol = IPPROTO_UDP, + }, +}; + +/* clang-format off */ +FIXTURE_VARIANT_ADD(protocol, udp_sandbox_with_ipv6_udp2) { + /* clang-format on */ + .sandbox = UDP_SANDBOX, + .prot = { + .domain = AF_INET6, + .type = SOCK_DGRAM, + /* IPPROTO_IP == 0 */ + .protocol = IPPROTO_IP, + }, +}; + +/* clang-format off */ +FIXTURE_VARIANT_ADD(protocol, udp_sandbox_with_ipv4_tcp) { + /* clang-format on */ + .sandbox = UDP_SANDBOX, + .prot = { + .domain = AF_INET, + .type = SOCK_STREAM, + }, +}; + +/* clang-format off */ +FIXTURE_VARIANT_ADD(protocol, udp_sandbox_with_ipv6_tcp) { + /* clang-format on */ + .sandbox = UDP_SANDBOX, + .prot = { + .domain = AF_INET6, + .type = SOCK_STREAM, + }, +}; + +/* clang-format off */ +FIXTURE_VARIANT_ADD(protocol, udp_sandbox_with_unix_stream) { + /* clang-format on */ + .sandbox = UDP_SANDBOX, + .prot = { + .domain = AF_UNIX, + .type = SOCK_STREAM, + }, +}; + +/* clang-format off */ +FIXTURE_VARIANT_ADD(protocol, udp_sandbox_with_unix_datagram) { + /* clang-format on */ + .sandbox = UDP_SANDBOX, + .prot = { + .domain = AF_UNIX, + .type = SOCK_DGRAM, + }, +}; + static void test_bind_and_connect(struct __test_metadata *const _metadata, const struct service_fixture *const srv, const bool deny_bind, const bool deny_connect) @@ -602,7 +873,7 @@ static void test_bind_and_connect(struct __test_metadata *const _metadata, ret = connect_variant(connect_fd, srv); if (deny_connect) { EXPECT_EQ(-EACCES, ret); - } else if (deny_bind) { + } else if (deny_bind && srv->protocol.type == SOCK_STREAM) { /* No listening server. 
*/ EXPECT_EQ(-ECONNREFUSED, ret); } else { @@ -641,18 +912,25 @@ static void test_bind_and_connect(struct __test_metadata *const _metadata, TEST_F(protocol, bind) { - if (variant->sandbox == TCP_SANDBOX) { + if (variant->sandbox == TCP_SANDBOX || + variant->sandbox == UDP_SANDBOX) { + const __u64 bind_access = + (variant->sandbox == TCP_SANDBOX ? + LANDLOCK_ACCESS_NET_BIND_TCP : + LANDLOCK_ACCESS_NET_BIND_UDP); + const __u64 conn_access = + (variant->sandbox == TCP_SANDBOX ? + LANDLOCK_ACCESS_NET_CONNECT_TCP : + LANDLOCK_ACCESS_NET_CONNECT_SEND_UDP); const struct landlock_ruleset_attr ruleset_attr = { - .handled_access_net = LANDLOCK_ACCESS_NET_BIND_TCP | - LANDLOCK_ACCESS_NET_CONNECT_TCP, + .handled_access_net = bind_access | conn_access, }; - const struct landlock_net_port_attr tcp_bind_connect_p0 = { - .allowed_access = LANDLOCK_ACCESS_NET_BIND_TCP | - LANDLOCK_ACCESS_NET_CONNECT_TCP, + const struct landlock_net_port_attr bind_connect_p0 = { + .allowed_access = bind_access | conn_access, .port = self->srv0.port, }; - const struct landlock_net_port_attr tcp_connect_p1 = { - .allowed_access = LANDLOCK_ACCESS_NET_CONNECT_TCP, + const struct landlock_net_port_attr connect_p1 = { + .allowed_access = conn_access, .port = self->srv1.port, }; int ruleset_fd; @@ -664,12 +942,26 @@ TEST_F(protocol, bind) /* Allows connect and bind for the first port. */ ASSERT_EQ(0, landlock_add_rule(ruleset_fd, LANDLOCK_RULE_NET_PORT, - &tcp_bind_connect_p0, 0)); + &bind_connect_p0, 0)); /* Allows connect and denies bind for the second port. 
*/ ASSERT_EQ(0, landlock_add_rule(ruleset_fd, LANDLOCK_RULE_NET_PORT, - &tcp_connect_p1, 0)); + &connect_p1, 0)); + + /* + * For UDP sockets, allows binding to ephemeral ports (required + * to connect or send a first datagram) + */ + if (variant->sandbox == UDP_SANDBOX) { + const struct landlock_net_port_attr bind_ephemeral = { + .allowed_access = bind_access, + .port = 0, + }; + ASSERT_EQ(0, landlock_add_rule(ruleset_fd, + LANDLOCK_RULE_NET_PORT, + &bind_ephemeral, 0)); + } enforce_ruleset(_metadata, ruleset_fd); EXPECT_EQ(0, close(ruleset_fd)); @@ -691,18 +983,25 @@ TEST_F(protocol, bind) TEST_F(protocol, connect) { - if (variant->sandbox == TCP_SANDBOX) { + if (variant->sandbox == TCP_SANDBOX || + variant->sandbox == UDP_SANDBOX) { + const __u64 bind_access = + (variant->sandbox == TCP_SANDBOX ? + LANDLOCK_ACCESS_NET_BIND_TCP : + LANDLOCK_ACCESS_NET_BIND_UDP); + const __u64 conn_access = + (variant->sandbox == TCP_SANDBOX ? + LANDLOCK_ACCESS_NET_CONNECT_TCP : + LANDLOCK_ACCESS_NET_CONNECT_SEND_UDP); const struct landlock_ruleset_attr ruleset_attr = { - .handled_access_net = LANDLOCK_ACCESS_NET_BIND_TCP | - LANDLOCK_ACCESS_NET_CONNECT_TCP, + .handled_access_net = bind_access | conn_access, }; - const struct landlock_net_port_attr tcp_bind_connect_p0 = { - .allowed_access = LANDLOCK_ACCESS_NET_BIND_TCP | - LANDLOCK_ACCESS_NET_CONNECT_TCP, + const struct landlock_net_port_attr bind_connect_p0 = { + .allowed_access = bind_access | conn_access, .port = self->srv0.port, }; - const struct landlock_net_port_attr tcp_bind_p1 = { - .allowed_access = LANDLOCK_ACCESS_NET_BIND_TCP, + const struct landlock_net_port_attr bind_p1 = { + .allowed_access = bind_access, .port = self->srv1.port, }; int ruleset_fd; @@ -714,12 +1013,26 @@ TEST_F(protocol, connect) /* Allows connect and bind for the first port. 
*/ ASSERT_EQ(0, landlock_add_rule(ruleset_fd, LANDLOCK_RULE_NET_PORT, - &tcp_bind_connect_p0, 0)); + &bind_connect_p0, 0)); /* Allows bind and denies connect for the second port. */ ASSERT_EQ(0, landlock_add_rule(ruleset_fd, LANDLOCK_RULE_NET_PORT, - &tcp_bind_p1, 0)); + &bind_p1, 0)); + + /* + * For UDP sockets, allows binding to ephemeral ports (required + * to connect or send a first datagram) + */ + if (variant->sandbox == UDP_SANDBOX) { + const struct landlock_net_port_attr bind_ephemeral = { + .allowed_access = bind_access, + .port = 0, + }; + ASSERT_EQ(0, landlock_add_rule(ruleset_fd, + LANDLOCK_RULE_NET_PORT, + &bind_ephemeral, 0)); + } enforce_ruleset(_metadata, ruleset_fd); EXPECT_EQ(0, close(ruleset_fd)); @@ -737,16 +1050,20 @@ TEST_F(protocol, connect) TEST_F(protocol, bind_unspec) { + const __u64 bind_access = (variant->sandbox == TCP_SANDBOX ? + LANDLOCK_ACCESS_NET_BIND_TCP : + LANDLOCK_ACCESS_NET_BIND_UDP); const struct landlock_ruleset_attr ruleset_attr = { - .handled_access_net = LANDLOCK_ACCESS_NET_BIND_TCP, + .handled_access_net = bind_access, }; - const struct landlock_net_port_attr tcp_bind = { - .allowed_access = LANDLOCK_ACCESS_NET_BIND_TCP, + const struct landlock_net_port_attr rule_bind = { + .allowed_access = bind_access, .port = self->srv0.port, }; int bind_fd, ret; - if (variant->sandbox == TCP_SANDBOX) { + if (variant->sandbox == TCP_SANDBOX || + variant->sandbox == UDP_SANDBOX) { const int ruleset_fd = landlock_create_ruleset( &ruleset_attr, sizeof(ruleset_attr), 0); ASSERT_LE(0, ruleset_fd); @@ -754,7 +1071,7 @@ TEST_F(protocol, bind_unspec) /* Allows bind. 
*/ ASSERT_EQ(0, landlock_add_rule(ruleset_fd, LANDLOCK_RULE_NET_PORT, - &tcp_bind, 0)); + &rule_bind, 0)); enforce_ruleset(_metadata, ruleset_fd); EXPECT_EQ(0, close(ruleset_fd)); } @@ -782,7 +1099,8 @@ TEST_F(protocol, bind_unspec) } EXPECT_EQ(0, close(bind_fd)); - if (variant->sandbox == TCP_SANDBOX) { + if (variant->sandbox == TCP_SANDBOX || + variant->sandbox == UDP_SANDBOX) { const int ruleset_fd = landlock_create_ruleset( &ruleset_attr, sizeof(ruleset_attr), 0); ASSERT_LE(0, ruleset_fd); @@ -828,11 +1146,21 @@ TEST_F(protocol, bind_unspec) TEST_F(protocol, connect_unspec) { - const struct landlock_ruleset_attr ruleset_attr = { - .handled_access_net = LANDLOCK_ACCESS_NET_CONNECT_TCP, + const __u64 connect_right = + (variant->sandbox == TCP_SANDBOX ? + LANDLOCK_ACCESS_NET_CONNECT_TCP : + LANDLOCK_ACCESS_NET_CONNECT_SEND_UDP); + const __u64 bind_right = (variant->sandbox == TCP_SANDBOX ? + LANDLOCK_ACCESS_NET_BIND_TCP : + LANDLOCK_ACCESS_NET_BIND_UDP); + const struct landlock_ruleset_attr ruleset_conn = { + .handled_access_net = connect_right, }; - const struct landlock_net_port_attr tcp_connect = { - .allowed_access = LANDLOCK_ACCESS_NET_CONNECT_TCP, + const struct landlock_ruleset_attr ruleset_conn_bind = { + .handled_access_net = connect_right | bind_right, + }; + const struct landlock_net_port_attr rule_connect = { + .allowed_access = connect_right, .port = self->srv0.port, }; int bind_fd, client_fd, status; @@ -865,15 +1193,16 @@ TEST_F(protocol, connect_unspec) EXPECT_EQ(0, ret); } - if (variant->sandbox == TCP_SANDBOX) { + if (variant->sandbox == TCP_SANDBOX || + variant->sandbox == UDP_SANDBOX) { const int ruleset_fd = landlock_create_ruleset( - &ruleset_attr, sizeof(ruleset_attr), 0); + &ruleset_conn, sizeof(ruleset_conn), 0); ASSERT_LE(0, ruleset_fd); /* Allows connect. 
*/ ASSERT_EQ(0, landlock_add_rule(ruleset_fd, LANDLOCK_RULE_NET_PORT, - &tcp_connect, 0)); + &rule_connect, 0)); enforce_ruleset(_metadata, ruleset_fd); EXPECT_EQ(0, close(ruleset_fd)); } @@ -896,12 +1225,14 @@ TEST_F(protocol, connect_unspec) EXPECT_EQ(0, ret); } - if (variant->sandbox == TCP_SANDBOX) { + if (variant->sandbox == TCP_SANDBOX || + variant->sandbox == UDP_SANDBOX) { const int ruleset_fd = landlock_create_ruleset( - &ruleset_attr, sizeof(ruleset_attr), 0); + &ruleset_conn_bind, sizeof(ruleset_conn_bind), + 0); ASSERT_LE(0, ruleset_fd); - /* Denies connect. */ + /* Denies connect and bind. */ enforce_ruleset(_metadata, ruleset_fd); EXPECT_EQ(0, close(ruleset_fd)); } @@ -950,6 +1281,441 @@ TEST_F(protocol, connect_unspec) EXPECT_EQ(0, close(bind_fd)); } +TEST_F(protocol, sendmsg_stream) +{ + int srv0_fd, tmp_fd, client_fd, res; + char read_buf[1] = { 0 }; + + /* + * Simple test for stream sockets: just deny all connect()/ + * send(explicit addr)/bind(), and make sure we don't interfere with any + * operation. + */ + if (variant->prot.type != SOCK_STREAM) + return; + + if (variant->sandbox == UDP_SANDBOX) { + const struct landlock_ruleset_attr ruleset_attr = { + .handled_access_net = + LANDLOCK_ACCESS_NET_BIND_UDP | + LANDLOCK_ACCESS_NET_CONNECT_SEND_UDP, + }; + const int ruleset_fd = landlock_create_ruleset( + &ruleset_attr, sizeof(ruleset_attr), 0); + ASSERT_LE(0, ruleset_fd); + enforce_ruleset(_metadata, ruleset_fd); + EXPECT_EQ(0, close(ruleset_fd)); + } + + ASSERT_LE(0, client_fd = socket_variant(&self->srv0)); + ASSERT_LE(0, srv0_fd = socket_variant(&self->srv0)); + ASSERT_EQ(0, bind_variant(srv0_fd, &self->srv0)); + ASSERT_EQ(0, listen(srv0_fd, backlog)); + + /* Send on a non-connected socket. */ + res = sendto_variant(client_fd, NULL, "A", 1, 0); + if (variant->prot.domain == AF_UNIX) { + EXPECT_EQ(-ENOTCONN, res); + } else { + EXPECT_EQ(-EPIPE, res); + } + + /* Send to a truncated (invalid) address on a non-connected socket. 
*/ + res = sendto_variant_addrlen(client_fd, &self->srv0, + get_addrlen(&self->srv0, true) - 1, "B", 1, + 0); + if (variant->prot.domain == AF_UNIX) { + EXPECT_EQ(-EOPNOTSUPP, res); + } else { + EXPECT_EQ(-EPIPE, res); + } + + /* Connect. */ + ASSERT_EQ(0, connect_variant(client_fd, &self->srv0)); + tmp_fd = accept(srv0_fd, NULL, 0); + ASSERT_LE(0, tmp_fd); + EXPECT_EQ(0, close(srv0_fd)); + srv0_fd = tmp_fd; + + /* Send without an explicit address. */ + EXPECT_EQ(0, sendto_variant(client_fd, NULL, "C", 1, 0)); + EXPECT_EQ(1, recv(srv0_fd, read_buf, 1, 0)) + { + TH_LOG("recv() failed: %s", strerror(errno)); + } + EXPECT_EQ(read_buf[0], 'C'); + + /* Send to a truncated (invalid) address. */ + res = sendto_variant_addrlen(client_fd, &self->srv0, + get_addrlen(&self->srv0, true) - 1, "D", 1, + 0); + if (variant->prot.domain == AF_UNIX) { + EXPECT_EQ(-EISCONN, res); + } else { + ASSERT_EQ(0, res); + EXPECT_EQ(1, recv(srv0_fd, read_buf, 1, 0)) + { + TH_LOG("recv() failed: %s", strerror(errno)); + } + EXPECT_EQ(read_buf[0], 'D'); + } + + /* Send to a valid but different address. */ + res = sendto_variant(client_fd, &self->srv1, "E", 1, 0); + if (variant->prot.domain == AF_UNIX) { + EXPECT_EQ(-EISCONN, res); + } else { + ASSERT_EQ(0, res); + EXPECT_EQ(1, recv(srv0_fd, read_buf, 1, 0)) + { + TH_LOG("recv() failed: %s", strerror(errno)); + } + EXPECT_EQ(read_buf[0], 'E'); + } + + EXPECT_EQ(0, close(client_fd)); +} + +TEST_F(protocol, sendmsg_dgram) +{ + const bool restricted = is_restricted(&variant->prot, variant->sandbox); + int srv0_fd, srv1_fd, client_fd, child, status, res; + + if (variant->prot.type != SOCK_DGRAM) + return; + + /* Prepare server on port #0 to be allowed. */ + ASSERT_LE(0, srv0_fd = socket_variant(&self->srv0)); + ASSERT_EQ(0, bind_variant(srv0_fd, &self->srv0)); + + /* And another server on port #1 to be denied. 
*/ + ASSERT_LE(0, srv1_fd = socket_variant(&self->srv1)); + ASSERT_EQ(0, bind_variant(srv1_fd, &self->srv1)); + + /* + * Check that sockets connected before restrictions are not impacted in + * any way. + */ + child = fork(); + ASSERT_LE(0, child); + if (child == 0) { + ASSERT_LE(0, client_fd = socket_variant(&self->srv0)); + ASSERT_EQ(0, connect_variant(client_fd, &self->srv0)); + if (variant->sandbox == UDP_SANDBOX) { + /* Deny all connect()/send(explicit addr)/bind(). */ + const struct landlock_ruleset_attr ruleset_attr = { + .handled_access_net = + LANDLOCK_ACCESS_NET_BIND_UDP | + LANDLOCK_ACCESS_NET_CONNECT_SEND_UDP, + }; + const int ruleset_fd = landlock_create_ruleset( + &ruleset_attr, sizeof(ruleset_attr), 0); + ASSERT_LE(0, ruleset_fd); + enforce_ruleset(_metadata, ruleset_fd); + EXPECT_EQ(0, close(ruleset_fd)); + } + EXPECT_EQ(0, + test_sendmsg(_metadata, &variant->prot, client_fd, + srv0_fd, NULL, restricted, restricted)); + EXPECT_EQ(0, test_sendmsg(_metadata, &variant->prot, client_fd, + srv0_fd, &self->srv0, restricted, + restricted)); + EXPECT_EQ(0, test_sendmsg(_metadata, &variant->prot, client_fd, + srv1_fd, &self->srv1, restricted, + restricted)); + EXPECT_EQ(0, close(client_fd)); + _exit(_metadata->exit_code); + } + EXPECT_EQ(child, waitpid(child, &status, 0)); + EXPECT_EQ(1, WIFEXITED(status)); + EXPECT_EQ(EXIT_SUCCESS, WEXITSTATUS(status)); + + /* + * Restrict connect/send, but not bind(). Then try sending with no + * destination (and no remote peer set), an allowed destination, then a + * denied destination. 
+ */ + child = fork(); + ASSERT_LE(0, child); + if (child == 0) { + if (variant->sandbox == UDP_SANDBOX) { + const struct landlock_ruleset_attr ruleset_attr = { + .handled_access_net = + LANDLOCK_ACCESS_NET_CONNECT_SEND_UDP, + }; + const struct landlock_net_port_attr send_p0 = { + .allowed_access = + LANDLOCK_ACCESS_NET_CONNECT_SEND_UDP, + .port = self->srv0.port, + }; + const int ruleset_fd = landlock_create_ruleset( + &ruleset_attr, sizeof(ruleset_attr), 0); + ASSERT_LE(0, ruleset_fd); + ASSERT_EQ(0, landlock_add_rule(ruleset_fd, + LANDLOCK_RULE_NET_PORT, + &send_p0, 0)); + enforce_ruleset(_metadata, ruleset_fd); + EXPECT_EQ(0, close(ruleset_fd)); + } + ASSERT_LE(0, client_fd = socket_variant(&self->srv0)); + EXPECT_EQ(0, test_sendmsg(_metadata, &variant->prot, client_fd, + -1, NULL, false, false)); + EXPECT_EQ(0, test_sendmsg(_metadata, &variant->prot, client_fd, + srv0_fd, &self->srv0, false, false)); + EXPECT_EQ(0, test_sendmsg(_metadata, &variant->prot, client_fd, + srv1_fd, &self->srv1, false, + restricted)); + EXPECT_EQ(0, close(client_fd)); + _exit(_metadata->exit_code); + return; + } + EXPECT_EQ(child, waitpid(child, &status, 0)); + EXPECT_EQ(1, WIFEXITED(status)); + EXPECT_EQ(EXIT_SUCCESS, WEXITSTATUS(status)); + + /* + * Rest of this test is just for autobind enforcement, which only exists + * in IP sockets. + */ + if (variant->prot.domain != AF_INET && variant->prot.domain != AF_INET6) + return; + + /* Restrict bind() to explicit calls with an arbitrary (non-0) port. 
*/ + child = fork(); + ASSERT_LE(0, child); + if (child == 0) { + const uint16_t allowed_src_port = 42424; + struct service_fixture allowed_src; + + allowed_src = self->srv0; + set_port(&allowed_src, allowed_src_port); + if (variant->sandbox == UDP_SANDBOX) { + const struct landlock_ruleset_attr ruleset_attr = { + .handled_access_net = + LANDLOCK_ACCESS_NET_BIND_UDP, + }; + const struct landlock_net_port_attr rule = { + .allowed_access = LANDLOCK_ACCESS_NET_BIND_UDP, + .port = allowed_src_port, + }; + const int ruleset_fd = landlock_create_ruleset( + &ruleset_attr, sizeof(ruleset_attr), 0); + ASSERT_LE(0, ruleset_fd); + ASSERT_EQ(0, landlock_add_rule(ruleset_fd, + LANDLOCK_RULE_NET_PORT, + &rule, 0)); + enforce_ruleset(_metadata, ruleset_fd); + EXPECT_EQ(0, close(ruleset_fd)); + } + ASSERT_LE(0, client_fd = socket_variant(&self->srv0)); + + /* Check that implicit bind(0) in sendmsg() is denied. */ + EXPECT_EQ(0, test_sendmsg(_metadata, &variant->prot, client_fd, + srv0_fd, &self->srv0, restricted, + false)); + + /* Same thing for autobind in connect(). */ + res = connect_variant(client_fd, &self->srv0); + if (restricted) { + EXPECT_EQ(-EACCES, res); + } else { + EXPECT_EQ(0, res); + } + EXPECT_EQ(0, close(client_fd)); + + /* Make sendmsg() work by explicitly binding to the only allowed port. */ + ASSERT_LE(0, client_fd = socket_variant(&self->srv0)); + EXPECT_EQ(0, bind_variant(client_fd, &allowed_src)); + EXPECT_EQ(0, test_sendmsg(_metadata, &variant->prot, client_fd, + srv0_fd, &self->srv0, restricted, + false)); + EXPECT_EQ(0, close(client_fd)); + + /* Make connect() work by explicitly binding to the only allowed port. 
*/ + ASSERT_LE(0, client_fd = socket_variant(&self->srv0)); + EXPECT_EQ(0, bind_variant(client_fd, &allowed_src)); + EXPECT_EQ(0, connect_variant(client_fd, &self->srv0)); + EXPECT_EQ(0, close(client_fd)); + + _exit(_metadata->exit_code); + return; + } + EXPECT_EQ(child, waitpid(child, &status, 0)); + EXPECT_EQ(1, WIFEXITED(status)); + EXPECT_EQ(EXIT_SUCCESS, WEXITSTATUS(status)); + + /* + * Check that %LANDLOCK_ACCESS_NET_BIND_UDP on port 0 allows implicit + * autobinds. + */ + child = fork(); + ASSERT_LE(0, child); + if (child == 0) { + if (variant->sandbox == UDP_SANDBOX) { + const struct landlock_ruleset_attr ruleset_attr = { + .handled_access_net = + LANDLOCK_ACCESS_NET_BIND_UDP, + }; + const struct landlock_net_port_attr rule = { + .allowed_access = LANDLOCK_ACCESS_NET_BIND_UDP, + .port = 0, + }; + const int ruleset_fd = landlock_create_ruleset( + &ruleset_attr, sizeof(ruleset_attr), 0); + ASSERT_LE(0, ruleset_fd); + ASSERT_EQ(0, landlock_add_rule(ruleset_fd, + LANDLOCK_RULE_NET_PORT, + &rule, 0)); + enforce_ruleset(_metadata, ruleset_fd); + EXPECT_EQ(0, close(ruleset_fd)); + } + ASSERT_LE(0, client_fd = socket_variant(&self->srv0)); + EXPECT_EQ(0, test_sendmsg(_metadata, &variant->prot, client_fd, + srv0_fd, &self->srv0, false, false)); + EXPECT_EQ(0, close(client_fd)); + _exit(_metadata->exit_code); + } + EXPECT_EQ(child, waitpid(child, &status, 0)); + EXPECT_EQ(1, WIFEXITED(status)); + EXPECT_EQ(EXIT_SUCCESS, WEXITSTATUS(status)); +} + +TEST_F(protocol, sendmsg_unspec) +{ + const bool restricted = is_restricted(&variant->prot, variant->sandbox); + int client_fd, srv0_fd, srv1_fd, res; + char read_buf[1] = { 0 }; + + /* + * We already test for the absence of influence on sendmsg for other + * socket types and other address families, there's no point in adapting + * this test for stream sockets too. + */ + if (variant->prot.type != SOCK_DGRAM) + return; + + /* Prepare client of the right family. 
*/ + ASSERT_LE(0, client_fd = socket_variant(&self->srv0)); + + /* Prepare server on port #0 to be allowed. */ + ASSERT_LE(0, srv0_fd = socket_variant(&self->srv0)); + ASSERT_EQ(0, bind_variant(srv0_fd, &self->srv0)); + + /* And another server on port #1 to be denied. */ + ASSERT_LE(0, srv1_fd = socket_variant(&self->srv1)); + ASSERT_EQ(0, bind_variant(srv1_fd, &self->srv1)); + + if (variant->sandbox == UDP_SANDBOX) { + const struct landlock_ruleset_attr ruleset_attr = { + .handled_access_net = + LANDLOCK_ACCESS_NET_CONNECT_SEND_UDP, + }; + const struct landlock_net_port_attr rule = { + .allowed_access = LANDLOCK_ACCESS_NET_CONNECT_SEND_UDP, + .port = self->srv0.port, + }; + const int ruleset_fd = landlock_create_ruleset( + &ruleset_attr, sizeof(ruleset_attr), 0); + ASSERT_LE(0, ruleset_fd); + ASSERT_EQ(0, + landlock_add_rule(ruleset_fd, LANDLOCK_RULE_NET_PORT, + &rule, 0)); + enforce_ruleset(_metadata, ruleset_fd); + EXPECT_EQ(0, close(ruleset_fd)); + } + + /* Explicit AF_UNSPEC address but truncated. */ + EXPECT_EQ(-EINVAL, sendto_variant_addrlen( + client_fd, &self->unspec_srv0, + get_addrlen(&self->unspec_srv0, true) - 1, + "A", 1, 0)); + + /* + * Explicit AF_UNSPEC address, should be treated as AF_INET by IPv4 + * sockets (and thus map to srv0, allowed), but be denied by IPv6 + * sockets. + */ + res = sendto_variant(client_fd, &self->unspec_srv0, "B", 1, 0); + if (variant->prot.domain == AF_INET6) { + if (restricted) { + /* Always denied on IPv6 socket. */ + EXPECT_EQ(-EACCES, res); + } else { + /* IPv6 sockets treat AF_UNSPEC as a NULL address. */ + EXPECT_EQ(-EDESTADDRREQ, res); + } + } else if (variant->prot.domain == AF_INET) { + ASSERT_EQ(0, res); + EXPECT_EQ(1, read(srv0_fd, read_buf, 1)) + { + TH_LOG("read() failed: %s", strerror(errno)); + } + EXPECT_EQ(read_buf[0], 'B'); + } else { + /* Unix sockets don't accept AF_UNSPEC. 
*/ + EXPECT_EQ(-EINVAL, res); + } + + /* + * Explicit AF_UNSPEC address, should be treated as AF_INET on IPv4 + * sockets (and thus map to srv1, denied), and be denied on IPv6 sockets + * as always. + */ + res = sendto_variant(client_fd, &self->unspec_srv1, "C", 1, 0); + if (variant->prot.domain == AF_INET6) { + if (restricted) { + /* Always denied on IPv6 socket. */ + EXPECT_EQ(-EACCES, res); + } else { + /* IPv6 sockets treat AF_UNSPEC as a NULL address. */ + EXPECT_EQ(-EDESTADDRREQ, res); + } + } else if (variant->prot.domain == AF_INET) { + if (restricted) { + /* Sending to srv1 is not allowed, only srv0. */ + EXPECT_EQ(-EACCES, res); + } else { + ASSERT_EQ(0, res); + EXPECT_EQ(1, read(srv1_fd, read_buf, 1)) + { + TH_LOG("read() failed: %s", strerror(errno)); + } + EXPECT_EQ(read_buf[0], 'C'); + } + } else { + /* Unix sockets don't accept AF_UNSPEC. */ + EXPECT_EQ(-EINVAL, res); + } + + ASSERT_EQ(0, connect_variant(client_fd, &self->srv0)); + + /* Minimal explicit AF_UNSPEC address (just the sa_family_t field) */ + res = sendto_variant_addrlen(client_fd, &self->unspec_srv0, + get_addrlen(&self->unspec_srv0, true), "D", + 1, 0); + if (variant->prot.domain == AF_INET6) { + if (restricted) { + /* AF_UNSPEC is always denied in IPv6. */ + EXPECT_EQ(-EACCES, res); + } else { + /* + * IPv6 sockets treat AF_UNSPEC as a NULL address, + * falling back to the connected address. + */ + ASSERT_EQ(0, res); + EXPECT_EQ(1, read(srv0_fd, read_buf, 1)); + EXPECT_EQ(read_buf[0], 'D'); + } + } else { + /* + * IPv4 socket will expect a struct sockaddr_in, our address is + * considered truncated. And Unix sockets don't accept + * AF_UNSPEC at all. 
+ */ + EXPECT_EQ(-EINVAL, res); + } +} + FIXTURE(ipv4) { struct service_fixture srv0, srv1; @@ -976,6 +1742,13 @@ FIXTURE_VARIANT_ADD(ipv4, tcp_sandbox_with_tcp) { }; /* clang-format off */ +FIXTURE_VARIANT_ADD(ipv4, udp_sandbox_with_tcp) { + /* clang-format on */ + .sandbox = UDP_SANDBOX, + .type = SOCK_STREAM, +}; + +/* clang-format off */ FIXTURE_VARIANT_ADD(ipv4, no_sandbox_with_udp) { /* clang-format on */ .sandbox = NO_SANDBOX, @@ -989,6 +1762,13 @@ FIXTURE_VARIANT_ADD(ipv4, tcp_sandbox_with_udp) { .type = SOCK_DGRAM, }; +/* clang-format off */ +FIXTURE_VARIANT_ADD(ipv4, udp_sandbox_with_udp) { + /* clang-format on */ + .sandbox = UDP_SANDBOX, + .type = SOCK_DGRAM, +}; + FIXTURE_SETUP(ipv4) { const struct protocol_variant prot = { @@ -1012,14 +1792,19 @@ TEST_F(ipv4, from_unix_to_inet) { int unix_stream_fd, unix_dgram_fd; - if (variant->sandbox == TCP_SANDBOX) { + if (variant->sandbox == TCP_SANDBOX || + variant->sandbox == UDP_SANDBOX) { + const __u64 access_rights = + (variant->sandbox == TCP_SANDBOX ? 
+ LANDLOCK_ACCESS_NET_BIND_TCP | + LANDLOCK_ACCESS_NET_CONNECT_TCP : + LANDLOCK_ACCESS_NET_BIND_UDP | + LANDLOCK_ACCESS_NET_CONNECT_SEND_UDP); const struct landlock_ruleset_attr ruleset_attr = { - .handled_access_net = LANDLOCK_ACCESS_NET_BIND_TCP | - LANDLOCK_ACCESS_NET_CONNECT_TCP, + .handled_access_net = access_rights, }; const struct landlock_net_port_attr tcp_bind_connect_p0 = { - .allowed_access = LANDLOCK_ACCESS_NET_BIND_TCP | - LANDLOCK_ACCESS_NET_CONNECT_TCP, + .allowed_access = access_rights, .port = self->srv0.port, }; int ruleset_fd; @@ -1326,11 +2111,13 @@ FIXTURE_TEARDOWN(mini) /* clang-format off */ -#define ACCESS_LAST LANDLOCK_ACCESS_NET_CONNECT_TCP +#define ACCESS_LAST LANDLOCK_ACCESS_NET_CONNECT_SEND_UDP #define ACCESS_ALL ( \ LANDLOCK_ACCESS_NET_BIND_TCP | \ - LANDLOCK_ACCESS_NET_CONNECT_TCP) + LANDLOCK_ACCESS_NET_CONNECT_TCP | \ + LANDLOCK_ACCESS_NET_BIND_UDP | \ + LANDLOCK_ACCESS_NET_CONNECT_SEND_UDP) /* clang-format on */ @@ -1678,6 +2465,7 @@ TEST_F(ipv4_tcp, with_fs) FIXTURE(port_specific) { struct service_fixture srv0; + struct service_fixture cli1; }; FIXTURE_VARIANT(port_specific) @@ -1697,7 +2485,7 @@ FIXTURE_VARIANT_ADD(port_specific, no_sandbox_with_ipv4) { }; /* clang-format off */ -FIXTURE_VARIANT_ADD(port_specific, sandbox_with_ipv4) { +FIXTURE_VARIANT_ADD(port_specific, tcp_sandbox_with_ipv4) { /* clang-format on */ .sandbox = TCP_SANDBOX, .prot = { @@ -1707,6 +2495,16 @@ FIXTURE_VARIANT_ADD(port_specific, sandbox_with_ipv4) { }; /* clang-format off */ +FIXTURE_VARIANT_ADD(port_specific, udp_sandbox_with_ipv4) { + /* clang-format on */ + .sandbox = UDP_SANDBOX, + .prot = { + .domain = AF_INET, + .type = SOCK_DGRAM, + }, +}; + +/* clang-format off */ FIXTURE_VARIANT_ADD(port_specific, no_sandbox_with_ipv6) { /* clang-format on */ .sandbox = NO_SANDBOX, @@ -1717,7 +2515,7 @@ FIXTURE_VARIANT_ADD(port_specific, no_sandbox_with_ipv6) { }; /* clang-format off */ -FIXTURE_VARIANT_ADD(port_specific, sandbox_with_ipv6) { 
+FIXTURE_VARIANT_ADD(port_specific, tcp_sandbox_with_ipv6) { /* clang-format on */ .sandbox = TCP_SANDBOX, .prot = { @@ -1726,11 +2524,22 @@ FIXTURE_VARIANT_ADD(port_specific, sandbox_with_ipv6) { }, }; +/* clang-format off */ +FIXTURE_VARIANT_ADD(port_specific, udp_sandbox_with_ipv6) { + /* clang-format on */ + .sandbox = UDP_SANDBOX, + .prot = { + .domain = AF_INET6, + .type = SOCK_DGRAM, + }, +}; + FIXTURE_SETUP(port_specific) { disable_caps(_metadata); ASSERT_EQ(0, set_service(&self->srv0, variant->prot, 0)); + ASSERT_EQ(0, set_service(&self->cli1, variant->prot, 1)); setup_loopback(_metadata); }; @@ -1745,14 +2554,19 @@ TEST_F(port_specific, bind_connect_zero) uint16_t port; /* Adds a rule layer with bind and connect actions. */ - if (variant->sandbox == TCP_SANDBOX) { + if (variant->sandbox == TCP_SANDBOX || + variant->sandbox == UDP_SANDBOX) { + const __u64 access_rights = + (variant->sandbox == TCP_SANDBOX ? + LANDLOCK_ACCESS_NET_BIND_TCP | + LANDLOCK_ACCESS_NET_CONNECT_TCP : + LANDLOCK_ACCESS_NET_BIND_UDP | + LANDLOCK_ACCESS_NET_CONNECT_SEND_UDP); const struct landlock_ruleset_attr ruleset_attr = { - .handled_access_net = LANDLOCK_ACCESS_NET_BIND_TCP | - LANDLOCK_ACCESS_NET_CONNECT_TCP + .handled_access_net = access_rights, }; - const struct landlock_net_port_attr tcp_bind_connect_zero = { - .allowed_access = LANDLOCK_ACCESS_NET_BIND_TCP | - LANDLOCK_ACCESS_NET_CONNECT_TCP, + const struct landlock_net_port_attr bind_connect_zero = { + .allowed_access = access_rights, .port = 0, }; int ruleset_fd; @@ -1764,7 +2578,7 @@ TEST_F(port_specific, bind_connect_zero) /* Checks zero port value on bind and connect actions. 
*/ EXPECT_EQ(0, landlock_add_rule(ruleset_fd, LANDLOCK_RULE_NET_PORT, - &tcp_bind_connect_zero, 0)); + &bind_connect_zero, 0)); enforce_ruleset(_metadata, ruleset_fd); EXPECT_EQ(0, close(ruleset_fd)); @@ -1785,11 +2599,16 @@ TEST_F(port_specific, bind_connect_zero) ret = bind_variant(bind_fd, &self->srv0); EXPECT_EQ(0, ret); - EXPECT_EQ(0, listen(bind_fd, backlog)); + if (variant->prot.type == SOCK_STREAM) + EXPECT_EQ(0, listen(bind_fd, backlog)); /* Connects on port 0. */ ret = connect_variant(connect_fd, &self->srv0); - EXPECT_EQ(-ECONNREFUSED, ret); + if (variant->prot.type == SOCK_STREAM) { + EXPECT_EQ(-ECONNREFUSED, ret); + } else { + EXPECT_EQ(0, ret); + } /* Sets binded port for both protocol families. */ port = get_binded_port(bind_fd, &variant->prot); @@ -1813,23 +2632,35 @@ TEST_F(port_specific, bind_connect_1023) int bind_fd, connect_fd, ret; /* Adds a rule layer with bind and connect actions. */ - if (variant->sandbox == TCP_SANDBOX) { + if (variant->sandbox == TCP_SANDBOX || + variant->sandbox == UDP_SANDBOX) { + const __u64 bind_right = (variant->sandbox == TCP_SANDBOX ? + LANDLOCK_ACCESS_NET_BIND_TCP : + LANDLOCK_ACCESS_NET_BIND_UDP); + const __u64 access_rights = + (variant->sandbox == TCP_SANDBOX ? + (LANDLOCK_ACCESS_NET_BIND_TCP | + LANDLOCK_ACCESS_NET_CONNECT_TCP) : + (LANDLOCK_ACCESS_NET_BIND_UDP | + LANDLOCK_ACCESS_NET_CONNECT_SEND_UDP)); const struct landlock_ruleset_attr ruleset_attr = { - .handled_access_net = LANDLOCK_ACCESS_NET_BIND_TCP | - LANDLOCK_ACCESS_NET_CONNECT_TCP + .handled_access_net = access_rights, }; /* A rule with port value less than 1024. */ - const struct landlock_net_port_attr tcp_bind_connect_low_range = { - .allowed_access = LANDLOCK_ACCESS_NET_BIND_TCP | - LANDLOCK_ACCESS_NET_CONNECT_TCP, + const struct landlock_net_port_attr bind_connect_low_range = { + .allowed_access = access_rights, .port = 1023, }; /* A rule with 1024 port. 
*/ - const struct landlock_net_port_attr tcp_bind_connect = { - .allowed_access = LANDLOCK_ACCESS_NET_BIND_TCP | - LANDLOCK_ACCESS_NET_CONNECT_TCP, + const struct landlock_net_port_attr bind_connect = { + .allowed_access = access_rights, .port = 1024, }; + /* A rule with cli1's port, to use as source port. */ + const struct landlock_net_port_attr srcport = { + .allowed_access = bind_right, + .port = self->cli1.port, + }; int ruleset_fd; ruleset_fd = landlock_create_ruleset(&ruleset_attr, @@ -1838,10 +2669,15 @@ TEST_F(port_specific, bind_connect_1023) ASSERT_EQ(0, landlock_add_rule(ruleset_fd, LANDLOCK_RULE_NET_PORT, - &tcp_bind_connect_low_range, 0)); + &bind_connect_low_range, 0)); ASSERT_EQ(0, landlock_add_rule(ruleset_fd, LANDLOCK_RULE_NET_PORT, - &tcp_bind_connect, 0)); + &bind_connect, 0)); + if (variant->sandbox == UDP_SANDBOX) { + ASSERT_EQ(0, landlock_add_rule(ruleset_fd, + LANDLOCK_RULE_NET_PORT, + &srcport, 0)); + } enforce_ruleset(_metadata, ruleset_fd); EXPECT_EQ(0, close(ruleset_fd)); @@ -1850,9 +2686,6 @@ TEST_F(port_specific, bind_connect_1023) bind_fd = socket_variant(&self->srv0); ASSERT_LE(0, bind_fd); - connect_fd = socket_variant(&self->srv0); - ASSERT_LE(0, connect_fd); - /* Sets address port to 1023 for both protocol families. */ set_port(&self->srv0, 1023); /* Binds on port 1023. */ @@ -1865,8 +2698,19 @@ TEST_F(port_specific, bind_connect_1023) ret = bind_variant(bind_fd, &self->srv0); clear_cap(_metadata, CAP_NET_BIND_SERVICE); EXPECT_EQ(0, ret); - EXPECT_EQ(0, listen(bind_fd, backlog)); + if (variant->prot.type == SOCK_STREAM) + EXPECT_EQ(0, listen(bind_fd, backlog)); + connect_fd = socket_variant(&self->srv0); + ASSERT_LE(0, connect_fd); + if (variant->prot.type == SOCK_DGRAM) { + /* + * We are about to connect(), but bind() is restricted, so for + * UDP sockets we need to use cli1's port as source port (the + * only one we are allowed to use). 
+ */ + EXPECT_EQ(0, bind_variant(connect_fd, &self->cli1)); + } /* Connects on the binded port 1023. */ ret = connect_variant(connect_fd, &self->srv0); EXPECT_EQ(0, ret); @@ -1885,7 +2729,10 @@ TEST_F(port_specific, bind_connect_1023) /* Binds on port 1024. */ ret = bind_variant(bind_fd, &self->srv0); EXPECT_EQ(0, ret); - EXPECT_EQ(0, listen(bind_fd, backlog)); + if (variant->prot.type == SOCK_STREAM) + EXPECT_EQ(0, listen(bind_fd, backlog)); + if (variant->prot.type == SOCK_DGRAM) + EXPECT_EQ(0, bind_variant(connect_fd, &self->cli1)); /* Connects on the binded port 1024. */ ret = connect_variant(connect_fd, &self->srv0); @@ -1895,23 +2742,41 @@ TEST_F(port_specific, bind_connect_1023) EXPECT_EQ(0, close(bind_fd)); } -static int matches_log_tcp(const int audit_fd, const char *const blockers, - const char *const dir_addr, const char *const addr, - const char *const dir_port) +/** + * matches_auditlog - Check audit log for a network access denial + * + * @audit_fd: Audit file descriptor. + * @blockers: A regex-escaped blocker string, e.g., "net\.bind_tcp". + * @dir_addr: Either "saddr" or "daddr", ignored if addr is NULL. + * @addr: A regex-escaped IP address string, or NULL. + * @dir_port: Either "src" or "dest", ignored if addr is NULL. + * @port: A port number, ignored if addr is NULL. 
+ */ +static int matches_auditlog(const int audit_fd, const char *const blockers, + const char *const dir_addr, const char *const addr, + const char *const dir_port, const __u16 port) { - static const char log_template[] = REGEX_LANDLOCK_PREFIX - " blockers=%s %s=%s %s=1024$"; + static const char log_with_addrport_tmpl[] = REGEX_LANDLOCK_PREFIX + " blockers=%s %s=%s %s=%u$"; + static const char log_without_addrport_tmpl[] = REGEX_LANDLOCK_PREFIX + " blockers=%s"; /* * Max strlen(blockers): 16 * Max strlen(dir_addr): 5 * Max strlen(addr): 12 * Max strlen(dir_port): 4 + * Max strlen(%u port): 5 */ - char log_match[sizeof(log_template) + 37]; + char log_match[sizeof(log_with_addrport_tmpl) + 42]; int log_match_len; - log_match_len = snprintf(log_match, sizeof(log_match), log_template, - blockers, dir_addr, addr, dir_port); + if (addr == NULL) + log_match_len = snprintf(log_match, sizeof(log_match), + log_without_addrport_tmpl, blockers); + else + log_match_len = snprintf(log_match, sizeof(log_match), + log_with_addrport_tmpl, blockers, + dir_addr, addr, dir_port, port); if (log_match_len > sizeof(log_match)) return -E2BIG; @@ -1922,6 +2787,10 @@ static int matches_log_tcp(const int audit_fd, const char *const blockers, FIXTURE(audit) { struct service_fixture srv0; + struct service_fixture srv1; + /* srv2 has a rule with no access but quiet bit set. 
*/ + struct service_fixture srv2; + struct service_fixture unspec_srv0; struct audit_filter audit_filter; int audit_fd; }; @@ -1933,7 +2802,7 @@ FIXTURE_VARIANT(audit) }; /* clang-format off */ -FIXTURE_VARIANT_ADD(audit, ipv4) { +FIXTURE_VARIANT_ADD(audit, ipv4_tcp) { /* clang-format on */ .addr = "127\\.0\\.0\\.1", .prot = { @@ -1943,7 +2812,17 @@ FIXTURE_VARIANT_ADD(audit, ipv4) { }; /* clang-format off */ -FIXTURE_VARIANT_ADD(audit, ipv6) { +FIXTURE_VARIANT_ADD(audit, ipv4_udp) { + /* clang-format on */ + .addr = "127\\.0\\.0\\.1", + .prot = { + .domain = AF_INET, + .type = SOCK_DGRAM, + }, +}; + +/* clang-format off */ +FIXTURE_VARIANT_ADD(audit, ipv6_tcp) { /* clang-format on */ .addr = "::1", .prot = { @@ -1952,9 +2831,27 @@ FIXTURE_VARIANT_ADD(audit, ipv6) { }, }; +/* clang-format off */ +FIXTURE_VARIANT_ADD(audit, ipv6_udp) { + /* clang-format on */ + .addr = "::1", + .prot = { + .domain = AF_INET6, + .type = SOCK_DGRAM, + }, +}; + FIXTURE_SETUP(audit) { + struct protocol_variant prot_unspec = variant->prot; + + prot_unspec.domain = AF_UNSPEC; + ASSERT_EQ(0, set_service(&self->srv0, variant->prot, 0)); + ASSERT_EQ(0, set_service(&self->srv1, variant->prot, 1)); + ASSERT_EQ(0, set_service(&self->srv2, variant->prot, 2)); + ASSERT_EQ(0, set_service(&self->unspec_srv0, prot_unspec, 0)); + setup_loopback(_metadata); set_cap(_metadata, CAP_AUDIT_CONTROL); @@ -1972,9 +2869,22 @@ FIXTURE_TEARDOWN(audit) TEST_F(audit, bind) { + const char *audit_evt = (variant->prot.type == SOCK_STREAM ? + "net\\.bind_tcp" : + "net\\.bind_udp"); + const __u64 access_rights = + (variant->prot.type == SOCK_STREAM ? 
+ LANDLOCK_ACCESS_NET_BIND_TCP | + LANDLOCK_ACCESS_NET_CONNECT_TCP : + LANDLOCK_ACCESS_NET_BIND_UDP | + LANDLOCK_ACCESS_NET_CONNECT_SEND_UDP); const struct landlock_ruleset_attr ruleset_attr = { - .handled_access_net = LANDLOCK_ACCESS_NET_BIND_TCP | - LANDLOCK_ACCESS_NET_CONNECT_TCP, + .handled_access_net = access_rights, + .quiet_access_net = access_rights, + }; + const struct landlock_net_port_attr quiet_rule = { + .allowed_access = 0, + .port = self->srv2.port, }; struct audit_records records; int ruleset_fd, sock_fd; @@ -1982,27 +2892,58 @@ TEST_F(audit, bind) ruleset_fd = landlock_create_ruleset(&ruleset_attr, sizeof(ruleset_attr), 0); ASSERT_LE(0, ruleset_fd); + ASSERT_EQ(0, landlock_add_rule(ruleset_fd, LANDLOCK_RULE_NET_PORT, + &quiet_rule, LANDLOCK_ADD_RULE_QUIET)); enforce_ruleset(_metadata, ruleset_fd); EXPECT_EQ(0, close(ruleset_fd)); sock_fd = socket_variant(&self->srv0); ASSERT_LE(0, sock_fd); EXPECT_EQ(-EACCES, bind_variant(sock_fd, &self->srv0)); - EXPECT_EQ(0, matches_log_tcp(self->audit_fd, "net\\.bind_tcp", "saddr", - variant->addr, "src")); + EXPECT_EQ(0, matches_auditlog(self->audit_fd, audit_evt, "saddr", + variant->addr, "src", self->srv0.port)); EXPECT_EQ(0, audit_count_records(self->audit_fd, &records)); EXPECT_EQ(0, records.access); EXPECT_EQ(1, records.domain); EXPECT_EQ(0, close(sock_fd)); + + /* Bind to srv2 (with quiet rule): no new audit logs. */ + sock_fd = socket_variant(&self->srv2); + ASSERT_LE(0, sock_fd); + EXPECT_EQ(-EACCES, bind_variant(sock_fd, &self->srv2)); + + EXPECT_EQ(0, audit_count_records(self->audit_fd, &records)); + EXPECT_EQ(0, records.access); + EXPECT_EQ(0, records.domain); + + EXPECT_EQ(0, close(sock_fd)); } TEST_F(audit, connect) { + const char *audit_evt = (variant->prot.type == SOCK_STREAM ? + "net\\.connect_tcp" : + "net\\.connect_send_udp"); + const __u64 bind_right = (variant->prot.type == SOCK_STREAM ? 
+ LANDLOCK_ACCESS_NET_BIND_TCP : + LANDLOCK_ACCESS_NET_BIND_UDP); + const __u64 conn_right = (variant->prot.type == SOCK_STREAM ? + LANDLOCK_ACCESS_NET_CONNECT_TCP : + LANDLOCK_ACCESS_NET_CONNECT_SEND_UDP); + const __u64 access_rights = bind_right | conn_right; const struct landlock_ruleset_attr ruleset_attr = { - .handled_access_net = LANDLOCK_ACCESS_NET_BIND_TCP | - LANDLOCK_ACCESS_NET_CONNECT_TCP, + .handled_access_net = access_rights, + .quiet_access_net = access_rights, + }; + const struct landlock_net_port_attr rule_connect_p1 = { + .allowed_access = conn_right, + .port = self->srv1.port, + }; + const struct landlock_net_port_attr quiet_rule = { + .allowed_access = 0, + .port = self->srv2.port, }; struct audit_records records; int ruleset_fd, sock_fd; @@ -2010,14 +2951,179 @@ TEST_F(audit, connect) ruleset_fd = landlock_create_ruleset(&ruleset_attr, sizeof(ruleset_attr), 0); ASSERT_LE(0, ruleset_fd); + ASSERT_EQ(0, landlock_add_rule(ruleset_fd, LANDLOCK_RULE_NET_PORT, + &rule_connect_p1, 0)); + ASSERT_EQ(0, landlock_add_rule(ruleset_fd, LANDLOCK_RULE_NET_PORT, + &quiet_rule, LANDLOCK_ADD_RULE_QUIET)); enforce_ruleset(_metadata, ruleset_fd); EXPECT_EQ(0, close(ruleset_fd)); sock_fd = socket_variant(&self->srv0); ASSERT_LE(0, sock_fd); EXPECT_EQ(-EACCES, connect_variant(sock_fd, &self->srv0)); - EXPECT_EQ(0, matches_log_tcp(self->audit_fd, "net\\.connect_tcp", - "daddr", variant->addr, "dest")); + EXPECT_EQ(0, matches_auditlog(self->audit_fd, audit_evt, "daddr", + variant->addr, "dest", self->srv0.port)); + + EXPECT_EQ(0, audit_count_records(self->audit_fd, &records)); + EXPECT_EQ(0, records.access); + EXPECT_EQ(1, records.domain); + + if (variant->prot.type == SOCK_DGRAM) { + /* Check that autobind generates a denied bind event. 
*/ + EXPECT_EQ(-EACCES, connect_variant(sock_fd, &self->srv1)); + + EXPECT_EQ(0, matches_auditlog(self->audit_fd, "net\\.bind_udp", + NULL, NULL, NULL, 0)); + EXPECT_EQ(0, audit_count_records(self->audit_fd, &records)); + EXPECT_EQ(0, records.access); + EXPECT_EQ(0, records.domain); + } + + EXPECT_EQ(0, close(sock_fd)); + + /* Connect to srv2 (with quiet rule): no new audit logs. */ + sock_fd = socket_variant(&self->srv2); + ASSERT_LE(0, sock_fd); + EXPECT_EQ(-EACCES, connect_variant(sock_fd, &self->srv2)); + + EXPECT_EQ(0, audit_count_records(self->audit_fd, &records)); + EXPECT_EQ(0, records.access); + EXPECT_EQ(0, records.domain); + + EXPECT_EQ(0, close(sock_fd)); +} + +/* Quieting bind access has no effect on connect. */ +TEST_F(audit, connect_quiet_bind) +{ + const char *audit_evt = (variant->prot.type == SOCK_STREAM ? + "net\\.connect_tcp" : + "net\\.connect_send_udp"); + const int bind_right = (variant->prot.type == SOCK_STREAM ? + LANDLOCK_ACCESS_NET_BIND_TCP : + LANDLOCK_ACCESS_NET_BIND_UDP); + const int conn_right = (variant->prot.type == SOCK_STREAM ? 
+ LANDLOCK_ACCESS_NET_CONNECT_TCP : + LANDLOCK_ACCESS_NET_CONNECT_SEND_UDP); + const int access_rights = bind_right | conn_right; + const struct landlock_ruleset_attr ruleset_attr = { + .handled_access_net = access_rights, + .quiet_access_net = bind_right, + }; + const struct landlock_ruleset_attr ruleset_attr_2 = { + .handled_access_net = access_rights, + .quiet_access_net = conn_right, + }; + const struct landlock_net_port_attr quiet_rule = { + .allowed_access = 0, + .port = self->srv2.port, + }; + struct audit_records records; + int ruleset_fd, sock_fd; + + ruleset_fd = + landlock_create_ruleset(&ruleset_attr, sizeof(ruleset_attr), 0); + ASSERT_LE(0, ruleset_fd); + ASSERT_EQ(0, landlock_add_rule(ruleset_fd, LANDLOCK_RULE_NET_PORT, + &quiet_rule, LANDLOCK_ADD_RULE_QUIET)); + enforce_ruleset(_metadata, ruleset_fd); + EXPECT_EQ(0, close(ruleset_fd)); + + sock_fd = socket_variant(&self->srv2); + ASSERT_LE(0, sock_fd); + EXPECT_EQ(-EACCES, connect_variant(sock_fd, &self->srv2)); + EXPECT_EQ(0, matches_auditlog(self->audit_fd, audit_evt, "daddr", + variant->addr, "dest", self->srv2.port)); + + EXPECT_EQ(0, audit_count_records(self->audit_fd, &records)); + EXPECT_EQ(0, records.access); + + EXPECT_EQ(0, close(sock_fd)); + + /* New layer that also denies connect but has the correct quiet bit. */ + ruleset_fd = landlock_create_ruleset(&ruleset_attr_2, + sizeof(ruleset_attr_2), 0); + ASSERT_LE(0, ruleset_fd); + ASSERT_EQ(0, landlock_add_rule(ruleset_fd, LANDLOCK_RULE_NET_PORT, + &quiet_rule, LANDLOCK_ADD_RULE_QUIET)); + enforce_ruleset(_metadata, ruleset_fd); + EXPECT_EQ(0, close(ruleset_fd)); + + sock_fd = socket_variant(&self->srv2); + ASSERT_LE(0, sock_fd); + EXPECT_EQ(-EACCES, connect_variant(sock_fd, &self->srv2)); + + /* Quieted - no logs expected. 
*/ + EXPECT_EQ(0, audit_count_records(self->audit_fd, &records)); + EXPECT_EQ(0, records.access); + + EXPECT_EQ(0, close(sock_fd)); +} + +static int matches_log_connect_bound(int audit_fd, const char *const blockers, + const char *const addr, __u16 lport, + __u16 dport) +{ + static const char log_template[] = REGEX_LANDLOCK_PREFIX + " blockers=%s laddr=%s lport=%u daddr=%s dest=%u$"; + /* Slack for the blockers, two addresses and two port numbers. */ + char log_match[sizeof(log_template) + 60]; + int log_match_len; + + log_match_len = snprintf(log_match, sizeof(log_match), log_template, + blockers, addr, lport, addr, dport); + if (log_match_len > sizeof(log_match)) + return -E2BIG; + + return audit_match_record(audit_fd, AUDIT_LANDLOCK_ACCESS, log_match, + NULL); +} + +/* + * After a bind() to an allowed port, a denied connect must report laddr/lport + * from the bound socket (made available through audit_net.sk) in addition to + * the connect sockaddr's daddr/dest. + */ +TEST_F(audit, connect_bound) +{ + const __u64 bind_right = (variant->prot.type == SOCK_STREAM ? + LANDLOCK_ACCESS_NET_BIND_TCP : + LANDLOCK_ACCESS_NET_BIND_UDP); + const __u64 conn_right = (variant->prot.type == SOCK_STREAM ? + LANDLOCK_ACCESS_NET_CONNECT_TCP : + LANDLOCK_ACCESS_NET_CONNECT_SEND_UDP); + const char *const audit_evt = (variant->prot.type == SOCK_STREAM ? + "net\\.connect_tcp" : + "net\\.connect_send_udp"); + const struct landlock_ruleset_attr ruleset_attr = { + .handled_access_net = bind_right | conn_right, + }; + const struct landlock_net_port_attr rule_bind = { + .allowed_access = bind_right, + .port = self->srv0.port, + }; + struct service_fixture srv_remote; + struct audit_records records; + int ruleset_fd, sock_fd; + + /* Uses a second port as the denied connect target. 
*/ + ASSERT_EQ(0, set_service(&srv_remote, variant->prot, 1)); + + ruleset_fd = + landlock_create_ruleset(&ruleset_attr, sizeof(ruleset_attr), 0); + ASSERT_LE(0, ruleset_fd); + ASSERT_EQ(0, landlock_add_rule(ruleset_fd, LANDLOCK_RULE_NET_PORT, + &rule_bind, 0)); + enforce_ruleset(_metadata, ruleset_fd); + EXPECT_EQ(0, close(ruleset_fd)); + + sock_fd = socket_variant(&self->srv0); + ASSERT_LE(0, sock_fd); + EXPECT_EQ(0, bind_variant(sock_fd, &self->srv0)); + EXPECT_EQ(-EACCES, connect_variant(sock_fd, &srv_remote)); + EXPECT_EQ(0, matches_log_connect_bound(self->audit_fd, audit_evt, + variant->addr, self->srv0.port, + srv_remote.port)); EXPECT_EQ(0, audit_count_records(self->audit_fd, &records)); EXPECT_EQ(0, records.access); @@ -2026,4 +3132,60 @@ TEST_F(audit, connect) EXPECT_EQ(0, close(sock_fd)); } +TEST_F(audit, sendmsg) +{ + const struct landlock_ruleset_attr ruleset_attr = { + .handled_access_net = LANDLOCK_ACCESS_NET_CONNECT_SEND_UDP | + LANDLOCK_ACCESS_NET_BIND_UDP, + }; + const struct landlock_net_port_attr rule = { + .allowed_access = LANDLOCK_ACCESS_NET_CONNECT_SEND_UDP, + .port = self->srv1.port, + }; + struct audit_records records; + int ruleset_fd; + int sock_fd; + + /* Sendmsg on stream sockets is never denied. 
*/ + if (variant->prot.type != SOCK_DGRAM) + return; + + ruleset_fd = + landlock_create_ruleset(&ruleset_attr, sizeof(ruleset_attr), 0); + ASSERT_LE(0, ruleset_fd); + ASSERT_EQ(0, landlock_add_rule(ruleset_fd, LANDLOCK_RULE_NET_PORT, + &rule, 0)); + enforce_ruleset(_metadata, ruleset_fd); + EXPECT_EQ(0, close(ruleset_fd)); + + sock_fd = socket_variant(&self->srv0); + ASSERT_LE(0, sock_fd); + EXPECT_EQ(-EACCES, sendto_variant(sock_fd, &self->srv0, "A", 1, 0)); + EXPECT_EQ(0, matches_auditlog(self->audit_fd, "net\\.connect_send_udp", + "daddr", variant->addr, "dest", + self->srv0.port)); + + EXPECT_EQ(0, audit_count_records(self->audit_fd, &records)); + EXPECT_EQ(0, records.access); + EXPECT_EQ(1, records.domain); + + /* Check that autobind generates a denied bind event. */ + EXPECT_EQ(-EACCES, sendto_variant(sock_fd, &self->srv1, "A", 1, 0)); + EXPECT_EQ(0, matches_auditlog(self->audit_fd, "net\\.bind_udp", NULL, + NULL, NULL, 0)); + EXPECT_EQ(0, audit_count_records(self->audit_fd, &records)); + EXPECT_EQ(0, records.access); + EXPECT_EQ(0, records.domain); + + EXPECT_EQ(-EACCES, + sendto_variant(sock_fd, &self->unspec_srv0, "B", 1, 0)); + EXPECT_EQ(0, matches_auditlog(self->audit_fd, "net\\.connect_send_udp", + "daddr", NULL, "dest", 0)); + EXPECT_EQ(0, audit_count_records(self->audit_fd, &records)); + EXPECT_EQ(0, records.access); + EXPECT_EQ(0, records.domain); + + EXPECT_EQ(0, close(sock_fd)); +} + TEST_HARNESS_MAIN diff --git a/tools/testing/selftests/landlock/ptrace_test.c b/tools/testing/selftests/landlock/ptrace_test.c index 1b6c8b53bf33..4f64c90583cd 100644 --- a/tools/testing/selftests/landlock/ptrace_test.c +++ b/tools/testing/selftests/landlock/ptrace_test.c @@ -342,6 +342,7 @@ TEST_F(audit, trace) /* Makes sure there is no superfluous logged records. 
*/ EXPECT_EQ(0, audit_count_records(self->audit_fd, &records)); EXPECT_EQ(0, records.access); + EXPECT_EQ(0, records.domain); yama_ptrace_scope = get_yama_ptrace_scope(); ASSERT_LE(0, yama_ptrace_scope); diff --git a/tools/testing/selftests/landlock/scoped_abstract_unix_test.c b/tools/testing/selftests/landlock/scoped_abstract_unix_test.c index c47491d2d1c1..40fc82fbf01d 100644 --- a/tools/testing/selftests/landlock/scoped_abstract_unix_test.c +++ b/tools/testing/selftests/landlock/scoped_abstract_unix_test.c @@ -293,6 +293,45 @@ FIXTURE_TEARDOWN_PARENT(scoped_audit) EXPECT_EQ(0, audit_cleanup(-1, NULL)); } +FIXTURE_VARIANT(scoped_audit) +{ + const __u64 scoped; + const __u64 quiet_scoped; +}; + +/* clang-format off */ +FIXTURE_VARIANT_ADD(scoped_audit, no_quiet) +{ + /* clang-format on */ + .scoped = LANDLOCK_SCOPE_ABSTRACT_UNIX_SOCKET, + .quiet_scoped = 0, +}; + +/* clang-format off */ +FIXTURE_VARIANT_ADD(scoped_audit, quiet_abstract_socket) +{ + /* clang-format on */ + .scoped = LANDLOCK_SCOPE_ABSTRACT_UNIX_SOCKET, + .quiet_scoped = LANDLOCK_SCOPE_ABSTRACT_UNIX_SOCKET, +}; + +/* clang-format off */ +FIXTURE_VARIANT_ADD(scoped_audit, quiet_abstract_socket_2) +{ + /* clang-format on */ + .scoped = LANDLOCK_SCOPE_ABSTRACT_UNIX_SOCKET | LANDLOCK_SCOPE_SIGNAL, + .quiet_scoped = LANDLOCK_SCOPE_ABSTRACT_UNIX_SOCKET | + LANDLOCK_SCOPE_SIGNAL, +}; + +/* clang-format off */ +FIXTURE_VARIANT_ADD(scoped_audit, quiet_unrelated) +{ + /* clang-format on */ + .scoped = LANDLOCK_SCOPE_ABSTRACT_UNIX_SOCKET | LANDLOCK_SCOPE_SIGNAL, + .quiet_scoped = LANDLOCK_SCOPE_SIGNAL, +}; + /* python -c 'print(b"\0selftests-landlock-abstract-unix-".hex().upper())' */ #define ABSTRACT_SOCKET_PATH_PREFIX \ "0073656C6674657374732D6C616E646C6F636B2D61627374726163742D756E69782D" @@ -308,10 +347,18 @@ TEST_F(scoped_audit, connect_to_child) char buf; int dgram_client; struct audit_records records; + int ruleset_fd; + const struct landlock_ruleset_attr ruleset_attr = { + .scoped = variant->scoped, + 
.quiet_scoped = variant->quiet_scoped, + }; + bool should_audit = + !(variant->quiet_scoped & LANDLOCK_SCOPE_ABSTRACT_UNIX_SOCKET); /* Makes sure there is no superfluous logged records. */ EXPECT_EQ(0, audit_count_records(self->audit_fd, &records)); EXPECT_EQ(0, records.access); + EXPECT_EQ(0, records.domain); ASSERT_EQ(0, pipe2(pipe_child, O_CLOEXEC)); ASSERT_EQ(0, pipe2(pipe_parent, O_CLOEXEC)); @@ -344,7 +391,14 @@ TEST_F(scoped_audit, connect_to_child) EXPECT_EQ(0, close(pipe_child[1])); EXPECT_EQ(0, close(pipe_parent[0])); - create_scoped_domain(_metadata, LANDLOCK_SCOPE_ABSTRACT_UNIX_SOCKET); + ruleset_fd = + landlock_create_ruleset(&ruleset_attr, sizeof(ruleset_attr), 0); + ASSERT_LE(0, ruleset_fd) + { + TH_LOG("Failed to create a ruleset: %s", strerror(errno)); + } + enforce_ruleset(_metadata, ruleset_fd); + EXPECT_EQ(0, close(ruleset_fd)); /* Signals that the parent is in a domain, if any. */ ASSERT_EQ(1, write(pipe_parent[1], ".", 1)); @@ -359,14 +413,20 @@ TEST_F(scoped_audit, connect_to_child) EXPECT_EQ(-1, err_dgram); EXPECT_EQ(EPERM, errno); - EXPECT_EQ( - 0, - audit_match_record( - self->audit_fd, AUDIT_LANDLOCK_ACCESS, - REGEX_LANDLOCK_PREFIX - " blockers=scope\\.abstract_unix_socket path=" ABSTRACT_SOCKET_PATH_PREFIX - "[0-9A-F]\\+$", - NULL)); + if (should_audit) { + EXPECT_EQ( + 0, + audit_match_record( + self->audit_fd, AUDIT_LANDLOCK_ACCESS, + REGEX_LANDLOCK_PREFIX + " blockers=scope\\.abstract_unix_socket path=" ABSTRACT_SOCKET_PATH_PREFIX + "[0-9A-F]\\+$", + NULL)); + } + + /* No other logs */ + EXPECT_EQ(0, audit_count_records(self->audit_fd, &records)); + EXPECT_EQ(0, records.access); ASSERT_EQ(1, write(pipe_parent[1], ".", 1)); EXPECT_EQ(0, close(dgram_client)); diff --git a/tools/testing/selftests/landlock/scoped_signal_test.c b/tools/testing/selftests/landlock/scoped_signal_test.c index d8bf33417619..f24f2c28f62e 100644 --- a/tools/testing/selftests/landlock/scoped_signal_test.c +++ b/tools/testing/selftests/landlock/scoped_signal_test.c 
@@ -559,4 +559,186 @@ TEST_F(fown, sigurg_socket) _metadata->exit_code = KSFT_FAIL; } +/* + * Checks that LANDLOCK_SCOPE_SIGNAL is enforced on the asynchronous SIGIO + * delivery path (fcntl(F_SETOWN)) when the file owner is a process group. + * + * A sandboxed process sitting at the head of its process group's PID hlist (the + * default position right after fork()) used to escape the fcntl(F_SETOWN, + * -pgrp) domain recording: pid_task(pgrp, PIDTYPE_PGID) resolved to the process + * itself, so the same-thread-group exemption skipped recording its Landlock + * domain. At SIGIO time that domain was then unset and the signal fanned out + * to every group member, including non-sandboxed processes outside the domain. + */ +TEST(sigio_to_pgid_members) +{ + int trigger[2], sync_child[2]; + char buf; + pid_t child; + int status, i; + + drop_caps(_metadata); + + /* + * Isolates the test in its own process group so the SIGIO fan-out stays + * bounded to this parent and the child forked below. + */ + ASSERT_EQ(0, setpgid(0, 0)); + + /* The non-sandboxed parent is the protected (out-of-domain) target. */ + ASSERT_EQ(0, setup_signal_handler(SIGURG)); + signal_received = 0; + + ASSERT_EQ(0, pipe2(trigger, O_CLOEXEC)); + ASSERT_EQ(0, pipe2(sync_child, O_CLOEXEC)); + + child = fork(); + ASSERT_LE(0, child); + if (child == 0) { + /* + * The child inherits the parent's new process group and, just + * attached with hlist_add_head_rcu(), is now the head of the + * pgid hlist: this is the case that used to skip the recording. + */ + EXPECT_EQ(0, close(sync_child[0])); + + /* In-domain positive control: the child must be signaled. */ + ASSERT_EQ(0, setup_signal_handler(SIGURG)); + signal_received = 0; + + create_scoped_domain(_metadata, LANDLOCK_SCOPE_SIGNAL); + + /* Owns the SIGIO source for the whole process group. 
*/ + ASSERT_EQ(0, fcntl(trigger[0], F_SETSIG, SIGURG)); + ASSERT_EQ(0, fcntl(trigger[0], F_SETOWN, -getpgrp())); + ASSERT_EQ(0, fcntl(trigger[0], F_SETFL, O_ASYNC)); + + /* Fans SIGURG out to every member of the process group. */ + ASSERT_EQ(1, write(trigger[1], ".", 1)); + + /* + * The sandboxed child is in its own domain and must always be + * signaled: this proves the SIGIO actually fired. + */ + for (i = 0; i < 1000 && !signal_received; i++) + usleep(1000); + EXPECT_EQ(1, signal_received); + + ASSERT_EQ(1, write(sync_child[1], ".", 1)); + EXPECT_EQ(0, close(sync_child[1])); + + _exit(_metadata->exit_code); + return; + } + EXPECT_EQ(0, close(sync_child[1])); + EXPECT_EQ(0, close(trigger[0])); + EXPECT_EQ(0, close(trigger[1])); + + /* Waits for the child to generate the SIGIO. */ + ASSERT_EQ(1, read(sync_child[0], &buf, 1)); + EXPECT_EQ(0, close(sync_child[0])); + + /* Lets a delivered-but-pending signal run our handler, if any. */ + for (i = 0; i < 100 && !signal_received; i++) + usleep(1000); + + /* + * SCOPE_SIGNAL must block the fan-out to this non-sandboxed parent, + * which is outside the child's Landlock domain. Before the fix the + * parent was signaled here. + */ + EXPECT_EQ(0, signal_received); + + ASSERT_EQ(child, waitpid(child, &status, 0)); + if (WIFSIGNALED(status) || !WIFEXITED(status) || + WEXITSTATUS(status) != EXIT_SUCCESS) + _metadata->exit_code = KSFT_FAIL; +} + +static void *thread_setown_scoped(void *arg) +{ + const int fd = *(int *)arg; + int ruleset_fd; + const struct landlock_ruleset_attr ruleset_attr = { + .scoped = LANDLOCK_SCOPE_SIGNAL, + }; + + /* Sandboxes only this non-leader thread (no thread syncing). 
*/ + ruleset_fd = + landlock_create_ruleset(&ruleset_attr, sizeof(ruleset_attr), 0); + if (ruleset_fd < 0) + return (void *)THREAD_ERROR; + if (prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0) || + landlock_restrict_self(ruleset_fd, 0)) { + close(ruleset_fd); + return (void *)THREAD_ERROR; + } + close(ruleset_fd); + + /* Makes this process group own the SIGIO source. */ + if (fcntl(fd, F_SETSIG, SIGURG) || fcntl(fd, F_SETOWN, -getpgrp()) || + fcntl(fd, F_SETFL, O_ASYNC)) + return (void *)THREAD_ERROR; + + return (void *)THREAD_SUCCESS; +} + +/* + * Checks that the SIGIO fan-out is still delivered to the file owner's own + * process when fcntl(F_SETOWN, -pgrp) was issued from a sandboxed non-leader + * thread. + * + * The Landlock domain is recorded for a process-group owner (so out-of-domain + * members stay blocked, see sigio_to_pgid_members), but the kernel signals a + * process group through its members' thread-group leaders. Here the leader is + * not sandboxed and thus has a different domain than the registering thread, so + * the registration-time check cannot tell that it belongs to the owner's own + * process. hook_file_send_sigiotask() must recognize it through the recorded + * thread group and allow the delivery, matching the same-process guarantee of + * commit 18eb75f3af40. Without that exemption the leader is wrongly denied and + * never signaled. + */ +TEST(sigio_to_pgid_self) +{ + int trigger[2]; + pthread_t thread; + enum thread_return ret = THREAD_INVALID; + int i; + + drop_caps(_metadata); + + /* Bounds the SIGIO fan-out to this process. */ + ASSERT_EQ(0, setpgid(0, 0)); + + /* The non-sandboxed thread-group leader is the SIGIO target. */ + ASSERT_EQ(0, setup_signal_handler(SIGURG)); + signal_received = 0; + + ASSERT_EQ(0, pipe2(trigger, O_CLOEXEC)); + + /* + * Registers the process-group fowner from a sibling thread that + * sandboxes only itself, so its domain differs from the leader's. 
+ */ + ASSERT_EQ(0, pthread_create(&thread, NULL, thread_setown_scoped, + &trigger[0])); + ASSERT_EQ(0, pthread_join(thread, (void **)&ret)); + ASSERT_EQ(THREAD_SUCCESS, ret); + + /* Fans SIGURG out to the process group. */ + ASSERT_EQ(1, write(trigger[1], ".", 1)); + + for (i = 0; i < 1000 && !signal_received; i++) + usleep(1000); + + /* + * Same-process delivery must always be allowed, even though the owner + * was registered from a sandboxed sibling thread. + */ + EXPECT_EQ(1, signal_received); + + EXPECT_EQ(0, close(trigger[0])); + EXPECT_EQ(0, close(trigger[1])); +} + TEST_HARNESS_MAIN diff --git a/tools/testing/selftests/livepatch/functions.sh b/tools/testing/selftests/livepatch/functions.sh index 8ec0cb64ad94..30dc677b2f45 100644 --- a/tools/testing/selftests/livepatch/functions.sh +++ b/tools/testing/selftests/livepatch/functions.sh @@ -4,6 +4,8 @@ # Shell functions for the rest of the scripts. +export LC_ALL=C + MAX_RETRIES=600 RETRY_INTERVAL=".1" # seconds SYSFS_KERNEL_DIR="/sys/kernel" @@ -339,6 +341,16 @@ function check_result { fi } +# does_sysfs_exist(modname, attr) - check sysfs attribute existence +# modname - livepatch module creating the sysfs interface +# attr - attribute name to be checked +function does_sysfs_exist() { + local mod="$1"; shift + local attr="$1"; shift + + [[ -f "$SYSFS_KLP_DIR/$mod/$attr" ]] +} + # check_sysfs_rights(modname, rel_path, expected_rights) - check sysfs # path permissions # modname - livepatch module creating the sysfs interface diff --git a/tools/testing/selftests/livepatch/test-kprobe.sh b/tools/testing/selftests/livepatch/test-kprobe.sh index b67dfad03d97..7ced4082cff3 100755 --- a/tools/testing/selftests/livepatch/test-kprobe.sh +++ b/tools/testing/selftests/livepatch/test-kprobe.sh @@ -20,11 +20,11 @@ start_test "livepatch interaction with kprobed function with post_handler" echo 1 > "$SYSFS_KPROBES_DIR/enabled" -load_mod $MOD_KPROBE has_post_handler=true +load_mod $MOD_KPROBE has_post_handler=y 
load_failing_mod $MOD_LIVEPATCH unload_mod $MOD_KPROBE -check_result "% insmod test_modules/test_klp_kprobe.ko has_post_handler=true +check_result "% insmod test_modules/test_klp_kprobe.ko has_post_handler=y % insmod test_modules/$MOD_LIVEPATCH.ko livepatch: enabling patch '$MOD_LIVEPATCH' livepatch: '$MOD_LIVEPATCH': initializing patching transition @@ -39,14 +39,14 @@ insmod: ERROR: could not insert module test_modules/$MOD_LIVEPATCH.ko: Device or start_test "livepatch interaction with kprobed function without post_handler" -load_mod $MOD_KPROBE has_post_handler=false +load_mod $MOD_KPROBE has_post_handler=n load_lp $MOD_LIVEPATCH unload_mod $MOD_KPROBE disable_lp $MOD_LIVEPATCH unload_lp $MOD_LIVEPATCH -check_result "% insmod test_modules/test_klp_kprobe.ko has_post_handler=false +check_result "% insmod test_modules/test_klp_kprobe.ko has_post_handler=n % insmod test_modules/$MOD_LIVEPATCH.ko livepatch: enabling patch '$MOD_LIVEPATCH' livepatch: '$MOD_LIVEPATCH': initializing patching transition diff --git a/tools/testing/selftests/livepatch/test-sysfs.sh b/tools/testing/selftests/livepatch/test-sysfs.sh index 58fe1d96997c..3b16285c6e67 100755 --- a/tools/testing/selftests/livepatch/test-sysfs.sh +++ b/tools/testing/selftests/livepatch/test-sysfs.sh @@ -8,6 +8,10 @@ MOD_LIVEPATCH=test_klp_livepatch MOD_LIVEPATCH2=test_klp_callbacks_demo MOD_LIVEPATCH3=test_klp_syscall +HAS_PATCH_ATTR=0 +HAS_REPLACE_ATTR=0 +HAS_STACK_ORDER_ATTR=0 + setup_config # - load a livepatch and verifies the sysfs entries work as expected @@ -20,13 +24,25 @@ check_sysfs_rights "$MOD_LIVEPATCH" "" "drwxr-xr-x" check_sysfs_rights "$MOD_LIVEPATCH" "enabled" "-rw-r--r--" check_sysfs_value "$MOD_LIVEPATCH" "enabled" "1" check_sysfs_rights "$MOD_LIVEPATCH" "force" "--w-------" -check_sysfs_rights "$MOD_LIVEPATCH" "replace" "-r--r--r--" -check_sysfs_rights "$MOD_LIVEPATCH" "stack_order" "-r--r--r--" -check_sysfs_value "$MOD_LIVEPATCH" "stack_order" "1" check_sysfs_rights "$MOD_LIVEPATCH" 
"transition" "-r--r--r--" check_sysfs_value "$MOD_LIVEPATCH" "transition" "0" -check_sysfs_rights "$MOD_LIVEPATCH" "vmlinux/patched" "-r--r--r--" -check_sysfs_value "$MOD_LIVEPATCH" "vmlinux/patched" "1" + +if does_sysfs_exist "$MOD_LIVEPATCH/vmlinux" "patched"; then + check_sysfs_rights "$MOD_LIVEPATCH" "vmlinux/patched" "-r--r--r--" + check_sysfs_value "$MOD_LIVEPATCH" "vmlinux/patched" "1" + HAS_PATCH_ATTR=1 +fi + +if does_sysfs_exist "$MOD_LIVEPATCH" "replace"; then + check_sysfs_rights "$MOD_LIVEPATCH" "replace" "-r--r--r--" + HAS_REPLACE_ATTR=1 +fi + +if does_sysfs_exist "$MOD_LIVEPATCH" "stack_order"; then + check_sysfs_rights "$MOD_LIVEPATCH" "stack_order" "-r--r--r--" + check_sysfs_value "$MOD_LIVEPATCH" "stack_order" "1" + HAS_STACK_ORDER_ATTR=1 +fi disable_lp $MOD_LIVEPATCH @@ -45,123 +61,127 @@ livepatch: '$MOD_LIVEPATCH': completing unpatching transition livepatch: '$MOD_LIVEPATCH': unpatching complete % rmmod $MOD_LIVEPATCH" -start_test "sysfs test object/patched" +if [[ "$HAS_PATCH_ATTR" == "1" ]]; then + start_test "sysfs test object/patched" -MOD_LIVEPATCH=test_klp_callbacks_demo -MOD_TARGET=test_klp_callbacks_mod -load_lp $MOD_LIVEPATCH + MOD_TARGET=test_klp_callbacks_mod + load_lp $MOD_LIVEPATCH2 -# check the "patch" file changes as target module loads/unloads -check_sysfs_value "$MOD_LIVEPATCH" "$MOD_TARGET/patched" "0" -load_mod $MOD_TARGET -check_sysfs_value "$MOD_LIVEPATCH" "$MOD_TARGET/patched" "1" -unload_mod $MOD_TARGET -check_sysfs_value "$MOD_LIVEPATCH" "$MOD_TARGET/patched" "0" + # check the "patch" file changes as target module loads/unloads + check_sysfs_value "$MOD_LIVEPATCH2" "$MOD_TARGET/patched" "0" + load_mod $MOD_TARGET + check_sysfs_value "$MOD_LIVEPATCH2" "$MOD_TARGET/patched" "1" + unload_mod $MOD_TARGET + check_sysfs_value "$MOD_LIVEPATCH2" "$MOD_TARGET/patched" "0" -disable_lp $MOD_LIVEPATCH -unload_lp $MOD_LIVEPATCH + disable_lp $MOD_LIVEPATCH2 + unload_lp $MOD_LIVEPATCH2 -check_result "% insmod 
test_modules/test_klp_callbacks_demo.ko -livepatch: enabling patch 'test_klp_callbacks_demo' -livepatch: 'test_klp_callbacks_demo': initializing patching transition -test_klp_callbacks_demo: pre_patch_callback: vmlinux -livepatch: 'test_klp_callbacks_demo': starting patching transition -livepatch: 'test_klp_callbacks_demo': completing patching transition -test_klp_callbacks_demo: post_patch_callback: vmlinux -livepatch: 'test_klp_callbacks_demo': patching complete -% insmod test_modules/test_klp_callbacks_mod.ko -livepatch: applying patch 'test_klp_callbacks_demo' to loading module 'test_klp_callbacks_mod' -test_klp_callbacks_demo: pre_patch_callback: test_klp_callbacks_mod -> [MODULE_STATE_COMING] Full formed, running module_init -test_klp_callbacks_demo: post_patch_callback: test_klp_callbacks_mod -> [MODULE_STATE_COMING] Full formed, running module_init -test_klp_callbacks_mod: test_klp_callbacks_mod_init -% rmmod test_klp_callbacks_mod -test_klp_callbacks_mod: test_klp_callbacks_mod_exit -test_klp_callbacks_demo: pre_unpatch_callback: test_klp_callbacks_mod -> [MODULE_STATE_GOING] Going away -livepatch: reverting patch 'test_klp_callbacks_demo' on unloading module 'test_klp_callbacks_mod' -test_klp_callbacks_demo: post_unpatch_callback: test_klp_callbacks_mod -> [MODULE_STATE_GOING] Going away -% echo 0 > $SYSFS_KLP_DIR/test_klp_callbacks_demo/enabled -livepatch: 'test_klp_callbacks_demo': initializing unpatching transition -test_klp_callbacks_demo: pre_unpatch_callback: vmlinux -livepatch: 'test_klp_callbacks_demo': starting unpatching transition -livepatch: 'test_klp_callbacks_demo': completing unpatching transition -test_klp_callbacks_demo: post_unpatch_callback: vmlinux -livepatch: 'test_klp_callbacks_demo': unpatching complete -% rmmod test_klp_callbacks_demo" - -start_test "sysfs test replace enabled" - -MOD_LIVEPATCH=test_klp_atomic_replace -load_lp $MOD_LIVEPATCH replace=1 - -check_sysfs_rights "$MOD_LIVEPATCH" "replace" "-r--r--r--" -check_sysfs_value 
"$MOD_LIVEPATCH" "replace" "1" + check_result "% insmod test_modules/$MOD_LIVEPATCH2.ko +livepatch: enabling patch '$MOD_LIVEPATCH2' +livepatch: '$MOD_LIVEPATCH2': initializing patching transition +$MOD_LIVEPATCH2: pre_patch_callback: vmlinux +livepatch: '$MOD_LIVEPATCH2': starting patching transition +livepatch: '$MOD_LIVEPATCH2': completing patching transition +$MOD_LIVEPATCH2: post_patch_callback: vmlinux +livepatch: '$MOD_LIVEPATCH2': patching complete +% insmod test_modules/$MOD_TARGET.ko +livepatch: applying patch '$MOD_LIVEPATCH2' to loading module '$MOD_TARGET' +$MOD_LIVEPATCH2: pre_patch_callback: $MOD_TARGET -> [MODULE_STATE_COMING] Full formed, running module_init +$MOD_LIVEPATCH2: post_patch_callback: $MOD_TARGET -> [MODULE_STATE_COMING] Full formed, running module_init +$MOD_TARGET: test_klp_callbacks_mod_init +% rmmod $MOD_TARGET +$MOD_TARGET: test_klp_callbacks_mod_exit +$MOD_LIVEPATCH2: pre_unpatch_callback: $MOD_TARGET -> [MODULE_STATE_GOING] Going away +livepatch: reverting patch '$MOD_LIVEPATCH2' on unloading module '$MOD_TARGET' +$MOD_LIVEPATCH2: post_unpatch_callback: $MOD_TARGET -> [MODULE_STATE_GOING] Going away +% echo 0 > $SYSFS_KLP_DIR/$MOD_LIVEPATCH2/enabled +livepatch: '$MOD_LIVEPATCH2': initializing unpatching transition +$MOD_LIVEPATCH2: pre_unpatch_callback: vmlinux +livepatch: '$MOD_LIVEPATCH2': starting unpatching transition +livepatch: '$MOD_LIVEPATCH2': completing unpatching transition +$MOD_LIVEPATCH2: post_unpatch_callback: vmlinux +livepatch: '$MOD_LIVEPATCH2': unpatching complete +% rmmod $MOD_LIVEPATCH2" +fi -disable_lp $MOD_LIVEPATCH -unload_lp $MOD_LIVEPATCH +if [[ "$HAS_REPLACE_ATTR" == "1" ]]; then + start_test "sysfs test replace enabled" -check_result "% insmod test_modules/$MOD_LIVEPATCH.ko replace=1 -livepatch: enabling patch '$MOD_LIVEPATCH' -livepatch: '$MOD_LIVEPATCH': initializing patching transition -livepatch: '$MOD_LIVEPATCH': starting patching transition -livepatch: '$MOD_LIVEPATCH': completing patching 
transition -livepatch: '$MOD_LIVEPATCH': patching complete -% echo 0 > $SYSFS_KLP_DIR/$MOD_LIVEPATCH/enabled -livepatch: '$MOD_LIVEPATCH': initializing unpatching transition -livepatch: '$MOD_LIVEPATCH': starting unpatching transition -livepatch: '$MOD_LIVEPATCH': completing unpatching transition -livepatch: '$MOD_LIVEPATCH': unpatching complete -% rmmod $MOD_LIVEPATCH" + MOD_ATOMIC_REPLACE=test_klp_atomic_replace + load_lp $MOD_ATOMIC_REPLACE replace=1 -start_test "sysfs test replace disabled" + check_sysfs_rights "$MOD_ATOMIC_REPLACE" "replace" "-r--r--r--" + check_sysfs_value "$MOD_ATOMIC_REPLACE" "replace" "1" -load_lp $MOD_LIVEPATCH replace=0 + disable_lp $MOD_ATOMIC_REPLACE + unload_lp $MOD_ATOMIC_REPLACE -check_sysfs_rights "$MOD_LIVEPATCH" "replace" "-r--r--r--" -check_sysfs_value "$MOD_LIVEPATCH" "replace" "0" + check_result "% insmod test_modules/$MOD_ATOMIC_REPLACE.ko replace=1 +livepatch: enabling patch '$MOD_ATOMIC_REPLACE' +livepatch: '$MOD_ATOMIC_REPLACE': initializing patching transition +livepatch: '$MOD_ATOMIC_REPLACE': starting patching transition +livepatch: '$MOD_ATOMIC_REPLACE': completing patching transition +livepatch: '$MOD_ATOMIC_REPLACE': patching complete +% echo 0 > $SYSFS_KLP_DIR/$MOD_ATOMIC_REPLACE/enabled +livepatch: '$MOD_ATOMIC_REPLACE': initializing unpatching transition +livepatch: '$MOD_ATOMIC_REPLACE': starting unpatching transition +livepatch: '$MOD_ATOMIC_REPLACE': completing unpatching transition +livepatch: '$MOD_ATOMIC_REPLACE': unpatching complete +% rmmod $MOD_ATOMIC_REPLACE" -disable_lp $MOD_LIVEPATCH -unload_lp $MOD_LIVEPATCH + start_test "sysfs test replace disabled" -check_result "% insmod test_modules/$MOD_LIVEPATCH.ko replace=0 -livepatch: enabling patch '$MOD_LIVEPATCH' -livepatch: '$MOD_LIVEPATCH': initializing patching transition -livepatch: '$MOD_LIVEPATCH': starting patching transition -livepatch: '$MOD_LIVEPATCH': completing patching transition -livepatch: '$MOD_LIVEPATCH': patching complete -% echo 0 > 
$SYSFS_KLP_DIR/$MOD_LIVEPATCH/enabled -livepatch: '$MOD_LIVEPATCH': initializing unpatching transition -livepatch: '$MOD_LIVEPATCH': starting unpatching transition -livepatch: '$MOD_LIVEPATCH': completing unpatching transition -livepatch: '$MOD_LIVEPATCH': unpatching complete -% rmmod $MOD_LIVEPATCH" + load_lp $MOD_ATOMIC_REPLACE replace=0 -start_test "sysfs test stack_order value" + check_sysfs_rights "$MOD_ATOMIC_REPLACE" "replace" "-r--r--r--" + check_sysfs_value "$MOD_ATOMIC_REPLACE" "replace" "0" -load_lp $MOD_LIVEPATCH + disable_lp $MOD_ATOMIC_REPLACE + unload_lp $MOD_ATOMIC_REPLACE -check_sysfs_value "$MOD_LIVEPATCH" "stack_order" "1" + check_result "% insmod test_modules/$MOD_ATOMIC_REPLACE.ko replace=0 +livepatch: enabling patch '$MOD_ATOMIC_REPLACE' +livepatch: '$MOD_ATOMIC_REPLACE': initializing patching transition +livepatch: '$MOD_ATOMIC_REPLACE': starting patching transition +livepatch: '$MOD_ATOMIC_REPLACE': completing patching transition +livepatch: '$MOD_ATOMIC_REPLACE': patching complete +% echo 0 > $SYSFS_KLP_DIR/$MOD_ATOMIC_REPLACE/enabled +livepatch: '$MOD_ATOMIC_REPLACE': initializing unpatching transition +livepatch: '$MOD_ATOMIC_REPLACE': starting unpatching transition +livepatch: '$MOD_ATOMIC_REPLACE': completing unpatching transition +livepatch: '$MOD_ATOMIC_REPLACE': unpatching complete +% rmmod $MOD_ATOMIC_REPLACE" +fi -load_lp $MOD_LIVEPATCH2 +if [[ "$HAS_STACK_ORDER_ATTR" == "1" ]]; then + start_test "sysfs test stack_order value" -check_sysfs_value "$MOD_LIVEPATCH2" "stack_order" "2" + load_lp $MOD_LIVEPATCH -load_lp $MOD_LIVEPATCH3 + check_sysfs_value "$MOD_LIVEPATCH" "stack_order" "1" -check_sysfs_value "$MOD_LIVEPATCH3" "stack_order" "3" + load_lp $MOD_LIVEPATCH2 -disable_lp $MOD_LIVEPATCH2 -unload_lp $MOD_LIVEPATCH2 + check_sysfs_value "$MOD_LIVEPATCH2" "stack_order" "2" -check_sysfs_value "$MOD_LIVEPATCH" "stack_order" "1" -check_sysfs_value "$MOD_LIVEPATCH3" "stack_order" "2" + load_lp $MOD_LIVEPATCH3 -disable_lp $MOD_LIVEPATCH3 
-unload_lp $MOD_LIVEPATCH3 + check_sysfs_value "$MOD_LIVEPATCH3" "stack_order" "3" -disable_lp $MOD_LIVEPATCH -unload_lp $MOD_LIVEPATCH + disable_lp $MOD_LIVEPATCH2 + unload_lp $MOD_LIVEPATCH2 -check_result "% insmod test_modules/$MOD_LIVEPATCH.ko + check_sysfs_value "$MOD_LIVEPATCH" "stack_order" "1" + check_sysfs_value "$MOD_LIVEPATCH3" "stack_order" "2" + + disable_lp $MOD_LIVEPATCH3 + unload_lp $MOD_LIVEPATCH3 + + disable_lp $MOD_LIVEPATCH + unload_lp $MOD_LIVEPATCH + + check_result "% insmod test_modules/$MOD_LIVEPATCH.ko livepatch: enabling patch '$MOD_LIVEPATCH' livepatch: '$MOD_LIVEPATCH': initializing patching transition livepatch: '$MOD_LIVEPATCH': starting patching transition @@ -201,5 +221,6 @@ livepatch: '$MOD_LIVEPATCH': starting unpatching transition livepatch: '$MOD_LIVEPATCH': completing unpatching transition livepatch: '$MOD_LIVEPATCH': unpatching complete % rmmod $MOD_LIVEPATCH" +fi exit 0 diff --git a/tools/testing/selftests/livepatch/test_modules/test_klp_syscall.c b/tools/testing/selftests/livepatch/test_modules/test_klp_syscall.c index dd802783ea84..08aacc0e14de 100644 --- a/tools/testing/selftests/livepatch/test_modules/test_klp_syscall.c +++ b/tools/testing/selftests/livepatch/test_modules/test_klp_syscall.c @@ -12,15 +12,26 @@ #include <linux/slab.h> #include <linux/livepatch.h> -#if defined(__x86_64__) -#define FN_PREFIX __x64_ -#elif defined(__s390x__) -#define FN_PREFIX __s390x_ -#elif defined(__aarch64__) -#define FN_PREFIX __arm64_ +/* + * Before CONFIG_ARCH_HAS_SYSCALL_WRAPPER was introduced there were no + * prefixes for system calls. + * powerpc selects this config depending on other configs, so it may or may not be enabled.
+ */ +#if defined(CONFIG_ARCH_HAS_SYSCALL_WRAPPER) + #if defined(__x86_64__) + #define FN_PREFIX __x64_ + #elif defined(__s390x__) + #define FN_PREFIX __s390x_ + #elif defined(__aarch64__) + #define FN_PREFIX __arm64_ + #elif defined(__powerpc__) + #define FN_PREFIX + #else + #error "Missing syscall wrapper for the given architecture." + #endif #else -/* powerpc does not select ARCH_HAS_SYSCALL_WRAPPER */ -#define FN_PREFIX + /* Do not set a prefix for architectures that do not enable wrappers. */ + #define FN_PREFIX #endif /* Protects klp_pids */ @@ -98,7 +109,11 @@ static int livepatch_init(void) */ npids = npids_pending; - return klp_enable_patch(&patch); + ret = klp_enable_patch(&patch); + if (ret) + kobject_put(klp_kobj); + + return ret; } static void livepatch_exit(void) diff --git a/tools/testing/selftests/liveupdate/Makefile b/tools/testing/selftests/liveupdate/Makefile index 080754787ede..30689d22cb02 100644 --- a/tools/testing/selftests/liveupdate/Makefile +++ b/tools/testing/selftests/liveupdate/Makefile @@ -6,6 +6,8 @@ TEST_GEN_PROGS += liveupdate TEST_GEN_PROGS_EXTENDED += luo_kexec_simple TEST_GEN_PROGS_EXTENDED += luo_multi_session +TEST_GEN_PROGS_EXTENDED += luo_stress_sessions +TEST_GEN_PROGS_EXTENDED += luo_stress_files TEST_FILES += do_kexec.sh diff --git a/tools/testing/selftests/liveupdate/liveupdate.c b/tools/testing/selftests/liveupdate/liveupdate.c index 37c808fbe1e9..502fb3567e38 100644 --- a/tools/testing/selftests/liveupdate/liveupdate.c +++ b/tools/testing/selftests/liveupdate/liveupdate.c @@ -26,6 +26,7 @@ #include <linux/liveupdate.h> +#include "luo_test_utils.h" #include "../kselftest.h" #include "../kselftest_harness.h" @@ -102,6 +103,22 @@ static int create_session(int lu_fd, const char *name) return args.fd; } +/* Helper function to get a session name via ioctl. 
*/ +static int get_session_name(int session_fd, char *name, size_t name_len) +{ + struct liveupdate_session_get_name args = {}; + + args.size = sizeof(args); + + if (ioctl(session_fd, LIVEUPDATE_SESSION_GET_NAME, &args)) + return -errno; + + strncpy(name, (char *)args.name, name_len - 1); + name[name_len - 1] = '\0'; + + return 0; +} + /* * Test Case: Create Duplicate Session * @@ -386,4 +403,175 @@ TEST_F(liveupdate_device, prevent_double_preservation) ASSERT_EQ(close(session_fd2), 0); } +/* + * Test Case: Create Session with No Null Termination + * + * Verifies that filling the entire 64-byte name field with non-null characters + * (no '\0' terminator) is rejected by the kernel with EINVAL. + */ +TEST_F(liveupdate_device, create_session_no_null_termination) +{ + struct liveupdate_ioctl_create_session args = {}; + + self->fd1 = open(LIVEUPDATE_DEV, O_RDWR); + if (self->fd1 < 0 && errno == ENOENT) + SKIP(return, "%s does not exist", LIVEUPDATE_DEV); + ASSERT_GE(self->fd1, 0); + + /* Fill entire name field with 'X', no null terminator */ + args.size = sizeof(args); + memset(args.name, 'X', sizeof(args.name)); + + EXPECT_LT(ioctl(self->fd1, LIVEUPDATE_IOCTL_CREATE_SESSION, &args), 0); + EXPECT_EQ(errno, EINVAL); +} + +/* + * Test Case: Create Session with Empty Name + * + * Verifies that creating a session with an empty string name fails + * with EINVAL. + */ +TEST_F(liveupdate_device, create_session_empty_name) +{ + int session_fd; + + self->fd1 = open(LIVEUPDATE_DEV, O_RDWR); + if (self->fd1 < 0 && errno == ENOENT) + SKIP(return, "%s does not exist", LIVEUPDATE_DEV); + ASSERT_GE(self->fd1, 0); + + session_fd = create_session(self->fd1, ""); + EXPECT_EQ(session_fd, -EINVAL); +} + +/* + * Test Case: Get Session Name + * + * Verifies that the full session name can be retrieved from a session file + * descriptor via ioctl. 
+ */ +TEST_F(liveupdate_device, get_session_name) +{ + char name_buf[LIVEUPDATE_SESSION_NAME_LENGTH] = {}; + const char *session_name = "get-name-test-session"; + int session_fd; + + self->fd1 = open(LIVEUPDATE_DEV, O_RDWR); + if (self->fd1 < 0 && errno == ENOENT) + SKIP(return, "%s does not exist", LIVEUPDATE_DEV); + ASSERT_GE(self->fd1, 0); + + session_fd = create_session(self->fd1, session_name); + ASSERT_GE(session_fd, 0); + + ASSERT_EQ(get_session_name(session_fd, name_buf, sizeof(name_buf)), 0); + ASSERT_STREQ(name_buf, session_name); + + ASSERT_EQ(close(session_fd), 0); +} + +/* + * Test Case: Get Session Name at Maximum Length + * + * Verifies that a session name using the full LIVEUPDATE_SESSION_NAME_LENGTH + * (minus the null terminator) can be correctly retrieved. + */ +TEST_F(liveupdate_device, get_session_name_max_length) +{ + char name_buf[LIVEUPDATE_SESSION_NAME_LENGTH] = {}; + char long_name[LIVEUPDATE_SESSION_NAME_LENGTH]; + int session_fd; + + memset(long_name, 'A', sizeof(long_name) - 1); + long_name[sizeof(long_name) - 1] = '\0'; + + self->fd1 = open(LIVEUPDATE_DEV, O_RDWR); + if (self->fd1 < 0 && errno == ENOENT) + SKIP(return, "%s does not exist", LIVEUPDATE_DEV); + ASSERT_GE(self->fd1, 0); + + session_fd = create_session(self->fd1, long_name); + ASSERT_GE(session_fd, 0); + + ASSERT_EQ(get_session_name(session_fd, name_buf, sizeof(name_buf)), 0); + ASSERT_STREQ(name_buf, long_name); + + ASSERT_EQ(close(session_fd), 0); +} + +/* + * Test Case: Manage Many Sessions + * + * Verifies that a large number of sessions can be created and then + * destroyed during normal system operation. This specifically tests the + * dynamic block allocation and reuse logic for session metadata management + * without preserving any files. 
+ */ +TEST_F(liveupdate_device, preserve_many_sessions) +{ +#define MANY_SESSIONS 2000 + int session_fds[MANY_SESSIONS]; + int ret, i; + + self->fd1 = open(LIVEUPDATE_DEV, O_RDWR); + if (self->fd1 < 0 && errno == ENOENT) + SKIP(return, "%s does not exist", LIVEUPDATE_DEV); + ASSERT_GE(self->fd1, 0); + + ret = luo_ensure_nofile_limit(MANY_SESSIONS); + if (ret == -EPERM) + SKIP(return, "Insufficient privileges to set RLIMIT_NOFILE"); + ASSERT_EQ(ret, 0); + + for (i = 0; i < MANY_SESSIONS; i++) { + char name[64]; + + snprintf(name, sizeof(name), "many-session-%d", i); + session_fds[i] = create_session(self->fd1, name); + ASSERT_GE(session_fds[i], 0); + } + + for (i = 0; i < MANY_SESSIONS; i++) + ASSERT_EQ(close(session_fds[i]), 0); +} + +/* + * Test Case: Preserve Many Files + * + * Verifies that a large number of files can be preserved in a single session + * and then destroyed during normal system operation. This tests the dynamic + * block allocation and management for outgoing files. + */ +TEST_F(liveupdate_device, preserve_many_files) +{ +#define MANY_FILES 500 + int mem_fds[MANY_FILES]; + int session_fd, ret, i; + + self->fd1 = open(LIVEUPDATE_DEV, O_RDWR); + if (self->fd1 < 0 && errno == ENOENT) + SKIP(return, "%s does not exist", LIVEUPDATE_DEV); + ASSERT_GE(self->fd1, 0); + + session_fd = create_session(self->fd1, "many-files-test"); + ASSERT_GE(session_fd, 0); + + ret = luo_ensure_nofile_limit(MANY_FILES + 10); + if (ret == -EPERM) + SKIP(return, "Insufficient privileges to set RLIMIT_NOFILE"); + ASSERT_EQ(ret, 0); + + for (i = 0; i < MANY_FILES; i++) { + mem_fds[i] = memfd_create("test-memfd", 0); + ASSERT_GE(mem_fds[i], 0); + ASSERT_EQ(preserve_fd(session_fd, mem_fds[i], i), 0); + } + + for (i = 0; i < MANY_FILES; i++) + ASSERT_EQ(close(mem_fds[i]), 0); + + ASSERT_EQ(close(session_fd), 0); +} + TEST_HARNESS_MAIN diff --git a/tools/testing/selftests/liveupdate/luo_stress_files.c b/tools/testing/selftests/liveupdate/luo_stress_files.c new file mode 100644 
index 000000000000..0cdf9cd4bac7 --- /dev/null +++ b/tools/testing/selftests/liveupdate/luo_stress_files.c @@ -0,0 +1,97 @@ +// SPDX-License-Identifier: GPL-2.0-only + +/* + * Copyright (c) 2026, Google LLC. + * Pasha Tatashin <pasha.tatashin@soleen.com> + * + * Validate that LUO can handle a large number of files per session across + * a kexec reboot. + */ + +#include <stdio.h> +#include <unistd.h> +#include "luo_test_utils.h" + +#define NUM_FILES 500 +#define STATE_SESSION_NAME "kexec_many_files_state" +#define STATE_MEMFD_TOKEN 9999 +#define TEST_SESSION_NAME "many_files_session" + +/* Stage 1: Executed before the kexec reboot. */ +static void run_stage_1(int luo_fd) +{ + int session_fd, i; + + ksft_print_msg("[STAGE 1] Creating state file for next stage (2)...\n"); + create_state_file(luo_fd, STATE_SESSION_NAME, STATE_MEMFD_TOKEN, 2); + + ksft_print_msg("[STAGE 1] Creating test session '%s'...\n", TEST_SESSION_NAME); + session_fd = luo_create_session(luo_fd, TEST_SESSION_NAME); + if (session_fd < 0) + fail_exit("luo_create_session"); + + ksft_print_msg("[STAGE 1] Preserving %d files...\n", NUM_FILES); + for (i = 0; i < NUM_FILES; i++) { + char data[64]; + + snprintf(data, sizeof(data), "file-data-%d", i); + if (create_and_preserve_memfd(session_fd, i, data) < 0) + fail_exit("create_and_preserve_memfd for index %d", i); + } + + ksft_print_msg("[STAGE 1] Successfully preserved %d files.\n", NUM_FILES); + + close(luo_fd); + daemonize_and_wait(); +} + +/* Stage 2: Executed after the kexec reboot. 
*/ +static void run_stage_2(int luo_fd, int state_session_fd) +{ + int session_fd; + int i, stage; + + ksft_print_msg("[STAGE 2] Starting post-kexec verification...\n"); + + restore_and_read_stage(state_session_fd, STATE_MEMFD_TOKEN, &stage); + if (stage != 2) { + fail_exit("Expected stage 2, but state file contains %d", + stage); + } + + ksft_print_msg("[STAGE 2] Retrieving test session '%s'...\n", TEST_SESSION_NAME); + session_fd = luo_retrieve_session(luo_fd, TEST_SESSION_NAME); + if (session_fd < 0) + fail_exit("luo_retrieve_session"); + + ksft_print_msg("[STAGE 2] Verifying %d files...\n", NUM_FILES); + for (i = 0; i < NUM_FILES; i++) { + char data[64]; + int fd; + + snprintf(data, sizeof(data), "file-data-%d", i); + fd = restore_and_verify_memfd(session_fd, i, data); + if (fd < 0) + fail_exit("restore_and_verify_memfd for index %d", i); + close(fd); + } + + ksft_print_msg("[STAGE 2] Finishing test session...\n"); + if (luo_session_finish(session_fd) < 0) + fail_exit("luo_session_finish for test session"); + close(session_fd); + + ksft_print_msg("[STAGE 2] Finalizing state session...\n"); + if (luo_session_finish(state_session_fd) < 0) + fail_exit("luo_session_finish for state session"); + close(state_session_fd); + + ksft_print_msg("\n--- MANY-FILES KEXEC TEST PASSED (%d files) ---\n", + NUM_FILES); +} + +int main(int argc, char *argv[]) +{ + return luo_test(argc, argv, STATE_SESSION_NAME, + run_stage_1, run_stage_2); +} diff --git a/tools/testing/selftests/liveupdate/luo_stress_sessions.c b/tools/testing/selftests/liveupdate/luo_stress_sessions.c new file mode 100644 index 000000000000..f201b1839d1d --- /dev/null +++ b/tools/testing/selftests/liveupdate/luo_stress_sessions.c @@ -0,0 +1,102 @@ +// SPDX-License-Identifier: GPL-2.0-only + +/* + * Copyright (c) 2026, Google LLC. + * Pasha Tatashin <pasha.tatashin@soleen.com> + * + * Validate that LUO can handle a large number of sessions across a kexec + * reboot. 
+ */ + +#include <stdio.h> +#include <unistd.h> +#include "luo_test_utils.h" + +#define NUM_SESSIONS 2000 +#define STATE_SESSION_NAME "kexec_many_state" +#define STATE_MEMFD_TOKEN 999 + +/* Stage 1: Executed before the kexec reboot. */ +static void run_stage_1(int luo_fd) +{ + int ret, i; + + ksft_print_msg("[STAGE 1] Increasing ulimit for open files...\n"); + ret = luo_ensure_nofile_limit(NUM_SESSIONS); + if (ret == -EPERM) + ksft_exit_skip("Insufficient privileges to set RLIMIT_NOFILE\n"); + if (ret < 0) + ksft_exit_fail_msg("luo_ensure_nofile_limit failed: %s\n", strerror(-ret)); + + ksft_print_msg("[STAGE 1] Creating state file for next stage (2)...\n"); + create_state_file(luo_fd, STATE_SESSION_NAME, STATE_MEMFD_TOKEN, 2); + + ksft_print_msg("[STAGE 1] Creating %d sessions...\n", NUM_SESSIONS); + + for (i = 0; i < NUM_SESSIONS; i++) { + char name[LIVEUPDATE_SESSION_NAME_LENGTH]; + int s_fd; + + snprintf(name, sizeof(name), "many-test-%d", i); + s_fd = luo_create_session(luo_fd, name); + if (s_fd < 0) { + fail_exit("luo_create_session for '%s' at index %d", + name, i); + } + } + + ksft_print_msg("[STAGE 1] Successfully created %d sessions.\n", + NUM_SESSIONS); + + close(luo_fd); + daemonize_and_wait(); +} + +/* Stage 2: Executed after the kexec reboot. 
*/ +static void run_stage_2(int luo_fd, int state_session_fd) +{ + int i, stage; + + ksft_print_msg("[STAGE 2] Starting post-kexec verification...\n"); + + restore_and_read_stage(state_session_fd, STATE_MEMFD_TOKEN, &stage); + if (stage != 2) { + fail_exit("Expected stage 2, but state file contains %d", + stage); + } + + ksft_print_msg("[STAGE 2] Retrieving and finishing %d sessions...\n", + NUM_SESSIONS); + + for (i = 0; i < NUM_SESSIONS; i++) { + char name[LIVEUPDATE_SESSION_NAME_LENGTH]; + int s_fd; + + snprintf(name, sizeof(name), "many-test-%d", i); + s_fd = luo_retrieve_session(luo_fd, name); + if (s_fd < 0) { + fail_exit("luo_retrieve_session for '%s' at index %d", + name, i); + } + + if (luo_session_finish(s_fd) < 0) { + fail_exit("luo_session_finish for '%s' at index %d", + name, i); + } + close(s_fd); + } + + ksft_print_msg("[STAGE 2] Finalizing state session...\n"); + if (luo_session_finish(state_session_fd) < 0) + fail_exit("luo_session_finish for state session"); + close(state_session_fd); + + ksft_print_msg("\n--- MANY-SESSIONS KEXEC TEST PASSED (%d sessions) ---\n", + NUM_SESSIONS); +} + +int main(int argc, char *argv[]) +{ + return luo_test(argc, argv, STATE_SESSION_NAME, + run_stage_1, run_stage_2); +} diff --git a/tools/testing/selftests/liveupdate/luo_test_utils.c b/tools/testing/selftests/liveupdate/luo_test_utils.c index 3c8721c505df..333a3530051b 100644 --- a/tools/testing/selftests/liveupdate/luo_test_utils.c +++ b/tools/testing/selftests/liveupdate/luo_test_utils.c @@ -17,6 +17,7 @@ #include <sys/syscall.h> #include <sys/mman.h> #include <sys/types.h> +#include <sys/resource.h> #include <sys/stat.h> #include <errno.h> #include <stdarg.h> @@ -28,6 +29,29 @@ int luo_open_device(void) return open(LUO_DEVICE, O_RDWR); } +int luo_ensure_nofile_limit(long min_limit) +{ + struct rlimit hl; + + /* Allow for extra files used by the test itself */ + min_limit += 32; + + if (getrlimit(RLIMIT_NOFILE, &hl) < 0) + return -errno; + + if (hl.rlim_cur >= 
min_limit) + return 0; + + hl.rlim_cur = min_limit; + if (hl.rlim_cur > hl.rlim_max) + hl.rlim_max = hl.rlim_cur; + + if (setrlimit(RLIMIT_NOFILE, &hl) < 0) + return -errno; + + return 0; +} + int luo_create_session(int luo_fd, const char *name) { struct liveupdate_ioctl_create_session arg = { .size = sizeof(arg) }; diff --git a/tools/testing/selftests/liveupdate/luo_test_utils.h b/tools/testing/selftests/liveupdate/luo_test_utils.h index 90099bf49577..6a0d85386613 100644 --- a/tools/testing/selftests/liveupdate/luo_test_utils.h +++ b/tools/testing/selftests/liveupdate/luo_test_utils.h @@ -26,6 +26,8 @@ int luo_create_session(int luo_fd, const char *name); int luo_retrieve_session(int luo_fd, const char *name); int luo_session_finish(int session_fd); +int luo_ensure_nofile_limit(long min_limit); + int create_and_preserve_memfd(int session_fd, int token, const char *data); int restore_and_verify_memfd(int session_fd, int token, const char *expected_data); diff --git a/tools/testing/selftests/lkdtm/tests.txt b/tools/testing/selftests/lkdtm/tests.txt index 3245032db34d..d8180bbe31e8 100644 --- a/tools/testing/selftests/lkdtm/tests.txt +++ b/tools/testing/selftests/lkdtm/tests.txt @@ -86,3 +86,4 @@ FORTIFY_STR_MEMBER detected buffer overflow FORTIFY_MEM_OBJECT detected buffer overflow FORTIFY_MEM_MEMBER detected field-spanning write PPC_SLB_MULTIHIT Recovered +#PPC_RADIX_TLBIEL Triggers unrecoverable MCE diff --git a/tools/testing/selftests/membarrier/Makefile b/tools/testing/selftests/membarrier/Makefile index fc840e06ff56..829f95c83515 100644 --- a/tools/testing/selftests/membarrier/Makefile +++ b/tools/testing/selftests/membarrier/Makefile @@ -1,8 +1,9 @@ # SPDX-License-Identifier: GPL-2.0-only -CFLAGS += -g $(KHDR_INCLUDES) +CFLAGS += -g $(KHDR_INCLUDES) -pthread -I../../../../tools/include LDLIBS += -lpthread TEST_GEN_PROGS := membarrier_test_single_thread \ - membarrier_test_multi_thread + membarrier_test_multi_thread \ + membarrier_rseq_stress include ../lib.mk 
diff --git a/tools/testing/selftests/membarrier/membarrier_rseq_stress.c b/tools/testing/selftests/membarrier/membarrier_rseq_stress.c new file mode 100644 index 000000000000..c188d7498610 --- /dev/null +++ b/tools/testing/selftests/membarrier/membarrier_rseq_stress.c @@ -0,0 +1,951 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Membarrier stress test for CFS throttle interactions. + * + * Reproducer for the interaction between CFS throttle and expedited membarrier. + */ + +#ifndef _GNU_SOURCE +#define _GNU_SOURCE +#endif +#include <stdio.h> +#include <stdlib.h> +#include <string.h> +#include <unistd.h> +#include <pthread.h> +#include <syscall.h> +#include <sys/stat.h> +#include <sys/types.h> +#include <sys/wait.h> +#include <fcntl.h> +#include <stdint.h> +#include <errno.h> +#include <sched.h> +#include <time.h> +#include <signal.h> +#include <stdatomic.h> +#include <dirent.h> +#include <sys/prctl.h> +#include <sys/mman.h> + +#include "../kselftest.h" + +/* -- Architecture-specific rseq signature -- */ +#if defined(__x86_64__) || defined(__i386__) +# define RSEQ_SIG 0x53053053U +#elif defined(__aarch64__) +# define RSEQ_SIG 0xd428bc00U +#elif defined(__powerpc__) || defined(__powerpc64__) +# define RSEQ_SIG 0x0f000000U +#elif defined(__s390__) || defined(__s390x__) +# define RSEQ_SIG 0x0c000000U +#else +# define RSEQ_SIG 0 +# define UNSUPPORTED_ARCH 1 +#endif + +/* -- rseq ABI (kernel uapi; define locally for portability) -- */ +#define RSEQ_CPU_ID_UNINITIALIZED ((__u32)-1) + +#include <linux/compiler.h> + +struct rseq_abi { + __u32 cpu_id_start; + __u32 cpu_id; + __u64 rseq_cs; + __u32 flags; + __u32 node_id; + __u32 mm_cid; + char end[0]; +} __aligned(32); + +/* -- membarrier constants (not in all distro headers) -- */ +#ifndef MEMBARRIER_CMD_PRIVATE_EXPEDITED_RSEQ +# define MEMBARRIER_CMD_PRIVATE_EXPEDITED_RSEQ (1 << 7) +#endif +#ifndef MEMBARRIER_CMD_REGISTER_PRIVATE_EXPEDITED_RSEQ +# define MEMBARRIER_CMD_REGISTER_PRIVATE_EXPEDITED_RSEQ (1 << 8) +#endif 
+#ifndef MEMBARRIER_CMD_FLAG_CPU +# define MEMBARRIER_CMD_FLAG_CPU (1 << 0) +#endif + +/* -- Test parameters -- */ +#define N_SIBLINGS 2000 +#define NEST_DEPTH 5 +static char g_cgroup_path[4096]; +static int use_cgroup_v2; + +#define CFS_QUOTA_US 1000 +#define CFS_PERIOD_US 5000 +#define N_HAMMER_PER_CPU 25 +#define N_BURNER_PER_CPU 50 +#define MAX_STRESS_CPUS 1024 +#define TEST_DURATION_SEC 20 + +/* Latency thresholds for the sentinel */ +#define LATENCY_WARN_MS 50 +#define LATENCY_CRITICAL_MS 200 + +/* Sentinel sampling interval */ +#define SENTINEL_INTERVAL_US 500 + +/* -- Shared globals -- */ +static atomic_int g_stop; +static atomic_int g_stop_sentinel; +static atomic_long g_max_latency_us; +static atomic_long g_interval_max_latency_us; +static atomic_long g_mb_ok; +static atomic_long g_mb_err; +static int g_ncpus_stress; +static int *g_stress_cpus; + +static atomic_int g_test_ready; + +/* Per-thread rseq ABI block registered with the kernel */ +static __thread struct rseq_abi tls_rseq + __attribute__((tls_model("initial-exec"))) __aligned(32) = { + .cpu_id = RSEQ_CPU_ID_UNINITIALIZED, +}; + +/* -- Utility -- */ +/* + * Write the string @val to the already existing file @path. + * + * Returns 0 on success, -errno if open() or write() fails, or -EIO when + * write() stores fewer bytes than strlen(@val). + */ +static int write_file(const char *path, const char *val) +{ + int fd = open(path, O_WRONLY | O_CLOEXEC); + + if (fd < 0) + return -errno; + + size_t len = strlen(val); + ssize_t r = write(fd, val, len); + /* Capture errno now: the close() below may overwrite it */ + int err = errno; + + close(fd); + if (r < 0) + return -err; + if ((size_t)r != len) + return -EIO; + return 0; +} + +static uint64_t monotonic_us(void) +{ + struct timespec ts; + + clock_gettime(CLOCK_MONOTONIC, &ts); + return (uint64_t)ts.tv_sec * 1000000ULL + ts.tv_nsec / 1000ULL; +} + +static void update_max_latency(long lat) +{ + long old = atomic_load_explicit(&g_max_latency_us, memory_order_relaxed); + + while (lat > old) { + if (atomic_compare_exchange_weak_explicit(&g_max_latency_us, &old, lat, + memory_order_relaxed, memory_order_relaxed)) + break; + } + + old = atomic_load_explicit(&g_interval_max_latency_us, memory_order_relaxed); + while (lat > old) { + 
if (atomic_compare_exchange_weak_explicit(&g_interval_max_latency_us, &old, lat, + memory_order_relaxed, memory_order_relaxed)) + break; + } +} + +static void init_stress_cpus(void) +{ + cpu_set_t set; + int capacity = MAX_STRESS_CPUS; + + g_stress_cpus = malloc(capacity * sizeof(int)); + if (!g_stress_cpus) + ksft_exit_fail_msg("malloc failed for g_stress_cpus\n"); + + if (sched_getaffinity(0, sizeof(set), &set) < 0) + ksft_exit_fail_msg("sched_getaffinity failed\n"); + + for (int i = 0; i < CPU_SETSIZE && g_ncpus_stress < capacity; i++) { + if (CPU_ISSET(i, &set)) + g_stress_cpus[g_ncpus_stress++] = i; + } + + if (g_ncpus_stress == 0) + ksft_exit_skip("No CPUs available for stress test\n"); + + ksft_print_msg("Stressing %d CPUs discovered via affinity\n", g_ncpus_stress); +} + +/* -- rseq / membarrier helpers -- */ +static int rseq_register_thread(void) +{ + int r = syscall(SYS_rseq, &tls_rseq, sizeof(tls_rseq), 0, RSEQ_SIG); + + return (r == 0 || errno == EBUSY || errno == EINVAL) ? 0 : -1; +} + +static int rseq_register_thread_at(struct rseq_abi *rseq) +{ + int r = syscall(SYS_rseq, rseq, sizeof(*rseq), 0, RSEQ_SIG); + + return (r == 0 || errno == EBUSY || errno == EINVAL) ? 
0 : -1; +} + +static int membarrier_register_rseq_mm(void) +{ + return syscall(SYS_membarrier, + MEMBARRIER_CMD_REGISTER_PRIVATE_EXPEDITED_RSEQ, 0, 0); +} + +/* -- cgroup helpers -- */ +static void rm_cgroup_recursive(const char *path) +{ + DIR *dir = opendir(path); + + if (!dir) + return; + struct dirent *entry; + + while ((entry = readdir(dir)) != NULL) { + if (strcmp(entry->d_name, ".") == 0 || strcmp(entry->d_name, "..") == 0) + continue; + if (entry->d_type == DT_DIR) { + char sub_path[4096]; + + snprintf(sub_path, sizeof(sub_path), "%s/%s", path, entry->d_name); + rm_cgroup_recursive(sub_path); + } + } + closedir(dir); + rmdir(path); +} + +static void cgroup_teardown(void); + +static int cgroup_setup(void) +{ + struct stat st; + + if (stat("/sys/fs/cgroup/cpu", &st) == 0) { + use_cgroup_v2 = 0; + snprintf(g_cgroup_path, sizeof(g_cgroup_path), + "/sys/fs/cgroup/cpu/membarrier_stress_test"); + } else if (stat("/dev/cgroup/cpu", &st) == 0) { + use_cgroup_v2 = 0; + snprintf(g_cgroup_path, sizeof(g_cgroup_path), + "/dev/cgroup/cpu/membarrier_stress_test"); + } else if (stat("/cgroup/cpu", &st) == 0) { + use_cgroup_v2 = 0; + snprintf(g_cgroup_path, sizeof(g_cgroup_path), + "/cgroup/cpu/membarrier_stress_test"); + } else if (stat("/sys/fs/cgroup/cgroup.controllers", &st) == 0) { + use_cgroup_v2 = 1; + snprintf(g_cgroup_path, sizeof(g_cgroup_path), + "/sys/fs/cgroup/membarrier_stress_test"); + } else { + ksft_print_msg("WARN: cgroup mount not found. 
Using v2 at /sys/fs/cgroup\n"); + use_cgroup_v2 = 1; + snprintf(g_cgroup_path, sizeof(g_cgroup_path), + "/sys/fs/cgroup/membarrier_stress_test"); + } + + /* Robust cleanup before setup */ + cgroup_teardown(); + + if (use_cgroup_v2) { + /* Enable cpu controller in root cgroup */ + if (write_file("/sys/fs/cgroup/cgroup.subtree_control", "+cpu") < 0) + ksft_print_msg("WARN: failed to enable cpu controller in /sys/fs/cgroup\n"); + } + + if (mkdir(g_cgroup_path, 0755) < 0 && errno != EEXIST) { + ksft_print_msg("mkdir base %s failed: %s\n", g_cgroup_path, strerror(errno)); + return -1; + } + + if (use_cgroup_v2) { + char ctrl_path[4096]; + + snprintf(ctrl_path, sizeof(ctrl_path), "%s/cgroup.subtree_control", g_cgroup_path); + if (write_file(ctrl_path, "+cpu") < 0) + ksft_print_msg("WARN: failed to enable cpu controller in %s\n", + g_cgroup_path); + } + + for (int i = 0; i < N_SIBLINGS; i++) { + char sibling_path[4096]; + + snprintf(sibling_path, sizeof(sibling_path), "%s/n%d", g_cgroup_path, i); + if (mkdir(sibling_path, 0755) < 0 && errno != EEXIST) { + ksft_print_msg("mkdir wide %s failed: %s\n", sibling_path, strerror(errno)); + return -1; + } + + if (use_cgroup_v2) { + char ctrl_path[4096]; + + snprintf(ctrl_path, sizeof(ctrl_path), + "%s/cgroup.subtree_control", sibling_path); + if (write_file(ctrl_path, "+cpu") < 0) + ksft_print_msg("WARN: failed to enable cpu controller in %s\n", + sibling_path); + } + + char current_path[4096]; + + snprintf(current_path, sizeof(current_path), "%s", sibling_path); + for (int j = 0; j < NEST_DEPTH; j++) { + snprintf(current_path + strlen(current_path), + sizeof(current_path) - strlen(current_path), "/d%d", j); + if (mkdir(current_path, 0755) < 0 && errno != EEXIST) { + ksft_print_msg("mkdir deep %s failed: %s\n", + current_path, strerror(errno)); + return -1; + } + + /* Enable for all but the leaf */ + if (use_cgroup_v2 && j < NEST_DEPTH - 1) { + char ctrl_path[4096]; + + snprintf(ctrl_path, sizeof(ctrl_path), 
"%s/cgroup.subtree_control", + current_path); + if (write_file(ctrl_path, "+cpu") < 0) + ksft_print_msg("WARN: cannot enable cpu controller in %s\n", + current_path); + } + } + } + + char quota[64], period[64], max_str[128]; + + snprintf(quota, sizeof(quota), "%d", CFS_QUOTA_US); + snprintf(period, sizeof(period), "%d", CFS_PERIOD_US); + snprintf(max_str, sizeof(max_str), "%d %d", CFS_QUOTA_US, CFS_PERIOD_US); + + if (use_cgroup_v2) { + char max_path[4096]; + + snprintf(max_path, sizeof(max_path), "%s/cpu.max", g_cgroup_path); + if (write_file(max_path, max_str) < 0) { + ksft_print_msg("ERROR: cannot write cpu.max at %s\n", max_path); + return -1; + } + ksft_print_msg("cgroup (v2) %s: cpu.max=%s\n", g_cgroup_path, max_str); + } else { + char quota_path[4096], period_path[4096]; + + snprintf(quota_path, sizeof(quota_path), "%s/cpu.cfs_quota_us", g_cgroup_path); + snprintf(period_path, sizeof(period_path), "%s/cpu.cfs_period_us", g_cgroup_path); + + if (write_file(period_path, period) < 0) { + ksft_print_msg("ERROR: cannot write cpu.cfs_period_us at %s\n", + period_path); + return -1; + } + if (write_file(quota_path, quota) < 0) { + ksft_print_msg("ERROR: cannot write cpu.cfs_quota_us at %s\n", quota_path); + return -1; + } + ksft_print_msg("cgroup (v1) %s: cpu.cfs_quota_us=%d cpu.cfs_period_us=%d\n", + g_cgroup_path, CFS_QUOTA_US, CFS_PERIOD_US); + } + + return 0; +} + +static int cgroup_add_pid_to_path(pid_t pid, const char *path) +{ + char buf[32], file_path[4096]; + + snprintf(buf, sizeof(buf), "%d", (int)pid); + if (use_cgroup_v2) { + snprintf(file_path, sizeof(file_path), "%s/cgroup.procs", path); + return write_file(file_path, buf); + } + /* In v1, try tasks first, fallback to cgroup.procs */ + snprintf(file_path, sizeof(file_path), "%s/tasks", path); + int r = write_file(file_path, buf); + + if (r < 0) { + snprintf(file_path, sizeof(file_path), "%s/cgroup.procs", path); + r = write_file(file_path, buf); + } + return r; +} + +static void cgroup_teardown(void) 
+{ + rm_cgroup_recursive(g_cgroup_path); +} + +static void cgroup_unthrottle(void) +{ + if (use_cgroup_v2) { + char max_path[4096]; + + snprintf(max_path, sizeof(max_path), "%s/cpu.max", g_cgroup_path); + write_file(max_path, "max"); + } else { + char quota_path[4096]; + + snprintf(quota_path, sizeof(quota_path), "%s/cpu.cfs_quota_us", g_cgroup_path); + write_file(quota_path, "-1"); + } +} + +/* -- CPU burner (inside throttled child process) -- */ +static void *burner_thread_fn(void *arg) +{ + struct rseq_abi my_rseq; + int cpu = (int)(uintptr_t)arg; + + memset(&my_rseq, 0, sizeof(my_rseq)); + my_rseq.cpu_id = RSEQ_CPU_ID_UNINITIALIZED; + + if (rseq_register_thread_at(&my_rseq) < 0) { + perror("rseq_register (burner)"); + return NULL; + } + + cpu_set_t set; + + CPU_ZERO(&set); + CPU_SET(cpu, &set); + if (sched_setaffinity(0, sizeof(set), &set) < 0) + perror("sched_setaffinity (burner)"); + + unsigned long sink = 0; + + while (!atomic_load_explicit(&g_stop, memory_order_relaxed)) { + sink++; + /* Prevent compiler from optimizing the loop away */ + asm volatile("" : "+g"(sink)); + } + + return NULL; +} + +static int burner_thread_fn_wrapper(void *arg) +{ + burner_thread_fn(arg); + return 0; +} + +static int leaf_child_fn(void *arg) +{ + int i = (int)(uintptr_t)arg; + int total_burners = g_ncpus_stress * N_BURNER_PER_CPU; + int n_threads_per_leaf = total_burners / N_SIBLINGS; + + if (i < (total_burners % N_SIBLINGS)) + n_threads_per_leaf++; + + prctl(PR_SET_PDEATHSIG, SIGTERM); + if (getppid() == 1) + _exit(1); + + char leaf_path[4096]; + + snprintf(leaf_path, sizeof(leaf_path), "%s/n%d", g_cgroup_path, i); + for (int j = 0; j < NEST_DEPTH; j++) + snprintf(leaf_path + strlen(leaf_path), + sizeof(leaf_path) - strlen(leaf_path), "/d%d", j); + + int r = cgroup_add_pid_to_path(getpid(), leaf_path); + + if (r < 0) { + char buf[512]; + int len = snprintf(buf, sizeof(buf), + "[leaf child %d] failed to join cgroup %s: err %d\n", + i, leaf_path, -r); + (void)!write(2, buf, 
len); + _exit(1); + } + + for (int j = 0; j < n_threads_per_leaf; j++) { + int cpu = g_stress_cpus[(i * n_threads_per_leaf + j) % g_ncpus_stress]; + + /* Allocate stack via mmap (bypasses heap) */ + size_t stack_size = 64 * 1024; + void *stack = mmap(NULL, stack_size, PROT_READ | PROT_WRITE, + MAP_PRIVATE | MAP_ANONYMOUS, -1, 0); + if (stack == MAP_FAILED) { + const char *msg = "mmap stack failed\n"; + (void)!write(2, msg, strlen(msg)); + _exit(1); + } + + /* Use raw clone to create a thread sharing the VM and thread group */ + pid_t pid = clone(burner_thread_fn_wrapper, stack + stack_size, + CLONE_VM | CLONE_THREAD | CLONE_SIGHAND, + (void *)(uintptr_t)cpu); + if (pid < 0) { + const char *msg = "clone burner failed\n"; + (void)!write(2, msg, strlen(msg)); + _exit(1); + } + } + + // Wait for SIGTERM + sigset_t mask; + + sigemptyset(&mask); + sigaddset(&mask, SIGTERM); + int sig; + + sigwait(&mask, &sig); + + _exit(0); +} + +struct leaf_info { + pid_t pid; + void *stack; +}; + +static int run_throttle_child(void *arg) +{ + (void)arg; + prctl(PR_SET_PDEATHSIG, SIGTERM); + if (getppid() == 1) + _exit(1); + + int n_leafs = N_SIBLINGS; + + /* Block signals before spawning to avoid missing early failures */ + sigset_t mask; + + sigemptyset(&mask); + sigaddset(&mask, SIGTERM); + sigaddset(&mask, SIGCHLD); + sigprocmask(SIG_BLOCK, &mask, NULL); + + /* Use mmap for tracking structures to avoid glibc heap usage */ + struct leaf_info *leaves = mmap(NULL, n_leafs * sizeof(struct leaf_info), + PROT_READ | PROT_WRITE, MAP_PRIVATE | MAP_ANONYMOUS, -1, 0); + if (leaves == MAP_FAILED) { + const char *msg = "mmap leaves array failed\n"; + (void)!write(2, msg, strlen(msg)); + _exit(1); + } + + for (int i = 0; i < n_leafs; i++) { + size_t stack_size = 64 * 1024; + void *stack = mmap(NULL, stack_size, PROT_READ | PROT_WRITE, + MAP_PRIVATE | MAP_ANONYMOUS, -1, 0); + if (stack == MAP_FAILED) { + const char *msg = "mmap leaf stack failed\n"; + (void)!write(2, msg, strlen(msg)); + 
_exit(1); + } + + leaves[i].stack = stack; + + pid_t pid = clone(leaf_child_fn, stack + stack_size, + CLONE_VM | SIGCHLD, (void *)(uintptr_t)i); + + if (pid < 0) { + /* + * Save the clone() error first: the write(), kill(), + * waitpid() and munmap() calls below may overwrite errno + * before the exit code is chosen. + */ + int clone_errno = errno; + const char *msg = "clone (leaf child) failed\n"; + (void)!write(2, msg, strlen(msg)); + + /* Clean up successfully spawned children */ + for (int j = 0; j < i; j++) { + kill(leaves[j].pid, SIGTERM); + waitpid(leaves[j].pid, NULL, 0); + munmap(leaves[j].stack, stack_size); + } + munmap(leaves, n_leafs * sizeof(struct leaf_info)); + + /* Exit code 4 is what main() maps to a skip (resource limits) */ + if (clone_errno == EAGAIN) + _exit(4); + else + _exit(1); + } + leaves[i].pid = pid; + } + + int failed = 0; + + while (1) { + int sig; + + sigwait(&mask, &sig); + + if (sig == SIGTERM) { + break; + } else if (sig == SIGCHLD) { + int status; + pid_t pid; + + // Reap all dead children + while ((pid = waitpid(-1, &status, WNOHANG)) > 0) { + for (int i = 0; i < n_leafs; i++) { + if (leaves[i].pid == pid) { + leaves[i].pid = 0; + break; + } + } + if ((WIFEXITED(status) && WEXITSTATUS(status) != 0) || + WIFSIGNALED(status)) { + char buf[128]; + int len = snprintf(buf, sizeof(buf), + "[manager] child %d died unexpectedly (status %d)\n", + pid, WEXITSTATUS(status)); + (void)!write(2, buf, len); + failed = 1; + } + } + if (failed) + break; + } + } + + // Terminate all leaf kids + for (int i = 0; i < n_leafs; i++) { + if (leaves[i].pid > 0) + kill(leaves[i].pid, SIGTERM); + } + + for (int i = 0; i < n_leafs; i++) { + if (leaves[i].pid > 0) + waitpid(leaves[i].pid, NULL, 0); + munmap(leaves[i].stack, 64 * 1024); + } + + munmap(leaves, n_leafs * sizeof(struct leaf_info)); + + _exit(failed ? 
1 : 0); +} + +/* -- Membarrier hammer thread -- */ +static void *hammer_thread_fn(void *arg) +{ + int target_cpu = *(int *)arg; + long local_ok = 0; + long local_err = 0; + int count = 0; + const int batch_size = 1024; + + if (rseq_register_thread() < 0) { + ksft_print_msg("[hammer] rseq_register failed: %s\n", strerror(errno)); + return NULL; + } + + membarrier_register_rseq_mm(); + + while (!atomic_load_explicit(&g_stop, memory_order_relaxed)) { + int r = syscall(SYS_membarrier, + MEMBARRIER_CMD_PRIVATE_EXPEDITED_RSEQ, + MEMBARRIER_CMD_FLAG_CPU, + target_cpu); + if (__builtin_expect(r == 0, 1)) + local_ok++; + else + local_err++; + + count++; + if (__builtin_expect(count >= batch_size, 0)) { + atomic_fetch_add_explicit(&g_mb_ok, local_ok, memory_order_relaxed); + atomic_fetch_add_explicit(&g_mb_err, local_err, memory_order_relaxed); + local_ok = 0; + local_err = 0; + count = 0; + } + } + + /* Flush any remaining counts on exit */ + if (local_ok > 0) + atomic_fetch_add_explicit(&g_mb_ok, local_ok, memory_order_relaxed); + if (local_err > 0) + atomic_fetch_add_explicit(&g_mb_err, local_err, memory_order_relaxed); + + return NULL; +} + +/* -- Latency sentinel -- */ +static void *sentinel_thread_fn(void *arg) +{ + (void)arg; + struct sched_param sp = { .sched_priority = 20 }; + + if (sched_setscheduler(0, SCHED_FIFO, &sp) < 0) + ksft_print_msg("WARN: no SCHED_FIFO for sentinel (less precise)\n"); + + while (!atomic_load_explicit(&g_test_ready, memory_order_relaxed) && + !atomic_load_explicit(&g_stop_sentinel, memory_order_relaxed)) { + struct timespec ts = {0, 1000 * 1000}; /* 1ms */ + + clock_nanosleep(CLOCK_MONOTONIC, 0, &ts, NULL); + } + + uint64_t prev = monotonic_us(); + + while (!atomic_load_explicit(&g_stop_sentinel, memory_order_relaxed)) { + struct timespec ts = { + .tv_sec = 0, + .tv_nsec = SENTINEL_INTERVAL_US * 1000L, + }; + clock_nanosleep(CLOCK_MONOTONIC, 0, &ts, NULL); + + uint64_t now = monotonic_us(); + long latency_us = (long)(now - prev) - 
SENTINEL_INTERVAL_US; + + prev = now; + + if (latency_us <= 0) + continue; + + update_max_latency(latency_us); + + if (latency_us > LATENCY_CRITICAL_MS * 1000L) { + ksft_print_msg("\n[SENTINEL] CRITICAL: %ld ms delay (lockup precursor!)\n", + latency_us / 1000); + } else if (latency_us > LATENCY_WARN_MS * 1000L) { + ksft_print_msg("\n[SENTINEL] WARN: %ld ms latency spike\n", + latency_us / 1000); + } + } + return NULL; +} + +/* -- Progress reporter -- */ +static void *reporter_thread_fn(void *arg) +{ + (void)arg; + int elapsed = 0; + + while (!atomic_load_explicit(&g_stop_sentinel, memory_order_relaxed)) { + for (int i = 0; i < 5; i++) { + sleep(1); + if (atomic_load_explicit(&g_stop_sentinel, memory_order_relaxed)) + break; + } + if (atomic_load_explicit(&g_stop_sentinel, memory_order_relaxed)) + break; + elapsed += 5; + long interval_max = atomic_exchange_explicit(&g_interval_max_latency_us, + 0, memory_order_relaxed); + + ksft_print_msg("[%3ds] mb: ok=%-10ld err=%-8ld | max_lat=%ld us\n", + elapsed, + atomic_load(&g_mb_ok), + atomic_load(&g_mb_err), + interval_max); + } + return NULL; +} + +/* -- Main -- */ +int main(void) +{ + ksft_print_header(); +#ifdef UNSUPPORTED_ARCH + ksft_exit_skip("Unsupported architecture\n"); +#endif + ksft_set_plan(1); + + if (geteuid() != 0) + ksft_exit_skip("Must run as root (cgroup + SCHED_FIFO)\n"); + + init_stress_cpus(); + + ksft_print_msg("=== membarrier rseq + CFS unthrottle stress ===\n"); + ksft_print_msg("Stressing CPUs: %d\n", g_ncpus_stress); + ksft_print_msg("Quota: %d/%d us (~%d unthrottles/sec/CPU)\n", + CFS_QUOTA_US, CFS_PERIOD_US, + 1000000 / CFS_PERIOD_US); + ksft_print_msg("Hammer threads: %d per CPU (%d total)\n", + N_HAMMER_PER_CPU, g_ncpus_stress * N_HAMMER_PER_CPU); + ksft_print_msg("Duration: %d seconds\n\n", TEST_DURATION_SEC); + + if (cgroup_setup() < 0) { + cgroup_teardown(); + ksft_exit_skip("cgroup_setup failed (missing permissions or v2 ctrls?)\n"); + } + + if (rseq_register_thread() < 0) { + 
ksft_print_msg("rseq_register (%s) failed: %s\n", __func__, strerror(errno)); + cgroup_teardown(); + ksft_exit_skip("rseq syscall failed or not available\n"); + } + if (membarrier_register_rseq_mm() < 0) { + ksft_print_msg("MEMBARRIER_CMD_REGISTER_PRIVATE_EXPEDITED_RSEQ: %s\n" + "Kernel >= 5.10 with CONFIG_RSEQ required.\n", + strerror(errno)); + cgroup_teardown(); + ksft_exit_skip("membarrier register failed\n"); + } + ksft_print_msg("rseq membarrier registered OK\n"); + + sigset_t sigmask; + + sigemptyset(&sigmask); + sigaddset(&sigmask, SIGTERM); + sigprocmask(SIG_BLOCK, &sigmask, NULL); + + void *stack = malloc(1024 * 1024); + + if (!stack) { + perror("malloc stack"); + cgroup_teardown(); + ksft_exit_fail_msg("Malloc stack failed\n"); + } + pid_t child = clone(run_throttle_child, stack + 1024 * 1024, CLONE_VM | SIGCHLD, NULL); + + if (child < 0) { + perror("clone"); + cgroup_teardown(); + ksft_exit_fail_msg("Clone failed\n"); + } + + sigprocmask(SIG_UNBLOCK, &sigmask, NULL); + ksft_print_msg("Throttle child PID %d started\n", child); + + int n_threads = g_ncpus_stress * N_HAMMER_PER_CPU + 2; + pthread_t *threads = (pthread_t *)calloc(n_threads, sizeof(pthread_t)); + int *cpuargs = (int *)calloc(g_ncpus_stress * N_HAMMER_PER_CPU, sizeof(int)); + + if (!threads || !cpuargs) { + perror("calloc"); + kill(child, SIGTERM); + waitpid(child, NULL, 0); + cgroup_teardown(); + ksft_exit_fail_msg("Thread allocation failed\n"); + } + + int ti = 0, ai = 0; + int r; + + ksft_print_msg("Creating sentinel thread...\n"); + r = pthread_create(&threads[ti], NULL, sentinel_thread_fn, NULL); + if (r != 0) { + kill(child, SIGTERM); + waitpid(child, NULL, 0); + cgroup_teardown(); + free(threads); + free(cpuargs); + free(g_stress_cpus); + ksft_exit_fail_msg("pthread_create (sentinel) failed: %s\n", strerror(r)); + } + ti++; + + ksft_print_msg("Creating reporter thread...\n"); + r = pthread_create(&threads[ti], NULL, reporter_thread_fn, NULL); + if (r != 0) { + 
atomic_store(&g_stop_sentinel, 1); + pthread_join(threads[0], NULL); + kill(child, SIGTERM); + waitpid(child, NULL, 0); + cgroup_teardown(); + free(threads); + free(cpuargs); + free(g_stress_cpus); + ksft_exit_fail_msg("pthread_create (reporter) failed: %s\n", strerror(r)); + } + ti++; + + ksft_print_msg("Creating %d hammer threads...\n", g_ncpus_stress * N_HAMMER_PER_CPU); + for (int i = 0; i < g_ncpus_stress; i++) { + int cpu = g_stress_cpus[i]; + + for (int j = 0; j < N_HAMMER_PER_CPU; j++) { + cpuargs[ai] = cpu; + r = pthread_create(&threads[ti], NULL, hammer_thread_fn, &cpuargs[ai]); + if (r != 0) { + ksft_print_msg("pthread_create failed at thread %d: %s\n", + ti, strerror(r)); + + atomic_store(&g_stop_sentinel, 1); + pthread_join(threads[0], NULL); + pthread_join(threads[1], NULL); + + atomic_store(&g_stop, 1); + for (int k = 2; k < ti; k++) + pthread_join(threads[k], NULL); + + kill(child, SIGTERM); + waitpid(child, NULL, 0); + cgroup_teardown(); + + free(threads); + free(cpuargs); + free(g_stress_cpus); + + if (r == EAGAIN) + ksft_exit_skip("Resource limits prevent threads\n"); + else + ksft_exit_fail_msg("Failed to create hammer thread\n"); + } + ti++; + ai++; + } + } + + ksft_print_msg("All threads running. 
Tip: monitor dmesg for lockups\n\n"); + + atomic_store_explicit(&g_test_ready, 1, memory_order_relaxed); + int child_failed = 0; + int child_status = 0; + + for (int i = 0; i < TEST_DURATION_SEC; i++) { + sleep(1); + int r = waitpid(child, &child_status, WNOHANG); + + if (r == child) { + child_failed = 1; + break; + } + } + + atomic_store(&g_stop_sentinel, 1); + pthread_join(threads[0], NULL); + pthread_join(threads[1], NULL); + + atomic_store(&g_stop, 1); + + /* Unthrottle to allow children to exit quickly */ + cgroup_unthrottle(); + + if (!child_failed) { + kill(child, SIGTERM); + waitpid(child, NULL, 0); + } + for (int i = 2; i < ti; i++) + pthread_join(threads[i], NULL); + + long max_lat = atomic_load(&g_max_latency_us); + long total_ok = atomic_load(&g_mb_ok); + long total_err = atomic_load(&g_mb_err); + + ksft_print_msg("\n=== RESULTS ===\n"); + ksft_print_msg("membarrier syscalls : %ld ok %ld errors\n", total_ok, total_err); + ksft_print_msg("Max scheduler latency: %ld us (%ld ms)\n", max_lat, max_lat / 1000); + cgroup_teardown(); + free(threads); + free(cpuargs); + free(g_stress_cpus); + + if (child_failed) { + if (WIFEXITED(child_status) && WEXITSTATUS(child_status) == 4) + ksft_exit_skip("Manager child skipped (resource limits?)\n"); + ksft_test_result_fail("membarrier_rseq_stress: Manager child died early\n"); + ksft_exit_fail(); + } else if (total_ok == 0) { + ksft_test_result_fail("membarrier_rseq_stress: No successful membarrier calls\n"); + ksft_exit_fail(); + } else if (total_err > 0) { + ksft_test_result_fail("membarrier_rseq_stress: syscall errors\n"); + ksft_exit_fail(); + } else if (max_lat > LATENCY_CRITICAL_MS * 1000L) { + ksft_test_result_fail("membarrier_rseq_stress: LOCKUP PRECURSOR\n"); + ksft_exit_fail(); + } else if (max_lat > LATENCY_WARN_MS * 1000L) { + ksft_test_result_fail("membarrier_rseq_stress: significant latency spike\n"); + ksft_exit_fail(); + } else { + ksft_test_result_pass("membarrier_rseq_stress\n"); + ksft_exit_pass(); + } 
+ + return 0; +} diff --git a/tools/testing/selftests/memfd/fuse_test.c b/tools/testing/selftests/memfd/fuse_test.c index dbc171a3806d..510056c1b0d0 100644 --- a/tools/testing/selftests/memfd/fuse_test.c +++ b/tools/testing/selftests/memfd/fuse_test.c @@ -162,7 +162,7 @@ static void *global_p = NULL; static int sealing_thread_fn(void *arg) { - int sig, r; + int r; /* * This thread first waits 200ms so any pending operation in the parent diff --git a/tools/testing/selftests/memfd/memfd_test.c b/tools/testing/selftests/memfd/memfd_test.c index 2ca07ea7202a..cdab3a837624 100644 --- a/tools/testing/selftests/memfd/memfd_test.c +++ b/tools/testing/selftests/memfd/memfd_test.c @@ -688,9 +688,9 @@ static void mfd_assert_grow_write(int fd) if (hugetlbfs_test) return; - buf = malloc(mfd_def_size * 8); + buf = calloc(1, mfd_def_size * 8); if (!buf) { - printf("malloc(%zu) failed: %m\n", mfd_def_size * 8); + printf("calloc(1, %zu) failed: %m\n", mfd_def_size * 8); abort(); } diff --git a/tools/testing/selftests/mm/.gitignore b/tools/testing/selftests/mm/.gitignore index b0c30c5ee9e3..9ccd9e1447e6 100644 --- a/tools/testing/selftests/mm/.gitignore +++ b/tools/testing/selftests/mm/.gitignore @@ -4,6 +4,10 @@ hugepage-mmap hugepage-mremap hugepage-shm hugepage-vmemmap +hugetlb-mmap +hugetlb-mremap +hugetlb-shm +hugetlb-vmemmap hugetlb-madvise hugetlb-read-hwpoison hugetlb-soft-offline diff --git a/tools/testing/selftests/mm/Makefile b/tools/testing/selftests/mm/Makefile index cd24596cdd27..e6df968f0971 100644 --- a/tools/testing/selftests/mm/Makefile +++ b/tools/testing/selftests/mm/Makefile @@ -61,16 +61,15 @@ TEST_GEN_FILES += gup_longterm TEST_GEN_FILES += gup_test TEST_GEN_FILES += hmm-tests TEST_GEN_FILES += hugetlb-madvise +TEST_GEN_FILES += hugetlb-mmap +TEST_GEN_FILES += hugetlb-mremap TEST_GEN_FILES += hugetlb-read-hwpoison +TEST_GEN_FILES += hugetlb-shm TEST_GEN_FILES += hugetlb-soft-offline -TEST_GEN_FILES += hugepage-mmap -TEST_GEN_FILES += hugepage-mremap 
-TEST_GEN_FILES += hugepage-shm -TEST_GEN_FILES += hugepage-vmemmap +TEST_GEN_FILES += hugetlb-vmemmap TEST_GEN_FILES += khugepaged TEST_GEN_FILES += madv_populate TEST_GEN_FILES += map_fixed_noreplace -TEST_GEN_FILES += map_hugetlb TEST_GEN_FILES += map_populate ifneq (,$(filter $(ARCH),arm64 riscv riscv64 x86 x86_64 loongarch32 loongarch64)) TEST_GEN_FILES += memfd_secret @@ -151,6 +150,7 @@ TEST_PROGS += ksft_gup_test.sh TEST_PROGS += ksft_hmm.sh TEST_PROGS += ksft_hugetlb.sh TEST_PROGS += ksft_hugevm.sh +TEST_PROGS += ksft_kmemleak_dedup.sh TEST_PROGS += ksft_ksm.sh TEST_PROGS += ksft_ksm_numa.sh TEST_PROGS += ksft_madv_guard.sh @@ -187,8 +187,8 @@ TEST_FILES += write_hugetlb_memory.sh include ../lib.mk -$(TEST_GEN_PROGS): vm_util.c thp_settings.c -$(TEST_GEN_FILES): vm_util.c thp_settings.c +$(TEST_GEN_PROGS): vm_util.c hugepage_settings.c +$(TEST_GEN_FILES): vm_util.c hugepage_settings.c $(OUTPUT)/uffd-stress: uffd-common.c $(OUTPUT)/uffd-unit-tests: uffd-common.c @@ -216,7 +216,8 @@ ifeq ($(CAN_BUILD_I386),1) $(BINARIES_32): CFLAGS += -m32 -mxsave $(BINARIES_32): LDLIBS += -lrt -ldl -lm $(BINARIES_32): $(OUTPUT)/%_32: %.c - $(CC) $(CFLAGS) $(EXTRA_CFLAGS) $(notdir $^) $(LDLIBS) -o $@ + $(call msg,CC,,$@) + $(Q)$(CC) $(CFLAGS) $(EXTRA_CFLAGS) $(notdir $^) $(LDLIBS) -o $@ $(foreach t,$(VMTARGETS),$(eval $(call gen-target-rule-32,$(t)))) endif @@ -224,7 +225,8 @@ ifeq ($(CAN_BUILD_X86_64),1) $(BINARIES_64): CFLAGS += -m64 -mxsave $(BINARIES_64): LDLIBS += -lrt -ldl $(BINARIES_64): $(OUTPUT)/%_64: %.c - $(CC) $(CFLAGS) $(EXTRA_CFLAGS) $(notdir $^) $(LDLIBS) -o $@ + $(call msg,CC,,$@) + $(Q)$(CC) $(CFLAGS) $(EXTRA_CFLAGS) $(notdir $^) $(LDLIBS) -o $@ $(foreach t,$(VMTARGETS),$(eval $(call gen-target-rule-64,$(t)))) endif @@ -261,7 +263,8 @@ $(OUTPUT)/migration: LDLIBS += -lnuma $(OUTPUT)/rmap: LDLIBS += -lnuma local_config.mk local_config.h: check_config.sh - CC="$(CC)" CFLAGS="$(CFLAGS)" ./check_config.sh + $(call msg,CHK,config,$@) + $(Q)CC="$(CC)" 
CFLAGS="$(CFLAGS)" ./check_config.sh EXTRA_CLEAN += local_config.mk local_config.h diff --git a/tools/testing/selftests/mm/charge_reserved_hugetlb.sh b/tools/testing/selftests/mm/charge_reserved_hugetlb.sh index 44f4e703deb9..a1cfd3a349db 100755 --- a/tools/testing/selftests/mm/charge_reserved_hugetlb.sh +++ b/tools/testing/selftests/mm/charge_reserved_hugetlb.sh @@ -17,6 +17,7 @@ if ! command -v killall >/dev/null 2>&1; then fi nr_hugepgs=$(cat /proc/sys/vm/nr_hugepages) +trap 'echo "$nr_hugepgs" > /proc/sys/vm/nr_hugepages' EXIT INT TERM fault_limit_file=limit_in_bytes reservation_limit_file=rsvd.limit_in_bytes @@ -70,7 +71,6 @@ function cleanup() { if [[ -e $cgroup_path/hugetlb_cgroup_test2 ]]; then rmdir $cgroup_path/hugetlb_cgroup_test2 fi - echo 0 >/proc/sys/vm/nr_hugepages echo CLEANUP DONE } @@ -94,6 +94,15 @@ function get_machine_hugepage_size() { } MB=$(get_machine_hugepage_size) +if (( MB >= 1024 )); then + # For 1GB hugepages + UNIT="GB" + MB_DISPLAY=$((MB / 1024)) +else + # For 2MB hugepages + UNIT="MB" + MB_DISPLAY=$MB +fi function setup_cgroup() { local name="$1" @@ -103,11 +112,12 @@ function setup_cgroup() { mkdir $cgroup_path/$name echo writing cgroup limit: "$cgroup_limit" - echo "$cgroup_limit" >$cgroup_path/$name/hugetlb.${MB}MB.$fault_limit_file + echo "$cgroup_limit" > \ + $cgroup_path/$name/hugetlb.${MB_DISPLAY}${UNIT}.$fault_limit_file echo writing reservation limit: "$reservation_limit" echo "$reservation_limit" > \ - $cgroup_path/$name/hugetlb.${MB}MB.$reservation_limit_file + $cgroup_path/$name/hugetlb.${MB_DISPLAY}${UNIT}.$reservation_limit_file if [ -e "$cgroup_path/$name/cpuset.cpus" ]; then echo 0 >$cgroup_path/$name/cpuset.cpus @@ -142,7 +152,7 @@ function wait_for_file_value() { function wait_for_hugetlb_memory_to_get_depleted() { local cgroup="$1" - local path="$cgroup_path/$cgroup/hugetlb.${MB}MB.$reservation_usage_file" + local path="$cgroup_path/$cgroup/hugetlb.${MB_DISPLAY}${UNIT}.$reservation_usage_file" wait_for_file_value 
"$path" "0" } @@ -150,7 +160,7 @@ function wait_for_hugetlb_memory_to_get_depleted() { function wait_for_hugetlb_memory_to_get_reserved() { local cgroup="$1" local size="$2" - local path="$cgroup_path/$cgroup/hugetlb.${MB}MB.$reservation_usage_file" + local path="$cgroup_path/$cgroup/hugetlb.${MB_DISPLAY}${UNIT}.$reservation_usage_file" wait_for_file_value "$path" "$size" } @@ -158,7 +168,7 @@ function wait_for_hugetlb_memory_to_get_reserved() { function wait_for_hugetlb_memory_to_get_written() { local cgroup="$1" local size="$2" - local path="$cgroup_path/$cgroup/hugetlb.${MB}MB.$fault_usage_file" + local path="$cgroup_path/$cgroup/hugetlb.${MB_DISPLAY}${UNIT}.$fault_usage_file" wait_for_file_value "$path" "$size" } @@ -180,8 +190,8 @@ function write_hugetlbfs_and_get_usage() { hugetlb_difference=0 reserved_difference=0 - local hugetlb_usage=$cgroup_path/$cgroup/hugetlb.${MB}MB.$fault_usage_file - local reserved_usage=$cgroup_path/$cgroup/hugetlb.${MB}MB.$reservation_usage_file + local hugetlb_usage=$cgroup_path/$cgroup/hugetlb.${MB_DISPLAY}${UNIT}.$fault_usage_file + local reserved_usage=$cgroup_path/$cgroup/hugetlb.${MB_DISPLAY}${UNIT}.$reservation_usage_file local hugetlb_before=$(cat $hugetlb_usage) local reserved_before=$(cat $reserved_usage) @@ -312,8 +322,10 @@ function run_test() { cleanup_hugetlb_memory "hugetlb_cgroup_test" - local final_hugetlb=$(cat $cgroup_path/hugetlb_cgroup_test/hugetlb.${MB}MB.$fault_usage_file) - local final_reservation=$(cat $cgroup_path/hugetlb_cgroup_test/hugetlb.${MB}MB.$reservation_usage_file) + local final_hugetlb=$(cat \ + $cgroup_path/hugetlb_cgroup_test/hugetlb.${MB_DISPLAY}${UNIT}.$fault_usage_file) + local final_reservation=$(cat \ + $cgroup_path/hugetlb_cgroup_test/hugetlb.${MB_DISPLAY}${UNIT}.$reservation_usage_file) echo $hugetlb_difference echo $reserved_difference @@ -369,10 +381,14 @@ function run_multiple_cgroup_test() { reservation_failed1=$reservation_failed oom_killed1=$oom_killed - local 
cgroup1_hugetlb_usage=$cgroup_path/hugetlb_cgroup_test1/hugetlb.${MB}MB.$fault_usage_file - local cgroup1_reservation_usage=$cgroup_path/hugetlb_cgroup_test1/hugetlb.${MB}MB.$reservation_usage_file - local cgroup2_hugetlb_usage=$cgroup_path/hugetlb_cgroup_test2/hugetlb.${MB}MB.$fault_usage_file - local cgroup2_reservation_usage=$cgroup_path/hugetlb_cgroup_test2/hugetlb.${MB}MB.$reservation_usage_file + local cgroup1_hugetlb_usage=\ + $cgroup_path/hugetlb_cgroup_test1/hugetlb.${MB_DISPLAY}${UNIT}.$fault_usage_file + local cgroup1_reservation_usage=\ + $cgroup_path/hugetlb_cgroup_test1/hugetlb.${MB_DISPLAY}${UNIT}.$reservation_usage_file + local cgroup2_hugetlb_usage=\ + $cgroup_path/hugetlb_cgroup_test2/hugetlb.${MB_DISPLAY}${UNIT}.$fault_usage_file + local cgroup2_reservation_usage=\ + $cgroup_path/hugetlb_cgroup_test2/hugetlb.${MB_DISPLAY}${UNIT}.$reservation_usage_file local usage_before_second_write=$(cat $cgroup1_hugetlb_usage) local reservation_usage_before_second_write=$(cat $cgroup1_reservation_usage) @@ -599,4 +615,3 @@ if [[ $do_umount ]]; then rmdir $cgroup_path fi -echo "$nr_hugepgs" > /proc/sys/vm/nr_hugepages diff --git a/tools/testing/selftests/mm/check_config.sh b/tools/testing/selftests/mm/check_config.sh index b84c82bbf875..32beaefe279e 100755 --- a/tools/testing/selftests/mm/check_config.sh +++ b/tools/testing/selftests/mm/check_config.sh @@ -16,7 +16,7 @@ echo "#include <sys/types.h>" > $tmpfile_c echo "#include <liburing.h>" >> $tmpfile_c echo "int func(void) { return 0; }" >> $tmpfile_c -$CC $CFLAGS -c $tmpfile_c -o $tmpfile_o +$CC $CFLAGS -c $tmpfile_c -o $tmpfile_o >/dev/null 2>&1 if [ -f $tmpfile_o ]; then echo "#define LOCAL_CONFIG_HAVE_LIBURING 1" > $OUTPUT_H_FILE diff --git a/tools/testing/selftests/mm/compaction_test.c b/tools/testing/selftests/mm/compaction_test.c index 30209c40b697..5b582588e015 100644 --- a/tools/testing/selftests/mm/compaction_test.c +++ b/tools/testing/selftests/mm/compaction_test.c @@ -17,6 +17,7 @@ #include 
<string.h> #include "kselftest.h" +#include "hugepage_settings.h" #define MAP_SIZE_MB 100 #define MAP_SIZE (MAP_SIZE_MB * 1024 * 1024) @@ -82,124 +83,44 @@ int prereq(void) return -1; } -int check_compaction(unsigned long mem_free, unsigned long hugepage_size, - unsigned long initial_nr_hugepages) +int check_compaction(unsigned long mem_free, unsigned long hugepage_size) { - unsigned long nr_hugepages_ul; - int fd, ret = -1; + unsigned long nr_hugepages; int compaction_index = 0; - char nr_hugepages[20] = {0}; - char init_nr_hugepages[24] = {0}; - char target_nr_hugepages[24] = {0}; - int slen; - - snprintf(init_nr_hugepages, sizeof(init_nr_hugepages), - "%lu", initial_nr_hugepages); + int ret = -1; /* We want to test with 80% of available memory. Else, OOM killer comes in to play */ mem_free = mem_free * 0.8; - fd = open("/proc/sys/vm/nr_hugepages", O_RDWR | O_NONBLOCK); - if (fd < 0) { - ksft_print_msg("Failed to open /proc/sys/vm/nr_hugepages: %s\n", - strerror(errno)); - ret = -1; - goto out; - } - /* * Request huge pages for about half of the free memory. 
The Kernel * will allocate as much as it can, and we expect it will get at least 1/3 */ - nr_hugepages_ul = mem_free / hugepage_size / 2; - snprintf(target_nr_hugepages, sizeof(target_nr_hugepages), - "%lu", nr_hugepages_ul); - - slen = strlen(target_nr_hugepages); - if (write(fd, target_nr_hugepages, slen) != slen) { - ksft_print_msg("Failed to write %lu to /proc/sys/vm/nr_hugepages: %s\n", - nr_hugepages_ul, strerror(errno)); - goto close_fd; - } - - lseek(fd, 0, SEEK_SET); - - if (read(fd, nr_hugepages, sizeof(nr_hugepages)) <= 0) { - ksft_print_msg("Failed to re-read from /proc/sys/vm/nr_hugepages: %s\n", - strerror(errno)); - goto close_fd; - } + nr_hugepages = mem_free / hugepage_size / 2; + hugetlb_set_nr_default_pages(nr_hugepages); /* We should have been able to request at least 1/3 rd of the memory in huge pages */ - nr_hugepages_ul = strtoul(nr_hugepages, NULL, 10); - if (!nr_hugepages_ul) { + nr_hugepages = hugetlb_nr_default_pages(); + if (!nr_hugepages) { ksft_print_msg("ERROR: No memory is available as huge pages\n"); - goto close_fd; - } - compaction_index = mem_free/(nr_hugepages_ul * hugepage_size); - - lseek(fd, 0, SEEK_SET); - - if (write(fd, init_nr_hugepages, strlen(init_nr_hugepages)) - != strlen(init_nr_hugepages)) { - ksft_print_msg("Failed to write value to /proc/sys/vm/nr_hugepages: %s\n", - strerror(errno)); - goto close_fd; + goto out; } + compaction_index = mem_free/(nr_hugepages * hugepage_size); - ksft_print_msg("Number of huge pages allocated = %lu\n", - nr_hugepages_ul); + ksft_print_msg("Number of huge pages allocated = %lu\n", nr_hugepages); if (compaction_index > 3) { ksft_print_msg("ERROR: Less than 1/%d of memory is available\n" "as huge pages\n", compaction_index); - goto close_fd; - } - - ret = 0; - - close_fd: - close(fd); - out: - ksft_test_result(ret == 0, "check_compaction\n"); - return ret; -} - -int set_zero_hugepages(unsigned long *initial_nr_hugepages) -{ - int fd, ret = -1; - char nr_hugepages[20] = {0}; - - fd = 
open("/proc/sys/vm/nr_hugepages", O_RDWR | O_NONBLOCK); - if (fd < 0) { - ksft_print_msg("Failed to open /proc/sys/vm/nr_hugepages: %s\n", - strerror(errno)); goto out; } - if (read(fd, nr_hugepages, sizeof(nr_hugepages)) <= 0) { - ksft_print_msg("Failed to read from /proc/sys/vm/nr_hugepages: %s\n", - strerror(errno)); - goto close_fd; - } - - lseek(fd, 0, SEEK_SET); - - /* Start with the initial condition of 0 huge pages */ - if (write(fd, "0", sizeof(char)) != sizeof(char)) { - ksft_print_msg("Failed to write 0 to /proc/sys/vm/nr_hugepages: %s\n", - strerror(errno)); - goto close_fd; - } - *initial_nr_hugepages = strtoul(nr_hugepages, NULL, 10); ret = 0; - close_fd: - close(fd); - out: + ksft_test_result(ret == 0, "check_compaction\n"); return ret; } @@ -212,18 +133,17 @@ int main(int argc, char **argv) unsigned long mem_free = 0; unsigned long hugepage_size = 0; long mem_fragmentable_MB = 0; - unsigned long initial_nr_hugepages; ksft_print_header(); if (prereq() || geteuid()) ksft_exit_skip("Prerequisites unsatisfied\n"); - ksft_set_plan(1); - /* Start the test without hugepages reducing mem_free */ - if (set_zero_hugepages(&initial_nr_hugepages)) - ksft_exit_fail(); + if (!hugetlb_setup_default_exact(0)) + ksft_exit_skip("Could not reset nr_hugepages\n"); + + ksft_set_plan(1); lim.rlim_cur = RLIM_INFINITY; lim.rlim_max = RLIM_INFINITY; @@ -261,6 +181,9 @@ int main(int argc, char **argv) mem_fragmentable_MB -= MAP_SIZE_MB; } + /* Unmap every other entry in the list to create fragmentation with + * locked pages before invoking check_compaction(). 
+ */ for (entry = list; entry != NULL; entry = entry->next) { munmap(entry->map, MAP_SIZE); if (!entry->next) @@ -268,8 +191,7 @@ int main(int argc, char **argv) entry = entry->next; } - if (check_compaction(mem_free, hugepage_size, - initial_nr_hugepages) == 0) + if (check_compaction(mem_free, hugepage_size) == 0) ksft_exit_pass(); ksft_exit_fail(); diff --git a/tools/testing/selftests/mm/cow.c b/tools/testing/selftests/mm/cow.c index d9c69c04b67d..0c627ea89ff7 100644 --- a/tools/testing/selftests/mm/cow.c +++ b/tools/testing/selftests/mm/cow.c @@ -29,7 +29,7 @@ #include "../../../../mm/gup_test.h" #include "kselftest.h" #include "vm_util.h" -#include "thp_settings.h" +#include "hugepage_settings.h" static size_t pagesize; static int pagemap_fd; @@ -37,7 +37,7 @@ static size_t pmdsize; static int nr_thpsizes; static size_t thpsizes[20]; static int nr_hugetlbsizes; -static size_t hugetlbsizes[10]; +static unsigned long hugetlbsizes[10]; static int gup_fd; static bool has_huge_zeropage; @@ -202,7 +202,7 @@ static void do_test_cow_in_parent(char *mem, size_t size, bool do_mprotect, log_test_result(KSFT_FAIL); goto close_comm_pipes; } else if (!ret) { - exit(fn(mem, size, &comm_pipes)); + _exit(fn(mem, size, &comm_pipes)); } while (read(comm_pipes.child_ready[0], &buf, 1) != 1) @@ -333,7 +333,7 @@ static void do_test_vmsplice_in_parent(char *mem, size_t size, ; /* Modify page content in the child. 
*/ memset(mem, 0xff, size); - exit(0); + _exit(0); } if (!before_fork) { @@ -480,7 +480,7 @@ static void do_test_iouring(char *mem, size_t size, bool use_fork) write(comm_pipes.child_ready[1], "0", 1); while (read(comm_pipes.parent_ready[0], &buf, 1) != 1) ; - exit(0); + _exit(0); } while (read(comm_pipes.child_ready[0], &buf, 1) != 1) @@ -645,7 +645,7 @@ static void do_test_ro_pin(char *mem, size_t size, enum ro_pin_test test, write(comm_pipes.child_ready[1], "0", 1); while (read(comm_pipes.parent_ready[0], &buf, 1) != 1) ; - exit(0); + _exit(0); } /* Wait until our child is ready. */ @@ -956,7 +956,7 @@ static void do_run_with_thp(test_fn fn, enum thp_run thp_run, size_t thpsize) log_test_result(KSFT_FAIL); goto munmap; } else if (!ret) { - exit(0); + _exit(0); } wait(&ret); /* Allow for sharing all pages again. */ @@ -1347,13 +1347,13 @@ static void do_test_anon_thp_collapse(char *mem, size_t size, switch (test) { case ANON_THP_COLLAPSE_UNSHARED: case ANON_THP_COLLAPSE_FULLY_SHARED: - exit(child_memcmp_fn(mem, size, &comm_pipes)); + _exit(child_memcmp_fn(mem, size, &comm_pipes)); break; case ANON_THP_COLLAPSE_LOWER_SHARED: - exit(child_memcmp_fn(mem, size / 2, &comm_pipes)); + _exit(child_memcmp_fn(mem, size / 2, &comm_pipes)); break; case ANON_THP_COLLAPSE_UPPER_SHARED: - exit(child_memcmp_fn(mem + size / 2, size / 2, + _exit(child_memcmp_fn(mem + size / 2, size / 2, &comm_pipes)); break; default: @@ -1881,21 +1881,21 @@ int main(int argc, char **argv) ksft_print_header(); + thp_save_settings(); + pagesize = getpagesize(); pmdsize = read_pmd_pagesize(); if (pmdsize) { /* Only if THP is supported. 
*/ thp_read_settings(&default_settings); default_settings.hugepages[sz2ord(pmdsize, pagesize)].enabled = THP_INHERIT; - thp_save_settings(); thp_push_settings(&default_settings); ksft_print_msg("[INFO] detected PMD size: %zu KiB\n", pmdsize / 1024); nr_thpsizes = detect_thp_sizes(thpsizes, ARRAY_SIZE(thpsizes)); } - nr_hugetlbsizes = detect_hugetlb_page_sizes(hugetlbsizes, - ARRAY_SIZE(hugetlbsizes)); + nr_hugetlbsizes = hugetlb_setup(2, hugetlbsizes, ARRAY_SIZE(hugetlbsizes)); has_huge_zeropage = detect_huge_zeropage(); ksft_set_plan(ARRAY_SIZE(anon_test_cases) * tests_per_anon_test_case() + @@ -1911,10 +1911,5 @@ int main(int argc, char **argv) run_anon_thp_test_cases(); run_non_anon_test_cases(); - if (pmdsize) { - /* Only if THP is supported. */ - thp_restore_settings(); - } - ksft_finished(); } diff --git a/tools/testing/selftests/mm/droppable.c b/tools/testing/selftests/mm/droppable.c index 44940f75c461..57e1b6fc5569 100644 --- a/tools/testing/selftests/mm/droppable.c +++ b/tools/testing/selftests/mm/droppable.c @@ -17,37 +17,50 @@ int main(int argc, char *argv[]) { - size_t alloc_size = 134217728; - size_t page_size = getpagesize(); + const size_t alloc_size = 2 * 1024 * 1024; + int retry_count = 10; + bool dropped; void *alloc; - pid_t child; ksft_print_header(); ksft_set_plan(1); alloc = mmap(0, alloc_size, PROT_READ | PROT_WRITE, MAP_ANONYMOUS | MAP_DROPPABLE, -1, 0); - assert(alloc != MAP_FAILED); - memset(alloc, 'A', alloc_size); - for (size_t i = 0; i < alloc_size; i += page_size) - assert(*(uint8_t *)(alloc + i)); - - child = fork(); - assert(child >= 0); - if (!child) { - for (;;) - *(char *)malloc(page_size) = 'B'; + if (alloc == MAP_FAILED) { + if ((errno == EOPNOTSUPP) || (errno == EINVAL)) { + ksft_test_result_skip("MAP_DROPPABLE not supported\n"); + exit(KSFT_SKIP); + } + ksft_test_result_fail("mmap error: %s\n", strerror(errno)); + exit(KSFT_FAIL); } + memset(alloc, 'A', alloc_size); - for (bool done = false; !done;) { - for (size_t i = 0; i < 
alloc_size; i += page_size) { - if (!*(uint8_t *)(alloc + i)) { - done = true; - break; + while (retry_count--) { + if (madvise(alloc, alloc_size, MADV_PAGEOUT)) { + if (errno == EINVAL) { + ksft_test_result_skip("madvise(MADV_PAGEOUT) not supported\n"); + exit(KSFT_SKIP); } + ksft_test_result_fail("madvise(MADV_PAGEOUT) error: %s\n", strerror(errno)); + exit(KSFT_FAIL); } + + dropped = memchr(alloc, 'A', alloc_size) == NULL; + + /* + * Speculative reference can temporarily prevent some + * pages from getting dropped. So sleep and retry. + * + * If a page is not droppable for 10s, something + * is seriously messed up and we want to fail. + */ + if (dropped) + break; + sleep(1); } - kill(child, SIGTERM); - ksft_test_result_pass("MAP_DROPPABLE: PASS\n"); - exit(KSFT_PASS); + ksft_test_result(dropped, "madvise(MADV_PAGEOUT) behavior\n"); + + ksft_finished(); } diff --git a/tools/testing/selftests/mm/folio_split_race_test.c b/tools/testing/selftests/mm/folio_split_race_test.c index ff026f183ac7..6329e37fff4c 100644 --- a/tools/testing/selftests/mm/folio_split_race_test.c +++ b/tools/testing/selftests/mm/folio_split_race_test.c @@ -25,7 +25,7 @@ #include <unistd.h> #include "vm_util.h" #include "kselftest.h" -#include "thp_settings.h" +#include "hugepage_settings.h" uint64_t page_size; uint64_t pmd_pagesize; @@ -226,23 +226,6 @@ static uint64_t run_iteration(void) return reader_failures; } -static void thp_cleanup_handler(int signum) -{ - thp_restore_settings(); - /* - * Restore default handler and re-raise the signal to exit. - * This is to ensure the test process exits with the correct - * status code corresponding to the signal. 
- */ - signal(signum, SIG_DFL); - raise(signum); -} - -static void thp_settings_cleanup(void) -{ - thp_restore_settings(); -} - int main(void) { struct thp_settings current_settings; @@ -261,12 +244,6 @@ int main(void) ksft_exit_skip("Please run the test as root\n"); thp_save_settings(); - /* make sure thp settings are restored */ - if (atexit(thp_settings_cleanup) != 0) - ksft_exit_fail_msg("atexit failed\n"); - - signal(SIGINT, thp_cleanup_handler); - signal(SIGTERM, thp_cleanup_handler); thp_read_settings(&current_settings); current_settings.shmem_enabled = SHMEM_ADVISE; diff --git a/tools/testing/selftests/mm/guard-regions.c b/tools/testing/selftests/mm/guard-regions.c index 48e8b1539be3..b21df3040b1c 100644 --- a/tools/testing/selftests/mm/guard-regions.c +++ b/tools/testing/selftests/mm/guard-regions.c @@ -21,7 +21,7 @@ #include <sys/uio.h> #include <unistd.h> #include "vm_util.h" -#include "thp_settings.h" +#include "hugepage_settings.h" #include "../pidfd/pidfd.h" @@ -2203,17 +2203,6 @@ TEST_F(guard_regions, collapse) if (variant->backing != ANON_BACKED) ASSERT_EQ(ftruncate(self->fd, size), 0); - /* - * We must close and re-open local-file backed as read-only for - * CONFIG_READ_ONLY_THP_FOR_FS to work. - */ - if (variant->backing == LOCAL_FILE_BACKED) { - ASSERT_EQ(close(self->fd), 0); - - self->fd = open(self->path, O_RDONLY); - ASSERT_GE(self->fd, 0); - } - ptr = mmap_(self, variant, NULL, size, PROT_READ, 0, 0); ASSERT_NE(ptr, MAP_FAILED); @@ -2237,9 +2226,10 @@ TEST_F(guard_regions, collapse) /* * Now collapse the entire region. This should fail in all cases. * - * The madvise() call will also fail if CONFIG_READ_ONLY_THP_FOR_FS is - * not set for the local file case, but we can't differentiate whether - * this occurred or if the collapse was rightly rejected. 
+ * The madvise() call will also fail if the file system does not support + * large folio or the supported orders do not include PMD_ORDER for the + * local file case, but we can't differentiate whether this occurred or + * if the collapse was rightly rejected. */ EXPECT_NE(madvise(ptr, size, MADV_COLLAPSE), 0); diff --git a/tools/testing/selftests/mm/gup_longterm.c b/tools/testing/selftests/mm/gup_longterm.c index f61150d28eb2..eb8963e9d98f 100644 --- a/tools/testing/selftests/mm/gup_longterm.c +++ b/tools/testing/selftests/mm/gup_longterm.c @@ -29,10 +29,11 @@ #include "../../../../mm/gup_test.h" #include "kselftest.h" #include "vm_util.h" +#include "hugepage_settings.h" static size_t pagesize; static int nr_hugetlbsizes; -static size_t hugetlbsizes[10]; +static unsigned long hugetlbsizes[10]; static int gup_fd; static __fsword_t get_fs_type(int fd) @@ -509,7 +510,7 @@ int main(int argc, char **argv) int i; pagesize = getpagesize(); - nr_hugetlbsizes = detect_hugetlb_page_sizes(hugetlbsizes, + nr_hugetlbsizes = hugetlb_setup(2, hugetlbsizes, ARRAY_SIZE(hugetlbsizes)); ksft_print_header(); diff --git a/tools/testing/selftests/mm/gup_test.c b/tools/testing/selftests/mm/gup_test.c index fb8f9ae49efa..3f841a96f870 100644 --- a/tools/testing/selftests/mm/gup_test.c +++ b/tools/testing/selftests/mm/gup_test.c @@ -14,6 +14,7 @@ #include <mm/gup_test.h> #include "kselftest.h" #include "vm_util.h" +#include "hugepage_settings.h" #define MB (1UL << 20) @@ -94,6 +95,7 @@ int main(int argc, char **argv) int filed, i, opt, nr_pages = 1, thp = -1, write = 1, nthreads = 1, ret; int flags = MAP_PRIVATE; char *file = "/dev/zero"; + bool hugetlb = false; pthread_t *tid; char *p; @@ -168,6 +170,7 @@ int main(int argc, char **argv) break; case 'H': flags |= (MAP_HUGETLB | MAP_ANONYMOUS); + hugetlb = true; break; default: ksft_exit_fail_msg("Wrong argument\n"); @@ -199,6 +202,18 @@ int main(int argc, char **argv) } ksft_print_header(); + + if (hugetlb) { + unsigned long hp_size = 
default_huge_page_size(); + + if (!hp_size) + ksft_exit_skip("HugeTLB is unavailable\n"); + + size = (size + hp_size - 1) & ~(hp_size - 1); + if (!hugetlb_setup_default(size / hp_size)) + ksft_exit_skip("Not enough huge pages\n"); + } + ksft_set_plan(nthreads); filed = open(file, O_RDWR|O_CREAT, 0664); diff --git a/tools/testing/selftests/mm/hmm-tests.c b/tools/testing/selftests/mm/hmm-tests.c index 788689497e92..e4c49699f3f7 100644 --- a/tools/testing/selftests/mm/hmm-tests.c +++ b/tools/testing/selftests/mm/hmm-tests.c @@ -11,6 +11,7 @@ */ #include "kselftest_harness.h" +#include "hugepage_settings.h" #include <errno.h> #include <fcntl.h> @@ -21,13 +22,13 @@ #include <strings.h> #include <time.h> #include <pthread.h> +#include <limits.h> #include <sys/types.h> #include <sys/stat.h> #include <sys/mman.h> #include <sys/ioctl.h> #include <sys/time.h> - /* * This is a private UAPI to the kernel test module so it isn't exported * in the usual include/uapi/... directory. @@ -69,6 +70,9 @@ enum { #ifndef FOLL_LONGTERM #define FOLL_LONGTERM 0x100 /* mapping lifetime is indefinite */ #endif + +HUGETLB_SETUP_DEFAULT_PAGES(1) + FIXTURE(hmm) { int fd; @@ -632,7 +636,7 @@ TEST_F(hmm, anon_write_child) } close(child_fd); - exit(0); + _exit(0); } } } @@ -712,7 +716,7 @@ TEST_F(hmm, anon_write_child_shared) ASSERT_EQ(ptr[i], -i); close(child_fd); - exit(0); + _exit(0); } /* @@ -784,8 +788,8 @@ TEST_F(hmm, anon_write_hugetlbfs) int *ptr; int ret; - if (!default_hsize) - SKIP(return, "Huge page size could not be determined"); + if (!hugetlb_free_default_pages()) + SKIP(return, "Not enough huge pages"); size = ALIGN(TWOMEG, default_hsize); npages = size >> self->page_shift; @@ -986,6 +990,56 @@ TEST_F(hmm, migrate) } /* + * Migrate private file memory to device private memory. 
+ */ +TEST_F(hmm, migrate_file_private) +{ + struct hmm_buffer *buffer; + unsigned long npages; + unsigned long size; + unsigned long i; + int *ptr; + int ret; + int fd; + + npages = ALIGN(HMM_BUFFER_SIZE, self->page_size) >> self->page_shift; + ASSERT_NE(npages, 0); + size = npages << self->page_shift; + + fd = hmm_create_file(size); + ASSERT_GE(fd, 0); + + buffer = malloc(sizeof(*buffer)); + ASSERT_NE(buffer, NULL); + + buffer->fd = fd; + buffer->size = size; + buffer->mirror = malloc(size); + ASSERT_NE(buffer->mirror, NULL); + + buffer->ptr = mmap(NULL, size, + PROT_READ | PROT_WRITE, + MAP_PRIVATE, + buffer->fd, 0); + ASSERT_NE(buffer->ptr, MAP_FAILED); + + /* Initialize buffer in system memory. */ + for (i = 0, ptr = buffer->ptr; i < size / sizeof(*ptr); ++i) + ptr[i] = i; + + /* Migrate memory to device. */ + ret = hmm_migrate_sys_to_dev(self->fd, buffer, npages); + ASSERT_EQ(ret, 0); + ASSERT_EQ(buffer->cpages, npages); + + /* Check what the device read. */ + for (i = 0, ptr = buffer->mirror; i < size / sizeof(*ptr); ++i) + ASSERT_EQ(ptr[i], i); + + hmm_buffer_free(buffer); +} + +/* * Migrate anonymous memory to device private memory and fault some of it back * to system memory, then try migrating the resulting mix of system and device * private memory to the device. @@ -1549,8 +1603,8 @@ TEST_F(hmm2, snapshot) } /* - * Test the hmm_range_fault() HMM_PFN_PMD flag for large pages that - * should be mapped by a large page table entry. + * Test the hmm_range_fault() handling of large pages (PMD or PUD) + * that should be mapped by a large page table entry. */ TEST_F(hmm, compound) { @@ -1560,13 +1614,13 @@ TEST_F(hmm, compound) unsigned long default_hsize = default_huge_page_size(); int *ptr; unsigned char *m; + unsigned char prot; int ret; unsigned long i; /* Skip test if we can't allocate a hugetlbfs page. 
*/ - - if (!default_hsize) - SKIP(return, "Huge page size could not be determined"); + if (!hugetlb_free_default_pages()) + SKIP(return, "Not enough huge pages"); size = ALIGN(TWOMEG, default_hsize); npages = size >> self->page_shift; @@ -1596,11 +1650,20 @@ TEST_F(hmm, compound) ASSERT_EQ(ret, 0); ASSERT_EQ(buffer->cpages, npages); - /* Check what the device saw. */ + /* + * Check what the device saw. The region is backed by a single huge + * page that the device reports either at PMD or at PUD level depending + * on the configured default hugepage size. Determine that level from + * the first page and require every page in the range to match it + * exactly, so that a fragmented mapping mixing levels (or a missing + * large-page bit) is still caught and reported with its actual value. + */ m = buffer->mirror; + prot = HMM_DMIRROR_PROT_WRITE | + ((m[0] & HMM_DMIRROR_PROT_PUD) ? HMM_DMIRROR_PROT_PUD : + HMM_DMIRROR_PROT_PMD); for (i = 0; i < npages; ++i) - ASSERT_EQ(m[i], HMM_DMIRROR_PROT_WRITE | - HMM_DMIRROR_PROT_PMD); + ASSERT_EQ(m[i], prot); /* Make the region read-only. */ ret = mprotect(buffer->ptr, size, PROT_READ); @@ -1611,11 +1674,17 @@ TEST_F(hmm, compound) ASSERT_EQ(ret, 0); ASSERT_EQ(buffer->cpages, npages); - /* Check what the device saw. */ + /* + * Check what the device saw after mprotect(PROT_READ). Same + * approach as above: determine the mapping level from the first + * page and require every page to match it exactly. + */ m = buffer->mirror; + prot = HMM_DMIRROR_PROT_READ | + ((m[0] & HMM_DMIRROR_PROT_PUD) ? 
HMM_DMIRROR_PROT_PUD : + HMM_DMIRROR_PROT_PMD); for (i = 0; i < npages; ++i) - ASSERT_EQ(m[i], HMM_DMIRROR_PROT_READ | - HMM_DMIRROR_PROT_PMD); + ASSERT_EQ(m[i], prot); munmap(buffer->ptr, buffer->size); buffer->ptr = NULL; @@ -1815,6 +1884,8 @@ TEST_F(hmm, exclusive_cow) unsigned long i; int *ptr; int ret; + pid_t pid; + int status; npages = ALIGN(HMM_BUFFER_SIZE, self->page_size) >> self->page_shift; ASSERT_NE(npages, 0); @@ -1843,14 +1914,37 @@ TEST_F(hmm, exclusive_cow) ASSERT_EQ(ret, 0); ASSERT_EQ(buffer->cpages, npages); - fork(); + pid = fork(); + if (pid == -1) + ASSERT_EQ(pid, 0); - /* Fault pages back to system memory and check them. */ + if (pid == 0) { + /* + * Child verifies COW independently, then _exit(0)s so it does + * not run the test teardown. A failed ASSERT_* here makes the + * harness abort() the child, so the parent sees + * !WIFEXITED(status) below and fails in turn. + */ + for (i = 0, ptr = buffer->ptr; i < size / sizeof(*ptr); ++i) + ASSERT_EQ(ptr[i]++, i); + + for (i = 0, ptr = buffer->ptr; i < size / sizeof(*ptr); ++i) + ASSERT_EQ(ptr[i], i + 1); + + _exit(0); + } + + /* Parent: also increment to verify COW works for both processes. */ for (i = 0, ptr = buffer->ptr; i < size / sizeof(*ptr); ++i) ASSERT_EQ(ptr[i]++, i); for (i = 0, ptr = buffer->ptr; i < size / sizeof(*ptr); ++i) - ASSERT_EQ(ptr[i], i+1); + ASSERT_EQ(ptr[i], i + 1); + + /* Parent: wait for child and then free the buffer. */ + ASSERT_EQ(waitpid(pid, &status, 0), pid); + ASSERT_TRUE(WIFEXITED(status)); + ASSERT_EQ(WEXITSTATUS(status), 0); hmm_buffer_free(buffer); } @@ -2012,7 +2106,7 @@ TEST_F(hmm, hmm_cow_in_device) if (pid == -1) ASSERT_EQ(pid, 0); if (!pid) { - /* Child process waits for SIGTERM from the parent. */ + /* Child process waits for SIGKILL from the parent. 
*/ while (1) { } /* Should not reach this */ @@ -2025,10 +2119,10 @@ TEST_F(hmm, hmm_cow_in_device) ptr[i] = i; /* Terminate child and wait */ - EXPECT_EQ(0, kill(pid, SIGTERM)); + EXPECT_EQ(0, kill(pid, SIGKILL)); EXPECT_EQ(pid, waitpid(pid, &status, 0)); EXPECT_NE(0, WIFSIGNALED(status)); - EXPECT_EQ(SIGTERM, WTERMSIG(status)); + EXPECT_EQ(SIGKILL, WTERMSIG(status)); /* Take snapshot to CPU pagetables */ ret = hmm_dmirror_cmd(self->fd, HMM_DMIRROR_SNAPSHOT, buffer, npages); @@ -2224,8 +2318,11 @@ TEST_F(hmm, migrate_anon_huge_fault) unsigned long npages; unsigned long size; unsigned long i; + unsigned char *m; + uint64_t entry; void *old_ptr; void *map; + int pagemap_fd; int *ptr; int ret; @@ -2248,8 +2345,6 @@ TEST_F(hmm, migrate_anon_huge_fault) npages = size >> self->page_shift; map = (void *)ALIGN((uintptr_t)buffer->ptr, size); - ret = madvise(map, size, MADV_HUGEPAGE); - ASSERT_EQ(ret, 0); old_ptr = buffer->ptr; buffer->ptr = map; @@ -2257,6 +2352,9 @@ TEST_F(hmm, migrate_anon_huge_fault) for (i = 0, ptr = buffer->ptr; i < size / sizeof(*ptr); ++i) ptr[i] = i; + ret = madvise(map, size, MADV_COLLAPSE); + ASSERT_EQ(ret, 0); + /* Migrate memory to device. 
*/ ret = hmm_migrate_sys_to_dev(self->fd, buffer, npages); ASSERT_EQ(ret, 0); @@ -2266,6 +2364,32 @@ TEST_F(hmm, migrate_anon_huge_fault) for (i = 0, ptr = buffer->mirror; i < size / sizeof(*ptr); ++i) ASSERT_EQ(ptr[i], i); + if (!hmm_is_coherent_type(variant->device_number)) { + ret = hmm_dmirror_cmd(self->fd, HMM_DMIRROR_SNAPSHOT, + buffer, npages); + ASSERT_EQ(ret, 0); + ASSERT_EQ(buffer->cpages, npages); + + m = buffer->mirror; + for (i = 0; i < npages; ++i) + ASSERT_EQ(m[i], HMM_DMIRROR_PROT_DEV_PRIVATE_LOCAL | + HMM_DMIRROR_PROT_WRITE | + HMM_DMIRROR_PROT_PMD); + + pagemap_fd = open("/proc/self/pagemap", O_RDONLY); + ASSERT_GE(pagemap_fd, 0); + + for (i = 0; i < npages; ++i) { + entry = pagemap_get_entry(pagemap_fd, + (char *)buffer->ptr + i * self->page_size); + + ASSERT_NE(entry & PM_SWAP, 0); + ASSERT_FALSE(PAGEMAP_PRESENT(entry)); + } + + close(pagemap_fd); + } + /* Fault pages back to system memory and check them. */ for (i = 0, ptr = buffer->ptr; i < size / sizeof(*ptr); ++i) ASSERT_EQ(ptr[i], i); @@ -2282,12 +2406,21 @@ TEST_F(hmm, migrate_partial_unmap_fault) struct hmm_buffer *buffer; unsigned long npages; unsigned long size = read_pmd_pagesize(); + unsigned long unmap_size; + unsigned long offsets[3]; unsigned long i; void *old_ptr; void *map; int *ptr; int ret, j, use_thp; - int offsets[] = { 0, 512 * ONEKB, ONEMEG }; + + if (!size) + size = TWOMEG; + + unmap_size = size / 2; + offsets[0] = 0; + offsets[1] = size / 4; + offsets[2] = size / 2; for (use_thp = 0; use_thp < 2; ++use_thp) { for (j = 0; j < ARRAY_SIZE(offsets); ++j) { @@ -2329,12 +2462,12 @@ TEST_F(hmm, migrate_partial_unmap_fault) for (i = 0, ptr = buffer->mirror; i < size / sizeof(*ptr); ++i) ASSERT_EQ(ptr[i], i); - munmap(buffer->ptr + offsets[j], ONEMEG); + munmap(buffer->ptr + offsets[j], unmap_size); /* Fault pages back to system memory and check them. 
*/ for (i = 0, ptr = buffer->ptr; i < size / sizeof(*ptr); ++i) if (i * sizeof(int) < offsets[j] || - i * sizeof(int) >= offsets[j] + ONEMEG) + i * sizeof(int) >= offsets[j] + unmap_size) ASSERT_EQ(ptr[i], i); buffer->ptr = old_ptr; @@ -2348,12 +2481,19 @@ TEST_F(hmm, migrate_remap_fault) struct hmm_buffer *buffer; unsigned long npages; unsigned long size = read_pmd_pagesize(); + unsigned long offsets[3]; unsigned long i; void *old_ptr, *new_ptr = NULL; void *map; int *ptr; int ret, j, use_thp, dont_unmap, before; - int offsets[] = { 0, 512 * ONEKB, ONEMEG }; + + if (!size) + size = TWOMEG; + + offsets[0] = 0; + offsets[1] = size / 4; + offsets[2] = size / 2; for (before = 0; before < 2; ++before) { for (dont_unmap = 0; dont_unmap < 2; ++dont_unmap) { @@ -2688,7 +2828,7 @@ static inline int run_migration_benchmark(int fd, int use_thp, size_t buffer_siz buffer->ptr = mmap(NULL, buffer_size, PROT_READ | PROT_WRITE, MAP_PRIVATE | MAP_ANONYMOUS, -1, 0); - if (!buffer->ptr) + if (buffer->ptr == MAP_FAILED) return -1; /* Apply THP hint if requested */ @@ -2756,38 +2896,45 @@ static inline int run_migration_benchmark(int fd, int use_thp, size_t buffer_siz TEST_F_TIMEOUT(hmm, benchmark_thp_migration, 120) { struct benchmark_results thp_results, regular_results; - size_t thp_size = 2 * 1024 * 1024; /* 2MB - typical THP size */ + size_t thp_size = read_pmd_pagesize(); int iterations = 5; + if (!thp_size) + thp_size = TWOMEG; + printf("\nHMM THP Migration Benchmark\n"); printf("---------------------------\n"); printf("System page size: %ld bytes\n", sysconf(_SC_PAGESIZE)); /* Test different buffer sizes */ size_t test_sizes[] = { - thp_size / 4, /* 512KB - smaller than THP */ - thp_size / 2, /* 1MB - half THP */ - thp_size, /* 2MB - single THP */ - thp_size * 2, /* 4MB - two THPs */ - thp_size * 4, /* 8MB - four THPs */ - thp_size * 8, /* 16MB - eight THPs */ - thp_size * 128, /* 256MB - one twenty eight THPs */ + thp_size / 4, /* quarter THP */ + thp_size / 2, /* half THP */ 
+ thp_size, /* single THP */ + thp_size * 2, /* two THPs */ + thp_size * 4, /* four THPs */ + thp_size * 8, /* eight THPs */ + thp_size * 128, /* one twenty eight THPs */ }; static const char *const test_names[] = { - "Small Buffer (512KB)", - "Half THP Size (1MB)", - "Single THP Size (2MB)", - "Two THP Size (4MB)", - "Four THP Size (8MB)", - "Eight THP Size (16MB)", - "One twenty eight THP Size (256MB)" + "Small Buffer", + "Half THP Size", + "Single THP Size", + "Two THP Size", + "Four THP Size", + "Eight THP Size", + "One twenty eight THP Size" }; int num_tests = ARRAY_SIZE(test_sizes); /* Run all tests */ for (int i = 0; i < num_tests; i++) { + /* Skip test sizes exceeding INT_MAX to avoid overflow */ + if (test_sizes[i] > INT_MAX) + break; + /* Test with THP */ ASSERT_EQ(run_migration_benchmark(self->fd, 1, test_sizes[i], iterations, &thp_results), 0); diff --git a/tools/testing/selftests/mm/hugepage-mmap.c b/tools/testing/selftests/mm/hugepage-mmap.c deleted file mode 100644 index d543419de040..000000000000 --- a/tools/testing/selftests/mm/hugepage-mmap.c +++ /dev/null @@ -1,78 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -/* - * hugepage-mmap: - * - * Example of using huge page memory in a user application using the mmap - * system call. Before running this application, make sure that the - * administrator has mounted the hugetlbfs filesystem (on some directory - * like /mnt) using the command mount -t hugetlbfs nodev /mnt. In this - * example, the app is requesting memory of size 256MB that is backed by - * huge pages. 
- */ -#define _GNU_SOURCE -#include <stdlib.h> -#include <stdio.h> -#include <unistd.h> -#include <sys/mman.h> -#include <fcntl.h> -#include "kselftest.h" - -#define LENGTH (256UL*1024*1024) -#define PROTECTION (PROT_READ | PROT_WRITE) - -static void check_bytes(char *addr) -{ - ksft_print_msg("First hex is %x\n", *((unsigned int *)addr)); -} - -static void write_bytes(char *addr) -{ - unsigned long i; - - for (i = 0; i < LENGTH; i++) - *(addr + i) = (char)i; -} - -static int read_bytes(char *addr) -{ - unsigned long i; - - check_bytes(addr); - for (i = 0; i < LENGTH; i++) - if (*(addr + i) != (char)i) { - ksft_print_msg("Error: Mismatch at %lu\n", i); - return 1; - } - return 0; -} - -int main(void) -{ - void *addr; - int fd, ret; - - ksft_print_header(); - ksft_set_plan(1); - - fd = memfd_create("hugepage-mmap", MFD_HUGETLB); - if (fd < 0) - ksft_exit_fail_msg("memfd_create() failed: %s\n", strerror(errno)); - - addr = mmap(NULL, LENGTH, PROTECTION, MAP_SHARED, fd, 0); - if (addr == MAP_FAILED) { - close(fd); - ksft_exit_fail_msg("mmap(): %s\n", strerror(errno)); - } - - ksft_print_msg("Returned address is %p\n", addr); - check_bytes(addr); - write_bytes(addr); - ret = read_bytes(addr); - - munmap(addr, LENGTH); - close(fd); - - ksft_test_result(!ret, "Read same data\n"); - - ksft_exit(!ret); -} diff --git a/tools/testing/selftests/mm/thp_settings.c b/tools/testing/selftests/mm/hugepage_settings.c index e748ebfb3d4e..2eab2110ac6a 100644 --- a/tools/testing/selftests/mm/thp_settings.c +++ b/tools/testing/selftests/mm/hugepage_settings.c @@ -1,13 +1,16 @@ // SPDX-License-Identifier: GPL-2.0 +#include <dirent.h> #include <fcntl.h> #include <limits.h> +#include <signal.h> +#include <stddef.h> #include <stdio.h> #include <stdlib.h> #include <string.h> #include <unistd.h> #include "vm_util.h" -#include "thp_settings.h" +#include "hugepage_settings.h" #define THP_SYSFS "/sys/kernel/mm/transparent_hugepage/" #define MAX_SETTINGS_DEPTH 4 @@ -15,6 +18,7 @@ static struct 
thp_settings settings_stack[MAX_SETTINGS_DEPTH]; static int settings_index; static struct thp_settings saved_settings; static char dev_queue_read_ahead_path[PATH_MAX]; +static bool thp_settings_saved; static const char * const thp_enabled_strings[] = { "never", @@ -44,47 +48,6 @@ static const char * const shmem_enabled_strings[] = { NULL }; -int read_file(const char *path, char *buf, size_t buflen) -{ - int fd; - ssize_t numread; - - fd = open(path, O_RDONLY); - if (fd == -1) - return 0; - - numread = read(fd, buf, buflen - 1); - if (numread < 1) { - close(fd); - return 0; - } - - buf[numread] = '\0'; - close(fd); - - return (unsigned int) numread; -} - -unsigned long read_num(const char *path) -{ - char buf[21]; - - if (read_file(path, buf, sizeof(buf)) < 0) { - perror("read_file()"); - exit(EXIT_FAILURE); - } - - return strtoul(buf, NULL, 10); -} - -void write_num(const char *path, unsigned long num) -{ - char buf[21]; - - sprintf(buf, "%ld", num); - write_file(path, buf, strlen(buf) + 1); -} - int thp_read_string(const char *name, const char * const strings[]) { char path[PATH_MAX]; @@ -298,12 +261,20 @@ void thp_pop_settings(void) void thp_restore_settings(void) { - thp_write_settings(&saved_settings); + if (thp_settings_saved) + thp_write_settings(&saved_settings); } -void thp_save_settings(void) +static void __thp_save_settings(void) { + if (!thp_available()) + return; + + if (thp_settings_saved) + return; + thp_read_settings(&saved_settings); + thp_settings_saved = true; } void thp_set_read_ahead_path(char *path) @@ -370,3 +341,260 @@ bool thp_is_enabled(void) /* THP is considered enabled if it's either "always" or "madvise" */ return mode == 1 || mode == 3; } + +#define HUGETLB_MAX_NR_PAGESIZES 10 +struct hugetlb_settings { + unsigned long nr_hugepages[HUGETLB_MAX_NR_PAGESIZES]; + unsigned long sizes[HUGETLB_MAX_NR_PAGESIZES]; + unsigned long default_size; + int nr_sizes; +}; + +static struct hugetlb_settings hugetlb_saved_settings; +static bool 
hugetlb_settings_saved; + +int detect_hugetlb_page_sizes(unsigned long sizes[], int max) +{ + static struct hugetlb_settings *settings = &hugetlb_saved_settings; + DIR *dir; + int count = 0; + + if (settings->nr_sizes) { + if (settings->nr_sizes < max) + max = settings->nr_sizes; + for (count = 0; count < max; count++) + sizes[count] = settings->sizes[count]; + return count; + } + + dir = opendir("/sys/kernel/mm/hugepages/"); + if (!dir) + return 0; + + while (count < max) { + struct dirent *entry = readdir(dir); + size_t kb; + + if (!entry) + break; + if (entry->d_type != DT_DIR) + continue; + if (sscanf(entry->d_name, "hugepages-%zukB", &kb) != 1) + continue; + sizes[count++] = kb * 1024; + ksft_print_msg("[INFO] detected hugetlb page size: %zu KiB\n", + kb); + } + closedir(dir); + return count; +} + +unsigned long default_huge_page_size(void) +{ + static struct hugetlb_settings *settings = &hugetlb_saved_settings; + unsigned long hps = 0; + char *line = NULL; + size_t linelen = 0; + FILE *f; + + if (settings->default_size) + return settings->default_size; + + f = fopen("/proc/meminfo", "r"); + if (!f) + return 0; + while (getline(&line, &linelen, f) > 0) { + if (sscanf(line, "Hugepagesize: %lu kB", &hps) == 1) { + hps <<= 10; + break; + } + } + + free(line); + fclose(f); + return hps; +} + +static void hugetlb_sysfs_path(char *buf, size_t buflen, + unsigned long size, const char *attr) +{ + snprintf(buf, buflen, "/sys/kernel/mm/hugepages/hugepages-%lukB/%s", + size / 1024, attr); +} + +unsigned long hugetlb_nr_pages(unsigned long size) +{ + char path[PATH_MAX]; + + hugetlb_sysfs_path(path, sizeof(path), size, "nr_hugepages"); + + return read_num(path); +} + +void hugetlb_set_nr_pages(unsigned long size, unsigned long nr) +{ + char path[PATH_MAX]; + + hugetlb_sysfs_path(path, sizeof(path), size, "nr_hugepages"); + + write_num(path, nr); +} + +unsigned long hugetlb_free_pages(unsigned long size) +{ + char path[PATH_MAX]; + + hugetlb_sysfs_path(path, sizeof(path), 
size, "free_hugepages"); + + return read_num(path); +} + +static bool __hugetlb_setup(unsigned long size, unsigned long nr) +{ + unsigned long free = hugetlb_free_pages(size); + unsigned long total = hugetlb_nr_pages(size); + + if (free >= nr) + return true; + + hugetlb_set_nr_pages(size, total + (nr - free)); + + return hugetlb_free_pages(size) >= nr; +} + +bool hugetlb_setup_default(unsigned long nr) +{ + unsigned long size; + + hugetlb_save_settings(); + size = default_huge_page_size(); + if (!size) + return false; + + return __hugetlb_setup(size, nr); +} + +bool hugetlb_setup_default_exact(unsigned long nr) +{ + unsigned long size; + + hugetlb_save_settings(); + size = default_huge_page_size(); + if (!size) + return false; + + hugetlb_set_nr_pages(size, nr); + + return hugetlb_free_pages(size) == nr; +} + +unsigned long hugetlb_setup(unsigned long nr, unsigned long sizes[], + int max) +{ + unsigned long enabled[10]; + int nr_sizes = 0; + int nr_enabled; + + hugetlb_save_settings(); + + nr_enabled = detect_hugetlb_page_sizes(enabled, ARRAY_SIZE(enabled)); + if (!nr_enabled) + return 0; + + if (nr_enabled > max) { + ksft_print_msg("detected %d huge page sizes, will only test %d\n", nr_enabled, max); + nr_enabled = max; + } + + /* request nr HugeTLB pages of every size. 
*/ + for (int i = 0; i < nr_enabled; i++) { + if (!__hugetlb_setup(enabled[i], nr)) + continue; + sizes[nr_sizes++] = enabled[i]; + } + + return nr_sizes; +} + +static void __hugetlb_save_settings(void) +{ + struct hugetlb_settings *settings = &hugetlb_saved_settings; + int nr_sizes; + + if (hugetlb_settings_saved) + return; + + settings->default_size = default_huge_page_size(); + if (!settings->default_size) + return; + + nr_sizes = detect_hugetlb_page_sizes(settings->sizes, + HUGETLB_MAX_NR_PAGESIZES); + if (!nr_sizes) { + settings->default_size = 0; + return; + } + + for (int i = 0; i < nr_sizes; i++) { + unsigned long sz = settings->sizes[i]; + + if (!sz) + continue; + settings->nr_hugepages[i] = hugetlb_nr_pages(sz); + } + + settings->nr_sizes = nr_sizes; + hugetlb_settings_saved = true; +} + +void hugetlb_restore_settings(void) +{ + struct hugetlb_settings *settings = &hugetlb_saved_settings; + + if (!hugetlb_settings_saved || !settings->default_size) + return; + + for (int i = 0; i < HUGETLB_MAX_NR_PAGESIZES; i++) { + unsigned long sz = settings->sizes[i]; + + if (!sz) + continue; + + hugetlb_set_nr_pages(sz, settings->nr_hugepages[i]); + } +} + +static void hugepage_restore_settings_atexit(void) +{ + if (thp_settings_saved) + thp_restore_settings(); + if (hugetlb_settings_saved) + hugetlb_restore_settings(); +} + +static void hugepage_restore_settings_sighandler(int sig) +{ + /* exit() will invoke the hugepage_restore_settings_atexit handler. 
*/ + exit(KSFT_FAIL); +} + +void hugepage_save_settings(bool thp, bool hugetlb) +{ + if (!thp && !hugetlb) + return; + + if (thp) + __thp_save_settings(); + if (hugetlb) + __hugetlb_save_settings(); + + /* + * setup exit hooks to make sure THP and HugeTLB settings are + * restored on graceful and error exits and signals + */ + atexit(hugepage_restore_settings_atexit); + signal(SIGTERM, hugepage_restore_settings_sighandler); + signal(SIGINT, hugepage_restore_settings_sighandler); + signal(SIGHUP, hugepage_restore_settings_sighandler); + signal(SIGQUIT, hugepage_restore_settings_sighandler); +} diff --git a/tools/testing/selftests/mm/thp_settings.h b/tools/testing/selftests/mm/hugepage_settings.h index 7748a9009191..726c73c43c05 100644 --- a/tools/testing/selftests/mm/thp_settings.h +++ b/tools/testing/selftests/mm/hugepage_settings.h @@ -1,11 +1,15 @@ /* SPDX-License-Identifier: GPL-2.0 */ -#ifndef __THP_SETTINGS_H__ -#define __THP_SETTINGS_H__ +#ifndef __HUGEPAGE_SETTINGS_H__ +#define __HUGEPAGE_SETTINGS_H__ #include <stdbool.h> #include <stddef.h> #include <stdint.h> +void hugepage_save_settings(bool thp, bool hugetlb); + +/* Transparent Huge Pages (THP) */ + enum thp_enabled { THP_NEVER, THP_ALWAYS, @@ -62,10 +66,6 @@ struct thp_settings { struct shmem_hugepages_settings shmem_hugepages[NR_ORDERS]; }; -int read_file(const char *path, char *buf, size_t buflen); -unsigned long read_num(const char *path); -void write_num(const char *path, unsigned long num); - int thp_read_string(const char *name, const char * const strings[]); void thp_write_string(const char *name, const char *val); unsigned long thp_read_num(const char *name); @@ -77,7 +77,11 @@ struct thp_settings *thp_current_settings(void); void thp_push_settings(struct thp_settings *settings); void thp_pop_settings(void); void thp_restore_settings(void); -void thp_save_settings(void); + +static inline void thp_save_settings(void) +{ + hugepage_save_settings(/* thp = */ true, /* hugetlb = */ false); +} void 
thp_set_read_ahead_path(char *path); unsigned long thp_supported_orders(void); @@ -86,4 +90,66 @@ unsigned long thp_shmem_supported_orders(void); bool thp_available(void); bool thp_is_enabled(void); -#endif /* __THP_SETTINGS_H__ */ +/* HugeTLB */ + +int detect_hugetlb_page_sizes(unsigned long sizes[], int max); +unsigned long default_huge_page_size(void); + +unsigned long hugetlb_nr_pages(unsigned long size); +void hugetlb_set_nr_pages(unsigned long size, unsigned long nr); +unsigned long hugetlb_free_pages(unsigned long size); + +static inline void hugetlb_save_settings(void) +{ + hugepage_save_settings(/* thp = */ false, /* hugetlb = */ true); +} + +void hugetlb_restore_settings(void); + +static inline unsigned long hugetlb_nr_default_pages(void) +{ + unsigned long size = default_huge_page_size(); + + if (!size) + return 0; + + return hugetlb_nr_pages(size); +} + +static inline void hugetlb_set_nr_default_pages(unsigned long nr) +{ + unsigned long size = default_huge_page_size(); + + if (!size) + return; + + hugetlb_set_nr_pages(size, nr); +} + +static inline unsigned long hugetlb_free_default_pages(void) +{ + unsigned long size = default_huge_page_size(); + + if (!size) + return 0; + + return hugetlb_free_pages(size); +} + +static inline bool hugetlb_available(void) +{ + return default_huge_page_size() != 0; +} + +bool hugetlb_setup_default(unsigned long nr); +bool hugetlb_setup_default_exact(unsigned long nr); +unsigned long hugetlb_setup(unsigned long nr, unsigned long sizes[], + int max); + +#define HUGETLB_SETUP_DEFAULT_PAGES(nr_pages) \ +static void __attribute__((constructor)) __hugetlb_setup_default(void) \ +{ \ + hugetlb_setup_default((nr_pages)); \ +} + +#endif /* __HUGEPAGE_SETTINGS_H__ */ diff --git a/tools/testing/selftests/mm/hugetlb-madvise.c b/tools/testing/selftests/mm/hugetlb-madvise.c index 5b12041fa310..555b4b3d1430 100644 --- a/tools/testing/selftests/mm/hugetlb-madvise.c +++ b/tools/testing/selftests/mm/hugetlb-madvise.c @@ -1,15 +1,9 @@ // 
SPDX-License-Identifier: GPL-2.0 /* - * hugepage-madvise: + * hugetlb-madvise: * * Basic functional testing of madvise MADV_DONTNEED and MADV_REMOVE * on hugetlb mappings. - * - * Before running this test, make sure the administrator has pre-allocated - * at least MIN_FREE_PAGES hugetlb pages and they are free. In addition, - * the test takes an argument that is the path to a file in a hugetlbfs - * filesystem. Therefore, a hugetlbfs filesystem must be mounted on some - * directory. */ #define _GNU_SOURCE @@ -20,18 +14,18 @@ #include <fcntl.h> #include "vm_util.h" #include "kselftest.h" +#include "hugepage_settings.h" #define MIN_FREE_PAGES 20 #define NR_HUGE_PAGES 10 /* common number of pages to map/allocate */ #define validate_free_pages(exp_free) \ do { \ - int fhp = get_free_hugepages(); \ - if (fhp != (exp_free)) { \ - printf("Unexpected number of free huge " \ - "pages line %d\n", __LINE__); \ - exit(1); \ - } \ + unsigned long fhp = hugetlb_free_default_pages(); \ + if (fhp != (exp_free)) \ + ksft_exit_fail_msg("Unexpected number of free " \ + "huge pages %lu, expected %lu line %d\n", \ + fhp, (exp_free), __LINE__); \ } while (0) unsigned long huge_page_size; @@ -57,28 +51,24 @@ int main(int argc, char **argv) int fd; int ret; + ksft_print_header(); + ksft_set_plan(1); + huge_page_size = default_huge_page_size(); - if (!huge_page_size) { - printf("Unable to determine huge page size, exiting!\n"); - exit(1); - } + if (!huge_page_size) + ksft_exit_skip("Unable to determine huge page size\n"); + base_page_size = sysconf(_SC_PAGE_SIZE); - if (!huge_page_size) { - printf("Unable to determine base page size, exiting!\n"); - exit(1); - } + if (!base_page_size) + ksft_exit_fail_msg("Unable to determine base page size\n"); - free_hugepages = get_free_hugepages(); - if (free_hugepages < MIN_FREE_PAGES) { - printf("Not enough free huge pages to test, exiting!\n"); - exit(KSFT_SKIP); - } + if (!hugetlb_setup_default(MIN_FREE_PAGES)) + ksft_exit_skip("Not enough free 
huge pages (have %lu, need %d)\n", hugetlb_free_default_pages(), MIN_FREE_PAGES); + free_hugepages = hugetlb_free_default_pages(); fd = memfd_create(argv[0], MFD_HUGETLB); - if (fd < 0) { - perror("memfd_create() failed"); - exit(1); - } + if (fd < 0) + ksft_exit_fail_perror("memfd_create"); /* * Test validity of MADV_DONTNEED addr and length arguments. mmap @@ -90,16 +80,13 @@ int main(int argc, char **argv) PROT_READ | PROT_WRITE, MAP_PRIVATE | MAP_ANONYMOUS | MAP_HUGETLB, -1, 0); - if (addr == MAP_FAILED) { - perror("mmap"); - exit(1); - } + if (addr == MAP_FAILED) + ksft_exit_fail_perror("mmap"); + if (munmap(addr, huge_page_size) || munmap(addr + (NR_HUGE_PAGES + 1) * huge_page_size, - huge_page_size)) { - perror("munmap"); - exit(1); - } + huge_page_size)) + ksft_exit_fail_perror("munmap"); addr = addr + huge_page_size; write_fault_pages(addr, NR_HUGE_PAGES); @@ -108,20 +95,14 @@ int main(int argc, char **argv) /* addr before mapping should fail */ ret = madvise(addr - base_page_size, NR_HUGE_PAGES * huge_page_size, MADV_DONTNEED); - if (!ret) { - printf("Unexpected success of madvise call with invalid addr line %d\n", - __LINE__); - exit(1); - } + if (!ret) + ksft_exit_fail_msg("madvise with invalid addr unexpectedly succeeded line %d\n", __LINE__); /* addr + length after mapping should fail */ ret = madvise(addr, (NR_HUGE_PAGES * huge_page_size) + base_page_size, MADV_DONTNEED); - if (!ret) { - printf("Unexpected success of madvise call with invalid length line %d\n", - __LINE__); - exit(1); - } + if (!ret) + ksft_exit_fail_msg("madvise with invalid length unexpectedly succeeded line %d\n", __LINE__); (void)munmap(addr, NR_HUGE_PAGES * huge_page_size); @@ -132,10 +113,9 @@ int main(int argc, char **argv) PROT_READ | PROT_WRITE, MAP_PRIVATE | MAP_ANONYMOUS | MAP_HUGETLB, -1, 0); - if (addr == MAP_FAILED) { - perror("mmap"); - exit(1); - } + if (addr == MAP_FAILED) + ksft_exit_fail_perror("mmap"); + write_fault_pages(addr, NR_HUGE_PAGES); 
validate_free_pages(free_hugepages - NR_HUGE_PAGES); @@ -143,19 +123,14 @@ int main(int argc, char **argv) ret = madvise(addr + base_page_size, NR_HUGE_PAGES * huge_page_size - base_page_size, MADV_DONTNEED); - if (!ret) { - printf("Unexpected success of madvise call with unaligned start address %d\n", - __LINE__); - exit(1); - } + if (!ret) + ksft_exit_fail_msg("madvise with unaligned start unexpectedly succeeded line %d\n", __LINE__); /* addr + length should be aligned down to huge page size */ if (madvise(addr, ((NR_HUGE_PAGES - 1) * huge_page_size) + base_page_size, - MADV_DONTNEED)) { - perror("madvise"); - exit(1); - } + MADV_DONTNEED)) + ksft_exit_fail_perror("madvise"); /* should free all but last page in mapping */ validate_free_pages(free_hugepages - 1); @@ -170,17 +145,14 @@ int main(int argc, char **argv) PROT_READ | PROT_WRITE, MAP_PRIVATE | MAP_ANONYMOUS | MAP_HUGETLB, -1, 0); - if (addr == MAP_FAILED) { - perror("mmap"); - exit(1); - } + if (addr == MAP_FAILED) + ksft_exit_fail_perror("mmap"); + write_fault_pages(addr, NR_HUGE_PAGES); validate_free_pages(free_hugepages - NR_HUGE_PAGES); - if (madvise(addr, NR_HUGE_PAGES * huge_page_size, MADV_DONTNEED)) { - perror("madvise"); - exit(1); - } + if (madvise(addr, NR_HUGE_PAGES * huge_page_size, MADV_DONTNEED)) + ksft_exit_fail_perror("madvise"); /* should free all pages in mapping */ validate_free_pages(free_hugepages); @@ -190,29 +162,25 @@ int main(int argc, char **argv) /* * Test MADV_DONTNEED on private mapping of hugetlb file */ - if (fallocate(fd, 0, 0, NR_HUGE_PAGES * huge_page_size)) { - perror("fallocate"); - exit(1); - } + if (fallocate(fd, 0, 0, NR_HUGE_PAGES * huge_page_size)) + ksft_exit_fail_perror("fallocate"); + validate_free_pages(free_hugepages - NR_HUGE_PAGES); addr = mmap(NULL, NR_HUGE_PAGES * huge_page_size, PROT_READ | PROT_WRITE, MAP_PRIVATE, fd, 0); - if (addr == MAP_FAILED) { - perror("mmap"); - exit(1); - } + if (addr == MAP_FAILED) + ksft_exit_fail_perror("mmap"); /* read 
should not consume any pages */ read_fault_pages(addr, NR_HUGE_PAGES); validate_free_pages(free_hugepages - NR_HUGE_PAGES); /* madvise should not free any pages */ - if (madvise(addr, NR_HUGE_PAGES * huge_page_size, MADV_DONTNEED)) { - perror("madvise"); - exit(1); - } + if (madvise(addr, NR_HUGE_PAGES * huge_page_size, MADV_DONTNEED)) + ksft_exit_fail_perror("madvise"); + validate_free_pages(free_hugepages - NR_HUGE_PAGES); /* writes should allocate private pages */ @@ -220,10 +188,9 @@ int main(int argc, char **argv) validate_free_pages(free_hugepages - (2 * NR_HUGE_PAGES)); /* madvise should free private pages */ - if (madvise(addr, NR_HUGE_PAGES * huge_page_size, MADV_DONTNEED)) { - perror("madvise"); - exit(1); - } + if (madvise(addr, NR_HUGE_PAGES * huge_page_size, MADV_DONTNEED)) + ksft_exit_fail_perror("madvise"); + validate_free_pages(free_hugepages - NR_HUGE_PAGES); /* writes should allocate private pages */ @@ -238,10 +205,9 @@ int main(int argc, char **argv) * implementation. 
*/ if (fallocate(fd, FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE, - 0, NR_HUGE_PAGES * huge_page_size)) { - perror("fallocate"); - exit(1); - } + 0, NR_HUGE_PAGES * huge_page_size)) + ksft_exit_fail_perror("fallocate"); + validate_free_pages(free_hugepages); (void)munmap(addr, NR_HUGE_PAGES * huge_page_size); @@ -249,29 +215,25 @@ int main(int argc, char **argv) /* * Test MADV_DONTNEED on shared mapping of hugetlb file */ - if (fallocate(fd, 0, 0, NR_HUGE_PAGES * huge_page_size)) { - perror("fallocate"); - exit(1); - } + if (fallocate(fd, 0, 0, NR_HUGE_PAGES * huge_page_size)) + ksft_exit_fail_perror("fallocate"); + validate_free_pages(free_hugepages - NR_HUGE_PAGES); addr = mmap(NULL, NR_HUGE_PAGES * huge_page_size, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0); - if (addr == MAP_FAILED) { - perror("mmap"); - exit(1); - } + if (addr == MAP_FAILED) + ksft_exit_fail_perror("mmap"); /* write should not consume any pages */ write_fault_pages(addr, NR_HUGE_PAGES); validate_free_pages(free_hugepages - NR_HUGE_PAGES); /* madvise should not free any pages */ - if (madvise(addr, NR_HUGE_PAGES * huge_page_size, MADV_DONTNEED)) { - perror("madvise"); - exit(1); - } + if (madvise(addr, NR_HUGE_PAGES * huge_page_size, MADV_DONTNEED)) + ksft_exit_fail_perror("madvise"); + validate_free_pages(free_hugepages - NR_HUGE_PAGES); /* @@ -279,29 +241,25 @@ int main(int argc, char **argv) * * madvise is same as hole punch and should free all pages. 
*/ - if (madvise(addr, NR_HUGE_PAGES * huge_page_size, MADV_REMOVE)) { - perror("madvise"); - exit(1); - } + if (madvise(addr, NR_HUGE_PAGES * huge_page_size, MADV_REMOVE)) + ksft_exit_fail_perror("madvise"); + validate_free_pages(free_hugepages); (void)munmap(addr, NR_HUGE_PAGES * huge_page_size); /* * Test MADV_REMOVE on shared and private mapping of hugetlb file */ - if (fallocate(fd, 0, 0, NR_HUGE_PAGES * huge_page_size)) { - perror("fallocate"); - exit(1); - } + if (fallocate(fd, 0, 0, NR_HUGE_PAGES * huge_page_size)) + ksft_exit_fail_perror("fallocate"); + validate_free_pages(free_hugepages - NR_HUGE_PAGES); addr = mmap(NULL, NR_HUGE_PAGES * huge_page_size, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0); - if (addr == MAP_FAILED) { - perror("mmap"); - exit(1); - } + if (addr == MAP_FAILED) + ksft_exit_fail_perror("mmap"); /* shared write should not consume any additional pages */ write_fault_pages(addr, NR_HUGE_PAGES); @@ -310,10 +268,8 @@ int main(int argc, char **argv) addr2 = mmap(NULL, NR_HUGE_PAGES * huge_page_size, PROT_READ | PROT_WRITE, MAP_PRIVATE, fd, 0); - if (addr2 == MAP_FAILED) { - perror("mmap"); - exit(1); - } + if (addr2 == MAP_FAILED) + ksft_exit_fail_perror("mmap"); /* private read should not consume any pages */ read_fault_pages(addr2, NR_HUGE_PAGES); @@ -324,17 +280,15 @@ int main(int argc, char **argv) validate_free_pages(free_hugepages - (2 * NR_HUGE_PAGES)); /* madvise of shared mapping should not free any pages */ - if (madvise(addr, NR_HUGE_PAGES * huge_page_size, MADV_DONTNEED)) { - perror("madvise"); - exit(1); - } + if (madvise(addr, NR_HUGE_PAGES * huge_page_size, MADV_DONTNEED)) + ksft_exit_fail_perror("madvise"); + validate_free_pages(free_hugepages - (2 * NR_HUGE_PAGES)); /* madvise of private mapping should free private pages */ - if (madvise(addr2, NR_HUGE_PAGES * huge_page_size, MADV_DONTNEED)) { - perror("madvise"); - exit(1); - } + if (madvise(addr2, NR_HUGE_PAGES * huge_page_size, MADV_DONTNEED)) + 
ksft_exit_fail_perror("madvise"); + validate_free_pages(free_hugepages - NR_HUGE_PAGES); /* private write should consume additional pages again */ @@ -346,15 +300,16 @@ int main(int argc, char **argv) * not correct. private pages should not be freed, but this is * expected. See comment associated with FALLOC_FL_PUNCH_HOLE call. */ - if (madvise(addr, NR_HUGE_PAGES * huge_page_size, MADV_REMOVE)) { - perror("madvise"); - exit(1); - } + if (madvise(addr, NR_HUGE_PAGES * huge_page_size, MADV_REMOVE)) + ksft_exit_fail_perror("madvise"); + validate_free_pages(free_hugepages); (void)munmap(addr, NR_HUGE_PAGES * huge_page_size); (void)munmap(addr2, NR_HUGE_PAGES * huge_page_size); close(fd); - return 0; + + ksft_test_result_pass("MADV_DONTNEED and MADV_REMOVE on hugetlb\n"); + ksft_finished(); } diff --git a/tools/testing/selftests/mm/hugetlb-mmap.c b/tools/testing/selftests/mm/hugetlb-mmap.c new file mode 100644 index 000000000000..0f2aad1b7dbd --- /dev/null +++ b/tools/testing/selftests/mm/hugetlb-mmap.c @@ -0,0 +1,143 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * hugetlb-mmap: + * + * Example of using huge page memory in a user application using the mmap + * system call. Before running this application, make sure that the + * administrator has mounted the hugetlbfs filesystem (on some directory + * like /mnt) using the command mount -t hugetlbfs nodev /mnt. In this + * example, the app is requesting memory of size 256MB that is backed by + * huge pages. 
+ */ +#define _GNU_SOURCE +#include <stdlib.h> +#include <stdio.h> +#include <unistd.h> +#include <sys/mman.h> +#include <fcntl.h> +#include <linux/memfd.h> +#include "vm_util.h" +#include "kselftest.h" +#include "hugepage_settings.h" + +#define LENGTH (256UL*1024*1024) +#define PROTECTION (PROT_READ | PROT_WRITE) + +static void check_bytes(char *addr) +{ + ksft_print_msg("First hex is %x\n", *((unsigned int *)addr)); +} + +static void write_bytes(char *addr, size_t length) +{ + unsigned long i; + + for (i = 0; i < length; i++) + *(addr + i) = (char)i; +} + +static bool verify_bytes(char *addr, size_t length) +{ + unsigned long i; + + check_bytes(addr); + for (i = 0; i < length; i++) + if (*(addr + i) != (char)i) { + ksft_print_msg("Error: Mismatch at %lu(%p)\n", i, addr); + return false; + } + + return true; +} + +static void test_mmap(size_t length, int mmap_flags, int fd, + const char *test_name) +{ + bool passed = true; + void *addr; + + addr = mmap(NULL, length, PROTECTION, mmap_flags, fd, 0); + if (addr == MAP_FAILED) + ksft_exit_fail_perror("mmap"); + + ksft_print_msg("Returned address is %p\n", addr); + check_bytes(addr); + write_bytes(addr, length); + if (!verify_bytes(addr, length)) + passed = false; + + /* munmap() length of MAP_HUGETLB memory must be hugepage aligned */ + if (munmap(addr, length)) + ksft_exit_fail_perror("munmap"); + + ksft_test_result(passed, "%s\n", test_name); +} + +static void test_anon_mmap(size_t length, int shift) +{ + const char *test_name = "hugetlb anonymous mmap"; + int mmap_flags = MAP_PRIVATE | MAP_ANONYMOUS | MAP_HUGETLB; + + if (shift) + mmap_flags |= (shift & MAP_HUGE_MASK) << MAP_HUGE_SHIFT; + + test_mmap(length, mmap_flags, -1, test_name); +} + +static void test_file_mmap(size_t length, int shift) +{ + const char *test_name = "hugetlb file mmap"; + int mfd_flags = MFD_HUGETLB; + int fd; + + if (shift) + mfd_flags |= (shift & MFD_HUGE_MASK) << MFD_HUGE_SHIFT; + + fd = memfd_create("hugetlb-mmap", mfd_flags); + if (fd < 
0) + ksft_exit_fail_perror("memfd_create"); + + test_mmap(length, MAP_SHARED, fd, test_name); + close(fd); +} + +int main(int argc, char **argv) +{ + size_t hugepage_size; + size_t length = LENGTH; + int shift = 0, nr; + + ksft_print_header(); + + if (argc > 1) + length = atol(argv[1]) << 20; + if (argc > 2) + shift = atoi(argv[2]); + + hugetlb_save_settings(); + if (shift) { + hugepage_size = (1UL << shift); + ksft_print_msg("%lu kB hugepages\n", 1UL << (shift - 10)); + } else { + hugepage_size = default_huge_page_size(); + if (!hugepage_size) + ksft_exit_skip("Could not detect default hugetlb page size."); + ksft_print_msg("Default size hugepages (%lu kB)\n", hugepage_size >> 10); + } + + /* munmap will fail if the length is not page aligned */ + length = (length + hugepage_size - 1) & ~(hugepage_size - 1); + nr = length / hugepage_size; + + hugetlb_set_nr_pages(hugepage_size, nr); + if (hugetlb_free_pages(hugepage_size) < nr) + ksft_exit_skip("Not enough %lu Kb pages\n", hugepage_size >> 10); + + ksft_set_plan(2); + ksft_print_msg("Mapping %lu Mbytes\n", (unsigned long)length >> 20); + + test_anon_mmap(length, shift); + test_file_mmap(length, shift); + + ksft_finished(); +} diff --git a/tools/testing/selftests/mm/hugepage-mremap.c b/tools/testing/selftests/mm/hugetlb-mremap.c index b8f7d92e5a35..ed3d92e862d8 100644 --- a/tools/testing/selftests/mm/hugepage-mremap.c +++ b/tools/testing/selftests/mm/hugetlb-mremap.c @@ -1,6 +1,6 @@ // SPDX-License-Identifier: GPL-2.0 /* - * hugepage-mremap: + * hugetlb-mremap: * * Example of remapping huge page memory in a user application using the * mremap system call. 
The path to a file in a hugetlbfs filesystem must @@ -26,12 +26,13 @@ #include <stdbool.h> #include "kselftest.h" #include "vm_util.h" +#include "hugepage_settings.h" #define DEFAULT_LENGTH_MB 10UL #define MB_TO_BYTES(x) (x * 1024 * 1024) #define PROTECTION (PROT_READ | PROT_WRITE | PROT_EXEC) -#define FLAGS (MAP_SHARED | MAP_ANONYMOUS) +#define FLAGS (MAP_HUGETLB | MAP_SHARED) static void check_bytes(char *addr) { @@ -85,31 +86,21 @@ static void register_region_with_uffd(char *addr, size_t len) if (ioctl(uffd, UFFDIO_API, &uffdio_api) == -1) ksft_exit_fail_msg("ioctl-UFFDIO_API: %s\n", strerror(errno)); - /* Create a private anonymous mapping. The memory will be - * demand-zero paged--that is, not yet allocated. When we - * actually touch the memory, it will be allocated via - * the userfaultfd. - */ - - addr = mmap(NULL, len, PROT_READ | PROT_WRITE, - MAP_PRIVATE | MAP_ANONYMOUS, -1, 0); - if (addr == MAP_FAILED) - ksft_exit_fail_msg("mmap: %s\n", strerror(errno)); - - ksft_print_msg("Address returned by mmap() = %p\n", addr); - - /* Register the memory range of the mapping we just created for - * handling by the userfaultfd object. In mode, we request to track - * missing pages (i.e., pages that have not yet been faulted in). + /* Register the passed memory range for handling by the userfaultfd object. + * In mode, we request to track missing pages + * (i.e., pages that have not yet been faulted in). 
*/ if (uffd_register(uffd, addr, len, true, false, false)) ksft_exit_fail_msg("ioctl-UFFDIO_REGISTER: %s\n", strerror(errno)); + + ksft_print_msg("Registered memory at address %p with userfaultfd\n", addr); } int main(int argc, char *argv[]) { + unsigned long hugepage_size; + int ret = 0, fd, nr; size_t length = 0; - int ret = 0, fd; ksft_print_header(); ksft_set_plan(1); @@ -125,30 +116,36 @@ int main(int argc, char *argv[]) else length = DEFAULT_LENGTH_MB; + hugepage_size = default_huge_page_size(); + if (!hugepage_size) + ksft_exit_skip("Could not detect default hugetlb page size\n"); length = MB_TO_BYTES(length); + length = (length + hugepage_size - 1) & ~(hugepage_size - 1); + nr = length / hugepage_size; + + if (!hugetlb_setup_default(nr)) + ksft_exit_skip("Not enough huge pages\n"); + fd = memfd_create(argv[0], MFD_HUGETLB); if (fd < 0) ksft_exit_fail_msg("Open failed: %s\n", strerror(errno)); /* mmap to a PUD aligned address to hopefully trigger pmd sharing. */ unsigned long suggested_addr = 0x7eaa40000000; - void *haddr = mmap((void *)suggested_addr, length, PROTECTION, - MAP_HUGETLB | MAP_SHARED | MAP_POPULATE, fd, 0); + void *haddr = mmap((void *)suggested_addr, length, PROTECTION, FLAGS, fd, 0); ksft_print_msg("Map haddr: Returned address is %p\n", haddr); if (haddr == MAP_FAILED) ksft_exit_fail_msg("mmap1: %s\n", strerror(errno)); /* mmap again to a dummy address to hopefully trigger pmd sharing. 
*/ suggested_addr = 0x7daa40000000; - void *daddr = mmap((void *)suggested_addr, length, PROTECTION, - MAP_HUGETLB | MAP_SHARED | MAP_POPULATE, fd, 0); + void *daddr = mmap((void *)suggested_addr, length, PROTECTION, FLAGS, fd, 0); ksft_print_msg("Map daddr: Returned address is %p\n", daddr); if (daddr == MAP_FAILED) ksft_exit_fail_msg("mmap3: %s\n", strerror(errno)); suggested_addr = 0x7faa40000000; - void *vaddr = - mmap((void *)suggested_addr, length, PROTECTION, FLAGS, -1, 0); + void *vaddr = mmap((void *)suggested_addr, length, PROTECTION, FLAGS, fd, 0); ksft_print_msg("Map vaddr: Returned address is %p\n", vaddr); if (vaddr == MAP_FAILED) ksft_exit_fail_msg("mmap2: %s\n", strerror(errno)); diff --git a/tools/testing/selftests/mm/hugetlb-read-hwpoison.c b/tools/testing/selftests/mm/hugetlb-read-hwpoison.c index 46230462ad48..70b24e3660c4 100644 --- a/tools/testing/selftests/mm/hugetlb-read-hwpoison.c +++ b/tools/testing/selftests/mm/hugetlb-read-hwpoison.c @@ -10,12 +10,10 @@ #include <sys/statfs.h> #include <errno.h> #include <stdbool.h> +#include <signal.h> #include "kselftest.h" -#define PREFIX " ... " -#define ERROR_PREFIX " !!! " - #define MAX_WRITE_READ_CHUNK_SIZE (getpagesize() * 16) #define MAX(a, b) (((a) > (b)) ? 
(a) : (b)) @@ -25,17 +23,22 @@ enum test_status { TEST_SKIPPED = 2, }; -static char *status_to_str(enum test_status status) +static void report_status(enum test_status status, const char *test_name, + size_t chunk_size) { switch (status) { case TEST_PASSED: - return "TEST_PASSED"; + ksft_test_result_pass("%s chunk_size=0x%lx\n", + test_name, chunk_size); + break; case TEST_FAILED: - return "TEST_FAILED"; + ksft_test_result_fail("%s chunk_size=0x%lx\n", + test_name, chunk_size); + break; case TEST_SKIPPED: - return "TEST_SKIPPED"; - default: - return "TEST_???"; + ksft_test_result_skip("%s chunk_size=0x%lx\n", + test_name, chunk_size); + break; } } @@ -58,8 +61,8 @@ static bool verify_chunk(char *buf, size_t len, char val) for (i = 0; i < len; ++i) { if (buf[i] != val) { - printf(PREFIX ERROR_PREFIX "check fail: buf[%lu] = %u != %u\n", - i, buf[i], val); + ksft_print_msg("check fail: buf[%lu] = %u != %u\n", + i, buf[i], val); return false; } } @@ -75,21 +78,21 @@ static bool seek_read_hugepage_filemap(int fd, size_t len, size_t wr_chunk_size, ssize_t total_ret_count = 0; char val = offset / wr_chunk_size + offset % wr_chunk_size; - printf(PREFIX PREFIX "init val=%u with offset=0x%lx\n", val, offset); - printf(PREFIX PREFIX "expect to read 0x%lx bytes of data in total\n", - expected); + ksft_print_msg("init val=%u with offset=0x%lx\n", val, offset); + ksft_print_msg("expect to read 0x%lx bytes of data in total\n", + expected); if (lseek(fd, offset, SEEK_SET) < 0) { - perror(PREFIX ERROR_PREFIX "seek failed"); + ksft_perror("seek failed"); return false; } while (offset + total_ret_count < len) { ret_count = read(fd, buf, wr_chunk_size); if (ret_count == 0) { - printf(PREFIX PREFIX "read reach end of the file\n"); + ksft_print_msg("read reach end of the file\n"); break; } else if (ret_count < 0) { - perror(PREFIX ERROR_PREFIX "read failed"); + ksft_perror("read failed"); break; } ++val; @@ -98,8 +101,8 @@ static bool seek_read_hugepage_filemap(int fd, size_t len, 
size_t wr_chunk_size, total_ret_count += ret_count; } - printf(PREFIX PREFIX "actually read 0x%lx bytes of data in total\n", - total_ret_count); + ksft_print_msg("actually read 0x%lx bytes of data in total\n", + total_ret_count); return total_ret_count == expected; } @@ -112,15 +115,15 @@ static bool read_hugepage_filemap(int fd, size_t len, ssize_t total_ret_count = 0; char val = 0; - printf(PREFIX PREFIX "expect to read 0x%lx bytes of data in total\n", - expected); + ksft_print_msg("expect to read 0x%lx bytes of data in total\n", + expected); while (total_ret_count < len) { ret_count = read(fd, buf, wr_chunk_size); if (ret_count == 0) { - printf(PREFIX PREFIX "read reach end of the file\n"); + ksft_print_msg("read reach end of the file\n"); break; } else if (ret_count < 0) { - perror(PREFIX ERROR_PREFIX "read failed"); + ksft_perror("read failed"); break; } ++val; @@ -129,8 +132,8 @@ static bool read_hugepage_filemap(int fd, size_t len, total_ret_count += ret_count; } - printf(PREFIX PREFIX "actually read 0x%lx bytes of data in total\n", - total_ret_count); + ksft_print_msg("actually read 0x%lx bytes of data in total\n", + total_ret_count); return total_ret_count == expected; } @@ -142,14 +145,14 @@ test_hugetlb_read(int fd, size_t len, size_t wr_chunk_size) char *filemap = NULL; if (ftruncate(fd, len) < 0) { - perror(PREFIX ERROR_PREFIX "ftruncate failed"); + ksft_perror("ftruncate failed"); return status; } filemap = mmap(NULL, len, PROT_READ | PROT_WRITE, MAP_SHARED | MAP_POPULATE, fd, 0); if (filemap == MAP_FAILED) { - perror(PREFIX ERROR_PREFIX "mmap for primary mapping failed"); + ksft_perror("mmap for primary mapping failed"); goto done; } @@ -162,7 +165,7 @@ test_hugetlb_read(int fd, size_t len, size_t wr_chunk_size) munmap(filemap, len); done: if (ftruncate(fd, 0) < 0) { - perror(PREFIX ERROR_PREFIX "ftruncate back to 0 failed"); + ksft_perror("ftruncate back to 0 failed"); status = TEST_FAILED; } @@ -179,14 +182,14 @@ test_hugetlb_read_hwpoison(int fd, 
size_t len, size_t wr_chunk_size, const unsigned long pagesize = getpagesize(); if (ftruncate(fd, len) < 0) { - perror(PREFIX ERROR_PREFIX "ftruncate failed"); + ksft_perror("ftruncate failed"); return status; } filemap = mmap(NULL, len, PROT_READ | PROT_WRITE, MAP_SHARED | MAP_POPULATE, fd, 0); if (filemap == MAP_FAILED) { - perror(PREFIX ERROR_PREFIX "mmap for primary mapping failed"); + ksft_perror("mmap for primary mapping failed"); goto done; } @@ -201,7 +204,7 @@ test_hugetlb_read_hwpoison(int fd, size_t len, size_t wr_chunk_size, */ hwp_addr = filemap + len / 2 + pagesize; if (madvise(hwp_addr, pagesize, MADV_HWPOISON) < 0) { - perror(PREFIX ERROR_PREFIX "MADV_HWPOISON failed"); + ksft_perror("MADV_HWPOISON failed"); goto unmap; } @@ -228,7 +231,7 @@ unmap: munmap(filemap, len); done: if (ftruncate(fd, 0) < 0) { - perror(PREFIX ERROR_PREFIX "ftruncate back to 0 failed"); + ksft_perror("ftruncate back to 0 failed"); status = TEST_FAILED; } @@ -241,17 +244,17 @@ static int create_hugetlbfs_file(struct statfs *file_stat) fd = memfd_create("hugetlb_tmp", MFD_HUGETLB); if (fd < 0) { - perror(PREFIX ERROR_PREFIX "could not open hugetlbfs file"); + ksft_perror("could not open hugetlbfs file"); return -1; } memset(file_stat, 0, sizeof(*file_stat)); if (fstatfs(fd, file_stat)) { - perror(PREFIX ERROR_PREFIX "fstatfs failed"); + ksft_perror("fstatfs failed"); goto close; } if (file_stat->f_type != HUGETLBFS_MAGIC) { - printf(PREFIX ERROR_PREFIX "not hugetlbfs file\n"); + ksft_print_msg("not hugetlbfs file\n"); goto close; } @@ -261,6 +264,10 @@ close: return -1; } +static void sigbus_handler(int sig) +{ +} + int main(void) { int fd; @@ -273,50 +280,44 @@ int main(void) }; size_t i; + ksft_print_header(); + ksft_set_plan(ARRAY_SIZE(wr_chunk_sizes) * 3); + + signal(SIGBUS, sigbus_handler); for (i = 0; i < ARRAY_SIZE(wr_chunk_sizes); ++i) { - printf("Write/read chunk size=0x%lx\n", - wr_chunk_sizes[i]); + ksft_print_msg("Write/read chunk size=0x%lx\n", + 
wr_chunk_sizes[i]); fd = create_hugetlbfs_file(&file_stat); if (fd < 0) - goto create_failure; - printf(PREFIX "HugeTLB read regression test...\n"); + ksft_exit_fail_msg("Failed to create hugetlbfs file\n"); + status = test_hugetlb_read(fd, file_stat.f_bsize, wr_chunk_sizes[i]); - printf(PREFIX "HugeTLB read regression test...%s\n", - status_to_str(status)); close(fd); - if (status == TEST_FAILED) - return -1; + report_status(status, "HugeTLB read regression", + wr_chunk_sizes[i]); fd = create_hugetlbfs_file(&file_stat); if (fd < 0) - goto create_failure; - printf(PREFIX "HugeTLB read HWPOISON test...\n"); + ksft_exit_fail_msg("Failed to create hugetlbfs file\n"); + status = test_hugetlb_read_hwpoison(fd, file_stat.f_bsize, wr_chunk_sizes[i], false); - printf(PREFIX "HugeTLB read HWPOISON test...%s\n", - status_to_str(status)); close(fd); - if (status == TEST_FAILED) - return -1; + report_status(status, "HugeTLB read HWPOISON", + wr_chunk_sizes[i]); fd = create_hugetlbfs_file(&file_stat); if (fd < 0) - goto create_failure; - printf(PREFIX "HugeTLB seek then read HWPOISON test...\n"); + ksft_exit_fail_msg("Failed to create hugetlbfs file\n"); + status = test_hugetlb_read_hwpoison(fd, file_stat.f_bsize, wr_chunk_sizes[i], true); - printf(PREFIX "HugeTLB seek then read HWPOISON test...%s\n", - status_to_str(status)); close(fd); - if (status == TEST_FAILED) - return -1; + report_status(status, "HugeTLB seek then read HWPOISON", + wr_chunk_sizes[i]); } - return 0; - -create_failure: - printf(ERROR_PREFIX "Abort test: failed to create hugetlbfs file\n"); - return -1; + ksft_finished(); } diff --git a/tools/testing/selftests/mm/hugepage-shm.c b/tools/testing/selftests/mm/hugetlb-shm.c index ef06260802b5..3ff7f062b7eb 100644 --- a/tools/testing/selftests/mm/hugepage-shm.c +++ b/tools/testing/selftests/mm/hugetlb-shm.c @@ -1,6 +1,6 @@ // SPDX-License-Identifier: GPL-2.0 /* - * hugepage-shm: + * hugetlb-shm: * * Example of using huge page memory in a user application using 
Sys V shared * memory system calls. In this example the app is requesting 256MB of @@ -28,9 +28,27 @@ #include <sys/shm.h> #include <sys/mman.h> +#include "vm_util.h" +#include "hugepage_settings.h" + #define LENGTH (256UL*1024*1024) -#define dprintf(x) printf(x) +static void prepare(void) +{ + unsigned long length, hugepage_size, nr; + + hugepage_size = default_huge_page_size(); + if (!hugepage_size) + ksft_exit_skip("Unable to determine huge page size\n"); + + length = (LENGTH + hugepage_size - 1) & ~(hugepage_size - 1); + nr = length / hugepage_size; + + if (!hugetlb_setup_default(nr)) + ksft_exit_skip("Not enough free huge pages\n"); + + shm_limits_prepare(length); +} int main(void) { @@ -38,44 +56,45 @@ int main(void) unsigned long i; char *shmaddr; + ksft_print_header(); + ksft_set_plan(1); + + prepare(); + shmid = shmget(2, LENGTH, SHM_HUGETLB | IPC_CREAT | SHM_R | SHM_W); - if (shmid < 0) { - perror("shmget"); - exit(1); - } - printf("shmid: 0x%x\n", shmid); + if (shmid < 0) + ksft_exit_fail_perror("shmget"); + + ksft_print_msg("shmid: 0x%x\n", shmid); shmaddr = shmat(shmid, NULL, 0); if (shmaddr == (char *)-1) { - perror("Shared memory attach failure"); + ksft_perror("Shared memory attach failure"); shmctl(shmid, IPC_RMID, NULL); - exit(2); + ksft_exit_fail(); } - printf("shmaddr: %p\n", shmaddr); + ksft_print_msg("shmaddr: %p\n", shmaddr); - dprintf("Starting the writes:\n"); - for (i = 0; i < LENGTH; i++) { + ksft_print_msg("Starting the writes:\n"); + for (i = 0; i < LENGTH; i++) shmaddr[i] = (char)(i); - if (!(i % (1024 * 1024))) - dprintf("."); - } - dprintf("\n"); - dprintf("Starting the Check..."); + ksft_print_msg("Starting the Check..."); for (i = 0; i < LENGTH; i++) - if (shmaddr[i] != (char)i) { - printf("\nIndex %lu mismatched\n", i); - exit(3); - } - dprintf("Done.\n"); + if (shmaddr[i] != (char)i) + ksft_exit_fail_msg("Data mismatch at index %lu\n", i); + ksft_print_msg("Done.\n"); if (shmdt((const void *)shmaddr) != 0) { - perror("Detach 
failure"); + ksft_perror("Detach failure"); shmctl(shmid, IPC_RMID, NULL); - exit(4); + ksft_exit_fail(); } shmctl(shmid, IPC_RMID, NULL); - return 0; + ksft_test_result_pass("hugepage using SysV shmget/shmat\n"); + ksft_finished(); } + +SHM_LIMITS_RESTORE() diff --git a/tools/testing/selftests/mm/hugetlb-soft-offline.c b/tools/testing/selftests/mm/hugetlb-soft-offline.c index a8bc02688085..bc202e4ed2bd 100644 --- a/tools/testing/selftests/mm/hugetlb-soft-offline.c +++ b/tools/testing/selftests/mm/hugetlb-soft-offline.c @@ -6,9 +6,7 @@ * - if enable_soft_offline = 1, a hugepage should be dissolved and * nr_hugepages/free_hugepages should be reduced by 1. * - * Before running, make sure more than 2 hugepages of default_hugepagesz - * are allocated. For example, if /proc/meminfo/Hugepagesize is 2048kB: - * echo 8 > /sys/kernel/mm/hugepages/hugepages-2048kB/nr_hugepages + * The test allocates 8 default hugepages */ #define _GNU_SOURCE @@ -25,6 +23,7 @@ #include <sys/types.h> #include "kselftest.h" +#include "hugepage_settings.h" #ifndef MADV_SOFT_OFFLINE #define MADV_SOFT_OFFLINE 101 @@ -100,32 +99,6 @@ static int set_enable_soft_offline(int value) return 0; } -static int read_nr_hugepages(unsigned long hugepage_size, - unsigned long *nr_hugepages) -{ - char buffer[256] = {0}; - char cmd[256] = {0}; - - sprintf(cmd, "cat /sys/kernel/mm/hugepages/hugepages-%ldkB/nr_hugepages", - hugepage_size); - FILE *cmdfile = popen(cmd, "r"); - - if (cmdfile == NULL) { - ksft_perror(EPREFIX "failed to popen nr_hugepages"); - return -1; - } - - if (!fgets(buffer, sizeof(buffer), cmdfile)) { - ksft_perror(EPREFIX "failed to read nr_hugepages"); - pclose(cmdfile); - return -1; - } - - *nr_hugepages = atoll(buffer); - pclose(cmdfile); - return 0; -} - static int create_hugetlbfs_file(struct statfs *file_stat) { int fd; @@ -177,20 +150,14 @@ static void test_soft_offline_common(int enable_soft_offline) ksft_exit_fail_msg("Failed to set enable_soft_offline\n"); } - if 
(read_nr_hugepages(hugepagesize_kb, &nr_hugepages_before) != 0) { - close(fd); - ksft_exit_fail_msg("Failed to read nr_hugepages\n"); - } + nr_hugepages_before = hugetlb_nr_default_pages(); ksft_print_msg("Before MADV_SOFT_OFFLINE nr_hugepages=%ld\n", nr_hugepages_before); ret = do_soft_offline(fd, 2 * file_stat.f_bsize, expect_errno); - if (read_nr_hugepages(hugepagesize_kb, &nr_hugepages_after) != 0) { - close(fd); - ksft_exit_fail_msg("Failed to read nr_hugepages\n"); - } + nr_hugepages_after = hugetlb_nr_default_pages(); ksft_print_msg("After MADV_SOFT_OFFLINE nr_hugepages=%ld\n", nr_hugepages_after); @@ -219,6 +186,10 @@ static void test_soft_offline_common(int enable_soft_offline) int main(int argc, char **argv) { ksft_print_header(); + + if (!hugetlb_setup_default(8)) + ksft_exit_skip("not enough hugetlb pages\n"); + ksft_set_plan(2); test_soft_offline_common(1); diff --git a/tools/testing/selftests/mm/hugepage-vmemmap.c b/tools/testing/selftests/mm/hugetlb-vmemmap.c index df366a4d1b92..507df78a158d 100644 --- a/tools/testing/selftests/mm/hugepage-vmemmap.c +++ b/tools/testing/selftests/mm/hugetlb-vmemmap.c @@ -11,6 +11,7 @@ #include <sys/mman.h> #include <fcntl.h> #include "vm_util.h" +#include "hugepage_settings.h" #define PAGE_COMPOUND_HEAD (1UL << 15) #define PAGE_COMPOUND_TAIL (1UL << 16) @@ -63,7 +64,7 @@ static int check_page_flags(unsigned long pfn) read(fd, &pageflags, sizeof(pageflags)); if ((pageflags & HEAD_PAGE_FLAGS) != HEAD_PAGE_FLAGS) { close(fd); - printf("Head page flags (%lx) is invalid\n", pageflags); + ksft_print_msg("Head page flags (%lx) is invalid\n", pageflags); return -1; } @@ -77,7 +78,7 @@ static int check_page_flags(unsigned long pfn) if ((pageflags & TAIL_PAGE_FLAGS) != TAIL_PAGE_FLAGS || (pageflags & HEAD_PAGE_FLAGS) == HEAD_PAGE_FLAGS) { close(fd); - printf("Tail page flags (%lx) is invalid\n", pageflags); + ksft_print_msg("Tail page flags (%lx) is invalid\n", pageflags); return -1; } } @@ -91,44 +92,41 @@ int main(int argc, 
char **argv) { void *addr; unsigned long pfn; + int ret; + + ksft_print_header(); + ksft_set_plan(1); + + if (!hugetlb_setup_default(1)) + ksft_exit_skip("Not enough free huge pages\n"); pagesize = psize(); maplength = default_huge_page_size(); - if (!maplength) { - printf("Unable to determine huge page size\n"); - exit(1); - } + if (!maplength) + ksft_exit_skip("Unable to determine huge page size\n"); addr = mmap(NULL, maplength, PROT_READ | PROT_WRITE, MAP_PRIVATE | MAP_ANONYMOUS | MAP_HUGETLB, -1, 0); - if (addr == MAP_FAILED) { - perror("mmap"); - exit(1); - } + if (addr == MAP_FAILED) + ksft_exit_fail_perror("mmap"); /* Trigger allocation of HugeTLB page. */ write_bytes(addr, maplength); pfn = virt_to_pfn(addr); if (pfn == -1UL) { + ksft_perror("virt_to_pfn"); munmap(addr, maplength); - perror("virt_to_pfn"); - exit(1); + ksft_exit_fail(); } - printf("Returned address is %p whose pfn is %lx\n", addr, pfn); + ksft_print_msg("Returned address is %p whose pfn is %lx\n", addr, pfn); - if (check_page_flags(pfn) < 0) { - munmap(addr, maplength); - perror("check_page_flags"); - exit(1); - } + ret = check_page_flags(pfn); - /* munmap() length of MAP_HUGETLB memory must be hugepage aligned */ - if (munmap(addr, maplength)) { - perror("munmap"); - exit(1); - } + if (munmap(addr, maplength)) + ksft_exit_fail_perror("munmap"); - return 0; + ksft_test_result(!ret, "HugeTLB vmemmap page flags\n"); + ksft_finished(); } diff --git a/tools/testing/selftests/mm/hugetlb_dio.c b/tools/testing/selftests/mm/hugetlb_dio.c index 31a054fa8134..fb4600570e13 100644 --- a/tools/testing/selftests/mm/hugetlb_dio.c +++ b/tools/testing/selftests/mm/hugetlb_dio.c @@ -20,6 +20,7 @@ #include <sys/syscall.h> #include "vm_util.h" #include "kselftest.h" +#include "hugepage_settings.h" #ifndef STATX_DIOALIGN #define STATX_DIOALIGN 0x00002000U @@ -84,19 +85,13 @@ static void run_dio_using_hugetlb(int fd, unsigned int start_off, /* Get the default huge page size */ h_pagesize = 
default_huge_page_size(); - if (!h_pagesize) - ksft_exit_fail_msg("Unable to determine huge page size\n"); /* Reset file position since fd is shared across tests */ if (lseek(fd, 0, SEEK_SET) < 0) ksft_exit_fail_perror("lseek failed\n"); /* Get the free huge pages before allocation */ - free_hpage_b = get_free_hugepages(); - if (free_hpage_b == 0) { - close(fd); - ksft_exit_skip("No free hugepage, exiting!\n"); - } + free_hpage_b = hugetlb_free_default_pages(); /* Allocate a hugetlb page */ orig_buffer = mmap(NULL, h_pagesize, mmap_prot, mmap_flags, -1, 0); @@ -120,7 +115,7 @@ static void run_dio_using_hugetlb(int fd, unsigned int start_off, munmap(orig_buffer, h_pagesize); /* Get the free huge pages after unmap*/ - free_hpage_a = get_free_hugepages(); + free_hpage_a = hugetlb_free_default_pages(); ksft_print_msg("No. Free pages before allocation : %d\n", free_hpage_b); ksft_print_msg("No. Free pages after munmap : %d\n", free_hpage_a); @@ -140,8 +135,8 @@ int main(void) ksft_print_header(); - /* Check if huge pages are free */ - if (!get_free_hugepages()) + /* request a huge page */ + if (!hugetlb_setup_default(1)) ksft_exit_skip("No free hugepage, exiting\n"); fd = open("/tmp", O_TMPFILE | O_RDWR | O_DIRECT, 0664); diff --git a/tools/testing/selftests/mm/hugetlb_fault_after_madv.c b/tools/testing/selftests/mm/hugetlb_fault_after_madv.c index b4b257775b74..2dc158054f66 100644 --- a/tools/testing/selftests/mm/hugetlb_fault_after_madv.c +++ b/tools/testing/selftests/mm/hugetlb_fault_after_madv.c @@ -10,6 +10,7 @@ #include "vm_util.h" #include "kselftest.h" +#include "hugepage_settings.h" #define INLOOP_ITER 100 @@ -53,7 +54,6 @@ void *madv(void *unused) int main(void) { - unsigned long free_hugepages; pthread_t thread1, thread2; /* * On kernel 6.4, we are able to reproduce the problem with ~1000 @@ -77,11 +77,8 @@ int main(void) ksft_print_msg("[INFO] detected default hugetlb page size: %zu KiB\n", huge_page_size / 1024); - free_hugepages = get_free_hugepages(); - 
if (free_hugepages != 1) { - ksft_exit_skip("This test needs one and only one page to execute. Got %lu\n", - free_hugepages); - } + if (!hugetlb_setup_default(1)) + ksft_exit_skip("Not enough HugeTLB pages\n"); while (max--) { huge_ptr = mmap(NULL, huge_page_size, PROT_READ | PROT_WRITE, diff --git a/tools/testing/selftests/mm/hugetlb_madv_vs_map.c b/tools/testing/selftests/mm/hugetlb_madv_vs_map.c index efd774b41389..f94549efcc6f 100644 --- a/tools/testing/selftests/mm/hugetlb_madv_vs_map.c +++ b/tools/testing/selftests/mm/hugetlb_madv_vs_map.c @@ -25,7 +25,7 @@ #include <unistd.h> #include "vm_util.h" -#include "kselftest.h" +#include "hugepage_settings.h" #define INLOOP_ITER 100 @@ -77,7 +77,6 @@ void *map_extra(void *unused) int main(void) { pthread_t thread1, thread2, thread3; - unsigned long free_hugepages; void *ret; /* @@ -86,12 +85,12 @@ int main(void) */ int max = 10; - free_hugepages = get_free_hugepages(); + ksft_print_header(); + ksft_set_plan(1); - if (free_hugepages != 1) { + if (!hugetlb_setup_default_exact(1)) ksft_exit_skip("This test needs one and only one page to execute. 
Got %lu\n", - free_hugepages); - } + hugetlb_free_default_pages()); mmap_size = default_huge_page_size(); @@ -100,10 +99,8 @@ int main(void) MAP_PRIVATE | MAP_ANONYMOUS | MAP_HUGETLB, -1, 0); - if ((unsigned long)huge_ptr == -1) { - ksft_test_result_fail("Failed to allocate huge page\n"); - return KSFT_FAIL; - } + if ((unsigned long)huge_ptr == -1) + ksft_exit_fail_msg("Failed to allocate huge page\n"); pthread_create(&thread1, NULL, madv, NULL); pthread_create(&thread2, NULL, touch, NULL); @@ -115,12 +112,13 @@ int main(void) if (ret) { ksft_test_result_fail("Unexpected huge page allocation\n"); - return KSFT_FAIL; + ksft_finished(); } /* Unmap and restart */ munmap(huge_ptr, mmap_size); } - return KSFT_PASS; + ksft_test_result_pass("No unexpected huge page allocations\n"); + ksft_finished(); } diff --git a/tools/testing/selftests/mm/hugetlb_reparenting_test.sh b/tools/testing/selftests/mm/hugetlb_reparenting_test.sh index 0dd31892ff67..95f517c3bd16 100755 --- a/tools/testing/selftests/mm/hugetlb_reparenting_test.sh +++ b/tools/testing/selftests/mm/hugetlb_reparenting_test.sh @@ -12,6 +12,8 @@ if [[ $(id -u) -ne 0 ]]; then fi nr_hugepgs=$(cat /proc/sys/vm/nr_hugepages) +trap 'echo "$nr_hugepgs" > /proc/sys/vm/nr_hugepages' EXIT INT TERM + usage_file=usage_in_bytes if [[ "$1" == "-cgroup-v2" ]]; then @@ -46,6 +48,13 @@ function get_machine_hugepage_size() { } MB=$(get_machine_hugepage_size) +if (( MB >= 1024 )); then + UNIT="GB" + MB_DISPLAY=$((MB / 1024)) +else + UNIT="MB" + MB_DISPLAY=$MB +fi function cleanup() { echo cleanup @@ -56,7 +65,6 @@ function cleanup() { rmdir "$CGROUP_ROOT"/a/b 2>/dev/null rmdir "$CGROUP_ROOT"/a 2>/dev/null rmdir "$CGROUP_ROOT"/test1 2>/dev/null - echo $nr_hugepgs >/proc/sys/vm/nr_hugepages set -e } @@ -87,6 +95,7 @@ function assert_with_retry() { if [[ $elapsed -ge $timeout ]]; then echo "actual = $((${actual%% *} / 1024 / 1024)) MB" echo "expected = $((${expected%% *} / 1024 / 1024)) MB" + echo FAIL cleanup exit 1 fi @@ -96,22 
+105,19 @@ function assert_with_retry() { } function assert_state() { - local expected_a="$1" - local expected_a_hugetlb="$2" - local expected_b="" + local expected_a_hugetlb="$1" local expected_b_hugetlb="" - if [ ! -z ${3:-} ] && [ ! -z ${4:-} ]; then - expected_b="$3" - expected_b_hugetlb="$4" + if [ ! -z ${2:-} ]; then + expected_b_hugetlb="$2" fi - assert_with_retry "$CGROUP_ROOT/a/memory.$usage_file" "$expected_a" - assert_with_retry "$CGROUP_ROOT/a/hugetlb.${MB}MB.$usage_file" "$expected_a_hugetlb" + assert_with_retry \ + "$CGROUP_ROOT/a/hugetlb.${MB_DISPLAY}${UNIT}.$usage_file" "$expected_a_hugetlb" - if [[ -n "$expected_b" && -n "$expected_b_hugetlb" ]]; then - assert_with_retry "$CGROUP_ROOT/a/b/memory.$usage_file" "$expected_b" - assert_with_retry "$CGROUP_ROOT/a/b/hugetlb.${MB}MB.$usage_file" "$expected_b_hugetlb" + if [[ -n "$expected_b_hugetlb" ]]; then + assert_with_retry \ + "$CGROUP_ROOT/a/b/hugetlb.${MB_DISPLAY}${UNIT}.$usage_file" "$expected_b_hugetlb" fi } @@ -143,18 +149,17 @@ write_hugetlbfs() { local size="$3" if [[ $cgroup2 ]]; then - echo $$ >$CGROUP_ROOT/$cgroup/cgroup.procs + cg_file="$CGROUP_ROOT/$cgroup/cgroup.procs" else echo 0 >$CGROUP_ROOT/$cgroup/cpuset.mems echo 0 >$CGROUP_ROOT/$cgroup/cpuset.cpus - echo $$ >"$CGROUP_ROOT/$cgroup/tasks" - fi - ./write_to_hugetlbfs -p "$path" -s "$size" -m 0 -o - if [[ $cgroup2 ]]; then - echo $$ >$CGROUP_ROOT/cgroup.procs - else - echo $$ >"$CGROUP_ROOT/tasks" + cg_file="$CGROUP_ROOT/$cgroup/tasks" fi + + # Spawn helper to join cgroup before exec to ensure correct cgroup accounting + bash -c 'echo $$ > "$1"; exec ./write_to_hugetlbfs -p "$2" -s "$3" -m 0 -o' _ \ + "$cg_file" "$path" "$size" & pid=$! + wait "$pid" echo } @@ -192,21 +197,21 @@ if [[ ! $cgroup2 ]]; then write_hugetlbfs a "$MNT"/test $size echo Assert memory charged correctly for parent use. - assert_state 0 $size 0 0 + assert_state $size 0 write_hugetlbfs a/b "$MNT"/test2 $size echo Assert memory charged correctly for child use. 
- assert_state 0 $(($size * 2)) 0 $size + assert_state $(($size * 2)) $size rmdir "$CGROUP_ROOT"/a/b echo Assert memory reparent correctly. - assert_state 0 $(($size * 2)) + assert_state $(($size * 2)) rm -rf "$MNT"/* umount "$MNT" echo Assert memory uncharged correctly. - assert_state 0 0 + assert_state 0 cleanup fi @@ -220,16 +225,16 @@ echo write write_hugetlbfs a/b "$MNT"/test2 $size echo Assert memory charged correctly for child only use. -assert_state 0 $(($size)) 0 $size +assert_state $(($size)) $size rmdir "$CGROUP_ROOT"/a/b echo Assert memory reparent correctly. -assert_state 0 $size +assert_state $size rm -rf "$MNT"/* umount "$MNT" echo Assert memory uncharged correctly. -assert_state 0 0 +assert_state 0 cleanup @@ -240,4 +245,3 @@ if [[ $do_umount ]]; then rm -rf $CGROUP_ROOT fi -echo "$nr_hugepgs" > /proc/sys/vm/nr_hugepages diff --git a/tools/testing/selftests/mm/khugepaged.c b/tools/testing/selftests/mm/khugepaged.c index 3fe7ef04ac62..10e8dedcb087 100644 --- a/tools/testing/selftests/mm/khugepaged.c +++ b/tools/testing/selftests/mm/khugepaged.c @@ -22,7 +22,7 @@ #include "linux/magic.h" #include "vm_util.h" -#include "thp_settings.h" +#include "hugepage_settings.h" #define BASE_ADDR ((void *)(1UL << 30)) static unsigned long hpage_pmd_size; @@ -41,6 +41,12 @@ enum vma_type { VMA_SHMEM, }; +enum file_setup_ops { + FILE_SETUP_READ_ONLY_FS, + FILE_SETUP_READ_WRITE_FS_READ_DATA, + FILE_SETUP_READ_WRITE_FS_WRITE_DATA, +}; + struct mem_ops { void *(*setup_area)(int nr_hpages); void (*cleanup_area)(void *p, unsigned long size); @@ -49,7 +55,9 @@ struct mem_ops { const char *name; }; -static struct mem_ops *file_ops; +static struct mem_ops *read_only_file_ops; +static struct mem_ops *read_write_file_read_ops; +static struct mem_ops *read_write_file_write_ops; static struct mem_ops *anon_ops; static struct mem_ops *shmem_ops; @@ -72,57 +80,36 @@ struct file_info { }; static struct file_info finfo; -static bool skip_settings_restore; static int exit_status; 
static void success(const char *msg) { printf(" \e[32m%s\e[0m\n", msg); + exit_status = KSFT_PASS; } static void fail(const char *msg) { printf(" \e[31m%s\e[0m\n", msg); - exit_status++; + exit_status = KSFT_FAIL; } static void skip(const char *msg) { printf(" \e[33m%s\e[0m\n", msg); -} - -static void restore_settings_atexit(void) -{ - if (skip_settings_restore) - return; - - printf("Restore THP and khugepaged settings..."); - thp_restore_settings(); - success("OK"); - - skip_settings_restore = true; -} - -static void restore_settings(int sig) -{ - /* exit() will invoke the restore_settings_atexit handler. */ - exit(sig ? EXIT_FAILURE : exit_status); + exit_status = KSFT_SKIP; } static void save_settings(void) { - printf("Save THP and khugepaged settings..."); - if (file_ops && finfo.type == VMA_FILE) + ksft_print_msg("Save THP and khugepaged settings..."); + if ((read_only_file_ops || read_write_file_read_ops || + read_write_file_write_ops) && + finfo.type == VMA_FILE) thp_set_read_ahead_path(finfo.dev_queue_read_ahead_path); thp_save_settings(); success("OK"); - - atexit(restore_settings_atexit); - signal(SIGTERM, restore_settings); - signal(SIGINT, restore_settings); - signal(SIGHUP, restore_settings); - signal(SIGQUIT, restore_settings); } static void get_finfo(const char *dir) @@ -135,19 +122,13 @@ static void get_finfo(const char *dir) finfo.dir = dir; stat(finfo.dir, &path_stat); - if (!S_ISDIR(path_stat.st_mode)) { - printf("%s: Not a directory (%s)\n", __func__, finfo.dir); - exit(EXIT_FAILURE); - } + if (!S_ISDIR(path_stat.st_mode)) + ksft_exit_fail_msg("%s: Not a directory (%s)\n", __func__, finfo.dir); if (snprintf(finfo.path, sizeof(finfo.path), "%s/" TEST_FILE, - finfo.dir) >= sizeof(finfo.path)) { - printf("%s: Pathname is too long\n", __func__); - exit(EXIT_FAILURE); - } - if (statfs(finfo.dir, &fs)) { - perror("statfs()"); - exit(EXIT_FAILURE); - } + finfo.dir) >= sizeof(finfo.path)) + ksft_exit_fail_msg("%s: Pathname is too long\n", __func__); + 
if (statfs(finfo.dir, &fs)) + ksft_exit_fail_perror("statfs()"); finfo.type = fs.f_type == TMPFS_MAGIC ? VMA_SHMEM : VMA_FILE; if (finfo.type == VMA_SHMEM) return; @@ -155,40 +136,30 @@ static void get_finfo(const char *dir) /* Find owning device's queue/read_ahead_kb control */ if (snprintf(path, sizeof(path), "/sys/dev/block/%d:%d/uevent", major(path_stat.st_dev), minor(path_stat.st_dev)) - >= sizeof(path)) { - printf("%s: Pathname is too long\n", __func__); - exit(EXIT_FAILURE); - } - if (read_file(path, buf, sizeof(buf)) < 0) { - perror("read_file(read_num)"); - exit(EXIT_FAILURE); - } + >= sizeof(path)) + ksft_exit_fail_msg("%s: Pathname is too long\n", __func__); + if (read_file(path, buf, sizeof(buf)) < 0) + ksft_exit_fail_perror("read_file(read_num)"); if (strstr(buf, "DEVTYPE=disk")) { /* Found it */ if (snprintf(finfo.dev_queue_read_ahead_path, sizeof(finfo.dev_queue_read_ahead_path), "/sys/dev/block/%d:%d/queue/read_ahead_kb", major(path_stat.st_dev), minor(path_stat.st_dev)) - >= sizeof(finfo.dev_queue_read_ahead_path)) { - printf("%s: Pathname is too long\n", __func__); - exit(EXIT_FAILURE); - } + >= sizeof(finfo.dev_queue_read_ahead_path)) + ksft_exit_fail_msg("%s: Pathname is too long\n", __func__); return; } - if (!strstr(buf, "DEVTYPE=partition")) { - printf("%s: Unknown device type: %s\n", __func__, path); - exit(EXIT_FAILURE); - } + if (!strstr(buf, "DEVTYPE=partition")) + ksft_exit_fail_msg("%s: Unknown device type: %s\n", __func__, path); /* * Partition of block device - need to find actual device. * Using naming convention that devnameN is partition of * device devname. 
*/ str = strstr(buf, "DEVNAME="); - if (!str) { - printf("%s: Could not read: %s", __func__, path); - exit(EXIT_FAILURE); - } + if (!str) + ksft_exit_fail_msg("%s: Could not read: %s", __func__, path); str += 8; end = str; while (*end) { @@ -197,16 +168,13 @@ static void get_finfo(const char *dir) if (snprintf(finfo.dev_queue_read_ahead_path, sizeof(finfo.dev_queue_read_ahead_path), "/sys/block/%s/queue/read_ahead_kb", - str) >= sizeof(finfo.dev_queue_read_ahead_path)) { - printf("%s: Pathname is too long\n", __func__); - exit(EXIT_FAILURE); - } + str) >= sizeof(finfo.dev_queue_read_ahead_path)) + ksft_exit_fail_msg("%s: Pathname is too long\n", __func__); return; } ++end; } - printf("%s: Could not read: %s\n", __func__, path); - exit(EXIT_FAILURE); + ksft_exit_fail_msg("%s: Could not read: %s\n", __func__, path); } static bool check_swap(void *addr, unsigned long size) @@ -219,26 +187,19 @@ static bool check_swap(void *addr, unsigned long size) ret = snprintf(addr_pattern, MAX_LINE_LENGTH, "%08lx-", (unsigned long) addr); - if (ret >= MAX_LINE_LENGTH) { - printf("%s: Pattern is too long\n", __func__); - exit(EXIT_FAILURE); - } - + if (ret >= MAX_LINE_LENGTH) + ksft_exit_fail_msg("%s: Pattern is too long\n", __func__); fp = fopen(PID_SMAPS, "r"); - if (!fp) { - printf("%s: Failed to open file %s\n", __func__, PID_SMAPS); - exit(EXIT_FAILURE); - } + if (!fp) + ksft_exit_fail_msg("%s: Failed to open file %s\n", __func__, PID_SMAPS); if (!check_for_pattern(fp, addr_pattern, buffer, sizeof(buffer))) goto err_out; ret = snprintf(addr_pattern, MAX_LINE_LENGTH, "Swap:%19ld kB", size >> 10); - if (ret >= MAX_LINE_LENGTH) { - printf("%s: Pattern is too long\n", __func__); - exit(EXIT_FAILURE); - } + if (ret >= MAX_LINE_LENGTH) + ksft_exit_fail_msg("%s: Pattern is too long\n", __func__); /* * Fetch the Swap: in the same block and check whether it got * the expected number of hugeepages next. 
@@ -261,10 +222,8 @@ static void *alloc_mapping(int nr) p = mmap(BASE_ADDR, nr * hpage_pmd_size, PROT_READ | PROT_WRITE, MAP_ANONYMOUS | MAP_PRIVATE, -1, 0); - if (p != BASE_ADDR) { - printf("Failed to allocate VMA at %p\n", BASE_ADDR); - exit(EXIT_FAILURE); - } + if (p != BASE_ADDR) + ksft_exit_fail_msg("Failed to allocate VMA at %p\n", BASE_ADDR); return p; } @@ -314,19 +273,13 @@ static void *alloc_hpage(struct mem_ops *ops) * khugepaged on low-load system (like a test machine), which * would cause MADV_COLLAPSE to fail with EAGAIN. */ - printf("Allocate huge page..."); - if (madvise_collapse_retry(p, hpage_pmd_size)) { - perror("madvise(MADV_COLLAPSE)"); - exit(EXIT_FAILURE); - } - if (!ops->check_huge(p, 1)) { - perror("madvise(MADV_COLLAPSE)"); - exit(EXIT_FAILURE); - } - if (madvise(p, hpage_pmd_size, MADV_HUGEPAGE)) { - perror("madvise(MADV_HUGEPAGE)"); - exit(EXIT_FAILURE); - } + ksft_print_msg("Allocate huge page..."); + if (madvise_collapse_retry(p, hpage_pmd_size)) + ksft_exit_fail_perror("madvise(MADV_COLLAPSE)"); + if (!ops->check_huge(p, 1)) + ksft_exit_fail_perror("madvise(MADV_COLLAPSE)"); + if (madvise(p, hpage_pmd_size, MADV_HUGEPAGE)) + ksft_exit_fail_perror("madvise(MADV_HUGEPAGE)"); success("OK"); return p; } @@ -336,11 +289,9 @@ static void validate_memory(int *p, unsigned long start, unsigned long end) int i; for (i = start / page_size; i < end / page_size; i++) { - if (p[i * page_size / sizeof(*p)] != i + 0xdead0000) { - printf("Page %d is corrupted: %#x\n", - i, p[i * page_size / sizeof(*p)]); - exit(EXIT_FAILURE); - } + if (p[i * page_size / sizeof(*p)] != i + 0xdead0000) + ksft_exit_fail_msg("Page %d is corrupted: %#x\n", + i, p[i * page_size / sizeof(*p)]); } } @@ -364,42 +315,52 @@ static bool anon_check_huge(void *addr, int nr_hpages) return check_huge_anon(addr, nr_hpages, hpage_pmd_size); } -static void *file_setup_area(int nr_hpages) +static void *file_setup_area_common(int nr_hpages, enum file_setup_ops setup) { + const int 
open_opt = setup == FILE_SETUP_READ_ONLY_FS ? O_RDONLY : O_RDWR; + const int mmap_prot = setup == FILE_SETUP_READ_ONLY_FS ? PROT_READ : (PROT_READ | PROT_WRITE); int fd; void *p; unsigned long size; unlink(finfo.path); /* Cleanup from previous failed tests */ - printf("Creating %s for collapse%s...", finfo.path, - finfo.type == VMA_SHMEM ? " (tmpfs)" : ""); - fd = open(finfo.path, O_DSYNC | O_CREAT | O_RDWR | O_TRUNC | O_EXCL, + ksft_print_msg("Creating %s for collapse%s...", finfo.path, + finfo.type == VMA_SHMEM ? " (tmpfs)" : ""); + fd = open(finfo.path, O_CREAT | O_RDWR | O_TRUNC | O_EXCL, 777); - if (fd < 0) { - perror("open()"); - exit(EXIT_FAILURE); - } + if (fd < 0) + ksft_exit_fail_perror("open()"); size = nr_hpages * hpage_pmd_size; - p = alloc_mapping(nr_hpages); - fill_memory(p, 0, size); - write(fd, p, size); - close(fd); - munmap(p, size); - success("OK"); - - printf("Opening %s read only for collapse...", finfo.path); - finfo.fd = open(finfo.path, O_RDONLY, 777); - if (finfo.fd < 0) { - perror("open()"); + if (ftruncate(fd, size)) { + perror("ftruncate()"); exit(EXIT_FAILURE); } - p = mmap(BASE_ADDR, size, PROT_READ, - MAP_PRIVATE, finfo.fd, 0); - if (p == MAP_FAILED || p != BASE_ADDR) { + p = mmap(BASE_ADDR, size, PROT_READ | PROT_WRITE, + MAP_SHARED, fd, 0); + if (p != BASE_ADDR) { perror("mmap()"); exit(EXIT_FAILURE); } + fill_memory(p, 0, size); + if (msync(p, size, MS_SYNC)) { + perror("msync()"); + exit(EXIT_FAILURE); + } + close(fd); + munmap(p, size); + success("OK"); + ksft_print_msg("Opening %s %s for collapse...", finfo.path, + setup == FILE_SETUP_READ_ONLY_FS ? "read-only" : + setup == FILE_SETUP_READ_WRITE_FS_READ_DATA ? 
+ "read-write (read)" : + "read-write (write)"); + finfo.fd = open(finfo.path, open_opt, 777); + if (finfo.fd < 0) + ksft_exit_fail_perror("open()"); + p = mmap(BASE_ADDR, size, mmap_prot, MAP_SHARED, finfo.fd, 0); + if (p == MAP_FAILED || p != BASE_ADDR) + ksft_exit_fail_perror("mmap()"); /* Drop page cache */ write_file("/proc/sys/vm/drop_caches", "3", 2); @@ -407,6 +368,21 @@ static void *file_setup_area(int nr_hpages) return p; } +static void *file_setup_read_only_area(int nr_hpages) +{ + return file_setup_area_common(nr_hpages, FILE_SETUP_READ_ONLY_FS); +} + +static void *file_setup_read_write_fs_read_area(int nr_hpages) +{ + return file_setup_area_common(nr_hpages, FILE_SETUP_READ_WRITE_FS_READ_DATA); +} + +static void *file_setup_read_write_fs_write_area(int nr_hpages) +{ + return file_setup_area_common(nr_hpages, FILE_SETUP_READ_WRITE_FS_WRITE_DATA); +} + static void file_cleanup_area(void *p, unsigned long size) { munmap(p, size); @@ -414,12 +390,26 @@ static void file_cleanup_area(void *p, unsigned long size) unlink(finfo.path); } -static void file_fault(void *p, unsigned long start, unsigned long end) +static void file_fault_read(void *p, unsigned long start, unsigned long end) { - if (madvise(((char *)p) + start, end - start, MADV_POPULATE_READ)) { - perror("madvise(MADV_POPULATE_READ"); - exit(EXIT_FAILURE); - } + if (madvise(((char *)p) + start, end - start, MADV_POPULATE_READ)) + ksft_exit_fail_perror("madvise(MADV_POPULATE_READ)"); +} + +static void file_fault_read_and_flush(void *p, unsigned long start, unsigned long end) +{ + file_fault_read(p, start, end); + /* + * make folio clean, since dirty folios from read&write file are + * rejected and not flushed + */ + msync((char *)p + start, end - start, MS_SYNC); +} + +static void file_fault_write(void *p, unsigned long start, unsigned long end) +{ + if (madvise(((char *)p) + start, end - start, MADV_POPULATE_WRITE)) + ksft_exit_fail_perror("madvise(MADV_POPULATE_WRITE)"); } static bool 
file_check_huge(void *addr, int nr_hpages) @@ -441,20 +431,14 @@ static void *shmem_setup_area(int nr_hpages) unsigned long size = nr_hpages * hpage_pmd_size; finfo.fd = memfd_create("khugepaged-selftest-collapse-shmem", 0); - if (finfo.fd < 0) { - perror("memfd_create()"); - exit(EXIT_FAILURE); - } - if (ftruncate(finfo.fd, size)) { - perror("ftruncate()"); - exit(EXIT_FAILURE); - } + if (finfo.fd < 0) + ksft_exit_fail_perror("memfd_create()"); + if (ftruncate(finfo.fd, size)) + ksft_exit_fail_perror("ftruncate()"); p = mmap(BASE_ADDR, size, PROT_READ | PROT_WRITE, MAP_SHARED, finfo.fd, 0); - if (p != BASE_ADDR) { - perror("mmap()"); - exit(EXIT_FAILURE); - } + if (p != BASE_ADDR) + ksft_exit_fail_perror("mmap()"); return p; } @@ -477,10 +461,26 @@ static struct mem_ops __anon_ops = { .name = "anon", }; -static struct mem_ops __file_ops = { - .setup_area = &file_setup_area, +static struct mem_ops __read_only_file_ops = { + .setup_area = &file_setup_read_only_area, + .cleanup_area = &file_cleanup_area, + .fault = &file_fault_read, + .check_huge = &file_check_huge, + .name = "file", +}; + +static struct mem_ops __read_write_file_read_ops = { + .setup_area = &file_setup_read_write_fs_read_area, + .cleanup_area = &file_cleanup_area, + .fault = &file_fault_read_and_flush, + .check_huge = &file_check_huge, + .name = "file", +}; + +static struct mem_ops __read_write_file_write_ops = { + .setup_area = &file_setup_read_write_fs_write_area, .cleanup_area = &file_cleanup_area, - .fault = &file_fault, + .fault = &file_fault_write, .check_huge = &file_check_huge, .name = "file", }; @@ -493,13 +493,32 @@ static struct mem_ops __shmem_ops = { .name = "shmem", }; +static bool is_tmpfs(struct mem_ops *ops) +{ + return (ops == &__read_only_file_ops || + ops == &__read_write_file_read_ops || + ops == &__read_write_file_write_ops) && + finfo.type == VMA_SHMEM; +} + +static bool is_anon(struct mem_ops *ops) +{ + return ops == &__anon_ops; +} + static void __madvise_collapse(const char 
*msg, char *p, int nr_hpages, struct mem_ops *ops, bool expect) { int ret; struct thp_settings settings = *thp_current_settings(); - printf("%s...", msg); + ksft_print_msg("%s...", msg); + + /* + * read&write file collapse succeeds for MADV_COLLAPSE because dirty + * folios are written back after collapse fails for dirty folios and + * another collapse is attempted. + */ /* * Prevent khugepaged interference and tests that MADV_COLLAPSE @@ -526,10 +545,8 @@ static void madvise_collapse(const char *msg, char *p, int nr_hpages, struct mem_ops *ops, bool expect) { /* Sanity check */ - if (!ops->check_huge(p, 0)) { - printf("Unexpected huge page\n"); - exit(EXIT_FAILURE); - } + if (!ops->check_huge(p, 0)) + ksft_exit_fail_msg("Unexpected huge page\n"); __madvise_collapse(msg, p, nr_hpages, ops, expect); } @@ -541,17 +558,15 @@ static bool wait_for_scan(const char *msg, char *p, int nr_hpages, int timeout = 6; /* 3 seconds */ /* Sanity check */ - if (!ops->check_huge(p, 0)) { - printf("Unexpected huge page\n"); - exit(EXIT_FAILURE); - } + if (!ops->check_huge(p, 0)) + ksft_exit_fail_msg("Unexpected huge page\n"); madvise(p, nr_hpages * hpage_pmd_size, MADV_HUGEPAGE); /* Wait until the second full_scan completed */ full_scans = thp_read_num("khugepaged/full_scans") + 2; - printf("%s...", msg); + ksft_print_msg("%s...", msg); while (timeout--) { if (ops->check_huge(p, nr_hpages)) break; @@ -567,6 +582,13 @@ static bool wait_for_scan(const char *msg, char *p, int nr_hpages, static void khugepaged_collapse(const char *msg, char *p, int nr_hpages, struct mem_ops *ops, bool expect) { + /* + * read&write file collapse fails since khugepaged does not flush + * the target dirty folios + */ + if (!is_tmpfs(ops) && ops == &__read_write_file_write_ops) + expect = false; + if (wait_for_scan(msg, p, nr_hpages, ops)) { if (expect) fail("Timeout"); @@ -601,16 +623,6 @@ static struct collapse_context __madvise_context = { .name = "madvise", }; -static bool is_tmpfs(struct mem_ops *ops) 
-{ - return ops == &__file_ops && finfo.type == VMA_SHMEM; -} - -static bool is_anon(struct mem_ops *ops) -{ - return ops == &__anon_ops; -} - static void alloc_at_fault(void) { struct thp_settings settings = *thp_current_settings(); @@ -621,7 +633,7 @@ static void alloc_at_fault(void) p = alloc_mapping(1); *p = 1; - printf("Allocate huge page on fault..."); + ksft_print_msg("Allocate huge page on fault..."); if (check_huge_anon(p, 1, hpage_pmd_size)) success("OK"); else @@ -630,12 +642,14 @@ static void alloc_at_fault(void) thp_pop_settings(); madvise(p, page_size, MADV_DONTNEED); - printf("Split huge PMD on MADV_DONTNEED..."); + ksft_print_msg("Split huge PMD on MADV_DONTNEED..."); if (check_huge_anon(p, 0, hpage_pmd_size)) success("OK"); else fail("Fail"); munmap(p, hpage_pmd_size); + + ksft_test_result_report(exit_status, "allocate on fault and split\n"); } static void collapse_full(struct collapse_context *c, struct mem_ops *ops) @@ -650,6 +664,8 @@ static void collapse_full(struct collapse_context *c, struct mem_ops *ops) ops, true); validate_memory(p, 0, size); ops->cleanup_area(p, size); + + ksft_test_result_report(exit_status, "%s\n", __func__); } static void collapse_empty(struct collapse_context *c, struct mem_ops *ops) @@ -659,6 +675,7 @@ static void collapse_empty(struct collapse_context *c, struct mem_ops *ops) p = ops->setup_area(1); c->collapse("Do not collapse empty PTE table", p, 1, ops, false); ops->cleanup_area(p, hpage_pmd_size); + ksft_test_result_report(exit_status, "%s\n", __func__); } static void collapse_single_pte_entry(struct collapse_context *c, struct mem_ops *ops) @@ -670,6 +687,7 @@ static void collapse_single_pte_entry(struct collapse_context *c, struct mem_ops c->collapse("Collapse PTE table with single PTE entry present", p, 1, ops, true); ops->cleanup_area(p, hpage_pmd_size); + ksft_test_result_report(exit_status, "%s\n", __func__); } static void collapse_max_ptes_none(struct collapse_context *c, struct mem_ops *ops) @@ -697,6 
+715,9 @@ static void collapse_max_ptes_none(struct collapse_context *c, struct mem_ops *o validate_memory(p, 0, (hpage_pmd_nr - max_ptes_none - fault_nr_pages) * page_size); if (c->enforce_pte_scan_limits) { + ops->cleanup_area(p, hpage_pmd_size); + p = ops->setup_area(1); + ops->fault(p, 0, (hpage_pmd_nr - max_ptes_none) * page_size); c->collapse("Collapse with max_ptes_none PTEs empty", p, 1, ops, true); @@ -706,6 +727,7 @@ static void collapse_max_ptes_none(struct collapse_context *c, struct mem_ops *o skip: ops->cleanup_area(p, hpage_pmd_size); thp_pop_settings(); + ksft_test_result_report(exit_status, "%s\n", __func__); } static void collapse_swapin_single_pte(struct collapse_context *c, struct mem_ops *ops) @@ -715,11 +737,9 @@ static void collapse_swapin_single_pte(struct collapse_context *c, struct mem_op p = ops->setup_area(1); ops->fault(p, 0, hpage_pmd_size); - printf("Swapout one page..."); - if (madvise(p, page_size, MADV_PAGEOUT)) { - perror("madvise(MADV_PAGEOUT)"); - exit(EXIT_FAILURE); - } + ksft_print_msg("Swapout one page..."); + if (madvise(p, page_size, MADV_PAGEOUT)) + ksft_exit_fail_perror("madvise(MADV_PAGEOUT)"); if (check_swap(p, page_size)) { success("OK"); } else { @@ -732,6 +752,7 @@ static void collapse_swapin_single_pte(struct collapse_context *c, struct mem_op validate_memory(p, 0, hpage_pmd_size); out: ops->cleanup_area(p, hpage_pmd_size); + ksft_test_result_report(exit_status, "%s\n", __func__); } static void collapse_max_ptes_swap(struct collapse_context *c, struct mem_ops *ops) @@ -742,11 +763,9 @@ static void collapse_max_ptes_swap(struct collapse_context *c, struct mem_ops *o p = ops->setup_area(1); ops->fault(p, 0, hpage_pmd_size); - printf("Swapout %d of %d pages...", max_ptes_swap + 1, hpage_pmd_nr); - if (madvise(p, (max_ptes_swap + 1) * page_size, MADV_PAGEOUT)) { - perror("madvise(MADV_PAGEOUT)"); - exit(EXIT_FAILURE); - } + ksft_print_msg("Swapout %d of %d pages...", max_ptes_swap + 1, hpage_pmd_nr); + if (madvise(p, 
(max_ptes_swap + 1) * page_size, MADV_PAGEOUT)) + ksft_exit_fail_perror("madvise(MADV_PAGEOUT)"); if (check_swap(p, (max_ptes_swap + 1) * page_size)) { success("OK"); } else { @@ -760,12 +779,10 @@ static void collapse_max_ptes_swap(struct collapse_context *c, struct mem_ops *o if (c->enforce_pte_scan_limits) { ops->fault(p, 0, hpage_pmd_size); - printf("Swapout %d of %d pages...", max_ptes_swap, + ksft_print_msg("Swapout %d of %d pages...", max_ptes_swap, hpage_pmd_nr); - if (madvise(p, max_ptes_swap * page_size, MADV_PAGEOUT)) { - perror("madvise(MADV_PAGEOUT)"); - exit(EXIT_FAILURE); - } + if (madvise(p, max_ptes_swap * page_size, MADV_PAGEOUT)) + ksft_exit_fail_perror("madvise(MADV_PAGEOUT)"); if (check_swap(p, max_ptes_swap * page_size)) { success("OK"); } else { @@ -779,6 +796,7 @@ static void collapse_max_ptes_swap(struct collapse_context *c, struct mem_ops *o } out: ops->cleanup_area(p, hpage_pmd_size); + ksft_test_result_report(exit_status, "%s\n", __func__); } static void collapse_single_pte_entry_compound(struct collapse_context *c, struct mem_ops *ops) @@ -795,7 +813,7 @@ static void collapse_single_pte_entry_compound(struct collapse_context *c, struc } madvise(p, hpage_pmd_size, MADV_NOHUGEPAGE); - printf("Split huge page leaving single PTE mapping compound page..."); + ksft_print_msg("Split huge page leaving single PTE mapping compound page..."); madvise(p + page_size, hpage_pmd_size - page_size, MADV_DONTNEED); if (ops->check_huge(p, 0)) success("OK"); @@ -807,6 +825,7 @@ static void collapse_single_pte_entry_compound(struct collapse_context *c, struc validate_memory(p, 0, page_size); skip: ops->cleanup_area(p, hpage_pmd_size); + ksft_test_result_report(exit_status, "%s\n", __func__); } static void collapse_full_of_compound(struct collapse_context *c, struct mem_ops *ops) @@ -814,7 +833,7 @@ static void collapse_full_of_compound(struct collapse_context *c, struct mem_ops void *p; p = alloc_hpage(ops); - printf("Split huge page leaving single PTE page 
table full of compound pages..."); + ksft_print_msg("Split huge page leaving single PTE page table full of compound pages..."); madvise(p, page_size, MADV_NOHUGEPAGE); madvise(p, hpage_pmd_size, MADV_NOHUGEPAGE); if (ops->check_huge(p, 0)) @@ -826,6 +845,7 @@ static void collapse_full_of_compound(struct collapse_context *c, struct mem_ops true); validate_memory(p, 0, hpage_pmd_size); ops->cleanup_area(p, hpage_pmd_size); + ksft_test_result_report(exit_status, "%s\n", __func__); } static void collapse_compound_extreme(struct collapse_context *c, struct mem_ops *ops) @@ -834,16 +854,12 @@ static void collapse_compound_extreme(struct collapse_context *c, struct mem_ops int i; p = ops->setup_area(1); + ksft_print_msg("Construct PTE page table full of different PTE-mapped compound pages\n"); for (i = 0; i < hpage_pmd_nr; i++) { - printf("\rConstruct PTE page table full of different PTE-mapped compound pages %3d/%d...", - i + 1, hpage_pmd_nr); - madvise(BASE_ADDR, hpage_pmd_size, MADV_HUGEPAGE); ops->fault(BASE_ADDR, 0, hpage_pmd_size); - if (!ops->check_huge(BASE_ADDR, 1)) { - printf("Failed to allocate huge page\n"); - exit(EXIT_FAILURE); - } + if (!ops->check_huge(BASE_ADDR, 1)) + ksft_exit_fail_msg("Failed to allocate huge page\n"); madvise(BASE_ADDR, hpage_pmd_size, MADV_NOHUGEPAGE); p = mremap(BASE_ADDR - i * page_size, @@ -851,20 +867,16 @@ static void collapse_compound_extreme(struct collapse_context *c, struct mem_ops (i + 1) * page_size, MREMAP_MAYMOVE | MREMAP_FIXED, BASE_ADDR + 2 * hpage_pmd_size); - if (p == MAP_FAILED) { - perror("mremap+unmap"); - exit(EXIT_FAILURE); - } + if (p == MAP_FAILED) + ksft_exit_fail_perror("mremap+unmap"); p = mremap(BASE_ADDR + 2 * hpage_pmd_size, (i + 1) * page_size, (i + 1) * page_size + hpage_pmd_size, MREMAP_MAYMOVE | MREMAP_FIXED, BASE_ADDR - (i + 1) * page_size); - if (p == MAP_FAILED) { - perror("mremap+alloc"); - exit(EXIT_FAILURE); - } + if (p == MAP_FAILED) + ksft_exit_fail_perror("mremap+alloc"); } 
ops->cleanup_area(BASE_ADDR, hpage_pmd_size); @@ -879,6 +891,7 @@ static void collapse_compound_extreme(struct collapse_context *c, struct mem_ops validate_memory(p, 0, hpage_pmd_size); ops->cleanup_area(p, hpage_pmd_size); + ksft_test_result_report(exit_status, "%s\n", __func__); } static void collapse_fork(struct collapse_context *c, struct mem_ops *ops) @@ -888,19 +901,16 @@ static void collapse_fork(struct collapse_context *c, struct mem_ops *ops) p = ops->setup_area(1); - printf("Allocate small page..."); + ksft_print_msg("Allocate small page..."); ops->fault(p, 0, page_size); if (ops->check_huge(p, 0)) success("OK"); else fail("Fail"); - printf("Share small page over fork()..."); + ksft_print_msg("Share small page over fork()..."); if (!fork()) { /* Do not touch settings on child exit */ - skip_settings_restore = true; - exit_status = 0; - if (ops->check_huge(p, 0)) success("OK"); else @@ -912,19 +922,20 @@ static void collapse_fork(struct collapse_context *c, struct mem_ops *ops) validate_memory(p, 0, page_size); ops->cleanup_area(p, hpage_pmd_size); - exit(exit_status); + _exit(exit_status); } wait(&wstatus); - exit_status += WEXITSTATUS(wstatus); + exit_status = WEXITSTATUS(wstatus); - printf("Check if parent still has small page..."); + ksft_print_msg("Check if parent still has small page..."); if (ops->check_huge(p, 0)) success("OK"); else fail("Fail"); validate_memory(p, 0, page_size); ops->cleanup_area(p, hpage_pmd_size); + ksft_test_result_report(exit_status, "%s\n", __func__); } static void collapse_fork_compound(struct collapse_context *c, struct mem_ops *ops) @@ -933,18 +944,15 @@ static void collapse_fork_compound(struct collapse_context *c, struct mem_ops *o void *p; p = alloc_hpage(ops); - printf("Share huge page over fork()..."); + ksft_print_msg("Share huge page over fork()..."); if (!fork()) { /* Do not touch settings on child exit */ - skip_settings_restore = true; - exit_status = 0; - if (ops->check_huge(p, 1)) success("OK"); else 
fail("Fail"); - printf("Split huge page PMD in child process..."); + ksft_print_msg("Split huge page PMD in child process..."); madvise(p, page_size, MADV_NOHUGEPAGE); madvise(p, hpage_pmd_size, MADV_NOHUGEPAGE); if (ops->check_huge(p, 0)) @@ -961,19 +969,20 @@ static void collapse_fork_compound(struct collapse_context *c, struct mem_ops *o validate_memory(p, 0, hpage_pmd_size); ops->cleanup_area(p, hpage_pmd_size); - exit(exit_status); + _exit(exit_status); } wait(&wstatus); - exit_status += WEXITSTATUS(wstatus); + exit_status = WEXITSTATUS(wstatus); - printf("Check if parent still has huge page..."); + ksft_print_msg("Check if parent still has huge page..."); if (ops->check_huge(p, 1)) success("OK"); else fail("Fail"); validate_memory(p, 0, hpage_pmd_size); ops->cleanup_area(p, hpage_pmd_size); + ksft_test_result_report(exit_status, "%s\n", __func__); } static void collapse_max_ptes_shared(struct collapse_context *c, struct mem_ops *ops) @@ -983,18 +992,15 @@ static void collapse_max_ptes_shared(struct collapse_context *c, struct mem_ops void *p; p = alloc_hpage(ops); - printf("Share huge page over fork()..."); + ksft_print_msg("Share huge page over fork()..."); if (!fork()) { /* Do not touch settings on child exit */ - skip_settings_restore = true; - exit_status = 0; - if (ops->check_huge(p, 1)) success("OK"); else fail("Fail"); - printf("Trigger CoW on page %d of %d...", + ksft_print_msg("Trigger CoW on page %d of %d...", hpage_pmd_nr - max_ptes_shared - 1, hpage_pmd_nr); ops->fault(p, 0, (hpage_pmd_nr - max_ptes_shared - 1) * page_size); if (ops->check_huge(p, 0)) @@ -1006,7 +1012,7 @@ static void collapse_max_ptes_shared(struct collapse_context *c, struct mem_ops 1, ops, !c->enforce_pte_scan_limits); if (c->enforce_pte_scan_limits) { - printf("Trigger CoW on page %d of %d...", + ksft_print_msg("Trigger CoW on page %d of %d...", hpage_pmd_nr - max_ptes_shared, hpage_pmd_nr); ops->fault(p, 0, (hpage_pmd_nr - max_ptes_shared) * page_size); @@ -1021,19 +1027,20 
@@ static void collapse_max_ptes_shared(struct collapse_context *c, struct mem_ops validate_memory(p, 0, hpage_pmd_size); ops->cleanup_area(p, hpage_pmd_size); - exit(exit_status); + _exit(exit_status); } wait(&wstatus); - exit_status += WEXITSTATUS(wstatus); + exit_status = WEXITSTATUS(wstatus); - printf("Check if parent still has huge page..."); + ksft_print_msg("Check if parent still has huge page..."); if (ops->check_huge(p, 1)) success("OK"); else fail("Fail"); validate_memory(p, 0, hpage_pmd_size); ops->cleanup_area(p, hpage_pmd_size); + ksft_test_result_report(exit_status, "%s\n", __func__); } static void madvise_collapse_existing_thps(struct collapse_context *c, @@ -1050,6 +1057,7 @@ static void madvise_collapse_existing_thps(struct collapse_context *c, __madvise_collapse("Re-collapse PMD-mapped hugepage", p, 1, ops, true); validate_memory(p, 0, hpage_pmd_size); ops->cleanup_area(p, hpage_pmd_size); + ksft_test_result_report(exit_status, "%s\n", __func__); } /* @@ -1077,6 +1085,7 @@ static void madvise_retracted_page_tables(struct collapse_context *c, true); validate_memory(p, 0, size); ops->cleanup_area(p, size); + ksft_test_result_report(exit_status, "%s\n", __func__); } static void usage(void) @@ -1086,8 +1095,8 @@ static void usage(void) fprintf(stderr, "\t<context>\t: [all|khugepaged|madvise]\n"); fprintf(stderr, "\t<mem_type>\t: [all|anon|file|shmem]\n"); fprintf(stderr, "\n\t\"file,all\" mem_type requires [dir] argument\n"); - fprintf(stderr, "\n\t\"file,all\" mem_type requires kernel built with\n"); - fprintf(stderr, "\tCONFIG_READ_ONLY_THP_FOR_FS=y\n"); + fprintf(stderr, "\n\t\"file,all\" mem_type requires a file system\n"); + fprintf(stderr, "\twith PMD-sized large folio support\n"); fprintf(stderr, "\n\tif [dir] is a (sub)directory of a tmpfs mount, tmpfs must be\n"); fprintf(stderr, "\tmounted with huge=advise option for khugepaged tests to work\n"); fprintf(stderr, "\n\tSupported Options:\n"); @@ -1143,20 +1152,25 @@ static void 
parse_test_type(int argc, char **argv) usage(); if (!strcmp(buf, "all")) { - file_ops = &__file_ops; + read_only_file_ops = &__read_only_file_ops; + read_write_file_read_ops = &__read_write_file_read_ops; + read_write_file_write_ops = &__read_write_file_write_ops; anon_ops = &__anon_ops; shmem_ops = &__shmem_ops; } else if (!strcmp(buf, "anon")) { anon_ops = &__anon_ops; } else if (!strcmp(buf, "file")) { - file_ops = &__file_ops; + read_only_file_ops = &__read_only_file_ops; + read_write_file_read_ops = &__read_write_file_read_ops; + read_write_file_write_ops = &__read_write_file_write_ops; } else if (!strcmp(buf, "shmem")) { shmem_ops = &__shmem_ops; } else { usage(); } - if (!file_ops) + if (!read_only_file_ops && !read_write_file_read_ops && + !read_write_file_write_ops) return; if (argc != 2) @@ -1165,6 +1179,32 @@ static void parse_test_type(int argc, char **argv) get_finfo(argv[1]); } +typedef void (*test_fn)(struct collapse_context *c, struct mem_ops *ops); + +struct test_case { + struct collapse_context *ctx; + struct mem_ops *ops; + const char *desc; + test_fn fn; +}; + +#define MAX_TEST_CASES 64 +static struct test_case test_cases[MAX_TEST_CASES]; +static int nr_test_cases; + +#define TEST(t, c, o) do { \ + if (c && o) { \ + if (nr_test_cases >= MAX_TEST_CASES) \ + ksft_exit_fail_msg("MAX_TEST_CASES is too small\n"); \ + test_cases[nr_test_cases++] = (struct test_case){ \ + .ctx = c, \ + .ops = o, \ + .desc = #t, \ + .fn = t, \ + }; \ + } \ + } while (0) + int main(int argc, char **argv) { int hpage_pmd_order; @@ -1188,10 +1228,10 @@ int main(int argc, char **argv) .read_ahead_kb = 0, }; - if (!thp_is_enabled()) { - printf("Transparent Hugepages not available\n"); - return KSFT_SKIP; - } + ksft_print_header(); + + if (!thp_is_enabled()) + ksft_exit_skip("Transparent Hugepages not available\n"); parse_test_type(argc, argv); @@ -1199,10 +1239,8 @@ int main(int argc, char **argv) page_size = getpagesize(); hpage_pmd_size = read_pmd_pagesize(); - if 
(!hpage_pmd_size) { - printf("Reading PMD pagesize failed"); - exit(EXIT_FAILURE); - } + if (!hpage_pmd_size) + ksft_exit_fail_msg("Reading PMD pagesize failed\n"); hpage_pmd_nr = hpage_pmd_size / page_size; hpage_pmd_order = __builtin_ctz(hpage_pmd_nr); @@ -1218,47 +1256,54 @@ int main(int argc, char **argv) save_settings(); thp_push_settings(&default_settings); - alloc_at_fault(); - -#define TEST(t, c, o) do { \ - if (c && o) { \ - printf("\nRun test: " #t " (%s:%s)\n", c->name, o->name); \ - t(c, o); \ - } \ - } while (0) - TEST(collapse_full, khugepaged_context, anon_ops); - TEST(collapse_full, khugepaged_context, file_ops); + TEST(collapse_full, khugepaged_context, read_only_file_ops); + TEST(collapse_full, khugepaged_context, read_write_file_read_ops); + TEST(collapse_full, khugepaged_context, read_write_file_write_ops); TEST(collapse_full, khugepaged_context, shmem_ops); TEST(collapse_full, madvise_context, anon_ops); - TEST(collapse_full, madvise_context, file_ops); + TEST(collapse_full, madvise_context, read_only_file_ops); + TEST(collapse_full, madvise_context, read_write_file_read_ops); + TEST(collapse_full, madvise_context, read_write_file_write_ops); TEST(collapse_full, madvise_context, shmem_ops); TEST(collapse_empty, khugepaged_context, anon_ops); TEST(collapse_empty, madvise_context, anon_ops); TEST(collapse_single_pte_entry, khugepaged_context, anon_ops); - TEST(collapse_single_pte_entry, khugepaged_context, file_ops); + TEST(collapse_single_pte_entry, khugepaged_context, read_only_file_ops); + TEST(collapse_single_pte_entry, khugepaged_context, read_write_file_read_ops); + TEST(collapse_single_pte_entry, khugepaged_context, read_write_file_write_ops); TEST(collapse_single_pte_entry, khugepaged_context, shmem_ops); TEST(collapse_single_pte_entry, madvise_context, anon_ops); - TEST(collapse_single_pte_entry, madvise_context, file_ops); + TEST(collapse_single_pte_entry, madvise_context, read_only_file_ops); + TEST(collapse_single_pte_entry, 
madvise_context, read_write_file_read_ops); + TEST(collapse_single_pte_entry, madvise_context, read_write_file_write_ops); TEST(collapse_single_pte_entry, madvise_context, shmem_ops); TEST(collapse_max_ptes_none, khugepaged_context, anon_ops); - TEST(collapse_max_ptes_none, khugepaged_context, file_ops); + TEST(collapse_max_ptes_none, khugepaged_context, read_only_file_ops); + TEST(collapse_max_ptes_none, khugepaged_context, read_write_file_read_ops); + TEST(collapse_max_ptes_none, khugepaged_context, read_write_file_write_ops); TEST(collapse_max_ptes_none, madvise_context, anon_ops); - TEST(collapse_max_ptes_none, madvise_context, file_ops); + TEST(collapse_max_ptes_none, madvise_context, read_only_file_ops); + TEST(collapse_max_ptes_none, madvise_context, read_write_file_read_ops); + TEST(collapse_max_ptes_none, madvise_context, read_write_file_write_ops); TEST(collapse_single_pte_entry_compound, khugepaged_context, anon_ops); - TEST(collapse_single_pte_entry_compound, khugepaged_context, file_ops); + TEST(collapse_single_pte_entry_compound, khugepaged_context, read_only_file_ops); + TEST(collapse_single_pte_entry_compound, khugepaged_context, read_write_file_read_ops); TEST(collapse_single_pte_entry_compound, madvise_context, anon_ops); - TEST(collapse_single_pte_entry_compound, madvise_context, file_ops); + TEST(collapse_single_pte_entry_compound, madvise_context, read_only_file_ops); + TEST(collapse_single_pte_entry_compound, madvise_context, read_write_file_read_ops); TEST(collapse_full_of_compound, khugepaged_context, anon_ops); - TEST(collapse_full_of_compound, khugepaged_context, file_ops); + TEST(collapse_full_of_compound, khugepaged_context, read_only_file_ops); + TEST(collapse_full_of_compound, khugepaged_context, read_write_file_read_ops); TEST(collapse_full_of_compound, khugepaged_context, shmem_ops); TEST(collapse_full_of_compound, madvise_context, anon_ops); - TEST(collapse_full_of_compound, madvise_context, file_ops); + 
TEST(collapse_full_of_compound, madvise_context, read_only_file_ops); + TEST(collapse_full_of_compound, madvise_context, read_write_file_read_ops); TEST(collapse_full_of_compound, madvise_context, shmem_ops); TEST(collapse_compound_extreme, khugepaged_context, anon_ops); @@ -1280,11 +1325,23 @@ int main(int argc, char **argv) TEST(collapse_max_ptes_shared, madvise_context, anon_ops); TEST(madvise_collapse_existing_thps, madvise_context, anon_ops); - TEST(madvise_collapse_existing_thps, madvise_context, file_ops); + TEST(madvise_collapse_existing_thps, madvise_context, read_only_file_ops); + TEST(madvise_collapse_existing_thps, madvise_context, read_write_file_read_ops); TEST(madvise_collapse_existing_thps, madvise_context, shmem_ops); - TEST(madvise_retracted_page_tables, madvise_context, file_ops); + TEST(madvise_retracted_page_tables, madvise_context, read_only_file_ops); + TEST(madvise_retracted_page_tables, madvise_context, read_write_file_read_ops); TEST(madvise_retracted_page_tables, madvise_context, shmem_ops); - restore_settings(0); + ksft_set_plan(nr_test_cases + 1); + + alloc_at_fault(); + for (int i = 0; i < nr_test_cases; i++) { + struct test_case *t = &test_cases[i]; + + ksft_print_msg("\n# Run test: %s (%s:%s)\n", t->desc, t->ctx->name, t->ops->name); + t->fn(t->ctx, t->ops); + } + + ksft_finished(); } diff --git a/tools/testing/selftests/mm/ksft_kmemleak_dedup.sh b/tools/testing/selftests/mm/ksft_kmemleak_dedup.sh new file mode 100755 index 000000000000..d01950244490 --- /dev/null +++ b/tools/testing/selftests/mm/ksft_kmemleak_dedup.sh @@ -0,0 +1,222 @@ +#!/bin/bash +# SPDX-License-Identifier: GPL-2.0 +# +# Regression test for kmemleak's per-scan verbose dedup. 
+# +# Loads samples/kmemleak's helper module to generate orphan allocations +# (some of which share an allocation backtrace), runs a few kmemleak +# scans with verbose printing enabled, and verifies that no two +# "unreferenced object" reports within a single scan share the same +# backtrace - which would mean dedup failed to collapse them. +# +# This test is intentionally permissive: the kmemleak-test module's +# leaks frequently get reported across many separate scans (per-CPU +# chunk reuse, slab freelist pointers, kernel stack residue), so dedup +# may never have anything to fold within one scan. That is not a +# regression. The test only fails when it actually catches dedup not +# happening on input that should have triggered it - i.e. two reports +# with identical backtraces in the same scan. +# +# Author: Breno Leitao <leitao@debian.org> + +ksft_skip=4 +KMEMLEAK=/sys/kernel/debug/kmemleak +VERBOSE_PARAM=/sys/module/kmemleak/parameters/verbose +MODULE=kmemleak-test + +skip() { + echo "SKIP: $*" + exit $ksft_skip +} + +fail() { + echo "FAIL: $*" + exit 1 +} + +pass() { + echo "PASS: $*" + exit 0 +} + +[ "$(id -u)" -eq 0 ] || skip "must run as root" +[ -r "$KMEMLEAK" ] || skip "no kmemleak debugfs (CONFIG_DEBUG_KMEMLEAK)" +[ -w "$VERBOSE_PARAM" ] || skip "kmemleak verbose param missing" +modinfo "$MODULE" >/dev/null 2>&1 || + skip "$MODULE not built (CONFIG_SAMPLE_KMEMLEAK)" + +# The verdict depends entirely on dmesg contents, so a silently-empty +# dmesg (dmesg_restrict=1 with CAP_SYSLOG dropped, restricted container, +# etc.) would let the script report PASS without parsing anything. Probe +# both read and clear up front and skip cleanly if either is denied. 
+dmesg >/dev/null 2>&1 || + skip "cannot read dmesg (need CAP_SYSLOG or dmesg_restrict=0)" +dmesg -C >/dev/null 2>&1 || + skip "cannot clear dmesg (need CAP_SYSLOG or dmesg_restrict=0)" + +# kmemleak can be present but disabled at runtime (boot arg kmemleak=off, +# or it self-disabled after an internal error). In that state writes other +# than "clear" return EPERM, so probe once and skip if so. +if ! echo scan > "$KMEMLEAK" 2>/dev/null; then + skip "kmemleak is disabled (check dmesg or kmemleak= boot arg)" +fi + +prev_verbose=$(cat "$VERBOSE_PARAM") +# shellcheck disable=SC2317 # invoked indirectly via trap +cleanup() { + echo "$prev_verbose" > "$VERBOSE_PARAM" 2>/dev/null + rmmod "$MODULE" 2>/dev/null + # Drain the leak set we generated. Subsequent selftests (e.g. + # tools/testing/selftests/net/netfilter/nft_interface_stress.sh) + # fail on any non-empty kmemleak report, so leaving the helper + # module's intentional leaks behind would poison the rest of a + # kselftest run. + # + # Caveat: kmemleak_clear() only greys objects that have already + # been reported (OBJECT_REPORTED && unreferenced_object()). Helper + # allocations that stayed "still referenced" throughout the test + # (stale pointers in per-CPU chunks, slab freelists, kernel stacks) + # were never reported and are therefore not greyed by this clear - + # they remain tracked and a later scan can still surface them. Such + # leftovers are inherent to the kmemleak-test sample module and are + # not specific to this test; consumers that fail on any kmemleak + # output (rather than on the test-specific backtraces) need to be + # robust to that, or this test should be excluded from the run. + echo clear > "$KMEMLEAK" 2>/dev/null +} +trap cleanup EXIT + +echo 1 > "$VERBOSE_PARAM" + +# Drain the existing leak set so the next scan only reports our objects. 
+echo clear > "$KMEMLEAK" + +# Re-clear dmesg now (the up-front probe also cleared it, but anything +# logged between then and here - module unload chatter, the probe scan, +# the verbose-param write - would otherwise pollute the parse window). +dmesg -C >/dev/null + +# If the module was left loaded by a previous aborted run, modprobe would +# be a no-op and the init function would not run, so no new leaks would be +# generated. Force a clean state first. +rmmod "$MODULE" 2>/dev/null +modprobe "$MODULE" || skip "failed to load $MODULE" +# Removing the module orphans the list elements without freeing them. +rmmod "$MODULE" || skip "failed to unload $MODULE" + +# Run a handful of scans so kmemleak has the chance to age and report +# the orphans. We do not require any particular number to be reported: +# the regression check below operates on whatever lands in dmesg. +# +# Note: with CONFIG_DEBUG_KMEMLEAK_AUTO_SCAN=y the kernel's own scan +# thread can report and mark these orphans (OBJECT_REPORTED) before our +# manual scans run, after which our scans will see nothing. The +# lower-bound check below catches the case where that happens and the +# manual scans also produce nothing. +SCAN_COUNT=4 +SCAN_SLEEP=6 +for _ in $(seq 1 "$SCAN_COUNT"); do + echo scan > "$KMEMLEAK" + sleep "$SCAN_SLEEP" +done + +# Strip the leading "[ nnn.nnnnnn] " dmesg timestamp prefix. Without +# this, two identical stack frames printed from two reports in the same +# scan would produce different per-frame strings (different timestamps) +# and the duplicate-backtrace check below would not match them, silently +# passing a real dedup regression. Doing the strip here makes the rest +# of the parser timestamp-agnostic regardless of what dmesg defaults to. +log=$(dmesg | sed 's/^\[[^]]*\] //') + +# After running the workload (modprobe + scans), dmesg should contain at +# least the helper module's pr_info lines and our manual-scan output. 
An +# empty capture here means dmesg succeeded earlier but is now denying us +# the buffer (race with dmesg_restrict toggling, etc.); refuse to give a +# verdict on no evidence. +[ -n "$log" ] || skip "dmesg returned empty after running workload" + +# Lower bound: if kmemleak's own per-scan tally counted leaks but the +# verbose path emitted no "unreferenced object" line, the verbose printer +# itself is regressed - fail rather than silently passing on no input. +new_leaks=$(echo "$log" | + sed -n 's/.*kmemleak: \([0-9]\+\) new suspected.*/\1/p' | + awk '{s+=$1} END{print s+0}') +printed=$(echo "$log" | grep -c 'kmemleak: unreferenced object') +if [ "$new_leaks" -gt 0 ] && [ "$printed" -eq 0 ]; then + fail "verbose path broken: $new_leaks leaks counted, 0 printed in $SCAN_COUNT scans" +fi + +# Walk the log: split into per-scan chunks at "N new suspected memory +# leaks" boundaries; within each chunk, capture each "unreferenced +# object" report's backtrace and check that no backtrace is reported +# more than once. A duplicate within a single scan means dedup failed +# to collapse two leaks that share an allocation site. +violations=$(echo "$log" | awk ' + function flush_block() { + if (in_block) { + # Skip empty backtraces: leaks with trace_handle == 0 + # (early-boot allocations or stack_depot_save() failures + # under memory pressure) are intentionally not deduped, + # so multiple such reports in one scan are expected and + # must not be flagged as a regression. + if (bt != "") + seen[bt]++ + in_block = 0 + collecting = 0 + bt = "" + } + } + function check_and_reset( b) { + for (b in seen) + if (seen[b] > 1) + printf("backtrace seen %d times in one scan:\n%s\n", + seen[b], b) + delete seen + } + # Scan boundary: the per-scan summary line. + /kmemleak: [0-9]+ new suspected memory leaks/ { + flush_block() + check_and_reset() + next + } + # Start of a new "unreferenced object" report. 
+ /kmemleak: unreferenced object/ { + flush_block() + in_block = 1 + next + } + # Inside a report, the "backtrace (crc ...):" line switches us to + # backtrace-collecting mode. + in_block && /kmemleak:[[:space:]]+backtrace \(crc/ { + collecting = 1 + next + } + # Once collecting, capture only deeply-indented "kmemleak: " lines + # (stack frames have 4+ spaces of indentation under "kmemleak: "; + # headers and the "... and N more" tail line have less). This stops + # unrelated kmemleak warns landing between reports from being lumped + # into the backtrace key, which would mask a genuine duplicate. + in_block && collecting && /kmemleak:[[:space:]]{4,}/ { + bt = bt $0 "\n" + next + } + END { + flush_block() + check_and_reset() + } +') + +if [ -n "$violations" ]; then + echo "$violations" + fail "kmemleak dedup regression: same backtrace reported more than once in a single scan" +fi + +# Count the dedup summary lines so the report distinguishes "dedup +# actually fired" from "no same-backtrace leaks turned up to dedup". 
+dedup_lines=$(echo "$log" | grep -c 'more object(s) with the same backtrace') + +if [ "$dedup_lines" -gt 0 ]; then + pass "no dedup violations across $SCAN_COUNT scans; dedup fired ($dedup_lines summary line(s) observed)" +else + pass "no dedup violations across $SCAN_COUNT scans; dedup had nothing to collapse" +fi diff --git a/tools/testing/selftests/mm/ksm_functional_tests.c b/tools/testing/selftests/mm/ksm_functional_tests.c index 8d874c4754f3..31c06c72203f 100644 --- a/tools/testing/selftests/mm/ksm_functional_tests.c +++ b/tools/testing/selftests/mm/ksm_functional_tests.c @@ -498,6 +498,7 @@ static void test_prctl_fork(void) static int start_ksmd_and_set_frequency(char *pages_to_scan, char *sleep_ms) { int ksm_fd; + size_t len; ksm_fd = open("/sys/kernel/mm/ksm/run", O_RDWR); if (ksm_fd < 0) @@ -506,11 +507,13 @@ static int start_ksmd_and_set_frequency(char *pages_to_scan, char *sleep_ms) if (write(ksm_fd, "1", 1) != 1) return -errno; - if (write(pages_to_scan_fd, pages_to_scan, strlen(pages_to_scan)) <= 0) - return -errno; + len = strlen(pages_to_scan); + if (write(pages_to_scan_fd, pages_to_scan, len) != len) + return -1; - if (write(sleep_millisecs_fd, sleep_ms, strlen(sleep_ms)) <= 0) - return -errno; + len = strlen(sleep_ms); + if (write(sleep_millisecs_fd, sleep_ms, len) != len) + return -1; return 0; } @@ -526,11 +529,11 @@ static int stop_ksmd_and_restore_frequency(void) if (write(ksm_fd, "2", 1) != 1) return -errno; - if (write(pages_to_scan_fd, "100", 3) <= 0) - return -errno; + if (write(pages_to_scan_fd, "100", 3) != 3) + return -1; - if (write(sleep_millisecs_fd, "20", 2) <= 0) - return -errno; + if (write(sleep_millisecs_fd, "20", 2) != 2) + return -1; return 0; } diff --git a/tools/testing/selftests/mm/ksm_tests.c b/tools/testing/selftests/mm/ksm_tests.c index a0b48b839d54..a050f4840cfa 100644 --- a/tools/testing/selftests/mm/ksm_tests.c +++ b/tools/testing/selftests/mm/ksm_tests.c @@ -15,7 +15,7 @@ #include "kselftest.h" #include 
<include/vdso/time64.h> #include "vm_util.h" -#include "thp_settings.h" +#include "hugepage_settings.h" #define KSM_SYSFS_PATH "/sys/kernel/mm/ksm/" #define KSM_FP(s) (KSM_SYSFS_PATH s) @@ -174,13 +174,13 @@ static void *allocate_memory(void *ptr, int prot, int mapping, char data, size_ { void *map_ptr = mmap(ptr, map_size, PROT_WRITE, mapping, -1, 0); - if (!map_ptr) { - perror("mmap"); + if (map_ptr == MAP_FAILED) { + ksft_perror("mmap"); return NULL; } memset(map_ptr, data, map_size); if (mprotect(map_ptr, map_size, prot)) { - perror("mprotect"); + ksft_perror("mprotect"); munmap(map_ptr, map_size); return NULL; } @@ -201,11 +201,11 @@ static int ksm_do_scan(int scan_count, struct timespec start_time, int timeout) if (ksm_read_sysfs(KSM_FP("full_scans"), &cur_scan)) return 1; if (clock_gettime(CLOCK_MONOTONIC_RAW, &cur_time)) { - perror("clock_gettime"); + ksft_perror("clock_gettime"); return 1; } if ((cur_time.tv_sec - start_time.tv_sec) > timeout) { - printf("Scan time limit exceeded\n"); + ksft_print_msg("Scan time limit exceeded\n"); return 1; } } @@ -218,12 +218,12 @@ static int ksm_merge_pages(int merge_type, void *addr, size_t size, { if (merge_type == KSM_MERGE_MADVISE) { if (madvise(addr, size, MADV_MERGEABLE)) { - perror("madvise"); + ksft_perror("madvise"); return 1; } } else if (merge_type == KSM_MERGE_PRCTL) { if (prctl(PR_SET_MEMORY_MERGE, 1, 0, 0, 0)) { - perror("prctl"); + ksft_perror("prctl"); return 1; } } @@ -242,7 +242,7 @@ static int ksm_unmerge_pages(void *addr, size_t size, struct timespec start_time, int timeout) { if (madvise(addr, size, MADV_UNMERGEABLE)) { - perror("madvise"); + ksft_perror("madvise"); return 1; } return 0; @@ -324,7 +324,7 @@ static int check_ksm_merge(int merge_type, int mapping, int prot, struct timespec start_time; if (clock_gettime(CLOCK_MONOTONIC_RAW, &start_time)) { - perror("clock_gettime"); + ksft_perror("clock_gettime"); return KSFT_FAIL; } @@ -338,7 +338,6 @@ static int check_ksm_merge(int merge_type, int 
mapping, int prot, /* verify that the right number of pages are merged */ if (assert_ksm_pages_count(page_count)) { - printf("OK\n"); munmap(map_ptr, page_size * page_count); if (merge_type == KSM_MERGE_PRCTL) prctl(PR_SET_MEMORY_MERGE, 0, 0, 0, 0); @@ -346,7 +345,6 @@ static int check_ksm_merge(int merge_type, int mapping, int prot, } err_out: - printf("Not OK\n"); munmap(map_ptr, page_size * page_count); return KSFT_FAIL; } @@ -358,7 +356,7 @@ static int check_ksm_unmerge(int merge_type, int mapping, int prot, int timeout, int page_count = 2; if (clock_gettime(CLOCK_MONOTONIC_RAW, &start_time)) { - perror("clock_gettime"); + ksft_perror("clock_gettime"); return KSFT_FAIL; } @@ -380,13 +378,11 @@ static int check_ksm_unmerge(int merge_type, int mapping, int prot, int timeout, /* check that unmerging was successful and 0 pages are currently merged */ if (assert_ksm_pages_count(0)) { - printf("OK\n"); munmap(map_ptr, page_size * page_count); return KSFT_PASS; } err_out: - printf("Not OK\n"); munmap(map_ptr, page_size * page_count); return KSFT_FAIL; } @@ -398,7 +394,7 @@ static int check_ksm_zero_page_merge(int merge_type, int mapping, int prot, long struct timespec start_time; if (clock_gettime(CLOCK_MONOTONIC_RAW, &start_time)) { - perror("clock_gettime"); + ksft_perror("clock_gettime"); return KSFT_FAIL; } @@ -425,12 +421,10 @@ static int check_ksm_zero_page_merge(int merge_type, int mapping, int prot, long else if (!use_zero_pages && !assert_ksm_pages_count(page_count)) goto err_out; - printf("OK\n"); munmap(map_ptr, page_size * page_count); return KSFT_PASS; err_out: - printf("Not OK\n"); munmap(map_ptr, page_size * page_count); return KSFT_FAIL; } @@ -465,16 +459,16 @@ static int check_ksm_numa_merge(int merge_type, int mapping, int prot, int timeo int first_node; if (clock_gettime(CLOCK_MONOTONIC_RAW, &start_time)) { - perror("clock_gettime"); + ksft_perror("clock_gettime"); return KSFT_FAIL; } if (numa_available() < 0) { - perror("NUMA support not enabled"); 
+ ksft_print_msg("NUMA support not enabled\n"); return KSFT_SKIP; } if (numa_num_configured_nodes() <= 1) { - printf("At least 2 NUMA nodes must be available\n"); + ksft_print_msg("At least 2 NUMA nodes must be available\n"); return KSFT_SKIP; } if (ksm_write_sysfs(KSM_FP("merge_across_nodes"), merge_across_nodes)) @@ -485,7 +479,7 @@ static int check_ksm_numa_merge(int merge_type, int mapping, int prot, int timeo numa1_map_ptr = numa_alloc_onnode(page_size, first_node); numa2_map_ptr = numa_alloc_onnode(page_size, get_next_mem_node(first_node)); if (!numa1_map_ptr || !numa2_map_ptr) { - perror("numa_alloc_onnode"); + ksft_perror("numa_alloc_onnode"); return KSFT_FAIL; } @@ -510,13 +504,11 @@ static int check_ksm_numa_merge(int merge_type, int mapping, int prot, int timeo numa_free(numa1_map_ptr, page_size); numa_free(numa2_map_ptr, page_size); - printf("OK\n"); return KSFT_PASS; err_out: numa_free(numa1_map_ptr, page_size); numa_free(numa2_map_ptr, page_size); - printf("Not OK\n"); return KSFT_FAIL; } @@ -529,7 +521,7 @@ static int ksm_merge_hugepages_time(int merge_type, int mapping, int prot, int pagemap_fd, n_normal_pages, n_huge_pages; if (!thp_is_enabled()) { - printf("Transparent Hugepages not available\n"); + ksft_print_msg("Transparent Hugepages not available\n"); return KSFT_SKIP; } @@ -559,36 +551,35 @@ static int ksm_merge_hugepages_time(int merge_type, int mapping, int prot, else n_huge_pages++; } - printf("Number of normal pages: %d\n", n_normal_pages); - printf("Number of huge pages: %d\n", n_huge_pages); + ksft_print_msg("Number of normal pages: %d\n", n_normal_pages); + ksft_print_msg("Number of huge pages: %d\n", n_huge_pages); memset(map_ptr, '*', len); if (clock_gettime(CLOCK_MONOTONIC_RAW, &start_time)) { - perror("clock_gettime"); + ksft_perror("clock_gettime"); goto err_out; } if (ksm_merge_pages(merge_type, map_ptr, map_size, start_time, timeout)) goto err_out; if (clock_gettime(CLOCK_MONOTONIC_RAW, &end_time)) { - perror("clock_gettime"); + 
ksft_perror("clock_gettime"); goto err_out; } scan_time_ns = (end_time.tv_sec - start_time.tv_sec) * NSEC_PER_SEC + (end_time.tv_nsec - start_time.tv_nsec); - printf("Total size: %lu MiB\n", map_size / MB); - printf("Total time: %ld.%09ld s\n", scan_time_ns / NSEC_PER_SEC, + ksft_print_msg("Total size: %lu MiB\n", map_size / MB); + ksft_print_msg("Total time: %ld.%09ld s\n", scan_time_ns / NSEC_PER_SEC, scan_time_ns % NSEC_PER_SEC); - printf("Average speed: %.3f MiB/s\n", (map_size / MB) / + ksft_print_msg("Average speed: %.3f MiB/s\n", (map_size / MB) / ((double)scan_time_ns / NSEC_PER_SEC)); munmap(map_ptr_orig, len + HPAGE_SIZE); return KSFT_PASS; err_out: - printf("Not OK\n"); munmap(map_ptr_orig, len + HPAGE_SIZE); return KSFT_FAIL; } @@ -606,30 +597,29 @@ static int ksm_merge_time(int merge_type, int mapping, int prot, int timeout, si return KSFT_FAIL; if (clock_gettime(CLOCK_MONOTONIC_RAW, &start_time)) { - perror("clock_gettime"); + ksft_perror("clock_gettime"); goto err_out; } if (ksm_merge_pages(merge_type, map_ptr, map_size, start_time, timeout)) goto err_out; if (clock_gettime(CLOCK_MONOTONIC_RAW, &end_time)) { - perror("clock_gettime"); + ksft_perror("clock_gettime"); goto err_out; } scan_time_ns = (end_time.tv_sec - start_time.tv_sec) * NSEC_PER_SEC + (end_time.tv_nsec - start_time.tv_nsec); - printf("Total size: %lu MiB\n", map_size / MB); - printf("Total time: %ld.%09ld s\n", scan_time_ns / NSEC_PER_SEC, + ksft_print_msg("Total size: %lu MiB\n", map_size / MB); + ksft_print_msg("Total time: %ld.%09ld s\n", scan_time_ns / NSEC_PER_SEC, scan_time_ns % NSEC_PER_SEC); - printf("Average speed: %.3f MiB/s\n", (map_size / MB) / + ksft_print_msg("Average speed: %.3f MiB/s\n", (map_size / MB) / ((double)scan_time_ns / NSEC_PER_SEC)); munmap(map_ptr, map_size); return KSFT_PASS; err_out: - printf("Not OK\n"); munmap(map_ptr, map_size); return KSFT_FAIL; } @@ -646,37 +636,36 @@ static int ksm_unmerge_time(int merge_type, int mapping, int prot, int timeout, if 
(!map_ptr) return KSFT_FAIL; if (clock_gettime(CLOCK_MONOTONIC_RAW, &start_time)) { - perror("clock_gettime"); + ksft_perror("clock_gettime"); goto err_out; } if (ksm_merge_pages(merge_type, map_ptr, map_size, start_time, timeout)) goto err_out; if (clock_gettime(CLOCK_MONOTONIC_RAW, &start_time)) { - perror("clock_gettime"); + ksft_perror("clock_gettime"); goto err_out; } if (ksm_unmerge_pages(map_ptr, map_size, start_time, timeout)) goto err_out; if (clock_gettime(CLOCK_MONOTONIC_RAW, &end_time)) { - perror("clock_gettime"); + ksft_perror("clock_gettime"); goto err_out; } scan_time_ns = (end_time.tv_sec - start_time.tv_sec) * NSEC_PER_SEC + (end_time.tv_nsec - start_time.tv_nsec); - printf("Total size: %lu MiB\n", map_size / MB); - printf("Total time: %ld.%09ld s\n", scan_time_ns / NSEC_PER_SEC, + ksft_print_msg("Total size: %lu MiB\n", map_size / MB); + ksft_print_msg("Total time: %ld.%09ld s\n", scan_time_ns / NSEC_PER_SEC, scan_time_ns % NSEC_PER_SEC); - printf("Average speed: %.3f MiB/s\n", (map_size / MB) / + ksft_print_msg("Average speed: %.3f MiB/s\n", (map_size / MB) / ((double)scan_time_ns / NSEC_PER_SEC)); munmap(map_ptr, map_size); return KSFT_PASS; err_out: - printf("Not OK\n"); munmap(map_ptr, map_size); return KSFT_FAIL; } @@ -695,24 +684,24 @@ static int ksm_cow_time(int merge_type, int mapping, int prot, int timeout, size return KSFT_FAIL; if (clock_gettime(CLOCK_MONOTONIC_RAW, &start_time)) { - perror("clock_gettime"); + ksft_perror("clock_gettime"); return KSFT_FAIL; } for (size_t i = 0; i < page_count - 1; i = i + 2) memset(map_ptr + page_size * i, '-', 1); if (clock_gettime(CLOCK_MONOTONIC_RAW, &end_time)) { - perror("clock_gettime"); + ksft_perror("clock_gettime"); return KSFT_FAIL; } cow_time_ns = (end_time.tv_sec - start_time.tv_sec) * NSEC_PER_SEC + (end_time.tv_nsec - start_time.tv_nsec); - printf("Total size: %lu MiB\n\n", (page_size * page_count) / MB); - printf("Not merged pages:\n"); - printf("Total time: %ld.%09ld s\n", cow_time_ns / 
NSEC_PER_SEC, + ksft_print_msg("Total size: %lu MiB\n\n", (page_size * page_count) / MB); + ksft_print_msg("Not merged pages:\n"); + ksft_print_msg("Total time: %ld.%09ld s\n", cow_time_ns / NSEC_PER_SEC, cow_time_ns % NSEC_PER_SEC); - printf("Average speed: %.3f MiB/s\n\n", ((page_size * (page_count / 2)) / MB) / + ksft_print_msg("Average speed: %.3f MiB/s\n\n", ((page_size * (page_count / 2)) / MB) / ((double)cow_time_ns / NSEC_PER_SEC)); /* Create 2000 pairs of duplicate pages */ @@ -724,30 +713,29 @@ static int ksm_cow_time(int merge_type, int mapping, int prot, int timeout, size goto err_out; if (clock_gettime(CLOCK_MONOTONIC_RAW, &start_time)) { - perror("clock_gettime"); + ksft_perror("clock_gettime"); goto err_out; } for (size_t i = 0; i < page_count - 1; i = i + 2) memset(map_ptr + page_size * i, '-', 1); if (clock_gettime(CLOCK_MONOTONIC_RAW, &end_time)) { - perror("clock_gettime"); + ksft_perror("clock_gettime"); goto err_out; } cow_time_ns = (end_time.tv_sec - start_time.tv_sec) * NSEC_PER_SEC + (end_time.tv_nsec - start_time.tv_nsec); - printf("Merged pages:\n"); - printf("Total time: %ld.%09ld s\n", cow_time_ns / NSEC_PER_SEC, + ksft_print_msg("Merged pages:\n"); + ksft_print_msg("Total time: %ld.%09ld s\n", cow_time_ns / NSEC_PER_SEC, cow_time_ns % NSEC_PER_SEC); - printf("Average speed: %.3f MiB/s\n", ((page_size * (page_count / 2)) / MB) / + ksft_print_msg("Average speed: %.3f MiB/s\n", ((page_size * (page_count / 2)) / MB) / ((double)cow_time_ns / NSEC_PER_SEC)); munmap(map_ptr, page_size * page_count); return KSFT_PASS; err_out: - printf("Not OK\n"); munmap(map_ptr, page_size * page_count); return KSFT_FAIL; } @@ -765,6 +753,10 @@ int main(int argc, char *argv[]) bool use_zero_pages = KSM_USE_ZERO_PAGES_DEFAULT; bool merge_across_nodes = KSM_MERGE_ACROSS_NODES_DEFAULT; long size_MB = 0; + const char *test_descr = "KSM merging"; + + ksft_print_header(); + ksft_set_plan(1); while ((opt = getopt(argc, argv, "dha:p:l:z:m:s:t:MUZNPCHD")) != -1) { 
switch (opt) { @@ -773,17 +765,13 @@ int main(int argc, char *argv[]) break; case 'p': page_count = atol(optarg); - if (page_count <= 0) { - printf("The number of pages must be greater than 0\n"); - return KSFT_FAIL; - } + if (page_count <= 0) + ksft_exit_fail_msg("The number of pages must be greater than 0\n"); break; case 'l': ksm_scan_limit_sec = atoi(optarg); - if (ksm_scan_limit_sec <= 0) { - printf("Timeout value must be greater than 0\n"); - return KSFT_FAIL; - } + if (ksm_scan_limit_sec <= 0) + ksft_exit_fail_msg("Timeout value must be greater than 0\n"); break; case 'h': print_help(); @@ -805,19 +793,15 @@ int main(int argc, char *argv[]) break; case 's': size_MB = atoi(optarg); - if (size_MB <= 0) { - printf("Size must be greater than 0\n"); - return KSFT_FAIL; - } + if (size_MB <= 0) + ksft_exit_fail_msg("Size must be greater than 0\n"); break; case 't': { int tmp = atoi(optarg); - if (tmp < 0 || tmp > KSM_MERGE_LAST) { - printf("Invalid merge type\n"); - return KSFT_FAIL; - } + if (tmp < 0 || tmp > KSM_MERGE_LAST) + ksft_exit_fail_msg("Invalid merge type\n"); merge_type = tmp; } break; @@ -845,82 +829,80 @@ int main(int argc, char *argv[]) test_name = KSM_COW_TIME; break; default: - return KSFT_FAIL; + ksft_exit_fail_msg("Unknown option\n"); } } if (prot == 0) prot = str_to_prot(KSM_PROT_STR_DEFAULT); - if (access(KSM_SYSFS_PATH, F_OK)) { - printf("Config KSM not enabled\n"); - return KSFT_SKIP; - } + if (access(KSM_SYSFS_PATH, F_OK)) + ksft_exit_skip("Config KSM not enabled\n"); - if (ksm_save_def(&ksm_sysfs_old)) { - printf("Cannot save default tunables\n"); - return KSFT_FAIL; - } + if (ksm_save_def(&ksm_sysfs_old)) + ksft_exit_fail_msg("Cannot save default tunables\n"); if (ksm_write_sysfs(KSM_FP("run"), 2) || ksm_write_sysfs(KSM_FP("sleep_millisecs"), 0) || numa_available() ? 
0 : ksm_write_sysfs(KSM_FP("merge_across_nodes"), 1) || ksm_write_sysfs(KSM_FP("pages_to_scan"), page_count)) - return KSFT_FAIL; + ksft_exit_fail_msg("Cannot set up KSM tunables\n"); switch (test_name) { case CHECK_KSM_MERGE: + test_descr = "KSM merging"; ret = check_ksm_merge(merge_type, MAP_PRIVATE | MAP_ANONYMOUS, prot, page_count, ksm_scan_limit_sec, page_size); break; case CHECK_KSM_UNMERGE: + test_descr = "KSM unmerging"; ret = check_ksm_unmerge(merge_type, MAP_PRIVATE | MAP_ANONYMOUS, prot, ksm_scan_limit_sec, page_size); break; case CHECK_KSM_ZERO_PAGE_MERGE: + test_descr = "KSM zero page merging"; ret = check_ksm_zero_page_merge(merge_type, MAP_PRIVATE | MAP_ANONYMOUS, prot, page_count, ksm_scan_limit_sec, use_zero_pages, page_size); break; case CHECK_KSM_NUMA_MERGE: + test_descr = "KSM NUMA merging"; ret = check_ksm_numa_merge(merge_type, MAP_PRIVATE | MAP_ANONYMOUS, prot, ksm_scan_limit_sec, merge_across_nodes, page_size); break; case KSM_MERGE_TIME: - if (size_MB == 0) { - printf("Option '-s' is required.\n"); - return KSFT_FAIL; - } + if (size_MB == 0) + ksft_exit_fail_msg("Option '-s' is required\n"); + test_descr = "KSM merge time"; ret = ksm_merge_time(merge_type, MAP_PRIVATE | MAP_ANONYMOUS, prot, ksm_scan_limit_sec, size_MB); break; case KSM_MERGE_TIME_HUGE_PAGES: - if (size_MB == 0) { - printf("Option '-s' is required.\n"); - return KSFT_FAIL; - } + if (size_MB == 0) + ksft_exit_fail_msg("Option '-s' is required\n"); + test_descr = "KSM merge time with huge pages"; ret = ksm_merge_hugepages_time(merge_type, MAP_PRIVATE | MAP_ANONYMOUS, prot, ksm_scan_limit_sec, size_MB); break; case KSM_UNMERGE_TIME: - if (size_MB == 0) { - printf("Option '-s' is required.\n"); - return KSFT_FAIL; - } + if (size_MB == 0) + ksft_exit_fail_msg("Option '-s' is required\n"); + test_descr = "KSM unmerge time"; ret = ksm_unmerge_time(merge_type, MAP_PRIVATE | MAP_ANONYMOUS, prot, ksm_scan_limit_sec, size_MB); break; case KSM_COW_TIME: + test_descr = "KSM COW time"; 
ret = ksm_cow_time(merge_type, MAP_PRIVATE | MAP_ANONYMOUS, prot, ksm_scan_limit_sec, page_size); break; } - if (ksm_restore(&ksm_sysfs_old)) { - printf("Cannot restore default tunables\n"); - return KSFT_FAIL; - } + if (ksm_restore(&ksm_sysfs_old)) + ksft_print_msg("Cannot restore default tunables\n"); + + ksft_test_result_report(ret, "%s\n", test_descr); - return ret; + ksft_finished(); } diff --git a/tools/testing/selftests/mm/madv_populate.c b/tools/testing/selftests/mm/madv_populate.c index 88050e0f829a..7fce5d0b622b 100644 --- a/tools/testing/selftests/mm/madv_populate.c +++ b/tools/testing/selftests/mm/madv_populate.c @@ -34,7 +34,7 @@ static void sense_support(void) addr = mmap(0, pagesize, PROT_READ | PROT_WRITE, MAP_ANONYMOUS | MAP_PRIVATE, 0, 0); - if (!addr) + if (addr == MAP_FAILED) ksft_exit_fail_msg("mmap failed\n"); ret = madvise(addr, pagesize, MADV_POPULATE_READ); diff --git a/tools/testing/selftests/mm/map_hugetlb.c b/tools/testing/selftests/mm/map_hugetlb.c deleted file mode 100644 index aa409107611b..000000000000 --- a/tools/testing/selftests/mm/map_hugetlb.c +++ /dev/null @@ -1,88 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -/* - * Example of using hugepage memory in a user application using the mmap - * system call with MAP_HUGETLB flag. Before running this program make - * sure the administrator has allocated enough default sized huge pages - * to cover the 256 MB allocation. 
- */ -#include <stdlib.h> -#include <stdio.h> -#include <unistd.h> -#include <sys/mman.h> -#include <fcntl.h> -#include "vm_util.h" -#include "kselftest.h" - -#define LENGTH (256UL*1024*1024) -#define PROTECTION (PROT_READ | PROT_WRITE) - -static void check_bytes(char *addr) -{ - ksft_print_msg("First hex is %x\n", *((unsigned int *)addr)); -} - -static void write_bytes(char *addr, size_t length) -{ - unsigned long i; - - for (i = 0; i < length; i++) - *(addr + i) = (char)i; -} - -static void read_bytes(char *addr, size_t length) -{ - unsigned long i; - - check_bytes(addr); - for (i = 0; i < length; i++) - if (*(addr + i) != (char)i) - ksft_exit_fail_msg("Mismatch at %lu\n", i); - - ksft_test_result_pass("Read correct data\n"); -} - -int main(int argc, char **argv) -{ - void *addr; - size_t hugepage_size; - size_t length = LENGTH; - int flags = MAP_PRIVATE | MAP_ANONYMOUS | MAP_HUGETLB; - int shift = 0; - - hugepage_size = default_huge_page_size(); - /* munmap with fail if the length is not page aligned */ - if (hugepage_size > length) - length = hugepage_size; - - ksft_print_header(); - ksft_set_plan(1); - - if (argc > 1) - length = atol(argv[1]) << 20; - if (argc > 2) { - shift = atoi(argv[2]); - if (shift) - flags |= (shift & MAP_HUGE_MASK) << MAP_HUGE_SHIFT; - } - - if (shift) - ksft_print_msg("%u kB hugepages\n", 1 << (shift - 10)); - else - ksft_print_msg("Default size hugepages\n"); - ksft_print_msg("Mapping %lu Mbytes\n", (unsigned long)length >> 20); - - addr = mmap(NULL, length, PROTECTION, flags, -1, 0); - if (addr == MAP_FAILED) - ksft_exit_fail_msg("mmap: %s\n", strerror(errno)); - - ksft_print_msg("Returned address is %p\n", addr); - check_bytes(addr); - write_bytes(addr, length); - read_bytes(addr, length); - - /* munmap() length of MAP_HUGETLB memory must be hugepage aligned */ - if (munmap(addr, length)) - ksft_exit_fail_msg("munmap: %s\n", strerror(errno)); - - ksft_finished(); -} diff --git a/tools/testing/selftests/mm/migration.c 
b/tools/testing/selftests/mm/migration.c index 60e78bbfc0e3..29f7492453d4 100644 --- a/tools/testing/selftests/mm/migration.c +++ b/tools/testing/selftests/mm/migration.c @@ -5,7 +5,7 @@ */ #include "kselftest_harness.h" -#include "thp_settings.h" +#include "hugepage_settings.h" #include <strings.h> #include <pthread.h> @@ -23,6 +23,8 @@ #define MAX_RETRIES 100 #define ALIGN(x, a) (((x) + (a - 1)) & (~((a) - 1))) +HUGETLB_SETUP_DEFAULT_PAGES(1) + FIXTURE(migration) { pthread_t *threads; @@ -32,13 +34,26 @@ FIXTURE(migration) int n2; }; +static void reset_signals(void) +{ + struct sigaction sa = { .sa_handler = SIG_DFL }; + + sigemptyset(&sa.sa_mask); + sigaction(SIGTERM, &sa, NULL); + sigaction(SIGHUP, &sa, NULL); + sigaction(SIGINT, &sa, NULL); + sigaction(SIGQUIT, &sa, NULL); +} + FIXTURE_SETUP(migration) { int n; + reset_signals(); + if (numa_available() < 0) SKIP(return, "NUMA not available"); - self->nthreads = numa_num_task_cpus() - 1; + self->nthreads = numa_num_task_cpus() - 2; self->n1 = -1; self->n2 = -1; @@ -52,6 +67,9 @@ FIXTURE_SETUP(migration) } } + if (self->nthreads < 1 || self->n1 < 0 || self->n2 < 0) + SKIP(return, "Not enough threads or NUMA nodes available"); + self->threads = malloc(self->nthreads * sizeof(*self->threads)); ASSERT_NE(self->threads, NULL); self->pids = malloc(self->nthreads * sizeof(*self->pids)); @@ -64,6 +82,29 @@ FIXTURE_TEARDOWN(migration) free(self->pids); } +static bool kill_children(FIXTURE_DATA(migration) * self) +{ + bool err = false; + pid_t pid; + int i; + + for (i = 0; i < self->nthreads; i++) { + int status = 0; + + pid = self->pids[i]; + if (pid < 0) + continue; + if (kill(pid, SIGTERM)) + err = true; + if (pid != waitpid(pid, &status, 0)) + err = true; + if (!WIFSIGNALED(status) || WTERMSIG(status) != SIGTERM) + err = true; + } + + return !err; +} + int migrate(uint64_t *ptr, int n1, int n2) { int ret, tmp; @@ -127,20 +168,17 @@ TEST_F_TIMEOUT(migration, private_anon, 2*RUNTIME) uint64_t *ptr; int i; - if 
(self->nthreads < 2 || self->n1 < 0 || self->n2 < 0) - SKIP(return, "Not enough threads or NUMA nodes available"); - ptr = mmap(NULL, TWOMEG, PROT_READ | PROT_WRITE, MAP_PRIVATE | MAP_ANONYMOUS, -1, 0); ASSERT_NE(ptr, MAP_FAILED); memset(ptr, 0xde, TWOMEG); - for (i = 0; i < self->nthreads - 1; i++) + for (i = 0; i < self->nthreads; i++) if (pthread_create(&self->threads[i], NULL, access_mem, ptr)) perror("Couldn't create thread"); ASSERT_EQ(migrate(ptr, self->n1, self->n2), 0); - for (i = 0; i < self->nthreads - 1; i++) + for (i = 0; i < self->nthreads; i++) ASSERT_EQ(pthread_cancel(self->threads[i]), 0); } @@ -151,17 +189,14 @@ TEST_F_TIMEOUT(migration, shared_anon, 2*RUNTIME) { pid_t pid; uint64_t *ptr; - int i; - - if (self->nthreads < 2 || self->n1 < 0 || self->n2 < 0) - SKIP(return, "Not enough threads or NUMA nodes available"); + int i, err; ptr = mmap(NULL, TWOMEG, PROT_READ | PROT_WRITE, MAP_SHARED | MAP_ANONYMOUS, -1, 0); ASSERT_NE(ptr, MAP_FAILED); memset(ptr, 0xde, TWOMEG); - for (i = 0; i < self->nthreads - 1; i++) { + for (i = 0; i < self->nthreads; i++) { pid = fork(); if (!pid) { prctl(PR_SET_PDEATHSIG, SIGHUP); @@ -174,9 +209,9 @@ TEST_F_TIMEOUT(migration, shared_anon, 2*RUNTIME) } } - ASSERT_EQ(migrate(ptr, self->n1, self->n2), 0); - for (i = 0; i < self->nthreads - 1; i++) - ASSERT_EQ(kill(self->pids[i], SIGTERM), 0); + err = migrate(ptr, self->n1, self->n2); + ASSERT_EQ(kill_children(self), true); + ASSERT_EQ(err, 0); } /* @@ -184,28 +219,30 @@ TEST_F_TIMEOUT(migration, shared_anon, 2*RUNTIME) */ TEST_F_TIMEOUT(migration, private_anon_thp, 2*RUNTIME) { + uint64_t pmdsize; uint64_t *ptr; int i; if (!thp_is_enabled()) SKIP(return, "Transparent Hugepages not available"); - if (self->nthreads < 2 || self->n1 < 0 || self->n2 < 0) - SKIP(return, "Not enough threads or NUMA nodes available"); + pmdsize = read_pmd_pagesize(); + if (!pmdsize) + SKIP(return, "Reading PMD pagesize failed"); - ptr = mmap(NULL, 2*TWOMEG, PROT_READ | PROT_WRITE, + ptr = 
mmap(NULL, 2 * pmdsize, PROT_READ | PROT_WRITE, MAP_PRIVATE | MAP_ANONYMOUS, -1, 0); ASSERT_NE(ptr, MAP_FAILED); - ptr = (uint64_t *) ALIGN((uintptr_t) ptr, TWOMEG); - ASSERT_EQ(madvise(ptr, TWOMEG, MADV_HUGEPAGE), 0); - memset(ptr, 0xde, TWOMEG); - for (i = 0; i < self->nthreads - 1; i++) + ptr = (uint64_t *) ALIGN((uintptr_t) ptr, pmdsize); + ASSERT_EQ(madvise(ptr, pmdsize, MADV_HUGEPAGE), 0); + memset(ptr, 0xde, pmdsize); + for (i = 0; i < self->nthreads; i++) if (pthread_create(&self->threads[i], NULL, access_mem, ptr)) perror("Couldn't create thread"); ASSERT_EQ(migrate(ptr, self->n1, self->n2), 0); - for (i = 0; i < self->nthreads - 1; i++) + for (i = 0; i < self->nthreads; i++) ASSERT_EQ(pthread_cancel(self->threads[i]), 0); } @@ -215,25 +252,27 @@ TEST_F_TIMEOUT(migration, private_anon_thp, 2*RUNTIME) TEST_F_TIMEOUT(migration, shared_anon_thp, 2*RUNTIME) { + uint64_t pmdsize; pid_t pid; uint64_t *ptr; - int i; + int i, err; if (!thp_is_enabled()) SKIP(return, "Transparent Hugepages not available"); - if (self->nthreads < 2 || self->n1 < 0 || self->n2 < 0) - SKIP(return, "Not enough threads or NUMA nodes available"); + pmdsize = read_pmd_pagesize(); + if (!pmdsize) + SKIP(return, "Reading PMD pagesize failed"); - ptr = mmap(NULL, 2 * TWOMEG, PROT_READ | PROT_WRITE, + ptr = mmap(NULL, 2 * pmdsize, PROT_READ | PROT_WRITE, MAP_SHARED | MAP_ANONYMOUS, -1, 0); ASSERT_NE(ptr, MAP_FAILED); - ptr = (uint64_t *) ALIGN((uintptr_t) ptr, TWOMEG); - ASSERT_EQ(madvise(ptr, TWOMEG, MADV_HUGEPAGE), 0); + ptr = (uint64_t *) ALIGN((uintptr_t) ptr, pmdsize); + ASSERT_EQ(madvise(ptr, pmdsize, MADV_HUGEPAGE), 0); - memset(ptr, 0xde, TWOMEG); - for (i = 0; i < self->nthreads - 1; i++) { + memset(ptr, 0xde, pmdsize); + for (i = 0; i < self->nthreads; i++) { pid = fork(); if (!pid) { prctl(PR_SET_PDEATHSIG, SIGHUP); @@ -246,9 +285,9 @@ TEST_F_TIMEOUT(migration, shared_anon_thp, 2*RUNTIME) } } - ASSERT_EQ(migrate(ptr, self->n1, self->n2), 0); - for (i = 0; i < self->nthreads - 1; 
i++) - ASSERT_EQ(kill(self->pids[i], SIGTERM), 0); + err = migrate(ptr, self->n1, self->n2); + ASSERT_EQ(kill_children(self), true); + ASSERT_EQ(err, 0); } /* @@ -256,23 +295,28 @@ TEST_F_TIMEOUT(migration, shared_anon_thp, 2*RUNTIME) */ TEST_F_TIMEOUT(migration, private_anon_htlb, 2*RUNTIME) { + unsigned long hugepage_size; uint64_t *ptr; int i; - if (self->nthreads < 2 || self->n1 < 0 || self->n2 < 0) - SKIP(return, "Not enough threads or NUMA nodes available"); + hugepage_size = default_huge_page_size(); + if (!hugepage_size) + SKIP(return, "Reading HugeTLB pagesize failed"); - ptr = mmap(NULL, TWOMEG, PROT_READ | PROT_WRITE, + if (hugetlb_free_default_pages() < 1) + SKIP(return, "Not enough huge pages"); + + ptr = mmap(NULL, hugepage_size, PROT_READ | PROT_WRITE, MAP_PRIVATE | MAP_ANONYMOUS | MAP_HUGETLB, -1, 0); ASSERT_NE(ptr, MAP_FAILED); - memset(ptr, 0xde, TWOMEG); - for (i = 0; i < self->nthreads - 1; i++) + memset(ptr, 0xde, hugepage_size); + for (i = 0; i < self->nthreads; i++) if (pthread_create(&self->threads[i], NULL, access_mem, ptr)) perror("Couldn't create thread"); ASSERT_EQ(migrate(ptr, self->n1, self->n2), 0); - for (i = 0; i < self->nthreads - 1; i++) + for (i = 0; i < self->nthreads; i++) ASSERT_EQ(pthread_cancel(self->threads[i]), 0); } @@ -281,19 +325,24 @@ TEST_F_TIMEOUT(migration, private_anon_htlb, 2*RUNTIME) */ TEST_F_TIMEOUT(migration, shared_anon_htlb, 2*RUNTIME) { + unsigned long hugepage_size; pid_t pid; uint64_t *ptr; - int i; + int i, err; - if (self->nthreads < 2 || self->n1 < 0 || self->n2 < 0) - SKIP(return, "Not enough threads or NUMA nodes available"); + hugepage_size = default_huge_page_size(); + if (!hugepage_size) + SKIP(return, "Reading HugeTLB pagesize failed"); - ptr = mmap(NULL, TWOMEG, PROT_READ | PROT_WRITE, + if (hugetlb_free_default_pages() < 1) + SKIP(return, "Not enough huge pages"); + + ptr = mmap(NULL, hugepage_size, PROT_READ | PROT_WRITE, MAP_SHARED | MAP_ANONYMOUS | MAP_HUGETLB, -1, 0); ASSERT_NE(ptr, 
MAP_FAILED); - memset(ptr, 0xde, TWOMEG); - for (i = 0; i < self->nthreads - 1; i++) { + memset(ptr, 0xde, hugepage_size); + for (i = 0; i < self->nthreads; i++) { pid = fork(); if (!pid) { prctl(PR_SET_PDEATHSIG, SIGHUP); @@ -306,9 +355,9 @@ TEST_F_TIMEOUT(migration, shared_anon_htlb, 2*RUNTIME) } } - ASSERT_EQ(migrate(ptr, self->n1, self->n2), 0); - for (i = 0; i < self->nthreads - 1; i++) - ASSERT_EQ(kill(self->pids[i], SIGTERM), 0); + err = migrate(ptr, self->n1, self->n2); + ASSERT_EQ(kill_children(self), true); + ASSERT_EQ(err, 0); } TEST_HARNESS_MAIN diff --git a/tools/testing/selftests/mm/mlock2-tests.c b/tools/testing/selftests/mm/mlock2-tests.c index b474f2b20def..e16e288cc7c1 100644 --- a/tools/testing/selftests/mm/mlock2-tests.c +++ b/tools/testing/selftests/mm/mlock2-tests.c @@ -1,6 +1,7 @@ // SPDX-License-Identifier: GPL-2.0 #define _GNU_SOURCE #include <sys/mman.h> +#include <linux/mman.h> #include <stdint.h> #include <unistd.h> #include <string.h> @@ -163,14 +164,17 @@ static int lock_check(unsigned long addr) return (vma_rss == vma_size); } -static int unlock_lock_check(char *map) +static int unlock_lock_check(char *map, bool mlock_supported) { - if (is_vmflag_set((unsigned long)map, LOCKED)) { + if (!is_vmflag_set((unsigned long)map, LOCKED)) + return 0; + + if (mlock_supported) ksft_print_msg("VMA flag %s is present on page 1 after unlock\n", LOCKED); - return 1; - } + else + ksft_print_msg("VMA flag %s is present on an unsupported VMA\n", LOCKED); - return 0; + return 1; } static void test_mlock_lock(void) @@ -196,7 +200,7 @@ static void test_mlock_lock(void) ksft_exit_fail_msg("munlock(): %s\n", strerror(errno)); } - ksft_test_result(!unlock_lock_check(map), "%s: Unlocked\n", __func__); + ksft_test_result(!unlock_lock_check(map, true), "%s: Unlocked\n", __func__); munmap(map, 2 * page_size); } @@ -296,7 +300,7 @@ static void test_munlockall0(void) ksft_exit_fail_msg("munlockall(): %s\n", strerror(errno)); } - 
ksft_test_result(!unlock_lock_check(map), "%s: No locked memory\n", __func__); + ksft_test_result(!unlock_lock_check(map, true), "%s: No locked memory\n", __func__); munmap(map, 2 * page_size); } @@ -336,7 +340,67 @@ static void test_munlockall1(void) ksft_exit_fail_msg("munlockall() %s\n", strerror(errno)); } - ksft_test_result(!unlock_lock_check(map), "%s: No locked memory\n", __func__); + ksft_test_result(!unlock_lock_check(map, true), "%s: No locked memory\n", __func__); + munmap(map, 2 * page_size); +} + +/* Droppable memory should not be lockable. */ +static void test_mlock_droppable(void) +{ + char *map; + unsigned long page_size = getpagesize(); + + /* Ensure MCL_FUTURE is not set. */ + if (munlockall()) { + ksft_test_result_fail("munlockall() %s\n", strerror(errno)); + return; + } + + map = mmap(NULL, 2 * page_size, PROT_READ | PROT_WRITE, + MAP_ANONYMOUS | MAP_DROPPABLE, -1, 0); + if (map == MAP_FAILED) { + if ((errno == EOPNOTSUPP) || (errno == EINVAL)) + ksft_test_result_skip("%s: MAP_DROPPABLE not supported\n", __func__); + else + ksft_test_result_fail("mmap error: %s\n", strerror(errno)); + return; + } + + if (mlock2_(map, 2 * page_size, 0)) + ksft_test_result_fail("mlock2(0): %s\n", strerror(errno)); + else + ksft_test_result(!unlock_lock_check(map, false), + "%s: droppable memory not locked\n", __func__); + + munmap(map, 2 * page_size); +} + +static void test_mlockall_future_droppable(void) +{ + char *map; + unsigned long page_size = getpagesize(); + + if (mlockall(MCL_CURRENT | MCL_FUTURE)) { + ksft_test_result_fail("mlockall(MCL_CURRENT | MCL_FUTURE): %s\n", strerror(errno)); + return; + } + + map = mmap(NULL, 2 * page_size, PROT_READ | PROT_WRITE, + MAP_ANONYMOUS | MAP_DROPPABLE, -1, 0); + + if (map == MAP_FAILED) { + if ((errno == EOPNOTSUPP) || (errno == EINVAL)) + ksft_test_result_skip("%s: MAP_DROPPABLE not supported\n", __func__); + else + ksft_test_result_fail("mmap error: %s\n", strerror(errno)); + munlockall(); + return; + } + + 
ksft_test_result(!unlock_lock_check(map, false), "%s: droppable memory not locked\n", + __func__); + + munlockall(); munmap(map, 2 * page_size); } @@ -442,7 +506,7 @@ int main(int argc, char **argv) munmap(map, size); - ksft_set_plan(13); + ksft_set_plan(15); test_mlock_lock(); test_mlock_onfault(); @@ -451,6 +515,8 @@ int main(int argc, char **argv) test_lock_onfault_of_present(); test_vma_management(true); test_mlockall(); + test_mlock_droppable(); + test_mlockall_future_droppable(); ksft_finished(); } diff --git a/tools/testing/selftests/mm/mremap_test.c b/tools/testing/selftests/mm/mremap_test.c index 308576437228..131d9d6db867 100644 --- a/tools/testing/selftests/mm/mremap_test.c +++ b/tools/testing/selftests/mm/mremap_test.c @@ -76,27 +76,6 @@ enum { .expect_failure = should_fail \ } -/* compute square root using binary search */ -static unsigned long get_sqrt(unsigned long val) -{ - unsigned long low = 1; - - /* assuming rand_size is less than 1TB */ - unsigned long high = (1UL << 20); - - while (low <= high) { - unsigned long mid = low + (high - low) / 2; - unsigned long temp = mid * mid; - - if (temp == val) - return mid; - if (temp < val) - low = mid + 1; - high = mid - 1; - } - return low; -} - /* * Returns false if the requested remap region overlaps with an * existing mapping (e.g text, stack) else returns true. @@ -995,11 +974,9 @@ static long long remap_region(struct config c, unsigned int threshold_mb, char *rand_addr) { void *addr, *tmp_addr, *src_addr, *dest_addr, *dest_preamble_addr = NULL; - unsigned long long t, d; struct timespec t_start = {0, 0}, t_end = {0, 0}; long long start_ns, end_ns, align_mask, ret, offset; unsigned long long threshold; - unsigned long num_chunks; if (threshold_mb == VALIDATION_NO_THRESHOLD) threshold = c.region_size; @@ -1068,87 +1045,21 @@ static long long remap_region(struct config c, unsigned int threshold_mb, goto clean_up_dest_preamble; } - /* - * Verify byte pattern after remapping. 
Employ an algorithm with a - * square root time complexity in threshold: divide the range into - * chunks, if memcmp() returns non-zero, only then perform an - * iteration in that chunk to find the mismatch index. - */ - num_chunks = get_sqrt(threshold); - for (unsigned long i = 0; i < num_chunks; ++i) { - size_t chunk_size = threshold / num_chunks; - unsigned long shift = i * chunk_size; - - if (!memcmp(dest_addr + shift, rand_addr + shift, chunk_size)) - continue; - - /* brute force iteration only over mismatch segment */ - for (t = shift; t < shift + chunk_size; ++t) { - if (((char *) dest_addr)[t] != rand_addr[t]) { - ksft_print_msg("Data after remap doesn't match at offset %llu\n", - t); - ksft_print_msg("Expected: %#x\t Got: %#x\n", rand_addr[t] & 0xff, - ((char *) dest_addr)[t] & 0xff); - ret = -1; - goto clean_up_dest; - } - } - } - - /* - * if threshold is not divisible by num_chunks, then check the - * last chunk - */ - for (t = num_chunks * (threshold / num_chunks); t < threshold; ++t) { - if (((char *) dest_addr)[t] != rand_addr[t]) { - ksft_print_msg("Data after remap doesn't match at offset %llu\n", - t); - ksft_print_msg("Expected: %#x\t Got: %#x\n", rand_addr[t] & 0xff, - ((char *) dest_addr)[t] & 0xff); - ret = -1; - goto clean_up_dest; - } + /* Verify byte pattern after remapping */ + if (memcmp(dest_addr, rand_addr, threshold)) { + ksft_print_msg("Data after remap doesn't match\n"); + ret = -1; + goto clean_up_dest; } /* Verify the dest preamble byte pattern after remapping */ - if (!c.dest_preamble_size) - goto no_preamble; - - num_chunks = get_sqrt(c.dest_preamble_size); - - for (unsigned long i = 0; i < num_chunks; ++i) { - size_t chunk_size = c.dest_preamble_size / num_chunks; - unsigned long shift = i * chunk_size; - - if (!memcmp(dest_preamble_addr + shift, rand_addr + shift, - chunk_size)) - continue; - - /* brute force iteration only over mismatched segment */ - for (d = shift; d < shift + chunk_size; ++d) { - if (((char *) 
dest_preamble_addr)[d] != rand_addr[d]) { - ksft_print_msg("Preamble data after remap doesn't match at offset %llu\n", - d); - ksft_print_msg("Expected: %#x\t Got: %#x\n", rand_addr[d] & 0xff, - ((char *) dest_preamble_addr)[d] & 0xff); - ret = -1; - goto clean_up_dest; - } - } - } - - for (d = num_chunks * (c.dest_preamble_size / num_chunks); d < c.dest_preamble_size; ++d) { - if (((char *) dest_preamble_addr)[d] != rand_addr[d]) { - ksft_print_msg("Preamble data after remap doesn't match at offset %llu\n", - d); - ksft_print_msg("Expected: %#x\t Got: %#x\n", rand_addr[d] & 0xff, - ((char *) dest_preamble_addr)[d] & 0xff); - ret = -1; - goto clean_up_dest; - } + if (c.dest_preamble_size && + memcmp(dest_preamble_addr, rand_addr, c.dest_preamble_size)) { + ksft_print_msg("Preamble data after remap doesn't match\n"); + ret = -1; + goto clean_up_dest; } -no_preamble: start_ns = t_start.tv_sec * NS_PER_SEC + t_start.tv_nsec; end_ns = t_end.tv_sec * NS_PER_SEC + t_end.tv_nsec; ret = end_ns - start_ns; diff --git a/tools/testing/selftests/mm/page_frag/page_frag_test.c b/tools/testing/selftests/mm/page_frag/page_frag_test.c index e806c1866e36..c8584d0fdeab 100644 --- a/tools/testing/selftests/mm/page_frag/page_frag_test.c +++ b/tools/testing/selftests/mm/page_frag/page_frag_test.c @@ -131,6 +131,8 @@ static int __init page_frag_test_init(void) init_completion(&wait); if (test_alloc_len > PAGE_SIZE || test_alloc_len <= 0 || + test_push_cpu < 0 || test_push_cpu >= nr_cpu_ids || + test_pop_cpu < 0 || test_pop_cpu >= nr_cpu_ids || !cpu_active(test_push_cpu) || !cpu_active(test_pop_cpu)) return -EINVAL; diff --git a/tools/testing/selftests/mm/pagemap_ioctl.c b/tools/testing/selftests/mm/pagemap_ioctl.c index 7f9428d6062c..762306177ad8 100644 --- a/tools/testing/selftests/mm/pagemap_ioctl.c +++ b/tools/testing/selftests/mm/pagemap_ioctl.c @@ -7,8 +7,6 @@ #include <sys/mman.h> #include <errno.h> #include <malloc.h> -#include "vm_util.h" -#include "kselftest.h" #include 
<linux/types.h> #include <linux/memfd.h> #include <linux/userfaultfd.h> @@ -23,6 +21,10 @@ #include <sys/ipc.h> #include <sys/shm.h> +#include "vm_util.h" +#include "kselftest.h" +#include "hugepage_settings.h" + #define PAGEMAP_BITS_ALL (PAGE_IS_WPALLOWED | PAGE_IS_WRITTEN | \ PAGE_IS_FILE | PAGE_IS_PRESENT | \ PAGE_IS_SWAPPED | PAGE_IS_PFNZERO | \ @@ -1554,6 +1556,9 @@ int main(int __attribute__((unused)) argc, char *argv[]) if (init_uffd()) ksft_exit_skip("Failed to initialize userfaultfd\n"); + if (!hugetlb_setup_default(4)) + ksft_print_msg("HugeTLB test will be skipped\n"); + ksft_set_plan(117); page_size = getpagesize(); @@ -1605,7 +1610,7 @@ int main(int __attribute__((unused)) argc, char *argv[]) } /* 5. SHM Hugetlb page testing */ - mem_size = 2*1024*1024; + mem_size = default_huge_page_size(); mem = gethugetlb_mem(mem_size, &shmid); if (mem) { wp_init(mem, mem_size); @@ -1633,7 +1638,7 @@ int main(int __attribute__((unused)) argc, char *argv[]) } /* 7. File Hugetlb testing */ - mem_size = 2*1024*1024; + mem_size = default_huge_page_size(); fd = memfd_create("uffd-test", MFD_HUGETLB | MFD_NOEXEC_SEAL); if (fd < 0) ksft_exit_fail_msg("uffd-test creation failed %d %s\n", errno, strerror(errno)); diff --git a/tools/testing/selftests/mm/pkey-arm64.h b/tools/testing/selftests/mm/pkey-arm64.h index 8e9685e03c44..c5a78a2f211d 100644 --- a/tools/testing/selftests/mm/pkey-arm64.h +++ b/tools/testing/selftests/mm/pkey-arm64.h @@ -130,9 +130,10 @@ static inline u64 get_pkey_bits(u64 reg, int pkey) static inline void aarch64_write_signal_pkey(ucontext_t *uctxt, u64 pkey) { struct _aarch64_ctx *ctx = GET_UC_RESV_HEAD(uctxt); + size_t resv_size = GET_UCP_RESV_SIZE(uctxt); struct poe_context *poe_ctx = (struct poe_context *) get_header(ctx, POE_MAGIC, - sizeof(uctxt->uc_mcontext), NULL); + resv_size, NULL); if (poe_ctx) poe_ctx->por_el0 = pkey; } diff --git a/tools/testing/selftests/mm/pkey-helpers.h b/tools/testing/selftests/mm/pkey-helpers.h index 
7c29f075e40b..2c377f4e9df1 100644 --- a/tools/testing/selftests/mm/pkey-helpers.h +++ b/tools/testing/selftests/mm/pkey-helpers.h @@ -71,13 +71,14 @@ static inline void sigsafe_printf(const char *format, ...) extern void abort_hooks(void); #define pkey_assert(condition) do { \ if (!(condition)) { \ - dprintf0("assert() at %s::%d test_nr: %d iteration: %d\n", \ - __FILE__, __LINE__, \ - test_nr, iteration_nr); \ - dprintf0("errno at assert: %d", errno); \ - abort_hooks(); \ - exit(__LINE__); \ - } \ + dprintf0("# assert() at %s::%d test_nr: %d iteration: %d\n", \ + __FILE__, __LINE__, \ + test_nr, iteration_nr); \ + dprintf0("# errno at assert: %d\n", errno); \ + abort_hooks(); \ + ksft_exit_fail_msg("test %d (iteration %d)\n", \ + test_nr, iteration_nr); \ + } \ } while (0) #define barrier() __asm__ __volatile__("": : :"memory") diff --git a/tools/testing/selftests/mm/prctl_thp_disable.c b/tools/testing/selftests/mm/prctl_thp_disable.c index ca27200596a4..d8d9d1de57b8 100644 --- a/tools/testing/selftests/mm/prctl_thp_disable.c +++ b/tools/testing/selftests/mm/prctl_thp_disable.c @@ -14,7 +14,7 @@ #include <sys/wait.h> #include "kselftest_harness.h" -#include "thp_settings.h" +#include "hugepage_settings.h" #include "vm_util.h" #ifndef PR_THP_DISABLE_EXCEPT_ADVISED diff --git a/tools/testing/selftests/mm/process_madv.c b/tools/testing/selftests/mm/process_madv.c index cd4610baf5d7..3fffd5f7e6fb 100644 --- a/tools/testing/selftests/mm/process_madv.c +++ b/tools/testing/selftests/mm/process_madv.c @@ -310,6 +310,34 @@ TEST_F(process_madvise, invalid_vlen) } /* + * Test that invalid advice is rejected even when the iovec has zero total + * length. A request with valid advice and zero length is a noop, but + * invalid advice should still fail with EINVAL. 
+ */ +TEST_F(process_madvise, invalid_advice_zero_length) +{ + struct iovec vec = { + .iov_base = NULL, + .iov_len = 0, + }; + int pidfd = self->pidfd; + ssize_t ret; + + errno = 0; + ret = sys_process_madvise(pidfd, &vec, 1, -1, 0); + ASSERT_EQ(ret, -1); + ASSERT_EQ(errno, EINVAL); + + errno = 0; + ret = sys_process_madvise(pidfd, &vec, 1, MADV_DONTNEED, 0); + ASSERT_EQ(ret, 0); + + ret = sys_process_madvise(pidfd, NULL, 0, -1, 0); + ASSERT_EQ(ret, -1); + ASSERT_EQ(errno, EINVAL); +} + +/* * Test process_madvise() with an invalid flag value. Currently, only a flag * value of 0 is supported. This test is reserved for the future, e.g., if * synchronous flags are added. diff --git a/tools/testing/selftests/mm/protection_keys.c b/tools/testing/selftests/mm/protection_keys.c index 2085982dba69..9a6d954ee371 100644 --- a/tools/testing/selftests/mm/protection_keys.c +++ b/tools/testing/selftests/mm/protection_keys.c @@ -46,6 +46,7 @@ #include <sys/ptrace.h> #include <setjmp.h> +#include "hugepage_settings.h" #include "pkey-helpers.h" int iteration_nr = 1; @@ -61,6 +62,7 @@ noinline int read_ptr(int *ptr) return *ptr; } +#if CONTROL_TRACING > 0 static void cat_into_file(char *str, char *file) { int fd = open(file, O_RDWR); @@ -86,7 +88,6 @@ static void cat_into_file(char *str, char *file) close(fd); } -#if CONTROL_TRACING > 0 static int warned_tracing; static int tracing_root_ok(void) { @@ -136,6 +137,7 @@ static void tracing_off(void) void abort_hooks(void) { + fflush(stdout); fprintf(stderr, "running %s()...\n", __func__); tracing_off(); #ifdef SLEEP_ON_ABORT @@ -370,8 +372,8 @@ static void signal_handler(int signum, siginfo_t *si, void *vucontext) if ((si->si_code == SEGV_MAPERR) || (si->si_code == SEGV_ACCERR) || (si->si_code == SEGV_BNDERR)) { - printf("non-PK si_code, exiting...\n"); - exit(4); + dprintf0("# non-PK si_code: %d, exiting...\n", si->si_code); + exit(1); } si_pkey_ptr = siginfo_get_pkey_ptr(si); @@ -708,50 +710,28 @@ static void 
*malloc_pkey_anon_huge(long size, int prot, u16 pkey) } static int hugetlb_setup_ok; -#define SYSFS_FMT_NR_HUGE_PAGES "/sys/kernel/mm/hugepages/hugepages-%ldkB/nr_hugepages" #define GET_NR_HUGE_PAGES 10 static void setup_hugetlbfs(void) { - int err; - int fd; - char buf[256]; - long hpagesz_kb; - long hpagesz_mb; + long hpagesz_mb = HPAGE_SIZE / 1024 / 1024; + unsigned long free_pages; if (geteuid() != 0) { - fprintf(stderr, "WARNING: not run as root, can not do hugetlb test\n"); + ksft_print_msg("WARNING: not run as root, can not do hugetlb test\n"); return; } - cat_into_file(__stringify(GET_NR_HUGE_PAGES), "/proc/sys/vm/nr_hugepages"); - /* - * Now go make sure that we got the pages and that they + * Make sure that we got the pages and that they * are PMD-level pages. Someone might have made PUD-level * pages the default. */ - hpagesz_kb = HPAGE_SIZE / 1024; - hpagesz_mb = hpagesz_kb / 1024; - sprintf(buf, SYSFS_FMT_NR_HUGE_PAGES, hpagesz_kb); - fd = open(buf, O_RDONLY); - if (fd < 0) { - fprintf(stderr, "opening sysfs %ldM hugetlb config: %s\n", - hpagesz_mb, strerror(errno)); - return; - } - - /* -1 to guarantee leaving the trailing \0 */ - err = read(fd, buf, sizeof(buf)-1); - close(fd); - if (err <= 0) { - fprintf(stderr, "reading sysfs %ldM hugetlb config: %s\n", - hpagesz_mb, strerror(errno)); - return; - } - - if (atoi(buf) != GET_NR_HUGE_PAGES) { - fprintf(stderr, "could not confirm %ldM pages, got: '%s' expected %d\n", - hpagesz_mb, buf, GET_NR_HUGE_PAGES); + hugetlb_save_settings(); + hugetlb_set_nr_pages(HPAGE_SIZE, GET_NR_HUGE_PAGES); + free_pages = hugetlb_free_pages(HPAGE_SIZE); + if (free_pages < GET_NR_HUGE_PAGES) { + ksft_print_msg("could not confirm %ldM pages, got: '%lu' expected %d\n", + hpagesz_mb, free_pages, GET_NR_HUGE_PAGES); return; } @@ -855,7 +835,7 @@ void expected_pkey_fault(int pkey) #define do_not_expect_pkey_fault(msg) do { \ if (last_pkey_faults != pkey_faults) \ - dprintf0("unexpected PKey fault: %s\n", msg); \ + dprintf0("# 
unexpected PKey fault: %s\n", msg); \ pkey_assert(last_pkey_faults == pkey_faults); \ } while (0) @@ -1128,7 +1108,7 @@ static void become_child(void) /* in the child */ return; } - exit(0); + _exit(0); } /* Assumes that all pkeys other than 'pkey' are unallocated */ @@ -1507,18 +1487,18 @@ static void test_ptrace_modifies_pkru(int *ptr, u16 pkey) * checking */ if (__read_pkey_reg() != new_pkru) - exit(1); + _exit(1); /* Stop and allow the tracer to clear XSTATE_BV for PKRU */ raise(SIGSTOP); if (__read_pkey_reg() != 0) - exit(1); + _exit(1); /* Stop and allow the tracer to examine PKRU */ raise(SIGSTOP); - exit(0); + _exit(0); } pkey_assert(child == waitpid(child, &status, 0)); @@ -1692,29 +1672,36 @@ static void test_mprotect_pkey_on_unsupported_cpu(int *ptr, u16 pkey) pkey_assert(sret < 0); } -static void (*pkey_tests[])(int *ptr, u16 pkey) = { - test_read_of_write_disabled_region, - test_read_of_access_disabled_region, - test_read_of_access_disabled_region_with_page_already_mapped, - test_write_of_write_disabled_region, - test_write_of_write_disabled_region_with_page_already_mapped, - test_write_of_access_disabled_region, - test_write_of_access_disabled_region_with_page_already_mapped, - test_kernel_write_of_access_disabled_region, - test_kernel_write_of_write_disabled_region, - test_kernel_gup_of_access_disabled_region, - test_kernel_gup_write_to_write_disabled_region, - test_executing_on_unreadable_memory, - test_implicit_mprotect_exec_only_memory, - test_mprotect_with_pkey_0, - test_ptrace_of_child, - test_pkey_init_state, - test_pkey_syscalls_on_non_allocated_pkey, - test_pkey_syscalls_bad_args, - test_pkey_alloc_exhaust, - test_pkey_alloc_free_attach_pkey0, +struct pkey_test { + void (*func)(int *ptr, u16 pkey); + const char *name; +}; + +#define PKEY_TEST(fn) { fn, #fn } + +static struct pkey_test pkey_tests[] = { + PKEY_TEST(test_read_of_write_disabled_region), + PKEY_TEST(test_read_of_access_disabled_region), + 
PKEY_TEST(test_read_of_access_disabled_region_with_page_already_mapped), + PKEY_TEST(test_write_of_write_disabled_region), + PKEY_TEST(test_write_of_write_disabled_region_with_page_already_mapped), + PKEY_TEST(test_write_of_access_disabled_region), + PKEY_TEST(test_write_of_access_disabled_region_with_page_already_mapped), + PKEY_TEST(test_kernel_write_of_access_disabled_region), + PKEY_TEST(test_kernel_write_of_write_disabled_region), + PKEY_TEST(test_kernel_gup_of_access_disabled_region), + PKEY_TEST(test_kernel_gup_write_to_write_disabled_region), + PKEY_TEST(test_executing_on_unreadable_memory), + PKEY_TEST(test_implicit_mprotect_exec_only_memory), + PKEY_TEST(test_mprotect_with_pkey_0), + PKEY_TEST(test_ptrace_of_child), + PKEY_TEST(test_pkey_init_state), + PKEY_TEST(test_pkey_syscalls_on_non_allocated_pkey), + PKEY_TEST(test_pkey_syscalls_bad_args), + PKEY_TEST(test_pkey_alloc_exhaust), + PKEY_TEST(test_pkey_alloc_free_attach_pkey0), #if defined(__i386__) || defined(__x86_64__) || defined(__aarch64__) - test_ptrace_modifies_pkru, + PKEY_TEST(test_ptrace_modifies_pkru), #endif }; @@ -1735,7 +1722,7 @@ static void run_tests_once(void) dprintf1("test %d starting with pkey: %d\n", test_nr, pkey); ptr = malloc_pkey(PAGE_SIZE, prot, pkey); dprintf1("test %d starting...\n", test_nr); - pkey_tests[test_nr](ptr, pkey); + pkey_tests[test_nr].func(ptr, pkey); dprintf1("freeing test memory: %p\n", ptr); free_pkey_malloc(ptr); sys_pkey_free(pkey); @@ -1746,7 +1733,7 @@ static void run_tests_once(void) tracing_off(); close_test_fds(); - printf("test %2d PASSED (iteration %d)\n", test_nr, iteration_nr); + ksft_test_result_pass("test %s (iteration %d)\n", pkey_tests[test_nr].name, iteration_nr); dprintf1("======================\n\n"); } iteration_nr++; @@ -1766,27 +1753,30 @@ int main(void) setup_handlers(); - printf("has pkeys: %d\n", pkeys_supported); + ksft_print_header(); if (!pkeys_supported) { int size = PAGE_SIZE; int *ptr; - printf("running PKEY tests for unsupported 
CPU/OS\n"); + ksft_set_plan(1); + ksft_print_msg("running PKEY tests for unsupported CPU/OS\n"); ptr = mmap(NULL, size, PROT_NONE, MAP_ANONYMOUS|MAP_PRIVATE, -1, 0); assert(ptr != (void *)-1); test_mprotect_pkey_on_unsupported_cpu(ptr, 1); - exit(0); + ksft_test_result_pass("pkey on unsupported CPU/OS\n"); + ksft_finished(); } + ksft_set_plan(ARRAY_SIZE(pkey_tests) * nr_iterations); + pkey_setup_shadow(); - printf("startup pkey_reg: %016llx\n", read_pkey_reg()); + ksft_print_msg("startup pkey_reg: %016llx\n", read_pkey_reg()); setup_hugetlbfs(); while (nr_iterations-- > 0) run_tests_once(); - printf("done (all tests OK)\n"); - return 0; + ksft_finished(); } diff --git a/tools/testing/selftests/mm/run_vmtests.sh b/tools/testing/selftests/mm/run_vmtests.sh index d8468451b3a3..8c296dedf047 100755 --- a/tools/testing/selftests/mm/run_vmtests.sh +++ b/tools/testing/selftests/mm/run_vmtests.sh @@ -103,7 +103,7 @@ RUN_ALL=false RUN_DESTRUCTIVE=false TAP_PREFIX="# " -while getopts "aht:n" OPT; do +while getopts "aht:nd" OPT; do case ${OPT} in "a") RUN_ALL=true ;; "h") usage ;; @@ -132,7 +132,7 @@ test_selected() { run_gup_matrix() { # -t: thp=on, -T: thp=off, -H: hugetlb=on - local hugetlb_mb=$(( needmem_KB / 1024 )) + local hugetlb_mb=256 for huge in -t -T "-H -m $hugetlb_mb"; do # -u: gup-fast, -U: gup-basic, -a: pin-fast, -b: pin-basic, -L: pin-longterm @@ -154,60 +154,6 @@ run_gup_matrix() { done } -# get huge pagesize and freepages from /proc/meminfo -while read -r name size unit; do - if [ "$name" = "HugePages_Free:" ]; then - freepgs="$size" - fi - if [ "$name" = "Hugepagesize:" ]; then - hpgsize_KB="$size" - fi -done < /proc/meminfo - -# Simple hugetlbfs tests have a hardcoded minimum requirement of -# huge pages totaling 256MB (262144KB) in size. The userfaultfd -# hugetlb test requires a minimum of 2 * nr_cpus huge pages. Take -# both of these requirements into account and attempt to increase -# number of huge pages available. 
-nr_cpus=$(nproc) -uffd_min_KB=$((hpgsize_KB * nr_cpus * 2)) -hugetlb_min_KB=$((256 * 1024)) -if [[ $uffd_min_KB -gt $hugetlb_min_KB ]]; then - needmem_KB=$uffd_min_KB -else - needmem_KB=$hugetlb_min_KB -fi - -# set proper nr_hugepages -if [ -n "$freepgs" ] && [ -n "$hpgsize_KB" ]; then - orig_nr_hugepgs=$(cat /proc/sys/vm/nr_hugepages) - needpgs=$((needmem_KB / hpgsize_KB)) - tries=2 - while [ "$tries" -gt 0 ] && [ "$freepgs" -lt "$needpgs" ]; do - lackpgs=$((needpgs - freepgs)) - echo 3 > /proc/sys/vm/drop_caches - if ! echo $((lackpgs + orig_nr_hugepgs)) > /proc/sys/vm/nr_hugepages; then - echo "Please run this test as root" - exit $ksft_skip - fi - while read -r name size unit; do - if [ "$name" = "HugePages_Free:" ]; then - freepgs=$size - fi - done < /proc/meminfo - tries=$((tries - 1)) - done - nr_hugepgs=$(cat /proc/sys/vm/nr_hugepages) - if [ "$freepgs" -lt "$needpgs" ]; then - printf "Not enough huge pages available (%d < %d)\n" \ - "$freepgs" "$needpgs" - fi - HAVE_HUGEPAGES=1 -else - echo "no hugetlbfs support in kernel?" - HAVE_HUGEPAGES=0 -fi - # filter 64bit architectures ARCH64STR="arm64 mips64 parisc64 ppc64 ppc64le riscv64 s390x sparc64 x86_64" if [ -z "$ARCH" ]; then @@ -235,32 +181,61 @@ pretty_name() { run_test() { if test_selected ${CATEGORY}; then local skip=0 + local LOADED_HWPOISON_INJECT_MOD=0 # On memory constrainted systems some tests can fail to allocate hugepages. # perform some cleanup before the test for a higher success rate. 
if [ ${CATEGORY} == "thp" -o ${CATEGORY} == "hugetlb" ]; then - if [ "${HAVE_HUGEPAGES}" = "1" ]; then + mem_kb=$(awk '/MemAvailable/ {print $2}' /proc/meminfo) + mem_Mb=$((mem_kb / 1024)) + + if (( $mem_Mb < 256 )); then echo 3 > /proc/sys/vm/drop_caches sleep 2 echo 1 > /proc/sys/vm/compact_memory sleep 2 - else - echo "hugepages not supported" | tap_prefix - skip=1 fi fi + # Ensure hwpoison_inject is available for memory-failure tests + if [ "${CATEGORY}" = "memory-failure" ]; then + # Try to load hwpoison_inject if not present. + HWPOISON_DIR=/sys/kernel/debug/hwpoison/ + if [ ! -d "$HWPOISON_DIR" ]; then + if ! modprobe -n hwpoison_inject > /dev/null 2>&1; then + echo "Module hwpoison_inject not found, skipping..." \ + | tap_prefix + skip=1 + else + modprobe hwpoison_inject > /dev/null 2>&1 + LOADED_HWPOISON_INJECT_MOD=1 + if [ ! -d "$HWPOISON_DIR" ]; then + echo "hwpoison debugfs interface not present" \ + | tap_prefix + skip=1 + fi + fi + fi + + fi + local test=$(pretty_name "$*") local title="running $*" local sep=$(echo -n "$title" | tr "[:graph:][:space:]" -) printf "%s\n%s\n%s\n" "$sep" "$title" "$sep" | tap_prefix - if [ "${skip}" != "1" ]; then + if [ $skip -eq 1 ]; then + local ret=$ksft_skip + else ("$@" 2>&1) | tap_prefix local ret=${PIPESTATUS[0]} - else - local ret=$ksft_skip fi + + # Unload hwpoison_inject if we loaded it + if [ "${LOADED_HWPOISON_INJECT_MOD}" = "1" ]; then + modprobe -r hwpoison_inject > /dev/null 2>&1 + fi + count_total=$(( count_total + 1 )) if [ $ret -eq 0 ]; then count_pass=$(( count_pass + 1 )) @@ -270,7 +245,9 @@ run_test() { count_skip=$(( count_skip + 1 )) echo "[SKIP]" | tap_prefix echo "ok ${count_total} ${test} # SKIP" | tap_output - exitcode=$ksft_skip + if [ $exitcode -eq 0 ]; then + exitcode=$ksft_skip + fi else count_fail=$(( count_fail + 1 )) echo "[FAIL]" | tap_prefix @@ -282,31 +259,14 @@ run_test() { echo "TAP version 13" | tap_output -CATEGORY="hugetlb" run_test ./hugepage-mmap - -shmmax=$(cat 
/proc/sys/kernel/shmmax) -shmall=$(cat /proc/sys/kernel/shmall) -echo 268435456 > /proc/sys/kernel/shmmax -echo 4194304 > /proc/sys/kernel/shmall -CATEGORY="hugetlb" run_test ./hugepage-shm -echo "$shmmax" > /proc/sys/kernel/shmmax -echo "$shmall" > /proc/sys/kernel/shmall - -CATEGORY="hugetlb" run_test ./map_hugetlb -CATEGORY="hugetlb" run_test ./hugepage-mremap -CATEGORY="hugetlb" run_test ./hugepage-vmemmap +CATEGORY="hugetlb" run_test ./hugetlb-mmap +CATEGORY="hugetlb" run_test ./hugetlb-shm +CATEGORY="hugetlb" run_test ./hugetlb-mremap +CATEGORY="hugetlb" run_test ./hugetlb-vmemmap CATEGORY="hugetlb" run_test ./hugetlb-madvise CATEGORY="hugetlb" run_test ./hugetlb_dio - -if [ "${HAVE_HUGEPAGES}" = "1" ]; then - nr_hugepages_tmp=$(cat /proc/sys/vm/nr_hugepages) - # For this test, we need one and just one huge page - echo 1 > /proc/sys/vm/nr_hugepages - CATEGORY="hugetlb" run_test ./hugetlb_fault_after_madv - CATEGORY="hugetlb" run_test ./hugetlb_madv_vs_map - # Restore the previous number of huge pages, since further tests rely on it - echo "$nr_hugepages_tmp" > /proc/sys/vm/nr_hugepages -fi +CATEGORY="hugetlb" run_test ./hugetlb_fault_after_madv +CATEGORY="hugetlb" run_test ./hugetlb_madv_vs_map if test_selected "hugetlb"; then echo "NOTE: These hugetlb tests provide minimal coverage. Use" | tap_prefix @@ -333,44 +293,11 @@ CATEGORY="gup_test" run_test ./gup_longterm CATEGORY="userfaultfd" run_test ./uffd-unit-tests uffd_stress_bin=./uffd-stress CATEGORY="userfaultfd" run_test ${uffd_stress_bin} anon 20 16 -# Hugetlb tests require source and destination huge pages. Pass in almost half -# the size of the free pages we have, which is used for *each*. An adjustment -# of (nr_parallel - 1) is done (see nr_parallel in uffd-stress.c) to have some -# extra hugepages - this is done to prevent the test from failing by racily -# reserving more hugepages than strictly required. 
-# uffd-stress expects a region expressed in MiB, so we adjust -# half_ufd_size_MB accordingly. -adjustment=$(( (31 < (nr_cpus - 1)) ? 31 : (nr_cpus - 1) )) -half_ufd_size_MB=$((((freepgs - adjustment) * hpgsize_KB) / 1024 / 2)) -CATEGORY="userfaultfd" run_test ${uffd_stress_bin} hugetlb "$half_ufd_size_MB" 32 -CATEGORY="userfaultfd" run_test ${uffd_stress_bin} hugetlb-private "$half_ufd_size_MB" 32 +CATEGORY="userfaultfd" run_test ${uffd_stress_bin} hugetlb 128 32 +CATEGORY="userfaultfd" run_test ${uffd_stress_bin} hugetlb-private 128 32 CATEGORY="userfaultfd" run_test ${uffd_stress_bin} shmem 20 16 CATEGORY="userfaultfd" run_test ${uffd_stress_bin} shmem-private 20 16 -# uffd-wp-mremap requires at least one page of each size. -have_all_size_hugepgs=true -declare -A nr_size_hugepgs -for f in /sys/kernel/mm/hugepages/**/nr_hugepages; do - old=$(cat $f) - nr_size_hugepgs["$f"]="$old" - if [ "$old" == 0 ]; then - echo 1 > "$f" - fi - if [ $(cat "$f") == 0 ]; then - have_all_size_hugepgs=false - break - fi -done -if $have_all_size_hugepgs; then - CATEGORY="userfaultfd" run_test ./uffd-wp-mremap -else - echo "# SKIP ./uffd-wp-mremap" -fi - -#cleanup -for f in "${!nr_size_hugepgs[@]}"; do - echo "${nr_size_hugepgs["$f"]}" > "$f" -done -echo "$nr_hugepgs" > /proc/sys/vm/nr_hugepages +CATEGORY="userfaultfd" run_test ./uffd-wp-mremap CATEGORY="compaction" run_test ./compaction_test @@ -382,6 +309,7 @@ else fi CATEGORY="mmap" run_test ./map_populate +CATEGORY="mmap" run_test ./droppable CATEGORY="mlock" run_test ./mlock-random-test @@ -394,12 +322,10 @@ CATEGORY="mremap" run_test ./mremap_test CATEGORY="hugetlb" run_test ./thuge-gen CATEGORY="hugetlb" run_test ./charge_reserved_hugetlb.sh -cgroup-v2 CATEGORY="hugetlb" run_test ./hugetlb_reparenting_test.sh -cgroup-v2 + if $RUN_DESTRUCTIVE; then -nr_hugepages_tmp=$(cat /proc/sys/vm/nr_hugepages) enable_soft_offline=$(cat /proc/sys/vm/enable_soft_offline) -echo 8 > /proc/sys/vm/nr_hugepages CATEGORY="hugetlb" run_test 
./hugetlb-soft-offline -echo "$nr_hugepages_tmp" > /proc/sys/vm/nr_hugepages echo "$enable_soft_offline" > /proc/sys/vm/enable_soft_offline CATEGORY="hugetlb" run_test ./hugetlb-read-hwpoison fi @@ -436,9 +362,7 @@ CATEGORY="memfd_secret" run_test ./memfd_secret fi # KSM KSM_MERGE_TIME_HUGE_PAGES test with size of 100 -if [ "${HAVE_HUGEPAGES}" = "1" ]; then - CATEGORY="ksm" run_test ./ksm_tests -H -s 100 -fi +CATEGORY="ksm" run_test ./ksm_tests -H -s 100 # KSM KSM_MERGE_TIME test with size of 100 CATEGORY="ksm" run_test ./ksm_tests -P -s 100 # KSM MADV_MERGEABLE test with 10 identical pages @@ -457,7 +381,6 @@ CATEGORY="ksm_numa" run_test ./ksm_tests -N -m 0 CATEGORY="ksm" run_test ./ksm_functional_tests # protection_keys tests -nr_hugepgs=$(cat /proc/sys/vm/nr_hugepages) if [ -x ./protection_keys_32 ] then CATEGORY="pkey" run_test ./protection_keys_32 @@ -467,7 +390,6 @@ if [ -x ./protection_keys_64 ] then CATEGORY="pkey" run_test ./protection_keys_64 fi -echo "$nr_hugepgs" > /proc/sys/vm/nr_hugepages if [ -x ./soft-dirty ] then @@ -489,24 +411,28 @@ CATEGORY="thp" run_test ./khugepaged all:shmem CATEGORY="thp" run_test ./khugepaged -s 4 all:shmem -CATEGORY="thp" run_test ./transhuge-stress -d 20 - # Try to create XFS if not provided if [ -z "${SPLIT_HUGE_PAGE_TEST_XFS_PATH}" ]; then - if [ "${HAVE_HUGEPAGES}" = "1" ]; then - if test_selected "thp"; then - if grep xfs /proc/filesystems &>/dev/null; then - XFS_IMG=$(mktemp /tmp/xfs_img_XXXXXX) - SPLIT_HUGE_PAGE_TEST_XFS_PATH=$(mktemp -d /tmp/xfs_dir_XXXXXX) - truncate -s 314572800 ${XFS_IMG} - mkfs.xfs -q ${XFS_IMG} - mount -o loop ${XFS_IMG} ${SPLIT_HUGE_PAGE_TEST_XFS_PATH} - MOUNTED_XFS=1 - fi + if test_selected "thp"; then + if grep xfs /proc/filesystems &>/dev/null; then + XFS_IMG=$(mktemp /tmp/xfs_img_XXXXXX) + SPLIT_HUGE_PAGE_TEST_XFS_PATH=$(mktemp -d /tmp/xfs_dir_XXXXXX) + truncate -s 314572800 ${XFS_IMG} + mkfs.xfs -q ${XFS_IMG} + mount -o loop ${XFS_IMG} ${SPLIT_HUGE_PAGE_TEST_XFS_PATH} + MOUNTED_XFS=1 fi 
fi fi +if [ -n "${SPLIT_HUGE_PAGE_TEST_XFS_PATH}" ]; then +CATEGORY="thp" run_test ./khugepaged all:file ${SPLIT_HUGE_PAGE_TEST_XFS_PATH} +elif test_selected thp; then + count_total=$(( count_total + 1 )) + count_skip=$(( count_skip + 1 )) + echo "[SKIP] ./khugepaged all:file" | tap_prefix +fi + CATEGORY="thp" run_test ./split_huge_page_test ${SPLIT_HUGE_PAGE_TEST_XFS_PATH} if [ -n "${MOUNTED_XFS}" ]; then @@ -515,6 +441,8 @@ if [ -n "${MOUNTED_XFS}" ]; then rm -f ${XFS_IMG} fi +CATEGORY="thp" run_test ./transhuge-stress -d 20 + CATEGORY="thp" run_test ./folio_split_race_test CATEGORY="migration" run_test ./migration @@ -531,28 +459,7 @@ CATEGORY="page_frag" run_test ./test_page_frag.sh nonaligned CATEGORY="rmap" run_test ./rmap -# Try to load hwpoison_inject if not present. -HWPOISON_DIR=/sys/kernel/debug/hwpoison/ -if [ ! -d "$HWPOISON_DIR" ]; then - if ! modprobe -q -R hwpoison_inject; then - echo "Module hwpoison_inject not found, skipping..." - else - modprobe hwpoison_inject > /dev/null 2>&1 - LOADED_MOD=1 - fi -fi - -if [ -d "$HWPOISON_DIR" ]; then - CATEGORY="memory-failure" run_test ./memory-failure -fi - -if [ -n "${LOADED_MOD}" ]; then - modprobe -r hwpoison_inject > /dev/null 2>&1 -fi - -if [ "${HAVE_HUGEPAGES}" = 1 ]; then - echo "$orig_nr_hugepgs" > /proc/sys/vm/nr_hugepages -fi +CATEGORY="memory-failure" run_test ./memory-failure echo "SUMMARY: PASS=${count_pass} SKIP=${count_skip} FAIL=${count_fail}" | tap_prefix echo "1..${count_total}" | tap_output diff --git a/tools/testing/selftests/mm/soft-dirty.c b/tools/testing/selftests/mm/soft-dirty.c index bcfcac99b436..fb1864a68e1c 100644 --- a/tools/testing/selftests/mm/soft-dirty.c +++ b/tools/testing/selftests/mm/soft-dirty.c @@ -9,7 +9,7 @@ #include "kselftest.h" #include "vm_util.h" -#include "thp_settings.h" +#include "hugepage_settings.h" #define PAGEMAP_FILE_PATH "/proc/self/pagemap" #define TEST_ITERATIONS 10000 @@ -143,7 +143,7 @@ static void test_mprotect(int pagemap_fd, int pagesize, bool 
anon) if (anon) { map = mmap(NULL, pagesize, PROT_READ|PROT_WRITE, MAP_ANONYMOUS|MAP_PRIVATE, -1, 0); - if (!map) + if (map == MAP_FAILED) ksft_exit_fail_msg("anon mmap failed\n"); } else { test_fd = open(fname, O_RDWR | O_CREAT, 0664); @@ -155,7 +155,7 @@ static void test_mprotect(int pagemap_fd, int pagesize, bool anon) ftruncate(test_fd, pagesize); map = mmap(NULL, pagesize, PROT_READ|PROT_WRITE, MAP_SHARED, test_fd, 0); - if (!map) + if (map == MAP_FAILED) ksft_exit_fail_msg("file mmap failed\n"); } diff --git a/tools/testing/selftests/mm/split_huge_page_test.c b/tools/testing/selftests/mm/split_huge_page_test.c index 500d07c4938b..32b991472f74 100644 --- a/tools/testing/selftests/mm/split_huge_page_test.c +++ b/tools/testing/selftests/mm/split_huge_page_test.c @@ -21,7 +21,7 @@ #include <time.h> #include "vm_util.h" #include "kselftest.h" -#include "thp_settings.h" +#include "hugepage_settings.h" uint64_t pagesize; unsigned int pageshift; @@ -470,13 +470,18 @@ static void split_file_backed_thp(int order) char tmpfs_template[] = "/tmp/thp_split_XXXXXX"; const char *tmpfs_loc = mkdtemp(tmpfs_template); char testfile[INPUT_MAX]; + unsigned long size = 2 * pmd_pagesize; + char opts[64]; ssize_t num_written, num_read; - char *file_buf1, *file_buf2; + char *file_buf1 = NULL, *file_buf2 = NULL; uint64_t pgoff_start = 0, pgoff_end = 1024; int i; ksft_print_msg("Please enable pr_debug in split_huge_pages_in_file() for more info.\n"); + if (!tmpfs_loc) + ksft_exit_fail_msg("mkdtemp failed\n"); + file_buf1 = (char *)malloc(pmd_pagesize); file_buf2 = (char *)malloc(pmd_pagesize); @@ -489,10 +494,13 @@ static void split_file_backed_thp(int order) file_buf1[i] = (char)i; memset(file_buf2, 0, pmd_pagesize); - status = mount("tmpfs", tmpfs_loc, "tmpfs", 0, "huge=always,size=4m"); + snprintf(opts, sizeof(opts), "huge=always,size=%lu", size); + status = mount("tmpfs", tmpfs_loc, "tmpfs", 0, opts); - if (status) - ksft_exit_fail_msg("Unable to create a tmpfs for testing\n"); + 
if (status) { + ksft_print_msg("Unable to create a tmpfs for testing\n"); + goto out; + } status = snprintf(testfile, INPUT_MAX, "%s/thp_file", tmpfs_loc); if (status >= INPUT_MAX) { @@ -544,10 +552,13 @@ static void split_file_backed_thp(int order) status = umount(tmpfs_loc); if (status) { - rmdir(tmpfs_loc); - ksft_exit_fail_msg("Unable to umount %s\n", tmpfs_loc); + ksft_print_msg("Unable to umount %s\n", tmpfs_loc); + goto out; } + free(file_buf1); + free(file_buf2); + status = rmdir(tmpfs_loc); if (status) ksft_exit_fail_msg("cannot remove tmp dir: %s\n", strerror(errno)); @@ -560,8 +571,10 @@ close_file: close(fd); cleanup: umount(tmpfs_loc); - rmdir(tmpfs_loc); out: + free(file_buf1); + free(file_buf2); + rmdir(tmpfs_loc); ksft_exit_fail_msg("Error occurred\n"); } @@ -609,9 +622,13 @@ static int create_pagecache_thp_and_fd(const char *testfile, size_t fd_size, assert(fd_size % sizeof(buf) == 0); for (i = 0; i < sizeof(buf); i++) buf[i] = (unsigned char)i; - for (i = 0; i < fd_size; i += sizeof(buf)) - write(*fd, buf, sizeof(buf)); - + for (i = 0; i < fd_size; i += sizeof(buf)) { + if (write(*fd, buf, sizeof(buf)) != sizeof(buf)) { + ksft_perror("write testfile"); + close(*fd); + goto err_out_unlink; + } + } close(*fd); sync(); *fd = open("/proc/sys/vm/drop_caches", O_WRONLY); @@ -621,7 +638,7 @@ static int create_pagecache_thp_and_fd(const char *testfile, size_t fd_size, } if (write(*fd, "3", 1) != 1) { ksft_perror("write to drop_caches"); - goto err_out_unlink; + goto err_out_close; } close(*fd); diff --git a/tools/testing/selftests/mm/thuge-gen.c b/tools/testing/selftests/mm/thuge-gen.c index 77813d34dcc2..22b9c2f1c35d 100644 --- a/tools/testing/selftests/mm/thuge-gen.c +++ b/tools/testing/selftests/mm/thuge-gen.c @@ -1,17 +1,5 @@ // SPDX-License-Identifier: GPL-2.0 -/* Test selecting other page sizes for mmap/shmget. - - Before running this huge pages for each huge page size must have been - reserved. 
- For large pages beyond MAX_PAGE_ORDER (like 1GB on x86) boot options must - be used. 1GB wouldn't be tested if it isn't available. - Also shmmax must be increased. - And you need to run as root to work around some weird permissions in shm. - And nothing using huge pages should run in parallel. - When the program aborts you may need to clean up the shm segments with - ipcrm -m by hand, like this - sudo ipcs | awk '$1 == "0x00000000" {print $2}' | xargs -n1 sudo ipcrm -m - (warning this will remove all if someone else uses them) */ +/* Test selecting other page sizes for mmap/shmget. */ #define _GNU_SOURCE #include <sys/mman.h> @@ -21,13 +9,12 @@ #include <sys/ipc.h> #include <sys/shm.h> #include <sys/stat.h> -#include <glob.h> -#include <assert.h> #include <unistd.h> #include <stdarg.h> #include <string.h> #include "vm_util.h" #include "kselftest.h" +#include "hugepage_settings.h" #if !defined(MAP_HUGETLB) #define MAP_HUGETLB 0x40000 @@ -37,15 +24,6 @@ #ifndef SHM_HUGE_SHIFT #define SHM_HUGE_SHIFT 26 #endif -#ifndef SHM_HUGE_MASK -#define SHM_HUGE_MASK 0x3f -#endif -#ifndef SHM_HUGE_2MB -#define SHM_HUGE_2MB (21 << SHM_HUGE_SHIFT) -#endif -#ifndef SHM_HUGE_1GB -#define SHM_HUGE_1GB (30 << SHM_HUGE_SHIFT) -#endif #define NUM_PAGESIZES 5 #define NUM_PAGES 4 @@ -63,32 +41,10 @@ int ilog2(unsigned long v) void show(unsigned long ps) { - char buf[100]; - if (ps == getpagesize()) return; - ksft_print_msg("%luMB: ", ps >> 20); - - fflush(stdout); - snprintf(buf, sizeof buf, - "cat /sys/kernel/mm/hugepages/hugepages-%lukB/free_hugepages", - ps >> 10); - system(buf); -} - -unsigned long read_free(unsigned long ps) -{ - unsigned long val = 0; - char buf[100]; - - snprintf(buf, sizeof(buf), - "/sys/kernel/mm/hugepages/hugepages-%lukB/free_hugepages", - ps >> 10); - if (read_sysfs(buf, &val) && ps != getpagesize()) - ksft_print_msg("missing %s\n", buf); - - return val; + ksft_print_msg("%luMB: %lu\n", ps >> 20, hugetlb_free_pages(ps)); } void test_mmap(unsigned long size, 
unsigned flags) @@ -96,14 +52,14 @@ void test_mmap(unsigned long size, unsigned flags) char *map; unsigned long before, after; - before = read_free(size); + before = hugetlb_free_pages(size); map = mmap(NULL, size*NUM_PAGES, PROT_READ|PROT_WRITE, MAP_PRIVATE|MAP_ANONYMOUS|MAP_HUGETLB|flags, -1, 0); if (map == MAP_FAILED) ksft_exit_fail_msg("mmap: %s\n", strerror(errno)); memset(map, 0xff, size*NUM_PAGES); - after = read_free(size); + after = hugetlb_free_pages(size); show(size); ksft_test_result(size == getpagesize() || (before - after) == NUM_PAGES, @@ -120,7 +76,7 @@ void test_shmget(unsigned long size, unsigned flags) struct shm_info i; char *map; - before = read_free(size); + before = hugetlb_free_pages(size); id = shmget(IPC_PRIVATE, size * NUM_PAGES, IPC_CREAT|0600|flags); if (id < 0) { if (errno == EPERM) { @@ -141,7 +97,7 @@ void test_shmget(unsigned long size, unsigned flags) shmctl(id, IPC_RMID, NULL); memset(map, 0xff, size*NUM_PAGES); - after = read_free(size); + after = hugetlb_free_pages(size); show(size); ksft_test_result(size == getpagesize() || (before - after) == NUM_PAGES, @@ -153,43 +109,15 @@ void test_shmget(unsigned long size, unsigned flags) void find_pagesizes(void) { unsigned long largest = getpagesize(); - unsigned long shmmax_val = 0; int i; - glob_t g; - glob("/sys/kernel/mm/hugepages/hugepages-*kB", 0, NULL, &g); - assert(g.gl_pathc <= NUM_PAGESIZES); - for (i = 0; (i < g.gl_pathc) && (num_page_sizes < NUM_PAGESIZES); i++) { - sscanf(g.gl_pathv[i], "/sys/kernel/mm/hugepages/hugepages-%lukB", - &page_sizes[num_page_sizes]); - page_sizes[num_page_sizes] <<= 10; - ksft_print_msg("Found %luMB\n", page_sizes[i] >> 20); + num_page_sizes = hugetlb_setup(NUM_PAGES, page_sizes, ARRAY_SIZE(page_sizes)); - if (page_sizes[num_page_sizes] > largest) + for (i = 0; i < num_page_sizes; i++) + if (page_sizes[i] > largest) largest = page_sizes[i]; - if (read_free(page_sizes[num_page_sizes]) >= NUM_PAGES) - num_page_sizes++; - else - ksft_print_msg("SKIP 
for size %lu MB as not enough huge pages, need %u\n", - page_sizes[num_page_sizes] >> 20, NUM_PAGES); - } - globfree(&g); - - read_sysfs("/proc/sys/kernel/shmmax", &shmmax_val); - if (shmmax_val < NUM_PAGES * largest) { - ksft_print_msg("WARNING: shmmax is too small to run this test.\n"); - ksft_print_msg("Please run the following command to increase shmmax:\n"); - ksft_print_msg("echo %lu > /proc/sys/kernel/shmmax\n", largest * NUM_PAGES); - ksft_exit_skip("Test skipped due to insufficient shmmax value.\n"); - } - -#if defined(__x86_64__) - if (largest != 1U<<30) { - ksft_exit_skip("No GB pages available on x86-64\n" - "Please boot with hugepagesz=1G hugepages=%d\n", NUM_PAGES); - } -#endif + shm_limits_prepare(NUM_PAGES * largest); } int main(void) @@ -232,3 +160,5 @@ int main(void) ksft_finished(); } + +SHM_LIMITS_RESTORE() diff --git a/tools/testing/selftests/mm/transhuge-stress.c b/tools/testing/selftests/mm/transhuge-stress.c index 7a9f1035099b..8eb0c5630e7e 100644 --- a/tools/testing/selftests/mm/transhuge-stress.c +++ b/tools/testing/selftests/mm/transhuge-stress.c @@ -17,7 +17,7 @@ #include <sys/mman.h> #include "vm_util.h" #include "kselftest.h" -#include "thp_settings.h" +#include "hugepage_settings.h" int backing_fd = -1; int mmap_flags = MAP_ANONYMOUS | MAP_NORESERVE | MAP_PRIVATE; diff --git a/tools/testing/selftests/mm/uffd-common.h b/tools/testing/selftests/mm/uffd-common.h index 844a85ab31eb..92a21b97f745 100644 --- a/tools/testing/selftests/mm/uffd-common.h +++ b/tools/testing/selftests/mm/uffd-common.h @@ -37,24 +37,24 @@ #include "kselftest.h" #include "vm_util.h" +#include "hugepage_settings.h" #define UFFD_FLAGS (O_CLOEXEC | O_NONBLOCK | UFFD_USER_MODE_ONLY) -#define _err(fmt, ...) \ - do { \ - int ret = errno; \ - fprintf(stderr, "ERROR: " fmt, ##__VA_ARGS__); \ - fprintf(stderr, " (errno=%d, @%s:%d)\n", \ - ret, __FILE__, __LINE__); \ +#define _err(fmt, ...) 
\ + do { \ + int ret = errno; \ + ksft_print_msg("ERROR: " fmt " (errno=%d, @%s:%d)\n", \ + ##__VA_ARGS__, ret, __FILE__, __LINE__); \ } while (0) -#define errexit(exitcode, fmt, ...) \ +#define errexit(fmt, ...) \ do { \ _err(fmt, ##__VA_ARGS__); \ - exit(exitcode); \ + ksft_exit_fail(); \ } while (0) -#define err(fmt, ...) errexit(1, fmt, ##__VA_ARGS__) +#define err(fmt, ...) errexit(fmt, ##__VA_ARGS__) struct uffd_global_test_opts { unsigned long nr_parallel, nr_pages, nr_pages_per_cpu, page_size; diff --git a/tools/testing/selftests/mm/uffd-stress.c b/tools/testing/selftests/mm/uffd-stress.c index 700fbaa18d44..3401dd6028f0 100644 --- a/tools/testing/selftests/mm/uffd-stress.c +++ b/tools/testing/selftests/mm/uffd-stress.c @@ -286,18 +286,12 @@ static int userfaultfd_stress(uffd_global_test_opts_t *gopts) pthread_attr_setstacksize(&attr, 16*1024*1024); while (bounces--) { - printf("bounces: %d, mode:", bounces); - if (bounces & BOUNCE_RANDOM) - printf(" rnd"); - if (bounces & BOUNCE_RACINGFAULTS) - printf(" racing"); - if (bounces & BOUNCE_VERIFY) - printf(" ver"); - if (bounces & BOUNCE_POLL) - printf(" poll"); - else - printf(" read"); - printf(", "); + ksft_print_msg("bounces: %d, mode:%s%s%s%s, ", + bounces, + bounces & BOUNCE_RANDOM ? " rnd" : "", + bounces & BOUNCE_RACINGFAULTS ? " racing" : "", + bounces & BOUNCE_VERIFY ? " ver" : "", + bounces & BOUNCE_POLL ? " poll" : " read"); fflush(stdout); if (bounces & BOUNCE_POLL) @@ -461,6 +455,9 @@ int main(int argc, char **argv) if (argc < 4) usage(); + ksft_print_header(); + ksft_set_plan(1); + if (signal(SIGALRM, sigalrm) == SIG_ERR) err("failed to arm SIGALRM"); alarm(ALARM_INTERVAL_SECS); @@ -483,17 +480,17 @@ int main(int argc, char **argv) * Ensure nr_parallel - 1 hugepages on top of that to account * for racy extra reservation of hugepages. 
*/ - if (gopts->test_type == TEST_HUGETLB && - get_free_hugepages() < 2 * (bytes / gopts->page_size) + gopts->nr_parallel - 1) { - printf("skip: Skipping userfaultfd... not enough hugepages\n"); - return KSFT_SKIP; + if (gopts->test_type == TEST_HUGETLB) { + unsigned long nr = 2 * (bytes / gopts->page_size) + gopts->nr_parallel - 1; + + if (!hugetlb_setup_default(nr)) + ksft_exit_skip("Skipping userfaultfd... not enough hugepages\n"); } gopts->nr_pages_per_cpu = bytes / gopts->page_size / gopts->nr_parallel; if (!gopts->nr_pages_per_cpu) { - _err("pages_per_cpu = 0, cannot test (%lu / %lu / %lu)", - bytes, gopts->page_size, gopts->nr_parallel); - usage(); + ksft_exit_skip("pages_per_cpu = 0, cannot test (%zu / %lu / %lu)\n", + bytes, gopts->page_size, gopts->nr_parallel); } bounces = atoi(argv[3]); @@ -503,9 +500,12 @@ int main(int argc, char **argv) } gopts->nr_pages = gopts->nr_pages_per_cpu * gopts->nr_parallel; - printf("nr_pages: %lu, nr_pages_per_cpu: %lu\n", - gopts->nr_pages, gopts->nr_pages_per_cpu); - return userfaultfd_stress(gopts); + ksft_print_msg("nr_pages: %lu, nr_pages_per_cpu: %lu\n", + gopts->nr_pages, gopts->nr_pages_per_cpu); + + ksft_test_result(!userfaultfd_stress(gopts), + "uffd-stress %s\n", argv[1]); + ksft_finished(); } #else /* __NR_userfaultfd */ @@ -514,8 +514,8 @@ int main(int argc, char **argv) int main(void) { - printf("skip: Skipping userfaultfd test (missing __NR_userfaultfd)\n"); - return KSFT_SKIP; + ksft_print_header(); + ksft_exit_skip("missing __NR_userfaultfd definition\n"); } #endif /* __NR_userfaultfd */ diff --git a/tools/testing/selftests/mm/uffd-unit-tests.c b/tools/testing/selftests/mm/uffd-unit-tests.c index 6f5e404a446c..a6c14109e818 100644 --- a/tools/testing/selftests/mm/uffd-unit-tests.c +++ b/tools/testing/selftests/mm/uffd-unit-tests.c @@ -86,47 +86,28 @@ typedef struct { uffd_test_case_ops_t *test_case_ops; } uffd_test_case_t; -static void uffd_test_report(void) -{ - printf("Userfaults unit tests: pass=%u, 
skip=%u, fail=%u (total=%u)\n", - ksft_get_pass_cnt(), - ksft_get_xskip_cnt(), - ksft_get_fail_cnt(), - ksft_test_num()); -} +static char current_test[256]; static void uffd_test_pass(void) { - printf("done\n"); - ksft_inc_pass_cnt(); + ksft_test_result_pass("%s\n", current_test); } #define uffd_test_start(...) do { \ - printf("Testing "); \ - printf(__VA_ARGS__); \ - printf("... "); \ - fflush(stdout); \ + snprintf(current_test, sizeof(current_test), __VA_ARGS__); \ } while (0) -#define uffd_test_fail(...) do { \ - printf("failed [reason: "); \ - printf(__VA_ARGS__); \ - printf("]\n"); \ - ksft_inc_fail_cnt(); \ +#define uffd_test_fail(fmt, ...) do { \ + ksft_print_msg("failed reason: [" fmt "]\n", ##__VA_ARGS__); \ + ksft_test_result_fail("%s\n", current_test); \ } while (0) static void uffd_test_skip(const char *message) { - printf("skipped [reason: %s]\n", message); - ksft_inc_xskip_cnt(); + ksft_test_result_skip("%s (%s)\n", current_test, message); } -/* - * Returns 1 if specific userfaultfd supported, 0 otherwise. Note, we'll - * return 1 even if some test failed as long as uffd supported, because in - * that case we still want to proceed with the rest uffd unit tests. 
- */ -static int test_uffd_api(bool use_dev) +static void test_uffd_api(bool use_dev) { struct uffdio_api uffdio_api; int uffd; @@ -140,7 +121,7 @@ static int test_uffd_api(bool use_dev) uffd = uffd_open_sys(UFFD_FLAGS); if (uffd < 0) { uffd_test_skip("cannot open userfaultfd handle"); - return 0; + return; } /* Test wrong UFFD_API */ @@ -177,8 +158,6 @@ static int test_uffd_api(bool use_dev) uffd_test_pass(); out: close(uffd); - /* We have a valid uffd handle */ - return 1; } @@ -320,7 +299,7 @@ static int pagemap_test_fork(uffd_global_test_opts_t *gopts, bool with_event, bo if (test_pin) unpin_pages(&args); /* Succeed */ - exit(0); + _exit(0); } waitpid(child, &result, 0); @@ -788,7 +767,7 @@ static void uffd_sigbus_test_common(uffd_global_test_opts_t *gopts, bool wp) err("fork"); if (!pid) - exit(faulting_process(gopts, 2, wp)); + _exit(faulting_process(gopts, 2, wp)); waitpid(pid, &err, 0); if (err) @@ -842,7 +821,7 @@ static void uffd_events_test_common(uffd_global_test_opts_t *gopts, bool wp) err("fork"); if (!pid) - exit(faulting_process(gopts, 0, wp)); + _exit(faulting_process(gopts, 0, wp)); waitpid(pid, &err, 0); if (err) @@ -1701,18 +1680,58 @@ static void usage(const char *prog) exit(KSFT_FAIL); } +static int uffd_count_tests(int n_tests, int n_mems, const char *test_filter) +{ + uffd_test_case_t *test; + int i, j, count = 0; + + if (!test_filter) + count += 2; /* test_uffd_api(false) + test_uffd_api(true) */ + + for (i = 0; i < n_tests; i++) { + test = &uffd_tests[i]; + if (test_filter && !strstr(test->name, test_filter)) + continue; + for (j = 0; j < n_mems; j++) + if (test->mem_targets & mem_types[j].mem_flag) + count++; + } + + return count; +} + +static unsigned long uffd_setup_hugetlb(void) +{ + unsigned long nr_hugepages, hp_size; + + hugetlb_save_settings(); + hp_size = default_huge_page_size(); + + if (!hp_size) + return 0; + + /* need twice UFFD_TEST_MEM_SIZE, one for src area and one for dst */ + nr_hugepages = 2 * MAX(UFFD_TEST_MEM_SIZE, 
hp_size * 2) / hp_size; + hugetlb_set_nr_default_pages(nr_hugepages); + + if (hugetlb_free_default_pages() < nr_hugepages) + return 0; + + return hp_size; +} + int main(int argc, char *argv[]) { int n_tests = sizeof(uffd_tests) / sizeof(uffd_test_case_t); int n_mems = sizeof(mem_types) / sizeof(mem_type_t); const char *test_filter = NULL; + unsigned long hugepage_size; bool list_only = false; uffd_test_case_t *test; mem_type_t *mem_type; uffd_test_args_t args; const char *errmsg; - int has_uffd, opt; - int i, j; + int i, j, opt; while ((opt = getopt(argc, argv, "f:hl")) != -1) { switch (opt) { @@ -1730,24 +1749,30 @@ int main(int argc, char *argv[]) } } - if (!test_filter && !list_only) { - has_uffd = test_uffd_api(false); - has_uffd |= test_uffd_api(true); - - if (!has_uffd) { - printf("Userfaultfd not supported or unprivileged, skip all tests\n"); - exit(KSFT_SKIP); + if (list_only) { + for (i = 0; i < n_tests; i++) { + test = &uffd_tests[i]; + if (test_filter && !strstr(test->name, test_filter)) + continue; + printf("%s\n", test->name); } + return KSFT_PASS; + } + + hugepage_size = uffd_setup_hugetlb(); + + ksft_print_header(); + ksft_set_plan(uffd_count_tests(n_tests, n_mems, test_filter)); + + if (!test_filter) { + test_uffd_api(false); + test_uffd_api(true); } for (i = 0; i < n_tests; i++) { test = &uffd_tests[i]; if (test_filter && !strstr(test->name, test_filter)) continue; - if (list_only) { - printf("%s\n", test->name); - continue; - } for (j = 0; j < n_mems; j++) { mem_type = &mem_types[j]; @@ -1758,10 +1783,14 @@ int main(int argc, char *argv[]) uffd_test_ops = mem_type->mem_ops; uffd_test_case_ops = test->test_case_ops; + if (!(test->mem_targets & mem_type->mem_flag)) + continue; + + uffd_test_start("%s on %s", test->name, mem_type->name); if (mem_type->mem_flag & (MEM_HUGETLB_PRIVATE | MEM_HUGETLB)) { - gopts.page_size = default_huge_page_size(); + gopts.page_size = hugepage_size; if (gopts.page_size == 0) { - uffd_test_skip("huge page size is 0, 
feature missing?"); + uffd_test_skip("not enough HugeTLB pages"); continue; } } else { @@ -1777,10 +1806,6 @@ int main(int argc, char *argv[]) /* Initialize test arguments */ args.mem_type = mem_type; - if (!(test->mem_targets & mem_type->mem_flag)) - continue; - - uffd_test_start("%s on %s", test->name, mem_type->name); if (!uffd_feature_supported(test)) { uffd_test_skip("feature missing"); continue; @@ -1794,10 +1819,7 @@ int main(int argc, char *argv[]) } } - if (!list_only) - uffd_test_report(); - - return ksft_get_fail_cnt() ? KSFT_FAIL : KSFT_PASS; + ksft_finished(); } #else /* __NR_userfaultfd */ @@ -1806,8 +1828,8 @@ int main(int argc, char *argv[]) int main(void) { - printf("Skipping %s (missing __NR_userfaultfd)\n", __file__); - return KSFT_SKIP; + ksft_print_header(); + ksft_exit_skip("missing __NR_userfaultfd definition\n"); } #endif /* __NR_userfaultfd */ diff --git a/tools/testing/selftests/mm/uffd-wp-mremap.c b/tools/testing/selftests/mm/uffd-wp-mremap.c index 17186d4a4147..c973d6722720 100644 --- a/tools/testing/selftests/mm/uffd-wp-mremap.c +++ b/tools/testing/selftests/mm/uffd-wp-mremap.c @@ -8,16 +8,27 @@ #include <linux/mman.h> #include <sys/mman.h> #include "kselftest.h" -#include "thp_settings.h" +#include "hugepage_settings.h" #include "uffd-common.h" static int pagemap_fd; -static size_t pagesize; static int nr_pagesizes = 1; +static unsigned long pagesize; static int nr_thpsizes; static size_t thpsizes[20]; static int nr_hugetlbsizes; -static size_t hugetlbsizes[10]; +static unsigned long hugetlbsizes[10]; + +static void check_uffd_wp_feature_supported(void) +{ + uint64_t features = 0; + + if (uffd_get_features(&features)) + ksft_exit_skip("failed to get available features (%d)\n", errno); + + if (!(features & UFFD_FEATURE_PAGEFAULT_FLAG_WP)) + ksft_exit_skip("uffd-wp feature not supported\n"); +} static int detect_thp_sizes(size_t sizes[], int max) { @@ -245,7 +256,7 @@ out: } struct testcase { - size_t *sizes; + unsigned long *sizes; int 
*nr_sizes; bool private; bool swapout; @@ -336,14 +347,16 @@ int main(int argc, char **argv) struct thp_settings settings; int i, j, plan = 0; + hugepage_save_settings(true, true); + + check_uffd_wp_feature_supported(); + pagesize = getpagesize(); nr_thpsizes = detect_thp_sizes(thpsizes, ARRAY_SIZE(thpsizes)); - nr_hugetlbsizes = detect_hugetlb_page_sizes(hugetlbsizes, - ARRAY_SIZE(hugetlbsizes)); + nr_hugetlbsizes = hugetlb_setup(1, hugetlbsizes, ARRAY_SIZE(hugetlbsizes)); - /* If THP is supported, save THP settings and initially disable THP. */ + /* If THP is supported, initially disable THP. */ if (nr_thpsizes) { - thp_save_settings(); thp_read_settings(&settings); for (i = 0; i < NR_ORDERS; i++) { settings.hugepages[i].enabled = THP_NEVER; @@ -368,10 +381,6 @@ int main(int argc, char **argv) tc->swapout, tc->hugetlb); } - /* If THP is supported, restore original THP settings. */ - if (nr_thpsizes) - thp_restore_settings(); - i = ksft_get_fail_cnt(); if (i) ksft_exit_fail_msg("%d out of %d tests failed\n", diff --git a/tools/testing/selftests/mm/va_high_addr_switch.c b/tools/testing/selftests/mm/va_high_addr_switch.c index 51401e081b20..e24d7ba00b44 100644 --- a/tools/testing/selftests/mm/va_high_addr_switch.c +++ b/tools/testing/selftests/mm/va_high_addr_switch.c @@ -11,6 +11,7 @@ #include "vm_util.h" #include "kselftest.h" +#include "hugepage_settings.h" /* * The hint addr value is used to allocate addresses @@ -257,40 +258,35 @@ void testcases_init(void) switch_hint = addr_switch_hint; } -static int run_test(struct testcase *test, int count) +static void run_test(struct testcase *test, int count) { void *p; - int i, ret = KSFT_PASS; + int i; for (i = 0; i < count; i++) { struct testcase *t = test + i; p = mmap(t->addr, t->size, PROT_READ | PROT_WRITE, t->flags, -1, 0); - - printf("%s: %p - ", t->msg, p); - if (p == MAP_FAILED) { - printf("FAILED\n"); - ret = KSFT_FAIL; + ksft_perror("MAP_FAILED"); + ksft_test_result_fail("%s\n", t->msg); continue; } if 
(t->low_addr_required && p >= (void *)(switch_hint)) { - printf("FAILED\n"); - ret = KSFT_FAIL; + ksft_print_msg("%p not below switch hint\n", p); + ksft_test_result_fail("%s\n", t->msg); } else { /* * Do a dereference of the address returned so that we catch * bugs in page fault handling */ memset(p, 0, t->size); - printf("OK\n"); + ksft_test_result_pass("%s\n", t->msg); } if (!t->keep_mapped) munmap(p, t->size); } - - return ret; } #ifdef __aarch64__ @@ -322,19 +318,23 @@ static int supported_arch(void) int main(int argc, char **argv) { - int ret, hugetlb_ret = KSFT_PASS; + bool run_hugetlb = false; + + ksft_print_header(); if (!supported_arch()) - return KSFT_SKIP; + ksft_exit_skip("Architecture not supported\n"); + + if (hugetlb_setup_default(6)) + run_hugetlb = true; testcases_init(); - ret = run_test(testcases, sz_testcases); - if (argc == 2 && !strcmp(argv[1], "--run-hugetlb")) - hugetlb_ret = run_test(hugetlb_testcases, sz_hugetlb_testcases); + ksft_set_plan(sz_testcases + (run_hugetlb ? sz_hugetlb_testcases : 0)); + + run_test(testcases, sz_testcases); + if (run_hugetlb) + run_test(hugetlb_testcases, sz_hugetlb_testcases); - if (ret == KSFT_PASS && hugetlb_ret == KSFT_PASS) - return KSFT_PASS; - else - return KSFT_FAIL; + ksft_finished(); } diff --git a/tools/testing/selftests/mm/va_high_addr_switch.sh b/tools/testing/selftests/mm/va_high_addr_switch.sh index 9492c2d72634..01c15fe3c799 100755 --- a/tools/testing/selftests/mm/va_high_addr_switch.sh +++ b/tools/testing/selftests/mm/va_high_addr_switch.sh @@ -9,7 +9,6 @@ # Kselftest framework requirement - SKIP code is 4. 
ksft_skip=4 -orig_nr_hugepages=0 skip() { @@ -77,43 +76,5 @@ check_test_requirements() esac } -save_nr_hugepages() -{ - orig_nr_hugepages=$(cat /proc/sys/vm/nr_hugepages) -} - -restore_nr_hugepages() -{ - echo "$orig_nr_hugepages" > /proc/sys/vm/nr_hugepages -} - -setup_nr_hugepages() -{ - local needpgs=$1 - while read -r name size unit; do - if [ "$name" = "HugePages_Free:" ]; then - freepgs="$size" - break - fi - done < /proc/meminfo - if [ "$freepgs" -ge "$needpgs" ]; then - return - fi - local hpgs=$((orig_nr_hugepages + needpgs)) - echo $hpgs > /proc/sys/vm/nr_hugepages - - local nr_hugepgs=$(cat /proc/sys/vm/nr_hugepages) - if [ "$nr_hugepgs" != "$hpgs" ]; then - restore_nr_hugepages - skip "$0: no enough hugepages for testing" - fi -} - check_test_requirements -save_nr_hugepages -# The HugeTLB tests require 6 pages -setup_nr_hugepages 6 -./va_high_addr_switch --run-hugetlb -retcode=$? -restore_nr_hugepages -exit $retcode +./va_high_addr_switch diff --git a/tools/testing/selftests/mm/vm_util.c b/tools/testing/selftests/mm/vm_util.c index db94564f4431..311fc5b4513e 100644 --- a/tools/testing/selftests/mm/vm_util.c +++ b/tools/testing/selftests/mm/vm_util.c @@ -2,7 +2,6 @@ #include <string.h> #include <errno.h> #include <fcntl.h> -#include <dirent.h> #include <inttypes.h> #include <sys/ioctl.h> #include <linux/userfaultfd.h> @@ -291,53 +290,6 @@ int64_t allocate_transhuge(void *ptr, int pagemap_fd) return -1; } -unsigned long default_huge_page_size(void) -{ - unsigned long hps = 0; - char *line = NULL; - size_t linelen = 0; - FILE *f = fopen("/proc/meminfo", "r"); - - if (!f) - return 0; - while (getline(&line, &linelen, f) > 0) { - if (sscanf(line, "Hugepagesize: %lu kB", &hps) == 1) { - hps <<= 10; - break; - } - } - - free(line); - fclose(f); - return hps; -} - -int detect_hugetlb_page_sizes(size_t sizes[], int max) -{ - DIR *dir = opendir("/sys/kernel/mm/hugepages/"); - int count = 0; - - if (!dir) - return 0; - - while (count < max) { - struct dirent 
*entry = readdir(dir); - size_t kb; - - if (!entry) - break; - if (entry->d_type != DT_DIR) - continue; - if (sscanf(entry->d_name, "hugepages-%zukB", &kb) != 1) - continue; - sizes[count++] = kb * 1024; - ksft_print_msg("[INFO] detected hugetlb page size: %zu KiB\n", - kb); - } - closedir(dir); - return count; -} - int pageflags_get(unsigned long pfn, int kpageflags_fd, uint64_t *flags) { size_t count; @@ -396,25 +348,6 @@ int uffd_unregister(int uffd, void *addr, uint64_t len) return ret; } -unsigned long get_free_hugepages(void) -{ - unsigned long fhp = 0; - char *line = NULL; - size_t linelen = 0; - FILE *f = fopen("/proc/meminfo", "r"); - - if (!f) - return fhp; - while (getline(&line, &linelen, f) > 0) { - if (sscanf(line, "HugePages_Free: %lu", &fhp) == 1) - break; - } - - free(line); - fclose(f); - return fhp; -} - static bool check_vmflag(void *addr, const char *flag) { char buffer[MAX_LINE_LENGTH]; @@ -463,7 +396,7 @@ bool softdirty_supported(void) /* New mappings are expected to be marked with VM_SOFTDIRTY (sd). */ addr = mmap(0, pagesize, PROT_READ | PROT_WRITE, MAP_ANONYMOUS | MAP_PRIVATE, 0, 0); - if (!addr) + if (addr == MAP_FAILED) ksft_exit_fail_msg("mmap failed\n"); supported = check_vmflag(addr, "sd"); @@ -765,6 +698,27 @@ int unpoison_memory(unsigned long pfn) return ret > 0 ? 
0 : -errno; } +int read_file(const char *path, char *buf, size_t buflen) +{ + int fd; + ssize_t numread; + + fd = open(path, O_RDONLY); + if (fd == -1) + return 0; + + numread = read(fd, buf, buflen - 1); + if (numread < 1) { + close(fd); + return 0; + } + + buf[numread] = '\0'; + close(fd); + + return (unsigned int) numread; +} + void write_file(const char *path, const char *buf, size_t buflen) { int fd, saved_errno; @@ -788,3 +742,49 @@ void write_file(const char *path, const char *buf, size_t buflen) ksft_exit_fail_msg("%s write(%.*s) is truncated, expected %zu bytes, got %zd bytes\n", path, (int)(buflen - 1), buf, buflen - 1, numwritten); } + +unsigned long read_num(const char *path) +{ + char buf[21]; + + if (read_file(path, buf, sizeof(buf)) < 0) + ksft_exit_fail_perror("read_file()"); + + return strtoul(buf, NULL, 10); +} + +void write_num(const char *path, unsigned long num) +{ + char buf[21]; + + sprintf(buf, "%lu", num); + write_file(path, buf, strlen(buf) + 1); +} + +static unsigned long shmall, shmmax; + +void __shm_limits_restore(void) +{ + if (shmmax) + write_num("/proc/sys/kernel/shmmax", shmmax); + if (shmall) + write_num("/proc/sys/kernel/shmall", shmall); +} + +void shm_limits_prepare(unsigned long length) +{ + unsigned long nr = length / psize(); + unsigned long val; + + val = read_num("/proc/sys/kernel/shmmax"); + if (val < length) { + write_num("/proc/sys/kernel/shmmax", length); + shmmax = val; + } + + val = read_num("/proc/sys/kernel/shmall"); + if (val < nr) { + write_num("/proc/sys/kernel/shmall", nr); + shmall = val; + } +} diff --git a/tools/testing/selftests/mm/vm_util.h b/tools/testing/selftests/mm/vm_util.h index 1a07305ceff4..ea8fc8fdf0eb 100644 --- a/tools/testing/selftests/mm/vm_util.h +++ b/tools/testing/selftests/mm/vm_util.h @@ -94,8 +94,6 @@ bool check_huge_anon(void *addr, int nr_hpages, uint64_t hpage_size); bool check_huge_file(void *addr, int nr_hpages, uint64_t hpage_size); bool check_huge_shmem(void *addr, int nr_hpages, 
uint64_t hpage_size); int64_t allocate_transhuge(void *ptr, int pagemap_fd); -unsigned long default_huge_page_size(void); -int detect_hugetlb_page_sizes(size_t sizes[], int max); int pageflags_get(unsigned long pfn, int kpageflags_fd, uint64_t *flags); int uffd_register(int uffd, void *addr, uint64_t len, @@ -103,7 +101,6 @@ int uffd_register(int uffd, void *addr, uint64_t len, int uffd_unregister(int uffd, void *addr, uint64_t len); int uffd_register_with_ioctls(int uffd, void *addr, uint64_t len, bool miss, bool wp, bool minor, uint64_t *ioctls); -unsigned long get_free_hugepages(void); bool check_vmflag_io(void *addr); bool check_vmflag_pfnmap(void *addr); bool check_vmflag_guard(void *addr); @@ -168,3 +165,15 @@ int unpoison_memory(unsigned long pfn); #define PAGEMAP_PFN(ent) ((ent) & ((1ull << 55) - 1)) void write_file(const char *path, const char *buf, size_t buflen); +int read_file(const char *path, char *buf, size_t buflen); +unsigned long read_num(const char *path); +void write_num(const char *path, unsigned long num); + +void shm_limits_prepare(unsigned long length); +void __shm_limits_restore(void); + +#define SHM_LIMITS_RESTORE() \ +static void __attribute__((destructor)) shm_limits_restore(void) \ +{ \ + __shm_limits_restore(); \ +} diff --git a/tools/testing/selftests/namespaces/listns_efault_test.c b/tools/testing/selftests/namespaces/listns_efault_test.c index b570746e917c..26b452c98c66 100644 --- a/tools/testing/selftests/namespaces/listns_efault_test.c +++ b/tools/testing/selftests/namespaces/listns_efault_test.c @@ -38,7 +38,7 @@ TEST(listns_partial_fault_with_ns_cleanup) __u64 *ns_ids; ssize_t ret; long page_size; - pid_t pid, iter_pid; + pid_t pid, iter_pid, ns_pids[5]; int pidfds[5]; int sv[5][2]; int iter_pidfd; @@ -114,6 +114,7 @@ TEST(listns_partial_fault_with_ns_cleanup) pid = create_child(&pidfds[i], CLONE_NEWNS); ASSERT_NE(pid, -1); + ns_pids[i] = pid; if (pid == 0) { close(sv[i][0]); /* Close parent end */ @@ -164,7 +165,7 @@ 
TEST(listns_partial_fault_with_ns_cleanup) /* Wait for all mount namespace children to exit and cleanup */ for (i = 0; i < 5; i++) { - waitpid(-1, NULL, 0); + waitpid(ns_pids[i], NULL, 0); close(sv[i][0]); close(pidfds[i]); } @@ -175,6 +176,12 @@ TEST(listns_partial_fault_with_ns_cleanup) ASSERT_EQ(ret, iter_pid); close(iter_pidfd); + /* If listns() is not supported the iterator exits cleanly via ENOSYS */ + if (WIFEXITED(status) && WEXITSTATUS(status) == PIDFD_SKIP) { + munmap(map, page_size); + SKIP(return, "listns() not supported"); + } + /* Should have been killed */ ASSERT_TRUE(WIFSIGNALED(status)); ASSERT_EQ(WTERMSIG(status), SIGKILL); @@ -250,7 +257,7 @@ TEST(listns_late_fault_with_ns_cleanup) __u64 *ns_ids; ssize_t ret; long page_size; - pid_t pid, iter_pid; + pid_t pid, iter_pid, ns_pids[10]; int pidfds[10]; int sv[10][2]; int iter_pidfd; @@ -320,6 +327,7 @@ TEST(listns_late_fault_with_ns_cleanup) pid = create_child(&pidfds[i], CLONE_NEWNS); ASSERT_NE(pid, -1); + ns_pids[i] = pid; if (pid == 0) { close(sv[i][0]); /* Close parent end */ @@ -373,7 +381,7 @@ TEST(listns_late_fault_with_ns_cleanup) /* Wait for all children and cleanup */ for (i = 0; i < 10; i++) { - waitpid(-1, NULL, 0); + waitpid(ns_pids[i], NULL, 0); close(sv[i][0]); close(pidfds[i]); } @@ -384,6 +392,12 @@ TEST(listns_late_fault_with_ns_cleanup) ASSERT_EQ(ret, iter_pid); close(iter_pidfd); + /* If listns() is not supported the iterator exits cleanly via ENOSYS */ + if (WIFEXITED(status) && WEXITSTATUS(status) == PIDFD_SKIP) { + munmap(map, page_size); + SKIP(return, "listns() not supported"); + } + /* Should have been killed */ ASSERT_TRUE(WIFSIGNALED(status)); ASSERT_EQ(WTERMSIG(status), SIGKILL); @@ -402,7 +416,7 @@ TEST(listns_mnt_ns_cleanup_on_fault) __u64 *ns_ids; ssize_t ret; long page_size; - pid_t pid, iter_pid; + pid_t pid, iter_pid, ns_pids[8]; int pidfds[8]; int sv[8][2]; int iter_pidfd; @@ -462,6 +476,7 @@ TEST(listns_mnt_ns_cleanup_on_fault) pid = create_child(&pidfds[i], 
CLONE_NEWNS); ASSERT_NE(pid, -1); + ns_pids[i] = pid; if (pid == 0) { close(sv[i][0]); /* Close parent end */ @@ -508,7 +523,7 @@ TEST(listns_mnt_ns_cleanup_on_fault) /* Wait for children and cleanup */ for (i = 0; i < 8; i++) { - waitpid(-1, NULL, 0); + waitpid(ns_pids[i], NULL, 0); close(sv[i][0]); close(pidfds[i]); } @@ -519,6 +534,12 @@ TEST(listns_mnt_ns_cleanup_on_fault) ASSERT_EQ(ret, iter_pid); close(iter_pidfd); + /* If listns() is not supported the iterator exits cleanly via ENOSYS */ + if (WIFEXITED(status) && WEXITSTATUS(status) == PIDFD_SKIP) { + munmap(map, page_size); + SKIP(return, "listns() not supported"); + } + /* Should have been killed */ ASSERT_TRUE(WIFSIGNALED(status)); ASSERT_EQ(WTERMSIG(status), SIGKILL); diff --git a/tools/testing/selftests/namespaces/nsid_test.c b/tools/testing/selftests/namespaces/nsid_test.c index b4a14c6693a5..46dc838cba82 100644 --- a/tools/testing/selftests/namespaces/nsid_test.c +++ b/tools/testing/selftests/namespaces/nsid_test.c @@ -25,14 +25,24 @@ /* Fixture for tests that create child processes */ FIXTURE(nsid) { pid_t child_pid; + pid_t grandchild_pid; }; FIXTURE_SETUP(nsid) { self->child_pid = 0; + self->grandchild_pid = 0; } FIXTURE_TEARDOWN(nsid) { - /* Clean up any child process that may still be running */ + /* + * Kill grandchild first: timens_separate and pidns_separate fork a + * grandchild that calls pause(). It is reparented to init on child + * exit and keeps the test runner's tap pipe open, hanging the runner. 
+ */ + if (self->grandchild_pid > 0) { + kill(self->grandchild_pid, SIGKILL); + waitpid(self->grandchild_pid, NULL, 0); + } if (self->child_pid > 0) { kill(self->child_pid, SIGKILL); waitpid(self->child_pid, NULL, 0); @@ -676,6 +686,7 @@ TEST_F(nsid, timens_separate) pid_t grandchild_pid; ASSERT_EQ(read(pipefd[0], &grandchild_pid, sizeof(grandchild_pid)), sizeof(grandchild_pid)); + self->grandchild_pid = grandchild_pid; close(pipefd[0]); /* Open grandchild's time namespace */ @@ -797,6 +808,7 @@ TEST_F(nsid, pidns_separate) pid_t grandchild_pid; ASSERT_EQ(read(pipefd[0], &grandchild_pid, sizeof(grandchild_pid)), sizeof(grandchild_pid)); + self->grandchild_pid = grandchild_pid; close(pipefd[0]); /* Open grandchild's PID namespace */ diff --git a/tools/testing/selftests/net/.gitignore b/tools/testing/selftests/net/.gitignore index 97ad4d551d44..c9f46031ac73 100644 --- a/tools/testing/selftests/net/.gitignore +++ b/tools/testing/selftests/net/.gitignore @@ -6,6 +6,7 @@ busy_poller cmsg_sender epoll_busy_poll fin_ack_lat +getsockopt_iter hwtstamp_config icmp_rfc4884 io_uring_zerocopy_tx @@ -40,7 +41,6 @@ skf_net_off socket so_incoming_cpu so_netns_cookie -so_txtime so_rcv_listener stress_reuseport_listen tap diff --git a/tools/testing/selftests/net/Makefile b/tools/testing/selftests/net/Makefile index a275ed584026..708d960ae07d 100644 --- a/tools/testing/selftests/net/Makefile +++ b/tools/testing/selftests/net/Makefile @@ -26,6 +26,7 @@ TEST_PROGS := \ cmsg_time.sh \ double_udp_encap.sh \ drop_monitor_tests.sh \ + ecmp_rehash.sh \ fcnal-ipv4.sh \ fcnal-ipv6.sh \ fcnal-other.sh \ @@ -69,6 +70,7 @@ TEST_PROGS := \ nl_netdev.py \ nl_nlctrl.py \ pmtu.sh \ + protodown.sh \ psock_snd.sh \ reuseaddr_ports_exhausted.sh \ reuseport_addr_any.sh \ @@ -83,7 +85,6 @@ TEST_PROGS := \ rxtimestamp.sh \ sctp_vrf.sh \ skf_net_off.sh \ - so_txtime.sh \ srv6_end_dt46_l3vpn_test.sh \ srv6_end_dt4_l3vpn_test.sh \ srv6_end_dt6_l3vpn_test.sh \ @@ -96,6 +97,7 @@ TEST_PROGS := \ 
srv6_hl2encap_red_l2vpn_test.sh \ srv6_iptunnel_cache.sh \ stress_reuseport_listen.sh \ + tcp_ecmp_failover.sh \ tcp_fastopen_backup_key.sh \ test_bpf.sh \ test_bridge_backup_port.sh \ @@ -108,6 +110,7 @@ TEST_PROGS := \ test_vxlan_nh.sh \ test_vxlan_nolocalbypass.sh \ test_vxlan_under_vrf.sh \ + test_vxlan_vnifilter_notify.sh \ test_vxlan_vnifiltering.sh \ tfo_passive.sh \ traceroute.sh \ @@ -157,7 +160,6 @@ TEST_GEN_FILES := \ skf_net_off \ so_netns_cookie \ so_rcv_listener \ - so_txtime \ socket \ stress_reuseport_listen \ tcp_fastopen_backup_key \ @@ -176,6 +178,7 @@ TEST_GEN_PROGS := \ bind_timewait \ bind_wildcard \ epoll_busy_poll \ + getsockopt_iter \ icmp_rfc4884 \ ipv6_fragmentation \ proc_net_pktgen \ diff --git a/tools/testing/selftests/net/af_unix/scm_inq.c b/tools/testing/selftests/net/af_unix/scm_inq.c index 3a86be9bda17..6268b5bf50be 100644 --- a/tools/testing/selftests/net/af_unix/scm_inq.c +++ b/tools/testing/selftests/net/af_unix/scm_inq.c @@ -8,8 +8,9 @@ #include "kselftest_harness.h" -#define NR_CHUNKS 100 -#define MSG_LEN 256 +#define NR_CHUNKS 100 +#define MSG_LEN 256 +#define NR_PARTIAL_READS 3 FIXTURE(scm_inq) { @@ -120,4 +121,53 @@ TEST_F(scm_inq, basic) recv_chunks(_metadata, self); } +TEST_F(scm_inq, partial_read) +{ + char buf[MSG_LEN * NR_PARTIAL_READS] = {}; + char cmsg_buf[CMSG_SPACE(sizeof(int))]; + struct msghdr msg = {}; + struct iovec iov = {}; + struct cmsghdr *cmsg; + int err, inq, ret, i; + int remain; + + err = setsockopt(self->fd[1], SOL_SOCKET, SO_INQ, &(int){1}, sizeof(int)); + if (variant->type != SOCK_STREAM) { + ASSERT_EQ(-ENOPROTOOPT, -errno); + return; + } + ASSERT_EQ(0, err); + + ret = send(self->fd[0], buf, sizeof(buf), 0); + ASSERT_EQ(sizeof(buf), ret); + + msg.msg_iov = &iov; + msg.msg_iovlen = 1; + msg.msg_control = cmsg_buf; + msg.msg_controllen = sizeof(cmsg_buf); + + iov.iov_base = buf; + iov.iov_len = MSG_LEN; + + for (i = 0; i < NR_PARTIAL_READS; i++) { + remain = MSG_LEN * (NR_PARTIAL_READS - 1 - i); + + 
memset(buf, 0, MSG_LEN); + memset(cmsg_buf, 0, sizeof(cmsg_buf)); + ret = recvmsg(self->fd[1], &msg, 0); + ASSERT_EQ(MSG_LEN, ret); + + cmsg = CMSG_FIRSTHDR(&msg); + ASSERT_NE(NULL, cmsg); + ASSERT_EQ(CMSG_LEN(sizeof(int)), cmsg->cmsg_len); + ASSERT_EQ(SOL_SOCKET, cmsg->cmsg_level); + ASSERT_EQ(SCM_INQ, cmsg->cmsg_type); + ASSERT_EQ(remain, *(int *)CMSG_DATA(cmsg)); + + ret = ioctl(self->fd[1], SIOCINQ, &inq); + ASSERT_EQ(0, ret); + ASSERT_EQ(remain, inq); + } +} + TEST_HARNESS_MAIN diff --git a/tools/testing/selftests/net/bind_bhash.c b/tools/testing/selftests/net/bind_bhash.c index da04b0b19b73..2bd100777448 100644 --- a/tools/testing/selftests/net/bind_bhash.c +++ b/tools/testing/selftests/net/bind_bhash.c @@ -52,18 +52,19 @@ static int bind_socket(int opt, const char *addr) err = setsockopt(sock_fd, SOL_SOCKET, opt, &reuse, sizeof(reuse)); if (err) { perror("setsockopt failed"); - goto cleanup; + goto err_free_info; } } err = bind(sock_fd, res->ai_addr, res->ai_addrlen); if (err) { perror("failed to bind to port"); - goto cleanup; + goto err_free_info; } - + freeaddrinfo(res); return sock_fd; - +err_free_info: + freeaddrinfo(res); cleanup: close(sock_fd); return err; diff --git a/tools/testing/selftests/net/broadcast_ether_dst.sh b/tools/testing/selftests/net/broadcast_ether_dst.sh index 334a7eca8a80..cc571f607429 100755 --- a/tools/testing/selftests/net/broadcast_ether_dst.sh +++ b/tools/testing/selftests/net/broadcast_ether_dst.sh @@ -44,7 +44,7 @@ test_broadcast_ether_dst() { # tcpdump will exit after receiving a single packet # timeout will kill tcpdump if it is still running after 2s timeout 2s ip netns exec "${CLIENT_NS}" \ - tcpdump -i link0 -c 1 -w "${CAPFILE}" icmp &> "${OUTPUT}" & + tcpdump -i link0 -c 1 -w "${CAPFILE}" -Z root icmp &> "${OUTPUT}" & pid=$! 
slowwait 1 grep -qs "listening" "${OUTPUT}" diff --git a/tools/testing/selftests/net/config b/tools/testing/selftests/net/config index 94d722770420..e1ce35c2abbe 100644 --- a/tools/testing/selftests/net/config +++ b/tools/testing/selftests/net/config @@ -117,11 +117,14 @@ CONFIG_OPENVSWITCH=m CONFIG_OPENVSWITCH_GENEVE=m CONFIG_OPENVSWITCH_GRE=m CONFIG_OPENVSWITCH_VXLAN=m +CONFIG_PAGE_POOL_STATS=y CONFIG_PROC_SYSCTL=y CONFIG_PSAMPLE=m CONFIG_RPS=y +CONFIG_SYN_COOKIES=y CONFIG_SYSFS=y CONFIG_TAP=m +CONFIG_TCP_CONG_DCTCP=y CONFIG_TCP_MD5SIG=y CONFIG_TEST_BLACKHOLE_DEV=m CONFIG_TEST_BPF=m diff --git a/tools/testing/selftests/net/ecmp_rehash.sh b/tools/testing/selftests/net/ecmp_rehash.sh new file mode 100755 index 000000000000..f05a6c8edd2a --- /dev/null +++ b/tools/testing/selftests/net/ecmp_rehash.sh @@ -0,0 +1,1109 @@ +#!/bin/bash +# SPDX-License-Identifier: GPL-2.0 +# +# Test local ECMP path re-selection on TCP retransmission timeout and PLB. +# +# Two namespaces connected by two parallel veth pairs with a 2-way ECMP +# route. When a TCP path is blocked (via tc drop) or congested (via +# netem ECN marking), the kernel rehashes the connection via +# sk_rethink_txhash() + __sk_dst_reset(), causing the next route lookup +# to select the other ECMP path. +# +# Expected runtime: ~60 seconds. Most time is spent waiting for TCP +# retransmission timeouts (1-7s per test) and running multi-round +# consistency checks (10 rounds each). The large slowwait/connect-timeout +# values (30-120s) are worst-case bounds for CI; a correctly functioning +# kernel reaches each check well before the timeout expires. + +source lib.sh + +SUBNETS=(a b) +PORT=9900 +: "${ECMP_REBUILD_ROUNDS:=10}" + +# alloc_ports NAME [COUNT]: set NAME to the next free port and reserve +# COUNT ports (default 1) from a shared counter. Each test allocates its +# own port(s) where it runs, so a retry or a newly added test never +# collides; the per-round tests reserve ECMP_REBUILD_ROUNDS each. 
+NEXT_PORT=$PORT +alloc_ports() +{ + printf -v "$1" '%d' "$NEXT_PORT" + NEXT_PORT=$((NEXT_PORT + ${2:-1})) +} + +ALL_TESTS=" + test_ecmp_syn_rehash + test_ecmp_synack_rehash + test_ecmp_midstream_rehash + test_ecmp_midstream_ack_rehash + test_ecmp_plb_rehash + test_ecmp_hash_policy1_no_rehash + test_ecmp_no_flowlabel_leak + test_ecmp_dst_rebuild_consistency + test_ecmp_syncookie_path_consistency +" + +link_tx_packets_get() +{ + local ns=$1; shift + local dev=$1; shift + + ip netns exec "$ns" cat "/sys/class/net/$dev/statistics/tx_packets" +} + +# Return the number of packets matched by the tc filter action on a device. +# When tc drops packets via "action drop", the device's tx_packets is not +# incremented (packet never reaches veth_xmit), but the tc action maintains +# its own counter. +tc_filter_pkt_count() +{ + local ns=$1; shift + local dev=$1; shift + + ip netns exec "$ns" tc -s filter show dev "$dev" parent 1: 2>/dev/null | + awk '/Sent .* pkt/ { + for (i=1; i<=NF; i++) + if ($i == "pkt") { print $(i-1); exit } + }' +} + +# Read a TcpExt counter from /proc/net/netstat in a namespace. +# Returns 0 if the counter is not found. +get_netstat_counter() +{ + local ns=$1; shift + local field=$1; shift + local val + + # shellcheck disable=SC2016 + val=$(ip netns exec "$ns" awk -v key="$field" ' + /^TcpExt:/ { + if (!h) { split($0, n); h=1 } + else { + split($0, v) + for (i in n) + if (n[i] == key) print v[i] + } + } + ' /proc/net/netstat) + echo "${val:-0}" +} + +# Apply netem ECN marking: CE-mark all ECT packets instead of dropping them. +mark_ecn() +{ + local ns=$1; shift + local dev=$1; shift + + ip netns exec "$ns" tc qdisc add dev "$dev" root netem loss 100% ecn +} + +# Block TCP (IPv6 next-header = 6) egress, allowing ICMPv6 through. 
+block_tcp() +{ + local ns=$1; shift + local dev=$1; shift + + ip netns exec "$ns" tc qdisc add dev "$dev" root handle 1: prio + ip netns exec "$ns" tc filter add dev "$dev" parent 1: \ + protocol ipv6 prio 1 u32 match u8 0x06 0xff at 6 action drop +} + +unblock_tcp() +{ + local ns=$1; shift + local dev=$1; shift + + ip netns exec "$ns" tc qdisc del dev "$dev" root 2>/dev/null +} + +# Return success when a device's TX counter exceeds a baseline value. +dev_tx_packets_above() +{ + local ns=$1; shift + local dev=$1; shift + local baseline=$1; shift + + local cur + cur=$(link_tx_packets_get "$ns" "$dev") + [ "$cur" -gt "$baseline" ] +} + +# Return success when both devices have dropped at least one TCP packet. +both_devs_attempted() +{ + local ns=$1; shift + local dev0=$1; shift + local dev1=$1; shift + + local c0 c1 + c0=$(tc_filter_pkt_count "$ns" "$dev0") + c1=$(tc_filter_pkt_count "$ns" "$dev1") + [ "${c0:-0}" -ge 1 ] && [ "${c1:-0}" -ge 1 ] +} + +link_tx_packets_total() +{ + local ns=$1; shift + local dev0=${1:-veth0a}; shift 2>/dev/null + local dev1=${1:-veth1a} + + echo $(( $(link_tx_packets_get "$ns" "$dev0") + + $(link_tx_packets_get "$ns" "$dev1") )) +} + +# (Re)install the ECMP multipath routes between NS1 and NS2. $1 is the +# ip route operation ("add" to create, "change" to replace). If $2 is +# given it names a congestion control to pin on both routes via "congctl"; +# because dctcp carries TCP_CONG_NEEDS_ECN, this also tags the route with +# DST_FEATURE_ECN_CA, which makes the server negotiate ECN without the +# listener itself having to run dctcp. The nexthop topology lives here +# only, so a test can re-pin the routes and restore them with one call. 
+install_ecmp_routes() +{ + local op=$1 cc=$2 + local -a cc_attr=() + + [ -n "$cc" ] && cc_attr=(congctl "$cc") + + ip -n "$NS1" -6 route "$op" fd00:ff::2/128 "${cc_attr[@]}" \ + nexthop via fd00:a::2 dev veth0a \ + nexthop via fd00:b::2 dev veth1a + + ip -n "$NS2" -6 route "$op" fd00:ff::1/128 "${cc_attr[@]}" \ + nexthop via fd00:a::1 dev veth0b \ + nexthop via fd00:b::1 dev veth1b +} + +setup() +{ + setup_ns NS1 NS2 + + local ns + for ns in "$NS1" "$NS2"; do + ip netns exec "$ns" sysctl -qw net.ipv6.conf.all.accept_dad=0 + ip netns exec "$ns" sysctl -qw net.ipv6.conf.default.accept_dad=0 + ip netns exec "$ns" sysctl -qw net.ipv6.conf.all.forwarding=1 + ip netns exec "$ns" sysctl -qw net.core.txrehash=1 + done + + local i sub + for i in 0 1; do + sub=${SUBNETS[$i]} + ip link add "veth${i}a" type veth peer name "veth${i}b" + ip link set "veth${i}a" netns "$NS1" + ip link set "veth${i}b" netns "$NS2" + ip -n "$NS1" addr add "fd00:${sub}::1/64" dev "veth${i}a" + ip -n "$NS2" addr add "fd00:${sub}::2/64" dev "veth${i}b" + ip -n "$NS1" link set "veth${i}a" up + ip -n "$NS2" link set "veth${i}b" up + done + + ip -n "$NS1" addr add fd00:ff::1/128 dev lo + ip -n "$NS2" addr add fd00:ff::2/128 dev lo + + # Allow many SYN retries at 1-second intervals (linear, no + # exponential backoff) so the rehash test has enough attempts + # to exercise both ECMP paths. + if ! ip netns exec "$NS1" sysctl -qw \ + net.ipv4.tcp_syn_linear_timeouts=25; then + echo "SKIP: tcp_syn_linear_timeouts not supported" + return "$ksft_skip" + fi + ip netns exec "$NS1" sysctl -qw net.ipv4.tcp_syn_retries=25 + + # Keep the server's request socket alive during the blocking + # period so SYN/ACK retransmits continue. 
+ ip netns exec "$NS2" sysctl -qw net.ipv4.tcp_synack_retries=25 + + install_ecmp_routes add + + for i in 0 1; do + sub=${SUBNETS[$i]} + ip netns exec "$NS1" \ + ping -6 -c1 -W5 "fd00:${sub}::2" &>/dev/null + ip netns exec "$NS2" \ + ping -6 -c1 -W5 "fd00:${sub}::1" &>/dev/null + done + + if ! ip netns exec "$NS1" ping -6 -c1 -W5 fd00:ff::2 &>/dev/null; then + echo "Basic connectivity check failed" + return "$ksft_skip" + fi +} + +# Block ALL paths, start a connection, wait until SYNs have been dropped +# on both interfaces (proving rehash steered the SYN to a new path), then +# unblock so the connection completes. +test_ecmp_syn_rehash() +{ + RET=0 + local port + alloc_ports port + + block_tcp "$NS1" veth0a + defer unblock_tcp "$NS1" veth0a + block_tcp "$NS1" veth1a + defer unblock_tcp "$NS1" veth1a + + ip netns exec "$NS2" socat \ + "TCP6-LISTEN:$port,bind=[fd00:ff::2],reuseaddr,fork" \ + EXEC:"echo ESTABLISH_OK" & + defer kill_process $! + + wait_local_port_listen "$NS2" "$port" tcp + + local rehash_before + rehash_before=$(get_netstat_counter "$NS1" TcpTimeoutRehash) + + # Start the connection in the background; it will retry SYNs at + # 1-second intervals until an unblocked path is found. + # Use -u (unidirectional) to only receive from the server; + # sending data back would risk SIGPIPE if the server's EXEC + # child has already exited. + local tmpfile + tmpfile=$(mktemp) + defer rm -f "$tmpfile" + + ip netns exec "$NS1" socat -u \ + "TCP6:[fd00:ff::2]:$port,bind=[fd00:ff::1],connect-timeout=60" \ + STDOUT >"$tmpfile" 2>&1 & + local client_pid=$! + defer kill_process "$client_pid" + + # Wait until both paths have seen at least one dropped SYN. + # This proves sk_rethink_txhash() rehashed the connection from + # one ECMP path to the other. + slowwait 30 both_devs_attempted "$NS1" veth0a veth1a > /dev/null + check_err $? 
"SYNs did not appear on both paths (rehash not working)" + if [ "$RET" -ne 0 ]; then + log_test "Local ECMP SYN rehash: establish with blocked paths" + return + fi + + # Unblock both paths and let the next SYN retransmit succeed. + unblock_tcp "$NS1" veth0a + unblock_tcp "$NS1" veth1a + + local rc=0 + wait "$client_pid" || rc=$? + + local result + result=$(cat "$tmpfile" 2>/dev/null) + + if [[ "$result" != *"ESTABLISH_OK"* ]]; then + check_err 1 "connection failed after unblocking (rc=$rc): $result" + fi + + local rehash_after + rehash_after=$(get_netstat_counter "$NS1" TcpTimeoutRehash) + if [ "$rehash_after" -le "$rehash_before" ]; then + check_err 1 "TcpTimeoutRehash counter did not increment" + fi + + log_test "Local ECMP SYN rehash: establish with blocked paths" +} + +# Block the server's return paths so SYN/ACKs are dropped. The client +# retransmits SYNs at 1-second intervals; each duplicate SYN arriving at +# the server triggers tcp_rtx_synack() which re-rolls txhash, so the +# retransmitted SYN/ACK selects a different ECMP return path. +test_ecmp_synack_rehash() +{ + RET=0 + local port + alloc_ports port + + block_tcp "$NS2" veth0b + defer unblock_tcp "$NS2" veth0b + block_tcp "$NS2" veth1b + defer unblock_tcp "$NS2" veth1b + + ip netns exec "$NS2" socat \ + "TCP6-LISTEN:$port,bind=[fd00:ff::2],reuseaddr,fork" \ + EXEC:"echo SYNACK_OK" & + defer kill_process $! + + wait_local_port_listen "$NS2" "$port" tcp + + # Start the connection; SYNs reach the server (client egress is + # open) but SYN/ACKs are dropped on the server's return path. + local tmpfile + tmpfile=$(mktemp) + defer rm -f "$tmpfile" + + ip netns exec "$NS1" socat -u \ + "TCP6:[fd00:ff::2]:$port,bind=[fd00:ff::1],connect-timeout=60" \ + STDOUT >"$tmpfile" 2>&1 & + local client_pid=$! + defer kill_process "$client_pid" + + # Wait until both server-side interfaces have dropped at least + # one SYN/ACK, proving the server rehashed its return path. 
+ slowwait 30 both_devs_attempted "$NS2" veth0b veth1b > /dev/null + check_err $? "SYN/ACKs did not appear on both return paths" + if [ "$RET" -ne 0 ]; then + log_test "Local ECMP SYN/ACK rehash: blocked return path" + return + fi + + # Unblock and let the connection complete. + unblock_tcp "$NS2" veth0b + unblock_tcp "$NS2" veth1b + + local rc=0 + wait "$client_pid" || rc=$? + + local result + result=$(cat "$tmpfile" 2>/dev/null) + + if [[ "$result" != *"SYNACK_OK"* ]]; then + check_err 1 "connection failed after unblocking (rc=$rc): $result" + fi + + log_test "Local ECMP SYN/ACK rehash: blocked return path" +} + +# Establish a data transfer with both paths open, then block the +# active path. Verify that data appears on the previously inactive +# path (proving RTO triggered a rehash) and that TcpTimeoutRehash +# incremented. +# +# With 2-way ECMP each rehash may pick the same path, so a single +# attempt can occasionally fail. Retry once for robustness. + +# Single attempt at the midstream rehash check. Returns 0 on success. +ecmp_midstream_rehash_attempt() +{ + local port=$1; shift + local reason="" + + ip netns exec "$NS2" socat -u \ + "TCP6-LISTEN:$port,bind=[fd00:ff::2],reuseaddr" - >/dev/null & + local server_pid=$! + + wait_local_port_listen "$NS2" "$port" tcp + + local base_tx0 base_tx1 + base_tx0=$(link_tx_packets_get "$NS1" veth0a) + base_tx1=$(link_tx_packets_get "$NS1" veth1a) + + # Continuous data source; timeout caps overall test duration and + # must exceed the slowwait below so data keeps flowing. + ip netns exec "$NS1" timeout 90 socat -u \ + OPEN:/dev/zero \ + "TCP6:[fd00:ff::2]:$port,bind=[fd00:ff::1]" &>/dev/null & + local client_pid=$! + + # Wait for enough packets to identify the active path. + if ! 
busywait "$BUSYWAIT_TIMEOUT" until_counter_is \ + ">= $((base_tx0 + base_tx1 + 10))" \ + link_tx_packets_total "$NS1" > /dev/null; then + kill "$client_pid" "$server_pid" 2>/dev/null + wait "$client_pid" "$server_pid" 2>/dev/null + echo "no TX activity" + return 1 + fi + + # Find the active path and block it. + local current_tx0 current_tx1 active_idx inactive_idx + current_tx0=$(link_tx_packets_get "$NS1" veth0a) + current_tx1=$(link_tx_packets_get "$NS1" veth1a) + if [ $((current_tx0 - base_tx0)) -ge $((current_tx1 - base_tx1)) ]; then + active_idx=0; inactive_idx=1 + else + active_idx=1; inactive_idx=0 + fi + + local rehash_before + rehash_before=$(get_netstat_counter "$NS1" TcpTimeoutRehash) + # Suppress __dst_negative_advice() in tcp_write_timeout() so + # that __sk_dst_reset() is the only dst-invalidation mechanism + # on the RTO path. + local saved_retries1 + saved_retries1=$(ip netns exec "$NS1" sysctl -n net.ipv4.tcp_retries1) + ip netns exec "$NS1" sysctl -qw net.ipv4.tcp_retries1=255 + + block_tcp "$NS1" "veth${active_idx}a" + + # Capture baseline after block_tcp returns. block_tcp adds a + # prio qdisc then a tc filter; between those two steps the + # qdisc's CAN_BYPASS fast-path lets packets through unfiltered. + local inactive_before + inactive_before=$(link_tx_packets_get "$NS1" "veth${inactive_idx}a") + + # Wait for meaningful data on the previously inactive path, + # proving RTO triggered a rehash and data actually moved. + if ! 
slowwait 60 dev_tx_packets_above \ + "$NS1" "veth${inactive_idx}a" "$((inactive_before + 100))" \ + > /dev/null; then + reason="no data on alternate path" + fi + + local rehash_after + rehash_after=$(get_netstat_counter "$NS1" TcpTimeoutRehash) + if [ "$rehash_after" -le "$rehash_before" ]; then + reason="${reason:+$reason; }TcpTimeoutRehash did not increment" + fi + + unblock_tcp "$NS1" "veth${active_idx}a" + ip netns exec "$NS1" sysctl -qw \ + net.ipv4.tcp_retries1="$saved_retries1" + kill "$client_pid" "$server_pid" 2>/dev/null + wait "$client_pid" "$server_pid" 2>/dev/null + if [ -n "$reason" ]; then + echo "$reason" + return 1 + fi + return 0 +} + +test_ecmp_midstream_rehash() +{ + RET=0 + local port retry_port + alloc_ports port + alloc_ports retry_port + + local fail_reason + if ! ecmp_midstream_rehash_attempt "$port" >/dev/null; then + fail_reason=$(ecmp_midstream_rehash_attempt "$retry_port") + check_err $? "$fail_reason" + fi + + log_test "Local ECMP midstream rehash: block active path" +} + +# Single attempt at the ACK rehash check. Returns 0 on success. +ecmp_ack_rehash_attempt() +{ + local port=$1; shift + local reason="" + + ip netns exec "$NS2" socat -u \ + "TCP6-LISTEN:$port,bind=[fd00:ff::2],reuseaddr" - >/dev/null & + local server_pid=$! + + wait_local_port_listen "$NS2" "$port" tcp + + local base_tx0 base_tx1 + base_tx0=$(link_tx_packets_get "$NS2" veth0b) + base_tx1=$(link_tx_packets_get "$NS2" veth1b) + + # Continuous data source from NS1 to NS2. Cap the send buffer + # so in-flight data stays below the receiver's advertised window. + # Without this, the sender can exhaust the receiver's window and + # enter persist mode (zero-window probing) instead of RTO when + # ACKs are blocked, and persist probes do not trigger flowlabel + # rehash. + ip netns exec "$NS1" timeout 120 socat -u \ + OPEN:/dev/zero \ + "TCP6:[fd00:ff::2]:$port,bind=[fd00:ff::1],sndbuf=16384" \ + &>/dev/null & + local client_pid=$! 
+ + # Wait for enough server TX (ACKs) to identify the active return path. + if ! busywait "$BUSYWAIT_TIMEOUT" until_counter_is \ + ">= $((base_tx0 + base_tx1 + 10))" \ + link_tx_packets_total "$NS2" veth0b veth1b > /dev/null; then + kill "$client_pid" "$server_pid" 2>/dev/null + wait "$client_pid" "$server_pid" 2>/dev/null + echo "no server TX activity" + return 1 + fi + + local cur_tx0 cur_tx1 active_dev inactive_dev + cur_tx0=$(link_tx_packets_get "$NS2" veth0b) + cur_tx1=$(link_tx_packets_get "$NS2" veth1b) + if [ $((cur_tx0 - base_tx0)) -ge $((cur_tx1 - base_tx1)) ]; then + active_dev=veth0b; inactive_dev=veth1b + else + active_dev=veth1b; inactive_dev=veth0b + fi + + local rehash_before + rehash_before=$(get_netstat_counter "$NS2" TcpDuplicateDataRehash) + + # Block the inactive return path first (no effect on current + # ACK flow), then block the active path. This avoids counting + # normal ACK drops as rehash evidence. + block_tcp "$NS2" "$inactive_dev" + local inactive_before + inactive_before=$(tc_filter_pkt_count "$NS2" "$inactive_dev") + block_tcp "$NS2" "$active_dev" + + # NS1 will RTO (no ACKs), retransmit with new flowlabel. + # NS2 detects the flowlabel change via tcp_rcv_spurious_retrans(), + # rehashes, and NS2's ACKs try the previously inactive return + # path. One successful rehash is sufficient. + if ! 
slowwait 60 until_counter_is \ + ">= $((${inactive_before:-0} + 1))" \ + tc_filter_pkt_count "$NS2" "$inactive_dev" > /dev/null; then + reason="no ACKs on alternate return path after blocking" + fi + + local rehash_after + rehash_after=$(get_netstat_counter "$NS2" TcpDuplicateDataRehash) + if [ "$rehash_after" -le "$rehash_before" ]; then + reason="${reason:+$reason; }TcpDuplicateDataRehash did not increment" + fi + + unblock_tcp "$NS2" "$active_dev" + unblock_tcp "$NS2" "$inactive_dev" + kill "$client_pid" "$server_pid" 2>/dev/null + wait "$client_pid" "$server_pid" 2>/dev/null + if [ -n "$reason" ]; then + echo "$reason" + return 1 + fi + return 0 +} + +# Block the receiver's (NS2) ACK return paths while data flows from +# NS1 to NS2. The sender (NS1) times out and retransmits with a new +# flowlabel; the receiver detects the changed flowlabel via +# tcp_rcv_spurious_retrans() and rehashes its own txhash so that its +# ACKs try a different ECMP return path. +# +# With 2-way ECMP each rehash may pick the same path, so a single +# attempt can occasionally fail. Retry once for robustness. +test_ecmp_midstream_ack_rehash() +{ + RET=0 + local port retry_port + alloc_ports port + alloc_ports retry_port + + local fail_reason + if ! ecmp_ack_rehash_attempt "$port" >/dev/null; then + fail_reason=$(ecmp_ack_rehash_attempt "$retry_port") + check_err $? "$fail_reason" + fi + + log_test "Local ECMP midstream ACK rehash: blocked return path" +} + +# Establish a DCTCP data transfer with PLB enabled, then ECN-mark both +# paths. Sustained CE marking triggers PLB to call sk_rethink_txhash() +# + __sk_dst_reset(), bouncing the connection between ECMP paths. +# Verify data appears on both paths and that TCPPLBRehash incremented. +test_ecmp_plb_rehash() +{ + RET=0 + local port + alloc_ports port + + # PLB needs DCTCP, a restricted congestion control. 
Adding it to + # the host-global tcp_allowed_congestion_control would relax the + # restricted-CC policy for the whole host (there is no per-netns + # allowed set). Instead pin dctcp on the test routes with + # "congctl": the route's RTAX_CC_ALGO is honoured on both the + # connect and accept paths without the restricted-CC check, and a + # dctcp route also carries DST_FEATURE_ECN_CA so the server + # negotiates ECN -- all confined to the test namespaces. + local available + available=$(ip netns exec "$NS1" sysctl -n \ + net.ipv4.tcp_available_congestion_control) + if ! echo "$available" | grep -qw dctcp; then + log_test_skip "Local ECMP PLB rehash: DCTCP not available" + return "$ksft_skip" + fi + install_ecmp_routes change dctcp + defer install_ecmp_routes change + + # Save NS1 sysctls before modifying them. + local saved_ecn1 saved_plb_enabled saved_plb_rounds + local saved_plb_thresh saved_plb_suspend + saved_ecn1=$(ip netns exec "$NS1" sysctl -n net.ipv4.tcp_ecn) + saved_plb_enabled=$(ip netns exec "$NS1" sysctl -n net.ipv4.tcp_plb_enabled) + saved_plb_rounds=$(ip netns exec "$NS1" sysctl -n net.ipv4.tcp_plb_rehash_rounds) + saved_plb_thresh=$(ip netns exec "$NS1" sysctl -n net.ipv4.tcp_plb_cong_thresh) + saved_plb_suspend=$(ip netns exec "$NS1" sysctl -n net.ipv4.tcp_plb_suspend_rto_sec) + + # Enable ECN and PLB on the sender; dctcp comes from the route. 
+ ip netns exec "$NS1" sysctl -qw net.ipv4.tcp_ecn=1 + ip netns exec "$NS1" sysctl -qw net.ipv4.tcp_plb_enabled=1 + ip netns exec "$NS1" sysctl -qw net.ipv4.tcp_plb_rehash_rounds=3 + ip netns exec "$NS1" sysctl -qw net.ipv4.tcp_plb_cong_thresh=1 + ip netns exec "$NS1" sysctl -qw net.ipv4.tcp_plb_suspend_rto_sec=0 + defer ip netns exec "$NS1" sysctl -qw net.ipv4.tcp_ecn="$saved_ecn1" + defer ip netns exec "$NS1" sysctl -qw net.ipv4.tcp_plb_enabled="$saved_plb_enabled" + defer ip netns exec "$NS1" sysctl -qw net.ipv4.tcp_plb_rehash_rounds="$saved_plb_rounds" + defer ip netns exec "$NS1" sysctl -qw net.ipv4.tcp_plb_cong_thresh="$saved_plb_thresh" + defer ip netns exec "$NS1" sysctl -qw net.ipv4.tcp_plb_suspend_rto_sec="$saved_plb_suspend" + + ip netns exec "$NS2" socat -u \ + "TCP6-LISTEN:$port,bind=[fd00:ff::2],reuseaddr" - >/dev/null & + defer kill_process $! + + wait_local_port_listen "$NS2" "$port" tcp + + local base_tx0 base_tx1 + base_tx0=$(link_tx_packets_get "$NS1" veth0a) + base_tx1=$(link_tx_packets_get "$NS1" veth1a) + + ip netns exec "$NS1" timeout 90 socat -u \ + OPEN:/dev/zero \ + "TCP6:[fd00:ff::2]:$port,bind=[fd00:ff::1]" &>/dev/null & + local client_pid=$! + defer kill_process "$client_pid" + + # Wait for data to start flowing before applying ECN marking. + busywait "$BUSYWAIT_TIMEOUT" until_counter_is \ + ">= $((base_tx0 + base_tx1 + 10))" \ + link_tx_packets_total "$NS1" > /dev/null + check_err $? "no TX activity detected" + if [ "$RET" -ne 0 ]; then + log_test "Local ECMP PLB rehash: ECN-marked path" + return + fi + + # Snapshot TX counters and rehash stats before ECN marking. + local pre_ecn_tx0 pre_ecn_tx1 + pre_ecn_tx0=$(link_tx_packets_get "$NS1" veth0a) + pre_ecn_tx1=$(link_tx_packets_get "$NS1" veth1a) + + local plb_before rto_before + plb_before=$(get_netstat_counter "$NS1" TCPPLBRehash) + rto_before=$(get_netstat_counter "$NS1" TcpTimeoutRehash) + + # CE-mark all data on both paths. 
PLB detects sustained + # congestion and rehashes, bouncing traffic between paths. + mark_ecn "$NS1" veth0a + defer unblock_tcp "$NS1" veth0a # removes the marking rule + mark_ecn "$NS1" veth1a + defer unblock_tcp "$NS1" veth1a # removes the marking rule + + # Wait for meaningful data on both paths, proving PLB rehashed + # the connection and traffic actually moved. Require at least + # 100 packets beyond the baseline to rule out stray control + # packets (ND, etc.) satisfying the check. + slowwait 60 dev_tx_packets_above \ + "$NS1" veth0a "$((pre_ecn_tx0 + 100))" > /dev/null + check_err $? "no data on veth0a after ECN marking" + + slowwait 60 dev_tx_packets_above \ + "$NS1" veth1a "$((pre_ecn_tx1 + 100))" > /dev/null + check_err $? "no data on veth1a after ECN marking" + + local plb_after rto_after + plb_after=$(get_netstat_counter "$NS1" TCPPLBRehash) + rto_after=$(get_netstat_counter "$NS1" TcpTimeoutRehash) + if [ "$plb_after" -le "$plb_before" ]; then + check_err 1 "TCPPLBRehash counter did not increment" + fi + if [ "$rto_after" -gt "$rto_before" ]; then + check_err 1 "TcpTimeoutRehash incremented; rehash was RTO-driven, not PLB" + fi + + log_test "Local ECMP PLB rehash: ECN-marked path" +} + +# Verify that hash policy 1 (L3+L4 symmetric) preserves the ECMP path +# across rehash. Policy 1 computes a deterministic hash from the +# 5-tuple, so mp_hash stays 0 and rt6_multipath_hash() always selects +# the same path regardless of txhash changes. 
+test_ecmp_hash_policy1_no_rehash() +{ + RET=0 + local port + alloc_ports port + + local saved_policy + saved_policy=$(ip netns exec "$NS1" sysctl -n \ + net.ipv6.fib_multipath_hash_policy) + ip netns exec "$NS1" sysctl -qw net.ipv6.fib_multipath_hash_policy=1 + defer ip netns exec "$NS1" sysctl -qw \ + net.ipv6.fib_multipath_hash_policy="$saved_policy" + + block_tcp "$NS1" veth0a + defer unblock_tcp "$NS1" veth0a + block_tcp "$NS1" veth1a + defer unblock_tcp "$NS1" veth1a + + ip netns exec "$NS2" socat \ + "TCP6-LISTEN:$port,bind=[fd00:ff::2],reuseaddr,fork" \ + EXEC:"echo POLICY1_OK" & + defer kill_process $! + + wait_local_port_listen "$NS2" "$port" tcp + + local rehash_before + rehash_before=$(get_netstat_counter "$NS1" TcpTimeoutRehash) + + ip netns exec "$NS1" timeout 10 socat -u \ + "TCP6:[fd00:ff::2]:$port,bind=[fd00:ff::1],connect-timeout=8" \ + STDOUT >/dev/null 2>&1 & + local client_pid=$! + defer kill_process "$client_pid" + + # With policy 1, the deterministic 5-tuple hash always selects + # the same path. Wait for multiple SYN retransmits (proving + # rehash was attempted), then verify all SYNs landed on the + # same interface. + local rehash_after + slowwait 8 until_counter_is ">= $((rehash_before + 3))" \ + get_netstat_counter "$NS1" TcpTimeoutRehash > /dev/null + rehash_after=$(get_netstat_counter "$NS1" TcpTimeoutRehash) + if [ "$rehash_after" -le "$rehash_before" ]; then + check_err 1 "TcpTimeoutRehash counter did not increment" + fi + + local c0 c1 + c0=$(tc_filter_pkt_count "$NS1" veth0a) + c1=$(tc_filter_pkt_count "$NS1" veth1a) + if [ "${c0:-0}" -ge 1 ] && [ "${c1:-0}" -ge 1 ]; then + check_err 1 "SYNs appeared on both paths despite policy 1" + fi + if [ "${c0:-0}" -eq 0 ] && [ "${c1:-0}" -eq 0 ]; then + check_err 1 "no SYNs observed on either path" + fi + + log_test "Local ECMP policy 1: no path change on rehash" +} + +# Verify that mp_hash does not leak into the on-wire flowlabel. +# With auto_flowlabels=0, the wire flowlabel must be 0. 
Install tc +# filters that pass TCP with flowlabel=0 but drop TCP with nonzero +# flowlabel, then establish a connection and transfer data. If +# mp_hash leaked into fl6->flowlabel, the SYN or data packets would +# be dropped and the connection would fail. +test_ecmp_no_flowlabel_leak() +{ + RET=0 + local port + alloc_ports port + + local saved_afl + saved_afl=$(ip netns exec "$NS1" sysctl -n \ + net.ipv6.auto_flowlabels) + ip netns exec "$NS1" sysctl -qw net.ipv6.auto_flowlabels=0 + defer ip netns exec "$NS1" sysctl -qw \ + net.ipv6.auto_flowlabels="$saved_afl" + + # On both egress interfaces: pass TCP with flowlabel=0 (prio 1), + # drop any remaining TCP (nonzero flowlabel, prio 2). ICMPv6 + # matches neither filter and passes through normally. + local dev + for dev in veth0a veth1a; do + ip netns exec "$NS1" tc qdisc add dev "$dev" \ + root handle 1: prio + ip netns exec "$NS1" tc filter add dev "$dev" parent 1: \ + protocol ipv6 prio 1 u32 \ + match u32 0x00000000 0x000FFFFF at 0 \ + match u8 0x06 0xff at 6 \ + action ok + ip netns exec "$NS1" tc filter add dev "$dev" parent 1: \ + protocol ipv6 prio 2 u32 \ + match u8 0x06 0xff at 6 \ + action drop + defer unblock_tcp "$NS1" "$dev" + done + + ip netns exec "$NS2" socat \ + "TCP6-LISTEN:$port,bind=[fd00:ff::2],reuseaddr" \ + EXEC:"echo FLOWLABEL_OK" & + defer kill_process $! 
+ + wait_local_port_listen "$NS2" "$port" tcp + + local tmpfile + tmpfile=$(mktemp) + defer rm -f "$tmpfile" + + ip netns exec "$NS1" socat -u \ + "TCP6:[fd00:ff::2]:$port,bind=[fd00:ff::1],connect-timeout=10" \ + STDOUT >"$tmpfile" 2>&1 + + local result + result=$(cat "$tmpfile" 2>/dev/null) + if [[ "$result" != *"FLOWLABEL_OK"* ]]; then + check_err 1 "connection failed: mp_hash may have leaked into wire flowlabel" + fi + + log_test "No flowlabel leak with auto_flowlabels=0" +} + +# Helper: stream data, invalidate the cached dst by adding and +# removing a dummy route (bumps fib6_node sernum), then check that +# traffic stays on the same ECMP path. Used by both the normal +# tcp_v6_connect and syncookie variants. +ecmp_dst_rebuild_check() +{ + local ns_client=$1; shift + local port=$1; shift + local rc=0 + + # Suppress __dst_negative_advice() during the test so that a + # real TCP timeout cannot trigger an additional dst + # invalidation via a different code path. + local saved_retries1 + saved_retries1=$(ip netns exec "$ns_client" sysctl -n \ + net.ipv4.tcp_retries1) + ip netns exec "$ns_client" sysctl -qw net.ipv4.tcp_retries1=255 + + local base0 base1 + base0=$(link_tx_packets_get "$ns_client" veth0a) + base1=$(link_tx_packets_get "$ns_client" veth1a) + + ip netns exec "$ns_client" timeout 15 socat -u \ + OPEN:/dev/zero \ + "TCP6:[fd00:ff::2]:$port,bind=[fd00:ff::1]" \ + &>/dev/null & + local client_pid=$! + + # Wait for enough packets to identify the active path. + # Return 2 for setup failure (distinct from 1 = path changed). + if ! 
busywait "$BUSYWAIT_TIMEOUT" until_counter_is \ + ">= $((base0 + base1 + 50))" \ + link_tx_packets_total "$ns_client" > /dev/null; then + ip netns exec "$ns_client" sysctl -qw \ + net.ipv4.tcp_retries1="$saved_retries1" + kill "$client_pid" 2>/dev/null + wait "$client_pid" 2>/dev/null + return 2 + fi + + local mid0 mid1 active_dev inactive_dev + mid0=$(link_tx_packets_get "$ns_client" veth0a) + mid1=$(link_tx_packets_get "$ns_client" veth1a) + if [ $((mid0 - base0)) -ge $((mid1 - base1)) ]; then + active_dev=veth0a; inactive_dev=veth1a + else + active_dev=veth1a; inactive_dev=veth0a + fi + + local active_before inactive_before + active_before=$(link_tx_packets_get "$ns_client" "$active_dev") + inactive_before=$(link_tx_packets_get "$ns_client" "$inactive_dev") + + # Invalidate the cached dst by bumping the fib6_node sernum. + # Adding and removing a high-metric dummy route achieves this + # without touching the ECMP nexthops, avoiding a transient + # single-nexthop state during multipath route replace. + ip -n "$ns_client" -6 route add fd00:ff::2/128 dev lo metric 9999 + ip -n "$ns_client" -6 route del fd00:ff::2/128 dev lo metric 9999 + + # Wait for enough post-rebuild traffic to detect a path change. + if ! 
busywait "$BUSYWAIT_TIMEOUT" until_counter_is \ + ">= $((active_before + inactive_before + 50))" \ + link_tx_packets_total "$ns_client" > /dev/null; then + ip netns exec "$ns_client" sysctl -qw \ + net.ipv4.tcp_retries1="$saved_retries1" + kill "$client_pid" 2>/dev/null + wait "$client_pid" 2>/dev/null + return 2 + fi + + local active_after inactive_after + active_after=$(link_tx_packets_get "$ns_client" "$active_dev") + inactive_after=$(link_tx_packets_get "$ns_client" "$inactive_dev") + + local active_delta=$((active_after - active_before)) + local inactive_delta=$((inactive_after - inactive_before)) + + if [ "$inactive_delta" -gt "$active_delta" ]; then + rc=1 + fi + + ip netns exec "$ns_client" sysctl -qw \ + net.ipv4.tcp_retries1="$saved_retries1" + kill "$client_pid" 2>/dev/null + wait "$client_pid" 2>/dev/null + return "$rc" +} + +# Run ecmp_dst_rebuild_check for ECMP_REBUILD_ROUNDS rounds, each with +# a fresh server and connection. With a correct kernel the path is +# deterministic (same txhash always selects the same ECMP nexthop), +# so any path change is a bug. Multiple rounds catch a buggy kernel +# that picks a random path: each round has 50% chance of accidentally +# matching, so 10 rounds gives < 0.1% false-pass probability. +ecmp_dst_rebuild_loop() +{ + local base_port=$1; shift + local label=$1; shift + local path_changes=0 + local r + + for r in $(seq 1 "$ECMP_REBUILD_ROUNDS"); do + local port=$((base_port + r - 1)) + + ip netns exec "$NS2" socat -u \ + "TCP6-LISTEN:$port,bind=[fd00:ff::2],reuseaddr" \ + - >/dev/null & + local server_pid=$! + + wait_local_port_listen "$NS2" "$port" tcp + + local check_rc=0 + ecmp_dst_rebuild_check "$NS1" "$port" || check_rc=$? 
+ + kill "$server_pid" 2>/dev/null + wait "$server_pid" 2>/dev/null + + busywait "$BUSYWAIT_TIMEOUT" \ + port_has_no_active_tcp "$NS1" "$port" > /dev/null + busywait "$BUSYWAIT_TIMEOUT" \ + port_has_no_active_tcp "$NS2" "$port" > /dev/null + + if [ "$check_rc" -eq 2 ]; then + check_err 1 "no TX activity in round $r" + break + elif [ "$check_rc" -eq 1 ]; then + path_changes=$((path_changes + 1)) + fi + done + + if [ "$path_changes" -gt 0 ]; then + check_err 1 "$path_changes/$ECMP_REBUILD_ROUNDS changed path" + fi + + log_test "$label" +} + +# Verify that a dst invalidation does not cause the connection to +# switch ECMP paths. With the fix, both the initial route lookup +# (tcp_v6_connect) and subsequent rebuilds (inet6_csk_route_socket) +# use sk_txhash >> 1, so the path is stable. +test_ecmp_dst_rebuild_consistency() +{ + RET=0 + local base_port + alloc_ports base_port "$ECMP_REBUILD_ROUNDS" + + ecmp_dst_rebuild_loop "$base_port" \ + "ECMP path stable after dst invalidation" +} + +# Return 0 (true) when no active TCP sockets remain on a port. +# TIME_WAIT is excluded because it does not generate outgoing traffic. +port_has_no_active_tcp() +{ + local ns=$1; shift + local port=$1; shift + + ! ip netns exec "$ns" ss -tnH \ + state established \ + state fin-wait-1 \ + state fin-wait-2 \ + state close-wait \ + state last-ack \ + state closing \ + state syn-sent \ + state syn-recv \ + "sport = :$port or dport = :$port" | grep -q . +} + +# Count TCP packets on server egress without blocking them. +# Uses tc filters with "action ok" so packets are counted and passed. 
+count_tcp() +{ + local ns=$1; shift + local dev=$1; shift + + ip netns exec "$ns" tc qdisc add dev "$dev" root handle 1: prio + ip netns exec "$ns" tc filter add dev "$dev" parent 1: \ + protocol ipv6 prio 1 u32 match u8 0x06 0xff at 6 action ok +} + +# Verify that the server's SYN-ACK (sent from the request socket) and +# subsequent ACKs (sent from the full socket created in cookie_v6_check) +# use the same ECMP path. With syncookies the request socket is freed +# after the SYN-ACK and a new one is created during cookie validation; +# this test catches the case where the two request sockets pick +# different ECMP paths due to independent txhash values. +test_ecmp_syncookie_path_consistency() +{ + RET=0 + + local saved_syncookies + saved_syncookies=$(ip netns exec "$NS2" sysctl -n \ + net.ipv4.tcp_syncookies 2>/dev/null) + if [ -z "$saved_syncookies" ]; then + log_test_skip "Syncookie server ECMP path consistent" + return "$ksft_skip" + fi + ip netns exec "$NS2" sysctl -qw net.ipv4.tcp_syncookies=2 + defer ip netns exec "$NS2" sysctl -qw \ + net.ipv4.tcp_syncookies="$saved_syncookies" + + count_tcp "$NS2" veth0b + defer unblock_tcp "$NS2" veth0b + count_tcp "$NS2" veth1b + defer unblock_tcp "$NS2" veth1b + + local path_splits=0 + local r base_port + alloc_ports base_port "$ECMP_REBUILD_ROUNDS" + + for r in $(seq 1 "$ECMP_REBUILD_ROUNDS"); do + local port=$((base_port + r - 1)) + + ip netns exec "$NS2" socat -u \ + "TCP6-LISTEN:$port,bind=[fd00:ff::2],reuseaddr" \ + - >/dev/null & + local server_pid=$! + + wait_local_port_listen "$NS2" "$port" tcp + + local srv_base0 srv_base1 + srv_base0=$(tc_filter_pkt_count "$NS2" veth0b) + srv_base1=$(tc_filter_pkt_count "$NS2" veth1b) + + ip netns exec "$NS1" timeout 5 socat -u \ + OPEN:/dev/zero \ + "TCP6:[fd00:ff::2]:$port,bind=[fd00:ff::1]" \ + &>/dev/null & + local client_pid=$! + + local cli_base + cli_base=$(link_tx_packets_total "$NS1") + if ! 
busywait "$BUSYWAIT_TIMEOUT" until_counter_is \ + ">= $((cli_base + 200))" \ + link_tx_packets_total "$NS1" > /dev/null; then + check_err 1 "no TX activity in round $r" + kill "$client_pid" 2>/dev/null + wait "$client_pid" 2>/dev/null + kill "$server_pid" 2>/dev/null + wait "$server_pid" 2>/dev/null + break + fi + + local srv_tcp0 srv_tcp1 + srv_tcp0=$(tc_filter_pkt_count "$NS2" veth0b) + srv_tcp1=$(tc_filter_pkt_count "$NS2" veth1b) + local srv_delta0=$(( ${srv_tcp0:-0} - ${srv_base0:-0} )) + local srv_delta1=$(( ${srv_tcp1:-0} - ${srv_base1:-0} )) + + if [ "$srv_delta0" -gt 0 ] && [ "$srv_delta1" -gt 0 ]; then + path_splits=$((path_splits + 1)) + fi + + kill "$client_pid" 2>/dev/null + wait "$client_pid" 2>/dev/null + kill "$server_pid" 2>/dev/null + wait "$server_pid" 2>/dev/null + + # Wait for TCP teardown packets (FIN/RST) to finish so + # they do not pollute the next round's tc filter counters. + busywait "$BUSYWAIT_TIMEOUT" \ + port_has_no_active_tcp "$NS1" "$port" > /dev/null + busywait "$BUSYWAIT_TIMEOUT" \ + port_has_no_active_tcp "$NS2" "$port" > /dev/null + done + + if [ "$path_splits" -gt 0 ]; then + check_err 1 "$path_splits/$ECMP_REBUILD_ROUNDS had split server path" + fi + + log_test "Syncookie server ECMP path consistent" +} + +require_command socat + +trap 'defer_scopes_cleanup; cleanup_all_ns' EXIT +setup || exit $? 
+tests_run +exit "$EXIT_STATUS" diff --git a/tools/testing/selftests/net/fib_tests.sh b/tools/testing/selftests/net/fib_tests.sh index af64f93bb2e1..b338bfb196a2 100755 --- a/tools/testing/selftests/net/fib_tests.sh +++ b/tools/testing/selftests/net/fib_tests.sh @@ -12,7 +12,9 @@ TESTS="unregister down carrier nexthop suppress ipv6_notify ipv4_notify \ ipv4_route_metrics ipv4_route_v6_gw rp_filter ipv4_del_addr \ ipv6_del_addr ipv4_mangle ipv6_mangle ipv4_bcast_neigh fib6_gc_test \ ipv4_mpath_list ipv6_mpath_list ipv4_mpath_balance ipv6_mpath_balance \ - ipv4_mpath_balance_preferred fib6_ra_to_static" + ipv4_mpath_balance_preferred ipv4_mpath_oif ipv4_mpath_oif_nh \ + ipv4_mpath_oif_vrf ipv6_mpath_oif ipv6_mpath_oif_nh ipv6_mpath_oif_vrf \ + fib6_ra_to_static fib6_temp_addr_renewal" VERBOSE=0 PAUSE_ON_FAIL=no @@ -1611,6 +1613,62 @@ fib6_ra_to_static() cleanup &> /dev/null } +fib6_temp_addr_renewal() { + setup + + echo + echo "Fib6 temporary address renewal test" + set -e + + # ra6 is required for the test. (ipv6toolkit) + if [ ! -x "$(command -v ra6)" ]; then + echo "SKIP: ra6 not found." + set +e + cleanup &> /dev/null + return + fi + + # Create a pair of veth devices to send a RA message from one + # device to another. + $IP link add veth1 type veth peer name veth2 + $IP link set dev veth1 up + $IP link set dev veth2 up + + # Make veth1 ready to receive RA messages. + $NS_EXEC sysctl -wq net.ipv6.conf.veth1.accept_ra=2 + $NS_EXEC sysctl -wq net.ipv6.conf.veth1.use_tempaddr=2 + $NS_EXEC sysctl -wq net.ipv6.conf.veth1.temp_prefered_lft=15 + $NS_EXEC sysctl -wq net.ipv6.conf.veth1.max_desync_factor=0 + + # Send a RA message with a prefix from veth2. 
+ $NS_EXEC ra6 -i veth2 -s fe80::1 -d ff02::1 -P 2001:12::/64\#LA\#3600\#3600 -e + sleep 3 + + # Deprecate it + $NS_EXEC ra6 -i veth2 -s fe80::1 -d ff02::1 -P 2001:12::/64\#LA\#3600\#0 -e + sleep 3 + + # Restore it + $NS_EXEC ra6 -i veth2 -s fe80::1 -d ff02::1 -P 2001:12::/64\#LA\#3600\#3600 -e + + ret=1 + for i in $(seq 1 25); do + sleep 1 + num_dep="$($IP -6 addr | grep -c "temporary deprecated" || true)" + num_tot="$($IP -6 addr | grep -c "temporary" || true)" + + if [ "$num_dep" -eq 1 ] && [ "$num_tot" -ge 2 ]; then + ret=0 + break + fi + done + log_test "$ret" 0 "IPv6 temporary address cleanly deprecated and regenerated" + + set +e + + cleanup &> /dev/null +} + # add route for a prefix, flushing any existing routes first # expected to be the first step of a test add_route() @@ -2915,6 +2973,247 @@ ipv6_mpath_balance_test() forwarding_cleanup } +ipv4_mpath_oif_test_common() +{ + local get_param=$1; shift + local expected_oif=$1; shift + local test_name=$1; shift + local tmp_file + + tmp_file=$(mktemp) + + for i in {1..100}; do + $IP route get 203.0.113.${i} $get_param >> "$tmp_file" + done + + [[ $(grep "$expected_oif" "$tmp_file" | wc -l) -eq 100 ]] + log_test $? 
0 "$test_name" + + rm "$tmp_file" +} + +ipv4_mpath_oif_test() +{ + echo + echo "IPv4 multipath oif test" + + setup + + set -e + $IP link add dummy1 up type dummy + $IP address add 192.0.2.1/28 dev dummy1 + $IP address add 192.0.2.17/32 dev lo + + $IP route add 203.0.113.0/24 \ + nexthop via 198.51.100.2 dev dummy0 \ + nexthop via 192.0.2.2 dev dummy1 + set +e + + ipv4_mpath_oif_test_common "oif dummy0" "dummy0" \ + "IPv4 multipath via first nexthop" + + ipv4_mpath_oif_test_common "oif dummy1" "dummy1" \ + "IPv4 multipath via second nexthop" + + ipv4_mpath_oif_test_common "oif dummy0 from 192.0.2.17" "dummy0" \ + "IPv4 multipath via first nexthop with source address" + + ipv4_mpath_oif_test_common "oif dummy1 from 192.0.2.17" "dummy1" \ + "IPv4 multipath via second nexthop with source address" + + cleanup +} + +ipv4_mpath_oif_nh_test() +{ + echo + echo "IPv4 multipath oif with nexthop object test" + + setup + + set -e + $IP link add dummy1 up type dummy + $IP address add 192.0.2.1/28 dev dummy1 + $IP address add 192.0.2.17/32 dev lo + + $IP nexthop add id 1 via 198.51.100.2 dev dummy0 + $IP nexthop add id 2 via 192.0.2.2 dev dummy1 + $IP nexthop add id 3 group 1/2 + $IP route add 203.0.113.0/24 nhid 3 + set +e + + ipv4_mpath_oif_test_common "oif dummy0" "dummy0" \ + "IPv4 multipath via first nexthop" + + ipv4_mpath_oif_test_common "oif dummy1" "dummy1" \ + "IPv4 multipath via second nexthop" + + ipv4_mpath_oif_test_common "oif dummy0 from 192.0.2.17" "dummy0" \ + "IPv4 multipath via first nexthop with source address" + + ipv4_mpath_oif_test_common "oif dummy1 from 192.0.2.17" "dummy1" \ + "IPv4 multipath via second nexthop with source address" + + cleanup +} + +ipv4_mpath_oif_vrf_test() +{ + echo + echo "IPv4 multipath oif with VRF test" + + setup + + set -e + $IP -4 rule add pref 32765 table local + $IP -4 rule del pref 0 + $IP link add name vrf-123 up type vrf table 123 + $IP link set dev dummy0 master vrf-123 + $IP link add dummy1 up master vrf-123 type dummy + 
$IP address add 192.0.2.1/28 dev dummy1 + $IP address add 192.0.2.17/32 dev vrf-123 + + $IP route add 203.0.113.0/24 vrf vrf-123 \ + nexthop via 198.51.100.2 dev dummy0 \ + nexthop via 192.0.2.2 dev dummy1 + set +e + + ipv4_mpath_oif_test_common "oif dummy0" "dummy0" \ + "IPv4 multipath via first nexthop" + + ipv4_mpath_oif_test_common "oif dummy1" "dummy1" \ + "IPv4 multipath via second nexthop" + + ipv4_mpath_oif_test_common "oif dummy0 from 192.0.2.17" "dummy0" \ + "IPv4 multipath via first nexthop with source address" + + ipv4_mpath_oif_test_common "oif dummy1 from 192.0.2.17" "dummy1" \ + "IPv4 multipath via second nexthop with source address" + + cleanup +} + +ipv6_mpath_oif_test_common() +{ + local get_param=$1; shift + local expected_oif=$1; shift + local test_name=$1; shift + local tmp_file + + tmp_file=$(mktemp) + + for i in {1..100}; do + $IP route get 2001:db8:10::${i} $get_param >> "$tmp_file" + done + + [[ $(grep "$expected_oif" "$tmp_file" | wc -l) -eq 100 ]] + log_test $? 
0 "$test_name" + + rm "$tmp_file" +} + +ipv6_mpath_oif_test() +{ + echo + echo "IPv6 multipath oif test" + + setup + + set -e + $IP link add dummy1 up type dummy + $IP address add 2001:db8:2::1/64 dev dummy1 + $IP address add 2001:db8:100::1/128 dev lo + + $IP route add 2001:db8:10::/64 \ + nexthop via 2001:db8:1::2 dev dummy0 \ + nexthop via 2001:db8:2::2 dev dummy1 + set +e + + ipv6_mpath_oif_test_common "oif dummy0" "dummy0" \ + "IPv6 multipath via first nexthop" + + ipv6_mpath_oif_test_common "oif dummy1" "dummy1" \ + "IPv6 multipath via second nexthop" + + ipv6_mpath_oif_test_common "oif dummy0 from 2001:db8:100::1" "dummy0" \ + "IPv6 multipath via first nexthop with source address" + + ipv6_mpath_oif_test_common "oif dummy1 from 2001:db8:100::1" "dummy1" \ + "IPv6 multipath via second nexthop with source address" + + cleanup +} + +ipv6_mpath_oif_nh_test() +{ + echo + echo "IPv6 multipath oif with nexthop object test" + + setup + + set -e + $IP link add dummy1 up type dummy + $IP address add 2001:db8:2::1/64 dev dummy1 + $IP address add 2001:db8:100::1/128 dev lo + + $IP nexthop add id 1 via 2001:db8:1::2 dev dummy0 + $IP nexthop add id 2 via 2001:db8:2::2 dev dummy1 + $IP nexthop add id 3 group 1/2 + $IP route add 2001:db8:10::/64 nhid 3 + set +e + + ipv6_mpath_oif_test_common "oif dummy0" "dummy0" \ + "IPv6 multipath via first nexthop" + + ipv6_mpath_oif_test_common "oif dummy1" "dummy1" \ + "IPv6 multipath via second nexthop" + + ipv6_mpath_oif_test_common "oif dummy0 from 2001:db8:100::1" "dummy0" \ + "IPv6 multipath via first nexthop with source address" + + ipv6_mpath_oif_test_common "oif dummy1 from 2001:db8:100::1" "dummy1" \ + "IPv6 multipath via second nexthop with source address" + + cleanup +} + +ipv6_mpath_oif_vrf_test() +{ + echo + echo "IPv6 multipath oif with VRF test" + + setup + + set -e + $NS_EXEC sysctl -qw net.ipv6.conf.all.keep_addr_on_down=1 + $IP -6 rule add pref 32765 table local + $IP -6 rule del pref 0 + $IP link add name vrf-123 up 
type vrf table 123 + $IP link set dev dummy0 master vrf-123 + $IP link add dummy1 up master vrf-123 type dummy + $IP address add 2001:db8:2::1/64 dev dummy1 + $IP address add 2001:db8:100::1/128 dev vrf-123 + + $IP route add 2001:db8:10::/64 vrf vrf-123 \ + nexthop via 2001:db8:1::2 dev dummy0 \ + nexthop via 2001:db8:2::2 dev dummy1 + set +e + + ipv6_mpath_oif_test_common "oif dummy0" "dummy0" \ + "IPv6 multipath via first nexthop" + + ipv6_mpath_oif_test_common "oif dummy1" "dummy1" \ + "IPv6 multipath via second nexthop" + + ipv6_mpath_oif_test_common "oif dummy0 from 2001:db8:100::1" "dummy0" \ + "IPv6 multipath via first nexthop with source address" + + ipv6_mpath_oif_test_common "oif dummy1 from 2001:db8:100::1" "dummy1" \ + "IPv6 multipath via second nexthop with source address" + + cleanup +} + ################################################################################ # usage @@ -3001,7 +3300,14 @@ do ipv4_mpath_balance) ipv4_mpath_balance_test;; ipv6_mpath_balance) ipv6_mpath_balance_test;; ipv4_mpath_balance_preferred) ipv4_mpath_balance_preferred_test;; + ipv4_mpath_oif) ipv4_mpath_oif_test;; + ipv4_mpath_oif_nh) ipv4_mpath_oif_nh_test;; + ipv4_mpath_oif_vrf) ipv4_mpath_oif_vrf_test;; + ipv6_mpath_oif) ipv6_mpath_oif_test;; + ipv6_mpath_oif_nh) ipv6_mpath_oif_nh_test;; + ipv6_mpath_oif_vrf) ipv6_mpath_oif_vrf_test;; fib6_ra_to_static) fib6_ra_to_static;; + fib6_temp_addr_renewal) fib6_temp_addr_renewal;; help) echo "Test names: $TESTS"; exit 0;; esac diff --git a/tools/testing/selftests/net/forwarding/bridge_vlan_mcast.sh b/tools/testing/selftests/net/forwarding/bridge_vlan_mcast.sh index e8031f68200a..c4cd2078a8db 100755 --- a/tools/testing/selftests/net/forwarding/bridge_vlan_mcast.sh +++ b/tools/testing/selftests/net/forwarding/bridge_vlan_mcast.sh @@ -4,7 +4,7 @@ ALL_TESTS="vlmc_control_test vlmc_querier_test vlmc_igmp_mld_version_test \ vlmc_last_member_test vlmc_startup_query_test vlmc_membership_test \ vlmc_querier_intvl_test 
vlmc_query_intvl_test vlmc_query_response_intvl_test \ - vlmc_router_port_test vlmc_filtering_test" + vlmc_router_port_test vlmc_filtering_test vlmc_mcast_toggle_test" NUM_NETIFS=4 CHECK_TC="yes" TEST_GROUP="239.10.10.10" @@ -162,14 +162,27 @@ vlmc_query_cnt_setup() { local type=$1 local dev=$2 + local match=($3) if [[ $type == "igmp" ]]; then - tc filter add dev $dev egress pref 10 prot 802.1Q \ + # This matches: IP Protocol 2 (IGMP) + tc filter add dev "$dev" egress pref 10 prot 802.1Q \ flower vlan_id 10 vlan_ethtype ipv4 dst_ip 224.0.0.1 ip_proto 2 \ + action goto chain 1 + # AND Type 0x11 (Query) at offset 0 of IGMP header + # 20 bytes IPv4 header + 4 bytes Router Alert option + IGMP[offset 0] + tc filter add dev "$dev" egress pref 20 chain 1 prot 802.1Q u32 \ + match u8 0x11 0xff at 24 "${match[@]}" \ action pass else - tc filter add dev $dev egress pref 10 prot 802.1Q \ + # This matches: ICMPv6 + tc filter add dev "$dev" egress pref 10 prot 802.1Q \ flower vlan_id 10 vlan_ethtype ipv6 dst_ip ff02::1 ip_proto icmpv6 \ + action goto chain 1 + # AND Type 0x82 (Query) at offset 0 of MLD header + # 40 bytes IPv6 header + 8 bytes Hop-by-hop option + MLD[offset 0] + tc filter add dev "$dev" egress pref 20 chain 1 prot 802.1Q u32 \ + match u8 0x82 0xff at 48 "${match[@]}" \ action pass fi @@ -181,7 +194,39 @@ vlmc_query_cnt_cleanup() local dev=$1 ip link set dev br0 type bridge mcast_stats_enabled 0 - tc filter del dev $dev egress pref 10 + tc filter del dev "$dev" egress pref 20 chain 1 + tc filter del dev "$dev" egress pref 10 +} + +vlmc_query_get_intvl_match() +{ + local type=$1 + local version=$2 + local test=$3 + local enc_val=$4 + + if [ "$test" = "qqic" ]; then + # QQIC is 8-bit floating point encoding for IGMPv3 and MLDv2 + if [ "${type}v${version}" = "igmpv3" ]; then + # QQIC is at offset 9 of IGMP header + # 20 bytes IPv4 header + 4 bytes Router Alert option + IGMP[offset 9] + echo "match u8 $enc_val 0xff at 33" + elif [ "${type}v${version}" = "mldv2" ]; 
then + # QQIC is at offset 25 of MLD header + # 40 bytes IPv6 header + 8 bytes Hop-by-hop option + MLD[offset 25] + echo "match u8 $enc_val 0xff at 73" + fi + elif [ "$test" = "mrc" ]; then + if [ "${type}v${version}" = "igmpv3" ]; then + # MRC is 8-bit floating point encoding at offset 1 of IGMP header + # 20 bytes IPv4 header + 4 bytes Router Alert option + IGMP[offset 1] + echo "match u8 $enc_val 0xff at 25" + elif [ "${type}v${version}" = "mldv2" ]; then + # MRC is 16-bit floating point encoding at offset 4 of MLD header + # 40 bytes IPv6 header + 8 bytes Hop-by-hop option + MLD[offset 4] + echo "match u16 $enc_val 0xffff at 52" + fi + fi } vlmc_check_query() @@ -191,9 +236,13 @@ vlmc_check_query() local dev=$3 local expect=$4 local time=$5 + local test=$6 + local enc_val=$7 + local intvl_match="" local ret=0 - vlmc_query_cnt_setup $type $dev + intvl_match="$(vlmc_query_get_intvl_match "$type" "$version" "$test" "$enc_val")" + vlmc_query_cnt_setup "$type" "$dev" "$intvl_match" local pre_tx_xstats=$(vlmc_query_cnt_xstats $type $version $dev) bridge vlan global set vid 10 dev br0 mcast_snooping 1 mcast_querier 1 @@ -201,7 +250,7 @@ vlmc_check_query() if [[ $ret -eq 0 ]]; then sleep $time - local tcstats=$(tc_rule_stats_get $dev 10 egress) + local tcstats=$(tc_rule_stats_get "$dev" 20 egress) local post_tx_xstats=$(vlmc_query_cnt_xstats $type $version $dev) if [[ $tcstats != $expect || \ @@ -448,8 +497,46 @@ vlmc_query_intvl_test() # 1 is sent immediately, then 2 more in the next 5 seconds vlmc_check_query igmp 2 $swp1 3 5 check_err $? "Wrong number of tagged IGMPv2 general queries sent" - log_test "Vlan 10 mcast_query_interval option changed to 200" + log_test "Number of tagged IGMPv2 general query" + RET=0 + bridge vlan global set vid 10 dev br0 mcast_snooping 1 mcast_igmp_version 3 + check_err $? "Could not set mcast_igmp_version in vlan 10" + bridge vlan global set vid 10 dev br0 mcast_snooping 1 mcast_mld_version 2 + check_err $? 
"Could not set mcast_mld_version in vlan 10" + bridge vlan global set vid 10 dev br0 mcast_snooping 1 mcast_query_interval 6000 + check_err $? "Could not set mcast_query_interval in vlan 10" + # 1 is sent immediately, IGMPv3 QQIC should match with linear value 60 (0x3c) + # which is 8-bit encoded value of 60 [units of seconds] + vlmc_check_query igmp 3 $swp1 1 1 qqic 0x3c + check_err $? "Wrong QQIC in generated IGMPv3 general queries" + log_test "IGMPv3 QQIC linear value 60(s)" + + RET=0 + # 1 is sent immediately, MLDv2 QQIC should match with linear value 60 (0x3c) + # which is 8-bit encoded value of 60 [units of seconds] + vlmc_check_query mld 2 $swp1 1 1 qqic 0x3c + check_err $? "Wrong QQIC in generated MLDv2 general queries" + log_test "MLDv2 QQIC linear value 60(s)" + + RET=0 + bridge vlan global set vid 10 dev br0 mcast_snooping 1 mcast_query_interval 16000 + check_err $? "Could not set mcast_query_interval in vlan 10" + # 1 is sent immediately, IGMPv3 QQIC should match with non linear value 132 (0x84) + # which is 8-bit encoded value of 160 [units of seconds] + vlmc_check_query igmp 3 $swp1 1 1 qqic 0x84 + check_err $? "Wrong QQIC in generated IGMPv3 general queries" + log_test "IGMPv3 QQIC non linear value 160(s)" + + RET=0 + # 1 is sent immediately, MLDv2 QQIC should match with non linear value 132 (0x84) + # which is 8-bit encoded value of 160 [units of seconds] + vlmc_check_query mld 2 $swp1 1 1 qqic 0x84 + check_err $? 
"Wrong QQIC in generated MLDv2 general queries" + log_test "MLDv2 QQIC non linear value 160(s)" + + bridge vlan global set vid 10 dev br0 mcast_snooping 1 mcast_igmp_version 2 + bridge vlan global set vid 10 dev br0 mcast_snooping 1 mcast_mld_version 1 bridge vlan global set vid 10 dev br0 mcast_snooping 1 mcast_startup_query_count 2 bridge vlan global set vid 10 dev br0 mcast_snooping 1 mcast_query_interval 12500 } @@ -469,10 +556,47 @@ vlmc_query_response_intvl_test() log_test "Vlan mcast_query_response_interval global option default value" RET=0 - bridge vlan global set vid 10 dev br0 mcast_snooping 1 mcast_query_response_interval 200 + bridge vlan global set vid 10 dev br0 mcast_snooping 1 mcast_startup_query_count 0 + bridge vlan global set vid 10 dev br0 mcast_snooping 1 mcast_igmp_version 3 + check_err $? "Could not set mcast_igmp_version in vlan 10" + bridge vlan global set vid 10 dev br0 mcast_snooping 1 mcast_mld_version 2 + check_err $? "Could not set mcast_mld_version in vlan 10" + bridge vlan global set vid 10 dev br0 mcast_snooping 1 mcast_query_response_interval 600 + check_err $? "Could not set mcast_query_response_interval in vlan 10" + # 1 is sent immediately, IGMPv3 MRC should match with linear value 60 (0x3c) + # which is 8-bit encoded value of 60 [units of 0.1s = 6 seconds] + vlmc_check_query igmp 3 $swp1 1 1 mrc 0x3c + check_err $? "Wrong MRC in generated IGMPv3 general queries" + log_test "IGMPv3 MRC linear value of 60(x0.1s)" + + RET=0 + bridge vlan global set vid 10 dev br0 mcast_snooping 1 mcast_query_response_interval 2400 check_err $? "Could not set mcast_query_response_interval in vlan 10" - log_test "Vlan 10 mcast_query_response_interval option changed to 200" + # 1 is sent immediately, MLDv2 MRC should match with linear value 0x5dc0 (24000) + # which is 16-bit encoded value of 24000 [units of ms / 24 seconds] + vlmc_check_query mld 2 $swp1 1 1 mrc 0x5dc0 + check_err $? 
"Wrong MRC in generated MLDv2 general queries" + log_test "MLDv2 MRC linear value of 24000(ms)" + RET=0 + # 1 is sent immediately, IGMPv3 MRC should match with non linear value 142 (0x8e) + # which is 8-bit encoded value of 240 [units of 0.1s = 24 seconds] + vlmc_check_query igmp 3 $swp1 1 1 mrc 0x8e + check_err $? "Wrong MRC in generated IGMPv3 general queries" + log_test "IGMPv3 MRC non linear value of 240(x0.1s)" + + RET=0 + bridge vlan global set vid 10 dev br0 mcast_snooping 1 mcast_query_response_interval 4800 + check_err $? "Could not set mcast_query_response_interval in vlan 10" + # 1 is sent immediately, MLDv2 MRC should match with non linear value 0x8770 (34672) + # which is 16-bit encoded value of 48000 [units of ms / 48 seconds] + vlmc_check_query mld 2 $swp1 1 1 mrc 0x8770 + check_err $? "Wrong MRC in generated MLDv2 general queries" + log_test "MLDv2 MRC non linear value of 48000(ms)" + + bridge vlan global set vid 10 dev br0 mcast_snooping 1 mcast_igmp_version 2 + bridge vlan global set vid 10 dev br0 mcast_snooping 1 mcast_mld_version 1 + bridge vlan global set vid 10 dev br0 mcast_snooping 1 mcast_startup_query_count 2 bridge vlan global set vid 10 dev br0 mcast_snooping 1 mcast_query_response_interval 1000 } @@ -537,6 +661,34 @@ vlmc_filtering_test() log_test "Disable multicast vlan snooping when vlan filtering is disabled" } +vlmc_mcast_toggle_test() +{ + RET=0 + + ip link add name br1-mcast up type bridge mcast_snooping 1 mcast_querier 1 vlan_filtering 1 + ip link add name dummy1-mcast up master br1-mcast type dummy + + # Enabling per-VLAN multicast snooping should disable the per-port + # multicast context on "dummy1-mcast". + ip link set dev br1-mcast type bridge mcast_vlan_snooping 1 + + # Toggling multicast snooping on the bridge should not affect the + # per-port multicast context on "dummy1-mcast" given that per-VLAN + # multicast snooping is enabled. 
+ ip link set dev br1-mcast type bridge mcast_snooping 0 + ip link set dev br1-mcast type bridge mcast_snooping 1 + + # If both the per-port and per-{port, VLAN} multicast contexts are + # enabled on "dummy1-mcast", removing it from the bridge will result + # in a splat. + ip link set dev dummy1-mcast nomaster + + log_test "Toggling mcast snooping with per-VLAN mcast snooping enabled" + + ip link del dev dummy1-mcast + ip link del dev br1-mcast +} + trap cleanup EXIT setup_prepare diff --git a/tools/testing/selftests/net/forwarding/ipmr.c b/tools/testing/selftests/net/forwarding/ipmr.c index df870aad9ead..9cd9f70de132 100644 --- a/tools/testing/selftests/net/forwarding/ipmr.c +++ b/tools/testing/selftests/net/forwarding/ipmr.c @@ -2,7 +2,9 @@ /* Copyright 2026 Google LLC */ #include <linux/if.h> +#include <linux/in6.h> #include <linux/mroute.h> +#include <linux/mroute6.h> #include <linux/netlink.h> #include <linux/rtnetlink.h> #include <linux/socket.h> @@ -17,6 +19,14 @@ FIXTURE(ipmr) int netlink_sk; int raw_sk; int veth_ifindex; + union { + struct vifctl vif; + struct mif6ctl vif6; + }; + union { + struct mfcctl mfc; + struct mf6cctl mfc6; + }; }; FIXTURE_VARIANT(ipmr) @@ -24,7 +34,14 @@ FIXTURE_VARIANT(ipmr) int family; int protocol; int level; + int rtm_family; int opts[MRT_MAX - MRT_BASE + 1]; + int flush_flags; + int vif_size; + char vif_check_cmd_pimreg[64]; + char vif_check_cmd_veth[64]; + int mfc_size; + char mfc_check_cmd[1024]; }; FIXTURE_VARIANT_ADD(ipmr, ipv4) @@ -32,6 +49,7 @@ FIXTURE_VARIANT_ADD(ipmr, ipv4) .family = AF_INET, .protocol = IPPROTO_IGMP, .level = IPPROTO_IP, + .rtm_family = RTNL_FAMILY_IPMR, .opts = { MRT_INIT, MRT_DONE, @@ -47,6 +65,44 @@ FIXTURE_VARIANT_ADD(ipmr, ipv4) MRT_DEL_MFC_PROXY, MRT_FLUSH, }, + .flush_flags = MRT_FLUSH_MFC | MRT_FLUSH_MFC_STATIC | + MRT_FLUSH_VIFS | MRT_FLUSH_VIFS_STATIC, + .vif_size = sizeof(struct vifctl), + .vif_check_cmd_pimreg = "cat /proc/net/ip_mr_vif | grep -q pimreg", + .vif_check_cmd_veth = "cat 
/proc/net/ip_mr_vif | grep -q veth", + .mfc_size = sizeof(struct mfcctl), + .mfc_check_cmd = "cat /proc/net/ip_mr_cache | grep -q '00000000 00000000'", +}; + +FIXTURE_VARIANT_ADD(ipmr, ipv6) +{ + .family = AF_INET6, + .protocol = IPPROTO_ICMPV6, + .level = IPPROTO_IPV6, + .rtm_family = RTNL_FAMILY_IP6MR, + .opts = { + MRT6_INIT, + MRT6_DONE, + MRT6_ADD_MIF, + MRT6_DEL_MIF, + MRT6_ADD_MFC, + MRT6_DEL_MFC, + MRT6_VERSION, + MRT6_ASSERT, + MRT6_PIM, + MRT6_TABLE, + MRT6_ADD_MFC_PROXY, + MRT6_DEL_MFC_PROXY, + MRT6_FLUSH, + }, + .flush_flags = MRT6_FLUSH_MFC | MRT6_FLUSH_MFC_STATIC | + MRT6_FLUSH_MIFS | MRT6_FLUSH_MIFS_STATIC, + .vif_size = sizeof(struct mif6ctl), + .vif_check_cmd_pimreg = "cat /proc/net/ip6_mr_vif | grep -q pim6reg", + .vif_check_cmd_veth = "cat /proc/net/ip6_mr_vif | grep -q veth", + .mfc_size = sizeof(struct mf6cctl), + .mfc_check_cmd = "cat /proc/net/ip6_mr_cache | " + "grep -q '0000:0000:0000:0000:0000:0000:0000:0000 0000:0000:0000:0000:0000:0000:0000:0000'", }; struct mfc_attr { @@ -71,7 +127,9 @@ static struct rtattr *nl_add_rtattr(struct nlmsghdr *nlmsg, struct rtattr *rta, return RTA_NEXT(rta, unused); } -static int nl_sendmsg_mfc(struct __test_metadata *_metadata, FIXTURE_DATA(ipmr) *self, +static int nl_sendmsg_mfc(struct __test_metadata *_metadata, + FIXTURE_DATA(ipmr) *self, + const FIXTURE_VARIANT(ipmr) *variant, __u16 nlmsg_type, struct mfc_attr *mfc_attr) { struct { @@ -87,7 +145,7 @@ static int nl_sendmsg_mfc(struct __test_metadata *_metadata, FIXTURE_DATA(ipmr) }, .rtm = { /* hard requirements in rtm_to_ipmr_mfcc() */ - .rtm_family = RTNL_FAMILY_IPMR, + .rtm_family = variant->rtm_family, .rtm_dst_len = 32, .rtm_type = RTN_MULTICAST, .rtm_scope = RT_SCOPE_UNIVERSE, @@ -144,6 +202,18 @@ FIXTURE_SETUP(ipmr) ASSERT_EQ(0, err); self->veth_ifindex = ifr.ifr_ifindex; + + if (variant->family == AF_INET) { + self->vif = (struct vifctl){ + .vifc_flags = VIFF_USE_IFINDEX, + .vifc_lcl_ifindex = self->veth_ifindex, + }; + } else { + self->vif6 = 
(struct mif6ctl){ + .mif6c_flags = 0, + .mif6c_pifi = self->veth_ifindex, + }; + } } FIXTURE_TEARDOWN(ipmr) @@ -169,41 +239,39 @@ TEST_F(ipmr, mrt_init) TEST_F(ipmr, mrt_add_vif_register) { - struct vifctl vif = { - .vifc_vifi = 0, - .vifc_flags = VIFF_REGISTER, - }; int err; + memset(&self->vif, 0, variant->vif_size); + + if (variant->family == AF_INET) + self->vif.vifc_flags = VIFF_REGISTER; + else + self->vif6.mif6c_flags = MIFF_REGISTER; + err = setsockopt(self->raw_sk, variant->level, variant->opts[MRT_ADD_VIF - MRT_BASE], - &vif, sizeof(vif)); + &self->vif, variant->vif_size); ASSERT_EQ(0, err); - err = system("cat /proc/net/ip_mr_vif | grep -q pimreg"); + err = system(variant->vif_check_cmd_pimreg); ASSERT_EQ(0, err); err = setsockopt(self->raw_sk, variant->level, variant->opts[MRT_DEL_VIF - MRT_BASE], - &vif, sizeof(vif)); + &self->vif, variant->vif_size); ASSERT_EQ(0, err); } TEST_F(ipmr, mrt_del_vif_unreg) { - struct vifctl vif = { - .vifc_vifi = 0, - .vifc_flags = VIFF_USE_IFINDEX, - .vifc_lcl_ifindex = self->veth_ifindex, - }; int err; err = setsockopt(self->raw_sk, variant->level, variant->opts[MRT_ADD_VIF - MRT_BASE], - &vif, sizeof(vif)); + &self->vif, variant->vif_size); ASSERT_EQ(0, err); - err = system("cat /proc/net/ip_mr_vif | grep -q veth0"); + err = system(variant->vif_check_cmd_veth); ASSERT_EQ(0, err); /* VIF is removed along with its device. */ @@ -213,23 +281,18 @@ TEST_F(ipmr, mrt_del_vif_unreg) /* mrt->vif_table[veth_ifindex]->dev is NULL. 
*/ err = setsockopt(self->raw_sk, variant->level, variant->opts[MRT_DEL_VIF - MRT_BASE], - &vif, sizeof(vif)); + &self->vif, variant->vif_size); ASSERT_EQ(-1, err); ASSERT_EQ(EADDRNOTAVAIL, errno); } TEST_F(ipmr, mrt_del_vif_netns_dismantle) { - struct vifctl vif = { - .vifc_vifi = 0, - .vifc_flags = VIFF_USE_IFINDEX, - .vifc_lcl_ifindex = self->veth_ifindex, - }; int err; err = setsockopt(self->raw_sk, variant->level, variant->opts[MRT_ADD_VIF - MRT_BASE], - &vif, sizeof(vif)); + &self->vif, variant->vif_size); ASSERT_EQ(0, err); /* Let cleanup_net() remove veth0 and VIF. */ @@ -237,49 +300,42 @@ TEST_F(ipmr, mrt_del_vif_netns_dismantle) TEST_F(ipmr, mrt_add_mfc) { - struct mfcctl mfc = {}; int err; /* MRT_ADD_MFC / MRT_ADD_MFC_PROXY does not need vif to exist (unlike netlink). */ err = setsockopt(self->raw_sk, variant->level, variant->opts[MRT_ADD_MFC - MRT_BASE], - &mfc, sizeof(mfc)); + &self->mfc, variant->mfc_size); ASSERT_EQ(0, err); /* (0.0.0.0 -> 0.0.0.0) */ - err = system("cat /proc/net/ip_mr_cache | grep -q '00000000 00000000' "); + err = system(variant->mfc_check_cmd); ASSERT_EQ(0, err); err = setsockopt(self->raw_sk, variant->level, variant->opts[MRT_DEL_MFC - MRT_BASE], - &mfc, sizeof(mfc)); + &self->mfc, variant->mfc_size); } TEST_F(ipmr, mrt_add_mfc_proxy) { - struct mfcctl mfc = {}; int err; err = setsockopt(self->raw_sk, variant->level, variant->opts[MRT_ADD_MFC_PROXY - MRT_BASE], - &mfc, sizeof(mfc)); + &self->mfc, variant->mfc_size); ASSERT_EQ(0, err); - err = system("cat /proc/net/ip_mr_cache | grep -q '00000000 00000000' "); + err = system(variant->mfc_check_cmd); ASSERT_EQ(0, err); err = setsockopt(self->raw_sk, variant->level, variant->opts[MRT_DEL_MFC_PROXY - MRT_BASE], - &mfc, sizeof(mfc)); + &self->mfc, variant->mfc_size); } TEST_F(ipmr, mrt_add_mfc_netlink) { - struct vifctl vif = { - .vifc_vifi = 0, - .vifc_flags = VIFF_USE_IFINDEX, - .vifc_lcl_ifindex = self->veth_ifindex, - }; struct mfc_attr mfc_attr = { .table = RT_TABLE_DEFAULT, 
.origin = 0, @@ -291,26 +347,21 @@ TEST_F(ipmr, mrt_add_mfc_netlink) err = setsockopt(self->raw_sk, variant->level, variant->opts[MRT_ADD_VIF - MRT_BASE], - &vif, sizeof(vif)); + &self->vif, variant->vif_size); ASSERT_EQ(0, err); - err = nl_sendmsg_mfc(_metadata, self, RTM_NEWROUTE, &mfc_attr); + err = nl_sendmsg_mfc(_metadata, self, variant, RTM_NEWROUTE, &mfc_attr); ASSERT_EQ(0, err); - err = system("cat /proc/net/ip_mr_cache | grep -q '00000000 00000000' "); + err = system(variant->mfc_check_cmd); ASSERT_EQ(0, err); - err = nl_sendmsg_mfc(_metadata, self, RTM_DELROUTE, &mfc_attr); + err = nl_sendmsg_mfc(_metadata, self, variant, RTM_DELROUTE, &mfc_attr); ASSERT_EQ(0, err); } TEST_F(ipmr, mrt_add_mfc_netlink_proxy) { - struct vifctl vif = { - .vifc_vifi = 0, - .vifc_flags = VIFF_USE_IFINDEX, - .vifc_lcl_ifindex = self->veth_ifindex, - }; struct mfc_attr mfc_attr = { .table = RT_TABLE_DEFAULT, .origin = 0, @@ -322,16 +373,16 @@ TEST_F(ipmr, mrt_add_mfc_netlink_proxy) err = setsockopt(self->raw_sk, variant->level, variant->opts[MRT_ADD_VIF - MRT_BASE], - &vif, sizeof(vif)); + &self->vif, variant->vif_size); ASSERT_EQ(0, err); - err = nl_sendmsg_mfc(_metadata, self, RTM_NEWROUTE, &mfc_attr); + err = nl_sendmsg_mfc(_metadata, self, variant, RTM_NEWROUTE, &mfc_attr); ASSERT_EQ(0, err); - err = system("cat /proc/net/ip_mr_cache | grep -q '00000000 00000000' "); + err = system(variant->mfc_check_cmd); ASSERT_EQ(0, err); - err = nl_sendmsg_mfc(_metadata, self, RTM_DELROUTE, &mfc_attr); + err = nl_sendmsg_mfc(_metadata, self, variant, RTM_DELROUTE, &mfc_attr); ASSERT_EQ(0, err); } @@ -347,12 +398,12 @@ TEST_F(ipmr, mrt_add_mfc_netlink_no_vif) /* netlink always requires RTA_IIF of an existing vif. */ mfc_attr.ifindex = 0; - err = nl_sendmsg_mfc(_metadata, self, RTM_NEWROUTE, &mfc_attr); + err = nl_sendmsg_mfc(_metadata, self, variant, RTM_NEWROUTE, &mfc_attr); ASSERT_EQ(-ENFILE, err); /* netlink always requires RTA_IIF of an existing vif. 
*/ mfc_attr.ifindex = self->veth_ifindex; - err = nl_sendmsg_mfc(_metadata, self, RTM_NEWROUTE, &mfc_attr); + err = nl_sendmsg_mfc(_metadata, self, variant, RTM_NEWROUTE, &mfc_attr); ASSERT_EQ(-ENFILE, err); } @@ -387,10 +438,10 @@ TEST_F(ipmr, mrt_del_mfc_netlink_netns_dismantle) } /* Create a MFC for mrt->vif_table[0]. */ - err = nl_sendmsg_mfc(_metadata, self, RTM_NEWROUTE, &mfc_attr); + err = nl_sendmsg_mfc(_metadata, self, variant, RTM_NEWROUTE, &mfc_attr); ASSERT_EQ(0, err); - err = system("cat /proc/net/ip_mr_cache | grep -q '00000000 00000000' "); + err = system(variant->mfc_check_cmd); ASSERT_EQ(0, err); /* Remove mrt->vif_table[0]. */ @@ -398,13 +449,13 @@ TEST_F(ipmr, mrt_del_mfc_netlink_netns_dismantle) ASSERT_EQ(0, err); /* MFC entry is NOT removed even if the tied VIF is removed... */ - err = system("cat /proc/net/ip_mr_cache | grep -q '00000000 00000000' "); + err = system(variant->mfc_check_cmd); ASSERT_EQ(0, err); /* ... and netlink is not capable of removing such an entry * because netlink always requires a valid RTA_IIF ... :/ */ - err = nl_sendmsg_mfc(_metadata, self, RTM_DELROUTE, &mfc_attr); + err = nl_sendmsg_mfc(_metadata, self, variant, RTM_DELROUTE, &mfc_attr); ASSERT_EQ(-ENODEV, err); /* It can be removed by setsockopt(), but let cleanup_net() remove this time. */ @@ -412,11 +463,6 @@ TEST_F(ipmr, mrt_del_mfc_netlink_netns_dismantle) TEST_F(ipmr, mrt_table_flush) { - struct vifctl vif = { - .vifc_vifi = 0, - .vifc_flags = VIFF_USE_IFINDEX, - .vifc_lcl_ifindex = self->veth_ifindex, - }; struct mfc_attr mfc_attr = { .origin = 0, .group = 0, @@ -424,7 +470,7 @@ TEST_F(ipmr, mrt_table_flush) .proxy = false, }; int table_id = 92; - int err, flags; + int err; /* Set a random table id rather than RT_TABLE_DEFAULT. * Note that /proc/net/ip_mr_{vif,cache} only supports RT_TABLE_DEFAULT. 
@@ -436,20 +482,29 @@ TEST_F(ipmr, mrt_table_flush) err = setsockopt(self->raw_sk, variant->level, variant->opts[MRT_ADD_VIF - MRT_BASE], - &vif, sizeof(vif)); + &self->vif, variant->vif_size); ASSERT_EQ(0, err); - mfc_attr.table = table_id; - err = nl_sendmsg_mfc(_metadata, self, RTM_NEWROUTE, &mfc_attr); + if (variant->family == AF_INET) { + mfc_attr.table = table_id; + err = nl_sendmsg_mfc(_metadata, self, variant, RTM_NEWROUTE, &mfc_attr); + } else { + err = setsockopt(self->raw_sk, + variant->level, variant->opts[MRT_ADD_MFC - MRT_BASE], + &self->mfc, variant->mfc_size); + } ASSERT_EQ(0, err); /* Flush mrt->vif_table[] and all caches. */ - flags = MRT_FLUSH_VIFS | MRT_FLUSH_VIFS_STATIC | - MRT_FLUSH_MFC | MRT_FLUSH_MFC_STATIC; err = setsockopt(self->raw_sk, variant->level, variant->opts[MRT_FLUSH - MRT_BASE], - &flags, sizeof(flags)); + &variant->flush_flags, sizeof(variant->flush_flags)); ASSERT_EQ(0, err); } +XFAIL_ADD(ipmr, ipv6, mrt_add_mfc_netlink); +XFAIL_ADD(ipmr, ipv6, mrt_add_mfc_netlink_proxy); +XFAIL_ADD(ipmr, ipv6, mrt_add_mfc_netlink_no_vif); +XFAIL_ADD(ipmr, ipv6, mrt_del_mfc_netlink_netns_dismantle); + TEST_HARNESS_MAIN diff --git a/tools/testing/selftests/net/getsockopt_iter.c b/tools/testing/selftests/net/getsockopt_iter.c new file mode 100644 index 000000000000..209569354d0e --- /dev/null +++ b/tools/testing/selftests/net/getsockopt_iter.c @@ -0,0 +1,300 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Quick test for getsockopt{_iter} tests. + * + * Each fixture targets one converted protocol and pins down the + * returned-length / errno semantics across buffer-size variations, + * an unknown optname and a bogus level. + * + * - netlink: NETLINK_PKTINFO covers the flag-style int path; the + * NETLINK_LIST_MEMBERSHIPS cases cover the size-discovery path + * that always reports the required buffer length back via optlen, + * even when the user buffer is too small to receive any group bits. 
+ * - vsock: SO_VM_SOCKETS_BUFFER_SIZE covers the u64 path. + * + * Author: Breno Leitao <leitao@debian.org> + */ + +#include <errno.h> +#include <stdint.h> +#include <stdio.h> +#include <string.h> +#include <unistd.h> +#include <linux/netlink.h> +#include <linux/rtnetlink.h> +#include <linux/time_types.h> +#include <linux/vm_sockets.h> +#include <sys/socket.h> +#include "kselftest_harness.h" + +#ifndef AF_VSOCK +#define AF_VSOCK 40 +#endif + +/* ---------- netlink ---------- */ + +FIXTURE(netlink) +{ + int fd; +}; + +FIXTURE_SETUP(netlink) +{ + int group = RTNLGRP_LINK; + + self->fd = socket(AF_NETLINK, SOCK_RAW, NETLINK_ROUTE); + if (self->fd < 0) + SKIP(return, "AF_NETLINK socket: %s", strerror(errno)); + + /* Joining a multicast group grows nlk->ngroups so the + * NETLINK_LIST_MEMBERSHIPS path has a non-zero size to report. + */ + if (setsockopt(self->fd, SOL_NETLINK, NETLINK_ADD_MEMBERSHIP, + &group, sizeof(group)) < 0) + SKIP(return, "NETLINK_ADD_MEMBERSHIP: %s", strerror(errno)); +} + +FIXTURE_TEARDOWN(netlink) +{ + if (self->fd >= 0) + close(self->fd); +} + +TEST_F(netlink, pktinfo_exact) +{ + socklen_t optlen; + int val = -1; + + optlen = sizeof(val); + + ASSERT_EQ(0, getsockopt(self->fd, SOL_NETLINK, NETLINK_PKTINFO, + &val, &optlen)); + ASSERT_EQ(sizeof(int), optlen); + ASSERT_TRUE(val == 0 || val == 1); +} + +TEST_F(netlink, pktinfo_oversize_clamped) +{ + char buf[16] = {}; + socklen_t optlen; + + optlen = sizeof(buf); + + ASSERT_EQ(0, getsockopt(self->fd, SOL_NETLINK, NETLINK_PKTINFO, + buf, &optlen)); + ASSERT_EQ(sizeof(int), optlen); +} + +TEST_F(netlink, pktinfo_undersize) +{ + char buf[2] = {}; + socklen_t optlen; + + optlen = sizeof(buf); + + ASSERT_EQ(-1, getsockopt(self->fd, SOL_NETLINK, NETLINK_PKTINFO, + buf, &optlen)); + ASSERT_EQ(EINVAL, errno); + ASSERT_EQ(sizeof(buf), optlen); +} + +TEST_F(netlink, list_memberships_size_discovery) +{ + socklen_t optlen = 0; + char dummy; + + ASSERT_EQ(0, getsockopt(self->fd, SOL_NETLINK, + 
NETLINK_LIST_MEMBERSHIPS, + &dummy, &optlen)); + ASSERT_GT(optlen, 0); + ASSERT_EQ(0, optlen % sizeof(__u32)); +} + +TEST_F(netlink, list_memberships_full_read) +{ + __u32 buf[64] = {}; + socklen_t optlen; + + optlen = sizeof(buf); + + ASSERT_EQ(0, getsockopt(self->fd, SOL_NETLINK, + NETLINK_LIST_MEMBERSHIPS, + buf, &optlen)); + ASSERT_GT(optlen, 0); + ASSERT_LE(optlen, sizeof(buf)); + ASSERT_EQ(0, optlen % sizeof(__u32)); +} + +TEST_F(netlink, bad_level) +{ + socklen_t optlen; + int val; + + optlen = sizeof(val); + + ASSERT_EQ(-1, getsockopt(self->fd, SOL_SOCKET + 1, NETLINK_PKTINFO, + &val, &optlen)); + ASSERT_EQ(ENOPROTOOPT, errno); + ASSERT_EQ(sizeof(val), optlen); +} + +TEST_F(netlink, bad_optname) +{ + socklen_t optlen; + int val; + + optlen = sizeof(val); + + ASSERT_EQ(-1, getsockopt(self->fd, SOL_NETLINK, 0x7fff, + &val, &optlen)); + ASSERT_EQ(ENOPROTOOPT, errno); + ASSERT_EQ(sizeof(val), optlen); +} + +/* ---------- vsock ---------- */ + +FIXTURE(vsock) +{ + int fd; +}; + +FIXTURE_SETUP(vsock) +{ + self->fd = socket(AF_VSOCK, SOCK_STREAM, 0); + if (self->fd < 0) + SKIP(return, "AF_VSOCK socket: %s", strerror(errno)); +} + +FIXTURE_TEARDOWN(vsock) +{ + if (self->fd >= 0) + close(self->fd); +} + +TEST_F(vsock, buffer_size_exact) +{ + socklen_t optlen; + uint64_t val = 0; + + optlen = sizeof(val); + + ASSERT_EQ(0, getsockopt(self->fd, AF_VSOCK, + SO_VM_SOCKETS_BUFFER_SIZE, + &val, &optlen)); + ASSERT_EQ(sizeof(uint64_t), optlen); + ASSERT_GT(val, 0); +} + +TEST_F(vsock, buffer_size_oversize_clamped) +{ + char buf[16] = {}; + socklen_t optlen; + + optlen = sizeof(buf); + + ASSERT_EQ(0, getsockopt(self->fd, AF_VSOCK, + SO_VM_SOCKETS_BUFFER_SIZE, + buf, &optlen)); + ASSERT_EQ(sizeof(uint64_t), optlen); +} + +TEST_F(vsock, buffer_size_undersize) +{ + char buf[4] = {}; + socklen_t optlen; + + optlen = sizeof(buf); + + ASSERT_EQ(-1, getsockopt(self->fd, AF_VSOCK, + SO_VM_SOCKETS_BUFFER_SIZE, + buf, &optlen)); + ASSERT_EQ(EINVAL, errno); + ASSERT_EQ(sizeof(buf), 
optlen); +} + +TEST_F(vsock, bad_level) +{ + socklen_t optlen; + uint64_t val; + + optlen = sizeof(val); + + ASSERT_EQ(-1, getsockopt(self->fd, SOL_SOCKET + 1, + SO_VM_SOCKETS_BUFFER_SIZE, + &val, &optlen)); + ASSERT_EQ(ENOPROTOOPT, errno); + ASSERT_EQ(sizeof(val), optlen); +} + +TEST_F(vsock, bad_optname) +{ + socklen_t optlen; + uint64_t val; + + optlen = sizeof(val); + + ASSERT_EQ(-1, getsockopt(self->fd, AF_VSOCK, 0x7fff, + &val, &optlen)); + ASSERT_EQ(ENOPROTOOPT, errno); + ASSERT_EQ(sizeof(val), optlen); +} + +/* SO_VM_SOCKETS_CONNECT_TIMEOUT_{NEW,OLD} return a sock_timeval-shaped + * payload, which is wider than u64 on 64-bit. They exercise the path + * where the protocol's reported lv (16 bytes) is larger than the + * common 8-byte u64 case covered above. + */ +TEST_F(vsock, connect_timeout_new_exact) +{ + struct __kernel_sock_timeval tv = {}; + socklen_t optlen; + + optlen = sizeof(tv); + + ASSERT_EQ(0, getsockopt(self->fd, AF_VSOCK, + SO_VM_SOCKETS_CONNECT_TIMEOUT_NEW, + &tv, &optlen)); + ASSERT_EQ(sizeof(tv), optlen); +} + +TEST_F(vsock, connect_timeout_new_oversize_clamped) +{ + char buf[sizeof(struct __kernel_sock_timeval) * 2] = {}; + socklen_t optlen; + + optlen = sizeof(buf); + + ASSERT_EQ(0, getsockopt(self->fd, AF_VSOCK, + SO_VM_SOCKETS_CONNECT_TIMEOUT_NEW, + buf, &optlen)); + ASSERT_EQ(sizeof(struct __kernel_sock_timeval), optlen); +} + +TEST_F(vsock, connect_timeout_new_undersize) +{ + socklen_t optlen; + uint64_t val; + + optlen = sizeof(val); + + ASSERT_EQ(-1, getsockopt(self->fd, AF_VSOCK, + SO_VM_SOCKETS_CONNECT_TIMEOUT_NEW, + &val, &optlen)); + ASSERT_EQ(EINVAL, errno); + ASSERT_EQ(sizeof(val), optlen); +} + +TEST_F(vsock, connect_timeout_old_exact) +{ + struct __kernel_old_timeval tv = {}; + socklen_t optlen; + + optlen = sizeof(tv); + + ASSERT_EQ(0, getsockopt(self->fd, AF_VSOCK, + SO_VM_SOCKETS_CONNECT_TIMEOUT_OLD, + &tv, &optlen)); + ASSERT_EQ(sizeof(tv), optlen); +} + +TEST_HARNESS_MAIN diff --git a/tools/testing/selftests/net/ioam6.sh 
b/tools/testing/selftests/net/ioam6.sh index b2b99889942f..845c26dd01a9 100755 --- a/tools/testing/selftests/net/ioam6.sh +++ b/tools/testing/selftests/net/ioam6.sh @@ -273,8 +273,8 @@ setup() ip -netns $ioam_node_beta link set ioam-veth-betaR name veth1 &>/dev/null ip -netns $ioam_node_gamma link set ioam-veth-gamma name veth0 &>/dev/null - ip -netns $ioam_node_alpha addr add 2001:db8:1::2/64 dev veth0 &>/dev/null ip -netns $ioam_node_alpha addr add 2001:db8:1::50/64 dev veth0 &>/dev/null + ip -netns $ioam_node_alpha addr add 2001:db8:1::2/64 dev veth0 &>/dev/null ip -netns $ioam_node_alpha link set veth0 up &>/dev/null ip -netns $ioam_node_alpha link set lo up &>/dev/null ip -netns $ioam_node_alpha route add 2001:db8:2::/64 \ diff --git a/tools/testing/selftests/net/lib/gro.c b/tools/testing/selftests/net/lib/gro.c index 11b16ae5f0e8..7a333155de1a 100644 --- a/tools/testing/selftests/net/lib/gro.c +++ b/tools/testing/selftests/net/lib/gro.c @@ -67,12 +67,14 @@ #include <errno.h> #include <error.h> #include <getopt.h> +#include <net/ethernet.h> +#include <net/if.h> #include <linux/filter.h> #include <linux/if_packet.h> +#include <linux/if_pppox.h> #include <linux/ipv6.h> #include <linux/net_tstamp.h> -#include <net/ethernet.h> -#include <net/if.h> +#include <linux/ppp_defs.h> #include <netinet/in.h> #include <netinet/ip.h> #include <netinet/ip6.h> @@ -102,9 +104,12 @@ #define MAX_LARGE_PKT_CNT ((IP_MAXPACKET - (MAX_HDR_LEN - ETH_HLEN)) / \ (ASSUMED_MTU - (MAX_HDR_LEN - ETH_HLEN))) #define MIN_EXTHDR_SIZE 8 +#define L2_HLEN_MAX (ETH_HLEN + PPPOE_SES_HLEN) #define EXT_PAYLOAD_1 "\x00\x00\x00\x00\x00\x00" #define EXT_PAYLOAD_2 "\x11\x11\x11\x11\x11\x11" +#define EXIT_OVER_COALESCE 42 + #define ipv6_optlen(p) (((p)->hdrlen+1) << 3) /* calculate IPv6 extension header len */ #define BUILD_BUG_ON(condition) ((void)sizeof(char[1 - 2*!!(condition)])) @@ -134,6 +139,7 @@ static int total_hdr_len = -1; static int ethhdr_proto = -1; static bool ipip; static bool ip6ip6; 
+static bool pppoe; static uint64_t txtime_ns; static int num_flows = 4; static bool order_check; @@ -171,6 +177,22 @@ static void vlog(const char *fmt, ...) } } +static void fill_pppoelayer(void *buf, int payload_len, uint16_t sid) +{ + struct pppoe_ppp_hdr { + struct pppoe_hdr eh; + __be16 proto; + } *ph = buf; + + payload_len += sizeof(struct tcphdr); + ph->eh.type = 1; + ph->eh.ver = 1; + ph->eh.code = 0; + ph->eh.sid = htons(sid); + ph->eh.length = htons(payload_len + sizeof(ph->proto)); + ph->proto = htons(proto == PF_INET ? PPP_IP : PPP_IPV6); +} + static void setup_sock_filter(int fd) { const int dport_off = tcp_offset + offsetof(struct tcphdr, dest); @@ -412,11 +434,15 @@ static void create_packet(void *buf, int seq_offset, int ack_offset, fill_networklayer(buf + inner_ip_off, payload_len, IPPROTO_TCP); if (inner_ip_off > ETH_HLEN) { - int encap_proto = (proto == PF_INET) ? - IPPROTO_IPIP : IPPROTO_IPV6; + if (pppoe) { + fill_pppoelayer(buf + ETH_HLEN, payload_len + ip_hdr_len, 0x1234); + } else { + int encap_proto = (proto == PF_INET) ? 
+ IPPROTO_IPIP : IPPROTO_IPV6; - fill_networklayer(buf + ETH_HLEN, - payload_len + ip_hdr_len, encap_proto); + fill_networklayer(buf + ETH_HLEN, + payload_len + ip_hdr_len, encap_proto); + } } fill_datalinklayer(buf); @@ -526,7 +552,7 @@ static void send_flags(int fd, struct sockaddr_ll *daddr, int psh, int syn, static void send_data_pkts(int fd, struct sockaddr_ll *daddr, int payload_len1, int payload_len2) { - static char buf[ETH_HLEN + IP_MAXPACKET]; + static char buf[L2_HLEN_MAX + IP_MAXPACKET]; create_packet(buf, 0, 0, payload_len1, 0); write_packet(fd, buf, total_hdr_len + payload_len1, daddr); @@ -1071,6 +1097,20 @@ static void send_fragment6(int fd, struct sockaddr_ll *daddr) write_packet(fd, buf, bufpkt_len, daddr); } +static void send_changed_pppoe_sid(int fd, struct sockaddr_ll *daddr) +{ + static char buf[MAX_HDR_LEN + PAYLOAD_LEN]; + int pkt_size = total_hdr_len + PAYLOAD_LEN; + struct pppoe_hdr *hdr = (struct pppoe_hdr *)(buf + ETH_HLEN); + + create_packet(buf, 0, 0, PAYLOAD_LEN, 0); + write_packet(fd, buf, pkt_size, daddr); + + create_packet(buf, PAYLOAD_LEN, 0, PAYLOAD_LEN, 0); + hdr->sid = htons(0x4321); + write_packet(fd, buf, pkt_size, daddr); +} + static void bind_packetsocket(int fd) { struct sockaddr_ll daddr = {}; @@ -1121,11 +1161,14 @@ static void recv_error(int fd, int rcv_errno) static void check_recv_pkts(int fd, int *correct_payload, int correct_num_pkts) { - static char buffer[IP_MAXPACKET + ETH_HLEN + 1]; - struct iphdr *iph = (struct iphdr *)(buffer + ETH_HLEN); - struct ipv6hdr *ip6h = (struct ipv6hdr *)(buffer + ETH_HLEN); + static char buffer[IP_MAXPACKET + L2_HLEN_MAX + 1]; + int nhoff = ETH_HLEN + (pppoe ? 
PPPOE_SES_HLEN : 0); + struct iphdr *iph = (struct iphdr *)(buffer + nhoff); + struct ipv6hdr *ip6h = (struct ipv6hdr *)(buffer + nhoff); struct tcphdr *tcph; bool bad_packet = false; + int bytes_expected = 0; + int bytes_received = 0; int tcp_ext_len = 0; int ip_ext_len = 0; int pkt_size = -1; @@ -1134,13 +1177,15 @@ static void check_recv_pkts(int fd, int *correct_payload, int i; vlog("Expected {"); - for (i = 0; i < correct_num_pkts; i++) + for (i = 0; i < correct_num_pkts; i++) { vlog("%d ", correct_payload[i]); + bytes_expected += correct_payload[i]; + } vlog("}, Total %d packets\nReceived {", correct_num_pkts); while (1) { ip_ext_len = 0; - pkt_size = recv(fd, buffer, IP_MAXPACKET + ETH_HLEN + 1, 0); + pkt_size = recv(fd, buffer, sizeof(buffer), 0); if (pkt_size < 0) recv_error(fd, errno); @@ -1170,9 +1215,17 @@ static void check_recv_pkts(int fd, int *correct_payload, vlog("[!=%d]", correct_payload[num_pkt]); bad_packet = true; } + bytes_received += data_len; num_pkt++; } vlog("}, Total %d packets.\n", num_pkt); + /* Signal over-coalescing explicitly, it's a hard failure, unlike + * under-coalescing which could be timing- or loss-related. + */ + if (num_pkt < correct_num_pkts && bytes_received == bytes_expected) + error(EXIT_OVER_COALESCE, 0, + "over-coalesced: got %d pkts vs expected %d (%d B)", + num_pkt, correct_num_pkts, bytes_received); if (num_pkt != correct_num_pkts) error(1, 0, "incorrect number of packets"); if (bad_packet) @@ -1183,9 +1236,10 @@ static void check_recv_pkts(int fd, int *correct_payload, static void check_capacity_pkts(int fd) { - static char buffer[IP_MAXPACKET + ETH_HLEN + 1]; - struct iphdr *iph = (struct iphdr *)(buffer + ETH_HLEN); - struct ipv6hdr *ip6h = (struct ipv6hdr *)(buffer + ETH_HLEN); + static char buffer[IP_MAXPACKET + L2_HLEN_MAX + 1]; + int nhoff = ETH_HLEN + (pppoe ? 
PPPOE_SES_HLEN : 0); + struct iphdr *iph = (struct iphdr *)(buffer + nhoff); + struct ipv6hdr *ip6h = (struct ipv6hdr *)(buffer + nhoff); int num_pkt = 0, num_coal = 0, pkt_idx; const char *fail_reason = NULL; int flow_order[num_flows * 2]; @@ -1203,7 +1257,7 @@ static void check_capacity_pkts(int fd) while (1) { ip_ext_len = 0; - pkt_size = recv(fd, buffer, IP_MAXPACKET + ETH_HLEN + 1, 0); + pkt_size = recv(fd, buffer, sizeof(buffer), 0); if (pkt_size < 0) recv_error(fd, errno); @@ -1499,6 +1553,12 @@ static void gro_sender(void) usleep(fin_delay_us); write_packet(txfd, fin_pkt, total_hdr_len, &daddr); + /* PPPoE sub-tests */ + } else if (strcmp(testname, "pppoe_sid") == 0) { + send_changed_pppoe_sid(txfd, &daddr); + usleep(fin_delay_us); + write_packet(txfd, fin_pkt, total_hdr_len, &daddr); + } else { error(1, 0, "Unknown testcase: %s", testname); } @@ -1716,6 +1776,12 @@ static void gro_receiver(void) } else if (strcmp(testname, "capacity") == 0) { check_capacity_pkts(rxfd); + } else if (strcmp(testname, "pppoe_sid") == 0) { + correct_payload[0] = PAYLOAD_LEN; + correct_payload[1] = PAYLOAD_LEN; + printf("different PPPoE session ID doesn't coalesce: "); + check_recv_pkts(rxfd, correct_payload, 2); + } else { error(1, 0, "Test case error: unknown testname %s", testname); } @@ -1734,6 +1800,8 @@ static void parse_args(int argc, char **argv) { "ipv6", no_argument, NULL, '6' }, { "ipip", no_argument, NULL, 'e' }, { "ip6ip6", no_argument, NULL, 'E' }, + { "pppoev4", no_argument, NULL, 'p' }, + { "pppoev6", no_argument, NULL, 'P' }, { "num-flows", required_argument, NULL, 'n' }, { "rx", no_argument, NULL, 'r' }, { "saddr", required_argument, NULL, 's' }, @@ -1745,7 +1813,7 @@ static void parse_args(int argc, char **argv) }; int c; - while ((c = getopt_long(argc, argv, "46d:D:eEi:n:rs:S:t:ov", opts, NULL)) != -1) { + while ((c = getopt_long(argc, argv, "46d:D:eEi:n:pPrs:S:t:ov", opts, NULL)) != -1) { switch (c) { case '4': proto = PF_INET; @@ -1765,6 +1833,16 @@ static 
void parse_args(int argc, char **argv) proto = PF_INET6; ethhdr_proto = htons(ETH_P_IPV6); break; + case 'p': + pppoe = true; + proto = PF_INET; + ethhdr_proto = htons(ETH_P_PPP_SES); + break; + case 'P': + pppoe = true; + proto = PF_INET6; + ethhdr_proto = htons(ETH_P_PPP_SES); + break; case 'd': addr4_dst = addr6_dst = optarg; break; @@ -1812,6 +1890,10 @@ int main(int argc, char **argv) } else if (ip6ip6) { tcp_offset = ETH_HLEN + sizeof(struct ipv6hdr) * 2; total_hdr_len = tcp_offset + sizeof(struct tcphdr); + } else if (pppoe) { + tcp_offset = ETH_HLEN + PPPOE_SES_HLEN + + (proto == PF_INET ? sizeof(struct iphdr) : sizeof(struct ipv6hdr)); + total_hdr_len = tcp_offset + sizeof(struct tcphdr); } else if (proto == PF_INET) { tcp_offset = ETH_HLEN + sizeof(struct iphdr); total_hdr_len = tcp_offset + sizeof(struct tcphdr); diff --git a/tools/testing/selftests/net/lib/py/__init__.py b/tools/testing/selftests/net/lib/py/__init__.py index 7c81d86a7e97..e58bdbdc58ee 100644 --- a/tools/testing/selftests/net/lib/py/__init__.py +++ b/tools/testing/selftests/net/lib/py/__init__.py @@ -10,11 +10,11 @@ from .ksft import KsftFailEx, KsftSkipEx, KsftXfailEx, ksft_pr, ksft_eq, \ ksft_ge, ksft_gt, ksft_lt, ksft_raises, ksft_busy_wait, \ ktap_result, ksft_disruptive, ksft_setup, ksft_run, ksft_exit, \ ksft_variants, KsftNamedVariant -from .netns import NetNS, NetNSEnter +from .netns import NetNS, NetNSEnter, UserNetNS from .nsim import NetdevSim, NetdevSimDev from .utils import CmdExitFailure, fd_read_timeout, cmd, bkg, defer, \ bpftool, ip, ethtool, bpftrace, rand_port, rand_ports, wait_port_listen, \ - wait_file, tool + wait_file, tool, tc from .bpf import bpf_map_set, bpf_map_dump, bpf_prog_map_ids from .ynl import NlError, NlctrlFamily, YnlFamily, \ EthtoolFamily, NetdevFamily, RtnlFamily, RtnlAddrFamily @@ -26,10 +26,10 @@ __all__ = ["KSRC", "ksft_is", "ksft_ge", "ksft_gt", "ksft_lt", "ksft_raises", "ksft_busy_wait", "ktap_result", "ksft_disruptive", "ksft_setup", 
"ksft_run", "ksft_exit", "ksft_variants", "KsftNamedVariant", - "NetNS", "NetNSEnter", + "NetNS", "NetNSEnter", "UserNetNS", "CmdExitFailure", "fd_read_timeout", "cmd", "bkg", "defer", "bpftool", "ip", "ethtool", "bpftrace", "rand_port", "rand_ports", - "wait_port_listen", "wait_file", "tool", + "wait_port_listen", "wait_file", "tool", "tc", "bpf_map_set", "bpf_map_dump", "bpf_prog_map_ids", "NetdevSim", "NetdevSimDev", "NetshaperFamily", "DevlinkFamily", "PSPFamily", "NlError", diff --git a/tools/testing/selftests/net/lib/py/netns.py b/tools/testing/selftests/net/lib/py/netns.py index 8e9317044eef..526f6aa80077 100644 --- a/tools/testing/selftests/net/lib/py/netns.py +++ b/tools/testing/selftests/net/lib/py/netns.py @@ -1,9 +1,14 @@ # SPDX-License-Identifier: GPL-2.0 -from .utils import ip import ctypes +import os import random import string +import subprocess +import time +from pathlib import Path + +from .utils import ip libc = ctypes.cdll.LoadLibrary('libc.so.6') @@ -34,6 +39,74 @@ class NetNS: return f"NetNS({self.name})" +class UserNetNS: + """Network namespace owned by a non-init user namespace.""" + + def __init__(self): + self.name = ''.join( + random.choice(string.ascii_lowercase) for _ in range(8)) + self.user_ns_path = f"/run/userns/{self.name}" + self.net_ns_path = f"/run/netns/{self.name}" + self._user_mounted = False + self._net_mounted = False + + os.makedirs("/run/userns", exist_ok=True) + os.makedirs("/run/netns", exist_ok=True) + + Path(self.user_ns_path).touch() + Path(self.net_ns_path).touch() + + with subprocess.Popen( + ["unshare", "--user", "--net", "--map-root-user", + "sleep", "infinity"]) as proc: + try: + pid = proc.pid + init_user = os.readlink("/proc/self/ns/user") + for _ in range(200): + try: + if os.readlink(f"/proc/{pid}/ns/user") != init_user: + break + except OSError: + pass + time.sleep(0.01) + else: + raise RuntimeError("unshare child did not create userns") + + subprocess.run(["mount", "--bind", f"/proc/{pid}/ns/user", + 
self.user_ns_path], check=True) + self._user_mounted = True + subprocess.run(["mount", "--bind", f"/proc/{pid}/ns/net", + self.net_ns_path], check=True) + self._net_mounted = True + finally: + proc.kill() + + def __del__(self): + if self._net_mounted: + subprocess.run(["umount", self.net_ns_path], check=False) + self._net_mounted = False + if self._user_mounted: + subprocess.run(["umount", self.user_ns_path], check=False) + self._user_mounted = False + for path in (self.net_ns_path, self.user_ns_path): + try: + os.unlink(path) + except OSError: + pass + + def __enter__(self): + return self + + def __exit__(self, ex_type, ex_value, ex_tb): + self.__del__() + + def __str__(self): + return self.name + + def __repr__(self): + return f"UserNetNS({self.name})" + + class NetNSEnter: def __init__(self, ns_name): self.ns_path = f"/run/netns/{ns_name}" diff --git a/tools/testing/selftests/net/lib/py/utils.py b/tools/testing/selftests/net/lib/py/utils.py index 6c44a3d2bbf7..184bb04343f6 100644 --- a/tools/testing/selftests/net/lib/py/utils.py +++ b/tools/testing/selftests/net/lib/py/utils.py @@ -23,6 +23,10 @@ class CmdExitFailure(Exception): self.cmd = cmd_obj +class CmdExitZeroFailure(CmdExitFailure): + """ Command succeeded (returned zero exit code), but expected failure. """ + + def fd_read_timeout(fd, timeout): rlist, _, _ = select.select([fd], [], [], timeout) if rlist: @@ -39,10 +43,16 @@ class cmd: Use bkg() instead to run a command in the background. 
""" - def __init__(self, comm, shell=None, fail=True, ns=None, background=False, - host=None, timeout=5, ksft_ready=None, ksft_wait=None): + def __init__(self, comm, shell=None, fail=True, expect_fail=False, ns=None, + background=False, host=None, timeout=20, ksft_ready=None, + ksft_wait=None): if ns: - comm = f'ip netns exec {ns} ' + comm + if hasattr(ns, 'user_ns_path'): + comm = (f'nsenter --user={ns.user_ns_path} ' + f'--net={ns.net_ns_path} --setuid=0 --setgid=0 -- ' + + comm) + else: + comm = f'ip netns exec {ns} ' + comm self.stdout = None self.stderr = None @@ -88,7 +98,8 @@ class cmd: self._process_terminate(terminate=terminate, timeout=1) raise CmdInitFailure("Did not receive ready message", self) if not background: - self.process(terminate=False, fail=fail, timeout=timeout) + self.process(terminate=False, fail=fail, expect_fail=expect_fail, + timeout=timeout) def _process_terminate(self, terminate, timeout): if terminate: @@ -102,7 +113,7 @@ class cmd: return stdout, stderr - def process(self, terminate=True, fail=None, timeout=5): + def process(self, terminate=True, fail=None, expect_fail=False, timeout=20): if fail is None: fail = not terminate @@ -111,10 +122,19 @@ class cmd: stdout, stderr = self._process_terminate(terminate=terminate, timeout=timeout) - if self.proc.returncode != 0 and fail: + + # Fail on unexpected test failure if fail. + # Fail on unexpected test success if expect_fail. + # Fail on negative returncode if either: + # Set by subprocess on crash or signal, this is never expected failure. 
+ if (self.proc.returncode != 0 and fail or + (self.proc.returncode < 0 and expect_fail)): if len(stderr) > 0 and stderr[-1] == "\n": stderr = stderr[:-1] raise CmdExitFailure("Command failed", self) + elif self.proc.returncode == 0 and expect_fail: + raise CmdExitZeroFailure("Command succeeded (expected fail)", self) + def __repr__(self): def str_fmt(name, s): @@ -157,14 +177,17 @@ class bkg(cmd): with bkg("my_binary", ksft_wait=5): """ - def __init__(self, comm, shell=None, fail=None, ns=None, host=None, - exit_wait=False, ksft_ready=None, ksft_wait=None): + def __init__(self, comm, shell=None, fail=None, expect_fail=None, + ns=None, host=None, exit_wait=False, ksft_ready=None, + ksft_wait=None): super().__init__(comm, background=True, - shell=shell, fail=fail, ns=ns, host=host, - ksft_ready=ksft_ready, ksft_wait=ksft_wait) + shell=shell, fail=fail, expect_fail=expect_fail, + ns=ns, host=host, ksft_ready=ksft_ready, + ksft_wait=ksft_wait) self.terminate = not exit_wait and not ksft_wait self._exit_wait = exit_wait self.check_fail = fail + self.expect_fail = expect_fail if shell and self.terminate: print("# Warning: combining shell and terminate is risky!") @@ -179,7 +202,8 @@ class bkg(cmd): # since forcing termination silences failures with fail=None if self.proc.poll() is None: terminate = terminate or (self._exit_wait and ex_type is not None) - return self.process(terminate=terminate, fail=self.check_fail) + return self.process(terminate=terminate, fail=self.check_fail, + expect_fail=self.expect_fail) GLOBAL_DEFER_QUEUE = [] @@ -220,7 +244,10 @@ class defer: def tool(name, args, json=None, ns=None, host=None): cmd_str = name + ' ' if json: - cmd_str += '--json ' + if name == 'tc': + cmd_str += '-json ' + else: + cmd_str += '--json ' cmd_str += args cmd_obj = cmd(cmd_str, ns=ns, host=host) if json: @@ -238,6 +265,13 @@ def ip(args, json=None, ns=None, host=None): return tool('ip', args, json=json, host=host) +def tc(args, json=None, ns=None, host=None): + """ 
Helper to call tc with standard set of optional args. """ + if ns: + args = f'-netns {ns} ' + args + return tool('tc', args, json=json, host=host) + + def ethtool(args, json=None, ns=None, host=None): return tool('ethtool', args, json=json, ns=ns, host=host) diff --git a/tools/testing/selftests/net/lib/xdp_native.bpf.c b/tools/testing/selftests/net/lib/xdp_native.bpf.c index 64f05229ab24..ded3f896e622 100644 --- a/tools/testing/selftests/net/lib/xdp_native.bpf.c +++ b/tools/testing/selftests/net/lib/xdp_native.bpf.c @@ -268,6 +268,17 @@ static int xdp_mode_tx_handler(struct xdp_md *ctx, __u16 port) return XDP_PASS; } +static __always_inline __u16 csum_fold_helper(__u32 csum) +{ + csum = (csum & 0xffff) + (csum >> 16); + return ~((csum & 0xffff) + (csum >> 16)); +} + +static __always_inline __u16 csum_fold_udp_helper(__u32 csum) +{ + return csum_fold_helper(csum) ? : 0xffff; +} + static void *update_pkt(struct xdp_md *ctx, __s16 offset, __u32 *udp_csum) { void *data_end = (void *)(long)ctx->data_end; @@ -281,21 +292,22 @@ static void *update_pkt(struct xdp_md *ctx, __s16 offset, __u32 *udp_csum) if (eth->h_proto == bpf_htons(ETH_P_IP)) { struct iphdr *iph = data + sizeof(*eth); - __u16 total_len; if (iph + 1 > (struct iphdr *)data_end) return NULL; - iph->tot_len = bpf_htons(bpf_ntohs(iph->tot_len) + offset); - udph = (void *)eth + sizeof(*iph) + sizeof(*eth); if (!udph || udph + 1 > (struct udphdr *)data_end) return NULL; - len_new = bpf_htons(bpf_ntohs(udph->len) + offset); + len = iph->tot_len; + len_new = bpf_htons(bpf_ntohs(len) + offset); + iph->tot_len = len_new; + iph->check = csum_fold_helper( + bpf_csum_diff(&len, sizeof(len), &len_new, + sizeof(len_new), ~((__u32)iph->check))); } else if (eth->h_proto == bpf_htons(ETH_P_IPV6)) { struct ipv6hdr *ipv6h = data + sizeof(*eth); - __u16 payload_len; if (ipv6h + 1 > (struct ipv6hdr *)data_end) return NULL; @@ -304,33 +316,27 @@ static void *update_pkt(struct xdp_md *ctx, __s16 offset, __u32 *udp_csum) if (!udph 
|| udph + 1 > (struct udphdr *)data_end) return NULL; - *udp_csum = ~((__u32)udph->check); - len = ipv6h->payload_len; len_new = bpf_htons(bpf_ntohs(len) + offset); ipv6h->payload_len = len_new; - - *udp_csum = bpf_csum_diff(&len, sizeof(len), &len_new, - sizeof(len_new), *udp_csum); - - len = udph->len; - len_new = bpf_htons(bpf_ntohs(udph->len) + offset); - *udp_csum = bpf_csum_diff(&len, sizeof(len), &len_new, - sizeof(len_new), *udp_csum); } else { return NULL; } + len = udph->len; + len_new = bpf_htons(bpf_ntohs(len) + offset); + + *udp_csum = ~((__u32)udph->check); + *udp_csum = bpf_csum_diff(&len, sizeof(len), &len_new, + sizeof(len_new), *udp_csum); + *udp_csum = bpf_csum_diff(&len, sizeof(len), &len_new, + sizeof(len_new), *udp_csum); + udph->len = len_new; return udph; } -static __u16 csum_fold_helper(__u32 csum) -{ - return ~((csum & 0xffff) + (csum >> 16)) ? : 0xffff; -} - static int xdp_adjst_tail_shrnk_data(struct xdp_md *ctx, __u16 offset, unsigned long hdr_len) { @@ -359,7 +365,7 @@ static int xdp_adjst_tail_shrnk_data(struct xdp_md *ctx, __u16 offset, return -1; udp_csum = bpf_csum_diff((__be32 *)tmp_buff, offset, 0, 0, udp_csum); - udph->check = (__u16)csum_fold_helper(udp_csum); + udph->check = (__u16)csum_fold_udp_helper(udp_csum); if (bpf_xdp_adjust_tail(ctx, 0 - offset) < 0) return -1; @@ -403,7 +409,7 @@ static int xdp_adjst_tail_grow_data(struct xdp_md *ctx, __u16 offset) return -1; udp_csum = bpf_csum_diff(0, 0, (__be32 *)tmp_buff, offset, udp_csum); - udph->check = (__u16)csum_fold_helper(udp_csum); + udph->check = (__u16)csum_fold_udp_helper(udp_csum); buff_len = bpf_xdp_get_buff_len(ctx); @@ -484,8 +490,7 @@ static int xdp_adjst_head_shrnk_data(struct xdp_md *ctx, __u64 hdr_len, return -1; udp_csum = bpf_csum_diff((__be32 *)tmp_buff, offset, 0, 0, udp_csum); - - udph->check = (__u16)csum_fold_helper(udp_csum); + udph->check = (__u16)csum_fold_udp_helper(udp_csum); if (bpf_xdp_load_bytes(ctx, 0, tmp_buff, MAX_ADJST_OFFSET) < 0) return -1; 
@@ -542,7 +547,7 @@ static int xdp_adjst_head_grow_data(struct xdp_md *ctx, __u64 hdr_len, return -1; udp_csum = bpf_csum_diff(0, 0, (__be32 *)data_buff, offset, udp_csum); - udph->check = (__u16)csum_fold_helper(udp_csum); + udph->check = (__u16)csum_fold_udp_helper(udp_csum); if (hdr_len > MAX_ADJST_OFFSET || hdr_len == 0) return -1; diff --git a/tools/testing/selftests/net/link_netns.py b/tools/testing/selftests/net/link_netns.py index aab043c59d69..6d1f863b6262 100755 --- a/tools/testing/selftests/net/link_netns.py +++ b/tools/testing/selftests/net/link_netns.py @@ -3,13 +3,14 @@ import time -from lib.py import ksft_run, ksft_exit, ksft_true +from lib.py import ksft_run, ksft_exit, ksft_eq, ksft_true from lib.py import ip from lib.py import NetNS, NetNSEnter from lib.py import RtnlFamily LINK_NETNSID = 100 +LINK_NETNSID2 = 200 def test_event() -> None: @@ -32,6 +33,57 @@ def test_event() -> None: "Received unexpected link notification") +def test_event_all_nsid() -> None: + """NETLINK_LISTEN_ALL_NSID notifications: local events must not + carry nsid even with a self-referential mapping. Remote events + must carry the correct nsid.""" + + with NetNS() as ns1, NetNS() as ns2: + net1, net2 = str(ns1), str(ns2) + + with NetNSEnter(net1): + rtnl = RtnlFamily() + rtnl.ntf_listen_all_nsid() + rtnl.ntf_subscribe("rtnlgrp-link") + + # Case 1: no nsid assigned, local event, no nsid expected. + ip("link add dummy-lo type dummy", ns=net1) + + # Case 2: self-referential nsid, local event, still no nsid. + ip(f"netns set {net1} {LINK_NETNSID}", ns=net1) + ip("link add dummy-sr type dummy", ns=net1) + + # Case 3: remote event, nsid present. + ip(f"netns set {net2} {LINK_NETNSID2}", ns=net1) + ip("link add dummy-re type dummy", ns=net2) + + # Collect the three newlink events, ignoring unrelated noise. 
+ events = {} + for msg in rtnl.poll_ntf(duration=1): + if msg['name'] == 'getlink': + ifname = msg['msg'].get('ifname') + if ifname in ('dummy-lo', 'dummy-sr', 'dummy-re'): + events[ifname] = msg + if len(events) == 3: + break + + ksft_true('dummy-lo' in events, "missing local event") + ksft_true(events['dummy-lo'].get('nsid') is None, + "local event without nsid should not carry nsid") + + ksft_true('dummy-sr' in events, "missing self-ref event") + ksft_true(events['dummy-sr'].get('nsid') is None, + "local event with self-ref nsid should not carry nsid") + + ksft_true('dummy-re' in events, "missing remote event") + ksft_eq(events['dummy-re'].get('nsid'), LINK_NETNSID2, + "remote event should carry nsid") + + ip("link del dummy-lo", ns=net1) + ip("link del dummy-sr", ns=net1) + ip("link del dummy-re", ns=net2) + + def validate_link_netns(netns, ifname, link_netnsid) -> bool: link_info = ip(f"-d link show dev {ifname}", ns=netns, json=True) if not link_info: @@ -133,7 +185,12 @@ def test_peer_net() -> None: def main() -> None: - ksft_run([test_event, test_link_net, test_peer_net]) + ksft_run([ + test_event, + test_event_all_nsid, + test_link_net, + test_peer_net, + ]) ksft_exit() diff --git a/tools/testing/selftests/net/mptcp/mptcp_connect.sh b/tools/testing/selftests/net/mptcp/mptcp_connect.sh index a6447f7a31fe..d158678fa6ab 100755 --- a/tools/testing/selftests/net/mptcp/mptcp_connect.sh +++ b/tools/testing/selftests/net/mptcp/mptcp_connect.sh @@ -401,7 +401,7 @@ do_transfer() mptcp_lib_wait_local_port_listen "${listener_ns}" "${port}" local start - start=$(date +%s%3N) + start=$(date +%s%N) ip netns exec ${connector_ns} \ ./mptcp_connect -t ${timeout_poll} -p $port -s ${cl_proto} \ $extra_args $connect_addr < "$cin" > "$cout" & @@ -423,7 +423,7 @@ do_transfer() fi local stop - stop=$(date +%s%3N) + stop=$(date +%s%N) if $capture; then sleep 1 @@ -439,7 +439,7 @@ do_transfer() fi local duration - duration=$((stop-start)) + duration=$(((stop-start) / 1000000)) 
printf "(duration %05sms) " "${duration}" if [ ${rets} -ne 0 ] || [ ${retc} -ne 0 ] || [ ${timeout_pid} -ne 0 ]; then mptcp_lib_pr_fail "client exit code $retc, server $rets" diff --git a/tools/testing/selftests/net/mptcp/mptcp_join.sh b/tools/testing/selftests/net/mptcp/mptcp_join.sh index beec41f6662a..c0aeffd5cb71 100755 --- a/tools/testing/selftests/net/mptcp/mptcp_join.sh +++ b/tools/testing/selftests/net/mptcp/mptcp_join.sh @@ -63,6 +63,7 @@ unset fastclose unset fullmesh unset speed unset bind_addr +unset ifaces_nr unset join_syn_rej unset join_csum_ns1 unset join_csum_ns2 @@ -86,6 +87,10 @@ unset fb_mpc_data unset fb_md5_sig unset fb_dss +unset add_addr_tx_nr +unset add_addr_echo_tx_nr +unset add_addr_drop_tx_nr + # generated using "nfbpf_compile '(ip && (ip[54] & 0xf0) == 0x30) || # (ip6 && (ip6[74] & 0xf0) == 0x30)'" CBPF_MPTCP_SUBOPTION_ADD_ADDR="14, @@ -146,7 +151,7 @@ init_partial() # ns1eth4 ns2eth4 local i - for i in $(seq 1 4); do + for i in $(seq 1 "${ifaces_nr:-4}"); do ip link add ns1eth$i netns "$ns1" type veth peer name ns2eth$i netns "$ns2" ip -net "$ns1" addr add 10.0.$i.1/24 dev ns1eth$i ip -net "$ns1" addr add dead:beef:$i::1/64 dev ns1eth$i nodad @@ -165,7 +170,7 @@ init_partial() init_shapers() { local i - for i in $(seq 1 4); do + for i in $(seq 1 "${ifaces_nr:-4}"); do tc -n $ns1 qdisc add dev ns1eth$i root netem rate 20mbit delay 1ms tc -n $ns2 qdisc add dev ns2eth$i root netem rate 20mbit delay 1ms done @@ -512,6 +517,19 @@ reset_with_tcp_filter() fi } +# For kernel supporting limits above 8 +# $1: title ; $2,4: addrs limit ns1,2 ; $3,5: subflows limit ns1,2 +reset_with_high_limits() +{ + reset "${1}" || return 1 + + if ! pm_nl_set_limits "${ns1}" "${2}" "${3}" 2>/dev/null || + ! 
pm_nl_set_limits "${ns2}" "${4}" "${5}" 2>/dev/null; then + mark_as_skipped "unable to set the limits to ${*:2}" + return 1 + fi +} + # $1: err msg fail_test() { @@ -1696,6 +1714,9 @@ chk_add_nr() local ack_nr=$port_nr local mis_syn_nr=0 local mis_ack_nr=0 + local add_tx_nr=${add_addr_tx_nr:-${add_nr}} + local echo_tx_nr=${add_addr_echo_tx_nr:-${echo_nr}} + local drop_tx_nr=${add_addr_drop_tx_nr:-0} local ns_tx=$ns1 local ns_rx=$ns2 local tx="" @@ -1797,34 +1818,25 @@ chk_add_nr() print_ok fi fi -} - -chk_add_tx_nr() -{ - local add_tx_nr=$1 - local echo_tx_nr=$2 - local count - print_check "add addr tx" - count=$(mptcp_lib_get_counter ${ns1} "MPTcpExtAddAddrTx") - if [ -z "$count" ]; then - print_skip + count=$(mptcp_lib_get_counter ${ns_tx} "MPTcpExtAddAddrTx") # Tolerate more ADD_ADDR then expected (if any), due to retransmissions - elif [ "$count" != "$add_tx_nr" ] && - { [ "$add_tx_nr" -eq 0 ] || [ "$count" -lt "$add_tx_nr" ]; }; then + if [ -n "$count" ] && [ "$count" != "$add_tx_nr" ] && + { [ "$add_tx_nr" -eq 0 ] || [ "$count" -lt "$add_tx_nr" ]; }; then + print_check "add addr tx" fail_test "got $count ADD_ADDR[s] TX, expected $add_tx_nr" - else - print_ok fi - print_check "add addr echo tx" - count=$(mptcp_lib_get_counter ${ns2} "MPTcpExtEchoAddTx") - if [ -z "$count" ]; then - print_skip - elif [ "$count" != "$echo_tx_nr" ]; then + count=$(mptcp_lib_get_counter ${ns_rx} "MPTcpExtEchoAddTx") + if [ -n "$count" ] && [ "$count" != "$echo_tx_nr" ]; then + print_check "add addr echo tx" fail_test "got $count ADD_ADDR echo[s] TX, expected $echo_tx_nr" - else - print_ok + fi + + count=$(mptcp_lib_get_counter ${ns_tx} "MPTcpExtAddAddrTxDrop") + if [ -n "$count" ] && [ "$count" != "$drop_tx_nr" ]; then + print_check "add addr tx drop" + fail_test "got $count ADD_ADDR drop[s] TX, expected $drop_tx_nr" fi } @@ -2237,7 +2249,6 @@ signal_address_tests() pm_nl_add_endpoint $ns1 10.0.2.1 flags signal run_tests $ns1 $ns2 10.0.1.1 chk_join_nr 0 0 0 - chk_add_tx_nr 1 1 
chk_add_nr 1 1 fi @@ -2515,8 +2526,8 @@ add_addr_timeout_tests() speed=slow \ run_tests $ns1 $ns2 10.0.1.1 chk_join_nr 1 1 1 - chk_add_tx_nr 4 4 - chk_add_nr 4 0 + add_addr_echo_tx_nr=4 \ + chk_add_nr 4 0 fi # add_addr timeout IPv6 @@ -2527,7 +2538,8 @@ add_addr_timeout_tests() speed=slow \ run_tests $ns1 $ns2 dead:beef:1::1 chk_join_nr 1 1 1 - chk_add_nr 4 0 + add_addr_echo_tx_nr=4 \ + chk_add_nr 4 0 fi # signal addresses timeout @@ -2539,7 +2551,8 @@ add_addr_timeout_tests() speed=10 \ run_tests $ns1 $ns2 10.0.1.1 chk_join_nr 2 2 2 - chk_add_nr 8 0 + add_addr_echo_tx_nr=8 \ + chk_add_nr 8 0 fi # signal invalid addresses timeout @@ -2552,7 +2565,8 @@ add_addr_timeout_tests() run_tests $ns1 $ns2 10.0.1.1 join_syn_tx=2 \ chk_join_nr 1 1 1 - chk_add_nr 8 0 + add_addr_echo_tx_nr=7 \ + chk_add_nr 8 0 fi } @@ -3184,6 +3198,17 @@ add_addr_ports_tests() chk_add_nr 1 1 1 fi + # signal address v6 with port + if reset "signal address v6 with port" && + continue_if mptcp_lib_has_file '/proc/sys/net/mptcp/add_addr_v6_port_drop_ts'; then + pm_nl_set_limits $ns1 0 1 + pm_nl_set_limits $ns2 1 1 + pm_nl_add_endpoint $ns1 dead:beef:2::1 flags signal port 10100 + run_tests $ns1 $ns2 dead:beef:1::1 + chk_join_nr 1 1 1 + chk_add_nr 1 1 1 + fi + # subflow and signal with port if reset "subflow and signal with port"; then pm_nl_add_endpoint $ns1 10.0.2.1 flags signal port 10100 @@ -3278,6 +3303,21 @@ add_addr_ports_tests() chk_mpc_endp_attempt ${retl} 1 fi + + # first signal address drops, second one still progresses + if reset "signal addr list progresses after tx drop"; then + pm_nl_set_limits $ns1 0 2 + pm_nl_set_limits $ns2 1 0 + ip netns exec $ns1 sysctl -q net.mptcp.add_addr_v6_port_drop_ts=0 2>/dev/null || true + ip netns exec $ns1 sysctl -q net.ipv4.tcp_timestamps=1 + ip netns exec $ns2 sysctl -q net.ipv4.tcp_timestamps=1 + + pm_nl_add_endpoint $ns1 dead:beef:2::1 flags signal port 10100 + pm_nl_add_endpoint $ns1 dead:beef:3::1 flags signal + run_tests $ns1 $ns2 dead:beef:1::1 + 
add_addr_drop_tx_nr=1 \ + chk_add_nr 1 1 0 + fi } bind_tests() @@ -3669,6 +3709,21 @@ fullmesh_tests() chk_prio_nr 0 1 1 0 chk_rm_nr 0 1 fi + + # fullmesh in 8x8 to create 63 additional subflows + if ifaces_nr=8 reset_with_high_limits "fullmesh 8x8" 64 64 64 64; then + # higher chance to lose ADD_ADDR: allow retransmissions + ip netns exec $ns1 sysctl -q net.mptcp.add_addr_timeout=1 + local i + for i in $(seq 1 8); do + pm_nl_add_endpoint $ns2 10.0.$i.2 flags subflow,fullmesh + pm_nl_add_endpoint $ns1 10.0.$i.1 flags signal + done + speed=slow \ + run_tests $ns1 $ns2 10.0.1.1 + chk_join_nr 63 63 63 + fi + } fastclose_tests() @@ -4069,6 +4124,10 @@ userspace_tests() chk_rm_nr 0 1 chk_mptcp_info subflows 0 subflows 0 chk_subflows_total 1 1 + # check counters are not affected by errors at creation time + userspace_pm_add_sf $ns2 10.0.12.2 10 2>/dev/null + chk_mptcp_info subflows 0 subflows 0 + chk_subflows_total 1 1 kill_events_pids mptcp_lib_kill_group_wait $tests_pid fi diff --git a/tools/testing/selftests/net/mptcp/mptcp_lib.sh b/tools/testing/selftests/net/mptcp/mptcp_lib.sh index 5fea7e7df628..5ef6033775c8 100644 --- a/tools/testing/selftests/net/mptcp/mptcp_lib.sh +++ b/tools/testing/selftests/net/mptcp/mptcp_lib.sh @@ -28,7 +28,7 @@ declare -rx MPTCP_LIB_AF_INET6=10 MPTCP_LIB_SUBTESTS=() MPTCP_LIB_SUBTESTS_DUPLICATED=0 MPTCP_LIB_SUBTEST_FLAKY=0 -MPTCP_LIB_SUBTESTS_LAST_TS_MS= +MPTCP_LIB_SUBTESTS_LAST_TS_NS= MPTCP_LIB_TEST_COUNTER=0 MPTCP_LIB_TEST_FORMAT="%02u %-50s" MPTCP_LIB_IP_MPTCP=0 @@ -236,7 +236,7 @@ mptcp_lib_kversion_ge() { } mptcp_lib_subtests_last_ts_reset() { - MPTCP_LIB_SUBTESTS_LAST_TS_MS="$(date +%s%3N)" + MPTCP_LIB_SUBTESTS_LAST_TS_NS="$(date +%s%N)" } mptcp_lib_subtests_last_ts_reset @@ -255,7 +255,7 @@ __mptcp_lib_result_check_duplicated() { __mptcp_lib_result_add() { local result="${1}" local time="time=" - local ts_prev_ms + local ts_prev_ns shift local id=$((${#MPTCP_LIB_SUBTESTS[@]} + 1)) @@ -265,9 +265,9 @@ __mptcp_lib_result_add() { # not 
to add two '#' [[ "${*}" != *"#"* ]] && time="# ${time}" - ts_prev_ms="${MPTCP_LIB_SUBTESTS_LAST_TS_MS}" + ts_prev_ns="${MPTCP_LIB_SUBTESTS_LAST_TS_NS}" mptcp_lib_subtests_last_ts_reset - time+="$((MPTCP_LIB_SUBTESTS_LAST_TS_MS - ts_prev_ms))ms" + time+="$(((MPTCP_LIB_SUBTESTS_LAST_TS_NS - ts_prev_ns) / 1000000))ms" MPTCP_LIB_SUBTESTS+=("${result} ${id} - ${KSFT_TEST}: ${*} ${time}") } @@ -474,20 +474,24 @@ mptcp_lib_wait_local_port_listen() { wait_local_port_listen "${@}" "tcp" } +# $1: error file, $2: cmd, $3: expected msg, [$4: expected error] mptcp_lib_check_output() { local err="${1}" local cmd="${2}" local expected="${3}" + local exp_error="${4:-0}" local cmd_ret=0 local out - if ! out=$(${cmd} 2>"${err}"); then - cmd_ret=${?} - fi + out=$(${cmd} 2>"${err}") || cmd_ret=1 - if [ ${cmd_ret} -ne 0 ]; then - mptcp_lib_pr_fail "command execution '${cmd}' stderr" - cat "${err}" + if [ "${cmd_ret}" != "${exp_error}" ]; then + mptcp_lib_pr_fail "unexpected returned code for '${cmd}', info:" + if [ "${exp_error}" = 0 ]; then + cat "${err}" + else + echo "${out}" + fi return 2 elif [ "${out}" = "${expected}" ]; then return 0 diff --git a/tools/testing/selftests/net/mptcp/mptcp_sockopt.sh b/tools/testing/selftests/net/mptcp/mptcp_sockopt.sh index ab8bce06b262..e850a87429b6 100755 --- a/tools/testing/selftests/net/mptcp/mptcp_sockopt.sh +++ b/tools/testing/selftests/net/mptcp/mptcp_sockopt.sh @@ -355,10 +355,10 @@ sin=$(mktemp) sout=$(mktemp) cin=$(mktemp) cout=$(mktemp) +trap cleanup EXIT init make_file "$cin" "client" 1 make_file "$sin" "server" 1 -trap cleanup EXIT mptcp_lib_subtests_last_ts_reset run_tests $ns1 $ns2 10.0.1.1 diff --git a/tools/testing/selftests/net/mptcp/pm_netlink.sh b/tools/testing/selftests/net/mptcp/pm_netlink.sh index 123d9d7a0278..21bfe1311f11 100755 --- a/tools/testing/selftests/net/mptcp/pm_netlink.sh +++ b/tools/testing/selftests/net/mptcp/pm_netlink.sh @@ -66,6 +66,15 @@ get_limits() { fi } +get_limits_nb() { + if mptcp_lib_is_ip_mptcp; 
then + ip -n "${ns1}" mptcp limits | awk '{ print $2" "$4 }' + else + ip netns exec "${ns1}" ./pm_nl_ctl limits | \ + awk '{ printf "%s ", $2 }' + fi +} + format_endpoints() { mptcp_lib_pm_nl_format_endpoints "${@}" } @@ -122,10 +131,12 @@ check() local cmd="$1" local expected="$2" local msg="$3" + local exp_error="$4" local rc=0 mptcp_lib_print_title "$msg" - mptcp_lib_check_output "${err}" "${cmd}" "${expected}" || rc=${?} + mptcp_lib_check_output "${err}" "${cmd}" "${expected}" "${exp_error}" || + rc=${?} if [ ${rc} -eq 2 ]; then mptcp_lib_result_fail "${msg} # error ${rc}" ret=${KSFT_FAIL} @@ -158,53 +169,62 @@ check "show_endpoints" \ "3,10.0.1.3,signal backup")" "dump addrs" del_endpoint 2 -check "get_endpoint 2" "" "simple del addr" +check "get_endpoint 2" "" "simple del addr" 1 check "show_endpoints" \ "$(format_endpoints "1,10.0.1.1" \ "3,10.0.1.3,signal backup")" "dump addrs after del" +add_endpoint 10.0.1.2 id 2 add_endpoint 10.0.1.3 2>/dev/null -check "get_endpoint 4" "" "duplicate addr" +check "get_endpoint 4" "" "duplicate addr" 1 add_endpoint 10.0.1.4 flags signal check "get_endpoint 4" "$(format_endpoints "4,10.0.1.4,signal")" "id addr increment" -for i in $(seq 5 9); do - add_endpoint "10.0.1.${i}" flags signal >/dev/null 2>&1 -done -check "get_endpoint 9" "$(format_endpoints "9,10.0.1.9,signal")" "hard addr limit" -check "get_endpoint 10" "" "above hard addr limit" +read -r -a default_limits_nb <<< "$(get_limits_nb)" +# limits have been increased: from 8 to 64 for subflows/add_addr & 255 for endp +if mptcp_lib_expect_all_features || set_limits 9 9 2>/dev/null; then + max_endp=255 + max_limits=64 +else + max_endp=8 + max_limits=8 +fi +set_limits "${default_limits_nb[@]}" -del_endpoint 9 -for i in $(seq 10 255); do - add_endpoint 10.0.0.9 id "${i}" - del_endpoint "${i}" +for i in $(seq 5 ${max_endp}); do + add_endpoint "10.0.0.${i}" id "${i}" done -check "show_endpoints" \ - "$(format_endpoints "1,10.0.1.1" \ - "3,10.0.1.3,signal backup" \ - 
"4,10.0.1.4,signal" \ - "5,10.0.1.5,signal" \ - "6,10.0.1.6,signal" \ - "7,10.0.1.7,signal" \ - "8,10.0.1.8,signal")" "id limit" +check "get_endpoint ${max_endp}" \ + "$(format_endpoints "${max_endp},10.0.0.${max_endp}")" "id limit" + +if add_endpoint '10.0.0.1' &>/dev/null; then + hardlimit="no error" +else + hardlimit="error" +fi +check "echo ${hardlimit}" "error" "above hard addr limit" flush_endpoint check "show_endpoints" "" "flush addrs" -add_endpoint 10.0.1.1 flags unknown -check "show_endpoints" "$(format_endpoints "1,10.0.1.1")" "ignore unknown flags" -flush_endpoint +# "unknown" flag is only supported by pm_nl_ctl +if ! mptcp_lib_is_ip_mptcp; then + add_endpoint 10.0.1.1 flags unknown + check "show_endpoints" "$(format_endpoints "1,10.0.1.1")" \ + "ignore unknown flags" + flush_endpoint +fi -set_limits 9 1 2>/dev/null +set_limits $((max_limits + 1)) 1 2>/dev/null check "get_limits" "${default_limits}" "rcv addrs above hard limit" -set_limits 1 9 2>/dev/null +set_limits 1 $((max_limits + 1)) 2>/dev/null check "get_limits" "${default_limits}" "subflows above hard limit" -set_limits 8 8 +set_limits ${max_limits} ${max_limits} flush_endpoint ## to make sure it doesn't affect the limits -check "get_limits" "$(format_limits 8 8)" "set limits" +check "get_limits" "$(format_limits ${max_limits} ${max_limits})" "set limits" flush_endpoint add_endpoint 10.0.1.1 diff --git a/tools/testing/selftests/net/mptcp/pm_nl_ctl.c b/tools/testing/selftests/net/mptcp/pm_nl_ctl.c index 99eecccbf0c8..78180da1efcc 100644 --- a/tools/testing/selftests/net/mptcp/pm_nl_ctl.c +++ b/tools/testing/selftests/net/mptcp/pm_nl_ctl.c @@ -217,8 +217,6 @@ static int capture_events(int fd, int event_group) /* do a netlink command and, if max > 0, fetch the reply ; nh's size >1024B */ static int do_nl_req(int fd, struct nlmsghdr *nh, int len, int max) { - struct sockaddr_nl nladdr = { .nl_family = AF_NETLINK }; - socklen_t addr_len; void *data = nh; int rem, ret; int err = 0; @@ -230,15 +228,15 
@@ static int do_nl_req(int fd, struct nlmsghdr *nh, int len, int max) } nh->nlmsg_len = len; - ret = sendto(fd, data, len, 0, (void *)&nladdr, sizeof(nladdr)); + ret = send(fd, data, len, 0); if (ret != len) error(1, errno, "send netlink: %uB != %uB\n", ret, len); - addr_len = sizeof(nladdr); - rem = ret = recvfrom(fd, data, max, 0, (void *)&nladdr, &addr_len); + ret = recv(fd, data, max, 0); if (ret < 0) error(1, errno, "recv netlink: %uB\n", ret); + rem = ret; /* Beware: the NLMSG_NEXT macro updates the 'rem' argument */ for (; NLMSG_OK(nh, rem); nh = NLMSG_NEXT(nh, rem)) { if (nh->nlmsg_type == NLMSG_DONE) diff --git a/tools/testing/selftests/net/mptcp/simult_flows.sh b/tools/testing/selftests/net/mptcp/simult_flows.sh index d11a8b949aab..7b9aabe10170 100755 --- a/tools/testing/selftests/net/mptcp/simult_flows.sh +++ b/tools/testing/selftests/net/mptcp/simult_flows.sh @@ -76,13 +76,13 @@ setup() ip -net "$ns1" addr add 10.0.1.1/24 dev ns1eth1 ip -net "$ns1" addr add dead:beef:1::1/64 dev ns1eth1 nodad - ip -net "$ns1" link set ns1eth1 up mtu 1500 + ip -net "$ns1" link set ns1eth1 up mtu 1500 gso_max_segs 0 ip -net "$ns1" route add default via 10.0.1.2 ip -net "$ns1" route add default via dead:beef:1::2 ip -net "$ns1" addr add 10.0.2.1/24 dev ns1eth2 ip -net "$ns1" addr add dead:beef:2::1/64 dev ns1eth2 nodad - ip -net "$ns1" link set ns1eth2 up mtu 1500 + ip -net "$ns1" link set ns1eth2 up mtu 1500 gso_max_segs 0 ip -net "$ns1" route add default via 10.0.2.2 metric 101 ip -net "$ns1" route add default via dead:beef:2::2 metric 101 @@ -91,21 +91,21 @@ setup() ip -net "$ns2" addr add 10.0.1.2/24 dev ns2eth1 ip -net "$ns2" addr add dead:beef:1::2/64 dev ns2eth1 nodad - ip -net "$ns2" link set ns2eth1 up mtu 1500 + ip -net "$ns2" link set ns2eth1 up mtu 1500 gso_max_segs 0 ip -net "$ns2" addr add 10.0.2.2/24 dev ns2eth2 ip -net "$ns2" addr add dead:beef:2::2/64 dev ns2eth2 nodad - ip -net "$ns2" link set ns2eth2 up mtu 1500 + ip -net "$ns2" link set ns2eth2 up mtu 
1500 gso_max_segs 0 ip -net "$ns2" addr add 10.0.3.2/24 dev ns2eth3 ip -net "$ns2" addr add dead:beef:3::2/64 dev ns2eth3 nodad - ip -net "$ns2" link set ns2eth3 up mtu 1500 + ip -net "$ns2" link set ns2eth3 up mtu 1500 gso_max_segs 0 ip netns exec "$ns2" sysctl -q net.ipv4.ip_forward=1 ip netns exec "$ns2" sysctl -q net.ipv6.conf.all.forwarding=1 ip -net "$ns3" addr add 10.0.3.3/24 dev ns3eth1 ip -net "$ns3" addr add dead:beef:3::3/64 dev ns3eth1 nodad - ip -net "$ns3" link set ns3eth1 up mtu 1500 + ip -net "$ns3" link set ns3eth1 up mtu 1500 gso_max_segs 0 ip -net "$ns3" route add default via 10.0.3.2 ip -net "$ns3" route add default via dead:beef:3::2 @@ -223,9 +223,11 @@ run_test() local rate2=$2 local delay1=$3 local delay2=$4 + local limit1=$5 + local limit2=$6 local lret local dev - shift 4 + shift 6 local msg=$* [ $delay1 -gt 0 ] && delay1="delay ${delay1}ms" || delay1="" @@ -240,10 +242,10 @@ run_test() # keep the queued pkts number low, or the RTT estimator will see # increasing latency over time. 
- tc -n $ns1 qdisc add dev ns1eth1 root netem rate ${rate1}mbit $delay1 limit 50 - tc -n $ns1 qdisc add dev ns1eth2 root netem rate ${rate2}mbit $delay2 limit 50 - tc -n $ns2 qdisc add dev ns2eth1 root netem rate ${rate1}mbit $delay1 limit 50 - tc -n $ns2 qdisc add dev ns2eth2 root netem rate ${rate2}mbit $delay2 limit 50 + tc -n $ns1 qdisc add dev ns1eth1 root netem rate ${rate1}mbit $delay1 limit ${limit1} + tc -n $ns1 qdisc add dev ns1eth2 root netem rate ${rate2}mbit $delay2 limit ${limit2} + tc -n $ns2 qdisc add dev ns2eth1 root netem rate ${rate1}mbit $delay1 limit ${limit1} + tc -n $ns2 qdisc add dev ns2eth2 root netem rate ${rate2}mbit $delay2 limit ${limit2} # time is measured in ms, account for transfer size, aggregated link speed # and header overhead (10%) @@ -301,13 +303,13 @@ done setup mptcp_lib_subtests_last_ts_reset -run_test 10 10 0 0 "balanced bwidth" -run_test 10 10 1 25 "balanced bwidth with unbalanced delay" +run_test 10 10 0 0 20 20 "balanced bwidth" +run_test 10 10 1 25 20 50 "balanced bwidth with unbalanced delay" # we still need some additional infrastructure to pass the following test-cases -MPTCP_LIB_SUBTEST_FLAKY=1 run_test 10 3 0 0 "unbalanced bwidth" -run_test 10 3 1 25 "unbalanced bwidth with unbalanced delay" -run_test 10 3 25 1 "unbalanced bwidth with opposed, unbalanced delay" +MPTCP_LIB_SUBTEST_FLAKY=1 run_test 10 3 0 0 30 20 "unbalanced bwidth" +run_test 10 3 1 25 40 30 "unbalanced bwidth with unbalanced delay" +run_test 10 3 25 1 50 30 "unbalanced bwidth with opposed, unbalanced delay" mptcp_lib_result_print_all_tap exit $ret diff --git a/tools/testing/selftests/net/netfilter/Makefile b/tools/testing/selftests/net/netfilter/Makefile index ee2d1a5254f8..f88dd4ef8d26 100644 --- a/tools/testing/selftests/net/netfilter/Makefile +++ b/tools/testing/selftests/net/netfilter/Makefile @@ -26,11 +26,13 @@ TEST_PROGS := \ nft_concat_range.sh \ nft_conntrack_helper.sh \ nft_fib.sh \ + nft_fib_nexthop.sh \ nft_flowtable.sh \ 
nft_interface_stress.sh \ nft_meta.sh \ nft_nat.sh \ nft_nat_zones.sh \ + nft_offload.sh \ nft_queue.sh \ nft_synproxy.sh \ nft_tproxy_tcp.sh \ diff --git a/tools/testing/selftests/net/netfilter/config b/tools/testing/selftests/net/netfilter/config index 979cff56e1f5..c3c121b6f300 100644 --- a/tools/testing/selftests/net/netfilter/config +++ b/tools/testing/selftests/net/netfilter/config @@ -11,7 +11,12 @@ CONFIG_BRIDGE_NF_EBTABLES_LEGACY=m CONFIG_BRIDGE_VLAN_FILTERING=y CONFIG_CGROUP_BPF=y CONFIG_CRYPTO_SHA1=m +CONFIG_DEBUG_FS=y CONFIG_DUMMY=m +CONFIG_FAIL_FUNCTION=y +CONFIG_FAULT_INJECTION=y +CONFIG_FAULT_INJECTION_DEBUG_FS=y +CONFIG_FUNCTION_ERROR_INJECTION=y CONFIG_INET_DIAG=m CONFIG_INET_ESP=m CONFIG_INET_SCTP_DIAG=m @@ -36,6 +41,7 @@ CONFIG_IP_VS_RR=m CONFIG_MACVLAN=m CONFIG_NAMESPACES=y CONFIG_NET_CLS_U32=m +CONFIG_NETDEVSIM=m CONFIG_NETFILTER=y CONFIG_NETFILTER_ADVANCED=y CONFIG_NETFILTER_NETLINK=m diff --git a/tools/testing/selftests/net/netfilter/conntrack_sctp_collision.sh b/tools/testing/selftests/net/netfilter/conntrack_sctp_collision.sh index d860f7d9744b..7261975957ef 100755 --- a/tools/testing/selftests/net/netfilter/conntrack_sctp_collision.sh +++ b/tools/testing/selftests/net/netfilter/conntrack_sctp_collision.sh @@ -2,18 +2,32 @@ # SPDX-License-Identifier: GPL-2.0 # # Testing For SCTP COLLISION SCENARIO as Below: -# +# 1. Stale INIT_ACK capture: # 14:35:47.655279 IP CLIENT_IP.PORT > SERVER_IP.PORT: sctp (1) [INIT] [init tag: 2017837359] # 14:35:48.353250 IP SERVER_IP.PORT > CLIENT_IP.PORT: sctp (1) [INIT] [init tag: 1187206187] # 14:35:48.353275 IP CLIENT_IP.PORT > SERVER_IP.PORT: sctp (1) [INIT ACK] [init tag: 2017837359] # 14:35:48.353283 IP SERVER_IP.PORT > CLIENT_IP.PORT: sctp (1) [COOKIE ECHO] # 14:35:48.353977 IP CLIENT_IP.PORT > SERVER_IP.PORT: sctp (1) [COOKIE ACK] # 14:35:48.855335 IP SERVER_IP.PORT > CLIENT_IP.PORT: sctp (1) [INIT ACK] [init tag: 164579970] +# (Delayed) +# +# 2. 
Stale INIT capture: +# 14:35:48.353250 IP SERVER_IP.PORT > CLIENT_IP.PORT: sctp (1) [INIT] [init tag: 1187206187] +# 14:35:48.353275 IP CLIENT_IP.PORT > SERVER_IP.PORT: sctp (1) [INIT ACK] [init tag: 2017837359] +# 14:35:48.353283 IP SERVER_IP.PORT > CLIENT_IP.PORT: sctp (1) [COOKIE ECHO] +# 14:35:48.353977 IP CLIENT_IP.PORT > SERVER_IP.PORT: sctp (1) [COOKIE ACK] +# 14:35:47.655279 IP CLIENT_IP.PORT > SERVER_IP.PORT: sctp (1) [INIT] [init tag: 2017837359] +# (Delayed) +# 14:35:48.855335 IP SERVER_IP.PORT > CLIENT_IP.PORT: sctp (1) [INIT ACK] [init tag: 164579970] # # TOPO: SERVER_NS (link0)<--->(link1) ROUTER_NS (link2)<--->(link3) CLIENT_NS source lib.sh +checktool "nft --version" "run test without nft" +checktool "tc -h" "run test without tc" +checktool "modprobe -q sctp" "load sctp module" + CLIENT_IP="198.51.200.1" CLIENT_PORT=1234 @@ -24,7 +38,8 @@ CLIENT_GW="198.51.200.2" SERVER_GW="198.51.100.2" # setup the topo -setup() { +topo_setup() { + # setup_ns cleans up existing net namespaces first. 
setup_ns CLIENT_NS SERVER_NS ROUTER_NS ip -n "$SERVER_NS" link add link0 type veth peer name link1 netns "$ROUTER_NS" ip -n "$CLIENT_NS" link add link3 type veth peer name link2 netns "$ROUTER_NS" @@ -38,35 +53,53 @@ setup() { ip -n "$ROUTER_NS" addr add $SERVER_GW/24 dev link1 ip -n "$ROUTER_NS" addr add $CLIENT_GW/24 dev link2 ip net exec "$ROUTER_NS" sysctl -wq net.ipv4.ip_forward=1 + sysctl -wq net.netfilter.nf_log_all_netns=1 ip -n "$CLIENT_NS" link set link3 up ip -n "$CLIENT_NS" addr add $CLIENT_IP/24 dev link3 ip -n "$CLIENT_NS" route add $SERVER_IP dev link3 via $CLIENT_GW +} + +conf_delay() +{ + # simulate the delay on OVS upcall by setting up a delay for INIT_ACK/INIT with + local ns=$1 + local link=$2 + local chunk_type=$3 - # simulate the delay on OVS upcall by setting up a delay for INIT_ACK with - # tc on $SERVER_NS side - tc -n "$SERVER_NS" qdisc add dev link0 root handle 1: htb r2q 64 - tc -n "$SERVER_NS" class add dev link0 parent 1: classid 1:1 htb rate 100mbit - tc -n "$SERVER_NS" filter add dev link0 parent 1: protocol ip u32 match ip protocol 132 \ - 0xff match u8 2 0xff at 32 flowid 1:1 - if ! tc -n "$SERVER_NS" qdisc add dev link0 parent 1:1 handle 10: netem delay 1200ms; then + # use a smaller number for assoc's max_retrans to reproduce the issue + ip net exec "$CLIENT_NS" sysctl -wq net.sctp.association_max_retrans=3 + + tc -n "$ns" qdisc add dev "$link" root handle 1: htb r2q 64 + tc -n "$ns" class add dev "$link" parent 1: classid 1:1 htb rate 100mbit + tc -n "$ns" filter add dev "$link" parent 1: protocol ip \ + u32 match ip protocol 132 0xff match u8 "$chunk_type" 0xff at 32 flowid 1:1 + if ! 
tc -n "$ns" qdisc add dev "$link" parent 1:1 handle 10: netem delay 1200ms; then echo "SKIP: Cannot add netem qdisc" - exit $ksft_skip + return $ksft_skip fi # simulate the ctstate check on OVS nf_conntrack - ip net exec "$ROUTER_NS" iptables -A FORWARD -m state --state INVALID,UNTRACKED -j DROP - ip net exec "$ROUTER_NS" iptables -A INPUT -p sctp -j DROP - - # use a smaller number for assoc's max_retrans to reproduce the issue - modprobe -q sctp - ip net exec "$CLIENT_NS" sysctl -wq net.sctp.association_max_retrans=3 + ip net exec "$ROUTER_NS" nft -f - <<-EOF + table ip t { + chain forward { + type filter hook forward priority filter; policy accept; + meta l4proto icmp counter accept + ct state new counter accept + ct state established,related counter accept + ct state invalid log flags all counter drop comment \ + "Expect to drop stale INIT/INIT_ACK chunks" + counter + } + } + EOF + return 0 } cleanup() { - ip net exec "$CLIENT_NS" pkill sctp_collision >/dev/null 2>&1 - ip net exec "$SERVER_NS" pkill sctp_collision >/dev/null 2>&1 + # cleanup_all_ns terminates running processes in the namespaces. cleanup_all_ns + sysctl -wq net.netfilter.nf_log_all_netns=0 } do_test() { @@ -81,7 +114,19 @@ do_test() { # run the test case trap cleanup EXIT -setup && \ -echo "Test for SCTP Collision in nf_conntrack:" && \ -do_test && echo "PASS!" -exit $? + +echo "Test for SCTP INIT_ACK Collision in nf_conntrack:" +topo_setup || exit $? +conf_delay $SERVER_NS link0 2 || exit $? + +if ! do_test; then + exit $ksft_fail +fi + +echo "Test for SCTP INIT Collision in nf_conntrack:" +topo_setup || exit $? +conf_delay $CLIENT_NS link3 1 || exit $? + +if ! 
do_test; then + exit $ksft_fail +fi diff --git a/tools/testing/selftests/net/netfilter/nft_fib_nexthop.sh b/tools/testing/selftests/net/netfilter/nft_fib_nexthop.sh new file mode 100755 index 000000000000..c4f203057382 --- /dev/null +++ b/tools/testing/selftests/net/netfilter/nft_fib_nexthop.sh @@ -0,0 +1,152 @@ +#!/bin/bash +# SPDX-License-Identifier: GPL-2.0 +# shellcheck disable=SC2154 +# +# Exercise nft_fib6_eval()'s sibling/nh enumeration on three route shapes: +# 1) route via a single external nexthop (nhid) +# 2) route via an external nexthop group (nhid -> group, two members) +# 3) route via old-style multipath (nexthop ... nexthop ...) +# +# In each scenario the route's nexthop set contains veth0 (the iif of the +# test packet). nft_fib6_info_nh_uses_dev() must walk the set and report +# veth0 as a valid oif. For (2) and (3) the matching nexthop is the second +# member, so the walk has to traverse beyond the primary nh. +# +# After sending $PKTS ICMPv6 echo requests from ns1, check two counters on +# nsrouter: +# nf_ok -- `fib daddr . iif oif eq "veth0"` must equal $PKTS +# nf_bad -- `fib daddr . iif oif missing` must stay at 0 +# Both rules also match on iif veth0 and ip6 daddr dead:dead::/64 so that +# kernel-generated ND/MLD/RA traffic cannot pollute the counters. +# +# Topology similar to nft_fib.sh, without ns2; two dummy interfaces on +# nsrouter host extra nh devices: +# +# dead:1::99 dead:1::1 +# ns1 <----veth----> nsrouter --- dummy0 dead:2::1 +# \-- dummy1 dead:9::1 + +source lib.sh + +ret=0 +PKTS=3 + +checktool "nft --version" "run test without nft" +checktool "ip -V" "run test without iproute2" + +setup_ns nsrouter ns1 +trap cleanup_all_ns EXIT + +if ! 
ip link add veth0 netns "$nsrouter" type veth peer name eth0 netns "$ns1" \ + > /dev/null 2>&1; then + echo "SKIP: No virtual ethernet pair device support in kernel" + exit $ksft_skip +fi + +ip -net "$ns1" link set lo up +ip -net "$ns1" link set eth0 up +ip -net "$ns1" -6 addr add dead:1::99/64 dev eth0 nodad +ip -net "$ns1" -6 route add default via dead:1::1 + +ip -net "$nsrouter" link set lo up +ip -net "$nsrouter" link set veth0 up +ip -net "$nsrouter" -6 addr add dead:1::1/64 dev veth0 nodad + +if ! ip -net "$nsrouter" link add dummy0 type dummy 2>/dev/null; then + echo "SKIP: dummy netdev not available" + exit $ksft_skip +fi +ip -net "$nsrouter" link set dummy0 up +ip -net "$nsrouter" -6 addr add dead:2::1/64 dev dummy0 nodad + +ip -net "$nsrouter" link add dummy1 type dummy +ip -net "$nsrouter" link set dummy1 up +ip -net "$nsrouter" -6 addr add dead:9::1/64 dev dummy1 nodad + +ip netns exec "$nsrouter" sysctl -q net.ipv6.conf.all.forwarding=1 + +load_fib_rule() { + # filter on iif + daddr so the counters only see our test packets + ip netns exec "$nsrouter" nft -f /dev/stdin <<EOF +flush ruleset +table ip6 t { + counter nf_ok { } + counter nf_bad { } + chain c { + type filter hook prerouting priority 0; policy accept; + iif "veth0" ip6 daddr dead:dead::/64 fib daddr . iif oif eq "veth0" counter name nf_ok + iif "veth0" ip6 daddr dead:dead::/64 fib daddr . 
iif oif missing counter name nf_bad + } +} +EOF +} + +bad_counter() { + local counter=$1 + local expect=$2 + local tag=$3 + + echo "FAIL ($tag): counter $counter has unexpected value (expected \"$expect\")" 1>&2 + ip netns exec "$nsrouter" nft list counter ip6 t "$counter" 1>&2 +} + +run_scenario() { + local what="$1"; shift + # counter output format is "packets PACKET_NUM bytes BYTES_NUM"; + # we only care about the packet count + local expect_ok="packets $PKTS bytes" + local expect_bad="packets 0 bytes" + local lret=0 + + # reset route + nexthop state between scenarios + ip -net "$nsrouter" -6 route del dead:dead::/64 > /dev/null 2>&1 || true + ip -net "$nsrouter" nexthop flush > /dev/null 2>&1 || true + + # run the scenario function passed by the caller + "$@" || echo "WARN ($what): scenario setup returned non-zero" + + load_fib_rule || { echo "FAIL ($what): nft load"; ret=1; return; } + + # ping a daddr inside dead:dead::/64 so fib has to walk the nh set + ip netns exec "$ns1" ping -6 -c "$PKTS" -i 0.1 -W 1 dead:dead::1 \ + > /dev/null 2>&1 || true + + # verify the packets went through the expected fib path + if ! ip netns exec "$nsrouter" nft list counter ip6 t nf_ok | grep -q "$expect_ok"; then + bad_counter nf_ok "$expect_ok" "$what" + lret=1 + fi + if ! 
ip netns exec "$nsrouter" nft list counter ip6 t nf_bad | grep -q "$expect_bad"; then + bad_counter nf_bad "$expect_bad" "$what" + lret=1 + fi + + if [ $lret -eq 0 ]; then + echo "PASS: $what" + else + ret=1 + fi +} + +scenario_single_nh() { + ip -net "$nsrouter" nexthop add id 1 via dead:1::99 dev veth0 + ip -net "$nsrouter" -6 route add dead:dead::/64 nhid 1 +} +run_scenario "single external nexthop (nhid -> veth0)" scenario_single_nh + +scenario_nh_group() { + ip -net "$nsrouter" nexthop add id 1 via dead:2::2 dev dummy0 + ip -net "$nsrouter" nexthop add id 2 via dead:1::99 dev veth0 + ip -net "$nsrouter" nexthop add id 100 group 1/2 + ip -net "$nsrouter" -6 route add dead:dead::/64 nhid 100 +} +run_scenario "nexthop group (dummy0 + veth0)" scenario_nh_group + +scenario_old_multipath() { + ip -net "$nsrouter" -6 route add dead:dead::/64 \ + nexthop via dead:2::2 dev dummy0 \ + nexthop via dead:1::99 dev veth0 +} +run_scenario "old-style multipath (sibling on veth0)" scenario_old_multipath + +exit $ret diff --git a/tools/testing/selftests/net/netfilter/nft_flowtable.sh b/tools/testing/selftests/net/netfilter/nft_flowtable.sh index 7a34ef468975..08ad07500e8a 100755 --- a/tools/testing/selftests/net/netfilter/nft_flowtable.sh +++ b/tools/testing/selftests/net/netfilter/nft_flowtable.sh @@ -592,7 +592,7 @@ ip -net "$nsr1" link set tun0 up ip -net "$nsr1" addr add 192.168.100.1/24 dev tun0 ip netns exec "$nsr1" sysctl net.ipv4.conf.tun0.forwarding=1 > /dev/null -ip -net "$nsr1" link add name tun6 type ip6tnl local fee1:2::1 remote fee1:2::2 +ip -net "$nsr1" link add name tun6 type ip6tnl local fee1:2::1 remote fee1:2::2 encaplimit none ip -net "$nsr1" link set tun6 up ip -net "$nsr1" addr add fee1:3::1/64 dev tun6 nodad @@ -601,7 +601,7 @@ ip -net "$nsr2" link set tun0 up ip -net "$nsr2" addr add 192.168.100.2/24 dev tun0 ip netns exec "$nsr2" sysctl net.ipv4.conf.tun0.forwarding=1 > /dev/null -ip -net "$nsr2" link add name tun6 type ip6tnl local fee1:2::2 remote 
fee1:2::1 || ret=1 +ip -net "$nsr2" link add name tun6 type ip6tnl local fee1:2::2 remote fee1:2::1 encaplimit none || ret=1 ip -net "$nsr2" link set tun6 up ip -net "$nsr2" addr add fee1:3::2/64 dev tun6 nodad @@ -651,7 +651,7 @@ ip -net "$nsr1" route change default via 192.168.200.2 ip netns exec "$nsr1" sysctl net.ipv4.conf.tun0/10.forwarding=1 > /dev/null ip netns exec "$nsr1" nft -a insert rule inet filter forward 'meta oif tun0.10 accept' -ip -net "$nsr1" link add name tun6.10 type ip6tnl local fee1:4::1 remote fee1:4::2 +ip -net "$nsr1" link add name tun6.10 type ip6tnl local fee1:4::1 remote fee1:4::2 encaplimit none ip -net "$nsr1" link set tun6.10 up ip -net "$nsr1" addr add fee1:5::1/64 dev tun6.10 nodad ip -6 -net "$nsr1" route delete default @@ -670,7 +670,7 @@ ip -net "$nsr2" addr add 192.168.200.2/24 dev tun0.10 ip -net "$nsr2" route change default via 192.168.200.1 ip netns exec "$nsr2" sysctl net.ipv4.conf.tun0/10.forwarding=1 > /dev/null -ip -net "$nsr2" link add name tun6.10 type ip6tnl local fee1:4::2 remote fee1:4::1 || ret=1 +ip -net "$nsr2" link add name tun6.10 type ip6tnl local fee1:4::2 remote fee1:4::1 encaplimit none || ret=1 ip -net "$nsr2" link set tun6.10 up ip -net "$nsr2" addr add fee1:5::2/64 dev tun6.10 nodad ip -6 -net "$nsr2" route delete default diff --git a/tools/testing/selftests/net/netfilter/nft_offload.sh b/tools/testing/selftests/net/netfilter/nft_offload.sh new file mode 100755 index 000000000000..859bdedf1a51 --- /dev/null +++ b/tools/testing/selftests/net/netfilter/nft_offload.sh @@ -0,0 +1,132 @@ +#!/bin/bash +# SPDX-License-Identifier: GPL-2.0 + +source lib.sh + +checktool "nft --version" "run test without nft tool" +modprobe -q netdevsim + +sysfs="/sys/kernel/debug/fail_function" +failname="/proc/self/make-it-fail" +duration=30 +fault=0 +ret=0 +file_ft="" +file_rs="" +id=$((RANDOM%65536)) + +read -r t < /proc/sys/kernel/tainted +if [ "$t" -ne 0 ];then + echo SKIP: kernel is tainted + exit $ksft_skip +fi + +cleanup() 
{ + cleanup_netdevsim "$id" "$NS" + cleanup_ns "$NS" + [ "$fault" -eq 1 ] && echo '!nsim_setup_tc' > "$sysfs/inject" + rm -f "$file_ft" "$file_rs" +} +trap cleanup EXIT + +skip() { + echo "SKIP: $*" + [ $ret -eq 0 ] && exit 4 + + exit $ret +} + +set -e +setup_ns NS + +create_netdevsim "$id" "$NS" >/dev/null +nsim_port=$(create_netdevsim_port "$id" "$NS" 2) + +file_ft=$(mktemp) +cat > "$file_ft" <<EOF +flush ruleset +table inet t { + flowtable f { + flags offload + hook ingress priority filter + 10 + devices = { "$nsim_port", "dummyf1" } + } + + chain cf { + type filter hook forward priority 0; policy accept; + ct state new meta l4proto tcp flow add @f + } +} +EOF + +if ip netns exec "$NS" nft -f "$file_ft"; then + echo "PASS: flowtable offload" +else + echo "FAIL: flowtable offload" + ret=1 +fi + +file_rs=$(mktemp) +cat > "$file_rs" <<EOF +table netdev t { + chain c { + type filter hook ingress device $nsim_port priority 1 + flags offload + ip saddr 10.2.1.1 ip daddr 10.2.1.2 ip protocol icmp accept + ip saddr 10.2.1.1 ip daddr 10.2.1.3 ip protocol icmp drop + ip saddr 10.2.1.0/24 ip daddr 10.2.1.0/24 ip protocol icmp accept + ip6 saddr dead:beef::1 ip6 daddr dead:beef::2 meta l4proto ipv6-icmp accept + ip6 saddr dead:beef::1 ip6 daddr dead:beef::3 meta l4proto ipv6-icmp drop + ip6 saddr dead:beef::/64 ip6 daddr dead:beef::/64 meta l4proto ipv6-icmp accept + } +} +EOF +if ip netns exec "$NS" nft -f "$file_rs"; then + echo "PASS: ruleset offload" +else + echo "FAIL: ruleset offload" + ret=1 +fi + +test -d "$sysfs" || skip "$sysfs not present" +grep -q nsim_setup_tc "$sysfs/injectable" || skip "nsim_setup_tc fault injection not available" + +echo Y > "$sysfs/task-filter" +echo 0 > "$sysfs/verbose" +echo "nsim_setup_tc" > "$sysfs/inject" +fault=1 + +p=$(((RANDOM%90) + 10)) +echo $p > "$sysfs/probability" +echo -1 > "$sysfs/times" + +count=0 +ok=0 + +now=$(date +%s) +stop=$((now+duration)) + +# fault-injection enabled rule loads are expected to fail. 
+set +e +while [ "$now" -le "$stop" ]; do + for f in "$file_ft" "$file_rs"; do + if ip netns exec "$NS" bash -c "echo 1 > $failname ; ip netns exec \"$NS\" nft -f $f" 2> /dev/null;then + ok=$((ok+1)) + fi + count=$((count+1)) + done + now=$(date +%s) +done + +sleep 5 + +read -r t < /proc/sys/kernel/tainted +if [ "$t" -eq 0 ];then + echo "PASS: Not tainted. $count rounds, $ok successful ruleset loads with P $p." +else + echo "ERROR: Tainted. $count rounds, $ok successful ruleset loads with P $p." + dmesg + ret=1 +fi + +exit $ret diff --git a/tools/testing/selftests/net/netfilter/nft_queue.sh b/tools/testing/selftests/net/netfilter/nft_queue.sh index d80390848e85..7c857a2e0f34 100755 --- a/tools/testing/selftests/net/netfilter/nft_queue.sh +++ b/tools/testing/selftests/net/netfilter/nft_queue.sh @@ -85,11 +85,12 @@ ip -net "$ns3" route add default via 10.0.3.1 ip -net "$ns3" route add default via dead:3::1 load_ruleset() { - local name=$1 - local prio=$2 + local family=$1 + local name=$2 + local prio=$3 ip netns exec "$nsrouter" nft -f /dev/stdin <<EOF -table inet $name { +table $family $name { chain nfq { ip protocol icmp queue bypass icmpv6 type { "echo-request", "echo-reply" } queue num 1 bypass @@ -228,6 +229,7 @@ nf_queue_wait() test_queue() { local expected="$1" + local family="$2" local last="" # spawn nf_queue listeners @@ -255,11 +257,13 @@ test_queue() if [ x"$last" != x"$expected packets total" ]; then echo "FAIL: Expected $expected packets total, but got $last" 1>&2 ip netns exec "$nsrouter" nft list ruleset + echo -n "$TMPFILE0: ";cat "$TMPFILE0" + echo -n "$TMPFILE1: ";cat "$TMPFILE1" exit 1 fi done - echo "PASS: Expected and received $last" + echo "PASS: Expected and received $last ($family)" } listener_ready() @@ -400,6 +404,8 @@ EOF kill "$nfqpid" echo "PASS: icmp+nfqueue via vrf" + ip -net "$ns1" link del tvrf + ip netns exec "$ns1" nft flush ruleset } sctp_listener_ready() @@ -814,12 +820,53 @@ EOF check_tainted "queue program exiting while packets 
queued" } +test_queue_bridge() +{ + ip -net "$nsrouter" addr flush dev veth0 + ip -net "$nsrouter" addr flush dev veth1 + + ip -net "$nsrouter" link add br0 type bridge + ip -net "$nsrouter" link set veth0 master br0 + ip -net "$nsrouter" link set veth1 master br0 + + ip -net "$nsrouter" link set br0 up + + ip -net "$nsrouter" addr add 10.0.2.1/16 dev br0 + ip -net "$nsrouter" addr add dead:2::1/64 dev br0 nodad + + ip -net "$ns1" addr flush dev eth0 + ip -net "$ns2" addr flush dev eth0 + + ip -net "$ns1" addr add 10.0.1.1/16 dev eth0 + ip -net "$ns1" addr add dead:2::2/64 dev eth0 nodad + + ip -net "$ns2" addr add 10.0.2.99/16 dev eth0 + ip -net "$ns2" addr add dead:2::99/64 dev eth0 nodad + + ip netns exec "$nsrouter" nft flush ruleset + + ip netns exec "$nsrouter" sysctl net.ipv6.conf.all.forwarding=0 > /dev/null + ip netns exec "$nsrouter" sysctl net.ipv4.conf.veth0.forwarding=0 > /dev/null + ip netns exec "$nsrouter" sysctl net.ipv4.conf.veth1.forwarding=0 > /dev/null + + if ! test_ping;then + echo "FAIL: netns bridge connectivity" 1>&2 + exit $ret + fi + + load_ruleset "bridge" "filter" 10 + test_queue 10 "bridge" + + load_ruleset "bridge" "filter2" 20 + test_queue 20 "bridge" +} + ip netns exec "$nsrouter" sysctl net.ipv6.conf.all.forwarding=1 > /dev/null ip netns exec "$nsrouter" sysctl net.ipv4.conf.veth0.forwarding=1 > /dev/null ip netns exec "$nsrouter" sysctl net.ipv4.conf.veth1.forwarding=1 > /dev/null ip netns exec "$nsrouter" sysctl net.ipv4.conf.veth2.forwarding=1 > /dev/null -load_ruleset "filter" 0 +load_ruleset "inet" "filter" 0 if test_ping; then # queue bypass works (rules were skipped, no listener) @@ -842,11 +889,11 @@ load_counter_ruleset 10 # 1x icmp prerouting,forward,postrouting -> 3 queue events (6 incl. reply). # 1x icmp prerouting,input,output postrouting -> 4 queue events incl. reply. # so we expect that userspace program receives 10 packets. -test_queue 10 +test_queue 10 "inet" # same. We queue to a second program as well. 
-load_ruleset "filter2" 20 -test_queue 20 +load_ruleset "inet" "filter2" 20 +test_queue 20 "inet" ip netns exec "$ns1" nft flush ruleset test_tcp_forward @@ -863,4 +910,7 @@ test_queue_stress test_icmp_vrf test_queue_removal +# turns router into a bridge +test_queue_bridge + exit $ret diff --git a/tools/testing/selftests/net/nl_netdev.py b/tools/testing/selftests/net/nl_netdev.py index eff55c64a012..ceb44c8e1fec 100755 --- a/tools/testing/selftests/net/nl_netdev.py +++ b/tools/testing/selftests/net/nl_netdev.py @@ -9,7 +9,7 @@ import errno from os import system from lib.py import ksft_run, ksft_exit from lib.py import ksft_eq, ksft_ge, ksft_ne, ksft_raises, ksft_busy_wait -from lib.py import NetdevFamily, NetdevSimDev, NlError, ip +from lib.py import NetdevFamily, NetdevSimDev, NlError, defer, ip def empty_check(nf) -> None: @@ -255,6 +255,117 @@ def page_pool_check(nf) -> None: nsim.dfs_write("pp_hold", "y") +def page_pool_dump_ifindex(nf) -> None: + """Test page pool dump filtering by ifindex.""" + nsimdev1 = NetdevSimDev(queue_count=3) + rm_nsim1 = defer(nsimdev1.remove) + nsimdev2 = NetdevSimDev(queue_count=5) + defer(nsimdev2.remove) + + nsim1 = nsimdev1.nsims[0] + nsim2 = nsimdev2.nsims[0] + + ip(f"link set dev {nsim1.ifname} up") + ip(f"link set dev {nsim2.ifname} up") + + # Unfiltered dump should have pools from both devices + all_pp = nf.page_pool_get({}, dump=True) + pp1_all = [pp for pp in all_pp + if pp.get("ifindex") == nsim1.ifindex] + pp2_all = [pp for pp in all_pp + if pp.get("ifindex") == nsim2.ifindex] + ksft_ge(len(pp1_all), 1) + ksft_ge(len(pp2_all), 1) + + # Filtered dump should only return pools for that device + pp1_flt = nf.page_pool_get({'ifindex': nsim1.ifindex}, dump=True) + ksft_eq(pp1_flt, pp1_all) + + pp2_flt = nf.page_pool_get({'ifindex': nsim2.ifindex}, dump=True) + ksft_eq(pp2_flt, pp2_all) + + # Non-existent ifindex should return empty dump + pp_none = nf.page_pool_get({'ifindex': 12345678}, dump=True) + ksft_eq(len(pp_none), 0) + 
+ # Device down - no pools for that ifindex + ip(f"link set dev {nsim1.ifname} down") + pp1_down = nf.page_pool_get({'ifindex': nsim1.ifindex}, dump=True) + ksft_eq(len(pp1_down), 0) + + # Remove device, dump by its old ifindex should return empty + old_ifindex = nsim1.ifindex + rm_nsim1.exec() + pp1_gone = nf.page_pool_get({'ifindex': old_ifindex}, dump=True) + ksft_eq(len(pp1_gone), 0) + + +def page_pool_ifindex_leak_check(nf) -> None: + """Test that zombie page pools don't show up under the original ifindex.""" + nsimdev = NetdevSimDev() + rm_nsim = defer(nsimdev.remove) + nsim = nsimdev.nsims[0] + + ip(f"link set dev {nsim.ifname} up") + nsim.dfs_write("pp_hold", "y") + + pp_up = nf.page_pool_get({'ifindex': nsim.ifindex}, dump=True) + ksft_ge(len(pp_up), 1) + + # Remove device with leaked page - pool becomes zombie (orphaned to lo) + old_ifindex = nsim.ifindex + rm_nsim.exec() + + # Zombie pool should NOT appear under the original device + pp_down = nf.page_pool_get({'ifindex': old_ifindex}, dump=True) + ksft_eq(len(pp_down), 0) + + # But it should appear in an unfiltered dump (under loopback) + pp_all = nf.page_pool_get({}, dump=True) + orphans = [pp for pp in pp_all + if "detach-time" in pp and "ifindex" not in pp] + ksft_ge(len(orphans), 1) + + +def page_pool_stats_ifindex_check(nf) -> None: + """Test page pool stats dump filtering by ifindex.""" + nsimdev1 = NetdevSimDev(queue_count=3) + defer(nsimdev1.remove) + nsimdev2 = NetdevSimDev(queue_count=5) + defer(nsimdev2.remove) + + nsim1 = nsimdev1.nsims[0] + nsim2 = nsimdev2.nsims[0] + + ip(f"link set dev {nsim1.ifname} up") + ip(f"link set dev {nsim2.ifname} up") + + # Unfiltered stats dump + all_stats = nf.page_pool_stats_get({}, dump=True) + s1_all = [s for s in all_stats + if s.get("info", {}).get("ifindex") == nsim1.ifindex] + s2_all = [s for s in all_stats + if s.get("info", {}).get("ifindex") == nsim2.ifindex] + ksft_ge(len(s1_all), 1) + ksft_ge(len(s2_all), 1) + + # Filtered stats dump + s1_flt = 
nf.page_pool_stats_get({'info': {'ifindex': nsim1.ifindex}}, + dump=True) + ksft_eq(s1_flt, s1_all) + + # Non-existent ifindex should return empty + s_none = nf.page_pool_stats_get({'info': {'ifindex': 12345678}}, dump=True) + ksft_eq(len(s_none), 0) + + # info.id should be rejected for stats dump + with ksft_raises(NlError) as cm: + nf.page_pool_stats_get({'info': {'id': s1_all[0]['info']['id']}}, + dump=True) + ksft_eq(cm.exception.nl_msg.error, -errno.EINVAL) + ksft_eq(cm.exception.nl_msg.extack['bad-attr'], '.info.id') + + def main() -> None: """ Ksft boiler plate main """ nf = NetdevFamily() @@ -265,7 +376,11 @@ def main() -> None: napi_set_threaded, dev_set_threaded, nsim_rxq_reset_down, - page_pool_check], + page_pool_check, + page_pool_dump_ifindex, + page_pool_ifindex_leak_check, + page_pool_stats_ifindex_check + ], args=(nf, )) ksft_exit() diff --git a/tools/testing/selftests/net/openvswitch/openvswitch.sh b/tools/testing/selftests/net/openvswitch/openvswitch.sh index b327d3061ed5..2954245129a2 100755 --- a/tools/testing/selftests/net/openvswitch/openvswitch.sh +++ b/tools/testing/selftests/net/openvswitch/openvswitch.sh @@ -26,7 +26,12 @@ tests=" netlink_checks ovsnl: validate netlink attrs and settings upcall_interfaces ovs: test the upcall interfaces tunnel_metadata ovs: test extraction of tunnel metadata + tunnel_refcount ovs: test tunnel vport reference cleanup drop_reason drop: test drop reasons are emitted + pop_vlan vlan: POP_VLAN action strips tag + dec_ttl ttl: dec_ttl decrements IP TTL + flow_set flow-set: Flow modify + action_set set: SET action rewrites fields psample psample: Sampling packets with psample" info() { @@ -190,6 +195,23 @@ ovs_add_flow () { return 0 } +ovs_mod_flow () { + if [ -n "$4" ]; then + info "Modifying flow: sbx:$1 br:$2 flow:$3 act:$4" + ovs_sbx "$1" python3 $ovs_base/ovs-dpctl.py \ + mod-flow "$2" "$3" "$4" + else + info "Modifying flow (no actions): sbx:$1 br:$2 flow:$3" + ovs_sbx "$1" python3 $ovs_base/ovs-dpctl.py \ 
+ mod-flow "$2" "$3" + fi + if [ $? -ne 0 ]; then + info "Flow modify [ $3 ] failed" + return 1 + fi + return 0 +} + ovs_del_flows () { info "Deleting all flows from DP: sbx:$1 br:$2" ovs_sbx "$1" python3 $ovs_base/ovs-dpctl.py del-flows "$2" @@ -243,6 +265,184 @@ usage() { } +test_dec_ttl() { + sbx_add "test_dec_ttl" || return $? + ovs_add_dp "test_dec_ttl" decttl || return 1 + + info "create namespaces" + for ns in client server; do + ovs_add_netns_and_veths "test_dec_ttl" "decttl" "$ns" \ + "${ns:0:1}0" "${ns:0:1}1" || return 1 + done + + ip netns exec client ip addr add 10.0.0.1/24 dev c1 + ip netns exec client ip link set c1 up + ip netns exec server ip addr add 10.0.0.2/24 dev s1 + ip netns exec server ip link set s1 up + + # Probe: check if kernel supports dec_ttl action. + ovs_add_flow "test_dec_ttl" decttl \ + 'in_port(1),eth(),eth_type(0x0800),ipv4()' \ + 'dec_ttl(le_1())' &>/dev/null + if [ $? -ne 0 ]; then + info "no support for dec_ttl - skipping" + ovs_exit_sig + return $ksft_skip + fi + + ovs_del_flows "test_dec_ttl" decttl + + # ARP flows (bidirectional) + ovs_add_flow "test_dec_ttl" decttl \ + 'in_port(1),eth(),eth_type(0x0806),arp()' '2' || return 1 + ovs_add_flow "test_dec_ttl" decttl \ + 'in_port(2),eth(),eth_type(0x0806),arp()' '1' || return 1 + + # IP flows with dec_ttl action + ovs_add_flow "test_dec_ttl" decttl \ + 'in_port(1),eth(),eth_type(0x0800),ipv4()' \ + 'dec_ttl(le_1()),2' || return 1 + ovs_add_flow "test_dec_ttl" decttl \ + 'in_port(2),eth(),eth_type(0x0800),ipv4()' \ + 'dec_ttl(le_1()),1' || return 1 + + info "verify connectivity with dec_ttl" + ovs_sbx "test_dec_ttl" ip netns exec client ping -c 1 -W 2 \ + 10.0.0.2 || return 1 + + info "verify TTL=1 is dropped by dec_ttl" + ovs_sbx "test_dec_ttl" ip netns exec client ping -c 1 -W 2 \ + -t 1 10.0.0.2 >/dev/null 2>&1 \ + && { info "FAIL: ping should fail with TTL=1 and dec_ttl" + return 1; } + + return 0 +} + +test_flow_set() { + sbx_add "test_flow_set" || return $? 
+ ovs_add_dp "test_flow_set" flowset || return 1 + + info "create namespaces" + for ns in client server; do + ovs_add_netns_and_veths "test_flow_set" "flowset" "$ns" \ + "${ns:0:1}0" "${ns:0:1}1" || return 1 + done + + ip netns exec client ip addr add 10.0.0.1/24 dev c1 + ip netns exec client ip link set c1 up + ip netns exec server ip addr add 10.0.0.2/24 dev s1 + ip netns exec server ip link set s1 up + + ovs_add_flow "test_flow_set" flowset \ + 'in_port(1),eth(),eth_type(0x0806),arp()' '2' || return 1 + ovs_add_flow "test_flow_set" flowset \ + 'in_port(2),eth(),eth_type(0x0806),arp()' '1' || return 1 + + local fwd_flow="ufid:00000001-0002-0003-0004-000500060007" + fwd_flow="$fwd_flow,in_port(1),eth(),eth_type(0x0800),ipv4()" + + ovs_add_flow "test_flow_set" flowset "$fwd_flow" '2' \ + || return 1 + ovs_add_flow "test_flow_set" flowset \ + 'in_port(2),eth(),eth_type(0x0800),ipv4()' '1' || return 1 + + info "verify initial forwarding" + ovs_sbx "test_flow_set" ip netns exec client ping -c 1 -W 2 \ + 10.0.0.2 || return 1 + + info "mod-flow with new actions (change to drop)" + ovs_mod_flow "test_flow_set" flowset "$fwd_flow" 'drop' \ + || return 1 + + info "verify traffic is now dropped" + ovs_sbx "test_flow_set" ip netns exec client ping -c 1 -W 2 \ + 10.0.0.2 >/dev/null 2>&1 \ + && { info "FAIL: ping should fail after mod-flow to drop" + return 1; } + + info "mod-flow without actions" + ovs_mod_flow "test_flow_set" flowset "$fwd_flow" || return 1 + + info "verify flow retained drop action via dump" + python3 "$ovs_base/ovs-dpctl.py" dump-flows flowset \ + | grep -q "actions:drop" || \ + { info "FAIL: flow not showing drop action"; return 1; } + + info "verify drop actions unchanged" + ovs_sbx "test_flow_set" ip netns exec client ping -c 1 -W 2 \ + 10.0.0.2 >/dev/null 2>&1 \ + && { info "FAIL: ping should still fail after no-actions set" + return 1; } + + return 0 +} + +test_action_set() { + sbx_add "test_action_set" || return $? 
+ ovs_add_dp "test_action_set" settest || return 1 + + info "create namespaces" + for ns in client server; do + ovs_add_netns_and_veths "test_action_set" "settest" "$ns" \ + "${ns:0:1}0" "${ns:0:1}1" || return 1 + done + + ip netns exec client ip addr add 10.0.0.1/24 dev c1 + ip netns exec client ip link set c1 up + ip netns exec server ip addr add 10.0.0.2/24 dev s1 + ip netns exec server ip link set s1 up + + ovs_add_flow "test_action_set" settest \ + 'in_port(1),eth(),eth_type(0x0806),arp()' '2' || return 1 + ovs_add_flow "test_action_set" settest \ + 'in_port(2),eth(),eth_type(0x0806),arp()' '1' || return 1 + + ovs_add_flow "test_action_set" settest \ + 'in_port(1),eth(),eth_type(0x0800),ipv4()' '2' || return 1 + ovs_add_flow "test_action_set" settest \ + 'in_port(2),eth(),eth_type(0x0800),ipv4()' '1' || return 1 + + info "verify connectivity without SET" + ovs_sbx "test_action_set" ip netns exec client ping -c 1 -W 2 \ + 10.0.0.2 || return 1 + + ovs_del_flows "test_action_set" settest + ovs_add_flow "test_action_set" settest \ + 'in_port(1),eth(),eth_type(0x0806),arp()' '2' || return 1 + ovs_add_flow "test_action_set" settest \ + 'in_port(2),eth(),eth_type(0x0806),arp()' '1' || return 1 + + info "set ipv4 dst to unreachable address" + ovs_add_flow "test_action_set" settest \ + 'in_port(1),eth(),eth_type(0x0800),ipv4()' \ + 'set(ipv4(dst=10.0.0.99)),2' || return 1 + ovs_add_flow "test_action_set" settest \ + 'in_port(2),eth(),eth_type(0x0800),ipv4()' '1' || return 1 + + info "verify ping fails with rewritten dst" + ovs_sbx "test_action_set" ip netns exec client ping -c 1 -W 2 \ + 10.0.0.2 >/dev/null 2>&1 \ + && { info "FAIL: ping should fail with dst rewritten" + return 1; } + + ovs_del_flows "test_action_set" settest + ovs_add_flow "test_action_set" settest \ + 'in_port(1),eth(),eth_type(0x0806),arp()' '2' || return 1 + ovs_add_flow "test_action_set" settest \ + 'in_port(2),eth(),eth_type(0x0806),arp()' '1' || return 1 + ovs_add_flow "test_action_set" settest 
\ + 'in_port(1),eth(),eth_type(0x0800),ipv4()' '2' || return 1 + ovs_add_flow "test_action_set" settest \ + 'in_port(2),eth(),eth_type(0x0800),ipv4()' '1' || return 1 + + info "verify connectivity restored without SET" + ovs_sbx "test_action_set" ip netns exec client ping -c 1 -W 2 \ + 10.0.0.2 || return 1 + + return 0 +} + # psample test # - use psample to observe packets test_psample() { @@ -303,6 +503,8 @@ test_psample() { # sFlow / IPFIX. nlpid=$(grep -E "listening on upcall packet handler" \ $ovs_dir/s0.out | cut -d ":" -f 2 | tr -d ' ') + [ -z "$nlpid" ] && \ + { info "failed to get upcall PID"; return 1; } ovs_add_flow "test_psample" psample \ "in_port(2),eth(),eth_type(0x0800),ipv4()" \ @@ -336,6 +538,10 @@ test_drop_reason() { ovs_drop_subsys=$(pahole -C skb_drop_reason_subsys | awk '/OPENVSWITCH/ { print $3; }' | tr -d ,) + if [ -z "$ovs_drop_subsys" ]; then + info "failed to get OVS drop subsys ID" + return $ksft_skip + fi sbx_add "test_drop_reason" || return $? @@ -434,13 +640,19 @@ test_arp_ping () { # Setup client namespace ip netns exec client ip addr add 172.31.110.10/24 dev c1 ip netns exec client ip link set c1 up - HW_CLIENT=`ip netns exec client ip link show dev c1 | grep -E 'link/ether [0-9a-f:]+' | awk '{print $2;}'` + HW_CLIENT=$(ip netns exec client ip link show dev c1 \ + | awk '/link\/ether/ {print $2}') + [ -z "$HW_CLIENT" ] && \ + { info "failed to get client hwaddr"; return 1; } info "Client hwaddr: $HW_CLIENT" # Setup server namespace ip netns exec server ip addr add 172.31.110.20/24 dev s1 ip netns exec server ip link set s1 up - HW_SERVER=`ip netns exec server ip link show dev s1 | grep -E 'link/ether [0-9a-f:]+' | awk '{print $2;}'` + HW_SERVER=$(ip netns exec server ip link show dev s1 \ + | awk '/link\/ether/ {print $2}') + [ -z "$HW_SERVER" ] && \ + { info "failed to get server hwaddr"; return 1; } info "Server hwaddr: $HW_SERVER" ovs_add_flow "test_arp_ping" arpping \ @@ -830,6 +1042,119 @@ test_tunnel_metadata() { return 0 } 
+test_tunnel_refcount() { + sbxname="test_tunnel_refcount" + sbx_add "${sbxname}" || return 1 + + ovs_sbx "${sbxname}" ip netns add trefns || return 1 + on_exit "ovs_sbx ${sbxname} ip netns del trefns" + + for tun_type in gre vxlan geneve; do + info "testing ${tun_type} tunnel vport refcount" + + ovs_sbx "${sbxname}" ip netns exec trefns \ + python3 $ovs_base/ovs-dpctl.py \ + add-dp dp-${tun_type} || return 1 + + ovs_sbx "${sbxname}" ip netns exec trefns \ + python3 $ovs_base/ovs-dpctl.py \ + add-if --no-lwt -t ${tun_type} \ + dp-${tun_type} ovs-${tun_type}0 || return 1 + + ovs_wait ip -netns trefns link show \ + ovs-${tun_type}0 >/dev/null 2>&1 || return 1 + + info "deleting dp - may hang if reference counting is broken" + ovs_sbx "${sbxname}" ip netns exec trefns \ + python3 $ovs_base/ovs-dpctl.py \ + del-dp dp-${tun_type} & + + dev_removed() { + ! ip -netns trefns link show "$1" >/dev/null 2>&1 + } + ovs_wait dev_removed dp-${tun_type} || return 1 + ovs_wait dev_removed ovs-${tun_type}0 || return 1 + done + + return 0 +} + +test_pop_vlan() { + local sbx="test_pop_vlan" + sbx_add "$sbx" || return $? 
+ ovs_add_dp "$sbx" vlandp || return 1 + + ovs_add_netns_and_veths "$sbx" vlandp \ + ns1 veth1 ns1veth 192.0.2.1/24 || return 1 + ovs_add_netns_and_veths "$sbx" vlandp \ + ns2 veth2 ns2veth 192.0.2.2/24 || return 1 + + # Baseline: untagged bidirectional forwarding + ovs_add_flow "$sbx" vlandp \ + 'in_port(1),eth(),eth_type(0x0806),arp()' '2' || return 1 + ovs_add_flow "$sbx" vlandp \ + 'in_port(2),eth(),eth_type(0x0806),arp()' '1' || return 1 + ovs_add_flow "$sbx" vlandp \ + 'in_port(1),eth(),eth_type(0x0800),ipv4()' '2' || return 1 + ovs_add_flow "$sbx" vlandp \ + 'in_port(2),eth(),eth_type(0x0800),ipv4()' '1' || return 1 + ovs_sbx "$sbx" ip netns exec ns1 ping -c 3 -W 2 \ + 192.0.2.2 || return 1 + + # VLAN topology: ns1 uses VLAN sub-interface, ns2 is plain + ip -n ns1 link add link ns1veth name ns1veth.10 \ + type vlan id 10 || return 1 + on_exit "ip -n ns1 link del ns1veth.10 2>/dev/null" + ip -n ns1 addr add 198.51.100.1/24 dev ns1veth.10 || return 1 + ip -n ns1 link set ns1veth.10 up || return 1 + ip -n ns2 addr add 198.51.100.2/24 dev ns2veth || return 1 + + ovs_del_flows "$sbx" vlandp + + # Static ARP: avoids VLAN-tagged ARP complexity + local ns1veth10mac ns2mac + ns1veth10mac=$(ip -n ns1 link show ns1veth.10 \ + | awk '/link\/ether/ {print $2}') + [ -z "$ns1veth10mac" ] && \ + { info "failed to get ns1veth10mac"; return 1; } + ns2mac=$(ip -n ns2 link show ns2veth \ + | awk '/link\/ether/ {print $2}') + [ -z "$ns2mac" ] && \ + { info "failed to get ns2mac"; return 1; } + ip -n ns1 neigh replace 198.51.100.2 lladdr "$ns2mac" \ + dev ns1veth.10 nud permanent || return 1 + ip -n ns2 neigh replace 198.51.100.1 \ + lladdr "$ns1veth10mac" \ + dev ns2veth nud permanent || return 1 + + local vlan_match='in_port(1),eth(),eth_type(0x8100),' + vlan_match+='vlan(vid=10),' + vlan_match+='encap(eth_type(0x0800),' + vlan_match+='ipv4(src=198.51.100.1,proto=1),icmp())' + + # Negative: forward without pop_vlan -- tagged frame + # is invisible to ns2 (no VLAN 
sub-interface), ping fails + ovs_add_flow "$sbx" vlandp "$vlan_match" '2' || return 1 + ovs_sbx "$sbx" ip netns exec ns1 ping -I ns1veth.10 \ + -c 3 -W 1 198.51.100.2 >/dev/null 2>&1 \ + && { info "FAIL: ping should fail without pop_vlan" + return 1; } + + ovs_del_flows "$sbx" vlandp + + # Positive: pop_vlan strips tag on forward path, + # push_vlan restores tag on return path -- ping succeeds + ovs_add_flow "$sbx" vlandp \ + "$vlan_match" 'pop_vlan,2' || return 1 + ovs_add_flow "$sbx" vlandp \ + 'in_port(2),eth(),eth_type(0x0800),ipv4()' \ + 'push_vlan(vid=10,pcp=0,tpid=0x8100),1' || return 1 + ovs_sbx "$sbx" ip netns exec ns1 ping -I ns1veth.10 \ + -c 3 -W 2 198.51.100.2 || return 1 + + return 0 +} + run_test() { ( tname="$1" diff --git a/tools/testing/selftests/net/openvswitch/ovs-dpctl.py b/tools/testing/selftests/net/openvswitch/ovs-dpctl.py index 848f61fdcee0..e1ecfad2c03e 100644 --- a/tools/testing/selftests/net/openvswitch/ovs-dpctl.py +++ b/tools/testing/selftests/net/openvswitch/ovs-dpctl.py @@ -11,7 +11,6 @@ import logging import math import multiprocessing import re -import socket import struct import sys import time @@ -370,7 +369,7 @@ class ovsactions(nla): ("OVS_ACTION_ATTR_OUTPUT", "uint32"), ("OVS_ACTION_ATTR_USERSPACE", "userspace"), ("OVS_ACTION_ATTR_SET", "ovskey"), - ("OVS_ACTION_ATTR_PUSH_VLAN", "none"), + ("OVS_ACTION_ATTR_PUSH_VLAN", "push_vlan"), ("OVS_ACTION_ATTR_POP_VLAN", "flag"), ("OVS_ACTION_ATTR_SAMPLE", "sample"), ("OVS_ACTION_ATTR_RECIRC", "uint32"), @@ -389,11 +388,21 @@ class ovsactions(nla): ("OVS_ACTION_ATTR_CLONE", "recursive"), ("OVS_ACTION_ATTR_CHECK_PKT_LEN", "none"), ("OVS_ACTION_ATTR_ADD_MPLS", "none"), - ("OVS_ACTION_ATTR_DEC_TTL", "none"), + ("OVS_ACTION_ATTR_DEC_TTL", "dec_ttl"), ("OVS_ACTION_ATTR_DROP", "uint32"), ("OVS_ACTION_ATTR_PSAMPLE", "psample"), ) + class dec_ttl(nla): # pylint: disable=invalid-name + """Nested OVS_DEC_TTL_ATTR_* sub-attributes.""" + + nla_flags = NLA_F_NESTED + + nla_map = ( + 
("OVS_DEC_TTL_ATTR_UNSPEC", "none"), + ("OVS_DEC_TTL_ATTR_ACTION", "actions"), + ) + class psample(nla): nla_flags = NLA_F_NESTED @@ -427,6 +436,9 @@ class ovsactions(nla): return actstr + class push_vlan(nla): + fields = (("vlan_tpid", "!H"), ("vlan_tci", "!H")) + class sample(nla): nla_flags = NLA_F_NESTED @@ -633,6 +645,21 @@ class ovsactions(nla): print_str += "ct_clear" elif field[0] == "OVS_ACTION_ATTR_POP_VLAN": print_str += "pop_vlan" + elif field[0] == "OVS_ACTION_ATTR_DEC_TTL": + datum = self.get_attr(field[0]) + print_str += "dec_ttl(le_1(" + subacts = datum.get_attr("OVS_DEC_TTL_ATTR_ACTION") + if subacts and subacts.get("attrs"): + print_str += subacts.dpstr(more) + print_str += "))" + elif field[0] == "OVS_ACTION_ATTR_PUSH_VLAN": + datum = self.get_attr(field[0]) + tpid = datum["vlan_tpid"] + tci = datum["vlan_tci"] + vid = tci & 0x0FFF + pcp = (tci >> 13) & 0x7 + print_str += "push_vlan(vid=%d,pcp=%d" \ + ",tpid=0x%04x)" % (vid, pcp, tpid) elif field[0] == "OVS_ACTION_ATTR_POP_ETH": print_str += "pop_eth" elif field[0] == "OVS_ACTION_ATTR_POP_NSH": @@ -726,7 +753,71 @@ class ovsactions(nla): actstr = actstr[strspn(actstr, ", ") :] parsed = True - if parse_starts_block(actstr, "clone(", False): + if parse_starts_block(actstr, "push_vlan(", False): + actstr = actstr[len("push_vlan("):] + vid = 0 + pcp = 0 + tpid = 0x8100 + if ")" not in actstr: + raise ValueError( + "push_vlan(): missing ')'") + paren = actstr.index(")") + if not actstr[:paren].strip(): + raise ValueError("push_vlan(): no fields") + for kv in actstr[:paren].split(","): + if "=" not in kv: + raise ValueError( + "push_vlan(): bad field '%s'" + % kv.strip()) + k = kv[:kv.index("=")].strip() + v = kv[kv.index("=") + 1:].strip() + if k == "vid": + vid = int(v, 0) + if vid < 0 or vid > 0xFFF: + raise ValueError( + "push_vlan(): vid=%d out of " + "range (0-4095)" % vid) + elif k == "pcp": + pcp = int(v, 0) + if pcp < 0 or pcp > 7: + raise ValueError( + "push_vlan(): pcp=%d out of " + "range 
(0-7)" % pcp) + elif k == "tpid": + tpid = int(v, 0) + if tpid < 0 or tpid > 0xFFFF: + raise ValueError( + "push_vlan(): tpid=0x%x out " + "of range (0-0xffff)" % tpid) + else: + raise ValueError( + "push_vlan(): unknown key '%s'" + % k) + tci = (vid & 0x0FFF) | ((pcp & 0x7) << 13) \ + | 0x1000 + pvact = self.push_vlan() + pvact["vlan_tpid"] = tpid + pvact["vlan_tci"] = tci + self["attrs"].append( + ["OVS_ACTION_ATTR_PUSH_VLAN", pvact]) + actstr = actstr[paren + 1:] + parsed = True + + elif parse_starts_block(actstr, "dec_ttl(le_1(", False): + parencount += 2 + subacts = ovsactions() + actstr = actstr[len("dec_ttl(le_1("):] + parsed_len = subacts.parse(actstr) + decttl = ovsactions.dec_ttl() + decttl["attrs"].append( + ("OVS_DEC_TTL_ATTR_ACTION", subacts) + ) + self["attrs"].append( + ("OVS_ACTION_ATTR_DEC_TTL", decttl) + ) + actstr = actstr[parsed_len:] + parsed = True + elif parse_starts_block(actstr, "clone(", False): parencount += 1 subacts = ovsactions() actstr = actstr[len("clone("):] @@ -897,15 +988,21 @@ class ovsactions(nla): return (totallen - len(actstr)) +# pyroute2 resolves nla_map types via getattr(self, name). +# dec_ttl needs "actions" to resolve to ovsactions, but +# ovsactions is not defined when dec_ttl class body runs. 
+ovsactions.dec_ttl.actions = ovsactions + + class ovskey(nla): nla_flags = NLA_F_NESTED nla_map = ( ("OVS_KEY_ATTR_UNSPEC", "none"), - ("OVS_KEY_ATTR_ENCAP", "none"), + ("OVS_KEY_ATTR_ENCAP", "encap_ovskey"), ("OVS_KEY_ATTR_PRIORITY", "uint32"), ("OVS_KEY_ATTR_IN_PORT", "uint32"), ("OVS_KEY_ATTR_ETHERNET", "ethaddr"), - ("OVS_KEY_ATTR_VLAN", "uint16"), + ("OVS_KEY_ATTR_VLAN", "be16"), ("OVS_KEY_ATTR_ETHERTYPE", "be16"), ("OVS_KEY_ATTR_IPV4", "ovs_key_ipv4"), ("OVS_KEY_ATTR_IPV6", "ovs_key_ipv6"), @@ -1636,6 +1733,194 @@ class ovskey(nla): class ovs_key_mpls(nla): fields = (("lse", ">I"),) + # 802.1Q CFI (Canonical Format Indicator) bit, always set for Ethernet + _VLAN_CFI_MASK = 0x1000 + + @staticmethod + def _vlan_dpstr(tci): + """Format VLAN TCI as vid=X,pcp=Y,cfi=Z or tci=0xNNNN. + + When cfi=1 (standard Ethernet VLAN), outputs decomposed + vid/pcp/cfi fields. When cfi=0 (truncated VLAN header), + falls back to raw tci=0x%04x to ensure round-trip + correctness: the parser auto-adds cfi=1 for vid/pcp + format, so cfi=0 would be lost on re-parse.""" + vid = tci & 0x0FFF + pcp = (tci >> 13) & 0x7 + cfi = (tci >> 12) & 0x1 + if cfi: + return "vid=%d,pcp=%d,cfi=%d" % (vid, pcp, cfi) + return "tci=0x%04x" % tci + + @staticmethod + def _parse_vlan_from_flowstr(flowstr): + """Parse vlan(tci=X) or vlan(vid=X[,pcp=Y,cfi=Z]) from flowstr. + + Returns (remaining_flowstr, key_tci, mask_tci). + TCI values use standard bit layout (VID bits 0-11, + CFI bit 12, PCP bits 13-15); byte order conversion to + big-endian happens in pyroute2 be16 NLA serialization. + The mask covers only the fields the caller specified: + vid -> 0x0FFF, pcp -> 0xE000, cfi -> 0x1000, tci -> 0xFFFF. + + The tci= key sets the raw TCI bitfield (no CFI validation) to allow + non-Ethernet use cases. Use cfi=1 for standard Ethernet VLAN matching. 
+ """ + tci = 0 + mask = 0 + has_tci = False + has_vid = has_pcp = has_cfi = False + _tci_mix_err = "vlan(): 'tci' cannot be mixed " \ + "with 'vid'/'pcp'/'cfi'" + first = True + while True: + flowstr = flowstr.lstrip() + if not flowstr: + raise ValueError("vlan(): missing ')'") + if flowstr[0] == ')': + break + if not first: + flowstr = flowstr[1:] # skip ',' + if not flowstr: + raise ValueError("vlan(): missing ')' after trailing comma") + flowstr = flowstr.lstrip() + if flowstr and flowstr[0] == ')': + break + if flowstr and flowstr[0] == ',': + raise ValueError( + "vlan(): empty or extra comma in field list") + first = False + + eq = flowstr.find('=') + if eq == -1: + raise ValueError( + "vlan(): expected key=value, got '%s'" % flowstr) + key = flowstr[:eq].strip() + flowstr = flowstr[eq + 1:] + + end = flowstr.find(',') + end2 = flowstr.find(')') + if end == -1 and end2 == -1: + raise ValueError("vlan(): missing ')'") + if end == -1 or (end2 != -1 and end2 < end): + end = end2 + val = flowstr[:end].strip() + flowstr = flowstr[end:] + + if not val: + raise ValueError("vlan(): empty value for key '%s'" % key) + try: + v = int(val, 0) + except ValueError as exc: + raise ValueError( + "vlan(): invalid value '%s' for key '%s'" + % (val, key)) from exc + + if key == 'tci': + if has_tci: + raise ValueError("vlan(): duplicate 'tci'") + if has_vid or has_pcp or has_cfi: + raise ValueError(_tci_mix_err) + if v > 0xFFFF or v < 0: + raise ValueError("vlan(): tci=0x%x out of range" % v) + tci = v + mask = 0xFFFF + has_tci = True + elif key == 'vid': + if has_tci: + raise ValueError(_tci_mix_err) + if has_vid: + raise ValueError("vlan(): duplicate 'vid'") + if v < 0 or v > 0xFFF: + raise ValueError("vlan(): vid=%d out of range (0-4095)" % v) + tci |= v + mask |= 0x0FFF + has_vid = True + elif key == 'pcp': + if has_tci: + raise ValueError(_tci_mix_err) + if has_pcp: + raise ValueError("vlan(): duplicate 'pcp'") + if v < 0 or v > 7: + raise ValueError("vlan(): pcp=%d out of 
range (0-7)" % v) + tci |= (v & 0x7) << 13 + mask |= 0xE000 + has_pcp = True + elif key == 'cfi': + if has_tci: + raise ValueError(_tci_mix_err) + if has_cfi: + raise ValueError("vlan(): duplicate 'cfi'") + if v != 1: + raise ValueError("vlan(): cfi must be 1 for Ethernet") + tci |= ovskey._VLAN_CFI_MASK + mask |= ovskey._VLAN_CFI_MASK + has_cfi = True + else: + raise ValueError("vlan(): unknown key '%s'" % key) + + flowstr = flowstr[1:] # skip ')' + # Catch immediate '))' (user error). A ')' after ',' is consumed + # by parse()'s strspn(flowstr, "), ") inter-field separator stripping. + if flowstr.lstrip().startswith(')'): + raise ValueError("vlan(): unmatched ')'") + # parse() strips trailing ',', ')', ' ' as inter-field separators, + # so we do not need to call strspn here. + + if mask == 0: + raise ValueError("vlan(): no fields specified, " + "use vlan(vid=X[,pcp=Y,cfi=Z]) or vlan(tci=X)") + if not has_tci: + tci |= ovskey._VLAN_CFI_MASK + mask |= ovskey._VLAN_CFI_MASK + return flowstr, tci, mask + + @staticmethod + def _parse_encap_from_flowstr(flowstr): + """Parse encap(inner_flow) from flowstr. + + Returns (remaining_flowstr, inner_key_dict, inner_mask_dict) + where each dict has an 'attrs' key for recursive NLA encoding. + Parenthesis-depth tracking handles nested encap() calls but not + quoted strings containing literal parentheses. 
+ """ + depth = 1 + end = -1 + for i, c in enumerate(flowstr): + if c == '(': + depth += 1 + elif c == ')': + depth -= 1 + if depth < 0: + raise ValueError( + "encap(): unmatched ')' at position %d" % i) + if depth == 0: + end = i + break + + if end == -1: + if depth > 1: + raise ValueError("encap(): missing ')' in nested encap") + raise ValueError("encap(): missing ')'") + + inner_str = flowstr[:end].strip() + if not inner_str: + raise ValueError("encap(): empty inner flow") + + flowstr = flowstr[end + 1:] + if flowstr.lstrip().startswith(')'): + raise ValueError("encap(): unmatched ')' after encap()") + + inner_key = encap_ovskey() + inner_mask = encap_ovskey() + remaining = inner_key.parse(inner_str, inner_mask) + if remaining and re.search(r'[^\s,)]', remaining): + raise ValueError( + "encap(): unrecognized trailing " + "content '%s'" % remaining.strip()) + + return flowstr, inner_key, inner_mask + def parse(self, flowstr, mask=None): for field in ( ("OVS_KEY_ATTR_PRIORITY", "skb_priority", intparse), @@ -1658,6 +1943,16 @@ class ovskey(nla): lambda x: intparse(x, "0xffff"), ), ( + "OVS_KEY_ATTR_VLAN", + "vlan", + ovskey._parse_vlan_from_flowstr, + ), + ( + "OVS_KEY_ATTR_ENCAP", + "encap", + ovskey._parse_encap_from_flowstr, + ), + ( "OVS_KEY_ATTR_IPV4", "ipv4", ovskey.ovs_key_ipv4, @@ -1794,6 +2089,9 @@ class ovskey(nla): True, ), ("OVS_KEY_ATTR_ETHERNET", None, None, False, False), + ("OVS_KEY_ATTR_VLAN", "vlan", ovskey._vlan_dpstr, + lambda x: False, True), + ("OVS_KEY_ATTR_ENCAP", None, None, False, False), ( "OVS_KEY_ATTR_ETHERTYPE", "eth_type", @@ -1821,22 +2119,63 @@ class ovskey(nla): v = self.get_attr(field[0]) if v is not None: m = None if mask is None else mask.get_attr(field[0]) + fmt = field[2] # str format or callable if field[4] is False: print_str += v.dpstr(m, more) print_str += "," else: if m is None or field[3](m): - print_str += field[1] + "(" - print_str += field[2] % v - print_str += ")," + val = fmt(v) if callable(fmt) else fmt % v + 
print_str += field[1] + "(" + val + ")," elif more or m != 0: - print_str += field[1] + "(" - print_str += (field[2] % v) + "/" + (field[2] % m) - print_str += ")," + if field[0] == "OVS_KEY_ATTR_VLAN": + val = "tci=0x%04x/0x%04x" % (v, m) + elif callable(fmt): + val = fmt(v) + "/" + fmt(m) + else: + val = (fmt % v) + "/" + (fmt % m) + print_str += field[1] + "(" + val + ")," return print_str +class encap_ovskey(ovskey): + """Inner flow key attributes valid inside 802.1Q ENCAP. + + Only L2-L4 key attributes (slots 0-21) appear inside ENCAP. + Metadata-only attributes (SKB_MARK, DP_HASH, RECIRC_ID, etc.) + are set to "none" -- they never appear inside ENCAP per + ovs_nla_put_vlan() in net/openvswitch/flow_netlink.c. + + nla_map indexes must match OVS_KEY_ATTR_* enum values in + include/uapi/linux/openvswitch.h. + """ + nla_map = ( + ("OVS_KEY_ATTR_UNSPEC", "none"), + ("OVS_KEY_ATTR_ENCAP", "none"), # placeholder, parsed by ovskey + ("OVS_KEY_ATTR_PRIORITY", "none"), # skb metadata, not in ENCAP + ("OVS_KEY_ATTR_IN_PORT", "none"), # skb metadata, not in ENCAP + ("OVS_KEY_ATTR_ETHERNET", "ethaddr"), + ("OVS_KEY_ATTR_VLAN", "be16"), + ("OVS_KEY_ATTR_ETHERTYPE", "be16"), + ("OVS_KEY_ATTR_IPV4", "ovs_key_ipv4"), + ("OVS_KEY_ATTR_IPV6", "ovs_key_ipv6"), + ("OVS_KEY_ATTR_TCP", "ovs_key_tcp"), + ("OVS_KEY_ATTR_UDP", "ovs_key_udp"), + ("OVS_KEY_ATTR_ICMP", "ovs_key_icmp"), + ("OVS_KEY_ATTR_ICMPV6", "ovs_key_icmpv6"), + ("OVS_KEY_ATTR_ARP", "ovs_key_arp"), + ("OVS_KEY_ATTR_ND", "ovs_key_nd"), + ("OVS_KEY_ATTR_SKB_MARK", "none"), # metadata, not in ENCAP + ("OVS_KEY_ATTR_TUNNEL", "none"), # tunnel metadata, not in ENCAP + ("OVS_KEY_ATTR_SCTP", "ovs_key_sctp"), + ("OVS_KEY_ATTR_TCP_FLAGS", "be16"), + ("OVS_KEY_ATTR_DP_HASH", "none"), # metadata, not in ENCAP + ("OVS_KEY_ATTR_RECIRC_ID", "none"), # metadata, not in ENCAP + ("OVS_KEY_ATTR_MPLS", "array(ovs_key_mpls)"), + ) + + class OvsPacket(GenericNetlinkSocket): OVS_PACKET_CMD_MISS = 1 # Flow table miss OVS_PACKET_CMD_ACTION = 
2 # USERSPACE action @@ -2069,7 +2408,7 @@ class OvsVport(GenericNetlinkSocket): elif vport_type == "internal": return OvsVport.OVS_VPORT_TYPE_INTERNAL elif vport_type == "gre": - return OvsVport.OVS_VPORT_TYPE_INTERNAL + return OvsVport.OVS_VPORT_TYPE_GRE elif vport_type == "vxlan": return OvsVport.OVS_VPORT_TYPE_VXLAN elif vport_type == "geneve": @@ -2121,6 +2460,7 @@ class OvsVport(GenericNetlinkSocket): ) TUNNEL_DEFAULTS = [("geneve", 6081), + ("gre", 0), ("vxlan", 4789)] for tnl in TUNNEL_DEFAULTS: @@ -2129,9 +2469,13 @@ class OvsVport(GenericNetlinkSocket): dport = tnl[1] if not lwt: + if tnl[0] == "gre": + # GRE tunnels have no options. + break + vportopt = OvsVport.ovs_vport_msg.vportopts() vportopt["attrs"].append( - ["OVS_TUNNEL_ATTR_DST_PORT", socket.htons(dport)] + ["OVS_TUNNEL_ATTR_DST_PORT", dport] ) msg["attrs"].append( ["OVS_VPORT_ATTR_OPTIONS", vportopt] @@ -2145,6 +2489,9 @@ class OvsVport(GenericNetlinkSocket): geneve_port=dport, geneve_collect_metadata=True, geneve_udp_zero_csum6_rx=1) + elif tnl[0] == "gre": + ipr.link("add", ifname=vport_ifname, kind="gretap", + gre_collect_metadata=True) elif tnl[0] == "vxlan": ipr.link("add", ifname=vport_ifname, kind=tnl[0], vxlan_learning=0, vxlan_collect_metadata=1, @@ -2355,9 +2702,10 @@ class OvsFlow(GenericNetlinkSocket): self["attrs"].append(["OVS_FLOW_ATTR_KEY", k]) self["attrs"].append(["OVS_FLOW_ATTR_MASK", m]) - a = ovsactions() - a.parse(actstr) - self["attrs"].append(["OVS_FLOW_ATTR_ACTIONS", a]) + if actstr is not None: + a = ovsactions() + a.parse(actstr) + self["attrs"].append(["OVS_FLOW_ATTR_ACTIONS", a]) def __init__(self): GenericNetlinkSocket.__init__(self) @@ -2391,6 +2739,25 @@ class OvsFlow(GenericNetlinkSocket): raise ne return reply + def mod_flow(self, dpifindex, flowmsg): + """Modify an existing flow in the kernel.""" + flowmsg["cmd"] = OVS_FLOW_CMD_SET + flowmsg["version"] = OVS_DATAPATH_VERSION + flowmsg["reserved"] = 0 + flowmsg["dpifindex"] = dpifindex + + try: + reply = 
self.nlm_request( + flowmsg, + msg_type=self.prid, + msg_flags=NLM_F_REQUEST | NLM_F_ACK, + ) + reply = reply[0] + except NetlinkError as ne: + print(flowmsg) + raise ne + return reply + def del_flows(self, dpifindex): """ Send a del message to the kernel that will drop all flows. @@ -2563,7 +2930,7 @@ def print_ovsdp_full(dp_lookup_rep, ifindex, ndb=NDB(), vpl=OvsVport()): if vpo: dpo = vpo.get_attr("OVS_TUNNEL_ATTR_DST_PORT") if dpo: - opts += " tnl-dport:%s" % socket.ntohs(dpo) + opts += " tnl-dport:%s" % dpo print( " port %d: %s (%s%s)" % ( @@ -2576,6 +2943,7 @@ def print_ovsdp_full(dp_lookup_rep, ifindex, ndb=NDB(), vpl=OvsVport()): def main(argv): + nlmsg_atoms.encap_ovskey = encap_ovskey nlmsg_atoms.ovskey = ovskey nlmsg_atoms.ovsactions = ovsactions @@ -2632,7 +3000,7 @@ def main(argv): "--ptype", type=str, default="netdev", - choices=["netdev", "internal", "geneve", "vxlan"], + choices=["netdev", "internal", "gre", "geneve", "vxlan"], help="Interface type (default netdev)", ) addifcmd.add_argument( @@ -2645,7 +3013,7 @@ def main(argv): addifcmd.add_argument( "-l", "--lwt", - type=bool, + action=argparse.BooleanOptionalAction, default=True, help="Use LWT infrastructure instead of vport (default true)." 
) @@ -2665,6 +3033,12 @@ def main(argv): addflcmd.add_argument("flow", help="Flow specification") addflcmd.add_argument("acts", help="Flow actions") + modflcmd = subparsers.add_parser("mod-flow") + modflcmd.add_argument("modbr", help="Datapath name") + modflcmd.add_argument("modflow", help="Flow specification") + modflcmd.add_argument("modacts", help="Flow actions", + nargs="?", default=None) + delfscmd = subparsers.add_parser("del-flows") delfscmd.add_argument("flsbr", help="Datapath name") @@ -2762,6 +3136,14 @@ def main(argv): flow = OvsFlow.ovs_flow_msg() flow.parse(args.flow, args.acts, rep["dpifindex"]) ovsflow.add_flow(rep["dpifindex"], flow) + elif hasattr(args, "modbr"): + rep = ovsdp.info(args.modbr, 0) + if rep is None: + print(f"DP '{args.modbr}' not found.") + return 1 + flow = OvsFlow.ovs_flow_msg() + flow.parse(args.modflow, args.modacts, rep["dpifindex"]) + ovsflow.mod_flow(rep["dpifindex"], flow) elif hasattr(args, "flsbr"): rep = ovsdp.info(args.flsbr, 0) if rep is None: diff --git a/tools/testing/selftests/net/ovpn/test-close-socket.sh b/tools/testing/selftests/net/ovpn/test-close-socket.sh index af1532b4d2da..ec9a51bbf3c9 100755 --- a/tools/testing/selftests/net/ovpn/test-close-socket.sh +++ b/tools/testing/selftests/net/ovpn/test-close-socket.sh @@ -53,7 +53,7 @@ ovpn_run_ping_traffic() { for p in $(seq 1 ${OVPN_NUM_PEERS}); do ovpn_cmd_ok "send ping traffic to peer ${p}" \ - ip netns exec ovpn_peer0 ping -qfc 500 -w 3 \ + ip netns exec ovpn_peer0 ping -qfc 100 -w 3 \ 5.5.5.$((p + 1)) done } diff --git a/tools/testing/selftests/net/ovpn/test-mark.sh b/tools/testing/selftests/net/ovpn/test-mark.sh index 5a8f47554286..7c1d56e9c525 100755 --- a/tools/testing/selftests/net/ovpn/test-mark.sh +++ b/tools/testing/selftests/net/ovpn/test-mark.sh @@ -66,7 +66,7 @@ ovpn_mark_run_baseline_traffic() { for p in $(seq 1 3); do ovpn_cmd_ok "send baseline traffic to peer ${p}" \ - ip netns exec ovpn_peer0 ping -qfc 500 -w 3 \ + ip netns exec ovpn_peer0 ping 
-qfc 100 -w 3 \ 5.5.5.$((p + 1)) done } @@ -101,7 +101,7 @@ ovpn_mark_verify_drop_traffic() { local total_count for p in $(seq 1 3); do - if ping_output=$(ip netns exec ovpn_peer0 ping -qfc 500 -w 1 \ + if ping_output=$(ip netns exec ovpn_peer0 ping -qfc 100 -w 1 \ 5.5.5.$((p + 1)) 2>&1); then printf '%s\n' "expected ping to peer ${p} to fail \ after nft drop rule" @@ -144,7 +144,7 @@ ovpn_mark_verify_traffic_recovery() { sleep 1 for p in $(seq 1 3); do ovpn_cmd_ok "send recovery traffic to peer ${p}" \ - ip netns exec ovpn_peer0 ping -qfc 500 -w 3 \ + ip netns exec ovpn_peer0 ping -qfc 100 -w 3 \ 5.5.5.$((p + 1)) done } diff --git a/tools/testing/selftests/net/ovpn/test.sh b/tools/testing/selftests/net/ovpn/test.sh index b50dbe45a4d0..9b5610837032 100755 --- a/tools/testing/selftests/net/ovpn/test.sh +++ b/tools/testing/selftests/net/ovpn/test.sh @@ -98,10 +98,10 @@ ovpn_run_basic_traffic() { sleep 0.3 ovpn_cmd_ok "send baseline traffic to peer ${p}" \ ip netns exec ovpn_peer0 \ - ping -qfc 500 -w 3 5.5.5.$((p + 1)) + ping -qfc 100 -w 3 5.5.5.$((p + 1)) ovpn_cmd_ok "send large-payload traffic to peer ${p}" \ ip netns exec ovpn_peer0 \ - ping -qfc 500 -s 3000 -w 3 5.5.5.$((p + 1)) + ping -qfc 100 -s 3000 -w 3 5.5.5.$((p + 1)) wait "${tcpdump_pid1}" || return 1 wait "${tcpdump_pid2}" || return 1 @@ -110,7 +110,7 @@ ovpn_run_basic_traffic() { ovpn_run_lan_traffic() { ovpn_cmd_ok "ping LAN behind peer1" \ - ip netns exec ovpn_peer0 ping -qfc 500 -w 3 "${OVPN_LAN_IP}" + ip netns exec ovpn_peer0 ping -qfc 100 -w 3 "${OVPN_LAN_IP}" } ovpn_run_float_mode() { @@ -127,7 +127,7 @@ ovpn_run_float_mode() { for p in $(seq 1 ${OVPN_NUM_PEERS}); do peer_ns="ovpn_peer${p}" ovpn_cmd_ok "ping tunnel after float peer ${p}" \ - ip netns exec "${peer_ns}" ping -qfc 500 -w 3 5.5.5.1 + ip netns exec "${peer_ns}" ping -qfc 100 -w 3 5.5.5.1 done } diff --git a/tools/testing/selftests/net/packetdrill/tcp_syncookies_ip4_9k.pkt 
b/tools/testing/selftests/net/packetdrill/tcp_syncookies_ip4_9k.pkt new file mode 100644 index 000000000000..60910069b3d7 --- /dev/null +++ b/tools/testing/selftests/net/packetdrill/tcp_syncookies_ip4_9k.pkt @@ -0,0 +1,37 @@ +// SPDX-License-Identifier: GPL-2.0 +// +// Check syncookies. +// +// Check we are able to rebuild client sack, wscale, ecn and mss options. +// IPv4 msstab[4] = { 536, 1300, 1440, 1460 } + +--ip_version=ipv4 + +`./defaults.sh +sysctl -q net.ipv4.tcp_syncookies=2 +ip link set dev tun0 mtu 9000 +` + + 0 socket(..., SOCK_STREAM, IPPROTO_TCP) = 3 + +0 setsockopt(3, SOL_SOCKET, SO_REUSEADDR, [1], 4) = 0 + +0 bind(3, ..., ...) = 0 + +0 listen(3, 10) = 0 + + +0 < S 0:0(0) win 32792 <mss 8960,sackOK,TS val 100 ecr 0,nop,wscale 10> + +0 > S. 0:0(0) ack 1 <mss 8960,sackOK,TS val 4000 ecr 100,nop,wscale 8> + +.01 < . 1:1(0) ack 1 win 1024 <nop,nop,TS val 110 ecr 4000> + + +0 accept(3, ..., ...) = 4 + +// Check we properly infer from the final packet the other peer wanted mss >= 1460, wscale 10, sackOK and no ECN. +// Note that mss is limited to 1460 - 12 because of IPv4 msstab[] +// This is only possible because TCP TS option was used. +// Linux uses the SYNACK TS.val 6 low order bits to encode the options. + + +0 %{ assert tcpi_snd_mss == 1460 - 12, tcpi_snd_mss; \ + assert tcpi_snd_wscale == 10, tcpi_snd_wscale; \ + assert (tcpi_options & TCPI_OPT_SACK) != 0, tcpi_options; \ + assert (tcpi_options & TCPI_OPT_TIMESTAMPS) != 0, tcpi_options; \ + assert (tcpi_options & TCPI_OPT_WSCALE) != 0, tcpi_options; \ + assert (tcpi_options & TCPI_OPT_ECN) == 0, tcpi_options +}% diff --git a/tools/testing/selftests/net/packetdrill/tcp_syncookies_ip6_9k.pkt b/tools/testing/selftests/net/packetdrill/tcp_syncookies_ip6_9k.pkt new file mode 100644 index 000000000000..f333c61044bc --- /dev/null +++ b/tools/testing/selftests/net/packetdrill/tcp_syncookies_ip6_9k.pkt @@ -0,0 +1,36 @@ +// SPDX-License-Identifier: GPL-2.0 +// +// Check syncookies. 
+// +// Check we are able to rebuild client sack, wscale, ecn and mss options. +// IPv6 msstab[4] = { 1280 - 60, 1480 - 60, 1500 - 60, 9000 - 60 } + +--ip_version=ipv6 + +`./defaults.sh +sysctl -q net.ipv4.tcp_syncookies=2 +ip link set dev tun0 mtu 9000 +` + + 0 socket(..., SOCK_STREAM, IPPROTO_TCP) = 3 + +0 setsockopt(3, SOL_SOCKET, SO_REUSEADDR, [1], 4) = 0 + +0 bind(3, ..., ...) = 0 + +0 listen(3, 10) = 0 + + +0 < S 0:0(0) win 32792 <mss 8940,sackOK,TS val 100 ecr 0,nop,wscale 10> + +0 > S. 0:0(0) ack 1 <mss 8940,sackOK,TS val 4000 ecr 100,nop,wscale 8> + +.01 < . 1:1(0) ack 1 win 1024 <nop,nop,TS val 110 ecr 4000> + + +0 accept(3, ..., ...) = 4 + +// Check we properly infer from the final packet the other peer wanted mss >= 8940, wscale 10, sackOK and no ECN. +// This is only possible because TCP TS option was used. +// Linux uses the SYNACK TS.val 6 low order bits to encode the options. + + +0 %{ assert tcpi_snd_mss == 8940 - 12, tcpi_snd_mss; \ + assert tcpi_snd_wscale == 10, tcpi_snd_wscale; \ + assert (tcpi_options & TCPI_OPT_SACK) != 0, tcpi_options; \ + assert (tcpi_options & TCPI_OPT_TIMESTAMPS) != 0, tcpi_options; \ + assert (tcpi_options & TCPI_OPT_WSCALE) != 0, tcpi_options; \ + assert (tcpi_options & TCPI_OPT_ECN) == 0, tcpi_options +}% diff --git a/tools/testing/selftests/net/ppp/Makefile b/tools/testing/selftests/net/ppp/Makefile index b39b0abadde6..6036fa134351 100644 --- a/tools/testing/selftests/net/ppp/Makefile +++ b/tools/testing/selftests/net/ppp/Makefile @@ -5,6 +5,7 @@ top_srcdir = ../../../../.. 
TEST_PROGS := \ ppp_async.sh \ pppoe.sh \ + pppol2tp.sh \ # end of TEST_PROGS TEST_FILES := \ diff --git a/tools/testing/selftests/net/ppp/config b/tools/testing/selftests/net/ppp/config index b45d25c5b970..843545df8f03 100644 --- a/tools/testing/selftests/net/ppp/config +++ b/tools/testing/selftests/net/ppp/config @@ -1,4 +1,5 @@ CONFIG_IPV6=y +CONFIG_L2TP=m CONFIG_PACKET=y CONFIG_PPP=m CONFIG_PPP_ASYNC=m @@ -6,4 +7,5 @@ CONFIG_PPP_BSDCOMP=m CONFIG_PPP_DEFLATE=m CONFIG_PPPOE=m CONFIG_PPPOE_HASH_BITS_4=y +CONFIG_PPPOL2TP=m CONFIG_VETH=y diff --git a/tools/testing/selftests/net/ppp/pppoe-server-options b/tools/testing/selftests/net/ppp/pppoe-server-options index 66c8c9d319e9..cd586be7061b 100644 --- a/tools/testing/selftests/net/ppp/pppoe-server-options +++ b/tools/testing/selftests/net/ppp/pppoe-server-options @@ -1,2 +1,3 @@ noauth noipdefault +nomagic diff --git a/tools/testing/selftests/net/ppp/pppol2tp.sh b/tools/testing/selftests/net/ppp/pppol2tp.sh new file mode 100755 index 000000000000..37c4d56c5c6e --- /dev/null +++ b/tools/testing/selftests/net/ppp/pppol2tp.sh @@ -0,0 +1,110 @@ +#!/bin/bash +# SPDX-License-Identifier: GPL-2.0 + +source ppp_common.sh + +VETH_SERVER="veth-server" +VETH_CLIENT="veth-client" +OUTER_IP_SERVER="172.16.1.1" +OUTER_IP_CLIENT="172.16.1.2" + +PPPOL2TP_DIR=$(mktemp -d /tmp/pppol2tp.XXXXXX) +PPPOL2TP_LOG="$PPPOL2TP_DIR/l2tp.log" + +# shellcheck disable=SC2329 +cleanup() { + cleanup_all_ns + [ -n "$SOCAT_PID" ] && kill_process "$SOCAT_PID" + rm -rf "$PPPOL2TP_DIR" +} + +trap cleanup EXIT + +require_command xl2tpd +ppp_common_init +modprobe -q l2tp_ppp + +# Create the veth pair +ip link add "$VETH_SERVER" type veth peer name "$VETH_CLIENT" +ip link set "$VETH_SERVER" netns "$NS_SERVER" +ip link set "$VETH_CLIENT" netns "$NS_CLIENT" +ip -netns "$NS_SERVER" link set "$VETH_SERVER" up +ip -netns "$NS_CLIENT" link set "$VETH_CLIENT" up +ip -netns "$NS_SERVER" address add dev "$VETH_SERVER" "$OUTER_IP_SERVER" peer "$OUTER_IP_CLIENT" +ip 
-netns "$NS_CLIENT" address add dev "$VETH_CLIENT" "$OUTER_IP_CLIENT" peer "$OUTER_IP_SERVER" + +# Start socat as syslog listener +socat -v -u UNIX-RECV:/dev/log OPEN:/dev/null > "$PPPOL2TP_LOG" 2>&1 & +SOCAT_PID=$! + +# Generate configuration files +cat > "$PPPOL2TP_DIR/l2tp-server.conf" <<EOF +[global] +listen-addr = $OUTER_IP_SERVER +access control = no + +[lns default] +ip range = $IP_CLIENT +local ip = $IP_SERVER +require authentication = no +require chap = no +require pap = no +ppp debug = yes +pppoptfile = $(pwd)/pppoe-server-options +EOF + +cat > "$PPPOL2TP_DIR/l2tp-client.conf" <<EOF +[global] +listen-addr = $OUTER_IP_CLIENT +access control = no + +[lac server] +lns = $OUTER_IP_SERVER +require authentication = no +require chap = no +require pap = no +ppp debug = yes +pppoptfile = $(pwd)/pppoe-server-options +EOF + +# Start the L2TP Server +ip netns exec "$NS_SERVER" xl2tpd -D -c "$PPPOL2TP_DIR/l2tp-server.conf" \ + -p "$PPPOL2TP_DIR/l2tp-server.pid" -C "$PPPOL2TP_DIR/l2tp-server.control" & + +# Start the L2TP Client +ip netns exec "$NS_CLIENT" xl2tpd -D -c "$PPPOL2TP_DIR/l2tp-client.conf" \ + -p "$PPPOL2TP_DIR/l2tp-client.pid" -C "$PPPOL2TP_DIR/l2tp-client.control" & + +# Wait for xl2tpd to start and open their control pipes +slowwait 2 [ -p "$PPPOL2TP_DIR/l2tp-server.control" ] +slowwait 2 [ -p "$PPPOL2TP_DIR/l2tp-client.control" ] + +# Connect LAC to LNS +echo "c server" > "$PPPOL2TP_DIR/l2tp-client.control" + +ppp_test_connectivity + +log_test "PPPoL2TP" + +# Recursion test +RET=0 +# Delete route to LNS IP +ip -netns "$NS_CLIENT" route del "$OUTER_IP_SERVER" +# Add default route through ppp0 +ip -netns "$NS_CLIENT" route add default dev ppp0 +# ping (we expect the ping to fail but not deadlock the system) +ip netns exec "$NS_CLIENT" ping -c 1 "$IP_SERVER" -w 1 +check_fail $? 
+ +log_test "PPPoL2TP Recursion" + +# Dump syslog messages if the test failed +if [ "$EXIT_STATUS" -ne 0 ]; then + while read -r _sign _date _time len _from _to + do len=${len##*=} + read -n "$len" -r LINE + echo "$LINE" + done < "$PPPOL2TP_LOG" +fi + +exit "$EXIT_STATUS" diff --git a/tools/testing/selftests/net/protodown.sh b/tools/testing/selftests/net/protodown.sh new file mode 100755 index 000000000000..0a7b78c63c37 --- /dev/null +++ b/tools/testing/selftests/net/protodown.sh @@ -0,0 +1,182 @@ +#!/bin/bash +# SPDX-License-Identifier: GPL-2.0 +# +# Test the protodown mechanism. Verify basic protodown toggling, protodown +# reasons, operational state when the lower device carrier changes, and correct +# operational state when the lower device has no carrier. + +# shellcheck disable=SC1091,SC2034,SC2154,SC2317 +source lib.sh + +require_command jq + +ALL_TESTS=" + protodown_basic_macvlan + protodown_basic_vxlan + protodown_reasons + protodown_lower_toggle + protodown_lower_down +" + +operstate_get() +{ + local ns=$1; shift + local dev=$1; shift + + ip -n "$ns" -j link show dev "$dev" | jq -r '.[].operstate' +} + +operstate_check() +{ + local ns=$1; shift + local dev=$1; shift + local expected=$1; shift + + local current + current=$(operstate_get "$ns" "$dev") + + [ "$current" = "$expected" ] +} + +setup_prepare() +{ + setup_ns NS + defer cleanup_all_ns + + ip -n "$NS" link add name dummy0 up type dummy + + ip -n "$NS" link add name macvlan0 link dummy0 up type macvlan mode bridge + + ip -n "$NS" link add name vxlan0 up type vxlan id 10010 dstport 4789 +} + +protodown_basic() +{ + local dev=$1; shift + + ip -n "$NS" link set dev "$dev" protodown on + check_err $? "Failed to set protodown on" + + busywait "$BUSYWAIT_TIMEOUT" operstate_check "$NS" "$dev" DOWN + check_err $? "Operational state is not DOWN after setting protodown" + + ip -n "$NS" link set dev "$dev" protodown off + check_err $? 
"Failed to set protodown off" + + busywait "$BUSYWAIT_TIMEOUT" operstate_check "$NS" "$dev" UP + check_err $? "Operational state is not UP after clearing protodown" +} + +protodown_basic_macvlan() +{ + RET=0 + + protodown_basic macvlan0 + + log_test "Basic protodown on/off with macvlan" +} + +protodown_basic_vxlan() +{ + RET=0 + + protodown_basic vxlan0 + + log_test "Basic protodown on/off with vxlan" +} + +protodown_reasons() +{ + RET=0 + + ip -n "$NS" link set dev macvlan0 protodown on + + ip -n "$NS" link set dev macvlan0 protodown_reason 0 on + check_err $? "Failed to set protodown reason bit 0" + + # Cannot clear protodown while reasons are active. + ip -n "$NS" link set dev macvlan0 protodown off 2>/dev/null + check_fail $? "Clearing protodown succeeded with active reasons" + + ip -n "$NS" link set dev macvlan0 protodown_reason 0 off + check_err $? "Failed to clear protodown reason bit 0" + + # Can clear protodown when no reasons are active. + ip -n "$NS" link set dev macvlan0 protodown off + check_err $? "Failed to clear protodown with no active reasons" + + busywait "$BUSYWAIT_TIMEOUT" operstate_check "$NS" macvlan0 UP + check_err $? "Operational state is not UP after clearing protodown" + + log_test "Protodown reasons" +} + +protodown_lower_toggle() +{ + RET=0 + + ip -n "$NS" link set dev macvlan0 protodown on + + busywait "$BUSYWAIT_TIMEOUT" operstate_check "$NS" macvlan0 DOWN + check_err $? "Operational state is not DOWN after setting protodown" + + # Toggle carrier on the lower device. The macvlan should stay DOWN + # because protodown is on. + ip -n "$NS" link set dev dummy0 carrier off + ip -n "$NS" link set dev dummy0 carrier on + + busywait "$BUSYWAIT_TIMEOUT" operstate_check "$NS" dummy0 UP + check_err $? "Lower device is not UP after carrier on" + + busywait "$BUSYWAIT_TIMEOUT" operstate_check "$NS" macvlan0 DOWN + check_err $? "Macvlan operational state is not DOWN despite protodown" + + # Clear protodown and verify the macvlan comes back up. 
+ ip -n "$NS" link set dev macvlan0 protodown off + + busywait "$BUSYWAIT_TIMEOUT" operstate_check "$NS" macvlan0 UP + check_err $? "Operational state is not UP after clearing protodown" + + log_test "Protodown with lower device toggled" +} + +protodown_lower_down() +{ + RET=0 + + # Bring the lower device carrier down first. + ip -n "$NS" link set dev dummy0 carrier off + + busywait "$BUSYWAIT_TIMEOUT" operstate_check "$NS" macvlan0 LOWERLAYERDOWN + check_err $? "Macvlan is not LOWERLAYERDOWN with lower carrier off" + + # Toggle protodown on and off while lower has no carrier. The macvlan + # should not transition to UP. + ip -n "$NS" link set dev macvlan0 protodown on + + busywait "$BUSYWAIT_TIMEOUT" operstate_check "$NS" macvlan0 LOWERLAYERDOWN + check_err $? "Macvlan is not LOWERLAYERDOWN after setting protodown" + + ip -n "$NS" link set dev macvlan0 protodown off + + busywait "$BUSYWAIT_TIMEOUT" operstate_check "$NS" macvlan0 LOWERLAYERDOWN + check_err $? "Macvlan is not LOWERLAYERDOWN after clearing protodown" + + # Bring the lower device carrier up. The macvlan should transition to + # UP. + ip -n "$NS" link set dev dummy0 carrier on + + busywait "$BUSYWAIT_TIMEOUT" operstate_check "$NS" dummy0 UP + check_err $? "Lower device is not UP after carrier on" + + busywait "$BUSYWAIT_TIMEOUT" operstate_check "$NS" macvlan0 UP + check_err $? 
"Macvlan is not UP after lower device is UP" + + log_test "Protodown with lower device down" +} + +trap defer_scopes_cleanup EXIT +setup_prepare +tests_run + +exit "$EXIT_STATUS" diff --git a/tools/testing/selftests/net/rds/.gitignore b/tools/testing/selftests/net/rds/.gitignore index 1c6f04e2aa11..7ca4b1440f51 100644 --- a/tools/testing/selftests/net/rds/.gitignore +++ b/tools/testing/selftests/net/rds/.gitignore @@ -1 +1,2 @@ include.sh +getsockopt diff --git a/tools/testing/selftests/net/rds/Makefile b/tools/testing/selftests/net/rds/Makefile index fe363be8e358..ab9e92399a6d 100644 --- a/tools/testing/selftests/net/rds/Makefile +++ b/tools/testing/selftests/net/rds/Makefile @@ -3,7 +3,9 @@ all: @echo mk_build_dir="$(shell pwd)" > include.sh -TEST_PROGS := run.sh +TEST_PROGS := rds_run.sh + +TEST_GEN_PROGS := getsockopt TEST_FILES := \ include.sh \ @@ -16,4 +18,6 @@ EXTRA_CLEAN := \ /tmp/rds_logs \ # end of EXTRA_CLEAN +CFLAGS += $(KHDR_INCLUDES) + include ../../lib.mk diff --git a/tools/testing/selftests/net/rds/README.txt b/tools/testing/selftests/net/rds/README.txt index c6fe003d503b..8aa41148b1b5 100644 --- a/tools/testing/selftests/net/rds/README.txt +++ b/tools/testing/selftests/net/rds/README.txt @@ -1,21 +1,27 @@ RDS self-tests ============== -These scripts provide a coverage test for RDS-TCP by creating two -network namespaces and running rds packets between them. A loopback -network is provisioned with optional probability of packet loss or -corruption. A workload of 50000 hashes, each 64 characters in size, -are passed over an RDS socket on this test network. A passing test means -the RDS-TCP stack was able to recover properly. The provided config.sh -can be used to compile the kernel with the necessary gcov options. The -kernel may optionally be configured to omit the coverage report as well. +These scripts provide a coverage test for RDS-TCP and RDS-RDMA (over +RoCE/RXE) by setting up two endpoints and running RDS packets between +them. 
The TCP path creates two network namespaces; the RDMA path uses +an RXE (soft RoCE) device backed by a veth pair. A workload of 50000 +hashes, each 64 characters in size, is passed over an RDS socket on +this test network with an optional probability of packet loss or +corruption. A passing test means the RDS stack was able to recover +properly. The provided config.sh can be used to compile the kernel +with the necessary gcov options; pass -r to also enable the kernel +configs required for the RDMA transport. The kernel may optionally be +configured to omit the coverage report as well. USAGE: - run.sh [-d logdir] [-l packet_loss] [-c packet_corruption] - [-u packet_duplcate] + rds_run.sh [-d logdir] [-l packet_loss] [-c packet_corruption] + [-u packet_duplicate] [-t timeout] + [-T tcp|rdma|tcp,rdma] OPTIONS: - -d Log directory. Defaults to tools/testing/selftests/net/rds/rds_logs + -d Log directory. If set, logs will be stored in the + given dir, or skipped if unset. Log dir can also be + set through the RDS_LOG_DIR env variable -l Simulates a percentage of packet loss @@ -23,11 +29,36 @@ OPTIONS: -u Simulates a percentage of packet duplication. + -t Test timeout. Defaults to the timeout value in tools/testing/selftests/net/rds/settings + + -T Comma-separated list of transports to test. Accepts + "tcp", "rdma", or "tcp,rdma". Defaults to "tcp". Use + config.sh -r to enable required RDMA configs + +ENV VARIABLES: + RDS_LOG_DIR Log directory. If set, logs will be stored in + the given dir, or skipped if unset. Log dir + can also be set with the -d flag. + + Use with --rwdir on the CI path to retain logs after + test completion. Log dir end point must be within + the specified --rwdir path for logs to persist on + the host. + + SUDO_USER The user name that should be used for tcpdump + --relinquish-privileges. 
Set this to a user + belonging to the sudoers group to avoid drop + privilege errors with the vng 9p filesystem + which may result in empty pcaps + EXAMPLE: # Create a suitable gcov enabled .config tools/testing/selftests/net/rds/config.sh -g + # Optionally add RDMA configs (CONFIG_RDS_RDMA, CONFIG_RDMA_RXE) + tools/testing/selftests/net/rds/config.sh -r + # Alternatly create a gcov disabled .config tools/testing/selftests/net/rds/config.sh @@ -39,6 +70,8 @@ EXAMPLE: # launch the tests in a VM vng -v --rwdir ./ --run . --user root --cpus 4 -- \ - "export PYTHONPATH=tools/testing/selftests/net/; tools/testing/selftests/net/rds/run.sh" + "export PYTHONPATH=tools/testing/selftests/net/; \ + export SUDO_USER=example_user; \ + export RDS_LOG_DIR=tools/testing/selftests/net/rds/rds_logs; \ + tools/testing/selftests/net/rds/rds_run.sh -T tcp,rdma" -An HTML coverage report will be output in tools/testing/selftests/net/rds/rds_logs/coverage/. diff --git a/tools/testing/selftests/net/rds/config.sh b/tools/testing/selftests/net/rds/config.sh index 29a79314dd60..2df2226310ef 100755 --- a/tools/testing/selftests/net/rds/config.sh +++ b/tools/testing/selftests/net/rds/config.sh @@ -10,7 +10,8 @@ CONF_FILE="" FLAGS=() GENERATE_GCOV_REPORT=0 -while getopts "gc:" opt; do +ENABLE_RDMA=0 +while getopts "gc:r" opt; do case ${opt} in g) GENERATE_GCOV_REPORT=1 @@ -18,8 +19,11 @@ while getopts "gc:" opt; do c) CONF_FILE=$OPTARG ;; + r) + ENABLE_RDMA=1 + ;; :) - echo "USAGE: config.sh [-g] [-c config]" + echo "USAGE: config.sh [-g] [-c config] [-r]" exit 1 ;; ?) 
@@ -33,9 +37,6 @@ if [[ "$CONF_FILE" != "" ]]; then FLAGS=(--file "$CONF_FILE") fi -# no modules -scripts/config "${FLAGS[@]}" --disable CONFIG_MODULES - # enable RDS scripts/config "${FLAGS[@]}" --enable CONFIG_RDS scripts/config "${FLAGS[@]}" --enable CONFIG_RDS_TCP @@ -58,3 +59,10 @@ scripts/config "${FLAGS[@]}" --enable CONFIG_VETH # simulate packet loss scripts/config "${FLAGS[@]}" --enable CONFIG_NET_SCH_NETEM +if [ "$ENABLE_RDMA" -eq 1 ]; then + # enable RDS over InfiniBand / RDMA (rds_rdma test) + scripts/config "${FLAGS[@]}" --enable CONFIG_INFINIBAND + scripts/config "${FLAGS[@]}" --enable CONFIG_INFINIBAND_ADDR_TRANS + scripts/config "${FLAGS[@]}" --enable CONFIG_RDMA_RXE + scripts/config "${FLAGS[@]}" --enable CONFIG_RDS_RDMA +fi diff --git a/tools/testing/selftests/net/rds/getsockopt.c b/tools/testing/selftests/net/rds/getsockopt.c new file mode 100644 index 000000000000..93ff252c69b8 --- /dev/null +++ b/tools/testing/selftests/net/rds/getsockopt.c @@ -0,0 +1,208 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Exercise the RDS getsockopt() paths that were converted to the + * getsockopt_iter() / sockopt_t callback. + * + * Three distinct paths are covered: + * + * - RDS_RECVERR and SO_RDS_TRANSPORT, which now return their int value + * through copy_to_iter() and report the written length in opt->optlen. + * + * - RDS_INFO_*, which pins the userspace buffer with + * iov_iter_extract_pages() (including a non-zero starting page offset) + * and lets the info producers memcpy the snapshot in under a spinlock. + * + * The kvec (in-kernel buffer) -> -EOPNOTSUPP path of rds_info_getsockopt() + * is not reachable from a userspace getsockopt() and so is not tested here. 
+ */ +#include <errno.h> +#include <stdint.h> +#include <string.h> +#include <unistd.h> +#include <sys/mman.h> +#include <sys/socket.h> +#include <linux/rds.h> + +#include "../../kselftest_harness.h" + +#ifndef AF_RDS +#define AF_RDS 21 +#endif + +FIXTURE(rds) { + int fd; +}; + +FIXTURE_SETUP(rds) +{ + self->fd = socket(AF_RDS, SOCK_SEQPACKET, 0); + if (self->fd < 0) + SKIP(return, "AF_RDS unavailable (errno %d) - load the rds module", + errno); +} + +FIXTURE_TEARDOWN(rds) +{ + if (self->fd >= 0) + close(self->fd); +} + +/* RDS_RECVERR defaults to 0 and is reported back as a 4-byte int. */ +TEST_F(rds, recverr_default) +{ + socklen_t len = sizeof(int); + int val = 0xdeadbeef; + + ASSERT_EQ(0, getsockopt(self->fd, SOL_RDS, RDS_RECVERR, &val, &len)); + EXPECT_EQ(sizeof(int), len); + EXPECT_EQ(0, val); +} + +/* A value set via setsockopt() must be readable back unchanged. */ +TEST_F(rds, recverr_set_get) +{ + socklen_t len = sizeof(int); + int val = 1; + + ASSERT_EQ(0, setsockopt(self->fd, SOL_RDS, RDS_RECVERR, &val, len)); + + val = 0; + ASSERT_EQ(0, getsockopt(self->fd, SOL_RDS, RDS_RECVERR, &val, &len)); + EXPECT_EQ(sizeof(int), len); + EXPECT_EQ(1, val); +} + +/* A buffer smaller than an int is rejected with EINVAL, not silently. */ +TEST_F(rds, recverr_short_buffer) +{ + socklen_t len = sizeof(int) - 1; + char buf[sizeof(int)]; + + EXPECT_EQ(-1, getsockopt(self->fd, SOL_RDS, RDS_RECVERR, buf, &len)); + EXPECT_EQ(EINVAL, errno); +} + +/* An unbound socket reports RDS_TRANS_NONE for SO_RDS_TRANSPORT. 
*/ +TEST_F(rds, transport_unbound) +{ + socklen_t len = sizeof(int); + int val = 0; + + ASSERT_EQ(0, getsockopt(self->fd, SOL_RDS, SO_RDS_TRANSPORT, &val, + &len)); + EXPECT_EQ(sizeof(int), len); + EXPECT_EQ(RDS_TRANS_NONE, (unsigned int)val); +} + +TEST_F(rds, transport_short_buffer) +{ + socklen_t len = sizeof(int) - 1; + char buf[sizeof(int)]; + + EXPECT_EQ(-1, getsockopt(self->fd, SOL_RDS, SO_RDS_TRANSPORT, buf, + &len)); + EXPECT_EQ(EINVAL, errno); +} + +/* + * RDS_INFO_COUNTERS with a zero-length buffer is the "probe" call: it must + * fail with ENOSPC and report the required snapshot size in optlen. + */ +TEST_F(rds, info_counters_probe) +{ + socklen_t len = 0; + + EXPECT_EQ(-1, getsockopt(self->fd, SOL_RDS, RDS_INFO_COUNTERS, NULL, + &len)); + EXPECT_EQ(ENOSPC, errno); + EXPECT_GT(len, 0); + /* The snapshot is an array of fixed-size counter records. */ + EXPECT_EQ(0, len % (socklen_t)sizeof(struct rds_info_counter)); +} + +/* + * A real snapshot into an unaligned userspace buffer exercises the + * iov_iter_extract_pages() path, including the non-zero offset0 handling + * that the patch reworked. Place the buffer at a non-page-aligned address + * spanning into the next page to make sure multi-page pinning works too. + */ +TEST_F(rds, info_counters_snapshot) +{ + struct rds_info_counter *ctr; + socklen_t need = 0, len; + long pagesz = sysconf(_SC_PAGESIZE); + size_t offset, map_len; + unsigned int i, n; + char *region, *buf; + int ret; + + /* Probe for the required size. */ + getsockopt(self->fd, SOL_RDS, RDS_INFO_COUNTERS, NULL, &need); + ASSERT_GT(need, 0); + + /* + * Place the buffer at a non-page-aligned offset that runs past the + * first page boundary, and size the mapping from the probed length so + * the test keeps working if the counter set grows. 
+ */ + offset = pagesz - 64; + map_len = ((offset + need + pagesz - 1) / pagesz) * pagesz; + + region = mmap(NULL, map_len, PROT_READ | PROT_WRITE, + MAP_PRIVATE | MAP_ANONYMOUS, -1, 0); + ASSERT_NE(MAP_FAILED, region); + + buf = region + offset; + + /* + * On success the RDS_INFO path returns the positive per-element size + * (lens.each) rather than 0, and writes the full snapshot length back + * into optlen. + */ + len = need; + ret = getsockopt(self->fd, SOL_RDS, RDS_INFO_COUNTERS, buf, &len); + ASSERT_GE(ret, 0) { + TH_LOG("getsockopt snapshot failed: errno %d", errno); + } + EXPECT_EQ(sizeof(struct rds_info_counter), ret); + EXPECT_EQ(need, len); + + /* The counter names must be NUL-terminated, non-empty strings. */ + ctr = (struct rds_info_counter *)buf; + n = len / sizeof(*ctr); + ASSERT_GT(n, 0); + for (i = 0; i < n; i++) { + size_t namelen = strnlen((char *)ctr[i].name, + sizeof(ctr[i].name)); + + EXPECT_GT(namelen, 0); + EXPECT_LT(namelen, sizeof(ctr[i].name)); + } + + munmap(region, map_len); +} + +/* + * A non-zero but too-small buffer must report ENOSPC and the full required + * length, without corrupting memory past the buffer. + */ +TEST_F(rds, info_counters_short_buffer) +{ + socklen_t need = 0, len; + char small[sizeof(struct rds_info_counter)]; + + getsockopt(self->fd, SOL_RDS, RDS_INFO_COUNTERS, NULL, &need); + ASSERT_GT(need, 0); + + /* Ask with a buffer guaranteed smaller than the full snapshot. */ + if (need <= (socklen_t)sizeof(small)) + SKIP(return, "snapshot fits in one record; nothing to test"); + + len = 1; /* < sizeof(struct rds_info_counter) */ + EXPECT_EQ(-1, getsockopt(self->fd, SOL_RDS, RDS_INFO_COUNTERS, small, + &len)); + EXPECT_EQ(ENOSPC, errno); + EXPECT_EQ(need, len); +} + +TEST_HARNESS_MAIN diff --git a/tools/testing/selftests/net/rds/rds_run.sh b/tools/testing/selftests/net/rds/rds_run.sh new file mode 100755 index 000000000000..cdf487ec97dc --- /dev/null +++ b/tools/testing/selftests/net/rds/rds_run.sh @@ -0,0 +1,319 @@ +#! 
/bin/bash +# SPDX-License-Identifier: GPL-2.0 + +set -e +set -u + +unset KBUILD_OUTPUT + +current_dir="$(realpath "$(dirname "$0")")" +build_dir="$current_dir" + +build_include="$current_dir/include.sh" +if test -f "$build_include"; then + # this include will define "$mk_build_dir" as the location the test was + # built. We will need this if the tests are installed in a location + # other than the kernel source + + source "$build_include" + build_dir="$mk_build_dir" +fi + +# Source settings for timeout value (also used by ksft runner) +source "$current_dir"/settings + +# This test requires kernel source and the *.gcda data therein +# Locate the top level of the kernel source, and the net/rds +# subfolder with the appropriate *.gcno object files +ksrc_dir="$(realpath "$build_dir"/../../../../../)" +kconfig="$ksrc_dir/.config" +obj_dir="$ksrc_dir/net/rds" + +GCOV_CMD=gcov + +#check to see if the host has the required packages to generate a gcov report +check_gcov_env() +{ + if ! which "$GCOV_CMD" > /dev/null 2>&1; then + echo "# Warning: Could not find gcov. " + GENERATE_GCOV_REPORT=0 + return + fi + + # the gcov version must match the gcc version + GCC_VER=$(gcc -dumpfullversion) + GCOV_VER=$($GCOV_CMD -v | grep gcov | awk '{print $3}'| awk 'BEGIN {FS="-"}{print $1}') + if [ "$GCOV_VER" != "$GCC_VER" ]; then + #attempt to find a matching gcov version + GCOV_CMD=gcov-$(gcc -dumpversion) + + if ! which "$GCOV_CMD" > /dev/null 2>&1; then + echo "# Warning: Could not find an appropriate gcov installation. \ + gcov version must match gcc version" + GENERATE_GCOV_REPORT=0 + return + fi + + #recheck version number of found gcov executable + GCOV_VER=$($GCOV_CMD -v | grep gcov | awk '{print $3}'| \ + awk 'BEGIN {FS="-"}{print $1}') + if [ "$GCOV_VER" != "$GCC_VER" ]; then + echo "# Warning: Could not find an appropriate gcov installation. \ + gcov version must match gcc version" + GENERATE_GCOV_REPORT=0 + else + echo "# Warning: Mismatched gcc and gcov detected. 
Using $GCOV_CMD" + fi + fi +} + +# Check to see if the kconfig has the required configs to generate a coverage report +check_gcov_conf() +{ + if ! grep -x "CONFIG_GCOV_PROFILE_RDS=y" "$kconfig" > /dev/null 2>&1; then + echo "# INFO: CONFIG_GCOV_PROFILE_RDS should be enabled for coverage reports" + GENERATE_GCOV_REPORT=0 + fi + if ! grep -x "CONFIG_GCOV_KERNEL=y" "$kconfig" > /dev/null 2>&1; then + echo "# INFO: CONFIG_GCOV_KERNEL should be enabled for coverage reports" + GENERATE_GCOV_REPORT=0 + fi + if grep -x "CONFIG_GCOV_PROFILE_ALL=y" "$kconfig" > /dev/null 2>&1; then + echo "# INFO: CONFIG_GCOV_PROFILE_ALL should be disabled for coverage reports" + GENERATE_GCOV_REPORT=0 + fi + + if [ "$GENERATE_GCOV_REPORT" -eq 0 ]; then + echo "# To enable gcov reports, please run "\ + "\"tools/testing/selftests/net/rds/config.sh -g\" and rebuild the kernel" + else + # if we have the required kernel configs, proceed to check the environment to + # ensure we have the required gcov packages + check_gcov_env + fi +} + +# Checks if a kconfig is enabled (set to =y or =m) +# $1: kconfig symbol to check +# $2: (optional) module name backing $1 +# Ex: check_conf_enabled CONFIG_RDS_TCP rds_tcp +# Modules for configs set to =m will be probed +# If omitted, only a built-in (=y) config is accepted. +# Returns on success. exits 4 on failure +# Kselftest framework requirement - SKIP code is 4. 
+check_conf_enabled() { + if grep -x "$1=y" "$kconfig" > /dev/null 2>&1; then + return + fi + if [ -n "${2:-}" ] && grep -x "$1=m" "$kconfig" > /dev/null 2>&1; then + probe_module "$2" + return + fi + echo "selftests: [SKIP] This test requires $1 enabled" + echo "Please run" \ + "tools/testing/selftests/net/rds/config.sh and rebuild the kernel" + exit 4 +} + +check_rdma_conf_enabled() { + if grep -x "$1=y" "$kconfig" > /dev/null 2>&1; then + return + fi + if [ -n "${2:-}" ] && grep -x "$1=m" "$kconfig" > /dev/null 2>&1; then + probe_module "$2" + return + fi + echo "selftests: [XFAIL] rdma transport requires $1 enabled" + echo "To enable, run" \ + "tools/testing/selftests/net/rds/config.sh -r and rebuild" + exit 2 +} + +# Load the module backing a config that is built as a loadable module +# (=m). Built-in (=y) configs are already available and don't reach +# here. Exits with the SKIP code if a required module cannot be loaded. +probe_module() { + if ! modprobe -q "$1"; then + echo "selftests: [SKIP] could not load required module $1" + exit 4 + fi +} + +check_conf() { + check_conf_enabled CONFIG_NET_SCH_NETEM sch_netem + check_conf_enabled CONFIG_VETH veth + check_conf_enabled CONFIG_NET_NS + check_conf_enabled CONFIG_RDS_TCP rds_tcp + check_conf_enabled CONFIG_RDS rds +} + +# Check kernel config and host environment for RDS-RDMA support. +# Exits with XFAIL (2) if the user requested rdma but prerequisites +# are not met. +check_rdma_conf() +{ + case "$TRANSPORT" in + *rdma*) ;; + *) return ;; + esac + + # Kconfig will enforce CONFIG_INFINIBAND_* as dependencies + # of CONFIG_RDMA_RXE + check_rdma_conf_enabled CONFIG_RDMA_RXE rdma_rxe + check_rdma_conf_enabled CONFIG_RDS_RDMA rds_rdma + + if ! which rdma > /dev/null 2>&1; then + echo "selftests: [XFAIL] rdma transport requires the 'rdma'" \ + "tool (iproute2)" + exit 2 + fi +} + +check_env() +{ + if ! test -d "$obj_dir"; then + echo "selftests: [SKIP] This test requires a kernel source tree" + exit 4 + fi + if ! 
test -e "$kconfig"; then + echo "selftests: [SKIP] This test requires a configured kernel source tree" + exit 4 + fi + if ! which strace > /dev/null 2>&1; then + echo "selftests: [SKIP] Could not run test without strace" + exit 4 + fi + if ! which tcpdump > /dev/null 2>&1; then + echo "selftests: [SKIP] Could not run test without tcpdump" + exit 4 + fi + + if ! which python3 > /dev/null 2>&1; then + echo "selftests: [SKIP] Could not run test without python3" + exit 4 + fi + + python_major=$(python3 -c "import sys; print(sys.version_info[0])") + python_minor=$(python3 -c "import sys; print(sys.version_info[1])") + if [[ python_major -lt 3 || ( python_major -eq 3 && python_minor -lt 9 ) ]] ; then + echo "selftests: [SKIP] Could not run test without at least python3.9" + python3 -V + exit 4 + fi +} + +LOG_DIR="${RDS_LOG_DIR:-}" +TIMEOUT=$timeout +GENERATE_GCOV_REPORT=1 +TRANSPORT=tcp +FLAGS=() + +while getopts "d:l:c:u:t:T:" opt; do + case ${opt} in + d) + LOG_DIR=${OPTARG} + ;; + l) + FLAGS+=("-l" "${OPTARG}") + ;; + c) + FLAGS+=("-c" "${OPTARG}") + ;; + t) + TIMEOUT=${OPTARG} + ;; + u) + FLAGS+=("-u" "${OPTARG}") + ;; + T) + TRANSPORT=${OPTARG} + ;; + :) + echo "USAGE: rds_run.sh [-d logdir] [-l packet_loss]" \ + "[-c packet_corruption] [-u packet_duplicate] [-t timeout]" \ + "[-T tcp|rdma|tcp,rdma]" + exit 1 + ;; + ?) + echo "Invalid option: -${OPTARG}." 
+ exit 1 + ;; + esac +done + +# Validate transport tokens +IFS=',' read -ra transports <<< "$TRANSPORT" +for t in "${transports[@]}"; do + if [ "$t" != "tcp" ] && [ "$t" != "rdma" ]; then + echo "rds_run.sh: unknown transport '$t' (expected tcp or rdma)" + exit 1 + fi +done + +FLAGS+=("--transport" "${TRANSPORT}") + +check_env +check_conf +check_gcov_conf +check_rdma_conf + +TRACE_CMD=() +if [[ -n "$LOG_DIR" ]]; then + FLAGS+=("-d" "$LOG_DIR") + + TRACE_FILE="${LOG_DIR}/rds-strace.txt" + COVR_DIR="${LOG_DIR}/coverage/" + DMESG_FILE="${LOG_DIR}/rds-dmesg.out" + + mkdir -p "$LOG_DIR" + mkdir -p "$COVR_DIR" + + rm -f "$TRACE_FILE" + rm -f "$DMESG_FILE" + rm -f "$LOG_DIR"/rds-*.pcap + rm -f "$COVR_DIR"/gcovr* + + echo "# Traces will be logged to ${TRACE_FILE}" + TRACE_CMD=(strace -T -tt -o "${TRACE_FILE}") +fi + +set +e +echo "# running RDS tests..." +"${TRACE_CMD[@]}" python3 "$(dirname "$0")/test.py" "${FLAGS[@]}" -t "$TIMEOUT" + +test_rc=$? + +if [[ -n "$LOG_DIR" ]]; then + dmesg > "${DMESG_FILE}" +fi + +if [[ -n "$LOG_DIR" ]] && [ "$GENERATE_GCOV_REPORT" -eq 1 ]; then + echo "# saving coverage data..." + + # Ensure debugfs is mounted before reading gcov data. + if ! mountpoint -q /sys/kernel/debug 2>/dev/null; then + mount -t debugfs debugfs /sys/kernel/debug 2>/dev/null || true + fi + + (set +x; cd /sys/kernel/debug/gcov; find ./* -name '*.gcda' | \ + while read -r f + do + cat < "/sys/kernel/debug/gcov/$f" > "/$f" + done) + + echo "# running gcovr..." 
+ gcovr -s --html-details --gcov-executable "$GCOV_CMD" --gcov-ignore-parse-errors \ + --root "${ksrc_dir}" -o "${COVR_DIR}/gcovr" "${ksrc_dir}/net/rds/" \ + > "${LOG_DIR}/gcovr.log" 2>&1 + echo "# gcovr log: ${LOG_DIR}/gcovr.log" +else + echo "# Coverage report will be skipped" +fi + +if [ "$test_rc" -eq 0 ]; then + echo "# PASS: Test completed successfully" +else + echo "# FAIL: Test failed" +fi + +exit "$test_rc" diff --git a/tools/testing/selftests/net/rds/run.sh b/tools/testing/selftests/net/rds/run.sh deleted file mode 100755 index 897d17d1b8db..000000000000 --- a/tools/testing/selftests/net/rds/run.sh +++ /dev/null @@ -1,227 +0,0 @@ -#! /bin/bash -# SPDX-License-Identifier: GPL-2.0 - -set -e -set -u - -unset KBUILD_OUTPUT - -current_dir="$(realpath "$(dirname "$0")")" -build_dir="$current_dir" - -build_include="$current_dir/include.sh" -if test -f "$build_include"; then - # this include will define "$mk_build_dir" as the location the test was - # built. We will need this if the tests are installed in a location - # other than the kernel source - - source "$build_include" - build_dir="$mk_build_dir" -fi - -# Source settings for timeout value (also used by ksft runner) -source "$current_dir"/settings - -# This test requires kernel source and the *.gcda data therein -# Locate the top level of the kernel source, and the net/rds -# subfolder with the appropriate *.gcno object files -ksrc_dir="$(realpath "$build_dir"/../../../../../)" -kconfig="$ksrc_dir/.config" -obj_dir="$ksrc_dir/net/rds" - -GCOV_CMD=gcov - -#check to see if the host has the required packages to generate a gcov report -check_gcov_env() -{ - if ! which "$GCOV_CMD" > /dev/null 2>&1; then - echo "Warning: Could not find gcov. 
" - GENERATE_GCOV_REPORT=0 - return - fi - - # the gcov version must match the gcc version - GCC_VER=$(gcc -dumpfullversion) - GCOV_VER=$($GCOV_CMD -v | grep gcov | awk '{print $3}'| awk 'BEGIN {FS="-"}{print $1}') - if [ "$GCOV_VER" != "$GCC_VER" ]; then - #attempt to find a matching gcov version - GCOV_CMD=gcov-$(gcc -dumpversion) - - if ! which "$GCOV_CMD" > /dev/null 2>&1; then - echo "Warning: Could not find an appropriate gcov installation. \ - gcov version must match gcc version" - GENERATE_GCOV_REPORT=0 - return - fi - - #recheck version number of found gcov executable - GCOV_VER=$($GCOV_CMD -v | grep gcov | awk '{print $3}'| \ - awk 'BEGIN {FS="-"}{print $1}') - if [ "$GCOV_VER" != "$GCC_VER" ]; then - echo "Warning: Could not find an appropriate gcov installation. \ - gcov version must match gcc version" - GENERATE_GCOV_REPORT=0 - else - echo "Warning: Mismatched gcc and gcov detected. Using $GCOV_CMD" - fi - fi -} - -# Check to see if the kconfig has the required configs to generate a coverage report -check_gcov_conf() -{ - if ! grep -x "CONFIG_GCOV_PROFILE_RDS=y" "$kconfig" > /dev/null 2>&1; then - echo "INFO: CONFIG_GCOV_PROFILE_RDS should be enabled for coverage reports" - GENERATE_GCOV_REPORT=0 - fi - if ! grep -x "CONFIG_GCOV_KERNEL=y" "$kconfig" > /dev/null 2>&1; then - echo "INFO: CONFIG_GCOV_KERNEL should be enabled for coverage reports" - GENERATE_GCOV_REPORT=0 - fi - if grep -x "CONFIG_GCOV_PROFILE_ALL=y" "$kconfig" > /dev/null 2>&1; then - echo "INFO: CONFIG_GCOV_PROFILE_ALL should be disabled for coverage reports" - GENERATE_GCOV_REPORT=0 - fi - - if [ "$GENERATE_GCOV_REPORT" -eq 0 ]; then - echo "To enable gcov reports, please run "\ - "\"tools/testing/selftests/net/rds/config.sh -g\" and rebuild the kernel" - else - # if we have the required kernel configs, proceed to check the environment to - # ensure we have the required gcov packages - check_gcov_env - fi -} - -# Kselftest framework requirement - SKIP code is 4. 
-check_conf_enabled() { - if ! grep -x "$1=y" "$kconfig" > /dev/null 2>&1; then - echo "selftests: [SKIP] This test requires $1 enabled" - echo "Please run tools/testing/selftests/net/rds/config.sh and rebuild the kernel" - exit 4 - fi -} -check_conf_disabled() { - if grep -x "$1=y" "$kconfig" > /dev/null 2>&1; then - echo "selftests: [SKIP] This test requires $1 disabled" - echo "Please run tools/testing/selftests/net/rds/config.sh and rebuild the kernel" - exit 4 - fi -} -check_conf() { - check_conf_enabled CONFIG_NET_SCH_NETEM - check_conf_enabled CONFIG_VETH - check_conf_enabled CONFIG_NET_NS - check_conf_enabled CONFIG_RDS_TCP - check_conf_enabled CONFIG_RDS - check_conf_disabled CONFIG_MODULES -} - -check_env() -{ - if ! test -d "$obj_dir"; then - echo "selftests: [SKIP] This test requires a kernel source tree" - exit 4 - fi - if ! test -e "$kconfig"; then - echo "selftests: [SKIP] This test requires a configured kernel source tree" - exit 4 - fi - if ! which strace > /dev/null 2>&1; then - echo "selftests: [SKIP] Could not run test without strace" - exit 4 - fi - if ! which tcpdump > /dev/null 2>&1; then - echo "selftests: [SKIP] Could not run test without tcpdump" - exit 4 - fi - - if ! 
which python3 > /dev/null 2>&1; then - echo "selftests: [SKIP] Could not run test without python3" - exit 4 - fi - - python_major=$(python3 -c "import sys; print(sys.version_info[0])") - python_minor=$(python3 -c "import sys; print(sys.version_info[1])") - if [[ python_major -lt 3 || ( python_major -eq 3 && python_minor -lt 9 ) ]] ; then - echo "selftests: [SKIP] Could not run test without at least python3.9" - python3 -V - exit 4 - fi -} - -LOG_DIR="$current_dir"/rds_logs -PLOSS=0 -PCORRUPT=0 -PDUP=0 -GENERATE_GCOV_REPORT=1 -while getopts "d:l:c:u:" opt; do - case ${opt} in - d) - LOG_DIR=${OPTARG} - ;; - l) - PLOSS=${OPTARG} - ;; - c) - PCORRUPT=${OPTARG} - ;; - u) - PDUP=${OPTARG} - ;; - :) - echo "USAGE: run.sh [-d logdir] [-l packet_loss] [-c packet_corruption]" \ - "[-u packet_duplcate] [-g]" - exit 1 - ;; - ?) - echo "Invalid option: -${OPTARG}." - exit 1 - ;; - esac -done - - -check_env -check_conf -check_gcov_conf - - -rm -fr "$LOG_DIR" -TRACE_FILE="${LOG_DIR}/rds-strace.txt" -COVR_DIR="${LOG_DIR}/coverage/" -mkdir -p "$LOG_DIR" -mkdir -p "$COVR_DIR" - -set +e -echo running RDS tests... -echo Traces will be logged to "$TRACE_FILE" -rm -f "$TRACE_FILE" -strace -T -tt -o "$TRACE_FILE" python3 "$(dirname "$0")/test.py" \ - --timeout "$timeout" -d "$LOG_DIR" -l "$PLOSS" -c "$PCORRUPT" -u "$PDUP" - -test_rc=$? -dmesg > "${LOG_DIR}/dmesg.out" - -if [ "$GENERATE_GCOV_REPORT" -eq 1 ]; then - echo saving coverage data... - (set +x; cd /sys/kernel/debug/gcov; find ./* -name '*.gcda' | \ - while read -r f - do - cat < "/sys/kernel/debug/gcov/$f" > "/$f" - done) - - echo running gcovr... 
- gcovr -s --html-details --gcov-executable "$GCOV_CMD" --gcov-ignore-parse-errors \ - -o "${COVR_DIR}/gcovr" "${ksrc_dir}/net/rds/" -else - echo "Coverage report will be skipped" -fi - -if [ "$test_rc" -eq 0 ]; then - echo "PASS: Test completed successfully" -else - echo "FAIL: Test failed" -fi - -exit "$test_rc" diff --git a/tools/testing/selftests/net/rds/settings b/tools/testing/selftests/net/rds/settings index d2009a64589c..8cb41e6a83cc 100644 --- a/tools/testing/selftests/net/rds/settings +++ b/tools/testing/selftests/net/rds/settings @@ -1 +1 @@ -timeout=400 +timeout=800 diff --git a/tools/testing/selftests/net/rds/test.py b/tools/testing/selftests/net/rds/test.py index 93e23e8b256c..9e4df01cb0d4 100755 --- a/tools/testing/selftests/net/rds/test.py +++ b/tools/testing/selftests/net/rds/test.py @@ -1,23 +1,30 @@ #! /usr/bin/env python3 # SPDX-License-Identifier: GPL-2.0 +""" +This module provides functional testing for the net/rds component. +""" import argparse +import atexit import ctypes import errno import hashlib import os import select +import re import signal import socket import subprocess import sys -import tempfile -import shutil +import time # Allow utils module to be imported from different directory this_dir = os.path.dirname(os.path.realpath(__file__)) sys.path.append(os.path.join(this_dir, "../")) -from lib.py.utils import ip +# pylint: disable-next=wrong-import-position,import-error,no-name-in-module +from lib.py.utils import ip, cmd # noqa: E402 +# pylint: disable-next=wrong-import-position,import-error,no-name-in-module +from lib.py.ksft import ksft_pr # noqa: E402 libc = ctypes.cdll.LoadLibrary('libc.so.6') setns = libc.setns @@ -28,6 +35,44 @@ NET1 = 'net1' VETH0 = 'veth0' VETH1 = 'veth1' +tcpdump_procs = [] +tcp_addrs = [ + # we technically don't need different port numbers, but this will + # help identify traffic in the network analyzer + ('10.0.0.1', 10000), + ('10.0.0.2', 20000), +] + +# RDMA network configs +RXE_DEV0 = 'rxe0' 
+RXE_DEV1 = 'rxe1' + +VETH_RDMA0 = 'veth_rdma0' +VETH_RDMA1 = 'veth_rdma1' + +rdma_addrs = [ + ('10.0.0.3', 30000), + ('10.0.0.4', 30000), +] + +# send_packets flag space +OP_FLAG_TCP = 0x1 +OP_FLAG_RDMA = 0x2 + +# from include/uapi/linux/rds.h: SO_RDS_TRANSPORT pins a socket to a +# specific RDS transport so connection setup cannot silently fall back +# to another (e.g. loopback) transport. +SOL_RDS = 276 +SO_RDS_TRANSPORT = 8 +RDS_TRANS_TCP = 2 +RDS_TRANS_IB = 0 + +signal_handler_label = "" + +tap_idx = 0 +nr_pass = 0 +nr_fail = 0 + # Helper function for creating a socket inside a network namespace. # We need this because otherwise RDS will detect that the two TCP # sockets are on the same interface and use the loop transport instead @@ -43,233 +88,469 @@ def netns_socket(netns, *sock_args): child = os.fork() if child == 0: - # change network namespace - with open(f'/var/run/netns/{netns}', encoding='utf-8') as f: - try: + try: + # change network namespace + with open(f'/var/run/netns/{netns}', encoding='utf-8') as f: setns(f.fileno(), 0) - except IOError as e: - print(e.errno) - print(e) - - # create socket in target namespace - sock = socket.socket(*sock_args) + # create socket in target namespace + sock = socket.socket(*sock_args) - # send resulting socket to parent - socket.send_fds(u0, [], [sock.fileno()]) + # send resulting socket to parent + socket.send_fds(u0, [], [sock.fileno()]) - sys.exit(0) + os._exit(0) + except BaseException: + os._exit(1) # receive socket from child _, fds, _, _ = socket.recv_fds(u1, 0, 1) - os.waitpid(child, 0) + _, status = os.waitpid(child, 0) u0.close() u1.close() + if not os.WIFEXITED(status) or os.WEXITSTATUS(status) != 0: + raise RuntimeError( + f"netns_socket child failed in netns {netns} (status={status})") return socket.fromfd(fds[0], *sock_args) -def signal_handler(_sig, _frame): +def send_burst(socks, ip_addrs, snd_hashes, nr_sent, nr_total): + """Send until blocked or nr_total reached. 
Return updated nr_sent.""" + + while nr_sent < nr_total: + data = hashlib.sha256( + f'packet {nr_sent}'.encode('utf-8')).hexdigest().encode('utf-8') + # pseudo-random send/receive pattern + snd_idx = nr_sent % 2 + rcv_idx = 1 - (nr_sent % 3) % 2 + + snd = socks[snd_idx] + rcv = socks[rcv_idx] + try: + snd.sendto(data, ip_addrs[rcv_idx]) + except BlockingIOError: + return nr_sent + except OSError as e: + if e.errno in (errno.ENOBUFS, errno.ECONNRESET, errno.EPIPE): + return nr_sent + raise + snd_hashes.setdefault((snd.fileno(), rcv.fileno()), + hashlib.sha256()).update(f'<{data}>'.encode('utf-8')) + nr_sent += 1 + return nr_sent + +def recv_burst(epoll, socks, ip_addrs, rcv_hashes, nr_rcv): + """Drain whatever's readable from epoll. Return updated nr_recv.""" + for filen, evntmask in epoll.poll(): + if not evntmask & select.EPOLLRDNORM: + continue + rcv = next(s for s in socks if s.fileno() == filen) + while True: + try: + data, adr = rcv.recvfrom(1024) + except BlockingIOError: + break + snd_idx = ip_addrs.index(adr) + snd = socks[snd_idx] + rcv_hashes.setdefault((snd.fileno(), rcv.fileno()), + hashlib.sha256()).update(f'<{data}>'.encode('utf-8')) + nr_rcv += 1 + return nr_rcv + +def check_info(socks): """ - Test timed out signal handler + Check all rds info pages for errors + + :param socks: list of sockets to check """ - print('Test timed out') - sys.exit(1) -#Parse out command line arguments. 
We take an optional -# timeout parameter and an optional log output folder -parser = argparse.ArgumentParser(description="init script args", - formatter_class=argparse.ArgumentDefaultsHelpFormatter) -parser.add_argument("-d", "--logdir", action="store", - help="directory to store logs", default="/tmp") -parser.add_argument('--timeout', help="timeout to terminate hung test", - type=int, default=0) -parser.add_argument('-l', '--loss', help="Simulate tcp packet loss", - type=int, default=0) -parser.add_argument('-c', '--corruption', help="Simulate tcp packet corruption", - type=int, default=0) -parser.add_argument('-u', '--duplicate', help="Simulate tcp packet duplication", - type=int, default=0) -args = parser.parse_args() -logdir=args.logdir -PACKET_LOSS=str(args.loss)+'%' -PACKET_CORRUPTION=str(args.corruption)+'%' -PACKET_DUPLICATE=str(args.duplicate)+'%' + # the Python socket module doesn't know these + rds_info_first = 10000 + rds_info_last = 10017 -ip(f"netns add {NET0}") -ip(f"netns add {NET1}") -ip("link add type veth") + nr_success = 0 + nr_error = 0 -addrs = [ - # we technically don't need different port numbers, but this will - # help identify traffic in the network analyzer - ('10.0.0.1', 10000), - ('10.0.0.2', 20000), -] + for sock in socks: + for optname in range(rds_info_first, rds_info_last + 1): + # Sigh, the Python socket module doesn't allow us to pass + # buffer lengths greater than 1024 for some reason. RDS + # wants multiple pages. 
+ try: + sock.getsockopt(socket.SOL_RDS, optname, 1024) + nr_success = nr_success + 1 + except OSError as e: + nr_error = nr_error + 1 + if e.errno == errno.ENOSPC: + # ignore + pass + + ksft_pr(f"getsockopt(): {nr_success}/{nr_error}") + +def verify_hashes(snd_hashes, rcv_hashes): + """Compare send/recv hashes per (sender, receiver) pair.""" + for key, snd_hash in snd_hashes.items(): + rcv_hash = rcv_hashes.get(key) + if rcv_hash is None: + ksft_pr("FAIL: No data received") + return 1 + if snd_hash.hexdigest() != rcv_hash.hexdigest(): + ksft_pr("FAIL: Send/recv mismatch") + ksft_pr("hash expected:", snd_hash.hexdigest()) + ksft_pr("hash received:", rcv_hash.hexdigest()) + return 1 + ksft_pr(f"{key[0]}/{key[1]}: ok") + return 0 + +def snd_rcv_packets(env): + """ + Send packets on the given network interfaces -# move interfaces to separate namespaces so they can no longer be -# bound directly; this prevents rds from switching over from the tcp -# transport to the loop transport. -ip(f"link set {VETH0} netns {NET0} up") -ip(f"link set {VETH1} netns {NET1} up") + :param env: transport-environment dict for setup_tcp() / setup_rdma(). 
+ "addrs": list of (ip, port) tuples matching the sockets + "netns": list of netns names for TCP or None for RDMA + "flags": OP_FLAG_TCP or OP_FLAG_RDMA, selects sockets + """ + addrs = env["addrs"] + netns_list = env["netns"] + flags = env.get("flags", 0) + if (flags & OP_FLAG_TCP) and (flags & OP_FLAG_RDMA): + raise RuntimeError(f"Invalid transport flag sets multiple transports: {flags}") -# add addresses -ip(f"-n {NET0} addr add {addrs[0][0]}/32 dev {VETH0}") -ip(f"-n {NET1} addr add {addrs[1][0]}/32 dev {VETH1}") + if flags & OP_FLAG_TCP: + sockets = [ + netns_socket(netns_list[0], socket.AF_RDS, socket.SOCK_SEQPACKET), + netns_socket(netns_list[1], socket.AF_RDS, socket.SOCK_SEQPACKET), + ] -# add routes -ip(f"-n {NET0} route add {addrs[1][0]}/32 dev {VETH0}") -ip(f"-n {NET1} route add {addrs[0][0]}/32 dev {VETH1}") + # Pin the sockets to the TCP transport so it doesn't fail over to a + # different transport during this test + for s in sockets: + s.setsockopt(SOL_RDS, SO_RDS_TRANSPORT, RDS_TRANS_TCP) + elif flags & OP_FLAG_RDMA: + sockets = [ + socket.socket(socket.AF_RDS, socket.SOCK_SEQPACKET), + socket.socket(socket.AF_RDS, socket.SOCK_SEQPACKET), + ] -# sanity check that our two interfaces/addresses are correctly set up -# and communicating by doing a single ping -ip(f"netns exec {NET0} ping -c 1 {addrs[1][0]}") + # Pin the sockets to the RDMA transport so it doesn't fail over to a + # different transport during this test + for s in sockets: + s.setsockopt(SOL_RDS, SO_RDS_TRANSPORT, RDS_TRANS_IB) + else: + raise RuntimeError(f"Invalid transport flag sets no transports: {flags}") -# Start a packet capture on each network -tcpdump_procs = [] -for net in [NET0, NET1]: - pcap = logdir+'/'+net+'.pcap' - fd, pcap_tmp = tempfile.mkstemp(suffix=".pcap", prefix=f"{net}-", dir="/tmp") - p = subprocess.Popen( - ['ip', 'netns', 'exec', net, - '/usr/sbin/tcpdump', '-i', 'any', '-w', pcap_tmp]) - tcpdump_procs.append((p, pcap_tmp, pcap, fd)) - -# simulate packet loss, 
duplication and corruption -for net, iface in [(NET0, VETH0), (NET1, VETH1)]: - ip(f"netns exec {net} /usr/sbin/tc qdisc add dev {iface} root netem \ - corrupt {PACKET_CORRUPTION} loss {PACKET_LOSS} duplicate \ - {PACKET_DUPLICATE}") - -# add a timeout -if args.timeout > 0: - signal.alarm(args.timeout) - signal.signal(signal.SIGALRM, signal_handler) - -sockets = [ - netns_socket(NET0, socket.AF_RDS, socket.SOCK_SEQPACKET), - netns_socket(NET1, socket.AF_RDS, socket.SOCK_SEQPACKET), -] + for s, addr in zip(sockets, addrs): + s.bind(addr) + s.setblocking(0) -for s, addr in zip(sockets, addrs): - s.bind(addr) - s.setblocking(0) + send_hashes = {} + recv_hashes = {} -fileno_to_socket = { - s.fileno(): s for s in sockets -} + ep = select.epoll() -addr_to_socket = dict(zip(addrs, sockets)) + for s in sockets: + ep.register(s, select.EPOLLRDNORM) -socket_to_addr = { - s: addr for addr, s in zip(addrs, sockets) -} + num_packets = 50000 + nr_send = 0 + nr_recv = 0 -send_hashes = {} -recv_hashes = {} + while nr_send < num_packets: -ep = select.epoll() + # Send as much as we can without blocking + ksft_pr("sending...", nr_send, nr_recv) + nr_send = send_burst(sockets, addrs, send_hashes, nr_send, num_packets) -for s in sockets: - ep.register(s, select.EPOLLRDNORM) + # Receive as much as we can without blocking + ksft_pr("receiving...", nr_send, nr_recv) + while nr_recv < nr_send: + nr_recv = recv_burst(ep, sockets, addrs, recv_hashes, nr_recv) -NUM_PACKETS = 50000 -nr_send = 0 -nr_recv = 0 + # exercise net/rds/tcp.c:rds_tcp_sysctl_reset() + if netns_list: + for net in netns_list: + ip(f"netns exec {net} /usr/sbin/sysctl net.rds.tcp.rds_tcp_rcvbuf=10000") + ip(f"netns exec {net} /usr/sbin/sysctl net.rds.tcp.rds_tcp_sndbuf=10000") -while nr_send < NUM_PACKETS: - # Send as much as we can without blocking - print("sending...", nr_send, nr_recv) - while nr_send < NUM_PACKETS: - send_data = hashlib.sha256( - f'packet {nr_send}'.encode('utf-8')).hexdigest().encode('utf-8') + 
ksft_pr("done", nr_send, nr_recv) - # pseudo-random send/receive pattern - sender = sockets[nr_send % 2] - receiver = sockets[1 - (nr_send % 3) % 2] + check_info(sockets) - try: - sender.sendto(send_data, socket_to_addr[receiver]) - send_hashes.setdefault((sender.fileno(), receiver.fileno()), - hashlib.sha256()).update(f'<{send_data}>'.encode('utf-8')) - nr_send = nr_send + 1 - except BlockingIOError as e: - break - except OSError as e: - if e.errno in [errno.ENOBUFS, errno.ECONNRESET, errno.EPIPE]: - break - raise + # We're done sending and receiving stuff, now let's check if what + # we received is what we sent. + rc = verify_hashes(send_hashes, recv_hashes) + + ep.close() + for s in sockets: + s.close() + + return rc + +def stop_pcaps(): + """Stop tcpdump processes. + + We use pop() here to drain the list in the event that the test + completes after the signal handler is fired. List will be empty + if logdir is not set + """ + + if not tcpdump_procs: + return - # Receive as much as we can without blocking - print("receiving...", nr_send, nr_recv) - while nr_recv < nr_send: - for fileno, eventmask in ep.poll(): - receiver = fileno_to_socket[fileno] - - if eventmask & select.EPOLLRDNORM: - while True: - try: - recv_data, address = receiver.recvfrom(1024) - sender = addr_to_socket[address] - recv_hashes.setdefault((sender.fileno(), - receiver.fileno()), hashlib.sha256()).update( - f'<{recv_data}>'.encode('utf-8')) - nr_recv = nr_recv + 1 - except BlockingIOError as e: - break - - # exercise net/rds/tcp.c:rds_tcp_sysctl_reset() - for net in [NET0, NET1]: - ip(f"netns exec {net} /usr/sbin/sysctl net.rds.tcp.rds_tcp_rcvbuf=10000") - ip(f"netns exec {net} /usr/sbin/sysctl net.rds.tcp.rds_tcp_sndbuf=10000") - -print("done", nr_send, nr_recv) - -# the Python socket module doesn't know these -RDS_INFO_FIRST = 10000 -RDS_INFO_LAST = 10017 - -nr_success = 0 -nr_error = 0 - -for s in sockets: - for optname in range(RDS_INFO_FIRST, RDS_INFO_LAST + 1): - # Sigh, the Python 
socket module doesn't allow us to pass - # buffer lengths greater than 1024 for some reason. RDS - # wants multiple pages. + ksft_pr("Stopping network packet captures") + while tcpdump_procs: + proc = tcpdump_procs.pop() + proc.terminate() try: - s.getsockopt(socket.SOL_RDS, optname, 1024) - nr_success = nr_success + 1 - except OSError as e: - nr_error = nr_error + 1 - if e.errno == errno.ENOSPC: - # ignore - pass - -print(f"getsockopt(): {nr_success}/{nr_error}") - -print("Stopping network packet captures") -for p, pcap_tmp, pcap, fd in tcpdump_procs: - p.terminate() - p.wait() - os.close(fd) - shutil.move(pcap_tmp, pcap) - -# We're done sending and receiving stuff, now let's check if what -# we received is what we sent. -for (sender, receiver), send_hash in send_hashes.items(): - recv_hash = recv_hashes.get((sender, receiver)) - - if recv_hash is None: - print("FAIL: No data received") - sys.exit(1) - - if send_hash.hexdigest() != recv_hash.hexdigest(): - print("FAIL: Send/recv mismatch") - print("hash expected:", send_hash.hexdigest()) - print("hash received:", recv_hash.hexdigest()) - sys.exit(1) - - print(f"{sender}/{receiver}: ok") - -print("Success") -sys.exit(0) + proc.wait(timeout=5) + except subprocess.TimeoutExpired: + proc.kill() + proc.wait() + +def signal_handler(_sig, _frame): + """ + Test timed out signal handler + """ + ksft_pr(f"Test timed out: {signal_handler_label}") + print(f"not ok {tap_idx} rds selftest {signal_handler_label}") + sys.exit(1) + +def setup_tcp(): + """ + Configure tcp network + """ + + # clean up any leftovers from a previously interrupted run + teardown_tcp() + + ip(f"netns add {NET0}") + ip(f"netns add {NET1}") + ip("link add type veth") + + # Move TCP interfaces into separate namespaces so they can no longer be + # bound directly; this prevents rds from switching over from the tcp + # transport to the loop transport. 
+ ip(f"link set {VETH0} netns {NET0} up") + ip(f"link set {VETH1} netns {NET1} up") + + # add addresses + ip(f"-n {NET0} addr add {tcp_addrs[0][0]}/32 dev {VETH0}") + ip(f"-n {NET1} addr add {tcp_addrs[1][0]}/32 dev {VETH1}") + + # add routes + ip(f"-n {NET0} route add {tcp_addrs[1][0]}/32 dev {VETH0}") + ip(f"-n {NET1} route add {tcp_addrs[0][0]}/32 dev {VETH1}") + + # sanity check that our two interfaces/addresses are correctly set up + # and communicating by doing a single ping + ip(f"netns exec {NET0} ping -c 1 {tcp_addrs[1][0]}") + + # Start a packet capture on each network + if logdir is not None: + for netn in [NET0, NET1]: + pcap = logdir+'/rds-'+netn+'.pcap' + + tcpdump_cmd = ['ip', 'netns', 'exec', netn, '/usr/sbin/tcpdump'] + sudo_user = os.environ.get('SUDO_USER') + if sudo_user: + tcpdump_cmd.extend(['-Z', sudo_user]) + tcpdump_cmd.extend(['-i', 'any', '-w', pcap]) + + # pylint: disable-next=consider-using-with + p = subprocess.Popen(tcpdump_cmd, + stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL) + tcpdump_procs.append(p) + + # simulate packet loss, duplication and corruption + for netn, iface in [(NET0, VETH0), (NET1, VETH1)]: + ip(f"netns exec {netn} /usr/sbin/tc qdisc add dev {iface} root netem \ + corrupt {PACKET_CORRUPTION} loss {PACKET_LOSS} duplicate \ + {PACKET_DUPLICATE}") + +def teardown_tcp(): + """ + Tear down the tcp network configured by setup_tcp(). + + Removing the namespaces also removes the veth pair, addresses, + routes, and netem qdisc that live inside them. fail=False so + this is safe to call in error paths after a partial or complete setup. 
+ """ + cmd(f"ip netns del {NET0}", fail=False) + cmd(f"ip netns del {NET1}", fail=False) + +def get_iface_mac(iface): + """Return the MAC address of a local network interface.""" + out = subprocess.check_output(['ip', 'link', 'show', iface], text=True) + mac = re.search(r'link/ether\s+([0-9a-f:]+)', out) + if not mac: + raise RuntimeError(f"Cannot determine MAC address of {iface}") + return mac.group(1) + +def setup_rdma(): + """ + Configure rdma network + """ + + # remove links left over by previously interrupted run. + teardown_rdma() + + # use call here since modprobe may fail if the rdma_rxe + # module is built-in + subprocess.call(['modprobe', 'rdma_rxe'], + stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL) + + ip(f"link add {VETH_RDMA0} type veth peer name {VETH_RDMA1}") + + ip(f"link set {VETH_RDMA0} up") + ip(f"link set {VETH_RDMA1} up") + + # Since both addresses are in the same namespace, the source address + # is always local, so enable accept_local + cmd(f"/usr/sbin/sysctl -q net.ipv4.conf.{VETH_RDMA0}.accept_local=1") + cmd(f"/usr/sbin/sysctl -q net.ipv4.conf.{VETH_RDMA1}.accept_local=1") + + # Reverse path filters must be disabled so that the local routes don't + # cause RPF failures. + cmd(f"/usr/sbin/sysctl -q net.ipv4.conf.{VETH_RDMA0}.rp_filter=0") + cmd(f"/usr/sbin/sysctl -q net.ipv4.conf.{VETH_RDMA1}.rp_filter=0") + + # add addresses + ip(f"addr add {rdma_addrs[0][0]}/32 dev {VETH_RDMA0}") + ip(f"addr add {rdma_addrs[1][0]}/32 dev {VETH_RDMA1}") + + # add routes + ip(f"route add {rdma_addrs[1][0]}/32 dev {VETH_RDMA0}") + ip(f"route add {rdma_addrs[0][0]}/32 dev {VETH_RDMA1}") + + # ARP will not resolve neighbor IPs on /32 routes without a subnet. + # Avoid this by adding neighbors directly so RDMA CM can populate path + # records with correct mac addrs without waiting for the ARP. 
+ mac0 = get_iface_mac(VETH_RDMA0) + mac1 = get_iface_mac(VETH_RDMA1) + ip(f"neigh add {rdma_addrs[1][0]} lladdr {mac1} dev {VETH_RDMA0} nud permanent") + ip(f"neigh add {rdma_addrs[0][0]} lladdr {mac0} dev {VETH_RDMA1} nud permanent") + + cmd(f'rdma link add {RXE_DEV0} type rxe netdev {VETH_RDMA0}') + cmd(f'rdma link add {RXE_DEV1} type rxe netdev {VETH_RDMA1}') + + time.sleep(1) # allow RXE devices to initialise + + # Start a packet capture on each network + if logdir is not None: + for iface in [VETH_RDMA0, VETH_RDMA1]: + pcap = logdir+'/rds-roce-'+iface+'.pcap' + + tcpdump_cmd = ['/usr/sbin/tcpdump'] + sudo_user = os.environ.get('SUDO_USER') + if sudo_user: + tcpdump_cmd.extend(['-Z', sudo_user]) + tcpdump_cmd.extend(['-i', iface, '-w', pcap]) + + # pylint: disable-next=consider-using-with + p = subprocess.Popen(tcpdump_cmd, + stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL) + tcpdump_procs.append(p) + + # simulate packet loss, duplication and corruption + for iface in [VETH_RDMA0, VETH_RDMA1]: + cmd(f"/usr/sbin/tc qdisc add dev {iface} root netem \ + corrupt {PACKET_CORRUPTION} loss {PACKET_LOSS} duplicate \ + {PACKET_DUPLICATE}") + +def teardown_rdma(): + """ + Tear down the rdma network configured by setup_rdma(). + """ + + # remove links left over by previously interrupted run. + cmd(f'rdma link del {RXE_DEV0}', fail=False) + cmd(f'rdma link del {RXE_DEV1}', fail=False) + cmd(f'ip link del {VETH_RDMA0}', fail=False) + + +#Parse out command line arguments. We take an optional +# timeout parameter and an optional log output folder +parser = argparse.ArgumentParser(description="init script args", + formatter_class=argparse.ArgumentDefaultsHelpFormatter) +parser.add_argument("-d", "--logdir", action="store", + help="directory to store logs", default=None) +parser.add_argument("-T", "--transport", default="tcp", + help="Comma-separated list of transports to test: " + "tcp, rdma, or tcp,rdma. Each matching test " + "is run once per transport. 
" + "'rdma' requires CONFIG_RDS_RDMA and rdma_rxe.") +parser.add_argument('-t', '--timeout', help="timeout to terminate hung test", + type=int, default=0) +parser.add_argument('-l', '--loss', help="Simulate tcp packet loss", + type=int, default=0) +parser.add_argument('-c', '--corruption', help="Simulate tcp packet corruption", + type=int, default=0) +parser.add_argument('-u', '--duplicate', help="Simulate tcp packet duplication", + type=int, default=0) +args = parser.parse_args() +logdir=args.logdir +PACKET_LOSS=str(args.loss)+'%' +PACKET_CORRUPTION=str(args.corruption)+'%' +PACKET_DUPLICATE=str(args.duplicate)+'%' + +# check transport is either tcp or rdma +transports = [t.strip() for t in args.transport.split(',')] +for t in transports: + if t not in ('tcp', 'rdma'): + raise SystemExit(f"test.py: unknown transport: {t!r}") + +# Register stop_pcaps before any network setups so that any partially setup +# tcpdumps are still cleaned up on error +atexit.register(stop_pcaps) + +# Set up all requested transports upfront so network plumbing is +# ready before any test runs. 
+transport_envs = {} +FLAGS = 0 +if 'tcp' in transports: + # Register cleanups before setups to handle partial setups that error'd out + atexit.register(teardown_tcp) + setup_tcp() + transport_envs['tcp'] = { + 'addrs': tcp_addrs, + 'netns': [NET0, NET1], + 'flags': FLAGS | OP_FLAG_TCP, + } + +if 'rdma' in transports: + atexit.register(teardown_rdma) + setup_rdma() + transport_envs['rdma'] = { + 'addrs': rdma_addrs, + 'netns': None, + 'flags': FLAGS | OP_FLAG_RDMA, + } + +print("TAP version 13") +print(f"1..{len(transport_envs)}") + +for transport, tenv in transport_envs.items(): + tap_idx += 1 + + # add a timeout + if args.timeout > 0: + signal_handler_label = transport + signal.alarm(args.timeout) + signal.signal(signal.SIGALRM, signal_handler) + + ret = snd_rcv_packets(tenv) + + # cancel timeout + signal.alarm(0) + + if ret == 0: + ksft_pr("Success") + print(f"ok {tap_idx} rds selftest {transport}") + nr_pass += 1 + else: + print(f"not ok {tap_idx} rds selftest {transport}") + nr_fail += 1 + +ksft_pr(f"Totals: pass:{nr_pass} fail:{nr_fail} skip:0") +sys.exit(1 if nr_fail else 0) diff --git a/tools/testing/selftests/net/rtnetlink.py b/tools/testing/selftests/net/rtnetlink.py index e9ad5e88da97..3622413d793d 100755 --- a/tools/testing/selftests/net/rtnetlink.py +++ b/tools/testing/selftests/net/rtnetlink.py @@ -1,17 +1,20 @@ #!/usr/bin/env python3 # SPDX-License-Identifier: GPL-2.0 -from lib.py import ksft_exit, ksft_run, ksft_ge, RtnlAddrFamily import socket +import time +from lib.py import bkg, ip, ksft_exit, ksft_run, ksft_ge, ksft_true, KsftSkipEx +from lib.py import CmdExitFailure, NetNS, NetNSEnter, RtnlAddrFamily IPV4_ALL_HOSTS_MULTICAST = b'\xe0\x00\x00\x01' -def dump_mcaddr_check(rtnl: RtnlAddrFamily) -> None: +def dump_mcaddr_check() -> None: """ Verify that at least one interface has the IPv4 all-hosts multicast address. At least the loopback interface should have this address. 
""" + rtnl = RtnlAddrFamily() addresses = rtnl.getmulticast({"ifa-family": socket.AF_INET}, dump=True) all_host_multicasts = [ @@ -21,9 +24,39 @@ def dump_mcaddr_check(rtnl: RtnlAddrFamily) -> None: ksft_ge(len(all_host_multicasts), 1, "No interface found with the IPv4 all-hosts multicast address") +def ipv4_devconf_notify() -> None: + """ + Configure an interface and set ipv4-devconf values through netlink + to verify that the appropriate netlink notifications are being sent. + """ + + with NetNS() as ns: + with NetNSEnter(str(ns)): + ifname = "dummy1" + ip(f"link add name {ifname} type dummy", ns=str(ns)) + + with bkg("ip monitor", ns=str(ns)) as cmd_obj: + time.sleep(1) + try: + ip(f"link set dev {ifname} inet forwarding on") + ip(f"link set dev {ifname} inet proxy_arp on") + ip(f"link set dev {ifname} inet rp_filter 1") + ip(f"link set dev {ifname} inet ignore_routes_with_linkdown on") + except CmdExitFailure: + raise KsftSkipEx("iproute2 does not support IPv4 devconf attributes") + time.sleep(1) + + ksft_true(f"inet {ifname} ignore_routes_with_linkdown on" in cmd_obj.stdout, + f"No 'ignore_routes_with_linkdown on' notificiation found for interface {ifname}") + ksft_true(f"inet {ifname} rp_filter strict" in cmd_obj.stdout, + f"No 'rp_filter strict' notificiation found for interface {ifname}") + ksft_true(f"inet {ifname} proxy_neigh on" in cmd_obj.stdout, + f"No 'proxy_neigh on' notificiation found for interface {ifname}") + ksft_true(f"inet {ifname} forwarding on" in cmd_obj.stdout, + f"No 'forwarding on' notificiation found for interface {ifname}") + def main() -> None: - rtnl = RtnlAddrFamily() - ksft_run([dump_mcaddr_check], args=(rtnl, )) + ksft_run([dump_mcaddr_check, ipv4_devconf_notify]) ksft_exit() if __name__ == "__main__": diff --git a/tools/testing/selftests/net/rtnetlink.sh b/tools/testing/selftests/net/rtnetlink.sh index c499953d4885..ace3a99023ed 100755 --- a/tools/testing/selftests/net/rtnetlink.sh +++ b/tools/testing/selftests/net/rtnetlink.sh 
@@ -24,6 +24,8 @@ ALL_TESTS=" kci_test_macsec kci_test_macsec_vlan kci_test_team_bridge_macvlan + kci_test_bridge_promisc_netlink + kci_test_bridge_promisc_sysfs kci_test_ipsec kci_test_ipsec_offload kci_test_fdb_get @@ -61,6 +63,14 @@ check_fail() fi } +sysfs_write() +{ + local val="$1" + local path="$2" + + echo "$val" > "$path" +} + run_cmd_common() { local cmd="$*" @@ -680,6 +690,59 @@ kci_test_team_bridge_macvlan() end_test "PASS: team_bridge_macvlan" } +# Test that changing bridge port flags via the netlink path does not sleep with +# the bridge spin lock held. +kci_test_bridge_promisc_netlink() +{ + local dummy="test_dummy1" + local bridge="test_br1" + local team="test_team1" + local ret=0 + + run_cmd ip link add $team up type team + run_cmd ip link add $bridge up type bridge vlan_filtering 1 + run_cmd ip link add $dummy up type dummy + run_cmd ip link set $dummy master $bridge + run_cmd ip link set $team master $bridge + + # This causes the bridge driver to sync all the static FDB entries to + # the team device (which supports unicast filtering) and remove it from + # promiscuous mode. The call to dev_set_promiscuity() can sleep due to + # Rx mode inlining, which is a problem if the bridge spin lock is held. + run_cmd bridge link set dev $dummy flood off learning off + + run_cmd ip link del $dummy + run_cmd ip link del $bridge + run_cmd ip link del $team + + end_test "PASS: bridge_promisc_netlink" +} + +# Same as kci_test_bridge_promisc_netlink(), but the flags are changed via the +# sysfs path. 
+kci_test_bridge_promisc_sysfs() +{ + local dummy="test_dummy1" + local bridge="test_br1" + local team="test_team1" + local ret=0 + + run_cmd ip link add $team up type team + run_cmd ip link add $bridge up type bridge vlan_filtering 1 + run_cmd ip link add $dummy up type dummy + run_cmd ip link set $dummy master $bridge + run_cmd ip link set $team master $bridge + + run_cmd sysfs_write 0 /sys/class/net/$dummy/brport/unicast_flood + run_cmd sysfs_write 0 /sys/class/net/$dummy/brport/learning + + run_cmd ip link del $dummy + run_cmd ip link del $bridge + run_cmd ip link del $team + + end_test "PASS: bridge_promisc_sysfs" +} + #------------------------------------------------------------------- # Example commands # ip x s add proto esp src 14.0.0.52 dst 14.0.0.70 \ diff --git a/tools/testing/selftests/net/so_txtime.sh b/tools/testing/selftests/net/so_txtime.sh deleted file mode 100755 index 5e861ad32a42..000000000000 --- a/tools/testing/selftests/net/so_txtime.sh +++ /dev/null @@ -1,110 +0,0 @@ -#!/bin/bash -# SPDX-License-Identifier: GPL-2.0 -# -# Regression tests for the SO_TXTIME interface - -set -e - -readonly ksft_skip=4 -readonly DEV="veth0" -readonly BIN="./so_txtime" - -readonly RAND="$(mktemp -u XXXXXX)" -readonly NSPREFIX="ns-${RAND}" -readonly NS1="${NSPREFIX}1" -readonly NS2="${NSPREFIX}2" - -readonly SADDR4='192.168.1.1' -readonly DADDR4='192.168.1.2' -readonly SADDR6='fd::1' -readonly DADDR6='fd::2' - -cleanup() { - ip netns del "${NS2}" - ip netns del "${NS1}" -} - -trap cleanup EXIT - -# Create virtual ethernet pair between network namespaces -ip netns add "${NS1}" -ip netns add "${NS2}" - -ip link add "${DEV}" netns "${NS1}" type veth \ - peer name "${DEV}" netns "${NS2}" - -# Bring the devices up -ip -netns "${NS1}" link set "${DEV}" up -ip -netns "${NS2}" link set "${DEV}" up - -# Set fixed MAC addresses on the devices -ip -netns "${NS1}" link set dev "${DEV}" address 02:02:02:02:02:02 -ip -netns "${NS2}" link set dev "${DEV}" address 
06:06:06:06:06:06 - -# Add fixed IP addresses to the devices -ip -netns "${NS1}" addr add 192.168.1.1/24 dev "${DEV}" -ip -netns "${NS2}" addr add 192.168.1.2/24 dev "${DEV}" -ip -netns "${NS1}" addr add fd::1/64 dev "${DEV}" nodad -ip -netns "${NS2}" addr add fd::2/64 dev "${DEV}" nodad - -run_test() { - local readonly IP="$1" - local readonly CLOCK="$2" - local readonly TXARGS="$3" - local readonly RXARGS="$4" - - if [[ "${IP}" == "4" ]]; then - local readonly SADDR="${SADDR4}" - local readonly DADDR="${DADDR4}" - elif [[ "${IP}" == "6" ]]; then - local readonly SADDR="${SADDR6}" - local readonly DADDR="${DADDR6}" - else - echo "Invalid IP version ${IP}" - exit 1 - fi - - local readonly START="$(date +%s%N --date="+ 0.1 seconds")" - - ip netns exec "${NS2}" "${BIN}" -"${IP}" -c "${CLOCK}" -t "${START}" -S "${SADDR}" -D "${DADDR}" "${RXARGS}" -r & - ip netns exec "${NS1}" "${BIN}" -"${IP}" -c "${CLOCK}" -t "${START}" -S "${SADDR}" -D "${DADDR}" "${TXARGS}" - wait "$!" -} - -do_test() { - run_test $@ - [ $? -ne 0 ] && ret=1 -} - -do_fail_test() { - run_test $@ - [ $? -eq 0 ] && ret=1 -} - -ip netns exec "${NS1}" tc qdisc add dev "${DEV}" root fq -set +e -ret=0 -do_test 4 mono a,-1 a,-1 -do_test 6 mono a,0 a,0 -do_test 6 mono a,10 a,10 -do_test 4 mono a,10,b,20 a,10,b,20 -do_test 6 mono a,20,b,10 b,20,a,20 - -if ip netns exec "${NS1}" tc qdisc replace dev "${DEV}" root etf clockid CLOCK_TAI delta 400000; then - do_fail_test 4 tai a,-1 a,-1 - do_fail_test 6 tai a,0 a,0 - do_test 6 tai a,10 a,10 - do_test 4 tai a,10,b,20 a,10,b,20 - do_test 6 tai a,20,b,10 b,10,a,20 -else - echo "tc ($(tc -V)) does not support qdisc etf. skipping" - [ $ret -eq 0 ] && ret=$ksft_skip -fi - -if [ $ret -eq 0 ]; then - echo OK. 
All tests passed -elif [[ $ret -ne $ksft_skip && -n "$KSFT_MACHINE_SLOW" ]]; then - echo "Ignoring errors due to slow environment" 1>&2 - ret=0 -fi -exit $ret diff --git a/tools/testing/selftests/net/tcp_ao/config b/tools/testing/selftests/net/tcp_ao/config index f22148512365..1b120bfd89c4 100644 --- a/tools/testing/selftests/net/tcp_ao/config +++ b/tools/testing/selftests/net/tcp_ao/config @@ -1,7 +1,3 @@ -CONFIG_CRYPTO_CMAC=y -CONFIG_CRYPTO_HMAC=y -CONFIG_CRYPTO_RMD160=y -CONFIG_CRYPTO_SHA1=y CONFIG_IPV6=y CONFIG_IPV6_MULTIPLE_TABLES=y CONFIG_NET_L3_MASTER_DEV=y diff --git a/tools/testing/selftests/net/tcp_ao/key-management.c b/tools/testing/selftests/net/tcp_ao/key-management.c index 69d9a7a05d5c..d86bb380b79f 100644 --- a/tools/testing/selftests/net/tcp_ao/key-management.c +++ b/tools/testing/selftests/net/tcp_ao/key-management.c @@ -380,31 +380,6 @@ static void check_listen_socket(void) close(sk); } -static const char *fips_fpath = "/proc/sys/crypto/fips_enabled"; -static bool is_fips_enabled(void) -{ - static int fips_checked = -1; - FILE *fenabled; - int enabled; - - if (fips_checked >= 0) - return !!fips_checked; - if (access(fips_fpath, R_OK)) { - if (errno != ENOENT) - test_error("Can't open %s", fips_fpath); - fips_checked = 0; - return false; - } - fenabled = fopen(fips_fpath, "r"); - if (!fenabled) - test_error("Can't open %s", fips_fpath); - if (fscanf(fenabled, "%d", &enabled) != 1) - test_error("Can't read from %s", fips_fpath); - fclose(fenabled); - fips_checked = !!enabled; - return !!fips_checked; -} - struct test_key { char password[TCP_AO_MAXKEYLEN]; const char *alg; @@ -430,14 +405,7 @@ struct key_collection { static struct key_collection collection; #define TEST_MAX_MACLEN 16 -const char *test_algos[] = { - "cmac(aes128)", - "hmac(sha1)", "hmac(sha512)", "hmac(sha384)", "hmac(sha256)", - "hmac(sha224)", "hmac(sha3-512)", - /* only if !CONFIG_FIPS */ -#define TEST_NON_FIPS_ALGOS 2 - "hmac(rmd160)", "hmac(md5)" -}; +const char *test_algos[] = { 
"cmac(aes128)", "hmac(sha1)", "hmac(sha256)" }; const unsigned int test_maclens[] = { 1, 4, 12, 16 }; #define MACLEN_SHIFT 2 #define ALGOS_SHIFT 4 @@ -452,7 +420,7 @@ static unsigned int make_mask(unsigned int shift, unsigned int prev_shift) static void init_key_in_collection(unsigned int index, bool randomized) { struct test_key *key = &collection.keys[index]; - unsigned int algos_nr, algos_index; + unsigned int algos_index; /* Same for randomized and non-randomized test flows */ key->client_keyid = index; @@ -474,10 +442,7 @@ static void init_key_in_collection(unsigned int index, bool randomized) key->maclen = test_maclens[index & make_mask(shift, 0)]; algos_index = index & make_mask(ALGOS_SHIFT, shift); } - algos_nr = ARRAY_SIZE(test_algos); - if (is_fips_enabled()) - algos_nr -= TEST_NON_FIPS_ALGOS; - key->alg = test_algos[algos_index % algos_nr]; + key->alg = test_algos[algos_index % ARRAY_SIZE(test_algos)]; } static int init_default_key_collection(unsigned int nr_keys, bool randomized) diff --git a/tools/testing/selftests/net/tcp_ecmp_failover.sh b/tools/testing/selftests/net/tcp_ecmp_failover.sh new file mode 100755 index 000000000000..5768aa8bff6a --- /dev/null +++ b/tools/testing/selftests/net/tcp_ecmp_failover.sh @@ -0,0 +1,216 @@ +#!/bin/bash +# SPDX-License-Identifier: GPL-2.0 +# +# Copyright 2026 Google LLC. +# +# This test verifies TCP flow failover between ECMP routes +# upon carrier loss on the active device. +# +# socat -----------------------------> socat +# | +# .-- veth-c1 -|- veth-s1 --. 
+# dummy0 -| | |-- dummy0 +# '-- veth-c2 -|- veth-s2 --' +# | +# + +REQUIRE_JQ=no +REQUIRE_MZ=no +NUM_NETIFS=0 + +source forwarding/lib.sh + +CLIENT_IP="10.0.59.1" +SERVER_IP="10.0.92.1" +CLIENT_IP6="2001:db8:5a9a::1" +SERVER_IP6="2001:db8:9292::1" + +setup_server() +{ + IP="ip -n $server" + NS_EXEC="ip netns exec $server" + + $IP link add dummy0 type dummy + $IP link set dummy0 up + + $IP -4 addr add $SERVER_IP/32 dev dummy0 + $IP -6 addr add $SERVER_IP6/128 dev dummy0 nodad + + $IP link set veth-s1 up + $IP link set veth-s2 up + + $IP -4 addr add 192.168.1.2/24 dev veth-s1 + $IP -4 addr add 192.168.2.2/24 dev veth-s2 + + $IP -4 route add $CLIENT_IP/32 \ + nexthop via 192.168.1.1 dev veth-s1 weight 1 \ + nexthop via 192.168.2.1 dev veth-s2 weight 1 + + $IP -6 addr add 2001:db8:1::2/64 dev veth-s1 nodad + $IP -6 addr add 2001:db8:2::2/64 dev veth-s2 nodad + + $IP -6 route add $CLIENT_IP6/128 \ + nexthop via 2001:db8:1::1 dev veth-s1 weight 1 \ + nexthop via 2001:db8:2::1 dev veth-s2 weight 1 +} + +setup_client() +{ + IP="ip -n $client" + NS_EXEC="ip netns exec $client" + + $IP link add dummy0 type dummy + $IP link set dummy0 up + + $IP -4 addr add $CLIENT_IP/32 dev dummy0 + $IP -6 addr add $CLIENT_IP6/128 dev dummy0 nodad + + $IP link set veth-c1 up + $IP link set veth-c2 up + + $IP -4 addr add 192.168.1.1/24 dev veth-c1 + $IP -4 addr add 192.168.2.1/24 dev veth-c2 + + $IP -4 route add $SERVER_IP/32 \ + nexthop via 192.168.1.2 dev veth-c1 weight 1 \ + nexthop via 192.168.2.2 dev veth-c2 weight 1 + + $IP -6 addr add 2001:db8:1::1/64 dev veth-c1 nodad + $IP -6 addr add 2001:db8:2::1/64 dev veth-c2 nodad + + $IP -6 route add $SERVER_IP6/128 \ + nexthop via 2001:db8:1::2 dev veth-c1 weight 1 \ + nexthop via 2001:db8:2::2 dev veth-c2 weight 1 + + # By default, tcp_retries1=3 triggers a route refresh + # after 3 retransmits (~5s). Ensure this never occurs + # for test stability. 
+ $NS_EXEC sysctl -qw net.ipv4.tcp_retries1=100 + + # When NETDEV_CHANGE is issued for a dev tied to an ECMP + # route, RTNH_F_LINKDOWN is flagged and the sernum is + # bumped to invalidate the route via sk_dst_check(). + # + # Without ignore_routes_with_linkdown=1, subsequent + # lookups may still select the same RTNH_F_LINKDOWN route. + $NS_EXEC sysctl -qw net.ipv4.conf.veth-c1.ignore_routes_with_linkdown=1 + $NS_EXEC sysctl -qw net.ipv4.conf.veth-c2.ignore_routes_with_linkdown=1 + + $NS_EXEC sysctl -qw net.ipv6.conf.veth-c1.ignore_routes_with_linkdown=1 + $NS_EXEC sysctl -qw net.ipv6.conf.veth-c2.ignore_routes_with_linkdown=1 +} + +setup() +{ + setup_ns client server + + ip -n "$client" link add veth-c1 type veth peer veth-s1 netns "$server" + ip -n "$client" link add veth-c2 type veth peer veth-s2 netns "$server" + + setup_server + setup_client +} + +cleanup() +{ + cleanup_all_ns > /dev/null 2>&1 +} + +tcp_ecmp_failover() +{ + local pf=$1; shift + local server_ip=$1; shift + local client_ip=$1; shift + + RET=0 + + tcpdump_start veth-s1 "$server" + tcpdump_start veth-s2 "$server" + + ip netns exec "$server" \ + socat -u TCP-LISTEN:8080,pf="$pf",bind="$server_ip",reuseaddr /dev/null & + server_pid=$! + + # Wait for server to start listening. + # Sometimes client fails without this sleep. + sleep 1 + + ip netns exec "$client" \ + socat -u /dev/zero TCP:"$server_ip":8080,pf="$pf",bind="$client_ip" & + client_pid=$! + + # To capture enough packets. + sleep 3 + + tcpdump_stop veth-s1 + tcpdump_stop veth-s2 + + pkts_s1=$(tcpdump_show veth-s1 | wc -l) + pkts_s2=$(tcpdump_show veth-s2 | wc -l) + + tcpdump_cleanup veth-s1 + tcpdump_cleanup veth-s2 + + # Detect the device chosen by the client + if [ "$pkts_s1" -gt "$pkts_s2" ]; then + veth_down=veth-s1 + veth_up=veth-s2 + else + veth_down=veth-s2 + veth_up=veth-s1 + fi + + # Taking down $veth_down causes its peer to lose carrier, + # triggering NETDEV_CHANGE. 
This flags RTNH_F_LINKDOWN + # and bumps the sernum for the route associated with that + # peer, invalidating the cached dst in the TCP socket. + # + # Consequently, sk_dst_check() fails, forcing the subsequent + # lookup to select the remaining healthy route via $veth_up. + ip -n "$server" link set "$veth_down" down + + tcpdump_start "$veth_up" "$server" + + # To capture enough packets. + sleep 3 + + tcpdump_stop "$veth_up" + + kill -9 "$client_pid" > /dev/null 2>&1 + kill -9 "$server_pid" > /dev/null 2>&1 + wait 2> /dev/null + + pkts=$(tcpdump_show $veth_up | wc -l) + + tcpdump_cleanup "$veth_up" + + if [ "$pkts" -lt 1000 ]; then + RET=$ksft_fail + fi +} + +test_ipv4() +{ + setup + tcp_ecmp_failover IPv4 $SERVER_IP $CLIENT_IP + log_test "TCP IPv4 failover" + cleanup +} + +test_ipv6() +{ + setup + tcp_ecmp_failover IPv6 "[$SERVER_IP6]" "[$CLIENT_IP6]" + log_test "TCP IPv6 failover" + cleanup +} + +require_command socat +require_command tcpdump + +trap cleanup EXIT + +test_ipv4 +test_ipv6 + +exit "$EXIT_STATUS" diff --git a/tools/testing/selftests/net/test_bridge_neigh_suppress.sh b/tools/testing/selftests/net/test_bridge_neigh_suppress.sh index 9067197c9055..e9ed0d750996 100755 --- a/tools/testing/selftests/net/test_bridge_neigh_suppress.sh +++ b/tools/testing/selftests/net/test_bridge_neigh_suppress.sh @@ -56,6 +56,12 @@ TESTS=" neigh_suppress_uc_ns neigh_vlan_suppress_arp neigh_vlan_suppress_ns + neigh_suppress_arp_probe + neigh_suppress_dad_ns + neigh_forward_grat_arp + neigh_forward_grat_na + neigh_vlan_forward_grat_arp + neigh_vlan_forward_grat_na " VERBOSE=0 PAUSE_ON_FAIL=no @@ -74,7 +80,8 @@ log_test() printf "TEST: %-60s [ OK ]\n" "${msg}" nsuccess=$((nsuccess+1)) else - ret=1 + # shellcheck disable=SC2154 + ret=$(ksft_exit_status_merge "$ret" "$ksft_fail") nfail=$((nfail+1)) printf "TEST: %-60s [FAIL]\n" "${msg}" if [ "$VERBOSE" = "1" ]; then @@ -97,6 +104,7 @@ log_test() fi [ "$VERBOSE" = "1" ] && echo + return 0 } run_cmd() @@ -134,6 +142,15 @@ 
tc_check_packets() [[ $pkts == $count ]] } +neigh_forward_grat_check() +{ + if ! bridge link help 2>&1 | grep -q "neigh_forward_grat"; then + echo "SKIP: iproute2 bridge too old, missing gratuitous ARP/unsolicited NA forwarding control support" + # shellcheck disable=SC2154 + return "$ksft_skip" + fi +} + ################################################################################ # Setup @@ -561,6 +578,17 @@ icmpv6_header_get() echo $p } +icmpv6_na_header_get() +{ + local csum=$1; shift + local tip=$1; shift + + # Type 136 (Neighbor Advertisement), hex format, Override flag set, + # Solicited flag clear (unsolicited NA). + # ICMPv6.type : ICMPv6.code : ICMPv6.checksum : Flags : Target Address + echo "88:00:$csum:20:00:00:00:$tip:" +} + neigh_suppress_uc_ns_common() { local vid=$1; shift @@ -875,6 +903,439 @@ neigh_vlan_suppress_ns() log_test $? 0 "NS suppression (VLAN $vid2)" } +neigh_suppress_arp_probe() +{ + local vid=10 + local tip=192.0.2.2 + local h2_mac + + echo + echo "Per-port ARP probe suppression" + echo "------------------------------" + + run_cmd "tc -n $sw1 qdisc replace dev vx0 clsact" + run_cmd "tc -n $sw1 filter replace dev vx0 egress pref 1 handle 101 proto 0x0806 flower indev swp1 arp_tip $tip arp_sip 0.0.0.0 arp_op request action pass" + + # Initial state - check that ARP probes are not suppressed. + run_cmd "ip netns exec $h1 arping -D -q -c 1 -w 5 -I eth0.$vid $tip" + tc_check_packets "$sw1" "dev vx0 egress" 101 1 + log_test $? 0 "ARP probe suppression" + + # Enable neighbor suppression and check that nothing changes. + run_cmd "bridge -n $sw1 link set dev vx0 neigh_suppress on" + run_cmd "bridge -n $sw1 -d link show dev vx0 | grep \"neigh_suppress on\"" + log_test $? 0 "\"neigh_suppress\" is on" + + run_cmd "ip netns exec $h1 arping -D -q -c 1 -w 5 -I eth0.$vid $tip" + tc_check_packets "$sw1" "dev vx0 egress" 101 2 + log_test $? 0 "ARP probe suppression" + + # Install FDB and a neighbor and check that ARP probes are suppressed. 
+ h2_mac=$(ip -n "$h2" -j -p link show eth0."$vid" | jq -r '.[]["address"]') + run_cmd "bridge -n $sw1 fdb replace $h2_mac dev vx0 master static vlan $vid" + run_cmd "ip -n $sw1 neigh replace $tip lladdr $h2_mac nud permanent dev br0.$vid" + log_test $? 0 "FDB and neighbor entry installation" + + run_cmd "ip netns exec $h1 arping -D -q -c 1 -w 5 -I eth0.$vid $tip" + log_test $? 1 "arping" + tc_check_packets "$sw1" "dev vx0 egress" 101 2 + log_test $? 0 "ARP probe suppression" + + # Remove the neighbor entry and check that ARP probes are not suppressed. + run_cmd "ip -n $sw1 neigh del $tip dev br0.$vid" + log_test $? 0 "neighbor removal" + + run_cmd "ip netns exec $h1 arping -D -q -c 1 -w 5 -I eth0.$vid $tip" + tc_check_packets "$sw1" "dev vx0 egress" 101 3 + log_test $? 0 "ARP probe suppression" + + # Disable neighbor suppression. + run_cmd "bridge -n $sw1 link set dev vx0 neigh_suppress off" + run_cmd "bridge -n $sw1 -d link show dev vx0 | grep \"neigh_suppress off\"" + log_test $? 0 "\"neigh_suppress\" is off" + + run_cmd "ip netns exec $h1 arping -D -q -c 1 -w 5 -I eth0.$vid $tip" + tc_check_packets "$sw1" "dev vx0 egress" 101 4 + log_test $? 0 "ARP probe suppression" +} + +neigh_suppress_dad_ns() +{ + local vid=10 + local tip=2001:db8:1::99 + local mcast=ff02::1:ff00:99 + local dmac=33:33:ff:00:00:99 + local full_tip=20:01:0d:b8:00:01:00:00:00:00:00:00:00:00:00:99 + local csum="4b:bc" + local smac + local tmac + + echo + echo "Per-port DAD NS suppression" + echo "---------------------------" + + smac=$(ip -n "$h1" -j -p link show eth0."$vid" | jq -r '.[]["address"]') + + run_cmd "tc -n $sw1 qdisc replace dev vx0 clsact" + run_cmd "tc -n $sw1 filter replace dev vx0 egress pref 1 handle 101 proto ipv6 flower indev swp1 ip_proto icmpv6 dst_ip $mcast src_ip :: type 135 code 0 action pass" + + # Initial state - check that DAD NS are not suppressed. 
+ run_cmd "ip netns exec $h1 mausezahn -6 eth0.$vid -c 1 -a $smac -b $dmac -A :: -B $mcast -t ip hop=255,next=58,payload=$(icmpv6_header_get "$csum" "$full_tip") -q" + tc_check_packets "$sw1" "dev vx0 egress" 101 1 + log_test $? 0 "DAD NS suppression" + + # Enable neighbor suppression and check that nothing changes. + run_cmd "bridge -n $sw1 link set dev vx0 neigh_suppress on" + run_cmd "bridge -n $sw1 -d link show dev vx0 | grep \"neigh_suppress on\"" + log_test $? 0 "\"neigh_suppress\" is on" + + run_cmd "ip netns exec $h1 mausezahn -6 eth0.$vid -c 1 -a $smac -b $dmac -A :: -B $mcast -t ip hop=255,next=58,payload=$(icmpv6_header_get "$csum" "$full_tip") -q" + tc_check_packets "$sw1" "dev vx0 egress" 101 2 + log_test $? 0 "DAD NS suppression" + + # Install FDB and a neighbor and check that DAD NS are suppressed + # and that a proxy NA is sent back to h1. + tmac=$(ip -n "$h2" -j -p link show eth0."$vid" | jq -r '.[]["address"]') + run_cmd "bridge -n $sw1 fdb replace $tmac dev vx0 master static vlan $vid" + run_cmd "ip -n $sw1 -6 neigh replace $tip lladdr $tmac nud permanent dev br0.$vid" + log_test $? 0 "FDB and neighbor entry installation" + + run_cmd "tc -n $h1 qdisc replace dev eth0.$vid clsact" + run_cmd "tc -n $h1 filter replace dev eth0.$vid ingress pref 1 handle 101 proto ipv6 flower ip_proto icmpv6 dst_ip ff02::1 src_ip $tip type 136 code 0 action pass" + + run_cmd "ip netns exec $h1 mausezahn -6 eth0.$vid -c 1 -a $smac -b $dmac -A :: -B $mcast -t ip hop=255,next=58,payload=$(icmpv6_header_get "$csum" "$full_tip") -q" + tc_check_packets "$sw1" "dev vx0 egress" 101 2 + log_test $? 0 "DAD NS suppression" + tc_check_packets "$h1" "dev eth0.$vid ingress" 101 1 + log_test $? 0 "DAD NS proxy NA reply" + + # Remove the neighbor entry and check that DAD NS are not suppressed. + run_cmd "ip -n $sw1 -6 neigh del $tip dev br0.$vid" + log_test $? 
0 "neighbor removal" + + run_cmd "ip netns exec $h1 mausezahn -6 eth0.$vid -c 1 -a $smac -b $dmac -A :: -B $mcast -t ip hop=255,next=58,payload=$(icmpv6_header_get "$csum" "$full_tip") -q" + tc_check_packets "$sw1" "dev vx0 egress" 101 3 + log_test $? 0 "DAD NS suppression" + + # Disable neighbor suppression. + run_cmd "bridge -n $sw1 link set dev vx0 neigh_suppress off" + run_cmd "bridge -n $sw1 -d link show dev vx0 | grep \"neigh_suppress off\"" + log_test $? 0 "\"neigh_suppress\" is off" + + run_cmd "ip netns exec $h1 mausezahn -6 eth0.$vid -c 1 -a $smac -b $dmac -A :: -B $mcast -t ip hop=255,next=58,payload=$(icmpv6_header_get "$csum" "$full_tip") -q" + tc_check_packets "$sw1" "dev vx0 egress" 101 4 + log_test $? 0 "DAD NS suppression" +} + +neigh_forward_grat_arp() +{ + local vid=10 + local sip=192.0.2.1 + local tip=$sip + local h2_ip=192.0.2.2 + local h2_mac + + neigh_forward_grat_check || return $? + + echo + echo "Gratuitous ARP forwarding" + echo "-------------------------" + + run_cmd "tc -n $sw1 qdisc replace dev vx0 clsact" + run_cmd "tc -n $sw1 filter replace dev vx0 egress pref 1 handle 101 proto 0x0806 flower indev swp1 arp_tip $tip arp_sip $sip arp_op request action pass" + run_cmd "tc -n $sw1 filter replace dev vx0 egress pref 1 handle 102 proto 0x0806 flower indev swp1 arp_tip $h2_ip arp_sip $sip arp_op request action pass" + + h2_mac=$(ip -n "$h2" -j -p link show eth0."$vid" | jq -r '.[]["address"]') + run_cmd "bridge -n $sw1 fdb replace $h2_mac dev vx0 master static vlan $vid" + run_cmd "ip -n $sw1 neigh replace $tip lladdr $h2_mac nud permanent dev br0.$vid" + run_cmd "ip -n $sw1 neigh replace $h2_ip lladdr $h2_mac nud permanent dev br0.$vid" + + # Enable neighbor suppression. Gratuitous ARP should be suppressed by + # default (neigh_forward_grat defaults to off). + run_cmd "ip -n $sw1 link set dev vx0 type bridge_slave neigh_suppress on" + run_cmd "ip -n $sw1 -d link show dev vx0 | grep \"neigh_suppress on\"" + log_test $? 
0 "\"neigh_suppress\" is on" + + # Send gratuitous ARP (sip == tip) and check it's suppressed. + run_cmd "ip netns exec $h1 arping -U -c 1 -w 5 -I eth0.$vid $tip" + tc_check_packets "$sw1" "dev vx0 egress" 101 0 + log_test $? 0 "Gratuitous ARP suppression" + + # Explicitly enable neigh_forward_grat and verify gratuitous ARP is + # now forwarded. + run_cmd "ip -n $sw1 link set dev vx0 type bridge_slave neigh_forward_grat on" + run_cmd "ip -n $sw1 -d link show dev vx0 | grep \"neigh_forward_grat on\"" + log_test $? 0 "\"neigh_forward_grat\" is on" + + run_cmd "ip netns exec $h1 arping -U -c 1 -w 5 -I eth0.$vid $tip" + tc_check_packets "$sw1" "dev vx0 egress" 101 1 + log_test $? 0 "Gratuitous ARP forwarding" + + # Verify that regular (non-gratuitous) ARP requests are still + # suppressed when neigh_forward_grat is enabled. + run_cmd "ip netns exec $h1 arping -c 1 -w 5 -I eth0.$vid $h2_ip" + tc_check_packets "$sw1" "dev vx0 egress" 102 0 + log_test $? 0 "Regular ARP suppression with \"neigh_forward_grat\" on" + + # Disable neigh_forward_grat and verify suppression resumes. + run_cmd "ip -n $sw1 link set dev vx0 type bridge_slave neigh_forward_grat off" + run_cmd "ip -n $sw1 -d link show dev vx0 | grep \"neigh_forward_grat off\"" + log_test $? 0 "\"neigh_forward_grat\" is off" + + run_cmd "ip netns exec $h1 arping -U -c 1 -w 5 -I eth0.$vid $tip" + tc_check_packets "$sw1" "dev vx0 egress" 101 1 + log_test $? 0 "Gratuitous ARP suppression" +} + +# neigh_forward_grat_arp() uses 'ip link' interface, and neigh_forward_grat_na() +# uses 'bridge link' interface to exercise both paths. 
+neigh_forward_grat_na() +{ + local vid=10 + local saddr=2001:db8:1::1 + local daddr=ff02::1 + local h2_addr=2001:db8:1::2 + local h2_maddr=ff02::1:ff00:2 + local full_addr=20:01:0d:b8:00:01:00:00:00:00:00:00:00:00:00:01 + local h2_full_addr=20:01:0d:b8:00:01:00:00:00:00:00:00:00:00:00:02 + local csum="fd:32" + local csum_ns="1f:2f" + local dmac=33:33:00:00:00:01 + local h2_dmac=33:33:ff:00:00:02 + local h2_mac + local smac + + neigh_forward_grat_check || return $? + + echo + echo "Unsolicited NA forwarding" + echo "-------------------------" + + smac=$(ip -n "$h1" -j -p link show eth0."$vid" | jq -r '.[]["address"]') + + run_cmd "tc -n $sw1 qdisc replace dev vx0 clsact" + run_cmd "tc -n $sw1 filter replace dev vx0 egress pref 1 handle 101 proto ipv6 flower indev swp1 ip_proto icmpv6 dst_ip $daddr src_ip $saddr type 136 code 0 action pass" + run_cmd "tc -n $sw1 filter replace dev vx0 egress pref 1 handle 102 proto ipv6 flower indev swp1 ip_proto icmpv6 dst_ip $h2_maddr src_ip $saddr type 135 code 0 action pass" + + h2_mac=$(ip -n "$h2" -j -p link show eth0."$vid" | jq -r '.[]["address"]') + run_cmd "bridge -n $sw1 fdb replace $h2_mac dev vx0 master static vlan $vid" + run_cmd "ip -n $sw1 neigh replace $saddr lladdr $h2_mac nud permanent dev br0.$vid" + run_cmd "ip -n $sw1 neigh replace $h2_addr lladdr $h2_mac nud permanent dev br0.$vid" + + # Enable neighbor suppression. Unsolicited NA should be suppressed by + # default (neigh_forward_grat defaults to off). + run_cmd "bridge -n $sw1 link set dev vx0 neigh_suppress on" + run_cmd "bridge -n $sw1 -d link show dev vx0 | grep \"neigh_suppress on\"" + log_test $? 0 "\"neigh_suppress\" is on" + + # Send unsolicited NA and check it's suppressed. + run_cmd "ip netns exec $h1 mausezahn -6 eth0.$vid -c 1 -a $smac -b $dmac -A $saddr -B $daddr -t ip hop=255,next=58,payload=$(icmpv6_na_header_get "$csum" "$full_addr") -q" + tc_check_packets "$sw1" "dev vx0 egress" 101 0 + log_test $? 
0 "Unsolicited NA suppression" + + # Explicitly enable neigh_forward_grat and verify unsolicited NA is + # now forwarded. + run_cmd "bridge -n $sw1 link set dev vx0 neigh_forward_grat on" + run_cmd "bridge -n $sw1 -d link show dev vx0 | grep \"neigh_forward_grat on\"" + log_test $? 0 "\"neigh_forward_grat\" is on" + + run_cmd "ip netns exec $h1 mausezahn -6 eth0.$vid -c 1 -a $smac -b $dmac -A $saddr -B $daddr -t ip hop=255,next=58,payload=$(icmpv6_na_header_get "$csum" "$full_addr") -q" + tc_check_packets "$sw1" "dev vx0 egress" 101 1 + log_test $? 0 "Unsolicited NA forwarding" + + # Verify that solicited NS messages are still suppressed when + # neigh_forward_grat is enabled. + run_cmd "ip netns exec $h1 mausezahn -6 eth0.$vid -c 1 -a $smac -b $h2_dmac -A $saddr -B $h2_maddr -t ip hop=255,next=58,payload=$(icmpv6_header_get "$csum_ns" "$h2_full_addr") -q" + tc_check_packets "$sw1" "dev vx0 egress" 102 0 + log_test $? 0 "Solicited NS suppression with \"neigh_forward_grat\" on" + + # Disable neigh_forward_grat and verify suppression resumes. + run_cmd "bridge -n $sw1 link set dev vx0 neigh_forward_grat off" + run_cmd "bridge -n $sw1 -d link show dev vx0 | grep \"neigh_forward_grat off\"" + log_test $? 0 "\"neigh_forward_grat\" is off" + + run_cmd "ip netns exec $h1 mausezahn -6 eth0.$vid -c 1 -a $smac -b $dmac -A $saddr -B $daddr -t ip hop=255,next=58,payload=$(icmpv6_na_header_get "$csum" "$full_addr") -q" + tc_check_packets "$sw1" "dev vx0 egress" 101 1 + log_test $? 0 "Unsolicited NA suppression" +} + +neigh_vlan_forward_grat_arp() +{ + local vid1=10 + local vid2=20 + local sip1=192.0.2.1 + local sip2=192.0.2.17 + local h2_ip1=192.0.2.2 + local h2_mac1 + local h2_mac2 + + neigh_forward_grat_check || return $? 
+ + echo + echo "Per-VLAN gratuitous ARP forwarding" + echo "----------------------------------" + + run_cmd "tc -n $sw1 qdisc replace dev vx0 clsact" + run_cmd "tc -n $sw1 filter replace dev vx0 egress pref 1 handle 101 proto 0x0806 flower indev swp1 arp_tip $sip1 arp_sip $sip1 arp_op request action pass" + run_cmd "tc -n $sw1 filter replace dev vx0 egress pref 1 handle 102 proto 0x0806 flower indev swp1 arp_tip $sip2 arp_sip $sip2 arp_op request action pass" + run_cmd "tc -n $sw1 filter replace dev vx0 egress pref 1 handle 103 proto 0x0806 flower indev swp1 arp_tip $h2_ip1 arp_sip $sip1 arp_op request action pass" + + h2_mac1=$(ip -n "$h2" -j -p link show eth0."$vid1" | jq -r '.[]["address"]') + h2_mac2=$(ip -n "$h2" -j -p link show eth0."$vid2" | jq -r '.[]["address"]') + run_cmd "bridge -n $sw1 fdb replace $h2_mac1 dev vx0 master static vlan $vid1" + run_cmd "bridge -n $sw1 fdb replace $h2_mac2 dev vx0 master static vlan $vid2" + run_cmd "ip -n $sw1 neigh replace $sip1 lladdr $h2_mac1 nud permanent dev br0.$vid1" + run_cmd "ip -n $sw1 neigh replace $sip2 lladdr $h2_mac2 nud permanent dev br0.$vid2" + run_cmd "ip -n $sw1 neigh replace $h2_ip1 lladdr $h2_mac1 nud permanent dev br0.$vid1" + + # Enable per-{Port, VLAN} neighbor suppression. + run_cmd "bridge -n $sw1 link set dev vx0 neigh_vlan_suppress on" + run_cmd "bridge -n $sw1 -d link show dev vx0 | grep \"neigh_vlan_suppress on\"" + log_test $? 0 "\"neigh_vlan_suppress\" is on" + + # Enable neighbor suppression on VLAN 10. Gratuitous ARP should be + # suppressed by default on VLAN 10 (neigh_forward_grat defaults to off) + # but not on VLAN 20. + run_cmd "bridge -n $sw1 vlan set vid $vid1 dev vx0 neigh_suppress on" + run_cmd "bridge -n $sw1 -d vlan show dev vx0 vid $vid1 | grep \"neigh_suppress on\"" + log_test $? 0 "\"neigh_suppress\" is on (VLAN $vid1)" + + run_cmd "ip netns exec $h1 arping -U -c 1 -w 5 -I eth0.$vid1 $sip1" + tc_check_packets "$sw1" "dev vx0 egress" 101 0 + log_test $? 
0 "Gratuitous ARP suppression (VLAN $vid1)" + + run_cmd "ip netns exec $h1 arping -U -c 1 -w 5 -I eth0.$vid2 $sip2" + tc_check_packets "$sw1" "dev vx0 egress" 102 1 + log_test $? 0 "Gratuitous ARP forwarding (VLAN $vid2)" + + # Enable neigh_forward_grat on VLAN 10 and verify gratuitous ARP is + # now forwarded. + run_cmd "bridge -n $sw1 vlan set vid $vid1 dev vx0 neigh_forward_grat on" + run_cmd "bridge -n $sw1 -d vlan show dev vx0 vid $vid1 | grep \"neigh_forward_grat on\"" + log_test $? 0 "\"neigh_forward_grat\" is on (VLAN $vid1)" + + run_cmd "ip netns exec $h1 arping -U -c 1 -w 5 -I eth0.$vid1 $sip1" + tc_check_packets "$sw1" "dev vx0 egress" 101 1 + log_test $? 0 "Gratuitous ARP forwarding (VLAN $vid1)" + + # Verify that regular (non-gratuitous) ARP requests on VLAN $vid1 are + # still suppressed when neigh_forward_grat is enabled. + run_cmd "ip netns exec $h1 arping -c 1 -w 5 -I eth0.$vid1 $h2_ip1" + tc_check_packets "$sw1" "dev vx0 egress" 103 0 + log_test $? 0 "Regular ARP suppression with \"neigh_forward_grat\" on (VLAN $vid1)" + + # Enable neighbor suppression on VLAN 20 (neigh_forward_grat defaults to + # off), and verify gratuitous ARP is suppressed on VLAN 20. + run_cmd "bridge -n $sw1 vlan set vid $vid2 dev vx0 neigh_suppress on" + run_cmd "bridge -n $sw1 -d vlan show dev vx0 vid $vid2 | grep \"neigh_suppress on\"" + log_test $? 0 "\"neigh_suppress\" is on (VLAN $vid2)" + + # VLAN 10 should still forward (neigh_forward_grat is on). + run_cmd "ip netns exec $h1 arping -U -c 1 -w 5 -I eth0.$vid1 $sip1" + tc_check_packets "$sw1" "dev vx0 egress" 101 2 + log_test $? 0 "Gratuitous ARP forwarding (VLAN $vid1)" + + # VLAN 20 should suppress (neigh_forward_grat defaults to off). + run_cmd "ip netns exec $h1 arping -U -c 1 -w 5 -I eth0.$vid2 $sip2" + tc_check_packets "$sw1" "dev vx0 egress" 102 1 + log_test $? 
0 "Gratuitous ARP suppression (VLAN $vid2)" +} + +neigh_vlan_forward_grat_na() +{ + local vid1=10 + local vid2=20 + local saddr1=2001:db8:1::1 + local daddr=ff02::1 + local h2_addr1=2001:db8:1::2 + local h2_maddr1=ff02::1:ff00:2 + local full_addr1=20:01:0d:b8:00:01:00:00:00:00:00:00:00:00:00:01 + local h2_full_addr1=20:01:0d:b8:00:01:00:00:00:00:00:00:00:00:00:02 + local csum1="fd:32" + local csum_ns1="1f:2f" + local saddr2=2001:db8:2::1 + local full_addr2=20:01:0d:b8:00:02:00:00:00:00:00:00:00:00:00:01 + local csum2="fd:30" + local dmac=33:33:00:00:00:01 + local h2_dmac1=33:33:ff:00:00:02 + local h2_mac1 + local h2_mac2 + local smac + + neigh_forward_grat_check || return $? + + echo + echo "Per-VLAN unsolicited NA forwarding" + echo "----------------------------------" + + smac=$(ip -n "$h1" -j -p link show eth0."$vid1" | jq -r '.[]["address"]') + + run_cmd "tc -n $sw1 qdisc replace dev vx0 clsact" + run_cmd "tc -n $sw1 filter replace dev vx0 egress pref 1 handle 101 proto ipv6 flower indev swp1 ip_proto icmpv6 dst_ip $daddr src_ip $saddr1 type 136 code 0 action pass" + run_cmd "tc -n $sw1 filter replace dev vx0 egress pref 1 handle 102 proto ipv6 flower indev swp1 ip_proto icmpv6 dst_ip $daddr src_ip $saddr2 type 136 code 0 action pass" + run_cmd "tc -n $sw1 filter replace dev vx0 egress pref 1 handle 103 proto ipv6 flower indev swp1 ip_proto icmpv6 dst_ip $h2_maddr1 src_ip $saddr1 type 135 code 0 action pass" + + h2_mac1=$(ip -n "$h2" -j -p link show eth0."$vid1" | jq -r '.[]["address"]') + h2_mac2=$(ip -n "$h2" -j -p link show eth0."$vid2" | jq -r '.[]["address"]') + run_cmd "bridge -n $sw1 fdb replace $h2_mac1 dev vx0 master static vlan $vid1" + run_cmd "bridge -n $sw1 fdb replace $h2_mac2 dev vx0 master static vlan $vid2" + run_cmd "ip -n $sw1 neigh replace $saddr1 lladdr $h2_mac1 nud permanent dev br0.$vid1" + run_cmd "ip -n $sw1 neigh replace $saddr2 lladdr $h2_mac2 nud permanent dev br0.$vid2" + run_cmd "ip -n $sw1 neigh replace $h2_addr1 lladdr $h2_mac1 
nud permanent dev br0.$vid1" + + # Enable per-{Port, VLAN} neighbor suppression. + run_cmd "bridge -n $sw1 link set dev vx0 neigh_vlan_suppress on" + run_cmd "bridge -n $sw1 -d link show dev vx0 | grep \"neigh_vlan_suppress on\"" + log_test $? 0 "\"neigh_vlan_suppress\" is on" + + # Enable neighbor suppression on VLAN 10. Unsolicited NA should be + # suppressed by default on VLAN 10 (neigh_forward_grat defaults to off) + # but not on VLAN 20. + run_cmd "bridge -n $sw1 vlan set vid $vid1 dev vx0 neigh_suppress on" + run_cmd "bridge -n $sw1 -d vlan show dev vx0 vid $vid1 | grep \"neigh_suppress on\"" + log_test $? 0 "\"neigh_suppress\" is on (VLAN $vid1)" + + run_cmd "ip netns exec $h1 mausezahn -6 eth0.$vid1 -c 1 -a $smac -b $dmac -A $saddr1 -B $daddr -t ip hop=255,next=58,payload=$(icmpv6_na_header_get "$csum1" "$full_addr1") -q" + tc_check_packets "$sw1" "dev vx0 egress" 101 0 + log_test $? 0 "Unsolicited NA suppression (VLAN $vid1)" + + run_cmd "ip netns exec $h1 mausezahn -6 eth0.$vid2 -c 1 -a $smac -b $dmac -A $saddr2 -B $daddr -t ip hop=255,next=58,payload=$(icmpv6_na_header_get "$csum2" "$full_addr2") -q" + tc_check_packets "$sw1" "dev vx0 egress" 102 1 + log_test $? 0 "Unsolicited NA forwarding (VLAN $vid2)" + + # Enable neigh_forward_grat on VLAN 10 and verify unsolicited NA is + # now forwarded. + run_cmd "bridge -n $sw1 vlan set vid $vid1 dev vx0 neigh_forward_grat on" + run_cmd "bridge -n $sw1 -d vlan show dev vx0 vid $vid1 | grep \"neigh_forward_grat on\"" + log_test $? 0 "\"neigh_forward_grat\" is on (VLAN $vid1)" + + run_cmd "ip netns exec $h1 mausezahn -6 eth0.$vid1 -c 1 -a $smac -b $dmac -A $saddr1 -B $daddr -t ip hop=255,next=58,payload=$(icmpv6_na_header_get "$csum1" "$full_addr1") -q" + tc_check_packets "$sw1" "dev vx0 egress" 101 1 + log_test $? 0 "Unsolicited NA forwarding (VLAN $vid1)" + + # Verify that solicited NS messages on VLAN $vid1 are still suppressed + # when neigh_forward_grat is enabled. 
+ run_cmd "ip netns exec $h1 mausezahn -6 eth0.$vid1 -c 1 -a $smac -b $h2_dmac1 -A $saddr1 -B $h2_maddr1 -t ip hop=255,next=58,payload=$(icmpv6_header_get "$csum_ns1" "$h2_full_addr1") -q" + tc_check_packets "$sw1" "dev vx0 egress" 103 0 + log_test $? 0 "Solicited NS suppression with \"neigh_forward_grat\" on (VLAN $vid1)" + + # Enable neighbor suppression on VLAN 20 (neigh_forward_grat defaults to + # off), and verify unsolicited NA is suppressed on VLAN 20. + run_cmd "bridge -n $sw1 vlan set vid $vid2 dev vx0 neigh_suppress on" + run_cmd "bridge -n $sw1 -d vlan show dev vx0 vid $vid2 | grep \"neigh_suppress on\"" + log_test $? 0 "\"neigh_suppress\" is on (VLAN $vid2)" + + # VLAN 10 should still forward (neigh_forward_grat is on). + run_cmd "ip netns exec $h1 mausezahn -6 eth0.$vid1 -c 1 -a $smac -b $dmac -A $saddr1 -B $daddr -t ip hop=255,next=58,payload=$(icmpv6_na_header_get "$csum1" "$full_addr1") -q" + tc_check_packets "$sw1" "dev vx0 egress" 101 2 + log_test $? 0 "Unsolicited NA forwarding (VLAN $vid1)" + + # VLAN 20 should suppress (neigh_forward_grat defaults to off). + run_cmd "ip netns exec $h1 mausezahn -6 eth0.$vid2 -c 1 -a $smac -b $dmac -A $saddr2 -B $daddr -t ip hop=255,next=58,payload=$(icmpv6_na_header_get "$csum2" "$full_addr2") -q" + tc_check_packets "$sw1" "dev vx0 egress" 102 1 + log_test $? 0 "Unsolicited NA suppression (VLAN $vid2)" +} + ################################################################################ # Usage @@ -961,7 +1422,10 @@ cleanup for t in $TESTS do - setup; $t; cleanup; + setup + $t + ret=$(ksft_exit_status_merge "$ret" $?) 
+ cleanup done if [ "$TESTS" != "none" ]; then diff --git a/tools/testing/selftests/net/test_vxlan_vnifilter_notify.sh b/tools/testing/selftests/net/test_vxlan_vnifilter_notify.sh new file mode 100755 index 000000000000..9d51a9e02ae0 --- /dev/null +++ b/tools/testing/selftests/net/test_vxlan_vnifilter_notify.sh @@ -0,0 +1,184 @@ +#!/bin/bash +# SPDX-License-Identifier: GPL-2.0 +# shellcheck disable=SC2034,SC2154,SC2317,SC2329 +# +# Test for VXLAN vnifilter netlink notifications (RTM_NEWTUNNEL / +# RTM_DELTUNNEL). +# +# Verifies that: +# - Adding a new VNI sends a notification +# - Adding a new VNI with a remote sends a notification +# - Deleting a VNI sends a notification +# - Re-adding an existing VNI with the same attributes does not send +# a spurious notification +# - Updating an existing VNI's remote sends a notification +# - Deleting a non-existent VNI does not send a notification + +source lib.sh + +require_command bridge + +VXLAN_DEV=vxlan100 + +ALL_TESTS=" + test_vni_add_notify + test_vni_add_remote_notify + test_vni_del_notify + test_vni_readd_no_notify + test_vni_update_remote_notify + test_vni_del_nonexistent_no_notify +" + +setup_prepare() +{ + setup_ns NS1 + defer cleanup_all_ns + + ip -n "$NS1" link add $VXLAN_DEV type vxlan dstport 4789 \ + local 10.0.0.1 nolearning external vnifilter + ip -n "$NS1" link set $VXLAN_DEV up +} + +# Run bridge monitor in the background, execute a command, then count +# the notification lines. +# Usage: vni_notify_check <command> [args...] +# Sets: NOTIFY_COUNT with the number of notifications observed. +vni_notify_check() +{ + local tmpf cmd_ret monitor_pid + + tmpf=$(mktemp) + defer rm "$tmpf" + + defer_scope_push + ip netns exec "$NS1" bridge monitor vni > "$tmpf" 2>/dev/null & + monitor_pid=$! + defer kill_process "$monitor_pid" + + sleep 0.5 + if [ ! -e "/proc/$monitor_pid" ]; then + RET=$ksft_skip + log_test "iproute2 'bridge monitor vni' not supported" + return "$RET" + fi + + "$@" + cmd_ret=$? 
+ sleep 0.2 + defer_scope_pop + + NOTIFY_COUNT=$(grep -c "$VXLAN_DEV" "$tmpf") + NOTIFY_COUNT=${NOTIFY_COUNT:-0} + return "$cmd_ret" +} + +# Adding a brand new VNI should produce a notification. +test_vni_add_notify() +{ + RET=0 + + vni_notify_check \ + bridge -n "$NS1" vni add vni 1000 dev "$VXLAN_DEV" + check_err $? "Failed to add VNI" + + [ "$NOTIFY_COUNT" -eq 1 ] + check_err $? "Expected 1 notification for VNI add, got $NOTIFY_COUNT" + + bridge -n "$NS1" vni delete vni 1000 dev "$VXLAN_DEV" 2>/dev/null + + log_test "VNI add sends notification" +} + +# Adding a VNI with a remote should produce a notification. +test_vni_add_remote_notify() +{ + RET=0 + + vni_notify_check \ + bridge -n "$NS1" vni add vni 4000 remote 10.0.0.2 dev "$VXLAN_DEV" + check_err $? "Failed to add VNI with remote" + + [ "$NOTIFY_COUNT" -eq 1 ] + check_err $? "Expected 1 notification for VNI add with remote, got $NOTIFY_COUNT" + + bridge -n "$NS1" vni delete vni 4000 dev "$VXLAN_DEV" + + log_test "VNI add with remote sends notification" +} + +# Deleting a VNI should produce a notification. +test_vni_del_notify() +{ + RET=0 + + bridge -n "$NS1" vni add vni 2000 dev "$VXLAN_DEV" + + vni_notify_check \ + bridge -n "$NS1" vni delete vni 2000 dev "$VXLAN_DEV" + check_err $? "Failed to delete VNI" + + [ "$NOTIFY_COUNT" -eq 1 ] + check_err $? "Expected 1 notification for VNI del, got $NOTIFY_COUNT" + + log_test "VNI delete sends notification" +} + +# Re-adding an existing VNI with the same attributes should not produce +# a notification. +test_vni_readd_no_notify() +{ + RET=0 + + bridge -n "$NS1" vni add vni 3000 dev "$VXLAN_DEV" + + vni_notify_check \ + bridge -n "$NS1" vni add vni 3000 dev "$VXLAN_DEV" + check_err $? "Failed to re-add VNI" + + [ "$NOTIFY_COUNT" -eq 0 ] + check_err $? 
"Expected 0 notifications for VNI re-add, got $NOTIFY_COUNT" + + bridge -n "$NS1" vni delete vni 3000 dev "$VXLAN_DEV" + + log_test "VNI re-add does not send spurious notification" +} + +# Updating an existing VNI's remote should produce a notification. +test_vni_update_remote_notify() +{ + RET=0 + + bridge -n "$NS1" vni add vni 5000 remote 10.0.0.2 dev "$VXLAN_DEV" + + vni_notify_check \ + bridge -n "$NS1" vni add vni 5000 remote 10.0.0.3 dev "$VXLAN_DEV" + check_err $? "Failed to update VNI remote" + + [ "$NOTIFY_COUNT" -eq 1 ] + check_err $? "Expected 1 notification for VNI remote update, got $NOTIFY_COUNT" + + bridge -n "$NS1" vni delete vni 5000 dev "$VXLAN_DEV" + + log_test "VNI remote update sends notification" +} + +# Deleting a non-existent VNI should not produce a notification. +test_vni_del_nonexistent_no_notify() +{ + RET=0 + + vni_notify_check \ + bridge -n "$NS1" vni delete vni 9999 dev "$VXLAN_DEV" 2>/dev/null + + [ "$NOTIFY_COUNT" -eq 0 ] + check_err $? "Expected 0 notifications for non-existent VNI del, got $NOTIFY_COUNT" + + log_test "Non-existent VNI delete does not send notification" +} + +trap defer_scopes_cleanup EXIT + +setup_prepare +tests_run + +exit "$EXIT_STATUS" diff --git a/tools/testing/selftests/net/tls.c b/tools/testing/selftests/net/tls.c index 9e2ccea13d70..cbdd3ea28b99 100644 --- a/tools/testing/selftests/net/tls.c +++ b/tools/testing/selftests/net/tls.c @@ -946,6 +946,49 @@ TEST_F(tls, peek_and_splice) EXPECT_EQ(memcmp(mem_send, mem_recv, send_len), 0); } +TEST_F(tls, splice_to_pipe_small) +{ + int send_len = TLS_PAYLOAD_MAX_LEN; + char mem_send[TLS_PAYLOAD_MAX_LEN]; + char mem_recv[TLS_PAYLOAD_MAX_LEN]; + size_t total = 0; + int p[2]; + + memrnd(mem_send, sizeof(mem_send)); + + ASSERT_GE(pipe(p), 0); + + /* Shrink pipe to 1 page (typically 4096 bytes) to force multiple + * splice iterations for a 16384-byte TLS record. 
+ */ + EXPECT_GE(fcntl(p[1], F_SETPIPE_SZ, 4096), 4096); + + EXPECT_EQ(send(self->fd, mem_send, send_len, 0), send_len); + + while (total < (size_t)send_len) { + ssize_t spliced, drained; + + spliced = splice(self->cfd, NULL, p[1], NULL, + send_len - total, 0); + EXPECT_GT(spliced, 0); + if (spliced <= 0) + break; + + drained = read(p[0], mem_recv + total, spliced); + EXPECT_EQ(drained, spliced); + if (drained <= 0) + break; + + total += drained; + } + + EXPECT_EQ(total, (size_t)send_len); + EXPECT_EQ(memcmp(mem_send, mem_recv, send_len), 0); + + close(p[0]); + close(p[1]); +} + #define MAX_FRAGS 48 TEST_F(tls, splice_short) { @@ -954,6 +997,8 @@ TEST_F(tls, splice_short) char sendbuf[0x100]; char sendchar = 'S'; int pipefds[2]; + int pipe_sz; + int ret; int i; sendchar_iov.iov_base = &sendchar; @@ -962,7 +1007,11 @@ TEST_F(tls, splice_short) memset(sendbuf, 's', sizeof(sendbuf)); ASSERT_GE(pipe2(pipefds, O_NONBLOCK), 0); - ASSERT_GE(fcntl(pipefds[0], F_SETPIPE_SZ, (MAX_FRAGS + 1) * 0x1000), 0); + pipe_sz = (MAX_FRAGS + 1) * getpagesize(); + ret = fcntl(pipefds[0], F_SETPIPE_SZ, pipe_sz); + if (ret < 0 && errno == EPERM) + SKIP(return, "insufficient pipe capacity"); + ASSERT_GE(ret, pipe_sz); for (i = 0; i < MAX_FRAGS; i++) ASSERT_GE(vmsplice(pipefds[1], &sendchar_iov, 1, 0), 0); @@ -1506,7 +1555,7 @@ test_mutliproc(struct __test_metadata *_metadata, struct _test_data_tls *self, res = recv(self->cfd, rb, left > sizeof(rb) ? sizeof(rb) : left, 0); - EXPECT_GE(res, 0); + ASSERT_GE(res, 0); left -= res; } } else { @@ -1523,7 +1572,7 @@ test_mutliproc(struct __test_metadata *_metadata, struct _test_data_tls *self, res = send(self->fd, buf, left > file_sz ? 
file_sz : left, 0); - EXPECT_GE(res, 0); + ASSERT_GE(res, 0); left -= res; } } diff --git a/tools/testing/selftests/net/vlan_bridge_binding.sh b/tools/testing/selftests/net/vlan_bridge_binding.sh index e8c02c64e03a..d04caa14202d 100755 --- a/tools/testing/selftests/net/vlan_bridge_binding.sh +++ b/tools/testing/selftests/net/vlan_bridge_binding.sh @@ -64,7 +64,7 @@ check_operstate() local expect=$1; shift local operstate - operstate=$(busywait 1000 \ + operstate=$(busywait 2000 \ operstate_is "$dev" "$expect") check_err $? "Got operstate of $operstate, expected $expect" } diff --git a/tools/testing/selftests/nolibc/Makefile.include b/tools/testing/selftests/nolibc/Makefile.include index 96fe2bc2191e..c30ca3a9ef14 100644 --- a/tools/testing/selftests/nolibc/Makefile.include +++ b/tools/testing/selftests/nolibc/Makefile.include @@ -6,7 +6,7 @@ _CFLAGS_STACKPROTECTOR ?= $(call try-run, \ $(__CFLAGS_STACKPROTECTOR)) _CFLAGS_SANITIZER ?= $(call cc-option,-fsanitize=undefined -fsanitize-trap=all) CFLAGS_NOLIBC_TEST ?= -Os -fno-ident -fno-asynchronous-unwind-tables -std=c89 \ - -W -Wall -Wextra -Wundef \ + -W -Wall -Wextra -Wundef -Wwrite-strings \ $(call cc-option,-fno-stack-protector) $(call cc-option,-Wmissing-prototypes) \ $(_CFLAGS_STACKPROTECTOR) $(_CFLAGS_SANITIZER) diff --git a/tools/testing/selftests/nolibc/Makefile.nolibc b/tools/testing/selftests/nolibc/Makefile.nolibc index f30bc68470cc..06f881e2e90c 100644 --- a/tools/testing/selftests/nolibc/Makefile.nolibc +++ b/tools/testing/selftests/nolibc/Makefile.nolibc @@ -64,6 +64,7 @@ ARCH_s390x = s390 ARCH_sparc32 = sparc ARCH_sparc64 = sparc ARCH_sh4 = sh +ARCH_parisc32 = parisc ARCH := $(or $(ARCH_$(XARCH)),$(XARCH)) # kernel image names by architecture @@ -74,33 +75,18 @@ IMAGE_x86 = arch/x86/boot/bzImage IMAGE_arm64 = arch/arm64/boot/Image IMAGE_arm = arch/arm/boot/zImage IMAGE_armthumb = arch/arm/boot/zImage -IMAGE_mips32le = vmlinuz -IMAGE_mips32be = vmlinuz -IMAGE_mipsn32le = vmlinuz -IMAGE_mipsn32be = 
vmlinuz -IMAGE_mips64le = vmlinuz -IMAGE_mips64be = vmlinuz -IMAGE_ppc = vmlinux -IMAGE_ppc64 = vmlinux IMAGE_ppc64le = arch/powerpc/boot/zImage -IMAGE_riscv = arch/riscv/boot/Image IMAGE_riscv32 = arch/riscv/boot/Image IMAGE_riscv64 = arch/riscv/boot/Image IMAGE_s390x = arch/s390/boot/bzImage IMAGE_loongarch = arch/loongarch/boot/vmlinuz.efi IMAGE_sparc32 = arch/sparc/boot/image IMAGE_sparc64 = arch/sparc/boot/image -IMAGE_m68k = vmlinux IMAGE_sh4 = arch/sh/boot/zImage -IMAGE = $(objtree)/$(IMAGE_$(XARCH)) +IMAGE = $(objtree)/$(or $(IMAGE_$(XARCH)),vmlinux) IMAGE_NAME = $(notdir $(IMAGE)) # default kernel configurations that appear to be usable -DEFCONFIG_i386 = defconfig -DEFCONFIG_x86_64 = defconfig -DEFCONFIG_x32 = defconfig -DEFCONFIG_x86 = defconfig -DEFCONFIG_arm64 = defconfig DEFCONFIG_arm = multi_v7_defconfig DEFCONFIG_armthumb = multi_v7_defconfig DEFCONFIG_mips32le = malta_defconfig @@ -112,20 +98,18 @@ DEFCONFIG_mips64be = malta_defconfig generic/64r2.config generic/eb.config DEFCONFIG_ppc = pmac32_defconfig DEFCONFIG_ppc64 = powernv_be_defconfig DEFCONFIG_ppc64le = powernv_defconfig -DEFCONFIG_riscv = defconfig DEFCONFIG_riscv32 = rv32_defconfig -DEFCONFIG_riscv64 = defconfig -DEFCONFIG_s390x = defconfig -DEFCONFIG_loongarch = defconfig DEFCONFIG_sparc32 = sparc32_defconfig DEFCONFIG_sparc64 = sparc64_defconfig DEFCONFIG_m68k = virt_defconfig DEFCONFIG_sh4 = rts7751r2dplus_defconfig -DEFCONFIG = $(DEFCONFIG_$(XARCH)) +DEFCONFIG_openrisc = virt_defconfig +DEFCONFIG = $(or $(DEFCONFIG_$(XARCH)),defconfig) EXTRACONFIG_x32 = -e CONFIG_X86_X32_ABI EXTRACONFIG_arm = -e CONFIG_NAMESPACES EXTRACONFIG_armthumb = -e CONFIG_NAMESPACES +EXTRACONFIG_sparc32 = -e CONFIG_TMPFS EXTRACONFIG_m68k = -e CONFIG_BLK_DEV_INITRD EXTRACONFIG_sh4 = -e CONFIG_BLK_DEV_INITRD -e CONFIG_CMDLINE_FROM_BOOTLOADER EXTRACONFIG = $(EXTRACONFIG_$(XARCH)) @@ -134,37 +118,27 @@ EXTRACONFIG = $(EXTRACONFIG_$(XARCH)) TEST = # QEMU_ARCH: arch names used by qemu -QEMU_ARCH_i386 = i386 
-QEMU_ARCH_x86_64 = x86_64 QEMU_ARCH_x32 = x86_64 QEMU_ARCH_x86 = x86_64 QEMU_ARCH_arm64 = aarch64 -QEMU_ARCH_arm = arm QEMU_ARCH_armthumb = arm QEMU_ARCH_mips32le = mipsel # works with malta_defconfig -QEMU_ARCH_mips32be = mips +QEMU_ARCH_mips32be = mips QEMU_ARCH_mipsn32le = mips64el QEMU_ARCH_mipsn32be = mips64 QEMU_ARCH_mips64le = mips64el QEMU_ARCH_mips64be = mips64 -QEMU_ARCH_ppc = ppc -QEMU_ARCH_ppc64 = ppc64 QEMU_ARCH_ppc64le = ppc64 -QEMU_ARCH_riscv = riscv64 -QEMU_ARCH_riscv32 = riscv32 -QEMU_ARCH_riscv64 = riscv64 -QEMU_ARCH_s390x = s390x QEMU_ARCH_loongarch = loongarch64 QEMU_ARCH_sparc32 = sparc -QEMU_ARCH_sparc64 = sparc64 -QEMU_ARCH_m68k = m68k -QEMU_ARCH_sh4 = sh4 -QEMU_ARCH = $(QEMU_ARCH_$(XARCH)) +QEMU_ARCH_openrisc = or1k +QEMU_ARCH_parisc32 = hppa +QEMU_ARCH = $(or $(QEMU_ARCH_$(XARCH)),$(XARCH)) QEMU_ARCH_USER_ppc64le = ppc64le QEMU_ARCH_USER_mipsn32le = mipsn32el QEMU_ARCH_USER_mipsn32be = mipsn32 -QEMU_ARCH_USER = $(or $(QEMU_ARCH_USER_$(XARCH)),$(QEMU_ARCH_$(XARCH))) +QEMU_ARCH_USER = $(or $(QEMU_ARCH_USER_$(XARCH)),$(QEMU_ARCH)) QEMU_BIOS_DIR = /usr/share/edk2/ QEMU_BIOS_loongarch = $(QEMU_BIOS_DIR)/loongarch64/OVMF_CODE.fd @@ -190,7 +164,6 @@ QEMU_ARGS_mips64be = -M malta -cpu 5KEc -append "panic=-1 $(TEST:%=NOLIBC_TEST QEMU_ARGS_ppc = -M g3beige -append "console=ttyS0 panic=-1 $(TEST:%=NOLIBC_TEST=%)" QEMU_ARGS_ppc64 = -M powernv -append "console=hvc0 panic=-1 $(TEST:%=NOLIBC_TEST=%)" QEMU_ARGS_ppc64le = -M powernv -append "console=hvc0 panic=-1 $(TEST:%=NOLIBC_TEST=%)" -QEMU_ARGS_riscv = -M virt -append "console=ttyS0 panic=-1 $(TEST:%=NOLIBC_TEST=%)" QEMU_ARGS_riscv32 = -M virt -append "console=ttyS0 panic=-1 $(TEST:%=NOLIBC_TEST=%)" QEMU_ARGS_riscv64 = -M virt -append "console=ttyS0 panic=-1 $(TEST:%=NOLIBC_TEST=%)" QEMU_ARGS_s390x = -M s390-ccw-virtio -append "console=ttyS0 panic=-1 $(TEST:%=NOLIBC_TEST=%)" @@ -199,6 +172,8 @@ QEMU_ARGS_sparc32 = -M SS-5 -m 256M -append "console=ttyS0,115200 panic=-1 $( QEMU_ARGS_sparc64 = -M sun4u 
-append "console=ttyS0,115200 panic=-1 $(TEST:%=NOLIBC_TEST=%)" QEMU_ARGS_m68k = -M virt -append "console=ttyGF0,115200 panic=-1 $(TEST:%=NOLIBC_TEST=%)" QEMU_ARGS_sh4 = -M r2d -serial file:/dev/stdout -append "console=ttySC1,115200 panic=-1 $(TEST:%=NOLIBC_TEST=%)" +QEMU_ARGS_openrisc = -M virt -m 512M -append "console=ttyS0 panic=-1 $(TEST:%=NOLIBC_TEST=%)" +QEMU_ARGS_parisc32 = -M B160L -append "console=ttyS0 panic=-1 $(TEST:%=NOLIBC_TEST=%)" QEMU_ARGS = -m 1G $(QEMU_ARGS_$(XARCH)) $(QEMU_ARGS_BIOS) $(QEMU_ARGS_EXTRA) # OUTPUT is only set when run from the main makefile, otherwise @@ -215,6 +190,7 @@ CFLAGS_i386 = $(call cc-option,-m32) CFLAGS_x32 = -mx32 CFLAGS_arm = -marm CFLAGS_armthumb = -mthumb -march=armv6t2 +CFLAGS_parisc32 = -mfast-indirect-calls CFLAGS_ppc = -m32 -mbig-endian -mno-vsx $(call cc-option,-mmultiple) CFLAGS_ppc64 = -m64 -mbig-endian -mno-vsx $(call cc-option,-mmultiple) CFLAGS_ppc64le = -m64 -mlittle-endian -mno-vsx $(call cc-option,-mabi=elfv2) @@ -233,6 +209,7 @@ CFLAGS_XARCH = $(CFLAGS_$(XARCH)) endif LDLIBS_ppc = $(if $(LLVM),,-lgcc) +LDLIBS_openrisc = $(if $(LLVM),,-lgcc) LDLIBS = $(LDLIBS_$(XARCH)) include Makefile.include diff --git a/tools/testing/selftests/nolibc/nolibc-test.c b/tools/testing/selftests/nolibc/nolibc-test.c index d3c4facb54c0..c1c1ce43a047 100644 --- a/tools/testing/selftests/nolibc/nolibc-test.c +++ b/tools/testing/selftests/nolibc/nolibc-test.c @@ -2,6 +2,7 @@ #define _GNU_SOURCE #define _LARGEFILE64_SOURCE +#define _FILE_OFFSET_BITS 64 /* libc-specific include files * The program may be built in 3 ways: @@ -45,6 +46,7 @@ #include <stdbool.h> #include <byteswap.h> #include <endian.h> +#include <alloca.h> #pragma GCC diagnostic ignored "-Wmissing-prototypes" @@ -647,20 +649,25 @@ int expect_str_buf_eq(size_t expr, const char *buf, size_t val, int llen, const return 0; } +enum strtox_func { + strtox_func_strtol, + strtox_func_strtoul, +}; + #define EXPECT_STRTOX(cond, func, input, base, expected, chars, 
expected_errno) \ - do { if (!(cond)) result(llen, SKIPPED); else ret += expect_strtox(llen, func, input, base, expected, chars, expected_errno); } while (0) + do { if (!(cond)) result(llen, SKIPPED); else ret += expect_strtox(llen, strtox_func_ ## func, input, base, expected, chars, expected_errno); } while (0) static __attribute__((unused)) -int expect_strtox(int llen, void *func, const char *input, int base, intmax_t expected, int expected_chars, int expected_errno) +int expect_strtox(int llen, enum strtox_func func, const char *input, int base, intmax_t expected, int expected_chars, int expected_errno) { char *endptr; int actual_errno, actual_chars; intmax_t r; errno = 0; - if (func == strtol) { + if (func == strtox_func_strtol) { r = strtol(input, &endptr, base); - } else if (func == strtoul) { + } else if (func == strtox_func_strtoul) { r = strtoul(input, &endptr, base); } else { result(llen, FAIL); @@ -797,7 +804,7 @@ int test_getdents64(const char *dir) int fd, ret; int err; - ret = fd = open(dir, O_RDONLY | O_DIRECTORY, 0); + ret = fd = open(dir, O_RDONLY | O_DIRECTORY); if (ret < 0) return ret; @@ -1010,6 +1017,57 @@ int test_fork(enum fork_type type) } } +int test_ftruncate(void) +{ + struct stat stat_buf; + int ret, fd; + + ret = ftruncate(-1, 0); + if (ret != -1 || errno != EBADF) { + errno = EINVAL; + return __LINE__; + } + + fd = memfd_create(__func__, 0); + if (fd == -1) + return __LINE__; + + /* + * This also tests that the high 32-bit half is passed through correctly. + * If it gets lost, the kernel will see a positive number and not fail. 
+ */ + ret = ftruncate(fd, -1); + if (!(ret == -1 && errno == EINVAL)) { + if (ret == 0) + errno = EINVAL; + ret = __LINE__; + goto end; + } + + ret = ftruncate(fd, 42); + if (ret != 0) { + ret = __LINE__; + goto end; + } + + ret = fstat(fd, &stat_buf); + if (ret != 0) { + ret = __LINE__; + goto end; + } + + if (stat_buf.st_size != 42) { + errno = EINVAL; + ret = __LINE__; + goto end; + } + +end: + close(fd); + + return ret; +} + int test_stat_timestamps(void) { struct stat st; @@ -1298,6 +1356,45 @@ int test_openat(void) return 0; } +int test_open_mode(void) +{ + const mode_t mode = 0444; + struct stat stat_buf; + int fd, ret; + + fd = open("/tmp", O_TMPFILE | O_RDWR, mode); + if (fd == -1) + return -1; + + ret = fstat(fd, &stat_buf); + close(fd); + + if (ret == -1) + return -1; + + if ((stat_buf.st_mode & 0777) != mode) + return -1; + + return 0; +} + +int test_nolibc_enosys(void) +{ + if (true) + return 0; + +#if defined(NOLIBC) + /* + * __nolibc_enosys() will fail the compilation. + * Make sure it can be optimized away if not actually called. 
+ */ + if (__nolibc_enosys("something") != -ENOSYS) + return 1; +#endif + + return 0; +} + int test_namespace(void) { int original_ns, new_ns, ret; @@ -1364,6 +1461,52 @@ out: return ret; } +int test_large_file(void) +{ + off_t large_seek = ((off_t)UINT32_MAX) + 100; + int fd, ret, saved_errno; + ssize_t written; + off_t off; + +#if defined(__mips__) && defined(_ABIN32) + /* https://lore.kernel.org/qemu-devel/fed03914-a95a-4522-a432-f129264cb2ac@t-8ch.de/ */ + if (getpid() != 1) + return 0; +#endif + + if (large_seek < UINT32_MAX) { + errno = EOVERFLOW; + return -1; + } + + fd = open("/tmp", O_TMPFILE | O_RDWR, 0644); + if (fd == -1) + return -1; + + off = lseek(fd, large_seek, SEEK_CUR); + if (off == -1) { + ret = off; + goto out; + } else if (off != large_seek) { + errno = ERANGE; + ret = -1; + goto out; + } + + written = write(fd, "1", 1); + if (written == -1) { + ret = written; + goto out; + } + + ret = 0; +out: + saved_errno = errno; + close(fd); + errno = saved_errno; + return ret; +} + /* Run syscall tests between IDs <min> and <max>. * Return 0 on success, non-zero on failure. 
*/ @@ -1441,12 +1584,13 @@ int run_syscall(int min, int max) CASE_TEST(dup2_m1); tmp = dup2(-1, 100); EXPECT_SYSER(1, tmp, -1, EBADF); if (tmp != -1) close(tmp); break; CASE_TEST(dup3_0); tmp = dup3(0, 100, 0); EXPECT_SYSNE(1, tmp, -1); close(tmp); break; CASE_TEST(dup3_m1); tmp = dup3(-1, 100, 0); EXPECT_SYSER(1, tmp, -1, EBADF); if (tmp != -1) close(tmp); break; - CASE_TEST(execve_root); EXPECT_SYSER(1, execve("/", (char*[]){ [0] = "/", [1] = NULL }, NULL), -1, EACCES); break; + CASE_TEST(execve_root); EXPECT_SYSER(1, execve("/", (char*[]){ [0] = (char []){"/"}, [1] = NULL }, NULL), -1, EACCES); break; CASE_TEST(fchdir_stdin); EXPECT_SYSER(1, fchdir(STDIN_FILENO), -1, ENOTDIR); break; CASE_TEST(fchdir_badfd); EXPECT_SYSER(1, fchdir(-1), -1, EBADF); break; CASE_TEST(file_stream); EXPECT_SYSZR(1, test_file_stream()); break; CASE_TEST(file_stream_wsr); EXPECT_SYSZR(1, test_file_stream_wsr()); break; CASE_TEST(fork); EXPECT_SYSZR(1, test_fork(FORK_STANDARD)); break; + CASE_TEST(ftruncate); EXPECT_SYSZR(1, test_ftruncate()); break; CASE_TEST(getdents64_root); EXPECT_SYSNE(1, test_getdents64("/"), -1); break; CASE_TEST(getdents64_null); EXPECT_SYSER(1, test_getdents64("/dev/null"), -1, ENOTDIR); break; CASE_TEST(directories); EXPECT_SYSZR(is_nolibc && proc, test_dirent()); break; @@ -1466,9 +1610,11 @@ int run_syscall(int min, int max) CASE_TEST(munmap_bad); EXPECT_SYSER(1, munmap(NULL, 0), -1, EINVAL); break; CASE_TEST(mmap_munmap_good); EXPECT_SYSZR(1, test_mmap_munmap()); break; CASE_TEST(nanosleep); ts.tv_nsec = -1; EXPECT_SYSER(1, nanosleep(&ts, NULL), -1, EINVAL); break; + CASE_TEST(nolibc_enosys); EXPECT_ZR(is_nolibc, test_nolibc_enosys()); break; CASE_TEST(open_tty); EXPECT_SYSNE(1, tmp = open("/dev/null", O_RDONLY), -1); if (tmp != -1) close(tmp); break; CASE_TEST(open_blah); EXPECT_SYSER(1, tmp = open("/proc/self/blah", O_RDONLY), -1, ENOENT); if (tmp != -1) close(tmp); break; CASE_TEST(openat_dir); EXPECT_SYSZR(1, test_openat()); break; + 
CASE_TEST(open_mode); EXPECT_SYSZR(1, test_open_mode()); break; CASE_TEST(pipe); EXPECT_SYSZR(1, test_pipe()); break; CASE_TEST(poll_null); EXPECT_SYSZR(1, poll(NULL, 0, 0)); break; CASE_TEST(poll_stdout); EXPECT_SYSNE(1, ({ struct pollfd fds = { 1, POLLOUT, 0}; poll(&fds, 1, 0); }), -1); break; @@ -1508,6 +1654,7 @@ int run_syscall(int min, int max) CASE_TEST(_syscall_noargs); EXPECT_SYSEQ(is_nolibc, _syscall(__NR_getpid), getpid()); break; CASE_TEST(_syscall_args); EXPECT_SYSEQ(is_nolibc, _syscall(__NR_statx, 0, NULL, 0, 0, NULL), -EFAULT); break; CASE_TEST(namespace); EXPECT_SYSZR(euid0 && proc, test_namespace()); break; + CASE_TEST(largefile); EXPECT_SYSZR(1, test_large_file()); break; case __LINE__: return ret; /* must be last */ /* note: do not set any defaults so as to permit holes above */ @@ -1516,6 +1663,18 @@ int run_syscall(int min, int max) return ret; } +int test_alloca(void) +{ + uint64_t *x; + + x = alloca(sizeof(*x)); + + *x = 0x1234; + __asm__ ("" : "+r" (x)); + + return *x - 0x1234; +} + int test_difftime(void) { if (difftime(200., 100.) != 100.) 
@@ -1731,6 +1890,7 @@ int run_stdlib(int min, int max) CASE_TEST(toupper_noop); EXPECT_EQ(1, toupper('A'), 'A'); break; CASE_TEST(abs); EXPECT_EQ(1, abs(-10), 10); break; CASE_TEST(abs_noop); EXPECT_EQ(1, abs(10), 10); break; + CASE_TEST(alloca); EXPECT_ZR(1, test_alloca()); break; CASE_TEST(difftime); EXPECT_ZR(1, test_difftime()); break; CASE_TEST(memchr_foobar6_o); EXPECT_STREQ(1, memchr("foobar", 'o', 6), "oobar"); break; CASE_TEST(memchr_foobar3_b); EXPECT_STRZR(1, memchr("foobar", 'b', 3)); break; diff --git a/tools/testing/selftests/nolibc/run-tests.sh b/tools/testing/selftests/nolibc/run-tests.sh index cd439096fdf3..6460e25001de 100755 --- a/tools/testing/selftests/nolibc/run-tests.sh +++ b/tools/testing/selftests/nolibc/run-tests.sh @@ -21,6 +21,7 @@ all_archs=( i386 x86_64 x32 arm64 arm armthumb mips32le mips32be mipsn32le mipsn32be mips64le mips64be + openrisc ppc ppc64 ppc64le riscv32 riscv64 s390x @@ -28,6 +29,7 @@ all_archs=( sparc32 sparc64 m68k sh4 + parisc32 ) archs="${all_archs[@]}" @@ -107,6 +109,7 @@ crosstool_arch() { case "$1" in arm64) echo aarch64;; armthumb) echo arm;; + openrisc) echo or1k;; ppc) echo powerpc;; ppc64) echo powerpc64;; ppc64le) echo powerpc64;; @@ -116,6 +119,7 @@ crosstool_arch() { s390*) echo s390;; sparc*) echo sparc64;; x32*) echo x86_64;; + parisc32) echo hppa;; *) echo "$1";; esac } @@ -173,6 +177,10 @@ test_arch() { fi MAKE=(make -f Makefile.nolibc -j"${nproc}" XARCH="${arch}" CROSS_COMPILE="${cross_compile}" LLVM="${llvm}" O="${build_dir}") + if [ "$arch" = "parisc32" ]; then + MAKE+=("CROSS32CC=${cross_compile}gcc") + fi + case "$test_mode" in 'system') test_target=run @@ -185,7 +193,7 @@ test_arch() { exit 1 esac printf '%-15s' "$arch:" - if [ "$arch" = "m68k" -o "$arch" = "sh4" ] && [ "$llvm" = "1" ]; then + if [ "$arch" = "m68k" -o "$arch" = "sh4" -o "$arch" = "openrisc" -o "$arch" = "parisc32" ] && [ "$llvm" = "1" ]; then echo "Unsupported configuration" return fi diff --git 
a/tools/testing/selftests/openat2/helpers.c b/tools/testing/selftests/openat2/helpers.c deleted file mode 100644 index 5074681ffdc9..000000000000 --- a/tools/testing/selftests/openat2/helpers.c +++ /dev/null @@ -1,109 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0-or-later -/* - * Author: Aleksa Sarai <cyphar@cyphar.com> - * Copyright (C) 2018-2019 SUSE LLC. - */ - -#define _GNU_SOURCE -#include <errno.h> -#include <fcntl.h> -#include <stdbool.h> -#include <string.h> -#include <syscall.h> -#include <limits.h> - -#include "helpers.h" - -bool needs_openat2(const struct open_how *how) -{ - return how->resolve != 0; -} - -int raw_openat2(int dfd, const char *path, void *how, size_t size) -{ - int ret = syscall(__NR_openat2, dfd, path, how, size); - return ret >= 0 ? ret : -errno; -} - -int sys_openat2(int dfd, const char *path, struct open_how *how) -{ - return raw_openat2(dfd, path, how, sizeof(*how)); -} - -int sys_openat(int dfd, const char *path, struct open_how *how) -{ - int ret = openat(dfd, path, how->flags, how->mode); - return ret >= 0 ? ret : -errno; -} - -int sys_renameat2(int olddirfd, const char *oldpath, - int newdirfd, const char *newpath, unsigned int flags) -{ - int ret = syscall(__NR_renameat2, olddirfd, oldpath, - newdirfd, newpath, flags); - return ret >= 0 ? 
ret : -errno; -} - -int touchat(int dfd, const char *path) -{ - int fd = openat(dfd, path, O_CREAT, 0700); - if (fd >= 0) - close(fd); - return fd; -} - -char *fdreadlink(int fd) -{ - char *target, *tmp; - - E_asprintf(&tmp, "/proc/self/fd/%d", fd); - - target = malloc(PATH_MAX); - if (!target) - ksft_exit_fail_msg("fdreadlink: malloc failed\n"); - memset(target, 0, PATH_MAX); - - E_readlink(tmp, target, PATH_MAX); - free(tmp); - return target; -} - -bool fdequal(int fd, int dfd, const char *path) -{ - char *fdpath, *dfdpath, *other; - bool cmp; - - fdpath = fdreadlink(fd); - dfdpath = fdreadlink(dfd); - - if (!path) - E_asprintf(&other, "%s", dfdpath); - else if (*path == '/') - E_asprintf(&other, "%s", path); - else - E_asprintf(&other, "%s/%s", dfdpath, path); - - cmp = !strcmp(fdpath, other); - - free(fdpath); - free(dfdpath); - free(other); - return cmp; -} - -bool openat2_supported = false; - -void __attribute__((constructor)) init(void) -{ - struct open_how how = {}; - int fd; - - BUILD_BUG_ON(sizeof(struct open_how) != OPEN_HOW_SIZE_VER0); - - /* Check openat2(2) support. */ - fd = sys_openat2(AT_FDCWD, ".", &how); - openat2_supported = (fd >= 0); - - if (fd >= 0) - close(fd); -} diff --git a/tools/testing/selftests/openat2/helpers.h b/tools/testing/selftests/openat2/helpers.h deleted file mode 100644 index 510e60602511..000000000000 --- a/tools/testing/selftests/openat2/helpers.h +++ /dev/null @@ -1,108 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0-or-later -/* - * Author: Aleksa Sarai <cyphar@cyphar.com> - * Copyright (C) 2018-2019 SUSE LLC. 
- */ - -#ifndef __RESOLVEAT_H__ -#define __RESOLVEAT_H__ - -#define _GNU_SOURCE -#include <stdint.h> -#include <stdbool.h> -#include <errno.h> -#include <linux/types.h> -#include "kselftest.h" - -#define ARRAY_LEN(X) (sizeof (X) / sizeof (*(X))) -#define BUILD_BUG_ON(e) ((void)(sizeof(struct { int:(-!!(e)); }))) - -#ifndef SYS_openat2 -#ifndef __NR_openat2 -#define __NR_openat2 437 -#endif /* __NR_openat2 */ -#define SYS_openat2 __NR_openat2 -#endif /* SYS_openat2 */ - -/* - * Arguments for how openat2(2) should open the target path. If @resolve is - * zero, then openat2(2) operates very similarly to openat(2). - * - * However, unlike openat(2), unknown bits in @flags result in -EINVAL rather - * than being silently ignored. @mode must be zero unless one of {O_CREAT, - * O_TMPFILE} are set. - * - * @flags: O_* flags. - * @mode: O_CREAT/O_TMPFILE file mode. - * @resolve: RESOLVE_* flags. - */ -struct open_how { - __u64 flags; - __u64 mode; - __u64 resolve; -}; - -#define OPEN_HOW_SIZE_VER0 24 /* sizeof first published struct */ -#define OPEN_HOW_SIZE_LATEST OPEN_HOW_SIZE_VER0 - -bool needs_openat2(const struct open_how *how); - -#ifndef RESOLVE_IN_ROOT -/* how->resolve flags for openat2(2). */ -#define RESOLVE_NO_XDEV 0x01 /* Block mount-point crossings - (includes bind-mounts). */ -#define RESOLVE_NO_MAGICLINKS 0x02 /* Block traversal through procfs-style - "magic-links". */ -#define RESOLVE_NO_SYMLINKS 0x04 /* Block traversal through all symlinks - (implies OEXT_NO_MAGICLINKS) */ -#define RESOLVE_BENEATH 0x08 /* Block "lexical" trickery like - "..", symlinks, and absolute - paths which escape the dirfd. */ -#define RESOLVE_IN_ROOT 0x10 /* Make all jumps to "/" and ".." - be scoped inside the dirfd - (similar to chroot(2)). */ -#endif /* RESOLVE_IN_ROOT */ - -#define E_func(func, ...) 
\ - do { \ - errno = 0; \ - if (func(__VA_ARGS__) < 0) \ - ksft_exit_fail_msg("%s:%d %s failed - errno:%d\n", \ - __FILE__, __LINE__, #func, errno); \ - } while (0) - -#define E_asprintf(...) E_func(asprintf, __VA_ARGS__) -#define E_chmod(...) E_func(chmod, __VA_ARGS__) -#define E_dup2(...) E_func(dup2, __VA_ARGS__) -#define E_fchdir(...) E_func(fchdir, __VA_ARGS__) -#define E_fstatat(...) E_func(fstatat, __VA_ARGS__) -#define E_kill(...) E_func(kill, __VA_ARGS__) -#define E_mkdirat(...) E_func(mkdirat, __VA_ARGS__) -#define E_mount(...) E_func(mount, __VA_ARGS__) -#define E_prctl(...) E_func(prctl, __VA_ARGS__) -#define E_readlink(...) E_func(readlink, __VA_ARGS__) -#define E_setresuid(...) E_func(setresuid, __VA_ARGS__) -#define E_symlinkat(...) E_func(symlinkat, __VA_ARGS__) -#define E_touchat(...) E_func(touchat, __VA_ARGS__) -#define E_unshare(...) E_func(unshare, __VA_ARGS__) - -#define E_assert(expr, msg, ...) \ - do { \ - if (!(expr)) \ - ksft_exit_fail_msg("ASSERT(%s:%d) failed (%s): " msg "\n", \ - __FILE__, __LINE__, #expr, ##__VA_ARGS__); \ - } while (0) - -int raw_openat2(int dfd, const char *path, void *how, size_t size); -int sys_openat2(int dfd, const char *path, struct open_how *how); -int sys_openat(int dfd, const char *path, struct open_how *how); -int sys_renameat2(int olddirfd, const char *oldpath, - int newdirfd, const char *newpath, unsigned int flags); - -int touchat(int dfd, const char *path); -char *fdreadlink(int fd); -bool fdequal(int fd, int dfd, const char *path); - -extern bool openat2_supported; - -#endif /* __RESOLVEAT_H__ */ diff --git a/tools/testing/selftests/openat2/rename_attack_test.c b/tools/testing/selftests/openat2/rename_attack_test.c deleted file mode 100644 index aa5699e45729..000000000000 --- a/tools/testing/selftests/openat2/rename_attack_test.c +++ /dev/null @@ -1,160 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0-or-later -/* - * Author: Aleksa Sarai <cyphar@cyphar.com> - * Copyright (C) 2018-2019 SUSE LLC. 
- */ - -#define _GNU_SOURCE -#include <errno.h> -#include <fcntl.h> -#include <sched.h> -#include <sys/stat.h> -#include <sys/types.h> -#include <sys/mount.h> -#include <sys/mman.h> -#include <sys/prctl.h> -#include <signal.h> -#include <stdio.h> -#include <stdlib.h> -#include <stdbool.h> -#include <string.h> -#include <syscall.h> -#include <limits.h> -#include <unistd.h> - -#include "kselftest.h" -#include "helpers.h" - -/* Construct a test directory with the following structure: - * - * root/ - * |-- a/ - * | `-- c/ - * `-- b/ - */ -int setup_testdir(void) -{ - int dfd; - char dirname[] = "/tmp/ksft-openat2-rename-attack.XXXXXX"; - - /* Make the top-level directory. */ - if (!mkdtemp(dirname)) - ksft_exit_fail_msg("setup_testdir: failed to create tmpdir\n"); - dfd = open(dirname, O_PATH | O_DIRECTORY); - if (dfd < 0) - ksft_exit_fail_msg("setup_testdir: failed to open tmpdir\n"); - - E_mkdirat(dfd, "a", 0755); - E_mkdirat(dfd, "b", 0755); - E_mkdirat(dfd, "a/c", 0755); - - return dfd; -} - -/* Swap @dirfd/@a and @dirfd/@b constantly. Parent must kill this process. */ -pid_t spawn_attack(int dirfd, char *a, char *b) -{ - pid_t child = fork(); - if (child != 0) - return child; - - /* If the parent (the test process) dies, kill ourselves too. */ - E_prctl(PR_SET_PDEATHSIG, SIGKILL); - - /* Swap @a and @b. */ - for (;;) - renameat2(dirfd, a, dirfd, b, RENAME_EXCHANGE); - exit(1); -} - -#define NUM_RENAME_TESTS 2 -#define ROUNDS 400000 - -const char *flagname(int resolve) -{ - switch (resolve) { - case RESOLVE_IN_ROOT: - return "RESOLVE_IN_ROOT"; - case RESOLVE_BENEATH: - return "RESOLVE_BENEATH"; - } - return "(unknown)"; -} - -void test_rename_attack(int resolve) -{ - int dfd, afd; - pid_t child; - void (*resultfn)(const char *msg, ...) 
= ksft_test_result_pass; - int escapes = 0, other_errs = 0, exdevs = 0, eagains = 0, successes = 0; - - struct open_how how = { - .flags = O_PATH, - .resolve = resolve, - }; - - if (!openat2_supported) { - how.resolve = 0; - ksft_print_msg("openat2(2) unsupported -- using openat(2) instead\n"); - } - - dfd = setup_testdir(); - afd = openat(dfd, "a", O_PATH); - if (afd < 0) - ksft_exit_fail_msg("test_rename_attack: failed to open 'a'\n"); - - child = spawn_attack(dfd, "a/c", "b"); - - for (int i = 0; i < ROUNDS; i++) { - int fd; - char *victim_path = "c/../../c/../../c/../../c/../../c/../../c/../../c/../../c/../../c/../../c/../../c/../../c/../../c/../../c/../../c/../../c/../../c/../../c/../../c/../.."; - - if (openat2_supported) - fd = sys_openat2(afd, victim_path, &how); - else - fd = sys_openat(afd, victim_path, &how); - - if (fd < 0) { - if (fd == -EAGAIN) - eagains++; - else if (fd == -EXDEV) - exdevs++; - else if (fd == -ENOENT) - escapes++; /* escaped outside and got ENOENT... */ - else - other_errs++; /* unexpected error */ - } else { - if (fdequal(fd, afd, NULL)) - successes++; - else - escapes++; /* we got an unexpected fd */ - } - close(fd); - } - - if (escapes > 0) - resultfn = ksft_test_result_fail; - ksft_print_msg("non-escapes: EAGAIN=%d EXDEV=%d E<other>=%d success=%d\n", - eagains, exdevs, other_errs, successes); - resultfn("rename attack with %s (%d runs, got %d escapes)\n", - flagname(resolve), ROUNDS, escapes); - - /* Should be killed anyway, but might as well make sure. 
*/ - E_kill(child, SIGKILL); -} - -#define NUM_TESTS NUM_RENAME_TESTS - -int main(int argc, char **argv) -{ - ksft_print_header(); - ksft_set_plan(NUM_TESTS); - - test_rename_attack(RESOLVE_BENEATH); - test_rename_attack(RESOLVE_IN_ROOT); - - if (ksft_get_fail_cnt() + ksft_get_error_cnt() > 0) - ksft_exit_fail(); - else - ksft_exit_pass(); -} diff --git a/tools/testing/selftests/perf_events/watermark_signal.c b/tools/testing/selftests/perf_events/watermark_signal.c index 0f64b9b17081..a84709cabd8b 100644 --- a/tools/testing/selftests/perf_events/watermark_signal.c +++ b/tools/testing/selftests/perf_events/watermark_signal.c @@ -102,7 +102,7 @@ TEST(watermark_signal) } p = mmap(NULL, 2 * page_size, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0); - if (p == NULL) { + if (p == MAP_FAILED) { perror("mmap"); goto cleanup; } diff --git a/tools/testing/selftests/pid_namespace/pid_max.c b/tools/testing/selftests/pid_namespace/pid_max.c index c9519e7385b6..5d686a09aa15 100644 --- a/tools/testing/selftests/pid_namespace/pid_max.c +++ b/tools/testing/selftests/pid_namespace/pid_max.c @@ -12,10 +12,74 @@ #include <syscall.h> #include <sys/mount.h> #include <sys/wait.h> +#include <unistd.h> #include "kselftest_harness.h" #include "../pidfd/pidfd.h" +/* + * The kernel computes the minimum allowed pid_max as: + * max(RESERVED_PIDS + 1, PIDS_PER_CPU_MIN * num_possible_cpus()) + * Mirror that here so the test values are always valid. + * + * Note: glibc's get_nprocs_conf() returns the number of *configured* + * (present) CPUs, not *possible* CPUs. The kernel uses + * num_possible_cpus() which corresponds to /sys/devices/system/cpu/possible. + * These can differ significantly (e.g. 16 configured vs 128 possible). + */ +#define RESERVED_PIDS 300 +#define PIDS_PER_CPU_MIN 8 + +/* Count CPUs from a range list like "0-31" or "0-15,32-47". 
*/ +static int num_possible_cpus(void) +{ + FILE *f; + int count = 0; + int lo, hi; + + f = fopen("/sys/devices/system/cpu/possible", "r"); + if (!f) + return 0; + + while (fscanf(f, "%d", &lo) == 1) { + if (fscanf(f, "-%d", &hi) == 1) + count += hi - lo + 1; + else + count++; + /* skip comma separator */ + fscanf(f, ","); + } + + fclose(f); + return count; +} + +static int pid_min(void) +{ + int cpu_min = PIDS_PER_CPU_MIN * num_possible_cpus(); + + return cpu_min > (RESERVED_PIDS + 1) ? cpu_min : (RESERVED_PIDS + 1); +} + +/* + * Outer and inner pid_max limits used by the tests. The outer limit is + * the more restrictive ancestor; the inner limit is set higher in a + * nested namespace but must still be capped by the outer limit. + * Both are derived from the kernel's minimum so they are always writable. + * + * Global so that clone callbacks can access them without parameter plumbing. + */ +static int outer_limit; +static int inner_limit; + +static int write_int_to_fd(int fd, int val) +{ + char buf[12]; + int len = snprintf(buf, sizeof(buf), "%d", val); + + return write(fd, buf, len); +} + #define __STACK_SIZE (8 * 1024 * 1024) static pid_t do_clone(int (*fn)(void *), void *arg, int flags) { @@ -60,18 +124,18 @@ static int pid_max_cb(void *data) return -1; } - ret = write(fd, "500", sizeof("500") - 1); + ret = write_int_to_fd(fd, inner_limit); if (ret < 0) { fprintf(stderr, "%m - Failed to write pid_max\n"); return -1; } - for (int i = 0; i < 501; i++) { + for (int i = 0; i < inner_limit + 1; i++) { pid = fork(); if (pid == 0) exit(EXIT_SUCCESS); wait_for_pid(pid); - if (pid > 500) { + if (pid > inner_limit) { fprintf(stderr, "Managed to create pid number beyond limit\n"); return -1; } @@ -106,7 +170,7 @@ static int pid_max_nested_inner(void *data) return fret; } - ret = write(fd, "500", sizeof("500") - 1); + ret = write_int_to_fd(fd, inner_limit); close(fd); if (ret < 0) { fprintf(stderr, "%m - Failed to write pid_max\n"); @@ -133,8 +197,8 @@ static int 
pid_max_nested_inner(void *data) return fret; } - /* Now make sure that we wrap pids at 400. */ - for (i = 0; i < 510; i++) { + /* Now make sure that we wrap pids at outer_limit. */ + for (i = 0; i < inner_limit + 10; i++) { pid_t pid; pid = fork(); @@ -145,7 +209,7 @@ static int pid_max_nested_inner(void *data) exit(EXIT_SUCCESS); wait_for_pid(pid); - if (pid >= 500) { + if (pid >= inner_limit) { fprintf(stderr, "Managed to create process with pid %d beyond configured limit\n", pid); return fret; } @@ -156,15 +220,19 @@ static int pid_max_nested_inner(void *data) static int pid_max_nested_outer(void *data) { - int fret = -1, nr_procs = 400; - pid_t pids[1000]; - int fd, i, ret; + int fret = -1, nr_procs = 0; + pid_t *pids; + int fd, ret; pid_t pid; + pids = malloc(outer_limit * sizeof(pid_t)); + if (!pids) + return -1; + ret = mount("", "/", NULL, MS_PRIVATE | MS_REC, 0); if (ret) { fprintf(stderr, "%m - Failed to make rootfs private mount\n"); - return fret; + goto out; } umount2("/proc", MNT_DETACH); @@ -172,27 +240,28 @@ static int pid_max_nested_outer(void *data) ret = mount("proc", "/proc", "proc", 0, NULL); if (ret) { fprintf(stderr, "%m - Failed to mount proc\n"); - return fret; + goto out; } fd = open("/proc/sys/kernel/pid_max", O_RDWR | O_CLOEXEC | O_NOCTTY); if (fd < 0) { fprintf(stderr, "%m - Failed to open pid_max\n"); - return fret; + goto out; } - ret = write(fd, "400", sizeof("400") - 1); + ret = write_int_to_fd(fd, outer_limit); close(fd); if (ret < 0) { fprintf(stderr, "%m - Failed to write pid_max\n"); - return fret; + goto out; } /* - * Create 397 processes. This leaves room for do_clone() (398) and - * one more 399. So creating another process needs to fail. + * Create (outer_limit - 4) processes. This leaves room for + * do_clone() and one more. So creating another process needs + * to fail. 
*/ - for (nr_procs = 0; nr_procs < 396; nr_procs++) { + for (nr_procs = 0; nr_procs < outer_limit - 4; nr_procs++) { pid = fork(); if (pid < 0) goto reap; @@ -220,20 +289,26 @@ reap: for (int i = 0; i < nr_procs; i++) wait_for_pid(pids[i]); +out: + free(pids); return fret; } static int pid_max_nested_limit_inner(void *data) { - int fret = -1, nr_procs = 400; + int fret = -1, nr_procs = 0; int fd, ret; pid_t pid; - pid_t pids[1000]; + pid_t *pids; + + pids = malloc(inner_limit * sizeof(pid_t)); + if (!pids) + return -1; ret = mount("", "/", NULL, MS_PRIVATE | MS_REC, 0); if (ret) { fprintf(stderr, "%m - Failed to make rootfs private mount\n"); - return fret; + goto out; } umount2("/proc", MNT_DETACH); @@ -241,23 +316,23 @@ static int pid_max_nested_limit_inner(void *data) ret = mount("proc", "/proc", "proc", 0, NULL); if (ret) { fprintf(stderr, "%m - Failed to mount proc\n"); - return fret; + goto out; } fd = open("/proc/sys/kernel/pid_max", O_RDWR | O_CLOEXEC | O_NOCTTY); if (fd < 0) { fprintf(stderr, "%m - Failed to open pid_max\n"); - return fret; + goto out; } - ret = write(fd, "500", sizeof("500") - 1); + ret = write_int_to_fd(fd, inner_limit); close(fd); if (ret < 0) { fprintf(stderr, "%m - Failed to write pid_max\n"); - return fret; + goto out; } - for (nr_procs = 0; nr_procs < 500; nr_procs++) { + for (nr_procs = 0; nr_procs < inner_limit; nr_procs++) { pid = fork(); if (pid < 0) break; @@ -268,7 +343,7 @@ static int pid_max_nested_limit_inner(void *data) pids[nr_procs] = pid; } - if (nr_procs >= 400) { + if (nr_procs >= outer_limit) { fprintf(stderr, "Managed to create processes beyond the configured outer limit\n"); goto reap; } @@ -279,6 +354,8 @@ reap: for (int i = 0; i < nr_procs; i++) wait_for_pid(pids[i]); +out: + free(pids); return fret; } @@ -307,7 +384,7 @@ static int pid_max_nested_limit_outer(void *data) return -1; } - ret = write(fd, "400", sizeof("400") - 1); + ret = write_int_to_fd(fd, outer_limit); close(fd); if (ret < 0) { fprintf(stderr, 
"%m - Failed to write pid_max\n"); @@ -328,17 +405,32 @@ static int pid_max_nested_limit_outer(void *data) return 0; } -TEST(pid_max_simple) +FIXTURE(pid_max) { + int dummy; +}; + +FIXTURE_SETUP(pid_max) { - pid_t pid; + int min = pid_min(); + outer_limit = min + 100; + inner_limit = min + 200; +} + +FIXTURE_TEARDOWN(pid_max) +{ +} + +TEST_F(pid_max, simple) +{ + pid_t pid; pid = do_clone(pid_max_cb, NULL, CLONE_NEWPID | CLONE_NEWNS); ASSERT_GT(pid, 0); ASSERT_EQ(0, wait_for_pid(pid)); } -TEST(pid_max_nested_limit) +TEST_F(pid_max, nested_limit) { pid_t pid; @@ -347,7 +439,7 @@ TEST(pid_max_nested_limit) ASSERT_EQ(0, wait_for_pid(pid)); } -TEST(pid_max_nested) +TEST_F(pid_max, nested) { pid_t pid; diff --git a/tools/testing/selftests/pipe/.gitignore b/tools/testing/selftests/pipe/.gitignore new file mode 100644 index 000000000000..20b549361a15 --- /dev/null +++ b/tools/testing/selftests/pipe/.gitignore @@ -0,0 +1 @@ +pipe_bench diff --git a/tools/testing/selftests/pipe/Makefile b/tools/testing/selftests/pipe/Makefile new file mode 100644 index 000000000000..1810c680117b --- /dev/null +++ b/tools/testing/selftests/pipe/Makefile @@ -0,0 +1,9 @@ +# SPDX-License-Identifier: GPL-2.0 +# Copyright (c) 2026 Meta Platforms, Inc. and affiliates +# Copyright (c) 2026 Breno Leitao <leitao@debian.org> + +CFLAGS += -O2 -Wall -Wextra -pthread + +TEST_GEN_PROGS := pipe_bench + +include ../lib.mk diff --git a/tools/testing/selftests/pipe/pipe_bench.c b/tools/testing/selftests/pipe/pipe_bench.c new file mode 100644 index 000000000000..7e96429b8fb4 --- /dev/null +++ b/tools/testing/selftests/pipe/pipe_bench.c @@ -0,0 +1,616 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * pipe_bench - exercise concurrent pipe operation + * + * N writer threads hammer a single pipe with multi-page writes; M reader + * threads drain it. Each writer records its own write() latency histogram. 
+ * Multi-page writes (msgsize >= PAGE_SIZE) force the loop in + * anon_pipe_write() to call alloc_page(GFP_HIGHUSER | __GFP_ACCOUNT) under + * pipe->mutex, which is the critical section the patch shrinks. + * + * By default the benchmark sweeps writers in {1, 2, 5} x readers in + * {1, 5, 10} and prints one block per configuration so two runs (e.g. + * baseline vs patched) can be diffed directly. Pass -w and -r to run a + * single configuration instead. Pass --memory-pressure to spawn stress-ng + * alongside the sweep so the per-page alloc_page() path under pipe->mutex + * has to dip into reclaim. + * + * Copyright (c) 2026 Meta Platforms, Inc. and affiliates + * Copyright (c) 2026 Breno Leitao <leitao@debian.org> + */ + +#define _GNU_SOURCE +#include <errno.h> +#include <fcntl.h> +#include <getopt.h> +#include <poll.h> +#include <pthread.h> +#include <signal.h> +#include <stdatomic.h> +#include <stdint.h> +#include <stdio.h> +#include <stdlib.h> +#include <string.h> +#include <sys/wait.h> +#include <time.h> +#include <unistd.h> + +#define ARRAY_SIZE(a) (sizeof(a) / sizeof((a)[0])) +#define HIST_BUCKETS 32 + +static size_t g_msgsize = 16 * 4096; +static int g_duration = 3; +static int g_pipe_size = 1024 * 1024; +static int g_memory_pressure; + +static atomic_int g_stop; +static int g_pipe[2]; + +struct wstats { + uint64_t writes; + uint64_t bytes; + uint64_t lat_sum_ns; + uint64_t lat_max_ns; + uint64_t lat_hist[HIST_BUCKETS]; + char *buf; +}; + +struct rstats { + char *buf; +}; + +struct hist_totals { + uint64_t writes; + uint64_t bytes; + uint64_t lat_sum; + uint64_t lat_max; +}; + +static inline uint64_t now_ns(void) +{ + struct timespec ts; + + clock_gettime(CLOCK_MONOTONIC, &ts); + return (uint64_t)ts.tv_sec * 1000000000ull + (uint64_t)ts.tv_nsec; +} + +static inline int log2_bucket(uint64_t v) +{ + int b = 0; + + if (!v) + return 0; + while (v >>= 1) + b++; + return b < HIST_BUCKETS ? 
b : HIST_BUCKETS - 1; +} + +static void *writer(void *arg) +{ + struct wstats *s = arg; + + while (!atomic_load_explicit(&g_stop, memory_order_relaxed)) { + uint64_t t0 = now_ns(); + ssize_t n = write(g_pipe[1], s->buf, g_msgsize); + uint64_t dt = now_ns() - t0; + + if (n > 0) { + s->writes++; + s->bytes += (uint64_t)n; + s->lat_sum_ns += dt; + if (dt > s->lat_max_ns) + s->lat_max_ns = dt; + s->lat_hist[log2_bucket(dt)]++; + } else if (n < 0 && (errno == EPIPE || errno == EBADF)) { + break; + } + } + return NULL; +} + +static void *reader(void *arg) +{ + struct rstats *s = arg; + + /* + * Drain until EOF (write end closed by main). g_stop is not checked + * here on purpose: writers may be blocked in write() with the pipe + * full when g_stop is set, so the reader must keep draining until + * main closes the write end. + */ + for (;;) { + ssize_t n = read(g_pipe[0], s->buf, g_msgsize); + + if (n <= 0) + break; + } + return NULL; +} + +/* Sum per-writer stats and per-bucket counts into the caller's aggregates. */ +static void aggregate_wstats(struct wstats *all, int nw, + uint64_t agg[HIST_BUCKETS], + struct hist_totals *t) +{ + memset(t, 0, sizeof(*t)); + for (int i = 0; i < nw; i++) { + t->writes += all[i].writes; + t->bytes += all[i].bytes; + t->lat_sum += all[i].lat_sum_ns; + if (all[i].lat_max_ns > t->lat_max) + t->lat_max = all[i].lat_max_ns; + for (int b = 0; b < HIST_BUCKETS; b++) + agg[b] += all[i].lat_hist[b]; + } +} + +/* + * Walk @agg in order, returning the inclusive upper bound (in ns) of the + * log2 bucket where the running sum first reaches @target. + * + * A percentile is undefined with zero samples, and with very low sample + * counts integer truncation could make @target zero -- then "cum >= 0" + * would latch on the first (possibly empty) bucket. Callers must pass + * @target >= 1. 
+ */ +static uint64_t bucket_at(const uint64_t agg[HIST_BUCKETS], uint64_t target) +{ + uint64_t cum = 0; + + for (int b = 0; b < HIST_BUCKETS; b++) { + /* HIST_BUCKETS <= 63, so (b + 1) is always a safe shift. */ + uint64_t upper = (1ULL << (b + 1)) - 1; + + cum += agg[b]; + if (cum >= target) + return upper; + } + return 0; +} + +static void compute_p50_p99(const uint64_t agg[HIST_BUCKETS], uint64_t writes, + uint64_t *p50, uint64_t *p99) +{ + uint64_t p50_target, p99_target; + + *p50 = *p99 = 0; + if (!writes) + return; + + p50_target = writes * 50 / 100; + p99_target = writes * 99 / 100; + if (!p50_target) + p50_target = 1; + if (!p99_target) + p99_target = 1; + + *p50 = bucket_at(agg, p50_target); + *p99 = bucket_at(agg, p99_target); +} + +static void print_summary(int nw, int nr, const struct hist_totals *t, + uint64_t p50, uint64_t p99) +{ + double sec = g_duration; + uint64_t avg_ns = t->writes ? t->lat_sum / t->writes : 0; + + printf("config: writers=%d readers=%d msgsize=%zu duration=%d pipe_size=%d memory_pressure=%s\n", + nw, nr, g_msgsize, g_duration, g_pipe_size, + g_memory_pressure ? 
"yes" : "no"); + printf("writes: total=%llu rate=%.0f/s\n", + (unsigned long long)t->writes, (double)t->writes / sec); + printf("throughput_MBps: %.2f\n", + ((double)t->bytes / sec) / (1024.0 * 1024.0)); + printf("lat_avg_ns: %llu\n", (unsigned long long)avg_ns); + printf("lat_p50_ns_upper: %llu\n", (unsigned long long)p50); + printf("lat_p99_ns_upper: %llu\n", (unsigned long long)p99); + printf("lat_max_ns: %llu\n", (unsigned long long)t->lat_max); +} + +static void summarize(struct wstats *all, int nw, int nr) +{ + uint64_t agg[HIST_BUCKETS] = {0}; + struct hist_totals t; + uint64_t p50, p99; + + aggregate_wstats(all, nw, agg, &t); + compute_p50_p99(agg, t.writes, &p50, &p99); + print_summary(nw, nr, &t, p50, p99); +} + +/* + * Child branch of fork(): restore SIGPIPE to default (parent ignores it), + * exec stress-ng, and on failure write the reason into @hs_wr before + * exiting. The parent observes EOF on hs_wr (closed via O_CLOEXEC) when + * exec succeeds. + */ +static void stress_ng_child(int hs_wr) __attribute__((noreturn)); +static void stress_ng_child(int hs_wr) +{ + char errbuf[256]; + + signal(SIGPIPE, SIG_DFL); + execlp("stress-ng", "stress-ng", + "--vm", "4", "--vm-bytes", "80%", + "--vm-method", "all", + (char *)NULL); + snprintf(errbuf, sizeof(errbuf), + "exec stress-ng failed: %s\n", strerror(errno)); + (void)!write(hs_wr, errbuf, strlen(errbuf)); + _exit(127); +} + +/* + * Read from the O_CLOEXEC handshake pipe. Anything readable means the + * child wrote an error before exec; EOF (n == 0) means the write-end + * closed because exec succeeded. Returns 0 on exec success, -1 if the + * child failed and was reaped. 
+ */ +static int stress_ng_wait_handshake(int hs_rd, pid_t pid) +{ + struct pollfd pfd = { .fd = hs_rd, .events = POLLIN }; + char errbuf[256]; + int status; + int ret; + + ret = poll(&pfd, 1, 500); + if (ret <= 0) + return 0; + + ssize_t n = read(hs_rd, errbuf, sizeof(errbuf) - 1); + + if (n > 0) { + errbuf[n] = '\0'; + fputs(errbuf, stderr); + waitpid(pid, &status, 0); + return -1; + } + return 0; +} + +static pid_t spawn_stress_ng(void) +{ + int hs[2]; + pid_t pid; + + /* + * Handshake pipe: child writes one byte and _exit()s on exec + * failure. On exec success the O_CLOEXEC flag closes the write + * end, which the parent observes as EOF. This makes the "is + * stress-ng on $PATH?" check fail fast rather than silently. + */ + if (pipe2(hs, O_CLOEXEC) < 0) { + perror("pipe2"); + return -1; + } + + pid = fork(); + if (pid < 0) { + perror("fork"); + close(hs[0]); + close(hs[1]); + return -1; + } + if (pid == 0) { + close(hs[0]); + stress_ng_child(hs[1]); + } + + close(hs[1]); + if (stress_ng_wait_handshake(hs[0], pid) < 0) { + close(hs[0]); + return -1; + } + close(hs[0]); + + /* Give stress-ng a moment to map its VM regions before measuring. */ + sleep(1); + return pid; +} + +static void kill_stress_ng(pid_t pid) +{ + int status; + + if (pid <= 0) + return; + kill(pid, SIGTERM); + for (int i = 0; i < 20; i++) { + if (waitpid(pid, &status, WNOHANG) > 0) + return; + usleep(100 * 1000); + } + kill(pid, SIGKILL); + waitpid(pid, &status, 0); +} + +/* + * Allocate per-thread page-aligned buffers in main so a failed + * aligned_alloc() aborts the run before any thread starts. Workers used + * to allocate their own buffer and return NULL on failure, which left + * peers blocked in write()/read() with nobody to unblock them. 
+ */ +static int alloc_thread_bufs(struct wstats *ws, int nw, + struct rstats *rs, int nr) +{ + for (int i = 0; i < nw; i++) { + ws[i].buf = aligned_alloc(4096, g_msgsize); + if (!ws[i].buf) { + fprintf(stderr, "writer %d: aligned_alloc(%zu) failed\n", + i, g_msgsize); + return -1; + } + memset(ws[i].buf, 0xAA, g_msgsize); + } + for (int i = 0; i < nr; i++) { + rs[i].buf = aligned_alloc(4096, g_msgsize); + if (!rs[i].buf) { + fprintf(stderr, "reader %d: aligned_alloc(%zu) failed\n", + i, g_msgsize); + return -1; + } + } + return 0; +} + +static void free_thread_bufs(struct wstats *ws, int nw, + struct rstats *rs, int nr) +{ + if (ws) + for (int i = 0; i < nw; i++) + free(ws[i].buf); + if (rs) + for (int i = 0; i < nr; i++) + free(rs[i].buf); +} + +static int start_readers(pthread_t *rt, struct rstats *rs, int nr, + int *created) +{ + for (int i = 0; i < nr; i++) { + int err = pthread_create(&rt[i], NULL, reader, &rs[i]); + + if (err) { + fprintf(stderr, "pthread_create reader %d: %s\n", + i, strerror(err)); + return -1; + } + (*created)++; + } + return 0; +} + +static int start_writers(pthread_t *wt, struct wstats *ws, int nw, + int *created) +{ + for (int i = 0; i < nw; i++) { + int err = pthread_create(&wt[i], NULL, writer, &ws[i]); + + if (err) { + fprintf(stderr, "pthread_create writer %d: %s\n", + i, strerror(err)); + return -1; + } + (*created)++; + } + return 0; +} + +static int open_bench_pipe(void) +{ + if (pipe(g_pipe) < 0) { + perror("pipe"); + return -1; + } + if (fcntl(g_pipe[1], F_SETPIPE_SZ, g_pipe_size) < 0) + perror("F_SETPIPE_SZ (continuing)"); + return 0; +} + +/* + * Normal termination: g_stop tells writers to leave the loop after the + * current write() returns. Closing the shared write-end fd means once + * the in-flight writes drain, readers see EOF and exit. Writers are not + * unblocked by EPIPE here -- g_pipe[0] stays open so readers can keep + * draining. 
+ * + * Error path: some threads may have been created and others skipped, so + * writers could be blocked in write() with no reader making progress. + * Close both ends -- closing the read end is what delivers EPIPE to a + * blocked writer. + */ +static void stop_and_join(pthread_t *wt, int nw_created, + pthread_t *rt, int nr_created, int rc) +{ + atomic_store(&g_stop, 1); + close(g_pipe[1]); + if (rc < 0) + close(g_pipe[0]); + for (int i = 0; i < nw_created; i++) + pthread_join(wt[i], NULL); + for (int i = 0; i < nr_created; i++) + pthread_join(rt[i], NULL); + if (rc == 0) + close(g_pipe[0]); +} + +static int run_one(int nw, int nr) +{ + pthread_t *wt = NULL, *rt = NULL; + struct wstats *ws = NULL; + struct rstats *rs = NULL; + int nw_created = 0, nr_created = 0; + int rc = 0; + + atomic_store(&g_stop, 0); + + if (open_bench_pipe() < 0) + return -1; + + wt = calloc((size_t)nw, sizeof(*wt)); + rt = calloc((size_t)nr, sizeof(*rt)); + ws = calloc((size_t)nw, sizeof(*ws)); + rs = calloc((size_t)nr, sizeof(*rs)); + if (!wt || !rt || !ws || !rs) { + fprintf(stderr, "alloc failed\n"); + rc = -1; + goto teardown; + } + + if (alloc_thread_bufs(ws, nw, rs, nr) < 0) { + rc = -1; + goto teardown; + } + + if (start_readers(rt, rs, nr, &nr_created) < 0 || + start_writers(wt, ws, nw, &nw_created) < 0) { + rc = -1; + goto teardown; + } + + sleep((unsigned int)g_duration); + +teardown: + stop_and_join(wt, nw_created, rt, nr_created, rc); + + if (rc == 0) { + summarize(ws, nw, nr); + fflush(stdout); + } + + free_thread_bufs(ws, nw, rs, nr); + free(wt); + free(rt); + free(ws); + free(rs); + return rc; +} + +static void usage(const char *prog) +{ + fprintf(stderr, + "usage: %s [-w writers] [-r readers] [-s msgsize] [-d secs] [-p pipe_size] [--memory-pressure]\n" + " default: sweep writers={1,2,5} x readers={1,5,10}\n" + " --memory-pressure: spawn stress-ng (--vm 4 --vm-bytes 80%% --vm-method all) for the run\n", + prog); +} + +static int parse_args(int argc, char **argv, + int 
*writers_override, int *readers_override) +{ + static const struct option long_opts[] = { + {"memory-pressure", no_argument, NULL, 'M'}, + {0, 0, 0, 0}, + }; + int opt; + + while ((opt = getopt_long(argc, argv, "w:r:s:d:p:", + long_opts, NULL)) != -1) { + switch (opt) { + case 'w': + *writers_override = atoi(optarg); + break; + case 'r': + *readers_override = atoi(optarg); + break; + case 's': + g_msgsize = (size_t)atol(optarg); + break; + case 'd': + g_duration = atoi(optarg); + break; + case 'p': + g_pipe_size = atoi(optarg); + break; + case 'M': + g_memory_pressure = 1; + break; + default: + usage(argv[0]); + return -1; + } + } + return 0; +} + +/* + * aligned_alloc(4096, size) requires size to be a multiple of the + * alignment (C11); glibc returns NULL otherwise, which would make + * writer/reader threads silently exit and the run report zero writes. + * Validate up front instead. + */ +static int validate_args(void) +{ + if (g_msgsize == 0 || g_msgsize % 4096 != 0) { + fprintf(stderr, + "msgsize must be a positive multiple of 4096 (got %zu)\n", + g_msgsize); + return -1; + } + if (g_duration <= 0) { + fprintf(stderr, "duration must be > 0 seconds (got %d)\n", + g_duration); + return -1; + } + if (g_pipe_size <= 0) { + fprintf(stderr, "pipe_size must be > 0 bytes (got %d)\n", + g_pipe_size); + return -1; + } + return 0; +} + +static int run_sweep(void) +{ + static const int writers_sweep[] = {1, 2, 5}; + static const int readers_sweep[] = {1, 5, 10}; + + for (size_t i = 0; i < ARRAY_SIZE(writers_sweep); i++) { + for (size_t j = 0; j < ARRAY_SIZE(readers_sweep); j++) { + printf("---\n"); + if (run_one(writers_sweep[i], readers_sweep[j]) < 0) + return -1; + } + } + return 0; +} + +int main(int argc, char **argv) +{ + int writers_override = 0, readers_override = 0; + pid_t stress_pid = -1; + int rc = 0; + + if (parse_args(argc, argv, &writers_override, &readers_override) < 0) + return 1; + if (validate_args() < 0) + return 1; + + signal(SIGPIPE, SIG_IGN); + 
setvbuf(stdout, NULL, _IOLBF, 0); + setvbuf(stderr, NULL, _IOLBF, 0); + + fprintf(stderr, "pid=%d\n", getpid()); + fflush(stderr); + + if (g_memory_pressure) { + stress_pid = spawn_stress_ng(); + if (stress_pid < 0) { + fprintf(stderr, + "memory_pressure requested but stress-ng could not be spawned\n"); + return 1; + } + } + + if (writers_override > 0 || readers_override > 0) { + int nw = writers_override > 0 ? writers_override : 1; + int nr = readers_override > 0 ? readers_override : 1; + + rc = run_one(nw, nr) < 0 ? 1 : 0; + } else { + rc = run_sweep() < 0 ? 1 : 0; + } + + kill_stress_ng(stress_pid); + return rc; +} diff --git a/tools/testing/selftests/proc/proc-maps-race.c b/tools/testing/selftests/proc/proc-maps-race.c index a734553718da..1026d8c400e1 100644 --- a/tools/testing/selftests/proc/proc-maps-race.c +++ b/tools/testing/selftests/proc/proc-maps-race.c @@ -17,8 +17,8 @@ */ /* * Fork a child that concurrently modifies address space while the main - * process is reading /proc/$PID/maps and verifying the results. Address - * space modifications include: + * process is reading /proc/$PID/maps and /proc/$PID/smaps, verifying the + * results. Address space modifications include: * VMA splitting and merging * */ @@ -39,6 +39,13 @@ #include <sys/types.h> #include <sys/wait.h> +#define min(a, b) \ + ({ \ + typeof(a) _a = (a); \ + typeof(b) _b = (b); \ + _a < _b ? 
_a : _b; \ + }) + /* /proc/pid/maps parsing routines */ struct page_content { char *data; @@ -66,6 +73,11 @@ enum test_state { TEST_DONE, }; +enum maps_file { + MAPS, + SMAPS, +}; + struct vma_modifier_info; FIXTURE(proc_maps_race) @@ -76,7 +88,9 @@ FIXTURE(proc_maps_race) struct line_content last_line; struct line_content first_line; unsigned long duration_sec; + enum maps_file maps_file; int shared_mem_size; + int skip_pages; int page_size; int vma_count; bool verbose; @@ -84,6 +98,19 @@ FIXTURE(proc_maps_race) pid_t pid; }; +FIXTURE_VARIANT(proc_maps_race) +{ + const enum maps_file maps_file; +}; + +FIXTURE_VARIANT_ADD(proc_maps_race, maps) { + .maps_file = MAPS, +}; + +FIXTURE_VARIANT_ADD(proc_maps_race, smaps) { + .maps_file = SMAPS, +}; + typedef bool (*vma_modifier_op)(FIXTURE_DATA(proc_maps_race) *self); typedef bool (*vma_mod_result_check_op)(struct line_content *mod_last_line, struct line_content *mod_first_line, @@ -105,38 +132,102 @@ struct vma_modifier_info { void *child_mapped_addr[]; }; - -static bool read_two_pages(FIXTURE_DATA(proc_maps_race) *self) +static bool read_page(FIXTURE_DATA(proc_maps_race) *self, + struct page_content *page) { ssize_t bytes_read; - if (lseek(self->maps_fd, 0, SEEK_SET) < 0) + bytes_read = read(self->maps_fd, page->data, self->page_size); + if (bytes_read <= 0) return false; - bytes_read = read(self->maps_fd, self->page1.data, self->page_size); - if (bytes_read <= 0) + /* Make sure data always ends with a newline character. 
*/ + if (page->data[bytes_read - 1] != '\n') return false; - self->page1.size = bytes_read; + page->size = bytes_read; - bytes_read = read(self->maps_fd, self->page2.data, self->page_size); - if (bytes_read <= 0) + return true; +} + +static bool parse_vma_line(char *line_start, char *line_end, + unsigned long *start, unsigned long *end) +{ + bool found; + + *line_end = '\0'; /* stop sscanf at the EOL */ + found = (sscanf(line_start, "%lx-%lx", start, end) == 2); + *line_end = '\n'; + + return found; +} + +static int locate_containing_page(FIXTURE_DATA(proc_maps_race) *self, + unsigned long addr, unsigned long size) +{ + unsigned long start, end; + int page = 0; + + if (lseek(self->maps_fd, 0, SEEK_SET) < 0) + return -1; + + while (true) { + char *curr_pos; + char *end_pos; + + if (!read_page(self, &self->page1)) + return -1; + + curr_pos = self->page1.data; + end_pos = self->page1.data + self->page1.size; + while (curr_pos < end_pos) { + char *line_end; + + line_end = strchr(curr_pos, '\n'); + if (!line_end) + break; + + if (parse_vma_line(curr_pos, line_end, &start, &end) && + start == addr && end == addr + size) + return page; + + curr_pos = line_end + 1; + } + page++; + } + + return 0; +} + +static bool read_two_pages(FIXTURE_DATA(proc_maps_race) *self) +{ + if (lseek(self->maps_fd, 0, SEEK_SET) < 0) return false; - self->page2.size = bytes_read; + for (int i = 0; i < self->skip_pages; i++) + if (!read_page(self, &self->page1)) + return false; - return true; + return read_page(self, &self->page1) && read_page(self, &self->page2); } -static void copy_first_line(struct page_content *page, char *first_line) +static void copy_line(const char *line_start, const char *line_end, + char *buf, size_t buf_size) { - char *pos = strchr(page->data, '\n'); + size_t len = min(line_end - line_start, buf_size - 1); - strncpy(first_line, page->data, pos - page->data); - first_line[pos - page->data] = '\0'; + strncpy(buf, line_start, len); + buf[len] = '\0'; } -static void 
copy_last_line(struct page_content *page, char *last_line) +static void copy_first_line(struct page_content *page, char *first_line, + size_t line_size) +{ + copy_line(page->data, strchr(page->data, '\n'), first_line, line_size); +} + +static void copy_last_line(struct page_content *page, char *last_line, + size_t line_size) { /* Get the last line in the first page */ const char *end = page->data + page->size - 1; @@ -146,8 +237,59 @@ static void copy_last_line(struct page_content *page, char *last_line) /* search previous newline */ while (pos[-1] != '\n') pos--; - strncpy(last_line, pos, end - pos); - last_line[end - pos] = '\0'; + + copy_line(pos, end, last_line, line_size); +} + +static bool copy_first_entry(struct page_content *page, char *first_line, + size_t line_size) +{ + char *start_pos = page->data; + + while (start_pos < page->data + page->size) { + unsigned long start_addr; + unsigned long end_addr; + char *end_pos; + + end_pos = strchr(start_pos, '\n'); + if (!end_pos) + break; + + if (parse_vma_line(start_pos, end_pos, &start_addr, &end_addr)) { + copy_line(start_pos, end_pos, first_line, line_size); + return true; + } + + start_pos = end_pos + 1; + } + + return false; +} + +static bool copy_last_entry(struct page_content *page, char *last_line, + size_t line_size) +{ + char *end_pos = page->data + page->size - 1; + char *start_pos; + + while (end_pos > page->data) { + unsigned long start_addr; + unsigned long end_addr; + + /* skip last newline */ + start_pos = end_pos - 1; + /* search previous newline */ + while (start_pos > page->data && start_pos[-1] != '\n') + start_pos--; + if (parse_vma_line(start_pos, end_pos, &start_addr, &end_addr)) { + copy_line(start_pos, end_pos, last_line, line_size); + return true; + } + + end_pos = start_pos - 1; + } + + return false; } /* Read the last line of the first page and the first line of the second page */ @@ -158,8 +300,16 @@ static bool read_boundary_lines(FIXTURE_DATA(proc_maps_race) *self, if 
(!read_two_pages(self)) return false; - copy_last_line(&self->page1, last_line->text); - copy_first_line(&self->page2, first_line->text); + if (self->maps_file == MAPS) { + copy_last_line(&self->page1, last_line->text, LINE_MAX_SIZE); + copy_first_line(&self->page2, first_line->text, LINE_MAX_SIZE); + } else if (self->maps_file == SMAPS) { + if (!copy_last_entry(&self->page1, last_line->text, LINE_MAX_SIZE) || + !copy_first_entry(&self->page2, first_line->text, LINE_MAX_SIZE)) + return false; + } else { + return false; + } return sscanf(last_line->text, "%lx-%lx", &last_line->start_addr, &last_line->end_addr) == 2 && @@ -418,11 +568,14 @@ FIXTURE_SETUP(proc_maps_race) struct vma_modifier_info *mod_info; pthread_mutexattr_t mutex_attr; pthread_condattr_t cond_attr; + unsigned long first_map_addr; + unsigned long last_map_addr; unsigned long duration_sec; char fname[32]; self->page_size = (unsigned long)sysconf(_SC_PAGESIZE); self->verbose = verbose && !strncmp(verbose, "1", 1); + self->maps_file = variant->maps_file; duration_sec = duration ? atol(duration) : 0; self->duration_sec = duration_sec ? 
duration_sec : 5UL; @@ -489,7 +642,16 @@ FIXTURE_SETUP(proc_maps_race) exit(0); } - sprintf(fname, "/proc/%d/maps", self->pid); + switch (self->maps_file) { + case MAPS: + sprintf(fname, "/proc/%d/maps", self->pid); + break; + case SMAPS: + sprintf(fname, "/proc/%d/smaps", self->pid); + break; + default: + ksft_exit_fail(); + } self->maps_fd = open(fname, O_RDONLY); ASSERT_NE(self->maps_fd, -1); @@ -502,6 +664,13 @@ FIXTURE_SETUP(proc_maps_race) self->page2.data = malloc(self->page_size); ASSERT_NE(self->page2.data, NULL); + first_map_addr = (unsigned long)mod_info->child_mapped_addr[0]; + last_map_addr = (unsigned long)mod_info->child_mapped_addr[mod_info->vma_count - 1]; + + self->skip_pages = locate_containing_page(self, + min(first_map_addr, last_map_addr), + self->page_size * 3); + ASSERT_NE(self->skip_pages, -1); ASSERT_TRUE(read_boundary_lines(self, &self->last_line, &self->first_line)); /* @@ -527,7 +696,6 @@ FIXTURE_SETUP(proc_maps_race) ASSERT_TRUE(mod_info->addr && mod_info->next_addr); signal_state(mod_info, PARENT_READY); - } FIXTURE_TEARDOWN(proc_maps_race) @@ -617,20 +785,20 @@ TEST_F(proc_maps_race, test_maps_tearing_from_split) last_line_changed = strcmp(new_last_line.text, self->last_line.text) != 0; first_line_changed = strcmp(new_first_line.text, self->first_line.text) != 0; ASSERT_EQ(last_line_changed, first_line_changed); - - /* Check if PROCMAP_QUERY ioclt() finds the right VMA */ - ASSERT_TRUE(query_addr_at(self->maps_fd, mod_info->addr + self->page_size, - &vma_start, &vma_end)); - /* - * The vma at the split address can be either the same as - * original one (if read before the split) or the same as the - * first line in the second page (if read after the split). 
- */ - ASSERT_TRUE((vma_start == self->last_line.start_addr && - vma_end == self->last_line.end_addr) || - (vma_start == split_first_line.start_addr && - vma_end == split_first_line.end_addr)); - + if (self->maps_file == MAPS) { + /* Check if PROCMAP_QUERY ioclt() finds the right VMA */ + ASSERT_TRUE(query_addr_at(self->maps_fd, mod_info->addr + self->page_size, + &vma_start, &vma_end)); + /* + * The vma at the split address can be either the same as + * original one (if read before the split) or the same as the + * first line in the second page (if read after the split). + */ + ASSERT_TRUE((vma_start == self->last_line.start_addr && + vma_end == self->last_line.end_addr) || + (vma_start == split_first_line.start_addr && + vma_end == split_first_line.end_addr)); + } clock_gettime(CLOCK_MONOTONIC_COARSE, &end_ts); end_test_iteration(&end_ts, self->verbose); } while (end_ts.tv_sec - start_ts.tv_sec < self->duration_sec); @@ -700,17 +868,18 @@ TEST_F(proc_maps_race, test_maps_tearing_from_resize) strcmp(new_first_line.text, restored_first_line.text), "Expand result invalid", self)); } - - /* Check if PROCMAP_QUERY ioclt() finds the right VMA */ - ASSERT_TRUE(query_addr_at(self->maps_fd, mod_info->addr, &vma_start, &vma_end)); - /* - * The vma should stay at the same address and have either the - * original size of 3 pages or 1 page if read after shrinking. - */ - ASSERT_TRUE(vma_start == self->last_line.start_addr && - (vma_end - vma_start == self->page_size * 3 || - vma_end - vma_start == self->page_size)); - + if (self->maps_file == MAPS) { + /* Check if PROCMAP_QUERY ioclt() finds the right VMA */ + ASSERT_TRUE(query_addr_at(self->maps_fd, mod_info->addr, + &vma_start, &vma_end)); + /* + * The vma should stay at the same address and have either the + * original size of 3 pages or 1 page if read after shrinking. 
+ */ + ASSERT_TRUE(vma_start == self->last_line.start_addr && + (vma_end - vma_start == self->page_size * 3 || + vma_end - vma_start == self->page_size)); + } clock_gettime(CLOCK_MONOTONIC_COARSE, &end_ts); end_test_iteration(&end_ts, self->verbose); } while (end_ts.tv_sec - start_ts.tv_sec < self->duration_sec); @@ -780,20 +949,20 @@ TEST_F(proc_maps_race, test_maps_tearing_from_remap) strcmp(new_first_line.text, restored_first_line.text), "Remap restore result invalid", self)); } - - /* Check if PROCMAP_QUERY ioclt() finds the right VMA */ - ASSERT_TRUE(query_addr_at(self->maps_fd, mod_info->addr + self->page_size, - &vma_start, &vma_end)); - /* - * The vma should either stay at the same address and have the - * original size of 3 pages or we should find the remapped vma - * at the remap destination address with size of 1 page. - */ - ASSERT_TRUE((vma_start == self->last_line.start_addr && - vma_end - vma_start == self->page_size * 3) || - (vma_start == self->last_line.start_addr + self->page_size && - vma_end - vma_start == self->page_size)); - + if (self->maps_file == MAPS) { + /* Check if PROCMAP_QUERY ioclt() finds the right VMA */ + ASSERT_TRUE(query_addr_at(self->maps_fd, mod_info->addr + self->page_size, + &vma_start, &vma_end)); + /* + * The vma should either stay at the same address and have the + * original size of 3 pages or we should find the remapped vma + * at the remap destination address with size of 1 page. 
+ */ + ASSERT_TRUE((vma_start == self->last_line.start_addr && + vma_end - vma_start == self->page_size * 3) || + (vma_start == self->last_line.start_addr + self->page_size && + vma_end - vma_start == self->page_size)); + } clock_gettime(CLOCK_MONOTONIC_COARSE, &end_ts); end_test_iteration(&end_ts, self->verbose); } while (end_ts.tv_sec - start_ts.tv_sec < self->duration_sec); diff --git a/tools/testing/selftests/rcutorture/bin/kvm-series.sh b/tools/testing/selftests/rcutorture/bin/kvm-series.sh index c4ee5f910931..be9412538fb8 100755 --- a/tools/testing/selftests/rcutorture/bin/kvm-series.sh +++ b/tools/testing/selftests/rcutorture/bin/kvm-series.sh @@ -1,12 +1,13 @@ #!/bin/bash # SPDX-License-Identifier: GPL-2.0+ # -# Usage: kvm-series.sh config-list commit-id-list [ kvm.sh parameters ] +# Usage: kvm-series.sh config-list commit-id-range [ kvm.sh parameters ] # -# Tests the specified list of unadorned configs ("TREE01 SRCU-P" but not -# "CFLIST" or "3*TRACE01") and an indication of a set of commits to test, -# then runs each commit through the specified list of commits using kvm.sh. -# The runs are grouped into a -series/config/commit directory tree. +# Tests the specified list of unadorned configs ("TREE01 SRCU-P" but +# not "CFLIST" or "3*TRACE01") and an indication of a range of commits +# ("v7.0-rc1..rcu/dev", but not "cd0ce7bab0408 ff74db28df623 17c52d7b31a1f") +# to test, then runs each commit through the specified list of commits using +# kvm.sh. The runs are grouped into a -series/config/commit directory tree. # Each run defaults to a duration of one minute. # # Run in top-level Linux source directory. 
Please note that this is in diff --git a/tools/testing/selftests/rcutorture/bin/torture.sh b/tools/testing/selftests/rcutorture/bin/torture.sh index a33ba109ef0b..f0083891ee81 100755 --- a/tools/testing/selftests/rcutorture/bin/torture.sh +++ b/tools/testing/selftests/rcutorture/bin/torture.sh @@ -184,7 +184,7 @@ do do_clocksourcewd=no do_srcu_lockdep=no ;; - --do-normal|--do-no-normal|--no-normal) + --do-normal|--do-norm|--do-no-normal|--do-no-norm|--no-normal|--no-norm) do_normal=`doyesno "$1" --do-normal` explicit_normal=yes ;; diff --git a/tools/testing/selftests/rdma/Makefile b/tools/testing/selftests/rdma/Makefile index 7dd7cba7a73c..07af7f15c1bf 100644 --- a/tools/testing/selftests/rdma/Makefile +++ b/tools/testing/selftests/rdma/Makefile @@ -2,6 +2,7 @@ TEST_PROGS := rxe_rping_between_netns.sh \ rxe_ipv6.sh \ rxe_socket_with_netns.sh \ - rxe_test_NETDEV_UNREGISTER.sh + rxe_test_NETDEV_UNREGISTER.sh \ + rxe_sent_rcvd_bytes.sh include ../lib.mk diff --git a/tools/testing/selftests/rdma/rxe_ipv6.sh b/tools/testing/selftests/rdma/rxe_ipv6.sh index b7059bfd6d7c..32dad687a044 100755 --- a/tools/testing/selftests/rdma/rxe_ipv6.sh +++ b/tools/testing/selftests/rdma/rxe_ipv6.sh @@ -8,6 +8,8 @@ RXE_NAME="rxe6" PORT=4791 IP6_ADDR="2001:db8::1/64" +source "$(dirname "$0")/../kselftest/ktap_helpers.sh" + exec > /dev/null # Cleanup function to run on exit (even on failure) @@ -21,8 +23,8 @@ trap cleanup EXIT # 1. Prerequisites check for mod in tun veth rdma_rxe; do if ! modinfo "$mod" >/dev/null 2>&1; then - echo "Error: Kernel module '$mod' not found." - exit 1 + echo "SKIP: Kernel module '$mod' not found." 
>&2 + exit $KSFT_SKIP fi done diff --git a/tools/testing/selftests/rdma/rxe_rping_between_netns.sh b/tools/testing/selftests/rdma/rxe_rping_between_netns.sh index e5b876f58c6e..e7554fbb8951 100755 --- a/tools/testing/selftests/rdma/rxe_rping_between_netns.sh +++ b/tools/testing/selftests/rdma/rxe_rping_between_netns.sh @@ -8,6 +8,8 @@ IP_A="1.1.1.1" IP_B="1.1.1.2" PORT=4791 +source "$(dirname "$0")/../kselftest/ktap_helpers.sh" + exec > /dev/null # --- Cleanup Routine --- @@ -27,6 +29,11 @@ if [[ $EUID -ne 0 ]]; then exit 1 fi +if ! modinfo rdma_rxe >/dev/null 2>&1; then + echo "SKIP: Kernel module 'rdma_rxe' not found." >&2 + exit $KSFT_SKIP +fi + modprobe rdma_rxe || { echo "Failed to load rdma_rxe"; exit 1; } # --- Setup Network Topology --- diff --git a/tools/testing/selftests/rdma/rxe_sent_rcvd_bytes.sh b/tools/testing/selftests/rdma/rxe_sent_rcvd_bytes.sh new file mode 100755 index 000000000000..0e4fbfeebd22 --- /dev/null +++ b/tools/testing/selftests/rdma/rxe_sent_rcvd_bytes.sh @@ -0,0 +1,75 @@ +#!/bin/bash + +# Configuration +PORT=4791 +MODS=("tun" "rdma_rxe") + +exec > /dev/null + +# --- Helper: Cleanup Routine --- +cleanup() { + echo "Cleaning up resources..." + rdma link del rxe0 2>/dev/null + ip link del tun0 2>/dev/null + for m in "${MODS[@]}"; do modprobe -r "$m" 2>/dev/null; done +} + +# Ensure cleanup runs on script exit or interrupt +trap cleanup EXIT + +# --- Phase 1: Environment Check --- +if [[ $EUID -ne 0 ]]; then + echo "Error: This script must be run as root." + exit 1 +fi + +for m in "${MODS[@]}"; do + modprobe "$m" || { echo "Error: Failed to load $m"; exit 1; } +done + +# --- Phase 2: Create Interfaces & RXE Links --- +echo "Creating tun0 (1.1.1.1) and rxe0..." +ip tuntap add mode tun tun0 +ip addr add 1.1.1.1/24 dev tun0 +ip link set tun0 up +rdma link add rxe0 type rxe netdev tun0 + +# Verify port 4791 is listening +if ! 
ss -Huln sport = :$PORT | grep -q ":$PORT"; then + echo "Error: UDP port $PORT not found after rxe0 creation" + exit 1 +fi + +orig_s=`cat /sys/class/infiniband/rxe0/ports/1/counters/port_xmit_data` +orig_r=`cat /sys/class/infiniband/rxe0/ports/1/counters/port_rcv_data` + +rping -s -a 1.1.1.1 -C 3 -v & +sleep 1 +rping -c -a 1.1.1.1 -C 3 -d -v + +new_s=`cat /sys/class/infiniband/rxe0/ports/1/counters/port_xmit_data` +new_r=`cat /sys/class/infiniband/rxe0/ports/1/counters/port_rcv_data` + +echo sent $new_s $orig_s +echo rcvd $new_r $orig_r + +result0=$((new_s - orig_s)) +result1=$((new_r - orig_r)) + +if [ $result0 != $result1 ]; then + echo "Error: sent and rcvd bytes different" + echo $result0 + echo $result1 + exit 1 +fi + +echo "Deleting rxe0..." +rdma link del rxe0 + +# Port should now be gone +if ss -Huln sport = :$PORT | grep -q ":$PORT"; then + echo "Error: UDP port $PORT still exists after all links deleted" + exit 1 +fi + +echo "Test passed successfully." diff --git a/tools/testing/selftests/rdma/rxe_socket_with_netns.sh b/tools/testing/selftests/rdma/rxe_socket_with_netns.sh index 002e5098f751..9478657c02c1 100755 --- a/tools/testing/selftests/rdma/rxe_socket_with_netns.sh +++ b/tools/testing/selftests/rdma/rxe_socket_with_netns.sh @@ -4,6 +4,8 @@ PORT=4791 MODS=("tun" "rdma_rxe") +source "$(dirname "$0")/../kselftest/ktap_helpers.sh" + exec > /dev/null # --- Helper: Cleanup Routine --- @@ -26,6 +28,10 @@ if [[ $EUID -ne 0 ]]; then fi for m in "${MODS[@]}"; do + if ! modinfo "$m" >/dev/null 2>&1; then + echo "SKIP: Kernel module '$m' not found." 
>&2 + exit $KSFT_SKIP + fi modprobe "$m" || { echo "Error: Failed to load $m"; exit 1; } done diff --git a/tools/testing/selftests/rdma/rxe_test_NETDEV_UNREGISTER.sh b/tools/testing/selftests/rdma/rxe_test_NETDEV_UNREGISTER.sh index 021ca451499d..8c18cea7535c 100755 --- a/tools/testing/selftests/rdma/rxe_test_NETDEV_UNREGISTER.sh +++ b/tools/testing/selftests/rdma/rxe_test_NETDEV_UNREGISTER.sh @@ -5,6 +5,8 @@ DEV_NAME="tun0" RXE_NAME="rxe0" RDMA_PORT=4791 +source "$(dirname "$0")/../kselftest/ktap_helpers.sh" + exec > /dev/null # --- Cleanup Routine --- @@ -19,8 +21,8 @@ trap cleanup EXIT # 1. Dependency Check if ! modinfo rdma_rxe >/dev/null 2>&1; then - echo "Error: rdma_rxe module not found." - exit 1 + echo "SKIP: rdma_rxe module not found." >&2 + exit $KSFT_SKIP fi modprobe rdma_rxe diff --git a/tools/testing/selftests/resctrl/cache.c b/tools/testing/selftests/resctrl/cache.c index 1ff1104e6575..df9bea584a2d 100644 --- a/tools/testing/selftests/resctrl/cache.c +++ b/tools/testing/selftests/resctrl/cache.c @@ -10,7 +10,6 @@ void perf_event_attr_initialize(struct perf_event_attr *pea, __u64 config) memset(pea, 0, sizeof(*pea)); pea->type = PERF_TYPE_HARDWARE; pea->size = sizeof(*pea); - pea->read_format = PERF_FORMAT_GROUP; pea->exclude_kernel = 1; pea->exclude_hv = 1; pea->exclude_idle = 1; @@ -37,19 +36,13 @@ int perf_event_reset_enable(int pe_fd) return 0; } -void perf_event_initialize_read_format(struct perf_event_read *pe_read) -{ - memset(pe_read, 0, sizeof(*pe_read)); - pe_read->nr = 1; -} - int perf_open(struct perf_event_attr *pea, pid_t pid, int cpu_no) { int pe_fd; pe_fd = perf_event_open(pea, pid, cpu_no, -1, PERF_FLAG_FD_CLOEXEC); if (pe_fd == -1) { - ksft_perror("Error opening leader"); + ksft_perror("Unable to set up performance monitoring"); return -1; } @@ -132,9 +125,9 @@ static int print_results_cache(const char *filename, pid_t bm_pid, __u64 llc_val * * Return: =0 on success. <0 on failure. 
*/ -int perf_event_measure(int pe_fd, struct perf_event_read *pe_read, - const char *filename, pid_t bm_pid) +int perf_event_measure(int pe_fd, const char *filename, pid_t bm_pid) { + __u64 value; int ret; /* Stop counters after one span to get miss rate */ @@ -142,13 +135,13 @@ int perf_event_measure(int pe_fd, struct perf_event_read *pe_read, if (ret < 0) return ret; - ret = read(pe_fd, pe_read, sizeof(*pe_read)); + ret = read(pe_fd, &value, sizeof(value)); if (ret == -1) { ksft_perror("Could not get perf value"); return -1; } - return print_results_cache(filename, bm_pid, pe_read->values[0].value); + return print_results_cache(filename, bm_pid, value); } /* @@ -174,6 +167,19 @@ int measure_llc_resctrl(const char *filename, pid_t bm_pid) } /* + * Reduce L2 allocation to minimum when testing L3 cache allocation. + */ +int minimize_l2_occupancy(const struct resctrl_test *test, + const struct user_params *uparams, + const struct resctrl_val_param *param) +{ + if (!strcmp(test->resource, "L3") && resctrl_resource_exists("L2")) + return write_schemata(param->ctrlgrp, "0x1", uparams->cpu, "L2"); + + return 0; +} + +/* * show_cache_info - Show generic cache test information * @no_of_bits: Number of bits * @avg_llc_val: Average of LLC cache result data diff --git a/tools/testing/selftests/resctrl/cat_test.c b/tools/testing/selftests/resctrl/cat_test.c index f00b622c1460..371a2f26dc47 100644 --- a/tools/testing/selftests/resctrl/cat_test.c +++ b/tools/testing/selftests/resctrl/cat_test.c @@ -14,42 +14,20 @@ #define RESULT_FILE_NAME "result_cat" #define NUM_OF_RUNS 5 -/* - * Minimum difference in LLC misses between a test with n+1 bits CBM to the - * test with n bits is MIN_DIFF_PERCENT_PER_BIT * (n - 1). With e.g. 5 vs 4 - * bits in the CBM mask, the minimum difference must be at least - * MIN_DIFF_PERCENT_PER_BIT * (4 - 1) = 3 percent. - * - * The relationship between number of used CBM bits and difference in LLC - * misses is not expected to be linear. 
With a small number of bits, the - * margin is smaller than with larger number of bits. For selftest purposes, - * however, linear approach is enough because ultimately only pass/fail - * decision has to be made and distinction between strong and stronger - * signal is irrelevant. - */ -#define MIN_DIFF_PERCENT_PER_BIT 1UL - static int show_results_info(__u64 sum_llc_val, int no_of_bits, unsigned long cache_span, - unsigned long min_diff_percent, unsigned long num_of_runs, bool platform, __s64 *prev_avg_llc_val) { __u64 avg_llc_val = 0; - float avg_diff; int ret = 0; avg_llc_val = sum_llc_val / num_of_runs; if (*prev_avg_llc_val) { - float delta = (__s64)(avg_llc_val - *prev_avg_llc_val); - - avg_diff = delta / *prev_avg_llc_val; - ret = platform && (avg_diff * 100) < (float)min_diff_percent; + ret = platform && (avg_llc_val < *prev_avg_llc_val); - ksft_print_msg("%s Check cache miss rate changed more than %.1f%%\n", - ret ? "Fail:" : "Pass:", (float)min_diff_percent); - - ksft_print_msg("Percent diff=%.1f\n", avg_diff * 100); + ksft_print_msg("%s Check cache miss rate increased\n", + ret ? 
"Fail:" : "Pass:"); } *prev_avg_llc_val = avg_llc_val; @@ -58,10 +36,10 @@ static int show_results_info(__u64 sum_llc_val, int no_of_bits, return ret; } -/* Remove the highest bit from CBM */ +/* Remove the highest bits from CBM */ static unsigned long next_mask(unsigned long current_mask) { - return current_mask & (current_mask >> 1); + return current_mask & (current_mask >> 2); } static int check_results(struct resctrl_val_param *param, const char *cache_type, @@ -112,7 +90,6 @@ static int check_results(struct resctrl_val_param *param, const char *cache_type ret = show_results_info(sum_llc_perf_miss, bits, alloc_size / 64, - MIN_DIFF_PERCENT_PER_BIT * (bits - 1), runs, get_vendor() == ARCH_INTEL, &prev_avg_llc_val); if (ret) @@ -158,7 +135,6 @@ static int cat_test(const struct resctrl_test *test, struct resctrl_val_param *param, size_t span, unsigned long current_mask) { - struct perf_event_read pe_read; struct perf_event_attr pea; cpu_set_t old_affinity; unsigned char *buf; @@ -181,8 +157,11 @@ static int cat_test(const struct resctrl_test *test, if (ret) goto reset_affinity; + ret = minimize_l2_occupancy(test, uparams, param); + if (ret) + goto reset_affinity; + perf_event_attr_initialize(&pea, PERF_COUNT_HW_CACHE_MISSES); - perf_event_initialize_read_format(&pe_read); pe_fd = perf_open(&pea, bm_pid, uparams->cpu); if (pe_fd < 0) { ret = -1; @@ -215,7 +194,7 @@ static int cat_test(const struct resctrl_test *test, fill_cache_read(buf, span, true); - ret = perf_event_measure(pe_fd, &pe_read, param->filename, bm_pid); + ret = perf_event_measure(pe_fd, param->filename, bm_pid); if (ret) goto free_buf; } diff --git a/tools/testing/selftests/resctrl/cmt_test.c b/tools/testing/selftests/resctrl/cmt_test.c index d09e693dc739..ccb6fe881a94 100644 --- a/tools/testing/selftests/resctrl/cmt_test.c +++ b/tools/testing/selftests/resctrl/cmt_test.c @@ -19,12 +19,40 @@ #define CON_MON_LCC_OCCUP_PATH \ "%s/%s/mon_data/mon_L3_%02d/llc_occupancy" -static int cmt_init(const struct 
resctrl_val_param *param, int domain_id) +/* + * Initialize capacity bitmasks (CBMs) of: + * - control group being tested per test parameters, + * - default resource group as inverse of control group being tested to prevent + * other tasks from interfering with test, + * - L2 resource of control group being tested to minimize allocations into + * L2 if possible to better predict L3 occupancy. + */ +static int cmt_init(const struct resctrl_test *test, + const struct user_params *uparams, + const struct resctrl_val_param *param, int domain_id) { + unsigned long full_mask; + char schemata[64]; + int ret; + sprintf(llc_occup_path, CON_MON_LCC_OCCUP_PATH, RESCTRL_PATH, param->ctrlgrp, domain_id); - return 0; + ret = get_full_cbm(test->resource, &full_mask); + if (ret) + return ret; + + snprintf(schemata, sizeof(schemata), "%lx", ~param->mask & full_mask); + ret = write_schemata("", schemata, uparams->cpu, test->resource); + if (ret) + return ret; + + snprintf(schemata, sizeof(schemata), "%lx", param->mask); + ret = write_schemata(param->ctrlgrp, schemata, uparams->cpu, test->resource); + if (ret) + return ret; + + return minimize_l2_occupancy(test, uparams, param); } static int cmt_setup(const struct resctrl_test *test, @@ -153,11 +181,11 @@ static int cmt_run_test(const struct resctrl_test *test, const struct user_param span = cache_portion_size(cache_total_size, param.mask, long_mask); if (uparams->fill_buf) { - fill_buf.buf_size = span; + fill_buf.buf_size = span * 2; fill_buf.memflush = uparams->fill_buf->memflush; param.fill_buf = &fill_buf; } else if (!uparams->benchmark_cmd[0]) { - fill_buf.buf_size = span; + fill_buf.buf_size = span * 2; fill_buf.memflush = true; param.fill_buf = &fill_buf; } diff --git a/tools/testing/selftests/resctrl/fill_buf.c b/tools/testing/selftests/resctrl/fill_buf.c index 19a01a52dc1a..b9fa7968cd6e 100644 --- a/tools/testing/selftests/resctrl/fill_buf.c +++ b/tools/testing/selftests/resctrl/fill_buf.c @@ -139,6 +139,6 @@ ssize_t 
get_fill_buf_size(int cpu_no, const char *cache_type) if (ret) return ret; - return cache_total_size * 2 > MINIMUM_SPAN ? - cache_total_size * 2 : MINIMUM_SPAN; + return cache_total_size * 4 > MINIMUM_SPAN ? + cache_total_size * 4 : MINIMUM_SPAN; } diff --git a/tools/testing/selftests/resctrl/mba_test.c b/tools/testing/selftests/resctrl/mba_test.c index c7e9adc0368f..39cee9898359 100644 --- a/tools/testing/selftests/resctrl/mba_test.c +++ b/tools/testing/selftests/resctrl/mba_test.c @@ -12,12 +12,14 @@ #define RESULT_FILE_NAME "result_mba" #define NUM_OF_RUNS 5 -#define MAX_DIFF_PERCENT 8 +#define MAX_DIFF_PERCENT 15 #define ALLOCATION_MAX 100 #define ALLOCATION_MIN 10 #define ALLOCATION_STEP 10 -static int mba_init(const struct resctrl_val_param *param, int domain_id) +static int mba_init(const struct resctrl_test *test, + const struct user_params *uparams, + const struct resctrl_val_param *param, int domain_id) { int ret; diff --git a/tools/testing/selftests/resctrl/mbm_test.c b/tools/testing/selftests/resctrl/mbm_test.c index 84d8bc250539..6dbbc3b76003 100644 --- a/tools/testing/selftests/resctrl/mbm_test.c +++ b/tools/testing/selftests/resctrl/mbm_test.c @@ -11,7 +11,7 @@ #include "resctrl.h" #define RESULT_FILE_NAME "result_mbm" -#define MAX_DIFF_PERCENT 8 +#define MAX_DIFF_PERCENT 15 #define NUM_OF_RUNS 5 static int @@ -83,7 +83,9 @@ static int check_results(size_t span) return ret; } -static int mbm_init(const struct resctrl_val_param *param, int domain_id) +static int mbm_init(const struct resctrl_test *test, + const struct user_params *uparams, + const struct resctrl_val_param *param, int domain_id) { int ret; diff --git a/tools/testing/selftests/resctrl/resctrl.h b/tools/testing/selftests/resctrl/resctrl.h index afe635b6e48d..175101022bf3 100644 --- a/tools/testing/selftests/resctrl/resctrl.h +++ b/tools/testing/selftests/resctrl/resctrl.h @@ -55,7 +55,7 @@ * and MBM respectively, for instance generating "overhead" traffic which * is not counted against 
any specific RMID. */ -#define THROTTLE_THRESHOLD 750 +#define THROTTLE_THRESHOLD 2500 /* * fill_buf_param: "fill_buf" benchmark parameters @@ -135,7 +135,9 @@ struct resctrl_val_param { char filename[64]; unsigned long mask; int num_of_runs; - int (*init)(const struct resctrl_val_param *param, + int (*init)(const struct resctrl_test *test, + const struct user_params *uparams, + const struct resctrl_val_param *param, int domain_id); int (*setup)(const struct resctrl_test *test, const struct user_params *uparams, @@ -146,13 +148,6 @@ struct resctrl_val_param { struct fill_buf_param *fill_buf; }; -struct perf_event_read { - __u64 nr; /* The number of events */ - struct { - __u64 value; /* The value of the event */ - } values[2]; -}; - /* * Memory location that consumes values compiler must not optimize away. * Volatile ensures writes to this location cannot be optimized away by @@ -208,12 +203,13 @@ unsigned int count_bits(unsigned long n); int snc_kernel_support(void); void perf_event_attr_initialize(struct perf_event_attr *pea, __u64 config); -void perf_event_initialize_read_format(struct perf_event_read *pe_read); int perf_open(struct perf_event_attr *pea, pid_t pid, int cpu_no); int perf_event_reset_enable(int pe_fd); -int perf_event_measure(int pe_fd, struct perf_event_read *pe_read, - const char *filename, pid_t bm_pid); +int perf_event_measure(int pe_fd, const char *filename, pid_t bm_pid); int measure_llc_resctrl(const char *filename, pid_t bm_pid); +int minimize_l2_occupancy(const struct resctrl_test *test, + const struct user_params *uparams, + const struct resctrl_val_param *param); void show_cache_info(int no_of_bits, __u64 avg_llc_val, size_t cache_span, bool lines); /* diff --git a/tools/testing/selftests/resctrl/resctrl_val.c b/tools/testing/selftests/resctrl/resctrl_val.c index 7c08e936572d..f20d2194c35f 100644 --- a/tools/testing/selftests/resctrl/resctrl_val.c +++ b/tools/testing/selftests/resctrl/resctrl_val.c @@ -11,10 +11,10 @@ #include 
"resctrl.h" #define UNCORE_IMC "uncore_imc" -#define READ_FILE_NAME "events/cas_count_read" +#define READ_FILE_NAME "cas_count_read" #define DYN_PMU_PATH "/sys/bus/event_source/devices" #define SCALE 0.00006103515625 -#define MAX_IMCS 20 +#define MAX_IMCS 40 #define MAX_TOKENS 5 #define CON_MBM_LOCAL_BYTES_PATH \ @@ -32,7 +32,6 @@ struct imc_counter_config { __u64 event; __u64 umask; struct perf_event_attr pe; - struct membw_read_format return_value; int fd; }; @@ -74,7 +73,7 @@ static void read_mem_bw_ioctl_perf_event_ioc_disable(int i) * @cas_count_cfg: Config * @count: iMC number */ -static void get_read_event_and_umask(char *cas_count_cfg, int count) +static void get_read_event_and_umask(char *cas_count_cfg, unsigned int count) { char *token[MAX_TOKENS]; int i = 0; @@ -110,45 +109,114 @@ static int open_perf_read_event(int i, int cpu_no) return 0; } -/* Get type and config of an iMC counter's read event. */ -static int read_from_imc_dir(char *imc_dir, int count) +static int parse_imc_read_bw_events(char *imc_dir, unsigned int type, + unsigned int *count) { - char cas_count_cfg[1024], imc_counter_cfg[1024], imc_counter_type[1024]; + char imc_events_dir[PATH_MAX], imc_counter_cfg[PATH_MAX]; + unsigned int orig_count = *count; + char cas_count_cfg[1024]; + struct dirent *ep; + int path_len; + int ret = -1; + int num_cfg; FILE *fp; + DIR *dp; - /* Get type of iMC counter */ - sprintf(imc_counter_type, "%s%s", imc_dir, "type"); - fp = fopen(imc_counter_type, "r"); - if (!fp) { - ksft_perror("Failed to open iMC counter type file"); + path_len = snprintf(imc_events_dir, sizeof(imc_events_dir), "%sevents", + imc_dir); + if (path_len >= sizeof(imc_events_dir)) { + ksft_print_msg("Unable to create path to %sevents\n", imc_dir); + return -1; + } + dp = opendir(imc_events_dir); + if (!dp) { + ksft_perror("Unable to open PMU events directory"); return -1; } - if (fscanf(fp, "%u", &imc_counters_config[count].type) <= 0) { - ksft_perror("Could not get iMC type"); + + while 
((ep = readdir(dp))) { + /* + * Parse all event files with READ_FILE_NAME prefix that + * contain the event number and umask. Skip files containing + * "." that contain unused properties of event. + */ + if (!strstr(ep->d_name, READ_FILE_NAME) || + strchr(ep->d_name, '.')) + continue; + + path_len = snprintf(imc_counter_cfg, sizeof(imc_counter_cfg), + "%s/%s", imc_events_dir, ep->d_name); + if (path_len >= sizeof(imc_counter_cfg)) { + ksft_print_msg("Unable to create path to %s/%s\n", + imc_events_dir, ep->d_name); + goto out_close; + } + fp = fopen(imc_counter_cfg, "r"); + if (!fp) { + ksft_perror("Failed to open iMC config file"); + goto out_close; + } + num_cfg = fscanf(fp, "%1023s", cas_count_cfg); fclose(fp); + if (num_cfg <= 0) { + ksft_perror("Could not get iMC cas count read"); + goto out_close; + } + if (*count >= MAX_IMCS) { + ksft_print_msg("Maximum iMC count exceeded\n"); + goto out_close; + } - return -1; + imc_counters_config[*count].type = type; + get_read_event_and_umask(cas_count_cfg, *count); + /* Do not fail after incrementing *count. */ + *count += 1; } - fclose(fp); + if (*count == orig_count) { + ksft_print_msg("Unable to find events in %s\n", imc_events_dir); + goto out_close; + } + ret = 0; +out_close: + closedir(dp); + return ret; +} - /* Get read config */ - sprintf(imc_counter_cfg, "%s%s", imc_dir, READ_FILE_NAME); - fp = fopen(imc_counter_cfg, "r"); - if (!fp) { - ksft_perror("Failed to open iMC config file"); +/* Get type and config of an iMC counter's read event. 
*/ +static int read_from_imc_dir(char *imc_dir, unsigned int *count) +{ + char imc_counter_type[PATH_MAX]; + unsigned int type; + int path_len; + FILE *fp; + int ret; + /* Get type of iMC counter */ + path_len = snprintf(imc_counter_type, sizeof(imc_counter_type), + "%s%s", imc_dir, "type"); + if (path_len >= sizeof(imc_counter_type)) { + ksft_print_msg("Unable to create path to %s%s\n", + imc_dir, "type"); return -1; } - if (fscanf(fp, "%1023s", cas_count_cfg) <= 0) { - ksft_perror("Could not get iMC cas count read"); - fclose(fp); + fp = fopen(imc_counter_type, "r"); + if (!fp) { + ksft_perror("Failed to open iMC counter type file"); return -1; } + ret = fscanf(fp, "%u", &type); fclose(fp); - - get_read_event_and_umask(cas_count_cfg, count); + if (ret <= 0) { + ksft_perror("Could not get iMC type"); + return -1; + } + ret = parse_imc_read_bw_events(imc_dir, type, count); + if (ret) { + ksft_print_msg("Unable to parse bandwidth event and umask\n"); + return ret; + } return 0; } @@ -197,13 +265,12 @@ static int num_of_imcs(void) if (temp[0] >= '0' && temp[0] <= '9') { sprintf(imc_dir, "%s/%s/", DYN_PMU_PATH, ep->d_name); - ret = read_from_imc_dir(imc_dir, count); + ret = read_from_imc_dir(imc_dir, &count); if (ret) { closedir(dp); return ret; } - count++; } } closedir(dp); @@ -312,23 +379,23 @@ static int get_read_mem_bw_imc(float *bw_imc) * Take overflow into consideration before calculating total bandwidth. 
*/ for (imc = 0; imc < imcs; imc++) { + struct membw_read_format measurement; struct imc_counter_config *r = &imc_counters_config[imc]; - if (read(r->fd, &r->return_value, - sizeof(struct membw_read_format)) == -1) { + if (read(r->fd, &measurement, sizeof(measurement)) == -1) { ksft_perror("Couldn't get read bandwidth through iMC"); return -1; } - __u64 r_time_enabled = r->return_value.time_enabled; - __u64 r_time_running = r->return_value.time_running; + __u64 r_time_enabled = measurement.time_enabled; + __u64 r_time_running = measurement.time_running; if (r_time_enabled != r_time_running) of_mul_read = (float)r_time_enabled / (float)r_time_running; - reads += r->return_value.value * of_mul_read * SCALE; + reads += measurement.value * of_mul_read * SCALE; } *bw_imc = reads; @@ -569,7 +636,7 @@ int resctrl_val(const struct resctrl_test *test, goto reset_affinity; if (param->init) { - ret = param->init(param, domain_id); + ret = param->init(test, uparams, param, domain_id); if (ret) goto reset_affinity; } diff --git a/tools/testing/selftests/riscv/abi/Makefile b/tools/testing/selftests/riscv/abi/Makefile index ed82ff9c664e..041114675ad5 100644 --- a/tools/testing/selftests/riscv/abi/Makefile +++ b/tools/testing/selftests/riscv/abi/Makefile @@ -7,4 +7,4 @@ TEST_GEN_PROGS := pointer_masking include ../../lib.mk $(OUTPUT)/pointer_masking: pointer_masking.c - $(CC) -static -o$@ $(CFLAGS) $(LDFLAGS) $^ + $(CC) -static -o $@ $(CFLAGS) $(LDFLAGS) $^ diff --git a/tools/testing/selftests/riscv/cfi/Makefile b/tools/testing/selftests/riscv/cfi/Makefile index 93b4738c0e2e..418b4b5325a5 100644 --- a/tools/testing/selftests/riscv/cfi/Makefile +++ b/tools/testing/selftests/riscv/cfi/Makefile @@ -16,7 +16,7 @@ ifeq ($(shell $(CC) $(CFLAGS) -nostdlib -xc /dev/null -o /dev/null > /dev/null 2 TEST_GEN_PROGS := cfitests $(OUTPUT)/cfitests: cfitests.c shadowstack.c - $(CC) -o$@ $(CFLAGS) $(LDFLAGS) $^ + $(CC) -o $@ $(CFLAGS) $(LDFLAGS) $^ else $(shell echo "Toolchain doesn't support 
CFI, skipping CFI kselftest." >&2) diff --git a/tools/testing/selftests/riscv/cfi/cfitests.c b/tools/testing/selftests/riscv/cfi/cfitests.c index 39d097b6881f..0e3943461e7d 100644 --- a/tools/testing/selftests/riscv/cfi/cfitests.c +++ b/tools/testing/selftests/riscv/cfi/cfitests.c @@ -141,6 +141,12 @@ int main(int argc, char *argv[]) ksft_print_msg("Starting risc-v tests\n"); + /* Test unknown PR_CFI bits */ + ret = my_syscall5(__NR_prctl, PR_SET_CFI, PR_CFI_BRANCH_LANDING_PADS, + PR_CFI_ENABLE | 0xffff0, 0, 0); + if (!ret) + ksft_exit_fail_msg("PR_SET_CFI accepted reserved branch landing pad bits\n"); + /* * Landing pad test. Not a lot of kernel changes to support landing * pads for user mode except lighting up a bit in senvcfg via a prctl. diff --git a/tools/testing/selftests/riscv/hwprobe/Makefile b/tools/testing/selftests/riscv/hwprobe/Makefile index cec81610a5f2..71e3f26c541b 100644 --- a/tools/testing/selftests/riscv/hwprobe/Makefile +++ b/tools/testing/selftests/riscv/hwprobe/Makefile @@ -9,10 +9,10 @@ TEST_GEN_PROGS := hwprobe cbo which-cpus include ../../lib.mk $(OUTPUT)/hwprobe: hwprobe.c sys_hwprobe.S - $(CC) -static -o$@ $(CFLAGS) $(LDFLAGS) $^ + $(CC) -static -o $@ $(CFLAGS) $(LDFLAGS) $^ $(OUTPUT)/cbo: cbo.c sys_hwprobe.S - $(CC) -static -o$@ $(CFLAGS) $(LDFLAGS) $^ + $(CC) -static -o $@ $(CFLAGS) $(LDFLAGS) $^ $(OUTPUT)/which-cpus: which-cpus.c sys_hwprobe.S - $(CC) -static -o$@ $(CFLAGS) $(LDFLAGS) $^ + $(CC) -static -o $@ $(CFLAGS) $(LDFLAGS) $^ diff --git a/tools/testing/selftests/riscv/mm/Makefile b/tools/testing/selftests/riscv/mm/Makefile index 4664ed79e20b..24122453e3d0 100644 --- a/tools/testing/selftests/riscv/mm/Makefile +++ b/tools/testing/selftests/riscv/mm/Makefile @@ -12,4 +12,4 @@ TEST_PROGS := run_mmap.sh include ../../lib.mk $(OUTPUT)/mm: mmap_default.c mmap_bottomup.c mmap_tests.h - $(CC) -o$@ $(CFLAGS) $(LDFLAGS) $^ + $(CC) -o $@ $(CFLAGS) $(LDFLAGS) $^ diff --git a/tools/testing/selftests/riscv/sigreturn/Makefile 
b/tools/testing/selftests/riscv/sigreturn/Makefile index eb8bac9279a8..8c77508641f3 100644 --- a/tools/testing/selftests/riscv/sigreturn/Makefile +++ b/tools/testing/selftests/riscv/sigreturn/Makefile @@ -9,4 +9,4 @@ TEST_GEN_PROGS := sigreturn include ../../lib.mk $(OUTPUT)/sigreturn: sigreturn.c - $(CC) -static -o$@ $(CFLAGS) $(LDFLAGS) $^ + $(CC) -static -o $@ $(CFLAGS) $(LDFLAGS) $^ diff --git a/tools/testing/selftests/riscv/vector/Makefile b/tools/testing/selftests/riscv/vector/Makefile index 326dafd739bf..7e0017b3fb8b 100644 --- a/tools/testing/selftests/riscv/vector/Makefile +++ b/tools/testing/selftests/riscv/vector/Makefile @@ -11,29 +11,29 @@ include ../../lib.mk TEST_GEN_OBJ := $(patsubst %.c, $(OUTPUT)/%.o, $(TEST_GEN_LIBS)) $(OUTPUT)/sys_hwprobe.o: ../hwprobe/sys_hwprobe.S - $(CC) -static -c -o$@ $(CFLAGS) $^ + $(CC) -static -c -o $@ $(CFLAGS) $^ $(OUTPUT)/v_helpers.o: v_helpers.c - $(CC) -static -c -o$@ $(CFLAGS) $^ + $(CC) -static -c -o $@ $(CFLAGS) $^ $(OUTPUT)/vstate_prctl: vstate_prctl.c $(OUTPUT)/sys_hwprobe.o $(OUTPUT)/v_helpers.o - $(CC) -static -o$@ $(CFLAGS) $(LDFLAGS) $^ + $(CC) -static -o $@ $(CFLAGS) $(LDFLAGS) $^ $(OUTPUT)/vstate_exec_nolibc: vstate_exec_nolibc.c $(CC) -nostdlib -static -include ../../../../include/nolibc/nolibc.h \ -Wall $(CFLAGS) $(LDFLAGS) $^ -o $@ -lgcc $(OUTPUT)/v_initval: v_initval.c $(OUTPUT)/sys_hwprobe.o $(OUTPUT)/v_helpers.o - $(CC) -static -o$@ $(CFLAGS) $(LDFLAGS) $^ + $(CC) -static -o $@ $(CFLAGS) $(LDFLAGS) $^ $(OUTPUT)/v_exec_initval_nolibc: v_exec_initval_nolibc.c $(CC) -nostdlib -static -include ../../../../include/nolibc/nolibc.h \ -Wall $(CFLAGS) $(LDFLAGS) $^ -o $@ -lgcc $(OUTPUT)/vstate_ptrace: vstate_ptrace.c $(OUTPUT)/sys_hwprobe.o $(OUTPUT)/v_helpers.o - $(CC) -static -o$@ $(CFLAGS) $(LDFLAGS) $^ + $(CC) -static -o $@ $(CFLAGS) $(LDFLAGS) $^ $(OUTPUT)/validate_v_ptrace: validate_v_ptrace.c $(OUTPUT)/sys_hwprobe.o $(OUTPUT)/v_helpers.o - $(CC) -static -o$@ $(CFLAGS) $(LDFLAGS) $^ + $(CC) -static -o 
$@ $(CFLAGS) $(LDFLAGS) $^ EXTRA_CLEAN += $(TEST_GEN_OBJ) diff --git a/tools/testing/selftests/rseq/Makefile b/tools/testing/selftests/rseq/Makefile index 4ef90823b652..50d69e22ee7a 100644 --- a/tools/testing/selftests/rseq/Makefile +++ b/tools/testing/selftests/rseq/Makefile @@ -14,14 +14,20 @@ LDLIBS += -lpthread -ldl # still track changes to header files and depend on shared object. OVERRIDE_TARGETS = 1 -TEST_GEN_PROGS = basic_test basic_percpu_ops_test basic_percpu_ops_mm_cid_test param_test \ - param_test_benchmark param_test_compare_twice param_test_mm_cid \ - param_test_mm_cid_benchmark param_test_mm_cid_compare_twice \ - syscall_errors_test slice_test +TEST_GEN_PROGS = basic_test basic_percpu_ops_test basic_percpu_ops_mm_cid_test \ + param_test_benchmark param_test_mm_cid_benchmark -TEST_GEN_PROGS_EXTENDED = librseq.so +TEST_GEN_PROGS_EXTENDED = librseq.so \ + param_test \ + param_test_compare_twice \ + param_test_mm_cid \ + param_test_mm_cid_compare_twice \ + syscall_errors_test \ + legacy_check \ + slice_test \ + check_optimized -TEST_PROGS = run_param_test.sh run_syscall_errors_test.sh +TEST_PROGS = run_param_test.sh run_syscall_errors_test.sh run_legacy_check.sh run_timeslice_test.sh TEST_FILES := settings @@ -62,3 +68,6 @@ $(OUTPUT)/syscall_errors_test: syscall_errors_test.c $(TEST_GEN_PROGS_EXTENDED) $(OUTPUT)/slice_test: slice_test.c $(TEST_GEN_PROGS_EXTENDED) rseq.h rseq-*.h $(CC) $(CFLAGS) $< $(LDLIBS) -lrseq -o $@ + +$(OUTPUT)/check_optimized: check_optimized.c $(TEST_GEN_PROGS_EXTENDED) rseq.h rseq-*.h + $(CC) $(CFLAGS) $< $(LDLIBS) -lrseq -o $@ diff --git a/tools/testing/selftests/rseq/check_optimized.c b/tools/testing/selftests/rseq/check_optimized.c new file mode 100644 index 000000000000..a13e3f2c8fc6 --- /dev/null +++ b/tools/testing/selftests/rseq/check_optimized.c @@ -0,0 +1,17 @@ +// SPDX-License-Identifier: LGPL-2.1 +#define _GNU_SOURCE +#include <assert.h> +#include <sched.h> +#include <signal.h> +#include <stdio.h> +#include <string.h> 
+#include <sys/time.h> + +#include "rseq.h" + +int main(int argc, char **argv) +{ + if (__rseq_register_current_thread(true, false)) + return -1; + return 0; +} diff --git a/tools/testing/selftests/rseq/config b/tools/testing/selftests/rseq/config new file mode 100644 index 000000000000..a64608043ace --- /dev/null +++ b/tools/testing/selftests/rseq/config @@ -0,0 +1,3 @@ +CONFIG_EXPERT=y +CONFIG_RSEQ=y +CONFIG_RSEQ_SLICE_EXTENSION=y diff --git a/tools/testing/selftests/rseq/legacy_check.c b/tools/testing/selftests/rseq/legacy_check.c new file mode 100644 index 000000000000..3f7de4e28303 --- /dev/null +++ b/tools/testing/selftests/rseq/legacy_check.c @@ -0,0 +1,65 @@ +// SPDX-License-Identifier: GPL-2.0 +#ifndef _GNU_SOURCE +#define _GNU_SOURCE +#endif + +#include <errno.h> +#include <signal.h> +#include <stdint.h> +#include <unistd.h> + +#include "rseq.h" + +#include "../kselftest_harness.h" + +FIXTURE(legacy) +{ +}; + +static int cpu_id_in_sigfn = -1; + +static void sigfn(int sig) +{ + struct rseq_abi *rs = rseq_get_abi(); + + cpu_id_in_sigfn = rs->cpu_id_start; +} + +FIXTURE_SETUP(legacy) +{ + int res = __rseq_register_current_thread(true, true); + + switch (res) { + case -ENOSYS: + SKIP(return, "RSEQ not enabled\n"); + case -EBUSY: + SKIP(return, "GLIBC owns RSEQ. 
Disable GLIBC RSEQ registration\n"); + default: + ASSERT_EQ(res, 0); + } + + ASSERT_NE(signal(SIGUSR1, sigfn), SIG_ERR); +} + +FIXTURE_TEARDOWN(legacy) +{ +} + +TEST_F(legacy, legacy_test) +{ + struct rseq_abi *rs = rseq_get_abi(); + + ASSERT_NE(rs, NULL); + + /* Overwrite rs::cpu_id_start */ + rs->cpu_id_start = -1; + sleep(1); + ASSERT_NE(rs->cpu_id_start, -1); + + rs->cpu_id_start = -1; + ASSERT_EQ(raise(SIGUSR1), 0); + ASSERT_NE(rs->cpu_id_start, -1); + ASSERT_NE(cpu_id_in_sigfn, -1); +} + +TEST_HARNESS_MAIN diff --git a/tools/testing/selftests/rseq/param_test.c b/tools/testing/selftests/rseq/param_test.c index 05d03e679e06..e1e98dbabe4b 100644 --- a/tools/testing/selftests/rseq/param_test.c +++ b/tools/testing/selftests/rseq/param_test.c @@ -38,7 +38,7 @@ static int opt_modulo, verbose; static int opt_yield, opt_signal, opt_sleep, opt_disable_rseq, opt_threads = 200, opt_disable_mod = 0, opt_test = 's'; - +static bool opt_rseq_legacy; static long long opt_reps = 5000; static __thread __attribute__((tls_model("initial-exec"))) @@ -281,9 +281,12 @@ unsigned int yield_mod_cnt, nr_abort; } \ } +#define rseq_no_glibc true + #else #define printf_verbose(fmt, ...) 
+#define rseq_no_glibc false #endif /* BENCHMARK */ @@ -481,7 +484,7 @@ void *test_percpu_spinlock_thread(void *arg) long long i, reps; if (!opt_disable_rseq && thread_data->reg && - rseq_register_current_thread()) + __rseq_register_current_thread(rseq_no_glibc, opt_rseq_legacy)) abort(); reps = thread_data->reps; for (i = 0; i < reps; i++) { @@ -558,7 +561,7 @@ void *test_percpu_inc_thread(void *arg) long long i, reps; if (!opt_disable_rseq && thread_data->reg && - rseq_register_current_thread()) + __rseq_register_current_thread(rseq_no_glibc, opt_rseq_legacy)) abort(); reps = thread_data->reps; for (i = 0; i < reps; i++) { @@ -712,7 +715,7 @@ void *test_percpu_list_thread(void *arg) long long i, reps; struct percpu_list *list = (struct percpu_list *)arg; - if (!opt_disable_rseq && rseq_register_current_thread()) + if (!opt_disable_rseq && __rseq_register_current_thread(rseq_no_glibc, opt_rseq_legacy)) abort(); reps = opt_reps; @@ -895,7 +898,7 @@ void *test_percpu_buffer_thread(void *arg) long long i, reps; struct percpu_buffer *buffer = (struct percpu_buffer *)arg; - if (!opt_disable_rseq && rseq_register_current_thread()) + if (!opt_disable_rseq && __rseq_register_current_thread(rseq_no_glibc, opt_rseq_legacy)) abort(); reps = opt_reps; @@ -1105,7 +1108,7 @@ void *test_percpu_memcpy_buffer_thread(void *arg) long long i, reps; struct percpu_memcpy_buffer *buffer = (struct percpu_memcpy_buffer *)arg; - if (!opt_disable_rseq && rseq_register_current_thread()) + if (!opt_disable_rseq && __rseq_register_current_thread(rseq_no_glibc, opt_rseq_legacy)) abort(); reps = opt_reps; @@ -1258,7 +1261,7 @@ void *test_membarrier_worker_thread(void *arg) const int iters = opt_reps; int i; - if (rseq_register_current_thread()) { + if (__rseq_register_current_thread(rseq_no_glibc, opt_rseq_legacy)) { fprintf(stderr, "Error: rseq_register_current_thread(...) 
failed(%d): %s\n", errno, strerror(errno)); abort(); @@ -1323,7 +1326,7 @@ void *test_membarrier_manager_thread(void *arg) intptr_t expect_a = 0, expect_b = 0; int cpu_a = 0, cpu_b = 0; - if (rseq_register_current_thread()) { + if (__rseq_register_current_thread(rseq_no_glibc, opt_rseq_legacy)) { fprintf(stderr, "Error: rseq_register_current_thread(...) failed(%d): %s\n", errno, strerror(errno)); abort(); @@ -1475,6 +1478,7 @@ static void show_usage(int argc, char **argv) printf(" [-D M] Disable rseq for each M threads\n"); printf(" [-T test] Choose test: (s)pinlock, (l)ist, (b)uffer, (m)emcpy, (i)ncrement, membarrie(r)\n"); printf(" [-M] Push into buffer and memcpy buffer with memory barriers.\n"); + printf(" [-O] Test with optimized RSEQ\n"); printf(" [-v] Verbose output.\n"); printf(" [-h] Show this help.\n"); printf("\n"); @@ -1602,6 +1606,9 @@ int main(int argc, char **argv) case 'M': opt_mo = RSEQ_MO_RELEASE; break; + case 'L': + opt_rseq_legacy = true; + break; default: show_usage(argc, argv); goto error; @@ -1618,7 +1625,7 @@ int main(int argc, char **argv) if (set_signal_handler()) goto error; - if (!opt_disable_rseq && rseq_register_current_thread()) + if (!opt_disable_rseq && __rseq_register_current_thread(rseq_no_glibc, opt_rseq_legacy)) goto error; if (!opt_disable_rseq && !rseq_validate_cpu_id()) { fprintf(stderr, "Error: cpu id getter unavailable\n"); diff --git a/tools/testing/selftests/rseq/rseq-abi.h b/tools/testing/selftests/rseq/rseq-abi.h index ecef315204b2..5f4ea2152c2f 100644 --- a/tools/testing/selftests/rseq/rseq-abi.h +++ b/tools/testing/selftests/rseq/rseq-abi.h @@ -192,9 +192,14 @@ struct rseq_abi { struct rseq_abi_slice_ctrl slice_ctrl; /* + * Place holder to push the size above 32 bytes. + */ + __u8 __reserved; + + /* * Flexible array member at end of structure, after last feature field. 
*/ char end[]; -} __attribute__((aligned(4 * sizeof(__u64)))); +} __attribute__((aligned(256))); #endif /* _RSEQ_ABI_H */ diff --git a/tools/testing/selftests/rseq/rseq.c b/tools/testing/selftests/rseq/rseq.c index a736727b83c1..be0d0a97031e 100644 --- a/tools/testing/selftests/rseq/rseq.c +++ b/tools/testing/selftests/rseq/rseq.c @@ -56,6 +56,7 @@ ptrdiff_t rseq_offset; * unsuccessful. */ unsigned int rseq_size = -1U; +static unsigned int rseq_alloc_size; /* Flags used during rseq registration. */ unsigned int rseq_flags; @@ -115,29 +116,17 @@ bool rseq_available(void) } } -/* The rseq areas need to be at least 32 bytes. */ -static -unsigned int get_rseq_min_alloc_size(void) -{ - unsigned int alloc_size = rseq_size; - - if (alloc_size < ORIG_RSEQ_ALLOC_SIZE) - alloc_size = ORIG_RSEQ_ALLOC_SIZE; - return alloc_size; -} - /* * Return the feature size supported by the kernel. * * Depending on the value returned by getauxval(AT_RSEQ_FEATURE_SIZE): * - * 0: Return ORIG_RSEQ_FEATURE_SIZE (20) + * 0: Return ORIG_RSEQ_FEATURE_SIZE (20) * > 0: Return the value from getauxval(AT_RSEQ_FEATURE_SIZE). * * It should never return a value below ORIG_RSEQ_FEATURE_SIZE. */ -static -unsigned int get_rseq_kernel_feature_size(void) +static unsigned int get_rseq_kernel_feature_size(void) { unsigned long auxv_rseq_feature_size, auxv_rseq_align; @@ -152,15 +141,24 @@ unsigned int get_rseq_kernel_feature_size(void) return ORIG_RSEQ_FEATURE_SIZE; } -int rseq_register_current_thread(void) +int __rseq_register_current_thread(bool nolibc, bool legacy) { + unsigned int size; int rc; if (!rseq_ownership) { /* Treat libc's ownership as a successful registration. */ - return 0; + return nolibc ? 
-EBUSY : 0; } - rc = sys_rseq(&__rseq.abi, get_rseq_min_alloc_size(), 0, RSEQ_SIG); + + /* The minimal allocation size is 32, which is the legacy allocation size */ + size = get_rseq_kernel_feature_size(); + if (legacy || size < ORIG_RSEQ_ALLOC_SIZE) + rseq_alloc_size = ORIG_RSEQ_ALLOC_SIZE; + else + rseq_alloc_size = size; + + rc = sys_rseq(&__rseq.abi, rseq_alloc_size, 0, RSEQ_SIG); if (rc) { /* * After at least one thread has registered successfully @@ -179,9 +177,8 @@ int rseq_register_current_thread(void) * The first thread to register sets the rseq_size to mimic the libc * behavior. */ - if (RSEQ_READ_ONCE(rseq_size) == 0) { - RSEQ_WRITE_ONCE(rseq_size, get_rseq_kernel_feature_size()); - } + if (RSEQ_READ_ONCE(rseq_size) == 0) + RSEQ_WRITE_ONCE(rseq_size, size); return 0; } @@ -194,7 +191,7 @@ int rseq_unregister_current_thread(void) /* Treat libc's ownership as a successful unregistration. */ return 0; } - rc = sys_rseq(&__rseq.abi, get_rseq_min_alloc_size(), RSEQ_ABI_FLAG_UNREGISTER, RSEQ_SIG); + rc = sys_rseq(&__rseq.abi, rseq_alloc_size, RSEQ_ABI_FLAG_UNREGISTER, RSEQ_SIG); if (rc) return -1; return 0; diff --git a/tools/testing/selftests/rseq/rseq.h b/tools/testing/selftests/rseq/rseq.h index f51a5fdb0444..c62ebb9290c0 100644 --- a/tools/testing/selftests/rseq/rseq.h +++ b/tools/testing/selftests/rseq/rseq.h @@ -8,6 +8,7 @@ #ifndef RSEQ_H #define RSEQ_H +#include <assert.h> #include <stdint.h> #include <stdbool.h> #include <pthread.h> @@ -142,7 +143,12 @@ static inline struct rseq_abi *rseq_get_abi(void) * succeed. A restartable sequence executed from a non-registered * thread will always fail. */ -int rseq_register_current_thread(void); +int __rseq_register_current_thread(bool nolibc, bool legacy); + +static inline int rseq_register_current_thread(void) +{ + return __rseq_register_current_thread(false, false); +} /* * Unregister rseq for current thread. 
diff --git a/tools/testing/selftests/rseq/run_legacy_check.sh b/tools/testing/selftests/rseq/run_legacy_check.sh new file mode 100755 index 000000000000..5577b46ea092 --- /dev/null +++ b/tools/testing/selftests/rseq/run_legacy_check.sh @@ -0,0 +1,4 @@ +#!/bin/bash +# SPDX-License-Identifier: GPL-2.0 + +GLIBC_TUNABLES="${GLIBC_TUNABLES:-}:glibc.pthread.rseq=0" ./legacy_check diff --git a/tools/testing/selftests/rseq/run_param_test.sh b/tools/testing/selftests/rseq/run_param_test.sh index 8d31426ab41f..69a3fa049929 100755 --- a/tools/testing/selftests/rseq/run_param_test.sh +++ b/tools/testing/selftests/rseq/run_param_test.sh @@ -34,6 +34,11 @@ REPS=1000 SLOW_REPS=100 NR_THREADS=$((6*${NR_CPUS})) +# Prevent GLIBC from registering RSEQ so the selftest can run in legacy and +# performance optimized mode. +GLIBC_TUNABLES="${GLIBC_TUNABLES:-}:glibc.pthread.rseq=0" +export GLIBC_TUNABLES + function do_tests() { local i=0 @@ -103,6 +108,40 @@ function inject_blocking() NR_LOOPS= } +echo "Testing in legacy RSEQ mode" +echo "Yield injection (25%)" +inject_blocking -m 4 -y -L + +echo "Yield injection (50%)" +inject_blocking -m 2 -y -L + +echo "Yield injection (100%)" +inject_blocking -m 1 -y -L + +echo "Kill injection (25%)" +inject_blocking -m 4 -k -L + +echo "Kill injection (50%)" +inject_blocking -m 2 -k -L + +echo "Kill injection (100%)" +inject_blocking -m 1 -k -L + +echo "Sleep injection (1ms, 25%)" +inject_blocking -m 4 -s 1 -L + +echo "Sleep injection (1ms, 50%)" +inject_blocking -m 2 -s 1 -L + +echo "Sleep injection (1ms, 100%)" +inject_blocking -m 1 -s 1 -L + +./check_optimized || { + echo "Skipping optimized RSEQ mode test. 
Not supported"; + exit 0 +} + +echo "Testing in optimized RSEQ mode" echo "Yield injection (25%)" inject_blocking -m 4 -y diff --git a/tools/testing/selftests/rseq/run_timeslice_test.sh b/tools/testing/selftests/rseq/run_timeslice_test.sh new file mode 100755 index 000000000000..551ebed71ec6 --- /dev/null +++ b/tools/testing/selftests/rseq/run_timeslice_test.sh @@ -0,0 +1,14 @@ +#!/bin/bash +# SPDX-License-Identifier: GPL-2.0+ + +# Prevent GLIBC from registering RSEQ so the selftest can run in legacy +# and performance optimized mode. +GLIBC_TUNABLES="${GLIBC_TUNABLES:-}:glibc.pthread.rseq=0" +export GLIBC_TUNABLES + +./check_optimized || { + echo "Skipping optimized RSEQ mode test. Not supported"; + exit 0 +} + +./slice_test diff --git a/tools/testing/selftests/rseq/slice_test.c b/tools/testing/selftests/rseq/slice_test.c index 357122dcb487..e402d4440bc2 100644 --- a/tools/testing/selftests/rseq/slice_test.c +++ b/tools/testing/selftests/rseq/slice_test.c @@ -124,6 +124,13 @@ FIXTURE_SETUP(slice_ext) { cpu_set_t affinity; + if (__rseq_register_current_thread(true, false)) + SKIP(return, "RSEQ not supported\n"); + + if (prctl(PR_RSEQ_SLICE_EXTENSION, PR_RSEQ_SLICE_EXTENSION_SET, + PR_RSEQ_SLICE_EXT_ENABLE, 0, 0)) + SKIP(return, "Time slice extension not supported\n"); + ASSERT_EQ(sched_getaffinity(0, sizeof(affinity), &affinity), 0); /* Pin it on a single CPU. 
Avoid CPU 0 */ @@ -137,11 +144,6 @@ FIXTURE_SETUP(slice_ext) break; } - ASSERT_EQ(rseq_register_current_thread(), 0); - - ASSERT_EQ(prctl(PR_RSEQ_SLICE_EXTENSION, PR_RSEQ_SLICE_EXTENSION_SET, - PR_RSEQ_SLICE_EXT_ENABLE, 0, 0), 0); - self->noise_params.noise_nsecs = variant->noise_nsecs; self->noise_params.sleep_nsecs = variant->sleep_nsecs; self->noise_params.run = 1; diff --git a/tools/testing/selftests/sched_ext/dequeue.c b/tools/testing/selftests/sched_ext/dequeue.c index 4e93262703ca..383d06e972a4 100644 --- a/tools/testing/selftests/sched_ext/dequeue.c +++ b/tools/testing/selftests/sched_ext/dequeue.c @@ -33,6 +33,7 @@ static void worker_fn(int id) /* Do some work to trigger scheduling events */ for (j = 0; j < 10000; j++) sum += j; + asm volatile("" : : "r"(sum)); /* Sleep to trigger dequeue */ usleep(1000 + (id * 100)); diff --git a/tools/testing/selftests/sched_ext/non_scx_kfunc_deny.bpf.c b/tools/testing/selftests/sched_ext/non_scx_kfunc_deny.bpf.c index 9f16d39255e7..0d6fcc8e5eb6 100644 --- a/tools/testing/selftests/sched_ext/non_scx_kfunc_deny.bpf.c +++ b/tools/testing/selftests/sched_ext/non_scx_kfunc_deny.bpf.c @@ -9,12 +9,7 @@ * Copyright (C) 2026 Cheng-Yang Chou <yphbchou0911@gmail.com> */ -#include <vmlinux.h> -#include <bpf/bpf_helpers.h> -#include <bpf/bpf_tracing.h> - -/* SCX kfunc from scx_kfunc_ids_any set */ -void scx_bpf_kick_cpu(s32 cpu, u64 flags) __ksym; +#include <scx/common.bpf.h> SEC("struct_ops/ssthresh") __u32 BPF_PROG(tcp_ca_ssthresh, struct sock *sk) diff --git a/tools/testing/selftests/sched_ext/peek_dsq.bpf.c b/tools/testing/selftests/sched_ext/peek_dsq.bpf.c index 7f23fb17b1e0..9e802b52b29e 100644 --- a/tools/testing/selftests/sched_ext/peek_dsq.bpf.c +++ b/tools/testing/selftests/sched_ext/peek_dsq.bpf.c @@ -95,7 +95,7 @@ static int scan_dsq_pool(void) record_peek_result(task->pid); /* Try to move this task to local */ - if (!moved && scx_bpf_dsq_move_to_local(dsq_id, 0) == 0) { + if (!moved && scx_bpf_dsq_move_to_local(dsq_id, 
0)) { moved = 1; break; } diff --git a/tools/testing/selftests/sched_ext/select_cpu_dfl.c b/tools/testing/selftests/sched_ext/select_cpu_dfl.c index 5b6e045e1109..7e342c0cec65 100644 --- a/tools/testing/selftests/sched_ext/select_cpu_dfl.c +++ b/tools/testing/selftests/sched_ext/select_cpu_dfl.c @@ -6,6 +6,7 @@ */ #include <bpf/bpf.h> #include <scx/common.h> +#include <stdlib.h> #include <sys/wait.h> #include <unistd.h> #include "select_cpu_dfl.bpf.skel.h" @@ -13,29 +14,44 @@ #define NUM_CHILDREN 1028 +struct select_cpu_dfl_ctx { + struct select_cpu_dfl *skel; + struct bpf_link *link; +}; + static enum scx_test_status setup(void **ctx) { - struct select_cpu_dfl *skel; + struct select_cpu_dfl_ctx *tctx; + + tctx = malloc(sizeof(*tctx)); + SCX_FAIL_IF(!tctx, "Failed to allocate test context"); + tctx->link = NULL; - skel = select_cpu_dfl__open(); - SCX_FAIL_IF(!skel, "Failed to open"); - SCX_ENUM_INIT(skel); - SCX_FAIL_IF(select_cpu_dfl__load(skel), "Failed to load skel"); + tctx->skel = select_cpu_dfl__open(); + if (!tctx->skel) { + free(tctx); + SCX_FAIL("Failed to open"); + } + SCX_ENUM_INIT(tctx->skel); + if (select_cpu_dfl__load(tctx->skel)) { + select_cpu_dfl__destroy(tctx->skel); + free(tctx); + SCX_FAIL("Failed to load skel"); + } - *ctx = skel; + *ctx = tctx; return SCX_TEST_PASS; } static enum scx_test_status run(void *ctx) { - struct select_cpu_dfl *skel = ctx; - struct bpf_link *link; + struct select_cpu_dfl_ctx *tctx = ctx; pid_t pids[NUM_CHILDREN]; - int i, status; + int i, status, nforked = 0; - link = bpf_map__attach_struct_ops(skel->maps.select_cpu_dfl_ops); - SCX_FAIL_IF(!link, "Failed to attach scheduler"); + tctx->link = bpf_map__attach_struct_ops(tctx->skel->maps.select_cpu_dfl_ops); + SCX_FAIL_IF(!tctx->link, "Failed to attach scheduler"); for (i = 0; i < NUM_CHILDREN; i++) { pids[i] = fork(); @@ -43,25 +59,31 @@ static enum scx_test_status run(void *ctx) sleep(1); exit(0); } + if (pids[i] > 0) + nforked++; } for (i = 0; i < NUM_CHILDREN; i++) { 
+ if (pids[i] <= 0) + continue; SCX_EQ(waitpid(pids[i], &status, 0), pids[i]); SCX_EQ(status, 0); } - SCX_ASSERT(!skel->bss->saw_local); - - bpf_link__destroy(link); + SCX_GT(nforked, 0); + SCX_ASSERT(!tctx->skel->bss->saw_local); return SCX_TEST_PASS; } static void cleanup(void *ctx) { - struct select_cpu_dfl *skel = ctx; + struct select_cpu_dfl_ctx *tctx = ctx; - select_cpu_dfl__destroy(skel); + if (tctx->link) + bpf_link__destroy(tctx->link); + select_cpu_dfl__destroy(tctx->skel); + free(tctx); } struct scx_test select_cpu_dfl = { diff --git a/tools/testing/selftests/sched_ext/total_bw.c b/tools/testing/selftests/sched_ext/total_bw.c index 5b0a619bab86..2af01cee90cc 100644 --- a/tools/testing/selftests/sched_ext/total_bw.c +++ b/tools/testing/selftests/sched_ext/total_bw.c @@ -100,6 +100,98 @@ static int read_total_bw_values(long *bw_values, int max_cpus) return cpu_count; } +/* + * Read a per-CPU dl_server param (runtime or period) from debugfs. + * Returns the value in nanoseconds, or -1 on failure. + */ +static long read_server_param(const char *server, const char *param, int cpu) +{ + char path[128]; + long value = -1; + FILE *fp; + + snprintf(path, sizeof(path), + "/sys/kernel/debug/sched/%s_server/cpu%d/%s", + server, cpu, param); + fp = fopen(path, "r"); + if (!fp) + return -1; + if (fscanf(fp, "%ld", &value) != 1) + value = -1; + fclose(fp); + + return value; +} + +/* + * Write a per-CPU dl_server param to debugfs. Returns 0 on success. 
+ */ +static int write_server_param(const char *server, const char *param, + int cpu, long value) +{ + char path[128]; + FILE *fp; + int ret = 0; + + snprintf(path, sizeof(path), + "/sys/kernel/debug/sched/%s_server/cpu%d/%s", + server, cpu, param); + fp = fopen(path, "w"); + if (!fp) + return -1; + if (fprintf(fp, "%ld", value) < 0) + ret = -1; + if (fclose(fp) != 0) + ret = -1; + + return ret; +} + +static int read_fair_runtime_all(int nr_cpus, long *runtimes) +{ + int i; + + for (i = 0; i < nr_cpus; i++) { + runtimes[i] = read_server_param("fair", "runtime", i); + if (runtimes[i] <= 0) + return -1; + } + + return 0; +} + +static int write_fair_runtime_all(int nr_cpus, long value) +{ + int i; + + for (i = 0; i < nr_cpus; i++) { + if (write_server_param("fair", "runtime", i, value) < 0) { + SCX_ERR("Failed to write fair_server runtime on CPU %d", i); + return -1; + } + } + + return 0; +} + +/* + * Restore per-CPU fair_server runtimes. + */ +static int restore_fair_runtime_all(int nr_cpus, const long *runtimes) +{ + int ret = 0; + int i; + + for (i = 0; i < nr_cpus; i++) { + if (write_server_param("fair", "runtime", i, runtimes[i]) < 0) { + SCX_ERR("Failed to restore fair_server runtime on CPU %d", i); + ret = -1; + } + } + + return ret; +} + static bool verify_total_bw_consistency(long *bw_values, int count) { int i; @@ -217,6 +309,9 @@ static enum scx_test_status run(void *ctx) struct bpf_link *link; long loaded_bw[MAX_CPUS]; long unloaded_bw[MAX_CPUS]; + long doubled_bw[MAX_CPUS]; + long original_runtime[MAX_CPUS], doubled_runtime; + enum scx_test_status ret; int i; /* Test scenario 2: BPF program loaded */ @@ -257,7 +352,111 @@ static enum scx_test_status run(void *ctx) } fprintf(stderr, "All total_bw values are consistent across all scenarios\n"); - return SCX_TEST_PASS; + + /* + * Validate auto-register/unregister of dl_server bandwidth reservations. + * + * Doubling fair_server's runtime doubles its bw contribution. 
With a + * full-mode BPF scheduler (minimal_ops), the kernel should detach + * fair_server and attach ext_server, dropping total_bw back to its + * pre-customization (default ext_server-only) value. On unload, the + * fair_server reservation should come back with its customized runtime + * preserved, so total_bw doubles again. + */ + if (read_fair_runtime_all(test_ctx->nr_cpus, original_runtime) < 0) { + fprintf(stderr, "Skipping attach/detach validation: debugfs not accessible\n"); + return SCX_TEST_PASS; + } + doubled_runtime = original_runtime[0] * 2; + + fprintf(stderr, + "Setting fair_server runtime to %ld ns on all CPUs (orig %ld)\n", + doubled_runtime, original_runtime[0]); + + if (write_fair_runtime_all(test_ctx->nr_cpus, doubled_runtime) < 0) { + ret = SCX_TEST_FAIL; + goto restore; + } + + if (fetch_verify_total_bw(doubled_bw, test_ctx->nr_cpus) < 0) { + SCX_ERR("Failed to get stable values after doubling fair runtime"); + ret = SCX_TEST_FAIL; + goto restore; + } + + /* + * After doubling the runtime, fair_server's bw contribution must grow. + * We don't assert exactly 2x, because the kernel's to_ratio() truncates + * the value, so 2 * to_ratio(period, runtime) and + * to_ratio(period, 2 * runtime) can differ. 
+ */ + for (i = 0; i < test_ctx->nr_cpus; i++) { + if (doubled_bw[i] <= test_ctx->baseline_bw[i]) { + SCX_ERR("CPU%d: fair did not increase total_bw (baseline=%ld, doubled=%ld)", + i, test_ctx->baseline_bw[i], doubled_bw[i]); + ret = SCX_TEST_FAIL; + goto restore; + } + } + + link = bpf_map__attach_struct_ops(test_ctx->skel->maps.minimal_ops); + if (!link) { + SCX_ERR("Failed to attach scheduler for detach test"); + ret = SCX_TEST_FAIL; + goto restore; + } + + if (fetch_verify_total_bw(loaded_bw, test_ctx->nr_cpus) < 0) { + SCX_ERR("Failed to get stable values with BPF loaded (detach test)"); + bpf_link__destroy(link); + ret = SCX_TEST_FAIL; + goto restore; + } + + /* + * In full mode the customized fair_server is detached and ext_server is + * attached at its default runtime, total_bw must match baseline. + */ + for (i = 0; i < test_ctx->nr_cpus; i++) { + if (loaded_bw[i] != test_ctx->baseline_bw[i]) { + SCX_ERR("CPU%d: expected bw %ld (fair detached, ext default), got %ld", + i, test_ctx->baseline_bw[i], loaded_bw[i]); + bpf_link__destroy(link); + ret = SCX_TEST_FAIL; + goto restore; + } + } + + bpf_link__destroy(link); + + if (fetch_verify_total_bw(unloaded_bw, test_ctx->nr_cpus) < 0) { + SCX_ERR("Failed to get stable values after BPF unload (detach test)"); + ret = SCX_TEST_FAIL; + goto restore; + } + + /* + * After unload, fair_server is re-attached with its preserved 2x + * runtime, so total_bw should return to the doubled value. 
+ */ + for (i = 0; i < test_ctx->nr_cpus; i++) { + if (unloaded_bw[i] != doubled_bw[i]) { + SCX_ERR("CPU%d: BPF unloaded: expected %ld (fair restored at 2x), got %ld", + i, doubled_bw[i], unloaded_bw[i]); + ret = SCX_TEST_FAIL; + goto restore; + } + } + + fprintf(stderr, + "dl_server attach/detach with customized fair runtime verified\n"); + ret = SCX_TEST_PASS; + +restore: + if (restore_fair_runtime_all(test_ctx->nr_cpus, original_runtime) < 0) + SCX_ERR("Failed to fully restore per-CPU fair_server runtimes"); + + return ret; } static void cleanup(void *ctx) diff --git a/tools/testing/selftests/tc-testing/config b/tools/testing/selftests/tc-testing/config index c20aa16b1d63..0e5618be0335 100644 --- a/tools/testing/selftests/tc-testing/config +++ b/tools/testing/selftests/tc-testing/config @@ -4,6 +4,7 @@ CONFIG_DUMMY=y CONFIG_VETH=y +CONFIG_IFB=y # # Core Netfilter Configuration diff --git a/tools/testing/selftests/tc-testing/plugin-lib/nsPlugin.py b/tools/testing/selftests/tc-testing/plugin-lib/nsPlugin.py index bb19b8b76d3b..0bece7c74f07 100644 --- a/tools/testing/selftests/tc-testing/plugin-lib/nsPlugin.py +++ b/tools/testing/selftests/tc-testing/plugin-lib/nsPlugin.py @@ -120,6 +120,7 @@ class SubPlugin(TdcPlugin): dev0 = self.args.NAMES["DEV0"]; dev1 = self.args.NAMES["DEV1"]; dummy = self.args.NAMES["DUMMY"]; + ifb = self.args.NAMES['IFB'] if self.args.verbose: print('{}._nl_ns_create'.format(self.sub_class)) @@ -129,6 +130,7 @@ class SubPlugin(TdcPlugin): with IPRoute() as ip: ip.link('add', ifname=dev1, kind='veth', peer={'ifname': dev0, 'net_ns_fd':'/proc/1/ns/net'}) ip.link('add', ifname=dummy, kind='dummy') + ip.link('add', ifname=ifb, kind='ifb') ticks = 20 while True: if ticks == 0: @@ -136,8 +138,10 @@ class SubPlugin(TdcPlugin): try: dev1_idx = ip.link_lookup(ifname=dev1)[0] dummy_idx = ip.link_lookup(ifname=dummy)[0] + ifb_idx = ip.link_lookup(ifname=ifb)[0] ip.link('set', index=dev1_idx, state='up') ip.link('set', index=dummy_idx, state='up') + 
ip.link('set', index=ifb_idx, state='up') break except: time.sleep(0.1) @@ -169,8 +173,11 @@ class SubPlugin(TdcPlugin): cmds.append(self._replace_keywords('link set $DEV1 netns {}'.format(ns))) cmds.append(self._replace_keywords('link add $DUMMY type dummy'.format(ns))) cmds.append(self._replace_keywords('link set $DUMMY netns {}'.format(ns))) + cmds.append(self._replace_keywords('link add $IFB type ifb')) + cmds.append(self._replace_keywords('link set $IFB netns {}'.format(ns))) cmds.append(self._replace_keywords('netns exec {} $IP link set $DEV1 up'.format(ns))) cmds.append(self._replace_keywords('netns exec {} $IP link set $DUMMY up'.format(ns))) + cmds.append(self._replace_keywords('netns exec {} $IP link set $IFB up'.format(ns))) cmds.append(self._replace_keywords('link set $DEV0 up'.format(ns))) if self.args.device: diff --git a/tools/testing/selftests/tc-testing/tc-tests/actions/ct.json b/tools/testing/selftests/tc-testing/tc-tests/actions/ct.json index 33bb8f3ff8ed..da65f838bd52 100644 --- a/tools/testing/selftests/tc-testing/tc-tests/actions/ct.json +++ b/tools/testing/selftests/tc-testing/tc-tests/actions/ct.json @@ -664,5 +664,43 @@ "teardown": [ "$TC qdisc del dev $DEV1 ingress_block 21 clsact" ] + }, + { + "id": "9c2a", + "name": "Act_ct preserves skb cb across defrag before prio dequeue", + "category": [ + "actions", + "ct", + "scapy" + ], + "plugins": { + "requires": [ + "nsPlugin", + "scapyPlugin" + ] + }, + "setup": [ + "$TC qdisc add dev $DUMMY root handle 1: prio", + "$TC qdisc add dev $DUMMY clsact", + "$TC qdisc add dev $DEV1 clsact", + "$TC filter add dev $DEV1 ingress protocol ip prio 1 matchall action mirred egress redirect dev $DUMMY" + ], + "cmdUnderTest": "$TC filter add dev $DUMMY egress protocol ip prio 1 matchall action ct zone 1 pipe", + "scapy": [ + { + "iface": "$DEV0", + "count": 1, + "packet": "[Ether()/frag for frag in fragment(IP(src='10.0.0.10', dst='10.0.0.1', id=1)/UDP(sport=12345, dport=9)/Raw(b'A' * 4000), fragsize=1400)]" 
+ } + ], + "expExitCode": "0", + "verifyCmd": "$TC -s qdisc show dev $DUMMY | grep -A 1 '^qdisc prio 1:'", + "matchPattern": "Sent [1-9][0-9]* bytes [1-9][0-9]* pkt", + "matchCount": "1", + "teardown": [ + "$TC qdisc del dev $DEV1 clsact", + "$TC qdisc del dev $DUMMY clsact", + "$TC qdisc del dev $DUMMY root handle 1:" + ] } ] diff --git a/tools/testing/selftests/tc-testing/tc-tests/actions/ife.json b/tools/testing/selftests/tc-testing/tc-tests/actions/ife.json index 808aef4afe22..ece7ec41bf99 100644 --- a/tools/testing/selftests/tc-testing/tc-tests/actions/ife.json +++ b/tools/testing/selftests/tc-testing/tc-tests/actions/ife.json @@ -1378,5 +1378,60 @@ "teardown": [ "$TC actions flush action ife" ] + }, + { + "id": "4e6b", + "name": "Decode IFE packet with truncated inner Ethernet header", + "category": [ + "actions", + "ife" + ], + "plugins": { + "requires": [ + "nsPlugin", + "scapyPlugin" + ] + }, + "setup": [ + [ + "$TC actions flush action ife", + 0, + 1, + 255 + ], + "$TC qdisc add dev $DEV1 clsact" + ], + "scapy": [ + { + "iface": "$DEV0", + "count": 1, + "packet": "Ether(type=0xED3E) / Raw(b'\\x00\\x02\\xaa')" + } + ], + "cmdUnderTest": "$TC filter add dev $DEV1 ingress protocol all matchall action ife decode pipe index 10", + "expExitCode": "0", + "verifyCmd": "$TC -s -j actions get action ife index 10", + "matchJSON": [ + { + "total acts": 0 + }, + { + "actions": [ + { + "kind": "ife", + "mode": "decode", + "index": 10, + "stats": { + "bytes": 3, + "packets": 1, + "drops": 1 + } + } + ] + } + ], + "teardown": [ + "$TC qdisc del dev $DEV1 clsact" + ] } ] diff --git a/tools/testing/selftests/tc-testing/tc-tests/actions/mirred.json b/tools/testing/selftests/tc-testing/tc-tests/actions/mirred.json index b056eb966871..d0cad6571691 100644 --- a/tools/testing/selftests/tc-testing/tc-tests/actions/mirred.json +++ b/tools/testing/selftests/tc-testing/tc-tests/actions/mirred.json @@ -1144,6 +1144,620 @@ "teardown": [ "$TC qdisc del dev $DUMMY clsact" ] + }, + { + 
"id": "531c", + "name": "Redirect multiport: dummy egress -> dev1 ingress -> dummy egress (Loop)", + "category": [ + "filter", + "mirred" + ], + "plugins": { + "requires": [ + "nsPlugin" + ] + }, + "setup": [ + "$IP link set dev $DUMMY up || true", + "$IP addr add 10.10.10.10/24 dev $DUMMY || true", + "$TC qdisc add dev $DUMMY clsact", + "$TC filter add dev $DUMMY egress protocol ip prio 10 matchall action mirred ingress redirect dev $DEV1 index 1", + "$TC qdisc add dev $DEV1 clsact", + "$TC filter add dev $DEV1 ingress protocol ip prio 10 matchall action mirred egress redirect dev $DUMMY index 2" + ], + "cmdUnderTest": "ping -c1 -W0.01 -I $DUMMY 10.10.10.1", + "expExitCode": "1", + "verifyCmd": "$TC -j -s actions get action mirred index 1", + "matchJSON": [ + { + "total acts": 0 + }, + { + "actions": [ + { + "order": 1, + "kind": "mirred", + "mirred_action": "redirect", + "direction": "ingress", + "index": 1, + "stats": { + "packets": 3 + }, + "not_in_hw": true + } + ] + } + ], + "teardown": [ + "$TC qdisc del dev $DUMMY clsact", + "$TC qdisc del dev $DEV1 clsact" + ] + }, + { + "id": "b1d7", + "name": "Redirect singleport: dev1 ingress -> dev1 egress -> dev1 ingress (Loop)", + "category": [ + "filter", + "mirred" + ], + "plugins": { + "requires": [ + "nsPlugin", + "scapyPlugin" + ] + }, + "setup": [ + "$TC qdisc add dev $DEV1 clsact", + "$TC filter add dev $DEV1 ingress protocol ip prio 10 matchall action mirred egress redirect dev $DEV1 index 1" + ], + "cmdUnderTest": "$TC filter add dev $DEV1 egress protocol ip prio 11 matchall action mirred ingress redirect dev $DEV1 index 2", + "scapy": [ + { + "iface": "$DEV0", + "count": 1, + "packet": "Ether()/IP(dst='10.10.10.1', src='10.10.10.10')/ICMP()" + } + ], + "expExitCode": "0", + "verifyCmd": "$TC -j -s actions get action mirred index 1", + "matchJSON": [ + { + "total acts": 0 + }, + { + "actions": [ + { + "order": 1, + "kind": "mirred", + "mirred_action": "redirect", + "direction": "egress", + "index": 1, + 
"stats": { + "packets": 3 + }, + "not_in_hw": true + } + ] + } + ], + "teardown": [ + "$TC qdisc del dev $DEV1 clsact" + ] + }, + { + "id": "c66d", + "name": "Redirect multiport: dev1 ingress -> dummy ingress -> dev1 egress (No Loop)", + "category": [ + "filter", + "mirred" + ], + "plugins": { + "requires": [ + "nsPlugin", + "scapyPlugin" + ] + }, + "setup": [ + "$TC qdisc add dev $DEV1 clsact", + "$TC filter add dev $DEV1 ingress protocol ip prio 10 matchall action mirred ingress redirect dev $DUMMY index 1", + "$TC qdisc add dev $DUMMY clsact" + ], + "cmdUnderTest": "$TC filter add dev $DUMMY ingress protocol ip prio 11 matchall action mirred egress redirect dev $DEV1 index 2", + "scapy": [ + { + "iface": "$DEV0", + "count": 1, + "packet": "Ether()/IP(dst='10.10.10.1', src='10.10.10.10')/ICMP()" + } + ], + "expExitCode": "0", + "verifyCmd": "$TC -j -s actions get action mirred index 1", + "matchJSON": [ + { + "total acts": 0 + }, + { + "actions": [ + { + "order": 1, + "kind": "mirred", + "mirred_action": "redirect", + "direction": "ingress", + "index": 1, + "stats": { + "packets": 1 + }, + "not_in_hw": true + } + ] + } + ], + "teardown": [ + "$TC qdisc del dev $DEV1 clsact", + "$TC qdisc del dev $DUMMY clsact" + ] + }, + { + "id": "aa99", + "name": "Redirect multiport: dev1 ingress -> dummy ingress -> dev1 ingress (Loop)", + "category": [ + "filter", + "mirred" + ], + "plugins": { + "requires": [ + "nsPlugin", + "scapyPlugin" + ] + }, + "setup": [ + "$TC qdisc add dev $DEV1 clsact", + "$TC filter add dev $DEV1 ingress protocol ip prio 10 matchall action mirred ingress redirect dev $DUMMY index 1", + "$TC qdisc add dev $DUMMY clsact" + ], + "cmdUnderTest": "$TC filter add dev $DUMMY ingress protocol ip prio 11 matchall action mirred ingress redirect dev $DEV1 index 2", + "scapy": [ + { + "iface": "$DEV0", + "count": 1, + "packet": "Ether()/IP(dst='10.10.10.1', src='10.10.10.10')/ICMP()" + } + ], + "expExitCode": "0", + "verifyCmd": "$TC -j -s actions get action 
mirred index 1", + "matchJSON": [ + { + "total acts": 0 + }, + { + "actions": [ + { + "order": 1, + "kind": "mirred", + "mirred_action": "redirect", + "direction": "ingress", + "index": 1, + "stats": { + "packets": 2, + "overlimits": 1 + }, + "not_in_hw": true + } + ] + } + ], + "teardown": [ + "$TC qdisc del dev $DEV1 clsact", + "$TC qdisc del dev $DUMMY clsact" + ] + }, + { + "id": "37d7", + "name": "Redirect multiport: dev1 ingress -> dummy egress -> dev1 ingress (Loop)", + "category": [ + "filter", + "mirred" + ], + "plugins": { + "requires": [ + "nsPlugin", + "scapyPlugin" + ] + }, + "setup": [ + "$TC qdisc add dev $DEV1 clsact", + "$TC filter add dev $DEV1 ingress protocol ip prio 10 matchall action mirred egress redirect dev $DUMMY index 1", + "$TC qdisc add dev $DUMMY clsact" + ], + "cmdUnderTest": "$TC filter add dev $DUMMY egress protocol ip prio 11 matchall action mirred ingress redirect dev $DEV1 index 2", + "scapy": [ + { + "iface": "$DEV0", + "count": 1, + "packet": "Ether()/IP(dst='10.10.10.1', src='10.10.10.10')/ICMP()" + } + ], + "expExitCode": "0", + "verifyCmd": "$TC -j -s actions get action mirred index 1", + "matchJSON": [ + { + "total acts": 0 + }, + { + "actions": [ + { + "order": 1, + "kind": "mirred", + "mirred_action": "redirect", + "direction": "egress", + "index": 1, + "stats": { + "packets": 3 + }, + "not_in_hw": true + } + ] + } + ], + "teardown": [ + "$TC qdisc del dev $DEV1 clsact", + "$TC qdisc del dev $DUMMY clsact" + ] + }, + { + "id": "6d02", + "name": "Redirect multiport: dummy egress -> dev1 ingress -> dummy egress, different prios (Loop)", + "category": [ + "filter", + "mirred" + ], + "plugins": { + "requires": [ + "nsPlugin" + ] + }, + "setup": [ + "$IP link set dev $DUMMY up || true", + "$IP addr add 10.10.10.10/24 dev $DUMMY || true", + "$TC qdisc add dev $DUMMY clsact", + "$TC filter add dev $DUMMY egress protocol ip prio 10 matchall action mirred ingress redirect dev $DEV1 index 1", + "$TC qdisc add dev $DEV1 clsact", + 
"$TC filter add dev $DEV1 ingress protocol ip prio 11 matchall action mirred egress redirect dev $DUMMY index 2" + ], + "cmdUnderTest": "ping -c1 -W0.01 -I $DUMMY 10.10.10.1", + "expExitCode": "1", + "verifyCmd": "$TC -j -s actions get action mirred index 1", + "matchJSON": [ + { + "total acts": 0 + }, + { + "actions": [ + { + "order": 1, + "kind": "mirred", + "mirred_action": "redirect", + "direction": "ingress", + "index": 1, + "stats": { + "packets": 3 + }, + "not_in_hw": true + } + ] + } + ], + "teardown": [ + "$TC qdisc del dev $DUMMY clsact", + "$TC qdisc del dev $DEV1 clsact" + ] + }, + { + "id": "8115", + "name": "Redirect multiport: dev1 ingress -> dummy ingress -> dummy egress -> dev1 egress (No Loop)", + "category": [ + "filter", + "mirred" + ], + "plugins": { + "requires": [ + "nsPlugin", + "scapyPlugin" + ] + }, + "setup": [ + "$TC qdisc add dev $DEV1 clsact", + "$TC filter add dev $DEV1 ingress protocol ip prio 10 matchall action mirred ingress redirect dev $DUMMY index 1", + "$TC qdisc add dev $DUMMY clsact", + "$TC filter add dev $DUMMY ingress protocol ip prio 11 matchall action mirred egress redirect dev $DUMMY index 2" + ], + "cmdUnderTest": "$TC filter add dev $DUMMY egress protocol ip prio 12 matchall action mirred egress redirect dev $DEV1 index 3", + "scapy": [ + { + "iface": "$DEV0", + "count": 1, + "packet": "Ether()/IP(dst='10.10.10.1', src='10.10.10.10')/ICMP()" + } + ], + "expExitCode": "0", + "verifyCmd": "$TC -j -s actions get action mirred index 1", + "matchJSON": [ + { + "total acts": 0 + }, + { + "actions": [ + { + "order": 1, + "kind": "mirred", + "mirred_action": "redirect", + "direction": "ingress", + "index": 1, + "stats": { + "packets": 1 + }, + "not_in_hw": true + } + ] + } + ], + "teardown": [ + "$TC qdisc del dev $DEV1 clsact", + "$TC qdisc del dev $DUMMY clsact" + ] + }, + { + "id": "9eb3", + "name": "Redirect multiport: dev1 ingress -> dummy egress -> dev1 egress (No Loop)", + "category": [ + "filter", + "mirred" + ], + 
"plugins": { + "requires": [ + "nsPlugin", + "scapyPlugin" + ] + }, + "setup": [ + "$TC qdisc add dev $DEV1 clsact", + "$TC filter add dev $DEV1 ingress protocol ip prio 10 matchall action mirred egress redirect dev $DUMMY index 1", + "$TC qdisc add dev $DUMMY clsact" + ], + "cmdUnderTest": "$TC filter add dev $DUMMY egress protocol ip prio 11 matchall action mirred egress redirect dev $DEV1 index 2", + "scapy": [ + { + "iface": "$DEV0", + "count": 1, + "packet": "Ether()/IP(dst='10.10.10.1', src='10.10.10.10')/ICMP()" + } + ], + "expExitCode": "0", + "verifyCmd": "$TC -j -s actions get action mirred index 1", + "matchJSON": [ + { + "total acts": 0 + }, + { + "actions": [ + { + "order": 1, + "kind": "mirred", + "mirred_action": "redirect", + "direction": "egress", + "index": 1, + "stats": { + "packets": 1 + }, + "not_in_hw": true + } + ] + } + ], + "teardown": [ + "$TC qdisc del dev $DEV1 clsact", + "$TC qdisc del dev $DUMMY clsact" + ] + }, + { + "id": "d837", + "name": "Redirect multiport: dev1 ingress -> dummy egress -> dummy ingress (No Loop)", + "category": [ + "filter", + "mirred" + ], + "plugins": { + "requires": [ + "nsPlugin", + "scapyPlugin" + ] + }, + "setup": [ + "$TC qdisc add dev $DEV1 clsact", + "$TC filter add dev $DEV1 ingress protocol ip prio 10 matchall action mirred egress redirect dev $DUMMY index 1", + "$TC qdisc add dev $DUMMY clsact" + ], + "cmdUnderTest": "$TC filter add dev $DUMMY egress protocol ip prio 11 matchall action mirred ingress redirect dev $DUMMY index 2", + "scapy": [ + { + "iface": "$DEV0", + "count": 1, + "packet": "Ether()/IP(dst='10.10.10.1', src='10.10.10.10')/ICMP()" + } + ], + "expExitCode": "0", + "verifyCmd": "$TC -j -s actions get action mirred index 1", + "matchJSON": [ + { + "total acts": 0 + }, + { + "actions": [ + { + "order": 1, + "kind": "mirred", + "mirred_action": "redirect", + "direction": "egress", + "index": 1, + "stats": { + "packets": 1 + }, + "not_in_hw": true + } + ] + } + ], + "teardown": [ + "$TC 
qdisc del dev $DEV1 clsact", + "$TC qdisc del dev $DUMMY clsact" + ] + }, + { + "id": "2071", + "name": "Redirect singleport: dev1 ingress -> dev1 ingress (Loop)", + "category": [ + "filter", + "mirred" + ], + "plugins": { + "requires": [ + "nsPlugin", + "scapyPlugin" + ] + }, + "setup": [ + "$TC qdisc add dev $DEV1 clsact" + ], + "cmdUnderTest": "$TC filter add dev $DEV1 ingress protocol ip prio 10 matchall action mirred ingress redirect dev $DEV1 index 1", + "scapy": [ + { + "iface": "$DEV0", + "count": 1, + "packet": "Ether()/IP(dst='10.10.10.1', src='10.10.10.10')/ICMP()" + } + ], + "expExitCode": "0", + "verifyCmd": "$TC -j -s actions get action mirred index 1", + "matchJSON": [ + { + "total acts": 0 + }, + { + "actions": [ + { + "order": 1, + "kind": "mirred", + "mirred_action": "redirect", + "direction": "ingress", + "index": 1, + "stats": { + "packets": 1, + "overlimits": 1 + }, + "not_in_hw": true + } + ] + } + ], + "teardown": [ + "$TC qdisc del dev $DEV1 clsact" + ] + }, + { + "id": "0101", + "name": "Redirect singleport: dummy egress -> dummy ingress (No Loop)", + "category": [ + "filter", + "mirred" + ], + "plugins": { + "requires": [ + "nsPlugin" + ] + }, + "setup": [ + "$IP addr add 10.10.10.10/24 dev $DUMMY || true", + "$TC qdisc add dev $DUMMY clsact", + "$TC filter add dev $DUMMY egress protocol ip prio 11 matchall action mirred ingress redirect dev $DUMMY index 1" + ], + "cmdUnderTest": "ping -c1 -W0.01 -I $DUMMY 10.10.10.1", + "expExitCode": "1", + "verifyCmd": "$TC -j -s actions get action mirred index 1", + "matchJSON": [ + { + "total acts": 0 + }, + { + "actions": [ + { + "order": 1, + "kind": "mirred", + "mirred_action": "redirect", + "direction": "ingress", + "index": 1, + "stats": { + "packets": 1 + }, + "not_in_hw": true + } + ] + } + ], + "teardown": [ + "$TC qdisc del dev $DUMMY clsact" + ] + }, + { + "id": "cf97", + "name": "Redirect multiport: dev1 ingress -> dummy ingress -> dummy egress (No Loop)", + "category": [ + "filter", + 
"mirred" + ], + "plugins": { + "requires": [ + "nsPlugin", + "scapyPlugin" + ] + }, + "setup": [ + "$TC qdisc add dev $DEV1 clsact", + "$TC filter add dev $DEV1 ingress protocol ip prio 10 matchall action mirred ingress redirect dev $DUMMY index 1", + "$TC qdisc add dev $DUMMY clsact" + ], + "cmdUnderTest": "$TC filter add dev $DUMMY ingress protocol ip prio 11 matchall action mirred egress redirect dev $DUMMY index 2", + "scapy": [ + { + "iface": "$DEV0", + "count": 1, + "packet": "Ether()/IP(dst='10.10.10.1', src='10.10.10.10')/ICMP()" + } + ], + "expExitCode": "0", + "verifyCmd": "$TC -j -s actions get action mirred index 1", + "matchJSON": [ + { + "total acts": 0 + }, + { + "actions": [ + { + "order": 1, + "kind": "mirred", + "mirred_action": "redirect", + "direction": "ingress", + "index": 1, + "stats": { + "packets": 1 + }, + "not_in_hw": true + } + ] + } + ], + "teardown": [ + "$TC qdisc del dev $DEV1 clsact", + "$TC qdisc del dev $DUMMY clsact" + ] } - ] diff --git a/tools/testing/selftests/tc-testing/tc-tests/actions/pedit.json b/tools/testing/selftests/tc-testing/tc-tests/actions/pedit.json index 37c410332174..d8b685cfc62d 100644 --- a/tools/testing/selftests/tc-testing/tc-tests/actions/pedit.json +++ b/tools/testing/selftests/tc-testing/tc-tests/actions/pedit.json @@ -1920,5 +1920,54 @@ "teardown": [ "$TC actions flush action pedit" ] + }, + { + "id": "1a4f", + "name": "Pedit udp dport should not mangle TCP packet dport", + "category": [ + "actions", + "pedit" + ], + "plugins": { + "requires": [ + "nsPlugin", + "scapyPlugin" + ] + }, + "setup": [ + "$TC qdisc add dev $DEV1 clsact", + "$TC filter add dev $DEV1 ingress protocol ip pref 1 matchall action pedit ex munge udp dport set 18053 continue" + ], + "cmdUnderTest": "$TC filter add dev $DEV1 ingress protocol ip pref 2 flower ip_proto tcp dst_port 2222 action drop index 1", + "scapy": { + "iface": "$DEV0", + "count": 1, + "packet": "Ether()/IP(dst='10.10.10.1')/TCP(dport=2222)" + }, + "expExitCode": 
"0", + "verifyCmd": "$TC -j -s actions get action gact index 1", + "matchJSON": [ + { + "total acts": 0 + }, + { + "actions": [ + { + "order": 1, + "kind": "gact", + "control_action": { + "type": "drop" + }, + "index": 1, + "stats": { + "packets": 1 + } + } + ] + } + ], + "teardown": [ + "$TC qdisc del dev $DEV1 clsact" + ] } ] diff --git a/tools/testing/selftests/tc-testing/tc-tests/infra/qdiscs.json b/tools/testing/selftests/tc-testing/tc-tests/infra/qdiscs.json index eefadd0546d3..a1f97a4b606e 100644 --- a/tools/testing/selftests/tc-testing/tc-tests/infra/qdiscs.json +++ b/tools/testing/selftests/tc-testing/tc-tests/infra/qdiscs.json @@ -392,26 +392,32 @@ "htb" ], "plugins": { - "requires": "nsPlugin" + "requires": [ + "nsPlugin", + "scapyPlugin" + ] }, "setup": [ - "$IP link set dev $DUMMY up || true", - "$IP addr add 10.10.10.10/24 dev $DUMMY || true", - "$TC qdisc add dev $DUMMY handle 1: root htb default 10", - "$TC class add dev $DUMMY parent 1: classid 1:10 htb rate 1kbit", - "$TC qdisc add dev $DUMMY parent 1:10 handle 10: fq_codel memory_limit 1 flows 1 target 0.1ms interval 1ms", - "$TC filter add dev $DUMMY parent 1: protocol ip prio 1 u32 match ip protocol 1 0xff flowid 1:10", - "ping -c 5 -f -I $DUMMY 10.10.10.1 > /dev/null || true", - "sleep 0.1" + "$TC qdisc add dev $IFB handle 1: root htb default 10", + "$TC class add dev $IFB parent 1: classid 1:10 htb rate 1kbit", + "$TC qdisc add dev $IFB parent 1:10 handle 10: fq_codel memory_limit 1 flows 1 target 0.1ms interval 1ms", + "$TC filter add dev $IFB parent 1: protocol ip prio 1 u32 match ip protocol 1 0xff flowid 1:10", + "$TC qdisc add dev $DEV1 ingress", + "$TC filter add dev $DEV1 ingress protocol ip prio 1 u32 match ip protocol 1 0xff action mirred egress mirror dev $IFB" ], - "cmdUnderTest": "$TC -s qdisc show dev $DUMMY", + "scapy": { + "iface": "$DEV0", + "count": 5, + "packet": "Ether()/IP(dst='10.10.10.1', src='10.10.10.10')/ICMP()" + }, + "cmdUnderTest": "$TC -s qdisc show dev $IFB", 
"expExitCode": "0", - "verifyCmd": "$TC -s qdisc show dev $DUMMY | grep -A 5 'qdisc fq_codel'", + "verifyCmd": "$TC -s qdisc show dev $IFB | grep -A 5 'qdisc fq_codel'", "matchPattern": "dropped [1-9][0-9]*", "matchCount": "1", "teardown": [ - "$TC qdisc del dev $DUMMY handle 1: root", - "$IP addr del 10.10.10.10/24 dev $DUMMY || true" + "$TC qdisc del dev $IFB root", + "$TC qdisc del dev $DEV1 ingress" ] }, { @@ -423,26 +429,32 @@ "qfq" ], "plugins": { - "requires": "nsPlugin" + "requires": [ + "nsPlugin", + "scapyPlugin" + ] }, "setup": [ - "$IP link set dev $DUMMY up || true", - "$IP addr add 10.10.10.10/24 dev $DUMMY || true", - "$TC qdisc add dev $DUMMY handle 1: root qfq", - "$TC class add dev $DUMMY parent 1: classid 1:10 qfq weight 1 maxpkt 1000", - "$TC qdisc add dev $DUMMY parent 1:10 handle 10: fq_codel memory_limit 1 flows 1 target 0.1ms interval 1ms", - "$TC filter add dev $DUMMY parent 1: protocol ip prio 1 u32 match ip protocol 1 0xff flowid 1:10", - "ping -c 10 -s 1000 -f -I $DUMMY 10.10.10.1 > /dev/null || true", - "sleep 0.1" + "$TC qdisc add dev $IFB handle 1: root qfq", + "$TC class add dev $IFB parent 1: classid 1:10 qfq weight 1 maxpkt 1000", + "$TC qdisc add dev $IFB parent 1:10 handle 10: fq_codel memory_limit 1 flows 1 target 0.1ms interval 1ms", + "$TC filter add dev $IFB parent 1: protocol ip prio 1 u32 match ip protocol 1 0xff flowid 1:10", + "$TC qdisc add dev $DEV1 ingress", + "$TC filter add dev $DEV1 ingress protocol ip prio 1 u32 match ip protocol 1 0xff action mirred egress mirror dev $IFB" ], - "cmdUnderTest": "$TC -s qdisc show dev $DUMMY", + "scapy": { + "iface": "$DEV0", + "count": 10, + "packet": "Ether()/IP(dst='10.10.10.1', src='10.10.10.10')/ICMP()" + }, + "cmdUnderTest": "$TC -s qdisc show dev $IFB", "expExitCode": "0", - "verifyCmd": "$TC -s qdisc show dev $DUMMY | grep -A 5 'qdisc fq_codel'", + "verifyCmd": "$TC -s qdisc show dev $IFB | grep -A 5 'qdisc fq_codel'", "matchPattern": "dropped [1-9][0-9]*", "matchCount": 
"1", "teardown": [ - "$TC qdisc del dev $DUMMY handle 1: root", - "$IP addr del 10.10.10.10/24 dev $DUMMY || true" + "$TC qdisc del dev $IFB root", + "$TC qdisc del dev $DEV1 ingress" ] }, { @@ -454,26 +466,32 @@ "hfsc" ], "plugins": { - "requires": "nsPlugin" + "requires": [ + "nsPlugin", + "scapyPlugin" + ] }, "setup": [ - "$IP link set dev $DUMMY up || true", - "$IP addr add 10.10.10.10/24 dev $DUMMY || true", - "$TC qdisc add dev $DUMMY handle 1: root hfsc default 10", - "$TC class add dev $DUMMY parent 1: classid 1:10 hfsc sc rate 1kbit ul rate 1kbit", - "$TC qdisc add dev $DUMMY parent 1:10 handle 10: fq_codel memory_limit 1 flows 1 target 0.1ms interval 1ms", - "$TC filter add dev $DUMMY parent 1: protocol ip prio 1 u32 match ip protocol 1 0xff flowid 1:10", - "ping -c 5 -f -I $DUMMY 10.10.10.1 > /dev/null || true", - "sleep 0.1" + "$TC qdisc add dev $IFB handle 1: root hfsc default 10", + "$TC class add dev $IFB parent 1: classid 1:10 hfsc sc rate 1kbit ul rate 1kbit", + "$TC qdisc add dev $IFB parent 1:10 handle 10: fq_codel memory_limit 1 flows 1 target 0.1ms interval 1ms", + "$TC filter add dev $IFB parent 1: protocol ip prio 1 u32 match ip protocol 1 0xff flowid 1:10", + "$TC qdisc add dev $DEV1 ingress", + "$TC filter add dev $DEV1 ingress protocol ip prio 1 u32 match ip protocol 1 0xff action mirred egress mirror dev $IFB" ], - "cmdUnderTest": "$TC -s qdisc show dev $DUMMY", + "scapy": { + "iface": "$DEV0", + "count": 5, + "packet": "Ether()/IP(dst='10.10.10.1', src='10.10.10.10')/ICMP()" + }, + "cmdUnderTest": "$TC -s qdisc show dev $IFB", "expExitCode": "0", - "verifyCmd": "$TC -s qdisc show dev $DUMMY | grep -A 5 'qdisc fq_codel'", + "verifyCmd": "$TC -s qdisc show dev $IFB | grep -A 5 'qdisc fq_codel'", "matchPattern": "dropped [1-9][0-9]*", "matchCount": "1", "teardown": [ - "$TC qdisc del dev $DUMMY handle 1: root", - "$IP addr del 10.10.10.10/24 dev $DUMMY || true" + "$TC qdisc del dev $IFB root", + "$TC qdisc del dev $DEV1 ingress" ] }, { @@ 
-485,26 +503,32 @@ "drr" ], "plugins": { - "requires": "nsPlugin" + "requires": [ + "nsPlugin", + "scapyPlugin" + ] }, "setup": [ - "$IP link set dev $DUMMY up || true", - "$IP addr add 10.10.10.10/24 dev $DUMMY || true", - "$TC qdisc add dev $DUMMY handle 1: root drr", - "$TC class add dev $DUMMY parent 1: classid 1:10 drr quantum 1500", - "$TC qdisc add dev $DUMMY parent 1:10 handle 10: fq_codel memory_limit 1 flows 1 target 0.1ms interval 1ms", - "$TC filter add dev $DUMMY parent 1: protocol ip prio 1 u32 match ip protocol 1 0xff flowid 1:10", - "ping -c 5 -f -I $DUMMY 10.10.10.1 > /dev/null || true", - "sleep 0.1" + "$TC qdisc add dev $IFB handle 1: root drr", + "$TC class add dev $IFB parent 1: classid 1:10 drr quantum 1500", + "$TC qdisc add dev $IFB parent 1:10 handle 10: fq_codel memory_limit 1 flows 1 target 0.1ms interval 1ms", + "$TC filter add dev $IFB parent 1: protocol ip prio 1 u32 match ip protocol 1 0xff flowid 1:10", + "$TC qdisc add dev $DEV1 ingress", + "$TC filter add dev $DEV1 ingress protocol ip prio 1 u32 match ip protocol 1 0xff action mirred egress mirror dev $IFB" ], - "cmdUnderTest": "$TC -s qdisc show dev $DUMMY", + "scapy": { + "iface": "$DEV0", + "count": 5, + "packet": "Ether()/IP(dst='10.10.10.1', src='10.10.10.10')/ICMP()" + }, + "cmdUnderTest": "$TC -s qdisc show dev $IFB", "expExitCode": "0", - "verifyCmd": "$TC -s qdisc show dev $DUMMY | grep -A 5 'qdisc fq_codel'", + "verifyCmd": "$TC -s qdisc show dev $IFB | grep -A 5 'qdisc fq_codel'", "matchPattern": "dropped [1-9][0-9]*", "matchCount": "1", "teardown": [ - "$TC qdisc del dev $DUMMY handle 1: root", - "$IP addr del 10.10.10.10/24 dev $DUMMY || true" + "$TC qdisc del dev $IFB root", + "$TC qdisc del dev $DEV1 ingress" ] }, { @@ -516,26 +540,32 @@ "ets" ], "plugins": { - "requires": "nsPlugin" + "requires": [ + "nsPlugin", + "scapyPlugin" + ] }, "setup": [ - "$IP link set dev $DUMMY up || true", - "$IP addr add 10.10.10.10/24 dev $DUMMY || true", - "$TC qdisc add dev $DUMMY 
handle 1: root ets bands 2 strict 1", - "$TC class change dev $DUMMY parent 1: classid 1:1 ets", - "$TC qdisc add dev $DUMMY parent 1:1 handle 10: fq_codel memory_limit 1 flows 1 target 0.1ms interval 1ms", - "$TC filter add dev $DUMMY parent 1: protocol ip prio 1 u32 match ip protocol 1 0xff flowid 1:1", - "ping -c 5 -f -I $DUMMY 10.10.10.1 > /dev/null || true", - "sleep 0.1" + "$TC qdisc add dev $IFB handle 1: root ets bands 2 strict 1", + "$TC class change dev $IFB parent 1: classid 1:1 ets", + "$TC qdisc add dev $IFB parent 1:1 handle 10: fq_codel memory_limit 1 flows 1 target 0.1ms interval 1ms", + "$TC filter add dev $IFB parent 1: protocol ip prio 1 u32 match ip protocol 1 0xff flowid 1:1", + "$TC qdisc add dev $DEV1 ingress", + "$TC filter add dev $DEV1 ingress protocol ip prio 1 u32 match ip protocol 1 0xff action mirred egress mirror dev $IFB" ], - "cmdUnderTest": "$TC -s qdisc show dev $DUMMY", + "scapy": { + "iface": "$DEV0", + "count": 5, + "packet": "Ether()/IP(dst='10.10.10.1', src='10.10.10.10')/ICMP()" + }, + "cmdUnderTest": "$TC -s qdisc show dev $IFB", "expExitCode": "0", - "verifyCmd": "$TC -s qdisc show dev $DUMMY | grep -A 5 'qdisc fq_codel'", + "verifyCmd": "$TC -s qdisc show dev $IFB | grep -A 5 'qdisc fq_codel'", "matchPattern": "dropped [1-9][0-9]*", "matchCount": "1", "teardown": [ - "$TC qdisc del dev $DUMMY handle 1: root", - "$IP addr del 10.10.10.10/24 dev $DUMMY || true" + "$TC qdisc del dev $IFB root", + "$TC qdisc del dev $DEV1 ingress" ] }, { @@ -702,6 +732,7 @@ "$TC qdisc add dev $DUMMY parent 1:1 handle 2:0 netem duplicate 100%", "$TC filter add dev $DUMMY parent 1:0 protocol ip prio 1 u32 match ip dst 10.10.10.1/32 flowid 1:1", "$TC class add dev $DUMMY parent 1:0 classid 1:2 hfsc ls m2 10Mbit", + "$TC qdisc add dev $DUMMY parent 1:2 handle 3:0 netem duplicate 100%", "$TC filter add dev $DUMMY parent 1:0 protocol ip prio 2 u32 match ip dst 10.10.10.2/32 flowid 1:2", "ping -c 1 10.10.10.1 -I$DUMMY > /dev/null || true", "$TC 
filter del dev $DUMMY parent 1:0 protocol ip prio 1", @@ -714,8 +745,8 @@ { "kind": "hfsc", "handle": "1:", - "bytes": 294, - "packets": 3 + "bytes": 392, + "packets": 4 } ], "matchCount": "1", @@ -1136,5 +1167,378 @@ "teardown": [ "$TC qdisc del dev $DUMMY handle 1: root" ] + }, + { + "id": "7a5f", + "name": "Force red to dequeue from its child's gso_skb with qfq leaf", + "category": [ + "qdisc", + "tbf", + "red", + "qfq" + ], + "plugins": { + "requires": "nsPlugin" + }, + "setup": [ + "$IP link set dev $DUMMY up || true", + "$IP addr add 10.10.11.10/24 dev $DUMMY || true", + "$TC qdisc add dev $DUMMY root handle 1: tbf rate 88bit burst 1661b peakrate 2257333 minburst 1024 limit 7b", + "$TC qdisc add dev $DUMMY parent 1: handle 2: red limit 757 min 16 max 24 avpkt 16", + "$TC qdisc add dev $DUMMY parent 2: handle 3: qfq", + "$TC class add dev $DUMMY classid 3:1 parent 3: qfq maxpkt 512 weight 1", + "$TC filter add dev $DUMMY parent 3: protocol ip prio 1 matchall classid 3:1 action ok" + ], + "cmdUnderTest": "ping -c 1 10.10.10.1 -W0.01 -I$DUMMY || true", + "expExitCode": "0", + "verifyCmd": "$TC -s -j qdisc ls dev $DUMMY parent 1:", + "matchJSON": [ + { + "kind": "red", + "handle": "2:", + "bytes": 98, + "packets": 1, + "backlog": 0, + "qlen": 0 + } + ], + "teardown": [ + "$TC qdisc del dev $DUMMY handle 1: root" + ] + }, + { + "id": "cdae", + "name": "Force sfb to dequeue from its child's gso_skb with qfq leaf", + "category": [ + "qdisc", + "tbf", + "sfb", + "qfq" + ], + "plugins": { + "requires": "nsPlugin" + }, + "setup": [ + "$IP link set dev $DUMMY up || true", + "$IP addr add 10.10.11.10/24 dev $DUMMY || true", + "$TC qdisc add dev $DUMMY root handle 1: tbf rate 88bit burst 1661b peakrate 2257333 minburst 1024 limit 7b", + "$TC qdisc add dev $DUMMY parent 1: handle 2: sfb", + "$TC qdisc add dev $DUMMY parent 2: handle 3: qfq", + "$TC class add dev $DUMMY classid 3:1 parent 3: qfq maxpkt 512 weight 1", + "$TC filter add dev $DUMMY parent 3: protocol ip prio 1 
matchall classid 3:1 action ok" + ], + "cmdUnderTest": "ping -c 1 10.10.10.1 -W0.01 -I$DUMMY || true", + "expExitCode": "0", + "verifyCmd": "$TC -s -j qdisc ls dev $DUMMY parent 1:", + "matchJSON": [ + { + "kind": "sfb", + "handle": "2:", + "bytes": 98, + "packets": 1, + "backlog": 0, + "qlen": 0 + } + ], + "teardown": [ + "$TC qdisc del dev $DUMMY handle 1: root" + ] + }, + { + "id": "291d", + "name": "Force red to dequeue from its child's gso_skb with dualpi2 leaf", + "category": [ + "qdisc", + "tbf", + "red", + "dualpi2" + ], + "plugins": { + "requires": "nsPlugin" + }, + "setup": [ + "$IP link set dev $DUMMY up || true", + "$IP addr add 10.10.11.10/24 dev $DUMMY || true", + "$TC qdisc add dev $DUMMY root handle 1: tbf rate 88bit burst 1661b peakrate 2257333 minburst 1024 limit 7b", + "$TC qdisc add dev $DUMMY parent 1: handle 2: red limit 757 min 16 max 24 avpkt 16", + "$TC qdisc add dev $DUMMY parent 2: handle 3: dualpi2" + ], + "cmdUnderTest": "ping -c 1 10.10.10.1 -W0.01 -I$DUMMY || true", + "expExitCode": "0", + "verifyCmd": "$TC -s -j qdisc ls dev $DUMMY parent 1:", + "matchJSON": [ + { + "kind": "red", + "handle": "2:", + "bytes": 98, + "packets": 1, + "backlog": 0, + "qlen": 0 + } + ], + "teardown": [ + "$TC qdisc del dev $DUMMY handle 1: root" + ] + }, + { + "id": "9c6d", + "name": "Force sfb to dequeue from its child's gso_skb with dualpi2 leaf", + "category": [ + "qdisc", + "tbf", + "sfb", + "dualpi2" + ], + "plugins": { + "requires": "nsPlugin" + }, + "setup": [ + "$IP link set dev $DUMMY up || true", + "$IP addr add 10.10.11.10/24 dev $DUMMY || true", + "$TC qdisc add dev $DUMMY root handle 1: tbf rate 88bit burst 1661b peakrate 2257333 minburst 1024 limit 7b", + "$TC qdisc add dev $DUMMY parent 1: handle 2: sfb", + "$TC qdisc add dev $DUMMY parent 2: handle 3: dualpi2" + ], + "cmdUnderTest": "ping -c 1 10.10.10.1 -W0.01 -I$DUMMY || true", + "expExitCode": "0", + "verifyCmd": "$TC -s -j qdisc ls dev $DUMMY parent 1:", + "matchJSON": [ + { + "kind": 
"sfb", + "handle": "2:", + "bytes": 98, + "packets": 1, + "backlog": 0, + "qlen": 0 + } + ], + "teardown": [ + "$TC qdisc del dev $DUMMY handle 1: root" + ] + }, + { + "id": "3a62", + "name": "Try to create a qlen underflow with QFQ/CBS", + "category": [ + "qdisc", + "qfq", + "cbs" + ], + "plugins": { + "requires": "nsPlugin" + }, + "setup": [ + "$IP link set dev $DUMMY up || true", + "$IP addr add 10.10.10.10/24 dev $DUMMY || true", + "$TC qdisc add dev $DUMMY root handle 1: qfq", + "$TC class add dev $DUMMY classid 1:1 parent 1: qfq", + "$TC class add dev $DUMMY classid 1:2 parent 1: qfq", + "$TC qdisc add dev $DUMMY handle 2: parent 1:1 cbs", + "$TC qdisc add dev $DUMMY handle 3: parent 2: netem delay 5000000000", + "$TC filter add dev $DUMMY parent 1: prio 1 u32 match ip dst 10.10.10.1 classid 1:1 action ok", + "$TC filter add dev $DUMMY parent 1: prio 2 u32 match ip dst 10.10.10.2 classid 1:2 action ok", + "ping -c 1 10.10.10.1 -W0.01 -I$DUMMY || true", + "$IP l set $DUMMY down", + "$IP l set $DUMMY up", + "$TC qdisc replace dev $DUMMY handle 4: parent 2: pfifo" + ], + "cmdUnderTest": "ping -c 1 10.10.10.2 -W0.01 -I$DUMMY", + "expExitCode": "1", + "verifyCmd": "$TC -s -j qdisc ls dev $DUMMY parent 1:1", + "matchJSON": [ + { + "kind": "cbs", + "handle": "2:", + "bytes": 0, + "packets": 0 + } + ], + "teardown": [ + "$TC qdisc del dev $DUMMY handle 1: root" + ] + }, + { + "id": "c797", + "name": "Verify fq_codel won't mistakenly deactivate QFQ parent class during peek", + "category": [ + "qdisc", + "qfq", + "fq_codel" + ], + "plugins": { + "requires": "nsPlugin" + }, + "setup": [ + "$IP link set dev $DUMMY up || true", + "$IP addr add 10.10.10.10/24 dev $DUMMY || true", + "$TC qdisc add dev $DUMMY root handle 1: qfq", + "$TC class add dev $DUMMY parent 1: classid 1:1 qfq weight 1 maxpkt 1000", + "$TC class add dev $DUMMY parent 1: classid 1:2 qfq weight 1 maxpkt 1000", + "$TC qdisc add dev $DUMMY parent 1:1 handle 2:0 plug limit 1024", + "$IP l set dev $DUMMY mtu 
1500", + "$TC qdisc add dev $DUMMY parent 1:2 handle 10: fq_codel target 1 interval 1 flows 1", + "$TC filter add dev $DUMMY parent 1: protocol ip prio 1 u32 match ip dst 10.10.10.1/32 flowid 1:1", + "$TC filter add dev $DUMMY parent 1: protocol ip prio 2 u32 match ip dst 10.10.10.2/32 flowid 1:2", + "$IP l set dev $DUMMY mtu 65336", + "ping -c 1 -I $DUMMY 10.10.10.1 -W0.01 > /dev/null || true", + "ping -c 3 -s 2000 -I $DUMMY 10.10.10.2 -W0.01 > /dev/null || true", + "sleep 0.1" + ], + "cmdUnderTest": "$TC qdisc change dev $DUMMY handle 2:0 plug release_indefinite", + "expExitCode": "0", + "verifyCmd": "$TC -s -j qdisc show dev $DUMMY", + "matchJSON": [ + { + "kind": "qfq", + "handle": "1:", + "packets": 3, + "drops": 1, + "backlog": 0, + "qlen": 0 + }, + { + "kind": "plug", + "handle": "2:", + "packets": 1, + "drops": 0, + "backlog": 0, + "qlen": 0 + }, + { + "kind": "fq_codel", + "handle": "10:", + "packets": 2, + "drops": 1, + "backlog": 0, + "qlen": 0 + } + ], + "teardown": [ + "$TC qdisc del dev $DUMMY root", + "$IP addr del 10.10.10.10/24 dev $DUMMY || true" + ] + }, + { + "id": "82d9", + "name": "Verify codel won't mistakenly deactivate QFQ parent class during peek", + "category": [ + "qdisc", + "qfq", + "codel" + ], + "plugins": { + "requires": "nsPlugin" + }, + "setup": [ + "$IP link set dev $DUMMY up || true", + "$IP addr add 10.10.10.10/24 dev $DUMMY || true", + "$TC qdisc add dev $DUMMY root handle 1: qfq", + "$TC class add dev $DUMMY parent 1: classid 1:1 qfq weight 1 maxpkt 1000", + "$TC class add dev $DUMMY parent 1: classid 1:2 qfq weight 1 maxpkt 1000", + "$TC qdisc add dev $DUMMY parent 1:1 handle 2:0 plug limit 1024", + "$IP l set dev $DUMMY mtu 1500", + "$TC qdisc add dev $DUMMY parent 1:2 handle 10: codel target 1ms interval 1ms", + "$TC filter add dev $DUMMY parent 1: protocol ip prio 1 u32 match ip dst 10.10.10.1/32 flowid 1:1", + "$TC filter add dev $DUMMY parent 1: protocol ip prio 2 u32 match ip dst 10.10.10.2/32 flowid 1:2", + "$IP l set 
dev $DUMMY mtu 65336", + "ping -c 1 -I $DUMMY 10.10.10.1 -W0.01 > /dev/null || true", + "ping -c 3 -s 2000 -I $DUMMY 10.10.10.2 -W0.01 > /dev/null || true", + "sleep 0.1" + ], + "cmdUnderTest": "$TC qdisc change dev $DUMMY handle 2:0 plug release_indefinite", + "expExitCode": "0", + "verifyCmd": "$TC -s -j qdisc show dev $DUMMY", + "matchJSON": [ + { + "kind": "qfq", + "handle": "1:", + "packets": 3, + "drops": 1, + "backlog": 0, + "qlen": 0 + }, + { + "kind": "plug", + "handle": "2:", + "packets": 1, + "drops": 0, + "backlog": 0, + "qlen": 0 + }, + { + "kind": "codel", + "handle": "10:", + "packets": 2, + "drops": 1, + "backlog": 0, + "qlen": 0 + } + ], + "teardown": [ + "$TC qdisc del dev $DUMMY root", + "$IP addr del 10.10.10.10/24 dev $DUMMY || true" + ] + }, + { + "id": "d3da", + "name": "Verify dualpi2 won't mistakenly deactivate QFQ parent class during peek", + "category": [ + "qdisc", + "qfq", + "dualpi2" + ], + "plugins": { + "requires": "nsPlugin" + }, + "setup": [ + "$IP link set dev $DUMMY up || true", + "$IP addr add 10.10.10.10/24 dev $DUMMY || true" , + "$TC qdisc add dev $DUMMY root handle 1: qfq", + "$TC class add dev $DUMMY parent 1: classid 1:1 qfq weight 1 maxpkt 1000", + "$TC class add dev $DUMMY parent 1: classid 1:2 qfq weight 1 maxpkt 1000", + "$TC qdisc add dev $DUMMY parent 1:1 handle 2:0 plug limit 1024", + "$TC qdisc add dev $DUMMY parent 1:2 handle 10: dualpi2 step_thresh 500ms", + "$TC filter add dev $DUMMY parent 10: protocol ip prio 1 matchall classid 10:1 action ok", + "$TC filter add dev $DUMMY parent 1: protocol ip prio 1 u32 match ip dst 10.10.10.1/32 flowid 1:1", + "$TC filter add dev $DUMMY parent 1: protocol ip prio 2 u32 match ip dst 10.10.10.2/32 flowid 1:2", + "ping -c 1 -I $DUMMY 10.10.10.1 -W0.01 || true", + "ping -c 3 -i 0.1 -I $DUMMY 10.10.10.2 -W0.01 || true", + "sleep 0.7", + "ping -c 1 -I $DUMMY 10.10.10.2 -W0.01 || true", + "$TC qdisc change dev $DUMMY handle 2:0 plug release_indefinite" + ], + "cmdUnderTest": "ping 
-c 1 -I $DUMMY 10.10.10.1 -W0.01", + "expExitCode": "1", + "verifyCmd": "$TC -s -j qdisc show dev $DUMMY", + "matchJSON": [ + { + "kind": "qfq", + "handle": "1:", + "packets": 4, + "drops": 2, + "backlog": 0, + "qlen": 0 + }, + { + "kind": "plug", + "handle": "2:", + "packets": 2, + "drops": 0, + "backlog": 0, + "qlen": 0 + }, + { + "kind": "dualpi2", + "handle": "10:", + "packets": 2, + "drops": 2, + "backlog": 0, + "qlen": 0 + } + ], + "teardown": [ + "$TC qdisc del dev $DUMMY root", + "$IP addr del 10.10.10.10/24 dev $DUMMY || true" + ] } ] diff --git a/tools/testing/selftests/tc-testing/tc-tests/qdiscs/dualpi2.json b/tools/testing/selftests/tc-testing/tc-tests/qdiscs/dualpi2.json index cd1f2ee8f354..ed6a900bb568 100644 --- a/tools/testing/selftests/tc-testing/tc-tests/qdiscs/dualpi2.json +++ b/tools/testing/selftests/tc-testing/tc-tests/qdiscs/dualpi2.json @@ -250,5 +250,49 @@ "teardown": [ "$TC qdisc del dev $DUMMY handle 1: root" ] + }, + { + "id": "891f", + "name": "Verify DualPI2 GSO backlog accounting with QFQ parent", + "category": [ + "qdisc", + "dualpi2", + "qfq", + "gso" + ], + "plugins": { + "requires": "nsPlugin" + }, + "setup": [ + "$IP link set dev $DUMMY up || true", + "$IP addr add 10.10.10.10/24 dev $DUMMY || true", + "$TC qdisc add dev $DUMMY root handle 1: qfq", + "$TC class add dev $DUMMY parent 1: classid 1:1 qfq weight 1 maxpkt 4096", + "$TC qdisc add dev $DUMMY parent 1:1 handle 2: dualpi2", + "$TC filter add dev $DUMMY parent 1: matchall classid 1:1" + ], + "cmdUnderTest": "./tdc_gso.py 10.10.10.10 10.10.10.1 9000 1200 2400", + "expExitCode": "0", + "verifyCmd": "$TC -j -s qdisc ls dev $DUMMY", + "matchJSON": [ + { + "kind": "qfq", + "handle": "1:", + "packets": 2, + "backlog": 0, + "qlen": 0 + }, + { + "kind": "dualpi2", + "handle": "2:", + "packets": 2, + "backlog": 0, + "qlen": 0 + } + ], + "teardown": [ + "$TC qdisc del dev $DUMMY root", + "$IP addr del 10.10.10.10/24 dev $DUMMY || true" + ] } ] diff --git 
a/tools/testing/selftests/tc-testing/tc-tests/qdiscs/netem.json b/tools/testing/selftests/tc-testing/tc-tests/qdiscs/netem.json index 718d2df2aafa..472b672a600d 100644 --- a/tools/testing/selftests/tc-testing/tc-tests/qdiscs/netem.json +++ b/tools/testing/selftests/tc-testing/tc-tests/qdiscs/netem.json @@ -338,84 +338,34 @@ ] }, { - "id": "d34d", - "name": "NETEM test qdisc duplication restriction in qdisc tree in netem_change root", - "category": ["qdisc", "netem"], - "plugins": { - "requires": "nsPlugin" - }, - "setup": [ - "$TC qdisc add dev $DUMMY root handle 1: netem limit 1", - "$TC qdisc add dev $DUMMY parent 1: handle 2: netem limit 1" - ], - "cmdUnderTest": "$TC qdisc change dev $DUMMY handle 1: netem duplicate 50%", - "expExitCode": "2", - "verifyCmd": "$TC -s qdisc show dev $DUMMY", - "matchPattern": "qdisc netem", - "matchCount": "2", - "teardown": [ - "$TC qdisc del dev $DUMMY handle 1:0 root" - ] - }, - { - "id": "b33f", - "name": "NETEM test qdisc duplication restriction in qdisc tree in netem_change non-root", - "category": ["qdisc", "netem"], - "plugins": { - "requires": "nsPlugin" - }, - "setup": [ - "$TC qdisc add dev $DUMMY root handle 1: netem limit 1", - "$TC qdisc add dev $DUMMY parent 1: handle 2: netem limit 1" - ], - "cmdUnderTest": "$TC qdisc change dev $DUMMY handle 2: netem duplicate 50%", - "expExitCode": "2", - "verifyCmd": "$TC -s qdisc show dev $DUMMY", - "matchPattern": "qdisc netem", - "matchCount": "2", - "teardown": [ - "$TC qdisc del dev $DUMMY handle 1:0 root" - ] - }, - { - "id": "cafe", - "name": "NETEM test qdisc duplication restriction in qdisc tree", - "category": ["qdisc", "netem"], - "plugins": { - "requires": "nsPlugin" - }, - "setup": [ - "$TC qdisc add dev $DUMMY root handle 1: netem limit 1 duplicate 100%" + "id": "8c17", + "name": "Test netem's recursive duplicate", + "category": [ + "qdisc", + "netem" ], - "cmdUnderTest": "$TC qdisc add dev $DUMMY parent 1: handle 2: netem duplicate 100%", - "expExitCode": "2", - 
"verifyCmd": "$TC -s qdisc show dev $DUMMY", - "matchPattern": "qdisc netem", - "matchCount": "1", - "teardown": [ - "$TC qdisc del dev $DUMMY handle 1:0 root" - ] - }, - { - "id": "1337", - "name": "NETEM test qdisc duplication restriction in qdisc tree across branches", - "category": ["qdisc", "netem"], "plugins": { "requires": "nsPlugin" }, "setup": [ - "$TC qdisc add dev $DUMMY parent root handle 1:0 hfsc", - "$TC class add dev $DUMMY parent 1:0 classid 1:1 hfsc rt m2 10Mbit", - "$TC qdisc add dev $DUMMY parent 1:1 handle 2:0 netem", - "$TC class add dev $DUMMY parent 1:0 classid 1:2 hfsc rt m2 10Mbit" - ], - "cmdUnderTest": "$TC qdisc add dev $DUMMY parent 1:2 handle 3:0 netem duplicate 100%", - "expExitCode": "2", - "verifyCmd": "$TC -s qdisc show dev $DUMMY", - "matchPattern": "qdisc netem", - "matchCount": "1", + "$IP link set dev $DUMMY up || true", + "$IP addr add 10.10.11.10/24 dev $DUMMY || true", + "$TC qdisc add dev $DUMMY root handle 1: netem limit 1000 duplicate 100%", + "$TC qdisc add dev $DUMMY parent 1: handle 2: netem limit 1000 duplicate 100%" + ], + "cmdUnderTest": "ping -c 1 10.10.11.11 -W 0.01", + "expExitCode": "1", + "verifyCmd": "$TC -s -j qdisc ls dev $DUMMY root", + "matchJSON": [ + { + "kind": "netem", + "handle": "1:", + "bytes": 294, + "packets": 3 + } + ], "teardown": [ - "$TC qdisc del dev $DUMMY handle 1:0 root" + "$TC qdisc del dev $DUMMY handle 1: root" ] - } + } ] diff --git a/tools/testing/selftests/tc-testing/tdc.py b/tools/testing/selftests/tc-testing/tdc.py index 81b4ac3f050c..511d66c36a2a 100755 --- a/tools/testing/selftests/tc-testing/tdc.py +++ b/tools/testing/selftests/tc-testing/tdc.py @@ -378,6 +378,7 @@ def run_one_test(pm, args, index, tidx): dev0 = NAMES['DEV0'] dev1 = NAMES['DEV1'] dummy = NAMES['DUMMY'] + ifb = NAMES['IFB'] result = True tresult = "" tap = "" @@ -414,6 +415,7 @@ def run_one_test(pm, args, index, tidx): NAMES['DEV0'] = '{}id{}'.format(NAMES['DEV0'], tidx['id']) NAMES['DEV1'] = 
'{}id{}'.format(NAMES['DEV1'], tidx['id']) NAMES['DUMMY'] = '{}id{}'.format(NAMES['DUMMY'], tidx['id']) + NAMES['IFB'] = '{}id{}'.format(NAMES['IFB'], tidx['id']) pm.call_pre_case(tidx) prepare_env(tidx, args, pm, 'setup', "-----> prepare stage", tidx["setup"]) @@ -474,6 +476,7 @@ def run_one_test(pm, args, index, tidx): NAMES['DEV0'] = dev0 NAMES['DEV1'] = dev1 NAMES['DUMMY'] = dummy + NAMES['IFB'] = ifb return res diff --git a/tools/testing/selftests/tc-testing/tdc_config.py b/tools/testing/selftests/tc-testing/tdc_config.py index 9488b03cbc2c..cd0bd42f05a5 100644 --- a/tools/testing/selftests/tc-testing/tdc_config.py +++ b/tools/testing/selftests/tc-testing/tdc_config.py @@ -17,6 +17,7 @@ NAMES = { 'DEV1': 'v0p1', 'DEV2': '', 'DUMMY': 'dummy1', + 'IFB': 'ifbtdc0', 'ETHTOOL': '/usr/sbin/ethtool', 'ETH': 'eth0', 'BATCH_FILE': './batch.txt', diff --git a/tools/testing/selftests/tc-testing/tdc_gso.py b/tools/testing/selftests/tc-testing/tdc_gso.py new file mode 100755 index 000000000000..b66528ea4b68 --- /dev/null +++ b/tools/testing/selftests/tc-testing/tdc_gso.py @@ -0,0 +1,43 @@ +#!/usr/bin/env python3 +# SPDX-License-Identifier: GPL-2.0 + +""" +tdc_gso.py - send a UDP GSO datagram + +Copyright (C) 2026 Xingquan Liu <b1n@b1n.io> +""" + +import argparse +import socket +import struct +import sys + +UDP_MAX_SEGMENTS = 1 << 7 + + +parser = argparse.ArgumentParser(description="UDP GSO datagram sender") +parser.add_argument("src", help="source IPv4 address") +parser.add_argument("dst", help="destination IPv4 address") +parser.add_argument("port", type=int, help="destination UDP port") +parser.add_argument("gso_size", type=int, help="UDP GSO segment payload size") +parser.add_argument("payload_len", type=int, help="total UDP payload length") +args = parser.parse_args() + +if args.gso_size <= 0 or args.gso_size > 0xFFFF: + parser.error("gso_size must fit in an unsigned 16-bit integer") +if args.payload_len <= args.gso_size: + parser.error("payload_len must be larger than 
gso_size") +if args.payload_len > args.gso_size * UDP_MAX_SEGMENTS: + parser.error("payload_len exceeds UDP_MAX_SEGMENTS") + +SOL_UDP = getattr(socket, "SOL_UDP", socket.IPPROTO_UDP) +UDP_SEGMENT = getattr(socket, "UDP_SEGMENT", 103) + +sock = socket.socket(socket.AF_INET, socket.SOCK_DGRAM) +sock.bind((args.src, 0)) + +payload = b"b" * args.payload_len +cmsg = [(SOL_UDP, UDP_SEGMENT, struct.pack("=H", args.gso_size))] + +sent = sock.sendmsg([payload], cmsg, 0, (args.dst, args.port)) +sys.exit(sent != len(payload)) diff --git a/tools/testing/selftests/timers/posix_timers.c b/tools/testing/selftests/timers/posix_timers.c index 38512623622a..2f3bac9fc6e8 100644 --- a/tools/testing/selftests/timers/posix_timers.c +++ b/tools/testing/selftests/timers/posix_timers.c @@ -78,19 +78,25 @@ static void sig_handler(int nr) done = 1; } +static inline int64_t calcdiff_ns(struct timespec t1, struct timespec t2) +{ + int64_t diff; + + diff = NSEC_PER_SEC * (int64_t)((int) t1.tv_sec - (int) t2.tv_sec); + diff += ((int) t1.tv_nsec - (int) t2.tv_nsec); + return diff; +} + /* * Check the expected timer expiration matches the GTOD elapsed delta since * we armed the timer. Keep a 0.5 sec error margin due to various jitter. 
*/ -static int check_diff(struct timeval start, struct timeval end) +static int check_diff(struct timespec start, struct timespec end) { - long long diff; - - diff = end.tv_usec - start.tv_usec; - diff += (end.tv_sec - start.tv_sec) * USEC_PER_SEC; + long long diff = calcdiff_ns(end, start); - if (llabs(diff - DELAY * USEC_PER_SEC) > USEC_PER_SEC / 2) { - printf("Diff too high: %lld..", diff); + if (llabs(diff - DELAY * NSEC_PER_SEC) > NSEC_PER_SEC / 2) { + printf("Diff too high: %lld ns..", diff); return -1; } @@ -99,22 +105,25 @@ static int check_diff(struct timeval start, struct timeval end) static void check_itimer(int which, const char *name) { - struct timeval start, end; + struct timespec start, end; struct itimerval val = { .it_value.tv_sec = DELAY, }; + int clock_id = CLOCK_REALTIME; done = 0; if (which == ITIMER_VIRTUAL) signal(SIGVTALRM, sig_handler); - else if (which == ITIMER_PROF) + else if (which == ITIMER_PROF) { + clock_id = CLOCK_THREAD_CPUTIME_ID; signal(SIGPROF, sig_handler); + } else if (which == ITIMER_REAL) signal(SIGALRM, sig_handler); - if (gettimeofday(&start, NULL) < 0) - fatal_error(name, "gettimeofday()"); + if (clock_gettime(clock_id, &start)) + fatal_error(name, "clock_gettime()"); if (setitimer(which, &val, NULL) < 0) fatal_error(name, "setitimer()"); @@ -126,18 +135,19 @@ static void check_itimer(int which, const char *name) else if (which == ITIMER_REAL) idle_loop(); - if (gettimeofday(&end, NULL) < 0) - fatal_error(name, "gettimeofday()"); + if (clock_gettime(clock_id, &end)) + fatal_error(name, "clock_gettime()"); ksft_test_result(check_diff(start, end) == 0, "%s\n", name); } static void check_timer_create(int which, const char *name) { - struct timeval start, end; + struct timespec start, end; struct itimerspec val = { .it_value.tv_sec = DELAY, }; + int clock_id = CLOCK_REALTIME; timer_t id; done = 0; @@ -148,16 +158,16 @@ static void check_timer_create(int which, const char *name) if (signal(SIGALRM, sig_handler) == SIG_ERR) 
fatal_error(name, "signal()"); - if (gettimeofday(&start, NULL) < 0) - fatal_error(name, "gettimeofday()"); + if (clock_gettime(clock_id, &start)) + fatal_error(name, "clock_gettime()"); if (timer_settime(id, 0, &val, NULL) < 0) fatal_error(name, "timer_settime()"); user_loop(); - if (gettimeofday(&end, NULL) < 0) - fatal_error(name, "gettimeofday()"); + if (clock_gettime(clock_id, &end)) + fatal_error(name, "clock_gettime()"); ksft_test_result(check_diff(start, end) == 0, "timer_create() per %s\n", name); @@ -445,15 +455,6 @@ static void check_delete(void) ksft_test_result(!tsig.signals, "check_delete\n"); } -static inline int64_t calcdiff_ns(struct timespec t1, struct timespec t2) -{ - int64_t diff; - - diff = NSEC_PER_SEC * (int64_t)((int) t1.tv_sec - (int) t2.tv_sec); - diff += ((int) t1.tv_nsec - (int) t2.tv_nsec); - return diff; -} - static void check_sigev_none(int which, const char *name) { struct timespec start, now; diff --git a/tools/testing/selftests/ublk/kublk.c b/tools/testing/selftests/ublk/kublk.c index fbd9b1e7342a..0b23c09daea5 100644 --- a/tools/testing/selftests/ublk/kublk.c +++ b/tools/testing/selftests/ublk/kublk.c @@ -1735,6 +1735,17 @@ static int __cmd_dev_add(const struct dev_ctx *ctx) goto fail; } + /* + * The kernel may reduce nr_hw_queues (e.g. capped to nr_cpu_ids). + * Cap nthreads to the actual queue count to avoid creating extra + * handler threads that will hang during device removal. + * + * per_io_tasks mode is excluded: threads interleave across all + * queues so nthreads > nr_hw_queues is valid and intentional. 
+ */ + if (!ctx->per_io_tasks && dev->nthreads > info->nr_hw_queues) + dev->nthreads = info->nr_hw_queues; + ret = ublk_start_daemon(ctx, dev); ublk_dbg(UBLK_DBG_DEV, "%s: daemon exit %d\n", __func__, ret); if (ret < 0) diff --git a/tools/testing/selftests/uevent/uevent_filtering.c b/tools/testing/selftests/uevent/uevent_filtering.c index 974b076f9235..33a09f66d7e2 100644 --- a/tools/testing/selftests/uevent/uevent_filtering.c +++ b/tools/testing/selftests/uevent/uevent_filtering.c @@ -22,7 +22,7 @@ #include "kselftest_harness.h" #define __DEV_FULL "/sys/devices/virtual/mem/full/uevent" -#define __UEVENT_BUFFER_SIZE (2048 * 2) +#define __UEVENT_BUFFER_SIZE (1024 * 1024) #define __UEVENT_HEADER "add@/devices/virtual/mem/full" #define __UEVENT_HEADER_LEN sizeof("add@/devices/virtual/mem/full") #define __UEVENT_LISTEN_ALL -1 diff --git a/tools/testing/selftests/vfio/Makefile b/tools/testing/selftests/vfio/Makefile index 0684932d91bf..2c32c48db509 100644 --- a/tools/testing/selftests/vfio/Makefile +++ b/tools/testing/selftests/vfio/Makefile @@ -1,6 +1,6 @@ ARCH ?= $(shell uname -m) -ifeq (,$(filter $(ARCH),aarch64 arm64 x86_64)) +ifeq (,$(filter $(ARCH),aarch64 arm64 x86 x86_64)) # Do nothing on unsupported architectures include ../lib.mk else @@ -12,6 +12,7 @@ TEST_GEN_PROGS += vfio_iommufd_setup_test TEST_GEN_PROGS += vfio_pci_device_test TEST_GEN_PROGS += vfio_pci_device_init_perf_test TEST_GEN_PROGS += vfio_pci_driver_test +TEST_GEN_PROGS += vfio_pci_sriov_uapi_test TEST_FILES += scripts/cleanup.sh TEST_FILES += scripts/lib.sh @@ -23,14 +24,18 @@ include lib/libvfio.mk CFLAGS += -I$(top_srcdir)/tools/include CFLAGS += -MD +CFLAGS += -Wall -Werror CFLAGS += $(EXTRA_CFLAGS) LDFLAGS += -pthread -$(TEST_GEN_PROGS): %: %.o $(LIBVFIO_O) +$(TEST_GEN_PROGS): $(OUTPUT)/%: $(OUTPUT)/%.o $(LIBVFIO_O) $(CC) $(CFLAGS) $(CPPFLAGS) $(LDFLAGS) $< $(LIBVFIO_O) $(LDLIBS) -o $@ TEST_GEN_PROGS_O = $(patsubst %, %.o, $(TEST_GEN_PROGS)) +$(TEST_GEN_PROGS_O): $(OUTPUT)/%.o: %.c + $(CC) 
$(CFLAGS) $(CPPFLAGS) $(TARGET_ARCH) -c $< -o $@ + TEST_DEP_FILES = $(patsubst %.o, %.d, $(TEST_GEN_PROGS_O) $(LIBVFIO_O)) -include $(TEST_DEP_FILES) diff --git a/tools/testing/selftests/vfio/lib/include/libvfio.h b/tools/testing/selftests/vfio/lib/include/libvfio.h index 1b6da54cc2cb..07862b470777 100644 --- a/tools/testing/selftests/vfio/lib/include/libvfio.h +++ b/tools/testing/selftests/vfio/lib/include/libvfio.h @@ -5,6 +5,7 @@ #include <libvfio/assert.h> #include <libvfio/iommu.h> #include <libvfio/iova_allocator.h> +#include <libvfio/sysfs.h> #include <libvfio/vfio_pci_device.h> #include <libvfio/vfio_pci_driver.h> diff --git a/tools/testing/selftests/vfio/lib/include/libvfio/assert.h b/tools/testing/selftests/vfio/lib/include/libvfio/assert.h index f4ebd122d9b6..77b68c7129a6 100644 --- a/tools/testing/selftests/vfio/lib/include/libvfio/assert.h +++ b/tools/testing/selftests/vfio/lib/include/libvfio/assert.h @@ -51,4 +51,9 @@ VFIO_ASSERT_EQ(__ret, 0, "ioctl(%s, %s, %s) returned %d\n", #_fd, #_op, #_arg, __ret); \ } while (0) +#define snprintf_assert(_s, _size, _fmt, ...) 
do { \ + int __ret = snprintf(_s, _size, _fmt, ##__VA_ARGS__); \ + VFIO_ASSERT_LT(__ret, _size); \ +} while (0) + #endif /* SELFTESTS_VFIO_LIB_INCLUDE_LIBVFIO_ASSERT_H */ diff --git a/tools/testing/selftests/vfio/lib/include/libvfio/sysfs.h b/tools/testing/selftests/vfio/lib/include/libvfio/sysfs.h new file mode 100644 index 000000000000..c9ab1ea8f5a9 --- /dev/null +++ b/tools/testing/selftests/vfio/lib/include/libvfio/sysfs.h @@ -0,0 +1,12 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +#ifndef SELFTESTS_VFIO_LIB_INCLUDE_LIBVFIO_SYSFS_H +#define SELFTESTS_VFIO_LIB_INCLUDE_LIBVFIO_SYSFS_H + +int sysfs_sriov_totalvfs_get(const char *bdf); +int sysfs_sriov_numvfs_get(const char *bdf); +void sysfs_sriov_numvfs_set(const char *bdf, int numvfs); +char *sysfs_sriov_vf_bdf_get(const char *pf_bdf, int i); +int sysfs_iommu_group_get(const char *bdf); +char *sysfs_driver_get(const char *bdf); + +#endif /* SELFTESTS_VFIO_LIB_INCLUDE_LIBVFIO_SYSFS_H */ diff --git a/tools/testing/selftests/vfio/lib/include/libvfio/vfio_pci_device.h b/tools/testing/selftests/vfio/lib/include/libvfio/vfio_pci_device.h index 2858885a89bb..3eabead717bb 100644 --- a/tools/testing/selftests/vfio/lib/include/libvfio/vfio_pci_device.h +++ b/tools/testing/selftests/vfio/lib/include/libvfio/vfio_pci_device.h @@ -38,6 +38,8 @@ struct vfio_pci_device { #define dev_info(_dev, _fmt, ...) printf("%s: " _fmt, (_dev)->bdf, ##__VA_ARGS__) #define dev_err(_dev, _fmt, ...) 
fprintf(stderr, "%s: " _fmt, (_dev)->bdf, ##__VA_ARGS__) +struct vfio_pci_device *vfio_pci_device_alloc(const char *bdf, struct iommu *iommu); +void vfio_pci_device_free(struct vfio_pci_device *device); struct vfio_pci_device *vfio_pci_device_init(const char *bdf, struct iommu *iommu); void vfio_pci_device_cleanup(struct vfio_pci_device *device); @@ -122,4 +124,13 @@ static inline bool vfio_pci_device_match(struct vfio_pci_device *device, const char *vfio_pci_get_cdev_path(const char *bdf); +void vfio_pci_group_setup(struct vfio_pci_device *device, const char *bdf); +void __vfio_pci_group_get_device_fd(struct vfio_pci_device *device, + const char *bdf, const char *vf_token); +void vfio_container_set_iommu(struct vfio_pci_device *device); +void vfio_pci_cdev_open(struct vfio_pci_device *device, const char *bdf); +int __vfio_device_bind_iommufd(int device_fd, int iommufd, const char *vf_token); + +void vfio_device_set_vf_token(int fd, const char *vf_token); + #endif /* SELFTESTS_VFIO_LIB_INCLUDE_LIBVFIO_VFIO_PCI_DEVICE_H */ diff --git a/tools/testing/selftests/vfio/lib/libvfio.mk b/tools/testing/selftests/vfio/lib/libvfio.mk index 9f47bceed16f..67942b085068 100644 --- a/tools/testing/selftests/vfio/lib/libvfio.mk +++ b/tools/testing/selftests/vfio/lib/libvfio.mk @@ -6,6 +6,7 @@ LIBVFIO_SRCDIR := $(selfdir)/vfio/lib LIBVFIO_C := iommu.c LIBVFIO_C += iova_allocator.c LIBVFIO_C += libvfio.c +LIBVFIO_C += sysfs.c LIBVFIO_C += vfio_pci_device.c LIBVFIO_C += vfio_pci_driver.c @@ -19,11 +20,15 @@ LIBVFIO_OUTPUT := $(OUTPUT)/libvfio LIBVFIO_O := $(patsubst %.c, $(LIBVFIO_OUTPUT)/%.o, $(LIBVFIO_C)) LIBVFIO_O_DIRS := $(shell dirname $(LIBVFIO_O) | uniq) -$(shell mkdir -p $(LIBVFIO_O_DIRS)) + +$(LIBVFIO_O_DIRS): + mkdir -p $@ CFLAGS += -I$(LIBVFIO_SRCDIR)/include -$(LIBVFIO_O): $(LIBVFIO_OUTPUT)/%.o : $(LIBVFIO_SRCDIR)/%.c +LDLIBS += -luuid + +$(LIBVFIO_O): $(LIBVFIO_OUTPUT)/%.o : $(LIBVFIO_SRCDIR)/%.c | $(LIBVFIO_O_DIRS) $(CC) $(CFLAGS) $(CPPFLAGS) $(TARGET_ARCH) -c $< -o $@ 
EXTRA_CLEAN += $(LIBVFIO_OUTPUT) diff --git a/tools/testing/selftests/vfio/lib/sysfs.c b/tools/testing/selftests/vfio/lib/sysfs.c new file mode 100644 index 000000000000..11415448b2e2 --- /dev/null +++ b/tools/testing/selftests/vfio/lib/sysfs.c @@ -0,0 +1,150 @@ +// SPDX-License-Identifier: GPL-2.0-only +#include <fcntl.h> +#include <unistd.h> +#include <stdlib.h> +#include <string.h> +#include <linux/limits.h> + +#include <libvfio.h> + +#define readlink_safe(_path, _buf) ({ \ + int __ret; \ + \ + _Static_assert(!__builtin_types_compatible_p( \ + __typeof__(_buf), char *), \ + "readlink_safe: _buf must be an array, not a pointer"); \ + \ + __ret = readlink(_path, _buf, sizeof(_buf) - 1); \ + if (__ret != -1) \ + _buf[__ret] = 0; \ + __ret; \ +}) + +static void readlink_base(const char *path, const char *data_fmt, void *out_data) +{ + char rl_path[PATH_MAX]; + int ret; + + ret = readlink_safe(path, rl_path); + VFIO_ASSERT_NE(ret, -1); + + ret = sscanf(basename(rl_path), data_fmt, out_data); + VFIO_ASSERT_EQ(ret, 1); +} + +static int sysfs_val_get_int(const char *component, const char *name, + const char *file) +{ + char path[PATH_MAX]; + char buf[32]; + int ret; + int fd; + + snprintf_assert(path, PATH_MAX, "/sys/bus/pci/%s/%s/%s", component, name, file); + fd = open(path, O_RDONLY); + if (fd < 0) + return fd; + + VFIO_ASSERT_GT(read(fd, buf, ARRAY_SIZE(buf)), 0); + VFIO_ASSERT_EQ(close(fd), 0); + + errno = 0; + ret = strtol(buf, NULL, 0); + VFIO_ASSERT_EQ(errno, 0, "sysfs path \"%s\" is not an integer: \"%s\"\n", path, buf); + + return ret; +} + +static void sysfs_val_set(const char *component, const char *name, + const char *file, const char *val) +{ + char path[PATH_MAX]; + int fd; + + snprintf_assert(path, PATH_MAX, "/sys/bus/pci/%s/%s/%s", component, name, file); + VFIO_ASSERT_GT(fd = open(path, O_WRONLY), 0); + + VFIO_ASSERT_EQ(write(fd, val, strlen(val)), strlen(val)); + VFIO_ASSERT_EQ(close(fd), 0); +} + +static int sysfs_device_val_get(const char *bdf, 
const char *file) +{ + return sysfs_val_get_int("devices", bdf, file); +} + +static void sysfs_device_val_set(const char *bdf, const char *file, const char *val) +{ + sysfs_val_set("devices", bdf, file, val); +} + +static void sysfs_device_val_set_int(const char *bdf, const char *file, int val) +{ + char val_str[32]; + + snprintf_assert(val_str, sizeof(val_str), "%d", val); + sysfs_device_val_set(bdf, file, val_str); +} + +int sysfs_sriov_totalvfs_get(const char *bdf) +{ + return sysfs_device_val_get(bdf, "sriov_totalvfs"); +} + +int sysfs_sriov_numvfs_get(const char *bdf) +{ + return sysfs_device_val_get(bdf, "sriov_numvfs"); +} + +void sysfs_sriov_numvfs_set(const char *bdf, int numvfs) +{ + sysfs_device_val_set_int(bdf, "sriov_numvfs", numvfs); +} + +char *sysfs_sriov_vf_bdf_get(const char *pf_bdf, int i) +{ + char path[PATH_MAX]; + char *out_vf_bdf; + + /* Fit "0000:00:00.0" */ + out_vf_bdf = calloc(16, sizeof(char)); + VFIO_ASSERT_NOT_NULL(out_vf_bdf); + + snprintf_assert(path, PATH_MAX, "/sys/bus/pci/devices/%s/virtfn%d", pf_bdf, i); + readlink_base(path, "%s", out_vf_bdf); + + return out_vf_bdf; +} + +int sysfs_iommu_group_get(const char *bdf) +{ + char path[PATH_MAX]; + int group; + + snprintf_assert(path, PATH_MAX, "/sys/bus/pci/devices/%s/iommu_group", bdf); + readlink_base(path, "%d", &group); + + return group; +} + +char *sysfs_driver_get(const char *bdf) +{ + char driver_path[PATH_MAX]; + char path[PATH_MAX]; + char *out_driver; + int ret; + + snprintf_assert(path, PATH_MAX, "/sys/bus/pci/devices/%s/driver", bdf); + ret = readlink_safe(path, driver_path); + if (ret == -1) { + if (errno == ENOENT) + return NULL; + + VFIO_FAIL("Failed to read %s\n", path); + } + + out_driver = strdup(basename(driver_path)); + VFIO_ASSERT_NOT_NULL(out_driver); + + return out_driver; +} diff --git a/tools/testing/selftests/vfio/lib/vfio_pci_device.c b/tools/testing/selftests/vfio/lib/vfio_pci_device.c index fc75e04ef010..94dc5fcecbeb 100644 --- 
a/tools/testing/selftests/vfio/lib/vfio_pci_device.c +++ b/tools/testing/selftests/vfio/lib/vfio_pci_device.c @@ -22,11 +22,11 @@ #include <linux/types.h> #include <linux/vfio.h> +#include <uuid/uuid.h> + #include "kselftest.h" #include <libvfio.h> -#define PCI_SYSFS_PATH "/sys/bus/pci/devices" - static void vfio_pci_irq_set(struct vfio_pci_device *device, u32 index, u32 vector, u32 count, int *fds) { @@ -115,6 +115,40 @@ static void vfio_pci_irq_get(struct vfio_pci_device *device, u32 index, ioctl_assert(device->fd, VFIO_DEVICE_GET_IRQ_INFO, irq_info); } +static int vfio_device_feature_ioctl(int fd, u32 flags, void *data, + size_t data_size) +{ + u8 buffer[sizeof(struct vfio_device_feature) + data_size] = {}; + struct vfio_device_feature *feature = (void *)buffer; + + memcpy(feature->data, data, data_size); + + feature->argsz = sizeof(buffer); + feature->flags = flags; + + return ioctl(fd, VFIO_DEVICE_FEATURE, feature); +} + +static void vfio_device_feature_set(int fd, u16 feature, void *data, size_t data_size) +{ + u32 flags = VFIO_DEVICE_FEATURE_SET | feature; + int ret; + + ret = vfio_device_feature_ioctl(fd, flags, data, data_size); + VFIO_ASSERT_EQ(ret, 0, "Failed to set feature %u\n", feature); +} + +void vfio_device_set_vf_token(int fd, const char *vf_token) +{ + uuid_t token_uuid = {0}; + + VFIO_ASSERT_NOT_NULL(vf_token, "vf_token is NULL"); + VFIO_ASSERT_EQ(uuid_parse(vf_token, token_uuid), 0); + + vfio_device_feature_set(fd, VFIO_DEVICE_FEATURE_PCI_VF_TOKEN, + token_uuid, sizeof(uuid_t)); +} + static void vfio_pci_region_get(struct vfio_pci_device *device, int index, struct vfio_region_info *info) { @@ -204,25 +238,7 @@ void vfio_pci_device_reset(struct vfio_pci_device *device) ioctl_assert(device->fd, VFIO_DEVICE_RESET, NULL); } -static unsigned int vfio_pci_get_group_from_dev(const char *bdf) -{ - char dev_iommu_group_path[PATH_MAX] = {0}; - char sysfs_path[PATH_MAX] = {0}; - unsigned int group; - int ret; - - snprintf(sysfs_path, PATH_MAX, 
"%s/%s/iommu_group", PCI_SYSFS_PATH, bdf); - - ret = readlink(sysfs_path, dev_iommu_group_path, sizeof(dev_iommu_group_path)); - VFIO_ASSERT_NE(ret, -1, "Failed to get the IOMMU group for device: %s\n", bdf); - - ret = sscanf(basename(dev_iommu_group_path), "%u", &group); - VFIO_ASSERT_EQ(ret, 1, "Failed to get the IOMMU group for device: %s\n", bdf); - - return group; -} - -static void vfio_pci_group_setup(struct vfio_pci_device *device, const char *bdf) +void vfio_pci_group_setup(struct vfio_pci_device *device, const char *bdf) { struct vfio_group_status group_status = { .argsz = sizeof(group_status), @@ -230,8 +246,8 @@ static void vfio_pci_group_setup(struct vfio_pci_device *device, const char *bdf char group_path[32]; int group; - group = vfio_pci_get_group_from_dev(bdf); - snprintf(group_path, sizeof(group_path), "/dev/vfio/%d", group); + group = sysfs_iommu_group_get(bdf); + snprintf_assert(group_path, sizeof(group_path), "/dev/vfio/%d", group); device->group_fd = open(group_path, O_RDWR); VFIO_ASSERT_GE(device->group_fd, 0, "open(%s) failed\n", group_path); @@ -242,14 +258,37 @@ static void vfio_pci_group_setup(struct vfio_pci_device *device, const char *bdf ioctl_assert(device->group_fd, VFIO_GROUP_SET_CONTAINER, &device->iommu->container_fd); } -static void vfio_pci_container_setup(struct vfio_pci_device *device, const char *bdf) +void __vfio_pci_group_get_device_fd(struct vfio_pci_device *device, + const char *bdf, const char *vf_token) +{ + char arg[64]; + + /* + * If a vf_token exists, argument to VFIO_GROUP_GET_DEVICE_FD + * will be in the form of the following example: + * "0000:04:10.0 vf_token=bd8d9d2b-5a5f-4f5a-a211-f591514ba1f3" + */ + if (vf_token) + snprintf_assert(arg, ARRAY_SIZE(arg), "%s vf_token=%s", bdf, vf_token); + else + snprintf_assert(arg, ARRAY_SIZE(arg), "%s", bdf); + + device->fd = ioctl(device->group_fd, VFIO_GROUP_GET_DEVICE_FD, arg); +} + +static void vfio_pci_group_get_device_fd(struct vfio_pci_device *device, + const char 
*bdf, const char *vf_token) +{ + __vfio_pci_group_get_device_fd(device, bdf, vf_token); + VFIO_ASSERT_GE(device->fd, 0); +} + +void vfio_container_set_iommu(struct vfio_pci_device *device) { struct iommu *iommu = device->iommu; unsigned long iommu_type = iommu->mode->iommu_type; int ret; - vfio_pci_group_setup(device, bdf); - ret = ioctl(iommu->container_fd, VFIO_CHECK_EXTENSION, iommu_type); VFIO_ASSERT_GT(ret, 0, "VFIO IOMMU type %lu not supported\n", iommu_type); @@ -259,9 +298,14 @@ static void vfio_pci_container_setup(struct vfio_pci_device *device, const char * because the IOMMU type is already set. */ (void)ioctl(iommu->container_fd, VFIO_SET_IOMMU, (void *)iommu_type); +} - device->fd = ioctl(device->group_fd, VFIO_GROUP_GET_DEVICE_FD, bdf); - VFIO_ASSERT_GE(device->fd, 0); +static void vfio_pci_container_setup(struct vfio_pci_device *device, + const char *bdf, const char *vf_token) +{ + vfio_pci_group_setup(device, bdf); + vfio_container_set_iommu(device); + vfio_pci_group_get_device_fd(device, bdf, vf_token); } static void vfio_pci_device_setup(struct vfio_pci_device *device) @@ -302,7 +346,7 @@ const char *vfio_pci_get_cdev_path(const char *bdf) cdev_path = calloc(PATH_MAX, 1); VFIO_ASSERT_NOT_NULL(cdev_path); - snprintf(dir_path, sizeof(dir_path), "/sys/bus/pci/devices/%s/vfio-dev/", bdf); + snprintf_assert(dir_path, sizeof(dir_path), "/sys/bus/pci/devices/%s/vfio-dev/", bdf); dir = opendir(dir_path); VFIO_ASSERT_NOT_NULL(dir, "Failed to open directory %s\n", dir_path); @@ -312,7 +356,7 @@ const char *vfio_pci_get_cdev_path(const char *bdf) if (strncmp("vfio", entry->d_name, 4)) continue; - snprintf(cdev_path, PATH_MAX, "/dev/vfio/devices/%s", entry->d_name); + snprintf_assert(cdev_path, PATH_MAX, "/dev/vfio/devices/%s", entry->d_name); break; } @@ -322,14 +366,32 @@ const char *vfio_pci_get_cdev_path(const char *bdf) return cdev_path; } -static void vfio_device_bind_iommufd(int device_fd, int iommufd) +int __vfio_device_bind_iommufd(int device_fd, int 
iommufd, const char *vf_token) { struct vfio_device_bind_iommufd args = { .argsz = sizeof(args), .iommufd = iommufd, }; + uuid_t token_uuid; + + if (vf_token) { + VFIO_ASSERT_EQ(uuid_parse(vf_token, token_uuid), 0); + args.flags |= VFIO_DEVICE_BIND_FLAG_TOKEN; + args.token_uuid_ptr = (u64)token_uuid; + } + + if (ioctl(device_fd, VFIO_DEVICE_BIND_IOMMUFD, &args)) + return -errno; + + return 0; +} + +static void vfio_device_bind_iommufd(int device_fd, int iommufd, + const char *vf_token) +{ + int ret = __vfio_device_bind_iommufd(device_fd, iommufd, vf_token); - ioctl_assert(device_fd, VFIO_DEVICE_BIND_IOMMUFD, &args); + VFIO_ASSERT_EQ(ret, 0, "Failed VFIO_DEVICE_BIND_IOMMUFD ioctl\n"); } static void vfio_device_attach_iommufd_pt(int device_fd, u32 pt_id) @@ -342,19 +404,24 @@ static void vfio_device_attach_iommufd_pt(int device_fd, u32 pt_id) ioctl_assert(device_fd, VFIO_DEVICE_ATTACH_IOMMUFD_PT, &args); } -static void vfio_pci_iommufd_setup(struct vfio_pci_device *device, const char *bdf) +void vfio_pci_cdev_open(struct vfio_pci_device *device, const char *bdf) { const char *cdev_path = vfio_pci_get_cdev_path(bdf); device->fd = open(cdev_path, O_RDWR); VFIO_ASSERT_GE(device->fd, 0); free((void *)cdev_path); +} - vfio_device_bind_iommufd(device->fd, device->iommu->iommufd); +static void vfio_pci_iommufd_setup(struct vfio_pci_device *device, + const char *bdf, const char *vf_token) +{ + vfio_pci_cdev_open(device, bdf); + vfio_device_bind_iommufd(device->fd, device->iommu->iommufd, vf_token); vfio_device_attach_iommufd_pt(device->fd, device->iommu->ioas_id); } -struct vfio_pci_device *vfio_pci_device_init(const char *bdf, struct iommu *iommu) +struct vfio_pci_device *vfio_pci_device_alloc(const char *bdf, struct iommu *iommu) { struct vfio_pci_device *device; @@ -365,10 +432,24 @@ struct vfio_pci_device *vfio_pci_device_init(const char *bdf, struct iommu *iomm device->iommu = iommu; device->bdf = bdf; + return device; +} + +void vfio_pci_device_free(struct 
vfio_pci_device *device) +{ + free(device); +} + +struct vfio_pci_device *vfio_pci_device_init(const char *bdf, struct iommu *iommu) +{ + struct vfio_pci_device *device; + + device = vfio_pci_device_alloc(bdf, iommu); + if (iommu->mode->container_path) - vfio_pci_container_setup(device, bdf); + vfio_pci_container_setup(device, bdf, NULL); else - vfio_pci_iommufd_setup(device, bdf); + vfio_pci_iommufd_setup(device, bdf, NULL); vfio_pci_device_setup(device); vfio_pci_driver_probe(device); @@ -397,5 +478,5 @@ void vfio_pci_device_cleanup(struct vfio_pci_device *device) if (device->group_fd) VFIO_ASSERT_EQ(close(device->group_fd), 0); - free(device); + vfio_pci_device_free(device); } diff --git a/tools/testing/selftests/vfio/vfio_dma_mapping_test.c b/tools/testing/selftests/vfio/vfio_dma_mapping_test.c index abb170bdcef7..7d0de8c79de1 100644 --- a/tools/testing/selftests/vfio/vfio_dma_mapping_test.c +++ b/tools/testing/selftests/vfio/vfio_dma_mapping_test.c @@ -44,9 +44,9 @@ static int intel_iommu_mapping_get(const char *bdf, u64 iova, FILE *file; char *rest; - snprintf(iommu_mapping_path, sizeof(iommu_mapping_path), - "/sys/kernel/debug/iommu/intel/%s/domain_translation_struct", - bdf); + snprintf_assert(iommu_mapping_path, sizeof(iommu_mapping_path), + "/sys/kernel/debug/iommu/intel/%s/domain_translation_struct", + bdf); printf("Searching for IOVA 0x%lx in %s\n", iova, iommu_mapping_path); diff --git a/tools/testing/selftests/vfio/vfio_pci_device_test.c b/tools/testing/selftests/vfio/vfio_pci_device_test.c index 7c0fe8ce3a61..93c11fd5e081 100644 --- a/tools/testing/selftests/vfio/vfio_pci_device_test.c +++ b/tools/testing/selftests/vfio/vfio_pci_device_test.c @@ -39,16 +39,17 @@ FIXTURE_TEARDOWN(vfio_pci_device_test) iommu_cleanup(self->iommu); } -#define read_pci_id_from_sysfs(_file) ({ \ - char __sysfs_path[PATH_MAX]; \ - char __buf[32]; \ - int __fd; \ - \ - snprintf(__sysfs_path, PATH_MAX, "/sys/bus/pci/devices/%s/%s", device_bdf, _file); \ - ASSERT_GT((__fd = 
open(__sysfs_path, O_RDONLY)), 0); \ - ASSERT_GT(read(__fd, __buf, ARRAY_SIZE(__buf)), 0); \ - ASSERT_EQ(0, close(__fd)); \ - (u16)strtoul(__buf, NULL, 0); \ +#define read_pci_id_from_sysfs(_file) ({ \ + char __sysfs_path[PATH_MAX]; \ + char __buf[32]; \ + int __fd; \ + \ + snprintf_assert(__sysfs_path, PATH_MAX, "/sys/bus/pci/devices/%s/%s", \ + device_bdf, _file); \ + ASSERT_GT((__fd = open(__sysfs_path, O_RDONLY)), 0); \ + ASSERT_GT(read(__fd, __buf, ARRAY_SIZE(__buf)), 0); \ + ASSERT_EQ(0, close(__fd)); \ + (u16)strtoul(__buf, NULL, 0); \ }) TEST_F(vfio_pci_device_test, config_space_read_write) diff --git a/tools/testing/selftests/vfio/vfio_pci_sriov_uapi_test.c b/tools/testing/selftests/vfio/vfio_pci_sriov_uapi_test.c new file mode 100644 index 000000000000..19d657d00b75 --- /dev/null +++ b/tools/testing/selftests/vfio/vfio_pci_sriov_uapi_test.c @@ -0,0 +1,217 @@ +// SPDX-License-Identifier: GPL-2.0-only +#include "lib/include/libvfio/assert.h" +#include <fcntl.h> +#include <unistd.h> +#include <stdlib.h> +#include <sys/ioctl.h> +#include <linux/limits.h> + +#include <libvfio.h> + +#include "../kselftest_harness.h" + +#define UUID_1 "52ac9bff-3a88-4fbd-901a-0d767c3b6c97" +#define UUID_2 "88594674-90a0-47a9-aea8-9d9b352ac08a" + +static const char *pf_bdf; +static char *vf_bdf; + +static pid_t main_pid; + +static int container_setup(struct vfio_pci_device *device, const char *bdf, + const char *vf_token) +{ + vfio_pci_group_setup(device, bdf); + vfio_container_set_iommu(device); + __vfio_pci_group_get_device_fd(device, bdf, vf_token); + + /* The device fd will be -1 in case of mismatched tokens */ + return (device->fd < 0); +} + +static int iommufd_setup(struct vfio_pci_device *device, const char *bdf, + const char *vf_token) +{ + vfio_pci_cdev_open(device, bdf); + return __vfio_device_bind_iommufd(device->fd, + device->iommu->iommufd, vf_token); +} + +static int device_init(const char *bdf, struct iommu *iommu, + const char *vf_token, struct vfio_pci_device 
**out_dev) +{ + struct vfio_pci_device *device = vfio_pci_device_alloc(bdf, iommu); + int ret; + + if (iommu->mode->container_path) + ret = container_setup(device, bdf, vf_token); + else + ret = iommufd_setup(device, bdf, vf_token); + + *out_dev = device; + return ret; +} + +static void device_cleanup(struct vfio_pci_device *device) +{ + if (!device) + return; + + if (device->fd > 0) + VFIO_ASSERT_EQ(close(device->fd), 0); + + if (device->group_fd) + VFIO_ASSERT_EQ(close(device->group_fd), 0); + + vfio_pci_device_free(device); +} + +FIXTURE(vfio_pci_sriov_uapi_test) { + struct vfio_pci_device *pf; + struct vfio_pci_device *vf; + struct iommu *iommu; + char *pf_token; +}; + +FIXTURE_VARIANT(vfio_pci_sriov_uapi_test) { + const char *iommu_mode; + char *vf_token; +}; + +#define FIXTURE_VARIANT_ADD_IOMMU_MODE(_iommu_mode, _name, _vf_token) \ +FIXTURE_VARIANT_ADD(vfio_pci_sriov_uapi_test, _iommu_mode ## _ ## _name) { \ + .iommu_mode = #_iommu_mode, \ + .vf_token = (_vf_token), \ +} + +FIXTURE_VARIANT_ADD_ALL_IOMMU_MODES(same_uuid, UUID_1); +FIXTURE_VARIANT_ADD_ALL_IOMMU_MODES(diff_uuid, UUID_2); +FIXTURE_VARIANT_ADD_ALL_IOMMU_MODES(null_uuid, NULL); + +FIXTURE_SETUP(vfio_pci_sriov_uapi_test) +{ + self->iommu = iommu_init(variant->iommu_mode); + + self->pf_token = UUID_1; + ASSERT_EQ(device_init(pf_bdf, self->iommu, self->pf_token, &self->pf), 0); +} + +FIXTURE_TEARDOWN(vfio_pci_sriov_uapi_test) +{ + device_cleanup(self->vf); + device_cleanup(self->pf); + iommu_cleanup(self->iommu); +} + +/* + * This asserts if the VF device is successfully created if its token matches + * with the token used to create/override the PF or fails during a mismatch. + */ +#define ASSERT_COND_VF_CREATION(_ret) do { \ + if (!variant->vf_token || strcmp(self->pf_token, variant->vf_token)) { \ + ASSERT_NE((_ret), 0); \ + } else { \ + ASSERT_EQ((_ret), 0); \ + } \ +} while (0) + +/* + * Validate if the UAPI handles correctly and incorrectly set token on the VF. 
+ */ +TEST_F(vfio_pci_sriov_uapi_test, init_token_match) +{ + int ret; + + ret = device_init(vf_bdf, self->iommu, variant->vf_token, &self->vf); + ASSERT_COND_VF_CREATION(ret); +} + +/* + * After closing the PF, validate if the VF access still needs the right token. + */ +TEST_F(vfio_pci_sriov_uapi_test, pf_early_close) +{ + int ret; + + device_cleanup(self->pf); + + /* Clean the 'pf' to avoid calling device_cleanup() again. */ + self->pf = NULL; + + ret = device_init(vf_bdf, self->iommu, variant->vf_token, &self->vf); + ASSERT_COND_VF_CREATION(ret); +} + +/* + * After PF device init, override the existing token and validate if the newly + * set token is the one that's active. + */ +TEST_F(vfio_pci_sriov_uapi_test, override_token) +{ + int ret; + + self->pf_token = UUID_2; + vfio_device_set_vf_token(self->pf->fd, self->pf_token); + + ret = device_init(vf_bdf, self->iommu, variant->vf_token, &self->vf); + ASSERT_COND_VF_CREATION(ret); +} + +static void vf_teardown(void) +{ + /* + * The child processes, created by TEST_F()s, inherits this atexit() + * handler. Hence, check and destroy the VF only when the main/parent + * process exits. + */ + if (getpid() != main_pid) + return; + + free(vf_bdf); + sysfs_sriov_numvfs_set(pf_bdf, 0); +} + +static void vf_setup(void) +{ + char *vf_driver; + int nr_vfs; + + nr_vfs = sysfs_sriov_totalvfs_get(pf_bdf); + if (nr_vfs <= 0) + ksft_exit_skip("SR-IOV may not be supported by the PF: %s\n", pf_bdf); + + nr_vfs = sysfs_sriov_numvfs_get(pf_bdf); + if (nr_vfs != 0) + ksft_exit_skip("SR-IOV already configured for the PF: %s\n", pf_bdf); + + /* Create only one VF for testing */ + sysfs_sriov_numvfs_set(pf_bdf, 1); + + /* + * Setup an exit handler to destroy the VF in case of failures + * during further setup at the end of the test run. + */ + main_pid = getpid(); + VFIO_ASSERT_EQ(atexit(vf_teardown), 0); + + vf_bdf = sysfs_sriov_vf_bdf_get(pf_bdf, 0); + + /* + * The VF inherits the driver from the PF. 
+ * Ensure this is 'vfio-pci' before proceeding. + */ + vf_driver = sysfs_driver_get(vf_bdf); + VFIO_ASSERT_NE(vf_driver, NULL); + VFIO_ASSERT_EQ(strcmp(vf_driver, "vfio-pci"), 0); + free(vf_driver); + + printf("Created 1 VF (%s) under the PF: %s\n", vf_bdf, pf_bdf); +} + +int main(int argc, char *argv[]) +{ + pf_bdf = vfio_selftests_get_bdf(&argc, argv); + vf_setup(); + + return test_harness_run(argc, argv); +} diff --git a/tools/testing/selftests/vsock/vmtest.sh b/tools/testing/selftests/vsock/vmtest.sh index d97913a6bdc7..310dfc2a39ad 100755 --- a/tools/testing/selftests/vsock/vmtest.sh +++ b/tools/testing/selftests/vsock/vmtest.sh @@ -330,27 +330,34 @@ check_netns() { return 0 } +# Compare MAJOR.MINOR versions numerically. Returns 0 (true) if $1 < $2. +version_lt() { + local -a a=(${1//./ }) + local -a b=(${2//./ }) + + if [[ "${a[0]}" -lt "${b[0]}" ]]; then + return 0 + elif [[ "${a[0]}" -gt "${b[0]}" ]]; then + return 1 + elif [[ "${a[1]}" -lt "${b[1]}" ]]; then + return 0 + fi + + return 1 +} + check_vng() { - local tested_versions local version - local ok - tested_versions=("1.33" "1.36" "1.37") - version="$(vng --version)" + version="$(vng --version | awk '{print $2}')" - ok=0 - for tv in "${tested_versions[@]}"; do - if [[ "${version}" == *"${tv}"* ]]; then - ok=1 - break - fi - done - - if [[ ! "${ok}" -eq 1 ]]; then - printf "warning: vng version '%s' has not been tested and may " "${version}" >&2 - printf "not function properly.\n\tThe following versions have been tested: " >&2 - echo "${tested_versions[@]}" >&2 + # Supported: 1.33, or any version >= 1.36. 1.34 and 1.35 are untested. + if [[ "${version}" == "1.33" ]] || ! version_lt "${version}" "1.36"; then + return fi + + printf "warning: vng version '%s' has not been tested and may " "${version}" >&2 + printf "not function properly.\n\tSupported: 1.33 or >= 1.36\n" >&2 } check_socat() { @@ -438,8 +445,14 @@ vng_dry_run() { # stopped with SIGTTOU and hangs until kselftest's timer expires. 
# setsid works around this by launching vng in a new session that has # no controlling terminal, so tcsetattr() succeeds. + # + # Fixed in 1.41 (https://github.com/arighi/virtme-ng/pull/453). - setsid -w vng --run "$@" --dry-run &>/dev/null + if version_lt "$(vng --version | awk '{print $2}')" "1.41"; then + setsid -w vng --run "$@" --dry-run &>/dev/null + else + vng --run "$@" --dry-run &>/dev/null + fi } vm_start() { diff --git a/tools/testing/shared/linux.c b/tools/testing/shared/linux.c index 8c7257155958..e0a0693df08f 100644 --- a/tools/testing/shared/linux.c +++ b/tools/testing/shared/linux.c @@ -154,7 +154,7 @@ void kmem_cache_shrink(struct kmem_cache *cachep) { } -int kmem_cache_alloc_bulk(struct kmem_cache *cachep, gfp_t gfp, size_t size, +bool kmem_cache_alloc_bulk(struct kmem_cache *cachep, gfp_t gfp, size_t size, void **p) { size_t i; @@ -213,7 +213,7 @@ int kmem_cache_alloc_bulk(struct kmem_cache *cachep, gfp_t gfp, size_t size, pthread_mutex_unlock(&cachep->lock); if (cachep->callback) cachep->exec_callback = true; - return 0; + return false; } for (i = 0; i < size; i++) { @@ -224,7 +224,7 @@ int kmem_cache_alloc_bulk(struct kmem_cache *cachep, gfp_t gfp, size_t size, printf("Allocating %p from slab\n", p[i]); } - return size; + return true; } struct kmem_cache * @@ -271,8 +271,8 @@ kmem_cache_prefill_sheaf(struct kmem_cache *s, gfp_t gfp, unsigned int size) sheaf->cache = s; sheaf->capacity = capacity; - sheaf->size = kmem_cache_alloc_bulk(s, gfp, size, sheaf->objects); - if (!sheaf->size) { + sheaf->size = size; + if (!kmem_cache_alloc_bulk(s, gfp, size, sheaf->objects)) { free(sheaf); return NULL; } @@ -284,7 +284,6 @@ int kmem_cache_refill_sheaf(struct kmem_cache *s, gfp_t gfp, struct slab_sheaf **sheafp, unsigned int size) { struct slab_sheaf *sheaf = *sheafp; - int refill; if (sheaf->size >= size) return 0; @@ -299,12 +298,10 @@ int kmem_cache_refill_sheaf(struct kmem_cache *s, gfp_t gfp, return 0; } - refill = kmem_cache_alloc_bulk(s, gfp, size 
- sheaf->size, - &sheaf->objects[sheaf->size]); - if (!refill) + if (!kmem_cache_alloc_bulk(s, gfp, size - sheaf->size, + &sheaf->objects[sheaf->size])) return -ENOMEM; - - sheaf->size += refill; + sheaf->size = size; return 0; } diff --git a/tools/testing/vma/include/dup.h b/tools/testing/vma/include/dup.h index 9e0dfd3a85b0..bf26b3f48d3a 100644 --- a/tools/testing/vma/include/dup.h +++ b/tools/testing/vma/include/dup.h @@ -483,23 +483,10 @@ struct mmap_action { enum mmap_action_type type; /* - * If specified, this hook is invoked after the selected action has been - * successfully completed. Note that the VMA write lock still held. - * - * The absolute minimum ought to be done here. - * - * Returns 0 on success, or an error code. - */ - int (*success_hook)(const struct vm_area_struct *vma); - - /* - * If specified, this hook is invoked when an error occurred when - * attempting the selection action. - * - * The hook can return an error code in order to filter the error, but - * it is not valid to clear the error here. + * If non-zero, replace errors that arise from mmap actions with this + * value instead. Only valid error codes may be specified. */ - int (*error_hook)(int err); + int error_override; /* * This should be set in rare instances where the operation required @@ -1303,6 +1290,7 @@ static inline void compat_set_desc_from_vma(struct vm_area_desc *desc, desc->vm_file = vma->vm_file; desc->vma_flags = vma->flags; desc->page_prot = vma->vm_page_prot; + desc->vm_ops = vma->vm_ops; /* Default. */ desc->action.type = MMAP_NOTHING; |
