summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
author Paolo Abeni <pabeni@redhat.com> 2025-09-18 11:37:25 +0200
committer Paolo Abeni <pabeni@redhat.com> 2025-09-18 11:37:25 +0200
commit b332fb727509c0d38e18dd16f36d166f1c9ede88 (patch)
tree fdd3aa3fe7e47032ff884ac6492591934aee35ba
parent 18cfe3c1a121c275fb0a86dd7b0c049c6dd0a038 (diff)
parent e6afcd60c26fca227c700825a94020209970c05e (diff)
Merge branch 'eth-fbnic-add-devlink-health-support-for-fw-crashes-and-otp-mem-corruptions'
Jakub Kicinski says:

====================
eth: fbnic: add devlink health support for FW crashes and OTP mem corruptions

Add support for FW crash detection and a corresponding devlink health
reporter. Add a reporter for checking OTP memory health.

The output is not particularly exciting:

  # devlink health show
  pci/0000:01:00.0:
    reporter fw
      state healthy error 0 recover 0 auto_dump true
    reporter otp
      state healthy error 0 recover 0 auto_dump true
  # devlink health diagnose pci/0000:01:00.0 reporter fw
   FW uptime: 0
  # devlink health dump show pci/0000:01:00.0 reporter fw
   FW coredump:
      5a 45 01 00 04 00 06 00 00 00 00 00 4d 01 00 d0
      .. lots of hex follows ..
  # devlink health dump show pci/0000:01:00.0 reporter otp
   OTP:
     Status: 0  Data: 0  ECC: 0

v2: https://lore.kernel.org/20250915155312.1083292-1-kuba@kernel.org
v1: https://lore.kernel.org/20250912201428.566190-1-kuba@kernel.org
====================

Link: https://patch.msgid.link/20250916231420.1693955-1-kuba@kernel.org
Signed-off-by: Paolo Abeni <pabeni@redhat.com>
-rw-r--r-- Documentation/networking/device_drivers/ethernet/meta/fbnic.rst 19
-rw-r--r-- drivers/net/ethernet/meta/fbnic/fbnic.h 13
-rw-r--r-- drivers/net/ethernet/meta/fbnic/fbnic_csr.h 18
-rw-r--r-- drivers/net/ethernet/meta/fbnic/fbnic_devlink.c 249
-rw-r--r-- drivers/net/ethernet/meta/fbnic/fbnic_fw.c 241
-rw-r--r-- drivers/net/ethernet/meta/fbnic/fbnic_fw.h 47
-rw-r--r-- drivers/net/ethernet/meta/fbnic/fbnic_fw_log.c 2
-rw-r--r-- drivers/net/ethernet/meta/fbnic/fbnic_fw_log.h 2
-rw-r--r-- drivers/net/ethernet/meta/fbnic/fbnic_pci.c 39
-rw-r--r-- drivers/net/ethernet/meta/fbnic/fbnic_rpc.c 57
10 files changed, 659 insertions, 28 deletions
diff --git a/Documentation/networking/device_drivers/ethernet/meta/fbnic.rst b/Documentation/networking/device_drivers/ethernet/meta/fbnic.rst
index fb6559fa4be4..1e82f90d9ad2 100644
--- a/Documentation/networking/device_drivers/ethernet/meta/fbnic.rst
+++ b/Documentation/networking/device_drivers/ethernet/meta/fbnic.rst
@@ -69,6 +69,25 @@ On host boot the latest UEFI driver is always used, no explicit activation
is required. Firmware activation is required to run new control firmware. cmrt
firmware can only be activated by power cycling the NIC.
+Health reporters
+----------------
+
+fw reporter
+~~~~~~~~~~~
+
+The ``fw`` health reporter tracks FW crashes. Dumping the reporter will
+show the core dump of the most recent FW crash or, if no FW crash has
+happened since the last power cycle, a snapshot of the FW memory. The
+diagnose callback shows FW uptime based on the most recently received
+heartbeat message (crashes are detected by checking if uptime goes down).
+
+otp reporter
+~~~~~~~~~~~~
+
+OTP memory ("fuses") is used for secure boot and anti-rollback
+protection. The OTP memory is ECC protected; ECC errors indicate
+either a manufacturing defect or the part deteriorating with age.
+
Statistics
----------
diff --git a/drivers/net/ethernet/meta/fbnic/fbnic.h b/drivers/net/ethernet/meta/fbnic/fbnic.h
index 311c7dda911a..b03e5a3d5144 100644
--- a/drivers/net/ethernet/meta/fbnic/fbnic.h
+++ b/drivers/net/ethernet/meta/fbnic/fbnic.h
@@ -27,6 +27,8 @@ struct fbnic_dev {
struct net_device *netdev;
struct dentry *dbg_fbd;
struct device *hwmon;
+ struct devlink_health_reporter *fw_reporter;
+ struct devlink_health_reporter *otp_reporter;
u32 __iomem *uc_addr0;
u32 __iomem *uc_addr4;
@@ -84,6 +86,10 @@ struct fbnic_dev {
/* Local copy of hardware statistics */
struct fbnic_hw_stats hw_stats;
+ /* Firmware time since boot in milliseconds */
+ u64 firmware_time;
+ u64 prev_firmware_time;
+
struct fbnic_fw_log fw_log;
};
@@ -155,8 +161,13 @@ extern char fbnic_driver_name[];
void fbnic_devlink_free(struct fbnic_dev *fbd);
struct fbnic_dev *fbnic_devlink_alloc(struct pci_dev *pdev);
+int fbnic_devlink_health_create(struct fbnic_dev *fbd);
+void fbnic_devlink_health_destroy(struct fbnic_dev *fbd);
void fbnic_devlink_register(struct fbnic_dev *fbd);
void fbnic_devlink_unregister(struct fbnic_dev *fbd);
+void __printf(2, 3)
+fbnic_devlink_fw_report(struct fbnic_dev *fbd, const char *format, ...);
+void fbnic_devlink_otp_check(struct fbnic_dev *fbd, const char *msg);
int fbnic_fw_request_mbx(struct fbnic_dev *fbd);
void fbnic_fw_free_mbx(struct fbnic_dev *fbd);
@@ -187,6 +198,8 @@ void fbnic_dbg_fbd_exit(struct fbnic_dev *fbd);
void fbnic_dbg_init(void);
void fbnic_dbg_exit(void);
+void fbnic_rpc_reset_valid_entries(struct fbnic_dev *fbd);
+
void fbnic_csr_get_regs(struct fbnic_dev *fbd, u32 *data, u32 *regs_version);
int fbnic_csr_regs_len(struct fbnic_dev *fbd);
diff --git a/drivers/net/ethernet/meta/fbnic/fbnic_csr.h b/drivers/net/ethernet/meta/fbnic/fbnic_csr.h
index e2fffe1597e9..d3a7ad921f18 100644
--- a/drivers/net/ethernet/meta/fbnic/fbnic_csr.h
+++ b/drivers/net/ethernet/meta/fbnic/fbnic_csr.h
@@ -1178,4 +1178,22 @@ enum {
#define FBNIC_IPC_MBX_DESC_FW_CMPL DESC_BIT(1)
#define FBNIC_IPC_MBX_DESC_HOST_CMPL DESC_BIT(0)
+/* OTP Registers
+ * These registers are accessible via bar4 offset and are written by CMRT
+ * on boot. For the write status, the register is broken up in half with OTP
+ * Write Data Status occupying the top 16 bits and the ECC status occupying the
+ * bottom 16 bits.
+ */
+#define FBNIC_NS_OTP_STATUS 0x0021d
+#define FBNIC_NS_OTP_WRITE_STATUS 0x0021e
+
+#define FBNIC_NS_OTP_WRITE_DATA_STATUS_MASK CSR_GENMASK(31, 16)
+#define FBNIC_NS_OTP_WRITE_ECC_STATUS_MASK CSR_GENMASK(15, 0)
+
+#define FBNIC_REGS_VERSION CSR_GENMASK(31, 16)
+#define FBNIC_REGS_HW_TYPE CSR_GENMASK(15, 8)
+enum{
+ FBNIC_CSR_VERSION_V1_0_ASIC = 1,
+};
+
#endif /* _FBNIC_CSR_H_ */
diff --git a/drivers/net/ethernet/meta/fbnic/fbnic_devlink.c b/drivers/net/ethernet/meta/fbnic/fbnic_devlink.c
index c5f81f139e7e..b62b1d5b1453 100644
--- a/drivers/net/ethernet/meta/fbnic/fbnic_devlink.c
+++ b/drivers/net/ethernet/meta/fbnic/fbnic_devlink.c
@@ -8,6 +8,7 @@
#include <net/devlink.h>
#include "fbnic.h"
+#include "fbnic_fw.h"
#include "fbnic_tlv.h"
#define FBNIC_SN_STR_LEN 24
@@ -369,6 +370,254 @@ static const struct devlink_ops fbnic_devlink_ops = {
.flash_update = fbnic_devlink_flash_update,
};
+static int fbnic_fw_reporter_dump(struct devlink_health_reporter *reporter,
+ struct devlink_fmsg *fmsg, void *priv_ctx,
+ struct netlink_ext_ack *extack)
+{
+ struct fbnic_dev *fbd = devlink_health_reporter_priv(reporter);
+ u32 offset, index, index_count, length, size;
+ struct fbnic_fw_completion *fw_cmpl;
+ u8 *dump_data, **data;
+ int err;
+
+ fw_cmpl = fbnic_fw_alloc_cmpl(FBNIC_TLV_MSG_ID_COREDUMP_GET_INFO_RESP);
+ if (!fw_cmpl)
+ return -ENOMEM;
+
+ err = fbnic_fw_xmit_coredump_info_msg(fbd, fw_cmpl, true);
+ if (err) {
+ NL_SET_ERR_MSG_MOD(extack,
+ "Failed to transmit core dump info msg");
+ goto cmpl_free;
+ }
+ if (!wait_for_completion_timeout(&fw_cmpl->done, 2 * HZ)) {
+ NL_SET_ERR_MSG_MOD(extack,
+ "Timed out waiting on core dump info");
+ err = -ETIMEDOUT;
+ goto cmpl_cleanup;
+ }
+
+ size = fw_cmpl->u.coredump_info.size;
+ err = fw_cmpl->result;
+
+ fbnic_mbx_clear_cmpl(fbd, fw_cmpl);
+ fbnic_fw_put_cmpl(fw_cmpl);
+
+ /* Handle error returned by firmware */
+ if (err) {
+ NL_SET_ERR_MSG_MOD(extack, "Firmware core dump returned error");
+ return err;
+ }
+ if (!size) {
+ NL_SET_ERR_MSG_MOD(extack,
+ "Firmware core dump returned size 0");
+ return -EIO;
+ }
+
+ /* Read the dump, we can only transfer TLV_MAX_DATA at a time */
+ index_count = DIV_ROUND_UP(size, TLV_MAX_DATA);
+
+ fw_cmpl = __fbnic_fw_alloc_cmpl(FBNIC_TLV_MSG_ID_COREDUMP_READ_RESP,
+ sizeof(void *) * index_count + size);
+ if (!fw_cmpl)
+ return -ENOMEM;
+
+ /* Populate pointer table w/ pointer offsets */
+ dump_data = (void *)&fw_cmpl->u.coredump.data[index_count];
+ data = fw_cmpl->u.coredump.data;
+ fw_cmpl->u.coredump.size = size;
+ fw_cmpl->u.coredump.stride = TLV_MAX_DATA;
+
+ for (index = 0; index < index_count; index++) {
+ /* First iteration installs completion */
+ struct fbnic_fw_completion *cmpl_arg = index ? NULL : fw_cmpl;
+
+ offset = index * TLV_MAX_DATA;
+ length = min(size - offset, TLV_MAX_DATA);
+
+ data[index] = dump_data + offset;
+ err = fbnic_fw_xmit_coredump_read_msg(fbd, cmpl_arg,
+ offset, length);
+ if (err) {
+ NL_SET_ERR_MSG_MOD(extack,
+ "Failed to transmit core dump msg");
+ if (cmpl_arg)
+ goto cmpl_free;
+ else
+ goto cmpl_cleanup;
+ }
+
+ if (wait_for_completion_timeout(&fw_cmpl->done, 2 * HZ)) {
+ reinit_completion(&fw_cmpl->done);
+ } else {
+ NL_SET_ERR_MSG_FMT_MOD(extack,
+ "Timed out waiting on core dump (%d/%d)",
+ index + 1, index_count);
+ err = -ETIMEDOUT;
+ goto cmpl_cleanup;
+ }
+
+ /* If we didn't see the reply record as incomplete */
+ if (fw_cmpl->u.coredump.data[index]) {
+ NL_SET_ERR_MSG_FMT_MOD(extack,
+ "No data for core dump chunk (%d/%d)",
+ index + 1, index_count);
+ err = -EIO;
+ goto cmpl_cleanup;
+ }
+ }
+
+ devlink_fmsg_binary_pair_nest_start(fmsg, "FW coredump");
+
+ for (offset = 0; offset < size; offset += length) {
+ length = min_t(u32, size - offset, TLV_MAX_DATA);
+
+ devlink_fmsg_binary_put(fmsg, dump_data + offset, length);
+ }
+
+ devlink_fmsg_binary_pair_nest_end(fmsg);
+
+cmpl_cleanup:
+ fbnic_mbx_clear_cmpl(fbd, fw_cmpl);
+cmpl_free:
+ fbnic_fw_put_cmpl(fw_cmpl);
+
+ return err;
+}
+
+static int
+fbnic_fw_reporter_diagnose(struct devlink_health_reporter *reporter,
+ struct devlink_fmsg *fmsg,
+ struct netlink_ext_ack *extack)
+{
+ struct fbnic_dev *fbd = devlink_health_reporter_priv(reporter);
+ u32 sec, msec;
+
+ /* Device is most likely down, we're not exchanging heartbeats */
+ if (!fbd->prev_firmware_time)
+ return 0;
+
+ sec = div_u64_rem(fbd->firmware_time, MSEC_PER_SEC, &msec);
+
+ devlink_fmsg_pair_nest_start(fmsg, "last_heartbeat");
+ devlink_fmsg_obj_nest_start(fmsg);
+ devlink_fmsg_pair_nest_start(fmsg, "fw_uptime");
+ devlink_fmsg_obj_nest_start(fmsg);
+ devlink_fmsg_u32_pair_put(fmsg, "sec", sec);
+ devlink_fmsg_u32_pair_put(fmsg, "msec", msec);
+ devlink_fmsg_obj_nest_end(fmsg);
+ devlink_fmsg_pair_nest_end(fmsg);
+ devlink_fmsg_obj_nest_end(fmsg);
+ devlink_fmsg_pair_nest_end(fmsg);
+
+ return 0;
+}
+
+void __printf(2, 3)
+fbnic_devlink_fw_report(struct fbnic_dev *fbd, const char *format, ...)
+{
+ char msg[FBNIC_FW_LOG_MAX_SIZE];
+ va_list args;
+
+ va_start(args, format);
+ vsnprintf(msg, FBNIC_FW_LOG_MAX_SIZE, format, args);
+ va_end(args);
+
+ devlink_health_report(fbd->fw_reporter, msg, fbd);
+ if (fbnic_fw_log_ready(fbd))
+ fbnic_fw_log_write(fbd, 0, fbd->firmware_time, msg);
+}
+
+static const struct devlink_health_reporter_ops fbnic_fw_ops = {
+ .name = "fw",
+ .dump = fbnic_fw_reporter_dump,
+ .diagnose = fbnic_fw_reporter_diagnose,
+};
+
+static u32 fbnic_read_otp_status(struct fbnic_dev *fbd)
+{
+ return fbnic_fw_rd32(fbd, FBNIC_NS_OTP_STATUS);
+}
+
+static int
+fbnic_otp_reporter_dump(struct devlink_health_reporter *reporter,
+ struct devlink_fmsg *fmsg, void *priv_ctx,
+ struct netlink_ext_ack *extack)
+{
+ struct fbnic_dev *fbd = devlink_health_reporter_priv(reporter);
+ u32 otp_status, otp_write_status, m;
+
+ otp_status = fbnic_read_otp_status(fbd);
+ otp_write_status = fbnic_fw_rd32(fbd, FBNIC_NS_OTP_WRITE_STATUS);
+
+ /* Dump OTP status */
+ devlink_fmsg_pair_nest_start(fmsg, "OTP");
+ devlink_fmsg_obj_nest_start(fmsg);
+
+ devlink_fmsg_u32_pair_put(fmsg, "Status", otp_status);
+
+ /* Extract OTP Write Data status */
+ m = FBNIC_NS_OTP_WRITE_DATA_STATUS_MASK;
+ devlink_fmsg_u32_pair_put(fmsg, "Data",
+ FIELD_GET(m, otp_write_status));
+
+ /* Extract OTP Write ECC status */
+ m = FBNIC_NS_OTP_WRITE_ECC_STATUS_MASK;
+ devlink_fmsg_u32_pair_put(fmsg, "ECC",
+ FIELD_GET(m, otp_write_status));
+
+ devlink_fmsg_obj_nest_end(fmsg);
+ devlink_fmsg_pair_nest_end(fmsg);
+
+ return 0;
+}
+
+void fbnic_devlink_otp_check(struct fbnic_dev *fbd, const char *msg)
+{
+ /* Check if there is anything to report */
+ if (!fbnic_read_otp_status(fbd))
+ return;
+
+ devlink_health_report(fbd->otp_reporter, msg, fbd);
+ if (fbnic_fw_log_ready(fbd))
+ fbnic_fw_log_write(fbd, 0, fbd->firmware_time, msg);
+}
+
+static const struct devlink_health_reporter_ops fbnic_otp_ops = {
+ .name = "otp",
+ .dump = fbnic_otp_reporter_dump,
+};
+
+int fbnic_devlink_health_create(struct fbnic_dev *fbd)
+{
+ fbd->fw_reporter = devlink_health_reporter_create(priv_to_devlink(fbd),
+ &fbnic_fw_ops, fbd);
+ if (IS_ERR(fbd->fw_reporter)) {
+ dev_warn(fbd->dev,
+ "Failed to create FW fault reporter: %pe\n",
+ fbd->fw_reporter);
+ return PTR_ERR(fbd->fw_reporter);
+ }
+
+ fbd->otp_reporter = devlink_health_reporter_create(priv_to_devlink(fbd),
+ &fbnic_otp_ops, fbd);
+ if (IS_ERR(fbd->otp_reporter)) {
+ devlink_health_reporter_destroy(fbd->fw_reporter);
+ dev_warn(fbd->dev,
+ "Failed to create OTP fault reporter: %pe\n",
+ fbd->otp_reporter);
+ return PTR_ERR(fbd->otp_reporter);
+ }
+
+ return 0;
+}
+
+void fbnic_devlink_health_destroy(struct fbnic_dev *fbd)
+{
+ devlink_health_reporter_destroy(fbd->otp_reporter);
+ devlink_health_reporter_destroy(fbd->fw_reporter);
+}
+
void fbnic_devlink_free(struct fbnic_dev *fbd)
{
struct devlink *devlink = priv_to_devlink(fbd);
diff --git a/drivers/net/ethernet/meta/fbnic/fbnic_fw.c b/drivers/net/ethernet/meta/fbnic/fbnic_fw.c
index 6e580654493c..6c3e7f81a2ed 100644
--- a/drivers/net/ethernet/meta/fbnic/fbnic_fw.c
+++ b/drivers/net/ethernet/meta/fbnic/fbnic_fw.c
@@ -495,6 +495,11 @@ int fbnic_fw_xmit_ownership_msg(struct fbnic_dev *fbd, bool take_ownership)
fbd->last_heartbeat_request = req_time;
+ /* Set prev_firmware_time to 0 to avoid triggering firmware crash
+ * detection until we receive the second uptime in a heartbeat resp.
+ */
+ fbd->prev_firmware_time = 0;
+
/* Set heartbeat detection based on if we are taking ownership */
fbd->fw_heartbeat_enabled = take_ownership;
@@ -660,6 +665,7 @@ static int fbnic_fw_parse_cap_resp(void *opaque, struct fbnic_tlv_msg **results)
}
static const struct fbnic_tlv_index fbnic_ownership_resp_index[] = {
+ FBNIC_TLV_ATTR_U64(FBNIC_FW_OWNERSHIP_TIME),
FBNIC_TLV_ATTR_LAST
};
@@ -671,10 +677,14 @@ static int fbnic_fw_parse_ownership_resp(void *opaque,
/* Count the ownership response as a heartbeat reply */
fbd->last_heartbeat_response = jiffies;
+ /* Capture firmware time for logging and firmware crash check */
+ fbd->firmware_time = fta_get_uint(results, FBNIC_FW_OWNERSHIP_TIME);
+
return 0;
}
static const struct fbnic_tlv_index fbnic_heartbeat_resp_index[] = {
+ FBNIC_TLV_ATTR_U64(FBNIC_FW_HEARTBEAT_UPTIME),
FBNIC_TLV_ATTR_LAST
};
@@ -685,6 +695,9 @@ static int fbnic_fw_parse_heartbeat_resp(void *opaque,
fbd->last_heartbeat_response = jiffies;
+ /* Capture firmware time for logging and firmware crash check */
+ fbd->firmware_time = fta_get_uint(results, FBNIC_FW_HEARTBEAT_UPTIME);
+
return 0;
}
@@ -706,6 +719,7 @@ static int fbnic_fw_xmit_heartbeat_message(struct fbnic_dev *fbd)
goto free_message;
fbd->last_heartbeat_request = req_time;
+ fbd->prev_firmware_time = fbd->firmware_time;
return err;
@@ -766,7 +780,8 @@ void fbnic_fw_check_heartbeat(struct fbnic_dev *fbd)
return;
/* Was the last heartbeat response long time ago? */
- if (!fbnic_fw_heartbeat_current(fbd)) {
+ if (!fbnic_fw_heartbeat_current(fbd) ||
+ fbd->firmware_time < fbd->prev_firmware_time) {
dev_warn(fbd->dev,
"Firmware did not respond to heartbeat message\n");
fbd->fw_heartbeat_enabled = false;
@@ -778,6 +793,215 @@ void fbnic_fw_check_heartbeat(struct fbnic_dev *fbd)
dev_warn(fbd->dev, "Failed to send heartbeat message\n");
}
+/**
+ * fbnic_fw_xmit_coredump_info_msg - Create and transmit a coredump info message
+ * @fbd: FBNIC device structure
+ * @cmpl_data: Structure to store info in
+ * @force: Force coredump event if one hasn't already occurred
+ *
+ * Return: zero on success, negative errno on failure
+ *
+ * Asks the FW for info related to coredump. If a coredump doesn't exist it
+ * can optionally force one if force is true.
+ */
+int fbnic_fw_xmit_coredump_info_msg(struct fbnic_dev *fbd,
+ struct fbnic_fw_completion *cmpl_data,
+ bool force)
+{
+ struct fbnic_tlv_msg *msg;
+ int err = 0;
+
+ msg = fbnic_tlv_msg_alloc(FBNIC_TLV_MSG_ID_COREDUMP_GET_INFO_REQ);
+ if (!msg)
+ return -ENOMEM;
+
+ if (force) {
+ err = fbnic_tlv_attr_put_flag(msg, FBNIC_FW_COREDUMP_REQ_INFO_CREATE);
+ if (err)
+ goto free_msg;
+ }
+
+ err = fbnic_mbx_map_req_w_cmpl(fbd, msg, cmpl_data);
+ if (err)
+ goto free_msg;
+
+ return 0;
+
+free_msg:
+ free_page((unsigned long)msg);
+ return err;
+}
+
+static const struct fbnic_tlv_index fbnic_coredump_info_resp_index[] = {
+ FBNIC_TLV_ATTR_FLAG(FBNIC_FW_COREDUMP_INFO_AVAILABLE),
+ FBNIC_TLV_ATTR_U32(FBNIC_FW_COREDUMP_INFO_SIZE),
+ FBNIC_TLV_ATTR_S32(FBNIC_FW_COREDUMP_INFO_ERROR),
+ FBNIC_TLV_ATTR_LAST
+};
+
+static int
+fbnic_fw_parse_coredump_info_resp(void *opaque, struct fbnic_tlv_msg **results)
+{
+ struct fbnic_fw_completion *cmpl_data;
+ struct fbnic_dev *fbd = opaque;
+ u32 msg_type;
+ s32 err;
+
+ /* Verify we have a completion pointer to provide with data */
+ msg_type = FBNIC_TLV_MSG_ID_COREDUMP_GET_INFO_RESP;
+ cmpl_data = fbnic_fw_get_cmpl_by_type(fbd, msg_type);
+ if (!cmpl_data)
+ return -ENOSPC;
+
+ err = fta_get_sint(results, FBNIC_FW_COREDUMP_INFO_ERROR);
+ if (err)
+ goto msg_err;
+
+ if (!results[FBNIC_FW_COREDUMP_INFO_AVAILABLE]) {
+ err = -ENOENT;
+ goto msg_err;
+ }
+
+ cmpl_data->u.coredump_info.size =
+ fta_get_uint(results, FBNIC_FW_COREDUMP_INFO_SIZE);
+
+msg_err:
+ cmpl_data->result = err;
+ complete(&cmpl_data->done);
+ fbnic_fw_put_cmpl(cmpl_data);
+
+ return err;
+}
+
+/**
+ * fbnic_fw_xmit_coredump_read_msg - Create and transmit a coredump read request
+ * @fbd: FBNIC device structure
+ * @cmpl_data: Completion struct to store coredump
+ * @offset: Offset into coredump requested
+ * @length: Length of section of coredump to fetch
+ *
+ * Return: zero on success, negative errno on failure
+ *
+ * Asks the firmware to provide a section of the coredump back in a message.
+ * The response will have an offset and size matching the values provided.
+ */
+int fbnic_fw_xmit_coredump_read_msg(struct fbnic_dev *fbd,
+ struct fbnic_fw_completion *cmpl_data,
+ u32 offset, u32 length)
+{
+ struct fbnic_tlv_msg *msg;
+ int err = 0;
+
+ msg = fbnic_tlv_msg_alloc(FBNIC_TLV_MSG_ID_COREDUMP_READ_REQ);
+ if (!msg)
+ return -ENOMEM;
+
+ if (offset) {
+ err = fbnic_tlv_attr_put_int(msg, FBNIC_FW_COREDUMP_READ_OFFSET,
+ offset);
+ if (err)
+ goto free_message;
+ }
+
+ if (length) {
+ err = fbnic_tlv_attr_put_int(msg, FBNIC_FW_COREDUMP_READ_LENGTH,
+ length);
+ if (err)
+ goto free_message;
+ }
+
+ err = fbnic_mbx_map_req_w_cmpl(fbd, msg, cmpl_data);
+ if (err)
+ goto free_message;
+
+ return 0;
+
+free_message:
+ free_page((unsigned long)msg);
+ return err;
+}
+
+static const struct fbnic_tlv_index fbnic_coredump_resp_index[] = {
+ FBNIC_TLV_ATTR_U32(FBNIC_FW_COREDUMP_READ_OFFSET),
+ FBNIC_TLV_ATTR_U32(FBNIC_FW_COREDUMP_READ_LENGTH),
+ FBNIC_TLV_ATTR_RAW_DATA(FBNIC_FW_COREDUMP_READ_DATA),
+ FBNIC_TLV_ATTR_S32(FBNIC_FW_COREDUMP_READ_ERROR),
+ FBNIC_TLV_ATTR_LAST
+};
+
+static int fbnic_fw_parse_coredump_resp(void *opaque,
+ struct fbnic_tlv_msg **results)
+{
+ struct fbnic_fw_completion *cmpl_data;
+ u32 index, last_offset, last_length;
+ struct fbnic_dev *fbd = opaque;
+ struct fbnic_tlv_msg *data_hdr;
+ u32 length, offset;
+ u32 msg_type;
+ s32 err;
+
+ /* Verify we have a completion pointer to provide with data */
+ msg_type = FBNIC_TLV_MSG_ID_COREDUMP_READ_RESP;
+ cmpl_data = fbnic_fw_get_cmpl_by_type(fbd, msg_type);
+ if (!cmpl_data)
+ return -ENOSPC;
+
+ err = fta_get_sint(results, FBNIC_FW_COREDUMP_READ_ERROR);
+ if (err)
+ goto msg_err;
+
+ data_hdr = results[FBNIC_FW_COREDUMP_READ_DATA];
+ if (!data_hdr) {
+ err = -ENODATA;
+ goto msg_err;
+ }
+
+ offset = fta_get_uint(results, FBNIC_FW_COREDUMP_READ_OFFSET);
+ length = fta_get_uint(results, FBNIC_FW_COREDUMP_READ_LENGTH);
+
+ if (length > le16_to_cpu(data_hdr->hdr.len) - sizeof(u32)) {
+ dev_err(fbd->dev, "length greater than size of message\n");
+ err = -EINVAL;
+ goto msg_err;
+ }
+
+ /* Only the last offset can have a length != stride */
+ last_length =
+ (cmpl_data->u.coredump.size % cmpl_data->u.coredump.stride) ? :
+ cmpl_data->u.coredump.stride;
+ last_offset = cmpl_data->u.coredump.size - last_length;
+
+ /* Verify offset and length */
+ if (offset % cmpl_data->u.coredump.stride || offset > last_offset) {
+ dev_err(fbd->dev, "offset %d out of range\n", offset);
+ err = -EINVAL;
+ } else if (length != ((offset == last_offset) ?
+ last_length : cmpl_data->u.coredump.stride)) {
+ dev_err(fbd->dev, "length %d out of range for offset %d\n",
+ length, offset);
+ err = -EINVAL;
+ }
+ if (err)
+ goto msg_err;
+
+ /* If data pointer is NULL it is already filled, just skip the copy */
+ index = offset / cmpl_data->u.coredump.stride;
+ if (!cmpl_data->u.coredump.data[index])
+ goto msg_err;
+
+ /* Copy data and mark index filled by setting pointer to NULL */
+ memcpy(cmpl_data->u.coredump.data[index],
+ fbnic_tlv_attr_get_value_ptr(data_hdr), length);
+ cmpl_data->u.coredump.data[index] = NULL;
+
+msg_err:
+ cmpl_data->result = err;
+ complete(&cmpl_data->done);
+ fbnic_fw_put_cmpl(cmpl_data);
+
+ return err;
+}
+
int fbnic_fw_xmit_fw_start_upgrade(struct fbnic_dev *fbd,
struct fbnic_fw_completion *cmpl_data,
unsigned int id, unsigned int len)
@@ -1207,6 +1431,11 @@ static const struct fbnic_tlv_parser fbnic_fw_tlv_parser[] = {
fbnic_fw_parse_ownership_resp),
FBNIC_TLV_PARSER(HEARTBEAT_RESP, fbnic_heartbeat_resp_index,
fbnic_fw_parse_heartbeat_resp),
+ FBNIC_TLV_PARSER(COREDUMP_GET_INFO_RESP,
+ fbnic_coredump_info_resp_index,
+ fbnic_fw_parse_coredump_info_resp),
+ FBNIC_TLV_PARSER(COREDUMP_READ_RESP, fbnic_coredump_resp_index,
+ fbnic_fw_parse_coredump_resp),
FBNIC_TLV_PARSER(FW_START_UPGRADE_RESP,
fbnic_fw_start_upgrade_resp_index,
fbnic_fw_parse_fw_start_upgrade_resp),
@@ -1529,11 +1758,12 @@ void fbnic_get_fw_ver_commit_str(struct fbnic_dev *fbd, char *fw_version,
fw_version, str_sz);
}
-struct fbnic_fw_completion *fbnic_fw_alloc_cmpl(u32 msg_type)
+struct fbnic_fw_completion *__fbnic_fw_alloc_cmpl(u32 msg_type,
+ size_t priv_size)
{
struct fbnic_fw_completion *cmpl;
- cmpl = kzalloc(sizeof(*cmpl), GFP_KERNEL);
+ cmpl = kzalloc(sizeof(*cmpl) + priv_size, GFP_KERNEL);
if (!cmpl)
return NULL;
@@ -1544,6 +1774,11 @@ struct fbnic_fw_completion *fbnic_fw_alloc_cmpl(u32 msg_type)
return cmpl;
}
+struct fbnic_fw_completion *fbnic_fw_alloc_cmpl(u32 msg_type)
+{
+ return __fbnic_fw_alloc_cmpl(msg_type, 0);
+}
+
void fbnic_fw_put_cmpl(struct fbnic_fw_completion *fw_cmpl)
{
kref_put(&fw_cmpl->ref_count, fbnic_fw_release_cmpl_data);
diff --git a/drivers/net/ethernet/meta/fbnic/fbnic_fw.h b/drivers/net/ethernet/meta/fbnic/fbnic_fw.h
index ec67b80809b0..d776be9fc7f7 100644
--- a/drivers/net/ethernet/meta/fbnic/fbnic_fw.h
+++ b/drivers/net/ethernet/meta/fbnic/fbnic_fw.h
@@ -67,6 +67,14 @@ struct fbnic_fw_completion {
int result;
union {
struct {
+ u32 size;
+ } coredump_info;
+ struct {
+ u32 size;
+ u16 stride;
+ u8 *data[];
+ } coredump;
+ struct {
u32 offset;
u32 length;
} fw_update;
@@ -89,6 +97,12 @@ void fbnic_mbx_flush_tx(struct fbnic_dev *fbd);
int fbnic_fw_xmit_ownership_msg(struct fbnic_dev *fbd, bool take_ownership);
int fbnic_fw_init_heartbeat(struct fbnic_dev *fbd, bool poll);
void fbnic_fw_check_heartbeat(struct fbnic_dev *fbd);
+int fbnic_fw_xmit_coredump_info_msg(struct fbnic_dev *fbd,
+ struct fbnic_fw_completion *cmpl_data,
+ bool force);
+int fbnic_fw_xmit_coredump_read_msg(struct fbnic_dev *fbd,
+ struct fbnic_fw_completion *cmpl_data,
+ u32 offset, u32 length);
int fbnic_fw_xmit_fw_start_upgrade(struct fbnic_dev *fbd,
struct fbnic_fw_completion *cmpl_data,
unsigned int id, unsigned int len);
@@ -100,6 +114,8 @@ int fbnic_fw_xmit_tsene_read_msg(struct fbnic_dev *fbd,
int fbnic_fw_xmit_send_logs(struct fbnic_dev *fbd, bool enable,
bool send_log_history);
int fbnic_fw_xmit_rpc_macda_sync(struct fbnic_dev *fbd);
+struct fbnic_fw_completion *__fbnic_fw_alloc_cmpl(u32 msg_type,
+ size_t priv_size);
struct fbnic_fw_completion *fbnic_fw_alloc_cmpl(u32 msg_type);
void fbnic_fw_put_cmpl(struct fbnic_fw_completion *cmpl_data);
@@ -135,6 +151,10 @@ enum {
FBNIC_TLV_MSG_ID_OWNERSHIP_RESP = 0x13,
FBNIC_TLV_MSG_ID_HEARTBEAT_REQ = 0x14,
FBNIC_TLV_MSG_ID_HEARTBEAT_RESP = 0x15,
+ FBNIC_TLV_MSG_ID_COREDUMP_GET_INFO_REQ = 0x18,
+ FBNIC_TLV_MSG_ID_COREDUMP_GET_INFO_RESP = 0x19,
+ FBNIC_TLV_MSG_ID_COREDUMP_READ_REQ = 0x20,
+ FBNIC_TLV_MSG_ID_COREDUMP_READ_RESP = 0x21,
FBNIC_TLV_MSG_ID_FW_START_UPGRADE_REQ = 0x22,
FBNIC_TLV_MSG_ID_FW_START_UPGRADE_RESP = 0x23,
FBNIC_TLV_MSG_ID_FW_WRITE_CHUNK_REQ = 0x24,
@@ -198,10 +218,37 @@ enum {
enum {
FBNIC_FW_OWNERSHIP_FLAG = 0x0,
+ FBNIC_FW_OWNERSHIP_TIME = 0x1,
FBNIC_FW_OWNERSHIP_MSG_MAX
};
enum {
+ FBNIC_FW_HEARTBEAT_UPTIME = 0x0,
+ FBNIC_FW_HEARTBEAT_NUMBER_OF_MESSAGES = 0x1,
+ FBNIC_FW_HEARTBEAT_MSG_MAX
+};
+
+enum {
+ FBNIC_FW_COREDUMP_REQ_INFO_CREATE = 0x0,
+ FBNIC_FW_COREDUMP_REQ_INFO_MSG_MAX
+};
+
+enum {
+ FBNIC_FW_COREDUMP_INFO_AVAILABLE = 0x0,
+ FBNIC_FW_COREDUMP_INFO_SIZE = 0x1,
+ FBNIC_FW_COREDUMP_INFO_ERROR = 0x2,
+ FBNIC_FW_COREDUMP_INFO_MSG_MAX
+};
+
+enum {
+ FBNIC_FW_COREDUMP_READ_OFFSET = 0x0,
+ FBNIC_FW_COREDUMP_READ_LENGTH = 0x1,
+ FBNIC_FW_COREDUMP_READ_DATA = 0x2,
+ FBNIC_FW_COREDUMP_READ_ERROR = 0x3,
+ FBNIC_FW_COREDUMP_READ_MSG_MAX
+};
+
+enum {
FBNIC_FW_START_UPGRADE_ERROR = 0x0,
FBNIC_FW_START_UPGRADE_SECTION = 0x1,
FBNIC_FW_START_UPGRADE_IMAGE_LENGTH = 0x2,
diff --git a/drivers/net/ethernet/meta/fbnic/fbnic_fw_log.c b/drivers/net/ethernet/meta/fbnic/fbnic_fw_log.c
index c1663f042245..85a883dba385 100644
--- a/drivers/net/ethernet/meta/fbnic/fbnic_fw_log.c
+++ b/drivers/net/ethernet/meta/fbnic/fbnic_fw_log.c
@@ -72,7 +72,7 @@ void fbnic_fw_log_free(struct fbnic_dev *fbd)
}
int fbnic_fw_log_write(struct fbnic_dev *fbd, u64 index, u32 timestamp,
- char *msg)
+ const char *msg)
{
struct fbnic_fw_log_entry *entry, *head, *tail, *next;
struct fbnic_fw_log *log = &fbd->fw_log;
diff --git a/drivers/net/ethernet/meta/fbnic/fbnic_fw_log.h b/drivers/net/ethernet/meta/fbnic/fbnic_fw_log.h
index cb6555f40a24..50ec79003108 100644
--- a/drivers/net/ethernet/meta/fbnic/fbnic_fw_log.h
+++ b/drivers/net/ethernet/meta/fbnic/fbnic_fw_log.h
@@ -41,5 +41,5 @@ void fbnic_fw_log_disable(struct fbnic_dev *fbd);
int fbnic_fw_log_init(struct fbnic_dev *fbd);
void fbnic_fw_log_free(struct fbnic_dev *fbd);
int fbnic_fw_log_write(struct fbnic_dev *fbd, u64 index, u32 timestamp,
- char *msg);
+ const char *msg);
#endif /* _FBNIC_FW_LOG_H_ */
diff --git a/drivers/net/ethernet/meta/fbnic/fbnic_pci.c b/drivers/net/ethernet/meta/fbnic/fbnic_pci.c
index 9fdc8f4f36cc..a7a6b4db8016 100644
--- a/drivers/net/ethernet/meta/fbnic/fbnic_pci.c
+++ b/drivers/net/ethernet/meta/fbnic/fbnic_pci.c
@@ -167,6 +167,20 @@ void fbnic_down(struct fbnic_net *fbn)
fbnic_flush(fbn);
}
+static int fbnic_fw_config_after_crash(struct fbnic_dev *fbd)
+{
+ if (fbnic_fw_xmit_ownership_msg(fbd, true)) {
+ dev_err(fbd->dev, "NIC failed to take ownership\n");
+
+ return -1;
+ }
+
+ fbnic_rpc_reset_valid_entries(fbd);
+ __fbnic_set_rx_mode(fbd);
+
+ return 0;
+}
+
static void fbnic_health_check(struct fbnic_dev *fbd)
{
struct fbnic_fw_mbx *tx_mbx = &fbd->mbx[FBNIC_IPC_MBX_TX_IDX];
@@ -182,13 +196,11 @@ static void fbnic_health_check(struct fbnic_dev *fbd)
if (tx_mbx->head != tx_mbx->tail)
return;
- /* TBD: Need to add a more thorough recovery here.
- * Specifically I need to verify what all the firmware will have
- * changed since we had setup and it rebooted. May just need to
- * perform a down/up. For now we will just reclaim ownership so
- * the heartbeat can catch the next fault.
- */
- fbnic_fw_xmit_ownership_msg(fbd, true);
+ fbnic_devlink_fw_report(fbd, "Firmware crashed detected!");
+ fbnic_devlink_otp_check(fbd, "error detected after firmware recovery");
+
+ if (fbnic_fw_config_after_crash(fbd))
+ dev_err(fbd->dev, "Firmware recovery failed after crash\n");
}
static void fbnic_service_task(struct work_struct *work)
@@ -269,6 +281,10 @@ static int fbnic_probe(struct pci_dev *pdev, const struct pci_device_id *ent)
return -ENOMEM;
}
+ err = fbnic_devlink_health_create(fbd);
+ if (err)
+ goto free_fbd;
+
/* Populate driver with hardware-specific info and handlers */
fbd->max_num_queues = info->max_num_queues;
@@ -279,7 +295,7 @@ static int fbnic_probe(struct pci_dev *pdev, const struct pci_device_id *ent)
err = fbnic_alloc_irqs(fbd);
if (err)
- goto free_fbd;
+ goto err_destroy_health;
err = fbnic_mac_init(fbd);
if (err) {
@@ -306,6 +322,7 @@ static int fbnic_probe(struct pci_dev *pdev, const struct pci_device_id *ent)
err);
fbnic_devlink_register(fbd);
+ fbnic_devlink_otp_check(fbd, "error detected during probe");
fbnic_dbg_fbd_init(fbd);
/* Capture snapshot of hardware stats so netdev can calculate delta */
@@ -348,6 +365,8 @@ init_failure_mode:
return 0;
free_irqs:
fbnic_free_irqs(fbd);
+err_destroy_health:
+ fbnic_devlink_health_destroy(fbd);
free_fbd:
fbnic_devlink_free(fbd);
@@ -382,6 +401,7 @@ static void fbnic_remove(struct pci_dev *pdev)
fbnic_fw_free_mbx(fbd);
fbnic_free_irqs(fbd);
+ fbnic_devlink_health_destroy(fbd);
fbnic_devlink_free(fbd);
}
@@ -456,6 +476,9 @@ static int __fbnic_pm_resume(struct device *dev)
*/
fbnic_fw_log_enable(fbd, list_empty(&fbd->fw_log.entries));
+ /* Since the FW should be up, check if it reported OTP errors */
+ fbnic_devlink_otp_check(fbd, "error detected after PM resume");
+
/* No netdev means there isn't a network interface to bring up */
if (fbnic_init_failure(fbd))
return 0;
diff --git a/drivers/net/ethernet/meta/fbnic/fbnic_rpc.c b/drivers/net/ethernet/meta/fbnic/fbnic_rpc.c
index 4284b3cb7fcc..7f31e890031c 100644
--- a/drivers/net/ethernet/meta/fbnic/fbnic_rpc.c
+++ b/drivers/net/ethernet/meta/fbnic/fbnic_rpc.c
@@ -596,6 +596,21 @@ static void fbnic_clear_macda(struct fbnic_dev *fbd)
}
}
+static void fbnic_clear_valid_macda(struct fbnic_dev *fbd)
+{
+ int idx;
+
+ for (idx = ARRAY_SIZE(fbd->mac_addr); idx--;) {
+ struct fbnic_mac_addr *mac_addr = &fbd->mac_addr[idx];
+
+ if (mac_addr->state == FBNIC_TCAM_S_VALID) {
+ fbnic_clear_macda_entry(fbd, idx);
+
+ mac_addr->state = FBNIC_TCAM_S_UPDATE;
+ }
+ }
+}
+
static void fbnic_write_macda_entry(struct fbnic_dev *fbd, unsigned int idx,
struct fbnic_mac_addr *mac_addr)
{
@@ -1124,13 +1139,25 @@ void fbnic_write_ip_addr(struct fbnic_dev *fbd)
}
}
-void fbnic_clear_rules(struct fbnic_dev *fbd)
+static void fbnic_clear_valid_act_tcam(struct fbnic_dev *fbd)
{
- u32 dest = FIELD_PREP(FBNIC_RPC_ACT_TBL0_DEST_MASK,
- FBNIC_RPC_ACT_TBL0_DEST_BMC);
int i = FBNIC_RPC_TCAM_ACT_NUM_ENTRIES - 1;
struct fbnic_act_tcam *act_tcam;
+ /* Work from the bottom up deleting all other rules from hardware */
+ do {
+ act_tcam = &fbd->act_tcam[i];
+
+ if (act_tcam->state != FBNIC_TCAM_S_VALID)
+ continue;
+
+ fbnic_clear_act_tcam(fbd, i);
+ act_tcam->state = FBNIC_TCAM_S_UPDATE;
+ } while (i--);
+}
+
+void fbnic_clear_rules(struct fbnic_dev *fbd)
+{
/* Clear MAC rules */
fbnic_clear_macda(fbd);
@@ -1145,6 +1172,11 @@ void fbnic_clear_rules(struct fbnic_dev *fbd)
* the interface back up.
*/
if (fbnic_bmc_present(fbd)) {
+ u32 dest = FIELD_PREP(FBNIC_RPC_ACT_TBL0_DEST_MASK,
+ FBNIC_RPC_ACT_TBL0_DEST_BMC);
+ int i = FBNIC_RPC_TCAM_ACT_NUM_ENTRIES - 1;
+ struct fbnic_act_tcam *act_tcam;
+
act_tcam = &fbd->act_tcam[i];
if (act_tcam->state == FBNIC_TCAM_S_VALID &&
@@ -1153,21 +1185,10 @@ void fbnic_clear_rules(struct fbnic_dev *fbd)
wr32(fbd, FBNIC_RPC_ACT_TBL1(i), 0);
act_tcam->state = FBNIC_TCAM_S_UPDATE;
-
- i--;
}
}
- /* Work from the bottom up deleting all other rules from hardware */
- do {
- act_tcam = &fbd->act_tcam[i];
-
- if (act_tcam->state != FBNIC_TCAM_S_VALID)
- continue;
-
- fbnic_clear_act_tcam(fbd, i);
- act_tcam->state = FBNIC_TCAM_S_UPDATE;
- } while (i--);
+ fbnic_clear_valid_act_tcam(fbd);
}
static void fbnic_delete_act_tcam(struct fbnic_dev *fbd, unsigned int idx)
@@ -1217,3 +1238,9 @@ void fbnic_write_rules(struct fbnic_dev *fbd)
fbnic_update_act_tcam(fbd, i);
}
}
+
+void fbnic_rpc_reset_valid_entries(struct fbnic_dev *fbd)
+{
+ fbnic_clear_valid_act_tcam(fbd);
+ fbnic_clear_valid_macda(fbd);
+}