From cb464ba53c0cb497dcb4a3daaf4fad4b75291863 Mon Sep 17 00:00:00 2001 From: Aya Levin Date: Mon, 11 Oct 2021 13:14:28 +0300 Subject: net/mlx5: Extend health buffer dump Enhance health buffer to include: - assert_var5: expose the 6'th assert variable. - time: error's time-stamp in seconds (epoch time). - rfr: Recovery Flow Requiered. When set, indicates that the error cannot be recovered without flow involving reset. - severity: error's severity value, ranging from emergency to debug. Expose them in the health buffer dump (dmesg and devlink fw reporter). Health buffer in dmesg: mlx5_core 0000:08:00.0: print_health_info:425:(pid 912): Health issue observed, firmware internal error, severity(3) ERROR: mlx5_core 0000:08:00.0: print_health_info:429:(pid 912): assert_var[0] 0x08040700 mlx5_core 0000:08:00.0: print_health_info:429:(pid 912): assert_var[1] 0x00000000 mlx5_core 0000:08:00.0: print_health_info:429:(pid 912): assert_var[2] 0x00000000 mlx5_core 0000:08:00.0: print_health_info:429:(pid 912): assert_var[3] 0x00000000 mlx5_core 0000:08:00.0: print_health_info:429:(pid 912): assert_var[4] 0x00000000 mlx5_core 0000:08:00.0: print_health_info:429:(pid 912): assert_var[5] 0x00000000 mlx5_core 0000:08:00.0: print_health_info:432:(pid 912): assert_exit_ptr 0x00aaf800 mlx5_core 0000:08:00.0: print_health_info:434:(pid 912): assert_callra 0x00aaf70c mlx5_core 0000:08:00.0: print_health_info:436:(pid 912): fw_ver 16.32.492 mlx5_core 0000:08:00.0: print_health_info:437:(pid 912): time 1634819758 mlx5_core 0000:08:00.0: print_health_info:438:(pid 912): hw_id 0x0000020d mlx5_core 0000:08:00.0: print_health_info:439:(pid 912): rfr 0 mlx5_core 0000:08:00.0: print_health_info:440:(pid 912): severity 3 (ERROR) mlx5_core 0000:08:00.0: print_health_info:441:(pid 912): irisc_index 9 mlx5_core 0000:08:00.0: print_health_info:442:(pid 912): synd 0x1: firmware internal error mlx5_core 0000:08:00.0: print_health_info:444:(pid 912): ext_synd 0x802b mlx5_core 0000:08:00.0: print_health_info:445:(pid 912): raw fw_ver 0x102001ec Signed-off-by: Aya Levin Reviewed-by: Moshe Shemesh Signed-off-by: Saeed Mahameed --- include/linux/mlx5/device.h | 14 ++++++++------ include/linux/mlx5/mlx5_ifc.h | 10 ++++++++-- 2 files changed, 16 insertions(+), 8 deletions(-) (limited to 'include/linux/mlx5') diff --git a/include/linux/mlx5/device.h b/include/linux/mlx5/device.h index 347167c18802..f8a0bbb42c3b 100644 --- a/include/linux/mlx5/device.h +++ b/include/linux/mlx5/device.h @@ -541,19 +541,21 @@ struct mlx5_cmd_layout { u8 status_own; }; -enum mlx5_fatal_assert_bit_offsets { - MLX5_RFR_OFFSET = 31, +enum mlx5_rfr_severity_bit_offsets { + MLX5_RFR_BIT_OFFSET = 0x7, }; struct health_buffer { - __be32 assert_var[5]; - __be32 rsvd0[3]; + __be32 assert_var[6]; + __be32 rsvd0[2]; __be32 assert_exit_ptr; __be32 assert_callra; - __be32 rsvd1[2]; + __be32 rsvd1[1]; + __be32 time; __be32 fw_ver; __be32 hw_id; - __be32 rfr; + u8 rfr_severity; + u8 rsvd2[3]; u8 irisc_index; u8 synd; __be16 ext_synd; diff --git a/include/linux/mlx5/mlx5_ifc.h b/include/linux/mlx5/mlx5_ifc.h index 09e43019d877..6d292b5b8992 100644 --- a/include/linux/mlx5/mlx5_ifc.h +++ b/include/linux/mlx5/mlx5_ifc.h @@ -4149,13 +4149,19 @@ struct mlx5_ifc_health_buffer_bits { u8 assert_callra[0x20]; - u8 reserved_at_140[0x40]; + u8 reserved_at_140[0x20]; + + u8 time[0x20]; u8 fw_version[0x20]; u8 hw_id[0x20]; - u8 reserved_at_1c0[0x20]; + u8 rfr[0x1]; + u8 reserved_at_1c1[0x3]; + u8 valid[0x1]; + u8 severity[0x3]; + u8 reserved_at_1c8[0x18]; u8 irisc_index[0x8]; u8 synd[0x8]; -- cgit v1.2.3 From 5a1023deeed02a2078bcc11eec1c4be31e85892d Mon Sep 17 00:00:00 2001 From: Aya Levin Date: Wed, 13 Oct 2021 09:45:22 +0300 Subject: net/mlx5: Add periodic update of host time to firmware Firmware logs its asserts also to non-volatile memory. In order to reduce drift between the NIC and the host, the driver sets the host epoch-time to the firmware every hour. Signed-off-by: Aya Levin Reviewed-by: Moshe Shemesh Signed-off-by: Saeed Mahameed --- include/linux/mlx5/driver.h | 2 ++ include/linux/mlx5/mlx5_ifc.h | 12 ++++++++++++ 2 files changed, 14 insertions(+) (limited to 'include/linux/mlx5') diff --git a/include/linux/mlx5/driver.h b/include/linux/mlx5/driver.h index 3f4c0f2314a5..f617dfbcd9fd 100644 --- a/include/linux/mlx5/driver.h +++ b/include/linux/mlx5/driver.h @@ -134,6 +134,7 @@ enum { MLX5_REG_MCIA = 0x9014, MLX5_REG_MFRL = 0x9028, MLX5_REG_MLCR = 0x902b, + MLX5_REG_MRTC = 0x902d, MLX5_REG_MTRC_CAP = 0x9040, MLX5_REG_MTRC_CONF = 0x9041, MLX5_REG_MTRC_STDB = 0x9042, @@ -440,6 +441,7 @@ struct mlx5_core_health { struct work_struct report_work; struct devlink_health_reporter *fw_reporter; struct devlink_health_reporter *fw_fatal_reporter; + struct delayed_work update_fw_log_ts_work; }; struct mlx5_qp_table { diff --git a/include/linux/mlx5/mlx5_ifc.h b/include/linux/mlx5/mlx5_ifc.h index 6d292b5b8992..746381eccccf 100644 --- a/include/linux/mlx5/mlx5_ifc.h +++ b/include/linux/mlx5/mlx5_ifc.h @@ -10358,6 +10358,17 @@ struct mlx5_ifc_pddr_reg_bits { union mlx5_ifc_pddr_reg_page_data_auto_bits page_data; }; +struct mlx5_ifc_mrtc_reg_bits { + u8 time_synced[0x1]; + u8 reserved_at_1[0x1f]; + + u8 reserved_at_20[0x20]; + + u8 time_h[0x20]; + + u8 time_l[0x20]; +}; + union mlx5_ifc_ports_control_registers_document_bits { struct mlx5_ifc_bufferx_reg_bits bufferx_reg; struct mlx5_ifc_eth_2819_cntrs_grp_data_layout_bits eth_2819_cntrs_grp_data_layout; @@ -10419,6 +10430,7 @@ union mlx5_ifc_ports_control_registers_document_bits { struct mlx5_ifc_mirc_reg_bits mirc_reg; struct mlx5_ifc_mfrl_reg_bits mfrl_reg; struct mlx5_ifc_mtutc_reg_bits mtutc_reg; + struct mlx5_ifc_mrtc_reg_bits mrtc_reg; u8 reserved_at_0[0x60e0]; }; -- cgit v1.2.3 From 46ae40b94d8826591472798114a723cc7feac7a7 Mon Sep 17 00:00:00 2001 From: Shay Drory Date: Thu, 12 Aug 2021 11:53:34 +0300 Subject: net/mlx5: Let user configure io_eq_size param Currently, each I/O EQ is taking 128KB of memory. This size is not needed in all use cases, and is critical with large scale. Hence, allow user to configure the size of I/O EQs. For example, to reduce I/O EQ size to 64, execute: $ devlink resource set pci/0000:00:0b.0 path /io_eq_size/ size 64 $ devlink dev reload pci/0000:00:0b.0 Signed-off-by: Shay Drory Reviewed-by: Moshe Shemesh Reviewed-by: Parav Pandit Signed-off-by: Saeed Mahameed --- include/linux/mlx5/driver.h | 4 ---- 1 file changed, 4 deletions(-) (limited to 'include/linux/mlx5') diff --git a/include/linux/mlx5/driver.h b/include/linux/mlx5/driver.h index f617dfbcd9fd..47c07f95bbe1 100644 --- a/include/linux/mlx5/driver.h +++ b/include/linux/mlx5/driver.h @@ -797,10 +797,6 @@ struct mlx5_db { int index; }; -enum { - MLX5_COMP_EQ_SIZE = 1024, -}; - enum { MLX5_PTYS_IB = 1 << 0, MLX5_PTYS_EN = 1 << 2, -- cgit v1.2.3 From a6cb08daa3b459e3dab1d98c67cb8c931f4d81a5 Mon Sep 17 00:00:00 2001 From: Shay Drory Date: Wed, 13 Oct 2021 09:57:54 +0300 Subject: net/mlx5: Let user configure event_eq_size param Event EQ is an EQ which received the notification of almost all the events generated by the NIC. Currently, each event EQ is taking 512KB of memory. This size is not needed in most use cases, and is critical with large scale. Hence, allow user to configure the size of the event EQ. For example to reduce event EQ size to 64, execute:: $ devlink resource set pci/0000:00:0b.0 path /event_eq_size/ size 64 $ devlink dev reload pci/0000:00:0b.0 Signed-off-by: Shay Drory Reviewed-by: Moshe Shemesh Reviewed-by: Parav Pandit Signed-off-by: Saeed Mahameed --- include/linux/mlx5/eq.h | 1 - 1 file changed, 1 deletion(-) (limited to 'include/linux/mlx5') diff --git a/include/linux/mlx5/eq.h b/include/linux/mlx5/eq.h index ea3ff5a8ced3..11161e427608 100644 --- a/include/linux/mlx5/eq.h +++ b/include/linux/mlx5/eq.h @@ -5,7 +5,6 @@ #define MLX5_CORE_EQ_H #define MLX5_NUM_CMD_EQE (32) -#define MLX5_NUM_ASYNC_EQE (0x1000) #define MLX5_NUM_SPARE_EQE (0x80) struct mlx5_eq; -- cgit v1.2.3 From 554604061979d656bbbec50f101526349cd5aa5f Mon Sep 17 00:00:00 2001 From: Shay Drory Date: Mon, 16 Aug 2021 08:41:08 +0300 Subject: net/mlx5: Let user configure max_macs param Currently, max_macs is taking 70Kbytes of memory per function. This size is not needed in all use cases, and is critical with large scale. Hence, allow user to configure the number of max_macs. For example, to reduce the number of max_macs to 1, execute:: $ devlink dev param set pci/0000:00:0b.0 name max_macs value 1 \ cmode driverinit $ devlink dev reload pci/0000:00:0b.0 Signed-off-by: Shay Drory Reviewed-by: Moshe Shemesh Reviewed-by: Parav Pandit Signed-off-by: Saeed Mahameed --- include/linux/mlx5/mlx5_ifc.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux/mlx5') diff --git a/include/linux/mlx5/mlx5_ifc.h b/include/linux/mlx5/mlx5_ifc.h index 746381eccccf..97465d00de9d 100644 --- a/include/linux/mlx5/mlx5_ifc.h +++ b/include/linux/mlx5/mlx5_ifc.h @@ -1603,7 +1603,7 @@ struct mlx5_ifc_cmd_hca_cap_bits { u8 ext_stride_num_range[0x1]; u8 roce_rw_supported[0x1]; - u8 reserved_at_3a2[0x1]; + u8 log_max_current_uc_list_wr_supported[0x1]; u8 log_max_stride_sz_rq[0x5]; u8 reserved_at_3a8[0x3]; u8 log_min_stride_sz_rq[0x5]; -- cgit v1.2.3