summary | refs | log | tree | commit | diff
path: root/drivers/net/ethernet
diff options
context:
space:
mode:
author: Vadim Pasternak <vadimp@nvidia.com> 2021-01-08 16:52:09 +0200
committer: Jakub Kicinski <kuba@kernel.org> 2021-01-09 16:25:10 -0800
commit: 57726ebe2733891c9f59105eff028735f73d05fb (patch)
tree: fad00833e299e3aa8fdd94034529329181ee015e /drivers/net/ethernet
parent: b77413446408fdd256599daf00d5be72b5f3e7c6 (diff)
mlxsw: core: Add validation of transceiver temperature thresholds
Validate thresholds to avoid a single failure due to some transceiver unreliability. Ignore the last readouts in case warning temperature is above alarm temperature, since it can cause unexpected thermal shutdown. Stay with the previous values and refresh threshold within the next iteration.

This is a rare scenario, but it was observed at a customer site.

Fixes: 6a79507cfe94 ("mlxsw: core: Extend thermal module with per QSFP module thermal zones")
Signed-off-by: Vadim Pasternak <vadimp@nvidia.com>
Reviewed-by: Jiri Pirko <jiri@nvidia.com>
Signed-off-by: Ido Schimmel <idosch@nvidia.com>
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
Diffstat (limited to 'drivers/net/ethernet')
-rw-r--r-- drivers/net/ethernet/mellanox/mlxsw/core_thermal.c 11
1 files changed, 7 insertions, 4 deletions
diff --git a/drivers/net/ethernet/mellanox/mlxsw/core_thermal.c b/drivers/net/ethernet/mellanox/mlxsw/core_thermal.c
index 8fa286ccdd6b..250a85049697 100644
--- a/drivers/net/ethernet/mellanox/mlxsw/core_thermal.c
+++ b/drivers/net/ethernet/mellanox/mlxsw/core_thermal.c
@@ -176,6 +176,12 @@ mlxsw_thermal_module_trips_update(struct device *dev, struct mlxsw_core *core,
if (err)
return err;
+ if (crit_temp > emerg_temp) {
+ dev_warn(dev, "%s : Critical threshold %d is above emergency threshold %d\n",
+ tz->tzdev->type, crit_temp, emerg_temp);
+ return 0;
+ }
+
/* According to the system thermal requirements, the thermal zones are
* defined with four trip points. The critical and emergency
* temperature thresholds, provided by QSFP module are set as "active"
@@ -190,11 +196,8 @@ mlxsw_thermal_module_trips_update(struct device *dev, struct mlxsw_core *core,
tz->trips[MLXSW_THERMAL_TEMP_TRIP_NORM].temp = crit_temp;
tz->trips[MLXSW_THERMAL_TEMP_TRIP_HIGH].temp = crit_temp;
tz->trips[MLXSW_THERMAL_TEMP_TRIP_HOT].temp = emerg_temp;
- if (emerg_temp > crit_temp)
- tz->trips[MLXSW_THERMAL_TEMP_TRIP_CRIT].temp = emerg_temp +
+ tz->trips[MLXSW_THERMAL_TEMP_TRIP_CRIT].temp = emerg_temp +
MLXSW_THERMAL_MODULE_TEMP_SHIFT;
- else
- tz->trips[MLXSW_THERMAL_TEMP_TRIP_CRIT].temp = emerg_temp;
return 0;
}