summaryrefslogtreecommitdiff
path: root/drivers
diff options
context:
space:
mode:
authorGuchun Chen <guchun.chen@amd.com>2020-07-23 16:05:00 +0800
committerAlex Deucher <alexander.deucher@amd.com>2020-08-04 17:26:46 -0400
commit9c06f91ff2349da67651cddf88507aa967b58b08 (patch)
tree5279e89fd62d4cdd436d20fa1b72b330e52e28ba /drivers
parent35cd2cdadbccf08e9b4d5a8fca4914dc37ab48e0 (diff)
drm/amdgpu: schedule ras recovery when reaching bad page threshold(v2)
Once the bad page saved to eeprom reaches the configured threshold, ras recovery will be issued to notify user. v2: Fix spelling typo. Signed-off-by: Guchun Chen <guchun.chen@amd.com> Reviewed-by: Hawking Zhang <Hawking.Zhang@amd.com> Signed-off-by: Alex Deucher <alexander.deucher@amd.com>
Diffstat (limited to 'drivers')
-rw-r--r--drivers/gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.c37
1 files changed, 36 insertions, 1 deletions
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.c
index da8b35a5b41c..461dfd22bc1c 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.c
@@ -394,8 +394,10 @@ int amdgpu_ras_eeprom_process_recods(struct amdgpu_ras_eeprom_control *control,
int i, ret = 0;
struct i2c_msg *msgs, *msg;
unsigned char *buffs, *buff;
+ bool sched_ras_recovery = false;
struct eeprom_table_record *record;
struct amdgpu_device *adev = to_amdgpu_device(control);
+ struct amdgpu_ras *ras = amdgpu_ras_get_context(adev);
if (adev->asic_type != CHIP_VEGA20 && adev->asic_type != CHIP_ARCTURUS)
return 0;
@@ -413,11 +415,30 @@ int amdgpu_ras_eeprom_process_recods(struct amdgpu_ras_eeprom_control *control,
goto free_buff;
}
+ /*
+ * If saved bad pages number exceeds the bad page threshold for
+ * the whole VRAM, update table header to mark the BAD GPU tag
+ * and schedule one ras recovery after eeprom write is done,
+ * this can avoid the missing for latest records.
+ *
+ * This new header will be picked up and checked in the bootup
+ * by ras recovery, which may break bootup process to notify
+ * user this GPU is in bad state and to retire such GPU for
+ * further check.
+ */
+ if (write && (amdgpu_bad_page_threshold != 0) &&
+ ((control->num_recs + num) >= ras->bad_page_cnt_threshold)) {
+ dev_warn(adev->dev,
+ "Saved bad pages(%d) reaches threshold value(%d).\n",
+ control->num_recs + num, ras->bad_page_cnt_threshold);
+ control->tbl_hdr.header = EEPROM_TABLE_HDR_BAD;
+ sched_ras_recovery = true;
+ }
+
/* In case of overflow just start from beginning to not lose newest records */
if (write && (control->next_addr + EEPROM_TABLE_RECORD_SIZE * num > EEPROM_SIZE_BYTES))
control->next_addr = EEPROM_RECORD_START;
-
/*
* TODO Currently makes EEPROM writes for each record, this creates
* internal fragmentation. Optimized the code to do full page write of
@@ -493,6 +514,20 @@ int amdgpu_ras_eeprom_process_recods(struct amdgpu_ras_eeprom_control *control,
__update_tbl_checksum(control, records, num, old_hdr_byte_sum);
__update_table_header(control, buffs);
+
+ if (sched_ras_recovery) {
+ /*
+ * Before scheduling ras recovery, assert the related
+ * flag first, which shall bypass common bad page
+ * reservation execution in amdgpu_ras_reset_gpu.
+ */
+ amdgpu_ras_get_context(adev)->flags |=
+ AMDGPU_RAS_FLAG_SKIP_BAD_PAGE_RESV;
+
+ dev_warn(adev->dev, "Conduct ras recovery due to bad "
+ "page threshold reached.\n");
+ amdgpu_ras_reset_gpu(adev);
+ }
} else if (!__validate_tbl_checksum(control, records, num)) {
DRM_WARN("EEPROM Table checksum mismatch!");
/* TODO Uncomment when EEPROM read/write is relliable */