diff options
author | Guchun Chen <guchun.chen@amd.com> | 2020-07-23 16:05:00 +0800 |
---|---|---|
committer | Alex Deucher <alexander.deucher@amd.com> | 2020-08-04 17:26:46 -0400 |
commit | 9c06f91ff2349da67651cddf88507aa967b58b08 (patch) | |
tree | 5279e89fd62d4cdd436d20fa1b72b330e52e28ba /drivers | |
parent | 35cd2cdadbccf08e9b4d5a8fca4914dc37ab48e0 (diff) |
drm/amdgpu: schedule ras recovery when reaching bad page threshold(v2)
Once the bad page saved to eeprom reaches the configured
threshold, ras recovery will be issued to notify user.
v2: Fix spelling typo.
Signed-off-by: Guchun Chen <guchun.chen@amd.com>
Reviewed-by: Hawking Zhang <Hawking.Zhang@amd.com>
Signed-off-by: Alex Deucher <alexander.deucher@amd.com>
Diffstat (limited to 'drivers')
-rw-r--r-- | drivers/gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.c | 37 |
1 files changed, 36 insertions, 1 deletions
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.c index da8b35a5b41c..461dfd22bc1c 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.c @@ -394,8 +394,10 @@ int amdgpu_ras_eeprom_process_recods(struct amdgpu_ras_eeprom_control *control, int i, ret = 0; struct i2c_msg *msgs, *msg; unsigned char *buffs, *buff; + bool sched_ras_recovery = false; struct eeprom_table_record *record; struct amdgpu_device *adev = to_amdgpu_device(control); + struct amdgpu_ras *ras = amdgpu_ras_get_context(adev); if (adev->asic_type != CHIP_VEGA20 && adev->asic_type != CHIP_ARCTURUS) return 0; @@ -413,11 +415,30 @@ int amdgpu_ras_eeprom_process_recods(struct amdgpu_ras_eeprom_control *control, goto free_buff; } + /* + * If saved bad pages number exceeds the bad page threshold for + * the whole VRAM, update table header to mark the BAD GPU tag + * and schedule one ras recovery after eeprom write is done, + * this can avoid the missing for latest records. + * + * This new header will be picked up and checked in the bootup + * by ras recovery, which may break bootup process to notify + * user this GPU is in bad state and to retire such GPU for + * further check. + */ + if (write && (amdgpu_bad_page_threshold != 0) && + ((control->num_recs + num) >= ras->bad_page_cnt_threshold)) { + dev_warn(adev->dev, + "Saved bad pages(%d) reaches threshold value(%d).\n", + control->num_recs + num, ras->bad_page_cnt_threshold); + control->tbl_hdr.header = EEPROM_TABLE_HDR_BAD; + sched_ras_recovery = true; + } + /* In case of overflow just start from beginning to not lose newest records */ if (write && (control->next_addr + EEPROM_TABLE_RECORD_SIZE * num > EEPROM_SIZE_BYTES)) control->next_addr = EEPROM_RECORD_START; - /* * TODO Currently makes EEPROM writes for each record, this creates * internal fragmentation. Optimized the code to do full page write of @@ -493,6 +514,20 @@ int amdgpu_ras_eeprom_process_recods(struct amdgpu_ras_eeprom_control *control, __update_tbl_checksum(control, records, num, old_hdr_byte_sum); __update_table_header(control, buffs); + + if (sched_ras_recovery) { + /* + * Before scheduling ras recovery, assert the related + * flag first, which shall bypass common bad page + * reservation execution in amdgpu_ras_reset_gpu. + */ + amdgpu_ras_get_context(adev)->flags |= + AMDGPU_RAS_FLAG_SKIP_BAD_PAGE_RESV; + + dev_warn(adev->dev, "Conduct ras recovery due to bad " + "page threshold reached.\n"); + amdgpu_ras_reset_gpu(adev); + } } else if (!__validate_tbl_checksum(control, records, num)) { DRM_WARN("EEPROM Table checksum mismatch!"); /* TODO Uncomment when EEPROM read/write is relliable */ |