Compare revisions

Commits on Source (65)
Showing 439 additions and 156 deletions
......@@ -1194,9 +1194,15 @@ struct amdgpu_device {
bool debug_exp_resets;
bool debug_disable_gpu_ring_reset;
bool enforce_isolation[MAX_XCP];
/* Added this mutex for cleaner shader isolation between GFX and compute processes */
/* Protection for the following isolation structure */
struct mutex enforce_isolation_mutex;
bool enforce_isolation[MAX_XCP];
struct amdgpu_isolation {
void *owner;
struct dma_fence *spearhead;
struct amdgpu_sync active;
struct amdgpu_sync prev;
} isolation[MAX_XCP];
struct amdgpu_init_level *init_lvl;
......@@ -1482,6 +1488,9 @@ void amdgpu_device_pcie_port_wreg(struct amdgpu_device *adev,
struct dma_fence *amdgpu_device_get_gang(struct amdgpu_device *adev);
struct dma_fence *amdgpu_device_switch_gang(struct amdgpu_device *adev,
struct dma_fence *gang);
struct dma_fence *amdgpu_device_enforce_isolation(struct amdgpu_device *adev,
struct amdgpu_ring *ring,
struct amdgpu_job *job);
bool amdgpu_device_has_display_hardware(struct amdgpu_device *adev);
ssize_t amdgpu_get_soft_full_reset_mask(struct amdgpu_ring *ring);
ssize_t amdgpu_show_reset_mask(char *buf, uint32_t supported_reset);
......
......@@ -391,6 +391,7 @@ static void aca_banks_generate_cper(struct amdgpu_device *adev,
{
struct aca_bank_node *node;
struct aca_bank *bank;
int r;
if (!adev->cper.enabled)
return;
......@@ -402,11 +403,27 @@ static void aca_banks_generate_cper(struct amdgpu_device *adev,
/* UEs must be encoded into separate CPER entries */
if (type == ACA_SMU_TYPE_UE) {
struct aca_banks de_banks;
aca_banks_init(&de_banks);
list_for_each_entry(node, &banks->list, node) {
bank = &node->bank;
if (amdgpu_cper_generate_ue_record(adev, bank))
dev_warn(adev->dev, "fail to generate ue cper records\n");
if (bank->aca_err_type == ACA_ERROR_TYPE_DEFERRED) {
r = aca_banks_add_bank(&de_banks, bank);
if (r)
dev_warn(adev->dev, "fail to add de banks, ret = %d\n", r);
} else {
if (amdgpu_cper_generate_ue_record(adev, bank))
dev_warn(adev->dev, "fail to generate ue cper records\n");
}
}
if (!list_empty(&de_banks.list)) {
if (amdgpu_cper_generate_ce_records(adev, &de_banks, de_banks.nr_banks))
dev_warn(adev->dev, "fail to generate de cper records\n");
}
aca_banks_release(&de_banks);
} else {
/*
* SMU_TYPE_CE banks are combined into 1 CPER entry,
......@@ -541,6 +558,10 @@ static int __aca_get_error_data(struct amdgpu_device *adev, struct aca_handle *h
if (ret)
return ret;
/* DEs may be contained in CEs or UEs */
if (type != ACA_ERROR_TYPE_DEFERRED)
aca_log_aca_error(handle, ACA_ERROR_TYPE_DEFERRED, err_data);
return aca_log_aca_error(handle, type, err_data);
}
......
......@@ -76,11 +76,17 @@ struct ras_query_context;
#define mmSMNAID_XCD1_MCA_SMU 0x38430400 /* SMN AID XCD1 */
#define mmSMNXCD_XCD0_MCA_SMU 0x40430400 /* SMN XCD XCD0 */
#define ACA_BANK_ERR_CE_DE_DECODE(bank) \
((ACA_REG__STATUS__POISON((bank)->regs[ACA_REG_IDX_STATUS]) || \
ACA_REG__STATUS__DEFERRED((bank)->regs[ACA_REG_IDX_STATUS])) ? \
ACA_ERROR_TYPE_DEFERRED : \
ACA_ERROR_TYPE_CE)
#define ACA_BANK_ERR_IS_DEFFERED(bank) \
(ACA_REG__STATUS__POISON((bank)->regs[ACA_REG_IDX_STATUS]) || \
ACA_REG__STATUS__DEFERRED((bank)->regs[ACA_REG_IDX_STATUS]))
#define ACA_BANK_ERR_CE_DE_DECODE(bank) \
(ACA_BANK_ERR_IS_DEFFERED(bank) ? ACA_ERROR_TYPE_DEFERRED : \
ACA_ERROR_TYPE_CE)
#define ACA_BANK_ERR_UE_DE_DECODE(bank) \
(ACA_BANK_ERR_IS_DEFFERED(bank) ? ACA_ERROR_TYPE_DEFERRED : \
ACA_ERROR_TYPE_UE)
enum aca_reg_idx {
ACA_REG_IDX_CTL = 0,
......
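For reference, a minimal stand-alone sketch of the decode pattern these macros implement: a set poison or deferred status bit demotes an otherwise uncorrectable bank to a deferred error. The bit positions and names below are invented stand-ins for illustration, not the real ACA register layout or accessors.

#include <stdint.h>
#include <stdio.h>

/* Invented bit positions, for illustration only. */
#define STATUS_POISON(st)   (((st) >> 62) & 0x1ULL)
#define STATUS_DEFERRED(st) (((st) >> 61) & 0x1ULL)

enum err_type { ERR_TYPE_UE, ERR_TYPE_CE, ERR_TYPE_DEFERRED };

/* Same shape as ACA_BANK_ERR_UE_DE_DECODE(): a poisoned or deferred bank
 * is reported as a deferred error instead of an uncorrectable one. */
static enum err_type ue_de_decode(uint64_t status)
{
	return (STATUS_POISON(status) || STATUS_DEFERRED(status)) ?
	       ERR_TYPE_DEFERRED : ERR_TYPE_UE;
}

int main(void)
{
	printf("%d %d\n", ue_de_decode(0), ue_de_decode(1ULL << 62));
	return 0;
}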
......@@ -491,7 +491,7 @@ static int vm_update_pds(struct amdgpu_vm *vm, struct amdgpu_sync *sync)
if (ret)
return ret;
return amdgpu_sync_fence(sync, vm->last_update);
return amdgpu_sync_fence(sync, vm->last_update, GFP_KERNEL);
}
static uint64_t get_pte_flags(struct amdgpu_device *adev, struct kgd_mem *mem)
......@@ -1249,7 +1249,7 @@ static int unmap_bo_from_gpuvm(struct kgd_mem *mem,
(void)amdgpu_vm_clear_freed(adev, vm, &bo_va->last_pt_update);
(void)amdgpu_sync_fence(sync, bo_va->last_pt_update);
(void)amdgpu_sync_fence(sync, bo_va->last_pt_update, GFP_KERNEL);
return 0;
}
......@@ -1273,7 +1273,7 @@ static int update_gpuvm_pte(struct kgd_mem *mem,
return ret;
}
return amdgpu_sync_fence(sync, bo_va->last_pt_update);
return amdgpu_sync_fence(sync, bo_va->last_pt_update, GFP_KERNEL);
}
static int map_bo_to_gpuvm(struct kgd_mem *mem,
......@@ -2913,7 +2913,7 @@ int amdgpu_amdkfd_gpuvm_restore_process_bos(void *info, struct dma_fence __rcu *
}
dma_resv_for_each_fence(&cursor, bo->tbo.base.resv,
DMA_RESV_USAGE_KERNEL, fence) {
ret = amdgpu_sync_fence(&sync_obj, fence);
ret = amdgpu_sync_fence(&sync_obj, fence, GFP_KERNEL);
if (ret) {
pr_debug("Memory eviction: Sync BO fence failed. Try again\n");
goto validate_map_fail;
......
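The churn in this file comes from amdgpu_sync_fence() growing an allocation-context argument: paths that may sleep keep passing GFP_KERNEL, while callers that run under locks (the GFP_NOWAIT users later in this series) can avoid sleeping allocations. A hedged sketch of that API shape, using simplified stand-in types rather than the real amdgpu_sync internals:

#include <stdlib.h>

typedef unsigned int gfp_t;
#define GFP_KERNEL 0x01u /* allocation may sleep */
#define GFP_NOWAIT 0x02u /* allocation must not sleep (atomic context) */

struct fence { int seqno; };
struct sync_entry { struct fence *f; struct sync_entry *next; };
struct sync { struct sync_entry *head; };

/* The caller, not the helper, decides the allocation context. */
static int sync_add_fence(struct sync *s, struct fence *f, gfp_t gfp)
{
	/* In the kernel this would be kmalloc(sizeof(*e), gfp); here the
	 * flag only documents the caller's context. */
	struct sync_entry *e = malloc(sizeof(*e));

	(void)gfp;
	if (!e)
		return -1; /* -ENOMEM */
	e->f = f;
	e->next = s->head;
	s->head = e;
	return 0;
}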
......@@ -455,10 +455,10 @@ static u32 amdgpu_cper_ring_get_ent_sz(struct amdgpu_ring *ring, u64 pos)
return umin(rec_len, chunk);
}
void amdgpu_cper_ring_write(struct amdgpu_ring *ring,
void *src, int count)
void amdgpu_cper_ring_write(struct amdgpu_ring *ring, void *src, int count)
{
u64 pos, wptr_old, rptr = *ring->rptr_cpu_addr & ring->ptr_mask;
int rec_cnt_dw = count >> 2;
u32 chunk, ent_sz;
u8 *s = (u8 *)src;
......@@ -485,6 +485,9 @@ void amdgpu_cper_ring_write(struct amdgpu_ring *ring,
s += chunk;
}
if (ring->count_dw < rec_cnt_dw)
ring->count_dw = 0;
/* the buffer has overflowed, adjust rptr */
if (((wptr_old < rptr) && (rptr <= ring->wptr)) ||
((ring->wptr < wptr_old) && (wptr_old < rptr)) ||
......@@ -501,12 +504,10 @@ void amdgpu_cper_ring_write(struct amdgpu_ring *ring,
pos = rptr;
} while (!amdgpu_cper_is_hdr(ring, rptr));
}
mutex_unlock(&ring->adev->cper.ring_lock);
if (ring->count_dw >= (count >> 2))
ring->count_dw -= (count >> 2);
else
ring->count_dw = 0;
if (ring->count_dw >= rec_cnt_dw)
ring->count_dw -= rec_cnt_dw;
mutex_unlock(&ring->adev->cper.ring_lock);
}
static u64 amdgpu_cper_ring_get_rptr(struct amdgpu_ring *ring)
......
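The hunk above tightens the count_dw bookkeeping: the free-dword counter is clamped to zero right after the copy when the record was larger than the remaining space, and the decrement now happens before the ring lock is dropped instead of after. A small self-contained sketch of that clamp-then-decrement accounting, with plain integers standing in for the real ring state:

#include <stdio.h>

struct toy_ring {
	unsigned int size_dw;  /* total ring size in dwords */
	unsigned int count_dw; /* dwords still free */
};

/* Account for a record of rec_cnt_dw dwords; if the record is larger than
 * the remaining space the ring wrapped over old records and free space
 * drops to zero instead of underflowing. */
static void ring_account_write(struct toy_ring *ring, unsigned int rec_cnt_dw)
{
	if (ring->count_dw < rec_cnt_dw)
		ring->count_dw = 0;
	else
		ring->count_dw -= rec_cnt_dw;
}

int main(void)
{
	struct toy_ring r = { .size_dw = 64, .count_dw = 10 };

	ring_account_write(&r, 4);  /* 6 dwords left */
	ring_account_write(&r, 16); /* wrapped: 0 dwords left */
	printf("free=%u\n", r.count_dw);
	return 0;
}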
......@@ -428,7 +428,7 @@ static int amdgpu_cs_p2_dependencies(struct amdgpu_cs_parser *p,
dma_fence_put(old);
}
r = amdgpu_sync_fence(&p->sync, fence);
r = amdgpu_sync_fence(&p->sync, fence, GFP_KERNEL);
dma_fence_put(fence);
if (r)
return r;
......@@ -450,7 +450,7 @@ static int amdgpu_syncobj_lookup_and_add(struct amdgpu_cs_parser *p,
return r;
}
r = amdgpu_sync_fence(&p->sync, fence);
r = amdgpu_sync_fence(&p->sync, fence, GFP_KERNEL);
dma_fence_put(fence);
return r;
}
......@@ -1111,7 +1111,7 @@ static int amdgpu_cs_vm_handling(struct amdgpu_cs_parser *p)
struct drm_gpu_scheduler *sched = entity->rq->sched;
struct amdgpu_ring *ring = to_amdgpu_ring(sched);
if (amdgpu_vmid_uses_reserved(adev, vm, ring->vm_hub))
if (amdgpu_vmid_uses_reserved(vm, ring->vm_hub))
return -EINVAL;
}
}
......@@ -1124,7 +1124,8 @@ static int amdgpu_cs_vm_handling(struct amdgpu_cs_parser *p)
if (r)
return r;
r = amdgpu_sync_fence(&p->sync, fpriv->prt_va->last_pt_update);
r = amdgpu_sync_fence(&p->sync, fpriv->prt_va->last_pt_update,
GFP_KERNEL);
if (r)
return r;
......@@ -1135,7 +1136,8 @@ static int amdgpu_cs_vm_handling(struct amdgpu_cs_parser *p)
if (r)
return r;
r = amdgpu_sync_fence(&p->sync, bo_va->last_pt_update);
r = amdgpu_sync_fence(&p->sync, bo_va->last_pt_update,
GFP_KERNEL);
if (r)
return r;
}
......@@ -1154,7 +1156,8 @@ static int amdgpu_cs_vm_handling(struct amdgpu_cs_parser *p)
if (r)
return r;
r = amdgpu_sync_fence(&p->sync, bo_va->last_pt_update);
r = amdgpu_sync_fence(&p->sync, bo_va->last_pt_update,
GFP_KERNEL);
if (r)
return r;
}
......@@ -1167,7 +1170,7 @@ static int amdgpu_cs_vm_handling(struct amdgpu_cs_parser *p)
if (r)
return r;
r = amdgpu_sync_fence(&p->sync, vm->last_update);
r = amdgpu_sync_fence(&p->sync, vm->last_update, GFP_KERNEL);
if (r)
return r;
......@@ -1248,7 +1251,8 @@ static int amdgpu_cs_sync_rings(struct amdgpu_cs_parser *p)
continue;
}
r = amdgpu_sync_fence(&p->gang_leader->explicit_sync, fence);
r = amdgpu_sync_fence(&p->gang_leader->explicit_sync, fence,
GFP_KERNEL);
dma_fence_put(fence);
if (r)
return r;
......
......@@ -227,6 +227,24 @@ static ssize_t amdgpu_device_get_pcie_replay_count(struct device *dev,
static DEVICE_ATTR(pcie_replay_count, 0444,
amdgpu_device_get_pcie_replay_count, NULL);
static int amdgpu_device_attr_sysfs_init(struct amdgpu_device *adev)
{
int ret = 0;
if (!amdgpu_sriov_vf(adev))
ret = sysfs_create_file(&adev->dev->kobj,
&dev_attr_pcie_replay_count.attr);
return ret;
}
static void amdgpu_device_attr_sysfs_fini(struct amdgpu_device *adev)
{
if (!amdgpu_sriov_vf(adev))
sysfs_remove_file(&adev->dev->kobj,
&dev_attr_pcie_replay_count.attr);
}
static ssize_t amdgpu_sysfs_reg_state_get(struct file *f, struct kobject *kobj,
struct bin_attribute *attr, char *buf,
loff_t ppos, size_t count)
......@@ -4172,11 +4190,6 @@ static bool amdgpu_device_check_iommu_remap(struct amdgpu_device *adev)
}
#endif
static const struct attribute *amdgpu_dev_attributes[] = {
&dev_attr_pcie_replay_count.attr,
NULL
};
static void amdgpu_device_set_mcbp(struct amdgpu_device *adev)
{
if (amdgpu_mcbp == 1)
......@@ -4281,7 +4294,14 @@ int amdgpu_device_init(struct amdgpu_device *adev,
mutex_init(&adev->gfx.reset_sem_mutex);
/* Initialize the mutex for cleaner shader isolation between GFX and compute processes */
mutex_init(&adev->enforce_isolation_mutex);
for (i = 0; i < MAX_XCP; ++i) {
adev->isolation[i].spearhead = dma_fence_get_stub();
amdgpu_sync_create(&adev->isolation[i].active);
amdgpu_sync_create(&adev->isolation[i].prev);
}
mutex_init(&adev->gfx.kfd_sch_mutex);
mutex_init(&adev->gfx.workload_profile_mutex);
mutex_init(&adev->vcn.workload_profile_mutex);
amdgpu_device_init_apu_flags(adev);
......@@ -4399,10 +4419,17 @@ int amdgpu_device_init(struct amdgpu_device *adev,
if (r)
return r;
/* Get rid of things like offb */
r = aperture_remove_conflicting_pci_devices(adev->pdev, amdgpu_kms_driver.name);
if (r)
return r;
/*
* No need to remove conflicting FBs for non-display class devices.
* This prevents the sysfb from being freed accidentally.
*/
if ((pdev->class >> 8) == PCI_CLASS_DISPLAY_VGA ||
(pdev->class >> 8) == PCI_CLASS_DISPLAY_OTHER) {
/* Get rid of things like offb */
r = aperture_remove_conflicting_pci_devices(adev->pdev, amdgpu_kms_driver.name);
if (r)
return r;
}
/* Enable TMZ based on IP_VERSION */
amdgpu_gmc_tmz_set(adev);
......@@ -4613,7 +4640,7 @@ int amdgpu_device_init(struct amdgpu_device *adev,
} else
adev->ucode_sysfs_en = true;
r = sysfs_create_files(&adev->dev->kobj, amdgpu_dev_attributes);
r = amdgpu_device_attr_sysfs_init(adev);
if (r)
dev_err(adev->dev, "Could not create amdgpu device attr\n");
......@@ -4750,7 +4777,7 @@ void amdgpu_device_fini_hw(struct amdgpu_device *adev)
amdgpu_pm_sysfs_fini(adev);
if (adev->ucode_sysfs_en)
amdgpu_ucode_sysfs_fini(adev);
sysfs_remove_files(&adev->dev->kobj, amdgpu_dev_attributes);
amdgpu_device_attr_sysfs_fini(adev);
amdgpu_fru_sysfs_fini(adev);
amdgpu_reg_state_sysfs_fini(adev);
......@@ -4777,7 +4804,7 @@ void amdgpu_device_fini_hw(struct amdgpu_device *adev)
void amdgpu_device_fini_sw(struct amdgpu_device *adev)
{
int idx;
int i, idx;
bool px;
amdgpu_device_ip_fini(adev);
......@@ -4785,6 +4812,11 @@ void amdgpu_device_fini_sw(struct amdgpu_device *adev)
amdgpu_ucode_release(&adev->firmware.gpu_info_fw);
adev->accel_working = false;
dma_fence_put(rcu_dereference_protected(adev->gang_submit, true));
for (i = 0; i < MAX_XCP; ++i) {
dma_fence_put(adev->isolation[i].spearhead);
amdgpu_sync_free(&adev->isolation[i].active);
amdgpu_sync_free(&adev->isolation[i].prev);
}
amdgpu_reset_fini(adev);
......@@ -4800,6 +4832,9 @@ void amdgpu_device_fini_sw(struct amdgpu_device *adev)
kfree(adev->fru_info);
adev->fru_info = NULL;
kfree(adev->xcp_mgr);
adev->xcp_mgr = NULL;
px = amdgpu_device_supports_px(adev_to_drm(adev));
if (px || (!dev_is_removable(&adev->pdev->dev) &&
......@@ -5331,6 +5366,7 @@ static int amdgpu_device_reset_sriov(struct amdgpu_device *adev,
if (amdgpu_ip_version(adev, GC_HWIP, 0) == IP_VERSION(9, 4, 2) ||
amdgpu_ip_version(adev, GC_HWIP, 0) == IP_VERSION(9, 4, 3) ||
amdgpu_ip_version(adev, GC_HWIP, 0) == IP_VERSION(9, 4, 4) ||
amdgpu_ip_version(adev, GC_HWIP, 0) == IP_VERSION(9, 5, 0) ||
amdgpu_ip_version(adev, GC_HWIP, 0) == IP_VERSION(11, 0, 3))
amdgpu_ras_resume(adev);
......@@ -6903,22 +6939,117 @@ struct dma_fence *amdgpu_device_switch_gang(struct amdgpu_device *adev,
{
struct dma_fence *old = NULL;
dma_fence_get(gang);
do {
dma_fence_put(old);
old = amdgpu_device_get_gang(adev);
if (old == gang)
break;
if (!dma_fence_is_signaled(old))
if (!dma_fence_is_signaled(old)) {
dma_fence_put(gang);
return old;
}
} while (cmpxchg((struct dma_fence __force **)&adev->gang_submit,
old, gang) != old);
/*
* Drop it once for the exchanged reference in adev and once for the
* thread local reference acquired in amdgpu_device_get_gang().
*/
dma_fence_put(old);
dma_fence_put(old);
return NULL;
}
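The double dma_fence_put() above is easy to misread, so here is a hedged, self-contained model of the same discipline using C11 atomics: a compare-and-swap publishes a new refcounted object, and on success the caller drops both the reference that used to live in the shared slot and the thread-local reference taken for the comparison. The fence and refcount types are simplified stand-ins, and the RCU protection the real amdgpu_device_get_gang() uses for the load is ignored.

#include <stdatomic.h>
#include <stdlib.h>

struct toy_fence {
	atomic_int refcount;
	int signaled;
};

static struct toy_fence *fence_get(struct toy_fence *f)
{
	if (f)
		atomic_fetch_add(&f->refcount, 1);
	return f;
}

static void fence_put(struct toy_fence *f)
{
	if (f && atomic_fetch_sub(&f->refcount, 1) == 1)
		free(f);
}

/* Publish @gang as the new shared submission, mirroring the flow of
 * amdgpu_device_switch_gang(). Returns the still-running old fence (the
 * caller waits on it and retries) or NULL on success. */
static struct toy_fence *switch_gang(struct toy_fence *_Atomic *slot,
				     struct toy_fence *gang)
{
	struct toy_fence *old = NULL, *expected;

	fence_get(gang);
	do {
		fence_put(old);
		old = fence_get(atomic_load(slot));
		if (old == gang)
			break;
		if (old && !old->signaled) {
			fence_put(gang);
			return old;
		}
		expected = old;
	} while (!atomic_compare_exchange_strong(slot, &expected, gang));

	/* drop once for the reference that used to live in *slot and once
	 * for the local reference taken by fence_get() above */
	fence_put(old);
	fence_put(old);
	return NULL;
}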
/**
* amdgpu_device_enforce_isolation - enforce HW isolation
* @adev: the amdgpu device pointer
* @ring: the HW ring the job is supposed to run on
* @job: the job which is about to be pushed to the HW ring
*
* Makes sure that only one client at a time can use the GFX block.
* Returns: The dependency to wait on before the job can be pushed to the HW.
* The function is called multiple times until NULL is returned.
*/
struct dma_fence *amdgpu_device_enforce_isolation(struct amdgpu_device *adev,
struct amdgpu_ring *ring,
struct amdgpu_job *job)
{
struct amdgpu_isolation *isolation = &adev->isolation[ring->xcp_id];
struct drm_sched_fence *f = job->base.s_fence;
struct dma_fence *dep;
void *owner;
int r;
/*
* For now enforce isolation only for the GFX block since we only need
* the cleaner shader on those rings.
*/
if (ring->funcs->type != AMDGPU_RING_TYPE_GFX &&
ring->funcs->type != AMDGPU_RING_TYPE_COMPUTE)
return NULL;
/*
* All submissions where enforce isolation is false are handled as if
* they come from a single client. Use ~0l as the owner to distinguish it
* from kernel submissions where the owner is NULL.
*/
owner = job->enforce_isolation ? f->owner : (void *)~0l;
mutex_lock(&adev->enforce_isolation_mutex);
/*
* The "spearhead" submission is the first one which changes the
* ownership to its client. We always need to wait for it to be
* pushed to the HW before proceeding with anything.
*/
if (&f->scheduled != isolation->spearhead &&
!dma_fence_is_signaled(isolation->spearhead)) {
dep = isolation->spearhead;
goto out_grab_ref;
}
if (isolation->owner != owner) {
/*
* Wait for any gang to be assembled before switching to a
* different owner or otherwise we could deadlock the
* submissions.
*/
if (!job->gang_submit) {
dep = amdgpu_device_get_gang(adev);
if (!dma_fence_is_signaled(dep))
goto out_return_dep;
dma_fence_put(dep);
}
dma_fence_put(isolation->spearhead);
isolation->spearhead = dma_fence_get(&f->scheduled);
amdgpu_sync_move(&isolation->active, &isolation->prev);
trace_amdgpu_isolation(isolation->owner, owner);
isolation->owner = owner;
}
/*
* Specifying the ring here helps to pipeline submissions even when
* isolation is enabled. If that is not desired for testing, NULL can be
* used instead of the ring to enforce a CPU round trip while switching
* between clients.
*/
dep = amdgpu_sync_peek_fence(&isolation->prev, ring);
r = amdgpu_sync_fence(&isolation->active, &f->finished, GFP_NOWAIT);
if (r)
DRM_WARN("OOM tracking isolation\n");
out_grab_ref:
dma_fence_get(dep);
out_return_dep:
mutex_unlock(&adev->enforce_isolation_mutex);
return dep;
}
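The owner-key trick above is worth spelling out: kernel submissions carry a NULL owner, every client that did not opt into enforced isolation shares the sentinel ~0l, and only opted-in clients use their own fence owner, so an ownership hand-over (and thus a cleaner-shader run) reduces to a single pointer compare. A minimal sketch of that key selection, with simplified types rather than the real job and fence structs:

#include <stdbool.h>
#include <stdio.h>

struct toy_job {
	bool enforce_isolation;
	void *fence_owner; /* NULL for kernel-internal submissions */
};

/* Pick the isolation key exactly as the code above does. */
static void *isolation_owner(const struct toy_job *job)
{
	return job->enforce_isolation ? job->fence_owner : (void *)~0l;
}

/* A hand-over is needed only when the key changes between consecutive
 * submissions on the same partition. */
static bool needs_handover(void *current_owner, const struct toy_job *job)
{
	return current_owner != isolation_owner(job);
}

int main(void)
{
	int client_a, client_b;
	struct toy_job a = { .enforce_isolation = false, .fence_owner = &client_a };
	struct toy_job b = { .enforce_isolation = true,  .fence_owner = &client_b };
	void *owner = isolation_owner(&a);

	printf("a->b handover: %d\n", needs_handover(owner, &b)); /* 1 */
	return 0;
}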
bool amdgpu_device_has_display_hardware(struct amdgpu_device *adev)
{
switch (adev->asic_type) {
......
......@@ -113,8 +113,13 @@
#include "amdgpu_isp.h"
#endif
#define FIRMWARE_IP_DISCOVERY "amdgpu/ip_discovery.bin"
MODULE_FIRMWARE(FIRMWARE_IP_DISCOVERY);
MODULE_FIRMWARE("amdgpu/ip_discovery.bin");
MODULE_FIRMWARE("amdgpu/vega10_ip_discovery.bin");
MODULE_FIRMWARE("amdgpu/vega12_ip_discovery.bin");
MODULE_FIRMWARE("amdgpu/vega20_ip_discovery.bin");
MODULE_FIRMWARE("amdgpu/raven_ip_discovery.bin");
MODULE_FIRMWARE("amdgpu/raven2_ip_discovery.bin");
MODULE_FIRMWARE("amdgpu/picasso_ip_discovery.bin");
#define mmIP_DISCOVERY_VERSION 0x16A00
#define mmRCC_CONFIG_MEMSIZE 0xde3
......@@ -297,21 +302,13 @@ static int amdgpu_discovery_read_binary_from_mem(struct amdgpu_device *adev,
return ret;
}
static int amdgpu_discovery_read_binary_from_file(struct amdgpu_device *adev, uint8_t *binary)
static int amdgpu_discovery_read_binary_from_file(struct amdgpu_device *adev,
uint8_t *binary,
const char *fw_name)
{
const struct firmware *fw;
const char *fw_name;
int r;
switch (amdgpu_discovery) {
case 2:
fw_name = FIRMWARE_IP_DISCOVERY;
break;
default:
dev_warn(adev->dev, "amdgpu_discovery is not set properly\n");
return -EINVAL;
}
r = request_firmware(&fw, fw_name, adev->dev);
if (r) {
dev_err(adev->dev, "can't load firmware \"%s\"\n",
......@@ -404,10 +401,39 @@ static int amdgpu_discovery_verify_npsinfo(struct amdgpu_device *adev,
return 0;
}
static const char *amdgpu_discovery_get_fw_name(struct amdgpu_device *adev)
{
if (amdgpu_discovery == 2)
return "amdgpu/ip_discovery.bin";
switch (adev->asic_type) {
case CHIP_VEGA10:
return "amdgpu/vega10_ip_discovery.bin";
case CHIP_VEGA12:
return "amdgpu/vega12_ip_discovery.bin";
case CHIP_RAVEN:
if (adev->apu_flags & AMD_APU_IS_RAVEN2)
return "amdgpu/raven2_ip_discovery.bin";
else if (adev->apu_flags & AMD_APU_IS_PICASSO)
return "amdgpu/picasso_ip_discovery.bin";
else
return "amdgpu/raven_ip_discovery.bin";
case CHIP_VEGA20:
return "amdgpu/vega20_ip_discovery.bin";
case CHIP_ARCTURUS:
return "amdgpu/arcturus_ip_discovery.bin";
case CHIP_ALDEBARAN:
return "amdgpu/aldebaran_ip_discovery.bin";
default:
return NULL;
}
}
static int amdgpu_discovery_init(struct amdgpu_device *adev)
{
struct table_info *info;
struct binary_header *bhdr;
const char *fw_name;
uint16_t offset;
uint16_t size;
uint16_t checksum;
......@@ -419,9 +445,10 @@ static int amdgpu_discovery_init(struct amdgpu_device *adev)
return -ENOMEM;
/* Read from file if it is the preferred option */
if (amdgpu_discovery == 2) {
fw_name = amdgpu_discovery_get_fw_name(adev);
if (fw_name != NULL) {
dev_info(adev->dev, "use ip discovery information from file");
r = amdgpu_discovery_read_binary_from_file(adev, adev->mman.discovery_bin);
r = amdgpu_discovery_read_binary_from_file(adev, adev->mman.discovery_bin, fw_name);
if (r) {
dev_err(adev->dev, "failed to read ip discovery binary from file\n");
......@@ -1290,6 +1317,7 @@ static int amdgpu_discovery_reg_base_init(struct amdgpu_device *adev)
uint16_t die_offset;
uint16_t ip_offset;
uint16_t num_dies;
uint32_t wafl_ver;
uint16_t num_ips;
uint16_t hw_id;
uint8_t inst;
......@@ -1303,6 +1331,7 @@ static int amdgpu_discovery_reg_base_init(struct amdgpu_device *adev)
return r;
}
wafl_ver = 0;
adev->gfx.xcc_mask = 0;
adev->sdma.sdma_mask = 0;
adev->vcn.inst_mask = 0;
......@@ -1403,6 +1432,10 @@ static int amdgpu_discovery_reg_base_init(struct amdgpu_device *adev)
adev->gfx.xcc_mask |=
(1U << ip->instance_number);
if (!wafl_ver && le16_to_cpu(ip->hw_id) == WAFLC_HWID)
wafl_ver = IP_VERSION_FULL(ip->major, ip->minor,
ip->revision, 0, 0);
for (k = 0; k < num_base_address; k++) {
/*
* convert the endianness of base addresses in place,
......@@ -1468,6 +1501,9 @@ static int amdgpu_discovery_reg_base_init(struct amdgpu_device *adev)
}
}
if (wafl_ver && !adev->ip_versions[XGMI_HWIP][0])
adev->ip_versions[XGMI_HWIP][0] = wafl_ver;
return 0;
}
......@@ -2509,6 +2545,38 @@ int amdgpu_discovery_set_ip_blocks(struct amdgpu_device *adev)
{
int r;
switch (adev->asic_type) {
case CHIP_VEGA10:
case CHIP_VEGA12:
case CHIP_RAVEN:
case CHIP_VEGA20:
case CHIP_ARCTURUS:
case CHIP_ALDEBARAN:
/* This is not fatal. We have a fallback below
* if the new firmware files are not present. Some of
* this will be overridden below to keep things
* consistent with the current behavior.
*/
r = amdgpu_discovery_reg_base_init(adev);
if (!r) {
amdgpu_discovery_harvest_ip(adev);
amdgpu_discovery_get_gfx_info(adev);
amdgpu_discovery_get_mall_info(adev);
amdgpu_discovery_get_vcn_info(adev);
}
break;
default:
r = amdgpu_discovery_reg_base_init(adev);
if (r)
return -EINVAL;
amdgpu_discovery_harvest_ip(adev);
amdgpu_discovery_get_gfx_info(adev);
amdgpu_discovery_get_mall_info(adev);
amdgpu_discovery_get_vcn_info(adev);
break;
}
switch (adev->asic_type) {
case CHIP_VEGA10:
vega10_reg_base_init(adev);
......@@ -2673,14 +2741,6 @@ int amdgpu_discovery_set_ip_blocks(struct amdgpu_device *adev)
adev->ip_versions[XGMI_HWIP][0] = IP_VERSION(6, 1, 0);
break;
default:
r = amdgpu_discovery_reg_base_init(adev);
if (r)
return -EINVAL;
amdgpu_discovery_harvest_ip(adev);
amdgpu_discovery_get_gfx_info(adev);
amdgpu_discovery_get_mall_info(adev);
amdgpu_discovery_get_vcn_info(adev);
break;
}
......@@ -2772,10 +2832,6 @@ int amdgpu_discovery_set_ip_blocks(struct amdgpu_device *adev)
break;
}
if (amdgpu_ip_version(adev, GC_HWIP, 0) == IP_VERSION(9, 4, 3) ||
amdgpu_ip_version(adev, GC_HWIP, 0) == IP_VERSION(9, 4, 4))
adev->ip_versions[XGMI_HWIP][0] = IP_VERSION(6, 4, 0);
/* set NBIO version */
switch (amdgpu_ip_version(adev, NBIO_HWIP, 0)) {
case IP_VERSION(6, 1, 0):
......
......@@ -139,6 +139,7 @@ enum AMDGPU_DEBUG_MASK {
AMDGPU_DEBUG_ENABLE_RAS_ACA = BIT(4),
AMDGPU_DEBUG_ENABLE_EXP_RESETS = BIT(5),
AMDGPU_DEBUG_DISABLE_GPU_RING_RESET = BIT(6),
AMDGPU_DEBUG_SMU_POOL = BIT(7),
};
unsigned int amdgpu_vram_limit = UINT_MAX;
......@@ -176,6 +177,7 @@ uint amdgpu_sdma_phase_quantum = 32;
char *amdgpu_disable_cu;
char *amdgpu_virtual_display;
bool enforce_isolation;
int amdgpu_modeset = -1;
/* Specifies the default granularity for SVM, used in buffer
* migration and restoration of backing memory when handling
......@@ -1037,6 +1039,13 @@ module_param_named(user_partt_mode, amdgpu_user_partt_mode, uint, 0444);
module_param(enforce_isolation, bool, 0444);
MODULE_PARM_DESC(enforce_isolation, "enforce process isolation between graphics and compute. enforce_isolation = on");
/**
* DOC: modeset (int)
* Override nomodeset (1 = override, -1 = auto). The default is -1 (auto).
*/
MODULE_PARM_DESC(modeset, "Override nomodeset (1 = enable, -1 = auto)");
module_param_named(modeset, amdgpu_modeset, int, 0444);
/**
* DOC: seamless (int)
* Seamless boot will keep the image on the screen during the boot process.
......@@ -1053,6 +1062,11 @@ module_param_named(seamless, amdgpu_seamless, int, 0444);
* limits the VRAM size reported to ROCm applications to the visible
* size, usually 256MB.
* - 0x4: Disable GPU soft recovery, always do a full reset
* - 0x8: Use VRAM for firmware loading
* - 0x10: Enable ACA based RAS logging
* - 0x20: Enable experimental resets
* - 0x40: Disable ring resets
* - 0x80: Use VRAM for SMU pool
*/
MODULE_PARM_DESC(debug_mask, "debug options for amdgpu, disabled by default");
module_param_named_unsafe(debug_mask, amdgpu_debug_mask, uint, 0444);
......@@ -2230,6 +2244,10 @@ static void amdgpu_init_debug_options(struct amdgpu_device *adev)
pr_info("debug: ring reset disabled\n");
adev->debug_disable_gpu_ring_reset = true;
}
if (amdgpu_debug_mask & AMDGPU_DEBUG_SMU_POOL) {
pr_info("debug: use vram for smu pool\n");
adev->pm.smu_debug_mask |= SMU_DEBUG_POOL_USE_VRAM;
}
}
static unsigned long amdgpu_fix_asic_type(struct pci_dev *pdev, unsigned long flags)
......@@ -2257,6 +2275,12 @@ static int amdgpu_pci_probe(struct pci_dev *pdev,
int ret, retry = 0, i;
bool supports_atomic = false;
if ((pdev->class >> 8) == PCI_CLASS_DISPLAY_VGA ||
(pdev->class >> 8) == PCI_CLASS_DISPLAY_OTHER) {
if (drm_firmware_drivers_only() && amdgpu_modeset == -1)
return -EINVAL;
}
/* skip devices which are owned by radeon */
for (i = 0; i < ARRAY_SIZE(amdgpu_unsupported_pciidlist); i++) {
if (amdgpu_unsupported_pciidlist[i] == pdev->device)
......@@ -2990,9 +3014,6 @@ static int __init amdgpu_init(void)
{
int r;
if (drm_firmware_drivers_only())
return -EINVAL;
r = amdgpu_sync_init();
if (r)
goto error_sync;
......
......@@ -1665,15 +1665,8 @@ static ssize_t amdgpu_gfx_set_enforce_isolation(struct device *dev,
}
mutex_lock(&adev->enforce_isolation_mutex);
for (i = 0; i < num_partitions; i++) {
if (adev->enforce_isolation[i] && !partition_values[i])
/* Going from enabled to disabled */
amdgpu_vmid_free_reserved(adev, AMDGPU_GFXHUB(i));
else if (!adev->enforce_isolation[i] && partition_values[i])
/* Going from disabled to enabled */
amdgpu_vmid_alloc_reserved(adev, AMDGPU_GFXHUB(i));
for (i = 0; i < num_partitions; i++)
adev->enforce_isolation[i] = partition_values[i];
}
mutex_unlock(&adev->enforce_isolation_mutex);
amdgpu_mes_update_enforce_isolation(adev);
......@@ -2160,11 +2153,16 @@ void amdgpu_gfx_profile_idle_work_handler(struct work_struct *work)
for (i = 0; i < (AMDGPU_MAX_COMPUTE_RINGS * AMDGPU_MAX_GC_INSTANCES); ++i)
fences += amdgpu_fence_count_emitted(&adev->gfx.compute_ring[i]);
if (!fences && !atomic_read(&adev->gfx.total_submission_cnt)) {
r = amdgpu_dpm_switch_power_profile(adev, profile, false);
if (r)
dev_warn(adev->dev, "(%d) failed to disable %s power profile mode\n", r,
profile == PP_SMC_POWER_PROFILE_FULLSCREEN3D ?
"fullscreen 3D" : "compute");
mutex_lock(&adev->gfx.workload_profile_mutex);
if (adev->gfx.workload_profile_active) {
r = amdgpu_dpm_switch_power_profile(adev, profile, false);
if (r)
dev_warn(adev->dev, "(%d) failed to disable %s power profile mode\n", r,
profile == PP_SMC_POWER_PROFILE_FULLSCREEN3D ?
"fullscreen 3D" : "compute");
adev->gfx.workload_profile_active = false;
}
mutex_unlock(&adev->gfx.workload_profile_mutex);
} else {
schedule_delayed_work(&adev->gfx.idle_work, GFX_PROFILE_IDLE_TIMEOUT);
}
......@@ -2183,13 +2181,25 @@ void amdgpu_gfx_profile_ring_begin_use(struct amdgpu_ring *ring)
atomic_inc(&adev->gfx.total_submission_cnt);
if (!cancel_delayed_work_sync(&adev->gfx.idle_work)) {
cancel_delayed_work_sync(&adev->gfx.idle_work);
/* We can safely return early here because we've cancelled the
* delayed work, so there is no one else to set it to false
* and we don't care if someone else sets it to true.
*/
if (adev->gfx.workload_profile_active)
return;
mutex_lock(&adev->gfx.workload_profile_mutex);
if (!adev->gfx.workload_profile_active) {
r = amdgpu_dpm_switch_power_profile(adev, profile, true);
if (r)
dev_warn(adev->dev, "(%d) failed to disable %s power profile mode\n", r,
profile == PP_SMC_POWER_PROFILE_FULLSCREEN3D ?
"fullscreen 3D" : "compute");
adev->gfx.workload_profile_active = true;
}
mutex_unlock(&adev->gfx.workload_profile_mutex);
}
void amdgpu_gfx_profile_ring_end_use(struct amdgpu_ring *ring)
......
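The begin_use/idle-work pair above turns the power-profile switch into an idempotent, mutex-guarded toggle, so concurrent rings cannot flip the profile twice or race the delayed idle handler. A compact pthread sketch of that pattern; the dpm call is a stand-in and the flag handling is simplified relative to the driver:

#include <pthread.h>
#include <stdbool.h>
#include <stdio.h>

static pthread_mutex_t profile_lock = PTHREAD_MUTEX_INITIALIZER;
static bool profile_active;

static int dpm_switch_power_profile(bool enable)
{
	/* stand-in for amdgpu_dpm_switch_power_profile() */
	printf("profile %s\n", enable ? "enabled" : "disabled");
	return 0;
}

/* Called on every submission; only the first caller flips the profile. */
static void profile_begin_use(void)
{
	pthread_mutex_lock(&profile_lock);
	if (!profile_active) {
		if (!dpm_switch_power_profile(true))
			profile_active = true;
	}
	pthread_mutex_unlock(&profile_lock);
}

/* Called from delayed idle work once all fences have signalled. */
static void profile_idle(void)
{
	pthread_mutex_lock(&profile_lock);
	if (profile_active) {
		if (!dpm_switch_power_profile(false))
			profile_active = false;
	}
	pthread_mutex_unlock(&profile_lock);
}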
......@@ -482,6 +482,8 @@ struct amdgpu_gfx {
atomic_t total_submission_cnt;
struct delayed_work idle_work;
bool workload_profile_active;
struct mutex workload_profile_mutex;
};
struct amdgpu_gfx_ras_reg_entry {
......
......@@ -573,6 +573,7 @@ int amdgpu_gmc_allocate_vm_inv_eng(struct amdgpu_device *adev)
unsigned vm_inv_engs[AMDGPU_MAX_VMHUBS] = {0};
unsigned i;
unsigned vmhub, inv_eng;
struct amdgpu_ring *shared_ring;
/* init the vm inv eng for all vmhubs */
for_each_set_bit(i, adev->vmhubs_mask, AMDGPU_MAX_VMHUBS) {
......@@ -595,6 +596,10 @@ int amdgpu_gmc_allocate_vm_inv_eng(struct amdgpu_device *adev)
ring == &adev->cper.ring_buf)
continue;
/* Skip if the ring is a shared ring */
if (amdgpu_sdma_is_shared_inv_eng(adev, ring))
continue;
inv_eng = ffs(vm_inv_engs[vmhub]);
if (!inv_eng) {
dev_err(adev->dev, "no VM inv eng for ring %s\n",
......@@ -607,6 +612,21 @@ int amdgpu_gmc_allocate_vm_inv_eng(struct amdgpu_device *adev)
dev_info(adev->dev, "ring %s uses VM inv eng %u on hub %u\n",
ring->name, ring->vm_inv_eng, ring->vm_hub);
/* SDMA has a special packet which allows it to use the same
* invalidation engine for all the rings in one instance.
* Therefore, we do not allocate a separate VM invalidation engine
* for SDMA page rings. Instead, they share the VM invalidation
* engine with the SDMA gfx ring. This change ensures efficient
* resource management and avoids the issue of insufficient VM
* invalidation engines.
*/
shared_ring = amdgpu_sdma_get_shared_ring(adev, ring);
if (shared_ring) {
shared_ring->vm_inv_eng = ring->vm_inv_eng;
dev_info(adev->dev, "ring %s shares VM invalidation engine %u with ring %s on hub %u\n",
ring->name, ring->vm_inv_eng, shared_ring->name, ring->vm_hub);
continue;
}
}
return 0;
......
......@@ -209,7 +209,7 @@ static int amdgpu_vmid_grab_idle(struct amdgpu_ring *ring,
return 0;
}
fences = kmalloc_array(id_mgr->num_ids, sizeof(void *), GFP_KERNEL);
fences = kmalloc_array(id_mgr->num_ids, sizeof(void *), GFP_NOWAIT);
if (!fences)
return -ENOMEM;
......@@ -287,46 +287,34 @@ static int amdgpu_vmid_grab_reserved(struct amdgpu_vm *vm,
(*id)->flushed_updates < updates ||
!(*id)->last_flush ||
((*id)->last_flush->context != fence_context &&
!dma_fence_is_signaled((*id)->last_flush))) {
!dma_fence_is_signaled((*id)->last_flush)))
needs_flush = true;
if ((*id)->owner != vm->immediate.fence_context ||
(!adev->vm_manager.concurrent_flush && needs_flush)) {
struct dma_fence *tmp;
/* Wait for the gang to be assembled before using a
* reserved VMID or otherwise the gang could deadlock.
/* Don't use per engine and per process VMID at the
* same time
*/
tmp = amdgpu_device_get_gang(adev);
if (!dma_fence_is_signaled(tmp) && tmp != job->gang_submit) {
if (adev->vm_manager.concurrent_flush)
ring = NULL;
/* to prevent one context from being starved by another context */
(*id)->pd_gpu_addr = 0;
tmp = amdgpu_sync_peek_fence(&(*id)->active, ring);
if (tmp) {
*id = NULL;
*fence = tmp;
*fence = dma_fence_get(tmp);
return 0;
}
dma_fence_put(tmp);
/* Make sure the id is owned by the gang before proceeding */
if (!job->gang_submit ||
(*id)->owner != vm->immediate.fence_context) {
/* Don't use per engine and per process VMID at the
* same time
*/
if (adev->vm_manager.concurrent_flush)
ring = NULL;
/* to prevent one context from being starved by another context */
(*id)->pd_gpu_addr = 0;
tmp = amdgpu_sync_peek_fence(&(*id)->active, ring);
if (tmp) {
*id = NULL;
*fence = dma_fence_get(tmp);
return 0;
}
}
needs_flush = true;
}
/* Good we can use this VMID. Remember this submission as
* user of the VMID.
*/
r = amdgpu_sync_fence(&(*id)->active, &job->base.s_fence->finished);
r = amdgpu_sync_fence(&(*id)->active, &job->base.s_fence->finished,
GFP_NOWAIT);
if (r)
return r;
......@@ -385,7 +373,8 @@ static int amdgpu_vmid_grab_used(struct amdgpu_vm *vm,
* user of the VMID.
*/
r = amdgpu_sync_fence(&(*id)->active,
&job->base.s_fence->finished);
&job->base.s_fence->finished,
GFP_NOWAIT);
if (r)
return r;
......@@ -422,7 +411,7 @@ int amdgpu_vmid_grab(struct amdgpu_vm *vm, struct amdgpu_ring *ring,
if (r || !idle)
goto error;
if (amdgpu_vmid_uses_reserved(adev, vm, vmhub)) {
if (amdgpu_vmid_uses_reserved(vm, vmhub)) {
r = amdgpu_vmid_grab_reserved(vm, ring, job, &id, fence);
if (r || !id)
goto error;
......@@ -437,7 +426,8 @@ int amdgpu_vmid_grab(struct amdgpu_vm *vm, struct amdgpu_ring *ring,
/* Remember this submission as user of the VMID */
r = amdgpu_sync_fence(&id->active,
&job->base.s_fence->finished);
&job->base.s_fence->finished,
GFP_NOWAIT);
if (r)
goto error;
......@@ -474,19 +464,14 @@ int amdgpu_vmid_grab(struct amdgpu_vm *vm, struct amdgpu_ring *ring,
/*
* amdgpu_vmid_uses_reserved - check if a VM will use a reserved VMID
* @adev: amdgpu_device pointer
* @vm: the VM to check
* @vmhub: the VMHUB which will be used
*
* Returns: True if the VM will use a reserved VMID.
*/
bool amdgpu_vmid_uses_reserved(struct amdgpu_device *adev,
struct amdgpu_vm *vm, unsigned int vmhub)
bool amdgpu_vmid_uses_reserved(struct amdgpu_vm *vm, unsigned int vmhub)
{
return vm->reserved_vmid[vmhub] ||
(adev->enforce_isolation[(vm->root.bo->xcp_id != AMDGPU_XCP_NO_PARTITION) ?
vm->root.bo->xcp_id : 0] &&
AMDGPU_IS_GFXHUB(vmhub));
return vm->reserved_vmid[vmhub];
}
int amdgpu_vmid_alloc_reserved(struct amdgpu_device *adev,
......
......@@ -78,8 +78,7 @@ void amdgpu_pasid_free_delayed(struct dma_resv *resv,
bool amdgpu_vmid_had_gpu_reset(struct amdgpu_device *adev,
struct amdgpu_vmid *id);
bool amdgpu_vmid_uses_reserved(struct amdgpu_device *adev,
struct amdgpu_vm *vm, unsigned int vmhub);
bool amdgpu_vmid_uses_reserved(struct amdgpu_vm *vm, unsigned int vmhub);
int amdgpu_vmid_alloc_reserved(struct amdgpu_device *adev,
unsigned vmhub);
void amdgpu_vmid_free_reserved(struct amdgpu_device *adev,
......
......@@ -361,17 +361,24 @@ amdgpu_job_prepare_job(struct drm_sched_job *sched_job,
{
struct amdgpu_ring *ring = to_amdgpu_ring(s_entity->rq->sched);
struct amdgpu_job *job = to_amdgpu_job(sched_job);
struct dma_fence *fence = NULL;
struct dma_fence *fence;
int r;
r = drm_sched_entity_error(s_entity);
if (r)
goto error;
if (job->gang_submit)
if (job->gang_submit) {
fence = amdgpu_device_switch_gang(ring->adev, job->gang_submit);
if (fence)
return fence;
}
fence = amdgpu_device_enforce_isolation(ring->adev, ring, job);
if (fence)
return fence;
if (!fence && job->vm && !job->vmid) {
if (job->vm && !job->vmid) {
r = amdgpu_vmid_grab(job->vm, ring, job, &fence);
if (r) {
dev_err(ring->adev->dev, "Error getting VM ID (%d)\n", r);
......@@ -384,9 +391,10 @@ amdgpu_job_prepare_job(struct drm_sched_job *sched_job,
*/
if (!fence)
job->vm = NULL;
return fence;
}
return fence;
return NULL;
error:
dma_fence_set_error(&job->base.s_fence->finished, r);
......
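The reworked prepare path above follows the contract stated in the enforce-isolation kerneldoc: the callback is invoked repeatedly, hands back one outstanding dependency per call (gang switch, then isolation, then VMID), and returns NULL once the job may be pushed to the HW ring. A simplified sketch of that shape, with stand-in types rather than the real drm_sched or amdgpu structures:

#include <stddef.h>

struct toy_fence { int signaled; };

struct toy_job {
	struct toy_fence *gang_switch; /* from the gang switch */
	struct toy_fence *isolation;   /* from the isolation gate */
	struct toy_fence *vmid_ready;  /* from VMID grabbing */
};

/* Return the next unsignaled dependency, or NULL when the job can run.
 * The scheduler waits on whatever is returned and then calls again. */
static struct toy_fence *prepare_job(struct toy_job *job)
{
	if (job->gang_switch && !job->gang_switch->signaled)
		return job->gang_switch;
	if (job->isolation && !job->isolation->signaled)
		return job->isolation;
	if (job->vmid_ready && !job->vmid_ready->signaled)
		return job->vmid_ready;
	return NULL;
}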
......@@ -145,9 +145,8 @@ int amdgpu_mes_init(struct amdgpu_device *adev)
adev->mes.vmid_mask_gfxhub = 0xffffff00;
for (i = 0; i < AMDGPU_MES_MAX_COMPUTE_PIPES; i++) {
/* use only 1st MEC pipes */
if (i >= adev->gfx.mec.num_pipe_per_mec)
continue;
if (i >= (adev->gfx.mec.num_pipe_per_mec * adev->gfx.mec.num_mec))
break;
adev->mes.compute_hqd_mask[i] = 0xc;
}
......@@ -155,14 +154,9 @@ int amdgpu_mes_init(struct amdgpu_device *adev)
adev->mes.gfx_hqd_mask[i] = i ? 0 : 0xfffffffe;
for (i = 0; i < AMDGPU_MES_MAX_SDMA_PIPES; i++) {
if (amdgpu_ip_version(adev, SDMA0_HWIP, 0) <
IP_VERSION(6, 0, 0))
adev->mes.sdma_hqd_mask[i] = i ? 0 : 0x3fc;
/* zero sdma_hqd_mask for non-existent engine */
else if (adev->sdma.num_instances == 1)
adev->mes.sdma_hqd_mask[i] = i ? 0 : 0xfc;
else
adev->mes.sdma_hqd_mask[i] = 0xfc;
if (i >= adev->sdma.num_instances)
break;
adev->mes.sdma_hqd_mask[i] = 0xfc;
}
for (i = 0; i < AMDGPU_MAX_MES_PIPES; i++) {
......@@ -1336,14 +1330,14 @@ int amdgpu_mes_ctx_map_meta_data(struct amdgpu_device *adev,
DRM_ERROR("failed to do vm_bo_update on meta data\n");
goto error_del_bo_va;
}
amdgpu_sync_fence(&sync, bo_va->last_pt_update);
amdgpu_sync_fence(&sync, bo_va->last_pt_update, GFP_KERNEL);
r = amdgpu_vm_update_pdes(adev, vm, false);
if (r) {
DRM_ERROR("failed to update pdes on meta data\n");
goto error_del_bo_va;
}
amdgpu_sync_fence(&sync, vm->last_update);
amdgpu_sync_fence(&sync, vm->last_update, GFP_KERNEL);
amdgpu_sync_wait(&sync, false);
drm_exec_fini(&exec);
......
......@@ -153,6 +153,9 @@ static int psp_init_sriov_microcode(struct psp_context *psp)
adev->virt.autoload_ucode_id = AMDGPU_UCODE_ID_CP_MES1_DATA;
ret = psp_init_cap_microcode(psp, ucode_prefix);
break;
case IP_VERSION(13, 0, 12):
ret = psp_init_ta_microcode(psp, ucode_prefix);
break;
default:
return -EINVAL;
}
......@@ -1861,6 +1864,7 @@ int psp_ras_initialize(struct psp_context *psp)
if (adev->gmc.gmc_funcs->query_mem_partition_mode)
ras_cmd->ras_in_message.init_flags.nps_mode =
adev->gmc.gmc_funcs->query_mem_partition_mode(adev);
ras_cmd->ras_in_message.init_flags.active_umc_mask = adev->umc.active_mask;
ret = psp_ta_load(psp, &psp->ras_context.context);
......
......@@ -3473,6 +3473,13 @@ int amdgpu_ras_init_badpage_info(struct amdgpu_device *adev)
adev, control->bad_channel_bitmap);
con->update_channel_flag = false;
}
/* The format action is only applied to new ASICs */
if (IP_VERSION_MAJ(amdgpu_ip_version(adev, UMC_HWIP, 0)) >= 12 &&
control->tbl_hdr.version < RAS_TABLE_VER_V3)
if (!amdgpu_ras_eeprom_reset_table(control))
if (amdgpu_ras_save_bad_pages(adev, NULL))
dev_warn(adev->dev, "Failed to format RAS EEPROM data in V3 version!\n");
}
return ret;
......
......@@ -161,6 +161,7 @@ static bool __is_ras_eeprom_supported(struct amdgpu_device *adev)
case IP_VERSION(13, 0, 10):
return true;
case IP_VERSION(13, 0, 6):
case IP_VERSION(13, 0, 12):
case IP_VERSION(13, 0, 14):
return (adev->gmc.is_app_apu) ? false : true;
default:
......@@ -223,6 +224,7 @@ static bool __get_eeprom_i2c_addr(struct amdgpu_device *adev,
return true;
case IP_VERSION(13, 0, 6):
case IP_VERSION(13, 0, 10):
case IP_VERSION(13, 0, 12):
case IP_VERSION(13, 0, 14):
control->i2c_address = EEPROM_I2C_MADDR_4;
return true;
......@@ -413,9 +415,11 @@ static void amdgpu_ras_set_eeprom_table_version(struct amdgpu_ras_eeprom_control
switch (amdgpu_ip_version(adev, UMC_HWIP, 0)) {
case IP_VERSION(8, 10, 0):
case IP_VERSION(12, 0, 0):
hdr->version = RAS_TABLE_VER_V2_1;
return;
case IP_VERSION(12, 0, 0):
hdr->version = RAS_TABLE_VER_V3;
return;
default:
hdr->version = RAS_TABLE_VER_V1;
return;
......@@ -443,7 +447,7 @@ int amdgpu_ras_eeprom_reset_table(struct amdgpu_ras_eeprom_control *control)
hdr->header = RAS_TABLE_HDR_VAL;
amdgpu_ras_set_eeprom_table_version(control);
if (hdr->version == RAS_TABLE_VER_V2_1) {
if (hdr->version >= RAS_TABLE_VER_V2_1) {
hdr->first_rec_offset = RAS_RECORD_START_V2_1;
hdr->tbl_size = RAS_TABLE_HEADER_SIZE +
RAS_TABLE_V2_1_INFO_SIZE;
......@@ -461,7 +465,7 @@ int amdgpu_ras_eeprom_reset_table(struct amdgpu_ras_eeprom_control *control)
}
csum = __calc_hdr_byte_sum(control);
if (hdr->version == RAS_TABLE_VER_V2_1)
if (hdr->version >= RAS_TABLE_VER_V2_1)
csum += __calc_ras_info_byte_sum(control);
csum = -csum;
hdr->checksum = csum;
......@@ -757,7 +761,7 @@ amdgpu_ras_eeprom_update_header(struct amdgpu_ras_eeprom_control *control)
"Saved bad pages %d reaches threshold value %d\n",
control->ras_num_bad_pages, ras->bad_page_cnt_threshold);
control->tbl_hdr.header = RAS_TABLE_HDR_BAD;
if (control->tbl_hdr.version == RAS_TABLE_VER_V2_1) {
if (control->tbl_hdr.version >= RAS_TABLE_VER_V2_1) {
control->tbl_rai.rma_status = GPU_RETIRED__ECC_REACH_THRESHOLD;
control->tbl_rai.health_percent = 0;
}
......@@ -770,7 +774,7 @@ amdgpu_ras_eeprom_update_header(struct amdgpu_ras_eeprom_control *control)
amdgpu_dpm_send_rma_reason(adev);
}
if (control->tbl_hdr.version == RAS_TABLE_VER_V2_1)
if (control->tbl_hdr.version >= RAS_TABLE_VER_V2_1)
control->tbl_hdr.tbl_size = RAS_TABLE_HEADER_SIZE +
RAS_TABLE_V2_1_INFO_SIZE +
control->ras_num_recs * RAS_TABLE_RECORD_SIZE;
......@@ -810,7 +814,7 @@ amdgpu_ras_eeprom_update_header(struct amdgpu_ras_eeprom_control *control)
* now calculate gpu health percent
*/
if (amdgpu_bad_page_threshold != 0 &&
control->tbl_hdr.version == RAS_TABLE_VER_V2_1 &&
control->tbl_hdr.version >= RAS_TABLE_VER_V2_1 &&
control->ras_num_bad_pages <= ras->bad_page_cnt_threshold)
control->tbl_rai.health_percent = ((ras->bad_page_cnt_threshold -
control->ras_num_bad_pages) * 100) /
......@@ -823,7 +827,7 @@ amdgpu_ras_eeprom_update_header(struct amdgpu_ras_eeprom_control *control)
csum += *pp;
csum += __calc_hdr_byte_sum(control);
if (control->tbl_hdr.version == RAS_TABLE_VER_V2_1)
if (control->tbl_hdr.version >= RAS_TABLE_VER_V2_1)
csum += __calc_ras_info_byte_sum(control);
/* avoid sign extension when assigning to "checksum" */
csum = -csum;
......@@ -1040,7 +1044,7 @@ uint32_t amdgpu_ras_eeprom_max_record_count(struct amdgpu_ras_eeprom_control *co
/* get available eeprom table version first before eeprom table init */
amdgpu_ras_set_eeprom_table_version(control);
if (control->tbl_hdr.version == RAS_TABLE_VER_V2_1)
if (control->tbl_hdr.version >= RAS_TABLE_VER_V2_1)
return RAS_MAX_RECORD_COUNT_V2_1;
else
return RAS_MAX_RECORD_COUNT;
......@@ -1285,7 +1289,7 @@ static int __verify_ras_table_checksum(struct amdgpu_ras_eeprom_control *control
int buf_size, res;
u8 csum, *buf, *pp;
if (control->tbl_hdr.version == RAS_TABLE_VER_V2_1)
if (control->tbl_hdr.version >= RAS_TABLE_VER_V2_1)
buf_size = RAS_TABLE_HEADER_SIZE +
RAS_TABLE_V2_1_INFO_SIZE +
control->ras_num_recs * RAS_TABLE_RECORD_SIZE;
......@@ -1388,7 +1392,7 @@ int amdgpu_ras_eeprom_init(struct amdgpu_ras_eeprom_control *control)
__decode_table_header_from_buf(hdr, buf);
if (hdr->version == RAS_TABLE_VER_V2_1) {
if (hdr->version >= RAS_TABLE_VER_V2_1) {
control->ras_num_recs = RAS_NUM_RECS_V2_1(hdr);
control->ras_record_offset = RAS_RECORD_START_V2_1;
control->ras_max_record_count = RAS_MAX_RECORD_COUNT_V2_1;
......@@ -1428,7 +1432,7 @@ int amdgpu_ras_eeprom_check(struct amdgpu_ras_eeprom_control *control)
DRM_DEBUG_DRIVER("Found existing EEPROM table with %d records",
control->ras_num_bad_pages);
if (hdr->version == RAS_TABLE_VER_V2_1) {
if (hdr->version >= RAS_TABLE_VER_V2_1) {
res = __read_table_ras_info(control);
if (res)
return res;
......@@ -1448,7 +1452,7 @@ int amdgpu_ras_eeprom_check(struct amdgpu_ras_eeprom_control *control)
ras->bad_page_cnt_threshold);
} else if (hdr->header == RAS_TABLE_HDR_BAD &&
amdgpu_bad_page_threshold != 0) {
if (hdr->version == RAS_TABLE_VER_V2_1) {
if (hdr->version >= RAS_TABLE_VER_V2_1) {
res = __read_table_ras_info(control);
if (res)
return res;
......
......@@ -28,6 +28,7 @@
#define RAS_TABLE_VER_V1 0x00010000
#define RAS_TABLE_VER_V2_1 0x00021000
#define RAS_TABLE_VER_V3 0x00030000
struct amdgpu_device;
......