22 files changed, 596 insertions, 114 deletions
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu.h b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
index 22866d1c3d69..01657830b470 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
@@ -425,6 +425,8 @@ int amdgpu_fence_driver_start_ring(struct amdgpu_ring *ring,
 				   unsigned irq_type);
 int amdgpu_fence_emit(struct amdgpu_ring *ring, void *owner,
 		      struct amdgpu_fence **fence);
+int amdgpu_fence_recreate(struct amdgpu_ring *ring, void *owner,
+			  uint64_t seq, struct amdgpu_fence **fence);
 void amdgpu_fence_process(struct amdgpu_ring *ring);
 int amdgpu_fence_wait_next(struct amdgpu_ring *ring);
 int amdgpu_fence_wait_empty(struct amdgpu_ring *ring);
@@ -435,9 +437,6 @@ int amdgpu_fence_wait(struct amdgpu_fence *fence, bool interruptible);
 int amdgpu_fence_wait_any(struct amdgpu_device *adev,
 			  struct amdgpu_fence **fences,
 			  bool intr);
-long amdgpu_fence_wait_seq_timeout(struct amdgpu_device *adev,
-				   u64 *target_seq, bool intr,
-				   long timeout);
 struct amdgpu_fence *amdgpu_fence_ref(struct amdgpu_fence *fence);
 void amdgpu_fence_unref(struct amdgpu_fence **fence);
 
@@ -1622,6 +1621,7 @@ struct amdgpu_vce {
 	unsigned		fb_version;
 	atomic_t		handles[AMDGPU_MAX_VCE_HANDLES];
 	struct drm_file		*filp[AMDGPU_MAX_VCE_HANDLES];
+	uint32_t		img_size[AMDGPU_MAX_VCE_HANDLES];
 	struct delayed_work	idle_work;
 	const struct firmware	*fw;	/* VCE firmware */
 	struct amdgpu_ring	ring[AMDGPU_MAX_VCE_RINGS];
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_bo_list.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_bo_list.c
index 36d34e0afbc3..f82a2dd83874 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_bo_list.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_bo_list.c
@@ -30,6 +30,7 @@
 
 #include <drm/drmP.h>
 #include "amdgpu.h"
+#include "amdgpu_trace.h"
 
 static int amdgpu_bo_list_create(struct amdgpu_fpriv *fpriv,
 				 struct amdgpu_bo_list **result,
@@ -124,6 +125,8 @@ static int amdgpu_bo_list_set(struct amdgpu_device *adev,
 			gws_obj = entry->robj;
 		if (entry->prefered_domains == AMDGPU_GEM_DOMAIN_OA)
 			oa_obj = entry->robj;
+
+		trace_amdgpu_bo_list_set(list, entry->robj);
 	}
 
 	for (i = 0; i < list->num_entries; ++i)
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c
index f09b2cba40ca..1f040d85ac47 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c
@@ -181,8 +181,6 @@ int amdgpu_cs_parser_init(struct amdgpu_cs_parser *p, void *data)
 		}
 		p->chunks[i].chunk_id = user_chunk.chunk_id;
 		p->chunks[i].length_dw = user_chunk.length_dw;
-		if (p->chunks[i].chunk_id == AMDGPU_CHUNK_ID_IB)
-			p->num_ibs++;
 
 		size = p->chunks[i].length_dw;
 		cdata = (void __user *)(unsigned long)user_chunk.chunk_data;
@@ -199,7 +197,12 @@ int amdgpu_cs_parser_init(struct amdgpu_cs_parser *p, void *data)
 			goto out;
 		}
 
-		if (p->chunks[i].chunk_id == AMDGPU_CHUNK_ID_FENCE) {
+		switch (p->chunks[i].chunk_id) {
+		case AMDGPU_CHUNK_ID_IB:
+			p->num_ibs++;
+			break;
+
+		case AMDGPU_CHUNK_ID_FENCE:
 			size = sizeof(struct drm_amdgpu_cs_chunk_fence);
 			if (p->chunks[i].length_dw * sizeof(uint32_t) >= size) {
 				uint32_t handle;
@@ -221,6 +224,14 @@ int amdgpu_cs_parser_init(struct amdgpu_cs_parser *p, void *data)
 				r = -EINVAL;
 				goto out;
 			}
+			break;
+
+		case AMDGPU_CHUNK_ID_DEPENDENCIES:
+			break;
+
+		default:
+			r = -EINVAL;
+			goto out;
 		}
 	}
 
@@ -445,8 +456,9 @@ static void amdgpu_cs_parser_fini(struct amdgpu_cs_parser *parser, int error, bo
 	for (i = 0; i < parser->nchunks; i++)
 		drm_free_large(parser->chunks[i].kdata);
 	kfree(parser->chunks);
-	for (i = 0; i < parser->num_ibs; i++)
-		amdgpu_ib_free(parser->adev, &parser->ibs[i]);
+	if (parser->ibs)
+		for (i = 0; i < parser->num_ibs; i++)
+			amdgpu_ib_free(parser->adev, &parser->ibs[i]);
 	kfree(parser->ibs);
 	if (parser->uf.bo)
 		drm_gem_object_unreference_unlocked(&parser->uf.bo->gem_base);
@@ -654,6 +666,64 @@ static int amdgpu_cs_ib_fill(struct amdgpu_device *adev,
 	return 0;
 }
 
+static int amdgpu_cs_dependencies(struct amdgpu_device *adev,
+				  struct amdgpu_cs_parser *p)
+{
+	struct amdgpu_fpriv *fpriv = p->filp->driver_priv;
+	struct amdgpu_ib *ib;
+	int i, j, r;
+
+	if (!p->num_ibs)
+		return 0;
+
+	/* Add dependencies to first IB */
+	ib = &p->ibs[0];
+	for (i = 0; i < p->nchunks; ++i) {
+		struct drm_amdgpu_cs_chunk_dep *deps;
+		struct amdgpu_cs_chunk *chunk;
+		unsigned num_deps;
+
+		chunk = &p->chunks[i];
+
+		if (chunk->chunk_id != AMDGPU_CHUNK_ID_DEPENDENCIES)
+			continue;
+
+		deps = (struct drm_amdgpu_cs_chunk_dep *)chunk->kdata;
+		num_deps = chunk->length_dw * 4 /
+			sizeof(struct drm_amdgpu_cs_chunk_dep);
+
+		for (j = 0; j < num_deps; ++j) {
+			struct amdgpu_fence *fence;
+			struct amdgpu_ring *ring;
+			struct amdgpu_ctx *ctx;
+
+			r = amdgpu_cs_get_ring(adev, deps[j].ip_type,
+					       deps[j].ip_instance,
+					       deps[j].ring, &ring);
+			if (r)
+				return r;
+
+			ctx = amdgpu_ctx_get(fpriv, deps[j].ctx_id);
+			if (ctx == NULL)
+				return -EINVAL;
+
+			r = amdgpu_fence_recreate(ring, p->filp,
+						  deps[j].handle,
+						  &fence);
+			if (r) {
+				amdgpu_ctx_put(ctx);
+				return r;
+			}
+
+			amdgpu_sync_fence(&ib->sync, fence);
+			amdgpu_fence_unref(&fence);
+			amdgpu_ctx_put(ctx);
+		}
+	}
+
+	return 0;
+}
+
 int amdgpu_cs_ioctl(struct drm_device *dev, void *data, struct drm_file *filp)
 {
 	struct amdgpu_device *adev = dev->dev_private;
@@ -688,11 +758,16 @@ int amdgpu_cs_ioctl(struct drm_device *dev, void *data, struct drm_file *filp)
 			else
 				DRM_ERROR("Failed to process the buffer list %d!\n", r);
 		}
-	} else {
+	}
+
+	if (!r) {
 		reserved_buffers = true;
 		r = amdgpu_cs_ib_fill(adev, &parser);
 	}
 
+	if (!r)
+		r = amdgpu_cs_dependencies(adev, &parser);
+
 	if (r) {
 		amdgpu_cs_parser_fini(&parser, r, reserved_buffers);
 		up_read(&adev->exclusive_lock);
@@ -730,9 +805,9 @@ int amdgpu_cs_wait_ioctl(struct drm_device *dev, void *data,
 {
 	union drm_amdgpu_wait_cs *wait = data;
 	struct amdgpu_device *adev = dev->dev_private;
-	uint64_t seq[AMDGPU_MAX_RINGS] = {0};
-	struct amdgpu_ring *ring = NULL;
 	unsigned long timeout = amdgpu_gem_timeout(wait->in.timeout);
+	struct amdgpu_fence *fence = NULL;
+	struct amdgpu_ring *ring = NULL;
 	struct amdgpu_ctx *ctx;
 	long r;
 
@@ -742,12 +817,19 @@ int amdgpu_cs_wait_ioctl(struct drm_device *dev, void *data,
 
 	r = amdgpu_cs_get_ring(adev, wait->in.ip_type, wait->in.ip_instance,
 			       wait->in.ring, &ring);
-	if (r)
+	if (r) {
+		amdgpu_ctx_put(ctx);
 		return r;
+	}
 
-	seq[ring->idx] = wait->in.handle;
+	r = amdgpu_fence_recreate(ring, filp, wait->in.handle, &fence);
+	if (r) {
+		amdgpu_ctx_put(ctx);
+		return r;
+	}
 
-	r = amdgpu_fence_wait_seq_timeout(adev, seq, true, timeout);
+	r = fence_wait_timeout(&fence->base, true, timeout);
+	amdgpu_fence_unref(&fence);
 	amdgpu_ctx_put(ctx);
 	if (r < 0)
 		return r;
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
index fec487d1c870..d79009b65867 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
@@ -1191,7 +1191,9 @@ static int amdgpu_early_init(struct amdgpu_device *adev)
 		return -EINVAL;
 	}
 
-
+	adev->ip_block_enabled = kcalloc(adev->num_ip_blocks, sizeof(bool), GFP_KERNEL);
+	if (adev->ip_block_enabled == NULL)
+		return -ENOMEM;
 
 	if (adev->ip_blocks == NULL) {
 		DRM_ERROR("No IP blocks found!\n");
@@ -1205,10 +1207,15 @@ static int amdgpu_early_init(struct amdgpu_device *adev)
 		} else {
 			if (adev->ip_blocks[i].funcs->early_init) {
 				r = adev->ip_blocks[i].funcs->early_init((void *)adev);
-				if (r)
+				if (r == -ENOENT)
+					adev->ip_block_enabled[i] = false;
+				else if (r)
 					return r;
+				else
+					adev->ip_block_enabled[i] = true;
+			} else {
+				adev->ip_block_enabled[i] = true;
 			}
-			adev->ip_block_enabled[i] = true;
 		}
 	}
 
@@ -1575,8 +1582,7 @@ void amdgpu_device_fini(struct amdgpu_device *adev)
 	amdgpu_fence_driver_fini(adev);
 	amdgpu_fbdev_fini(adev);
 	r = amdgpu_fini(adev);
-	if (adev->ip_block_enabled)
-		kfree(adev->ip_block_enabled);
+	kfree(adev->ip_block_enabled);
 	adev->ip_block_enabled = NULL;
 	adev->accel_working = false;
 	/* free i2c buses */
@@ -2000,4 +2006,10 @@ int amdgpu_debugfs_init(struct drm_minor *minor)
 void amdgpu_debugfs_cleanup(struct drm_minor *minor)
 {
 }
+#else
+static int amdgpu_debugfs_regs_init(struct amdgpu_device *adev)
+{
+	return 0;
+}
+static void amdgpu_debugfs_regs_cleanup(struct amdgpu_device *adev) { }
 #endif
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_fence.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_fence.c
index 5c9918d01bf9..a7189a1fa6a1 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_fence.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_fence.c
@@ -136,6 +136,38 @@ int amdgpu_fence_emit(struct amdgpu_ring *ring, void *owner,
 }
 
 /**
+ * amdgpu_fence_recreate - recreate a fence from an user fence
+ *
+ * @ring: ring the fence is associated with
+ * @owner: creator of the fence
+ * @seq: user fence sequence number
+ * @fence: resulting amdgpu fence object
+ *
+ * Recreates a fence command from the user fence sequence number (all asics).
+ * Returns 0 on success, -ENOMEM on failure.
+ */
+int amdgpu_fence_recreate(struct amdgpu_ring *ring, void *owner,
+			  uint64_t seq, struct amdgpu_fence **fence)
+{
+	struct amdgpu_device *adev = ring->adev;
+
+	if (seq > ring->fence_drv.sync_seq[ring->idx])
+		return -EINVAL;
+
+	*fence = kmalloc(sizeof(struct amdgpu_fence), GFP_KERNEL);
+	if ((*fence) == NULL)
+		return -ENOMEM;
+
+	(*fence)->seq = seq;
+	(*fence)->ring = ring;
+	(*fence)->owner = owner;
+	fence_init(&(*fence)->base, &amdgpu_fence_ops,
+		&adev->fence_queue.lock, adev->fence_context + ring->idx,
+		(*fence)->seq);
+	return 0;
+}
+
+/**
  * amdgpu_fence_check_signaled - callback from fence_queue
  *
  * this function is called with fence_queue lock held, which is also used
@@ -517,12 +549,14 @@ static bool amdgpu_fence_any_seq_signaled(struct amdgpu_device *adev, u64 *seq)
  * the wait timeout, or an error for all other cases.
  * -EDEADLK is returned when a GPU lockup has been detected.
  */
-long amdgpu_fence_wait_seq_timeout(struct amdgpu_device *adev, u64 *target_seq,
-				   bool intr, long timeout)
+static long amdgpu_fence_wait_seq_timeout(struct amdgpu_device *adev,
+					  u64 *target_seq, bool intr,
+					  long timeout)
 {
 	uint64_t last_seq[AMDGPU_MAX_RINGS];
 	bool signaled;
-	int i, r;
+	int i;
+	long r;
 
 	if (timeout == 0) {
 		return amdgpu_fence_any_seq_signaled(adev, target_seq);
@@ -1023,7 +1057,7 @@ static int amdgpu_debugfs_fence_info(struct seq_file *m, void *data)
 
 		amdgpu_fence_process(ring);
 
-		seq_printf(m, "--- ring %d ---\n", i);
+		seq_printf(m, "--- ring %d (%s) ---\n", i, ring->name);
 		seq_printf(m, "Last signaled fence 0x%016llx\n",
 			   (unsigned long long)atomic64_read(&ring->fence_drv.last_seq));
 		seq_printf(m, "Last emitted        0x%016llx\n",
@@ -1031,7 +1065,8 @@ static int amdgpu_debugfs_fence_info(struct seq_file *m, void *data)
 
 		for (j = 0; j < AMDGPU_MAX_RINGS; ++j) {
 			struct amdgpu_ring *other = adev->rings[j];
-			if (i != j && other && other->fence_drv.initialized)
+			if (i != j && other && other->fence_drv.initialized &&
+			    ring->fence_drv.sync_seq[j])
 				seq_printf(m, "Last sync to ring %d 0x%016llx\n",
 					   j, ring->fence_drv.sync_seq[j]);
 		}
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_gem.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_gem.c
index 0ec222295fee..ae43b58c9733 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_gem.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_gem.c
@@ -352,7 +352,7 @@ unsigned long amdgpu_gem_timeout(uint64_t timeout_ns)
 	if (((int64_t)timeout_ns) < 0)
 		return MAX_SCHEDULE_TIMEOUT;
 
-	timeout = ktime_sub_ns(ktime_get(), timeout_ns);
+	timeout = ktime_sub(ns_to_ktime(timeout_ns), ktime_get());
 	if (ktime_to_ns(timeout) < 0)
 		return 0;
 
@@ -496,7 +496,7 @@ error_unreserve:
 error_free:
 	drm_free_large(vm_bos);
 
-	if (r)
+	if (r && r != -ERESTARTSYS)
 		DRM_ERROR("Couldn't update BO_VA (%d)\n", r);
 }
 
@@ -525,8 +525,8 @@ int amdgpu_gem_va_ioctl(struct drm_device *dev, void *data,
 		return -EINVAL;
 	}
 
-	invalid_flags = ~(AMDGPU_VM_PAGE_READABLE | AMDGPU_VM_PAGE_WRITEABLE |
-			AMDGPU_VM_PAGE_EXECUTABLE);
+	invalid_flags = ~(AMDGPU_VM_DELAY_UPDATE | AMDGPU_VM_PAGE_READABLE |
+			AMDGPU_VM_PAGE_WRITEABLE | AMDGPU_VM_PAGE_EXECUTABLE);
 	if ((args->flags & invalid_flags)) {
 		dev_err(&dev->pdev->dev, "invalid flags 0x%08X vs 0x%08X\n",
 			args->flags, invalid_flags);
@@ -579,7 +579,7 @@ int amdgpu_gem_va_ioctl(struct drm_device *dev, void *data,
 		break;
 	}
 
-	if (!r)
+	if (!r && !(args->flags & AMDGPU_VM_DELAY_UPDATE))
 		amdgpu_gem_va_update_vm(adev, bo_va);
 
 	drm_gem_object_unreference_unlocked(gobj);
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_trace.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_trace.h
index b56dd64bd4ea..961d7265c286 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_trace.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_trace.h
@@ -30,19 +30,21 @@ TRACE_EVENT(amdgpu_cs,
 	    TP_PROTO(struct amdgpu_cs_parser *p, int i),
 	    TP_ARGS(p, i),
 	    TP_STRUCT__entry(
+			     __field(struct amdgpu_bo_list *, bo_list)
 			     __field(u32, ring)
 			     __field(u32, dw)
 			     __field(u32, fences)
 			     ),
 
 	    TP_fast_assign(
+			   __entry->bo_list = p->bo_list;
 			   __entry->ring = p->ibs[i].ring->idx;
 			   __entry->dw = p->ibs[i].length_dw;
 			   __entry->fences = amdgpu_fence_count_emitted(
 				p->ibs[i].ring);
 			   ),
-	    TP_printk("ring=%u, dw=%u, fences=%u",
-		      __entry->ring, __entry->dw,
+	    TP_printk("bo_list=%p, ring=%u, dw=%u, fences=%u",
+		      __entry->bo_list, __entry->ring, __entry->dw,
 		      __entry->fences)
 );
 
@@ -61,6 +63,54 @@ TRACE_EVENT(amdgpu_vm_grab_id,
 	    TP_printk("vmid=%u, ring=%u", __entry->vmid, __entry->ring)
 );
 
+TRACE_EVENT(amdgpu_vm_bo_map,
+	    TP_PROTO(struct amdgpu_bo_va *bo_va,
+		     struct amdgpu_bo_va_mapping *mapping),
+	    TP_ARGS(bo_va, mapping),
+	    TP_STRUCT__entry(
+			     __field(struct amdgpu_bo *, bo)
+			     __field(long, start)
+			     __field(long, last)
+			     __field(u64, offset)
+			     __field(u32, flags)
+			     ),
+
+	    TP_fast_assign(
+			   __entry->bo = bo_va->bo;
+			   __entry->start = mapping->it.start;
+			   __entry->last = mapping->it.last;
+			   __entry->offset = mapping->offset;
+			   __entry->flags = mapping->flags;
+			   ),
+	    TP_printk("bo=%p, start=%lx, last=%lx, offset=%010llx, flags=%08x",
+		      __entry->bo, __entry->start, __entry->last,
+		      __entry->offset, __entry->flags)
+);
+
+TRACE_EVENT(amdgpu_vm_bo_unmap,
+	    TP_PROTO(struct amdgpu_bo_va *bo_va,
+		     struct amdgpu_bo_va_mapping *mapping),
+	    TP_ARGS(bo_va, mapping),
+	    TP_STRUCT__entry(
+			     __field(struct amdgpu_bo *, bo)
+			     __field(long, start)
+			     __field(long, last)
+			     __field(u64, offset)
+			     __field(u32, flags)
+			     ),
+
+	    TP_fast_assign(
+			   __entry->bo = bo_va->bo;
+			   __entry->start = mapping->it.start;
+			   __entry->last = mapping->it.last;
+			   __entry->offset = mapping->offset;
+			   __entry->flags = mapping->flags;
+			   ),
+	    TP_printk("bo=%p, start=%lx, last=%lx, offset=%010llx, flags=%08x",
+		      __entry->bo, __entry->start, __entry->last,
+		      __entry->offset, __entry->flags)
+);
+
 TRACE_EVENT(amdgpu_vm_bo_update,
 	    TP_PROTO(struct amdgpu_bo_va_mapping *mapping),
 	    TP_ARGS(mapping),
@@ -121,6 +171,21 @@ TRACE_EVENT(amdgpu_vm_flush,
 		      __entry->pd_addr, __entry->ring, __entry->id)
 );
 
+TRACE_EVENT(amdgpu_bo_list_set,
+	    TP_PROTO(struct amdgpu_bo_list *list, struct amdgpu_bo *bo),
+	    TP_ARGS(list, bo),
+	    TP_STRUCT__entry(
+			     __field(struct amdgpu_bo_list *, list)
+			     __field(struct amdgpu_bo *, bo)
+			     ),
+
+	    TP_fast_assign(
+			   __entry->list = list;
+			   __entry->bo = bo;
+			   ),
+	    TP_printk("list=%p, bo=%p", __entry->list, __entry->bo)
+);
+
 DECLARE_EVENT_CLASS(amdgpu_fence_request,
 
 	    TP_PROTO(struct drm_device *dev, int ring, u32 seqno),
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c
index d3706a498293..dd3415d2e45d 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c
@@ -674,7 +674,7 @@ static int amdgpu_ttm_tt_populate(struct ttm_tt *ttm)
 		return 0;
 
 	if (gtt && gtt->userptr) {
-		ttm->sg = kcalloc(1, sizeof(struct sg_table), GFP_KERNEL);
+		ttm->sg = kzalloc(sizeof(struct sg_table), GFP_KERNEL);
 		if (!ttm->sg)
 			return -ENOMEM;
 
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vce.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_vce.c
index 1127a504f118..d3ca73090e39 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vce.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vce.c
@@ -464,28 +464,42 @@ int amdgpu_vce_get_destroy_msg(struct amdgpu_ring *ring, uint32_t handle,
  * @p: parser context
  * @lo: address of lower dword
  * @hi: address of higher dword
+ * @size: minimum size
  *
  * Patch relocation inside command stream with real buffer address
  */
-int amdgpu_vce_cs_reloc(struct amdgpu_cs_parser *p, uint32_t ib_idx, int lo, int hi)
+static int amdgpu_vce_cs_reloc(struct amdgpu_cs_parser *p, uint32_t ib_idx,
+			       int lo, int hi, unsigned size, uint32_t index)
 {
 	struct amdgpu_bo_va_mapping *mapping;
 	struct amdgpu_ib *ib = &p->ibs[ib_idx];
 	struct amdgpu_bo *bo;
 	uint64_t addr;
 
+	if (index == 0xffffffff)
+		index = 0;
+
 	addr = ((uint64_t)amdgpu_get_ib_value(p, ib_idx, lo)) |
 	       ((uint64_t)amdgpu_get_ib_value(p, ib_idx, hi)) << 32;
+	addr += ((uint64_t)size) * ((uint64_t)index);
 
 	mapping = amdgpu_cs_find_mapping(p, addr, &bo);
 	if (mapping == NULL) {
-		DRM_ERROR("Can't find BO for addr 0x%010Lx %d %d\n",
+		DRM_ERROR("Can't find BO for addr 0x%010Lx %d %d %d %d\n",
+			  addr, lo, hi, size, index);
+		return -EINVAL;
+	}
+
+	if ((addr + (uint64_t)size) >
+	    ((uint64_t)mapping->it.last + 1) * AMDGPU_GPU_PAGE_SIZE) {
+		DRM_ERROR("BO to small for addr 0x%010Lx %d %d\n",
 			  addr, lo, hi);
 		return -EINVAL;
 	}
 
 	addr -= ((uint64_t)mapping->it.start) * AMDGPU_GPU_PAGE_SIZE;
 	addr += amdgpu_bo_gpu_offset(bo);
+	addr -= ((uint64_t)size) * ((uint64_t)index);
 
 	ib->ptr[lo] = addr & 0xFFFFFFFF;
 	ib->ptr[hi] = addr >> 32;
@@ -494,6 +508,48 @@ int amdgpu_vce_cs_reloc(struct amdgpu_cs_parser *p, uint32_t ib_idx, int lo, int
 }
 
 /**
+ * amdgpu_vce_validate_handle - validate stream handle
+ *
+ * @p: parser context
+ * @handle: handle to validate
+ * @allocated: allocated a new handle?
+ *
+ * Validates the handle and return the found session index or -EINVAL
+ * we we don't have another free session index.
+ */
+static int amdgpu_vce_validate_handle(struct amdgpu_cs_parser *p,
+				      uint32_t handle, bool *allocated)
+{
+	unsigned i;
+
+	*allocated = false;
+
+	/* validate the handle */
+	for (i = 0; i < AMDGPU_MAX_VCE_HANDLES; ++i) {
+		if (atomic_read(&p->adev->vce.handles[i]) == handle) {
+			if (p->adev->vce.filp[i] != p->filp) {
+				DRM_ERROR("VCE handle collision detected!\n");
+				return -EINVAL;
+			}
+			return i;
+		}
+	}
+
+	/* handle not found try to alloc a new one */
+	for (i = 0; i < AMDGPU_MAX_VCE_HANDLES; ++i) {
+		if (!atomic_cmpxchg(&p->adev->vce.handles[i], 0, handle)) {
+			p->adev->vce.filp[i] = p->filp;
+			p->adev->vce.img_size[i] = 0;
+			*allocated = true;
+			return i;
+		}
+	}
+
+	DRM_ERROR("No more free VCE handles!\n");
+	return -EINVAL;
+}
+
+/**
  * amdgpu_vce_cs_parse - parse and validate the command stream
  *
  * @p: parser context
@@ -501,10 +557,15 @@ int amdgpu_vce_cs_reloc(struct amdgpu_cs_parser *p, uint32_t ib_idx, int lo, int
  */
 int amdgpu_vce_ring_parse_cs(struct amdgpu_cs_parser *p, uint32_t ib_idx)
 {
-	uint32_t handle = 0;
-	bool destroy = false;
-	int i, r, idx = 0;
 	struct amdgpu_ib *ib = &p->ibs[ib_idx];
+	unsigned fb_idx = 0, bs_idx = 0;
+	int session_idx = -1;
+	bool destroyed = false;
+	bool created = false;
+	bool allocated = false;
+	uint32_t tmp, handle = 0;
+	uint32_t *size = &tmp;
+	int i, r = 0, idx = 0;
 
 	amdgpu_vce_note_usage(p->adev);
 
@@ -514,16 +575,44 @@ int amdgpu_vce_ring_parse_cs(struct amdgpu_cs_parser *p, uint32_t ib_idx)
 
 		if ((len < 8) || (len & 3)) {
 			DRM_ERROR("invalid VCE command length (%d)!\n", len);
-			return -EINVAL;
+			r = -EINVAL;
+			goto out;
+		}
+
+		if (destroyed) {
+			DRM_ERROR("No other command allowed after destroy!\n");
+			r = -EINVAL;
+			goto out;
 		}
 
 		switch (cmd) {
 		case 0x00000001: // session
 			handle = amdgpu_get_ib_value(p, ib_idx, idx + 2);
+			session_idx = amdgpu_vce_validate_handle(p, handle,
+								 &allocated);
+			if (session_idx < 0)
+				return session_idx;
+			size = &p->adev->vce.img_size[session_idx];
 			break;
 
 		case 0x00000002: // task info
+			fb_idx = amdgpu_get_ib_value(p, ib_idx, idx + 6);
+			bs_idx = amdgpu_get_ib_value(p, ib_idx, idx + 7);
+			break;
+
 		case 0x01000001: // create
+			created = true;
+			if (!allocated) {
+				DRM_ERROR("Handle already in use!\n");
+				r = -EINVAL;
+				goto out;
+			}
+
+			*size = amdgpu_get_ib_value(p, ib_idx, idx + 8) *
+				amdgpu_get_ib_value(p, ib_idx, idx + 10) *
+				8 * 3 / 2;
+			break;
+
 		case 0x04000001: // config extension
 		case 0x04000002: // pic control
 		case 0x04000005: // rate control
@@ -534,60 +623,74 @@ int amdgpu_vce_ring_parse_cs(struct amdgpu_cs_parser *p, uint32_t ib_idx)
 			break;
 
 		case 0x03000001: // encode
-			r = amdgpu_vce_cs_reloc(p, ib_idx, idx + 10, idx + 9);
+			r = amdgpu_vce_cs_reloc(p, ib_idx, idx + 10, idx + 9,
+						*size, 0);
 			if (r)
-				return r;
+				goto out;
 
-			r = amdgpu_vce_cs_reloc(p, ib_idx, idx + 12, idx + 11);
+			r = amdgpu_vce_cs_reloc(p, ib_idx, idx + 12, idx + 11,
+						*size / 3, 0);
 			if (r)
-				return r;
+				goto out;
 			break;
 
 		case 0x02000001: // destroy
-			destroy = true;
+			destroyed = true;
 			break;
 
 		case 0x05000001: // context buffer
+			r = amdgpu_vce_cs_reloc(p, ib_idx, idx + 3, idx + 2,
+						*size * 2, 0);
+			if (r)
+				goto out;
+			break;
+
 		case 0x05000004: // video bitstream buffer
+			tmp = amdgpu_get_ib_value(p, ib_idx, idx + 4);
+			r = amdgpu_vce_cs_reloc(p, ib_idx, idx + 3, idx + 2,
+						tmp, bs_idx);
+			if (r)
+				goto out;
+			break;
+
 		case 0x05000005: // feedback buffer
-			r = amdgpu_vce_cs_reloc(p, ib_idx, idx + 3, idx + 2);
+			r = amdgpu_vce_cs_reloc(p, ib_idx, idx + 3, idx + 2,
+						4096, fb_idx);
 			if (r)
-				return r;
+				goto out;
 			break;
 
 		default:
 			DRM_ERROR("invalid VCE command (0x%x)!\n", cmd);
-			return -EINVAL;
+			r = -EINVAL;
+			goto out;
 		}
 
-		idx += len / 4;
-	}
-
-	if (destroy) {
-		/* IB contains a destroy msg, free the handle */
-		for (i = 0; i < AMDGPU_MAX_VCE_HANDLES; ++i)
-			atomic_cmpxchg(&p->adev->vce.handles[i], handle, 0);
+		if (session_idx == -1) {
+			DRM_ERROR("no session command at start of IB\n");
+			r = -EINVAL;
+			goto out;
+		}
 
-		return 0;
+		idx += len / 4;
 	}
 
-	/* create or encode, validate the handle */
-	for (i = 0; i < AMDGPU_MAX_VCE_HANDLES; ++i) {
-		if (atomic_read(&p->adev->vce.handles[i]) == handle)
-			return 0;
+	if (allocated && !created) {
+		DRM_ERROR("New session without create command!\n");
+		r = -ENOENT;
 	}
 
-	/* handle not found try to alloc a new one */
-	for (i = 0; i < AMDGPU_MAX_VCE_HANDLES; ++i) {
-		if (!atomic_cmpxchg(&p->adev->vce.handles[i], 0, handle)) {
-			p->adev->vce.filp[i] = p->filp;
-			return 0;
-		}
+out:
+	if ((!r && destroyed) || (r && allocated)) {
+		/*
+		 * IB contains a destroy msg or we have allocated an
+		 * handle and got an error, anyway free the handle
+		 */
+		for (i = 0; i < AMDGPU_MAX_VCE_HANDLES; ++i)
+			atomic_cmpxchg(&p->adev->vce.handles[i], handle, 0);
 	}
 
-	DRM_ERROR("No more free VCE handles!\n");
-
-	return -EINVAL;
+	return r;
 }
 
 /**
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vce.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_vce.h
index b6a9d0956c60..7ccdb5927da5 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vce.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vce.h
@@ -33,7 +33,6 @@ int amdgpu_vce_get_create_msg(struct amdgpu_ring *ring, uint32_t handle,
 int amdgpu_vce_get_destroy_msg(struct amdgpu_ring *ring, uint32_t handle,
 			       struct amdgpu_fence **fence);
 void amdgpu_vce_free_handles(struct amdgpu_device *adev, struct drm_file *filp);
-int amdgpu_vce_cs_reloc(struct amdgpu_cs_parser *p, uint32_t ib_idx, int lo, int hi);
 int amdgpu_vce_ring_parse_cs(struct amdgpu_cs_parser *p, uint32_t ib_idx);
 bool amdgpu_vce_ring_emit_semaphore(struct amdgpu_ring *ring,
 				    struct amdgpu_semaphore *semaphore,
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
index 407882b233c7..9a4e3b63f1cb 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
@@ -1001,6 +1001,7 @@ int amdgpu_vm_bo_map(struct amdgpu_device *adev,
 
 	list_add(&mapping->list, &bo_va->mappings);
 	interval_tree_insert(&mapping->it, &vm->va);
+	trace_amdgpu_vm_bo_map(bo_va, mapping);
 
 	bo_va->addr = 0;
 
@@ -1058,6 +1059,7 @@ error_free:
 	mutex_lock(&vm->mutex);
 	list_del(&mapping->list);
 	interval_tree_remove(&mapping->it, &vm->va);
+	trace_amdgpu_vm_bo_unmap(bo_va, mapping);
 	kfree(mapping);
 
 error_unlock:
@@ -1099,6 +1101,7 @@ int amdgpu_vm_bo_unmap(struct amdgpu_device *adev,
 	mutex_lock(&vm->mutex);
 	list_del(&mapping->list);
 	interval_tree_remove(&mapping->it, &vm->va);
+	trace_amdgpu_vm_bo_unmap(bo_va, mapping);
 
 	if (bo_va->addr) {
 		/* clear the old address */
@@ -1139,6 +1142,7 @@ void amdgpu_vm_bo_rmv(struct amdgpu_device *adev,
 	list_for_each_entry_safe(mapping, next, &bo_va->mappings, list) {
 		list_del(&mapping->list);
 		interval_tree_remove(&mapping->it, &vm->va);
+		trace_amdgpu_vm_bo_unmap(bo_va, mapping);
 		if (bo_va->addr)
 			list_add(&mapping->list, &vm->freed);
 		else
diff --git a/drivers/gpu/drm/amd/amdgpu/cik.c b/drivers/gpu/drm/amd/amdgpu/cik.c
index 5dab578d6462..341c56681841 100644
--- a/drivers/gpu/drm/amd/amdgpu/cik.c
+++ b/drivers/gpu/drm/amd/amdgpu/cik.c
@@ -2256,10 +2256,6 @@ int cik_set_ip_blocks(struct amdgpu_device *adev)
 		return -EINVAL;
 	}
 
-	adev->ip_block_enabled = kcalloc(adev->num_ip_blocks, sizeof(bool), GFP_KERNEL);
-	if (adev->ip_block_enabled == NULL)
-		return -ENOMEM;
-
 	return 0;
 }
 
diff --git a/drivers/gpu/drm/amd/amdgpu/cikd.h b/drivers/gpu/drm/amd/amdgpu/cikd.h
index 220865a44814..d19085a97064 100644
--- a/drivers/gpu/drm/amd/amdgpu/cikd.h
+++ b/drivers/gpu/drm/amd/amdgpu/cikd.h
@@ -552,4 +552,10 @@
 #define VCE_CMD_IB_AUTO		0x00000005
 #define VCE_CMD_SEMAPHORE	0x00000006
 
+/* valid for both DEFAULT_MTYPE and APE1_MTYPE */
+enum {
+	MTYPE_CACHED = 0,
+	MTYPE_NONCACHED = 3
+};
+
 #endif
diff --git a/drivers/gpu/drm/amd/amdgpu/cz_dpm.c b/drivers/gpu/drm/amd/amdgpu/cz_dpm.c
index e4936a452bc6..1a2d419cbf16 100644
--- a/drivers/gpu/drm/amd/amdgpu/cz_dpm.c
+++ b/drivers/gpu/drm/amd/amdgpu/cz_dpm.c
@@ -425,7 +425,7 @@ static int cz_dpm_init(struct amdgpu_device *adev)
 	pi->mgcg_cgtt_local1 = 0x0;
 	pi->clock_slow_down_step = 25000;
 	pi->skip_clock_slow_down = 1;
-	pi->enable_nb_ps_policy = 1;
+	pi->enable_nb_ps_policy = 0;
 	pi->caps_power_containment = true;
 	pi->caps_cac = true;
 	pi->didt_enabled = false;
@@ -1679,25 +1679,31 @@ static int cz_dpm_unforce_dpm_levels(struct amdgpu_device *adev)
 	if (ret)
 		return ret;
 
-	DRM_INFO("DPM unforce state min=%d, max=%d.\n",
-			pi->sclk_dpm.soft_min_clk,
-			pi->sclk_dpm.soft_max_clk);
+	DRM_DEBUG("DPM unforce state min=%d, max=%d.\n",
+		  pi->sclk_dpm.soft_min_clk,
+		  pi->sclk_dpm.soft_max_clk);
 
 	return 0;
 }
 
 static int cz_dpm_force_dpm_level(struct amdgpu_device *adev,
-				enum amdgpu_dpm_forced_level level)
+				  enum amdgpu_dpm_forced_level level)
 {
 	int ret = 0;
 
 	switch (level) {
 	case AMDGPU_DPM_FORCED_LEVEL_HIGH:
+		ret = cz_dpm_unforce_dpm_levels(adev);
+		if (ret)
+			return ret;
 		ret = cz_dpm_force_highest(adev);
 		if (ret)
 			return ret;
 		break;
 	case AMDGPU_DPM_FORCED_LEVEL_LOW:
+		ret = cz_dpm_unforce_dpm_levels(adev);
+		if (ret)
+			return ret;
 		ret = cz_dpm_force_lowest(adev);
 		if (ret)
 			return ret;
@@ -1711,6 +1717,8 @@ static int cz_dpm_force_dpm_level(struct amdgpu_device *adev,
 		break;
 	}
 
+	adev->pm.dpm.forced_level = level;
+
 	return ret;
 }
 
diff --git a/drivers/gpu/drm/amd/amdgpu/cz_dpm.h b/drivers/gpu/drm/amd/amdgpu/cz_dpm.h
index 782a74107664..99e1afc89629 100644
--- a/drivers/gpu/drm/amd/amdgpu/cz_dpm.h
+++ b/drivers/gpu/drm/amd/amdgpu/cz_dpm.h
@@ -46,7 +46,7 @@
 
 /* Do not change the following, it is also defined in SMU8.h */
 #define SMU_EnabledFeatureScoreboard_AcpDpmOn		0x00000001
-#define SMU_EnabledFeatureScoreboard_SclkDpmOn		0x00100000
+#define SMU_EnabledFeatureScoreboard_SclkDpmOn		0x00200000
 #define SMU_EnabledFeatureScoreboard_UvdDpmOn		0x00800000
 #define SMU_EnabledFeatureScoreboard_VceDpmOn		0x01000000
 
diff --git a/drivers/gpu/drm/amd/amdgpu/dce_v10_0.c b/drivers/gpu/drm/amd/amdgpu/dce_v10_0.c
index 5cde635978f9..6e77964f1b64 100644
--- a/drivers/gpu/drm/amd/amdgpu/dce_v10_0.c
+++ b/drivers/gpu/drm/amd/amdgpu/dce_v10_0.c
@@ -3403,19 +3403,25 @@ static int dce_v10_0_crtc_irq(struct amdgpu_device *adev,
 
 	switch (entry->src_data) {
 	case 0: /* vblank */
-		if (disp_int & interrupt_status_offsets[crtc].vblank) {
+		if (disp_int & interrupt_status_offsets[crtc].vblank)
 			dce_v10_0_crtc_vblank_int_ack(adev, crtc);
-			if (amdgpu_irq_enabled(adev, source, irq_type)) {
-				drm_handle_vblank(adev->ddev, crtc);
-			}
-			DRM_DEBUG("IH: D%d vblank\n", crtc + 1);
+		else
+			DRM_DEBUG("IH: IH event w/o asserted irq bit?\n");
+
+		if (amdgpu_irq_enabled(adev, source, irq_type)) {
+			drm_handle_vblank(adev->ddev, crtc);
 		}
+		DRM_DEBUG("IH: D%d vblank\n", crtc + 1);
+
 		break;
 	case 1: /* vline */
-		if (disp_int & interrupt_status_offsets[crtc].vline) {
+		if (disp_int & interrupt_status_offsets[crtc].vline)
 			dce_v10_0_crtc_vline_int_ack(adev, crtc);
-			DRM_DEBUG("IH: D%d vline\n", crtc + 1);
-		}
+		else
+			DRM_DEBUG("IH: IH event w/o asserted irq bit?\n");
+
+		DRM_DEBUG("IH: D%d vline\n", crtc + 1);
+
 		break;
 	default:
 		DRM_DEBUG("Unhandled interrupt: %d %d\n", entry->src_id, entry->src_data);
diff --git a/drivers/gpu/drm/amd/amdgpu/dce_v11_0.c b/drivers/gpu/drm/amd/amdgpu/dce_v11_0.c
index 95efd98b202d..7f7abb0e0be5 100644
--- a/drivers/gpu/drm/amd/amdgpu/dce_v11_0.c
+++ b/drivers/gpu/drm/amd/amdgpu/dce_v11_0.c
@@ -3402,19 +3402,25 @@ static int dce_v11_0_crtc_irq(struct amdgpu_device *adev,
 
 	switch (entry->src_data) {
 	case 0: /* vblank */
-		if (disp_int & interrupt_status_offsets[crtc].vblank) {
+		if (disp_int & interrupt_status_offsets[crtc].vblank)
 			dce_v11_0_crtc_vblank_int_ack(adev, crtc);
-			if (amdgpu_irq_enabled(adev, source, irq_type)) {
-				drm_handle_vblank(adev->ddev, crtc);
-			}
-			DRM_DEBUG("IH: D%d vblank\n", crtc + 1);
+		else
+			DRM_DEBUG("IH: IH event w/o asserted irq bit?\n");
+
+		if (amdgpu_irq_enabled(adev, source, irq_type)) {
+			drm_handle_vblank(adev->ddev, crtc);
 		}
+		DRM_DEBUG("IH: D%d vblank\n", crtc + 1);
+
 		break;
 	case 1: /* vline */
-		if (disp_int & interrupt_status_offsets[crtc].vline) {
+		if (disp_int & interrupt_status_offsets[crtc].vline)
 			dce_v11_0_crtc_vline_int_ack(adev, crtc);
-			DRM_DEBUG("IH: D%d vline\n", crtc + 1);
-		}
+		else
+			DRM_DEBUG("IH: IH event w/o asserted irq bit?\n");
+
+		DRM_DEBUG("IH: D%d vline\n", crtc + 1);
+
 		break;
 	default:
 		DRM_DEBUG("Unhandled interrupt: %d %d\n", entry->src_id, entry->src_data);
diff --git a/drivers/gpu/drm/amd/amdgpu/dce_v8_0.c b/drivers/gpu/drm/amd/amdgpu/dce_v8_0.c
index 72c27ac915f2..cc050a329c49 100644
--- a/drivers/gpu/drm/amd/amdgpu/dce_v8_0.c
+++ b/drivers/gpu/drm/amd/amdgpu/dce_v8_0.c
@@ -2566,6 +2566,7 @@ static void dce_v8_0_crtc_dpms(struct drm_crtc *crtc, int mode)
 	struct drm_device *dev = crtc->dev;
 	struct amdgpu_device *adev = dev->dev_private;
 	struct amdgpu_crtc *amdgpu_crtc = to_amdgpu_crtc(crtc);
+	unsigned type;
 
 	switch (mode) {
 	case DRM_MODE_DPMS_ON:
@@ -2574,6 +2575,9 @@ static void dce_v8_0_crtc_dpms(struct drm_crtc *crtc, int mode)
 		dce_v8_0_vga_enable(crtc, true);
 		amdgpu_atombios_crtc_blank(crtc, ATOM_DISABLE);
 		dce_v8_0_vga_enable(crtc, false);
+		/* Make sure VBLANK interrupt is still enabled */
+		type = amdgpu_crtc_idx_to_irq_type(adev, amdgpu_crtc->crtc_id);
+		amdgpu_irq_update(adev, &adev->crtc_irq, type);
 		drm_vblank_post_modeset(dev, amdgpu_crtc->crtc_id);
 		dce_v8_0_crtc_load_lut(crtc);
 		break;
@@ -3237,19 +3241,25 @@ static int dce_v8_0_crtc_irq(struct amdgpu_device *adev,
 
 	switch (entry->src_data) {
 	case 0: /* vblank */
-		if (disp_int & interrupt_status_offsets[crtc].vblank) {
+		if (disp_int & interrupt_status_offsets[crtc].vblank)
 			WREG32(mmLB_VBLANK_STATUS + crtc_offsets[crtc], LB_VBLANK_STATUS__VBLANK_ACK_MASK);
-			if (amdgpu_irq_enabled(adev, source, irq_type)) {
-				drm_handle_vblank(adev->ddev, crtc);
-			}
-			DRM_DEBUG("IH: D%d vblank\n", crtc + 1);
+		else
+			DRM_DEBUG("IH: IH event w/o asserted irq bit?\n");
+
+		if (amdgpu_irq_enabled(adev, source, irq_type)) {
+			drm_handle_vblank(adev->ddev, crtc);
 		}
+		DRM_DEBUG("IH: D%d vblank\n", crtc + 1);
+
 		break;
 	case 1: /* vline */
-		if (disp_int & interrupt_status_offsets[crtc].vline) {
+		if (disp_int & interrupt_status_offsets[crtc].vline)
 			WREG32(mmLB_VLINE_STATUS + crtc_offsets[crtc], LB_VLINE_STATUS__VLINE_ACK_MASK);
-			DRM_DEBUG("IH: D%d vline\n", crtc + 1);
-		}
+		else
+			DRM_DEBUG("IH: IH event w/o asserted irq bit?\n");
+
+		DRM_DEBUG("IH: D%d vline\n", crtc + 1);
+
 		break;
 	default:
 		DRM_DEBUG("Unhandled interrupt: %d %d\n", entry->src_id, entry->src_data);
@@ -3379,7 +3389,7 @@ static int dce_v8_0_hpd_irq(struct amdgpu_device *adev,
 	uint32_t disp_int, mask, int_control, tmp;
 	unsigned hpd;
 
-	if (entry->src_data > 6) {
+	if (entry->src_data >= adev->mode_info.num_hpd) {
 		DRM_DEBUG("Unhandled interrupt: %d %d\n", entry->src_id, entry->src_data);
 		return 0;
 	}
diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v7_0.c b/drivers/gpu/drm/amd/amdgpu/gfx_v7_0.c
index cb7907447b81..2c188fb9fd22 100644
--- a/drivers/gpu/drm/amd/amdgpu/gfx_v7_0.c
+++ b/drivers/gpu/drm/amd/amdgpu/gfx_v7_0.c
@@ -2010,6 +2010,46 @@ static void gfx_v7_0_setup_rb(struct amdgpu_device *adev,
 }
 
 /**
+ * gmc_v7_0_init_compute_vmid - gart enable
+ *
+ * @rdev: amdgpu_device pointer
+ *
+ * Initialize compute vmid sh_mem registers
+ *
+ */
+#define DEFAULT_SH_MEM_BASES	(0x6000)
+#define FIRST_COMPUTE_VMID	(8)
+#define LAST_COMPUTE_VMID	(16)
+static void gmc_v7_0_init_compute_vmid(struct amdgpu_device *adev)
+{
+	int i;
+	uint32_t sh_mem_config;
+	uint32_t sh_mem_bases;
+
+	/*
+	 * Configure apertures:
+	 * LDS:         0x60000000'00000000 - 0x60000001'00000000 (4GB)
+	 * Scratch:     0x60000001'00000000 - 0x60000002'00000000 (4GB)
+	 * GPUVM:       0x60010000'00000000 - 0x60020000'00000000 (1TB)
+	*/
+	sh_mem_bases = DEFAULT_SH_MEM_BASES | (DEFAULT_SH_MEM_BASES << 16);
+	sh_mem_config = SH_MEM_ALIGNMENT_MODE_UNALIGNED <<
+			SH_MEM_CONFIG__ALIGNMENT_MODE__SHIFT;
+	sh_mem_config |= MTYPE_NONCACHED << SH_MEM_CONFIG__DEFAULT_MTYPE__SHIFT;
+	mutex_lock(&adev->srbm_mutex);
+	for (i = FIRST_COMPUTE_VMID; i < LAST_COMPUTE_VMID; i++) {
+		cik_srbm_select(adev, 0, 0, 0, i);
+		/* CP and shaders */
+		WREG32(mmSH_MEM_CONFIG, sh_mem_config);
+		WREG32(mmSH_MEM_APE1_BASE, 1);
+		WREG32(mmSH_MEM_APE1_LIMIT, 0);
+		WREG32(mmSH_MEM_BASES, sh_mem_bases);
+	}
+	cik_srbm_select(adev, 0, 0, 0, 0);
+	mutex_unlock(&adev->srbm_mutex);
+}
+
+/**
  * gfx_v7_0_gpu_init - setup the 3D engine
  *
  * @adev: amdgpu_device pointer
@@ -2230,6 +2270,8 @@ static void gfx_v7_0_gpu_init(struct amdgpu_device *adev)
 	cik_srbm_select(adev, 0, 0, 0, 0);
 	mutex_unlock(&adev->srbm_mutex);
 
+	gmc_v7_0_init_compute_vmid(adev);
+
 	WREG32(mmSX_DEBUG_1, 0x20);
 
 	WREG32(mmTA_CNTL_AUX, 0x00010000);
diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v8_0.c b/drivers/gpu/drm/amd/amdgpu/gfx_v8_0.c
index 14242bd33363..1c7c992dea37 100644
--- a/drivers/gpu/drm/amd/amdgpu/gfx_v8_0.c
+++ b/drivers/gpu/drm/amd/amdgpu/gfx_v8_0.c
@@ -1813,10 +1813,7 @@ static u32 gfx_v8_0_get_rb_disabled(struct amdgpu_device *adev,
 	u32 data, mask;
 
 	data = RREG32(mmCC_RB_BACKEND_DISABLE);
-	if (data & 1)
-		data &= CC_RB_BACKEND_DISABLE__BACKEND_DISABLE_MASK;
-	else
-		data = 0;
+	data &= CC_RB_BACKEND_DISABLE__BACKEND_DISABLE_MASK;
 
 	data |= RREG32(mmGC_USER_RB_BACKEND_DISABLE);
 
@@ -1894,6 +1891,51 @@ static void gfx_v8_0_setup_rb(struct amdgpu_device *adev,
 	mutex_unlock(&adev->grbm_idx_mutex);
 }
 
+/**
+ * gmc_v8_0_init_compute_vmid - gart enable
+ *
+ * @rdev: amdgpu_device pointer
+ *
+ * Initialize compute vmid sh_mem registers
+ *
+ */
+#define DEFAULT_SH_MEM_BASES	(0x6000)
+#define FIRST_COMPUTE_VMID	(8)
+#define LAST_COMPUTE_VMID	(16)
+static void gmc_v8_0_init_compute_vmid(struct amdgpu_device *adev)
+{
+	int i;
+	uint32_t sh_mem_config;
+	uint32_t sh_mem_bases;
+
+	/*
+	 * Configure apertures:
+	 * LDS:         0x60000000'00000000 - 0x60000001'00000000 (4GB)
+	 * Scratch:     0x60000001'00000000 - 0x60000002'00000000 (4GB)
+	 * GPUVM:       0x60010000'00000000 - 0x60020000'00000000 (1TB)
+	 */
+	sh_mem_bases = DEFAULT_SH_MEM_BASES | (DEFAULT_SH_MEM_BASES << 16);
+
+	sh_mem_config = SH_MEM_ADDRESS_MODE_HSA64 <<
+			SH_MEM_CONFIG__ADDRESS_MODE__SHIFT |
+			SH_MEM_ALIGNMENT_MODE_UNALIGNED <<
+			SH_MEM_CONFIG__ALIGNMENT_MODE__SHIFT |
+			MTYPE_CC << SH_MEM_CONFIG__DEFAULT_MTYPE__SHIFT |
+			SH_MEM_CONFIG__PRIVATE_ATC_MASK;
+
+	mutex_lock(&adev->srbm_mutex);
+	for (i = FIRST_COMPUTE_VMID; i < LAST_COMPUTE_VMID; i++) {
+		vi_srbm_select(adev, 0, 0, 0, i);
+		/* CP and shaders */
+		WREG32(mmSH_MEM_CONFIG, sh_mem_config);
+		WREG32(mmSH_MEM_APE1_BASE, 1);
+		WREG32(mmSH_MEM_APE1_LIMIT, 0);
+		WREG32(mmSH_MEM_BASES, sh_mem_bases);
+	}
+	vi_srbm_select(adev, 0, 0, 0, 0);
+	mutex_unlock(&adev->srbm_mutex);
+}
+
 static void gfx_v8_0_gpu_init(struct amdgpu_device *adev)
 {
 	u32 gb_addr_config;
@@ -2113,6 +2155,8 @@ static void gfx_v8_0_gpu_init(struct amdgpu_device *adev)
 	vi_srbm_select(adev, 0, 0, 0, 0);
 	mutex_unlock(&adev->srbm_mutex);
 
+	gmc_v8_0_init_compute_vmid(adev);
+
 	mutex_lock(&adev->grbm_idx_mutex);
 	/*
 	 * making sure that the following register writes will be broadcasted
@@ -3081,7 +3125,7 @@ static int gfx_v8_0_cp_compute_resume(struct amdgpu_device *adev)
 				WREG32(mmCP_MEC_DOORBELL_RANGE_LOWER,
 				       AMDGPU_DOORBELL_KIQ << 2);
 				WREG32(mmCP_MEC_DOORBELL_RANGE_UPPER,
-				       AMDGPU_DOORBELL_MEC_RING7 << 2);
+						0x7FFFF << 2);
 			}
 			tmp = RREG32(mmCP_HQD_PQ_DOORBELL_CONTROL);
 			tmp = REG_SET_FIELD(tmp, CP_HQD_PQ_DOORBELL_CONTROL,
@@ -3097,6 +3141,12 @@ static int gfx_v8_0_cp_compute_resume(struct amdgpu_device *adev)
 		WREG32(mmCP_HQD_PQ_DOORBELL_CONTROL,
 		       mqd->cp_hqd_pq_doorbell_control);
 
+		/* reset read and write pointers, similar to CP_RB0_WPTR/_RPTR */
+		ring->wptr = 0;
+		mqd->cp_hqd_pq_wptr = ring->wptr;
+		WREG32(mmCP_HQD_PQ_WPTR, mqd->cp_hqd_pq_wptr);
+		mqd->cp_hqd_pq_rptr = RREG32(mmCP_HQD_PQ_RPTR);
+
 		/* set the vmid for the queue */
 		mqd->cp_hqd_vmid = 0;
 		WREG32(mmCP_HQD_VMID, mqd->cp_hqd_vmid);
diff --git a/drivers/gpu/drm/amd/amdgpu/sdma_v3_0.c b/drivers/gpu/drm/amd/amdgpu/sdma_v3_0.c
index e3c1fde75363..7bb37b93993f 100644
--- a/drivers/gpu/drm/amd/amdgpu/sdma_v3_0.c
+++ b/drivers/gpu/drm/amd/amdgpu/sdma_v3_0.c
@@ -439,6 +439,31 @@ static void sdma_v3_0_rlc_stop(struct amdgpu_device *adev)
 }
 
 /**
+ * sdma_v3_0_ctx_switch_enable - stop the async dma engines context switch
+ *
+ * @adev: amdgpu_device pointer
+ * @enable: enable/disable the DMA MEs context switch.
+ *
+ * Halt or unhalt the async dma engines context switch (VI).
+ */
+static void sdma_v3_0_ctx_switch_enable(struct amdgpu_device *adev, bool enable)
+{
+	u32 f32_cntl;
+	int i;
+
+	for (i = 0; i < SDMA_MAX_INSTANCE; i++) {
+		f32_cntl = RREG32(mmSDMA0_CNTL + sdma_offsets[i]);
+		if (enable)
+			f32_cntl = REG_SET_FIELD(f32_cntl, SDMA0_CNTL,
+					AUTO_CTXSW_ENABLE, 1);
+		else
+			f32_cntl = REG_SET_FIELD(f32_cntl, SDMA0_CNTL,
+					AUTO_CTXSW_ENABLE, 0);
+		WREG32(mmSDMA0_CNTL + sdma_offsets[i], f32_cntl);
+	}
+}
+
+/**
  * sdma_v3_0_enable - stop the async dma engines
  *
  * @adev: amdgpu_device pointer
@@ -648,6 +673,8 @@ static int sdma_v3_0_start(struct amdgpu_device *adev)
 
 	/* unhalt the MEs */
 	sdma_v3_0_enable(adev, true);
+	/* enable sdma ring preemption */
+	sdma_v3_0_ctx_switch_enable(adev, true);
 
 	/* start the gfx rings and rlc compute queues */
 	r = sdma_v3_0_gfx_resume(adev);
@@ -1079,6 +1106,7 @@ static int sdma_v3_0_hw_fini(void *handle)
 {
 	struct amdgpu_device *adev = (struct amdgpu_device *)handle;
 
+	sdma_v3_0_ctx_switch_enable(adev, false);
 	sdma_v3_0_enable(adev, false);
 
 	return 0;
diff --git a/drivers/gpu/drm/amd/amdgpu/vi.c b/drivers/gpu/drm/amd/amdgpu/vi.c
index 90fc93c2c1d0..68552da40287 100644
--- a/drivers/gpu/drm/amd/amdgpu/vi.c
+++ b/drivers/gpu/drm/amd/amdgpu/vi.c
@@ -122,6 +122,32 @@ static void vi_smc_wreg(struct amdgpu_device *adev, u32 reg, u32 v)
 	spin_unlock_irqrestore(&adev->smc_idx_lock, flags);
 }
 
+/* smu_8_0_d.h */
+#define mmMP0PUB_IND_INDEX                                                      0x180
+#define mmMP0PUB_IND_DATA                                                       0x181
+
+static u32 cz_smc_rreg(struct amdgpu_device *adev, u32 reg)
+{
+	unsigned long flags;
+	u32 r;
+
+	spin_lock_irqsave(&adev->smc_idx_lock, flags);
+	WREG32(mmMP0PUB_IND_INDEX, (reg));
+	r = RREG32(mmMP0PUB_IND_DATA);
+	spin_unlock_irqrestore(&adev->smc_idx_lock, flags);
+	return r;
+}
+
+static void cz_smc_wreg(struct amdgpu_device *adev, u32 reg, u32 v)
+{
+	unsigned long flags;
+
+	spin_lock_irqsave(&adev->smc_idx_lock, flags);
+	WREG32(mmMP0PUB_IND_INDEX, (reg));
+	WREG32(mmMP0PUB_IND_DATA, (v));
+	spin_unlock_irqrestore(&adev->smc_idx_lock, flags);
+}
+
 static u32 vi_uvd_ctx_rreg(struct amdgpu_device *adev, u32 reg)
 {
 	unsigned long flags;
@@ -1189,10 +1215,6 @@ int vi_set_ip_blocks(struct amdgpu_device *adev)
 		return -EINVAL;
 	}
 
-	adev->ip_block_enabled = kcalloc(adev->num_ip_blocks, sizeof(bool), GFP_KERNEL);
-	if (adev->ip_block_enabled == NULL)
-		return -ENOMEM;
-
 	return 0;
 }
 
@@ -1226,8 +1248,13 @@ static int vi_common_early_init(void *handle)
 	bool smc_enabled = false;
 	struct amdgpu_device *adev = (struct amdgpu_device *)handle;
 
-	adev->smc_rreg = &vi_smc_rreg;
-	adev->smc_wreg = &vi_smc_wreg;
+	if (adev->flags & AMDGPU_IS_APU) {
+		adev->smc_rreg = &cz_smc_rreg;
+		adev->smc_wreg = &cz_smc_wreg;
+	} else {
+		adev->smc_rreg = &vi_smc_rreg;
+		adev->smc_wreg = &vi_smc_wreg;
+	}
 	adev->pcie_rreg = &vi_pcie_rreg;
 	adev->pcie_wreg = &vi_pcie_wreg;
 	adev->uvd_ctx_rreg = &vi_uvd_ctx_rreg;