From 225758d8ba4fdcc1e8c9cf617fd89529bd4a9596 Mon Sep 17 00:00:00 2001 From: Jerome Glisse Date: Tue, 9 Mar 2010 14:45:10 +0000 Subject: drm/radeon/kms: fence cleanup + more reliable GPU lockup detection V4 This patch cleanup the fence code, it drops the timeout field of fence as the time to complete each IB is unpredictable and shouldn't be bound. The fence cleanup lead to GPU lockup detection improvement, this patch introduce a callback, allowing to do asic specific test for lockup detection. In this patch the CP is use as a first indicator of GPU lockup. If CP doesn't make progress during 1second we assume we are facing a GPU lockup. To avoid overhead of testing GPU lockup frequently due to fence taking time to be signaled we query the lockup callback every 500msec. There is plenty code comment explaining the design & choise inside the code. This have been tested mostly on R3XX/R5XX hw, in normal running destkop (compiz firefox, quake3 running) the lockup callback wasn't call once (1 hour session). Also tested with forcing GPU lockup and lockup was reported after the 1s CP activity timeout. V2 switch to 500ms timeout so GPU lockup get call at least 2 times in less than 2sec. V3 store last jiffies in fence struct so on ERESTART, EBUSY we keep track of how long we already wait for a given fence V4 make sure we got up to date cp read pointer so we don't have false positive Signed-off-by: Jerome Glisse Signed-off-by: Dave Airlie --- drivers/gpu/drm/radeon/radeon_fence.c | 102 ++++++++++++++++++---------------- 1 file changed, 55 insertions(+), 47 deletions(-) (limited to 'drivers/gpu/drm/radeon/radeon_fence.c') diff --git a/drivers/gpu/drm/radeon/radeon_fence.c b/drivers/gpu/drm/radeon/radeon_fence.c index 8495d4e32e18..393154268dea 100644 --- a/drivers/gpu/drm/radeon/radeon_fence.c +++ b/drivers/gpu/drm/radeon/radeon_fence.c @@ -57,7 +57,6 @@ int radeon_fence_emit(struct radeon_device *rdev, struct radeon_fence *fence) radeon_fence_ring_emit(rdev, fence); fence->emited = true; - fence->timeout = jiffies + ((2000 * HZ) / 1000); list_del(&fence->list); list_add_tail(&fence->list, &rdev->fence_drv.emited); write_unlock_irqrestore(&rdev->fence_drv.lock, irq_flags); @@ -70,15 +69,34 @@ static bool radeon_fence_poll_locked(struct radeon_device *rdev) struct list_head *i, *n; uint32_t seq; bool wake = false; + unsigned long cjiffies; - if (rdev == NULL) { - return true; - } - if (rdev->shutdown) { - return true; - } seq = RREG32(rdev->fence_drv.scratch_reg); - rdev->fence_drv.last_seq = seq; + if (seq != rdev->fence_drv.last_seq) { + rdev->fence_drv.last_seq = seq; + rdev->fence_drv.last_jiffies = jiffies; + rdev->fence_drv.last_timeout = RADEON_FENCE_JIFFIES_TIMEOUT; + } else { + cjiffies = jiffies; + if (time_after(cjiffies, rdev->fence_drv.last_jiffies)) { + cjiffies -= rdev->fence_drv.last_jiffies; + if (time_after(rdev->fence_drv.last_timeout, cjiffies)) { + /* update the timeout */ + rdev->fence_drv.last_timeout -= cjiffies; + } else { + /* the 500ms timeout is elapsed we should test + * for GPU lockup + */ + rdev->fence_drv.last_timeout = 1; + } + } else { + /* wrap around update last jiffies, we will just wait + * a little longer + */ + rdev->fence_drv.last_jiffies = cjiffies; + } + return false; + } n = NULL; list_for_each(i, &rdev->fence_drv.emited) { fence = list_entry(i, struct radeon_fence, list); @@ -170,9 +188,8 @@ bool radeon_fence_signaled(struct radeon_fence *fence) int radeon_fence_wait(struct radeon_fence *fence, bool intr) { struct radeon_device *rdev; - unsigned long cur_jiffies; - unsigned long timeout; - bool expired = false; + unsigned long irq_flags, timeout; + u32 seq; int r; if (fence == NULL) { @@ -183,14 +200,10 @@ int radeon_fence_wait(struct radeon_fence *fence, bool intr) if (radeon_fence_signaled(fence)) { return 0; } - + timeout = rdev->fence_drv.last_timeout; retry: - cur_jiffies = jiffies; - timeout = HZ / 100; - if (time_after(fence->timeout, cur_jiffies)) { - timeout = fence->timeout - cur_jiffies; - } - + /* save current sequence used to check for GPU lockup */ + seq = rdev->fence_drv.last_seq; if (intr) { radeon_irq_kms_sw_irq_get(rdev); r = wait_event_interruptible_timeout(rdev->fence_drv.queue, @@ -205,38 +218,34 @@ retry: radeon_irq_kms_sw_irq_put(rdev); } if (unlikely(!radeon_fence_signaled(fence))) { - if (unlikely(r == 0)) { - expired = true; + /* we were interrupted for some reason and fence isn't + * isn't signaled yet, resume wait + */ + if (r) { + timeout = r; + goto retry; } - if (unlikely(expired)) { - timeout = 1; - if (time_after(cur_jiffies, fence->timeout)) { - timeout = cur_jiffies - fence->timeout; - } - timeout = jiffies_to_msecs(timeout); - if (timeout > 500) { - DRM_ERROR("fence(%p:0x%08X) %lums timeout " - "going to reset GPU\n", - fence, fence->seq, timeout); - radeon_gpu_reset(rdev); - WREG32(rdev->fence_drv.scratch_reg, fence->seq); - } + /* don't protect read access to rdev->fence_drv.last_seq + * if we experiencing a lockup the value doesn't change + */ + if (seq == rdev->fence_drv.last_seq && radeon_gpu_is_lockup(rdev)) { + /* good news we believe it's a lockup */ + dev_warn(rdev->dev, "GPU lockup (last fence id 0x%08X)\n", seq); + r = radeon_gpu_reset(rdev); + if (r) + return r; + /* FIXME: what should we do ? marking everyone + * as signaled for now + */ + WREG32(rdev->fence_drv.scratch_reg, fence->seq); } + timeout = RADEON_FENCE_JIFFIES_TIMEOUT; + write_lock_irqsave(&rdev->fence_drv.lock, irq_flags); + rdev->fence_drv.last_timeout = RADEON_FENCE_JIFFIES_TIMEOUT; + rdev->fence_drv.last_jiffies = jiffies; + write_unlock_irqrestore(&rdev->fence_drv.lock, irq_flags); goto retry; } - if (unlikely(expired)) { - rdev->fence_drv.count_timeout++; - cur_jiffies = jiffies; - timeout = 1; - if (time_after(cur_jiffies, fence->timeout)) { - timeout = cur_jiffies - fence->timeout; - } - timeout = jiffies_to_msecs(timeout); - DRM_ERROR("fence(%p:0x%08X) %lums timeout\n", - fence, fence->seq, timeout); - DRM_ERROR("last signaled fence(0x%08X)\n", - rdev->fence_drv.last_seq); - } return 0; } @@ -332,7 +341,6 @@ int radeon_fence_driver_init(struct radeon_device *rdev) INIT_LIST_HEAD(&rdev->fence_drv.created); INIT_LIST_HEAD(&rdev->fence_drv.emited); INIT_LIST_HEAD(&rdev->fence_drv.signaled); - rdev->fence_drv.count_timeout = 0; init_waitqueue_head(&rdev->fence_drv.queue); rdev->fence_drv.initialized = true; write_unlock_irqrestore(&rdev->fence_drv.lock, irq_flags); -- cgit v1.2.3 From a2d07b7438f015a0349bc9af3c96a8164549bbc5 Mon Sep 17 00:00:00 2001 From: Jerome Glisse Date: Tue, 9 Mar 2010 14:45:11 +0000 Subject: drm/radeon/kms: rename gpu_reset to asic_reset Patch rename gpu_reset to asic_reset in prevision of having gpu_reset doing more stuff than just basic asic reset. Signed-off-by: Jerome Glisse Signed-off-by: Dave Airlie --- drivers/gpu/drm/radeon/radeon_fence.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'drivers/gpu/drm/radeon/radeon_fence.c') diff --git a/drivers/gpu/drm/radeon/radeon_fence.c b/drivers/gpu/drm/radeon/radeon_fence.c index 393154268dea..2560740ff922 100644 --- a/drivers/gpu/drm/radeon/radeon_fence.c +++ b/drivers/gpu/drm/radeon/radeon_fence.c @@ -231,7 +231,7 @@ retry: if (seq == rdev->fence_drv.last_seq && radeon_gpu_is_lockup(rdev)) { /* good news we believe it's a lockup */ dev_warn(rdev->dev, "GPU lockup (last fence id 0x%08X)\n", seq); - r = radeon_gpu_reset(rdev); + r = radeon_asic_reset(rdev); if (r) return r; /* FIXME: what should we do ? marking everyone -- cgit v1.2.3 From 90aca4d2740255bd130ea71a91530b9920c70abe Mon Sep 17 00:00:00 2001 From: Jerome Glisse Date: Tue, 9 Mar 2010 14:45:12 +0000 Subject: drm/radeon/kms: simplify & improve GPU reset V2 This simplify and improve GPU reset for R1XX-R6XX hw, it's not 100% reliable here are result: - R1XX/R2XX works bunch of time in a row, sometimes it seems it can work indifinitly - R3XX/R3XX the most unreliable one, sometimes you will be able to reset few times, sometimes not even once - R5XX more reliable than previous hw, seems to work most of the times but once in a while it fails for no obvious reasons (same status than previous reset just no same happy ending) - R6XX/R7XX are lot more reliable with this patch, still it seems that it can fail after a bunch (reset every 2sec for 3hour bring down the GPU & computer) This have been tested on various hw, for some odd reasons i wasn't able to lockup RS480/RS690 (while they use to love locking up). Note that on R1XX-R5XX the cursor will disapear after lockup haven't checked why, switch to console and back to X will restore cursor. Next step is to record the bogus command that leaded to the lockup. V2 Fix r6xx resume path to avoid reinitializing blit module, use the gpu_lockup boolean to avoid entering inifinite waiting loop on fence while reiniting the GPU Signed-off-by: Jerome Glisse Signed-off-by: Dave Airlie --- drivers/gpu/drm/radeon/radeon_fence.c | 13 ++++++++----- 1 file changed, 8 insertions(+), 5 deletions(-) (limited to 'drivers/gpu/drm/radeon/radeon_fence.c') diff --git a/drivers/gpu/drm/radeon/radeon_fence.c b/drivers/gpu/drm/radeon/radeon_fence.c index 2560740ff922..fcd7802d8a71 100644 --- a/drivers/gpu/drm/radeon/radeon_fence.c +++ b/drivers/gpu/drm/radeon/radeon_fence.c @@ -209,8 +209,9 @@ retry: r = wait_event_interruptible_timeout(rdev->fence_drv.queue, radeon_fence_signaled(fence), timeout); radeon_irq_kms_sw_irq_put(rdev); - if (unlikely(r < 0)) + if (unlikely(r < 0)) { return r; + } } else { radeon_irq_kms_sw_irq_get(rdev); r = wait_event_timeout(rdev->fence_drv.queue, @@ -230,14 +231,16 @@ retry: */ if (seq == rdev->fence_drv.last_seq && radeon_gpu_is_lockup(rdev)) { /* good news we believe it's a lockup */ - dev_warn(rdev->dev, "GPU lockup (last fence id 0x%08X)\n", seq); - r = radeon_asic_reset(rdev); - if (r) - return r; + WARN(1, "GPU lockup (waiting for 0x%08X last fence id 0x%08X)\n", fence->seq, seq); /* FIXME: what should we do ? marking everyone * as signaled for now */ + rdev->gpu_lockup = true; WREG32(rdev->fence_drv.scratch_reg, fence->seq); + r = radeon_gpu_reset(rdev); + if (r) + return r; + rdev->gpu_lockup = false; } timeout = RADEON_FENCE_JIFFIES_TIMEOUT; write_lock_irqsave(&rdev->fence_drv.lock, irq_flags); -- cgit v1.2.3