summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
-rw-r--r--drivers/video/tegra/host/chip_support.h21
-rw-r--r--drivers/video/tegra/host/debug.c10
-rw-r--r--drivers/video/tegra/host/debug.h5
-rw-r--r--drivers/video/tegra/host/dev.c31
-rw-r--r--drivers/video/tegra/host/dev.h8
-rw-r--r--drivers/video/tegra/host/nvhost_acm.c83
-rw-r--r--drivers/video/tegra/host/nvhost_acm.h2
-rw-r--r--drivers/video/tegra/host/nvhost_cdma.c371
-rw-r--r--drivers/video/tegra/host/nvhost_cdma.h51
-rw-r--r--drivers/video/tegra/host/nvhost_channel.c2
-rw-r--r--drivers/video/tegra/host/nvhost_channel.h4
-rw-r--r--drivers/video/tegra/host/nvhost_hwctx.h3
-rw-r--r--drivers/video/tegra/host/nvhost_intr.c2
-rw-r--r--drivers/video/tegra/host/nvhost_syncpt.c7
-rw-r--r--drivers/video/tegra/host/t20/3dctx_t20.c21
-rw-r--r--drivers/video/tegra/host/t20/cdma_t20.c444
-rw-r--r--drivers/video/tegra/host/t20/channel_t20.c63
-rw-r--r--drivers/video/tegra/host/t20/hardware_t20.h20
-rw-r--r--include/linux/nvhost_ioctl.h10
19 files changed, 1054 insertions, 104 deletions
diff --git a/drivers/video/tegra/host/chip_support.h b/drivers/video/tegra/host/chip_support.h
index eae99671a20d..16dd55dc0f2b 100644
--- a/drivers/video/tegra/host/chip_support.h
+++ b/drivers/video/tegra/host/chip_support.h
@@ -24,6 +24,7 @@
struct output;
struct nvhost_waitchk;
+struct nvhost_userctx_timeout;
struct nvhost_chip_support {
struct {
@@ -42,10 +43,12 @@ struct nvhost_chip_support {
int nr_unpins,
u32 syncpt_id,
u32 syncpt_incrs,
+ struct nvhost_userctx_timeout *timeout,
u32 *syncpt_value,
bool null_kickoff);
int (*read3dreg)(struct nvhost_channel *channel,
struct nvhost_hwctx *hwctx,
+ struct nvhost_userctx_timeout *timeout,
u32 offset,
u32 *value);
} channel;
@@ -54,6 +57,24 @@ struct nvhost_chip_support {
void (*start)(struct nvhost_cdma *);
void (*stop)(struct nvhost_cdma *);
void (*kick)(struct nvhost_cdma *);
+ int (*timeout_init)(struct nvhost_cdma *,
+ u32 syncpt_id);
+ void (*timeout_destroy)(struct nvhost_cdma *);
+ void (*timeout_teardown_begin)(struct nvhost_cdma *);
+ void (*timeout_teardown_end)(struct nvhost_cdma *,
+ u32 getptr);
+ void (*timeout_cpu_incr)(struct nvhost_cdma *,
+ u32 getptr,
+ u32 syncpt_incrs,
+ u32 nr_slots);
+ void (*timeout_pb_incr)(struct nvhost_cdma *,
+ u32 getptr,
+ u32 syncpt_incrs,
+ u32 nr_slots,
+ bool exec_ctxsave);
+ void (*timeout_clear_ctxsave)(struct nvhost_cdma *,
+ u32 getptr,
+ u32 nr_slots);
} cdma;
struct {
diff --git a/drivers/video/tegra/host/debug.c b/drivers/video/tegra/host/debug.c
index a7ff51aed08b..8892a0072480 100644
--- a/drivers/video/tegra/host/debug.c
+++ b/drivers/video/tegra/host/debug.c
@@ -27,6 +27,10 @@
pid_t nvhost_debug_null_kickoff_pid;
+pid_t nvhost_debug_force_timeout_pid;
+u32 nvhost_debug_force_timeout_val;
+u32 nvhost_debug_force_timeout_channel;
+
void nvhost_debug_output(struct output *o, const char* fmt, ...)
{
va_list args;
@@ -113,6 +117,12 @@ void nvhost_debug_init(struct nvhost_master *master)
&nvhost_debug_null_kickoff_pid);
nvhost_debug_scale_init(de);
+ debugfs_create_u32("force_timeout_pid", S_IRUGO|S_IWUSR, de,
+ &nvhost_debug_force_timeout_pid);
+ debugfs_create_u32("force_timeout_val", S_IRUGO|S_IWUSR, de,
+ &nvhost_debug_force_timeout_val);
+ debugfs_create_u32("force_timeout_channel", S_IRUGO|S_IWUSR, de,
+ &nvhost_debug_force_timeout_channel);
}
#else
void nvhost_debug_init(struct nvhost_master *master)
diff --git a/drivers/video/tegra/host/debug.h b/drivers/video/tegra/host/debug.h
index 81017fe8d2a1..d3adcc60cf04 100644
--- a/drivers/video/tegra/host/debug.h
+++ b/drivers/video/tegra/host/debug.h
@@ -23,6 +23,7 @@
#define __NVHOST_DEBUG_H
#include <linux/debugfs.h>
+#include <linux/seq_file.h>
struct output {
void (*fn)(void *ctx, const char* str, size_t len);
@@ -43,4 +44,8 @@ static inline void write_to_printk(void *ctx, const char* str, size_t len)
void nvhost_debug_output(struct output *o, const char* fmt, ...);
void nvhost_debug_scale_init(struct dentry *de);
+extern pid_t nvhost_debug_force_timeout_pid;
+extern u32 nvhost_debug_force_timeout_val;
+extern u32 nvhost_debug_force_timeout_channel;
+
#endif /*__NVHOST_DEBUG_H */
diff --git a/drivers/video/tegra/host/dev.c b/drivers/video/tegra/host/dev.c
index 37d18bc2d175..5869e6f9805d 100644
--- a/drivers/video/tegra/host/dev.c
+++ b/drivers/video/tegra/host/dev.c
@@ -43,6 +43,7 @@
#include <mach/gpufuse.h>
#include "nvhost_scale.h"
+#include "debug.h"
#define DRIVER_NAME "tegra_grhost"
#define IFACE_NAME "nvhost"
@@ -64,6 +65,7 @@ struct nvhost_channel_userctx {
struct nvmap_client *nvmap;
struct nvhost_waitchk waitchks[NVHOST_MAX_WAIT_CHECKS];
struct nvhost_waitchk *cur_waitchk;
+ struct nvhost_userctx_timeout timeout;
};
struct nvhost_ctrl_userctx {
@@ -126,6 +128,7 @@ static int nvhost_channelopen(struct inode *inode, struct file *filp)
priv->hwctx = ch->ctxhandler.alloc(ch);
if (!priv->hwctx)
goto fail;
+ priv->hwctx->timeout = &priv->timeout;
}
priv->gathers = nvmap_mmap(priv->gather_mem);
@@ -312,6 +315,12 @@ static int nvhost_ioctl_channel_flush(
if (nvhost_debug_null_kickoff_pid == current->tgid)
null_kickoff = 1;
+ if ((nvhost_debug_force_timeout_pid == current->tgid) &&
+ (nvhost_debug_force_timeout_channel == ctx->ch->chid)) {
+ ctx->timeout.timeout = nvhost_debug_force_timeout_val;
+ }
+ ctx->timeout.syncpt_id = ctx->hdr.syncpt_id;
+
/* context switch if needed, and submit user's gathers to the channel */
BUG_ON(!channel_op(ctx->ch).submit);
err = channel_op(ctx->ch).submit(ctx->ch, ctx->hwctx, ctx->nvmap,
@@ -320,6 +329,7 @@ static int nvhost_ioctl_channel_flush(
ctx->hdr.waitchk_mask,
ctx->unpinarray, num_unpin,
ctx->hdr.syncpt_id, ctx->hdr.syncpt_incrs,
+ &ctx->timeout,
&args->value,
null_kickoff);
if (err)
@@ -334,7 +344,8 @@ static int nvhost_ioctl_channel_read_3d_reg(
{
BUG_ON(!channel_op(ctx->ch).read3dreg);
return channel_op(ctx->ch).read3dreg(ctx->ch, ctx->hwctx,
- args->offset, &args->value);
+ &ctx->timeout,
+ args->offset, &args->value);
}
static long nvhost_channelctl(struct file *filp,
@@ -447,6 +458,17 @@ static long nvhost_channelctl(struct file *filp,
err = nvhost_module_set_rate(&priv->ch->mod, priv, rate, 0);
break;
}
+ case NVHOST_IOCTL_CHANNEL_SET_TIMEOUT:
+ priv->timeout.timeout =
+ (u32)((struct nvhost_set_timeout_args *)buf)->timeout;
+ dev_dbg(&priv->ch->dev->pdev->dev,
+ "%s: setting buffer timeout (%d ms) for userctx 0x%p\n",
+ __func__, priv->timeout.timeout, priv);
+ break;
+ case NVHOST_IOCTL_CHANNEL_GET_TIMEDOUT:
+ ((struct nvhost_get_param_args *)buf)->value =
+ priv->timeout.has_timedout;
+ break;
default:
err = -ENOTTY;
break;
@@ -678,10 +700,6 @@ static void power_host(struct nvhost_module *mod, enum nvhost_power_action actio
if (action == NVHOST_POWER_ACTION_ON) {
nvhost_intr_start(&dev->intr, clk_get_rate(mod->clk[0]));
- /* don't do it, as display may have changed syncpt
- * after the last save
- * nvhost_syncpt_reset(&dev->syncpt);
- */
} else if (action == NVHOST_POWER_ACTION_OFF) {
int i;
for (i = 0; i < dev->nb_channels; i++)
@@ -939,7 +957,6 @@ static int __devinit nvhost_probe(struct platform_device *pdev)
}
}
-
err = nvhost_cpuaccess_init(&host->cpuaccess, pdev);
if (err)
goto fail;
@@ -976,7 +993,6 @@ fail:
nvhost_remove_chip_support(host);
if (host->nvmap)
nvmap_client_put(host->nvmap);
- /* TODO: [ahatala 2010-05-04] */
kfree(host);
return err;
}
@@ -986,7 +1002,6 @@ static int __exit nvhost_remove(struct platform_device *pdev)
struct nvhost_master *host = platform_get_drvdata(pdev);
nvhost_remove_chip_support(host);
nvhost_remove_sysfs(&pdev->dev);
- /*kfree(host);?*/
return 0;
}
diff --git a/drivers/video/tegra/host/dev.h b/drivers/video/tegra/host/dev.h
index ee79bddf05d5..02b248b38d62 100644
--- a/drivers/video/tegra/host/dev.h
+++ b/drivers/video/tegra/host/dev.h
@@ -31,6 +31,7 @@
#include "chip_support.h"
#define NVHOST_MAJOR 0 /* dynamic */
+struct nvhost_hwctx;
struct nvhost_master {
void __iomem *aperture;
@@ -56,6 +57,13 @@ struct nvhost_master {
struct nvhost_chip_support op;
};
+struct nvhost_userctx_timeout {
+ u32 timeout;
+ bool has_timedout;
+ struct nvhost_hwctx *hwctx;
+ int syncpt_id;
+};
+
void nvhost_debug_init(struct nvhost_master *master);
void nvhost_debug_dump(struct nvhost_master *master);
diff --git a/drivers/video/tegra/host/nvhost_acm.c b/drivers/video/tegra/host/nvhost_acm.c
index 164617c7229d..9caaf4817257 100644
--- a/drivers/video/tegra/host/nvhost_acm.c
+++ b/drivers/video/tegra/host/nvhost_acm.c
@@ -26,6 +26,7 @@
#include <linux/sched.h>
#include <linux/err.h>
#include <linux/device.h>
+#include <linux/delay.h>
#include <mach/powergate.h>
#include <mach/clk.h>
#include <mach/hardware.h>
@@ -34,6 +35,47 @@
#define ACM_POWERDOWN_HANDLER_DELAY_MSEC 25
#define ACM_SUSPEND_WAIT_FOR_IDLE_TIMEOUT (2 * HZ)
+#define POWERGATE_DELAY 10
+
+void nvhost_module_reset(struct nvhost_module *mod)
+{
+ struct nvhost_master *dev;
+ dev = container_of(mod, struct nvhost_channel, mod)->dev;
+
+ dev_dbg(&dev->pdev->dev,
+ "%s: asserting %s module reset (id %d, id2 %d)\n",
+ __func__, mod->name,
+ mod->powergate_id, mod->powergate_id2);
+
+ /* assert module and mc client reset */
+ if (mod->powergate_id != -1) {
+ tegra_powergate_mc_disable(mod->powergate_id);
+ tegra_periph_reset_assert(mod->clk[0]);
+ tegra_powergate_mc_flush(mod->powergate_id);
+ }
+ if (mod->powergate_id2 != -1) {
+ tegra_powergate_mc_disable(mod->powergate_id2);
+ tegra_periph_reset_assert(mod->clk[1]);
+ tegra_powergate_mc_flush(mod->powergate_id2);
+ }
+
+ udelay(POWERGATE_DELAY);
+
+ /* deassert reset */
+ if (mod->powergate_id != -1) {
+ tegra_powergate_mc_flush_done(mod->powergate_id);
+ tegra_periph_reset_deassert(mod->clk[0]);
+ tegra_powergate_mc_enable(mod->powergate_id);
+ }
+ if (mod->powergate_id2 != -1) {
+ tegra_powergate_mc_flush_done(mod->powergate_id2);
+ tegra_periph_reset_deassert(mod->clk[1]);
+ tegra_powergate_mc_enable(mod->powergate_id2);
+ }
+
+ dev_dbg(&dev->pdev->dev, "%s: module %s out of reset\n",
+ __func__, mod->name);
+}
void nvhost_module_busy(struct nvhost_module *mod)
{
@@ -43,13 +85,15 @@ void nvhost_module_busy(struct nvhost_module *mod)
int i = 0;
if (mod->parent)
nvhost_module_busy(mod->parent);
- if (mod->powergate_id != -1)
- tegra_unpowergate_partition(mod->powergate_id);
- if (mod->powergate_id2 != -1)
- tegra_unpowergate_partition(mod->powergate_id2);
+ if (mod->can_powergate) {
+ if (mod->powergate_id != -1)
+ tegra_unpowergate_partition(mod->powergate_id);
+ if (mod->powergate_id2 != -1)
+ tegra_unpowergate_partition(mod->powergate_id2);
+ }
while (i < mod->num_clks)
clk_enable(mod->clk[i++]);
- if (mod->func)
+ if (mod->can_powergate && mod->func)
mod->func(mod, NVHOST_POWER_ACTION_ON);
mod->powered = true;
}
@@ -68,12 +112,12 @@ static void powerdown_handler(struct work_struct *work)
mod->func(mod, NVHOST_POWER_ACTION_OFF);
for (i = 0; i < mod->num_clks; i++)
clk_disable(mod->clk[i]);
- if (mod->powergate_id != -1)
- tegra_powergate_partition(mod->powergate_id);
-
- if (mod->powergate_id2 != -1)
- tegra_powergate_partition(mod->powergate_id2);
-
+ if (mod->can_powergate) {
+ if (mod->powergate_id != -1)
+ tegra_powergate_partition(mod->powergate_id);
+ if (mod->powergate_id2 != -1)
+ tegra_powergate_partition(mod->powergate_id2);
+ }
mod->powered = false;
if (mod->parent)
nvhost_module_idle(mod->parent);
@@ -493,23 +537,30 @@ int nvhost_module_init(struct nvhost_module *mod, const char *name,
mod->num_clks = i;
mod->func = func;
mod->parent = parent;
+ mod->can_powergate = false;
mod->powered = false;
mod->powergate_id = -1;
mod->powergate_id2 = -1;
+ mod->powerdown_delay = ACM_POWERDOWN_HANDLER_DELAY_MSEC;
+
if (strcmp(name, "gr2d") == 0)
mod->powerdown_delay = 0;
- else
- mod->powerdown_delay = ACM_POWERDOWN_HANDLER_DELAY_MSEC;
-
- if (strcmp(name, "gr3d") == 0) {
+ else if (strcmp(name, "gr3d") == 0) {
+ mod->can_powergate = !_3d_powergating_disabled();
if (!scale3d.init)
scale3d_init(mod);
mod->powergate_id = TEGRA_POWERGATE_3D;
+ if (!mod->can_powergate)
+ tegra_unpowergate_partition(mod->powergate_id);
#ifdef CONFIG_ARCH_TEGRA_3x_SOC
mod->powergate_id2 = TEGRA_POWERGATE_3D1;
+ if (!mod->can_powergate)
+ tegra_unpowergate_partition(mod->powergate_id2);
#endif
- } else if (strcmp(name, "mpe") == 0)
+ } else if (strcmp(name, "mpe") == 0) {
+ mod->can_powergate = true;
mod->powergate_id = TEGRA_POWERGATE_MPE;
+ }
if (mod->powergate_id == TEGRA_POWERGATE_MPE
&& _mpe_powergating_disabled()) {
diff --git a/drivers/video/tegra/host/nvhost_acm.h b/drivers/video/tegra/host/nvhost_acm.h
index 6f3011e343cd..42bc89ab9d4b 100644
--- a/drivers/video/tegra/host/nvhost_acm.h
+++ b/drivers/video/tegra/host/nvhost_acm.h
@@ -56,6 +56,7 @@ struct nvhost_module {
atomic_t refcount;
wait_queue_head_t idle;
struct nvhost_module *parent;
+ bool can_powergate;
int powergate_id;
int powergate_id2;
int powerdown_delay;
@@ -68,6 +69,7 @@ int nvhost_module_init(struct nvhost_module *mod, const char *name,
void nvhost_module_deinit(struct nvhost_module *mod);
void nvhost_module_suspend(struct nvhost_module *mod, bool system_suspend);
+void nvhost_module_reset(struct nvhost_module *mod);
void nvhost_module_busy(struct nvhost_module *mod);
void nvhost_module_idle_mult(struct nvhost_module *mod, int refs);
int nvhost_module_add_client(struct nvhost_module *mod, void *priv);
diff --git a/drivers/video/tegra/host/nvhost_cdma.c b/drivers/video/tegra/host/nvhost_cdma.c
index fcce8334f272..b125f76414cc 100644
--- a/drivers/video/tegra/host/nvhost_cdma.c
+++ b/drivers/video/tegra/host/nvhost_cdma.c
@@ -23,8 +23,11 @@
#include "nvhost_cdma.h"
#include "dev.h"
#include <asm/cacheflush.h>
+
#include <linux/slab.h>
#include <trace/events/nvhost.h>
+#include <linux/interrupt.h>
+
/*
* TODO:
* stats
@@ -38,10 +41,13 @@
* The sync queue is a circular buffer of u32s interpreted as:
* 0: SyncPointID
* 1: SyncPointValue
- * 2: NumSlots (how many pushbuffer slots to free)
- * 3: NumHandles
- * 4: nvmap client which pinned the handles
- * 5..: NumHandles * nvmemhandle to unpin
+ * 2: FirstDMAGet (start of submit in pushbuffer)
+ * 3: Timeout (time to live for this submit)
+ * 4: TimeoutContext (userctx that submitted buffer)
+ * 5: NumSlots (how many pushbuffer slots to free)
+ * 6: NumHandles
+ * 7: nvmap client which pinned the handles
+ * 8..: NumHandles * nvmemhandle to unpin
*
* There's always one word unused, so (accounting for wrap):
* - Write == Read => queue empty
@@ -55,7 +61,7 @@
*/
/* Number of words needed to store an entry containing one handle */
-#define SYNC_QUEUE_MIN_ENTRY (4 + (2 * sizeof(void *) / sizeof(u32)))
+#define SYNC_QUEUE_MIN_ENTRY (SQ_IDX_HANDLES + (sizeof(void *)/4))
/**
* Reset to empty queue.
@@ -120,20 +126,46 @@ static unsigned int sync_queue_space(struct sync_queue *queue)
}
/**
+ * Debug routine used to dump sync_queue entries
+ */
+static void dump_sync_queue_entry(struct nvhost_cdma *cdma, u32 *entry)
+{
+ struct nvhost_master *dev = cdma_to_dev(cdma);
+
+ dev_dbg(&dev->pdev->dev, "sync_queue index 0x%x\n",
+ (entry - cdma->sync_queue.buffer));
+ dev_dbg(&dev->pdev->dev, " SYNCPT_ID %d\n",
+ entry[SQ_IDX_SYNCPT_ID]);
+ dev_dbg(&dev->pdev->dev, " SYNCPT_VAL %d\n",
+ entry[SQ_IDX_SYNCPT_VAL]);
+ dev_dbg(&dev->pdev->dev, " FIRST_GET 0x%x\n",
+ entry[SQ_IDX_FIRST_GET]);
+ dev_dbg(&dev->pdev->dev, " TIMEOUT %d\n",
+ entry[SQ_IDX_TIMEOUT]);
+ dev_dbg(&dev->pdev->dev, " TIMEOUT_CTX 0x%x\n",
+ entry[SQ_IDX_TIMEOUT_CTX]);
+ dev_dbg(&dev->pdev->dev, " NUM_SLOTS %d\n",
+ entry[SQ_IDX_NUM_SLOTS]);
+ dev_dbg(&dev->pdev->dev, " NUM_HANDLES %d\n",
+ entry[SQ_IDX_NUM_HANDLES]);
+}
+
+/**
* Add an entry to the sync queue.
*/
-#define entry_size(_cnt) ((1 + _cnt)*sizeof(void *)/sizeof(u32))
+#define entry_size(_cnt) ((_cnt)*sizeof(void *)/sizeof(u32))
static void add_to_sync_queue(struct sync_queue *queue,
u32 sync_point_id, u32 sync_point_value,
u32 nr_slots, struct nvmap_client *user_nvmap,
- struct nvmap_handle **handles, u32 nr_handles)
+ struct nvmap_handle **handles, u32 nr_handles,
+ u32 first_get,
+ struct nvhost_userctx_timeout *timeout)
{
struct nvhost_cdma *cdma;
struct nvhost_master *host;
- u32 write = queue->write;
+ u32 size, write = queue->write;
u32 *p = queue->buffer + write;
- u32 size = 4 + (entry_size(nr_handles));
cdma = container_of(queue, struct nvhost_cdma, sync_queue);
host = cdma_to_dev(cdma);
@@ -141,20 +173,29 @@ static void add_to_sync_queue(struct sync_queue *queue,
BUG_ON(sync_point_id == NVSYNCPT_INVALID);
BUG_ON(sync_queue_space(queue) < nr_handles);
+ size = SQ_IDX_HANDLES;
+ size += entry_size(nr_handles);
+
write += size;
BUG_ON(write > host->sync_queue_size);
- *p++ = sync_point_id;
- *p++ = sync_point_value;
- *p++ = nr_slots;
- *p++ = nr_handles;
- BUG_ON(!user_nvmap);
- *(struct nvmap_client **)p = nvmap_client_get(user_nvmap);
+ p[SQ_IDX_SYNCPT_ID] = sync_point_id;
+ p[SQ_IDX_SYNCPT_VAL] = sync_point_value;
+ p[SQ_IDX_FIRST_GET] = first_get;
+ p[SQ_IDX_TIMEOUT] = timeout->timeout;
+ p[SQ_IDX_NUM_SLOTS] = nr_slots;
+ p[SQ_IDX_NUM_HANDLES] = nr_handles;
- p = (u32 *)((void *)p + sizeof(struct nvmap_client *));
+ *(void **)(&p[SQ_IDX_TIMEOUT_CTX]) = timeout;
- if (nr_handles)
- memcpy(p, handles, nr_handles * sizeof(struct nvmap_handle *));
+ BUG_ON(!user_nvmap);
+ *(struct nvmap_client **)(&p[SQ_IDX_NVMAP_CTX]) =
+ nvmap_client_get(user_nvmap);
+
+ if (nr_handles) {
+ memcpy(&p[SQ_IDX_HANDLES], handles,
+ (nr_handles * sizeof(struct nvmap_handle *)));
+ }
/* If there's not enough room for another entry, wrap to the start. */
if ((write + SYNC_QUEUE_MIN_ENTRY) > host->sync_queue_size) {
@@ -165,7 +206,6 @@ static void add_to_sync_queue(struct sync_queue *queue,
BUG_ON(queue->read == 0);
write = 0;
}
-
queue->write = write;
}
@@ -205,7 +245,8 @@ dequeue_sync_queue_head(struct sync_queue *queue)
BUG_ON(read == queue->write);
- size = 4 + entry_size(queue->buffer[read + 3]);
+ size = SQ_IDX_HANDLES;
+ size += entry_size(queue->buffer[read + SQ_IDX_NUM_HANDLES]);
read += size;
BUG_ON(read > host->sync_queue_size);
@@ -213,12 +254,9 @@ dequeue_sync_queue_head(struct sync_queue *queue)
/* If there's not enough room for another entry, wrap to the start. */
if ((read + SYNC_QUEUE_MIN_ENTRY) > host->sync_queue_size)
read = 0;
-
queue->read = read;
}
-
-
/**
* Return the status of the cdma's sync queue or push buffer for the given event
* - sq empty: returns 1 for empty, 0 for not empty (as in "1 empty queue" :-)
@@ -269,6 +307,40 @@ unsigned int nvhost_cdma_wait(struct nvhost_cdma *cdma, enum cdma_event event)
down(&cdma->sem);
mutex_lock(&cdma->lock);
}
+ return 0;
+}
+
+/**
+ * Start timer for a buffer submission that hasn't completed yet.
+ * Must be called with the cdma lock held.
+ */
+void nvhost_cdma_start_timer(struct nvhost_cdma *cdma, u32 syncpt_id,
+ u32 syncpt_val,
+ struct nvhost_userctx_timeout *timeout)
+{
+ BUG_ON(!timeout);
+ if (cdma->timeout.ctx_timeout) {
+ /* timer already started */
+ return;
+ }
+
+ cdma->timeout.ctx_timeout = timeout;
+ cdma->timeout.syncpt_id = syncpt_id;
+ cdma->timeout.syncpt_val = syncpt_val;
+ cdma->timeout.start_ktime = ktime_get();
+
+ schedule_delayed_work(&cdma->timeout.wq,
+ msecs_to_jiffies(timeout->timeout));
+}
+
+/**
+ * Stop timer when a buffer submission completes.
+ * Must be called with the cdma lock held.
+ */
+static void stop_cdma_timer(struct nvhost_cdma *cdma)
+{
+ cancel_delayed_work(&cdma->timeout.wq);
+ cdma->timeout.ctx_timeout = NULL;
}
/**
@@ -294,7 +366,10 @@ static void update_cdma(struct nvhost_cdma *cdma)
*/
for (;;) {
u32 syncpt_id, syncpt_val;
+ u32 timeout;
+ struct nvhost_userctx_timeout *timeout_ref = NULL;
unsigned int nr_slots, nr_handles;
+ struct nvhost_syncpt *sp = &dev->syncpt;
struct nvmap_handle **handles;
struct nvmap_client *nvmap;
u32 *sync;
@@ -306,26 +381,37 @@ static void update_cdma(struct nvhost_cdma *cdma)
break;
}
- syncpt_id = *sync++;
- syncpt_val = *sync++;
+ syncpt_id = sync[SQ_IDX_SYNCPT_ID];
+ syncpt_val = sync[SQ_IDX_SYNCPT_VAL];
+ timeout = sync[SQ_IDX_TIMEOUT];
+ timeout_ref = (struct nvhost_userctx_timeout *)
+ sync[SQ_IDX_TIMEOUT_CTX];
BUG_ON(syncpt_id == NVSYNCPT_INVALID);
/* Check whether this syncpt has completed, and bail if not */
- if (!nvhost_syncpt_min_cmp(&dev->syncpt, syncpt_id, syncpt_val))
+ if (!nvhost_syncpt_min_cmp(sp, syncpt_id, syncpt_val)) {
+ /* Start timer on next pending syncpt */
+ if (timeout) {
+ nvhost_cdma_start_timer(cdma, syncpt_id,
+ syncpt_val, timeout_ref);
+ }
break;
+ }
- nr_slots = *sync++;
- nr_handles = *sync++;
- nvmap = *(struct nvmap_client **)sync;
- sync = ((void *)sync + sizeof(struct nvmap_client *));
- handles = (struct nvmap_handle **)sync;
+ /* Cancel timeout, when a buffer completes */
+ if (cdma->timeout.ctx_timeout)
+ stop_cdma_timer(cdma);
+
+ nr_slots = sync[SQ_IDX_NUM_SLOTS];
+ nr_handles = sync[SQ_IDX_NUM_HANDLES];
+ nvmap = (struct nvmap_client *)sync[SQ_IDX_NVMAP_CTX];
+ handles = (struct nvmap_handle **)&sync[SQ_IDX_HANDLES];
BUG_ON(!nvmap);
/* Unpin the memory */
nvmap_unpin_handles(nvmap, handles, nr_handles);
-
nvmap_client_put(nvmap);
/* Pop push buffer slots */
@@ -349,6 +435,168 @@ static void update_cdma(struct nvhost_cdma *cdma)
}
}
+static u32 *advance_next_entry(struct nvhost_cdma *cdma, u32 *read)
+{
+ struct nvhost_master *host;
+ u32 ridx;
+
+ host = cdma_to_dev(cdma);
+
+ /* move sync_queue read ptr to next entry */
+ ridx = (read - cdma->sync_queue.buffer);
+ ridx += (SQ_IDX_HANDLES + entry_size(read[SQ_IDX_NUM_HANDLES]));
+ if ((ridx + SYNC_QUEUE_MIN_ENTRY) > host->sync_queue_size)
+ ridx = 0;
+
+ /* return sync_queue entry */
+ return cdma->sync_queue.buffer + ridx;
+}
+
+void nvhost_cdma_update_sync_queue(struct nvhost_cdma *cdma,
+ struct nvhost_syncpt *syncpt, struct device *dev)
+{
+ u32 first_get, get_restart;
+ u32 syncpt_incrs, nr_slots;
+ bool clear_ctxsave, exec_ctxsave;
+ struct sync_queue *queue = &cdma->sync_queue;
+ u32 *sync = sync_queue_head(queue);
+ u32 syncpt_val = nvhost_syncpt_update_min(syncpt,
+ cdma->timeout.syncpt_id);
+
+ dev_dbg(dev,
+ "%s: starting cleanup (thresh %d, queue rd 0x%x wr 0x%x)\n",
+ __func__,
+ syncpt_val, queue->read, queue->write);
+
+ /*
+ * Move the sync_queue read pointer to the first entry that hasn't
+ * completed based on the current HW syncpt value. It's likely there
+ * won't be any (i.e. we're still at the head), but covers the case
+ * where a syncpt incr happens just prior/during the teardown.
+ */
+
+ dev_dbg(dev,
+ "%s: skip completed buffers still in sync_queue\n",
+ __func__);
+
+ while (sync != (queue->buffer + queue->write)) {
+ /* move read ptr to first blocked entry */
+ if (syncpt_val < sync[SQ_IDX_SYNCPT_VAL])
+ break; /* not completed */
+
+ dump_sync_queue_entry(cdma, sync);
+ sync = advance_next_entry(cdma, sync);
+ }
+
+ /*
+ * Walk the sync_queue, first incrementing with the CPU syncpts that
+ * are partially executed (the first buffer) or fully skipped while
+ * still in the current context (slots are also NOP-ed).
+ *
+ * At the point contexts are interleaved, syncpt increments must be
+ * done inline with the pushbuffer from a GATHER buffer to maintain
+ * the order (slots are modified to be a GATHER of syncpt incrs).
+ *
+ * Note: save in get_restart the location where the timed out buffer
+ * started in the PB, so we can start the refetch from there (with the
+ * modified NOP-ed PB slots). This lets things appear to have completed
+ * properly for this buffer and resources are freed.
+ */
+
+ dev_dbg(dev,
+ "%s: perform CPU incr on pending same ctx buffers\n",
+ __func__);
+
+ get_restart = cdma->last_put;
+ if (sync != (queue->buffer + queue->write))
+ get_restart = sync[SQ_IDX_FIRST_GET];
+
+ /* do CPU increments */
+ while (sync != (queue->buffer + queue->write)) {
+
+ /* different context, gets us out of this loop */
+ if ((void *)sync[SQ_IDX_TIMEOUT_CTX] !=
+ cdma->timeout.ctx_timeout)
+ break;
+
+ syncpt_incrs = (sync[SQ_IDX_SYNCPT_VAL] - syncpt_val);
+ first_get = sync[SQ_IDX_FIRST_GET];
+ nr_slots = sync[SQ_IDX_NUM_SLOTS];
+
+ /* won't need a timeout when replayed */
+ sync[SQ_IDX_TIMEOUT] = 0;
+
+ dev_dbg(dev,
+ "%s: CPU incr (%d)\n", __func__, syncpt_incrs);
+
+ dump_sync_queue_entry(cdma, sync);
+
+ /* safe to use CPU to incr syncpts */
+ cdma_op(cdma).timeout_cpu_incr(cdma, first_get,
+ syncpt_incrs, nr_slots);
+ syncpt_val += syncpt_incrs;
+ sync = advance_next_entry(cdma, sync);
+ }
+
+ dev_dbg(dev,
+ "%s: GPU incr blocked interleaved ctx buffers\n",
+ __func__);
+
+ clear_ctxsave = true;
+ exec_ctxsave = false;
+
+ /* setup GPU increments */
+ while (sync != (queue->buffer + queue->write)) {
+
+ syncpt_incrs = (sync[SQ_IDX_SYNCPT_VAL] - syncpt_val);
+ first_get = sync[SQ_IDX_FIRST_GET];
+ nr_slots = sync[SQ_IDX_NUM_SLOTS];
+
+ /* same context, increment in the pushbuffer */
+ if ((void *)sync[SQ_IDX_TIMEOUT_CTX] ==
+ cdma->timeout.ctx_timeout) {
+
+ /* won't need a timeout when replayed */
+ sync[SQ_IDX_TIMEOUT] = 0;
+
+ /* update buffer's syncpts in the pushbuffer */
+ cdma_op(cdma).timeout_pb_incr(cdma, first_get,
+ syncpt_incrs, nr_slots, exec_ctxsave);
+
+ clear_ctxsave = true;
+ exec_ctxsave = false;
+ } else {
+ dev_dbg(dev,
+ "%s: switch to a different userctx\n",
+ __func__);
+ /*
+ * If previous context was the timed out context
+ * then clear its CTXSAVE in this slot.
+ */
+ if (clear_ctxsave) {
+ cdma_op(cdma).timeout_clear_ctxsave(cdma,
+ first_get, nr_slots);
+ clear_ctxsave = false;
+ }
+ exec_ctxsave = true;
+ }
+
+ dump_sync_queue_entry(cdma, sync);
+
+ syncpt_val = sync[SQ_IDX_SYNCPT_VAL];
+ sync = advance_next_entry(cdma, sync);
+ }
+
+ dev_dbg(dev,
+ "%s: finished sync_queue modification\n", __func__);
+
+ /* roll back DMAGET and start up channel again */
+ cdma_op(cdma).timeout_teardown_end(cdma, get_restart);
+
+ cdma->timeout.ctx_timeout->has_timedout = true;
+ mutex_unlock(&cdma->lock);
+}
+
/**
* Create a cdma
*/
@@ -361,6 +609,7 @@ int nvhost_cdma_init(struct nvhost_cdma *cdma)
sema_init(&cdma->sem, 0);
cdma->event = CDMA_EVENT_NONE;
cdma->running = false;
+ cdma->torndown = false;
/* allocate sync queue memory */
cdma->sync_queue.buffer = kzalloc(cdma_to_dev(cdma)->sync_queue_size
@@ -381,25 +630,59 @@ int nvhost_cdma_init(struct nvhost_cdma *cdma)
void nvhost_cdma_deinit(struct nvhost_cdma *cdma)
{
struct push_buffer *pb = &cdma->push_buffer;
+
BUG_ON(!cdma_pb_op(cdma).destroy);
BUG_ON(cdma->running);
kfree(cdma->sync_queue.buffer);
cdma->sync_queue.buffer = 0;
cdma_pb_op(cdma).destroy(pb);
+ cdma_op(cdma).timeout_destroy(cdma);
}
-
/**
* Begin a cdma submit
*/
-void nvhost_cdma_begin(struct nvhost_cdma *cdma)
+int nvhost_cdma_begin(struct nvhost_cdma *cdma,
+ struct nvhost_userctx_timeout *timeout)
{
- BUG_ON(!cdma_op(cdma).start);
mutex_lock(&cdma->lock);
- if (!cdma->running)
+
+ if (timeout && timeout->has_timedout) {
+ struct nvhost_master *dev = cdma_to_dev(cdma);
+ u32 min, max;
+
+ min = nvhost_syncpt_update_min(&dev->syncpt,
+ cdma->timeout.syncpt_id);
+ max = nvhost_syncpt_read_min(&dev->syncpt,
+ cdma->timeout.syncpt_id);
+
+ dev_dbg(&dev->pdev->dev,
+ "%s: skip timed out ctx submit (min = %d, max = %d)\n",
+ __func__, min, max);
+ mutex_unlock(&cdma->lock);
+ return -ETIMEDOUT;
+ }
+ if (timeout->timeout) {
+ /* init state on first submit with timeout value */
+ if (!cdma->timeout.initialized) {
+ int err;
+ BUG_ON(!cdma_op(cdma).timeout_init);
+ err = cdma_op(cdma).timeout_init(cdma,
+ timeout->syncpt_id);
+ if (err) {
+ mutex_unlock(&cdma->lock);
+ return err;
+ }
+ }
+ }
+ if (!cdma->running) {
+ BUG_ON(!cdma_op(cdma).start);
cdma_op(cdma).start(cdma);
+ }
cdma->slots_free = 0;
cdma->slots_used = 0;
+ cdma->first_get = cdma_pb_op(cdma).putptr(&cdma->push_buffer);
+ return 0;
}
/**
@@ -443,8 +726,11 @@ void nvhost_cdma_push_gather(struct nvhost_cdma *cdma,
void nvhost_cdma_end(struct nvhost_cdma *cdma,
struct nvmap_client *user_nvmap,
u32 sync_point_id, u32 sync_point_value,
- struct nvmap_handle **handles, unsigned int nr_handles)
+ struct nvmap_handle **handles, unsigned int nr_handles,
+ struct nvhost_userctx_timeout *timeout)
{
+ bool was_idle = (cdma->sync_queue.read == cdma->sync_queue.write);
+
BUG_ON(!cdma_op(cdma).kick);
cdma_op(cdma).kick(cdma);
@@ -459,15 +745,24 @@ void nvhost_cdma_end(struct nvhost_cdma *cdma,
/* Add reloc entries to sync queue (as many as will fit) */
if (count > nr_handles)
count = nr_handles;
+
add_to_sync_queue(&cdma->sync_queue, sync_point_id,
sync_point_value, cdma->slots_used,
- user_nvmap, handles, count);
+ user_nvmap, handles, count, cdma->first_get,
+ timeout);
+
/* NumSlots only goes in the first packet */
cdma->slots_used = 0;
handles += count;
nr_handles -= count;
}
+ /* start timer on idle -> active transitions */
+ if (timeout->timeout && was_idle) {
+ nvhost_cdma_start_timer(cdma, sync_point_id, sync_point_value,
+ timeout);
+ }
+
mutex_unlock(&cdma->lock);
}
diff --git a/drivers/video/tegra/host/nvhost_cdma.h b/drivers/video/tegra/host/nvhost_cdma.h
index 45c2f7c57a7f..8bdc18b90220 100644
--- a/drivers/video/tegra/host/nvhost_cdma.h
+++ b/drivers/video/tegra/host/nvhost_cdma.h
@@ -31,6 +31,9 @@
#include "nvhost_acm.h"
+struct nvhost_syncpt;
+struct nvhost_userctx_timeout;
+
/*
* cdma
*
@@ -55,12 +58,42 @@ struct push_buffer {
struct nvmap_handle **handles; /* nvmap handle for each opcode pair */
};
+struct syncpt_buffer {
+ struct nvmap_handle_ref *mem; /* handle to pushbuffer memory */
+ u32 *mapped; /* mapped gather buffer (at channel offset) */
+ u32 phys; /* physical address (at channel offset) */
+ u32 incr_per_buffer; /* max # of incrs per GATHER */
+ u32 words_per_incr; /* # of DWORDS in buffer to incr a syncpt */
+};
+
+enum sync_queue_idx {
+ SQ_IDX_SYNCPT_ID = 0,
+ SQ_IDX_SYNCPT_VAL = 1,
+ SQ_IDX_FIRST_GET = 2,
+ SQ_IDX_TIMEOUT = 3,
+ SQ_IDX_TIMEOUT_CTX = 4,
+ SQ_IDX_NUM_SLOTS = (SQ_IDX_TIMEOUT_CTX + sizeof(void *)/4),
+ SQ_IDX_NUM_HANDLES = (SQ_IDX_NUM_SLOTS + 1),
+ SQ_IDX_NVMAP_CTX = (SQ_IDX_NUM_HANDLES + 1),
+ SQ_IDX_HANDLES = (SQ_IDX_NVMAP_CTX + sizeof(void *)/4),
+};
+
struct sync_queue {
unsigned int read; /* read position within buffer */
unsigned int write; /* write position within buffer */
u32 *buffer; /* queue data */
};
+struct buffer_timeout {
+ struct delayed_work wq; /* work queue */
+ bool initialized; /* timer one-time setup flag */
+ u32 syncpt_id; /* buffer completion syncpt id */
+ u32 syncpt_val; /* syncpt value when completed */
+ ktime_t start_ktime; /* starting time */
+ /* context timeout information */
+ struct nvhost_userctx_timeout *ctx_timeout;
+};
+
enum cdma_event {
CDMA_EVENT_NONE, /* not waiting for any event */
CDMA_EVENT_SYNC_QUEUE_EMPTY, /* wait for empty sync queue */
@@ -74,11 +107,14 @@ struct nvhost_cdma {
enum cdma_event event; /* event that sem is waiting for */
unsigned int slots_used; /* pb slots used in current submit */
unsigned int slots_free; /* pb slots free in current submit */
+ unsigned int first_get; /* DMAGET value, where submit begins */
unsigned int last_put; /* last value written to DMAPUT */
struct push_buffer push_buffer; /* channel's push buffer */
+ struct syncpt_buffer syncpt_buffer; /* syncpt incr buffer */
struct sync_queue sync_queue; /* channel's sync queue */
+ struct buffer_timeout timeout; /* channel's timeout state/wq */
bool running;
-
+ bool torndown;
};
#define cdma_to_channel(cdma) container_of(cdma, struct nvhost_channel, cdma)
@@ -88,22 +124,27 @@ struct nvhost_cdma {
#define pb_to_cdma(pb) container_of(pb, struct nvhost_cdma, push_buffer)
#define cdma_pb_op(cdma) (cdma_to_dev(cdma)->op.push_buffer)
-
int nvhost_cdma_init(struct nvhost_cdma *cdma);
void nvhost_cdma_deinit(struct nvhost_cdma *cdma);
void nvhost_cdma_stop(struct nvhost_cdma *cdma);
-void nvhost_cdma_begin(struct nvhost_cdma *cdma);
+int nvhost_cdma_begin(struct nvhost_cdma *cdma,
+ struct nvhost_userctx_timeout *timeout);
void nvhost_cdma_push(struct nvhost_cdma *cdma, u32 op1, u32 op2);
void nvhost_cdma_push_gather(struct nvhost_cdma *cdma,
struct nvmap_handle *handle, u32 op1, u32 op2);
void nvhost_cdma_end(struct nvhost_cdma *cdma,
struct nvmap_client *user_nvmap,
u32 sync_point_id, u32 sync_point_value,
- struct nvmap_handle **handles, unsigned int nr_handles);
+ struct nvmap_handle **handles, unsigned int nr_handles,
+ struct nvhost_userctx_timeout *timeout);
void nvhost_cdma_update(struct nvhost_cdma *cdma);
void nvhost_cdma_flush(struct nvhost_cdma *cdma);
void nvhost_cdma_peek(struct nvhost_cdma *cdma,
u32 dmaget, int slot, u32 *out);
-
unsigned int nvhost_cdma_wait(struct nvhost_cdma *cdma, enum cdma_event event);
+void nvhost_cdma_start_timer(struct nvhost_cdma *cdma, u32 syncpt_id,
+ u32 syncpt_val,
+ struct nvhost_userctx_timeout *timeout);
+void nvhost_cdma_update_sync_queue(struct nvhost_cdma *cdma,
+ struct nvhost_syncpt *syncpt, struct device *dev);
#endif
diff --git a/drivers/video/tegra/host/nvhost_channel.c b/drivers/video/tegra/host/nvhost_channel.c
index ad8d403df0f7..d533ef3cd737 100644
--- a/drivers/video/tegra/host/nvhost_channel.c
+++ b/drivers/video/tegra/host/nvhost_channel.c
@@ -27,8 +27,6 @@
#include <linux/platform_device.h>
-
-
struct nvhost_channel *nvhost_getchannel(struct nvhost_channel *ch)
{
int err = 0;
diff --git a/drivers/video/tegra/host/nvhost_channel.h b/drivers/video/tegra/host/nvhost_channel.h
index c939a19d3fad..c35c6d0a010c 100644
--- a/drivers/video/tegra/host/nvhost_channel.h
+++ b/drivers/video/tegra/host/nvhost_channel.h
@@ -50,6 +50,8 @@ struct nvhost_channeldesc {
struct nvhost_channel {
int refcount;
+ int chid;
+ u32 syncpt_id;
struct mutex reflock;
struct mutex submitlock;
void __iomem *aperture;
@@ -90,6 +92,8 @@ int nvhost_channel_submit(
int nr_unpins,
u32 syncpt_id,
u32 syncpt_incrs,
+ u32 timeout,
+ void *timeout_ctx,
u32 *syncpt_value,
bool null_kickoff);
diff --git a/drivers/video/tegra/host/nvhost_hwctx.h b/drivers/video/tegra/host/nvhost_hwctx.h
index 06df90e58fb5..f128584e96f4 100644
--- a/drivers/video/tegra/host/nvhost_hwctx.h
+++ b/drivers/video/tegra/host/nvhost_hwctx.h
@@ -31,16 +31,19 @@
struct nvhost_channel;
struct nvhost_cdma;
+struct nvhost_userctx_timeout;
struct nvhost_hwctx {
struct kref ref;
struct nvhost_channel *channel;
+ struct nvhost_userctx_timeout *timeout;
bool valid;
struct nvmap_handle_ref *save;
u32 save_incrs;
u32 save_thresh;
+ u32 save_slots;
struct nvmap_handle_ref *restore;
u32 *restore_virt;
diff --git a/drivers/video/tegra/host/nvhost_intr.c b/drivers/video/tegra/host/nvhost_intr.c
index 753c18456198..30ef7d2b8bc6 100644
--- a/drivers/video/tegra/host/nvhost_intr.c
+++ b/drivers/video/tegra/host/nvhost_intr.c
@@ -145,7 +145,7 @@ static void action_ctxsave(struct nvhost_waitlist *waiter)
struct nvhost_hwctx *hwctx = waiter->data;
struct nvhost_channel *channel = hwctx->channel;
- if (channel->ctxhandler.save_service)
+ if (channel->ctxhandler.save_service && !hwctx->timeout->has_timedout)
channel->ctxhandler.save_service(hwctx);
channel->ctxhandler.put(hwctx);
}
diff --git a/drivers/video/tegra/host/nvhost_syncpt.c b/drivers/video/tegra/host/nvhost_syncpt.c
index 6236dedf5f88..3d2ec61e07a0 100644
--- a/drivers/video/tegra/host/nvhost_syncpt.c
+++ b/drivers/video/tegra/host/nvhost_syncpt.c
@@ -115,9 +115,14 @@ int nvhost_syncpt_wait_timeout(struct nvhost_syncpt *sp, u32 id,
if (value)
*value = 0;
+
BUG_ON(!syncpt_op(sp).update_min);
- if (!nvhost_syncpt_check_max(sp, id, thresh))
+ if (!nvhost_syncpt_check_max(sp, id, thresh)) {
+ WARN(1, "wait %d (%s) for (%d) wouldn't be met (max %d)\n",
+ id, syncpt_op(sp).name(sp, id), thresh,
+ nvhost_syncpt_read_max(sp, id));
return -EINVAL;
+ }
/* first check cache */
if (nvhost_syncpt_min_cmp(sp, id, thresh)) {
diff --git a/drivers/video/tegra/host/t20/3dctx_t20.c b/drivers/video/tegra/host/t20/3dctx_t20.c
index dadfbed3434a..7ad7166b2d3a 100644
--- a/drivers/video/tegra/host/t20/3dctx_t20.c
+++ b/drivers/video/tegra/host/t20/3dctx_t20.c
@@ -216,11 +216,12 @@ static void setup_restore_v0(u32 *ptr)
/*** save ***/
/* the same context save command sequence is used for all contexts. */
-static struct nvmap_handle_ref *save_buf = NULL;
-static phys_addr_t save_phys = 0;
-static unsigned int save_size = 0;
-static unsigned int save_incrs = 0;
-static unsigned int save_thresh = 0;
+static struct nvmap_handle_ref *save_buf;
+static phys_addr_t save_phys;
+static unsigned int save_size;
+static unsigned int save_incrs;
+static unsigned int save_thresh;
+static unsigned int save_slots;
static void __init setup_save_regs(const struct ctx_saver *saver,
struct save_info *info,
@@ -648,6 +649,7 @@ static struct nvhost_hwctx *ctx3d_alloc_common(struct nvhost_channel *ch,
ctx->save = save_buf;
ctx->save_incrs = save_incrs;
ctx->save_thresh = save_thresh;
+ ctx->save_slots = save_slots;
ctx->restore_phys = nvmap_pin(nvmap, ctx->restore);
ctx->restore_size = restore_size;
ctx->restore_incrs = restore_incrs;
@@ -769,6 +771,15 @@ int __init t20_nvhost_3dctx_handler_init(struct nvhost_hwctx_handler *h)
return err;
}
+ save_slots = 1; /* save_push_v0() */
+ if (s_is_v1) {
+ save_slots = 6; /* save_push_v1() */
+ if (register_sets == 2)
+ save_slots += 2;
+ if (s_war_insert_syncpoints)
+ save_slots += 1;
+ }
+
save_ptr = nvmap_mmap(save_buf);
if (!save_ptr) {
nvmap_free(nvmap, save_buf);
diff --git a/drivers/video/tegra/host/t20/cdma_t20.c b/drivers/video/tegra/host/t20/cdma_t20.c
index eaba1c78af92..69c3039357a8 100644
--- a/drivers/video/tegra/host/t20/cdma_t20.c
+++ b/drivers/video/tegra/host/t20/cdma_t20.c
@@ -25,6 +25,9 @@
#include "../dev.h"
#include "hardware_t20.h"
+#include "syncpt_t20.h"
+
+static void t20_cdma_timeout_handler(struct work_struct *work);
/*
* push_buffer
@@ -155,6 +158,266 @@ static u32 t20_push_buffer_putptr(struct push_buffer *pb)
return pb->phys + pb->cur;
}
+/*
+ * The syncpt incr buffer is filled with methods to increment syncpts, which
+ * is later GATHER-ed into the mainline PB. It's used when a timed out context
+ * is interleaved with other work, so needs to inline the syncpt increments
+ * to maintain the count (but otherwise does no work).
+ */
+
+/**
+ * Init timeout and syncpt incr buffer resources
+ */
+static int t20_cdma_timeout_init(struct nvhost_cdma *cdma,
+ u32 syncpt_id)
+{
+ struct nvhost_master *dev = cdma_to_dev(cdma);
+ struct nvmap_client *nvmap = cdma_to_nvmap(cdma);
+ struct syncpt_buffer *sb = &cdma->syncpt_buffer;
+ struct nvhost_channel *ch = cdma_to_channel(cdma);
+ u32 i = 0;
+
+ if (syncpt_id == NVSYNCPT_INVALID)
+ return -EINVAL;
+
+ /* allocate and map syncpt incr memory */
+ sb->mem = nvmap_alloc(nvmap,
+ (SYNCPT_INCR_BUFFER_SIZE_WORDS * sizeof(u32)), 32,
+ NVMAP_HANDLE_WRITE_COMBINE);
+ if (IS_ERR_OR_NULL(sb->mem)) {
+ sb->mem = NULL;
+ goto fail;
+ }
+ sb->mapped = nvmap_mmap(sb->mem);
+ if (sb->mapped == NULL)
+ goto fail;
+
+ /* pin syncpt buffer and get physical address */
+ sb->phys = nvmap_pin(nvmap, sb->mem);
+ if (sb->phys >= 0xfffff000) {
+ sb->phys = 0;
+ goto fail;
+ }
+
+ dev_dbg(&dev->pdev->dev, "%s: SYNCPT_INCR buffer at 0x%x\n",
+ __func__, sb->phys);
+
+ sb->words_per_incr = (syncpt_id == NVSYNCPT_3D) ? 5 : 3;
+ sb->incr_per_buffer = (SYNCPT_INCR_BUFFER_SIZE_WORDS /
+ sb->words_per_incr);
+
+ /* init buffer with SETCL and INCR_SYNCPT methods */
+ while (i < sb->incr_per_buffer) {
+ sb->mapped[i++] = nvhost_opcode_setclass(NV_HOST1X_CLASS_ID,
+ 0, 0);
+ sb->mapped[i++] = nvhost_opcode_imm_incr_syncpt(
+ NV_CLASS_HOST_SYNCPT_IMMEDIATE,
+ syncpt_id);
+ if (syncpt_id == NVSYNCPT_3D) {
+ /* also contains base increments */
+ sb->mapped[i++] = nvhost_opcode_nonincr(
+ NV_CLASS_HOST_INCR_SYNCPT_BASE,
+ 1);
+ sb->mapped[i++] = nvhost_class_host_incr_syncpt_base(
+ NVWAITBASE_3D, 1);
+ }
+ sb->mapped[i++] = nvhost_opcode_setclass(ch->desc->class,
+ 0, 0);
+ }
+ wmb();
+
+ INIT_DELAYED_WORK(&cdma->timeout.wq, t20_cdma_timeout_handler);
+ cdma->timeout.initialized = true;
+
+ return 0;
+fail:
+ cdma_op(cdma).timeout_destroy(cdma);
+ return -ENOMEM;
+}
+
+/**
+ * Clean up timeout syncpt buffer resources
+ */
+static void t20_cdma_timeout_destroy(struct nvhost_cdma *cdma)
+{
+ struct nvmap_client *nvmap = cdma_to_nvmap(cdma);
+ struct syncpt_buffer *sb = &cdma->syncpt_buffer;
+
+ if (sb->mapped)
+ nvmap_munmap(sb->mem, sb->mapped);
+
+ if (sb->phys != 0)
+ nvmap_unpin(nvmap, sb->mem);
+
+ if (sb->mem)
+ nvmap_free(nvmap, sb->mem);
+
+ sb->mem = NULL;
+ sb->mapped = NULL;
+ sb->phys = 0;
+
+ if (cdma->timeout.initialized)
+ cancel_delayed_work(&cdma->timeout.wq);
+ cdma->timeout.initialized = false;
+}
+
+/**
+ * Increment timedout buffer's syncpt via CPU.
+ */
+static void t20_cdma_timeout_cpu_incr(struct nvhost_cdma *cdma, u32 getptr,
+ u32 syncpt_incrs, u32 nr_slots)
+{
+ struct nvhost_master *dev = cdma_to_dev(cdma);
+ struct push_buffer *pb = &cdma->push_buffer;
+ u32 i, getidx;
+
+ for (i = 0; i < syncpt_incrs; i++)
+ nvhost_syncpt_cpu_incr(&dev->syncpt, cdma->timeout.syncpt_id);
+
+ /* after CPU incr, ensure shadow is up to date */
+ nvhost_syncpt_update_min(&dev->syncpt, cdma->timeout.syncpt_id);
+
+ /* update WAITBASE_3D by same number of incrs */
+ if (cdma->timeout.syncpt_id == NVSYNCPT_3D) {
+ void __iomem *p;
+ p = dev->sync_aperture + HOST1X_SYNC_SYNCPT_BASE_0 +
+ (NVWAITBASE_3D * sizeof(u32));
+ writel(readl(p) + syncpt_incrs, p);
+ }
+
+ /* NOP all the PB slots */
+ getidx = getptr - pb->phys;
+ while (nr_slots--) {
+ u32 *p = (u32 *)((u32)pb->mapped + getidx);
+ *(p++) = NVHOST_OPCODE_NOOP;
+ *(p++) = NVHOST_OPCODE_NOOP;
+ dev_dbg(&dev->pdev->dev, "%s: NOP at 0x%x\n",
+ __func__, pb->phys + getidx);
+ getidx = (getidx + 8) & (PUSH_BUFFER_SIZE - 1);
+ }
+ wmb();
+}
+
+/**
+ * This routine is called at the point we transition back into a timed
+ * ctx. The syncpts are incremented via pushbuffer with a flag indicating
+ * whether there's a CTXSAVE that should be still executed (for the
+ * preceding HW ctx).
+ */
+static void t20_cdma_timeout_pb_incr(struct nvhost_cdma *cdma, u32 getptr,
+ u32 syncpt_incrs, u32 nr_slots,
+ bool exec_ctxsave)
+{
+ struct nvhost_master *dev = cdma_to_dev(cdma);
+ struct syncpt_buffer *sb = &cdma->syncpt_buffer;
+ struct push_buffer *pb = &cdma->push_buffer;
+ struct nvhost_userctx_timeout *timeout = cdma->timeout.ctx_timeout;
+ u32 getidx, *p;
+
+ /* should have enough slots to incr to desired count */
+ BUG_ON(syncpt_incrs > (nr_slots * sb->incr_per_buffer));
+
+ getidx = getptr - pb->phys;
+ if (exec_ctxsave) {
+ /* don't disrupt the CTXSAVE of a good/non-timed out ctx */
+ nr_slots -= timeout->hwctx->save_slots;
+ syncpt_incrs -= timeout->hwctx->save_incrs;
+
+ getidx += (timeout->hwctx->save_slots * 8);
+ getidx &= (PUSH_BUFFER_SIZE - 1);
+
+ dev_dbg(&dev->pdev->dev,
+ "%s: exec CTXSAVE of prev ctx (slots %d, incrs %d)\n",
+ __func__, nr_slots, syncpt_incrs);
+ }
+
+ while (syncpt_incrs) {
+ u32 incrs, count;
+
+ /* GATHER count are incrs * number of DWORDs per incr */
+ incrs = min(syncpt_incrs, sb->incr_per_buffer);
+ count = incrs * sb->words_per_incr;
+
+ p = (u32 *)((u32)pb->mapped + getidx);
+ *(p++) = nvhost_opcode_gather(count);
+ *(p++) = sb->phys;
+
+ dev_dbg(&dev->pdev->dev,
+ "%s: GATHER at 0x%x, from 0x%x, dcount = %d\n",
+ __func__,
+ pb->phys + getidx, sb->phys,
+ (incrs * sb->words_per_incr));
+
+ syncpt_incrs -= incrs;
+ getidx = (getidx + 8) & (PUSH_BUFFER_SIZE - 1);
+ nr_slots--;
+ }
+
+ /* NOP remaining slots */
+ while (nr_slots--) {
+ p = (u32 *)((u32)pb->mapped + getidx);
+ *(p++) = NVHOST_OPCODE_NOOP;
+ *(p++) = NVHOST_OPCODE_NOOP;
+ dev_dbg(&dev->pdev->dev, "%s: NOP at 0x%x\n",
+ __func__, pb->phys + getidx);
+ getidx = (getidx + 8) & (PUSH_BUFFER_SIZE - 1);
+ }
+ wmb();
+}
+
+/**
+ * Clear a context switch save for a timed out context that's been
+ * queued up in a non-timed out context.
+ */
+static void t20_cdma_timeout_clear_ctxsave(struct nvhost_cdma *cdma,
+ u32 getptr, u32 nr_slots)
+{
+ struct nvhost_master *dev = cdma_to_dev(cdma);
+ struct syncpt_buffer *sb = &cdma->syncpt_buffer;
+ struct push_buffer *pb = &cdma->push_buffer;
+ struct nvhost_userctx_timeout *timeout = cdma->timeout.ctx_timeout;
+ u32 getidx, *p;
+
+ getidx = getptr - pb->phys;
+ p = (u32 *)((u32)pb->mapped + getidx);
+
+ if (timeout->hwctx) {
+ u32 incrs, slots_to_clear;
+
+ slots_to_clear = timeout->hwctx->save_slots;
+ incrs = timeout->hwctx->save_incrs;
+
+ BUG_ON(slots_to_clear > nr_slots);
+ BUG_ON(incrs > sb->incr_per_buffer);
+
+ dev_dbg(&dev->pdev->dev,
+ "%s: clearing CTXSAVE at 0x%x, for %d slots %d incrs\n",
+ __func__, pb->phys + getidx, slots_to_clear, incrs);
+
+ /* first, GATHER incr for ctxsave */
+ if (incrs) {
+ u32 count = incrs * sb->words_per_incr;
+
+ p = (u32 *)((u32)pb->mapped + getidx);
+ *(p++) = nvhost_opcode_gather(count);
+ *(p++) = sb->phys;
+
+ getidx = (getidx + 8) & (PUSH_BUFFER_SIZE - 1);
+ slots_to_clear--;
+ }
+
+ /* NOP remaining slots */
+ while (slots_to_clear--) {
+ p = (u32 *)((u32)pb->mapped + getidx);
+ *(p++) = NVHOST_OPCODE_NOOP;
+ *(p++) = NVHOST_OPCODE_NOOP;
+ dev_dbg(&dev->pdev->dev, "%s: NOP at 0x%x\n",
+ __func__, pb->phys + getidx);
+ getidx = (getidx + 8) & (PUSH_BUFFER_SIZE - 1);
+ }
+ }
+ wmb();
+}
/**
* Start channel DMA
@@ -167,7 +430,6 @@ static void t20_cdma_start(struct nvhost_cdma *cdma)
return;
BUG_ON(!cdma_pb_op(cdma).putptr);
-
cdma->last_put = cdma_pb_op(cdma).putptr(&cdma->push_buffer);
writel(nvhost_channel_dmactrl(true, false, false),
@@ -190,6 +452,53 @@ static void t20_cdma_start(struct nvhost_cdma *cdma)
}
/**
+ * Similar to t20_cdma_start(), but rather than starting from an idle
+ * state (where DMA GET is set to DMA PUT), on a timeout we restore
+ * DMA GET from an explicit value (so DMA may again be pending).
+ */
+static void t20_cdma_timeout_restart(struct nvhost_cdma *cdma, u32 getptr)
+{
+ struct nvhost_master *dev = cdma_to_dev(cdma);
+ void __iomem *chan_regs = cdma_to_channel(cdma)->aperture;
+
+ if (cdma->running)
+ return;
+
+ BUG_ON(!cdma_pb_op(cdma).putptr);
+ cdma->last_put = cdma_pb_op(cdma).putptr(&cdma->push_buffer);
+
+ writel(nvhost_channel_dmactrl(true, false, false),
+ chan_regs + HOST1X_CHANNEL_DMACTRL);
+
+ /* set base, end pointer (all of memory) */
+ writel(0, chan_regs + HOST1X_CHANNEL_DMASTART);
+ writel(0xFFFFFFFF, chan_regs + HOST1X_CHANNEL_DMAEND);
+
+ /* set GET, by loading the value in PUT (then reset GET) */
+ writel(getptr, chan_regs + HOST1X_CHANNEL_DMAPUT);
+ writel(nvhost_channel_dmactrl(true, true, true),
+ chan_regs + HOST1X_CHANNEL_DMACTRL);
+
+ dev_dbg(&dev->pdev->dev,
+ "%s: DMA GET 0x%x, PUT HW 0x%x / shadow 0x%x\n",
+ __func__,
+ readl(chan_regs + HOST1X_CHANNEL_DMAGET),
+ readl(chan_regs + HOST1X_CHANNEL_DMAPUT),
+ cdma->last_put);
+
+ /* deassert GET reset and set PUT */
+ writel(nvhost_channel_dmactrl(true, false, false),
+ chan_regs + HOST1X_CHANNEL_DMACTRL);
+ writel(cdma->last_put, chan_regs + HOST1X_CHANNEL_DMAPUT);
+
+ /* start the command DMA */
+ writel(nvhost_channel_dmactrl(false, false, false),
+ chan_regs + HOST1X_CHANNEL_DMACTRL);
+
+ cdma->running = true;
+}
+
+/**
* Kick channel DMA into action by writing its PUT offset (if it has changed)
*/
static void t20_cdma_kick(struct nvhost_cdma *cdma)
@@ -235,12 +544,145 @@ void t20_cdma_peek(struct nvhost_cdma *cdma,
out[1] = p[offset + 1];
}
+/**
+ * Stops both channel's command processor and CDMA immediately.
+ * Also, tears down the channel and resets corresponding module.
+ */
+void t20_cdma_timeout_teardown_begin(struct nvhost_cdma *cdma)
+{
+ struct nvhost_master *dev = cdma_to_dev(cdma);
+ struct nvhost_channel *ch = cdma_to_channel(cdma);
+ u32 cmdproc_stop;
+
+ BUG_ON(cdma->torndown);
+
+ dev_dbg(&dev->pdev->dev,
+ "begin channel teardown (channel id %d)\n", ch->chid);
+
+ cmdproc_stop = readl(dev->sync_aperture + HOST1X_SYNC_CMDPROC_STOP);
+ cmdproc_stop = nvhost_sync_cmdproc_stop_chid(cmdproc_stop, ch->chid);
+ writel(cmdproc_stop, dev->sync_aperture + HOST1X_SYNC_CMDPROC_STOP);
+
+ dev_dbg(&dev->pdev->dev,
+ "%s: DMA GET 0x%x, PUT HW 0x%x / shadow 0x%x\n",
+ __func__,
+ readl(ch->aperture + HOST1X_CHANNEL_DMAGET),
+ readl(ch->aperture + HOST1X_CHANNEL_DMAPUT),
+ cdma->last_put);
+
+ writel(nvhost_channel_dmactrl(true, false, false),
+ ch->aperture + HOST1X_CHANNEL_DMACTRL);
+
+ writel(BIT(ch->chid), dev->sync_aperture + HOST1X_SYNC_CH_TEARDOWN);
+ nvhost_module_reset(&ch->mod);
+
+ cdma->running = false;
+ cdma->torndown = true;
+}
+
+void t20_cdma_timeout_teardown_end(struct nvhost_cdma *cdma, u32 getptr)
+{
+ struct nvhost_master *dev = cdma_to_dev(cdma);
+ struct nvhost_channel *ch = cdma_to_channel(cdma);
+ u32 cmdproc_stop;
+
+ BUG_ON(!cdma->torndown || cdma->running);
+
+ dev_dbg(&dev->pdev->dev,
+ "end channel teardown (id %d, DMAGET restart = 0x%x)\n",
+ ch->chid, getptr);
+
+ cmdproc_stop = readl(dev->sync_aperture + HOST1X_SYNC_CMDPROC_STOP);
+ cmdproc_stop = nvhost_sync_cmdproc_run_chid(cmdproc_stop, ch->chid);
+ writel(cmdproc_stop, dev->sync_aperture + HOST1X_SYNC_CMDPROC_STOP);
+
+ cdma->torndown = false;
+ t20_cdma_timeout_restart(cdma, getptr);
+}
+
+/**
+ * If this timeout fires, it indicates the current sync_queue entry has
+ * exceeded its TTL and the userctx should be timed out, with remaining
+ * issued submits cleaned up (future submits return an error).
+ */
+static void t20_cdma_timeout_handler(struct work_struct *work)
+{
+ struct nvhost_cdma *cdma;
+ struct nvhost_master *dev;
+ struct nvhost_syncpt *sp;
+ struct nvhost_channel *ch;
+
+ u32 syncpt_val;
+
+ u32 prev_cmdproc, cmdproc_stop;
+
+ cdma = container_of(to_delayed_work(work), struct nvhost_cdma,
+ timeout.wq);
+ dev = cdma_to_dev(cdma);
+ sp = &dev->syncpt;
+ ch = cdma_to_channel(cdma);
+
+ mutex_lock(&cdma->lock);
+
+ if (!cdma->timeout.ctx_timeout) {
+ dev_dbg(&dev->pdev->dev,
+ "cdma_timeout: expired, but has NULL context\n");
+ mutex_unlock(&cdma->lock);
+ return;
+ }
+
+ /* stop processing to get a clean snapshot */
+ prev_cmdproc = readl(dev->sync_aperture + HOST1X_SYNC_CMDPROC_STOP);
+ cmdproc_stop = nvhost_sync_cmdproc_stop_chid(prev_cmdproc, ch->chid);
+ writel(cmdproc_stop, dev->sync_aperture + HOST1X_SYNC_CMDPROC_STOP);
+
+ dev_dbg(&dev->pdev->dev, "cdma_timeout: cmdproc was 0x%x is 0x%x\n",
+ prev_cmdproc, cmdproc_stop);
+
+ syncpt_val = nvhost_syncpt_update_min(&dev->syncpt,
+ cdma->timeout.syncpt_id);
+
+ /* has buffer actually completed? */
+ if ((s32)(syncpt_val - cdma->timeout.syncpt_val) >= 0) {
+ dev_dbg(&dev->pdev->dev,
+ "cdma_timeout: expired, but buffer had completed\n");
+ /* restore */
+ cmdproc_stop = nvhost_sync_cmdproc_run_chid(prev_cmdproc,
+ ch->chid);
+ writel(cmdproc_stop,
+ dev->sync_aperture + HOST1X_SYNC_CMDPROC_STOP);
+ mutex_unlock(&cdma->lock);
+ return;
+ }
+
+ dev_warn(&dev->pdev->dev,
+ "%s: timeout: %d (%s) ctx 0x%p, HW thresh %d, done %d\n",
+ __func__,
+ cdma->timeout.syncpt_id,
+ syncpt_op(sp).name(sp, cdma->timeout.syncpt_id),
+ cdma->timeout.ctx_timeout,
+ syncpt_val, cdma->timeout.syncpt_val);
+
+ /* stop HW, resetting channel/module */
+ cdma_op(cdma).timeout_teardown_begin(cdma);
+
+ nvhost_cdma_update_sync_queue(cdma, sp, &dev->pdev->dev);
+}
+
int nvhost_init_t20_cdma_support(struct nvhost_master *host)
{
host->op.cdma.start = t20_cdma_start;
host->op.cdma.stop = t20_cdma_stop;
host->op.cdma.kick = t20_cdma_kick;
+ host->op.cdma.timeout_init = t20_cdma_timeout_init;
+ host->op.cdma.timeout_destroy = t20_cdma_timeout_destroy;
+ host->op.cdma.timeout_teardown_begin = t20_cdma_timeout_teardown_begin;
+ host->op.cdma.timeout_teardown_end = t20_cdma_timeout_teardown_end;
+ host->op.cdma.timeout_cpu_incr = t20_cdma_timeout_cpu_incr;
+ host->op.cdma.timeout_pb_incr = t20_cdma_timeout_pb_incr;
+ host->op.cdma.timeout_clear_ctxsave = t20_cdma_timeout_clear_ctxsave;
+
host->sync_queue_size = NVHOST_SYNC_QUEUE_SIZE;
host->op.push_buffer.reset = t20_push_buffer_reset;
diff --git a/drivers/video/tegra/host/t20/channel_t20.c b/drivers/video/tegra/host/t20/channel_t20.c
index fdbf6ba7355d..b45c00421ec9 100644
--- a/drivers/video/tegra/host/t20/channel_t20.c
+++ b/drivers/video/tegra/host/t20/channel_t20.c
@@ -27,6 +27,7 @@
#include "hardware_t20.h"
#include "syncpt_t20.h"
+#include "../dev.h"
#define NVHOST_NUMCHANNELS (NV_HOST1X_CHANNELS - 1)
#define NVHOST_CHANNEL_BASE 0
@@ -42,10 +43,7 @@
#define NVMODMUTEX_DSI (9)
#define NV_FIFO_READ_TIMEOUT 200000
-static void power_2d(struct nvhost_module *mod, enum nvhost_power_action action);
static void power_3d(struct nvhost_module *mod, enum nvhost_power_action action);
-static void power_mpe(struct nvhost_module *mod, enum nvhost_power_action action);
-
static const struct nvhost_channeldesc channelmap[] = {
@@ -74,7 +72,6 @@ static const struct nvhost_channeldesc channelmap[] = {
.waitbases = BIT(NVWAITBASE_2D_0) | BIT(NVWAITBASE_2D_1),
.modulemutexes = BIT(NVMODMUTEX_2D_FULL) | BIT(NVMODMUTEX_2D_SIMPLE) |
BIT(NVMODMUTEX_2D_SB_A) | BIT(NVMODMUTEX_2D_SB_B),
- .power = power_2d,
},
{
/* channel 3 */
@@ -98,7 +95,6 @@ static const struct nvhost_channeldesc channelmap[] = {
BIT(NVSYNCPT_MPE_WR_SAFE),
.waitbases = BIT(NVWAITBASE_MPE),
.class = NV_VIDEO_ENCODE_MPEG_CLASS_ID,
- .power = power_mpe,
.exclusive = true,
.keepalive = true,
},
@@ -138,6 +134,7 @@ static int t20_channel_init(struct nvhost_channel *ch,
struct nvhost_master *dev, int index)
{
ch->dev = dev;
+ ch->chid = index;
ch->desc = channelmap + index;
mutex_init(&ch->reflock);
mutex_init(&ch->submitlock);
@@ -161,6 +158,7 @@ static int t20_channel_submit(struct nvhost_channel *channel,
int nr_unpins,
u32 syncpt_id,
u32 syncpt_incrs,
+ struct nvhost_userctx_timeout *timeout,
u32 *syncpt_value,
bool null_kickoff)
{
@@ -176,6 +174,9 @@ static int t20_channel_submit(struct nvhost_channel *channel,
if (strcmp(channel->mod.name, "gr3d") == 0)
module3d_notify_busy();
+ /* before error checks, return current max */
+ *syncpt_value = nvhost_syncpt_read_max(sp, syncpt_id);
+
/* get submit lock */
err = mutex_lock_interruptible(&channel->submitlock);
if (err) {
@@ -198,11 +199,26 @@ static int t20_channel_submit(struct nvhost_channel *channel,
}
}
+ /* begin a CDMA submit */
+ err = nvhost_cdma_begin(&channel->cdma, timeout);
+ if (err) {
+ mutex_unlock(&channel->submitlock);
+ nvhost_module_idle(&channel->mod);
+ return err;
+ }
+
/* context switch */
if (channel->cur_ctx != hwctx) {
trace_nvhost_channel_context_switch(channel->desc->name,
channel->cur_ctx, hwctx);
hwctx_to_save = channel->cur_ctx;
+ if (hwctx_to_save && hwctx_to_save->timeout &&
+ hwctx_to_save->timeout->has_timedout) {
+ hwctx_to_save = NULL;
+ dev_dbg(&channel->dev->pdev->dev,
+ "%s: skip save of timed out context (0x%p)\n",
+ __func__, channel->cur_ctx->timeout);
+ }
if (hwctx_to_save) {
syncpt_incrs += hwctx_to_save->save_incrs;
hwctx_to_save->valid = true;
@@ -223,9 +239,6 @@ static int t20_channel_submit(struct nvhost_channel *channel,
syncval = nvhost_syncpt_incr_max(sp,
syncpt_id, syncpt_incrs);
- /* begin a CDMA submit */
- nvhost_cdma_begin(&channel->cdma);
-
/* push save buffer (pre-gather setup depends on unit) */
if (hwctx_to_save)
channel->ctxhandler.save_push(&channel->cdma, hwctx_to_save);
@@ -281,7 +294,8 @@ static int t20_channel_submit(struct nvhost_channel *channel,
/* end CDMA submit & stash pinned hMems into sync queue */
nvhost_cdma_end(&channel->cdma, user_nvmap,
- syncpt_id, syncval, unpins, nr_unpins);
+ syncpt_id, syncval, unpins, nr_unpins,
+ timeout);
trace_nvhost_channel_submitted(channel->desc->name,
syncval-syncpt_incrs, syncval);
@@ -308,23 +322,16 @@ static int t20_channel_submit(struct nvhost_channel *channel,
return 0;
}
-static void power_2d(struct nvhost_module *mod, enum nvhost_power_action action)
-{
- /* TODO: [ahatala 2010-06-17] reimplement EPP hang war */
- if (action == NVHOST_POWER_ACTION_OFF) {
- /* TODO: [ahatala 2010-06-17] reset EPP */
- }
-}
-
static void power_3d(struct nvhost_module *mod, enum nvhost_power_action action)
{
struct nvhost_channel *ch = container_of(mod, struct nvhost_channel, mod);
struct nvhost_hwctx *hwctx_to_save;
DECLARE_WAIT_QUEUE_HEAD_ONSTACK(wq);
u32 syncpt_incrs, syncpt_val;
+ int err;
void *ref;
- if (action != NVHOST_POWER_ACTION_OFF)
+ if ((action != NVHOST_POWER_ACTION_OFF) || !mod->can_powergate)
return;
mutex_lock(&ch->submitlock);
@@ -337,6 +344,12 @@ static void power_3d(struct nvhost_module *mod, enum nvhost_power_action action)
if (strcmp(mod->name, "gr3d") == 0)
module3d_notify_busy();
+ err = nvhost_cdma_begin(&ch->cdma, hwctx_to_save->timeout);
+ if (err) {
+ mutex_unlock(&ch->submitlock);
+ return;
+ }
+
hwctx_to_save->valid = true;
ch->ctxhandler.get(hwctx_to_save);
ch->cur_ctx = NULL;
@@ -345,9 +358,9 @@ static void power_3d(struct nvhost_module *mod, enum nvhost_power_action action)
syncpt_val = nvhost_syncpt_incr_max(&ch->dev->syncpt,
NVSYNCPT_3D, syncpt_incrs);
- nvhost_cdma_begin(&ch->cdma);
ch->ctxhandler.save_push(&ch->cdma, hwctx_to_save);
- nvhost_cdma_end(&ch->cdma, ch->dev->nvmap, NVSYNCPT_3D, syncpt_val, NULL, 0);
+ nvhost_cdma_end(&ch->cdma, ch->dev->nvmap, NVSYNCPT_3D, syncpt_val,
+ NULL, 0, hwctx_to_save->timeout);
nvhost_intr_add_action(&ch->dev->intr, NVSYNCPT_3D,
syncpt_val - syncpt_incrs + hwctx_to_save->save_thresh,
@@ -366,13 +379,10 @@ static void power_3d(struct nvhost_module *mod, enum nvhost_power_action action)
mutex_unlock(&ch->submitlock);
}
-static void power_mpe(struct nvhost_module *mod, enum nvhost_power_action action)
-{
-}
-
static int t20_channel_read_3d_reg(
struct nvhost_channel *channel,
struct nvhost_hwctx *hwctx,
+ struct nvhost_userctx_timeout *timeout,
u32 offset,
u32 *value)
{
@@ -414,7 +424,7 @@ static int t20_channel_read_3d_reg(
NVSYNCPT_3D, syncpt_incrs);
/* begin a CDMA submit */
- nvhost_cdma_begin(&channel->cdma);
+ nvhost_cdma_begin(&channel->cdma, timeout);
/* push save buffer (pre-gather setup depends on unit) */
if (hwctx_to_save)
@@ -463,7 +473,8 @@ static int t20_channel_read_3d_reg(
/* end CDMA submit */
nvhost_cdma_end(&channel->cdma, channel->dev->nvmap,
- NVSYNCPT_3D, syncval, NULL, 0);
+ NVSYNCPT_3D, syncval, NULL, 0,
+ timeout);
/*
* schedule a context save interrupt (to drain the host FIFO
diff --git a/drivers/video/tegra/host/t20/hardware_t20.h b/drivers/video/tegra/host/t20/hardware_t20.h
index 1e68bdcde0fa..4245a44c6bc2 100644
--- a/drivers/video/tegra/host/t20/hardware_t20.h
+++ b/drivers/video/tegra/host/t20/hardware_t20.h
@@ -91,6 +91,8 @@ enum {
HOST1X_SYNC_SYNCPT_THRESH_CPU1_INT_STATUS = 0x48,
HOST1X_SYNC_SYNCPT_THRESH_INT_DISABLE = 0x60,
HOST1X_SYNC_SYNCPT_THRESH_INT_ENABLE_CPU0 = 0x68,
+ HOST1X_SYNC_CMDPROC_STOP = 0xac,
+ HOST1X_SYNC_CH_TEARDOWN = 0xb0,
HOST1X_SYNC_USEC_CLK = 0x1a4,
HOST1X_SYNC_CTXSW_TIMEOUT_CFG = 0x1a8,
HOST1X_SYNC_IP_BUSY_TIMEOUT = 0x1bc,
@@ -129,6 +131,20 @@ static inline unsigned int nvhost_sync_mlock_owner_owner_chid(u32 reg)
return (reg >> 8) & 0xf;
}
+static inline unsigned int nvhost_sync_cmdproc_stop_chid(u32 reg, u32 chid)
+{
+ return reg | BIT(chid);
+}
+
+static inline unsigned int nvhost_sync_cmdproc_run_chid(u32 reg, u32 chid)
+{
+ return reg & ~(BIT(chid));
+}
+
+static inline unsigned int nvhost_sync_ch_teardown_chid(u32 reg, u32 chid)
+{
+ return reg | BIT(chid);
+}
/* host class methods */
enum {
@@ -271,4 +287,8 @@ int nvhost_drain_read_fifo(void __iomem *chan_regs,
/* 8 bytes per slot. (This number does not include the final RESTART.) */
#define PUSH_BUFFER_SIZE (NVHOST_GATHER_QUEUE_SIZE * 8)
+/* 4K page containing GATHERed methods that increment channel syncpts
+ * and replace the timed out context's original GATHER slots */
+#define SYNCPT_INCR_BUFFER_SIZE_WORDS (4096 / sizeof(u32))
+
#endif /* __NVHOST_HARDWARE_T20_H */
diff --git a/include/linux/nvhost_ioctl.h b/include/linux/nvhost_ioctl.h
index 6e49827b873c..ef6685ea418f 100644
--- a/include/linux/nvhost_ioctl.h
+++ b/include/linux/nvhost_ioctl.h
@@ -96,6 +96,10 @@ struct nvhost_clk_rate_args {
__u64 rate;
};
+struct nvhost_set_timeout_args {
+ __u32 timeout;
+};
+
#define NVHOST_IOCTL_CHANNEL_FLUSH \
_IOR(NVHOST_IOCTL_MAGIC, 1, struct nvhost_get_param_args)
#define NVHOST_IOCTL_CHANNEL_GET_SYNCPOINTS \
@@ -116,8 +120,12 @@ struct nvhost_clk_rate_args {
_IOR(NVHOST_IOCTL_MAGIC, 9, struct nvhost_clk_rate_args)
#define NVHOST_IOCTL_CHANNEL_SET_CLK_RATE \
_IOW(NVHOST_IOCTL_MAGIC, 10, struct nvhost_clk_rate_args)
+#define NVHOST_IOCTL_CHANNEL_SET_TIMEOUT \
+ _IOW(NVHOST_IOCTL_MAGIC, 11, struct nvhost_set_timeout_args)
+#define NVHOST_IOCTL_CHANNEL_GET_TIMEDOUT \
+ _IOR(NVHOST_IOCTL_MAGIC, 12, struct nvhost_get_param_args)
#define NVHOST_IOCTL_CHANNEL_LAST \
- _IOC_NR(NVHOST_IOCTL_CHANNEL_SET_CLK_RATE)
+ _IOC_NR(NVHOST_IOCTL_CHANNEL_GET_TIMEDOUT)
#define NVHOST_IOCTL_CHANNEL_MAX_ARG_SIZE sizeof(struct nvhost_submit_hdr_ext)
struct nvhost_ctrl_syncpt_read_args {