author     Ken Adams <kadams@nvidia.com>          2013-09-17 12:55:54 -0400
committer  Dan Willemsen <dwillemsen@nvidia.com>  2013-09-27 12:53:49 -0700
commit     d55049b57a338403afe3a0e8d93ee83a9d63007d (patch)
tree       dd82aefcd9924e43d63c25d80e23a832d84b80de
parent     409be5d3c52b2a6cd6a843d91f8fbf63f4d3b42b (diff)
video: tegra: host: module debugger framework
Framework and implementation of a gk20a
debugger/profiler session interface.

Also adds work toward optimized handling of
context patch write sequences. These introduce
cpu map/unmap operations and gpu l2 invalidates;
unless we take care to coalesce them, they occur
*per write*.
Change-Id: I8afc11a6f6782b80996404acbd01bffe9653ebdd
Signed-off-by: Ken Adams <kadams@nvidia.com>
Reviewed-on: http://git-master/r/274416
22 files changed, 2631 insertions, 134 deletions
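
The coalescing described in the message above is implemented in this patch as an
explicit bracket: callers map the patch buffer once with
gr_gk20a_ctx_patch_write_begin(), issue any number of patch writes, then unmap and
perform a single gpu l2 invalidate in gr_gk20a_ctx_patch_write_end(). A minimal
sketch of the caller-side pattern (the helper and register names are taken from the
diff below; the data values are illustrative only):

/* Sketch of a coalesced patch-write sequence. Without the begin/end
 * bracket, each gr_gk20a_ctx_patch_write() call would cpu-map, write,
 * unmap and l2-invalidate on its own -- i.e. once *per write*.
 */
static int patch_two_regs_example(struct gk20a *g,
				  struct channel_ctx_gk20a *ch_ctx)
{
	int err;

	err = gr_gk20a_ctx_patch_write_begin(g, ch_ctx); /* one cpu map */
	if (err)
		return err;

	/* any number of writes against the now-mapped patch buffer */
	gr_gk20a_ctx_patch_write(g, ch_ctx, gr_ds_tga_constraintlogic_r(),
				 0x0 /* illustrative data */, true);
	gr_gk20a_ctx_patch_write(g, ch_ctx, gr_pd_ab_dist_cfg1_r(),
				 0x0 /* illustrative data */, true);

	/* one unmap + one gpu l2 invalidate for the whole sequence */
	return gr_gk20a_ctx_patch_write_end(g, ch_ctx);
}
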
diff --git a/drivers/video/tegra/host/bus_client.c b/drivers/video/tegra/host/bus_client.c index 7e1a4c829445..34e88f20881e 100644 --- a/drivers/video/tegra/host/bus_client.c +++ b/drivers/video/tegra/host/bus_client.c @@ -1,6 +1,4 @@ /* - * drivers/video/tegra/host/bus_client.c - * * Tegra Graphics Host Client Module * * Copyright (c) 2010-2013, NVIDIA Corporation. All rights reserved. @@ -217,7 +215,7 @@ static int nvhost_channelopen(struct inode *inode, struct file *filp) } filp->private_data = priv; priv->ch = ch; - if(nvhost_module_add_client(ch->dev, priv)) + if (nvhost_module_add_client(ch->dev, priv)) goto fail; if (ch->ctxhandler && ch->ctxhandler->alloc) { @@ -1108,9 +1106,10 @@ int nvhost_client_user_init(struct platform_device *dev) struct nvhost_channel *ch = pdata->channel; BUG_ON(!ch); - // reserve 3 minor #s for <dev> and as-<dev> and ctrl-<dev> + /* reserve 4 minor #s for <dev> and as-<dev>, ctrl-<dev> + * and dbg-<dev> */ - err = alloc_chrdev_region(&devno, 0, 3, IFACE_NAME); + err = alloc_chrdev_region(&devno, 0, 4, IFACE_NAME); if (err < 0) { dev_err(&dev->dev, "failed to allocate devno\n"); goto fail; @@ -1135,6 +1134,16 @@ int nvhost_client_user_init(struct platform_device *dev) goto fail; } + if (pdata->dbg_ops) { + ++devno; + pdata->dbg_node = nvhost_client_device_create(dev, + &pdata->dbg_cdev, "dbg-", + devno, pdata->dbg_ops); + if (pdata->dbg_node == NULL) + goto fail; + } + + return 0; fail: return err; diff --git a/drivers/video/tegra/host/bus_client.h b/drivers/video/tegra/host/bus_client.h index 07bc7104d283..db3e228e8eec 100644 --- a/drivers/video/tegra/host/bus_client.h +++ b/drivers/video/tegra/host/bus_client.h @@ -55,6 +55,4 @@ nvhost_client_request_firmware(struct platform_device *dev, int nvhost_client_device_get_resources(struct platform_device *dev); -struct nvhost_hwctx *nvhost_channel_get_file_hwctx(int fd); - #endif diff --git a/drivers/video/tegra/host/dev.h b/drivers/video/tegra/host/dev.h index 77330c3b0d05..107b1beaa0ba 100644 --- a/drivers/video/tegra/host/dev.h +++ b/drivers/video/tegra/host/dev.h @@ -39,7 +39,7 @@ void nvhost_device_list_remove(struct platform_device *pdev); #else /* manually enable and turn it on the mask */ /*#define NVHOST_DEBUG*/ - #define NVHOST_DEFAULT_DBG_MASK (dbg_info) + #define NVHOST_DEFAULT_DBG_MASK (dbg_err|dbg_info) #endif enum nvhost_dbg_categories { @@ -52,6 +52,7 @@ enum nvhost_dbg_categories { dbg_pmu = BIT(6), /* gk20a pmu */ dbg_clk = BIT(7), /* gk20a clk */ dbg_map = BIT(8), /* mem mappings */ + dbg_gpu_dbg = BIT(9), /* gpu debugger */ dbg_mem = BIT(31), /* memory accesses, very verbose */ }; diff --git a/drivers/video/tegra/host/gk20a/Makefile b/drivers/video/tegra/host/gk20a/Makefile index c22d74696389..2d7b9a524c67 100644 --- a/drivers/video/tegra/host/gk20a/Makefile +++ b/drivers/video/tegra/host/gk20a/Makefile @@ -11,6 +11,8 @@ nvhost-gk20a-objs = \ channel_gk20a.o \ cdma_gk20a.o \ debug_gk20a.o \ + dbg_gpu_gk20a.o \ + regops_gk20a.o \ gr_gk20a.o \ kind_gk20a.o \ mm_gk20a.o \ diff --git a/drivers/video/tegra/host/gk20a/channel_gk20a.c b/drivers/video/tegra/host/gk20a/channel_gk20a.c index d509510742be..6c584c448811 100644 --- a/drivers/video/tegra/host/gk20a/channel_gk20a.c +++ b/drivers/video/tegra/host/gk20a/channel_gk20a.c @@ -1495,6 +1495,7 @@ int gk20a_init_channel_support(struct gk20a *g, u32 chid) #if defined(CONFIG_TEGRA_GPU_CYCLE_STATS) mutex_init(&c->cyclestate.cyclestate_buffer_mutex); #endif + mutex_init(&c->dbg_s_lock); return 0; } diff --git 
a/drivers/video/tegra/host/gk20a/channel_gk20a.h b/drivers/video/tegra/host/gk20a/channel_gk20a.h index 5ade025d2a48..dca69aea6f01 100644 --- a/drivers/video/tegra/host/gk20a/channel_gk20a.h +++ b/drivers/video/tegra/host/gk20a/channel_gk20a.h @@ -30,6 +30,7 @@ struct gk20a; struct gr_gk20a; struct mem_mgr; struct mem_handle; +struct dbg_session_gk20a; #include "nvhost_channel.h" #include "nvhost_hwctx.h" @@ -129,6 +130,8 @@ struct channel_gk20a { struct mutex cyclestate_buffer_mutex; } cyclestate; #endif + struct mutex dbg_s_lock; + struct dbg_session_gk20a *dbg_s; }; static inline bool gk20a_channel_as_bound(struct channel_gk20a *ch) diff --git a/drivers/video/tegra/host/gk20a/dbg_gpu_gk20a.c b/drivers/video/tegra/host/gk20a/dbg_gpu_gk20a.c new file mode 100644 index 000000000000..a4744e64e614 --- /dev/null +++ b/drivers/video/tegra/host/gk20a/dbg_gpu_gk20a.c @@ -0,0 +1,368 @@ +/* + * Tegra GK20A GPU Debugger Driver + * + * Copyright (c) 2013, NVIDIA CORPORATION. All rights reserved. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms and conditions of the GNU General Public License, + * version 2, as published by the Free Software Foundation. + * + * This program is distributed in the hope it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for + * more details. + * + * You should have received a copy of the GNU General Public License + * along with this program. If not, see <http://www.gnu.org/licenses/>. + */ + +#include <linux/fs.h> +#include <linux/file.h> +#include <linux/cdev.h> +#include <linux/uaccess.h> +#include <linux/nvhost.h> +#include <linux/nvhost_dbg_gpu_ioctl.h> + +#include "dev.h" +#include "nvhost_hwctx.h" +#include "nvhost_acm.h" +#include "gk20a.h" +#include "gr_gk20a.h" +#include "gk20a_gating_reglist.h" +#include "dbg_gpu_gk20a.h" +#include "regops_gk20a.h" + +struct dbg_gpu_session_ops dbg_gpu_session_ops_gk20a = { + .exec_reg_ops = exec_regops_gk20a +}; + +/* silly allocator - just increment session id */ +static atomic_t session_id = ATOMIC_INIT(0); +static int generate_session_id(void) +{ + return atomic_add_return(1, &session_id); +} + +static int alloc_session(struct dbg_session_gk20a **_dbg_s) +{ + struct dbg_session_gk20a *dbg_s; + *_dbg_s = NULL; + + nvhost_dbg_fn(""); + + dbg_s = kzalloc(sizeof(*dbg_s), GFP_KERNEL); + if (!dbg_s) + return -ENOMEM; + + dbg_s->id = generate_session_id(); + dbg_s->ops = &dbg_gpu_session_ops_gk20a; + *_dbg_s = dbg_s; + return 0; +} + +int gk20a_dbg_gpu_dev_open(struct inode *inode, struct file *filp) +{ + struct dbg_session_gk20a *dbg_session; + struct nvhost_device_data *pdata; + struct platform_device *pdev; + struct device *dev; + + int err; + + pdata = container_of(inode->i_cdev, + struct nvhost_device_data, dbg_cdev); + pdev = pdata->pdev; + dev = &pdev->dev; + + nvhost_dbg(dbg_fn | dbg_gpu_dbg, "dbg session: %s", dev_name(dev)); + + err = alloc_session(&dbg_session); + if (err) + return err; + + filp->private_data = dbg_session; + dbg_session->pdata = pdata; + dbg_session->pdev = pdev; + dbg_session->dev = dev; + + return 0; +} + +static int dbg_unbind_channel_gk20a(struct dbg_session_gk20a *dbg_s) +{ + struct channel_gk20a *ch_gk20a = dbg_s->ch; + struct gk20a *g = dbg_s->ch->g; + + nvhost_dbg_fn(""); + + /* wasn't bound to start with ? 
*/ + if (!ch_gk20a) { + nvhost_dbg(dbg_gpu_dbg | dbg_fn, "not bound already?"); + return -ENODEV; + } + + mutex_lock(&g->dbg_sessions_lock); + mutex_lock(&ch_gk20a->dbg_s_lock); + + if (--g->dbg_sessions == 0) { + /* restore (can) powergate, clk state */ + /* release pending exceptions to fault/be handled as usual */ + /*TBD: ordering of these? */ + g->elcg_enabled = true; + gr_gk20a_init_elcg_mode(g, ELCG_AUTO, ENGINE_GR_GK20A); + gr_gk20a_init_elcg_mode(g, ELCG_AUTO, ENGINE_CE2_GK20A); + + gr_gk20a_blcg_gr_load_gating_prod(g, g->blcg_enabled); + /* ??? gr_gk20a_pg_gr_load_gating_prod(g, true); */ + + gr_gk20a_slcg_gr_load_gating_prod(g, g->slcg_enabled); + gr_gk20a_slcg_perf_load_gating_prod(g, g->slcg_enabled); + + gk20a_pmu_enable_elpg(g); + + nvhost_dbg(dbg_gpu_dbg | dbg_fn, "module idle"); + nvhost_module_idle(dbg_s->pdev); + } + + ch_gk20a->dbg_s = NULL; + dbg_s->ch = NULL; + fput(dbg_s->hwctx_f); + dbg_s->hwctx_f = NULL; + + mutex_unlock(&ch_gk20a->dbg_s_lock); + mutex_unlock(&g->dbg_sessions_lock); + + return 0; +} + +int gk20a_dbg_gpu_dev_release(struct inode *inode, struct file *filp) +{ + struct dbg_session_gk20a *dbg_s = filp->private_data; + + nvhost_dbg(dbg_gpu_dbg | dbg_fn, "%s", dev_name(dbg_s->dev)); + + /* unbind if it was bound */ + if (!dbg_s->ch) + return 0; + dbg_unbind_channel_gk20a(dbg_s); + + kfree(dbg_s); + return 0; +} + +static int dbg_bind_channel_gk20a(struct dbg_session_gk20a *dbg_s, + struct nvhost_dbg_gpu_bind_channel_args *args) +{ + struct file *f; + struct nvhost_hwctx *hwctx; + struct gk20a *g; + struct channel_gk20a *ch_gk20a; + + nvhost_dbg(dbg_fn|dbg_gpu_dbg, "%s fd=%d", + dev_name(dbg_s->dev), args->channel_fd); + + if (args->channel_fd == ~0) + return dbg_unbind_channel_gk20a(dbg_s); + + /* even though get_file_hwctx is doing this it releases it as well */ + /* by holding it here we'll keep it from disappearing while the + * debugger is in session */ + f = fget(args->channel_fd); + if (!f) + return -ENODEV; + + hwctx = nvhost_channel_get_file_hwctx(args->channel_fd); + if (!hwctx) { + nvhost_dbg_fn("no hwctx found for fd"); + fput(f); + return -EINVAL; + } + /* be sure this is actually the right type of hwctx */ + if (hwctx->channel->dev != dbg_s->pdev) { + nvhost_dbg_fn("hwctx module type mismatch"); + fput(f); + return -EINVAL; + } + if (!hwctx->priv) { + nvhost_dbg_fn("no priv"); + fput(f); + return -ENODEV; + } + + ch_gk20a = (struct channel_gk20a *)hwctx->priv; + g = ch_gk20a->g; + nvhost_dbg_fn("%s hwchid=%d", dev_name(dbg_s->dev), ch_gk20a->hw_chid); + + mutex_lock(&g->dbg_sessions_lock); + mutex_lock(&ch_gk20a->dbg_s_lock); + + if (ch_gk20a->dbg_s) { + mutex_unlock(&ch_gk20a->dbg_s_lock); + mutex_unlock(&g->dbg_sessions_lock); + fput(f); + nvhost_dbg_fn("hwctx already in dbg session"); + return -EBUSY; + } + + dbg_s->hwctx_f = f; + dbg_s->ch = ch_gk20a; + ch_gk20a->dbg_s = dbg_s; + + if (g->dbg_sessions++ == 0) { + u32 curr = gk20a_clk_get_rate(g); + + /* save off current powergate, clk state. + * set gpu module's can_powergate = 0. + * set gpu module's clk to max. + * while *a* debug session is active there will be no power or + * clocking state changes allowed from mainline code (but they + * should be saved). + */ + nvhost_module_busy(dbg_s->pdev); + + gr_gk20a_slcg_gr_load_gating_prod(g, false); + gr_gk20a_slcg_perf_load_gating_prod(g, false); + + gr_gk20a_blcg_gr_load_gating_prod(g, false); + /* ??? 
gr_gk20a_pg_gr_load_gating_prod(g, false); */ + /* TBD: would rather not change elcg_enabled here */ + g->elcg_enabled = false; + gr_gk20a_init_elcg_mode(g, ELCG_RUN, ENGINE_GR_GK20A); + gr_gk20a_init_elcg_mode(g, ELCG_RUN, ENGINE_CE2_GK20A); + + gk20a_pmu_disable_elpg(g); + + } + mutex_unlock(&ch_gk20a->dbg_s_lock); + mutex_unlock(&g->dbg_sessions_lock); + return 0; +} + +static int nvhost_ioctl_channel_reg_ops(struct dbg_session_gk20a *dbg_s, + struct nvhost_dbg_gpu_exec_reg_ops_args *args); + +long gk20a_dbg_gpu_dev_ioctl(struct file *filp, unsigned int cmd, + unsigned long arg) +{ + struct dbg_session_gk20a *dbg_s = filp->private_data; + struct gk20a *g = get_gk20a(dbg_s->pdev); + u8 buf[NVHOST_DBG_GPU_IOCTL_MAX_ARG_SIZE]; + int err = 0; + + nvhost_dbg_fn(""); + + if ((_IOC_TYPE(cmd) != NVHOST_DBG_GPU_IOCTL_MAGIC) || + (_IOC_NR(cmd) == 0) || + (_IOC_NR(cmd) > NVHOST_DBG_GPU_IOCTL_LAST)) + return -EFAULT; + + BUG_ON(_IOC_SIZE(cmd) > NVHOST_DBG_GPU_IOCTL_MAX_ARG_SIZE); + + if (_IOC_DIR(cmd) & _IOC_WRITE) { + if (copy_from_user(buf, (void __user *)arg, _IOC_SIZE(cmd))) + return -EFAULT; + } + + switch (cmd) { + case NVHOST_DBG_GPU_IOCTL_BIND_CHANNEL: + err = dbg_bind_channel_gk20a(dbg_s, + (struct nvhost_dbg_gpu_bind_channel_args *)buf); + nvhost_dbg(dbg_gpu_dbg, "ret=%d", err); + break; + + case NVHOST_DBG_GPU_IOCTL_REG_OPS: + err = nvhost_ioctl_channel_reg_ops(dbg_s, + (struct nvhost_dbg_gpu_exec_reg_ops_args *)buf); + nvhost_dbg(dbg_gpu_dbg, "ret=%d", err); + break; + + default: + nvhost_err(dev_from_gk20a(g), + "unrecognized dbg gpu ioctl cmd: 0x%x", + cmd); + err = -ENOTTY; + break; + } + + if ((err == 0) && (_IOC_DIR(cmd) & _IOC_READ)) + err = copy_to_user((void __user *)arg, + buf, _IOC_SIZE(cmd)); + + return err; +} + +/* In order to perform a context relative op the context has + * to be created already... which would imply that the + * context switch mechanism has already been put in place. + * So by the time we perform such an opertation it should always + * be possible to query for the appropriate context offsets, etc. + * + * But note: while the dbg_gpu bind requires the a channel fd with + * a bound hwctx it doesn't require an allocated gr/compute obj + * at that point... so just having the bound hwctx doesn't work + * to guarantee this. 
+ */ +static bool gr_context_info_available(struct dbg_session_gk20a *dbg_s, + struct gr_gk20a *gr) +{ + int err; + + mutex_lock(&gr->ctx_mutex); + err = !gr->ctx_vars.golden_image_initialized; + mutex_unlock(&gr->ctx_mutex); + if (err) + return false; + return true; + +} + +static int nvhost_ioctl_channel_reg_ops(struct dbg_session_gk20a *dbg_s, + struct nvhost_dbg_gpu_exec_reg_ops_args *args) +{ + int err; + struct device *dev = dbg_s->dev; + struct gk20a *g = get_gk20a(dbg_s->pdev); + struct nvhost_dbg_gpu_reg_op *ops; + u64 ops_size = sizeof(ops[0]) * args->num_ops; + + nvhost_dbg_fn("%d ops, total size %llu", args->num_ops, ops_size); + + if (!dbg_s->ops) { + nvhost_err(dev, "can't call reg_ops on an unbound debugger session"); + return -EINVAL; + } + + /* be sure that ctx info is in place */ + if (!gr_context_info_available(dbg_s, &g->gr)) { + nvhost_err(dev, "gr context data not available\n"); + return -ENODEV; + } + + ops = kzalloc(ops_size, GFP_KERNEL); + if (!ops) { + nvhost_err(dev, "Allocating memory failed!"); + return -ENOMEM; + } + + nvhost_dbg_fn("Copying regops from userspace"); + + if (copy_from_user(ops, (void *)(uintptr_t)args->ops, ops_size)) { + dev_err(dev, "copy_from_user failed!"); + return -EFAULT; + } + + err = dbg_s->ops->exec_reg_ops(dbg_s, ops, args->num_ops); + + if (err) { + nvhost_err(dev, "dbg regops failed"); + return err; + } + + nvhost_dbg_fn("Copying result to userspace"); + + if (copy_to_user((void *)(uintptr_t)args->ops, ops, ops_size)) { + dev_err(dev, "copy_to_user failed!"); + return -EFAULT; + } + return 0; +} diff --git a/drivers/video/tegra/host/gk20a/dbg_gpu_gk20a.h b/drivers/video/tegra/host/gk20a/dbg_gpu_gk20a.h new file mode 100644 index 000000000000..48958b3f5eee --- /dev/null +++ b/drivers/video/tegra/host/gk20a/dbg_gpu_gk20a.h @@ -0,0 +1,51 @@ +/* + * Tegra GK20A GPU Debugger Driver + * + * Copyright (c) 2013, NVIDIA CORPORATION. All rights reserved. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms and conditions of the GNU General Public License, + * version 2, as published by the Free Software Foundation. + * + * This program is distributed in the hope it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for + * more details. + * + * You should have received a copy of the GNU General Public License + * along with this program. If not, see <http://www.gnu.org/licenses/>. 
+ */ +#ifndef __DBG_GPU_GK20A_H_ +#define __DBG_GPU_GK20A_H_ + +/* module debug driver interface */ +int gk20a_dbg_gpu_dev_release(struct inode *inode, struct file *filp); +int gk20a_dbg_gpu_dev_open(struct inode *inode, struct file *filp); +long gk20a_dbg_gpu_dev_ioctl(struct file *filp, unsigned int cmd, unsigned long arg); + +struct dbg_gpu_session_ops { + int (*exec_reg_ops)(struct dbg_session_gk20a *dbg_s, + struct nvhost_dbg_gpu_reg_op *ops, + u64 num_ops); +}; + +struct dbg_session_gk20a { + /* dbg session id used for trace/prints */ + int id; + + /* gpu module vagaries */ + struct device *dev; + struct platform_device *pdev; + struct nvhost_device_data *pdata; + + /* bound hwctx and channel */ + struct file *hwctx_f; + struct channel_gk20a *ch; + + /* session operations */ + struct dbg_gpu_session_ops *ops; +}; + +extern struct dbg_gpu_session_ops dbg_gpu_session_ops_gk20a; + +#endif /* __DBG_GPU_GK20A_H_ */ diff --git a/drivers/video/tegra/host/gk20a/gk20a.c b/drivers/video/tegra/host/gk20a/gk20a.c index 8be8f4bd3ff2..f564a151155c 100644 --- a/drivers/video/tegra/host/gk20a/gk20a.c +++ b/drivers/video/tegra/host/gk20a/gk20a.c @@ -49,6 +49,7 @@ #include "hw_sim_gk20a.h" #include "gk20a_scale.h" #include "gr3d/pod_scaling.h" +#include "dbg_gpu_gk20a.h" #include "../../../../../arch/arm/mach-tegra/iomap.h" @@ -89,6 +90,17 @@ const struct file_operations tegra_gk20a_ctrl_ops = { .unlocked_ioctl = gk20a_ctrl_dev_ioctl, }; +const struct file_operations tegra_gk20a_dbg_gpu_ops = { + .owner = THIS_MODULE, + .release = gk20a_dbg_gpu_dev_release, + .open = gk20a_dbg_gpu_dev_open, + .unlocked_ioctl = gk20a_dbg_gpu_dev_ioctl, +#ifdef CONFIG_COMPAT + .compat_ioctl = gk20a_dbg_gpu_dev_ioctl, +#endif + +}; + static inline void sim_writel(struct gk20a *g, u32 r, u32 v) { writel(v, g->sim.regs+r); @@ -532,6 +544,8 @@ int nvhost_init_gk20a_support(struct platform_device *dev) goto fail; } + mutex_init(&g->dbg_sessions_lock); + /* nvhost_as alloc_share can be called before gk20a is powered on. It requires mm sw states configured so init mm sw early here. */ err = gk20a_init_mm_setup_sw(g); diff --git a/drivers/video/tegra/host/gk20a/gk20a.h b/drivers/video/tegra/host/gk20a/gk20a.h index 4add3dff4fb4..066b7aaae788 100644 --- a/drivers/video/tegra/host/gk20a/gk20a.h +++ b/drivers/video/tegra/host/gk20a/gk20a.h @@ -95,6 +95,12 @@ struct gk20a { struct dentry *debugfs_timeouts_enabled; struct dentry *debugfs_gr_idle_timeout_default; #endif + + /* held while manipulating # of debug sessions present */ + /* also prevents debug sessions from attaching until released */ + struct mutex dbg_sessions_lock; + int dbg_sessions; /* number attached */ + void (*remove_support)(struct platform_device *); struct notifier_block system_suspend_notifier; @@ -248,6 +254,7 @@ int clk_gk20a_debugfs_init(struct platform_device *dev); #endif extern const struct file_operations tegra_gk20a_ctrl_ops; +extern const struct file_operations tegra_gk20a_dbg_gpu_ops; struct nvhost_hwctx_handler *nvhost_gk20a_alloc_hwctx_handler(u32 syncpt, u32 waitbase, struct nvhost_channel *ch); diff --git a/drivers/video/tegra/host/gk20a/gr_ctx_gk20a.h b/drivers/video/tegra/host/gk20a/gr_ctx_gk20a.h index ab403df84b51..909a166ae9c3 100644 --- a/drivers/video/tegra/host/gk20a/gr_ctx_gk20a.h +++ b/drivers/video/tegra/host/gk20a/gr_ctx_gk20a.h @@ -1,9 +1,7 @@ /* - * drivers/video/tegra/host/gk20a/gr_ctx_gk20a.h - * * GK20A Graphics Context * - * Copyright (c) 2011, NVIDIA CORPORATION. All rights reserved. 
+ * Copyright (c) 2011-2013, NVIDIA CORPORATION. All rights reserved. * * This program is free software; you can redistribute it and/or modify it * under the terms and conditions of the GNU General Public License, @@ -14,9 +12,8 @@ * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for * more details. * - * You should have received a copy of the GNU General Public License along with - * this program; if not, write to the Free Software Foundation, Inc., - * 51 Franklin St - Fifth Floor, Boston, MA 02110-1301 USA. + * You should have received a copy of the GNU General Public License + * along with this program. If not, see <http://www.gnu.org/licenses/>. */ #ifndef __GR_CTX_GK20A_H__ #define __GR_CTX_GK20A_H__ diff --git a/drivers/video/tegra/host/gk20a/gr_gk20a.c b/drivers/video/tegra/host/gk20a/gr_gk20a.c index 45f9392f9d95..b526e31abf5a 100644 --- a/drivers/video/tegra/host/gk20a/gr_gk20a.c +++ b/drivers/video/tegra/host/gk20a/gr_gk20a.c @@ -24,6 +24,7 @@ #include <linux/scatterlist.h> #include <linux/nvmap.h> #include <linux/tegra-soc.h> +#include <linux/nvhost_dbg_gpu_ioctl.h> #include "../dev.h" @@ -49,10 +50,14 @@ #include "chip_support.h" #include "nvhost_memmgr.h" #include "gk20a_gating_reglist.h" +#include "gr_pri_gk20a.h" +#include "regops_gk20a.h" + + static int gr_gk20a_commit_inst(struct channel_gk20a *c, u64 gpu_va); -static int gr_gk20a_ctx_patch_write(struct gk20a *g, struct channel_gk20a *c, - u32 addr, u32 data, u32 patch); +static int gr_gk20a_ctx_patch_write(struct gk20a *g, struct channel_ctx_gk20a *ch_ctx, + u32 addr, u32 data, bool patch); /* global ctx buffer */ static int gr_gk20a_alloc_global_ctx_buffers(struct gk20a *g); @@ -433,35 +438,92 @@ static int gr_gk20a_ctx_wait_ucode(struct gk20a *g, u32 mailbox_id, return 0; } -int gr_gk20a_submit_fecs_method(struct gk20a *g, - u32 mb_id, u32 mb_data, u32 mb_clr, - u32 mtd_data, u32 mtd_adr, u32 *mb_ret, - u32 opc_ok, u32 mb_ok, u32 opc_fail, u32 mb_fail) +/* The following is a less brittle way to call gr_gk20a_submit_fecs_method(...) + * We should replace most, if not all, fecs method calls to this instead. 
*/ +struct fecs_method_op_gk20a { + struct { + u32 addr; + u32 data; + } method; + + struct { + u32 id; + u32 data; + u32 clr; + u32 *ret; + u32 ok; + u32 fail; + } mailbox; + + struct { + u32 ok; + u32 fail; + } cond; + +}; + +int gr_gk20a_submit_fecs_method_op(struct gk20a *g, + struct fecs_method_op_gk20a op) { struct gr_gk20a *gr = &g->gr; int ret; mutex_lock(&gr->fecs_mutex); - if (mb_id != 0) - gk20a_writel(g, gr_fecs_ctxsw_mailbox_r(mb_id), - mb_data); + if (op.mailbox.id != 0) + gk20a_writel(g, gr_fecs_ctxsw_mailbox_r(op.mailbox.id), + op.mailbox.data); gk20a_writel(g, gr_fecs_ctxsw_mailbox_clear_r(0), - gr_fecs_ctxsw_mailbox_clear_value_f(mb_clr)); + gr_fecs_ctxsw_mailbox_clear_value_f(op.mailbox.clr)); - gk20a_writel(g, gr_fecs_method_data_r(), mtd_data); + gk20a_writel(g, gr_fecs_method_data_r(), op.method.data); gk20a_writel(g, gr_fecs_method_push_r(), - gr_fecs_method_push_adr_f(mtd_adr)); + gr_fecs_method_push_adr_f(op.method.addr)); + + /* op.mb.id == 4 cases require waiting for completion on + * for op.mb.id == 0 */ + if (op.mailbox.id == 4) + op.mailbox.id = 0; - ret = gr_gk20a_ctx_wait_ucode(g, 0, mb_ret, - opc_ok, mb_ok, opc_fail, mb_fail); + ret = gr_gk20a_ctx_wait_ucode(g, op.mailbox.id, op.mailbox.ret, + op.cond.ok, op.mailbox.ok, + op.cond.fail, op.mailbox.fail); mutex_unlock(&gr->fecs_mutex); return ret; } +int gr_gk20a_ctrl_ctxsw(struct gk20a *g, u32 fecs_method, u32 *ret) +{ + return gr_gk20a_submit_fecs_method_op(g, + (struct fecs_method_op_gk20a) { + .method.addr = fecs_method, + .method.data = ~0, + .mailbox = { .id = 1, /*sideband?*/ + .data = ~0, .clr = ~0, .ret = ret, + .ok = gr_fecs_ctxsw_mailbox_value_pass_v(), + .fail = gr_fecs_ctxsw_mailbox_value_fail_v(), }, + .cond.ok = GR_IS_UCODE_OP_EQUAL, + .cond.fail = GR_IS_UCODE_OP_EQUAL }); +} + +/* Stop processing (stall) context switches at FECS */ +int gr_gk20a_disable_ctxsw(struct gk20a *g) +{ + nvhost_dbg(dbg_fn | dbg_gpu_dbg, ""); + return gr_gk20a_ctrl_ctxsw(g, gr_fecs_method_push_adr_stop_ctxsw_v(), 0); +} + +/* Start processing (continue) context switches at FECS */ +int gr_gk20a_enable_ctxsw(struct gk20a *g) +{ + nvhost_dbg(dbg_fn | dbg_gpu_dbg, ""); + return gr_gk20a_ctrl_ctxsw(g, gr_fecs_method_push_adr_start_ctxsw_v(), 0); +} + + static int gr_gk20a_commit_inst(struct channel_gk20a *c, u64 gpu_va) { u32 addr_lo; @@ -504,33 +566,92 @@ clean_up: return ret; } -static int gr_gk20a_ctx_patch_write(struct gk20a *g, struct channel_gk20a *c, - u32 addr, u32 data, u32 patch) +/* + * Context state can be written directly or "patched" at times. + * So that code can be used in either situation it is written + * using a series _ctx_patch_write(..., patch) statements. + * However any necessary cpu map/unmap and gpu l2 invalidates + * should be minimized (to avoid doing it once per patch write). + * Before a sequence of these set up with "_ctx_patch_write_begin" + * and close with "_ctx_patch_write_end." + */ +static int gr_gk20a_ctx_patch_write_begin(struct gk20a *g, + struct channel_ctx_gk20a *ch_ctx) +{ + /* being defensive still... */ + if (ch_ctx->patch_ctx.cpu_va) { + nvhost_err(dev_from_gk20a(g), "nested ctx patch begin?"); + return -EBUSY; + } + + ch_ctx->patch_ctx.cpu_va = + nvhost_memmgr_mmap(ch_ctx->patch_ctx.mem.ref); + + if (!ch_ctx->patch_ctx.cpu_va) + return -ENOMEM; + + return 0; +} + +static int gr_gk20a_ctx_patch_write_end(struct gk20a *g, + struct channel_ctx_gk20a *ch_ctx) +{ + /* being defensive still... 
*/ + if (!ch_ctx->patch_ctx.cpu_va) { + nvhost_err(dev_from_gk20a(g), "dangling ctx patch end?"); + return -EINVAL; + } + + nvhost_memmgr_munmap(ch_ctx->patch_ctx.mem.ref, + ch_ctx->patch_ctx.cpu_va); + ch_ctx->patch_ctx.cpu_va = NULL; + + gk20a_mm_l2_invalidate(g); + return 0; +} + +static int gr_gk20a_ctx_patch_write(struct gk20a *g, + struct channel_ctx_gk20a *ch_ctx, + u32 addr, u32 data, bool patch) { - struct channel_ctx_gk20a *ch_ctx; u32 patch_slot = 0; void *patch_ptr = NULL; + bool mapped_here = false; - BUG_ON(patch != 0 && c == NULL); + BUG_ON(patch != 0 && ch_ctx == NULL); if (patch) { - ch_ctx = &c->ch_ctx; - patch_ptr = nvhost_memmgr_mmap(ch_ctx->patch_ctx.mem.ref); - if (!patch_ptr) - return -ENOMEM; + if (!ch_ctx) + return -EINVAL; + /* we added an optimization prolog, epilog + * to get rid of unnecessary maps and l2 invals. + * but be defensive still... */ + if (!ch_ctx->patch_ctx.cpu_va) { + int err; + nvhost_err(dev_from_gk20a(g), + "per-write ctx patch begin?"); + /* yes, gr_gk20a_ctx_patch_smpc causes this one */ + err = gr_gk20a_ctx_patch_write_begin(g, ch_ctx); + if (err) + return err; + mapped_here = true; + } else { + mapped_here = false; + patch_ptr = ch_ctx->patch_ctx.cpu_va; + } patch_slot = ch_ctx->patch_ctx.data_count * 2; mem_wr32(patch_ptr, patch_slot++, addr); mem_wr32(patch_ptr, patch_slot++, data); - nvhost_memmgr_munmap(ch_ctx->patch_ctx.mem.ref, patch_ptr); - gk20a_mm_l2_invalidate(g); - ch_ctx->patch_ctx.data_count++; - } else { + + if (mapped_here) + gr_gk20a_ctx_patch_write_end(g, ch_ctx); + + } else gk20a_writel(g, addr, data); - } return 0; } @@ -545,12 +666,19 @@ static int gr_gk20a_fecs_ctx_bind_channel(struct gk20a *g, nvhost_dbg_info("bind channel %d inst ptr 0x%08x", c->hw_chid, inst_base_ptr); - ret = gr_gk20a_submit_fecs_method(g, 0, 0, 0x30, - gr_fecs_current_ctx_ptr_f(inst_base_ptr) | - gr_fecs_current_ctx_target_vid_mem_f() | - gr_fecs_current_ctx_valid_f(1), - gr_fecs_method_push_adr_bind_pointer_v(), - 0, GR_IS_UCODE_OP_AND, 0x10, GR_IS_UCODE_OP_AND, 0x20); + ret = gr_gk20a_submit_fecs_method_op(g, + (struct fecs_method_op_gk20a) { + .method.addr = gr_fecs_method_push_adr_bind_pointer_v(), + .method.data = (gr_fecs_current_ctx_ptr_f(inst_base_ptr) | + gr_fecs_current_ctx_target_vid_mem_f() | + gr_fecs_current_ctx_valid_f(1)), + .mailbox = { .id = 0, .data = 0, + .clr = 0x30, + .ret = NULL, + .ok = 0x10, + .fail = 0x20, }, + .cond.ok = GR_IS_UCODE_OP_AND, + .cond.fail = GR_IS_UCODE_OP_AND}); if (ret) nvhost_err(dev_from_gk20a(g), "bind channel instance failed"); @@ -621,9 +749,10 @@ clean_up: } static int gr_gk20a_commit_global_cb_manager(struct gk20a *g, - struct channel_gk20a *c, u32 patch) + struct channel_gk20a *c, bool patch) { struct gr_gk20a *gr = &g->gr; + struct channel_ctx_gk20a *ch_ctx = NULL; u32 attrib_offset_in_chunk = 0; u32 alpha_offset_in_chunk = 0; u32 pd_ab_max_output; @@ -633,7 +762,15 @@ static int gr_gk20a_commit_global_cb_manager(struct gk20a *g, nvhost_dbg_fn(""); - gr_gk20a_ctx_patch_write(g, c, gr_ds_tga_constraintlogic_r(), + if (patch) { + int err; + ch_ctx = &c->ch_ctx; + err = gr_gk20a_ctx_patch_write_begin(g, ch_ctx); + if (err) + return err; + } + + gr_gk20a_ctx_patch_write(g, ch_ctx, gr_ds_tga_constraintlogic_r(), gr_ds_tga_constraintlogic_beta_cbsize_f(gr->attrib_cb_default_size) | gr_ds_tga_constraintlogic_alpha_cbsize_f(gr->alpha_cb_default_size), patch); @@ -642,7 +779,7 @@ static int gr_gk20a_commit_global_cb_manager(struct gk20a *g, gr_gpc0_ppc0_cbm_cfg_size_granularity_v()) / 
gr_pd_ab_dist_cfg1_max_output_granularity_v(); - gr_gk20a_ctx_patch_write(g, c, gr_pd_ab_dist_cfg1_r(), + gr_gk20a_ctx_patch_write(g, ch_ctx, gr_pd_ab_dist_cfg1_r(), gr_pd_ab_dist_cfg1_max_output_f(pd_ab_max_output) | gr_pd_ab_dist_cfg1_max_batches_init_f(), patch); @@ -658,7 +795,7 @@ static int gr_gk20a_commit_global_cb_manager(struct gk20a *g, cbm_cfg_size2 = gr->alpha_cb_default_size * gr->pes_tpc_count[ppc_index][gpc_index]; - gr_gk20a_ctx_patch_write(g, c, + gr_gk20a_ctx_patch_write(g, ch_ctx, gr_gpc0_ppc0_cbm_cfg_r() + temp + proj_ppc_in_gpc_stride_v() * ppc_index, gr_gpc0_ppc0_cbm_cfg_timeslice_mode_f(gr->timeslice_mode) | @@ -668,7 +805,7 @@ static int gr_gk20a_commit_global_cb_manager(struct gk20a *g, attrib_offset_in_chunk += gr->attrib_cb_size * gr->pes_tpc_count[ppc_index][gpc_index]; - gr_gk20a_ctx_patch_write(g, c, + gr_gk20a_ctx_patch_write(g, ch_ctx, gr_gpc0_ppc0_cbm_cfg2_r() + temp + proj_ppc_in_gpc_stride_v() * ppc_index, gr_gpc0_ppc0_cbm_cfg2_start_offset_f(alpha_offset_in_chunk) | @@ -679,11 +816,14 @@ static int gr_gk20a_commit_global_cb_manager(struct gk20a *g, } } + if (patch) + gr_gk20a_ctx_patch_write_end(g, ch_ctx); + return 0; } static int gr_gk20a_commit_global_ctx_buffers(struct gk20a *g, - struct channel_gk20a *c, u32 patch) + struct channel_gk20a *c, bool patch) { struct gr_gk20a *gr = &g->gr; struct channel_ctx_gk20a *ch_ctx = &c->ch_ctx; @@ -692,6 +832,12 @@ static int gr_gk20a_commit_global_ctx_buffers(struct gk20a *g, u32 data; nvhost_dbg_fn(""); + if (patch) { + int err; + err = gr_gk20a_ctx_patch_write_begin(g, ch_ctx); + if (err) + return err; + } /* global pagepool buffer */ addr = (u64_lo32(ch_ctx->global_ctx_buffer_va[PAGEPOOL_VA]) >> @@ -708,20 +854,20 @@ static int gr_gk20a_commit_global_ctx_buffers(struct gk20a *g, nvhost_dbg_info("pagepool buffer addr : 0x%016llx, size : %d", addr, size); - gr_gk20a_ctx_patch_write(g, c, gr_scc_pagepool_base_r(), + gr_gk20a_ctx_patch_write(g, ch_ctx, gr_scc_pagepool_base_r(), gr_scc_pagepool_base_addr_39_8_f(addr), patch); - gr_gk20a_ctx_patch_write(g, c, gr_scc_pagepool_r(), + gr_gk20a_ctx_patch_write(g, ch_ctx, gr_scc_pagepool_r(), gr_scc_pagepool_total_pages_f(size) | gr_scc_pagepool_valid_true_f(), patch); - gr_gk20a_ctx_patch_write(g, c, gr_gpcs_gcc_pagepool_base_r(), + gr_gk20a_ctx_patch_write(g, ch_ctx, gr_gpcs_gcc_pagepool_base_r(), gr_gpcs_gcc_pagepool_base_addr_39_8_f(addr), patch); - gr_gk20a_ctx_patch_write(g, c, gr_gpcs_gcc_pagepool_r(), + gr_gk20a_ctx_patch_write(g, ch_ctx, gr_gpcs_gcc_pagepool_r(), gr_gpcs_gcc_pagepool_total_pages_f(size), patch); - gr_gk20a_ctx_patch_write(g, c, gr_pd_pagepool_r(), + gr_gk20a_ctx_patch_write(g, ch_ctx, gr_pd_pagepool_r(), gr_pd_pagepool_total_pages_f(size) | gr_pd_pagepool_valid_true_f(), patch); @@ -736,17 +882,17 @@ static int gr_gk20a_commit_global_ctx_buffers(struct gk20a *g, nvhost_dbg_info("bundle cb addr : 0x%016llx, size : %d", addr, size); - gr_gk20a_ctx_patch_write(g, c, gr_scc_bundle_cb_base_r(), + gr_gk20a_ctx_patch_write(g, ch_ctx, gr_scc_bundle_cb_base_r(), gr_scc_bundle_cb_base_addr_39_8_f(addr), patch); - gr_gk20a_ctx_patch_write(g, c, gr_scc_bundle_cb_size_r(), + gr_gk20a_ctx_patch_write(g, ch_ctx, gr_scc_bundle_cb_size_r(), gr_scc_bundle_cb_size_div_256b_f(size) | gr_scc_bundle_cb_size_valid_true_f(), patch); - gr_gk20a_ctx_patch_write(g, c, gr_gpcs_setup_bundle_cb_base_r(), + gr_gk20a_ctx_patch_write(g, ch_ctx, gr_gpcs_setup_bundle_cb_base_r(), gr_gpcs_setup_bundle_cb_base_addr_39_8_f(addr), patch); - gr_gk20a_ctx_patch_write(g, c, 
gr_gpcs_setup_bundle_cb_size_r(), + gr_gk20a_ctx_patch_write(g, ch_ctx, gr_gpcs_setup_bundle_cb_size_r(), gr_gpcs_setup_bundle_cb_size_div_256b_f(size) | gr_gpcs_setup_bundle_cb_size_valid_true_f(), patch); @@ -760,7 +906,7 @@ static int gr_gk20a_commit_global_ctx_buffers(struct gk20a *g, nvhost_dbg_info("bundle cb token limit : %d, state limit : %d", gr->bundle_cb_token_limit, data); - gr_gk20a_ctx_patch_write(g, c, gr_pd_ab_dist_cfg2_r(), + gr_gk20a_ctx_patch_write(g, ch_ctx, gr_pd_ab_dist_cfg2_r(), gr_pd_ab_dist_cfg2_token_limit_f(gr->bundle_cb_token_limit) | gr_pd_ab_dist_cfg2_state_limit_f(data), patch); @@ -772,20 +918,24 @@ static int gr_gk20a_commit_global_ctx_buffers(struct gk20a *g, nvhost_dbg_info("attrib cb addr : 0x%016llx", addr); - gr_gk20a_ctx_patch_write(g, c, gr_gpcs_setup_attrib_cb_base_r(), + gr_gk20a_ctx_patch_write(g, ch_ctx, gr_gpcs_setup_attrib_cb_base_r(), gr_gpcs_setup_attrib_cb_base_addr_39_12_f(addr) | gr_gpcs_setup_attrib_cb_base_valid_true_f(), patch); - gr_gk20a_ctx_patch_write(g, c, gr_gpcs_tpcs_pe_pin_cb_global_base_addr_r(), + gr_gk20a_ctx_patch_write(g, ch_ctx, gr_gpcs_tpcs_pe_pin_cb_global_base_addr_r(), gr_gpcs_tpcs_pe_pin_cb_global_base_addr_v_f(addr) | gr_gpcs_tpcs_pe_pin_cb_global_base_addr_valid_true_f(), patch); + if (patch) + gr_gk20a_ctx_patch_write_end(g, ch_ctx); + return 0; } -static int gr_gk20a_commit_global_timeslice(struct gk20a *g, struct channel_gk20a *c, u32 patch) +static int gr_gk20a_commit_global_timeslice(struct gk20a *g, struct channel_gk20a *c, bool patch) { struct gr_gk20a *gr = &g->gr; + struct channel_ctx_gk20a *ch_ctx = NULL; u32 gpm_pd_cfg; u32 pd_ab_dist_cfg0; u32 ds_debug; @@ -800,6 +950,14 @@ static int gr_gk20a_commit_global_timeslice(struct gk20a *g, struct channel_gk20 ds_debug = gk20a_readl(g, gr_ds_debug_r()); mpc_vtg_debug = gk20a_readl(g, gr_gpcs_tpcs_mpc_vtg_debug_r()); + if (patch) { + int err; + ch_ctx = &c->ch_ctx; + err = gr_gk20a_ctx_patch_write_begin(g, ch_ctx); + if (err) + return err; + } + if (gr->timeslice_mode == gr_gpcs_ppcs_cbm_cfg_timeslice_mode_enable_v()) { pe_vaf = gk20a_readl(g, gr_gpcs_tpcs_pe_vaf_r()); pe_vsc_vpc = gk20a_readl(g, gr_gpcs_tpcs_pes_vsc_vpc_r()); @@ -811,24 +969,27 @@ static int gr_gk20a_commit_global_timeslice(struct gk20a *g, struct channel_gk20 ds_debug = gr_ds_debug_timeslice_mode_enable_f() | ds_debug; mpc_vtg_debug = gr_gpcs_tpcs_mpc_vtg_debug_timeslice_mode_enabled_f() | mpc_vtg_debug; - gr_gk20a_ctx_patch_write(g, c, gr_gpcs_gpm_pd_cfg_r(), gpm_pd_cfg, patch); - gr_gk20a_ctx_patch_write(g, c, gr_gpcs_tpcs_pe_vaf_r(), pe_vaf, patch); - gr_gk20a_ctx_patch_write(g, c, gr_gpcs_tpcs_pes_vsc_vpc_r(), pe_vsc_vpc, patch); - gr_gk20a_ctx_patch_write(g, c, gr_pd_ab_dist_cfg0_r(), pd_ab_dist_cfg0, patch); - gr_gk20a_ctx_patch_write(g, c, gr_ds_debug_r(), ds_debug, patch); - gr_gk20a_ctx_patch_write(g, c, gr_gpcs_tpcs_mpc_vtg_debug_r(), mpc_vtg_debug, patch); + gr_gk20a_ctx_patch_write(g, ch_ctx, gr_gpcs_gpm_pd_cfg_r(), gpm_pd_cfg, patch); + gr_gk20a_ctx_patch_write(g, ch_ctx, gr_gpcs_tpcs_pe_vaf_r(), pe_vaf, patch); + gr_gk20a_ctx_patch_write(g, ch_ctx, gr_gpcs_tpcs_pes_vsc_vpc_r(), pe_vsc_vpc, patch); + gr_gk20a_ctx_patch_write(g, ch_ctx, gr_pd_ab_dist_cfg0_r(), pd_ab_dist_cfg0, patch); + gr_gk20a_ctx_patch_write(g, ch_ctx, gr_ds_debug_r(), ds_debug, patch); + gr_gk20a_ctx_patch_write(g, ch_ctx, gr_gpcs_tpcs_mpc_vtg_debug_r(), mpc_vtg_debug, patch); } else { gpm_pd_cfg = gr_gpcs_gpm_pd_cfg_timeslice_mode_disable_f() | gpm_pd_cfg; pd_ab_dist_cfg0 = 
gr_pd_ab_dist_cfg0_timeslice_enable_dis_f() | pd_ab_dist_cfg0; ds_debug = gr_ds_debug_timeslice_mode_disable_f() | ds_debug; mpc_vtg_debug = gr_gpcs_tpcs_mpc_vtg_debug_timeslice_mode_disabled_f() | mpc_vtg_debug; - gr_gk20a_ctx_patch_write(g, c, gr_gpcs_gpm_pd_cfg_r(), gpm_pd_cfg, patch); - gr_gk20a_ctx_patch_write(g, c, gr_pd_ab_dist_cfg0_r(), pd_ab_dist_cfg0, patch); - gr_gk20a_ctx_patch_write(g, c, gr_ds_debug_r(), ds_debug, patch); - gr_gk20a_ctx_patch_write(g, c, gr_gpcs_tpcs_mpc_vtg_debug_r(), mpc_vtg_debug, patch); + gr_gk20a_ctx_patch_write(g, ch_ctx, gr_gpcs_gpm_pd_cfg_r(), gpm_pd_cfg, patch); + gr_gk20a_ctx_patch_write(g, ch_ctx, gr_pd_ab_dist_cfg0_r(), pd_ab_dist_cfg0, patch); + gr_gk20a_ctx_patch_write(g, ch_ctx, gr_ds_debug_r(), ds_debug, patch); + gr_gk20a_ctx_patch_write(g, ch_ctx, gr_gpcs_tpcs_mpc_vtg_debug_r(), mpc_vtg_debug, patch); } + if (patch) + gr_gk20a_ctx_patch_write_end(g, ch_ctx); + return 0; } @@ -1147,7 +1308,7 @@ static int gr_gk20a_ctx_state_floorsweep(struct gk20a *g) gk20a_writel(g, gr_ds_num_tpc_per_gpc_r(tpc_index), tpc_per_gpc); } - /* grSetupPDMapping stubbed for gk20a */ + /* gr__setup_pd_mapping stubbed for gk20a */ gr_gk20a_setup_rop_mapping(g, gr); gr_gk20a_setup_alpha_beta_tables(g, gr); @@ -1192,13 +1353,22 @@ static int gr_gk20a_fecs_ctx_image_save(struct channel_gk20a *c, u32 save_type) u64_lo32(sg_phys(c->inst_block.mem.sgt->sgl) >> ram_in_base_shift_v()); + nvhost_dbg_fn(""); - ret = gr_gk20a_submit_fecs_method(g, 0, 0, 3, - gr_fecs_current_ctx_ptr_f(inst_base_ptr) | - gr_fecs_current_ctx_target_vid_mem_f() | - gr_fecs_current_ctx_valid_f(1), save_type, 0, - GR_IS_UCODE_OP_AND, 1, GR_IS_UCODE_OP_AND, 2); + ret = gr_gk20a_submit_fecs_method_op(g, + (struct fecs_method_op_gk20a) { + .method.addr = save_type, + .method.data = (gr_fecs_current_ctx_ptr_f(inst_base_ptr) | + gr_fecs_current_ctx_target_vid_mem_f() | + gr_fecs_current_ctx_valid_f(1)), + .mailbox = {.id = 0, .data = 0, .clr = 3, .ret = NULL, + .ok = 1, .fail = 2, + }, + .cond.ok = GR_IS_UCODE_OP_AND, + .cond.fail = GR_IS_UCODE_OP_AND, + }); + if (ret) nvhost_err(dev_from_gk20a(g), "save context image failed"); @@ -1234,7 +1404,7 @@ static int gr_gk20a_init_golden_ctx_image(struct gk20a *g, if (err) goto clean_up; - err = gr_gk20a_commit_global_ctx_buffers(g, c, 0); + err = gr_gk20a_commit_global_ctx_buffers(g, c, false); if (err) goto clean_up; @@ -1367,13 +1537,22 @@ static int gr_gk20a_load_golden_ctx_image(struct gk20a *g, u64_lo32(sg_phys(c->inst_block.mem.sgt->sgl) >> ram_in_base_shift_v()); - ret = gr_gk20a_submit_fecs_method(g, 0, 0, ~0, - gr_fecs_current_ctx_ptr_f(inst_base_ptr) | - gr_fecs_current_ctx_target_vid_mem_f() | - gr_fecs_current_ctx_valid_f(1), - gr_fecs_method_push_adr_restore_golden_v(), 0, - GR_IS_UCODE_OP_EQUAL, gr_fecs_ctxsw_mailbox_value_pass_v(), - GR_IS_UCODE_OP_SKIP, 0); + ret = gr_gk20a_submit_fecs_method_op(g, + (struct fecs_method_op_gk20a) { + .method.data = + (gr_fecs_current_ctx_ptr_f(inst_base_ptr) | + gr_fecs_current_ctx_target_vid_mem_f() | + gr_fecs_current_ctx_valid_f(1)), + .method.addr = + gr_fecs_method_push_adr_restore_golden_v(), + .mailbox = { + .id = 0, .data = 0, + .clr = ~0, .ret = NULL, + .ok = gr_fecs_ctxsw_mailbox_value_pass_v(), + .fail = 0}, + .cond.ok = GR_IS_UCODE_OP_EQUAL, + .cond.fail = GR_IS_UCODE_OP_SKIP}); + if (ret) nvhost_err(dev_from_gk20a(g), "restore context image failed"); @@ -1440,33 +1619,34 @@ static int gr_gk20a_init_ctx_state(struct gk20a *g, struct gr_gk20a *gr) u32 zcull_ctx_image_size = 0; u32 pm_ctx_image_size = 
0; u32 ret; + struct fecs_method_op_gk20a op = { + .mailbox = { .id = 0, .data = 0, + .clr = ~0, .ok = 0, .fail = 0}, + .method.data = 0, + .cond.ok = GR_IS_UCODE_OP_NOT_EQUAL, + .cond.fail = GR_IS_UCODE_OP_SKIP, + }; nvhost_dbg_fn(""); - - ret = gr_gk20a_submit_fecs_method(g, 0, 0, ~0, 0, - gr_fecs_method_push_adr_discover_image_size_v(), - &golden_ctx_image_size, - GR_IS_UCODE_OP_NOT_EQUAL, 0, GR_IS_UCODE_OP_SKIP, 0); + op.method.addr = gr_fecs_method_push_adr_discover_image_size_v(); + op.mailbox.ret = &golden_ctx_image_size; + ret = gr_gk20a_submit_fecs_method_op(g, op); if (ret) { nvhost_err(dev_from_gk20a(g), "query golden image size failed"); return ret; } - - ret = gr_gk20a_submit_fecs_method(g, 0, 0, ~0, 0, - gr_fecs_method_push_adr_discover_zcull_image_size_v(), - &zcull_ctx_image_size, - GR_IS_UCODE_OP_NOT_EQUAL, 0, GR_IS_UCODE_OP_SKIP, 0); + op.method.addr = gr_fecs_method_push_adr_discover_zcull_image_size_v(); + op.mailbox.ret = &zcull_ctx_image_size; + ret = gr_gk20a_submit_fecs_method_op(g, op); if (ret) { nvhost_err(dev_from_gk20a(g), "query zcull ctx image size failed"); return ret; } - - ret = gr_gk20a_submit_fecs_method(g, 0, 0, ~0, 0, - gr_fecs_method_push_adr_discover_pm_image_size_v(), - &pm_ctx_image_size, - GR_IS_UCODE_OP_NOT_EQUAL, 0, GR_IS_UCODE_OP_SKIP, 0); + op.method.addr = gr_fecs_method_push_adr_discover_pm_image_size_v(); + op.mailbox.ret = &pm_ctx_image_size; + ret = gr_gk20a_submit_fecs_method_op(g, op); if (ret) { nvhost_err(dev_from_gk20a(g), "query pm ctx image size failed"); @@ -1943,10 +2123,10 @@ int gk20a_alloc_obj_ctx(struct channel_gk20a *c, goto out; } gr_gk20a_elpg_protected_call(g, - gr_gk20a_commit_global_ctx_buffers(g, c, 1)); + gr_gk20a_commit_global_ctx_buffers(g, c, true)); } - /* init gloden image, ELPG enabled after this is done */ + /* init golden image, ELPG enabled after this is done */ err = gr_gk20a_init_golden_ctx_image(g, c); if (err) { nvhost_err(dev_from_gk20a(g), @@ -3527,8 +3707,6 @@ static int gk20a_init_gr_setup_hw(struct gk20a *g) gk20a_writel(g, sw_ctx_load->l[i].addr, sw_ctx_load->l[i].value); - /* TBD: add gr ctx overrides */ - err = gr_gk20a_wait_idle(g, end_jiffies, GR_IDLE_CHECK_DEFAULT); if (err) goto out; @@ -3541,8 +3719,8 @@ static int gk20a_init_gr_setup_hw(struct gk20a *g) gr_fe_go_idle_timeout_count_disabled_f()); /* override a few ctx state registers */ - gr_gk20a_commit_global_cb_manager(g, NULL, 0); - gr_gk20a_commit_global_timeslice(g, NULL, 0); + gr_gk20a_commit_global_cb_manager(g, NULL, false); + gr_gk20a_commit_global_timeslice(g, NULL, false); /* floorsweep anything left */ gr_gk20a_ctx_state_floorsweep(g); @@ -4328,25 +4506,52 @@ clean_up: int gr_gk20a_fecs_get_reglist_img_size(struct gk20a *g, u32 *size) { BUG_ON(size == NULL); - return gr_gk20a_submit_fecs_method(g, 0, 0, ~0, 1, - gr_fecs_method_push_adr_discover_reglist_image_size_v(), - size, GR_IS_UCODE_OP_NOT_EQUAL, 0, GR_IS_UCODE_OP_SKIP, 0); + return gr_gk20a_submit_fecs_method_op(g, + (struct fecs_method_op_gk20a) { + .mailbox.id = 0, + .mailbox.data = 0, + .mailbox.clr = ~0, + .method.data = 1, + .method.addr = gr_fecs_method_push_adr_discover_reglist_image_size_v(), + .mailbox.ret = size, + .cond.ok = GR_IS_UCODE_OP_NOT_EQUAL, + .mailbox.ok = 0, + .cond.fail = GR_IS_UCODE_OP_SKIP, + .mailbox.fail = 0}); } int gr_gk20a_fecs_set_reglist_bind_inst(struct gk20a *g, phys_addr_t addr) { - return gr_gk20a_submit_fecs_method(g, 4, - gr_fecs_current_ctx_ptr_f(addr >> 12) | - gr_fecs_current_ctx_valid_f(1) | gr_fecs_current_ctx_target_vid_mem_f(), - 
~0, 1, gr_fecs_method_push_adr_set_reglist_bind_instance_v(), - 0, GR_IS_UCODE_OP_EQUAL, 1, GR_IS_UCODE_OP_SKIP, 0); + return gr_gk20a_submit_fecs_method_op(g, + (struct fecs_method_op_gk20a){ + .mailbox.id = 4, + .mailbox.data = (gr_fecs_current_ctx_ptr_f(addr >> 12) | + gr_fecs_current_ctx_valid_f(1) | + gr_fecs_current_ctx_target_vid_mem_f()), + .mailbox.clr = ~0, + .method.data = 1, + .method.addr = gr_fecs_method_push_adr_set_reglist_bind_instance_v(), + .mailbox.ret = NULL, + .cond.ok = GR_IS_UCODE_OP_EQUAL, + .mailbox.ok = 1, + .cond.fail = GR_IS_UCODE_OP_SKIP, + .mailbox.fail = 0}); } int gr_gk20a_fecs_set_reglist_virual_addr(struct gk20a *g, u64 pmu_va) { - return gr_gk20a_submit_fecs_method(g, 4, u64_lo32(pmu_va >> 8), - ~0, 1, gr_fecs_method_push_adr_set_reglist_virtual_address_v(), - 0, GR_IS_UCODE_OP_EQUAL, 1, GR_IS_UCODE_OP_SKIP, 0); + return gr_gk20a_submit_fecs_method_op(g, + (struct fecs_method_op_gk20a) { + .mailbox.id = 4, + .mailbox.data = u64_lo32(pmu_va >> 8), + .mailbox.clr = ~0, + .method.data = 1, + .method.addr = gr_fecs_method_push_adr_set_reglist_virtual_address_v(), + .mailbox.ret = NULL, + .cond.ok = GR_IS_UCODE_OP_EQUAL, + .mailbox.ok = 1, + .cond.fail = GR_IS_UCODE_OP_SKIP, + .mailbox.fail = 0}); } int gk20a_gr_suspend(struct gk20a *g) @@ -4381,3 +4586,1212 @@ int gk20a_gr_suspend(struct gk20a *g) nvhost_dbg_fn("done"); return ret; } + +static int gr_gk20a_find_priv_offset_in_buffer(struct gk20a *g, + u32 addr, + bool is_quad, u32 quad, + u32 *context_buffer, + u32 context_buffer_size, + u32 *priv_offset); + +/* This function will decode a priv address and return the partition type and numbers. */ +int gr_gk20a_decode_priv_addr(struct gk20a *g, u32 addr, + int *addr_type, /* enum ctxsw_addr_type */ + u32 *gpc_num, u32 *tpc_num, u32 *ppc_num, u32 *be_num, + u32 *broadcast_flags) +{ + u32 gpc_addr; + u32 ppc_address; + u32 ppc_broadcast_addr; + + nvhost_dbg(dbg_fn | dbg_gpu_dbg, "addr=0x%x", addr); + + /* setup defaults */ + ppc_address = 0; + ppc_broadcast_addr = 0; + *addr_type = CTXSW_ADDR_TYPE_SYS; + *broadcast_flags = PRI_BROADCAST_FLAGS_NONE; + *gpc_num = 0; + *tpc_num = 0; + *ppc_num = 0; + *be_num = 0; + + if (pri_is_gpc_addr(addr)) { + *addr_type = CTXSW_ADDR_TYPE_GPC; + gpc_addr = pri_gpccs_addr_mask(addr); + if (pri_is_gpc_addr_shared(addr)) { + *addr_type = CTXSW_ADDR_TYPE_GPC; + *broadcast_flags |= PRI_BROADCAST_FLAGS_GPC; + } else + *gpc_num = pri_get_gpc_num(addr); + + if (pri_is_tpc_addr(gpc_addr)) { + *addr_type = CTXSW_ADDR_TYPE_TPC; + if (pri_is_tpc_addr_shared(gpc_addr)) { + *broadcast_flags |= PRI_BROADCAST_FLAGS_TPC; + return 0; + } + *tpc_num = pri_get_tpc_num(gpc_addr); + } + return 0; + } else if (pri_is_be_addr(addr)) { + *addr_type = CTXSW_ADDR_TYPE_BE; + if (pri_is_be_addr_shared(addr)) { + *broadcast_flags |= PRI_BROADCAST_FLAGS_BE; + return 0; + } + *be_num = pri_get_be_num(addr); + return 0; + } else { + *addr_type = CTXSW_ADDR_TYPE_SYS; + return 0; + } + /* PPC!?!?!?! */ + + /*NOTREACHED*/ + return -EINVAL; +} + +static int gr_gk20a_split_ppc_broadcast_addr(struct gk20a *g, u32 addr, + u32 gpc_num, + u32 *priv_addr_table, u32 *t) +{ + u32 ppc_num; + + nvhost_dbg(dbg_fn | dbg_gpu_dbg, "addr=0x%x", addr); + + for (ppc_num = 0; ppc_num < g->gr.pe_count_per_gpc; ppc_num++) + priv_addr_table[(*t)++] = pri_ppc_addr(pri_ppccs_addr_mask(addr), + gpc_num, ppc_num); + + return 0; +} + +/* + * The context buffer is indexed using BE broadcast addresses and GPC/TPC + * unicast addresses. 
This function will convert a BE unicast address to a BE + * broadcast address and split a GPC/TPC broadcast address into a table of + * GPC/TPC addresses. The addresses generated by this function can be + * successfully processed by gr_gk20a_find_priv_offset_in_buffer + */ +static int gr_gk20a_create_priv_addr_table(struct gk20a *g, + u32 addr, + u32 *priv_addr_table, + u32 *num_registers) +{ + int addr_type; /*enum ctxsw_addr_type */ + u32 gpc_num, tpc_num, ppc_num, be_num; + u32 broadcast_flags; + u32 t; + int err; + + t = 0; + *num_registers = 0; + + nvhost_dbg(dbg_fn | dbg_gpu_dbg, "addr=0x%x", addr); + + err = gr_gk20a_decode_priv_addr(g, addr, &addr_type, + &gpc_num, &tpc_num, &ppc_num, &be_num, + &broadcast_flags); + nvhost_dbg(dbg_gpu_dbg, "addr_type = %d", addr_type); + if (err) + return err; + + if ((addr_type == CTXSW_ADDR_TYPE_SYS) || + (addr_type == CTXSW_ADDR_TYPE_BE)) { + /* The BE broadcast registers are included in the compressed PRI + * table. Convert a BE unicast address to a broadcast address + * so that we can look up the offset. */ + if ((addr_type == CTXSW_ADDR_TYPE_BE) && + !(broadcast_flags & PRI_BROADCAST_FLAGS_BE)) + priv_addr_table[t++] = pri_be_shared_addr(addr); + else + priv_addr_table[t++] = addr; + + *num_registers = t; + return 0; + } + + /* The GPC/TPC unicast registers are included in the compressed PRI + * tables. Convert a GPC/TPC broadcast address to unicast addresses so + * that we can look up the offsets. */ + if (broadcast_flags & PRI_BROADCAST_FLAGS_GPC) { + for (gpc_num = 0; gpc_num < g->gr.gpc_count; gpc_num++) { + + if (broadcast_flags & PRI_BROADCAST_FLAGS_TPC) + for (tpc_num = 0; + tpc_num < g->gr.gpc_tpc_count[gpc_num]; + tpc_num++) + priv_addr_table[t++] = + pri_tpc_addr(pri_tpccs_addr_mask(addr), + gpc_num, tpc_num); + + else if (broadcast_flags & PRI_BROADCAST_FLAGS_PPC) { + err = gr_gk20a_split_ppc_broadcast_addr(g, addr, gpc_num, + priv_addr_table, &t); + if (err) + return err; + } else + priv_addr_table[t++] = + pri_gpc_addr(pri_gpccs_addr_mask(addr), + gpc_num); + } + } else { + if (broadcast_flags & PRI_BROADCAST_FLAGS_TPC) + for (tpc_num = 0; + tpc_num < g->gr.gpc_tpc_count[gpc_num]; + tpc_num++) + priv_addr_table[t++] = + pri_tpc_addr(pri_tpccs_addr_mask(addr), + gpc_num, tpc_num); + else if (broadcast_flags & PRI_BROADCAST_FLAGS_PPC) + err = gr_gk20a_split_ppc_broadcast_addr(g, addr, gpc_num, + priv_addr_table, &t); + else + priv_addr_table[t++] = addr; + } + + *num_registers = t; + return 0; +} + +int gr_gk20a_get_ctx_buffer_offsets(struct gk20a *g, + u32 addr, + u32 max_offsets, + u32 *offsets, u32 *offset_addrs, + u32 *num_offsets, + bool is_quad, u32 quad) +{ + u32 i; + u32 priv_offset = 0; + u32 *priv_registers; + u32 num_registers = 0; + int err = 0; + u32 potential_offsets = proj_scal_litter_num_gpcs_v() * + proj_scal_litter_num_tpc_per_gpc_v(); + + nvhost_dbg(dbg_fn | dbg_gpu_dbg, "addr=0x%x", addr); + + /* implementation is crossed-up if either of these happen */ + if (max_offsets > potential_offsets) + return -EINVAL; + + if (!g->gr.ctx_vars.golden_image_initialized) + return -ENODEV; + + priv_registers = kzalloc(sizeof(u32) * potential_offsets, GFP_KERNEL); + if (IS_ERR_OR_NULL(priv_registers)) { + nvhost_dbg_fn("failed alloc for potential_offsets=%d", potential_offsets); + err = PTR_ERR(priv_registers); + goto cleanup; + } + memset(offsets, 0, sizeof(u32) * max_offsets); + memset(offset_addrs, 0, sizeof(u32) * max_offsets); + *num_offsets = 0; + + gr_gk20a_create_priv_addr_table(g, addr, &priv_registers[0], 
&num_registers); + + if ((max_offsets > 1) && (num_registers > max_offsets)) { + err = -EINVAL; + goto cleanup; + } + + if ((max_offsets == 1) && (num_registers > 1)) + num_registers = 1; + + if (!g->gr.ctx_vars.local_golden_image) { + nvhost_dbg_fn("no context switch header info to work with"); + err = -EINVAL; + goto cleanup; + } + + for (i = 0; i < num_registers; i++) { + err = gr_gk20a_find_priv_offset_in_buffer(g, + priv_registers[i], + is_quad, quad, + g->gr.ctx_vars.local_golden_image, + g->gr.ctx_vars.golden_image_size, + &priv_offset); + if (err) { + nvhost_dbg_fn("Could not determine priv_offset for addr:0x%x", + addr); /*, grPriRegStr(addr)));*/ + goto cleanup; + } + + offsets[i] = priv_offset; + offset_addrs[i] = priv_registers[i]; + } + + *num_offsets = num_registers; + + cleanup: + + if (!IS_ERR_OR_NULL(priv_registers)) + kfree(priv_registers); + + return err; +} + +/* Setup some register tables. This looks hacky; our + * register/offset functions are just that, functions. + * So they can't be used as initializers... TBD: fix to + * generate consts at least on an as-needed basis. + */ +static const u32 _num_ovr_perf_regs = 17; +static u32 _ovr_perf_regs[17] = { 0, }; +/* Following are the blocks of registers that the ucode + stores in the extended region.*/ +/* == ctxsw_extended_sm_dsm_perf_counter_register_stride_v() ? */ +static const u32 _num_sm_dsm_perf_regs = 5; +/* == ctxsw_extended_sm_dsm_perf_counter_control_register_stride_v() ?*/ +static const u32 _num_sm_dsm_perf_ctrl_regs = 4; +static u32 _sm_dsm_perf_regs[5]; +static u32 _sm_dsm_perf_ctrl_regs[4]; + +static void init_sm_dsm_reg_info(void) +{ + if (_ovr_perf_regs[0] != 0) + return; + + _ovr_perf_regs[0] = gr_pri_gpc0_tpc0_sm_dsm_perf_counter_control_sel0_r(); + _ovr_perf_regs[1] = gr_pri_gpc0_tpc0_sm_dsm_perf_counter_control_sel1_r(); + _ovr_perf_regs[2] = gr_pri_gpc0_tpc0_sm_dsm_perf_counter_control0_r(); + _ovr_perf_regs[3] = gr_pri_gpc0_tpc0_sm_dsm_perf_counter_control5_r(); + _ovr_perf_regs[4] = gr_pri_gpc0_tpc0_sm_dsm_perf_counter_status1_r(); + _ovr_perf_regs[5] = gr_pri_gpc0_tpc0_sm_dsm_perf_counter0_control_r(); + _ovr_perf_regs[6] = gr_pri_gpc0_tpc0_sm_dsm_perf_counter1_control_r(); + _ovr_perf_regs[7] = gr_pri_gpc0_tpc0_sm_dsm_perf_counter2_control_r(); + _ovr_perf_regs[8] = gr_pri_gpc0_tpc0_sm_dsm_perf_counter3_control_r(); + _ovr_perf_regs[9] = gr_pri_gpc0_tpc0_sm_dsm_perf_counter4_control_r(); + _ovr_perf_regs[10] = gr_pri_gpc0_tpc0_sm_dsm_perf_counter5_control_r(); + _ovr_perf_regs[11] = gr_pri_gpc0_tpc0_sm_dsm_perf_counter6_control_r(); + _ovr_perf_regs[12] = gr_pri_gpc0_tpc0_sm_dsm_perf_counter7_control_r(); + _ovr_perf_regs[13] = gr_pri_gpc0_tpc0_sm_dsm_perf_counter4_r(); + _ovr_perf_regs[14] = gr_pri_gpc0_tpc0_sm_dsm_perf_counter5_r(); + _ovr_perf_regs[15] = gr_pri_gpc0_tpc0_sm_dsm_perf_counter6_r(); + _ovr_perf_regs[16] = gr_pri_gpc0_tpc0_sm_dsm_perf_counter7_r(); + + + _sm_dsm_perf_regs[0] = gr_pri_gpc0_tpc0_sm_dsm_perf_counter_status_r(); + _sm_dsm_perf_regs[1] = gr_pri_gpc0_tpc0_sm_dsm_perf_counter0_r(); + _sm_dsm_perf_regs[2] = gr_pri_gpc0_tpc0_sm_dsm_perf_counter1_r(); + _sm_dsm_perf_regs[3] = gr_pri_gpc0_tpc0_sm_dsm_perf_counter2_r(); + _sm_dsm_perf_regs[4] = gr_pri_gpc0_tpc0_sm_dsm_perf_counter3_r(); + + _sm_dsm_perf_ctrl_regs[0] = gr_pri_gpc0_tpc0_sm_dsm_perf_counter_control1_r(); + _sm_dsm_perf_ctrl_regs[1] = gr_pri_gpc0_tpc0_sm_dsm_perf_counter_control2_r(); + _sm_dsm_perf_ctrl_regs[2] = gr_pri_gpc0_tpc0_sm_dsm_perf_counter_control3_r(); + _sm_dsm_perf_ctrl_regs[3] = 
gr_pri_gpc0_tpc0_sm_dsm_perf_counter_control4_r(); + +} + +/* TBD: would like to handle this elsewhere, at a higher level. + * these are currently constructed in a "test-then-write" style + * which makes it impossible to know externally whether a ctx + * write will actually occur. so later we should put a lazy, + * map-and-hold system in the patch write state */ +int gr_gk20a_ctx_patch_smpc(struct gk20a *g, + struct channel_ctx_gk20a *ch_ctx, + u32 addr, u32 data, + u8 *context) +{ + u32 num_gpc = g->gr.gpc_count; + u32 num_tpc; + u32 tpc, gpc, reg; + u32 chk_addr; + u32 vaddr_lo; + u32 vaddr_hi; + u32 tmp; + + init_sm_dsm_reg_info(); + + nvhost_dbg(dbg_fn | dbg_gpu_dbg, "addr=0x%x", addr); + + for (reg = 0; reg < _num_ovr_perf_regs; reg++) { + for (gpc = 0; gpc < num_gpc; gpc++) { + num_tpc = g->gr.gpc_tpc_count[gpc]; + for (tpc = 0; tpc < num_tpc; tpc++) { + chk_addr = ((proj_gpc_stride_v() * gpc) + + (proj_tpc_in_gpc_stride_v() * tpc) + + _ovr_perf_regs[reg]); + if (chk_addr != addr) + continue; + /* reset the patch count from previous + runs,if ucode has already processed + it */ + tmp = mem_rd32(context + + ctxsw_prog_main_image_patch_count_o(), 0); + + if (!tmp) + ch_ctx->patch_ctx.data_count = 0; + + gr_gk20a_ctx_patch_write(g, ch_ctx, + addr, data, true); + + vaddr_lo = u64_lo32(ch_ctx->patch_ctx.gpu_va); + vaddr_hi = u64_hi32(ch_ctx->patch_ctx.gpu_va); + + mem_wr32(context + + ctxsw_prog_main_image_patch_count_o(), + 0, ch_ctx->patch_ctx.data_count); + mem_wr32(context + + ctxsw_prog_main_image_patch_adr_lo_o(), + 0, vaddr_lo); + mem_wr32(context + + ctxsw_prog_main_image_patch_adr_hi_o(), + 0, vaddr_hi); + + /* we're not caching these on cpu side, + but later watch for it */ + + /* the l2 invalidate in the patch_write + * would be too early for this? 
*/ + gk20a_mm_l2_invalidate(g); + return 0; + } + } + } + + return 0; +} + + +void gr_gk20a_access_smpc_reg(struct gk20a *g, u32 quad, u32 offset) +{ + u32 reg; + u32 quad_ctrl; + u32 half_ctrl; + u32 tpc, gpc; + u32 gpc_tpc_addr; + u32 gpc_tpc_stride; + + nvhost_dbg(dbg_fn | dbg_gpu_dbg, "offset=0x%x", offset); + + gpc = pri_get_gpc_num(offset); + gpc_tpc_addr = pri_gpccs_addr_mask(offset); + tpc = pri_get_tpc_num(gpc_tpc_addr); + + quad_ctrl = quad & 0x1; /* first bit tells us quad */ + half_ctrl = (quad >> 1) & 0x1; /* second bit tells us half */ + + gpc_tpc_stride = gpc * proj_gpc_stride_v() + + tpc * proj_tpc_in_gpc_stride_v(); + gpc_tpc_addr = gr_gpc0_tpc0_sm_halfctl_ctrl_r() + gpc_tpc_stride; + + reg = gk20a_readl(g, gpc_tpc_addr); + reg = set_field(reg, + gr_gpcs_tpcs_sm_halfctl_ctrl_sctl_read_quad_ctl_m(), + quad_ctrl); + + gk20a_writel(g, gpc_tpc_addr, reg); + + gpc_tpc_addr = gr_gpc0_tpc0_sm_debug_sfe_control_r() + gpc_tpc_stride; + reg = gk20a_readl(g, gpc_tpc_addr); + reg = set_field(reg, + gr_gpcs_tpcs_sm_debug_sfe_control_read_half_ctl_m(), + half_ctrl); + gk20a_writel(g, gpc_tpc_addr, reg); +} + +#define ILLEGAL_ID (~0) + +static inline bool check_main_image_header_magic(void *context) +{ + u32 magic = mem_rd32(context + + ctxsw_prog_main_image_magic_value_o(), 0); + nvhost_dbg(dbg_gpu_dbg, "main image magic=0x%x", magic); + return magic == ctxsw_prog_main_image_magic_value_v_value_v(); +} +static inline bool check_local_header_magic(void *context) +{ + u32 magic = mem_rd32(context + + ctxsw_prog_local_magic_value_o(), 0); + nvhost_dbg(dbg_gpu_dbg, "local magic=0x%x", magic); + return magic == ctxsw_prog_local_magic_value_v_value_v(); + +} + +/* most likely dupe of ctxsw_gpccs_header__size_1_v() */ +static inline int ctxsw_prog_ucode_header_size_in_bytes(void) +{ + return 256; +} + +static int gr_gk20a_find_priv_offset_in_ext_buffer(struct gk20a *g, + u32 addr, + bool is_quad, u32 quad, + u32 *context_buffer, + u32 context_buffer_size, + u32 *priv_offset) +{ + u32 i, data32; + u32 gpc_num, tpc_num; + u32 num_gpcs, num_tpcs; + u32 chk_addr; + u32 ext_priv_offset, ext_priv_size; + void *context; + u32 offset_to_segment, offset_to_segment_end; + u32 sm_dsm_perf_reg_id = ILLEGAL_ID; + u32 sm_dsm_perf_ctrl_reg_id = ILLEGAL_ID; + u32 num_ext_gpccs_ext_buffer_segments; + u32 inter_seg_offset; + u32 tpc_gpc_mask = (proj_tpc_in_gpc_stride_v() - 1); + u32 max_tpc_count; + u32 *sm_dsm_perf_ctrl_regs = NULL; + u32 num_sm_dsm_perf_ctrl_regs = 0; + u32 *sm_dsm_perf_regs = NULL; + u32 num_sm_dsm_perf_regs = 0; + u32 buffer_segments_size = 0; + u32 marker_size = 0; + u32 control_register_stride = 0; + u32 perf_register_stride = 0; + + /* Only have TPC registers in extended region, so if not a TPC reg, + then return error so caller can look elsewhere. 
*/ + if (pri_is_gpc_addr(addr)) { + u32 gpc_addr = 0; + gpc_num = pri_get_gpc_num(addr); + gpc_addr = pri_gpccs_addr_mask(addr); + if (pri_is_tpc_addr(gpc_addr)) + tpc_num = pri_get_tpc_num(gpc_addr); + else + return -EINVAL; + + nvhost_dbg_info(" gpc = %d tpc = %d", + gpc_num, tpc_num); + } else + return -EINVAL; + + buffer_segments_size = ctxsw_prog_extended_buffer_segments_size_in_bytes_v(); + /* note below is in words/num_registers */ + marker_size = ctxsw_prog_extended_marker_size_in_bytes_v() >> 2; + + context = context_buffer; + /* sanity check main header */ + if (!check_main_image_header_magic(context)) { + nvhost_err(dev_from_gk20a(g), + "Invalid main header: magic value"); + return -EINVAL; + } + num_gpcs = mem_rd32(context + ctxsw_prog_main_image_num_gpcs_o(), 0); + if (gpc_num >= num_gpcs) { + nvhost_err(dev_from_gk20a(g), + "GPC 0x%08x is greater than total count 0x%08x!\n", + gpc_num, num_gpcs); + return -EINVAL; + } + + data32 = mem_rd32(context + ctxsw_prog_main_extended_buffer_ctl_o(), 0); + ext_priv_size = ctxsw_prog_main_extended_buffer_ctl_size_v(data32); + if (0 == ext_priv_size) { + nvhost_dbg_info(" No extended memory in context buffer"); + return -EINVAL; + } + ext_priv_offset = ctxsw_prog_main_extended_buffer_ctl_offset_v(data32); + + offset_to_segment = ext_priv_offset * ctxsw_prog_ucode_header_size_in_bytes(); + offset_to_segment_end = offset_to_segment + + (ext_priv_size * buffer_segments_size); + + /* check local header magic */ + context += ctxsw_prog_ucode_header_size_in_bytes(); + if (!check_local_header_magic(context)) { + nvhost_err(dev_from_gk20a(g), + "Invalid local header: magic value\n"); + return -EINVAL; + } + + /* + * See if the incoming register address is in the first table of + * registers. We check this by decoding only the TPC addr portion. + * If we get a hit on the TPC bit, we then double check the address + * by computing it from the base gpc/tpc strides. Then make sure + * it is a real match. + */ + num_sm_dsm_perf_regs = _num_sm_dsm_perf_regs; + sm_dsm_perf_regs = _sm_dsm_perf_regs; + perf_register_stride = ctxsw_prog_extended_sm_dsm_perf_counter_register_stride_v(); + + init_sm_dsm_reg_info(); + + for (i = 0; i < num_sm_dsm_perf_regs; i++) { + if ((addr & tpc_gpc_mask) == (sm_dsm_perf_regs[i] & tpc_gpc_mask)) { + sm_dsm_perf_reg_id = i; + + nvhost_dbg_info("register match: 0x%08x", + sm_dsm_perf_regs[i]); + + chk_addr = (proj_gpc_base_v() + + (proj_gpc_stride_v() * gpc_num) + + proj_tpc_in_gpc_base_v() + + (proj_tpc_in_gpc_stride_v() * tpc_num) + + (sm_dsm_perf_regs[sm_dsm_perf_reg_id] & tpc_gpc_mask)); + + if (chk_addr != addr) { + nvhost_err(dev_from_gk20a(g), + "Oops, addr mismatch: 0x%08x != 0x%08x\n", + addr, chk_addr); + return -EINVAL; + } + break; + } + } + + /* Didn't find reg in supported group 1.
+ * so try the second group now */ + num_sm_dsm_perf_ctrl_regs = _num_sm_dsm_perf_ctrl_regs; + sm_dsm_perf_ctrl_regs = _sm_dsm_perf_ctrl_regs; + control_register_stride = ctxsw_prog_extended_sm_dsm_perf_counter_control_register_stride_v(); + + if (ILLEGAL_ID == sm_dsm_perf_reg_id) { + for (i = 0; i < num_sm_dsm_perf_ctrl_regs; i++) { + if ((addr & tpc_gpc_mask) == + (sm_dsm_perf_ctrl_regs[i] & tpc_gpc_mask)) { + sm_dsm_perf_ctrl_reg_id = i; + + nvhost_dbg_info("register match: 0x%08x", + sm_dsm_perf_ctrl_regs[i]); + + chk_addr = (proj_gpc_base_v() + + (proj_gpc_stride_v() * gpc_num) + + proj_tpc_in_gpc_base_v() + + (proj_tpc_in_gpc_stride_v() * tpc_num) + + (sm_dsm_perf_ctrl_regs[sm_dsm_perf_ctrl_reg_id] & + tpc_gpc_mask)); + + if (chk_addr != addr) { + nvhost_err(dev_from_gk20a(g), + "Oops, addr mismatch: 0x%08x != 0x%08x\n", + addr, chk_addr); + return -EINVAL; + + } + + break; + } + } + } + + if ((ILLEGAL_ID == sm_dsm_perf_ctrl_reg_id) && + (ILLEGAL_ID == sm_dsm_perf_reg_id)) + return -EINVAL; + + /* Skip the FECS extended header, nothing there for us now. */ + offset_to_segment += buffer_segments_size; + + /* skip through the GPCCS extended headers until we get to the data for + * our GPC. The size of each gpc extended segment is enough to hold the + * max tpc count for the gpcs, in 256B chunks. + */ + + max_tpc_count = proj_scal_litter_num_tpc_per_gpc_v(); + + num_ext_gpccs_ext_buffer_segments = (u32)((max_tpc_count + 1) / 2); + + offset_to_segment += (num_ext_gpccs_ext_buffer_segments * + buffer_segments_size * gpc_num); + + num_tpcs = g->gr.gpc_tpc_count[gpc_num]; + + /* skip the head marker to start with */ + inter_seg_offset = marker_size; + + if (ILLEGAL_ID != sm_dsm_perf_ctrl_reg_id) { + /* skip over control regs of TPCs before the one we want. + * then skip to the register in this tpc */ + inter_seg_offset = inter_seg_offset + + (tpc_num * control_register_stride) + + sm_dsm_perf_ctrl_reg_id; + } else { + /* skip all the control registers */ + inter_seg_offset = inter_seg_offset + + (num_tpcs * control_register_stride); + + /* skip the marker between control and counter segments */ + inter_seg_offset += marker_size; + + /* skip over counter regs of TPCs before the one we want */ + inter_seg_offset = inter_seg_offset + + (tpc_num * perf_register_stride) * + ctxsw_prog_extended_num_smpc_quadrants_v(); + + /* skip over the register for the quadrants we do not want. + * then skip to the register in this tpc */ + inter_seg_offset = inter_seg_offset + + (perf_register_stride * quad) + + sm_dsm_perf_reg_id; + } + + /* set the offset to the segment offset plus the inter segment offset to + * our register */ + offset_to_segment += (inter_seg_offset * 4); + + /* last sanity check: did we somehow compute an offset outside the + * extended buffer? */ + if (offset_to_segment > offset_to_segment_end) { + nvhost_err(dev_from_gk20a(g), + "Overflow ctxsw buffer!
0x%08x > 0x%08x\n", + offset_to_segment, offset_to_segment_end); + return -EINVAL; + } + + *priv_offset = offset_to_segment; + + return 0; +} + + +static int +gr_gk20a_process_context_buffer_priv_segment(struct gk20a *g, + int addr_type,/* enum ctxsw_addr_type */ + u32 pri_addr, + u32 gpc_num, u32 num_tpcs, + u32 num_ppcs, u32 ppc_mask, + u32 *priv_offset) +{ + u32 i; + u32 address, base_address; + u32 sys_offset, gpc_offset, tpc_offset, ppc_offset; + u32 ppc_num, tpc_num, tpc_addr, gpc_addr, ppc_addr; + struct aiv_gk20a *reg; + + nvhost_dbg(dbg_fn | dbg_gpu_dbg, "pri_addr=0x%x", pri_addr); + + if (!g->gr.ctx_vars.valid) + return -EINVAL; + + /* Process the SYS/BE segment. */ + if ((addr_type == CTXSW_ADDR_TYPE_SYS) || + (addr_type == CTXSW_ADDR_TYPE_BE)) { + for (i = 0; i < g->gr.ctx_vars.ctxsw_regs.sys.count; i++) { + reg = &g->gr.ctx_vars.ctxsw_regs.sys.l[i]; + address = reg->addr; + sys_offset = reg->index; + + if (pri_addr == address) { + *priv_offset = sys_offset; + return 0; + } + } + } + + /* Process the TPC segment. */ + if (addr_type == CTXSW_ADDR_TYPE_TPC) { + for (tpc_num = 0; tpc_num < num_tpcs; tpc_num++) { + for (i = 0; i < g->gr.ctx_vars.ctxsw_regs.tpc.count; i++) { + reg = &g->gr.ctx_vars.ctxsw_regs.tpc.l[i]; + address = reg->addr; + tpc_addr = pri_tpccs_addr_mask(address); + base_address = proj_gpc_base_v() + + (gpc_num * proj_gpc_stride_v()) + + proj_tpc_in_gpc_base_v() + + (tpc_num * proj_tpc_in_gpc_stride_v()); + address = base_address + tpc_addr; + /* + * The data for the TPCs is interleaved in the context buffer. + * Example with num_tpcs = 2 + * 0 1 2 3 4 5 6 7 8 9 10 11 ... + * 0-0 1-0 0-1 1-1 0-2 1-2 0-3 1-3 0-4 1-4 0-5 1-5 ... + */ + tpc_offset = (reg->index * num_tpcs) + (tpc_num * 4); + + if (pri_addr == address) { + *priv_offset = tpc_offset; + return 0; + } + } + } + } + + /* Process the PPC segment. */ + if (addr_type == CTXSW_ADDR_TYPE_PPC) { + for (ppc_num = 0; ppc_num < num_ppcs; ppc_num++) { + for (i = 0; i < g->gr.ctx_vars.ctxsw_regs.ppc.count; i++) { + reg = &g->gr.ctx_vars.ctxsw_regs.ppc.l[i]; + address = reg->addr; + ppc_addr = pri_ppccs_addr_mask(address); + base_address = proj_gpc_base_v() + + (gpc_num * proj_gpc_stride_v()) + + proj_ppc_in_gpc_base_v() + + (ppc_num * proj_ppc_in_gpc_stride_v()); + address = base_address + ppc_addr; + /* + * The data for the PPCs is interleaved in the context buffer. + * Example with numPpcs = 2 + * 0 1 2 3 4 5 6 7 8 9 10 11 ... + * 0-0 1-0 0-1 1-1 0-2 1-2 0-3 1-3 0-4 1-4 0-5 1-5 ... + */ + ppc_offset = (reg->index * num_ppcs) + (ppc_num * 4); + + if (pri_addr == address) { + *priv_offset = ppc_offset; + return 0; + } + } + } + } + + + /* Process the GPC segment. 
*/ + if (addr_type == CTXSW_ADDR_TYPE_GPC) { + for (i = 0; i < g->gr.ctx_vars.ctxsw_regs.gpc.count; i++) { + reg = &g->gr.ctx_vars.ctxsw_regs.gpc.l[i]; + + address = reg->addr; + gpc_addr = pri_gpccs_addr_mask(address); + gpc_offset = reg->index; + + base_address = proj_gpc_base_v() + + (gpc_num * proj_gpc_stride_v()); + address = base_address + gpc_addr; + + if (pri_addr == address) { + *priv_offset = gpc_offset; + return 0; + } + } + } + + return -EINVAL; +} + +static int gr_gk20a_determine_ppc_configuration(struct gk20a *g, + void *context, + u32 *num_ppcs, u32 *ppc_mask, + u32 *reg_ppc_count) +{ + u32 data32; + u32 litter_num_pes_per_gpc = proj_scal_litter_num_pes_per_gpc_v(); + + /* + * if there is only 1 PES_PER_GPC, then we put the PES registers + * in the GPC reglist, so we can't error out if ppc.count == 0 + */ + if ((!g->gr.ctx_vars.valid) || + ((g->gr.ctx_vars.ctxsw_regs.ppc.count == 0) && + (litter_num_pes_per_gpc > 1))) + return -EINVAL; + + data32 = mem_rd32(context + ctxsw_prog_local_image_ppc_info_o(), 0); + + *num_ppcs = ctxsw_prog_local_image_ppc_info_num_ppcs_v(data32); + *ppc_mask = ctxsw_prog_local_image_ppc_info_ppc_mask_v(data32); + + *reg_ppc_count = g->gr.ctx_vars.ctxsw_regs.ppc.count; + + return 0; +} + + + +/* + * This function will return the 32 bit offset for a priv register if it is + * present in the context buffer. + */ +static int gr_gk20a_find_priv_offset_in_buffer(struct gk20a *g, + u32 addr, + bool is_quad, u32 quad, + u32 *context_buffer, + u32 context_buffer_size, + u32 *priv_offset) +{ + struct gr_gk20a *gr = &g->gr; + u32 i, data32; + int err; + int addr_type; /*enum ctxsw_addr_type */ + u32 broadcast_flags; + u32 gpc_num, tpc_num, ppc_num, be_num; + u32 num_gpcs, num_tpcs, num_ppcs; + u32 offset; + u32 sys_priv_offset, gpc_priv_offset; + u32 ppc_mask, reg_list_ppc_count; + void *context; + u32 offset_to_segment; + + nvhost_dbg(dbg_fn | dbg_gpu_dbg, "addr=0x%x", addr); + + err = gr_gk20a_decode_priv_addr(g, addr, &addr_type, + &gpc_num, &tpc_num, &ppc_num, &be_num, + &broadcast_flags); + if (err) + return err; + + context = context_buffer; + if (!check_main_image_header_magic(context)) { + nvhost_err(dev_from_gk20a(g), + "Invalid main header: magic value"); + return -EINVAL; + } + num_gpcs = mem_rd32(context + ctxsw_prog_main_image_num_gpcs_o(), 0); + + /* Parse the FECS local header. */ + context += ctxsw_prog_ucode_header_size_in_bytes(); + if (!check_local_header_magic(context)) { + nvhost_err(dev_from_gk20a(g), + "Invalid FECS local header: magic value\n"); + return -EINVAL; + } + data32 = mem_rd32(context + ctxsw_prog_local_priv_register_ctl_o(), 0); + sys_priv_offset = ctxsw_prog_local_priv_register_ctl_offset_v(data32); + + /* If found in Ext buffer, ok. + * If it failed and we expected to find it there (quad offset) + * then return the error. Otherwise continue on. + */ + err = gr_gk20a_find_priv_offset_in_ext_buffer(g, + addr, is_quad, quad, context_buffer, + context_buffer_size, priv_offset); + if (!err || (err && is_quad)) + return err; + + if ((addr_type == CTXSW_ADDR_TYPE_SYS) || + (addr_type == CTXSW_ADDR_TYPE_BE)) { + /* Find the offset in the FECS segment. 
*/ + offset_to_segment = sys_priv_offset * + ctxsw_prog_ucode_header_size_in_bytes(); + + err = gr_gk20a_process_context_buffer_priv_segment(g, + addr_type, addr, + 0, 0, 0, 0, + &offset); + if (err) + return err; + + *priv_offset = (offset_to_segment + offset); + return 0; + } + + if ((gpc_num + 1) > num_gpcs) { + nvhost_err(dev_from_gk20a(g), + "GPC %d not in this context buffer.\n", + gpc_num); + return -EINVAL; + } + + /* Parse the GPCCS local header(s). */ + for (i = 0; i < num_gpcs; i++) { + context += ctxsw_prog_ucode_header_size_in_bytes(); + if (!check_local_header_magic(context)) { + nvhost_err(dev_from_gk20a(g), + "Invalid GPCCS local header: magic value\n"); + return -EINVAL; + + } + data32 = mem_rd32(context + ctxsw_prog_local_priv_register_ctl_o(), 0); + gpc_priv_offset = ctxsw_prog_local_priv_register_ctl_offset_v(data32); + + err = gr_gk20a_determine_ppc_configuration(g, context, + &num_ppcs, &ppc_mask, + &reg_list_ppc_count); + if (err) + return err; + + num_tpcs = mem_rd32(context + ctxsw_prog_local_image_num_tpcs_o(), 0); + + if ((i == gpc_num) && ((tpc_num + 1) > num_tpcs)) { + nvhost_err(dev_from_gk20a(g), + "GPC %d TPC %d not in this context buffer.\n", + gpc_num, tpc_num); + return -EINVAL; + } + + /* Find the offset in the GPCCS segment. */ + if (i == gpc_num) { + offset_to_segment = gpc_priv_offset * + ctxsw_prog_ucode_header_size_in_bytes(); + + if (addr_type == CTXSW_ADDR_TYPE_TPC) { + /*reg = gr->ctx_vars.ctxsw_regs.tpc.l;*/ + } else if (addr_type == CTXSW_ADDR_TYPE_PPC) { + /* The ucode stores TPC data before PPC data. + * Advance offset past TPC data to PPC data. */ + offset_to_segment += + ((gr->ctx_vars.ctxsw_regs.tpc.count * + num_tpcs) << 2); + } else if (addr_type == CTXSW_ADDR_TYPE_GPC) { + /* The ucode stores TPC/PPC data before GPC data. + * Advance offset past TPC/PPC data to GPC data. */ + /* note 1 PES_PER_GPC case */ + u32 litter_num_pes_per_gpc = + proj_scal_litter_num_pes_per_gpc_v(); + if (litter_num_pes_per_gpc > 1) { + offset_to_segment += + (((gr->ctx_vars.ctxsw_regs.tpc.count * + num_tpcs) << 2) + + ((reg_list_ppc_count * num_ppcs) << 2)); + } else { + offset_to_segment += + ((gr->ctx_vars.ctxsw_regs.tpc.count * + num_tpcs) << 2); + } + } else { + nvhost_err(dev_from_gk20a(g), + "Unknown address type.\n"); + return -EINVAL; + } + err = gr_gk20a_process_context_buffer_priv_segment(g, + addr_type, addr, + i, num_tpcs, + num_ppcs, ppc_mask, + &offset); + if (err) + return -EINVAL; + + *priv_offset = offset_to_segment + offset; + return 0; + } + } + + return -EINVAL; +} + + +int gr_gk20a_exec_ctx_ops(struct channel_gk20a *ch, + struct nvhost_dbg_gpu_reg_op *ctx_ops, u32 num_ops, + u32 num_ctx_wr_ops, u32 num_ctx_rd_ops) +{ + struct gk20a *g = ch->g; + struct channel_ctx_gk20a *ch_ctx = &ch->ch_ctx; + void *ctx_ptr = NULL; + int curr_gr_chid, curr_gr_ctx; + bool ch_is_curr_ctx, restart_gr_ctxsw = false; + bool restart_fifo_ctxsw = false; + u32 i, j, offset, v; + u32 max_offsets = proj_scal_max_gpcs_v() * + proj_scal_max_tpc_per_gpc_v(); + u32 *offsets = NULL; + u32 *offset_addrs = NULL; + u32 ctx_op_nr, num_ctx_ops[2] = {num_ctx_wr_ops, num_ctx_rd_ops}; + int err, pass; + + nvhost_dbg(dbg_fn | dbg_gpu_dbg, "wr_ops=%d rd_ops=%d", + num_ctx_wr_ops, num_ctx_rd_ops); + + /* TBD: set timeout */ + /* pin_context will disable channel switching. + * at that point the hardware state can be inspected to + * determine if the context we're interested in is current.
+ */ +#if 0 + err = fifo_gk20a_disable_fifo_ctxsw(g, c); + if (err) { + dev_warn(dev_from_gk20a(g), "failed to disable fifo ctxsw\n"); + goto clean_up; + } + restart_fifo_ctxsw = true; +#endif + + { + u32 reg = gk20a_readl(g, 0x0041a084); + nvhost_dbg(dbg_gpu_dbg, "flcn_cfg_rm=0x%x", + reg); + } + + err = gr_gk20a_disable_ctxsw(g); + if (err) { + nvhost_err(dev_from_gk20a(g), "unable to stop gr ctxsw"); + /* this should probably be ctx-fatal... */ + goto cleanup; + } + + restart_gr_ctxsw = true; + + curr_gr_ctx = gk20a_readl(g, gr_fecs_current_ctx_r()); + curr_gr_chid = gk20a_gr_get_chid_from_ctx(g, curr_gr_ctx); + ch_is_curr_ctx = (curr_gr_chid != -1) && (ch->hw_chid == curr_gr_chid); + + nvhost_dbg(dbg_fn | dbg_gpu_dbg, "is curr ctx=%d", ch_is_curr_ctx); + if (ch_is_curr_ctx) { + for (pass = 0; pass < 2; pass++) { + ctx_op_nr = 0; + for (i = 0; (ctx_op_nr < num_ctx_ops[pass]) && (i < num_ops); ++i) { + /* only do ctx ops and only on the right pass */ + if ((ctx_ops[i].type == REGOP(TYPE_GLOBAL)) || + (((pass == 0) && reg_op_is_read(ctx_ops[i].op)) || + ((pass == 1) && !reg_op_is_read(ctx_ops[i].op)))) + continue; + + /* if this is a quad access, setup for special access*/ + if (ctx_ops[i].is_quad) + gr_gk20a_access_smpc_reg(g, ctx_ops[i].quad, + ctx_ops[i].offset); + offset = ctx_ops[i].offset; + + if (pass == 0) { /* write pass */ + v = gk20a_readl(g, offset); + v &= ~ctx_ops[i].and_n_mask_lo; + v |= ctx_ops[i].value_lo; + gk20a_writel(g, offset, v); + + nvhost_dbg(dbg_gpu_dbg, + "direct wr: offset=0x%x v=0x%x", + offset, v); + + if (ctx_ops[i].op == REGOP(WRITE_64)) { + v = gk20a_readl(g, offset + 4); + v &= ~ctx_ops[i].and_n_mask_hi; + v |= ctx_ops[i].value_hi; + gk20a_writel(g, offset + 4, v); + + nvhost_dbg(dbg_gpu_dbg, + "direct wr: offset=0x%x v=0x%x", + offset + 4, v); + } + + } else { /* read pass */ + ctx_ops[i].value_lo = + gk20a_readl(g, offset); + + nvhost_dbg(dbg_gpu_dbg, + "direct rd: offset=0x%x v=0x%x", + offset, ctx_ops[i].value_lo); + + if (ctx_ops[i].op == REGOP(READ_64)) { + ctx_ops[i].value_hi = + gk20a_readl(g, offset + 4); + + nvhost_dbg(dbg_gpu_dbg, + "direct rd: offset=0x%x v=0x%x", + offset + 4, ctx_ops[i].value_hi); + } else + ctx_ops[i].value_hi = 0; + } + ctx_op_nr++; + } + } + goto cleanup; + } + + /* they're the same size, so just use one alloc for both */ + offsets = kzalloc(2 * sizeof(u32) * max_offsets, GFP_KERNEL); + if (!offsets) { + err = -ENOMEM; + goto cleanup; + } + offset_addrs = offsets + max_offsets; + + /* would have been a variant of gr_gk20a_apply_instmem_overrides */ + /* recoded in-place instead.*/ + ctx_ptr = nvhost_memmgr_mmap(ch_ctx->gr_ctx.mem.ref); + if (!ctx_ptr) { + err = -ENOMEM; + ctx_ptr = NULL; + goto cleanup; + } + + /* Channel gr_ctx buffer is gpu cacheable; so flush and invalidate. + * There should be no on-going/in-flight references by the gpu now.
*/ + gk20a_mm_fb_flush(g); + gk20a_mm_l2_flush(g, true); + + /* write to appropriate place in context image, + * first have to figure out where that really is */ + + /* first pass is writes, second reads */ + for (pass = 0; pass < 2; pass++) { + ctx_op_nr = 0; + for (i = 0; (ctx_op_nr < num_ctx_ops[pass]) && (i < num_ops); ++i) { + u32 num_offsets; + + /* only do ctx ops and only on the right pass */ + if ((ctx_ops[i].type == REGOP(TYPE_GLOBAL)) || + (((pass == 0) && reg_op_is_read(ctx_ops[i].op)) || + ((pass == 1) && !reg_op_is_read(ctx_ops[i].op)))) + continue; + + gr_gk20a_get_ctx_buffer_offsets(g, + ctx_ops[i].offset, + max_offsets, + offsets, offset_addrs, + &num_offsets, + ctx_ops[i].is_quad, + ctx_ops[i].quad); + + /* if this is a quad access, setup for special access*/ + if (ctx_ops[i].is_quad) + gr_gk20a_access_smpc_reg(g, ctx_ops[i].quad, + ctx_ops[i].offset); + + for (j = 0; j < num_offsets; j++) { + /* sanity check, don't write outside, worst case */ + if (offsets[j] >= g->gr.ctx_vars.golden_image_size) + continue; + if (pass == 0) { /* write pass */ + v = mem_rd32(ctx_ptr + offsets[j], 0); + v &= ~ctx_ops[i].and_n_mask_lo; + v |= ctx_ops[i].value_lo; + mem_wr32(ctx_ptr + offsets[j], 0, v); + + nvhost_dbg(dbg_gpu_dbg, + "context wr: offset=0x%x v=0x%x", + offsets[j], v); + + if (ctx_ops[i].op == REGOP(WRITE_64)) { + v = mem_rd32(ctx_ptr + offsets[j] + 4, 0); + v &= ~ctx_ops[i].and_n_mask_hi; + v |= ctx_ops[i].value_hi; + mem_wr32(ctx_ptr + offsets[j] + 4, 0, v); + + nvhost_dbg(dbg_gpu_dbg, + "context wr: offset=0x%x v=0x%x", + offsets[j] + 4, v); + } + + /* check to see if we need to add a special WAR + for some of the SMPC perf regs */ + gr_gk20a_ctx_patch_smpc(g, ch_ctx, offset_addrs[j], + v, ctx_ptr); + + } else { /* read pass */ + ctx_ops[i].value_lo = + mem_rd32(ctx_ptr + offsets[0], 0); + + nvhost_dbg(dbg_gpu_dbg, "context rd: offset=0x%x v=0x%x", + offsets[0], ctx_ops[i].value_lo); + + if (ctx_ops[i].op == REGOP(READ_64)) { + ctx_ops[i].value_hi = + mem_rd32(ctx_ptr + offsets[0] + 4, 0); + + nvhost_dbg(dbg_gpu_dbg, + "context rd: offset=0x%x v=0x%x", + offsets[0] + 4, ctx_ops[i].value_hi); + } else + ctx_ops[i].value_hi = 0; + } + } + ctx_op_nr++; + } + } +#if 0 + /* flush cpu caches for the ctx buffer? only if cpu cached, of course. + * they aren't, yet */ + if (cached) { + FLUSH_CPU_DCACHE(ctx_ptr, + sg_phys(ch_ctx->gr_ctx.mem.ref), size); + } +#endif + + cleanup: + if (offsets) + kfree(offsets); + + if (ctx_ptr) + nvhost_memmgr_munmap(ch_ctx->gr_ctx.mem.ref, ctx_ptr); + + if (restart_gr_ctxsw) { + int tmp_err = gr_gk20a_enable_ctxsw(g); + if (tmp_err) { + nvhost_err(dev_from_gk20a(g), "unable to restart ctxsw!\n"); + err = tmp_err; + } + } + + if (restart_fifo_ctxsw) { +#if 0 + fifo_gk20a_enable_fifo_ctxsw(g); +#endif + } + + return err; +} diff --git a/drivers/video/tegra/host/gk20a/gr_gk20a.h b/drivers/video/tegra/host/gk20a/gr_gk20a.h index 7b7cdec2a1d1..b6979f99f1a8 100644 --- a/drivers/video/tegra/host/gk20a/gr_gk20a.h +++ b/drivers/video/tegra/host/gk20a/gr_gk20a.h @@ -1,7 +1,5 @@ /* - * drivers/video/tegra/host/gk20a/gr_gk20a.h - * - * GK20A graphics + * GK20A Graphics Engine * * Copyright (c) 2011-2013, NVIDIA CORPORATION. All rights reserved. * @@ -14,9 +12,8 @@ * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for * more details. 
* - * You should have received a copy of the GNU General Public License along with - * this program; if not, write to the Free Software Foundation, Inc., - * 51 Franklin St - Fifth Floor, Boston, MA 02110-1301 USA. + * You should have received a copy of the GNU General Public License + * along with this program. If not, see <http://www.gnu.org/licenses/>. */ #ifndef __GR_GK20A_H__ #define __GR_GK20A_H__ @@ -31,7 +28,7 @@ #define INVALID_SCREEN_TILE_ROW_OFFSET 0xFFFFFFFF #define INVALID_MAX_WAYS 0xFFFFFFFF -enum global_ctx_buffer { +enum /* global_ctx_buffer */ { CIRCULAR = 0, PAGEPOOL = 1, ATTRIBUTE = 2, @@ -43,7 +40,7 @@ enum global_ctx_buffer { }; /* either ATTRIBUTE or ATTRIBUTE_VPR maps to ATTRIBUTE_VA */ -enum global_ctx_buffer_va { +enum /* global_ctx_buffer_va */ { CIRCULAR_VA = 0, PAGEPOOL_VA = 1, ATTRIBUTE_VA = 2, @@ -316,5 +313,15 @@ void gr_gk20a_init_elcg_mode(struct gk20a *g, u32 mode, u32 engine); int gk20a_gr_suspend(struct gk20a *g); +struct nvhost_dbg_gpu_reg_op; +int gr_gk20a_exec_ctx_ops(struct channel_gk20a *ch, + struct nvhost_dbg_gpu_reg_op *ctx_ops, u32 num_ops, + u32 num_ctx_wr_ops, u32 num_ctx_rd_ops); +int gr_gk20a_get_ctx_buffer_offsets(struct gk20a *g, + u32 addr, + u32 max_offsets, + u32 *offsets, u32 *offset_addrs, + u32 *num_offsets, + bool is_quad, u32 quad); #endif /*__GR_GK20A_H__*/ diff --git a/drivers/video/tegra/host/gk20a/gr_pri_gk20a.h b/drivers/video/tegra/host/gk20a/gr_pri_gk20a.h new file mode 100644 index 000000000000..a82a1ee7caa8 --- /dev/null +++ b/drivers/video/tegra/host/gk20a/gr_pri_gk20a.h @@ -0,0 +1,179 @@ +/* + * GK20A Graphics Context Pri Register Addressing + * + * Copyright (c) 2013, NVIDIA CORPORATION. All rights reserved. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms and conditions of the GNU General Public License, + * version 2, as published by the Free Software Foundation. + * + * This program is distributed in the hope it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for + * more details. + * + * You should have received a copy of the GNU General Public License + * along with this program. If not, see <http://www.gnu.org/licenses/>. + */ +#ifndef _NVHOST_GR_PRI_GK20A_H_ +#define _NVHOST_GR_PRI_GK20A_H_ + +/* + * These convenience macros are generally for use in the management/modification + * of the context state store for gr/compute contexts.
+ */ + +/* + * GPC pri addressing + */ +static inline u32 pri_gpccs_addr_width(void) +{ + return 15; /*from where?*/ +} +static inline u32 pri_gpccs_addr_mask(u32 addr) +{ + return addr & ((1 << pri_gpccs_addr_width()) - 1); +} +static inline u32 pri_gpc_addr(u32 addr, u32 gpc) +{ + return proj_gpc_base_v() + (gpc * proj_gpc_stride_v()) + addr; +} +static inline bool pri_is_gpc_addr_shared(u32 addr) +{ + return (addr >= proj_gpc_shared_base_v()) && + (addr < proj_gpc_shared_base_v() + proj_gpc_stride_v()); +} +static inline bool pri_is_gpc_addr(u32 addr) +{ + return ((addr >= proj_gpc_base_v()) && + (addr < proj_gpc_base_v() + + proj_scal_litter_num_gpcs_v() * proj_gpc_stride_v())) || + pri_is_gpc_addr_shared(addr); +} +static inline u32 pri_get_gpc_num(u32 addr) +{ + u32 i, start; + u32 num_gpcs = proj_scal_litter_num_gpcs_v(); + + for (i = 0; i < num_gpcs; i++) { + start = proj_gpc_base_v() + (i * proj_gpc_stride_v()); + if ((addr >= start) && (addr < (start + proj_gpc_stride_v()))) + return i; + } + return 0; +} +/* + * TPC pri addressing + */ +static inline u32 pri_tpccs_addr_width(void) +{ + return 11; /* from where? */ +} +static inline u32 pri_tpccs_addr_mask(u32 addr) +{ + return addr & ((1 << pri_tpccs_addr_width()) - 1); +} +static inline u32 pri_tpc_addr(u32 addr, u32 gpc, u32 tpc) +{ + return proj_gpc_base_v() + (gpc * proj_gpc_stride_v()) + + proj_tpc_in_gpc_base_v() + (tpc * proj_tpc_in_gpc_stride_v()) + + addr; +} +static inline bool pri_is_tpc_addr_shared(u32 addr) +{ + return (addr >= proj_tpc_in_gpc_shared_base_v()) && + (addr < (proj_tpc_in_gpc_shared_base_v() + + proj_tpc_in_gpc_stride_v())); +} +static inline bool pri_is_tpc_addr(u32 addr) +{ + return ((addr >= proj_tpc_in_gpc_base_v()) && + (addr < proj_tpc_in_gpc_base_v() + (proj_scal_litter_num_tpc_per_gpc_v() * + proj_tpc_in_gpc_stride_v()))) + || + pri_is_tpc_addr_shared(addr); +} +static inline u32 pri_get_tpc_num(u32 addr) +{ + u32 i, start; + u32 num_tpcs = proj_scal_litter_num_tpc_per_gpc_v(); + + for (i = 0; i < num_tpcs; i++) { + start = proj_tpc_in_gpc_base_v() + (i * proj_tpc_in_gpc_stride_v()); + if ((addr >= start) && (addr < (start + proj_tpc_in_gpc_stride_v()))) + return i; + } + return 0; +} + +/* + * BE pri addressing + */ +static inline u32 pri_becs_addr_width(void) +{ + return 10;/* from where? */ +} +static inline u32 pri_becs_addr_mask(u32 addr) +{ + return addr & ((1 << pri_becs_addr_width()) - 1); +} +static inline bool pri_is_be_addr_shared(u32 addr) +{ + return (addr >= proj_rop_shared_base_v()) && + (addr < proj_rop_shared_base_v() + proj_rop_stride_v()); +} +static inline u32 pri_be_shared_addr(u32 addr) +{ + return proj_rop_shared_base_v() + pri_becs_addr_mask(addr); +} +static inline bool pri_is_be_addr(u32 addr) +{ + return ((addr >= proj_rop_base_v()) && + (addr < proj_rop_base_v()+proj_scal_litter_num_fbps_v() * proj_rop_stride_v())) || + pri_is_be_addr_shared(addr); +} + +static inline u32 pri_get_be_num(u32 addr) +{ + u32 i, start; + u32 num_fbps = proj_scal_litter_num_fbps_v(); + for (i = 0; i < num_fbps; i++) { + start = proj_rop_base_v() + (i * proj_rop_stride_v()); + if ((addr >= start) && (addr < (start + proj_rop_stride_v()))) + return i; + } + return 0; +} + +/* + * PPC pri addressing + */ +static inline u32 pri_ppccs_addr_width(void) +{ + return 9; /* from where? 
*/ +} +static inline u32 pri_ppccs_addr_mask(u32 addr) +{ + return addr & ((1 << pri_ppccs_addr_width()) - 1); +} +static inline u32 pri_ppc_addr(u32 addr, u32 gpc, u32 ppc) +{ + return proj_gpc_base_v() + (gpc * proj_gpc_stride_v()) + + proj_ppc_in_gpc_base_v() + (ppc * proj_ppc_in_gpc_stride_v()) + addr; +} + +enum ctxsw_addr_type { + CTXSW_ADDR_TYPE_SYS = 0, + CTXSW_ADDR_TYPE_GPC = 1, + CTXSW_ADDR_TYPE_TPC = 2, + CTXSW_ADDR_TYPE_BE = 3, + CTXSW_ADDR_TYPE_PPC = 4 +}; + +#define PRI_BROADCAST_FLAGS_NONE 0 +#define PRI_BROADCAST_FLAGS_GPC BIT(0) +#define PRI_BROADCAST_FLAGS_TPC BIT(1) +#define PRI_BROADCAST_FLAGS_BE BIT(2) +#define PRI_BROADCAST_FLAGS_PPC BIT(3) + +#endif /*_NVHOST_GR_PRI_GK20A_H_ */ diff --git a/drivers/video/tegra/host/gk20a/mm_gk20a.h b/drivers/video/tegra/host/gk20a/mm_gk20a.h index 03eb05ef946d..4db90c9b80bf 100644 --- a/drivers/video/tegra/host/gk20a/mm_gk20a.h +++ b/drivers/video/tegra/host/gk20a/mm_gk20a.h @@ -78,6 +78,7 @@ struct userd_desc { struct patch_desc { struct mem_desc mem; + void *cpu_va; u64 gpu_va; u32 data_count; }; diff --git a/drivers/video/tegra/host/gk20a/regops_gk20a.c b/drivers/video/tegra/host/gk20a/regops_gk20a.c new file mode 100644 index 000000000000..d35f6961ab0b --- /dev/null +++ b/drivers/video/tegra/host/gk20a/regops_gk20a.c @@ -0,0 +1,270 @@ +/* + * + * Tegra GK20A GPU Debugger Driver Register Ops + * + * Copyright (c) 2013, NVIDIA CORPORATION. All rights reserved. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms and conditions of the GNU General Public License, + * version 2, as published by the Free Software Foundation. + * + * This program is distributed in the hope it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for + * more details. + * + * You should have received a copy of the GNU General Public License + * along with this program. If not, see <http://www.gnu.org/licenses/>. + */ + +#include <linux/slab.h> +#include <linux/err.h> +#include <linux/nvhost_dbg_gpu_ioctl.h> + +#include "dev.h" +#include "nvhost_hwctx.h" +/*#include "nvhost_acm.h"*/ +#include "gk20a.h" +#include "gr_gk20a.h" +#include "dbg_gpu_gk20a.h" +#include "regops_gk20a.h" + +static bool validate_reg_ops(struct dbg_session_gk20a *dbg_s, + u32 *ctx_rd_count, u32 *ctx_wr_count, + struct nvhost_dbg_gpu_reg_op *ops, + u32 op_count); + + +int exec_regops_gk20a(struct dbg_session_gk20a *dbg_s, + struct nvhost_dbg_gpu_reg_op *ops, + u64 num_ops) +{ + int err = 0, i; + struct channel_gk20a *ch = dbg_s->ch; + struct gk20a *g = dbg_s->ch->g; + /*struct gr_gk20a *gr = &g->gr;*/ + u32 data32_lo = 0, data32_hi = 0; + u32 ctx_rd_count = 0, ctx_wr_count = 0; + bool skip_read_lo = false, skip_read_hi = false; + bool ok; + + nvhost_dbg(dbg_fn | dbg_gpu_dbg, ""); + + ok = validate_reg_ops(dbg_s, + &ctx_rd_count, &ctx_wr_count, + ops, num_ops); + if (!ok) { + dev_err(dbg_s->dev, "invalid op(s)"); + err = -EINVAL; + /* each op has its own err/status */ + goto clean_up; + } + + for (i = 0; i < num_ops; i++) { + /* if it isn't global then it is done in the ctx ops... 
*/ + if (ops[i].type != REGOP(TYPE_GLOBAL)) + continue; + + switch (ops[i].op) { + + case REGOP(READ_32): + ops[i].value_hi = 0; + ops[i].value_lo = gk20a_readl(g, ops[i].offset); + nvhost_dbg(dbg_gpu_dbg, "read_32 0x%08x from 0x%08x", + ops[i].value_lo, ops[i].offset); + + break; + + case REGOP(READ_64): + ops[i].value_lo = gk20a_readl(g, ops[i].offset); + ops[i].value_hi = + gk20a_readl(g, ops[i].offset + 4); + + nvhost_dbg(dbg_gpu_dbg, "read_64 0x%08x:%08x from 0x%08x", + ops[i].value_hi, ops[i].value_lo, + ops[i].offset); + break; + + case REGOP(WRITE_32): + case REGOP(WRITE_64): + /* some of this appears wonky/unnecessary but + we've kept it for compat with existing + debugger code. just in case... */ + if (ops[i].and_n_mask_lo == ~(u32)0) { + data32_lo = ops[i].value_lo; + skip_read_lo = true; + } + + if ((ops[i].op == REGOP(WRITE_64)) && + (ops[i].and_n_mask_hi == ~(u32)0)) { + data32_hi = ops[i].value_hi; + skip_read_hi = true; + } + + /* read first 32bits */ + if (unlikely(skip_read_lo == false)) { + data32_lo = gk20a_readl(g, ops[i].offset); + data32_lo &= ~ops[i].and_n_mask_lo; + data32_lo |= ops[i].value_lo; + } + + /* if desired, read second 32bits */ + if ((ops[i].op == REGOP(WRITE_64)) && + !skip_read_hi) { + data32_hi = gk20a_readl(g, ops[i].offset + 4); + data32_hi &= ~ops[i].and_n_mask_hi; + data32_hi |= ops[i].value_hi; + } + + /* now update first 32bits */ + gk20a_writel(g, ops[i].offset, data32_lo); + nvhost_dbg(dbg_gpu_dbg, "Wrote 0x%08x to 0x%08x ", + data32_lo, ops[i].offset); + /* if desired, update second 32bits */ + if (ops[i].op == REGOP(WRITE_64)) { + gk20a_writel(g, ops[i].offset + 4, data32_hi); + nvhost_dbg(dbg_gpu_dbg, "Wrote 0x%08x to 0x%08x ", + data32_hi, ops[i].offset + 4); + + } + + + break; + + /* shouldn't happen as we've already screened */ + default: + BUG(); + err = -EINVAL; + goto clean_up; + break; + } + } + + if (ctx_wr_count | ctx_rd_count) { + err = gr_gk20a_exec_ctx_ops(ch, ops, num_ops, + ctx_wr_count, ctx_rd_count); + if (err) { + dev_warn(dbg_s->dev, + "failed to perform ctx ops\n"); + goto clean_up; + } + } + + clean_up: + nvhost_dbg(dbg_gpu_dbg, "ret=%d", err); + return err; + +} + + +static int validate_reg_op_info(struct dbg_session_gk20a *dbg_s, + struct nvhost_dbg_gpu_reg_op *op) +{ + int err = 0; + + op->status = REGOP(STATUS_SUCCESS); + + switch (op->op) { + case REGOP(READ_32): + case REGOP(READ_64): + case REGOP(WRITE_32): + case REGOP(WRITE_64): + break; + default: + op->status |= REGOP(STATUS_UNSUPPORTED_OP); + /*nvhost_err(dbg_s->dev, "Invalid regops op %d!", op->op);*/ + err = -EINVAL; + break; + } + + switch (op->type) { + case REGOP(TYPE_GLOBAL): + case REGOP(TYPE_GR_CTX): + case REGOP(TYPE_GR_CTX_TPC): + case REGOP(TYPE_GR_CTX_SM): + case REGOP(TYPE_GR_CTX_CROP): + case REGOP(TYPE_GR_CTX_ZROP): + case REGOP(TYPE_GR_CTX_QUAD): + break; + /* + case NVHOST_DBG_GPU_REG_OP_TYPE_FB: + */ + default: + op->status |= REGOP(STATUS_INVALID_TYPE); + /*nvhost_err(dbg_s->dev, "Invalid regops type %d!", op->type);*/ + err = -EINVAL; + break; + } + + return err; +} + +static int validate_reg_op_offset(struct dbg_session_gk20a *dbg_s, + struct nvhost_dbg_gpu_reg_op *op) +{ + int err = 0, temp_err; + u32 buf_offset_lo = 0, buf_offset_addr = 0, num_offsets = 0; + bool is_ctx_op = reg_op_is_gr_ctx(op->type); + + op->status = 0; + /* TBD: get this size from the register resource directly */ + if (!is_ctx_op && op->offset >= SZ_16M) { + op->status = REGOP(STATUS_INVALID_OFFSET); + err = -EINVAL; + } else if (is_ctx_op) { + if (!dbg_s->ch) {
nvhost_err(dbg_s->dev, "can't perform ctx regop unless bound"); + temp_err = -EINVAL; + } else + temp_err = gr_gk20a_get_ctx_buffer_offsets(dbg_s->ch->g, + op->offset, + 1, + &buf_offset_lo, + &buf_offset_addr, + &num_offsets, + op->type == REGOP(TYPE_GR_CTX_QUAD), + op->quad); + if (temp_err) { + op->status |= REGOP(STATUS_INVALID_OFFSET); + err = -EINVAL; + } + if (!buf_offset_lo) { + op->status |= REGOP(STATUS_INVALID_OFFSET); + err = -EINVAL; + } + } + + return err; +} + +static bool validate_reg_ops(struct dbg_session_gk20a *dbg_s, + u32 *ctx_rd_count, u32 *ctx_wr_count, + struct nvhost_dbg_gpu_reg_op *ops, + u32 op_count) +{ + u32 i; + int err; + bool ok = true; + + /* keep going until the end so every op can get + * a separate error code if needed */ + for (i = 0; i < op_count; i++) { + + err = validate_reg_op_info(dbg_s, &ops[i]); + ok &= !err; + + if (reg_op_is_gr_ctx(ops[i].type)) { + if (reg_op_is_read(ops[i].op)) + (*ctx_rd_count)++; + else + (*ctx_wr_count)++; + } + + err = validate_reg_op_offset(dbg_s, &ops[i]); + ok &= !err; + } + + nvhost_dbg_fn("ctx_wrs:%d ctx_rds:%d\n", *ctx_wr_count, *ctx_rd_count); + + return ok; +} diff --git a/drivers/video/tegra/host/gk20a/regops_gk20a.h b/drivers/video/tegra/host/gk20a/regops_gk20a.h new file mode 100644 index 000000000000..231882946a08 --- /dev/null +++ b/drivers/video/tegra/host/gk20a/regops_gk20a.h @@ -0,0 +1,46 @@ +/* + * + * Tegra GK20A GPU Debugger Driver Register Ops + * + * Copyright (c) 2013, NVIDIA CORPORATION. All rights reserved. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms and conditions of the GNU General Public License, + * version 2, as published by the Free Software Foundation. + * + * This program is distributed in the hope it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for + * more details. + * + * You should have received a copy of the GNU General Public License + * along with this program. If not, see <http://www.gnu.org/licenses/>. 
+ */ +#ifndef __REGOPS_GK20A_H_ +#define __REGOPS_GK20A_H_ + +int exec_regops_gk20a(struct dbg_session_gk20a *dbg_s, + struct nvhost_dbg_gpu_reg_op *ops, + u64 num_ops); + +/* turn seriously unwieldy names -> something shorter */ +#define REGOP(x) NVHOST_DBG_GPU_REG_OP_##x + + +static inline bool reg_op_is_gr_ctx(u8 type) +{ + return type == REGOP(TYPE_GR_CTX) || + type == REGOP(TYPE_GR_CTX_TPC) || + type == REGOP(TYPE_GR_CTX_SM) || + type == REGOP(TYPE_GR_CTX_CROP) || + type == REGOP(TYPE_GR_CTX_ZROP) || + type == REGOP(TYPE_GR_CTX_QUAD); +} +static inline bool reg_op_is_read(u8 op) +{ + return op == REGOP(READ_32) || + op == REGOP(READ_64) ; +} + + +#endif /* __REGOPS_GK20A_H_ */ diff --git a/drivers/video/tegra/host/nvhost_channel.h b/drivers/video/tegra/host/nvhost_channel.h index c919b89a2e5e..a2552cd1ef27 100644 --- a/drivers/video/tegra/host/nvhost_channel.h +++ b/drivers/video/tegra/host/nvhost_channel.h @@ -129,4 +129,6 @@ void nvhost_free_channel_internal(struct nvhost_channel *ch, int nvhost_channel_save_context(struct nvhost_channel *ch); +struct nvhost_hwctx *nvhost_channel_get_file_hwctx(int fd); + #endif diff --git a/drivers/video/tegra/host/nvhost_hwctx.h b/drivers/video/tegra/host/nvhost_hwctx.h index 0672571aa32a..43c39387c98f 100644 --- a/drivers/video/tegra/host/nvhost_hwctx.h +++ b/drivers/video/tegra/host/nvhost_hwctx.h @@ -1,9 +1,7 @@ /* - * drivers/video/tegra/host/nvhost_hwctx.h - * * Tegra Graphics Host Hardware Context Interface * - * Copyright (c) 2010-2013, NVIDIA Corporation. + * Copyright (c) 2010-2013, NVIDIA Corporation. All rights reserved. * * This program is free software; you can redistribute it and/or modify it * under the terms and conditions of the GNU General Public License, @@ -29,6 +27,7 @@ struct nvhost_channel; struct nvhost_cdma; struct mem_mgr; +struct nvhost_dbg_session; struct nvhost_hwctx { struct kref ref; @@ -47,6 +46,7 @@ struct nvhost_hwctx { struct list_head as_share_bound_list_node; struct nvhost_as_share *as_share; + struct nvhost_dbg_session *dbg_session; }; struct nvhost_hwctx_handler { @@ -82,4 +82,5 @@ enum { #define HWCTX_REGINFO(offset, count, type) {offset, count, HWCTX_REGINFO_##type, offset} #define HWCTX_REGINFO_RST(offset, count, type, rst) {offset, count, HWCTX_REGINFO_##type, rst} + #endif diff --git a/drivers/video/tegra/host/t124/t124.c b/drivers/video/tegra/host/t124/t124.c index 75a248591c19..6bd236113e6b 100644 --- a/drivers/video/tegra/host/t124/t124.c +++ b/drivers/video/tegra/host/t124/t124.c @@ -443,6 +443,7 @@ struct nvhost_device_data tegra_gk20a_info = { .can_powergate = true, .alloc_hwctx_handler = nvhost_gk20a_alloc_hwctx_handler, .ctrl_ops = &tegra_gk20a_ctrl_ops, + .dbg_ops = &tegra_gk20a_dbg_gpu_ops, .moduleid = NVHOST_MODULE_GPU, .init = nvhost_gk20a_init, .deinit = nvhost_gk20a_deinit, diff --git a/include/linux/nvhost.h b/include/linux/nvhost.h index 60136ba8ae45..896027702314 100644 --- a/include/linux/nvhost.h +++ b/include/linux/nvhost.h @@ -190,7 +190,10 @@ struct nvhost_device_data { struct cdev ctrl_cdev; const struct file_operations *ctrl_ops; /* ctrl ops for the module */ - /* void *priv;*/ + /* module debugger */ + struct device *dbg_node; + struct cdev dbg_cdev; + const struct file_operations *dbg_ops; struct kobject *power_kobj; /* kobject to hold power sysfs entries */ struct nvhost_device_power_attr *power_attrib; /* sysfs attributes */ diff --git a/include/linux/nvhost_dbg_gpu_ioctl.h b/include/linux/nvhost_dbg_gpu_ioctl.h new file mode 100644 index 000000000000..2866a6d9df2d --- 
/dev/null +++ b/include/linux/nvhost_dbg_gpu_ioctl.h @@ -0,0 +1,122 @@ +/* + * Copyright (c) 2013, NVIDIA CORPORATION. All rights reserved. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms and conditions of the GNU General Public License, + * version 2, as published by the Free Software Foundation. + * + * This program is distributed in the hope it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for + * more details. + * + * You should have received a copy of the GNU General Public License + * along with this program. If not, see <http://www.gnu.org/licenses/>. + */ +#ifndef __LINUX_NVHOST_DBG_GPU_IOCTL_H +#define __LINUX_NVHOST_DBG_GPU_IOCTL_H + +#include <linux/ioctl.h> +#include <linux/types.h> + +#if !defined(__KERNEL__) +#define __user +#endif + +#define NVHOST_DBG_GPU_IOCTL_MAGIC 'D' + +/* + * /dev/nvhost-dbg-* devices + * + * Opening a '/dev/nvhost-dbg-<module_name>' device node creates a new debugger + * session. nvhost channels (for the same module) can then be bound to such a + * session. + * + * Once an nvhost channel has been bound to a debugger session it cannot be + * bound to another. + * + * The session stays valid as long as its device file is open or any nvhost + * channels remain bound to it. Once all references to the session + * are removed, the session is deleted. + * + */ + +/* + * Binding/attaching a debugger session to an nvhost gpu channel + * + * The 'channel_fd' given here is the fd used to allocate the + * gpu channel context. To detach/unbind the debugger session + * use a channel_fd of -1. + * + */ +struct nvhost_dbg_gpu_bind_channel_args { + __u32 channel_fd; /* in */ + __u32 _pad0[1]; +}; + +#define NVHOST_DBG_GPU_IOCTL_BIND_CHANNEL \ + _IOWR(NVHOST_DBG_GPU_IOCTL_MAGIC, 1, struct nvhost_dbg_gpu_bind_channel_args) + +/* + * Register operations + */ +/* valid op values */ +#define NVHOST_DBG_GPU_REG_OP_READ_32 (0x00000000) +#define NVHOST_DBG_GPU_REG_OP_WRITE_32 (0x00000001) +#define NVHOST_DBG_GPU_REG_OP_READ_64 (0x00000002) +#define NVHOST_DBG_GPU_REG_OP_WRITE_64 (0x00000003) +/* note: 8b ops are unsupported */ +#define NVHOST_DBG_GPU_REG_OP_READ_08 (0x00000004) +#define NVHOST_DBG_GPU_REG_OP_WRITE_08 (0x00000005) + +/* valid type values */ +#define NVHOST_DBG_GPU_REG_OP_TYPE_GLOBAL (0x00000000) +#define NVHOST_DBG_GPU_REG_OP_TYPE_GR_CTX (0x00000001) +#define NVHOST_DBG_GPU_REG_OP_TYPE_GR_CTX_TPC (0x00000002) +#define NVHOST_DBG_GPU_REG_OP_TYPE_GR_CTX_SM (0x00000004) +#define NVHOST_DBG_GPU_REG_OP_TYPE_GR_CTX_CROP (0x00000008) +#define NVHOST_DBG_GPU_REG_OP_TYPE_GR_CTX_ZROP (0x00000010) +/*#define NVHOST_DBG_GPU_REG_OP_TYPE_FB (0x00000020)*/ +#define NVHOST_DBG_GPU_REG_OP_TYPE_GR_CTX_QUAD (0x00000040) + +/* valid status values */ +#define NVHOST_DBG_GPU_REG_OP_STATUS_SUCCESS (0x00000000) +#define NVHOST_DBG_GPU_REG_OP_STATUS_INVALID_OP (0x00000001) +#define NVHOST_DBG_GPU_REG_OP_STATUS_INVALID_TYPE (0x00000002) +#define NVHOST_DBG_GPU_REG_OP_STATUS_INVALID_OFFSET (0x00000004) +#define NVHOST_DBG_GPU_REG_OP_STATUS_UNSUPPORTED_OP (0x00000008) +#define NVHOST_DBG_GPU_REG_OP_STATUS_INVALID_MASK (0x00000010) + +struct nvhost_dbg_gpu_reg_op { + __u8 op; + __u8 type; + __u8 status; + __u8 quad; + __u8 is_quad; + __u8 _pad0[3]; + __u32 group_mask; + __u32 sub_group_mask; + __u32 offset; + __u32 value_hi; + __u32 value_lo; + __u32 and_n_mask_hi; + __u32 and_n_mask_lo; + __u32 _pad1[1]; +}; +
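The and_n_mask fields deserve a concrete example. For WRITE_32/WRITE_64 the driver performs a read-modify-write, new = (old & ~and_n_mask) | value, and a mask of all ones turns the write into a plain overwrite (the read is skipped). A minimal client-side sketch of filling one op for a masked 32-bit gr ctx write, assuming the header above is installed as <linux/nvhost_dbg_gpu_ioctl.h>; the helper name is hypothetical:

#include <string.h>
#include <linux/nvhost_dbg_gpu_ioctl.h>

/* Set only the bits of 'mask' in a 32-bit gr ctx register to 'value';
 * the kernel's read-modify-write preserves all other bits. */
static void fill_masked_ctx_write(struct nvhost_dbg_gpu_reg_op *op,
				  __u32 offset, __u32 value, __u32 mask)
{
	memset(op, 0, sizeof(*op));
	op->op = NVHOST_DBG_GPU_REG_OP_WRITE_32;
	op->type = NVHOST_DBG_GPU_REG_OP_TYPE_GR_CTX;
	op->offset = offset;
	op->value_lo = value & mask;	/* bits outside the mask must be clear */
	op->and_n_mask_lo = mask;	/* ~0 here would mean: overwrite, skip the read */
}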
+struct nvhost_dbg_gpu_exec_reg_ops_args { + __u64 ops; /* pointer to nvhost_dbg_gpu_reg_op operations */ + __u32 num_ops; + __u32 _pad0[1]; +}; + +#define NVHOST_DBG_GPU_IOCTL_REG_OPS \ + _IOWR(NVHOST_DBG_GPU_IOCTL_MAGIC, 2, struct nvhost_dbg_gpu_exec_reg_ops_args) + + +#define NVHOST_DBG_GPU_IOCTL_LAST \ + _IOC_NR(NVHOST_DBG_GPU_IOCTL_REG_OPS) +#define NVHOST_DBG_GPU_IOCTL_MAX_ARG_SIZE \ + sizeof(struct nvhost_dbg_gpu_exec_reg_ops_args) + +#endif
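Putting the pieces together, a debugger client opens the new dbg node, binds it to an already-allocated gpu channel with BIND_CHANNEL, and then batches reads/writes through REG_OPS. A rough userspace sketch, assuming the nodes are named /dev/nvhost-gpu and /dev/nvhost-dbg-gpu (the actual names follow the per-module <dev>/dbg-<dev> chardevs created at client init) and using a placeholder register offset:

#include <stdint.h>
#include <stdio.h>
#include <string.h>
#include <fcntl.h>
#include <unistd.h>
#include <sys/ioctl.h>
#include <linux/nvhost_dbg_gpu_ioctl.h>

int main(void)
{
	struct nvhost_dbg_gpu_bind_channel_args bind_args;
	struct nvhost_dbg_gpu_exec_reg_ops_args exec_args;
	struct nvhost_dbg_gpu_reg_op ops[1];
	int channel_fd, dbg_fd;

	/* Device node names are assumptions for illustration. */
	channel_fd = open("/dev/nvhost-gpu", O_RDWR);
	dbg_fd = open("/dev/nvhost-dbg-gpu", O_RDWR);
	if (channel_fd < 0 || dbg_fd < 0)
		return 1;

	/* Bind the debug session to the channel; a channel_fd of -1
	 * would detach instead. */
	memset(&bind_args, 0, sizeof(bind_args));
	bind_args.channel_fd = channel_fd;
	if (ioctl(dbg_fd, NVHOST_DBG_GPU_IOCTL_BIND_CHANNEL, &bind_args) < 0)
		return 1;

	/* One global 32-bit read; the offset is a placeholder. */
	memset(ops, 0, sizeof(ops));
	ops[0].op = NVHOST_DBG_GPU_REG_OP_READ_32;
	ops[0].type = NVHOST_DBG_GPU_REG_OP_TYPE_GLOBAL;
	ops[0].offset = 0x0;

	memset(&exec_args, 0, sizeof(exec_args));
	exec_args.ops = (__u64)(uintptr_t)ops;
	exec_args.num_ops = 1;
	if (ioctl(dbg_fd, NVHOST_DBG_GPU_IOCTL_REG_OPS, &exec_args) < 0)
		return 1;

	/* Each op carries its own status alongside the returned value. */
	printf("status=0x%x value=0x%08x\n", ops[0].status, ops[0].value_lo);

	close(dbg_fd);
	close(channel_fd);
	return 0;
}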