author     Ken Adams <kadams@nvidia.com>           2013-09-17 12:55:54 -0400
committer  Dan Willemsen <dwillemsen@nvidia.com>   2013-09-27 12:53:49 -0700
commit     d55049b57a338403afe3a0e8d93ee83a9d63007d (patch)
tree       dd82aefcd9924e43d63c25d80e23a832d84b80de
parent     409be5d3c52b2a6cd6a843d91f8fbf63f4d3b42b (diff)
video: tegra: host: module debugger framework
Framework and implementation of a gk20a debugger/profiler session
interface.

Adds work toward optimized handling of context patch write sequences.
These introduce cpu map/unmap operations and gpu l2 invalidates.
Unless we take care to coalesce them, they occur *per write*.

Change-Id: I8afc11a6f6782b80996404acbd01bffe9653ebdd
Signed-off-by: Ken Adams <kadams@nvidia.com>
Reviewed-on: http://git-master/r/274416
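For reference, the coalescing this change works toward looks roughly like the sketch below. It is illustrative only, not part of the patch: the begin/write/end helpers are the ones added to gr_gk20a.c in this commit, while the register addresses and data values are placeholders. Bracketing a run of patch writes this way lets the cpu map/unmap and the gpu l2 invalidate happen once per sequence instead of once per write.

	/* Hypothetical usage sketch (not from this patch). */
	static int example_patch_sequence(struct gk20a *g,
					  struct channel_ctx_gk20a *ch_ctx)
	{
		int err;

		/* one cpu map of the patch buffer for the whole sequence */
		err = gr_gk20a_ctx_patch_write_begin(g, ch_ctx);
		if (err)
			return err;

		/* any number of writes land in the already-mapped buffer */
		gr_gk20a_ctx_patch_write(g, ch_ctx, gr_scc_pagepool_base_r(),
					 0 /* placeholder */, true);
		gr_gk20a_ctx_patch_write(g, ch_ctx, gr_scc_pagepool_r(),
					 0 /* placeholder */, true);

		/* one unmap plus one gpu l2 invalidate for the whole sequence */
		return gr_gk20a_ctx_patch_write_end(g, ch_ctx);
	}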
-rw-r--r--  drivers/video/tegra/host/bus_client.c | 19
-rw-r--r--  drivers/video/tegra/host/bus_client.h | 2
-rw-r--r--  drivers/video/tegra/host/dev.h | 3
-rw-r--r--  drivers/video/tegra/host/gk20a/Makefile | 2
-rw-r--r--  drivers/video/tegra/host/gk20a/channel_gk20a.c | 1
-rw-r--r--  drivers/video/tegra/host/gk20a/channel_gk20a.h | 3
-rw-r--r--  drivers/video/tegra/host/gk20a/dbg_gpu_gk20a.c | 368
-rw-r--r--  drivers/video/tegra/host/gk20a/dbg_gpu_gk20a.h | 51
-rw-r--r--  drivers/video/tegra/host/gk20a/gk20a.c | 14
-rw-r--r--  drivers/video/tegra/host/gk20a/gk20a.h | 7
-rw-r--r--  drivers/video/tegra/host/gk20a/gr_ctx_gk20a.h | 9
-rw-r--r--  drivers/video/tegra/host/gk20a/gr_gk20a.c | 1630
-rw-r--r--  drivers/video/tegra/host/gk20a/gr_gk20a.h | 23
-rw-r--r--  drivers/video/tegra/host/gk20a/gr_pri_gk20a.h | 179
-rw-r--r--  drivers/video/tegra/host/gk20a/mm_gk20a.h | 1
-rw-r--r--  drivers/video/tegra/host/gk20a/regops_gk20a.c | 270
-rw-r--r--  drivers/video/tegra/host/gk20a/regops_gk20a.h | 46
-rw-r--r--  drivers/video/tegra/host/nvhost_channel.h | 2
-rw-r--r--  drivers/video/tegra/host/nvhost_hwctx.h | 7
-rw-r--r--  drivers/video/tegra/host/t124/t124.c | 1
-rw-r--r--  include/linux/nvhost.h | 5
-rw-r--r--  include/linux/nvhost_dbg_gpu_ioctl.h | 122
22 files changed, 2631 insertions, 134 deletions
diff --git a/drivers/video/tegra/host/bus_client.c b/drivers/video/tegra/host/bus_client.c
index 7e1a4c829445..34e88f20881e 100644
--- a/drivers/video/tegra/host/bus_client.c
+++ b/drivers/video/tegra/host/bus_client.c
@@ -1,6 +1,4 @@
/*
- * drivers/video/tegra/host/bus_client.c
- *
* Tegra Graphics Host Client Module
*
* Copyright (c) 2010-2013, NVIDIA Corporation. All rights reserved.
@@ -217,7 +215,7 @@ static int nvhost_channelopen(struct inode *inode, struct file *filp)
}
filp->private_data = priv;
priv->ch = ch;
- if(nvhost_module_add_client(ch->dev, priv))
+ if (nvhost_module_add_client(ch->dev, priv))
goto fail;
if (ch->ctxhandler && ch->ctxhandler->alloc) {
@@ -1108,9 +1106,10 @@ int nvhost_client_user_init(struct platform_device *dev)
struct nvhost_channel *ch = pdata->channel;
BUG_ON(!ch);
- // reserve 3 minor #s for <dev> and as-<dev> and ctrl-<dev>
+ /* reserve 4 minor #s for <dev> and as-<dev>, ctrl-<dev>
+ * and dbg-<dev> */
- err = alloc_chrdev_region(&devno, 0, 3, IFACE_NAME);
+ err = alloc_chrdev_region(&devno, 0, 4, IFACE_NAME);
if (err < 0) {
dev_err(&dev->dev, "failed to allocate devno\n");
goto fail;
@@ -1135,6 +1134,16 @@ int nvhost_client_user_init(struct platform_device *dev)
goto fail;
}
+ if (pdata->dbg_ops) {
+ ++devno;
+ pdata->dbg_node = nvhost_client_device_create(dev,
+ &pdata->dbg_cdev, "dbg-",
+ devno, pdata->dbg_ops);
+ if (pdata->dbg_node == NULL)
+ goto fail;
+ }
+
+
return 0;
fail:
return err;
diff --git a/drivers/video/tegra/host/bus_client.h b/drivers/video/tegra/host/bus_client.h
index 07bc7104d283..db3e228e8eec 100644
--- a/drivers/video/tegra/host/bus_client.h
+++ b/drivers/video/tegra/host/bus_client.h
@@ -55,6 +55,4 @@ nvhost_client_request_firmware(struct platform_device *dev,
int nvhost_client_device_get_resources(struct platform_device *dev);
-struct nvhost_hwctx *nvhost_channel_get_file_hwctx(int fd);
-
#endif
diff --git a/drivers/video/tegra/host/dev.h b/drivers/video/tegra/host/dev.h
index 77330c3b0d05..107b1beaa0ba 100644
--- a/drivers/video/tegra/host/dev.h
+++ b/drivers/video/tegra/host/dev.h
@@ -39,7 +39,7 @@ void nvhost_device_list_remove(struct platform_device *pdev);
#else
/* manually enable and turn it on the mask */
/*#define NVHOST_DEBUG*/
- #define NVHOST_DEFAULT_DBG_MASK (dbg_info)
+ #define NVHOST_DEFAULT_DBG_MASK (dbg_err|dbg_info)
#endif
enum nvhost_dbg_categories {
@@ -52,6 +52,7 @@ enum nvhost_dbg_categories {
dbg_pmu = BIT(6), /* gk20a pmu */
dbg_clk = BIT(7), /* gk20a clk */
dbg_map = BIT(8), /* mem mappings */
+ dbg_gpu_dbg = BIT(9), /* gpu debugger */
dbg_mem = BIT(31), /* memory accesses, very verbose */
};
diff --git a/drivers/video/tegra/host/gk20a/Makefile b/drivers/video/tegra/host/gk20a/Makefile
index c22d74696389..2d7b9a524c67 100644
--- a/drivers/video/tegra/host/gk20a/Makefile
+++ b/drivers/video/tegra/host/gk20a/Makefile
@@ -11,6 +11,8 @@ nvhost-gk20a-objs = \
channel_gk20a.o \
cdma_gk20a.o \
debug_gk20a.o \
+ dbg_gpu_gk20a.o \
+ regops_gk20a.o \
gr_gk20a.o \
kind_gk20a.o \
mm_gk20a.o \
diff --git a/drivers/video/tegra/host/gk20a/channel_gk20a.c b/drivers/video/tegra/host/gk20a/channel_gk20a.c
index d509510742be..6c584c448811 100644
--- a/drivers/video/tegra/host/gk20a/channel_gk20a.c
+++ b/drivers/video/tegra/host/gk20a/channel_gk20a.c
@@ -1495,6 +1495,7 @@ int gk20a_init_channel_support(struct gk20a *g, u32 chid)
#if defined(CONFIG_TEGRA_GPU_CYCLE_STATS)
mutex_init(&c->cyclestate.cyclestate_buffer_mutex);
#endif
+ mutex_init(&c->dbg_s_lock);
return 0;
}
diff --git a/drivers/video/tegra/host/gk20a/channel_gk20a.h b/drivers/video/tegra/host/gk20a/channel_gk20a.h
index 5ade025d2a48..dca69aea6f01 100644
--- a/drivers/video/tegra/host/gk20a/channel_gk20a.h
+++ b/drivers/video/tegra/host/gk20a/channel_gk20a.h
@@ -30,6 +30,7 @@ struct gk20a;
struct gr_gk20a;
struct mem_mgr;
struct mem_handle;
+struct dbg_session_gk20a;
#include "nvhost_channel.h"
#include "nvhost_hwctx.h"
@@ -129,6 +130,8 @@ struct channel_gk20a {
struct mutex cyclestate_buffer_mutex;
} cyclestate;
#endif
+ struct mutex dbg_s_lock;
+ struct dbg_session_gk20a *dbg_s;
};
static inline bool gk20a_channel_as_bound(struct channel_gk20a *ch)
diff --git a/drivers/video/tegra/host/gk20a/dbg_gpu_gk20a.c b/drivers/video/tegra/host/gk20a/dbg_gpu_gk20a.c
new file mode 100644
index 000000000000..a4744e64e614
--- /dev/null
+++ b/drivers/video/tegra/host/gk20a/dbg_gpu_gk20a.c
@@ -0,0 +1,368 @@
+/*
+ * Tegra GK20A GPU Debugger Driver
+ *
+ * Copyright (c) 2013, NVIDIA CORPORATION. All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms and conditions of the GNU General Public License,
+ * version 2, as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
+ * more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program. If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#include <linux/fs.h>
+#include <linux/file.h>
+#include <linux/cdev.h>
+#include <linux/uaccess.h>
+#include <linux/nvhost.h>
+#include <linux/nvhost_dbg_gpu_ioctl.h>
+
+#include "dev.h"
+#include "nvhost_hwctx.h"
+#include "nvhost_acm.h"
+#include "gk20a.h"
+#include "gr_gk20a.h"
+#include "gk20a_gating_reglist.h"
+#include "dbg_gpu_gk20a.h"
+#include "regops_gk20a.h"
+
+struct dbg_gpu_session_ops dbg_gpu_session_ops_gk20a = {
+ .exec_reg_ops = exec_regops_gk20a
+};
+
+/* silly allocator - just increment session id */
+static atomic_t session_id = ATOMIC_INIT(0);
+static int generate_session_id(void)
+{
+ return atomic_add_return(1, &session_id);
+}
+
+static int alloc_session(struct dbg_session_gk20a **_dbg_s)
+{
+ struct dbg_session_gk20a *dbg_s;
+ *_dbg_s = NULL;
+
+ nvhost_dbg_fn("");
+
+ dbg_s = kzalloc(sizeof(*dbg_s), GFP_KERNEL);
+ if (!dbg_s)
+ return -ENOMEM;
+
+ dbg_s->id = generate_session_id();
+ dbg_s->ops = &dbg_gpu_session_ops_gk20a;
+ *_dbg_s = dbg_s;
+ return 0;
+}
+
+int gk20a_dbg_gpu_dev_open(struct inode *inode, struct file *filp)
+{
+ struct dbg_session_gk20a *dbg_session;
+ struct nvhost_device_data *pdata;
+ struct platform_device *pdev;
+ struct device *dev;
+
+ int err;
+
+ pdata = container_of(inode->i_cdev,
+ struct nvhost_device_data, dbg_cdev);
+ pdev = pdata->pdev;
+ dev = &pdev->dev;
+
+ nvhost_dbg(dbg_fn | dbg_gpu_dbg, "dbg session: %s", dev_name(dev));
+
+ err = alloc_session(&dbg_session);
+ if (err)
+ return err;
+
+ filp->private_data = dbg_session;
+ dbg_session->pdata = pdata;
+ dbg_session->pdev = pdev;
+ dbg_session->dev = dev;
+
+ return 0;
+}
+
+static int dbg_unbind_channel_gk20a(struct dbg_session_gk20a *dbg_s)
+{
+ struct channel_gk20a *ch_gk20a = dbg_s->ch;
+ struct gk20a *g = dbg_s->ch->g;
+
+ nvhost_dbg_fn("");
+
+ /* wasn't bound to start with ? */
+ if (!ch_gk20a) {
+ nvhost_dbg(dbg_gpu_dbg | dbg_fn, "not bound already?");
+ return -ENODEV;
+ }
+
+ mutex_lock(&g->dbg_sessions_lock);
+ mutex_lock(&ch_gk20a->dbg_s_lock);
+
+ if (--g->dbg_sessions == 0) {
+ /* restore (can) powergate, clk state */
+ /* release pending exceptions to fault/be handled as usual */
+ /*TBD: ordering of these? */
+ g->elcg_enabled = true;
+ gr_gk20a_init_elcg_mode(g, ELCG_AUTO, ENGINE_GR_GK20A);
+ gr_gk20a_init_elcg_mode(g, ELCG_AUTO, ENGINE_CE2_GK20A);
+
+ gr_gk20a_blcg_gr_load_gating_prod(g, g->blcg_enabled);
+ /* ??? gr_gk20a_pg_gr_load_gating_prod(g, true); */
+
+ gr_gk20a_slcg_gr_load_gating_prod(g, g->slcg_enabled);
+ gr_gk20a_slcg_perf_load_gating_prod(g, g->slcg_enabled);
+
+ gk20a_pmu_enable_elpg(g);
+
+ nvhost_dbg(dbg_gpu_dbg | dbg_fn, "module idle");
+ nvhost_module_idle(dbg_s->pdev);
+ }
+
+ ch_gk20a->dbg_s = NULL;
+ dbg_s->ch = NULL;
+ fput(dbg_s->hwctx_f);
+ dbg_s->hwctx_f = NULL;
+
+ mutex_unlock(&ch_gk20a->dbg_s_lock);
+ mutex_unlock(&g->dbg_sessions_lock);
+
+ return 0;
+}
+
+int gk20a_dbg_gpu_dev_release(struct inode *inode, struct file *filp)
+{
+ struct dbg_session_gk20a *dbg_s = filp->private_data;
+
+ nvhost_dbg(dbg_gpu_dbg | dbg_fn, "%s", dev_name(dbg_s->dev));
+
+ /* unbind if it was bound */
+ if (!dbg_s->ch)
+ return 0;
+ dbg_unbind_channel_gk20a(dbg_s);
+
+ kfree(dbg_s);
+ return 0;
+}
+
+static int dbg_bind_channel_gk20a(struct dbg_session_gk20a *dbg_s,
+ struct nvhost_dbg_gpu_bind_channel_args *args)
+{
+ struct file *f;
+ struct nvhost_hwctx *hwctx;
+ struct gk20a *g;
+ struct channel_gk20a *ch_gk20a;
+
+ nvhost_dbg(dbg_fn|dbg_gpu_dbg, "%s fd=%d",
+ dev_name(dbg_s->dev), args->channel_fd);
+
+ if (args->channel_fd == ~0)
+ return dbg_unbind_channel_gk20a(dbg_s);
+
+ /* even though get_file_hwctx is doing this it releases it as well */
+ /* by holding it here we'll keep it from disappearing while the
+ * debugger is in session */
+ f = fget(args->channel_fd);
+ if (!f)
+ return -ENODEV;
+
+ hwctx = nvhost_channel_get_file_hwctx(args->channel_fd);
+ if (!hwctx) {
+ nvhost_dbg_fn("no hwctx found for fd");
+ fput(f);
+ return -EINVAL;
+ }
+ /* be sure this is actually the right type of hwctx */
+ if (hwctx->channel->dev != dbg_s->pdev) {
+ nvhost_dbg_fn("hwctx module type mismatch");
+ fput(f);
+ return -EINVAL;
+ }
+ if (!hwctx->priv) {
+ nvhost_dbg_fn("no priv");
+ fput(f);
+ return -ENODEV;
+ }
+
+ ch_gk20a = (struct channel_gk20a *)hwctx->priv;
+ g = ch_gk20a->g;
+ nvhost_dbg_fn("%s hwchid=%d", dev_name(dbg_s->dev), ch_gk20a->hw_chid);
+
+ mutex_lock(&g->dbg_sessions_lock);
+ mutex_lock(&ch_gk20a->dbg_s_lock);
+
+ if (ch_gk20a->dbg_s) {
+ mutex_unlock(&ch_gk20a->dbg_s_lock);
+ mutex_unlock(&g->dbg_sessions_lock);
+ fput(f);
+ nvhost_dbg_fn("hwctx already in dbg session");
+ return -EBUSY;
+ }
+
+ dbg_s->hwctx_f = f;
+ dbg_s->ch = ch_gk20a;
+ ch_gk20a->dbg_s = dbg_s;
+
+ if (g->dbg_sessions++ == 0) {
+ u32 curr = gk20a_clk_get_rate(g);
+
+ /* save off current powergate, clk state.
+ * set gpu module's can_powergate = 0.
+ * set gpu module's clk to max.
+ * while *a* debug session is active there will be no power or
+ * clocking state changes allowed from mainline code (but they
+ * should be saved).
+ */
+ nvhost_module_busy(dbg_s->pdev);
+
+ gr_gk20a_slcg_gr_load_gating_prod(g, false);
+ gr_gk20a_slcg_perf_load_gating_prod(g, false);
+
+ gr_gk20a_blcg_gr_load_gating_prod(g, false);
+ /* ??? gr_gk20a_pg_gr_load_gating_prod(g, false); */
+ /* TBD: would rather not change elcg_enabled here */
+ g->elcg_enabled = false;
+ gr_gk20a_init_elcg_mode(g, ELCG_RUN, ENGINE_GR_GK20A);
+ gr_gk20a_init_elcg_mode(g, ELCG_RUN, ENGINE_CE2_GK20A);
+
+ gk20a_pmu_disable_elpg(g);
+
+ }
+ mutex_unlock(&ch_gk20a->dbg_s_lock);
+ mutex_unlock(&g->dbg_sessions_lock);
+ return 0;
+}
+
+static int nvhost_ioctl_channel_reg_ops(struct dbg_session_gk20a *dbg_s,
+ struct nvhost_dbg_gpu_exec_reg_ops_args *args);
+
+long gk20a_dbg_gpu_dev_ioctl(struct file *filp, unsigned int cmd,
+ unsigned long arg)
+{
+ struct dbg_session_gk20a *dbg_s = filp->private_data;
+ struct gk20a *g = get_gk20a(dbg_s->pdev);
+ u8 buf[NVHOST_DBG_GPU_IOCTL_MAX_ARG_SIZE];
+ int err = 0;
+
+ nvhost_dbg_fn("");
+
+ if ((_IOC_TYPE(cmd) != NVHOST_DBG_GPU_IOCTL_MAGIC) ||
+ (_IOC_NR(cmd) == 0) ||
+ (_IOC_NR(cmd) > NVHOST_DBG_GPU_IOCTL_LAST))
+ return -EFAULT;
+
+ BUG_ON(_IOC_SIZE(cmd) > NVHOST_DBG_GPU_IOCTL_MAX_ARG_SIZE);
+
+ if (_IOC_DIR(cmd) & _IOC_WRITE) {
+ if (copy_from_user(buf, (void __user *)arg, _IOC_SIZE(cmd)))
+ return -EFAULT;
+ }
+
+ switch (cmd) {
+ case NVHOST_DBG_GPU_IOCTL_BIND_CHANNEL:
+ err = dbg_bind_channel_gk20a(dbg_s,
+ (struct nvhost_dbg_gpu_bind_channel_args *)buf);
+ nvhost_dbg(dbg_gpu_dbg, "ret=%d", err);
+ break;
+
+ case NVHOST_DBG_GPU_IOCTL_REG_OPS:
+ err = nvhost_ioctl_channel_reg_ops(dbg_s,
+ (struct nvhost_dbg_gpu_exec_reg_ops_args *)buf);
+ nvhost_dbg(dbg_gpu_dbg, "ret=%d", err);
+ break;
+
+ default:
+ nvhost_err(dev_from_gk20a(g),
+ "unrecognized dbg gpu ioctl cmd: 0x%x",
+ cmd);
+ err = -ENOTTY;
+ break;
+ }
+
+ if ((err == 0) && (_IOC_DIR(cmd) & _IOC_READ))
+ err = copy_to_user((void __user *)arg,
+ buf, _IOC_SIZE(cmd));
+
+ return err;
+}
+
+/* In order to perform a context relative op the context has
+ * to be created already... which would imply that the
+ * context switch mechanism has already been put in place.
+ * So by the time we perform such an operation it should always
+ * be possible to query for the appropriate context offsets, etc.
+ *
+ * But note: while the dbg_gpu bind requires a channel fd with
+ * a bound hwctx it doesn't require an allocated gr/compute obj
+ * at that point... so just having the bound hwctx doesn't work
+ * to guarantee this.
+ */
+static bool gr_context_info_available(struct dbg_session_gk20a *dbg_s,
+ struct gr_gk20a *gr)
+{
+ int err;
+
+ mutex_lock(&gr->ctx_mutex);
+ err = !gr->ctx_vars.golden_image_initialized;
+ mutex_unlock(&gr->ctx_mutex);
+ if (err)
+ return false;
+ return true;
+
+}
+
+static int nvhost_ioctl_channel_reg_ops(struct dbg_session_gk20a *dbg_s,
+ struct nvhost_dbg_gpu_exec_reg_ops_args *args)
+{
+ int err;
+ struct device *dev = dbg_s->dev;
+ struct gk20a *g = get_gk20a(dbg_s->pdev);
+ struct nvhost_dbg_gpu_reg_op *ops;
+ u64 ops_size = sizeof(ops[0]) * args->num_ops;
+
+ nvhost_dbg_fn("%d ops, total size %llu", args->num_ops, ops_size);
+
+ if (!dbg_s->ops) {
+ nvhost_err(dev, "can't call reg_ops on an unbound debugger session");
+ return -EINVAL;
+ }
+
+ /* be sure that ctx info is in place */
+ if (!gr_context_info_available(dbg_s, &g->gr)) {
+ nvhost_err(dev, "gr context data not available\n");
+ return -ENODEV;
+ }
+
+ ops = kzalloc(ops_size, GFP_KERNEL);
+ if (!ops) {
+ nvhost_err(dev, "Allocating memory failed!");
+ return -ENOMEM;
+ }
+
+ nvhost_dbg_fn("Copying regops from userspace");
+
+ if (copy_from_user(ops, (void *)(uintptr_t)args->ops, ops_size)) {
+ dev_err(dev, "copy_from_user failed!");
+ return -EFAULT;
+ }
+
+ err = dbg_s->ops->exec_reg_ops(dbg_s, ops, args->num_ops);
+
+ if (err) {
+ nvhost_err(dev, "dbg regops failed");
+ return err;
+ }
+
+ nvhost_dbg_fn("Copying result to userspace");
+
+ if (copy_to_user((void *)(uintptr_t)args->ops, ops, ops_size)) {
+ dev_err(dev, "copy_to_user failed!");
+ return -EFAULT;
+ }
+ return 0;
+}
diff --git a/drivers/video/tegra/host/gk20a/dbg_gpu_gk20a.h b/drivers/video/tegra/host/gk20a/dbg_gpu_gk20a.h
new file mode 100644
index 000000000000..48958b3f5eee
--- /dev/null
+++ b/drivers/video/tegra/host/gk20a/dbg_gpu_gk20a.h
@@ -0,0 +1,51 @@
+/*
+ * Tegra GK20A GPU Debugger Driver
+ *
+ * Copyright (c) 2013, NVIDIA CORPORATION. All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms and conditions of the GNU General Public License,
+ * version 2, as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
+ * more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program. If not, see <http://www.gnu.org/licenses/>.
+ */
+#ifndef __DBG_GPU_GK20A_H_
+#define __DBG_GPU_GK20A_H_
+
+/* module debug driver interface */
+int gk20a_dbg_gpu_dev_release(struct inode *inode, struct file *filp);
+int gk20a_dbg_gpu_dev_open(struct inode *inode, struct file *filp);
+long gk20a_dbg_gpu_dev_ioctl(struct file *filp, unsigned int cmd, unsigned long arg);
+
+struct dbg_gpu_session_ops {
+ int (*exec_reg_ops)(struct dbg_session_gk20a *dbg_s,
+ struct nvhost_dbg_gpu_reg_op *ops,
+ u64 num_ops);
+};
+
+struct dbg_session_gk20a {
+ /* dbg session id used for trace/prints */
+ int id;
+
+ /* gpu module vagaries */
+ struct device *dev;
+ struct platform_device *pdev;
+ struct nvhost_device_data *pdata;
+
+ /* bound hwctx and channel */
+ struct file *hwctx_f;
+ struct channel_gk20a *ch;
+
+ /* session operations */
+ struct dbg_gpu_session_ops *ops;
+};
+
+extern struct dbg_gpu_session_ops dbg_gpu_session_ops_gk20a;
+
+#endif /* __DBG_GPU_GK20A_H_ */
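For orientation between the kernel-side files above and below, a minimal user-space sketch of driving the new dbg-<dev> node could look like the following. It is an assumption-laden illustration, not part of the patch: only the ioctl name, the args struct name, and its channel_fd member come from the hunks in this commit; the device path and everything else are guesses.

	/* Hypothetical user-space usage sketch. */
	#include <fcntl.h>
	#include <unistd.h>
	#include <sys/ioctl.h>
	#include <linux/nvhost_dbg_gpu_ioctl.h>

	static int bind_dbg_session(int channel_fd)
	{
		struct nvhost_dbg_gpu_bind_channel_args args = { 0 };
		int dbg_fd = open("/dev/nvhost-dbg-gpu", O_RDWR); /* path assumed */

		if (dbg_fd < 0)
			return -1;

		args.channel_fd = channel_fd;
		if (ioctl(dbg_fd, NVHOST_DBG_GPU_IOCTL_BIND_CHANNEL, &args) < 0) {
			close(dbg_fd);
			return -1;
		}
		/* the session stays bound until dbg_fd is closed */
		return dbg_fd;
	}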
diff --git a/drivers/video/tegra/host/gk20a/gk20a.c b/drivers/video/tegra/host/gk20a/gk20a.c
index 8be8f4bd3ff2..f564a151155c 100644
--- a/drivers/video/tegra/host/gk20a/gk20a.c
+++ b/drivers/video/tegra/host/gk20a/gk20a.c
@@ -49,6 +49,7 @@
#include "hw_sim_gk20a.h"
#include "gk20a_scale.h"
#include "gr3d/pod_scaling.h"
+#include "dbg_gpu_gk20a.h"
#include "../../../../../arch/arm/mach-tegra/iomap.h"
@@ -89,6 +90,17 @@ const struct file_operations tegra_gk20a_ctrl_ops = {
.unlocked_ioctl = gk20a_ctrl_dev_ioctl,
};
+const struct file_operations tegra_gk20a_dbg_gpu_ops = {
+ .owner = THIS_MODULE,
+ .release = gk20a_dbg_gpu_dev_release,
+ .open = gk20a_dbg_gpu_dev_open,
+ .unlocked_ioctl = gk20a_dbg_gpu_dev_ioctl,
+#ifdef CONFIG_COMPAT
+ .compat_ioctl = gk20a_dbg_gpu_dev_ioctl,
+#endif
+
+};
+
static inline void sim_writel(struct gk20a *g, u32 r, u32 v)
{
writel(v, g->sim.regs+r);
@@ -532,6 +544,8 @@ int nvhost_init_gk20a_support(struct platform_device *dev)
goto fail;
}
+ mutex_init(&g->dbg_sessions_lock);
+
/* nvhost_as alloc_share can be called before gk20a is powered on.
It requires mm sw states configured so init mm sw early here. */
err = gk20a_init_mm_setup_sw(g);
diff --git a/drivers/video/tegra/host/gk20a/gk20a.h b/drivers/video/tegra/host/gk20a/gk20a.h
index 4add3dff4fb4..066b7aaae788 100644
--- a/drivers/video/tegra/host/gk20a/gk20a.h
+++ b/drivers/video/tegra/host/gk20a/gk20a.h
@@ -95,6 +95,12 @@ struct gk20a {
struct dentry *debugfs_timeouts_enabled;
struct dentry *debugfs_gr_idle_timeout_default;
#endif
+
+ /* held while manipulating # of debug sessions present */
+ /* also prevents debug sessions from attaching until released */
+ struct mutex dbg_sessions_lock;
+ int dbg_sessions; /* number attached */
+
void (*remove_support)(struct platform_device *);
struct notifier_block system_suspend_notifier;
@@ -248,6 +254,7 @@ int clk_gk20a_debugfs_init(struct platform_device *dev);
#endif
extern const struct file_operations tegra_gk20a_ctrl_ops;
+extern const struct file_operations tegra_gk20a_dbg_gpu_ops;
struct nvhost_hwctx_handler *nvhost_gk20a_alloc_hwctx_handler(u32 syncpt,
u32 waitbase, struct nvhost_channel *ch);
diff --git a/drivers/video/tegra/host/gk20a/gr_ctx_gk20a.h b/drivers/video/tegra/host/gk20a/gr_ctx_gk20a.h
index ab403df84b51..909a166ae9c3 100644
--- a/drivers/video/tegra/host/gk20a/gr_ctx_gk20a.h
+++ b/drivers/video/tegra/host/gk20a/gr_ctx_gk20a.h
@@ -1,9 +1,7 @@
/*
- * drivers/video/tegra/host/gk20a/gr_ctx_gk20a.h
- *
* GK20A Graphics Context
*
- * Copyright (c) 2011, NVIDIA CORPORATION. All rights reserved.
+ * Copyright (c) 2011-2013, NVIDIA CORPORATION. All rights reserved.
*
* This program is free software; you can redistribute it and/or modify it
* under the terms and conditions of the GNU General Public License,
@@ -14,9 +12,8 @@
* FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
* more details.
*
- * You should have received a copy of the GNU General Public License along with
- * this program; if not, write to the Free Software Foundation, Inc.,
- * 51 Franklin St - Fifth Floor, Boston, MA 02110-1301 USA.
+ * You should have received a copy of the GNU General Public License
+ * along with this program. If not, see <http://www.gnu.org/licenses/>.
*/
#ifndef __GR_CTX_GK20A_H__
#define __GR_CTX_GK20A_H__
diff --git a/drivers/video/tegra/host/gk20a/gr_gk20a.c b/drivers/video/tegra/host/gk20a/gr_gk20a.c
index 45f9392f9d95..b526e31abf5a 100644
--- a/drivers/video/tegra/host/gk20a/gr_gk20a.c
+++ b/drivers/video/tegra/host/gk20a/gr_gk20a.c
@@ -24,6 +24,7 @@
#include <linux/scatterlist.h>
#include <linux/nvmap.h>
#include <linux/tegra-soc.h>
+#include <linux/nvhost_dbg_gpu_ioctl.h>
#include "../dev.h"
@@ -49,10 +50,14 @@
#include "chip_support.h"
#include "nvhost_memmgr.h"
#include "gk20a_gating_reglist.h"
+#include "gr_pri_gk20a.h"
+#include "regops_gk20a.h"
+
+
static int gr_gk20a_commit_inst(struct channel_gk20a *c, u64 gpu_va);
-static int gr_gk20a_ctx_patch_write(struct gk20a *g, struct channel_gk20a *c,
- u32 addr, u32 data, u32 patch);
+static int gr_gk20a_ctx_patch_write(struct gk20a *g, struct channel_ctx_gk20a *ch_ctx,
+ u32 addr, u32 data, bool patch);
/* global ctx buffer */
static int gr_gk20a_alloc_global_ctx_buffers(struct gk20a *g);
@@ -433,35 +438,92 @@ static int gr_gk20a_ctx_wait_ucode(struct gk20a *g, u32 mailbox_id,
return 0;
}
-int gr_gk20a_submit_fecs_method(struct gk20a *g,
- u32 mb_id, u32 mb_data, u32 mb_clr,
- u32 mtd_data, u32 mtd_adr, u32 *mb_ret,
- u32 opc_ok, u32 mb_ok, u32 opc_fail, u32 mb_fail)
+/* The following is a less brittle way to call gr_gk20a_submit_fecs_method(...)
+ * We should replace most, if not all, fecs method calls to this instead. */
+struct fecs_method_op_gk20a {
+ struct {
+ u32 addr;
+ u32 data;
+ } method;
+
+ struct {
+ u32 id;
+ u32 data;
+ u32 clr;
+ u32 *ret;
+ u32 ok;
+ u32 fail;
+ } mailbox;
+
+ struct {
+ u32 ok;
+ u32 fail;
+ } cond;
+
+};
+
+int gr_gk20a_submit_fecs_method_op(struct gk20a *g,
+ struct fecs_method_op_gk20a op)
{
struct gr_gk20a *gr = &g->gr;
int ret;
mutex_lock(&gr->fecs_mutex);
- if (mb_id != 0)
- gk20a_writel(g, gr_fecs_ctxsw_mailbox_r(mb_id),
- mb_data);
+ if (op.mailbox.id != 0)
+ gk20a_writel(g, gr_fecs_ctxsw_mailbox_r(op.mailbox.id),
+ op.mailbox.data);
gk20a_writel(g, gr_fecs_ctxsw_mailbox_clear_r(0),
- gr_fecs_ctxsw_mailbox_clear_value_f(mb_clr));
+ gr_fecs_ctxsw_mailbox_clear_value_f(op.mailbox.clr));
- gk20a_writel(g, gr_fecs_method_data_r(), mtd_data);
+ gk20a_writel(g, gr_fecs_method_data_r(), op.method.data);
gk20a_writel(g, gr_fecs_method_push_r(),
- gr_fecs_method_push_adr_f(mtd_adr));
+ gr_fecs_method_push_adr_f(op.method.addr));
+
+ /* op.mailbox.id == 4 cases require waiting for completion
+ * on mailbox 0 */
+ if (op.mailbox.id == 4)
+ op.mailbox.id = 0;
- ret = gr_gk20a_ctx_wait_ucode(g, 0, mb_ret,
- opc_ok, mb_ok, opc_fail, mb_fail);
+ ret = gr_gk20a_ctx_wait_ucode(g, op.mailbox.id, op.mailbox.ret,
+ op.cond.ok, op.mailbox.ok,
+ op.cond.fail, op.mailbox.fail);
mutex_unlock(&gr->fecs_mutex);
return ret;
}
+int gr_gk20a_ctrl_ctxsw(struct gk20a *g, u32 fecs_method, u32 *ret)
+{
+ return gr_gk20a_submit_fecs_method_op(g,
+ (struct fecs_method_op_gk20a) {
+ .method.addr = fecs_method,
+ .method.data = ~0,
+ .mailbox = { .id = 1, /*sideband?*/
+ .data = ~0, .clr = ~0, .ret = ret,
+ .ok = gr_fecs_ctxsw_mailbox_value_pass_v(),
+ .fail = gr_fecs_ctxsw_mailbox_value_fail_v(), },
+ .cond.ok = GR_IS_UCODE_OP_EQUAL,
+ .cond.fail = GR_IS_UCODE_OP_EQUAL });
+}
+
+/* Stop processing (stall) context switches at FECS */
+int gr_gk20a_disable_ctxsw(struct gk20a *g)
+{
+ nvhost_dbg(dbg_fn | dbg_gpu_dbg, "");
+ return gr_gk20a_ctrl_ctxsw(g, gr_fecs_method_push_adr_stop_ctxsw_v(), 0);
+}
+
+/* Start processing (continue) context switches at FECS */
+int gr_gk20a_enable_ctxsw(struct gk20a *g)
+{
+ nvhost_dbg(dbg_fn | dbg_gpu_dbg, "");
+ return gr_gk20a_ctrl_ctxsw(g, gr_fecs_method_push_adr_start_ctxsw_v(), 0);
+}
+
+
static int gr_gk20a_commit_inst(struct channel_gk20a *c, u64 gpu_va)
{
u32 addr_lo;
@@ -504,33 +566,92 @@ clean_up:
return ret;
}
-static int gr_gk20a_ctx_patch_write(struct gk20a *g, struct channel_gk20a *c,
- u32 addr, u32 data, u32 patch)
+/*
+ * Context state can be written directly or "patched" at times.
+ * So that code can be used in either situation it is written
+ * using a series of _ctx_patch_write(..., patch) statements.
+ * However any necessary cpu map/unmap and gpu l2 invalidates
+ * should be minimized (to avoid doing it once per patch write).
+ * Before a sequence of these, set up with "_ctx_patch_write_begin"
+ * and close with "_ctx_patch_write_end."
+ */
+static int gr_gk20a_ctx_patch_write_begin(struct gk20a *g,
+ struct channel_ctx_gk20a *ch_ctx)
+{
+ /* being defensive still... */
+ if (ch_ctx->patch_ctx.cpu_va) {
+ nvhost_err(dev_from_gk20a(g), "nested ctx patch begin?");
+ return -EBUSY;
+ }
+
+ ch_ctx->patch_ctx.cpu_va =
+ nvhost_memmgr_mmap(ch_ctx->patch_ctx.mem.ref);
+
+ if (!ch_ctx->patch_ctx.cpu_va)
+ return -ENOMEM;
+
+ return 0;
+}
+
+static int gr_gk20a_ctx_patch_write_end(struct gk20a *g,
+ struct channel_ctx_gk20a *ch_ctx)
+{
+ /* being defensive still... */
+ if (!ch_ctx->patch_ctx.cpu_va) {
+ nvhost_err(dev_from_gk20a(g), "dangling ctx patch end?");
+ return -EINVAL;
+ }
+
+ nvhost_memmgr_munmap(ch_ctx->patch_ctx.mem.ref,
+ ch_ctx->patch_ctx.cpu_va);
+ ch_ctx->patch_ctx.cpu_va = NULL;
+
+ gk20a_mm_l2_invalidate(g);
+ return 0;
+}
+
+static int gr_gk20a_ctx_patch_write(struct gk20a *g,
+ struct channel_ctx_gk20a *ch_ctx,
+ u32 addr, u32 data, bool patch)
{
- struct channel_ctx_gk20a *ch_ctx;
u32 patch_slot = 0;
void *patch_ptr = NULL;
+ bool mapped_here = false;
- BUG_ON(patch != 0 && c == NULL);
+ BUG_ON(patch != 0 && ch_ctx == NULL);
if (patch) {
- ch_ctx = &c->ch_ctx;
- patch_ptr = nvhost_memmgr_mmap(ch_ctx->patch_ctx.mem.ref);
- if (!patch_ptr)
- return -ENOMEM;
+ if (!ch_ctx)
+ return -EINVAL;
+ /* we added an optimization prolog, epilog
+ * to get rid of unnecessary maps and l2 invals.
+ * but be defensive still... */
+ if (!ch_ctx->patch_ctx.cpu_va) {
+ int err;
+ nvhost_err(dev_from_gk20a(g),
+ "per-write ctx patch begin?");
+ /* yes, gr_gk20a_ctx_patch_smpc causes this one */
+ err = gr_gk20a_ctx_patch_write_begin(g, ch_ctx);
+ if (err)
+ return err;
+ mapped_here = true;
+ } else {
+ mapped_here = false;
+ patch_ptr = ch_ctx->patch_ctx.cpu_va;
+ }
patch_slot = ch_ctx->patch_ctx.data_count * 2;
mem_wr32(patch_ptr, patch_slot++, addr);
mem_wr32(patch_ptr, patch_slot++, data);
- nvhost_memmgr_munmap(ch_ctx->patch_ctx.mem.ref, patch_ptr);
- gk20a_mm_l2_invalidate(g);
-
ch_ctx->patch_ctx.data_count++;
- } else {
+
+ if (mapped_here)
+ gr_gk20a_ctx_patch_write_end(g, ch_ctx);
+
+ } else
gk20a_writel(g, addr, data);
- }
return 0;
}
@@ -545,12 +666,19 @@ static int gr_gk20a_fecs_ctx_bind_channel(struct gk20a *g,
nvhost_dbg_info("bind channel %d inst ptr 0x%08x",
c->hw_chid, inst_base_ptr);
- ret = gr_gk20a_submit_fecs_method(g, 0, 0, 0x30,
- gr_fecs_current_ctx_ptr_f(inst_base_ptr) |
- gr_fecs_current_ctx_target_vid_mem_f() |
- gr_fecs_current_ctx_valid_f(1),
- gr_fecs_method_push_adr_bind_pointer_v(),
- 0, GR_IS_UCODE_OP_AND, 0x10, GR_IS_UCODE_OP_AND, 0x20);
+ ret = gr_gk20a_submit_fecs_method_op(g,
+ (struct fecs_method_op_gk20a) {
+ .method.addr = gr_fecs_method_push_adr_bind_pointer_v(),
+ .method.data = (gr_fecs_current_ctx_ptr_f(inst_base_ptr) |
+ gr_fecs_current_ctx_target_vid_mem_f() |
+ gr_fecs_current_ctx_valid_f(1)),
+ .mailbox = { .id = 0, .data = 0,
+ .clr = 0x30,
+ .ret = NULL,
+ .ok = 0x10,
+ .fail = 0x20, },
+ .cond.ok = GR_IS_UCODE_OP_AND,
+ .cond.fail = GR_IS_UCODE_OP_AND});
if (ret)
nvhost_err(dev_from_gk20a(g),
"bind channel instance failed");
@@ -621,9 +749,10 @@ clean_up:
}
static int gr_gk20a_commit_global_cb_manager(struct gk20a *g,
- struct channel_gk20a *c, u32 patch)
+ struct channel_gk20a *c, bool patch)
{
struct gr_gk20a *gr = &g->gr;
+ struct channel_ctx_gk20a *ch_ctx = NULL;
u32 attrib_offset_in_chunk = 0;
u32 alpha_offset_in_chunk = 0;
u32 pd_ab_max_output;
@@ -633,7 +762,15 @@ static int gr_gk20a_commit_global_cb_manager(struct gk20a *g,
nvhost_dbg_fn("");
- gr_gk20a_ctx_patch_write(g, c, gr_ds_tga_constraintlogic_r(),
+ if (patch) {
+ int err;
+ ch_ctx = &c->ch_ctx;
+ err = gr_gk20a_ctx_patch_write_begin(g, ch_ctx);
+ if (err)
+ return err;
+ }
+
+ gr_gk20a_ctx_patch_write(g, ch_ctx, gr_ds_tga_constraintlogic_r(),
gr_ds_tga_constraintlogic_beta_cbsize_f(gr->attrib_cb_default_size) |
gr_ds_tga_constraintlogic_alpha_cbsize_f(gr->alpha_cb_default_size),
patch);
@@ -642,7 +779,7 @@ static int gr_gk20a_commit_global_cb_manager(struct gk20a *g,
gr_gpc0_ppc0_cbm_cfg_size_granularity_v()) /
gr_pd_ab_dist_cfg1_max_output_granularity_v();
- gr_gk20a_ctx_patch_write(g, c, gr_pd_ab_dist_cfg1_r(),
+ gr_gk20a_ctx_patch_write(g, ch_ctx, gr_pd_ab_dist_cfg1_r(),
gr_pd_ab_dist_cfg1_max_output_f(pd_ab_max_output) |
gr_pd_ab_dist_cfg1_max_batches_init_f(), patch);
@@ -658,7 +795,7 @@ static int gr_gk20a_commit_global_cb_manager(struct gk20a *g,
cbm_cfg_size2 = gr->alpha_cb_default_size *
gr->pes_tpc_count[ppc_index][gpc_index];
- gr_gk20a_ctx_patch_write(g, c,
+ gr_gk20a_ctx_patch_write(g, ch_ctx,
gr_gpc0_ppc0_cbm_cfg_r() + temp +
proj_ppc_in_gpc_stride_v() * ppc_index,
gr_gpc0_ppc0_cbm_cfg_timeslice_mode_f(gr->timeslice_mode) |
@@ -668,7 +805,7 @@ static int gr_gk20a_commit_global_cb_manager(struct gk20a *g,
attrib_offset_in_chunk += gr->attrib_cb_size *
gr->pes_tpc_count[ppc_index][gpc_index];
- gr_gk20a_ctx_patch_write(g, c,
+ gr_gk20a_ctx_patch_write(g, ch_ctx,
gr_gpc0_ppc0_cbm_cfg2_r() + temp +
proj_ppc_in_gpc_stride_v() * ppc_index,
gr_gpc0_ppc0_cbm_cfg2_start_offset_f(alpha_offset_in_chunk) |
@@ -679,11 +816,14 @@ static int gr_gk20a_commit_global_cb_manager(struct gk20a *g,
}
}
+ if (patch)
+ gr_gk20a_ctx_patch_write_end(g, ch_ctx);
+
return 0;
}
static int gr_gk20a_commit_global_ctx_buffers(struct gk20a *g,
- struct channel_gk20a *c, u32 patch)
+ struct channel_gk20a *c, bool patch)
{
struct gr_gk20a *gr = &g->gr;
struct channel_ctx_gk20a *ch_ctx = &c->ch_ctx;
@@ -692,6 +832,12 @@ static int gr_gk20a_commit_global_ctx_buffers(struct gk20a *g,
u32 data;
nvhost_dbg_fn("");
+ if (patch) {
+ int err;
+ err = gr_gk20a_ctx_patch_write_begin(g, ch_ctx);
+ if (err)
+ return err;
+ }
/* global pagepool buffer */
addr = (u64_lo32(ch_ctx->global_ctx_buffer_va[PAGEPOOL_VA]) >>
@@ -708,20 +854,20 @@ static int gr_gk20a_commit_global_ctx_buffers(struct gk20a *g,
nvhost_dbg_info("pagepool buffer addr : 0x%016llx, size : %d",
addr, size);
- gr_gk20a_ctx_patch_write(g, c, gr_scc_pagepool_base_r(),
+ gr_gk20a_ctx_patch_write(g, ch_ctx, gr_scc_pagepool_base_r(),
gr_scc_pagepool_base_addr_39_8_f(addr), patch);
- gr_gk20a_ctx_patch_write(g, c, gr_scc_pagepool_r(),
+ gr_gk20a_ctx_patch_write(g, ch_ctx, gr_scc_pagepool_r(),
gr_scc_pagepool_total_pages_f(size) |
gr_scc_pagepool_valid_true_f(), patch);
- gr_gk20a_ctx_patch_write(g, c, gr_gpcs_gcc_pagepool_base_r(),
+ gr_gk20a_ctx_patch_write(g, ch_ctx, gr_gpcs_gcc_pagepool_base_r(),
gr_gpcs_gcc_pagepool_base_addr_39_8_f(addr), patch);
- gr_gk20a_ctx_patch_write(g, c, gr_gpcs_gcc_pagepool_r(),
+ gr_gk20a_ctx_patch_write(g, ch_ctx, gr_gpcs_gcc_pagepool_r(),
gr_gpcs_gcc_pagepool_total_pages_f(size), patch);
- gr_gk20a_ctx_patch_write(g, c, gr_pd_pagepool_r(),
+ gr_gk20a_ctx_patch_write(g, ch_ctx, gr_pd_pagepool_r(),
gr_pd_pagepool_total_pages_f(size) |
gr_pd_pagepool_valid_true_f(), patch);
@@ -736,17 +882,17 @@ static int gr_gk20a_commit_global_ctx_buffers(struct gk20a *g,
nvhost_dbg_info("bundle cb addr : 0x%016llx, size : %d",
addr, size);
- gr_gk20a_ctx_patch_write(g, c, gr_scc_bundle_cb_base_r(),
+ gr_gk20a_ctx_patch_write(g, ch_ctx, gr_scc_bundle_cb_base_r(),
gr_scc_bundle_cb_base_addr_39_8_f(addr), patch);
- gr_gk20a_ctx_patch_write(g, c, gr_scc_bundle_cb_size_r(),
+ gr_gk20a_ctx_patch_write(g, ch_ctx, gr_scc_bundle_cb_size_r(),
gr_scc_bundle_cb_size_div_256b_f(size) |
gr_scc_bundle_cb_size_valid_true_f(), patch);
- gr_gk20a_ctx_patch_write(g, c, gr_gpcs_setup_bundle_cb_base_r(),
+ gr_gk20a_ctx_patch_write(g, ch_ctx, gr_gpcs_setup_bundle_cb_base_r(),
gr_gpcs_setup_bundle_cb_base_addr_39_8_f(addr), patch);
- gr_gk20a_ctx_patch_write(g, c, gr_gpcs_setup_bundle_cb_size_r(),
+ gr_gk20a_ctx_patch_write(g, ch_ctx, gr_gpcs_setup_bundle_cb_size_r(),
gr_gpcs_setup_bundle_cb_size_div_256b_f(size) |
gr_gpcs_setup_bundle_cb_size_valid_true_f(), patch);
@@ -760,7 +906,7 @@ static int gr_gk20a_commit_global_ctx_buffers(struct gk20a *g,
nvhost_dbg_info("bundle cb token limit : %d, state limit : %d",
gr->bundle_cb_token_limit, data);
- gr_gk20a_ctx_patch_write(g, c, gr_pd_ab_dist_cfg2_r(),
+ gr_gk20a_ctx_patch_write(g, ch_ctx, gr_pd_ab_dist_cfg2_r(),
gr_pd_ab_dist_cfg2_token_limit_f(gr->bundle_cb_token_limit) |
gr_pd_ab_dist_cfg2_state_limit_f(data), patch);
@@ -772,20 +918,24 @@ static int gr_gk20a_commit_global_ctx_buffers(struct gk20a *g,
nvhost_dbg_info("attrib cb addr : 0x%016llx", addr);
- gr_gk20a_ctx_patch_write(g, c, gr_gpcs_setup_attrib_cb_base_r(),
+ gr_gk20a_ctx_patch_write(g, ch_ctx, gr_gpcs_setup_attrib_cb_base_r(),
gr_gpcs_setup_attrib_cb_base_addr_39_12_f(addr) |
gr_gpcs_setup_attrib_cb_base_valid_true_f(), patch);
- gr_gk20a_ctx_patch_write(g, c, gr_gpcs_tpcs_pe_pin_cb_global_base_addr_r(),
+ gr_gk20a_ctx_patch_write(g, ch_ctx, gr_gpcs_tpcs_pe_pin_cb_global_base_addr_r(),
gr_gpcs_tpcs_pe_pin_cb_global_base_addr_v_f(addr) |
gr_gpcs_tpcs_pe_pin_cb_global_base_addr_valid_true_f(), patch);
+ if (patch)
+ gr_gk20a_ctx_patch_write_end(g, ch_ctx);
+
return 0;
}
-static int gr_gk20a_commit_global_timeslice(struct gk20a *g, struct channel_gk20a *c, u32 patch)
+static int gr_gk20a_commit_global_timeslice(struct gk20a *g, struct channel_gk20a *c, bool patch)
{
struct gr_gk20a *gr = &g->gr;
+ struct channel_ctx_gk20a *ch_ctx = NULL;
u32 gpm_pd_cfg;
u32 pd_ab_dist_cfg0;
u32 ds_debug;
@@ -800,6 +950,14 @@ static int gr_gk20a_commit_global_timeslice(struct gk20a *g, struct channel_gk20
ds_debug = gk20a_readl(g, gr_ds_debug_r());
mpc_vtg_debug = gk20a_readl(g, gr_gpcs_tpcs_mpc_vtg_debug_r());
+ if (patch) {
+ int err;
+ ch_ctx = &c->ch_ctx;
+ err = gr_gk20a_ctx_patch_write_begin(g, ch_ctx);
+ if (err)
+ return err;
+ }
+
if (gr->timeslice_mode == gr_gpcs_ppcs_cbm_cfg_timeslice_mode_enable_v()) {
pe_vaf = gk20a_readl(g, gr_gpcs_tpcs_pe_vaf_r());
pe_vsc_vpc = gk20a_readl(g, gr_gpcs_tpcs_pes_vsc_vpc_r());
@@ -811,24 +969,27 @@ static int gr_gk20a_commit_global_timeslice(struct gk20a *g, struct channel_gk20
ds_debug = gr_ds_debug_timeslice_mode_enable_f() | ds_debug;
mpc_vtg_debug = gr_gpcs_tpcs_mpc_vtg_debug_timeslice_mode_enabled_f() | mpc_vtg_debug;
- gr_gk20a_ctx_patch_write(g, c, gr_gpcs_gpm_pd_cfg_r(), gpm_pd_cfg, patch);
- gr_gk20a_ctx_patch_write(g, c, gr_gpcs_tpcs_pe_vaf_r(), pe_vaf, patch);
- gr_gk20a_ctx_patch_write(g, c, gr_gpcs_tpcs_pes_vsc_vpc_r(), pe_vsc_vpc, patch);
- gr_gk20a_ctx_patch_write(g, c, gr_pd_ab_dist_cfg0_r(), pd_ab_dist_cfg0, patch);
- gr_gk20a_ctx_patch_write(g, c, gr_ds_debug_r(), ds_debug, patch);
- gr_gk20a_ctx_patch_write(g, c, gr_gpcs_tpcs_mpc_vtg_debug_r(), mpc_vtg_debug, patch);
+ gr_gk20a_ctx_patch_write(g, ch_ctx, gr_gpcs_gpm_pd_cfg_r(), gpm_pd_cfg, patch);
+ gr_gk20a_ctx_patch_write(g, ch_ctx, gr_gpcs_tpcs_pe_vaf_r(), pe_vaf, patch);
+ gr_gk20a_ctx_patch_write(g, ch_ctx, gr_gpcs_tpcs_pes_vsc_vpc_r(), pe_vsc_vpc, patch);
+ gr_gk20a_ctx_patch_write(g, ch_ctx, gr_pd_ab_dist_cfg0_r(), pd_ab_dist_cfg0, patch);
+ gr_gk20a_ctx_patch_write(g, ch_ctx, gr_ds_debug_r(), ds_debug, patch);
+ gr_gk20a_ctx_patch_write(g, ch_ctx, gr_gpcs_tpcs_mpc_vtg_debug_r(), mpc_vtg_debug, patch);
} else {
gpm_pd_cfg = gr_gpcs_gpm_pd_cfg_timeslice_mode_disable_f() | gpm_pd_cfg;
pd_ab_dist_cfg0 = gr_pd_ab_dist_cfg0_timeslice_enable_dis_f() | pd_ab_dist_cfg0;
ds_debug = gr_ds_debug_timeslice_mode_disable_f() | ds_debug;
mpc_vtg_debug = gr_gpcs_tpcs_mpc_vtg_debug_timeslice_mode_disabled_f() | mpc_vtg_debug;
- gr_gk20a_ctx_patch_write(g, c, gr_gpcs_gpm_pd_cfg_r(), gpm_pd_cfg, patch);
- gr_gk20a_ctx_patch_write(g, c, gr_pd_ab_dist_cfg0_r(), pd_ab_dist_cfg0, patch);
- gr_gk20a_ctx_patch_write(g, c, gr_ds_debug_r(), ds_debug, patch);
- gr_gk20a_ctx_patch_write(g, c, gr_gpcs_tpcs_mpc_vtg_debug_r(), mpc_vtg_debug, patch);
+ gr_gk20a_ctx_patch_write(g, ch_ctx, gr_gpcs_gpm_pd_cfg_r(), gpm_pd_cfg, patch);
+ gr_gk20a_ctx_patch_write(g, ch_ctx, gr_pd_ab_dist_cfg0_r(), pd_ab_dist_cfg0, patch);
+ gr_gk20a_ctx_patch_write(g, ch_ctx, gr_ds_debug_r(), ds_debug, patch);
+ gr_gk20a_ctx_patch_write(g, ch_ctx, gr_gpcs_tpcs_mpc_vtg_debug_r(), mpc_vtg_debug, patch);
}
+ if (patch)
+ gr_gk20a_ctx_patch_write_end(g, ch_ctx);
+
return 0;
}
@@ -1147,7 +1308,7 @@ static int gr_gk20a_ctx_state_floorsweep(struct gk20a *g)
gk20a_writel(g, gr_ds_num_tpc_per_gpc_r(tpc_index), tpc_per_gpc);
}
- /* grSetupPDMapping stubbed for gk20a */
+ /* gr__setup_pd_mapping stubbed for gk20a */
gr_gk20a_setup_rop_mapping(g, gr);
gr_gk20a_setup_alpha_beta_tables(g, gr);
@@ -1192,13 +1353,22 @@ static int gr_gk20a_fecs_ctx_image_save(struct channel_gk20a *c, u32 save_type)
u64_lo32(sg_phys(c->inst_block.mem.sgt->sgl)
>> ram_in_base_shift_v());
+
nvhost_dbg_fn("");
- ret = gr_gk20a_submit_fecs_method(g, 0, 0, 3,
- gr_fecs_current_ctx_ptr_f(inst_base_ptr) |
- gr_fecs_current_ctx_target_vid_mem_f() |
- gr_fecs_current_ctx_valid_f(1), save_type, 0,
- GR_IS_UCODE_OP_AND, 1, GR_IS_UCODE_OP_AND, 2);
+ ret = gr_gk20a_submit_fecs_method_op(g,
+ (struct fecs_method_op_gk20a) {
+ .method.addr = save_type,
+ .method.data = (gr_fecs_current_ctx_ptr_f(inst_base_ptr) |
+ gr_fecs_current_ctx_target_vid_mem_f() |
+ gr_fecs_current_ctx_valid_f(1)),
+ .mailbox = {.id = 0, .data = 0, .clr = 3, .ret = NULL,
+ .ok = 1, .fail = 2,
+ },
+ .cond.ok = GR_IS_UCODE_OP_AND,
+ .cond.fail = GR_IS_UCODE_OP_AND,
+ });
+
if (ret)
nvhost_err(dev_from_gk20a(g), "save context image failed");
@@ -1234,7 +1404,7 @@ static int gr_gk20a_init_golden_ctx_image(struct gk20a *g,
if (err)
goto clean_up;
- err = gr_gk20a_commit_global_ctx_buffers(g, c, 0);
+ err = gr_gk20a_commit_global_ctx_buffers(g, c, false);
if (err)
goto clean_up;
@@ -1367,13 +1537,22 @@ static int gr_gk20a_load_golden_ctx_image(struct gk20a *g,
u64_lo32(sg_phys(c->inst_block.mem.sgt->sgl)
>> ram_in_base_shift_v());
- ret = gr_gk20a_submit_fecs_method(g, 0, 0, ~0,
- gr_fecs_current_ctx_ptr_f(inst_base_ptr) |
- gr_fecs_current_ctx_target_vid_mem_f() |
- gr_fecs_current_ctx_valid_f(1),
- gr_fecs_method_push_adr_restore_golden_v(), 0,
- GR_IS_UCODE_OP_EQUAL, gr_fecs_ctxsw_mailbox_value_pass_v(),
- GR_IS_UCODE_OP_SKIP, 0);
+ ret = gr_gk20a_submit_fecs_method_op(g,
+ (struct fecs_method_op_gk20a) {
+ .method.data =
+ (gr_fecs_current_ctx_ptr_f(inst_base_ptr) |
+ gr_fecs_current_ctx_target_vid_mem_f() |
+ gr_fecs_current_ctx_valid_f(1)),
+ .method.addr =
+ gr_fecs_method_push_adr_restore_golden_v(),
+ .mailbox = {
+ .id = 0, .data = 0,
+ .clr = ~0, .ret = NULL,
+ .ok = gr_fecs_ctxsw_mailbox_value_pass_v(),
+ .fail = 0},
+ .cond.ok = GR_IS_UCODE_OP_EQUAL,
+ .cond.fail = GR_IS_UCODE_OP_SKIP});
+
if (ret)
nvhost_err(dev_from_gk20a(g),
"restore context image failed");
@@ -1440,33 +1619,34 @@ static int gr_gk20a_init_ctx_state(struct gk20a *g, struct gr_gk20a *gr)
u32 zcull_ctx_image_size = 0;
u32 pm_ctx_image_size = 0;
u32 ret;
+ struct fecs_method_op_gk20a op = {
+ .mailbox = { .id = 0, .data = 0,
+ .clr = ~0, .ok = 0, .fail = 0},
+ .method.data = 0,
+ .cond.ok = GR_IS_UCODE_OP_NOT_EQUAL,
+ .cond.fail = GR_IS_UCODE_OP_SKIP,
+ };
nvhost_dbg_fn("");
-
- ret = gr_gk20a_submit_fecs_method(g, 0, 0, ~0, 0,
- gr_fecs_method_push_adr_discover_image_size_v(),
- &golden_ctx_image_size,
- GR_IS_UCODE_OP_NOT_EQUAL, 0, GR_IS_UCODE_OP_SKIP, 0);
+ op.method.addr = gr_fecs_method_push_adr_discover_image_size_v();
+ op.mailbox.ret = &golden_ctx_image_size;
+ ret = gr_gk20a_submit_fecs_method_op(g, op);
if (ret) {
nvhost_err(dev_from_gk20a(g),
"query golden image size failed");
return ret;
}
-
- ret = gr_gk20a_submit_fecs_method(g, 0, 0, ~0, 0,
- gr_fecs_method_push_adr_discover_zcull_image_size_v(),
- &zcull_ctx_image_size,
- GR_IS_UCODE_OP_NOT_EQUAL, 0, GR_IS_UCODE_OP_SKIP, 0);
+ op.method.addr = gr_fecs_method_push_adr_discover_zcull_image_size_v();
+ op.mailbox.ret = &zcull_ctx_image_size;
+ ret = gr_gk20a_submit_fecs_method_op(g, op);
if (ret) {
nvhost_err(dev_from_gk20a(g),
"query zcull ctx image size failed");
return ret;
}
-
- ret = gr_gk20a_submit_fecs_method(g, 0, 0, ~0, 0,
- gr_fecs_method_push_adr_discover_pm_image_size_v(),
- &pm_ctx_image_size,
- GR_IS_UCODE_OP_NOT_EQUAL, 0, GR_IS_UCODE_OP_SKIP, 0);
+ op.method.addr = gr_fecs_method_push_adr_discover_pm_image_size_v();
+ op.mailbox.ret = &pm_ctx_image_size;
+ ret = gr_gk20a_submit_fecs_method_op(g, op);
if (ret) {
nvhost_err(dev_from_gk20a(g),
"query pm ctx image size failed");
@@ -1943,10 +2123,10 @@ int gk20a_alloc_obj_ctx(struct channel_gk20a *c,
goto out;
}
gr_gk20a_elpg_protected_call(g,
- gr_gk20a_commit_global_ctx_buffers(g, c, 1));
+ gr_gk20a_commit_global_ctx_buffers(g, c, true));
}
- /* init gloden image, ELPG enabled after this is done */
+ /* init golden image, ELPG enabled after this is done */
err = gr_gk20a_init_golden_ctx_image(g, c);
if (err) {
nvhost_err(dev_from_gk20a(g),
@@ -3527,8 +3707,6 @@ static int gk20a_init_gr_setup_hw(struct gk20a *g)
gk20a_writel(g, sw_ctx_load->l[i].addr,
sw_ctx_load->l[i].value);
- /* TBD: add gr ctx overrides */
-
err = gr_gk20a_wait_idle(g, end_jiffies, GR_IDLE_CHECK_DEFAULT);
if (err)
goto out;
@@ -3541,8 +3719,8 @@ static int gk20a_init_gr_setup_hw(struct gk20a *g)
gr_fe_go_idle_timeout_count_disabled_f());
/* override a few ctx state registers */
- gr_gk20a_commit_global_cb_manager(g, NULL, 0);
- gr_gk20a_commit_global_timeslice(g, NULL, 0);
+ gr_gk20a_commit_global_cb_manager(g, NULL, false);
+ gr_gk20a_commit_global_timeslice(g, NULL, false);
/* floorsweep anything left */
gr_gk20a_ctx_state_floorsweep(g);
@@ -4328,25 +4506,52 @@ clean_up:
int gr_gk20a_fecs_get_reglist_img_size(struct gk20a *g, u32 *size)
{
BUG_ON(size == NULL);
- return gr_gk20a_submit_fecs_method(g, 0, 0, ~0, 1,
- gr_fecs_method_push_adr_discover_reglist_image_size_v(),
- size, GR_IS_UCODE_OP_NOT_EQUAL, 0, GR_IS_UCODE_OP_SKIP, 0);
+ return gr_gk20a_submit_fecs_method_op(g,
+ (struct fecs_method_op_gk20a) {
+ .mailbox.id = 0,
+ .mailbox.data = 0,
+ .mailbox.clr = ~0,
+ .method.data = 1,
+ .method.addr = gr_fecs_method_push_adr_discover_reglist_image_size_v(),
+ .mailbox.ret = size,
+ .cond.ok = GR_IS_UCODE_OP_NOT_EQUAL,
+ .mailbox.ok = 0,
+ .cond.fail = GR_IS_UCODE_OP_SKIP,
+ .mailbox.fail = 0});
}
int gr_gk20a_fecs_set_reglist_bind_inst(struct gk20a *g, phys_addr_t addr)
{
- return gr_gk20a_submit_fecs_method(g, 4,
- gr_fecs_current_ctx_ptr_f(addr >> 12) |
- gr_fecs_current_ctx_valid_f(1) | gr_fecs_current_ctx_target_vid_mem_f(),
- ~0, 1, gr_fecs_method_push_adr_set_reglist_bind_instance_v(),
- 0, GR_IS_UCODE_OP_EQUAL, 1, GR_IS_UCODE_OP_SKIP, 0);
+ return gr_gk20a_submit_fecs_method_op(g,
+ (struct fecs_method_op_gk20a){
+ .mailbox.id = 4,
+ .mailbox.data = (gr_fecs_current_ctx_ptr_f(addr >> 12) |
+ gr_fecs_current_ctx_valid_f(1) |
+ gr_fecs_current_ctx_target_vid_mem_f()),
+ .mailbox.clr = ~0,
+ .method.data = 1,
+ .method.addr = gr_fecs_method_push_adr_set_reglist_bind_instance_v(),
+ .mailbox.ret = NULL,
+ .cond.ok = GR_IS_UCODE_OP_EQUAL,
+ .mailbox.ok = 1,
+ .cond.fail = GR_IS_UCODE_OP_SKIP,
+ .mailbox.fail = 0});
}
int gr_gk20a_fecs_set_reglist_virual_addr(struct gk20a *g, u64 pmu_va)
{
- return gr_gk20a_submit_fecs_method(g, 4, u64_lo32(pmu_va >> 8),
- ~0, 1, gr_fecs_method_push_adr_set_reglist_virtual_address_v(),
- 0, GR_IS_UCODE_OP_EQUAL, 1, GR_IS_UCODE_OP_SKIP, 0);
+ return gr_gk20a_submit_fecs_method_op(g,
+ (struct fecs_method_op_gk20a) {
+ .mailbox.id = 4,
+ .mailbox.data = u64_lo32(pmu_va >> 8),
+ .mailbox.clr = ~0,
+ .method.data = 1,
+ .method.addr = gr_fecs_method_push_adr_set_reglist_virtual_address_v(),
+ .mailbox.ret = NULL,
+ .cond.ok = GR_IS_UCODE_OP_EQUAL,
+ .mailbox.ok = 1,
+ .cond.fail = GR_IS_UCODE_OP_SKIP,
+ .mailbox.fail = 0});
}
int gk20a_gr_suspend(struct gk20a *g)
@@ -4381,3 +4586,1212 @@ int gk20a_gr_suspend(struct gk20a *g)
nvhost_dbg_fn("done");
return ret;
}
+
+static int gr_gk20a_find_priv_offset_in_buffer(struct gk20a *g,
+ u32 addr,
+ bool is_quad, u32 quad,
+ u32 *context_buffer,
+ u32 context_buffer_size,
+ u32 *priv_offset);
+
+/* This function will decode a priv address and return the partition type and numbers. */
+int gr_gk20a_decode_priv_addr(struct gk20a *g, u32 addr,
+ int *addr_type, /* enum ctxsw_addr_type */
+ u32 *gpc_num, u32 *tpc_num, u32 *ppc_num, u32 *be_num,
+ u32 *broadcast_flags)
+{
+ u32 gpc_addr;
+ u32 ppc_address;
+ u32 ppc_broadcast_addr;
+
+ nvhost_dbg(dbg_fn | dbg_gpu_dbg, "addr=0x%x", addr);
+
+ /* setup defaults */
+ ppc_address = 0;
+ ppc_broadcast_addr = 0;
+ *addr_type = CTXSW_ADDR_TYPE_SYS;
+ *broadcast_flags = PRI_BROADCAST_FLAGS_NONE;
+ *gpc_num = 0;
+ *tpc_num = 0;
+ *ppc_num = 0;
+ *be_num = 0;
+
+ if (pri_is_gpc_addr(addr)) {
+ *addr_type = CTXSW_ADDR_TYPE_GPC;
+ gpc_addr = pri_gpccs_addr_mask(addr);
+ if (pri_is_gpc_addr_shared(addr)) {
+ *addr_type = CTXSW_ADDR_TYPE_GPC;
+ *broadcast_flags |= PRI_BROADCAST_FLAGS_GPC;
+ } else
+ *gpc_num = pri_get_gpc_num(addr);
+
+ if (pri_is_tpc_addr(gpc_addr)) {
+ *addr_type = CTXSW_ADDR_TYPE_TPC;
+ if (pri_is_tpc_addr_shared(gpc_addr)) {
+ *broadcast_flags |= PRI_BROADCAST_FLAGS_TPC;
+ return 0;
+ }
+ *tpc_num = pri_get_tpc_num(gpc_addr);
+ }
+ return 0;
+ } else if (pri_is_be_addr(addr)) {
+ *addr_type = CTXSW_ADDR_TYPE_BE;
+ if (pri_is_be_addr_shared(addr)) {
+ *broadcast_flags |= PRI_BROADCAST_FLAGS_BE;
+ return 0;
+ }
+ *be_num = pri_get_be_num(addr);
+ return 0;
+ } else {
+ *addr_type = CTXSW_ADDR_TYPE_SYS;
+ return 0;
+ }
+ /* PPC!?!?!?! */
+
+ /*NOTREACHED*/
+ return -EINVAL;
+}
+
+static int gr_gk20a_split_ppc_broadcast_addr(struct gk20a *g, u32 addr,
+ u32 gpc_num,
+ u32 *priv_addr_table, u32 *t)
+{
+ u32 ppc_num;
+
+ nvhost_dbg(dbg_fn | dbg_gpu_dbg, "addr=0x%x", addr);
+
+ for (ppc_num = 0; ppc_num < g->gr.pe_count_per_gpc; ppc_num++)
+ priv_addr_table[(*t)++] = pri_ppc_addr(pri_ppccs_addr_mask(addr),
+ gpc_num, ppc_num);
+
+ return 0;
+}
+
+/*
+ * The context buffer is indexed using BE broadcast addresses and GPC/TPC
+ * unicast addresses. This function will convert a BE unicast address to a BE
+ * broadcast address and split a GPC/TPC broadcast address into a table of
+ * GPC/TPC addresses. The addresses generated by this function can be
+ * successfully processed by gr_gk20a_find_priv_offset_in_buffer
+ */
+static int gr_gk20a_create_priv_addr_table(struct gk20a *g,
+ u32 addr,
+ u32 *priv_addr_table,
+ u32 *num_registers)
+{
+ int addr_type; /*enum ctxsw_addr_type */
+ u32 gpc_num, tpc_num, ppc_num, be_num;
+ u32 broadcast_flags;
+ u32 t;
+ int err;
+
+ t = 0;
+ *num_registers = 0;
+
+ nvhost_dbg(dbg_fn | dbg_gpu_dbg, "addr=0x%x", addr);
+
+ err = gr_gk20a_decode_priv_addr(g, addr, &addr_type,
+ &gpc_num, &tpc_num, &ppc_num, &be_num,
+ &broadcast_flags);
+ nvhost_dbg(dbg_gpu_dbg, "addr_type = %d", addr_type);
+ if (err)
+ return err;
+
+ if ((addr_type == CTXSW_ADDR_TYPE_SYS) ||
+ (addr_type == CTXSW_ADDR_TYPE_BE)) {
+ /* The BE broadcast registers are included in the compressed PRI
+ * table. Convert a BE unicast address to a broadcast address
+ * so that we can look up the offset. */
+ if ((addr_type == CTXSW_ADDR_TYPE_BE) &&
+ !(broadcast_flags & PRI_BROADCAST_FLAGS_BE))
+ priv_addr_table[t++] = pri_be_shared_addr(addr);
+ else
+ priv_addr_table[t++] = addr;
+
+ *num_registers = t;
+ return 0;
+ }
+
+ /* The GPC/TPC unicast registers are included in the compressed PRI
+ * tables. Convert a GPC/TPC broadcast address to unicast addresses so
+ * that we can look up the offsets. */
+ if (broadcast_flags & PRI_BROADCAST_FLAGS_GPC) {
+ for (gpc_num = 0; gpc_num < g->gr.gpc_count; gpc_num++) {
+
+ if (broadcast_flags & PRI_BROADCAST_FLAGS_TPC)
+ for (tpc_num = 0;
+ tpc_num < g->gr.gpc_tpc_count[gpc_num];
+ tpc_num++)
+ priv_addr_table[t++] =
+ pri_tpc_addr(pri_tpccs_addr_mask(addr),
+ gpc_num, tpc_num);
+
+ else if (broadcast_flags & PRI_BROADCAST_FLAGS_PPC) {
+ err = gr_gk20a_split_ppc_broadcast_addr(g, addr, gpc_num,
+ priv_addr_table, &t);
+ if (err)
+ return err;
+ } else
+ priv_addr_table[t++] =
+ pri_gpc_addr(pri_gpccs_addr_mask(addr),
+ gpc_num);
+ }
+ } else {
+ if (broadcast_flags & PRI_BROADCAST_FLAGS_TPC)
+ for (tpc_num = 0;
+ tpc_num < g->gr.gpc_tpc_count[gpc_num];
+ tpc_num++)
+ priv_addr_table[t++] =
+ pri_tpc_addr(pri_tpccs_addr_mask(addr),
+ gpc_num, tpc_num);
+ else if (broadcast_flags & PRI_BROADCAST_FLAGS_PPC)
+ err = gr_gk20a_split_ppc_broadcast_addr(g, addr, gpc_num,
+ priv_addr_table, &t);
+ else
+ priv_addr_table[t++] = addr;
+ }
+
+ *num_registers = t;
+ return 0;
+}
+
+int gr_gk20a_get_ctx_buffer_offsets(struct gk20a *g,
+ u32 addr,
+ u32 max_offsets,
+ u32 *offsets, u32 *offset_addrs,
+ u32 *num_offsets,
+ bool is_quad, u32 quad)
+{
+ u32 i;
+ u32 priv_offset = 0;
+ u32 *priv_registers;
+ u32 num_registers = 0;
+ int err = 0;
+ u32 potential_offsets = proj_scal_litter_num_gpcs_v() *
+ proj_scal_litter_num_tpc_per_gpc_v();
+
+ nvhost_dbg(dbg_fn | dbg_gpu_dbg, "addr=0x%x", addr);
+
+ /* implementation is crossed-up if either of these happen */
+ if (max_offsets > potential_offsets)
+ return -EINVAL;
+
+ if (!g->gr.ctx_vars.golden_image_initialized)
+ return -ENODEV;
+
+ priv_registers = kzalloc(sizeof(u32) * potential_offsets, GFP_KERNEL);
+ if (IS_ERR_OR_NULL(priv_registers)) {
+ nvhost_dbg_fn("failed alloc for potential_offsets=%d", potential_offsets);
+ err = PTR_ERR(priv_registers);
+ goto cleanup;
+ }
+ memset(offsets, 0, sizeof(u32) * max_offsets);
+ memset(offset_addrs, 0, sizeof(u32) * max_offsets);
+ *num_offsets = 0;
+
+ gr_gk20a_create_priv_addr_table(g, addr, &priv_registers[0], &num_registers);
+
+ if ((max_offsets > 1) && (num_registers > max_offsets)) {
+ err = -EINVAL;
+ goto cleanup;
+ }
+
+ if ((max_offsets == 1) && (num_registers > 1))
+ num_registers = 1;
+
+ if (!g->gr.ctx_vars.local_golden_image) {
+ nvhost_dbg_fn("no context switch header info to work with");
+ err = -EINVAL;
+ goto cleanup;
+ }
+
+ for (i = 0; i < num_registers; i++) {
+ err = gr_gk20a_find_priv_offset_in_buffer(g,
+ priv_registers[i],
+ is_quad, quad,
+ g->gr.ctx_vars.local_golden_image,
+ g->gr.ctx_vars.golden_image_size,
+ &priv_offset);
+ if (err) {
+ nvhost_dbg_fn("Could not determine priv_offset for addr:0x%x",
+ addr); /*, grPriRegStr(addr)));*/
+ goto cleanup;
+ }
+
+ offsets[i] = priv_offset;
+ offset_addrs[i] = priv_registers[i];
+ }
+
+ *num_offsets = num_registers;
+
+ cleanup:
+
+ if (!IS_ERR_OR_NULL(priv_registers))
+ kfree(priv_registers);
+
+ return err;
+}
+
+/* Setup some register tables. This looks hacky; our
+ * register/offset functions are just that, functions.
+ * So they can't be used as initializers... TBD: fix to
+ * generate consts at least on an as-needed basis.
+ */
+static const u32 _num_ovr_perf_regs = 17;
+static u32 _ovr_perf_regs[17] = { 0, };
+/* Following are the blocks of registers that the ucode
+ stores in the extended region.*/
+/* == ctxsw_extended_sm_dsm_perf_counter_register_stride_v() ? */
+static const u32 _num_sm_dsm_perf_regs = 5;
+/* == ctxsw_extended_sm_dsm_perf_counter_control_register_stride_v() ?*/
+static const u32 _num_sm_dsm_perf_ctrl_regs = 4;
+static u32 _sm_dsm_perf_regs[5];
+static u32 _sm_dsm_perf_ctrl_regs[4];
+
+static void init_sm_dsm_reg_info(void)
+{
+ if (_ovr_perf_regs[0] != 0)
+ return;
+
+ _ovr_perf_regs[0] = gr_pri_gpc0_tpc0_sm_dsm_perf_counter_control_sel0_r();
+ _ovr_perf_regs[1] = gr_pri_gpc0_tpc0_sm_dsm_perf_counter_control_sel1_r();
+ _ovr_perf_regs[2] = gr_pri_gpc0_tpc0_sm_dsm_perf_counter_control0_r();
+ _ovr_perf_regs[3] = gr_pri_gpc0_tpc0_sm_dsm_perf_counter_control5_r();
+ _ovr_perf_regs[4] = gr_pri_gpc0_tpc0_sm_dsm_perf_counter_status1_r();
+ _ovr_perf_regs[5] = gr_pri_gpc0_tpc0_sm_dsm_perf_counter0_control_r();
+ _ovr_perf_regs[6] = gr_pri_gpc0_tpc0_sm_dsm_perf_counter1_control_r();
+ _ovr_perf_regs[7] = gr_pri_gpc0_tpc0_sm_dsm_perf_counter2_control_r();
+ _ovr_perf_regs[8] = gr_pri_gpc0_tpc0_sm_dsm_perf_counter3_control_r();
+ _ovr_perf_regs[9] = gr_pri_gpc0_tpc0_sm_dsm_perf_counter4_control_r();
+ _ovr_perf_regs[10] = gr_pri_gpc0_tpc0_sm_dsm_perf_counter5_control_r();
+ _ovr_perf_regs[11] = gr_pri_gpc0_tpc0_sm_dsm_perf_counter6_control_r();
+ _ovr_perf_regs[12] = gr_pri_gpc0_tpc0_sm_dsm_perf_counter7_control_r();
+ _ovr_perf_regs[13] = gr_pri_gpc0_tpc0_sm_dsm_perf_counter4_r();
+ _ovr_perf_regs[14] = gr_pri_gpc0_tpc0_sm_dsm_perf_counter5_r();
+ _ovr_perf_regs[15] = gr_pri_gpc0_tpc0_sm_dsm_perf_counter6_r();
+ _ovr_perf_regs[16] = gr_pri_gpc0_tpc0_sm_dsm_perf_counter7_r();
+
+
+ _sm_dsm_perf_regs[0] = gr_pri_gpc0_tpc0_sm_dsm_perf_counter_status_r();
+ _sm_dsm_perf_regs[1] = gr_pri_gpc0_tpc0_sm_dsm_perf_counter0_r();
+ _sm_dsm_perf_regs[2] = gr_pri_gpc0_tpc0_sm_dsm_perf_counter1_r();
+ _sm_dsm_perf_regs[3] = gr_pri_gpc0_tpc0_sm_dsm_perf_counter2_r();
+ _sm_dsm_perf_regs[4] = gr_pri_gpc0_tpc0_sm_dsm_perf_counter3_r();
+
+ _sm_dsm_perf_ctrl_regs[0] = gr_pri_gpc0_tpc0_sm_dsm_perf_counter_control1_r();
+ _sm_dsm_perf_ctrl_regs[1] = gr_pri_gpc0_tpc0_sm_dsm_perf_counter_control2_r();
+ _sm_dsm_perf_ctrl_regs[2] = gr_pri_gpc0_tpc0_sm_dsm_perf_counter_control3_r();
+ _sm_dsm_perf_ctrl_regs[3] = gr_pri_gpc0_tpc0_sm_dsm_perf_counter_control4_r();
+
+}
+
+/* TBD: would like to handle this elsewhere, at a higher level.
+ * these are currently constructed in a "test-then-write" style
+ * which makes it impossible to know externally whether a ctx
+ * write will actually occur. so later we should put a lazy,
+ * map-and-hold system in the patch write state */
+int gr_gk20a_ctx_patch_smpc(struct gk20a *g,
+ struct channel_ctx_gk20a *ch_ctx,
+ u32 addr, u32 data,
+ u8 *context)
+{
+ u32 num_gpc = g->gr.gpc_count;
+ u32 num_tpc;
+ u32 tpc, gpc, reg;
+ u32 chk_addr;
+ u32 vaddr_lo;
+ u32 vaddr_hi;
+ u32 tmp;
+
+ init_sm_dsm_reg_info();
+
+ nvhost_dbg(dbg_fn | dbg_gpu_dbg, "addr=0x%x", addr);
+
+ for (reg = 0; reg < _num_ovr_perf_regs; reg++) {
+ for (gpc = 0; gpc < num_gpc; gpc++) {
+ num_tpc = g->gr.gpc_tpc_count[gpc];
+ for (tpc = 0; tpc < num_tpc; tpc++) {
+ chk_addr = ((proj_gpc_stride_v() * gpc) +
+ (proj_tpc_in_gpc_stride_v() * tpc) +
+ _ovr_perf_regs[reg]);
+ if (chk_addr != addr)
+ continue;
+ /* reset the patch count from previous
+ runs, if ucode has already processed
+ it */
+ tmp = mem_rd32(context +
+ ctxsw_prog_main_image_patch_count_o(), 0);
+
+ if (!tmp)
+ ch_ctx->patch_ctx.data_count = 0;
+
+ gr_gk20a_ctx_patch_write(g, ch_ctx,
+ addr, data, true);
+
+ vaddr_lo = u64_lo32(ch_ctx->patch_ctx.gpu_va);
+ vaddr_hi = u64_hi32(ch_ctx->patch_ctx.gpu_va);
+
+ mem_wr32(context +
+ ctxsw_prog_main_image_patch_count_o(),
+ 0, ch_ctx->patch_ctx.data_count);
+ mem_wr32(context +
+ ctxsw_prog_main_image_patch_adr_lo_o(),
+ 0, vaddr_lo);
+ mem_wr32(context +
+ ctxsw_prog_main_image_patch_adr_hi_o(),
+ 0, vaddr_hi);
+
+ /* we're not caching these on cpu side,
+ but later watch for it */
+
+ /* the l2 invalidate in the patch_write
+ * would be too early for this? */
+ gk20a_mm_l2_invalidate(g);
+ return 0;
+ }
+ }
+ }
+
+ return 0;
+}
+
+
+void gr_gk20a_access_smpc_reg(struct gk20a *g, u32 quad, u32 offset)
+{
+ u32 reg;
+ u32 quad_ctrl;
+ u32 half_ctrl;
+ u32 tpc, gpc;
+ u32 gpc_tpc_addr;
+ u32 gpc_tpc_stride;
+
+ nvhost_dbg(dbg_fn | dbg_gpu_dbg, "offset=0x%x", offset);
+
+ gpc = pri_get_gpc_num(offset);
+ gpc_tpc_addr = pri_gpccs_addr_mask(offset);
+ tpc = pri_get_tpc_num(gpc_tpc_addr);
+
+ quad_ctrl = quad & 0x1; /* first bit tells us quad */
+ half_ctrl = (quad >> 1) & 0x1; /* second bit tells us half */
+
+ gpc_tpc_stride = gpc * proj_gpc_stride_v() +
+ tpc * proj_tpc_in_gpc_stride_v();
+ gpc_tpc_addr = gr_gpc0_tpc0_sm_halfctl_ctrl_r() + gpc_tpc_stride;
+
+ reg = gk20a_readl(g, gpc_tpc_addr);
+ reg = set_field(reg,
+ gr_gpcs_tpcs_sm_halfctl_ctrl_sctl_read_quad_ctl_m(),
+ quad_ctrl);
+
+ gk20a_writel(g, gpc_tpc_addr, reg);
+
+ gpc_tpc_addr = gr_gpc0_tpc0_sm_debug_sfe_control_r() + gpc_tpc_stride;
+ reg = gk20a_readl(g, gpc_tpc_addr);
+ reg = set_field(reg,
+ gr_gpcs_tpcs_sm_debug_sfe_control_read_half_ctl_m(),
+ half_ctrl);
+ gk20a_writel(g, gpc_tpc_addr, reg);
+}
+
+#define ILLEGAL_ID (~0)
+
+static inline bool check_main_image_header_magic(void *context)
+{
+ u32 magic = mem_rd32(context +
+ ctxsw_prog_main_image_magic_value_o(), 0);
+ nvhost_dbg(dbg_gpu_dbg, "main image magic=0x%x", magic);
+ return magic == ctxsw_prog_main_image_magic_value_v_value_v();
+}
+static inline bool check_local_header_magic(void *context)
+{
+ u32 magic = mem_rd32(context +
+ ctxsw_prog_local_magic_value_o(), 0);
+ nvhost_dbg(dbg_gpu_dbg, "local magic=0x%x", magic);
+ return magic == ctxsw_prog_local_magic_value_v_value_v();
+
+}
+
+/* most likely dupe of ctxsw_gpccs_header__size_1_v() */
+static inline int ctxsw_prog_ucode_header_size_in_bytes(void)
+{
+ return 256;
+}
+
+static int gr_gk20a_find_priv_offset_in_ext_buffer(struct gk20a *g,
+ u32 addr,
+ bool is_quad, u32 quad,
+ u32 *context_buffer,
+ u32 context_buffer_size,
+ u32 *priv_offset)
+{
+ u32 i, data32;
+ u32 gpc_num, tpc_num;
+ u32 num_gpcs, num_tpcs;
+ u32 chk_addr;
+ u32 ext_priv_offset, ext_priv_size;
+ void *context;
+ u32 offset_to_segment, offset_to_segment_end;
+ u32 sm_dsm_perf_reg_id = ILLEGAL_ID;
+ u32 sm_dsm_perf_ctrl_reg_id = ILLEGAL_ID;
+ u32 num_ext_gpccs_ext_buffer_segments;
+ u32 inter_seg_offset;
+ u32 tpc_gpc_mask = (proj_tpc_in_gpc_stride_v() - 1);
+ u32 max_tpc_count;
+ u32 *sm_dsm_perf_ctrl_regs = NULL;
+ u32 num_sm_dsm_perf_ctrl_regs = 0;
+ u32 *sm_dsm_perf_regs = NULL;
+ u32 num_sm_dsm_perf_regs = 0;
+ u32 buffer_segments_size = 0;
+ u32 marker_size = 0;
+ u32 control_register_stride = 0;
+ u32 perf_register_stride = 0;
+
+	/* The extended region only holds TPC registers, so if this is not
+	 * a TPC register, return an error so the caller can look elsewhere. */
+ if (pri_is_gpc_addr(addr)) {
+ u32 gpc_addr = 0;
+ gpc_num = pri_get_gpc_num(addr);
+ gpc_addr = pri_gpccs_addr_mask(addr);
+ if (pri_is_tpc_addr(gpc_addr))
+ tpc_num = pri_get_tpc_num(gpc_addr);
+ else
+ return -EINVAL;
+
+ nvhost_dbg_info(" gpc = %d tpc = %d",
+ gpc_num, tpc_num);
+ } else
+ return -EINVAL;
+
+ buffer_segments_size = ctxsw_prog_extended_buffer_segments_size_in_bytes_v();
+	/* note: marker_size below is in words (number of registers) */
+ marker_size = ctxsw_prog_extended_marker_size_in_bytes_v() >> 2;
+
+ context = context_buffer;
+ /* sanity check main header */
+ if (!check_main_image_header_magic(context)) {
+ nvhost_err(dev_from_gk20a(g),
+ "Invalid main header: magic value");
+ return -EINVAL;
+ }
+ num_gpcs = mem_rd32(context + ctxsw_prog_main_image_num_gpcs_o(), 0);
+ if (gpc_num >= num_gpcs) {
+ nvhost_err(dev_from_gk20a(g),
+			   "GPC 0x%08x out of range (GPC count 0x%08x)!\n",
+ gpc_num, num_gpcs);
+ return -EINVAL;
+ }
+
+ data32 = mem_rd32(context + ctxsw_prog_main_extended_buffer_ctl_o(), 0);
+ ext_priv_size = ctxsw_prog_main_extended_buffer_ctl_size_v(data32);
+ if (0 == ext_priv_size) {
+ nvhost_dbg_info(" No extended memory in context buffer");
+ return -EINVAL;
+ }
+ ext_priv_offset = ctxsw_prog_main_extended_buffer_ctl_offset_v(data32);
+
+ offset_to_segment = ext_priv_offset * ctxsw_prog_ucode_header_size_in_bytes();
+ offset_to_segment_end = offset_to_segment +
+ (ext_priv_size * buffer_segments_size);
+
+ /* check local header magic */
+ context += ctxsw_prog_ucode_header_size_in_bytes();
+ if (!check_local_header_magic(context)) {
+ nvhost_err(dev_from_gk20a(g),
+ "Invalid local header: magic value\n");
+ return -EINVAL;
+ }
+
+ /*
+ * See if the incoming register address is in the first table of
+ * registers. We check this by decoding only the TPC addr portion.
+ * If we get a hit on the TPC bit, we then double check the address
+ * by computing it from the base gpc/tpc strides. Then make sure
+ * it is a real match.
+ */
+ num_sm_dsm_perf_regs = _num_sm_dsm_perf_regs;
+ sm_dsm_perf_regs = _sm_dsm_perf_regs;
+ perf_register_stride = ctxsw_prog_extended_sm_dsm_perf_counter_register_stride_v();
+
+ init_sm_dsm_reg_info();
+
+ for (i = 0; i < num_sm_dsm_perf_regs; i++) {
+ if ((addr & tpc_gpc_mask) == (sm_dsm_perf_regs[i] & tpc_gpc_mask)) {
+ sm_dsm_perf_reg_id = i;
+
+ nvhost_dbg_info("register match: 0x%08x",
+ sm_dsm_perf_regs[i]);
+
+ chk_addr = (proj_gpc_base_v() +
+ (proj_gpc_stride_v() * gpc_num) +
+ proj_tpc_in_gpc_base_v() +
+ (proj_tpc_in_gpc_stride_v() * tpc_num) +
+ (sm_dsm_perf_regs[sm_dsm_perf_reg_id] & tpc_gpc_mask));
+
+ if (chk_addr != addr) {
+ nvhost_err(dev_from_gk20a(g),
+					   "address mismatch: 0x%08x != 0x%08x\n",
+ addr, chk_addr);
+ return -EINVAL;
+ }
+ break;
+ }
+ }
+
+ /* Didn't find reg in supported group 1.
+ * so try the second group now */
+ num_sm_dsm_perf_ctrl_regs = _num_sm_dsm_perf_ctrl_regs;
+ sm_dsm_perf_ctrl_regs = _sm_dsm_perf_ctrl_regs;
+ control_register_stride = ctxsw_prog_extended_sm_dsm_perf_counter_control_register_stride_v();
+
+ if (ILLEGAL_ID == sm_dsm_perf_reg_id) {
+ for (i = 0; i < num_sm_dsm_perf_ctrl_regs; i++) {
+ if ((addr & tpc_gpc_mask) ==
+ (sm_dsm_perf_ctrl_regs[i] & tpc_gpc_mask)) {
+ sm_dsm_perf_ctrl_reg_id = i;
+
+ nvhost_dbg_info("register match: 0x%08x",
+ sm_dsm_perf_ctrl_regs[i]);
+
+ chk_addr = (proj_gpc_base_v() +
+ (proj_gpc_stride_v() * gpc_num) +
+ proj_tpc_in_gpc_base_v() +
+ (proj_tpc_in_gpc_stride_v() * tpc_num) +
+ (sm_dsm_perf_ctrl_regs[sm_dsm_perf_ctrl_reg_id] &
+ tpc_gpc_mask));
+
+ if (chk_addr != addr) {
+ nvhost_err(dev_from_gk20a(g),
+						   "address mismatch: 0x%08x != 0x%08x\n",
+ addr, chk_addr);
+ return -EINVAL;
+
+ }
+
+ break;
+ }
+ }
+ }
+
+ if ((ILLEGAL_ID == sm_dsm_perf_ctrl_reg_id) &&
+ (ILLEGAL_ID == sm_dsm_perf_reg_id))
+ return -EINVAL;
+
+ /* Skip the FECS extended header, nothing there for us now. */
+ offset_to_segment += buffer_segments_size;
+
+ /* skip through the GPCCS extended headers until we get to the data for
+ * our GPC. The size of each gpc extended segment is enough to hold the
+	 * max tpc count for the gpcs, in 256-byte chunks.
+ */
+
+ max_tpc_count = proj_scal_litter_num_tpc_per_gpc_v();
+
+ num_ext_gpccs_ext_buffer_segments = (u32)((max_tpc_count + 1) / 2);
+
+ offset_to_segment += (num_ext_gpccs_ext_buffer_segments *
+ buffer_segments_size * gpc_num);
+
+ num_tpcs = g->gr.gpc_tpc_count[gpc_num];
+
+ /* skip the head marker to start with */
+ inter_seg_offset = marker_size;
+
+ if (ILLEGAL_ID != sm_dsm_perf_ctrl_reg_id) {
+ /* skip over control regs of TPC's before the one we want.
+ * then skip to the register in this tpc */
+ inter_seg_offset = inter_seg_offset +
+ (tpc_num * control_register_stride) +
+ sm_dsm_perf_ctrl_reg_id;
+ } else {
+ /* skip all the control registers */
+ inter_seg_offset = inter_seg_offset +
+ (num_tpcs * control_register_stride);
+
+ /* skip the marker between control and counter segments */
+ inter_seg_offset += marker_size;
+
+ /* skip over counter regs of TPCs before the one we want */
+ inter_seg_offset = inter_seg_offset +
+ (tpc_num * perf_register_stride) *
+ ctxsw_prog_extended_num_smpc_quadrants_v();
+
+ /* skip over the register for the quadrants we do not want.
+ * then skip to the register in this tpc */
+ inter_seg_offset = inter_seg_offset +
+ (perf_register_stride * quad) +
+ sm_dsm_perf_reg_id;
+ }
+
+ /* set the offset to the segment offset plus the inter segment offset to
+ * our register */
+ offset_to_segment += (inter_seg_offset * 4);
+
+ /* last sanity check: did we somehow compute an offset outside the
+ * extended buffer? */
+ if (offset_to_segment > offset_to_segment_end) {
+ nvhost_err(dev_from_gk20a(g),
+ "Overflow ctxsw buffer! 0x%08x > 0x%08x\n",
+ offset_to_segment, offset_to_segment_end);
+ return -EINVAL;
+ }
+
+ *priv_offset = offset_to_segment;
+
+ return 0;
+}
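(For reference, a hypothetical condensation of the offset arithmetic above for the counter-register case; it simply mirrors gr_gk20a_find_priv_offset_in_ext_buffer(), with the segment size, marker size, strides and quadrant count taken from the same ctxsw_prog_*/proj_* accessors used there.)

static u32 ext_counter_offset_sketch(u32 ext_priv_offset,
				     u32 buffer_segments_size,
				     u32 max_tpc_count, u32 gpc_num,
				     u32 num_tpcs, u32 tpc_num,
				     u32 marker_size, u32 ctrl_stride,
				     u32 perf_stride, u32 num_quadrants,
				     u32 quad, u32 reg_id)
{
	/* start of the extended region; ucode headers are 256 bytes each */
	u32 offset = ext_priv_offset * 256;
	u32 words;

	/* skip the FECS extended header ... */
	offset += buffer_segments_size;
	/* ... and the GPCCS segments of the GPCs before ours */
	offset += ((max_tpc_count + 1) / 2) * buffer_segments_size * gpc_num;

	/* words within our GPC's segment */
	words = marker_size;				/* head marker */
	words += num_tpcs * ctrl_stride;		/* all control regs */
	words += marker_size;				/* mid marker */
	words += tpc_num * perf_stride * num_quadrants;	/* earlier TPCs */
	words += perf_stride * quad + reg_id;		/* our quadrant/reg */

	return offset + (words * 4);			/* words -> bytes */
}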
+
+
+static int
+gr_gk20a_process_context_buffer_priv_segment(struct gk20a *g,
+ int addr_type,/* enum ctxsw_addr_type */
+ u32 pri_addr,
+ u32 gpc_num, u32 num_tpcs,
+ u32 num_ppcs, u32 ppc_mask,
+ u32 *priv_offset)
+{
+ u32 i;
+ u32 address, base_address;
+ u32 sys_offset, gpc_offset, tpc_offset, ppc_offset;
+ u32 ppc_num, tpc_num, tpc_addr, gpc_addr, ppc_addr;
+ struct aiv_gk20a *reg;
+
+ nvhost_dbg(dbg_fn | dbg_gpu_dbg, "pri_addr=0x%x", pri_addr);
+
+ if (!g->gr.ctx_vars.valid)
+ return -EINVAL;
+
+ /* Process the SYS/BE segment. */
+ if ((addr_type == CTXSW_ADDR_TYPE_SYS) ||
+ (addr_type == CTXSW_ADDR_TYPE_BE)) {
+ for (i = 0; i < g->gr.ctx_vars.ctxsw_regs.sys.count; i++) {
+ reg = &g->gr.ctx_vars.ctxsw_regs.sys.l[i];
+ address = reg->addr;
+ sys_offset = reg->index;
+
+ if (pri_addr == address) {
+ *priv_offset = sys_offset;
+ return 0;
+ }
+ }
+ }
+
+ /* Process the TPC segment. */
+ if (addr_type == CTXSW_ADDR_TYPE_TPC) {
+ for (tpc_num = 0; tpc_num < num_tpcs; tpc_num++) {
+ for (i = 0; i < g->gr.ctx_vars.ctxsw_regs.tpc.count; i++) {
+ reg = &g->gr.ctx_vars.ctxsw_regs.tpc.l[i];
+ address = reg->addr;
+ tpc_addr = pri_tpccs_addr_mask(address);
+ base_address = proj_gpc_base_v() +
+ (gpc_num * proj_gpc_stride_v()) +
+ proj_tpc_in_gpc_base_v() +
+ (tpc_num * proj_tpc_in_gpc_stride_v());
+ address = base_address + tpc_addr;
+ /*
+ * The data for the TPCs is interleaved in the context buffer.
+ * Example with num_tpcs = 2
+ * 0 1 2 3 4 5 6 7 8 9 10 11 ...
+ * 0-0 1-0 0-1 1-1 0-2 1-2 0-3 1-3 0-4 1-4 0-5 1-5 ...
+ */
+ tpc_offset = (reg->index * num_tpcs) + (tpc_num * 4);
+
+ if (pri_addr == address) {
+ *priv_offset = tpc_offset;
+ return 0;
+ }
+ }
+ }
+ }
+
+ /* Process the PPC segment. */
+ if (addr_type == CTXSW_ADDR_TYPE_PPC) {
+ for (ppc_num = 0; ppc_num < num_ppcs; ppc_num++) {
+ for (i = 0; i < g->gr.ctx_vars.ctxsw_regs.ppc.count; i++) {
+ reg = &g->gr.ctx_vars.ctxsw_regs.ppc.l[i];
+ address = reg->addr;
+ ppc_addr = pri_ppccs_addr_mask(address);
+ base_address = proj_gpc_base_v() +
+ (gpc_num * proj_gpc_stride_v()) +
+ proj_ppc_in_gpc_base_v() +
+ (ppc_num * proj_ppc_in_gpc_stride_v());
+ address = base_address + ppc_addr;
+ /*
+ * The data for the PPCs is interleaved in the context buffer.
+				 * Example with num_ppcs = 2
+ * 0 1 2 3 4 5 6 7 8 9 10 11 ...
+ * 0-0 1-0 0-1 1-1 0-2 1-2 0-3 1-3 0-4 1-4 0-5 1-5 ...
+ */
+ ppc_offset = (reg->index * num_ppcs) + (ppc_num * 4);
+
+ if (pri_addr == address) {
+ *priv_offset = ppc_offset;
+ return 0;
+ }
+ }
+ }
+ }
+
+ /* Process the GPC segment. */
+ if (addr_type == CTXSW_ADDR_TYPE_GPC) {
+ for (i = 0; i < g->gr.ctx_vars.ctxsw_regs.gpc.count; i++) {
+ reg = &g->gr.ctx_vars.ctxsw_regs.gpc.l[i];
+
+ address = reg->addr;
+ gpc_addr = pri_gpccs_addr_mask(address);
+ gpc_offset = reg->index;
+
+ base_address = proj_gpc_base_v() +
+ (gpc_num * proj_gpc_stride_v());
+ address = base_address + gpc_addr;
+
+ if (pri_addr == address) {
+ *priv_offset = gpc_offset;
+ return 0;
+ }
+ }
+ }
+
+ return -EINVAL;
+}
+
+static int gr_gk20a_determine_ppc_configuration(struct gk20a *g,
+ void *context,
+ u32 *num_ppcs, u32 *ppc_mask,
+ u32 *reg_ppc_count)
+{
+ u32 data32;
+ u32 litter_num_pes_per_gpc = proj_scal_litter_num_pes_per_gpc_v();
+
+ /*
+ * if there is only 1 PES_PER_GPC, then we put the PES registers
+ * in the GPC reglist, so we can't error out if ppc.count == 0
+ */
+ if ((!g->gr.ctx_vars.valid) ||
+ ((g->gr.ctx_vars.ctxsw_regs.ppc.count == 0) &&
+ (litter_num_pes_per_gpc > 1)))
+ return -EINVAL;
+
+ data32 = mem_rd32(context + ctxsw_prog_local_image_ppc_info_o(), 0);
+
+ *num_ppcs = ctxsw_prog_local_image_ppc_info_num_ppcs_v(data32);
+ *ppc_mask = ctxsw_prog_local_image_ppc_info_ppc_mask_v(data32);
+
+ *reg_ppc_count = g->gr.ctx_vars.ctxsw_regs.ppc.count;
+
+ return 0;
+}
+
+/*
+ * This function will return the 32 bit offset for a priv register if it is
+ * present in the context buffer.
+ */
+static int gr_gk20a_find_priv_offset_in_buffer(struct gk20a *g,
+ u32 addr,
+ bool is_quad, u32 quad,
+ u32 *context_buffer,
+ u32 context_buffer_size,
+ u32 *priv_offset)
+{
+ struct gr_gk20a *gr = &g->gr;
+ u32 i, data32;
+ int err;
+ int addr_type; /*enum ctxsw_addr_type */
+ u32 broadcast_flags;
+ u32 gpc_num, tpc_num, ppc_num, be_num;
+ u32 num_gpcs, num_tpcs, num_ppcs;
+ u32 offset;
+ u32 sys_priv_offset, gpc_priv_offset;
+ u32 ppc_mask, reg_list_ppc_count;
+ void *context;
+ u32 offset_to_segment;
+
+ nvhost_dbg(dbg_fn | dbg_gpu_dbg, "addr=0x%x", addr);
+
+ err = gr_gk20a_decode_priv_addr(g, addr, &addr_type,
+ &gpc_num, &tpc_num, &ppc_num, &be_num,
+ &broadcast_flags);
+ if (err)
+ return err;
+
+ context = context_buffer;
+ if (!check_main_image_header_magic(context)) {
+ nvhost_err(dev_from_gk20a(g),
+ "Invalid main header: magic value");
+ return -EINVAL;
+ }
+ num_gpcs = mem_rd32(context + ctxsw_prog_main_image_num_gpcs_o(), 0);
+
+ /* Parse the FECS local header. */
+ context += ctxsw_prog_ucode_header_size_in_bytes();
+ if (!check_local_header_magic(context)) {
+ nvhost_err(dev_from_gk20a(g),
+ "Invalid FECS local header: magic value\n");
+ return -EINVAL;
+ }
+ data32 = mem_rd32(context + ctxsw_prog_local_priv_register_ctl_o(), 0);
+ sys_priv_offset = ctxsw_prog_local_priv_register_ctl_offset_v(data32);
+
+ /* If found in Ext buffer, ok.
+ * If it failed and we expected to find it there (quad offset)
+ * then return the error. Otherwise continue on.
+ */
+ err = gr_gk20a_find_priv_offset_in_ext_buffer(g,
+ addr, is_quad, quad, context_buffer,
+ context_buffer_size, priv_offset);
+ if (!err || (err && is_quad))
+ return err;
+
+ if ((addr_type == CTXSW_ADDR_TYPE_SYS) ||
+ (addr_type == CTXSW_ADDR_TYPE_BE)) {
+ /* Find the offset in the FECS segment. */
+ offset_to_segment = sys_priv_offset *
+ ctxsw_prog_ucode_header_size_in_bytes();
+
+ err = gr_gk20a_process_context_buffer_priv_segment(g,
+ addr_type, addr,
+ 0, 0, 0, 0,
+ &offset);
+ if (err)
+ return err;
+
+ *priv_offset = (offset_to_segment + offset);
+ return 0;
+ }
+
+ if ((gpc_num + 1) > num_gpcs) {
+ nvhost_err(dev_from_gk20a(g),
+ "GPC %d not in this context buffer.\n",
+ gpc_num);
+ return -EINVAL;
+ }
+
+ /* Parse the GPCCS local header(s).*/
+ for (i = 0; i < num_gpcs; i++) {
+ context += ctxsw_prog_ucode_header_size_in_bytes();
+ if (!check_local_header_magic(context)) {
+ nvhost_err(dev_from_gk20a(g),
+ "Invalid GPCCS local header: magic value\n");
+ return -EINVAL;
+
+ }
+ data32 = mem_rd32(context + ctxsw_prog_local_priv_register_ctl_o(), 0);
+ gpc_priv_offset = ctxsw_prog_local_priv_register_ctl_offset_v(data32);
+
+ err = gr_gk20a_determine_ppc_configuration(g, context,
+ &num_ppcs, &ppc_mask,
+ &reg_list_ppc_count);
+ if (err)
+ return err;
+
+ num_tpcs = mem_rd32(context + ctxsw_prog_local_image_num_tpcs_o(), 0);
+
+ if ((i == gpc_num) && ((tpc_num + 1) > num_tpcs)) {
+ nvhost_err(dev_from_gk20a(g),
+ "GPC %d TPC %d not in this context buffer.\n",
+ gpc_num, tpc_num);
+ return -EINVAL;
+ }
+
+ /* Find the offset in the GPCCS segment.*/
+ if (i == gpc_num) {
+ offset_to_segment = gpc_priv_offset *
+ ctxsw_prog_ucode_header_size_in_bytes();
+
+ if (addr_type == CTXSW_ADDR_TYPE_TPC) {
+ /*reg = gr->ctx_vars.ctxsw_regs.tpc.l;*/
+ } else if (addr_type == CTXSW_ADDR_TYPE_PPC) {
+ /* The ucode stores TPC data before PPC data.
+ * Advance offset past TPC data to PPC data. */
+ offset_to_segment +=
+ ((gr->ctx_vars.ctxsw_regs.tpc.count *
+ num_tpcs) << 2);
+ } else if (addr_type == CTXSW_ADDR_TYPE_GPC) {
+ /* The ucode stores TPC/PPC data before GPC data.
+ * Advance offset past TPC/PPC data to GPC data. */
+ /* note 1 PES_PER_GPC case */
+ u32 litter_num_pes_per_gpc =
+ proj_scal_litter_num_pes_per_gpc_v();
+ if (litter_num_pes_per_gpc > 1) {
+ offset_to_segment +=
+ (((gr->ctx_vars.ctxsw_regs.tpc.count *
+ num_tpcs) << 2) +
+ ((reg_list_ppc_count * num_ppcs) << 2));
+ } else {
+ offset_to_segment +=
+ ((gr->ctx_vars.ctxsw_regs.tpc.count *
+ num_tpcs) << 2);
+ }
+ } else {
+ nvhost_err(dev_from_gk20a(g),
+ " Unknown address type.\n");
+ return -EINVAL;
+ }
+ err = gr_gk20a_process_context_buffer_priv_segment(g,
+ addr_type, addr,
+ i, num_tpcs,
+ num_ppcs, ppc_mask,
+ &offset);
+ if (err)
+ return -EINVAL;
+
+ *priv_offset = offset_to_segment + offset;
+ return 0;
+ }
+ }
+
+ return -EINVAL;
+}
+
+int gr_gk20a_exec_ctx_ops(struct channel_gk20a *ch,
+ struct nvhost_dbg_gpu_reg_op *ctx_ops, u32 num_ops,
+ u32 num_ctx_wr_ops, u32 num_ctx_rd_ops)
+{
+ struct gk20a *g = ch->g;
+ struct channel_ctx_gk20a *ch_ctx = &ch->ch_ctx;
+ void *ctx_ptr = NULL;
+ int curr_gr_chid, curr_gr_ctx;
+ bool ch_is_curr_ctx, restart_gr_ctxsw = false;
+ bool restart_fifo_ctxsw = false;
+ u32 i, j, offset, v;
+ u32 max_offsets = proj_scal_max_gpcs_v() *
+ proj_scal_max_tpc_per_gpc_v();
+ u32 *offsets = NULL;
+ u32 *offset_addrs = NULL;
+ u32 ctx_op_nr, num_ctx_ops[2] = {num_ctx_wr_ops, num_ctx_rd_ops};
+ int err, pass;
+
+ nvhost_dbg(dbg_fn | dbg_gpu_dbg, "wr_ops=%d rd_ops=%d",
+ num_ctx_wr_ops, num_ctx_rd_ops);
+
+ /* TBD: set timeout */
+ /* pin_context will disable channel switching.
+ * at that point the hardware state can be inspected to
+ * determine if the context we're interested in is current.
+ */
+#if 0
+ err = fifo_gk20a_disable_fifo_ctxsw(g, c);
+ if (err) {
+ dev_warn(dev_from_gk20a(g), "failed to fifo ctxsw\n");
+ goto clean_up;
+ }
+ restart_fifo_ctxsw = true;
+#endif
+
+ {
+ u32 reg = gk20a_readl(g, 0x0041a084);
+ nvhost_dbg(dbg_gpu_dbg, "flcn_cfg_rm=0x%x",
+ reg);
+ }
+
+ err = gr_gk20a_disable_ctxsw(g);
+ if (err) {
+ nvhost_err(dev_from_gk20a(g), "unable to stop gr ctxsw");
+ /* this should probably be ctx-fatal... */
+ goto cleanup;
+ }
+
+ restart_gr_ctxsw = true;
+
+ curr_gr_ctx = gk20a_readl(g, gr_fecs_current_ctx_r());
+ curr_gr_chid = gk20a_gr_get_chid_from_ctx(g, curr_gr_ctx);
+ ch_is_curr_ctx = (curr_gr_chid != -1) && (ch->hw_chid == curr_gr_chid);
+
+ nvhost_dbg(dbg_fn | dbg_gpu_dbg, "is curr ctx=%d", ch_is_curr_ctx);
+ if (ch_is_curr_ctx) {
+ for (pass = 0; pass < 2; pass++) {
+ ctx_op_nr = 0;
+ for (i = 0; (ctx_op_nr < num_ctx_ops[pass]) && (i < num_ops); ++i) {
+ /* only do ctx ops and only on the right pass */
+ if ((ctx_ops[i].type == REGOP(TYPE_GLOBAL)) ||
+ (((pass == 0) && reg_op_is_read(ctx_ops[i].op)) ||
+ ((pass == 1) && !reg_op_is_read(ctx_ops[i].op))))
+ continue;
+
+ /* if this is a quad access, setup for special access*/
+ if (ctx_ops[i].is_quad)
+ gr_gk20a_access_smpc_reg(g, ctx_ops[i].quad,
+ ctx_ops[i].offset);
+ offset = ctx_ops[i].offset;
+
+ if (pass == 0) { /* write pass */
+ v = gk20a_readl(g, offset);
+ v &= ~ctx_ops[i].and_n_mask_lo;
+ v |= ctx_ops[i].value_lo;
+ gk20a_writel(g, offset, v);
+
+ nvhost_dbg(dbg_gpu_dbg,
+ "direct wr: offset=0x%x v=0x%x",
+ offset, v);
+
+ if (ctx_ops[i].op == REGOP(WRITE_64)) {
+ v = gk20a_readl(g, offset + 4);
+ v &= ~ctx_ops[i].and_n_mask_hi;
+ v |= ctx_ops[i].value_hi;
+ gk20a_writel(g, offset + 4, v);
+
+ nvhost_dbg(dbg_gpu_dbg,
+ "direct wr: offset=0x%x v=0x%x",
+ offset + 4, v);
+ }
+
+ } else { /* read pass */
+ ctx_ops[i].value_lo =
+ gk20a_readl(g, offset);
+
+ nvhost_dbg(dbg_gpu_dbg,
+ "direct rd: offset=0x%x v=0x%x",
+ offset, ctx_ops[i].value_lo);
+
+ if (ctx_ops[i].op == REGOP(READ_64)) {
+ ctx_ops[i].value_hi =
+ gk20a_readl(g, offset + 4);
+
+ nvhost_dbg(dbg_gpu_dbg,
+ "direct rd: offset=0x%x v=0x%x",
+							   offset + 4, ctx_ops[i].value_hi);
+ } else
+ ctx_ops[i].value_hi = 0;
+ }
+ ctx_op_nr++;
+ }
+ }
+ goto cleanup;
+ }
+
+ /* they're the same size, so just use one alloc for both */
+ offsets = kzalloc(2 * sizeof(u32) * max_offsets, GFP_KERNEL);
+ if (!offsets) {
+ err = -ENOMEM;
+ goto cleanup;
+ }
+ offset_addrs = offsets + max_offsets;
+
+	/* would have been a variant of gr_gk20a_apply_instmem_overrides,
+	 * recoded in-place instead */
+ ctx_ptr = nvhost_memmgr_mmap(ch_ctx->gr_ctx.mem.ref);
+ if (!ctx_ptr) {
+ err = -ENOMEM;
+ ctx_ptr = NULL;
+ goto cleanup;
+ }
+
+ /* Channel gr_ctx buffer is gpu cacheable; so flush and invalidate.
+ * There should be no on-going/in-flight references by the gpu now. */
+ gk20a_mm_fb_flush(g);
+ gk20a_mm_l2_flush(g, true);
+
+ /* write to appropriate place in context image,
+ * first have to figure out where that really is */
+
+ /* first pass is writes, second reads */
+ for (pass = 0; pass < 2; pass++) {
+ ctx_op_nr = 0;
+ for (i = 0; (ctx_op_nr < num_ctx_ops[pass]) && (i < num_ops); ++i) {
+ u32 num_offsets;
+
+ /* only do ctx ops and only on the right pass */
+ if ((ctx_ops[i].type == REGOP(TYPE_GLOBAL)) ||
+ (((pass == 0) && reg_op_is_read(ctx_ops[i].op)) ||
+ ((pass == 1) && !reg_op_is_read(ctx_ops[i].op))))
+ continue;
+
+ gr_gk20a_get_ctx_buffer_offsets(g,
+ ctx_ops[i].offset,
+ max_offsets,
+ offsets, offset_addrs,
+ &num_offsets,
+ ctx_ops[i].is_quad,
+ ctx_ops[i].quad);
+
+ /* if this is a quad access, setup for special access*/
+ if (ctx_ops[i].is_quad)
+ gr_gk20a_access_smpc_reg(g, ctx_ops[i].quad,
+ ctx_ops[i].offset);
+
+ for (j = 0; j < num_offsets; j++) {
+				/* sanity check: never access outside
+				 * the golden context image */
+ if (offsets[j] >= g->gr.ctx_vars.golden_image_size)
+ continue;
+ if (pass == 0) { /* write pass */
+ v = mem_rd32(ctx_ptr + offsets[j], 0);
+ v &= ~ctx_ops[i].and_n_mask_lo;
+ v |= ctx_ops[i].value_lo;
+ mem_wr32(ctx_ptr + offsets[j], 0, v);
+
+ nvhost_dbg(dbg_gpu_dbg,
+ "context wr: offset=0x%x v=0x%x",
+ offsets[j], v);
+
+ if (ctx_ops[i].op == REGOP(WRITE_64)) {
+ v = mem_rd32(ctx_ptr + offsets[j] + 4, 0);
+ v &= ~ctx_ops[i].and_n_mask_hi;
+ v |= ctx_ops[i].value_hi;
+ mem_wr32(ctx_ptr + offsets[j] + 4, 0, v);
+
+ nvhost_dbg(dbg_gpu_dbg,
+ "context wr: offset=0x%x v=0x%x",
+ offsets[j] + 4, v);
+ }
+
+ /* check to see if we need to add a special WAR
+ for some of the SMPC perf regs */
+ gr_gk20a_ctx_patch_smpc(g, ch_ctx, offset_addrs[j],
+ v, ctx_ptr);
+
+ } else { /* read pass */
+ ctx_ops[i].value_lo =
+ mem_rd32(ctx_ptr + offsets[0], 0);
+
+ nvhost_dbg(dbg_gpu_dbg, "context rd: offset=0x%x v=0x%x",
+ offsets[0], ctx_ops[i].value_lo);
+
+ if (ctx_ops[i].op == REGOP(READ_64)) {
+ ctx_ops[i].value_hi =
+ mem_rd32(ctx_ptr + offsets[0] + 4, 0);
+
+ nvhost_dbg(dbg_gpu_dbg,
+ "context rd: offset=0x%x v=0x%x",
+ offsets[0] + 4, ctx_ops[i].value_hi);
+ } else
+ ctx_ops[i].value_hi = 0;
+ }
+ }
+ ctx_op_nr++;
+ }
+ }
+#if 0
+ /* flush cpu caches for the ctx buffer? only if cpu cached, of course.
+ * they aren't, yet */
+ if (cached) {
+ FLUSH_CPU_DCACHE(ctx_ptr,
+ sg_phys(ch_ctx->gr_ctx.mem.ref), size);
+ }
+#endif
+
+ cleanup:
+	kfree(offsets);
+
+ if (ctx_ptr)
+ nvhost_memmgr_munmap(ch_ctx->gr_ctx.mem.ref, ctx_ptr);
+
+ if (restart_gr_ctxsw) {
+ int tmp_err = gr_gk20a_enable_ctxsw(g);
+ if (tmp_err) {
+ nvhost_err(dev_from_gk20a(g), "unable to restart ctxsw!\n");
+ err = tmp_err;
+ }
+ }
+
+ if (restart_fifo_ctxsw) {
+#if 0
+ fifo_gk20a_enable_fifo_ctxsw(g);
+#endif
+ }
+
+ return err;
+}
diff --git a/drivers/video/tegra/host/gk20a/gr_gk20a.h b/drivers/video/tegra/host/gk20a/gr_gk20a.h
index 7b7cdec2a1d1..b6979f99f1a8 100644
--- a/drivers/video/tegra/host/gk20a/gr_gk20a.h
+++ b/drivers/video/tegra/host/gk20a/gr_gk20a.h
@@ -1,7 +1,5 @@
/*
- * drivers/video/tegra/host/gk20a/gr_gk20a.h
- *
- * GK20A graphics
+ * GK20A Graphics Engine
*
* Copyright (c) 2011-2013, NVIDIA CORPORATION. All rights reserved.
*
@@ -14,9 +12,8 @@
* FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
* more details.
*
- * You should have received a copy of the GNU General Public License along with
- * this program; if not, write to the Free Software Foundation, Inc.,
- * 51 Franklin St - Fifth Floor, Boston, MA 02110-1301 USA.
+ * You should have received a copy of the GNU General Public License
+ * along with this program. If not, see <http://www.gnu.org/licenses/>.
*/
#ifndef __GR_GK20A_H__
#define __GR_GK20A_H__
@@ -31,7 +28,7 @@
#define INVALID_SCREEN_TILE_ROW_OFFSET 0xFFFFFFFF
#define INVALID_MAX_WAYS 0xFFFFFFFF
-enum global_ctx_buffer {
+enum /* global_ctx_buffer */ {
CIRCULAR = 0,
PAGEPOOL = 1,
ATTRIBUTE = 2,
@@ -43,7 +40,7 @@ enum global_ctx_buffer {
};
/* either ATTRIBUTE or ATTRIBUTE_VPR maps to ATTRIBUTE_VA */
-enum global_ctx_buffer_va {
+enum /* global_ctx_buffer_va */ {
CIRCULAR_VA = 0,
PAGEPOOL_VA = 1,
ATTRIBUTE_VA = 2,
@@ -316,5 +313,15 @@ void gr_gk20a_init_elcg_mode(struct gk20a *g, u32 mode, u32 engine);
int gk20a_gr_suspend(struct gk20a *g);
+struct nvhost_dbg_gpu_reg_op;
+int gr_gk20a_exec_ctx_ops(struct channel_gk20a *ch,
+ struct nvhost_dbg_gpu_reg_op *ctx_ops, u32 num_ops,
+ u32 num_ctx_wr_ops, u32 num_ctx_rd_ops);
+int gr_gk20a_get_ctx_buffer_offsets(struct gk20a *g,
+ u32 addr,
+ u32 max_offsets,
+ u32 *offsets, u32 *offset_addrs,
+ u32 *num_offsets,
+ bool is_quad, u32 quad);
#endif /*__GR_GK20A_H__*/
diff --git a/drivers/video/tegra/host/gk20a/gr_pri_gk20a.h b/drivers/video/tegra/host/gk20a/gr_pri_gk20a.h
new file mode 100644
index 000000000000..a82a1ee7caa8
--- /dev/null
+++ b/drivers/video/tegra/host/gk20a/gr_pri_gk20a.h
@@ -0,0 +1,179 @@
+/*
+ * GK20A Graphics Context Pri Register Addressing
+ *
+ * Copyright (c) 2013, NVIDIA CORPORATION. All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms and conditions of the GNU General Public License,
+ * version 2, as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
+ * more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program. If not, see <http://www.gnu.org/licenses/>.
+ */
+#ifndef _NVHOST_GR_PRI_GK20A_H_
+#define _NVHOST_GR_PRI_GK20A_H_
+
+/*
+ * These convenience helpers are generally for use in the management/modification
+ * of the context state store for gr/compute contexts.
+ */
+
+/*
+ * GPC pri addressing
+ */
+static inline u32 pri_gpccs_addr_width(void)
+{
+	return 15; /* from where? */
+}
+static inline u32 pri_gpccs_addr_mask(u32 addr)
+{
+ return addr & ((1 << pri_gpccs_addr_width()) - 1);
+}
+static inline u32 pri_gpc_addr(u32 addr, u32 gpc)
+{
+ return proj_gpc_base_v() + (gpc * proj_gpc_stride_v()) + addr;
+}
+static inline bool pri_is_gpc_addr_shared(u32 addr)
+{
+ return (addr >= proj_gpc_shared_base_v()) &&
+ (addr < proj_gpc_shared_base_v() + proj_gpc_stride_v());
+}
+static inline bool pri_is_gpc_addr(u32 addr)
+{
+ return ((addr >= proj_gpc_base_v()) &&
+ (addr < proj_gpc_base_v() +
+ proj_scal_litter_num_gpcs_v() * proj_gpc_stride_v())) ||
+ pri_is_gpc_addr_shared(addr);
+}
+static inline u32 pri_get_gpc_num(u32 addr)
+{
+ u32 i, start;
+ u32 num_gpcs = proj_scal_litter_num_gpcs_v();
+
+ for (i = 0; i < num_gpcs; i++) {
+ start = proj_gpc_base_v() + (i * proj_gpc_stride_v());
+ if ((addr >= start) && (addr < (start + proj_gpc_stride_v())))
+ return i;
+ }
+ return 0;
+}
+/*
+ * TPC pri addressing
+ */
+static inline u32 pri_tpccs_addr_width(void)
+{
+ return 11; /* from where? */
+}
+static inline u32 pri_tpccs_addr_mask(u32 addr)
+{
+ return addr & ((1 << pri_tpccs_addr_width()) - 1);
+}
+static inline u32 pri_tpc_addr(u32 addr, u32 gpc, u32 tpc)
+{
+ return proj_gpc_base_v() + (gpc * proj_gpc_stride_v()) +
+ proj_tpc_in_gpc_base_v() + (tpc * proj_tpc_in_gpc_stride_v()) +
+ addr;
+}
+static inline bool pri_is_tpc_addr_shared(u32 addr)
+{
+ return (addr >= proj_tpc_in_gpc_shared_base_v()) &&
+ (addr < (proj_tpc_in_gpc_shared_base_v() +
+ proj_tpc_in_gpc_stride_v()));
+}
+static inline bool pri_is_tpc_addr(u32 addr)
+{
+ return ((addr >= proj_tpc_in_gpc_base_v()) &&
+ (addr < proj_tpc_in_gpc_base_v() + (proj_scal_litter_num_tpc_per_gpc_v() *
+ proj_tpc_in_gpc_stride_v())))
+ ||
+ pri_is_tpc_addr_shared(addr);
+}
+static inline u32 pri_get_tpc_num(u32 addr)
+{
+ u32 i, start;
+ u32 num_tpcs = proj_scal_litter_num_tpc_per_gpc_v();
+
+ for (i = 0; i < num_tpcs; i++) {
+ start = proj_tpc_in_gpc_base_v() + (i * proj_tpc_in_gpc_stride_v());
+ if ((addr >= start) && (addr < (start + proj_tpc_in_gpc_stride_v())))
+ return i;
+ }
+ return 0;
+}
+
+/*
+ * BE pri addressing
+ */
+static inline u32 pri_becs_addr_width(void)
+{
+	return 10; /* from where? */
+}
+static inline u32 pri_becs_addr_mask(u32 addr)
+{
+ return addr & ((1 << pri_becs_addr_width()) - 1);
+}
+static inline bool pri_is_be_addr_shared(u32 addr)
+{
+ return (addr >= proj_rop_shared_base_v()) &&
+ (addr < proj_rop_shared_base_v() + proj_rop_stride_v());
+}
+static inline u32 pri_be_shared_addr(u32 addr)
+{
+ return proj_rop_shared_base_v() + pri_becs_addr_mask(addr);
+}
+static inline bool pri_is_be_addr(u32 addr)
+{
+ return ((addr >= proj_rop_base_v()) &&
+ (addr < proj_rop_base_v()+proj_scal_litter_num_fbps_v() * proj_rop_stride_v())) ||
+ pri_is_be_addr_shared(addr);
+}
+
+static inline u32 pri_get_be_num(u32 addr)
+{
+ u32 i, start;
+ u32 num_fbps = proj_scal_litter_num_fbps_v();
+ for (i = 0; i < num_fbps; i++) {
+ start = proj_rop_base_v() + (i * proj_rop_stride_v());
+ if ((addr >= start) && (addr < (start + proj_rop_stride_v())))
+ return i;
+ }
+ return 0;
+}
+
+/*
+ * PPC pri addressing
+ */
+static inline u32 pri_ppccs_addr_width(void)
+{
+ return 9; /* from where? */
+}
+static inline u32 pri_ppccs_addr_mask(u32 addr)
+{
+ return addr & ((1 << pri_ppccs_addr_width()) - 1);
+}
+static inline u32 pri_ppc_addr(u32 addr, u32 gpc, u32 ppc)
+{
+ return proj_gpc_base_v() + (gpc * proj_gpc_stride_v()) +
+ proj_ppc_in_gpc_base_v() + (ppc * proj_ppc_in_gpc_stride_v()) + addr;
+}
+
+enum ctxsw_addr_type {
+ CTXSW_ADDR_TYPE_SYS = 0,
+ CTXSW_ADDR_TYPE_GPC = 1,
+ CTXSW_ADDR_TYPE_TPC = 2,
+ CTXSW_ADDR_TYPE_BE = 3,
+ CTXSW_ADDR_TYPE_PPC = 4
+};
+
+#define PRI_BROADCAST_FLAGS_NONE 0
+#define PRI_BROADCAST_FLAGS_GPC BIT(0)
+#define PRI_BROADCAST_FLAGS_TPC BIT(1)
+#define PRI_BROADCAST_FLAGS_BE BIT(2)
+#define PRI_BROADCAST_FLAGS_PPC BIT(3)
+
+#endif /*_NVHOST_GR_PRI_GK20A_H_ */
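(Small usage sketch, not part of the header: the helpers above compose so that any TPC-relative address can be rebuilt as the unicast address of the same register in a specific gpc/tpc, which is essentially what the SMPC handling in gr_gk20a.c does with the proj_* strides. The function name below is hypothetical.)

/* Illustrative only. */
static inline u32 example_tpc_unicast_addr(u32 addr, u32 gpc, u32 tpc)
{
	/* strip the GPC and TPC bases down to a TPC-local offset ... */
	u32 tpc_local = pri_tpccs_addr_mask(pri_gpccs_addr_mask(addr));

	/* ... then re-add the bases for the requested gpc/tpc pair */
	return pri_tpc_addr(tpc_local, gpc, tpc);
}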
diff --git a/drivers/video/tegra/host/gk20a/mm_gk20a.h b/drivers/video/tegra/host/gk20a/mm_gk20a.h
index 03eb05ef946d..4db90c9b80bf 100644
--- a/drivers/video/tegra/host/gk20a/mm_gk20a.h
+++ b/drivers/video/tegra/host/gk20a/mm_gk20a.h
@@ -78,6 +78,7 @@ struct userd_desc {
struct patch_desc {
struct mem_desc mem;
+ void *cpu_va;
u64 gpu_va;
u32 data_count;
};
diff --git a/drivers/video/tegra/host/gk20a/regops_gk20a.c b/drivers/video/tegra/host/gk20a/regops_gk20a.c
new file mode 100644
index 000000000000..d35f6961ab0b
--- /dev/null
+++ b/drivers/video/tegra/host/gk20a/regops_gk20a.c
@@ -0,0 +1,270 @@
+/*
+ *
+ * Tegra GK20A GPU Debugger Driver Register Ops
+ *
+ * Copyright (c) 2013, NVIDIA CORPORATION. All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms and conditions of the GNU General Public License,
+ * version 2, as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
+ * more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program. If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#include <linux/slab.h>
+#include <linux/err.h>
+#include <linux/nvhost_dbg_gpu_ioctl.h>
+
+#include "dev.h"
+#include "nvhost_hwctx.h"
+/*#include "nvhost_acm.h"*/
+#include "gk20a.h"
+#include "gr_gk20a.h"
+#include "dbg_gpu_gk20a.h"
+#include "regops_gk20a.h"
+
+static bool validate_reg_ops(struct dbg_session_gk20a *dbg_s,
+ u32 *ctx_rd_count, u32 *ctx_wr_count,
+ struct nvhost_dbg_gpu_reg_op *ops,
+ u32 op_count);
+
+
+int exec_regops_gk20a(struct dbg_session_gk20a *dbg_s,
+ struct nvhost_dbg_gpu_reg_op *ops,
+ u64 num_ops)
+{
+ int err = 0, i;
+ struct channel_gk20a *ch = dbg_s->ch;
+ struct gk20a *g = dbg_s->ch->g;
+ /*struct gr_gk20a *gr = &g->gr;*/
+ u32 data32_lo = 0, data32_hi = 0;
+ u32 ctx_rd_count = 0, ctx_wr_count = 0;
+ bool skip_read_lo = false, skip_read_hi = false;
+ bool ok;
+
+ nvhost_dbg(dbg_fn | dbg_gpu_dbg, "");
+
+ ok = validate_reg_ops(dbg_s,
+ &ctx_rd_count, &ctx_wr_count,
+ ops, num_ops);
+ if (!ok) {
+ dev_err(dbg_s->dev, "invalid op(s)");
+ err = -EINVAL;
+ /* each op has its own err/status */
+ goto clean_up;
+ }
+
+ for (i = 0; i < num_ops; i++) {
+ /* if it isn't global then it is done in the ctx ops... */
+ if (ops[i].type != REGOP(TYPE_GLOBAL))
+ continue;
+
+ switch (ops[i].op) {
+
+ case REGOP(READ_32):
+ ops[i].value_hi = 0;
+ ops[i].value_lo = gk20a_readl(g, ops[i].offset);
+ nvhost_dbg(dbg_gpu_dbg, "read_32 0x%08x from 0x%08x",
+ ops[i].value_lo, ops[i].offset);
+
+ break;
+
+ case REGOP(READ_64):
+ ops[i].value_lo = gk20a_readl(g, ops[i].offset);
+ ops[i].value_hi =
+ gk20a_readl(g, ops[i].offset + 4);
+
+ nvhost_dbg(dbg_gpu_dbg, "read_64 0x%08x:%08x from 0x%08x",
+ ops[i].value_hi, ops[i].value_lo,
+ ops[i].offset);
+ break;
+
+ case REGOP(WRITE_32):
+ case REGOP(WRITE_64):
+ /* some of this appears wonky/unnecessary but
+ we've kept it for compat with existing
+ debugger code. just in case... */
+ if (ops[i].and_n_mask_lo == ~(u32)0) {
+ data32_lo = ops[i].value_lo;
+ skip_read_lo = true;
+ }
+
+ if ((ops[i].op == REGOP(WRITE_64)) &&
+ (ops[i].and_n_mask_hi == ~(u32)0)) {
+ data32_hi = ops[i].value_hi;
+ skip_read_hi = true;
+ }
+
+ /* read first 32bits */
+ if (unlikely(skip_read_lo == false)) {
+ data32_lo = gk20a_readl(g, ops[i].offset);
+ data32_lo &= ~ops[i].and_n_mask_lo;
+ data32_lo |= ops[i].value_lo;
+ }
+
+ /* if desired, read second 32bits */
+ if ((ops[i].op == REGOP(WRITE_64)) &&
+ !skip_read_hi) {
+ data32_hi = gk20a_readl(g, ops[i].offset + 4);
+ data32_hi &= ~ops[i].and_n_mask_hi;
+ data32_hi |= ops[i].value_hi;
+ }
+
+ /* now update first 32bits */
+ gk20a_writel(g, ops[i].offset, data32_lo);
+ nvhost_dbg(dbg_gpu_dbg, "Wrote 0x%08x to 0x%08x ",
+ data32_lo, ops[i].offset);
+ /* if desired, update second 32bits */
+ if (ops[i].op == REGOP(WRITE_64)) {
+ gk20a_writel(g, ops[i].offset + 4, data32_hi);
+ nvhost_dbg(dbg_gpu_dbg, "Wrote 0x%08x to 0x%08x ",
+ data32_hi, ops[i].offset + 4);
+
+ }
+
+ break;
+
+ /* shouldn't happen as we've already screened */
+ default:
+ BUG();
+ err = -EINVAL;
+ goto clean_up;
+ break;
+ }
+ }
+
+ if (ctx_wr_count | ctx_rd_count) {
+ err = gr_gk20a_exec_ctx_ops(ch, ops, num_ops,
+ ctx_wr_count, ctx_rd_count);
+ if (err) {
+ dev_warn(dbg_s->dev,
+ "failed to perform ctx ops\n");
+ goto clean_up;
+ }
+ }
+
+ clean_up:
+ nvhost_dbg(dbg_gpu_dbg, "ret=%d", err);
+ return err;
+
+}
+
+static int validate_reg_op_info(struct dbg_session_gk20a *dbg_s,
+ struct nvhost_dbg_gpu_reg_op *op)
+{
+ int err = 0;
+
+ op->status = REGOP(STATUS_SUCCESS);
+
+ switch (op->op) {
+ case REGOP(READ_32):
+ case REGOP(READ_64):
+ case REGOP(WRITE_32):
+ case REGOP(WRITE_64):
+ break;
+ default:
+ op->status |= REGOP(STATUS_UNSUPPORTED_OP);
+ /*nvhost_err(dbg_s->dev, "Invalid regops op %d!", op->op);*/
+ err = -EINVAL;
+ break;
+ }
+
+ switch (op->type) {
+ case REGOP(TYPE_GLOBAL):
+ case REGOP(TYPE_GR_CTX):
+ case REGOP(TYPE_GR_CTX_TPC):
+ case REGOP(TYPE_GR_CTX_SM):
+ case REGOP(TYPE_GR_CTX_CROP):
+ case REGOP(TYPE_GR_CTX_ZROP):
+ case REGOP(TYPE_GR_CTX_QUAD):
+ break;
+ /*
+ case NVHOST_DBG_GPU_REG_OP_TYPE_FB:
+ */
+ default:
+ op->status |= REGOP(STATUS_INVALID_TYPE);
+ /*nvhost_err(dbg_s->dev, "Invalid regops type %d!", op->type);*/
+ err = -EINVAL;
+ break;
+ }
+
+ return err;
+}
+
+static int validate_reg_op_offset(struct dbg_session_gk20a *dbg_s,
+ struct nvhost_dbg_gpu_reg_op *op)
+{
+ int err = 0, temp_err;
+	/* zero-init: the unbound-channel path below never fills these in */
+	u32 buf_offset_lo = 0, buf_offset_addr = 0, num_offsets = 0;
+ bool is_ctx_op = reg_op_is_gr_ctx(op->type);
+
+ op->status = 0;
+ /*TBD: get this size from the register resource directly */
+ if (!is_ctx_op && op->offset >= SZ_16M) {
+ op->status = REGOP(STATUS_INVALID_OFFSET);
+ err = -EINVAL;
+ } else if (is_ctx_op) {
+ if (!dbg_s->ch) {
+ nvhost_err(dbg_s->dev, "can't perform ctx regop unless bound");
+ temp_err = -EINVAL;
+		} else {
+			temp_err = gr_gk20a_get_ctx_buffer_offsets(dbg_s->ch->g,
+					op->offset,
+					1,
+					&buf_offset_lo,
+					&buf_offset_addr,
+					&num_offsets,
+					op->type == REGOP(TYPE_GR_CTX_QUAD),
+					op->quad);
+		}
+ if (temp_err) {
+ op->status |= REGOP(STATUS_INVALID_OFFSET);
+ err = -EINVAL;
+ }
+ if (!buf_offset_lo) {
+ op->status |= REGOP(STATUS_INVALID_OFFSET);
+ err = -EINVAL;
+ }
+ }
+
+ return err;
+}
+
+static bool validate_reg_ops(struct dbg_session_gk20a *dbg_s,
+ u32 *ctx_rd_count, u32 *ctx_wr_count,
+ struct nvhost_dbg_gpu_reg_op *ops,
+ u32 op_count)
+{
+ u32 i;
+ int err;
+ bool ok = true;
+
+ /* keep going until the end so every op can get
+ * a separate error code if needed */
+ for (i = 0; i < op_count; i++) {
+
+ err = validate_reg_op_info(dbg_s, &ops[i]);
+ ok &= !err;
+
+ if (reg_op_is_gr_ctx(ops[i].type)) {
+ if (reg_op_is_read(ops[i].op))
+ (*ctx_rd_count)++;
+ else
+ (*ctx_wr_count)++;
+ }
+
+ err = validate_reg_op_offset(dbg_s, &ops[i]);
+ ok &= !err;
+ }
+
+ nvhost_dbg_fn("ctx_wrs:%d ctx_rds:%d\n", *ctx_wr_count, *ctx_rd_count);
+
+ return ok;
+}
diff --git a/drivers/video/tegra/host/gk20a/regops_gk20a.h b/drivers/video/tegra/host/gk20a/regops_gk20a.h
new file mode 100644
index 000000000000..231882946a08
--- /dev/null
+++ b/drivers/video/tegra/host/gk20a/regops_gk20a.h
@@ -0,0 +1,46 @@
+/*
+ *
+ * Tegra GK20A GPU Debugger Driver Register Ops
+ *
+ * Copyright (c) 2013, NVIDIA CORPORATION. All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms and conditions of the GNU General Public License,
+ * version 2, as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
+ * more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program. If not, see <http://www.gnu.org/licenses/>.
+ */
+#ifndef __REGOPS_GK20A_H_
+#define __REGOPS_GK20A_H_
+
+int exec_regops_gk20a(struct dbg_session_gk20a *dbg_s,
+ struct nvhost_dbg_gpu_reg_op *ops,
+ u64 num_ops);
+
+/* turn seriously unwieldy names -> something shorter */
+#define REGOP(x) NVHOST_DBG_GPU_REG_OP_##x
+
+static inline bool reg_op_is_gr_ctx(u8 type)
+{
+ return type == REGOP(TYPE_GR_CTX) ||
+ type == REGOP(TYPE_GR_CTX_TPC) ||
+ type == REGOP(TYPE_GR_CTX_SM) ||
+ type == REGOP(TYPE_GR_CTX_CROP) ||
+ type == REGOP(TYPE_GR_CTX_ZROP) ||
+ type == REGOP(TYPE_GR_CTX_QUAD);
+}
+static inline bool reg_op_is_read(u8 op)
+{
+ return op == REGOP(READ_32) ||
+	       op == REGOP(READ_64);
+}
+
+#endif /* __REGOPS_GK20A_H_ */
diff --git a/drivers/video/tegra/host/nvhost_channel.h b/drivers/video/tegra/host/nvhost_channel.h
index c919b89a2e5e..a2552cd1ef27 100644
--- a/drivers/video/tegra/host/nvhost_channel.h
+++ b/drivers/video/tegra/host/nvhost_channel.h
@@ -129,4 +129,6 @@ void nvhost_free_channel_internal(struct nvhost_channel *ch,
int nvhost_channel_save_context(struct nvhost_channel *ch);
+struct nvhost_hwctx *nvhost_channel_get_file_hwctx(int fd);
+
#endif
diff --git a/drivers/video/tegra/host/nvhost_hwctx.h b/drivers/video/tegra/host/nvhost_hwctx.h
index 0672571aa32a..43c39387c98f 100644
--- a/drivers/video/tegra/host/nvhost_hwctx.h
+++ b/drivers/video/tegra/host/nvhost_hwctx.h
@@ -1,9 +1,7 @@
/*
- * drivers/video/tegra/host/nvhost_hwctx.h
- *
* Tegra Graphics Host Hardware Context Interface
*
- * Copyright (c) 2010-2013, NVIDIA Corporation.
+ * Copyright (c) 2010-2013, NVIDIA Corporation. All rights reserved.
*
* This program is free software; you can redistribute it and/or modify it
* under the terms and conditions of the GNU General Public License,
@@ -29,6 +27,7 @@
struct nvhost_channel;
struct nvhost_cdma;
struct mem_mgr;
+struct nvhost_dbg_session;
struct nvhost_hwctx {
struct kref ref;
@@ -47,6 +46,7 @@ struct nvhost_hwctx {
struct list_head as_share_bound_list_node;
struct nvhost_as_share *as_share;
+ struct nvhost_dbg_session *dbg_session;
};
struct nvhost_hwctx_handler {
@@ -82,4 +82,5 @@ enum {
#define HWCTX_REGINFO(offset, count, type) {offset, count, HWCTX_REGINFO_##type, offset}
#define HWCTX_REGINFO_RST(offset, count, type, rst) {offset, count, HWCTX_REGINFO_##type, rst}
+
#endif
diff --git a/drivers/video/tegra/host/t124/t124.c b/drivers/video/tegra/host/t124/t124.c
index 75a248591c19..6bd236113e6b 100644
--- a/drivers/video/tegra/host/t124/t124.c
+++ b/drivers/video/tegra/host/t124/t124.c
@@ -443,6 +443,7 @@ struct nvhost_device_data tegra_gk20a_info = {
.can_powergate = true,
.alloc_hwctx_handler = nvhost_gk20a_alloc_hwctx_handler,
.ctrl_ops = &tegra_gk20a_ctrl_ops,
+ .dbg_ops = &tegra_gk20a_dbg_gpu_ops,
.moduleid = NVHOST_MODULE_GPU,
.init = nvhost_gk20a_init,
.deinit = nvhost_gk20a_deinit,
diff --git a/include/linux/nvhost.h b/include/linux/nvhost.h
index 60136ba8ae45..896027702314 100644
--- a/include/linux/nvhost.h
+++ b/include/linux/nvhost.h
@@ -190,7 +190,10 @@ struct nvhost_device_data {
struct cdev ctrl_cdev;
const struct file_operations *ctrl_ops; /* ctrl ops for the module */
- /* void *priv;*/
+ /* module debugger */
+ struct device *dbg_node;
+ struct cdev dbg_cdev;
+ const struct file_operations *dbg_ops;
struct kobject *power_kobj; /* kobject to hold power sysfs entries */
struct nvhost_device_power_attr *power_attrib; /* sysfs attributes */
diff --git a/include/linux/nvhost_dbg_gpu_ioctl.h b/include/linux/nvhost_dbg_gpu_ioctl.h
new file mode 100644
index 000000000000..2866a6d9df2d
--- /dev/null
+++ b/include/linux/nvhost_dbg_gpu_ioctl.h
@@ -0,0 +1,122 @@
+/*
+ * Copyright (c) 2013, NVIDIA CORPORATION. All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms and conditions of the GNU General Public License,
+ * version 2, as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
+ * more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program. If not, see <http://www.gnu.org/licenses/>.
+ */
+#ifndef __LINUX_NVHOST_DBG_GPU_IOCTL_H
+#define __LINUX_NVHOST_DBG_GPU_IOCTL_H
+
+#include <linux/ioctl.h>
+#include <linux/types.h>
+
+#if !defined(__KERNEL__)
+#define __user
+#endif
+
+#define NVHOST_DBG_GPU_IOCTL_MAGIC 'D'
+
+/*
+ * /dev/nvhost-dbg-* devices
+ *
+ * Opening a '/dev/nvhost-dbg-<module_name>' device node creates a new debugger
+ * session. nvhost channels (for the same module) can then be bound to such a
+ * session.
+ *
+ * Once an nvhost channel has been bound to a debugger session it cannot be
+ * bound to another.
+ *
+ * The session remains valid as long as there is an open device file for it,
+ * or any bound nvhost channels.  Once all references to the session are
+ * removed, the session is deleted.
+ *
+ */
+
+/*
+ * Binding/attaching a debugger session to an nvhost gpu channel
+ *
+ * The 'channel_fd' given here is the fd used to allocate the
+ * gpu channel context. To detach/unbind the debugger session
+ * use a channel_fd of -1.
+ *
+ */
+struct nvhost_dbg_gpu_bind_channel_args {
+ __u32 channel_fd; /* in*/
+ __u32 _pad0[1];
+};
+
+#define NVHOST_DBG_GPU_IOCTL_BIND_CHANNEL \
+ _IOWR(NVHOST_DBG_GPU_IOCTL_MAGIC, 1, struct nvhost_dbg_gpu_bind_channel_args)
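(A minimal userspace sketch of the bind step described above. The helper name, the error handling and the origin of the node path and channel fd are assumptions for illustration only.)

#include <fcntl.h>
#include <unistd.h>
#include <sys/ioctl.h>
#include <linux/nvhost_dbg_gpu_ioctl.h>

/* Open a debugger session on a /dev/nvhost-dbg-<module_name> node and bind
 * a gpu channel to it.  Returns the session fd (keep it open: the session
 * lives as long as it, or a bound channel, is referenced), or -1 on error. */
static int bind_gpu_channel_to_dbg_session(const char *dbg_node, int channel_fd)
{
	struct nvhost_dbg_gpu_bind_channel_args args = {
		.channel_fd = channel_fd,  /* fd used to allocate the channel */
	};
	int dbg_fd = open(dbg_node, O_RDWR);

	if (dbg_fd < 0)
		return -1;
	if (ioctl(dbg_fd, NVHOST_DBG_GPU_IOCTL_BIND_CHANNEL, &args) < 0) {
		close(dbg_fd);
		return -1;
	}
	return dbg_fd;
}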
+
+/*
+ * Register operations
+ */
+/* valid op values */
+#define NVHOST_DBG_GPU_REG_OP_READ_32 (0x00000000)
+#define NVHOST_DBG_GPU_REG_OP_WRITE_32 (0x00000001)
+#define NVHOST_DBG_GPU_REG_OP_READ_64 (0x00000002)
+#define NVHOST_DBG_GPU_REG_OP_WRITE_64 (0x00000003)
+/* note: 8b ops are unsupported */
+#define NVHOST_DBG_GPU_REG_OP_READ_08 (0x00000004)
+#define NVHOST_DBG_GPU_REG_OP_WRITE_08 (0x00000005)
+
+/* valid type values */
+#define NVHOST_DBG_GPU_REG_OP_TYPE_GLOBAL (0x00000000)
+#define NVHOST_DBG_GPU_REG_OP_TYPE_GR_CTX (0x00000001)
+#define NVHOST_DBG_GPU_REG_OP_TYPE_GR_CTX_TPC (0x00000002)
+#define NVHOST_DBG_GPU_REG_OP_TYPE_GR_CTX_SM (0x00000004)
+#define NVHOST_DBG_GPU_REG_OP_TYPE_GR_CTX_CROP (0x00000008)
+#define NVHOST_DBG_GPU_REG_OP_TYPE_GR_CTX_ZROP (0x00000010)
+/*#define NVHOST_DBG_GPU_REG_OP_TYPE_FB (0x00000020)*/
+#define NVHOST_DBG_GPU_REG_OP_TYPE_GR_CTX_QUAD (0x00000040)
+
+/* valid status values */
+#define NVHOST_DBG_GPU_REG_OP_STATUS_SUCCESS (0x00000000)
+#define NVHOST_DBG_GPU_REG_OP_STATUS_INVALID_OP (0x00000001)
+#define NVHOST_DBG_GPU_REG_OP_STATUS_INVALID_TYPE (0x00000002)
+#define NVHOST_DBG_GPU_REG_OP_STATUS_INVALID_OFFSET (0x00000004)
+#define NVHOST_DBG_GPU_REG_OP_STATUS_UNSUPPORTED_OP (0x00000008)
+#define NVHOST_DBG_GPU_REG_OP_STATUS_INVALID_MASK (0x00000010)
+
+struct nvhost_dbg_gpu_reg_op {
+ __u8 op;
+ __u8 type;
+ __u8 status;
+ __u8 quad;
+ __u8 is_quad;
+ __u8 _pad0[3];
+ __u32 group_mask;
+ __u32 sub_group_mask;
+ __u32 offset;
+ __u32 value_hi;
+ __u32 value_lo;
+ __u32 and_n_mask_hi;
+ __u32 and_n_mask_lo;
+ __u32 _pad1[1];
+};
+
+struct nvhost_dbg_gpu_exec_reg_ops_args {
+ __u64 ops; /* pointer to nvhost_reg_op operations */
+ __u32 num_ops;
+ __u32 _pad0[1];
+};
+
+#define NVHOST_DBG_GPU_IOCTL_REG_OPS \
+ _IOWR(NVHOST_DBG_GPU_IOCTL_MAGIC, 2, struct nvhost_dbg_gpu_exec_reg_ops_args)
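(And a matching sketch of a single global 32-bit read through the reg-ops ioctl, assuming the driver copies the ops array back with status and value_lo filled in, which is what the read path in regops_gk20a.c computes. The helper is hypothetical and error handling is minimal.)

#include <stdint.h>
#include <sys/ioctl.h>
#include <linux/nvhost_dbg_gpu_ioctl.h>

static int dbg_gpu_read32(int dbg_fd, uint32_t offset, uint32_t *value)
{
	struct nvhost_dbg_gpu_reg_op op = {
		.op     = NVHOST_DBG_GPU_REG_OP_READ_32,
		.type   = NVHOST_DBG_GPU_REG_OP_TYPE_GLOBAL,
		.offset = offset,
	};
	struct nvhost_dbg_gpu_exec_reg_ops_args args = {
		.ops     = (uintptr_t)&op,	/* user pointer to the op array */
		.num_ops = 1,
	};

	if (ioctl(dbg_fd, NVHOST_DBG_GPU_IOCTL_REG_OPS, &args) < 0)
		return -1;
	if (op.status != NVHOST_DBG_GPU_REG_OP_STATUS_SUCCESS)
		return -1;	/* each op carries its own status */

	*value = op.value_lo;
	return 0;
}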
+
+
+#define NVHOST_DBG_GPU_IOCTL_LAST \
+ _IOC_NR(NVHOST_DBG_GPU_IOCTL_REG_OPS)
+#define NVHOST_DBG_GPU_IOCTL_MAX_ARG_SIZE \
+ sizeof(struct nvhost_dbg_gpu_exec_reg_ops_args)
+
+#endif