author     Ken Adams <kadams@nvidia.com>          2013-09-17 12:55:54 -0400
committer  Dan Willemsen <dwillemsen@nvidia.com>  2013-09-27 12:53:49 -0700
commit     d55049b57a338403afe3a0e8d93ee83a9d63007d (patch)
tree       dd82aefcd9924e43d63c25d80e23a832d84b80de
parent     409be5d3c52b2a6cd6a843d91f8fbf63f4d3b42b (diff)
video: tegra: host: module debugger framework
Framework and implementation of a gk20a
debugger/profiler session interface.

Also adds work toward optimized handling of
context patch write sequences. These introduce
cpu map/unmap operations and gpu l2 invalidates;
unless we take care to coalesce them, they occur
*per write*.
Change-Id: I8afc11a6f6782b80996404acbd01bffe9653ebdd
Signed-off-by: Ken Adams <kadams@nvidia.com>
Reviewed-on: http://git-master/r/274416
22 files changed, 2631 insertions, 134 deletions
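
The coalescing described in the message above is implemented in this patch as an
explicit bracket: callers map the patch buffer once with
gr_gk20a_ctx_patch_write_begin(), issue any number of patch writes, then unmap and
perform a single gpu l2 invalidate in gr_gk20a_ctx_patch_write_end(). A minimal
sketch of the caller-side pattern (the helper and register names are taken from the
diff below; the data values are illustrative only):

/* Sketch of a coalesced patch-write sequence. Without the begin/end
 * bracket, each gr_gk20a_ctx_patch_write() call would cpu-map, write,
 * unmap and l2-invalidate on its own -- i.e. once *per write*.
 */
static int patch_two_regs_example(struct gk20a *g,
				  struct channel_ctx_gk20a *ch_ctx)
{
	int err;

	err = gr_gk20a_ctx_patch_write_begin(g, ch_ctx); /* one cpu map */
	if (err)
		return err;

	/* any number of writes against the now-mapped patch buffer */
	gr_gk20a_ctx_patch_write(g, ch_ctx, gr_ds_tga_constraintlogic_r(),
				 0x0 /* illustrative data */, true);
	gr_gk20a_ctx_patch_write(g, ch_ctx, gr_pd_ab_dist_cfg1_r(),
				 0x0 /* illustrative data */, true);

	/* one unmap + one gpu l2 invalidate for the whole sequence */
	return gr_gk20a_ctx_patch_write_end(g, ch_ctx);
}
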
diff --git a/drivers/video/tegra/host/bus_client.c b/drivers/video/tegra/host/bus_client.c index 7e1a4c829445..34e88f20881e 100644 --- a/drivers/video/tegra/host/bus_client.c +++ b/drivers/video/tegra/host/bus_client.c @@ -1,6 +1,4 @@ /* - * drivers/video/tegra/host/bus_client.c - * * Tegra Graphics Host Client Module * * Copyright (c) 2010-2013, NVIDIA Corporation. All rights reserved. @@ -217,7 +215,7 @@ static int nvhost_channelopen(struct inode *inode, struct file *filp) } filp->private_data = priv; priv->ch = ch; - if(nvhost_module_add_client(ch->dev, priv)) + if (nvhost_module_add_client(ch->dev, priv)) goto fail; if (ch->ctxhandler && ch->ctxhandler->alloc) { @@ -1108,9 +1106,10 @@ int nvhost_client_user_init(struct platform_device *dev) struct nvhost_channel *ch = pdata->channel; BUG_ON(!ch); - // reserve 3 minor #s for <dev> and as-<dev> and ctrl-<dev> + /* reserve 4 minor #s for <dev> and as-<dev>, ctrl-<dev> + * and dbg-<dev> */ - err = alloc_chrdev_region(&devno, 0, 3, IFACE_NAME); + err = alloc_chrdev_region(&devno, 0, 4, IFACE_NAME); if (err < 0) { dev_err(&dev->dev, "failed to allocate devno\n"); goto fail; @@ -1135,6 +1134,16 @@ int nvhost_client_user_init(struct platform_device *dev) goto fail; } + if (pdata->dbg_ops) { + ++devno; + pdata->dbg_node = nvhost_client_device_create(dev, + &pdata->dbg_cdev, "dbg-", + devno, pdata->dbg_ops); + if (pdata->dbg_node == NULL) + goto fail; + } + + return 0; fail: return err; diff --git a/drivers/video/tegra/host/bus_client.h b/drivers/video/tegra/host/bus_client.h index 07bc7104d283..db3e228e8eec 100644 --- a/drivers/video/tegra/host/bus_client.h +++ b/drivers/video/tegra/host/bus_client.h @@ -55,6 +55,4 @@ nvhost_client_request_firmware(struct platform_device *dev, int nvhost_client_device_get_resources(struct platform_device *dev); -struct nvhost_hwctx *nvhost_channel_get_file_hwctx(int fd); - #endif diff --git a/drivers/video/tegra/host/dev.h b/drivers/video/tegra/host/dev.h index 77330c3b0d05..107b1beaa0ba 100644 --- a/drivers/video/tegra/host/dev.h +++ b/drivers/video/tegra/host/dev.h @@ -39,7 +39,7 @@ void nvhost_device_list_remove(struct platform_device *pdev); #else /* manually enable and turn it on the mask */ /*#define NVHOST_DEBUG*/ - #define NVHOST_DEFAULT_DBG_MASK (dbg_info) + #define NVHOST_DEFAULT_DBG_MASK (dbg_err|dbg_info) #endif enum nvhost_dbg_categories { @@ -52,6 +52,7 @@ enum nvhost_dbg_categories { dbg_pmu = BIT(6), /* gk20a pmu */ dbg_clk = BIT(7), /* gk20a clk */ dbg_map = BIT(8), /* mem mappings */ + dbg_gpu_dbg = BIT(9), /* gpu debugger */ dbg_mem = BIT(31), /* memory accesses, very verbose */ }; diff --git a/drivers/video/tegra/host/gk20a/Makefile b/drivers/video/tegra/host/gk20a/Makefile index c22d74696389..2d7b9a524c67 100644 --- a/drivers/video/tegra/host/gk20a/Makefile +++ b/drivers/video/tegra/host/gk20a/Makefile @@ -11,6 +11,8 @@ nvhost-gk20a-objs = \ channel_gk20a.o \ cdma_gk20a.o \ debug_gk20a.o \ + dbg_gpu_gk20a.o \ + regops_gk20a.o \ gr_gk20a.o \ kind_gk20a.o \ mm_gk20a.o \ diff --git a/drivers/video/tegra/host/gk20a/channel_gk20a.c b/drivers/video/tegra/host/gk20a/channel_gk20a.c index d509510742be..6c584c448811 100644 --- a/drivers/video/tegra/host/gk20a/channel_gk20a.c +++ b/drivers/video/tegra/host/gk20a/channel_gk20a.c @@ -1495,6 +1495,7 @@ int gk20a_init_channel_support(struct gk20a *g, u32 chid) #if defined(CONFIG_TEGRA_GPU_CYCLE_STATS) mutex_init(&c->cyclestate.cyclestate_buffer_mutex); #endif + mutex_init(&c->dbg_s_lock); return 0; } diff --git 
a/drivers/video/tegra/host/gk20a/channel_gk20a.h b/drivers/video/tegra/host/gk20a/channel_gk20a.h index 5ade025d2a48..dca69aea6f01 100644 --- a/drivers/video/tegra/host/gk20a/channel_gk20a.h +++ b/drivers/video/tegra/host/gk20a/channel_gk20a.h @@ -30,6 +30,7 @@ struct gk20a; struct gr_gk20a; struct mem_mgr; struct mem_handle; +struct dbg_session_gk20a; #include "nvhost_channel.h" #include "nvhost_hwctx.h" @@ -129,6 +130,8 @@ struct channel_gk20a { struct mutex cyclestate_buffer_mutex; } cyclestate; #endif + struct mutex dbg_s_lock; + struct dbg_session_gk20a *dbg_s; }; static inline bool gk20a_channel_as_bound(struct channel_gk20a *ch) diff --git a/drivers/video/tegra/host/gk20a/dbg_gpu_gk20a.c b/drivers/video/tegra/host/gk20a/dbg_gpu_gk20a.c new file mode 100644 index 000000000000..a4744e64e614 --- /dev/null +++ b/drivers/video/tegra/host/gk20a/dbg_gpu_gk20a.c @@ -0,0 +1,368 @@ +/* + * Tegra GK20A GPU Debugger Driver + * + * Copyright (c) 2013, NVIDIA CORPORATION. All rights reserved. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms and conditions of the GNU General Public License, + * version 2, as published by the Free Software Foundation. + * + * This program is distributed in the hope it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for + * more details. + * + * You should have received a copy of the GNU General Public License + * along with this program. If not, see <http://www.gnu.org/licenses/>. + */ + +#include <linux/fs.h> +#include <linux/file.h> +#include <linux/cdev.h> +#include <linux/uaccess.h> +#include <linux/nvhost.h> +#include <linux/nvhost_dbg_gpu_ioctl.h> + +#include "dev.h" +#include "nvhost_hwctx.h" +#include "nvhost_acm.h" +#include "gk20a.h" +#include "gr_gk20a.h" +#include "gk20a_gating_reglist.h" +#include "dbg_gpu_gk20a.h" +#include "regops_gk20a.h" + +struct dbg_gpu_session_ops dbg_gpu_session_ops_gk20a = { + .exec_reg_ops = exec_regops_gk20a +}; + +/* silly allocator - just increment session id */ +static atomic_t session_id = ATOMIC_INIT(0); +static int generate_session_id(void) +{ + return atomic_add_return(1, &session_id); +} + +static int alloc_session(struct dbg_session_gk20a **_dbg_s) +{ + struct dbg_session_gk20a *dbg_s; + *_dbg_s = NULL; + + nvhost_dbg_fn(""); + + dbg_s = kzalloc(sizeof(*dbg_s), GFP_KERNEL); + if (!dbg_s) + return -ENOMEM; + + dbg_s->id = generate_session_id(); + dbg_s->ops = &dbg_gpu_session_ops_gk20a; + *_dbg_s = dbg_s; + return 0; +} + +int gk20a_dbg_gpu_dev_open(struct inode *inode, struct file *filp) +{ + struct dbg_session_gk20a *dbg_session; + struct nvhost_device_data *pdata; + struct platform_device *pdev; + struct device *dev; + + int err; + + pdata = container_of(inode->i_cdev, + struct nvhost_device_data, dbg_cdev); + pdev = pdata->pdev; + dev = &pdev->dev; + + nvhost_dbg(dbg_fn | dbg_gpu_dbg, "dbg session: %s", dev_name(dev)); + + err = alloc_session(&dbg_session); + if (err) + return err; + + filp->private_data = dbg_session; + dbg_session->pdata = pdata; + dbg_session->pdev = pdev; + dbg_session->dev = dev; + + return 0; +} + +static int dbg_unbind_channel_gk20a(struct dbg_session_gk20a *dbg_s) +{ + struct channel_gk20a *ch_gk20a = dbg_s->ch; + struct gk20a *g = dbg_s->ch->g; + + nvhost_dbg_fn(""); + + /* wasn't bound to start with ? 
*/ + if (!ch_gk20a) { + nvhost_dbg(dbg_gpu_dbg | dbg_fn, "not bound already?"); + return -ENODEV; + } + + mutex_lock(&g->dbg_sessions_lock); + mutex_lock(&ch_gk20a->dbg_s_lock); + + if (--g->dbg_sessions == 0) { + /* restore (can) powergate, clk state */ + /* release pending exceptions to fault/be handled as usual */ + /*TBD: ordering of these? */ + g->elcg_enabled = true; + gr_gk20a_init_elcg_mode(g, ELCG_AUTO, ENGINE_GR_GK20A); + gr_gk20a_init_elcg_mode(g, ELCG_AUTO, ENGINE_CE2_GK20A); + + gr_gk20a_blcg_gr_load_gating_prod(g, g->blcg_enabled); + /* ??? gr_gk20a_pg_gr_load_gating_prod(g, true); */ + + gr_gk20a_slcg_gr_load_gating_prod(g, g->slcg_enabled); + gr_gk20a_slcg_perf_load_gating_prod(g, g->slcg_enabled); + + gk20a_pmu_enable_elpg(g); + + nvhost_dbg(dbg_gpu_dbg | dbg_fn, "module idle"); + nvhost_module_idle(dbg_s->pdev); + } + + ch_gk20a->dbg_s = NULL; + dbg_s->ch = NULL; + fput(dbg_s->hwctx_f); + dbg_s->hwctx_f = NULL; + + mutex_unlock(&ch_gk20a->dbg_s_lock); + mutex_unlock(&g->dbg_sessions_lock); + + return 0; +} + +int gk20a_dbg_gpu_dev_release(struct inode *inode, struct file *filp) +{ + struct dbg_session_gk20a *dbg_s = filp->private_data; + + nvhost_dbg(dbg_gpu_dbg | dbg_fn, "%s", dev_name(dbg_s->dev)); + + /* unbind if it was bound */ + if (!dbg_s->ch) + return 0; + dbg_unbind_channel_gk20a(dbg_s); + + kfree(dbg_s); + return 0; +} + +static int dbg_bind_channel_gk20a(struct dbg_session_gk20a *dbg_s, + struct nvhost_dbg_gpu_bind_channel_args *args) +{ + struct file *f; + struct nvhost_hwctx *hwctx; + struct gk20a *g; + struct channel_gk20a *ch_gk20a; + + nvhost_dbg(dbg_fn|dbg_gpu_dbg, "%s fd=%d", + dev_name(dbg_s->dev), args->channel_fd); + + if (args->channel_fd == ~0) + return dbg_unbind_channel_gk20a(dbg_s); + + /* even though get_file_hwctx is doing this it releases it as well */ + /* by holding it here we'll keep it from disappearing while the + * debugger is in session */ + f = fget(args->channel_fd); + if (!f) + return -ENODEV; + + hwctx = nvhost_channel_get_file_hwctx(args->channel_fd); + if (!hwctx) { + nvhost_dbg_fn("no hwctx found for fd"); + fput(f); + return -EINVAL; + } + /* be sure this is actually the right type of hwctx */ + if (hwctx->channel->dev != dbg_s->pdev) { + nvhost_dbg_fn("hwctx module type mismatch"); + fput(f); + return -EINVAL; + } + if (!hwctx->priv) { + nvhost_dbg_fn("no priv"); + fput(f); + return -ENODEV; + } + + ch_gk20a = (struct channel_gk20a *)hwctx->priv; + g = ch_gk20a->g; + nvhost_dbg_fn("%s hwchid=%d", dev_name(dbg_s->dev), ch_gk20a->hw_chid); + + mutex_lock(&g->dbg_sessions_lock); + mutex_lock(&ch_gk20a->dbg_s_lock); + + if (ch_gk20a->dbg_s) { + mutex_unlock(&ch_gk20a->dbg_s_lock); + mutex_unlock(&g->dbg_sessions_lock); + fput(f); + nvhost_dbg_fn("hwctx already in dbg session"); + return -EBUSY; + } + + dbg_s->hwctx_f = f; + dbg_s->ch = ch_gk20a; + ch_gk20a->dbg_s = dbg_s; + + if (g->dbg_sessions++ == 0) { + u32 curr = gk20a_clk_get_rate(g); + + /* save off current powergate, clk state. + * set gpu module's can_powergate = 0. + * set gpu module's clk to max. + * while *a* debug session is active there will be no power or + * clocking state changes allowed from mainline code (but they + * should be saved). + */ + nvhost_module_busy(dbg_s->pdev); + + gr_gk20a_slcg_gr_load_gating_prod(g, false); + gr_gk20a_slcg_perf_load_gating_prod(g, false); + + gr_gk20a_blcg_gr_load_gating_prod(g, false); + /* ??? 
gr_gk20a_pg_gr_load_gating_prod(g, false); */ + /* TBD: would rather not change elcg_enabled here */ + g->elcg_enabled = false; + gr_gk20a_init_elcg_mode(g, ELCG_RUN, ENGINE_GR_GK20A); + gr_gk20a_init_elcg_mode(g, ELCG_RUN, ENGINE_CE2_GK20A); + + gk20a_pmu_disable_elpg(g); + + } + mutex_unlock(&ch_gk20a->dbg_s_lock); + mutex_unlock(&g->dbg_sessions_lock); + return 0; +} + +static int nvhost_ioctl_channel_reg_ops(struct dbg_session_gk20a *dbg_s, + struct nvhost_dbg_gpu_exec_reg_ops_args *args); + +long gk20a_dbg_gpu_dev_ioctl(struct file *filp, unsigned int cmd, + unsigned long arg) +{ + struct dbg_session_gk20a *dbg_s = filp->private_data; + struct gk20a *g = get_gk20a(dbg_s->pdev); + u8 buf[NVHOST_DBG_GPU_IOCTL_MAX_ARG_SIZE]; + int err = 0; + + nvhost_dbg_fn(""); + + if ((_IOC_TYPE(cmd) != NVHOST_DBG_GPU_IOCTL_MAGIC) || + (_IOC_NR(cmd) == 0) || + (_IOC_NR(cmd) > NVHOST_DBG_GPU_IOCTL_LAST)) + return -EFAULT; + + BUG_ON(_IOC_SIZE(cmd) > NVHOST_DBG_GPU_IOCTL_MAX_ARG_SIZE); + + if (_IOC_DIR(cmd) & _IOC_WRITE) { + if (copy_from_user(buf, (void __user *)arg, _IOC_SIZE(cmd))) + return -EFAULT; + } + + switch (cmd) { + case NVHOST_DBG_GPU_IOCTL_BIND_CHANNEL: + err = dbg_bind_channel_gk20a(dbg_s, + (struct nvhost_dbg_gpu_bind_channel_args *)buf); + nvhost_dbg(dbg_gpu_dbg, "ret=%d", err); + break; + + case NVHOST_DBG_GPU_IOCTL_REG_OPS: + err = nvhost_ioctl_channel_reg_ops(dbg_s, + (struct nvhost_dbg_gpu_exec_reg_ops_args *)buf); + nvhost_dbg(dbg_gpu_dbg, "ret=%d", err); + break; + + default: + nvhost_err(dev_from_gk20a(g), + "unrecognized dbg gpu ioctl cmd: 0x%x", + cmd); + err = -ENOTTY; + break; + } + + if ((err == 0) && (_IOC_DIR(cmd) & _IOC_READ)) + err = copy_to_user((void __user *)arg, + buf, _IOC_SIZE(cmd)); + + return err; +} + +/* In order to perform a context relative op the context has + * to be created already... which would imply that the + * context switch mechanism has already been put in place. + * So by the time we perform such an opertation it should always + * be possible to query for the appropriate context offsets, etc. + * + * But note: while the dbg_gpu bind requires the a channel fd with + * a bound hwctx it doesn't require an allocated gr/compute obj + * at that point... so just having the bound hwctx doesn't work + * to guarantee this. 
+ */ +static bool gr_context_info_available(struct dbg_session_gk20a *dbg_s, + struct gr_gk20a *gr) +{ + int err; + + mutex_lock(&gr->ctx_mutex); + err = !gr->ctx_vars.golden_image_initialized; + mutex_unlock(&gr->ctx_mutex); + if (err) + return false; + return true; + +} + +static int nvhost_ioctl_channel_reg_ops(struct dbg_session_gk20a *dbg_s, + struct nvhost_dbg_gpu_exec_reg_ops_args *args) +{ + int err; + struct device *dev = dbg_s->dev; + struct gk20a *g = get_gk20a(dbg_s->pdev); + struct nvhost_dbg_gpu_reg_op *ops; + u64 ops_size = sizeof(ops[0]) * args->num_ops; + + nvhost_dbg_fn("%d ops, total size %llu", args->num_ops, ops_size); + + if (!dbg_s->ops) { + nvhost_err(dev, "can't call reg_ops on an unbound debugger session"); + return -EINVAL; + } + + /* be sure that ctx info is in place */ + if (!gr_context_info_available(dbg_s, &g->gr)) { + nvhost_err(dev, "gr context data not available\n"); + return -ENODEV; + } + + ops = kzalloc(ops_size, GFP_KERNEL); + if (!ops) { + nvhost_err(dev, "Allocating memory failed!"); + return -ENOMEM; + } + + nvhost_dbg_fn("Copying regops from userspace"); + + if (copy_from_user(ops, (void *)(uintptr_t)args->ops, ops_size)) { + dev_err(dev, "copy_from_user failed!"); + return -EFAULT; + } + + err = dbg_s->ops->exec_reg_ops(dbg_s, ops, args->num_ops); + + if (err) { + nvhost_err(dev, "dbg regops failed"); + return err; + } + + nvhost_dbg_fn("Copying result to userspace"); + + if (copy_to_user((void *)(uintptr_t)args->ops, ops, ops_size)) { + dev_err(dev, "copy_to_user failed!"); + return -EFAULT; + } + return 0; +} diff --git a/drivers/video/tegra/host/gk20a/dbg_gpu_gk20a.h b/drivers/video/tegra/host/gk20a/dbg_gpu_gk20a.h new file mode 100644 index 000000000000..48958b3f5eee --- /dev/null +++ b/drivers/video/tegra/host/gk20a/dbg_gpu_gk20a.h @@ -0,0 +1,51 @@ +/* + * Tegra GK20A GPU Debugger Driver + * + * Copyright (c) 2013, NVIDIA CORPORATION. All rights reserved. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms and conditions of the GNU General Public License, + * version 2, as published by the Free Software Foundation. + * + * This program is distributed in the hope it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for + * more details. + * + * You should have received a copy of the GNU General Public License + * along with this program. If not, see <http://www.gnu.org/licenses/>. 
+ */ +#ifndef __DBG_GPU_GK20A_H_ +#define __DBG_GPU_GK20A_H_ + +/* module debug driver interface */ +int gk20a_dbg_gpu_dev_release(struct inode *inode, struct file *filp); +int gk20a_dbg_gpu_dev_open(struct inode *inode, struct file *filp); +long gk20a_dbg_gpu_dev_ioctl(struct file *filp, unsigned int cmd, unsigned long arg); + +struct dbg_gpu_session_ops { + int (*exec_reg_ops)(struct dbg_session_gk20a *dbg_s, + struct nvhost_dbg_gpu_reg_op *ops, + u64 num_ops); +}; + +struct dbg_session_gk20a { + /* dbg session id used for trace/prints */ + int id; + + /* gpu module vagaries */ + struct device *dev; + struct platform_device *pdev; + struct nvhost_device_data *pdata; + + /* bound hwctx and channel */ + struct file *hwctx_f; + struct channel_gk20a *ch; + + /* session operations */ + struct dbg_gpu_session_ops *ops; +}; + +extern struct dbg_gpu_session_ops dbg_gpu_session_ops_gk20a; + +#endif /* __DBG_GPU_GK20A_H_ */ diff --git a/drivers/video/tegra/host/gk20a/gk20a.c b/drivers/video/tegra/host/gk20a/gk20a.c index 8be8f4bd3ff2..f564a151155c 100644 --- a/drivers/video/tegra/host/gk20a/gk20a.c +++ b/drivers/video/tegra/host/gk20a/gk20a.c @@ -49,6 +49,7 @@ #include "hw_sim_gk20a.h" #include "gk20a_scale.h" #include "gr3d/pod_scaling.h" +#include "dbg_gpu_gk20a.h" #include "../../../../../arch/arm/mach-tegra/iomap.h" @@ -89,6 +90,17 @@ const struct file_operations tegra_gk20a_ctrl_ops = { .unlocked_ioctl = gk20a_ctrl_dev_ioctl, }; +const struct file_operations tegra_gk20a_dbg_gpu_ops = { + .owner = THIS_MODULE, + .release = gk20a_dbg_gpu_dev_release, + .open = gk20a_dbg_gpu_dev_open, + .unlocked_ioctl = gk20a_dbg_gpu_dev_ioctl, +#ifdef CONFIG_COMPAT + .compat_ioctl = gk20a_dbg_gpu_dev_ioctl, +#endif + +}; + static inline void sim_writel(struct gk20a *g, u32 r, u32 v) { writel(v, g->sim.regs+r); @@ -532,6 +544,8 @@ int nvhost_init_gk20a_support(struct platform_device *dev) goto fail; } + mutex_init(&g->dbg_sessions_lock); + /* nvhost_as alloc_share can be called before gk20a is powered on. It requires mm sw states configured so init mm sw early here. */ err = gk20a_init_mm_setup_sw(g); diff --git a/drivers/video/tegra/host/gk20a/gk20a.h b/drivers/video/tegra/host/gk20a/gk20a.h index 4add3dff4fb4..066b7aaae788 100644 --- a/drivers/video/tegra/host/gk20a/gk20a.h +++ b/drivers/video/tegra/host/gk20a/gk20a.h @@ -95,6 +95,12 @@ struct gk20a { struct dentry *debugfs_timeouts_enabled; struct dentry *debugfs_gr_idle_timeout_default; #endif + + /* held while manipulating # of debug sessions present */ + /* also prevents debug sessions from attaching until released */ + struct mutex dbg_sessions_lock; + int dbg_sessions; /* number attached */ + void (*remove_support)(struct platform_device *); struct notifier_block system_suspend_notifier; @@ -248,6 +254,7 @@ int clk_gk20a_debugfs_init(struct platform_device *dev); #endif extern const struct file_operations tegra_gk20a_ctrl_ops; +extern const struct file_operations tegra_gk20a_dbg_gpu_ops; struct nvhost_hwctx_handler *nvhost_gk20a_alloc_hwctx_handler(u32 syncpt, u32 waitbase, struct nvhost_channel *ch); diff --git a/drivers/video/tegra/host/gk20a/gr_ctx_gk20a.h b/drivers/video/tegra/host/gk20a/gr_ctx_gk20a.h index ab403df84b51..909a166ae9c3 100644 --- a/drivers/video/tegra/host/gk20a/gr_ctx_gk20a.h +++ b/drivers/video/tegra/host/gk20a/gr_ctx_gk20a.h @@ -1,9 +1,7 @@ /* - * drivers/video/tegra/host/gk20a/gr_ctx_gk20a.h - * * GK20A Graphics Context * - * Copyright (c) 2011, NVIDIA CORPORATION. All rights reserved. 
+ * Copyright (c) 2011-2013, NVIDIA CORPORATION. All rights reserved. * * This program is free software; you can redistribute it and/or modify it * under the terms and conditions of the GNU General Public License, @@ -14,9 +12,8 @@ * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for * more details. * - * You should have received a copy of the GNU General Public License along with - * this program; if not, write to the Free Software Foundation, Inc., - * 51 Franklin St - Fifth Floor, Boston, MA 02110-1301 USA. + * You should have received a copy of the GNU General Public License + * along with this program. If not, see <http://www.gnu.org/licenses/>. */ #ifndef __GR_CTX_GK20A_H__ #define __GR_CTX_GK20A_H__ diff --git a/drivers/video/tegra/host/gk20a/gr_gk20a.c b/drivers/video/tegra/host/gk20a/gr_gk20a.c index 45f9392f9d95..b526e31abf5a 100644 --- a/drivers/video/tegra/host/gk20a/gr_gk20a.c +++ b/drivers/video/tegra/host/gk20a/gr_gk20a.c @@ -24,6 +24,7 @@ #include <linux/scatterlist.h> #include <linux/nvmap.h> #include <linux/tegra-soc.h> +#include <linux/nvhost_dbg_gpu_ioctl.h> #include "../dev.h" @@ -49,10 +50,14 @@ #include "chip_support.h" #include "nvhost_memmgr.h" #include "gk20a_gating_reglist.h" +#include "gr_pri_gk20a.h" +#include "regops_gk20a.h" + + static int gr_gk20a_commit_inst(struct channel_gk20a *c, u64 gpu_va); -static int gr_gk20a_ctx_patch_write(struct gk20a *g, struct channel_gk20a *c, - u32 addr, u32 data, u32 patch); +static int gr_gk20a_ctx_patch_write(struct gk20a *g, struct channel_ctx_gk20a *ch_ctx, + u32 addr, u32 data, bool patch); /* global ctx buffer */ static int gr_gk20a_alloc_global_ctx_buffers(struct gk20a *g); @@ -433,35 +438,92 @@ static int gr_gk20a_ctx_wait_ucode(struct gk20a *g, u32 mailbox_id, return 0; } -int gr_gk20a_submit_fecs_method(struct gk20a *g, - u32 mb_id, u32 mb_data, u32 mb_clr, - u32 mtd_data, u32 mtd_adr, u32 *mb_ret, - u32 opc_ok, u32 mb_ok, u32 opc_fail, u32 mb_fail) +/* The following is a less brittle way to call gr_gk20a_submit_fecs_method(...) + * We should replace most, if not all, fecs method calls to this instead. 
*/ +struct fecs_method_op_gk20a { + struct { + u32 addr; + u32 data; + } method; + + struct { + u32 id; + u32 data; + u32 clr; + u32 *ret; + u32 ok; + u32 fail; + } mailbox; + + struct { + u32 ok; + u32 fail; + } cond; + +}; + +int gr_gk20a_submit_fecs_method_op(struct gk20a *g, + struct fecs_method_op_gk20a op) { struct gr_gk20a *gr = &g->gr; int ret; mutex_lock(&gr->fecs_mutex); - if (mb_id != 0) - gk20a_writel(g, gr_fecs_ctxsw_mailbox_r(mb_id), - mb_data); + if (op.mailbox.id != 0) + gk20a_writel(g, gr_fecs_ctxsw_mailbox_r(op.mailbox.id), + op.mailbox.data); gk20a_writel(g, gr_fecs_ctxsw_mailbox_clear_r(0), - gr_fecs_ctxsw_mailbox_clear_value_f(mb_clr)); + gr_fecs_ctxsw_mailbox_clear_value_f(op.mailbox.clr)); - gk20a_writel(g, gr_fecs_method_data_r(), mtd_data); + gk20a_writel(g, gr_fecs_method_data_r(), op.method.data); gk20a_writel(g, gr_fecs_method_push_r(), - gr_fecs_method_push_adr_f(mtd_adr)); + gr_fecs_method_push_adr_f(op.method.addr)); + + /* op.mb.id == 4 cases require waiting for completion on + * for op.mb.id == 0 */ + if (op.mailbox.id == 4) + op.mailbox.id = 0; - ret = gr_gk20a_ctx_wait_ucode(g, 0, mb_ret, - opc_ok, mb_ok, opc_fail, mb_fail); + ret = gr_gk20a_ctx_wait_ucode(g, op.mailbox.id, op.mailbox.ret, + op.cond.ok, op.mailbox.ok, + op.cond.fail, op.mailbox.fail); mutex_unlock(&gr->fecs_mutex); return ret; } +int gr_gk20a_ctrl_ctxsw(struct gk20a *g, u32 fecs_method, u32 *ret) +{ + return gr_gk20a_submit_fecs_method_op(g, + (struct fecs_method_op_gk20a) { + .method.addr = fecs_method, + .method.data = ~0, + .mailbox = { .id = 1, /*sideband?*/ + .data = ~0, .clr = ~0, .ret = ret, + .ok = gr_fecs_ctxsw_mailbox_value_pass_v(), + .fail = gr_fecs_ctxsw_mailbox_value_fail_v(), }, + .cond.ok = GR_IS_UCODE_OP_EQUAL, + .cond.fail = GR_IS_UCODE_OP_EQUAL }); +} + +/* Stop processing (stall) context switches at FECS */ +int gr_gk20a_disable_ctxsw(struct gk20a *g) +{ + nvhost_dbg(dbg_fn | dbg_gpu_dbg, ""); + return gr_gk20a_ctrl_ctxsw(g, gr_fecs_method_push_adr_stop_ctxsw_v(), 0); +} + +/* Start processing (continue) context switches at FECS */ +int gr_gk20a_enable_ctxsw(struct gk20a *g) +{ + nvhost_dbg(dbg_fn | dbg_gpu_dbg, ""); + return gr_gk20a_ctrl_ctxsw(g, gr_fecs_method_push_adr_start_ctxsw_v(), 0); +} + + static int gr_gk20a_commit_inst(struct channel_gk20a *c, u64 gpu_va) { u32 addr_lo; @@ -504,33 +566,92 @@ clean_up: return ret; } -static int gr_gk20a_ctx_patch_write(struct gk20a *g, struct channel_gk20a *c, - u32 addr, u32 data, u32 patch) +/* + * Context state can be written directly or "patched" at times. + * So that code can be used in either situation it is written + * using a series _ctx_patch_write(..., patch) statements. + * However any necessary cpu map/unmap and gpu l2 invalidates + * should be minimized (to avoid doing it once per patch write). + * Before a sequence of these set up with "_ctx_patch_write_begin" + * and close with "_ctx_patch_write_end." + */ +static int gr_gk20a_ctx_patch_write_begin(struct gk20a *g, + struct channel_ctx_gk20a *ch_ctx) +{ + /* being defensive still... */ + if (ch_ctx->patch_ctx.cpu_va) { + nvhost_err(dev_from_gk20a(g), "nested ctx patch begin?"); + return -EBUSY; + } + + ch_ctx->patch_ctx.cpu_va = + nvhost_memmgr_mmap(ch_ctx->patch_ctx.mem.ref); + + if (!ch_ctx->patch_ctx.cpu_va) + return -ENOMEM; + + return 0; +} + +static int gr_gk20a_ctx_patch_write_end(struct gk20a *g, + struct channel_ctx_gk20a *ch_ctx) +{ + /* being defensive still... 
*/ + if (!ch_ctx->patch_ctx.cpu_va) { + nvhost_err(dev_from_gk20a(g), "dangling ctx patch end?"); + return -EINVAL; + } + + nvhost_memmgr_munmap(ch_ctx->patch_ctx.mem.ref, + ch_ctx->patch_ctx.cpu_va); + ch_ctx->patch_ctx.cpu_va = NULL; + + gk20a_mm_l2_invalidate(g); + return 0; +} + +static int gr_gk20a_ctx_patch_write(struct gk20a *g, + struct channel_ctx_gk20a *ch_ctx, + u32 addr, u32 data, bool patch) { - struct channel_ctx_gk20a *ch_ctx; u32 patch_slot = 0; void *patch_ptr = NULL; + bool mapped_here = false; - BUG_ON(patch != 0 && c == NULL); + BUG_ON(patch != 0 && ch_ctx == NULL); if (patch) { - ch_ctx = &c->ch_ctx; - patch_ptr = nvhost_memmgr_mmap(ch_ctx->patch_ctx.mem.ref); - if (!patch_ptr) - return -ENOMEM; + if (!ch_ctx) + return -EINVAL; + /* we added an optimization prolog, epilog + * to get rid of unnecessary maps and l2 invals. + * but be defensive still... */ + if (!ch_ctx->patch_ctx.cpu_va) { + int err; + nvhost_err(dev_from_gk20a(g), + "per-write ctx patch begin?"); + /* yes, gr_gk20a_ctx_patch_smpc causes this one */ + err = gr_gk20a_ctx_patch_write_begin(g, ch_ctx); + if (err) + return err; + mapped_here = true; + } else { + mapped_here = false; + patch_ptr = ch_ctx->patch_ctx.cpu_va; + } patch_slot = ch_ctx->patch_ctx.data_count * 2; mem_wr32(patch_ptr, patch_slot++, addr); mem_wr32(patch_ptr, patch_slot++, data); - nvhost_memmgr_munmap(ch_ctx->patch_ctx.mem.ref, patch_ptr); - gk20a_mm_l2_invalidate(g); - ch_ctx->patch_ctx.data_count++; - } else { + + if (mapped_here) + gr_gk20a_ctx_patch_write_end(g, ch_ctx); + + } else gk20a_writel(g, addr, data); - } return 0; } @@ -545,12 +666,19 @@ static int gr_gk20a_fecs_ctx_bind_channel(struct gk20a *g, nvhost_dbg_info("bind channel %d inst ptr 0x%08x", c->hw_chid, inst_base_ptr); - ret = gr_gk20a_submit_fecs_method(g, 0, 0, 0x30, - gr_fecs_current_ctx_ptr_f(inst_base_ptr) | - gr_fecs_current_ctx_target_vid_mem_f() | - gr_fecs_current_ctx_valid_f(1), - gr_fecs_method_push_adr_bind_pointer_v(), - 0, GR_IS_UCODE_OP_AND, 0x10, GR_IS_UCODE_OP_AND, 0x20); + ret = gr_gk20a_submit_fecs_method_op(g, + (struct fecs_method_op_gk20a) { + .method.addr = gr_fecs_method_push_adr_bind_pointer_v(), + .method.data = (gr_fecs_current_ctx_ptr_f(inst_base_ptr) | + gr_fecs_current_ctx_target_vid_mem_f() | + gr_fecs_current_ctx_valid_f(1)), + .mailbox = { .id = 0, .data = 0, + .clr = 0x30, + .ret = NULL, + .ok = 0x10, + .fail = 0x20, }, + .cond.ok = GR_IS_UCODE_OP_AND, + .cond.fail = GR_IS_UCODE_OP_AND}); if (ret) nvhost_err(dev_from_gk20a(g), "bind channel instance failed"); @@ -621,9 +749,10 @@ clean_up: } static int gr_gk20a_commit_global_cb_manager(struct gk20a *g, - struct channel_gk20a *c, u32 patch) + struct channel_gk20a *c, bool patch) { struct gr_gk20a *gr = &g->gr; + struct channel_ctx_gk20a *ch_ctx = NULL; u32 attrib_offset_in_chunk = 0; u32 alpha_offset_in_chunk = 0; u32 pd_ab_max_output; @@ -633,7 +762,15 @@ static int gr_gk20a_commit_global_cb_manager(struct gk20a *g, nvhost_dbg_fn(""); - gr_gk20a_ctx_patch_write(g, c, gr_ds_tga_constraintlogic_r(), + if (patch) { + int err; + ch_ctx = &c->ch_ctx; + err = gr_gk20a_ctx_patch_write_begin(g, ch_ctx); + if (err) + return err; + } + + gr_gk20a_ctx_patch_write(g, ch_ctx, gr_ds_tga_constraintlogic_r(), gr_ds_tga_constraintlogic_beta_cbsize_f(gr->attrib_cb_default_size) | gr_ds_tga_constraintlogic_alpha_cbsize_f(gr->alpha_cb_default_size), patch); @@ -642,7 +779,7 @@ static int gr_gk20a_commit_global_cb_manager(struct gk20a *g, gr_gpc0_ppc0_cbm_cfg_size_granularity_v()) / 
gr_pd_ab_dist_cfg1_max_output_granularity_v(); - gr_gk20a_ctx_patch_write(g, c, gr_pd_ab_dist_cfg1_r(), + gr_gk20a_ctx_patch_write(g, ch_ctx, gr_pd_ab_dist_cfg1_r(), gr_pd_ab_dist_cfg1_max_output_f(pd_ab_max_output) | gr_pd_ab_dist_cfg1_max_batches_init_f(), patch); @@ -658,7 +795,7 @@ static int gr_gk20a_commit_global_cb_manager(struct gk20a *g, cbm_cfg_size2 = gr->alpha_cb_default_size * gr->pes_tpc_count[ppc_index][gpc_index]; - gr_gk20a_ctx_patch_write(g, c, + gr_gk20a_ctx_patch_write(g, ch_ctx, gr_gpc0_ppc0_cbm_cfg_r() + temp + proj_ppc_in_gpc_stride_v() * ppc_index, gr_gpc0_ppc0_cbm_cfg_timeslice_mode_f(gr->timeslice_mode) | @@ -668,7 +805,7 @@ static int gr_gk20a_commit_global_cb_manager(struct gk20a *g, attrib_offset_in_chunk += gr->attrib_cb_size * gr->pes_tpc_count[ppc_index][gpc_index]; - gr_gk20a_ctx_patch_write(g, c, + gr_gk20a_ctx_patch_write(g, ch_ctx, gr_gpc0_ppc0_cbm_cfg2_r() + temp + proj_ppc_in_gpc_stride_v() * ppc_index, gr_gpc0_ppc0_cbm_cfg2_start_offset_f(alpha_offset_in_chunk) | @@ -679,11 +816,14 @@ static int gr_gk20a_commit_global_cb_manager(struct gk20a *g, } } + if (patch) + gr_gk20a_ctx_patch_write_end(g, ch_ctx); + return 0; } static int gr_gk20a_commit_global_ctx_buffers(struct gk20a *g, - struct channel_gk20a *c, u32 patch) + struct channel_gk20a *c, bool patch) { struct gr_gk20a *gr = &g->gr; struct channel_ctx_gk20a *ch_ctx = &c->ch_ctx; @@ -692,6 +832,12 @@ static int gr_gk20a_commit_global_ctx_buffers(struct gk20a *g, u32 data; nvhost_dbg_fn(""); + if (patch) { + int err; + err = gr_gk20a_ctx_patch_write_begin(g, ch_ctx); + if (err) + return err; + } /* global pagepool buffer */ addr = (u64_lo32(ch_ctx->global_ctx_buffer_va[PAGEPOOL_VA]) >> @@ -708,20 +854,20 @@ static int gr_gk20a_commit_global_ctx_buffers(struct gk20a *g, nvhost_dbg_info("pagepool buffer addr : 0x%016llx, size : %d", addr, size); - gr_gk20a_ctx_patch_write(g, c, gr_scc_pagepool_base_r(), + gr_gk20a_ctx_patch_write(g, ch_ctx, gr_scc_pagepool_base_r(), gr_scc_pagepool_base_addr_39_8_f(addr), patch); - gr_gk20a_ctx_patch_write(g, c, gr_scc_pagepool_r(), + gr_gk20a_ctx_patch_write(g, ch_ctx, gr_scc_pagepool_r(), gr_scc_pagepool_total_pages_f(size) | gr_scc_pagepool_valid_true_f(), patch); - gr_gk20a_ctx_patch_write(g, c, gr_gpcs_gcc_pagepool_base_r(), + gr_gk20a_ctx_patch_write(g, ch_ctx, gr_gpcs_gcc_pagepool_base_r(), gr_gpcs_gcc_pagepool_base_addr_39_8_f(addr), patch); - gr_gk20a_ctx_patch_write(g, c, gr_gpcs_gcc_pagepool_r(), + gr_gk20a_ctx_patch_write(g, ch_ctx, gr_gpcs_gcc_pagepool_r(), gr_gpcs_gcc_pagepool_total_pages_f(size), patch); - gr_gk20a_ctx_patch_write(g, c, gr_pd_pagepool_r(), + gr_gk20a_ctx_patch_write(g, ch_ctx, gr_pd_pagepool_r(), gr_pd_pagepool_total_pages_f(size) | gr_pd_pagepool_valid_true_f(), patch); @@ -736,17 +882,17 @@ static int gr_gk20a_commit_global_ctx_buffers(struct gk20a *g, nvhost_dbg_info("bundle cb addr : 0x%016llx, size : %d", addr, size); - gr_gk20a_ctx_patch_write(g, c, gr_scc_bundle_cb_base_r(), + gr_gk20a_ctx_patch_write(g, ch_ctx, gr_scc_bundle_cb_base_r(), gr_scc_bundle_cb_base_addr_39_8_f(addr), patch); - gr_gk20a_ctx_patch_write(g, c, gr_scc_bundle_cb_size_r(), + gr_gk20a_ctx_patch_write(g, ch_ctx, gr_scc_bundle_cb_size_r(), gr_scc_bundle_cb_size_div_256b_f(size) | gr_scc_bundle_cb_size_valid_true_f(), patch); - gr_gk20a_ctx_patch_write(g, c, gr_gpcs_setup_bundle_cb_base_r(), + gr_gk20a_ctx_patch_write(g, ch_ctx, gr_gpcs_setup_bundle_cb_base_r(), gr_gpcs_setup_bundle_cb_base_addr_39_8_f(addr), patch); - gr_gk20a_ctx_patch_write(g, c, 
gr_gpcs_setup_bundle_cb_size_r(), + gr_gk20a_ctx_patch_write(g, ch_ctx, gr_gpcs_setup_bundle_cb_size_r(), gr_gpcs_setup_bundle_cb_size_div_256b_f(size) | gr_gpcs_setup_bundle_cb_size_valid_true_f(), patch); @@ -760,7 +906,7 @@ static int gr_gk20a_commit_global_ctx_buffers(struct gk20a *g, nvhost_dbg_info("bundle cb token limit : %d, state limit : %d", gr->bundle_cb_token_limit, data); - gr_gk20a_ctx_patch_write(g, c, gr_pd_ab_dist_cfg2_r(), + gr_gk20a_ctx_patch_write(g, ch_ctx, gr_pd_ab_dist_cfg2_r(), gr_pd_ab_dist_cfg2_token_limit_f(gr->bundle_cb_token_limit) | gr_pd_ab_dist_cfg2_state_limit_f(data), patch); @@ -772,20 +918,24 @@ static int gr_gk20a_commit_global_ctx_buffers(struct gk20a *g, nvhost_dbg_info("attrib cb addr : 0x%016llx", addr); - gr_gk20a_ctx_patch_write(g, c, gr_gpcs_setup_attrib_cb_base_r(), + gr_gk20a_ctx_patch_write(g, ch_ctx, gr_gpcs_setup_attrib_cb_base_r(), gr_gpcs_setup_attrib_cb_base_addr_39_12_f(addr) | gr_gpcs_setup_attrib_cb_base_valid_true_f(), patch); - gr_gk20a_ctx_patch_write(g, c, gr_gpcs_tpcs_pe_pin_cb_global_base_addr_r(), + gr_gk20a_ctx_patch_write(g, ch_ctx, gr_gpcs_tpcs_pe_pin_cb_global_base_addr_r(), gr_gpcs_tpcs_pe_pin_cb_global_base_addr_v_f(addr) | gr_gpcs_tpcs_pe_pin_cb_global_base_addr_valid_true_f(), patch); + if (patch) + gr_gk20a_ctx_patch_write_end(g, ch_ctx); + return 0; } -static int gr_gk20a_commit_global_timeslice(struct gk20a *g, struct channel_gk20a *c, u32 patch) +static int gr_gk20a_commit_global_timeslice(struct gk20a *g, struct channel_gk20a *c, bool patch) { struct gr_gk20a *gr = &g->gr; + struct channel_ctx_gk20a *ch_ctx = NULL; u32 gpm_pd_cfg; u32 pd_ab_dist_cfg0; u32 ds_debug; @@ -800,6 +950,14 @@ static int gr_gk20a_commit_global_timeslice(struct gk20a *g, struct channel_gk20 ds_debug = gk20a_readl(g, gr_ds_debug_r()); mpc_vtg_debug = gk20a_readl(g, gr_gpcs_tpcs_mpc_vtg_debug_r()); + if (patch) { + int err; + ch_ctx = &c->ch_ctx; + err = gr_gk20a_ctx_patch_write_begin(g, ch_ctx); + if (err) + return err; + } + if (gr->timeslice_mode == gr_gpcs_ppcs_cbm_cfg_timeslice_mode_enable_v()) { pe_vaf = gk20a_readl(g, gr_gpcs_tpcs_pe_vaf_r()); pe_vsc_vpc = gk20a_readl(g, gr_gpcs_tpcs_pes_vsc_vpc_r()); @@ -811,24 +969,27 @@ static int gr_gk20a_commit_global_timeslice(struct gk20a *g, struct channel_gk20 ds_debug = gr_ds_debug_timeslice_mode_enable_f() | ds_debug; mpc_vtg_debug = gr_gpcs_tpcs_mpc_vtg_debug_timeslice_mode_enabled_f() | mpc_vtg_debug; - gr_gk20a_ctx_patch_write(g, c, gr_gpcs_gpm_pd_cfg_r(), gpm_pd_cfg, patch); - gr_gk20a_ctx_patch_write(g, c, gr_gpcs_tpcs_pe_vaf_r(), pe_vaf, patch); - gr_gk20a_ctx_patch_write(g, c, gr_gpcs_tpcs_pes_vsc_vpc_r(), pe_vsc_vpc, patch); - gr_gk20a_ctx_patch_write(g, c, gr_pd_ab_dist_cfg0_r(), pd_ab_dist_cfg0, patch); - gr_gk20a_ctx_patch_write(g, c, gr_ds_debug_r(), ds_debug, patch); - gr_gk20a_ctx_patch_write(g, c, gr_gpcs_tpcs_mpc_vtg_debug_r(), mpc_vtg_debug, patch); + gr_gk20a_ctx_patch_write(g, ch_ctx, gr_gpcs_gpm_pd_cfg_r(), gpm_pd_cfg, patch); + gr_gk20a_ctx_patch_write(g, ch_ctx, gr_gpcs_tpcs_pe_vaf_r(), pe_vaf, patch); + gr_gk20a_ctx_patch_write(g, ch_ctx, gr_gpcs_tpcs_pes_vsc_vpc_r(), pe_vsc_vpc, patch); + gr_gk20a_ctx_patch_write(g, ch_ctx, gr_pd_ab_dist_cfg0_r(), pd_ab_dist_cfg0, patch); + gr_gk20a_ctx_patch_write(g, ch_ctx, gr_ds_debug_r(), ds_debug, patch); + gr_gk20a_ctx_patch_write(g, ch_ctx, gr_gpcs_tpcs_mpc_vtg_debug_r(), mpc_vtg_debug, patch); } else { gpm_pd_cfg = gr_gpcs_gpm_pd_cfg_timeslice_mode_disable_f() | gpm_pd_cfg; pd_ab_dist_cfg0 = 
gr_pd_ab_dist_cfg0_timeslice_enable_dis_f() | pd_ab_dist_cfg0; ds_debug = gr_ds_debug_timeslice_mode_disable_f() | ds_debug; mpc_vtg_debug = gr_gpcs_tpcs_mpc_vtg_debug_timeslice_mode_disabled_f() | mpc_vtg_debug; - gr_gk20a_ctx_patch_write(g, c, gr_gpcs_gpm_pd_cfg_r(), gpm_pd_cfg, patch); - gr_gk20a_ctx_patch_write(g, c, gr_pd_ab_dist_cfg0_r(), pd_ab_dist_cfg0, patch); - gr_gk20a_ctx_patch_write(g, c, gr_ds_debug_r(), ds_debug, patch); - gr_gk20a_ctx_patch_write(g, c, gr_gpcs_tpcs_mpc_vtg_debug_r(), mpc_vtg_debug, patch); + gr_gk20a_ctx_patch_write(g, ch_ctx, gr_gpcs_gpm_pd_cfg_r(), gpm_pd_cfg, patch); + gr_gk20a_ctx_patch_write(g, ch_ctx, gr_pd_ab_dist_cfg0_r(), pd_ab_dist_cfg0, patch); + gr_gk20a_ctx_patch_write(g, ch_ctx, gr_ds_debug_r(), ds_debug, patch); + gr_gk20a_ctx_patch_write(g, ch_ctx, gr_gpcs_tpcs_mpc_vtg_debug_r(), mpc_vtg_debug, patch); } + if (patch) + gr_gk20a_ctx_patch_write_end(g, ch_ctx); + return 0; } @@ -1147,7 +1308,7 @@ static int gr_gk20a_ctx_state_floorsweep(struct gk20a *g) gk20a_writel(g, gr_ds_num_tpc_per_gpc_r(tpc_index), tpc_per_gpc); } - /* grSetupPDMapping stubbed for gk20a */ + /* gr__setup_pd_mapping stubbed for gk20a */ gr_gk20a_setup_rop_mapping(g, gr); gr_gk20a_setup_alpha_beta_tables(g, gr); @@ -1192,13 +1353,22 @@ static int gr_gk20a_fecs_ctx_image_save(struct channel_gk20a *c, u32 save_type) u64_lo32(sg_phys(c->inst_block.mem.sgt->sgl) >> ram_in_base_shift_v()); + nvhost_dbg_fn(""); - ret = gr_gk20a_submit_fecs_method(g, 0, 0, 3, - gr_fecs_current_ctx_ptr_f(inst_base_ptr) | - gr_fecs_current_ctx_target_vid_mem_f() | - gr_fecs_current_ctx_valid_f(1), save_type, 0, - GR_IS_UCODE_OP_AND, 1, GR_IS_UCODE_OP_AND, 2); + ret = gr_gk20a_submit_fecs_method_op(g, + (struct fecs_method_op_gk20a) { + .method.addr = save_type, + .method.data = (gr_fecs_current_ctx_ptr_f(inst_base_ptr) | + gr_fecs_current_ctx_target_vid_mem_f() | + gr_fecs_current_ctx_valid_f(1)), + .mailbox = {.id = 0, .data = 0, .clr = 3, .ret = NULL, + .ok = 1, .fail = 2, + }, + .cond.ok = GR_IS_UCODE_OP_AND, + .cond.fail = GR_IS_UCODE_OP_AND, + }); + if (ret) nvhost_err(dev_from_gk20a(g), "save context image failed"); @@ -1234,7 +1404,7 @@ static int gr_gk20a_init_golden_ctx_image(struct gk20a *g, if (err) goto clean_up; - err = gr_gk20a_commit_global_ctx_buffers(g, c, 0); + err = gr_gk20a_commit_global_ctx_buffers(g, c, false); if (err) goto clean_up; @@ -1367,13 +1537,22 @@ static int gr_gk20a_load_golden_ctx_image(struct gk20a *g, u64_lo32(sg_phys(c->inst_block.mem.sgt->sgl) >> ram_in_base_shift_v()); - ret = gr_gk20a_submit_fecs_method(g, 0, 0, ~0, - gr_fecs_current_ctx_ptr_f(inst_base_ptr) | - gr_fecs_current_ctx_target_vid_mem_f() | - gr_fecs_current_ctx_valid_f(1), - gr_fecs_method_push_adr_restore_golden_v(), 0, - GR_IS_UCODE_OP_EQUAL, gr_fecs_ctxsw_mailbox_value_pass_v(), - GR_IS_UCODE_OP_SKIP, 0); + ret = gr_gk20a_submit_fecs_method_op(g, + (struct fecs_method_op_gk20a) { + .method.data = + (gr_fecs_current_ctx_ptr_f(inst_base_ptr) | + gr_fecs_current_ctx_target_vid_mem_f() | + gr_fecs_current_ctx_valid_f(1)), + .method.addr = + gr_fecs_method_push_adr_restore_golden_v(), + .mailbox = { + .id = 0, .data = 0, + .clr = ~0, .ret = NULL, + .ok = gr_fecs_ctxsw_mailbox_value_pass_v(), + .fail = 0}, + .cond.ok = GR_IS_UCODE_OP_EQUAL, + .cond.fail = GR_IS_UCODE_OP_SKIP}); + if (ret) nvhost_err(dev_from_gk20a(g), "restore context image failed"); @@ -1440,33 +1619,34 @@ static int gr_gk20a_init_ctx_state(struct gk20a *g, struct gr_gk20a *gr) u32 zcull_ctx_image_size = 0; u32 pm_ctx_image_size = 
0; u32 ret; + struct fecs_method_op_gk20a op = { + .mailbox = { .id = 0, .data = 0, + .clr = ~0, .ok = 0, .fail = 0}, + .method.data = 0, + .cond.ok = GR_IS_UCODE_OP_NOT_EQUAL, + .cond.fail = GR_IS_UCODE_OP_SKIP, + }; nvhost_dbg_fn(""); - - ret = gr_gk20a_submit_fecs_method(g, 0, 0, ~0, 0, - gr_fecs_method_push_adr_discover_image_size_v(), - &golden_ctx_image_size, - GR_IS_UCODE_OP_NOT_EQUAL, 0, GR_IS_UCODE_OP_SKIP, 0); + op.method.addr = gr_fecs_method_push_adr_discover_image_size_v(); + op.mailbox.ret = &golden_ctx_image_size; + ret = gr_gk20a_submit_fecs_method_op(g, op); if (ret) { nvhost_err(dev_from_gk20a(g), "query golden image size failed"); return ret; } - - ret = gr_gk20a_submit_fecs_method(g, 0, 0, ~0, 0, - gr_fecs_method_push_adr_discover_zcull_image_size_v(), - &zcull_ctx_image_size, - GR_IS_UCODE_OP_NOT_EQUAL, 0, GR_IS_UCODE_OP_SKIP, 0); + op.method.addr = gr_fecs_method_push_adr_discover_zcull_image_size_v(); + op.mailbox.ret = &zcull_ctx_image_size; + ret = gr_gk20a_submit_fecs_method_op(g, op); if (ret) { nvhost_err(dev_from_gk20a(g), "query zcull ctx image size failed"); return ret; } - - ret = gr_gk20a_submit_fecs_method(g, 0, 0, ~0, 0, - gr_fecs_method_push_adr_discover_pm_image_size_v(), - &pm_ctx_image_size, - GR_IS_UCODE_OP_NOT_EQUAL, 0, GR_IS_UCODE_OP_SKIP, 0); + op.method.addr = gr_fecs_method_push_adr_discover_pm_image_size_v(); + op.mailbox.ret = &pm_ctx_image_size; + ret = gr_gk20a_submit_fecs_method_op(g, op); if (ret) { nvhost_err(dev_from_gk20a(g), "query pm ctx image size failed"); @@ -1943,10 +2123,10 @@ int gk20a_alloc_obj_ctx(struct channel_gk20a *c, goto out; } gr_gk20a_elpg_protected_call(g, - gr_gk20a_commit_global_ctx_buffers(g, c, 1)); + gr_gk20a_commit_global_ctx_buffers(g, c, true)); } - /* init gloden image, ELPG enabled after this is done */ + /* init golden image, ELPG enabled after this is done */ err = gr_gk20a_init_golden_ctx_image(g, c); if (err) { nvhost_err(dev_from_gk20a(g), @@ -3527,8 +3707,6 @@ static int gk20a_init_gr_setup_hw(struct gk20a *g) gk20a_writel(g, sw_ctx_load->l[i].addr, sw_ctx_load->l[i].value); - /* TBD: add gr ctx overrides */ - err = gr_gk20a_wait_idle(g, end_jiffies, GR_IDLE_CHECK_DEFAULT); if (err) goto out; @@ -3541,8 +3719,8 @@ static int gk20a_init_gr_setup_hw(struct gk20a *g) gr_fe_go_idle_timeout_count_disabled_f()); /* override a few ctx state registers */ - gr_gk20a_commit_global_cb_manager(g, NULL, 0); - gr_gk20a_commit_global_timeslice(g, NULL, 0); + gr_gk20a_commit_global_cb_manager(g, NULL, false); + gr_gk20a_commit_global_timeslice(g, NULL, false); /* floorsweep anything left */ gr_gk20a_ctx_state_floorsweep(g); @@ -4328,25 +4506,52 @@ clean_up: int gr_gk20a_fecs_get_reglist_img_size(struct gk20a *g, u32 *size) { BUG_ON(size == NULL); - return gr_gk20a_submit_fecs_method(g, 0, 0, ~0, 1, - gr_fecs_method_push_adr_discover_reglist_image_size_v(), - size, GR_IS_UCODE_OP_NOT_EQUAL, 0, GR_IS_UCODE_OP_SKIP, 0); + return gr_gk20a_submit_fecs_method_op(g, + (struct fecs_method_op_gk20a) { + .mailbox.id = 0, + .mailbox.data = 0, + .mailbox.clr = ~0, + .method.data = 1, + .method.addr = gr_fecs_method_push_adr_discover_reglist_image_size_v(), + .mailbox.ret = size, + .cond.ok = GR_IS_UCODE_OP_NOT_EQUAL, + .mailbox.ok = 0, + .cond.fail = GR_IS_UCODE_OP_SKIP, + .mailbox.fail = 0}); } int gr_gk20a_fecs_set_reglist_bind_inst(struct gk20a *g, phys_addr_t addr) { - return gr_gk20a_submit_fecs_method(g, 4, - gr_fecs_current_ctx_ptr_f(addr >> 12) | - gr_fecs_current_ctx_valid_f(1) | gr_fecs_current_ctx_target_vid_mem_f(), - 
~0, 1, gr_fecs_method_push_adr_set_reglist_bind_instance_v(), - 0, GR_IS_UCODE_OP_EQUAL, 1, GR_IS_UCODE_OP_SKIP, 0); + return gr_gk20a_submit_fecs_method_op(g, + (struct fecs_method_op_gk20a){ + .mailbox.id = 4, + .mailbox.data = (gr_fecs_current_ctx_ptr_f(addr >> 12) | + gr_fecs_current_ctx_valid_f(1) | + gr_fecs_current_ctx_target_vid_mem_f()), + .mailbox.clr = ~0, + .method.data = 1, + .method.addr = gr_fecs_method_push_adr_set_reglist_bind_instance_v(), + .mailbox.ret = NULL, + .cond.ok = GR_IS_UCODE_OP_EQUAL, + .mailbox.ok = 1, + .cond.fail = GR_IS_UCODE_OP_SKIP, + .mailbox.fail = 0}); } int gr_gk20a_fecs_set_reglist_virual_addr(struct gk20a *g, u64 pmu_va) { - return gr_gk20a_submit_fecs_method(g, 4, u64_lo32(pmu_va >> 8), - ~0, 1, gr_fecs_method_push_adr_set_reglist_virtual_address_v(), - 0, GR_IS_UCODE_OP_EQUAL, 1, GR_IS_UCODE_OP_SKIP, 0); + return gr_gk20a_submit_fecs_method_op(g, + (struct fecs_method_op_gk20a) { + .mailbox.id = 4, + .mailbox.data = u64_lo32(pmu_va >> 8), + .mailbox.clr = ~0, + .method.data = 1, + .method.addr = gr_fecs_method_push_adr_set_reglist_virtual_address_v(), + .mailbox.ret = NULL, + .cond.ok = GR_IS_UCODE_OP_EQUAL, + .mailbox.ok = 1, + .cond.fail = GR_IS_UCODE_OP_SKIP, + .mailbox.fail = 0}); } int gk20a_gr_suspend(struct gk20a *g) @@ -4381,3 +4586,1212 @@ int gk20a_gr_suspend(struct gk20a *g) nvhost_dbg_fn("done"); return ret; } + +static int gr_gk20a_find_priv_offset_in_buffer(struct gk20a *g, + u32 addr, + bool is_quad, u32 quad, + u32 *context_buffer, + u32 context_buffer_size, + u32 *priv_offset); + +/* This function will decode a priv address and return the partition type and numbers. */ +int gr_gk20a_decode_priv_addr(struct gk20a *g, u32 addr, + int *addr_type, /* enum ctxsw_addr_type */ + u32 *gpc_num, u32 *tpc_num, u32 *ppc_num, u32 *be_num, + u32 *broadcast_flags) +{ + u32 gpc_addr; + u32 ppc_address; + u32 ppc_broadcast_addr; + + nvhost_dbg(dbg_fn | dbg_gpu_dbg, "addr=0x%x", addr); + + /* setup defaults */ + ppc_address = 0; + ppc_broadcast_addr = 0; + *addr_type = CTXSW_ADDR_TYPE_SYS; + *broadcast_flags = PRI_BROADCAST_FLAGS_NONE; + *gpc_num = 0; + *tpc_num = 0; + *ppc_num = 0; + *be_num = 0; + + if (pri_is_gpc_addr(addr)) { + *addr_type = CTXSW_ADDR_TYPE_GPC; + gpc_addr = pri_gpccs_addr_mask(addr); + if (pri_is_gpc_addr_shared(addr)) { + *addr_type = CTXSW_ADDR_TYPE_GPC; + *broadcast_flags |= PRI_BROADCAST_FLAGS_GPC; + } else + *gpc_num = pri_get_gpc_num(addr); + + if (pri_is_tpc_addr(gpc_addr)) { + *addr_type = CTXSW_ADDR_TYPE_TPC; + if (pri_is_tpc_addr_shared(gpc_addr)) { + *broadcast_flags |= PRI_BROADCAST_FLAGS_TPC; + return 0; + } + *tpc_num = pri_get_tpc_num(gpc_addr); + } + return 0; + } else if (pri_is_be_addr(addr)) { + *addr_type = CTXSW_ADDR_TYPE_BE; + if (pri_is_be_addr_shared(addr)) { + *broadcast_flags |= PRI_BROADCAST_FLAGS_BE; + return 0; + } + *be_num = pri_get_be_num(addr); + return 0; + } else { + *addr_type = CTXSW_ADDR_TYPE_SYS; + return 0; + } + /* PPC!?!?!?! */ + + /*NOTREACHED*/ + return -EINVAL; +} + +static int gr_gk20a_split_ppc_broadcast_addr(struct gk20a *g, u32 addr, + u32 gpc_num, + u32 *priv_addr_table, u32 *t) +{ + u32 ppc_num; + + nvhost_dbg(dbg_fn | dbg_gpu_dbg, "addr=0x%x", addr); + + for (ppc_num = 0; ppc_num < g->gr.pe_count_per_gpc; ppc_num++) + priv_addr_table[(*t)++] = pri_ppc_addr(pri_ppccs_addr_mask(addr), + gpc_num, ppc_num); + + return 0; +} + +/* + * The context buffer is indexed using BE broadcast addresses and GPC/TPC + * unicast addresses. 
This function will convert a BE unicast address to a BE + * broadcast address and split a GPC/TPC broadcast address into a table of + * GPC/TPC addresses. The addresses generated by this function can be + * successfully processed by gr_gk20a_find_priv_offset_in_buffer + */ +static int gr_gk20a_create_priv_addr_table(struct gk20a *g, + u32 addr, + u32 *priv_addr_table, + u32 *num_registers) +{ + int addr_type; /*enum ctxsw_addr_type */ + u32 gpc_num, tpc_num, ppc_num, be_num; + u32 broadcast_flags; + u32 t; + int err; + + t = 0; + *num_registers = 0; + + nvhost_dbg(dbg_fn | dbg_gpu_dbg, "addr=0x%x", addr); + + err = gr_gk20a_decode_priv_addr(g, addr, &addr_type, + &gpc_num, &tpc_num, &ppc_num, &be_num, + &broadcast_flags); + nvhost_dbg(dbg_gpu_dbg, "addr_type = %d", addr_type); + if (err) + return err; + + if ((addr_type == CTXSW_ADDR_TYPE_SYS) || + (addr_type == CTXSW_ADDR_TYPE_BE)) { + /* The BE broadcast registers are included in the compressed PRI + * table. Convert a BE unicast address to a broadcast address + * so that we can look up the offset. */ + if ((addr_type == CTXSW_ADDR_TYPE_BE) && + !(broadcast_flags & PRI_BROADCAST_FLAGS_BE)) + priv_addr_table[t++] = pri_be_shared_addr(addr); + else + priv_addr_table[t++] = addr; + + *num_registers = t; + return 0; + } + + /* The GPC/TPC unicast registers are included in the compressed PRI + * tables. Convert a GPC/TPC broadcast address to unicast addresses so + * that we can look up the offsets. */ + if (broadcast_flags & PRI_BROADCAST_FLAGS_GPC) { + for (gpc_num = 0; gpc_num < g->gr.gpc_count; gpc_num++) { + + if (broadcast_flags & PRI_BROADCAST_FLAGS_TPC) + for (tpc_num = 0; + tpc_num < g->gr.gpc_tpc_count[gpc_num]; + tpc_num++) + priv_addr_table[t++] = + pri_tpc_addr(pri_tpccs_addr_mask(addr), + gpc_num, tpc_num); + + else if (broadcast_flags & PRI_BROADCAST_FLAGS_PPC) { + err = gr_gk20a_split_ppc_broadcast_addr(g, addr, gpc_num, + priv_addr_table, &t); + if (err) + return err; + } else + priv_addr_table[t++] = + pri_gpc_addr(pri_gpccs_addr_mask(addr), + gpc_num); + } + } else { + if (broadcast_flags & PRI_BROADCAST_FLAGS_TPC) + for (tpc_num = 0; + tpc_num < g->gr.gpc_tpc_count[gpc_num]; + tpc_num++) + priv_addr_table[t++] = + pri_tpc_addr(pri_tpccs_addr_mask(addr), + gpc_num, tpc_num); + else if (broadcast_flags & PRI_BROADCAST_FLAGS_PPC) + err = gr_gk20a_split_ppc_broadcast_addr(g, addr, gpc_num, + priv_addr_table, &t); + else + priv_addr_table[t++] = addr; + } + + *num_registers = t; + return 0; +} + +int gr_gk20a_get_ctx_buffer_offsets(struct gk20a *g, + u32 addr, + u32 max_offsets, + u32 *offsets, u32 *offset_addrs, + u32 *num_offsets, + bool is_quad, u32 quad) +{ + u32 i; + u32 priv_offset = 0; + u32 *priv_registers; + u32 num_registers = 0; + int err = 0; + u32 potential_offsets = proj_scal_litter_num_gpcs_v() * + proj_scal_litter_num_tpc_per_gpc_v(); + + nvhost_dbg(dbg_fn | dbg_gpu_dbg, "addr=0x%x", addr); + + /* implementation is crossed-up if either of these happen */ + if (max_offsets > potential_offsets) + return -EINVAL; + + if (!g->gr.ctx_vars.golden_image_initialized) + return -ENODEV; + + priv_registers = kzalloc(sizeof(u32) * potential_offsets, GFP_KERNEL); + if (IS_ERR_OR_NULL(priv_registers)) { + nvhost_dbg_fn("failed alloc for potential_offsets=%d", potential_offsets); + err = PTR_ERR(priv_registers); + goto cleanup; + } + memset(offsets, 0, sizeof(u32) * max_offsets); + memset(offset_addrs, 0, sizeof(u32) * max_offsets); + *num_offsets = 0; + + gr_gk20a_create_priv_addr_table(g, addr, &priv_registers[0], 
&num_registers); + + if ((max_offsets > 1) && (num_registers > max_offsets)) { + err = -EINVAL; + goto cleanup; + } + + if ((max_offsets == 1) && (num_registers > 1)) + num_registers = 1; + + if (!g->gr.ctx_vars.local_golden_image) { + nvhost_dbg_fn("no context switch header info to work with"); + err = -EINVAL; + goto cleanup; + } + + for (i = 0; i < num_registers; i++) { + err = gr_gk20a_find_priv_offset_in_buffer(g, + priv_registers[i], + is_quad, quad, + g->gr.ctx_vars.local_golden_image, + g->gr.ctx_vars.golden_image_size, + &priv_offset); + if (err) { + nvhost_dbg_fn("Could not determine priv_offset for addr:0x%x", + addr); /*, grPriRegStr(addr)));*/ + goto cleanup; + } + + offsets[i] = priv_offset; + offset_addrs[i] = priv_registers[i]; + } + + *num_offsets = num_registers; + + cleanup: + + if (!IS_ERR_OR_NULL(priv_registers)) + kfree(priv_registers); + + return err; +} + +/* Setup some register tables. This looks hacky; our + * register/offset functions are just that, functions. + * So they can't be used as initializers... TBD: fix to + * generate consts at least on an as-needed basis. + */ +static const u32 _num_ovr_perf_regs = 17; +static u32 _ovr_perf_regs[17] = { 0, }; +/* Following are the blocks of registers that the ucode + stores in the extended region.*/ +/* == ctxsw_extended_sm_dsm_perf_counter_register_stride_v() ? */ +static const u32 _num_sm_dsm_perf_regs = 5; +/* == ctxsw_extended_sm_dsm_perf_counter_control_register_stride_v() ?*/ +static const u32 _num_sm_dsm_perf_ctrl_regs = 4; +static u32 _sm_dsm_perf_regs[5]; +static u32 _sm_dsm_perf_ctrl_regs[4]; + +static void init_sm_dsm_reg_info(void) +{ + if (_ovr_perf_regs[0] != 0) + return; + + _ovr_perf_regs[0] = gr_pri_gpc0_tpc0_sm_dsm_perf_counter_control_sel0_r(); + _ovr_perf_regs[1] = gr_pri_gpc0_tpc0_sm_dsm_perf_counter_control_sel1_r(); + _ovr_perf_regs[2] = gr_pri_gpc0_tpc0_sm_dsm_perf_counter_control0_r(); + _ovr_perf_regs[3] = gr_pri_gpc0_tpc0_sm_dsm_perf_counter_control5_r(); + _ovr_perf_regs[4] = gr_pri_gpc0_tpc0_sm_dsm_perf_counter_status1_r(); + _ovr_perf_regs[5] = gr_pri_gpc0_tpc0_sm_dsm_perf_counter0_control_r(); + _ovr_perf_regs[6] = gr_pri_gpc0_tpc0_sm_dsm_perf_counter1_control_r(); + _ovr_perf_regs[7] = gr_pri_gpc0_tpc0_sm_dsm_perf_counter2_control_r(); + _ovr_perf_regs[8] = gr_pri_gpc0_tpc0_sm_dsm_perf_counter3_control_r(); + _ovr_perf_regs[9] = gr_pri_gpc0_tpc0_sm_dsm_perf_counter4_control_r(); + _ovr_perf_regs[10] = gr_pri_gpc0_tpc0_sm_dsm_perf_counter5_control_r(); + _ovr_perf_regs[11] = gr_pri_gpc0_tpc0_sm_dsm_perf_counter6_control_r(); + _ovr_perf_regs[12] = gr_pri_gpc0_tpc0_sm_dsm_perf_counter7_control_r(); + _ovr_perf_regs[13] = gr_pri_gpc0_tpc0_sm_dsm_perf_counter4_r(); + _ovr_perf_regs[14] = gr_pri_gpc0_tpc0_sm_dsm_perf_counter5_r(); + _ovr_perf_regs[15] = gr_pri_gpc0_tpc0_sm_dsm_perf_counter6_r(); + _ovr_perf_regs[16] = gr_pri_gpc0_tpc0_sm_dsm_perf_counter7_r(); + + + _sm_dsm_perf_regs[0] = gr_pri_gpc0_tpc0_sm_dsm_perf_counter_status_r(); + _sm_dsm_perf_regs[1] = gr_pri_gpc0_tpc0_sm_dsm_perf_counter0_r(); + _sm_dsm_perf_regs[2] = gr_pri_gpc0_tpc0_sm_dsm_perf_counter1_r(); + _sm_dsm_perf_regs[3] = gr_pri_gpc0_tpc0_sm_dsm_perf_counter2_r(); + _sm_dsm_perf_regs[4] = gr_pri_gpc0_tpc0_sm_dsm_perf_counter3_r(); + + _sm_dsm_perf_ctrl_regs[0] = gr_pri_gpc0_tpc0_sm_dsm_perf_counter_control1_r(); + _sm_dsm_perf_ctrl_regs[1] = gr_pri_gpc0_tpc0_sm_dsm_perf_counter_control2_r(); + _sm_dsm_perf_ctrl_regs[2] = gr_pri_gpc0_tpc0_sm_dsm_perf_counter_control3_r(); + _sm_dsm_perf_ctrl_regs[3] = 
gr_pri_gpc0_tpc0_sm_dsm_perf_counter_control4_r(); + +} + +/* TBD: would like to handle this elsewhere, at a higher level. + * these are currently constructed in a "test-then-write" style + * which makes it impossible to know externally whether a ctx + * write will actually occur. so later we should put a lazy, + * map-and-hold system in the patch write state */ +int gr_gk20a_ctx_patch_smpc(struct gk20a *g, + struct channel_ctx_gk20a *ch_ctx, + u32 addr, u32 data, + u8 *context) +{ + u32 num_gpc = g->gr.gpc_count; + u32 num_tpc; + u32 tpc, gpc, reg; + u32 chk_addr; + u32 vaddr_lo; + u32 vaddr_hi; + u32 tmp; + + init_sm_dsm_reg_info(); + + nvhost_dbg(dbg_fn | dbg_gpu_dbg, "addr=0x%x", addr); + + for (reg = 0; reg < _num_ovr_perf_regs; reg++) { + for (gpc = 0; gpc < num_gpc; gpc++) { + num_tpc = g->gr.gpc_tpc_count[gpc]; + for (tpc = 0; tpc < num_tpc; tpc++) { + chk_addr = ((proj_gpc_stride_v() * gpc) + + (proj_tpc_in_gpc_stride_v() * tpc) + + _ovr_perf_regs[reg]); + if (chk_addr != addr) + continue; + /* reset the patch count from previous + runs,if ucode has already processed + it */ + tmp = mem_rd32(context + + ctxsw_prog_main_image_patch_count_o(), 0); + + if (!tmp) + ch_ctx->patch_ctx.data_count = 0; + + gr_gk20a_ctx_patch_write(g, ch_ctx, + addr, data, true); + + vaddr_lo = u64_lo32(ch_ctx->patch_ctx.gpu_va); + vaddr_hi = u64_hi32(ch_ctx->patch_ctx.gpu_va); + + mem_wr32(context + + ctxsw_prog_main_image_patch_count_o(), + 0, ch_ctx->patch_ctx.data_count); + mem_wr32(context + + ctxsw_prog_main_image_patch_adr_lo_o(), + 0, vaddr_lo); + mem_wr32(context + + ctxsw_prog_main_image_patch_adr_hi_o(), + 0, vaddr_hi); + + /* we're not caching these on cpu side, + but later watch for it */ + + /* the l2 invalidate in the patch_write + * would be too early for this? 
*/ + gk20a_mm_l2_invalidate(g); + return 0; + } + } + } + + return 0; +} + + +void gr_gk20a_access_smpc_reg(struct gk20a *g, u32 quad, u32 offset) +{ + u32 reg; + u32 quad_ctrl; + u32 half_ctrl; + u32 tpc, gpc; + u32 gpc_tpc_addr; + u32 gpc_tpc_stride; + + nvhost_dbg(dbg_fn | dbg_gpu_dbg, "offset=0x%x", offset); + + gpc = pri_get_gpc_num(offset); + gpc_tpc_addr = pri_gpccs_addr_mask(offset); + tpc = pri_get_tpc_num(gpc_tpc_addr); + + quad_ctrl = quad & 0x1; /* first bit tells us quad */ + half_ctrl = (quad >> 1) & 0x1; /* second bit tells us half */ + + gpc_tpc_stride = gpc * proj_gpc_stride_v() + + tpc * proj_tpc_in_gpc_stride_v(); + gpc_tpc_addr = gr_gpc0_tpc0_sm_halfctl_ctrl_r() + gpc_tpc_stride; + + reg = gk20a_readl(g, gpc_tpc_addr); + reg = set_field(reg, + gr_gpcs_tpcs_sm_halfctl_ctrl_sctl_read_quad_ctl_m(), + quad_ctrl); + + gk20a_writel(g, gpc_tpc_addr, reg); + + gpc_tpc_addr = gr_gpc0_tpc0_sm_debug_sfe_control_r() + gpc_tpc_stride; + reg = gk20a_readl(g, gpc_tpc_addr); + reg = set_field(reg, + gr_gpcs_tpcs_sm_debug_sfe_control_read_half_ctl_m(), + half_ctrl); + gk20a_writel(g, gpc_tpc_addr, reg); +} + +#define ILLEGAL_ID (~0) + +static inline bool check_main_image_header_magic(void *context) +{ + u32 magic = mem_rd32(context + + ctxsw_prog_main_image_magic_value_o(), 0); + nvhost_dbg(dbg_gpu_dbg, "main image magic=0x%x", magic); + return magic == ctxsw_prog_main_image_magic_value_v_value_v(); +} +static inline bool check_local_header_magic(void *context) +{ + u32 magic = mem_rd32(context + + ctxsw_prog_local_magic_value_o(), 0); + nvhost_dbg(dbg_gpu_dbg, "local magic=0x%x", magic); + return magic == ctxsw_prog_local_magic_value_v_value_v(); + +} + +/* most likely dupe of ctxsw_gpccs_header__size_1_v() */ +static inline int ctxsw_prog_ucode_header_size_in_bytes(void) +{ + return 256; +} + +static int gr_gk20a_find_priv_offset_in_ext_buffer(struct gk20a *g, + u32 addr, + bool is_quad, u32 quad, + u32 *context_buffer, + u32 context_buffer_size, + u32 *priv_offset) +{ + u32 i, data32; + u32 gpc_num, tpc_num; + u32 num_gpcs, num_tpcs; + u32 chk_addr; + u32 ext_priv_offset, ext_priv_size; + void *context; + u32 offset_to_segment, offset_to_segment_end; + u32 sm_dsm_perf_reg_id = ILLEGAL_ID; + u32 sm_dsm_perf_ctrl_reg_id = ILLEGAL_ID; + u32 num_ext_gpccs_ext_buffer_segments; + u32 inter_seg_offset; + u32 tpc_gpc_mask = (proj_tpc_in_gpc_stride_v() - 1); + u32 max_tpc_count; + u32 *sm_dsm_perf_ctrl_regs = NULL; + u32 num_sm_dsm_perf_ctrl_regs = 0; + u32 *sm_dsm_perf_regs = NULL; + u32 num_sm_dsm_perf_regs = 0; + u32 buffer_segments_size = 0; + u32 marker_size = 0; + u32 control_register_stride = 0; + u32 perf_register_stride = 0; + + /* Only have TPC registers in extended region, so if not a TPC reg, + then return error so caller can look elsewhere. 
*/ + if (pri_is_gpc_addr(addr)) { + u32 gpc_addr = 0; + gpc_num = pri_get_gpc_num(addr); + gpc_addr = pri_gpccs_addr_mask(addr); + if (pri_is_tpc_addr(gpc_addr)) + tpc_num = pri_get_tpc_num(gpc_addr); + else + return -EINVAL; + + nvhost_dbg_info(" gpc = %d tpc = %d", + gpc_num, tpc_num); + } else + return -EINVAL; + + buffer_segments_size = ctxsw_prog_extended_buffer_segments_size_in_bytes_v(); + /* note below is in words/num_registers */ + marker_size = ctxsw_prog_extended_marker_size_in_bytes_v() >> 2; + + context = context_buffer; + /* sanity check main header */ + if (!check_main_image_header_magic(context)) { + nvhost_err(dev_from_gk20a(g), + "Invalid main header: magic value"); + return -EINVAL; + } + num_gpcs = mem_rd32(context + ctxsw_prog_main_image_num_gpcs_o(), 0); + if (gpc_num >= num_gpcs) { + nvhost_err(dev_from_gk20a(g), + "GPC 0x%08x is greater than total count 0x%08x!\n", + gpc_num, num_gpcs); + return -EINVAL; + } + + data32 = mem_rd32(context + ctxsw_prog_main_extended_buffer_ctl_o(), 0); + ext_priv_size = ctxsw_prog_main_extended_buffer_ctl_size_v(data32); + if (0 == ext_priv_size) { + nvhost_dbg_info(" No extended memory in context buffer"); + return -EINVAL; + } + ext_priv_offset = ctxsw_prog_main_extended_buffer_ctl_offset_v(data32); + + offset_to_segment = ext_priv_offset * ctxsw_prog_ucode_header_size_in_bytes(); + offset_to_segment_end = offset_to_segment + + (ext_priv_size * buffer_segments_size); + + /* check local header magic */ + context += ctxsw_prog_ucode_header_size_in_bytes(); + if (!check_local_header_magic(context)) { + nvhost_err(dev_from_gk20a(g), + "Invalid local header: magic value\n"); + return -EINVAL; + } + + /* + * See if the incoming register address is in the first table of + * registers. We check this by decoding only the TPC addr portion. + * If we get a hit on the TPC bit, we then double check the address + * by computing it from the base gpc/tpc strides. Then make sure + * it is a real match. + */ + num_sm_dsm_perf_regs = _num_sm_dsm_perf_regs; + sm_dsm_perf_regs = _sm_dsm_perf_regs; + perf_register_stride = ctxsw_prog_extended_sm_dsm_perf_counter_register_stride_v(); + + init_sm_dsm_reg_info(); + + for (i = 0; i < num_sm_dsm_perf_regs; i++) { + if ((addr & tpc_gpc_mask) == (sm_dsm_perf_regs[i] & tpc_gpc_mask)) { + sm_dsm_perf_reg_id = i; + + nvhost_dbg_info("register match: 0x%08x", + sm_dsm_perf_regs[i]); + + chk_addr = (proj_gpc_base_v() + + (proj_gpc_stride_v() * gpc_num) + + proj_tpc_in_gpc_base_v() + + (proj_tpc_in_gpc_stride_v() * tpc_num) + + (sm_dsm_perf_regs[sm_dsm_perf_reg_id] & tpc_gpc_mask)); + + if (chk_addr != addr) { + nvhost_err(dev_from_gk20a(g), + "Oops, addr mismatch: 0x%08x != 0x%08x\n", + addr, chk_addr); + return -EINVAL; + } + break; + } + } + + /* Didn't find reg in supported group 1.
+ * so try the second group now */ + num_sm_dsm_perf_ctrl_regs = _num_sm_dsm_perf_ctrl_regs; + sm_dsm_perf_ctrl_regs = _sm_dsm_perf_ctrl_regs; + control_register_stride = ctxsw_prog_extended_sm_dsm_perf_counter_control_register_stride_v(); + + if (ILLEGAL_ID == sm_dsm_perf_reg_id) { + for (i = 0; i < num_sm_dsm_perf_ctrl_regs; i++) { + if ((addr & tpc_gpc_mask) == + (sm_dsm_perf_ctrl_regs[i] & tpc_gpc_mask)) { + sm_dsm_perf_ctrl_reg_id = i; + + nvhost_dbg_info("register match: 0x%08x", + sm_dsm_perf_ctrl_regs[i]); + + chk_addr = (proj_gpc_base_v() + + (proj_gpc_stride_v() * gpc_num) + + proj_tpc_in_gpc_base_v() + + (proj_tpc_in_gpc_stride_v() * tpc_num) + + (sm_dsm_perf_ctrl_regs[sm_dsm_perf_ctrl_reg_id] & + tpc_gpc_mask)); + + if (chk_addr != addr) { + nvhost_err(dev_from_gk20a(g), + "Oops, addr mismatch: 0x%08x != 0x%08x\n", + addr, chk_addr); + return -EINVAL; + + } + + break; + } + } + } + + if ((ILLEGAL_ID == sm_dsm_perf_ctrl_reg_id) && + (ILLEGAL_ID == sm_dsm_perf_reg_id)) + return -EINVAL; + + /* Skip the FECS extended header, nothing there for us now. */ + offset_to_segment += buffer_segments_size; + + /* skip through the GPCCS extended headers until we get to the data for + * our GPC. The size of each gpc extended segment is enough to hold the + * max tpc count for the gpcs, in 256B chunks. + */ + + max_tpc_count = proj_scal_litter_num_tpc_per_gpc_v(); + + num_ext_gpccs_ext_buffer_segments = (u32)((max_tpc_count + 1) / 2); + + offset_to_segment += (num_ext_gpccs_ext_buffer_segments * + buffer_segments_size * gpc_num); + + num_tpcs = g->gr.gpc_tpc_count[gpc_num]; + + /* skip the head marker to start with */ + inter_seg_offset = marker_size; + + if (ILLEGAL_ID != sm_dsm_perf_ctrl_reg_id) { + /* skip over control regs of TPCs before the one we want. + * then skip to the register in this tpc */ + inter_seg_offset = inter_seg_offset + + (tpc_num * control_register_stride) + + sm_dsm_perf_ctrl_reg_id; + } else { + /* skip all the control registers */ + inter_seg_offset = inter_seg_offset + + (num_tpcs * control_register_stride); + + /* skip the marker between control and counter segments */ + inter_seg_offset += marker_size; + + /* skip over counter regs of TPCs before the one we want */ + inter_seg_offset = inter_seg_offset + + (tpc_num * perf_register_stride) * + ctxsw_prog_extended_num_smpc_quadrants_v(); + + /* skip over the register for the quadrants we do not want. + * then skip to the register in this tpc */ + inter_seg_offset = inter_seg_offset + + (perf_register_stride * quad) + + sm_dsm_perf_reg_id; + } + + /* set the offset to the segment offset plus the inter segment offset to + * our register */ + offset_to_segment += (inter_seg_offset * 4); + + /* last sanity check: did we somehow compute an offset outside the + * extended buffer? */ + if (offset_to_segment > offset_to_segment_end) { + nvhost_err(dev_from_gk20a(g), + "Overflow ctxsw buffer!
0x%08x > 0x%08x\n", + offset_to_segment, offset_to_segment_end); + return -EINVAL; + } + + *priv_offset = offset_to_segment; + + return 0; +} + + +static int +gr_gk20a_process_context_buffer_priv_segment(struct gk20a *g, + int addr_type,/* enum ctxsw_addr_type */ + u32 pri_addr, + u32 gpc_num, u32 num_tpcs, + u32 num_ppcs, u32 ppc_mask, + u32 *priv_offset) +{ + u32 i; + u32 address, base_address; + u32 sys_offset, gpc_offset, tpc_offset, ppc_offset; + u32 ppc_num, tpc_num, tpc_addr, gpc_addr, ppc_addr; + struct aiv_gk20a *reg; + + nvhost_dbg(dbg_fn | dbg_gpu_dbg, "pri_addr=0x%x", pri_addr); + + if (!g->gr.ctx_vars.valid) + return -EINVAL; + + /* Process the SYS/BE segment. */ + if ((addr_type == CTXSW_ADDR_TYPE_SYS) || + (addr_type == CTXSW_ADDR_TYPE_BE)) { + for (i = 0; i < g->gr.ctx_vars.ctxsw_regs.sys.count; i++) { + reg = &g->gr.ctx_vars.ctxsw_regs.sys.l[i]; + address = reg->addr; + sys_offset = reg->index; + + if (pri_addr == address) { + *priv_offset = sys_offset; + return 0; + } + } + } + + /* Process the TPC segment. */ + if (addr_type == CTXSW_ADDR_TYPE_TPC) { + for (tpc_num = 0; tpc_num < num_tpcs; tpc_num++) { + for (i = 0; i < g->gr.ctx_vars.ctxsw_regs.tpc.count; i++) { + reg = &g->gr.ctx_vars.ctxsw_regs.tpc.l[i]; + address = reg->addr; + tpc_addr = pri_tpccs_addr_mask(address); + base_address = proj_gpc_base_v() + + (gpc_num * proj_gpc_stride_v()) + + proj_tpc_in_gpc_base_v() + + (tpc_num * proj_tpc_in_gpc_stride_v()); + address = base_address + tpc_addr; + /* + * The data for the TPCs is interleaved in the context buffer. + * Example with num_tpcs = 2 + * 0 1 2 3 4 5 6 7 8 9 10 11 ... + * 0-0 1-0 0-1 1-1 0-2 1-2 0-3 1-3 0-4 1-4 0-5 1-5 ... + */ + tpc_offset = (reg->index * num_tpcs) + (tpc_num * 4); + + if (pri_addr == address) { + *priv_offset = tpc_offset; + return 0; + } + } + } + } + + /* Process the PPC segment. */ + if (addr_type == CTXSW_ADDR_TYPE_PPC) { + for (ppc_num = 0; ppc_num < num_ppcs; ppc_num++) { + for (i = 0; i < g->gr.ctx_vars.ctxsw_regs.ppc.count; i++) { + reg = &g->gr.ctx_vars.ctxsw_regs.ppc.l[i]; + address = reg->addr; + ppc_addr = pri_ppccs_addr_mask(address); + base_address = proj_gpc_base_v() + + (gpc_num * proj_gpc_stride_v()) + + proj_ppc_in_gpc_base_v() + + (ppc_num * proj_ppc_in_gpc_stride_v()); + address = base_address + ppc_addr; + /* + * The data for the PPCs is interleaved in the context buffer. + * Example with numPpcs = 2 + * 0 1 2 3 4 5 6 7 8 9 10 11 ... + * 0-0 1-0 0-1 1-1 0-2 1-2 0-3 1-3 0-4 1-4 0-5 1-5 ... + */ + ppc_offset = (reg->index * num_ppcs) + (ppc_num * 4); + + if (pri_addr == address) { + *priv_offset = ppc_offset; + return 0; + } + } + } + } + + + /* Process the GPC segment. 
*/ + if (addr_type == CTXSW_ADDR_TYPE_GPC) { + for (i = 0; i < g->gr.ctx_vars.ctxsw_regs.gpc.count; i++) { + reg = &g->gr.ctx_vars.ctxsw_regs.gpc.l[i]; + + address = reg->addr; + gpc_addr = pri_gpccs_addr_mask(address); + gpc_offset = reg->index; + + base_address = proj_gpc_base_v() + + (gpc_num * proj_gpc_stride_v()); + address = base_address + gpc_addr; + + if (pri_addr == address) { + *priv_offset = gpc_offset; + return 0; + } + } + } + + return -EINVAL; +} + +static int gr_gk20a_determine_ppc_configuration(struct gk20a *g, + void *context, + u32 *num_ppcs, u32 *ppc_mask, + u32 *reg_ppc_count) +{ + u32 data32; + u32 litter_num_pes_per_gpc = proj_scal_litter_num_pes_per_gpc_v(); + + /* + * if there is only 1 PES_PER_GPC, then we put the PES registers + * in the GPC reglist, so we can't error out if ppc.count == 0 + */ + if ((!g->gr.ctx_vars.valid) || + ((g->gr.ctx_vars.ctxsw_regs.ppc.count == 0) && + (litter_num_pes_per_gpc > 1))) + return -EINVAL; + + data32 = mem_rd32(context + ctxsw_prog_local_image_ppc_info_o(), 0); + + *num_ppcs = ctxsw_prog_local_image_ppc_info_num_ppcs_v(data32); + *ppc_mask = ctxsw_prog_local_image_ppc_info_ppc_mask_v(data32); + + *reg_ppc_count = g->gr.ctx_vars.ctxsw_regs.ppc.count; + + return 0; +} + + + +/* + * This function will return the 32 bit offset for a priv register if it is + * present in the context buffer. + */ +static int gr_gk20a_find_priv_offset_in_buffer(struct gk20a *g, + u32 addr, + bool is_quad, u32 quad, + u32 *context_buffer, + u32 context_buffer_size, + u32 *priv_offset) +{ + struct gr_gk20a *gr = &g->gr; + u32 i, data32; + int err; + int addr_type; /*enum ctxsw_addr_type */ + u32 broadcast_flags; + u32 gpc_num, tpc_num, ppc_num, be_num; + u32 num_gpcs, num_tpcs, num_ppcs; + u32 offset; + u32 sys_priv_offset, gpc_priv_offset; + u32 ppc_mask, reg_list_ppc_count; + void *context; + u32 offset_to_segment; + + nvhost_dbg(dbg_fn | dbg_gpu_dbg, "addr=0x%x", addr); + + err = gr_gk20a_decode_priv_addr(g, addr, &addr_type, + &gpc_num, &tpc_num, &ppc_num, &be_num, + &broadcast_flags); + if (err) + return err; + + context = context_buffer; + if (!check_main_image_header_magic(context)) { + nvhost_err(dev_from_gk20a(g), + "Invalid main header: magic value"); + return -EINVAL; + } + num_gpcs = mem_rd32(context + ctxsw_prog_main_image_num_gpcs_o(), 0); + + /* Parse the FECS local header. */ + context += ctxsw_prog_ucode_header_size_in_bytes(); + if (!check_local_header_magic(context)) { + nvhost_err(dev_from_gk20a(g), + "Invalid FECS local header: magic value\n"); + return -EINVAL; + } + data32 = mem_rd32(context + ctxsw_prog_local_priv_register_ctl_o(), 0); + sys_priv_offset = ctxsw_prog_local_priv_register_ctl_offset_v(data32); + + /* If found in Ext buffer, ok. + * If it failed and we expected to find it there (quad offset) + * then return the error. Otherwise continue on. + */ + err = gr_gk20a_find_priv_offset_in_ext_buffer(g, + addr, is_quad, quad, context_buffer, + context_buffer_size, priv_offset); + if (!err || (err && is_quad)) + return err; + + if ((addr_type == CTXSW_ADDR_TYPE_SYS) || + (addr_type == CTXSW_ADDR_TYPE_BE)) { + /* Find the offset in the FECS segment. 
*/ + offset_to_segment = sys_priv_offset * + ctxsw_prog_ucode_header_size_in_bytes(); + + err = gr_gk20a_process_context_buffer_priv_segment(g, + addr_type, addr, + 0, 0, 0, 0, + &offset); + if (err) + return err; + + *priv_offset = (offset_to_segment + offset); + return 0; + } + + if ((gpc_num + 1) > num_gpcs) { + nvhost_err(dev_from_gk20a(g), + "GPC %d not in this context buffer.\n", + gpc_num); + return -EINVAL; + } + + /* Parse the GPCCS local header(s). */ + for (i = 0; i < num_gpcs; i++) { + context += ctxsw_prog_ucode_header_size_in_bytes(); + if (!check_local_header_magic(context)) { + nvhost_err(dev_from_gk20a(g), + "Invalid GPCCS local header: magic value\n"); + return -EINVAL; + + } + data32 = mem_rd32(context + ctxsw_prog_local_priv_register_ctl_o(), 0); + gpc_priv_offset = ctxsw_prog_local_priv_register_ctl_offset_v(data32); + + err = gr_gk20a_determine_ppc_configuration(g, context, + &num_ppcs, &ppc_mask, + &reg_list_ppc_count); + if (err) + return err; + + num_tpcs = mem_rd32(context + ctxsw_prog_local_image_num_tpcs_o(), 0); + + if ((i == gpc_num) && ((tpc_num + 1) > num_tpcs)) { + nvhost_err(dev_from_gk20a(g), + "GPC %d TPC %d not in this context buffer.\n", + gpc_num, tpc_num); + return -EINVAL; + } + + /* Find the offset in the GPCCS segment. */ + if (i == gpc_num) { + offset_to_segment = gpc_priv_offset * + ctxsw_prog_ucode_header_size_in_bytes(); + + if (addr_type == CTXSW_ADDR_TYPE_TPC) { + /*reg = gr->ctx_vars.ctxsw_regs.tpc.l;*/ + } else if (addr_type == CTXSW_ADDR_TYPE_PPC) { + /* The ucode stores TPC data before PPC data. + * Advance offset past TPC data to PPC data. */ + offset_to_segment += + ((gr->ctx_vars.ctxsw_regs.tpc.count * + num_tpcs) << 2); + } else if (addr_type == CTXSW_ADDR_TYPE_GPC) { + /* The ucode stores TPC/PPC data before GPC data. + * Advance offset past TPC/PPC data to GPC data. */ + /* note 1 PES_PER_GPC case */ + u32 litter_num_pes_per_gpc = + proj_scal_litter_num_pes_per_gpc_v(); + if (litter_num_pes_per_gpc > 1) { + offset_to_segment += + (((gr->ctx_vars.ctxsw_regs.tpc.count * + num_tpcs) << 2) + + ((reg_list_ppc_count * num_ppcs) << 2)); + } else { + offset_to_segment += + ((gr->ctx_vars.ctxsw_regs.tpc.count * + num_tpcs) << 2); + } + } else { + nvhost_err(dev_from_gk20a(g), + "Unknown address type.\n"); + return -EINVAL; + } + err = gr_gk20a_process_context_buffer_priv_segment(g, + addr_type, addr, + i, num_tpcs, + num_ppcs, ppc_mask, + &offset); + if (err) + return -EINVAL; + + *priv_offset = offset_to_segment + offset; + return 0; + } + } + + return -EINVAL; +} + + +int gr_gk20a_exec_ctx_ops(struct channel_gk20a *ch, + struct nvhost_dbg_gpu_reg_op *ctx_ops, u32 num_ops, + u32 num_ctx_wr_ops, u32 num_ctx_rd_ops) +{ + struct gk20a *g = ch->g; + struct channel_ctx_gk20a *ch_ctx = &ch->ch_ctx; + void *ctx_ptr = NULL; + int curr_gr_chid, curr_gr_ctx; + bool ch_is_curr_ctx, restart_gr_ctxsw = false; + bool restart_fifo_ctxsw = false; + u32 i, j, offset, v; + u32 max_offsets = proj_scal_max_gpcs_v() * + proj_scal_max_tpc_per_gpc_v(); + u32 *offsets = NULL; + u32 *offset_addrs = NULL; + u32 ctx_op_nr, num_ctx_ops[2] = {num_ctx_wr_ops, num_ctx_rd_ops}; + int err, pass; + + nvhost_dbg(dbg_fn | dbg_gpu_dbg, "wr_ops=%d rd_ops=%d", + num_ctx_wr_ops, num_ctx_rd_ops); + + /* TBD: set timeout */ + /* pin_context will disable channel switching. + * at that point the hardware state can be inspected to + * determine if the context we're interested in is current.
+ */ +#if 0 + err = fifo_gk20a_disable_fifo_ctxsw(g, c); + if (err) { + dev_warn(dev_from_gk20a(g), "failed to disable fifo ctxsw\n"); + goto clean_up; + } + restart_fifo_ctxsw = true; +#endif + + { + u32 reg = gk20a_readl(g, 0x0041a084); + nvhost_dbg(dbg_gpu_dbg, "flcn_cfg_rm=0x%x", + reg); + } + + err = gr_gk20a_disable_ctxsw(g); + if (err) { + nvhost_err(dev_from_gk20a(g), "unable to stop gr ctxsw"); + /* this should probably be ctx-fatal... */ + goto cleanup; + } + + restart_gr_ctxsw = true; + + curr_gr_ctx = gk20a_readl(g, gr_fecs_current_ctx_r()); + curr_gr_chid = gk20a_gr_get_chid_from_ctx(g, curr_gr_ctx); + ch_is_curr_ctx = (curr_gr_chid != -1) && (ch->hw_chid == curr_gr_chid); + + nvhost_dbg(dbg_fn | dbg_gpu_dbg, "is curr ctx=%d", ch_is_curr_ctx); + if (ch_is_curr_ctx) { + for (pass = 0; pass < 2; pass++) { + ctx_op_nr = 0; + for (i = 0; (ctx_op_nr < num_ctx_ops[pass]) && (i < num_ops); ++i) { + /* only do ctx ops and only on the right pass */ + if ((ctx_ops[i].type == REGOP(TYPE_GLOBAL)) || + (((pass == 0) && reg_op_is_read(ctx_ops[i].op)) || + ((pass == 1) && !reg_op_is_read(ctx_ops[i].op)))) + continue; + + /* if this is a quad access, setup for special access*/ + if (ctx_ops[i].is_quad) + gr_gk20a_access_smpc_reg(g, ctx_ops[i].quad, + ctx_ops[i].offset); + offset = ctx_ops[i].offset; + + if (pass == 0) { /* write pass */ + v = gk20a_readl(g, offset); + v &= ~ctx_ops[i].and_n_mask_lo; + v |= ctx_ops[i].value_lo; + gk20a_writel(g, offset, v); + + nvhost_dbg(dbg_gpu_dbg, + "direct wr: offset=0x%x v=0x%x", + offset, v); + + if (ctx_ops[i].op == REGOP(WRITE_64)) { + v = gk20a_readl(g, offset + 4); + v &= ~ctx_ops[i].and_n_mask_hi; + v |= ctx_ops[i].value_hi; + gk20a_writel(g, offset + 4, v); + + nvhost_dbg(dbg_gpu_dbg, + "direct wr: offset=0x%x v=0x%x", + offset + 4, v); + } + + } else { /* read pass */ + ctx_ops[i].value_lo = + gk20a_readl(g, offset); + + nvhost_dbg(dbg_gpu_dbg, + "direct rd: offset=0x%x v=0x%x", + offset, ctx_ops[i].value_lo); + + if (ctx_ops[i].op == REGOP(READ_64)) { + ctx_ops[i].value_hi = + gk20a_readl(g, offset + 4); + + nvhost_dbg(dbg_gpu_dbg, + "direct rd: offset=0x%x v=0x%x", + offset + 4, ctx_ops[i].value_hi); + } else + ctx_ops[i].value_hi = 0; + } + ctx_op_nr++; + } + } + goto cleanup; + } + + /* they're the same size, so just use one alloc for both */ + offsets = kzalloc(2 * sizeof(u32) * max_offsets, GFP_KERNEL); + if (!offsets) { + err = -ENOMEM; + goto cleanup; + } + offset_addrs = offsets + max_offsets; + + /* would have been a variant of gr_gk20a_apply_instmem_overrides */ + /* recoded in-place instead.*/ + ctx_ptr = nvhost_memmgr_mmap(ch_ctx->gr_ctx.mem.ref); + if (!ctx_ptr) { + err = -ENOMEM; + ctx_ptr = NULL; + goto cleanup; + } + + /* Channel gr_ctx buffer is gpu cacheable; so flush and invalidate. + * There should be no on-going/in-flight references by the gpu now.
*/ + gk20a_mm_fb_flush(g); + gk20a_mm_l2_flush(g, true); + + /* write to appropriate place in context image, + * first have to figure out where that really is */ + + /* first pass is writes, second reads */ + for (pass = 0; pass < 2; pass++) { + ctx_op_nr = 0; + for (i = 0; (ctx_op_nr < num_ctx_ops[pass]) && (i < num_ops); ++i) { + u32 num_offsets; + + /* only do ctx ops and only on the right pass */ + if ((ctx_ops[i].type == REGOP(TYPE_GLOBAL)) || + (((pass == 0) && reg_op_is_read(ctx_ops[i].op)) || + ((pass == 1) && !reg_op_is_read(ctx_ops[i].op)))) + continue; + + gr_gk20a_get_ctx_buffer_offsets(g, + ctx_ops[i].offset, + max_offsets, + offsets, offset_addrs, + &num_offsets, + ctx_ops[i].is_quad, + ctx_ops[i].quad); + + /* if this is a quad access, setup for special access*/ + if (ctx_ops[i].is_quad) + gr_gk20a_access_smpc_reg(g, ctx_ops[i].quad, + ctx_ops[i].offset); + + for (j = 0; j < num_offsets; j++) { + /* sanity check, don't write outside, worst case */ + if (offsets[j] >= g->gr.ctx_vars.golden_image_size) + continue; + if (pass == 0) { /* write pass */ + v = mem_rd32(ctx_ptr + offsets[j], 0); + v &= ~ctx_ops[i].and_n_mask_lo; + v |= ctx_ops[i].value_lo; + mem_wr32(ctx_ptr + offsets[j], 0, v); + + nvhost_dbg(dbg_gpu_dbg, + "context wr: offset=0x%x v=0x%x", + offsets[j], v); + + if (ctx_ops[i].op == REGOP(WRITE_64)) { + v = mem_rd32(ctx_ptr + offsets[j] + 4, 0); + v &= ~ctx_ops[i].and_n_mask_hi; + v |= ctx_ops[i].value_hi; + mem_wr32(ctx_ptr + offsets[j] + 4, 0, v); + + nvhost_dbg(dbg_gpu_dbg, + "context wr: offset=0x%x v=0x%x", + offsets[j] + 4, v); + } + + /* check to see if we need to add a special WAR + for some of the SMPC perf regs */ + gr_gk20a_ctx_patch_smpc(g, ch_ctx, offset_addrs[j], + v, ctx_ptr); + + } else { /* read pass */ + ctx_ops[i].value_lo = + mem_rd32(ctx_ptr + offsets[0], 0); + + nvhost_dbg(dbg_gpu_dbg, "context rd: offset=0x%x v=0x%x", + offsets[0], ctx_ops[i].value_lo); + + if (ctx_ops[i].op == REGOP(READ_64)) { + ctx_ops[i].value_hi = + mem_rd32(ctx_ptr + offsets[0] + 4, 0); + + nvhost_dbg(dbg_gpu_dbg, + "context rd: offset=0x%x v=0x%x", + offsets[0] + 4, ctx_ops[i].value_hi); + } else + ctx_ops[i].value_hi = 0; + } + } + ctx_op_nr++; + } + } +#if 0 + /* flush cpu caches for the ctx buffer? only if cpu cached, of course. + * they aren't, yet */ + if (cached) { + FLUSH_CPU_DCACHE(ctx_ptr, + sg_phys(ch_ctx->gr_ctx.mem.ref), size); + } +#endif + + cleanup: + if (offsets) + kfree(offsets); + + if (ctx_ptr) + nvhost_memmgr_munmap(ch_ctx->gr_ctx.mem.ref, ctx_ptr); + + if (restart_gr_ctxsw) { + int tmp_err = gr_gk20a_enable_ctxsw(g); + if (tmp_err) { + nvhost_err(dev_from_gk20a(g), "unable to restart ctxsw!\n"); + err = tmp_err; + } + } + + if (restart_fifo_ctxsw) { +#if 0 + fifo_gk20a_enable_fifo_ctxsw(g); +#endif + } + + return err; +} diff --git a/drivers/video/tegra/host/gk20a/gr_gk20a.h b/drivers/video/tegra/host/gk20a/gr_gk20a.h index 7b7cdec2a1d1..b6979f99f1a8 100644 --- a/drivers/video/tegra/host/gk20a/gr_gk20a.h +++ b/drivers/video/tegra/host/gk20a/gr_gk20a.h @@ -1,7 +1,5 @@ /* - * drivers/video/tegra/host/gk20a/gr_gk20a.h - * - * GK20A graphics + * GK20A Graphics Engine * * Copyright (c) 2011-2013, NVIDIA CORPORATION. All rights reserved. * @@ -14,9 +12,8 @@ * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for * more details. 
* - * You should have received a copy of the GNU General Public License along with - * this program; if not, write to the Free Software Foundation, Inc., - * 51 Franklin St - Fifth Floor, Boston, MA 02110-1301 USA. + * You should have received a copy of the GNU General Public License + * along with this program. If not, see <http://www.gnu.org/licenses/>. */ #ifndef __GR_GK20A_H__ #define __GR_GK20A_H__ @@ -31,7 +28,7 @@ #define INVALID_SCREEN_TILE_ROW_OFFSET 0xFFFFFFFF #define INVALID_MAX_WAYS 0xFFFFFFFF -enum global_ctx_buffer { +enum /* global_ctx_buffer */ { CIRCULAR = 0, PAGEPOOL = 1, ATTRIBUTE = 2, @@ -43,7 +40,7 @@ enum global_ctx_buffer { }; /* either ATTRIBUTE or ATTRIBUTE_VPR maps to ATTRIBUTE_VA */ -enum global_ctx_buffer_va { +enum /* global_ctx_buffer_va */ { CIRCULAR_VA = 0, PAGEPOOL_VA = 1, ATTRIBUTE_VA = 2, @@ -316,5 +313,15 @@ void gr_gk20a_init_elcg_mode(struct gk20a *g, u32 mode, u32 engine); int gk20a_gr_suspend(struct gk20a *g); +struct nvhost_dbg_gpu_reg_op; +int gr_gk20a_exec_ctx_ops(struct channel_gk20a *ch, + struct nvhost_dbg_gpu_reg_op *ctx_ops, u32 num_ops, + u32 num_ctx_wr_ops, u32 num_ctx_rd_ops); +int gr_gk20a_get_ctx_buffer_offsets(struct gk20a *g, + u32 addr, + u32 max_offsets, + u32 *offsets, u32 *offset_addrs, + u32 *num_offsets, + bool is_quad, u32 quad); #endif /*__GR_GK20A_H__*/ diff --git a/drivers/video/tegra/host/gk20a/gr_pri_gk20a.h b/drivers/video/tegra/host/gk20a/gr_pri_gk20a.h new file mode 100644 index 000000000000..a82a1ee7caa8 --- /dev/null +++ b/drivers/video/tegra/host/gk20a/gr_pri_gk20a.h @@ -0,0 +1,179 @@ +/* + * GK20A Graphics Context Pri Register Addressing + * + * Copyright (c) 2013, NVIDIA CORPORATION. All rights reserved. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms and conditions of the GNU General Public License, + * version 2, as published by the Free Software Foundation. + * + * This program is distributed in the hope it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for + * more details. + * + * You should have received a copy of the GNU General Public License + * along with this program. If not, see <http://www.gnu.org/licenses/>. + */ +#ifndef _NVHOST_GR_PRI_GK20A_H_ +#define _NVHOST_GR_PRI_GK20A_H_ + +/* + * These convenience macros are generally for use in the management/modification + * of the context state store for gr/compute contexts.
+ */ + +/* + * GPC pri addressing + */ +static inline u32 pri_gpccs_addr_width(void) +{ + return 15; /*from where?*/ +} +static inline u32 pri_gpccs_addr_mask(u32 addr) +{ + return addr & ((1 << pri_gpccs_addr_width()) - 1); +} +static inline u32 pri_gpc_addr(u32 addr, u32 gpc) +{ + return proj_gpc_base_v() + (gpc * proj_gpc_stride_v()) + addr; +} +static inline bool pri_is_gpc_addr_shared(u32 addr) +{ + return (addr >= proj_gpc_shared_base_v()) && + (addr < proj_gpc_shared_base_v() + proj_gpc_stride_v()); +} +static inline bool pri_is_gpc_addr(u32 addr) +{ + return ((addr >= proj_gpc_base_v()) && + (addr < proj_gpc_base_v() + + proj_scal_litter_num_gpcs_v() * proj_gpc_stride_v())) || + pri_is_gpc_addr_shared(addr); +} +static inline u32 pri_get_gpc_num(u32 addr) +{ + u32 i, start; + u32 num_gpcs = proj_scal_litter_num_gpcs_v(); + + for (i = 0; i < num_gpcs; i++) { + start = proj_gpc_base_v() + (i * proj_gpc_stride_v()); + if ((addr >= start) && (addr < (start + proj_gpc_stride_v()))) + return i; + } + return 0; +} +/* + * TPC pri addressing + */ +static inline u32 pri_tpccs_addr_width(void) +{ + return 11; /* from where? */ +} +static inline u32 pri_tpccs_addr_mask(u32 addr) +{ + return addr & ((1 << pri_tpccs_addr_width()) - 1); +} +static inline u32 pri_tpc_addr(u32 addr, u32 gpc, u32 tpc) +{ + return proj_gpc_base_v() + (gpc * proj_gpc_stride_v()) + + proj_tpc_in_gpc_base_v() + (tpc * proj_tpc_in_gpc_stride_v()) + + addr; +} +static inline bool pri_is_tpc_addr_shared(u32 addr) +{ + return (addr >= proj_tpc_in_gpc_shared_base_v()) && + (addr < (proj_tpc_in_gpc_shared_base_v() + + proj_tpc_in_gpc_stride_v())); +} +static inline bool pri_is_tpc_addr(u32 addr) +{ + return ((addr >= proj_tpc_in_gpc_base_v()) && + (addr < proj_tpc_in_gpc_base_v() + (proj_scal_litter_num_tpc_per_gpc_v() * + proj_tpc_in_gpc_stride_v()))) + || + pri_is_tpc_addr_shared(addr); +} +static inline u32 pri_get_tpc_num(u32 addr) +{ + u32 i, start; + u32 num_tpcs = proj_scal_litter_num_tpc_per_gpc_v(); + + for (i = 0; i < num_tpcs; i++) { + start = proj_tpc_in_gpc_base_v() + (i * proj_tpc_in_gpc_stride_v()); + if ((addr >= start) && (addr < (start + proj_tpc_in_gpc_stride_v()))) + return i; + } + return 0; +} + +/* + * BE pri addressing + */ +static inline u32 pri_becs_addr_width(void) +{ + return 10;/* from where? */ +} +static inline u32 pri_becs_addr_mask(u32 addr) +{ + return addr & ((1 << pri_becs_addr_width()) - 1); +} +static inline bool pri_is_be_addr_shared(u32 addr) +{ + return (addr >= proj_rop_shared_base_v()) && + (addr < proj_rop_shared_base_v() + proj_rop_stride_v()); +} +static inline u32 pri_be_shared_addr(u32 addr) +{ + return proj_rop_shared_base_v() + pri_becs_addr_mask(addr); +} +static inline bool pri_is_be_addr(u32 addr) +{ + return ((addr >= proj_rop_base_v()) && + (addr < proj_rop_base_v()+proj_scal_litter_num_fbps_v() * proj_rop_stride_v())) || + pri_is_be_addr_shared(addr); +} + +static inline u32 pri_get_be_num(u32 addr) +{ + u32 i, start; + u32 num_fbps = proj_scal_litter_num_fbps_v(); + for (i = 0; i < num_fbps; i++) { + start = proj_rop_base_v() + (i * proj_rop_stride_v()); + if ((addr >= start) && (addr < (start + proj_rop_stride_v()))) + return i; + } + return 0; +} + +/* + * PPC pri addressing + */ +static inline u32 pri_ppccs_addr_width(void) +{ + return 9; /* from where? 
*/ +} +static inline u32 pri_ppccs_addr_mask(u32 addr) +{ + return addr & ((1 << pri_ppccs_addr_width()) - 1); +} +static inline u32 pri_ppc_addr(u32 addr, u32 gpc, u32 ppc) +{ + return proj_gpc_base_v() + (gpc * proj_gpc_stride_v()) + + proj_ppc_in_gpc_base_v() + (ppc * proj_ppc_in_gpc_stride_v()) + addr; +} + +enum ctxsw_addr_type { + CTXSW_ADDR_TYPE_SYS = 0, + CTXSW_ADDR_TYPE_GPC = 1, + CTXSW_ADDR_TYPE_TPC = 2, + CTXSW_ADDR_TYPE_BE = 3, + CTXSW_ADDR_TYPE_PPC = 4 +}; + +#define PRI_BROADCAST_FLAGS_NONE 0 +#define PRI_BROADCAST_FLAGS_GPC BIT(0) +#define PRI_BROADCAST_FLAGS_TPC BIT(1) +#define PRI_BROADCAST_FLAGS_BE BIT(2) +#define PRI_BROADCAST_FLAGS_PPC BIT(3) + +#endif /*_NVHOST_GR_PRI_GK20A_H_ */ diff --git a/drivers/video/tegra/host/gk20a/mm_gk20a.h b/drivers/video/tegra/host/gk20a/mm_gk20a.h index 03eb05ef946d..4db90c9b80bf 100644 --- a/drivers/video/tegra/host/gk20a/mm_gk20a.h +++ b/drivers/video/tegra/host/gk20a/mm_gk20a.h @@ -78,6 +78,7 @@ struct userd_desc { struct patch_desc { struct mem_desc mem; + void *cpu_va; u64 gpu_va; u32 data_count; }; diff --git a/drivers/video/tegra/host/gk20a/regops_gk20a.c b/drivers/video/tegra/host/gk20a/regops_gk20a.c new file mode 100644 index 000000000000..d35f6961ab0b --- /dev/null +++ b/drivers/video/tegra/host/gk20a/regops_gk20a.c @@ -0,0 +1,270 @@ +/* + * + * Tegra GK20A GPU Debugger Driver Register Ops + * + * Copyright (c) 2013, NVIDIA CORPORATION. All rights reserved. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms and conditions of the GNU General Public License, + * version 2, as published by the Free Software Foundation. + * + * This program is distributed in the hope it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for + * more details. + * + * You should have received a copy of the GNU General Public License + * along with this program. If not, see <http://www.gnu.org/licenses/>. + */ + +#include <linux/slab.h> +#include <linux/err.h> +#include <linux/nvhost_dbg_gpu_ioctl.h> + +#include "dev.h" +#include "nvhost_hwctx.h" +/*#include "nvhost_acm.h"*/ +#include "gk20a.h" +#include "gr_gk20a.h" +#include "dbg_gpu_gk20a.h" +#include "regops_gk20a.h" + +static bool validate_reg_ops(struct dbg_session_gk20a *dbg_s, + u32 *ctx_rd_count, u32 *ctx_wr_count, + struct nvhost_dbg_gpu_reg_op *ops, + u32 op_count); + + +int exec_regops_gk20a(struct dbg_session_gk20a *dbg_s, + struct nvhost_dbg_gpu_reg_op *ops, + u64 num_ops) +{ + int err = 0, i; + struct channel_gk20a *ch = dbg_s->ch; + struct gk20a *g = dbg_s->ch->g; + /*struct gr_gk20a *gr = &g->gr;*/ + u32 data32_lo = 0, data32_hi = 0; + u32 ctx_rd_count = 0, ctx_wr_count = 0; + bool skip_read_lo = false, skip_read_hi = false; + bool ok; + + nvhost_dbg(dbg_fn | dbg_gpu_dbg, ""); + + ok = validate_reg_ops(dbg_s, + &ctx_rd_count, &ctx_wr_count, + ops, num_ops); + if (!ok) { + dev_err(dbg_s->dev, "invalid op(s)"); + err = -EINVAL; + /* each op has its own err/status */ + goto clean_up; + } + + for (i = 0; i < num_ops; i++) { + /* if it isn't global then it is done in the ctx ops... 
*/ + if (ops[i].type != REGOP(TYPE_GLOBAL)) + continue; + + switch (ops[i].op) { + + case REGOP(READ_32): + ops[i].value_hi = 0; + ops[i].value_lo = gk20a_readl(g, ops[i].offset); + nvhost_dbg(dbg_gpu_dbg, "read_32 0x%08x from 0x%08x", + ops[i].value_lo, ops[i].offset); + + break; + + case REGOP(READ_64): + ops[i].value_lo = gk20a_readl(g, ops[i].offset); + ops[i].value_hi = + gk20a_readl(g, ops[i].offset + 4); + + nvhost_dbg(dbg_gpu_dbg, "read_64 0x%08x:%08x from 0x%08x", + ops[i].value_hi, ops[i].value_lo, + ops[i].offset); + break; + + case REGOP(WRITE_32): + case REGOP(WRITE_64): + /* some of this appears wonky/unnecessary but + we've kept it for compat with existing + debugger code. just in case... */ + if (ops[i].and_n_mask_lo == ~(u32)0) { + data32_lo = ops[i].value_lo; + skip_read_lo = true; + } + + if ((ops[i].op == REGOP(WRITE_64)) && + (ops[i].and_n_mask_hi == ~(u32)0)) { + data32_hi = ops[i].value_hi; + skip_read_hi = true; + } + + /* read first 32bits */ + if (unlikely(skip_read_lo == false)) { + data32_lo = gk20a_readl(g, ops[i].offset); + data32_lo &= ~ops[i].and_n_mask_lo; + data32_lo |= ops[i].value_lo; + } + + /* if desired, read second 32bits */ + if ((ops[i].op == REGOP(WRITE_64)) && + !skip_read_hi) { + data32_hi = gk20a_readl(g, ops[i].offset + 4); + data32_hi &= ~ops[i].and_n_mask_hi; + data32_hi |= ops[i].value_hi; + } + + /* now update first 32bits */ + gk20a_writel(g, ops[i].offset, data32_lo); + nvhost_dbg(dbg_gpu_dbg, "Wrote 0x%08x to 0x%08x ", + data32_lo, ops[i].offset); + /* if desired, update second 32bits */ + if (ops[i].op == REGOP(WRITE_64)) { + gk20a_writel(g, ops[i].offset + 4, data32_hi); + nvhost_dbg(dbg_gpu_dbg, "Wrote 0x%08x to 0x%08x ", + data32_hi, ops[i].offset + 4); + + } + + + break; + + /* shouldn't happen as we've already screened */ + default: + BUG(); + err = -EINVAL; + goto clean_up; + break; + } + } + + if (ctx_wr_count | ctx_rd_count) { + err = gr_gk20a_exec_ctx_ops(ch, ops, num_ops, + ctx_wr_count, ctx_rd_count); + if (err) { + dev_warn(dbg_s->dev, + "failed to perform ctx ops\n"); + goto clean_up; + } + } + + clean_up: + nvhost_dbg(dbg_gpu_dbg, "ret=%d", err); + return err; + +} + + +static int validate_reg_op_info(struct dbg_session_gk20a *dbg_s, + struct nvhost_dbg_gpu_reg_op *op) +{ + int err = 0; + + op->status = REGOP(STATUS_SUCCESS); + + switch (op->op) { + case REGOP(READ_32): + case REGOP(READ_64): + case REGOP(WRITE_32): + case REGOP(WRITE_64): + break; + default: + op->status |= REGOP(STATUS_UNSUPPORTED_OP); + /*nvhost_err(dbg_s->dev, "Invalid regops op %d!", op->op);*/ + err = -EINVAL; + break; + } + + switch (op->type) { + case REGOP(TYPE_GLOBAL): + case REGOP(TYPE_GR_CTX): + case REGOP(TYPE_GR_CTX_TPC): + case REGOP(TYPE_GR_CTX_SM): + case REGOP(TYPE_GR_CTX_CROP): + case REGOP(TYPE_GR_CTX_ZROP): + case REGOP(TYPE_GR_CTX_QUAD): + break; + /* + case NVHOST_DBG_GPU_REG_OP_TYPE_FB: + */ + default: + op->status |= REGOP(STATUS_INVALID_TYPE); + /*nvhost_err(dbg_s->dev, "Invalid regops type %d!", op->type);*/ + err = -EINVAL; + break; + } + + return err; +} + +static int validate_reg_op_offset(struct dbg_session_gk20a *dbg_s, + struct nvhost_dbg_gpu_reg_op *op) +{ + int err = 0, temp_err; + u32 buf_offset_lo = 0, buf_offset_addr = 0, num_offsets = 0; + bool is_ctx_op = reg_op_is_gr_ctx(op->type); + + op->status = 0; + /* TBD: get this size from the register resource directly */ + if (!is_ctx_op && op->offset >= SZ_16M) { + op->status = REGOP(STATUS_INVALID_OFFSET); + err = -EINVAL; + } else if (is_ctx_op) { + if (!dbg_s->ch) {
nvhost_err(dbg_s->dev, "can't perform ctx regop unless bound"); + temp_err = -EINVAL; + } else + temp_err = gr_gk20a_get_ctx_buffer_offsets(dbg_s->ch->g, + op->offset, + 1, + &buf_offset_lo, + &buf_offset_addr, + &num_offsets, + op->type == REGOP(TYPE_GR_CTX_QUAD), + op->quad); + if (temp_err) { + op->status |= REGOP(STATUS_INVALID_OFFSET); + err = -EINVAL; + } + if (!buf_offset_lo) { + op->status |= REGOP(STATUS_INVALID_OFFSET); + err = -EINVAL; + } + } + + return err; +} + +static bool validate_reg_ops(struct dbg_session_gk20a *dbg_s, + u32 *ctx_rd_count, u32 *ctx_wr_count, + struct nvhost_dbg_gpu_reg_op *ops, + u32 op_count) +{ + u32 i; + int err; + bool ok = true; + + /* keep going until the end so every op can get + * a separate error code if needed */ + for (i = 0; i < op_count; i++) { + + err = validate_reg_op_info(dbg_s, &ops[i]); + ok &= !err; + + if (reg_op_is_gr_ctx(ops[i].type)) { + if (reg_op_is_read(ops[i].op)) + (*ctx_rd_count)++; + else + (*ctx_wr_count)++; + } + + err = validate_reg_op_offset(dbg_s, &ops[i]); + ok &= !err; + } + + nvhost_dbg_fn("ctx_wrs:%d ctx_rds:%d\n", *ctx_wr_count, *ctx_rd_count); + + return ok; +} diff --git a/drivers/video/tegra/host/gk20a/regops_gk20a.h b/drivers/video/tegra/host/gk20a/regops_gk20a.h new file mode 100644 index 000000000000..231882946a08 --- /dev/null +++ b/drivers/video/tegra/host/gk20a/regops_gk20a.h @@ -0,0 +1,46 @@ +/* + * + * Tegra GK20A GPU Debugger Driver Register Ops + * + * Copyright (c) 2013, NVIDIA CORPORATION. All rights reserved. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms and conditions of the GNU General Public License, + * version 2, as published by the Free Software Foundation. + * + * This program is distributed in the hope it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for + * more details. + * + * You should have received a copy of the GNU General Public License + * along with this program. If not, see <http://www.gnu.org/licenses/>. 
+ */ +#ifndef __REGOPS_GK20A_H_ +#define __REGOPS_GK20A_H_ + +int exec_regops_gk20a(struct dbg_session_gk20a *dbg_s, + struct nvhost_dbg_gpu_reg_op *ops, + u64 num_ops); + +/* turn seriously unwieldy names -> something shorter */ +#define REGOP(x) NVHOST_DBG_GPU_REG_OP_##x + + +static inline bool reg_op_is_gr_ctx(u8 type) +{ + return type == REGOP(TYPE_GR_CTX) || + type == REGOP(TYPE_GR_CTX_TPC) || + type == REGOP(TYPE_GR_CTX_SM) || + type == REGOP(TYPE_GR_CTX_CROP) || + type == REGOP(TYPE_GR_CTX_ZROP) || + type == REGOP(TYPE_GR_CTX_QUAD); +} +static inline bool reg_op_is_read(u8 op) +{ + return op == REGOP(READ_32) || + op == REGOP(READ_64) ; +} + + +#endif /* __REGOPS_GK20A_H_ */ diff --git a/drivers/video/tegra/host/nvhost_channel.h b/drivers/video/tegra/host/nvhost_channel.h index c919b89a2e5e..a2552cd1ef27 100644 --- a/drivers/video/tegra/host/nvhost_channel.h +++ b/drivers/video/tegra/host/nvhost_channel.h @@ -129,4 +129,6 @@ void nvhost_free_channel_internal(struct nvhost_channel *ch, int nvhost_channel_save_context(struct nvhost_channel *ch); +struct nvhost_hwctx *nvhost_channel_get_file_hwctx(int fd); + #endif diff --git a/drivers/video/tegra/host/nvhost_hwctx.h b/drivers/video/tegra/host/nvhost_hwctx.h index 0672571aa32a..43c39387c98f 100644 --- a/drivers/video/tegra/host/nvhost_hwctx.h +++ b/drivers/video/tegra/host/nvhost_hwctx.h @@ -1,9 +1,7 @@ /* - * drivers/video/tegra/host/nvhost_hwctx.h - * * Tegra Graphics Host Hardware Context Interface * - * Copyright (c) 2010-2013, NVIDIA Corporation. + * Copyright (c) 2010-2013, NVIDIA Corporation. All rights reserved. * * This program is free software; you can redistribute it and/or modify it * under the terms and conditions of the GNU General Public License, @@ -29,6 +27,7 @@ struct nvhost_channel; struct nvhost_cdma; struct mem_mgr; +struct nvhost_dbg_session; struct nvhost_hwctx { struct kref ref; @@ -47,6 +46,7 @@ struct nvhost_hwctx { struct list_head as_share_bound_list_node; struct nvhost_as_share *as_share; + struct nvhost_dbg_session *dbg_session; }; struct nvhost_hwctx_handler { @@ -82,4 +82,5 @@ enum { #define HWCTX_REGINFO(offset, count, type) {offset, count, HWCTX_REGINFO_##type, offset} #define HWCTX_REGINFO_RST(offset, count, type, rst) {offset, count, HWCTX_REGINFO_##type, rst} + #endif diff --git a/drivers/video/tegra/host/t124/t124.c b/drivers/video/tegra/host/t124/t124.c index 75a248591c19..6bd236113e6b 100644 --- a/drivers/video/tegra/host/t124/t124.c +++ b/drivers/video/tegra/host/t124/t124.c @@ -443,6 +443,7 @@ struct nvhost_device_data tegra_gk20a_info = { .can_powergate = true, .alloc_hwctx_handler = nvhost_gk20a_alloc_hwctx_handler, .ctrl_ops = &tegra_gk20a_ctrl_ops, + .dbg_ops = &tegra_gk20a_dbg_gpu_ops, .moduleid = NVHOST_MODULE_GPU, .init = nvhost_gk20a_init, .deinit = nvhost_gk20a_deinit, diff --git a/include/linux/nvhost.h b/include/linux/nvhost.h index 60136ba8ae45..896027702314 100644 --- a/include/linux/nvhost.h +++ b/include/linux/nvhost.h @@ -190,7 +190,10 @@ struct nvhost_device_data { struct cdev ctrl_cdev; const struct file_operations *ctrl_ops; /* ctrl ops for the module */ - /* void *priv;*/ + /* module debugger */ + struct device *dbg_node; + struct cdev dbg_cdev; + const struct file_operations *dbg_ops; struct kobject *power_kobj; /* kobject to hold power sysfs entries */ struct nvhost_device_power_attr *power_attrib; /* sysfs attributes */ diff --git a/include/linux/nvhost_dbg_gpu_ioctl.h b/include/linux/nvhost_dbg_gpu_ioctl.h new file mode 100644 index 000000000000..2866a6d9df2d --- 
/dev/null +++ b/include/linux/nvhost_dbg_gpu_ioctl.h @@ -0,0 +1,122 @@ +/* + * Copyright (c) 2013, NVIDIA CORPORATION. All rights reserved. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms and conditions of the GNU General Public License, + * version 2, as published by the Free Software Foundation. + * + * This program is distributed in the hope it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for + * more details. + * + * You should have received a copy of the GNU General Public License + * along with this program. If not, see <http://www.gnu.org/licenses/>. + */ +#ifndef __LINUX_NVHOST_DBG_GPU_IOCTL_H +#define __LINUX_NVHOST_DBG_GPU_IOCTL_H + +#include <linux/ioctl.h> +#include <linux/types.h> + +#if !defined(__KERNEL__) +#define __user +#endif + +#define NVHOST_DBG_GPU_IOCTL_MAGIC 'D' + +/* + * /dev/nvhost-dbg-* devices + * + * Opening a '/dev/nvhost-dbg-<module_name>' device node creates a new debugger + * session. nvhost channels (for the same module) can then be bound to such a + * session. + * + * Once an nvhost channel has been bound to a debugger session it cannot be + * bound to another. + * + * The session stays valid as long as its device file is open or any nvhost + * channels remain bound to it. Once all references to the session + * are removed, the session is deleted. + * + */ + +/* + * Binding/attaching a debugger session to an nvhost gpu channel + * + * The 'channel_fd' given here is the fd used to allocate the + * gpu channel context. To detach/unbind the debugger session + * use a channel_fd of -1. + * + */ +struct nvhost_dbg_gpu_bind_channel_args { + __u32 channel_fd; /* in */ + __u32 _pad0[1]; +}; + +#define NVHOST_DBG_GPU_IOCTL_BIND_CHANNEL \ + _IOWR(NVHOST_DBG_GPU_IOCTL_MAGIC, 1, struct nvhost_dbg_gpu_bind_channel_args) + +/* + * Register operations + */ +/* valid op values */ +#define NVHOST_DBG_GPU_REG_OP_READ_32 (0x00000000) +#define NVHOST_DBG_GPU_REG_OP_WRITE_32 (0x00000001) +#define NVHOST_DBG_GPU_REG_OP_READ_64 (0x00000002) +#define NVHOST_DBG_GPU_REG_OP_WRITE_64 (0x00000003) +/* note: 8b ops are unsupported */ +#define NVHOST_DBG_GPU_REG_OP_READ_08 (0x00000004) +#define NVHOST_DBG_GPU_REG_OP_WRITE_08 (0x00000005) + +/* valid type values */ +#define NVHOST_DBG_GPU_REG_OP_TYPE_GLOBAL (0x00000000) +#define NVHOST_DBG_GPU_REG_OP_TYPE_GR_CTX (0x00000001) +#define NVHOST_DBG_GPU_REG_OP_TYPE_GR_CTX_TPC (0x00000002) +#define NVHOST_DBG_GPU_REG_OP_TYPE_GR_CTX_SM (0x00000004) +#define NVHOST_DBG_GPU_REG_OP_TYPE_GR_CTX_CROP (0x00000008) +#define NVHOST_DBG_GPU_REG_OP_TYPE_GR_CTX_ZROP (0x00000010) +/*#define NVHOST_DBG_GPU_REG_OP_TYPE_FB (0x00000020)*/ +#define NVHOST_DBG_GPU_REG_OP_TYPE_GR_CTX_QUAD (0x00000040) + +/* valid status values */ +#define NVHOST_DBG_GPU_REG_OP_STATUS_SUCCESS (0x00000000) +#define NVHOST_DBG_GPU_REG_OP_STATUS_INVALID_OP (0x00000001) +#define NVHOST_DBG_GPU_REG_OP_STATUS_INVALID_TYPE (0x00000002) +#define NVHOST_DBG_GPU_REG_OP_STATUS_INVALID_OFFSET (0x00000004) +#define NVHOST_DBG_GPU_REG_OP_STATUS_UNSUPPORTED_OP (0x00000008) +#define NVHOST_DBG_GPU_REG_OP_STATUS_INVALID_MASK (0x00000010) + +struct nvhost_dbg_gpu_reg_op { + __u8 op; + __u8 type; + __u8 status; + __u8 quad; + __u8 is_quad; + __u8 _pad0[3]; + __u32 group_mask; + __u32 sub_group_mask; + __u32 offset; + __u32 value_hi; + __u32 value_lo; + __u32 and_n_mask_hi; + __u32 and_n_mask_lo; + __u32 _pad1[1]; +}; +
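The and_n_mask fields deserve a concrete example. For WRITE_32/WRITE_64 the driver performs a read-modify-write, new = (old & ~and_n_mask) | value, and a mask of all ones turns the write into a plain overwrite (the read is skipped). A minimal client-side sketch of filling one op for a masked 32-bit gr ctx write, assuming the header above is installed as <linux/nvhost_dbg_gpu_ioctl.h>; the helper name is hypothetical:

#include <string.h>
#include <linux/nvhost_dbg_gpu_ioctl.h>

/* Set only the bits of 'mask' in a 32-bit gr ctx register to 'value';
 * the kernel's read-modify-write preserves all other bits. */
static void fill_masked_ctx_write(struct nvhost_dbg_gpu_reg_op *op,
				  __u32 offset, __u32 value, __u32 mask)
{
	memset(op, 0, sizeof(*op));
	op->op = NVHOST_DBG_GPU_REG_OP_WRITE_32;
	op->type = NVHOST_DBG_GPU_REG_OP_TYPE_GR_CTX;
	op->offset = offset;
	op->value_lo = value & mask;	/* bits outside the mask must be clear */
	op->and_n_mask_lo = mask;	/* ~0 here would mean: overwrite, skip the read */
}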
+struct nvhost_dbg_gpu_exec_reg_ops_args { + __u64 ops; /* pointer to nvhost_dbg_gpu_reg_op operations */ + __u32 num_ops; + __u32 _pad0[1]; +}; + +#define NVHOST_DBG_GPU_IOCTL_REG_OPS \ + _IOWR(NVHOST_DBG_GPU_IOCTL_MAGIC, 2, struct nvhost_dbg_gpu_exec_reg_ops_args) + + +#define NVHOST_DBG_GPU_IOCTL_LAST \ + _IOC_NR(NVHOST_DBG_GPU_IOCTL_REG_OPS) +#define NVHOST_DBG_GPU_IOCTL_MAX_ARG_SIZE \ + sizeof(struct nvhost_dbg_gpu_exec_reg_ops_args) + +#endif
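Putting the pieces together, a debugger client opens the new dbg node, binds it to an already-allocated gpu channel with BIND_CHANNEL, and then batches reads/writes through REG_OPS. A rough userspace sketch, assuming the nodes are named /dev/nvhost-gpu and /dev/nvhost-dbg-gpu (the actual names follow the per-module <dev>/dbg-<dev> chardevs created at client init) and using a placeholder register offset:

#include <stdint.h>
#include <stdio.h>
#include <string.h>
#include <fcntl.h>
#include <unistd.h>
#include <sys/ioctl.h>
#include <linux/nvhost_dbg_gpu_ioctl.h>

int main(void)
{
	struct nvhost_dbg_gpu_bind_channel_args bind_args;
	struct nvhost_dbg_gpu_exec_reg_ops_args exec_args;
	struct nvhost_dbg_gpu_reg_op ops[1];
	int channel_fd, dbg_fd;

	/* Device node names are assumptions for illustration. */
	channel_fd = open("/dev/nvhost-gpu", O_RDWR);
	dbg_fd = open("/dev/nvhost-dbg-gpu", O_RDWR);
	if (channel_fd < 0 || dbg_fd < 0)
		return 1;

	/* Bind the debug session to the channel; a channel_fd of -1
	 * would detach instead. */
	memset(&bind_args, 0, sizeof(bind_args));
	bind_args.channel_fd = channel_fd;
	if (ioctl(dbg_fd, NVHOST_DBG_GPU_IOCTL_BIND_CHANNEL, &bind_args) < 0)
		return 1;

	/* One global 32-bit read; the offset is a placeholder. */
	memset(ops, 0, sizeof(ops));
	ops[0].op = NVHOST_DBG_GPU_REG_OP_READ_32;
	ops[0].type = NVHOST_DBG_GPU_REG_OP_TYPE_GLOBAL;
	ops[0].offset = 0x0;

	memset(&exec_args, 0, sizeof(exec_args));
	exec_args.ops = (__u64)(uintptr_t)ops;
	exec_args.num_ops = 1;
	if (ioctl(dbg_fd, NVHOST_DBG_GPU_IOCTL_REG_OPS, &exec_args) < 0)
		return 1;

	/* Each op carries its own status alongside the returned value. */
	printf("status=0x%x value=0x%08x\n", ops[0].status, ops[0].value_lo);

	close(dbg_fd);
	close(channel_fd);
	return 0;
}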