/*
 * Copyright (c) 2012 Rob Clark <robdclark@gmail.com>
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 */

#include <assert.h>
#include <ctype.h>
#include <err.h>
#include <errno.h>
#include <fcntl.h>
#include <inttypes.h>
#include <signal.h>
#include <stdarg.h>
#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <unistd.h>
#include <sys/stat.h>
#include <sys/types.h>
#include <sys/wait.h>

#include "freedreno_pm4.h"

#include "buffers.h"
#include "cffdec.h"
#include "disasm.h"
#include "redump.h"
#include "rnnutil.h"
#include "script.h"

/* ************************************************************************* */
/* originally based on kernel recovery dump code: */

/* Decode options for the current run; assigned in cffdec_init(). */
static const struct cffdec_options *options;

/* When set, dump_registers() reports writes to non-banked registers. */
static bool needs_wfi = false;
/* Copied from options->summary in cffdec_init(); consulted by quiet(). */
static bool summary = false;
/* Gates the texture state dumps in reg_dump_tex_{samp,const}_hi(). */
static bool in_summary = false;
/* NOTE(review): not referenced in the visible part of this file. */
static int vertices;

/* Exclusive upper bound on register offsets accepted by reg_set(); the
 * larger limit applies when gpu_id is 500 or above.
 */
static inline unsigned
regcnt(void)
{
   return (options->gpu_id >= 500) ? 0xffff : 0x7fff;
}

/* Returns nonzero when gpu addresses are printed/handled as 64 bits wide,
 * i.e. when gpu_id is 500 or above.
 */
static int
is_64b(void)
{
   const bool wide = (options->gpu_id >= 500);

   return wide;
}

/* NOTE(review): not referenced in the visible part of this file. */
static int draws[4];
/* Per-IB-level state for the buffer being decoded, indexed by 'ib'
 * (see highlight_gpuaddr()).
 */
static struct {
   uint64_t base;
   uint32_t size; /* in dwords */
   /* Generally cmdstream consists of multiple IB calls to different
    * buffers, which are themselves often re-used for each tile.  The
    * triggered flag serves two purposes to help make it more clear
    * what part of the cmdstream is before vs after the GPU hang:
    *
    * 1) if in IB2 we are past the point within the IB2 buffer where
    *    the GPU hung, but IB1 is not past the point within its
    *    buffer where the GPU had hung, then we know the GPU hang
    *    happens on a future use of that IB2 buffer.
    *
    * 2) if in an IB1 or IB2 buffer that is not the one where the GPU
    *    hung, but we've already passed the trigger point at the same
    *    IB level, we know that we are past the point where the GPU
    *    had hung.
    *
    * So this is a one way switch, false->true.  And a higher #'d
    * IB level isn't considered triggered unless the lower #'d IB
    * level is.
    */
   bool triggered;
} ibs[4];
/* Index into ibs[] (and options->ibs[]) for the current IB level. */
static int ib;

/* draw_count is reset in cffdec_init() and printed by __do_query();
 * current_draw_count is compared against options->draw_filter in quiet().
 */
static int draw_count;
static int current_draw_count;

/* Query mode: to handle symbolic register name queries, we need to
 * defer parsing the query strings until after gpu_id is known and the
 * rnn db is loaded.  The resolved register offsets are stored here, one
 * per options->querystrs[] entry (filled in by init_rnn()).
 */
static int *queryvals;

/* Decide whether output at verbosity level 'lvl' should be suppressed.
 *
 * Output is suppressed when a draw filter is set and the current draw
 * does not match it, when query/script mode is active and lvl >= 2, or
 * when summary mode is active and lvl >= 3.
 */
static bool
quiet(int lvl)
{
   const bool filter_active = (options->draw_filter != -1);

   if (filter_active && (options->draw_filter != current_draw_count))
      return true;

   const bool query_or_script = options->querystrs || options->script;

   if (query_or_script && (lvl >= 2))
      return true;

   if (summary && (lvl >= 3))
      return true;

   return false;
}

/* printf() wrapper that prints only when quiet(lvl) does not suppress
 * output at the given level.
 */
void
printl(int lvl, const char *fmt, ...)
{
   if (!quiet(lvl)) {
      va_list ap;

      va_start(ap, fmt);
      vprintf(fmt, ap);
      va_end(ap);
   }
}

/* Indentation prefixes indexed by nesting level: entry N holds N + 1 tabs
 * for the first nine levels; the remaining entries are the literal "x".
 */
static const char *levels[] = {
   [0] = "\t",
   [1] = "\t\t",
   [2] = "\t\t\t",
   [3] = "\t\t\t\t",
   [4] = "\t\t\t\t\t",
   [5] = "\t\t\t\t\t\t",
   [6] = "\t\t\t\t\t\t\t",
   [7] = "\t\t\t\t\t\t\t\t",
   [8] = "\t\t\t\t\t\t\t\t\t",
   [9] = "x",
   [10] = "x",
   [11] = "x",
   [12] = "x",
   [13] = "x",
   [14] = "x",
};

/* Where a state block comes from; passed to dump_tex_samp(). */
enum state_src_t {
   STATE_SRC_DIRECT = 0,
   STATE_SRC_INDIRECT = 1,
   STATE_SRC_BINDLESS = 2,
};

/* SDS (CP_SET_DRAW_STATE) helpers, defined later in the file; used by
 * do_query_compare():
 */
static void load_all_groups(int level);
static void disable_all_groups(void);

/* Texture state dumpers, defined later in the file; used by the
 * reg_dump_tex_{samp,const}_hi() register callbacks:
 */
static void dump_tex_samp(uint32_t *texsamp, enum state_src_t src, int num_unit,
                          int level);
static void dump_tex_const(uint32_t *texsamp, int num_unit, int level);

/* Decide whether the line at 'gpuaddr' should be highlighted as being at or
 * after the estimated crash location for the current IB level.
 *
 * Side effects: latches ibs[ib].triggered and prints a marker line the
 * first time the address falls inside the estimated range.
 */
static bool
highlight_gpuaddr(uint64_t gpuaddr)
{
   /* nothing recorded in the options for this IB level: */
   if (options->ibs[ib].base == 0)
      return false;

   /* a higher IB level is not considered until the level below triggered: */
   const bool has_parent = (ib > 0) && options->ibs[ib - 1].base;
   if (has_parent && !ibs[ib - 1].triggered)
      return false;

   /* already past the trigger point at this level: */
   if (ibs[ib].triggered)
      return options->color;

   /* only the buffer named in the options can trigger: */
   const uint64_t buf_base = ibs[ib].base;
   if (options->ibs[ib].base != buf_base)
      return false;

   const uint32_t buf_size = ibs[ib].size;
   const uint64_t start = buf_base + 4 * (buf_size - options->ibs[ib].rem);
   const uint64_t end = buf_base + 4 * buf_size;

   const bool hit = (gpuaddr >= start) && (gpuaddr <= end);

   if (hit) {
      ibs[ib].triggered = true;
      printf("ESTIMATED CRASH LOCATION!\n");
   }

   return hit & options->color;
}

/* Hexdump 'sizedwords' dwords starting at 'dwords', eight per row.
 *
 * Each printed row is prefixed with its gpu address, the indentation for
 * 'level' and the byte offset.  All-zero rows other than the first are
 * skipped, and the start of a skipped run is marked with a single "*" line.
 * Nothing is printed when quiet(2) holds.
 */
static void
dump_hex(uint32_t *dwords, uint32_t sizedwords, int level)
{
   bool prev_row_zero = true;

   if (quiet(2))
      return;

   for (uint32_t row = 0; row < sizedwords; row += 8) {
      /* always show first row: */
      bool row_zero = (row != 0);

      for (uint32_t col = 0;
           row_zero && (col < 8) && (row + col < sizedwords); col++)
         row_zero = (dwords[row + col] == 0);

      if (row_zero && !prev_row_zero)
         printf("*\n");

      prev_row_zero = row_zero;

      if (row_zero)
         continue;

      const uint64_t addr = gpuaddr(&dwords[row]);
      const bool highlight = highlight_gpuaddr(addr);

      if (highlight)
         printf("\x1b[0;1;31m");

      if (is_64b())
         printf("%016" PRIx64 ":%s", addr, levels[level]);
      else
         printf("%08x:%s", (uint32_t)addr, levels[level]);

      if (highlight)
         printf("\x1b[0m");

      printf("%04x:", row * 4);

      for (uint32_t col = 0; (col < 8) && (row + col < sizedwords); col++)
         printf(" %08x", dwords[row + col]);

      printf("\n");
   }
}

/* Print 'sizedwords' floats, eight per row, each row prefixed with the gpu
 * address of its first element and the indentation for 'level'.
 */
static void
dump_float(float *dwords, uint32_t sizedwords, int level)
{
   for (uint32_t idx = 0; idx < sizedwords; idx++) {
      const unsigned col = idx % 8;

      if (col != 0) {
         printf(" ");
      } else if (is_64b()) {
         printf("%016" PRIx64 ":%s", gpuaddr(&dwords[idx]), levels[level]);
      } else {
         printf("%08x:%s", (uint32_t)gpuaddr(&dwords[idx]), levels[level]);
      }

      printf("%8f", dwords[idx]);

      if (col == 7)
         printf("\n");
   }

   /* terminate a partially filled last row: */
   if (sizedwords % 8)
      printf("\n");
}

/* Split a packed dword into an address part and a flags part.
 *
 * The bits selected by 'mask' are returned in *flags, the remaining bits in
 * *gpuaddr.  Background from the original author: the surface format is
 * believed to live in the low bits (RB_COLOR_INFO__COLOR_FORMAT_MASK is
 * 0x0000000f), and comments in sys2gmem_tex_const indicate that the address
 * is [31:12], but at least some of the bits above the format appear to have
 * a different meaning.
 */
static void
parse_dword_addr(uint32_t dword, uint32_t *gpuaddr, uint32_t *flags,
                 uint32_t mask)
{
   assert(!is_64b()); /* this is only used on a2xx */

   const uint32_t addr_bits = dword & ~mask;
   const uint32_t flag_bits = dword & mask;

   *gpuaddr = addr_bits;
   *flags = flag_bits;
}

/* Shadow copy of register values, indexed by register offset; updated by
 * reg_set() and read back by reg_val().
 */
static uint32_t type0_reg_vals[0xffff + 1];
/* Bitmasks with one bit per register offset (bit regbase % 8 of byte
 * regbase / 8); both are set by reg_set().
 */
static uint8_t type0_reg_rewritten[sizeof(type0_reg_vals) /
                                   8]; /* written since last draw */
static uint8_t type0_reg_written[sizeof(type0_reg_vals) / 8];
/* Previous register values, compared against the current ones in query
 * mode (see skip_query() and __do_query()).
 */
static uint32_t lastvals[ARRAY_SIZE(type0_reg_vals)];

/* True if the register's bit is set in the "rewritten" bitmask, i.e.
 * reg_set() was called for it since the last clear_rewritten().
 */
static bool
reg_rewritten(uint32_t regbase)
{
   const uint32_t byte = regbase / 8;
   const uint32_t bit = regbase % 8;

   return ((type0_reg_rewritten[byte] >> bit) & 1) != 0;
}

/* True if the register's bit is set in the "written" bitmask, i.e.
 * reg_set() was called for it since the last clear_written().
 */
bool
reg_written(uint32_t regbase)
{
   const uint32_t byte = regbase / 8;
   const uint32_t bit = regbase % 8;

   return ((type0_reg_written[byte] >> bit) & 1) != 0;
}

/* Reset the "rewritten" bitmask for every register. */
static void
clear_rewritten(void)
{
   memset(type0_reg_rewritten, 0x00, sizeof type0_reg_rewritten);
}

/* Reset both the "written" and the "rewritten" bitmasks. */
static void
clear_written(void)
{
   memset(type0_reg_written, 0x00, sizeof type0_reg_written);
   clear_rewritten();
}

/* Return the entry recorded for 'regbase' in the lastvals[] table.
 * No bounds check is performed on 'regbase'.
 */
uint32_t
reg_lastval(uint32_t regbase)
{
   const uint32_t previous = lastvals[regbase];

   return previous;
}

/* Zero the whole lastvals[] table. */
static void
clear_lastvals(void)
{
   memset(lastvals, 0x00, sizeof lastvals);
}

/* Return the shadowed value most recently stored for 'regbase' by
 * reg_set().  No bounds check is performed on 'regbase'.
 */
uint32_t
reg_val(uint32_t regbase)
{
   const uint32_t current = type0_reg_vals[regbase];

   return current;
}

/* Record 'val' as the current value of register 'regbase' and mark the
 * register in both the "written" and "rewritten" bitmasks.
 *
 * Asserts that 'regbase' is below regcnt().
 */
void
reg_set(uint32_t regbase, uint32_t val)
{
   assert(regbase < regcnt());

   const uint32_t byte = regbase / 8;
   const uint8_t mask = 1 << (regbase % 8);

   type0_reg_vals[regbase] = val;
   type0_reg_written[byte] |= mask;
   type0_reg_rewritten[byte] |= mask;
}

/* Register callback: print the shadowed values of scratch registers 4..7
 * as a comma separated list.  'name' and 'dword' are unused.
 *
 * Does nothing when quiet(3) holds or when neither naming scheme for the
 * first scratch register resolves to a non-zero offset.
 */
static void
reg_dump_scratch(const char *name, uint32_t dword, int level)
{
   if (quiet(3))
      return;

   unsigned scratch0 = regbase("CP_SCRATCH[0].REG");

   if (scratch0 == 0) {
      // if not, try old a2xx/a3xx version:
      scratch0 = regbase("CP_SCRATCH_REG0");
      if (scratch0 == 0)
         return;
   }

   printf("%s:%u,%u,%u,%u\n", levels[level], reg_val(scratch0 + 4),
          reg_val(scratch0 + 5), reg_val(scratch0 + 6),
          reg_val(scratch0 + 7));
}

/* Hexdump 'sizedwords' dwords at 'gpuaddr', one level deeper than 'level'.
 *
 * Nothing is printed when quiet(quietlvl) holds or when hostptr() has no
 * mapping for the address.
 */
static void
dump_gpuaddr_size(uint64_t gpuaddr, int level, int sizedwords, int quietlvl)
{
   if (quiet(quietlvl))
      return;

   void *host = hostptr(gpuaddr);
   if (!host)
      return;

   dump_hex(host, sizedwords, level + 1);
}

/* Hexdump the first 64 dwords at 'gpuaddr', at quiet level 3. */
static void
dump_gpuaddr(uint64_t gpuaddr, int level)
{
   enum {
      DUMP_DWORDS = 64,
      DUMP_QUIET_LVL = 3,
   };

   dump_gpuaddr_size(gpuaddr, level, DUMP_DWORDS, DUMP_QUIET_LVL);
}

/* Register callback: treat the 32-bit register value as a gpu address and
 * hexdump the memory it points at.  'name' is unused.
 */
static void
reg_dump_gpuaddr(const char *name, uint32_t dword, int level)
{
   const uint64_t addr = dword;

   dump_gpuaddr(addr, level);
}

/* Low 32 bits of a split 64-bit address, remembered until the matching
 * high-half register callback combines it with the high dword.
 */
uint32_t gpuaddr_lo;

/* Register callback: remember the low half of a 64-bit gpu address.
 * 'name' and 'level' are unused.
 */
static void
reg_gpuaddr_lo(const char *name, uint32_t dword, int level)
{
   const uint32_t low_half = dword;

   gpuaddr_lo = low_half;
}

/* Register callback for the high half of a split 64-bit address: combine
 * it with the remembered gpuaddr_lo and hexdump the target memory.
 */
static void
reg_dump_gpuaddr_hi(const char *name, uint32_t dword, int level)
{
   const uint64_t high_half = ((uint64_t)dword) << 32;

   dump_gpuaddr(high_half | gpuaddr_lo, level);
}

/* 64-bit register callback: hexdump the memory at the address held in
 * 'qword'.  'name' is unused.
 */
static void
reg_dump_gpuaddr64(const char *name, uint64_t qword, int level)
{
   const uint64_t addr = qword;

   dump_gpuaddr(addr, level);
}

/* Write a raw shader blob to a numbered file in the current directory.
 *
 * Only active when options->dump_shaders is set.  Files are named
 * "NNNN.<ext>", where NNNN is a counter that increments on every dump.
 *
 * @ext:   file name extension (without the dot)
 * @buf:   start of the shader data
 * @bufsz: size of the shader data in bytes
 *
 * Failures to open or write the file are reported on stderr; the dump is
 * best-effort and never aborts decoding.
 */
static void
dump_shader(const char *ext, void *buf, int bufsz)
{
   static int n = 0;
   char filename[32];

   if (!options->dump_shaders)
      return;

   /* bounded, so an unexpectedly long extension cannot overflow: */
   snprintf(filename, sizeof(filename), "%04d.%s", n++, ext);

   int fd = open(filename, O_WRONLY | O_TRUNC | O_CREAT, 0644);
   if (fd == -1) {
      warn("could not open %s", filename);
      return;
   }

   /* write() may be short or interrupted, so loop until all is written: */
   const char *pos = buf;
   ssize_t remaining = bufsz;

   while (remaining > 0) {
      ssize_t ret = write(fd, pos, remaining);

      if (ret < 0 && errno == EINTR)
         continue;

      if (ret <= 0) {
         warn("could not write %s", filename);
         break;
      }

      pos += ret;
      remaining -= ret;
   }

   close(fd);
}

/* Hexdump and disassemble the shader at 'gpuaddr' (rounded down to a
 * 16 byte boundary), and hand it to dump_shader() when the register name
 * identifies one of the known shader stages.
 *
 * Nothing happens when quiet(3) holds or the address has no host mapping.
 */
static void
disasm_gpuaddr(const char *name, uint64_t gpuaddr, int level)
{
   /* this is a bit ugly way, but oh well.. map a substring of the
    * register name to the dump file extension:
    */
   static const struct {
      const char *reg_substr;
      const char *file_ext;
   } shader_exts[] = {
      { "SP_VS_OBJ", "vo3" },
      { "SP_FS_OBJ", "fo3" },
      { "SP_GS_OBJ", "go3" },
      { "SP_CS_OBJ", "co3" },
      { NULL, NULL },
   };

   gpuaddr &= ~UINT64_C(0xf);

   if (quiet(3))
      return;

   void *host = hostptr(gpuaddr);
   if (!host)
      return;

   const uint32_t sizedwords = hostlen(gpuaddr) / 4;

   dump_hex(host, min(64, sizedwords), level + 1);
   try_disasm_a3xx(host, sizedwords, level + 2, stdout, options->gpu_id);

   for (unsigned i = 0; shader_exts[i].reg_substr; i++) {
      if (strstr(name, shader_exts[i].reg_substr)) {
         dump_shader(shader_exts[i].file_ext, host, sizedwords * 4);
         break;
      }
   }
}

/* Register callback: treat the 32-bit register value as the gpu address
 * of a shader and disassemble it.
 */
static void
reg_disasm_gpuaddr(const char *name, uint32_t dword, int level)
{
   const uint64_t addr = dword;

   disasm_gpuaddr(name, addr, level);
}

/* Register callback for the high half of a split 64-bit shader address:
 * combine it with the remembered gpuaddr_lo and disassemble the shader.
 */
static void
reg_disasm_gpuaddr_hi(const char *name, uint32_t dword, int level)
{
   const uint64_t high_half = ((uint64_t)dword) << 32;

   disasm_gpuaddr(name, high_half | gpuaddr_lo, level);
}

/* 64-bit register callback: disassemble the shader at the address held
 * in 'qword'.
 */
static void
reg_disasm_gpuaddr64(const char *name, uint64_t qword, int level)
{
   const uint64_t addr = qword;

   disasm_gpuaddr(name, addr, level);
}

/* Look up the value of the TEX_COUNT register that belongs to the named
 * TEX_SAMP/TEX_CONST register.
 *
 * The count register name is formed by replacing everything from the first
 * "CONST" (or, failing that, "SAMP") in 'name' with "COUNT".  Returns 0 when
 * 'name' contains neither.
 *
 * Note, this kinda assumes an equal # of samplers and textures, but not
 * really sure if there is a much better option.  I suppose on a6xx we
 * could instead decode the bitfields in SP_xS_CONFIG
 */
static int
get_tex_count(const char *name)
{
   char count_reg[strlen(name) + 5];

   const char *kind = strstr(name, "CONST");
   if (!kind)
      kind = strstr(name, "SAMP");
   if (!kind)
      return 0;

   /* keep the prefix, then append "COUNT" including its terminator: */
   const size_t prefix_len = kind - name;
   memcpy(count_reg, name, prefix_len);
   memcpy(count_reg + prefix_len, "COUNT", sizeof("COUNT"));

   return reg_val(regbase(count_reg));
}

static void
reg_dump_tex_samp_hi(const char *name, uint32_t dword, int level)
{
   if (!in_summary)
      return;

   int num_unit = get_tex_count(name);
   uint64_t gpuaddr = gpuaddr_lo | (((uint64_t)dword) << 32);
   void *buf = hostptr(gpuaddr);

   if (!buf)
      return;

   dump_tex_samp(buf, STATE_SRC_DIRECT, num_unit, level + 1);
}

static void
reg_dump_tex_const_hi(const char *name, uint32_t dword, int level)
{
   if (!in_summary)
      return;

   int num_unit = get_tex_count(name);
   uint64_t gpuaddr = gpuaddr_lo | (((uint64_t)dword) << 32);
   void *buf = hostptr(gpuaddr);

   if (!buf)
      return;

   dump_tex_const(buf, num_unit, level + 1);
}

/*
 * Registers with special handling (rnndec_decode() handles rest):
 *
 * One table per gpu generation, each terminated by an entry with a NULL
 * regname.  cffdec_init() points type0_reg at the table for the current
 * gpu, init_rnn() resolves each regname into regbase, and dump_register()
 * invokes the callback of the entry whose regbase matches:
 *
 *   regname  - symbolic register name (stringified by REG()/REG64())
 *   fxn      - callback for 32-bit registers
 *   fxn64    - callback for 64-bit registers (used when is_reg64 is set)
 *   regbase  - register offset, filled in by init_rnn()
 *   is_reg64 - selects fxn64 over fxn
 */
#define REG(x, fxn)    { #x, fxn }
#define REG64(x, fxn)  { #x, .fxn64 = fxn, .is_reg64 = true }
static struct {
   const char *regname;
   void (*fxn)(const char *name, uint32_t dword, int level);
   void (*fxn64)(const char *name, uint64_t qword, int level);
   uint32_t regbase;
   bool is_reg64;
} reg_a2xx[] = {
      REG(CP_SCRATCH_REG0, reg_dump_scratch),
      REG(CP_SCRATCH_REG1, reg_dump_scratch),
      REG(CP_SCRATCH_REG2, reg_dump_scratch),
      REG(CP_SCRATCH_REG3, reg_dump_scratch),
      REG(CP_SCRATCH_REG4, reg_dump_scratch),
      REG(CP_SCRATCH_REG5, reg_dump_scratch),
      REG(CP_SCRATCH_REG6, reg_dump_scratch),
      REG(CP_SCRATCH_REG7, reg_dump_scratch),
      {NULL},
}, reg_a3xx[] = {
      REG(CP_SCRATCH_REG0, reg_dump_scratch),
      REG(CP_SCRATCH_REG1, reg_dump_scratch),
      REG(CP_SCRATCH_REG2, reg_dump_scratch),
      REG(CP_SCRATCH_REG3, reg_dump_scratch),
      REG(CP_SCRATCH_REG4, reg_dump_scratch),
      REG(CP_SCRATCH_REG5, reg_dump_scratch),
      REG(CP_SCRATCH_REG6, reg_dump_scratch),
      REG(CP_SCRATCH_REG7, reg_dump_scratch),
      REG(VSC_SIZE_ADDRESS, reg_dump_gpuaddr),
      REG(SP_VS_PVT_MEM_ADDR_REG, reg_dump_gpuaddr),
      REG(SP_FS_PVT_MEM_ADDR_REG, reg_dump_gpuaddr),
      REG(SP_VS_OBJ_START_REG, reg_disasm_gpuaddr),
      REG(SP_FS_OBJ_START_REG, reg_disasm_gpuaddr),
      REG(TPL1_TP_FS_BORDER_COLOR_BASE_ADDR, reg_dump_gpuaddr),
      {NULL},
}, reg_a4xx[] = {
      REG(CP_SCRATCH[0].REG, reg_dump_scratch),
      REG(CP_SCRATCH[0x1].REG, reg_dump_scratch),
      REG(CP_SCRATCH[0x2].REG, reg_dump_scratch),
      REG(CP_SCRATCH[0x3].REG, reg_dump_scratch),
      REG(CP_SCRATCH[0x4].REG, reg_dump_scratch),
      REG(CP_SCRATCH[0x5].REG, reg_dump_scratch),
      REG(CP_SCRATCH[0x6].REG, reg_dump_scratch),
      REG(CP_SCRATCH[0x7].REG, reg_dump_scratch),
      REG(SP_VS_PVT_MEM_ADDR, reg_dump_gpuaddr),
      REG(SP_FS_PVT_MEM_ADDR, reg_dump_gpuaddr),
      REG(SP_GS_PVT_MEM_ADDR, reg_dump_gpuaddr),
      REG(SP_HS_PVT_MEM_ADDR, reg_dump_gpuaddr),
      REG(SP_DS_PVT_MEM_ADDR, reg_dump_gpuaddr),
      REG(SP_CS_PVT_MEM_ADDR, reg_dump_gpuaddr),
      REG(SP_VS_OBJ_START, reg_disasm_gpuaddr),
      REG(SP_FS_OBJ_START, reg_disasm_gpuaddr),
      REG(SP_GS_OBJ_START, reg_disasm_gpuaddr),
      REG(SP_HS_OBJ_START, reg_disasm_gpuaddr),
      REG(SP_DS_OBJ_START, reg_disasm_gpuaddr),
      REG(SP_CS_OBJ_START, reg_disasm_gpuaddr),
      REG(TPL1_TP_VS_BORDER_COLOR_BASE_ADDR, reg_dump_gpuaddr),
      REG(TPL1_TP_HS_BORDER_COLOR_BASE_ADDR, reg_dump_gpuaddr),
      REG(TPL1_TP_DS_BORDER_COLOR_BASE_ADDR, reg_dump_gpuaddr),
      REG(TPL1_TP_GS_BORDER_COLOR_BASE_ADDR, reg_dump_gpuaddr),
      REG(TPL1_TP_FS_BORDER_COLOR_BASE_ADDR, reg_dump_gpuaddr),
      {NULL},
}, reg_a5xx[] = {
      REG(CP_SCRATCH[0x4].REG, reg_dump_scratch),
      REG(CP_SCRATCH[0x5].REG, reg_dump_scratch),
      REG(CP_SCRATCH[0x6].REG, reg_dump_scratch),
      REG(CP_SCRATCH[0x7].REG, reg_dump_scratch),
      /* split 64-bit addresses: the _LO callback stashes the low half,
       * the _HI callback combines both halves and acts on the result:
       */
      REG(SP_VS_OBJ_START_LO, reg_gpuaddr_lo),
      REG(SP_VS_OBJ_START_HI, reg_disasm_gpuaddr_hi),
      REG(SP_HS_OBJ_START_LO, reg_gpuaddr_lo),
      REG(SP_HS_OBJ_START_HI, reg_disasm_gpuaddr_hi),
      REG(SP_DS_OBJ_START_LO, reg_gpuaddr_lo),
      REG(SP_DS_OBJ_START_HI, reg_disasm_gpuaddr_hi),
      REG(SP_GS_OBJ_START_LO, reg_gpuaddr_lo),
      REG(SP_GS_OBJ_START_HI, reg_disasm_gpuaddr_hi),
      REG(SP_FS_OBJ_START_LO, reg_gpuaddr_lo),
      REG(SP_FS_OBJ_START_HI, reg_disasm_gpuaddr_hi),
      REG(SP_CS_OBJ_START_LO, reg_gpuaddr_lo),
      REG(SP_CS_OBJ_START_HI, reg_disasm_gpuaddr_hi),
      REG(TPL1_VS_TEX_CONST_LO, reg_gpuaddr_lo),
      REG(TPL1_VS_TEX_CONST_HI, reg_dump_tex_const_hi),
      REG(TPL1_VS_TEX_SAMP_LO, reg_gpuaddr_lo),
      REG(TPL1_VS_TEX_SAMP_HI, reg_dump_tex_samp_hi),
      REG(TPL1_HS_TEX_CONST_LO, reg_gpuaddr_lo),
      REG(TPL1_HS_TEX_CONST_HI, reg_dump_tex_const_hi),
      REG(TPL1_HS_TEX_SAMP_LO, reg_gpuaddr_lo),
      REG(TPL1_HS_TEX_SAMP_HI, reg_dump_tex_samp_hi),
      REG(TPL1_DS_TEX_CONST_LO, reg_gpuaddr_lo),
      REG(TPL1_DS_TEX_CONST_HI, reg_dump_tex_const_hi),
      REG(TPL1_DS_TEX_SAMP_LO, reg_gpuaddr_lo),
      REG(TPL1_DS_TEX_SAMP_HI, reg_dump_tex_samp_hi),
      REG(TPL1_GS_TEX_CONST_LO, reg_gpuaddr_lo),
      REG(TPL1_GS_TEX_CONST_HI, reg_dump_tex_const_hi),
      REG(TPL1_GS_TEX_SAMP_LO, reg_gpuaddr_lo),
      REG(TPL1_GS_TEX_SAMP_HI, reg_dump_tex_samp_hi),
      REG(TPL1_FS_TEX_CONST_LO, reg_gpuaddr_lo),
      REG(TPL1_FS_TEX_CONST_HI, reg_dump_tex_const_hi),
      REG(TPL1_FS_TEX_SAMP_LO, reg_gpuaddr_lo),
      REG(TPL1_FS_TEX_SAMP_HI, reg_dump_tex_samp_hi),
      REG(TPL1_CS_TEX_CONST_LO, reg_gpuaddr_lo),
      REG(TPL1_CS_TEX_CONST_HI, reg_dump_tex_const_hi),
      REG(TPL1_CS_TEX_SAMP_LO, reg_gpuaddr_lo),
      REG(TPL1_CS_TEX_SAMP_HI, reg_dump_tex_samp_hi),
      REG(TPL1_TP_BORDER_COLOR_BASE_ADDR_LO, reg_gpuaddr_lo),
      REG(TPL1_TP_BORDER_COLOR_BASE_ADDR_HI, reg_dump_gpuaddr_hi),
//      REG(RB_MRT_FLAG_BUFFER[0].ADDR_LO, reg_gpuaddr_lo),
//      REG(RB_MRT_FLAG_BUFFER[0].ADDR_HI, reg_dump_gpuaddr_hi),
//      REG(RB_MRT_FLAG_BUFFER[1].ADDR_LO, reg_gpuaddr_lo),
//      REG(RB_MRT_FLAG_BUFFER[1].ADDR_HI, reg_dump_gpuaddr_hi),
//      REG(RB_MRT_FLAG_BUFFER[2].ADDR_LO, reg_gpuaddr_lo),
//      REG(RB_MRT_FLAG_BUFFER[2].ADDR_HI, reg_dump_gpuaddr_hi),
//      REG(RB_MRT_FLAG_BUFFER[3].ADDR_LO, reg_gpuaddr_lo),
//      REG(RB_MRT_FLAG_BUFFER[3].ADDR_HI, reg_dump_gpuaddr_hi),
//      REG(RB_MRT_FLAG_BUFFER[4].ADDR_LO, reg_gpuaddr_lo),
//      REG(RB_MRT_FLAG_BUFFER[4].ADDR_HI, reg_dump_gpuaddr_hi),
//      REG(RB_MRT_FLAG_BUFFER[5].ADDR_LO, reg_gpuaddr_lo),
//      REG(RB_MRT_FLAG_BUFFER[5].ADDR_HI, reg_dump_gpuaddr_hi),
//      REG(RB_MRT_FLAG_BUFFER[6].ADDR_LO, reg_gpuaddr_lo),
//      REG(RB_MRT_FLAG_BUFFER[6].ADDR_HI, reg_dump_gpuaddr_hi),
//      REG(RB_MRT_FLAG_BUFFER[7].ADDR_LO, reg_gpuaddr_lo),
//      REG(RB_MRT_FLAG_BUFFER[7].ADDR_HI, reg_dump_gpuaddr_hi),
//      REG(RB_BLIT_FLAG_DST_LO, reg_gpuaddr_lo),
//      REG(RB_BLIT_FLAG_DST_HI, reg_dump_gpuaddr_hi),
//      REG(RB_MRT[0].BASE_LO, reg_gpuaddr_lo),
//      REG(RB_MRT[0].BASE_HI, reg_dump_gpuaddr_hi),
//      REG(RB_DEPTH_BUFFER_BASE_LO, reg_gpuaddr_lo),
//      REG(RB_DEPTH_BUFFER_BASE_HI, reg_dump_gpuaddr_hi),
//      REG(RB_DEPTH_FLAG_BUFFER_BASE_LO, reg_gpuaddr_lo),
//      REG(RB_DEPTH_FLAG_BUFFER_BASE_HI, reg_dump_gpuaddr_hi),
//      REG(RB_BLIT_DST_LO, reg_gpuaddr_lo),
//      REG(RB_BLIT_DST_HI, reg_dump_gpuaddr_hi),

//      REG(RB_2D_SRC_LO, reg_gpuaddr_lo),
//      REG(RB_2D_SRC_HI, reg_dump_gpuaddr_hi),
//      REG(RB_2D_SRC_FLAGS_LO, reg_gpuaddr_lo),
//      REG(RB_2D_SRC_FLAGS_HI, reg_dump_gpuaddr_hi),
//      REG(RB_2D_DST_LO, reg_gpuaddr_lo),
//      REG(RB_2D_DST_HI, reg_dump_gpuaddr_hi),
//      REG(RB_2D_DST_FLAGS_LO, reg_gpuaddr_lo),
//      REG(RB_2D_DST_FLAGS_HI, reg_dump_gpuaddr_hi),

      {NULL},
}, reg_a6xx[] = {
      REG(CP_SCRATCH[0x4].REG, reg_dump_scratch),
      REG(CP_SCRATCH[0x5].REG, reg_dump_scratch),
      REG(CP_SCRATCH[0x6].REG, reg_dump_scratch),
      REG(CP_SCRATCH[0x7].REG, reg_dump_scratch),

      REG64(SP_VS_OBJ_START, reg_disasm_gpuaddr64),
      REG64(SP_HS_OBJ_START, reg_disasm_gpuaddr64),
      REG64(SP_DS_OBJ_START, reg_disasm_gpuaddr64),
      REG64(SP_GS_OBJ_START, reg_disasm_gpuaddr64),
      REG64(SP_FS_OBJ_START, reg_disasm_gpuaddr64),
      REG64(SP_CS_OBJ_START, reg_disasm_gpuaddr64),

      REG64(SP_VS_TEX_CONST, reg_dump_gpuaddr64),
      REG64(SP_VS_TEX_SAMP, reg_dump_gpuaddr64),
      REG64(SP_HS_TEX_CONST, reg_dump_gpuaddr64),
      REG64(SP_HS_TEX_SAMP, reg_dump_gpuaddr64),
      REG64(SP_DS_TEX_CONST, reg_dump_gpuaddr64),
      REG64(SP_DS_TEX_SAMP, reg_dump_gpuaddr64),
      REG64(SP_GS_TEX_CONST, reg_dump_gpuaddr64),
      REG64(SP_GS_TEX_SAMP, reg_dump_gpuaddr64),
      REG64(SP_FS_TEX_CONST, reg_dump_gpuaddr64),
      REG64(SP_FS_TEX_SAMP, reg_dump_gpuaddr64),
      REG64(SP_CS_TEX_CONST, reg_dump_gpuaddr64),
      REG64(SP_CS_TEX_SAMP, reg_dump_gpuaddr64),

      {NULL},
}, *type0_reg;

/* Register database handle; created and loaded by init_rnn(). */
static struct rnn *rnn;

/* Create and load the register database for 'gpuname', then resolve the
 * names that could not be looked up before the database was available:
 *
 *  - each options->querystrs[] entry is parsed as a number, falling back
 *    to a symbolic register lookup when that yields 0, and stored in
 *    queryvals[];
 *  - each entry of the current type0_reg table gets its regbase filled in.
 *
 * Exits the process if the query table cannot be allocated or if a table
 * entry names a register the database does not know.
 */
static void
init_rnn(const char *gpuname)
{
   rnn = rnn_new(!options->color);

   rnn_load(rnn, gpuname);

   if (options->querystrs) {
      int i;
      queryvals = calloc(options->nquery, sizeof(queryvals[0]));

      /* calloc() may legitimately return NULL for a zero-sized request: */
      if (!queryvals && (options->nquery > 0))
         err(1, "could not allocate query table");

      for (i = 0; i < options->nquery; i++) {
         int val = strtol(options->querystrs[i], NULL, 0);

         /* not a (non-zero) number, so try it as a register name: */
         if (val == 0)
            val = regbase(options->querystrs[i]);

         queryvals[i] = val;
         printf("querystr: %s -> 0x%x\n", options->querystrs[i], queryvals[i]);
      }
   }

   for (unsigned idx = 0; type0_reg[idx].regname; idx++) {
      type0_reg[idx].regbase = regbase(type0_reg[idx].regname);
      if (!type0_reg[idx].regbase) {
         printf("invalid register name: %s\n", type0_reg[idx].regname);
         exit(1);
      }
   }
}

/* Forget all tracked register state: the written/rewritten bitmasks, the
 * last-value table and the per-IB-level tracking in ibs[].
 */
void
reset_regs(void)
{
   clear_written();
   clear_lastvals();
   memset(ibs, 0x00, sizeof ibs);
}

/* (Re)initialize the decoder for a new capture.
 *
 * Stores '_options' (the pointer is kept, not copied), resets all tracked
 * register state and the draw counter, selects the special-register table
 * matching options->gpu_id and loads the corresponding register database.
 *
 * Exits via errx() when gpu_id is outside the supported 200..699 range.
 */
void
cffdec_init(const struct cffdec_options *_options)
{
   options = _options;
   summary = options->summary;

   /* in case we're decoding multiple files: */
   free(queryvals);
   /* init_rnn() only re-allocates when query strings are given, so clear
    * the pointer to avoid leaving it dangling (and freeing it twice on a
    * later init):
    */
   queryvals = NULL;
   reset_regs();
   draw_count = 0;

   /* TODO we need an API to free/cleanup any previous rnn */

   switch (options->gpu_id) {
   case 200 ... 299:
      type0_reg = reg_a2xx;
      init_rnn("a2xx");
      break;
   case 300 ... 399:
      type0_reg = reg_a3xx;
      init_rnn("a3xx");
      break;
   case 400 ... 499:
      type0_reg = reg_a4xx;
      init_rnn("a4xx");
      break;
   case 500 ... 599:
      type0_reg = reg_a5xx;
      init_rnn("a5xx");
      break;
   case 600 ... 699:
      type0_reg = reg_a6xx;
      init_rnn("a6xx");
      break;
   default:
      errx(-1, "unsupported gpu");
   }
}

/* Look up the name of type-3 packet opcode 'opc' in the register
 * database's "adreno_pm4_type3_packets" enum.
 */
const char *
pktname(unsigned opc)
{
   static const char pkt_enum[] = "adreno_pm4_type3_packets";

   return rnn_enumname(rnn, pkt_enum, opc);
}

/* Look up the name of the register at offset 'regbase' in the register
 * database; 'color' is passed through to the lookup.
 */
const char *
regname(uint32_t regbase, int color)
{
   const char *name = rnn_regname(rnn, regbase, color);

   return name;
}

/* Look up the offset of the register called 'name' in the register
 * database.  Callers in this file treat a result of 0 as "not found".
 */
uint32_t
regbase(const char *name)
{
   const uint32_t offset = rnn_regbase(rnn, name);

   return offset;
}

/* Returns nonzero if the (uncolored) name of the register at 'regbase'
 * ends with 'suffix', zero otherwise.
 *
 * The tail of the name is compared directly, so a suffix that also occurs
 * earlier in the name (e.g. "FOO_HI_BAR_HI") is still matched, and no
 * pointer before the start of the name is ever formed.
 */
static int
endswith(uint32_t regbase, const char *suffix)
{
   const char *name = regname(regbase, 0);

   /* defensive: nothing to compare if the lookup produced no name */
   if (!name)
      return 0;

   const size_t name_len = strlen(name);
   const size_t suffix_len = strlen(suffix);

   if (suffix_len > name_len)
      return 0;

   return strcmp(name + (name_len - suffix_len), suffix) == 0;
}

/* Print one decoded register line at the indentation for 'level'.
 *
 * Registers unknown to the database are printed as "<offset>: value",
 * registers without type info as "name: value", and typed registers as
 * "name: decoded".  For typed registers that look like (half of) a gpu
 * address with a host mapping, the containing buffer's base, the offset
 * into it and its size are appended.
 */
void
dump_register_val(uint32_t regbase, uint32_t dword, int level)
{
   struct rnndecaddrinfo *info = rnn_reginfo(rnn, regbase);

   if (!info) {
      printf("%s<%04x>: %08x\n", levels[level], regbase, dword);
      return;
   }

   if (!info->typeinfo) {
      printf("%s%s: %08x\n", levels[level], info->name, dword);
   } else {
      uint64_t addr = 0;
      char *decoded = rnndec_decodeval(rnn->vc, info->typeinfo, dword);

      printf("%s%s: %s", levels[level], info->name, decoded);

      /* Try and figure out if we are looking at a gpuaddr.. this
       * might be useful for other gen's too, but at least a5xx has
       * the _HI/_LO suffix we can look for.  Maybe a better approach
       * would be some special annotation in the xml..
       * for a6xx use "address" and "waddress" types
       */
      if (options->gpu_id >= 600) {
         const char *type = info->typeinfo->name;

         if (!strcmp(type, "address") || !strcmp(type, "waddress"))
            addr = (((uint64_t)reg_val(regbase + 1)) << 32) | dword;
      } else if (options->gpu_id >= 500) {
         if (endswith(regbase, "_HI") && endswith(regbase - 1, "_LO"))
            addr = (((uint64_t)dword) << 32) | reg_val(regbase - 1);
         else if (endswith(regbase, "_LO") && endswith(regbase + 1, "_HI"))
            addr = (((uint64_t)reg_val(regbase + 1)) << 32) | dword;
      }

      if (addr && hostptr(addr)) {
         printf("\t\tbase=%" PRIx64 ", offset=%" PRIu64 ", size=%u",
                gpubaseaddr(addr), addr - gpubaseaddr(addr),
                hostlen(gpubaseaddr(addr)));
      }

      printf("\n");

      free(decoded);
   }

   free(info->name);
   free(info);
}

/* Print a register write (unless quiet(3) holds) and then run the special
 * callback, if any, that the current type0_reg table registers for it.
 *
 * For 64-bit table entries the callback receives 'dword' as the low half
 * and the shadowed value of the following register as the high half.
 */
static void
dump_register(uint32_t regbase, uint32_t dword, int level)
{
   if (!quiet(3))
      dump_register_val(regbase, dword, level);

   for (unsigned idx = 0; type0_reg[idx].regname; idx++) {
      if (type0_reg[idx].regbase != regbase)
         continue;

      const char *name = type0_reg[idx].regname;

      if (!type0_reg[idx].is_reg64) {
         type0_reg[idx].fxn(name, dword, level);
      } else {
         const uint64_t high_half = ((uint64_t)reg_val(regbase + 1)) << 32;

         type0_reg[idx].fxn64(name, high_half | dword, level);
      }

      /* only the first matching entry is used: */
      return;
   }
}

/* True when 'regbase' lies in the range 0x2000..0x23ff (inclusive), which
 * dump_registers() treats as banked registers.
 */
static bool
is_banked_reg(uint32_t regbase)
{
   const uint32_t first_banked = 0x2000;
   const uint32_t last_banked = 0x23ff;

   return (regbase >= first_banked) && (regbase <= last_banked);
}

/* Record and print a run of 'sizedwords' consecutive register writes
 * starting at register 'regbase'.
 *
 * The 'summary' flag is saved before and restored after each register, so
 * any change made to it while dumping a register does not persist.
 */
static void
dump_registers(uint32_t regbase, uint32_t *dwords, uint32_t sizedwords,
               int level)
{
   for (uint32_t idx = 0; idx < sizedwords; idx++) {
      const uint32_t reg = regbase + idx;
      const uint32_t val = dwords[idx];
      const int saved_summary = summary;

      /* access to non-banked registers needs a WFI:
       * TODO banked register range for a2xx??
       */
      if (needs_wfi && !is_banked_reg(reg))
         printl(2, "NEEDS WFI: %s (%x)\n", regname(reg, 1), reg);

      reg_set(reg, val);
      dump_register(reg, val, level);

      summary = saved_summary;
   }
}

/* Decode a packet payload against the rnn domain called 'name' and print
 * one line per decoded field at the indentation for 'level'.
 *
 * The script packet hook, when installed, is called before the quiet(2)
 * check, so it runs even when nothing is printed.  Decoding stops at the
 * first dword the domain has no type information for.  Does nothing if
 * the domain is not found.
 */
static void
dump_domain(uint32_t *dwords, uint32_t sizedwords, int level, const char *name)
{
   struct rnndomain *dom;
   int i;

   dom = rnn_finddomain(rnn->db, name);

   if (!dom)
      return;

   if (script_packet)
      script_packet(dwords, sizedwords, rnn, dom);

   if (quiet(2))
      return;

   for (i = 0; i < sizedwords; i++) {
      struct rnndecaddrinfo *info = rnndec_decodeaddr(rnn->vc, dom, i, 0);
      char *decoded;
      if (!(info && info->typeinfo)) {
         /* release the lookup result before bailing out, as is done on
          * the normal path at the bottom of the loop:
          */
         if (info) {
            free(info->name);
            free(info);
         }
         break;
      }
      uint64_t value = dwords[i];
      if (info->typeinfo->high >= 32 && i < sizedwords - 1) {
         value |= (uint64_t)dwords[i + 1] << 32;
         i++; /* skip the next dword since we're printing it now */
      }
      decoded = rnndec_decodeval(rnn->vc, info->typeinfo, value);
      /* Unlike the register printing path, we don't print the name
       * of the register, so if it doesn't contain other named
       * things (i.e. it isn't a bitset) then print the register
       * name as if it's a bitset with a single entry. This avoids
       * having to create a dummy register with a single entry to
       * get a name in the decoding.
       */
      if (info->typeinfo->type == RNN_TTYPE_BITSET ||
          info->typeinfo->type == RNN_TTYPE_INLINE_BITSET) {
         printf("%s%s\n", levels[level], decoded);
      } else {
         printf("%s{ %s%s%s = %s }\n", levels[level], rnn->vc->colors->rname,
                info->name, rnn->vc->colors->reset, decoded);
      }
      free(decoded);
      free(info->name);
      free(info);
   }
}

/* Bin bounds printed by __do_query(); on gpu_id 500..699 they are
 * refreshed there from the window scissor registers.
 */
static uint32_t bin_x1, bin_x2, bin_y1, bin_y2;
/* NOTE(review): not referenced in the visible part of this file. */
static unsigned mode;
/* Name of the current render mode, printed by print_mode()/__do_query(). */
static const char *render_mode;
/* Bitmask of render modes; temporarily overridden by do_query_compare()
 * around its load_all_groups() calls.
 */
static enum {
   MODE_BINNING = 0x1,
   MODE_GMEM = 0x2,
   MODE_BYPASS = 0x4,
   MODE_ALL = MODE_BINNING | MODE_GMEM | MODE_BYPASS,
} enable_mask = MODE_ALL;
/* Printed by print_mode() as "skip_ib2: g=.., l=..". */
static bool skip_ib2_enable_global;
static bool skip_ib2_enable_local;

/* Print the current render mode and the skip_ib2 flags at the indentation
 * for 'level'.  Only prints for gpu_id >= 500 and when quiet(2) does not
 * hold.
 */
static void
print_mode(int level)
{
   if (options->gpu_id < 500)
      return;

   if (quiet(2))
      return;

   const char *indent = levels[level];

   printf("%smode: %s\n", indent, render_mode);
   printf("%sskip_ib2: g=%d, l=%d\n", indent, skip_ib2_enable_global,
          skip_ib2_enable_local);
}

static bool
skip_query(void)
{
   switch (options->query_mode) {
   case QUERY_ALL:
      /* never skip: */
      return false;
   case QUERY_WRITTEN:
      for (int i = 0; i < options->nquery; i++) {
         uint32_t regbase = queryvals[i];
         if (!reg_written(regbase)) {
            continue;
         }
         if (reg_rewritten(regbase)) {
            return false;
         }
      }
      return true;
   case QUERY_DELTA:
      for (int i = 0; i < options->nquery; i++) {
         uint32_t regbase = queryvals[i];
         if (!reg_written(regbase)) {
            continue;
         }
         uint32_t lastval = reg_val(regbase);
         if (lastval != lastvals[regbase]) {
            return false;
         }
      }
      return true;
   }
   return true;
}

static void
__do_query(const char *primtype, uint32_t num_indices)
{
   int n = 0;

   if ((500 <= options->gpu_id) && (options->gpu_id < 700)) {
      uint32_t scissor_tl = reg_val(regbase("GRAS_SC_WINDOW_SCISSOR_TL"));
      uint32_t scissor_br = reg_val(regbase("GRAS_SC_WINDOW_SCISSOR_BR"));

      bin_x1 = scissor_tl & 0xffff;
      bin_y1 = scissor_tl >> 16;
      bin_x2 = scissor_br & 0xffff;
      bin_y2 = scissor_br >> 16;
   }

   for (int i = 0; i < options->nquery; i++) {
      uint32_t regbase = queryvals[i];
      if (reg_written(regbase)) {
         uint32_t lastval = reg_val(regbase);
         printf("%4d: %s(%u,%u-%u,%u):%u:", draw_count, primtype, bin_x1,
                bin_y1, bin_x2, bin_y2, num_indices);
         if (options->gpu_id >= 500)
            printf("%s:", render_mode);
         printf("\t%08x", lastval);
         if (lastval != lastvals[regbase]) {
            printf("!");
         } else {
            printf(" ");
         }
         if (reg_rewritten(regbase)) {
            printf("+");
         } else {
            printf(" ");
         }
         dump_register_val(regbase, lastval, 0);
         n++;
      }
   }

   if (n > 1)
      printf("\n");
}

/* Query output for 'query-compare' mode: replay all draw-state groups once
 * with every mode enabled to decide whether anything is worth printing,
 * then replay them separately for the binning pass and for the draw pass,
 * printing the queried registers after each.
 *
 * enable_mask and render_mode are saved on entry and restored before the
 * groups are disabled again at the end.
 */
static void
do_query_compare(const char *primtype, uint32_t num_indices)
{
   unsigned saved_enable_mask = enable_mask;
   const char *saved_render_mode = render_mode;

   /* in 'query-compare' mode, we want to see if the register is written
    * or changed in any mode:
    *
    * (NOTE: this could cause false-positive for 'query-delta' if the reg
    * is written with different values in binning vs sysmem/gmem mode, as
    * we don't track previous values per-mode, but I think we can live with
    * that)
    */
   enable_mask = MODE_ALL;

   clear_rewritten();
   load_all_groups(0);

   if (!skip_query()) {
      /* dump binning pass values: */
      enable_mask = MODE_BINNING;
      render_mode = "BINNING";
      clear_rewritten();
      load_all_groups(0);
      __do_query(primtype, num_indices);

      /* dump draw pass values: */
      enable_mask = MODE_GMEM | MODE_BYPASS;
      render_mode = "DRAW";
      clear_rewritten();
      load_all_groups(0);
      __do_query(primtype, num_indices);

      printf("\n");
   }

   enable_mask = saved_enable_mask;
   render_mode = saved_render_mode;

   disable_all_groups();
}

/* Per-draw hook for query and script mode.
 *
 * Calls the script draw hook when one is installed, then prints the query
 * output: via do_query_compare() in query-compare mode, otherwise via
 * __do_query() unless skip_query() says to skip this draw.
 *
 * NOTE: call this before dump_register_summary()
 */
static void
do_query(const char *primtype, uint32_t num_indices)
{
   if (script_draw)
      script_draw(primtype, num_indices);

   if (options->query_compare)
      do_query_compare(primtype, num_indices);
   else if (!skip_query())
      __do_query(primtype, num_indices);
}

/**
 * Decode an immediate shader load: print the shader kind, start and size,
 * disassemble the inline shader body and, for known shader kinds, dump the
 * raw shader.
 *
 * dwords[0] selects the shader kind (0 = vertex, 1 = fragment), dwords[1]
 * packs start (upper 16 bits) and size (lower 16 bits), and the shader body
 * begins at dwords[2].
 */
static void
cp_im_loadi(uint32_t *dwords, uint32_t sizedwords, int level)
{
   const uint32_t shader_kind = dwords[0];
   const uint32_t start = dwords[1] >> 16;
   const uint32_t size = dwords[1] & 0xffff;
   uint32_t *body = dwords + 2;
   const uint32_t body_dwords = sizedwords - 2;

   /* defaults describe an unrecognised shader kind: */
   const char *type = "<unknown>";
   const char *ext = NULL;
   gl_shader_stage disasm_type = 0;

   if (shader_kind == 0) {
      type = "vertex";
      ext = "vo";
      disasm_type = MESA_SHADER_VERTEX;
   } else if (shader_kind == 1) {
      type = "fragment";
      ext = "fo";
      disasm_type = MESA_SHADER_FRAGMENT;
   }

   printf("%s%s shader, start=%04x, size=%04x\n", levels[level], type, start,
          size);
   disasm_a2xx(body, body_dwords, level + 2, disasm_type);

   /* the raw shader is only dumped when we know which extension to use: */
   if (ext)
      dump_shader(ext, body, body_dwords * 4);
}

/**
 * Decode a wide register write: dwords[0] (low 16 bits) gives the first
 * register offset and every following dword is the value for the next
 * consecutive register.  Each value is printed and recorded via reg_set().
 */
static void
cp_wide_reg_write(uint32_t *dwords, uint32_t sizedwords, int level)
{
   uint32_t reg = dwords[0] & 0xffff;

   for (uint32_t idx = 1; idx < sizedwords; idx++, reg++) {
      const uint32_t value = dwords[idx];

      dump_register(reg, value, level + 1);
      reg_set(reg, value);
   }
}

/* Kind of state carried by a load-state packet.  Zero is deliberately not
 * a named value, so zero-initialized lookup table slots fall through to
 * the "unknown" handling.
 */
enum state_t {
   TEX_SAMP = 1,
   TEX_CONST = 2,
   TEX_MIPADDR = 3, /* a3xx only */
   SHADER_PROG = 4,
   SHADER_CONST = 5,

   /* image/ssbo state: */
   SSBO_0 = 6,
   SSBO_1 = 7,
   SSBO_2 = 8,

   UBO = 9,

   /* unknown things, only hexdumped: */
   UNKNOWN_DWORDS = 10,
   UNKNOWN_2DWORDS = 11,
   UNKNOWN_4DWORDS = 12,
};

/* State block ids used to index the a3xx load-state lookup table. */
enum adreno_state_block {
   SB_VERT_TEX = 0x0,
   SB_VERT_MIPADDR = 0x1,
   SB_FRAG_TEX = 0x2,
   SB_FRAG_MIPADDR = 0x3,
   SB_VERT_SHADER = 0x4,
   SB_GEOM_SHADER = 0x5,
   SB_FRAG_SHADER = 0x6,
   SB_COMPUTE_SHADER = 0x7,
};

/* TODO there is probably a clever way to let rnndec parse things so
 * we don't have to care about packet format differences across gens
 */

/**
 * Decode the header of an a3xx load-state packet.
 *
 * @param dwords  packet payload; dwords[0] holds the state block id
 *                (bits 19..21) and state source (bits 16..18), dwords[1]
 *                holds the state type (bits 0..1)
 * @param stage   out: shader stage the state applies to
 * @param state   out: kind of state; 0 (not a named enum state_t value)
 *                when the block/type combination is not in the table
 * @param src     out: STATE_SRC_DIRECT when the source field is 0,
 *                otherwise STATE_SRC_INDIRECT
 */
static void
a3xx_get_state_type(uint32_t *dwords, gl_shader_stage *stage,
                    enum state_t *state, enum state_src_t *src)
{
   unsigned state_block_id = (dwords[0] >> 19) & 0x7;
   unsigned state_type = dwords[1] & 0x3;
   /* Dimensions match the masks above (3-bit block id, 2-bit type) so
    * every decodable index stays inside its own row.  The inner dimension
    * used to be 3, which made state_type == 3 read the first entry of the
    * *next* state block instead of an empty slot.
    */
   static const struct {
      gl_shader_stage stage;
      enum state_t state;
   } lookup[0x8][0x4] = {
      [SB_VERT_TEX][0] = {MESA_SHADER_VERTEX, TEX_SAMP},
      [SB_VERT_TEX][1] = {MESA_SHADER_VERTEX, TEX_CONST},
      [SB_FRAG_TEX][0] = {MESA_SHADER_FRAGMENT, TEX_SAMP},
      [SB_FRAG_TEX][1] = {MESA_SHADER_FRAGMENT, TEX_CONST},
      [SB_VERT_SHADER][0] = {MESA_SHADER_VERTEX, SHADER_PROG},
      [SB_VERT_SHADER][1] = {MESA_SHADER_VERTEX, SHADER_CONST},
      [SB_FRAG_SHADER][0] = {MESA_SHADER_FRAGMENT, SHADER_PROG},
      [SB_FRAG_SHADER][1] = {MESA_SHADER_FRAGMENT, SHADER_CONST},
   };

   *stage = lookup[state_block_id][state_type].stage;
   *state = lookup[state_block_id][state_type].state;
   unsigned state_src = (dwords[0] >> 16) & 0x7;
   if (state_src == 0 /* SS_DIRECT */)
      *src = STATE_SRC_DIRECT;
   else
      *src = STATE_SRC_INDIRECT;
}

static enum state_src_t
_get_state_src(unsigned dword0)
{
   switch ((dword0 >> 16) & 0x3) {
   case 0: /* SS4_DIRECT / SS6_DIRECT */
      return STATE_SRC_DIRECT;
   case 2: /* SS4_INDIRECT / SS6_INDIRECT */
      return STATE_SRC_INDIRECT;
   case 1: /* SS6_BINDLESS */
      return STATE_SRC_BINDLESS;
   default:
      return STATE_SRC_DIRECT;
   }
}

/**
 * Shared a4xx+ lookup from (state block id, state type) to the shader
 * stage and kind of state being loaded.
 *
 * The table is sized for a 4-bit block id and a 2-bit state type.
 * Combinations that are not listed yield stage 0 and state 0.
 */
static void
_get_state_type(unsigned state_block_id, unsigned state_type,
                gl_shader_stage *stage, enum state_t *state)
{
   struct state_entry {
      gl_shader_stage stage;
      enum state_t state;
   };
   static const struct state_entry table[0x10][0x4] = {
      /* SB4_VS_TEX: */
      [0x0][0] = {MESA_SHADER_VERTEX, TEX_SAMP},
      [0x0][1] = {MESA_SHADER_VERTEX, TEX_CONST},
      [0x0][2] = {MESA_SHADER_VERTEX, UBO},
      /* SB4_HS_TEX: */
      [0x1][0] = {MESA_SHADER_TESS_CTRL, TEX_SAMP},
      [0x1][1] = {MESA_SHADER_TESS_CTRL, TEX_CONST},
      [0x1][2] = {MESA_SHADER_TESS_CTRL, UBO},
      /* SB4_DS_TEX: */
      [0x2][0] = {MESA_SHADER_TESS_EVAL, TEX_SAMP},
      [0x2][1] = {MESA_SHADER_TESS_EVAL, TEX_CONST},
      [0x2][2] = {MESA_SHADER_TESS_EVAL, UBO},
      /* SB4_GS_TEX: */
      [0x3][0] = {MESA_SHADER_GEOMETRY, TEX_SAMP},
      [0x3][1] = {MESA_SHADER_GEOMETRY, TEX_CONST},
      [0x3][2] = {MESA_SHADER_GEOMETRY, UBO},
      /* SB4_FS_TEX: */
      [0x4][0] = {MESA_SHADER_FRAGMENT, TEX_SAMP},
      [0x4][1] = {MESA_SHADER_FRAGMENT, TEX_CONST},
      [0x4][2] = {MESA_SHADER_FRAGMENT, UBO},
      /* SB4_CS_TEX: */
      [0x5][0] = {MESA_SHADER_COMPUTE, TEX_SAMP},
      [0x5][1] = {MESA_SHADER_COMPUTE, TEX_CONST},
      [0x5][2] = {MESA_SHADER_COMPUTE, UBO},
      /* Unknown things.  0x6 looks like combined UBO state for 3d stages
       * (a5xx and before??); I think a6xx has UBO state per shader stage:
       */
      [0x6][2] = {0, UBO},
      [0x7][1] = {0, UNKNOWN_2DWORDS},
      /* SB4_VS_SHADER: */
      [0x8][0] = {MESA_SHADER_VERTEX, SHADER_PROG},
      [0x8][1] = {MESA_SHADER_VERTEX, SHADER_CONST},
      [0x8][2] = {MESA_SHADER_VERTEX, UBO},
      /* SB4_HS_SHADER: */
      [0x9][0] = {MESA_SHADER_TESS_CTRL, SHADER_PROG},
      [0x9][1] = {MESA_SHADER_TESS_CTRL, SHADER_CONST},
      [0x9][2] = {MESA_SHADER_TESS_CTRL, UBO},
      /* SB4_DS_SHADER: */
      [0xa][0] = {MESA_SHADER_TESS_EVAL, SHADER_PROG},
      [0xa][1] = {MESA_SHADER_TESS_EVAL, SHADER_CONST},
      [0xa][2] = {MESA_SHADER_TESS_EVAL, UBO},
      /* SB4_GS_SHADER: */
      [0xb][0] = {MESA_SHADER_GEOMETRY, SHADER_PROG},
      [0xb][1] = {MESA_SHADER_GEOMETRY, SHADER_CONST},
      [0xb][2] = {MESA_SHADER_GEOMETRY, UBO},
      /* SB4_FS_SHADER: */
      [0xc][0] = {MESA_SHADER_FRAGMENT, SHADER_PROG},
      [0xc][1] = {MESA_SHADER_FRAGMENT, SHADER_CONST},
      [0xc][2] = {MESA_SHADER_FRAGMENT, UBO},
      /* SB4_CS_SHADER: */
      [0xd][0] = {MESA_SHADER_COMPUTE, SHADER_PROG},
      [0xd][1] = {MESA_SHADER_COMPUTE, SHADER_CONST},
      [0xd][2] = {MESA_SHADER_COMPUTE, UBO},
      [0xd][3] = {MESA_SHADER_COMPUTE, SSBO_0}, /* a6xx location */
      /* SB4_SSBO (shared across all stages): */
      [0xe][0] = {0, SSBO_0}, /* a5xx (and a4xx?) location */
      [0xe][1] = {0, SSBO_1},
      [0xe][2] = {0, SSBO_2},
      /* SB4_CS_SSBO: */
      [0xf][0] = {MESA_SHADER_COMPUTE, SSBO_0},
      [0xf][1] = {MESA_SHADER_COMPUTE, SSBO_1},
      [0xf][2] = {MESA_SHADER_COMPUTE, SSBO_2},
   };
   const struct state_entry *entry = &table[state_block_id][state_type];

   *stage = entry->stage;
   *state = entry->state;
}

/**
 * Decode an a4xx/a5xx load-state header: the state block id sits in bits
 * 18..21 of dwords[0] and the state type in bits 0..1 of dwords[1].
 */
static void
a4xx_get_state_type(uint32_t *dwords, gl_shader_stage *stage,
                    enum state_t *state, enum state_src_t *src)
{
   const uint32_t hdr = dwords[0];

   _get_state_type((hdr >> 18) & 0xf, dwords[1] & 0x3, stage, state);
   *src = _get_state_src(hdr);
}

/**
 * Decode an a6xx load-state header: both the state block id (bits 18..21)
 * and the state type (bits 14..15) come from dwords[0].
 */
static void
a6xx_get_state_type(uint32_t *dwords, gl_shader_stage *stage,
                    enum state_t *state, enum state_src_t *src)
{
   const uint32_t hdr = dwords[0];

   _get_state_type((hdr >> 18) & 0xf, (hdr >> 14) & 0x3, stage, state);
   *src = _get_state_src(hdr);
}

/**
 * Dump num_unit texture sampler descriptors starting at texsamp.
 *
 * The descriptor layout depends on the GPU generation: 2 dwords on
 * a3xx/a4xx, 4 dwords on a5xx/a6xx.  On a6xx bindless state advances by 16
 * dwords per descriptor instead of 4.  For GPU ids outside 300..699 nothing
 * is printed.
 */
static void
dump_tex_samp(uint32_t *texsamp, enum state_src_t src, int num_unit, int level)
{
   const char *domain = NULL;
   int ndwords = 0;
   int stride = 0;

   /* pick the per-generation layout once, up front: */
   if ((options->gpu_id >= 300) && (options->gpu_id < 400)) {
      domain = "A3XX_TEX_SAMP";
      ndwords = stride = 2;
   } else if ((options->gpu_id >= 400) && (options->gpu_id < 500)) {
      domain = "A4XX_TEX_SAMP";
      ndwords = stride = 2;
   } else if ((options->gpu_id >= 500) && (options->gpu_id < 600)) {
      domain = "A5XX_TEX_SAMP";
      ndwords = stride = 4;
   } else if ((options->gpu_id >= 600) && (options->gpu_id < 700)) {
      domain = "A6XX_TEX_SAMP";
      ndwords = 4;
      stride = (src == STATE_SRC_BINDLESS) ? 16 : 4;
   }

   for (int unit = 0; unit < num_unit; unit++) {
      /* work-around to reduce noise for opencl blob which always
       * writes the max # regardless of # of textures used
       */
      if ((num_unit == 16) && (texsamp[0] == 0) && (texsamp[1] == 0))
         break;

      if (!domain)
         continue;

      dump_domain(texsamp, ndwords, level + 2, domain);
      dump_hex(texsamp, ndwords, level + 1);
      texsamp += stride;
   }
}

/**
 * Dump num_unit texture constant descriptors starting at texconst.
 *
 * Descriptor size is 4 dwords on a3xx, 8 on a4xx, 12 on a5xx and 16 on
 * a6xx.  When options->dump_textures is set, the buffer the descriptor
 * points at is dumped as well (a4xx and later).  For GPU ids outside
 * 300..699 nothing is printed.
 */
static void
dump_tex_const(uint32_t *texconst, int num_unit, int level)
{
   for (int unit = 0; unit < num_unit; unit++) {
      /* work-around to reduce noise for opencl blob which always
       * writes the max # regardless of # of textures used
       */
      const bool first_four_zero = (texconst[0] == 0) && (texconst[1] == 0) &&
                                   (texconst[2] == 0) && (texconst[3] == 0);
      if ((num_unit == 16) && first_four_zero)
         break;

      if ((options->gpu_id >= 300) && (options->gpu_id < 400)) {
         dump_domain(texconst, 4, level + 2, "A3XX_TEX_CONST");
         dump_hex(texconst, 4, level + 1);
         texconst += 4;
      } else if ((options->gpu_id >= 400) && (options->gpu_id < 500)) {
         dump_domain(texconst, 8, level + 2, "A4XX_TEX_CONST");
         if (options->dump_textures) {
            /* 32-bit address with the low 5 bits masked off: */
            uint32_t addr = texconst[4] & ~0x1f;
            dump_gpuaddr(addr, level - 2);
         }
         dump_hex(texconst, 8, level + 1);
         texconst += 8;
      } else if ((options->gpu_id >= 500) && (options->gpu_id < 700)) {
         /* a5xx and a6xx share the address layout and differ only in
          * descriptor size and domain name:
          */
         const bool is_a5xx = options->gpu_id < 600;
         const int ndwords = is_a5xx ? 12 : 16;

         dump_domain(texconst, ndwords, level + 2,
                     is_a5xx ? "A5XX_TEX_CONST" : "A6XX_TEX_CONST");
         if (options->dump_textures) {
            const uint64_t addr_hi = (uint64_t)texconst[5] & 0x1ffff;
            const uint64_t addr = (addr_hi << 32) | texconst[4];
            dump_gpuaddr_size(addr, level - 2, hostlen(addr) / 4, 3);
         }
         dump_hex(texconst, ndwords, level + 1);
         texconst += ndwords;
      }
   }
}

/**
 * Decode a load-state packet: work out which shader stage and kind of state
 * it carries, locate the state contents (inline in the packet, at an
 * indirect GPU address, or relative to a bindless base register) and dump
 * them in a format appropriate for the state kind.
 *
 * @param dwords      packet payload
 * @param sizedwords  payload size in dwords (not consulted by this function)
 * @param level       indentation level for the output
 */
static void
cp_load_state(uint32_t *dwords, uint32_t sizedwords, int level)
{
   gl_shader_stage stage;
   enum state_t state;
   enum state_src_t src;
   /* unit count lives in bits 22..30 of the first dword: */
   uint32_t num_unit = (dwords[0] >> 22) & 0x1ff;
   uint64_t ext_src_addr;
   void *contents;
   int i;

   /* when output is quiet we only keep going if a script is in use: */
   if (quiet(2) && !options->script)
      return;

   /* the header layout differs between generations: */
   if (options->gpu_id >= 600)
      a6xx_get_state_type(dwords, &stage, &state, &src);
   else if (options->gpu_id >= 400)
      a4xx_get_state_type(dwords, &stage, &state, &src);
   else
      a3xx_get_state_type(dwords, &stage, &state, &src);

   /* resolve where the state contents live; 0 means "inline in packet": */
   switch (src) {
   case STATE_SRC_DIRECT:
      ext_src_addr = 0;
      break;
   case STATE_SRC_INDIRECT:
      /* address in dwords[1] (low 2 bits masked off), plus the upper
       * 32 bits from dwords[2] on 64-bit GPUs:
       */
      if (is_64b()) {
         ext_src_addr = dwords[1] & 0xfffffffc;
         ext_src_addr |= ((uint64_t)dwords[2]) << 32;
      } else {
         ext_src_addr = dwords[1] & 0xfffffffc;
      }

      break;
   case STATE_SRC_BINDLESS: {
      const unsigned base_reg = stage == MESA_SHADER_COMPUTE
                                   ? regbase("HLSQ_CS_BINDLESS_BASE[0].ADDR")
                                   : regbase("HLSQ_BINDLESS_BASE[0].ADDR");

      /* the top 4 bits of dwords[1] select which base register to read
       * (two registers per base on 64-bit GPUs):
       */
      if (is_64b()) {
         const unsigned reg = base_reg + (dwords[1] >> 28) * 2;
         ext_src_addr = reg_val(reg) & 0xfffffffc;
         ext_src_addr |= ((uint64_t)reg_val(reg + 1)) << 32;
      } else {
         const unsigned reg = base_reg + (dwords[1] >> 28);
         ext_src_addr = reg_val(reg) & 0xfffffffc;
      }

      /* low 24 bits of dwords[1] are a dword offset from that base: */
      ext_src_addr += 4 * (dwords[1] & 0xffffff);
      break;
   }
   }

   /* NOTE(review): a zero address doubles as the "direct" marker here, so
    * an indirect/bindless source that resolves to address 0 is decoded
    * from the packet payload instead -- confirm this is intended.
    */
   if (ext_src_addr)
      contents = hostptr(ext_src_addr);
   else
      contents = is_64b() ? dwords + 3 : dwords + 2;

   /* external buffer could not be mapped to a host pointer: */
   if (!contents)
      return;

   switch (state) {
   case SHADER_PROG: {
      const char *ext = NULL;

      if (quiet(2))
         return;

      if (options->gpu_id >= 400)
         num_unit *= 16;
      else if (options->gpu_id >= 300)
         num_unit *= 4;

      /* shaders:
       *
       * note: num_unit seems to be # of instruction groups, where
       * an instruction group has 4 64bit instructions.
       */
      if (stage == MESA_SHADER_VERTEX) {
         ext = "vo3";
      } else if (stage == MESA_SHADER_GEOMETRY) {
         ext = "go3";
      } else if (stage == MESA_SHADER_COMPUTE) {
         ext = "co3";
      } else if (stage == MESA_SHADER_FRAGMENT) {
         ext = "fo3";
      }

      /* (contents is always non-NULL here, see the early return above) */
      if (contents)
         try_disasm_a3xx(contents, num_unit * 2, level + 2, stdout,
                         options->gpu_id);

      /* dump raw shader: */
      if (ext)
         dump_shader(ext, contents, num_unit * 2 * 4);

      break;
   }
   case SHADER_CONST: {
      if (quiet(2))
         return;

      /* uniforms/consts:
       *
       * note: num_unit seems to be # of pairs of dwords??
       */

      if (options->gpu_id >= 400)
         num_unit *= 2;

      /* same data shown twice: as floats, then as raw hex */
      dump_float(contents, num_unit * 2, level + 1);
      dump_hex(contents, num_unit * 2, level + 1);

      break;
   }
   case TEX_MIPADDR: {
      uint32_t *addrs = contents;

      if (quiet(2))
         return;

      /* mipmap consts block just appears to be array of num_unit gpu addr's: */
      for (i = 0; i < num_unit; i++) {
         void *ptr = hostptr(addrs[i]);
         printf("%s%2d: %08x\n", levels[level + 1], i, addrs[i]);
         if (options->dump_textures) {
            printf("base=%08x\n", (uint32_t)gpubaseaddr(addrs[i]));
            /* NOTE(review): ptr is not NULL-checked; presumably hostlen()
             * returns 0 for unmapped addresses -- confirm.
             */
            dump_hex(ptr, hostlen(addrs[i]) / 4, level + 1);
         }
      }
      break;
   }
   case TEX_SAMP: {
      dump_tex_samp(contents, src, num_unit, level);
      break;
   }
   case TEX_CONST: {
      dump_tex_const(contents, num_unit, level);
      break;
   }
   case SSBO_0: {
      uint32_t *ssboconst = (uint32_t *)contents;

      /* 4 dwords per unit, except a6xx which uses 16-dword descriptors: */
      for (i = 0; i < num_unit; i++) {
         int sz = 4;
         if (400 <= options->gpu_id && options->gpu_id < 500) {
            dump_domain(ssboconst, 4, level + 2, "A4XX_SSBO_0");
         } else if (500 <= options->gpu_id && options->gpu_id < 600) {
            dump_domain(ssboconst, 4, level + 2, "A5XX_SSBO_0");
         } else if (600 <= options->gpu_id && options->gpu_id < 700) {
            sz = 16;
            dump_domain(ssboconst, 16, level + 2, "A6XX_IBO");
         }
         dump_hex(ssboconst, sz, level + 1);
         ssboconst += sz;
      }
      break;
   }
   case SSBO_1: {
      uint32_t *ssboconst = (uint32_t *)contents;

      /* 2 dwords per unit; only a4xx/a5xx have a decoded domain: */
      for (i = 0; i < num_unit; i++) {
         if (400 <= options->gpu_id && options->gpu_id < 500)
            dump_domain(ssboconst, 2, level + 2, "A4XX_SSBO_1");
         else if (500 <= options->gpu_id && options->gpu_id < 600)
            dump_domain(ssboconst, 2, level + 2, "A5XX_SSBO_1");
         dump_hex(ssboconst, 2, level + 1);
         ssboconst += 2;
      }
      break;
   }
   case SSBO_2: {
      uint32_t *ssboconst = (uint32_t *)contents;

      for (i = 0; i < num_unit; i++) {
         /* TODO a4xx and a5xx might be same: */
         if ((500 <= options->gpu_id) && (options->gpu_id < 600)) {
            dump_domain(ssboconst, 2, level + 2, "A5XX_SSBO_2");
            dump_hex(ssboconst, 2, level + 1);
         }
         /* each unit is a 2-dword gpu address (17 bits in the high dword): */
         if (options->dump_textures) {
            uint64_t addr =
               (((uint64_t)ssboconst[1] & 0x1ffff) << 32) | ssboconst[0];
            dump_gpuaddr_size(addr, level - 2, hostlen(addr) / 4, 3);
         }
         ssboconst += 2;
      }
      break;
   }
   case UBO: {
      uint32_t *uboconst = (uint32_t *)contents;

      for (i = 0; i < num_unit; i++) {
         // TODO probably similar on a4xx..
         if (500 <= options->gpu_id && options->gpu_id < 600)
            dump_domain(uboconst, 2, level + 2, "A5XX_UBO");
         else if (600 <= options->gpu_id && options->gpu_id < 700)
            dump_domain(uboconst, 2, level + 2, "A6XX_UBO");
         dump_hex(uboconst, 2, level + 1);
         /* bindless descriptors are spaced 16 dwords apart: */
         uboconst += src == STATE_SRC_BINDLESS ? 16 : 2;
      }
      break;
   }
   case UNKNOWN_DWORDS: {
      if (quiet(2))
         return;
      dump_hex(contents, num_unit, level + 1);
      break;
   }
   case UNKNOWN_2DWORDS: {
      if (quiet(2))
         return;
      dump_hex(contents, num_unit * 2, level + 1);
      break;
   }
   case UNKNOWN_4DWORDS: {
      if (quiet(2))
         return;
      dump_hex(contents, num_unit * 4, level + 1);
      break;
   }
   default:
      /* state not found in the lookup tables: plain hexdump */
      if (quiet(2))
         return;
      /* hmm.. */
      dump_hex(contents, num_unit, level + 1);
      break;
   }
}

/**
 * Record the current bin rectangle.  dwords[1] packs x1 (low 16 bits) and
 * y1 (high 16 bits); dwords[2] packs x2/y2 the same way.
 */
static void
cp_set_bin(uint32_t *dwords, uint32_t sizedwords, int level)
{
   const uint32_t xy1 = dwords[1];
   const uint32_t xy2 = dwords[2];

   (void)sizedwords;
   (void)level;

   bin_x1 = xy1 & 0xffff;
   bin_y1 = xy1 >> 16;
   bin_x2 = xy2 & 0xffff;
   bin_y2 = xy2 >> 16;
}

/**
 * Pretty-print an a2xx texture constant (clamp modes, filters, swizzle,
 * base/mip addresses, size, pitch and format).
 *
 * @param dwords      texture constant payload; dwords[0..3] and dwords[5]
 *                    are decoded
 * @param sizedwords  payload size in dwords (not consulted here)
 * @param val         constant offset, printed in the header line
 * @param level       indentation level for the output
 */
static void
dump_a2xx_tex_const(uint32_t *dwords, uint32_t sizedwords, uint32_t val,
                    int level)
{
   uint32_t w, h, p;
   uint32_t gpuaddr, flags, mip_gpuaddr, mip_flags;
   uint32_t min, mag, swiz, clamp_x, clamp_y, clamp_z;
   /* Both name tables are indexed by 2-bit fields, so they need four
    * entries; with only three, a field value of 3 read past the end of
    * the array and passed a garbage pointer to printf("%s").
    */
   static const char *const filter[4] = {
      "point",
      "bilinear",
      "bicubic",
      "<unknown>",
   };
   static const char *const clamp[4] = {
      "wrap",
      "mirror",
      "clamp-last-texel",
      "<unknown>",
   };
   static const char swiznames[] = "xyzw01??";

   (void)sizedwords;

   /* see sys2gmem_tex_const[] in adreno_a2xxx.c */

   /* Texture, FormatXYZW=Unsigned, ClampXYZ=Wrap/Repeat,
    * RFMode=ZeroClamp-1, Dim=1:2d, pitch
    */
   p = (dwords[0] >> 22) << 5;
   clamp_x = (dwords[0] >> 10) & 0x3;
   clamp_y = (dwords[0] >> 13) & 0x3;
   clamp_z = (dwords[0] >> 16) & 0x3;

   /* Format=6:8888_WZYX, EndianSwap=0:None, ReqSize=0:256bit, DimHi=0,
    * NearestClamp=1:OGL Mode
    */
   parse_dword_addr(dwords[1], &gpuaddr, &flags, 0xfff);

   /* Width, Height, EndianSwap=0:None */
   w = (dwords[2] & 0x1fff) + 1;
   h = ((dwords[2] >> 13) & 0x1fff) + 1;

   /* NumFormat=0:RF, DstSelXYZW=XYZW, ExpAdj=0, MagFilt=MinFilt=0:Point,
    * Mip=2:BaseMap
    */
   mag = (dwords[3] >> 19) & 0x3;
   min = (dwords[3] >> 21) & 0x3;
   swiz = (dwords[3] >> 1) & 0xfff;

   /* VolMag=VolMin=0:Point, MinMipLvl=0, MaxMipLvl=1, LodBiasH=V=0,
    * Dim3d=0
    */
   // XXX dwords[4] is not decoded yet

   /* BorderColor=0:ABGRBlack, ForceBC=0:disable, TriJuice=0, Aniso=0,
    * Dim=1:2d, MipPacking=0
    */
   parse_dword_addr(dwords[5], &mip_gpuaddr, &mip_flags, 0xfff);

   printf("%sset texture const %04x\n", levels[level], val);
   printf("%sclamp x/y/z: %s/%s/%s\n", levels[level + 1], clamp[clamp_x],
          clamp[clamp_y], clamp[clamp_z]);
   printf("%sfilter min/mag: %s/%s\n", levels[level + 1], filter[min],
          filter[mag]);
   /* swizzle is four 3-bit selectors, each an index into swiznames: */
   printf("%sswizzle: %c%c%c%c\n", levels[level + 1],
          swiznames[(swiz >> 0) & 0x7], swiznames[(swiz >> 3) & 0x7],
          swiznames[(swiz >> 6) & 0x7], swiznames[(swiz >> 9) & 0x7]);
   printf("%saddr=%08x (flags=%03x), size=%dx%d, pitch=%d, format=%s\n",
          levels[level + 1], gpuaddr, flags, w, h, p,
          rnn_enumname(rnn, "a2xx_sq_surfaceformat", flags & 0xf));
   printf("%smipaddr=%08x (flags=%03x)\n", levels[level + 1], mip_gpuaddr,
          mip_flags);
}

/**
 * Dump a2xx shader constants that reference buffers.
 *
 * The payload is scanned dword by dword.  Each dword is split into a GPU
 * address and 4 flag bits; when the address maps to a host buffer the
 * following dword is taken as its size in bytes and the buffer is dumped
 * (capped at 64 dwords) both as hex and as floats.
 *
 * @param dwords      constant payload
 * @param sizedwords  payload size in dwords
 * @param val         constant offset, printed in the header line
 * @param level       indentation level for the output
 */
static void
dump_a2xx_shader_const(uint32_t *dwords, uint32_t sizedwords, uint32_t val,
                       int level)
{
   uint32_t i = 0;

   printf("%sset shader const %04x\n", levels[level], val);
   while (i < sizedwords) {
      uint32_t gpuaddr, flags;
      parse_dword_addr(dwords[i++], &gpuaddr, &flags, 0xf);
      void *addr = hostptr(gpuaddr);
      if (!addr)
         continue;

      /* The size dword must still be inside the payload; previously a
       * mappable address in the very last dword caused a read past the
       * end of the packet.
       */
      if (i >= sizedwords)
         break;

      const char *fmt =
         rnn_enumname(rnn, "a2xx_sq_surfaceformat", flags & 0xf);
      uint32_t size = dwords[i++];
      printf("%saddr=%08x, size=%d, format=%s\n", levels[level + 1], gpuaddr,
             size, fmt);
      // TODO maybe dump these as bytes instead of dwords?
      size = (size + 3) / 4; // for now convert to dwords

      /* cap the dump at 64 dwords and mark truncated output with "...": */
      const uint32_t shown = min(size, 64);
      dump_hex(addr, shown, level + 1);
      if (size > shown)
         printf("%s\t\t...\n", levels[level + 1]);
      dump_float(addr, shown, level + 1);
      if (size > shown)
         printf("%s\t\t...\n", levels[level + 1]);
   }
}

/**
 * Decode a set-constant packet.  Bits 16..19 of dwords[0] select the
 * constant kind and the low 16 bits give the constant offset; the values
 * follow from dwords[1] onwards.
 *
 * Kinds handled: 0 float constants, 1 texture/shader constants,
 * 2 bool constants, 3 loop constants, 4 registers (offset biased by
 * 0x2000).  Other kinds are silently ignored.
 */
static void
cp_set_const(uint32_t *dwords, uint32_t sizedwords, int level)
{
   const uint32_t hdr = dwords[0];
   uint32_t *payload = dwords + 1;
   const uint32_t payload_dwords = sizedwords - 1;
   uint32_t val = hdr & 0xffff;

   switch ((hdr >> 16) & 0xf) {
   case 0x0:
      dump_float((float *)payload, payload_dwords, level + 1);
      break;
   case 0x1:
      /* need to figure out how const space is partitioned between
       * attributes, textures, etc..
       */
      if (val >= 0x78)
         dump_a2xx_shader_const(payload, payload_dwords, val, level);
      else
         dump_a2xx_tex_const(payload, payload_dwords, val, level);
      break;
   case 0x2:
      printf("%sset bool const %04x\n", levels[level], val);
      break;
   case 0x3:
      printf("%sset loop const %04x\n", levels[level], val);
      break;
   case 0x4: {
      val += 0x2000;

      /* plain register write unless the top bit of the header is set: */
      if (!(hdr & 0x80000000)) {
         dump_registers(val, payload, payload_dwords, level + 1);
         break;
      }

      /* otherwise the written value is dwords[2] plus the last known
       * value of the register named by dwords[1]:
       */
      uint32_t srcreg = dwords[1];
      uint32_t dstval = dwords[2];

      /* TODO: not sure what happens w/ payload != 2.. */
      assert(sizedwords == 3);
      assert(srcreg < ARRAY_SIZE(type0_reg_vals));

      /* note: rnn_regname uses a static buf so we can't do
       * two regname() calls for one printf..
       */
      printf("%s%s = %08x + ", levels[level], regname(val, 1), dstval);
      printf("%s (%08x)\n", regname(srcreg, 1), type0_reg_vals[srcreg]);

      dstval += type0_reg_vals[srcreg];

      dump_registers(val, &dstval, 1, level + 1);
      break;
   }
   }
}

static void dump_register_summary(int level);

/**
 * Decode an event-write packet: print the event name and, for "BLIT"
 * events on GPU ids above 500, run the query/summary path as for a draw.
 */
static void
cp_event_write(uint32_t *dwords, uint32_t sizedwords, int level)
{
   const char *name = rnn_enumname(rnn, "vgt_event_type", dwords[0]);
   printl(2, "%sevent %s\n", levels[level], name);

   if (!name || !(options->gpu_id > 500))
      return;

   /* only blit events are treated like draws: */
   if (strcmp(name, "BLIT") != 0)
      return;

   char eventname[64];
   snprintf(eventname, sizeof(eventname), "EVENT:%s", name);

   do_query(eventname, 0);
   print_mode(level);
   dump_register_summary(level);
}

static void
dump_register_summary(int level)
{
   uint32_t i;
   bool saved_summary = summary;
   summary = false;

   in_summary = true;

   /* dump current state of registers: */
   printl(2, "%sdraw[%i] register values\n", levels[level], draw_count);
   for (i = 0; i < regcnt(); i++) {
      uint32_t regbase = i;
      uint32_t lastval = reg_val(regbase);
      /* skip registers that haven't been updated since last draw/blit: */
      if (!(options->allregs || reg_rewritten(regbase)))
         continue;
      if (!reg_written(regbase))
         continue;
      if (lastval != lastvals[regbase]) {
         printl(2, "!");
         lastvals[regbase] = lastval;
      } else {
         printl(2, " ");
      }
      if (reg_rewritten(regbase)) {
         printl(2, "+");
      } else {
         printl(2, " ");
      }
      printl(2, "\t%08x", lastval);
      if (!quiet(2)) {
         dump_register(regbase, lastval, level);
      }
   }

   clear_rewritten();

   in_summary = false;

   draw_count++;
   summary = saved_summary;
}

/**
 * Shared front end for the draw-index packets: run the per-draw query,
 * print the draw header and update the vertex and per-IB draw counters.
 *
 * dwords[1] carries the primitive type (bits 0..4) and source select
 * (bits 6..7); dwords[2] is the index count.
 *
 * @return the number of indices of the draw
 */
static uint32_t
draw_indx_common(uint32_t *dwords, int level)
{
   const uint32_t initiator = dwords[1];
   const uint32_t num_indices = dwords[2];
   const uint32_t prim_type = initiator & 0x1f;
   const uint32_t source_select = (initiator >> 6) & 0x3;
   const char *primtype = rnn_enumname(rnn, "pc_di_primtype", prim_type);

   /* the query must run before anything is printed for the draw: */
   do_query(primtype, num_indices);

   printl(2, "%sdraw:          %d\n", levels[level], draws[ib]);
   printl(2, "%sprim_type:     %s (%d)\n", levels[level], primtype, prim_type);
   printl(2, "%ssource_select: %s (%d)\n", levels[level],
          rnn_enumname(rnn, "pc_di_src_sel", source_select), source_select);
   printl(2, "%snum_indices:   %d\n", levels[level], num_indices);

   draws[ib]++;
   vertices += num_indices;

   return num_indices;
}

/* Index size encoding used by the draw-index packets.  Several names
 * share the value 0.
 */
enum pc_di_index_size {
   /* value 0 (three aliases): */
   INDEX_SIZE_IGN = 0,
   INDEX_SIZE_INVALID = 0,
   INDEX_SIZE_16_BIT = 0,

   INDEX_SIZE_32_BIT = 1,
   INDEX_SIZE_8_BIT = 2,
};

/**
 * Decode a draw-index packet.  A 5-dword packet also carries an index
 * buffer (gpu address in dwords[3], byte size in dwords[4]) which is
 * printed when it can be mapped and output is not quiet.  Draws with zero
 * indices do not get a register summary.  Sets needs_wfi.
 */
static void
cp_draw_indx(uint32_t *dwords, uint32_t sizedwords, int level)
{
   const uint32_t num_indices = draw_indx_common(dwords, level);

   assert(!is_64b());

   /* if we have an index buffer, dump that: */
   if (sizedwords == 5) {
      void *ptr = hostptr(dwords[3]);

      printl(2, "%sgpuaddr:       %08x\n", levels[level], dwords[3]);
      printl(2, "%sidx_size:      %d\n", levels[level], dwords[4]);

      if (ptr && !quiet(2)) {
         /* index size is split across bit 11 and bit 13 of dwords[1]: */
         const enum pc_di_index_size size =
            ((dwords[1] >> 11) & 1) | ((dwords[1] >> 12) & 2);
         int i;

         printf("%sidxs:         ", levels[level]);
         switch (size) {
         case INDEX_SIZE_8_BIT: {
            uint8_t *idx = ptr;
            for (i = 0; i < dwords[4]; i++)
               printf(" %u", idx[i]);
            break;
         }
         case INDEX_SIZE_16_BIT: {
            uint16_t *idx = ptr;
            for (i = 0; i < dwords[4] / 2; i++)
               printf(" %u", idx[i]);
            break;
         }
         case INDEX_SIZE_32_BIT: {
            uint32_t *idx = ptr;
            for (i = 0; i < dwords[4] / 4; i++)
               printf(" %u", idx[i]);
            break;
         }
         default:
            break;
         }
         printf("\n");
         dump_hex(ptr, dwords[4] / 4, level + 1);
      }
   }

   /* don't bother dumping registers for the dummy draw_indx's.. */
   if (num_indices > 0)
      dump_register_summary(level);

   needs_wfi = true;
}

/**
 * Decode a draw-index packet whose index buffer is embedded in the packet
 * itself, starting at dwords[3].  The indices are printed and hexdumped
 * unless output is quiet; draws with zero indices do not get a register
 * summary.
 */
static void
cp_draw_indx_2(uint32_t *dwords, uint32_t sizedwords, int level)
{
   const uint32_t num_indices = draw_indx_common(dwords, level);
   /* index size is split across bit 11 and bit 13 of dwords[1]: */
   const enum pc_di_index_size size =
      ((dwords[1] >> 11) & 1) | ((dwords[1] >> 12) & 2);
   void *inline_idx = &dwords[3];

   assert(!is_64b());

   /* CP_DRAW_INDX_2 has embedded/inline idx buffer: */
   if (!quiet(2)) {
      int sz = 0; /* size of the index data in bytes */
      int i;

      printf("%sidxs:         ", levels[level]);
      switch (size) {
      case INDEX_SIZE_8_BIT: {
         uint8_t *idx = inline_idx;
         for (i = 0; i < num_indices; i++)
            printf(" %u", idx[i]);
         sz = num_indices;
         break;
      }
      case INDEX_SIZE_16_BIT: {
         uint16_t *idx = inline_idx;
         for (i = 0; i < num_indices; i++)
            printf(" %u", idx[i]);
         sz = num_indices * 2;
         break;
      }
      case INDEX_SIZE_32_BIT: {
         uint32_t *idx = inline_idx;
         for (i = 0; i < num_indices; i++)
            printf(" %u", idx[i]);
         sz = num_indices * 4;
         break;
      }
      default:
         break;
      }
      printf("\n");
      dump_hex(inline_idx, sz / 4, level + 1);
   }

   /* don't bother dumping registers for the dummy draw_indx's.. */
   if (num_indices > 0)
      dump_register_summary(level);
}

/**
 * Decode a draw-index-offset packet: primitive type in the low 5 bits of
 * dwords[0], index count in dwords[2].  Runs the per-draw query, prints
 * the mode and, for non-empty draws, the register summary.
 */
static void
cp_draw_indx_offset(uint32_t *dwords, uint32_t sizedwords, int level)
{
   const uint32_t prim_type = dwords[0] & 0x1f;
   const uint32_t num_indices = dwords[2];
   const char *primtype = rnn_enumname(rnn, "pc_di_primtype", prim_type);

   do_query(primtype, num_indices);
   print_mode(level);

   /* don't bother dumping registers for the dummy draw_indx's.. */
   if (num_indices == 0)
      return;

   dump_register_summary(level);
}

/**
 * Decode an indexed indirect draw: runs the per-draw query, dumps 0x10
 * bytes at each of the two GPU addresses carried by the packet and prints
 * the register summary.
 *
 * On 64-bit GPUs the addresses are dwords[1..2] and dwords[4..5] (17 valid
 * bits in the high dword); otherwise they are dwords[1] and dwords[3].
 */
static void
cp_draw_indx_indirect(uint32_t *dwords, uint32_t sizedwords, int level)
{
   const uint32_t prim_type = dwords[0] & 0x1f;
   const char *primtype = rnn_enumname(rnn, "pc_di_primtype", prim_type);

   do_query(primtype, 0);
   print_mode(level);

   const uint64_t first_addr =
      is_64b() ? ((((uint64_t)dwords[2] & 0x1ffff) << 32) | dwords[1])
               : (uint64_t)dwords[1];
   dump_gpuaddr_size(first_addr, level, 0x10, 2);

   const uint64_t second_addr =
      is_64b() ? ((((uint64_t)dwords[5] & 0x1ffff) << 32) | dwords[4])
               : (uint64_t)dwords[3];
   dump_gpuaddr_size(second_addr, level, 0x10, 2);

   dump_register_summary(level);
}

/**
 * Decode a non-indexed indirect draw: runs the per-draw query, dumps 0x10
 * bytes at the 64-bit GPU address in dwords[1..2] (17 valid bits in the
 * high dword) and prints the register summary.
 */
static void
cp_draw_indirect(uint32_t *dwords, uint32_t sizedwords, int level)
{
   const uint32_t prim_type = dwords[0] & 0x1f;
   const uint64_t addr_hi = (uint64_t)dwords[2] & 0x1ffff;
   const uint64_t addr_lo = dwords[1];

   do_query(rnn_enumname(rnn, "pc_di_primtype", prim_type), 0);
   print_mode(level);

   dump_gpuaddr_size((addr_hi << 32) | addr_lo, level, 0x10, 2);

   dump_register_summary(level);
}

/**
 * Decode a multi-draw indirect packet.
 *
 * The dword positions of the optional INDIRECT_COUNT, INDIRECT and STRIDE
 * fields are looked up in the "CP_DRAW_INDIRECT_MULTI" domain; a position
 * of 0 is treated as "field not present".  When a count buffer is present
 * the draw count from dwords[2] is clamped by it (or by a fixed cap when
 * the buffer is missing or looks like garbage).  For each draw, 0x10 bytes
 * of the indirect buffer are dumped, then the register summary is printed.
 *
 * @param dwords      packet payload
 * @param sizedwords  payload size in dwords (not consulted here)
 * @param level       indentation level for the output
 */
static void
cp_draw_indirect_multi(uint32_t *dwords, uint32_t sizedwords, int level)
{
   uint32_t prim_type = dwords[0] & 0x1f;
   uint32_t count = dwords[2];

   do_query(rnn_enumname(rnn, "pc_di_primtype", prim_type), 0);
   print_mode(level);

   struct rnndomain *domain = rnn_finddomain(rnn->db, "CP_DRAW_INDIRECT_MULTI");
   uint32_t count_dword = rnndec_decodereg(rnn->vc, domain, "INDIRECT_COUNT");
   uint32_t addr_dword = rnndec_decodereg(rnn->vc, domain, "INDIRECT");
   /* a dword index like its two siblings, so it gets the same 32-bit type: */
   uint32_t stride_dword = rnndec_decodereg(rnn->vc, domain, "STRIDE");

   if (count_dword) {
      uint64_t count_addr =
         ((uint64_t)dwords[count_dword + 1] << 32) | dwords[count_dword];
      uint32_t *buf = hostptr(count_addr);

      /* Don't print more draws than this if we don't know the indirect
       * count. It's possible the user will give ~0 or some other large
       * value, expecting the GPU to fill in the draw count, and we don't
       * want to print a gazillion draws in that case:
       */
      const uint32_t max_draw_count = 0x100;

      /* Assume the indirect count is garbage if it's larger than this
       * (quite large) value or 0. Hopefully this catches most cases.
       */
      const uint32_t max_indirect_draw_count = 0x10000;

      if (buf) {
         printf("%sindirect count: %u\n", levels[level], *buf);
         if (*buf == 0 || *buf > max_indirect_draw_count) {
            /* garbage value */
            count = min(count, max_draw_count);
         } else {
            /* not garbage */
            count = min(count, *buf);
         }
      } else {
         count = min(count, max_draw_count);
      }
   }

   if (addr_dword && stride_dword) {
      uint64_t addr =
         ((uint64_t)dwords[addr_dword + 1] << 32) | dwords[addr_dword];
      uint32_t stride = dwords[stride_dword];

      for (unsigned i = 0; i < count; i++, addr += stride) {
         /* i is unsigned, so print it with %u rather than %d: */
         printf("%sdraw %u:\n", levels[level], i);
         dump_gpuaddr_size(addr, level, 0x10, 2);
      }
   }

   dump_register_summary(level);
}

/**
 * Handle a compute dispatch: run the per-draw query with the pseudo
 * primitive type "COMPUTE" and a count of 1, then print the register
 * summary.
 */
static void
cp_run_cl(uint32_t *dwords, uint32_t sizedwords, int level)
{
   (void)dwords;
   (void)sizedwords;

   do_query("COMPUTE", 1);
   dump_register_summary(level);
}

/**
 * Print the payload of a NOP packet as a string marker.
 *
 * Nothing is printed when output is quiet at level 3 or marker decoding is
 * disabled.  Otherwise the payload bytes are scanned up to the first NUL
 * (or the end of the packet), the bytes accepted by isascii() are printed,
 * and a newline follows.
 */
static void
cp_nop(uint32_t *dwords, uint32_t sizedwords, int level)
{
   // blob doesn't use CP_NOP for string_marker but it does
   // use it for things that end up looking like, but aren't
   // ascii chars:
   if (quiet(3) || !options->decode_markers)
      return;

   const char *marker = (void *)dwords;
   const uint32_t nbytes = 4 * sizedwords;

   for (uint32_t pos = 0; pos < nbytes && marker[pos] != '\0'; pos++) {
      if (isascii(marker[pos]))
         putchar(marker[pos]);
   }
   putchar('\n');
}

/**
 * Follow an indirect-buffer packet: decode the target address and size,
 * and recursively dump the commands of the referenced buffer.
 *
 * Buffers already dumped are skipped in 'once' and 'query-compare' modes.
 * If the address cannot be mapped to a host pointer a message is written
 * to stderr.
 */
static void
cp_indirect(uint32_t *dwords, uint32_t sizedwords, int level)
{
   const bool wide = is_64b();
   uint64_t ibaddr = dwords[0];
   uint32_t ibsize;

   if (wide) {
      /* a5xx+.. high 32b of gpu addr, then size: */
      ibaddr |= ((uint64_t)dwords[1]) << 32;
      ibsize = dwords[2];
   } else {
      ibsize = dwords[1];
   }

   if (!quiet(3)) {
      if (wide)
         printf("%sibaddr:%016" PRIx64 "\n", levels[level], ibaddr);
      else
         printf("%sibaddr:%08x\n", levels[level], (uint32_t)ibaddr);
      printf("%sibsize:%08x\n", levels[level], ibsize);
   }

   if (options->once && has_dumped(ibaddr, enable_mask))
      return;

   /* 'query-compare' mode implies 'once' mode, although we need only to
    * process the cmdstream for *any* enable_mask mode, since we are
    * comparing binning vs draw reg values at the same time, ie. it is
    * not useful to process the same draw in both binning and draw pass.
    */
   if (options->query_compare && has_dumped(ibaddr, MODE_ALL))
      return;

   /* map gpuaddr back to hostptr: */
   uint32_t *ptr = hostptr(ibaddr);
   if (!ptr) {
      fprintf(stderr, "could not find: %016" PRIx64 " (%d)\n", ibaddr, ibsize);
      return;
   }

   /* If the GPU hung within the target IB, the trigger point will be
    * just after the current CP_INDIRECT_BUFFER.  Because the IB is
    * executed but never returns.  Account for this by checking if
    * the IB returned:
    */
   highlight_gpuaddr(gpuaddr(&dwords[wide ? 3 : 2]));

   /* descend one IB level for the duration of the nested dump: */
   ib++;
   ibs[ib].base = ibaddr;
   ibs[ib].size = ibsize;

   dump_commands(ptr, ibsize, level);
   ib--;
}

/*
 * CP_START_BIN: decode the per-bin cmdstream referenced by the packet, once
 * for every bin.
 *
 * Payload as read here: dwords[0] is the number of bins (loop count),
 * dwords[1]/dwords[2] are the low/high halves of the gpu address of the
 * first bin's cmdstream, and dwords[3] is the size of one bin's cmdstream
 * in dwords (it is handed to dump_commands() as a dword count).  The
 * per-bin cmdstreams are walked back to back starting at that address.
 * sizedwords is not used; level selects the output indentation.
 */
static void
cp_start_bin(uint32_t *dwords, uint32_t sizedwords, int level)
{
   uint64_t ibaddr;
   uint32_t ibsize;
   uint32_t loopcount;
   uint32_t *ptr = NULL;

   loopcount = dwords[0];
   ibaddr = dwords[1];
   ibaddr |= ((uint64_t)dwords[2]) << 32;
   ibsize = dwords[3];

   /* map gpuaddr back to hostptr: */
   ptr = hostptr(ibaddr);

   if (ptr) {
      /* If the GPU hung within the target IB, the trigger point will be
       * just after the current CP_START_BIN.  Because the IB is
       * executed but never returns.  Account for this by checking if
       * the IB returned:
       */
      highlight_gpuaddr(gpuaddr(&dwords[5]));

      /* TODO: we should duplicate the body of the loop after each bin, so
       * that draws get the correct state. We should also figure out if there
       * are any registers that can tell us what bin we're in when we hang so
       * that crashdec points to the right place.
       */
      ib++;
      for (uint32_t i = 0; i < loopcount; i++) {
         ibs[ib].base = ibaddr;
         ibs[ib].size = ibsize;
         printf("%sbin %u\n", levels[level], i);
         dump_commands(ptr, ibsize, level);

         /* ptr is a uint32_t pointer, so it moves forward by ibsize
          * dwords; the gpu address has to move by the same distance
          * expressed in bytes, otherwise ibs[ib].base no longer matches
          * the buffer being decoded for every bin after the first.
          */
         ibaddr += (uint64_t)ibsize * sizeof(uint32_t);
         ptr += ibsize;
      }
      ib--;
   } else {
      fprintf(stderr, "could not find: %016" PRIx64 " (%d)\n", ibaddr, ibsize);
   }
}

/* CP_WAIT_FOR_IDLE: the payload is not inspected; seeing the packet simply
 * clears the needs_wfi flag.
 */
static void
cp_wfi(uint32_t *dwords, uint32_t sizedwords, int level)
{
   (void)dwords;
   (void)sizedwords;
   (void)level;

   needs_wfi = false;
}

/*
 * CP_MEM_WRITE: print the destination gpu address followed by the data
 * being written.  Nothing is printed when output is quieted at level 2.
 *
 * With 64b addressing the address occupies two dwords and the data is
 * dumped as hex; if the first data dword looks like a type4/type7 packet
 * header the data is additionally decoded as cmdstream.  Otherwise the
 * address is a single dword and the data is dumped as floats.
 */
static void
cp_mem_write(uint32_t *dwords, uint32_t sizedwords, int level)
{
   if (quiet(2))
      return;

   if (!is_64b()) {
      uint32_t dst32 = dwords[0];
      printf("%sgpuaddr:%08x\n", levels[level], dst32);
      dump_float((float *)&dwords[1], sizedwords - 1, level + 1);
      return;
   }

   uint64_t dst64 = dwords[0] | (((uint64_t)dwords[1]) << 32);
   uint32_t *payload = &dwords[2];
   uint32_t payload_dwords = sizedwords - 2;

   printf("%sgpuaddr:%016" PRIx64 "\n", levels[level], dst64);
   dump_hex(payload, payload_dwords, level + 1);

   /* the written data may itself be cmdstream: */
   if (pkt_is_type4(payload[0]) || pkt_is_type7(payload[0]))
      dump_commands(payload, payload_dwords, level + 1);
}

/*
 * CP_REG_RMW: read-modify-write of a register.
 *
 * The low 16 bits of dwords[0] select the register, dwords[1] is the AND
 * mask and dwords[2] the OR mask.  The tracked register value is updated
 * to (old & and) | or, and a note is printed if needs_wfi is set.
 */
static void
cp_rmw(uint32_t *dwords, uint32_t sizedwords, int level)
{
   uint32_t reg = dwords[0] & 0xffff;
   uint32_t and_mask = dwords[1];
   uint32_t or_mask = dwords[2];

   printl(3, "%srmw (%s & 0x%08x) | 0x%08x)\n", levels[level], regname(reg, 1),
          and_mask, or_mask);

   if (needs_wfi) {
      printl(2, "NEEDS WFI: rmw (%s & 0x%08x) | 0x%08x)\n", regname(reg, 1),
             and_mask, or_mask);
   }

   uint32_t updated = (reg_val(reg) & and_mask) | or_mask;
   reg_set(reg, updated);
}

/*
 * CP_REG_TO_MEM / CP_MEM_TO_REG (both packets share this handler): print
 * the base register and, unless quieted at level 2, the gpu address and
 * the memory contents it refers to.
 *
 * dwords[0] carries the register in its low 16 bits and a dword count in
 * bits 28:19; dwords[1]/dwords[2] are the low/high halves of the address.
 */
static void
cp_reg_mem(uint32_t *dwords, uint32_t sizedwords, int level)
{
   uint32_t reg = dwords[0] & 0xffff;

   printl(3, "%sbase register: %s\n", levels[level], regname(reg, 1));

   if (quiet(2))
      return;

   uint64_t addr = dwords[1];
   addr |= ((uint64_t)dwords[2]) << 32;
   printf("%sgpuaddr:%016" PRIx64 "\n", levels[level], addr);

   void *mem = hostptr(addr);
   if (!mem)
      return;

   uint32_t ndwords = (dwords[0] >> 19) & 0x3ff;
   dump_hex(mem, ndwords, level + 1);
}

/*
 * One draw-state group as recorded by cp_set_draw_state().  The fields are
 * unpacked from a CP_SET_DRAW_STATE entry (header dword plus address).
 */
struct draw_state {
   /* bits 23:20 of the entry header; on gpu_id >= 600 load_group() skips
    * the group unless this overlaps the current global enable_mask:
    */
   uint16_t enable_mask;
   /* bits 19:16 of the entry header, see FLAG_* below: */
   uint16_t flags;
   /* bits 15:0 of the entry header; used as the dword count when the
    * group's cmdstream is dumped.  Zero means the group is not loaded:
    */
   uint32_t count;
   /* gpu address of the group's cmdstream: */
   uint64_t addr;
};

/* Indexed by group_id, which is a 5-bit field of the entry header. */
struct draw_state state[32];

/* Bits of draw_state::flags.  FLAG_DIRTY is not referenced by the handlers
 * in this part of the file.
 */
#define FLAG_DIRTY              0x1
/* clear the entry for the addressed group: */
#define FLAG_DISABLE            0x2
/* clear the entries of all groups: */
#define FLAG_DISABLE_ALL_GROUPS 0x4
/* load the group right away (and then clear its entry) instead of waiting
 * for the next packet that loads all groups:
 */
#define FLAG_LOAD_IMMED         0x8

/* First payload dword of the most recent CP_SET_MODE packet. */
static int draw_mode;

/* Reset the draw-state entry for group_id to all zeroes; a zero count makes
 * load_group() ignore it.
 */
static void
disable_group(unsigned group_id)
{
   memset(&state[group_id], 0, sizeof(state[group_id]));
}

/* Reset every draw-state entry to all zeroes in one go. */
static void
disable_all_groups(void)
{
   memset(state, 0, sizeof(state));
}

/*
 * Print and decode the cmdstream of a single draw-state group.
 *
 * Groups with a zero count are ignored.  On gpu_id >= 600 the group is
 * additionally skipped when its enable_mask does not overlap the current
 * global enable_mask.  If the group's address cannot be mapped to a host
 * pointer nothing is decoded.
 *
 * group_id indexes state[] and must be in range; level selects the output
 * indentation.
 */
static void
load_group(unsigned group_id, int level)
{
   struct draw_state *ds = &state[group_id];

   if (!ds->count)
      return;

   printl(2, "%sgroup_id: %u\n", levels[level], group_id);
   printl(2, "%scount: %d\n", levels[level], ds->count);
   /* addr is a uint64_t, so it needs PRIx64 rather than %llx: */
   printl(2, "%saddr: %016" PRIx64 "\n", levels[level], ds->addr);
   printl(2, "%sflags: %x\n", levels[level], ds->flags);

   if (options->gpu_id >= 600) {
      printl(2, "%senable_mask: 0x%x\n", levels[level], ds->enable_mask);

      if (!(ds->enable_mask & enable_mask)) {
         printl(2, "%s\tskipped!\n\n", levels[level]);
         return;
      }
   }

   void *ptr = hostptr(ds->addr);
   if (ptr) {
      if (!quiet(2))
         dump_hex(ptr, ds->count, level + 1);

      /* the group's cmdstream is decoded one IB level down: */
      ib++;
      dump_commands(ptr, ds->count, level + 1);
      ib--;
   }
}

/*
 * Load every draw-state group in index order, then (except in
 * 'query-compare' mode) clear them all.
 */
static void
load_all_groups(int level)
{
   /* Re-entrancy guard: decoding a group's cmdstream must never lead back
    * here, and bad things happen if it does, so complain and bail out
    * instead of recursing.
    */
   static bool loading_groups = false;

   if (loading_groups) {
      printf("ERROR: nothing in draw state should trigger recursively loading "
             "groups!\n");
      return;
   }

   loading_groups = true;
   for (unsigned group = 0; group < ARRAY_SIZE(state); group++) {
      load_group(group, level);
   }
   loading_groups = false;

   /* in 'query-compare' mode the groups have to stay around until the
    * query has had a chance to be processed:
    */
   if (options->query_compare)
      return;

   disable_all_groups();
}

static void
cp_set_draw_state(uint32_t *dwords, uint32_t sizedwords, int level)
{
   uint32_t i;

   for (i = 0; i < sizedwords;) {
      struct draw_state *ds;
      uint32_t count = dwords[i] & 0xffff;
      uint32_t group_id = (dwords[i] >> 24) & 0x1f;
      uint32_t enable_mask = (dwords[i] >> 20) & 0xf;
      uint32_t flags = (dwords[i] >> 16) & 0xf;
      uint64_t addr;

      if (is_64b()) {
         addr = dwords[i + 1];
         addr |= ((uint64_t)dwords[i + 2]) << 32;
         i += 3;
      } else {
         addr = dwords[i + 1];
         i += 2;
      }

      if (flags & FLAG_DISABLE_ALL_GROUPS) {
         disable_all_groups();
         continue;
      }

      if (flags & FLAG_DISABLE) {
         disable_group(group_id);
         continue;
      }

      assert(group_id < ARRAY_SIZE(state));
      disable_group(group_id);

      ds = &state[group_id];

      ds->enable_mask = enable_mask;
      ds->flags = flags;
      ds->count = count;
      ds->addr = addr;

      if (flags & FLAG_LOAD_IMMED) {
         load_group(group_id, level);
         disable_group(group_id);
      }
   }
}

/* CP_SET_MODE: remember the first payload dword as the current draw_mode. */
static void
cp_set_mode(uint32_t *dwords, uint32_t sizedwords, int level)
{
   (void)sizedwords;
   (void)level;

   draw_mode = *dwords;
}

/* CP_EXEC_CS: execute compute shader.  The payload is not inspected here;
 * the packet triggers a "compute" query and a register summary.
 */
static void
cp_exec_cs(uint32_t *dwords, uint32_t sizedwords, int level)
{
   (void)dwords;
   (void)sizedwords;

   do_query("compute", 0);
   dump_register_summary(level);
}

/*
 * CP_EXEC_CS_INDIRECT: compute dispatch whose parameters live in memory.
 *
 * The gpu address is taken from dwords[1] (low half) and, with 64b
 * addressing, the low 17 bits of dwords[2] (high half).  The address is
 * printed and the memory it points at is dumped, followed by the same
 * "compute" query and register summary that CP_EXEC_CS produces.
 */
static void
cp_exec_cs_indirect(uint32_t *dwords, uint32_t sizedwords, int level)
{
   uint64_t addr;

   if (is_64b()) {
      addr = (((uint64_t)dwords[2] & 0x1ffff) << 32) | dwords[1];
   } else {
      addr = dwords[1];
   }

   /* addr is a uint64_t, so it needs PRIx64 rather than %llx: */
   printl(3, "%saddr: %016" PRIx64 "\n", levels[level], addr);
   dump_gpuaddr_size(addr, level, 0x10, 2);

   do_query("compute", 0);
   dump_register_summary(level);
}

/*
 * CP_SET_MARKER: track the current render mode.
 *
 * The low 4 bits of dwords[0] are looked up in the "a6xx_marker" enum and
 * the resulting name becomes render_mode.  For the binning, gmem and bypass
 * markers the global enable_mask is switched to the matching mode; any
 * other marker leaves enable_mask untouched.
 */
static void
cp_set_marker(uint32_t *dwords, uint32_t sizedwords, int level)
{
   uint32_t marker = dwords[0] & 0xf;

   render_mode = rnn_enumname(rnn, "a6xx_marker", marker);

   /* NOTE(review): rnn_enumname() presumably returns NULL for a value that
    * is missing from the enum -- confirm against rnnutil.  Handing NULL to
    * strcmp() below would crash, so fall back to the raw value as the mode
    * name and leave enable_mask alone.
    */
   if (!render_mode) {
      static char unknown_marker[16];

      snprintf(unknown_marker, sizeof(unknown_marker), "0x%x", marker);
      render_mode = unknown_marker;
      return;
   }

   if (!strcmp(render_mode, "RM6_BINNING")) {
      enable_mask = MODE_BINNING;
   } else if (!strcmp(render_mode, "RM6_GMEM")) {
      enable_mask = MODE_GMEM;
   } else if (!strcmp(render_mode, "RM6_BYPASS")) {
      enable_mask = MODE_BYPASS;
   }
}

/*
 * CP_SET_RENDER_MODE (a5xx): record the render mode and dump the buffers
 * the packet points at.
 *
 * dwords[0] is looked up in the "render_mode_cmd" enum and stored in
 * render_mode.  A 1-dword payload ends there.  Otherwise dwords[1]/[2]
 * form a gpu address that is dumped and dwords[3] is stored in mode; a
 * 5-dword payload ends there.  The only other accepted size is 8 dwords,
 * where dwords[5] is a length and dwords[6]/[7] a second gpu address whose
 * contents are decoded as cmdstream and dumped as hex (unless quieted at
 * level 2).
 */
static void
cp_set_render_mode(uint32_t *dwords, uint32_t sizedwords, int level)
{
   uint64_t addr;
   uint32_t *ptr, len;

   assert(is_64b());

   /* TODO seems to have two ptrs, 9 dwords total (incl pkt7 hdr)..
    * not sure if this can come in different sizes.
    *
    * First ptr doesn't seem to be cmdstream, second one does.
    *
    * Comment from downstream kernel:
    *
    * SRM -- set render mode (ex binning, direct render etc)
    * SRM is set by UMD usually at start of IB to tell CP the type of
    * preemption.
    * KMD needs to set SRM to NULL to indicate CP that rendering is
    * done by IB.
    * ------------------------------------------------------------------
    *
    * Seems to always be one of these two (9 dwords each, header first):
    *
    *   70ec0008 00000001 001c0000 00000000 00000010
    *            00000003 0000000d 001c2000 00000000
    *
    *   70ec0008 00000001 001c0000 00000000 00000000
    *            00000003 0000000d 001c2000 00000000
    */

   assert(options->gpu_id >= 500);

   render_mode = rnn_enumname(rnn, "render_mode_cmd", dwords[0]);

   if (sizedwords == 1)
      return;

   addr = dwords[1];
   addr |= ((uint64_t)dwords[2]) << 32;

   mode = dwords[3];

   dump_gpuaddr(addr, level + 1);

   if (sizedwords == 5)
      return;

   assert(sizedwords == 8);

   len = dwords[5];
   addr = dwords[6];
   addr |= ((uint64_t)dwords[7]) << 32;

   /* addr is a uint64_t; %lx is only correct where long is 64 bits wide: */
   printl(3, "%saddr: 0x%016" PRIx64 "\n", levels[level], addr);
   printl(3, "%slen:  0x%x\n", levels[level], len);

   ptr = hostptr(addr);

   if (ptr) {
      if (!quiet(2)) {
         /* the referenced cmdstream is decoded one IB level down: */
         ib++;
         dump_commands(ptr, len, level + 1);
         ib--;
         dump_hex(ptr, len, level + 1);
      }
   }
}

/*
 * CP_COMPUTE_CHECKPOINT: print and decode the cmdstream the packet refers
 * to.
 *
 * Only valid with 64b addressing on gpu_id >= 500 and an 8-dword payload,
 * where dwords[5]/[6] are the low/high halves of the gpu address and
 * dwords[7] is the length in dwords.  The buffer is decoded as cmdstream
 * and then dumped as hex, unless output is quieted at level 2.
 */
static void
cp_compute_checkpoint(uint32_t *dwords, uint32_t sizedwords, int level)
{
   assert(is_64b());
   assert(options->gpu_id >= 500);

   assert(sizedwords == 8);

   uint64_t addr = dwords[5] | (((uint64_t)dwords[6]) << 32);
   uint32_t len = dwords[7];

   printl(3, "%saddr: 0x%016" PRIx64 "\n", levels[level], addr);
   printl(3, "%slen:  0x%x\n", levels[level], len);

   uint32_t *cmds = hostptr(addr);
   if (!cmds)
      return;

   if (quiet(2))
      return;

   /* the referenced cmdstream is decoded one IB level down: */
   ib++;
   dump_commands(cmds, len, level + 1);
   ib--;

   dump_hex(cmds, len, level + 1);
}

/* CP_BLIT: run a query named after the blit command in dwords[0] (looked up
 * in the "cp_blit_cmd" enum), then print the current mode and the register
 * summary.
 */
static void
cp_blit(uint32_t *dwords, uint32_t sizedwords, int level)
{
   const char *blit_cmd = rnn_enumname(rnn, "cp_blit_cmd", dwords[0]);

   do_query(blit_cmd, 0);
   print_mode(level);
   dump_register_summary(level);
}

/*
 * CP_CONTEXT_REG_BUNCH: the payload is a list of (register, value) pairs;
 * each pair is printed and applied to the tracked register state.
 *
 * Summary mode is switched off while the pairs are processed and restored
 * afterwards.  A trailing dword without a partner (odd sizedwords) is
 * ignored rather than paired with whatever follows the packet.
 */
static void
cp_context_reg_bunch(uint32_t *dwords, uint32_t sizedwords, int level)
{
   /* NOTE: seems to write same reg multiple times.. not sure if different parts
    * of these are triggered by the FLUSH_SO_n events?? (if that is what they
    * actually are?)
    */
   bool saved_summary = summary;
   summary = false;

   /* unsigned index to match sizedwords; "i + 1 <" keeps the value read
    * inside the packet:
    */
   for (uint32_t i = 0; i + 1 < sizedwords; i += 2) {
      dump_register(dwords[i + 0], dwords[i + 1], level + 1);
      reg_set(dwords[i + 0], dwords[i + 1]);
   }

   summary = saved_summary;
}

/* CP_REG_WRITE: the low 16 bits of dwords[1] select the register and
 * dwords[2] is the value; the write is printed and applied to the tracked
 * register state.
 */
static void
cp_reg_write(uint32_t *dwords, uint32_t sizedwords, int level)
{
   const uint32_t value = dwords[2];
   const uint32_t offset = dwords[1] & 0xffff;

   dump_register(offset, value, level + 1);
   reg_set(offset, value);
}

/*
 * CP_SET_CTXSWITCH_IB: decode the cmdstream buffer the packet points at.
 *
 * dwords[0]/[1] are the low/high halves of the gpu address and the low
 * 16 bits of dwords[2] give the size in dwords.  Nothing is decoded when
 * the address cannot be mapped to a host pointer.
 */
static void
cp_set_ctxswitch_ib(uint32_t *dwords, uint32_t sizedwords, int level)
{
   uint32_t size = dwords[2] & 0xffff;
   uint64_t addr = dwords[0];

   addr |= (uint64_t)dwords[1] << 32;

   if (!quiet(3))
      printf("%saddr=%" PRIx64 "\n", levels[level], addr);

   void *cmds = hostptr(addr);
   if (!cmds)
      return;

   dump_commands(cmds, size, level + 1);
}

/* CP_SKIP_IB2_ENABLE_GLOBAL: latch the first payload dword into
 * skip_ib2_enable_global.
 */
static void
cp_skip_ib2_enable_global(uint32_t *dwords, uint32_t sizedwords, int level)
{
   (void)sizedwords;
   (void)level;

   skip_ib2_enable_global = *dwords;
}

/* CP_SKIP_IB2_ENABLE_LOCAL: latch the first payload dword into
 * skip_ib2_enable_local.
 */
static void
cp_skip_ib2_enable_local(uint32_t *dwords, uint32_t sizedwords, int level)
{
   (void)sizedwords;
   (void)level;

   skip_ib2_enable_local = *dwords;
}

/*
 * Table of packet handlers, used by dump_commands() for both type3 and
 * type7 packets.
 *
 * CP(x, fxn, ...) builds one entry: the name is "CP_" followed by x, fxn is
 * the handler, and any further arguments initialize the options member.
 * get_type3_op() finds an entry by comparing the packet name against the
 * name field.
 */
#define CP(x, fxn, ...) { "CP_" #x, fxn, ##__VA_ARGS__ }
static const struct type3_op {
   /* packet name the entry is matched against: */
   const char *name;
   /* handler; receives the payload (header dword skipped), the payload size
    * in dwords and the output indentation level:
    */
   void (*fxn)(uint32_t *dwords, uint32_t sizedwords, int level);
   struct {
      /* when set, dump_commands() calls load_all_groups() before invoking
       * the handler:
       */
      bool load_all_groups;
   } options;
} type3_op[] = {
   CP(NOP, cp_nop),
   CP(INDIRECT_BUFFER, cp_indirect),
   CP(INDIRECT_BUFFER_PFD, cp_indirect),
   CP(WAIT_FOR_IDLE, cp_wfi),
   CP(REG_RMW, cp_rmw),
   CP(REG_TO_MEM, cp_reg_mem),
   CP(MEM_TO_REG, cp_reg_mem), /* same layout as CP_REG_TO_MEM */
   CP(MEM_WRITE, cp_mem_write),
   CP(EVENT_WRITE, cp_event_write),
   CP(RUN_OPENCL, cp_run_cl),
   CP(DRAW_INDX, cp_draw_indx, {.load_all_groups = true}),
   CP(DRAW_INDX_2, cp_draw_indx_2, {.load_all_groups = true}),
   CP(SET_CONSTANT, cp_set_const),
   CP(IM_LOAD_IMMEDIATE, cp_im_loadi),
   CP(WIDE_REG_WRITE, cp_wide_reg_write),

   /* for a3xx */
   CP(LOAD_STATE, cp_load_state),
   CP(SET_BIN, cp_set_bin),

   /* for a4xx */
   CP(LOAD_STATE4, cp_load_state),
   CP(SET_DRAW_STATE, cp_set_draw_state),
   CP(DRAW_INDX_OFFSET, cp_draw_indx_offset, {.load_all_groups = true}),
   CP(EXEC_CS, cp_exec_cs, {.load_all_groups = true}),
   CP(EXEC_CS_INDIRECT, cp_exec_cs_indirect, {.load_all_groups = true}),

   /* for a5xx */
   CP(SET_RENDER_MODE, cp_set_render_mode),
   CP(COMPUTE_CHECKPOINT, cp_compute_checkpoint),
   CP(BLIT, cp_blit),
   CP(CONTEXT_REG_BUNCH, cp_context_reg_bunch),
   CP(DRAW_INDIRECT, cp_draw_indirect, {.load_all_groups = true}),
   CP(DRAW_INDX_INDIRECT, cp_draw_indx_indirect, {.load_all_groups = true}),
   CP(DRAW_INDIRECT_MULTI, cp_draw_indirect_multi, {.load_all_groups = true}),
   CP(SKIP_IB2_ENABLE_GLOBAL, cp_skip_ib2_enable_global),
   CP(SKIP_IB2_ENABLE_LOCAL, cp_skip_ib2_enable_local),

   /* for a6xx */
   CP(LOAD_STATE6_GEOM, cp_load_state),
   CP(LOAD_STATE6_FRAG, cp_load_state),
   CP(LOAD_STATE6, cp_load_state),
   CP(SET_MODE, cp_set_mode),
   CP(SET_MARKER, cp_set_marker),
   CP(REG_WRITE, cp_reg_write),

   CP(SET_CTXSWITCH_IB, cp_set_ctxswitch_ib),

   CP(START_BIN, cp_start_bin),
};

/* Handler for packets without an entry in the handler table: deliberately
 * does nothing and leaves its arguments untouched.
 */
static void
noop_fxn(uint32_t *dwords, uint32_t sizedwords, int level)
{
   (void)dwords;
   (void)sizedwords;
   (void)level;
}

static const struct type3_op *
get_type3_op(unsigned opc)
{
   static const struct type3_op dummy_op = {
      .fxn = noop_fxn,
   };
   const char *name = pktname(opc);

   if (!name)
      return &dummy_op;

   for (unsigned i = 0; i < ARRAY_SIZE(type3_op); i++)
      if (!strcmp(name, type3_op[i].name))
         return &type3_op[i];

   return &dummy_op;
}

/*
 * Decode and print a cmdstream buffer.
 *
 * dwords points at the first packet header, sizedwords is the length of the
 * buffer in dwords and level selects the output indentation.  A NULL buffer
 * is reported and ignored.
 *
 * Packets are walked one at a time.  type0/type4 packets are register
 * writes; type3/type7 packets are looked up in the handler table and, if
 * the entry asks for it, all draw-state groups are loaded before the
 * handler runs; type2 packets are single-dword NOPs.  Anything else is
 * reported as a bad packet: on gpu_id >= 500 the decoder scans forward to
 * the next dword that looks like a type7/type4 header, on older parts it
 * gives up on the buffer.
 *
 * If the last packet claims more dwords than the buffer has left, a
 * warning is printed at the end.
 */
void
dump_commands(uint32_t *dwords, uint32_t sizedwords, int level)
{
   int dwords_left = sizedwords;
   uint32_t count; /* dword count including packet header */
   uint32_t val;

   //	assert(dwords);
   if (!dwords) {
      printf("NULL cmd buffer!\n");
      return;
   }

   assert(ib < ARRAY_SIZE(draws));
   draws[ib] = 0;

   while (dwords_left > 0) {

      current_draw_count = draw_count;

      /* Start every iteration with a zero count.  Each branch that consumes
       * a packet sets it; the bad-packet resync below advances dwords by
       * itself and must not additionally skip by the size of the previous
       * packet.
       */
      count = 0;

      /* hack, this looks like a -1 underflow, in some versions
       * when it tries to write zero registers via pkt0
       */
      //		if ((dwords[0] >> 16) == 0xffff)
      //			goto skip;

      if (pkt_is_type0(dwords[0])) {
         printl(3, "t0");
         count = type0_pkt_size(dwords[0]) + 1;
         val = type0_pkt_offset(dwords[0]);
         assert(val < regcnt());
         printl(3, "%swrite %s%s (%04x)\n", levels[level + 1], regname(val, 1),
                (dwords[0] & 0x8000) ? " (same register)" : "", val);
         dump_registers(val, dwords + 1, count - 1, level + 2);
         if (!quiet(3))
            dump_hex(dwords, count, level + 1);
      } else if (pkt_is_type4(dwords[0])) {
         /* basically the same(ish) as type0 prior to a5xx */
         printl(3, "t4");
         count = type4_pkt_size(dwords[0]) + 1;
         val = type4_pkt_offset(dwords[0]);
         assert(val < regcnt());
         printl(3, "%swrite %s (%04x)\n", levels[level + 1], regname(val, 1),
                val);
         dump_registers(val, dwords + 1, count - 1, level + 2);
         if (!quiet(3))
            dump_hex(dwords, count, level + 1);
#if 0
      } else if (pkt_is_type1(dwords[0])) {
         printl(3, "t1");
         count = 3;
         val = dwords[0] & 0xfff;
         printl(3, "%swrite %s\n", levels[level+1], regname(val, 1));
         dump_registers(val, dwords+1, 1, level+2);
         val = (dwords[0] >> 12) & 0xfff;
         printl(3, "%swrite %s\n", levels[level+1], regname(val, 1));
         dump_registers(val, dwords+2, 1, level+2);
         if (!quiet(3))
            dump_hex(dwords, count, level+1);
      } else if (pkt_is_type2(dwords[0])) {
         printl(3, "t2");
         printf("%sNOP\n", levels[level+1]);
         count = 1;
         if (!quiet(3))
            dump_hex(dwords, count, level+1);
#endif
      } else if (pkt_is_type3(dwords[0])) {
         count = type3_pkt_size(dwords[0]) + 1;
         val = cp_type3_opcode(dwords[0]);
         const struct type3_op *op = get_type3_op(val);
         if (op->options.load_all_groups)
            load_all_groups(level + 1);
         printl(3, "t3");
         const char *name = pktname(val);
         if (!quiet(2)) {
            /* name may be NULL for an unknown opcode; never hand NULL to
             * a %s conversion:
             */
            printf("\t%sopcode: %s%s%s (%02x) (%d dwords)%s\n", levels[level],
                   rnn->vc->colors->bctarg, name ? name : "(null)",
                   rnn->vc->colors->reset, val, count,
                   (dwords[0] & 0x1) ? " (predicated)" : "");
         }
         if (name)
            dump_domain(dwords + 1, count - 1, level + 2, name);
         op->fxn(dwords + 1, count - 1, level + 1);
         if (!quiet(2))
            dump_hex(dwords, count, level + 1);
      } else if (pkt_is_type7(dwords[0])) {
         count = type7_pkt_size(dwords[0]) + 1;
         val = cp_type7_opcode(dwords[0]);
         const struct type3_op *op = get_type3_op(val);
         if (op->options.load_all_groups)
            load_all_groups(level + 1);
         printl(3, "t7");
         const char *name = pktname(val);
         if (!quiet(2)) {
            /* see the type3 case: name may be NULL */
            printf("\t%sopcode: %s%s%s (%02x) (%d dwords)\n", levels[level],
                   rnn->vc->colors->bctarg, name ? name : "(null)",
                   rnn->vc->colors->reset, val, count);
         }
         if (name) {
            /* special hack for two packets that decode the same way
             * on a6xx:
             */
            if (!strcmp(name, "CP_LOAD_STATE6_FRAG") ||
                !strcmp(name, "CP_LOAD_STATE6_GEOM"))
               name = "CP_LOAD_STATE6";
            dump_domain(dwords + 1, count - 1, level + 2, name);
         }
         op->fxn(dwords + 1, count - 1, level + 1);
         if (!quiet(2))
            dump_hex(dwords, count, level + 1);
      } else if (pkt_is_type2(dwords[0])) {
         printl(3, "t2");
         printl(3, "%snop\n", levels[level + 1]);
         /* a type2 packet is just its header dword: */
         count = 1;
      } else {
         /* for 5xx+ we can do a passable job of looking for start of next valid
          * packet: */
         if (options->gpu_id >= 500) {
            while (dwords_left > 0) {
               if (pkt_is_type7(dwords[0]) || pkt_is_type4(dwords[0]))
                  break;
               printf("bad type! %08x\n", dwords[0]);
               dwords++;
               dwords_left--;
            }
            /* dwords now points at the candidate packet (or the buffer is
             * exhausted); count stays zero so it is decoded next.
             */
         } else {
            printf("bad type! %08x\n", dwords[0]);
            return;
         }
      }

      dwords += count;
      dwords_left -= count;
   }

   if (dwords_left < 0)
      printf("**** this ain't right!! dwords_left=%d\n", dwords_left);
}
