/**************************************************************************
 *
 * Copyright 2010-2021 VMware, Inc.
 * All Rights Reserved.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the
 * "Software"), to deal in the Software without restriction, including
 * without limitation the rights to use, copy, modify, merge, publish,
 * distribute, sub license, and/or sell copies of the Software, and to
 * permit persons to whom the Software is furnished to do so, subject to
 * the following conditions:
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL
 * THE COPYRIGHT HOLDERS, AUTHORS AND/OR ITS SUPPLIERS BE LIABLE FOR ANY CLAIM,
 * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
 * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
 * USE OR OTHER DEALINGS IN THE SOFTWARE.
 *
 * The above copyright notice and this permission notice (including the
 * next paragraph) shall be included in all copies or substantial portions
 * of the Software.
 *
 **************************************************************************/


#include "pipe/p_config.h"

#include "util/u_math.h"
#include "util/u_cpu_detect.h"
#include "util/u_pack_color.h"
#include "util/u_rect.h"
#include "util/u_sse.h"

#include "lp_jit.h"
#include "lp_rast.h"
#include "lp_debug.h"
#include "lp_state_fs.h"
#include "lp_linear_priv.h"


#if defined(PIPE_ARCH_SSE)


/* For debugging (LP_DEBUG=linear), shade areas of run-time fallback
 * purple.  Keep blending active so we can see more of what's going
 * on.
 */
static boolean
linear_fallback(const struct lp_rast_state *state,
                unsigned x, unsigned y,
                unsigned width, unsigned height,
                uint8_t *color,
                unsigned stride)
{
   unsigned col = 0x808000ff;
   int i;

   for (y = 0; y < height; y++) {
      for (i = 0; i < 64; i++) {
         *((uint32_t *)(color + y*stride) + x + i) = col;
      }
   }

   return TRUE;
}


/* Run our configurable linear shader pipeline:
 */
static boolean
lp_fs_linear_run(const struct lp_rast_state *state,
                 unsigned x, unsigned y,
                 unsigned width, unsigned height,
                 const float (*a0)[4],
                 const float (*dadx)[4],
                 const float (*dady)[4],
                 uint8_t *color,
                 unsigned stride)
{
   const struct lp_fragment_shader_variant *variant = state->variant;
   const struct lp_tgsi_info *info = &variant->shader->info;
   struct lp_jit_linear_context jit;
   lp_jit_linear_llvm_func jit_func = variant->jit_linear_llvm;

   struct lp_linear_sampler samp[LP_MAX_LINEAR_TEXTURES];
   struct lp_linear_interp interp[LP_MAX_LINEAR_INPUTS];
   uint8_t constants[LP_MAX_LINEAR_CONSTANTS][4];

   const float w0 = a0[0][3];
   float oow = 1.0f/w0;

   unsigned input_mask = variant->linear_input_mask;
   int nr_consts = info->base.file_max[TGSI_FILE_CONSTANT]+1;
   int nr_tex = info->num_texs;
   int i, j;

   LP_DBG(DEBUG_RAST, "%s\n", __FUNCTION__);

   /* Require constant w in these rectangles:
    */
   if (dadx[0][3] != 0.0f ||
       dady[0][3] != 0.0f) {
      if (LP_DEBUG & DEBUG_LINEAR2)
         debug_printf("  -- w not constant\n");
      goto fail;
   }

   /* XXX: Per statechange:
    */
   for (i = 0; i < nr_consts; i++) {
      for (j = 0; j < 4; j++) {
         float val = state->jit_context.constants[0][i*4+j];
         if (val < 0.0f || val > 1.0f) {
            if (LP_DEBUG & DEBUG_LINEAR2)
               debug_printf("  -- const[%d] out of range\n", i);
            goto fail;
         }
         constants[i][j] = (uint8_t)(val * 255.0f);
      }
   }
   jit.constants = (const uint8_t (*)[4])constants;

   /* We assume BGRA ordering */
   assert(variant->key.cbuf_format[0] == PIPE_FORMAT_B8G8R8X8_UNORM ||
          variant->key.cbuf_format[0] == PIPE_FORMAT_B8G8R8A8_UNORM);

   jit.blend_color =
         state->jit_context.u8_blend_color[32] +
         (state->jit_context.u8_blend_color[16] << 8) +
         (state->jit_context.u8_blend_color[0] << 16) +
         (state->jit_context.u8_blend_color[48] << 24);

   jit.alpha_ref_value = float_to_ubyte(state->jit_context.alpha_ref_value);

   /* XXX: Per primitive:
    */
   while (input_mask) {
      int i = u_bit_scan(&input_mask);
      unsigned usage_mask = info->base.input_usage_mask[i];
      boolean perspective =
            info->base.input_interpolate[i] == TGSI_INTERPOLATE_PERSPECTIVE ||
            (info->base.input_interpolate[i] == TGSI_INTERPOLATE_COLOR &&
             !variant->key.flatshade);

      if (!lp_linear_init_interp(&interp[i],
                                 x, y, width, height,
                                 usage_mask,
                                 perspective,
                                 oow,
                                 a0[i+1],
                                 dadx[i+1],
                                 dady[i+1])) {
         if (LP_DEBUG & DEBUG_LINEAR2)
            debug_printf("  -- init_interp(%d) failed\n", i);
         goto fail;
      }

      jit.inputs[i] = &interp[i].base;
   }


   /* XXX: Per primitive: Initialize linear or nearest samplers:
    */
   for (i = 0; i < nr_tex; i++) {
      const struct lp_tgsi_texture_info *tex_info = &info->tex[i];
      unsigned unit = tex_info->sampler_unit;

      /* XXX: some texture coordinates are linear!
       */
      //boolean perspective = (info->base.input_interpolate[i] ==
      //                       TGSI_INTERPOLATE_PERSPECTIVE);

      if (!lp_linear_init_sampler(&samp[i],
                                  tex_info,
                                  lp_fs_variant_key_sampler_idx(&variant->key, unit),
                                  &state->jit_context.textures[unit],
                                  x, y, width, height,
                                  a0, dadx, dady)) {
         if (LP_DEBUG & DEBUG_LINEAR2)
            debug_printf("  -- init_sampler(%d) failed\n", i);
         goto fail;
      }

      jit.tex[i] = &samp[i].base;
   }

   /* JIT function already does blending */
   jit.color0 = color + x * 4 + y * stride;
   for (y = 0; y < height; y++) {
      jit_func(&jit, 0, 0, width);
      jit.color0 += stride;
   }

   return TRUE;

fail:
   /* Visually distinguish this from other fallbacks:
    */
   if (LP_DEBUG & DEBUG_LINEAR) {
      return linear_fallback(state, x, y, width, height, color, stride);
   }

   return FALSE;
}


static void
check_linear_interp_mask_a(struct lp_fragment_shader_variant *variant)
{
   const struct lp_tgsi_info *info = &variant->shader->info;
   struct lp_jit_linear_context jit;

   struct lp_linear_sampler samp[LP_MAX_LINEAR_TEXTURES];
   struct lp_linear_interp interp[LP_MAX_LINEAR_INPUTS];
   uint8_t constants[LP_MAX_LINEAR_CONSTANTS][4];
   PIPE_ALIGN_VAR(16) uint8_t color0[TILE_SIZE*4];

   int nr_inputs = info->base.file_max[TGSI_FILE_INPUT]+1;
   int nr_tex = info->num_texs;
   int i;

   LP_DBG(DEBUG_RAST, "%s\n", __FUNCTION__);

   jit.constants = (const uint8_t (*)[4])constants;

   for (i = 0; i < nr_tex; i++) {
      lp_linear_init_noop_sampler(&samp[i]);
      jit.tex[i] = &samp[i].base;
   }

   for (i = 0; i < nr_inputs; i++) {
      lp_linear_init_noop_interp(&interp[i]);
      jit.inputs[i] = &interp[i].base;
   }

   jit.color0 = color0;

   (void)variant->jit_linear_llvm(&jit, 0, 0, 0);

   /* Find out which interpolators were called, and store this as a
    * mask:
    */
   for (i = 0; i < nr_inputs; i++)
      variant->linear_input_mask |= (interp[i].row[0] << i);
}


/* Until the above is working, look at texture information and guess
 * that any input used as a texture coordinate is not used for
 * anything else.
 */
static void
check_linear_interp_mask_b(struct lp_fragment_shader_variant *variant)
{
   const struct lp_tgsi_info *info = &variant->shader->info;
   int nr_inputs = info->base.file_max[TGSI_FILE_INPUT]+1;
   int nr_tex = info->num_texs;
   unsigned tex_mask = 0;
   int i;

   LP_DBG(DEBUG_RAST, "%s\n", __FUNCTION__);

   for (i = 0; i < nr_tex; i++) {
      const struct lp_tgsi_texture_info *tex_info = &info->tex[i];
      const struct lp_tgsi_channel_info *schan = &tex_info->coord[0];
      const struct lp_tgsi_channel_info *tchan = &tex_info->coord[1];
      tex_mask |= 1 << schan->u.index;
      tex_mask |= 1 << tchan->u.index;
   }

   variant->linear_input_mask = ((1 << nr_inputs) - 1) & ~tex_mask;
}


void
lp_linear_check_variant(struct lp_fragment_shader_variant *variant)
{
   const struct lp_fragment_shader_variant_key *key = &variant->key;
   const struct lp_fragment_shader *shader = variant->shader;
   const struct lp_tgsi_info *info = &shader->info;
   int i;

   if (info->base.file_max[TGSI_FILE_CONSTANT] >= LP_MAX_LINEAR_CONSTANTS ||
       info->base.file_max[TGSI_FILE_INPUT] >= LP_MAX_LINEAR_INPUTS) {
      if (LP_DEBUG & DEBUG_LINEAR)
         debug_printf("  -- too many inputs/constants\n");
      goto fail;
   }

   /* If we have a fastpath which implements the entire varient, use
    * that.
    */
   if (lp_linear_check_fastpath(variant)) {
      return;
   }

   /* Otherwise, can we build up a spanline-based linear path for this
    * variant?
    */

   /* Check static sampler state.
    */
   for (i = 0; i < info->num_texs; i++) {
      const struct lp_tgsi_texture_info *tex_info = &info->tex[i];
      unsigned unit = tex_info->sampler_unit;

      /* XXX: Relax this once setup premultiplies by oow:
       */
      if (info->base.input_interpolate[unit] != TGSI_INTERPOLATE_PERSPECTIVE) {
         if (LP_DEBUG & DEBUG_LINEAR)
            debug_printf(" -- samp[%d]: texcoord not perspective\n", i);
         goto fail;
      }

      struct lp_sampler_static_state *samp = lp_fs_variant_key_sampler_idx(key, unit);
      if (!lp_linear_check_sampler(samp, tex_info)) {
         if (LP_DEBUG & DEBUG_LINEAR)
            debug_printf(" -- samp[%d]: check_sampler failed\n", i);
         goto fail;
      }
   }

   /* Check shader.  May not have been jitted.
    */
   if (variant->linear_function == NULL) {
      if (LP_DEBUG & DEBUG_LINEAR)
         debug_printf("  -- no linear shader\n");
      goto fail;
   }

   /* Hook in the catchall shader runner:
    */
   variant->jit_linear = lp_fs_linear_run;

   /* Figure out which inputs we don't need to interpolate (because
    * they are only used as texture coordinates).  This is important
    * as we can cope with texture coordinates which exceed 1.0, but
    * cannot do so for regular inputs.
    */
   if (1)
      check_linear_interp_mask_a(variant);
   else
      check_linear_interp_mask_b(variant);


   if (0) {
      lp_debug_fs_variant(variant);
      debug_printf("linear input mask: 0x%x\n", variant->linear_input_mask);
   }

   return;

fail:
   if (LP_DEBUG & DEBUG_LINEAR) {
      lp_debug_fs_variant(variant);
      debug_printf("    ----> no linear path for this variant\n");
   }
}


#else
void
lp_linear_check_variant(struct lp_fragment_shader_variant *variant)
{
}
#endif
