gpu_neon: don't crash on large primitives in enhancement mode
author notaz <notasas@gmail.com>
Thu, 24 Aug 2023 20:07:56 +0000 (23:07 +0300)
committer notaz <notasas@gmail.com>
Thu, 24 Aug 2023 23:03:07 +0000 (02:03 +0300)
plugins/gpu_neon/psx_gpu/psx_gpu.c
plugins/gpu_neon/psx_gpu/psx_gpu.h
plugins/gpu_neon/psx_gpu/psx_gpu_4x.c
plugins/gpu_neon/psx_gpu/psx_gpu_arm_neon.S
plugins/gpu_neon/psx_gpu/psx_gpu_simd.c
plugins/gpu_neon/psx_gpu/psx_gpu_simd.h
plugins/gpu_neon/psx_gpu/vector_ops.h

diff --git a/plugins/gpu_neon/psx_gpu/psx_gpu.c b/plugins/gpu_neon/psx_gpu/psx_gpu.c
index e252d04..370d8f2 100644
@@ -16,6 +16,7 @@
 #include <stdlib.h>
 #include <stdint.h>
 #include <string.h>
+#include <assert.h>
 
 #include "common.h"
 #ifndef NEON_BUILD
@@ -772,24 +773,23 @@ void compute_all_gradients(psx_gpu_struct *psx_gpu, vertex_struct *a,
     printf("mismatch on %s %s: %x vs %x\n", #_a, #_b, _a, _b)                  \
 
 
-#ifndef NDEBUG
-#define setup_spans_debug_check(span_edge_data_element)                        \
-{                                                                              \
-  u32 _num_spans = &span_edge_data_element - psx_gpu->span_edge_data;          \
-  if (_num_spans > MAX_SPANS)                                                  \
-    *(volatile int *)0 = 1;                                                    \
-  if (_num_spans < psx_gpu->num_spans)                                         \
-  {                                                                            \
-    if(span_edge_data_element.num_blocks > MAX_BLOCKS_PER_ROW)                 \
-      *(volatile int *)0 = 2;                                                  \
-    if(span_edge_data_element.y >= 2048)                                       \
-      *(volatile int *)0 = 3;                                                  \
-  }                                                                            \
-}                                                                              \
-
+#if !defined(NEON_BUILD) && !defined(NDEBUG)
+static void setup_spans_debug_check(psx_gpu_struct *psx_gpu, /* debug aid: validates one span entry */
+  edge_data_struct *span_edge_data_element) /* entry to check; expected to point into psx_gpu->span_edge_data */
+{ /* each failed check writes a distinct code (1, 2 or 3) to address 0, presumably to fault at that spot */
+  u32 _num_spans = span_edge_data_element - psx_gpu->span_edge_data; /* index of the entry in the array */
+  if (_num_spans > MAX_SPANS) /* entry is more than MAX_SPANS slots from the start */
+    *(volatile int *)0 = 1;
+  if (_num_spans < psx_gpu->num_spans) /* field checks apply only to entries below num_spans */
+  {
+    if(span_edge_data_element->num_blocks > MAX_BLOCKS_PER_ROW) /* block count above the per-row limit */
+      *(volatile int *)0 = 2;
+    if(span_edge_data_element->y >= 2048) /* y at or beyond 2048 is rejected */
+      *(volatile int *)0 = 3;
+  }
+}
 #else
-#define setup_spans_debug_check(span_edge_data_element)                        \
-
+#define setup_spans_debug_check(psx_gpu, span_edge_data_element) /* expands to nothing when NEON_BUILD or NDEBUG is defined */
 #endif
 
 #define setup_spans_prologue_alternate_yes()                                   \
@@ -856,6 +856,7 @@ void compute_all_gradients(psx_gpu_struct *psx_gpu, vertex_struct *a,
   span_b_offset = psx_gpu->span_b_offset;                                      \
                                                                                \
   vec_8x16u c_0x0001;                                                          \
+  vec_4x16u c_max_blocks_per_row;                                              \
                                                                                \
   dup_8x16b(c_0x0001, 0x0001);                                                 \
   dup_8x16b(left_edge, psx_gpu->viewport_start_x);                             \
@@ -864,6 +865,7 @@ void compute_all_gradients(psx_gpu_struct *psx_gpu, vertex_struct *a,
   dup_4x16b(c_0x04, 0x04);                                                     \
   dup_4x16b(c_0x07, 0x07);                                                     \
   dup_4x16b(c_0xFFFE, 0xFFFE);                                                 \
+  dup_4x16b(c_max_blocks_per_row, MAX_BLOCKS_PER_ROW);                         \
 
 
 #define compute_edge_delta_x2()                                                \
@@ -1087,6 +1089,7 @@ void compute_all_gradients(psx_gpu_struct *psx_gpu, vertex_struct *a,
   and_4x16b(span_shift, left_right_x_16.high, c_0x07);                         \
   shl_variable_4x16b(span_shift, c_0xFFFE, span_shift);                        \
   shr_4x16b(left_right_x_16.high, left_right_x_16.high, 3);                    \
+  min_4x16b(left_right_x_16.high, left_right_x_16.high, c_max_blocks_per_row); \
                                                                                \
   u32 i;                                                                       \
   for(i = 0; i < 4; i++)                                                       \
@@ -1095,7 +1098,7 @@ void compute_all_gradients(psx_gpu_struct *psx_gpu, vertex_struct *a,
     span_edge_data[i].num_blocks = left_right_x_16.high.e[i];                  \
     span_edge_data[i].right_mask = span_shift.e[i];                            \
     span_edge_data[i].y = y_x4.e[i];                                           \
-    setup_spans_debug_check(span_edge_data[i]);                                \
+    setup_spans_debug_check(psx_gpu, &span_edge_data[i]);                      \
   }                                                                            \
                                                                                \
   span_edge_data += 4;                                                         \
@@ -1125,7 +1128,9 @@ void compute_all_gradients(psx_gpu_struct *psx_gpu, vertex_struct *a,
                                                                                \
   setup_spans_prologue_b();                                                    \
                                                                                \
-  if(height > 0)                                                               \
+  if (height > 512)                                                            \
+    height = 512;                                                              \
+  if (height > 0)                                                              \
   {                                                                            \
     y_x4.e[0] = y_a;                                                           \
     y_x4.e[1] = y_a + 1;                                                       \
@@ -1173,7 +1178,9 @@ void compute_all_gradients(psx_gpu_struct *psx_gpu, vertex_struct *a,
                                                                                \
   setup_spans_prologue_b();                                                    \
                                                                                \
-  if(height > 0)                                                               \
+  if (height > 512)                                                            \
+    height = 512;                                                              \
+  if (height > 0)                                                              \
   {                                                                            \
     y_x4.e[0] = y_a;                                                           \
     y_x4.e[1] = y_a - 1;                                                       \
@@ -1363,7 +1370,9 @@ void setup_spans_up_down(psx_gpu_struct *psx_gpu, vertex_struct *v_a,
 
   setup_spans_prologue_b();
 
-  if(height_minor_a > 0)
+  if (height_minor_a > 512)
+    height_minor_a = 512;
+  if (height_minor_a > 0)
   {
     y_x4.e[0] = y_a;
     y_x4.e[1] = y_a - 1;
@@ -1405,7 +1414,9 @@ void setup_spans_up_down(psx_gpu_struct *psx_gpu, vertex_struct *v_a,
     setup_spans_clip(increment, no);
   }
 
-  if(height_minor_b > 0)
+  if (height_minor_b > 512)
+    height_minor_b = 512;
+  if (height_minor_b > 0)
   {
     y_x4.e[0] = y_a;
     y_x4.e[1] = y_a + 1;
@@ -3045,6 +3056,7 @@ static void render_triangle_p(psx_gpu_struct *psx_gpu,
       }
     }
   }
+  assert(psx_gpu->span_edge_data[0].y < 1024u);
 
   u32 render_state = flags &
    (RENDER_FLAGS_MODULATE_TEXELS | RENDER_FLAGS_BLEND | 
@@ -3914,17 +3926,9 @@ void setup_sprite_16bpp(psx_gpu_struct *psx_gpu, s32 x, s32 y, s32 u,
 
 #ifndef NEON_BUILD
 
-void setup_sprite_untextured(psx_gpu_struct *psx_gpu, s32 x, s32 y, s32 u,
+void setup_sprite_untextured_512(psx_gpu_struct *psx_gpu, s32 x, s32 y, s32 u,
  s32 v, s32 width, s32 height, u32 color)
 {
-  if((psx_gpu->render_state & (RENDER_STATE_MASK_EVALUATE |
-   RENDER_FLAGS_MODULATE_TEXELS | RENDER_FLAGS_BLEND)) == 0 &&
-   (psx_gpu->render_mode & RENDER_INTERLACE_ENABLED) == 0)
-  {
-    setup_sprite_untextured_simple(psx_gpu, x, y, u, v, width, height, color);
-    return;
-  }
-
   u32 right_width = ((width - 1) & 0x7) + 1;
   u32 right_mask_bits = (0xFF << right_width);
   u16 *fb_ptr = psx_gpu->vram_out_ptr + (y * 1024) + x;
@@ -3992,8 +3996,9 @@ void setup_sprite_untextured(psx_gpu_struct *psx_gpu, s32 x, s32 y, s32 u,
 
 #endif
 
-void setup_sprite_untextured_simple(psx_gpu_struct *psx_gpu, s32 x, s32 y,
- s32 u, s32 v, s32 width, s32 height, u32 color)
+static void __attribute__((noinline))
+setup_sprite_untextured_simple(psx_gpu_struct *psx_gpu, s32 x, s32 y, s32 u,
+ s32 v, s32 width, s32 height, u32 color)
 {
   u32 r = color & 0xFF;
   u32 g = (color >> 8) & 0xFF;
@@ -4007,7 +4012,7 @@ void setup_sprite_untextured_simple(psx_gpu_struct *psx_gpu, s32 x, s32 y,
 
   u32 num_width;
 
-  if(psx_gpu->num_blocks > MAX_BLOCKS)
+  if(psx_gpu->num_blocks)
   {
     flush_render_block_buffer(psx_gpu);
   }
@@ -4051,6 +4056,29 @@ void setup_sprite_untextured_simple(psx_gpu_struct *psx_gpu, s32 x, s32 y,
   }
 }
 
+void setup_sprite_untextured_512(psx_gpu_struct *psx_gpu, s32 x, s32 y, s32 u,
+ s32 v, s32 width, s32 height, u32 color);
+
+void setup_sprite_untextured(psx_gpu_struct *psx_gpu, s32 x, s32 y, s32 u, /* routes an untextured sprite either to */
+ s32 v, s32 width, s32 height, u32 color) /* the simple helper or to strips of at most 512 in width */
+{
+  if((psx_gpu->render_state & (RENDER_STATE_MASK_EVALUATE | /* simple helper is used only when none of */
+   RENDER_FLAGS_MODULATE_TEXELS | RENDER_FLAGS_BLEND)) == 0 && /* these render_state bits are set */
+   (psx_gpu->render_mode & RENDER_INTERLACE_ENABLED) == 0) /* and the interlace bit of render_mode is clear */
+  {
+    setup_sprite_untextured_simple(psx_gpu, x, y, u, v, width, height, color);
+    return;
+  }
+
+  while (width > 0) /* otherwise split the request along x */
+  {
+    s32 w1 = width > 512 ? 512 : width; /* width of this strip, capped at 512 */
+    setup_sprite_untextured_512(psx_gpu, x, y, 0, 0, w1, height, color); /* u and v are passed as 0 here */
+    x += 512;
+    width -= 512; /* drops to <= 0 after the last strip, which ends the loop */
+  }
+}
+
 
 #define setup_sprite_blocks_switch_textured(texture_mode)                      \
   setup_sprite_##texture_mode                                                  \
diff --git a/plugins/gpu_neon/psx_gpu/psx_gpu.h b/plugins/gpu_neon/psx_gpu/psx_gpu.h
index 88e40ac..957b434 100644
 #ifndef PSX_GPU_H
 #define PSX_GPU_H
 
+#define MAX_SPANS             512
+#define MAX_BLOCKS            64
+#define MAX_BLOCKS_PER_ROW    128
+
+#define SPAN_DATA_BLOCKS_SIZE 32
+
+#ifndef __ASSEMBLER__
+
 #include "vector_types.h"
 
 typedef enum
@@ -101,12 +109,6 @@ typedef struct
   vec_8x16u dither_offsets;  
 } block_struct;
 
-#define MAX_SPANS             512
-#define MAX_BLOCKS            64
-#define MAX_BLOCKS_PER_ROW    128
-
-#define SPAN_DATA_BLOCKS_SIZE 32
-
 typedef struct render_block_handler_struct render_block_handler_struct;
 
 typedef struct
@@ -261,5 +263,5 @@ void compute_all_gradients(psx_gpu_struct * __restrict__ psx_gpu,
  const vertex_struct * __restrict__ a, const vertex_struct * __restrict__ b,
  const vertex_struct * __restrict__ c);
 
-#endif
-
+#endif // __ASSEMBLER__
+#endif // PSX_GPU_H
diff --git a/plugins/gpu_neon/psx_gpu/psx_gpu_4x.c b/plugins/gpu_neon/psx_gpu/psx_gpu_4x.c
index d7ec340..bd6c7a1 100644
@@ -237,7 +237,11 @@ void setup_sprite_16bpp_4x(psx_gpu_struct *psx_gpu, s32 x, s32 y, s32 u,
 static void setup_sprite_untextured_4x(psx_gpu_struct *psx_gpu, s32 x, s32 y,\r
  s32 u, s32 v, s32 width, s32 height, u32 color)\r
 {\r
-  setup_sprite_untextured(psx_gpu, x, y, u, v, width * 2, height * 2, color);\r
+  width *= 2; /* both dimensions are doubled before forwarding */\r
+  height *= 2;\r
+  if (width > 1024) /* cap the doubled width at 1024; height is forwarded uncapped */\r
+    width = 1024;\r
+  setup_sprite_untextured(psx_gpu, x, y, u, v, width, height, color);\r
 }\r
 \r
 #define setup_sprite_blocks_switch_textured_4x(texture_mode)                   \\r
diff --git a/plugins/gpu_neon/psx_gpu/psx_gpu_arm_neon.S b/plugins/gpu_neon/psx_gpu/psx_gpu_arm_neon.S
index c62c1ba..f0ba39f 100644
  * General Public License for more details.
  */
 
-#define MAX_SPANS                                         512
-#define MAX_BLOCKS                                        64
-#define MAX_BLOCKS_PER_ROW                                128
-
-#define RENDER_STATE_MASK_EVALUATE                        0x20
-#define RENDER_FLAGS_MODULATE_TEXELS                      0x1
-#define RENDER_FLAGS_BLEND                                0x2
 #define RENDER_INTERLACE_ENABLED                          0x1
 
+#include "psx_gpu.h"
 #include "psx_gpu_offsets.h"
 
 #define psx_gpu_b_dx_offset (psx_gpu_b_block_span_offset + 4)
 
 #ifdef __MACH__
 #define flush_render_block_buffer _flush_render_block_buffer
-#define setup_sprite_untextured_simple _setup_sprite_untextured_simple
 #define update_texture_8bpp_cache _update_texture_8bpp_cache
 #endif
 
@@ -565,6 +558,8 @@ function(compute_all_gradients)
 #define left_x_32_low                            d22
 #define left_x_32_high                           d23
 
+#define tmp_max_blocks                           d20
+
 #define edges_xy                                 q0
 #define edges_dx_dy                              d2
 #define edge_shifts                              d3
@@ -819,8 +814,10 @@ function(compute_all_gradients)
   str b, [span_b_offset], #4;                                                  \
   setup_spans_adjust_interpolants_##direction();                               \
                                                                                \
+  vmov.u16 tmp_max_blocks, #MAX_BLOCKS_PER_ROW;                                \
   vshr.u16 left_right_x_16_high, left_right_x_16_high, #3;                     \
   vshl.u16 span_shifts, c_0xFFFE, span_shifts;                                 \
+  vmin.u16 left_right_x_16_high, left_right_x_16_high, tmp_max_blocks;         \
                                                                                \
   vst4.u16 { left_right_x_16, span_shifts_y }, [span_edge_data]!;              \
                                                                                \
@@ -867,8 +864,10 @@ function(compute_all_gradients)
   str b, [span_b_offset], #4;                                                  \
   setup_spans_adjust_interpolants_##direction();                               \
                                                                                \
-  vshl.u16 span_shifts, c_0xFFFE, span_shifts;                                 \
+  vmov.u16 tmp_max_blocks, #MAX_BLOCKS_PER_ROW;                                \
   vshr.u16 left_right_x_16_high, left_right_x_16_high, #3;                     \
+  vshl.u16 span_shifts, c_0xFFFE, span_shifts;                                 \
+  vmin.u16 left_right_x_16_high, left_right_x_16_high, tmp_max_blocks;         \
                                                                                \
   vst4.u16 { left_right_x_16, span_shifts_y }, [span_edge_data]!;              \
                                                                                \
@@ -908,7 +907,9 @@ function(compute_all_gradients)
   ble 1f;                                                                      \
                                                                                \
   orr temp, y_a, y_a, lsl #16;                                                 \
+  cmp height, #512;                                                            \
   add temp, temp, #(1 << 16);                                                  \
+  movgt height, #512;                                                          \
   add y_a, temp, #2;                                                           \
   add y_a, y_a, #(2 << 16);                                                    \
   vmov y_x4, temp, y_a;                                                        \
@@ -963,7 +964,9 @@ function(compute_all_gradients)
   ble 1f;                                                                      \
                                                                                \
   orr temp, y_a, y_a, lsl #16;                                                 \
+  cmp height, #512;                                                            \
   sub temp, temp, #(1 << 16);                                                  \
+  movgt height, #512;                                                          \
   sub y_a, temp, #2;                                                           \
   sub y_a, y_a, #(2 << 16);                                                    \
   vmov y_x4, temp, y_a;                                                        \
@@ -5826,14 +5829,7 @@ function(setup_sprite_16bpp_4x)
 
 .align 3
 
-function(setup_sprite_untextured)
-  ldrh r12, [psx_gpu, #psx_gpu_render_state_offset]
-  tst r12, #(RENDER_STATE_MASK_EVALUATE | RENDER_FLAGS_MODULATE_TEXELS         \
-    | RENDER_FLAGS_BLEND)
-  ldrbeq r12, [psx_gpu, #psx_gpu_render_mode_offset]
-  tsteq r12, #RENDER_INTERLACE_ENABLED
-  beq setup_sprite_untextured_simple
-
+function(setup_sprite_untextured_512)
   stmdb sp!, { r4 - r11, r14 }
 
   ldr width, [sp, #40]
diff --git a/plugins/gpu_neon/psx_gpu/psx_gpu_simd.c b/plugins/gpu_neon/psx_gpu/psx_gpu_simd.c
index 0039254..ac4af9d 100644
@@ -115,6 +115,7 @@ typedef union
 #define gvhaddq_u16(d, a, b)     d.u16 = vhaddq_u16(a.u16, b.u16)
 #define gvmax_s16(d, a, b)       d.s16 = vmax_s16(a.s16, b.s16)
 #define gvmin_s16(d, a, b)       d.s16 = vmin_s16(a.s16, b.s16)
+#define gvmin_u16(d, a, b)       d.u16 = vmin_u16(a.u16, b.u16)
 #define gvminq_u8(d, a, b)       d.u8  = vminq_u8(a.u8, b.u8)
 #define gvminq_u16(d, a, b)      d.u16 = vminq_u16(a.u16, b.u16)
 #define gvmla_s32(d, a, b)       d.s32 = vmla_s32(d.s32, a.s32, b.s32)
@@ -353,7 +354,8 @@ typedef union
 }
 #endif // !__SSSE3__
 #ifdef __SSE4_1__
-#define gvminq_u16(d, a, b)      d.m = _mm_min_epu16(a.m, b.m)
+#define gvmin_u16(d, a, b)       d.m = _mm_min_epu16(a.m, b.m)
+#define gvminq_u16               gvmin_u16
 #define gvmovl_u8(d, s)          d.m = _mm_cvtepu8_epi16(s.m)
 #define gvmovl_s8(d, s)          d.m = _mm_cvtepi8_epi16(s.m)
 #define gvmovl_s32(d, s)         d.m = _mm_cvtepi32_epi64(s.m)
@@ -463,11 +465,12 @@ typedef union
 // can do this because the caller needs the msb clear
 #define gvhaddq_u16(d, a, b)     d.u16 = (a.u16 + b.u16) >> 1
 #endif
-#ifndef gvminq_u16
-#define gvminq_u16(d, a, b) { \
+#ifndef gvmin_u16 /* fallback, used only when gvmin_u16 was not defined earlier */
+#define gvmin_u16(d, a, b) { /* d takes a's bits where t_ is set, b's bits elsewhere */ \
   gvu16 t_ = a.u16 < b.u16; \
   d.u16 = (a.u16 & t_) | (b.u16 & ~t_); \
 }
+#define gvminq_u16               gvmin_u16 /* the q-suffixed name reuses the same expansion */
 #endif
 #ifndef gvmlsq_s32
 #define gvmlsq_s32(d, a, b)      d.s32 -= a.s32 * b.s32
@@ -1093,6 +1096,7 @@ void compute_all_gradients(psx_gpu_struct * __restrict__ psx_gpu,
   span_b_offset = psx_gpu->span_b_offset;                                      \
                                                                                \
   vec_8x16u c_0x0001;                                                          \
+  vec_4x16u c_max_blocks_per_row;                                              \
                                                                                \
   gvdupq_n_u16(c_0x0001, 0x0001);                                              \
   gvdupq_n_u16(left_edge, psx_gpu->viewport_start_x);                          \
@@ -1101,6 +1105,7 @@ void compute_all_gradients(psx_gpu_struct * __restrict__ psx_gpu,
   gvdup_n_u16(c_0x04, 0x04);                                                   \
   gvdup_n_u16(c_0x07, 0x07);                                                   \
   gvdup_n_u16(c_0xFFFE, 0xFFFE);                                               \
+  gvdup_n_u16(c_max_blocks_per_row, MAX_BLOCKS_PER_ROW);                       \
 
 #if defined(__ARM_NEON) || defined(__ARM_NEON__)
 // better encoding, remaining bits are unused anyway
@@ -1351,6 +1356,7 @@ void compute_all_gradients(psx_gpu_struct * __restrict__ psx_gpu,
   gvand(span_shift, left_right_x_16_hi, c_0x07);                               \
   setup_spans_make_span_shift(span_shift);                                     \
   gvshr_n_u16(left_right_x_16_hi, left_right_x_16_hi, 3);                      \
+  gvmin_u16(left_right_x_16_hi, left_right_x_16_hi, c_max_blocks_per_row);     \
                                                                                \
   gvst4_pi_u16(left_right_x_16_lo, left_right_x_16_hi, span_shift, y_x4,       \
     span_edge_data);                                                           \
@@ -1380,7 +1386,9 @@ void compute_all_gradients(psx_gpu_struct * __restrict__ psx_gpu,
                                                                                \
   setup_spans_prologue_b();                                                    \
                                                                                \
-  if(height > 0)                                                               \
+  if (height > 512)                                                            \
+    height = 512;                                                              \
+  if (height > 0)                                                              \
   {                                                                            \
     u64 y_x4_ = ((u64)(y_a + 3) << 48) | ((u64)(u16)(y_a + 2) << 32)           \
               | (u32)((y_a + 1) << 16) | (u16)y_a;                             \
@@ -1426,7 +1434,9 @@ void compute_all_gradients(psx_gpu_struct * __restrict__ psx_gpu,
                                                                                \
   setup_spans_prologue_b();                                                    \
                                                                                \
-  if(height > 0)                                                               \
+  if (height > 512)                                                            \
+    height = 512;                                                              \
+  if (height > 0)                                                              \
   {                                                                            \
     u64 y_x4_ = ((u64)(y_a - 3) << 48) | ((u64)(u16)(y_a - 2) << 32)           \
               | (u32)((y_a - 1) << 16) | (u16)y_a;                             \
@@ -1642,7 +1652,9 @@ void setup_spans_up_down(psx_gpu_struct *psx_gpu, vertex_struct *v_a,
 
   setup_spans_prologue_b();
 
-  if(height_minor_a > 0)
+  if (height_minor_a > 512)
+    height_minor_a = 512;
+  if (height_minor_a > 0)
   {
     u64 y_x4_ = ((u64)(y_a - 3) << 48) | ((u64)(u16)(y_a - 2) << 32)
               | (u32)((y_a - 1) << 16) | (u16)y_a;
@@ -1683,7 +1695,9 @@ void setup_spans_up_down(psx_gpu_struct *psx_gpu, vertex_struct *v_a,
     setup_spans_clip(increment, no);
   }
 
-  if(height_minor_b > 0)
+  if (height_minor_b > 512)
+    height_minor_b = 512;
+  if (height_minor_b > 0)
   {
     u64 y_x4_ = ((u64)(y_a + 3) << 48) | ((u64)(u16)(y_a + 2) << 32)
               | (u32)((y_a + 1) << 16) | (u16)y_a;
@@ -3167,19 +3181,11 @@ void blend_blocks_textured_unblended_off(psx_gpu_struct *psx_gpu)
 {
 }
 
-void setup_sprite_untextured(psx_gpu_struct *psx_gpu, s32 x, s32 y, s32 u,
+void setup_sprite_untextured_512(psx_gpu_struct *psx_gpu, s32 x, s32 y, s32 u,
  s32 v, s32 width, s32 height, u32 color)
 {
-  if((psx_gpu->render_state & (RENDER_STATE_MASK_EVALUATE |
-   RENDER_FLAGS_MODULATE_TEXELS | RENDER_FLAGS_BLEND)) == 0 &&
-   (psx_gpu->render_mode & RENDER_INTERLACE_ENABLED) == 0)
-  {
-    setup_sprite_untextured_simple(psx_gpu, x, y, u, v, width, height, color);
-    return;
-  }
-
 #if 0
-  setup_sprite_untextured_(psx_gpu, x, y, u, v, width, height, color);
+  setup_sprite_untextured_512_(psx_gpu, x, y, u, v, width, height, color);
   return;
 #endif
   u32 right_width = ((width - 1) & 0x7) + 1;
diff --git a/plugins/gpu_neon/psx_gpu/psx_gpu_simd.h b/plugins/gpu_neon/psx_gpu/psx_gpu_simd.h
index a8080af..3d1d1bd 100644
@@ -84,8 +84,7 @@
 #define setup_sprite_4bpp_4x setup_sprite_4bpp_4x_
 #define setup_sprite_8bpp_4x setup_sprite_8bpp_4x_
 #define setup_sprite_16bpp_4x setup_sprite_16bpp_4x_
-#define setup_sprite_untextured setup_sprite_untextured_
-#define setup_sprite_untextured_simple setup_sprite_untextured_simple_
+#define setup_sprite_untextured_512 setup_sprite_untextured_512_
 #define scale2x_tiles8 scale2x_tiles8_
 #endif
 
@@ -205,10 +204,8 @@ void setup_sprite_8bpp_4x(psx_gpu_struct *psx_gpu, s32 x, s32 y, s32 u, s32 v,
 void setup_sprite_16bpp_4x(psx_gpu_struct *psx_gpu, s32 x, s32 y, s32 u, s32 v,
  s32 width, s32 height, u32 color);
 
-void setup_sprite_untextured(psx_gpu_struct *psx_gpu, s32 x, s32 y, s32 u,
+void setup_sprite_untextured_512(psx_gpu_struct *psx_gpu, s32 x, s32 y, s32 u,
  s32 v, s32 width, s32 height, u32 color);
-void setup_sprite_untextured_simple(psx_gpu_struct *psx_gpu, s32 x, s32 y,
- s32 u, s32 v, s32 width, s32 height, u32 color);
 
 void scale2x_tiles8(void *dst, const void *src, int w8, int h);
 
@@ -275,7 +272,6 @@ void scale2x_tiles8(void *dst, const void *src, int w8, int h);
 #undef setup_sprite_4bpp_4x
 #undef setup_sprite_8bpp_4x
 #undef setup_sprite_16bpp_4x
-#undef setup_sprite_untextured
-#undef setup_sprite_untextured_simple
+#undef setup_sprite_untextured_512
 #undef scale2x_tiles8
 #endif
diff --git a/plugins/gpu_neon/psx_gpu/vector_ops.h b/plugins/gpu_neon/psx_gpu/vector_ops.h
index 6f2bcbf..6bc7643 100644
     (dest).e[_i] = result;                                                     \
   })                                                                           \
 
+#define min_4x16b(dest, source_a, source_b) /* dest = per-lane min(a, b) */    \
+  foreach_element(4, /* presumably defines _i for 4 elements */                \
+  {                                                                            \
+    s32 result = (source_a).e[_i]; /* start with a's element */                \
+    if((source_b).e[_i] < result) /* b's element is smaller */                 \
+      result = (source_b).e[_i];                                               \
+    (dest).e[_i] = result; /* store the smaller value */                       \
+  })                                                                           \
+
 #define min_8x16b(dest, source_a, source_b)                                    \
   foreach_element(8,                                                           \
   {                                                                            \