From: notaz <notasas@gmail.com>
Date: Mon, 18 Nov 2024 01:15:32 +0000 (+0200)
Subject: rect quad optimizations
X-Git-Tag: r25~101
X-Git-Url: https://notaz.gp2x.de/cgi-bin/gitweb.cgi?a=commitdiff_plain;h=f060f4be678b55bc3e1c3f3d5b3cf4baff84ef3a;p=pcsx_rearmed.git

rect quad optimizations
---

diff --git a/Makefile b/Makefile
index 39b5fbaf..83559310 100644
--- a/Makefile
+++ b/Makefile
@@ -229,7 +229,7 @@ plugins/dfsound/out.o: CFLAGS += -DHAVE_LIBRETRO
 endif
 
 # builtin gpu
-OBJS += plugins/gpulib/gpu.o plugins/gpulib/vout_pl.o
+OBJS += plugins/gpulib/gpu.o plugins/gpulib/vout_pl.o plugins/gpulib/prim.o
 ifeq "$(BUILTIN_GPU)" "neon"
 CFLAGS += -DGPU_NEON
 OBJS += plugins/gpu_neon/psx_gpu_if.o
diff --git a/include/compiler_features.h b/include/compiler_features.h
index 21549ddf..77114efb 100644
--- a/include/compiler_features.h
+++ b/include/compiler_features.h
@@ -2,6 +2,7 @@
 #ifdef __GNUC__
 # define likely(x)       __builtin_expect((x),1)
 # define unlikely(x)     __builtin_expect((x),0)
+# define preload         __builtin_prefetch
 # ifdef __clang__
 #  define noinline       __attribute__((noinline))
 # else
@@ -11,6 +12,7 @@
 #else
 # define likely(x)       (x)
 # define unlikely(x)     (x)
+# define preload         (x)
 # define noinline
 # define attr_unused
 #endif
diff --git a/plugins/gpu_neon/psx_gpu/psx_gpu_parse.c b/plugins/gpu_neon/psx_gpu/psx_gpu_parse.c
index e78feaf2..1fa06a15 100644
--- a/plugins/gpu_neon/psx_gpu/psx_gpu_parse.c
+++ b/plugins/gpu_neon/psx_gpu/psx_gpu_parse.c
@@ -16,6 +16,7 @@
 
 #include "common.h"
 #include "../../gpulib/gpu_timing.h"
+#include "../../gpulib/gpu.h"
 
 #ifndef command_lengths
 const u8 command_lengths[256] =
@@ -245,12 +246,27 @@ static void do_fill(psx_gpu_struct *psx_gpu, u32 x, u32 y,
 #define SET_Ex(r, v)
 #endif
 
+static void textured_sprite(psx_gpu_struct *psx_gpu, const u32 *list,
+  s32 width, s32 height, u32 *cpu_cycles_sum, u32 *cpu_cycles)
+{
+  s32 x = sign_extend_11bit(list[1] + psx_gpu->offset_x);
+  s32 y = sign_extend_11bit((list[1] >> 16) + psx_gpu->offset_y);
+  u8 v = (list[2] >> 8) & 0xff;
+  u8 u = list[2] & 0xff;
+
+  set_clut(psx_gpu, list[2] >> 16);
+
+  render_sprite(psx_gpu, x, y, u, v, &width, &height, list[0] >> 24, list[0]);
+  gput_sum(*cpu_cycles_sum, *cpu_cycles, gput_sprite(width, height));
+}
+
 u32 gpu_parse(psx_gpu_struct *psx_gpu, u32 *list, u32 size,
  s32 *cpu_cycles_sum_out, s32 *cpu_cycles_last, u32 *last_command)
 {
   vertex_struct vertexes[4] __attribute__((aligned(16))) = {};
   u32 current_command = 0, command_length;
   u32 cpu_cycles_sum = 0, cpu_cycles = *cpu_cycles_last;
+  u32 siplified_prim[4*4];
 
   u32 *list_start = list;
   u32 *list_end = list + (size / 4);
@@ -328,8 +344,19 @@ u32 gpu_parse(psx_gpu_struct *psx_gpu, u32 *list, u32 size,
   
       case 0x2C ... 0x2F:
       {
-        set_clut(psx_gpu, list_s16[5]);
-        set_texture(psx_gpu, list_s16[9]);
+        u32 i, simplified_count;
+        set_texture(psx_gpu, list[4] >> 16);
+        if ((simplified_count = prim_try_simplify_quad_t(siplified_prim, list)))
+        {
+          for (i = 0; i < simplified_count; i++) {
+            const u32 *list_ = &siplified_prim[i * 4];
+            textured_sprite(psx_gpu, list_, list_[3] & 0x3FF,
+              (list_[3] >> 16) & 0x1FF, &cpu_cycles_sum, &cpu_cycles);
+          }
+          break;
+        }
+
+        set_clut(psx_gpu, list[2] >> 16);
         set_triangle_color(psx_gpu, list[0] & 0xFFFFFF);
   
         get_vertex_data_xy_uv(0, 2);   
@@ -383,8 +410,19 @@ u32 gpu_parse(psx_gpu_struct *psx_gpu, u32 *list, u32 size,
   
       case 0x3C ... 0x3F:
       {
-        set_clut(psx_gpu, list_s16[5]);
-        set_texture(psx_gpu, list_s16[11]);
+        u32 i, simplified_count;
+        set_texture(psx_gpu, list[5] >> 16);
+        if ((simplified_count = prim_try_simplify_quad_gt(siplified_prim, list)))
+        {
+          for (i = 0; i < simplified_count; i++) {
+            const u32 *list_ = &siplified_prim[i * 4];
+            textured_sprite(psx_gpu, list_, list_[3] & 0x3FF,
+              (list_[3] >> 16) & 0x1FF, &cpu_cycles_sum, &cpu_cycles);
+          }
+          break;
+        }
+
+        set_clut(psx_gpu, list[2] >> 16);
   
         get_vertex_data_xy_uv_rgb(0, 0);
         get_vertex_data_xy_uv_rgb(1, 6);
@@ -525,23 +563,12 @@ u32 gpu_parse(psx_gpu_struct *psx_gpu, u32 *list, u32 size,
         gput_sum(cpu_cycles_sum, cpu_cycles, gput_sprite(width, height));
         break;
       }
-  
-      case 0x64 ... 0x67:
-      {        
-        u32 x = sign_extend_11bit(list_s16[2] + psx_gpu->offset_x);
-        u32 y = sign_extend_11bit(list_s16[3] + psx_gpu->offset_y);
-        u32 uv = list_s16[4];
-        s32 width = list_s16[6] & 0x3FF;
-        s32 height = list_s16[7] & 0x1FF;
-
-        set_clut(psx_gpu, list_s16[5]);
 
-        render_sprite(psx_gpu, x, y, uv & 0xFF, (uv >> 8) & 0xFF,
-           &width, &height, current_command, list[0]);
-        gput_sum(cpu_cycles_sum, cpu_cycles, gput_sprite(width, height));
+      case 0x64 ... 0x67:
+        textured_sprite(psx_gpu, list, list[3] & 0x3FF, (list[3] >> 16) & 0x1FF,
+          &cpu_cycles_sum, &cpu_cycles);
         break;
-      }
-  
+
       case 0x68 ... 0x6B:
       {
         s32 x = sign_extend_11bit(list_s16[2] + psx_gpu->offset_x);
@@ -565,22 +592,11 @@ u32 gpu_parse(psx_gpu_struct *psx_gpu, u32 *list, u32 size,
         gput_sum(cpu_cycles_sum, cpu_cycles, gput_sprite(width, height));
         break;
       }
-  
-      case 0x74 ... 0x77:
-      {        
-        s32 x = sign_extend_11bit(list_s16[2] + psx_gpu->offset_x);
-        s32 y = sign_extend_11bit(list_s16[3] + psx_gpu->offset_y);
-        u32 uv = list_s16[4];
-        s32 width = 8, height = 8;
 
-        set_clut(psx_gpu, list_s16[5]);
-
-        render_sprite(psx_gpu, x, y, uv & 0xFF, (uv >> 8) & 0xFF,
-           &width, &height, current_command, list[0]);
-        gput_sum(cpu_cycles_sum, cpu_cycles, gput_sprite(width, height));
+      case 0x74 ... 0x77:
+        textured_sprite(psx_gpu, list, 8, 8, &cpu_cycles_sum, &cpu_cycles);
         break;
-      }
-  
+
       case 0x78 ... 0x7B:
       {        
         s32 x = sign_extend_11bit(list_s16[2] + psx_gpu->offset_x);
@@ -594,19 +610,8 @@ u32 gpu_parse(psx_gpu_struct *psx_gpu, u32 *list, u32 size,
       }
   
       case 0x7C ... 0x7F:
-      {        
-        s32 x = sign_extend_11bit(list_s16[2] + psx_gpu->offset_x);
-        s32 y = sign_extend_11bit(list_s16[3] + psx_gpu->offset_y);
-        u32 uv = list_s16[4];
-        s32 width = 16, height = 16;
-
-        set_clut(psx_gpu, list_s16[5]);
-
-        render_sprite(psx_gpu, x, y, uv & 0xFF, (uv >> 8) & 0xFF,
-           &width, &height, current_command, list[0]);
-        gput_sum(cpu_cycles_sum, cpu_cycles, gput_sprite(width, height));
+        textured_sprite(psx_gpu, list, 16, 16, &cpu_cycles_sum, &cpu_cycles);
         break;
-      }
   
 #ifdef PCSX
       case 0x1F:                   //  irq?
@@ -1155,12 +1160,31 @@ static void do_sprite_enhanced(psx_gpu_struct *psx_gpu, int x, int y,
 }
 #endif
 
+static void textured_sprite_enh(psx_gpu_struct *psx_gpu, const u32 *list,
+  s32 width, s32 height, u32 *cpu_cycles_sum, u32 *cpu_cycles)
+{
+  s32 x = sign_extend_11bit(list[1] + psx_gpu->offset_x);
+  s32 y = sign_extend_11bit((list[1] >> 16) + psx_gpu->offset_y);
+  s32 width_b = width, height_b = height;
+  u8 v = (list[2] >> 8) & 0xff;
+  u8 u = list[2] & 0xff;
+
+  set_clut(psx_gpu, list[2] >> 16);
+
+  render_sprite(psx_gpu, x, y, u, v, &width, &height, list[0] >> 24, list[0]);
+  gput_sum(*cpu_cycles_sum, *cpu_cycles, gput_sprite(width, height));
+
+  if (check_enhanced_range(psx_gpu, x, x + width))
+    do_sprite_enhanced(psx_gpu, x, y, u, v, width_b, height_b, list[0]);
+}
+
 u32 gpu_parse_enhanced(psx_gpu_struct *psx_gpu, u32 *list, u32 size,
  s32 *cpu_cycles_sum_out, s32 *cpu_cycles_last, u32 *last_command)
 {
   vertex_struct vertexes[4] __attribute__((aligned(16))) = {};
   u32 current_command = 0, command_length;
   u32 cpu_cycles_sum = 0, cpu_cycles = *cpu_cycles_last;
+  u32 siplified_prim[4*4];
 
   u32 *list_start = list;
   u32 *list_end = list + (size / 4);
@@ -1265,8 +1289,19 @@ u32 gpu_parse_enhanced(psx_gpu_struct *psx_gpu, u32 *list, u32 size,
   
       case 0x2C ... 0x2F:
       {
-        set_clut(psx_gpu, list_s16[5]);
-        set_texture(psx_gpu, list_s16[9]);
+        u32 i, simplified_count;
+        set_texture(psx_gpu, list[4] >> 16);
+        if ((simplified_count = prim_try_simplify_quad_t(siplified_prim, list)))
+        {
+          for (i = 0; i < simplified_count; i++) {
+            const u32 *list_ = &siplified_prim[i * 4];
+            textured_sprite_enh(psx_gpu, list_, list_[3] & 0x3FF,
+              (list_[3] >> 16) & 0x1FF, &cpu_cycles_sum, &cpu_cycles);
+          }
+          break;
+        }
+
+        set_clut(psx_gpu, list[2] >> 16);
         set_triangle_color(psx_gpu, list[0] & 0xFFFFFF);
   
         get_vertex_data_xy_uv(0, 2);   
@@ -1318,8 +1353,19 @@ u32 gpu_parse_enhanced(psx_gpu_struct *psx_gpu, u32 *list, u32 size,
   
       case 0x3C ... 0x3F:
       {
-        set_clut(psx_gpu, list_s16[5]);
-        set_texture(psx_gpu, list_s16[11]);
+        u32 i, simplified_count;
+        set_texture(psx_gpu, list[5] >> 16);
+        if ((simplified_count = prim_try_simplify_quad_gt(siplified_prim, list)))
+        {
+          for (i = 0; i < simplified_count; i++) {
+            const u32 *list_ = &siplified_prim[i * 4];
+            textured_sprite_enh(psx_gpu, list_, list_[3] & 0x3FF,
+              (list_[3] >> 16) & 0x1FF, &cpu_cycles_sum, &cpu_cycles);
+          }
+          break;
+        }
+
+        set_clut(psx_gpu, list[2] >> 16);
   
         get_vertex_data_xy_uv_rgb(0, 0);
         get_vertex_data_xy_uv_rgb(1, 6);
@@ -1475,30 +1521,12 @@ u32 gpu_parse_enhanced(psx_gpu_struct *psx_gpu, u32 *list, u32 size,
         }
         break;
       }
-  
-      case 0x64 ... 0x67:
-      {        
-        u32 x = sign_extend_11bit(list_s16[2] + psx_gpu->offset_x);
-        u32 y = sign_extend_11bit(list_s16[3] + psx_gpu->offset_y);
-        u8 u = list_s16[4];
-        u8 v = list_s16[4] >> 8;
-        s32 width = list_s16[6] & 0x3FF;
-        s32 height = list_s16[7] & 0x1FF;
-
-        set_clut(psx_gpu, list_s16[5]);
 
-        render_sprite(psx_gpu, x, y, u, v,
-           &width, &height, current_command, list[0]);
-        gput_sum(cpu_cycles_sum, cpu_cycles, gput_sprite(width, height));
-
-        if (check_enhanced_range(psx_gpu, x, x + width)) {
-          width = list_s16[6] & 0x3FF;
-          height = list_s16[7] & 0x1FF;
-          do_sprite_enhanced(psx_gpu, x, y, u, v, width, height, list[0]);
-        }
+      case 0x64 ... 0x67:
+        textured_sprite_enh(psx_gpu, list, list[3] & 0x3FF, (list[3] >> 16) & 0x1FF,
+          &cpu_cycles_sum, &cpu_cycles);
         break;
-      }
-  
+
       case 0x68 ... 0x6B:
       {
         s32 x = sign_extend_11bit(list_s16[2] + psx_gpu->offset_x);
@@ -1528,26 +1556,11 @@ u32 gpu_parse_enhanced(psx_gpu_struct *psx_gpu, u32 *list, u32 size,
           do_sprite_enhanced(psx_gpu, x, y, 0, 0, 8, 8, list[0]);
         break;
       }
-  
-      case 0x74 ... 0x77:
-      {        
-        s32 x = sign_extend_11bit(list_s16[2] + psx_gpu->offset_x);
-        s32 y = sign_extend_11bit(list_s16[3] + psx_gpu->offset_y);
-        u8 u = list_s16[4];
-        u8 v = list_s16[4] >> 8;
-        s32 width = 8, height = 8;
 
-        set_clut(psx_gpu, list_s16[5]);
-
-        render_sprite(psx_gpu, x, y, u, v,
-           &width, &height, current_command, list[0]);
-        gput_sum(cpu_cycles_sum, cpu_cycles, gput_sprite(width, height));
-
-        if (check_enhanced_range(psx_gpu, x, x + 8))
-          do_sprite_enhanced(psx_gpu, x, y, u, v, 8, 8, list[0]);
+      case 0x74 ... 0x77:
+        textured_sprite_enh(psx_gpu, list, 8, 8, &cpu_cycles_sum, &cpu_cycles);
         break;
-      }
-  
+
       case 0x78 ... 0x7B:
       {        
         s32 x = sign_extend_11bit(list_s16[2] + psx_gpu->offset_x);
@@ -1562,25 +1575,10 @@ u32 gpu_parse_enhanced(psx_gpu_struct *psx_gpu, u32 *list, u32 size,
           do_sprite_enhanced(psx_gpu, x, y, 0, 0, 16, 16, list[0]);
         break;
       }
-  
-      case 0x7C ... 0x7F:
-      {        
-        s32 x = sign_extend_11bit(list_s16[2] + psx_gpu->offset_x);
-        s32 y = sign_extend_11bit(list_s16[3] + psx_gpu->offset_y);
-        u8 u = list_s16[4];
-        u8 v = list_s16[4] >> 8;
-        s32 width = 16, height = 16;
 
-        set_clut(psx_gpu, list_s16[5]);
-
-        render_sprite(psx_gpu, x, y, u, v,
-           &width, &height, current_command, list[0]);
-        gput_sum(cpu_cycles_sum, cpu_cycles, gput_sprite(width, height));
-
-        if (check_enhanced_range(psx_gpu, x, x + 16))
-          do_sprite_enhanced(psx_gpu, x, y, u, v, 16, 16, list[0]);
+      case 0x7C ... 0x7F:
+        textured_sprite_enh(psx_gpu, list, 16, 16, &cpu_cycles_sum, &cpu_cycles);
         break;
-      }
 
       case 0x80 ... 0x9F:          //  vid -> vid
       case 0xA0 ... 0xBF:          //  sys -> vid
diff --git a/plugins/gpu_unai/gpulib_if.cpp b/plugins/gpu_unai/gpulib_if.cpp
index c7169dd6..0f124f80 100644
--- a/plugins/gpu_unai/gpulib_if.cpp
+++ b/plugins/gpu_unai/gpulib_if.cpp
@@ -375,6 +375,36 @@ static void gpuGP0Cmd_0xEx(gpu_unai_t &gpu_unai, u32 cmd_word)
 #endif
 
 #include "../gpulib/gpu_timing.h"
+
+static inline void textured_sprite(int &cpu_cycles_sum, int &cpu_cycles)
+{
+  u32 PRIM = le32_to_u32(gpu_unai.PacketBuffer.U4[0]) >> 24;
+  gpuSetCLUT(le32_to_u32(gpu_unai.PacketBuffer.U4[2]) >> 16);
+  u32 driver_idx = Blending_Mode | gpu_unai.TEXT_MODE | gpu_unai.Masking | Blending | (gpu_unai.PixelMSB>>1);
+  s32 w = 0, h = 0;
+
+  //senquack - Only color 808080h-878787h allows skipping lighting calculation:
+  // This fixes Silent Hill running animation on loading screens:
+  // (On PSX, color values 0x00-0x7F darken the source texture's color,
+  //  0x81-FF lighten textures (ultimately clamped to 0x1F),
+  //  0x80 leaves source texture color unchanged, HOWEVER,
+  //   gpu_unai uses a simple lighting LUT whereby only the upper
+  //   5 bits of an 8-bit color are used, so 0x80-0x87 all behave as
+  //   0x80.
+  //
+  // NOTE: I've changed all textured sprite draw commands here and
+  //  elsewhere to use proper behavior, but left poly commands
+  //  alone, I don't want to slow rendering down too much. (TODO)
+  //if ((gpu_unai.PacketBuffer.U1[0]>0x5F) && (gpu_unai.PacketBuffer.U1[1]>0x5F) && (gpu_unai.PacketBuffer.U1[2]>0x5F))
+  // Strip lower 3 bits of each color and determine if lighting should be used:
+  if ((le32_raw(gpu_unai.PacketBuffer.U4[0]) & HTOLE32(0xF8F8F8)) != HTOLE32(0x808080))
+    driver_idx |= Lighting;
+  PS driver = gpuSpriteDrivers[driver_idx];
+  PtrUnion packet = { .ptr = (void*)&gpu_unai.PacketBuffer };
+  gpuDrawS(packet, driver, &w, &h);
+  gput_sum(cpu_cycles_sum, cpu_cycles, gput_sprite(w, h));
+}
+
 extern const unsigned char cmd_lengths[256];
 
 int do_cmd_list(u32 *list_, int list_len,
@@ -468,8 +498,20 @@ int do_cmd_list(u32 *list_, int list_len,
       case 0x2D:
       case 0x2E:
       case 0x2F: {          // Textured 4-pt poly
-        gpuSetCLUT   (le32_to_u32(gpu_unai.PacketBuffer.U4[2]) >> 16);
+        u32 simplified_count;
         gpuSetTexture(le32_to_u32(gpu_unai.PacketBuffer.U4[4]) >> 16);
+        if ((simplified_count = prim_try_simplify_quad_t(gpu_unai.PacketBuffer.U4,
+              gpu_unai.PacketBuffer.U4)))
+        {
+          for (i = 0;; ) {
+            textured_sprite(cpu_cycles_sum, cpu_cycles);
+            if (++i >= simplified_count)
+              break;
+            memcpy(&gpu_unai.PacketBuffer.U4[0], &gpu_unai.PacketBuffer.U4[i * 4], 16);
+          }
+          break;
+        }
+        gpuSetCLUT(le32_to_u32(gpu_unai.PacketBuffer.U4[2]) >> 16);
 
         u32 driver_idx =
           //(gpu_unai.blit_mask?1024:0) |
@@ -542,8 +584,20 @@ int do_cmd_list(u32 *list_, int list_len,
       case 0x3D:
       case 0x3E:
       case 0x3F: {          // Gouraud-shaded, textured 4-pt poly
-        gpuSetCLUT    (le32_to_u32(gpu_unai.PacketBuffer.U4[2]) >> 16);
-        gpuSetTexture (le32_to_u32(gpu_unai.PacketBuffer.U4[5]) >> 16);
+        u32 simplified_count;
+        gpuSetTexture(le32_to_u32(gpu_unai.PacketBuffer.U4[5]) >> 16);
+        if ((simplified_count = prim_try_simplify_quad_gt(gpu_unai.PacketBuffer.U4,
+              gpu_unai.PacketBuffer.U4)))
+        {
+          for (i = 0;; ) {
+            textured_sprite(cpu_cycles_sum, cpu_cycles);
+            if (++i >= simplified_count)
+              break;
+            memcpy(&gpu_unai.PacketBuffer.U4[0], &gpu_unai.PacketBuffer.U4[i * 4], 16);
+          }
+          break;
+        }
+        gpuSetCLUT(le32_to_u32(gpu_unai.PacketBuffer.U4[2]) >> 16);
         PP driver = gpuPolySpanDrivers[
           //(gpu_unai.blit_mask?1024:0) |
           Dithering |
@@ -651,31 +705,9 @@ int do_cmd_list(u32 *list_, int list_len,
       case 0x64:
       case 0x65:
       case 0x66:
-      case 0x67: {          // Textured rectangle (variable size)
-        gpuSetCLUT    (le32_to_u32(gpu_unai.PacketBuffer.U4[2]) >> 16);
-        u32 driver_idx = Blending_Mode | gpu_unai.TEXT_MODE | gpu_unai.Masking | Blending | (gpu_unai.PixelMSB>>1);
-        s32 w = 0, h = 0;
-
-        //senquack - Only color 808080h-878787h allows skipping lighting calculation:
-        // This fixes Silent Hill running animation on loading screens:
-        // (On PSX, color values 0x00-0x7F darken the source texture's color,
-        //  0x81-FF lighten textures (ultimately clamped to 0x1F),
-        //  0x80 leaves source texture color unchanged, HOWEVER,
-        //   gpu_unai uses a simple lighting LUT whereby only the upper
-        //   5 bits of an 8-bit color are used, so 0x80-0x87 all behave as
-        //   0x80.
-        // 
-        // NOTE: I've changed all textured sprite draw commands here and
-        //  elsewhere to use proper behavior, but left poly commands
-        //  alone, I don't want to slow rendering down too much. (TODO)
-        //if ((gpu_unai.PacketBuffer.U1[0]>0x5F) && (gpu_unai.PacketBuffer.U1[1]>0x5F) && (gpu_unai.PacketBuffer.U1[2]>0x5F))
-        // Strip lower 3 bits of each color and determine if lighting should be used:
-        if ((le32_raw(gpu_unai.PacketBuffer.U4[0]) & HTOLE32(0xF8F8F8)) != HTOLE32(0x808080))
-          driver_idx |= Lighting;
-        PS driver = gpuSpriteDrivers[driver_idx];
-        gpuDrawS(packet, driver, &w, &h);
-        gput_sum(cpu_cycles_sum, cpu_cycles, gput_sprite(w, h));
-      } break;
+      case 0x67:            // Textured rectangle (variable size)
+        textured_sprite(cpu_cycles_sum, cpu_cycles);
+        break;
 
       case 0x68:
       case 0x69:
@@ -704,18 +736,7 @@ int do_cmd_list(u32 *list_, int list_len,
       case 0x76:
       case 0x77: {          // Textured rectangle (8x8)
         gpu_unai.PacketBuffer.U4[3] = u32_to_le32(0x00080008);
-        gpuSetCLUT    (le32_to_u32(gpu_unai.PacketBuffer.U4[2]) >> 16);
-        u32 driver_idx = Blending_Mode | gpu_unai.TEXT_MODE | gpu_unai.Masking | Blending | (gpu_unai.PixelMSB>>1);
-        s32 w = 0, h = 0;
-
-        //senquack - Only color 808080h-878787h allows skipping lighting calculation:
-        //if ((gpu_unai.PacketBuffer.U1[0]>0x5F) && (gpu_unai.PacketBuffer.U1[1]>0x5F) && (gpu_unai.PacketBuffer.U1[2]>0x5F))
-        // Strip lower 3 bits of each color and determine if lighting should be used:
-        if ((le32_raw(gpu_unai.PacketBuffer.U4[0]) & HTOLE32(0xF8F8F8)) != HTOLE32(0x808080))
-          driver_idx |= Lighting;
-        PS driver = gpuSpriteDrivers[driver_idx];
-        gpuDrawS(packet, driver, &w, &h);
-        gput_sum(cpu_cycles_sum, cpu_cycles, gput_sprite(w, h));
+        textured_sprite(cpu_cycles_sum, cpu_cycles);
       } break;
 
       case 0x78:
@@ -734,17 +755,7 @@ int do_cmd_list(u32 *list_, int list_len,
       case 0x7E:
       case 0x7F: {          // Textured rectangle (16x16)
         gpu_unai.PacketBuffer.U4[3] = u32_to_le32(0x00100010);
-        gpuSetCLUT    (le32_to_u32(gpu_unai.PacketBuffer.U4[2]) >> 16);
-        u32 driver_idx = Blending_Mode | gpu_unai.TEXT_MODE | gpu_unai.Masking | Blending | (gpu_unai.PixelMSB>>1);
-        s32 w = 0, h = 0;
-        //senquack - Only color 808080h-878787h allows skipping lighting calculation:
-        //if ((gpu_unai.PacketBuffer.U1[0]>0x5F) && (gpu_unai.PacketBuffer.U1[1]>0x5F) && (gpu_unai.PacketBuffer.U1[2]>0x5F))
-        // Strip lower 3 bits of each color and determine if lighting should be used:
-        if ((le32_raw(gpu_unai.PacketBuffer.U4[0]) & HTOLE32(0xF8F8F8)) != HTOLE32(0x808080))
-          driver_idx |= Lighting;
-        PS driver = gpuSpriteDrivers[driver_idx];
-        gpuDrawS(packet, driver, &w, &h);
-        gput_sum(cpu_cycles_sum, cpu_cycles, gput_sprite(w, h));
+        textured_sprite(cpu_cycles_sum, cpu_cycles);
       } break;
 
 #ifdef TEST
diff --git a/plugins/gpulib/Makefile b/plugins/gpulib/Makefile
index cff61410..53aaa886 100644
--- a/plugins/gpulib/Makefile
+++ b/plugins/gpulib/Makefile
@@ -5,7 +5,7 @@ endif
 
 include ../../config.mak
 
-OBJS += gpu.o
+OBJS += gpu.o prim.o
 
 ifeq "$(ARCH)" "arm"
 OBJS += vout_pl.o
diff --git a/plugins/gpulib/gpu.c b/plugins/gpulib/gpu.c
index df1c1c6c..88aa6704 100644
--- a/plugins/gpulib/gpu.c
+++ b/plugins/gpulib/gpu.c
@@ -17,23 +17,11 @@
 #include "gpu_timing.h"
 #include "../../libpcsxcore/gpu.h" // meh
 #include "../../frontend/plugin_lib.h"
+#include "../../include/compiler_features.h"
 
 #ifndef ARRAY_SIZE
 #define ARRAY_SIZE(x) (sizeof(x) / sizeof((x)[0]))
 #endif
-#ifdef __GNUC__
-# define unlikely(x) __builtin_expect((x), 0)
-# define preload __builtin_prefetch
-# ifndef __clang__
-#  define noinline __attribute__((noinline,noclone))
-# else
-#  define noinline __attribute__((noinline))
-# endif
-#else
-# define unlikely(x)
-# define preload(...)
-# define noinline
-#endif
 
 //#define log_io gpu_log
 #define log_io(...)
diff --git a/plugins/gpulib/gpu.h b/plugins/gpulib/gpu.h
index e654500d..570d8421 100644
--- a/plugins/gpulib/gpu.h
+++ b/plugins/gpulib/gpu.h
@@ -147,6 +147,9 @@ void vout_update(void);
 void vout_blank(void);
 void vout_set_config(const struct rearmed_cbs *config);
 
+int  prim_try_simplify_quad_t (void *simplified, const void *prim);
+int  prim_try_simplify_quad_gt(void *simplified, const void *prim);
+
 /* listing these here for correct linkage if rasterizer uses c++ */
 struct GPUFreeze;
 
diff --git a/plugins/gpulib/prim.c b/plugins/gpulib/prim.c
new file mode 100644
index 00000000..d6294641
--- /dev/null
+++ b/plugins/gpulib/prim.c
@@ -0,0 +1,249 @@
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include "../../include/compiler_features.h"
+#include "gpu.h"
+
+// retain neon's ability to sample textures pixel-perfectly
+#ifdef GPU_NEON
+#define STRICT
+#endif
+
+struct vert_t
+{
+  union {
+    struct {
+      int16_t x, y;
+    };
+    uint32_t xy;
+  };
+  union {
+    struct {
+      uint8_t u, v;
+      int16_t clut;
+    };
+    uint32_t uvclut;
+  };
+};
+
+// gt ~ gouraud textured
+struct vert_gt
+{
+  uint32_t rgb;
+  struct vert_t t;
+};
+
+struct quad_t
+{
+  uint32_t rgb_c;
+  struct vert_t v[4];
+};
+
+struct quad_gt
+{
+  struct vert_gt v[4];
+};
+
+struct sprite
+{
+  uint32_t rgb_c;
+  union {
+    struct {
+      int16_t x, y;
+    };
+    uint32_t xy;
+  };
+  union {
+    struct {
+      uint8_t u, v;
+      int16_t clut;
+    };
+    uint32_t uvclut;
+  };
+  int16_t w, h;
+};
+
+// debug
+#if 0
+static void log_quad_t(const struct quad_t *q, int ret)
+{
+#if 1
+  printf("quad_t %08x", q->rgb_c);
+  int i;
+  for (i = 0; i < 4; i++)
+    printf(" | %3d,%3d %3d,%3d",
+        q->v[i].x, q->v[i].y, q->v[i].u, q->v[i].v);
+  printf(" -> %d\n", ret);
+#endif
+}
+
+static void log_quad_gt(const struct vert_gt *v, int ret)
+{
+#if 1
+  printf("quad_gt %02x", v[0].rgb >> 24);
+  int i;
+  for (i = 0; i < 4; i++)
+    printf(" | %3d,%3d %3d,%3d %06x",
+        v[i].t.x, v[i].t.y, v[i].t.u, v[i].t.v, v[i].rgb & 0xffffff);
+  printf(" -> %d\n", ret);
+#endif
+}
+
+int prim_try_simplify_quad_t_(void *simplified, const void *prim_);
+int prim_try_simplify_quad_t(void *simplified, const void *prim_)
+{
+  struct quad_t prim = *(struct quad_t *)prim_;
+  int ret = prim_try_simplify_quad_t_(simplified, prim_);
+  #define prim_try_simplify_quad_t prim_try_simplify_quad_t_
+  ///if (!ret)
+    log_quad_t(&prim, ret);
+  return ret;
+}
+
+int prim_try_simplify_quad_gt_(void *simplified, const void *prim_);
+int prim_try_simplify_quad_gt(void *simplified, const void *prim_)
+{
+  struct quad_gt prim = *(struct quad_gt *)prim_;
+  int ret = prim_try_simplify_quad_gt_(simplified, prim_);
+  #define prim_try_simplify_quad_gt prim_try_simplify_quad_gt_
+  ///if (!ret)
+    log_quad_gt(prim.v, ret);
+  return ret;
+}
+#endif // debug
+
+static noinline int simplify_quad_t(void *simplified, const struct vert_t *v,
+  int xd, int ud, int yd, int vd, uint32_t rgb_c, uint16_t clut)
+{
+  struct sprite *s = simplified;
+  int ret = 1;
+  rgb_c &= HTOLE32(0x03ffffff);
+  rgb_c |= HTOLE32(0x64000000);
+  xd = abs(xd);
+  ud = abs(ud);
+  s[0].rgb_c = rgb_c;
+  s[0].xy = v->xy;
+  s[0].u = v->u;
+  s[0].v = v->v;
+  s[0].clut = clut;
+  s[0].w = HTOLE16(xd);
+  s[0].h = HTOLE16(yd);
+#ifndef STRICT
+  if (xd != ud) {
+    int mid = xd / 2;
+    s[0].w = HTOLE16(mid);
+    s[1].rgb_c = rgb_c;
+    s[1].x = HTOLE16(LE16TOH(s[0].x) + mid);
+    s[1].y = s[0].y;
+    s[1].u = s[0].u + mid + ud - xd;
+    s[1].v = s[0].v;
+    s[1].clut = clut;
+    s[1].w = HTOLE16(xd - mid);
+    s[1].h = s[0].h;
+    ret = 2;
+  }
+  if (yd != vd) {
+    int i, mid = yd / 2, y = LE16TOH(s[0].y);
+    memcpy(s + ret, s, sizeof(s[0]) * ret);
+    for (i = 0; i < ret; i++) {
+      s[i].h = HTOLE16(mid);
+      s[ret+i].y = HTOLE16(y + mid);
+      s[ret+i].h = HTOLE16(yd - mid);
+      s[ret+i].v = s[0].v + mid + vd - yd;
+    }
+    ret *= 2;
+  }
+#endif
+  return ret;
+}
+
+// this is split to reduce gcc spilling
+static noinline int prim_try_simplify_quad_t2(void *simplified,
+  const struct vert_t *v, uint32_t rgb_c)
+{
+  do {
+    int yd = LE16TOH(v[2].y) - LE16TOH(v[0].y);
+    int xd, ud, vd;
+    if (yd < 0)
+      break;
+    xd = LE16TOH(v[1].x) - LE16TOH(v[0].x);
+    ud = LE16TOH(v[1].u) - LE16TOH(v[0].u);
+    vd = LE16TOH(v[2].v) - LE16TOH(v[0].v);
+#ifdef STRICT
+    if (xd != ud || yd != vd)
+#else
+    if (abs(xd - ud) > 1 || abs(yd - vd) > 1)
+#endif
+      break;
+    return simplify_quad_t(simplified, xd < 0 ? &v[1] : &v[0],
+             xd, ud, yd, vd, rgb_c, v[0].clut);
+  }
+  while (0);
+  return 0;
+}
+
+static noinline int prim_try_simplify_quad_gt2(void *simplified,
+  const struct vert_gt *v)
+{
+  do {
+    int yd = LE16TOH(v[2].t.y) - LE16TOH(v[0].t.y);
+    int xd, ud, vd;
+    if (yd < 0)
+      break;
+    xd = LE16TOH(v[1].t.x) - LE16TOH(v[0].t.x);
+    ud = LE16TOH(v[1].t.u) - LE16TOH(v[0].t.u);
+    vd = LE16TOH(v[2].t.v) - LE16TOH(v[0].t.v);
+#ifdef STRICT
+    if (xd != ud || yd != vd)
+#else
+    if (abs(xd - ud) > 1 || abs(yd - vd) > 1)
+#endif
+      break;
+    if (!(v[0].rgb & HTOLE32(1 << 24))) { // modulation/"lighting"
+      uint32_t i, xor = 0, rgb0 = v[0].rgb;
+      for (i = 1; i < 4; i++)
+        xor |= rgb0 ^ v[i].rgb;
+      if (xor & HTOLE32(0xf8f8f8))
+        break;
+    }
+    return simplify_quad_t(simplified, xd < 0 ? &v[1].t : &v[0].t,
+        xd, ud, yd, vd, v[0].rgb, v[0].t.clut);
+  }
+  while (0);
+  return 0;
+}
+
+// 2c-2f
+int prim_try_simplify_quad_t(void *simplified, const void *prim_)
+{
+  const struct quad_t *prim = prim_;
+  const struct vert_t *v = prim->v;
+  int ret = 0;
+  do {
+    if (v[0].y != v[1].y || v[0].x != v[2].x || v[2].y != v[3].y || v[1].x != v[3].x)
+      break;
+    if (v[0].v != v[1].v || v[0].u != v[2].u || v[2].v != v[3].v || v[1].u != v[3].u)
+      break;
+    ret = prim_try_simplify_quad_t2(simplified, v, prim->rgb_c);
+  }
+  while (0);
+  return ret;
+}
+
+// 3c-3f
+int prim_try_simplify_quad_gt(void *simplified, const void *prim)
+{
+  const struct vert_gt *v = prim;
+  int ret = 0;
+  do {
+    if (v[0].t.y != v[1].t.y || v[0].t.x != v[2].t.x || v[2].t.y != v[3].t.y || v[1].t.x != v[3].t.x)
+      break;
+    if (v[0].t.v != v[1].t.v || v[0].t.u != v[2].t.u || v[2].t.v != v[3].t.v || v[1].t.u != v[3].t.u)
+      break;
+    ret = prim_try_simplify_quad_gt2(simplified, v);
+  }
+  while (0);
+  return ret;
+}
+
+// vim:shiftwidth=2:expandtab