frontend: update libpicofe, fix missed callbacks
[pcsx_rearmed.git] / plugins / gpu_neon / psx_gpu / psx_gpu_parse.c
index af26fa3..66d2f40 100644 (file)
@@ -16,6 +16,7 @@
 
 #include "common.h"
 #include "../../gpulib/gpu_timing.h"
+#include "../../gpulib/gpu.h"
 
 #ifndef command_lengths
 const u8 command_lengths[256] =
@@ -39,7 +40,7 @@ const u8 command_lengths[256] =
 };
 #endif
 
-void update_texture_ptr(psx_gpu_struct *psx_gpu)
+static void update_texture_ptr(psx_gpu_struct *psx_gpu)
 {
   u8 *texture_base;
   u8 *texture_ptr;
@@ -91,7 +92,7 @@ void update_texture_ptr(psx_gpu_struct *psx_gpu)
   psx_gpu->texture_page_ptr = texture_ptr;  
 }
 
-void set_texture(psx_gpu_struct *psx_gpu, u32 texture_settings)
+static void set_texture(psx_gpu_struct *psx_gpu, u32 texture_settings)
 {
   texture_settings &= 0x1FF;
   if(psx_gpu->texture_settings != texture_settings)
@@ -107,14 +108,14 @@ void set_texture(psx_gpu_struct *psx_gpu, u32 texture_settings)
 
     psx_gpu->render_state_base = render_state_base;
 
-    psx_gpu->current_texture_mask = 0x1 << new_texture_page;
+    psx_gpu->current_texture_mask = 1u << new_texture_page;
 
     if(texture_mode == TEXTURE_MODE_8BPP)
     {     
       // In 8bpp mode 256x256 takes up two pages. If it's on the right edge it
       // wraps back around to the left edge.
       u32 adjacent_texture_page = ((texture_settings + 1) & 0xF) | (texture_settings & 0x10);
-      psx_gpu->current_texture_mask |= 0x1 << adjacent_texture_page;
+      psx_gpu->current_texture_mask |= 1u << adjacent_texture_page;
 
       if((psx_gpu->last_8bpp_texture_page ^ new_texture_page) & 0x1)
       {
@@ -135,17 +136,18 @@ void set_texture(psx_gpu_struct *psx_gpu, u32 texture_settings)
   }
 }
 
-void set_clut(psx_gpu_struct *psx_gpu, u32 clut_settings)
+static void set_clut(psx_gpu_struct *psx_gpu, u32 clut_settings)
 {
-  if(psx_gpu->clut_settings != clut_settings)
+  clut_settings &= 0x7FFF;
+  if (psx_gpu->clut_settings != clut_settings)
   {
     flush_render_block_buffer(psx_gpu);
     psx_gpu->clut_settings = clut_settings;
-    psx_gpu->clut_ptr = psx_gpu->vram_ptr + ((clut_settings & 0x7FFF) * 16);
+    psx_gpu->clut_ptr = psx_gpu->vram_ptr + clut_settings * 16;
   }
 }
 
-void set_triangle_color(psx_gpu_struct *psx_gpu, u32 triangle_color)
+static void set_triangle_color(psx_gpu_struct *psx_gpu, u32 triangle_color)
 {
   if(psx_gpu->triangle_color != triangle_color)
   {
@@ -200,21 +202,9 @@ static void do_fill(psx_gpu_struct *psx_gpu, u32 x, u32 y,
   }
 }
 
-#define sign_extend_12bit(value)                                               \
-  (((s32)((value) << 20)) >> 20)                                               \
-
-#define sign_extend_11bit(value)                                               \
-  (((s32)((value) << 21)) >> 21)                                               \
-
-#define sign_extend_10bit(value)                                               \
-  (((s32)((value) << 22)) >> 22)                                               \
-
-
 #define get_vertex_data_xy(vertex_number, offset16)                            \
-  vertexes[vertex_number].x =                                                  \
-   sign_extend_12bit(list_s16[offset16]) + psx_gpu->offset_x;                  \
-  vertexes[vertex_number].y =                                                  \
-   sign_extend_12bit(list_s16[(offset16) + 1]) + psx_gpu->offset_y;            \
+  vertexes[vertex_number].x = sign_extend_11bit(list_s16[offset16]);           \
+  vertexes[vertex_number].y = sign_extend_11bit(list_s16[(offset16) + 1]);     \
 
 #define get_vertex_data_uv(vertex_number, offset16)                            \
   vertexes[vertex_number].u = list_s16[offset16] & 0xFF;                       \
@@ -251,11 +241,58 @@ static void do_fill(psx_gpu_struct *psx_gpu, u32 x, u32 y,
 #define SET_Ex(r, v)
 #endif
 
+static void textured_sprite(psx_gpu_struct *psx_gpu, const u32 *list,
+  s32 width, s32 height, u32 *cpu_cycles_sum, u32 *cpu_cycles)
+{
+  s32 x = sign_extend_11bit(list[1] + psx_gpu->offset_x);
+  s32 y = sign_extend_11bit((list[1] >> 16) + psx_gpu->offset_y);
+  u8 v = (list[2] >> 8) & 0xff;
+  u8 u = list[2] & 0xff;
+
+  set_clut(psx_gpu, list[2] >> 16);
+
+  render_sprite(psx_gpu, x, y, u, v, &width, &height, list[0] >> 24, list[0]);
+  gput_sum(*cpu_cycles_sum, *cpu_cycles, gput_sprite(width, height));
+}
+
+static void undo_offset(vertex_struct *vertexes, prepared_triangle *triangle)
+{
+  s32 i;
+  for (i = 0; i < 3; i++)
+  {
+    vertexes[i].x -= triangle->offset_x;
+    vertexes[i].y -= triangle->offset_y;
+  }
+}
+
+static void do_triangle(psx_gpu_struct *psx_gpu, vertex_struct *vertexes,
+ u32 current_command)
+{
+  prepared_triangle triangle;
+  if (prepare_triangle(psx_gpu, vertexes, &triangle))
+    render_triangle_p(psx_gpu, triangle.vertexes, current_command);
+}
+
+static void do_quad(psx_gpu_struct *psx_gpu, vertex_struct *vertexes,
+ u32 current_command)
+{
+  prepared_triangle triangle;
+  if (prepare_triangle(psx_gpu, vertexes, &triangle))
+  {
+    render_triangle_p(psx_gpu, triangle.vertexes, current_command);
+    undo_offset(vertexes, &triangle);
+  }
+  if (prepare_triangle(psx_gpu, vertexes + 1, &triangle))
+    render_triangle_p(psx_gpu, triangle.vertexes, current_command);
+}
+
 u32 gpu_parse(psx_gpu_struct *psx_gpu, u32 *list, u32 size,
- s32 *cpu_cycles_out, u32 *last_command)
+ s32 *cpu_cycles_sum_out, s32 *cpu_cycles_last, u32 *last_command)
 {
   vertex_struct vertexes[4] __attribute__((aligned(16))) = {};
-  u32 current_command = 0, command_length, cpu_cycles = 0;
+  u32 current_command = 0, command_length;
+  u32 cpu_cycles_sum = 0, cpu_cycles = *cpu_cycles_last;
+  u32 simplified_prim[4*4];
 
   u32 *list_start = list;
   u32 *list_end = list + (size / 4);
@@ -284,7 +321,7 @@ u32 gpu_parse(psx_gpu_struct *psx_gpu, u32 *list, u32 size,
         u32 color = list[0] & 0xFFFFFF;
 
         do_fill(psx_gpu, x, y, width, height, color);
-        cpu_cycles += gput_fill(width, height);
+        gput_sum(cpu_cycles_sum, cpu_cycles, gput_fill(width, height));
         break;
       }
 
@@ -296,8 +333,8 @@ u32 gpu_parse(psx_gpu_struct *psx_gpu, u32 *list, u32 size,
         get_vertex_data_xy(1, 4);
         get_vertex_data_xy(2, 6);
           
-        render_triangle(psx_gpu, vertexes, current_command);
-        cpu_cycles += gput_poly_base();
+        do_triangle(psx_gpu, vertexes, current_command);
+        gput_sum(cpu_cycles_sum, cpu_cycles, gput_poly_base());
         break;
       }
   
@@ -311,8 +348,8 @@ u32 gpu_parse(psx_gpu_struct *psx_gpu, u32 *list, u32 size,
         get_vertex_data_xy_uv(1, 6);
         get_vertex_data_xy_uv(2, 10);
   
-        render_triangle(psx_gpu, vertexes, current_command);
-        cpu_cycles += gput_poly_base_t();
+        do_triangle(psx_gpu, vertexes, current_command);
+        gput_sum(cpu_cycles_sum, cpu_cycles, gput_poly_base_t());
         break;
       }
   
@@ -324,27 +361,37 @@ u32 gpu_parse(psx_gpu_struct *psx_gpu, u32 *list, u32 size,
         get_vertex_data_xy(1, 4);
         get_vertex_data_xy(2, 6);
         get_vertex_data_xy(3, 8);
-  
-        render_triangle(psx_gpu, vertexes, current_command);
-        render_triangle(psx_gpu, &(vertexes[1]), current_command);
-        cpu_cycles += gput_quad_base();
+
+        do_quad(psx_gpu, vertexes, current_command);
+        gput_sum(cpu_cycles_sum, cpu_cycles, gput_quad_base());
         break;
       }
   
       case 0x2C ... 0x2F:
       {
-        set_clut(psx_gpu, list_s16[5]);
-        set_texture(psx_gpu, list_s16[9]);
+        u32 i, simplified_count;
+        set_texture(psx_gpu, list[4] >> 16);
+        if (!(psx_gpu->render_state_base & RENDER_STATE_DITHER) &&
+            (simplified_count = prim_try_simplify_quad_t(simplified_prim, list)))
+        {
+          for (i = 0; i < simplified_count; i++) {
+            const u32 *list_ = &simplified_prim[i * 4];
+            textured_sprite(psx_gpu, list_, list_[3] & 0x3FF,
+              (list_[3] >> 16) & 0x1FF, &cpu_cycles_sum, &cpu_cycles);
+          }
+          break;
+        }
+
+        set_clut(psx_gpu, list[2] >> 16);
         set_triangle_color(psx_gpu, list[0] & 0xFFFFFF);
   
         get_vertex_data_xy_uv(0, 2);   
         get_vertex_data_xy_uv(1, 6);   
         get_vertex_data_xy_uv(2, 10);  
         get_vertex_data_xy_uv(3, 14);
-  
-        render_triangle(psx_gpu, vertexes, current_command);
-        render_triangle(psx_gpu, &(vertexes[1]), current_command);
-        cpu_cycles += gput_quad_base_t();
+
+        do_quad(psx_gpu, vertexes, current_command);
+        gput_sum(cpu_cycles_sum, cpu_cycles, gput_quad_base_t());
         break;
       }
   
@@ -354,8 +401,8 @@ u32 gpu_parse(psx_gpu_struct *psx_gpu, u32 *list, u32 size,
         get_vertex_data_xy_rgb(1, 4);
         get_vertex_data_xy_rgb(2, 8);
   
-        render_triangle(psx_gpu, vertexes, current_command);
-        cpu_cycles += gput_poly_base_g();
+        do_triangle(psx_gpu, vertexes, current_command);
+        gput_sum(cpu_cycles_sum, cpu_cycles, gput_poly_base_g());
         break;
       }
   
@@ -368,8 +415,8 @@ u32 gpu_parse(psx_gpu_struct *psx_gpu, u32 *list, u32 size,
         get_vertex_data_xy_uv_rgb(1, 6);
         get_vertex_data_xy_uv_rgb(2, 12);
 
-        render_triangle(psx_gpu, vertexes, current_command);
-        cpu_cycles += gput_poly_base_gt();
+        do_triangle(psx_gpu, vertexes, current_command);
+        gput_sum(cpu_cycles_sum, cpu_cycles, gput_poly_base_gt());
         break;
       }
   
@@ -379,26 +426,36 @@ u32 gpu_parse(psx_gpu_struct *psx_gpu, u32 *list, u32 size,
         get_vertex_data_xy_rgb(1, 4);
         get_vertex_data_xy_rgb(2, 8);
         get_vertex_data_xy_rgb(3, 12);
-  
-        render_triangle(psx_gpu, vertexes, current_command);
-        render_triangle(psx_gpu, &(vertexes[1]), current_command);
-        cpu_cycles += gput_quad_base_g();
+
+        do_quad(psx_gpu, vertexes, current_command);
+        gput_sum(cpu_cycles_sum, cpu_cycles, gput_quad_base_g());
         break;
       }
   
       case 0x3C ... 0x3F:
       {
-        set_clut(psx_gpu, list_s16[5]);
-        set_texture(psx_gpu, list_s16[11]);
+        u32 i, simplified_count;
+        set_texture(psx_gpu, list[5] >> 16);
+        if (!(psx_gpu->render_state_base & RENDER_STATE_DITHER) &&
+            (simplified_count = prim_try_simplify_quad_gt(simplified_prim, list)))
+        {
+          for (i = 0; i < simplified_count; i++) {
+            const u32 *list_ = &simplified_prim[i * 4];
+            textured_sprite(psx_gpu, list_, list_[3] & 0x3FF,
+              (list_[3] >> 16) & 0x1FF, &cpu_cycles_sum, &cpu_cycles);
+          }
+          break;
+        }
+
+        set_clut(psx_gpu, list[2] >> 16);
   
         get_vertex_data_xy_uv_rgb(0, 0);
         get_vertex_data_xy_uv_rgb(1, 6);
         get_vertex_data_xy_uv_rgb(2, 12);
         get_vertex_data_xy_uv_rgb(3, 18);
-  
-        render_triangle(psx_gpu, vertexes, current_command);
-        render_triangle(psx_gpu, &(vertexes[1]), current_command);
-        cpu_cycles += gput_quad_base_gt();
+
+        do_quad(psx_gpu, vertexes, current_command);
+        gput_sum(cpu_cycles_sum, cpu_cycles, gput_quad_base_gt());
         break;
       }
   
@@ -410,7 +467,7 @@ u32 gpu_parse(psx_gpu_struct *psx_gpu, u32 *list, u32 size,
         vertexes[1].y = list_s16[5] + psx_gpu->offset_y;
 
         render_line(psx_gpu, vertexes, current_command, list[0], 0);
-        cpu_cycles += gput_line(0);
+        gput_sum(cpu_cycles_sum, cpu_cycles, gput_line(0));
         break;
       }
   
@@ -432,7 +489,7 @@ u32 gpu_parse(psx_gpu_struct *psx_gpu, u32 *list, u32 size,
           vertexes[1].y = (xy >> 16) + psx_gpu->offset_y;
 
           render_line(psx_gpu, vertexes, current_command, list[0], 0);
-          cpu_cycles += gput_line(0);
+          gput_sum(cpu_cycles_sum, cpu_cycles, gput_line(0));
 
           list_position++;
           num_vertexes++;
@@ -467,7 +524,7 @@ u32 gpu_parse(psx_gpu_struct *psx_gpu, u32 *list, u32 size,
         vertexes[1].y = list_s16[7] + psx_gpu->offset_y;
 
         render_line(psx_gpu, vertexes, current_command, 0, 0);
-        cpu_cycles += gput_line(0);
+        gput_sum(cpu_cycles_sum, cpu_cycles, gput_line(0));
         break;
       }
  
@@ -498,7 +555,7 @@ u32 gpu_parse(psx_gpu_struct *psx_gpu, u32 *list, u32 size,
           vertexes[1].y = (xy >> 16) + psx_gpu->offset_y;
 
           render_line(psx_gpu, vertexes, current_command, 0, 0);
-          cpu_cycles += gput_line(0);
+          gput_sum(cpu_cycles_sum, cpu_cycles, gput_line(0));
 
           list_position += 2;
           num_vertexes++;
@@ -527,26 +584,15 @@ u32 gpu_parse(psx_gpu_struct *psx_gpu, u32 *list, u32 size,
 
         render_sprite(psx_gpu, x, y, 0, 0, &width, &height,
            current_command, list[0]);
-        cpu_cycles += gput_sprite(width, height);
+        gput_sum(cpu_cycles_sum, cpu_cycles, gput_sprite(width, height));
         break;
       }
-  
-      case 0x64 ... 0x67:
-      {        
-        u32 x = sign_extend_11bit(list_s16[2] + psx_gpu->offset_x);
-        u32 y = sign_extend_11bit(list_s16[3] + psx_gpu->offset_y);
-        u32 uv = list_s16[4];
-        s32 width = list_s16[6] & 0x3FF;
-        s32 height = list_s16[7] & 0x1FF;
-
-        set_clut(psx_gpu, list_s16[5]);
 
-        render_sprite(psx_gpu, x, y, uv & 0xFF, (uv >> 8) & 0xFF,
-           &width, &height, current_command, list[0]);
-        cpu_cycles += gput_sprite(width, height);
+      case 0x64 ... 0x67:
+        textured_sprite(psx_gpu, list, list[3] & 0x3FF, (list[3] >> 16) & 0x1FF,
+          &cpu_cycles_sum, &cpu_cycles);
         break;
-      }
-  
+
       case 0x68 ... 0x6B:
       {
         s32 x = sign_extend_11bit(list_s16[2] + psx_gpu->offset_x);
@@ -555,7 +601,7 @@ u32 gpu_parse(psx_gpu_struct *psx_gpu, u32 *list, u32 size,
 
         render_sprite(psx_gpu, x, y, 0, 0, &width, &height,
            current_command, list[0]);
-        cpu_cycles += gput_sprite(1, 1);
+        gput_sum(cpu_cycles_sum, cpu_cycles, gput_sprite(1, 1));
         break;
       }
   
@@ -567,25 +613,14 @@ u32 gpu_parse(psx_gpu_struct *psx_gpu, u32 *list, u32 size,
 
         render_sprite(psx_gpu, x, y, 0, 0, &width, &height,
            current_command, list[0]);
-        cpu_cycles += gput_sprite(width, height);
+        gput_sum(cpu_cycles_sum, cpu_cycles, gput_sprite(width, height));
         break;
       }
-  
-      case 0x74 ... 0x77:
-      {        
-        s32 x = sign_extend_11bit(list_s16[2] + psx_gpu->offset_x);
-        s32 y = sign_extend_11bit(list_s16[3] + psx_gpu->offset_y);
-        u32 uv = list_s16[4];
-        s32 width = 8, height = 8;
 
-        set_clut(psx_gpu, list_s16[5]);
-
-        render_sprite(psx_gpu, x, y, uv & 0xFF, (uv >> 8) & 0xFF,
-           &width, &height, current_command, list[0]);
-        cpu_cycles += gput_sprite(width, height);
+      case 0x74 ... 0x77:
+        textured_sprite(psx_gpu, list, 8, 8, &cpu_cycles_sum, &cpu_cycles);
         break;
-      }
-  
+
       case 0x78 ... 0x7B:
       {        
         s32 x = sign_extend_11bit(list_s16[2] + psx_gpu->offset_x);
@@ -594,24 +629,13 @@ u32 gpu_parse(psx_gpu_struct *psx_gpu, u32 *list, u32 size,
 
         render_sprite(psx_gpu, x, y, 0, 0, &width, &height,
            current_command, list[0]);
-        cpu_cycles += gput_sprite(width, height);
+        gput_sum(cpu_cycles_sum, cpu_cycles, gput_sprite(width, height));
         break;
       }
   
       case 0x7C ... 0x7F:
-      {        
-        s32 x = sign_extend_11bit(list_s16[2] + psx_gpu->offset_x);
-        s32 y = sign_extend_11bit(list_s16[3] + psx_gpu->offset_y);
-        u32 uv = list_s16[4];
-        s32 width = 16, height = 16;
-
-        set_clut(psx_gpu, list_s16[5]);
-
-        render_sprite(psx_gpu, x, y, uv & 0xFF, (uv >> 8) & 0xFF,
-           &width, &height, current_command, list[0]);
-        cpu_cycles += gput_sprite(width, height);
+        textured_sprite(psx_gpu, list, 16, 16, &cpu_cycles_sum, &cpu_cycles);
         break;
-      }
   
 #ifdef PCSX
       case 0x1F:                   //  irq?
@@ -661,7 +685,8 @@ u32 gpu_parse(psx_gpu_struct *psx_gpu, u32 *list, u32 size,
       case 0xE1:
         set_texture(psx_gpu, list[0]);
 
-        if(list[0] & (1 << 9))
+        if ((psx_gpu->allow_dithering && (list[0] & (1 << 9)))
+            || psx_gpu->force_dithering)
           psx_gpu->render_state_base |= RENDER_STATE_DITHER;
         else
           psx_gpu->render_state_base &= ~RENDER_STATE_DITHER;
@@ -754,10 +779,8 @@ u32 gpu_parse(psx_gpu_struct *psx_gpu, u32 *list, u32 size,
   
       case 0xE5:
       {
-        s32 offset_x = list[0] << 21;
-        s32 offset_y = list[0] << 10;
-        psx_gpu->offset_x = offset_x >> 21;
-        psx_gpu->offset_y = offset_y >> 21; 
+        psx_gpu->offset_x = sign_extend_11bit(list[0]);
+        psx_gpu->offset_y = sign_extend_11bit(list[0] >> 11);
   
         SET_Ex(5, list[0]);
         break;
@@ -789,7 +812,8 @@ u32 gpu_parse(psx_gpu_struct *psx_gpu, u32 *list, u32 size,
   }
 
 breakloop:
-  *cpu_cycles_out += cpu_cycles;
+  *cpu_cycles_sum_out += cpu_cycles_sum;
+  *cpu_cycles_last = cpu_cycles;
   *last_command = current_command;
   return list - list_start;
 }
@@ -908,6 +932,7 @@ static void select_enhancement_buf(psx_gpu_struct *psx_gpu)
   psx_gpu->viewport_start_y = psx_gpu->saved_viewport_start_y; \
   psx_gpu->viewport_end_x = psx_gpu->saved_viewport_end_x; \
   psx_gpu->viewport_end_y = psx_gpu->saved_viewport_end_y; \
+  psx_gpu->hacks_active = 0; \
   psx_gpu->uvrgb_phase = 0x8000; \
 }
 
@@ -922,7 +947,7 @@ static int enhancement_enable(psx_gpu_struct *psx_gpu)
   psx_gpu->viewport_end_y = psx_gpu->saved_viewport_end_y * 2 + 1;
   if (psx_gpu->viewport_end_x - psx_gpu->viewport_start_x + 1 > 1024)
     psx_gpu->viewport_end_x = psx_gpu->viewport_start_x + 1023;
-  psx_gpu->uvrgb_phase = 0x7fff;
+  //psx_gpu->uvrgb_phase = 0x7fff;
   return 1;
 }
 
@@ -1016,8 +1041,6 @@ void scale2x_tiles8(void *dst, const void *src, int w8, int h)
 }
 #endif
 
-static int disable_main_render;
-
 // simple check for a case where no clipping is used
 //  - now handled by adjusting the viewport
 static int check_enhanced_range(psx_gpu_struct *psx_gpu, int x, int y)
@@ -1025,102 +1048,66 @@ static int check_enhanced_range(psx_gpu_struct *psx_gpu, int x, int y)
   return 1;
 }
 
-static int is_in_array(int val, int array[], int len)
+static u32 uv_hack(psx_gpu_struct *psx_gpu, const vertex_struct *vertex_ptrs)
 {
-  int i;
-  for (i = 0; i < len; i++)
-    if (array[i] == val)
-      return 1;
-  return 0;
-}
-
-static int make_members_unique(int array[], int len)
-{
-  int i, j;
-  for (i = j = 1; i < len; i++)
-    if (!is_in_array(array[i], array, j))
-      array[j++] = array[i];
-
-  if (array[0] > array[1]) {
-    i = array[0]; array[0] = array[1]; array[1] = i;
-  }
-  return j;
-}
-
-static void patch_u(vertex_struct *vertex_ptrs, int count, int old, int new)
-{
-  int i;
-  for (i = 0; i < count; i++)
-    if (vertex_ptrs[i].u == old)
-      vertex_ptrs[i].u = new;
-}
-
-static void patch_v(vertex_struct *vertex_ptrs, int count, int old, int new)
-{
-  int i;
-  for (i = 0; i < count; i++)
-    if (vertex_ptrs[i].v == old)
-      vertex_ptrs[i].v = new;
-}
-
-static void uv_hack(vertex_struct *vertex_ptrs, int vertex_count)
-{
-  int i, u[4], v[4];
-
-  for (i = 0; i < vertex_count; i++) {
-    u[i] = vertex_ptrs[i].u;
-    v[i] = vertex_ptrs[i].v;
-  }
-  if (make_members_unique(u, vertex_count) == 2 && u[1] - u[0] >= 8) {
-    if ((u[0] & 7) == 7) {
-      patch_u(vertex_ptrs, vertex_count, u[0], u[0] + 1);
-      //printf("u hack: %3u-%3u -> %3u-%3u\n", u[0], u[1], u[0]+1, u[1]);
-    }
-    else if ((u[1] & 7) == 0 || u[1] - u[0] > 128) {
-      patch_u(vertex_ptrs, vertex_count, u[1], u[1] - 1);
-      //printf("u hack: %3u-%3u -> %3u-%3u\n", u[0], u[1], u[0], u[1]-1);
-    }
-  }
-  if (make_members_unique(v, vertex_count) == 2 && ((v[0] - v[1]) & 7) == 0) {
-    if ((v[0] & 7) == 7) {
-      patch_v(vertex_ptrs, vertex_count, v[0], v[0] + 1);
-      //printf("v hack: %3u-%3u -> %3u-%3u\n", v[0], v[1], v[0]+1, v[1]);
-    }
-    else if ((v[1] & 7) == 0) {
-      patch_v(vertex_ptrs, vertex_count, v[1], v[1] - 1);
-      //printf("v hack: %3u-%3u -> %3u-%3u\n", v[0], v[1], v[0], v[1]-1);
-    }
+  int i, have_right_edge = 0, have_bottom_edge = 0, bad_u = 0, bad_v = 0;
+  u32 hacks = 0;
+
+  for (i = 0; i < 3; i++) {
+    int j = (i + 1) % 3, k = (i + 2) % 3;
+    int du = abs((int)vertex_ptrs[i].u - (int)vertex_ptrs[j].u);
+    int dv = abs((int)vertex_ptrs[i].v - (int)vertex_ptrs[j].v);
+    if (du && (du & 7) != 7)
+      bad_u = 1;
+    if (dv && (dv & 7) != 7)
+      bad_v = 1;
+    if (vertex_ptrs[i].x == vertex_ptrs[j].x && vertex_ptrs[k].x < vertex_ptrs[j].x)
+      have_right_edge = 1;
+    if (vertex_ptrs[i].y == vertex_ptrs[j].y)// && vertex_ptrs[k].y < vertex_ptrs[j].y)
+      have_bottom_edge = 1;
   }
+  if (have_right_edge && bad_u)
+    hacks |= AHACK_TEXTURE_ADJ_U;
+  if (have_bottom_edge && bad_v)
+    hacks |= AHACK_TEXTURE_ADJ_V;
+  return hacks;
 }
 
 static void do_triangle_enhanced(psx_gpu_struct *psx_gpu,
  vertex_struct *vertexes, u32 current_command)
 {
-  vertex_struct *vertex_ptrs[3];
+  prepared_triangle triangle;
 
-  if (!prepare_triangle(psx_gpu, vertexes, vertex_ptrs))
+  if (!prepare_triangle(psx_gpu, vertexes, &triangle))
     return;
 
-  if (!disable_main_render)
-    render_triangle_p(psx_gpu, vertex_ptrs, current_command);
+  if (!psx_gpu->hack_disable_main)
+    render_triangle_p(psx_gpu, triangle.vertexes, current_command);
 
-  if (!check_enhanced_range(psx_gpu, vertex_ptrs[0]->x, vertex_ptrs[2]->x))
+  if (!check_enhanced_range(psx_gpu, triangle.vertexes[0]->x,
+        triangle.vertexes[2]->x))
     return;
 
   if (!enhancement_enable(psx_gpu))
     return;
 
-  shift_vertices3(vertex_ptrs);
+  if ((current_command & RENDER_FLAGS_TEXTURE_MAP) && psx_gpu->hack_texture_adj)
+    psx_gpu->hacks_active |= uv_hack(psx_gpu, vertexes);
+  shift_vertices3(triangle.vertexes);
   shift_triangle_area();
-  render_triangle_p(psx_gpu, vertex_ptrs, current_command);
-  unshift_vertices3(vertex_ptrs);
+  render_triangle_p(psx_gpu, triangle.vertexes, current_command);
+  //unshift_vertices3(triangle.vertexes);
 }
 
 static void do_quad_enhanced(psx_gpu_struct *psx_gpu, vertex_struct *vertexes,
  u32 current_command)
 {
+  s16 x12_save[2] = { vertexes[1].x, vertexes[2].x };
+  s16 y12_save[2] = { vertexes[1].y, vertexes[2].y };
   do_triangle_enhanced(psx_gpu, vertexes, current_command);
   enhancement_disable();
+  vertexes[1].x = x12_save[0], vertexes[2].x = x12_save[1];
+  vertexes[1].y = y12_save[0], vertexes[2].y = y12_save[1];
   do_triangle_enhanced(psx_gpu, &vertexes[1], current_command);
 }
 
@@ -1201,11 +1188,31 @@ static void do_sprite_enhanced(psx_gpu_struct *psx_gpu, int x, int y,
 }
 #endif
 
+static void textured_sprite_enh(psx_gpu_struct *psx_gpu, const u32 *list,
+  s32 width, s32 height, u32 *cpu_cycles_sum, u32 *cpu_cycles)
+{
+  s32 x = sign_extend_11bit(list[1] + psx_gpu->offset_x);
+  s32 y = sign_extend_11bit((list[1] >> 16) + psx_gpu->offset_y);
+  s32 width_b = width, height_b = height;
+  u8 v = (list[2] >> 8) & 0xff;
+  u8 u = list[2] & 0xff;
+
+  set_clut(psx_gpu, list[2] >> 16);
+
+  render_sprite(psx_gpu, x, y, u, v, &width, &height, list[0] >> 24, list[0]);
+  gput_sum(*cpu_cycles_sum, *cpu_cycles, gput_sprite(width, height));
+
+  if (check_enhanced_range(psx_gpu, x, x + width))
+    do_sprite_enhanced(psx_gpu, x, y, u, v, width_b, height_b, list[0]);
+}
+
 u32 gpu_parse_enhanced(psx_gpu_struct *psx_gpu, u32 *list, u32 size,
- s32 *cpu_cycles_out, u32 *last_command)
+ s32 *cpu_cycles_sum_out, s32 *cpu_cycles_last, u32 *last_command)
 {
   vertex_struct vertexes[4] __attribute__((aligned(16))) = {};
-  u32 current_command = 0, command_length, cpu_cycles = 0;
+  u32 current_command = 0, command_length;
+  u32 cpu_cycles_sum = 0, cpu_cycles = *cpu_cycles_last;
+  u32 simplified_prim[4*4];
 
   u32 *list_start = list;
   u32 *list_end = list + (size / 4);
@@ -1244,7 +1251,7 @@ u32 gpu_parse_enhanced(psx_gpu_struct *psx_gpu, u32 *list, u32 size,
 
         x &= ~0xF;
         width = ((width + 0xF) & ~0xF);
-        cpu_cycles += gput_fill(width, height);
+        gput_sum(cpu_cycles_sum, cpu_cycles, gput_fill(width, height));
         if (width == 0 || height == 0)
           break;
 
@@ -1275,7 +1282,7 @@ u32 gpu_parse_enhanced(psx_gpu_struct *psx_gpu, u32 *list, u32 size,
         get_vertex_data_xy(2, 6);
 
         do_triangle_enhanced(psx_gpu, vertexes, current_command);
-        cpu_cycles += gput_poly_base();
+        gput_sum(cpu_cycles_sum, cpu_cycles, gput_poly_base());
         break;
       }
   
@@ -1290,7 +1297,7 @@ u32 gpu_parse_enhanced(psx_gpu_struct *psx_gpu, u32 *list, u32 size,
         get_vertex_data_xy_uv(2, 10);
   
         do_triangle_enhanced(psx_gpu, vertexes, current_command);
-        cpu_cycles += gput_poly_base_t();
+        gput_sum(cpu_cycles_sum, cpu_cycles, gput_poly_base_t());
         break;
       }
   
@@ -1304,14 +1311,26 @@ u32 gpu_parse_enhanced(psx_gpu_struct *psx_gpu, u32 *list, u32 size,
         get_vertex_data_xy(3, 8);
 
         do_quad_enhanced(psx_gpu, vertexes, current_command);
-        cpu_cycles += gput_quad_base();
+        gput_sum(cpu_cycles_sum, cpu_cycles, gput_quad_base());
         break;
       }
   
       case 0x2C ... 0x2F:
       {
-        set_clut(psx_gpu, list_s16[5]);
-        set_texture(psx_gpu, list_s16[9]);
+        u32 i, simplified_count;
+        set_texture(psx_gpu, list[4] >> 16);
+        if (!(psx_gpu->render_state_base & RENDER_STATE_DITHER) &&
+            (simplified_count = prim_try_simplify_quad_t(simplified_prim, list)))
+        {
+          for (i = 0; i < simplified_count; i++) {
+            const u32 *list_ = &simplified_prim[i * 4];
+            textured_sprite_enh(psx_gpu, list_, list_[3] & 0x3FF,
+              (list_[3] >> 16) & 0x1FF, &cpu_cycles_sum, &cpu_cycles);
+          }
+          break;
+        }
+
+        set_clut(psx_gpu, list[2] >> 16);
         set_triangle_color(psx_gpu, list[0] & 0xFFFFFF);
   
         get_vertex_data_xy_uv(0, 2);   
@@ -1319,9 +1338,8 @@ u32 gpu_parse_enhanced(psx_gpu_struct *psx_gpu, u32 *list, u32 size,
         get_vertex_data_xy_uv(2, 10);  
         get_vertex_data_xy_uv(3, 14);
   
-        uv_hack(vertexes, 4);
         do_quad_enhanced(psx_gpu, vertexes, current_command);
-        cpu_cycles += gput_quad_base_t();
+        gput_sum(cpu_cycles_sum, cpu_cycles, gput_quad_base_t());
         break;
       }
   
@@ -1332,7 +1350,7 @@ u32 gpu_parse_enhanced(psx_gpu_struct *psx_gpu, u32 *list, u32 size,
         get_vertex_data_xy_rgb(2, 8);
   
         do_triangle_enhanced(psx_gpu, vertexes, current_command);
-        cpu_cycles += gput_poly_base_g();
+        gput_sum(cpu_cycles_sum, cpu_cycles, gput_poly_base_g());
         break;
       }
   
@@ -1346,7 +1364,7 @@ u32 gpu_parse_enhanced(psx_gpu_struct *psx_gpu, u32 *list, u32 size,
         get_vertex_data_xy_uv_rgb(2, 12);
 
         do_triangle_enhanced(psx_gpu, vertexes, current_command);
-        cpu_cycles += gput_poly_base_gt();
+        gput_sum(cpu_cycles_sum, cpu_cycles, gput_poly_base_gt());
         break;
       }
   
@@ -1358,23 +1376,34 @@ u32 gpu_parse_enhanced(psx_gpu_struct *psx_gpu, u32 *list, u32 size,
         get_vertex_data_xy_rgb(3, 12);
   
         do_quad_enhanced(psx_gpu, vertexes, current_command);
-        cpu_cycles += gput_quad_base_g();
+        gput_sum(cpu_cycles_sum, cpu_cycles, gput_quad_base_g());
         break;
       }
   
       case 0x3C ... 0x3F:
       {
-        set_clut(psx_gpu, list_s16[5]);
-        set_texture(psx_gpu, list_s16[11]);
+        u32 i, simplified_count;
+        set_texture(psx_gpu, list[5] >> 16);
+        if (!(psx_gpu->render_state_base & RENDER_STATE_DITHER) &&
+            (simplified_count = prim_try_simplify_quad_gt(simplified_prim, list)))
+        {
+          for (i = 0; i < simplified_count; i++) {
+            const u32 *list_ = &simplified_prim[i * 4];
+            textured_sprite_enh(psx_gpu, list_, list_[3] & 0x3FF,
+              (list_[3] >> 16) & 0x1FF, &cpu_cycles_sum, &cpu_cycles);
+          }
+          break;
+        }
+
+        set_clut(psx_gpu, list[2] >> 16);
   
         get_vertex_data_xy_uv_rgb(0, 0);
         get_vertex_data_xy_uv_rgb(1, 6);
         get_vertex_data_xy_uv_rgb(2, 12);
         get_vertex_data_xy_uv_rgb(3, 18);
 
-        uv_hack(vertexes, 4);
         do_quad_enhanced(psx_gpu, vertexes, current_command);
-        cpu_cycles += gput_quad_base_gt();
+        gput_sum(cpu_cycles_sum, cpu_cycles, gput_quad_base_gt());
         break;
       }
   
@@ -1388,7 +1417,7 @@ u32 gpu_parse_enhanced(psx_gpu_struct *psx_gpu, u32 *list, u32 size,
         render_line(psx_gpu, vertexes, current_command, list[0], 0);
         if (enhancement_enable(psx_gpu))
           render_line(psx_gpu, vertexes, current_command, list[0], 1);
-        cpu_cycles += gput_line(0);
+        gput_sum(cpu_cycles_sum, cpu_cycles, gput_line(0));
         break;
       }
   
@@ -1413,7 +1442,7 @@ u32 gpu_parse_enhanced(psx_gpu_struct *psx_gpu, u32 *list, u32 size,
           render_line(psx_gpu, vertexes, current_command, list[0], 0);
           if (enhancement_enable(psx_gpu))
             render_line(psx_gpu, vertexes, current_command, list[0], 1);
-          cpu_cycles += gput_line(0);
+          gput_sum(cpu_cycles_sum, cpu_cycles, gput_line(0));
 
           list_position++;
           num_vertexes++;
@@ -1450,7 +1479,7 @@ u32 gpu_parse_enhanced(psx_gpu_struct *psx_gpu, u32 *list, u32 size,
         render_line(psx_gpu, vertexes, current_command, 0, 0);
         if (enhancement_enable(psx_gpu))
           render_line(psx_gpu, vertexes, current_command, 0, 1);
-        cpu_cycles += gput_line(0);
+        gput_sum(cpu_cycles_sum, cpu_cycles, gput_line(0));
         break;
       }
  
@@ -1484,7 +1513,7 @@ u32 gpu_parse_enhanced(psx_gpu_struct *psx_gpu, u32 *list, u32 size,
           render_line(psx_gpu, vertexes, current_command, 0, 0);
           if (enhancement_enable(psx_gpu))
             render_line(psx_gpu, vertexes, current_command, 0, 1);
-          cpu_cycles += gput_line(0);
+          gput_sum(cpu_cycles_sum, cpu_cycles, gput_line(0));
 
           list_position += 2;
           num_vertexes++;
@@ -1513,33 +1542,21 @@ u32 gpu_parse_enhanced(psx_gpu_struct *psx_gpu, u32 *list, u32 size,
 
         render_sprite(psx_gpu, x, y, 0, 0, &width, &height,
            current_command, list[0]);
+        gput_sum(cpu_cycles_sum, cpu_cycles, gput_sprite(width, height));
 
-        if (check_enhanced_range(psx_gpu, x, x + width))
+        if (check_enhanced_range(psx_gpu, x, x + width)) {
+          width = list_s16[4] & 0x3FF;
+          height = list_s16[5] & 0x1FF;
           do_sprite_enhanced(psx_gpu, x, y, 0, 0, width, height, list[0]);
-        cpu_cycles += gput_sprite(width, height);
+        }
         break;
       }
-  
-      case 0x64 ... 0x67:
-      {        
-        u32 x = sign_extend_11bit(list_s16[2] + psx_gpu->offset_x);
-        u32 y = sign_extend_11bit(list_s16[3] + psx_gpu->offset_y);
-        u8 u = list_s16[4];
-        u8 v = list_s16[4] >> 8;
-        s32 width = list_s16[6] & 0x3FF;
-        s32 height = list_s16[7] & 0x1FF;
 
-        set_clut(psx_gpu, list_s16[5]);
-
-        render_sprite(psx_gpu, x, y, u, v,
-           &width, &height, current_command, list[0]);
-
-        if (check_enhanced_range(psx_gpu, x, x + width))
-          do_sprite_enhanced(psx_gpu, x, y, u, v, width, height, list[0]);
-        cpu_cycles += gput_sprite(width, height);
+      case 0x64 ... 0x67:
+        textured_sprite_enh(psx_gpu, list, list[3] & 0x3FF, (list[3] >> 16) & 0x1FF,
+          &cpu_cycles_sum, &cpu_cycles);
         break;
-      }
-  
+
       case 0x68 ... 0x6B:
       {
         s32 x = sign_extend_11bit(list_s16[2] + psx_gpu->offset_x);
@@ -1548,10 +1565,10 @@ u32 gpu_parse_enhanced(psx_gpu_struct *psx_gpu, u32 *list, u32 size,
 
         render_sprite(psx_gpu, x, y, 0, 0, &width, &height,
            current_command, list[0]);
+        gput_sum(cpu_cycles_sum, cpu_cycles, gput_sprite(1, 1));
 
         if (check_enhanced_range(psx_gpu, x, x + 1))
-          do_sprite_enhanced(psx_gpu, x, y, 0, 0, width, height, list[0]);
-        cpu_cycles += gput_sprite(1, 1);
+          do_sprite_enhanced(psx_gpu, x, y, 0, 0, 1, 1, list[0]);
         break;
       }
   
@@ -1563,32 +1580,17 @@ u32 gpu_parse_enhanced(psx_gpu_struct *psx_gpu, u32 *list, u32 size,
 
         render_sprite(psx_gpu, x, y, 0, 0, &width, &height,
            current_command, list[0]);
+        gput_sum(cpu_cycles_sum, cpu_cycles, gput_sprite(width, height));
 
         if (check_enhanced_range(psx_gpu, x, x + 8))
-          do_sprite_enhanced(psx_gpu, x, y, 0, 0, width, height, list[0]);
-        cpu_cycles += gput_sprite(width, height);
+          do_sprite_enhanced(psx_gpu, x, y, 0, 0, 8, 8, list[0]);
         break;
       }
-  
-      case 0x74 ... 0x77:
-      {        
-        s32 x = sign_extend_11bit(list_s16[2] + psx_gpu->offset_x);
-        s32 y = sign_extend_11bit(list_s16[3] + psx_gpu->offset_y);
-        u8 u = list_s16[4];
-        u8 v = list_s16[4] >> 8;
-        s32 width = 8, height = 8;
 
-        set_clut(psx_gpu, list_s16[5]);
-
-        render_sprite(psx_gpu, x, y, u, v,
-           &width, &height, current_command, list[0]);
-
-        if (check_enhanced_range(psx_gpu, x, x + 8))
-          do_sprite_enhanced(psx_gpu, x, y, u, v, width, height, list[0]);
-        cpu_cycles += gput_sprite(width, height);
+      case 0x74 ... 0x77:
+        textured_sprite_enh(psx_gpu, list, 8, 8, &cpu_cycles_sum, &cpu_cycles);
         break;
-      }
-  
+
       case 0x78 ... 0x7B:
       {        
         s32 x = sign_extend_11bit(list_s16[2] + psx_gpu->offset_x);
@@ -1597,31 +1599,16 @@ u32 gpu_parse_enhanced(psx_gpu_struct *psx_gpu, u32 *list, u32 size,
 
         render_sprite(psx_gpu, x, y, 0, 0, &width, &height,
            current_command, list[0]);
+        gput_sum(cpu_cycles_sum, cpu_cycles, gput_sprite(width, height));
 
         if (check_enhanced_range(psx_gpu, x, x + 16))
-          do_sprite_enhanced(psx_gpu, x, y, 0, 0, width, height, list[0]);
-        cpu_cycles += gput_sprite(width, height);
+          do_sprite_enhanced(psx_gpu, x, y, 0, 0, 16, 16, list[0]);
         break;
       }
-  
-      case 0x7C ... 0x7F:
-      {        
-        s32 x = sign_extend_11bit(list_s16[2] + psx_gpu->offset_x);
-        s32 y = sign_extend_11bit(list_s16[3] + psx_gpu->offset_y);
-        u8 u = list_s16[4];
-        u8 v = list_s16[4] >> 8;
-        s32 width = 16, height = 16;
-
-        set_clut(psx_gpu, list_s16[5]);
-
-        render_sprite(psx_gpu, x, y, u, v,
-           &width, &height, current_command, list[0]);
 
-        if (check_enhanced_range(psx_gpu, x, x + 16))
-          do_sprite_enhanced(psx_gpu, x, y, u, v, width, height, list[0]);
-        cpu_cycles += gput_sprite(width, height);
+      case 0x7C ... 0x7F:
+        textured_sprite_enh(psx_gpu, list, 16, 16, &cpu_cycles_sum, &cpu_cycles);
         break;
-      }
 
       case 0x80 ... 0x9F:          //  vid -> vid
       case 0xA0 ... 0xBF:          //  sys -> vid
@@ -1631,7 +1618,8 @@ u32 gpu_parse_enhanced(psx_gpu_struct *psx_gpu, u32 *list, u32 size,
       case 0xE1:
         set_texture(psx_gpu, list[0]);
 
-        if(list[0] & (1 << 9))
+        if ((psx_gpu->allow_dithering && (list[0] & (1 << 9)))
+            || psx_gpu->force_dithering)
           psx_gpu->render_state_base |= RENDER_STATE_DITHER;
         else
           psx_gpu->render_state_base &= ~RENDER_STATE_DITHER;
@@ -1736,10 +1724,8 @@ u32 gpu_parse_enhanced(psx_gpu_struct *psx_gpu, u32 *list, u32 size,
   
       case 0xE5:
       {
-        s32 offset_x = list[0] << 21;
-        s32 offset_y = list[0] << 10;
-        psx_gpu->offset_x = offset_x >> 21;
-        psx_gpu->offset_y = offset_y >> 21; 
+        psx_gpu->offset_x = sign_extend_11bit(list[0]);
+        psx_gpu->offset_y = sign_extend_11bit(list[0] >> 11);
   
         SET_Ex(5, list[0]);
         break;
@@ -1773,7 +1759,8 @@ u32 gpu_parse_enhanced(psx_gpu_struct *psx_gpu, u32 *list, u32 size,
   enhancement_disable();
 
 breakloop:
-  *cpu_cycles_out += cpu_cycles;
+  *cpu_cycles_sum_out += cpu_cycles_sum;
+  *cpu_cycles_last = cpu_cycles;
   *last_command = current_command;
   return list - list_start;
 }