gpu_neon: rework buffering to reduce flickering
[pcsx_rearmed.git] / plugins / gpu_neon / psx_gpu / psx_gpu_parse.c
index d3616bd..de227d5 100644 (file)
@@ -45,7 +45,6 @@ void update_texture_ptr(psx_gpu_struct *psx_gpu)
 
   switch((psx_gpu->render_state_base >> 8) & 0x3)
   {
-    default:
     case TEXTURE_MODE_4BPP:
       texture_base = psx_gpu->texture_4bpp_cache[psx_gpu->current_texture_page];
 
@@ -75,6 +74,7 @@ void update_texture_ptr(psx_gpu_struct *psx_gpu)
       texture_ptr += (psx_gpu->texture_window_y >> 4) << 12;
       break;
 
+    default:
     case TEXTURE_MODE_16BPP:
       texture_base = (u8 *)(psx_gpu->vram_ptr);
       texture_base += (psx_gpu->current_texture_page & 0xF) * 128;
@@ -92,6 +92,7 @@ void update_texture_ptr(psx_gpu_struct *psx_gpu)
 
 void set_texture(psx_gpu_struct *psx_gpu, u32 texture_settings)
 {
+  texture_settings &= 0x1FF;
   if(psx_gpu->texture_settings != texture_settings)
   {
     u32 new_texture_page = texture_settings & 0x1F;
@@ -249,10 +250,9 @@ static void do_fill(psx_gpu_struct *psx_gpu, u32 x, u32 y,
 #define SET_Ex(r, v)
 #endif
 
-vertex_struct vertexes[4] __attribute__((aligned(32)));
-
 u32 gpu_parse(psx_gpu_struct *psx_gpu, u32 *list, u32 size, u32 *last_command)
 {
+  vertex_struct vertexes[4] __attribute__((aligned(16))) = {};
   u32 current_command = 0, command_length;
 
   u32 *list_start = list;
@@ -407,7 +407,7 @@ u32 gpu_parse(psx_gpu_struct *psx_gpu, u32 *list, u32 size, u32 *last_command)
         vertexes[1].x = list_s16[4] + psx_gpu->offset_x;
         vertexes[1].y = list_s16[5] + psx_gpu->offset_y;
 
-        render_line(psx_gpu, vertexes, current_command, list[0]);
+        render_line(psx_gpu, vertexes, current_command, list[0], 0);
                        break;
       }
   
@@ -428,13 +428,16 @@ u32 gpu_parse(psx_gpu_struct *psx_gpu, u32 *list, u32 size, u32 *last_command)
           vertexes[1].x = (xy & 0xFFFF) + psx_gpu->offset_x;
           vertexes[1].y = (xy >> 16) + psx_gpu->offset_y;
 
-          render_line(psx_gpu, vertexes, current_command, list[0]);
+          render_line(psx_gpu, vertexes, current_command, list[0], 0);
 
           list_position++;
           num_vertexes++;
 
           if(list_position >= list_end)
-            break;
+          {
+            current_command = (u32)-1;
+            goto breakloop;
+          }
 
           xy = *list_position;
           if((xy & 0xF000F000) == 0x50005000)
@@ -459,7 +462,7 @@ u32 gpu_parse(psx_gpu_struct *psx_gpu, u32 *list, u32 size, u32 *last_command)
         vertexes[1].x = list_s16[6] + psx_gpu->offset_x;
         vertexes[1].y = list_s16[7] + psx_gpu->offset_y;
 
-        render_line(psx_gpu, vertexes, current_command, 0);
+        render_line(psx_gpu, vertexes, current_command, 0, 0);
                        break;
       }
  
@@ -489,13 +492,16 @@ u32 gpu_parse(psx_gpu_struct *psx_gpu, u32 *list, u32 size, u32 *last_command)
           vertexes[1].x = (xy & 0xFFFF) + psx_gpu->offset_x;
           vertexes[1].y = (xy >> 16) + psx_gpu->offset_y;
 
-          render_line(psx_gpu, vertexes, current_command, 0);
+          render_line(psx_gpu, vertexes, current_command, 0, 0);
 
           list_position += 2;
           num_vertexes++;
 
           if(list_position >= list_end)
-            break;
+          {
+            current_command = (u32)-1;
+            goto breakloop;
+          }
 
           color = list_position[0];
           if((color & 0xF000F000) == 0x50005000)
@@ -600,18 +606,29 @@ u32 gpu_parse(psx_gpu_struct *psx_gpu, u32 *list, u32 size, u32 *last_command)
                        break;
       }
   
-               case 0x80:          //  vid -> vid
-        render_block_move(psx_gpu, list_s16[2] & 0x3FF, list_s16[3] & 0x1FF,
-         list_s16[4] & 0x3FF, list_s16[5] & 0x1FF,
-         ((list_s16[6] - 1) & 0x3FF) + 1, ((list_s16[7] - 1) & 0x1FF) + 1);
-                       break;
 #ifdef PCSX
-               case 0xA0:          //  sys -> vid
-               case 0xC0:          //  vid -> sys
-                       goto breakloop;
+      case 0x80 ... 0x9F:          //  vid -> vid
+      case 0xA0 ... 0xBF:          //  sys -> vid
+      case 0xC0 ... 0xDF:          //  vid -> sys
+        goto breakloop;
 #else
-               case 0xA0:          //  sys -> vid
+      case 0x80 ... 0x9F:          //  vid -> vid
+      {
+        u32 sx = list_s16[2] & 0x3FF;
+        u32 sy = list_s16[3] & 0x1FF;
+        u32 dx = list_s16[4] & 0x3FF;
+        u32 dy = list_s16[5] & 0x1FF;
+        u32 w = ((list_s16[6] - 1) & 0x3FF) + 1;
+        u32 h = ((list_s16[7] - 1) & 0x1FF) + 1;
+
+        if (sx == dx && sy == dy && psx_gpu->mask_msb == 0)
+          break;
+
+        render_block_move(psx_gpu, sx, sy, dx, dy, w, h);
+        break;
+      } 
+
+      case 0xA0 ... 0xBF:          //  sys -> vid
       {
         u32 load_x = list_s16[2] & 0x3FF;
         u32 load_y = list_s16[3] & 0x1FF;
@@ -629,12 +646,12 @@ u32 gpu_parse(psx_gpu_struct *psx_gpu, u32 *list, u32 size, u32 *last_command)
                        break;
       }
 
-               case 0xC0:          //  vid -> sys
-                       break;
+      case 0xC0 ... 0xDF:          //  vid -> sys
+        break;
 #endif
 
                case 0xE1:
-        set_texture(psx_gpu, list[0] & 0x1FF);
+        set_texture(psx_gpu, list[0]);
 
         if(list[0] & (1 << 9))
           psx_gpu->render_state_base |= RENDER_STATE_DITHER;
@@ -677,11 +694,21 @@ u32 gpu_parse(psx_gpu_struct *psx_gpu, u32 *list, u32 size, u32 *last_command)
         }
         SET_Ex(2, list[0]);
         break;
-               }
+      }
+
+      case 0xE3:
+      {
+        s16 viewport_start_x = list[0] & 0x3FF;
+        s16 viewport_start_y = (list[0] >> 10) & 0x1FF;
+
+        if(viewport_start_x == psx_gpu->viewport_start_x &&
+         viewport_start_y == psx_gpu->viewport_start_y)
+        {
+          break;
+        }
   
-               case 0xE3:
-        psx_gpu->viewport_start_x = list[0] & 0x3FF;
-        psx_gpu->viewport_start_y = (list[0] >> 10) & 0x1FF;
+        psx_gpu->viewport_start_x = viewport_start_x;
+        psx_gpu->viewport_start_y = viewport_start_y;
 
 #ifdef TEXTURE_CACHE_4BPP
         psx_gpu->viewport_mask =
@@ -689,12 +716,23 @@ u32 gpu_parse(psx_gpu_struct *psx_gpu, u32 *list, u32 size, u32 *last_command)
          psx_gpu->viewport_start_y, psx_gpu->viewport_end_x,
          psx_gpu->viewport_end_y);
 #endif
-                       SET_Ex(3, list[0]);
-                       break;
-  
-               case 0xE4:
-        psx_gpu->viewport_end_x = list[0] & 0x3FF;
-        psx_gpu->viewport_end_y = (list[0] >> 10) & 0x1FF;
+        SET_Ex(3, list[0]);
+        break;
+      }
+
+      case 0xE4:
+      {
+        s16 viewport_end_x = list[0] & 0x3FF;
+        s16 viewport_end_y = (list[0] >> 10) & 0x1FF;
+
+        if(viewport_end_x == psx_gpu->viewport_end_x &&
+         viewport_end_y == psx_gpu->viewport_end_y)
+        {
+          break;
+        }
+
+        psx_gpu->viewport_end_x = viewport_end_x;
+        psx_gpu->viewport_end_y = viewport_end_y;
 
 #ifdef TEXTURE_CACHE_4BPP
         psx_gpu->viewport_mask =
@@ -702,10 +740,11 @@ u32 gpu_parse(psx_gpu_struct *psx_gpu, u32 *list, u32 size, u32 *last_command)
          psx_gpu->viewport_start_y, psx_gpu->viewport_end_x,
          psx_gpu->viewport_end_y);
 #endif
-                       SET_Ex(4, list[0]);
-                       break;
+        SET_Ex(4, list[0]);
+        break;
+      }
   
-               case 0xE5:
+      case 0xE5:
       {
         s32 offset_x = list[0] << 21;
         s32 offset_y = list[0] << 10;
@@ -741,55 +780,311 @@ u32 gpu_parse(psx_gpu_struct *psx_gpu, u32 *list, u32 size, u32 *last_command)
        }
   }
 
-#ifdef PCSX
 breakloop:
-#endif
   if (last_command != NULL)
     *last_command = current_command;
   return list - list_start;
 }
 
+#ifdef PCSX
+
+/* Register the display area (x, y, w, h) as an enhancement-buffer scanout.
+ * (Original author's remark: this thing has become such a PITA, should just
+ * handle the 2048 width really.)
+ *
+ * Steps, as implemented below:
+ *  - store w in psx_gpu->saved_hres;
+ *  - return early when a slot with the same origin already exists and the
+ *    size test against tol_x / tol_y passes;
+ *  - otherwise evict the first slot that overlaps the new rectangle (small
+ *    overlaps are resolved by shrinking the existing slot instead), or, when
+ *    nothing was evicted, take the next slot in round-robin order;
+ *  - write the rectangle into the chosen slot and call
+ *    sync_enhancement_buffers() on it.
+ */
+static void update_enhancement_buf_scanouts(psx_gpu_struct *psx_gpu,
+    int x, int y, int w, int h)
+{
+  int max_bufs = ARRAY_SIZE(psx_gpu->enhancement_scanouts);
+  struct psx_gpu_scanout *s;
+  int i, sel, right, bottom;
+  u32 tol_x = 48, tol_y = 16; // overlap / size tolerances used by the tests below
+  u32 intersection;
+
+  //w = (w + 15) & ~15;
+  psx_gpu->saved_hres = w;
+  assert(!(max_bufs & (max_bufs - 1))); // power of two: (max_bufs - 1) is used as a wrap mask below
+  for (i = 0; i < max_bufs; i++) {
+    s = &psx_gpu->enhancement_scanouts[i];
+    // NOTE(review): tol_x / tol_y are u32, so presumably a negative difference wraps and fails this test - confirm against the psx_gpu_scanout field types
+    if (s->x == x && s->y == y && w - s->w <= tol_x && h - s->h <= tol_y)
+      return;
+  }
+
+  /* Evict the first scanout that intersects the new rectangle; scanouts lying
+   * entirely outside it are skipped by the four tests below. */
+  right = x + w;
+  bottom = y + h;
+  for (i = 0, sel = -1; i < max_bufs; i++) {
+    s = &psx_gpu->enhancement_scanouts[i];
+    if (s->x >= right) continue;
+    if (s->x + s->w <= x) continue;
+    if (s->y >= bottom) continue;
+    if (s->y + s->h <= y) continue;
+    /* ... but allow a small intersection that some games do: when the existing
+     * scanout reaches 1 .. tol_x + 1 pixels past the new left edge (or
+     * 1 .. tol_y + 1 past the new top edge) it is shrunk instead of evicted. */
+    if ((intersection = s->x + s->w - x) - 1u <= tol_x) {
+      s->w -= intersection;
+      continue;
+    }
+    if ((intersection = s->y + s->h - y) - 1u <= tol_y) {
+      s->h -= intersection;
+      continue;
+    }
+    //printf("%4d%4d%4dx%d evicted\n", s->x, s->y, s->w, s->h);
+    s->w = 0; // a zero width marks the slot as empty (see the !s->w test below)
+    sel = i;
+    break;
+  }
+  if (sel >= 0) {
+    /* 2nd intersection check: trim the new rectangle when its right / bottom
+     * edge reaches 1 .. tol + 1 pixels past the left / top edge of a slot that
+     * is still in use; only the first such slot is considered. */
+    for (i = 0; i < max_bufs; i++) {
+      s = &psx_gpu->enhancement_scanouts[i];
+      if (!s->w)
+        continue;
+      if ((intersection = right - s->x) - 1u <= tol_x) {
+        w -= intersection;
+        break;
+      }
+      if ((intersection = bottom - s->y) - 1u <= tol_y) {
+        h -= intersection;
+        break;
+      }
+    }
+  }
+  else
+    sel = psx_gpu->enhancement_scanout_eselect++; // nothing evicted: round-robin slot
+  psx_gpu->enhancement_scanout_eselect &= max_bufs - 1;
+  s = &psx_gpu->enhancement_scanouts[sel];
+  s->x = x;
+  s->y = y;
+  s->w = w;
+  s->h = h;
+
+  sync_enhancement_buffers(x, y, w, h);
+#if 0
+  printf("scanouts:\n");
+  for (i = 0; i < ARRAY_SIZE(psx_gpu->enhancement_scanouts); i++) {
+    s = &psx_gpu->enhancement_scanouts[i];
+    if (s->w)
+      printf("%4d%4d%4dx%d\n", s->x, s->y, s->w, s->h);
+  }
+#endif
+}
+/*
+ * Return the index of the first enhancement scanout whose rectangle
+ * [s->x, s->x + s->w) x [s->y, s->y + s->h) contains the point (x, y),
+ * or -1 when no scanout contains it.
+ */
+static int select_enhancement_buf_index(psx_gpu_struct *psx_gpu, s32 x, s32 y)
+{
+  int i;
+  for (i = 0; i < ARRAY_SIZE(psx_gpu->enhancement_scanouts); i++) {
+    const struct psx_gpu_scanout *s = &psx_gpu->enhancement_scanouts[i];
+    if (s->x <= x && x < s->x + s->w &&
+        s->y <= y && y < s->y + s->h)
+      return i;
+  }
+  return -1;
+}
+/*
+ * Address of enhancement buffer number i_: psx_gpu_->enhancement_buf_ptr
+ * advanced by (i_ << 20). The unit of that offset follows the pointer's
+ * declared type, which is not visible in this file section.
+ */
+#define select_enhancement_buf_by_index(psx_gpu_, i_) \
+  ((psx_gpu_)->enhancement_buf_ptr + ((i_) << 20))
+/*
+ * Return the enhancement buffer that belongs to the scanout containing the
+ * point (x, y), or NULL when select_enhancement_buf_index() finds none.
+ */
+static void *select_enhancement_buf_ptr(psx_gpu_struct *psx_gpu, s32 x, s32 y)
+{
+  int i = select_enhancement_buf_index(psx_gpu, x, y);
+  return i >= 0 ? select_enhancement_buf_by_index(psx_gpu, i) : NULL;
+}
+/*
+ * Update psx_gpu->enhancement_current_buf_ptr from the saved viewport start
+ * position; the pointer becomes NULL when no scanout contains that point.
+ */
+static void select_enhancement_buf(psx_gpu_struct *psx_gpu)
+{
+  s32 x = psx_gpu->saved_viewport_start_x;
+  s32 y = psx_gpu->saved_viewport_start_y;
+  psx_gpu->enhancement_current_buf_ptr = select_enhancement_buf_ptr(psx_gpu, x, y);
+}
+/*
+ * Switch rendering back to the normal target: vram_out_ptr is reset to
+ * vram_ptr, the four viewport bounds are restored from their saved_* copies
+ * and uvrgb_phase is set to 0x8000 (enhancement_enable() uses 0x7fff).
+ * Expands to a brace block that refers to a variable named psx_gpu, which
+ * must be in scope at the point of use.
+ */
 #define enhancement_disable() { \
   psx_gpu->vram_out_ptr = psx_gpu->vram_ptr; \
   psx_gpu->viewport_start_x = psx_gpu->saved_viewport_start_x; \
   psx_gpu->viewport_start_y = psx_gpu->saved_viewport_start_y; \
   psx_gpu->viewport_end_x = psx_gpu->saved_viewport_end_x; \
   psx_gpu->viewport_end_y = psx_gpu->saved_viewport_end_y; \
-  psx_gpu->render_mode &= ~RENDER_DOUBLE_MODE; \
+  psx_gpu->uvrgb_phase = 0x8000; \
 }
 
-#define enhancement_enable() { \
-  psx_gpu->vram_out_ptr = psx_gpu->enhancement_buf_ptr; \
-  psx_gpu->viewport_start_x = psx_gpu->saved_viewport_start_x * 2; \
-  psx_gpu->viewport_start_y = psx_gpu->saved_viewport_start_y * 2; \
-  psx_gpu->viewport_end_x = psx_gpu->saved_viewport_end_x * 2; \
-  psx_gpu->viewport_end_y = psx_gpu->saved_viewport_end_y * 2; \
-  psx_gpu->render_mode |= RENDER_DOUBLE_MODE; \
+static int enhancement_enable(psx_gpu_struct *psx_gpu)
+{ /* Redirect rendering to the currently selected enhancement buffer.
+   *
+   * Returns 0 and changes nothing when enhancement_current_buf_ptr is NULL.
+   * Otherwise points vram_out_ptr at that buffer, derives the viewport from
+   * the saved one (start * 2, end * 2 + 1), limits the viewport width to
+   * 1024, sets uvrgb_phase to 0x7fff and returns 1.
+   */
+  if (!psx_gpu->enhancement_current_buf_ptr)
+    return 0;
+  psx_gpu->vram_out_ptr = psx_gpu->enhancement_current_buf_ptr;
+  psx_gpu->viewport_start_x = psx_gpu->saved_viewport_start_x * 2;
+  psx_gpu->viewport_start_y = psx_gpu->saved_viewport_start_y * 2;
+  psx_gpu->viewport_end_x = psx_gpu->saved_viewport_end_x * 2 + 1; // end bounds are inclusive, hence the + 1 after doubling
+  psx_gpu->viewport_end_y = psx_gpu->saved_viewport_end_y * 2 + 1;
+  if (psx_gpu->viewport_end_x - psx_gpu->viewport_start_x + 1 > 1024) // clamp the doubled width to 1024
+    psx_gpu->viewport_end_x = psx_gpu->viewport_start_x + 1023;
+  psx_gpu->uvrgb_phase = 0x7fff;
+  return 1;
 }
 
 /* Double the x and y coordinates of the three vertices that v[0], v[1] and
  * v[2] point to; this change replaces "*= 2" with "<<= 1".
  * NOTE(review): left-shifting a negative signed value is undefined in ISO C;
  * confirm the coordinate type and the compiler's guarantees. */
 #define shift_vertices3(v) { \
-  v[0]->x *= 2; \
-  v[0]->y *= 2; \
-  v[1]->x *= 2; \
-  v[1]->y *= 2; \
-  v[2]->x *= 2; \
-  v[2]->y *= 2; \
+  v[0]->x <<= 1; \
+  v[0]->y <<= 1; \
+  v[1]->x <<= 1; \
+  v[1]->y <<= 1; \
+  v[2]->x <<= 1; \
+  v[2]->y <<= 1; \
 }
 
 /* Halve the x and y coordinates of the three vertices that v[0], v[1] and
  * v[2] point to, undoing shift_vertices3(); this change replaces "/= 2" with
  * ">>= 1".
  * NOTE(review): for negative signed values ">>" is implementation-defined and
  * usually rounds toward negative infinity, whereas "/ 2" truncates toward
  * zero; confirm the coordinate type. */
 #define unshift_vertices3(v) { \
-  v[0]->x /= 2; \
-  v[0]->y /= 2; \
-  v[1]->x /= 2; \
-  v[1]->y /= 2; \
-  v[2]->x /= 2; \
-  v[2]->y /= 2; \
+  v[0]->x >>= 1; \
+  v[0]->y >>= 1; \
+  v[1]->x >>= 1; \
+  v[1]->y >>= 1; \
+  v[2]->x >>= 1; \
+  v[2]->y >>= 1; \
 }
 
 /* Multiply psx_gpu->triangle_area by 4; used right after shift_vertices3(),
  * presumably because doubling both axes quadruples the area. Refers to a
  * variable named psx_gpu that must be in scope at the point of use. */
 #define shift_triangle_area() \
   psx_gpu->triangle_area *= 4
 
+#ifndef NEON_BUILD
+void scale2x_tiles8(void *dst, const void *src, int w8, int h)
+{ /* Plain C version, compiled only when NEON_BUILD is not defined.
+   *
+   * Copies h rows of w8 * 8 16-bit pixels from src to dst, writing every
+   * source pixel as a 2x2 block (d[0], d[1], d[1024], d[1025]).
+   *
+   * dst - destination, accessed as uint16_t; rows are 1024 pixels apart and
+   *       each source row produces two destination rows
+   * src - source, accessed as uint16_t; rows are 1024 pixels apart
+   * w8  - number of 8-pixel groups per row (the inner loop body is unrolled
+   *       eight times)
+   * h   - number of source rows
+   * Both counts are consumed by "while (n--)" loops, so they must not be
+   * negative.
+   */
+  uint16_t* d = (uint16_t*)dst;
+  const uint16_t* s = (const uint16_t*)src;
+
+  while ( h-- )
+  {
+    uint16_t* d_save = d;
+    const uint16_t* s_save = s;
+    int w = w8;
+
+    while ( w-- )
+    {
+      d[    0 ] = *s;
+      d[    1 ] = *s;
+      d[ 1024 ] = *s;
+      d[ 1025 ] = *s;
+      d += 2; s++;
+
+      d[    0 ] = *s;
+      d[    1 ] = *s;
+      d[ 1024 ] = *s;
+      d[ 1025 ] = *s;
+      d += 2; s++;
+
+      d[    0 ] = *s;
+      d[    1 ] = *s;
+      d[ 1024 ] = *s;
+      d[ 1025 ] = *s;
+      d += 2; s++;
+
+      d[    0 ] = *s;
+      d[    1 ] = *s;
+      d[ 1024 ] = *s;
+      d[ 1025 ] = *s;
+      d += 2; s++;
+
+      d[    0 ] = *s;
+      d[    1 ] = *s;
+      d[ 1024 ] = *s;
+      d[ 1025 ] = *s;
+      d += 2; s++;
+
+      d[    0 ] = *s;
+      d[    1 ] = *s;
+      d[ 1024 ] = *s;
+      d[ 1025 ] = *s;
+      d += 2; s++;
+
+      d[    0 ] = *s;
+      d[    1 ] = *s;
+      d[ 1024 ] = *s;
+      d[ 1025 ] = *s;
+      d += 2; s++;
+
+      d[    0 ] = *s;
+      d[    1 ] = *s;
+      d[ 1024 ] = *s;
+      d[ 1025 ] = *s;
+      d += 2; s++;
+    }
+
+    d = d_save + 2048; // two destination rows of 1024 pixels per source row
+    s = s_save + 1024; /* source row stride in pixels; author's open question: or 512? */
+  }
+}
+#endif
+
 static int disable_main_render; // when non-zero, do_triangle_enhanced() skips its first (non-doubled) render_triangle_p() call
 
+// Simple check for a case where no clipping is used.
+/*  - now handled by adjusting the viewport, so this is a stub: it ignores all
+ *    of its arguments and always returns 1.
+ *  Callers in this file pass two x coordinates (for example x and x + width),
+ *  so the parameter named y actually receives the end of a horizontal range.
+ */
+static int check_enhanced_range(psx_gpu_struct *psx_gpu, int x, int y)
+{
+  return 1;
+}
+/*
+ * Linear search: return 1 when val equals one of the first len entries of
+ * array, 0 otherwise (including when len is 0 or negative).
+ */
+static int is_in_array(int val, int array[], int len)
+{
+  int i;
+  for (i = 0; i < len; i++)
+    if (array[i] == val)
+      return 1;
+  return 0;
+}
+/*
+ * Compact array in place so that its first j entries hold the distinct values
+ * of the original len entries, in order of first appearance, then swap
+ * array[0] and array[1] if they are out of ascending order. Returns j.
+ *
+ * Only the first two entries are ordered. array[1] is read unconditionally,
+ * so len must be at least 2; the callers visible in this file pass 4.
+ */
+static int make_members_unique(int array[], int len)
+{
+  int i, j;
+  for (i = j = 1; i < len; i++) // array[0] is always kept, so both indices start at 1
+    if (!is_in_array(array[i], array, j))
+      array[j++] = array[i];
+
+  if (array[0] > array[1]) {
+    i = array[0]; array[0] = array[1]; array[1] = i; // i is reused as the swap temporary
+  }
+  return j;
+}
+/*
+ * Replace the u coordinate of every vertex whose u equals old with new.
+ * Despite its name, vertex_ptrs is indexed as an array of count vertex_struct
+ * objects, not as an array of pointers.
+ */
+static void patch_u(vertex_struct *vertex_ptrs, int count, int old, int new)
+{
+  int i;
+  for (i = 0; i < count; i++)
+    if (vertex_ptrs[i].u == old)
+      vertex_ptrs[i].u = new;
+}
+/*
+ * Replace the v coordinate of every vertex whose v equals old with new.
+ * Despite its name, vertex_ptrs is indexed as an array of count vertex_struct
+ * objects, not as an array of pointers.
+ */
+static void patch_v(vertex_struct *vertex_ptrs, int count, int old, int new)
+{
+  int i;
+  for (i = 0; i < count; i++)
+    if (vertex_ptrs[i].v == old)
+      vertex_ptrs[i].v = new;
+}
+/*
+ * Nudge the texture coordinates of a primitive by one texel when they match
+ * the patterns tested below. The u and v values of the vertex_count vertices
+ * are collected into local arrays of four entries, so vertex_count must not
+ * exceed 4 (and make_members_unique() needs at least 2); the callers visible
+ * in this file pass 4. An axis is only considered when it has exactly two
+ * distinct values, which make_members_unique() leaves in ascending order in
+ * slots 0 and 1.
+ */
+static void uv_hack(vertex_struct *vertex_ptrs, int vertex_count)
+{
+  int i, u[4], v[4];
+
+  for (i = 0; i < vertex_count; i++) {
+    u[i] = vertex_ptrs[i].u;
+    v[i] = vertex_ptrs[i].v;
+  }
+  if (make_members_unique(u, vertex_count) == 2 && u[1] - u[0] >= 8) { // two distinct u values at least 8 apart
+    if ((u[0] & 7) == 7) { // smaller u is one below a multiple of 8: move it up by one
+      patch_u(vertex_ptrs, vertex_count, u[0], u[0] + 1);
+      //printf("u hack: %3u-%3u -> %3u-%3u\n", u[0], u[1], u[0]+1, u[1]);
+    }
+    else if ((u[1] & 7) == 0 || u[1] - u[0] > 128) { // larger u is a multiple of 8, or the span exceeds 128: move it down by one
+      patch_u(vertex_ptrs, vertex_count, u[1], u[1] - 1);
+      //printf("u hack: %3u-%3u -> %3u-%3u\n", u[0], u[1], u[0], u[1]-1);
+    }
+  }
+  if (make_members_unique(v, vertex_count) == 2 && ((v[0] - v[1]) & 7) == 0) { // two distinct v values whose difference is a multiple of 8
+    if ((v[0] & 7) == 7) { // smaller v is one below a multiple of 8: move it up by one
+      patch_v(vertex_ptrs, vertex_count, v[0], v[0] + 1);
+      //printf("v hack: %3u-%3u -> %3u-%3u\n", v[0], v[1], v[0]+1, v[1]);
+    }
+    else if ((v[1] & 7) == 0) { // larger v is a multiple of 8: move it down by one
+      patch_v(vertex_ptrs, vertex_count, v[1], v[1] - 1);
+      //printf("v hack: %3u-%3u -> %3u-%3u\n", v[0], v[1], v[0], v[1]-1);
+    }
+  }
+}
+
 static void do_triangle_enhanced(psx_gpu_struct *psx_gpu,
  vertex_struct *vertexes, u32 current_command)
 {
@@ -801,41 +1096,107 @@ static void do_triangle_enhanced(psx_gpu_struct *psx_gpu,
   if (!disable_main_render)
     render_triangle_p(psx_gpu, vertex_ptrs, current_command);
 
-  enhancement_enable();
+  if (!check_enhanced_range(psx_gpu, vertex_ptrs[0]->x, vertex_ptrs[2]->x))
+    return;
+
+  if (!enhancement_enable(psx_gpu))
+    return;
+
   shift_vertices3(vertex_ptrs);
   shift_triangle_area();
   render_triangle_p(psx_gpu, vertex_ptrs, current_command);
+  unshift_vertices3(vertex_ptrs);
 }
 
 /* Draw a quad by calling do_triangle_enhanced() twice: once starting at
  * vertexes[0] and once starting at vertexes[1] (presumably three consecutive
  * vertices each; confirm in do_triangle_enhanced()). enhancement_disable() is
  * invoked between the two calls because the visible tail of
  * do_triangle_enhanced() calls enhancement_enable() and does not undo it. */
 static void do_quad_enhanced(psx_gpu_struct *psx_gpu, vertex_struct *vertexes,
  u32 current_command)
+{
+  do_triangle_enhanced(psx_gpu, vertexes, current_command);
+  enhancement_disable();
+  do_triangle_enhanced(psx_gpu, &vertexes[1], current_command);
+}
+
+#if 0
+
+#define fill_vertex(i, x_, y_, u_, v_, rgb_) \
+  vertexes[i].x = x_; \
+  vertexes[i].y = y_; \
+  vertexes[i].u = u_; \
+  vertexes[i].v = v_; \
+  vertexes[i].r = rgb_; \
+  vertexes[i].g = (rgb_) >> 8; \
+  vertexes[i].b = (rgb_) >> 16
+
+static void do_sprite_enhanced(psx_gpu_struct *psx_gpu, int x, int y,
+ u32 u, u32 v, u32 w, u32 h, u32 cmd_rgb)
 {
   vertex_struct *vertex_ptrs[3];
+  u32 flags = (cmd_rgb >> 24);
+  u32 color = cmd_rgb & 0xffffff;
+  u32 render_state_base_saved = psx_gpu->render_state_base;
+  int x1, y1;
+  u8 u1, v1;
 
-  if (prepare_triangle(psx_gpu, vertexes, vertex_ptrs)) {
-    if (!disable_main_render)
-      render_triangle_p(psx_gpu, vertex_ptrs, current_command);
+  flags &=
+   (RENDER_FLAGS_MODULATE_TEXELS | RENDER_FLAGS_BLEND |
+   RENDER_FLAGS_TEXTURE_MAP);
+
+  set_triangle_color(psx_gpu, color);
+  if(color == 0x808080)
+    flags |= RENDER_FLAGS_MODULATE_TEXELS;
 
-    enhancement_enable();
+  psx_gpu->render_state_base &= ~RENDER_STATE_DITHER;
+  enhancement_enable();
+
+  x1 = x + w;
+  y1 = y + h;
+  u1 = u + w;
+  v1 = v + h;
+  // FIXME..
+  if (u1 < u) u1 = 0xff;
+  if (v1 < v) v1 = 0xff;
+
+  // 0-2
+  // |/
+  // 1
+  fill_vertex(0, x,  y,  u,  v,  color);
+  fill_vertex(1, x,  y1, u,  v1, color);
+  fill_vertex(2, x1, y,  u1, v,  color);
+  if (prepare_triangle(psx_gpu, vertexes, vertex_ptrs)) {
     shift_vertices3(vertex_ptrs);
     shift_triangle_area();
-    render_triangle_p(psx_gpu, vertex_ptrs, current_command);
-    unshift_vertices3(vertex_ptrs);
+    render_triangle_p(psx_gpu, vertex_ptrs, flags);
   }
-  enhancement_disable();
-  if (prepare_triangle(psx_gpu, &vertexes[1], vertex_ptrs)) {
-    if (!disable_main_render)
-      render_triangle_p(psx_gpu, vertex_ptrs, current_command);
 
-    enhancement_enable();
+  //   0
+  //  /|
+  // 1-2
+  fill_vertex(0, x1, y,  u1, v,  color);
+  fill_vertex(1, x,  y1, u,  v1, color);
+  fill_vertex(2, x1, y1, u1, v1, color);
+  if (prepare_triangle(psx_gpu, vertexes, vertex_ptrs)) {
     shift_vertices3(vertex_ptrs);
     shift_triangle_area();
-    render_triangle_p(psx_gpu, vertex_ptrs, current_command);
+    render_triangle_p(psx_gpu, vertex_ptrs, flags);
   }
+
+  psx_gpu->render_state_base = render_state_base_saved;
+}
+#else
+static void do_sprite_enhanced(psx_gpu_struct *psx_gpu, int x, int y,
+ u32 u, u32 v, u32 w, u32 h, u32 cmd_rgb)
+{ /* Forward a sprite to render_sprite_4x(): the top 8 bits of cmd_rgb are
+   * passed as the flags argument and the low 24 bits as the color; position,
+   * texture coordinates and size are passed through unchanged. */
+  u32 flags = (cmd_rgb >> 24);
+  u32 color = cmd_rgb & 0xffffff;
+
+  render_sprite_4x(psx_gpu, x, y, u, v, w, h, flags, color);
 }
+#endif
 
-u32 gpu_parse_enhanced(psx_gpu_struct *psx_gpu, u32 *list, u32 size, u32 *last_command)
+u32 gpu_parse_enhanced(psx_gpu_struct *psx_gpu, u32 *list, u32 size,
+ u32 *last_command)
 {
+  vertex_struct vertexes[4] __attribute__((aligned(16))) = {};
   u32 current_command = 0, command_length;
 
   u32 *list_start = list;
@@ -845,6 +1206,7 @@ u32 gpu_parse_enhanced(psx_gpu_struct *psx_gpu, u32 *list, u32 size, u32 *last_c
   psx_gpu->saved_viewport_start_y = psx_gpu->viewport_start_y;
   psx_gpu->saved_viewport_end_x = psx_gpu->viewport_end_x;
   psx_gpu->saved_viewport_end_y = psx_gpu->viewport_end_y;
+  select_enhancement_buf(psx_gpu);
 
   for(; list < list_end; list += 1 + command_length)
   {
@@ -870,13 +1232,23 @@ u32 gpu_parse_enhanced(psx_gpu_struct *psx_gpu, u32 *list, u32 size, u32 *last_c
         u32 width = list_s16[4] & 0x3FF;
         u32 height = list_s16[5] & 0x1FF;
         u32 color = list[0] & 0xFFFFFF;
+        s32 i1, i2;
 
         x &= ~0xF;
         width = ((width + 0xF) & ~0xF);
+        if (width == 0 || height == 0)
+          break;
 
         do_fill(psx_gpu, x, y, width, height, color);
 
-        psx_gpu->vram_out_ptr = psx_gpu->enhancement_buf_ptr;
+        i1 = select_enhancement_buf_index(psx_gpu, x, y);
+        i2 = select_enhancement_buf_index(psx_gpu, x + width - 1, y + height - 1);
+        if (i1 < 0 || i1 != i2) {
+          sync_enhancement_buffers(x, y, width, height);
+          break;
+        }
+
+        psx_gpu->vram_out_ptr = select_enhancement_buf_by_index(psx_gpu, i1);
         x *= 2;
         y *= 2;
         width *= 2;
@@ -935,6 +1307,7 @@ u32 gpu_parse_enhanced(psx_gpu_struct *psx_gpu, u32 *list, u32 size, u32 *last_c
         get_vertex_data_xy_uv(2, 10);  
         get_vertex_data_xy_uv(3, 14);
   
+        uv_hack(vertexes, 4);
         do_quad_enhanced(psx_gpu, vertexes, current_command);
         break;
       }
@@ -992,6 +1365,7 @@ u32 gpu_parse_enhanced(psx_gpu_struct *psx_gpu, u32 *list, u32 size, u32 *last_c
         get_vertex_data_xy_uv_rgb(2, 12);
         get_vertex_data_xy_uv_rgb(3, 18);
 
+        uv_hack(vertexes, 4);
         do_quad_enhanced(psx_gpu, vertexes, current_command);
         break;
       }
@@ -1003,7 +1377,9 @@ u32 gpu_parse_enhanced(psx_gpu_struct *psx_gpu, u32 *list, u32 size, u32 *last_c
         vertexes[1].x = list_s16[4] + psx_gpu->offset_x;
         vertexes[1].y = list_s16[5] + psx_gpu->offset_y;
 
-        render_line(psx_gpu, vertexes, current_command, list[0]);
+        render_line(psx_gpu, vertexes, current_command, list[0], 0);
+        if (enhancement_enable(psx_gpu))
+          render_line(psx_gpu, vertexes, current_command, list[0], 1);
         break;
       }
   
@@ -1024,13 +1400,19 @@ u32 gpu_parse_enhanced(psx_gpu_struct *psx_gpu, u32 *list, u32 size, u32 *last_c
           vertexes[1].x = (xy & 0xFFFF) + psx_gpu->offset_x;
           vertexes[1].y = (xy >> 16) + psx_gpu->offset_y;
 
-          render_line(psx_gpu, vertexes, current_command, list[0]);
+          enhancement_disable();
+          render_line(psx_gpu, vertexes, current_command, list[0], 0);
+          if (enhancement_enable(psx_gpu))
+            render_line(psx_gpu, vertexes, current_command, list[0], 1);
 
           list_position++;
           num_vertexes++;
 
           if(list_position >= list_end)
-            break;
+          {
+            current_command = (u32)-1;
+            goto breakloop;
+          }
 
           xy = *list_position;
           if((xy & 0xF000F000) == 0x50005000)
@@ -1055,7 +1437,9 @@ u32 gpu_parse_enhanced(psx_gpu_struct *psx_gpu, u32 *list, u32 size, u32 *last_c
         vertexes[1].x = list_s16[6] + psx_gpu->offset_x;
         vertexes[1].y = list_s16[7] + psx_gpu->offset_y;
 
-        render_line(psx_gpu, vertexes, current_command, 0);
+        render_line(psx_gpu, vertexes, current_command, 0, 0);
+        if (enhancement_enable(psx_gpu))
+          render_line(psx_gpu, vertexes, current_command, 0, 1);
         break;
       }
  
@@ -1085,13 +1469,19 @@ u32 gpu_parse_enhanced(psx_gpu_struct *psx_gpu, u32 *list, u32 size, u32 *last_c
           vertexes[1].x = (xy & 0xFFFF) + psx_gpu->offset_x;
           vertexes[1].y = (xy >> 16) + psx_gpu->offset_y;
 
-          render_line(psx_gpu, vertexes, current_command, 0);
+          enhancement_disable();
+          render_line(psx_gpu, vertexes, current_command, 0, 0);
+          if (enhancement_enable(psx_gpu))
+            render_line(psx_gpu, vertexes, current_command, 0, 1);
 
           list_position += 2;
           num_vertexes++;
 
           if(list_position >= list_end)
-            break;
+          {
+            current_command = (u32)-1;
+            goto breakloop;
+          }
 
           color = list_position[0];
           if((color & 0xF000F000) == 0x50005000)
@@ -1110,6 +1500,9 @@ u32 gpu_parse_enhanced(psx_gpu_struct *psx_gpu, u32 *list, u32 size, u32 *last_c
         u32 height = list_s16[5] & 0x1FF;
 
         render_sprite(psx_gpu, x, y, 0, 0, width, height, current_command, list[0]);
+
+        if (check_enhanced_range(psx_gpu, x, x + width))
+          do_sprite_enhanced(psx_gpu, x, y, 0, 0, width, height, list[0]);
         break;
       }
   
@@ -1117,14 +1510,18 @@ u32 gpu_parse_enhanced(psx_gpu_struct *psx_gpu, u32 *list, u32 size, u32 *last_c
       {        
         u32 x = sign_extend_11bit(list_s16[2] + psx_gpu->offset_x);
         u32 y = sign_extend_11bit(list_s16[3] + psx_gpu->offset_y);
-        u32 uv = list_s16[4];
+        u8 u = list_s16[4];
+        u8 v = list_s16[4] >> 8;
         u32 width = list_s16[6] & 0x3FF;
         u32 height = list_s16[7] & 0x1FF;
 
         set_clut(psx_gpu, list_s16[5]);
 
-        render_sprite(psx_gpu, x, y, uv & 0xFF, (uv >> 8) & 0xFF, width, height,
+        render_sprite(psx_gpu, x, y, u, v, width, height,
          current_command, list[0]);
+
+        if (check_enhanced_range(psx_gpu, x, x + width))
+          do_sprite_enhanced(psx_gpu, x, y, u, v, width, height, list[0]);
         break;
       }
   
@@ -1137,6 +1534,9 @@ u32 gpu_parse_enhanced(psx_gpu_struct *psx_gpu, u32 *list, u32 size, u32 *last_c
         s32 y = sign_extend_11bit(list_s16[3] + psx_gpu->offset_y);
 
         render_sprite(psx_gpu, x, y, 0, 0, 1, 1, current_command, list[0]);
+
+        if (check_enhanced_range(psx_gpu, x, x + 1))
+          do_sprite_enhanced(psx_gpu, x, y, 0, 0, 1, 1, list[0]);
         break;
       }
   
@@ -1149,6 +1549,9 @@ u32 gpu_parse_enhanced(psx_gpu_struct *psx_gpu, u32 *list, u32 size, u32 *last_c
         s32 y = sign_extend_11bit(list_s16[3] + psx_gpu->offset_y);
 
         render_sprite(psx_gpu, x, y, 0, 0, 8, 8, current_command, list[0]);
+
+        if (check_enhanced_range(psx_gpu, x, x + 8))
+          do_sprite_enhanced(psx_gpu, x, y, 0, 0, 8, 8, list[0]);
         break;
       }
   
@@ -1159,12 +1562,16 @@ u32 gpu_parse_enhanced(psx_gpu_struct *psx_gpu, u32 *list, u32 size, u32 *last_c
       {        
         s32 x = sign_extend_11bit(list_s16[2] + psx_gpu->offset_x);
         s32 y = sign_extend_11bit(list_s16[3] + psx_gpu->offset_y);
-        u32 uv = list_s16[4];
+        u8 u = list_s16[4];
+        u8 v = list_s16[4] >> 8;
 
         set_clut(psx_gpu, list_s16[5]);
 
-        render_sprite(psx_gpu, x, y, uv & 0xFF, (uv >> 8) & 0xFF, 8, 8,
+        render_sprite(psx_gpu, x, y, u, v, 8, 8,
          current_command, list[0]);
+
+        if (check_enhanced_range(psx_gpu, x, x + 8))
+          do_sprite_enhanced(psx_gpu, x, y, u, v, 8, 8, list[0]);
         break;
       }
   
@@ -1177,6 +1584,9 @@ u32 gpu_parse_enhanced(psx_gpu_struct *psx_gpu, u32 *list, u32 size, u32 *last_c
         s32 y = sign_extend_11bit(list_s16[3] + psx_gpu->offset_y);
 
         render_sprite(psx_gpu, x, y, 0, 0, 16, 16, current_command, list[0]);
+
+        if (check_enhanced_range(psx_gpu, x, x + 16))
+          do_sprite_enhanced(psx_gpu, x, y, 0, 0, 16, 16, list[0]);
         break;
       }
   
@@ -1187,50 +1597,25 @@ u32 gpu_parse_enhanced(psx_gpu_struct *psx_gpu, u32 *list, u32 size, u32 *last_c
       {        
         s32 x = sign_extend_11bit(list_s16[2] + psx_gpu->offset_x);
         s32 y = sign_extend_11bit(list_s16[3] + psx_gpu->offset_y);
-        u32 uv = list_s16[4];
+        u8 u = list_s16[4];
+        u8 v = list_s16[4] >> 8;
 
         set_clut(psx_gpu, list_s16[5]);
 
-        render_sprite(psx_gpu, x, y, uv & 0xFF, (uv >> 8) & 0xFF, 16, 16,
-         current_command, list[0]);
-        break;
-      }
-  
-      case 0x80:          //  vid -> vid
-        render_block_move(psx_gpu, list_s16[2] & 0x3FF, list_s16[3] & 0x1FF,
-         list_s16[4] & 0x3FF, list_s16[5] & 0x1FF,
-         ((list_s16[6] - 1) & 0x3FF) + 1, ((list_s16[7] - 1) & 0x1FF) + 1);
-        break;
-#ifdef PCSX
-      case 0xA0:          //  sys -> vid
-      case 0xC0:          //  vid -> sys
-        goto breakloop;
-#else
-      case 0xA0:          //  sys -> vid
-      {
-        u32 load_x = list_s16[2] & 0x3FF;
-        u32 load_y = list_s16[3] & 0x1FF;
-        u32 load_width = list_s16[4] & 0x3FF;
-        u32 load_height = list_s16[5] & 0x1FF;
-        u32 load_size = load_width * load_height;
-  
-        command_length += load_size / 2;
-
-        if(load_size & 1)
-          command_length++;
+        render_sprite(psx_gpu, x, y, u, v, 16, 16, current_command, list[0]);
 
-        render_block_copy(psx_gpu, (u16 *)&(list_s16[6]), load_x, load_y,
-         load_width, load_height, load_width);
+        if (check_enhanced_range(psx_gpu, x, x + 16))
+          do_sprite_enhanced(psx_gpu, x, y, u, v, 16, 16, list[0]);
         break;
       }
 
-      case 0xC0:          //  vid -> sys
-        break;
-#endif
+      case 0x80 ... 0x9F:          //  vid -> vid
+      case 0xA0 ... 0xBF:          //  sys -> vid
+      case 0xC0 ... 0xDF:          //  vid -> sys
+        goto breakloop;
 
       case 0xE1:
-        set_texture(psx_gpu, list[0] & 0x1FF);
+        set_texture(psx_gpu, list[0]);
 
         if(list[0] & (1 << 9))
           psx_gpu->render_state_base |= RENDER_STATE_DITHER;
@@ -1276,10 +1661,21 @@ u32 gpu_parse_enhanced(psx_gpu_struct *psx_gpu, u32 *list, u32 size, u32 *last_c
       }
   
       case 0xE3:
-        psx_gpu->viewport_start_x = list[0] & 0x3FF;
-        psx_gpu->viewport_start_y = (list[0] >> 10) & 0x1FF;
-        psx_gpu->saved_viewport_start_x = psx_gpu->viewport_start_x;
-        psx_gpu->saved_viewport_start_y = psx_gpu->viewport_start_y;
+      {
+        s16 viewport_start_x = list[0] & 0x3FF;
+        s16 viewport_start_y = (list[0] >> 10) & 0x1FF;
+
+        if(viewport_start_x == psx_gpu->viewport_start_x &&
+         viewport_start_y == psx_gpu->viewport_start_y)
+        {
+          break;
+        }
+        psx_gpu->viewport_start_x = viewport_start_x;
+        psx_gpu->viewport_start_y = viewport_start_y;
+        psx_gpu->saved_viewport_start_x = viewport_start_x;
+        psx_gpu->saved_viewport_start_y = viewport_start_y;
+
+        select_enhancement_buf(psx_gpu);
 
 #ifdef TEXTURE_CACHE_4BPP
         psx_gpu->viewport_mask =
@@ -1289,12 +1685,25 @@ u32 gpu_parse_enhanced(psx_gpu_struct *psx_gpu, u32 *list, u32 size, u32 *last_c
 #endif
         SET_Ex(3, list[0]);
         break;
-  
+      }
+
       case 0xE4:
-        psx_gpu->viewport_end_x = list[0] & 0x3FF;
-        psx_gpu->viewport_end_y = (list[0] >> 10) & 0x1FF;
-        psx_gpu->saved_viewport_end_x = psx_gpu->viewport_end_x;
-        psx_gpu->saved_viewport_end_y = psx_gpu->viewport_end_y;
+      {
+        s16 viewport_end_x = list[0] & 0x3FF;
+        s16 viewport_end_y = (list[0] >> 10) & 0x1FF;
+
+        if(viewport_end_x == psx_gpu->viewport_end_x &&
+         viewport_end_y == psx_gpu->viewport_end_y)
+        {
+          break;
+        }
+
+        psx_gpu->viewport_end_x = viewport_end_x;
+        psx_gpu->viewport_end_y = viewport_end_y;
+        psx_gpu->saved_viewport_end_x = viewport_end_x;
+        psx_gpu->saved_viewport_end_y = viewport_end_y;
+
+        select_enhancement_buf(psx_gpu);
 
 #ifdef TEXTURE_CACHE_4BPP
         psx_gpu->viewport_mask =
@@ -1304,6 +1713,7 @@ u32 gpu_parse_enhanced(psx_gpu_struct *psx_gpu, u32 *list, u32 size, u32 *last_c
 #endif
         SET_Ex(4, list[0]);
         break;
+      }
   
       case 0xE5:
       {
@@ -1341,13 +1751,14 @@ u32 gpu_parse_enhanced(psx_gpu_struct *psx_gpu, u32 *list, u32 size, u32 *last_c
     }
   }
 
-#ifdef PCSX
+  enhancement_disable();
+
 breakloop:
-#endif
-enhancement_disable();
   if (last_command != NULL)
     *last_command = current_command;
   return list - list_start;
 }
 
-// vim:shiftwidth=2:expandtab
+#endif /* PCSX */
+
+// vim:ts=2:shiftwidth=2:expandtab