psx_gpu: consolidate C code, implement exnhancement asm
[pcsx_rearmed.git] / plugins / gpu_neon / psx_gpu / psx_gpu.c
index c9175df..3de2ece 100644 (file)
@@ -20,7 +20,6 @@
 
 u32 span_pixels = 0;
 u32 span_pixel_blocks = 0;
-u32 span_pixel_blocks_unaligned = 0;
 u32 spans = 0;
 u32 triangles = 0;
 u32 sprites = 0;
@@ -39,9 +38,6 @@ u32 texel_blocks_8bpp = 0;
 u32 texel_blocks_16bpp = 0;
 u32 texel_blocks_untextured = 0;
 u32 blend_blocks = 0;
-u32 untextured_pixels = 0;
-u32 blend_pixels = 0;
-u32 transparent_pixels = 0;
 u32 render_buffer_flushes = 0;
 u32 state_changes = 0;
 u32 left_split_triangles = 0;
@@ -49,10 +45,10 @@ u32 flat_triangles = 0;
 u32 clipped_triangles = 0;
 u32 zero_block_spans = 0;
 u32 texture_cache_loads = 0;
-u32 false_modulated_triangles = 0;
-u32 false_modulated_sprites = 0;
+u32 false_modulated_blocks = 0;
 
-u32 reciprocal_table[512];
+/* double size for enhancement */
+u32 reciprocal_table[512 * 2];
 
 
 typedef s32 fixed_type;
@@ -241,6 +237,7 @@ u32 invalidate_texture_cache_region_viewport(psx_gpu_struct *psx_gpu, u32 x1,
 {
   u32 mask = texture_region_mask(x1, y1, x2, y2) &
    psx_gpu->viewport_mask;
+
   psx_gpu->dirty_textures_4bpp_mask |= mask;
   psx_gpu->dirty_textures_8bpp_mask |= mask;
   psx_gpu->dirty_textures_8bpp_alternate_mask |= mask;
@@ -248,6 +245,58 @@ u32 invalidate_texture_cache_region_viewport(psx_gpu_struct *psx_gpu, u32 x1,
   return mask;
 }
 
+void update_texture_cache_region(psx_gpu_struct *psx_gpu, u32 x1, u32 y1,
+ u32 x2, u32 y2)
+{
+  u32 mask = texture_region_mask(x1, y1, x2, y2);
+  u32 texture_page;
+  u8 *texture_page_ptr;
+  u16 *vram_ptr;
+  u32 texel_block;
+  u32 sub_x, sub_y;
+
+  psx_gpu->dirty_textures_8bpp_mask |= mask;
+  psx_gpu->dirty_textures_8bpp_alternate_mask |= mask;
+
+  if ((psx_gpu->dirty_textures_4bpp_mask & mask) == 0 &&
+      (x1 & 3) == 0 && (y1 & 15) == 0 && x2 - x1 < 4 && y2 - y1 < 16)
+  {
+    texture_page = ((x1 / 64) & 15) + (y1 / 256) * 16;
+    texture_page_ptr = psx_gpu->texture_4bpp_cache[texture_page];
+    texture_page_ptr += (x1 / 4 & 15) * 16*16 + (y1 / 16 & 15) * 16*16*16;
+    vram_ptr = psx_gpu->vram_ptr + x1 + y1 * 1024;
+    sub_x = 4;
+    sub_y = 16;
+
+    while(sub_y)
+    {
+      while(sub_x)
+      {
+        texel_block = *vram_ptr;
+
+        texture_page_ptr[0] = texel_block & 0xF;
+        texture_page_ptr[1] = (texel_block >> 4) & 0xF;
+        texture_page_ptr[2] = (texel_block >> 8) & 0xF;
+        texture_page_ptr[3] = texel_block >> 12;
+        
+        vram_ptr++;
+        texture_page_ptr += 4;
+
+        sub_x--;          
+      }
+
+      vram_ptr -= 4;
+      sub_x = 4;
+
+      sub_y--;
+      vram_ptr += 1024;
+    }
+  }
+  else
+  {
+    psx_gpu->dirty_textures_4bpp_mask |= mask;
+  }
+}
 
 void update_texture_8bpp_cache_slice(psx_gpu_struct *psx_gpu,
  u32 texture_page);
@@ -257,7 +306,7 @@ void update_texture_8bpp_cache_slice(psx_gpu_struct *psx_gpu,
 void update_texture_4bpp_cache(psx_gpu_struct *psx_gpu)
 {
   u32 current_texture_page = psx_gpu->current_texture_page;
-  u8 *texture_page_ptr = psx_gpu->texture_page_ptr;
+  u8 *texture_page_ptr = psx_gpu->texture_page_base;
   u16 *vram_ptr = psx_gpu->vram_ptr;
 
   u32 texel_block;
@@ -285,6 +334,7 @@ void update_texture_4bpp_cache(psx_gpu_struct *psx_gpu)
         while(sub_x)
         {
           texel_block = *vram_ptr;
+
           texture_page_ptr[0] = texel_block & 0xF;
           texture_page_ptr[1] = (texel_block >> 4) & 0xF;
           texture_page_ptr[2] = (texel_block >> 8) & 0xF;
@@ -319,7 +369,7 @@ void update_texture_4bpp_cache(psx_gpu_struct *psx_gpu)
 void update_texture_8bpp_cache_slice(psx_gpu_struct *psx_gpu,
  u32 texture_page)
 {
-  u16 *texture_page_ptr = psx_gpu->texture_page_ptr;
+  u16 *texture_page_ptr = psx_gpu->texture_page_base;
   u16 *vram_ptr = psx_gpu->vram_ptr;
 
   u32 tile_x, tile_y;
@@ -404,6 +454,48 @@ void setup_blocks_shaded_untextured_undithered_unswizzled_indirect(
 
 void flush_render_block_buffer(psx_gpu_struct *psx_gpu)
 {
+  if((psx_gpu->render_mode & RENDER_INTERLACE_ENABLED) &&
+   (psx_gpu->primitive_type == PRIMITIVE_TYPE_SPRITE))
+  {
+    u32 num_blocks_dest = 0;
+    block_struct *block_src = psx_gpu->blocks; 
+    block_struct *block_dest = psx_gpu->blocks;
+
+    u16 *vram_ptr = psx_gpu->vram_ptr;
+    u32 i;
+
+    if(psx_gpu->render_mode & RENDER_INTERLACE_ODD)
+    {
+      for(i = 0; i < psx_gpu->num_blocks; i++)
+      {
+        u32 fb_offset = (u32)((u8 *)block_src->fb_ptr - (u8 *)vram_ptr);
+        if(fb_offset & (1 << 11))
+        {
+          *block_dest = *block_src;
+          num_blocks_dest++;
+          block_dest++;
+        }
+        block_src++;
+      }
+    }
+    else
+    {
+      for(i = 0; i < psx_gpu->num_blocks; i++)
+      {
+        u32 fb_offset = (u32)((u8 *)block_src->fb_ptr - (u8 *)vram_ptr);
+        if((fb_offset & (1 << 11)) == 0)
+        {
+          *block_dest = *block_src;
+          num_blocks_dest++;
+          block_dest++;
+        }
+        block_src++;
+      }
+    }
+
+    psx_gpu->num_blocks = num_blocks_dest;
+  }
+
   if(psx_gpu->num_blocks)
   {
     render_block_handler_struct *render_block_handler =
@@ -413,8 +505,10 @@ void flush_render_block_buffer(psx_gpu_struct *psx_gpu)
     render_block_handler->shade_blocks(psx_gpu);
     render_block_handler->blend_blocks(psx_gpu);
 
+#ifdef PROFILE
     span_pixel_blocks += psx_gpu->num_blocks;
     render_buffer_flushes++;
+#endif
 
     psx_gpu->num_blocks = 0;
   }
@@ -473,7 +567,7 @@ void compute_all_gradients(psx_gpu_struct *psx_gpu, vertex_struct *a,
 
   vec_4x32u uvrg_base;
   vec_4x32u b_base;
-  vec_4x32u const_0x8000;
+  vec_4x32u uvrgb_phase;
 
   vec_4x16s d0_a_d3_c, d0_b, d0_c;
   vec_4x16s d1_a, d1_b, d1_c_d2_a;
@@ -502,12 +596,12 @@ void compute_all_gradients(psx_gpu_struct *psx_gpu, vertex_struct *a,
   setup_gradient_calculation_input(1, b);
   setup_gradient_calculation_input(2, c);
 
-  dup_4x32b(const_0x8000, 0x8000);
+  dup_4x32b(uvrgb_phase, psx_gpu->uvrgb_phase);
   shl_long_4x16b(uvrg_base, x0_a_y0_c, 16);
   shl_long_4x16b(b_base, x0_b, 16);
 
-  add_4x32b(uvrg_base, uvrg_base, const_0x8000);
-  add_4x32b(b_base, b_base, const_0x8000);
+  add_4x32b(uvrg_base, uvrg_base, uvrgb_phase);
+  add_4x32b(b_base, b_base, uvrgb_phase);
 
   // Can probably pair these, but it'll require careful register allocation
   sub_4x16b(d0_a_d3_c, x1_a_y1_c, x0_a_y0_c);
@@ -761,7 +855,7 @@ void compute_all_gradients(psx_gpu_struct *psx_gpu, vertex_struct *a,
                                                                                \
   dup_2x32b(edge_shifts, edge_shift);                                          \
   sub_2x32b(heights_b, heights, c_0x01);                                       \
-  shr_2x32b(height_reciprocals, edge_shifts, 12);                              \
+  shr_2x32b(height_reciprocals, edge_shifts, 10);                              \
                                                                                \
   mla_2x32b(heights_b, x_starts, heights);                                     \
   bic_immediate_4x16b(vector_cast(vec_4x16u, edge_shifts), 0xE0);              \
@@ -790,8 +884,8 @@ void compute_all_gradients(psx_gpu_struct *psx_gpu, vertex_struct *a,
   sub_2x32b(widths, x_ends, x_starts);                                         \
   width_alt = x_c - start_c;                                                   \
                                                                                \
-  shr_2x32b(height_reciprocals, edge_shifts, 12);                              \
-  height_reciprocal_alt = edge_shift_alt >> 12;                                \
+  shr_2x32b(height_reciprocals, edge_shifts, 10);                              \
+  height_reciprocal_alt = edge_shift_alt >> 10;                                \
                                                                                \
   bic_immediate_4x16b(vector_cast(vec_4x16u, edge_shifts), 0xE0);              \
   edge_shift_alt &= 0x1F;                                                      \
@@ -1748,6 +1842,8 @@ void setup_spans_up_down(psx_gpu_struct *psx_gpu, vertex_struct *v_a,
   }                                                                            \
 
 #define setup_blocks_add_blocks_direct()                                       \
+  texel_blocks_untextured += span_num_blocks;                                  \
+  span_pixel_blocks += span_num_blocks                                         \
 
 
 #define setup_blocks_builder(shading, texturing, dithering, sw, target)        \
@@ -1777,7 +1873,7 @@ void setup_blocks_##shading##_##texturing##_##dithering##_##sw##_##target(     \
     if(span_num_blocks)                                                        \
     {                                                                          \
       y = span_edge_data->y;                                                   \
-      fb_ptr = psx_gpu->vram_ptr + span_edge_data->left_x + (y * 1024);        \
+      fb_ptr = psx_gpu->vram_out_ptr + span_edge_data->left_x + (y * 1024);    \
                                                                                \
       setup_blocks_span_initialize_##shading##_##texturing();                  \
       setup_blocks_span_initialize_##dithering(texturing);                     \
@@ -1787,7 +1883,6 @@ void setup_blocks_##shading##_##texturing##_##dithering##_##sw##_##target(     \
       s32 pixel_span = span_num_blocks * 8;                                    \
       pixel_span -= __builtin_popcount(span_edge_data->right_mask & 0xFF);     \
       span_pixels += pixel_span;                                               \
-      span_pixel_blocks_unaligned += (pixel_span + 7) / 8;                     \
                                                                                \
       span_num_blocks--;                                                       \
       while(span_num_blocks)                                                   \
@@ -2017,16 +2112,33 @@ void texture_blocks_16bpp(psx_gpu_struct *psx_gpu)
 }                                                                              \
 
 
-#define shade_blocks_textured_modulated_shaded_primitive_load()                \
+#define shade_blocks_textured_false_modulated_check_dithered(target)           \
+  if(psx_gpu->triangle_color == 0x808080)                                      \
+  {                                                                            \
+    false_modulated_blocks += num_blocks;                                      \
+  }                                                                            \
+
+#define shade_blocks_textured_false_modulated_check_undithered(target)         \
+  if(psx_gpu->triangle_color == 0x808080)                                      \
+  {                                                                            \
+                                                                               \
+    shade_blocks_textured_unmodulated_##target(psx_gpu);                       \
+    false_modulated_blocks += num_blocks;                                      \
+    return;                                                                    \
+  }                                                                            \
+
 
-#define shade_blocks_textured_modulated_unshaded_primitive_load()              \
+#define shade_blocks_textured_modulated_shaded_primitive_load(dithering,       \
+ target)                                                                       \
+
+#define shade_blocks_textured_modulated_unshaded_primitive_load(dithering,     \
+ target)                                                                       \
 {                                                                              \
   u32 color = psx_gpu->triangle_color;                                         \
   dup_8x8b(colors_r, color);                                                   \
   dup_8x8b(colors_g, color >> 8);                                              \
   dup_8x8b(colors_b, color >> 16);                                             \
-  if(psx_gpu->triangle_color == 0x808080)                                      \
-    false_modulated_triangles++;                                               \
+  shade_blocks_textured_false_modulated_check_##dithering(target);             \
 }                                                                              \
 
 #define shade_blocks_textured_modulated_shaded_block_load()                    \
@@ -2091,7 +2203,8 @@ void shade_blocks_##shading##_textured_modulated_##dithering##_##target(       \
                                                                                \
   dup_8x16b(d128_0x8000, 0x8000);                                              \
                                                                                \
-  shade_blocks_textured_modulated_##shading##_primitive_load();                \
+  shade_blocks_textured_modulated_##shading##_primitive_load(dithering,        \
+   target);                                                                    \
                                                                                \
   while(num_blocks)                                                            \
   {                                                                            \
@@ -2157,6 +2270,9 @@ void shade_blocks_unshaded_textured_modulated_dithered_indirect(psx_gpu_struct
 void shade_blocks_unshaded_textured_modulated_undithered_indirect(psx_gpu_struct
  *psx_gpu);
 
+void shade_blocks_textured_unmodulated_indirect(psx_gpu_struct *psx_gpu);
+void shade_blocks_textured_unmodulated_direct(psx_gpu_struct *psx_gpu);
+
 #ifndef NEON_BUILD
 
 shade_blocks_textured_modulated_builder(shaded, dithered, direct);
@@ -2204,8 +2320,38 @@ void shade_blocks_textured_unmodulated_##target(psx_gpu_struct *psx_gpu)       \
   }                                                                            \
 }                                                                              \
 
-void shade_blocks_textured_unmodulated_indirect(psx_gpu_struct *psx_gpu);
-void shade_blocks_textured_unmodulated_direct(psx_gpu_struct *psx_gpu);
+#define shade_blocks_textured_unmodulated_dithered_builder(target)             \
+void shade_blocks_textured_unmodulated_dithered_##target(psx_gpu_struct        \
+ *psx_gpu)                                                                     \
+{                                                                              \
+  block_struct *block = psx_gpu->blocks;                                       \
+  u32 num_blocks = psx_gpu->num_blocks;                                        \
+  vec_8x16u draw_mask;                                                         \
+  vec_8x16u test_mask = psx_gpu->test_mask;                                    \
+  u32 draw_mask_bits;                                                          \
+                                                                               \
+  vec_8x16u pixels;                                                            \
+  shade_blocks_load_msb_mask_##target();                                       \
+                                                                               \
+  while(num_blocks)                                                            \
+  {                                                                            \
+    vec_8x16u zero_mask;                                                       \
+                                                                               \
+    draw_mask_bits = block->draw_mask_bits;                                    \
+    dup_8x16b(draw_mask, draw_mask_bits);                                      \
+    tst_8x16b(draw_mask, draw_mask, test_mask);                                \
+                                                                               \
+    pixels = block->texels;                                                    \
+                                                                               \
+    cmpeqz_8x16b(zero_mask, pixels);                                           \
+    or_8x16b(zero_mask, draw_mask, zero_mask);                                 \
+                                                                               \
+    shade_blocks_store_##target(zero_mask, pixels);                            \
+                                                                               \
+    num_blocks--;                                                              \
+    block++;                                                                   \
+  }                                                                            \
+}                                                                              \
 
 #ifndef NEON_BUILD
 
@@ -2219,7 +2365,7 @@ void shade_blocks_unshaded_untextured_indirect(psx_gpu_struct *psx_gpu);
 void shade_blocks_unshaded_untextured_direct(psx_gpu_struct *psx_gpu);
 
 #ifndef NEON_BUILD
-                                                                                
+                                                                               
 void shade_blocks_unshaded_untextured_indirect(psx_gpu_struct *psx_gpu)
 {
 }
@@ -2760,10 +2906,8 @@ char *render_block_flag_strings[] =
    (triangle_y_direction_##direction_c << 4) |                                 \
    (triangle_winding_##winding << 6))                                          \
 
-psx_gpu_struct __attribute__((aligned(64))) psx_gpu_alt;
-
-void render_triangle(psx_gpu_struct *psx_gpu, vertex_struct *vertexes,
- u32 flags)
+static int prepare_triangle(psx_gpu_struct *psx_gpu, vertex_struct *vertexes,
+ vertex_struct *vertexes_out[3])
 {
   s32 y_top, y_bottom;
   s32 triangle_area;
@@ -2775,12 +2919,16 @@ void render_triangle(psx_gpu_struct *psx_gpu, vertex_struct *vertexes,
 
   triangle_area = triangle_signed_area_x2(a->x, a->y, b->x, b->y, c->x, c->y);
 
+#ifdef PROFILE
   triangles++;
+#endif
 
   if(triangle_area == 0)
   {
+#ifdef PROFILE
     trivial_rejects++;
-    return;
+#endif
+    return 0;
   }
 
   if(b->y < a->y)
@@ -2799,8 +2947,10 @@ void render_triangle(psx_gpu_struct *psx_gpu, vertex_struct *vertexes,
 
   if((y_bottom - y_top) >= 512)
   {
+#ifdef PROFILE
     trivial_rejects++;
-    return;
+#endif
+    return 0;
   }
 
   if(triangle_area < 0)
@@ -2821,23 +2971,42 @@ void render_triangle(psx_gpu_struct *psx_gpu, vertex_struct *vertexes,
       vertex_swap(a, b);
   }
 
-  if((c->x - a->x) >= 1024)
+  if((c->x - psx_gpu->offset_x) >= 1024 || (c->x - a->x) >= 1024)
   {
+#ifdef PROFILE
     trivial_rejects++;
-    return;
+#endif
+    return 0;
   }
 
   if(invalidate_texture_cache_region_viewport(psx_gpu, a->x, y_top, c->x,
    y_bottom) == 0)
   {
+#ifdef PROFILE
     trivial_rejects++;
-    return;
+#endif
+    return 0;
   }
 
-  psx_gpu->num_spans = 0;
   psx_gpu->triangle_area = triangle_area;
   psx_gpu->triangle_winding = triangle_winding;
 
+  vertexes_out[0] = a;
+  vertexes_out[1] = b;
+  vertexes_out[2] = c;
+
+  return 1;
+}
+
+static void render_triangle_p(psx_gpu_struct *psx_gpu,
+ vertex_struct *vertex_ptrs[3], u32 flags)
+{
+  psx_gpu->num_spans = 0;
+
+  vertex_struct *a = vertex_ptrs[0];
+  vertex_struct *b = vertex_ptrs[1];
+  vertex_struct *c = vertex_ptrs[2];
+
   s32 y_delta_a = b->y - a->y;
   s32 y_delta_b = c->y - b->y;
   s32 y_delta_c = c->y - a->y;
@@ -2849,7 +3018,7 @@ void render_triangle(psx_gpu_struct *psx_gpu, vertex_struct *vertexes,
   compute_all_gradients(psx_gpu, a, b, c);
 
   switch(y_direction_a | (y_direction_b << 2) | (y_direction_c << 4) |
-   (triangle_winding << 6))
+   (psx_gpu->triangle_winding << 6))
   {
     triangle_case(up, up, up, negative):
     triangle_case(up, up, flat, negative):
@@ -2924,7 +3093,31 @@ void render_triangle(psx_gpu_struct *psx_gpu, vertex_struct *vertexes,
       break;
   }
 
+#ifdef PROFILE
   spans += psx_gpu->num_spans;
+#endif
+
+  if(unlikely(psx_gpu->render_mode & RENDER_INTERLACE_ENABLED))
+  {
+    u32 i;
+
+    if(psx_gpu->render_mode & RENDER_INTERLACE_ODD)
+    {
+      for(i = 0; i < psx_gpu->num_spans; i++)
+      {
+        if((psx_gpu->span_edge_data[i].y & 1) == 0)
+          psx_gpu->span_edge_data[i].num_blocks = 0;
+      }
+    }
+    else
+    {
+      for(i = 0; i < psx_gpu->num_spans; i++)
+      {
+        if(psx_gpu->span_edge_data[i].y & 1)
+          psx_gpu->span_edge_data[i].num_blocks = 0;
+      }
+    }
+  }
 
   u32 render_state = flags &
    (RENDER_FLAGS_MODULATE_TEXELS | RENDER_FLAGS_BLEND | 
@@ -2936,7 +3129,9 @@ void render_triangle(psx_gpu_struct *psx_gpu, vertex_struct *vertexes,
   {
     psx_gpu->render_state = render_state;
     flush_render_block_buffer(psx_gpu);
+#ifdef PROFILE
     state_changes++;
+#endif
   }
 
   psx_gpu->primitive_type = PRIMITIVE_TYPE_TRIANGLE;
@@ -2947,6 +3142,14 @@ void render_triangle(psx_gpu_struct *psx_gpu, vertex_struct *vertexes,
    (psx_gpu);
 }
 
+void render_triangle(psx_gpu_struct *psx_gpu, vertex_struct *vertexes,
+ u32 flags)
+{
+  vertex_struct *vertex_ptrs[3];
+  if (prepare_triangle(psx_gpu, vertexes, vertex_ptrs))
+    render_triangle_p(psx_gpu, vertex_ptrs, flags);
+}
+
 
 void texture_sprite_blocks_8bpp(psx_gpu_struct *psx_gpu);
 
@@ -2982,14 +3185,17 @@ void texture_sprite_blocks_8bpp(psx_gpu_struct *psx_gpu)
 #endif
 
 
-#define setup_sprite_tiled_initialize_4bpp()                                   \
+#define setup_sprite_tiled_initialize_4bpp_clut()                              \
   u16 *clut_ptr = psx_gpu->clut_ptr;                                           \
   vec_8x16u clut_a, clut_b;                                                    \
   vec_16x8u clut_low, clut_high;                                               \
                                                                                \
   load_8x16b(clut_a, clut_ptr);                                                \
   load_8x16b(clut_b, clut_ptr + 8);                                            \
-  unzip_16x8b(clut_low, clut_high, clut_a, clut_b);                            \
+  unzip_16x8b(clut_low, clut_high, clut_a, clut_b)                             \
+
+#define setup_sprite_tiled_initialize_4bpp()                                   \
+  setup_sprite_tiled_initialize_4bpp_clut();                                   \
                                                                                \
   if(psx_gpu->current_texture_mask & psx_gpu->dirty_textures_4bpp_mask)        \
     update_texture_4bpp_cache(psx_gpu)                                         \
@@ -3006,10 +3212,6 @@ void texture_sprite_blocks_8bpp(psx_gpu_struct *psx_gpu)
   load_64b(texels, texture_block_ptr)                                          \
 
 
-#define setup_sprite_tile_setup_block_yes(side, offset, texture_mode)          \
-
-#define setup_sprite_tile_setup_block_no(side, offset, texture_mode)           \
-
 #define setup_sprite_tile_add_blocks(tile_num_blocks)                          \
   num_blocks += tile_num_blocks;                                               \
   sprite_blocks += tile_num_blocks;                                            \
@@ -3155,34 +3357,36 @@ void texture_sprite_blocks_8bpp(psx_gpu_struct *psx_gpu)
 #define setup_sprite_tile_column_edge_post_adjust_full(edge)                   \
 
 
-#define setup_sprite_tile_column_height_single(edge_mode, edge, texture_mode)  \
+#define setup_sprite_tile_column_height_single(edge_mode, edge, texture_mode,  \
+ x4mode)                                                                       \
 do                                                                             \
 {                                                                              \
   sub_tile_height = column_data;                                               \
-  setup_sprite_tile_column_edge_pre_adjust_##edge_mode(edge);                  \
-  setup_sprite_tile_##edge_mode##_##texture_mode(edge);                        \
-  setup_sprite_tile_column_edge_post_adjust_##edge_mode(edge);                 \
+  setup_sprite_tile_column_edge_pre_adjust_##edge_mode##x4mode(edge);          \
+  setup_sprite_tile_##edge_mode##_##texture_mode##x4mode(edge);                \
+  setup_sprite_tile_column_edge_post_adjust_##edge_mode##x4mode(edge);         \
 } while(0)                                                                     \
 
-#define setup_sprite_tile_column_height_multi(edge_mode, edge, texture_mode)   \
+#define setup_sprite_tile_column_height_multi(edge_mode, edge, texture_mode,   \
+ x4mode)                                                                       \
 do                                                                             \
 {                                                                              \
   u32 tiles_remaining = column_data >> 16;                                     \
   sub_tile_height = column_data & 0xFF;                                        \
-  setup_sprite_tile_column_edge_pre_adjust_##edge_mode(edge);                  \
-  setup_sprite_tile_##edge_mode##_##texture_mode(edge);                        \
+  setup_sprite_tile_column_edge_pre_adjust_##edge_mode##x4mode(edge);          \
+  setup_sprite_tile_##edge_mode##_##texture_mode##x4mode(edge);                \
   tiles_remaining -= 1;                                                        \
                                                                                \
   while(tiles_remaining)                                                       \
   {                                                                            \
     sub_tile_height = 16;                                                      \
-    setup_sprite_tile_##edge_mode##_##texture_mode(edge);                      \
+    setup_sprite_tile_##edge_mode##_##texture_mode##x4mode(edge);              \
     tiles_remaining--;                                                         \
   }                                                                            \
                                                                                \
   sub_tile_height = (column_data >> 8) & 0xFF;                                 \
-  setup_sprite_tile_##edge_mode##_##texture_mode(edge);                        \
-  setup_sprite_tile_column_edge_post_adjust_##edge_mode(edge);                 \
+  setup_sprite_tile_##edge_mode##_##texture_mode##x4mode(edge);                \
+  setup_sprite_tile_column_edge_post_adjust_##edge_mode##x4mode(edge);         \
 } while(0)                                                                     \
 
 
@@ -3195,15 +3399,18 @@ do                                                                             \
   column_data |= (tile_height - 1) << 16                                       \
 
 
+#define RIGHT_MASK_BIT_SHIFT 8
+#define RIGHT_MASK_BIT_SHIFT_4x 16
+
 #define setup_sprite_tile_column_width_single(texture_mode, multi_height,      \
- edge_mode, edge)                                                              \
+ edge_mode, edge, x4mode)                                                      \
 {                                                                              \
   setup_sprite_column_data_##multi_height();                                   \
   left_mask_bits = left_block_mask | right_block_mask;                         \
-  right_mask_bits = left_mask_bits >> 8;                                       \
+  right_mask_bits = left_mask_bits >> RIGHT_MASK_BIT_SHIFT##x4mode;            \
                                                                                \
   setup_sprite_tile_column_height_##multi_height(edge_mode, edge,              \
-   texture_mode);                                                              \
+   texture_mode, x4mode);                                                      \
 }                                                                              \
 
 #define setup_sprite_tiled_advance_column()                                    \
@@ -3211,18 +3418,22 @@ do                                                                             \
   if((texture_offset_base & 0xF00) == 0)                                       \
     texture_offset_base -= (0x100 + 0xF00)                                     \
 
+#define FB_PTR_MULTIPLIER 1
+#define FB_PTR_MULTIPLIER_4x 2
+
 #define setup_sprite_tile_column_width_multi(texture_mode, multi_height,       \
- left_mode, right_mode)                                                        \
+ left_mode, right_mode, x4mode)                                                \
 {                                                                              \
   setup_sprite_column_data_##multi_height();                                   \
-  s32 fb_ptr_advance_column = 16 - (1024 * height);                            \
+  s32 fb_ptr_advance_column = (16 - (1024 * height))                           \
+    * FB_PTR_MULTIPLIER##x4mode;                                               \
                                                                                \
   tile_width -= 2;                                                             \
   left_mask_bits = left_block_mask;                                            \
-  right_mask_bits = left_mask_bits >> 8;                                       \
+  right_mask_bits = left_mask_bits >> RIGHT_MASK_BIT_SHIFT##x4mode;            \
                                                                                \
   setup_sprite_tile_column_height_##multi_height(left_mode, right,             \
-   texture_mode);                                                              \
+   texture_mode, x4mode);                                                      \
   fb_ptr += fb_ptr_advance_column;                                             \
                                                                                \
   left_mask_bits = 0x00;                                                       \
@@ -3231,22 +3442,297 @@ do                                                                             \
   while(tile_width)                                                            \
   {                                                                            \
     setup_sprite_tiled_advance_column();                                       \
-    setup_sprite_tile_column_height_##multi_height(full, none, texture_mode);  \
+    setup_sprite_tile_column_height_##multi_height(full, none,                 \
+     texture_mode, x4mode);                                                    \
     fb_ptr += fb_ptr_advance_column;                                           \
     tile_width--;                                                              \
   }                                                                            \
                                                                                \
   left_mask_bits = right_block_mask;                                           \
-  right_mask_bits = left_mask_bits >> 8;                                       \
+  right_mask_bits = left_mask_bits >> RIGHT_MASK_BIT_SHIFT##x4mode;            \
                                                                                \
   setup_sprite_tiled_advance_column();                                         \
   setup_sprite_tile_column_height_##multi_height(right_mode, left,             \
-   texture_mode);                                                              \
+   texture_mode, x4mode);                                                      \
 }                                                                              \
 
 
-#define setup_sprite_tiled_builder(texture_mode)                               \
-void setup_sprite_##texture_mode(psx_gpu_struct *psx_gpu, s32 x, s32 y,        \
+/* 4x stuff */
+#define setup_sprite_tiled_initialize_4bpp_4x()                                \
+  setup_sprite_tiled_initialize_4bpp_clut()                                    \
+
+#define setup_sprite_tiled_initialize_8bpp_4x()                                \
+
+
+#define setup_sprite_tile_full_4bpp_4x(edge)                                   \
+{                                                                              \
+  vec_8x8u texels_low, texels_high;                                            \
+  vec_8x16u pixels, pixels_wide;                                               \
+  setup_sprite_tile_add_blocks(sub_tile_height * 2 * 4);                       \
+  u32 left_mask_bits_a = left_mask_bits & 0xFF;                                \
+  u32 left_mask_bits_b = left_mask_bits >> 8;                                  \
+  u32 right_mask_bits_a = right_mask_bits & 0xFF;                              \
+  u32 right_mask_bits_b = right_mask_bits >> 8;                                \
+                                                                               \
+  while(sub_tile_height)                                                       \
+  {                                                                            \
+    setup_sprite_tile_fetch_texel_block_8bpp(0);                               \
+    tbl_16(texels_low, texels, clut_low);                                      \
+    tbl_16(texels_high, texels, clut_high);                                    \
+    zip_8x16b(pixels, texels_low, texels_high);                                \
+                                                                               \
+    zip_4x32b(vector_cast(vec_4x32u, pixels_wide), pixels.low, pixels.low);    \
+    block->texels = pixels_wide;                                               \
+    block->draw_mask_bits = left_mask_bits_a;                                  \
+    block->fb_ptr = fb_ptr;                                                    \
+    block++;                                                                   \
+                                                                               \
+    block->texels = pixels_wide;                                               \
+    block->draw_mask_bits = left_mask_bits_a;                                  \
+    block->fb_ptr = fb_ptr + 1024;                                             \
+    block++;                                                                   \
+                                                                               \
+    zip_4x32b(vector_cast(vec_4x32u, pixels_wide), pixels.high, pixels.high);  \
+    block->texels = pixels_wide;                                               \
+    block->draw_mask_bits = left_mask_bits_b;                                  \
+    block->fb_ptr = fb_ptr + 8;                                                \
+    block++;                                                                   \
+                                                                               \
+    block->texels = pixels_wide;                                               \
+    block->draw_mask_bits = left_mask_bits_b;                                  \
+    block->fb_ptr = fb_ptr + 1024 + 8;                                         \
+    block++;                                                                   \
+                                                                               \
+    setup_sprite_tile_fetch_texel_block_8bpp(8);                               \
+    tbl_16(texels_low, texels, clut_low);                                      \
+    tbl_16(texels_high, texels, clut_high);                                    \
+    zip_8x16b(pixels, texels_low, texels_high);                                \
+                                                                               \
+    zip_4x32b(vector_cast(vec_4x32u, pixels_wide), pixels.low, pixels.low);    \
+    block->texels = pixels_wide;                                               \
+    block->draw_mask_bits = right_mask_bits_a;                                 \
+    block->fb_ptr = fb_ptr + 16;                                               \
+    block++;                                                                   \
+                                                                               \
+    block->texels = pixels_wide;                                               \
+    block->draw_mask_bits = right_mask_bits_a;                                 \
+    block->fb_ptr = fb_ptr + 1024 + 16;                                        \
+    block++;                                                                   \
+                                                                               \
+    zip_4x32b(vector_cast(vec_4x32u, pixels_wide), pixels.high, pixels.high);  \
+    block->texels = pixels_wide;                                               \
+    block->draw_mask_bits = right_mask_bits_b;                                 \
+    block->fb_ptr = fb_ptr + 24;                                               \
+    block++;                                                                   \
+                                                                               \
+    block->texels = pixels_wide;                                               \
+    block->draw_mask_bits = right_mask_bits_b;                                 \
+    block->fb_ptr = fb_ptr + 1024 + 24;                                        \
+    block++;                                                                   \
+                                                                               \
+    fb_ptr += 2048;                                                            \
+    texture_offset += 0x10;                                                    \
+    sub_tile_height--;                                                         \
+  }                                                                            \
+  texture_offset += 0xF00;                                                     \
+  psx_gpu->num_blocks = num_blocks;                                            \
+}                                                                              \
+
+#define setup_sprite_tile_half_4bpp_4x(edge)                                   \
+{                                                                              \
+  vec_8x8u texels_low, texels_high;                                            \
+  vec_8x16u pixels, pixels_wide;                                               \
+  setup_sprite_tile_add_blocks(sub_tile_height * 4);                           \
+  u32 edge##_mask_bits_a = edge##_mask_bits & 0xFF;                            \
+  u32 edge##_mask_bits_b = edge##_mask_bits >> 8;                              \
+                                                                               \
+  while(sub_tile_height)                                                       \
+  {                                                                            \
+    setup_sprite_tile_fetch_texel_block_8bpp(0);                               \
+    tbl_16(texels_low, texels, clut_low);                                      \
+    tbl_16(texels_high, texels, clut_high);                                    \
+    zip_8x16b(pixels, texels_low, texels_high);                                \
+                                                                               \
+    zip_4x32b(vector_cast(vec_4x32u, pixels_wide), pixels.low, pixels.low);    \
+    block->texels = pixels_wide;                                               \
+    block->draw_mask_bits = edge##_mask_bits_a;                                \
+    block->fb_ptr = fb_ptr;                                                    \
+    block++;                                                                   \
+                                                                               \
+    block->texels = pixels_wide;                                               \
+    block->draw_mask_bits = edge##_mask_bits_a;                                \
+    block->fb_ptr = fb_ptr + 1024;                                             \
+    block++;                                                                   \
+                                                                               \
+    zip_4x32b(vector_cast(vec_4x32u, pixels_wide), pixels.high, pixels.high);  \
+    block->texels = pixels_wide;                                               \
+    block->draw_mask_bits = edge##_mask_bits_b;                                \
+    block->fb_ptr = fb_ptr + 8;                                                \
+    block++;                                                                   \
+                                                                               \
+    block->texels = pixels_wide;                                               \
+    block->draw_mask_bits = edge##_mask_bits_b;                                \
+    block->fb_ptr = fb_ptr + 1024 + 8;                                         \
+    block++;                                                                   \
+                                                                               \
+    fb_ptr += 2048;                                                            \
+    texture_offset += 0x10;                                                    \
+    sub_tile_height--;                                                         \
+  }                                                                            \
+  texture_offset += 0xF00;                                                     \
+  psx_gpu->num_blocks = num_blocks;                                            \
+}                                                                              \
+
+  
+#define setup_sprite_tile_full_8bpp_4x(edge)                                   \
+{                                                                              \
+  setup_sprite_tile_add_blocks(sub_tile_height * 2 * 4);                       \
+  vec_16x8u texels_wide;                                                       \
+  u32 left_mask_bits_a = left_mask_bits & 0xFF;                                \
+  u32 left_mask_bits_b = left_mask_bits >> 8;                                  \
+  u32 right_mask_bits_a = right_mask_bits & 0xFF;                              \
+  u32 right_mask_bits_b = right_mask_bits >> 8;                                \
+                                                                               \
+  while(sub_tile_height)                                                       \
+  {                                                                            \
+    setup_sprite_tile_fetch_texel_block_8bpp(0);                               \
+    zip_8x16b(vector_cast(vec_8x16u, texels_wide), texels, texels);            \
+    block->r = texels_wide.low;                                                \
+    block->draw_mask_bits = left_mask_bits_a;                                  \
+    block->fb_ptr = fb_ptr;                                                    \
+    block++;                                                                   \
+                                                                               \
+    block->r = texels_wide.low;                                                \
+    block->draw_mask_bits = left_mask_bits_a;                                  \
+    block->fb_ptr = fb_ptr + 1024;                                             \
+    block++;                                                                   \
+                                                                               \
+    block->r = texels_wide.high;                                               \
+    block->draw_mask_bits = left_mask_bits_b;                                  \
+    block->fb_ptr = fb_ptr + 8;                                                \
+    block++;                                                                   \
+                                                                               \
+    block->r = texels_wide.high;                                               \
+    block->draw_mask_bits = left_mask_bits_b;                                  \
+    block->fb_ptr = fb_ptr + 1024 + 8;                                         \
+    block++;                                                                   \
+                                                                               \
+    setup_sprite_tile_fetch_texel_block_8bpp(8);                               \
+    zip_8x16b(vector_cast(vec_8x16u, texels_wide), texels, texels);            \
+    block->r = texels_wide.low;                                                \
+    block->draw_mask_bits = right_mask_bits_a;                                 \
+    block->fb_ptr = fb_ptr + 16;                                               \
+    block++;                                                                   \
+                                                                               \
+    block->r = texels_wide.low;                                                \
+    block->draw_mask_bits = right_mask_bits_a;                                 \
+    block->fb_ptr = fb_ptr + 1024 + 16;                                        \
+    block++;                                                                   \
+                                                                               \
+    block->r = texels_wide.high;                                               \
+    block->draw_mask_bits = right_mask_bits_b;                                 \
+    block->fb_ptr = fb_ptr + 24;                                               \
+    block++;                                                                   \
+                                                                               \
+    block->r = texels_wide.high;                                               \
+    block->draw_mask_bits = right_mask_bits_b;                                 \
+    block->fb_ptr = fb_ptr + 24 + 1024;                                        \
+    block++;                                                                   \
+                                                                               \
+    fb_ptr += 2048;                                                            \
+    texture_offset += 0x10;                                                    \
+    sub_tile_height--;                                                         \
+  }                                                                            \
+  texture_offset += 0xF00;                                                     \
+  psx_gpu->num_blocks = num_blocks;                                            \
+}                                                                              \
+
+#define setup_sprite_tile_half_8bpp_4x(edge)                                   \
+{                                                                              \
+  setup_sprite_tile_add_blocks(sub_tile_height * 4);                           \
+  vec_16x8u texels_wide;                                                       \
+  u32 edge##_mask_bits_a = edge##_mask_bits & 0xFF;                            \
+  u32 edge##_mask_bits_b = edge##_mask_bits >> 8;                              \
+                                                                               \
+  while(sub_tile_height)                                                       \
+  {                                                                            \
+    setup_sprite_tile_fetch_texel_block_8bpp(0);                               \
+    zip_8x16b(vector_cast(vec_8x16u, texels_wide), texels, texels);            \
+    block->r = texels_wide.low;                                                \
+    block->draw_mask_bits = edge##_mask_bits_a;                                \
+    block->fb_ptr = fb_ptr;                                                    \
+    block++;                                                                   \
+                                                                               \
+    block->r = texels_wide.low;                                                \
+    block->draw_mask_bits = edge##_mask_bits_a;                                \
+    block->fb_ptr = fb_ptr + 1024;                                             \
+    block++;                                                                   \
+                                                                               \
+    block->r = texels_wide.high;                                               \
+    block->draw_mask_bits = edge##_mask_bits_b;                                \
+    block->fb_ptr = fb_ptr + 8;                                                \
+    block++;                                                                   \
+                                                                               \
+    block->r = texels_wide.high;                                               \
+    block->draw_mask_bits = edge##_mask_bits_b;                                \
+    block->fb_ptr = fb_ptr + 8 + 1024;                                         \
+    block++;                                                                   \
+                                                                               \
+    fb_ptr += 2048;                                                            \
+    texture_offset += 0x10;                                                    \
+    sub_tile_height--;                                                         \
+  }                                                                            \
+  texture_offset += 0xF00;                                                     \
+  psx_gpu->num_blocks = num_blocks;                                            \
+}                                                                              \
+
+  
+#define setup_sprite_tile_column_edge_pre_adjust_half_right_4x()               \
+  texture_offset = texture_offset_base + 8;                                    \
+  fb_ptr += 16                                                                 \
+
+#define setup_sprite_tile_column_edge_pre_adjust_half_left_4x()                \
+  texture_offset = texture_offset_base                                         \
+
+#define setup_sprite_tile_column_edge_pre_adjust_half_4x(edge)                 \
+  setup_sprite_tile_column_edge_pre_adjust_half_##edge##_4x()                  \
+
+#define setup_sprite_tile_column_edge_pre_adjust_full_4x(edge)                 \
+  texture_offset = texture_offset_base                                         \
+
+#define setup_sprite_tile_column_edge_post_adjust_half_right_4x()              \
+  fb_ptr -= 16                                                                 \
+
+#define setup_sprite_tile_column_edge_post_adjust_half_left_4x()               \
+
+#define setup_sprite_tile_column_edge_post_adjust_half_4x(edge)                \
+  setup_sprite_tile_column_edge_post_adjust_half_##edge##_4x()                 \
+
+#define setup_sprite_tile_column_edge_post_adjust_full_4x(edge)                \
+
+
+#define setup_sprite_offset_u_adjust()                                         \
+
+#define setup_sprite_comapre_left_block_mask()                                 \
+  ((left_block_mask & 0xFF) == 0xFF)                                           \
+
+#define setup_sprite_comapre_right_block_mask()                                \
+  (((right_block_mask >> 8) & 0xFF) == 0xFF)                                   \
+
+
+#define setup_sprite_offset_u_adjust_4x()                                      \
+  offset_u *= 2;                                                               \
+  offset_u_right = offset_u_right * 2 + 1                                      \
+
+#define setup_sprite_comapre_left_block_mask_4x()                              \
+  ((left_block_mask & 0xFFFF) == 0xFFFF)                                       \
+
+#define setup_sprite_comapre_right_block_mask_4x()                             \
+  (((right_block_mask >> 16) & 0xFFFF) == 0xFFFF)                              \
+
+
+#define setup_sprite_tiled_builder(texture_mode, x4mode)                       \
+void setup_sprite_##texture_mode##x4mode(psx_gpu_struct *psx_gpu, s32 x, s32 y,\
  s32 u, s32 v, s32 width, s32 height, u32 color)                               \
 {                                                                              \
   s32 offset_u = u & 0xF;                                                      \
@@ -3258,8 +3744,10 @@ void setup_sprite_##texture_mode(psx_gpu_struct *psx_gpu, s32 x, s32 y,        \
   s32 tile_width = width_rounded / 16;                                         \
   u32 offset_u_right = width_rounded & 0xF;                                    \
                                                                                \
-  u32 left_block_mask = ~(0xFFFF << offset_u);                                 \
-  u32 right_block_mask = 0xFFFE << offset_u_right;                             \
+  setup_sprite_offset_u_adjust##x4mode();                                      \
+                                                                               \
+  u32 left_block_mask = ~(0xFFFFFFFF << offset_u);                             \
+  u32 right_block_mask = 0xFFFFFFFE << offset_u_right;                         \
                                                                                \
   u32 left_mask_bits;                                                          \
   u32 right_mask_bits;                                                         \
@@ -3276,19 +3764,19 @@ void setup_sprite_##texture_mode(psx_gpu_struct *psx_gpu, s32 x, s32 y,        \
   u32 texture_offset_base = texture_offset;                                    \
   u32 control_mask;                                                            \
                                                                                \
-  u16 *fb_ptr = psx_gpu->vram_ptr + (y * 1024) + (x - offset_u);               \
+  u16 *fb_ptr = psx_gpu->vram_out_ptr + (y * 1024) + (x - offset_u);           \
   u32 num_blocks = psx_gpu->num_blocks;                                        \
   block_struct *block = psx_gpu->blocks + num_blocks;                          \
                                                                                \
   u16 *texture_block_ptr;                                                      \
   vec_8x8u texels;                                                             \
                                                                                \
-  setup_sprite_tiled_initialize_##texture_mode();                              \
+  setup_sprite_tiled_initialize_##texture_mode##x4mode();                      \
                                                                                \
   control_mask = tile_width == 1;                                              \
   control_mask |= (tile_height == 1) << 1;                                     \
-  control_mask |= ((left_block_mask & 0xFF) == 0xFF) << 2;                     \
-  control_mask |= (((right_block_mask >> 8) & 0xFF) == 0xFF) << 3;             \
+  control_mask |= setup_sprite_comapre_left_block_mask##x4mode() << 2;         \
+  control_mask |= setup_sprite_comapre_right_block_mask##x4mode() << 3;        \
                                                                                \
   sprites_##texture_mode++;                                                    \
                                                                                \
@@ -3296,64 +3784,77 @@ void setup_sprite_##texture_mode(psx_gpu_struct *psx_gpu, s32 x, s32 y,        \
   {                                                                            \
     default:                                                                   \
     case 0x0:                                                                  \
-      setup_sprite_tile_column_width_multi(texture_mode, multi, full, full);   \
+      setup_sprite_tile_column_width_multi(texture_mode, multi, full, full,    \
+       x4mode);                                                                \
       break;                                                                   \
                                                                                \
     case 0x1:                                                                  \
-      setup_sprite_tile_column_width_single(texture_mode, multi, full, none);  \
+      setup_sprite_tile_column_width_single(texture_mode, multi, full, none,   \
+       x4mode);                                                                \
       break;                                                                   \
                                                                                \
     case 0x2:                                                                  \
-      setup_sprite_tile_column_width_multi(texture_mode, single, full, full);  \
+      setup_sprite_tile_column_width_multi(texture_mode, single, full, full,   \
+       x4mode);                                                                \
       break;                                                                   \
                                                                                \
     case 0x3:                                                                  \
-      setup_sprite_tile_column_width_single(texture_mode, single, full, none); \
+      setup_sprite_tile_column_width_single(texture_mode, single, full, none,  \
+       x4mode);                                                                \
       break;                                                                   \
                                                                                \
     case 0x4:                                                                  \
-      setup_sprite_tile_column_width_multi(texture_mode, multi, half, full);   \
+      setup_sprite_tile_column_width_multi(texture_mode, multi, half, full,    \
+       x4mode);                                                                \
       break;                                                                   \
                                                                                \
     case 0x5:                                                                  \
-      setup_sprite_tile_column_width_single(texture_mode, multi, half, right); \
+      setup_sprite_tile_column_width_single(texture_mode, multi, half, right,  \
+       x4mode);                                                                \
       break;                                                                   \
                                                                                \
     case 0x6:                                                                  \
-      setup_sprite_tile_column_width_multi(texture_mode, single, half, full);  \
+      setup_sprite_tile_column_width_multi(texture_mode, single, half, full,   \
+       x4mode);                                                                \
       break;                                                                   \
                                                                                \
     case 0x7:                                                                  \
-      setup_sprite_tile_column_width_single(texture_mode, single, half, right);\
+      setup_sprite_tile_column_width_single(texture_mode, single, half, right, \
+       x4mode);                                                                \
       break;                                                                   \
                                                                                \
     case 0x8:                                                                  \
-      setup_sprite_tile_column_width_multi(texture_mode, multi, full, half);   \
+      setup_sprite_tile_column_width_multi(texture_mode, multi, full, half,    \
+       x4mode);                                                                \
       break;                                                                   \
                                                                                \
     case 0x9:                                                                  \
-      setup_sprite_tile_column_width_single(texture_mode, multi, half, left);  \
+      setup_sprite_tile_column_width_single(texture_mode, multi, half, left,   \
+       x4mode);                                                                \
       break;                                                                   \
                                                                                \
     case 0xA:                                                                  \
-      setup_sprite_tile_column_width_multi(texture_mode, single, full, half);  \
+      setup_sprite_tile_column_width_multi(texture_mode, single, full, half,   \
+       x4mode);                                                                \
       break;                                                                   \
                                                                                \
     case 0xB:                                                                  \
-      setup_sprite_tile_column_width_single(texture_mode, single, half, left); \
+      setup_sprite_tile_column_width_single(texture_mode, single, half, left,  \
+       x4mode);                                                                \
       break;                                                                   \
                                                                                \
     case 0xC:                                                                  \
-      setup_sprite_tile_column_width_multi(texture_mode, multi, half, half);   \
+      setup_sprite_tile_column_width_multi(texture_mode, multi, half, half,    \
+       x4mode);                                                                \
       break;                                                                   \
                                                                                \
     case 0xE:                                                                  \
-      setup_sprite_tile_column_width_multi(texture_mode, single, half, half);  \
+      setup_sprite_tile_column_width_multi(texture_mode, single, half, half,   \
+       x4mode);                                                                \
       break;                                                                   \
   }                                                                            \
 }                                                                              \
 
-
 void setup_sprite_4bpp(psx_gpu_struct *psx_gpu, s32 x, s32 y, s32 u, s32 v,
  s32 width, s32 height, u32 color);
 void setup_sprite_8bpp(psx_gpu_struct *psx_gpu, s32 x, s32 y, s32 u, s32 v,
@@ -3361,9 +3862,19 @@ void setup_sprite_8bpp(psx_gpu_struct *psx_gpu, s32 x, s32 y, s32 u, s32 v,
 void setup_sprite_16bpp(psx_gpu_struct *psx_gpu, s32 x, s32 y, s32 u, s32 v,
  s32 width, s32 height, u32 color);
 
+void setup_sprite_4bpp_4x(psx_gpu_struct *psx_gpu, s32 x, s32 y, s32 u, s32 v,
+ s32 width, s32 height, u32 color);
+void setup_sprite_8bpp_4x(psx_gpu_struct *psx_gpu, s32 x, s32 y, s32 u, s32 v,
+ s32 width, s32 height, u32 color);
+void setup_sprite_16bpp_4x(psx_gpu_struct *psx_gpu, s32 x, s32 y, s32 u, s32 v,
+ s32 width, s32 height, u32 color);
+
 #ifndef NEON_BUILD
-setup_sprite_tiled_builder(4bpp);
-setup_sprite_tiled_builder(8bpp);
+setup_sprite_tiled_builder(4bpp,);
+setup_sprite_tiled_builder(8bpp,);
+
+setup_sprite_tiled_builder(4bpp,_4x);
+setup_sprite_tiled_builder(8bpp,_4x);
 
 void setup_sprite_16bpp(psx_gpu_struct *psx_gpu, s32 x, s32 y, s32 u,
  s32 v, s32 width, s32 height, u32 color)
@@ -3371,7 +3882,7 @@ void setup_sprite_16bpp(psx_gpu_struct *psx_gpu, s32 x, s32 y, s32 u,
   u32 left_offset = u & 0x7;
   u32 width_rounded = width + left_offset + 7;
 
-  u16 *fb_ptr = psx_gpu->vram_ptr + (y * 1024) + (x - left_offset);
+  u16 *fb_ptr = psx_gpu->vram_out_ptr + (y * 1024) + (s32)(x - left_offset);
   u32 right_width = width_rounded & 0x7;
   u32 block_width = width_rounded / 8;
   u32 fb_ptr_pitch = (1024 + 8) - (block_width * 8);
@@ -3493,7 +4004,7 @@ void setup_sprite_untextured(psx_gpu_struct *psx_gpu, s32 x, s32 y, s32 u,
 {
   u32 right_width = ((width - 1) & 0x7) + 1;
   u32 right_mask_bits = (0xFF << right_width);
-  u16 *fb_ptr = psx_gpu->vram_ptr + (y * 1024) + x;
+  u16 *fb_ptr = psx_gpu->vram_out_ptr + (y * 1024) + x;
   u32 block_width = (width + 7) / 8;
   u32 fb_ptr_pitch = 1024 - ((block_width - 1) * 8);
   u32 blocks_remaining;
@@ -3521,7 +4032,10 @@ void setup_sprite_untextured(psx_gpu_struct *psx_gpu, s32 x, s32 y, s32 u,
   {
     blocks_remaining = block_width - 1;
     num_blocks += block_width;
+
+#ifdef PROFILE
     sprite_blocks += block_width;
+#endif
 
     if(num_blocks > MAX_BLOCKS)
     {
@@ -3656,6 +4170,10 @@ void render_sprite(psx_gpu_struct *psx_gpu, s32 x, s32 y, u32 u, u32 v,
   s32 x_right = x + width - 1;
   s32 y_bottom = y + height - 1;
 
+#ifdef PROFILE
+  sprites++;
+#endif
+
   if(invalidate_texture_cache_region_viewport(psx_gpu, x, y, x_right,
    y_bottom) == 0)
   {
@@ -3687,23 +4205,25 @@ void render_sprite(psx_gpu_struct *psx_gpu, s32 x, s32 y, u32 u, u32 v,
   if((width <= 0) || (height <= 0))
     return;
 
-  sprites++;
-
+#ifdef PROFILE
   span_pixels += width * height;
   spans += height;
+#endif
 
   u32 render_state = flags &
    (RENDER_FLAGS_MODULATE_TEXELS | RENDER_FLAGS_BLEND |
    RENDER_FLAGS_TEXTURE_MAP);
   render_state |=
    (psx_gpu->render_state_base & ~RENDER_STATE_DITHER);
-  
+
   if((psx_gpu->render_state != render_state) ||
    (psx_gpu->primitive_type != PRIMITIVE_TYPE_SPRITE))
   {
     psx_gpu->render_state = render_state;
     flush_render_block_buffer(psx_gpu);
+#ifdef PROFILE
     state_changes++;
+#endif
   }
 
   psx_gpu->primitive_type = PRIMITIVE_TYPE_SPRITE;
@@ -3888,9 +4408,18 @@ void render_sprite(psx_gpu_struct *psx_gpu, s32 x, s32 y, u32 u, u32 v,
 #define set_line_gradients(minor)                                              \
 {                                                                              \
   s32 gradient_divisor = delta_##minor;                                        \
-  gradient_r = int_to_fixed(vertex_b->r - vertex_a->r) / gradient_divisor;     \
-  gradient_g = int_to_fixed(vertex_b->g - vertex_a->g) / gradient_divisor;     \
-  gradient_b = int_to_fixed(vertex_b->b - vertex_a->b) / gradient_divisor;     \
+  if(gradient_divisor != 0)                                                    \
+  {                                                                            \
+    gradient_r = int_to_fixed(vertex_b->r - vertex_a->r) / gradient_divisor;   \
+    gradient_g = int_to_fixed(vertex_b->g - vertex_a->g) / gradient_divisor;   \
+    gradient_b = int_to_fixed(vertex_b->b - vertex_a->b) / gradient_divisor;   \
+  }                                                                            \
+  else                                                                         \
+  {                                                                            \
+    gradient_r = 0;                                                            \
+    gradient_g = 0;                                                            \
+    gradient_b = 0;                                                            \
+  }                                                                            \
   current_r = fixed_center(vertex_a->r);                                       \
   current_g = fixed_center(vertex_a->g);                                       \
   current_b = fixed_center(vertex_a->b);                                       \
@@ -3958,9 +4487,6 @@ do                                                                             \
   {                                                                            \
     delta_y *= -1;                                                             \
                                                                                \
-    if(delta_y >= 512)                                                         \
-      return;                                                                  \
-                                                                               \
     if(delta_x > delta_y)                                                      \
     {                                                                          \
       draw_line_span_horizontal(decrement, shading, blending, dithering,       \
@@ -3974,9 +4500,6 @@ do                                                                             \
   }                                                                            \
   else                                                                         \
   {                                                                            \
-    if(delta_y >= 512)                                                         \
-      return;                                                                  \
-                                                                               \
     if(delta_x > delta_y)                                                      \
     {                                                                          \
       draw_line_span_horizontal(increment, shading, blending, dithering,       \
@@ -3991,7 +4514,7 @@ do                                                                             \
 
                                                                                 
 void render_line(psx_gpu_struct *psx_gpu, vertex_struct *vertexes, u32 flags,
- u32 color)
+ u32 color, int double_resolution)
 {
   s32 color_r, color_g, color_b;
   u32 triangle_winding = 0;
@@ -4025,7 +4548,9 @@ void render_line(psx_gpu_struct *psx_gpu, vertex_struct *vertexes, u32 flags,
 
   u32 control_mask;
 
+#ifdef PROFILE
   lines++;
+#endif
 
   if(vertex_a->x >= vertex_b->x)
   {
@@ -4041,12 +4566,22 @@ void render_line(psx_gpu_struct *psx_gpu, vertex_struct *vertexes, u32 flags,
   delta_x = x_b - x_a;
   delta_y = y_b - y_a;
 
-  if(delta_x >= 1024)
+  if(delta_x >= 1024 || delta_y >= 512 || delta_y <= -512)
     return;
 
+  if(double_resolution)
+  {
+    x_a *= 2;
+    x_b *= 2;
+    y_a *= 2;
+    y_b *= 2;
+    delta_x *= 2;
+    delta_y *= 2;
+  }
+
   flags &= ~RENDER_FLAGS_TEXTURE_MAP;
 
-  vram_ptr = psx_gpu->vram_ptr + (y_a * 1024) + x_a;
+  vram_ptr = psx_gpu->vram_out_ptr + (y_a * 1024) + x_a;
 
   control_mask = 0x0;
 
@@ -4233,32 +4768,97 @@ void render_line(psx_gpu_struct *psx_gpu, vertex_struct *vertexes, u32 flags,
 void render_block_fill(psx_gpu_struct *psx_gpu, u32 color, u32 x, u32 y,
  u32 width, u32 height)
 {
+  if((width == 0) || (height == 0))
+    return;
+
   invalidate_texture_cache_region(psx_gpu, x, y, x + width - 1, y + height - 1);
 
-#ifndef NEON_BUILD
   u32 r = color & 0xFF;
   u32 g = (color >> 8) & 0xFF;
   u32 b = (color >> 16) & 0xFF;
-  u32 color_16bpp = (r >> 3) | ((g >> 3) << 5) | ((b >> 3) << 10);
+  u32 color_16bpp = (r >> 3) | ((g >> 3) << 5) | ((b >> 3) << 10) |
+   psx_gpu->mask_msb;
+  u32 color_32bpp = color_16bpp | (color_16bpp << 16);
 
-  u16 *vram_ptr = psx_gpu->vram_ptr + x + (y * 1024);
-  u32 draw_x, draw_y;
+  u32 *vram_ptr = (u32 *)(psx_gpu->vram_out_ptr + x + (y * 1024));
 
-  for(draw_y = 0; draw_y < height; draw_y++)
+  u32 pitch = 512 - (width / 2);
+  u32 num_width;
+
+  if(psx_gpu->render_mode & RENDER_INTERLACE_ENABLED)
   {
-    for(draw_x = 0; draw_x < width; draw_x++)
+    pitch += 512;
+    height /= 2;
+
+    if(psx_gpu->render_mode & RENDER_INTERLACE_ODD)
+      vram_ptr += 512; 
+  }
+
+  while(height)
+  {
+    num_width = width;
+    while(num_width)
     {
-      vram_ptr[draw_x] = color_16bpp;
+      vram_ptr[0] = color_32bpp;
+      vram_ptr[1] = color_32bpp;
+      vram_ptr[2] = color_32bpp;
+      vram_ptr[3] = color_32bpp;
+      vram_ptr[4] = color_32bpp;
+      vram_ptr[5] = color_32bpp;
+      vram_ptr[6] = color_32bpp;
+      vram_ptr[7] = color_32bpp;
+
+      vram_ptr += 8;
+      num_width -= 16;
     }
 
-    vram_ptr += 1024;
+    vram_ptr += pitch;
+    height--;
   }
-#else
-  void render_block_fill_body(psx_gpu_struct *psx_gpu, u32 color, u32 x, u32 y,
-   u32 width, u32 height);
+}
 
-  render_block_fill_body(psx_gpu, color, x, y, width, height);
-#endif
+void render_block_fill_enh(psx_gpu_struct *psx_gpu, u32 color, u32 x, u32 y,
+ u32 width, u32 height)
+{
+  if((width == 0) || (height == 0))
+    return;
+
+  if(width > 1024)
+    width = 1024;
+
+  u32 r = color & 0xFF;
+  u32 g = (color >> 8) & 0xFF;
+  u32 b = (color >> 16) & 0xFF;
+  u32 color_16bpp = (r >> 3) | ((g >> 3) << 5) | ((b >> 3) << 10) |
+   psx_gpu->mask_msb;
+  u32 color_32bpp = color_16bpp | (color_16bpp << 16);
+
+  u32 *vram_ptr = (u32 *)(psx_gpu->vram_out_ptr + x + (y * 1024));
+
+  u32 pitch = 1024 / 2 - (width / 2);
+  u32 num_width;
+
+  while(height)
+  {
+    num_width = width;
+    while(num_width)
+    {
+      vram_ptr[0] = color_32bpp;
+      vram_ptr[1] = color_32bpp;
+      vram_ptr[2] = color_32bpp;
+      vram_ptr[3] = color_32bpp;
+      vram_ptr[4] = color_32bpp;
+      vram_ptr[5] = color_32bpp;
+      vram_ptr[6] = color_32bpp;
+      vram_ptr[7] = color_32bpp;
+
+      vram_ptr += 8;
+      num_width -= 16;
+    }
+
+    vram_ptr += pitch;
+    height--;
+  }
 }
 
 void render_block_copy(psx_gpu_struct *psx_gpu, u16 *source, u32 x, u32 y,
@@ -4266,16 +4866,19 @@ void render_block_copy(psx_gpu_struct *psx_gpu, u16 *source, u32 x, u32 y,
 {
   u16 *vram_ptr = psx_gpu->vram_ptr + x + (y * 1024);
   u32 draw_x, draw_y;
+  u32 mask_msb = psx_gpu->mask_msb;
 
-  invalidate_texture_cache_region(psx_gpu, x, y, x + width - 1, y + height - 1);
+  if((width == 0) || (height == 0))
+    return;
 
-  //printf("copy for %d, %d\n", width, height);
+  flush_render_block_buffer(psx_gpu);
+  invalidate_texture_cache_region(psx_gpu, x, y, x + width - 1, y + height - 1);
 
   for(draw_y = 0; draw_y < height; draw_y++)
   {
     for(draw_x = 0; draw_x < width; draw_x++)
     {
-      vram_ptr[draw_x] = source[draw_x];
+      vram_ptr[draw_x] = source[draw_x] | mask_msb;
     }
 
     source += pitch;
@@ -4298,16 +4901,17 @@ void initialize_reciprocal_table(void)
   u32 height_reciprocal;
   s32 shift;
 
-  for(height = 1; height < 512; height++)
+  for(height = 1; height < sizeof(reciprocal_table)
+       / sizeof(reciprocal_table[0]); height++)
   {
     shift = __builtin_clz(height);
     height_normalized = height << shift;
-    height_reciprocal = ((1ULL << 50) + (height_normalized - 1)) /
+    height_reciprocal = ((1ULL << 52) + (height_normalized - 1)) /
      height_normalized;
 
-    shift = 32 - (50 - shift);
+    shift = 32 - (52 - shift);
 
-    reciprocal_table[height] = (height_reciprocal << 12) | shift;
+    reciprocal_table[height] = (height_reciprocal << 10) | shift;
   }
 }
 
@@ -4322,15 +4926,6 @@ void initialize_psx_gpu(psx_gpu_struct *psx_gpu, u16 *vram)
 
   psx_gpu->test_mask = test_mask;
 
-  psx_gpu->pixel_count_mode = 0;
-  psx_gpu->pixel_compare_mode = 0;
-
-  psx_gpu->vram_pixel_counts_a = malloc(sizeof(u8) * 1024 * 512);
-  psx_gpu->vram_pixel_counts_b = malloc(sizeof(u8) * 1024 * 512);
-  memset(psx_gpu->vram_pixel_counts_a, 0, sizeof(u8) * 1024 * 512);
-  memset(psx_gpu->vram_pixel_counts_b, 0, sizeof(u8) * 1024 * 512);
-  psx_gpu->compare_vram = malloc(sizeof(u16) * 1024 * 512);
-
   psx_gpu->dirty_textures_4bpp_mask = 0xFFFFFFFF;
   psx_gpu->dirty_textures_8bpp_mask = 0xFFFFFFFF;
   psx_gpu->dirty_textures_8bpp_alternate_mask = 0xFFFFFFFF;
@@ -4344,14 +4939,24 @@ void initialize_psx_gpu(psx_gpu_struct *psx_gpu, u16 *vram)
   psx_gpu->render_state = 0;
   psx_gpu->render_state_base = 0;
   psx_gpu->num_blocks = 0;
+  psx_gpu->uvrgb_phase = 0x8000;
 
   psx_gpu->vram_ptr = vram;
+  psx_gpu->vram_out_ptr = vram;
 
+  psx_gpu->texture_page_base = psx_gpu->vram_ptr;
   psx_gpu->texture_page_ptr = psx_gpu->vram_ptr;
   psx_gpu->clut_ptr = psx_gpu->vram_ptr;
 
   psx_gpu->mask_msb = 0;
 
+  psx_gpu->texture_window_x = 0;
+  psx_gpu->texture_window_y = 0;
+  psx_gpu->texture_mask_width = 0xFF;
+  psx_gpu->texture_mask_height = 0xFF;
+
+  psx_gpu->render_mode = 0;
+
   memset(psx_gpu->vram_ptr, 0, sizeof(u16) * 1024 * 512);
 
   initialize_reciprocal_table();
@@ -4367,13 +4972,14 @@ void initialize_psx_gpu(psx_gpu_struct *psx_gpu, u16 *vram)
   // d1: (2 3 6 7): y0
   // d2: (4 5 6 7): x0 ^ y0
 
-
   psx_gpu->dither_table[0] = dither_table_row(-4, 0, -3, 1);
   psx_gpu->dither_table[1] = dither_table_row(2, -2, 3, -1);
   psx_gpu->dither_table[2] = dither_table_row(-3, 1, -4, 0);
   psx_gpu->dither_table[3] = dither_table_row(3, -1, 2, -2);
 
   psx_gpu->primitive_type = PRIMITIVE_TYPE_UNKNOWN;
+
+  psx_gpu->enhancement_x_threshold = 256;
 }
 
 u64 get_us(void)
@@ -4438,3 +5044,4 @@ void triangle_benchmark(psx_gpu_struct *psx_gpu)
 
 #endif
 
+#include "psx_gpu_4x.c"