drc/psx_gpu: handle more calling conventions
[pcsx_rearmed.git] / plugins / gpu_neon / psx_gpu / psx_gpu_arm_neon.S
index a2bfa5b..efb065d 100644 (file)
@@ -20,6 +20,7 @@
 #define RENDER_STATE_MASK_EVALUATE                        0x20
 #define RENDER_FLAGS_MODULATE_TEXELS                      0x1
 #define RENDER_FLAGS_BLEND                                0x2
+#define RENDER_INTERLACE_ENABLED                          0x1
 
 #include "psx_gpu_offsets.h"
 
@@ -30,6 +31,8 @@
 #define edge_data_right_mask_offset                       4
 #define edge_data_y_offset                                6
 
+.syntax unified
+.text
 
 #define psx_gpu                                           r0
 #define v_a                                               r1
 
 .align 4
 
-/* FIXME: users of this should be in psx_gpu instead */
-#ifndef __PIC__
-#define load_pointer(register, pointer)                                        \
-  movw register, :lower16:pointer;                                             \
-  movt register, :upper16:pointer;                                             \
+#ifndef __MACH__
 
-#else
-#define load_pointer(register, pointer)                                        \
-  ldr  register, =pointer                                                      \
+#define function(name)                                                         \
+  .global name;                                                                \
+  .type name, %function;                                                       \
+  name:                                                                        \
 
-#endif
+#define JT_OP_REL(table_label, index_reg, temp)
+#define JT_OP(x...) x
+#define JTE(start, target) target
+
+#define EXTRA_UNSAVED_REGS
+
+#else
 
 #define function(name)                                                         \
-  .global name;                                                                \
+  .globl _##name;                                                              \
   name:                                                                        \
+  _##name:                                                                     \
+
+// __MACH__ jump-table helpers: JTE stores each entry as (target - start), so
+// the table holds offsets instead of absolute addresses.
+// JT_OP_REL loads entry index_reg of table_label into temp and adds it to pc.
+// NOTE(review): the offsets are relative to the start label given to JTE, so
+// this presumably relies on pc reading as that label at the add (ARM state,
+// exactly one instruction between the add and the label) - confirm per use.
+#define JT_OP_REL(table_label, index_reg, temp)                                \
+  adr temp, table_label;                                                       \
+  ldr temp, [ temp, index_reg, lsl #2 ];                                       \
+  add pc, pc, temp                                                             \
+
+// JT_OP discards its argument in this variant, so nothing is emitted for it.
+#define JT_OP(x...)
+#define JTE(start, target) (target - start)
+
+// r7 is preserved, but it is added anyway for EABI stack alignment.
+#define EXTRA_UNSAVED_REGS r7, r9,
+
+#define flush_render_block_buffer _flush_render_block_buffer
+#define setup_sprite_untextured_simple _setup_sprite_untextured_simple
+#define update_texture_8bpp_cache _update_texture_8bpp_cache
+
+#endif
 
 @ r0: psx_gpu
 @ r1: v_a
@@ -575,7 +599,7 @@ function(compute_all_gradients)
   vld1.32 { uvrg }, [ temp ];                                                  \
   add temp, psx_gpu, #psx_gpu_uvrg_dy_offset;                                  \
   vld1.32 { uvrg_dy }, [ temp ];                                               \
-  load_pointer(reciprocal_table_ptr, reciprocal_table);                        \
+  ldr reciprocal_table_ptr, [ psx_gpu, #psx_gpu_reciprocal_table_ptr_offset ]; \
                                                                                \
   vmov.u32 c_0x01, #0x01                                                       \
 
@@ -623,7 +647,7 @@ function(compute_all_gradients)
 #define height_b_alt              r12
 
 #define compute_edge_delta_x3(start_c, height_a, height_b)                     \
-  vmov.u32 heights, height_a, height_b;                                        \
+  vmov heights, height_a, height_b;                                            \
   ldr temp, [ reciprocal_table_ptr, height_a, lsl #2 ];                        \
   vmov.u32 edge_shifts[0], temp;                                               \
   ldr temp, [ reciprocal_table_ptr, height_b, lsl #2 ];                        \
@@ -883,7 +907,7 @@ function(compute_all_gradients)
   add temp, temp, #(1 << 16);                                                  \
   add y_a, temp, #2;                                                           \
   add y_a, y_a, #(2 << 16);                                                    \
-  vmov.u32 y_x4, temp, y_a;                                                    \
+  vmov y_x4, temp, y_a;                                                        \
                                                                                \
   setup_spans_adjust_edges_alternate_##alternate_active(left_index,            \
    right_index);                                                               \
@@ -938,7 +962,7 @@ function(compute_all_gradients)
   sub temp, temp, #(1 << 16);                                                  \
   sub y_a, temp, #2;                                                           \
   sub y_a, y_a, #(2 << 16);                                                    \
-  vmov.u32 y_x4, temp, y_a;                                                    \
+  vmov y_x4, temp, y_a;                                                        \
                                                                                \
   vaddw.s32 edges_xy, edges_xy, edges_dx_dy;                                   \
                                                                                \
@@ -969,7 +993,7 @@ function(compute_all_gradients)
   sub height, y_a, y_c;                                                        \
                                                                                \
   vdup.u32 x_starts, x_a;                                                      \
-  vmov.u32 x_ends, x_c, x_b;                                                   \
+  vmov x_ends, x_c, x_b;                                                       \
                                                                                \
   compute_edge_delta_x3(x_b, height_major, height_minor_a);                    \
   setup_spans_up(major, minor, minor, yes);                                    \
@@ -981,8 +1005,6 @@ function(setup_spans_up_left)
 function(setup_spans_up_right)
   setup_spans_up_up(right, left)
 
-.pool
-
 #define setup_spans_down_down(minor, major)                                    \
   setup_spans_prologue();                                                      \
   sub height_minor_a, y_b, y_a;                                                \
@@ -990,7 +1012,7 @@ function(setup_spans_up_right)
   sub height, y_c, y_a;                                                        \
                                                                                \
   vdup.u32 x_starts, x_a;                                                      \
-  vmov.u32 x_ends, x_c, x_b;                                                   \
+  vmov x_ends, x_c, x_b;                                                       \
                                                                                \
   compute_edge_delta_x3(x_b, height_major, height_minor_a);                    \
   setup_spans_down(major, minor, minor, yes);                                  \
@@ -1013,7 +1035,7 @@ function(setup_spans_down_right)
 function(setup_spans_up_a)
   setup_spans_prologue()
 
-  vmov.u32 x_starts, x_a, x_b
+  vmov x_starts, x_a, x_b
   vdup.u32 x_ends, x_c
 
   setup_spans_up_flat()
@@ -1022,7 +1044,7 @@ function(setup_spans_up_b)
   setup_spans_prologue()
 
   vdup.u32 x_starts, x_a
-  vmov.u32 x_ends, x_b, x_c
+  vmov x_ends, x_b, x_c
 
   setup_spans_up_flat()
 
@@ -1036,7 +1058,7 @@ function(setup_spans_up_b)
 function(setup_spans_down_a)
   setup_spans_prologue()
 
-  vmov.u32 x_starts, x_a, x_b
+  vmov x_starts, x_a, x_b
   vdup.u32 x_ends, x_c
 
   setup_spans_down_flat()
@@ -1045,7 +1067,7 @@ function(setup_spans_down_b)
   setup_spans_prologue()
 
   vdup.u32 x_starts, x_a
-  vmov.u32 x_ends, x_b, x_c
+  vmov x_ends, x_b, x_c
 
   setup_spans_down_flat()
 
@@ -1076,13 +1098,13 @@ function(setup_spans_up_down)
   sub height_minor_b, y_c, y_a
   sub height_major, y_c, y_b
 
-  vmov.u32 x_starts, x_a, x_c
+  vmov x_starts, x_a, x_c
   vdup.u32 x_ends, x_b
 
   compute_edge_delta_x3(x_a, height_minor_a, height_major)
 
   mov temp, #0
-  vmov.u32 height_increment, temp, height_minor_b
+  vmov height_increment, temp, height_minor_b
   vmlal.s32 edges_xy, edges_dx_dy, height_increment
 
   vmov edges_xy_b_left, edge_alt_low, edge_alt_high
@@ -1119,7 +1141,7 @@ function(setup_spans_up_down)
   sub temp, temp, #(1 << 16)
   sub y_a, temp, #2
   sub y_a, y_a, #(2 << 16)
-  vmov.u32 y_x4, temp, y_a
+  vmov y_x4, temp, y_a
 
   vaddw.s32 edges_xy, edges_xy, edges_dx_dy
 
@@ -1169,12 +1191,16 @@ function(setup_spans_up_down)
   add temp, temp, #(1 << 16) 
   add y_a, temp, #2
   add y_a, y_a, #(2 << 16)
-  vmov.u32 y_x4, temp, y_a
+  vmov y_x4, temp, y_a
 
   setup_spans_adjust_edges_alternate_no(left, right)
 
   ldrh temp, [ psx_gpu, #psx_gpu_num_spans_offset ]
   add temp, temp, height_minor_b
+
+  cmp temp, #MAX_SPANS
+  beq 5f
+
   strh temp, [ psx_gpu, #psx_gpu_num_spans_offset ]
 
  2:                                                     
@@ -1190,7 +1216,14 @@ function(setup_spans_up_down)
   setup_spans_prologue_b()
   bal 4b
 
-.pool
+ 5:
+  // FIXME: overflow corner case
+  sub temp, temp, height_minor_b       // undo the add: temp = loaded num_spans
+  bics height_minor_b, #3              // clear low 2 bits; Z set if result is 0
+  add temp, temp, height_minor_b       // add truncated height; flags untouched
+  strh temp, [ psx_gpu, #psx_gpu_num_spans_offset ]
+  bne 2b                               // truncated height non-zero: back to 2:
+  bal 1b                               // truncated height zero: back to 1:
 
 #undef span_uvrg_offset
 #undef span_edge_data
@@ -1560,9 +1593,9 @@ function(setup_blocks_shaded_textured_dithered_##swizzling##_indirect)         \
   vpush { texture_mask };                                                      \
   vpush { uvrg_dx4 };                                                          \
                                                                                \
-  stmdb sp!, { r0 - r3, r12, r14 };                                            \
+  stmdb sp!, { r0 - r3, EXTRA_UNSAVED_REGS r12, r14 };                        \
   bl flush_render_block_buffer;                                                \
-  ldmia sp!, { r0 - r3, r12, r14 };                                            \
+  ldmia sp!, { r0 - r3, EXTRA_UNSAVED_REGS r12, r14 };                        \
                                                                                \
   vpop { uvrg_dx4 };                                                           \
   vpop { texture_mask };                                                       \
@@ -1757,9 +1790,9 @@ function(setup_blocks_unshaded_textured_dithered_##swizzling##_indirect)       \
   vpush { texture_mask };                                                      \
   vpush { uvrg_dx4 };                                                          \
                                                                                \
-  stmdb sp!, { r0 - r3, r12, r14 };                                            \
+  stmdb sp!, { r0 - r3, EXTRA_UNSAVED_REGS r12, r14 };                        \
   bl flush_render_block_buffer;                                                \
-  ldmia sp!, { r0 - r3, r12, r14 };                                            \
+  ldmia sp!, { r0 - r3, EXTRA_UNSAVED_REGS r12, r14 };                        \
                                                                                \
   vpop { uvrg_dx4 };                                                           \
   vpop { texture_mask };                                                       \
@@ -1873,9 +1906,9 @@ function(setup_blocks_unshaded_untextured_undithered_unswizzled_indirect)
  2:
   vpush { colors }
 
-  stmdb sp!, { r0 - r3, r12, r14 }
+  stmdb sp!, { r0 - r3, EXTRA_UNSAVED_REGS r12, r14 }
   bl flush_render_block_buffer
-  ldmia sp!, { r0 - r3, r12, r14 }
+  ldmia sp!, { r0 - r3, EXTRA_UNSAVED_REGS r12, r14 }
 
   vpop { colors }
 
@@ -1922,7 +1955,7 @@ function(setup_blocks_unshaded_untextured_undithered_unswizzled_direct)
   vdup.u16 colors, color
 
   add span_edge_data, psx_gpu, #psx_gpu_span_edge_data_offset
-  orr color, color, lsl #16
+  orr color, color, color, lsl #16
 
 
  0:
@@ -1964,7 +1997,7 @@ function(setup_blocks_unshaded_untextured_undithered_unswizzled_direct)
   moveq right_mask, right_mask, lsr #2
 
   tst right_mask, #0x1
-  streqh color, [ fb_ptr ]
+  strheq color, [ fb_ptr ]
 
  1:
   add span_edge_data, span_edge_data, #8
@@ -2288,9 +2321,9 @@ function(setup_blocks_shaded_untextured_##dithering##_unswizzled_indirect)     \
   /* TODO: Load from psx_gpu instead of saving/restoring these               */\
   vpush { rg_dx4 };                                                            \
                                                                                \
-  stmdb sp!, { r0 - r3, r12, r14 };                                            \
+  stmdb sp!, { r0 - r3, EXTRA_UNSAVED_REGS r12, r14 };                        \
   bl flush_render_block_buffer;                                                \
-  ldmia sp!, { r0 - r3, r12, r14 };                                            \
+  ldmia sp!, { r0 - r3, EXTRA_UNSAVED_REGS r12, r14 };                        \
                                                                                \
   vpop { rg_dx4 };                                                             \
                                                                                \
@@ -2493,17 +2526,19 @@ function(setup_blocks_shaded_untextured_##dithering##_unswizzled_direct)       \
   vmlal.u8 pixels, g_whole_8, d64_4;                                           \
   vmlal.u8 pixels, b_whole_8, d64_128;                                         \
                                                                                \
-  ldr pc, [ pc, right_mask, lsl #2 ];                                          \
+  JT_OP_REL(100f, right_mask, temp);                                           \
+  JT_OP(ldr pc, [ pc, right_mask, lsl #2 ]);                                   \
   nop;                                                                         \
+ 100:                                                                          \
   nop;                                                                         \
-  .word 4f;                                                                    \
-  .word 5f;                                                                    \
-  .word 6f;                                                                    \
-  .word 7f;                                                                    \
-  .word 8f;                                                                    \
-  .word 9f;                                                                    \
-  .word 10f;                                                                   \
-  .word 11f;                                                                   \
+  .word JTE(100b, 4f);                                                         \
+  .word JTE(100b, 5f);                                                         \
+  .word JTE(100b, 6f);                                                         \
+  .word JTE(100b, 7f);                                                         \
+  .word JTE(100b, 8f);                                                         \
+  .word JTE(100b, 9f);                                                         \
+  .word JTE(100b, 10f);                                                        \
+  .word JTE(100b, 11f);                                                        \
                                                                                \
  4:                                                                            \
   vst1.u16 { pixels_low[0] }, [ fb_ptr ];                                      \
@@ -2676,7 +2711,7 @@ function(texture_blocks_4bpp)
   orr pixels_a, pixels_a, pixel_3, lsl #24
 
   orr pixels_b, pixels_b, pixel_7, lsl #24
-  vmov.u32 texels, pixels_a, pixels_b
+  vmov texels, pixels_a, pixels_b
 
   vtbl.8 texels_low, { clut_low_a, clut_low_b }, texels
   vtbl.8 texels_high, { clut_high_a, clut_high_b }, texels
@@ -2779,11 +2814,11 @@ function(texture_blocks_8bpp)
   ldmia sp!, { r3 - r11, pc }
 
 1:
-  stmdb sp!, { r1 - r2, r12 }
+  stmdb sp!, { r1 - r2, EXTRA_UNSAVED_REGS r12 }
 
   bl update_texture_8bpp_cache
 
-  ldmia sp!, { r1 - r2, r12 }
+  ldmia sp!, { r1 - r2, EXTRA_UNSAVED_REGS r12 }
   bal 0b
 
 
@@ -4266,6 +4301,7 @@ function(blend_blocks_textured_unblended_on)
   beq 1f
 
  0:
+  vorr.u16 pixels, pixels, msb_mask
   vorr.u16 draw_mask, draw_mask, write_mask
   vbif.u16 fb_pixels, pixels, draw_mask
   vst1.u16 { fb_pixels }, [ fb_ptr ]
@@ -4280,6 +4316,7 @@ function(blend_blocks_textured_unblended_on)
   bne 0b
  
  1:
+  vorr.u16 pixels, pixels, msb_mask
   vorr.u16 draw_mask, draw_mask, write_mask
   vbif.u16 fb_pixels, pixels, draw_mask
   vst1.u16 { fb_pixels }, [ fb_ptr ]
@@ -4390,6 +4427,8 @@ function(render_block_fill_body)
 #define fb_ptr_advance_column                             r12
 #define texture_block_ptr                                 r14
 
+#define temp                                              r14
+
 #define texture_page_ptr                                  r3
 #define left_block_mask                                   r4
 #define right_block_mask                                  r5
@@ -4445,9 +4484,9 @@ function(render_block_fill_body)
 setup_sprite_flush_blocks:
   vpush { q1 - q5 }
 
-  stmdb sp!, { r0 - r3, r12, r14 }
+  stmdb sp!, { r0 - r3, EXTRA_UNSAVED_REGS r12, r14 }
   bl flush_render_block_buffer
-  ldmia sp!, { r0 - r3, r12, r14 }
+  ldmia sp!, { r0 - r3, EXTRA_UNSAVED_REGS r12, r14 }
 
   vpop { q1 - q5 }
 
@@ -4462,9 +4501,9 @@ setup_sprite_update_texture_4bpp_cache:
 
 
 setup_sprite_update_texture_8bpp_cache:
-  stmdb sp!, { r0 - r3, r14 }
+  stmdb sp!, { r0 - r3, EXTRA_UNSAVED_REGS r14 }
   bl update_texture_8bpp_cache
-  ldmia sp!, { r0 - r3, pc }
+  ldmia sp!, { r0 - r3, EXTRA_UNSAVED_REGS pc }
 
 
 #define setup_sprite_tiled_initialize_4bpp()                                   \
@@ -4735,7 +4774,7 @@ setup_sprite_update_texture_8bpp_cache:
   mov fb_ptr_advance_column, #32;                                              \
   vdup.u8 draw_mask_fb_ptr_left, block_masks[0];                               \
                                                                                \
-  sub fb_ptr_advance_column, height, lsl #11;                                  \
+  sub fb_ptr_advance_column, fb_ptr_advance_column, height, lsl #11;           \
   vdup.u8 draw_mask_fb_ptr_right, block_masks[1]                               \
 
 #define setup_sprite_setup_right_draw_mask_fb_ptr()                            \
@@ -5079,7 +5118,7 @@ setup_sprite_update_texture_8bpp_cache:
   mov fb_ptr_advance_column, #32 * 2;                                          \
   vdup.u8 draw_mask_fb_ptr_left_a, block_masks[0];                             \
   vdup.u8 draw_mask_fb_ptr_left_b, block_masks[1];                             \
-  sub fb_ptr_advance_column, height, lsl #11 + 1;                              \
+  sub fb_ptr_advance_column, fb_ptr_advance_column, height, lsl #11 + 1;       \
   vdup.u8 draw_mask_fb_ptr_right_a, block_masks[2];                            \
   vdup.u8 draw_mask_fb_ptr_right_b, block_masks[3]                             \
 
@@ -5210,24 +5249,26 @@ function(setup_sprite_##texture_mode##x4mode)                                  \
   add block, block, num_blocks, lsl #6;                                        \
                                                                                \
   orreq control_mask, control_mask, #0x2;                                      \
-  ldr pc, [ pc, control_mask, lsl #2 ];                                        \
+  JT_OP_REL(9f, control_mask, temp);                                           \
+  JT_OP(ldr pc, [ pc, control_mask, lsl #2 ]);                                 \
   nop;                                                                         \
                                                                                \
- .word setup_sprite_##texture_mode##_multi_multi_full_full##x4mode;            \
- .word setup_sprite_##texture_mode##_single_multi_full_none##x4mode;           \
- .word setup_sprite_##texture_mode##_multi_single_full_full##x4mode;           \
- .word setup_sprite_##texture_mode##_single_single_full_none##x4mode;          \
- .word setup_sprite_##texture_mode##_multi_multi_half_full##x4mode;            \
- .word setup_sprite_##texture_mode##_single_multi_half_right##x4mode;          \
- .word setup_sprite_##texture_mode##_multi_single_half_full##x4mode;           \
- .word setup_sprite_##texture_mode##_single_single_half_right##x4mode;         \
- .word setup_sprite_##texture_mode##_multi_multi_full_half##x4mode;            \
- .word setup_sprite_##texture_mode##_single_multi_half_left##x4mode;           \
- .word setup_sprite_##texture_mode##_multi_single_full_half##x4mode;           \
- .word setup_sprite_##texture_mode##_single_single_half_left##x4mode;          \
- .word setup_sprite_##texture_mode##_multi_multi_half_half##x4mode;            \
+ 9:                                                                            \
+ .word JTE(9b, setup_sprite_##texture_mode##_multi_multi_full_full##x4mode);   \
+ .word JTE(9b, setup_sprite_##texture_mode##_single_multi_full_none##x4mode);  \
+ .word JTE(9b, setup_sprite_##texture_mode##_multi_single_full_full##x4mode);  \
+ .word JTE(9b, setup_sprite_##texture_mode##_single_single_full_none##x4mode); \
+ .word JTE(9b, setup_sprite_##texture_mode##_multi_multi_half_full##x4mode);   \
+ .word JTE(9b, setup_sprite_##texture_mode##_single_multi_half_right##x4mode); \
+ .word JTE(9b, setup_sprite_##texture_mode##_multi_single_half_full##x4mode);  \
+ .word JTE(9b, setup_sprite_##texture_mode##_single_single_half_right##x4mode);\
+ .word JTE(9b, setup_sprite_##texture_mode##_multi_multi_full_half##x4mode);   \
+ .word JTE(9b, setup_sprite_##texture_mode##_single_multi_half_left##x4mode);  \
+ .word JTE(9b, setup_sprite_##texture_mode##_multi_single_full_half##x4mode);  \
+ .word JTE(9b, setup_sprite_##texture_mode##_single_single_half_left##x4mode); \
+ .word JTE(9b, setup_sprite_##texture_mode##_multi_multi_half_half##x4mode);   \
  .word 0x00000000;                                                             \
- .word setup_sprite_##texture_mode##_multi_single_half_half##x4mode;           \
+ .word JTE(9b, setup_sprite_##texture_mode##_multi_single_half_half##x4mode);  \
 
 
 setup_sprite_tiled_builder(4bpp,);
@@ -5332,6 +5373,7 @@ function(texture_sprite_blocks_8bpp)
 #undef texels_wide_high
 #undef texels_wide
 #undef fb_ptr2
+#undef temp
 
 #define psx_gpu                                           r0
 #define x                                                 r1
@@ -5385,9 +5427,9 @@ function(texture_sprite_blocks_8bpp)
 setup_sprites_16bpp_flush:
   vpush { d0 - d3 }
 
-  stmdb sp!, { r0 - r3, r12, r14 }
+  stmdb sp!, { r0 - r3, EXTRA_UNSAVED_REGS r12, r14 }
   bl flush_render_block_buffer
-  ldmia sp!, { r0 - r3, r12, r14 }
+  ldmia sp!, { r0 - r3, EXTRA_UNSAVED_REGS r12, r14 }
 
   vpop { d0 - d3 }
 
@@ -5412,7 +5454,7 @@ function(setup_sprite_16bpp)
   add texture_offset_base, u, u
   add width_rounded, width, #7
 
-  add texture_offset_base, v, lsl #11
+  add texture_offset_base, texture_offset_base, v, lsl #11
   mov left_mask_bits, #0xFF
   
   ldrb texture_mask_width, [ psx_gpu, #psx_gpu_texture_mask_width_offset ]
@@ -5427,7 +5469,7 @@ function(setup_sprite_16bpp)
   and right_width, width_rounded, #0x7
   mvn left_mask_bits, left_mask_bits, lsl left_offset
 
-  add texture_mask, texture_mask_height, lsl #11
+  add texture_mask, texture_mask, texture_mask_height, lsl #11
   mov block_width, width_rounded, lsr #3
 
   mov right_mask_bits, right_mask_bits, lsl right_width
@@ -5574,7 +5616,7 @@ function(setup_sprite_16bpp_4x)
   add texture_offset_base, u, u
   add width_rounded, width, #7
 
-  add texture_offset_base, v, lsl #11
+  add texture_offset_base, texture_offset_base, v, lsl #11
   movw left_mask_bits, #0xFFFF
   
   ldrb texture_mask_width, [ psx_gpu, #psx_gpu_texture_mask_width_offset ]
@@ -5593,7 +5635,7 @@ function(setup_sprite_16bpp_4x)
 
   lsl right_width, #1
 
-  add texture_mask, texture_mask_height, lsl #11
+  add texture_mask, texture_mask, texture_mask_height, lsl #11
   mov block_width, width_rounded, lsr #3
 
   mov right_mask_bits, right_mask_bits, lsl right_width
@@ -5744,6 +5786,8 @@ function(setup_sprite_untextured)
   ldrh r12, [ psx_gpu, #psx_gpu_render_state_offset ]
   tst r12, #(RENDER_STATE_MASK_EVALUATE | RENDER_FLAGS_MODULATE_TEXELS         \
     | RENDER_FLAGS_BLEND)
+  ldrbeq r12, [ psx_gpu, #psx_gpu_render_mode_offset ]
+  tsteq r12, #RENDER_INTERLACE_ENABLED
   beq setup_sprite_untextured_simple
 
   stmdb sp!, { r4 - r11, r14 }
@@ -6063,7 +6107,7 @@ function(scale2x_tiles8)
   mov r14, r2
   add r0, #1024*2*2
   add r4, #1024*2
-  sub r0, r2, lsl #4+1
+  sub r0, r0, r2, lsl #4+1
   mov r1, r4
   add r12, r0, #1024*2
   bgt 0b