fix unaligned read
[gpsp.git] / arm / video_blend.S
1 .align 2\r
2 \r
3 .global expand_blend\r
4 .global expand_normal\r
5 \r
@ expand_blend
@
@ Walks a run of 32-bit pixel pairs and writes one 16-bit color per pair.
@ Each pair carries two palette indices: the top one in bits 0-8 and the
@ bottom one in bits 16-24. When both combine flags (bits 9 and 26, i.e. the
@ mask 0x04000200) are set, the two palette colors are mixed as
@   (top * blend_a + bottom * blend_b) >> 4
@ otherwise the top color is copied through unchanged.
@
@ blend_a and blend_b are the low five bits of the low and high byte of the
@ halfword at io_registers + 0x52, each clamped to 16. When their sum can
@ exceed 16 the saturating loop is used so that an overflowing color field
@ is pinned to its maximum instead of spilling into its neighbour.
@
@ Input:
@   r0 = screen_src_ptr   (32-bit pixel pairs)
@   r1 = screen_dest_ptr  (16-bit colors)
@   r2 = start
@   r3 = end
@
@ Register roles inside the loops:
@   r2  = pixels remaining        r3  = blend_a       r4  = blend_b
@   r5, r6 = scratch              r9  = saturation mask (saturating loop)
@   r10 = palette index mask      r11 = clamp mask
@   r12 = combine test mask       r14 = palette_ram_converted
@
@ Clobbers: r0-r3, r12, flags. r4-r6 and r9-r11 are saved and restored.
@ A run with end <= start returns immediately without touching memory.

6:
  .word io_registers
  .word palette_ram_converted
  .word 0x04000200                @ combine test mask
  .word 0x07E0F81F                @ clamp mask
  .word 0x000003FE                @ palette index mask
  .word 0x08010020                @ saturation mask

expand_blend:
  stmdb sp!, { r4, r5, r6, r9, r10, r11, r14 }

  add r0, r0, r2, lsl #2          @ screen_src_ptr += start
  add r1, r1, r2, lsl #1          @ screen_dest_ptr += start
  subs r2, r3, r2                 @ r2 = end - start
  ble 7f                          @ empty or inverted run: nothing to draw

  ldr r3, 6b                      @ r3 = io_registers
  ldrh r3, [r3, #0x52]            @ r3 = bldalpha
  mov r4, r3, lsr #8              @ r4 = bldalpha >> 8
  and r3, r3, #0x1F               @ r3 = blend_a
  and r4, r4, #0x1F               @ r4 = blend_b
  cmp r3, #16                     @ if(blend_a > 16)
  movgt r3, #16                   @   blend_a = 16
  cmp r4, #16                     @ if(blend_b > 16)
  movgt r4, #16                   @   blend_b = 16

  ldr r14, 6b + 4                 @ r14 = palette_ram_converted
  ldr r12, 6b + 8                 @ r12 = 0x04000200
  ldr r11, 6b + 12                @ r11 = 0x07E0F81F
  ldr r10, 6b + 16                @ r10 = 0x000003FE

  add r5, r3, r4                  @ r5 = blend_a + blend_b
  cmp r5, #16                     @ if((blend_a + blend_b) > 16)
  bgt 3f                          @   goto loop w/saturation


  @ loop w/o saturation
1:
  ldr r5, [r0], #4                @ r5 = pixel_pair, screen_src_ptr++
  and r6, r5, r12                 @ r6 = r5 & 0x04000200
  cmp r6, r12                     @ if(r6 != 0x4000200)
  bne 2f                          @   goto no_blend

  and r6, r10, r5, lsl #1         @ r6 = (pixel_pair & 0x1FF) << 1
  ldrh r6, [r14, r6]              @ r6 = pixel_top
  orr r6, r6, r6, lsl #16         @ r6 = pixel_top | (pixel_top << 16)
  and r6, r6, r11                 @ r6 = pixel_top_dilated

  and r5, r10, r5, lsr #15        @ r5 = ((pixel_pair >> 16) & 0x1FF) << 1
  ldrh r5, [r14, r5]              @ r5 = pixel_bottom
  orr r5, r5, r5, lsl #16         @ r5 = pixel_bottom | (pixel_bottom << 16)
  and r5, r5, r11                 @ r5 = pixel_bottom_dilated

  mul r5, r4, r5                  @ r5 = pixel_bottom * blend_b = bottom_mul
  mla r5, r3, r6, r5              @ r5 = (pixel_top * blend_a) + bottom_mul

  and r5, r11, r5, lsr #4         @ r5 = (color_dilated >> 4) & 0x07E0F81F
  orr r5, r5, r5, lsr #16         @ r5 = color_dilated | (color_dilated >> 16)

  strh r5, [r1], #2               @ *screen_dest_ptr = r5, screen_dest_ptr++
  subs r2, r2, #1                 @ counter--
  bne 1b                          @ go again

7:                                @ shared exit, also used by the empty-run check
  ldmia sp!, { r4, r5, r6, r9, r10, r11, pc }

2:
  and r5, r10, r5, lsl #1         @ r5 = (pixel_pair & 0x1FF) << 1
  ldrh r5, [r14, r5]              @ r5 = pixel_top
  strh r5, [r1], #2               @ *screen_dest_ptr = r5, screen_dest_ptr++

  subs r2, r2, #1                 @ counter--
  bne 1b                          @ go again

  ldmia sp!, { r4, r5, r6, r9, r10, r11, pc }

@ loop w/saturation

3:
  ldr r9, 6b + 20                 @ r9 = 0x08010020

4:
  ldr r5, [r0], #4                @ r5 = pixel_pair, screen_src_ptr++
  and r6, r5, r12                 @ r6 = r5 & 0x04000200
  cmp r6, r12                     @ if(r6 != 0x4000200)
  bne 5f                          @   goto no_blend

  and r6, r10, r5, lsl #1         @ r6 = (pixel_pair & 0x1FF) << 1
  ldrh r6, [r14, r6]              @ r6 = pixel_top
  orr r6, r6, r6, lsl #16         @ r6 = pixel_top | (pixel_top << 16)
  and r6, r6, r11                 @ r6 = pixel_top_dilated

  and r5, r10, r5, lsr #15        @ r5 = ((pixel_pair >> 16) & 0x1FF) << 1
  ldrh r5, [r14, r5]              @ r5 = pixel_bottom
  orr r5, r5, r5, lsl #16         @ r5 = pixel_bottom | (pixel_bottom << 16)
  and r5, r5, r11                 @ r5 = pixel_bottom_dilated

  mul r5, r4, r5                  @ r5 = pixel_bottom * blend_b = bottom_mul
  mla r5, r3, r6, r5              @ r5 = (pixel_top * blend_a) + bottom_mul

  @ After the >> 4, bits 27, 16 and 5 sit directly above the three color
  @ fields kept by the clamp mask, so a set bit there means that field
  @ overflowed. Smearing each bit down six places fills the whole field.
  and r6, r9, r5, lsr #4          @ r6 = saturation bits
  orr r6, r6, r6, lsr #1          @ propagate saturation down msb
  orr r6, r6, r6, lsr #2          @ propagate down next two bits
  orr r6, r6, r6, lsr #3          @ propagate down next three bits
  orr r5, r6, r5, lsr #4          @ mask over result w/saturation

  and r5, r11, r5                 @ r5 = saturated color & 0x07E0F81F
  orr r5, r5, r5, lsr #16         @ r5 = color_dilated | (color_dilated >> 16)
  strh r5, [r1], #2               @ *screen_dest_ptr = r5, screen_dest_ptr++

  subs r2, r2, #1                 @ counter--
  bne 4b                          @ go again

  ldmia sp!, { r4, r5, r6, r9, r10, r11, pc }

5:
  and r5, r10, r5, lsl #1         @ r5 = (pixel_pair & 0x1FF) << 1
  ldrh r5, [r14, r5]              @ r5 = pixel_top
  strh r5, [r1], #2               @ *screen_dest_ptr = r5, screen_dest_ptr++

  subs r2, r2, #1                 @ counter--
  bne 4b                          @ go again

  ldmia sp!, { r4, r5, r6, r9, r10, r11, pc }
133 \r
134 \r
135 \r
@ expand_normal
@
@ Converts a run of 16-bit palette indices to 16-bit palette colors in place.
@ The bulk of the run is handled eight pixels (four words) at a time; a
@ leading pixel is converted on its own when the run does not start on a
@ word boundary, and any pixels left over after the last full group of eight
@ are converted one by one, so runs of any length are supported.

@ expand_pixel_pair(reg, temp)
@   reg holds two 16-bit palette indices; on exit it holds the two matching
@   palette colors in the same halves. temp is scratch. Expects
@   r2 = palette_ram_converted and r3 = 0x3FE.
@   The whole macro expands onto a single line with ';' between statements,
@   so no '@' comment may be placed inside it.
#define expand_pixel_pair(reg, temp)                                          \
  and temp, r3, reg, lsr #15                                                 ;\
  ldrh temp, [r2, temp]                                                      ;\
  and reg, r3, reg, lsl #1                                                   ;\
  ldrh reg, [r2, reg]                                                        ;\
  orr reg, reg, temp, lsl #16


@ Input:
@   r0 = screen_ptr   (16-bit entries, at least halfword aligned)
@   r1 = start
@   r2 = end
@
@ Clobbers: r0-r3, flags. r4-r7 are saved and restored.
@ A run with end <= start returns immediately without touching memory.

1:
  .word palette_ram_converted
  .word 0x3FE

expand_normal:
  stmdb sp!, { r4, r5, r6, r7, r14 }

  add r0, r0, r1, lsl #1          @ screen_ptr += start
  subs r1, r2, r1                 @ r1 = end - start
  ble 5f                          @ empty or inverted run: nothing to do
  ldr r2, 1b                      @ r2 = palette_ram_converted
  ldr r3, 1b + 4                  @ r3 = 0x3FE

  tst r0, #2                      @ ldmia/stmia below need a word boundary
  beq 2f
  ldrh r4, [r0]                   @ convert one leading pixel to get there
  and r4, r3, r4, lsl #1          @ r4 = (index & 0x1FF) << 1
  ldrh r4, [r2, r4]               @ r4 = palette color
  strh r4, [r0], #2               @ store in place, screen_ptr++
  subs r1, r1, #1                 @ counter--
  beq 5f

2:
  subs r1, r1, #8                 @ at least one full group of eight?
  blt 4f

3:
  ldmia r0, { r4, r5, r6, r7 }    @ fetch eight indices

  expand_pixel_pair(r4, r14)
  expand_pixel_pair(r5, r14)
  expand_pixel_pair(r6, r14)
  expand_pixel_pair(r7, r14)

  stmia r0!, { r4, r5, r6, r7 }   @ write eight colors, screen_ptr += 8

  subs r1, r1, #8
  bge 3b                          @ loop while another full group remains

4:
  adds r1, r1, #8                 @ r1 = leftover pixels (0..7)
  beq 5f

6:
  ldrh r4, [r0]                   @ r4 = palette index
  and r4, r3, r4, lsl #1          @ r4 = (index & 0x1FF) << 1
  ldrh r4, [r2, r4]               @ r4 = palette color
  strh r4, [r0], #2               @ store in place, screen_ptr++
  subs r1, r1, #1                 @ counter--
  bne 6b

5:
  ldmia sp!, { r4, r5, r6, r7, pc }
181 \r