1 @ assembly "optimized" blitter and copy functions
\r
2 @ all pointers must be word-aligned
\r
4 @ (c) Copyright 2006, notaz
\r
5 @ All Rights Reserved
\r
8 @ Convert 0000bbb0 ggg0rrr0
\r
9 @ to 0000rrr0 ggg0bbb0
\r
11 @ r2,r3 - scratch, lr = 0x000F000F
\r
@ NOTE(review): listing fragment — the closing .endm for this macro is not
@ visible in this chunk (embedded line numbering jumps from 17 to 20).
@ Swaps the red and blue nibbles of the two RGB444 pixels packed in \reg
@ (one pixel per halfword); green is left in place. r2/r3 are scratch and
@ lr must hold the mask 0x000F000F, per the comment above.
12 .macro convRGB444 reg
\r
13 and r2, \reg, lr @ r2=red
\r
14 and r3, \reg, lr, lsl #8 @ r3=blue
\r
15 and \reg, \reg, lr, lsl #4 @ green stays in place
\r
16 orr \reg, \reg, r2, lsl #8 @ add red back
\r
17 orr \reg, \reg, r3, lsr #8 @ add blue back
\r
@ NOTE(review): incomplete fragment — the vidConvCpyRGB444: label and the
@ conversion loop body (original listing lines ~21-56) are not visible in
@ this chunk; only the prologue, counter/mask setup and epilogue remain.
20 .global vidConvCpyRGB444 @ void *to, void *from, int pixels
\r
23 stmfd sp!, {r4-r11,lr}
\r
@ r2 = pixel count; pixels >> 4 suggests 16 pixels per loop iteration —
@ TODO confirm against the missing loop body.
25 mov r12, r2, lsr #4 @ repeats
\r
@ the instruction that seeds the upper halfword of lr (0x000F0000) is in
@ the missing line(s) just above this orr.
27 orr lr, lr, #0xF @ lr == pattern 0x000F000F
\r
33 @ I first thought storing multiple registers would be faster,
\r
34 @ but this doesn't seem to be the case, probably because of
\r
35 @ slow video memory we are dealing with
\r
53 ldmfd sp!, {r4-r11,lr}
\r
61 @ Convert 0000bbb0 ggg0rrr0
\r
62 @ to rrr00ggg 000bbb00
\r
64 @ r2,r3 - scratch, lr = 0x07800780
\r
@ NOTE(review): the closing .endm is not visible in this chunk.
@ Converts the two packed RGB444 pixels in \reg to the RGB565-style
@ layout shown above (3 significant bits per component); r2/r3 are
@ scratch, lr must hold 0x07800780.
65 .macro convRGB565 reg
\r
66 and r2, \reg, lr, lsr #7 @ r2=red
\r
67 and r3, \reg, lr, lsl #1 @ r3=blue
\r
68 and \reg, lr, \reg,lsl #3 @ green stays, but needs shifting
\r
69 orr \reg, \reg, r2, lsl #12 @ add red back
\r
70 orr \reg, \reg, r3, lsr #7 @ add blue back
\r
@ NOTE(review): incomplete fragment — the vidConvCpyRGB565: label and the
@ conversion loop body (original listing lines ~74-104) are not visible
@ here; only prologue, counter/mask setup and epilogue remain.
73 .global vidConvCpyRGB565 @ void *to, void *from, int pixels
\r
76 stmfd sp!, {r4-r11,lr}
\r
@ pixels >> 4 suggests 16 pixels per iteration — TODO confirm against the
@ missing loop body.
78 mov r12, r2, lsr #4 @ repeats
\r
@ the instruction seeding the upper halfword of lr (0x07800000) is in the
@ missing line(s) just above this orr.
80 orr lr, lr, #0x780 @ lr == pattern 0x07800780
\r
105 ldmfd sp!, {r4-r11,lr}
\r
109 @ Convert 0000bbb0 ggg0rrr0 0000bbb0 ggg0rrr0
\r
110 @ to 00000000 rrr00000 ggg00000 bbb00000 ...
\r
112 @ r2,r3 - scratch, lr = 0x0000F000
\r
113 @ rin - src reg, rout - dest reg (can be same for both; rout can be r3)
\r
@ NOTE(review): the closing .endm (original line ~120) is not visible in
@ this chunk. Expands the LOW 16-bit RGB444 pixel of \rin into the 32-bit
@ layout shown above, leaving the result in \rout.
114 .macro convRGB32_l rout rin
\r
115 and r2, \rin, lr, lsr #12 @ r2=red
\r
116 and r3, \rin, lr, lsr #4 @ r3=blue
\r
117 orr r2, r3, r2, lsl #24 @ r2 = blue | red<<24
\r
118 and \rout, lr, \rin, lsl #8 @ green stays, but needs shifting
\r
119 orr \rout, \rout, r2, lsr #4 @ add red+blue back
\r
122 @ r2,r3 - scratch, lr = 0x0000F000
\r
123 @ rin - src reg, rout - dest reg (can be same for both; rout can be r3)
\r
@ NOTE(review): fragment — original lines 127 and 130+ (including the orr
@ that should merge blue from r3 into \rout, and the .endm) are missing
@ from this chunk — confirm against the full source.
@ Expands the HIGH 16-bit RGB444 pixel of \rin towards \rout.
124 .macro convRGB32_h rout rin
\r
125 and r2, \rin, lr, lsl #4 @ r2=red
\r
126 mov r3, \rin, lsr #24 @ r3=blue
\r
128 and \rout, lr, \rin, lsr #8 @ green
\r
129 orr \rout, \rout, r2, lsl #4 @ merge red; blue merge not visible here
\r
132 @ slightly faster conversion, saves 1 opcode, writes output
\r
133 @ lr = 0x00F000F0, out: r3=lower_pix, r2=higher_pix; trashes rin
\r
@ NOTE(review): fragment — original lines 136, 138, 143-145, 147, 149 and
@ the .endm are missing from this chunk. r3 is read by the first orr
@ below, so a missing line must load the green component into r3; the
@ \rethigh parameter suggests the missing lines hold conditional assembly
@ (.if \rethigh ... .endif) — TODO confirm against the full source.
@ Converts the two packed RGB444 pixels in \rin to two RGB32 words.
134 .macro convRGB32_2 rin rethigh=0
\r
135 and r2, lr, \rin, lsr #4 @ blue
\r
137 orr r2, r2, r3, lsl #8 @ g0b0g0b0
\r
139 mov r3, r2, lsl #16 @ g0b00000
\r
140 and \rin,lr, \rin, ror #12 @ 00r000r0 (reversed)
\r
141 orr r3, r3, \rin, lsr #16 @ g0b000r0
\r
142 mov r3, r3, ror #16 @ r3=low
\r
146 mov r2, r2, lsr #16
\r
148 orr \rin,r2, \rin, lsl #16
\r
150 orr r2, r2, \rin, lsl #16
\r
@ NOTE(review): incomplete fragment — the vidConvCpyRGB32: label and the
@ conversion loop body (original listing lines ~164-175) are not visible
@ in this chunk; only prologue, counter/mask setup and epilogue remain.
156 .global vidConvCpyRGB32 @ void *to, void *from, int pixels
\r
159 stmfd sp!, {r4-r7,lr}
\r
@ pixels >> 3: 8 pixels per iteration, consistent with convRGB32_2
@ handling 2 pixels per source word — TODO confirm in the missing body.
161 mov r12, r2, lsr #3 @ repeats
\r
162 mov lr, #0x00F00000
\r
163 orr lr, lr, #0x00F0 @ lr = 0x00F000F0, the mask convRGB32_2 expects
\r
176 ldmfd sp!, {r4-r7,lr}
\r
180 @ -------- M2 stuff ---------
\r
@ Word holding the address of tmpstore1d (defined outside this chunk);
@ presumably loaded by the M2 rotation routines below so they can stash
@ the real sp (see `str sp, [r4]` there) — confirm in the full source.
186 tmpstore1: .long tmpstore1d
\r
189 @ r3 - scratch, ru - reg with 2 pixels from upper col, rl - ... lower col
\r
@ NOTE(review): the closing .endm (original line ~199) is not visible in
@ this chunk. Stores two rotated output words: each combines one 16bpp
@ pixel from \rl with one from \ru, post-incrementing r0 by one
@ 208-pixel (16bpp) destination line per store.
190 .macro rot_str16_90 ru rl
\r
191 mov r3, \rl,lsl #16
\r
192 mov r3, r3, lsr #16 @ r3 = low halfword of \rl
\r
193 orr r3, r3, \ru, lsl #16
\r
194 str r3, [r0], #208*2
\r
195 mov r3, \ru,lsr #16
\r
196 mov r3, r3, lsl #16 @ r3 = high halfword of \ru, kept in place
\r
197 orr r3, r3, \rl, lsr #16
\r
198 str r3, [r0], #208*2
\r
@ NOTE(review): fragment — the loop label, the load that fills r4-r7, the
@ rot_str16_90 invocations for r4/r8 and r5/r9, the branch back, the
@ .loopM2_16_90_end label and the sp-restore sequence (original listing
@ lines 206-207, 220-228, 231-233, 245-252) are not visible in this chunk.
@ Rotating 16bpp copy into a 208-pixel-wide destination; sp is repurposed
@ as the dst base register after being saved to memory.
202 .global vidConvCpyM2_16_90 @ void *to, void *from, int width
\r
204 vidConvCpyM2_16_90:
\r
205 stmfd sp!, {r4-r11,lr}
\r
@ r4 must have been loaded (presumably with the tmpstore1 address) in the
@ missing lines above — TODO confirm.
208 str sp, [r4] @ save sp, we will need sp reg..
\r
209 mov sp, r0 @ .. to store our dst
\r
211 @ crashing beyond this point will be fatal (phone reboots), as Symbian OS expects sp to always point to stack
\r
214 mov r12, #0x00670000
\r
215 orr r12, r12, r2, lsl #24
\r
216 orr r12, r12, r2 @ r12 == ((208-2)/2 << 16) | ((width-1)<<24) | (width-1)
\r
219 add r1, r1, #8*2 @ skip left border
\r
223 subs r12, r12, #1<<24 @ decrement width counter held in the top byte
\r
@ lr serves as the source pointer inside the loop (set up in missing lines).
226 ldmia lr!, {r8-r11}
\r
229 rot_str16_90 r6 r10
\r
230 rot_str16_90 r7 r11
\r
234 add r12, r12, #1<<24
\r
235 subs r12, r12, #0x00010000
\r
236 bmi .loopM2_16_90_end
\r
238 add r0, sp, r12, lsr #14 @ calculate new dst pointer
\r
239 orr r12, r12, r12, lsl #24 @ restore the width counter
\r
241 @ skip remaining pixels on these 2 lines
\r
242 mov r4, #328/8-1 @ width of mode2 in line_pixels/8
\r
243 sub r4, r4, r12, lsr #24
\r
244 add r1, lr, r4, lsl #4 @ skip src pixels
\r
253 ldmfd sp!, {r4-r11,lr}
\r
258 @ r3 - scratch, ru - reg with 2 pixels from upper col, rl - ... lower col (for right-to-left copies)
\r
@ NOTE(review): the closing .endm is not visible in this chunk.
@ Mirror of rot_str16_90: combines the halfwords in the opposite order,
@ still post-incrementing r0 by one 208-pixel (16bpp) line per store.
259 .macro rot_str16_270 ru rl
\r
260 mov r3, \rl,lsr #16
\r
261 mov r3, r3, lsl #16 @ r3 = high halfword of \rl, kept in place
\r
262 orr r3, r3, \ru, lsr #16
\r
263 str r3, [r0], #208*2
\r
264 mov r3, \ru,lsl #16
\r
265 mov r3, r3, lsr #16 @ r3 = low halfword of \ru
\r
266 orr r3, r3, \rl, lsl #16
\r
267 str r3, [r0], #208*2
\r
@ NOTE(review): fragment — the r4 setup, the loop label, the load that
@ fills r4-r7, the branch back and the trailing lines (original listing
@ lines 275-276, 278-279, 285, 287-291, 297-299, 303, 306, 312+) are not
@ visible in this chunk.
@ 270-degree variant of vidConvCpyM2_16_90: walks the source backwards
@ (ldmdb) and reuses the 90-degree routine's .loopM2_16_90_end epilogue.
271 .global vidConvCpyM2_16_270 @ void *to, void *from, int width
\r
273 vidConvCpyM2_16_270:
\r
274 stmfd sp!, {r4-r11,lr}
\r
@ r4 must have been loaded (presumably with the tmpstore1 address) in the
@ missing lines above — TODO confirm.
277 str sp, [r4] @ save sp, we will need sp reg to store our dst
\r
280 mov r12, #0x00670000
\r
281 orr r12, r12, r2, lsl #24
\r
282 orr r12, r12, r2 @ r12 == ((208-2)/2 << 16) | ((width-1)<<24) | (width-1)
\r
284 add r1, r1, #328*2 @ skip left border+1line
\r
286 add sp, r0, #206*2 @ adjust for algo
\r
289 subs r12, r12, #1<<24
\r
@ lr is the source pointer, walked downwards; the companion load filling
@ r4-r7 is in the missing lines — TODO confirm.
292 ldmdb lr!, {r8-r11}
\r
293 rot_str16_270 r7 r11 @ update the screen in incrementing direction, reduces tearing slightly
\r
294 rot_str16_270 r6 r10
\r
295 rot_str16_270 r5 r9
\r
296 rot_str16_270 r4 r8
\r
300 add r12, r12, #1<<24
\r
301 subs r12, r12, #0x00010000
\r
302 bmi .loopM2_16_90_end @ same end as in 90
\r
304 sub r0, sp, r12, lsr #14 @ calculate new dst pointer
\r
305 orr r12, r12, r12, lsl #24 @ restore the width counter
\r
307 @ skip remaining pixels on these 2 lines
\r
308 mov r4, #328/8-1 @ width of mode2 in line_pixels/8
\r
309 sub r4, r4, r12, lsr #24
\r
310 sub r1, lr, r4, lsl #4 @ skip src pixels
\r
311 add r1, r1, #328*2*2
\r
@ NOTE(review): fragment — the conversion/store sequence, the loop labels,
@ the counter subs feeding the bne/ldmeqfd conditions, and r9's setup
@ (original listing lines ~321, 324, 327, 331-335, 344-354, 356-357, 362,
@ 364+) are not visible in this chunk.
@ 90-degree rotating copy with RGB444->RGB32 conversion; the source is
@ read bottom-up one column at a time (negative 328*2-byte line stride).
317 .global vidConvCpyM2_RGB32_90 @ void *to, void *from, int width
\r
319 vidConvCpyM2_RGB32_90:
\r
320 stmfd sp!, {r4-r10,lr}
\r
322 mov lr, #0x00F00000
\r
323 orr lr, lr, #0x00F0 @ lr = 0x00F000F0, the mask the RGB32 macros expect
\r
325 mov r12, #208/4 @ row counter
\r
326 mov r10, r2, lsl #2 @ we do 2 pixel wide copies
\r
328 add r8, r0, #208*4 @ parallel line
\r
329 add r1, r1, #0x21000
\r
330 add r1, r1, #0x00280 @ r1+=328*207*2+8*2
\r
336 @ at first this loop was written differently: src pixels were fetched with ldm's and
\r
337 @ dest was not sequential. It ran nearly 2 times slower. It seems it is very important
\r
338 @ to do sequential memory access on those items, which we have more (to offload addressing bus?).
\r
340 ldr r4, [r1], #-328*2 @ walk up the source column, one 328px line per load
\r
341 ldr r5, [r1], #-328*2
\r
342 ldr r6, [r1], #-328*2
\r
343 ldr r7, [r1], #-328*2
\r
355 bne .loopM2RGB32_90
\r
358 ldmeqfd sp!, {r4-r10,pc} @ return
\r
360 mov r12, #208/4 @ restore row counter
\r
361 mov r0, r8 @ set new dst pointer
\r
363 add r9, r9, #2*2 @ fix src pointer
\r
369 @ converter for vidConvCpyM2_RGB32_270
\r
370 @ lr = 0x00F000F0, out: r3=lower_pix, r2=higher_pix; trashes rin
\r
@ NOTE(review): fragment — original lines 373, 375, 379, 382-383 and the
@ .endm are missing from this chunk. r3 is read by the first orr below,
@ so a missing line must load the green component into r3 — TODO confirm.
@ Also note: the final visible mov writes \rin, so unlike convRGB32_2 the
@ low pixel appears to end up in \rin here; the inherited "r3=low" comment
@ and the header's "out: r3=lower_pix" look stale — verify in full source.
371 .macro convRGB32_3 rin
\r
372 and r2, lr, \rin, lsr #4 @ blue
\r
374 orr r2, r2, r3, lsl #8 @ g0b0g0b0
\r
376 mov r3, r2, lsl #16 @ g0b00000
\r
377 and \rin,lr, \rin, ror #12 @ 00r000r0 (reversed)
\r
378 orr r3, r3, \rin, lsr #16 @ g0b000r0
\r
380 mov r2, r2, lsr #16
\r
381 orr r2, r2, \rin, lsl #16
\r
384 mov \rin,r3, ror #16 @ r3=low
\r
@ NOTE(review): fragment — the conversion/store sequence, the loop labels,
@ the counter subs feeding the bne/ldmeqfd conditions, and r9's setup
@ (original listing lines ~392, 395, 398, 400-405, 410-420, 422-423, 428,
@ 430+) are not visible in this chunk.
@ 270-degree variant of vidConvCpyM2_RGB32_90: the source column is walked
@ top-down (positive 328*2-byte stride) and the src pointer is stepped
@ right-to-left between columns.
388 .global vidConvCpyM2_RGB32_270 @ void *to, void *from, int width
\r
390 vidConvCpyM2_RGB32_270:
\r
391 stmfd sp!, {r4-r10,lr}
\r
393 mov lr, #0x00F00000
\r
394 orr lr, lr, #0x00F0 @ lr = 0x00F000F0
\r
396 mov r12, #208/4 @ row counter
\r
397 mov r10, r2, lsl #2 @ we do 2 pixel wide copies (right to left)
\r
399 add r8, r0, #208*4 @ parallel line
\r
406 ldr r4, [r1], #328*2 @ walk down the source column, one 328px line per load
\r
407 ldr r5, [r1], #328*2
\r
408 ldr r6, [r1], #328*2
\r
409 ldr r7, [r1], #328*2
\r
421 bne .loopM2RGB32_270
\r
424 ldmeqfd sp!, {r4-r10,pc} @ return
\r
426 mov r12, #208/4 @ restore row counter
\r
427 mov r0, r8 @ set new dst pointer
\r
429 sub r9, r9, #2*2 @ fix src pointer
\r