@ ----------------------------------------------------------------
@ flushcache(beginning_addr, end_addr, flags)
@ NOTE(review): only the .global declaration is visible in this
@ excerpt; the body (original lines 5-10) is elided. Presumably a
@ cache-flush syscall/CP15 sequence — confirm against the full file.
4 .global flushcache @ beginning_addr, end_addr, flags
@ ----------------------------------------------------------------
@ block_or(void *src, size_t n, int pat)
@ In: r0 = src, r1 = n (bytes), r2 = pat (low byte used)
@ ORs a replicated byte pattern over a memory block, 16 bytes per
@ store. NOTE(review): the loads and OR ops producing r3-r5,r12 and
@ the loop control are elided in this excerpt — verify in full file.
11 .global block_or @ void *src, size_t n, int pat
15 orr r2, r2, r2, lsl #8 @ replicate pat into bytes 0-1 of r2
16 orr r2, r2, r2, lsl #16 @ r2 = pat replicated into all 4 bytes
25 stmia r0!, {r3-r5,r12} @ store 4 words, post-increment r0
@ ----------------------------------------------------------------
@ block_and(void *src, size_t n, int andpat)
@ In: r0 = src, r1 = n (bytes), r2 = andpat (low byte used)
@ ANDs a replicated byte pattern over a memory block, 16 bytes per
@ store. NOTE(review): loads/AND ops filling r3-r5,r12 and the loop
@ branch are elided in this excerpt.
31 .global block_and @ void *src, size_t n, int andpat
35 orr r2, r2, r2, lsl #8 @ replicate andpat into bytes 0-1 of r2
36 orr r2, r2, r2, lsl #16 @ r2 = andpat replicated into all 4 bytes
45 stmia r0!, {r3-r5,r12} @ store 4 words, post-increment r0
@ ----------------------------------------------------------------
@ block_andor(void *src, size_t n, int andpat, int orpat)
@ In: r0 = src, r1 = n (bytes), r2 = andpat, r3 = orpat
@ Applies (word AND andpat) OR orpat across a block, 16 bytes per
@ store. NOTE(review): the load/AND/OR body filling r4-r6,r12 and
@ the loop control are elided in this excerpt.
51 .global block_andor @ void *src, size_t n, int andpat, int orpat
55 orr r2, r2, r2, lsl #8 @ replicate andpat into bytes 0-1 of r2
56 orr r2, r2, r2, lsl #16 @ r2 = andpat in all 4 bytes
57 orr r3, r3, r3, lsl #8 @ replicate orpat into bytes 0-1 of r3
58 orr r3, r3, r3, lsl #16 @ r3 = orpat in all 4 bytes
71 stmia r0!, {r4-r6,r12} @ store 4 processed words, post-increment r0
@ ----------------------------------------------------------------
@ spend_cycles(int c)
@ Busy-wait delay: burns roughly c CPU cycles.
@ Converts the cycle count into loop iterations, compensating for
@ fixed call overhead. NOTE(review): the delay loop itself and the
@ return are elided in this excerpt.
77 .global spend_cycles @ c
80 mov r0, r0, lsr #2 @ 4 cycles/iteration
81 sub r0, r0, #2 @ entry/exit/init
@ ----------------------------------------------------------------
@ memset32(int *dest, int c, int count)
@ In: r0 = dest, r1 = c, r2 = count (in 32-bit words)
@ Word-wise memset, unrolled to 16 bytes per store.
@ NOTE(review): setup copying c into r3, r12 and lr, the tail
@ handling and the loop branch are elided in this excerpt.
89 .global memset32 @ int *dest, int c, int count
103 stmia r0!, {r1,r3,r12,lr} @ store 4 copies of c, post-increment r0
@ ----------------------------------------------------------------
@ soft_scale(void *dst, unsigned short *pal, int line_offs, int lines)
@ Software horizontal scaler: expands groups of 4 source pixels into
@ 5 destination pixels using 3/4 + 1/4 weighted blends of adjacent
@ pixels (see the pix_d comments), writing the destination backwards
@ with stmdb so src and dst may overlap.
@ NOTE(review): this excerpt elides several instructions — the loads
@ that fill r12/r7 with packed source pixels, the setup of lr (used
@ below as a blend mask) and r12 (used as a length before the loop),
@ and the flag-setting instruction that drives the bne. Confirm
@ register roles against the full file before modifying.
118 .global soft_scale @ void *dst, unsigned short *pal, int line_offs, int lines
121 stmfd sp!,{r4-r11,lr} @ save callee-saved regs + lr
124 mov r9, #0x3900 @ f800 07e0 001f | e000 0780 001c | 3800 01e0 0007
127 mov r11,r3 @ r11= line counter
128 mov r3, r1 @ r3 = pal base
132 add r4, r0, r2, lsl #1 @ r4 = dst_start
133 add r5, r0, r2 @ r5 = src_start
135 add r0, r4, r12,lsl #1 @ r0 = dst_end
136 add r1, r5, r12 @ r1 = src_end
139 sub r1, r1, #64 @ skip borders
142 soft_scale_loop_line:
@ Extract pixel fields from the packed source words (r12, later r7)
@ using lr as mask; r9<<2 masks the per-channel blend bits.
146 and r4, lr, r12,lsl #1
148 and r5, lr, r12,lsr #7
150 and r4, r4, r9, lsl #2
151 orr r4, r4, r4, lsl #14 @ r4[31:16] = 1/4 pix_s 0
152 and r5, r5, r9, lsl #2
153 sub r6, r5, r5, lsr #2 @ r6 = 3/4 pix_s 1
154 add r4, r4, r6, lsl #16 @ pix_d 0, 1
155 and r6, lr, r12,lsr #15
157 and r12,lr, r12,lsr #23
159 and r6, r6, r9, lsl #2
162 sub r6, r6, r6, lsr #2 @ r6 = 3/4 pix_s 2
163 orr r5, r5, r6, lsl #16
165 and r6, lr, r7, lsl #1 @ NOTE(review): r7 load not visible here
167 and r12,r12,r9, lsl #2
168 add r5, r5, r12,lsl #14 @ pix_d 2, 3
169 and r6, r6, r9, lsl #2
170 orr r6, r12,r6, lsl #16 @ pix_d 4, 5
172 and r12,lr, r7, lsr #7
174 and r10,lr, r7, lsr #15
176 and r12,r12,r9, lsl #2
177 sub r8, r12,r12,lsr #2 @ r8 = 3/4 pix_s 1
178 add r8, r8, r6, lsr #18
179 and r7, lr, r7, lsr #23
181 and r10,r10,r9, lsl #2
182 orr r8, r8, r10,lsl #15
183 add r8, r8, r12,lsl #15 @ pix_d 6, 7
184 sub r10,r10,r10,lsr #2 @ r10= 3/4 pix_s 2
185 and r7, r7, r9, lsl #2
186 add r10,r10,r7, lsr #2 @ += 1/4 pix_s 3
187 orr r10,r10,r7, lsl #16 @ pix_d 8, 9
191 stmdb r0!, {r4,r5,r6,r8,r10} @ store pix_d 0-9 (5 words), pre-decrement
192 bne soft_scale_loop_line @ flags set by an elided instruction
197 ldmfd sp!,{r4-r11,lr} @ restore; return insn not visible in excerpt
201 /* buggy and slow, probably because function call overhead
202 @ renderer helper, based on bitbank's method
203 .global draw8pix @ uint8 *P, uint8 *C, uint8 *PALRAM @ dest, src, pal
208 ldrb r3, [r1] @ get bit 0 pixels
210 orr r12,r12,r12,lsl #8
211 orr r12,r12,r12,lsl #16
212 ldrb r1, [r1, #8] @ get bit 1 pixels
213 orr r3, r3, r3, lsl #9 @ shift them over 1 byte + 1 bit
214 orr r3, r3, r3, lsl #18 @ now 4 pixels take up 4 bytes
215 and r4, r12,r3, lsr #7 @ mask off the upper nibble pixels we want
216 and r5, r12,r3, lsr #3 @ mask off the lower nibble pixels we want
219 orr r1, r1, r1, lsl #9 @ process the bit 1 pixels
220 orr r1, r1, r1, lsl #18
221 and r3, r12,r1, lsr #7 @ mask off the upper nibble pixels we want
222 and r1, r12,r1, lsr #3 @ mask off the lower nibble
223 orr r4, r4, r3, lsl #1
224 orr r5, r5, r1, lsl #5
226 @ can this be avoided?
227 mov r4, r4, lsl #3 @ *8
240 mov r5, r5, lsl #3 @ *8