@ ----------------------------------------------------------------
@ flushcache(beginning_addr, end_addr, flags)
@ NOTE(review): only the .global declaration is visible in this
@ excerpt; the body (original lines 5-10) is elided. Presumably a
@ cache-flush syscall/CP15 sequence — confirm against the full file.
4 .global flushcache @ beginning_addr, end_addr, flags
@ ----------------------------------------------------------------
@ block_or(void *src, size_t n, int pat)
@ In: r0 = src, r1 = n (bytes), r2 = pat (low byte used)
@ ORs a replicated byte pattern over a memory block, 16 bytes per
@ store. NOTE(review): the loads and OR ops producing r3-r5,r12 and
@ the loop control are elided in this excerpt — verify in full file.
11 .global block_or @ void *src, size_t n, int pat
15 orr r2, r2, r2, lsl #8 @ replicate pat into bytes 0-1 of r2
16 orr r2, r2, r2, lsl #16 @ r2 = pat replicated into all 4 bytes
25 stmia r0!, {r3-r5,r12} @ store 4 words, post-increment r0
@ ----------------------------------------------------------------
@ block_and(void *src, size_t n, int andpat)
@ In: r0 = src, r1 = n (bytes), r2 = andpat (low byte used)
@ ANDs a replicated byte pattern over a memory block, 16 bytes per
@ store. NOTE(review): loads/AND ops filling r3-r5,r12 and the loop
@ branch are elided in this excerpt.
31 .global block_and @ void *src, size_t n, int andpat
35 orr r2, r2, r2, lsl #8 @ replicate andpat into bytes 0-1 of r2
36 orr r2, r2, r2, lsl #16 @ r2 = andpat replicated into all 4 bytes
45 stmia r0!, {r3-r5,r12} @ store 4 words, post-increment r0
@ ----------------------------------------------------------------
@ block_andor(void *src, size_t n, int andpat, int orpat)
@ In: r0 = src, r1 = n (bytes), r2 = andpat, r3 = orpat
@ Applies (word AND andpat) OR orpat across a block, 16 bytes per
@ store. NOTE(review): the load/AND/OR body filling r4-r6,r12 and
@ the loop control are elided in this excerpt.
51 .global block_andor @ void *src, size_t n, int andpat, int orpat
55 orr r2, r2, r2, lsl #8 @ replicate andpat into bytes 0-1 of r2
56 orr r2, r2, r2, lsl #16 @ r2 = andpat in all 4 bytes
57 orr r3, r3, r3, lsl #8 @ replicate orpat into bytes 0-1 of r3
58 orr r3, r3, r3, lsl #16 @ r3 = orpat in all 4 bytes
71 stmia r0!, {r4-r6,r12} @ store 4 processed words, post-increment r0
@ ----------------------------------------------------------------
@ spend_cycles(int c)
@ Busy-wait delay: burns roughly c CPU cycles.
@ Converts the cycle count into loop iterations, compensating for
@ fixed call overhead. NOTE(review): the delay loop itself and the
@ return are elided in this excerpt.
77 .global spend_cycles @ c
80 mov r0, r0, lsr #2 @ 4 cycles/iteration
81 sub r0, r0, #2 @ entry/exit/init
@ ----------------------------------------------------------------
@ memset32(int *dest, int c, int count)
@ In: r0 = dest, r1 = c, r2 = count (in 32-bit words)
@ Word-wise memset, unrolled to 16 bytes per store.
@ NOTE(review): setup copying c into r3, r12 and lr, the tail
@ handling and the loop branch are elided in this excerpt.
89 .global memset32 @ int *dest, int c, int count
103 stmia r0!, {r1,r3,r12,lr} @ store 4 copies of c, post-increment r0
@ ----------------------------------------------------------------
@ soft_scale(void *dst, unsigned short *pal, int line_offs, int lines)
@ Software horizontal scaler: expands groups of 4 source pixels into
@ 5 destination pixels using 3/4 + 1/4 weighted blends of adjacent
@ pixels (see the pix_d comments), writing the destination backwards
@ with stmdb so src and dst may overlap.
@ NOTE(review): this excerpt elides several instructions — the loads
@ that fill r12/r7 with packed source pixels, the setup of lr (used
@ below as a blend mask) and r12 (used as a length before the loop),
@ and the flag-setting instruction that drives the bne. Confirm
@ register roles against the full file before modifying.
118 .global soft_scale @ void *dst, unsigned short *pal, int line_offs, int lines
121 stmfd sp!,{r4-r11,lr} @ save callee-saved regs + lr
124 mov r9, #0x3900 @ f800 07e0 001f | e000 0780 001c | 3800 01e0 0007
127 mov r11,r3 @ r11= line counter
128 mov r3, r1 @ r3 = pal base
132 add r4, r0, r2, lsl #1 @ r4 = dst_start
133 add r5, r0, r2 @ r5 = src_start
135 add r0, r4, r12,lsl #1 @ r0 = dst_end
136 add r1, r5, r12 @ r1 = src_end
139 sub r1, r1, #64 @ skip borders
142 soft_scale_loop_line:
@ Extract pixel fields from the packed source words (r12, later r7)
@ using lr as mask; r9<<2 masks the per-channel blend bits.
146 and r4, lr, r12,lsl #1
148 and r5, lr, r12,lsr #7
150 and r4, r4, r9, lsl #2
151 orr r4, r4, r4, lsl #14 @ r4[31:16] = 1/4 pix_s 0
152 and r5, r5, r9, lsl #2
153 sub r6, r5, r5, lsr #2 @ r6 = 3/4 pix_s 1
154 add r4, r4, r6, lsl #16 @ pix_d 0, 1
155 and r6, lr, r12,lsr #15
157 and r12,lr, r12,lsr #23
159 and r6, r6, r9, lsl #2
162 sub r6, r6, r6, lsr #2 @ r6 = 3/4 pix_s 2
163 orr r5, r5, r6, lsl #16
165 and r6, lr, r7, lsl #1 @ NOTE(review): r7 load not visible here
167 and r12,r12,r9, lsl #2
168 add r5, r5, r12,lsl #14 @ pix_d 2, 3
169 and r6, r6, r9, lsl #2
170 orr r6, r12,r6, lsl #16 @ pix_d 4, 5
172 and r12,lr, r7, lsr #7
174 and r10,lr, r7, lsr #15
176 and r12,r12,r9, lsl #2
177 sub r8, r12,r12,lsr #2 @ r8 = 3/4 pix_s 1
178 add r8, r8, r6, lsr #18
179 and r7, lr, r7, lsr #23
181 and r10,r10,r9, lsl #2
182 orr r8, r8, r10,lsl #15
183 add r8, r8, r12,lsl #15 @ pix_d 6, 7
184 sub r10,r10,r10,lsr #2 @ r10= 3/4 pix_s 2
185 and r7, r7, r9, lsl #2
186 add r10,r10,r7, lsr #2 @ += 1/4 pix_s 3
187 orr r10,r10,r7, lsl #16 @ pix_d 8, 9
191 stmdb r0!, {r4,r5,r6,r8,r10} @ store pix_d 0-9 (5 words), pre-decrement
192 bne soft_scale_loop_line @ flags set by an elided instruction
197 ldmfd sp!,{r4-r11,lr} @ restore; return insn not visible in excerpt
201 /* buggy and slow, probably because function call overhead
202 @ renderer helper, based on bitbank's method
203 .global draw8pix @ uint8 *P, uint8 *C, uint8 *PALRAM @ dest, src, pal
208 ldrb r3, [r1] @ get bit 0 pixels
210 orr r12,r12,r12,lsl #8
211 orr r12,r12,r12,lsl #16
212 ldrb r1, [r1, #8] @ get bit 1 pixels
213 orr r3, r3, r3, lsl #9 @ shift them over 1 byte + 1 bit
214 orr r3, r3, r3, lsl #18 @ now 4 pixels take up 4 bytes
215 and r4, r12,r3, lsr #7 @ mask off the upper nibble pixels we want
216 and r5, r12,r3, lsr #3 @ mask off the lower nibble pixels we want
219 orr r1, r1, r1, lsl #9 @ process the bit 1 pixels
220 orr r1, r1, r1, lsl #18
221 and r3, r12,r1, lsr #7 @ mask off the upper nibble pixels we want
222 and r1, r12,r1, lsr #3 @ mask off the lower nibble
223 orr r4, r4, r3, lsl #1
224 orr r5, r5, r1, lsl #5
226 @ can this be avoided?
227 mov r4, r4, lsl #3 @ *8
240 mov r5, r5, lsl #3 @ *8