@ vim:filetype=armasm

@ test
.global flushcache @ beginning_addr, end_addr, flags

flushcache:
    swi #0x9f0002
    mov pc, lr


.global block_or @ void *src, size_t n, int pat

block_or:
    stmfd   sp!, {r4-r5}
    orr     r2, r2, r2, lsl #8
    orr     r2, r2, r2, lsl #16
    mov     r1, r1, lsr #4
block_loop_or:
    ldmia   r0, {r3-r5,r12}
    subs    r1, r1, #1
    orr     r3, r3, r2
    orr     r4, r4, r2
    orr     r5, r5, r2
    orr     r12,r12,r2
    stmia   r0!, {r3-r5,r12}
    bne     block_loop_or
    ldmfd   sp!, {r4-r5}
    bx      lr


.global block_andor @ void *src, size_t n, int andpat, int orpat

block_andor:
    stmfd   sp!, {r4-r6}
    orr     r2, r2, r2, lsl #8
    orr     r2, r2, r2, lsl #16
    orr     r3, r3, r3, lsl #8
    orr     r3, r3, r3, lsl #16
    mov     r1, r1, lsr #4
block_loop_andor:
    ldmia   r0, {r4-r6,r12}
    subs    r1, r1, #1
    and     r4, r4, r2
    orr     r4, r4, r3
    and     r5, r5, r2
    orr     r5, r5, r3
    and     r6, r6, r2
    orr     r6, r6, r3
    and     r12,r12,r2
    orr     r12,r12,r3
    stmia   r0!, {r4-r6,r12}
    bne     block_loop_andor
    ldmfd   sp!, {r4-r6}
    bx      lr


.global spend_cycles @ c

spend_cycles:
    mov     r0, r0, lsr #2  @ 4 cycles/iteration
    sub     r0, r0, #2      @ entry/exit/init
.sc_loop:
    subs    r0, r0, #1
    bpl     .sc_loop

    bx      lr


.global soft_scale @ void *dst, unsigned short *pal, int offs, int lines

soft_scale:
    stmfd   sp!,{r4-r11,lr}
    mov     lr, #0xff
    mov     lr, lr, lsl #1
    mov     r9, #0x3900        @ f800 07e0 001f | e000 0780 001c | 3800 01e0 0007
    orr     r9, r9, #0x00e7

    mov     r11,r3             @ r11= line counter
    mov     r3, r1             @ r3 = pal base

    mov     r12,#320
    mul     r2, r12,r2
    add     r4, r0, r2, lsl #1 @ r4 = dst_start
    add     r5, r0, r2         @ r5 = src_start
    mul     r12,r11,r12
    add     r0, r4, r12,lsl #1 @ r0 = dst_end
    add     r1, r5, r12        @ r1 = src_end

soft_scale_loop:
    sub     r1, r1, #64        @ skip borders
    mov     r2, #256/8

soft_scale_loop_line:
    ldr     r12, [r1, #-8]!
    ldr     r7,  [r1, #4]

    and     r4, lr, r12,lsl #1
    ldrh    r4, [r3, r4]
    and     r5, lr, r12,lsr #7
    ldrh    r5, [r3, r5]
    and     r4, r4, r9, lsl #2
    orr     r4, r4, r4, lsl #14       @ r4[31:16] = 1/4 pix_s 0
    and     r5, r5, r9, lsl #2
    sub     r6, r5, r5, lsr #2        @ r6 = 3/4 pix_s 1
    add     r4, r4, r6, lsl #16       @ pix_d 0, 1
    and     r6, lr, r12,lsr #15
    ldrh    r6, [r3, r6]
    and     r12,lr, r12,lsr #23
    ldrh    r12,[r3, r12]
    and     r6, r6, r9, lsl #2
    add     r5, r5, r6
    mov     r5, r5, lsr #1
    sub     r6, r6, r6, lsr #2        @ r6 = 3/4 pix_s 2
    orr     r5, r5, r6, lsl #16

    and     r6, lr, r7, lsl #1
    ldrh    r6, [r3, r6]
    and     r12,r12,r9, lsl #2
    add     r5, r5, r12,lsl #14       @ pix_d 2, 3
    and     r6, r6, r9, lsl #2
    orr     r6, r12,r6, lsl #16       @ pix_d 4, 5

    and     r12,lr, r7, lsr #7
    ldrh    r12,[r3, r12]
    and     r10,lr, r7, lsr #15
    ldrh    r10,[r3, r10]
    and     r12,r12,r9, lsl #2
    sub     r8, r12,r12,lsr #2        @ r8 = 3/4 pix_s 1
    add     r8, r8, r6, lsr #18
    and     r7, lr, r7, lsr #23
    ldrh    r7, [r3, r7]
    and     r10,r10,r9, lsl #2
    orr     r8, r8, r10,lsl #15
    add     r8, r8, r12,lsl #15       @ pix_d 6, 7
    sub     r10,r10,r10,lsr #2        @ r10= 3/4 pix_s 2
    and     r7, r7, r9, lsl #2
    add     r10,r10,r7, lsr #2        @ += 1/4 pix_s 3
    orr     r10,r10,r7, lsl #16       @ pix_d 8, 9

    subs    r2, r2, #1

    stmdb   r0!, {r4,r5,r6,r8,r10}
    bne     soft_scale_loop_line

    subs    r11,r11,#1
    bne     soft_scale_loop

    ldmfd   sp!,{r4-r11,lr}
    bx      lr


/* buggy and slow, probably because function call overhead
@ renderer helper, based on bitbank's method
.global draw8pix @ uint8 *P, uint8 *C, uint8 *PALRAM @ dest, src, pal

draw8pix:
    stmfd sp!, {r4,r5}

    ldrb  r3, [r1]            @ get bit 0 pixels
    mov   r12,#1
    orr   r12,r12,r12,lsl #8
    orr   r12,r12,r12,lsl #16
    ldrb  r1, [r1, #8]        @ get bit 1 pixels
    orr   r3, r3, r3, lsl #9  @ shift them over 1 byte + 1 bit
    orr   r3, r3, r3, lsl #18 @ now 4 pixels take up 4 bytes
    and   r4, r12,r3, lsr #7  @ mask off the upper nibble pixels we want
    and   r5, r12,r3, lsr #3  @ mask off the lower nibble pixels we want
    ldr   r2, [r2]

    orr   r1, r1, r1, lsl #9  @ process the bit 1 pixels
    orr   r1, r1, r1, lsl #18
    and   r3, r12,r1, lsr #7  @ mask off the upper nibble pixels we want
    and   r1, r12,r1, lsr #3  @ mask off the lower nibble
    orr   r4, r4, r3, lsl #1
    orr   r5, r5, r1, lsl #5

    @ can this be avoided?
    mov   r4, r4, lsl #3      @ *8
    mov   r3, r2, ror r4
    strb  r3, [r0], #1
    mov   r4, r4, lsr #8
    mov   r3, r2, ror r4
    strb  r3, [r0], #1
    mov   r4, r4, lsr #8
    mov   r3, r2, ror r4
    strb  r3, [r0], #1
    mov   r4, r4, lsr #8
    mov   r3, r2, ror r4
    strb  r3, [r0], #1

    mov   r5, r5, lsl #3      @ *8
    mov   r3, r2, ror r5
    strb  r3, [r0], #1
    mov   r5, r5, lsr #8
    mov   r3, r2, ror r5
    strb  r3, [r0], #1
    mov   r5, r5, lsr #8
    mov   r3, r2, ror r5
    strb  r3, [r0], #1
    mov   r5, r5, lsr #8
    mov   r3, r2, ror r5
    strb  r3, [r0], #1

    ldmfd sp!, {r4,r5}
    bx    lr
*/