[fceu.git] / arm / asmutils.s

@ vim:filetype=armasm

@ Assembly optimized routines for gpfce - FCE Ultra port 
@ (c) Copyright 2007, Grazvydas "notaz" Ignotas

@ test
.global flushcache @ beginning_addr, end_addr, flags

@ flushcache(beginning_addr, end_addr, flags)
@ Raises software interrupt 0x9f0002 with the three arguments still in
@ r0-r2, then returns to the caller.
@ NOTE(review): 0x9f0002 looks like the ARM Linux OABI cacheflush call
@ (0x900000 syscall base + 0x0f0002) - confirm against the target kernel.
flushcache:
    swi #0x9f0002
    bx      lr              @ interworking-safe return, as used by every
                            @ other routine in this file


.global block_or @ void *src, size_t n, int pat

@ void block_or(void *src, size_t n, int pat)
@ ORs pat into every byte of the buffer at src, 16 bytes per iteration.
@ In:   r0 = src   read/written with ldmia/stmia, so word alignment is
@                  expected
@       r1 = n     byte count; only n / 16 whole chunks are processed, the
@                  trailing n % 16 bytes are left untouched
@       r2 = pat   replicated into all four byte lanes below, so it is
@                  expected to fit in 8 bits
@ Clobbers: r0-r3, r12, flags (r4-r5 are saved and restored)
block_or:
    movs    r1, r1, lsr #4          @ r1 = chunk count, Z set when n < 16
    bxeq    lr                      @ no whole chunk: return before the
                                    @ count-down loop can wrap below zero
    stmfd   sp!, {r4-r5}
    orr     r2, r2, r2, lsl #8
    orr     r2, r2, r2, lsl #16     @ r2 = pat in every byte lane
block_loop_or:
    ldmia   r0, {r3-r5,r12}         @ fetch one 16-byte chunk
    subs    r1, r1, #1              @ flags are consumed by the bne below
    orr     r3, r3, r2
    orr     r4, r4, r2
    orr     r5, r5, r2
    orr     r12,r12,r2
    stmia   r0!, {r3-r5,r12}        @ write it back and advance src
    bne     block_loop_or
    ldmfd   sp!, {r4-r5}
    bx      lr


.global block_and @ void *src, size_t n, int andpat

@ void block_and(void *src, size_t n, int andpat)
@ ANDs andpat into every byte of the buffer at src, 16 bytes per iteration.
@ In:   r0 = src     read/written with ldmia/stmia, so word alignment is
@                    expected
@       r1 = n       byte count; only n / 16 whole chunks are processed,
@                    the trailing n % 16 bytes are left untouched
@       r2 = andpat  replicated into all four byte lanes below, so it is
@                    expected to fit in 8 bits
@ Clobbers: r0-r3, r12, flags (r4-r5 are saved and restored)
block_and:
    movs    r1, r1, lsr #4          @ r1 = chunk count, Z set when n < 16
    bxeq    lr                      @ no whole chunk: return before the
                                    @ count-down loop can wrap below zero
    stmfd   sp!, {r4-r5}
    orr     r2, r2, r2, lsl #8
    orr     r2, r2, r2, lsl #16     @ r2 = andpat in every byte lane
block_loop_and:
    ldmia   r0, {r3-r5,r12}         @ fetch one 16-byte chunk
    subs    r1, r1, #1              @ flags are consumed by the bne below
    and     r3, r3, r2
    and     r4, r4, r2
    and     r5, r5, r2
    and     r12,r12,r2
    stmia   r0!, {r3-r5,r12}        @ write it back and advance src
    bne     block_loop_and
    ldmfd   sp!, {r4-r5}
    bx      lr


.global block_andor @ void *src, size_t n, int andpat, int orpat

@ void block_andor(void *src, size_t n, int andpat, int orpat)
@ For every byte b of the buffer at src: b = (b & andpat) | orpat,
@ 16 bytes per iteration.
@ In:   r0 = src     read/written with ldmia/stmia, so word alignment is
@                    expected
@       r1 = n       byte count; only n / 16 whole chunks are processed,
@                    the trailing n % 16 bytes are left untouched
@       r2 = andpat  } each replicated into all four byte lanes below, so
@       r3 = orpat   } both are expected to fit in 8 bits
@ Clobbers: r0-r3, r12, flags (r4-r6 are saved and restored)
block_andor:
    movs    r1, r1, lsr #4          @ r1 = chunk count, Z set when n < 16
    bxeq    lr                      @ no whole chunk: return before the
                                    @ count-down loop can wrap below zero
    stmfd   sp!, {r4-r6}
    orr     r2, r2, r2, lsl #8
    orr     r2, r2, r2, lsl #16     @ r2 = andpat in every byte lane
    orr     r3, r3, r3, lsl #8
    orr     r3, r3, r3, lsl #16     @ r3 = orpat in every byte lane
block_loop_andor:
    ldmia   r0, {r4-r6,r12}         @ fetch one 16-byte chunk
    subs    r1, r1, #1              @ flags are consumed by the bne below
    and     r4, r4, r2
    orr     r4, r4, r3
    and     r5, r5, r2
    orr     r5, r5, r3
    and     r6, r6, r2
    orr     r6, r6, r3
    and     r12,r12,r2
    orr     r12,r12,r3
    stmia   r0!, {r4-r6,r12}        @ write it back and advance src
    bne     block_loop_andor
    ldmfd   sp!, {r4-r6}
    bx      lr


.global spend_cycles @ c

@ spend_cycles(c)
@ Busy-wait delay. In: r0 = c, the number of cycles to burn.
@ The count is divided by 4 (the loop is budgeted at 4 cycles per pass)
@ and reduced by 2 passes to account for entry/exit/init overhead.
@ The loop body always runs at least once; it ends as soon as the counter
@ goes negative. Clobbers: r0, flags.
spend_cycles:
    mov     r0, r0, lsr #2          @ cycles -> loop passes
    sub     r0, r0, #2              @ overhead allowance
0:
    subs    r0, r0, #1
    bpl     0b                      @ keep spinning while counter >= 0

    bx      lr


.global memset32 @ int *dest, int c, int count

@ memset32(int *dest, int c, int count)
@ Fills count 32-bit words starting at dest with the value c.
@ In:   r0 = dest  (word stores, advanced past the filled area)
@       r1 = c
@       r2 = count, in words; a negative count skips the 4-word loop and
@            only the (count & 3) tail stores below are performed
@ Clobbers: r0, r2, r3, r12, flags (lr is saved and restored)
memset32:
    str     lr, [sp, #-4]!          @ lr doubles as the 4th fill register

    subs    r2, r2, #4              @ at least one group of 4 words?
    mov     r3, r1                  @ (mov does not disturb the flags)
    bmi     mst32_tail

    mov     lr, r1
    mov     r12,r1

mst32_quad:
    stmia   r0!, {r1,r3,r12,lr}     @ 4 words per pass
    subs    r2, r2, #4
    bpl     mst32_quad

mst32_tail:
    @ r2 is now negative, but its two low bits still equal count & 3
    tst     r2, #2
    stmneia r0!, {r1,r3}            @ 2 leftover words

    tst     r2, #1
    strne   r1, [r0], #4            @ 1 leftover word

    ldr     lr, [sp], #4
    bx      lr


@ warning: this code relies on palette being strictly RGB555, i.e. bit5=0
.global soft_scale @ void *dst, unsigned short *pal, int line_offs, int lines

@ void soft_scale(void *dst, unsigned short *pal, int line_offs, int lines)
@ Palette-converts and horizontally stretches 8-bit indexed pixels to
@ 16-bit pixels: every 8 source pixels become 10 destination pixels, so the
@ 256 source pixels handled per line become 320 destination pixels.
@ Source and destination are both addressed relative to dst (source rows
@ are 320 bytes apart, destination rows 640 bytes apart), and the data is
@ walked from the end towards the start.
@ In:   r0 = dst, r1 = pal, r2 = line_offs (first line), r3 = lines
@ NOTE(review): lines == 0 is not guarded (the outer loop tests for zero
@ only after decrementing) and the per-line counter shares r2 bits 31:24,
@ so lines is assumed to be in 1..2^24-1 - confirm with callers.
@ Register roles inside the loops:
@   r0 = dst write pointer (pre-decremented by stmdb)
@   r1 = src read pointer  (pre-decremented by ldr)
@   r2 = [31:24] 8-pixel groups left in this line, low bits = lines left
@   r3 = pal base, lr = 0x1fe mask (byte index * 2 = halfword offset)
@   r9 = 0x39e7; r9<<2 = 0xe79c selects the top 3 bits of each colour
@        field, so (pix & (r9<<2)) >> 2 is used as "1/4 of pix"
soft_scale:
    stmfd   sp!,{r4-r11,lr}
    mov     lr, #0xff
    mov     lr, lr, lsl #1
    mov     r9, #0x3900        @ f800 07e0 001f | e000 0780 001c | 3800 01e0 0007
    orr     r9, r9, #0x00e7

    mov     r11,r3             @ r11= line counter
    mov     r3, r1             @ r3 = pal base

    mov     r12,#320
    mul     r2, r12,r2         @ r2 = line_offs * 320
    add     r4, r0, r2, lsl #1 @ r4 = dst_start
    add     r5, r0, r2         @ r5 = src_start
    mul     r12,r11,r12        @ r12 = lines * 320
    add     r0, r4, r12,lsl #1 @ r0 = dst_end
    add     r1, r5, r12        @ r1 = src_end

    mov     r2, r11

soft_scale_loop:
    @ start of one line (processed right to left)
    sub     r1, r1, #64        @ skip borders
    orr     r2, r2, #(256/8-1)<<24

soft_scale_loop_line:
    @ fetch 8 source pixels: r12 = pix_s 0..3, r7 = pix_s 4..7
    ldr     r12, [r1, #-8]!
    ldr     r7,  [r1, #4]

    @ pix_d 0 = pix_s 0, pix_d 1 = 1/4 pix_s 0 + 3/4 pix_s 1
    and     r4, lr, r12,lsl #1
    ldrh    r4, [r3, r4]
    and     r5, lr, r12,lsr #7
    ldrh    r5, [r3, r5]
    and     r11,r4, r9, lsl #2
    orr     r4, r4, r11,lsl #14       @ r4[31:16] = 1/4 pix_s 0
    and     r11,r5, r9, lsl #2
    sub     r6, r5, r11,lsr #2        @ r6 = 3/4 pix_s 1
    add     r4, r4, r6, lsl #16       @ pix_d 0, 1
    and     r6, lr, r12,lsr #15
    ldrh    r6, [r3, r6]
    and     r12,lr, r12,lsr #23
    ldrh    r12,[r3, r12]

    @ r5 = pix_s 1 and pix_s 2 added and halved: both are rotated right by
    @ 11, added, and the sum rotated by a further 22 (33 in total, i.e. one
    @ bit right). The bits that spill between fields are cleared and the
    @ carry out of the addition supplies the top green bit.
    mov     r11,r6, ror #11
    adds    r5, r11,r5, ror #11
    mov     r5, r5, ror #22
    bic     r5, r5, #0xff000000
    bic     r5, r5, #0x0420           @ set the green bits as they should be
    orrcs   r5, r5, #0x0400

    and     r11,r6, r9, lsl #2
    sub     r6, r6, r11,lsr #2        @ r6 = 3/4 pix_s 2
    orr     r5, r5, r6, lsl #16

    @ pix_d 3 = 3/4 pix_s 2 + 1/4 pix_s 3; pix_d 4 = pix_s 3, pix_d 5 = pix_s 4
    and     r6, lr, r7, lsl #1
    ldrh    r6, [r3, r6]
    and     r11,r12,r9, lsl #2
    add     r5, r5, r11,lsl #14       @ pix_d 2, 3
    orr     r6, r12,r6, lsl #16       @ pix_d 4, 5

    @ second source word: the "pix_s N" numbers in the comments below count
    @ from the start of r7, i.e. they are pix_s 4..7 of this group
    and     r12,lr, r7, lsr #7
    ldrh    r12,[r3, r12]
    and     r10,lr, r7, lsr #15
    ldrh    r10,[r3, r10]
    and     r11,r12,r9, lsl #2
    sub     r8, r12,r11,lsr #2        @ r8 = 3/4 pix_s 1
    and     r11,r6, r9, lsl #18       @ top field bits of r6[31:16]
    add     r8, r8, r11,lsr #18       @ += 1/4 of the pixel in r6[31:16]
    and     r7, lr, r7, lsr #23
    ldrh    r7, [r3, r7]

    @ same rotate/add/halve sequence as above; no top-byte clear is needed
    @ here because the result is shifted left by 16 when merged into r8
    mov     r11,r10,ror #11
    adds    r12,r11,r12,ror #11
    mov     r12,r12,ror #22
    bic     r12,r12,#0x0420
    orrcs   r12,r12,#0x0400
    orr     r8, r8, r12,lsl #16       @ pix_d 6, 7

    and     r11,r10,r9, lsl #2
    sub     r10,r10,r11,lsr #2        @ r10= 3/4 pix_s 2
    and     r11,r7, r9, lsl #2
    add     r10,r10,r11,lsr #2        @ += 1/4 pix_s 3
    orr     r10,r10,r7, lsl #16       @ pix_d 8, 9

    subs    r2, r2, #1<<24            @ one 8-pixel group done

    @ write the 10 destination pixels (5 words); stmdb leaves the flags
    @ from the subs above intact for the bpl
    stmdb   r0!, {r4,r5,r6,r8,r10}
    bpl     soft_scale_loop_line

    add     r2, r2, #1<<24            @ undo the final group-counter underflow
    subs    r2, r2, #1                @ one line done
    bne     soft_scale_loop

    ldmfd   sp!,{r4-r11,lr}
    bx      lr


@ void do_clut(unsigned short *dst, unsigned char *src, unsigned short *pal, int pixels);
@ Colour look-up: dst[i] = pal[src[i]], 8 pixels per iteration.
@ In:   r0 = dst     written with stmia, so word alignment is expected
@       r1 = src     read with ldmia, so word alignment is expected
@       r2 = pal     table of 16-bit entries indexed by the source byte
@       r3 = pixels  only pixels / 8 whole groups are converted; the
@                    trailing pixels % 8 are left untouched
@ The lowest byte of each source word becomes the low halfword of the
@ first output word, i.e. pixel order follows memory order on a
@ little-endian configuration.
@ Clobbers: r0-r3, r12, flags (r4-r8 and lr are saved and restored)

.global do_clut

do_clut:
    movs    r3, r3, lsr #3          @ r3 = group count, Z set when pixels < 8
    bxeq    lr                      @ no whole group: return before the
                                    @ count-down loop can wrap below zero
    stmfd   sp!,{r4-r8,lr}
    mov     lr, #0xff
    mov     lr, lr, lsl #1          @ lr = 0x1fe: byte index * 2 mask

do_clut_loop:
    ldmia   r1!,{r4,r5}             @ 8 source pixels

    @ first source word -> r6 (pixels 0,1) and r7 (pixels 2,3)
    and     r6, lr, r4, lsl #1
    and     r7, lr, r4, lsr #7
    ldrh    r6, [r2, r6]
    and     r8, lr, r4, lsr #15
    ldrh    r7, [r2, r7]
    and     r4, lr, r4, lsr #23
    ldrh    r8, [r2, r8]
    ldrh    r4, [r2, r4]

    orr     r6, r6, r7, lsl #16
    and     r12,lr, r5, lsl #1
    orr     r7, r8, r4, lsl #16

    @ second source word -> r8 (pixels 4,5) and r12 (pixels 6,7)
    and     r8, lr, r5, lsr #7
    ldrh    r12, [r2, r12]
    and     r4, lr, r5, lsr #15
    ldrh    r8, [r2, r8]
    and     r5, lr, r5, lsr #23
    ldrh    r4, [r2, r4]
    ldrh    r5, [r2, r5]
    orr     r8, r12,r8, lsl #16
    orr     r12,r4, r5, lsl #16

    stmia   r0!,{r6,r7,r8,r12}      @ 8 destination pixels
    subs    r3, r3, #1
    bne     do_clut_loop

    ldmfd   sp!,{r4-r8,lr}
    bx      lr
Commit	Line	Data
	1	@ vim:filetype=armasm
	2
	3	@ Assembly optimized routines for gpfce - FCE Ultra port
	4	@ (c) Copyright 2007, Grazvydas "notaz" Ignotas
	5
	6	@ test
	7	.global flushcache @ beginning_addr, end_addr, flags
	8
	9	flushcache:
	10	swi #0x9f0002
	11	mov pc, lr
	12
	13
	14	.global block_or @ void *src, size_t n, int pat
	15
	16	block_or:
	17	stmfd sp!, {r4-r5}
	18	orr r2, r2, r2, lsl #8
	19	orr r2, r2, r2, lsl #16
	20	mov r1, r1, lsr #4
	21	block_loop_or:
	22	ldmia r0, {r3-r5,r12}
	23	subs r1, r1, #1
	24	orr r3, r3, r2
	25	orr r4, r4, r2
	26	orr r5, r5, r2
	27	orr r12,r12,r2
	28	stmia r0!, {r3-r5,r12}
	29	bne block_loop_or
	30	ldmfd sp!, {r4-r5}
	31	bx lr
	32
	33
	34	.global block_and @ void *src, size_t n, int andpat
	35
	36	block_and:
	37	stmfd sp!, {r4-r5}
	38	orr r2, r2, r2, lsl #8
	39	orr r2, r2, r2, lsl #16
	40	mov r1, r1, lsr #4
	41	block_loop_and:
	42	ldmia r0, {r3-r5,r12}
	43	subs r1, r1, #1
	44	and r3, r3, r2
	45	and r4, r4, r2
	46	and r5, r5, r2
	47	and r12,r12,r2
	48	stmia r0!, {r3-r5,r12}
	49	bne block_loop_and
	50	ldmfd sp!, {r4-r5}
	51	bx lr
	52
	53
	54	.global block_andor @ void *src, size_t n, int andpat, int orpat
	55
	56	block_andor:
	57	stmfd sp!, {r4-r6}
	58	orr r2, r2, r2, lsl #8
	59	orr r2, r2, r2, lsl #16
	60	orr r3, r3, r3, lsl #8
	61	orr r3, r3, r3, lsl #16
	62	mov r1, r1, lsr #4
	63	block_loop_andor:
	64	ldmia r0, {r4-r6,r12}
	65	subs r1, r1, #1
	66	and r4, r4, r2
	67	orr r4, r4, r3
	68	and r5, r5, r2
	69	orr r5, r5, r3
	70	and r6, r6, r2
	71	orr r6, r6, r3
	72	and r12,r12,r2
	73	orr r12,r12,r3
	74	stmia r0!, {r4-r6,r12}
	75	bne block_loop_andor
	76	ldmfd sp!, {r4-r6}
	77	bx lr
	78
	79
	80	.global spend_cycles @ c
	81
	82	spend_cycles:
	83	mov r0, r0, lsr #2 @ 4 cycles/iteration
	84	sub r0, r0, #2 @ entry/exit/init
	85	.sc_loop:
	86	subs r0, r0, #1
	87	bpl .sc_loop
	88
	89	bx lr
	90
	91
	92	.global memset32 @ int *dest, int c, int count
	93
	94	memset32:
	95	stmfd sp!, {lr}
	96
	97	mov r3, r1
	98	subs r2, r2, #4
	99	bmi mst32_fin
	100
	101	mov r12,r1
	102	mov lr, r1
	103
	104	mst32_loop:
	105	subs r2, r2, #4
	106	stmia r0!, {r1,r3,r12,lr}
	107	bpl mst32_loop
	108
	109	mst32_fin:
	110	tst r2, #1
	111	strne r1, [r0], #4
	112
	113	tst r2, #2
	114	stmneia r0!, {r1,r3}
	115
	116	ldmfd sp!, {lr}
	117	bx lr
	118
	119
	120	@ warning: this code relies on palette being strictly RGB555, i.e. bit5=0
	121	.global soft_scale @ void dst, unsigned short pal, int line_offs, int lines
	122
	123	soft_scale:
	124	stmfd sp!,{r4-r11,lr}
	125	mov lr, #0xff
	126	mov lr, lr, lsl #1
	127	mov r9, #0x3900 @ f800 07e0 001f \| e000 0780 001c \| 3800 01e0 0007
	128	orr r9, r9, #0x00e7
	129
	130	mov r11,r3 @ r11= line counter
	131	mov r3, r1 @ r3 = pal base
	132
	133	mov r12,#320
	134	mul r2, r12,r2
	135	add r4, r0, r2, lsl #1 @ r4 = dst_start
	136	add r5, r0, r2 @ r5 = src_start
	137	mul r12,r11,r12
	138	add r0, r4, r12,lsl #1 @ r0 = dst_end
	139	add r1, r5, r12 @ r1 = src_end
	140
	141	mov r2, r11
	142
	143	soft_scale_loop:
	144	sub r1, r1, #64 @ skip borders
	145	orr r2, r2, #(256/8-1)<<24
	146
	147	soft_scale_loop_line:
	148	ldr r12, [r1, #-8]!
	149	ldr r7, [r1, #4]
	150
	151	and r4, lr, r12,lsl #1
	152	ldrh r4, [r3, r4]
	153	and r5, lr, r12,lsr #7
	154	ldrh r5, [r3, r5]
	155	and r11,r4, r9, lsl #2
	156	orr r4, r4, r11,lsl #14 @ r4[31:16] = 1/4 pix_s 0
	157	and r11,r5, r9, lsl #2
	158	sub r6, r5, r11,lsr #2 @ r6 = 3/4 pix_s 1
	159	add r4, r4, r6, lsl #16 @ pix_d 0, 1
	160	and r6, lr, r12,lsr #15
	161	ldrh r6, [r3, r6]
	162	and r12,lr, r12,lsr #23
	163	ldrh r12,[r3, r12]
	164
	165	mov r11,r6, ror #11
	166	adds r5, r11,r5, ror #11
	167	mov r5, r5, ror #22
	168	bic r5, r5, #0xff000000
	169	bic r5, r5, #0x0420 @ set the green bits as they should be
	170	orrcs r5, r5, #0x0400
	171
	172	and r11,r6, r9, lsl #2
	173	sub r6, r6, r11,lsr #2 @ r6 = 3/4 pix_s 2
	174	orr r5, r5, r6, lsl #16
	175
	176	and r6, lr, r7, lsl #1
	177	ldrh r6, [r3, r6]
	178	and r11,r12,r9, lsl #2
	179	add r5, r5, r11,lsl #14 @ pix_d 2, 3
	180	orr r6, r12,r6, lsl #16 @ pix_d 4, 5
	181
	182	and r12,lr, r7, lsr #7
	183	ldrh r12,[r3, r12]
	184	and r10,lr, r7, lsr #15
	185	ldrh r10,[r3, r10]
	186	and r11,r12,r9, lsl #2
	187	sub r8, r12,r11,lsr #2 @ r8 = 3/4 pix_s 1
	188	and r11,r6, r9, lsl #18
	189	add r8, r8, r11,lsr #18
	190	and r7, lr, r7, lsr #23
	191	ldrh r7, [r3, r7]
	192
	193	mov r11,r10,ror #11
	194	adds r12,r11,r12,ror #11
	195	mov r12,r12,ror #22
	196	bic r12,r12,#0x0420
	197	orrcs r12,r12,#0x0400
	198	orr r8, r8, r12,lsl #16 @ pix_d 6, 7
	199
	200	and r11,r10,r9, lsl #2
	201	sub r10,r10,r11,lsr #2 @ r10= 3/4 pix_s 2
	202	and r11,r7, r9, lsl #2
	203	add r10,r10,r11,lsr #2 @ += 1/4 pix_s 3
	204	orr r10,r10,r7, lsl #16 @ pix_d 8, 9
	205
	206	subs r2, r2, #1<<24
	207
	208	stmdb r0!, {r4,r5,r6,r8,r10}
	209	bpl soft_scale_loop_line
	210
	211	add r2, r2, #1<<24
	212	subs r2, r2, #1
	213	bne soft_scale_loop
	214
	215	ldmfd sp!,{r4-r11,lr}
	216	bx lr
	217
	218
	219	@ void do_clut(unsigned short dst, unsigned char src, unsigned short *pal, int pixels);
	220
	221	.global do_clut
	222
	223	do_clut:
	224	stmfd sp!,{r4-r8,lr}
	225	mov lr, #0xff
	226	mov lr, lr, lsl #1
	227
	228	mov r3, r3, lsr #3
	229
	230	do_clut_loop:
	231	ldmia r1!,{r4,r5}
	232
	233	and r6, lr, r4, lsl #1
	234	and r7, lr, r4, lsr #7
	235	ldrh r6, [r2, r6]
	236	and r8, lr, r4, lsr #15
	237	ldrh r7, [r2, r7]
	238	and r4, lr, r4, lsr #23
	239	ldrh r8, [r2, r8]
	240	ldrh r4, [r2, r4]
	241
	242	orr r6, r6, r7, lsl #16
	243	and r12,lr, r5, lsl #1
	244	orr r7, r8, r4, lsl #16
	245	and r8, lr, r5, lsr #7
	246	ldrh r12, [r2, r12]
	247	and r4, lr, r5, lsr #15
	248	ldrh r8, [r2, r8]
	249	and r5, lr, r5, lsr #23
	250	ldrh r4, [r2, r4]
	251	ldrh r5, [r2, r5]
	252	orr r8, r12,r8, lsl #16
	253	orr r12,r4, r5, lsl #16
	254
	255	stmia r0!,{r6,r7,r8,r12}
	256	subs r3, r3, #1
	257	bne do_clut_loop
	258
	259	ldmfd sp!,{r4-r8,lr}
	260	bx lr
	261