+ mov r8, #X_CLIP_MAX @ clip-flag constants, used below as AND masks
+ mov r9, #Y_CLIP_MAX
+ vmov d22, r8, r9 @ d22 = { X max flag, Y max flag }
+ vmul.f32 q0, q0, d4[1] @ .x .y .z .w *= [0] .w
+ vmul.f32 q1, q3, d4[0] @ same scaling for the other vertex: q1 = q3 * d4[0]
+ vshr.u64 d5, d4, #32 @ [0] .w moved to the low lane of d5
+ mov r8, #X_CLIP_MIN
+ mov r9, #Y_CLIP_MIN
+ vmov d23, r8, r9 @ d23 = { X min flag, Y min flag }, so q11 = d22:d23
+ vsli.64 d3, d4, #32 @ insert [1] .w
+ vsli.64 d1, d5, #32 @ insert [0] .w
+ vsli.u64 d5, d4, #32 @ [0] [1] .w
+ vcgt.f32 d6, d0, d20 @ .xy > 1.0? (d20 is set before this chunk; presumably 1.0 - confirm)
+ vcgt.f32 d7, d21, d0 @ d21 > .xy? (d21 is set before this chunk; presumably -1.0 - confirm)
+ vcgt.f32 d4, d5, #0 @ .w > 0?
+ vst1.32 {q0,q1}, [r1]! @ wrt g_vecProjected
+ vcgt.f32 d8, d2, d20 @ same two tests for [1] .xy
+ vcgt.f32 d9, d21, d2
+ vld1.32 d0[0], [r4]! @ mem: [0] .azyx (d0 is free after the store above; r4 += 4)
+ vand q3, q11 @ keep one flag per failed test: d6 &= max flags, d7 &= min flags
+ vand q4, q11 @ same for [1] (d8, d9)
+ vorr d6, d6, d7 @ d6 = [0] { x flags, y flags }
+ vorr d7, d8, d9 @ d7 = [1] { x flags, y flags }
+ vld1.32 d0[1], [r5]! @ mem: [1] .azyx (r5 += 4)
+ vpadd.u32 d6, d7 @ d6 = { [0] x+y flags, [1] x+y flags }; add acts as OR if the flag bits are distinct - confirm
+ vrev32.8 d0, d0 @ make 0xaazzyyxx [1][0]
+ vsli.u64 d1, d3, #32 @ d1 = [1] [0] .z (upper lane taken from the low lane of d3)
+ vmovl.s8 q4, d0 @ sign-extend bytes to s16: d8 = [0], d9 = [1]
+ vand d6, d4 @ clear the flags of a vertex whose .w is not > 0
+ vmovl.s16 q1, d8 @ q1 = [0] as s32
+ vmovl.s16 q2, d9 @ q2 = [1] as s32
+ vst1.32 {d6}, [r6]! @ g_clipFlag2
+
+ tst r10, #PV_NEON_ENABLE_LIGHT @ lighting bit set in r10?
+ beq pv_neon_no_light @ no: color comes from the vertex or the primitive color
+@ pv_neon_light: transform both normals, normalize them, sum the lights, pack colors into d0
+ @ live NEON registers:
+ @ d1 = [1][0] .z (must preserve)
+ @ q1,q2 = azyx [1][0]
+ @ q12+ = gRSPworldProject
+ ldr r12, [sp, #0x64+0x20] @ stack argument: pointer to the three rows loaded below
+ vcvt.f32.s32 q1, q1 @ [0] to float
+ vcvt.f32.s32 q2, q2 @ [1] to float
+ vld1.32 {q8,q9}, [r12, :128]! @ load gRSPmodelViewTop
+ vld1.32 {q10}, [r12, :128] @ third row; NOTE(review): overwrites d20/d21 used by the clip tests - confirm they are reloaded per iteration
+
+ vdup.32 q5, d4[0] @ [1] .x (dup)
+ vdup.32 q6, d4[1] @ [1] .y (dup)
+ vdup.32 q7, d5[0] @ [1] .z (dup)
+ vdup.32 q2, d2[0] @ [0] .x (dup)
+ vdup.32 q3, d2[1] @ [0] .y (dup)
+ vdup.32 q4, d3[0] @ [0] .z (dup)
+ vmul.f32 q2, q2, q8 @ [0] x * row 0
+ vmul.f32 q5, q5, q8 @ [1] x * row 0
+ vmla.f32 q2, q3, q9 @ [0] += y * row 1
+ vmla.f32 q5, q6, q9 @ [1] += y * row 1
+ vmul.f32 q4, q4, q10 @ [0] z * row 2
+ vmul.f32 q7, q7, q10 @ [1] z * row 2
+ vadd.f32 q4, q2 @ q4 = temp[0] .xyz0
+ vadd.f32 q5, q7 @ q5 = temp[1] .xyz0
+ vmul.f32 q2, q4, q4 @ temp .xyz0 ^2
+ vmul.f32 q3, q5, q5
+ vpadd.f32 d2, d4, d5 @ pairwise sums of the [0] squares
+ vpadd.f32 d3, d6, d7 @ pairwise sums of the [1] squares
+ movw r8, #0x0000ffff
+ movt r8, #0x7f7f @ max normal float, ~3.4e+38
+ vdup.32 d4, r8
+ vpadd.f32 d2, d2, d3 @ d2 = [1][0] x^2 + y^2 + z^2
+ vcgt.f32 d5, d2, #0 @ d5 = mask of lanes with sum > 0
+ vbif d2, d4, d5 @ if (d2 == 0) d2 = MAXFLOAT
+
+ vrsqrte.f32 d3, d2 @ ~ 1/sqrt(d2), d2 = [1][0] .sqrsum
+ vmul.f32 d4, d3, d2 @ estimate * sqrsum, input to the refinement step
+ ldr r9, [sp, #0x64+0x18] @ &fRSPAmbientLightRGBA
+ ldr r7, [sp, #0x64+0x14] @ gRSPlights
+ ldr r8, [sp, #0x64+0x24] @ gRSPnumLights
+ vrsqrts.f32 d4, d3, d4 @ step
+ vld1.32 {q6}, [r9] @ rgb: q6 = [0] color accumulator, starts at ambient
+ vld1.32 {q7}, [r9] @ rgb: q7 = [1] color accumulator, same start value
+ vmul.f32 d3, d3, d4 @ 1/sqrt(d2)
+#if 0 /* not necessary? */
+ vmul.f32 d4, d3, d2
+ vrsqrts.f32 d4, d3, d4 @ step
+ vmul.f32 d3, d3, d4 @ 1/sqrt(d2)
+#endif
+ vmul.f32 q2, q4, d3[0] @ q2 = normal[0] .xyz
+ vmul.f32 q3, q5, d3[1] @ q3 = normal[1] .xyz
+
+1: @ per-light loop: r7 = current light, r8 = lights left
+ vld1.32 {q8}, [r7] @ first four floats of the light, dotted with both normals
+ vmul.f32 q4, q8, q2 @ gRSPlights[l] * normal
+ vmul.f32 q5, q8, q3
+ vpadd.f32 d8, d8, d9 @ horizontal add, [0]
+ vpadd.f32 d10, d10, d11 @ horizontal add, [1]
+ vpadd.f32 d8, d8, d10 @ d8 = [1][0] fCosT
+ vcgt.f32 d9, d8, #0 @ if (!(fCosT > 0))
+ vand d8, d9 @ fCosT = 0
+ add r9, r7, #OFFSETOF_Light_fr @ r9 reused: address of the light color
+ vld1.32 {q8}, [r9] @ .fr .fg .fb
+ vdup.32 q5, d8[1] @ [1] fCosT (dup)
+ vdup.32 q4, d8[0] @ [0] fCosT (dup)
+ vmla.f32 q7, q8, q5 @ .rgb += frgb * fCosT
+ vmla.f32 q6, q8, q4
+ add r7, #SIZEOF_Light @ next light
+ subs r8, #1
+ bgt 1b @ body runs at least once; exits with r8 <= 0
+
+ movt r8, #0x437f @ float 255; writes the top half only, low half is what the counter left (0 if the count started >= 1)
+ vdup.32 q8, r8
+ vcgt.f32 q4, q6, q8 @ if (.rgb > 255)
+ vcgt.f32 q5, q7, q8
+ vbit q6, q8, q4 @ .rgb = 255
+ vbit q7, q8, q5
+ vcvt.u32.f32 q6, q6 @ float -> u32
+ vcvt.u32.f32 q7, q7
+ ldrb r8, [r4, #-4] @ .a from vtx (r4 was post-incremented by 4 when the word was loaded)
+ ldrb r9, [r5, #-4]
+ vext.32 q4, q6, q6, #3 @ reg: .abgr -> .bgra
+ vext.32 q5, q7, q7, #3
+ vmov.32 d8[0], r8 @ use .a from input
+ vmov.32 d10[0], r9
+ vmovn.u32 d8, q4 @ narrow 32 -> 16 bit
+ vmovn.u32 d10, q5
+ vmovn.u16 d0, q4 @ narrow 16 -> 8 bit; only the low 32 bits of d0 are meaningful
+ vmovn.u16 d2, q5 @ same for [1], low 32 bits of d2
+ vsli.u64 d0, d2, #32 @ d0 = [1] [0] packed colors
+ vrev32.8 d0, d0 @ 0xbbggrraa -> 0xaarrggbb
+ b pv_neon_fog_alpha
+
+pv_neon_no_light: @ lighting off: d0 = primitive color, or the input vertex colors if shading is on
+ tst r10, #PV_NEON_ENABLE_SHADE
+ vldr d0, [sp, #0x64+0x2c] @ primitiveColor [0] [1]; vldr leaves the flags from tst intact
+ beq pv_neon_fog_alpha @ shade off: keep the primitive color
+ @ shade on: take the color word of each input vertex; easier to do with ARM
+ ldr r8, [r4, #-4] @ r4/r5 were post-incremented by 4 when the word was loaded, hence -4
+ ldr r9, [r5, #-4]
+ ror r8, #8 @ mem: .argb -> .rgba
+ ror r9, #8 @ reg: 0xbbggrraa -> ..
+ vmov d0, r8, r9 @ d0 = { [0] color, [1] color }
+
+pv_neon_fog_alpha: @ if enabled, replace the alpha byte of both colors in d0 with .z clamped to 0..1 and scaled to 0..255
+ tst r10, #PV_NEON_FOG_ALPHA
+ beq pv_neon_next @ disabled: store d0 unchanged
+ vmov.f32 d20, #1.0 @ d20 = 1.0 in both lanes
+ vcgt.f32 d2, d1, d20 @ [0] [1] .z > 1.0?
+ vcgt.f32 d3, d1, #0 @ > 0?
+ movw r8, #0
+ movt r8, #0x4f7f @ r8 = (float)(255<<24)
+ vbit d1, d20, d2 @ make 1.0 if needed
+ vand d1, d3 @ make 0 if not > 0
+ vdup.32 d4, r8
+ vmul.f32 d1, d1, d4 @ scale so the result lands in the top byte
+ vcvt.u32.f32 d1, d1 @ float -> u32; only the top byte is used below
+ vmov.u32 d5, #0xff000000 @ mask selecting the alpha byte
+ vbit d0, d1, d5 @ d0 alpha byte = fog alpha, low 24 bits unchanged
+
+pv_neon_next: @ store both colors and advance to the next vertex pair
+ subs r11, #2 @ two vertices done; flags are consumed by bgt below (vst1 and add leave them intact)
+ vst1.32 {d0}, [r2]! @ g_dwVtxDifColor
+ add r4, #16 @ on top of the 4 bytes already added by the post-incremented color load
+ add r5, #16
+ bgt 0b @ loop while r11 > 0; label 0 is above this chunk
+ nop