New, separate GPU plugin based on Unai. (#233)
authorgameblabla <gameblabla@users.noreply.github.com>
Fri, 29 Oct 2021 20:03:27 +0000 (20:03 +0000)
committerGitHub <noreply@github.com>
Fri, 29 Oct 2021 20:03:27 +0000 (23:03 +0300)
This new plugin is based on Unai but is more accurate and fixes a few issues.
According to some tests on real hardware :
fps      old new
spyro1   130 112
tekken3   95  68
nfs3     107  91

Because of this, it was decided to make it separate from the "Old" Unai.

Note that this doesn't have the threading changes from libretro's fork yet :
this will be for another PR.

Co-authored-by: negativeExponent <negativeExponent@users.noreply.github.com>
Co-authored-by: Justin Weiss <justin@justinweiss.com>
Co-authored-by: senquack <dansilsby@gmail.com>
31 files changed:
Makefile
configure
frontend/main.c
frontend/menu.c
frontend/plugin_lib.h
plugins/gpu_senquack/Makefile [new file with mode: 0644]
plugins/gpu_senquack/README_senquack.txt [new file with mode: 0644]
plugins/gpu_senquack/debug.h [new file with mode: 0644]
plugins/gpu_senquack/gpu.cpp [new file with mode: 0644]
plugins/gpu_senquack/gpu.h [new file with mode: 0644]
plugins/gpu_senquack/gpu_arm.S [new file with mode: 0644]
plugins/gpu_senquack/gpu_arm.h [new file with mode: 0644]
plugins/gpu_senquack/gpu_blit.h [new file with mode: 0644]
plugins/gpu_senquack/gpu_command.h [new file with mode: 0644]
plugins/gpu_senquack/gpu_fixedpoint.h [new file with mode: 0644]
plugins/gpu_senquack/gpu_inner.h [new file with mode: 0644]
plugins/gpu_senquack/gpu_inner_blend.h [new file with mode: 0644]
plugins/gpu_senquack/gpu_inner_blend_arm.h [new file with mode: 0644]
plugins/gpu_senquack/gpu_inner_blend_arm5.h [new file with mode: 0644]
plugins/gpu_senquack/gpu_inner_blend_arm7.h [new file with mode: 0644]
plugins/gpu_senquack/gpu_inner_light.h [new file with mode: 0644]
plugins/gpu_senquack/gpu_inner_light_arm.h [new file with mode: 0644]
plugins/gpu_senquack/gpu_inner_quantization.h [new file with mode: 0644]
plugins/gpu_senquack/gpu_raster_image.h [new file with mode: 0644]
plugins/gpu_senquack/gpu_raster_line.h [new file with mode: 0644]
plugins/gpu_senquack/gpu_raster_polygon.h [new file with mode: 0644]
plugins/gpu_senquack/gpu_raster_sprite.h [new file with mode: 0644]
plugins/gpu_senquack/gpu_senquack.h [new file with mode: 0644]
plugins/gpu_senquack/gpulib_if.cpp [new file with mode: 0644]
plugins/gpu_senquack/port.h [new file with mode: 0644]
plugins/gpu_senquack/profiler.h [new file with mode: 0644]

index 18ef4e0..0998f58 100644 (file)
--- a/Makefile
+++ b/Makefile
@@ -129,6 +129,15 @@ plugins/gpu_unai/gpulib_if.o: CFLAGS += -DREARMED -O3
 CC_LINK = $(CXX)
 endif
 
+ifeq "$(BUILTIN_GPU)" "senquack"
+OBJS += plugins/gpu_senquack/gpulib_if.o
+ifeq "$(ARCH)" "arm"
+OBJS += plugins/gpu_senquack/gpu_arm.o
+endif
+plugins/gpu_senquack/gpulib_if.o: CFLAGS += -DREARMED -O3 
+CC_LINK = $(CXX)
+endif
+
 # cdrcimg
 OBJS += plugins/cdrcimg/cdrcimg.o
 ifeq "$(CHD_SUPPORT)" "1"
index 5caf0f4..20ff1d5 100755 (executable)
--- a/configure
+++ b/configure
@@ -39,12 +39,12 @@ check_define_val()
 
 platform_list="generic pandora maemo caanoo libretro"
 platform="generic"
-builtin_gpu_list="peops unai neon"
+builtin_gpu_list="peops unai neon senquack"
 builtin_gpu=""
 sound_driver_list="oss alsa pulseaudio sdl libretro"
 sound_drivers=""
 plugins="plugins/spunull/spunull.so \
-plugins/dfxvideo/gpu_peops.so plugins/gpu_unai/gpu_unai.so"
+plugins/dfxvideo/gpu_peops.so plugins/gpu_unai/gpu_unai.so plugins/gpu_senquack/gpu_senquack.so"
 ram_fixed="no"
 drc_cache_base="no"
 have_armv5=""
index 3bb0f4b..4631618 100644 (file)
@@ -130,6 +130,12 @@ void emu_set_default_config(void)
        pl_rearmed_cbs.gpu_neon.enhancement_no_main = 0;
        pl_rearmed_cbs.gpu_peops.iUseDither = 0;
        pl_rearmed_cbs.gpu_peops.dwActFixes = 1<<7;
+       pl_rearmed_cbs.gpu_senquack.ilace_force = 0;
+       pl_rearmed_cbs.gpu_senquack.pixel_skip = 0;
+       pl_rearmed_cbs.gpu_senquack.lighting = 1;
+       pl_rearmed_cbs.gpu_senquack.fast_lighting = 0;
+       pl_rearmed_cbs.gpu_senquack.blending = 1;
+       pl_rearmed_cbs.gpu_senquack.dithering = 0;
        pl_rearmed_cbs.gpu_unai.abe_hack =
        pl_rearmed_cbs.gpu_unai.no_light =
        pl_rearmed_cbs.gpu_unai.no_blend = 0;
index e2286d4..05dde46 100644 (file)
@@ -430,6 +430,13 @@ static const struct {
        CE_INTVAL_P(gpu_unai.abe_hack),
        CE_INTVAL_P(gpu_unai.no_light),
        CE_INTVAL_P(gpu_unai.no_blend),
+       CE_INTVAL_P(gpu_senquack.ilace_force),
+       CE_INTVAL_P(gpu_senquack.pixel_skip),
+       CE_INTVAL_P(gpu_senquack.lighting),
+       CE_INTVAL_P(gpu_senquack.fast_lighting),
+       CE_INTVAL_P(gpu_senquack.blending),
+       CE_INTVAL_P(gpu_senquack.dithering),
+       CE_INTVAL_P(gpu_senquack.scale_hires),
        CE_INTVAL_P(gpu_neon.allow_interlace),
        CE_INTVAL_P(gpu_neon.enhancement_enable),
        CE_INTVAL_P(gpu_neon.enhancement_no_main),
@@ -1378,6 +1385,25 @@ static int menu_loop_plugin_gpu_unai(int id, int keys)
        return 0;
 }
 
+static menu_entry e_menu_plugin_gpu_senquack[] = /* settings menu for the gpu_senquack plugin */
+{
+       mee_onoff     ("Interlace",                  0, pl_rearmed_cbs.gpu_senquack.ilace_force, 1),
+       mee_onoff     ("Dithering",                  0, pl_rearmed_cbs.gpu_senquack.dithering, 1),
+       mee_onoff     ("Lighting",                   0, pl_rearmed_cbs.gpu_senquack.lighting, 1),
+       mee_onoff     ("Fast lighting",              0, pl_rearmed_cbs.gpu_senquack.fast_lighting, 1),
+       mee_onoff     ("Blending",                   0, pl_rearmed_cbs.gpu_senquack.blending, 1),
+       mee_onoff     ("Pixel skip",                 0, pl_rearmed_cbs.gpu_senquack.pixel_skip, 1),
+       mee_end, /* NOTE(review): gpu_senquack.scale_hires has no entry in this menu - confirm intended */
+};
+
+static int menu_loop_plugin_gpu_senquack(int id, int keys) /* menu handler; id and keys are not used */
+{
+       int cursor = 0; /* selection index handed to me_loop, starts at 0 */
+       me_loop(e_menu_plugin_gpu_senquack, &cursor);
+       return 0; /* always reports 0 to the caller */
+}
+
+
 static const char *men_gpu_dithering[] = { "None", "Game dependant", "Always", NULL };
 //static const char h_gpu_0[]            = "Needed for Chrono Cross";
 static const char h_gpu_1[]            = "Capcom fighting games";
@@ -1479,6 +1505,7 @@ static const char h_plugin_gpu[] =
 #endif
                                   "gpu_peops is Pete's soft GPU, slow but accurate\n"
                                   "gpu_unai is GPU from PCSX4ALL, fast but glitchy\n"
+                                  "gpu_senquack is more accurate but slower\n"
                                   "gpu_gles Pete's hw GPU, uses 3D chip but is glitchy\n"
                                   "must save config and reload the game if changed";
 static const char h_plugin_spu[] = "spunull effectively disables sound\n"
@@ -1486,6 +1513,7 @@ static const char h_plugin_spu[] = "spunull effectively disables sound\n"
 static const char h_gpu_peops[]  = "Configure P.E.Op.S. SoftGL Driver V1.17";
 static const char h_gpu_peopsgl[]= "Configure P.E.Op.S. MesaGL Driver V1.78";
 static const char h_gpu_unai[]   = "Configure Unai/PCSX4ALL Team GPU plugin";
+static const char h_gpu_senquack[]   = "Configure Unai/PCSX4ALL Senquack plugin";
 static const char h_spu[]        = "Configure built-in P.E.Op.S. Sound Driver V1.7";
 
 static menu_entry e_menu_plugin_options[] =
@@ -1498,6 +1526,7 @@ static menu_entry e_menu_plugin_options[] =
 #endif
        mee_handler_h ("Configure gpu_peops plugin",    menu_loop_plugin_gpu_peops, h_gpu_peops),
        mee_handler_h ("Configure gpu_unai GPU plugin", menu_loop_plugin_gpu_unai, h_gpu_unai),
+       mee_handler_h ("Configure gpu_senquack GPU plugin", menu_loop_plugin_gpu_senquack, h_gpu_senquack),
        mee_handler_h ("Configure gpu_gles GPU plugin", menu_loop_plugin_gpu_peopsgl, h_gpu_peopsgl),
        mee_handler_h ("Configure built-in SPU plugin", menu_loop_plugin_spu, h_spu),
        mee_end,
index 4a11002..f55eb44 100644 (file)
@@ -77,6 +77,15 @@ struct rearmed_cbs {
                int   no_light, no_blend;
                int   lineskip;
        } gpu_unai;
+       struct { /* settings for the gpu_senquack plugin */
+               int ilace_force;   /* "Interlace" toggle in the frontend menu */
+               int pixel_skip;    /* "Pixel skip" toggle in the frontend menu */
+               int lighting;      /* "Lighting" toggle in the frontend menu */
+               int fast_lighting; /* "Fast lighting" toggle in the frontend menu */
+               int blending;      /* "Blending" toggle in the frontend menu */
+               int dithering;     /* "Dithering" toggle in the frontend menu */
+               int scale_hires;   /* stored in the config file; NOTE(review): no menu entry or explicit default seen - confirm */
+       } gpu_senquack;
        struct {
                int   dwActFixes;
                int   bDrawDither, iFilterType, iFrameTexType;
diff --git a/plugins/gpu_senquack/Makefile b/plugins/gpu_senquack/Makefile
new file mode 100644 (file)
index 0000000..c3be35b
--- /dev/null
@@ -0,0 +1,19 @@
+CFLAGS += -ggdb -Wall -O3 -ffast-math
+CFLAGS += -DREARMED
+CFLAGS += -I../../include
+#CFLAGS += -DINLINE="static __inline__"
+#CFLAGS += -Dasm="__asm__ __volatile__"
+#CFLAGS += -DUSE_GPULIB=1
+
+include ../../config.mak
+
+SRC_STANDALONE += gpu.cpp
+SRC_GPULIB += gpulib_if.cpp
+
+ifeq "$(ARCH)" "arm"
+SRC += gpu_arm.S
+endif
+
+#BIN_STANDALONE = gpuPCSX4ALL.so
+BIN_GPULIB = gpu_senquack.so
+include ../gpulib/gpulib.mak
diff --git a/plugins/gpu_senquack/README_senquack.txt b/plugins/gpu_senquack/README_senquack.txt
new file mode 100644 (file)
index 0000000..cda17fc
--- /dev/null
@@ -0,0 +1,956 @@
+//NOTE: You can find the set of original Unai poly routines (disabled now)
+// at the bottom end of this file.
+
+//senquack - Original Unai GPU poly routines have been replaced with new
+// ones based on DrHell routines. The original routines suffered from
+// shifted rows, causing many quads to have their first triangle drawn
+// correctly, but the second triangle would randomly have pixels shifted
+// either left or right or entire rows not drawn at all. Furthermore,
+// sometimes entire triangles seemed to be either missing or only
+// partially drawn (most clearly seen in sky/road textures in NFS3,
+// clock tower in beginning of Castlevania SOTN). Pixel gaps were
+// prevalent.
+//
+// Since DrHell GPU didn't seem to exhibit these artifacts at all, I adapted
+// its routines to GPU Unai (Unai was probably already originally based on it).
+// DrHell uses 22.10 fixed point instead of Unai's 16.16, so gpu_fixedpoint.h
+// required modification as well as gpu_inner.h (where gpuPolySpanFn driver
+// functions are).
+//
+// Originally, I tried to patch up original Unai routines and got as far
+// as fixing the shifted rows, but still had other problem of triangles rendered
+// wrong (black triangular gaps in NFS3 sky, clock tower in Castlevania SOTN).
+// I eventually gave up. Even after rewriting/adapting the routines,
+// however, I still had some random pixel dropouts, specifically in
+// NFS3 sky texture. I discovered that gpu_inner.h gpuPolySpanFn function
+// was taking optimizations to an extreme and packing u/v texture coords
+// into one 32-bit word, reducing their accuracy. Only once they were
+// handled in full-accuracy individual words was that problem fixed.
+//
+// NOTE: I also added support for doing divisions using the FPU, either
+//  with normal division or multiplication-by-reciprocal.
+//  To use float division, GPU_UNAI_USE_FLOATMATH should be defined.
+//  To use float mult-by-reciprocal, GPU_UNAI_USE_FLOAT_DIV_MULTINV
+//   can be specified (GPU_UNAI_USE_FLOATMATH must also be specified)
+//  To use inaccurate fixed-point mult-by-reciprocal, define
+//   GPU_UNAI_USE_INT_DIV_MULTINV. This is the default on older
+//   ARM devices like Wiz/Caanoo that have neither integer division
+//   in hardware nor an FPU. It results in some pixel dropouts,
+//   texture glitches, but less than the original GPU UNAI code.
+//
+//  If nothing is specified, integer division will be used.
+//
+// NOTE 2: Even with MIPS32R2 having FPU recip.s instruction, and it is
+//  used when this platform is detected, I found it not to give any
+//  noticeable speedup over normal float division (in fact seemed a tiny
+//  tiny bit slower). I also found float division to not provide any
+//  noticeable speedups versus integer division on MIPS32R2 platform.
+//  Granted, the differences were all around .5 FPS or less.
+//
+// TODO:
+// * See if anything can be done about remaining pixel gaps in Gran
+//   Turismo car models, track.
+// * Find better way of passing parameters to gpuPolySpanFn functions than
+//   through original Unai method of using global variables u4,v4,du4 etc.
+// * Come up with some newer way of drawing rows of pixels than by calling
+//   gpuPolySpanFn through function pointer. For every row, at least on
+//   MIPS platforms, many registers are having to be pushed/popped from stack
+//   on each call, which is strange since MIPS has so many registers.
+// * MIPS MXU/ASM optimized gpuPolySpanFn ?
+
+//////////////////////////////////////////////////////////////////////////
+//senquack - Disabled original Unai poly routines left here for reference:
+// ( from gpu_raster_polygon.h )
+//////////////////////////////////////////////////////////////////////////
+#define GPU_TESTRANGE3() /* returns from the enclosing function when a negative vertex coordinate lies more than CHKMAX_X / CHKMAX_Y from another vertex; uses the caller's x0..x2, y0..y2 */ \
+{ \
+       if(x0<0) { if((x1-x0)>CHKMAX_X) return; if((x2-x0)>CHKMAX_X) return; } \
+       if(x1<0) { if((x0-x1)>CHKMAX_X) return; if((x2-x1)>CHKMAX_X) return; } \
+       if(x2<0) { if((x0-x2)>CHKMAX_X) return; if((x1-x2)>CHKMAX_X) return; } \
+       if(y0<0) { if((y1-y0)>CHKMAX_Y) return; if((y2-y0)>CHKMAX_Y) return; } \
+       if(y1<0) { if((y0-y1)>CHKMAX_Y) return; if((y2-y1)>CHKMAX_Y) return; } \
+       if(y2<0) { if((y0-y2)>CHKMAX_Y) return; if((y1-y2)>CHKMAX_Y) return; } \
+}
+
+/*----------------------------------------------------------------------
+F3 - triangle filled with the single color stored in PixelData; each row is drawn by gpuPolySpanDriver
+----------------------------------------------------------------------*/
+
+void gpuDrawF3(const PP gpuPolySpanDriver)
+{
+       const int li=linesInterlace;
+       const int pi=(progressInterlace?(linesInterlace+1):0);
+       const int pif=(progressInterlace?(progressInterlace_flag?(linesInterlace+1):0):1);
+       s32 temp;
+       s32 xa, xb, xmin, xmax;
+       s32 ya, yb, ymin, ymax;
+       s32 x0, x1, x2, x3, dx3=0, x4, dx4=0, dx;
+       s32 y0, y1, y2;
+
+       x0 = GPU_EXPANDSIGN(PacketBuffer.S2[2]); // the three vertices are read from PacketBuffer.S2[2..7]
+       y0 = GPU_EXPANDSIGN(PacketBuffer.S2[3]);
+       x1 = GPU_EXPANDSIGN(PacketBuffer.S2[4]);
+       y1 = GPU_EXPANDSIGN(PacketBuffer.S2[5]);
+       x2 = GPU_EXPANDSIGN(PacketBuffer.S2[6]);
+       y2 = GPU_EXPANDSIGN(PacketBuffer.S2[7]);
+
+       GPU_TESTRANGE3(); // may return early (see macro)
+       
+       x0 += DrawingOffset[0];   x1 += DrawingOffset[0];   x2 += DrawingOffset[0];
+       y0 += DrawingOffset[1];   y1 += DrawingOffset[1];   y2 += DrawingOffset[1];
+
+       xmin = DrawingArea[0];  xmax = DrawingArea[2];
+       ymin = DrawingArea[1];  ymax = DrawingArea[3];
+
+       {
+               int rx0 = Max2(xmin,Min3(x0,x1,x2));
+               int ry0 = Max2(ymin,Min3(y0,y1,y2));
+               int rx1 = Min2(xmax,Max3(x0,x1,x2));
+               int ry1 = Min2(ymax,Max3(y0,y1,y2));
+               if( rx0>=rx1 || ry0>=ry1) return; // bounding box clipped to DrawingArea is empty
+       }
+       
+       PixelData = GPU_RGB16(PacketBuffer.U4[0]);
+
+       if (y0 >= y1) // three compare-and-swap steps order the vertices by y, then by x
+       {
+               if( y0!=y1 || x0>x1 )
+               {
+                       GPU_SWAP(x0, x1, temp);
+                       GPU_SWAP(y0, y1, temp);
+               }
+       }
+       if (y1 >= y2)
+       {
+               if( y1!=y2 || x1>x2 )
+               {
+                       GPU_SWAP(x1, x2, temp);
+                       GPU_SWAP(y1, y2, temp);
+               }
+       }
+       if (y0 >= y1)
+       {
+               if( y0!=y1 || x0>x1 )
+               {
+                       GPU_SWAP(x0, x1, temp);
+                       GPU_SWAP(y0, y1, temp);
+               }
+       }
+
+       ya = y2 - y0;
+       yb = y2 - y1;
+       dx =(x2 - x1) * ya - (x2 - x0) * yb; // its sign selects which edge is walked as x3 and which as x4 below
+
+       for (s32 loop0 = 2; loop0; --loop0) // two passes: rows y0..y1 first, then rows y1..y2
+       {
+               if (loop0 == 2)
+               {
+                       ya = y0;
+                       yb = y1;
+                       x3 = i2x(x0);
+                       x4 = y0!=y1 ? x3 : i2x(x1);
+                       if (dx < 0)
+                       {
+                               dx3 = xLoDivx((x2 - x0), (y2 - y0));
+                               dx4 = xLoDivx((x1 - x0), (y1 - y0));
+                       }
+                       else
+                       {
+                               dx3 = xLoDivx((x1 - x0), (y1 - y0));
+                               dx4 = xLoDivx((x2 - x0), (y2 - y0));
+                       }
+               }
+               else
+               {
+                       ya = y1;
+                       yb = y2;
+                       if (dx < 0)
+                       {
+                               x4  = i2x(x1);
+                               x3  = i2x(x0) + (dx3 * (y1 - y0));
+                               dx4 = xLoDivx((x2 - x1), (y2 - y1));
+                       }
+                       else
+                       {
+                               x3  = i2x(x1);
+                               x4  = i2x(x0) + (dx4 * (y1 - y0));
+                               dx3 = xLoDivx((x2 - x1), (y2 - y1));
+                       }
+               }
+
+               temp = ymin - ya;
+               if (temp > 0) // rows above ymin are skipped by advancing both edges
+               {
+                       ya  = ymin;
+                       x3 += dx3*temp;
+                       x4 += dx4*temp;
+               }
+               if (yb > ymax) yb = ymax;
+               if (ya>=yb) continue;
+
+               x3+= fixed_HALF;
+               x4+= fixed_HALF;
+
+               u16* PixelBase  = &((u16*)GPU_FrameBuffer)[FRAME_OFFSET(0, ya)];
+               
+               for(;ya<yb;++ya, PixelBase += FRAME_WIDTH, x3+=dx3, x4+=dx4)
+               {
+                       if (ya&li) continue; // rows masked out by the interlace settings are not drawn
+                       if ((ya&pi)==pif) continue;
+                       xa = x2i(x3);
+                       xb = x2i(x4);
+                       if( (xa>xmax) || (xb<xmin) ) continue;
+                       if(xa < xmin) xa = xmin;
+                       if(xb > xmax) xb = xmax;
+                       xb-=xa;
+                       if(xb>0) gpuPolySpanDriver(PixelBase + xa,xb); // arguments: first pixel of the span, pixel count
+               }
+       }
+}
+
+/*----------------------------------------------------------------------
+FT3 - textured triangle with one constant color (r4/g4/b4, color deltas zeroed); u/v are interpolated per row and per pixel
+----------------------------------------------------------------------*/
+
+void gpuDrawFT3(const PP gpuPolySpanDriver)
+{
+       const int li=linesInterlace;
+       const int pi=(progressInterlace?(linesInterlace+1):0);
+       const int pif=(progressInterlace?(progressInterlace_flag?(linesInterlace+1):0):1);
+       s32 temp;
+       s32 xa, xb, xmin, xmax;
+       s32 ya, yb, ymin, ymax;
+       s32 x0, x1, x2, x3, dx3=0, x4, dx4=0, dx;
+       s32 y0, y1, y2;
+       s32 u0, u1, u2, u3, du3=0;
+       s32 v0, v1, v2, v3, dv3=0;
+
+       x0 = GPU_EXPANDSIGN(PacketBuffer.S2[2] ); // vertices are read from PacketBuffer.S2[2,3], [6,7], [10,11]
+       y0 = GPU_EXPANDSIGN(PacketBuffer.S2[3] );
+       x1 = GPU_EXPANDSIGN(PacketBuffer.S2[6] );
+       y1 = GPU_EXPANDSIGN(PacketBuffer.S2[7] );
+       x2 = GPU_EXPANDSIGN(PacketBuffer.S2[10]);
+       y2 = GPU_EXPANDSIGN(PacketBuffer.S2[11]);
+
+       GPU_TESTRANGE3(); // may return early (see macro)
+       
+       x0 += DrawingOffset[0];   x1 += DrawingOffset[0];   x2 += DrawingOffset[0];
+       y0 += DrawingOffset[1];   y1 += DrawingOffset[1];   y2 += DrawingOffset[1];
+
+       xmin = DrawingArea[0];  xmax = DrawingArea[2];
+       ymin = DrawingArea[1];  ymax = DrawingArea[3];
+
+       {
+               int rx0 = Max2(xmin,Min3(x0,x1,x2));
+               int ry0 = Max2(ymin,Min3(y0,y1,y2));
+               int rx1 = Min2(xmax,Max3(x0,x1,x2));
+               int ry1 = Min2(ymax,Max3(y0,y1,y2));
+               if( rx0>=rx1 || ry0>=ry1) return; // bounding box clipped to DrawingArea is empty
+       }
+       
+       u0 = PacketBuffer.U1[8];  v0 = PacketBuffer.U1[9];
+       u1 = PacketBuffer.U1[16]; v1 = PacketBuffer.U1[17];
+       u2 = PacketBuffer.U1[24]; v2 = PacketBuffer.U1[25];
+
+       r4 = s32(PacketBuffer.U1[0]);
+       g4 = s32(PacketBuffer.U1[1]);
+       b4 = s32(PacketBuffer.U1[2]);
+       dr4 = dg4 = db4 = 0;
+
+       if (y0 >= y1) // three compare-and-swap steps order the vertices (with their u/v) by y, then by x
+       {
+               if( y0!=y1 || x0>x1 )
+               {
+                       GPU_SWAP(x0, x1, temp);
+                       GPU_SWAP(y0, y1, temp);
+                       GPU_SWAP(u0, u1, temp);
+                       GPU_SWAP(v0, v1, temp);
+               }
+       }
+       if (y1 >= y2)
+       {
+               if( y1!=y2 || x1>x2 )
+               {
+                       GPU_SWAP(x1, x2, temp);
+                       GPU_SWAP(y1, y2, temp);
+                       GPU_SWAP(u1, u2, temp);
+                       GPU_SWAP(v1, v2, temp);
+               }
+       }
+       if (y0 >= y1)
+       {
+               if( y0!=y1 || x0>x1 )
+               {
+                       GPU_SWAP(x0, x1, temp);
+                       GPU_SWAP(y0, y1, temp);
+                       GPU_SWAP(u0, u1, temp);
+                       GPU_SWAP(v0, v1, temp);
+               }
+       }
+
+       ya  = y2 - y0;
+       yb  = y2 - y1;
+       dx  = (x2 - x1) * ya - (x2 - x0) * yb; // its sign selects which edge is walked as x3 and which as x4 below
+       du4 = (u2 - u1) * ya - (u2 - u0) * yb;
+       dv4 = (v2 - v1) * ya - (v2 - v0) * yb;
+
+       s32 iF,iS;
+       xInv( dx, iF, iS);
+       du4 = xInvMulx( du4, iF, iS); // du4/dv4 become the per-pixel u/v steps along a row
+       dv4 = xInvMulx( dv4, iF, iS);
+       tInc = ((u32)(du4<<7)&0x7fff0000) | ((u32)(dv4>>9)&0x00007fff);
+       tMsk = (TextureWindow[2]<<23) | (TextureWindow[3]<<7) | 0x00ff00ff;
+
+       for (s32 loop0 = 2; loop0; --loop0) // two passes: rows y0..y1 first, then rows y1..y2
+       {
+               if (loop0 == 2)
+               {
+                       ya = y0;
+                       yb = y1;
+                       u3 = i2x(u0);
+                       v3 = i2x(v0);
+                       x3 = i2x(x0);
+                       x4 = y0!=y1 ? x3 : i2x(x1);
+                       if (dx < 0)
+                       {
+                               xInv( (y2 - y0), iF, iS);
+                               dx3 = xInvMulx( (x2 - x0), iF, iS);
+                               du3 = xInvMulx( (u2 - u0), iF, iS);
+                               dv3 = xInvMulx( (v2 - v0), iF, iS);
+                               dx4 = xLoDivx ( (x1 - x0), (y1 - y0));
+                       }
+                       else
+                       {
+                               xInv( (y1 - y0), iF, iS);
+                               dx3 = xInvMulx( (x1 - x0), iF, iS);
+                               du3 = xInvMulx( (u1 - u0), iF, iS);
+                               dv3 = xInvMulx( (v1 - v0), iF, iS);
+                               dx4 = xLoDivx ( (x2 - x0), (y2 - y0));
+                       }
+               }
+               else
+               {
+                       ya = y1;
+                       yb = y2;
+                       if (dx < 0)
+                       {
+                               temp = y1 - y0;
+                               u3 = i2x(u0) + (du3 * temp);
+                               v3 = i2x(v0) + (dv3 * temp);
+                               x3 = i2x(x0) + (dx3 * temp);
+                               x4 = i2x(x1);
+                               dx4 = xLoDivx((x2 - x1), (y2 - y1));
+                       }
+                       else
+                       {
+                               u3 = i2x(u1);
+                               v3 = i2x(v1);
+                               x3 = i2x(x1);
+                               x4 = i2x(x0) + (dx4 * (y1 - y0));
+                               xInv( (y2 - y1), iF, iS);
+                               dx3 = xInvMulx( (x2 - x1), iF, iS);
+                               du3 = xInvMulx( (u2 - u1), iF, iS);
+                               dv3 = xInvMulx( (v2 - v1), iF, iS);
+                       }
+               }
+
+               temp = ymin - ya;
+               if (temp > 0) // rows above ymin are skipped by advancing the edges and u/v
+               {
+                       ya  = ymin;
+                       x3 += dx3*temp;
+                       x4 += dx4*temp;
+                       u3 += du3*temp;
+                       v3 += dv3*temp;
+               }
+               if (yb > ymax) yb = ymax;
+               if (ya>=yb) continue;
+
+               x3+= fixed_HALF;
+               x4+= fixed_HALF;
+               u3+= fixed_HALF;
+               v3+= fixed_HALF; // fixed: was v4, which is reassigned from v3 on every row, so v never received the half offset
+
+               u16* PixelBase  = &((u16*)GPU_FrameBuffer)[FRAME_OFFSET(0, ya)];
+
+               for(;ya<yb;++ya, PixelBase += FRAME_WIDTH, x3+=dx3, x4+=dx4, u3+=du3, v3+=dv3)
+               {
+                       if (ya&li) continue; // rows masked out by the interlace settings are not drawn
+                       if ((ya&pi)==pif) continue;
+                       xa = x2i(x3);
+                       xb = x2i(x4);
+                       if( (xa>xmax) || (xb<xmin) ) continue;
+
+                       temp = xmin - xa;
+                       if(temp > 0) // span starts left of xmin: advance u/v by the clipped pixels
+                       {
+                               xa  = xmin;
+                               u4 = u3 + du4*temp;
+                               v4 = v3 + dv4*temp;
+                       }
+                       else
+                       {
+                               u4 = u3;
+                               v4 = v3;
+                       }
+                       if(xb > xmax) xb = xmax;
+                       xb-=xa;
+                       if(xb>0) gpuPolySpanDriver(PixelBase + xa,xb); // arguments: first pixel of the span, pixel count; u4/v4 are passed via non-local variables
+               }
+       }
+}
+
+/*----------------------------------------------------------------------
+G3 - untextured triangle whose r/g/b are interpolated between the three vertex colors
+----------------------------------------------------------------------*/
+
+void gpuDrawG3(const PP gpuPolySpanDriver)
+{
+       const int li=linesInterlace;
+       const int pi=(progressInterlace?(linesInterlace+1):0);
+       const int pif=(progressInterlace?(progressInterlace_flag?(linesInterlace+1):0):1);
+       s32 temp;
+       s32 xa, xb, xmin, xmax;
+       s32 ya, yb, ymin, ymax;
+       s32 x0, x1, x2, x3, dx3=0, x4, dx4=0, dx;
+       s32 y0, y1, y2;
+       s32 r0, r1, r2, r3, dr3=0;
+       s32 g0, g1, g2, g3, dg3=0;
+       s32 b0, b1, b2, b3, db3=0;
+
+       x0 = GPU_EXPANDSIGN(PacketBuffer.S2[2] ); // vertices are read from PacketBuffer.S2[2,3], [6,7], [10,11]
+       y0 = GPU_EXPANDSIGN(PacketBuffer.S2[3] );
+       x1 = GPU_EXPANDSIGN(PacketBuffer.S2[6] );
+       y1 = GPU_EXPANDSIGN(PacketBuffer.S2[7] );
+       x2 = GPU_EXPANDSIGN(PacketBuffer.S2[10]);
+       y2 = GPU_EXPANDSIGN(PacketBuffer.S2[11]);
+
+       GPU_TESTRANGE3(); // may return early (see macro)
+       
+       x0 += DrawingOffset[0];   x1 += DrawingOffset[0];   x2 += DrawingOffset[0];
+       y0 += DrawingOffset[1];   y1 += DrawingOffset[1];   y2 += DrawingOffset[1];
+
+       xmin = DrawingArea[0];  xmax = DrawingArea[2];
+       ymin = DrawingArea[1];  ymax = DrawingArea[3];
+
+       {
+               int rx0 = Max2(xmin,Min3(x0,x1,x2));
+               int ry0 = Max2(ymin,Min3(y0,y1,y2));
+               int rx1 = Min2(xmax,Max3(x0,x1,x2));
+               int ry1 = Min2(ymax,Max3(y0,y1,y2));
+               if( rx0>=rx1 || ry0>=ry1) return; // bounding box clipped to DrawingArea is empty
+       }
+       
+       r0 = PacketBuffer.U1[0];        g0 = PacketBuffer.U1[1];        b0 = PacketBuffer.U1[2];
+       r1 = PacketBuffer.U1[8];        g1 = PacketBuffer.U1[9];        b1 = PacketBuffer.U1[10];
+       r2 = PacketBuffer.U1[16];       g2 = PacketBuffer.U1[17];       b2 = PacketBuffer.U1[18];
+
+       if (y0 >= y1) // three compare-and-swap steps order the vertices (with their colors) by y, then by x
+       {
+               if( y0!=y1 || x0>x1 )
+               {
+                       GPU_SWAP(x0, x1, temp);         GPU_SWAP(y0, y1, temp);
+                       GPU_SWAP(r0, r1, temp);         GPU_SWAP(g0, g1, temp);         GPU_SWAP(b0, b1, temp);
+               }
+       }
+       if (y1 >= y2)
+       {
+               if( y1!=y2 || x1>x2 )
+               {
+                       GPU_SWAP(x1, x2, temp);         GPU_SWAP(y1, y2, temp);
+                       GPU_SWAP(r1, r2, temp);         GPU_SWAP(g1, g2, temp);   GPU_SWAP(b1, b2, temp);
+               }
+       }
+       if (y0 >= y1)
+       {
+               if( y0!=y1 || x0>x1 )
+               {
+                       GPU_SWAP(x0, x1, temp);         GPU_SWAP(y0, y1, temp);
+                       GPU_SWAP(r0, r1, temp);   GPU_SWAP(g0, g1, temp);               GPU_SWAP(b0, b1, temp);
+               }
+       }
+
+       ya  = y2 - y0;
+       yb  = y2 - y1;
+       dx  = (x2 - x1) * ya - (x2 - x0) * yb; // its sign selects which edge is walked as x3 and which as x4 below
+       dr4 = (r2 - r1) * ya - (r2 - r0) * yb;
+       dg4 = (g2 - g1) * ya - (g2 - g0) * yb;
+       db4 = (b2 - b1) * ya - (b2 - b0) * yb;
+
+       s32 iF,iS;
+       xInv(            dx, iF, iS);
+       dr4 = xInvMulx( dr4, iF, iS); // dr4/dg4/db4 become the per-pixel color steps along a row
+       dg4 = xInvMulx( dg4, iF, iS);
+       db4 = xInvMulx( db4, iF, iS);
+       u32 dr = (u32)(dr4<< 8)&(0xffffffff<<21);   if(dr4<0) dr+= 1<<21;
+       u32 dg = (u32)(dg4>> 3)&(0xffffffff<<10);   if(dg4<0) dg+= 1<<10;
+       u32 db = (u32)(db4>>14)&(0xffffffff    );   if(db4<0) db+= 1<< 0;
+       lInc = db + dg + dr; // the three color steps combined into one word (r from bit 21, g from bit 10, b from bit 0)
+
+       for (s32 loop0 = 2; loop0; --loop0) // two passes: rows y0..y1 first, then rows y1..y2
+       {
+               if (loop0 == 2)
+               {
+                       ya = y0;
+                       yb = y1;
+                       r3 = i2x(r0);
+                       g3 = i2x(g0);
+                       b3 = i2x(b0);
+                       x3 = i2x(x0);
+                       x4 = y0!=y1 ? x3 : i2x(x1);
+                       if (dx < 0)
+                       {
+                               xInv(           (y2 - y0), iF, iS);
+                               dx3 = xInvMulx( (x2 - x0), iF, iS);
+                               dr3 = xInvMulx( (r2 - r0), iF, iS);
+                               dg3 = xInvMulx( (g2 - g0), iF, iS);
+                               db3 = xInvMulx( (b2 - b0), iF, iS);
+                               dx4 = xLoDivx ( (x1 - x0), (y1 - y0));
+                       }
+                       else
+                       {
+                               xInv(           (y1 - y0), iF, iS);
+                               dx3 = xInvMulx( (x1 - x0), iF, iS);
+                               dr3 = xInvMulx( (r1 - r0), iF, iS);
+                               dg3 = xInvMulx( (g1 - g0), iF, iS);
+                               db3 = xInvMulx( (b1 - b0), iF, iS);
+                               dx4 = xLoDivx ( (x2 - x0), (y2 - y0));
+                       }
+               }
+               else
+               {
+                       ya = y1;
+                       yb = y2;
+                       if (dx < 0)
+                       {
+                               temp = y1 - y0;
+                               r3  = i2x(r0) + (dr3 * temp);
+                               g3  = i2x(g0) + (dg3 * temp);
+                               b3  = i2x(b0) + (db3 * temp);
+                               x3  = i2x(x0) + (dx3 * temp);
+                               x4  = i2x(x1);
+                               dx4 = xLoDivx((x2 - x1), (y2 - y1));
+                       }
+                       else
+                       {
+                               r3 = i2x(r1);
+                               g3 = i2x(g1);
+                               b3 = i2x(b1);
+                               x3 = i2x(x1);
+                               x4 = i2x(x0) + (dx4 * (y1 - y0));
+
+                               xInv(           (y2 - y1), iF, iS);
+                               dx3 = xInvMulx( (x2 - x1), iF, iS);
+                               dr3 = xInvMulx( (r2 - r1), iF, iS);
+                               dg3 = xInvMulx( (g2 - g1), iF, iS);
+                               db3 = xInvMulx( (b2 - b1), iF, iS);
+                       }
+               }
+
+               temp = ymin - ya;
+               if (temp > 0) // rows above ymin are skipped by advancing the edges and colors
+               {
+                       ya  = ymin;
+                       x3 += dx3*temp;   x4 += dx4*temp;
+                       r3 += dr3*temp;   g3 += dg3*temp;   b3 += db3*temp;
+               }
+               if (yb > ymax) yb = ymax;
+               if (ya>=yb) continue;
+
+               x3+= fixed_HALF;  x4+= fixed_HALF;
+               r3+= fixed_HALF;  g3+= fixed_HALF;  b3+= fixed_HALF;
+
+               u16* PixelBase  = &((u16*)GPU_FrameBuffer)[FRAME_OFFSET(0, ya)];
+               
+               for(;ya<yb;++ya, PixelBase += FRAME_WIDTH, x3+=dx3, x4+=dx4, r3+=dr3, g3+=dg3, b3+=db3)
+               {
+                       if (ya&li) continue; // rows masked out by the interlace settings are not drawn
+                       if ((ya&pi)==pif) continue;
+                       xa = x2i(x3);
+                       xb = x2i(x4);
+                       if( (xa>xmax) || (xb<xmin) ) continue;
+
+                       temp = xmin - xa;
+                       if(temp > 0) // span starts left of xmin: advance the colors by the clipped pixels
+                       {
+                               xa  = xmin;
+                               r4 = r3 + dr4*temp;   g4 = g3 + dg4*temp;   b4 = b3 + db4*temp;
+                       }
+                       else
+                       {
+                               r4 = r3;  g4 = g3;  b4 = b3;
+                       }
+                       if(xb > xmax) xb = xmax;
+                       xb-=xa;
+                       if(xb>0) gpuPolySpanDriver(PixelBase + xa,xb); // arguments: first pixel of the span, pixel count; r4/g4/b4 are passed via non-local variables
+               }
+       }
+}
+
+/*----------------------------------------------------------------------
+GT3
+----------------------------------------------------------------------*/
+
+// Draws one gouraud-shaded, textured triangle (GT3) taken from PacketBuffer.
+//
+// The three vertices are offset by DrawingOffset, rejected early when their
+// bounding box does not intersect DrawingArea, and sorted by ascending y
+// (ties broken by x). Per-pixel texture/colour steps are computed once and
+// packed into tInc/lInc; the triangle is then walked as two halves
+// (y0..y1 and y1..y2), calling gpuPolySpanDriver for every visible scanline.
+//
+//   gpuPolySpanDriver - span renderer, invoked as (first pixel, pixel count).
+//                       It reads u4/v4/r4/g4/b4, which this function sets up
+//                       for the left end of each span.
+void gpuDrawGT3(const PP gpuPolySpanDriver)
+{
+       const int li=linesInterlace;
+       const int pi=(progressInterlace?(linesInterlace+1):0);
+       const int pif=(progressInterlace?(progressInterlace_flag?(linesInterlace+1):0):1);
+       s32 temp;
+       s32 xa, xb, xmin, xmax;
+       s32 ya, yb, ymin, ymax;
+       s32 x0, x1, x2, x3, dx3=0, x4, dx4=0, dx;
+       s32 y0, y1, y2;
+       s32 u0, u1, u2, u3, du3=0;
+       s32 v0, v1, v2, v3, dv3=0;
+       s32 r0, r1, r2, r3, dr3=0;
+       s32 g0, g1, g2, g3, dg3=0;
+       s32 b0, b1, b2, b3, db3=0;
+
+       // Vertex positions (sign-expanded) from the command packet.
+       x0 = GPU_EXPANDSIGN(PacketBuffer.S2[2] );
+       y0 = GPU_EXPANDSIGN(PacketBuffer.S2[3] );
+       x1 = GPU_EXPANDSIGN(PacketBuffer.S2[8] );
+       y1 = GPU_EXPANDSIGN(PacketBuffer.S2[9] );
+       x2 = GPU_EXPANDSIGN(PacketBuffer.S2[14]);
+       y2 = GPU_EXPANDSIGN(PacketBuffer.S2[15]);
+
+       GPU_TESTRANGE3();
+       
+       x0 += DrawingOffset[0];   x1 += DrawingOffset[0];   x2 += DrawingOffset[0];
+       y0 += DrawingOffset[1];   y1 += DrawingOffset[1];   y2 += DrawingOffset[1];
+
+       xmin = DrawingArea[0];  xmax = DrawingArea[2];
+       ymin = DrawingArea[1];  ymax = DrawingArea[3];
+
+       // Trivial reject: bounding box clipped to the drawing area is empty.
+       {
+               int rx0 = Max2(xmin,Min3(x0,x1,x2));
+               int ry0 = Max2(ymin,Min3(y0,y1,y2));
+               int rx1 = Min2(xmax,Max3(x0,x1,x2));
+               int ry1 = Min2(ymax,Max3(y0,y1,y2));
+               if( rx0>=rx1 || ry0>=ry1) return;
+       }
+
+       // Per-vertex colour and texture coordinates.
+       r0 = PacketBuffer.U1[0];        g0 = PacketBuffer.U1[1];        b0 = PacketBuffer.U1[2];
+       u0 = PacketBuffer.U1[8];        v0 = PacketBuffer.U1[9];
+       r1 = PacketBuffer.U1[12];       g1 = PacketBuffer.U1[13];       b1 = PacketBuffer.U1[14];
+       u1 = PacketBuffer.U1[20];       v1 = PacketBuffer.U1[21];
+       r2 = PacketBuffer.U1[24];       g2 = PacketBuffer.U1[25];       b2 = PacketBuffer.U1[26];
+       u2 = PacketBuffer.U1[32];       v2 = PacketBuffer.U1[33];
+
+       // Three compare/swap passes sort the vertices by (y, x) ascending,
+       // carrying every attribute along with its vertex.
+       if (y0 >= y1)
+       {
+               if( y0!=y1 || x0>x1 )
+               {
+                       GPU_SWAP(x0, x1, temp);         GPU_SWAP(y0, y1, temp);
+                       GPU_SWAP(u0, u1, temp);         GPU_SWAP(v0, v1, temp);
+                       GPU_SWAP(r0, r1, temp);         GPU_SWAP(g0, g1, temp);   GPU_SWAP(b0, b1, temp);
+               }
+       }
+       if (y1 >= y2)
+       {
+               if( y1!=y2 || x1>x2 )
+               {
+                       GPU_SWAP(x1, x2, temp);         GPU_SWAP(y1, y2, temp);
+                       GPU_SWAP(u1, u2, temp);         GPU_SWAP(v1, v2, temp);
+                       GPU_SWAP(r1, r2, temp);   GPU_SWAP(g1, g2, temp);               GPU_SWAP(b1, b2, temp);
+               }
+       }
+       if (y0 >= y1)
+       {
+               if( y0!=y1 || x0>x1 )
+               {
+                       GPU_SWAP(x0, x1, temp);         GPU_SWAP(y0, y1, temp);
+                       GPU_SWAP(u0, u1, temp);         GPU_SWAP(v0, v1, temp);
+                       GPU_SWAP(r0, r1, temp);         GPU_SWAP(g0, g1, temp);         GPU_SWAP(b0, b1, temp);
+               }
+       }
+
+       // Cross products: dx is the common denominator (its sign also selects
+       // which edge pair is walked below); the *4 values are the numerators of
+       // the per-pixel attribute steps along a scanline.
+       ya  = y2 - y0;
+       yb  = y2 - y1;
+       dx  = (x2 - x1) * ya - (x2 - x0) * yb;
+       du4 = (u2 - u1) * ya - (u2 - u0) * yb;
+       dv4 = (v2 - v1) * ya - (v2 - v0) * yb;
+       dr4 = (r2 - r1) * ya - (r2 - r0) * yb;
+       dg4 = (g2 - g1) * ya - (g2 - g0) * yb;
+       db4 = (b2 - b1) * ya - (b2 - b0) * yb;
+
+       s32 iF,iS;
+
+       xInv(            dx, iF, iS);
+       du4 = xInvMulx( du4, iF, iS);
+       dv4 = xInvMulx( dv4, iF, iS);
+       dr4 = xInvMulx( dr4, iF, iS);
+       dg4 = xInvMulx( dg4, iF, iS);
+       db4 = xInvMulx( db4, iF, iS);
+       // Pack the colour steps into one word (r: bits 21+, g: bits 10+, b: low
+       // bits) and the texture steps into another (u: upper half, v: lower half).
+       u32 dr = (u32)(dr4<< 8)&(0xffffffff<<21);   if(dr4<0) dr+= 1<<21;
+       u32 dg = (u32)(dg4>> 3)&(0xffffffff<<10);   if(dg4<0) dg+= 1<<10;
+       u32 db = (u32)(db4>>14)&(0xffffffff    );   if(db4<0) db+= 1<< 0;
+       lInc = db + dg + dr;
+       tInc = ((u32)(du4<<7)&0x7fff0000) | ((u32)(dv4>>9)&0x00007fff);
+       tMsk = (TextureWindow[2]<<23) | (TextureWindow[3]<<7) | 0x00ff00ff;
+
+       // loop0 == 2: upper half (y0..y1); loop0 == 1: lower half (y1..y2).
+       for (s32 loop0 = 2; loop0; --loop0)
+       {
+               if (loop0 == 2)
+               {
+                       ya = y0;
+                       yb = y1;
+                       u3 = i2x(u0);
+                       v3 = i2x(v0);
+                       r3 = i2x(r0);
+                       g3 = i2x(g0);
+                       b3 = i2x(b0);
+                       x3 = i2x(x0);
+                       x4 = y0!=y1 ? x3 : i2x(x1);
+                       if (dx < 0)
+                       {
+                               xInv(           (y2 - y0), iF, iS);
+                               dx3 = xInvMulx( (x2 - x0), iF, iS);
+                               du3 = xInvMulx( (u2 - u0), iF, iS);
+                               dv3 = xInvMulx( (v2 - v0), iF, iS);
+                               dr3 = xInvMulx( (r2 - r0), iF, iS);
+                               dg3 = xInvMulx( (g2 - g0), iF, iS);
+                               db3 = xInvMulx( (b2 - b0), iF, iS);
+                               dx4 = xLoDivx ( (x1 - x0), (y1 - y0));
+                       }
+                       else
+                       {
+                               xInv(           (y1 - y0), iF, iS);
+                               dx3 = xInvMulx( (x1 - x0), iF, iS);
+                               du3 = xInvMulx( (u1 - u0), iF, iS);
+                               dv3 = xInvMulx( (v1 - v0), iF, iS);
+                               dr3 = xInvMulx( (r1 - r0), iF, iS);
+                               dg3 = xInvMulx( (g1 - g0), iF, iS);
+                               db3 = xInvMulx( (b1 - b0), iF, iS);
+                               dx4 = xLoDivx ( (x2 - x0), (y2 - y0));
+                       }
+               }
+               else
+               {
+                       ya = y1;
+                       yb = y2;
+                       if (dx < 0)
+                       {
+                               // Keep walking the long edge (slopes from the first
+                               // half are reused); restart the other edge at vertex 1.
+                               temp = y1 - y0;
+                               u3  = i2x(u0) + (du3 * temp);
+                               v3  = i2x(v0) + (dv3 * temp);
+                               r3  = i2x(r0) + (dr3 * temp);
+                               g3  = i2x(g0) + (dg3 * temp);
+                               b3  = i2x(b0) + (db3 * temp);
+                               x3  = i2x(x0) + (dx3 * temp);
+                               x4  = i2x(x1);
+                               dx4 = xLoDivx((x2 - x1), (y2 - y1));
+                       }
+                       else
+                       {
+                               u3 = i2x(u1);
+                               v3 = i2x(v1);
+                               r3 = i2x(r1);
+                               g3 = i2x(g1);
+                               b3 = i2x(b1);
+                               x3 = i2x(x1);
+                               x4 = i2x(x0) + (dx4 * (y1 - y0));
+
+                               xInv(           (y2 - y1), iF, iS);
+                               dx3 = xInvMulx( (x2 - x1), iF, iS);
+                               du3 = xInvMulx( (u2 - u1), iF, iS);
+                               dv3 = xInvMulx( (v2 - v1), iF, iS);
+                               dr3 = xInvMulx( (r2 - r1), iF, iS);
+                               dg3 = xInvMulx( (g2 - g1), iF, iS);
+                               db3 = xInvMulx( (b2 - b1), iF, iS);
+                       }
+               }
+
+               // Clip against the top of the drawing area by stepping every edge
+               // accumulator forward by the number of skipped rows.
+               temp = ymin - ya;
+               if (temp > 0)
+               {
+                       ya  = ymin;
+                       x3 += dx3*temp;   x4 += dx4*temp;
+                       u3 += du3*temp;   v3 += dv3*temp;
+                       r3 += dr3*temp;   g3 += dg3*temp;   b3 += db3*temp;
+               }
+               if (yb > ymax) yb = ymax;
+               if (ya>=yb) continue;
+
+               // Bias the edge accumulators by half a unit. The V bias has to go to
+               // v3: v4 is reloaded from v3 on every scanline below, so biasing v4
+               // here would be discarded and V alone would miss the offset.
+               x3+= fixed_HALF;  x4+= fixed_HALF;
+               u3+= fixed_HALF;  v3+= fixed_HALF;
+               r3+= fixed_HALF;  g3+= fixed_HALF;  b3+= fixed_HALF;
+               u16* PixelBase  = &((u16*)GPU_FrameBuffer)[FRAME_OFFSET(0, ya)];
+               
+               for(;ya<yb;++ya, PixelBase += FRAME_WIDTH, x3+=dx3, x4+=dx4, u3+=du3, v3+=dv3, r3+=dr3, g3+=dg3,        b3+=db3)
+               {
+                       // Interlace handling: skip rows masked out by li / pi / pif.
+                       if (ya&li) continue;
+                       if ((ya&pi)==pif) continue;
+                       xa = x2i(x3);
+                       xb = x2i(x4);
+                       if( (xa>xmax) || (xb<xmin))     continue;
+
+                       // Clip the span on the left, advancing the span-start
+                       // attributes by the number of skipped pixels.
+                       temp = xmin - xa;
+                       if(temp > 0)
+                       {
+                               xa  = xmin;
+                               u4 = u3 + du4*temp;   v4 = v3 + dv4*temp;
+                               r4 = r3 + dr4*temp;   g4 = g3 + dg4*temp;   b4 = b3 + db4*temp;
+                       }
+                       else
+                       {
+                               u4 = u3;  v4 = v3;
+                               r4 = r3;  g4 = g3;  b4 = b3;
+                       }
+                       if(xb > xmax) xb = xmax;
+                       xb-=xa;
+                       if(xb>0) gpuPolySpanDriver(PixelBase + xa,xb);
+               }
+       }
+}
+
+
+//////////////////////////////////////////////////////////////////////////
+//senquack - Original Unai poly routines left here for reference:
+// ( from gpu_inner.h ) NOTE: this uses 16.16, not 22.10 fixed point
+//////////////////////////////////////////////////////////////////////////
+// Renders one horizontal polygon span of `count` pixels starting at pDst.
+//
+// Behaviour is selected by the flags TM, G, L, M, B, MB, BI and BM, which are
+// not defined in this excerpt (presumably compile-time values derived from the
+// CF template parameter - confirm against gpu_inner.h). From their use here:
+//   TM  - texture mode; 0 = untextured, 1/2/3 pick one of three texel fetches
+//   G   - gouraud: the packed light colour is stepped by lInc for each pixel
+//   L   - apply lighting
+//   M   - skip destination pixels that already have bit 15 set
+//   B   - blend with the destination using blend mode BM (0..3)
+//   MB  - set bit 15 on every pixel written
+//   BI  - skip pixels selected by blit_mask
+//
+// Span-start inputs come from the file-scope variables u4/v4 (texture
+// coordinates) and r4/g4/b4 (colour), plus lInc/tInc/tMsk, TBA/CBA and
+// PixelData.
+//
+// count must be non-zero: every loop is a do/while that decrements count
+// before testing it.
 template<const int CF>
 INLINE void  gpuPolySpanFn(u16 *pDst, u32 count)
 {
        if (!TM)
        {       
                // NO TEXTURE
                if (!G)
                {
                        // NO GOURAUD
                        u16 data;
+                       // Flat colour is constant across the span, so it is computed once.
                        if (L) { u32 lCol=((u32)(b4<< 2)&(0x03ff)) | ((u32)(g4<<13)&(0x07ff<<10)) | ((u32)(r4<<24)&(0x07ff<<21)); gpuLightingRGB(data,lCol); }
                        else data=PixelData;
                        if ((!M)&&(!B))
                        {
+                               // Plain fill.
                                if (MB) { data = data | 0x8000; }
                                do { *pDst++ = data; } while (--count);
                        }
                        else if ((M)&&(!B))
                        {
+                               // Fill, leaving pixels whose bit 15 is set untouched.
                                if (MB) { data = data | 0x8000; }
                                do { if (!(*pDst&0x8000)) { *pDst = data; } pDst++; } while (--count);
                        }
                        else
                        {
                                u16 uSrc;
                                u16 uDst;
+                               // uMsk is not referenced directly below; presumably it is
+                               // consumed by the gpuBlending00 macro - confirm.
                                u32 uMsk; if (BM==0) uMsk=0x7BDE;
                                u32 bMsk; if (BI) bMsk=blit_mask;
                                do
                                {
                                        // blit-mask
                                        if (BI) { if((bMsk>>((((u32)pDst)>>1)&7))&1) goto endtile; }
                                        //  masking
                                        uDst = *pDst;
                                        if(M) { if (uDst&0x8000) goto endtile;  }
                                        uSrc = data;
                                        //  blend
                                        if (BM==0) gpuBlending00(uSrc, uDst);
                                        if (BM==1) gpuBlending01(uSrc, uDst);
                                        if (BM==2) gpuBlending02(uSrc, uDst);
                                        if (BM==3) gpuBlending03(uSrc, uDst);
                                        if (MB) { *pDst = uSrc | 0x8000; }
                                        else    { *pDst = uSrc; }
                                        endtile: pDst++;
                                }
                                while (--count);
                        }
                }
                else
                {
                        // GOURAUD
                        u16 uDst;
                        u16 uSrc;
                        u32 linc=lInc;
+                       // Packed light colour: b in bits 0-9, g in bits 10-20, r in bits 21-31.
                        u32 lCol=((u32)(b4>>14)&(0x03ff)) | ((u32)(g4>>3)&(0x07ff<<10)) | ((u32)(r4<<8)&(0x07ff<<21));
                        u32 uMsk; if ((B)&&(BM==0)) uMsk=0x7BDE;
                        u32 bMsk; if (BI) bMsk=blit_mask;
                        do
                        {
                                // blit-mask
                                if (BI) { if((bMsk>>((((u32)pDst)>>1)&7))&1) goto endgou; }
                                //  masking
                                if(M) { uDst = *pDst;  if (uDst&0x8000) goto endgou;  }
                                //  blend
                                if(B)
                                {
                                        //  light
                                        gpuLightingRGB(uSrc,lCol);
                                        if(!M)    { uDst = *pDst; }
                                        if (BM==0) gpuBlending00(uSrc, uDst);
                                        if (BM==1) gpuBlending01(uSrc, uDst);
                                        if (BM==2) gpuBlending02(uSrc, uDst);
                                        if (BM==3) gpuBlending03(uSrc, uDst);
                                }
                                else
                                {
                                        //  light
                                        gpuLightingRGB(uSrc,lCol);
                                }
                                if (MB) { *pDst = uSrc | 0x8000; }
                                else    { *pDst = uSrc; }
+                               // The colour is stepped even for skipped pixels so the gradient stays aligned.
                                endgou: pDst++; lCol=(lCol+linc);
                        }
                        while (--count);
                }
        }
        else
        {
                // TEXTURE
                u16 uDst;
                u16 uSrc;
                u32 linc; if (L&&G) linc=lInc;
                u32 tinc=tInc;
                u32 tmsk=tMsk;
+               // Packed texture coordinate: u in the upper halfword, v in the lower,
+               // masked with tmsk after every step.
                u32 tCor = ((u32)( u4<<7)&0x7fff0000) | ((u32)( v4>>9)&0x00007fff); tCor&= tmsk;
                const u16* _TBA=TBA;
                const u16* _CBA; if (TM!=3) _CBA=CBA;
                u32 lCol;
                if(L && !G) { lCol = ((u32)(b4<< 2)&(0x03ff)) | ((u32)(g4<<13)&(0x07ff<<10)) | ((u32)(r4<<24)&(0x07ff<<21)); }
                else if(L && G) { lCol = ((u32)(b4>>14)&(0x03ff)) | ((u32)(g4>>3)&(0x07ff<<10)) | ((u32)(r4<<8)&(0x07ff<<21));  }
                u32 uMsk; if ((B)&&(BM==0)) uMsk=0x7BDE;
                u32 bMsk; if (BI) bMsk=blit_mask;
                do
                {
                        // blit-mask
                        if (BI) { if((bMsk>>((((u32)pDst)>>1)&7))&1) goto endpoly; }
                        //  masking
                        if(M) { uDst = *pDst;  if (uDst&0x8000) goto endpoly;  }
                        //  texture
+                       //  TM==1: 4-bit index (two texels per byte) looked up in _CBA
+                       //  TM==2: 8-bit index looked up in _CBA
+                       //  TM==3: 16-bit texel read directly from _TBA
+                       //  A texel value of 0 is skipped in all three modes.
                        if (TM==1) { u32 tu=(tCor>>23); u32 tv=(tCor<<4)&(0xff<<11); u8 rgb=((u8*)_TBA)[tv+(tu>>1)]; uSrc=_CBA[(rgb>>((tu&1)<<2))&0xf]; if(!uSrc) goto endpoly; }
                        if (TM==2) { uSrc = _CBA[(((u8*)_TBA)[(tCor>>23)+((tCor<<4)&(0xff<<11))])]; if(!uSrc)  goto endpoly; }
                        if (TM==3) { uSrc = _TBA[(tCor>>23)+((tCor<<3)&(0xff<<10))]; if(!uSrc)  goto endpoly; }
                        //  blend
                        if(B)
                        {
+                               // Only texels with bit 15 set are blended with the destination.
                                if (uSrc&0x8000)
                                {
                                        //  light
                                        if(L) gpuLightingTXT(uSrc, lCol);
                                        if(!M)    { uDst = *pDst; }
                                        if (BM==0) gpuBlending00(uSrc, uDst);
                                        if (BM==1) gpuBlending01(uSrc, uDst);
                                        if (BM==2) gpuBlending02(uSrc, uDst);
                                        if (BM==3) gpuBlending03(uSrc, uDst);
                                }
                                else
                                {
                                        // light
                                        if(L) gpuLightingTXT(uSrc, lCol);
                                }
                        }
                        else
                        {
                                //  light
                                if(L)  { gpuLightingTXT(uSrc, lCol); } else if(!MB) { uSrc&= 0x7fff; }
                        }
                        if (MB) { *pDst = uSrc | 0x8000; }
                        else    { *pDst = uSrc; }
+                       // Texture coordinate (and gouraud colour) advance for skipped pixels too.
                        endpoly: pDst++;
                        tCor=(tCor+tinc)&tmsk;
                        if (L&&G) lCol=(lCol+linc);
                }
                while (--count);
        }
 }
diff --git a/plugins/gpu_senquack/debug.h b/plugins/gpu_senquack/debug.h
new file mode 100644 (file)
index 0000000..e69de29
diff --git a/plugins/gpu_senquack/gpu.cpp b/plugins/gpu_senquack/gpu.cpp
new file mode 100644 (file)
index 0000000..5f2929f
--- /dev/null
@@ -0,0 +1,830 @@
+/***************************************************************************
+*   Copyright (C) 2010 PCSX4ALL Team                                      *
+*   Copyright (C) 2010 Unai                                               *
+*   Copyright (C) 2016 Senquack (dansilsby <AT> gmail <DOT> com)          *
+*                                                                         *
+*   This program is free software; you can redistribute it and/or modify  *
+*   it under the terms of the GNU General Public License as published by  *
+*   the Free Software Foundation; either version 2 of the License, or     *
+*   (at your option) any later version.                                   *
+*                                                                         *
+*   This program is distributed in the hope that it will be useful,       *
+*   but WITHOUT ANY WARRANTY; without even the implied warranty of        *
+*   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the         *
+*   GNU General Public License for more details.                          *
+*                                                                         *
+*   You should have received a copy of the GNU General Public License     *
+*   along with this program; if not, write to the                         *
+*   Free Software Foundation, Inc.,                                       *
+*   51 Franklin Street, Fifth Floor, Boston, MA 02111-1307 USA.           *
+***************************************************************************/
+
+#include <stddef.h>
+#include "plugins.h"
+#include "psxcommon.h"
+//#include "port.h"
+#include "gpu_senquack.h"
+
+// NOTE(review): presumably the host output width in pixels - not referenced
+// in the part of the file visible here; confirm against its users.
+#define VIDEO_WIDTH 320
+
+// TPS: 1000 when TIME_IN_MSEC is defined, 1000000 otherwise (presumably
+// "ticks per second" of the host timer - confirm against its users).
+#ifdef TIME_IN_MSEC
+#define TPS 1000
+#else
+#define TPS 1000000
+#endif
+
+// Non-zero when bit 20 (0x08<<17) of the GP1 status word is set.
+#define IS_PAL (gpu_senquack.GPU_GP1&(0x08<<17))
+
+//senquack - Original 512KB of guard space seems not to be enough, as Xenogears
+// accesses outside this range and crashes in town intro fight sequence.
+// Increased to 2MB total (double PSX VRAM) and Xenogears no longer
+// crashes, but some textures are still messed up. Also note that alignment min
+// is 16 bytes, needed for pixel-skipping rendering/blitting in high horiz res.
+// Extra 4KB is for guard room at beginning.
+// TODO: Determine cause of out-of-bounds write/reads. <-- Note: this is largely
+//  solved by adoption of PCSX Rearmed's 'gpulib' in gpulib_if.cpp, which
+//  replaces this file (gpu.cpp)
+//u16   GPU_FrameBuffer[(FRAME_BUFFER_SIZE+512*1024)/2] __attribute__((aligned(32)));
+// The array length is in u16 elements, hence the division of the byte size
+// by 2. gpuReset() points gpu_senquack.vram past the first 4096 bytes.
+static u16 GPU_FrameBuffer[(FRAME_BUFFER_SIZE*2 + 4096)/2] __attribute__((aligned(32)));
+
+///////////////////////////////////////////////////////////////////////////////
+// GPU fixed point math
+#include "gpu_fixedpoint.h"
+
+///////////////////////////////////////////////////////////////////////////////
+// Inner loop driver instantiation file
+#include "gpu_inner.h"
+
+///////////////////////////////////////////////////////////////////////////////
+// GPU internal image drawing functions
+#include "gpu_raster_image.h"
+
+///////////////////////////////////////////////////////////////////////////////
+// GPU internal line drawing functions
+#include "gpu_raster_line.h"
+
+///////////////////////////////////////////////////////////////////////////////
+// GPU internal polygon drawing functions
+#include "gpu_raster_polygon.h"
+
+///////////////////////////////////////////////////////////////////////////////
+// GPU internal sprite drawing functions
+#include "gpu_raster_sprite.h"
+
+///////////////////////////////////////////////////////////////////////////////
+// GPU command buffer execution/store
+#include "gpu_command.h"
+
+///////////////////////////////////////////////////////////////////////////////
+// Restores the whole gpu_senquack state block to its defaults: clears it,
+// re-attaches VRAM, sets the default status word, areas and texture window,
+// copies in the external configuration and rebuilds the lighting/dithering
+// tables.
+static void gpuReset(void)
+{
+       // Everything not assigned explicitly below starts out as zero.
+       memset((void*)&gpu_senquack, 0, sizeof(gpu_senquack));
+
+       // Emulated VRAM begins after the 4kb guard room at the front of the buffer.
+       gpu_senquack.vram = (u16*)GPU_FrameBuffer + (4096/2);
+       gpu_senquack.GPU_GP1 = 0x14802000;
+
+       // Default drawing area.
+       gpu_senquack.DrawingArea[2] = 256;
+       gpu_senquack.DrawingArea[3] = 240;
+
+       // Default display area.
+       gpu_senquack.DisplayArea[2] = 256;
+       gpu_senquack.DisplayArea[3] = 240;
+       gpu_senquack.DisplayArea[5] = 240;
+
+       // Default texture window.
+       gpu_senquack.TextureWindow[0] = 0;
+       gpu_senquack.TextureWindow[1] = 0;
+       gpu_senquack.TextureWindow[2] = 255;
+       gpu_senquack.TextureWindow[3] = 255;
+
+       // The u/v masks must be refreshed whenever the texture window changes
+       // (they are used for polygon drawing): window size in the integer part,
+       // all fractional fixed-point bits set.
+       const u32 frac_bits = FIXED_BITS;
+       const u32 frac_mask = (1 << frac_bits) - 1;
+       gpu_senquack.u_msk = (((u32)gpu_senquack.TextureWindow[2]) << frac_bits) | frac_mask;
+       gpu_senquack.v_msk = (((u32)gpu_senquack.TextureWindow[3]) << frac_bits) | frac_mask;
+
+       // Take over the externally supplied configuration.
+       gpu_senquack.config = gpu_senquack_config_ext;
+       gpu_senquack.ilace_mask = gpu_senquack.config.ilace_force;
+       gpu_senquack.frameskip.skipCount = gpu_senquack.config.frameskip_count;
+
+       SetupLightLUT();
+       SetupDitheringConstants();
+}
+
+///////////////////////////////////////////////////////////////////////////////
+// Plugin entry point: resets the GPU state, optionally fills the reciprocal
+// table, and marks the frame buffer dirty. Always returns 0.
+long GPU_init(void)
+{
+       gpuReset();
+
+#ifdef GPU_UNAI_USE_INT_DIV_MULTINV
+       // s_invTable[n] holds 0x7fffffff / (n + 1) for n in [0, 1<<TABLE_BITS).
+       for (unsigned int n = 0; n < (1<<TABLE_BITS); ++n)
+       {
+               s_invTable[n] = 0x7fffffff/(n + 1);
+       }
+#endif
+
+       gpu_senquack.dma.last_dma = NULL;
+       gpu_senquack.fb_dirty = true;
+       return 0;
+}
+
+///////////////////////////////////////////////////////////////////////////////
+// Plugin shutdown hook. There is nothing to release; always returns 0.
+long GPU_shutdown(void)
+{
+       return 0L;
+}
+
+///////////////////////////////////////////////////////////////////////////////
+// Save-state hook.
+//   bWrite != 0 : store the GPU status, a set of control registers and the
+//                 VRAM contents into *p2.
+//   bWrite == 0 : restore the GPU from *p2.
+// Returns 1 on success, 0 when p2 is NULL or its ulFreezeVersion is not 1.
+long GPU_freeze(u32 bWrite, GPUFreeze_t* p2)
+{
+       if (!p2 || p2->ulFreezeVersion != 1)
+               return 0;
+
+       if (!bWrite)
+       {
+               // Restore: status word and VRAM first, then replay the saved
+               // control writes 5, 7 and 8 and re-derive the texture state.
+               extern void GPU_writeStatus(u32 data);
+               gpu_senquack.GPU_GP1 = p2->ulStatus;
+               memcpy((void*)gpu_senquack.vram, (void*)p2->psxVRam, FRAME_BUFFER_SIZE);
+               GPU_writeStatus((5 << 24) | p2->ulControl[5]);
+               GPU_writeStatus((7 << 24) | p2->ulControl[7]);
+               GPU_writeStatus((8 << 24) | p2->ulControl[8]);
+               gpuSetTexture(gpu_senquack.GPU_GP1);
+               return 1;
+       }
+
+       // Save.
+       p2->ulStatus = gpu_senquack.GPU_GP1;
+       memset(p2->ulControl, 0, sizeof(p2->ulControl));
+
+       // save resolution and registers for P.E.Op.S. compatibility
+       p2->ulControl[3] = (3 << 24) | ((gpu_senquack.GPU_GP1 >> 23) & 1);
+       p2->ulControl[4] = (4 << 24) | ((gpu_senquack.GPU_GP1 >> 29) & 3);
+       p2->ulControl[5] = (5 << 24) | (gpu_senquack.DisplayArea[0] | (gpu_senquack.DisplayArea[1] << 10));
+       p2->ulControl[6] = (6 << 24) | (2560 << 12);
+       p2->ulControl[7] = (7 << 24) | (gpu_senquack.DisplayArea[4] | (gpu_senquack.DisplayArea[5] << 10));
+       p2->ulControl[8] = (8 << 24) | ((gpu_senquack.GPU_GP1 >> 17) & 0x3f) | ((gpu_senquack.GPU_GP1 >> 10) & 0x40);
+
+       memcpy((void*)p2->psxVRam, (void*)gpu_senquack.vram, FRAME_BUFFER_SIZE);
+       return 1;
+}
+
+///////////////////////////////////////////////////////////////////////////////
+//  GPU DMA communication
+
+///////////////////////////////////////////////////////////////////////////////
+// Indexed by the command byte (top 8 bits of the first word of a packet).
+// Each entry is the number of additional 32-bit words that gpuCheckPacket()
+// collects after the command word before the packet is dispatched; 0 means
+// the command word is dispatched on its own.
+u8 PacketSize[256] =
+{
+       0, 0, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, //              0-15
+       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, //              16-31
+       3, 3, 3, 3, 6, 6, 6, 6, 4, 4, 4, 4, 8, 8, 8, 8, //              32-47
+       5, 5, 5, 5, 8, 8, 8, 8, 7, 7, 7, 7, 11, 11, 11, 11,     //      48-63
+       2, 2, 2, 2, 0, 0, 0, 0, 3, 3, 3, 3, 3, 3, 3, 3, //              64-79
+       3, 3, 3, 3, 0, 0, 0, 0, 4, 4, 4, 4, 4, 4, 4, 4, //              80-95
+       2, 2, 2, 2, 3, 3, 3, 3, 1, 1, 1, 1, 2, 2, 2, 2, //              96-111
+       1, 1, 1, 1, 2, 2, 2, 2, 1, 1, 1, 1, 2, 2, 2, 2, //              112-127
+       3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, //              128-143
+       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, //              144-159
+       2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, //              160-175
+       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, //              176-191
+       2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, //              192-207
+       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, //              208-223
+       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, //              224-239
+       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0  //              240-255
+};
+
+///////////////////////////////////////////////////////////////////////////////
+// Dispatches the packet currently held in PacketBuffer; the command byte is
+// the top 8 bits of its first word.
+INLINE void gpuSendPacket()
+{
+       const u32 command = gpu_senquack.PacketBuffer.U4[0] >> 24;
+       gpuSendPacketFunction(command);
+}
+
+///////////////////////////////////////////////////////////////////////////////
+// Accumulates one 32-bit word into the current command packet and dispatches
+// the packet as soon as no further words are outstanding.
+INLINE void gpuCheckPacket(u32 uData)
+{
+       if (!gpu_senquack.PacketCount)
+       {
+               // No packet in progress: uData is a command word. Look up how many
+               // more words belong to it.
+               gpu_senquack.PacketBuffer.U4[0] = uData;
+               gpu_senquack.PacketIndex = 1;
+               gpu_senquack.PacketCount = PacketSize[uData >> 24];
+       }
+       else
+       {
+               // Packet in progress: append the word.
+               gpu_senquack.PacketBuffer.U4[gpu_senquack.PacketIndex] = uData;
+               gpu_senquack.PacketIndex++;
+               gpu_senquack.PacketCount--;
+       }
+
+       if (gpu_senquack.PacketCount == 0)
+               gpuSendPacket();
+}
+
+///////////////////////////////////////////////////////////////////////////////
+// Feeds dmaCount 32-bit words starting at dmaAddress to the GPU.
+//
+// While a VRAM upload is pending (dma.FrameToWrite), each word is stored as
+// two 16-bit pixels, low halfword first; otherwise each word goes to the
+// command packet assembler. Two status bits (0x14000000) are cleared in GP1
+// for the duration of the call and set again on exit, where 0x60000000 is
+// cleared as well.
+void GPU_writeDataMem(u32* dmaAddress, int dmaCount)
+{
+       #ifdef ENABLE_GPU_LOG_SUPPORT
+               fprintf(stdout,"GPU_writeDataMem(%d)\n",dmaCount);
+       #endif
+       // Last valid pixel of emulated VRAM; writes beyond it wrap around.
+       const u16 *VIDEO_END = (u16*)gpu_senquack.vram+(FRAME_BUFFER_SIZE/2)-1;
+       gpu_senquack.GPU_GP1 &= ~0x14000000;
+
+       while (dmaCount)
+       {
+               if (!gpu_senquack.dma.FrameToWrite)
+               {
+                       // Command stream: hand the word to the packet assembler.
+                       const u32 word = *dmaAddress++;
+                       dmaCount--;
+                       gpuCheckPacket(word);
+                       continue;
+               }
+
+               // Pixel upload: consume words until the rectangle is complete or
+               // the input runs out.
+               bool transfer_done = false;
+               while (dmaCount && !transfer_done)
+               {
+                       const u32 pixels = *dmaAddress++;
+                       dmaCount--;
+
+                       // half 0 = low 16 bits, half 1 = high 16 bits. If the low
+                       // half finishes the transfer, the high half is dropped.
+                       for (int half = 0; half < 2; ++half)
+                       {
+                               if ((&gpu_senquack.dma.pvram[gpu_senquack.dma.px])>(VIDEO_END)) gpu_senquack.dma.pvram-=512*1024;
+                               gpu_senquack.dma.pvram[gpu_senquack.dma.px] = pixels >> (16 * half);
+
+                               if (++gpu_senquack.dma.px < gpu_senquack.dma.x_end)
+                                       continue;
+
+                               // End of row: move on to the start of the next one.
+                               gpu_senquack.dma.px = 0;
+                               gpu_senquack.dma.pvram += 1024;
+                               if (++gpu_senquack.dma.py >= gpu_senquack.dma.y_end)
+                               {
+                                       // Last row written: the upload is finished.
+                                       gpu_senquack.dma.FrameToWrite = false;
+                                       gpu_senquack.GPU_GP1 &= ~0x08000000;
+                                       gpu_senquack.fb_dirty = true;
+                                       transfer_done = true;
+                                       break;
+                               }
+                       }
+               }
+       }
+
+       gpu_senquack.GPU_GP1 = (gpu_senquack.GPU_GP1 | 0x14000000) & ~0x60000000;
+}
+
+// Walks a linked list of GPU packets located in rambase, starting at
+// start_addr, and feeds each node's payload to GPU_writeDataMem().
+//
+// Each node header holds the payload length in its top 8 bits and the address
+// of the next node in its low 24 bits. Bit 23 of every visited header is set
+// temporarily as a "seen" marker so that a list which loops back on itself
+// terminates; the markers are removed again before returning.
+//
+// Returns the number of words processed (headers plus payloads).
+long GPU_dmaChain(u32 *rambase, u32 start_addr)
+{
+       #ifdef ENABLE_GPU_LOG_SUPPORT
+               fprintf(stdout,"GPU_dmaChain(0x%x)\n",start_addr);
+       #endif
+
+       u32 *node;
+       u32 next = start_addr & 0xffffff;
+       u32 visited = 0;
+       long total_words = 0;
+
+       // Keep the head of the previous chain marked while this one is walked.
+       if (gpu_senquack.dma.last_dma) *gpu_senquack.dma.last_dma |= 0x800000;
+
+       gpu_senquack.GPU_GP1 &= ~0x14000000;
+
+       while (next != 0xffffff)
+       {
+               node = rambase + (next & 0x1fffff) / 4;
+
+               const u32 header = node[0];
+               const u32 payload_words = header >> 24;
+               next = header & 0xffffff;
+
+               total_words += 1 + payload_words;
+
+               // add loop detection marker
+               node[0] |= 0x800000;
+
+               if (payload_words) GPU_writeDataMem(node + 1, payload_words);
+
+               // Bit 23 in the link means either the terminator or a node that was
+               // already marked, i.e. the chain loops.
+               if (next & 0x800000)
+               {
+                       #ifdef ENABLE_GPU_LOG_SUPPORT
+                               fprintf(stdout,"GPU_dmaChain(LOOP)\n");
+                       #endif
+                       break;
+               }
+
+               visited++;
+       }
+
+       // remove loop detection markers
+       for (next = start_addr & 0x1fffff; visited > 0; visited--)
+       {
+               node = rambase + next / 4;
+               next = node[0] & 0x1fffff;
+               node[0] &= ~0x800000;
+       }
+
+       if (gpu_senquack.dma.last_dma) *gpu_senquack.dma.last_dma &= ~0x800000;
+       gpu_senquack.dma.last_dma = rambase + (start_addr & 0x1fffff) / 4;
+
+       gpu_senquack.GPU_GP1 = (gpu_senquack.GPU_GP1 | 0x14000000) & ~0x60000000;
+
+       return total_words;
+}
+
+///////////////////////////////////////////////////////////////////////////////
+void GPU_writeData(u32 data)
+{
+       const u16 *VIDEO_END = (u16*)gpu_senquack.vram+(FRAME_BUFFER_SIZE/2)-1;
+       #ifdef ENABLE_GPU_LOG_SUPPORT
+               fprintf(stdout,"GPU_writeData()\n");
+       #endif
+       gpu_senquack.GPU_GP1 &= ~0x14000000;
+
+       if (gpu_senquack.dma.FrameToWrite)
+       {
+               if ((&gpu_senquack.dma.pvram[gpu_senquack.dma.px])>(VIDEO_END)) gpu_senquack.dma.pvram-=512*1024;
+               gpu_senquack.dma.pvram[gpu_senquack.dma.px]=(u16)data;
+               if (++gpu_senquack.dma.px >= gpu_senquack.dma.x_end)
+               {
+                       gpu_senquack.dma.px = 0;
+                       gpu_senquack.dma.pvram += 1024;
+                       if (++gpu_senquack.dma.py >= gpu_senquack.dma.y_end)
+                       {
+                               gpu_senquack.dma.FrameToWrite = false;
+                               gpu_senquack.GPU_GP1 &= ~0x08000000;
+                               gpu_senquack.fb_dirty = true;
+                       }
+               }
+               if (gpu_senquack.dma.FrameToWrite)
+               {
+                       if ((&gpu_senquack.dma.pvram[gpu_senquack.dma.px])>(VIDEO_END)) gpu_senquack.dma.pvram-=512*1024;
+                       gpu_senquack.dma.pvram[gpu_senquack.dma.px]=data>>16;
+                       if (++gpu_senquack.dma.px >= gpu_senquack.dma.x_end)
+                       {
+                               gpu_senquack.dma.px = 0;
+                               gpu_senquack.dma.pvram += 1024;
+                               if (++gpu_senquack.dma.py >= gpu_senquack.dma.y_end)
+                               {
+                                       gpu_senquack.dma.FrameToWrite = false;
+                                       gpu_senquack.GPU_GP1 &= ~0x08000000;
+                                       gpu_senquack.fb_dirty = true;
+                               }
+                       }
+               }
+       }
+       else
+       {
+               gpuCheckPacket(data);
+       }
+       gpu_senquack.GPU_GP1 |= 0x14000000;
+}
+
+
+///////////////////////////////////////////////////////////////////////////////
+void GPU_readDataMem(u32* dmaAddress, int dmaCount)
+{
+       const u16 *VIDEO_END = (u16*)gpu_senquack.vram+(FRAME_BUFFER_SIZE/2)-1;
+       #ifdef ENABLE_GPU_LOG_SUPPORT
+               fprintf(stdout,"GPU_readDataMem(%d)\n",dmaCount);
+       #endif
+       if(!gpu_senquack.dma.FrameToRead) return;
+
+       gpu_senquack.GPU_GP1 &= ~0x14000000;
+       do 
+       {
+               if ((&gpu_senquack.dma.pvram[gpu_senquack.dma.px])>(VIDEO_END)) gpu_senquack.dma.pvram-=512*1024;
+               // lower 16 bit
+               //senquack - 64-bit fix (from notaz)
+               //u32 data = (unsigned long)gpu_senquack.dma.pvram[gpu_senquack.dma.px];
+               u32 data = (u32)gpu_senquack.dma.pvram[gpu_senquack.dma.px];
+
+               if (++gpu_senquack.dma.px >= gpu_senquack.dma.x_end)
+               {
+                       gpu_senquack.dma.px = 0;
+                       gpu_senquack.dma.pvram += 1024;
+               }
+
+               if ((&gpu_senquack.dma.pvram[gpu_senquack.dma.px])>(VIDEO_END)) gpu_senquack.dma.pvram-=512*1024;
+               // higher 16 bit (always, even if it's an odd width)
+               //senquack - 64-bit fix (from notaz)
+               //data |= (unsigned long)(gpu_senquack.dma.pvram[gpu_senquack.dma.px])<<16;
+               data |= (u32)(gpu_senquack.dma.pvram[gpu_senquack.dma.px])<<16;
+               
+               *dmaAddress++ = data;
+
+               if (++gpu_senquack.dma.px >= gpu_senquack.dma.x_end)
+               {
+                       gpu_senquack.dma.px = 0;
+                       gpu_senquack.dma.pvram += 1024;
+                       if (++gpu_senquack.dma.py >= gpu_senquack.dma.y_end)
+                       {
+                               gpu_senquack.dma.FrameToRead = false;
+                               gpu_senquack.GPU_GP1 &= ~0x08000000;
+                               break;
+                       }
+               }
+       } while (--dmaCount);
+
+       gpu_senquack.GPU_GP1 = (gpu_senquack.GPU_GP1 | 0x14000000) & ~0x60000000;
+}
+
+
+
+///////////////////////////////////////////////////////////////////////////////
+u32 GPU_readData(void)
+{
+       const u16 *VIDEO_END = (u16*)gpu_senquack.vram+(FRAME_BUFFER_SIZE/2)-1;
+       #ifdef ENABLE_GPU_LOG_SUPPORT
+               fprintf(stdout,"GPU_readData()\n");
+       #endif
+       gpu_senquack.GPU_GP1 &= ~0x14000000;
+       if (gpu_senquack.dma.FrameToRead)
+       {
+               if ((&gpu_senquack.dma.pvram[gpu_senquack.dma.px])>(VIDEO_END)) gpu_senquack.dma.pvram-=512*1024;
+               gpu_senquack.GPU_GP0 = gpu_senquack.dma.pvram[gpu_senquack.dma.px];
+               if (++gpu_senquack.dma.px >= gpu_senquack.dma.x_end)
+               {
+                       gpu_senquack.dma.px = 0;
+                       gpu_senquack.dma.pvram += 1024;
+                       if (++gpu_senquack.dma.py >= gpu_senquack.dma.y_end)
+                       {
+                               gpu_senquack.dma.FrameToRead = false;
+                               gpu_senquack.GPU_GP1 &= ~0x08000000;
+                       }
+               }
+               if ((&gpu_senquack.dma.pvram[gpu_senquack.dma.px])>(VIDEO_END)) gpu_senquack.dma.pvram-=512*1024;
+               gpu_senquack.GPU_GP0 |= gpu_senquack.dma.pvram[gpu_senquack.dma.px]<<16;
+               if (++gpu_senquack.dma.px >= gpu_senquack.dma.x_end)
+               {
+                       gpu_senquack.dma.px = 0;
+                       gpu_senquack.dma.pvram += 1024;
+                       if (++gpu_senquack.dma.py >= gpu_senquack.dma.y_end)
+                       {
+                               gpu_senquack.dma.FrameToRead = false;
+                               gpu_senquack.GPU_GP1 &= ~0x08000000;
+                       }
+               }
+
+       }
+       gpu_senquack.GPU_GP1 |= 0x14000000;
+
+       return (gpu_senquack.GPU_GP0);
+}
+
+///////////////////////////////////////////////////////////////////////////////
+// Returns the current value of the GPU status word (GPU_GP1).
+u32 GPU_readStatus(void)
+{
+       const u32 status = gpu_senquack.GPU_GP1;
+       return status;
+}
+
+INLINE void GPU_NoSkip(void)
+{
+       #ifdef ENABLE_GPU_LOG_SUPPORT
+               fprintf(stdout,"GPU_NoSkip()\n");
+       #endif
+       gpu_senquack.frameskip.wasSkip = gpu_senquack.frameskip.isSkip;
+       if (gpu_senquack.frameskip.isSkip)
+       {
+               gpu_senquack.frameskip.isSkip = false;
+               gpu_senquack.frameskip.skipGPU = false;
+       }
+       else
+       {
+               gpu_senquack.frameskip.isSkip = gpu_senquack.frameskip.skipFrame;
+               gpu_senquack.frameskip.skipGPU = gpu_senquack.frameskip.skipFrame;
+       }
+}
+
+///////////////////////////////////////////////////////////////////////////////
+void  GPU_writeStatus(u32 data)
+{
+       #ifdef ENABLE_GPU_LOG_SUPPORT
+               fprintf(stdout,"GPU_writeStatus(%d,%d)\n",data>>24,data & 0xff);
+       #endif
+       switch (data >> 24) {
+       case 0x00:
+               gpuReset();
+               break;
+       case 0x01:
+               gpu_senquack.GPU_GP1 &= ~0x08000000;
+               gpu_senquack.PacketCount = 0;
+               gpu_senquack.dma.FrameToRead = gpu_senquack.dma.FrameToWrite = false;
+               break;
+       case 0x02:
+               gpu_senquack.GPU_GP1 &= ~0x08000000;
+               gpu_senquack.PacketCount = 0;
+               gpu_senquack.dma.FrameToRead = gpu_senquack.dma.FrameToWrite = false;
+               break;
+       case 0x03:
+               gpu_senquack.GPU_GP1 = (gpu_senquack.GPU_GP1 & ~0x00800000) | ((data & 1) << 23);
+               break;
+       case 0x04:
+               if (data == 0x04000000) gpu_senquack.PacketCount = 0;
+               gpu_senquack.GPU_GP1 = (gpu_senquack.GPU_GP1 & ~0x60000000) | ((data & 3) << 29);
+               break;
+       case 0x05:
+               // Start of Display Area in VRAM
+               gpu_senquack.DisplayArea[0] = data & 0x3ff;         // X (0..1023)
+               gpu_senquack.DisplayArea[1] = (data >> 10) & 0x1ff; // Y (0..511)
+               GPU_NoSkip();
+               break;
+       case 0x06:
+               // GP1(06h) - Horizontal Display range (on Screen)
+               // 0-11   X1 (260h+0)       ;12bit       ;\counted in 53.222400MHz units,
+               // 12-23  X2 (260h+320*8)   ;12bit       ;/relative to HSYNC
+
+               // senquack - gpu_senquack completely ignores GP1(0x06) command and
+               // lacks even a place in DisplayArea[] array to store the values.
+               // It seems to have been concerned only with vertical display range
+               // and centering top/bottom. I will not add support here, and
+               // focus instead on the gpulib version (gpulib_if.cpp) which uses
+               // gpulib for its PS1->host framebuffer blitting.
+               break;
+       case 0x07:
+               // GP1(07h) - Vertical Display range (on Screen)
+               // 0-9   Y1 (NTSC=88h-(224/2), (PAL=A3h-(264/2))  ;\scanline numbers on screen,
+               // 10-19 Y2 (NTSC=88h+(224/2), (PAL=A3h+(264/2))  ;/relative to VSYNC
+               // 20-23 Not used (zero)
+               {
+                       u32 v1=data & 0x000003FF; //(short)(data & 0x3ff);
+                       u32 v2=(data & 0x000FFC00) >> 10; //(short)((data>>10) & 0x3ff);
+                       if ((gpu_senquack.DisplayArea[4]!=v1)||(gpu_senquack.DisplayArea[5]!=v2))
+                       {
+                               gpu_senquack.DisplayArea[4] = v1;
+                               gpu_senquack.DisplayArea[5] = v2;
+                               #ifdef ENABLE_GPU_LOG_SUPPORT
+                                       fprintf(stdout,"video_clear(CHANGE_Y)\n");
+                               #endif
+                               video_clear();
+                       }
+               }
+               break;
+       case 0x08:
+               {
+                       static const u32 HorizontalResolution[8] = { 256, 368, 320, 384, 512, 512, 640, 640 };
+                       static const u32 VerticalResolution[4] = { 240, 480, 256, 480 };
+                       gpu_senquack.GPU_GP1 = (gpu_senquack.GPU_GP1 & ~0x007F0000) | ((data & 0x3F) << 17) | ((data & 0x40) << 10);
+                       #ifdef ENABLE_GPU_LOG_SUPPORT
+                               fprintf(stdout,"GPU_writeStatus(RES=%dx%d,BITS=%d,PAL=%d)\n",HorizontalResolution[(gpu_senquack.GPU_GP1 >> 16) & 7],
+                                               VerticalResolution[(gpu_senquack.GPU_GP1 >> 19) & 3],(gpu_senquack.GPU_GP1&0x00200000?24:15),(IS_PAL?1:0));
+                       #endif
+                       // Video mode change
+                       u32 new_width = HorizontalResolution[(gpu_senquack.GPU_GP1 >> 16) & 7];
+                       u32 new_height = VerticalResolution[(gpu_senquack.GPU_GP1 >> 19) & 3];
+
+                       if (gpu_senquack.DisplayArea[2] != new_width || gpu_senquack.DisplayArea[3] != new_height)
+                       {
+                               // Update width
+                               gpu_senquack.DisplayArea[2] = new_width;
+
+                               if (PixelSkipEnabled()) {
+                                       // Set blit_mask for high horizontal resolutions. This allows skipping
+                                       //  rendering pixels that would never get displayed on low-resolution
+                                       //  platforms that use simple pixel-dropping scaler.
+                                       switch (gpu_senquack.DisplayArea[2])
+                                       {
+                                               case 512: gpu_senquack.blit_mask = 0xa4; break; // GPU_BlitWWSWWSWS
+                                               case 640: gpu_senquack.blit_mask = 0xaa; break; // GPU_BlitWS
+                                               default:  gpu_senquack.blit_mask = 0;    break;
+                                       }
+                               } else {
+                                       gpu_senquack.blit_mask = 0;
+                               }
+
+                               // Update height
+                               gpu_senquack.DisplayArea[3] = new_height;
+
+                               if (LineSkipEnabled()) {
+                                       // Set rendering line-skip (only render every other line in high-res
+                                       //  480 vertical mode, or, optionally, force it for all video modes)
+
+                                       if (gpu_senquack.DisplayArea[3] == 480) {
+                                               if (gpu_senquack.config.ilace_force) {
+                                                       gpu_senquack.ilace_mask = 3; // Only need 1/4 of lines
+                                               } else {
+                                                       gpu_senquack.ilace_mask = 1; // Only need 1/2 of lines
+                                               }
+                                       } else {
+                                               // Vert resolution changed from 480 to lower one
+                                               gpu_senquack.ilace_mask = gpu_senquack.config.ilace_force;
+                                       }
+                               } else {
+                                       gpu_senquack.ilace_mask = 0;
+                               }
+
+                               #ifdef ENABLE_GPU_LOG_SUPPORT
+                                       fprintf(stdout,"video_clear(CHANGE_RES)\n");
+                               #endif
+                               video_clear();
+                       }
+
+               }
+               break;
+       case 0x10:
+               switch (data & 0xff) {
+                       case 2: gpu_senquack.GPU_GP0 = gpu_senquack.tex_window; break;
+                       case 3: gpu_senquack.GPU_GP0 = (gpu_senquack.DrawingArea[1] << 10) | gpu_senquack.DrawingArea[0]; break;
+                       case 4: gpu_senquack.GPU_GP0 = ((gpu_senquack.DrawingArea[3]-1) << 10) | (gpu_senquack.DrawingArea[2]-1); break;
+                       case 5: case 6: gpu_senquack.GPU_GP0 = (((u32)gpu_senquack.DrawingOffset[1] & 0x7ff) << 11) | ((u32)gpu_senquack.DrawingOffset[0] & 0x7ff); break;
+                       case 7: gpu_senquack.GPU_GP0 = 2; break;
+                       case 8: case 15: gpu_senquack.GPU_GP0 = 0xBFC03720; break;
+               }
+               break;
+       }
+}
+
+// Blitting functions
+#include "gpu_blit.h"
+
+static void gpuVideoOutput(void)
+{
+       int h0, x0, y0, w0, h1;
+
+       x0 = gpu_senquack.DisplayArea[0];
+       y0 = gpu_senquack.DisplayArea[1];
+
+       w0 = gpu_senquack.DisplayArea[2];
+       h0 = gpu_senquack.DisplayArea[3];  // video mode
+
+       h1 = gpu_senquack.DisplayArea[5] - gpu_senquack.DisplayArea[4]; // display needed
+       if (h0 == 480) h1 = Min2(h1*2,480);
+
+       bool isRGB24 = (gpu_senquack.GPU_GP1 & 0x00200000 ? true : false);
+       u16* dst16 = SCREEN;
+       u16* src16 = (u16*)gpu_senquack.vram;
+
+       // PS1 fb read wraps around (fixes black screen in 'Tobal no. 1')
+       unsigned int src16_offs_msk = 1024*512-1;
+       unsigned int src16_offs = (x0 + y0*1024) & src16_offs_msk;
+
+       //  Height centering
+       int sizeShift = 1;
+       if (h0 == 256) {
+               h0 = 240;
+       } else if (h0 == 480) {
+               sizeShift = 2;
+       }
+       if (h1 > h0) {
+               src16_offs = (src16_offs + (((h1-h0) / 2) * 1024)) & src16_offs_msk;
+               h1 = h0;
+       } else if (h1<h0) {
+               dst16 += ((h0-h1) >> sizeShift) * VIDEO_WIDTH;
+       }
+
+
+       /* Main blitter */
+       int incY = (h0==480) ? 2 : 1;
+       h0=(h0==480 ? 2048 : 1024);
+
+       {
+               const int li=gpu_senquack.ilace_mask;
+               bool pi = ProgressiveInterlaceEnabled();
+               bool pif = gpu_senquack.prog_ilace_flag;
+               switch ( w0 )
+               {
+                       case 256:
+                               for(int y1=y0+h1; y0<y1; y0+=incY)
+                               {
+                                       if (( 0 == (y0&li) ) && ((!pi) || (pif=!pif)))
+                                               GPU_BlitWWDWW(src16 + src16_offs, dst16, isRGB24);
+                                       dst16 += VIDEO_WIDTH;
+                                       src16_offs = (src16_offs + h0) & src16_offs_msk;
+                               }
+                               break;
+                       case 368:
+                               for(int y1=y0+h1; y0<y1; y0+=incY)
+                               {
+                                       if (( 0 == (y0&li) ) && ((!pi) || (pif=!pif)))
+                                               GPU_BlitWWWWWWWWS(src16 + src16_offs, dst16, isRGB24, 4);
+                                       dst16 += VIDEO_WIDTH;
+                                       src16_offs = (src16_offs + h0) & src16_offs_msk;
+                               }
+                               break;
+                       case 320:
+                               // Ensure 32-bit alignment for GPU_BlitWW() blitter:
+                               src16_offs &= ~1;
+                               for(int y1=y0+h1; y0<y1; y0+=incY)
+                               {
+                                       if (( 0 == (y0&li) ) && ((!pi) || (pif=!pif)))
+                                               GPU_BlitWW(src16 + src16_offs, dst16, isRGB24);
+                                       dst16 += VIDEO_WIDTH;
+                                       src16_offs = (src16_offs + h0) & src16_offs_msk;
+                               }
+                               break;
+                       case 384:
+                               for(int y1=y0+h1; y0<y1; y0+=incY)
+                               {
+                                       if (( 0 == (y0&li) ) && ((!pi) || (pif=!pif)))
+                                               GPU_BlitWWWWWS(src16 + src16_offs, dst16, isRGB24);
+                                       dst16 += VIDEO_WIDTH;
+                                       src16_offs = (src16_offs + h0) & src16_offs_msk;
+                               }
+                               break;
+                       case 512:
+                               for(int y1=y0+h1; y0<y1; y0+=incY)
+                               {
+                                       if (( 0 == (y0&li) ) && ((!pi) || (pif=!pif)))
+                                               GPU_BlitWWSWWSWS(src16 + src16_offs, dst16, isRGB24);
+                                       dst16 += VIDEO_WIDTH;
+                                       src16_offs = (src16_offs + h0) & src16_offs_msk;
+                               }
+                               break;
+                       case 640:
+                               for(int y1=y0+h1; y0<y1; y0+=incY)
+                               {
+                                       if (( 0 == (y0&li) ) && ((!pi) || (pif=!pif)))
+                                               GPU_BlitWS(src16 + src16_offs, dst16, isRGB24);
+                                       dst16 += VIDEO_WIDTH;
+                                       src16_offs = (src16_offs + h0) & src16_offs_msk;
+                               }
+                               break;
+               }
+               gpu_senquack.prog_ilace_flag = !gpu_senquack.prog_ilace_flag;
+       }
+       video_flip();
+}
+
+// Update frames-skip each second>>3 (8 times per second)
+#define GPU_FRAMESKIP_UPDATE 3
+
+static void GPU_frameskip (bool show)
+{
+       u32 now=get_ticks(); // current frame
+
+       // Update frameskip
+       if (gpu_senquack.frameskip.skipCount==0) gpu_senquack.frameskip.skipFrame=false; // frameskip off
+       else if (gpu_senquack.frameskip.skipCount==7) { if (show) gpu_senquack.frameskip.skipFrame=!gpu_senquack.frameskip.skipFrame; } // frameskip medium
+       else if (gpu_senquack.frameskip.skipCount==8) gpu_senquack.frameskip.skipFrame=true; // frameskip maximum
+       else
+       {
+               static u32 spd=100; // speed %
+               static u32 frames=0; // frames counter
+               static u32 prev=now; // previous fps calculation
+               frames++;
+               if ((now-prev)>=(TPS>>GPU_FRAMESKIP_UPDATE))
+               {
+                       if (IS_PAL) spd=(frames<<1);
+                       else spd=((frames*1001)/600);
+                       spd<<=GPU_FRAMESKIP_UPDATE;
+                       frames=0;
+                       prev=now;
+               }
+               switch(gpu_senquack.frameskip.skipCount)
+               {
+                       case 1: if (spd<50) gpu_senquack.frameskip.skipFrame=true; else gpu_senquack.frameskip.skipFrame=false; break; // frameskip on (spd<50%)
+                       case 2: if (spd<60) gpu_senquack.frameskip.skipFrame=true; else gpu_senquack.frameskip.skipFrame=false; break; // frameskip on (spd<60%)
+                       case 3: if (spd<70) gpu_senquack.frameskip.skipFrame=true; else gpu_senquack.frameskip.skipFrame=false; break; // frameskip on (spd<70%)
+                       case 4: if (spd<80) gpu_senquack.frameskip.skipFrame=true; else gpu_senquack.frameskip.skipFrame=false; break; // frameskip on (spd<80%)
+                       case 5: if (spd<90) gpu_senquack.frameskip.skipFrame=true; else gpu_senquack.frameskip.skipFrame=false; break; // frameskip on (spd<90%)
+               }
+       }
+}
+
+///////////////////////////////////////////////////////////////////////////////
+void GPU_updateLace(void)
+{
+       // Interlace bit toggle
+       gpu_senquack.GPU_GP1 ^= 0x80000000;
+
+       // Update display?
+       if ((gpu_senquack.fb_dirty) && (!gpu_senquack.frameskip.wasSkip) && (!(gpu_senquack.GPU_GP1&0x00800000)))
+       {
+               // Display updated
+               gpuVideoOutput();
+               GPU_frameskip(true);
+               #ifdef ENABLE_GPU_LOG_SUPPORT
+                       fprintf(stdout,"GPU_updateLace(UPDATE)\n");
+               #endif
+       } else {
+               GPU_frameskip(false);
+               #ifdef ENABLE_GPU_LOG_SUPPORT
+                       fprintf(stdout,"GPU_updateLace(SKIP)\n");
+               #endif
+       }
+
+       if ((!gpu_senquack.frameskip.skipCount) && (gpu_senquack.DisplayArea[3] == 480)) gpu_senquack.frameskip.skipGPU=true; // Tekken 3 hack
+
+       gpu_senquack.fb_dirty=false;
+       gpu_senquack.dma.last_dma = NULL;
+}
+
+// Allows frontend to signal plugin to redraw screen after returning to emu
+// Marks the framebuffer as dirty, so the next GPU_updateLace() treats it as
+//  having new content to show.
+void GPU_requestScreenRedraw(void)
+{
+       gpu_senquack.fb_dirty = true;
+}
+
+void GPU_getScreenInfo(GPUScreenInfo_t *sinfo)
+{
+       bool depth24 = (gpu_senquack.GPU_GP1 & 0x00200000 ? true : false);
+       int16_t hres = (uint16_t)gpu_senquack.DisplayArea[2];
+       int16_t vres = (uint16_t)gpu_senquack.DisplayArea[3];
+       int16_t w = hres; // Original gpu_senquack doesn't support width < 100%
+       int16_t h = gpu_senquack.DisplayArea[5] - gpu_senquack.DisplayArea[4];
+       if (vres == 480)
+               h *= 2;
+       if (h <= 0 || h > vres)
+               h = vres;
+
+       sinfo->vram    = (uint8_t*)gpu_senquack.vram;
+       sinfo->x       = (uint16_t)gpu_senquack.DisplayArea[0];
+       sinfo->y       = (uint16_t)gpu_senquack.DisplayArea[1];
+       sinfo->w       = w;
+       sinfo->h       = h;
+       sinfo->hres    = hres;
+       sinfo->vres    = vres;
+       sinfo->depth24 = depth24;
+       sinfo->pal     = IS_PAL;
+}
diff --git a/plugins/gpu_senquack/gpu.h b/plugins/gpu_senquack/gpu.h
new file mode 100644 (file)
index 0000000..7a46751
--- /dev/null
@@ -0,0 +1,74 @@
+/***************************************************************************
+*   Copyright (C) 2010 PCSX4ALL Team                                      *
+*   Copyright (C) 2010 Unai                                               *
+*   Copyright (C) 2016 Senquack (dansilsby <AT> gmail <DOT> com)          *
+*                                                                         *
+*   This program is free software; you can redistribute it and/or modify  *
+*   it under the terms of the GNU General Public License as published by  *
+*   the Free Software Foundation; either version 2 of the License, or     *
+*   (at your option) any later version.                                   *
+*                                                                         *
+*   This program is distributed in the hope that it will be useful,       *
+*   but WITHOUT ANY WARRANTY; without even the implied warranty of        *
+*   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the         *
+*   GNU General Public License for more details.                          *
+*                                                                         *
+*   You should have received a copy of the GNU General Public License     *
+*   along with this program; if not, write to the                         *
+*   Free Software Foundation, Inc.,                                       *
+*   51 Franklin Street, Fifth Floor, Boston, MA 02111-1307 USA.           *
+***************************************************************************/
+
+#ifndef GPU_UNAI_GPU_H
+#define GPU_UNAI_GPU_H
+
+// User-configurable options of the gpu_senquack renderer, packed as
+//  bit-fields.
+struct gpu_senquack_config_t {
+       uint8_t pixel_skip:1;     // If 1, allows skipping rendering pixels that
+                                 //  would not be visible when a high horizontal
+                                 //  resolution PS1 video mode is set.
+                                 //  Only applies to devices with low resolutions
+                                 //  like 320x240. Should not be used if a
+                                 //  down-scaling framebuffer blitter is in use.
+                                 //  Can cause gfx artifacts if game reads VRAM
+                                 //  to do framebuffer effects.
+
+       uint8_t ilace_force:3;    // Option to force skipping rendering of lines,
+                                 //  for very slow platforms. Value will be
+                                 //  assigned to 'ilace_mask' in gpu_senquack struct.
+                                 //  Normally 0. Value '1' will skip rendering
+                                 //  odd lines.
+
+       uint8_t scale_hires:1;    // If 1, will scale hi-res output to
+                                 //  320x240 when gpulib reads the frame.
+                                 //  Implies pixel_skip and ilace_force
+                                 //  (when height > 240).
+
+       // NOTE(review): the four flags below carry no description here;
+       //  presumably each one enables the rendering feature it is named
+       //  after - confirm against the code that reads them.
+       uint8_t lighting:1;
+       uint8_t fast_lighting:1;
+       uint8_t blending:1;
+       uint8_t dithering:1;
+
+       //senquack Only PCSX Rearmed's version of gpu_senquack had this, and I
+       // don't think it's necessary. It would require adding 'AH' flag to
+       // gpuSpriteSpanFn() increasing size of sprite span function array.
+       //uint8_t enableAbbeyHack:1;  // Abe's Odyssey hack
+
+       ////////////////////////////////////////////////////////////////////////////
+       // Variables used only by older standalone version of gpu_senquack (gpu.cpp)
+#ifndef USE_GPULIB
+       uint8_t prog_ilace:1;         // Progressive interlace option (old option)
+                                     //  This option was somewhat oddly named:
+                                     //  When in interlaced video mode, on a low-res
+                                     //  320x240 device, only the even lines are
+                                     //  rendered. This option will take that one
+                                     //  step further and only render half of the
+                                     //  even lines one frame, and then the other half.
+       uint8_t frameskip_count:3;    // Frame skip (0..7)
+#endif
+};
+
+extern gpu_senquack_config_t gpu_senquack_config_ext;
+
+// TODO: clean up show_fps frontend option
+extern  bool show_fps;
+
+#endif // GPU_UNAI_GPU_H
diff --git a/plugins/gpu_senquack/gpu_arm.S b/plugins/gpu_senquack/gpu_arm.S
new file mode 100644 (file)
index 0000000..ec87f21
--- /dev/null
@@ -0,0 +1,56 @@
+/*
+ * (C) Gražvydas "notaz" Ignotas, 2011
+ *
+ * This work is licensed under the terms of  GNU GPL, version 2 or later.
+ * See the COPYING file in the top-level directory.
+ */
+
+#include "arm_features.h"
+
+.text
+.align 2
+
+@ Draws 4 palettized pixels.
+@ Four consecutive 4-bit fields of register rs, the first one starting at
+@ bit ibase, are each turned into a byte offset (field * 2, hence the 0x1e
+@ mask) and used to load a halfword from pal. Each halfword that is
+@ non-zero is stored to dst at byte offsets obase+0/2/4/6; a zero halfword
+@ leaves the destination pixel untouched.
+@ in: r0=dst, r2=pal, r12=0x1e
+@ trashes r6-r8,lr,flags
+.macro do_4_pixels rs ibase obase
+@ first field: needs a left shift when it starts at bit 0
+.if \ibase - 1 < 0
+    and     r6, r12, \rs, lsl #1
+.else
+    and     r6, r12, \rs, lsr #\ibase-1
+.endif
+@ remaining three fields, 4 bits apart
+    and     r7, r12, \rs, lsr #\ibase+3
+    and     r8, r12, \rs, lsr #\ibase+7
+    and     lr, r12, \rs, lsr #\ibase+11
+@ palette lookups
+    ldrh    r6, [r2, r6]
+    ldrh    r7, [r2, r7]
+    ldrh    r8, [r2, r8]
+    ldrh    lr, [r2, lr]
+@ conditional stores: only non-zero colours are written
+    tst     r6, r6
+    strneh  r6, [r0, #\obase+0]
+    tst     r7, r7
+    strneh  r7, [r0, #\obase+2]
+    tst     r8, r8
+    strneh  r8, [r0, #\obase+4]
+    tst     lr, lr
+    strneh  lr, [r0, #\obase+6]
+.endm
+
+@ Draws a 16 pixel wide sprite. Each line reads two words (sixteen 4-bit
+@ palette indices) from s and writes up to 16 halfwords to d; colours that
+@ look up as zero are not written. Both d and s advance by 2048 bytes per
+@ line. The loop body runs before the line counter is tested, so at least
+@ one line is always drawn.
+.global draw_spr16_full @ (u16 *d, void *s, u16 *pal, int lines)
+draw_spr16_full:
+    stmfd   sp!, {r4-r8,lr}
+    mov     r12, #0x1e             @ mask: 4-bit index * 2 (byte offset into pal)
+
+0:
+    ldmia   r1, {r4,r5}            @ 16 source pixels, no writeback
+    do_4_pixels r4, 0,  0
+    do_4_pixels r4, 16, 8
+    do_4_pixels r5, 0,  16
+    do_4_pixels r5, 16, 24
+    subs    r3, r3, #1             @ one line done, sets the flags for bgt
+    add     r0, r0, #2048          @ next destination line
+    add     r1, r1, #2048          @ next source line
+    bgt     0b
+
+    ldmfd   sp!, {r4-r8,pc}
+
+@ vim:filetype=armasm
diff --git a/plugins/gpu_senquack/gpu_arm.h b/plugins/gpu_senquack/gpu_arm.h
new file mode 100644 (file)
index 0000000..b9f8f97
--- /dev/null
@@ -0,0 +1,14 @@
+#ifndef __GPU_UNAI_GPU_ARM_H__
+#define __GPU_UNAI_GPU_ARM_H__
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+// ARM assembly routine (gpu_arm.S): draws 'lines' lines of a 16 pixel wide,
+// 4 bits-per-pixel sprite from 's' to 'd', looking every pixel up in 'pal'.
+// Pixels whose palette entry is 0 are not written. 'd' and 's' both advance
+// by 2048 bytes per line.
+void draw_spr16_full(u16 *d, void *s, u16 *pal, int lines);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* __GPU_UNAI_GPU_ARM_H__ */
diff --git a/plugins/gpu_senquack/gpu_blit.h b/plugins/gpu_senquack/gpu_blit.h
new file mode 100644 (file)
index 0000000..e93f12f
--- /dev/null
@@ -0,0 +1,405 @@
+/***************************************************************************
+*   Copyright (C) 2010 PCSX4ALL Team                                      *
+*   Copyright (C) 2010 Unai                                               *
+*                                                                         *
+*   This program is free software; you can redistribute it and/or modify  *
+*   it under the terms of the GNU General Public License as published by  *
+*   the Free Software Foundation; either version 2 of the License, or     *
+*   (at your option) any later version.                                   *
+*                                                                         *
+*   This program is distributed in the hope that it will be useful,       *
+*   but WITHOUT ANY WARRANTY; without even the implied warranty of        *
+*   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the         *
+*   GNU General Public License for more details.                          *
+*                                                                         *
+*   You should have received a copy of the GNU General Public License     *
+*   along with this program; if not, write to the                         *
+*   Free Software Foundation, Inc.,                                       *
+*   51 Franklin Street, Fifth Floor, Boston, MA 02111-1307 USA.           *
+***************************************************************************/
+
+#ifndef _INNER_BLIT_H_
+#define _INNER_BLIT_H_
+
+// Pixel format conversion macros. Arguments are evaluated more than once,
+// so pass expressions without side effects.
+#ifndef USE_BGR15
+// RGB24:   packs three 8-bit components as R in bits 11-15, G in bits 5-10
+//          and B in bits 0-4.
+// RGB16:   rearranges one 15-bit pixel: bits 10-14 move to bits 0-4,
+//          bits 5-9 to bits 6-10 and bits 0-4 to bits 11-15.
+// RGB16X2: same as RGB16 for two pixels packed in one 32-bit word; pass an
+//          unsigned 32-bit value so the upper pixel shifts without overflow.
+// The whole expansion is parenthesised so the macros can be used inside
+// larger expressions without '|' mis-associating with neighbouring operators.
+#define RGB24(R,G,B)   (((((R)&0xF8)<<8)|(((G)&0xFC)<<3)|(((B)&0xF8)>>3)))
+#define RGB16X2(C)     ((((C)&(0x1f001f<<10))>>10) | (((C)&(0x1f001f<<5))<<1) | (((C)&(0x1f001f<<0))<<11))
+#define RGB16(C)       ((((C)&(0x1f<<10))>>10) | (((C)&(0x1f<<5))<<1) | (((C)&(0x1f<<0))<<11))
+#else
+// RGB24: packs three 8-bit components as R in bits 0-4, G in bits 5-9 and
+//        B in bits 10-14. 16-bit pixels need no conversion in this mode.
+#define RGB24(R,G,B)   ((((R)&0xF8)>>3)|(((G)&0xF8)<<2)|(((B)&0xF8)<<7))
+#endif
+
+///////////////////////////////////////////////////////////////////////////////
+//  GPU Blitting code with rescale and interlace support.
+
+INLINE void GPU_BlitWW(const void* src, u16* dst16, bool isRGB24)
+{
+       u32 uCount;
+       if(!isRGB24)
+       {
+               #ifndef USE_BGR15
+                       uCount = 20;
+                       const u32* src32 = (const u32*) src; 
+                       u32* dst32 = (u32*)(void*) dst16;
+                       do{
+                               dst32[0] = RGB16X2(src32[0]);
+                               dst32[1] = RGB16X2(src32[1]);
+                               dst32[2] = RGB16X2(src32[2]);
+                               dst32[3] = RGB16X2(src32[3]);
+                               dst32[4] = RGB16X2(src32[4]);
+                               dst32[5] = RGB16X2(src32[5]);
+                               dst32[6] = RGB16X2(src32[6]);
+                               dst32[7] = RGB16X2(src32[7]);
+                               dst32 += 8;
+                               src32 += 8;
+                       }while(--uCount);
+               #else
+                       memcpy(dst16,src,640);
+               #endif
+       }
+       else
+       {
+               uCount = 20;
+               const u8* src8 = (const u8*)src;
+               do{
+                       dst16[ 0] = RGB24(src8[ 0], src8[ 1], src8[ 2] );
+                       dst16[ 1] = RGB24(src8[ 3], src8[ 4], src8[ 5] );
+                       dst16[ 2] = RGB24(src8[ 6], src8[ 7], src8[ 8] );
+                       dst16[ 3] = RGB24(src8[ 9], src8[10], src8[11] );
+                       dst16[ 4] = RGB24(src8[12], src8[13], src8[14] );
+                       dst16[ 5] = RGB24(src8[15], src8[16], src8[17] );
+                       dst16[ 6] = RGB24(src8[18], src8[19], src8[20] );
+                       dst16[ 7] = RGB24(src8[21], src8[22], src8[23] );
+
+                       dst16[ 8] = RGB24(src8[24], src8[25], src8[26] );
+                       dst16[ 9] = RGB24(src8[27], src8[28], src8[29] );
+                       dst16[10] = RGB24(src8[30], src8[31], src8[32] );
+                       dst16[11] = RGB24(src8[33], src8[34], src8[35] );
+                       dst16[12] = RGB24(src8[36], src8[37], src8[38] );
+                       dst16[13] = RGB24(src8[39], src8[40], src8[41] );
+                       dst16[14] = RGB24(src8[42], src8[43], src8[44] );
+                       dst16[15] = RGB24(src8[45], src8[46], src8[47] );
+                       dst16 += 16;
+                       src8  += 48;
+               }while(--uCount);
+       }
+}
+
+INLINE void GPU_BlitWWSWWSWS(const void* src, u16* dst16, bool isRGB24)
+{
+       u32 uCount;
+       if(!isRGB24)
+       {
+               #ifndef USE_BGR15
+                       uCount = 32;
+                       const u16* src16 = (const u16*) src; 
+                       do{
+                               dst16[ 0] = RGB16(src16[0]);
+                               dst16[ 1] = RGB16(src16[1]);
+                               dst16[ 2] = RGB16(src16[3]);
+                               dst16[ 3] = RGB16(src16[4]);
+                               dst16[ 4] = RGB16(src16[6]);
+                               dst16[ 5] = RGB16(src16[8]);
+                               dst16[ 6] = RGB16(src16[9]);
+                               dst16[ 7] = RGB16(src16[11]);
+                               dst16[ 8] = RGB16(src16[12]);
+                               dst16[ 9] = RGB16(src16[14]);
+                               dst16 += 10;
+                               src16 += 16;
+                       }while(--uCount);
+               #else
+                       uCount = 64;
+                       const u16* src16 = (const u16*) src; 
+                       do{
+                               *dst16++ = *src16++;
+                               *dst16++ = *src16;
+                               src16+=2;
+                               *dst16++ = *src16++;
+                               *dst16++ = *src16;
+                               src16+=2;
+                               *dst16++ = *src16;
+                               src16+=2;
+                       }while(--uCount);
+               #endif
+       }
+       else
+       {
+               uCount = 32;
+               const u8* src8 = (const u8*)src;
+               do{
+                       dst16[ 0] = RGB24(src8[ 0], src8[ 1], src8[ 2] );
+                       dst16[ 1] = RGB24(src8[ 3], src8[ 4], src8[ 5] );
+                       dst16[ 2] = RGB24(src8[ 9], src8[10], src8[11] );
+                       dst16[ 3] = RGB24(src8[12], src8[13], src8[14] );
+                       dst16[ 4] = RGB24(src8[18], src8[19], src8[20] );
+
+                       dst16[ 5] = RGB24(src8[24], src8[25], src8[26] );
+                       dst16[ 6] = RGB24(src8[27], src8[28], src8[29] );
+                       dst16[ 7] = RGB24(src8[33], src8[34], src8[35] );
+                       dst16[ 8] = RGB24(src8[36], src8[37], src8[38] );
+                       dst16[ 9] = RGB24(src8[42], src8[43], src8[44] );
+
+                       dst16 += 10;
+                       src8  += 48;
+               }while(--uCount);
+       }
+}
+
+INLINE void GPU_BlitWWWWWS(const void* src, u16* dst16, bool isRGB24)
+{
+       u32 uCount;
+       if(!isRGB24)
+       {
+               #ifndef USE_BGR15
+                       uCount = 32;
+                       const u16* src16 = (const u16*) src; 
+                       do{
+                               dst16[ 0] = RGB16(src16[0]);
+                               dst16[ 1] = RGB16(src16[1]);
+                               dst16[ 2] = RGB16(src16[2]);
+                               dst16[ 3] = RGB16(src16[3]);
+                               dst16[ 4] = RGB16(src16[4]);
+                               dst16[ 5] = RGB16(src16[6]);
+                               dst16[ 6] = RGB16(src16[7]);
+                               dst16[ 7] = RGB16(src16[8]);
+                               dst16[ 8] = RGB16(src16[9]);
+                               dst16[ 9] = RGB16(src16[10]);
+                               dst16 += 10;
+                               src16 += 12;
+                       }while(--uCount);
+               #else
+                       uCount = 64;
+                       const u16* src16 = (const u16*) src; 
+                       do{
+                               *dst16++ = *src16++;
+                               *dst16++ = *src16++;
+                               *dst16++ = *src16++;
+                               *dst16++ = *src16++;
+                               *dst16++ = *src16;
+                               src16+=2;
+                       }while(--uCount);
+               #endif
+       }
+       else
+       {
+               uCount = 32;
+               const u8* src8 = (const u8*)src;
+               do{
+                       dst16[0] = RGB24(src8[ 0], src8[ 1], src8[ 2] );
+                       dst16[1] = RGB24(src8[ 3], src8[ 4], src8[ 5] );
+                       dst16[2] = RGB24(src8[ 6], src8[ 7], src8[ 8] );
+                       dst16[3] = RGB24(src8[ 9], src8[10], src8[11] );
+                       dst16[4] = RGB24(src8[12], src8[13], src8[14] );
+                       dst16[5] = RGB24(src8[18], src8[19], src8[20] );
+                       dst16[6] = RGB24(src8[21], src8[22], src8[23] );
+                       dst16[7] = RGB24(src8[24], src8[25], src8[26] );
+                       dst16[8] = RGB24(src8[27], src8[28], src8[29] );
+                       dst16[9] = RGB24(src8[30], src8[31], src8[32] );
+                       dst16 += 10;
+                       src8  += 36;
+               }while(--uCount);
+       }
+}
+
+INLINE void GPU_BlitWWWWWWWWS(const void* src, u16* dst16, bool isRGB24, u32 uClip_src)
+{
+       u32 uCount;
+       if(!isRGB24)
+       {
+               #ifndef USE_BGR15
+                       uCount = 20;
+                       const u16* src16 = ((const u16*) src) + uClip_src; 
+                       do{
+                               dst16[ 0] = RGB16(src16[0]);
+                               dst16[ 1] = RGB16(src16[1]);
+                               dst16[ 2] = RGB16(src16[2]);
+                               dst16[ 3] = RGB16(src16[3]);
+                               dst16[ 4] = RGB16(src16[4]);
+                               dst16[ 5] = RGB16(src16[5]);
+                               dst16[ 6] = RGB16(src16[6]);
+                               dst16[ 7] = RGB16(src16[7]);
+
+                               dst16[ 8] = RGB16(src16[9]);
+                               dst16[ 9] = RGB16(src16[10]);
+                               dst16[10] = RGB16(src16[11]);
+                               dst16[11] = RGB16(src16[12]);
+                               dst16[12] = RGB16(src16[13]);
+                               dst16[13] = RGB16(src16[14]);
+                               dst16[14] = RGB16(src16[15]);
+                               dst16[15] = RGB16(src16[16]);
+                               dst16 += 16;
+                               src16 += 18;
+                       }while(--uCount);
+               #else
+                       uCount = 40;
+                       const u16* src16 = ((const u16*) src) + uClip_src; 
+                       do{
+                               *dst16++ = *src16++;
+                               *dst16++ = *src16++;
+                               *dst16++ = *src16++;
+                               *dst16++ = *src16++;
+                               *dst16++ = *src16++;
+                               *dst16++ = *src16++;
+                               *dst16++ = *src16++;
+                               *dst16++ = *src16;
+                               src16+=2;
+                       }while(--uCount);
+               #endif
+       }
+       else
+       {
+               uCount = 20;
+               const u8* src8 = (const u8*)src + (uClip_src<<1) + uClip_src;
+               do{
+                       dst16[ 0] = RGB24(src8[ 0], src8[ 1], src8[ 2] );
+                       dst16[ 1] = RGB24(src8[ 3], src8[ 4], src8[ 5] );
+                       dst16[ 2] = RGB24(src8[ 6], src8[ 7], src8[ 8] );
+                       dst16[ 3] = RGB24(src8[ 9], src8[10], src8[11] );
+                       dst16[ 4] = RGB24(src8[12], src8[13], src8[14] );
+                       dst16[ 5] = RGB24(src8[15], src8[16], src8[17] );
+                       dst16[ 6] = RGB24(src8[18], src8[19], src8[20] );
+                       dst16[ 7] = RGB24(src8[21], src8[22], src8[23] );
+
+                       dst16[ 8] = RGB24(src8[27], src8[28], src8[29] );
+                       dst16[ 9] = RGB24(src8[30], src8[31], src8[32] );
+                       dst16[10] = RGB24(src8[33], src8[34], src8[35] );
+                       dst16[11] = RGB24(src8[36], src8[37], src8[38] );
+                       dst16[12] = RGB24(src8[39], src8[40], src8[41] );
+                       dst16[13] = RGB24(src8[42], src8[43], src8[44] );
+                       dst16[14] = RGB24(src8[45], src8[46], src8[47] );
+                       dst16[15] = RGB24(src8[48], src8[49], src8[50] );
+                       dst16 += 16;
+                       src8  += 54;
+               }while(--uCount);
+       }
+}
+
+INLINE void GPU_BlitWWDWW(const void* src, u16* dst16, bool isRGB24)
+{
+       u32 uCount;
+       if(!isRGB24)
+       {
+               #ifndef USE_BGR15
+                       uCount = 32;
+                       const u16* src16 = (const u16*) src; 
+                       do{
+                               dst16[ 0] = RGB16(src16[0]);
+                               dst16[ 1] = RGB16(src16[1]);
+                               dst16[ 2] = dst16[1];
+                               dst16[ 3] = RGB16(src16[2]);
+                               dst16[ 4] = RGB16(src16[3]);
+                               dst16[ 5] = RGB16(src16[4]);
+                               dst16[ 6] = RGB16(src16[5]);
+                               dst16[ 7] = dst16[6];
+                               dst16[ 8] = RGB16(src16[6]);
+                               dst16[ 9] = RGB16(src16[7]);
+                               dst16 += 10;
+                               src16 +=  8;
+                       }while(--uCount);
+               #else
+                       uCount = 64;
+                       const u16* src16 = (const u16*) src; 
+                       do{
+                               *dst16++ = *src16++;
+                               *dst16++ = *src16;
+                               *dst16++ = *src16++;
+                               *dst16++ = *src16++;
+                               *dst16++ = *src16++;
+                       }while(--uCount);
+               #endif
+       }
+       else
+       {
+               uCount = 32;
+               const u8* src8 = (const u8*)src;
+               do{
+                       dst16[ 0] = RGB24(src8[0], src8[ 1], src8[ 2] );
+                       dst16[ 1] = RGB24(src8[3], src8[ 4], src8[ 5] );
+                       dst16[ 2] = dst16[1];
+                       dst16[ 3] = RGB24(src8[6], src8[ 7], src8[ 8] );
+                       dst16[ 4] = RGB24(src8[9], src8[10], src8[11] );
+
+                       dst16[ 5] = RGB24(src8[12], src8[13], src8[14] );
+                       dst16[ 6] = RGB24(src8[15], src8[16], src8[17] );
+                       dst16[ 7] = dst16[6];
+                       dst16[ 8] = RGB24(src8[18], src8[19], src8[20] );
+                       dst16[ 9] = RGB24(src8[21], src8[22], src8[23] );
+                       dst16 += 10;
+                       src8  += 24;
+               }while(--uCount);
+       }
+}
+
+
+INLINE void GPU_BlitWS(const void* src, u16* dst16, bool isRGB24)
+{
+       u32 uCount;
+       if(!isRGB24)
+       {
+               #ifndef USE_BGR15
+                       uCount = 20;
+                       const u16* src16 = (const u16*) src; 
+                       do{
+                               dst16[ 0] = RGB16(src16[0]);
+                               dst16[ 1] = RGB16(src16[2]);
+                               dst16[ 2] = RGB16(src16[4]);
+                               dst16[ 3] = RGB16(src16[6]);
+
+                               dst16[ 4] = RGB16(src16[8]);
+                               dst16[ 5] = RGB16(src16[10]);
+                               dst16[ 6] = RGB16(src16[12]);
+                               dst16[ 7] = RGB16(src16[14]);
+
+                               dst16[ 8] = RGB16(src16[16]);
+                               dst16[ 9] = RGB16(src16[18]);
+                               dst16[10] = RGB16(src16[20]);
+                               dst16[11] = RGB16(src16[22]);
+
+                               dst16[12] = RGB16(src16[24]);
+                               dst16[13] = RGB16(src16[26]);
+                               dst16[14] = RGB16(src16[28]);
+                               dst16[15] = RGB16(src16[30]);
+
+                               dst16 += 16;
+                               src16 += 32;
+                       }while(--uCount);
+               #else
+                       uCount = 320;
+                       const u16* src16 = (const u16*) src; 
+                       do{
+                               *dst16++ = *src16; src16+=2;
+                       }while(--uCount);
+               #endif
+       }
+       else
+       {
+               uCount = 20;
+               const u8* src8 = (const u8*) src; 
+               do{
+                       dst16[ 0] = RGB24(src8[ 0], src8[ 1], src8[ 2] );
+                       dst16[ 1] = RGB24(src8[ 6], src8[ 7], src8[ 8] );
+                       dst16[ 2] = RGB24(src8[12], src8[13], src8[14] );
+                       dst16[ 3] = RGB24(src8[18], src8[19], src8[20] );
+
+                       dst16[ 4] = RGB24(src8[24], src8[25], src8[26] );
+                       dst16[ 5] = RGB24(src8[30], src8[31], src8[32] );
+                       dst16[ 6] = RGB24(src8[36], src8[37], src8[38] );
+                       dst16[ 7] = RGB24(src8[42], src8[43], src8[44] );
+
+                       dst16[ 8] = RGB24(src8[48], src8[49], src8[50] );
+                       dst16[ 9] = RGB24(src8[54], src8[55], src8[56] );
+                       dst16[10] = RGB24(src8[60], src8[61], src8[62] );
+                       dst16[11] = RGB24(src8[66], src8[67], src8[68] );
+
+                       dst16[12] = RGB24(src8[72], src8[73], src8[74] );
+                       dst16[13] = RGB24(src8[78], src8[79], src8[80] );
+                       dst16[14] = RGB24(src8[84], src8[85], src8[86] );
+                       dst16[15] = RGB24(src8[90], src8[91], src8[92] );
+
+                       dst16 += 16;
+                       src8  += 96;
+               }while(--uCount);
+       }
+}
+
+#endif //_INNER_BLIT_H_
diff --git a/plugins/gpu_senquack/gpu_command.h b/plugins/gpu_senquack/gpu_command.h
new file mode 100644 (file)
index 0000000..d052ae8
--- /dev/null
@@ -0,0 +1,621 @@
+/***************************************************************************
+*   Copyright (C) 2010 PCSX4ALL Team                                      *
+*   Copyright (C) 2010 Unai                                               *
+*   Copyright (C) 2016 Senquack (dansilsby <AT> gmail <DOT> com)          *
+*                                                                         *
+*   This program is free software; you can redistribute it and/or modify  *
+*   it under the terms of the GNU General Public License as published by  *
+*   the Free Software Foundation; either version 2 of the License, or     *
+*   (at your option) any later version.                                   *
+*                                                                         *
+*   This program is distributed in the hope that it will be useful,       *
+*   but WITHOUT ANY WARRANTY; without even the implied warranty of        *
+*   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the         *
+*   GNU General Public License for more details.                          *
+*                                                                         *
+*   You should have received a copy of the GNU General Public License     *
+*   along with this program; if not, write to the                         *
+*   Free Software Foundation, Inc.,                                       *
+*   51 Franklin Street, Fifth Floor, Boston, MA 02111-1307 USA.           *
+***************************************************************************/
+
+#ifndef __GPU_UNAI_GPU_COMMAND_H__
+#define __GPU_UNAI_GPU_COMMAND_H__
+
+///////////////////////////////////////////////////////////////////////////////
+void gpuSetTexture(u16 tpage)
+{
+       u32 tmode, tx, ty;
+       gpu_senquack.GPU_GP1 = (gpu_senquack.GPU_GP1 & ~0x1FF) | (tpage & 0x1FF);
+       gpu_senquack.TextureWindow[0]&= ~gpu_senquack.TextureWindow[2];
+       gpu_senquack.TextureWindow[1]&= ~gpu_senquack.TextureWindow[3];
+
+       tmode = (tpage >> 7) & 3;  // 16bpp, 8bpp, or 4bpp texture colors?
+                                  // 0: 4bpp     1: 8bpp     2/3: 16bpp
+
+       // Nocash PSX docs state setting of 3 is same as setting of 2 (16bpp):
+       // Note: DrHell assumes 3 is same as 0.. TODO: verify which is correct?
+       if (tmode == 3) tmode = 2;
+
+       tx = (tpage & 0x0F) << 6;
+       ty = (tpage & 0x10) << 4;
+
+       tx += (gpu_senquack.TextureWindow[0] >> (2 - tmode));
+       ty += gpu_senquack.TextureWindow[1];
+       
+       gpu_senquack.BLEND_MODE  = ((tpage>>5) & 3) << 3;
+       gpu_senquack.TEXT_MODE   = (tmode + 1) << 5; // gpu_senquack.TEXT_MODE should be values 1..3, so add one
+       gpu_senquack.TBA = &((u16*)gpu_senquack.vram)[FRAME_OFFSET(tx, ty)];
+}
+
+///////////////////////////////////////////////////////////////////////////////
+// Points gpu_senquack.CBA at the colour lookup table selected by clut.
+//   clut - CLUT id; bit 15 is ignored and the remaining 15 bits, times 16,
+//          give the index of the table's first entry within vram (as u16)
+INLINE void gpuSetCLUT(u16 clut)
+{
+       u16* const vram16 = (u16*)gpu_senquack.vram;
+       const int first_entry = (clut & 0x7FFF) << 4;
+       gpu_senquack.CBA = vram16 + first_entry;
+}
+
+// NULL_GPU(): with ENABLE_GPU_NULL_SUPPORT defined it expands to a bare
+// 'break', so a primitive handler that invokes it leaves the enclosing
+// switch before doing any drawing. Otherwise it expands to nothing.
+#ifdef  ENABLE_GPU_NULL_SUPPORT
+#define NULL_GPU() break
+#else
+#define NULL_GPU()
+#endif
+
+// DO_LOG(expr): expr is a complete, parenthesised printf argument list,
+// e.g. DO_LOG(("value 0x%x\n", v)). With ENABLE_GPU_LOG_SUPPORT defined it
+// expands to a printf call; otherwise to an empty block, so the arguments
+// are not evaluated.
+#ifdef  ENABLE_GPU_LOG_SUPPORT
+#define DO_LOG(expr) printf expr
+#else
+#define DO_LOG(expr) {}
+#endif
+
+// Helpers that contribute bits to the driver index of a primitive. They
+// all read an integer named PRIM (the primitive command number), which must
+// be in scope wherever they are expanded.
+// Blending:      2 when PRIM bit 1 is set and BlendingEnabled(), else 0
+// Blending_Mode: gpu_senquack.BLEND_MODE under the same condition, else 0
+// Lighting:      1 when PRIM bit 0 is clear and LightingEnabled(), else 0
+#define Blending      (((PRIM&0x2) && BlendingEnabled()) ? (PRIM&0x2) : 0)
+#define Blending_Mode (((PRIM&0x2) && BlendingEnabled()) ? gpu_senquack.BLEND_MODE : 0)
+#define Lighting      (((~PRIM)&0x1) && LightingEnabled())
+// Dithering applies only to Gouraud-shaded polys or texture-blended polys:
+// (PRIM bit 4 set, or PRIM bit 0 clear) and DitheringEnabled(). The result
+// is 1<<9 when dithering is forced, otherwise bit 9 of GPU_GP1, else 0.
+#define Dithering     (((((~PRIM)&0x1) || (PRIM&0x10)) && DitheringEnabled()) ?            \
+                       (ForcedDitheringEnabled() ? (1<<9) : (gpu_senquack.GPU_GP1 & (1 << 9))) \
+                       : 0)
+
+///////////////////////////////////////////////////////////////////////////////
+//Now handled by Rearmed's gpulib and gpu_senquack/gpulib_if.cpp:
+///////////////////////////////////////////////////////////////////////////////
+#ifndef USE_GPULIB
+
+// Handles GP0 draw settings commands 0xE1...0xE6
+static void gpuGP0Cmd_0xEx(gpu_senquack_t &gpu_senquack, u32 cmd_word)
+{
+       // Assume incoming GP0 command is 0xE1..0xE6, convert to 1..6
+       u8 num = (cmd_word >> 24) & 7;
+       switch (num) {
+               case 1: {
+                       // GP0(E1h) - Draw Mode setting (aka "Texpage")
+                       DO_LOG(("GP0(0xE1) DrawMode TexPage(0x%x)\n", cmd_word));
+                       u32 cur_texpage = gpu_senquack.GPU_GP1 & 0x7FF;
+                       u32 new_texpage = cmd_word & 0x7FF;
+                       if (cur_texpage != new_texpage) {
+                               gpu_senquack.GPU_GP1 = (gpu_senquack.GPU_GP1 & ~0x7FF) | new_texpage;
+                               gpuSetTexture(gpu_senquack.GPU_GP1);
+                       }
+               } break;
+
+               case 2: {
+                       // GP0(E2h) - Texture Window setting
+                       DO_LOG(("GP0(0xE2) TextureWindow(0x%x)\n", cmd_word));
+                       if (cmd_word != gpu_senquack.TextureWindowCur) {
+                               static const u8 TextureMask[32] = {
+                                       255, 7, 15, 7, 31, 7, 15, 7, 63, 7, 15, 7, 31, 7, 15, 7,
+                                       127, 7, 15, 7, 31, 7, 15, 7, 63, 7, 15, 7, 31, 7, 15, 7
+                               };
+                               gpu_senquack.TextureWindowCur = cmd_word;
+                               gpu_senquack.TextureWindow[0] = ((cmd_word >> 10) & 0x1F) << 3;
+                               gpu_senquack.TextureWindow[1] = ((cmd_word >> 15) & 0x1F) << 3;
+                               gpu_senquack.TextureWindow[2] = TextureMask[(cmd_word >> 0) & 0x1F];
+                               gpu_senquack.TextureWindow[3] = TextureMask[(cmd_word >> 5) & 0x1F];
+                               gpu_senquack.TextureWindow[0] &= ~gpu_senquack.TextureWindow[2];
+                               gpu_senquack.TextureWindow[1] &= ~gpu_senquack.TextureWindow[3];
+
+                               // Inner loop vars must be updated whenever texture window is changed:
+                               const u32 fb = FIXED_BITS;  // # of fractional fixed-pt bits of u4/v4
+                               gpu_senquack.u_msk = (((u32)gpu_senquack.TextureWindow[2]) << fb) | ((1 << fb) - 1);
+                               gpu_senquack.v_msk = (((u32)gpu_senquack.TextureWindow[3]) << fb) | ((1 << fb) - 1);
+
+                               gpuSetTexture(gpu_senquack.GPU_GP1);
+                       }
+               } break;
+
+               case 3: {
+                       // GP0(E3h) - Set Drawing Area top left (X1,Y1)
+                       DO_LOG(("GP0(0xE3) DrawingArea Pos(0x%x)\n", cmd_word));
+                       gpu_senquack.DrawingArea[0] = cmd_word         & 0x3FF;
+                       gpu_senquack.DrawingArea[1] = (cmd_word >> 10) & 0x3FF;
+               } break;
+
+               case 4: {
+                       // GP0(E4h) - Set Drawing Area bottom right (X2,Y2)
+                       DO_LOG(("GP0(0xE4) DrawingArea Size(0x%x)\n", cmd_word));
+                       gpu_senquack.DrawingArea[2] = (cmd_word         & 0x3FF) + 1;
+                       gpu_senquack.DrawingArea[3] = ((cmd_word >> 10) & 0x3FF) + 1;
+               } break;
+
+               case 5: {
+                       // GP0(E5h) - Set Drawing Offset (X,Y)
+                       DO_LOG(("GP0(0xE5) DrawingOffset(0x%x)\n", cmd_word));
+                       gpu_senquack.DrawingOffset[0] = ((s32)cmd_word<<(32-11))>>(32-11);
+                       gpu_senquack.DrawingOffset[1] = ((s32)cmd_word<<(32-22))>>(32-11);
+               } break;
+
+               case 6: {
+                       // GP0(E6h) - Mask Bit Setting
+                       DO_LOG(("GP0(0xE6) SetMask(0x%x)\n", cmd_word));
+                       gpu_senquack.Masking  = (cmd_word & 0x2) <<  1;
+                       gpu_senquack.PixelMSB = (cmd_word & 0x1) <<  8;
+               } break;
+       }
+}
+
+void gpuSendPacketFunction(const int PRIM)
+{
+       //printf("0x%x\n",PRIM);
+
+       //senquack - TODO: optimize this (packet pointer union as prim draw parameter
+       // introduced as optimization for gpulib command-list processing)
+       PtrUnion packet = { .ptr = (void*)&gpu_senquack.PacketBuffer };
+
+       switch (PRIM)
+       {
+               case 0x02: {
+                       NULL_GPU();
+                       gpuClearImage(packet);    //  prim handles updateLace && skip
+                       gpu_senquack.fb_dirty = true;
+                       DO_LOG(("gpuClearImage(0x%x)\n",PRIM));
+               } break;
+
+               case 0x20:
+               case 0x21:
+               case 0x22:
+               case 0x23: {          // Monochrome 3-pt poly
+                       if (!gpu_senquack.frameskip.skipGPU)
+                       {
+                               NULL_GPU();
+                               PP driver = gpuPolySpanDrivers[
+                                       (gpu_senquack.blit_mask?1024:0) |
+                                       Blending_Mode |
+                                       gpu_senquack.Masking | Blending | gpu_senquack.PixelMSB
+                               ];
+                               gpuDrawPolyF(packet, driver, false);
+                               gpu_senquack.fb_dirty = true;
+                               DO_LOG(("gpuDrawPolyF(0x%x)\n",PRIM));
+                       }
+               } break;
+
+               case 0x24:
+               case 0x25:
+               case 0x26:
+               case 0x27: {          // Textured 3-pt poly
+                       if (!gpu_senquack.frameskip.skipGPU)
+                       {
+                               NULL_GPU();
+                               gpuSetCLUT    (gpu_senquack.PacketBuffer.U4[2] >> 16);
+                               gpuSetTexture (gpu_senquack.PacketBuffer.U4[4] >> 16);
+
+                               u32 driver_idx =
+                                       (gpu_senquack.blit_mask?1024:0) |
+                                       Dithering |
+                                       Blending_Mode | gpu_senquack.TEXT_MODE |
+                                       gpu_senquack.Masking | Blending | gpu_senquack.PixelMSB;
+
+                               if (!FastLightingEnabled()) {
+                                       driver_idx |= Lighting;
+                               } else {
+                                       if (!((gpu_senquack.PacketBuffer.U1[0]>0x5F) && (gpu_senquack.PacketBuffer.U1[1]>0x5F) && (gpu_senquack.PacketBuffer.U1[2]>0x5F)))
+                                               driver_idx |= Lighting;
+                               }
+
+                               PP driver = gpuPolySpanDrivers[driver_idx];
+                               gpuDrawPolyFT(packet, driver, false);
+                               gpu_senquack.fb_dirty = true;
+                               DO_LOG(("gpuDrawPolyFT(0x%x)\n",PRIM));
+                       }
+               } break;
+
+               case 0x28:
+               case 0x29:
+               case 0x2A:
+               case 0x2B: {          // Monochrome 4-pt poly
+                       if (!gpu_senquack.frameskip.skipGPU)
+                       {
+                               NULL_GPU();
+                               PP driver = gpuPolySpanDrivers[
+                                       (gpu_senquack.blit_mask?1024:0) |
+                                       Blending_Mode |
+                                       gpu_senquack.Masking | Blending | gpu_senquack.PixelMSB
+                               ];
+                               gpuDrawPolyF(packet, driver, true); // is_quad = true
+                               gpu_senquack.fb_dirty = true;
+                               DO_LOG(("gpuDrawPolyF(0x%x) (4-pt QUAD)\n",PRIM));
+                       }
+               } break;
+
+               case 0x2C:
+               case 0x2D:
+               case 0x2E:
+               case 0x2F: {          // Textured 4-pt poly
+                       if (!gpu_senquack.frameskip.skipGPU)
+                       {
+                               NULL_GPU();
+                               gpuSetCLUT    (gpu_senquack.PacketBuffer.U4[2] >> 16);
+                               gpuSetTexture (gpu_senquack.PacketBuffer.U4[4] >> 16);
+
+                               u32 driver_idx =
+                                       (gpu_senquack.blit_mask?1024:0) |
+                                       Dithering |
+                                       Blending_Mode | gpu_senquack.TEXT_MODE |
+                                       gpu_senquack.Masking | Blending | gpu_senquack.PixelMSB;
+
+                               if (!FastLightingEnabled()) {
+                                       driver_idx |= Lighting;
+                               } else {
+                                       if (!((gpu_senquack.PacketBuffer.U1[0]>0x5F) && (gpu_senquack.PacketBuffer.U1[1]>0x5F) && (gpu_senquack.PacketBuffer.U1[2]>0x5F)))
+                                               driver_idx |= Lighting;
+                               }
+
+                               PP driver = gpuPolySpanDrivers[driver_idx];
+                               gpuDrawPolyFT(packet, driver, true); // is_quad = true
+                               gpu_senquack.fb_dirty = true;
+                               DO_LOG(("gpuDrawPolyFT(0x%x) (4-pt QUAD)\n",PRIM));
+                       }
+               } break;
+
+               case 0x30:
+               case 0x31:
+               case 0x32:
+               case 0x33: {          // Gouraud-shaded 3-pt poly
+                       if (!gpu_senquack.frameskip.skipGPU)
+                       {
+                               NULL_GPU();
+                               //NOTE: The '129' here is CF_GOURAUD | CF_LIGHT, however
+                               // this is an untextured poly, so CF_LIGHT (texture blend)
+                               // shouldn't apply. Until the original array of template
+                               // instantiation ptrs is fixed, we're stuck with this. (TODO)
+                               PP driver = gpuPolySpanDrivers[
+                                       (gpu_senquack.blit_mask?1024:0) |
+                                       Dithering |
+                                       Blending_Mode |
+                                       gpu_senquack.Masking | Blending | 129 | gpu_senquack.PixelMSB
+                               ];
+                               gpuDrawPolyG(packet, driver, false);
+                               gpu_senquack.fb_dirty = true;
+                               DO_LOG(("gpuDrawPolyG(0x%x)\n",PRIM));
+                       }
+               } break;
+
+               case 0x34:
+               case 0x35:
+               case 0x36:
+               case 0x37: {          // Gouraud-shaded, textured 3-pt poly
+                       if (!gpu_senquack.frameskip.skipGPU)
+                       {
+                               NULL_GPU();
+                               gpuSetCLUT    (gpu_senquack.PacketBuffer.U4[2] >> 16);
+                               gpuSetTexture (gpu_senquack.PacketBuffer.U4[5] >> 16);
+                               PP driver = gpuPolySpanDrivers[
+                                       (gpu_senquack.blit_mask?1024:0) |
+                                       Dithering |
+                                       Blending_Mode | gpu_senquack.TEXT_MODE |
+                                       gpu_senquack.Masking | Blending | ((Lighting)?129:0) | gpu_senquack.PixelMSB
+                               ];
+                               gpuDrawPolyGT(packet, driver, false);
+                               gpu_senquack.fb_dirty = true;
+                               DO_LOG(("gpuDrawPolyGT(0x%x)\n",PRIM));
+                       }
+               } break;
+
+               case 0x38:
+               case 0x39:
+               case 0x3A:
+               case 0x3B: {          // Gouraud-shaded 4-pt poly
+                       if (!gpu_senquack.frameskip.skipGPU)
+                       {
+                               NULL_GPU();
+                               // See notes regarding '129' for 0x30..0x33 further above -senquack
+                               PP driver = gpuPolySpanDrivers[
+                                       (gpu_senquack.blit_mask?1024:0) |
+                                       Dithering |
+                                       Blending_Mode |
+                                       gpu_senquack.Masking | Blending | 129 | gpu_senquack.PixelMSB
+                               ];
+                               gpuDrawPolyG(packet, driver, true); // is_quad = true
+                               gpu_senquack.fb_dirty = true;
+                               DO_LOG(("gpuDrawPolyG(0x%x) (4-pt QUAD)\n",PRIM));
+                       }
+               } break;
+
+               case 0x3C:
+               case 0x3D:
+               case 0x3E:
+               case 0x3F: {          // Gouraud-shaded, textured 4-pt poly
+                       if (!gpu_senquack.frameskip.skipGPU)
+                       {
+                               NULL_GPU();
+                               gpuSetCLUT    (gpu_senquack.PacketBuffer.U4[2] >> 16);
+                               gpuSetTexture (gpu_senquack.PacketBuffer.U4[5] >> 16);
+                               PP driver = gpuPolySpanDrivers[
+                                       (gpu_senquack.blit_mask?1024:0) |
+                                       Dithering |
+                                       Blending_Mode | gpu_senquack.TEXT_MODE |
+                                       gpu_senquack.Masking | Blending | ((Lighting)?129:0) | gpu_senquack.PixelMSB
+                               ];
+                               gpuDrawPolyGT(packet, driver, true); // is_quad = true
+                               gpu_senquack.fb_dirty = true;
+                               DO_LOG(("gpuDrawPolyGT(0x%x) (4-pt QUAD)\n",PRIM));
+                       }
+               } break;
+
+               case 0x40:
+               case 0x41:
+               case 0x42:
+               case 0x43: {          // Monochrome line
+                       if (!gpu_senquack.frameskip.skipGPU)
+                       {
+                               NULL_GPU();
+                               // Shift index right by one, as untextured prims don't use lighting
+                               u32 driver_idx = (Blending_Mode | gpu_senquack.Masking | Blending | (gpu_senquack.PixelMSB>>3)) >> 1;
+                               PSD driver = gpuPixelSpanDrivers[driver_idx];
+                               gpuDrawLineF(packet, driver);
+                               gpu_senquack.fb_dirty = true;
+                               DO_LOG(("gpuDrawLineF(0x%x)\n",PRIM));
+                       }
+               } break;
+
+               case 0x48:
+               case 0x49:
+               case 0x4A:
+               case 0x4B:
+               case 0x4C:
+               case 0x4D:
+               case 0x4E:
+               case 0x4F: { // Monochrome line strip
+                       if (!gpu_senquack.frameskip.skipGPU)
+                       {
+                               NULL_GPU();
+                               // Shift index right by one, as untextured prims don't use lighting
+                               u32 driver_idx = (Blending_Mode | gpu_senquack.Masking | Blending | (gpu_senquack.PixelMSB>>3)) >> 1;
+                               PSD driver = gpuPixelSpanDrivers[driver_idx];
+                               gpuDrawLineF(packet, driver);
+                               gpu_senquack.fb_dirty = true;
+                               DO_LOG(("gpuDrawLineF(0x%x)\n",PRIM));
+                       }
+                       if ((gpu_senquack.PacketBuffer.U4[3] & 0xF000F000) != 0x50005000)
+                       {
+                               gpu_senquack.PacketBuffer.U4[1] = gpu_senquack.PacketBuffer.U4[2];
+                               gpu_senquack.PacketBuffer.U4[2] = gpu_senquack.PacketBuffer.U4[3];
+                               gpu_senquack.PacketCount = 1;
+                               gpu_senquack.PacketIndex = 3;
+                       }
+               } break;
+
+               case 0x50:
+               case 0x51:
+               case 0x52:
+               case 0x53: {          // Gouraud-shaded line
+                       if (!gpu_senquack.frameskip.skipGPU)
+                       {
+                               NULL_GPU();
+                               // Shift index right by one, as untextured prims don't use lighting
+                               u32 driver_idx = (Blending_Mode | gpu_senquack.Masking | Blending | (gpu_senquack.PixelMSB>>3)) >> 1;
+                               // Index MSB selects Gouraud-shaded PixelSpanDriver:
+                               driver_idx |= (1 << 5);
+                               PSD driver = gpuPixelSpanDrivers[driver_idx];
+                               gpuDrawLineG(packet, driver);
+                               gpu_senquack.fb_dirty = true;
+                               DO_LOG(("gpuDrawLineG(0x%x)\n",PRIM));
+                       }
+               } break;
+
+               case 0x58:
+               case 0x59:
+               case 0x5A:
+               case 0x5B:
+               case 0x5C:
+               case 0x5D:
+               case 0x5E:
+               case 0x5F: { // Gouraud-shaded line strip
+                       if (!gpu_senquack.frameskip.skipGPU)
+                       {
+                               NULL_GPU();
+                               // Shift index right by one, as untextured prims don't use lighting
+                               u32 driver_idx = (Blending_Mode | gpu_senquack.Masking | Blending | (gpu_senquack.PixelMSB>>3)) >> 1;
+                               // Index MSB selects Gouraud-shaded PixelSpanDriver:
+                               driver_idx |= (1 << 5);
+                               PSD driver = gpuPixelSpanDrivers[driver_idx];
+                               gpuDrawLineG(packet, driver);
+                               gpu_senquack.fb_dirty = true;
+                               DO_LOG(("gpuDrawLineG(0x%x)\n",PRIM));
+                       }
+                       if ((gpu_senquack.PacketBuffer.U4[4] & 0xF000F000) != 0x50005000)
+                       {
+                               gpu_senquack.PacketBuffer.U1[3 + (2 * 4)] = gpu_senquack.PacketBuffer.U1[3 + (0 * 4)];
+                               gpu_senquack.PacketBuffer.U4[0] = gpu_senquack.PacketBuffer.U4[2];
+                               gpu_senquack.PacketBuffer.U4[1] = gpu_senquack.PacketBuffer.U4[3];
+                               gpu_senquack.PacketBuffer.U4[2] = gpu_senquack.PacketBuffer.U4[4];
+                               gpu_senquack.PacketCount = 2;
+                               gpu_senquack.PacketIndex = 3;
+                       }
+               } break;
+
+               case 0x60:
+               case 0x61:
+               case 0x62:
+               case 0x63: {          // Monochrome rectangle (variable size)
+                       if (!gpu_senquack.frameskip.skipGPU)
+                       {
+                               NULL_GPU();
+                               PT driver = gpuTileSpanDrivers[(Blending_Mode | gpu_senquack.Masking | Blending | (gpu_senquack.PixelMSB>>3)) >> 1];
+                               gpuDrawT(packet, driver);
+                               gpu_senquack.fb_dirty = true;
+                               DO_LOG(("gpuDrawT(0x%x)\n",PRIM));
+                       }
+               } break;
+
+               case 0x64:
+               case 0x65:
+               case 0x66:
+               case 0x67: {          // Textured rectangle (variable size)
+                       if (!gpu_senquack.frameskip.skipGPU)
+                       {
+                               NULL_GPU();
+                               gpuSetCLUT    (gpu_senquack.PacketBuffer.U4[2] >> 16);
+                               u32 driver_idx = Blending_Mode | gpu_senquack.TEXT_MODE | gpu_senquack.Masking | Blending | (gpu_senquack.PixelMSB>>1);
+
+                               // This fixes Silent Hill running animation on loading screens:
+                               // (On PSX, color values 0x00-0x7F darken the source texture's color,
+                               //  0x81-FF lighten textures (ultimately clamped to 0x1F),
+                               //  0x80 leaves source texture color unchanged, HOWEVER,
+                               //   gpu_senquack uses a simple lighting LUT whereby only the upper
+                               //   5 bits of an 8-bit color are used, so 0x80-0x87 all behave as
+                               //   0x80.
+                               // 
+                               // NOTE: I've changed all textured sprite draw commands here and
+                               //  elsewhere to use proper behavior, but left poly commands
+                               //  alone, I don't want to slow rendering down too much. (TODO)
+                               //if ((gpu_senquack.PacketBuffer.U1[0]>0x5F) && (gpu_senquack.PacketBuffer.U1[1]>0x5F) && (gpu_senquack.PacketBuffer.U1[2]>0x5F))
+                               // Strip lower 3 bits of each color and determine if lighting should be used:
+                               if ((gpu_senquack.PacketBuffer.U4[0] & 0xF8F8F8) != 0x808080)
+                                       driver_idx |= Lighting;
+                               PS driver = gpuSpriteSpanDrivers[driver_idx];
+                               gpuDrawS(packet, driver);
+                               gpu_senquack.fb_dirty = true;
+                               DO_LOG(("gpuDrawS(0x%x)\n",PRIM));
+                       }
+               } break;
+
+               case 0x68:
+               case 0x69:
+               case 0x6A:
+               case 0x6B: {          // Monochrome rectangle (1x1 dot)
+                       if (!gpu_senquack.frameskip.skipGPU)
+                       {
+                               NULL_GPU();
+                               gpu_senquack.PacketBuffer.U4[2] = 0x00010001;
+                               PT driver = gpuTileSpanDrivers[(Blending_Mode | gpu_senquack.Masking | Blending | (gpu_senquack.PixelMSB>>3)) >> 1];
+                               gpuDrawT(packet, driver);
+                               gpu_senquack.fb_dirty = true;
+                               DO_LOG(("gpuDrawT(0x%x)\n",PRIM));
+                       }
+               } break;
+
+               case 0x70:
+               case 0x71:
+               case 0x72:
+               case 0x73: {          // Monochrome rectangle (8x8)
+                       if (!gpu_senquack.frameskip.skipGPU)
+                       {
+                               NULL_GPU();
+                               gpu_senquack.PacketBuffer.U4[2] = 0x00080008;
+                               PT driver = gpuTileSpanDrivers[(Blending_Mode | gpu_senquack.Masking | Blending | (gpu_senquack.PixelMSB>>3)) >> 1];
+                               gpuDrawT(packet, driver);
+                               gpu_senquack.fb_dirty = true;
+                               DO_LOG(("gpuDrawT(0x%x)\n",PRIM));
+                       }
+               } break;
+
+               case 0x74:
+               case 0x75:
+               case 0x76:
+               case 0x77: {          // Textured rectangle (8x8)
+                       if (!gpu_senquack.frameskip.skipGPU)
+                       {
+                               NULL_GPU();
+                               gpu_senquack.PacketBuffer.U4[3] = 0x00080008;
+                               gpuSetCLUT    (gpu_senquack.PacketBuffer.U4[2] >> 16);
+                               u32 driver_idx = Blending_Mode | gpu_senquack.TEXT_MODE | gpu_senquack.Masking | Blending | (gpu_senquack.PixelMSB>>1);
+
+                               //senquack - Only color 808080h-878787h allows skipping lighting calculation:
+                               //if ((gpu_senquack.PacketBuffer.U1[0]>0x5F) && (gpu_senquack.PacketBuffer.U1[1]>0x5F) && (gpu_senquack.PacketBuffer.U1[2]>0x5F))
+                               // Strip lower 3 bits of each color and determine if lighting should be used:
+                               if ((gpu_senquack.PacketBuffer.U4[0] & 0xF8F8F8) != 0x808080)
+                                       driver_idx |= Lighting;
+                               PS driver = gpuSpriteSpanDrivers[driver_idx];
+                               gpuDrawS(packet, driver);
+                               gpu_senquack.fb_dirty = true;
+                               DO_LOG(("gpuDrawS(0x%x)\n",PRIM));
+                       }
+               } break;
+
+               case 0x78:
+               case 0x79:
+               case 0x7A:
+               case 0x7B: {          // Monochrome rectangle (16x16)
+                       if (!gpu_senquack.frameskip.skipGPU)
+                       {
+                               NULL_GPU();
+                               gpu_senquack.PacketBuffer.U4[2] = 0x00100010;
+                               PT driver = gpuTileSpanDrivers[(Blending_Mode | gpu_senquack.Masking | Blending | (gpu_senquack.PixelMSB>>3)) >> 1];
+                               gpuDrawT(packet, driver);
+                               gpu_senquack.fb_dirty = true;
+                               DO_LOG(("gpuDrawT(0x%x)\n",PRIM));
+                       }
+               } break;
+
+               case 0x7C:
+               case 0x7D:
+                       #ifdef __arm__
+                       /* Notaz 4bit sprites optimization */
+                       if ((!gpu_senquack.frameskip.skipGPU) && (!(gpu_senquack.GPU_GP1&0x180)) && (!(gpu_senquack.Masking|gpu_senquack.PixelMSB)))
+                       {
+                               gpuSetCLUT    (gpu_senquack.PacketBuffer.U4[2] >> 16);
+                               gpuDrawS16(packet);
+                               gpu_senquack.fb_dirty = true;
+                               break;
+                       }
+                       #endif
+               case 0x7E:
+               case 0x7F: {          // Textured rectangle (16x16)
+                       if (!gpu_senquack.frameskip.skipGPU)
+                       {
+                               NULL_GPU();
+                               gpu_senquack.PacketBuffer.U4[3] = 0x00100010;
+                               gpuSetCLUT    (gpu_senquack.PacketBuffer.U4[2] >> 16);
+                               u32 driver_idx = Blending_Mode | gpu_senquack.TEXT_MODE | gpu_senquack.Masking | Blending | (gpu_senquack.PixelMSB>>1);
+
+                               //senquack - Only color 808080h-878787h allows skipping lighting calculation:
+                               //if ((gpu_senquack.PacketBuffer.U1[0]>0x5F) && (gpu_senquack.PacketBuffer.U1[1]>0x5F) && (gpu_senquack.PacketBuffer.U1[2]>0x5F))
+                               // Strip lower 3 bits of each color and determine if lighting should be used:
+                               if ((gpu_senquack.PacketBuffer.U4[0] & 0xF8F8F8) != 0x808080)
+                                       driver_idx |= Lighting;
+                               PS driver = gpuSpriteSpanDrivers[driver_idx];
+                               gpuDrawS(packet, driver);
+                               gpu_senquack.fb_dirty = true;
+                               DO_LOG(("gpuDrawS(0x%x)\n",PRIM));
+                       }
+               } break;
+
+               case 0x80:          //  vid -> vid
+                       gpuMoveImage(packet);   //  prim handles updateLace && skip
+                       if ((!gpu_senquack.frameskip.skipCount) && (gpu_senquack.DisplayArea[3] == 480)) // Tekken 3 hack
+                       {
+                               if (!gpu_senquack.frameskip.skipGPU) gpu_senquack.fb_dirty = true;
+                       }
+                       else
+                       {
+                               gpu_senquack.fb_dirty = true;
+                       }
+                       DO_LOG(("gpuMoveImage(0x%x)\n",PRIM));
+                       break;
+               case 0xA0:          //  sys ->vid
+                       gpuLoadImage(packet);   //  prim handles updateLace && skip
+                       DO_LOG(("gpuLoadImage(0x%x)\n",PRIM));
+                       break;
+               case 0xC0:          //  vid -> sys
+                       gpuStoreImage(packet);  //  prim handles updateLace && skip
+                       DO_LOG(("gpuStoreImage(0x%x)\n",PRIM));
+                       break;
+               case 0xE1 ... 0xE6: { // Draw settings
+                       gpuGP0Cmd_0xEx(gpu_senquack, gpu_senquack.PacketBuffer.U4[0]);
+               } break;
+       }
+}
+#endif //!USE_GPULIB
+///////////////////////////////////////////////////////////////////////////////
+// End of code specific to non-gpulib standalone version of gpu_senquack
+///////////////////////////////////////////////////////////////////////////////
+
+#endif /* __GPU_UNAI_GPU_COMMAND_H__ */
diff --git a/plugins/gpu_senquack/gpu_fixedpoint.h b/plugins/gpu_senquack/gpu_fixedpoint.h
new file mode 100644 (file)
index 0000000..5df42cf
--- /dev/null
@@ -0,0 +1,134 @@
+/***************************************************************************
+ *   Copyright (C) 2010 PCSX4ALL Team                                      *
+ *   Copyright (C) 2010 Unai                                               *
+ *                                                                         *
+ *   This program is free software; you can redistribute it and/or modify  *
+ *   it under the terms of the GNU General Public License as published by  *
+ *   the Free Software Foundation; either version 2 of the License, or     *
+ *   (at your option) any later version.                                   *
+ *                                                                         *
+ *   This program is distributed in the hope that it will be useful,       *
+ *   but WITHOUT ANY WARRANTY; without even the implied warranty of        *
+ *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the         *
+ *   GNU General Public License for more details.                          *
+ *                                                                         *
+ *   You should have received a copy of the GNU General Public License     *
+ *   along with this program; if not, write to the                         *
+ *   Free Software Foundation, Inc.,                                       *
+ *   51 Franklin Street, Fifth Floor, Boston, MA 02111-1307 USA.           *
+ ***************************************************************************/
+
+#ifndef FIXED_H
+#define FIXED_H
+
+typedef s32 fixed;
+
+//senquack - The gpu_drhell poly routines I adapted use 22.10 fixed point,
+//           while original Unai used 16.16: (see README_senquack.txt)
+//#define FIXED_BITS 16
+#define FIXED_BITS 10
+
+#define fixed_ZERO ((fixed)0)                      // 0.0 in fixed-point
+#define fixed_ONE  ((fixed)1<<FIXED_BITS)          // 1.0 in fixed-point
+#define fixed_TWO  ((fixed)2<<FIXED_BITS)          // 2.0 in fixed-point
+#define fixed_HALF ((fixed)((1<<FIXED_BITS)>>1))   // 0.5 in fixed-point
+
+#define fixed_LOMASK ((fixed)((1<<FIXED_BITS)-1))  // mask of the low FIXED_BITS (fractional) bits
+#define fixed_HIMASK ((fixed)(~fixed_LOMASK))      // mask of everything above the fractional bits
+
+// int<->fixed conversions (plain shifts by FIXED_BITS; x2i drops the fractional bits):
+#define i2x(x) ((x)<<FIXED_BITS)
+#define x2i(x) ((x)>>FIXED_BITS)
+
+INLINE fixed FixedCeil(const fixed x)  // Round x up to a whole number, result kept in fixed-point form
+{
+       return fixed_HIMASK & (x + fixed_LOMASK);  // add (one - 1), then clear the fractional bits
+}
+
+INLINE s32 FixedCeilToInt(const fixed x)  // Round x up to a whole number, returned as a plain integer
+{
+       return x2i(x + fixed_LOMASK);  // add (one - 1), then shift the fractional bits away
+}
+
+//senquack - float<->fixed conversions:
+#define f2x(x) ((s32)((x) * (float)(1<<FIXED_BITS)))
+#define x2f(x) ((float)(x) / (float)(1<<FIXED_BITS))
+
+//senquack - floating point reciprocal:
+//NOTE: These assume x is always != 0 !!!
+#ifdef GPU_UNAI_USE_FLOATMATH
+#if defined(_MIPS_ARCH_MIPS32R2) || (__mips == 64)
+INLINE float FloatInv(const float x)  // Returns the reciprocal of x using the MIPS recip.s instruction
+{
+       float res;
+       asm("recip.s %0,%1" : "=f" (res) : "f" (x));  // output %0 = res, input %1 = x ("f" constraints)
+       return res;  // NOTE(review): recip.s accuracy may differ from 1.0f/x -- confirm acceptable
+}
+#else
+INLINE float FloatInv(const float x)  // Portable variant: plain float division
+{
+       return (1.0f / x);
+}
+#endif
+#endif
+
+///////////////////////////////////////////////////////////////////////////
+// --- BEGIN INVERSE APPROXIMATION SECTION ---
+///////////////////////////////////////////////////////////////////////////
+#ifdef GPU_UNAI_USE_INT_DIV_MULTINV
+
+//  big precision inverse table (2^TABLE_BITS entries; read by xInv(), not filled in this header).
+#define TABLE_BITS 16
+s32 s_invTable[(1<<TABLE_BITS)];  // NOTE(review): non-static definition in a header -- confirm only one TU includes it
+
+//senquack - MIPS32 happens to have same instruction/format (clz, count leading zeros):
+#if defined(__arm__) || (__mips == 32)
+INLINE u32 Log2(u32 x) { u32 res; asm("clz %0,%1" : "=r" (res) : "r" (x)); return 32-res; } // 32 minus the clz result
+#else
+INLINE u32 Log2(u32 x) { u32 i = 0; for ( ; x > 0; ++i, x >>= 1); return i - 1; } // shifts needed to clear x, minus 1
+#endif // NOTE(review): the two variants appear to differ by one for the same x -- confirm this is intended
+
+INLINE  void  xInv (const fixed _b, s32& iFactor_, s32& iShift_)
+{
+  u32 uD = (_b<0) ? -_b : _b;
+  if(uD>1)
+  {
+       u32 uLog = Log2(uD);
+    uLog = uLog>(TABLE_BITS-1) ? uLog-(TABLE_BITS-1) : 0;
+    u32 uDen = (uD>>uLog);
+    iFactor_ = s_invTable[uDen];
+    iFactor_ = (_b<0) ? -iFactor_ :iFactor_;
+    //senquack - Adapted to 22.10 fixed point (originally 16.16):
+    //iShift_  = 15+uLog;
+    iShift_  = 21+uLog;
+  }
+  else
+  {
+    iFactor_=_b;
+    iShift_ = 0;
+  }
+}
+
+INLINE  fixed xInvMulx  (const fixed _a, const s32 _iFact, const s32 _iShift)  // (_a * _iFact) >> _iShift via a 64-bit product
+{
+       #ifdef __arm__
+               s64 res;
+               asm ("smull %Q0, %R0, %1, %2" : "=&r" (res) : "r"(_a) , "r"(_iFact));  // signed 32x32->64; %Q0/%R0 = low/high word of res
+               return fixed(res>>_iShift);
+       #else
+               return fixed( ((s64)(_a)*(s64)(_iFact))>>(_iShift) );  // same computation written in plain C++
+       #endif
+}
+
+INLINE  fixed xLoDivx   (const fixed _a, const fixed _b)  // Low-precision _a / _b using the inverse-table approximation
+{
+  s32 iRecip, iRecipShift;
+  xInv(_b, iRecip, iRecipShift);               // factor/shift pair derived from _b
+  return xInvMulx(_a, iRecip, iRecipShift);    // (_a * factor) >> shift
+}
+#endif // GPU_UNAI_USE_INT_DIV_MULTINV
+///////////////////////////////////////////////////////////////////////////
+// --- END INVERSE APPROXIMATION SECTION ---
+///////////////////////////////////////////////////////////////////////////
+
+#endif  //FIXED_H
diff --git a/plugins/gpu_senquack/gpu_inner.h b/plugins/gpu_senquack/gpu_inner.h
new file mode 100644 (file)
index 0000000..8cb4bd5
--- /dev/null
@@ -0,0 +1,734 @@
+/***************************************************************************
+*   Copyright (C) 2010 PCSX4ALL Team                                      *
+*   Copyright (C) 2010 Unai                                               *
+*   Copyright (C) 2016 Senquack (dansilsby <AT> gmail <DOT> com)          *
+*                                                                         *
+*   This program is free software; you can redistribute it and/or modify  *
+*   it under the terms of the GNU General Public License as published by  *
+*   the Free Software Foundation; either version 2 of the License, or     *
+*   (at your option) any later version.                                   *
+*                                                                         *
+*   This program is distributed in the hope that it will be useful,       *
+*   but WITHOUT ANY WARRANTY; without even the implied warranty of        *
+*   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the         *
+*   GNU General Public License for more details.                          *
+*                                                                         *
+*   You should have received a copy of the GNU General Public License     *
+*   along with this program; if not, write to the                         *
+*   Free Software Foundation, Inc.,                                       *
+*   51 Franklin Street, Fifth Floor, Boston, MA 02111-1307 USA.           *
+***************************************************************************/
+
+#ifndef __GPU_UNAI_GPU_INNER_H__
+#define __GPU_UNAI_GPU_INNER_H__
+
+///////////////////////////////////////////////////////////////////////////////
+// Inner loop driver instantiation file
+
+///////////////////////////////////////////////////////////////////////////////
+//  Option Masks (CF template parameter)
+#define  CF_LIGHT     ((CF>> 0)&1) // Lighting
+#define  CF_BLEND     ((CF>> 1)&1) // Blending
+#define  CF_MASKCHECK ((CF>> 2)&1) // Mask bit check
+#define  CF_BLENDMODE ((CF>> 3)&3) // Blend mode   0..3
+#define  CF_TEXTMODE  ((CF>> 5)&3) // Texture mode 1..3 (0: texturing disabled)
+#define  CF_GOURAUD   ((CF>> 7)&1) // Gouraud shading
+#define  CF_MASKSET   ((CF>> 8)&1) // Mask bit set
+#define  CF_DITHER    ((CF>> 9)&1) // Dithering
+#define  CF_BLITMASK  ((CF>>10)&1) // blit_mask check (skip rendering pixels
+                                   //  that wouldn't end up displayed on
+                                   //  low-res screen using simple downscaler)
+
+//#ifdef __arm__
+//#ifndef ENABLE_GPU_ARMV7
+/* ARMv5 */
+//#include "gpu_inner_blend_arm5.h"
+//#else
+/* ARMv7 optimized */
+//#include "gpu_inner_blend_arm7.h"
+//#endif
+//#else
+//#include "gpu_inner_blend.h"
+//#endif
+
+#include "gpu_inner_blend.h"
+#include "gpu_inner_quantization.h"
+#include "gpu_inner_light.h"
+
+#ifdef __arm__
+#include "gpu_inner_blend_arm.h"
+#include "gpu_inner_light_arm.h"
+#define gpuBlending gpuBlendingARM
+#define gpuLightingRGB gpuLightingRGBARM
+#define gpuLightingTXT gpuLightingTXTARM
+#define gpuLightingTXTGouraud gpuLightingTXTGouraudARM
+// Non-dithering lighting and blending functions preserve uSrc
+// MSB. This saves a few operations and useless load/stores.
+#define MSB_PRESERVED (!CF_DITHER)
+#else
+#define gpuBlending gpuBlendingGeneric
+#define gpuLightingRGB gpuLightingRGBGeneric
+#define gpuLightingTXT gpuLightingTXTGeneric
+#define gpuLightingTXTGouraud gpuLightingTXTGouraudGeneric
+#define MSB_PRESERVED 0
+#endif
+
+
+// If defined, Gouraud colors are fixed-point 5.11, otherwise they are 8.16
+// This is only for debugging/verification of low-precision colors in C.
+// Low-precision Gouraud is intended for use by SIMD-optimized inner drivers
+// which get/use Gouraud colors in SIMD registers.
+//#define GPU_GOURAUD_LOW_PRECISION
+
+// How many bits of fixed-point precision GouraudColor uses
+#ifdef GPU_GOURAUD_LOW_PRECISION
+#define GPU_GOURAUD_FIXED_BITS 11
+#else
+#define GPU_GOURAUD_FIXED_BITS 16
+#endif
+
+// Used to pass Gouraud colors to gpuPixelSpanFn() (lines); fixed-point, GPU_GOURAUD_FIXED_BITS fractional bits
+struct GouraudColor {
+#ifdef GPU_GOURAUD_LOW_PRECISION
+       u16 r, g, b;                  // starting color per channel
+       s16 r_incr, g_incr, b_incr;   // signed per-channel increments -- presumably added once per pixel (TODO confirm)
+#else
+       u32 r, g, b;                  // starting color per channel
+       s32 r_incr, g_incr, b_incr;   // signed per-channel increments -- presumably added once per pixel (TODO confirm)
+#endif
+};
+
+static inline u16 gpuGouraudColor15bpp(u32 r, u32 g, u32 b)
+{
+       // Drop the fractional bits; high-precision colors are 8-bit + fractional,
+       // so they shed 3 further bits to reach 5 bits per channel.
+#ifdef GPU_GOURAUD_LOW_PRECISION
+       const u32 drop = GPU_GOURAUD_FIXED_BITS;
+#else
+       const u32 drop = GPU_GOURAUD_FIXED_BITS + 3;
+#endif
+       const u32 red = r >> drop, green = g >> drop, blue = b >> drop;
+       // Pack as red | green<<5 | blue<<10; the u16 return type truncates the result.
+       return red | (green << 5) | (blue << 10);
+}
+
+///////////////////////////////////////////////////////////////////////////////
+//  GPU Pixel span operations generator gpuPixelSpanFn<>
+//  Oct 2016: Created/adapted from old gpuPixelFn by senquack:
+//  Original gpuPixelFn was used to draw lines one pixel at a time. I wrote
+//  new line algorithms that draw lines using horizontal/vertical/diagonal
+//  spans of pixels, necessitating new pixel-drawing function that could
+//  not only render spans of pixels, but gouraud-shade them as well.
+//  This speeds up line rendering and would allow tile-rendering (untextured
+//  rectangles) to use the same set of functions. Since tiles are always
+//  monochrome, they simply wouldn't use the extra set of 32 gouraud-shaded
+//  gpuPixelSpanFn functions (TODO?).
+//
+// NOTE: While the PS1 framebuffer is 16 bit, we use 8-bit pointers here,
+//       so that pDst can be incremented directly by 'incr' parameter
+//       without having to shift it before use.
+template<int CF>
+static u8* gpuPixelSpanFn(u8* pDst, uintptr_t data, ptrdiff_t incr, size_t len)
+{
+       // Blend func can save an operation if it knows uSrc MSB is
+       //  unset. For untextured prims, this is always true.
+       const bool skip_uSrc_mask = true;
+
+       u16 col;
+       struct GouraudColor * gcPtr;
+       u32 r, g, b;
+       s32 r_incr, g_incr, b_incr;
+
+       if (CF_GOURAUD) {
+               gcPtr = (GouraudColor*)data;
+               r = gcPtr->r;  r_incr = gcPtr->r_incr;
+               g = gcPtr->g;  g_incr = gcPtr->g_incr;
+               b = gcPtr->b;  b_incr = gcPtr->b_incr;
+       } else {
+               col = (u16)data;
+       }
+
+       do {
+               if (!CF_GOURAUD)
+               {   // NO GOURAUD
+                       if (!CF_MASKCHECK && !CF_BLEND) {
+                               if (CF_MASKSET) { *(u16*)pDst = col | 0x8000; }
+                               else            { *(u16*)pDst = col;          }
+                       } else if (CF_MASKCHECK && !CF_BLEND) {
+                               if (!(*(u16*)pDst & 0x8000)) {
+                                       if (CF_MASKSET) { *(u16*)pDst = col | 0x8000; }
+                                       else            { *(u16*)pDst = col;          }
+                               }
+                       } else {
+                               uint_fast16_t uDst = *(u16*)pDst;
+                               if (CF_MASKCHECK) { if (uDst & 0x8000) goto endpixel; }
+
+                               uint_fast16_t uSrc = col;
+
+                               if (CF_BLEND)
+                                       uSrc = gpuBlending<CF_BLENDMODE, skip_uSrc_mask>(uSrc, uDst);
+
+                               if (CF_MASKSET) { *(u16*)pDst = uSrc | 0x8000; }
+                               else            { *(u16*)pDst = uSrc;          }
+                       }
+
+               } else
+               {   // GOURAUD
+
+                       if (!CF_MASKCHECK && !CF_BLEND) {
+                               col = gpuGouraudColor15bpp(r, g, b);
+                               if (CF_MASKSET) { *(u16*)pDst = col | 0x8000; }
+                               else            { *(u16*)pDst = col;          }
+                       } else if (CF_MASKCHECK && !CF_BLEND) {
+                               col = gpuGouraudColor15bpp(r, g, b);
+                               if (!(*(u16*)pDst & 0x8000)) {
+                                       if (CF_MASKSET) { *(u16*)pDst = col | 0x8000; }
+                                       else            { *(u16*)pDst = col;          }
+                               }
+                       } else {
+                               uint_fast16_t uDst = *(u16*)pDst;
+                               if (CF_MASKCHECK) { if (uDst & 0x8000) goto endpixel; }
+                               col = gpuGouraudColor15bpp(r, g, b);
+
+                               uint_fast16_t uSrc = col;
+
+                               // Blend func can save an operation if it knows uSrc MSB is
+                               //  unset. For untextured prims, this is always true.
+                               const bool skip_uSrc_mask = true;
+
+                               if (CF_BLEND)
+                                       uSrc = gpuBlending<CF_BLENDMODE, skip_uSrc_mask>(uSrc, uDst);
+
+                               if (CF_MASKSET) { *(u16*)pDst = uSrc | 0x8000; }
+                               else            { *(u16*)pDst = uSrc;          }
+                       }
+               }
+
+endpixel:
+               if (CF_GOURAUD) {
+                       r += r_incr;
+                       g += g_incr;
+                       b += b_incr;
+               }
+               pDst += incr;
+       } while (len-- > 1);
+
+       // Note from senquack: Normally, I'd prefer to write a 'do {} while (--len)'
+       //  loop, or even a for() loop, however, on MIPS platforms anything but the
+       //  'do {} while (len-- > 1)' tends to generate very unoptimal asm, with
+       //  many unneeded MULs/ADDs/branches at the ends of these functions.
+       //  If you change the loop structure above, be sure to compare the quality
+       //  of the generated code!!
+
+       if (CF_GOURAUD) {
+               gcPtr->r = r;
+               gcPtr->g = g;
+               gcPtr->b = b;
+       }
+       return pDst;
+}
+
+// Placeholder used for gpuPixelSpanDrivers slots that have no generated
+//  function: writes no pixels and hands pDst back unchanged. Logs its own
+//  name when ENABLE_GPU_LOG_SUPPORT is defined.
+static u8* PixelSpanNULL(u8* pDst, uintptr_t data, ptrdiff_t incr, size_t len)
+{
+#ifdef ENABLE_GPU_LOG_SUPPORT
+       fprintf(stdout,"PixelSpanNULL()\n");
+#endif
+       // Remaining arguments are intentionally ignored
+       (void)data;
+       (void)incr;
+       (void)len;
+       return pDst;
+}
+
+///////////////////////////////////////////////////////////////////////////////
+//  PixelSpan (lines) innerloops driver
+typedef u8* (*PSD)(u8* dst, uintptr_t data, ptrdiff_t incr, size_t len);
+
+const PSD gpuPixelSpanDrivers[64] =
+{ 
+       // Array index | 'CF' template field | Field value
+       // ------------+---------------------+----------------
+       // Bit 0       | CF_BLEND            | off (0), on (1)
+       // Bit 1       | CF_MASKCHECK        | off (0), on (1)
+       // Bit 3:2     | CF_BLENDMODE        | 0..3
+       // Bit 4       | CF_MASKSET          | off (0), on (1)
+       // Bit 5       | CF_GOURAUD          | off (0), on (1)
+       //
+       // NULL entries are ones for which blending is disabled and blend-mode
+       //  field is non-zero, which is obviously invalid.
+
+       // Flat-shaded
+       gpuPixelSpanFn<0x00<<1>,         gpuPixelSpanFn<0x01<<1>,         gpuPixelSpanFn<0x02<<1>,         gpuPixelSpanFn<0x03<<1>,
+       PixelSpanNULL,                   gpuPixelSpanFn<0x05<<1>,         PixelSpanNULL,                   gpuPixelSpanFn<0x07<<1>,
+       PixelSpanNULL,                   gpuPixelSpanFn<0x09<<1>,         PixelSpanNULL,                   gpuPixelSpanFn<0x0B<<1>,
+       PixelSpanNULL,                   gpuPixelSpanFn<0x0D<<1>,         PixelSpanNULL,                   gpuPixelSpanFn<0x0F<<1>,
+
+       // Flat-shaded + PixelMSB (CF_MASKSET)
+       gpuPixelSpanFn<(0x00<<1)|0x100>, gpuPixelSpanFn<(0x01<<1)|0x100>, gpuPixelSpanFn<(0x02<<1)|0x100>, gpuPixelSpanFn<(0x03<<1)|0x100>,
+       PixelSpanNULL,                   gpuPixelSpanFn<(0x05<<1)|0x100>, PixelSpanNULL,                   gpuPixelSpanFn<(0x07<<1)|0x100>,
+       PixelSpanNULL,                   gpuPixelSpanFn<(0x09<<1)|0x100>, PixelSpanNULL,                   gpuPixelSpanFn<(0x0B<<1)|0x100>,
+       PixelSpanNULL,                   gpuPixelSpanFn<(0x0D<<1)|0x100>, PixelSpanNULL,                   gpuPixelSpanFn<(0x0F<<1)|0x100>,
+
+       // Gouraud-shaded (CF_GOURAUD)
+       gpuPixelSpanFn<(0x00<<1)|0x80>,  gpuPixelSpanFn<(0x01<<1)|0x80>,  gpuPixelSpanFn<(0x02<<1)|0x80>,  gpuPixelSpanFn<(0x03<<1)|0x80>,
+       PixelSpanNULL,                   gpuPixelSpanFn<(0x05<<1)|0x80>,  PixelSpanNULL,                   gpuPixelSpanFn<(0x07<<1)|0x80>,
+       PixelSpanNULL,                   gpuPixelSpanFn<(0x09<<1)|0x80>,  PixelSpanNULL,                   gpuPixelSpanFn<(0x0B<<1)|0x80>,
+       PixelSpanNULL,                   gpuPixelSpanFn<(0x0D<<1)|0x80>,  PixelSpanNULL,                   gpuPixelSpanFn<(0x0F<<1)|0x80>,
+
+       // Gouraud-shaded (CF_GOURAUD) + PixelMSB (CF_MASKSET)
+       gpuPixelSpanFn<(0x00<<1)|0x180>, gpuPixelSpanFn<(0x01<<1)|0x180>, gpuPixelSpanFn<(0x02<<1)|0x180>, gpuPixelSpanFn<(0x03<<1)|0x180>,
+       PixelSpanNULL,                   gpuPixelSpanFn<(0x05<<1)|0x180>, PixelSpanNULL,                   gpuPixelSpanFn<(0x07<<1)|0x180>,
+       PixelSpanNULL,                   gpuPixelSpanFn<(0x09<<1)|0x180>, PixelSpanNULL,                   gpuPixelSpanFn<(0x0B<<1)|0x180>,
+       PixelSpanNULL,                   gpuPixelSpanFn<(0x0D<<1)|0x180>, PixelSpanNULL,                   gpuPixelSpanFn<(0x0F<<1)|0x180>
+};
+
+///////////////////////////////////////////////////////////////////////////////
+//  GPU Tiles innerloops generator
+
+template<int CF>
+static void gpuTileSpanFn(u16 *pDst, u32 count, u16 data)
+{
+       if (!CF_MASKCHECK && !CF_BLEND) {
+               if (CF_MASKSET) { data = data | 0x8000; }
+               do { *pDst++ = data; } while (--count);
+       } else if (CF_MASKCHECK && !CF_BLEND) {
+               if (CF_MASKSET) { data = data | 0x8000; }
+               do { if (!(*pDst&0x8000)) { *pDst = data; } pDst++; } while (--count);
+       } else
+       {
+               // Blend func can save an operation if it knows uSrc MSB is
+               //  unset. For untextured prims, this is always true.
+               const bool skip_uSrc_mask = true;
+
+               uint_fast16_t uSrc, uDst;
+               do
+               {
+                       if (CF_MASKCHECK || CF_BLEND) { uDst = *pDst; }
+                       if (CF_MASKCHECK) { if (uDst&0x8000) goto endtile; }
+
+                       uSrc = data;
+
+                       if (CF_BLEND)
+                               uSrc = gpuBlending<CF_BLENDMODE, skip_uSrc_mask>(uSrc, uDst);
+
+                       if (CF_MASKSET) { *pDst = uSrc | 0x8000; }
+                       else            { *pDst = uSrc;          }
+
+                       //senquack - Did not apply "Silent Hill" mask-bit fix to here.
+                       // It is hard to tell from scarce documentation available and
+                       //  lack of comments in code, but I believe the tile-span
+                       //  functions here should not bother to preserve any source MSB,
+                       //  as they are not drawing from a texture.
+endtile:
+                       pDst++;
+               }
+               while (--count);
+       }
+}
+
+// Placeholder used for gpuTileSpanDrivers slots that have no generated
+//  function: draws nothing. Logs its own name when ENABLE_GPU_LOG_SUPPORT
+//  is defined.
+static void TileNULL(u16 *pDst, u32 count, u16 data)
+{
+#ifdef ENABLE_GPU_LOG_SUPPORT
+       fprintf(stdout,"TileNULL()\n");
+#endif
+       // All arguments are intentionally ignored
+       (void)pDst;
+       (void)count;
+       (void)data;
+}
+
+///////////////////////////////////////////////////////////////////////////////
+//  Tiles innerloops driver
+// Signature shared by every tile-span routine in the table below:
+//  (destination pixels, pixel count, 16-bit color)
+typedef void (*PT)(u16 *pDst, u32 count, u16 data);
+
+// Template instantiation helper macros
+//  TI(cf)      - the gpuTileSpanFn instantiation for template value 'cf'
+//  TN          - the TileNULL placeholder
+//  TIBLOCK(ub) - 16 consecutive table entries; entry k of a block is either
+//                TI((ub)|(k<<1)) or TN. TN occupies k = 4,6,8,10,12,14.
+//                NOTE(review): presumably these are the 'blending off,
+//                blend-mode non-zero' combinations, as described for
+//                gpuPixelSpanDrivers -- confirm against the CF_* macros.
+#define TI(cf) gpuTileSpanFn<(cf)>
+#define TN     TileNULL
+#define TIBLOCK(ub) \
+       TI((ub)|0x00), TI((ub)|0x02), TI((ub)|0x04), TI((ub)|0x06), \
+       TN,            TI((ub)|0x0a), TN,            TI((ub)|0x0e), \
+       TN,            TI((ub)|0x12), TN,            TI((ub)|0x16), \
+       TN,            TI((ub)|0x1a), TN,            TI((ub)|0x1e)
+
+// Two blocks of 16: the first with bit 8 of the template value clear,
+//  the second with it set
+const PT gpuTileSpanDrivers[32] = {
+       TIBLOCK(0<<8), TIBLOCK(1<<8)
+};
+
+// The helper names are reused by the tables further down
+#undef TI
+#undef TN
+#undef TIBLOCK
+
+
+///////////////////////////////////////////////////////////////////////////////
+//  GPU Sprites innerloops generator
+
+template<int CF>
+static void gpuSpriteSpanFn(u16 *pDst, u32 count, u8* pTxt, u32 u0)
+{
+       // Blend func can save an operation if it knows uSrc MSB is unset.
+       //  Untextured prims can always skip (source color always comes with MSB=0).
+       //  For textured prims, the generic lighting funcs always return it unset. (bonus!)
+       const bool skip_uSrc_mask = MSB_PRESERVED ? (!CF_TEXTMODE) : (!CF_TEXTMODE) || CF_LIGHT;
+
+       uint_fast16_t uSrc, uDst, srcMSB;
+       bool should_blend;
+       u32 u0_mask = gpu_senquack.TextureWindow[2];
+
+       u8 r5, g5, b5;
+       if (CF_LIGHT) {
+               r5 = gpu_senquack.r5;
+               g5 = gpu_senquack.g5;
+               b5 = gpu_senquack.b5;
+       }
+
+       if (CF_TEXTMODE==3) {
+               // Texture is accessed byte-wise, so adjust mask if 16bpp
+               u0_mask <<= 1;
+       }
+
+       const u16 *CBA_; if (CF_TEXTMODE!=3) CBA_ = gpu_senquack.CBA;
+
+       do
+       {
+               if (CF_MASKCHECK || CF_BLEND) { uDst = *pDst; }
+               if (CF_MASKCHECK) if (uDst&0x8000) { goto endsprite; }
+
+               if (CF_TEXTMODE==1) {  //  4bpp (CLUT)
+                       u8 rgb = pTxt[(u0 & u0_mask)>>1];
+                       uSrc = CBA_[(rgb>>((u0&1)<<2))&0xf];
+               }
+               if (CF_TEXTMODE==2) {  //  8bpp (CLUT)
+                       uSrc = CBA_[pTxt[u0 & u0_mask]];
+               }
+               if (CF_TEXTMODE==3) {  // 16bpp
+                       uSrc = *(u16*)(&pTxt[u0 & u0_mask]);
+               }
+
+               if (!uSrc) goto endsprite;
+
+               //senquack - save source MSB, as blending or lighting macros will not
+               //           (Silent Hill gray rectangles mask bit bug)
+               if (CF_BLEND || CF_LIGHT) srcMSB = uSrc & 0x8000;
+               
+               if (CF_LIGHT)
+                       uSrc = gpuLightingTXT(uSrc, r5, g5, b5);
+
+               should_blend = MSB_PRESERVED ? uSrc & 0x8000 : srcMSB;
+
+               if (CF_BLEND && should_blend)
+                       uSrc = gpuBlending<CF_BLENDMODE, skip_uSrc_mask>(uSrc, uDst);
+
+               if (CF_MASKSET)                                    { *pDst = uSrc | 0x8000; }
+               else if (!MSB_PRESERVED && (CF_BLEND || CF_LIGHT)) { *pDst = uSrc | srcMSB; }
+               else                                               { *pDst = uSrc;          }
+
+endsprite:
+               u0 += (CF_TEXTMODE==3) ? 2 : 1;
+               pDst++;
+       }
+       while (--count);
+}
+
+// Placeholder used for gpuSpriteSpanDrivers slots that have no generated
+//  function: draws nothing. Logs its own name when ENABLE_GPU_LOG_SUPPORT
+//  is defined.
+static void SpriteNULL(u16 *pDst, u32 count, u8* pTxt, u32 u0)
+{
+#ifdef ENABLE_GPU_LOG_SUPPORT
+       fprintf(stdout,"SpriteNULL()\n");
+#endif
+       // All arguments are intentionally ignored
+       (void)pDst;
+       (void)count;
+       (void)pTxt;
+       (void)u0;
+}
+
+///////////////////////////////////////////////////////////////////////////////
+
+///////////////////////////////////////////////////////////////////////////////
+//  Sprite innerloops driver
+// Signature shared by every sprite-span routine in the table below:
+//  (destination pixels, pixel count, texture bytes, starting texture offset)
+typedef void (*PS)(u16 *pDst, u32 count, u8* pTxt, u32 u0);
+
+// Template instantiation helper macros
+//  TI(cf)      - the gpuSpriteSpanFn instantiation for template value 'cf'
+//  TN          - the SpriteNULL placeholder
+//  TIBLOCK(ub) - 128 consecutive table entries (16 rows of 8); entry k of a
+//                block is either TI((ub)|k) or TN. Entries 0x00-0x1f are
+//                all TN.
+//                NOTE(review): presumably 0x00-0x1f are the untextured
+//                (CF_TEXTMODE==0) combinations, which sprites never use --
+//                confirm against the CF_* macros.
+#define TI(cf) gpuSpriteSpanFn<(cf)>
+#define TN     SpriteNULL
+#define TIBLOCK(ub) \
+       TN,            TN,            TN,            TN,            TN,            TN,            TN,            TN,            \
+       TN,            TN,            TN,            TN,            TN,            TN,            TN,            TN,            \
+       TN,            TN,            TN,            TN,            TN,            TN,            TN,            TN,            \
+       TN,            TN,            TN,            TN,            TN,            TN,            TN,            TN,            \
+       TI((ub)|0x20), TI((ub)|0x21), TI((ub)|0x22), TI((ub)|0x23), TI((ub)|0x24), TI((ub)|0x25), TI((ub)|0x26), TI((ub)|0x27), \
+       TN,            TN,            TI((ub)|0x2a), TI((ub)|0x2b), TN,            TN,            TI((ub)|0x2e), TI((ub)|0x2f), \
+       TN,            TN,            TI((ub)|0x32), TI((ub)|0x33), TN,            TN,            TI((ub)|0x36), TI((ub)|0x37), \
+       TN,            TN,            TI((ub)|0x3a), TI((ub)|0x3b), TN,            TN,            TI((ub)|0x3e), TI((ub)|0x3f), \
+       TI((ub)|0x40), TI((ub)|0x41), TI((ub)|0x42), TI((ub)|0x43), TI((ub)|0x44), TI((ub)|0x45), TI((ub)|0x46), TI((ub)|0x47), \
+       TN,            TN,            TI((ub)|0x4a), TI((ub)|0x4b), TN,            TN,            TI((ub)|0x4e), TI((ub)|0x4f), \
+       TN,            TN,            TI((ub)|0x52), TI((ub)|0x53), TN,            TN,            TI((ub)|0x56), TI((ub)|0x57), \
+       TN,            TN,            TI((ub)|0x5a), TI((ub)|0x5b), TN,            TN,            TI((ub)|0x5e), TI((ub)|0x5f), \
+       TI((ub)|0x60), TI((ub)|0x61), TI((ub)|0x62), TI((ub)|0x63), TI((ub)|0x64), TI((ub)|0x65), TI((ub)|0x66), TI((ub)|0x67), \
+       TN,            TN,            TI((ub)|0x6a), TI((ub)|0x6b), TN,            TN,            TI((ub)|0x6e), TI((ub)|0x6f), \
+       TN,            TN,            TI((ub)|0x72), TI((ub)|0x73), TN,            TN,            TI((ub)|0x76), TI((ub)|0x77), \
+       TN,            TN,            TI((ub)|0x7a), TI((ub)|0x7b), TN,            TN,            TI((ub)|0x7e), TI((ub)|0x7f)
+
+// Two blocks of 128: the first with bit 8 of the template value clear,
+//  the second with it set
+const PS gpuSpriteSpanDrivers[256] = {
+       TIBLOCK(0<<8), TIBLOCK(1<<8)
+};
+
+// The helper names are reused by the polygon table further down
+#undef TI
+#undef TN
+#undef TIBLOCK
+
+///////////////////////////////////////////////////////////////////////////////
+//  GPU Polygon innerloops generator
+
+//senquack - Newer version with following changes:
+//           * Adapted to work with new poly routines in gpu_raster_polygon.h
+//             adapted from DrHell GPU. They are less glitchy and use 22.10
+//             fixed-point instead of original UNAI's 16.16.
+//           * Texture coordinates are no longer packed together into one
+//             unsigned int. This seems to lose too much accuracy (they each
+//             end up being only 8.7 fixed-point that way) and pixel-dropouts
+//             were noticeable both with original code and current DrHell
+//             adaptations. An example would be the sky in NFS3. Now, they are
+//             stored in separate ints, using separate masks.
+//           * Function is no longer INLINE, as it was always called
+//             through a function pointer.
+//           * Function now ensures the mask bit of source texture is preserved
+//             across calls to blending functions (Silent Hill rectangles fix)
+//           * November 2016: Large refactoring of blending/lighting when
+//             JohnnyF added dithering. See gpu_inner_quantization.h and
+//             relevant blend/light headers.
+// (see README_senquack.txt)
+template<int CF>
+static void gpuPolySpanFn(const gpu_senquack_t &gpu_senquack, u16 *pDst, u32 count)
+{
+       // Blend func can save an operation if it knows uSrc MSB is unset.
+       //  Untextured prims can always skip this (src color MSB is always 0).
+       //  For textured prims, the generic lighting funcs always return it unset. (bonus!)
+       const bool skip_uSrc_mask = MSB_PRESERVED ? (!CF_TEXTMODE) : (!CF_TEXTMODE) || CF_LIGHT;
+       bool should_blend;
+
+       u32 bMsk; if (CF_BLITMASK) bMsk = gpu_senquack.blit_mask;
+
+       if (!CF_TEXTMODE)
+       {
+               if (!CF_GOURAUD)
+               {
+                       // UNTEXTURED, NO GOURAUD
+                       const u16 pix15 = gpu_senquack.PixelData;
+                       do {
+                               uint_fast16_t uSrc, uDst;
+
+                               // NOTE: Don't enable CF_BLITMASK  pixel skipping (speed hack)
+                               //  on untextured polys. It seems to do more harm than good: see
+                               //  gravestone text at end of Medieval intro sequence. -senquack
+                               //if (CF_BLITMASK) { if ((bMsk>>((((uintptr_t)pDst)>>1)&7))&1) { goto endpolynotextnogou; } }
+
+                               if (CF_BLEND || CF_MASKCHECK) uDst = *pDst;
+                               if (CF_MASKCHECK) { if (uDst&0x8000) { goto endpolynotextnogou; } }
+
+                               uSrc = pix15;
+
+                               if (CF_BLEND)
+                                       uSrc = gpuBlending<CF_BLENDMODE, skip_uSrc_mask>(uSrc, uDst);
+
+                               if (CF_MASKSET) { *pDst = uSrc | 0x8000; }
+                               else            { *pDst = uSrc;          }
+
+endpolynotextnogou:
+                               pDst++;
+                       } while(--count);
+               }
+               else
+               {
+                       // UNTEXTURED, GOURAUD
+                       u32 l_gCol = gpu_senquack.gCol;
+                       u32 l_gInc = gpu_senquack.gInc;
+
+                       do {
+                               uint_fast16_t uDst, uSrc;
+
+                               // See note in above loop regarding CF_BLITMASK
+                               //if (CF_BLITMASK) { if ((bMsk>>((((uintptr_t)pDst)>>1)&7))&1) goto endpolynotextgou; }
+
+                               if (CF_BLEND || CF_MASKCHECK) uDst = *pDst;
+                               if (CF_MASKCHECK) { if (uDst&0x8000) goto endpolynotextgou; }
+
+                               if (CF_DITHER) {
+                                       // GOURAUD, DITHER
+
+                                       u32 uSrc24 = gpuLightingRGB24(l_gCol);
+                                       if (CF_BLEND)
+                                               uSrc24 = gpuBlending24<CF_BLENDMODE>(uSrc24, uDst);
+                                       uSrc = gpuColorQuantization24<CF_DITHER>(uSrc24, pDst);
+                               } else {
+                                       // GOURAUD, NO DITHER
+
+                                       uSrc = gpuLightingRGB(l_gCol);
+
+                                       if (CF_BLEND)
+                                               uSrc = gpuBlending<CF_BLENDMODE, skip_uSrc_mask>(uSrc, uDst);
+                               }
+
+                               if (CF_MASKSET) { *pDst = uSrc | 0x8000; }
+                               else            { *pDst = uSrc;          }
+
+endpolynotextgou:
+                               pDst++;
+                               l_gCol += l_gInc;
+                       }
+                       while (--count);
+               }
+       }
+       else
+       {
+               // TEXTURED
+
+               uint_fast16_t uDst, uSrc, srcMSB;
+
+               //senquack - note: original UNAI code had gpu_senquack.{u4/v4} packed into
+               // one 32-bit unsigned int, but this proved to lose too much accuracy
+               // (pixel drouputs noticeable in NFS3 sky), so now are separate vars.
+               u32 l_u_msk = gpu_senquack.u_msk;     u32 l_v_msk = gpu_senquack.v_msk;
+               u32 l_u = gpu_senquack.u & l_u_msk;   u32 l_v = gpu_senquack.v & l_v_msk;
+               s32 l_u_inc = gpu_senquack.u_inc;     s32 l_v_inc = gpu_senquack.v_inc;
+
+               const u16* TBA_ = gpu_senquack.TBA;
+               const u16* CBA_; if (CF_TEXTMODE!=3) CBA_ = gpu_senquack.CBA;
+
+               u8 r5, g5, b5;
+               u8 r8, g8, b8;
+
+               u32 l_gInc, l_gCol;
+
+               if (CF_LIGHT) {
+                       if (CF_GOURAUD) {
+                               l_gInc = gpu_senquack.gInc;
+                               l_gCol = gpu_senquack.gCol;
+                       } else {
+                               if (CF_DITHER) {
+                                       r8 = gpu_senquack.r8;
+                                       g8 = gpu_senquack.g8;
+                                       b8 = gpu_senquack.b8;
+                               } else {
+                                       r5 = gpu_senquack.r5;
+                                       g5 = gpu_senquack.g5;
+                                       b5 = gpu_senquack.b5;
+                               }
+                       }
+               }
+
+               do
+               {
+                       if (CF_BLITMASK) { if ((bMsk>>((((uintptr_t)pDst)>>1)&7))&1) goto endpolytext; }
+                       if (CF_MASKCHECK || CF_BLEND) { uDst = *pDst; }
+                       if (CF_MASKCHECK) if (uDst&0x8000) { goto endpolytext; }
+
+                       //senquack - adapted to work with new 22.10 fixed point routines:
+                       //           (UNAI originally used 16.16)
+                       if (CF_TEXTMODE==1) {  //  4bpp (CLUT)
+                               u32 tu=(l_u>>10);
+                               u32 tv=(l_v<<1)&(0xff<<11);
+                               u8 rgb=((u8*)TBA_)[tv+(tu>>1)];
+                               uSrc=CBA_[(rgb>>((tu&1)<<2))&0xf];
+                               if (!uSrc) goto endpolytext;
+                       }
+                       if (CF_TEXTMODE==2) {  //  8bpp (CLUT)
+                               uSrc = CBA_[(((u8*)TBA_)[(l_u>>10)+((l_v<<1)&(0xff<<11))])];
+                               if (!uSrc) goto endpolytext;
+                       }
+                       if (CF_TEXTMODE==3) {  // 16bpp
+                               uSrc = TBA_[(l_u>>10)+((l_v)&(0xff<<10))];
+                               if (!uSrc) goto endpolytext;
+                       }
+
+                       // Save source MSB, as blending or lighting will not (Silent Hill)
+                       if (CF_BLEND || CF_LIGHT) srcMSB = uSrc & 0x8000;
+
+                       // When textured, only dither when LIGHT (texture blend) is enabled
+                       // LIGHT &&  BLEND => dither
+                       // LIGHT && !BLEND => dither
+                       //!LIGHT &&  BLEND => no dither
+                       //!LIGHT && !BLEND => no dither
+
+                       if (CF_DITHER && CF_LIGHT) {
+                               u32 uSrc24;
+                               if ( CF_GOURAUD)
+                                       uSrc24 = gpuLightingTXT24Gouraud(uSrc, l_gCol);
+                               if (!CF_GOURAUD)
+                                       uSrc24 = gpuLightingTXT24(uSrc, r8, g8, b8);
+
+                               if (CF_BLEND && srcMSB)
+                                       uSrc24 = gpuBlending24<CF_BLENDMODE>(uSrc24, uDst);
+
+                               uSrc = gpuColorQuantization24<CF_DITHER>(uSrc24, pDst);
+                       } else
+                       {
+                               if (CF_LIGHT) {
+                                       if ( CF_GOURAUD)
+                                               uSrc = gpuLightingTXTGouraud(uSrc, l_gCol);
+                                       if (!CF_GOURAUD)
+                                               uSrc = gpuLightingTXT(uSrc, r5, g5, b5);
+                               }
+
+                               should_blend = MSB_PRESERVED ? uSrc & 0x8000 : srcMSB;
+                               if (CF_BLEND && should_blend)
+                                       uSrc = gpuBlending<CF_BLENDMODE, skip_uSrc_mask>(uSrc, uDst);
+                       }
+
+                       if (CF_MASKSET)                                    { *pDst = uSrc | 0x8000; }
+                       else if (!MSB_PRESERVED && (CF_BLEND || CF_LIGHT)) { *pDst = uSrc | srcMSB; }
+                       else                                               { *pDst = uSrc;          }
+endpolytext:
+                       pDst++;
+                       l_u = (l_u + l_u_inc) & l_u_msk;
+                       l_v = (l_v + l_v_inc) & l_v_msk;
+                       if (CF_LIGHT && CF_GOURAUD) l_gCol += l_gInc;
+               }
+               while (--count);
+       }
+}
+
+// Placeholder used for gpuPolySpanDrivers slots that have no generated
+//  function: draws nothing. Logs its own name when ENABLE_GPU_LOG_SUPPORT
+//  is defined.
+static void PolyNULL(const gpu_senquack_t &gpu_senquack, u16 *pDst, u32 count)
+{
+#ifdef ENABLE_GPU_LOG_SUPPORT
+       fprintf(stdout,"PolyNULL()\n");
+#endif
+       // All arguments are intentionally ignored
+       (void)gpu_senquack;
+       (void)pDst;
+       (void)count;
+}
+
+///////////////////////////////////////////////////////////////////////////////
+//  Polygon innerloops driver
+// Signature shared by every polygon-span routine in the table below:
+//  (renderer state, destination pixels, pixel count)
+typedef void (*PP)(const gpu_senquack_t &gpu_senquack, u16 *pDst, u32 count);
+
+// Template instantiation helper macros
+//  TI(cf)      - the gpuPolySpanFn instantiation for template value 'cf'
+//  TN          - the PolyNULL placeholder
+//  TIBLOCK(ub) - 256 consecutive table entries (32 rows of 8); entry k of a
+//                block is either TI((ub)|k) or TN.
+//                NOTE(review): the TN slots presumably mark flag
+//                combinations that are never requested -- confirm against
+//                the CF_* macros and the code that indexes this table.
+#define TI(cf) gpuPolySpanFn<(cf)>
+#define TN     PolyNULL
+#define TIBLOCK(ub) \
+       TI((ub)|0x00), TI((ub)|0x01), TI((ub)|0x02), TI((ub)|0x03), TI((ub)|0x04), TI((ub)|0x05), TI((ub)|0x06), TI((ub)|0x07), \
+       TN,            TN,            TI((ub)|0x0a), TI((ub)|0x0b), TN,            TN,            TI((ub)|0x0e), TI((ub)|0x0f), \
+       TN,            TN,            TI((ub)|0x12), TI((ub)|0x13), TN,            TN,            TI((ub)|0x16), TI((ub)|0x17), \
+       TN,            TN,            TI((ub)|0x1a), TI((ub)|0x1b), TN,            TN,            TI((ub)|0x1e), TI((ub)|0x1f), \
+       TI((ub)|0x20), TI((ub)|0x21), TI((ub)|0x22), TI((ub)|0x23), TI((ub)|0x24), TI((ub)|0x25), TI((ub)|0x26), TI((ub)|0x27), \
+       TN,            TN,            TI((ub)|0x2a), TI((ub)|0x2b), TN,            TN,            TI((ub)|0x2e), TI((ub)|0x2f), \
+       TN,            TN,            TI((ub)|0x32), TI((ub)|0x33), TN,            TN,            TI((ub)|0x36), TI((ub)|0x37), \
+       TN,            TN,            TI((ub)|0x3a), TI((ub)|0x3b), TN,            TN,            TI((ub)|0x3e), TI((ub)|0x3f), \
+       TI((ub)|0x40), TI((ub)|0x41), TI((ub)|0x42), TI((ub)|0x43), TI((ub)|0x44), TI((ub)|0x45), TI((ub)|0x46), TI((ub)|0x47), \
+       TN,            TN,            TI((ub)|0x4a), TI((ub)|0x4b), TN,            TN,            TI((ub)|0x4e), TI((ub)|0x4f), \
+       TN,            TN,            TI((ub)|0x52), TI((ub)|0x53), TN,            TN,            TI((ub)|0x56), TI((ub)|0x57), \
+       TN,            TN,            TI((ub)|0x5a), TI((ub)|0x5b), TN,            TN,            TI((ub)|0x5e), TI((ub)|0x5f), \
+       TI((ub)|0x60), TI((ub)|0x61), TI((ub)|0x62), TI((ub)|0x63), TI((ub)|0x64), TI((ub)|0x65), TI((ub)|0x66), TI((ub)|0x67), \
+       TN,            TN,            TI((ub)|0x6a), TI((ub)|0x6b), TN,            TN,            TI((ub)|0x6e), TI((ub)|0x6f), \
+       TN,            TN,            TI((ub)|0x72), TI((ub)|0x73), TN,            TN,            TI((ub)|0x76), TI((ub)|0x77), \
+       TN,            TN,            TI((ub)|0x7a), TI((ub)|0x7b), TN,            TN,            TI((ub)|0x7e), TI((ub)|0x7f), \
+       TN,            TI((ub)|0x81), TN,            TI((ub)|0x83), TN,            TI((ub)|0x85), TN,            TI((ub)|0x87), \
+       TN,            TN,            TN,            TI((ub)|0x8b), TN,            TN,            TN,            TI((ub)|0x8f), \
+       TN,            TN,            TN,            TI((ub)|0x93), TN,            TN,            TN,            TI((ub)|0x97), \
+       TN,            TN,            TN,            TI((ub)|0x9b), TN,            TN,            TN,            TI((ub)|0x9f), \
+       TN,            TI((ub)|0xa1), TN,            TI((ub)|0xa3), TN,            TI((ub)|0xa5), TN,            TI((ub)|0xa7), \
+       TN,            TN,            TN,            TI((ub)|0xab), TN,            TN,            TN,            TI((ub)|0xaf), \
+       TN,            TN,            TN,            TI((ub)|0xb3), TN,            TN,            TN,            TI((ub)|0xb7), \
+       TN,            TN,            TN,            TI((ub)|0xbb), TN,            TN,            TN,            TI((ub)|0xbf), \
+       TN,            TI((ub)|0xc1), TN,            TI((ub)|0xc3), TN,            TI((ub)|0xc5), TN,            TI((ub)|0xc7), \
+       TN,            TN,            TN,            TI((ub)|0xcb), TN,            TN,            TN,            TI((ub)|0xcf), \
+       TN,            TN,            TN,            TI((ub)|0xd3), TN,            TN,            TN,            TI((ub)|0xd7), \
+       TN,            TN,            TN,            TI((ub)|0xdb), TN,            TN,            TN,            TI((ub)|0xdf), \
+       TN,            TI((ub)|0xe1), TN,            TI((ub)|0xe3), TN,            TI((ub)|0xe5), TN,            TI((ub)|0xe7), \
+       TN,            TN,            TN,            TI((ub)|0xeb), TN,            TN,            TN,            TI((ub)|0xef), \
+       TN,            TN,            TN,            TI((ub)|0xf3), TN,            TN,            TN,            TI((ub)|0xf7), \
+       TN,            TN,            TN,            TI((ub)|0xfb), TN,            TN,            TN,            TI((ub)|0xff)
+
+// Eight blocks of 256, one for each value (0..7) of bits 10:8 of the
+//  template value
+const PP gpuPolySpanDrivers[2048] = {
+       TIBLOCK(0<<8), TIBLOCK(1<<8), TIBLOCK(2<<8), TIBLOCK(3<<8),
+       TIBLOCK(4<<8), TIBLOCK(5<<8), TIBLOCK(6<<8), TIBLOCK(7<<8)
+};
+
+#undef TI
+#undef TN
+#undef TIBLOCK
+
+#endif /* __GPU_UNAI_GPU_INNER_H__ */
diff --git a/plugins/gpu_senquack/gpu_inner_blend.h b/plugins/gpu_senquack/gpu_inner_blend.h
new file mode 100644 (file)
index 0000000..febc7ed
--- /dev/null
@@ -0,0 +1,188 @@
+/***************************************************************************
+*   Copyright (C) 2010 PCSX4ALL Team                                      *
+*   Copyright (C) 2010 Unai                                               *
+*                                                                         *
+*   This program is free software; you can redistribute it and/or modify  *
+*   it under the terms of the GNU General Public License as published by  *
+*   the Free Software Foundation; either version 2 of the License, or     *
+*   (at your option) any later version.                                   *
+*                                                                         *
+*   This program is distributed in the hope that it will be useful,       *
+*   but WITHOUT ANY WARRANTY; without even the implied warranty of        *
+*   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the         *
+*   GNU General Public License for more details.                          *
+*                                                                         *
+*   You should have received a copy of the GNU General Public License     *
+*   along with this program; if not, write to the                         *
+*   Free Software Foundation, Inc.,                                       *
+*   51 Franklin Street, Fifth Floor, Boston, MA 02111-1307 USA.           *
+***************************************************************************/
+
+#ifndef _OP_BLEND_H_
+#define _OP_BLEND_H_
+
+//  GPU Blending operations functions
+
+////////////////////////////////////////////////////////////////////////////////
+// Blend bgr555 color in 'uSrc' (foreground) with bgr555 color
+//  in 'uDst' (background), returning resulting color.
+//
+// INPUT:
+//  'uSrc','uDst' input: -bbbbbgggggrrrrr
+//                       ^ bit 16
+// OUTPUT:
+//           u16 output: 0bbbbbgggggrrrrr
+//                       ^ bit 16
+// RETURNS:
+// Where '0' is zero-padding, and '-' is don't care
+////////////////////////////////////////////////////////////////////////////////
+template <int BLENDMODE, bool SKIP_USRC_MSB_MASK>
+GPU_INLINE uint_fast16_t gpuBlendingGeneric(uint_fast16_t uSrc, uint_fast16_t uDst)
+{
+       // These use Blargg's bitwise modulo-clamping:
+       //  http://blargg.8bitalley.com/info/rgb_mixing.html
+       //  http://blargg.8bitalley.com/info/rgb_clamped_add.html
+       //  http://blargg.8bitalley.com/info/rgb_clamped_sub.html
+
+       uint_fast16_t mix;
+
+       // 0.5 x Back + 0.5 x Forward
+       if (BLENDMODE==0) {
+#ifdef GPU_UNAI_USE_ACCURATE_BLENDING
+               // Slower, but more accurate (doesn't lose LSB data)
+               uDst &= 0x7fff;
+               if (!SKIP_USRC_MSB_MASK)
+                       uSrc &= 0x7fff;
+               mix = ((uSrc + uDst) - ((uSrc ^ uDst) & 0x0421)) >> 1;
+#else
+               mix = ((uDst & 0x7bde) + (uSrc & 0x7bde)) >> 1;
+#endif
+       }
+
+       // 1.0 x Back + 1.0 x Forward
+       if (BLENDMODE==1) {
+               uDst &= 0x7fff;
+               if (!SKIP_USRC_MSB_MASK)
+                       uSrc &= 0x7fff;
+               u32 sum      = uSrc + uDst;
+               u32 low_bits = (uSrc ^ uDst) & 0x0421;
+               u32 carries  = (sum - low_bits) & 0x8420;
+               u32 modulo   = sum - carries;
+               u32 clamp    = carries - (carries >> 5);
+               mix = modulo | clamp;
+       }
+
+       // 1.0 x Back - 1.0 x Forward
+       if (BLENDMODE==2) {
+               uDst &= 0x7fff;
+               if (!SKIP_USRC_MSB_MASK)
+                       uSrc &= 0x7fff;
+               u32 diff     = uDst - uSrc + 0x8420;
+               u32 low_bits = (uDst ^ uSrc) & 0x8420;
+               u32 borrows  = (diff - low_bits) & 0x8420;
+               u32 modulo   = diff - borrows;
+               u32 clamp    = borrows - (borrows >> 5);
+               mix = modulo & clamp;
+       }
+
+       // 1.0 x Back + 0.25 x Forward
+       if (BLENDMODE==3) {
+               uDst &= 0x7fff;
+               uSrc = ((uSrc >> 2) & 0x1ce7);
+               u32 sum      = uSrc + uDst;
+               u32 low_bits = (uSrc ^ uDst) & 0x0421;
+               u32 carries  = (sum - low_bits) & 0x8420;
+               u32 modulo   = sum - carries;
+               u32 clamp    = carries - (carries >> 5);
+               mix = modulo | clamp;
+       }
+
+       return mix;
+}
+
+
+////////////////////////////////////////////////////////////////////////////////
+// gpuGetRGB24: convert bgr555 color in 'uSrc' to padded u32 5.4:5.4:5.4 bgr
+//  fixed-pt color triplet suitable for use with HQ 24-bit quantization.
+// INPUT:
+//       'uSrc' input: -bbbbbgggggrrrrr
+//                     ^ bit 15
+// RETURNS:
+//         u32 output: 000bbbbbXXXX0gggggXXXX0rrrrrXXXX
+//                     ^ bit 31
+// Where 'X' are fixed-pt bits, '0' is zero-padding, and '-' is don't care
+////////////////////////////////////////////////////////////////////////////////
+GPU_INLINE u32 gpuGetRGB24(uint_fast16_t uSrc)
+{
+       const u32 blue  = (uSrc & 0x7C00) << 14;  // bits 10..14 -> 24..28
+       const u32 green = (uSrc & 0x03E0) <<  9;  // bits  5..9  -> 14..18
+       const u32 red   = (uSrc & 0x001F) <<  4;  // bits  0..4  ->  4..8
+       return blue | green | red;
+}
+
+
+////////////////////////////////////////////////////////////////////////////////
+// Blend padded u32 5.4:5.4:5.4 bgr fixed-pt color triplet in 'uSrc24'
+//  (foreground color) with bgr555 color in 'uDst' (background color),
+//  returning the resulting u32 5.4:5.4:5.4 color.
+//
+// INPUT:
+//     'uSrc24' input: 000bbbbbXXXX0gggggXXXX0rrrrrXXXX
+//                     ^ bit 31
+//       'uDst' input: -bbbbbgggggrrrrr
+//                     ^ bit 16
+// RETURNS:
+//         u32 output: 000bbbbbXXXX0gggggXXXX0rrrrrXXXX
+//                     ^ bit 31
+// Where 'X' are fixed-pt bits, '0' is zero-padding, and '-' is don't care
+////////////////////////////////////////////////////////////////////////////////
+template <int BLENDMODE>
+GPU_INLINE u32 gpuBlending24(u32 uSrc24, uint_fast16_t uDst)
+{
+       // These use techniques adapted from Blargg's techniques mentioned in
+       //  in gpuBlending() comments above. Not as much bitwise trickery is
+       //  necessary because of presence of 0 padding in uSrc24 format.
+
+       u32 uDst24 = gpuGetRGB24(uDst);
+       u32 mix;
+
+       // 0.5 x Back + 0.5 x Forward
+       if (BLENDMODE==0) {
+               const u32 uMsk = 0x1FE7F9FE;
+               // Only need to mask LSBs of uSrc24, uDst24's LSBs are 0 already
+               mix = (uDst24 + (uSrc24 & uMsk)) >> 1;
+       }
+
+       // 1.0 x Back + 1.0 x Forward
+       if (BLENDMODE==1) {
+               u32 sum     = uSrc24 + uDst24;
+               u32 carries = sum & 0x20080200;
+               u32 modulo  = sum - carries;
+               u32 clamp   = carries - (carries >> 9);
+               mix = modulo | clamp;
+       }
+
+       // 1.0 x Back - 1.0 x Forward
+       if (BLENDMODE==2) {
+               // Insert ones in 0-padded borrow slot of color to be subtracted from
+               uDst24 |= 0x20080200;
+               u32 diff    = uDst24 - uSrc24;
+               u32 borrows = diff & 0x20080200;
+               u32 clamp   = borrows - (borrows >> 9);
+               mix = diff & clamp;
+       }
+
+       // 1.0 x Back + 0.25 x Forward
+       if (BLENDMODE==3) {
+               uSrc24 = (uSrc24 & 0x1FC7F1FC) >> 2;
+               u32 sum     = uSrc24 + uDst24;
+               u32 carries = sum & 0x20080200;
+               u32 modulo  = sum - carries;
+               u32 clamp   = carries - (carries >> 9);
+               mix = modulo | clamp;
+       }
+
+       return mix;
+}
+
+#endif  //_OP_BLEND_H_
diff --git a/plugins/gpu_senquack/gpu_inner_blend_arm.h b/plugins/gpu_senquack/gpu_inner_blend_arm.h
new file mode 100644 (file)
index 0000000..6413527
--- /dev/null
@@ -0,0 +1,103 @@
+#ifndef _OP_BLEND_ARM_H_
+#define _OP_BLEND_ARM_H_
+
+////////////////////////////////////////////////////////////////////////////////
+// ARM inline-asm blend of bgr555 color in 'uSrc' (foreground) with bgr555
+//  color in 'uDst' (background), returning resulting color.
+// BLENDMODE: 0: 0.5*Back+0.5*Fwd  1: Back+Fwd  2: Back-Fwd  3: Back+0.25*Fwd
+// INPUT:
+//  'uSrc','uDst' input: -bbbbbgggggrrrrr
+//                       ^ bit 15
+// RETURNS:
+//           u16 output: mbbbbbgggggrrrrr
+//                       ^ bit 15
+// When SKIP_USRC_MSB_MASK is false, bit 15 ('m') is forced to 1 at the end;
+//  otherwise it is left as computed. '-' is don't care.
+////////////////////////////////////////////////////////////////////////////////
+template <int BLENDMODE, bool SKIP_USRC_MSB_MASK>
+GPU_INLINE uint_fast16_t gpuBlendingARM(uint_fast16_t uSrc, uint_fast16_t uDst)
+{
+       // These use Blargg's bitwise modulo-clamping:
+       //  http://blargg.8bitalley.com/info/rgb_mixing.html
+       //  http://blargg.8bitalley.com/info/rgb_clamped_add.html
+       //  http://blargg.8bitalley.com/info/rgb_clamped_sub.html
+
+       uint_fast16_t mix;
+
+       // Clear bit 15 (msb) of the background pixel before doing arithmetic
+       asm ("bic %[uDst], %[uDst], #0x8000" : [uDst] "+r" (uDst));
+
+       if (BLENDMODE == 3) {
+               // Prepare uSrc for blending: (uSrc >> 2) & 0x1ce7, i.e. 0.25 * each channel
+               asm ("and %[uSrc], %[mask], %[uSrc], lsr #0x2" : [uSrc] "+r" (uSrc) : [mask] "r" (0x1ce7));
+       } else if (!SKIP_USRC_MSB_MASK) {
+               asm ("bic %[uSrc], %[uSrc], #0x8000" : [uSrc] "+r" (uSrc)); // clear bit 15 of the foreground too
+       }
+
+
+       // 0.5 x Back + 0.5 x Forward (always the LSB-preserving formula below)
+       if (BLENDMODE==0) {
+               // mix = ((uSrc + uDst) - ((uSrc ^ uDst) & 0x0421)) >> 1;
+               asm ("eor %[mix], %[uSrc], %[uDst]\n\t" // uSrc ^ uDst
+                    "and %[mix], %[mix], %[mask]\n\t"  // ... & 0x0421
+                    "sub %[mix], %[uDst], %[mix]\n\t"  // uDst - ...
+                    "add %[mix], %[uSrc], %[mix]\n\t"  // uSrc + ...
+                    "mov %[mix], %[mix], lsr #0x1\n\t" // ... >> 1
+                    : [mix] "=&r" (mix) // early-clobber: mix is written before uSrc/uDst are last read
+                    : [uSrc] "r" (uSrc), [uDst] "r" (uDst), [mask] "r" (0x0421));
+       }
+       // 1.0 x Back + 1.0 x Forward (mode 1), or 1.0 x Back + 0.25 x Forward (mode 3, uSrc pre-scaled above)
+       if (BLENDMODE == 1 || BLENDMODE == 3) {
+               // u32 sum      = uSrc + uDst;
+               // u32 low_bits = (uSrc ^ uDst) & 0x0421;
+               // u32 carries  = (sum - low_bits) & 0x8420;
+               // u32 modulo   = sum - carries;
+               // u32 clamp    = carries - (carries >> 5);
+               // mix = modulo | clamp;
+
+               u32 sum;
+
+               asm ("add %[sum], %[uSrc], %[uDst]\n\t" // sum = uSrc + uDst
+                    "eor %[mix], %[uSrc], %[uDst]\n\t" // uSrc ^ uDst
+                    "and %[mix], %[mix], %[mask]\n\t"  // low_bits = (... & 0x0421)
+                    "sub %[mix], %[sum], %[mix]\n\t"   // sum - low_bits
+                    "and %[mix], %[mix], %[mask], lsl #0x05\n\t"  // carries = ... & 0x8420 (0x0421 << 5)
+                    "sub %[sum], %[sum], %[mix] \n\t"  // modulo = sum - carries
+                    "sub %[mix], %[mix], %[mix], lsr #0x05\n\t" // clamp = carries - (carries >> 5)
+                    "orr %[mix], %[sum], %[mix]"       // mix = modulo | clamp
+                    : [sum] "=&r" (sum), [mix] "=&r" (mix) // both written before the inputs are last read
+                    : [uSrc] "r" (uSrc), [uDst] "r" (uDst), [mask] "r" (0x0421));
+       }
+    
+       // 1.0 x Back - 1.0 x Forward
+       if (BLENDMODE==2) {
+               u32 diff;
+               // u32 diff     = uDst - uSrc + 0x8420;
+               // u32 low_bits = (uDst ^ uSrc) & 0x8420;
+               // u32 borrows  = (diff - low_bits) & 0x8420;
+               // u32 modulo   = diff - borrows;
+               // u32 clamp    = borrows - (borrows >> 5);
+               // mix = modulo & clamp;
+               asm ("sub %[diff], %[uDst], %[uSrc]\n\t"  // uDst - uSrc
+                    "add %[diff], %[diff], %[mask]\n\t"  // diff = ... + 0x8420
+                    "eor %[mix], %[uDst], %[uSrc]\n\t"   // uDst ^ uSrc
+                    "and %[mix], %[mix], %[mask]\n\t"    // low_bits = ... & 0x8420
+                    "sub %[mix], %[diff], %[mix]\n\t"    // diff - low_bits
+                    "and %[mix], %[mix], %[mask]\n\t"    // borrows = ... & 0x8420
+                    "sub %[diff], %[diff], %[mix]\n\t"   // modulo = diff - borrows
+                    "sub %[mix], %[mix], %[mix], lsr #0x05\n\t"  // clamp = borrows - (borrows >> 5)
+                    "and %[mix], %[diff], %[mix]"        // mix = modulo & clamp
+                    : [diff] "=&r" (diff), [mix] "=&r" (mix) // both written before the inputs are last read
+                    : [uSrc] "r" (uSrc), [uDst] "r" (uDst), [mask] "r" (0x8420));
+       }
+
+       // Per the original author: this function is never entered with
+       // SKIP_USRC_MSB_MASK false and the msb of uSrc unset, so set the msb on the result.
+       if (!SKIP_USRC_MSB_MASK) {
+               asm ("orr %[mix], %[mix], #0x8000" : [mix] "+r" (mix));
+       }
+  
+       return mix;
+}
+
+#endif  //_OP_BLEND_ARM_H_
diff --git a/plugins/gpu_senquack/gpu_inner_blend_arm5.h b/plugins/gpu_senquack/gpu_inner_blend_arm5.h
new file mode 100644 (file)
index 0000000..0e9b74f
--- /dev/null
@@ -0,0 +1,100 @@
+/***************************************************************************
+*   Copyright (C) 2010 PCSX4ALL Team                                      *
+*   Copyright (C) 2010 Unai                                               *
+*                                                                         *
+*   This program is free software; you can redistribute it and/or modify  *
+*   it under the terms of the GNU General Public License as published by  *
+*   the Free Software Foundation; either version 2 of the License, or     *
+*   (at your option) any later version.                                   *
+*                                                                         *
+*   This program is distributed in the hope that it will be useful,       *
+*   but WITHOUT ANY WARRANTY; without even the implied warranty of        *
+*   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the         *
+*   GNU General Public License for more details.                          *
+*                                                                         *
+*   You should have received a copy of the GNU General Public License     *
+*   along with this program; if not, write to the                         *
+*   Free Software Foundation, Inc.,                                       *
+*   51 Franklin Street, Fifth Floor, Boston, MA 02111-1307 USA.           *
+***************************************************************************/
+
+#ifndef _OP_BLEND_H_
+#define _OP_BLEND_H_
+
+//  GPU Blending operations functions
+
+#define gpuBlending00(uSrc,uDst) /* uSrc = ((uSrc & uMsk) + (uDst & uMsk)) >> 1; 'uMsk' is not a parameter, it must be in scope where the macro is expanded */ \
+{ /* NOTE: uDst is overwritten with (uDst & uMsk) as a side effect; presumably uMsk clears each channel's LSB -- confirm at the call site */ \
+       asm ("and  %[src], %[src], %[msk]  " : [src] "=r" (uSrc) : "0" (uSrc), [msk] "r" (uMsk)                  ); /* uSrc &= uMsk */ \
+       asm ("and  %[dst], %[dst], %[msk]  " : [dst] "=r" (uDst) : "0" (uDst), [msk] "r" (uMsk)                  ); /* uDst &= uMsk */ \
+       asm ("add  %[src], %[dst], %[src]  " : [src] "=r" (uSrc) :             [dst] "r" (uDst), "0" (uSrc)      ); /* uSrc += uDst */ \
+       asm ("mov  %[src], %[src], lsr #1  " : [src] "=r" (uSrc) : "0" (uSrc)                                    ); /* uSrc >>= 1 */ \
+}
+
+//     1.0 x Back + 1.0 x Forward
+#define gpuBlending01(uSrc,uDst) \
+{ \
+       u16 st,dt,out; \
+       asm ("and    %[dt],  %[dst],   #0x7C00  " : [dt]  "=r" (dt)   :             [dst] "r" (uDst)                    ); \
+       asm ("and    %[st],  %[src],   #0x7C00  " : [st]  "=r" (st)   :             [src] "r" (uSrc)                    ); \
+       asm ("add    %[out], %[dt],    %[st]    " : [out] "=r" (out)  :             [dt]  "r" (dt),   [st]  "r" (st)    ); \
+       asm ("cmp    %[out], #0x7C00            " :                   :             [out] "r" (out) : "cc"              ); \
+       asm ("movhi  %[out], #0x7C00            " : [out] "=r" (out)  : "0" (out)                                       ); \
+       asm ("and    %[dt],  %[dst],   #0x03E0  " : [dt]  "=r" (dt)   :             [dst] "r" (uDst)                    ); \
+       asm ("and    %[st],  %[src],   #0x03E0  " : [st]  "=r" (st)   :             [src] "r" (uSrc)                    ); \
+       asm ("add    %[dt],  %[dt],    %[st]    " : [dt]  "=r" (dt)   : "0" (dt),   [st]  "r" (st)                      ); \
+       asm ("cmp    %[dt],  #0x03E0            " :                   :             [dt]  "r" (dt) : "cc"               ); \
+       asm ("movhi  %[dt],  #0x03E0            " : [dt]  "=r" (dt)   : "0" (dt)                                        ); \
+       asm ("orr    %[out], %[out],   %[dt]    " : [out] "=r" (out)  : "0" (out),  [dt]  "r" (dt)                      ); \
+       asm ("and    %[dt],  %[dst],   #0x001F  " : [dt]  "=r" (dt)   :             [dst] "r" (uDst)                    ); \
+       asm ("and    %[st],  %[src],   #0x001F  " : [st]  "=r" (st)   :             [src] "r" (uSrc)                    ); \
+       asm ("add    %[dt],  %[dt],    %[st]    " : [dt]  "=r" (dt)   : "0" (dt),   [st]  "r" (st)                      ); \
+       asm ("cmp    %[dt],  #0x001F            " :                   :             [dt]  "r" (dt) : "cc"               ); \
+       asm ("movhi  %[dt],  #0x001F            " : [dt]  "=r" (dt)   : "0" (dt)                                        ); \
+       asm ("orr    %[uSrc], %[out],   %[dt]   " : [uSrc] "=r" (uSrc)  : [out] "r" (out),  [dt]  "r" (dt)              ); \
+}
+
+//     1.0 x Back - 1.0 x Forward      */
+#define gpuBlending02(uSrc,uDst) \
+{ \
+       u16 st,dt,out; \
+       asm ("and    %[dt],  %[dst],   #0x7C00  " : [dt]  "=r" (dt)   :             [dst] "r" (uDst)                    ); \
+       asm ("and    %[st],  %[src],   #0x7C00  " : [st]  "=r" (st)   :             [src] "r" (uSrc)                    ); \
+       asm ("subs   %[out], %[dt],    %[st]    " : [out] "=r" (out)  : [dt]  "r" (dt),   [st]  "r" (st) : "cc"         ); \
+       asm ("movmi  %[out], #0x0000            " : [out] "=r" (out)  : "0" (out)                                       ); \
+       asm ("and    %[dt],  %[dst],   #0x03E0  " : [dt]  "=r" (dt)   :             [dst] "r" (uDst)                    ); \
+       asm ("and    %[st],  %[src],   #0x03E0  " : [st]  "=r" (st)   :             [src] "r" (uSrc)                    ); \
+       asm ("subs   %[dt],  %[dt],    %[st]    " : [dt]  "=r" (dt)   : "0" (dt),   [st]  "r" (st) : "cc"               ); \
+       asm ("orrpl  %[out], %[out],   %[dt]    " : [out] "=r" (out)  : "0" (out),  [dt]  "r" (dt)                      ); \
+       asm ("and    %[dt],  %[dst],   #0x001F  " : [dt]  "=r" (dt)   :             [dst] "r" (uDst)                    ); \
+       asm ("and    %[st],  %[src],   #0x001F  " : [st]  "=r" (st)   :             [src] "r" (uSrc)                    ); \
+       asm ("subs   %[dt],  %[dt],    %[st]    " : [dt]  "=r" (dt)   : "0" (dt),   [st]  "r" (st) : "cc"               ); \
+       asm ("orrpl  %[out], %[out],   %[dt]    " : [out] "=r" (out)  : "0" (out),  [dt]  "r" (dt)                      ); \
+       asm ("mov %[uSrc], %[out]" : [uSrc] "=r" (uSrc) : [out] "r" (out) ); \
+}
+
+//     1.0 x Back + 0.25 x Forward     */
+#define gpuBlending03(uSrc,uDst) \
+{ \
+               u16 st,dt,out; \
+               asm ("mov    %[src], %[src],   lsr #2   " : [src] "=r" (uSrc) : "0" (uSrc)                                      ); \
+               asm ("and    %[dt],  %[dst],   #0x7C00  " : [dt]  "=r" (dt)   :             [dst] "r" (uDst)                    ); \
+               asm ("and    %[st],  %[src],   #0x1C00  " : [st]  "=r" (st)   :             [src] "r" (uSrc)                    ); \
+               asm ("add    %[out], %[dt],    %[st]    " : [out] "=r" (out)  :             [dt]  "r" (dt),   [st]  "r" (st)    ); \
+               asm ("cmp    %[out], #0x7C00            " :                   :             [out] "r" (out) : "cc"              ); \
+               asm ("movhi  %[out], #0x7C00            " : [out] "=r" (out)  : "0" (out)                                       ); \
+               asm ("and    %[dt],  %[dst],   #0x03E0  " : [dt]  "=r" (dt)   :             [dst] "r" (uDst)                    ); \
+               asm ("and    %[st],  %[src],   #0x00E0  " : [st]  "=r" (st)   :             [src] "r" (uSrc)                    ); \
+               asm ("add    %[dt],  %[dt],    %[st]    " : [dt]  "=r" (dt)   : "0" (dt),   [st]  "r" (st)                      ); \
+               asm ("cmp    %[dt],  #0x03E0            " :                   :             [dt]  "r" (dt) : "cc"               ); \
+               asm ("movhi  %[dt],  #0x03E0            " : [dt]  "=r" (dt)   : "0" (dt)                                        ); \
+               asm ("orr    %[out], %[out],   %[dt]    " : [out] "=r" (out)  : "0" (out),  [dt]  "r" (dt)                      ); \
+               asm ("and    %[dt],  %[dst],   #0x001F  " : [dt]  "=r" (dt)   :             [dst] "r" (uDst)                    ); \
+               asm ("and    %[st],  %[src],   #0x0007  " : [st]  "=r" (st)   :             [src] "r" (uSrc)                    ); \
+               asm ("add    %[dt],  %[dt],    %[st]    " : [dt]  "=r" (dt)   : "0" (dt),   [st]  "r" (st)                      ); \
+               asm ("cmp    %[dt],  #0x001F            " :                   :             [dt]  "r" (dt) : "cc"               ); \
+               asm ("movhi  %[dt],  #0x001F            " : [dt]  "=r" (dt)   : "0" (dt)                                        ); \
+               asm ("orr    %[uSrc], %[out],   %[dt]   " : [uSrc] "=r" (uSrc)  : [out] "r" (out),  [dt]  "r" (dt)              ); \
+}
+
+#endif  //_OP_BLEND_H_
diff --git a/plugins/gpu_senquack/gpu_inner_blend_arm7.h b/plugins/gpu_senquack/gpu_inner_blend_arm7.h
new file mode 100644 (file)
index 0000000..083e62d
--- /dev/null
@@ -0,0 +1,107 @@
+/***************************************************************************
+*   Copyright (C) 2010 PCSX4ALL Team                                      *
+*   Copyright (C) 2010 Unai                                               *
+*                                                                         *
+*   This program is free software; you can redistribute it and/or modify  *
+*   it under the terms of the GNU General Public License as published by  *
+*   the Free Software Foundation; either version 2 of the License, or     *
+*   (at your option) any later version.                                   *
+*                                                                         *
+*   This program is distributed in the hope that it will be useful,       *
+*   but WITHOUT ANY WARRANTY; without even the implied warranty of        *
+*   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the         *
+*   GNU General Public License for more details.                          *
+*                                                                         *
+*   You should have received a copy of the GNU General Public License     *
+*   along with this program; if not, write to the                         *
+*   Free Software Foundation, Inc.,                                       *
+*   51 Franklin Street, Fifth Floor, Boston, MA 02111-1307 USA.           *
+***************************************************************************/
+
+#ifndef _OP_BLEND_H_
+#define _OP_BLEND_H_
+
+//  GPU Blending operations functions
+
+#define gpuBlending00(uSrc,uDst) /* uSrc = ((uSrc & uMsk) + (uDst & uMsk)) >> 1; 'uMsk' is not a parameter, it must be in scope where the macro is expanded */ \
+{ /* NOTE: uDst is an output too and ends up holding (uDst & uMsk); presumably uMsk clears each channel's LSB -- confirm at the call site */ \
+       asm ("and  %[src], %[src], %[msk]\n" /* uSrc &= uMsk */ \
+            "and  %[dst], %[dst], %[msk]\n" /* uDst &= uMsk */ \
+            "add  %[src], %[dst], %[src]\n" /* uSrc += uDst */ \
+            "mov  %[src], %[src], lsr #1\n" /* uSrc >>= 1 */ \
+        : [src] "=&r" (uSrc), [dst] "=&r" (uDst) : "0" (uSrc), "1" (uDst), [msk] "r" (uMsk)); /* "0"/"1" tie the inputs to the same registers as the outputs */ \
+}
+
+//     1.0 x Back + 1.0 x Forward: per-channel saturating add of uDst and uSrc, result left in uSrc (uDst is only read).
+#define gpuBlending01(uSrc,uDst) \
+{ \
+       u32 st,dt,out; /* scratch registers: st/dt hold one masked channel, out accumulates the result */ \
+       asm ("and    %[dt],  %[dst],   #0x7C00\n" /* bits 10..14 of each pixel */ \
+            "and    %[st],  %[src],   #0x7C00\n" \
+            "add    %[out], %[dt],    %[st]  \n" \
+            "cmp    %[out], #0x7C00          \n" \
+            "movhi  %[out], #0x7C00          \n" /* clamp to channel maximum */ \
+            "and    %[dt],  %[dst],   #0x03E0\n" /* bits 5..9 */ \
+            "and    %[st],  %[src],   #0x03E0\n" \
+            "add    %[dt],  %[dt],    %[st]  \n" \
+            "cmp    %[dt],  #0x03E0          \n" \
+            "movhi  %[dt],  #0x03E0          \n" \
+            "orr    %[out], %[out],   %[dt]  \n" \
+            "and    %[dt],  %[dst],   #0x001F\n" /* bits 0..4 */ \
+            "and    %[st],  %[src],   #0x001F\n" \
+            "add    %[dt],  %[dt],    %[st]  \n" \
+            "cmp    %[dt],  #0x001F          \n" \
+            "movhi  %[dt],  #0x001F          \n" \
+            "orr    %[src], %[out],  %[dt]  \n" /* src is only written here, after every input was read */ \
+        : [src] "=r" (uSrc), [st] "=&r" (st), [dt] "=&r" (dt), [out] "=&r" (out) \
+        : [dst] "r" (uDst), "0" (uSrc) : "cc"); /* "cc": cmp changes the condition flags */ \
+}
+
+//     1.0 x Back - 1.0 x Forward: per-channel (uDst - uSrc) with negative results replaced by 0, result left in uSrc.
+#define gpuBlending02(uSrc,uDst) \
+{ \
+       u32 st,dt,out; /* scratch registers: st/dt hold one masked channel, out accumulates the result */ \
+       asm ("and    %[dt],  %[dst],   #0x7C00\n" /* bits 10..14 of each pixel */ \
+            "and    %[st],  %[src],   #0x7C00\n" \
+            "subs   %[out], %[dt],    %[st]  \n" \
+            "movmi  %[out], #0x0000          \n" /* went negative: channel becomes 0 */ \
+            "and    %[dt],  %[dst],   #0x03E0\n" /* bits 5..9 */ \
+            "and    %[st],  %[src],   #0x03E0\n" \
+            "subs   %[dt],  %[dt],    %[st]  \n" \
+            "orrpl  %[out], %[out],   %[dt]  \n" /* merged only when the difference is not negative */ \
+            "and    %[dt],  %[dst],   #0x001F\n" /* bits 0..4 */ \
+            "and    %[st],  %[src],   #0x001F\n" \
+            "subs   %[dt],  %[dt],    %[st]  \n" \
+            "orrpl  %[out], %[out],   %[dt]  \n" \
+            "mov    %[src], %[out]           \n" /* src is only written here, after every input was read */ \
+        : [src] "=r" (uSrc), [st] "=&r" (st), [dt] "=&r" (dt), [out] "=&r" (out) \
+        : [dst] "r" (uDst), "0" (uSrc) : "cc"); /* "cc": subs changes the condition flags */ \
+}
+
+//     1.0 x Back + 0.25 x Forward     */
+#define gpuBlending03(uSrc,uDst) \
+{ \
+       u32 st,dt,out; \
+       asm ("mov    %[src], %[src],   lsr #2 \n" \
+            "and    %[dt],  %[dst],   #0x7C00\n" \
+            "and    %[st],  %[src],   #0x1C00\n" \
+            "add    %[out], %[dt],    %[st]  \n" \
+            "cmp    %[out], #0x7C00          \n" \
+            "movhi  %[out], #0x7C00          \n" \
+            "and    %[dt],  %[dst],   #0x03E0\n" \
+            "and    %[st],  %[src],   #0x00E0\n" \
+            "add    %[dt],  %[dt],    %[st]  \n" \
+            "cmp    %[dt],  #0x03E0          \n" \
+            "movhi  %[dt],  #0x03E0          \n" \
+            "orr    %[out], %[out],   %[dt]  \n" \
+            "and    %[dt],  %[dst],   #0x001F\n" \
+            "and    %[st],  %[src],   #0x0007\n" \
+            "add    %[dt],  %[dt],    %[st]  \n" \
+            "cmp    %[dt],  #0x001F          \n" \
+            "movhi  %[dt],  #0x001F          \n" \
+            "orr    %[src], %[out],   %[dt]  \n" \
+        : [src] "=r" (uSrc), [st] "=&r" (st), [dt] "=&r" (dt), [out] "=&r" (out) \
+        : [dst] "r" (uDst), "0" (uSrc) : "cc"); \
+}
+
+#endif  //_OP_BLEND_H_
diff --git a/plugins/gpu_senquack/gpu_inner_light.h b/plugins/gpu_senquack/gpu_inner_light.h
new file mode 100644 (file)
index 0000000..b5d8933
--- /dev/null
@@ -0,0 +1,271 @@
+/***************************************************************************
+*   Copyright (C) 2016 PCSX4ALL Team                                      *
+*   Copyright (C) 2010 Unai                                               *
+*                                                                         *
+*   This program is free software; you can redistribute it and/or modify  *
+*   it under the terms of the GNU General Public License as published by  *
+*   the Free Software Foundation; either version 2 of the License, or     *
+*   (at your option) any later version.                                   *
+*                                                                         *
+*   This program is distributed in the hope that it will be useful,       *
+*   but WITHOUT ANY WARRANTY; without even the implied warranty of        *
+*   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the         *
+*   GNU General Public License for more details.                          *
+*                                                                         *
+*   You should have received a copy of the GNU General Public License     *
+*   along with this program; if not, write to the                         *
+*   Free Software Foundation, Inc.,                                       *
+*   51 Franklin Street, Fifth Floor, Boston, MA 02111-1307 USA.           *
+***************************************************************************/
+
+#ifndef _OP_LIGHT_H_
+#define _OP_LIGHT_H_
+
+//  GPU color operations for lighting calculations
+
+static void SetupLightLUT()
+{
+       // 1024-entry lookup table that modulates 5-bit texture + 5-bit light value.
+       // A light value of 15 does not modify the incoming texture color.
+       // LightLUT[32*32] array is initialized to following values:
+       //  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+       //  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+       //  0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3, 3, 3,
+       //  0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3, 4, 4, 4, 4, 4, 5, 5, 5, 5, 5,
+       //  0, 0, 0, 0, 1, 1, 1, 1, 2, 2, 2, 2, 3, 3, 3, 3, 4, 4, 4, 4, 5, 5, 5, 5, 6, 6, 6, 6, 7, 7, 7, 7,
+       //  0, 0, 0, 0, 1, 1, 1, 2, 2, 2, 3, 3, 3, 4, 4, 4, 5, 5, 5, 5, 6, 6, 6, 7, 7, 7, 8, 8, 8, 9, 9, 9,
+       //  0, 0, 0, 1, 1, 1, 2, 2, 3, 3, 3, 4, 4, 4, 5, 5, 6, 6, 6, 7, 7, 7, 8, 8, 9, 9, 9,10,10,10,11,11,
+       //  0, 0, 0, 1, 1, 2, 2, 3, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 7, 8, 8, 9, 9,10,10,10,11,11,12,12,13,13,
+       //  0, 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9,10,10,11,11,12,12,13,13,14,14,15,15,
+       //  0, 0, 1, 1, 2, 2, 3, 3, 4, 5, 5, 6, 6, 7, 7, 8, 9, 9,10,10,11,11,12,12,13,14,14,15,15,16,16,17,
+       //  0, 0, 1, 1, 2, 3, 3, 4, 5, 5, 6, 6, 7, 8, 8, 9,10,10,11,11,12,13,13,14,15,15,16,16,17,18,18,19,
+       //  0, 0, 1, 2, 2, 3, 4, 4, 5, 6, 6, 7, 8, 8, 9,10,11,11,12,13,13,14,15,15,16,17,17,18,19,19,20,21,
+       //  0, 0, 1, 2, 3, 3, 4, 5, 6, 6, 7, 8, 9, 9,10,11,12,12,13,14,15,15,16,17,18,18,19,20,21,21,22,23,
+       //  0, 0, 1, 2, 3, 4, 4, 5, 6, 7, 8, 8, 9,10,11,12,13,13,14,15,16,17,17,18,19,20,21,21,22,23,24,25,
+       //  0, 0, 1, 2, 3, 4, 5, 6, 7, 7, 8, 9,10,11,12,13,14,14,15,16,17,18,19,20,21,21,22,23,24,25,26,27,
+       //  0, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9,10,11,12,13,14,15,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,
+       //  0, 1, 2, 3, 4, 5, 6, 7, 8, 9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,
+       //  0, 1, 2, 3, 4, 5, 6, 7, 8, 9,10,11,12,13,14,15,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,31,
+       //  0, 1, 2, 3, 4, 5, 6, 7, 9,10,11,12,13,14,15,16,18,19,20,21,22,23,24,25,27,28,29,30,31,31,31,31,
+       //  0, 1, 2, 3, 4, 5, 7, 8, 9,10,11,13,14,15,16,17,19,20,21,22,23,24,26,27,28,29,30,31,31,31,31,31,
+       //  0, 1, 2, 3, 5, 6, 7, 8,10,11,12,13,15,16,17,18,20,21,22,23,25,26,27,28,30,31,31,31,31,31,31,31,
+       //  0, 1, 2, 3, 5, 6, 7, 9,10,11,13,14,15,17,18,19,21,22,23,24,26,27,28,30,31,31,31,31,31,31,31,31,
+       //  0, 1, 2, 4, 5, 6, 8, 9,11,12,13,15,16,17,19,20,22,23,24,26,27,28,30,31,31,31,31,31,31,31,31,31,
+       //  0, 1, 2, 4, 5, 7, 8,10,11,12,14,15,17,18,20,21,23,24,25,27,28,30,31,31,31,31,31,31,31,31,31,31,
+       //  0, 1, 3, 4, 6, 7, 9,10,12,13,15,16,18,19,21,22,24,25,27,28,30,31,31,31,31,31,31,31,31,31,31,31,
+       //  0, 1, 3, 4, 6, 7, 9,10,12,14,15,17,18,20,21,23,25,26,28,29,31,31,31,31,31,31,31,31,31,31,31,31,
+       //  0, 1, 3, 4, 6, 8, 9,11,13,14,16,17,19,21,22,24,26,27,29,30,31,31,31,31,31,31,31,31,31,31,31,31,
+       //  0, 1, 3, 5, 6, 8,10,11,13,15,16,18,20,21,23,25,27,28,30,31,31,31,31,31,31,31,31,31,31,31,31,31,
+       //  0, 1, 3, 5, 7, 8,10,12,14,15,17,19,21,22,24,26,28,29,31,31,31,31,31,31,31,31,31,31,31,31,31,31,
+       //  0, 1, 3, 5, 7, 9,10,12,14,16,18,19,21,23,25,27,29,30,31,31,31,31,31,31,31,31,31,31,31,31,31,31,
+       //  0, 1, 3, 5, 7, 9,11,13,15,16,18,20,22,24,26,28,30,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,
+       //  0, 1, 3, 5, 7, 9,11,13,15,17,19,21,23,25,27,29,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31
+
+       for (int j=0; j < 32; ++j) {
+               for (int i=0; i < 32; ++i) {
+                       int val = i * j / 16;
+                       if (val > 31) val = 31;
+                       gpu_senquack.LightLUT[(j*32) + i] = val;
+               }
+       }
+}
+
+
+////////////////////////////////////////////////////////////////////////////////
+// Create packed Gouraud fixed-pt 8.3:8.3:8.2 rgb triplet
+//
+// INPUT:
+// 'r','g','b' are 8.10 fixed-pt color components (r shown here)
+//     'r' input:  --------------rrrrrrrrXXXXXXXXXX
+//                 ^ bit 31
+// RETURNS:
+//    u32 output:  rrrrrrrrXXXggggggggXXXbbbbbbbbXX
+//                 ^ bit 31
+// Where 'r,g,b' are integer bits of colors, 'X' fixed-pt, and '-' don't care
+////////////////////////////////////////////////////////////////////////////////
+GPU_INLINE u32 gpuPackGouraudCol(u32 r, u32 g, u32 b)
+{
+       // Each 8.10 input is truncated and moved into its own lane of the result.
+       const u32 blueLane  = (b >>  8) & 0x000003ffu;  // 8.2 in bits  0..9
+       const u32 greenLane = (g <<  3) & 0x001ffc00u;  // 8.3 in bits 10..20
+       const u32 redLane   = (r << 14) & 0xffe00000u;  // 8.3 in bits 21..31
+
+       return blueLane | greenLane | redLane;
+}
+
+
+////////////////////////////////////////////////////////////////////////////////
+// Create packed increment for Gouraud fixed-pt 8.3:8.3:8.2 rgb triplet
+//
+// INPUT:
+//  Sign-extended 8.10 fixed-pt r,g,b color increment values (only dr is shown)
+//   'dr' input:  ssssssssssssssrrrrrrrrXXXXXXXXXX
+//                ^ bit 31
+// RETURNS:
+//   u32 output:  rrrrrrrrXXXggggggggXXXbbbbbbbbXX
+//                ^ bit 31
+// Where 'r,g,b' are integer bits of colors, 'X' fixed-pt, and 's' sign bits
+//
+// NOTE: The correctness of this code/method has not been fully verified,
+//       having been merely factored out from original code in
+//       poly-drawing functions. Feel free to check/improve it -senquack
+////////////////////////////////////////////////////////////////////////////////
+GPU_INLINE u32 gpuPackGouraudColInc(s32 dr, s32 dg, s32 db)
+{
+       u32 dr_tmp = (u32)(dr << 14)&(0xffffffff<<21);  if (dr < 0) dr_tmp += 1<<21;
+       u32 dg_tmp = (u32)(dg <<  3)&(0xffffffff<<10);  if (dg < 0) dg_tmp += 1<<10;
+       u32 db_tmp = (u32)(db >>  8)&(0xffffffff    );  if (db < 0) db_tmp += 1<< 0;
+       return db_tmp + dg_tmp + dr_tmp;
+}
+
+
+////////////////////////////////////////////////////////////////////////////////
+// Extract bgr555 color from Gouraud u32 fixed-pt 8.3:8.3:8.2 rgb triplet
+//
+// INPUT:
+//  'gCol' input:  rrrrrrrrXXXggggggggXXXbbbbbbbbXX
+//                 ^ bit 31
+// RETURNS:
+//    u16 output:  0bbbbbgggggrrrrr
+//                 ^ bit 16
+// Where 'r,g,b' are integer bits of colors, 'X' fixed-pt, and '0' zero
+////////////////////////////////////////////////////////////////////////////////
+GPU_INLINE uint_fast16_t gpuLightingRGBGeneric(u32 gCol)
+{
+       // Move the top five integer bits of each packed component to its
+       // bgr555 position.
+       const u32 blue5  = (gCol <<  5) & 0x7C00;  // -> bits 10..14
+       const u32 green5 = (gCol >> 11) & 0x03E0;  // -> bits  5..9
+       const u32 red5   =  gCol >> 27;            // -> bits  0..4
+
+       return blue5 | green5 | red5;
+}
+
+
+////////////////////////////////////////////////////////////////////////////////
+// Convert packed Gouraud u32 fixed-pt 8.3:8.3:8.2 rgb triplet in 'gCol'
+//  to padded u32 5.4:5.4:5.4 bgr fixed-pt triplet, suitable for use
+//  with HQ 24-bit lighting/quantization.
+//
+// INPUT:
+//       'gCol' input:  rrrrrrrrXXXggggggggXXXbbbbbbbbXX
+//                      ^ bit 31
+// RETURNS:
+//         u32 output:  000bbbbbXXXX0gggggXXXX0rrrrrXXXX
+//                      ^ bit 31
+//  Where 'X' are fixed-pt bits, '0' zero-padding, and '-' is don't care
+////////////////////////////////////////////////////////////////////////////////
+GPU_INLINE u32 gpuLightingRGB24(u32 gCol)
+{
+       // Keep the top nine bits of each packed component and place them in
+       // 10-bit-spaced lanes (red lowest, blue highest).
+       const u32 blue9  = (gCol << 19) & (0x1FFu << 20);
+       const u32 green9 = (gCol >>  2) & (0x1FFu << 10);
+       const u32 red9   =  gCol >> 23;
+
+       return blue9 | green9 | red9;
+}
+
+
+////////////////////////////////////////////////////////////////////////////////
+// Apply fast (low-precision) 5-bit lighting to bgr555 texture color:
+//
+// INPUT:
+//        'r5','g5','b5' are unsigned 5-bit color values, value of 16
+//          leaves that component of texture unmodified (LightLUT is i*j/16)
+//        'uSrc' input:  -bbbbbgggggrrrrr
+//                       ^ bit 16
+// RETURNS:
+//          u16 output:  0bbbbbgggggrrrrr
+// Where 'X' are fixed-pt bits, '0' is zero-padding, and '-' is don't care
+////////////////////////////////////////////////////////////////////////////////
+GPU_INLINE uint_fast16_t gpuLightingTXTGeneric(uint_fast16_t uSrc, u8 r5, u8 g5, u8 b5)
+{
+       return (gpu_senquack.LightLUT[((uSrc&0x7C00)>>5) | b5] << 10) |
+              (gpu_senquack.LightLUT[ (uSrc&0x03E0)     | g5] <<  5) |
+              (gpu_senquack.LightLUT[((uSrc&0x001F)<<5) | r5]      );
+}
+
+
+////////////////////////////////////////////////////////////////////////////////
+// Apply fast (low-precision) 5-bit Gouraud lighting to bgr555 texture color:
+//
+// INPUT:
+//  'gCol' is a packed Gouraud u32 fixed-pt 8.3:8.3:8.2 rgb triplet; a component
+//     whose top five bits equal 16 does not modify color of texture
+//         gCol input :  rrrrrXXXXXXgggggXXXXXXbbbbbXXXXX
+//                       ^ bit 31
+//        'uSrc' input:  -bbbbbgggggrrrrr
+//                       ^ bit 16
+// RETURNS:
+//          u16 output:  0bbbbbgggggrrrrr
+// Where 'X' are fixed-pt bits, '0' is zero-padding, and '-' is don't care
+////////////////////////////////////////////////////////////////////////////////
+GPU_INLINE uint_fast16_t gpuLightingTXTGouraudGeneric(uint_fast16_t uSrc, u32 gCol)
+{
+       // The top five bits of each packed Gouraud component are the light value.
+       const u32 lightB = (gCol >>  5) & 0x1F;
+       const u32 lightG = (gCol >> 16) & 0x1F;
+       const u32 lightR =  gCol >> 27;
+
+       // LUT index = (5-bit texture component << 5) | 5-bit light value.
+       const u32 idxB = ((uSrc & 0x7C00) >> 5) | lightB;
+       const u32 idxG =  (uSrc & 0x03E0)       | lightG;
+       const u32 idxR = ((uSrc & 0x001F) << 5) | lightR;
+
+       return (gpu_senquack.LightLUT[idxB] << 10) |
+              (gpu_senquack.LightLUT[idxG] <<  5) |
+              (gpu_senquack.LightLUT[idxR]      );
+}
+
+
+////////////////////////////////////////////////////////////////////////////////
+// Apply high-precision 8-bit lighting to bgr555 texture color,
+//  returning a padded u32 5.4:5.4:5.4 bgr fixed-pt triplet
+//  suitable for use with HQ 24-bit lighting/quantization.
+//
+// INPUT:
+//        'r8','g8','b8' are unsigned 8-bit color component values, value of
+//          128 leaves that component of texture unmodified (tex5*128 == tex5.0 in 5.4)
+//
+//         uSrc input: -bbbbbgggggrrrrr
+//                     ^ bit 16
+// RETURNS:
+//         u32 output: 000bbbbbXXXX0gggggXXXX0rrrrrXXXX
+//                     ^ bit 31
+// Where 'X' are fixed-pt bits, '0' is zero-padding, and '-' is don't care
+////////////////////////////////////////////////////////////////////////////////
+GPU_INLINE u32 gpuLightingTXT24(uint_fast16_t uSrc, u8 r8, u8 g8, u8 b8)
+{
+       uint_fast16_t r1 = uSrc&0x001F;
+       uint_fast16_t g1 = uSrc&0x03E0;
+       uint_fast16_t b1 = uSrc&0x7C00;
+
+       uint_fast16_t r2 = r8;
+       uint_fast16_t g2 = g8;
+       uint_fast16_t b2 = b8;
+
+       u32 r3 = r1 * r2; if (r3 & 0xFFFFF000) r3 = ~0xFFFFF000;
+       u32 g3 = g1 * g2; if (g3 & 0xFFFE0000) g3 = ~0xFFFE0000;
+       u32 b3 = b1 * b2; if (b3 & 0xFFC00000) b3 = ~0xFFC00000;
+
+       return ((r3>> 3)    ) |
+              ((g3>> 8)<<10) |
+              ((b3>>13)<<20);
+}
+
+
+////////////////////////////////////////////////////////////////////////////////
+// Apply high-precision 8-bit lighting to bgr555 texture color in 'uSrc',
+//  returning a padded u32 5.4:5.4:5.4 bgr fixed-pt triplet
+//  suitable for use with HQ 24-bit lighting/quantization.
+//
+// INPUT:
+//       'uSrc' input: -bbbbbgggggrrrrr
+//                     ^ bit 16
+//       'gCol' input: rrrrrrrrXXXggggggggXXXbbbbbbbbXX
+//                     ^ bit 31
+// RETURNS:
+//         u32 output: 000bbbbbXXXX0gggggXXXX0rrrrrXXXX
+//                     ^ bit 31
+// Where 'X' are fixed-pt bits, '0' is zero-padding, and '-' is don't care
+////////////////////////////////////////////////////////////////////////////////
+GPU_INLINE u32 gpuLightingTXT24Gouraud(uint_fast16_t uSrc, u32 gCol)
+{
+       uint_fast16_t r1 = uSrc&0x001F;
+       uint_fast16_t g1 = uSrc&0x03E0;
+       uint_fast16_t b1 = uSrc&0x7C00;
+
+       uint_fast16_t r2 = (gCol>>24) & 0xFF;
+       uint_fast16_t g2 = (gCol>>13) & 0xFF;
+       uint_fast16_t b2 = (gCol>> 2) & 0xFF;
+
+       u32 r3 = r1 * r2; if (r3 & 0xFFFFF000) r3 = ~0xFFFFF000;
+       u32 g3 = g1 * g2; if (g3 & 0xFFFE0000) g3 = ~0xFFFE0000;
+       u32 b3 = b1 * b2; if (b3 & 0xFFC00000) b3 = ~0xFFC00000;
+
+       return ((r3>> 3)    ) |
+              ((g3>> 8)<<10) |
+              ((b3>>13)<<20);
+}
+
+#endif  //_OP_LIGHT_H_
diff --git a/plugins/gpu_senquack/gpu_inner_light_arm.h b/plugins/gpu_senquack/gpu_inner_light_arm.h
new file mode 100644 (file)
index 0000000..550f6b1
--- /dev/null
@@ -0,0 +1,112 @@
+#ifndef _OP_LIGHT_ARM_H_
+#define _OP_LIGHT_ARM_H_
+
+////////////////////////////////////////////////////////////////////////////////
+// Extract bgr555 color from Gouraud u32 fixed-pt 8.3:8.3:8.2 rgb triplet
+//
+// INPUT:
+//  'gCol' input:  rrrrrrrrXXXggggggggXXXbbbbbbbbXX
+//                 ^ bit 31
+// RETURNS:
+//    u16 output:  0bbbbbgggggrrrrr
+//                 ^ bit 16
+// Where 'r,g,b' are integer bits of colors, 'X' fixed-pt, and '0' zero
+////////////////////////////////////////////////////////////////////////////////
+GPU_INLINE uint_fast16_t gpuLightingRGBARM(u32 gCol) // ARM asm: packed Gouraud rgb triplet -> bgr555
+{
+       uint_fast16_t out = 0x03E0; // starts as the 5-bit field mask (bits 5-9); the register is reused for the result once the mask is no longer needed
+       u32 tmp; // scratch register for the asm below
+  
+       asm ("and %[tmp], %[gCol], %[out]\n\t"              // tmp = gCol & mask             -> 0b000000bbbbb00000 (blue, still at bits 5-9)
+            "and %[out], %[out],  %[gCol], lsr #0x0B\n\t"  // out = mask & (gCol >> 11)     -> 0b000000ggggg00000 (mask is consumed here)
+            "orr %[tmp], %[out],  %[tmp],  lsl #0x05\n\t"  // tmp = green | (blue << 5)     -> 0b0bbbbbggggg00000
+            "orr %[out], %[tmp],  %[gCol], lsr #0x1B\n\t"  // out = tmp | (gCol >> 27)      -> 0b0bbbbbgggggrrrrr
+            : [out] "+&r" (out), [tmp] "=&r" (tmp) // out is both read (mask) and written; both outputs are early-clobber
+            : [gCol] "r"  (gCol)
+            );
+
+       return out;
+}
+
+////////////////////////////////////////////////////////////////////////////////
+// Apply fast (low-precision) 5-bit lighting to bgr555 texture color:
+//
+// INPUT:
+//       'r5','g5','b5' are unsigned 5-bit color values, value of 16
+//         leaves that component of texture unmodified (LightLUT is i*j/16)
+//       'uSrc' input:  mbbbbbgggggrrrrr
+//                      ^ bit 16
+// RETURNS:
+//         u16 output:  mbbbbbgggggrrrrr
+// Where 'm' is the msb (bit 15) of uSrc, which is carried over to the output.
+////////////////////////////////////////////////////////////////////////////////
+GPU_INLINE uint_fast16_t gpuLightingTXTARM(uint_fast16_t uSrc, u8 r5, u8 g5, u8 b5) // ARM asm: LightLUT-modulate each texture component, keep uSrc's msb
+{
+       uint_fast16_t out = 0x03E0; // starts as the 5-bit field mask (bits 5-9); later reused for the red index and the result
+       u32 db, dg; // blue / green LUT index, then the looked-up value
+
+       // In the bit diagrams: lowercase = texture (uSrc) bits, uppercase = light (r5/g5/b5) bits
+       asm ("and    %[dg],  %[out],    %[src]  \n\t"             // dg  = src & mask        -> 0b000000ggggg00000
+            "orr    %[dg],  %[dg],     %[g5]   \n\t"             // dg |= g5                -> 0b000000gggggGGGGG (LUT index)
+            "and    %[db],  %[out],    %[src], lsr #0x05 \n\t"   // db  = (src >> 5) & mask -> 0b000000bbbbb00000
+            "ldrb   %[dg],  [%[lut],   %[dg]]  \n\t"             // dg  = lut[dg]           -> 0b00000000000ggggg
+            "and    %[out], %[out],    %[src], lsl #0x05 \n\t"   // out = (src << 5) & mask -> 0b000000rrrrr00000 (mask is consumed here)
+            "orr    %[out], %[out],    %[r5]   \n\t"             // out |= r5               -> 0b000000rrrrrRRRRR (LUT index)
+            "orr    %[db],  %[db],     %[b5]   \n\t"             // db |= b5                -> 0b000000bbbbbBBBBB (LUT index)
+            "ldrb   %[out], [%[lut],   %[out]] \n\t"             // out = lut[out]          -> 0b00000000000rrrrr
+            "ldrb   %[db],  [%[lut],   %[db]]  \n\t"             // db  = lut[db]           -> 0b00000000000bbbbb
+            "tst    %[src], #0x8000\n\t"                         // set flags from bit 15 of uSrc (consumed by orrne below)
+            "orr    %[out], %[out],    %[dg],  lsl #0x05   \n\t" // out |= green << 5       -> 0b000000gggggrrrrr
+            "orrne  %[out], %[out],    #0x8000\n\t"              // copy bit 15 of uSrc into the result
+            "orr    %[out], %[out],    %[db],  lsl #0x0A   \n\t" // out |= blue << 10       -> 0bmbbbbbgggggrrrrr
+            : [out] "=&r" (out), [db] "=&r" (db), [dg] "=&r" (dg)
+            : [r5] "r" (r5), [g5] "r" (g5),  [b5] "r" (b5),
+              [lut] "r" (gpu_senquack.LightLUT), [src] "r" (uSrc), "0" (out) // "0" ties the initial mask value to %[out]
+            : "cc"); // NOTE(review): LightLUT is read via ldrb but no memory operand/clobber is declared - confirm that is safe
+       return out;
+}
+
+////////////////////////////////////////////////////////////////////////////////
+// Apply fast (low-precision) 5-bit Gouraud lighting to bgr555 texture color:
+//
+// INPUT:
+//  'gCol' is a packed Gouraud u32 fixed-pt 8.3:8.3:8.2 rgb triplet; a component
+//     whose top five bits equal 16 does not modify color of texture
+//        gCol input :  rrrrrXXXXXXgggggXXXXXXbbbbbXXXXX
+//                      ^ bit 31
+//       'uSrc' input:  mbbbbbgggggrrrrr
+//                      ^ bit 16
+// RETURNS:
+//         u16 output:  mbbbbbgggggrrrrr
+// Where 'X' are fixed-pt bits, '0' is zero-padding, and '-' is don't care
+////////////////////////////////////////////////////////////////////////////////
+GPU_INLINE uint_fast16_t gpuLightingTXTGouraudARM(uint_fast16_t uSrc, u32 gCol) // ARM asm: LightLUT-modulate texture by packed Gouraud color, keep uSrc's msb
+{
+       uint_fast16_t out = 0x03E0; // starts as the 5-bit field mask (bits 5-9); later reused for the red index and the result
+       u32 db,dg,gtmp; // db/dg: blue/green LUT index then value; gtmp: light bits extracted from gCol
+
+       // In the bit diagrams: lowercase = texture (uSrc) bits, uppercase = light (gCol) bits
+       asm ("and    %[dg],  %[out],  %[src]   \n\t"           // dg   = src & mask          -> 0b000000ggggg00000
+            "and    %[gtmp],%[out],  %[gCol], lsr #0x0B \n\t" // gtmp = (gCol >> 11) & mask -> 0b000000GGGGG00000
+            "and    %[db],  %[out],  %[src],  lsr #0x05 \n\t" // db   = (src >> 5) & mask   -> 0b000000bbbbb00000
+            "orr    %[dg],  %[dg],   %[gtmp], lsr #0x05 \n\t" // dg  |= gtmp >> 5           -> 0b000000gggggGGGGG (LUT index)
+            "and    %[gtmp],%[out],  %[gCol]  \n\t"           // gtmp = gCol & mask         -> 0b000000BBBBB00000
+            "ldrb   %[dg],  [%[lut], %[dg]]   \n\t"           // dg   = lut[dg]             -> 0b00000000000ggggg
+            "and    %[out], %[out],  %[src],  lsl #0x05 \n\t" // out  = (src << 5) & mask   -> 0b000000rrrrr00000 (mask is consumed here)
+            "orr    %[out], %[out],  %[gCol], lsr #0x1B \n\t" // out |= gCol >> 27          -> 0b000000rrrrrRRRRR (LUT index)
+            "orr    %[db],  %[db],   %[gtmp], lsr #0x05 \n\t" // db  |= gtmp >> 5           -> 0b000000bbbbbBBBBB (LUT index)
+            "ldrb   %[out], [%[lut], %[out]]  \n\t"           // out  = lut[out]            -> 0b00000000000rrrrr
+            "ldrb   %[db],  [%[lut], %[db]]   \n\t"           // db   = lut[db]             -> 0b00000000000bbbbb
+            "tst    %[src], #0x8000\n\t"                      // set flags from bit 15 of uSrc (consumed by orrne below)
+            "orr    %[out], %[out],  %[dg],   lsl #0x05 \n\t" // out |= green << 5          -> 0b000000gggggrrrrr
+            "orrne  %[out], %[out],  #0x8000\n\t"             // copy bit 15 of uSrc into the result
+            "orr    %[out], %[out],  %[db],   lsl #0x0A \n\t" // out |= blue << 10          -> 0bmbbbbbgggggrrrrr
+            : [out] "=&r" (out), [db] "=&r" (db), [dg] "=&r" (dg),
+              [gtmp] "=&r" (gtmp) \
+            : [gCol] "r" (gCol), [lut] "r" (gpu_senquack.LightLUT), "0" (out), [src] "r" (uSrc)
+            : "cc"); // NOTE(review): LightLUT is read via ldrb but no memory operand/clobber is declared - confirm that is safe
+
+       return out;
+}
+
+#endif  //_OP_LIGHT_ARM_H_
diff --git a/plugins/gpu_senquack/gpu_inner_quantization.h b/plugins/gpu_senquack/gpu_inner_quantization.h
new file mode 100644 (file)
index 0000000..6432d03
--- /dev/null
@@ -0,0 +1,108 @@
+/***************************************************************************
+*   Copyright (C) 2016 PCSX4ALL Team                                      *
+*                                                                         *
+*   This program is free software; you can redistribute it and/or modify  *
+*   it under the terms of the GNU General Public License as published by  *
+*   the Free Software Foundation; either version 2 of the License, or     *
+*   (at your option) any later version.                                   *
+*                                                                         *
+*   This program is distributed in the hope that it will be useful,       *
+*   but WITHOUT ANY WARRANTY; without even the implied warranty of        *
+*   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the         *
+*   GNU General Public License for more details.                          *
+*                                                                         *
+*   You should have received a copy of the GNU General Public License     *
+*   along with this program; if not, write to the                         *
+*   Free Software Foundation, Inc.,                                       *
+*   51 Franklin Street, Fifth Floor, Boston, MA 02111-1307 USA.           *
+***************************************************************************/
+
+#ifndef _OP_DITHER_H_
+#define _OP_DITHER_H_
+
+static void SetupDitheringConstants()
+{
+       // Initialize Dithering Constants
+       // The screen is divided into 8x8 chunks and sub-unitary noise is applied
+       // using the following matrix. This ensures that data lost in color
+       // quantization will be added back to the image 'by chance' in predictable
+       // patterns that are naturally 'smoothed' by your sight when viewed from a
+       // certain distance.
+       //
+       // http://caca.zoy.org/study/index.html
+       //
+       // Shading colors are encoded in 4.5, and then are quantitized to 5.0,
+       // DitherMatrix constants reflect that.
+
+       static const u8 DitherMatrix[] = {
+                0, 32,  8, 40,  2, 34, 10, 42,
+               48, 16, 56, 24, 50, 18, 58, 26,
+               12, 44,  4, 36, 14, 46,  6, 38,
+               60, 28, 52, 20, 62, 30, 54, 22,
+                3, 35, 11, 43,  1, 33,  9, 41,
+               51, 19, 59, 27, 49, 17, 57, 25,
+               15, 47,  7, 39, 13, 45,  5, 37,
+               63, 31, 55, 23, 61, 29, 53, 21
+       };
+
+       int i, j;
+       for (i = 0; i < 8; i++)
+       {
+               for (j = 0; j < 8; j++)
+               {
+                       u16 offset = (i << 3) | j;
+
+                       u32 component = ((DitherMatrix[offset] + 1) << 4) / 65; //[5.5] -> [5]
+
+                       // XXX - senquack - hack Dec 2016
+                       //  Until JohnnyF gets the time to work further on dithering,
+                       //   force lower bit of component to 0. This fixes grid pattern
+                       //   affecting quality of dithered image, as well as loss of
+                       //   detail in dark areas. With lower bit unset like this, existing
+                       //   27-bit accuracy of dithering math is unneeded, could be 24-bit.
+                       //   Is 8x8 matrix overkill as a result, can we use 4x4?
+                       component &= ~1;
+
+                       gpu_senquack.DitherMatrix[offset] = (component)
+                                                     | (component << 10)
+                                                     | (component << 20);
+               }
+       }
+}
+
+////////////////////////////////////////////////////////////////////////////////
+// Convert padded u32 5.4:5.4:5.4 bgr fixed-pt triplet to final bgr555 color,
+//  applying dithering if specified by template parameter.
+//
+// INPUT:
+//     'uSrc24' input: 000bbbbbXXXX0gggggXXXX0rrrrrXXXX
+//                     ^ bit 31
+//       'pDst' is a pointer to destination framebuffer pixel, used
+//         to determine which DitherMatrix[] entry to apply.
+// RETURNS:
+//         u16 output: 0bbbbbgggggrrrrr
+//                     ^ bit 16
+// Where 'X' are fixed-pt bits, '0' is zero-padding, and '-' is don't care
+////////////////////////////////////////////////////////////////////////////////
+template <int DITHER>
+GPU_INLINE u16 gpuColorQuantization24(u32 uSrc24, const u16 *pDst)
+{
+       if (DITHER)
+       {
+               u16 fbpos  = (u32)(pDst - gpu_senquack.vram);
+               u16 offset = ((fbpos & (0x7 << 10)) >> 7) | (fbpos & 0x7);
+
+               //clean overflow flags and add
+               uSrc24 = (uSrc24 & 0x1FF7FDFF) + gpu_senquack.DitherMatrix[offset];
+
+               if (uSrc24 & (1<< 9)) uSrc24 |= (0x1FF    );
+               if (uSrc24 & (1<<19)) uSrc24 |= (0x1FF<<10);
+               if (uSrc24 & (1<<29)) uSrc24 |= (0x1FF<<20);
+       }
+
+       return ((uSrc24>> 4) & (0x1F    ))
+            | ((uSrc24>> 9) & (0x1F<<5 ))
+            | ((uSrc24>>14) & (0x1F<<10));
+}
+
+#endif //_OP_DITHER_H_
diff --git a/plugins/gpu_senquack/gpu_raster_image.h b/plugins/gpu_senquack/gpu_raster_image.h
new file mode 100644 (file)
index 0000000..8e8064c
--- /dev/null
@@ -0,0 +1,220 @@
+/***************************************************************************
+ *   Copyright (C) 2010 PCSX4ALL Team                                      *
+ *   Copyright (C) 2010 Unai                                               *
+ *                                                                         *
+ *   This program is free software; you can redistribute it and/or modify  *
+ *   it under the terms of the GNU General Public License as published by  *
+ *   the Free Software Foundation; either version 2 of the License, or     *
+ *   (at your option) any later version.                                   *
+ *                                                                         *
+ *   This program is distributed in the hope that it will be useful,       *
+ *   but WITHOUT ANY WARRANTY; without even the implied warranty of        *
+ *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the         *
+ *   GNU General Public License for more details.                          *
+ *                                                                         *
+ *   You should have received a copy of the GNU General Public License     *
+ *   along with this program; if not, write to the                         *
+ *   Free Software Foundation, Inc.,                                       *
+ *   51 Franklin Street, Fifth Floor, Boston, MA 02111-1307 USA.           *
+ ***************************************************************************/
+
+#ifndef __GPU_UNAI_GPU_RASTER_IMAGE_H__
+#define __GPU_UNAI_GPU_RASTER_IMAGE_H__
+
+///////////////////////////////////////////////////////////////////////////////
+#ifndef USE_GPULIB
+void gpuLoadImage(PtrUnion packet)
+{
+       u16 x0, y0, w0, h0;
+       x0 = packet.U2[2] & 1023;
+       y0 = packet.U2[3] & 511;
+       w0 = packet.U2[4];
+       h0 = packet.U2[5];
+
+       if ((y0 + h0) > FRAME_HEIGHT)
+       {
+               h0 = FRAME_HEIGHT - y0;
+       }
+
+       gpu_senquack.dma.FrameToWrite = ((w0)&&(h0));
+
+       gpu_senquack.dma.px = 0;
+       gpu_senquack.dma.py = 0;
+       gpu_senquack.dma.x_end = w0;
+       gpu_senquack.dma.y_end = h0;
+       gpu_senquack.dma.pvram = &((u16*)gpu_senquack.vram)[x0+(y0*1024)];
+
+       gpu_senquack.GPU_GP1 |= 0x08000000;
+}
+#endif // !USE_GPULIB
+
+///////////////////////////////////////////////////////////////////////////////
+#ifndef USE_GPULIB
+void gpuStoreImage(PtrUnion packet)
+{
+       u16 x0, y0, w0, h0;
+       x0 = packet.U2[2] & 1023;
+       y0 = packet.U2[3] & 511;
+       w0 = packet.U2[4];
+       h0 = packet.U2[5];
+
+       if ((y0 + h0) > FRAME_HEIGHT)
+       {
+               h0 = FRAME_HEIGHT - y0;
+       }
+       gpu_senquack.dma.FrameToRead = ((w0)&&(h0));
+
+       gpu_senquack.dma.px = 0;
+       gpu_senquack.dma.py = 0;
+       gpu_senquack.dma.x_end = w0;
+       gpu_senquack.dma.y_end = h0;
+       gpu_senquack.dma.pvram = &((u16*)gpu_senquack.vram)[x0+(y0*1024)];
+       
+       gpu_senquack.GPU_GP1 |= 0x08000000;
+}
+#endif // !USE_GPULIB
+
+void gpuMoveImage(PtrUnion packet) // Copies a w0 x h0 pixel rectangle inside VRAM from (x0,y0) to (x1,y1)
+{
+       u32 x0, y0, x1, y1; // source / destination origin; x0 and x1 are reused further down as loop counter and row stride
+       s32 w0, h0;         // rectangle size in 16-bit pixels
+       x0 = packet.U2[2] & 1023;
+       y0 = packet.U2[3] & 511;
+       x1 = packet.U2[4] & 1023;
+       y1 = packet.U2[5] & 511;
+       w0 = packet.U2[6];
+       h0 = packet.U2[7];
+
+       if( (x0==x1) && (y0==y1) ) return; // source and destination coincide: nothing to copy
+       if ((w0<=0) || (h0<=0)) return; // empty rectangle; the do/while row loops below rely on h0 >= 1 and w0 >= 1
+       
+       #ifdef ENABLE_GPU_LOG_SUPPORT
+               fprintf(stdout,"gpuMoveImage(x0=%u,y0=%u,x1=%u,y1=%u,w0=%d,h0=%d)\n",x0,y0,x1,y1,w0,h0);
+       #endif
+       
+       if (((y0+h0)>512)||((x0+w0)>1024)||((y1+h0)>512)||((x1+w0)>1024)) // slow path: a rectangle crosses the 1024x512 edge
+       {
+               u16 *psxVuw=gpu_senquack.vram;
+               s32 i,j;
+           for(j=0;j<h0;j++)
+                for(i=0;i<w0;i++)
+                 psxVuw [(1024*((y1+j)&511))+((x1+i)&0x3ff)]= // per-pixel copy with both coordinates wrapped
+                  psxVuw[(1024*((y0+j)&511))+((x0+i)&0x3ff)];
+       }
+       else if ((x0&1)||(x1&1)) // an odd x on either side: copy one 16-bit pixel at a time
+       {
+               u16 *lpDst, *lpSrc;
+               lpDst = lpSrc = (u16*)gpu_senquack.vram;
+               lpSrc += FRAME_OFFSET(x0, y0);
+               lpDst += FRAME_OFFSET(x1, y1);
+               x1 = FRAME_WIDTH - w0; // x1 now holds the pixels to skip to reach the next row
+               do {
+                       x0=w0; // x0 now counts the pixels left in this row
+                       do { *lpDst++ = *lpSrc++; } while (--x0);
+                       lpDst += x1;
+                       lpSrc += x1;
+               } while (--h0);
+       }
+       else // both x even: copy two pixels at a time through u32 pointers
+       {
+               u32 *lpDst, *lpSrc;
+               lpDst = lpSrc = (u32*)(void*)gpu_senquack.vram;
+               lpSrc += ((FRAME_OFFSET(x0, y0))>>1);
+               lpDst += ((FRAME_OFFSET(x1, y1))>>1);
+               if (w0&1) // odd width: whole pixel pairs plus one trailing 16-bit pixel per row
+               {
+                       x1 = (FRAME_WIDTH - w0 +1)>>1; // row skip in u32 units, measured from the trailing pixel
+                       w0>>=1; // w0 now counts pixel pairs
+                       if (!w0) { // width was 1: no pairs, so the inner do/while must not run
+                               do {
+                                       *((u16*)lpDst) = *((u16*)lpSrc);
+                                       lpDst += x1;
+                                       lpSrc += x1;
+                               } while (--h0);
+                       } else
+                       do {
+                               x0=w0;
+                               do { *lpDst++ = *lpSrc++; } while (--x0);
+                               *((u16*)lpDst) = *((u16*)lpSrc); // trailing odd pixel
+                               lpDst += x1;
+                               lpSrc += x1;
+                       } while (--h0);
+               }
+               else // even width: rows are whole pixel pairs
+               {
+                       x1 = (FRAME_WIDTH - w0)>>1; // row skip in u32 units
+                       w0>>=1; // w0 now counts pixel pairs (>= 1 here)
+                       do {
+                               x0=w0;
+                               do { *lpDst++ = *lpSrc++; } while (--x0);
+                               lpDst += x1;
+                               lpSrc += x1;
+                       } while (--h0);
+               }
+       }
+}
+
+void gpuClearImage(PtrUnion packet)
+{
+       s32   x0, y0, w0, h0;
+       x0 = packet.S2[2];
+       y0 = packet.S2[3];
+       w0 = packet.S2[4] & 0x3ff;
+       h0 = packet.S2[5] & 0x3ff;
+        
+       w0 += x0;
+       if (x0 < 0) x0 = 0;
+       if (w0 > FRAME_WIDTH) w0 = FRAME_WIDTH;
+       w0 -= x0;
+       if (w0 <= 0) return;
+       h0 += y0;
+       if (y0 < 0) y0 = 0;
+       if (h0 > FRAME_HEIGHT) h0 = FRAME_HEIGHT;
+       h0 -= y0;
+       if (h0 <= 0) return;
+
+       #ifdef ENABLE_GPU_LOG_SUPPORT
+               fprintf(stdout,"gpuClearImage(x0=%d,y0=%d,w0=%d,h0=%d)\n",x0,y0,w0,h0);
+       #endif
+       
+       if (x0&1)
+       {
+               u16* pixel = (u16*)gpu_senquack.vram + FRAME_OFFSET(x0, y0);
+               u16 rgb = GPU_RGB16(packet.U4[0]);
+               y0 = FRAME_WIDTH - w0;
+               do {
+                       x0=w0;
+                       do { *pixel++ = rgb; } while (--x0);
+                       pixel += y0;
+               } while (--h0);
+       }
+       else
+       {
+               u32* pixel = (u32*)gpu_senquack.vram + ((FRAME_OFFSET(x0, y0))>>1);
+               u32 rgb = GPU_RGB16(packet.U4[0]);
+               rgb |= (rgb<<16);
+               if (w0&1)
+               {
+                       y0 = (FRAME_WIDTH - w0 +1)>>1;
+                       w0>>=1;
+                       do {
+                               x0=w0;
+                               do { *pixel++ = rgb; } while (--x0);
+                               *((u16*)pixel) = (u16)rgb;
+                               pixel += y0;
+                       } while (--h0);
+               }
+               else
+               {
+                       y0 = (FRAME_WIDTH - w0)>>1;
+                       w0>>=1;
+                       do {
+                               x0=w0;
+                               do { *pixel++ = rgb; } while (--x0);
+                               pixel += y0;
+                       } while (--h0);
+               }
+       }
+}
+
+#endif /* __GPU_UNAI_GPU_RASTER_IMAGE_H__ */
diff --git a/plugins/gpu_senquack/gpu_raster_line.h b/plugins/gpu_senquack/gpu_raster_line.h
new file mode 100644 (file)
index 0000000..4dd99a6
--- /dev/null
@@ -0,0 +1,720 @@
+/***************************************************************************
+*   Copyright (C) 2010 PCSX4ALL Team                                      *
+*   Copyright (C) 2010 Unai                                               *
+*   Copyright (C) 2016 Senquack (dansilsby <AT> gmail <DOT> com)          *
+*                                                                         *
+*   This program is free software; you can redistribute it and/or modify  *
+*   it under the terms of the GNU General Public License as published by  *
+*   the Free Software Foundation; either version 2 of the License, or     *
+*   (at your option) any later version.                                   *
+*                                                                         *
+*   This program is distributed in the hope that it will be useful,       *
+*   but WITHOUT ANY WARRANTY; without even the implied warranty of        *
+*   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the         *
+*   GNU General Public License for more details.                          *
+*                                                                         *
+*   You should have received a copy of the GNU General Public License     *
+*   along with this program; if not, write to the                         *
+*   Free Software Foundation, Inc.,                                       *
+*   51 Franklin Street, Fifth Floor, Boston, MA 02111-1307 USA.           *
+***************************************************************************/
+
+#ifndef __GPU_UNAI_GPU_RASTER_LINE_H__
+#define __GPU_UNAI_GPU_RASTER_LINE_H__
+
+///////////////////////////////////////////////////////////////////////////////
+//  GPU internal line drawing functions
+//
+// Rewritten October 2016 by senquack:
+//  Instead of one pixel at a time, lines are now drawn in runs of pixels,
+//  whether vertical, horizontal, or diagonal. A new inner driver
+//  'gpuPixelSpanFn' is used, as well as an enhanced Bresenham run-slice
+//  algorithm. For more information, see the following:
+//
+//  Michael Abrash - Graphics Programming Black Book
+//  Chapters 35 - 36 (does not implement diagonal runs)
+//  http://www.drdobbs.com/parallel/graphics-programming-black-book/184404919
+//  http://www.jagregory.com/abrash-black-book/
+//
+//  Article by Andrew Delong (does not implement diagonal runs)
+//  http://timetraces.ca/nw/drawline.htm
+//
+//  'Run-Based Multi-Point Line Drawing' by Eun Jae Lee & Larry F. Hodges
+//  https://smartech.gatech.edu/bitstream/handle/1853/3632/93-22.pdf
+//  Provided the idea of doing a half-octant transform allowing lines with
+//  slopes between 0.5 and 2.0 (diagonal runs of pixels) to be handled
+//  identically to the traditional horizontal/vertical run-slice method.
+
+// Use 16.16 fixed point precision for line math.
+// NOTE: Gouraud colors used by gpuPixelSpanFn can use a different precision.
+#define GPU_LINE_FIXED_BITS 16
+
+// If defined, Gouraud lines will use fixed-point multiply-by-inverse to
+// do most divisions. With enough accuracy, this should be OK.
+#define USE_LINES_ALL_FIXED_PT_MATH
+
+// Flat-shaded line: draws a single-color line between the two endpoints held
+//  in 'packet', clipped to gpu_senquack.DrawingArea. Pixels are emitted in
+//  runs through 'gpuPixelSpanDriver', called as (dst, color, byte_incr, count).
+void gpuDrawLineF(PtrUnion packet, const PSD gpuPixelSpanDriver)
+{
+       int x0, y0, x1, y1; // Endpoints: packet coords plus DrawingOffset
+       int dx, dy;         // Endpoint deltas (x1-x0, y1-y0), updated as clipping proceeds
+
+       // All three of these variables should be signed (so multiplication works)
+       ptrdiff_t sx;  // Sign of x delta: +1 when x0 < x1, -1 when x0 > x1, 0 when vertical
+       const ptrdiff_t dst_depth  = FRAME_BYTES_PER_PIXEL; // PSX: 2 bytes per pixel
+       const ptrdiff_t dst_stride = FRAME_BYTE_STRIDE;     // PSX: 2048 bytes per framebuffer line
+
+       // Clip region: xmax/ymax seem to normally be one *past* the rightmost/
+       //  bottommost pixels of the draw area. Since we render every pixel between
+       //  and including both line endpoints, subtract one from xmax/ymax.
+       const int xmin = gpu_senquack.DrawingArea[0];
+       const int ymin = gpu_senquack.DrawingArea[1];
+       const int xmax = gpu_senquack.DrawingArea[2] - 1;
+       const int ymax = gpu_senquack.DrawingArea[3] - 1;
+
+       x0 = GPU_EXPANDSIGN(packet.S2[2]) + gpu_senquack.DrawingOffset[0];
+       y0 = GPU_EXPANDSIGN(packet.S2[3]) + gpu_senquack.DrawingOffset[1];
+       x1 = GPU_EXPANDSIGN(packet.S2[4]) + gpu_senquack.DrawingOffset[0];
+       y1 = GPU_EXPANDSIGN(packet.S2[5]) + gpu_senquack.DrawingOffset[1];
+
+       // Always draw top to bottom, so ensure y0 <= y1
+       if (y0 > y1) {
+               SwapValues(y0, y1);
+               SwapValues(x0, x1);
+       }
+
+       // Is line totally outside Y clipping range?
+       if (y0 > ymax || y1 < ymin) return;
+
+       dx = x1 - x0;
+       dy = y1 - y0; // Never negative, because y0 <= y1 was ensured above
+
+       // X-axis range check : max distance between any two X coords is 1023
+       // (PSX hardware will not render anything violating this rule)
+       // NOTE: We'll check y coord range further below
+       if (dx >= CHKMAX_X || dx <= -CHKMAX_X)
+               return;
+
+       // Y-axis range check and clipping
+       if (dy) {
+               // Y-axis range check : max distance between any two Y coords is 511
+               // (PSX hardware will not render anything violating this rule)
+               if (dy >= CHKMAX_Y)
+                       return;
+
+               // We already know y0 < y1
+               if (y0 < ymin) {
+                       x0 += GPU_FAST_DIV(((ymin - y0) * dx), dy); // Slide x0 to where the line crosses y == ymin
+                       y0 = ymin;
+               }
+               if (y1 > ymax) {
+                       x1 += GPU_FAST_DIV(((ymax - y1) * dx), dy); // Slide x1 to where the line crosses y == ymax
+                       y1 = ymax;
+               }
+
+               // Recompute in case clipping occurred:
+               dx = x1 - x0;
+               dy = y1 - y0;
+       }
+
+       // Check X clipping range, set 'sx' x-direction variable
+       if (dx == 0) {
+               // Is vertical line totally outside X clipping range?
+               if (x0 < xmin || x0 > xmax)
+                       return;
+               sx = 0;
+       } else {
+               if (dx > 0) {
+                       // x0 is leftmost coordinate
+                       if (x0 > xmax) return; // Both points outside X clip range
+
+                       if (x0 < xmin) {
+                               if (x1 < xmin) return; // Both points outside X clip range
+                               y0 += GPU_FAST_DIV(((xmin - x0) * dy), dx); // Slide y0 to where the line crosses x == xmin
+                               x0 = xmin;
+                       }
+
+                       if (x1 > xmax) {
+                               y1 += GPU_FAST_DIV(((xmax - x1) * dy), dx); // Slide y1 to where the line crosses x == xmax
+                               x1 = xmax;
+                       }
+
+                       sx = +1;
+                       dx = x1 - x0; // Get final value, which should also be absolute value
+               } else {
+                       // x1 is leftmost coordinate
+                       if (x1 > xmax) return; // Both points outside X clip range
+
+                       if (x1 < xmin) {
+                               if (x0 < xmin) return; // Both points outside X clip range
+
+                               y1 += GPU_FAST_DIV(((xmin - x1) * dy), dx); // Slide y1 to where the line crosses x == xmin
+                               x1 = xmin;
+                       }
+
+                       if (x0 > xmax) {
+                               y0 += GPU_FAST_DIV(((xmax - x0) * dy), dx); // Slide y0 to where the line crosses x == xmax
+                               x0 = xmax;
+                       }
+
+                       sx = -1;
+                       dx = x0 - x1; // Get final value, which should also be absolute value
+               }
+
+               // Recompute in case clipping occurred:
+               dy = y1 - y0;
+       }
+
+       // IMPORTANT: dx,dy should now contain their absolute values
+
+       int min_length,    // Minimum length of a pixel run
+           start_length,  // Length of first run
+           end_length,    // Length of last run
+           err_term,      // Cumulative error to determine when to draw longer run
+           err_adjup,     // Increment to err_term for each run drawn
+           err_adjdown;   // Subtract this from err_term after drawing longer run
+
+       // Color to draw with (16 bits, highest of which is unset mask bit)
+       uintptr_t col16 = GPU_RGB16(packet.U4[0]);
+
+       // We use u8 pointers even though PS1 has u16 framebuffer.
+       //  This allows pixel-drawing functions to increment dst pointer
+       //  directly by the passed 'incr' value, not having to shift it first.
+       u8 *dst = (u8*)gpu_senquack.vram + y0 * dst_stride + x0 * dst_depth;
+
+       // SPECIAL CASE: Vertical line
+       if (dx == 0) {
+               gpuPixelSpanDriver(dst, col16, dst_stride, dy+1); // dy+1 pixels, one framebuffer line apart
+               return;
+       }
+
+       // SPECIAL CASE: Horizontal line
+       if (dy == 0) {
+               gpuPixelSpanDriver(dst, col16, sx * dst_depth, dx+1); // dx+1 pixels, stepping left or right per 'sx'
+               return;
+       }
+
+       // SPECIAL CASE: Diagonal line
+       if (dx == dy) {
+               gpuPixelSpanDriver(dst, col16, dst_stride + (sx * dst_depth), dy+1); // One line down and one pixel sideways per step
+               return;
+       }
+
+       int       major, minor;             // Major axis, minor axis
+       ptrdiff_t incr_major, incr_minor;   // Ptr increment for each step along axis
+
+       if (dx > dy) {
+               major = dx;
+               minor = dy;
+       } else {
+               major = dy;
+               minor = dx;
+       }
+
+       // Determine if diagonal or horizontal runs
+       if (major < (2 * minor)) {
+               // Diagonal runs, so perform half-octant transformation
+               minor = major - minor; // The non-diagonal steps become the new minor delta
+
+               // Advance diagonally when drawing runs
+               incr_major = dst_stride + (sx * dst_depth);
+
+               // After drawing each run, correct for over-advance along minor axis
+               if (dx > dy)
+                       incr_minor = -dst_stride;
+               else
+                       incr_minor = -sx * dst_depth;
+       } else {
+               // Horizontal or vertical runs
+               if (dx > dy) {
+                       incr_major = sx * dst_depth;
+                       incr_minor = dst_stride;
+               } else {
+                       incr_major = dst_stride;
+                       incr_minor = sx * dst_depth;
+               }
+       }
+
+       if (minor > 1) {
+               // Minimum number of pixels each run
+               min_length = major / minor;
+
+               // Initial error term; reflects an initial step of 0.5 along minor axis
+               err_term = (major % minor) - (minor * 2);
+
+               // Increment err_term this much each step along minor axis; when
+               //  err_term crosses zero, draw longer pixel run.
+               err_adjup = (major % minor) * 2;
+       } else {
+               min_length = major; // Only the first and last runs get drawn in this case
+               err_term = 0;
+               err_adjup = 0;
+       }
+
+       // Error term adjustment when err_term turns over; used to factor
+       //  out the major-axis step made at that time
+       err_adjdown = minor * 2;
+
+       // The initial and last runs are partial, because minor axis advances
+       //  only 0.5 for these runs, rather than 1. Each is half a full run,
+       //  plus the initial pixel.
+       start_length = end_length = (min_length / 2) + 1;
+
+       if (min_length & 1) {
+               // If there're an odd number of pixels per run, we have 1 pixel that
+               //  can't be allocated to either the initial or last partial run, so
+               //  we'll add 0.5 to err_term so that this pixel will be handled
+               //  by the normal full-run loop
+               err_term += minor;
+       } else {
+               // If the minimum run length is even and there's no fractional advance,
+               // we have one pixel that could go to either the initial or last
+               // partial run, which we arbitrarily allocate to the last run
+               if (err_adjup == 0)
+                       start_length--; // Leave out the extra pixel at the start
+       }
+
+       // First run of pixels
+       dst = gpuPixelSpanDriver(dst, col16, incr_major, start_length); // Continue from the pointer the driver returns
+       dst += incr_minor; // Step along the minor axis before the next run
+
+       // Middle runs of pixels
+       while (--minor > 0) {
+               int run_length = min_length;
+               err_term += err_adjup;
+
+               // If err_term passed 0, reset it and draw longer run
+               if (err_term > 0) {
+                       err_term -= err_adjdown;
+                       run_length++;
+               }
+
+               dst = gpuPixelSpanDriver(dst, col16, incr_major, run_length);
+               dst += incr_minor;
+       }
+
+       // Final run of pixels
+       gpuPixelSpanDriver(dst, col16, incr_major, end_length);
+}
+
+/////////////////////////
+// Gouraud-shaded line //
+/////////////////////////
+void gpuDrawLineG(PtrUnion packet, const PSD gpuPixelSpanDriver)
+{
+       int x0, y0, x1, y1;
+       int dx, dy, dr, dg, db;
+       u32 r0, g0, b0, r1, g1, b1;
+
+       // All three of these variables should be signed (so multiplication works)
+       ptrdiff_t sx;  // Sign of x delta, positive when x0 < x1
+       const ptrdiff_t dst_depth  = FRAME_BYTES_PER_PIXEL; // PSX: 2 bytes per pixel
+       const ptrdiff_t dst_stride = FRAME_BYTE_STRIDE;     // PSX: 2048 bytes per framebuffer line
+
+       // Clip region: xmax/ymax seem to normally be one *past* the rightmost/
+       //  bottommost pixels of the draw area. We'll render every pixel between
+       //  and including both line endpoints, so subtract one from xmax/ymax.
+       const int xmin = gpu_senquack.DrawingArea[0];
+       const int ymin = gpu_senquack.DrawingArea[1];
+       const int xmax = gpu_senquack.DrawingArea[2] - 1;
+       const int ymax = gpu_senquack.DrawingArea[3] - 1;
+
+       x0 = GPU_EXPANDSIGN(packet.S2[2]) + gpu_senquack.DrawingOffset[0];
+       y0 = GPU_EXPANDSIGN(packet.S2[3]) + gpu_senquack.DrawingOffset[1];
+       x1 = GPU_EXPANDSIGN(packet.S2[6]) + gpu_senquack.DrawingOffset[0];
+       y1 = GPU_EXPANDSIGN(packet.S2[7]) + gpu_senquack.DrawingOffset[1];
+
+       u32 col0 = packet.U4[0];
+       u32 col1 = packet.U4[2];
+
+       // Always draw top to bottom, so ensure y0 <= y1
+       if (y0 > y1) {
+               SwapValues(y0, y1);
+               SwapValues(x0, x1);
+               SwapValues(col0, col1);
+       }
+
+       // Is line totally outside Y clipping range?
+       if (y0 > ymax || y1 < ymin) return;
+
+       // If defined, Gouraud colors are fixed-point 5.11, otherwise they are 8.16
+       // (This is only beneficial if using SIMD-optimized pixel driver)
+#ifdef GPU_GOURAUD_LOW_PRECISION
+       r0 = (col0 >> 3) & 0x1f;  g0 = (col0 >> 11) & 0x1f;  b0 = (col0 >> 19) & 0x1f;
+       r1 = (col1 >> 3) & 0x1f;  g1 = (col1 >> 11) & 0x1f;  b1 = (col1 >> 19) & 0x1f;
+#else
+       r0 = col0 & 0xff;  g0 = (col0 >> 8) & 0xff;  b0 = (col0 >> 16) & 0xff;
+       r1 = col1 & 0xff;  g1 = (col1 >> 8) & 0xff;  b1 = (col1 >> 16) & 0xff;
+#endif
+
+       dx = x1 - x0;
+       dy = y1 - y0;
+       dr = r1 - r0;
+       dg = g1 - g0;
+       db = b1 - b0;
+
+       // X-axis range check : max distance between any two X coords is 1023
+       // (PSX hardware will not render anything violating this rule)
+       // NOTE: We'll check y coord range further below
+       if (dx >= CHKMAX_X || dx <= -CHKMAX_X)
+               return;
+
+       // Y-axis range check and clipping
+       if (dy) {
+               // Y-axis range check : max distance between any two Y coords is 511
+               // (PSX hardware will not render anything violating this rule)
+               if (dy >= CHKMAX_Y)
+                       return;
+
+               // We already know y0 < y1
+               if (y0 < ymin) {
+#ifdef USE_LINES_ALL_FIXED_PT_MATH
+                       s32 factor = GPU_FAST_DIV(((ymin - y0) << GPU_LINE_FIXED_BITS), dy);
+                       x0 += (dx * factor) >> GPU_LINE_FIXED_BITS;
+                       r0 += (dr * factor) >> GPU_LINE_FIXED_BITS;
+                       g0 += (dg * factor) >> GPU_LINE_FIXED_BITS;
+                       b0 += (db * factor) >> GPU_LINE_FIXED_BITS;
+#else
+                       x0 += (ymin - y0) * dx / dy;
+                       r0 += (ymin - y0) * dr / dy;
+                       g0 += (ymin - y0) * dg / dy;
+                       b0 += (ymin - y0) * db / dy;
+#endif
+                       y0 = ymin;
+               }
+
+               if (y1 > ymax) {
+#ifdef USE_LINES_ALL_FIXED_PT_MATH
+                       s32 factor = GPU_FAST_DIV(((ymax - y1) << GPU_LINE_FIXED_BITS), dy);
+                       x1 += (dx * factor) >> GPU_LINE_FIXED_BITS;
+                       r1 += (dr * factor) >> GPU_LINE_FIXED_BITS;
+                       g1 += (dg * factor) >> GPU_LINE_FIXED_BITS;
+                       b1 += (db * factor) >> GPU_LINE_FIXED_BITS;
+#else
+                       x1 += (ymax - y1) * dx / dy;
+                       r1 += (ymax - y1) * dr / dy;
+                       g1 += (ymax - y1) * dg / dy;
+                       b1 += (ymax - y1) * db / dy;
+#endif
+                       y1 = ymax;
+               }
+
+               // Recompute in case clipping occurred:
+               dx = x1 - x0;
+               dy = y1 - y0;
+               dr = r1 - r0;
+               dg = g1 - g0;
+               db = b1 - b0;
+       }
+
+       // Check X clipping range, set 'sx' x-direction variable
+       if (dx == 0) {
+               // Is vertical line totally outside X clipping range?
+               if (x0 < xmin || x0 > xmax)
+                       return;
+               sx = 0;
+       } else {
+               if (dx > 0) {
+                       // x0 is leftmost coordinate
+                       if (x0 > xmax) return; // Both points outside X clip range
+
+                       if (x0 < xmin) {
+                               if (x1 < xmin) return; // Both points outside X clip range
+
+#ifdef USE_LINES_ALL_FIXED_PT_MATH
+                               s32 factor = GPU_FAST_DIV(((xmin - x0) << GPU_LINE_FIXED_BITS), dx);
+                               y0 += (dy * factor) >> GPU_LINE_FIXED_BITS;
+                               r0 += (dr * factor) >> GPU_LINE_FIXED_BITS;
+                               g0 += (dg * factor) >> GPU_LINE_FIXED_BITS;
+                               b0 += (db * factor) >> GPU_LINE_FIXED_BITS;
+#else
+                               y0 += (xmin - x0) * dy / dx;
+                               r0 += (xmin - x0) * dr / dx;
+                               g0 += (xmin - x0) * dg / dx;
+                               b0 += (xmin - x0) * db / dx;
+#endif
+                               x0 = xmin;
+                       }
+
+                       if (x1 > xmax) {
+#ifdef USE_LINES_ALL_FIXED_PT_MATH
+                               s32 factor = GPU_FAST_DIV(((xmax - x1) << GPU_LINE_FIXED_BITS), dx);
+                               y1 += (dy * factor) >> GPU_LINE_FIXED_BITS;
+                               r1 += (dr * factor) >> GPU_LINE_FIXED_BITS;
+                               g1 += (dg * factor) >> GPU_LINE_FIXED_BITS;
+                               b1 += (db * factor) >> GPU_LINE_FIXED_BITS;
+#else
+                               y1 += (xmax - x1) * dy / dx;
+                               r1 += (xmax - x1) * dr / dx;
+                               g1 += (xmax - x1) * dg / dx;
+                               b1 += (xmax - x1) * db / dx;
+#endif
+                               x1 = xmax;
+                       }
+
+                       sx = +1;
+                       dx = x1 - x0; // Get final value, which should also be absolute value
+               } else {
+                       // x1 is leftmost coordinate
+                       if (x1 > xmax) return; // Both points outside X clip range
+
+                       if (x1 < xmin) {
+                               if (x0 < xmin) return; // Both points outside X clip range
+
+#ifdef USE_LINES_ALL_FIXED_PT_MATH
+                               s32 factor = GPU_FAST_DIV(((xmin - x1) << GPU_LINE_FIXED_BITS), dx);
+                               y1 += (dy * factor) >> GPU_LINE_FIXED_BITS;
+                               r1 += (dr * factor) >> GPU_LINE_FIXED_BITS;
+                               g1 += (dg * factor) >> GPU_LINE_FIXED_BITS;
+                               b1 += (db * factor) >> GPU_LINE_FIXED_BITS;
+#else
+                               y1 += (xmin - x1) * dy / dx;
+                               r1 += (xmin - x1) * dr / dx;
+                               g1 += (xmin - x1) * dg / dx;
+                               b1 += (xmin - x1) * db / dx;
+#endif
+                               x1 = xmin;
+                       }
+
+                       if (x0 > xmax) {
+#ifdef USE_LINES_ALL_FIXED_PT_MATH
+                               s32 factor = GPU_FAST_DIV(((xmax - x0) << GPU_LINE_FIXED_BITS), dx);
+                               y0 += (dy * factor) >> GPU_LINE_FIXED_BITS;
+                               r0 += (dr * factor) >> GPU_LINE_FIXED_BITS;
+                               g0 += (dg * factor) >> GPU_LINE_FIXED_BITS;
+                               b0 += (db * factor) >> GPU_LINE_FIXED_BITS;
+#else
+                               y0 += (xmax - x0) * dy / dx;
+                               r0 += (xmax - x0) * dr / dx;
+                               g0 += (xmax - x0) * dg / dx;
+                               b0 += (xmax - x0) * db / dx;
+#endif
+                               x0 = xmax;
+                       }
+
+                       sx = -1;
+                       dx = x0 - x1; // Get final value, which should also be absolute value
+               }
+
+               // Recompute in case clipping occurred:
+               dy = y1 - y0;
+               dr = r1 - r0;
+               dg = g1 - g0;
+               db = b1 - b0;
+       }
+
+       // IMPORTANT: dx,dy should now contain their absolute values
+
+       int min_length,    // Minimum length of a pixel run
+           start_length,  // Length of first run
+           end_length,    // Length of last run
+           err_term,      // Cumulative error to determine when to draw longer run
+           err_adjup,     // Increment to err_term for each run drawn
+           err_adjdown;   // Subract this from err_term after drawing longer run
+
+       GouraudColor gcol;
+       gcol.r = r0 << GPU_GOURAUD_FIXED_BITS;
+       gcol.g = g0 << GPU_GOURAUD_FIXED_BITS;
+       gcol.b = b0 << GPU_GOURAUD_FIXED_BITS;
+
+       // We use u8 pointers even though PS1 has u16 framebuffer.
+       //  This allows pixel-drawing functions to increment dst pointer
+       //  directly by the passed 'incr' value, not having to shift it first.
+       u8 *dst = (u8*)gpu_senquack.vram + y0 * dst_stride + x0 * dst_depth;
+
+       // SPECIAL CASE: Vertical line
+       if (dx == 0) {
+#ifdef USE_LINES_ALL_FIXED_PT_MATH
+               // Get dy fixed-point inverse
+               s32 inv_factor = 1 << GPU_GOURAUD_FIXED_BITS;
+               if (dy > 1) inv_factor = GPU_FAST_DIV(inv_factor, dy);
+
+               // Simultaneously divide and convert integer to Gouraud fixed point:
+               gcol.r_incr = dr * inv_factor;
+               gcol.g_incr = dg * inv_factor;
+               gcol.b_incr = db * inv_factor;
+#else
+               // First, convert to Gouraud fixed point
+               gcol.r_incr = dr << GPU_GOURAUD_FIXED_BITS;
+               gcol.g_incr = dg << GPU_GOURAUD_FIXED_BITS;
+               gcol.b_incr = db << GPU_GOURAUD_FIXED_BITS;
+
+               if (dy > 1) {
+                       if (dr) gcol.r_incr /= dy;
+                       if (dg) gcol.g_incr /= dy;
+                       if (db) gcol.b_incr /= dy;
+               }
+#endif
+               
+               gpuPixelSpanDriver(dst, (uintptr_t)&gcol, dst_stride, dy+1);
+               return;
+       }
+
+       // SPECIAL CASE: Horizontal line
+       if (dy == 0) {
+#ifdef USE_LINES_ALL_FIXED_PT_MATH
+               // Get dx fixed-point inverse
+               s32 inv_factor = (1 << GPU_GOURAUD_FIXED_BITS);
+               if (dx > 1) inv_factor = GPU_FAST_DIV(inv_factor, dx);
+
+               // Simultaneously divide and convert integer to Gouraud fixed point:
+               gcol.r_incr = dr * inv_factor;
+               gcol.g_incr = dg * inv_factor;
+               gcol.b_incr = db * inv_factor;
+#else
+               gcol.r_incr = dr << GPU_GOURAUD_FIXED_BITS;
+               gcol.g_incr = dg << GPU_GOURAUD_FIXED_BITS;
+               gcol.b_incr = db << GPU_GOURAUD_FIXED_BITS;
+
+               if (dx > 1) {
+                       if (dr) gcol.r_incr /= dx;
+                       if (dg) gcol.g_incr /= dx;
+                       if (db) gcol.b_incr /= dx;
+               }
+#endif
+
+               gpuPixelSpanDriver(dst, (uintptr_t)&gcol, sx * dst_depth, dx+1);
+               return;
+       }
+
+       // SPECIAL CASE: Diagonal line
+       if (dx == dy) {
+#ifdef USE_LINES_ALL_FIXED_PT_MATH
+               // Get dx fixed-point inverse
+               s32 inv_factor = (1 << GPU_GOURAUD_FIXED_BITS);
+               if (dx > 1) inv_factor = GPU_FAST_DIV(inv_factor, dx);
+
+               // Simultaneously divide and convert integer to Gouraud fixed point:
+               gcol.r_incr = dr * inv_factor;
+               gcol.g_incr = dg * inv_factor;
+               gcol.b_incr = db * inv_factor;
+#else
+               // First, convert to Gouraud fixed point
+               gcol.r_incr = dr << GPU_GOURAUD_FIXED_BITS;
+               gcol.g_incr = dg << GPU_GOURAUD_FIXED_BITS;
+               gcol.b_incr = db << GPU_GOURAUD_FIXED_BITS;
+
+               if (dx > 1) {
+                       if (dr) gcol.r_incr /= dx;
+                       if (dg) gcol.g_incr /= dx;
+                       if (db) gcol.b_incr /= dx;
+               }
+#endif
+
+               gpuPixelSpanDriver(dst, (uintptr_t)&gcol, dst_stride + (sx * dst_depth), dy+1);
+               return;
+       }
+
+       int       major, minor;             // Absolute val of major,minor axis delta
+       ptrdiff_t incr_major, incr_minor;   // Ptr increment for each step along axis
+
+       if (dx > dy) {
+               major = dx;
+               minor = dy;
+       } else {
+               major = dy;
+               minor = dx;
+       }
+
+       // Determine if diagonal or horizontal runs
+       if (major < (2 * minor)) {
+               // Diagonal runs, so perform half-octant transformation
+               minor = major - minor;
+
+               // Advance diagonally when drawing runs
+               incr_major = dst_stride + (sx * dst_depth);
+
+               // After drawing each run, correct for over-advance along minor axis
+               if (dx > dy)
+                       incr_minor = -dst_stride;
+               else
+                       incr_minor = -sx * dst_depth;
+       } else {
+               // Horizontal or vertical runs
+               if (dx > dy) {
+                       incr_major = sx * dst_depth;
+                       incr_minor = dst_stride;
+               } else {
+                       incr_major = dst_stride;
+                       incr_minor = sx * dst_depth;
+               }
+       }
+
+#ifdef USE_LINES_ALL_FIXED_PT_MATH
+       s32 major_inv = GPU_FAST_DIV((1 << GPU_GOURAUD_FIXED_BITS), major);
+
+       // Simultaneously divide and convert from integer to Gouraud fixed point:
+       gcol.r_incr = dr * major_inv;
+       gcol.g_incr = dg * major_inv;
+       gcol.b_incr = db * major_inv;
+#else
+       gcol.r_incr = dr ? ((dr << GPU_GOURAUD_FIXED_BITS) / major) : 0;
+       gcol.g_incr = dg ? ((dg << GPU_GOURAUD_FIXED_BITS) / major) : 0;
+       gcol.b_incr = db ? ((db << GPU_GOURAUD_FIXED_BITS) / major) : 0;
+#endif
+
+       if (minor > 1) {
+               // Minimum number of pixels each run
+               min_length = major / minor;
+
+               // Initial error term; reflects an initial step of 0.5 along minor axis
+               err_term = (major % minor) - (minor * 2);
+
+               // Increment err_term this much each step along minor axis; when
+               //  err_term crosses zero, draw longer pixel run.
+               err_adjup = (major % minor) * 2;
+       } else {
+               min_length = major;
+               err_term = 0;
+               err_adjup = 0;
+       }
+
+       // Error term adjustment when err_term turns over; used to factor
+       //  out the major-axis step made at that time
+       err_adjdown = minor * 2;
+
+       // The initial and last runs are partial, because minor axis advances
+       //  only 0.5 for these runs, rather than 1. Each is half a full run,
+       //  plus the initial pixel.
+       start_length = end_length = (min_length / 2) + 1;
+
+       if (min_length & 1) {
+               // If there're an odd number of pixels per run, we have 1 pixel that
+               //  can't be allocated to either the initial or last partial run, so
+               //  we'll add 0.5 to err_term so that this pixel will be handled
+               //  by the normal full-run loop
+               err_term += minor;
+       } else {
+               // If the minimum run length is even and there's no fractional advance,
+               // we have one pixel that could go to either the initial or last
+               // partial run, which we'll arbitrarily allocate to the last run
+               if (err_adjup == 0)
+                       start_length--; // Leave out the extra pixel at the start
+       }
+
+       // First run of pixels
+       dst = gpuPixelSpanDriver(dst, (uintptr_t)&gcol, incr_major, start_length);
+       dst += incr_minor;
+
+       // Middle runs of pixels
+       while (--minor > 0) {
+               int run_length = min_length;
+               err_term += err_adjup;
+
+               // If err_term passed 0, reset it and draw longer run
+               if (err_term > 0) {
+                       err_term -= err_adjdown;
+                       run_length++;
+               }
+
+               dst = gpuPixelSpanDriver(dst, (uintptr_t)&gcol, incr_major, run_length);
+               dst += incr_minor;
+       }
+
+       // Final run of pixels
+       gpuPixelSpanDriver(dst, (uintptr_t)&gcol, incr_major, end_length);
+}
+
+#endif /* __GPU_UNAI_GPU_RASTER_LINE_H__ */
diff --git a/plugins/gpu_senquack/gpu_raster_polygon.h b/plugins/gpu_senquack/gpu_raster_polygon.h
new file mode 100644 (file)
index 0000000..8638ac4
--- /dev/null
@@ -0,0 +1,1453 @@
+/***************************************************************************
+*   Copyright (C) 2010 PCSX4ALL Team                                      *
+*   Copyright (C) 2010 Unai                                               *
+*                                                                         *
+*   This program is free software; you can redistribute it and/or modify  *
+*   it under the terms of the GNU General Public License as published by  *
+*   the Free Software Foundation; either version 2 of the License, or     *
+*   (at your option) any later version.                                   *
+*                                                                         *
+*   This program is distributed in the hope that it will be useful,       *
+*   but WITHOUT ANY WARRANTY; without even the implied warranty of        *
+*   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the         *
+*   GNU General Public License for more details.                          *
+*                                                                         *
+*   You should have received a copy of the GNU General Public License     *
+*   along with this program; if not, write to the                         *
+*   Free Software Foundation, Inc.,                                       *
+*   51 Franklin Street, Fifth Floor, Boston, MA 02111-1307 USA.           *
+***************************************************************************/
+
+#ifndef __GPU_UNAI_GPU_RASTER_POLYGON_H__
+#define __GPU_UNAI_GPU_RASTER_POLYGON_H__
+
+//senquack - NOTE: GPU Unai poly routines have been rewritten/adapted
+// from DrHell routines to fix multiple issues. See README_senquack.txt
+
+///////////////////////////////////////////////////////////////////////////////
+// Shared poly vertex buffer, able to handle 3 or 4-pt polys of any type.
+///////////////////////////////////////////////////////////////////////////////
+
+struct PolyVertex {
+       s32 x, y; // Sign-extended 11-bit X,Y coords
+       union {
+               struct { u8 u, v, pad[2]; } tex; // Texture coords (if used)
+               u32 tex_word;
+       };
+       union {
+               struct { u8 r, g, b, pad; } col; // 24-bit RGB color (if used)
+               u32 col_word;
+       };
+};
+
+// Bit flags naming the optional per-vertex attributes of a poly command.
+enum PolyAttribute {
+       POLYATTR_TEXTURE = 0x1, // bit 0: vertices carry U,V texture coords
+       POLYATTR_GOURAUD = 0x2  // bit 1: vertices carry their own color
+};
+
+// Poly flavours, expressed as combinations of PolyAttribute flags so a
+//  PolyType value can be tested directly against POLYATTR_* bits.
+enum PolyType {
+       POLYTYPE_F  = 0,                                  // flat, untextured
+       POLYTYPE_FT = POLYATTR_TEXTURE,                   // flat, textured
+       POLYTYPE_G  = POLYATTR_GOURAUD,                   // gouraud, untextured
+       POLYTYPE_GT = POLYATTR_TEXTURE | POLYATTR_GOURAUD // gouraud, textured
+};
+
+///////////////////////////////////////////////////////////////////////////////
+// polyInitVertexBuffer()
+// Fills vbuf[] array with data from any type of poly draw-command packet.
+///////////////////////////////////////////////////////////////////////////////
+static void polyInitVertexBuffer(PolyVertex *vbuf, const PtrUnion packet, PolyType ptype, u32 is_quad)
+{
+       bool texturing = ptype & POLYATTR_TEXTURE;
+       bool gouraud   = ptype & POLYATTR_GOURAUD;
+
+       int vert_stride = 1; // Stride of vertices in cmd packet, in 32-bit words
+       if (texturing)
+               vert_stride++;
+       if (gouraud)
+               vert_stride++;
+
+       int num_verts = (is_quad) ? 4 : 3;
+       u32 *ptr;
+
+       // X,Y coords, adjusted by draw offsets
+       s32 x_off = gpu_senquack.DrawingOffset[0];
+       s32 y_off = gpu_senquack.DrawingOffset[1];
+       ptr = &packet.U4[1];
+       for (int i=0;  i < num_verts; ++i, ptr += vert_stride) {
+               s16* coord_ptr = (s16*)ptr;
+               vbuf[i].x = GPU_EXPANDSIGN(coord_ptr[0]) + x_off;
+               vbuf[i].y = GPU_EXPANDSIGN(coord_ptr[1]) + y_off;
+       }
+
+       // U,V texture coords (if applicable)
+       if (texturing) {
+               ptr = &packet.U4[2];
+               for (int i=0;  i < num_verts; ++i, ptr += vert_stride)
+                       vbuf[i].tex_word = *ptr;
+       }
+
+       // Colors (if applicable)
+       if (gouraud) {
+               ptr = &packet.U4[0];
+               for (int i=0;  i < num_verts; ++i, ptr += vert_stride)
+                       vbuf[i].col_word = *ptr;
+       }
+}
+
+///////////////////////////////////////////////////////////////////////////////
+//  Helper functions to determine which vertex in a 2 or 3 vertex array
+//   has the highest/lowest X/Y coordinate.
+//   Note: the comparison logic is such that, given a set of vertices with
+//    identical values for a given coordinate, a different index will be
+//    returned from vertIdxOfLeast..() than a call to vertIdxOfHighest..().
+//    This ensures that, during the vertex-ordering phase of rasterization,
+//    all three vertices remain unique.
+///////////////////////////////////////////////////////////////////////////////
+
+// Index (0 or 1) of the vertex with the smaller X; a tie yields index 0.
+template<typename T>
+static inline int vertIdxOfLeastXCoord2(const T *Tptr)
+{
+       if (Tptr[0].x <= Tptr[1].x)
+               return 0;
+       return 1;
+}
+
+// Index (0..2) of the vertex with the smallest X; among equal minima the
+//  lowest index is returned.
+template<typename T>
+static inline int vertIdxOfLeastXCoord3(const T *Tptr)
+{
+       // Winner of the first two vertices
+       int idx = 1;
+       if (Tptr[0].x <= Tptr[1].x)
+               idx = 0;
+
+       // Vertex 2 only wins when strictly smaller
+       if (Tptr[idx].x <= Tptr[2].x)
+               return idx;
+       return 2;
+}
+
+// Index (0 or 1) of the vertex with the smaller Y; a tie yields index 0.
+template<typename T>
+static inline int vertIdxOfLeastYCoord2(const T *Tptr)
+{
+       if (Tptr[0].y <= Tptr[1].y)
+               return 0;
+       return 1;
+}
+
+// Index (0..2) of the vertex with the smallest Y; among equal minima the
+//  lowest index is returned.
+template<typename T>
+static inline int vertIdxOfLeastYCoord3(const T *Tptr)
+{
+       // Winner of the first two vertices
+       int idx = 1;
+       if (Tptr[0].y <= Tptr[1].y)
+               idx = 0;
+
+       // Vertex 2 only wins when strictly smaller
+       if (Tptr[idx].y <= Tptr[2].y)
+               return idx;
+       return 2;
+}
+
+// Index (0 or 1) of the vertex with the larger X; a tie yields index 1.
+template<typename T>
+static inline int vertIdxOfHighestXCoord2(const T *Tptr)
+{
+       if (Tptr[1].x >= Tptr[0].x)
+               return 1;
+       return 0;
+}
+
+// Index (0..2) of the vertex with the largest X; among equal maxima the
+//  highest index is returned.
+template<typename T>
+static inline int vertIdxOfHighestXCoord3(const T *Tptr)
+{
+       // Winner of the first two vertices
+       int idx = 0;
+       if (Tptr[1].x >= Tptr[0].x)
+               idx = 1;
+
+       // Vertex 2 wins whenever it is at least as large
+       if (Tptr[2].x >= Tptr[idx].x)
+               return 2;
+       return idx;
+}
+
+// Index (0 or 1) of the vertex with the larger Y; a tie yields index 1.
+template<typename T>
+static inline int vertIdxOfHighestYCoord2(const T *Tptr)
+{
+       if (Tptr[1].y >= Tptr[0].y)
+               return 1;
+       return 0;
+}
+
+// Index (0..2) of the vertex with the largest Y; among equal maxima the
+//  highest index is returned.
+template<typename T>
+static inline int vertIdxOfHighestYCoord3(const T *Tptr)
+{
+       // Winner of the first two vertices
+       int idx = 0;
+       if (Tptr[1].y >= Tptr[0].y)
+               idx = 1;
+
+       // Vertex 2 wins whenever it is at least as large
+       if (Tptr[2].y >= Tptr[idx].y)
+               return 2;
+       return idx;
+}
+
+///////////////////////////////////////////////////////////////////////////////
+// polyUseTriangle()
+//  Determines if the specified triangle should be rendered. If so, it
+//  fills the given array of vertex pointers, vert_ptrs, in order of
+//  increasing Y coordinate values, as required by rasterization algorithm.
+//  Parameter 'tri_num' is 0 for first triangle (idx 0,1,2 of vbuf[]),
+//   or 1 for second triangle of a quad (idx 1,2,3 of vbuf[]).
+//  Returns true if triangle should be rendered, false if not.
+///////////////////////////////////////////////////////////////////////////////
+static bool polyUseTriangle(const PolyVertex *vbuf, int tri_num, const PolyVertex **vert_ptrs)
+{
+       // Using verts 0,1,2 or is this the 2nd pass of a quad (verts 1,2,3)?
+       const PolyVertex *tri_ptr = &vbuf[(tri_num == 0) ? 0 : 1];
+
+       // Get indices of highest/lowest X,Y coords within triangle
+       int idx_lowest_x  = vertIdxOfLeastXCoord3(tri_ptr);
+       int idx_highest_x = vertIdxOfHighestXCoord3(tri_ptr);
+       int idx_lowest_y  = vertIdxOfLeastYCoord3(tri_ptr);
+       int idx_highest_y = vertIdxOfHighestYCoord3(tri_ptr);
+
+       // Maximum absolute distance between any two X coordinates is 1023,
+       //  and for Y coordinates is 511 (PS1 hardware limitation)
+       int lowest_x  = tri_ptr[idx_lowest_x].x;
+       int highest_x = tri_ptr[idx_highest_x].x;
+       int lowest_y  = tri_ptr[idx_lowest_y].y;
+       int highest_y = tri_ptr[idx_highest_y].y;
+       if ((highest_x - lowest_x) >= CHKMAX_X ||
+           (highest_y - lowest_y) >= CHKMAX_Y)
+               return false;
+
+       // Determine if triangle is completely outside clipping range
+       int xmin, xmax, ymin, ymax;
+       xmin = gpu_senquack.DrawingArea[0];  xmax = gpu_senquack.DrawingArea[2];
+       ymin = gpu_senquack.DrawingArea[1];  ymax = gpu_senquack.DrawingArea[3];
+       int clipped_lowest_x  = Max2(xmin,lowest_x);
+       int clipped_lowest_y  = Max2(ymin,lowest_y);
+       int clipped_highest_x = Min2(xmax,highest_x);
+       int clipped_highest_y = Min2(ymax,highest_y);
+       if (clipped_lowest_x >= clipped_highest_x ||
+           clipped_lowest_y >= clipped_highest_y)
+               return false;
+
+       // Order vertex ptrs by increasing y value (draw routines need this).
+       // The middle index is deduced by a binary math trick that depends
+       //  on index range always being between 0..2
+       vert_ptrs[0] = tri_ptr + idx_lowest_y;
+       vert_ptrs[1] = tri_ptr + ((idx_lowest_y + idx_highest_y) ^ 3);
+       vert_ptrs[2] = tri_ptr + idx_highest_y;
+       return true;
+}
+
+///////////////////////////////////////////////////////////////////////////////
+//  GPU internal polygon drawing functions
+///////////////////////////////////////////////////////////////////////////////
+
+/*----------------------------------------------------------------------
+gpuDrawPolyF - Flat-shaded, untextured poly
+----------------------------------------------------------------------*/
+// Rasterizes a flat-shaded, untextured triangle or quad.
+//  packet            : draw-command packet; word 0 (packet.U4[0]) supplies
+//                      the fill color, vertices are extracted by
+//                      polyInitVertexBuffer()
+//  gpuPolySpanDriver : inner driver, invoked once per visible scanline span
+//                      as (gpu_senquack, ptr to first pixel, pixel count)
+//  is_quad           : non-zero draws two triangles (vbuf 0,1,2 then 1,2,3),
+//                      zero draws one
+// Each triangle is walked top to bottom in two sections (v0..v1 rows, then
+//  v1..v2 rows) while x3/x4 track the span start/end edges in fixed point.
+void gpuDrawPolyF(const PtrUnion packet, const PP gpuPolySpanDriver, u32 is_quad)
+{
+       // Set up bgr555 color to be used across calls in inner driver
+       gpu_senquack.PixelData = GPU_RGB16(packet.U4[0]);
+
+       PolyVertex vbuf[4];
+       polyInitVertexBuffer(vbuf, packet, POLYTYPE_F, is_quad);
+
+       int total_passes = is_quad ? 2 : 1;
+       int cur_pass = 0;
+       do
+       {
+               // vptrs[] receives the triangle's vertices sorted by increasing Y.
+               // Note: 'continue' inside do/while jumps to the loop condition,
+               //  so cur_pass is still incremented for a rejected triangle.
+               const PolyVertex* vptrs[3];
+               if (polyUseTriangle(vbuf, cur_pass, vptrs) == false)
+                       continue;
+
+               s32 xa, xb, ya, yb;
+               s32 x3, dx3, x4, dx4, dx;
+               s32 x0, x1, x2, y0, y1, y2;
+
+               x0 = vptrs[0]->x;  y0 = vptrs[0]->y;
+               x1 = vptrs[1]->x;  y1 = vptrs[1]->y;
+               x2 = vptrs[2]->x;  y2 = vptrs[2]->y;
+
+               // Only the sign of dx is used below: it decides which triangle
+               //  edges drive x3 (span start) and which drive x4 (span end).
+               ya = y2 - y0;
+               yb = y2 - y1;
+               dx = (x2 - x1) * ya - (x2 - x0) * yb;
+
+               // loop0 == 2: section from y0 to y1.  loop0 == 1: section from y1 to y2.
+               for (int loop0 = 2; loop0; loop0--) {
+                       if (loop0 == 2) {
+                               // First section: both edges start at vertex 0
+                               ya = y0;  yb = y1;
+                               x3 = x4 = i2x(x0);
+                               // In every build variant below, dx3/dx4 become the per-row
+                               //  X step of an edge (delta-x over delta-y in fixed point),
+                               //  or 0 when the edge has no height.
+                               // NOTE(review): FloatInv/xLoDivx/GPU_FAST_DIV are defined
+                               //  elsewhere; presumably equivalent divide helpers - confirm.
+                               if (dx < 0) {
+                                       // x3 follows edge v0->v2, x4 follows edge v0->v1
+#ifdef GPU_UNAI_USE_FLOATMATH
+#ifdef GPU_UNAI_USE_FLOAT_DIV_MULTINV
+                                       dx3 = ((y2 - y0) != 0) ? (fixed)(((x2 - x0) << FIXED_BITS) * FloatInv(y2 - y0)) : 0;
+                                       dx4 = ((y1 - y0) != 0) ? (fixed)(((x1 - x0) << FIXED_BITS) * FloatInv(y1 - y0)) : 0;
+#else
+                                       dx3 = ((y2 - y0) != 0) ? (fixed)(((x2 - x0) << FIXED_BITS) / (float)(y2 - y0)) : 0;
+                                       dx4 = ((y1 - y0) != 0) ? (fixed)(((x1 - x0) << FIXED_BITS) / (float)(y1 - y0)) : 0;
+#endif
+#else  // Integer Division:
+#ifdef GPU_UNAI_USE_INT_DIV_MULTINV
+                                       dx3 = ((y2 - y0) != 0) ? xLoDivx((x2 - x0), (y2 - y0)) : 0;
+                                       dx4 = ((y1 - y0) != 0) ? xLoDivx((x1 - x0), (y1 - y0)) : 0;
+#else
+                                       dx3 = ((y2 - y0) != 0) ? GPU_FAST_DIV((x2 - x0) << FIXED_BITS, (y2 - y0)) : 0;
+                                       dx4 = ((y1 - y0) != 0) ? GPU_FAST_DIV((x1 - x0) << FIXED_BITS, (y1 - y0)) : 0;
+#endif
+#endif
+                               } else {
+                                       // x3 follows edge v0->v1, x4 follows edge v0->v2
+#ifdef GPU_UNAI_USE_FLOATMATH
+#ifdef GPU_UNAI_USE_FLOAT_DIV_MULTINV
+                                       dx3 = ((y1 - y0) != 0) ? (fixed)(((x1 - x0) << FIXED_BITS) * FloatInv(y1 - y0)) : 0;
+                                       dx4 = ((y2 - y0) != 0) ? (fixed)(((x2 - x0) << FIXED_BITS) * FloatInv(y2 - y0)) : 0;
+#else
+                                       dx3 = ((y1 - y0) != 0) ? (fixed)(((x1 - x0) << FIXED_BITS) / (float)(y1 - y0)) : 0;
+                                       dx4 = ((y2 - y0) != 0) ? (fixed)(((x2 - x0) << FIXED_BITS) / (float)(y2 - y0)) : 0;
+#endif
+#else  // Integer Division:
+#ifdef GPU_UNAI_USE_INT_DIV_MULTINV
+                                       dx3 = ((y1 - y0) != 0) ? xLoDivx((x1 - x0), (y1 - y0)) : 0;
+                                       dx4 = ((y2 - y0) != 0) ? xLoDivx((x2 - x0), (y2 - y0)) : 0;
+#else
+                                       dx3 = ((y1 - y0) != 0) ? GPU_FAST_DIV((x1 - x0) << FIXED_BITS, (y1 - y0)) : 0;
+                                       dx4 = ((y2 - y0) != 0) ? GPU_FAST_DIV((x2 - x0) << FIXED_BITS, (y2 - y0)) : 0;
+#endif
+#endif
+                               }
+                       } else {
+                               //senquack - break out of final loop if nothing to be drawn (1st loop
+                               //           must always be taken to setup dx3/dx4)
+                               if (y1 == y2) break;
+
+                               ya = y1;  yb = y2;
+
+                               if (dx < 0) {
+                                       // x3 stays on edge v0->v2: recompute its position at row y1
+                                       //  from vertex 0 using the slope from the first section.
+                                       // x4 restarts at vertex 1 and switches to edge v1->v2.
+                                       x3 = i2x(x0) + (dx3 * (y1 - y0));
+                                       x4 = i2x(x1);
+#ifdef GPU_UNAI_USE_FLOATMATH
+#ifdef GPU_UNAI_USE_FLOAT_DIV_MULTINV
+                                       dx4 = ((y2 - y1) != 0) ? (fixed)(((x2 - x1) << FIXED_BITS) * FloatInv(y2 - y1)) : 0;
+#else
+                                       dx4 = ((y2 - y1) != 0) ? (fixed)(((x2 - x1) << FIXED_BITS) / (float)(y2 - y1)) : 0;
+#endif
+#else  // Integer Division:
+#ifdef GPU_UNAI_USE_INT_DIV_MULTINV
+                                       dx4 = ((y2 - y1) != 0) ? xLoDivx ((x2 - x1), (y2 - y1)) : 0;
+#else
+                                       dx4 = ((y2 - y1) != 0) ? GPU_FAST_DIV((x2 - x1) << FIXED_BITS, (y2 - y1)) : 0;
+#endif
+#endif
+                               } else {
+                                       // Mirror case: x4 stays on edge v0->v2, x3 restarts at
+                                       //  vertex 1 and switches to edge v1->v2.
+                                       x3 = i2x(x1);
+                                       x4 = i2x(x0) + (dx4 * (y1 - y0));
+#ifdef GPU_UNAI_USE_FLOATMATH
+#ifdef GPU_UNAI_USE_FLOAT_DIV_MULTINV
+                                       dx3 = ((y2 - y1) != 0) ? (fixed)(((x2 - x1) << FIXED_BITS) * FloatInv(y2 - y1)) : 0;
+#else
+                                       dx3 = ((y2 - y1) != 0) ? (fixed)(((x2 - x1) << FIXED_BITS) / (float)(y2 - y1)) : 0;
+#endif
+#else  // Integer Division:
+#ifdef GPU_UNAI_USE_INT_DIV_MULTINV
+                                       dx3 = ((y2 - y1) != 0) ? xLoDivx ((x2 - x1), (y2 - y1)) : 0;
+#else
+                                       dx3 = ((y2 - y1) != 0) ? GPU_FAST_DIV((x2 - x1) << FIXED_BITS, (y2 - y1)) : 0;
+#endif
+#endif
+                               }
+                       }
+
+                       s32 xmin, xmax, ymin, ymax;
+                       xmin = gpu_senquack.DrawingArea[0];  xmax = gpu_senquack.DrawingArea[2];
+                       ymin = gpu_senquack.DrawingArea[1];  ymax = gpu_senquack.DrawingArea[3];
+
+                       // Section starts above the drawing area: step both edges
+                       //  down to row ymin before drawing
+                       if ((ymin - ya) > 0) {
+                               x3 += (dx3 * (ymin - ya));
+                               x4 += (dx4 * (ymin - ya));
+                               ya = ymin;
+                       }
+
+                       // Rows drawn are ya .. yb-1, so yb == ymax excludes row ymax
+                       if (yb > ymax) yb = ymax;
+
+                       int loop1 = yb - ya;
+                       if (loop1 <= 0)
+                               continue;
+
+                       // NOTE(review): FRAME_OFFSET(0, ya) is presumably the vram index
+                       //  of the first pixel of row ya - confirm against its definition
+                       u16* PixelBase = &((u16*)gpu_senquack.vram)[FRAME_OFFSET(0, ya)];
+                       // Row-skipping controls: rows with any ilace_mask bit set in ya
+                       //  are skipped. With progressive interlace enabled, rows whose
+                       //  (ya & pi) equals pif are skipped too; when disabled pi is 0
+                       //  and pif is 1, so that second test can never match.
+                       int li=gpu_senquack.ilace_mask;
+                       int pi=(ProgressiveInterlaceEnabled()?(gpu_senquack.ilace_mask+1):0);
+                       int pif=(ProgressiveInterlaceEnabled()?(gpu_senquack.prog_ilace_flag?(gpu_senquack.ilace_mask+1):0):1);
+
+                       // The edge positions and row pointer advance in the for-header,
+                       //  so they stay in step even for rows skipped via 'continue'
+                       for (; loop1; --loop1, ya++, PixelBase += FRAME_WIDTH,
+                                       x3 += dx3, x4 += dx4 )
+                       {
+                               if (ya&li) continue;
+                               if ((ya&pi)==pif) continue;
+
+                               // Convert edges to integer pixel columns, clamp to
+                               //  [xmin, xmax], and draw columns xa .. xb-1 if non-empty
+                               xa = FixedCeilToInt(x3);  xb = FixedCeilToInt(x4);
+                               if ((xmin - xa) > 0) xa = xmin;
+                               if (xb > xmax) xb = xmax;
+                               if ((xb - xa) > 0)
+                                       gpuPolySpanDriver(gpu_senquack, PixelBase + xa, (xb - xa));
+                       }
+               }
+       } while (++cur_pass < total_passes);
+}
+
+/*----------------------------------------------------------------------
+gpuDrawPolyFT - Flat-shaded, textured poly
+----------------------------------------------------------------------*/
+// Rasterizes a flat-shaded, textured triangle or quad.
+//  packet            : draw-command packet; bytes 0..2 (packet.U1[0..2])
+//                      supply the light color, vertices and U,V coords are
+//                      extracted by polyInitVertexBuffer()
+//  gpuPolySpanDriver : inner driver, invoked once per visible scanline span
+//                      as (gpu_senquack, ptr to first pixel, pixel count);
+//                      it receives the span's texture start and step through
+//                      gpu_senquack.u / .v / .u_inc / .v_inc
+//  is_quad           : non-zero draws two triangles (vbuf 0,1,2 then 1,2,3),
+//                      zero draws one
+// Each triangle is walked top to bottom in two sections (v0..v1 rows, then
+//  v1..v2 rows). x3/x4 track the span start/end edges; u3/v3 track the
+//  texture coords along the x3 edge.
+void gpuDrawPolyFT(const PtrUnion packet, const PP gpuPolySpanDriver, u32 is_quad)
+{
+       // r8/g8/b8 used if texture-blending & dithering is applied (24-bit light)
+       gpu_senquack.r8 = packet.U1[0];
+       gpu_senquack.g8 = packet.U1[1];
+       gpu_senquack.b8 = packet.U1[2];
+       // r5/g5/b5 used if just texture-blending is applied (15-bit light)
+       gpu_senquack.r5 = packet.U1[0] >> 3;
+       gpu_senquack.g5 = packet.U1[1] >> 3;
+       gpu_senquack.b5 = packet.U1[2] >> 3;
+
+       PolyVertex vbuf[4];
+       polyInitVertexBuffer(vbuf, packet, POLYTYPE_FT, is_quad);
+
+       int total_passes = is_quad ? 2 : 1;
+       int cur_pass = 0;
+       do
+       {
+               // vptrs[] receives the triangle's vertices sorted by increasing Y.
+               // Note: 'continue' inside do/while jumps to the loop condition,
+               //  so cur_pass is still incremented for a rejected triangle.
+               const PolyVertex* vptrs[3];
+               if (polyUseTriangle(vbuf, cur_pass, vptrs) == false)
+                       continue;
+
+               s32 xa, xb, ya, yb;
+               s32 x3, dx3, x4, dx4, dx;
+               s32 u3, du3, v3, dv3;
+               s32 x0, x1, x2, y0, y1, y2;
+               s32 u0, u1, u2, v0, v1, v2;
+               s32 du4, dv4;
+
+               x0 = vptrs[0]->x;      y0 = vptrs[0]->y;
+               u0 = vptrs[0]->tex.u;  v0 = vptrs[0]->tex.v;
+               x1 = vptrs[1]->x;      y1 = vptrs[1]->y;
+               u1 = vptrs[1]->tex.u;  v1 = vptrs[1]->tex.v;
+               x2 = vptrs[2]->x;      y2 = vptrs[2]->y;
+               u2 = vptrs[2]->tex.u;  v2 = vptrs[2]->tex.v;
+
+               // dx4/du4/dv4 apply the same edge-difference formula to X, U and V.
+               // The quotients du4/dx4 and dv4/dx4 computed below become the
+               //  per-pixel horizontal texture steps. dx keeps the original sign
+               //  of dx4 (used later to choose edges); all three terms are negated
+               //  together when dx4 is negative so the divisor is never negative.
+               ya = y2 - y0;
+               yb = y2 - y1;
+               dx4 = (x2 - x1) * ya - (x2 - x0) * yb;
+               du4 = (u2 - u1) * ya - (u2 - u0) * yb;
+               dv4 = (v2 - v1) * ya - (v2 - v0) * yb;
+               dx = dx4;
+               if (dx4 < 0) {
+                       dx4 = -dx4;
+                       du4 = -du4;
+                       dv4 = -dv4;
+               }
+
+               // Every build variant: du4 = du4/dx4 and dv4 = dv4/dx4 in fixed
+               //  point, or both 0 when dx4 is 0.
+               // NOTE(review): FloatInv/xInv/xInvMulx/GPU_FAST_DIV are defined
+               //  elsewhere; presumably equivalent divide helpers - confirm.
+#ifdef GPU_UNAI_USE_FLOATMATH
+#ifdef GPU_UNAI_USE_FLOAT_DIV_MULTINV
+               if (dx4 != 0) {
+                       float finv = FloatInv(dx4);
+                       du4 = (fixed)((du4 << FIXED_BITS) * finv);
+                       dv4 = (fixed)((dv4 << FIXED_BITS) * finv);
+               } else {
+                       du4 = dv4 = 0;
+               }
+#else
+               if (dx4 != 0) {
+                       float fdiv = dx4;
+                       du4 = (fixed)((du4 << FIXED_BITS) / fdiv);
+                       dv4 = (fixed)((dv4 << FIXED_BITS) / fdiv);
+               } else {
+                       du4 = dv4 = 0;
+               }
+#endif
+#else  // Integer Division:
+#ifdef GPU_UNAI_USE_INT_DIV_MULTINV
+               if (dx4 != 0) {
+                       int iF, iS;
+                       xInv(dx4, iF, iS);
+                       du4 = xInvMulx(du4, iF, iS);
+                       dv4 = xInvMulx(dv4, iF, iS);
+               } else {
+                       du4 = dv4 = 0;
+               }
+#else
+               if (dx4 != 0) {
+                       du4 = GPU_FAST_DIV(du4 << FIXED_BITS, dx4);
+                       dv4 = GPU_FAST_DIV(dv4 << FIXED_BITS, dx4);
+               } else {
+                       du4 = dv4 = 0;
+               }
+#endif
+#endif
+               // Set u,v increments for inner driver
+               gpu_senquack.u_inc = du4;
+               gpu_senquack.v_inc = dv4;
+
+               // From here on dx4 is reused as the per-row X step of the x4 edge;
+               //  du4/dv4 keep holding the per-pixel texture steps.
+
+               //senquack - TODO: why is it always going through 2 iterations when sometimes one would suffice here?
+               //                       (SAME ISSUE ELSEWHERE)
+               // loop0 == 2: section from y0 to y1.  loop0 == 1: section from y1 to y2.
+               for (s32 loop0 = 2; loop0; loop0--) {
+                       if (loop0 == 2) {
+                               // First section: both edges and the texture coords start
+                               //  at vertex 0
+                               ya = y0;  yb = y1;
+                               x3 = x4 = i2x(x0);
+                               u3 = i2x(u0);  v3 = i2x(v0);
+                               if (dx < 0) {
+                                       // x3/u3/v3 follow edge v0->v2, x4 follows edge v0->v1
+#ifdef GPU_UNAI_USE_FLOATMATH
+#ifdef GPU_UNAI_USE_FLOAT_DIV_MULTINV
+                                       if ((y2 - y0) != 0) {
+                                               float finv = FloatInv(y2 - y0);
+                                               dx3 = (fixed)(((x2 - x0) << FIXED_BITS) * finv);
+                                               du3 = (fixed)(((u2 - u0) << FIXED_BITS) * finv);
+                                               dv3 = (fixed)(((v2 - v0) << FIXED_BITS) * finv);
+                                       } else {
+                                               dx3 = du3 = dv3 = 0;
+                                       }
+                                       dx4 = ((y1 - y0) != 0) ? (fixed)(((x1 - x0) << FIXED_BITS) * FloatInv(y1 - y0)) : 0;
+#else
+                                       if ((y2 - y0) != 0) {
+                                               float fdiv = y2 - y0;
+                                               dx3 = (fixed)(((x2 - x0) << FIXED_BITS) / fdiv);
+                                               du3 = (fixed)(((u2 - u0) << FIXED_BITS) / fdiv);
+                                               dv3 = (fixed)(((v2 - v0) << FIXED_BITS) / fdiv);
+                                       } else {
+                                               dx3 = du3 = dv3 = 0;
+                                       }
+                                       dx4 = ((y1 - y0) != 0) ? (fixed)(((x1 - x0) << FIXED_BITS) / (float)(y1 - y0)) : 0;
+#endif
+#else  // Integer Division:
+#ifdef GPU_UNAI_USE_INT_DIV_MULTINV
+                                       if ((y2 - y0) != 0) {
+                                               int iF, iS;
+                                               xInv((y2 - y0), iF, iS);
+                                               dx3 = xInvMulx((x2 - x0), iF, iS);
+                                               du3 = xInvMulx((u2 - u0), iF, iS);
+                                               dv3 = xInvMulx((v2 - v0), iF, iS);
+                                       } else {
+                                               dx3 = du3 = dv3 = 0;
+                                       }
+                                       dx4 = ((y1 - y0) != 0) ? xLoDivx((x1 - x0), (y1 - y0)) : 0;
+#else
+                                       if ((y2 - y0) != 0) {
+                                               dx3 = GPU_FAST_DIV((x2 - x0) << FIXED_BITS, (y2 - y0));
+                                               du3 = GPU_FAST_DIV((u2 - u0) << FIXED_BITS, (y2 - y0));
+                                               dv3 = GPU_FAST_DIV((v2 - v0) << FIXED_BITS, (y2 - y0));
+                                       } else {
+                                               dx3 = du3 = dv3 = 0;
+                                       }
+                                       dx4 = ((y1 - y0) != 0) ? GPU_FAST_DIV((x1 - x0) << FIXED_BITS, (y1 - y0)) : 0;
+#endif
+#endif
+                               } else {
+                                       // x3/u3/v3 follow edge v0->v1, x4 follows edge v0->v2
+#ifdef GPU_UNAI_USE_FLOATMATH
+#ifdef GPU_UNAI_USE_FLOAT_DIV_MULTINV
+                                       if ((y1 - y0) != 0) {
+                                               float finv = FloatInv(y1 - y0);
+                                               dx3 = (fixed)(((x1 - x0) << FIXED_BITS) * finv);
+                                               du3 = (fixed)(((u1 - u0) << FIXED_BITS) * finv);
+                                               dv3 = (fixed)(((v1 - v0) << FIXED_BITS) * finv);
+                                       } else {
+                                               dx3 = du3 = dv3 = 0;
+                                       }
+                                       dx4 = ((y2 - y0) != 0) ? (fixed)(((x2 - x0) << FIXED_BITS) * FloatInv(y2 - y0)) : 0;
+#else
+                                       if ((y1 - y0) != 0) {
+                                               float fdiv = y1 - y0;
+                                               dx3 = (fixed)(((x1 - x0) << FIXED_BITS) / fdiv);
+                                               du3 = (fixed)(((u1 - u0) << FIXED_BITS) / fdiv);
+                                               dv3 = (fixed)(((v1 - v0) << FIXED_BITS) / fdiv);
+                                       } else {
+                                               dx3 = du3 = dv3 = 0;
+                                       }
+                                       dx4 = ((y2 - y0) != 0) ? (fixed)(((x2 - x0) << FIXED_BITS) / (float)(y2 - y0)) : 0;
+#endif
+#else  // Integer Division:
+#ifdef GPU_UNAI_USE_INT_DIV_MULTINV
+                                       if ((y1 - y0) != 0) {
+                                               int iF, iS;
+                                               xInv((y1 - y0), iF, iS);
+                                               dx3 = xInvMulx((x1 - x0), iF, iS);
+                                               du3 = xInvMulx((u1 - u0), iF, iS);
+                                               dv3 = xInvMulx((v1 - v0), iF, iS);
+                                       } else {
+                                               dx3 = du3 = dv3 = 0;
+                                       }
+                                       dx4 = ((y2 - y0) != 0) ? xLoDivx((x2 - x0), (y2 - y0)) : 0;
+#else
+                                       if ((y1 - y0) != 0) {
+                                               dx3 = GPU_FAST_DIV((x1 - x0) << FIXED_BITS, (y1 - y0));
+                                               du3 = GPU_FAST_DIV((u1 - u0) << FIXED_BITS, (y1 - y0));
+                                               dv3 = GPU_FAST_DIV((v1 - v0) << FIXED_BITS, (y1 - y0));
+                                       } else {
+                                               dx3 = du3 = dv3 = 0;
+                                       }
+                                       dx4 = ((y2 - y0) != 0) ? GPU_FAST_DIV((x2 - x0) << FIXED_BITS, (y2 - y0)) : 0;
+#endif
+#endif
+                               }
+                       } else {
+                               //senquack - break out of final loop if nothing to be drawn (1st loop
+                               //           must always be taken to setup dx3/dx4)
+                               if (y1 == y2) break;
+
+                               ya = y1;  yb = y2;
+
+                               if (dx < 0) {
+                                       // x3/u3/v3 stay on edge v0->v2: recompute their values at
+                                       //  row y1 from vertex 0 using the first section's steps.
+                                       // x4 restarts at vertex 1 and switches to edge v1->v2.
+                                       x3 = i2x(x0);
+                                       x4 = i2x(x1);
+                                       u3 = i2x(u0);
+                                       v3 = i2x(v0);
+                                       if ((y1 - y0) != 0) {
+                                               x3 += (dx3 * (y1 - y0));
+                                               u3 += (du3 * (y1 - y0));
+                                               v3 += (dv3 * (y1 - y0));
+                                       }
+#ifdef GPU_UNAI_USE_FLOATMATH
+#ifdef GPU_UNAI_USE_FLOAT_DIV_MULTINV
+                                       dx4 = ((y2 - y1) != 0) ? (fixed)(((x2 - x1) << FIXED_BITS) * FloatInv(y2 - y1)) : 0;
+#else
+                                       dx4 = ((y2 - y1) != 0) ? (fixed)(((x2 - x1) << FIXED_BITS) / (float)(y2 - y1)) : 0;
+#endif
+#else  // Integer Division:
+#ifdef GPU_UNAI_USE_INT_DIV_MULTINV
+                                       dx4 = ((y2 - y1) != 0) ? xLoDivx((x2 - x1), (y2 - y1)) : 0;
+#else
+                                       dx4 = ((y2 - y1) != 0) ? GPU_FAST_DIV((x2 - x1) << FIXED_BITS, (y2 - y1)) : 0;
+#endif
+#endif
+                               } else {
+                                       // Mirror case: x4 stays on edge v0->v2; x3/u3/v3 restart
+                                       //  at vertex 1 and switch to edge v1->v2.
+                                       x3 = i2x(x1);
+                                       x4 = i2x(x0) + (dx4 * (y1 - y0));
+                                       u3 = i2x(u1);
+                                       v3 = i2x(v1);
+#ifdef GPU_UNAI_USE_FLOATMATH
+#ifdef GPU_UNAI_USE_FLOAT_DIV_MULTINV
+                                       if ((y2 - y1) != 0) {
+                                               float finv = FloatInv(y2 - y1);
+                                               dx3 = (fixed)(((x2 - x1) << FIXED_BITS) * finv);
+                                               du3 = (fixed)(((u2 - u1) << FIXED_BITS) * finv);
+                                               dv3 = (fixed)(((v2 - v1) << FIXED_BITS) * finv);
+                                       } else {
+                                               dx3 = du3 = dv3 = 0;
+                                       }
+#else
+                                       if ((y2 - y1) != 0) {
+                                               float fdiv = y2 - y1;
+                                               dx3 = (fixed)(((x2 - x1) << FIXED_BITS) / fdiv);
+                                               du3 = (fixed)(((u2 - u1) << FIXED_BITS) / fdiv);
+                                               dv3 = (fixed)(((v2 - v1) << FIXED_BITS) / fdiv);
+                                       } else {
+                                               dx3 = du3 = dv3 = 0;
+                                       }
+#endif
+#else  // Integer Division:
+#ifdef GPU_UNAI_USE_INT_DIV_MULTINV
+                                       if ((y2 - y1) != 0) {
+                                               int iF, iS;
+                                               xInv((y2 - y1), iF, iS);
+                                               dx3 = xInvMulx((x2 - x1), iF, iS);
+                                               du3 = xInvMulx((u2 - u1), iF, iS);
+                                               dv3 = xInvMulx((v2 - v1), iF, iS);
+                                       } else {
+                                               dx3 = du3 = dv3 = 0;
+                                       }
+#else 
+                                       if ((y2 - y1) != 0) {
+                                               dx3 = GPU_FAST_DIV((x2 - x1) << FIXED_BITS, (y2 - y1));
+                                               du3 = GPU_FAST_DIV((u2 - u1) << FIXED_BITS, (y2 - y1));
+                                               dv3 = GPU_FAST_DIV((v2 - v1) << FIXED_BITS, (y2 - y1));
+                                       } else {
+                                               dx3 = du3 = dv3 = 0;
+                                       }
+#endif
+#endif
+                               }
+                       }
+
+                       s32 xmin, xmax, ymin, ymax;
+                       xmin = gpu_senquack.DrawingArea[0];  xmax = gpu_senquack.DrawingArea[2];
+                       ymin = gpu_senquack.DrawingArea[1];  ymax = gpu_senquack.DrawingArea[3];
+
+                       // Section starts above the drawing area: step the edges and
+                       //  the texture coords down to row ymin before drawing
+                       if ((ymin - ya) > 0) {
+                               x3 += dx3 * (ymin - ya);
+                               x4 += dx4 * (ymin - ya);
+                               u3 += du3 * (ymin - ya);
+                               v3 += dv3 * (ymin - ya);
+                               ya = ymin;
+                       }
+
+                       // Rows drawn are ya .. yb-1, so yb == ymax excludes row ymax
+                       if (yb > ymax) yb = ymax;
+
+                       int loop1 = yb - ya;
+                       if (loop1 <= 0)
+                               continue;
+
+                       // NOTE(review): FRAME_OFFSET(0, ya) is presumably the vram index
+                       //  of the first pixel of row ya - confirm against its definition
+                       u16* PixelBase = &((u16*)gpu_senquack.vram)[FRAME_OFFSET(0, ya)];
+                       // Row-skipping controls: rows with any ilace_mask bit set in ya
+                       //  are skipped. With progressive interlace enabled, rows whose
+                       //  (ya & pi) equals pif are skipped too; when disabled pi is 0
+                       //  and pif is 1, so that second test can never match.
+                       int li=gpu_senquack.ilace_mask;
+                       int pi=(ProgressiveInterlaceEnabled()?(gpu_senquack.ilace_mask+1):0);
+                       int pif=(ProgressiveInterlaceEnabled()?(gpu_senquack.prog_ilace_flag?(gpu_senquack.ilace_mask+1):0):1);
+
+                       // Edge positions, texture coords and row pointer advance in
+                       //  the for-header, so they stay in step even for rows skipped
+                       //  via 'continue'
+                       for (; loop1; --loop1, ++ya, PixelBase += FRAME_WIDTH,
+                                       x3 += dx3, x4 += dx4,
+                                       u3 += du3, v3 += dv3 )
+                       {
+                               if (ya&li) continue;
+                               if ((ya&pi)==pif) continue;
+
+                               u32 u4, v4;
+
+                               xa = FixedCeilToInt(x3);  xb = FixedCeilToInt(x4);
+                               u4 = u3;  v4 = v3;
+
+                               // u3/v3 are the texture coords at the fractional edge
+                               //  position x3; itmp is the distance from there to the
+                               //  integer column xa, so move u4/v4 by that fraction of
+                               //  the per-pixel step
+                               fixed itmp = i2x(xa) - x3;
+                               if (itmp != 0) {
+                                       u4 += (du4 * itmp) >> FIXED_BITS;
+                                       v4 += (dv4 * itmp) >> FIXED_BITS;
+                               }
+
+                               // NOTE(review): fixed_HALF is presumably 0.5 in fixed
+                               //  point (rounding bias for the texel lookup) - confirm
+                               u4 += fixed_HALF;
+                               v4 += fixed_HALF;
+
+                               // Span starts left of the drawing area: skip the hidden
+                               //  pixels and advance the texture coords accordingly
+                               if ((xmin - xa) > 0) {
+                                       u4 += du4 * (xmin - xa);
+                                       v4 += dv4 * (xmin - xa);
+                                       xa = xmin;
+                               }
+
+                               // Set u,v coords for inner driver
+                               gpu_senquack.u = u4;
+                               gpu_senquack.v = v4;
+
+                               // Clamp the span end and draw columns xa .. xb-1 if non-empty
+                               if (xb > xmax) xb = xmax;
+                               if ((xb - xa) > 0)
+                                       gpuPolySpanDriver(gpu_senquack, PixelBase + xa, (xb - xa));
+                       }
+               }
+       } while (++cur_pass < total_passes);
+}
+
+/*----------------------------------------------------------------------
+gpuDrawPolyG - Gouraud-shaded, untextured poly
+
+Parameters:
+  packet            - GPU command packet; handed to polyInitVertexBuffer()
+                      together with POLYTYPE_G to fill the local vertex
+                      buffer.
+  gpuPolySpanDriver - inner span routine; called once per visible span as
+                      gpuPolySpanDriver(gpu_senquack, first_pixel, count).
+  is_quad           - non-zero: two triangle passes; zero: a single pass.
+
+Each pass is rasterized as two vertical sections (rows y0..y1, then rows
+y1..y2). x3 is the span-start edge and carries the r/g/b interpolants,
+x4 is the span-end edge. The sign of the cross product kept in 'dx'
+decides which triangle edges play those two roles. The per-pixel colour
+increments dr4/dg4/db4 are computed once per triangle and handed to the
+span driver through gpu_senquack.gInc; each span's starting colour is
+handed over through gpu_senquack.gCol.
+
+The GPU_UNAI_USE_FLOATMATH / GPU_UNAI_USE_FLOAT_DIV_MULTINV /
+GPU_UNAI_USE_INT_DIV_MULTINV blocks appear to be alternative ways of
+evaluating the same (delta << FIXED_BITS) / divisor quotients; exactly one
+variant is compiled in. Every variant yields 0 when the divisor is 0.
+
+NOTE(review): the code relies on y0 <= y1 <= y2; presumably
+polyUseTriangle() returns the vertices sorted that way - confirm.
+----------------------------------------------------------------------*/
+void gpuDrawPolyG(const PtrUnion packet, const PP gpuPolySpanDriver, u32 is_quad)
+{
+       PolyVertex vbuf[4];
+       polyInitVertexBuffer(vbuf, packet, POLYTYPE_G, is_quad);
+
+       int total_passes = is_quad ? 2 : 1;  // one pass per triangle; a quad takes two
+       int cur_pass = 0;
+       do
+       {
+               const PolyVertex* vptrs[3];
+               if (polyUseTriangle(vbuf, cur_pass, vptrs) == false)
+                       continue;  // skip this pass; the do-while condition is still evaluated
+
+               s32 xa, xb, ya, yb;
+               s32 x3, dx3, x4, dx4, dx;
+               s32 r3, dr3, g3, dg3, b3, db3;
+               s32 x0, x1, x2, y0, y1, y2;
+               s32 r0, r1, r2, g0, g1, g2, b0, b1, b2;
+               s32 dr4, dg4, db4;
+
+               x0 = vptrs[0]->x;      y0 = vptrs[0]->y;
+               r0 = vptrs[0]->col.r;  g0 = vptrs[0]->col.g;  b0 = vptrs[0]->col.b;
+               x1 = vptrs[1]->x;      y1 = vptrs[1]->y;
+               r1 = vptrs[1]->col.r;  g1 = vptrs[1]->col.g;  b1 = vptrs[1]->col.b;
+               x2 = vptrs[2]->x;      y2 = vptrs[2]->y;
+               r2 = vptrs[2]->col.r;  g2 = vptrs[2]->col.g;  b2 = vptrs[2]->col.b;
+
+               ya = y2 - y0;  // ya/yb temporarily hold edge heights, not row numbers
+               yb = y2 - y1;
+               dx4 = (x2 - x1) * ya - (x2 - x0) * yb;  // cross product of two edge vectors; 0 for a zero-area triangle
+               dr4 = (r2 - r1) * ya - (r2 - r0) * yb;
+               dg4 = (g2 - g1) * ya - (g2 - g0) * yb;
+               db4 = (b2 - b1) * ya - (b2 - b0) * yb;
+               dx = dx4;  // keep the signed value; its sign selects the edge roles below
+               if (dx4 < 0) {  // make the divisor non-negative, flipping the numerators to match
+                       dx4 = -dx4;
+                       dr4 = -dr4;
+                       dg4 = -dg4;
+                       db4 = -db4;
+               }
+
+#ifdef GPU_UNAI_USE_FLOATMATH
+#ifdef GPU_UNAI_USE_FLOAT_DIV_MULTINV  // float: multiply by the reciprocal from FloatInv()
+               if (dx4 != 0) {
+                       float finv = FloatInv(dx4);
+                       dr4 = (fixed)((dr4 << FIXED_BITS) * finv);
+                       dg4 = (fixed)((dg4 << FIXED_BITS) * finv);
+                       db4 = (fixed)((db4 << FIXED_BITS) * finv);
+               } else {
+                       dr4 = dg4 = db4 = 0;
+               }
+#else  // float: plain division
+               if (dx4 != 0) {
+                       float fdiv = dx4;
+                       dr4 = (fixed)((dr4 << FIXED_BITS) / fdiv);
+                       dg4 = (fixed)((dg4 << FIXED_BITS) / fdiv);
+                       db4 = (fixed)((db4 << FIXED_BITS) / fdiv);
+               } else {
+                       dr4 = dg4 = db4 = 0;
+               }
+#endif
+#else  // Integer Division:
+#ifdef GPU_UNAI_USE_INT_DIV_MULTINV  // integer: reciprocal via xInv()/xInvMulx()
+               if (dx4 != 0) {
+                       int iF, iS;
+                       xInv(dx4, iF, iS);
+                       dr4 = xInvMulx(dr4, iF, iS);
+                       dg4 = xInvMulx(dg4, iF, iS);
+                       db4 = xInvMulx(db4, iF, iS);
+               } else {
+                       dr4 = dg4 = db4 = 0;
+               }
+#else  // integer: GPU_FAST_DIV
+               if (dx4 != 0) {
+                       dr4 = GPU_FAST_DIV(dr4 << FIXED_BITS, dx4);
+                       dg4 = GPU_FAST_DIV(dg4 << FIXED_BITS, dx4);
+                       db4 = GPU_FAST_DIV(db4 << FIXED_BITS, dx4);
+               } else {
+                       dr4 = dg4 = db4 = 0;
+               }
+#endif
+#endif
+               // Setup packed Gouraud increment for inner driver
+               gpu_senquack.gInc = gpuPackGouraudColInc(dr4, dg4, db4);
+
+               for (s32 loop0 = 2; loop0; loop0--) {  // loop0 == 2: rows y0..y1, loop0 == 1: rows y1..y2
+                       if (loop0 == 2) {
+                               ya = y0;
+                               yb = y1;
+                               x3 = x4 = i2x(x0);
+                               r3 = i2x(r0);
+                               g3 = i2x(g0);
+                               b3 = i2x(b0);
+                               if (dx < 0) {  // start edge (x3 + colours) follows v0->v2, end edge (x4) follows v0->v1
+#ifdef GPU_UNAI_USE_FLOATMATH
+#ifdef GPU_UNAI_USE_FLOAT_DIV_MULTINV
+                                       if ((y2 - y0) != 0) {
+                                               float finv = FloatInv(y2 - y0);
+                                               dx3 = (fixed)(((x2 - x0) << FIXED_BITS) * finv);
+                                               dr3 = (fixed)(((r2 - r0) << FIXED_BITS) * finv);
+                                               dg3 = (fixed)(((g2 - g0) << FIXED_BITS) * finv);
+                                               db3 = (fixed)(((b2 - b0) << FIXED_BITS) * finv);
+                                       } else {
+                                               dx3 = dr3 = dg3 = db3 = 0;
+                                       }
+                                       dx4 = ((y1 - y0) != 0) ? (fixed)(((x1 - x0) << FIXED_BITS) * FloatInv(y1 - y0)) : 0;
+#else
+                                       if ((y2 - y0) != 0) {
+                                               float fdiv = y2 - y0;
+                                               dx3 = (fixed)(((x2 - x0) << FIXED_BITS) / fdiv);
+                                               dr3 = (fixed)(((r2 - r0) << FIXED_BITS) / fdiv);
+                                               dg3 = (fixed)(((g2 - g0) << FIXED_BITS) / fdiv);
+                                               db3 = (fixed)(((b2 - b0) << FIXED_BITS) / fdiv);
+                                       } else {
+                                               dx3 = dr3 = dg3 = db3 = 0;
+                                       }
+                                       dx4 = ((y1 - y0) != 0) ? (fixed)(((x1 - x0) << FIXED_BITS) / (float)(y1 - y0)) : 0;
+#endif
+#else  // Integer Division:
+#ifdef GPU_UNAI_USE_INT_DIV_MULTINV
+                                       if ((y2 - y0) != 0) {
+                                               int iF, iS;
+                                               xInv((y2 - y0), iF, iS);
+                                               dx3 = xInvMulx((x2 - x0), iF, iS);
+                                               dr3 = xInvMulx((r2 - r0), iF, iS);
+                                               dg3 = xInvMulx((g2 - g0), iF, iS);
+                                               db3 = xInvMulx((b2 - b0), iF, iS);
+                                       } else {
+                                               dx3 = dr3 = dg3 = db3 = 0;
+                                       }
+                                       dx4 = ((y1 - y0) != 0) ? xLoDivx((x1 - x0), (y1 - y0)) : 0;
+#else
+                                       if ((y2 - y0) != 0) {
+                                               dx3 = GPU_FAST_DIV((x2 - x0) << FIXED_BITS, (y2 - y0));
+                                               dr3 = GPU_FAST_DIV((r2 - r0) << FIXED_BITS, (y2 - y0));
+                                               dg3 = GPU_FAST_DIV((g2 - g0) << FIXED_BITS, (y2 - y0));
+                                               db3 = GPU_FAST_DIV((b2 - b0) << FIXED_BITS, (y2 - y0));
+                                       } else {
+                                               dx3 = dr3 = dg3 = db3 = 0;
+                                       }
+                                       dx4 = ((y1 - y0) != 0) ? GPU_FAST_DIV((x1 - x0) << FIXED_BITS, (y1 - y0)) : 0;
+#endif
+#endif
+                               } else {  // start edge (x3 + colours) follows v0->v1, end edge (x4) follows v0->v2
+#ifdef GPU_UNAI_USE_FLOATMATH
+#ifdef GPU_UNAI_USE_FLOAT_DIV_MULTINV
+                                       if ((y1 - y0) != 0) {
+                                               float finv = FloatInv(y1 - y0);
+                                               dx3 = (fixed)(((x1 - x0) << FIXED_BITS) * finv);
+                                               dr3 = (fixed)(((r1 - r0) << FIXED_BITS) * finv);
+                                               dg3 = (fixed)(((g1 - g0) << FIXED_BITS) * finv);
+                                               db3 = (fixed)(((b1 - b0) << FIXED_BITS) * finv);
+                                       } else {
+                                               dx3 = dr3 = dg3 = db3 = 0;
+                                       }
+                                       dx4 = ((y2 - y0) != 0) ? (fixed)(((x2 - x0) << FIXED_BITS) * FloatInv(y2 - y0)) : 0;
+#else
+                                       if ((y1 - y0) != 0) {
+                                               float fdiv = y1 - y0;
+                                               dx3 = (fixed)(((x1 - x0) << FIXED_BITS) / fdiv);
+                                               dr3 = (fixed)(((r1 - r0) << FIXED_BITS) / fdiv);
+                                               dg3 = (fixed)(((g1 - g0) << FIXED_BITS) / fdiv);
+                                               db3 = (fixed)(((b1 - b0) << FIXED_BITS) / fdiv);
+                                       } else {
+                                               dx3 = dr3 = dg3 = db3 = 0;
+                                       }
+                                       dx4 = ((y2 - y0) != 0) ? (fixed)(((x2 - x0) << FIXED_BITS) / (float)(y2 - y0)) : 0;
+#endif
+#else  // Integer Division:
+#ifdef GPU_UNAI_USE_INT_DIV_MULTINV
+                                       if ((y1 - y0) != 0) {
+                                               int iF, iS;
+                                               xInv((y1 - y0), iF, iS);
+                                               dx3 = xInvMulx((x1 - x0), iF, iS);
+                                               dr3 = xInvMulx((r1 - r0), iF, iS);
+                                               dg3 = xInvMulx((g1 - g0), iF, iS);
+                                               db3 = xInvMulx((b1 - b0), iF, iS);
+                                       } else {
+                                               dx3 = dr3 = dg3 = db3 = 0;
+                                       }
+                                       dx4 = ((y2 - y0) != 0) ? xLoDivx((x2 - x0), (y2 - y0)) : 0;
+#else
+                                       if ((y1 - y0) != 0) {
+                                               dx3 = GPU_FAST_DIV((x1 - x0) << FIXED_BITS, (y1 - y0));
+                                               dr3 = GPU_FAST_DIV((r1 - r0) << FIXED_BITS, (y1 - y0));
+                                               dg3 = GPU_FAST_DIV((g1 - g0) << FIXED_BITS, (y1 - y0));
+                                               db3 = GPU_FAST_DIV((b1 - b0) << FIXED_BITS, (y1 - y0));
+                                       } else {
+                                               dx3 = dr3 = dg3 = db3 = 0;
+                                       }
+                                       dx4 = ((y2 - y0) != 0) ? GPU_FAST_DIV((x2 - x0) << FIXED_BITS, (y2 - y0)) : 0;
+#endif
+#endif
+                               }
+                       } else {
+                               //senquack - break out of final loop if nothing to be drawn (1st loop
+                               //           must always be taken to setup dx3/dx4)
+                               if (y1 == y2) break;
+
+                               ya = y1;  yb = y2;
+
+                               if (dx < 0) {  // start edge keeps its v0->v2 slope (dx3 etc. reused); end edge restarts at v1 toward v2
+                                       x3 = i2x(x0);  x4 = i2x(x1);
+                                       r3 = i2x(r0);  g3 = i2x(g0);  b3 = i2x(b0);
+
+                                       if ((y1 - y0) != 0) {  // advance start edge and colours from v0 down to row y1
+                                               x3 += (dx3 * (y1 - y0));
+                                               r3 += (dr3 * (y1 - y0));
+                                               g3 += (dg3 * (y1 - y0));
+                                               b3 += (db3 * (y1 - y0));
+                                       }
+
+#ifdef GPU_UNAI_USE_FLOATMATH
+#ifdef GPU_UNAI_USE_FLOAT_DIV_MULTINV
+                                       dx4 = ((y2 - y1) != 0) ? (fixed)(((x2 - x1) << FIXED_BITS) * FloatInv(y2 - y1)) : 0;
+#else
+                                       dx4 = ((y2 - y1) != 0) ? (fixed)(((x2 - x1) << FIXED_BITS) / (float)(y2 - y1)) : 0;
+#endif
+#else  // Integer Division:
+#ifdef GPU_UNAI_USE_INT_DIV_MULTINV
+                                       dx4 = ((y2 - y1) != 0) ? xLoDivx((x2 - x1), (y2 - y1)) : 0;
+#else
+                                       dx4 = ((y2 - y1) != 0) ? GPU_FAST_DIV((x2 - x1) << FIXED_BITS, (y2 - y1)) : 0;
+#endif
+#endif
+                               } else {  // start edge restarts at v1 toward v2; end edge continues along v0->v2
+                                       x3 = i2x(x1);
+                                       x4 = i2x(x0) + (dx4 * (y1 - y0));  // dx4 still holds the v0->v2 slope from the first section
+
+                                       r3 = i2x(r1);  g3 = i2x(g1);  b3 = i2x(b1);
+
+#ifdef GPU_UNAI_USE_FLOATMATH
+#ifdef GPU_UNAI_USE_FLOAT_DIV_MULTINV
+                                       if ((y2 - y1) != 0) {
+                                               float finv = FloatInv(y2 - y1);
+                                               dx3 = (fixed)(((x2 - x1) << FIXED_BITS) * finv);
+                                               dr3 = (fixed)(((r2 - r1) << FIXED_BITS) * finv);
+                                               dg3 = (fixed)(((g2 - g1) << FIXED_BITS) * finv);
+                                               db3 = (fixed)(((b2 - b1) << FIXED_BITS) * finv);
+                                       } else {
+                                               dx3 = dr3 = dg3 = db3 = 0;
+                                       }
+#else
+                                       if ((y2 - y1) != 0) {
+                                               float fdiv = y2 - y1;
+                                               dx3 = (fixed)(((x2 - x1) << FIXED_BITS) / fdiv);
+                                               dr3 = (fixed)(((r2 - r1) << FIXED_BITS) / fdiv);
+                                               dg3 = (fixed)(((g2 - g1) << FIXED_BITS) / fdiv);
+                                               db3 = (fixed)(((b2 - b1) << FIXED_BITS) / fdiv);
+                                       } else {
+                                               dx3 = dr3 = dg3 = db3 = 0;
+                                       }
+#endif
+#else  // Integer Division:
+#ifdef GPU_UNAI_USE_INT_DIV_MULTINV
+                                       if ((y2 - y1) != 0) {
+                                               int iF, iS;
+                                               xInv((y2 - y1), iF, iS);
+                                               dx3 = xInvMulx((x2 - x1), iF, iS);
+                                               dr3 = xInvMulx((r2 - r1), iF, iS);
+                                               dg3 = xInvMulx((g2 - g1), iF, iS);
+                                               db3 = xInvMulx((b2 - b1), iF, iS);
+                                       } else {
+                                               dx3 = dr3 = dg3 = db3 = 0;
+                                       }
+#else
+                                       if ((y2 - y1) != 0) {
+                                               dx3 = GPU_FAST_DIV((x2 - x1) << FIXED_BITS, (y2 - y1));
+                                               dr3 = GPU_FAST_DIV((r2 - r1) << FIXED_BITS, (y2 - y1));
+                                               dg3 = GPU_FAST_DIV((g2 - g1) << FIXED_BITS, (y2 - y1));
+                                               db3 = GPU_FAST_DIV((b2 - b1) << FIXED_BITS, (y2 - y1));
+                                       } else {
+                                               dx3 = dr3 = dg3 = db3 = 0;
+                                       }
+#endif
+#endif
+                               }
+                       }
+
+                       s32 xmin, xmax, ymin, ymax;
+                       xmin = gpu_senquack.DrawingArea[0];  xmax = gpu_senquack.DrawingArea[2];
+                       ymin = gpu_senquack.DrawingArea[1];  ymax = gpu_senquack.DrawingArea[3];
+
+                       if ((ymin - ya) > 0) {  // section starts above ymin: step every interpolant past the clipped rows
+                               x3 += (dx3 * (ymin - ya));
+                               x4 += (dx4 * (ymin - ya));
+                               r3 += (dr3 * (ymin - ya));
+                               g3 += (dg3 * (ymin - ya));
+                               b3 += (db3 * (ymin - ya));
+                               ya = ymin;
+                       }
+
+                       if (yb > ymax) yb = ymax;
+
+                       int loop1 = yb - ya;  // rows left to draw; row yb itself is not drawn
+                       if (loop1 <= 0)
+                               continue;  // nothing visible in this section; move on to the next loop0 iteration
+
+                       u16* PixelBase = &((u16*)gpu_senquack.vram)[FRAME_OFFSET(0, ya)];  // FRAME_OFFSET(0, ya): presumably column 0 of row ya
+                       int li=gpu_senquack.ilace_mask;
+                       int pi=(ProgressiveInterlaceEnabled()?(gpu_senquack.ilace_mask+1):0);  // 0 when progressive interlace is off
+                       int pif=(ProgressiveInterlaceEnabled()?(gpu_senquack.prog_ilace_flag?(gpu_senquack.ilace_mask+1):0):1);  // 1 when off, so (ya&pi)==pif never matches
+
+                       for (; loop1; --loop1, ++ya, PixelBase += FRAME_WIDTH,
+                                       x3 += dx3, x4 += dx4,
+                                       r3 += dr3, g3 += dg3, b3 += db3 )
+                       {
+                               if (ya&li) continue;  // row masked out by ilace_mask; 'continue' still steps the interpolants
+                               if ((ya&pi)==pif) continue;  // row rejected by the progressive-interlace test
+
+                               u32 r4, g4, b4;
+
+                               xa = FixedCeilToInt(x3);
+                               xb = FixedCeilToInt(x4);
+                               r4 = r3;  g4 = g3;  b4 = b3;
+
+                               fixed itmp = i2x(xa) - x3;  // fixed-point gap between the start edge and pixel column xa
+                               if (itmp != 0) {
+                                       r4 += (dr4 * itmp) >> FIXED_BITS;
+                                       g4 += (dg4 * itmp) >> FIXED_BITS;
+                                       b4 += (db4 * itmp) >> FIXED_BITS;
+                               }
+
+                               r4 += fixed_HALF;  // NOTE(review): presumably a rounding bias - confirm
+                               g4 += fixed_HALF;
+                               b4 += fixed_HALF;
+
+                               if ((xmin - xa) > 0) {  // span starts before xmin: step the colours past the clipped pixels
+                                       r4 += (dr4 * (xmin - xa));
+                                       g4 += (dg4 * (xmin - xa));
+                                       b4 += (db4 * (xmin - xa));
+                                       xa = xmin;
+                               }
+
+                               // Setup packed Gouraud color for inner driver
+                               gpu_senquack.gCol = gpuPackGouraudCol(r4, g4, b4);
+
+                               if (xb > xmax) xb = xmax;
+                               if ((xb - xa) > 0)
+                                       gpuPolySpanDriver(gpu_senquack, PixelBase + xa, (xb - xa));
+                       }
+               }
+       } while (++cur_pass < total_passes);
+}
+
+/*----------------------------------------------------------------------
+gpuDrawPolyGT - Gouraud-shaded, textured poly
+----------------------------------------------------------------------*/
+void gpuDrawPolyGT(const PtrUnion packet, const PP gpuPolySpanDriver, u32 is_quad)
+{
+       PolyVertex vbuf[4];
+       polyInitVertexBuffer(vbuf, packet, POLYTYPE_GT, is_quad);
+
+       int total_passes = is_quad ? 2 : 1;
+       int cur_pass = 0;
+       do
+       {
+               const PolyVertex* vptrs[3];
+               if (polyUseTriangle(vbuf, cur_pass, vptrs) == false)
+                       continue;
+
+               s32 xa, xb, ya, yb;
+               s32 x3, dx3, x4, dx4, dx;
+               s32 u3, du3, v3, dv3;
+               s32 r3, dr3, g3, dg3, b3, db3;
+               s32 x0, x1, x2, y0, y1, y2;
+               s32 u0, u1, u2, v0, v1, v2;
+               s32 r0, r1, r2, g0, g1, g2, b0, b1, b2;
+               s32 du4, dv4;
+               s32 dr4, dg4, db4;
+
+               x0 = vptrs[0]->x;      y0 = vptrs[0]->y;
+               u0 = vptrs[0]->tex.u;  v0 = vptrs[0]->tex.v;
+               r0 = vptrs[0]->col.r;  g0 = vptrs[0]->col.g;  b0 = vptrs[0]->col.b;
+               x1 = vptrs[1]->x;      y1 = vptrs[1]->y;
+               u1 = vptrs[1]->tex.u;  v1 = vptrs[1]->tex.v;
+               r1 = vptrs[1]->col.r;  g1 = vptrs[1]->col.g;  b1 = vptrs[1]->col.b;
+               x2 = vptrs[2]->x;      y2 = vptrs[2]->y;
+               u2 = vptrs[2]->tex.u;  v2 = vptrs[2]->tex.v;
+               r2 = vptrs[2]->col.r;  g2 = vptrs[2]->col.g;  b2 = vptrs[2]->col.b;
+
+               ya = y2 - y0;
+               yb = y2 - y1;
+               dx4 = (x2 - x1) * ya - (x2 - x0) * yb;
+               du4 = (u2 - u1) * ya - (u2 - u0) * yb;
+               dv4 = (v2 - v1) * ya - (v2 - v0) * yb;
+               dr4 = (r2 - r1) * ya - (r2 - r0) * yb;
+               dg4 = (g2 - g1) * ya - (g2 - g0) * yb;
+               db4 = (b2 - b1) * ya - (b2 - b0) * yb;
+               dx = dx4;
+               if (dx4 < 0) {
+                       dx4 = -dx4;
+                       du4 = -du4;
+                       dv4 = -dv4;
+                       dr4 = -dr4;
+                       dg4 = -dg4;
+                       db4 = -db4;
+               }
+
+#ifdef GPU_UNAI_USE_FLOATMATH
+#ifdef GPU_UNAI_USE_FLOAT_DIV_MULTINV
+               if (dx4 != 0) {
+                       float finv = FloatInv(dx4);
+                       du4 = (fixed)((du4 << FIXED_BITS) * finv);
+                       dv4 = (fixed)((dv4 << FIXED_BITS) * finv);
+                       dr4 = (fixed)((dr4 << FIXED_BITS) * finv);
+                       dg4 = (fixed)((dg4 << FIXED_BITS) * finv);
+                       db4 = (fixed)((db4 << FIXED_BITS) * finv);
+               } else {
+                       du4 = dv4 = dr4 = dg4 = db4 = 0;
+               }
+#else
+               if (dx4 != 0) {
+                       float fdiv = dx4;
+                       du4 = (fixed)((du4 << FIXED_BITS) / fdiv);
+                       dv4 = (fixed)((dv4 << FIXED_BITS) / fdiv);
+                       dr4 = (fixed)((dr4 << FIXED_BITS) / fdiv);
+                       dg4 = (fixed)((dg4 << FIXED_BITS) / fdiv);
+                       db4 = (fixed)((db4 << FIXED_BITS) / fdiv);
+               } else {
+                       du4 = dv4 = dr4 = dg4 = db4 = 0;
+               }
+#endif
+#else  // Integer Division:
+#ifdef GPU_UNAI_USE_INT_DIV_MULTINV
+               if (dx4 != 0) {
+                       int iF, iS;
+                       xInv(dx4, iF, iS);
+                       du4 = xInvMulx(du4, iF, iS);
+                       dv4 = xInvMulx(dv4, iF, iS);
+                       dr4 = xInvMulx(dr4, iF, iS);
+                       dg4 = xInvMulx(dg4, iF, iS);
+                       db4 = xInvMulx(db4, iF, iS);
+               } else {
+                       du4 = dv4 = dr4 = dg4 = db4 = 0;
+               }
+#else
+               if (dx4 != 0) {
+                       du4 = GPU_FAST_DIV(du4 << FIXED_BITS, dx4);
+                       dv4 = GPU_FAST_DIV(dv4 << FIXED_BITS, dx4);
+                       dr4 = GPU_FAST_DIV(dr4 << FIXED_BITS, dx4);
+                       dg4 = GPU_FAST_DIV(dg4 << FIXED_BITS, dx4);
+                       db4 = GPU_FAST_DIV(db4 << FIXED_BITS, dx4);
+               } else {
+                       du4 = dv4 = dr4 = dg4 = db4 = 0;
+               }
+#endif
+#endif
+               // Set u,v increments and packed Gouraud increment for inner driver
+               gpu_senquack.u_inc = du4;
+               gpu_senquack.v_inc = dv4;
+               gpu_senquack.gInc = gpuPackGouraudColInc(dr4, dg4, db4);
+
+               for (s32 loop0 = 2; loop0; loop0--) {
+                       if (loop0 == 2) {
+                               ya = y0;  yb = y1;
+                               x3 = x4 = i2x(x0);
+                               u3 = i2x(u0);  v3 = i2x(v0);
+                               r3 = i2x(r0);  g3 = i2x(g0);  b3 = i2x(b0);
+                               if (dx < 0) {
+#ifdef GPU_UNAI_USE_FLOATMATH
+#ifdef GPU_UNAI_USE_FLOAT_DIV_MULTINV
+                                       if ((y2 - y0) != 0) {
+                                               float finv = FloatInv(y2 - y0);
+                                               dx3 = (fixed)(((x2 - x0) << FIXED_BITS) * finv);
+                                               du3 = (fixed)(((u2 - u0) << FIXED_BITS) * finv);
+                                               dv3 = (fixed)(((v2 - v0) << FIXED_BITS) * finv);
+                                               dr3 = (fixed)(((r2 - r0) << FIXED_BITS) * finv);
+                                               dg3 = (fixed)(((g2 - g0) << FIXED_BITS) * finv);
+                                               db3 = (fixed)(((b2 - b0) << FIXED_BITS) * finv);
+                                       } else {
+                                               dx3 = du3 = dv3 = dr3 = dg3 = db3 = 0;
+                                       }
+                                       dx4 = ((y1 - y0) != 0) ? (fixed)(((x1 - x0) << FIXED_BITS) * FloatInv(y1 - y0)) : 0;
+#else
+                                       if ((y2 - y0) != 0) {
+                                               float fdiv = y2 - y0;
+                                               dx3 = (fixed)(((x2 - x0) << FIXED_BITS) / fdiv);
+                                               du3 = (fixed)(((u2 - u0) << FIXED_BITS) / fdiv);
+                                               dv3 = (fixed)(((v2 - v0) << FIXED_BITS) / fdiv);
+                                               dr3 = (fixed)(((r2 - r0) << FIXED_BITS) / fdiv);
+                                               dg3 = (fixed)(((g2 - g0) << FIXED_BITS) / fdiv);
+                                               db3 = (fixed)(((b2 - b0) << FIXED_BITS) / fdiv);
+                                       } else {
+                                               dx3 = du3 = dv3 = dr3 = dg3 = db3 = 0;
+                                       }
+                                       dx4 = ((y1 - y0) != 0) ? (fixed)(((x1 - x0) << FIXED_BITS) / (float)(y1 - y0)) : 0;
+#endif
+#else  // Integer Division:
+#ifdef GPU_UNAI_USE_INT_DIV_MULTINV
+                                       if ((y2 - y0) != 0) {
+                                               int iF, iS;
+                                               xInv((y2 - y0), iF, iS);
+                                               dx3 = xInvMulx((x2 - x0), iF, iS);
+                                               du3 = xInvMulx((u2 - u0), iF, iS);
+                                               dv3 = xInvMulx((v2 - v0), iF, iS);
+                                               dr3 = xInvMulx((r2 - r0), iF, iS);
+                                               dg3 = xInvMulx((g2 - g0), iF, iS);
+                                               db3 = xInvMulx((b2 - b0), iF, iS);
+                                       } else {
+                                               dx3 = du3 = dv3 = dr3 = dg3 = db3 = 0;
+                                       }
+                                       dx4 = ((y1 - y0) != 0) ? xLoDivx((x1 - x0), (y1 - y0)) : 0;
+#else
+                                       if ((y2 - y0) != 0) {
+                                               dx3 = GPU_FAST_DIV((x2 - x0) << FIXED_BITS, (y2 - y0));
+                                               du3 = GPU_FAST_DIV((u2 - u0) << FIXED_BITS, (y2 - y0));
+                                               dv3 = GPU_FAST_DIV((v2 - v0) << FIXED_BITS, (y2 - y0));
+                                               dr3 = GPU_FAST_DIV((r2 - r0) << FIXED_BITS, (y2 - y0));
+                                               dg3 = GPU_FAST_DIV((g2 - g0) << FIXED_BITS, (y2 - y0));
+                                               db3 = GPU_FAST_DIV((b2 - b0) << FIXED_BITS, (y2 - y0));
+                                       } else {
+                                               dx3 = du3 = dv3 = dr3 = dg3 = db3 = 0;
+                                       }
+                                       dx4 = ((y1 - y0) != 0) ? GPU_FAST_DIV((x1 - x0) << FIXED_BITS, (y1 - y0)) : 0;
+#endif
+#endif
+                               } else {
+#ifdef GPU_UNAI_USE_FLOATMATH
+#ifdef GPU_UNAI_USE_FLOAT_DIV_MULTINV
+                                       if ((y1 - y0) != 0) {
+                                               float finv = FloatInv(y1 - y0);
+                                               dx3 = (fixed)(((x1 - x0) << FIXED_BITS) * finv);
+                                               du3 = (fixed)(((u1 - u0) << FIXED_BITS) * finv);
+                                               dv3 = (fixed)(((v1 - v0) << FIXED_BITS) * finv);
+                                               dr3 = (fixed)(((r1 - r0) << FIXED_BITS) * finv);
+                                               dg3 = (fixed)(((g1 - g0) << FIXED_BITS) * finv);
+                                               db3 = (fixed)(((b1 - b0) << FIXED_BITS) * finv);
+                                       } else {
+                                               dx3 = du3 = dv3 = dr3 = dg3 = db3 = 0;
+                                       }
+                                       dx4 = ((y2 - y0) != 0) ? (fixed)(((x2 - x0) << FIXED_BITS) * FloatInv(y2 - y0)) : 0;
+#else
+                                       if ((y1 - y0) != 0) {
+                                               float fdiv = y1 - y0;
+                                               dx3 = (fixed)(((x1 - x0) << FIXED_BITS) / fdiv);
+                                               du3 = (fixed)(((u1 - u0) << FIXED_BITS) / fdiv);
+                                               dv3 = (fixed)(((v1 - v0) << FIXED_BITS) / fdiv);
+                                               dr3 = (fixed)(((r1 - r0) << FIXED_BITS) / fdiv);
+                                               dg3 = (fixed)(((g1 - g0) << FIXED_BITS) / fdiv);
+                                               db3 = (fixed)(((b1 - b0) << FIXED_BITS) / fdiv);
+                                       } else {
+                                               dx3 = du3 = dv3 = dr3 = dg3 = db3 = 0;
+                                       }
+                                       dx4 = ((y2 - y0) != 0) ? (fixed)(((x2 - x0) << FIXED_BITS) / float(y2 - y0)) : 0;
+#endif
+#else  // Integer Division:
+#ifdef GPU_UNAI_USE_INT_DIV_MULTINV
+                                       if ((y1 - y0) != 0) {
+                                               int iF, iS;
+                                               xInv((y1 - y0), iF, iS);
+                                               dx3 = xInvMulx((x1 - x0), iF, iS);
+                                               du3 = xInvMulx((u1 - u0), iF, iS);
+                                               dv3 = xInvMulx((v1 - v0), iF, iS);
+                                               dr3 = xInvMulx((r1 - r0), iF, iS);
+                                               dg3 = xInvMulx((g1 - g0), iF, iS);
+                                               db3 = xInvMulx((b1 - b0), iF, iS);
+                                       } else {
+                                               dx3 = du3 = dv3 = dr3 = dg3 = db3 = 0;
+                                       }
+                                       dx4 = ((y2 - y0) != 0) ? xLoDivx((x2 - x0), (y2 - y0)) : 0;
+#else
+                                       if ((y1 - y0) != 0) {
+                                               dx3 = GPU_FAST_DIV((x1 - x0) << FIXED_BITS, (y1 - y0));
+                                               du3 = GPU_FAST_DIV((u1 - u0) << FIXED_BITS, (y1 - y0));
+                                               dv3 = GPU_FAST_DIV((v1 - v0) << FIXED_BITS, (y1 - y0));
+                                               dr3 = GPU_FAST_DIV((r1 - r0) << FIXED_BITS, (y1 - y0));
+                                               dg3 = GPU_FAST_DIV((g1 - g0) << FIXED_BITS, (y1 - y0));
+                                               db3 = GPU_FAST_DIV((b1 - b0) << FIXED_BITS, (y1 - y0));
+                                       } else {
+                                               dx3 = du3 = dv3 = dr3 = dg3 = db3 = 0;
+                                       }
+                                       dx4 = ((y2 - y0) != 0) ? GPU_FAST_DIV((x2 - x0) << FIXED_BITS, (y2 - y0)) : 0;
+#endif
+#endif
+                               }
+                       } else {
+                               //senquack - break out of final loop if nothing to be drawn (1st loop
+                               //           must always be taken to setup dx3/dx4)
+                               if (y1 == y2) break;
+
+                               ya = y1;  yb = y2;
+
+                               if (dx < 0) {
+                                       x3 = i2x(x0);  x4 = i2x(x1);
+                                       u3 = i2x(u0);  v3 = i2x(v0);
+                                       r3 = i2x(r0);  g3 = i2x(g0);  b3 = i2x(b0);
+
+                                       if ((y1 - y0) != 0) {
+                                               x3 += (dx3 * (y1 - y0));
+                                               u3 += (du3 * (y1 - y0));
+                                               v3 += (dv3 * (y1 - y0));
+                                               r3 += (dr3 * (y1 - y0));
+                                               g3 += (dg3 * (y1 - y0));
+                                               b3 += (db3 * (y1 - y0));
+                                       }
+
+#ifdef GPU_UNAI_USE_FLOATMATH
+#ifdef GPU_UNAI_USE_FLOAT_DIV_MULTINV
+                                       dx4 = ((y2 - y1) != 0) ? (fixed)(((x2 - x1) << FIXED_BITS) * FloatInv(y2 - y1)) : 0;
+#else
+                                       dx4 = ((y2 - y1) != 0) ? (fixed)(((x2 - x1) << FIXED_BITS) / (float)(y2 - y1)) : 0;
+#endif
+#else  // Integer Division:
+#ifdef GPU_UNAI_USE_INT_DIV_MULTINV
+                                       dx4 = ((y2 - y1) != 0) ? xLoDivx((x2 - x1), (y2 - y1)) : 0;
+#else
+                                       dx4 = ((y2 - y1) != 0) ? GPU_FAST_DIV((x2 - x1) << FIXED_BITS, (y2 - y1)) : 0;
+#endif
+#endif
+                               } else {
+                                       x3 = i2x(x1);
+                                       x4 = i2x(x0) + (dx4 * (y1 - y0));
+
+                                       u3 = i2x(u1);  v3 = i2x(v1);
+                                       r3 = i2x(r1);  g3 = i2x(g1);  b3 = i2x(b1);
+#ifdef GPU_UNAI_USE_FLOATMATH
+#ifdef GPU_UNAI_USE_FLOAT_DIV_MULTINV
+                                       if ((y2 - y1) != 0) {
+                                               float finv = FloatInv(y2 - y1);
+                                               dx3 = (fixed)(((x2 - x1) << FIXED_BITS) * finv);
+                                               du3 = (fixed)(((u2 - u1) << FIXED_BITS) * finv);
+                                               dv3 = (fixed)(((v2 - v1) << FIXED_BITS) * finv);
+                                               dr3 = (fixed)(((r2 - r1) << FIXED_BITS) * finv);
+                                               dg3 = (fixed)(((g2 - g1) << FIXED_BITS) * finv);
+                                               db3 = (fixed)(((b2 - b1) << FIXED_BITS) * finv);
+                                       } else {
+                                               dx3 = du3 = dv3 = dr3 = dg3 = db3 = 0;
+                                       }
+#else
+                                       if ((y2 - y1) != 0) {
+                                               float fdiv = y2 - y1;
+                                               dx3 = (fixed)(((x2 - x1) << FIXED_BITS) / fdiv);
+                                               du3 = (fixed)(((u2 - u1) << FIXED_BITS) / fdiv);
+                                               dv3 = (fixed)(((v2 - v1) << FIXED_BITS) / fdiv);
+                                               dr3 = (fixed)(((r2 - r1) << FIXED_BITS) / fdiv);
+                                               dg3 = (fixed)(((g2 - g1) << FIXED_BITS) / fdiv);
+                                               db3 = (fixed)(((b2 - b1) << FIXED_BITS) / fdiv);
+                                       } else {
+                                               dx3 = du3 = dv3 = dr3 = dg3 = db3 = 0;
+                                       }
+#endif
+#else  // Integer Division:
+#ifdef GPU_UNAI_USE_INT_DIV_MULTINV
+                                       if ((y2 - y1) != 0) {
+                                               int iF, iS;
+                                               xInv((y2 - y1), iF, iS);
+                                               dx3 = xInvMulx((x2 - x1), iF, iS);
+                                               du3 = xInvMulx((u2 - u1), iF, iS);
+                                               dv3 = xInvMulx((v2 - v1), iF, iS);
+                                               dr3 = xInvMulx((r2 - r1), iF, iS);
+                                               dg3 = xInvMulx((g2 - g1), iF, iS);
+                                               db3 = xInvMulx((b2 - b1), iF, iS);
+                                       } else {
+                                               dx3 = du3 = dv3 = dr3 = dg3 = db3 = 0;
+                                       }
+#else
+                                       if ((y2 - y1) != 0) {
+                                               dx3 = GPU_FAST_DIV((x2 - x1) << FIXED_BITS, (y2 - y1));
+                                               du3 = GPU_FAST_DIV((u2 - u1) << FIXED_BITS, (y2 - y1));
+                                               dv3 = GPU_FAST_DIV((v2 - v1) << FIXED_BITS, (y2 - y1));
+                                               dr3 = GPU_FAST_DIV((r2 - r1) << FIXED_BITS, (y2 - y1));
+                                               dg3 = GPU_FAST_DIV((g2 - g1) << FIXED_BITS, (y2 - y1));
+                                               db3 = GPU_FAST_DIV((b2 - b1) << FIXED_BITS, (y2 - y1));
+                                       } else {
+                                               dx3 = du3 = dv3 = dr3 = dg3 = db3 = 0;
+                                       }
+#endif
+#endif
+                               }
+                       }
+
+                       s32 xmin, xmax, ymin, ymax;
+                       xmin = gpu_senquack.DrawingArea[0];  xmax = gpu_senquack.DrawingArea[2];
+                       ymin = gpu_senquack.DrawingArea[1];  ymax = gpu_senquack.DrawingArea[3];
+
+                       if ((ymin - ya) > 0) {
+                               x3 += (dx3 * (ymin - ya));
+                               x4 += (dx4 * (ymin - ya));
+                               u3 += (du3 * (ymin - ya));
+                               v3 += (dv3 * (ymin - ya));
+                               r3 += (dr3 * (ymin - ya));
+                               g3 += (dg3 * (ymin - ya));
+                               b3 += (db3 * (ymin - ya));
+                               ya = ymin;
+                       }
+
+                       if (yb > ymax) yb = ymax;
+
+                       int loop1 = yb - ya;
+                       if (loop1 <= 0)
+                               continue;
+
+                       u16* PixelBase = &((u16*)gpu_senquack.vram)[FRAME_OFFSET(0, ya)];
+                       int li=gpu_senquack.ilace_mask;
+                       int pi=(ProgressiveInterlaceEnabled()?(gpu_senquack.ilace_mask+1):0);
+                       int pif=(ProgressiveInterlaceEnabled()?(gpu_senquack.prog_ilace_flag?(gpu_senquack.ilace_mask+1):0):1);
+
+                       for (; loop1; --loop1, ++ya, PixelBase += FRAME_WIDTH,
+                                       x3 += dx3, x4 += dx4,
+                                       u3 += du3, v3 += dv3,
+                                       r3 += dr3, g3 += dg3, b3 += db3 )
+                       {
+                               if (ya&li) continue;
+                               if ((ya&pi)==pif) continue;
+
+                               u32 u4, v4;
+                               u32 r4, g4, b4;
+
+                               xa = FixedCeilToInt(x3);
+                               xb = FixedCeilToInt(x4);
+                               u4 = u3;  v4 = v3;
+                               r4 = r3;  g4 = g3;  b4 = b3;
+
+                               fixed itmp = i2x(xa) - x3;
+                               if (itmp != 0) {
+                                       u4 += (du4 * itmp) >> FIXED_BITS;
+                                       v4 += (dv4 * itmp) >> FIXED_BITS;
+                                       r4 += (dr4 * itmp) >> FIXED_BITS;
+                                       g4 += (dg4 * itmp) >> FIXED_BITS;
+                                       b4 += (db4 * itmp) >> FIXED_BITS;
+                               }
+
+                               u4 += fixed_HALF;
+                               v4 += fixed_HALF;
+                               r4 += fixed_HALF;
+                               g4 += fixed_HALF;
+                               b4 += fixed_HALF;
+
+                               if ((xmin - xa) > 0) {
+                                       u4 += du4 * (xmin - xa);
+                                       v4 += dv4 * (xmin - xa);
+                                       r4 += dr4 * (xmin - xa);
+                                       g4 += dg4 * (xmin - xa);
+                                       b4 += db4 * (xmin - xa);
+                                       xa = xmin;
+                               }
+
+                               // Set packed Gouraud color and u,v coords for inner driver
+                               gpu_senquack.u = u4;
+                               gpu_senquack.v = v4;
+                               gpu_senquack.gCol = gpuPackGouraudCol(r4, g4, b4);
+
+                               if (xb > xmax) xb = xmax;
+                               if ((xb - xa) > 0)
+                                       gpuPolySpanDriver(gpu_senquack, PixelBase + xa, (xb - xa));
+                       }
+               }
+       } while (++cur_pass < total_passes);
+}
+
+#endif /* __GPU_UNAI_GPU_RASTER_POLYGON_H__ */
diff --git a/plugins/gpu_senquack/gpu_raster_sprite.h b/plugins/gpu_senquack/gpu_raster_sprite.h
new file mode 100644 (file)
index 0000000..ddbad67
--- /dev/null
@@ -0,0 +1,170 @@
+/***************************************************************************
+*   Copyright (C) 2010 PCSX4ALL Team                                      *
+*   Copyright (C) 2010 Unai                                               *
+*                                                                         *
+*   This program is free software; you can redistribute it and/or modify  *
+*   it under the terms of the GNU General Public License as published by  *
+*   the Free Software Foundation; either version 2 of the License, or     *
+*   (at your option) any later version.                                   *
+*                                                                         *
+*   This program is distributed in the hope that it will be useful,       *
+*   but WITHOUT ANY WARRANTY; without even the implied warranty of        *
+*   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the         *
+*   GNU General Public License for more details.                          *
+*                                                                         *
+*   You should have received a copy of the GNU General Public License     *
+*   along with this program; if not, write to the                         *
+*   Free Software Foundation, Inc.,                                       *
+*   51 Franklin Street, Fifth Floor, Boston, MA 02111-1307 USA.           *
+***************************************************************************/
+
+#ifndef __GPU_UNAI_GPU_RASTER_SPRITE_H__
+#define __GPU_UNAI_GPU_RASTER_SPRITE_H__
+
+///////////////////////////////////////////////////////////////////////////////
+//  GPU internal sprite drawing functions
+
+void gpuDrawS(PtrUnion packet, const PS gpuSpriteSpanDriver)  // Draw a textured sprite, clipped to the drawing area, one span per row
+{
+       s32 x0, x1, y0, y1;
+       u32 u0, v0;
+
+       //NOTE: Must 11-bit sign-extend the whole sum here, not just packet X/Y,
+       // or sprites in 1st level of SkullMonkeys disappear when walking right.
+       // This now matches behavior of Mednafen and PCSX Rearmed's gpu_neon:
+       x0 = GPU_EXPANDSIGN(packet.S2[2] + gpu_senquack.DrawingOffset[0]);
+       y0 = GPU_EXPANDSIGN(packet.S2[3] + gpu_senquack.DrawingOffset[1]);
+
+       u32 w = packet.U2[6] & 0x3ff; // Max width is 1023
+       u32 h = packet.U2[7] & 0x1ff; // Max height is 511
+       x1 = x0 + w;
+       y1 = y0 + h;
+
+       s32 xmin, xmax, ymin, ymax;
+       xmin = gpu_senquack.DrawingArea[0];     xmax = gpu_senquack.DrawingArea[2];
+       ymin = gpu_senquack.DrawingArea[1];     ymax = gpu_senquack.DrawingArea[3];
+
+       u0 = packet.U1[8];  // Texture start column from the packet
+       v0 = packet.U1[9];  // Texture start row from the packet
+
+       s32 temp;
+       temp = ymin - y0;
+       if (temp > 0) { y0 = ymin; v0 += temp; }  // Clip top edge and skip the same number of texture rows
+       if (y1 > ymax) y1 = ymax;
+       if (y1 <= y0) return;  // Nothing left vertically
+
+       temp = xmin - x0;
+       if (temp > 0) { x0 = xmin; u0 += temp; }  // Clip left edge and skip the same number of texels
+       if (x1 > xmax) x1 = xmax;
+       x1 -= x0;  // x1 now holds the clipped span width in pixels
+       if (x1 <= 0) return;  // Nothing left horizontally
+
+       gpu_senquack.r5 = packet.U1[0] >> 3;  // 8-bit packet color bytes reduced to 5 bits each
+       gpu_senquack.g5 = packet.U1[1] >> 3;
+       gpu_senquack.b5 = packet.U1[2] >> 3;
+
+       u16 *Pixel = &((u16*)gpu_senquack.vram)[FRAME_OFFSET(x0, y0)];
+       const int li=gpu_senquack.ilace_mask;
+       const int pi=(ProgressiveInterlaceEnabled()?(gpu_senquack.ilace_mask+1):0);
+       const int pif=(ProgressiveInterlaceEnabled()?(gpu_senquack.prog_ilace_flag?(gpu_senquack.ilace_mask+1):0):1);  // With pi==0 and pif==1 the (y0&pi)!=pif test below always passes
+       unsigned int tmode = gpu_senquack.TEXT_MODE >> 5;
+       const u32 v0_mask = gpu_senquack.TextureWindow[3];  // Texture window mask Y
+       u8* pTxt_base = (u8*)gpu_senquack.TBA;
+
+       // Texture is accessed byte-wise, so adjust idx if 16bpp
+       if (tmode == 3) u0 <<= 1;
+
+       for (; y0<y1; ++y0) {
+               u8* pTxt = pTxt_base + ((v0 & v0_mask) * 2048);  // 2048 matches FRAME_BYTE_STRIDE (bytes per VRAM row)
+               if (!(y0&li) && (y0&pi)!=pif)  // Only draw rows not excluded by the line-skip / progressive-interlace masks
+                       gpuSpriteSpanDriver(Pixel, x1, pTxt, u0);
+               Pixel += FRAME_WIDTH;  // Destination and texture row advance even when the row is skipped
+               v0++;
+       }
+}
+
+#ifdef __arm__
+#include "gpu_arm.h"
+
+/* Notaz 4bit sprites optimization: fast path for 16x16 sprites via draw_spr16_full() */
+void gpuDrawS16(PtrUnion packet)
+{
+       s32 x0, y0;
+       s32 u0, v0;
+       s32 xmin, xmax;
+       s32 ymin, ymax;
+       u32 h = 16;  // Rows to draw; reduced below when clipped vertically
+
+       //NOTE: Must 11-bit sign-extend the whole sum here, not just packet X/Y,
+       // or sprites in 1st level of SkullMonkeys disappear when walking right.
+       // This now matches behavior of Mednafen and PCSX Rearmed's gpu_neon:
+       x0 = GPU_EXPANDSIGN(packet.S2[2] + gpu_senquack.DrawingOffset[0]);
+       y0 = GPU_EXPANDSIGN(packet.S2[3] + gpu_senquack.DrawingOffset[1]);
+
+       xmin = gpu_senquack.DrawingArea[0];     xmax = gpu_senquack.DrawingArea[2];
+       ymin = gpu_senquack.DrawingArea[1];     ymax = gpu_senquack.DrawingArea[3];
+       u0 = packet.U1[8];
+       v0 = packet.U1[9];
+
+       if (x0 > xmax - 16 || x0 < xmin ||
+           ((u0 | v0) & 15) || !(gpu_senquack.TextureWindow[2] & gpu_senquack.TextureWindow[3] & 8)) {
+               // Fall back to the general handler when the sprite needs horizontal clipping,
+               packet.U4[3] = 0x00100010;  // u/v are not multiples of 16, or a texture window mask lacks bit 3; this writes w=16,h=16 for gpuDrawS
+               gpuDrawS(packet, gpuSpriteSpanFn<0x20>);
+               return;
+       }
+
+       if (y0 >= ymax || y0 <= ymin - 16)  // Entirely outside the drawing area vertically
+               return;
+       if (y0 < ymin) {  // Clip top: drop rows and advance the texture row to match
+               h -= ymin - y0;
+               v0 += ymin - y0;
+               y0 = ymin;
+       }
+       else if (ymax - y0 < 16)  // Clip bottom
+               h = ymax - y0;
+
+       draw_spr16_full(&gpu_senquack.vram[FRAME_OFFSET(x0, y0)], &gpu_senquack.TBA[FRAME_OFFSET(u0/4, v0)], gpu_senquack.CBA, h);  // NOTE(review): unlike gpuDrawS, no ilace_mask row skipping here - confirm intended
+}
+#endif // __arm__
+
+void gpuDrawT(PtrUnion packet, const PT gpuTileSpanDriver)  // Draw a flat-colored rectangle (tile), clipped to the drawing area
+{
+       s32 x0, x1, y0, y1;
+
+       // 11-bit sign-extend the whole sum (matches Mednafen and PCSX Rearmed's gpu_neon):
+       x0 = GPU_EXPANDSIGN(packet.S2[2] + gpu_senquack.DrawingOffset[0]);
+       y0 = GPU_EXPANDSIGN(packet.S2[3] + gpu_senquack.DrawingOffset[1]);
+
+       u32 w = packet.U2[4] & 0x3ff; // Max width is 1023
+       u32 h = packet.U2[5] & 0x1ff; // Max height is 511
+       x1 = x0 + w;
+       y1 = y0 + h;
+
+       s32 xmin, xmax, ymin, ymax;
+       xmin = gpu_senquack.DrawingArea[0];     xmax = gpu_senquack.DrawingArea[2];
+       ymin = gpu_senquack.DrawingArea[1];     ymax = gpu_senquack.DrawingArea[3];
+
+       if (y0 < ymin) y0 = ymin;
+       if (y1 > ymax) y1 = ymax;
+       if (y1 <= y0) return;  // Nothing left vertically
+
+       if (x0 < xmin) x0 = xmin;
+       if (x1 > xmax) x1 = xmax;
+       x1 -= x0;  // x1 now holds the clipped span width in pixels
+       if (x1 <= 0) return;  // Nothing left horizontally
+
+       const u16 Data = GPU_RGB16(packet.U4[0]);  // Fill color converted from the command word's 24-bit RGB
+       u16 *Pixel = &((u16*)gpu_senquack.vram)[FRAME_OFFSET(x0, y0)];
+       const int li=gpu_senquack.ilace_mask;
+       const int pi=(ProgressiveInterlaceEnabled()?(gpu_senquack.ilace_mask+1):0);
+       const int pif=(ProgressiveInterlaceEnabled()?(gpu_senquack.prog_ilace_flag?(gpu_senquack.ilace_mask+1):0):1);  // With pi==0 and pif==1 the (y0&pi)!=pif test below always passes
+
+       for (; y0<y1; ++y0) {
+               if (!(y0&li) && (y0&pi)!=pif)  // Only draw rows not excluded by the line-skip / progressive-interlace masks
+                       gpuTileSpanDriver(Pixel,x1,Data);
+               Pixel += FRAME_WIDTH;  // Advance one VRAM row even when the row is skipped
+       }
+}
+
+#endif /* __GPU_UNAI_GPU_RASTER_SPRITE_H__ */
diff --git a/plugins/gpu_senquack/gpu_senquack.h b/plugins/gpu_senquack/gpu_senquack.h
new file mode 100644 (file)
index 0000000..efbdd4c
--- /dev/null
@@ -0,0 +1,316 @@
+/***************************************************************************
+*   Copyright (C) 2010 PCSX4ALL Team                                      *
+*   Copyright (C) 2010 Unai                                               *
+*   Copyright (C) 2016 Senquack (dansilsby <AT> gmail <DOT> com)          *
+*                                                                         *
+*   This program is free software; you can redistribute it and/or modify  *
+*   it under the terms of the GNU General Public License as published by  *
+*   the Free Software Foundation; either version 2 of the License, or     *
+*   (at your option) any later version.                                   *
+*                                                                         *
+*   This program is distributed in the hope that it will be useful,       *
+*   but WITHOUT ANY WARRANTY; without even the implied warranty of        *
+*   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the         *
+*   GNU General Public License for more details.                          *
+*                                                                         *
+*   You should have received a copy of the GNU General Public License     *
+*   along with this program; if not, write to the                         *
+*   Free Software Foundation, Inc.,                                       *
+*   51 Franklin Street, Fifth Floor, Boston, MA 02111-1307 USA.           *
+***************************************************************************/
+
+#ifndef GPU_UNAI_H
+#define GPU_UNAI_H
+
+#include "gpu.h"
+
+// Header shared between both standalone gpu_senquack (gpu.cpp) and new
+// gpulib-compatible gpu_senquack (gpulib_if.cpp)
+// -> Anything here should be for gpu_senquack's private use. <-
+
+///////////////////////////////////////////////////////////////////////////////
+//  Compile Options
+
+//#define ENABLE_GPU_NULL_SUPPORT   // Enables NullGPU support
+//#define ENABLE_GPU_LOG_SUPPORT    // Enables GPU logger (very slow; only for Windows debugging)
+//#define ENABLE_GPU_ARMV7                     // Enables ARMv7 optimized assembly
+
+//Poly routine options (default is integer math and accurate division)
+//#define GPU_UNAI_USE_FLOATMATH         // Use float math in poly routines
+//#define GPU_UNAI_USE_FLOAT_DIV_MULTINV // If GPU_UNAI_USE_FLOATMATH is defined,
+                                         //  use multiply-by-inverse for division
+//#define GPU_UNAI_USE_INT_DIV_MULTINV   // If GPU_UNAI_USE_FLOATMATH is *not*
+                                         //  defined, use old inaccurate division
+
+
+#define GPU_INLINE static inline __attribute__((always_inline))  // File-local function, always inlined
+#define INLINE     static inline __attribute__((always_inline))  // Same expansion as GPU_INLINE
+
+#define u8  uint8_t  // Short fixed-width integer aliases (macros, not typedefs)
+#define s8  int8_t
+#define u16 uint16_t
+#define s16 int16_t
+#define u32 uint32_t
+#define s32 int32_t
+#define s64 int64_t
+
+union PtrUnion  // One pointer value viewable as differently-typed element pointers
+{
+       u32  *U4;  // 32-bit unsigned elements
+       s32  *S4;  // 32-bit signed elements
+       u16  *U2;  // 16-bit unsigned elements
+       s16  *S2;  // 16-bit signed elements
+       u8   *U1;  // 8-bit unsigned elements
+       s8   *S1;  // 8-bit signed elements
+       void *ptr; // Untyped view
+};
+
+union GPUPacket  // 64 bytes of packet storage, viewable at 32/16/8-bit granularity
+{
+       u32 U4[16];  // 16 x 32-bit unsigned
+       s32 S4[16];  // 16 x 32-bit signed
+       u16 U2[32];  // 32 x 16-bit unsigned
+       s16 S2[32];  // 32 x 16-bit signed
+       u8  U1[64];  // 64 x 8-bit unsigned
+       s8  S1[64];  // 64 x 8-bit signed
+};
+
+template<class T> static inline void SwapValues(T &x, T &y)  // Exchange x and y in place
+{
+       T tmp(x);  x = y;  y = tmp;  // Copy-constructs one temporary of type T
+}
+
+template<typename T>  // Returns the smaller of a and b (b when they compare equal)
+static inline T Min2 (const T a, const T b)
+{
+       return (a<b)?a:b;
+}
+
+template<typename T>  // Returns the smallest of a, b and c
+static inline T Min3 (const T a, const T b, const T c)
+{
+       return  Min2(Min2(a,b),c);
+}
+
+template<typename T>  // Returns the larger of a and b (b when they compare equal)
+static inline T Max2 (const T a, const T b)
+{
+       return  (a>b)?a:b;
+}
+
+template<typename T>  // Returns the largest of a, b and c
+static inline T Max3 (const T a, const T b, const T c)
+{
+       return  Max2(Max2(a,b),c);
+}
+
+
+///////////////////////////////////////////////////////////////////////////////
+//  GPU Raster Macros
+
+// Convert 24bpp color parameter of GPU command to 16bpp (15 color bits; mask bit 15 is left clear)
+#define        GPU_RGB16(rgb) ((((rgb)&0xF80000)>>9)|(((rgb)&0xF800)>>6)|(((rgb)&0xF8)>>3))
+
+// Sign-extend 11-bit coordinate command param (left shift done as unsigned: no UB on negative x)
+#define GPU_EXPANDSIGN(x) ((s32)((u32)(x)<<(32-11))>>(32-11))
+
+// Max difference between any two X or Y primitive coordinates
+#define CHKMAX_X 1024
+#define CHKMAX_Y 512
+
+#define        FRAME_BUFFER_SIZE       (1024*512*2)  // Bytes: FRAME_WIDTH * FRAME_HEIGHT * FRAME_BYTES_PER_PIXEL
+#define        FRAME_WIDTH                       1024  // Pixels per VRAM row
+#define        FRAME_HEIGHT              512  // VRAM rows
+#define        FRAME_OFFSET(x,y)       (((y)<<10)+(x))  // Pixel index of (x,y): y * 1024 + x
+#define FRAME_BYTE_STRIDE     2048  // Bytes per VRAM row: FRAME_WIDTH * FRAME_BYTES_PER_PIXEL
+#define FRAME_BYTES_PER_PIXEL 2
+
+static inline s32 GPU_DIV(s32 rs, s32 rt)  // Returns rs / rt, or 0 when rt is 0
+{
+       return rt ? (rs / rt) : (0);  // NOTE(review): rs == INT32_MIN with rt == -1 is not guarded
+}
+
+// 'Unsafe' version of GPU_DIV: no div-by-zero check, so the caller must ensure rt != 0
+#define GPU_FAST_DIV(rs, rt) ((signed)(rs) / (signed)(rt))
+
+struct gpu_senquack_t {  // All plugin state: GPU registers, VRAM pointer, draw settings, inner-loop params
+       u32 GPU_GP1;
+       GPUPacket PacketBuffer;
+       u16 *vram;  // Base of VRAM; the raster code indexes it as 16-bit pixels via FRAME_OFFSET
+
+#ifndef USE_GPULIB  // Members below exist only in the standalone (non-gpulib) build
+       u32  GPU_GP0;
+       u32  tex_window;       // Current texture window vals (set by GP0(E2h) cmd)
+       s32  PacketCount;
+       s32  PacketIndex;
+       bool fb_dirty;         // Framebuffer is dirty (according to GPU)
+
+       //  Display status
+       //  NOTE: Standalone older gpu_senquack didn't care about horiz display range
+       u16  DisplayArea[6];   // [0] : Start of display area (in VRAM) X
+                              // [1] : Start of display area (in VRAM) Y
+                              // [2] : Display mode resolution HORIZONTAL
+                              // [3] : Display mode resolution VERTICAL
+                              // [4] : Vertical display range (on TV) START
+                              // [5] : Vertical display range (on TV) END
+
+       ////////////////////////////////////////////////////////////////////////////
+       //  DMA transfer info
+       struct {
+               s32  px,py;
+               s32  x_end,y_end;
+               u16* pvram;
+               u32 *last_dma;     // Last dma pointer
+               bool FrameToRead;  // Load image in progress
+               bool FrameToWrite; // Store image in progress
+       } dma;
+
+       ////////////////////////////////////////////////////////////////////////////
+       //  Frameskip
+       struct {
+               int  skipCount;    // Frame skip (0,1,2,3...)
+               bool isSkip;       // Skip frame (according to GPU)
+               bool skipFrame;    // Skip this frame (according to frame skip)
+               bool wasSkip;      // Skip frame old value (according to GPU)
+               bool skipGPU;      // Skip GPU primitives
+       } frameskip;
+#endif
+       // END of standalone gpu_senquack variables
+       ////////////////////////////////////////////////////////////////////////////
+
+       u32 TextureWindowCur;  // Current setting from last GP0(0xE2) cmd (raw form)
+       u8  TextureWindow[4];  // [0] : Texture window offset X
+                              // [1] : Texture window offset Y
+                              // [2] : Texture window mask X
+                              // [3] : Texture window mask Y
+
+       u16 DrawingArea[4];    // [0] : Drawing area top left X
+                              // [1] : Drawing area top left Y
+                              // [2] : Drawing area bottom right X
+                              // [3] : Drawing area bottom right Y
+
+       s16 DrawingOffset[2];  // [0] : Drawing offset X (signed)
+                              // [1] : Drawing offset Y (signed)
+
+       u16* TBA;              // Ptr to current texture in VRAM
+       u16* CBA;              // Ptr to current CLUT in VRAM
+
+       ////////////////////////////////////////////////////////////////////////////
+       //  Inner Loop parameters
+
+       // 22.10 Fixed-pt texture coords, mask, scanline advance
+       // NOTE: U,V are no longer packed together into one u32, this proved to be
+       //  too imprecise, leading to pixel dropouts.  Example: NFS3's skybox.
+       u32 u, v;
+       u32 u_msk, v_msk;
+       s32 u_inc, v_inc;
+
+       // Color for Gouraud-shaded prims
+       // Packed fixed-pt 8.3:8.3:8.2 rgb triplet
+       //  layout:  rrrrrrrrXXXggggggggXXXbbbbbbbbXX
+       //           ^ bit 31                       ^ bit 0
+       u32 gCol;
+       u32 gInc;          // Increment along scanline for gCol
+
+       // Color for flat-shaded, texture-blended prims
+       u8  r5, g5, b5;    // 5-bit light for undithered prims
+       u8  r8, g8, b8;    // 8-bit light for dithered prims
+
+       // Color for flat-shaded, untextured prims
+       u16 PixelData;      // bgr555 color for untextured flat-shaded polys
+
+       // End of inner Loop parameters
+       ////////////////////////////////////////////////////////////////////////////
+
+
+       u8 blit_mask;           // Determines what pixels to skip when rendering.
+                               //  Only useful on low-resolution devices using
+                               //  a simple pixel-dropping downscaler for PS1
+                               //  high-res modes. See 'pixel_skip' option.
+
+       u8 ilace_mask;          // Determines what lines to skip when rendering.
+                               //  Normally 0 when PS1 240 vertical res is in
+                               //  use and ilace_force is 0. When running in
+                               //  PS1 480 vertical res on a low-resolution
+                               //  device (320x240), will usually be set to 1
+                               //  so odd lines are not rendered. (Unless future
+                               //  full-screen scaling option is in use ..TODO)
+
+       bool prog_ilace_flag;   // Tracks successive frames for 'prog_ilace' option
+
+       u8 BLEND_MODE;
+       u8 TEXT_MODE;  // gpuDrawS reads (TEXT_MODE >> 5) and treats the value 3 as 16bpp texture
+       u8 Masking;
+
+       u16 PixelMSB;
+
+       gpu_senquack_config_t config;  // Option values read by the *Enabled() helper functions
+
+       u8  LightLUT[32*32];    // 5-bit lighting LUT (gpu_inner_light.h)
+       u32 DitherMatrix[64];   // Matrix of dither coefficients
+};
+
+static gpu_senquack_t gpu_senquack;  // The plugin's GPU state instance (internal linkage)
+
+// Global config that the frontend can alter. Values are read in GPU_init().
+// TODO: if frontend menu modifies a setting, add a function that can notify
+// GPU plugin to use new setting.
+gpu_senquack_config_t gpu_senquack_config_ext;
+
+///////////////////////////////////////////////////////////////////////////////
+// Internal inline funcs to get option status: (Allows flexibility)
+static inline bool LightingEnabled()  // Reports the 'lighting' config option
+{
+       return gpu_senquack.config.lighting;
+}
+
+static inline bool FastLightingEnabled()  // Reports the 'fast_lighting' config option
+{
+       return gpu_senquack.config.fast_lighting;
+}
+
+static inline bool BlendingEnabled()  // Reports the 'blending' config option
+{
+       return gpu_senquack.config.blending;
+}
+
+static inline bool DitheringEnabled()  // Reports the 'dithering' config option
+{
+       return gpu_senquack.config.dithering;
+}
+
+// For now, this is just for development/experimentation purposes.
+// If modified to return true, it will allow ignoring the status register
+//  bit 9 setting (dither enable). It will still restrict dithering only
+//  to Gouraud-shaded or texture-blended polys.
+static inline bool ForcedDitheringEnabled()
+{
+       return false;  // Hard-coded off; not driven by any config field
+}
+
+static inline bool ProgressiveInterlaceEnabled()  // Reports the 'prog_ilace' option; always false in gpulib builds
+{
+#ifdef USE_GPULIB
+       // Using this old option greatly decreases quality of image. Disabled
+       //  for now when using new gpulib, since it also adds more work in loops.
+       return false;
+#else
+       return gpu_senquack.config.prog_ilace;
+#endif
+}
+
+// For now, 320x240 output resolution is assumed, using simple line-skipping
+//  and pixel-skipping downscaler.
+// TODO: Flesh these out so they return useful values based on whether
+//       running on higher-res device or a resampling downscaler is enabled.
+static inline bool PixelSkipEnabled()  // True when either 'pixel_skip' or 'scale_hires' is set in config
+{
+       return gpu_senquack.config.pixel_skip || gpu_senquack.config.scale_hires;
+}
+
+static inline bool LineSkipEnabled()  // Currently unconditional; no config field is consulted
+{
+       return true;
+}
+
+#endif // GPU_UNAI_H
diff --git a/plugins/gpu_senquack/gpulib_if.cpp b/plugins/gpu_senquack/gpulib_if.cpp
new file mode 100644 (file)
index 0000000..c8452a3
--- /dev/null
@@ -0,0 +1,642 @@
+/***************************************************************************
+*   Copyright (C) 2010 PCSX4ALL Team                                      *
+*   Copyright (C) 2010 Unai                                               *
+*   Copyright (C) 2011 notaz                                              *
+*   Copyright (C) 2016 Senquack (dansilsby <AT> gmail <DOT> com)          *
+*                                                                         *
+*   This program is free software; you can redistribute it and/or modify  *
+*   it under the terms of the GNU General Public License as published by  *
+*   the Free Software Foundation; either version 2 of the License, or     *
+*   (at your option) any later version.                                   *
+*                                                                         *
+*   This program is distributed in the hope that it will be useful,       *
+*   but WITHOUT ANY WARRANTY; without even the implied warranty of        *
+*   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the         *
+*   GNU General Public License for more details.                          *
+*                                                                         *
+*   You should have received a copy of the GNU General Public License     *
+*   along with this program; if not, write to the                         *
+*   Free Software Foundation, Inc.,                                       *
+*   51 Franklin Street, Fifth Floor, Boston, MA 02111-1307 USA.           *
+***************************************************************************/
+
+#include <stddef.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include "../gpulib/gpu.h"
+
+//#include "port.h"
+#include "gpu_senquack.h"
+
+// GPU fixed point math
+#include "gpu_fixedpoint.h"
+
+// Inner loop driver instantiation file
+#include "gpu_inner.h"
+
+// GPU internal image drawing functions
+#include "gpu_raster_image.h"
+
+// GPU internal line drawing functions
+#include "gpu_raster_line.h"
+
+// GPU internal polygon drawing functions
+#include "gpu_raster_polygon.h"
+
+// GPU internal sprite drawing functions
+#include "gpu_raster_sprite.h"
+
+// GPU command buffer execution/store
+#include "gpu_command.h"
+
+/////////////////////////////////////////////////////////////////////////////
+
+int renderer_init(void)
+{
+  memset((void*)&gpu_senquack, 0, sizeof(gpu_senquack));
+  gpu_senquack.vram = (u16*)gpu.vram;
+
+  // Original standalone gpu_senquack initialized TextureWindow[]. I added the
+  //  same behavior here, since it seems unsafe to leave [2],[3] unset when
+  //  using HLE and Rearmed gpu_neon sets this similarly on init. -senquack
+  gpu_senquack.TextureWindow[0] = 0;
+  gpu_senquack.TextureWindow[1] = 0;
+  gpu_senquack.TextureWindow[2] = 255;
+  gpu_senquack.TextureWindow[3] = 255;
+  //senquack - new vars must be updated whenever texture window is changed:
+  //           (used for polygon-drawing in gpu_inner.h, gpu_raster_polygon.h)
+  const u32 fb = FIXED_BITS;  // # of fractional fixed-pt bits of u4/v4
+  gpu_senquack.u_msk = (((u32)gpu_senquack.TextureWindow[2]) << fb) | ((1 << fb) - 1);
+  gpu_senquack.v_msk = (((u32)gpu_senquack.TextureWindow[3]) << fb) | ((1 << fb) - 1);
+
+  // Configuration options
+  gpu_senquack.config = gpu_senquack_config_ext;
+  //senquack - disabled, not sure this is needed and would require modifying
+  // sprite-span functions, perhaps unnecessarily. No Abe Oddysey hack was
+  // present in latest PCSX4ALL sources we were using.
+  //gpu_senquack.config.enableAbbeyHack = gpu_senquack_config_ext.abe_hack;
+  gpu_senquack.ilace_mask = gpu_senquack.config.ilace_force;
+
+#ifdef GPU_UNAI_USE_INT_DIV_MULTINV
+  // s_invTable
+  for(int i=1;i<=(1<<TABLE_BITS);++i)
+  {
+    double v = 1.0 / double(i);
+#ifdef GPU_TABLE_10_BITS
+    v *= double(0xffffffff>>1);
+#else
+    v *= double(0x80000000);
+#endif
+    s_invTable[i-1]=s32(v);
+  }
+#endif
+
+  SetupLightLUT();
+  SetupDitheringConstants();
+
+  return 0;
+}
+
+// Renderer teardown hook. Intentionally empty: this function releases
+//  nothing and leaves gpu_senquack untouched.
+void renderer_finish(void)
+{
+}
+
+void renderer_notify_res_change(void)
+{
+  if (PixelSkipEnabled()) {
+    // Set blit_mask for high horizontal resolutions. This allows skipping
+    //  rendering pixels that would never get displayed on low-resolution
+    //  platforms that use simple pixel-dropping scaler.
+
+    switch (gpu.screen.hres)
+    {
+      case 512: gpu_senquack.blit_mask = 0xa4; break; // GPU_BlitWWSWWSWS
+      case 640: gpu_senquack.blit_mask = 0xaa; break; // GPU_BlitWS
+      default:  gpu_senquack.blit_mask = 0;    break;
+    }
+  } else {
+    gpu_senquack.blit_mask = 0;
+  }
+
+  if (LineSkipEnabled()) {
+    // Set rendering line-skip (only render every other line in high-res
+    //  480 vertical mode, or, optionally, force it for all video modes)
+
+    if (gpu.screen.vres == 480) {
+      if (gpu_senquack.config.ilace_force) {
+        gpu_senquack.ilace_mask = 3; // Only need 1/4 of lines
+      } else {
+        gpu_senquack.ilace_mask = 1; // Only need 1/2 of lines
+      }
+    } else {
+      // Vert resolution changed from 480 to lower one
+      gpu_senquack.ilace_mask = gpu_senquack.config.ilace_force;
+    }
+  } else {
+    gpu_senquack.ilace_mask = 0;
+  }
+
+  /*
+  printf("res change hres: %d   vres: %d   depth: %d   ilace_mask: %d\n",
+      gpu.screen.hres, gpu.screen.vres, gpu.status.rgb24 ? 24 : 15,
+      gpu_senquack.ilace_mask);
+  */
+}
+
+#ifdef USE_GPULIB
+// Handles GP0 draw settings commands 0xE1...0xE6
+static void gpuGP0Cmd_0xEx(gpu_senquack_t &gpu_senquack, u32 cmd_word)
+{
+  // Assume incoming GP0 command is 0xE1..0xE6, convert to 1..6
+  u8 num = (cmd_word >> 24) & 7;
+  gpu.ex_regs[num] = cmd_word; // Update gpulib register
+  switch (num) {
+    case 1: {
+      // GP0(E1h) - Draw Mode setting (aka "Texpage")
+      u32 cur_texpage = gpu_senquack.GPU_GP1 & 0x7FF;
+      u32 new_texpage = cmd_word & 0x7FF;
+      if (cur_texpage != new_texpage) {
+        gpu_senquack.GPU_GP1 = (gpu_senquack.GPU_GP1 & ~0x7FF) | new_texpage;
+        gpuSetTexture(gpu_senquack.GPU_GP1);
+      }
+    } break;
+
+    case 2: {
+      // GP0(E2h) - Texture Window setting
+      if (cmd_word != gpu_senquack.TextureWindowCur) {
+        static const u8 TextureMask[32] = {
+          255, 7, 15, 7, 31, 7, 15, 7, 63, 7, 15, 7, 31, 7, 15, 7,
+          127, 7, 15, 7, 31, 7, 15, 7, 63, 7, 15, 7, 31, 7, 15, 7
+        };
+        gpu_senquack.TextureWindowCur = cmd_word;
+        gpu_senquack.TextureWindow[0] = ((cmd_word >> 10) & 0x1F) << 3;
+        gpu_senquack.TextureWindow[1] = ((cmd_word >> 15) & 0x1F) << 3;
+        gpu_senquack.TextureWindow[2] = TextureMask[(cmd_word >> 0) & 0x1F];
+        gpu_senquack.TextureWindow[3] = TextureMask[(cmd_word >> 5) & 0x1F];
+        gpu_senquack.TextureWindow[0] &= ~gpu_senquack.TextureWindow[2];
+        gpu_senquack.TextureWindow[1] &= ~gpu_senquack.TextureWindow[3];
+
+        // Inner loop vars must be updated whenever texture window is changed:
+        const u32 fb = FIXED_BITS;  // # of fractional fixed-pt bits of u4/v4
+        gpu_senquack.u_msk = (((u32)gpu_senquack.TextureWindow[2]) << fb) | ((1 << fb) - 1);
+        gpu_senquack.v_msk = (((u32)gpu_senquack.TextureWindow[3]) << fb) | ((1 << fb) - 1);
+
+        gpuSetTexture(gpu_senquack.GPU_GP1);
+      }
+    } break;
+
+    case 3: {
+      // GP0(E3h) - Set Drawing Area top left (X1,Y1)
+      gpu_senquack.DrawingArea[0] = cmd_word         & 0x3FF;
+      gpu_senquack.DrawingArea[1] = (cmd_word >> 10) & 0x3FF;
+    } break;
+
+    case 4: {
+      // GP0(E4h) - Set Drawing Area bottom right (X2,Y2)
+      gpu_senquack.DrawingArea[2] = (cmd_word         & 0x3FF) + 1;
+      gpu_senquack.DrawingArea[3] = ((cmd_word >> 10) & 0x3FF) + 1;
+    } break;
+
+    case 5: {
+      // GP0(E5h) - Set Drawing Offset (X,Y)
+      gpu_senquack.DrawingOffset[0] = ((s32)cmd_word<<(32-11))>>(32-11);
+      gpu_senquack.DrawingOffset[1] = ((s32)cmd_word<<(32-22))>>(32-11);
+    } break;
+
+    case 6: {
+      // GP0(E6h) - Mask Bit Setting
+      gpu_senquack.Masking  = (cmd_word & 0x2) <<  1;
+      gpu_senquack.PixelMSB = (cmd_word & 0x1) <<  8;
+    } break;
+  }
+}
+#endif
+
+extern const unsigned char cmd_lengths[256];
+
+// Executes a buffer of GP0 command words.
+//  list:     first 32-bit word of the command buffer
+//  list_len: number of 32-bit words available in 'list'
+//  last_cmd: out; receives the last command byte examined, or -1 when
+//            processing stopped because a packet was incomplete
+// Returns the number of words processed, i.e. the distance from the start
+//  of 'list' to the first word that was not handled.
+//
+// For each packet the command byte is the top 8 bits of its first word and
+//  cmd_lengths[cmd] gives the number of parameter words that follow. The
+//  packet is copied into gpu_senquack.PacketBuffer and dispatched to the
+//  matching draw routine. Command bytes with no case in the switch are
+//  skipped without any action.
+int do_cmd_list(u32 *list, int list_len, int *last_cmd)
+{
+  u32 cmd = 0, len, i;
+  u32 *list_start = list;
+  u32 *list_end = list + list_len;
+
+  //TODO: set ilace_mask when resolution changes instead of every time,
+  // eliminate #ifdef below.
+  gpu_senquack.ilace_mask = gpu_senquack.config.ilace_force;
+
+#ifdef HAVE_PRE_ARMV7 /* XXX */
+  gpu_senquack.ilace_mask |= gpu.status.interlace;
+#endif
+  if (gpu_senquack.config.scale_hires) {
+    gpu_senquack.ilace_mask |= gpu.status.interlace;
+  }
+
+  // 'len' may be enlarged inside the switch (line strips), so the loop
+  //  increment always skips everything the current packet consumed.
+  for (; list < list_end; list += 1 + len)
+  {
+    cmd = *list >> 24;
+    len = cmd_lengths[cmd];
+    // Incomplete packet: report -1 and leave its words unconsumed
+    if (list + 1 + len > list_end) {
+      cmd = -1;
+      break;
+    }
+
+    #define PRIM cmd
+    // Copy the command word plus its 'len' parameter words
+    gpu_senquack.PacketBuffer.U4[0] = list[0];
+    for (i = 1; i <= len; i++)
+      gpu_senquack.PacketBuffer.U4[i] = list[i];
+
+    PtrUnion packet = { .ptr = (void*)&gpu_senquack.PacketBuffer };
+
+    switch (cmd)
+    {
+      case 0x02:
+        gpuClearImage(packet);
+        break;
+
+      case 0x20:
+      case 0x21:
+      case 0x22:
+      case 0x23: {          // Monochrome 3-pt poly
+        PP driver = gpuPolySpanDrivers[
+          (gpu_senquack.blit_mask?1024:0) |
+          Blending_Mode |
+          gpu_senquack.Masking | Blending | gpu_senquack.PixelMSB
+        ];
+        gpuDrawPolyF(packet, driver, false);
+      } break;
+
+      case 0x24:
+      case 0x25:
+      case 0x26:
+      case 0x27: {          // Textured 3-pt poly
+        gpuSetCLUT   (gpu_senquack.PacketBuffer.U4[2] >> 16);
+        gpuSetTexture(gpu_senquack.PacketBuffer.U4[4] >> 16);
+
+        u32 driver_idx =
+          (gpu_senquack.blit_mask?1024:0) |
+          Dithering |
+          Blending_Mode | gpu_senquack.TEXT_MODE |
+          gpu_senquack.Masking | Blending | gpu_senquack.PixelMSB;
+
+        // With fast lighting on, the Lighting bit is left out when all
+        //  three color bytes are above 0x5F
+        if (!FastLightingEnabled()) {
+          driver_idx |= Lighting;
+        } else {
+          if (!((gpu_senquack.PacketBuffer.U1[0]>0x5F) && (gpu_senquack.PacketBuffer.U1[1]>0x5F) && (gpu_senquack.PacketBuffer.U1[2]>0x5F)))
+            driver_idx |= Lighting;
+        }
+
+        PP driver = gpuPolySpanDrivers[driver_idx];
+        gpuDrawPolyFT(packet, driver, false);
+      } break;
+
+      case 0x28:
+      case 0x29:
+      case 0x2A:
+      case 0x2B: {          // Monochrome 4-pt poly
+        PP driver = gpuPolySpanDrivers[
+          (gpu_senquack.blit_mask?1024:0) |
+          Blending_Mode |
+          gpu_senquack.Masking | Blending | gpu_senquack.PixelMSB
+        ];
+        gpuDrawPolyF(packet, driver, true); // is_quad = true
+      } break;
+
+      case 0x2C:
+      case 0x2D:
+      case 0x2E:
+      case 0x2F: {          // Textured 4-pt poly
+        gpuSetCLUT   (gpu_senquack.PacketBuffer.U4[2] >> 16);
+        gpuSetTexture(gpu_senquack.PacketBuffer.U4[4] >> 16);
+
+        u32 driver_idx =
+          (gpu_senquack.blit_mask?1024:0) |
+          Dithering |
+          Blending_Mode | gpu_senquack.TEXT_MODE |
+          gpu_senquack.Masking | Blending | gpu_senquack.PixelMSB;
+
+        // Same fast-lighting rule as the textured 3-pt poly above
+        if (!FastLightingEnabled()) {
+          driver_idx |= Lighting;
+        } else {
+          if (!((gpu_senquack.PacketBuffer.U1[0]>0x5F) && (gpu_senquack.PacketBuffer.U1[1]>0x5F) && (gpu_senquack.PacketBuffer.U1[2]>0x5F)))
+            driver_idx |= Lighting;
+        }
+
+        PP driver = gpuPolySpanDrivers[driver_idx];
+        gpuDrawPolyFT(packet, driver, true); // is_quad = true
+      } break;
+
+      case 0x30:
+      case 0x31:
+      case 0x32:
+      case 0x33: {          // Gouraud-shaded 3-pt poly
+        //NOTE: The '129' here is CF_GOURAUD | CF_LIGHT, however
+        // this is an untextured poly, so CF_LIGHT (texture blend)
+        // shouldn't apply. Until the original array of template
+        // instantiation ptrs is fixed, we're stuck with this. (TODO)
+        PP driver = gpuPolySpanDrivers[
+          (gpu_senquack.blit_mask?1024:0) |
+          Dithering |
+          Blending_Mode |
+          gpu_senquack.Masking | Blending | 129 | gpu_senquack.PixelMSB
+        ];
+        gpuDrawPolyG(packet, driver, false);
+      } break;
+
+      case 0x34:
+      case 0x35:
+      case 0x36:
+      case 0x37: {          // Gouraud-shaded, textured 3-pt poly
+        gpuSetCLUT    (gpu_senquack.PacketBuffer.U4[2] >> 16);
+        gpuSetTexture (gpu_senquack.PacketBuffer.U4[5] >> 16);
+        PP driver = gpuPolySpanDrivers[
+          (gpu_senquack.blit_mask?1024:0) |
+          Dithering |
+          Blending_Mode | gpu_senquack.TEXT_MODE |
+          gpu_senquack.Masking | Blending | ((Lighting)?129:0) | gpu_senquack.PixelMSB
+        ];
+        gpuDrawPolyGT(packet, driver, false);
+      } break;
+
+      case 0x38:
+      case 0x39:
+      case 0x3A:
+      case 0x3B: {          // Gouraud-shaded 4-pt poly
+        // See notes regarding '129' for 0x30..0x33 further above -senquack
+        PP driver = gpuPolySpanDrivers[
+          (gpu_senquack.blit_mask?1024:0) |
+          Dithering |
+          Blending_Mode |
+          gpu_senquack.Masking | Blending | 129 | gpu_senquack.PixelMSB
+        ];
+        gpuDrawPolyG(packet, driver, true); // is_quad = true
+      } break;
+
+      case 0x3C:
+      case 0x3D:
+      case 0x3E:
+      case 0x3F: {          // Gouraud-shaded, textured 4-pt poly
+        gpuSetCLUT    (gpu_senquack.PacketBuffer.U4[2] >> 16);
+        gpuSetTexture (gpu_senquack.PacketBuffer.U4[5] >> 16);
+        PP driver = gpuPolySpanDrivers[
+          (gpu_senquack.blit_mask?1024:0) |
+          Dithering |
+          Blending_Mode | gpu_senquack.TEXT_MODE |
+          gpu_senquack.Masking | Blending | ((Lighting)?129:0) | gpu_senquack.PixelMSB
+        ];
+        gpuDrawPolyGT(packet, driver, true); // is_quad = true
+      } break;
+
+      case 0x40:
+      case 0x41:
+      case 0x42:
+      case 0x43: {          // Monochrome line
+        // Shift index right by one, as untextured prims don't use lighting
+        u32 driver_idx = (Blending_Mode | gpu_senquack.Masking | Blending | (gpu_senquack.PixelMSB>>3)) >> 1;
+        PSD driver = gpuPixelSpanDrivers[driver_idx];
+        gpuDrawLineF(packet, driver);
+      } break;
+
+      case 0x48 ... 0x4F: { // Monochrome line strip
+        u32 num_vertexes = 1;
+        // Read cursor for the strip's remaining vertex words
+        u32 *list_position = &(list[2]);
+
+        // Shift index right by one, as untextured prims don't use lighting
+        u32 driver_idx = (Blending_Mode | gpu_senquack.Masking | Blending | (gpu_senquack.PixelMSB>>3)) >> 1;
+        PSD driver = gpuPixelSpanDrivers[driver_idx];
+        gpuDrawLineF(packet, driver);
+
+        // Each pass moves the previous end point into U4[1], loads the next
+        //  word as the new end point and draws that segment.
+        while(1)
+        {
+          gpu_senquack.PacketBuffer.U4[1] = gpu_senquack.PacketBuffer.U4[2];
+          gpu_senquack.PacketBuffer.U4[2] = *list_position++;
+          gpuDrawLineF(packet, driver);
+
+          num_vertexes++;
+          // Ran out of data before a terminator was seen: report -1.
+          //  'list' is not advanced, so the return value excludes this strip.
+          if(list_position >= list_end) {
+            cmd = -1;
+            goto breakloop;
+          }
+          // A word matching 0x5xxx5xxx ends the strip
+          if((*list_position & 0xf000f000) == 0x50005000)
+            break;
+        }
+
+        // Extend 'len' by the extra words read, so the loop increment
+        //  skips them too
+        len += (num_vertexes - 2);
+      } break;
+
+      case 0x50:
+      case 0x51:
+      case 0x52:
+      case 0x53: {          // Gouraud-shaded line
+        // Shift index right by one, as untextured prims don't use lighting
+        u32 driver_idx = (Blending_Mode | gpu_senquack.Masking | Blending | (gpu_senquack.PixelMSB>>3)) >> 1;
+        // Index MSB selects Gouraud-shaded PixelSpanDriver:
+        driver_idx |= (1 << 5);
+        PSD driver = gpuPixelSpanDrivers[driver_idx];
+        gpuDrawLineG(packet, driver);
+      } break;
+
+      case 0x58 ... 0x5F: { // Gouraud-shaded line strip
+        u32 num_vertexes = 1;
+        // Read cursor for the strip's remaining words
+        u32 *list_position = &(list[2]);
+
+        // Shift index right by one, as untextured prims don't use lighting
+        u32 driver_idx = (Blending_Mode | gpu_senquack.Masking | Blending | (gpu_senquack.PixelMSB>>3)) >> 1;
+        // Index MSB selects Gouraud-shaded PixelSpanDriver:
+        driver_idx |= (1 << 5);
+        PSD driver = gpuPixelSpanDrivers[driver_idx];
+        gpuDrawLineG(packet, driver);
+
+        // Same scheme as the monochrome strip, but two words are moved and
+        //  two are read per pass.
+        // NOTE(review): the second read below happens before any bounds
+        //  check, so it looks able to touch the word at list_end when the
+        //  buffer is cut short - confirm against callers.
+        while(1)
+        {
+          gpu_senquack.PacketBuffer.U4[0] = gpu_senquack.PacketBuffer.U4[2];
+          gpu_senquack.PacketBuffer.U4[1] = gpu_senquack.PacketBuffer.U4[3];
+          gpu_senquack.PacketBuffer.U4[2] = *list_position++;
+          gpu_senquack.PacketBuffer.U4[3] = *list_position++;
+          gpuDrawLineG(packet, driver);
+
+          num_vertexes++;
+          // Ran out of data before a terminator was seen: report -1
+          if(list_position >= list_end) {
+            cmd = -1;
+            goto breakloop;
+          }
+          // A word matching 0x5xxx5xxx ends the strip
+          if((*list_position & 0xf000f000) == 0x50005000)
+            break;
+        }
+
+        // Two words per extra vertex
+        len += (num_vertexes - 2) * 2;
+      } break;
+
+      case 0x60:
+      case 0x61:
+      case 0x62:
+      case 0x63: {          // Monochrome rectangle (variable size)
+        PT driver = gpuTileSpanDrivers[(Blending_Mode | gpu_senquack.Masking | Blending | (gpu_senquack.PixelMSB>>3)) >> 1];
+        gpuDrawT(packet, driver);
+      } break;
+
+      case 0x64:
+      case 0x65:
+      case 0x66:
+      case 0x67: {          // Textured rectangle (variable size)
+        gpuSetCLUT    (gpu_senquack.PacketBuffer.U4[2] >> 16);
+        u32 driver_idx = Blending_Mode | gpu_senquack.TEXT_MODE | gpu_senquack.Masking | Blending | (gpu_senquack.PixelMSB>>1);
+
+        //senquack - Only color 808080h-878787h allows skipping lighting calculation:
+        // This fixes Silent Hill running animation on loading screens:
+        // (On PSX, color values 0x00-0x7F darken the source texture's color,
+        //  0x81-FF lighten textures (ultimately clamped to 0x1F),
+        //  0x80 leaves source texture color unchanged, HOWEVER,
+        //   gpu_senquack uses a simple lighting LUT whereby only the upper
+        //   5 bits of an 8-bit color are used, so 0x80-0x87 all behave as
+        //   0x80.
+        //
+        // NOTE: I've changed all textured sprite draw commands here and
+        //  elsewhere to use proper behavior, but left poly commands
+        //  alone, I don't want to slow rendering down too much. (TODO)
+        //if ((gpu_senquack.PacketBuffer.U1[0]>0x5F) && (gpu_senquack.PacketBuffer.U1[1]>0x5F) && (gpu_senquack.PacketBuffer.U1[2]>0x5F))
+        // Strip lower 3 bits of each color and determine if lighting should be used:
+        if ((gpu_senquack.PacketBuffer.U4[0] & 0xF8F8F8) != 0x808080)
+          driver_idx |= Lighting;
+        PS driver = gpuSpriteSpanDrivers[driver_idx];
+        gpuDrawS(packet, driver);
+      } break;
+
+      case 0x68:
+      case 0x69:
+      case 0x6A:
+      case 0x6B: {          // Monochrome rectangle (1x1 dot)
+        // Fixed-size variant: write the size word into the packet copy
+        gpu_senquack.PacketBuffer.U4[2] = 0x00010001;
+        PT driver = gpuTileSpanDrivers[(Blending_Mode | gpu_senquack.Masking | Blending | (gpu_senquack.PixelMSB>>3)) >> 1];
+        gpuDrawT(packet, driver);
+      } break;
+
+      case 0x70:
+      case 0x71:
+      case 0x72:
+      case 0x73: {          // Monochrome rectangle (8x8)
+        gpu_senquack.PacketBuffer.U4[2] = 0x00080008;
+        PT driver = gpuTileSpanDrivers[(Blending_Mode | gpu_senquack.Masking | Blending | (gpu_senquack.PixelMSB>>3)) >> 1];
+        gpuDrawT(packet, driver);
+      } break;
+
+      case 0x74:
+      case 0x75:
+      case 0x76:
+      case 0x77: {          // Textured rectangle (8x8)
+        gpu_senquack.PacketBuffer.U4[3] = 0x00080008;
+        gpuSetCLUT    (gpu_senquack.PacketBuffer.U4[2] >> 16);
+        u32 driver_idx = Blending_Mode | gpu_senquack.TEXT_MODE | gpu_senquack.Masking | Blending | (gpu_senquack.PixelMSB>>1);
+
+        //senquack - Only color 808080h-878787h allows skipping lighting calculation:
+        //if ((gpu_senquack.PacketBuffer.U1[0]>0x5F) && (gpu_senquack.PacketBuffer.U1[1]>0x5F) && (gpu_senquack.PacketBuffer.U1[2]>0x5F))
+        // Strip lower 3 bits of each color and determine if lighting should be used:
+        if ((gpu_senquack.PacketBuffer.U4[0] & 0xF8F8F8) != 0x808080)
+          driver_idx |= Lighting;
+        PS driver = gpuSpriteSpanDrivers[driver_idx];
+        gpuDrawS(packet, driver);
+      } break;
+
+      case 0x78:
+      case 0x79:
+      case 0x7A:
+      case 0x7B: {          // Monochrome rectangle (16x16)
+        gpu_senquack.PacketBuffer.U4[2] = 0x00100010;
+        PT driver = gpuTileSpanDrivers[(Blending_Mode | gpu_senquack.Masking | Blending | (gpu_senquack.PixelMSB>>3)) >> 1];
+        gpuDrawT(packet, driver);
+      } break;
+
+      case 0x7C:
+      case 0x7D:
+#ifdef __arm__
+        // ARM-only shortcut through gpuDrawS16, taken when bits 7-8 of
+        //  GPU_GP1 are clear and neither Masking nor PixelMSB is set.
+        //  Otherwise falls through to the generic 16x16 path below.
+        if ((gpu_senquack.GPU_GP1 & 0x180) == 0 && (gpu_senquack.Masking | gpu_senquack.PixelMSB) == 0)
+        {
+          gpuSetCLUT    (gpu_senquack.PacketBuffer.U4[2] >> 16);
+          gpuDrawS16(packet);
+          break;
+        }
+        // fallthrough
+#endif
+      case 0x7E:
+      case 0x7F: {          // Textured rectangle (16x16)
+        gpu_senquack.PacketBuffer.U4[3] = 0x00100010;
+        gpuSetCLUT    (gpu_senquack.PacketBuffer.U4[2] >> 16);
+        u32 driver_idx = Blending_Mode | gpu_senquack.TEXT_MODE | gpu_senquack.Masking | Blending | (gpu_senquack.PixelMSB>>1);
+        //senquack - Only color 808080h-878787h allows skipping lighting calculation:
+        //if ((gpu_senquack.PacketBuffer.U1[0]>0x5F) && (gpu_senquack.PacketBuffer.U1[1]>0x5F) && (gpu_senquack.PacketBuffer.U1[2]>0x5F))
+        // Strip lower 3 bits of each color and determine if lighting should be used:
+        if ((gpu_senquack.PacketBuffer.U4[0] & 0xF8F8F8) != 0x808080)
+          driver_idx |= Lighting;
+        PS driver = gpuSpriteSpanDrivers[driver_idx];
+        gpuDrawS(packet, driver);
+      } break;
+
+      case 0x80:          //  vid -> vid
+        gpuMoveImage(packet);
+        break;
+
+#ifdef TEST
+      case 0xA0:          //  sys -> vid
+      {
+        // TEST builds only skip over the image payload
+        u32 load_width = list[2] & 0xffff;
+        u32 load_height = list[2] >> 16;
+        u32 load_size = load_width * load_height;
+
+        len += load_size / 2;
+      } break;
+
+      case 0xC0:
+        break;
+#else
+      case 0xA0:          //  sys ->vid
+      case 0xC0:          //  vid -> sys
+        // Handled by gpulib
+        // Stop here without consuming the command, so the returned count
+        //  points at it.
+        goto breakloop;
+#endif
+      case 0xE1 ... 0xE6: { // Draw settings
+        gpuGP0Cmd_0xEx(gpu_senquack, gpu_senquack.PacketBuffer.U4[0]);
+      } break;
+    }
+  }
+
+breakloop:
+  // Copy the low 9 bits of GPU_GP1 back into gpulib's ex_regs[1]
+  gpu.ex_regs[1] &= ~0x1ff;
+  gpu.ex_regs[1] |= gpu_senquack.GPU_GP1 & 0x1ff;
+
+  *last_cmd = cmd;
+  return list - list_start;
+}
+
+// Replays the six words ecmds[1]..ecmds[6] through do_cmd_list() so the
+//  renderer state picks them up. The reported last command is discarded.
+void renderer_sync_ecmds(uint32_t *ecmds)
+{
+  int ignored_last_cmd;
+  do_cmd_list(ecmds + 1, 6, &ignored_last_cmd);
+}
+
+// Cache-update hook for the rectangle (x, y, w, h). Intentionally empty:
+//  all four arguments are ignored and no state is changed.
+void renderer_update_caches(int x, int y, int w, int h)
+{
+}
+
+// Queue-flush hook. Intentionally empty: this function does nothing.
+void renderer_flush_queues(void)
+{
+}
+
+// Interlace hook. Intentionally empty: 'enable' and 'is_odd' are ignored.
+//  (ilace_mask is instead derived from the config and gpu.status in
+//  do_cmd_list() and renderer_notify_res_change().)
+void renderer_set_interlace(int enable, int is_odd)
+{
+}
+
+#include "../../frontend/plugin_lib.h"
+// Handle any gpulib settings applicable to gpu_senquack:
+//  refreshes the VRAM pointer from gpulib and copies each frontend option
+//  found in cbs->gpu_senquack into the live renderer config.
+void renderer_set_config(const struct rearmed_cbs *cbs)
+{
+  gpu_senquack_config_t &cfg = gpu_senquack.config;
+
+  gpu_senquack.vram = (u16*)gpu.vram;
+
+  // Rendering feature toggles
+  cfg.lighting      = cbs->gpu_senquack.lighting;
+  cfg.fast_lighting = cbs->gpu_senquack.fast_lighting;
+  cfg.blending      = cbs->gpu_senquack.blending;
+  cfg.dithering     = cbs->gpu_senquack.dithering;
+
+  // Line/pixel skipping and scaling options
+  cfg.ilace_force   = cbs->gpu_senquack.ilace_force;
+  cfg.pixel_skip    = cbs->gpu_senquack.pixel_skip;
+  cfg.scale_hires   = cbs->gpu_senquack.scale_hires;
+}
+
+// vim:shiftwidth=2:expandtab
diff --git a/plugins/gpu_senquack/port.h b/plugins/gpu_senquack/port.h
new file mode 100644 (file)
index 0000000..0a731f8
--- /dev/null
@@ -0,0 +1,41 @@
+// Port glue header: maps the plugin's GPU_* names onto GPU* entry points
+//  and declares those entry points with C linkage.
+// NOTE(review): gpulib_if.cpp has its include of this header commented
+//  out, so it may be unused in the gpulib build - confirm.
+#ifndef __GPU_UNAI_GPU_PORT_H__
+#define __GPU_UNAI_GPU_PORT_H__
+
+#include <stddef.h>
+#include <string.h>
+
+#define INLINE static inline
+
+// Rename the GPU_* identifiers to the GPU* spellings declared below
+#define GPU_init       GPUinit
+#define GPU_shutdown   GPUshutdown
+//#define GPU_freeze   GPUfreeze
+#define GPU_writeDataMem GPUwriteDataMem
+#define GPU_dmaChain   GPUdmaChain
+#define GPU_writeData  GPUwriteData
+#define GPU_readDataMem        GPUreadDataMem
+#define GPU_readData   GPUreadData
+#define GPU_readStatus GPUreadStatus
+#define GPU_writeStatus        GPUwriteStatus
+// Note: no GPUupdateLace prototype is declared in this header
+#define GPU_updateLace GPUupdateLace
+
+extern "C" {
+
+// u32/s32 are defined as macros only for the prototypes below and are
+//  removed again afterwards, so they do not leak into including files.
+#define u32 unsigned int
+#define s32 signed int
+
+bool GPUinit(void);
+void GPUshutdown(void);
+void GPUwriteDataMem(u32* dmaAddress, s32 dmaCount);
+long GPUdmaChain(u32* baseAddr, u32 dmaVAddr);
+void GPUwriteData(u32 data);
+void GPUreadDataMem(u32* dmaAddress, s32 dmaCount);
+u32  GPUreadData(void);
+u32  GPUreadStatus(void);
+void GPUwriteStatus(u32 data);
+
+#undef u32
+#undef s32
+
+}
+
+#endif /* __GPU_UNAI_GPU_PORT_H__ */
diff --git a/plugins/gpu_senquack/profiler.h b/plugins/gpu_senquack/profiler.h
new file mode 100644 (file)
index 0000000..a23ee38
--- /dev/null
@@ -0,0 +1,9 @@
+// Profiler stubs: each pcsx4all_prof_* macro accepts any arguments and
+//  expands to nothing, so profiling calls in the sources compile away.
+#ifndef __GPU_UNAI_GPU_PROFILER_H__
+#define __GPU_UNAI_GPU_PROFILER_H__
+
+#define pcsx4all_prof_pause(...)
+#define pcsx4all_prof_start_with_pause(...)
+#define pcsx4all_prof_end_with_resume(...)
+#define pcsx4all_prof_resume(...)
+
+#endif /* __GPU_UNAI_GPU_PROFILER_H__ */
+
+#endif /* __GPU_UNAI_GPU_PROFILER_H__ */