New, separate GPU plugin based on Unai. (#233)

author gameblabla <gameblabla@users.noreply.github.com>

Fri, 29 Oct 2021 20:03:27 +0000 (20:03 +0000)

committer GitHub <noreply@github.com>

Fri, 29 Oct 2021 20:03:27 +0000 (23:03 +0300)
author gameblabla <gameblabla@users.noreply.github.com>
Fri, 29 Oct 2021 20:03:27 +0000 (20:03 +0000)
committer GitHub <noreply@github.com>
Fri, 29 Oct 2021 20:03:27 +0000 (23:03 +0300)
diff --git a/Makefile b/Makefile

index 18ef4e0..0998f58 100644 (file)
--- a/Makefile
+++ b/Makefile
@@ -129,6 +129,15 @@ plugins/gpu_unai/gpulib_if.o: CFLAGS += -DREARMED -O3
  CC_LINK = $(CXX)
  endif
  
+ifeq "$(BUILTIN_GPU)" "senquack"
+OBJS += plugins/gpu_senquack/gpulib_if.o
+ifeq "$(ARCH)" "arm"
+OBJS += plugins/gpu_senquack/gpu_arm.o
+endif
+plugins/gpu_senquack/gpulib_if.o: CFLAGS += -DREARMED -O3 
+CC_LINK = $(CXX)
+endif
+
  # cdrcimg
  OBJS += plugins/cdrcimg/cdrcimg.o
  ifeq "$(CHD_SUPPORT)" "1"
diff --git a/configure b/configure

index 5caf0f4..20ff1d5 100755 (executable)
--- a/configure
+++ b/configure
@@ -39,12 +39,12 @@ check_define_val()
  
  platform_list="generic pandora maemo caanoo libretro"
  platform="generic"
-builtin_gpu_list="peops unai neon"
+builtin_gpu_list="peops unai neon senquack"
  builtin_gpu=""
  sound_driver_list="oss alsa pulseaudio sdl libretro"
  sound_drivers=""
  plugins="plugins/spunull/spunull.so \
-plugins/dfxvideo/gpu_peops.so plugins/gpu_unai/gpu_unai.so"
+plugins/dfxvideo/gpu_peops.so plugins/gpu_unai/gpu_unai.so plugins/gpu_senquack/gpu_senquack.so"
  ram_fixed="no"
  drc_cache_base="no"
  have_armv5=""
diff --git a/frontend/main.c b/frontend/main.c

index 3bb0f4b..4631618 100644 (file)
--- a/frontend/main.c
+++ b/frontend/main.c
@@ -130,6 +130,12 @@ void emu_set_default_config(void)
         pl_rearmed_cbs.gpu_neon.enhancement_no_main = 0;
         pl_rearmed_cbs.gpu_peops.iUseDither = 0;
         pl_rearmed_cbs.gpu_peops.dwActFixes = 1<<7;
+       pl_rearmed_cbs.gpu_senquack.ilace_force = 0;
+       pl_rearmed_cbs.gpu_senquack.pixel_skip = 0;
+       pl_rearmed_cbs.gpu_senquack.lighting = 1;
+       pl_rearmed_cbs.gpu_senquack.fast_lighting = 0;
+       pl_rearmed_cbs.gpu_senquack.blending = 1;
+       pl_rearmed_cbs.gpu_senquack.dithering = 0;
         pl_rearmed_cbs.gpu_unai.abe_hack =
         pl_rearmed_cbs.gpu_unai.no_light =
         pl_rearmed_cbs.gpu_unai.no_blend = 0;
diff --git a/frontend/menu.c b/frontend/menu.c

index e2286d4..05dde46 100644 (file)
--- a/frontend/menu.c
+++ b/frontend/menu.c
@@ -430,6 +430,13 @@ static const struct {
         CE_INTVAL_P(gpu_unai.abe_hack),
         CE_INTVAL_P(gpu_unai.no_light),
         CE_INTVAL_P(gpu_unai.no_blend),
+       CE_INTVAL_P(gpu_senquack.ilace_force),
+       CE_INTVAL_P(gpu_senquack.pixel_skip),
+       CE_INTVAL_P(gpu_senquack.lighting),
+       CE_INTVAL_P(gpu_senquack.fast_lighting),
+       CE_INTVAL_P(gpu_senquack.blending),
+       CE_INTVAL_P(gpu_senquack.dithering),
+       CE_INTVAL_P(gpu_senquack.scale_hires),
         CE_INTVAL_P(gpu_neon.allow_interlace),
         CE_INTVAL_P(gpu_neon.enhancement_enable),
         CE_INTVAL_P(gpu_neon.enhancement_no_main),
@@ -1378,6 +1385,25 @@ static int menu_loop_plugin_gpu_unai(int id, int keys)
         return 0;
  }
  
+static menu_entry e_menu_plugin_gpu_senquack[] = /* entries shown by menu_loop_plugin_gpu_senquack() */
+{ /* NOTE(review): gpu_senquack.scale_hires is saved via CE_INTVAL_P in this patch but has no entry in this table */
+       mee_onoff     ("Interlace",                  0, pl_rearmed_cbs.gpu_senquack.ilace_force, 1), /* bound to ilace_force */
+       mee_onoff     ("Dithering",                  0, pl_rearmed_cbs.gpu_senquack.dithering, 1), /* bound to dithering */
+       mee_onoff     ("Lighting",                   0, pl_rearmed_cbs.gpu_senquack.lighting, 1), /* bound to lighting */
+       mee_onoff     ("Fast lighting",              0, pl_rearmed_cbs.gpu_senquack.fast_lighting, 1), /* bound to fast_lighting */
+       mee_onoff     ("Blending",                   0, pl_rearmed_cbs.gpu_senquack.blending, 1), /* bound to blending */
+       mee_onoff     ("Pixel skip",                 0, pl_rearmed_cbs.gpu_senquack.pixel_skip, 1), /* bound to pixel_skip */
+       mee_end, /* list terminator */
+};
+
+/* Menu handler for the gpu_senquack options page: hands the entry table to
+ * me_loop() and always reports 0. The 'id' and 'keys' arguments are not used. */
+static int menu_loop_plugin_gpu_senquack(int id, int keys)
+{
+       int cursor = 0; /* passed by address to me_loop(); presumably the selected row -- confirm */
+
+       me_loop(e_menu_plugin_gpu_senquack, &cursor);
+       return 0;
+}
+
+
  static const char *men_gpu_dithering[] = { "None", "Game dependant", "Always", NULL };
  //static const char h_gpu_0[]            = "Needed for Chrono Cross";
  static const char h_gpu_1[]            = "Capcom fighting games";
@@ -1479,6 +1505,7 @@ static const char h_plugin_gpu[] =
  #endif
                                    "gpu_peops is Pete's soft GPU, slow but accurate\n"
                                    "gpu_unai is GPU from PCSX4ALL, fast but glitchy\n"
+                                  "gpu_senquack is more accurate but slower\n"
                                    "gpu_gles Pete's hw GPU, uses 3D chip but is glitchy\n"
                                    "must save config and reload the game if changed";
  static const char h_plugin_spu[] = "spunull effectively disables sound\n"
@@ -1486,6 +1513,7 @@ static const char h_plugin_spu[] = "spunull effectively disables sound\n"
  static const char h_gpu_peops[]  = "Configure P.E.Op.S. SoftGL Driver V1.17";
  static const char h_gpu_peopsgl[]= "Configure P.E.Op.S. MesaGL Driver V1.78";
  static const char h_gpu_unai[]   = "Configure Unai/PCSX4ALL Team GPU plugin";
+static const char h_gpu_senquack[]   = "Configure Unai/PCSX4ALL Senquack plugin";
  static const char h_spu[]        = "Configure built-in P.E.Op.S. Sound Driver V1.7";
  
  static menu_entry e_menu_plugin_options[] =
@@ -1498,6 +1526,7 @@ static menu_entry e_menu_plugin_options[] =
  #endif
         mee_handler_h ("Configure gpu_peops plugin",    menu_loop_plugin_gpu_peops, h_gpu_peops),
         mee_handler_h ("Configure gpu_unai GPU plugin", menu_loop_plugin_gpu_unai, h_gpu_unai),
+       mee_handler_h ("Configure gpu_senquack GPU plugin", menu_loop_plugin_gpu_senquack, h_gpu_senquack),
         mee_handler_h ("Configure gpu_gles GPU plugin", menu_loop_plugin_gpu_peopsgl, h_gpu_peopsgl),
         mee_handler_h ("Configure built-in SPU plugin", menu_loop_plugin_spu, h_spu),
         mee_end,
diff --git a/frontend/plugin_lib.h b/frontend/plugin_lib.h

index 4a11002..f55eb44 100644 (file)
--- a/frontend/plugin_lib.h
+++ b/frontend/plugin_lib.h
@@ -77,6 +77,15 @@ struct rearmed_cbs {
                 int   no_light, no_blend;
                 int   lineskip;
         } gpu_unai;
+       struct {
+               int ilace_force;
+               int pixel_skip;
+               int lighting;
+               int fast_lighting;
+               int blending;
+               int dithering;
+               int scale_hires;
+       } gpu_senquack;
         struct {
                 int   dwActFixes;
                 int   bDrawDither, iFilterType, iFrameTexType;
diff --git a/plugins/gpu_senquack/Makefile b/plugins/gpu_senquack/Makefile

new file mode 100644 (file)

index 0000000..c3be35b
--- /dev/null
+++ b/plugins/gpu_senquack/Makefile
@@ -0,0 +1,19 @@
+CFLAGS += -ggdb -Wall -O3 -ffast-math
+CFLAGS += -DREARMED
+CFLAGS += -I../../include
+#CFLAGS += -DINLINE="static __inline__"
+#CFLAGS += -Dasm="__asm__ __volatile__"
+#CFLAGS += -DUSE_GPULIB=1
+
+include ../../config.mak
+
+SRC_STANDALONE += gpu.cpp
+SRC_GPULIB += gpulib_if.cpp
+
+ifeq "$(ARCH)" "arm"
+SRC += gpu_arm.S
+endif
+
+#BIN_STANDALONE = gpuPCSX4ALL.so
+BIN_GPULIB = gpu_senquack.so
+include ../gpulib/gpulib.mak
diff --git a/plugins/gpu_senquack/README_senquack.txt b/plugins/gpu_senquack/README_senquack.txt

new file mode 100644 (file)

index 0000000..cda17fc
--- /dev/null
+++ b/plugins/gpu_senquack/README_senquack.txt
@@ -0,0 +1,956 @@
+//NOTE: You can find the set of original Unai poly routines (disabled now)
+// at the bottom end of this file.
+
+//senquack - Original Unai GPU poly routines have been replaced with new
+// ones based on DrHell routines. The original routines suffered from
+// shifted rows, causing many quads to have their first triangle drawn
+// correctly, but the second triangle would randomly have pixels shifted
+// either left or right or entire rows not drawn at all. Furthermore,
+// sometimes entire triangles seemed to be either missing or only
+// partially drawn (most clearly seen in sky/road textures in NFS3,
+// clock tower in beginning of Castlevania SOTN). Pixel gaps were
+// prevalent.
+//
+// Since DrHell GPU didn't seem to exhibit these artifacts at all, I adapted
+// its routines to GPU Unai (Unai was probably already originally based on it).
+// DrHell uses 22.10 fixed point instead of Unai's 16.16, so gpu_fixedpoint.h
+// required modification as well as gpu_inner.h (where gpuPolySpanFn driver
+// functions are).
+//
+// Originally, I tried to patch up original Unai routines and got as far
+// as fixing the shifted rows, but still had other problem of triangles rendered
+// wrong (black triangular gaps in NFS3 sky, clock tower in Castlevania SOTN).
+// I eventually gave up. Even after rewriting/adapting the routines,
+// however, I still had some random pixel dropouts, specifically in
+// NFS3 sky texture. I discovered that gpu_inner.h gpuPolySpanFn function
+// was taking optimizations to an extreme and packing u/v texture coords
+// into one 32-bit word, reducing their accuracy. Only once they were
+// handled in full-accuracy individual words was that problem fixed.
+//
+// NOTE: I also added support for doing divisions using the FPU, either
+//  with normal division or multiplication-by-reciprocal.
+//  To use float division, GPU_UNAI_USE_FLOATMATH should be defined.
+//  To use float mult-by-reciprocal, GPU_UNAI_USE_FLOAT_DIV_MULTINV
+//   can be specified (GPU_UNAI_USE_FLOATMATH must also be specified)
+//  To use inaccurate fixed-point mult-by-reciprocal, define
+//   GPU_UNAI_USE_INT_DIV_MULTINV. This is the default on older
+//   ARM devices like Wiz/Caanoo that have neither integer division
+//   in hardware nor an FPU. It results in some pixel dropouts,
+//   texture glitches, but less than the original GPU UNAI code.
+//
+//  If nothing is specified, integer division will be used.
+//
+// NOTE 2: Even with MIPS32R2 having FPU recip.s instruction, and it is
+//  used when this platform is detected, I found it not to give any
+//  noticeable speedup over normal float division (in fact seemed a tiny
+//  tiny bit slower). I also found float division to not provide any
+//  noticeable speedups versus integer division on MIPS32R2 platform.
+//  Granted, the differences were all around .5 FPS or less.
+//
+// TODO:
+// * See if anything can be done about remaining pixel gaps in Gran
+//   Turismo car models, track.
+// * Find better way of passing parameters to gpuPolySpanFn functions than
+//   through original Unai method of using global variables u4,v4,du4 etc.
+// * Come up with some newer way of drawing rows of pixels than by calling
+//   gpuPolySpanFn through function pointer. For every row, at least on
+//   MIPS platforms, many registers are having to be pushed/popped from stack
+//   on each call, which is strange since MIPS has so many registers.
+// * MIPS MXU/ASM optimized gpuPolySpanFn ?
+
+//////////////////////////////////////////////////////////////////////////
+//senquack - Disabled original Unai poly routines left here for reference:
+// ( from gpu_raster_polygon.h )
+//////////////////////////////////////////////////////////////////////////
+#define GPU_TESTRANGE3() /* range guard: expands inside a draw routine, reads its x0..x2 / y0..y2 locals and may 'return' from it */ \
+{ \
+       if(x0<0) { if((x1-x0)>CHKMAX_X) return; if((x2-x0)>CHKMAX_X) return; } /* x0 negative: bail if x1 or x2 is more than CHKMAX_X beyond it */ \
+       if(x1<0) { if((x0-x1)>CHKMAX_X) return; if((x2-x1)>CHKMAX_X) return; } /* same test anchored on x1 */ \
+       if(x2<0) { if((x0-x2)>CHKMAX_X) return; if((x1-x2)>CHKMAX_X) return; } /* same test anchored on x2 */ \
+       if(y0<0) { if((y1-y0)>CHKMAX_Y) return; if((y2-y0)>CHKMAX_Y) return; } /* y0 negative: bail if y1 or y2 is more than CHKMAX_Y beyond it */ \
+       if(y1<0) { if((y0-y1)>CHKMAX_Y) return; if((y2-y1)>CHKMAX_Y) return; } /* same test anchored on y1 */ \
+       if(y2<0) { if((y0-y2)>CHKMAX_Y) return; if((y1-y2)>CHKMAX_Y) return; } /* same test anchored on y2 */ \
+}
+
+/*----------------------------------------------------------------------
+F3
+Three-vertex, single-colour polygon rasteriser (disabled original Unai
+routine kept for reference). Vertex coordinates are read from
+PacketBuffer.S2[2..7] and the colour from PacketBuffer.U4[0]; every
+visible scanline span is handed to gpuPolySpanDriver(dest, width).
+Returns early, drawing nothing, when the range guard or the clip-rect
+test rejects the polygon.
+----------------------------------------------------------------------*/
+
+void gpuDrawF3(const PP gpuPolySpanDriver)
+{
+       const int li=linesInterlace; // scanlines with (y & li) != 0 are skipped below
+       const int pi=(progressInterlace?(linesInterlace+1):0); // mask for the second skip test
+       const int pif=(progressInterlace?(progressInterlace_flag?(linesInterlace+1):0):1); // scanlines with (y & pi) == pif are skipped
+       s32 temp;
+       s32 xa, xb, xmin, xmax; // xa/xb: integer span start/end; xmin/xmax: horizontal clip bounds
+       s32 ya, yb, ymin, ymax; // ya/yb: scanline range of the current half; ymin/ymax: vertical clip bounds
+       s32 x0, x1, x2, x3, dx3=0, x4, dx4=0, dx; // x3/x4: fixed-point edge positions, dx3/dx4: their per-scanline steps
+       s32 y0, y1, y2;
+
+       x0 = GPU_EXPANDSIGN(PacketBuffer.S2[2]); // vertex 0
+       y0 = GPU_EXPANDSIGN(PacketBuffer.S2[3]);
+       x1 = GPU_EXPANDSIGN(PacketBuffer.S2[4]); // vertex 1
+       y1 = GPU_EXPANDSIGN(PacketBuffer.S2[5]);
+       x2 = GPU_EXPANDSIGN(PacketBuffer.S2[6]); // vertex 2
+       y2 = GPU_EXPANDSIGN(PacketBuffer.S2[7]);
+
+       GPU_TESTRANGE3(); // may return from this function
+       
+       x0 += DrawingOffset[0];   x1 += DrawingOffset[0];   x2 += DrawingOffset[0]; // apply drawing offset to all vertices
+       y0 += DrawingOffset[1];   y1 += DrawingOffset[1];   y2 += DrawingOffset[1];
+
+       xmin = DrawingArea[0];  xmax = DrawingArea[2];
+       ymin = DrawingArea[1];  ymax = DrawingArea[3];
+
+       { // reject when the polygon's bounding box clipped to the drawing area is empty
+               int rx0 = Max2(xmin,Min3(x0,x1,x2));
+               int ry0 = Max2(ymin,Min3(y0,y1,y2));
+               int rx1 = Min2(xmax,Max3(x0,x1,x2));
+               int ry1 = Min2(ymax,Max3(y0,y1,y2));
+               if( rx0>=rx1 || ry0>=ry1) return;
+       }
+       
+       PixelData = GPU_RGB16(PacketBuffer.U4[0]); // fill colour, stored in a variable declared outside this function
+
+       if (y0 >= y1) // three compare-and-swap steps order the vertices by y, ties broken by x
+       {
+               if( y0!=y1 || x0>x1 )
+               {
+                       GPU_SWAP(x0, x1, temp);
+                       GPU_SWAP(y0, y1, temp);
+               }
+       }
+       if (y1 >= y2)
+       {
+               if( y1!=y2 || x1>x2 )
+               {
+                       GPU_SWAP(x1, x2, temp);
+                       GPU_SWAP(y1, y2, temp);
+               }
+       }
+       if (y0 >= y1)
+       {
+               if( y0!=y1 || x0>x1 )
+               {
+                       GPU_SWAP(x0, x1, temp);
+                       GPU_SWAP(y0, y1, temp);
+               }
+       }
+
+       ya = y2 - y0;
+       yb = y2 - y1;
+       dx =(x2 - x1) * ya - (x2 - x0) * yb; // sign decides which edge feeds x3 (span start) and which feeds x4 (span end)
+
+       for (s32 loop0 = 2; loop0; --loop0) // two passes: loop0==2 covers y0..y1, loop0==1 covers y1..y2
+       {
+               if (loop0 == 2)
+               {
+                       ya = y0;
+                       yb = y1;
+                       x3 = i2x(x0);
+                       x4 = y0!=y1 ? x3 : i2x(x1); // when y0==y1 the end edge starts at x1 instead of x0
+                       if (dx < 0)
+                       {
+                               dx3 = xLoDivx((x2 - x0), (y2 - y0)); // start edge follows v0->v2
+                               dx4 = xLoDivx((x1 - x0), (y1 - y0)); // end edge follows v0->v1
+                       }
+                       else
+                       {
+                               dx3 = xLoDivx((x1 - x0), (y1 - y0)); // start edge follows v0->v1
+                               dx4 = xLoDivx((x2 - x0), (y2 - y0)); // end edge follows v0->v2
+                       }
+               }
+               else
+               {
+                       ya = y1;
+                       yb = y2;
+                       if (dx < 0)
+                       {
+                               x4  = i2x(x1);
+                               x3  = i2x(x0) + (dx3 * (y1 - y0)); // recompute the long edge at y1 from its origin
+                               dx4 = xLoDivx((x2 - x1), (y2 - y1)); // end edge switches to v1->v2
+                       }
+                       else
+                       {
+                               x3  = i2x(x1);
+                               x4  = i2x(x0) + (dx4 * (y1 - y0)); // recompute the long edge at y1 from its origin
+                               dx3 = xLoDivx((x2 - x1), (y2 - y1)); // start edge switches to v1->v2
+                       }
+               }
+
+               temp = ymin - ya;
+               if (temp > 0) // half starts above the clip area: jump both edges forward to ymin
+               {
+                       ya  = ymin;
+                       x3 += dx3*temp;
+                       x4 += dx4*temp;
+               }
+               if (yb > ymax) yb = ymax;
+               if (ya>=yb) continue; // nothing of this half is inside the vertical clip range
+
+               x3+= fixed_HALF; // half-unit bias applied before converting with x2i()
+               x4+= fixed_HALF;
+
+               u16* PixelBase  = &((u16*)GPU_FrameBuffer)[FRAME_OFFSET(0, ya)]; // start of scanline ya in the frame buffer
+               
+               for(;ya<yb;++ya, PixelBase += FRAME_WIDTH, x3+=dx3, x4+=dx4)
+               {
+                       if (ya&li) continue; // interlace skip tests
+                       if ((ya&pi)==pif) continue;
+                       xa = x2i(x3);
+                       xb = x2i(x4);
+                       if( (xa>xmax) || (xb<xmin) ) continue; // span entirely outside the horizontal clip range
+                       if(xa < xmin) xa = xmin;
+                       if(xb > xmax) xb = xmax;
+                       xb-=xa; // xb now holds the span width
+                       if(xb>0) gpuPolySpanDriver(PixelBase + xa,xb);
+               }
+       }
+}
+
+/*----------------------------------------------------------------------
+FT3
+----------------------------------------------------------------------*/
+
+void gpuDrawFT3(const PP gpuPolySpanDriver)
+{
+       const int li=linesInterlace;
+       const int pi=(progressInterlace?(linesInterlace+1):0);
+       const int pif=(progressInterlace?(progressInterlace_flag?(linesInterlace+1):0):1);
+       s32 temp;
+       s32 xa, xb, xmin, xmax;
+       s32 ya, yb, ymin, ymax;
+       s32 x0, x1, x2, x3, dx3=0, x4, dx4=0, dx;
+       s32 y0, y1, y2;
+       s32 u0, u1, u2, u3, du3=0;
+       s32 v0, v1, v2, v3, dv3=0;
+
+       x0 = GPU_EXPANDSIGN(PacketBuffer.S2[2] );
+       y0 = GPU_EXPANDSIGN(PacketBuffer.S2[3] );
+       x1 = GPU_EXPANDSIGN(PacketBuffer.S2[6] );
+       y1 = GPU_EXPANDSIGN(PacketBuffer.S2[7] );
+       x2 = GPU_EXPANDSIGN(PacketBuffer.S2[10]);
+       y2 = GPU_EXPANDSIGN(PacketBuffer.S2[11]);
+
+       GPU_TESTRANGE3();
+       
+       x0 += DrawingOffset[0];   x1 += DrawingOffset[0];   x2 += DrawingOffset[0];
+       y0 += DrawingOffset[1];   y1 += DrawingOffset[1];   y2 += DrawingOffset[1];
+
+       xmin = DrawingArea[0];  xmax = DrawingArea[2];
+       ymin = DrawingArea[1];  ymax = DrawingArea[3];
+
+       {
+               int rx0 = Max2(xmin,Min3(x0,x1,x2));
+               int ry0 = Max2(ymin,Min3(y0,y1,y2));
+               int rx1 = Min2(xmax,Max3(x0,x1,x2));
+               int ry1 = Min2(ymax,Max3(y0,y1,y2));
+               if( rx0>=rx1 || ry0>=ry1) return;
+       }
+       
+       u0 = PacketBuffer.U1[8];  v0 = PacketBuffer.U1[9];
+       u1 = PacketBuffer.U1[16]; v1 = PacketBuffer.U1[17];
+       u2 = PacketBuffer.U1[24]; v2 = PacketBuffer.U1[25];
+
+       r4 = s32(PacketBuffer.U1[0]);
+       g4 = s32(PacketBuffer.U1[1]);
+       b4 = s32(PacketBuffer.U1[2]);
+       dr4 = dg4 = db4 = 0;
+
+       if (y0 >= y1)
+       {
+               if( y0!=y1 || x0>x1 )
+               {
+                       GPU_SWAP(x0, x1, temp);
+                       GPU_SWAP(y0, y1, temp);
+                       GPU_SWAP(u0, u1, temp);
+                       GPU_SWAP(v0, v1, temp);
+               }
+       }
+       if (y1 >= y2)
+       {
+               if( y1!=y2 || x1>x2 )
+               {
+                       GPU_SWAP(x1, x2, temp);
+                       GPU_SWAP(y1, y2, temp);
+                       GPU_SWAP(u1, u2, temp);
+                       GPU_SWAP(v1, v2, temp);
+               }
+       }
+       if (y0 >= y1)
+       {
+               if( y0!=y1 || x0>x1 )
+               {
+                       GPU_SWAP(x0, x1, temp);
+                       GPU_SWAP(y0, y1, temp);
+                       GPU_SWAP(u0, u1, temp);
+                       GPU_SWAP(v0, v1, temp);
+               }
+       }
+
+       ya  = y2 - y0;
+       yb  = y2 - y1;
+       dx  = (x2 - x1) * ya - (x2 - x0) * yb;
+       du4 = (u2 - u1) * ya - (u2 - u0) * yb;
+       dv4 = (v2 - v1) * ya - (v2 - v0) * yb;
+
+       s32 iF,iS;
+       xInv( dx, iF, iS);
+       du4 = xInvMulx( du4, iF, iS);
+       dv4 = xInvMulx( dv4, iF, iS);
+       tInc = ((u32)(du4<<7)&0x7fff0000) | ((u32)(dv4>>9)&0x00007fff);
+       tMsk = (TextureWindow[2]<<23) | (TextureWindow[3]<<7) | 0x00ff00ff;
+
+       for (s32 loop0 = 2; loop0; --loop0)
+       {
+               if (loop0 == 2)
+               {
+                       ya = y0;
+                       yb = y1;
+                       u3 = i2x(u0);
+                       v3 = i2x(v0);
+                       x3 = i2x(x0);
+                       x4 = y0!=y1 ? x3 : i2x(x1);
+                       if (dx < 0)
+                       {
+                               xInv( (y2 - y0), iF, iS);
+                               dx3 = xInvMulx( (x2 - x0), iF, iS);
+                               du3 = xInvMulx( (u2 - u0), iF, iS);
+                               dv3 = xInvMulx( (v2 - v0), iF, iS);
+                               dx4 = xLoDivx ( (x1 - x0), (y1 - y0));
+                       }
+                       else
+                       {
+                               xInv( (y1 - y0), iF, iS);
+                               dx3 = xInvMulx( (x1 - x0), iF, iS);
+                               du3 = xInvMulx( (u1 - u0), iF, iS);
+                               dv3 = xInvMulx( (v1 - v0), iF, iS);
+                               dx4 = xLoDivx ( (x2 - x0), (y2 - y0));
+                       }
+               }
+               else
+               {
+                       ya = y1;
+                       yb = y2;
+                       if (dx < 0)
+                       {
+                               temp = y1 - y0;
+                               u3 = i2x(u0) + (du3 * temp);
+                               v3 = i2x(v0) + (dv3 * temp);
+                               x3 = i2x(x0) + (dx3 * temp);
+                               x4 = i2x(x1);
+                               dx4 = xLoDivx((x2 - x1), (y2 - y1));
+                       }
+                       else
+                       {
+                               u3 = i2x(u1);
+                               v3 = i2x(v1);
+                               x3 = i2x(x1);
+                               x4 = i2x(x0) + (dx4 * (y1 - y0));
+                               xInv( (y2 - y1), iF, iS);
+                               dx3 = xInvMulx( (x2 - x1), iF, iS);
+                               du3 = xInvMulx( (u2 - u1), iF, iS);
+                               dv3 = xInvMulx( (v2 - v1), iF, iS);
+                       }
+               }
+
+               temp = ymin - ya;
+               if (temp > 0)
+               {
+                       ya  = ymin;
+                       x3 += dx3*temp;
+                       x4 += dx4*temp;
+                       u3 += du3*temp;
+                       v3 += dv3*temp;
+               }
+               if (yb > ymax) yb = ymax;
+               if (ya>=yb) continue;
+
+               x3+= fixed_HALF;
+               x4+= fixed_HALF;
+               u3+= fixed_HALF;
+               v4+= fixed_HALF;
+
+               u16* PixelBase  = &((u16*)GPU_FrameBuffer)[FRAME_OFFSET(0, ya)];
+
+               for(;ya<yb;++ya, PixelBase += FRAME_WIDTH, x3+=dx3, x4+=dx4, u3+=du3, v3+=dv3)
+               {
+                       if (ya&li) continue;
+                       if ((ya&pi)==pif) continue;
+                       xa = x2i(x3);
+                       xb = x2i(x4);
+                       if( (xa>xmax) || (xb<xmin) ) continue;
+
+                       temp = xmin - xa;
+                       if(temp > 0)
+                       {
+                               xa  = xmin;
+                               u4 = u3 + du4*temp;
+                               v4 = v3 + dv4*temp;
+                       }
+                       else
+                       {
+                               u4 = u3;
+                               v4 = v3;
+                       }
+                       if(xb > xmax) xb = xmax;
+                       xb-=xa;
+                       if(xb>0) gpuPolySpanDriver(PixelBase + xa,xb);
+               }
+       }
+}
+
+/*----------------------------------------------------------------------
+G3
+Three-vertex polygon rasteriser with per-vertex colour interpolation
+(disabled original Unai routine kept for reference). Vertex positions
+come from PacketBuffer.S2[2,3], [6,7], [10,11] and colours from
+PacketBuffer.U1[0..2], [8..10], [16..18]. Per-span inputs for
+gpuPolySpanDriver are published through variables declared outside
+this function (r4, g4, b4, dr4, dg4, db4, lInc).
+Returns early, drawing nothing, when the range guard or the clip-rect
+test rejects the polygon.
+----------------------------------------------------------------------*/
+
+void gpuDrawG3(const PP gpuPolySpanDriver)
+{
+       const int li=linesInterlace; // scanlines with (y & li) != 0 are skipped below
+       const int pi=(progressInterlace?(linesInterlace+1):0); // mask for the second skip test
+       const int pif=(progressInterlace?(progressInterlace_flag?(linesInterlace+1):0):1); // scanlines with (y & pi) == pif are skipped
+       s32 temp;
+       s32 xa, xb, xmin, xmax; // xa/xb: integer span start/end; xmin/xmax: horizontal clip bounds
+       s32 ya, yb, ymin, ymax; // ya/yb: scanline range of the current half; ymin/ymax: vertical clip bounds
+       s32 x0, x1, x2, x3, dx3=0, x4, dx4=0, dx; // x3/x4: fixed-point edge positions, dx3/dx4: their per-scanline steps
+       s32 y0, y1, y2;
+       s32 r0, r1, r2, r3, dr3=0; // r3/dr3: red along the span-start edge and its per-scanline step
+       s32 g0, g1, g2, g3, dg3=0; // same for green
+       s32 b0, b1, b2, b3, db3=0; // same for blue
+
+       x0 = GPU_EXPANDSIGN(PacketBuffer.S2[2] );
+       y0 = GPU_EXPANDSIGN(PacketBuffer.S2[3] );
+       x1 = GPU_EXPANDSIGN(PacketBuffer.S2[6] );
+       y1 = GPU_EXPANDSIGN(PacketBuffer.S2[7] );
+       x2 = GPU_EXPANDSIGN(PacketBuffer.S2[10]);
+       y2 = GPU_EXPANDSIGN(PacketBuffer.S2[11]);
+
+       GPU_TESTRANGE3(); // may return from this function
+       
+       x0 += DrawingOffset[0];   x1 += DrawingOffset[0];   x2 += DrawingOffset[0];
+       y0 += DrawingOffset[1];   y1 += DrawingOffset[1];   y2 += DrawingOffset[1];
+
+       xmin = DrawingArea[0];  xmax = DrawingArea[2];
+       ymin = DrawingArea[1];  ymax = DrawingArea[3];
+
+       { // reject when the polygon's bounding box clipped to the drawing area is empty
+               int rx0 = Max2(xmin,Min3(x0,x1,x2));
+               int ry0 = Max2(ymin,Min3(y0,y1,y2));
+               int rx1 = Min2(xmax,Max3(x0,x1,x2));
+               int ry1 = Min2(ymax,Max3(y0,y1,y2));
+               if( rx0>=rx1 || ry0>=ry1) return;
+       }
+       
+       r0 = PacketBuffer.U1[0];        g0 = PacketBuffer.U1[1];        b0 = PacketBuffer.U1[2]; // colour of vertex 0
+       r1 = PacketBuffer.U1[8];        g1 = PacketBuffer.U1[9];        b1 = PacketBuffer.U1[10]; // colour of vertex 1
+       r2 = PacketBuffer.U1[16];       g2 = PacketBuffer.U1[17];       b2 = PacketBuffer.U1[18]; // colour of vertex 2
+
+       if (y0 >= y1) // three compare-and-swap steps order the vertices by y, ties broken by x; colours travel with their vertex
+       {
+               if( y0!=y1 || x0>x1 )
+               {
+                       GPU_SWAP(x0, x1, temp);         GPU_SWAP(y0, y1, temp);
+                       GPU_SWAP(r0, r1, temp);         GPU_SWAP(g0, g1, temp);         GPU_SWAP(b0, b1, temp);
+               }
+       }
+       if (y1 >= y2)
+       {
+               if( y1!=y2 || x1>x2 )
+               {
+                       GPU_SWAP(x1, x2, temp);         GPU_SWAP(y1, y2, temp);
+                       GPU_SWAP(r1, r2, temp);         GPU_SWAP(g1, g2, temp);   GPU_SWAP(b1, b2, temp);
+               }
+       }
+       if (y0 >= y1)
+       {
+               if( y0!=y1 || x0>x1 )
+               {
+                       GPU_SWAP(x0, x1, temp);         GPU_SWAP(y0, y1, temp);
+                       GPU_SWAP(r0, r1, temp);   GPU_SWAP(g0, g1, temp);               GPU_SWAP(b0, b1, temp);
+               }
+       }
+
+       ya  = y2 - y0;
+       yb  = y2 - y1;
+       dx  = (x2 - x1) * ya - (x2 - x0) * yb; // sign decides which edge feeds x3 (span start) and which feeds x4 (span end)
+       dr4 = (r2 - r1) * ya - (r2 - r0) * yb; // same expression per colour channel; scaled by dx below
+       dg4 = (g2 - g1) * ya - (g2 - g0) * yb;
+       db4 = (b2 - b1) * ya - (b2 - b0) * yb;
+
+       s32 iF,iS;
+       xInv(            dx, iF, iS); // presumably prepares a reciprocal of dx in iF/iS -- helper not visible here
+       dr4 = xInvMulx( dr4, iF, iS); // per-pixel colour steps along a span
+       dg4 = xInvMulx( dg4, iF, iS);
+       db4 = xInvMulx( db4, iF, iS);
+       u32 dr = (u32)(dr4<< 8)&(0xffffffff<<21);   if(dr4<0) dr+= 1<<21; // red step placed in bits 21 and up; +1 unit when negative
+       u32 dg = (u32)(dg4>> 3)&(0xffffffff<<10);   if(dg4<0) dg+= 1<<10; // green step placed in bits 10 and up
+       u32 db = (u32)(db4>>14)&(0xffffffff    );   if(db4<0) db+= 1<< 0; // blue step in the low bits
+       lInc = db + dg + dr; // three channel steps combined into one word
+
+       for (s32 loop0 = 2; loop0; --loop0) // two passes: loop0==2 covers y0..y1, loop0==1 covers y1..y2
+       {
+               if (loop0 == 2)
+               {
+                       ya = y0;
+                       yb = y1;
+                       r3 = i2x(r0);
+                       g3 = i2x(g0);
+                       b3 = i2x(b0);
+                       x3 = i2x(x0);
+                       x4 = y0!=y1 ? x3 : i2x(x1); // when y0==y1 the end edge starts at x1 instead of x0
+                       if (dx < 0) // span-start edge follows v0->v2, span-end edge follows v0->v1
+                       {
+                               xInv(           (y2 - y0), iF, iS);
+                               dx3 = xInvMulx( (x2 - x0), iF, iS);
+                               dr3 = xInvMulx( (r2 - r0), iF, iS);
+                               dg3 = xInvMulx( (g2 - g0), iF, iS);
+                               db3 = xInvMulx( (b2 - b0), iF, iS);
+                               dx4 = xLoDivx ( (x1 - x0), (y1 - y0));
+                       }
+                       else // span-start edge follows v0->v1, span-end edge follows v0->v2
+                       {
+                               xInv(           (y1 - y0), iF, iS);
+                               dx3 = xInvMulx( (x1 - x0), iF, iS);
+                               dr3 = xInvMulx( (r1 - r0), iF, iS);
+                               dg3 = xInvMulx( (g1 - g0), iF, iS);
+                               db3 = xInvMulx( (b1 - b0), iF, iS);
+                               dx4 = xLoDivx ( (x2 - x0), (y2 - y0));
+                       }
+               }
+               else
+               {
+                       ya = y1;
+                       yb = y2;
+                       if (dx < 0) // long edge stays on the start side: recompute it at y1 from its origin
+                       {
+                               temp = y1 - y0;
+                               r3  = i2x(r0) + (dr3 * temp);
+                               g3  = i2x(g0) + (dg3 * temp);
+                               b3  = i2x(b0) + (db3 * temp);
+                               x3  = i2x(x0) + (dx3 * temp);
+                               x4  = i2x(x1);
+                               dx4 = xLoDivx((x2 - x1), (y2 - y1));
+                       }
+                       else // start side switches to the v1->v2 edge
+                       {
+                               r3 = i2x(r1);
+                               g3 = i2x(g1);
+                               b3 = i2x(b1);
+                               x3 = i2x(x1);
+                               x4 = i2x(x0) + (dx4 * (y1 - y0));
+
+                               xInv(           (y2 - y1), iF, iS);
+                               dx3 = xInvMulx( (x2 - x1), iF, iS);
+                               dr3 = xInvMulx( (r2 - r1), iF, iS);
+                               dg3 = xInvMulx( (g2 - g1), iF, iS);
+                               db3 = xInvMulx( (b2 - b1), iF, iS);
+                       }
+               }
+
+               temp = ymin - ya;
+               if (temp > 0) // half starts above the clip area: jump all edge values forward to ymin
+               {
+                       ya  = ymin;
+                       x3 += dx3*temp;   x4 += dx4*temp;
+                       r3 += dr3*temp;   g3 += dg3*temp;   b3 += db3*temp;
+               }
+               if (yb > ymax) yb = ymax;
+               if (ya>=yb) continue; // nothing of this half is inside the vertical clip range
+
+               x3+= fixed_HALF;  x4+= fixed_HALF; // half-unit bias on every edge accumulator
+               r3+= fixed_HALF;  g3+= fixed_HALF;  b3+= fixed_HALF;
+
+               u16* PixelBase  = &((u16*)GPU_FrameBuffer)[FRAME_OFFSET(0, ya)]; // start of scanline ya in the frame buffer
+               
+               for(;ya<yb;++ya, PixelBase += FRAME_WIDTH, x3+=dx3, x4+=dx4, r3+=dr3, g3+=dg3, b3+=db3)
+               {
+                       if (ya&li) continue; // interlace skip tests
+                       if ((ya&pi)==pif) continue;
+                       xa = x2i(x3);
+                       xb = x2i(x4);
+                       if( (xa>xmax) || (xb<xmin) ) continue; // span entirely outside the horizontal clip range
+
+                       temp = xmin - xa;
+                       if(temp > 0) // span starts left of the clip area: advance the colour by the clipped pixels
+                       {
+                               xa  = xmin;
+                               r4 = r3 + dr4*temp;   g4 = g3 + dg4*temp;   b4 = b3 + db4*temp;
+                       }
+                       else
+                       {
+                               r4 = r3;  g4 = g3;  b4 = b3;
+                       }
+                       if(xb > xmax) xb = xmax;
+                       xb-=xa; // xb now holds the span width
+                       if(xb>0) gpuPolySpanDriver(PixelBase + xa,xb);
+               }
+       }
+}
+
+/*----------------------------------------------------------------------
+GT3
+----------------------------------------------------------------------*/
+
+void gpuDrawGT3(const PP gpuPolySpanDriver)
+{
+       const int li=linesInterlace;
+       const int pi=(progressInterlace?(linesInterlace+1):0);
+       const int pif=(progressInterlace?(progressInterlace_flag?(linesInterlace+1):0):1);
+       s32 temp;
+       s32 xa, xb, xmin, xmax;
+       s32 ya, yb, ymin, ymax;
+       s32 x0, x1, x2, x3, dx3=0, x4, dx4=0, dx;
+       s32 y0, y1, y2;
+       s32 u0, u1, u2, u3, du3=0;
+       s32 v0, v1, v2, v3, dv3=0;
+       s32 r0, r1, r2, r3, dr3=0;
+       s32 g0, g1, g2, g3, dg3=0;
+       s32 b0, b1, b2, b3, db3=0;
+
+       x0 = GPU_EXPANDSIGN(PacketBuffer.S2[2] );
+       y0 = GPU_EXPANDSIGN(PacketBuffer.S2[3] );
+       x1 = GPU_EXPANDSIGN(PacketBuffer.S2[8] );
+       y1 = GPU_EXPANDSIGN(PacketBuffer.S2[9] );
+       x2 = GPU_EXPANDSIGN(PacketBuffer.S2[14]);
+       y2 = GPU_EXPANDSIGN(PacketBuffer.S2[15]);
+
+       GPU_TESTRANGE3();
+       
+       x0 += DrawingOffset[0];   x1 += DrawingOffset[0];   x2 += DrawingOffset[0];
+       y0 += DrawingOffset[1];   y1 += DrawingOffset[1];   y2 += DrawingOffset[1];
+
+       xmin = DrawingArea[0];  xmax = DrawingArea[2];
+       ymin = DrawingArea[1];  ymax = DrawingArea[3];
+
+       {
+               int rx0 = Max2(xmin,Min3(x0,x1,x2));
+               int ry0 = Max2(ymin,Min3(y0,y1,y2));
+               int rx1 = Min2(xmax,Max3(x0,x1,x2));
+               int ry1 = Min2(ymax,Max3(y0,y1,y2));
+               if( rx0>=rx1 || ry0>=ry1) return;
+       }
+
+       r0 = PacketBuffer.U1[0];        g0 = PacketBuffer.U1[1];        b0 = PacketBuffer.U1[2];
+       u0 = PacketBuffer.U1[8];        v0 = PacketBuffer.U1[9];
+       r1 = PacketBuffer.U1[12];       g1 = PacketBuffer.U1[13];       b1 = PacketBuffer.U1[14];
+       u1 = PacketBuffer.U1[20];       v1 = PacketBuffer.U1[21];
+       r2 = PacketBuffer.U1[24];       g2 = PacketBuffer.U1[25];       b2 = PacketBuffer.U1[26];
+       u2 = PacketBuffer.U1[32];       v2 = PacketBuffer.U1[33];
+
+       if (y0 >= y1)
+       {
+               if( y0!=y1 || x0>x1 )
+               {
+                       GPU_SWAP(x0, x1, temp);         GPU_SWAP(y0, y1, temp);
+                       GPU_SWAP(u0, u1, temp);         GPU_SWAP(v0, v1, temp);
+                       GPU_SWAP(r0, r1, temp);         GPU_SWAP(g0, g1, temp);   GPU_SWAP(b0, b1, temp);
+               }
+       }
+       if (y1 >= y2)
+       {
+               if( y1!=y2 || x1>x2 )
+               {
+                       GPU_SWAP(x1, x2, temp);         GPU_SWAP(y1, y2, temp);
+                       GPU_SWAP(u1, u2, temp);         GPU_SWAP(v1, v2, temp);
+                       GPU_SWAP(r1, r2, temp);   GPU_SWAP(g1, g2, temp);               GPU_SWAP(b1, b2, temp);
+               }
+       }
+       if (y0 >= y1)
+       {
+               if( y0!=y1 || x0>x1 )
+               {
+                       GPU_SWAP(x0, x1, temp);         GPU_SWAP(y0, y1, temp);
+                       GPU_SWAP(u0, u1, temp);         GPU_SWAP(v0, v1, temp);
+                       GPU_SWAP(r0, r1, temp);         GPU_SWAP(g0, g1, temp);         GPU_SWAP(b0, b1, temp);
+               }
+       }
+
+       ya  = y2 - y0;
+       yb  = y2 - y1;
+       dx  = (x2 - x1) * ya - (x2 - x0) * yb;
+       du4 = (u2 - u1) * ya - (u2 - u0) * yb;
+       dv4 = (v2 - v1) * ya - (v2 - v0) * yb;
+       dr4 = (r2 - r1) * ya - (r2 - r0) * yb;
+       dg4 = (g2 - g1) * ya - (g2 - g0) * yb;
+       db4 = (b2 - b1) * ya - (b2 - b0) * yb;
+
+       s32 iF,iS;
+
+       xInv(            dx, iF, iS);
+       du4 = xInvMulx( du4, iF, iS);
+       dv4 = xInvMulx( dv4, iF, iS);
+       dr4 = xInvMulx( dr4, iF, iS);
+       dg4 = xInvMulx( dg4, iF, iS);
+       db4 = xInvMulx( db4, iF, iS);
+       u32 dr = (u32)(dr4<< 8)&(0xffffffff<<21);   if(dr4<0) dr+= 1<<21;
+       u32 dg = (u32)(dg4>> 3)&(0xffffffff<<10);   if(dg4<0) dg+= 1<<10;
+       u32 db = (u32)(db4>>14)&(0xffffffff    );   if(db4<0) db+= 1<< 0;
+       lInc = db + dg + dr;
+       tInc = ((u32)(du4<<7)&0x7fff0000) | ((u32)(dv4>>9)&0x00007fff);
+       tMsk = (TextureWindow[2]<<23) | (TextureWindow[3]<<7) | 0x00ff00ff;
+
+       for (s32 loop0 = 2; loop0; --loop0)
+       {
+               if (loop0 == 2)
+               {
+                       ya = y0;
+                       yb = y1;
+                       u3 = i2x(u0);
+                       v3 = i2x(v0);
+                       r3 = i2x(r0);
+                       g3 = i2x(g0);
+                       b3 = i2x(b0);
+                       x3 = i2x(x0);
+                       x4 = y0!=y1 ? x3 : i2x(x1);
+                       if (dx < 0)
+                       {
+                               xInv(           (y2 - y0), iF, iS);
+                               dx3 = xInvMulx( (x2 - x0), iF, iS);
+                               du3 = xInvMulx( (u2 - u0), iF, iS);
+                               dv3 = xInvMulx( (v2 - v0), iF, iS);
+                               dr3 = xInvMulx( (r2 - r0), iF, iS);
+                               dg3 = xInvMulx( (g2 - g0), iF, iS);
+                               db3 = xInvMulx( (b2 - b0), iF, iS);
+                               dx4 = xLoDivx ( (x1 - x0), (y1 - y0));
+                       }
+                       else
+                       {
+                               xInv(           (y1 - y0), iF, iS);
+                               dx3 = xInvMulx( (x1 - x0), iF, iS);
+                               du3 = xInvMulx( (u1 - u0), iF, iS);
+                               dv3 = xInvMulx( (v1 - v0), iF, iS);
+                               dr3 = xInvMulx( (r1 - r0), iF, iS);
+                               dg3 = xInvMulx( (g1 - g0), iF, iS);
+                               db3 = xInvMulx( (b1 - b0), iF, iS);
+                               dx4 = xLoDivx ( (x2 - x0), (y2 - y0));
+                       }
+               }
+               else
+               {
+                       ya = y1;
+                       yb = y2;
+                       if (dx < 0)
+                       {
+                               temp = y1 - y0;
+                               u3  = i2x(u0) + (du3 * temp);
+                               v3  = i2x(v0) + (dv3 * temp);
+                               r3  = i2x(r0) + (dr3 * temp);
+                               g3  = i2x(g0) + (dg3 * temp);
+                               b3  = i2x(b0) + (db3 * temp);
+                               x3  = i2x(x0) + (dx3 * temp);
+                               x4  = i2x(x1);
+                               dx4 = xLoDivx((x2 - x1), (y2 - y1));
+                       }
+                       else
+                       {
+                               u3 = i2x(u1);
+                               v3 = i2x(v1);
+                               r3 = i2x(r1);
+                               g3 = i2x(g1);
+                               b3 = i2x(b1);
+                               x3 = i2x(x1);
+                               x4 = i2x(x0) + (dx4 * (y1 - y0));
+
+                               xInv(           (y2 - y1), iF, iS);
+                               dx3 = xInvMulx( (x2 - x1), iF, iS);
+                               du3 = xInvMulx( (u2 - u1), iF, iS);
+                               dv3 = xInvMulx( (v2 - v1), iF, iS);
+                               dr3 = xInvMulx( (r2 - r1), iF, iS);
+                               dg3 = xInvMulx( (g2 - g1), iF, iS);
+                               db3 = xInvMulx( (b2 - b1), iF, iS);
+                       }
+               }
+
+               temp = ymin - ya;
+               if (temp > 0)
+               {
+                       ya  = ymin;
+                       x3 += dx3*temp;   x4 += dx4*temp;
+                       u3 += du3*temp;   v3 += dv3*temp;
+                       r3 += dr3*temp;   g3 += dg3*temp;   b3 += db3*temp;
+               }
+               if (yb > ymax) yb = ymax;
+               if (ya>=yb) continue;
+
+               x3+= fixed_HALF;  x4+= fixed_HALF;
+               u3+= fixed_HALF;  v4+= fixed_HALF;
+               r3+= fixed_HALF;  g3+= fixed_HALF;  b3+= fixed_HALF;
+               u16* PixelBase  = &((u16*)GPU_FrameBuffer)[FRAME_OFFSET(0, ya)];
+               
+               for(;ya<yb;++ya, PixelBase += FRAME_WIDTH, x3+=dx3, x4+=dx4, u3+=du3, v3+=dv3, r3+=dr3, g3+=dg3,        b3+=db3)
+               {
+                       if (ya&li) continue;
+                       if ((ya&pi)==pif) continue;
+                       xa = x2i(x3);
+                       xb = x2i(x4);
+                       if( (xa>xmax) || (xb<xmin))     continue;
+
+                       temp = xmin - xa;
+                       if(temp > 0)
+                       {
+                               xa  = xmin;
+                               u4 = u3 + du4*temp;   v4 = v3 + dv4*temp;
+                               r4 = r3 + dr4*temp;   g4 = g3 + dg4*temp;   b4 = b3 + db4*temp;
+                       }
+                       else
+                       {
+                               u4 = u3;  v4 = v3;
+                               r4 = r3;  g4 = g3;  b4 = b3;
+                       }
+                       if(xb > xmax) xb = xmax;
+                       xb-=xa;
+                       if(xb>0) gpuPolySpanDriver(PixelBase + xa,xb);
+               }
+       }
+}
+
+
+//////////////////////////////////////////////////////////////////////////
+//senquack - Original Unai poly routines left here for reference:
+// ( from gpu_inner.h ) NOTE: this uses 16.16, not 22.10 fixed point
+//////////////////////////////////////////////////////////////////////////
+template<const int CF> /* Reference span renderer: writes 16-bit pixels to
+ * pDst[0 .. count-1]. Every loop below is do { } while (--count), so the
+ * caller must pass count >= 1.
+ *
+ * The rendering mode is chosen by TM, G, L, M, B, MB, BM and BI, none of
+ * which is defined in this block - presumably compile-time flags decoded
+ * from CF (TODO confirm against their definitions).
+ *
+ * Also reads, from outside this function: u4/v4/r4/g4/b4 (attribute values
+ * at the first pixel), lInc/tInc/tMsk (packed increments and texture mask),
+ * PixelData, TBA/CBA and blit_mask.
+ */
+INLINE void  gpuPolySpanFn(u16 *pDst, u32 count)
+{
+       if (!TM)
+       {       
+               // NO TEXTURE
+               if (!G)
+               {
+                       // NO GOURAUD: one colour for the whole span
+                       u16 data;
+                       if (L) { u32 lCol=((u32)(b4<< 2)&(0x03ff)) | ((u32)(g4<<13)&(0x07ff<<10)) | ((u32)(r4<<24)&(0x07ff<<21)); gpuLightingRGB(data,lCol); }
+                       else data=PixelData;
+                       if ((!M)&&(!B))
+                       {
+                               // plain fill; MB sets bit 15 on every written pixel
+                               if (MB) { data = data | 0x8000; }
+                               do { *pDst++ = data; } while (--count);
+                       }
+                       else if ((M)&&(!B))
+                       {
+                               // fill, but leave destination pixels that have bit 15 set
+                               if (MB) { data = data | 0x8000; }
+                               do { if (!(*pDst&0x8000)) { *pDst = data; } pDst++; } while (--count);
+                       }
+                       else
+                       {
+                               u16 uSrc;
+                               u16 uDst;
+                               u32 uMsk; if (BM==0) uMsk=0x7BDE; // not referenced by name below; presumably used inside gpuBlending00 - confirm
+                               u32 bMsk; if (BI) bMsk=blit_mask;
+                               do
+                               {
+                                       // blit-mask: skip the pixel when bit ((address>>1)&7) of bMsk is set
+                                       // NOTE(review): (u32)pDst truncates the pointer on 64-bit targets - confirm this is 32-bit-only code
+                                       if (BI) { if((bMsk>>((((u32)pDst)>>1)&7))&1) goto endtile; }
+                                       //  masking: skip destination pixels that have bit 15 set
+                                       uDst = *pDst;
+                                       if(M) { if (uDst&0x8000) goto endtile;  }
+                                       uSrc = data;
+                                       //  blend: exactly one of the four modes applies, selected by BM
+                                       if (BM==0) gpuBlending00(uSrc, uDst);
+                                       if (BM==1) gpuBlending01(uSrc, uDst);
+                                       if (BM==2) gpuBlending02(uSrc, uDst);
+                                       if (BM==3) gpuBlending03(uSrc, uDst);
+                                       if (MB) { *pDst = uSrc | 0x8000; }
+                                       else    { *pDst = uSrc; }
+                                       endtile: pDst++;
+                               }
+                               while (--count);
+                       }
+               }
+               else
+               {
+                       // GOURAUD: colour is stepped by lInc for every pixel
+                       u16 uDst;
+                       u16 uSrc;
+                       u32 linc=lInc;
+                       u32 lCol=((u32)(b4>>14)&(0x03ff)) | ((u32)(g4>>3)&(0x07ff<<10)) | ((u32)(r4<<8)&(0x07ff<<21));
+                       u32 uMsk; if ((B)&&(BM==0)) uMsk=0x7BDE; // not referenced by name below; presumably used inside gpuBlending00 - confirm
+                       u32 bMsk; if (BI) bMsk=blit_mask;
+                       do
+                       {
+                               // blit-mask (see NOTE(review) on the (u32)pDst cast above)
+                               if (BI) { if((bMsk>>((((u32)pDst)>>1)&7))&1) goto endgou; }
+                               //  masking: skip destination pixels that have bit 15 set
+                               if(M) { uDst = *pDst;  if (uDst&0x8000) goto endgou;  }
+                               //  blend
+                               if(B)
+                               {
+                                       //  light
+                                       gpuLightingRGB(uSrc,lCol);
+                                       if(!M)    { uDst = *pDst; } // with M set, uDst was already loaded by the masking test
+                                       if (BM==0) gpuBlending00(uSrc, uDst);
+                                       if (BM==1) gpuBlending01(uSrc, uDst);
+                                       if (BM==2) gpuBlending02(uSrc, uDst);
+                                       if (BM==3) gpuBlending03(uSrc, uDst);
+                               }
+                               else
+                               {
+                                       //  light
+                                       gpuLightingRGB(uSrc,lCol);
+                               }
+                               if (MB) { *pDst = uSrc | 0x8000; }
+                               else    { *pDst = uSrc; }
+                               endgou: pDst++; lCol=(lCol+linc); // colour advances even for skipped pixels
+                       }
+                       while (--count);
+               }
+       }
+       else
+       {
+               // TEXTURE
+               u16 uDst;
+               u16 uSrc;
+               u32 linc; if (L&&G) linc=lInc;
+               u32 tinc=tInc;
+               u32 tmsk=tMsk;
+               u32 tCor = ((u32)( u4<<7)&0x7fff0000) | ((u32)( v4>>9)&0x00007fff); tCor&= tmsk; // packed coordinate: u in the high half, v in the low half
+               const u16* _TBA=TBA;
+               const u16* _CBA; if (TM!=3) _CBA=CBA; // _CBA is only read in the TM==1 and TM==2 lookups
+               u32 lCol;
+               if(L && !G) { lCol = ((u32)(b4<< 2)&(0x03ff)) | ((u32)(g4<<13)&(0x07ff<<10)) | ((u32)(r4<<24)&(0x07ff<<21)); }
+               else if(L && G) { lCol = ((u32)(b4>>14)&(0x03ff)) | ((u32)(g4>>3)&(0x07ff<<10)) | ((u32)(r4<<8)&(0x07ff<<21));  }
+               u32 uMsk; if ((B)&&(BM==0)) uMsk=0x7BDE; // not referenced by name below; presumably used inside gpuBlending00 - confirm
+               u32 bMsk; if (BI) bMsk=blit_mask;
+               do
+               {
+                       // blit-mask (see NOTE(review) on the (u32)pDst cast above)
+                       if (BI) { if((bMsk>>((((u32)pDst)>>1)&7))&1) goto endpoly; }
+                       //  masking: skip destination pixels that have bit 15 set
+                       if(M) { uDst = *pDst;  if (uDst&0x8000) goto endpoly;  }
+                       //  texture fetch; a texel value of 0 skips the pixel.
+                       //  TM==1 reads a 4-bit index, TM==2 an 8-bit index (both looked up in _CBA), TM==3 reads a 16-bit texel directly
+                       if (TM==1) { u32 tu=(tCor>>23); u32 tv=(tCor<<4)&(0xff<<11); u8 rgb=((u8*)_TBA)[tv+(tu>>1)]; uSrc=_CBA[(rgb>>((tu&1)<<2))&0xf]; if(!uSrc) goto endpoly; }
+                       if (TM==2) { uSrc = _CBA[(((u8*)_TBA)[(tCor>>23)+((tCor<<4)&(0xff<<11))])]; if(!uSrc)  goto endpoly; }
+                       if (TM==3) { uSrc = _TBA[(tCor>>23)+((tCor<<3)&(0xff<<10))]; if(!uSrc)  goto endpoly; }
+                       //  blend
+                       if(B)
+                       {
+                               if (uSrc&0x8000) // only texels with bit 15 set are blended with the destination
+                               {
+                                       //  light
+                                       if(L) gpuLightingTXT(uSrc, lCol);
+                                       if(!M)    { uDst = *pDst; }
+                                       if (BM==0) gpuBlending00(uSrc, uDst);
+                                       if (BM==1) gpuBlending01(uSrc, uDst);
+                                       if (BM==2) gpuBlending02(uSrc, uDst);
+                                       if (BM==3) gpuBlending03(uSrc, uDst);
+                               }
+                               else
+                               {
+                                       // light
+                                       if(L) gpuLightingTXT(uSrc, lCol);
+                               }
+                       }
+                       else
+                       {
+                               //  light; without lighting and without MB, bit 15 of the texel is cleared
+                               if(L)  { gpuLightingTXT(uSrc, lCol); } else if(!MB) { uSrc&= 0x7fff; }
+                       }
+                       if (MB) { *pDst = uSrc | 0x8000; }
+                       else    { *pDst = uSrc; }
+                       endpoly: pDst++;
+                       tCor=(tCor+tinc)&tmsk; // coordinates (and colour, below) advance even for skipped pixels
+                       if (L&&G) lCol=(lCol+linc);
+               }
+               while (--count);
+       }
+}
diff --git a/plugins/gpu_senquack/debug.h b/plugins/gpu_senquack/debug.h

new file mode 100644 (file)

index 0000000..e69de29
diff --git a/plugins/gpu_senquack/gpu.cpp b/plugins/gpu_senquack/gpu.cpp

new file mode 100644 (file)

index 0000000..5f2929f
--- /dev/null
+++ b/plugins/gpu_senquack/gpu.cpp
@@ -0,0 +1,830 @@
+/***************************************************************************
+*   Copyright (C) 2010 PCSX4ALL Team                                      *
+*   Copyright (C) 2010 Unai                                               *
+*   Copyright (C) 2016 Senquack (dansilsby <AT> gmail <DOT> com)          *
+*                                                                         *
+*   This program is free software; you can redistribute it and/or modify  *
+*   it under the terms of the GNU General Public License as published by  *
+*   the Free Software Foundation; either version 2 of the License, or     *
+*   (at your option) any later version.                                   *
+*                                                                         *
+*   This program is distributed in the hope that it will be useful,       *
+*   but WITHOUT ANY WARRANTY; without even the implied warranty of        *
+*   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the         *
+*   GNU General Public License for more details.                          *
+*                                                                         *
+*   You should have received a copy of the GNU General Public License     *
+*   along with this program; if not, write to the                         *
+*   Free Software Foundation, Inc.,                                       *
+*   51 Franklin Street, Fifth Floor, Boston, MA 02111-1307 USA.           *
+***************************************************************************/
+
+#include <stddef.h>
+#include "plugins.h"
+#include "psxcommon.h"
+//#include "port.h"
+#include "gpu_senquack.h"
+
+#define VIDEO_WIDTH 320
+
+#ifdef TIME_IN_MSEC
+#define TPS 1000
+#else
+#define TPS 1000000
+#endif
+
+#define IS_PAL (gpu_senquack.GPU_GP1&(0x08<<17))
+
+//senquack - Original 512KB of guard space seems not to be enough, as Xenogears
+// accesses outside this range and crashes in town intro fight sequence.
+// Increased to 2MB total (double PSX VRAM) and Xenogears no longer
+// crashes, but some textures are still messed up. Also note that alignment min
+// is 16 bytes, needed for pixel-skipping rendering/blitting in high horiz res.
+// Extra 4KB is for guard room at beginning.
+// TODO: Determine cause of out-of-bounds write/reads. <-- Note: this is largely
+//  solved by adoption of PCSX Rearmed's 'gpulib' in gpulib_if.cpp, which
+//  replaces this file (gpu.cpp)
+//u16   GPU_FrameBuffer[(FRAME_BUFFER_SIZE+512*1024)/2] __attribute__((aligned(32)));
+static u16 GPU_FrameBuffer[(FRAME_BUFFER_SIZE*2 + 4096)/2] __attribute__((aligned(32)));
+
+///////////////////////////////////////////////////////////////////////////////
+// GPU fixed point math
+#include "gpu_fixedpoint.h"
+
+///////////////////////////////////////////////////////////////////////////////
+// Inner loop driver instantiation file
+#include "gpu_inner.h"
+
+///////////////////////////////////////////////////////////////////////////////
+// GPU internal image drawing functions
+#include "gpu_raster_image.h"
+
+///////////////////////////////////////////////////////////////////////////////
+// GPU internal line drawing functions
+#include "gpu_raster_line.h"
+
+///////////////////////////////////////////////////////////////////////////////
+// GPU internal polygon drawing functions
+#include "gpu_raster_polygon.h"
+
+///////////////////////////////////////////////////////////////////////////////
+// GPU internal sprite drawing functions
+#include "gpu_raster_sprite.h"
+
+///////////////////////////////////////////////////////////////////////////////
+// GPU command buffer execution/store
+#include "gpu_command.h"
+
+///////////////////////////////////////////////////////////////////////////////
+static void gpuReset(void)
+{
+       memset((void*)&gpu_senquack, 0, sizeof(gpu_senquack));
+       gpu_senquack.vram = (u16*)GPU_FrameBuffer + (4096/2); //4kb guard room in front
+       gpu_senquack.GPU_GP1 = 0x14802000;
+       gpu_senquack.DrawingArea[2] = 256;
+       gpu_senquack.DrawingArea[3] = 240;
+       gpu_senquack.DisplayArea[2] = 256;
+       gpu_senquack.DisplayArea[3] = 240;
+       gpu_senquack.DisplayArea[5] = 240;
+       gpu_senquack.TextureWindow[0] = 0;
+       gpu_senquack.TextureWindow[1] = 0;
+       gpu_senquack.TextureWindow[2] = 255;
+       gpu_senquack.TextureWindow[3] = 255;
+       //senquack - new vars must be updated whenever texture window is changed:
+       //           (used for polygon-drawing in gpu_inner.h, gpu_raster_polygon.h)
+       const u32 fb = FIXED_BITS;  // # of fractional fixed-pt bits of u4/v4
+       gpu_senquack.u_msk = (((u32)gpu_senquack.TextureWindow[2]) << fb) | ((1 << fb) - 1);
+       gpu_senquack.v_msk = (((u32)gpu_senquack.TextureWindow[3]) << fb) | ((1 << fb) - 1);
+
+       // Configuration options
+       gpu_senquack.config = gpu_senquack_config_ext;
+       gpu_senquack.ilace_mask = gpu_senquack.config.ilace_force;
+       gpu_senquack.frameskip.skipCount = gpu_senquack.config.frameskip_count;
+
+       SetupLightLUT();
+       SetupDitheringConstants();
+}
+
+///////////////////////////////////////////////////////////////////////////////
+// Plugin entry point: resets the GPU state, optionally fills the reciprocal
+// table, marks the framebuffer dirty and forgets the last DMA chain.
+// Always returns 0.
+long GPU_init(void)
+{
+       gpuReset();
+
+#ifdef GPU_UNAI_USE_INT_DIV_MULTINV
+       // s_invTable[k] = 0x7fffffff / (k + 1) for k = 0 .. (1<<TABLE_BITS)-1
+       for (unsigned int slot = 0; slot < (1<<TABLE_BITS); ++slot)
+       {
+               s_invTable[slot] = 0x7fffffff/(slot + 1);
+       }
+#endif
+
+       gpu_senquack.dma.last_dma = NULL;
+       gpu_senquack.fb_dirty = true;
+       return 0;
+}
+
+///////////////////////////////////////////////////////////////////////////////
+// Plugin shutdown hook. Performs no work here and always returns 0.
+long GPU_shutdown(void)
+{
+       return 0L;
+}
+
+///////////////////////////////////////////////////////////////////////////////
+long GPU_freeze(u32 bWrite, GPUFreeze_t* p2)
+{
+       if (!p2) return (0);
+       if (p2->ulFreezeVersion != 1) return (0);
+
+       if (bWrite)
+       {
+               p2->ulStatus = gpu_senquack.GPU_GP1;
+               memset(p2->ulControl, 0, sizeof(p2->ulControl));
+               // save resolution and registers for P.E.Op.S. compatibility
+               p2->ulControl[3] = (3 << 24) | ((gpu_senquack.GPU_GP1 >> 23) & 1);
+               p2->ulControl[4] = (4 << 24) | ((gpu_senquack.GPU_GP1 >> 29) & 3);
+               p2->ulControl[5] = (5 << 24) | (gpu_senquack.DisplayArea[0] | (gpu_senquack.DisplayArea[1] << 10));
+               p2->ulControl[6] = (6 << 24) | (2560 << 12);
+               p2->ulControl[7] = (7 << 24) | (gpu_senquack.DisplayArea[4] | (gpu_senquack.DisplayArea[5] << 10));
+               p2->ulControl[8] = (8 << 24) | ((gpu_senquack.GPU_GP1 >> 17) & 0x3f) | ((gpu_senquack.GPU_GP1 >> 10) & 0x40);
+               memcpy((void*)p2->psxVRam, (void*)gpu_senquack.vram, FRAME_BUFFER_SIZE);
+               return (1);
+       }
+       else
+       {
+               extern void GPU_writeStatus(u32 data);
+               gpu_senquack.GPU_GP1 = p2->ulStatus;
+               memcpy((void*)gpu_senquack.vram, (void*)p2->psxVRam, FRAME_BUFFER_SIZE);
+               GPU_writeStatus((5 << 24) | p2->ulControl[5]);
+               GPU_writeStatus((7 << 24) | p2->ulControl[7]);
+               GPU_writeStatus((8 << 24) | p2->ulControl[8]);
+               gpuSetTexture(gpu_senquack.GPU_GP1);
+               return (1);
+       }
+       return (0);
+}
+
+///////////////////////////////////////////////////////////////////////////////
+//  GPU DMA communication
+
+///////////////////////////////////////////////////////////////////////////////
+u8 PacketSize[256] = /* Indexed by the top byte of a GPU command word (see
+ * gpuCheckPacket: PacketSize[uData >> 24]). Each entry is the number of
+ * further 32-bit words collected after the command word before the packet
+ * is dispatched; 0 means the command word is dispatched on its own.
+ * The trailing comment on each row gives the index range it covers. */
+{
+       0, 0, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, //              0-15
+       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, //              16-31
+       3, 3, 3, 3, 6, 6, 6, 6, 4, 4, 4, 4, 8, 8, 8, 8, //              32-47
+       5, 5, 5, 5, 8, 8, 8, 8, 7, 7, 7, 7, 11, 11, 11, 11,     //      48-63
+       2, 2, 2, 2, 0, 0, 0, 0, 3, 3, 3, 3, 3, 3, 3, 3, //              64-79
+       3, 3, 3, 3, 0, 0, 0, 0, 4, 4, 4, 4, 4, 4, 4, 4, //              80-95
+       2, 2, 2, 2, 3, 3, 3, 3, 1, 1, 1, 1, 2, 2, 2, 2, //              96-111
+       1, 1, 1, 1, 2, 2, 2, 2, 1, 1, 1, 1, 2, 2, 2, 2, //              112-127
+       3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, //              128-143
+       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, //              144-159
+       2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, //              160-175
+       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, //              176-191
+       2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, //              192-207
+       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, //              208-223
+       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, //              224-239
+       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0  //              240-255
+};
+
+///////////////////////////////////////////////////////////////////////////////
+// Dispatches the packet currently held in PacketBuffer, selecting the
+// handler by the top byte of its first word.
+INLINE void gpuSendPacket()
+{
+       const u32 command = gpu_senquack.PacketBuffer.U4[0]>>24;
+       gpuSendPacketFunction(command);
+}
+
+///////////////////////////////////////////////////////////////////////////////
+INLINE void gpuCheckPacket(u32 uData)
+{
+       if (gpu_senquack.PacketCount)
+       {
+               gpu_senquack.PacketBuffer.U4[gpu_senquack.PacketIndex++] = uData;
+               --gpu_senquack.PacketCount;
+       }
+       else
+       {
+               gpu_senquack.PacketBuffer.U4[0] = uData;
+               gpu_senquack.PacketCount = PacketSize[uData >> 24];
+               gpu_senquack.PacketIndex = 1;
+       }
+       if (!gpu_senquack.PacketCount) gpuSendPacket();
+}
+
+///////////////////////////////////////////////////////////////////////////////
+void GPU_writeDataMem(u32* dmaAddress, int dmaCount)
+{
+       #ifdef ENABLE_GPU_LOG_SUPPORT
+               fprintf(stdout,"GPU_writeDataMem(%d)\n",dmaCount);
+       #endif
+       u32 data;
+       const u16 *VIDEO_END = (u16*)gpu_senquack.vram+(FRAME_BUFFER_SIZE/2)-1;
+       gpu_senquack.GPU_GP1 &= ~0x14000000;
+
+       while (dmaCount) 
+       {
+               if (gpu_senquack.dma.FrameToWrite)
+               {
+                       while (dmaCount)
+                       {
+                               dmaCount--;
+                               data = *dmaAddress++;
+                               if ((&gpu_senquack.dma.pvram[gpu_senquack.dma.px])>(VIDEO_END)) gpu_senquack.dma.pvram-=512*1024;
+                               gpu_senquack.dma.pvram[gpu_senquack.dma.px] = data;
+                               if (++gpu_senquack.dma.px >= gpu_senquack.dma.x_end)
+                               {
+                                       gpu_senquack.dma.px = 0;
+                                       gpu_senquack.dma.pvram += 1024;
+                                       if (++gpu_senquack.dma.py >= gpu_senquack.dma.y_end)
+                                       {
+                                               gpu_senquack.dma.FrameToWrite = false;
+                                               gpu_senquack.GPU_GP1 &= ~0x08000000;
+                                               gpu_senquack.fb_dirty = true;
+                                               break;
+                                       }
+                               }
+                               if ((&gpu_senquack.dma.pvram[gpu_senquack.dma.px])>(VIDEO_END)) gpu_senquack.dma.pvram-=512*1024;
+                               gpu_senquack.dma.pvram[gpu_senquack.dma.px] = data>>16;
+                               if (++gpu_senquack.dma.px >= gpu_senquack.dma.x_end)
+                               {
+                                       gpu_senquack.dma.px = 0;
+                                       gpu_senquack.dma.pvram += 1024;
+                                       if (++gpu_senquack.dma.py >= gpu_senquack.dma.y_end)
+                                       {
+                                               gpu_senquack.dma.FrameToWrite = false;
+                                               gpu_senquack.GPU_GP1 &= ~0x08000000;
+                                               gpu_senquack.fb_dirty = true;
+                                               break;
+                                       }
+                               }
+                       }
+               }
+               else
+               {
+                       data = *dmaAddress++;
+                       dmaCount--;
+                       gpuCheckPacket(data);
+               }
+       }
+
+       gpu_senquack.GPU_GP1 = (gpu_senquack.GPU_GP1 | 0x14000000) & ~0x60000000;
+}
+
+long GPU_dmaChain(u32 *rambase, u32 start_addr)
+{
+       #ifdef ENABLE_GPU_LOG_SUPPORT
+               fprintf(stdout,"GPU_dmaChain(0x%x)\n",start_addr);
+       #endif
+
+       u32 addr, *list;
+       u32 len, count;
+       long dma_words = 0;
+
+       if (gpu_senquack.dma.last_dma) *gpu_senquack.dma.last_dma |= 0x800000;
+       
+       gpu_senquack.GPU_GP1 &= ~0x14000000;
+       
+       addr = start_addr & 0xffffff;
+       for (count = 0; addr != 0xffffff; count++)
+       {
+               list = rambase + (addr & 0x1fffff) / 4;
+               len = list[0] >> 24;
+               addr = list[0] & 0xffffff;
+
+               dma_words += 1 + len;
+
+               // add loop detection marker
+               list[0] |= 0x800000;
+
+               if (len) GPU_writeDataMem(list + 1, len);
+
+               if (addr & 0x800000)
+               {
+                       #ifdef ENABLE_GPU_LOG_SUPPORT
+                               fprintf(stdout,"GPU_dmaChain(LOOP)\n");
+                       #endif
+                       break;
+               }
+       }
+
+       // remove loop detection markers
+       addr = start_addr & 0x1fffff;
+       while (count-- > 0)
+       {
+               list = rambase + addr / 4;
+               addr = list[0] & 0x1fffff;
+               list[0] &= ~0x800000;
+       }
+       
+       if (gpu_senquack.dma.last_dma) *gpu_senquack.dma.last_dma &= ~0x800000;
+       gpu_senquack.dma.last_dma = rambase + (start_addr & 0x1fffff) / 4;
+
+       gpu_senquack.GPU_GP1 = (gpu_senquack.GPU_GP1 | 0x14000000) & ~0x60000000;
+
+       return dma_words;
+}
+
+///////////////////////////////////////////////////////////////////////////////
+void GPU_writeData(u32 data)
+{
+       const u16 *VIDEO_END = (u16*)gpu_senquack.vram+(FRAME_BUFFER_SIZE/2)-1;
+       #ifdef ENABLE_GPU_LOG_SUPPORT
+               fprintf(stdout,"GPU_writeData()\n");
+       #endif
+       gpu_senquack.GPU_GP1 &= ~0x14000000;
+
+       if (gpu_senquack.dma.FrameToWrite)
+       {
+               if ((&gpu_senquack.dma.pvram[gpu_senquack.dma.px])>(VIDEO_END)) gpu_senquack.dma.pvram-=512*1024;
+               gpu_senquack.dma.pvram[gpu_senquack.dma.px]=(u16)data;
+               if (++gpu_senquack.dma.px >= gpu_senquack.dma.x_end)
+               {
+                       gpu_senquack.dma.px = 0;
+                       gpu_senquack.dma.pvram += 1024;
+                       if (++gpu_senquack.dma.py >= gpu_senquack.dma.y_end)
+                       {
+                               gpu_senquack.dma.FrameToWrite = false;
+                               gpu_senquack.GPU_GP1 &= ~0x08000000;
+                               gpu_senquack.fb_dirty = true;
+                       }
+               }
+               if (gpu_senquack.dma.FrameToWrite)
+               {
+                       if ((&gpu_senquack.dma.pvram[gpu_senquack.dma.px])>(VIDEO_END)) gpu_senquack.dma.pvram-=512*1024;
+                       gpu_senquack.dma.pvram[gpu_senquack.dma.px]=data>>16;
+                       if (++gpu_senquack.dma.px >= gpu_senquack.dma.x_end)
+                       {
+                               gpu_senquack.dma.px = 0;
+                               gpu_senquack.dma.pvram += 1024;
+                               if (++gpu_senquack.dma.py >= gpu_senquack.dma.y_end)
+                               {
+                                       gpu_senquack.dma.FrameToWrite = false;
+                                       gpu_senquack.GPU_GP1 &= ~0x08000000;
+                                       gpu_senquack.fb_dirty = true;
+                               }
+                       }
+               }
+       }
+       else
+       {
+               gpuCheckPacket(data);
+       }
+       gpu_senquack.GPU_GP1 |= 0x14000000;
+}
+
+
+///////////////////////////////////////////////////////////////////////////////
+void GPU_readDataMem(u32* dmaAddress, int dmaCount)
+{
+       const u16 *VIDEO_END = (u16*)gpu_senquack.vram+(FRAME_BUFFER_SIZE/2)-1;
+       #ifdef ENABLE_GPU_LOG_SUPPORT
+               fprintf(stdout,"GPU_readDataMem(%d)\n",dmaCount);
+       #endif
+       if(!gpu_senquack.dma.FrameToRead) return;
+
+       gpu_senquack.GPU_GP1 &= ~0x14000000;
+       do 
+       {
+               if ((&gpu_senquack.dma.pvram[gpu_senquack.dma.px])>(VIDEO_END)) gpu_senquack.dma.pvram-=512*1024;
+               // lower 16 bit
+               //senquack - 64-bit fix (from notaz)
+               //u32 data = (unsigned long)gpu_senquack.dma.pvram[gpu_senquack.dma.px];
+               u32 data = (u32)gpu_senquack.dma.pvram[gpu_senquack.dma.px];
+
+               if (++gpu_senquack.dma.px >= gpu_senquack.dma.x_end)
+               {
+                       gpu_senquack.dma.px = 0;
+                       gpu_senquack.dma.pvram += 1024;
+               }
+
+               if ((&gpu_senquack.dma.pvram[gpu_senquack.dma.px])>(VIDEO_END)) gpu_senquack.dma.pvram-=512*1024;
+               // higher 16 bit (always, even if it's an odd width)
+               //senquack - 64-bit fix (from notaz)
+               //data |= (unsigned long)(gpu_senquack.dma.pvram[gpu_senquack.dma.px])<<16;
+               data |= (u32)(gpu_senquack.dma.pvram[gpu_senquack.dma.px])<<16;
+               
+               *dmaAddress++ = data;
+
+               if (++gpu_senquack.dma.px >= gpu_senquack.dma.x_end)
+               {
+                       gpu_senquack.dma.px = 0;
+                       gpu_senquack.dma.pvram += 1024;
+                       if (++gpu_senquack.dma.py >= gpu_senquack.dma.y_end)
+                       {
+                               gpu_senquack.dma.FrameToRead = false;
+                               gpu_senquack.GPU_GP1 &= ~0x08000000;
+                               break;
+                       }
+               }
+       } while (--dmaCount);
+
+       gpu_senquack.GPU_GP1 = (gpu_senquack.GPU_GP1 | 0x14000000) & ~0x60000000;
+}
+
+
+
+///////////////////////////////////////////////////////////////////////////////
+u32 GPU_readData(void)
+{
+       const u16 *VIDEO_END = (u16*)gpu_senquack.vram+(FRAME_BUFFER_SIZE/2)-1;
+       #ifdef ENABLE_GPU_LOG_SUPPORT
+               fprintf(stdout,"GPU_readData()\n");
+       #endif
+       gpu_senquack.GPU_GP1 &= ~0x14000000;
+       if (gpu_senquack.dma.FrameToRead)
+       {
+               if ((&gpu_senquack.dma.pvram[gpu_senquack.dma.px])>(VIDEO_END)) gpu_senquack.dma.pvram-=512*1024;
+               gpu_senquack.GPU_GP0 = gpu_senquack.dma.pvram[gpu_senquack.dma.px];
+               if (++gpu_senquack.dma.px >= gpu_senquack.dma.x_end)
+               {
+                       gpu_senquack.dma.px = 0;
+                       gpu_senquack.dma.pvram += 1024;
+                       if (++gpu_senquack.dma.py >= gpu_senquack.dma.y_end)
+                       {
+                               gpu_senquack.dma.FrameToRead = false;
+                               gpu_senquack.GPU_GP1 &= ~0x08000000;
+                       }
+               }
+               if ((&gpu_senquack.dma.pvram[gpu_senquack.dma.px])>(VIDEO_END)) gpu_senquack.dma.pvram-=512*1024;
+               gpu_senquack.GPU_GP0 |= gpu_senquack.dma.pvram[gpu_senquack.dma.px]<<16;
+               if (++gpu_senquack.dma.px >= gpu_senquack.dma.x_end)
+               {
+                       gpu_senquack.dma.px = 0;
+                       gpu_senquack.dma.pvram += 1024;
+                       if (++gpu_senquack.dma.py >= gpu_senquack.dma.y_end)
+                       {
+                               gpu_senquack.dma.FrameToRead = false;
+                               gpu_senquack.GPU_GP1 &= ~0x08000000;
+                       }
+               }
+
+       }
+       gpu_senquack.GPU_GP1 |= 0x14000000;
+
+       return (gpu_senquack.GPU_GP0);
+}
+
+///////////////////////////////////////////////////////////////////////////////
+// Returns the current value of the GP1 status register; nothing is modified.
+u32 GPU_readStatus(void)
+{
+       const u32 status = gpu_senquack.GPU_GP1;
+       return status;
+}
+
+INLINE void GPU_NoSkip(void)
+{
+       #ifdef ENABLE_GPU_LOG_SUPPORT
+               fprintf(stdout,"GPU_NoSkip()\n");
+       #endif
+       gpu_senquack.frameskip.wasSkip = gpu_senquack.frameskip.isSkip;
+       if (gpu_senquack.frameskip.isSkip)
+       {
+               gpu_senquack.frameskip.isSkip = false;
+               gpu_senquack.frameskip.skipGPU = false;
+       }
+       else
+       {
+               gpu_senquack.frameskip.isSkip = gpu_senquack.frameskip.skipFrame;
+               gpu_senquack.frameskip.skipGPU = gpu_senquack.frameskip.skipFrame;
+       }
+}
+
+///////////////////////////////////////////////////////////////////////////////
+// Handles a write to the GP1 control port. The command number is taken from
+// the top byte of 'data'; the remaining bits carry the command parameters.
+// Command numbers without a case below are ignored.
+void  GPU_writeStatus(u32 data)
+{
+       #ifdef ENABLE_GPU_LOG_SUPPORT
+               fprintf(stdout,"GPU_writeStatus(%d,%d)\n",data>>24,data & 0xff);
+       #endif
+       switch (data >> 24) {
+       case 0x00:
+               gpuReset();
+               break;
+       case 0x01:
+               // Drop any partially received packet and abort VRAM transfers.
+               gpu_senquack.GPU_GP1 &= ~0x08000000;
+               gpu_senquack.PacketCount = 0;
+               gpu_senquack.dma.FrameToRead = gpu_senquack.dma.FrameToWrite = false;
+               break;
+       case 0x02:
+               // Handled exactly like command 0x01.
+               gpu_senquack.GPU_GP1 &= ~0x08000000;
+               gpu_senquack.PacketCount = 0;
+               gpu_senquack.dma.FrameToRead = gpu_senquack.dma.FrameToWrite = false;
+               break;
+       case 0x03:
+               // Copy data bit 0 into GP1 bit 23; GPU_updateLace() does not
+               // update the display while that bit is set.
+               gpu_senquack.GPU_GP1 = (gpu_senquack.GPU_GP1 & ~0x00800000) | ((data & 1) << 23);
+               break;
+       case 0x04:
+               // Copy data bits 0-1 into GP1 bits 29-30; a parameter of zero
+               // also discards any partially received packet.
+               if (data == 0x04000000) gpu_senquack.PacketCount = 0;
+               gpu_senquack.GPU_GP1 = (gpu_senquack.GPU_GP1 & ~0x60000000) | ((data & 3) << 29);
+               break;
+       case 0x05:
+               // Start of Display Area in VRAM
+               gpu_senquack.DisplayArea[0] = data & 0x3ff;         // X (0..1023)
+               gpu_senquack.DisplayArea[1] = (data >> 10) & 0x1ff; // Y (0..511)
+               // This command also advances the frameskip state.
+               GPU_NoSkip();
+               break;
+       case 0x06:
+               // GP1(06h) - Horizontal Display range (on Screen)
+               // 0-11   X1 (260h+0)       ;12bit       ;\counted in 53.222400MHz units,
+               // 12-23  X2 (260h+320*8)   ;12bit       ;/relative to HSYNC
+
+               // senquack - gpu_senquack completely ignores GP1(0x06) command and
+               // lacks even a place in DisplayArea[] array to store the values.
+               // It seems to have been concerned only with vertical display range
+               // and centering top/bottom. I will not add support here, and
+               // focus instead on the gpulib version (gpulib_if.cpp) which uses
+               // gpulib for its PS1->host framebuffer blitting.
+               break;
+       case 0x07:
+               // GP1(07h) - Vertical Display range (on Screen)
+               // 0-9   Y1 (NTSC=88h-(224/2), (PAL=A3h-(264/2))  ;\scanline numbers on screen,
+               // 10-19 Y2 (NTSC=88h+(224/2), (PAL=A3h+(264/2))  ;/relative to VSYNC
+               // 20-23 Not used (zero)
+               {
+                       u32 v1=data & 0x000003FF; //(short)(data & 0x3ff);
+                       u32 v2=(data & 0x000FFC00) >> 10; //(short)((data>>10) & 0x3ff);
+                       // Only store the range and clear the screen when it changed.
+                       if ((gpu_senquack.DisplayArea[4]!=v1)||(gpu_senquack.DisplayArea[5]!=v2))
+                       {
+                               gpu_senquack.DisplayArea[4] = v1;
+                               gpu_senquack.DisplayArea[5] = v2;
+                               #ifdef ENABLE_GPU_LOG_SUPPORT
+                                       fprintf(stdout,"video_clear(CHANGE_Y)\n");
+                               #endif
+                               video_clear();
+                       }
+               }
+               break;
+       case 0x08:
+               {
+                       // Lookup tables indexed by GP1 bits 16-18 and 19-20 respectively.
+                       static const u32 HorizontalResolution[8] = { 256, 368, 320, 384, 512, 512, 640, 640 };
+                       static const u32 VerticalResolution[4] = { 240, 480, 256, 480 };
+                       // data bits 0-5 go to GP1 bits 17-22, data bit 6 goes to GP1 bit 16.
+                       gpu_senquack.GPU_GP1 = (gpu_senquack.GPU_GP1 & ~0x007F0000) | ((data & 0x3F) << 17) | ((data & 0x40) << 10);
+                       #ifdef ENABLE_GPU_LOG_SUPPORT
+                               fprintf(stdout,"GPU_writeStatus(RES=%dx%d,BITS=%d,PAL=%d)\n",HorizontalResolution[(gpu_senquack.GPU_GP1 >> 16) & 7],
+                                               VerticalResolution[(gpu_senquack.GPU_GP1 >> 19) & 3],(gpu_senquack.GPU_GP1&0x00200000?24:15),(IS_PAL?1:0));
+                       #endif
+                       // Video mode change
+                       u32 new_width = HorizontalResolution[(gpu_senquack.GPU_GP1 >> 16) & 7];
+                       u32 new_height = VerticalResolution[(gpu_senquack.GPU_GP1 >> 19) & 3];
+
+                       // Masks are recomputed and the screen cleared only when the
+                       // resolution actually changed.
+                       if (gpu_senquack.DisplayArea[2] != new_width || gpu_senquack.DisplayArea[3] != new_height)
+                       {
+                               // Update width
+                               gpu_senquack.DisplayArea[2] = new_width;
+
+                               if (PixelSkipEnabled()) {
+                                       // Set blit_mask for high horizontal resolutions. This allows skipping
+                                       //  rendering pixels that would never get displayed on low-resolution
+                                       //  platforms that use simple pixel-dropping scaler.
+                                       switch (gpu_senquack.DisplayArea[2])
+                                       {
+                                               case 512: gpu_senquack.blit_mask = 0xa4; break; // GPU_BlitWWSWWSWS
+                                               case 640: gpu_senquack.blit_mask = 0xaa; break; // GPU_BlitWS
+                                               default:  gpu_senquack.blit_mask = 0;    break;
+                                       }
+                               } else {
+                                       gpu_senquack.blit_mask = 0;
+                               }
+
+                               // Update height
+                               gpu_senquack.DisplayArea[3] = new_height;
+
+                               if (LineSkipEnabled()) {
+                                       // Set rendering line-skip (only render every other line in high-res
+                                       //  480 vertical mode, or, optionally, force it for all video modes)
+
+                                       if (gpu_senquack.DisplayArea[3] == 480) {
+                                               if (gpu_senquack.config.ilace_force) {
+                                                       gpu_senquack.ilace_mask = 3; // Only need 1/4 of lines
+                                               } else {
+                                                       gpu_senquack.ilace_mask = 1; // Only need 1/2 of lines
+                                               }
+                                       } else {
+                                               // Vert resolution changed from 480 to lower one
+                                               gpu_senquack.ilace_mask = gpu_senquack.config.ilace_force;
+                                       }
+                               } else {
+                                       gpu_senquack.ilace_mask = 0;
+                               }
+
+                               #ifdef ENABLE_GPU_LOG_SUPPORT
+                                       fprintf(stdout,"video_clear(CHANGE_RES)\n");
+                               #endif
+                               video_clear();
+                       }
+
+               }
+               break;
+       case 0x10:
+               // Info request: the low byte of 'data' selects a value that is
+               // placed in GP0, which GPU_readData() returns. Other selectors
+               // leave GP0 unchanged.
+               switch (data & 0xff) {
+                       case 2: gpu_senquack.GPU_GP0 = gpu_senquack.tex_window; break;
+                       case 3: gpu_senquack.GPU_GP0 = (gpu_senquack.DrawingArea[1] << 10) | gpu_senquack.DrawingArea[0]; break;
+                       case 4: gpu_senquack.GPU_GP0 = ((gpu_senquack.DrawingArea[3]-1) << 10) | (gpu_senquack.DrawingArea[2]-1); break;
+                       case 5: case 6: gpu_senquack.GPU_GP0 = (((u32)gpu_senquack.DrawingOffset[1] & 0x7ff) << 11) | ((u32)gpu_senquack.DrawingOffset[0] & 0x7ff); break;
+                       case 7: gpu_senquack.GPU_GP0 = 2; break;
+                       case 8: case 15: gpu_senquack.GPU_GP0 = 0xBFC03720; break;
+               }
+               break;
+       }
+}
+
+// Blitting functions
+#include "gpu_blit.h"
+
+// Blits the current PS1 display area from VRAM to SCREEN one line at a time,
+// using the blitter that matches the current horizontal resolution, and then
+// calls video_flip(). A width with no matching case blits nothing but the
+// flip still happens.
+static void gpuVideoOutput(void)
+{
+       int h0, x0, y0, w0, h1;
+
+       x0 = gpu_senquack.DisplayArea[0];
+       y0 = gpu_senquack.DisplayArea[1];
+
+       w0 = gpu_senquack.DisplayArea[2];
+       h0 = gpu_senquack.DisplayArea[3];  // video mode
+
+       h1 = gpu_senquack.DisplayArea[5] - gpu_senquack.DisplayArea[4]; // display needed
+       if (h0 == 480) h1 = Min2(h1*2,480);
+
+       bool isRGB24 = (gpu_senquack.GPU_GP1 & 0x00200000 ? true : false);
+       u16* dst16 = SCREEN;
+       u16* src16 = (u16*)gpu_senquack.vram;
+
+       // PS1 fb read wraps around (fixes black screen in 'Tobal no. 1')
+       unsigned int src16_offs_msk = 1024*512-1;
+       unsigned int src16_offs = (x0 + y0*1024) & src16_offs_msk;
+
+       //  Height centering
+       // sizeShift turns the surplus of mode lines over displayed lines into
+       // the number of destination rows to leave above the image: half of it
+       // normally, a quarter in 480-line mode where only every second source
+       // line is visited.
+       int sizeShift = 1;
+       if (h0 == 256) {
+               h0 = 240;
+       } else if (h0 == 480) {
+               sizeShift = 2;
+       }
+       if (h1 > h0) {
+               // More lines requested than the mode has: skip the extra source
+               // lines evenly at the top and clamp.
+               src16_offs = (src16_offs + (((h1-h0) / 2) * 1024)) & src16_offs_msk;
+               h1 = h0;
+       } else if (h1<h0) {
+               dst16 += ((h0-h1) >> sizeShift) * VIDEO_WIDTH;
+       }
+
+
+       /* Main blitter */
+       int incY = (h0==480) ? 2 : 1;
+       // From here on h0 is reused as the source step per output row, in
+       // halfwords: one VRAM line (1024), or two in 480-line mode.
+       h0=(h0==480 ? 2048 : 1024);
+
+       {
+               const int li=gpu_senquack.ilace_mask;
+               bool pi = ProgressiveInterlaceEnabled();
+               bool pif = gpu_senquack.prog_ilace_flag;
+               // Every loop below has the same shape. A row is blitted only when
+               // its y0 passes the ilace mask and, with progressive interlace on,
+               // only on every other such row: (pif=!pif) toggles the flag each
+               // time it is evaluated. The destination and source always advance.
+               switch ( w0 )
+               {
+                       case 256:
+                               for(int y1=y0+h1; y0<y1; y0+=incY)
+                               {
+                                       if (( 0 == (y0&li) ) && ((!pi) || (pif=!pif)))
+                                               GPU_BlitWWDWW(src16 + src16_offs, dst16, isRGB24);
+                                       dst16 += VIDEO_WIDTH;
+                                       src16_offs = (src16_offs + h0) & src16_offs_msk;
+                               }
+                               break;
+                       case 368:
+                               for(int y1=y0+h1; y0<y1; y0+=incY)
+                               {
+                                       if (( 0 == (y0&li) ) && ((!pi) || (pif=!pif)))
+                                               GPU_BlitWWWWWWWWS(src16 + src16_offs, dst16, isRGB24, 4);
+                                       dst16 += VIDEO_WIDTH;
+                                       src16_offs = (src16_offs + h0) & src16_offs_msk;
+                               }
+                               break;
+                       case 320:
+                               // Ensure 32-bit alignment for GPU_BlitWW() blitter:
+                               src16_offs &= ~1;
+                               for(int y1=y0+h1; y0<y1; y0+=incY)
+                               {
+                                       if (( 0 == (y0&li) ) && ((!pi) || (pif=!pif)))
+                                               GPU_BlitWW(src16 + src16_offs, dst16, isRGB24);
+                                       dst16 += VIDEO_WIDTH;
+                                       src16_offs = (src16_offs + h0) & src16_offs_msk;
+                               }
+                               break;
+                       case 384:
+                               for(int y1=y0+h1; y0<y1; y0+=incY)
+                               {
+                                       if (( 0 == (y0&li) ) && ((!pi) || (pif=!pif)))
+                                               GPU_BlitWWWWWS(src16 + src16_offs, dst16, isRGB24);
+                                       dst16 += VIDEO_WIDTH;
+                                       src16_offs = (src16_offs + h0) & src16_offs_msk;
+                               }
+                               break;
+                       case 512:
+                               for(int y1=y0+h1; y0<y1; y0+=incY)
+                               {
+                                       if (( 0 == (y0&li) ) && ((!pi) || (pif=!pif)))
+                                               GPU_BlitWWSWWSWS(src16 + src16_offs, dst16, isRGB24);
+                                       dst16 += VIDEO_WIDTH;
+                                       src16_offs = (src16_offs + h0) & src16_offs_msk;
+                               }
+                               break;
+                       case 640:
+                               for(int y1=y0+h1; y0<y1; y0+=incY)
+                               {
+                                       if (( 0 == (y0&li) ) && ((!pi) || (pif=!pif)))
+                                               GPU_BlitWS(src16 + src16_offs, dst16, isRGB24);
+                                       dst16 += VIDEO_WIDTH;
+                                       src16_offs = (src16_offs + h0) & src16_offs_msk;
+                               }
+                               break;
+               }
+               // Flip the stored flag so the next frame starts on the other phase.
+               gpu_senquack.prog_ilace_flag = !gpu_senquack.prog_ilace_flag;
+       }
+       video_flip();
+}
+
+// Update frames-skip each second>>3 (8 times per second)
+#define GPU_FRAMESKIP_UPDATE 3
+
+static void GPU_frameskip (bool show)
+{
+       u32 now=get_ticks(); // current frame
+
+       // Update frameskip
+       if (gpu_senquack.frameskip.skipCount==0) gpu_senquack.frameskip.skipFrame=false; // frameskip off
+       else if (gpu_senquack.frameskip.skipCount==7) { if (show) gpu_senquack.frameskip.skipFrame=!gpu_senquack.frameskip.skipFrame; } // frameskip medium
+       else if (gpu_senquack.frameskip.skipCount==8) gpu_senquack.frameskip.skipFrame=true; // frameskip maximum
+       else
+       {
+               static u32 spd=100; // speed %
+               static u32 frames=0; // frames counter
+               static u32 prev=now; // previous fps calculation
+               frames++;
+               if ((now-prev)>=(TPS>>GPU_FRAMESKIP_UPDATE))
+               {
+                       if (IS_PAL) spd=(frames<<1);
+                       else spd=((frames*1001)/600);
+                       spd<<=GPU_FRAMESKIP_UPDATE;
+                       frames=0;
+                       prev=now;
+               }
+               switch(gpu_senquack.frameskip.skipCount)
+               {
+                       case 1: if (spd<50) gpu_senquack.frameskip.skipFrame=true; else gpu_senquack.frameskip.skipFrame=false; break; // frameskip on (spd<50%)
+                       case 2: if (spd<60) gpu_senquack.frameskip.skipFrame=true; else gpu_senquack.frameskip.skipFrame=false; break; // frameskip on (spd<60%)
+                       case 3: if (spd<70) gpu_senquack.frameskip.skipFrame=true; else gpu_senquack.frameskip.skipFrame=false; break; // frameskip on (spd<70%)
+                       case 4: if (spd<80) gpu_senquack.frameskip.skipFrame=true; else gpu_senquack.frameskip.skipFrame=false; break; // frameskip on (spd<80%)
+                       case 5: if (spd<90) gpu_senquack.frameskip.skipFrame=true; else gpu_senquack.frameskip.skipFrame=false; break; // frameskip on (spd<90%)
+               }
+       }
+}
+
+///////////////////////////////////////////////////////////////////////////////
+void GPU_updateLace(void)
+{
+       // Interlace bit toggle
+       gpu_senquack.GPU_GP1 ^= 0x80000000;
+
+       // Update display?
+       if ((gpu_senquack.fb_dirty) && (!gpu_senquack.frameskip.wasSkip) && (!(gpu_senquack.GPU_GP1&0x00800000)))
+       {
+               // Display updated
+               gpuVideoOutput();
+               GPU_frameskip(true);
+               #ifdef ENABLE_GPU_LOG_SUPPORT
+                       fprintf(stdout,"GPU_updateLace(UPDATE)\n");
+               #endif
+       } else {
+               GPU_frameskip(false);
+               #ifdef ENABLE_GPU_LOG_SUPPORT
+                       fprintf(stdout,"GPU_updateLace(SKIP)\n");
+               #endif
+       }
+
+       if ((!gpu_senquack.frameskip.skipCount) && (gpu_senquack.DisplayArea[3] == 480)) gpu_senquack.frameskip.skipGPU=true; // Tekken 3 hack
+
+       gpu_senquack.fb_dirty=false;
+       gpu_senquack.dma.last_dma = NULL;
+}
+
+// Frontend hook: marks the framebuffer dirty so that the next
+// GPU_updateLace() may present it again (e.g. after returning to the emu).
+void GPU_requestScreenRedraw(void)
+{
+       gpu_senquack.fb_dirty = true;
+}
+
+void GPU_getScreenInfo(GPUScreenInfo_t *sinfo)
+{
+       bool depth24 = (gpu_senquack.GPU_GP1 & 0x00200000 ? true : false);
+       int16_t hres = (uint16_t)gpu_senquack.DisplayArea[2];
+       int16_t vres = (uint16_t)gpu_senquack.DisplayArea[3];
+       int16_t w = hres; // Original gpu_senquack doesn't support width < 100%
+       int16_t h = gpu_senquack.DisplayArea[5] - gpu_senquack.DisplayArea[4];
+       if (vres == 480)
+               h *= 2;
+       if (h <= 0 || h > vres)
+               h = vres;
+
+       sinfo->vram    = (uint8_t*)gpu_senquack.vram;
+       sinfo->x       = (uint16_t)gpu_senquack.DisplayArea[0];
+       sinfo->y       = (uint16_t)gpu_senquack.DisplayArea[1];
+       sinfo->w       = w;
+       sinfo->h       = h;
+       sinfo->hres    = hres;
+       sinfo->vres    = vres;
+       sinfo->depth24 = depth24;
+       sinfo->pal     = IS_PAL;
+}
diff --git a/plugins/gpu_senquack/gpu.h b/plugins/gpu_senquack/gpu.h

new file mode 100644 (file)

index 0000000..7a46751
--- /dev/null
+++ b/plugins/gpu_senquack/gpu.h
@@ -0,0 +1,74 @@
+/***************************************************************************
+*   Copyright (C) 2010 PCSX4ALL Team                                      *
+*   Copyright (C) 2010 Unai                                               *
+*   Copyright (C) 2016 Senquack (dansilsby <AT> gmail <DOT> com)          *
+*                                                                         *
+*   This program is free software; you can redistribute it and/or modify  *
+*   it under the terms of the GNU General Public License as published by  *
+*   the Free Software Foundation; either version 2 of the License, or     *
+*   (at your option) any later version.                                   *
+*                                                                         *
+*   This program is distributed in the hope that it will be useful,       *
+*   but WITHOUT ANY WARRANTY; without even the implied warranty of        *
+*   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the         *
+*   GNU General Public License for more details.                          *
+*                                                                         *
+*   You should have received a copy of the GNU General Public License     *
+*   along with this program; if not, write to the                         *
+*   Free Software Foundation, Inc.,                                       *
+*   51 Franklin Street, Fifth Floor, Boston, MA 02111-1307 USA.           *
+***************************************************************************/
+
+#ifndef GPU_UNAI_GPU_H
+#define GPU_UNAI_GPU_H
+
+struct gpu_senquack_config_t {
+       uint8_t pixel_skip:1;     // If 1, allows skipping rendering pixels that
+                                 //  would not be visible when a high horizontal
+                                 //  resolution PS1 video mode is set.
+                                 //  Only applies to devices with low resolutions
+                                 //  like 320x240. Should not be used if a
+                                 //  down-scaling framebuffer blitter is in use.
+                                 //  Can cause gfx artifacts if game reads VRAM
+                                 //  to do framebuffer effects.
+
+       uint8_t ilace_force:3;    // Option to force skipping rendering of lines,
+                                 //  for very slow platforms. Value will be
+                                 //  assigned to 'ilace_mask' in gpu_senquack struct.
+                                 //  Normally 0. Value '1' will skip rendering
+                                 //  odd lines.
+
+       uint8_t scale_hires:1;    // If 1, will scale hi-res output to
+                                 //  320x240 when gpulib reads the frame.
+                                 //  Implies pixel_skip and ilace_force
+                                 //  (when height > 240).
+       uint8_t lighting:1;       // NOTE(review): the next four flags are not
+       uint8_t fast_lighting:1;  //  documented here; presumably renderer feature
+       uint8_t blending:1;       //  toggles named after what they enable -
+       uint8_t dithering:1;      //  confirm against the code that reads them.
+
+       //senquack Only PCSX Rearmed's version of gpu_senquack had this, and I
+       // don't think it's necessary. It would require adding 'AH' flag to
+       // gpuSpriteSpanFn() increasing size of sprite span function array.
+       //uint8_t enableAbbeyHack:1;  // Abe's Odyssey hack
+
+       ////////////////////////////////////////////////////////////////////////////
+       // Variables used only by older standalone version of gpu_senquack (gpu.cpp)
+#ifndef USE_GPULIB
+       uint8_t prog_ilace:1;         // Progressive interlace option (old option)
+                                     //  This option was somewhat oddly named:
+                                     //  When in interlaced video mode, on a low-res
+                                     //  320x240 device, only the even lines are
+                                     //  rendered. This option will take that one
+                                     //  step further and only render half of the
+                                     //  even lines one frame, and then the other half.
+       uint8_t frameskip_count:3;    // Frame skip (0..7)
+#endif
+};
+
+extern gpu_senquack_config_t gpu_senquack_config_ext;
+
+// TODO: clean up show_fps frontend option
+extern  bool show_fps;
+
+#endif // GPU_UNAI_GPU_H
diff --git a/plugins/gpu_senquack/gpu_arm.S b/plugins/gpu_senquack/gpu_arm.S

new file mode 100644 (file)

index 0000000..ec87f21
--- /dev/null
+++ b/plugins/gpu_senquack/gpu_arm.S
@@ -0,0 +1,56 @@
+/*
+ * (C) Gražvydas "notaz" Ignotas, 2011
+ *
+ * This work is licensed under the terms of  GNU GPL, version 2 or later.
+ * See the COPYING file in the top-level directory.
+ */
+
+#include "arm_features.h"
+
+.text
+.align 2
+
+@ Draws four 4-bit paletted pixels taken from 16 consecutive bits of \rs.
+@   rs    - register holding the packed 4-bit palette indices
+@   ibase - bit position in \rs of the first index
+@   obase - byte offset from r0 of the first destination halfword
+@ Each index is extracted already multiplied by 2 (mask 0x1e in r12) so that
+@ it can be used directly as a byte offset into the halfword palette at r2.
+@ A palette entry equal to 0 leaves the destination halfword untouched.
+@ in: r0=dst, r2=pal, r12=0x1e
+@ trashes r6-r8,lr,flags
+.macro do_4_pixels rs ibase obase
+@ The first index needs a shift of ibase-1, which is a left shift by 1
+@ when ibase is 0.
+.if \ibase - 1 < 0
+    and     r6, r12, \rs, lsl #1
+.else
+    and     r6, r12, \rs, lsr #\ibase-1
+.endif
+    and     r7, r12, \rs, lsr #\ibase+3
+    and     r8, r12, \rs, lsr #\ibase+7
+    and     lr, r12, \rs, lsr #\ibase+11
+@ Look up the four palette entries.
+    ldrh    r6, [r2, r6]
+    ldrh    r7, [r2, r7]
+    ldrh    r8, [r2, r8]
+    ldrh    lr, [r2, lr]
+@ Store each entry only if it is non-zero.
+    tst     r6, r6
+    strneh  r6, [r0, #\obase+0]
+    tst     r7, r7
+    strneh  r7, [r0, #\obase+2]
+    tst     r8, r8
+    strneh  r8, [r0, #\obase+4]
+    tst     lr, lr
+    strneh  lr, [r0, #\obase+6]
+.endm
+
+@ Draws a 16-pixel wide, 4-bit paletted sprite.
+@   r0 = d     destination, 16 halfwords written per row
+@   r1 = s     source, 8 bytes (16 indices) read per row
+@   r2 = pal   palette of halfword entries
+@   r3 = lines row count; the loop body runs once before the count is tested
+@ Both d and s advance by 2048 bytes per row.
+.global draw_spr16_full @ (u16 *d, void *s, u16 *pal, int lines)
+draw_spr16_full:
+    stmfd   sp!, {r4-r8,lr}
+    mov     r12, #0x1e             @ mask: 4-bit index times 2 (palette byte offset)
+
+0:
+    ldmia   r1, {r4,r5}            @ fetch 16 indices (two words)
+    do_4_pixels r4, 0,  0
+    do_4_pixels r4, 16, 8
+    do_4_pixels r5, 0,  16
+    do_4_pixels r5, 16, 24
+    subs    r3, r3, #1             @ sets the flags tested by bgt below
+    add     r0, r0, #2048
+    add     r1, r1, #2048
+    bgt     0b
+
+    ldmfd   sp!, {r4-r8,pc}
+
+@ vim:filetype=armasm
diff --git a/plugins/gpu_senquack/gpu_arm.h b/plugins/gpu_senquack/gpu_arm.h

new file mode 100644 (file)

index 0000000..b9f8f97
--- /dev/null
+++ b/plugins/gpu_senquack/gpu_arm.h
@@ -0,0 +1,14 @@
+/* Prototypes for the ARM assembly helpers in gpu_arm.S. */
+#ifndef GPU_SENQUACK_GPU_ARM_H
+#define GPU_SENQUACK_GPU_ARM_H
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/*
+ * Draws a 16-pixel wide, 4-bit paletted sprite.
+ *   d     - destination; 16 halfwords are written per row
+ *   s     - source; 8 bytes (sixteen 4-bit palette indices) are read per row
+ *   pal   - palette of 16-bit entries; an entry of 0 leaves the destination
+ *           pixel untouched
+ *   lines - number of rows; one row is always drawn before the count is tested
+ * Both d and s advance by 2048 bytes per row.
+ */
+void draw_spr16_full(u16 *d, void *s, u16 *pal, int lines);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* GPU_SENQUACK_GPU_ARM_H */
diff --git a/plugins/gpu_senquack/gpu_blit.h b/plugins/gpu_senquack/gpu_blit.h

new file mode 100644 (file)

index 0000000..e93f12f
--- /dev/null
+++ b/plugins/gpu_senquack/gpu_blit.h
@@ -0,0 +1,405 @@
+/***************************************************************************
+*   Copyright (C) 2010 PCSX4ALL Team                                      *
+*   Copyright (C) 2010 Unai                                               *
+*                                                                         *
+*   This program is free software; you can redistribute it and/or modify  *
+*   it under the terms of the GNU General Public License as published by  *
+*   the Free Software Foundation; either version 2 of the License, or     *
+*   (at your option) any later version.                                   *
+*                                                                         *
+*   This program is distributed in the hope that it will be useful,       *
+*   but WITHOUT ANY WARRANTY; without even the implied warranty of        *
+*   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the         *
+*   GNU General Public License for more details.                          *
+*                                                                         *
+*   You should have received a copy of the GNU General Public License     *
+*   along with this program; if not, write to the                         *
+*   Free Software Foundation, Inc.,                                       *
+*   51 Franklin Street, Fifth Floor, Boston, MA 02111-1307 USA.           *
+***************************************************************************/
+
+#ifndef _INNER_BLIT_H_
+#define _INNER_BLIT_H_
+
+// Pixel packing helpers.
+//
+// RGB24(R,G,B) packs three 8-bit components into one 16-bit value:
+//   - without USE_BGR15: R -> bits 11..15, G -> bits 5..10, B -> bits 0..4
+//   - with USE_BGR15:    R -> bits 0..4,   G -> bits 5..9,  B -> bits 10..14
+//
+// RGB16(C) (only without USE_BGR15) rearranges one 15-bit pixel: the 5-bit
+// field at bits 10..14 moves to bits 0..4, the field at bits 5..9 moves to
+// bits 6..10 and the field at bits 0..4 moves to bits 11..15.
+// RGB16X2(C) does the same on two pixels packed in one 32-bit word; pass an
+// unsigned value, since the upper pixel's field is shifted into bit 31.
+//
+// Every expansion is wrapped in parentheses so the macros can be used inside
+// larger expressions without operator-precedence surprises.
+#ifndef USE_BGR15
+#define RGB24(R,G,B)   (((((R)&0xF8)<<8)|(((G)&0xFC)<<3)|(((B)&0xF8)>>3)))
+#define RGB16X2(C)     ((((C)&(0x1f001f<<10))>>10) | (((C)&(0x1f001f<<5))<<1) | (((C)&(0x1f001f<<0))<<11))
+#define RGB16(C)       ((((C)&(0x1f<<10))>>10) | (((C)&(0x1f<<5))<<1) | (((C)&(0x1f<<0))<<11))
+#else
+#define RGB24(R,G,B)   ((((R)&0xF8)>>3)|(((G)&0xF8)<<2)|(((B)&0xF8)<<7))
+#endif
+
+///////////////////////////////////////////////////////////////////////////////
+//  GPU Blitting code with rescale and interlace support.
+
+// Converts one line of 320 source pixels to 320 output pixels (no scaling).
+//   src     - source line: 3 bytes per pixel when isRGB24, else 2 bytes
+//   dst16   - receives 320 16-bit pixels
+//   isRGB24 - selects the 24bpp source path
+INLINE void GPU_BlitWW(const void* src, u16* dst16, bool isRGB24)
+{
+       if (isRGB24)
+       {
+               // 24bpp: pack each byte triple with RGB24().
+               const u8* in = (const u8*)src;
+               for (u32 px = 0; px < 320; ++px, in += 3)
+                       dst16[px] = RGB24(in[0], in[1], in[2]);
+               return;
+       }
+
+#ifndef USE_BGR15
+       // 16bpp: convert two pixels per 32-bit word, 160 words in total.
+       const u32* in = (const u32*)src;
+       u32* out = (u32*)(void*)dst16;
+       for (u32 word = 0; word < 160; ++word)
+               out[word] = RGB16X2(in[word]);
+#else
+       // No conversion in this configuration: copy 320 * 2 bytes as they are.
+       memcpy(dst16,src,640);
+#endif
+}
+
+// Shrinks 512 source pixels to 320 output pixels by keeping 5 out of every 8
+// (write, write, skip, write, write, skip, write, skip).
+//   src     - source line: 3 bytes per pixel when isRGB24, else 2 bytes
+//   dst16   - receives 320 16-bit pixels
+//   isRGB24 - selects the 24bpp source path
+INLINE void GPU_BlitWWSWWSWS(const void* src, u16* dst16, bool isRGB24)
+{
+       // Source pixel positions kept out of each group of 16.
+       static const u8 kept[10] = { 0, 1, 3, 4, 6, 8, 9, 11, 12, 14 };
+
+       if (isRGB24)
+       {
+               const u8* in = (const u8*)src;
+               for (u32 grp = 0; grp < 32; ++grp, in += 48, dst16 += 10)
+               {
+                       for (u32 k = 0; k < 10; ++k)
+                       {
+                               const u8* px = in + kept[k] * 3;
+                               dst16[k] = RGB24(px[0], px[1], px[2]);
+                       }
+               }
+               return;
+       }
+
+       const u16* in = (const u16*)src;
+       for (u32 grp = 0; grp < 32; ++grp, in += 16, dst16 += 10)
+       {
+               for (u32 k = 0; k < 10; ++k)
+               {
+#ifndef USE_BGR15
+                       dst16[k] = RGB16(in[kept[k]]);
+#else
+                       dst16[k] = in[kept[k]];
+#endif
+               }
+       }
+}
+
+// Shrinks 384 source pixels to 320 output pixels by keeping the first 5 of
+// every 6 and dropping the sixth.
+//   src     - source line: 3 bytes per pixel when isRGB24, else 2 bytes
+//   dst16   - receives 320 16-bit pixels
+//   isRGB24 - selects the 24bpp source path
+INLINE void GPU_BlitWWWWWS(const void* src, u16* dst16, bool isRGB24)
+{
+       if (isRGB24)
+       {
+               const u8* in = (const u8*)src;
+               // 64 groups: 6 source pixels (18 bytes) in, 5 pixels out.
+               for (u32 grp = 0; grp < 64; ++grp, in += 18, dst16 += 5)
+               {
+                       for (u32 k = 0; k < 5; ++k)
+                               dst16[k] = RGB24(in[3 * k], in[3 * k + 1], in[3 * k + 2]);
+               }
+               return;
+       }
+
+       const u16* in = (const u16*)src;
+       for (u32 grp = 0; grp < 64; ++grp, in += 6, dst16 += 5)
+       {
+               for (u32 k = 0; k < 5; ++k)
+               {
+#ifndef USE_BGR15
+                       dst16[k] = RGB16(in[k]);
+#else
+                       dst16[k] = in[k];
+#endif
+               }
+       }
+}
+
+// Shrinks 360 source pixels to 320 output pixels by keeping the first 8 of
+// every 9 and dropping the ninth.
+//   src       - source line: 3 bytes per pixel when isRGB24, else 2 bytes
+//   dst16     - receives 320 16-bit pixels
+//   isRGB24   - selects the 24bpp source path
+//   uClip_src - number of source pixels to skip before the first one read
+INLINE void GPU_BlitWWWWWWWWS(const void* src, u16* dst16, bool isRGB24, u32 uClip_src)
+{
+       if (isRGB24)
+       {
+               // Skip uClip_src pixels, i.e. three bytes for each of them.
+               const u8* in = (const u8*)src + (uClip_src<<1) + uClip_src;
+               // 40 groups: 9 source pixels (27 bytes) in, 8 pixels out.
+               for (u32 grp = 0; grp < 40; ++grp, in += 27, dst16 += 8)
+               {
+                       for (u32 k = 0; k < 8; ++k)
+                               dst16[k] = RGB24(in[3 * k], in[3 * k + 1], in[3 * k + 2]);
+               }
+               return;
+       }
+
+       const u16* in = ((const u16*) src) + uClip_src;
+       for (u32 grp = 0; grp < 40; ++grp, in += 9, dst16 += 8)
+       {
+               for (u32 k = 0; k < 8; ++k)
+               {
+#ifndef USE_BGR15
+                       dst16[k] = RGB16(in[k]);
+#else
+                       dst16[k] = in[k];
+#endif
+               }
+       }
+}
+
+// Stretches 256 source pixels to 320 output pixels: in every group of four
+// source pixels the second one is written twice.
+//   src     - source line: 3 bytes per pixel when isRGB24, else 2 bytes
+//   dst16   - receives 320 16-bit pixels
+//   isRGB24 - selects the 24bpp source path
+INLINE void GPU_BlitWWDWW(const void* src, u16* dst16, bool isRGB24)
+{
+       if (isRGB24)
+       {
+               const u8* in = (const u8*)src;
+               // 64 groups: 4 source pixels (12 bytes) in, 5 pixels out.
+               for (u32 grp = 0; grp < 64; ++grp, in += 12, dst16 += 5)
+               {
+                       dst16[0] = RGB24(in[0], in[1], in[2]);
+                       const u16 doubled = RGB24(in[3], in[4], in[5]);
+                       dst16[1] = doubled;
+                       dst16[2] = doubled;
+                       dst16[3] = RGB24(in[6], in[7], in[8]);
+                       dst16[4] = RGB24(in[9], in[10], in[11]);
+               }
+               return;
+       }
+
+       const u16* in = (const u16*)src;
+       for (u32 grp = 0; grp < 64; ++grp, in += 4, dst16 += 5)
+       {
+#ifndef USE_BGR15
+               dst16[0] = RGB16(in[0]);
+               const u16 doubled = RGB16(in[1]);
+               dst16[1] = doubled;
+               dst16[2] = doubled;
+               dst16[3] = RGB16(in[2]);
+               dst16[4] = RGB16(in[3]);
+#else
+               dst16[0] = in[0];
+               const u16 doubled = in[1];
+               dst16[1] = doubled;
+               dst16[2] = doubled;
+               dst16[3] = in[2];
+               dst16[4] = in[3];
+#endif
+       }
+}
+
+
+// Halves 640 source pixels to 320 output pixels by keeping every even-indexed
+// source pixel and dropping the odd ones.
+//   src     - source line: 3 bytes per pixel when isRGB24, else 2 bytes
+//   dst16   - receives 320 16-bit pixels
+//   isRGB24 - selects the 24bpp source path
+INLINE void GPU_BlitWS(const void* src, u16* dst16, bool isRGB24)
+{
+       if (isRGB24)
+       {
+               // Step over two source pixels (6 bytes) per output pixel.
+               const u8* in = (const u8*) src;
+               for (u32 px = 0; px < 320; ++px, in += 6)
+                       dst16[px] = RGB24(in[0], in[1], in[2]);
+               return;
+       }
+
+       const u16* in = (const u16*) src;
+       for (u32 px = 0; px < 320; ++px)
+       {
+#ifndef USE_BGR15
+               dst16[px] = RGB16(in[2 * px]);
+#else
+               dst16[px] = in[2 * px];
+#endif
+       }
+}
+
+#endif //_INNER_BLIT_H_
diff --git a/plugins/gpu_senquack/gpu_command.h b/plugins/gpu_senquack/gpu_command.h

new file mode 100644 (file)

index 0000000..d052ae8
--- /dev/null
+++ b/plugins/gpu_senquack/gpu_command.h
@@ -0,0 +1,621 @@
+/***************************************************************************
+*   Copyright (C) 2010 PCSX4ALL Team                                      *
+*   Copyright (C) 2010 Unai                                               *
+*   Copyright (C) 2016 Senquack (dansilsby <AT> gmail <DOT> com)          *
+*                                                                         *
+*   This program is free software; you can redistribute it and/or modify  *
+*   it under the terms of the GNU General Public License as published by  *
+*   the Free Software Foundation; either version 2 of the License, or     *
+*   (at your option) any later version.                                   *
+*                                                                         *
+*   This program is distributed in the hope that it will be useful,       *
+*   but WITHOUT ANY WARRANTY; without even the implied warranty of        *
+*   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the         *
+*   GNU General Public License for more details.                          *
+*                                                                         *
+*   You should have received a copy of the GNU General Public License     *
+*   along with this program; if not, write to the                         *
+*   Free Software Foundation, Inc.,                                       *
+*   51 Franklin Street, Fifth Floor, Boston, MA 02111-1307 USA.           *
+***************************************************************************/
+
+#ifndef __GPU_UNAI_GPU_COMMAND_H__
+#define __GPU_UNAI_GPU_COMMAND_H__
+
+///////////////////////////////////////////////////////////////////////////////
+// Applies a texture-page word: stores its low 9 bits in GPU_GP1 and
+// recomputes BLEND_MODE, TEXT_MODE and the texture base pointer TBA.
+void gpuSetTexture(u16 tpage)
+{
+       // Low nine texpage bits replace the low nine bits of GPU_GP1.
+       gpu_senquack.GPU_GP1 = (gpu_senquack.GPU_GP1 & ~0x1FF) | (tpage & 0x1FF);
+
+       // Drop the window-offset bits that are covered by the window mask.
+       gpu_senquack.TextureWindow[0] &= ~gpu_senquack.TextureWindow[2];
+       gpu_senquack.TextureWindow[1] &= ~gpu_senquack.TextureWindow[3];
+
+       // Colour depth field: 0: 4bpp     1: 8bpp     2/3: 16bpp
+       // Nocash PSX docs state setting of 3 is same as setting of 2 (16bpp):
+       // Note: DrHell assumes 3 is same as 0.. TODO: verify which is correct?
+       u32 depth = (tpage >> 7) & 3;
+       if (depth == 3)
+               depth = 2;
+
+       // Vertical origin: page Y bit (bit 4 -> 256) plus the window Y offset.
+       u32 originY = (tpage & 0x10) << 4;
+       originY += gpu_senquack.TextureWindow[1];
+
+       // Horizontal origin: page X (bits 0..3, 64 units each) plus the window
+       // X offset scaled down by (2 - depth) bits.
+       u32 originX = (tpage & 0x0F) << 6;
+       originX += (gpu_senquack.TextureWindow[0] >> (2 - depth));
+
+       gpu_senquack.BLEND_MODE  = ((tpage>>5) & 3) << 3;
+       // gpu_senquack.TEXT_MODE should be values 1..3, so add one
+       gpu_senquack.TEXT_MODE   = (depth + 1) << 5;
+       gpu_senquack.TBA = &((u16*)gpu_senquack.vram)[FRAME_OFFSET(originX, originY)];
+}
+
+///////////////////////////////////////////////////////////////////////////////
+// Points CBA at the colour table selected by `clut`: its low 15 bits, times
+// 16, give the halfword index into vram.
+INLINE void gpuSetCLUT(u16 clut)
+{
+       u16* const vram16 = (u16*)gpu_senquack.vram;
+       gpu_senquack.CBA = vram16 + ((clut & 0x7FFF) << 4);
+}
+
+// NULL_GPU(): expands to a bare `break` when ENABLE_GPU_NULL_SUPPORT is
+// defined and to nothing otherwise. In gpuSendPacketFunction it is placed
+// ahead of the draw call inside each switch case.
+#ifdef  ENABLE_GPU_NULL_SUPPORT
+#define NULL_GPU() break
+#else
+#define NULL_GPU()
+#endif
+
+// DO_LOG(expr): `expr` is a complete, parenthesised printf argument list,
+// e.g. DO_LOG(("x=%d\n", x)). Without ENABLE_GPU_LOG_SUPPORT it expands to
+// an empty block.
+#ifdef  ENABLE_GPU_LOG_SUPPORT
+#define DO_LOG(expr) printf expr
+#else
+#define DO_LOG(expr) {}
+#endif
+
+// The macros below read a variable named PRIM from the scope where they are
+// expanded.
+// Blending:      PRIM bit 1 (value 2) if that bit is set and BlendingEnabled(), else 0.
+// Blending_Mode: gpu_senquack.BLEND_MODE under the same condition, else 0.
+// Lighting:      non-zero when PRIM bit 0 is clear and LightingEnabled().
+#define Blending      (((PRIM&0x2) && BlendingEnabled()) ? (PRIM&0x2) : 0)
+#define Blending_Mode (((PRIM&0x2) && BlendingEnabled()) ? gpu_senquack.BLEND_MODE : 0)
+#define Lighting      (((~PRIM)&0x1) && LightingEnabled())
+// Dithering applies only to Gouraud-shaded polys or texture-blended polys:
+// (PRIM bit 0 clear, or PRIM bit 4 set) and DitheringEnabled(). The result is
+// 1<<9 when ForcedDitheringEnabled(), otherwise bit 9 of GPU_GP1; 0 when the
+// condition does not hold.
+#define Dithering     (((((~PRIM)&0x1) || (PRIM&0x10)) && DitheringEnabled()) ?            \
+                       (ForcedDitheringEnabled() ? (1<<9) : (gpu_senquack.GPU_GP1 & (1 << 9))) \
+                       : 0)
+
+///////////////////////////////////////////////////////////////////////////////
+//Now handled by Rearmed's gpulib and gpu_senquack/gpulib_if.cpp:
+///////////////////////////////////////////////////////////////////////////////
+#ifndef USE_GPULIB
+
+// Handles GP0 draw settings commands 0xE1...0xE6
+//   gpu_senquack - GPU state updated in place
+//   cmd_word     - full 32-bit command word; bits 24..26 select the setting
+// Sub-commands 0 and 7 are ignored.
+static void gpuGP0Cmd_0xEx(gpu_senquack_t &gpu_senquack, u32 cmd_word)
+{
+       // Assume incoming GP0 command is 0xE1..0xE6, convert to 1..6
+       u8 num = (cmd_word >> 24) & 7;
+       switch (num) {
+               case 1: {
+                       // GP0(E1h) - Draw Mode setting (aka "Texpage")
+                       DO_LOG(("GP0(0xE1) DrawMode TexPage(0x%x)\n", cmd_word));
+                       u32 cur_texpage = gpu_senquack.GPU_GP1 & 0x7FF;
+                       u32 new_texpage = cmd_word & 0x7FF;
+                       // Only redo the texture setup when the low 11 bits actually change.
+                       if (cur_texpage != new_texpage) {
+                               gpu_senquack.GPU_GP1 = (gpu_senquack.GPU_GP1 & ~0x7FF) | new_texpage;
+                               gpuSetTexture(gpu_senquack.GPU_GP1);
+                       }
+               } break;
+
+               case 2: {
+                       // GP0(E2h) - Texture Window setting
+                       DO_LOG(("GP0(0xE2) TextureWindow(0x%x)\n", cmd_word));
+                       if (cmd_word != gpu_senquack.TextureWindowCur) {
+                               // Maps a 5-bit mask field of the command to the mask stored
+                               // in TextureWindow[2]/[3].
+                               static const u8 TextureMask[32] = {
+                                       255, 7, 15, 7, 31, 7, 15, 7, 63, 7, 15, 7, 31, 7, 15, 7,
+                                       127, 7, 15, 7, 31, 7, 15, 7, 63, 7, 15, 7, 31, 7, 15, 7
+                               };
+                               gpu_senquack.TextureWindowCur = cmd_word;
+                               // [0]/[1]: X/Y offset fields (bits 10..14 / 15..19), times 8.
+                               gpu_senquack.TextureWindow[0] = ((cmd_word >> 10) & 0x1F) << 3;
+                               gpu_senquack.TextureWindow[1] = ((cmd_word >> 15) & 0x1F) << 3;
+                               // [2]/[3]: X/Y masks looked up from bits 0..4 / 5..9.
+                               gpu_senquack.TextureWindow[2] = TextureMask[(cmd_word >> 0) & 0x1F];
+                               gpu_senquack.TextureWindow[3] = TextureMask[(cmd_word >> 5) & 0x1F];
+                               gpu_senquack.TextureWindow[0] &= ~gpu_senquack.TextureWindow[2];
+                               gpu_senquack.TextureWindow[1] &= ~gpu_senquack.TextureWindow[3];
+
+                               // Inner loop vars must be updated whenever texture window is changed:
+                               const u32 fb = FIXED_BITS;  // # of fractional fixed-pt bits of u4/v4
+                               gpu_senquack.u_msk = (((u32)gpu_senquack.TextureWindow[2]) << fb) | ((1 << fb) - 1);
+                               gpu_senquack.v_msk = (((u32)gpu_senquack.TextureWindow[3]) << fb) | ((1 << fb) - 1);
+
+                               gpuSetTexture(gpu_senquack.GPU_GP1);
+                       }
+               } break;
+
+               case 3: {
+                       // GP0(E3h) - Set Drawing Area top left (X1,Y1)
+                       DO_LOG(("GP0(0xE3) DrawingArea Pos(0x%x)\n", cmd_word));
+                       gpu_senquack.DrawingArea[0] = cmd_word         & 0x3FF;
+                       gpu_senquack.DrawingArea[1] = (cmd_word >> 10) & 0x3FF;
+               } break;
+
+               case 4: {
+                       // GP0(E4h) - Set Drawing Area bottom right (X2,Y2)
+                       // Stored as one past the given coordinate.
+                       DO_LOG(("GP0(0xE4) DrawingArea Size(0x%x)\n", cmd_word));
+                       gpu_senquack.DrawingArea[2] = (cmd_word         & 0x3FF) + 1;
+                       gpu_senquack.DrawingArea[3] = ((cmd_word >> 10) & 0x3FF) + 1;
+               } break;
+
+               case 5: {
+                       // GP0(E5h) - Set Drawing Offset (X,Y)
+                       // X is the signed 11-bit field at bits 0..10, Y the one at
+                       // bits 11..21. Each is moved to the top of the word and
+                       // shifted back down to sign-extend it. The left shift is done
+                       // on the unsigned value: cmd_word has its top bit set, and
+                       // left-shifting a negative s32 is undefined behaviour.
+                       DO_LOG(("GP0(0xE5) DrawingOffset(0x%x)\n", cmd_word));
+                       gpu_senquack.DrawingOffset[0] = ((s32)(cmd_word<<(32-11)))>>(32-11);
+                       gpu_senquack.DrawingOffset[1] = ((s32)(cmd_word<<(32-22)))>>(32-11);
+               } break;
+
+               case 6: {
+                       // GP0(E6h) - Mask Bit Setting
+                       // Command bit 1 -> Masking (value 4), bit 0 -> PixelMSB (value 0x100).
+                       DO_LOG(("GP0(0xE6) SetMask(0x%x)\n", cmd_word));
+                       gpu_senquack.Masking  = (cmd_word & 0x2) <<  1;
+                       gpu_senquack.PixelMSB = (cmd_word & 0x1) <<  8;
+               } break;
+
+               default:
+                       // 0 and 7 are not draw-setting commands handled here.
+                       break;
+       }
+}
+
+void gpuSendPacketFunction(const int PRIM)
+{
+       //printf("0x%x\n",PRIM);
+
+       //senquack - TODO: optimize this (packet pointer union as prim draw parameter
+       // introduced as optimization for gpulib command-list processing)
+       PtrUnion packet = { .ptr = (void*)&gpu_senquack.PacketBuffer };
+
+       switch (PRIM)
+       {
+               case 0x02: {
+                       NULL_GPU();
+                       gpuClearImage(packet);    //  prim handles updateLace && skip
+                       gpu_senquack.fb_dirty = true;
+                       DO_LOG(("gpuClearImage(0x%x)\n",PRIM));
+               } break;
+
+               case 0x20:
+               case 0x21:
+               case 0x22:
+               case 0x23: {          // Monochrome 3-pt poly
+                       if (!gpu_senquack.frameskip.skipGPU)
+                       {
+                               NULL_GPU();
+                               PP driver = gpuPolySpanDrivers[
+                                       (gpu_senquack.blit_mask?1024:0) |
+                                       Blending_Mode |
+                                       gpu_senquack.Masking | Blending | gpu_senquack.PixelMSB
+                               ];
+                               gpuDrawPolyF(packet, driver, false);
+                               gpu_senquack.fb_dirty = true;
+                               DO_LOG(("gpuDrawPolyF(0x%x)\n",PRIM));
+                       }
+               } break;
+
+               case 0x24:
+               case 0x25:
+               case 0x26:
+               case 0x27: {          // Textured 3-pt poly
+                       if (!gpu_senquack.frameskip.skipGPU)
+                       {
+                               NULL_GPU();
+                               gpuSetCLUT    (gpu_senquack.PacketBuffer.U4[2] >> 16);
+                               gpuSetTexture (gpu_senquack.PacketBuffer.U4[4] >> 16);
+
+                               u32 driver_idx =
+                                       (gpu_senquack.blit_mask?1024:0) |
+                                       Dithering |
+                                       Blending_Mode | gpu_senquack.TEXT_MODE |
+                                       gpu_senquack.Masking | Blending | gpu_senquack.PixelMSB;
+
+                               if (!FastLightingEnabled()) {
+                                       driver_idx |= Lighting;
+                               } else {
+                                       if (!((gpu_senquack.PacketBuffer.U1[0]>0x5F) && (gpu_senquack.PacketBuffer.U1[1]>0x5F) && (gpu_senquack.PacketBuffer.U1[2]>0x5F)))
+                                               driver_idx |= Lighting;
+                               }
+
+                               PP driver = gpuPolySpanDrivers[driver_idx];
+                               gpuDrawPolyFT(packet, driver, false);
+                               gpu_senquack.fb_dirty = true;
+                               DO_LOG(("gpuDrawPolyFT(0x%x)\n",PRIM));
+                       }
+               } break;
+
+               case 0x28:
+               case 0x29:
+               case 0x2A:
+               case 0x2B: {          // Monochrome 4-pt poly
+                       if (!gpu_senquack.frameskip.skipGPU)
+                       {
+                               NULL_GPU();
+                               PP driver = gpuPolySpanDrivers[
+                                       (gpu_senquack.blit_mask?1024:0) |
+                                       Blending_Mode |
+                                       gpu_senquack.Masking | Blending | gpu_senquack.PixelMSB
+                               ];
+                               gpuDrawPolyF(packet, driver, true); // is_quad = true
+                               gpu_senquack.fb_dirty = true;
+                               DO_LOG(("gpuDrawPolyF(0x%x) (4-pt QUAD)\n",PRIM));
+                       }
+               } break;
+
+               case 0x2C:
+               case 0x2D:
+               case 0x2E:
+               case 0x2F: {          // Textured 4-pt poly
+                       if (!gpu_senquack.frameskip.skipGPU)
+                       {
+                               NULL_GPU();
+                               gpuSetCLUT    (gpu_senquack.PacketBuffer.U4[2] >> 16);
+                               gpuSetTexture (gpu_senquack.PacketBuffer.U4[4] >> 16);
+
+                               u32 driver_idx =
+                                       (gpu_senquack.blit_mask?1024:0) |
+                                       Dithering |
+                                       Blending_Mode | gpu_senquack.TEXT_MODE |
+                                       gpu_senquack.Masking | Blending | gpu_senquack.PixelMSB;
+
+                               if (!FastLightingEnabled()) {
+                                       driver_idx |= Lighting;
+                               } else {
+                                       if (!((gpu_senquack.PacketBuffer.U1[0]>0x5F) && (gpu_senquack.PacketBuffer.U1[1]>0x5F) && (gpu_senquack.PacketBuffer.U1[2]>0x5F)))
+                                               driver_idx |= Lighting;
+                               }
+
+                               PP driver = gpuPolySpanDrivers[driver_idx];
+                               gpuDrawPolyFT(packet, driver, true); // is_quad = true
+                               gpu_senquack.fb_dirty = true;
+                               DO_LOG(("gpuDrawPolyFT(0x%x) (4-pt QUAD)\n",PRIM));
+                       }
+               } break;
+
+               case 0x30:
+               case 0x31:
+               case 0x32:
+               case 0x33: {          // Gouraud-shaded 3-pt poly
+                       if (!gpu_senquack.frameskip.skipGPU)
+                       {
+                               NULL_GPU();
+                               //NOTE: The '129' here is CF_GOURAUD | CF_LIGHT, however
+                               // this is an untextured poly, so CF_LIGHT (texture blend)
+                               // shouldn't apply. Until the original array of template
+                               // instantiation ptrs is fixed, we're stuck with this. (TODO)
+                               PP driver = gpuPolySpanDrivers[
+                                       (gpu_senquack.blit_mask?1024:0) |
+                                       Dithering |
+                                       Blending_Mode |
+                                       gpu_senquack.Masking | Blending | 129 | gpu_senquack.PixelMSB
+                               ];
+                               gpuDrawPolyG(packet, driver, false);
+                               gpu_senquack.fb_dirty = true;
+                               DO_LOG(("gpuDrawPolyG(0x%x)\n",PRIM));
+                       }
+               } break;
+
+               case 0x34:
+               case 0x35:
+               case 0x36:
+               case 0x37: {          // Gouraud-shaded, textured 3-pt poly
+                       if (!gpu_senquack.frameskip.skipGPU)
+                       {
+                               NULL_GPU();
+                               gpuSetCLUT    (gpu_senquack.PacketBuffer.U4[2] >> 16);
+                               gpuSetTexture (gpu_senquack.PacketBuffer.U4[5] >> 16);
+                               PP driver = gpuPolySpanDrivers[
+                                       (gpu_senquack.blit_mask?1024:0) |
+                                       Dithering |
+                                       Blending_Mode | gpu_senquack.TEXT_MODE |
+                                       gpu_senquack.Masking | Blending | ((Lighting)?129:0) | gpu_senquack.PixelMSB
+                               ];
+                               gpuDrawPolyGT(packet, driver, false);
+                               gpu_senquack.fb_dirty = true;
+                               DO_LOG(("gpuDrawPolyGT(0x%x)\n",PRIM));
+                       }
+               } break;
+
+               case 0x38:
+               case 0x39:
+               case 0x3A:
+               case 0x3B: {          // Gouraud-shaded 4-pt poly
+                       if (!gpu_senquack.frameskip.skipGPU)
+                       {
+                               NULL_GPU();
+                               // See notes regarding '129' for 0x30..0x33 further above -senquack
+                               PP driver = gpuPolySpanDrivers[
+                                       (gpu_senquack.blit_mask?1024:0) |
+                                       Dithering |
+                                       Blending_Mode |
+                                       gpu_senquack.Masking | Blending | 129 | gpu_senquack.PixelMSB
+                               ];
+                               gpuDrawPolyG(packet, driver, true); // is_quad = true
+                               gpu_senquack.fb_dirty = true;
+                               DO_LOG(("gpuDrawPolyG(0x%x) (4-pt QUAD)\n",PRIM));
+                       }
+               } break;
+
+               case 0x3C:
+               case 0x3D:
+               case 0x3E:
+               case 0x3F: {          // Gouraud-shaded, textured 4-pt poly
+                       if (!gpu_senquack.frameskip.skipGPU)
+                       {
+                               NULL_GPU();
+                               gpuSetCLUT    (gpu_senquack.PacketBuffer.U4[2] >> 16);
+                               gpuSetTexture (gpu_senquack.PacketBuffer.U4[5] >> 16);
+                               PP driver = gpuPolySpanDrivers[
+                                       (gpu_senquack.blit_mask?1024:0) |
+                                       Dithering |
+                                       Blending_Mode | gpu_senquack.TEXT_MODE |
+                                       gpu_senquack.Masking | Blending | ((Lighting)?129:0) | gpu_senquack.PixelMSB
+                               ];
+                               gpuDrawPolyGT(packet, driver, true); // is_quad = true
+                               gpu_senquack.fb_dirty = true;
+                               DO_LOG(("gpuDrawPolyGT(0x%x) (4-pt QUAD)\n",PRIM));
+                       }
+               } break;
+
+               case 0x40:
+               case 0x41:
+               case 0x42:
+               case 0x43: {          // Monochrome line
+                       if (!gpu_senquack.frameskip.skipGPU)
+                       {
+                               NULL_GPU();
+                               // Shift index right by one, as untextured prims don't use lighting
+                               u32 driver_idx = (Blending_Mode | gpu_senquack.Masking | Blending | (gpu_senquack.PixelMSB>>3)) >> 1;
+                               PSD driver = gpuPixelSpanDrivers[driver_idx];
+                               gpuDrawLineF(packet, driver);
+                               gpu_senquack.fb_dirty = true;
+                               DO_LOG(("gpuDrawLineF(0x%x)\n",PRIM));
+                       }
+               } break;
+
+               case 0x48:
+               case 0x49:
+               case 0x4A:
+               case 0x4B:
+               case 0x4C:
+               case 0x4D:
+               case 0x4E:
+               case 0x4F: { // Monochrome line strip
+                       if (!gpu_senquack.frameskip.skipGPU)
+                       {
+                               NULL_GPU();
+                               // Shift index right by one, as untextured prims don't use lighting
+                               u32 driver_idx = (Blending_Mode | gpu_senquack.Masking | Blending | (gpu_senquack.PixelMSB>>3)) >> 1;
+                               PSD driver = gpuPixelSpanDrivers[driver_idx];
+                               gpuDrawLineF(packet, driver);
+                               gpu_senquack.fb_dirty = true;
+                               DO_LOG(("gpuDrawLineF(0x%x)\n",PRIM));
+                       }
+                       if ((gpu_senquack.PacketBuffer.U4[3] & 0xF000F000) != 0x50005000)
+                       {
+                               gpu_senquack.PacketBuffer.U4[1] = gpu_senquack.PacketBuffer.U4[2];
+                               gpu_senquack.PacketBuffer.U4[2] = gpu_senquack.PacketBuffer.U4[3];
+                               gpu_senquack.PacketCount = 1;
+                               gpu_senquack.PacketIndex = 3;
+                       }
+               } break;
+
+               case 0x50:
+               case 0x51:
+               case 0x52:
+               case 0x53: {          // Gouraud-shaded line
+                       if (!gpu_senquack.frameskip.skipGPU)
+                       {
+                               NULL_GPU();
+                               // Shift index right by one, as untextured prims don't use lighting
+                               u32 driver_idx = (Blending_Mode | gpu_senquack.Masking | Blending | (gpu_senquack.PixelMSB>>3)) >> 1;
+                               // Index MSB selects Gouraud-shaded PixelSpanDriver:
+                               driver_idx |= (1 << 5);
+                               PSD driver = gpuPixelSpanDrivers[driver_idx];
+                               gpuDrawLineG(packet, driver);
+                               gpu_senquack.fb_dirty = true;
+                               DO_LOG(("gpuDrawLineG(0x%x)\n",PRIM));
+                       }
+               } break;
+
+               case 0x58:
+               case 0x59:
+               case 0x5A:
+               case 0x5B:
+               case 0x5C:
+               case 0x5D:
+               case 0x5E:
+               case 0x5F: { // Gouraud-shaded line strip
+                       if (!gpu_senquack.frameskip.skipGPU)
+                       {
+                               NULL_GPU();
+                               // Shift index right by one, as untextured prims don't use lighting
+                               u32 driver_idx = (Blending_Mode | gpu_senquack.Masking | Blending | (gpu_senquack.PixelMSB>>3)) >> 1;
+                               // Index MSB selects Gouraud-shaded PixelSpanDriver:
+                               driver_idx |= (1 << 5);
+                               PSD driver = gpuPixelSpanDrivers[driver_idx];
+                               gpuDrawLineG(packet, driver);
+                               gpu_senquack.fb_dirty = true;
+                               DO_LOG(("gpuDrawLineG(0x%x)\n",PRIM));
+                       }
+                       if ((gpu_senquack.PacketBuffer.U4[4] & 0xF000F000) != 0x50005000)
+                       {
+                               gpu_senquack.PacketBuffer.U1[3 + (2 * 4)] = gpu_senquack.PacketBuffer.U1[3 + (0 * 4)];
+                               gpu_senquack.PacketBuffer.U4[0] = gpu_senquack.PacketBuffer.U4[2];
+                               gpu_senquack.PacketBuffer.U4[1] = gpu_senquack.PacketBuffer.U4[3];
+                               gpu_senquack.PacketBuffer.U4[2] = gpu_senquack.PacketBuffer.U4[4];
+                               gpu_senquack.PacketCount = 2;
+                               gpu_senquack.PacketIndex = 3;
+                       }
+               } break;
+
+               case 0x60:
+               case 0x61:
+               case 0x62:
+               case 0x63: {          // Monochrome rectangle (variable size)
+                       if (!gpu_senquack.frameskip.skipGPU)
+                       {
+                               NULL_GPU();
+                               PT driver = gpuTileSpanDrivers[(Blending_Mode | gpu_senquack.Masking | Blending | (gpu_senquack.PixelMSB>>3)) >> 1];
+                               gpuDrawT(packet, driver);
+                               gpu_senquack.fb_dirty = true;
+                               DO_LOG(("gpuDrawT(0x%x)\n",PRIM));
+                       }
+               } break;
+
+               case 0x64:
+               case 0x65:
+               case 0x66:
+               case 0x67: {          // Textured rectangle (variable size)
+                       if (!gpu_senquack.frameskip.skipGPU)
+                       {
+                               NULL_GPU();
+                               gpuSetCLUT    (gpu_senquack.PacketBuffer.U4[2] >> 16);
+                               u32 driver_idx = Blending_Mode | gpu_senquack.TEXT_MODE | gpu_senquack.Masking | Blending | (gpu_senquack.PixelMSB>>1);
+
+                               // This fixes Silent Hill running animation on loading screens:
+                               // (On PSX, color values 0x00-0x7F darken the source texture's color,
+                               //  0x81-FF lighten textures (ultimately clamped to 0x1F),
+                               //  0x80 leaves source texture color unchanged, HOWEVER,
+                               //   gpu_senquack uses a simple lighting LUT whereby only the upper
+                               //   5 bits of an 8-bit color are used, so 0x80-0x87 all behave as
+                               //   0x80.
+                               // 
+                               // NOTE: I've changed all textured sprite draw commands here and
+                               //  elsewhere to use proper behavior, but left poly commands
+                               //  alone, I don't want to slow rendering down too much. (TODO)
+                               //if ((gpu_senquack.PacketBuffer.U1[0]>0x5F) && (gpu_senquack.PacketBuffer.U1[1]>0x5F) && (gpu_senquack.PacketBuffer.U1[2]>0x5F))
+                               // Strip lower 3 bits of each color and determine if lighting should be used:
+                               if ((gpu_senquack.PacketBuffer.U4[0] & 0xF8F8F8) != 0x808080)
+                                       driver_idx |= Lighting;
+                               PS driver = gpuSpriteSpanDrivers[driver_idx];
+                               gpuDrawS(packet, driver);
+                               gpu_senquack.fb_dirty = true;
+                               DO_LOG(("gpuDrawS(0x%x)\n",PRIM));
+                       }
+               } break;
+
+               case 0x68:
+               case 0x69:
+               case 0x6A:
+               case 0x6B: {          // Monochrome rectangle (1x1 dot)
+                       if (!gpu_senquack.frameskip.skipGPU)
+                       {
+                               NULL_GPU();
+                               gpu_senquack.PacketBuffer.U4[2] = 0x00010001;
+                               PT driver = gpuTileSpanDrivers[(Blending_Mode | gpu_senquack.Masking | Blending | (gpu_senquack.PixelMSB>>3)) >> 1];
+                               gpuDrawT(packet, driver);
+                               gpu_senquack.fb_dirty = true;
+                               DO_LOG(("gpuDrawT(0x%x)\n",PRIM));
+                       }
+               } break;
+
+               case 0x70:
+               case 0x71:
+               case 0x72:
+               case 0x73: {          // Monochrome rectangle (8x8)
+                       if (!gpu_senquack.frameskip.skipGPU)
+                       {
+                               NULL_GPU();
+                               gpu_senquack.PacketBuffer.U4[2] = 0x00080008;
+                               PT driver = gpuTileSpanDrivers[(Blending_Mode | gpu_senquack.Masking | Blending | (gpu_senquack.PixelMSB>>3)) >> 1];
+                               gpuDrawT(packet, driver);
+                               gpu_senquack.fb_dirty = true;
+                               DO_LOG(("gpuDrawT(0x%x)\n",PRIM));
+                       }
+               } break;
+
+               case 0x74:
+               case 0x75:
+               case 0x76:
+               case 0x77: {          // Textured rectangle (8x8)
+                       if (!gpu_senquack.frameskip.skipGPU)
+                       {
+                               NULL_GPU();
+                               gpu_senquack.PacketBuffer.U4[3] = 0x00080008;
+                               gpuSetCLUT    (gpu_senquack.PacketBuffer.U4[2] >> 16);
+                               u32 driver_idx = Blending_Mode | gpu_senquack.TEXT_MODE | gpu_senquack.Masking | Blending | (gpu_senquack.PixelMSB>>1);
+
+                               //senquack - Only color 808080h-878787h allows skipping lighting calculation:
+                               //if ((gpu_senquack.PacketBuffer.U1[0]>0x5F) && (gpu_senquack.PacketBuffer.U1[1]>0x5F) && (gpu_senquack.PacketBuffer.U1[2]>0x5F))
+                               // Strip lower 3 bits of each color and determine if lighting should be used:
+                               if ((gpu_senquack.PacketBuffer.U4[0] & 0xF8F8F8) != 0x808080)
+                                       driver_idx |= Lighting;
+                               PS driver = gpuSpriteSpanDrivers[driver_idx];
+                               gpuDrawS(packet, driver);
+                               gpu_senquack.fb_dirty = true;
+                               DO_LOG(("gpuDrawS(0x%x)\n",PRIM));
+                       }
+               } break;
+
+               case 0x78:
+               case 0x79:
+               case 0x7A:
+               case 0x7B: {          // Monochrome rectangle (16x16)
+                       if (!gpu_senquack.frameskip.skipGPU)
+                       {
+                               NULL_GPU();
+                               gpu_senquack.PacketBuffer.U4[2] = 0x00100010;
+                               PT driver = gpuTileSpanDrivers[(Blending_Mode | gpu_senquack.Masking | Blending | (gpu_senquack.PixelMSB>>3)) >> 1];
+                               gpuDrawT(packet, driver);
+                               gpu_senquack.fb_dirty = true;
+                               DO_LOG(("gpuDrawT(0x%x)\n",PRIM));
+                       }
+               } break;
+
+               case 0x7C:
+               case 0x7D:
+                       #ifdef __arm__
+                       /* Notaz 4bit sprites optimization */
+                       if ((!gpu_senquack.frameskip.skipGPU) && (!(gpu_senquack.GPU_GP1&0x180)) && (!(gpu_senquack.Masking|gpu_senquack.PixelMSB)))
+                       {
+                               gpuSetCLUT    (gpu_senquack.PacketBuffer.U4[2] >> 16);
+                               gpuDrawS16(packet);
+                               gpu_senquack.fb_dirty = true;
+                               break;
+                       }
+                       #endif
+               case 0x7E:
+               case 0x7F: {          // Textured rectangle (16x16)
+                       if (!gpu_senquack.frameskip.skipGPU)
+                       {
+                               NULL_GPU();
+                               gpu_senquack.PacketBuffer.U4[3] = 0x00100010;
+                               gpuSetCLUT    (gpu_senquack.PacketBuffer.U4[2] >> 16);
+                               u32 driver_idx = Blending_Mode | gpu_senquack.TEXT_MODE | gpu_senquack.Masking | Blending | (gpu_senquack.PixelMSB>>1);
+
+                               //senquack - Only color 808080h-878787h allows skipping lighting calculation:
+                               //if ((gpu_senquack.PacketBuffer.U1[0]>0x5F) && (gpu_senquack.PacketBuffer.U1[1]>0x5F) && (gpu_senquack.PacketBuffer.U1[2]>0x5F))
+                               // Strip lower 3 bits of each color and determine if lighting should be used:
+                               if ((gpu_senquack.PacketBuffer.U4[0] & 0xF8F8F8) != 0x808080)
+                                       driver_idx |= Lighting;
+                               PS driver = gpuSpriteSpanDrivers[driver_idx];
+                               gpuDrawS(packet, driver);
+                               gpu_senquack.fb_dirty = true;
+                               DO_LOG(("gpuDrawS(0x%x)\n",PRIM));
+                       }
+               } break;
+
+               case 0x80:          //  vid -> vid
+                       gpuMoveImage(packet);   //  prim handles updateLace && skip
+                       if ((!gpu_senquack.frameskip.skipCount) && (gpu_senquack.DisplayArea[3] == 480)) // Tekken 3 hack
+                       {
+                               if (!gpu_senquack.frameskip.skipGPU) gpu_senquack.fb_dirty = true;
+                       }
+                       else
+                       {
+                               gpu_senquack.fb_dirty = true;
+                       }
+                       DO_LOG(("gpuMoveImage(0x%x)\n",PRIM));
+                       break;
+               case 0xA0:          //  sys ->vid
+                       gpuLoadImage(packet);   //  prim handles updateLace && skip
+                       DO_LOG(("gpuLoadImage(0x%x)\n",PRIM));
+                       break;
+               case 0xC0:          //  vid -> sys
+                       gpuStoreImage(packet);  //  prim handles updateLace && skip
+                       DO_LOG(("gpuStoreImage(0x%x)\n",PRIM));
+                       break;
+               case 0xE1 ... 0xE6: { // Draw settings
+                       gpuGP0Cmd_0xEx(gpu_senquack, gpu_senquack.PacketBuffer.U4[0]);
+               } break;
+       }
+}
+#endif //!USE_GPULIB
+///////////////////////////////////////////////////////////////////////////////
+// End of code specific to non-gpulib standalone version of gpu_senquack
+///////////////////////////////////////////////////////////////////////////////
+
+#endif /* __GPU_UNAI_GPU_COMMAND_H__ */
diff --git a/plugins/gpu_senquack/gpu_fixedpoint.h b/plugins/gpu_senquack/gpu_fixedpoint.h

new file mode 100644 (file)

index 0000000..5df42cf
--- /dev/null
+++ b/plugins/gpu_senquack/gpu_fixedpoint.h
@@ -0,0 +1,134 @@
+/***************************************************************************
+ *   Copyright (C) 2010 PCSX4ALL Team                                      *
+ *   Copyright (C) 2010 Unai                                               *
+ *                                                                         *
+ *   This program is free software; you can redistribute it and/or modify  *
+ *   it under the terms of the GNU General Public License as published by  *
+ *   the Free Software Foundation; either version 2 of the License, or     *
+ *   (at your option) any later version.                                   *
+ *                                                                         *
+ *   This program is distributed in the hope that it will be useful,       *
+ *   but WITHOUT ANY WARRANTY; without even the implied warranty of        *
+ *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the         *
+ *   GNU General Public License for more details.                          *
+ *                                                                         *
+ *   You should have received a copy of the GNU General Public License     *
+ *   along with this program; if not, write to the                         *
+ *   Free Software Foundation, Inc.,                                       *
+ *   51 Franklin Street, Fifth Floor, Boston, MA 02111-1307 USA.           *
+ ***************************************************************************/
+
+#ifndef FIXED_H
+#define FIXED_H
+
+typedef s32 fixed;
+
+//senquack - The gpu_drhell poly routines I adapted use 22.10 fixed point,
+//           while original Unai used 16.16: (see README_senquack.txt)
+//#define FIXED_BITS 16
+#define FIXED_BITS 10
+
+#define fixed_ZERO ((fixed)0)                      // 0.0 in fixed point
+#define fixed_ONE  ((fixed)1<<FIXED_BITS)          // 1.0 in fixed point
+#define fixed_TWO  ((fixed)2<<FIXED_BITS)          // 2.0 in fixed point
+#define fixed_HALF ((fixed)((1<<FIXED_BITS)>>1))   // 0.5 in fixed point
+
+#define fixed_LOMASK ((fixed)((1<<FIXED_BITS)-1))  // selects the FIXED_BITS fractional bits
+#define fixed_HIMASK ((fixed)(~fixed_LOMASK))      // selects everything above the fractional bits
+
+// int<->fixed conversions (plain shifts by FIXED_BITS; x2i discards the fractional bits):
+#define i2x(x) ((x)<<FIXED_BITS)
+#define x2i(x) ((x)>>FIXED_BITS)
+
+INLINE fixed FixedCeil(const fixed x)
+{      // Ceiling in fixed point: bump by the largest fraction, then clear the fraction bits.
+       return (x + fixed_LOMASK) & fixed_HIMASK;
+}
+
+INLINE s32 FixedCeilToInt(const fixed x)
+{      // Same rounding as FixedCeil, but the result comes back as a plain integer.
+       return x2i(x + fixed_LOMASK);
+}
+
+//senquack - float<->fixed conversions (scale by 2^FIXED_BITS; f2x drops any remaining fraction in the s32 cast):
+#define f2x(x) ((s32)((x) * (float)(1<<FIXED_BITS)))
+#define x2f(x) ((float)(x) / (float)(1<<FIXED_BITS))
+
+//senquack - floating point reciprocal:
+//NOTE: These assume x is always != 0 !!!
+#ifdef GPU_UNAI_USE_FLOATMATH
+#if defined(_MIPS_ARCH_MIPS32R2) || (__mips == 64)
+INLINE float FloatInv(const float x)
+{      // MIPS32R2 / MIPS64 build: reciprocal of x via the FPU's recip.s instruction (x must be non-zero).
+       float res;
+       asm("recip.s %0,%1" : "=f" (res) : "f" (x));  // NOTE(review): recip.s may round differently from 1.0f / x -- confirm
+       return res;
+}
+#else
+INLINE float FloatInv(const float x)
+{      // Portable fallback: an ordinary single-precision divide (x must be non-zero).
+       return 1.0f / x;
+}
+#endif
+#endif
+
+///////////////////////////////////////////////////////////////////////////
+// --- BEGIN INVERSE APPROXIMATION SECTION ---
+///////////////////////////////////////////////////////////////////////////
+#ifdef GPU_UNAI_USE_INT_DIV_MULTINV
+
+//  big precision inverse table: 2^TABLE_BITS entries, read by xInv(); not filled in by this header.
+#define TABLE_BITS 16
+s32 s_invTable[(1<<TABLE_BITS)];  // NOTE(review): a definition, not an extern declaration -- presumably this header is compiled into one translation unit only; confirm
+
+//senquack - MIPS32 happens to have same instruction/format:
+#if defined(__arm__) || (__mips == 32)
+INLINE u32 Log2(u32 x) { u32 res; asm("clz %0,%1" : "=r" (res) : "r" (x)); return 32-res; } // 32 - clz(x), i.e. the bit length of x: one more than the portable Log2 in the #else branch for x > 0
+#else
+INLINE u32 Log2(u32 x) { u32 bits = 0; while (x > 0) { x >>= 1; ++bits; } return bits - 1; } // index of the highest set bit; the subtraction wraps around (unsigned) for x == 0
+#endif
+
+INLINE  void  xInv (const fixed _b, s32& iFactor_, s32& iShift_)
+{
+  u32 uD = (_b<0) ? -_b : _b;
+  if(uD>1)
+  {
+       u32 uLog = Log2(uD);
+    uLog = uLog>(TABLE_BITS-1) ? uLog-(TABLE_BITS-1) : 0;
+    u32 uDen = (uD>>uLog);
+    iFactor_ = s_invTable[uDen];
+    iFactor_ = (_b<0) ? -iFactor_ :iFactor_;
+    //senquack - Adapted to 22.10 fixed point (originally 16.16):
+    //iShift_  = 15+uLog;
+    iShift_  = 21+uLog;
+  }
+  else
+  {
+    iFactor_=_b;
+    iShift_ = 0;
+  }
+}
+
+INLINE  fixed xInvMulx  (const fixed _a, const s32 _iFact, const s32 _iShift)
+{      // Multiplies _a by _iFact in 64 bits, shifts the product right by _iShift, and narrows the result to fixed.
+       #ifdef __arm__
+               s64 res;
+               asm ("smull %Q0, %R0, %1, %2" : "=&r" (res) : "r"(_a) , "r"(_iFact));  // signed 32x32 -> 64-bit product; %Q0/%R0 are the low/high words of res
+               return fixed(res>>_iShift);
+       #else
+               return fixed( ((s64)(_a)*(s64)(_iFact))>>(_iShift) );  // same computation written portably
+       #endif
+}
+
+INLINE  fixed xLoDivx   (const fixed _a, const fixed _b)
+{ // Division of _a by _b through the inverse-approximation helpers: fetch factor/shift for _b, apply them to _a.
+  s32 iMul, iShr;
+  xInv(_b, iMul, iShr);
+  return xInvMulx(_a, iMul, iShr);
+}
+#endif // GPU_UNAI_USE_INT_DIV_MULTINV
+///////////////////////////////////////////////////////////////////////////
+// --- END INVERSE APPROXIMATION SECTION ---
+///////////////////////////////////////////////////////////////////////////
+
+#endif  //FIXED_H
diff --git a/plugins/gpu_senquack/gpu_inner.h b/plugins/gpu_senquack/gpu_inner.h

new file mode 100644 (file)

index 0000000..8cb4bd5
--- /dev/null
+++ b/plugins/gpu_senquack/gpu_inner.h
@@ -0,0 +1,734 @@
+/***************************************************************************
+*   Copyright (C) 2010 PCSX4ALL Team                                      *
+*   Copyright (C) 2010 Unai                                               *
+*   Copyright (C) 2016 Senquack (dansilsby <AT> gmail <DOT> com)          *
+*                                                                         *
+*   This program is free software; you can redistribute it and/or modify  *
+*   it under the terms of the GNU General Public License as published by  *
+*   the Free Software Foundation; either version 2 of the License, or     *
+*   (at your option) any later version.                                   *
+*                                                                         *
+*   This program is distributed in the hope that it will be useful,       *
+*   but WITHOUT ANY WARRANTY; without even the implied warranty of        *
+*   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the         *
+*   GNU General Public License for more details.                          *
+*                                                                         *
+*   You should have received a copy of the GNU General Public License     *
+*   along with this program; if not, write to the                         *
+*   Free Software Foundation, Inc.,                                       *
+*   51 Franklin Street, Fifth Floor, Boston, MA 02111-1307 USA.           *
+***************************************************************************/
+
+#ifndef __GPU_UNAI_GPU_INNER_H__
+#define __GPU_UNAI_GPU_INNER_H__
+
+///////////////////////////////////////////////////////////////////////////////
+// Inner loop driver instantiation file
+
+///////////////////////////////////////////////////////////////////////////////
+//  Option Masks (CF template parameter)
+#define  CF_LIGHT     ((CF>> 0)&1) // Lighting
+#define  CF_BLEND     ((CF>> 1)&1) // Blending
+#define  CF_MASKCHECK ((CF>> 2)&1) // Mask bit check
+#define  CF_BLENDMODE ((CF>> 3)&3) // Blend mode   0..3
+#define  CF_TEXTMODE  ((CF>> 5)&3) // Texture mode 1..3 (0: texturing disabled)
+#define  CF_GOURAUD   ((CF>> 7)&1) // Gouraud shading
+#define  CF_MASKSET   ((CF>> 8)&1) // Mask bit set
+#define  CF_DITHER    ((CF>> 9)&1) // Dithering
+#define  CF_BLITMASK  ((CF>>10)&1) // blit_mask check (skip rendering pixels
+                                   //  that wouldn't end up displayed on
+                                   //  low-res screen using simple downscaler)
+
+//#ifdef __arm__
+//#ifndef ENABLE_GPU_ARMV7
+/* ARMv5 */
+//#include "gpu_inner_blend_arm5.h"
+//#else
+/* ARMv7 optimized */
+//#include "gpu_inner_blend_arm7.h"
+//#endif
+//#else
+//#include "gpu_inner_blend.h"
+//#endif
+
+#include "gpu_inner_blend.h"
+#include "gpu_inner_quantization.h"
+#include "gpu_inner_light.h"
+
+#ifdef __arm__
+#include "gpu_inner_blend_arm.h"
+#include "gpu_inner_light_arm.h"
+#define gpuBlending gpuBlendingARM
+#define gpuLightingRGB gpuLightingRGBARM
+#define gpuLightingTXT gpuLightingTXTARM
+#define gpuLightingTXTGouraud gpuLightingTXTGouraudARM
+// Non-dithering lighting and blending functions preserve uSrc
+// MSB. This saves a few operations and useless load/stores.
+#define MSB_PRESERVED (!CF_DITHER)
+#else
+#define gpuBlending gpuBlendingGeneric
+#define gpuLightingRGB gpuLightingRGBGeneric
+#define gpuLightingTXT gpuLightingTXTGeneric
+#define gpuLightingTXTGouraud gpuLightingTXTGouraudGeneric
+#define MSB_PRESERVED 0
+#endif
+
+
+// If defined, Gouraud colors are fixed-point 5.11, otherwise they are 8.16
+// This is only for debugging/verification of low-precision colors in C.
+// Low-precision Gouraud is intended for use by SIMD-optimized inner drivers
+// which get/use Gouraud colors in SIMD registers.
+//#define GPU_GOURAUD_LOW_PRECISION
+
+// How many bits of fixed-point precision GouraudColor uses
+#ifdef GPU_GOURAUD_LOW_PRECISION
+#define GPU_GOURAUD_FIXED_BITS 11
+#else
+#define GPU_GOURAUD_FIXED_BITS 16
+#endif
+
+// Used to pass Gouraud colors to gpuPixelSpanFn() (lines): current channel values plus their increments.
+struct GouraudColor {
+#ifdef GPU_GOURAUD_LOW_PRECISION
+       u16 r, g, b;                 // channel values carrying GPU_GOURAUD_FIXED_BITS (11) fractional bits
+       s16 r_incr, g_incr, b_incr;  // signed per-channel increments -- presumably added once per pixel; confirm in gpuPixelSpanFn
+#else
+       u32 r, g, b;                 // channel values carrying GPU_GOURAUD_FIXED_BITS (16) fractional bits
+       s32 r_incr, g_incr, b_incr;  // signed per-channel increments -- presumably added once per pixel; confirm in gpuPixelSpanFn
+#endif
+};
+
+static inline u16 gpuGouraudColor15bpp(u32 r, u32 g, u32 b)
+{
+#ifdef GPU_GOURAUD_LOW_PRECISION
+       // Low-precision channels: dropping the fraction is the only shift applied.
+       const u32 shift = GPU_GOURAUD_FIXED_BITS;
+#else
+       // High-precision channels are 8-bit + fractional: drop the fraction and the 3 low integer bits.
+       const u32 shift = GPU_GOURAUD_FIXED_BITS + 3;
+#endif
+       // Pack the reduced channels: r in the low bits, g from bit 5, b from bit 10.
+       const u32 r5 = r >> shift, g5 = g >> shift, b5 = b >> shift;
+       return r5 | (g5 << 5) | (b5 << 10);
+}
+
+///////////////////////////////////////////////////////////////////////////////
+//  GPU Pixel span operations generator gpuPixelSpanFn<>
+//  Oct 2016: Created/adapted from old gpuPixelFn by senquack:
+//  Original gpuPixelFn was used to draw lines one pixel at a time. I wrote
+//  new line algorithms that draw lines using horizontal/vertical/diagonal
+//  spans of pixels, necessitating new pixel-drawing function that could
+//  not only render spans of pixels, but gouraud-shade them as well.
+//  This speeds up line rendering and would allow tile-rendering (untextured
+//  rectangles) to use the same set of functions. Since tiles are always
+//  monochrome, they simply wouldn't use the extra set of 32 gouraud-shaded
+//  gpuPixelSpanFn functions (TODO?).
+//
+// NOTE: While the PS1 framebuffer is 16 bit, we use 8-bit pointers here,
+//       so that pDst can be incremented directly by 'incr' parameter
+//       without having to shift it before use.
+template<int CF>
+static u8* gpuPixelSpanFn(u8* pDst, uintptr_t data, ptrdiff_t incr, size_t len)
+{
+       // Blend func can save an operation if it knows uSrc MSB is
+       //  unset. For untextured prims, this is always true.
+       const bool skip_uSrc_mask = true;
+
+       u16 col;
+       struct GouraudColor * gcPtr;
+       u32 r, g, b;
+       s32 r_incr, g_incr, b_incr;
+
+       if (CF_GOURAUD) {
+               gcPtr = (GouraudColor*)data;
+               r = gcPtr->r;  r_incr = gcPtr->r_incr;
+               g = gcPtr->g;  g_incr = gcPtr->g_incr;
+               b = gcPtr->b;  b_incr = gcPtr->b_incr;
+       } else {
+               col = (u16)data;
+       }
+
+       do {
+               if (!CF_GOURAUD)
+               {   // NO GOURAUD
+                       if (!CF_MASKCHECK && !CF_BLEND) {
+                               if (CF_MASKSET) { *(u16*)pDst = col | 0x8000; }
+                               else            { *(u16*)pDst = col;          }
+                       } else if (CF_MASKCHECK && !CF_BLEND) {
+                               if (!(*(u16*)pDst & 0x8000)) {
+                                       if (CF_MASKSET) { *(u16*)pDst = col | 0x8000; }
+                                       else            { *(u16*)pDst = col;          }
+                               }
+                       } else {
+                               uint_fast16_t uDst = *(u16*)pDst;
+                               if (CF_MASKCHECK) { if (uDst & 0x8000) goto endpixel; }
+
+                               uint_fast16_t uSrc = col;
+
+                               if (CF_BLEND)
+                                       uSrc = gpuBlending<CF_BLENDMODE, skip_uSrc_mask>(uSrc, uDst);
+
+                               if (CF_MASKSET) { *(u16*)pDst = uSrc | 0x8000; }
+                               else            { *(u16*)pDst = uSrc;          }
+                       }
+
+               } else
+               {   // GOURAUD
+
+                       if (!CF_MASKCHECK && !CF_BLEND) {
+                               col = gpuGouraudColor15bpp(r, g, b);
+                               if (CF_MASKSET) { *(u16*)pDst = col | 0x8000; }
+                               else            { *(u16*)pDst = col;          }
+                       } else if (CF_MASKCHECK && !CF_BLEND) {
+                               col = gpuGouraudColor15bpp(r, g, b);
+                               if (!(*(u16*)pDst & 0x8000)) {
+                                       if (CF_MASKSET) { *(u16*)pDst = col | 0x8000; }
+                                       else            { *(u16*)pDst = col;          }
+                               }
+                       } else {
+                               uint_fast16_t uDst = *(u16*)pDst;
+                               if (CF_MASKCHECK) { if (uDst & 0x8000) goto endpixel; }
+                               col = gpuGouraudColor15bpp(r, g, b);
+
+                               uint_fast16_t uSrc = col;
+
+                               // Blend func can save an operation if it knows uSrc MSB is
+                               //  unset. For untextured prims, this is always true.
+                               const bool skip_uSrc_mask = true;
+
+                               if (CF_BLEND)
+                                       uSrc = gpuBlending<CF_BLENDMODE, skip_uSrc_mask>(uSrc, uDst);
+
+                               if (CF_MASKSET) { *(u16*)pDst = uSrc | 0x8000; }
+                               else            { *(u16*)pDst = uSrc;          }
+                       }
+               }
+
+endpixel:
+               if (CF_GOURAUD) {
+                       r += r_incr;
+                       g += g_incr;
+                       b += b_incr;
+               }
+               pDst += incr;
+       } while (len-- > 1);
+
+       // Note from senquack: Normally, I'd prefer to write a 'do {} while (--len)'
+       //  loop, or even a for() loop, however, on MIPS platforms anything but the
+       //  'do {} while (len-- > 1)' tends to generate very unoptimal asm, with
+       //  many unneeded MULs/ADDs/branches at the ends of these functions.
+       //  If you change the loop structure above, be sure to compare the quality
+       //  of the generated code!!
+
+       if (CF_GOURAUD) {
+               gcPtr->r = r;
+               gcPtr->g = g;
+               gcPtr->b = b;
+       }
+       return pDst;
+}
+
+// Placeholder used for gpuPixelSpanDrivers slots that have no real span
+//  function: draws nothing and hands pDst back unchanged. Prints its own
+//  name to stdout only when ENABLE_GPU_LOG_SUPPORT is defined.
+static u8* PixelSpanNULL(u8* pDst, uintptr_t data, ptrdiff_t incr, size_t len)
+{
+       // Remaining parameters exist only to match the PSD signature
+       (void)data;
+       (void)incr;
+       (void)len;
+
+#ifdef ENABLE_GPU_LOG_SUPPORT
+       fprintf(stdout,"PixelSpanNULL()\n");
+#endif
+
+       return pDst;
+}
+
+///////////////////////////////////////////////////////////////////////////////
+//  PixelSpan (lines) innerloops driver
+// Signature shared by every pixel-span routine: takes a byte pointer to the
+//  destination, an opaque 'data' word, a per-pixel byte increment and a
+//  length, and returns the advanced destination pointer.
+typedef u8* (*PSD)(u8* dst, uintptr_t data, ptrdiff_t incr, size_t len);
+
+const PSD gpuPixelSpanDrivers[64] =
+{ 
+       // Array index | 'CF' template field | Field value
+       // ------------+---------------------+----------------
+       // Bit 0       | CF_BLEND            | off (0), on (1)
+       // Bit 1       | CF_MASKCHECK        | off (0), on (1)
+       // Bit 3:2     | CF_BLENDMODE        | 0..3
+       // Bit 4       | CF_MASKSET          | off (0), on (1)
+       // Bit 5       | CF_GOURAUD          | off (0), on (1)
+       //
+       // NULL entries are ones for which blending is disabled and blend-mode
+       //  field is non-zero, which is obviously invalid.
+       //
+       // The template argument of each entry is the low four index bits
+       //  shifted left by one, with 0x100 OR-ed in for the CF_MASKSET groups
+       //  and 0x80 OR-ed in for the CF_GOURAUD groups.
+
+       // Flat-shaded
+       gpuPixelSpanFn<0x00<<1>,         gpuPixelSpanFn<0x01<<1>,         gpuPixelSpanFn<0x02<<1>,         gpuPixelSpanFn<0x03<<1>,
+       PixelSpanNULL,                   gpuPixelSpanFn<0x05<<1>,         PixelSpanNULL,                   gpuPixelSpanFn<0x07<<1>,
+       PixelSpanNULL,                   gpuPixelSpanFn<0x09<<1>,         PixelSpanNULL,                   gpuPixelSpanFn<0x0B<<1>,
+       PixelSpanNULL,                   gpuPixelSpanFn<0x0D<<1>,         PixelSpanNULL,                   gpuPixelSpanFn<0x0F<<1>,
+
+       // Flat-shaded + PixelMSB (CF_MASKSET)
+       gpuPixelSpanFn<(0x00<<1)|0x100>, gpuPixelSpanFn<(0x01<<1)|0x100>, gpuPixelSpanFn<(0x02<<1)|0x100>, gpuPixelSpanFn<(0x03<<1)|0x100>,
+       PixelSpanNULL,                   gpuPixelSpanFn<(0x05<<1)|0x100>, PixelSpanNULL,                   gpuPixelSpanFn<(0x07<<1)|0x100>,
+       PixelSpanNULL,                   gpuPixelSpanFn<(0x09<<1)|0x100>, PixelSpanNULL,                   gpuPixelSpanFn<(0x0B<<1)|0x100>,
+       PixelSpanNULL,                   gpuPixelSpanFn<(0x0D<<1)|0x100>, PixelSpanNULL,                   gpuPixelSpanFn<(0x0F<<1)|0x100>,
+
+       // Gouraud-shaded (CF_GOURAUD)
+       gpuPixelSpanFn<(0x00<<1)|0x80>,  gpuPixelSpanFn<(0x01<<1)|0x80>,  gpuPixelSpanFn<(0x02<<1)|0x80>,  gpuPixelSpanFn<(0x03<<1)|0x80>,
+       PixelSpanNULL,                   gpuPixelSpanFn<(0x05<<1)|0x80>,  PixelSpanNULL,                   gpuPixelSpanFn<(0x07<<1)|0x80>,
+       PixelSpanNULL,                   gpuPixelSpanFn<(0x09<<1)|0x80>,  PixelSpanNULL,                   gpuPixelSpanFn<(0x0B<<1)|0x80>,
+       PixelSpanNULL,                   gpuPixelSpanFn<(0x0D<<1)|0x80>,  PixelSpanNULL,                   gpuPixelSpanFn<(0x0F<<1)|0x80>,
+
+       // Gouraud-shaded (CF_GOURAUD) + PixelMSB (CF_MASKSET)
+       gpuPixelSpanFn<(0x00<<1)|0x180>, gpuPixelSpanFn<(0x01<<1)|0x180>, gpuPixelSpanFn<(0x02<<1)|0x180>, gpuPixelSpanFn<(0x03<<1)|0x180>,
+       PixelSpanNULL,                   gpuPixelSpanFn<(0x05<<1)|0x180>, PixelSpanNULL,                   gpuPixelSpanFn<(0x07<<1)|0x180>,
+       PixelSpanNULL,                   gpuPixelSpanFn<(0x09<<1)|0x180>, PixelSpanNULL,                   gpuPixelSpanFn<(0x0B<<1)|0x180>,
+       PixelSpanNULL,                   gpuPixelSpanFn<(0x0D<<1)|0x180>, PixelSpanNULL,                   gpuPixelSpanFn<(0x0F<<1)|0x180>
+};
+
+///////////////////////////////////////////////////////////////////////////////
+//  GPU Tiles innerloops generator
+
+template<int CF>
+static void gpuTileSpanFn(u16 *pDst, u32 count, u16 data)
+{
+       if (!CF_MASKCHECK && !CF_BLEND) {
+               if (CF_MASKSET) { data = data | 0x8000; }
+               do { *pDst++ = data; } while (--count);
+       } else if (CF_MASKCHECK && !CF_BLEND) {
+               if (CF_MASKSET) { data = data | 0x8000; }
+               do { if (!(*pDst&0x8000)) { *pDst = data; } pDst++; } while (--count);
+       } else
+       {
+               // Blend func can save an operation if it knows uSrc MSB is
+               //  unset. For untextured prims, this is always true.
+               const bool skip_uSrc_mask = true;
+
+               uint_fast16_t uSrc, uDst;
+               do
+               {
+                       if (CF_MASKCHECK || CF_BLEND) { uDst = *pDst; }
+                       if (CF_MASKCHECK) { if (uDst&0x8000) goto endtile; }
+
+                       uSrc = data;
+
+                       if (CF_BLEND)
+                               uSrc = gpuBlending<CF_BLENDMODE, skip_uSrc_mask>(uSrc, uDst);
+
+                       if (CF_MASKSET) { *pDst = uSrc | 0x8000; }
+                       else            { *pDst = uSrc;          }
+
+                       //senquack - Did not apply "Silent Hill" mask-bit fix to here.
+                       // It is hard to tell from scarce documentation available and
+                       //  lack of comments in code, but I believe the tile-span
+                       //  functions here should not bother to preserve any source MSB,
+                       //  as they are not drawing from a texture.
+endtile:
+                       pDst++;
+               }
+               while (--count);
+       }
+}
+
+// Placeholder used for gpuTileSpanDrivers slots that have no real span
+//  function: draws nothing. Prints its own name to stdout only when
+//  ENABLE_GPU_LOG_SUPPORT is defined.
+static void TileNULL(u16 *pDst, u32 count, u16 data)
+{
+       // Parameters exist only to match the PT signature
+       (void)pDst;
+       (void)count;
+       (void)data;
+
+#ifdef ENABLE_GPU_LOG_SUPPORT
+       fprintf(stdout,"TileNULL()\n");
+#endif
+}
+
+///////////////////////////////////////////////////////////////////////////////
+//  Tiles innerloops driver
+// Signature shared by every tile-span routine: destination pointer, pixel
+//  count and 16-bit fill colour.
+typedef void (*PT)(u16 *pDst, u32 count, u16 data);
+
+// Template instantiation helper macros
+//  TI(cf)      : the gpuTileSpanFn instantiation for template argument 'cf'.
+//  TN          : the do-nothing placeholder.
+//  TIBLOCK(ub) : 16 consecutive table entries. Entry i uses template
+//                argument (ub | (i << 1)); entries 4, 6, 8, 10, 12 and 14
+//                (bit 0 of i clear while bits 3:2 are non-zero) are TN.
+//                Presumably these are the "blend mode set but blending
+//                off" combinations, as documented for gpuPixelSpanDrivers
+//                -- TODO confirm.
+#define TI(cf) gpuTileSpanFn<(cf)>
+#define TN     TileNULL
+#define TIBLOCK(ub) \
+       TI((ub)|0x00), TI((ub)|0x02), TI((ub)|0x04), TI((ub)|0x06), \
+       TN,            TI((ub)|0x0a), TN,            TI((ub)|0x0e), \
+       TN,            TI((ub)|0x12), TN,            TI((ub)|0x16), \
+       TN,            TI((ub)|0x1a), TN,            TI((ub)|0x1e)
+
+// Two 16-entry blocks: the second one ORs (1<<8) into every template argument.
+const PT gpuTileSpanDrivers[32] = {
+       TIBLOCK(0<<8), TIBLOCK(1<<8)
+};
+
+// The helper macros are re-defined by the tables that follow, so drop them.
+#undef TI
+#undef TN
+#undef TIBLOCK
+
+
+///////////////////////////////////////////////////////////////////////////////
+//  GPU Sprites innerloops generator
+
+template<int CF>
+static void gpuSpriteSpanFn(u16 *pDst, u32 count, u8* pTxt, u32 u0)
+{
+       // Blend func can save an operation if it knows uSrc MSB is unset.
+       //  Untextured prims can always skip (source color always comes with MSB=0).
+       //  For textured prims, the generic lighting funcs always return it unset. (bonus!)
+       const bool skip_uSrc_mask = MSB_PRESERVED ? (!CF_TEXTMODE) : (!CF_TEXTMODE) || CF_LIGHT;
+
+       uint_fast16_t uSrc, uDst, srcMSB;
+       bool should_blend;
+       u32 u0_mask = gpu_senquack.TextureWindow[2];
+
+       u8 r5, g5, b5;
+       if (CF_LIGHT) {
+               r5 = gpu_senquack.r5;
+               g5 = gpu_senquack.g5;
+               b5 = gpu_senquack.b5;
+       }
+
+       if (CF_TEXTMODE==3) {
+               // Texture is accessed byte-wise, so adjust mask if 16bpp
+               u0_mask <<= 1;
+       }
+
+       const u16 *CBA_; if (CF_TEXTMODE!=3) CBA_ = gpu_senquack.CBA;
+
+       do
+       {
+               if (CF_MASKCHECK || CF_BLEND) { uDst = *pDst; }
+               if (CF_MASKCHECK) if (uDst&0x8000) { goto endsprite; }
+
+               if (CF_TEXTMODE==1) {  //  4bpp (CLUT)
+                       u8 rgb = pTxt[(u0 & u0_mask)>>1];
+                       uSrc = CBA_[(rgb>>((u0&1)<<2))&0xf];
+               }
+               if (CF_TEXTMODE==2) {  //  8bpp (CLUT)
+                       uSrc = CBA_[pTxt[u0 & u0_mask]];
+               }
+               if (CF_TEXTMODE==3) {  // 16bpp
+                       uSrc = *(u16*)(&pTxt[u0 & u0_mask]);
+               }
+
+               if (!uSrc) goto endsprite;
+
+               //senquack - save source MSB, as blending or lighting macros will not
+               //           (Silent Hill gray rectangles mask bit bug)
+               if (CF_BLEND || CF_LIGHT) srcMSB = uSrc & 0x8000;
+               
+               if (CF_LIGHT)
+                       uSrc = gpuLightingTXT(uSrc, r5, g5, b5);
+
+               should_blend = MSB_PRESERVED ? uSrc & 0x8000 : srcMSB;
+
+               if (CF_BLEND && should_blend)
+                       uSrc = gpuBlending<CF_BLENDMODE, skip_uSrc_mask>(uSrc, uDst);
+
+               if (CF_MASKSET)                                    { *pDst = uSrc | 0x8000; }
+               else if (!MSB_PRESERVED && (CF_BLEND || CF_LIGHT)) { *pDst = uSrc | srcMSB; }
+               else                                               { *pDst = uSrc;          }
+
+endsprite:
+               u0 += (CF_TEXTMODE==3) ? 2 : 1;
+               pDst++;
+       }
+       while (--count);
+}
+
+// Placeholder used for gpuSpriteSpanDrivers slots that have no real span
+//  function: draws nothing. Prints its own name to stdout only when
+//  ENABLE_GPU_LOG_SUPPORT is defined.
+static void SpriteNULL(u16 *pDst, u32 count, u8* pTxt, u32 u0)
+{
+       // Parameters exist only to match the PS signature
+       (void)pDst;
+       (void)count;
+       (void)pTxt;
+       (void)u0;
+
+#ifdef ENABLE_GPU_LOG_SUPPORT
+       fprintf(stdout,"SpriteNULL()\n");
+#endif
+}
+
+///////////////////////////////////////////////////////////////////////////////
+
+///////////////////////////////////////////////////////////////////////////////
+//  Sprite innerloops driver
+// Signature shared by every sprite-span routine: destination pointer, pixel
+//  count, texture byte pointer and starting texel offset.
+typedef void (*PS)(u16 *pDst, u32 count, u8* pTxt, u32 u0);
+
+// Template instantiation helper macros
+//  TI(cf)      : the gpuSpriteSpanFn instantiation for template argument 'cf'.
+//  TN          : the do-nothing placeholder.
+//  TIBLOCK(ub) : 128 consecutive table entries; entry i uses template
+//                argument (ub | i). Entries 0x00..0x1f are all TN (bits 6:5
+//                of i are zero there; presumably that field is CF_TEXTMODE
+//                and sprites are always textured -- TODO confirm). In the
+//                remaining rows, entries with bit 1 clear while bits 4:3
+//                are non-zero are TN as well.
+#define TI(cf) gpuSpriteSpanFn<(cf)>
+#define TN     SpriteNULL
+#define TIBLOCK(ub) \
+       TN,            TN,            TN,            TN,            TN,            TN,            TN,            TN,            \
+       TN,            TN,            TN,            TN,            TN,            TN,            TN,            TN,            \
+       TN,            TN,            TN,            TN,            TN,            TN,            TN,            TN,            \
+       TN,            TN,            TN,            TN,            TN,            TN,            TN,            TN,            \
+       TI((ub)|0x20), TI((ub)|0x21), TI((ub)|0x22), TI((ub)|0x23), TI((ub)|0x24), TI((ub)|0x25), TI((ub)|0x26), TI((ub)|0x27), \
+       TN,            TN,            TI((ub)|0x2a), TI((ub)|0x2b), TN,            TN,            TI((ub)|0x2e), TI((ub)|0x2f), \
+       TN,            TN,            TI((ub)|0x32), TI((ub)|0x33), TN,            TN,            TI((ub)|0x36), TI((ub)|0x37), \
+       TN,            TN,            TI((ub)|0x3a), TI((ub)|0x3b), TN,            TN,            TI((ub)|0x3e), TI((ub)|0x3f), \
+       TI((ub)|0x40), TI((ub)|0x41), TI((ub)|0x42), TI((ub)|0x43), TI((ub)|0x44), TI((ub)|0x45), TI((ub)|0x46), TI((ub)|0x47), \
+       TN,            TN,            TI((ub)|0x4a), TI((ub)|0x4b), TN,            TN,            TI((ub)|0x4e), TI((ub)|0x4f), \
+       TN,            TN,            TI((ub)|0x52), TI((ub)|0x53), TN,            TN,            TI((ub)|0x56), TI((ub)|0x57), \
+       TN,            TN,            TI((ub)|0x5a), TI((ub)|0x5b), TN,            TN,            TI((ub)|0x5e), TI((ub)|0x5f), \
+       TI((ub)|0x60), TI((ub)|0x61), TI((ub)|0x62), TI((ub)|0x63), TI((ub)|0x64), TI((ub)|0x65), TI((ub)|0x66), TI((ub)|0x67), \
+       TN,            TN,            TI((ub)|0x6a), TI((ub)|0x6b), TN,            TN,            TI((ub)|0x6e), TI((ub)|0x6f), \
+       TN,            TN,            TI((ub)|0x72), TI((ub)|0x73), TN,            TN,            TI((ub)|0x76), TI((ub)|0x77), \
+       TN,            TN,            TI((ub)|0x7a), TI((ub)|0x7b), TN,            TN,            TI((ub)|0x7e), TI((ub)|0x7f)
+
+// Two 128-entry blocks: the second one ORs (1<<8) into every template argument.
+const PS gpuSpriteSpanDrivers[256] = {
+       TIBLOCK(0<<8), TIBLOCK(1<<8)
+};
+
+// The helper macros are re-defined by the table that follows, so drop them.
+#undef TI
+#undef TN
+#undef TIBLOCK
+
+///////////////////////////////////////////////////////////////////////////////
+//  GPU Polygon innerloops generator
+
+//senquack - Newer version with following changes:
+//           * Adapted to work with new poly routines in gpu_raster_polygon.h
+//             adapted from DrHell GPU. They are less glitchy and use 22.10
+//             fixed-point instead of original UNAI's 16.16.
+//           * Texture coordinates are no longer packed together into one
+//             unsigned int. This seems to lose too much accuracy (they each
+//             end up being only 8.7 fixed-point that way) and pixel dropouts
+//             were noticeable both with original code and current DrHell
+//             adaptations. An example would be the sky in NFS3. Now, they are
+//             stored in separate ints, using separate masks.
+//           * Function is no longer INLINE, as it was always called
+//             through a function pointer.
+//           * Function now ensures the mask bit of source texture is preserved
+//             across calls to blending functions (Silent Hill rectangles fix)
+//           * November 2016: Large refactoring of blending/lighting when
+//             JohnnyF added dithering. See gpu_inner_quantization.h and
+//             relevant blend/light headers.
+// (see README_senquack.txt)
+template<int CF>
+static void gpuPolySpanFn(const gpu_senquack_t &gpu_senquack, u16 *pDst, u32 count)
+{
+       // Blend func can save an operation if it knows uSrc MSB is unset.
+       //  Untextured prims can always skip this (src color MSB is always 0).
+       //  For textured prims, the generic lighting funcs always return it unset. (bonus!)
+       const bool skip_uSrc_mask = MSB_PRESERVED ? (!CF_TEXTMODE) : (!CF_TEXTMODE) || CF_LIGHT;
+       bool should_blend;
+
+       u32 bMsk; if (CF_BLITMASK) bMsk = gpu_senquack.blit_mask;
+
+       if (!CF_TEXTMODE)
+       {
+               if (!CF_GOURAUD)
+               {
+                       // UNTEXTURED, NO GOURAUD
+                       const u16 pix15 = gpu_senquack.PixelData;
+                       do {
+                               uint_fast16_t uSrc, uDst;
+
+                               // NOTE: Don't enable CF_BLITMASK  pixel skipping (speed hack)
+                               //  on untextured polys. It seems to do more harm than good: see
+                               //  gravestone text at end of Medieval intro sequence. -senquack
+                               //if (CF_BLITMASK) { if ((bMsk>>((((uintptr_t)pDst)>>1)&7))&1) { goto endpolynotextnogou; } }
+
+                               if (CF_BLEND || CF_MASKCHECK) uDst = *pDst;
+                               if (CF_MASKCHECK) { if (uDst&0x8000) { goto endpolynotextnogou; } }
+
+                               uSrc = pix15;
+
+                               if (CF_BLEND)
+                                       uSrc = gpuBlending<CF_BLENDMODE, skip_uSrc_mask>(uSrc, uDst);
+
+                               if (CF_MASKSET) { *pDst = uSrc | 0x8000; }
+                               else            { *pDst = uSrc;          }
+
+endpolynotextnogou:
+                               pDst++;
+                       } while(--count);
+               }
+               else
+               {
+                       // UNTEXTURED, GOURAUD
+                       u32 l_gCol = gpu_senquack.gCol;
+                       u32 l_gInc = gpu_senquack.gInc;
+
+                       do {
+                               uint_fast16_t uDst, uSrc;
+
+                               // See note in above loop regarding CF_BLITMASK
+                               //if (CF_BLITMASK) { if ((bMsk>>((((uintptr_t)pDst)>>1)&7))&1) goto endpolynotextgou; }
+
+                               if (CF_BLEND || CF_MASKCHECK) uDst = *pDst;
+                               if (CF_MASKCHECK) { if (uDst&0x8000) goto endpolynotextgou; }
+
+                               if (CF_DITHER) {
+                                       // GOURAUD, DITHER
+
+                                       u32 uSrc24 = gpuLightingRGB24(l_gCol);
+                                       if (CF_BLEND)
+                                               uSrc24 = gpuBlending24<CF_BLENDMODE>(uSrc24, uDst);
+                                       uSrc = gpuColorQuantization24<CF_DITHER>(uSrc24, pDst);
+                               } else {
+                                       // GOURAUD, NO DITHER
+
+                                       uSrc = gpuLightingRGB(l_gCol);
+
+                                       if (CF_BLEND)
+                                               uSrc = gpuBlending<CF_BLENDMODE, skip_uSrc_mask>(uSrc, uDst);
+                               }
+
+                               if (CF_MASKSET) { *pDst = uSrc | 0x8000; }
+                               else            { *pDst = uSrc;          }
+
+endpolynotextgou:
+                               pDst++;
+                               l_gCol += l_gInc;
+                       }
+                       while (--count);
+               }
+       }
+       else
+       {
+               // TEXTURED
+
+               uint_fast16_t uDst, uSrc, srcMSB;
+
+               //senquack - note: original UNAI code had gpu_senquack.{u4/v4} packed into
+               // one 32-bit unsigned int, but this proved to lose too much accuracy
+               // (pixel drouputs noticeable in NFS3 sky), so now are separate vars.
+               u32 l_u_msk = gpu_senquack.u_msk;     u32 l_v_msk = gpu_senquack.v_msk;
+               u32 l_u = gpu_senquack.u & l_u_msk;   u32 l_v = gpu_senquack.v & l_v_msk;
+               s32 l_u_inc = gpu_senquack.u_inc;     s32 l_v_inc = gpu_senquack.v_inc;
+
+               const u16* TBA_ = gpu_senquack.TBA;
+               const u16* CBA_; if (CF_TEXTMODE!=3) CBA_ = gpu_senquack.CBA;
+
+               u8 r5, g5, b5;
+               u8 r8, g8, b8;
+
+               u32 l_gInc, l_gCol;
+
+               if (CF_LIGHT) {
+                       if (CF_GOURAUD) {
+                               l_gInc = gpu_senquack.gInc;
+                               l_gCol = gpu_senquack.gCol;
+                       } else {
+                               if (CF_DITHER) {
+                                       r8 = gpu_senquack.r8;
+                                       g8 = gpu_senquack.g8;
+                                       b8 = gpu_senquack.b8;
+                               } else {
+                                       r5 = gpu_senquack.r5;
+                                       g5 = gpu_senquack.g5;
+                                       b5 = gpu_senquack.b5;
+                               }
+                       }
+               }
+
+               do
+               {
+                       if (CF_BLITMASK) { if ((bMsk>>((((uintptr_t)pDst)>>1)&7))&1) goto endpolytext; }
+                       if (CF_MASKCHECK || CF_BLEND) { uDst = *pDst; }
+                       if (CF_MASKCHECK) if (uDst&0x8000) { goto endpolytext; }
+
+                       //senquack - adapted to work with new 22.10 fixed point routines:
+                       //           (UNAI originally used 16.16)
+                       if (CF_TEXTMODE==1) {  //  4bpp (CLUT)
+                               u32 tu=(l_u>>10);
+                               u32 tv=(l_v<<1)&(0xff<<11);
+                               u8 rgb=((u8*)TBA_)[tv+(tu>>1)];
+                               uSrc=CBA_[(rgb>>((tu&1)<<2))&0xf];
+                               if (!uSrc) goto endpolytext;
+                       }
+                       if (CF_TEXTMODE==2) {  //  8bpp (CLUT)
+                               uSrc = CBA_[(((u8*)TBA_)[(l_u>>10)+((l_v<<1)&(0xff<<11))])];
+                               if (!uSrc) goto endpolytext;
+                       }
+                       if (CF_TEXTMODE==3) {  // 16bpp
+                               uSrc = TBA_[(l_u>>10)+((l_v)&(0xff<<10))];
+                               if (!uSrc) goto endpolytext;
+                       }
+
+                       // Save source MSB, as blending or lighting will not (Silent Hill)
+                       if (CF_BLEND || CF_LIGHT) srcMSB = uSrc & 0x8000;
+
+                       // When textured, only dither when LIGHT (texture blend) is enabled
+                       // LIGHT &&  BLEND => dither
+                       // LIGHT && !BLEND => dither
+                       //!LIGHT &&  BLEND => no dither
+                       //!LIGHT && !BLEND => no dither
+
+                       if (CF_DITHER && CF_LIGHT) {
+                               u32 uSrc24;
+                               if ( CF_GOURAUD)
+                                       uSrc24 = gpuLightingTXT24Gouraud(uSrc, l_gCol);
+                               if (!CF_GOURAUD)
+                                       uSrc24 = gpuLightingTXT24(uSrc, r8, g8, b8);
+
+                               if (CF_BLEND && srcMSB)
+                                       uSrc24 = gpuBlending24<CF_BLENDMODE>(uSrc24, uDst);
+
+                               uSrc = gpuColorQuantization24<CF_DITHER>(uSrc24, pDst);
+                       } else
+                       {
+                               if (CF_LIGHT) {
+                                       if ( CF_GOURAUD)
+                                               uSrc = gpuLightingTXTGouraud(uSrc, l_gCol);
+                                       if (!CF_GOURAUD)
+                                               uSrc = gpuLightingTXT(uSrc, r5, g5, b5);
+                               }
+
+                               should_blend = MSB_PRESERVED ? uSrc & 0x8000 : srcMSB;
+                               if (CF_BLEND && should_blend)
+                                       uSrc = gpuBlending<CF_BLENDMODE, skip_uSrc_mask>(uSrc, uDst);
+                       }
+
+                       if (CF_MASKSET)                                    { *pDst = uSrc | 0x8000; }
+                       else if (!MSB_PRESERVED && (CF_BLEND || CF_LIGHT)) { *pDst = uSrc | srcMSB; }
+                       else                                               { *pDst = uSrc;          }
+endpolytext:
+                       pDst++;
+                       l_u = (l_u + l_u_inc) & l_u_msk;
+                       l_v = (l_v + l_v_inc) & l_v_msk;
+                       if (CF_LIGHT && CF_GOURAUD) l_gCol += l_gInc;
+               }
+               while (--count);
+       }
+}
+
+// Placeholder used for gpuPolySpanDrivers slots that have no real span
+//  function: draws nothing. Prints its own name to stdout only when
+//  ENABLE_GPU_LOG_SUPPORT is defined.
+static void PolyNULL(const gpu_senquack_t &gpu_senquack, u16 *pDst, u32 count)
+{
+       // Parameters exist only to match the PP signature
+       (void)gpu_senquack;
+       (void)pDst;
+       (void)count;
+
+#ifdef ENABLE_GPU_LOG_SUPPORT
+       fprintf(stdout,"PolyNULL()\n");
+#endif
+}
+
+///////////////////////////////////////////////////////////////////////////////
+//  Polygon innerloops driver
+// Signature shared by every polygon-span routine: renderer state (read-only),
+//  destination pointer and pixel count.
+typedef void (*PP)(const gpu_senquack_t &gpu_senquack, u16 *pDst, u32 count);
+
+// Template instantiation helper macros
+//  TI(cf)      : the gpuPolySpanFn instantiation for template argument 'cf'.
+//  TN          : the do-nothing placeholder.
+//  TIBLOCK(ub) : 256 consecutive table entries; entry i uses template
+//                argument (ub | i). Placeholder (TN) pattern:
+//                 - i in 0x00..0x7f: TN where bit 1 is clear while bits 4:3
+//                   are non-zero.
+//                 - i in 0x80..0xff: only entries with bit 0 set are
+//                   instantiated, and where bits 4:3 are non-zero bit 1
+//                   must be set as well.
+//                Presumably bit 1 is CF_BLEND, bits 4:3 CF_BLENDMODE and
+//                bit 7 CF_GOURAUD, matching the layout documented at
+//                gpuPixelSpanDrivers -- TODO confirm.
+#define TI(cf) gpuPolySpanFn<(cf)>
+#define TN     PolyNULL
+#define TIBLOCK(ub) \
+       TI((ub)|0x00), TI((ub)|0x01), TI((ub)|0x02), TI((ub)|0x03), TI((ub)|0x04), TI((ub)|0x05), TI((ub)|0x06), TI((ub)|0x07), \
+       TN,            TN,            TI((ub)|0x0a), TI((ub)|0x0b), TN,            TN,            TI((ub)|0x0e), TI((ub)|0x0f), \
+       TN,            TN,            TI((ub)|0x12), TI((ub)|0x13), TN,            TN,            TI((ub)|0x16), TI((ub)|0x17), \
+       TN,            TN,            TI((ub)|0x1a), TI((ub)|0x1b), TN,            TN,            TI((ub)|0x1e), TI((ub)|0x1f), \
+       TI((ub)|0x20), TI((ub)|0x21), TI((ub)|0x22), TI((ub)|0x23), TI((ub)|0x24), TI((ub)|0x25), TI((ub)|0x26), TI((ub)|0x27), \
+       TN,            TN,            TI((ub)|0x2a), TI((ub)|0x2b), TN,            TN,            TI((ub)|0x2e), TI((ub)|0x2f), \
+       TN,            TN,            TI((ub)|0x32), TI((ub)|0x33), TN,            TN,            TI((ub)|0x36), TI((ub)|0x37), \
+       TN,            TN,            TI((ub)|0x3a), TI((ub)|0x3b), TN,            TN,            TI((ub)|0x3e), TI((ub)|0x3f), \
+       TI((ub)|0x40), TI((ub)|0x41), TI((ub)|0x42), TI((ub)|0x43), TI((ub)|0x44), TI((ub)|0x45), TI((ub)|0x46), TI((ub)|0x47), \
+       TN,            TN,            TI((ub)|0x4a), TI((ub)|0x4b), TN,            TN,            TI((ub)|0x4e), TI((ub)|0x4f), \
+       TN,            TN,            TI((ub)|0x52), TI((ub)|0x53), TN,            TN,            TI((ub)|0x56), TI((ub)|0x57), \
+       TN,            TN,            TI((ub)|0x5a), TI((ub)|0x5b), TN,            TN,            TI((ub)|0x5e), TI((ub)|0x5f), \
+       TI((ub)|0x60), TI((ub)|0x61), TI((ub)|0x62), TI((ub)|0x63), TI((ub)|0x64), TI((ub)|0x65), TI((ub)|0x66), TI((ub)|0x67), \
+       TN,            TN,            TI((ub)|0x6a), TI((ub)|0x6b), TN,            TN,            TI((ub)|0x6e), TI((ub)|0x6f), \
+       TN,            TN,            TI((ub)|0x72), TI((ub)|0x73), TN,            TN,            TI((ub)|0x76), TI((ub)|0x77), \
+       TN,            TN,            TI((ub)|0x7a), TI((ub)|0x7b), TN,            TN,            TI((ub)|0x7e), TI((ub)|0x7f), \
+       TN,            TI((ub)|0x81), TN,            TI((ub)|0x83), TN,            TI((ub)|0x85), TN,            TI((ub)|0x87), \
+       TN,            TN,            TN,            TI((ub)|0x8b), TN,            TN,            TN,            TI((ub)|0x8f), \
+       TN,            TN,            TN,            TI((ub)|0x93), TN,            TN,            TN,            TI((ub)|0x97), \
+       TN,            TN,            TN,            TI((ub)|0x9b), TN,            TN,            TN,            TI((ub)|0x9f), \
+       TN,            TI((ub)|0xa1), TN,            TI((ub)|0xa3), TN,            TI((ub)|0xa5), TN,            TI((ub)|0xa7), \
+       TN,            TN,            TN,            TI((ub)|0xab), TN,            TN,            TN,            TI((ub)|0xaf), \
+       TN,            TN,            TN,            TI((ub)|0xb3), TN,            TN,            TN,            TI((ub)|0xb7), \
+       TN,            TN,            TN,            TI((ub)|0xbb), TN,            TN,            TN,            TI((ub)|0xbf), \
+       TN,            TI((ub)|0xc1), TN,            TI((ub)|0xc3), TN,            TI((ub)|0xc5), TN,            TI((ub)|0xc7), \
+       TN,            TN,            TN,            TI((ub)|0xcb), TN,            TN,            TN,            TI((ub)|0xcf), \
+       TN,            TN,            TN,            TI((ub)|0xd3), TN,            TN,            TN,            TI((ub)|0xd7), \
+       TN,            TN,            TN,            TI((ub)|0xdb), TN,            TN,            TN,            TI((ub)|0xdf), \
+       TN,            TI((ub)|0xe1), TN,            TI((ub)|0xe3), TN,            TI((ub)|0xe5), TN,            TI((ub)|0xe7), \
+       TN,            TN,            TN,            TI((ub)|0xeb), TN,            TN,            TN,            TI((ub)|0xef), \
+       TN,            TN,            TN,            TI((ub)|0xf3), TN,            TN,            TN,            TI((ub)|0xf7), \
+       TN,            TN,            TN,            TI((ub)|0xfb), TN,            TN,            TN,            TI((ub)|0xff)
+
+// Eight 256-entry blocks; block k ORs (k<<8) into every template argument.
+const PP gpuPolySpanDrivers[2048] = {
+       TIBLOCK(0<<8), TIBLOCK(1<<8), TIBLOCK(2<<8), TIBLOCK(3<<8),
+       TIBLOCK(4<<8), TIBLOCK(5<<8), TIBLOCK(6<<8), TIBLOCK(7<<8)
+};
+
+// Drop the helper macros so they do not leak out of this header.
+#undef TI
+#undef TN
+#undef TIBLOCK
+
+#endif /* __GPU_UNAI_GPU_INNER_H__ */
diff --git a/plugins/gpu_senquack/gpu_inner_blend.h b/plugins/gpu_senquack/gpu_inner_blend.h

new file mode 100644 (file)

index 0000000..febc7ed
--- /dev/null
+++ b/plugins/gpu_senquack/gpu_inner_blend.h
@@ -0,0 +1,188 @@
+/***************************************************************************
+*   Copyright (C) 2010 PCSX4ALL Team                                      *
+*   Copyright (C) 2010 Unai                                               *
+*                                                                         *
+*   This program is free software; you can redistribute it and/or modify  *
+*   it under the terms of the GNU General Public License as published by  *
+*   the Free Software Foundation; either version 2 of the License, or     *
+*   (at your option) any later version.                                   *
+*                                                                         *
+*   This program is distributed in the hope that it will be useful,       *
+*   but WITHOUT ANY WARRANTY; without even the implied warranty of        *
+*   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the         *
+*   GNU General Public License for more details.                          *
+*                                                                         *
+*   You should have received a copy of the GNU General Public License     *
+*   along with this program; if not, write to the                         *
+*   Free Software Foundation, Inc.,                                       *
+*   51 Franklin Street, Fifth Floor, Boston, MA 02111-1307 USA.           *
+***************************************************************************/
+
+#ifndef _OP_BLEND_H_
+#define _OP_BLEND_H_
+
+//  GPU Blending operations functions
+
+////////////////////////////////////////////////////////////////////////////////
+// Blend bgr555 color in 'uSrc' (foreground) with bgr555 color
+//  in 'uDst' (background), returning resulting color.
+//
+// INPUT:
+//  'uSrc','uDst' input: -bbbbbgggggrrrrr
+//                       ^ bit 15
+// RETURNS:
+//           u16 output: 0bbbbbgggggrrrrr
+//                       ^ bit 15
+// Where '0' is zero-padding, and '-' is don't care.
+// BLENDMODE picks the formula (0..3, see body); SKIP_USRC_MSB_MASK=true skips clearing uSrc bit 15 (NOTE(review): presumably the caller passes it already clear - confirm).
+////////////////////////////////////////////////////////////////////////////////
+template <int BLENDMODE, bool SKIP_USRC_MSB_MASK>
+GPU_INLINE uint_fast16_t gpuBlendingGeneric(uint_fast16_t uSrc, uint_fast16_t uDst)
+{
+       // These use Blargg's bitwise modulo-clamping:
+       //  http://blargg.8bitalley.com/info/rgb_mixing.html
+       //  http://blargg.8bitalley.com/info/rgb_clamped_add.html
+       //  http://blargg.8bitalley.com/info/rgb_clamped_sub.html
+
+       uint_fast16_t mix; // only assigned for BLENDMODE 0..3
+
+       // 0.5 x Back + 0.5 x Forward
+       if (BLENDMODE==0) {
+#ifdef GPU_UNAI_USE_ACCURATE_BLENDING
+               // Slower, but more accurate (doesn't lose LSB data)
+               uDst &= 0x7fff;
+               if (!SKIP_USRC_MSB_MASK)
+                       uSrc &= 0x7fff;
+               mix = ((uSrc + uDst) - ((uSrc ^ uDst) & 0x0421)) >> 1; // 0x0421 = LSB of each 5-bit channel
+#else
+               mix = ((uDst & 0x7bde) + (uSrc & 0x7bde)) >> 1; // 0x7bde clears bit 15 and each channel's LSB on both inputs
+#endif
+       }
+
+       // 1.0 x Back + 1.0 x Forward
+       if (BLENDMODE==1) {
+               uDst &= 0x7fff;
+               if (!SKIP_USRC_MSB_MASK)
+                       uSrc &= 0x7fff;
+               u32 sum      = uSrc + uDst;
+               u32 low_bits = (uSrc ^ uDst) & 0x0421;
+               u32 carries  = (sum - low_bits) & 0x8420; // 0x8420 = the bit just above each channel
+               u32 modulo   = sum - carries;
+               u32 clamp    = carries - (carries >> 5); // 5 one-bits under every carry bit
+               mix = modulo | clamp; // saturate channels that overflowed
+       }
+
+       // 1.0 x Back - 1.0 x Forward
+       if (BLENDMODE==2) {
+               uDst &= 0x7fff;
+               if (!SKIP_USRC_MSB_MASK)
+                       uSrc &= 0x7fff;
+               u32 diff     = uDst - uSrc + 0x8420; // pre-set a guard bit above each channel
+               u32 low_bits = (uDst ^ uSrc) & 0x8420;
+               u32 borrows  = (diff - low_bits) & 0x8420;
+               u32 modulo   = diff - borrows;
+               u32 clamp    = borrows - (borrows >> 5); // 5 one-bits under every surviving guard bit
+               mix = modulo & clamp; // channels without a surviving guard bit become 0
+       }
+
+       // 1.0 x Back + 0.25 x Forward
+       if (BLENDMODE==3) {
+               uDst &= 0x7fff;
+               uSrc = ((uSrc >> 2) & 0x1ce7); // quarter each channel; the mask also drops bit 15, so SKIP_USRC_MSB_MASK is not consulted
+               u32 sum      = uSrc + uDst;
+               u32 low_bits = (uSrc ^ uDst) & 0x0421;
+               u32 carries  = (sum - low_bits) & 0x8420;
+               u32 modulo   = sum - carries;
+               u32 clamp    = carries - (carries >> 5);
+               mix = modulo | clamp; // same saturating add as BLENDMODE 1
+       }
+
+       return mix;
+}
+
+
+////////////////////////////////////////////////////////////////////////////////
+// Convert bgr555 color in uSrc to padded u32 5.4:5.4:5.4 bgr fixed-pt
+//  color triplet suitable for use with HQ 24-bit quantization.
+//
+// INPUT:
+//       'uSrc' input: -bbbbbgggggrrrrr
+//                     ^ bit 15
+// RETURNS:
+//         u32 output: 000bbbbbXXXX0gggggXXXX0rrrrrXXXX
+//                     ^ bit 31
+// Where 'X' are fixed-pt bits (all 0 on return), '0' is zero-padding, and '-' is don't care
+////////////////////////////////////////////////////////////////////////////////
+GPU_INLINE u32 gpuGetRGB24(uint_fast16_t uSrc)
+{
+       return ((uSrc & 0x7C00)<<14)  // blue:  bits 14..10 -> bits 28..24
+            | ((uSrc & 0x03E0)<< 9)  // green: bits  9..5  -> bits 18..14
+            | ((uSrc & 0x001F)<< 4); // red:   bits  4..0  -> bits  8..4
+}
+
+
+////////////////////////////////////////////////////////////////////////////////
+// Blend padded u32 5.4:5.4:5.4 bgr fixed-pt color triplet in 'uSrc24'
+//  (foreground color) with bgr555 color in 'uDst' (background color),
+//  returning the resulting u32 5.4:5.4:5.4 color.
+//
+// INPUT:
+//     'uSrc24' input: 000bbbbbXXXX0gggggXXXX0rrrrrXXXX
+//                     ^ bit 31
+//       'uDst' input: -bbbbbgggggrrrrr
+//                     ^ bit 15
+// RETURNS:
+//         u32 output: 000bbbbbXXXX0gggggXXXX0rrrrrXXXX
+//                     ^ bit 31
+// Where 'X' are fixed-pt bits, '0' is zero-padding, and '-' is don't care
+////////////////////////////////////////////////////////////////////////////////
+template <int BLENDMODE>
+GPU_INLINE u32 gpuBlending24(u32 uSrc24, uint_fast16_t uDst)
+{
+       // These use techniques adapted from Blargg's techniques mentioned
+       //  in gpuBlendingGeneric() comments above. Not as much bitwise trickery is
+       //  necessary because of presence of 0 padding in uSrc24 format.
+
+       u32 uDst24 = gpuGetRGB24(uDst); // widen background to the same 5.4:5.4:5.4 layout
+       u32 mix; // only assigned for BLENDMODE 0..3
+
+       // 0.5 x Back + 0.5 x Forward
+       if (BLENDMODE==0) {
+               const u32 uMsk = 0x1FE7F9FE; // clears padding bits and the LSB of each 9-bit field
+               // Only need to mask LSBs of uSrc24, uDst24's LSBs are 0 already
+               mix = (uDst24 + (uSrc24 & uMsk)) >> 1;
+       }
+
+       // 1.0 x Back + 1.0 x Forward
+       if (BLENDMODE==1) {
+               u32 sum     = uSrc24 + uDst24;
+               u32 carries = sum & 0x20080200; // padding bits 29/19/9 catch each field's overflow
+               u32 modulo  = sum - carries;
+               u32 clamp   = carries - (carries >> 9); // 9 one-bits under every carry bit
+               mix = modulo | clamp; // saturate fields that overflowed
+       }
+
+       // 1.0 x Back - 1.0 x Forward
+       if (BLENDMODE==2) {
+               // Insert ones in 0-padded borrow slot of color to be subtracted from
+               uDst24 |= 0x20080200;
+               u32 diff    = uDst24 - uSrc24;
+               u32 borrows = diff & 0x20080200; // slot still set => that field did not underflow
+               u32 clamp   = borrows - (borrows >> 9);
+               mix = diff & clamp; // fields whose slot was consumed become 0; slot bits themselves are dropped
+       }
+
+       // 1.0 x Back + 0.25 x Forward
+       if (BLENDMODE==3) {
+               uSrc24 = (uSrc24 & 0x1FC7F1FC) >> 2; // drop each field's low 2 bits (and padding) before quartering
+               u32 sum     = uSrc24 + uDst24;
+               u32 carries = sum & 0x20080200;
+               u32 modulo  = sum - carries;
+               u32 clamp   = carries - (carries >> 9);
+               mix = modulo | clamp; // same saturating add as BLENDMODE 1
+       }
+
+       return mix;
+}
+
+#endif  //_OP_BLEND_H_
diff --git a/plugins/gpu_senquack/gpu_inner_blend_arm.h b/plugins/gpu_senquack/gpu_inner_blend_arm.h

new file mode 100644 (file)

index 0000000..6413527
--- /dev/null
+++ b/plugins/gpu_senquack/gpu_inner_blend_arm.h
@@ -0,0 +1,103 @@
+#ifndef _OP_BLEND_ARM_H_
+#define _OP_BLEND_ARM_H_
+
+////////////////////////////////////////////////////////////////////////////////
+// Blend bgr555 color in 'uSrc' (foreground) with bgr555 color
+//  in 'uDst' (background), returning resulting color.
+//
+// INPUT:
+//  'uSrc','uDst' input: -bbbbbgggggrrrrr
+//                       ^ bit 15
+// RETURNS:
+//           u16 output: mbbbbbgggggrrrrr
+//                       ^ bit 15
+// Where '-' is don't care, and 'm' is forced to 1 when SKIP_USRC_MSB_MASK is
+//  false (final 'orr' in the body) and is otherwise left as the blend produced it.
+////////////////////////////////////////////////////////////////////////////////
+template <int BLENDMODE, bool SKIP_USRC_MSB_MASK>
+GPU_INLINE uint_fast16_t gpuBlendingARM(uint_fast16_t uSrc, uint_fast16_t uDst)
+{
+       // These use Blargg's bitwise modulo-clamping:
+       //  http://blargg.8bitalley.com/info/rgb_mixing.html
+       //  http://blargg.8bitalley.com/info/rgb_clamped_add.html
+       //  http://blargg.8bitalley.com/info/rgb_clamped_sub.html
+
+       uint_fast16_t mix; // only assigned for BLENDMODE 0..3
+
+       // Clear bit 15 of the background color
+       asm ("bic %[uDst], %[uDst], #0x8000" : [uDst] "+r" (uDst));
+
+       if (BLENDMODE == 3) {
+               // Prepare uSrc for blending ((uSrc >> 2) & 0x1ce7); the mask also drops bit 15
+               asm ("and %[uSrc], %[mask], %[uSrc], lsr #0x2" : [uSrc] "+r" (uSrc) : [mask] "r" (0x1ce7));
+       } else if (!SKIP_USRC_MSB_MASK) {
+               asm ("bic %[uSrc], %[uSrc], #0x8000" : [uSrc] "+r" (uSrc));
+       }
+
+
+       // 0.5 x Back + 0.5 x Forward (always the LSB-preserving formula; no GPU_UNAI_USE_ACCURATE_BLENDING switch here)
+       if (BLENDMODE==0) {
+               // mix = ((uSrc + uDst) - ((uSrc ^ uDst) & 0x0421)) >> 1;
+               asm ("eor %[mix], %[uSrc], %[uDst]\n\t" // uSrc ^ uDst
+                    "and %[mix], %[mix], %[mask]\n\t"  // ... & 0x0421
+                    "sub %[mix], %[uDst], %[mix]\n\t"  // uDst - ...
+                    "add %[mix], %[uSrc], %[mix]\n\t"  // uSrc + ...
+                    "mov %[mix], %[mix], lsr #0x1\n\t" // ... >> 1
+                    : [mix] "=&r" (mix)
+                    : [uSrc] "r" (uSrc), [uDst] "r" (uDst), [mask] "r" (0x0421));
+       }
+
+       if (BLENDMODE == 1 || BLENDMODE == 3) { // saturating add; for mode 3 uSrc was already quartered above
+               // u32 sum      = uSrc + uDst;
+               // u32 low_bits = (uSrc ^ uDst) & 0x0421;
+               // u32 carries  = (sum - low_bits) & 0x8420;
+               // u32 modulo   = sum - carries;
+               // u32 clamp    = carries - (carries >> 5);
+               // mix = modulo | clamp;
+
+               u32 sum;
+
+               asm ("add %[sum], %[uSrc], %[uDst]\n\t" // sum = uSrc + uDst
+                    "eor %[mix], %[uSrc], %[uDst]\n\t" // uSrc ^ uDst
+                    "and %[mix], %[mix], %[mask]\n\t"  // low_bits = (... & 0x0421)
+                    "sub %[mix], %[sum], %[mix]\n\t"   // sum - low_bits
+                    "and %[mix], %[mix], %[mask], lsl #0x05\n\t"  // carries = ... & 0x8420 (0x0421 << 5)
+                    "sub %[sum], %[sum], %[mix] \n\t"  // modulo = sum - carries
+                    "sub %[mix], %[mix], %[mix], lsr #0x05\n\t" // clamp = carries - (carries >> 5)
+                    "orr %[mix], %[sum], %[mix]"       // mix = modulo | clamp
+                    : [sum] "=&r" (sum), [mix] "=&r" (mix)
+                    : [uSrc] "r" (uSrc), [uDst] "r" (uDst), [mask] "r" (0x0421));
+       }
+    
+       // 1.0 x Back - 1.0 x Forward
+       if (BLENDMODE==2) {
+               u32 diff;
+               // u32 diff     = uDst - uSrc + 0x8420;
+               // u32 low_bits = (uDst ^ uSrc) & 0x8420;
+               // u32 borrows  = (diff - low_bits) & 0x8420;
+               // u32 modulo   = diff - borrows;
+               // u32 clamp    = borrows - (borrows >> 5);
+               // mix = modulo & clamp;
+               asm ("sub %[diff], %[uDst], %[uSrc]\n\t"  // uDst - uSrc
+                    "add %[diff], %[diff], %[mask]\n\t"  // diff = ... + 0x8420
+                    "eor %[mix], %[uDst], %[uSrc]\n\t"   // uDst ^ uSrc
+                    "and %[mix], %[mix], %[mask]\n\t"    // low_bits = ... & 0x8420
+                    "sub %[mix], %[diff], %[mix]\n\t"    // diff - low_bits
+                    "and %[mix], %[mix], %[mask]\n\t"    // borrows = ... & 0x8420
+                    "sub %[diff], %[diff], %[mix]\n\t"   // modulo = diff - borrows
+                    "sub %[mix], %[mix], %[mix], lsr #0x05\n\t"  // clamp = borrows - (borrows >> 5)
+                    "and %[mix], %[diff], %[mix]"        // mix = modulo & clamp
+                    : [diff] "=&r" (diff), [mix] "=&r" (mix)
+                    : [uSrc] "r" (uSrc), [uDst] "r" (uDst), [mask] "r" (0x8420));
+       }
+
+       // There's not a case where we can get into this function,
+       // SKIP_USRC_MSB_MASK is false, and the msb of uSrc is unset, so bit 15 is unconditionally set again here.
+       if (!SKIP_USRC_MSB_MASK) {
+               asm ("orr %[mix], %[mix], #0x8000" : [mix] "+r" (mix));
+       }
+  
+       return mix;
+}
+
+#endif  //_OP_BLEND_ARM_H_
diff --git a/plugins/gpu_senquack/gpu_inner_blend_arm5.h b/plugins/gpu_senquack/gpu_inner_blend_arm5.h

new file mode 100644 (file)

index 0000000..0e9b74f
--- /dev/null
+++ b/plugins/gpu_senquack/gpu_inner_blend_arm5.h
@@ -0,0 +1,100 @@
+/***************************************************************************
+*   Copyright (C) 2010 PCSX4ALL Team                                      *
+*   Copyright (C) 2010 Unai                                               *
+*                                                                         *
+*   This program is free software; you can redistribute it and/or modify  *
+*   it under the terms of the GNU General Public License as published by  *
+*   the Free Software Foundation; either version 2 of the License, or     *
+*   (at your option) any later version.                                   *
+*                                                                         *
+*   This program is distributed in the hope that it will be useful,       *
+*   but WITHOUT ANY WARRANTY; without even the implied warranty of        *
+*   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the         *
+*   GNU General Public License for more details.                          *
+*                                                                         *
+*   You should have received a copy of the GNU General Public License     *
+*   along with this program; if not, write to the                         *
+*   Free Software Foundation, Inc.,                                       *
+*   51 Franklin Street, Fifth Floor, Boston, MA 02111-1307 USA.           *
+***************************************************************************/
+
+#ifndef _OP_BLEND_H_
+#define _OP_BLEND_H_
+
+//  GPU Blending operations functions
+
+#define gpuBlending00(uSrc,uDst) /* 0.5 x Back + 0.5 x Forward; result is left in uSrc */ \
+{ /* uMsk is not a parameter: it must exist at the expansion site. NOTE(review): presumably an LSB-clearing mask such as 0x7bde - confirm */ \
+       asm ("and  %[src], %[src], %[msk]  " : [src] "=r" (uSrc) : "0" (uSrc), [msk] "r" (uMsk)                  ); /* uSrc &= uMsk */ \
+       asm ("and  %[dst], %[dst], %[msk]  " : [dst] "=r" (uDst) : "0" (uDst), [msk] "r" (uMsk)                  ); /* uDst &= uMsk (the caller's uDst is modified) */ \
+       asm ("add  %[src], %[dst], %[src]  " : [src] "=r" (uSrc) :             [dst] "r" (uDst), "0" (uSrc)      ); /* uSrc += uDst */ \
+       asm ("mov  %[src], %[src], lsr #1  " : [src] "=r" (uSrc) : "0" (uSrc)                                    ); /* uSrc >>= 1 */ \
+}
+
+//     1.0 x Back + 1.0 x Forward
+#define gpuBlending01(uSrc,uDst) \
+{ \
+       u16 st,dt,out; \
+       asm ("and    %[dt],  %[dst],   #0x7C00  " : [dt]  "=r" (dt)   :             [dst] "r" (uDst)                    ); \
+       asm ("and    %[st],  %[src],   #0x7C00  " : [st]  "=r" (st)   :             [src] "r" (uSrc)                    ); \
+       asm ("add    %[out], %[dt],    %[st]    " : [out] "=r" (out)  :             [dt]  "r" (dt),   [st]  "r" (st)    ); \
+       asm ("cmp    %[out], #0x7C00            " :                   :             [out] "r" (out) : "cc"              ); \
+       asm ("movhi  %[out], #0x7C00            " : [out] "=r" (out)  : "0" (out)                                       ); \
+       asm ("and    %[dt],  %[dst],   #0x03E0  " : [dt]  "=r" (dt)   :             [dst] "r" (uDst)                    ); \
+       asm ("and    %[st],  %[src],   #0x03E0  " : [st]  "=r" (st)   :             [src] "r" (uSrc)                    ); \
+       asm ("add    %[dt],  %[dt],    %[st]    " : [dt]  "=r" (dt)   : "0" (dt),   [st]  "r" (st)                      ); \
+       asm ("cmp    %[dt],  #0x03E0            " :                   :             [dt]  "r" (dt) : "cc"               ); \
+       asm ("movhi  %[dt],  #0x03E0            " : [dt]  "=r" (dt)   : "0" (dt)                                        ); \
+       asm ("orr    %[out], %[out],   %[dt]    " : [out] "=r" (out)  : "0" (out),  [dt]  "r" (dt)                      ); \
+       asm ("and    %[dt],  %[dst],   #0x001F  " : [dt]  "=r" (dt)   :             [dst] "r" (uDst)                    ); \
+       asm ("and    %[st],  %[src],   #0x001F  " : [st]  "=r" (st)   :             [src] "r" (uSrc)                    ); \
+       asm ("add    %[dt],  %[dt],    %[st]    " : [dt]  "=r" (dt)   : "0" (dt),   [st]  "r" (st)                      ); \
+       asm ("cmp    %[dt],  #0x001F            " :                   :             [dt]  "r" (dt) : "cc"               ); \
+       asm ("movhi  %[dt],  #0x001F            " : [dt]  "=r" (dt)   : "0" (dt)                                        ); \
+       asm ("orr    %[uSrc], %[out],   %[dt]   " : [uSrc] "=r" (uSrc)  : [out] "r" (out),  [dt]  "r" (dt)              ); \
+}
+
+//     1.0 x Back - 1.0 x Forward      */
+#define gpuBlending02(uSrc,uDst) \
+{ \
+       u16 st,dt,out; \
+       asm ("and    %[dt],  %[dst],   #0x7C00  " : [dt]  "=r" (dt)   :             [dst] "r" (uDst)                    ); \
+       asm ("and    %[st],  %[src],   #0x7C00  " : [st]  "=r" (st)   :             [src] "r" (uSrc)                    ); \
+       asm ("subs   %[out], %[dt],    %[st]    " : [out] "=r" (out)  : [dt]  "r" (dt),   [st]  "r" (st) : "cc"         ); \
+       asm ("movmi  %[out], #0x0000            " : [out] "=r" (out)  : "0" (out)                                       ); \
+       asm ("and    %[dt],  %[dst],   #0x03E0  " : [dt]  "=r" (dt)   :             [dst] "r" (uDst)                    ); \
+       asm ("and    %[st],  %[src],   #0x03E0  " : [st]  "=r" (st)   :             [src] "r" (uSrc)                    ); \
+       asm ("subs   %[dt],  %[dt],    %[st]    " : [dt]  "=r" (dt)   : "0" (dt),   [st]  "r" (st) : "cc"               ); \
+       asm ("orrpl  %[out], %[out],   %[dt]    " : [out] "=r" (out)  : "0" (out),  [dt]  "r" (dt)                      ); \
+       asm ("and    %[dt],  %[dst],   #0x001F  " : [dt]  "=r" (dt)   :             [dst] "r" (uDst)                    ); \
+       asm ("and    %[st],  %[src],   #0x001F  " : [st]  "=r" (st)   :             [src] "r" (uSrc)                    ); \
+       asm ("subs   %[dt],  %[dt],    %[st]    " : [dt]  "=r" (dt)   : "0" (dt),   [st]  "r" (st) : "cc"               ); \
+       asm ("orrpl  %[out], %[out],   %[dt]    " : [out] "=r" (out)  : "0" (out),  [dt]  "r" (dt)                      ); \
+       asm ("mov %[uSrc], %[out]" : [uSrc] "=r" (uSrc) : [out] "r" (out) ); \
+}
+
+//     1.0 x Back + 0.25 x Forward     */
+#define gpuBlending03(uSrc,uDst) \
+{ \
+               u16 st,dt,out; \
+               asm ("mov    %[src], %[src],   lsr #2   " : [src] "=r" (uSrc) : "0" (uSrc)                                      ); \
+               asm ("and    %[dt],  %[dst],   #0x7C00  " : [dt]  "=r" (dt)   :             [dst] "r" (uDst)                    ); \
+               asm ("and    %[st],  %[src],   #0x1C00  " : [st]  "=r" (st)   :             [src] "r" (uSrc)                    ); \
+               asm ("add    %[out], %[dt],    %[st]    " : [out] "=r" (out)  :             [dt]  "r" (dt),   [st]  "r" (st)    ); \
+               asm ("cmp    %[out], #0x7C00            " :                   :             [out] "r" (out) : "cc"              ); \
+               asm ("movhi  %[out], #0x7C00            " : [out] "=r" (out)  : "0" (out)                                       ); \
+               asm ("and    %[dt],  %[dst],   #0x03E0  " : [dt]  "=r" (dt)   :             [dst] "r" (uDst)                    ); \
+               asm ("and    %[st],  %[src],   #0x00E0  " : [st]  "=r" (st)   :             [src] "r" (uSrc)                    ); \
+               asm ("add    %[dt],  %[dt],    %[st]    " : [dt]  "=r" (dt)   : "0" (dt),   [st]  "r" (st)                      ); \
+               asm ("cmp    %[dt],  #0x03E0            " :                   :             [dt]  "r" (dt) : "cc"               ); \
+               asm ("movhi  %[dt],  #0x03E0            " : [dt]  "=r" (dt)   : "0" (dt)                                        ); \
+               asm ("orr    %[out], %[out],   %[dt]    " : [out] "=r" (out)  : "0" (out),  [dt]  "r" (dt)                      ); \
+               asm ("and    %[dt],  %[dst],   #0x001F  " : [dt]  "=r" (dt)   :             [dst] "r" (uDst)                    ); \
+               asm ("and    %[st],  %[src],   #0x0007  " : [st]  "=r" (st)   :             [src] "r" (uSrc)                    ); \
+               asm ("add    %[dt],  %[dt],    %[st]    " : [dt]  "=r" (dt)   : "0" (dt),   [st]  "r" (st)                      ); \
+               asm ("cmp    %[dt],  #0x001F            " :                   :             [dt]  "r" (dt) : "cc"               ); \
+               asm ("movhi  %[dt],  #0x001F            " : [dt]  "=r" (dt)   : "0" (dt)                                        ); \
+               asm ("orr    %[uSrc], %[out],   %[dt]   " : [uSrc] "=r" (uSrc)  : [out] "r" (out),  [dt]  "r" (dt)              ); \
+}
+
+#endif  //_OP_BLEND_H_
diff --git a/plugins/gpu_senquack/gpu_inner_blend_arm7.h b/plugins/gpu_senquack/gpu_inner_blend_arm7.h

new file mode 100644 (file)

index 0000000..083e62d
--- /dev/null
+++ b/plugins/gpu_senquack/gpu_inner_blend_arm7.h
@@ -0,0 +1,107 @@
+/***************************************************************************
+*   Copyright (C) 2010 PCSX4ALL Team                                      *
+*   Copyright (C) 2010 Unai                                               *
+*                                                                         *
+*   This program is free software; you can redistribute it and/or modify  *
+*   it under the terms of the GNU General Public License as published by  *
+*   the Free Software Foundation; either version 2 of the License, or     *
+*   (at your option) any later version.                                   *
+*                                                                         *
+*   This program is distributed in the hope that it will be useful,       *
+*   but WITHOUT ANY WARRANTY; without even the implied warranty of        *
+*   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the         *
+*   GNU General Public License for more details.                          *
+*                                                                         *
+*   You should have received a copy of the GNU General Public License     *
+*   along with this program; if not, write to the                         *
+*   Free Software Foundation, Inc.,                                       *
+*   51 Franklin Street, Fifth Floor, Boston, MA 02111-1307 USA.           *
+***************************************************************************/
+
+#ifndef _OP_BLEND_H_
+#define _OP_BLEND_H_
+
+//  GPU Blending operations functions
+
+#define gpuBlending00(uSrc,uDst) /* 0.5 x Back + 0.5 x Forward; result is left in uSrc */ \
+{ /* uMsk is not a parameter: it must exist at the expansion site. Both uSrc and uDst are overwritten. */ \
+       asm ("and  %[src], %[src], %[msk]\n" /* src &= uMsk */ \
+            "and  %[dst], %[dst], %[msk]\n" /* dst &= uMsk */ \
+            "add  %[src], %[dst], %[src]\n" /* src += dst */ \
+            "mov  %[src], %[src], lsr #1\n" /* src >>= 1 */ \
+        : [src] "=&r" (uSrc), [dst] "=&r" (uDst) : "0" (uSrc), "1" (uDst), [msk] "r" (uMsk)); \
+}
+
+//     1.0 x Back + 1.0 x Forward: per-channel add saturated at each channel's maximum; result is written to uSrc, uDst is only read.
+#define gpuBlending01(uSrc,uDst) \
+{ \
+       u32 st,dt,out; /* scratch registers: st/dt hold one masked channel, out accumulates */ \
+       asm ("and    %[dt],  %[dst],   #0x7C00\n" /* blue */ \
+            "and    %[st],  %[src],   #0x7C00\n" \
+            "add    %[out], %[dt],    %[st]  \n" \
+            "cmp    %[out], #0x7C00          \n" \
+            "movhi  %[out], #0x7C00          \n" /* clamp to channel maximum */ \
+            "and    %[dt],  %[dst],   #0x03E0\n" /* green */ \
+            "and    %[st],  %[src],   #0x03E0\n" \
+            "add    %[dt],  %[dt],    %[st]  \n" \
+            "cmp    %[dt],  #0x03E0          \n" \
+            "movhi  %[dt],  #0x03E0          \n" \
+            "orr    %[out], %[out],   %[dt]  \n" \
+            "and    %[dt],  %[dst],   #0x001F\n" /* red */ \
+            "and    %[st],  %[src],   #0x001F\n" \
+            "add    %[dt],  %[dt],    %[st]  \n" \
+            "cmp    %[dt],  #0x001F          \n" \
+            "movhi  %[dt],  #0x001F          \n" \
+            "orr    %[src], %[out],  %[dt]  \n" /* src is written only here, after its last read */ \
+        : [src] "=r" (uSrc), [st] "=&r" (st), [dt] "=&r" (dt), [out] "=&r" (out) \
+        : [dst] "r" (uDst), "0" (uSrc) : "cc"); \
+}
+
+//     1.0 x Back - 1.0 x Forward: per-channel subtract clamped at 0; result is written to uSrc, uDst is only read.
+#define gpuBlending02(uSrc,uDst) \
+{ \
+       u32 st,dt,out; /* scratch registers: st/dt hold one masked channel, out accumulates */ \
+       asm ("and    %[dt],  %[dst],   #0x7C00\n" /* blue */ \
+            "and    %[st],  %[src],   #0x7C00\n" \
+            "subs   %[out], %[dt],    %[st]  \n" \
+            "movmi  %[out], #0x0000          \n" /* negative -> 0 */ \
+            "and    %[dt],  %[dst],   #0x03E0\n" /* green */ \
+            "and    %[st],  %[src],   #0x03E0\n" \
+            "subs   %[dt],  %[dt],    %[st]  \n" \
+            "orrpl  %[out], %[out],   %[dt]  \n" /* merged only when not negative */ \
+            "and    %[dt],  %[dst],   #0x001F\n" /* red */ \
+            "and    %[st],  %[src],   #0x001F\n" \
+            "subs   %[dt],  %[dt],    %[st]  \n" \
+            "orrpl  %[out], %[out],   %[dt]  \n" \
+            "mov    %[src], %[out]           \n" /* src is written only here, after its last read */ \
+        : [src] "=r" (uSrc), [st] "=&r" (st), [dt] "=&r" (dt), [out] "=&r" (out) \
+        : [dst] "r" (uDst), "0" (uSrc) : "cc"); \
+}
+
+//     1.0 x Back + 0.25 x Forward     */
+#define gpuBlending03(uSrc,uDst) \
+{ \
+       u32 st,dt,out; \
+       asm ("mov    %[src], %[src],   lsr #2 \n" \
+            "and    %[dt],  %[dst],   #0x7C00\n" \
+            "and    %[st],  %[src],   #0x1C00\n" \
+            "add    %[out], %[dt],    %[st]  \n" \
+            "cmp    %[out], #0x7C00          \n" \
+            "movhi  %[out], #0x7C00          \n" \
+            "and    %[dt],  %[dst],   #0x03E0\n" \
+            "and    %[st],  %[src],   #0x00E0\n" \
+            "add    %[dt],  %[dt],    %[st]  \n" \
+            "cmp    %[dt],  #0x03E0          \n" \
+            "movhi  %[dt],  #0x03E0          \n" \
+            "orr    %[out], %[out],   %[dt]  \n" \
+            "and    %[dt],  %[dst],   #0x001F\n" \
+            "and    %[st],  %[src],   #0x0007\n" \
+            "add    %[dt],  %[dt],    %[st]  \n" \
+            "cmp    %[dt],  #0x001F          \n" \
+            "movhi  %[dt],  #0x001F          \n" \
+            "orr    %[src], %[out],   %[dt]  \n" \
+        : [src] "=r" (uSrc), [st] "=&r" (st), [dt] "=&r" (dt), [out] "=&r" (out) \
+        : [dst] "r" (uDst), "0" (uSrc) : "cc"); \
+}
+
+#endif  //_OP_BLEND_H_
diff --git a/plugins/gpu_senquack/gpu_inner_light.h b/plugins/gpu_senquack/gpu_inner_light.h

new file mode 100644 (file)

index 0000000..b5d8933
--- /dev/null
+++ b/plugins/gpu_senquack/gpu_inner_light.h
@@ -0,0 +1,271 @@
+/***************************************************************************
+*   Copyright (C) 2016 PCSX4ALL Team                                      *
+*   Copyright (C) 2010 Unai                                               *
+*                                                                         *
+*   This program is free software; you can redistribute it and/or modify  *
+*   it under the terms of the GNU General Public License as published by  *
+*   the Free Software Foundation; either version 2 of the License, or     *
+*   (at your option) any later version.                                   *
+*                                                                         *
+*   This program is distributed in the hope that it will be useful,       *
+*   but WITHOUT ANY WARRANTY; without even the implied warranty of        *
+*   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the         *
+*   GNU General Public License for more details.                          *
+*                                                                         *
+*   You should have received a copy of the GNU General Public License     *
+*   along with this program; if not, write to the                         *
+*   Free Software Foundation, Inc.,                                       *
+*   51 Franklin Street, Fifth Floor, Boston, MA 02111-1307 USA.           *
+***************************************************************************/
+
+#ifndef _OP_LIGHT_H_
+#define _OP_LIGHT_H_
+
+//  GPU color operations for lighting calculations
+
+static void SetupLightLUT()
+{
+       // 1024-entry lookup table that modulates 5-bit texture + 5-bit light value.
+       // A light value of 15 does not modify the incoming texture color.
+       // LightLUT[32*32] array is initialized to following values:
+       //  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+       //  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+       //  0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3, 3, 3,
+       //  0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3, 4, 4, 4, 4, 4, 5, 5, 5, 5, 5,
+       //  0, 0, 0, 0, 1, 1, 1, 1, 2, 2, 2, 2, 3, 3, 3, 3, 4, 4, 4, 4, 5, 5, 5, 5, 6, 6, 6, 6, 7, 7, 7, 7,
+       //  0, 0, 0, 0, 1, 1, 1, 2, 2, 2, 3, 3, 3, 4, 4, 4, 5, 5, 5, 5, 6, 6, 6, 7, 7, 7, 8, 8, 8, 9, 9, 9,
+       //  0, 0, 0, 1, 1, 1, 2, 2, 3, 3, 3, 4, 4, 4, 5, 5, 6, 6, 6, 7, 7, 7, 8, 8, 9, 9, 9,10,10,10,11,11,
+       //  0, 0, 0, 1, 1, 2, 2, 3, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 7, 8, 8, 9, 9,10,10,10,11,11,12,12,13,13,
+       //  0, 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9,10,10,11,11,12,12,13,13,14,14,15,15,
+       //  0, 0, 1, 1, 2, 2, 3, 3, 4, 5, 5, 6, 6, 7, 7, 8, 9, 9,10,10,11,11,12,12,13,14,14,15,15,16,16,17,
+       //  0, 0, 1, 1, 2, 3, 3, 4, 5, 5, 6, 6, 7, 8, 8, 9,10,10,11,11,12,13,13,14,15,15,16,16,17,18,18,19,
+       //  0, 0, 1, 2, 2, 3, 4, 4, 5, 6, 6, 7, 8, 8, 9,10,11,11,12,13,13,14,15,15,16,17,17,18,19,19,20,21,
+       //  0, 0, 1, 2, 3, 3, 4, 5, 6, 6, 7, 8, 9, 9,10,11,12,12,13,14,15,15,16,17,18,18,19,20,21,21,22,23,
+       //  0, 0, 1, 2, 3, 4, 4, 5, 6, 7, 8, 8, 9,10,11,12,13,13,14,15,16,17,17,18,19,20,21,21,22,23,24,25,
+       //  0, 0, 1, 2, 3, 4, 5, 6, 7, 7, 8, 9,10,11,12,13,14,14,15,16,17,18,19,20,21,21,22,23,24,25,26,27,
+       //  0, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9,10,11,12,13,14,15,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,
+       //  0, 1, 2, 3, 4, 5, 6, 7, 8, 9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,
+       //  0, 1, 2, 3, 4, 5, 6, 7, 8, 9,10,11,12,13,14,15,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,31,
+       //  0, 1, 2, 3, 4, 5, 6, 7, 9,10,11,12,13,14,15,16,18,19,20,21,22,23,24,25,27,28,29,30,31,31,31,31,
+       //  0, 1, 2, 3, 4, 5, 7, 8, 9,10,11,13,14,15,16,17,19,20,21,22,23,24,26,27,28,29,30,31,31,31,31,31,
+       //  0, 1, 2, 3, 5, 6, 7, 8,10,11,12,13,15,16,17,18,20,21,22,23,25,26,27,28,30,31,31,31,31,31,31,31,
+       //  0, 1, 2, 3, 5, 6, 7, 9,10,11,13,14,15,17,18,19,21,22,23,24,26,27,28,30,31,31,31,31,31,31,31,31,
+       //  0, 1, 2, 4, 5, 6, 8, 9,11,12,13,15,16,17,19,20,22,23,24,26,27,28,30,31,31,31,31,31,31,31,31,31,
+       //  0, 1, 2, 4, 5, 7, 8,10,11,12,14,15,17,18,20,21,23,24,25,27,28,30,31,31,31,31,31,31,31,31,31,31,
+       //  0, 1, 3, 4, 6, 7, 9,10,12,13,15,16,18,19,21,22,24,25,27,28,30,31,31,31,31,31,31,31,31,31,31,31,
+       //  0, 1, 3, 4, 6, 7, 9,10,12,14,15,17,18,20,21,23,25,26,28,29,31,31,31,31,31,31,31,31,31,31,31,31,
+       //  0, 1, 3, 4, 6, 8, 9,11,13,14,16,17,19,21,22,24,26,27,29,30,31,31,31,31,31,31,31,31,31,31,31,31,
+       //  0, 1, 3, 5, 6, 8,10,11,13,15,16,18,20,21,23,25,27,28,30,31,31,31,31,31,31,31,31,31,31,31,31,31,
+       //  0, 1, 3, 5, 7, 8,10,12,14,15,17,19,21,22,24,26,28,29,31,31,31,31,31,31,31,31,31,31,31,31,31,31,
+       //  0, 1, 3, 5, 7, 9,10,12,14,16,18,19,21,23,25,27,29,30,31,31,31,31,31,31,31,31,31,31,31,31,31,31,
+       //  0, 1, 3, 5, 7, 9,11,13,15,16,18,20,22,24,26,28,30,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,
+       //  0, 1, 3, 5, 7, 9,11,13,15,17,19,21,23,25,27,29,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31
+
+       for (int j=0; j < 32; ++j) {
+               for (int i=0; i < 32; ++i) {
+                       int val = i * j / 16;
+                       if (val > 31) val = 31;
+                       gpu_senquack.LightLUT[(j*32) + i] = val;
+               }
+       }
+}
+
+
+////////////////////////////////////////////////////////////////////////////////
+// Create packed Gouraud fixed-pt 8.3:8.3:8.2 rgb triplet
+//
+// INPUT:
+// 'r','g','b' are 8.10 fixed-pt color components (r shown here)
+//     'r' input:  --------------rrrrrrrrXXXXXXXXXX
+//                 ^ bit 31
+// RETURNS:
+//    u32 output:  rrrrrrrrXXXggggggggXXXbbbbbbbbXX
+//                 ^ bit 31
+// Where 'r,g,b' are integer bits of colors, 'X' fixed-pt, and '-' don't care
+////////////////////////////////////////////////////////////////////////////////
+GPU_INLINE u32 gpuPackGouraudCol(u32 r, u32 g, u32 b)
+{
+       // Field masks are built from unsigned constants: 0x07ff<<21 does not fit
+       // in a signed int, so shifting the plain int literal would overflow.
+       const u32 mask_b = 0x03ffu;        // bits  9..0  (8.2)
+       const u32 mask_g = 0x07ffu << 10;  // bits 20..10 (8.3)
+       const u32 mask_r = 0x07ffu << 21;  // bits 31..21 (8.3)
+
+       // Move each 8.10 input into its field, dropping the fraction bits that
+       // do not fit (8 for blue, 7 for green and red).
+       return ((b >>  8) & mask_b)
+            | ((g <<  3) & mask_g)
+            | ((r << 14) & mask_r);
+}
+
+
+////////////////////////////////////////////////////////////////////////////////
+// Create packed increment for Gouraud fixed-pt 8.3:8.3:8.2 rgb triplet
+//
+// INPUT:
+//  Sign-extended 8.10 fixed-pt r,g,b color increment values (only dr is shown)
+//   'dr' input:  ssssssssssssssrrrrrrrrXXXXXXXXXX
+//                ^ bit 31
+// RETURNS:
+//   u32 output:  rrrrrrrrXXXggggggggXXXbbbbbbbbXX
+//                ^ bit 31
+// Where 'r,g,b' are integer bits of colors, 'X' fixed-pt, and 's' sign bits
+//
+// NOTE: The correctness of this code/method has not been fully verified,
+//       having been merely factored out from original code in
+//       poly-drawing functions. Feel free to check/improve it -senquack
+////////////////////////////////////////////////////////////////////////////////
+GPU_INLINE u32 gpuPackGouraudColInc(s32 dr, s32 dg, s32 db)
+{
+       u32 dr_tmp = (u32)(dr << 14)&(0xffffffff<<21);  if (dr < 0) dr_tmp += 1<<21;
+       u32 dg_tmp = (u32)(dg <<  3)&(0xffffffff<<10);  if (dg < 0) dg_tmp += 1<<10;
+       u32 db_tmp = (u32)(db >>  8)&(0xffffffff    );  if (db < 0) db_tmp += 1<< 0;
+       return db_tmp + dg_tmp + dr_tmp;
+}
+
+
+////////////////////////////////////////////////////////////////////////////////
+// Extract bgr555 color from Gouraud u32 fixed-pt 8.3:8.3:8.2 rgb triplet
+//
+// INPUT:
+//  'gCol' input:  rrrrrrrrXXXggggggggXXXbbbbbbbbXX
+//                 ^ bit 31
+// RETURNS:
+//    u16 output:  0bbbbbgggggrrrrr
+//                 ^ bit 16
+// Where 'r,g,b' are integer bits of colors, 'X' fixed-pt, and '0' zero
+////////////////////////////////////////////////////////////////////////////////
+GPU_INLINE uint_fast16_t gpuLightingRGBGeneric(u32 gCol)
+{
+       // Keep only the top 5 bits of each packed component and move them to
+       // their bgr555 positions.
+       const u32 red5   =  gCol >> 27;            // gCol bits 31..27 ->  4..0
+       const u32 green5 = (gCol >> 11) & 0x03E0;  // gCol bits 20..16 ->  9..5
+       const u32 blue5  = (gCol <<  5) & 0x7C00;  // gCol bits  9..5  -> 14..10
+
+       return blue5 | green5 | red5;
+}
+
+
+////////////////////////////////////////////////////////////////////////////////
+// Convert packed Gouraud u32 fixed-pt 8.3:8.3:8.2 rgb triplet in 'gCol'
+//  to padded u32 5.4:5.4:5.4 bgr fixed-pt triplet, suitable for use
+//  with HQ 24-bit lighting/quantization.
+//
+// INPUT:
+//       'gCol' input:  rrrrrrrrXXXggggggggXXXbbbbbbbbXX
+//                      ^ bit 31
+// RETURNS:
+//         u32 output:  000bbbbbXXXX0gggggXXXX0rrrrrXXXX
+//                      ^ bit 31
+//  Where 'X' are fixed-pt bits, '0' zero-padding, and '-' is don't care
+////////////////////////////////////////////////////////////////////////////////
+GPU_INLINE u32 gpuLightingRGB24(u32 gCol)
+{
+       // Each output field is 9 bits wide (5.4) with one zero bit of padding
+       // above it; the component order is swapped from rgb to bgr.
+       const u32 blue  = (gCol << 19) & (0x1FF << 20);  // -> bits 28..20
+       const u32 green = (gCol >>  2) & (0x1FF << 10);  // -> bits 18..10
+       const u32 red   =  gCol >> 23;                   // -> bits  8..0
+
+       return blue | green | red;
+}
+
+
+////////////////////////////////////////////////////////////////////////////////
+// Apply fast (low-precision) 5-bit lighting to bgr555 texture color:
+//
+// INPUT:
+//        'r5','g5','b5' are unsigned 5-bit color values, value of 16
+//          leaves that component of texture unmodified (LUT is tex*light/16)
+//        'uSrc' input:  -bbbbbgggggrrrrr
+//                       ^ bit 16
+// RETURNS:
+//          u16 output:  0bbbbbgggggrrrrr
+// Where 'X' are fixed-pt bits, '0' is zero-padding, and '-' is don't care
+////////////////////////////////////////////////////////////////////////////////
+GPU_INLINE uint_fast16_t gpuLightingTXTGeneric(uint_fast16_t uSrc, u8 r5, u8 g5, u8 b5)
+{
+       return (gpu_senquack.LightLUT[((uSrc&0x7C00)>>5) | b5] << 10) |
+              (gpu_senquack.LightLUT[ (uSrc&0x03E0)     | g5] <<  5) |
+              (gpu_senquack.LightLUT[((uSrc&0x001F)<<5) | r5]      );
+}
+
+
+////////////////////////////////////////////////////////////////////////////////
+// Apply fast (low-precision) 5-bit Gouraud lighting to bgr555 texture color:
+//
+// INPUT:
+//  'gCol' is a packed Gouraud u32 fixed-pt 8.3:8.3:8.2 rgb triplet; only the
+//     upper 5 bits of each component are used, and 16 leaves texture unmodified
+//         gCol input :  rrrrrXXXXXXgggggXXXXXXbbbbbXXXXX
+//                       ^ bit 31
+//        'uSrc' input:  -bbbbbgggggrrrrr
+//                       ^ bit 16
+// RETURNS:
+//          u16 output:  0bbbbbgggggrrrrr
+// Where 'X' are fixed-pt bits, '0' is zero-padding, and '-' is don't care
+////////////////////////////////////////////////////////////////////////////////
+GPU_INLINE uint_fast16_t gpuLightingTXTGouraudGeneric(uint_fast16_t uSrc, u32 gCol)
+{
+       // Top 5 bits of each packed Gouraud component act as the light value.
+       const u32 light_b = (gCol >>  5) & 0x1F;
+       const u32 light_g = (gCol >> 16) & 0x1F;
+       const u32 light_r =  gCol >> 27;
+
+       // LightLUT index is (5-bit texture component << 5) | light value.
+       const u32 idx_b = ((uSrc & 0x7C00) >> 5) | light_b;
+       const u32 idx_g =  (uSrc & 0x03E0)       | light_g;
+       const u32 idx_r = ((uSrc & 0x001F) << 5) | light_r;
+
+       return (gpu_senquack.LightLUT[idx_b] << 10) |
+              (gpu_senquack.LightLUT[idx_g] <<  5) |
+              (gpu_senquack.LightLUT[idx_r]      );
+}
+
+
+////////////////////////////////////////////////////////////////////////////////
+// Apply high-precision 8-bit lighting to bgr555 texture color,
+//  returning a padded u32 5.4:5.4:5.4 bgr fixed-pt triplet
+//  suitable for use with HQ 24-bit lighting/quantization.
+//
+// INPUT:
+//        'r8','g8','b8' are unsigned 8-bit color component values, value of
+//          128 leaves that component of texture unmodified
+//
+//         uSrc input: -bbbbbgggggrrrrr
+//                     ^ bit 16
+// RETURNS:
+//         u32 output: 000bbbbbXXXX0gggggXXXX0rrrrrXXXX
+//                     ^ bit 31
+// Where 'X' are fixed-pt bits, '0' is zero-padding, and '-' is don't care
+////////////////////////////////////////////////////////////////////////////////
+GPU_INLINE u32 gpuLightingTXT24(uint_fast16_t uSrc, u8 r8, u8 g8, u8 b8)
+{
+       uint_fast16_t r1 = uSrc&0x001F;
+       uint_fast16_t g1 = uSrc&0x03E0;
+       uint_fast16_t b1 = uSrc&0x7C00;
+
+       uint_fast16_t r2 = r8;
+       uint_fast16_t g2 = g8;
+       uint_fast16_t b2 = b8;
+
+       u32 r3 = r1 * r2; if (r3 & 0xFFFFF000) r3 = ~0xFFFFF000;
+       u32 g3 = g1 * g2; if (g3 & 0xFFFE0000) g3 = ~0xFFFE0000;
+       u32 b3 = b1 * b2; if (b3 & 0xFFC00000) b3 = ~0xFFC00000;
+
+       return ((r3>> 3)    ) |
+              ((g3>> 8)<<10) |
+              ((b3>>13)<<20);
+}
+
+
+////////////////////////////////////////////////////////////////////////////////
+// Apply high-precision 8-bit lighting to bgr555 texture color in 'uSrc',
+//  returning a padded u32 5.4:5.4:5.4 bgr fixed-pt triplet
+//  suitable for use with HQ 24-bit lighting/quantization.
+//
+// INPUT:
+//       'uSrc' input: -bbbbbgggggrrrrr
+//                     ^ bit 16
+//       'gCol' input: rrrrrrrrXXXggggggggXXXbbbbbbbbXX
+//                     ^ bit 31
+// RETURNS:
+//         u32 output: 000bbbbbXXXX0gggggXXXX0rrrrrXXXX
+//                     ^ bit 31
+// Where 'X' are fixed-pt bits, '0' is zero-padding, and '-' is don't care
+////////////////////////////////////////////////////////////////////////////////
+GPU_INLINE u32 gpuLightingTXT24Gouraud(uint_fast16_t uSrc, u32 gCol)
+{
+       uint_fast16_t r1 = uSrc&0x001F;
+       uint_fast16_t g1 = uSrc&0x03E0;
+       uint_fast16_t b1 = uSrc&0x7C00;
+
+       uint_fast16_t r2 = (gCol>>24) & 0xFF;
+       uint_fast16_t g2 = (gCol>>13) & 0xFF;
+       uint_fast16_t b2 = (gCol>> 2) & 0xFF;
+
+       u32 r3 = r1 * r2; if (r3 & 0xFFFFF000) r3 = ~0xFFFFF000;
+       u32 g3 = g1 * g2; if (g3 & 0xFFFE0000) g3 = ~0xFFFE0000;
+       u32 b3 = b1 * b2; if (b3 & 0xFFC00000) b3 = ~0xFFC00000;
+
+       return ((r3>> 3)    ) |
+              ((g3>> 8)<<10) |
+              ((b3>>13)<<20);
+}
+
+#endif  //_OP_LIGHT_H_
diff --git a/plugins/gpu_senquack/gpu_inner_light_arm.h b/plugins/gpu_senquack/gpu_inner_light_arm.h

new file mode 100644 (file)

index 0000000..550f6b1
--- /dev/null
+++ b/plugins/gpu_senquack/gpu_inner_light_arm.h
@@ -0,0 +1,112 @@
+#ifndef _OP_LIGHT_ARM_H_
+#define _OP_LIGHT_ARM_H_
+
+////////////////////////////////////////////////////////////////////////////////
+// Extract bgr555 color from Gouraud u32 fixed-pt 8.3:8.3:8.2 rgb triplet
+//
+// INPUT:
+//  'gCol' input:  rrrrrrrrXXXggggggggXXXbbbbbbbbXX
+//                 ^ bit 31
+// RETURNS:
+//    u16 output:  0bbbbbgggggrrrrr
+//                 ^ bit 16
+// Where 'r,g,b' are integer bits of colors, 'X' fixed-pt, and '0' zero
+////////////////////////////////////////////////////////////////////////////////
+GPU_INLINE uint_fast16_t gpuLightingRGBARM(u32 gCol)
+{
+       // ARM inline-asm extraction of the top 5 bits of each packed component.
+       // 'out' enters the asm holding the 0x03E0 field mask and leaves holding
+       // the result, hence the read-write earlyclobber ("+&r") constraint.
+       uint_fast16_t out = 0x03E0; // don't need the mask after starting to write output
+       u32 tmp;
+  
+       // Bit pictures below are binary, 16 bits wide.
+       asm ("and %[tmp], %[gCol], %[out]\n\t"              // tmp = 000000bbbbb00000 (gCol bits 9..5)
+            "and %[out], %[out],  %[gCol], lsr #0x0B\n\t"  // out = 000000ggggg00000 (gCol bits 20..16)
+            "orr %[tmp], %[out],  %[tmp],  lsl #0x05\n\t"  // tmp = 0bbbbbggggg00000
+            "orr %[out], %[tmp],  %[gCol], lsr #0x1B\n\t"  // out = 0bbbbbgggggrrrrr (gCol bits 31..27 added)
+            : [out] "+&r" (out), [tmp] "=&r" (tmp)
+            : [gCol] "r"  (gCol)
+            );
+
+       return out;
+}
+
+////////////////////////////////////////////////////////////////////////////////
+// Apply fast (low-precision) 5-bit lighting to bgr555 texture color:
+//
+// INPUT:
+//       'r5','g5','b5' are unsigned 5-bit color values, value of 16
+//         leaves that component of texture unmodified (LUT is tex*light/16)
+//       'uSrc' input:  mbbbbbgggggrrrrr
+//                      ^ bit 16
+// RETURNS:
+//         u16 output:  mbbbbbgggggrrrrr
+// Where 'X' are fixed-pt bits.
+////////////////////////////////////////////////////////////////////////////////
+GPU_INLINE uint_fast16_t gpuLightingTXTARM(uint_fast16_t uSrc, u8 r5, u8 g5, u8 b5)
+{
+       // ARM inline-asm version of the LightLUT texture modulation. Each colour
+       // component is looked up with a byte load at index
+       // (texture component << 5) | light value, and bit 15 of uSrc is copied
+       // into the result. 'out' enters the asm holding the 0x03E0 field mask
+       // (tied input "0") and leaves holding the final pixel.
+       uint_fast16_t out = 0x03E0;
+       u32 db, dg;
+
+       // In the bit pictures (binary, 16 bits wide): lowercase letters are
+       // texture (uSrc) bits, uppercase letters are light-value bits.
+       asm ("and    %[dg],  %[out],    %[src]  \n\t"             // dg  = 000000ggggg00000
+            "orr    %[dg],  %[dg],     %[g5]   \n\t"             // dg  = 000000gggggGGGGG
+            "and    %[db],  %[out],    %[src], lsr #0x05 \n\t"   // db  = 000000bbbbb00000
+            "ldrb   %[dg],  [%[lut],   %[dg]]  \n\t"             // dg  = LUT result, 00000000000ggggg
+            "and    %[out], %[out],    %[src], lsl #0x05 \n\t"   // out = 000000rrrrr00000 (mask consumed here)
+            "orr    %[out], %[out],    %[r5]   \n\t"             // out = 000000rrrrrRRRRR
+            "orr    %[db],  %[db],     %[b5]   \n\t"             // db  = 000000bbbbbBBBBB
+            "ldrb   %[out], [%[lut],   %[out]] \n\t"             // out = LUT result, 00000000000rrrrr
+            "ldrb   %[db],  [%[lut],   %[db]]  \n\t"             // db  = LUT result, 00000000000bbbbb
+            "tst    %[src], #0x8000\n\t"                         // test bit 15 of uSrc (sets flags)
+            "orr    %[out], %[out],    %[dg],  lsl #0x05   \n\t" // out = 000000gggggrrrrr
+            "orrne  %[out], %[out],    #0x8000\n\t"              // copy bit 15 to out if it was set in uSrc
+            "orr    %[out], %[out],    %[db],  lsl #0x0A   \n\t" // out = mbbbbbgggggrrrrr
+            : [out] "=&r" (out), [db] "=&r" (db), [dg] "=&r" (dg)
+            : [r5] "r" (r5), [g5] "r" (g5),  [b5] "r" (b5),
+              [lut] "r" (gpu_senquack.LightLUT), [src] "r" (uSrc), "0" (out)
+            : "cc");
+       return out;
+}
+
+////////////////////////////////////////////////////////////////////////////////
+// Apply fast (low-precision) 5-bit Gouraud lighting to bgr555 texture color:
+//
+// INPUT:
+//  'gCol' is a packed Gouraud u32 fixed-pt 8.3:8.3:8.2 rgb triplet; only the
+//     upper 5 bits of each component are used, and 16 leaves texture unmodified
+//        gCol input :  rrrrrXXXXXXgggggXXXXXXbbbbbXXXXX
+//                      ^ bit 31
+//       'uSrc' input:  mbbbbbgggggrrrrr
+//                      ^ bit 16
+// RETURNS:
+//         u16 output:  mbbbbbgggggrrrrr
+// Where 'X' are fixed-pt bits, '0' is zero-padding, and '-' is don't care
+////////////////////////////////////////////////////////////////////////////////
+GPU_INLINE uint_fast16_t gpuLightingTXTGouraudARM(uint_fast16_t uSrc, u32 gCol)
+{
+       uint_fast16_t out = 0x03E0; // don't need the mask after starting to write output
+       u32 db,dg,gtmp;
+
+       // Using `g` for src, `G` for dest
+       asm ("and    %[dg],  %[out],  %[src]   \n\t"           // dg holds 0x000000ggggg00000
+            "and    %[gtmp],%[out],  %[gCol], lsr #0x0B \n\t" // gtmp holds 0x000000GGGGG00000
+            "and    %[db],  %[out],  %[src],  lsr #0x05 \n\t" // db holds 0x000000bbbbb00000
+            "orr    %[dg],  %[dg],   %[gtmp], lsr #0x05 \n\t" // dg holds 0x000000gggggGGGGG
+            "and    %[gtmp],%[out],  %[gCol]  \n\t"           // gtmp holds 0x000000BBBBB00000
+            "ldrb   %[dg],  [%[lut], %[dg]]   \n\t"           // dg holds result 0x00000000000ggggg
+            "and    %[out], %[out],  %[src],  lsl #0x05 \n\t" // out holds 0x000000rrrrr00000
+            "orr    %[out], %[out],  %[gCol], lsr #0x1B \n\t" // out holds 0x000000rrrrrRRRRR
+            "orr    %[db],  %[db],   %[gtmp], lsr #0x05 \n\t" // db holds 0x000000bbbbbBBBBB
+            "ldrb   %[out], [%[lut], %[out]]  \n\t"           // out holds result 0x00000000000rrrrr
+            "ldrb   %[db],  [%[lut], %[db]]   \n\t"           // db holds result 0x00000000000bbbbb
+            "tst    %[src], #0x8000\n\t"                      // check whether msb was set on uSrc
+            "orr    %[out], %[out],  %[dg],   lsl #0x05 \n\t" // out holds 0x000000gggggrrrrr
+            "orrne  %[out], %[out],  #0x8000\n\t"             // add msb to out if set on uSrc
+            "orr    %[out], %[out],  %[db],   lsl #0x0A \n\t" // out holds 0xmbbbbbgggggrrrrr
+            : [out] "=&r" (out), [db] "=&r" (db), [dg] "=&r" (dg),
+              [gtmp] "=&r" (gtmp) \
+            : [gCol] "r" (gCol), [lut] "r" (gpu_senquack.LightLUT), "0" (out), [src] "r" (uSrc)
+            : "cc");
+
+       return out;
+}
+
+#endif  //_OP_LIGHT_ARM_H_
diff --git a/plugins/gpu_senquack/gpu_inner_quantization.h b/plugins/gpu_senquack/gpu_inner_quantization.h

new file mode 100644 (file)

index 0000000..6432d03
--- /dev/null
+++ b/plugins/gpu_senquack/gpu_inner_quantization.h
@@ -0,0 +1,108 @@
+/***************************************************************************
+*   Copyright (C) 2016 PCSX4ALL Team                                      *
+*                                                                         *
+*   This program is free software; you can redistribute it and/or modify  *
+*   it under the terms of the GNU General Public License as published by  *
+*   the Free Software Foundation; either version 2 of the License, or     *
+*   (at your option) any later version.                                   *
+*                                                                         *
+*   This program is distributed in the hope that it will be useful,       *
+*   but WITHOUT ANY WARRANTY; without even the implied warranty of        *
+*   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the         *
+*   GNU General Public License for more details.                          *
+*                                                                         *
+*   You should have received a copy of the GNU General Public License     *
+*   along with this program; if not, write to the                         *
+*   Free Software Foundation, Inc.,                                       *
+*   51 Franklin Street, Fifth Floor, Boston, MA 02111-1307 USA.           *
+***************************************************************************/
+
+#ifndef _OP_DITHER_H_
+#define _OP_DITHER_H_
+
+static void SetupDitheringConstants()
+{
+       // Initialize Dithering Constants
+       // The screen is divided into 8x8 chunks and sub-unitary noise is applied
+       // using the following matrix. This ensures that data lost in color
+       // quantization will be added back to the image 'by chance' in predictable
+       // patterns that are naturally 'smoothed' by your sight when viewed from a
+       // certain distance.
+       //
+       // http://caca.zoy.org/study/index.html
+       //
+       // Shading colors are encoded in 4.5, and then are quantitized to 5.0,
+       // DitherMatrix constants reflect that.
+
+       static const u8 DitherMatrix[] = {
+                0, 32,  8, 40,  2, 34, 10, 42,
+               48, 16, 56, 24, 50, 18, 58, 26,
+               12, 44,  4, 36, 14, 46,  6, 38,
+               60, 28, 52, 20, 62, 30, 54, 22,
+                3, 35, 11, 43,  1, 33,  9, 41,
+               51, 19, 59, 27, 49, 17, 57, 25,
+               15, 47,  7, 39, 13, 45,  5, 37,
+               63, 31, 55, 23, 61, 29, 53, 21
+       };
+
+       int i, j;
+       for (i = 0; i < 8; i++)
+       {
+               for (j = 0; j < 8; j++)
+               {
+                       u16 offset = (i << 3) | j;
+
+                       u32 component = ((DitherMatrix[offset] + 1) << 4) / 65; //[5.5] -> [5]
+
+                       // XXX - senquack - hack Dec 2016
+                       //  Until JohnnyF gets the time to work further on dithering,
+                       //   force lower bit of component to 0. This fixes grid pattern
+                       //   affecting quality of dithered image, as well as loss of
+                       //   detail in dark areas. With lower bit unset like this, existing
+                       //   27-bit accuracy of dithering math is unneeded, could be 24-bit.
+                       //   Is 8x8 matrix overkill as a result, can we use 4x4?
+                       component &= ~1;
+
+                       gpu_senquack.DitherMatrix[offset] = (component)
+                                                     | (component << 10)
+                                                     | (component << 20);
+               }
+       }
+}
+
+////////////////////////////////////////////////////////////////////////////////
+// Convert padded u32 5.4:5.4:5.4 bgr fixed-pt triplet to final bgr555 color,
+//  applying dithering if specified by template parameter.
+//
+// INPUT:
+//     'uSrc24' input: 000bbbbbXXXX0gggggXXXX0rrrrrXXXX
+//                     ^ bit 31
+//       'pDst' is a pointer to destination framebuffer pixel, used
+//         to determine which DitherMatrix[] entry to apply.
+// RETURNS:
+//         u16 output: 0bbbbbgggggrrrrr
+//                     ^ bit 16
+// Where 'X' are fixed-pt bits, '0' is zero-padding, and '-' is don't care
+////////////////////////////////////////////////////////////////////////////////
+template <int DITHER>
+GPU_INLINE u16 gpuColorQuantization24(u32 uSrc24, const u16 *pDst)
+{
+       if (DITHER)
+       {
+               u16 fbpos  = (u32)(pDst - gpu_senquack.vram);
+               u16 offset = ((fbpos & (0x7 << 10)) >> 7) | (fbpos & 0x7);
+
+               //clean overflow flags and add
+               uSrc24 = (uSrc24 & 0x1FF7FDFF) + gpu_senquack.DitherMatrix[offset];
+
+               if (uSrc24 & (1<< 9)) uSrc24 |= (0x1FF    );
+               if (uSrc24 & (1<<19)) uSrc24 |= (0x1FF<<10);
+               if (uSrc24 & (1<<29)) uSrc24 |= (0x1FF<<20);
+       }
+
+       return ((uSrc24>> 4) & (0x1F    ))
+            | ((uSrc24>> 9) & (0x1F<<5 ))
+            | ((uSrc24>>14) & (0x1F<<10));
+}
+
+#endif //_OP_DITHER_H_
diff --git a/plugins/gpu_senquack/gpu_raster_image.h b/plugins/gpu_senquack/gpu_raster_image.h

new file mode 100644 (file)

index 0000000..8e8064c
--- /dev/null
+++ b/plugins/gpu_senquack/gpu_raster_image.h
@@ -0,0 +1,220 @@
+/***************************************************************************
+ *   Copyright (C) 2010 PCSX4ALL Team                                      *
+ *   Copyright (C) 2010 Unai                                               *
+ *                                                                         *
+ *   This program is free software; you can redistribute it and/or modify  *
+ *   it under the terms of the GNU General Public License as published by  *
+ *   the Free Software Foundation; either version 2 of the License, or     *
+ *   (at your option) any later version.                                   *
+ *                                                                         *
+ *   This program is distributed in the hope that it will be useful,       *
+ *   but WITHOUT ANY WARRANTY; without even the implied warranty of        *
+ *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the         *
+ *   GNU General Public License for more details.                          *
+ *                                                                         *
+ *   You should have received a copy of the GNU General Public License     *
+ *   along with this program; if not, write to the                         *
+ *   Free Software Foundation, Inc.,                                       *
+ *   51 Franklin Street, Fifth Floor, Boston, MA 02111-1307 USA.           *
+ ***************************************************************************/
+
+#ifndef __GPU_UNAI_GPU_RASTER_IMAGE_H__
+#define __GPU_UNAI_GPU_RASTER_IMAGE_H__
+
+///////////////////////////////////////////////////////////////////////////////
+#ifndef USE_GPULIB
+void gpuLoadImage(PtrUnion packet)
+{
+       u16 x0, y0, w0, h0;
+       x0 = packet.U2[2] & 1023;
+       y0 = packet.U2[3] & 511;
+       w0 = packet.U2[4];
+       h0 = packet.U2[5];
+
+       if ((y0 + h0) > FRAME_HEIGHT)
+       {
+               h0 = FRAME_HEIGHT - y0;
+       }
+
+       gpu_senquack.dma.FrameToWrite = ((w0)&&(h0));
+
+       gpu_senquack.dma.px = 0;
+       gpu_senquack.dma.py = 0;
+       gpu_senquack.dma.x_end = w0;
+       gpu_senquack.dma.y_end = h0;
+       gpu_senquack.dma.pvram = &((u16*)gpu_senquack.vram)[x0+(y0*1024)];
+
+       gpu_senquack.GPU_GP1 |= 0x08000000;
+}
+#endif // !USE_GPULIB
+
+///////////////////////////////////////////////////////////////////////////////
+#ifndef USE_GPULIB
+void gpuStoreImage(PtrUnion packet)
+{
+       u16 x0, y0, w0, h0;
+       x0 = packet.U2[2] & 1023;
+       y0 = packet.U2[3] & 511;
+       w0 = packet.U2[4];
+       h0 = packet.U2[5];
+
+       if ((y0 + h0) > FRAME_HEIGHT)
+       {
+               h0 = FRAME_HEIGHT - y0;
+       }
+       gpu_senquack.dma.FrameToRead = ((w0)&&(h0));
+
+       gpu_senquack.dma.px = 0;
+       gpu_senquack.dma.py = 0;
+       gpu_senquack.dma.x_end = w0;
+       gpu_senquack.dma.y_end = h0;
+       gpu_senquack.dma.pvram = &((u16*)gpu_senquack.vram)[x0+(y0*1024)];
+       
+       gpu_senquack.GPU_GP1 |= 0x08000000;
+}
+#endif // !USE_GPULIB
+
+// Copies a w0 x h0 rectangle inside vram from (x0,y0) to (x1,y1).
+// Packet layout as read below: U2[2],U2[3] = source x,y; U2[4],U2[5] =
+// destination x,y (all masked to 0..1023 / 0..511); U2[6],U2[7] = width,
+// height. Nothing is done when source and destination coincide or when
+// either dimension is zero. All copy loops walk forward (ascending
+// addresses), row by row.
+void gpuMoveImage(PtrUnion packet)
+{
+       u32 x0, y0, x1, y1;
+       s32 w0, h0;
+       x0 = packet.U2[2] & 1023;
+       y0 = packet.U2[3] & 511;
+       x1 = packet.U2[4] & 1023;
+       y1 = packet.U2[5] & 511;
+       w0 = packet.U2[6];
+       h0 = packet.U2[7];
+
+       if( (x0==x1) && (y0==y1) ) return;
+       if ((w0<=0) || (h0<=0)) return;
+       
+       #ifdef ENABLE_GPU_LOG_SUPPORT
+               fprintf(stdout,"gpuMoveImage(x0=%u,y0=%u,x1=%u,y1=%u,w0=%d,h0=%d)\n",x0,y0,x1,y1,w0,h0);
+       #endif
+       
+       // Case 1: source or destination rectangle crosses the right/bottom edge.
+       // Copy pixel by pixel, wrapping both coordinates around 1024 x 512.
+       if (((y0+h0)>512)||((x0+w0)>1024)||((y1+h0)>512)||((x1+w0)>1024))
+       {
+               u16 *psxVuw=gpu_senquack.vram;
+               s32 i,j;
+           for(j=0;j<h0;j++)
+                for(i=0;i<w0;i++)
+                 psxVuw [(1024*((y1+j)&511))+((x1+i)&0x3ff)]=
+                  psxVuw[(1024*((y0+j)&511))+((x0+i)&0x3ff)];
+       }
+       // Case 2: an odd x on either side -- copy 16-bit pixels one at a time.
+       else if ((x0&1)||(x1&1))
+       {
+               u16 *lpDst, *lpSrc;
+               lpDst = lpSrc = (u16*)gpu_senquack.vram;
+               lpSrc += FRAME_OFFSET(x0, y0);
+               lpDst += FRAME_OFFSET(x1, y1);
+               // x1 is reused as the skip from the end of one row to the start
+               // of the next; x0 is reused as the per-row pixel counter.
+               x1 = FRAME_WIDTH - w0;
+               do {
+                       x0=w0;
+                       do { *lpDst++ = *lpSrc++; } while (--x0);
+                       lpDst += x1;
+                       lpSrc += x1;
+               } while (--h0);
+       }
+       // Case 3: both x even -- copy two pixels at a time through u32 pointers.
+       // NOTE(review): this reads/writes the u16 vram through u32*; presumably
+       // the build tolerates the aliasing and vram is 4-byte aligned -- confirm.
+       else
+       {
+               u32 *lpDst, *lpSrc;
+               lpDst = lpSrc = (u32*)(void*)gpu_senquack.vram;
+               lpSrc += ((FRAME_OFFSET(x0, y0))>>1);
+               lpDst += ((FRAME_OFFSET(x1, y1))>>1);
+               if (w0&1)
+               {
+                       // Odd width: w0 pixel pairs plus one trailing 16-bit pixel
+                       // per row. x1 becomes the row skip in u32 units.
+                       x1 = (FRAME_WIDTH - w0 +1)>>1;
+                       w0>>=1;
+                       if (!w0) {
+                               // Width was exactly 1: no pairs, only the single pixel.
+                               // Handled separately because the pair loop below always
+                               // executes at least once.
+                               do {
+                                       *((u16*)lpDst) = *((u16*)lpSrc);
+                                       lpDst += x1;
+                                       lpSrc += x1;
+                               } while (--h0);
+                       } else
+                       do {
+                               x0=w0;
+                               do { *lpDst++ = *lpSrc++; } while (--x0);
+                               *((u16*)lpDst) = *((u16*)lpSrc);
+                               lpDst += x1;
+                               lpSrc += x1;
+                       } while (--h0);
+               }
+               else
+               {
+                       // Even width: whole rows of pixel pairs.
+                       x1 = (FRAME_WIDTH - w0)>>1;
+                       w0>>=1;
+                       do {
+                               x0=w0;
+                               do { *lpDst++ = *lpSrc++; } while (--x0);
+                               lpDst += x1;
+                               lpSrc += x1;
+                       } while (--h0);
+               }
+       }
+}
+
+void gpuClearImage(PtrUnion packet)
+{
+       s32   x0, y0, w0, h0;
+       x0 = packet.S2[2];
+       y0 = packet.S2[3];
+       w0 = packet.S2[4] & 0x3ff;
+       h0 = packet.S2[5] & 0x3ff;
+        
+       w0 += x0;
+       if (x0 < 0) x0 = 0;
+       if (w0 > FRAME_WIDTH) w0 = FRAME_WIDTH;
+       w0 -= x0;
+       if (w0 <= 0) return;
+       h0 += y0;
+       if (y0 < 0) y0 = 0;
+       if (h0 > FRAME_HEIGHT) h0 = FRAME_HEIGHT;
+       h0 -= y0;
+       if (h0 <= 0) return;
+
+       #ifdef ENABLE_GPU_LOG_SUPPORT
+               fprintf(stdout,"gpuClearImage(x0=%d,y0=%d,w0=%d,h0=%d)\n",x0,y0,w0,h0);
+       #endif
+       
+       if (x0&1)
+       {
+               u16* pixel = (u16*)gpu_senquack.vram + FRAME_OFFSET(x0, y0);
+               u16 rgb = GPU_RGB16(packet.U4[0]);
+               y0 = FRAME_WIDTH - w0;
+               do {
+                       x0=w0;
+                       do { *pixel++ = rgb; } while (--x0);
+                       pixel += y0;
+               } while (--h0);
+       }
+       else
+       {
+               u32* pixel = (u32*)gpu_senquack.vram + ((FRAME_OFFSET(x0, y0))>>1);
+               u32 rgb = GPU_RGB16(packet.U4[0]);
+               rgb |= (rgb<<16);
+               if (w0&1)
+               {
+                       y0 = (FRAME_WIDTH - w0 +1)>>1;
+                       w0>>=1;
+                       do {
+                               x0=w0;
+                               do { *pixel++ = rgb; } while (--x0);
+                               *((u16*)pixel) = (u16)rgb;
+                               pixel += y0;
+                       } while (--h0);
+               }
+               else
+               {
+                       y0 = (FRAME_WIDTH - w0)>>1;
+                       w0>>=1;
+                       do {
+                               x0=w0;
+                               do { *pixel++ = rgb; } while (--x0);
+                               pixel += y0;
+                       } while (--h0);
+               }
+       }
+}
+
+#endif /* __GPU_UNAI_GPU_RASTER_IMAGE_H__ */
diff --git a/plugins/gpu_senquack/gpu_raster_line.h b/plugins/gpu_senquack/gpu_raster_line.h

new file mode 100644 (file)

index 0000000..4dd99a6
--- /dev/null
+++ b/plugins/gpu_senquack/gpu_raster_line.h
@@ -0,0 +1,720 @@
+/***************************************************************************
+*   Copyright (C) 2010 PCSX4ALL Team                                      *
+*   Copyright (C) 2010 Unai                                               *
+*   Copyright (C) 2016 Senquack (dansilsby <AT> gmail <DOT> com)          *
+*                                                                         *
+*   This program is free software; you can redistribute it and/or modify  *
+*   it under the terms of the GNU General Public License as published by  *
+*   the Free Software Foundation; either version 2 of the License, or     *
+*   (at your option) any later version.                                   *
+*                                                                         *
+*   This program is distributed in the hope that it will be useful,       *
+*   but WITHOUT ANY WARRANTY; without even the implied warranty of        *
+*   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the         *
+*   GNU General Public License for more details.                          *
+*                                                                         *
+*   You should have received a copy of the GNU General Public License     *
+*   along with this program; if not, write to the                         *
+*   Free Software Foundation, Inc.,                                       *
+*   51 Franklin Street, Fifth Floor, Boston, MA 02111-1307 USA.           *
+***************************************************************************/
+
+#ifndef __GPU_UNAI_GPU_RASTER_LINE_H__
+#define __GPU_UNAI_GPU_RASTER_LINE_H__
+
+///////////////////////////////////////////////////////////////////////////////
+//  GPU internal line drawing functions
+//
+// Rewritten October 2016 by senquack:
+//  Instead of one pixel at a time, lines are now drawn in runs of pixels,
+//  whether vertical, horizontal, or diagonal. A new inner driver
+//  'gpuPixelSpanFn' is used, as well as an enhanced Bresenham run-slice
+//  algorithm. For more information, see the following:
+//
+//  Michael Abrash - Graphics Programming Black Book
+//  Chapters 35 - 36 (does not implement diagonal runs)
+//  http://www.drdobbs.com/parallel/graphics-programming-black-book/184404919
+//  http://www.jagregory.com/abrash-black-book/
+//
+//  Article by Andrew Delong (does not implement diagonal runs)
+//  http://timetraces.ca/nw/drawline.htm
+//
+//  'Run-Based Multi-Point Line Drawing' by Eun Jae Lee & Larry F. Hodges
+//  https://smartech.gatech.edu/bitstream/handle/1853/3632/93-22.pdf
+//  Provided the idea of doing a half-octant transform allowing lines with
+//  slopes between 0.5 and 2.0 (diagonal runs of pixels) to be handled
+//  identically to the traditional horizontal/vertical run-slice method.
+
+// Fractional bits for line math (16.16 fixed point); gpuDrawLineG's clipping shifts by it.
+// NOTE: Gouraud colors used by gpuPixelSpanFn can use a different precision.
+#define GPU_LINE_FIXED_BITS 16
+
+// If defined, gpuDrawLineG replaces most divisions (clipping, color increments)
+// with a fixed-point multiply-by-inverse. With enough accuracy, this should be OK.
+#define USE_LINES_ALL_FIXED_PT_MATH
+
+// Flat-shaded line: takes endpoints from packet.S2[2..5] and the color from
+// packet.U4[0], clips the segment to gpu_senquack.DrawingArea, then emits it
+// as runs of pixels through the supplied gpuPixelSpanDriver.
+void gpuDrawLineF(PtrUnion packet, const PSD gpuPixelSpanDriver)
+{
+       int x0, y0, x1, y1;
+       int dx, dy;
+
+       // sx, dst_depth and dst_stride are signed so their products can be negative
+       ptrdiff_t sx;  // X direction: +1 when x0 < x1, -1 when x0 > x1, 0 if vertical
+       const ptrdiff_t dst_depth  = FRAME_BYTES_PER_PIXEL; // PSX: 2 bytes per pixel
+       const ptrdiff_t dst_stride = FRAME_BYTE_STRIDE;     // PSX: 2048 bytes per framebuffer line
+
+       // Clip region: xmax/ymax seem to normally be one *past* the rightmost/
+       //  bottommost pixels of the draw area. Since we render every pixel between
+       //  and including both line endpoints, subtract one from xmax/ymax.
+       const int xmin = gpu_senquack.DrawingArea[0];
+       const int ymin = gpu_senquack.DrawingArea[1];
+       const int xmax = gpu_senquack.DrawingArea[2] - 1;
+       const int ymax = gpu_senquack.DrawingArea[3] - 1;
+
+       x0 = GPU_EXPANDSIGN(packet.S2[2]) + gpu_senquack.DrawingOffset[0];
+       y0 = GPU_EXPANDSIGN(packet.S2[3]) + gpu_senquack.DrawingOffset[1];
+       x1 = GPU_EXPANDSIGN(packet.S2[4]) + gpu_senquack.DrawingOffset[0];
+       y1 = GPU_EXPANDSIGN(packet.S2[5]) + gpu_senquack.DrawingOffset[1];
+
+       // Always draw top to bottom: swap the endpoints if needed so y0 <= y1
+       if (y0 > y1) {
+               SwapValues(y0, y1);
+               SwapValues(x0, x1);
+       }
+
+       // Is line totally outside Y clipping range?
+       if (y0 > ymax || y1 < ymin) return;
+
+       dx = x1 - x0;
+       dy = y1 - y0;
+
+       // X-axis range check: reject the line when |dx| >= CHKMAX_X (original note:
+       // max X distance is 1023; PSX hardware will not render anything beyond it)
+       // NOTE: We'll check y coord range further below
+       if (dx >= CHKMAX_X || dx <= -CHKMAX_X)
+               return;
+
+       // Y-axis range check and clipping
+       if (dy) {
+               // Y-axis range check: dy is non-negative here, so only the upper bound
+               // CHKMAX_Y is tested (original note: max Y distance on PSX is 511)
+               if (dy >= CHKMAX_Y)
+                       return;
+
+               // y0 < y1 here: move out-of-range endpoints along the line to ymin/ymax
+               if (y0 < ymin) {
+                       x0 += GPU_FAST_DIV(((ymin - y0) * dx), dy);
+                       y0 = ymin;
+               }
+               if (y1 > ymax) {
+                       x1 += GPU_FAST_DIV(((ymax - y1) * dx), dy);
+                       y1 = ymax;
+               }
+
+               // Recompute in case clipping occurred:
+               dx = x1 - x0;
+               dy = y1 - y0;
+       }
+
+       // Check X clipping range, set 'sx' x-direction variable
+       if (dx == 0) {
+               // Is vertical line totally outside X clipping range?
+               if (x0 < xmin || x0 > xmax)
+                       return;
+               sx = 0;
+       } else {
+               if (dx > 0) {
+                       // x0 is leftmost coordinate
+                       if (x0 > xmax) return; // Both points outside X clip range
+
+                       if (x0 < xmin) {
+                               if (x1 < xmin) return; // Both points outside X clip range
+                               y0 += GPU_FAST_DIV(((xmin - x0) * dy), dx);
+                               x0 = xmin;
+                       }
+
+                       if (x1 > xmax) {
+                               y1 += GPU_FAST_DIV(((xmax - x1) * dy), dx);
+                               x1 = xmax;
+                       }
+
+                       sx = +1;
+                       dx = x1 - x0; // Get final value, which should also be absolute value
+               } else {
+                       // x1 is leftmost coordinate
+                       if (x1 > xmax) return; // Both points outside X clip range
+
+                       if (x1 < xmin) {
+                               if (x0 < xmin) return; // Both points outside X clip range
+
+                               y1 += GPU_FAST_DIV(((xmin - x1) * dy), dx);
+                               x1 = xmin;
+                       }
+
+                       if (x0 > xmax) {
+                               y0 += GPU_FAST_DIV(((xmax - x0) * dy), dx);
+                               x0 = xmax;
+                       }
+
+                       sx = -1;
+                       dx = x0 - x1; // Get final value, which should also be absolute value
+               }
+
+               // Recompute in case clipping occurred:
+               dy = y1 - y0;
+       }
+
+       // IMPORTANT: dx,dy should now contain their absolute values
+
+       int min_length,    // Minimum length of a pixel run
+           start_length,  // Length of first run
+           end_length,    // Length of last run
+           err_term,      // Cumulative error to determine when to draw longer run
+           err_adjup,     // Increment to err_term for each run drawn
+           err_adjdown;   // Subtract this from err_term after drawing longer run
+
+       // Color to draw with (16 bits, highest of which is unset mask bit)
+       uintptr_t col16 = GPU_RGB16(packet.U4[0]);
+
+       // We use u8 pointers even though PS1 has u16 framebuffer.
+       //  This allows pixel-drawing functions to increment dst pointer
+       //  directly by the passed 'incr' value, not having to shift it first.
+       u8 *dst = (u8*)gpu_senquack.vram + y0 * dst_stride + x0 * dst_depth;
+
+       // SPECIAL CASE: Vertical line (or single point): dy+1 pixels, one per row
+       if (dx == 0) {
+               gpuPixelSpanDriver(dst, col16, dst_stride, dy+1);
+               return;
+       }
+
+       // SPECIAL CASE: Horizontal line: dx+1 pixels stepping in the sx direction
+       if (dy == 0) {
+               gpuPixelSpanDriver(dst, col16, sx * dst_depth, dx+1);
+               return;
+       }
+
+       // SPECIAL CASE: Exact diagonal (dx == dy): one row and one column per pixel
+       if (dx == dy) {
+               gpuPixelSpanDriver(dst, col16, dst_stride + (sx * dst_depth), dy+1);
+               return;
+       }
+
+       int       major, minor;             // Major axis, minor axis
+       ptrdiff_t incr_major, incr_minor;   // Ptr increment for each step along axis
+
+       if (dx > dy) {
+               major = dx;
+               minor = dy;
+       } else {
+               major = dy;
+               minor = dx;
+       }
+
+       // Diagonal runs when major < 2*minor, otherwise horizontal/vertical runs
+       if (major < (2 * minor)) {
+               // Diagonal runs, so perform half-octant transformation
+               minor = major - minor;
+
+               // Advance diagonally when drawing runs
+               incr_major = dst_stride + (sx * dst_depth);
+
+               // After drawing each run, correct for over-advance along minor axis
+               if (dx > dy)
+                       incr_minor = -dst_stride;
+               else
+                       incr_minor = -sx * dst_depth;
+       } else {
+               // Horizontal or vertical runs
+               if (dx > dy) {
+                       incr_major = sx * dst_depth;
+                       incr_minor = dst_stride;
+               } else {
+                       incr_major = dst_stride;
+                       incr_minor = sx * dst_depth;
+               }
+       }
+
+       if (minor > 1) {
+               // Minimum number of pixels each run
+               min_length = major / minor;
+
+               // Initial error term; reflects an initial step of 0.5 along minor axis
+               err_term = (major % minor) - (minor * 2);
+
+               // Increment err_term this much each step along minor axis; when
+               //  err_term crosses zero, draw longer pixel run.
+               err_adjup = (major % minor) * 2;
+       } else {
+               min_length = major;
+               err_term = 0;
+               err_adjup = 0;
+       }
+
+       // Error term adjustment when err_term turns over; used to factor
+       //  out the major-axis step made at that time
+       err_adjdown = minor * 2;
+
+       // The initial and last runs are partial, because minor axis advances
+       //  only 0.5 for these runs, rather than 1. Each is half a full run,
+       //  plus the initial pixel.
+       start_length = end_length = (min_length / 2) + 1;
+
+       if (min_length & 1) {
+               // If there is an odd number of pixels per run, we have 1 pixel that
+               //  can't be allocated to either the initial or last partial run, so
+               //  we'll add 0.5 to err_term so that this pixel will be handled
+               //  by the normal full-run loop
+               err_term += minor;
+       } else {
+               // If the minimum run length is even and there's no fractional advance,
+               // we have one pixel that could go to either the initial or last
+               // partial run, which we arbitrarily allocate to the last run
+               if (err_adjup == 0)
+                       start_length--; // Leave out the extra pixel at the start
+       }
+
+       // First run of pixels; the driver's return value becomes the next dst
+       dst = gpuPixelSpanDriver(dst, col16, incr_major, start_length);
+       dst += incr_minor;
+
+       // Middle runs of pixels: one run per remaining step along the minor axis
+       while (--minor > 0) {
+               int run_length = min_length;
+               err_term += err_adjup;
+
+               // If err_term passed 0, reset it and draw longer run
+               if (err_term > 0) {
+                       err_term -= err_adjdown;
+                       run_length++;
+               }
+
+               dst = gpuPixelSpanDriver(dst, col16, incr_major, run_length);
+               dst += incr_minor;
+       }
+
+       // Final run of pixels
+       gpuPixelSpanDriver(dst, col16, incr_major, end_length);
+}
+
+// Gouraud-shaded line: endpoints come from packet.S2[2..3] and S2[6..7], the
+// endpoint colors from packet.U4[0] and U4[2]; position and color are clipped
+// together and the interpolation state is passed to the driver via &gcol.
+void gpuDrawLineG(PtrUnion packet, const PSD gpuPixelSpanDriver)
+{
+       int x0, y0, x1, y1;
+       int dx, dy, dr, dg, db;
+       u32 r0, g0, b0, r1, g1, b1;
+
+       // sx, dst_depth and dst_stride are signed so their products can be negative
+       ptrdiff_t sx;  // X direction: +1 when x0 < x1, -1 when x0 > x1, 0 if vertical
+       const ptrdiff_t dst_depth  = FRAME_BYTES_PER_PIXEL; // PSX: 2 bytes per pixel
+       const ptrdiff_t dst_stride = FRAME_BYTE_STRIDE;     // PSX: 2048 bytes per framebuffer line
+
+       // Clip region: xmax/ymax seem to normally be one *past* the rightmost/
+       //  bottommost pixels of the draw area. We'll render every pixel between
+       //  and including both line endpoints, so subtract one from xmax/ymax.
+       const int xmin = gpu_senquack.DrawingArea[0];
+       const int ymin = gpu_senquack.DrawingArea[1];
+       const int xmax = gpu_senquack.DrawingArea[2] - 1;
+       const int ymax = gpu_senquack.DrawingArea[3] - 1;
+
+       x0 = GPU_EXPANDSIGN(packet.S2[2]) + gpu_senquack.DrawingOffset[0];
+       y0 = GPU_EXPANDSIGN(packet.S2[3]) + gpu_senquack.DrawingOffset[1];
+       x1 = GPU_EXPANDSIGN(packet.S2[6]) + gpu_senquack.DrawingOffset[0];
+       y1 = GPU_EXPANDSIGN(packet.S2[7]) + gpu_senquack.DrawingOffset[1];
+
+       u32 col0 = packet.U4[0];
+       u32 col1 = packet.U4[2];
+
+       // Always draw top to bottom: swap endpoints and their colors so y0 <= y1
+       if (y0 > y1) {
+               SwapValues(y0, y1);
+               SwapValues(x0, x1);
+               SwapValues(col0, col1);
+       }
+
+       // Is line totally outside Y clipping range?
+       if (y0 > ymax || y1 < ymin) return;
+
+       // If defined, Gouraud colors are fixed-point 5.11, otherwise they are 8.16
+       // (This is only beneficial if using SIMD-optimized pixel driver)
+#ifdef GPU_GOURAUD_LOW_PRECISION
+       r0 = (col0 >> 3) & 0x1f;  g0 = (col0 >> 11) & 0x1f;  b0 = (col0 >> 19) & 0x1f;
+       r1 = (col1 >> 3) & 0x1f;  g1 = (col1 >> 11) & 0x1f;  b1 = (col1 >> 19) & 0x1f;
+#else
+       r0 = col0 & 0xff;  g0 = (col0 >> 8) & 0xff;  b0 = (col0 >> 16) & 0xff;
+       r1 = col1 & 0xff;  g1 = (col1 >> 8) & 0xff;  b1 = (col1 >> 16) & 0xff;
+#endif
+
+       dx = x1 - x0;
+       dy = y1 - y0;
+       dr = r1 - r0;
+       dg = g1 - g0;
+       db = b1 - b0;
+
+       // X-axis range check: reject the line when |dx| >= CHKMAX_X (original note:
+       // max X distance is 1023; PSX hardware will not render anything beyond it)
+       // NOTE: We'll check y coord range further below
+       if (dx >= CHKMAX_X || dx <= -CHKMAX_X)
+               return;
+
+       // Y-axis range check and clipping
+       if (dy) {
+               // Y-axis range check: dy is non-negative here, so only the upper bound
+               // CHKMAX_Y is tested (original note: max Y distance on PSX is 511)
+               if (dy >= CHKMAX_Y)
+                       return;
+
+               // y0 < y1 here: move out-of-range endpoints (and colors) to ymin/ymax
+               if (y0 < ymin) {
+#ifdef USE_LINES_ALL_FIXED_PT_MATH
+                       s32 factor = GPU_FAST_DIV(((ymin - y0) << GPU_LINE_FIXED_BITS), dy);
+                       x0 += (dx * factor) >> GPU_LINE_FIXED_BITS;
+                       r0 += (dr * factor) >> GPU_LINE_FIXED_BITS;
+                       g0 += (dg * factor) >> GPU_LINE_FIXED_BITS;
+                       b0 += (db * factor) >> GPU_LINE_FIXED_BITS;
+#else
+                       x0 += (ymin - y0) * dx / dy;
+                       r0 += (ymin - y0) * dr / dy;
+                       g0 += (ymin - y0) * dg / dy;
+                       b0 += (ymin - y0) * db / dy;
+#endif
+                       y0 = ymin;
+               }
+
+               if (y1 > ymax) {
+#ifdef USE_LINES_ALL_FIXED_PT_MATH
+                       s32 factor = GPU_FAST_DIV(((ymax - y1) << GPU_LINE_FIXED_BITS), dy);
+                       x1 += (dx * factor) >> GPU_LINE_FIXED_BITS;
+                       r1 += (dr * factor) >> GPU_LINE_FIXED_BITS;
+                       g1 += (dg * factor) >> GPU_LINE_FIXED_BITS;
+                       b1 += (db * factor) >> GPU_LINE_FIXED_BITS;
+#else
+                       x1 += (ymax - y1) * dx / dy;
+                       r1 += (ymax - y1) * dr / dy;
+                       g1 += (ymax - y1) * dg / dy;
+                       b1 += (ymax - y1) * db / dy;
+#endif
+                       y1 = ymax;
+               }
+
+               // Recompute in case clipping occurred:
+               dx = x1 - x0;
+               dy = y1 - y0;
+               dr = r1 - r0;
+               dg = g1 - g0;
+               db = b1 - b0;
+       }
+
+       // Check X clipping range, set 'sx' x-direction variable
+       if (dx == 0) {
+               // Is vertical line totally outside X clipping range?
+               if (x0 < xmin || x0 > xmax)
+                       return;
+               sx = 0;
+       } else {
+               if (dx > 0) {
+                       // x0 is leftmost coordinate
+                       if (x0 > xmax) return; // Both points outside X clip range
+
+                       if (x0 < xmin) {
+                               if (x1 < xmin) return; // Both points outside X clip range
+
+#ifdef USE_LINES_ALL_FIXED_PT_MATH
+                               s32 factor = GPU_FAST_DIV(((xmin - x0) << GPU_LINE_FIXED_BITS), dx);
+                               y0 += (dy * factor) >> GPU_LINE_FIXED_BITS;
+                               r0 += (dr * factor) >> GPU_LINE_FIXED_BITS;
+                               g0 += (dg * factor) >> GPU_LINE_FIXED_BITS;
+                               b0 += (db * factor) >> GPU_LINE_FIXED_BITS;
+#else
+                               y0 += (xmin - x0) * dy / dx;
+                               r0 += (xmin - x0) * dr / dx;
+                               g0 += (xmin - x0) * dg / dx;
+                               b0 += (xmin - x0) * db / dx;
+#endif
+                               x0 = xmin;
+                       }
+
+                       if (x1 > xmax) {
+#ifdef USE_LINES_ALL_FIXED_PT_MATH
+                               s32 factor = GPU_FAST_DIV(((xmax - x1) << GPU_LINE_FIXED_BITS), dx);
+                               y1 += (dy * factor) >> GPU_LINE_FIXED_BITS;
+                               r1 += (dr * factor) >> GPU_LINE_FIXED_BITS;
+                               g1 += (dg * factor) >> GPU_LINE_FIXED_BITS;
+                               b1 += (db * factor) >> GPU_LINE_FIXED_BITS;
+#else
+                               y1 += (xmax - x1) * dy / dx;
+                               r1 += (xmax - x1) * dr / dx;
+                               g1 += (xmax - x1) * dg / dx;
+                               b1 += (xmax - x1) * db / dx;
+#endif
+                               x1 = xmax;
+                       }
+
+                       sx = +1;
+                       dx = x1 - x0; // Get final value, which should also be absolute value
+               } else {
+                       // x1 is leftmost coordinate
+                       if (x1 > xmax) return; // Both points outside X clip range
+
+                       if (x1 < xmin) {
+                               if (x0 < xmin) return; // Both points outside X clip range
+
+#ifdef USE_LINES_ALL_FIXED_PT_MATH
+                               s32 factor = GPU_FAST_DIV(((xmin - x1) << GPU_LINE_FIXED_BITS), dx);
+                               y1 += (dy * factor) >> GPU_LINE_FIXED_BITS;
+                               r1 += (dr * factor) >> GPU_LINE_FIXED_BITS;
+                               g1 += (dg * factor) >> GPU_LINE_FIXED_BITS;
+                               b1 += (db * factor) >> GPU_LINE_FIXED_BITS;
+#else
+                               y1 += (xmin - x1) * dy / dx;
+                               r1 += (xmin - x1) * dr / dx;
+                               g1 += (xmin - x1) * dg / dx;
+                               b1 += (xmin - x1) * db / dx;
+#endif
+                               x1 = xmin;
+                       }
+
+                       if (x0 > xmax) {
+#ifdef USE_LINES_ALL_FIXED_PT_MATH
+                               s32 factor = GPU_FAST_DIV(((xmax - x0) << GPU_LINE_FIXED_BITS), dx);
+                               y0 += (dy * factor) >> GPU_LINE_FIXED_BITS;
+                               r0 += (dr * factor) >> GPU_LINE_FIXED_BITS;
+                               g0 += (dg * factor) >> GPU_LINE_FIXED_BITS;
+                               b0 += (db * factor) >> GPU_LINE_FIXED_BITS;
+#else
+                               y0 += (xmax - x0) * dy / dx;
+                               r0 += (xmax - x0) * dr / dx;
+                               g0 += (xmax - x0) * dg / dx;
+                               b0 += (xmax - x0) * db / dx;
+#endif
+                               x0 = xmax;
+                       }
+
+                       sx = -1;
+                       dx = x0 - x1; // Get final value, which should also be absolute value
+               }
+
+               // Recompute in case clipping occurred:
+               dy = y1 - y0;
+               dr = r1 - r0;
+               dg = g1 - g0;
+               db = b1 - b0;
+       }
+
+       // IMPORTANT: dx,dy should now contain their absolute values
+
+       int min_length,    // Minimum length of a pixel run
+           start_length,  // Length of first run
+           end_length,    // Length of last run
+           err_term,      // Cumulative error to determine when to draw longer run
+           err_adjup,     // Increment to err_term for each run drawn
+           err_adjdown;   // Subtract this from err_term after drawing longer run
+
+       GouraudColor gcol;
+       gcol.r = r0 << GPU_GOURAUD_FIXED_BITS;
+       gcol.g = g0 << GPU_GOURAUD_FIXED_BITS;
+       gcol.b = b0 << GPU_GOURAUD_FIXED_BITS;
+
+       // We use u8 pointers even though PS1 has u16 framebuffer.
+       //  This allows pixel-drawing functions to increment dst pointer
+       //  directly by the passed 'incr' value, not having to shift it first.
+       u8 *dst = (u8*)gpu_senquack.vram + y0 * dst_stride + x0 * dst_depth;
+
+       // SPECIAL CASE: Vertical line (or single point): dy+1 pixels, one per row
+       if (dx == 0) {
+#ifdef USE_LINES_ALL_FIXED_PT_MATH
+               // Get dy fixed-point inverse (dy of 0 or 1 leaves the factor at 1.0)
+               s32 inv_factor = 1 << GPU_GOURAUD_FIXED_BITS;
+               if (dy > 1) inv_factor = GPU_FAST_DIV(inv_factor, dy);
+
+               // Simultaneously divide and convert integer to Gouraud fixed point:
+               gcol.r_incr = dr * inv_factor;
+               gcol.g_incr = dg * inv_factor;
+               gcol.b_incr = db * inv_factor;
+#else
+               // First, convert to Gouraud fixed point
+               gcol.r_incr = dr << GPU_GOURAUD_FIXED_BITS;
+               gcol.g_incr = dg << GPU_GOURAUD_FIXED_BITS;
+               gcol.b_incr = db << GPU_GOURAUD_FIXED_BITS;
+
+               if (dy > 1) {
+                       if (dr) gcol.r_incr /= dy;
+                       if (dg) gcol.g_incr /= dy;
+                       if (db) gcol.b_incr /= dy;
+               }
+#endif
+               
+               gpuPixelSpanDriver(dst, (uintptr_t)&gcol, dst_stride, dy+1);
+               return;
+       }
+
+       // SPECIAL CASE: Horizontal line: dx+1 pixels stepping in the sx direction
+       if (dy == 0) {
+#ifdef USE_LINES_ALL_FIXED_PT_MATH
+               // Get dx fixed-point inverse (dx of 1 leaves the factor at 1.0)
+               s32 inv_factor = (1 << GPU_GOURAUD_FIXED_BITS);
+               if (dx > 1) inv_factor = GPU_FAST_DIV(inv_factor, dx);
+
+               // Simultaneously divide and convert integer to Gouraud fixed point:
+               gcol.r_incr = dr * inv_factor;
+               gcol.g_incr = dg * inv_factor;
+               gcol.b_incr = db * inv_factor;
+#else
+               gcol.r_incr = dr << GPU_GOURAUD_FIXED_BITS;
+               gcol.g_incr = dg << GPU_GOURAUD_FIXED_BITS;
+               gcol.b_incr = db << GPU_GOURAUD_FIXED_BITS;
+
+               if (dx > 1) {
+                       if (dr) gcol.r_incr /= dx;
+                       if (dg) gcol.g_incr /= dx;
+                       if (db) gcol.b_incr /= dx;
+               }
+#endif
+
+               gpuPixelSpanDriver(dst, (uintptr_t)&gcol, sx * dst_depth, dx+1);
+               return;
+       }
+
+       // SPECIAL CASE: Exact diagonal (dx == dy): one row and one column per pixel
+       if (dx == dy) {
+#ifdef USE_LINES_ALL_FIXED_PT_MATH
+               // Get dx fixed-point inverse (dx of 1 leaves the factor at 1.0)
+               s32 inv_factor = (1 << GPU_GOURAUD_FIXED_BITS);
+               if (dx > 1) inv_factor = GPU_FAST_DIV(inv_factor, dx);
+
+               // Simultaneously divide and convert integer to Gouraud fixed point:
+               gcol.r_incr = dr * inv_factor;
+               gcol.g_incr = dg * inv_factor;
+               gcol.b_incr = db * inv_factor;
+#else
+               // First, convert to Gouraud fixed point
+               gcol.r_incr = dr << GPU_GOURAUD_FIXED_BITS;
+               gcol.g_incr = dg << GPU_GOURAUD_FIXED_BITS;
+               gcol.b_incr = db << GPU_GOURAUD_FIXED_BITS;
+
+               if (dx > 1) {
+                       if (dr) gcol.r_incr /= dx;
+                       if (dg) gcol.g_incr /= dx;
+                       if (db) gcol.b_incr /= dx;
+               }
+#endif
+
+               gpuPixelSpanDriver(dst, (uintptr_t)&gcol, dst_stride + (sx * dst_depth), dy+1);
+               return;
+       }
+
+       int       major, minor;             // Absolute val of major,minor axis delta
+       ptrdiff_t incr_major, incr_minor;   // Ptr increment for each step along axis
+
+       if (dx > dy) {
+               major = dx;
+               minor = dy;
+       } else {
+               major = dy;
+               minor = dx;
+       }
+
+       // Diagonal runs when major < 2*minor, otherwise horizontal/vertical runs
+       if (major < (2 * minor)) {
+               // Diagonal runs, so perform half-octant transformation
+               minor = major - minor;
+
+               // Advance diagonally when drawing runs
+               incr_major = dst_stride + (sx * dst_depth);
+
+               // After drawing each run, correct for over-advance along minor axis
+               if (dx > dy)
+                       incr_minor = -dst_stride;
+               else
+                       incr_minor = -sx * dst_depth;
+       } else {
+               // Horizontal or vertical runs
+               if (dx > dy) {
+                       incr_major = sx * dst_depth;
+                       incr_minor = dst_stride;
+               } else {
+                       incr_major = dst_stride;
+                       incr_minor = sx * dst_depth;
+               }
+       }
+
+#ifdef USE_LINES_ALL_FIXED_PT_MATH
+       s32 major_inv = GPU_FAST_DIV((1 << GPU_GOURAUD_FIXED_BITS), major);
+
+       // Simultaneously divide and convert from integer to Gouraud fixed point:
+       gcol.r_incr = dr * major_inv;
+       gcol.g_incr = dg * major_inv;
+       gcol.b_incr = db * major_inv;
+#else
+       gcol.r_incr = dr ? ((dr << GPU_GOURAUD_FIXED_BITS) / major) : 0;
+       gcol.g_incr = dg ? ((dg << GPU_GOURAUD_FIXED_BITS) / major) : 0;
+       gcol.b_incr = db ? ((db << GPU_GOURAUD_FIXED_BITS) / major) : 0;
+#endif
+
+       if (minor > 1) {
+               // Minimum number of pixels each run
+               min_length = major / minor;
+
+               // Initial error term; reflects an initial step of 0.5 along minor axis
+               err_term = (major % minor) - (minor * 2);
+
+               // Increment err_term this much each step along minor axis; when
+               //  err_term crosses zero, draw longer pixel run.
+               err_adjup = (major % minor) * 2;
+       } else {
+               min_length = major;
+               err_term = 0;
+               err_adjup = 0;
+       }
+
+       // Error term adjustment when err_term turns over; used to factor
+       //  out the major-axis step made at that time
+       err_adjdown = minor * 2;
+
+       // The initial and last runs are partial, because minor axis advances
+       //  only 0.5 for these runs, rather than 1. Each is half a full run,
+       //  plus the initial pixel.
+       start_length = end_length = (min_length / 2) + 1;
+
+       if (min_length & 1) {
+               // If there is an odd number of pixels per run, we have 1 pixel that
+               //  can't be allocated to either the initial or last partial run, so
+               //  we'll add 0.5 to err_term so that this pixel will be handled
+               //  by the normal full-run loop
+               err_term += minor;
+       } else {
+               // If the minimum run length is even and there's no fractional advance,
+               // we have one pixel that could go to either the initial or last
+               // partial run, which we'll arbitrarily allocate to the last run
+               if (err_adjup == 0)
+                       start_length--; // Leave out the extra pixel at the start
+       }
+
+       // First run of pixels; the driver's return value becomes the next dst
+       dst = gpuPixelSpanDriver(dst, (uintptr_t)&gcol, incr_major, start_length);
+       dst += incr_minor;
+
+       // Middle runs of pixels: one run per remaining step along the minor axis
+       while (--minor > 0) {
+               int run_length = min_length;
+               err_term += err_adjup;
+
+               // If err_term passed 0, reset it and draw longer run
+               if (err_term > 0) {
+                       err_term -= err_adjdown;
+                       run_length++;
+               }
+
+               dst = gpuPixelSpanDriver(dst, (uintptr_t)&gcol, incr_major, run_length);
+               dst += incr_minor;
+       }
+
+       // Final run of pixels
+       gpuPixelSpanDriver(dst, (uintptr_t)&gcol, incr_major, end_length);
+}
+
+#endif /* __GPU_UNAI_GPU_RASTER_LINE_H__ */
diff --git a/plugins/gpu_senquack/gpu_raster_polygon.h b/plugins/gpu_senquack/gpu_raster_polygon.h

new file mode 100644 (file)

index 0000000..8638ac4
--- /dev/null
+++ b/plugins/gpu_senquack/gpu_raster_polygon.h
@@ -0,0 +1,1453 @@
+/***************************************************************************
+*   Copyright (C) 2010 PCSX4ALL Team                                      *
+*   Copyright (C) 2010 Unai                                               *
+*                                                                         *
+*   This program is free software; you can redistribute it and/or modify  *
+*   it under the terms of the GNU General Public License as published by  *
+*   the Free Software Foundation; either version 2 of the License, or     *
+*   (at your option) any later version.                                   *
+*                                                                         *
+*   This program is distributed in the hope that it will be useful,       *
+*   but WITHOUT ANY WARRANTY; without even the implied warranty of        *
+*   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the         *
+*   GNU General Public License for more details.                          *
+*                                                                         *
+*   You should have received a copy of the GNU General Public License     *
+*   along with this program; if not, write to the                         *
+*   Free Software Foundation, Inc.,                                       *
+*   51 Franklin Street, Fifth Floor, Boston, MA 02111-1307 USA.           *
+***************************************************************************/
+
+#ifndef __GPU_UNAI_GPU_RASTER_POLYGON_H__
+#define __GPU_UNAI_GPU_RASTER_POLYGON_H__
+
+//senquack - NOTE: GPU Unai poly routines have been rewritten/adapted
+// from DrHell routines to fix multiple issues. See README_senquack.txt
+
+///////////////////////////////////////////////////////////////////////////////
+// Shared poly vertex buffer, able to handle 3 or 4-pt polys of any type.
+///////////////////////////////////////////////////////////////////////////////
+
+// One entry of the shared vertex buffer. polyInitVertexBuffer() always
+//  fills x,y; tex_word and col_word are written only for poly types that
+//  carry that attribute.
+struct PolyVertex {
+       s32 x, y; // Sign-extended 11-bit X,Y coords (stored with draw offset added)
+       union {
+               struct { u8 u, v, pad[2]; } tex; // Texture coords (if used)
+               u32 tex_word; // Same storage as 'tex'; lets a whole packet word be copied at once
+       };
+       union {
+               struct { u8 r, g, b, pad; } col; // 24-bit RGB color (if used)
+               u32 col_word; // Same storage as 'col'; lets a whole packet word be copied at once
+       };
+};
+
+// Bit flags describing which optional per-vertex data a poly carries.
+//  They are combined to form the PolyType values.
+enum PolyAttribute {
+       POLYATTR_TEXTURE = (1 << 0), // Vertices carry a U,V word
+       POLYATTR_GOURAUD = (1 << 1)  // Vertices carry a color word
+};
+
+// The four poly kinds, expressed as combinations of PolyAttribute bits so
+//  that code can test a type with '& POLYATTR_...'.
+enum PolyType {
+       POLYTYPE_F  = 0,                                    // Flat, untextured
+       POLYTYPE_FT = (POLYATTR_TEXTURE),                   // Flat, textured
+       POLYTYPE_G  = (POLYATTR_GOURAUD),                   // Gouraud, untextured
+       POLYTYPE_GT = (POLYATTR_TEXTURE | POLYATTR_GOURAUD) // Gouraud, textured
+};
+
+///////////////////////////////////////////////////////////////////////////////
+// polyInitVertexBuffer()
+// Fills vbuf[] array with data from any type of poly draw-command packet.
+///////////////////////////////////////////////////////////////////////////////
+static void polyInitVertexBuffer(PolyVertex *vbuf, const PtrUnion packet, PolyType ptype, u32 is_quad)
+{
+       bool texturing = ptype & POLYATTR_TEXTURE;
+       bool gouraud   = ptype & POLYATTR_GOURAUD;
+
+       int vert_stride = 1; // Stride of vertices in cmd packet, in 32-bit words
+       if (texturing)
+               vert_stride++;
+       if (gouraud)
+               vert_stride++;
+
+       int num_verts = (is_quad) ? 4 : 3;
+       u32 *ptr;
+
+       // X,Y coords, adjusted by draw offsets
+       s32 x_off = gpu_senquack.DrawingOffset[0];
+       s32 y_off = gpu_senquack.DrawingOffset[1];
+       ptr = &packet.U4[1];
+       for (int i=0;  i < num_verts; ++i, ptr += vert_stride) {
+               s16* coord_ptr = (s16*)ptr;
+               vbuf[i].x = GPU_EXPANDSIGN(coord_ptr[0]) + x_off;
+               vbuf[i].y = GPU_EXPANDSIGN(coord_ptr[1]) + y_off;
+       }
+
+       // U,V texture coords (if applicable)
+       if (texturing) {
+               ptr = &packet.U4[2];
+               for (int i=0;  i < num_verts; ++i, ptr += vert_stride)
+                       vbuf[i].tex_word = *ptr;
+       }
+
+       // Colors (if applicable)
+       if (gouraud) {
+               ptr = &packet.U4[0];
+               for (int i=0;  i < num_verts; ++i, ptr += vert_stride)
+                       vbuf[i].col_word = *ptr;
+       }
+}
+
+///////////////////////////////////////////////////////////////////////////////
+//  Helper functions to determine which vertex in a 2 or 3 vertex array
+//   has the highest/lowest X/Y coordinate.
+//   Note: the comparison logic is such that, given a set of vertices with
+//    identical values for a given coordinate, a different index will be
+//    returned from vertIdxOfLeast..() than a call to vertIdxOfHighest..().
+//    This ensures that, during the vertex-ordering phase of rasterization,
+//    all three vertices remain unique.
+///////////////////////////////////////////////////////////////////////////////
+
+// Index (0 or 1) of the vertex with the smaller X among Tptr[0..1].
+//  On equal X, index 0 is returned.
+template<typename T>
+static inline int vertIdxOfLeastXCoord2(const T *Tptr)
+{
+       if (Tptr[0].x <= Tptr[1].x)
+               return 0;
+       return 1;
+}
+
+// Index (0..2) of the vertex with the smallest X among Tptr[0..2].
+//  On equal X, the lowest index among the tied vertices is returned.
+template<typename T>
+static inline int vertIdxOfLeastXCoord3(const T *Tptr)
+{
+       const int best01 = vertIdxOfLeastXCoord2(Tptr);
+       if (Tptr[best01].x <= Tptr[2].x)
+               return best01;
+       return 2;
+}
+
+// Index (0 or 1) of the vertex with the smaller Y among Tptr[0..1].
+//  On equal Y, index 0 is returned.
+template<typename T>
+static inline int vertIdxOfLeastYCoord2(const T *Tptr)
+{
+       if (Tptr[0].y <= Tptr[1].y)
+               return 0;
+       return 1;
+}
+
+// Index (0..2) of the vertex with the smallest Y among Tptr[0..2].
+//  On equal Y, the lowest index among the tied vertices is returned.
+template<typename T>
+static inline int vertIdxOfLeastYCoord3(const T *Tptr)
+{
+       const int best01 = vertIdxOfLeastYCoord2(Tptr);
+       if (Tptr[best01].y <= Tptr[2].y)
+               return best01;
+       return 2;
+}
+
+// Index (0 or 1) of the vertex with the larger X among Tptr[0..1].
+//  On equal X, index 1 is returned (the opposite of the 'Least' helper).
+template<typename T>
+static inline int vertIdxOfHighestXCoord2(const T *Tptr)
+{
+       if (Tptr[1].x >= Tptr[0].x)
+               return 1;
+       return 0;
+}
+
+// Index (0..2) of the vertex with the largest X among Tptr[0..2].
+//  On equal X, the highest index among the tied vertices is returned.
+template<typename T>
+static inline int vertIdxOfHighestXCoord3(const T *Tptr)
+{
+       const int best01 = vertIdxOfHighestXCoord2(Tptr);
+       if (Tptr[2].x >= Tptr[best01].x)
+               return 2;
+       return best01;
+}
+
+// Index (0 or 1) of the vertex with the larger Y among Tptr[0..1].
+//  On equal Y, index 1 is returned (the opposite of the 'Least' helper).
+template<typename T>
+static inline int vertIdxOfHighestYCoord2(const T *Tptr)
+{
+       if (Tptr[1].y >= Tptr[0].y)
+               return 1;
+       return 0;
+}
+
+// Index (0..2) of the vertex with the largest Y among Tptr[0..2].
+//  On equal Y, the highest index among the tied vertices is returned.
+template<typename T>
+static inline int vertIdxOfHighestYCoord3(const T *Tptr)
+{
+       const int best01 = vertIdxOfHighestYCoord2(Tptr);
+       if (Tptr[2].y >= Tptr[best01].y)
+               return 2;
+       return best01;
+}
+
+///////////////////////////////////////////////////////////////////////////////
+// polyUseTriangle()
+//  Determines if the specified triangle should be rendered. If so, it
+//  fills the given array of vertex pointers, vert_ptrs, in order of
+//  increasing Y coordinate values, as required by rasterization algorithm.
+//  Parameter 'tri_num' is 0 for first triangle (idx 0,1,2 of vbuf[]),
+//   or 1 for second triangle of a quad (idx 1,2,3 of vbuf[]).
+//  Returns true if triangle should be rendered, false if not.
+///////////////////////////////////////////////////////////////////////////////
+static bool polyUseTriangle(const PolyVertex *vbuf, int tri_num, const PolyVertex **vert_ptrs)
+{
+       // Using verts 0,1,2 or is this the 2nd pass of a quad (verts 1,2,3)?
+       const PolyVertex *tri_ptr = &vbuf[(tri_num == 0) ? 0 : 1];
+
+       // Get indices of highest/lowest X,Y coords within triangle
+       int idx_lowest_x  = vertIdxOfLeastXCoord3(tri_ptr);
+       int idx_highest_x = vertIdxOfHighestXCoord3(tri_ptr);
+       int idx_lowest_y  = vertIdxOfLeastYCoord3(tri_ptr);
+       int idx_highest_y = vertIdxOfHighestYCoord3(tri_ptr);
+
+       // Maximum absolute distance between any two X coordinates is 1023,
+       //  and for Y coordinates is 511 (PS1 hardware limitation)
+       int lowest_x  = tri_ptr[idx_lowest_x].x;
+       int highest_x = tri_ptr[idx_highest_x].x;
+       int lowest_y  = tri_ptr[idx_lowest_y].y;
+       int highest_y = tri_ptr[idx_highest_y].y;
+       if ((highest_x - lowest_x) >= CHKMAX_X ||
+           (highest_y - lowest_y) >= CHKMAX_Y)
+               return false;
+
+       // Determine if triangle is completely outside clipping range
+       int xmin, xmax, ymin, ymax;
+       xmin = gpu_senquack.DrawingArea[0];  xmax = gpu_senquack.DrawingArea[2];
+       ymin = gpu_senquack.DrawingArea[1];  ymax = gpu_senquack.DrawingArea[3];
+       int clipped_lowest_x  = Max2(xmin,lowest_x);
+       int clipped_lowest_y  = Max2(ymin,lowest_y);
+       int clipped_highest_x = Min2(xmax,highest_x);
+       int clipped_highest_y = Min2(ymax,highest_y);
+       if (clipped_lowest_x >= clipped_highest_x ||
+           clipped_lowest_y >= clipped_highest_y)
+               return false;
+
+       // Order vertex ptrs by increasing y value (draw routines need this).
+       // The middle index is deduced by a binary math trick that depends
+       //  on index range always being between 0..2
+       vert_ptrs[0] = tri_ptr + idx_lowest_y;
+       vert_ptrs[1] = tri_ptr + ((idx_lowest_y + idx_highest_y) ^ 3);
+       vert_ptrs[2] = tri_ptr + idx_highest_y;
+       return true;
+}
+
+///////////////////////////////////////////////////////////////////////////////
+//  GPU internal polygon drawing functions
+///////////////////////////////////////////////////////////////////////////////
+
+/*----------------------------------------------------------------------
+gpuDrawPolyF - Flat-shaded, untextured poly
+
+  packet            - draw-command packet; word 0 supplies the color and
+                      the vertex X,Y words are read by polyInitVertexBuffer()
+  gpuPolySpanDriver - inner driver, called once per non-empty scanline span
+  is_quad           - nonzero: two triangle passes (verts 0,1,2 then 1,2,3),
+                      zero: a single triangle
+----------------------------------------------------------------------*/
+void gpuDrawPolyF(const PtrUnion packet, const PP gpuPolySpanDriver, u32 is_quad)
+{
+       // Set up bgr555 color to be used across calls in inner driver
+       gpu_senquack.PixelData = GPU_RGB16(packet.U4[0]);
+
+       PolyVertex vbuf[4];
+       polyInitVertexBuffer(vbuf, packet, POLYTYPE_F, is_quad);
+
+       int total_passes = is_quad ? 2 : 1;
+       int cur_pass = 0;
+       do
+       {
+               // vptrs[] receives the triangle's vertices sorted by increasing Y.
+               // For a rejected triangle, 'continue' in a do-while jumps to the
+               //  loop condition, so cur_pass still advances to the next pass.
+               const PolyVertex* vptrs[3];
+               if (polyUseTriangle(vbuf, cur_pass, vptrs) == false)
+                       continue;
+
+               s32 xa, xb, ya, yb;
+               s32 x3, dx3, x4, dx4, dx;
+               s32 x0, x1, x2, y0, y1, y2;
+
+               x0 = vptrs[0]->x;  y0 = vptrs[0]->y;
+               x1 = vptrs[1]->x;  y1 = vptrs[1]->y;
+               x2 = vptrs[2]->x;  y2 = vptrs[2]->y;
+
+               // Only the sign of dx is used below: it selects whether the
+               //  v0->v2 edge is tracked by x3 (span start) or by x4 (span end).
+               ya = y2 - y0;
+               yb = y2 - y1;
+               dx = (x2 - x1) * ya - (x2 - x0) * yb;
+
+               // Two sections per triangle:
+               //  loop0 == 2: scanlines from y0 up to y1
+               //  loop0 == 1: scanlines from y1 up to y2
+               for (int loop0 = 2; loop0; loop0--) {
+                       if (loop0 == 2) {
+                               // Both edges start at v0. Slopes are fixed-point X steps
+                               //  per scanline; an edge with zero height gets slope 0.
+                               //  The #if variants only select how the division is done.
+                               ya = y0;  yb = y1;
+                               x3 = x4 = i2x(x0);
+                               if (dx < 0) {
+                                       // x3 follows v0->v2, x4 follows v0->v1
+#ifdef GPU_UNAI_USE_FLOATMATH
+#ifdef GPU_UNAI_USE_FLOAT_DIV_MULTINV
+                                       dx3 = ((y2 - y0) != 0) ? (fixed)(((x2 - x0) << FIXED_BITS) * FloatInv(y2 - y0)) : 0;
+                                       dx4 = ((y1 - y0) != 0) ? (fixed)(((x1 - x0) << FIXED_BITS) * FloatInv(y1 - y0)) : 0;
+#else
+                                       dx3 = ((y2 - y0) != 0) ? (fixed)(((x2 - x0) << FIXED_BITS) / (float)(y2 - y0)) : 0;
+                                       dx4 = ((y1 - y0) != 0) ? (fixed)(((x1 - x0) << FIXED_BITS) / (float)(y1 - y0)) : 0;
+#endif
+#else  // Integer Division:
+#ifdef GPU_UNAI_USE_INT_DIV_MULTINV
+                                       dx3 = ((y2 - y0) != 0) ? xLoDivx((x2 - x0), (y2 - y0)) : 0;
+                                       dx4 = ((y1 - y0) != 0) ? xLoDivx((x1 - x0), (y1 - y0)) : 0;
+#else
+                                       dx3 = ((y2 - y0) != 0) ? GPU_FAST_DIV((x2 - x0) << FIXED_BITS, (y2 - y0)) : 0;
+                                       dx4 = ((y1 - y0) != 0) ? GPU_FAST_DIV((x1 - x0) << FIXED_BITS, (y1 - y0)) : 0;
+#endif
+#endif
+                               } else {
+                                       // x3 follows v0->v1, x4 follows v0->v2
+#ifdef GPU_UNAI_USE_FLOATMATH
+#ifdef GPU_UNAI_USE_FLOAT_DIV_MULTINV
+                                       dx3 = ((y1 - y0) != 0) ? (fixed)(((x1 - x0) << FIXED_BITS) * FloatInv(y1 - y0)) : 0;
+                                       dx4 = ((y2 - y0) != 0) ? (fixed)(((x2 - x0) << FIXED_BITS) * FloatInv(y2 - y0)) : 0;
+#else
+                                       dx3 = ((y1 - y0) != 0) ? (fixed)(((x1 - x0) << FIXED_BITS) / (float)(y1 - y0)) : 0;
+                                       dx4 = ((y2 - y0) != 0) ? (fixed)(((x2 - x0) << FIXED_BITS) / (float)(y2 - y0)) : 0;
+#endif
+#else  // Integer Division:
+#ifdef GPU_UNAI_USE_INT_DIV_MULTINV
+                                       dx3 = ((y1 - y0) != 0) ? xLoDivx((x1 - x0), (y1 - y0)) : 0;
+                                       dx4 = ((y2 - y0) != 0) ? xLoDivx((x2 - x0), (y2 - y0)) : 0;
+#else
+                                       dx3 = ((y1 - y0) != 0) ? GPU_FAST_DIV((x1 - x0) << FIXED_BITS, (y1 - y0)) : 0;
+                                       dx4 = ((y2 - y0) != 0) ? GPU_FAST_DIV((x2 - x0) << FIXED_BITS, (y2 - y0)) : 0;
+#endif
+#endif
+                               }
+                       } else {
+                               //senquack - break out of final loop if nothing to be drawn (1st loop
+                               //           must always be taken to setup dx3/dx4)
+                               if (y1 == y2) break;
+
+                               ya = y1;  yb = y2;
+
+                               if (dx < 0) {
+                                       // x3 stays on v0->v2 (slope dx3 kept from the first
+                                       //  section) and is recomputed at y1; x4 restarts at v1
+                                       //  and follows v1->v2.
+                                       x3 = i2x(x0) + (dx3 * (y1 - y0));
+                                       x4 = i2x(x1);
+#ifdef GPU_UNAI_USE_FLOATMATH
+#ifdef GPU_UNAI_USE_FLOAT_DIV_MULTINV
+                                       dx4 = ((y2 - y1) != 0) ? (fixed)(((x2 - x1) << FIXED_BITS) * FloatInv(y2 - y1)) : 0;
+#else
+                                       dx4 = ((y2 - y1) != 0) ? (fixed)(((x2 - x1) << FIXED_BITS) / (float)(y2 - y1)) : 0;
+#endif
+#else  // Integer Division:
+#ifdef GPU_UNAI_USE_INT_DIV_MULTINV
+                                       dx4 = ((y2 - y1) != 0) ? xLoDivx ((x2 - x1), (y2 - y1)) : 0;
+#else
+                                       dx4 = ((y2 - y1) != 0) ? GPU_FAST_DIV((x2 - x1) << FIXED_BITS, (y2 - y1)) : 0;
+#endif
+#endif
+                               } else {
+                                       // Mirror case: x4 stays on v0->v2 (slope dx4 kept) and
+                                       //  is recomputed at y1; x3 restarts at v1 and follows
+                                       //  v1->v2.
+                                       x3 = i2x(x1);
+                                       x4 = i2x(x0) + (dx4 * (y1 - y0));
+#ifdef GPU_UNAI_USE_FLOATMATH
+#ifdef GPU_UNAI_USE_FLOAT_DIV_MULTINV
+                                       dx3 = ((y2 - y1) != 0) ? (fixed)(((x2 - x1) << FIXED_BITS) * FloatInv(y2 - y1)) : 0;
+#else
+                                       dx3 = ((y2 - y1) != 0) ? (fixed)(((x2 - x1) << FIXED_BITS) / (float)(y2 - y1)) : 0;
+#endif
+#else  // Integer Division:
+#ifdef GPU_UNAI_USE_INT_DIV_MULTINV
+                                       dx3 = ((y2 - y1) != 0) ? xLoDivx ((x2 - x1), (y2 - y1)) : 0;
+#else
+                                       dx3 = ((y2 - y1) != 0) ? GPU_FAST_DIV((x2 - x1) << FIXED_BITS, (y2 - y1)) : 0;
+#endif
+#endif
+                               }
+                       }
+
+                       s32 xmin, xmax, ymin, ymax;
+                       xmin = gpu_senquack.DrawingArea[0];  xmax = gpu_senquack.DrawingArea[2];
+                       ymin = gpu_senquack.DrawingArea[1];  ymax = gpu_senquack.DrawingArea[3];
+
+                       // Section starts above the drawing area: step both edges
+                       //  forward to ymin and start there.
+                       if ((ymin - ya) > 0) {
+                               x3 += (dx3 * (ymin - ya));
+                               x4 += (dx4 * (ymin - ya));
+                               ya = ymin;
+                       }
+
+                       // yb is exclusive: the scanline loop below runs (yb - ya) times
+                       if (yb > ymax) yb = ymax;
+
+                       int loop1 = yb - ya;
+                       if (loop1 <= 0)
+                               continue;
+
+                       // PixelBase points at the start of scanline ya in VRAM.
+                       // Scanline filters: a line is skipped if (ya & li) is nonzero or
+                       //  if (ya & pi) == pif. With progressive interlace disabled, pi
+                       //  is 0 and pif is 1, so the second test never matches.
+                       u16* PixelBase = &((u16*)gpu_senquack.vram)[FRAME_OFFSET(0, ya)];
+                       int li=gpu_senquack.ilace_mask;
+                       int pi=(ProgressiveInterlaceEnabled()?(gpu_senquack.ilace_mask+1):0);
+                       int pif=(ProgressiveInterlaceEnabled()?(gpu_senquack.prog_ilace_flag?(gpu_senquack.ilace_mask+1):0):1);
+
+                       // Skipped lines still run the loop increment, so ya, PixelBase
+                       //  and both edge positions stay in step.
+                       for (; loop1; --loop1, ya++, PixelBase += FRAME_WIDTH,
+                                       x3 += dx3, x4 += dx4 )
+                       {
+                               if (ya&li) continue;
+                               if ((ya&pi)==pif) continue;
+
+                               // Span runs from xa up to (not including) xb, clamped to
+                               //  [xmin, xmax]; empty spans are not passed to the driver.
+                               xa = FixedCeilToInt(x3);  xb = FixedCeilToInt(x4);
+                               if ((xmin - xa) > 0) xa = xmin;
+                               if (xb > xmax) xb = xmax;
+                               if ((xb - xa) > 0)
+                                       gpuPolySpanDriver(gpu_senquack, PixelBase + xa, (xb - xa));
+                       }
+               }
+       } while (++cur_pass < total_passes);
+}
+
+/*----------------------------------------------------------------------
+gpuDrawPolyFT - Flat-shaded, textured poly
+
+  packet            - draw-command packet; bytes 0..2 supply the light
+                      color, vertex X,Y and U,V words are read by
+                      polyInitVertexBuffer()
+  gpuPolySpanDriver - inner driver, called once per non-empty scanline span
+  is_quad           - nonzero: two triangle passes (verts 0,1,2 then 1,2,3),
+                      zero: a single triangle
+
+  Before each driver call, gpu_senquack.u / .v hold the fixed-point texture
+  coords for the first pixel of the span and .u_inc / .v_inc the per-pixel
+  step along X.
+----------------------------------------------------------------------*/
+void gpuDrawPolyFT(const PtrUnion packet, const PP gpuPolySpanDriver, u32 is_quad)
+{
+       // r8/g8/b8 used if texture-blending & dithering is applied (24-bit light)
+       gpu_senquack.r8 = packet.U1[0];
+       gpu_senquack.g8 = packet.U1[1];
+       gpu_senquack.b8 = packet.U1[2];
+       // r5/g5/b5 used if just texture-blending is applied (15-bit light)
+       gpu_senquack.r5 = packet.U1[0] >> 3;
+       gpu_senquack.g5 = packet.U1[1] >> 3;
+       gpu_senquack.b5 = packet.U1[2] >> 3;
+
+       PolyVertex vbuf[4];
+       polyInitVertexBuffer(vbuf, packet, POLYTYPE_FT, is_quad);
+
+       int total_passes = is_quad ? 2 : 1;
+       int cur_pass = 0;
+       do
+       {
+               // vptrs[] receives the triangle's vertices sorted by increasing Y.
+               // For a rejected triangle, 'continue' in a do-while jumps to the
+               //  loop condition, so cur_pass still advances to the next pass.
+               const PolyVertex* vptrs[3];
+               if (polyUseTriangle(vbuf, cur_pass, vptrs) == false)
+                       continue;
+
+               s32 xa, xb, ya, yb;
+               s32 x3, dx3, x4, dx4, dx;
+               s32 u3, du3, v3, dv3;
+               s32 x0, x1, x2, y0, y1, y2;
+               s32 u0, u1, u2, v0, v1, v2;
+               s32 du4, dv4;
+
+               x0 = vptrs[0]->x;      y0 = vptrs[0]->y;
+               u0 = vptrs[0]->tex.u;  v0 = vptrs[0]->tex.v;
+               x1 = vptrs[1]->x;      y1 = vptrs[1]->y;
+               u1 = vptrs[1]->tex.u;  v1 = vptrs[1]->tex.v;
+               x2 = vptrs[2]->x;      y2 = vptrs[2]->y;
+               u2 = vptrs[2]->tex.u;  v2 = vptrs[2]->tex.v;
+
+               // Same expression evaluated for x, u and v. dx4 is only scratch
+               //  here (the common denominator); it is reassigned as an edge slope
+               //  inside the section loop. dx keeps the signed value: its sign
+               //  selects which edge x3 follows. Negating all three terms keeps
+               //  the denominator positive without changing the quotients.
+               ya = y2 - y0;
+               yb = y2 - y1;
+               dx4 = (x2 - x1) * ya - (x2 - x0) * yb;
+               du4 = (u2 - u1) * ya - (u2 - u0) * yb;
+               dv4 = (v2 - v1) * ya - (v2 - v0) * yb;
+               dx = dx4;
+               if (dx4 < 0) {
+                       dx4 = -dx4;
+                       du4 = -du4;
+                       dv4 = -dv4;
+               }
+
+               // du4, dv4 become the fixed-point u,v step per pixel along X
+               //  (0 when the denominator is 0). The #if variants only select
+               //  how the division is done.
+#ifdef GPU_UNAI_USE_FLOATMATH
+#ifdef GPU_UNAI_USE_FLOAT_DIV_MULTINV
+               if (dx4 != 0) {
+                       float finv = FloatInv(dx4);
+                       du4 = (fixed)((du4 << FIXED_BITS) * finv);
+                       dv4 = (fixed)((dv4 << FIXED_BITS) * finv);
+               } else {
+                       du4 = dv4 = 0;
+               }
+#else
+               if (dx4 != 0) {
+                       float fdiv = dx4;
+                       du4 = (fixed)((du4 << FIXED_BITS) / fdiv);
+                       dv4 = (fixed)((dv4 << FIXED_BITS) / fdiv);
+               } else {
+                       du4 = dv4 = 0;
+               }
+#endif
+#else  // Integer Division:
+#ifdef GPU_UNAI_USE_INT_DIV_MULTINV
+               if (dx4 != 0) {
+                       int iF, iS;
+                       xInv(dx4, iF, iS);
+                       du4 = xInvMulx(du4, iF, iS);
+                       dv4 = xInvMulx(dv4, iF, iS);
+               } else {
+                       du4 = dv4 = 0;
+               }
+#else
+               if (dx4 != 0) {
+                       du4 = GPU_FAST_DIV(du4 << FIXED_BITS, dx4);
+                       dv4 = GPU_FAST_DIV(dv4 << FIXED_BITS, dx4);
+               } else {
+                       du4 = dv4 = 0;
+               }
+#endif
+#endif
+               // Set u,v increments for inner driver
+               gpu_senquack.u_inc = du4;
+               gpu_senquack.v_inc = dv4;
+
+               //senquack - TODO: why is it always going through 2 iterations when sometimes one would suffice here?
+               //                       (SAME ISSUE ELSEWHERE)
+               // Two sections per triangle:
+               //  loop0 == 2: scanlines from y0 up to y1
+               //  loop0 == 1: scanlines from y1 up to y2
+               // x3 is the span-start edge and carries u3,v3 with it; x4 is the
+               //  span-end edge and carries no texture coords.
+               for (s32 loop0 = 2; loop0; loop0--) {
+                       if (loop0 == 2) {
+                               // Both edges start at v0. Slopes are fixed-point steps per
+                               //  scanline; an edge with zero height gets slope 0.
+                               ya = y0;  yb = y1;
+                               x3 = x4 = i2x(x0);
+                               u3 = i2x(u0);  v3 = i2x(v0);
+                               if (dx < 0) {
+                                       // x3,u3,v3 follow v0->v2; x4 follows v0->v1
+#ifdef GPU_UNAI_USE_FLOATMATH
+#ifdef GPU_UNAI_USE_FLOAT_DIV_MULTINV
+                                       if ((y2 - y0) != 0) {
+                                               float finv = FloatInv(y2 - y0);
+                                               dx3 = (fixed)(((x2 - x0) << FIXED_BITS) * finv);
+                                               du3 = (fixed)(((u2 - u0) << FIXED_BITS) * finv);
+                                               dv3 = (fixed)(((v2 - v0) << FIXED_BITS) * finv);
+                                       } else {
+                                               dx3 = du3 = dv3 = 0;
+                                       }
+                                       dx4 = ((y1 - y0) != 0) ? (fixed)(((x1 - x0) << FIXED_BITS) * FloatInv(y1 - y0)) : 0;
+#else
+                                       if ((y2 - y0) != 0) {
+                                               float fdiv = y2 - y0;
+                                               dx3 = (fixed)(((x2 - x0) << FIXED_BITS) / fdiv);
+                                               du3 = (fixed)(((u2 - u0) << FIXED_BITS) / fdiv);
+                                               dv3 = (fixed)(((v2 - v0) << FIXED_BITS) / fdiv);
+                                       } else {
+                                               dx3 = du3 = dv3 = 0;
+                                       }
+                                       dx4 = ((y1 - y0) != 0) ? (fixed)(((x1 - x0) << FIXED_BITS) / (float)(y1 - y0)) : 0;
+#endif
+#else  // Integer Division:
+#ifdef GPU_UNAI_USE_INT_DIV_MULTINV
+                                       if ((y2 - y0) != 0) {
+                                               int iF, iS;
+                                               xInv((y2 - y0), iF, iS);
+                                               dx3 = xInvMulx((x2 - x0), iF, iS);
+                                               du3 = xInvMulx((u2 - u0), iF, iS);
+                                               dv3 = xInvMulx((v2 - v0), iF, iS);
+                                       } else {
+                                               dx3 = du3 = dv3 = 0;
+                                       }
+                                       dx4 = ((y1 - y0) != 0) ? xLoDivx((x1 - x0), (y1 - y0)) : 0;
+#else
+                                       if ((y2 - y0) != 0) {
+                                               dx3 = GPU_FAST_DIV((x2 - x0) << FIXED_BITS, (y2 - y0));
+                                               du3 = GPU_FAST_DIV((u2 - u0) << FIXED_BITS, (y2 - y0));
+                                               dv3 = GPU_FAST_DIV((v2 - v0) << FIXED_BITS, (y2 - y0));
+                                       } else {
+                                               dx3 = du3 = dv3 = 0;
+                                       }
+                                       dx4 = ((y1 - y0) != 0) ? GPU_FAST_DIV((x1 - x0) << FIXED_BITS, (y1 - y0)) : 0;
+#endif
+#endif
+                               } else {
+                                       // x3,u3,v3 follow v0->v1; x4 follows v0->v2
+#ifdef GPU_UNAI_USE_FLOATMATH
+#ifdef GPU_UNAI_USE_FLOAT_DIV_MULTINV
+                                       if ((y1 - y0) != 0) {
+                                               float finv = FloatInv(y1 - y0);
+                                               dx3 = (fixed)(((x1 - x0) << FIXED_BITS) * finv);
+                                               du3 = (fixed)(((u1 - u0) << FIXED_BITS) * finv);
+                                               dv3 = (fixed)(((v1 - v0) << FIXED_BITS) * finv);
+                                       } else {
+                                               dx3 = du3 = dv3 = 0;
+                                       }
+                                       dx4 = ((y2 - y0) != 0) ? (fixed)(((x2 - x0) << FIXED_BITS) * FloatInv(y2 - y0)) : 0;
+#else
+                                       if ((y1 - y0) != 0) {
+                                               float fdiv = y1 - y0;
+                                               dx3 = (fixed)(((x1 - x0) << FIXED_BITS) / fdiv);
+                                               du3 = (fixed)(((u1 - u0) << FIXED_BITS) / fdiv);
+                                               dv3 = (fixed)(((v1 - v0) << FIXED_BITS) / fdiv);
+                                       } else {
+                                               dx3 = du3 = dv3 = 0;
+                                       }
+                                       dx4 = ((y2 - y0) != 0) ? (fixed)(((x2 - x0) << FIXED_BITS) / (float)(y2 - y0)) : 0;
+#endif
+#else  // Integer Division:
+#ifdef GPU_UNAI_USE_INT_DIV_MULTINV
+                                       if ((y1 - y0) != 0) {
+                                               int iF, iS;
+                                               xInv((y1 - y0), iF, iS);
+                                               dx3 = xInvMulx((x1 - x0), iF, iS);
+                                               du3 = xInvMulx((u1 - u0), iF, iS);
+                                               dv3 = xInvMulx((v1 - v0), iF, iS);
+                                       } else {
+                                               dx3 = du3 = dv3 = 0;
+                                       }
+                                       dx4 = ((y2 - y0) != 0) ? xLoDivx((x2 - x0), (y2 - y0)) : 0;
+#else
+                                       if ((y1 - y0) != 0) {
+                                               dx3 = GPU_FAST_DIV((x1 - x0) << FIXED_BITS, (y1 - y0));
+                                               du3 = GPU_FAST_DIV((u1 - u0) << FIXED_BITS, (y1 - y0));
+                                               dv3 = GPU_FAST_DIV((v1 - v0) << FIXED_BITS, (y1 - y0));
+                                       } else {
+                                               dx3 = du3 = dv3 = 0;
+                                       }
+                                       dx4 = ((y2 - y0) != 0) ? GPU_FAST_DIV((x2 - x0) << FIXED_BITS, (y2 - y0)) : 0;
+#endif
+#endif
+                               }
+                       } else {
+                               //senquack - break out of final loop if nothing to be drawn (1st loop
+                               //           must always be taken to setup dx3/dx4)
+                               if (y1 == y2) break;
+
+                               ya = y1;  yb = y2;
+
+                               if (dx < 0) {
+                                       // x3,u3,v3 stay on v0->v2 (slopes kept from the first
+                                       //  section) and are recomputed at y1; x4 restarts at v1
+                                       //  and follows v1->v2.
+                                       x3 = i2x(x0);
+                                       x4 = i2x(x1);
+                                       u3 = i2x(u0);
+                                       v3 = i2x(v0);
+                                       if ((y1 - y0) != 0) {
+                                               x3 += (dx3 * (y1 - y0));
+                                               u3 += (du3 * (y1 - y0));
+                                               v3 += (dv3 * (y1 - y0));
+                                       }
+#ifdef GPU_UNAI_USE_FLOATMATH
+#ifdef GPU_UNAI_USE_FLOAT_DIV_MULTINV
+                                       dx4 = ((y2 - y1) != 0) ? (fixed)(((x2 - x1) << FIXED_BITS) * FloatInv(y2 - y1)) : 0;
+#else
+                                       dx4 = ((y2 - y1) != 0) ? (fixed)(((x2 - x1) << FIXED_BITS) / (float)(y2 - y1)) : 0;
+#endif
+#else  // Integer Division:
+#ifdef GPU_UNAI_USE_INT_DIV_MULTINV
+                                       dx4 = ((y2 - y1) != 0) ? xLoDivx((x2 - x1), (y2 - y1)) : 0;
+#else
+                                       dx4 = ((y2 - y1) != 0) ? GPU_FAST_DIV((x2 - x1) << FIXED_BITS, (y2 - y1)) : 0;
+#endif
+#endif
+                               } else {
+                                       // Mirror case: x4 stays on v0->v2 (slope dx4 kept) and
+                                       //  is recomputed at y1; x3,u3,v3 restart at v1 and
+                                       //  follow v1->v2.
+                                       x3 = i2x(x1);
+                                       x4 = i2x(x0) + (dx4 * (y1 - y0));
+                                       u3 = i2x(u1);
+                                       v3 = i2x(v1);
+#ifdef GPU_UNAI_USE_FLOATMATH
+#ifdef GPU_UNAI_USE_FLOAT_DIV_MULTINV
+                                       if ((y2 - y1) != 0) {
+                                               float finv = FloatInv(y2 - y1);
+                                               dx3 = (fixed)(((x2 - x1) << FIXED_BITS) * finv);
+                                               du3 = (fixed)(((u2 - u1) << FIXED_BITS) * finv);
+                                               dv3 = (fixed)(((v2 - v1) << FIXED_BITS) * finv);
+                                       } else {
+                                               dx3 = du3 = dv3 = 0;
+                                       }
+#else
+                                       if ((y2 - y1) != 0) {
+                                               float fdiv = y2 - y1;
+                                               dx3 = (fixed)(((x2 - x1) << FIXED_BITS) / fdiv);
+                                               du3 = (fixed)(((u2 - u1) << FIXED_BITS) / fdiv);
+                                               dv3 = (fixed)(((v2 - v1) << FIXED_BITS) / fdiv);
+                                       } else {
+                                               dx3 = du3 = dv3 = 0;
+                                       }
+#endif
+#else  // Integer Division:
+#ifdef GPU_UNAI_USE_INT_DIV_MULTINV
+                                       if ((y2 - y1) != 0) {
+                                               int iF, iS;
+                                               xInv((y2 - y1), iF, iS);
+                                               dx3 = xInvMulx((x2 - x1), iF, iS);
+                                               du3 = xInvMulx((u2 - u1), iF, iS);
+                                               dv3 = xInvMulx((v2 - v1), iF, iS);
+                                       } else {
+                                               dx3 = du3 = dv3 = 0;
+                                       }
+#else 
+                                       if ((y2 - y1) != 0) {
+                                               dx3 = GPU_FAST_DIV((x2 - x1) << FIXED_BITS, (y2 - y1));
+                                               du3 = GPU_FAST_DIV((u2 - u1) << FIXED_BITS, (y2 - y1));
+                                               dv3 = GPU_FAST_DIV((v2 - v1) << FIXED_BITS, (y2 - y1));
+                                       } else {
+                                               dx3 = du3 = dv3 = 0;
+                                       }
+#endif
+#endif
+                               }
+                       }
+
+                       s32 xmin, xmax, ymin, ymax;
+                       xmin = gpu_senquack.DrawingArea[0];  xmax = gpu_senquack.DrawingArea[2];
+                       ymin = gpu_senquack.DrawingArea[1];  ymax = gpu_senquack.DrawingArea[3];
+
+                       // Section starts above the drawing area: step the edges and
+                       //  the texture coords forward to ymin and start there.
+                       if ((ymin - ya) > 0) {
+                               x3 += dx3 * (ymin - ya);
+                               x4 += dx4 * (ymin - ya);
+                               u3 += du3 * (ymin - ya);
+                               v3 += dv3 * (ymin - ya);
+                               ya = ymin;
+                       }
+
+                       // yb is exclusive: the scanline loop below runs (yb - ya) times
+                       if (yb > ymax) yb = ymax;
+
+                       int loop1 = yb - ya;
+                       if (loop1 <= 0)
+                               continue;
+
+                       // PixelBase points at the start of scanline ya in VRAM.
+                       // Scanline filters: a line is skipped if (ya & li) is nonzero or
+                       //  if (ya & pi) == pif. With progressive interlace disabled, pi
+                       //  is 0 and pif is 1, so the second test never matches.
+                       u16* PixelBase = &((u16*)gpu_senquack.vram)[FRAME_OFFSET(0, ya)];
+                       int li=gpu_senquack.ilace_mask;
+                       int pi=(ProgressiveInterlaceEnabled()?(gpu_senquack.ilace_mask+1):0);
+                       int pif=(ProgressiveInterlaceEnabled()?(gpu_senquack.prog_ilace_flag?(gpu_senquack.ilace_mask+1):0):1);
+
+                       // Skipped lines still run the loop increment, so ya, PixelBase,
+                       //  the edge positions and u3,v3 stay in step.
+                       for (; loop1; --loop1, ++ya, PixelBase += FRAME_WIDTH,
+                                       x3 += dx3, x4 += dx4,
+                                       u3 += du3, v3 += dv3 )
+                       {
+                               if (ya&li) continue;
+                               if ((ya&pi)==pif) continue;
+
+                               u32 u4, v4;
+
+                               xa = FixedCeilToInt(x3);  xb = FixedCeilToInt(x4);
+                               u4 = u3;  v4 = v3;
+
+                               // u3,v3 belong to the exact edge position x3; the span
+                               //  starts at the rounded-up pixel xa, so move u,v by the
+                               //  fractional distance between the two.
+                               fixed itmp = i2x(xa) - x3;
+                               if (itmp != 0) {
+                                       u4 += (du4 * itmp) >> FIXED_BITS;
+                                       v4 += (dv4 * itmp) >> FIXED_BITS;
+                               }
+
+                               // Bias both coords by half a fixed-point unit
+                               u4 += fixed_HALF;
+                               v4 += fixed_HALF;
+
+                               // Span starts left of the drawing area: advance u,v by
+                               //  the number of clipped pixels and start at xmin.
+                               if ((xmin - xa) > 0) {
+                                       u4 += du4 * (xmin - xa);
+                                       v4 += dv4 * (xmin - xa);
+                                       xa = xmin;
+                               }
+
+                               // Set u,v coords for inner driver
+                               gpu_senquack.u = u4;
+                               gpu_senquack.v = v4;
+
+                               // Span runs from xa up to (not including) xb; empty spans
+                               //  are not passed to the driver.
+                               if (xb > xmax) xb = xmax;
+                               if ((xb - xa) > 0)
+                                       gpuPolySpanDriver(gpu_senquack, PixelBase + xa, (xb - xa));
+                       }
+               }
+       } while (++cur_pass < total_passes);
+}
+
+/*----------------------------------------------------------------------
+gpuDrawPolyG - Gouraud-shaded, untextured poly
+
+Parameters:
+  packet            - GPU command packet; handed to polyInitVertexBuffer()
+                      together with POLYTYPE_G to fill the local vertex
+                      buffer.
+  gpuPolySpanDriver - inner span routine, called once per visible scanline
+                      as (gpu_senquack, pointer to first pixel, pixel count).
+  is_quad           - nonzero: two triangle passes are attempted;
+                      zero: a single pass.
+
+Per pass:
+  1. polyUseTriangle() picks three vertex pointers, or returns false to
+     skip the pass.
+  2. The per-pixel horizontal colour increments (dr4/dg4/db4) are derived
+     once for the whole triangle and packed into gpu_senquack.gInc.
+  3. The triangle is walked in two vertical sections (y0..y1, then y1..y2),
+     stepping a span-start edge (x3, carrying r3/g3/b3) and a span-end
+     edge (x4) one scanline at a time.
+  4. Each scanline is clipped against gpu_senquack.DrawingArea, its packed
+     start colour is stored in gpu_senquack.gCol, and the span driver is
+     called.
+
+Every division exists in four compile-time variants: float or integer
+arithmetic, each with a plain divide or with helper routines (FloatInv(),
+or xInv()/xInvMulx()/xLoDivx()). The variant is selected by
+GPU_UNAI_USE_FLOATMATH, GPU_UNAI_USE_FLOAT_DIV_MULTINV and
+GPU_UNAI_USE_INT_DIV_MULTINV.
+
+NOTE(review): the two-section walk presumably relies on the vertices
+being ordered so that y0 <= y1 <= y2 -- confirm in polyUseTriangle().
+----------------------------------------------------------------------*/
+void gpuDrawPolyG(const PtrUnion packet, const PP gpuPolySpanDriver, u32 is_quad)
+{
+       PolyVertex vbuf[4];
+       polyInitVertexBuffer(vbuf, packet, POLYTYPE_G, is_quad);
+
+       int total_passes = is_quad ? 2 : 1;  // one triangle pass, or two for a quad
+       int cur_pass = 0;
+       do
+       {
+               const PolyVertex* vptrs[3];
+               if (polyUseTriangle(vbuf, cur_pass, vptrs) == false)
+                       continue;  // jumps to the while() test, so cur_pass still advances
+
+               s32 xa, xb, ya, yb;  // span start/end column and section start/end row
+               s32 x3, dx3, x4, dx4, dx;  // x3/x4: span start/end edge X (fixed-point); dx3/dx4: their per-row steps
+               s32 r3, dr3, g3, dg3, b3, db3;  // colour along the x3 edge and its per-row steps
+               s32 x0, x1, x2, y0, y1, y2;
+               s32 r0, r1, r2, g0, g1, g2, b0, b1, b2;
+               s32 dr4, dg4, db4;  // colour change per pixel in X (after the division below)
+
+               x0 = vptrs[0]->x;      y0 = vptrs[0]->y;
+               r0 = vptrs[0]->col.r;  g0 = vptrs[0]->col.g;  b0 = vptrs[0]->col.b;
+               x1 = vptrs[1]->x;      y1 = vptrs[1]->y;
+               r1 = vptrs[1]->col.r;  g1 = vptrs[1]->col.g;  b1 = vptrs[1]->col.b;
+               x2 = vptrs[2]->x;      y2 = vptrs[2]->y;
+               r2 = vptrs[2]->col.r;  g2 = vptrs[2]->col.g;  b2 = vptrs[2]->col.b;
+
+               ya = y2 - y0;  // ya/yb serve as temporaries for the edge heights here
+               yb = y2 - y1;
+               dx4 = (x2 - x1) * ya - (x2 - x0) * yb;  // 2D cross product of edges v1->v2 and v0->v2
+               dr4 = (r2 - r1) * ya - (r2 - r0) * yb;  // same expression with each colour channel in place of X
+               dg4 = (g2 - g1) * ya - (g2 - g0) * yb;
+               db4 = (b2 - b1) * ya - (b2 - b0) * yb;
+               dx = dx4;  // keep the signed value: its sign selects the edge roles below
+               if (dx4 < 0) {  // make the divisor non-negative; the numerators flip with it
+                       dx4 = -dx4;
+                       dr4 = -dr4;
+                       dg4 = -dg4;
+                       db4 = -db4;
+               }
+
+#ifdef GPU_UNAI_USE_FLOATMATH
+#ifdef GPU_UNAI_USE_FLOAT_DIV_MULTINV
+               if (dx4 != 0) {  // dr4/dg4/db4 become fixed-point colour steps per pixel
+                       float finv = FloatInv(dx4);
+                       dr4 = (fixed)((dr4 << FIXED_BITS) * finv);
+                       dg4 = (fixed)((dg4 << FIXED_BITS) * finv);
+                       db4 = (fixed)((db4 << FIXED_BITS) * finv);
+               } else {  // zero cross product: no horizontal gradient
+                       dr4 = dg4 = db4 = 0;
+               }
+#else
+               if (dx4 != 0) {
+                       float fdiv = dx4;
+                       dr4 = (fixed)((dr4 << FIXED_BITS) / fdiv);
+                       dg4 = (fixed)((dg4 << FIXED_BITS) / fdiv);
+                       db4 = (fixed)((db4 << FIXED_BITS) / fdiv);
+               } else {
+                       dr4 = dg4 = db4 = 0;
+               }
+#endif
+#else  // Integer Division:
+#ifdef GPU_UNAI_USE_INT_DIV_MULTINV
+               if (dx4 != 0) {
+                       int iF, iS;
+                       xInv(dx4, iF, iS);
+                       dr4 = xInvMulx(dr4, iF, iS);
+                       dg4 = xInvMulx(dg4, iF, iS);
+                       db4 = xInvMulx(db4, iF, iS);
+               } else {
+                       dr4 = dg4 = db4 = 0;
+               }
+#else
+               if (dx4 != 0) {
+                       dr4 = GPU_FAST_DIV(dr4 << FIXED_BITS, dx4);
+                       dg4 = GPU_FAST_DIV(dg4 << FIXED_BITS, dx4);
+                       db4 = GPU_FAST_DIV(db4 << FIXED_BITS, dx4);
+               } else {
+                       dr4 = dg4 = db4 = 0;
+               }
+#endif
+#endif
+               // Setup packed Gouraud increment for inner driver
+               gpu_senquack.gInc = gpuPackGouraudColInc(dr4, dg4, db4);
+
+               for (s32 loop0 = 2; loop0; loop0--) {  // loop0==2: section y0..y1; loop0==1: section y1..y2
+                       if (loop0 == 2) {
+                               ya = y0;
+                               yb = y1;
+                               x3 = x4 = i2x(x0);  // both edges start at vertex 0
+                               r3 = i2x(r0);
+                               g3 = i2x(g0);
+                               b3 = i2x(b0);
+                               if (dx < 0) {  // x3 (with colour) follows edge v0->v2; x4 follows edge v0->v1
+#ifdef GPU_UNAI_USE_FLOATMATH
+#ifdef GPU_UNAI_USE_FLOAT_DIV_MULTINV
+                                       if ((y2 - y0) != 0) {
+                                               float finv = FloatInv(y2 - y0);
+                                               dx3 = (fixed)(((x2 - x0) << FIXED_BITS) * finv);
+                                               dr3 = (fixed)(((r2 - r0) << FIXED_BITS) * finv);
+                                               dg3 = (fixed)(((g2 - g0) << FIXED_BITS) * finv);
+                                               db3 = (fixed)(((b2 - b0) << FIXED_BITS) * finv);
+                                       } else {
+                                               dx3 = dr3 = dg3 = db3 = 0;
+                                       }
+                                       dx4 = ((y1 - y0) != 0) ? (fixed)(((x1 - x0) << FIXED_BITS) * FloatInv(y1 - y0)) : 0;
+#else
+                                       if ((y2 - y0) != 0) {
+                                               float fdiv = y2 - y0;
+                                               dx3 = (fixed)(((x2 - x0) << FIXED_BITS) / fdiv);
+                                               dr3 = (fixed)(((r2 - r0) << FIXED_BITS) / fdiv);
+                                               dg3 = (fixed)(((g2 - g0) << FIXED_BITS) / fdiv);
+                                               db3 = (fixed)(((b2 - b0) << FIXED_BITS) / fdiv);
+                                       } else {
+                                               dx3 = dr3 = dg3 = db3 = 0;
+                                       }
+                                       dx4 = ((y1 - y0) != 0) ? (fixed)(((x1 - x0) << FIXED_BITS) / (float)(y1 - y0)) : 0;
+#endif
+#else  // Integer Division:
+#ifdef GPU_UNAI_USE_INT_DIV_MULTINV
+                                       if ((y2 - y0) != 0) {
+                                               int iF, iS;
+                                               xInv((y2 - y0), iF, iS);
+                                               dx3 = xInvMulx((x2 - x0), iF, iS);
+                                               dr3 = xInvMulx((r2 - r0), iF, iS);
+                                               dg3 = xInvMulx((g2 - g0), iF, iS);
+                                               db3 = xInvMulx((b2 - b0), iF, iS);
+                                       } else {
+                                               dx3 = dr3 = dg3 = db3 = 0;
+                                       }
+                                       dx4 = ((y1 - y0) != 0) ? xLoDivx((x1 - x0), (y1 - y0)) : 0;
+#else
+                                       if ((y2 - y0) != 0) {
+                                               dx3 = GPU_FAST_DIV((x2 - x0) << FIXED_BITS, (y2 - y0));
+                                               dr3 = GPU_FAST_DIV((r2 - r0) << FIXED_BITS, (y2 - y0));
+                                               dg3 = GPU_FAST_DIV((g2 - g0) << FIXED_BITS, (y2 - y0));
+                                               db3 = GPU_FAST_DIV((b2 - b0) << FIXED_BITS, (y2 - y0));
+                                       } else {
+                                               dx3 = dr3 = dg3 = db3 = 0;
+                                       }
+                                       dx4 = ((y1 - y0) != 0) ? GPU_FAST_DIV((x1 - x0) << FIXED_BITS, (y1 - y0)) : 0;
+#endif
+#endif
+                               } else {  // x3 (with colour) follows edge v0->v1; x4 follows edge v0->v2
+#ifdef GPU_UNAI_USE_FLOATMATH
+#ifdef GPU_UNAI_USE_FLOAT_DIV_MULTINV
+                                       if ((y1 - y0) != 0) {
+                                               float finv = FloatInv(y1 - y0);
+                                               dx3 = (fixed)(((x1 - x0) << FIXED_BITS) * finv);
+                                               dr3 = (fixed)(((r1 - r0) << FIXED_BITS) * finv);
+                                               dg3 = (fixed)(((g1 - g0) << FIXED_BITS) * finv);
+                                               db3 = (fixed)(((b1 - b0) << FIXED_BITS) * finv);
+                                       } else {
+                                               dx3 = dr3 = dg3 = db3 = 0;
+                                       }
+                                       dx4 = ((y2 - y0) != 0) ? (fixed)(((x2 - x0) << FIXED_BITS) * FloatInv(y2 - y0)) : 0;
+#else
+                                       if ((y1 - y0) != 0) {
+                                               float fdiv = y1 - y0;
+                                               dx3 = (fixed)(((x1 - x0) << FIXED_BITS) / fdiv);
+                                               dr3 = (fixed)(((r1 - r0) << FIXED_BITS) / fdiv);
+                                               dg3 = (fixed)(((g1 - g0) << FIXED_BITS) / fdiv);
+                                               db3 = (fixed)(((b1 - b0) << FIXED_BITS) / fdiv);
+                                       } else {
+                                               dx3 = dr3 = dg3 = db3 = 0;
+                                       }
+                                       dx4 = ((y2 - y0) != 0) ? (fixed)(((x2 - x0) << FIXED_BITS) / (float)(y2 - y0)) : 0;
+#endif
+#else  // Integer Division:
+#ifdef GPU_UNAI_USE_INT_DIV_MULTINV
+                                       if ((y1 - y0) != 0) {
+                                               int iF, iS;
+                                               xInv((y1 - y0), iF, iS);
+                                               dx3 = xInvMulx((x1 - x0), iF, iS);
+                                               dr3 = xInvMulx((r1 - r0), iF, iS);
+                                               dg3 = xInvMulx((g1 - g0), iF, iS);
+                                               db3 = xInvMulx((b1 - b0), iF, iS);
+                                       } else {
+                                               dx3 = dr3 = dg3 = db3 = 0;
+                                       }
+                                       dx4 = ((y2 - y0) != 0) ? xLoDivx((x2 - x0), (y2 - y0)) : 0;
+#else
+                                       if ((y1 - y0) != 0) {
+                                               dx3 = GPU_FAST_DIV((x1 - x0) << FIXED_BITS, (y1 - y0));
+                                               dr3 = GPU_FAST_DIV((r1 - r0) << FIXED_BITS, (y1 - y0));
+                                               dg3 = GPU_FAST_DIV((g1 - g0) << FIXED_BITS, (y1 - y0));
+                                               db3 = GPU_FAST_DIV((b1 - b0) << FIXED_BITS, (y1 - y0));
+                                       } else {
+                                               dx3 = dr3 = dg3 = db3 = 0;
+                                       }
+                                       dx4 = ((y2 - y0) != 0) ? GPU_FAST_DIV((x2 - x0) << FIXED_BITS, (y2 - y0)) : 0;
+#endif
+#endif
+                               }
+                       } else {
+                               //senquack - break out of final loop if nothing to be drawn (1st loop
+                               //           must always be taken to setup dx3/dx4)
+                               if (y1 == y2) break;
+
+                               ya = y1;  yb = y2;
+
+                               if (dx < 0) {  // x3 stays on edge v0->v2 (steps kept from section 1); x4 switches to edge v1->v2
+                                       x3 = i2x(x0);  x4 = i2x(x1);
+                                       r3 = i2x(r0);  g3 = i2x(g0);  b3 = i2x(b0);
+
+                                       if ((y1 - y0) != 0) {  // restart from vertex 0 and advance by the section-1 steps to row y1
+                                               x3 += (dx3 * (y1 - y0));
+                                               r3 += (dr3 * (y1 - y0));
+                                               g3 += (dg3 * (y1 - y0));
+                                               b3 += (db3 * (y1 - y0));
+                                       }
+
+#ifdef GPU_UNAI_USE_FLOATMATH
+#ifdef GPU_UNAI_USE_FLOAT_DIV_MULTINV
+                                       dx4 = ((y2 - y1) != 0) ? (fixed)(((x2 - x1) << FIXED_BITS) * FloatInv(y2 - y1)) : 0;
+#else
+                                       dx4 = ((y2 - y1) != 0) ? (fixed)(((x2 - x1) << FIXED_BITS) / (float)(y2 - y1)) : 0;
+#endif
+#else  // Integer Division:
+#ifdef GPU_UNAI_USE_INT_DIV_MULTINV
+                                       dx4 = ((y2 - y1) != 0) ? xLoDivx((x2 - x1), (y2 - y1)) : 0;
+#else
+                                       dx4 = ((y2 - y1) != 0) ? GPU_FAST_DIV((x2 - x1) << FIXED_BITS, (y2 - y1)) : 0;
+#endif
+#endif
+                               } else {  // x3 (with colour) switches to edge v1->v2; x4 stays on edge v0->v2
+                                       x3 = i2x(x1);
+                                       x4 = i2x(x0) + (dx4 * (y1 - y0));  // dx4 still holds the v0->v2 step from section 1
+
+                                       r3 = i2x(r1);  g3 = i2x(g1);  b3 = i2x(b1);
+
+#ifdef GPU_UNAI_USE_FLOATMATH
+#ifdef GPU_UNAI_USE_FLOAT_DIV_MULTINV
+                                       if ((y2 - y1) != 0) {
+                                               float finv = FloatInv(y2 - y1);
+                                               dx3 = (fixed)(((x2 - x1) << FIXED_BITS) * finv);
+                                               dr3 = (fixed)(((r2 - r1) << FIXED_BITS) * finv);
+                                               dg3 = (fixed)(((g2 - g1) << FIXED_BITS) * finv);
+                                               db3 = (fixed)(((b2 - b1) << FIXED_BITS) * finv);
+                                       } else {
+                                               dx3 = dr3 = dg3 = db3 = 0;
+                                       }
+#else
+                                       if ((y2 - y1) != 0) {
+                                               float fdiv = y2 - y1;
+                                               dx3 = (fixed)(((x2 - x1) << FIXED_BITS) / fdiv);
+                                               dr3 = (fixed)(((r2 - r1) << FIXED_BITS) / fdiv);
+                                               dg3 = (fixed)(((g2 - g1) << FIXED_BITS) / fdiv);
+                                               db3 = (fixed)(((b2 - b1) << FIXED_BITS) / fdiv);
+                                       } else {
+                                               dx3 = dr3 = dg3 = db3 = 0;
+                                       }
+#endif
+#else  // Integer Division:
+#ifdef GPU_UNAI_USE_INT_DIV_MULTINV
+                                       if ((y2 - y1) != 0) {
+                                               int iF, iS;
+                                               xInv((y2 - y1), iF, iS);
+                                               dx3 = xInvMulx((x2 - x1), iF, iS);
+                                               dr3 = xInvMulx((r2 - r1), iF, iS);
+                                               dg3 = xInvMulx((g2 - g1), iF, iS);
+                                               db3 = xInvMulx((b2 - b1), iF, iS);
+                                       } else {
+                                               dx3 = dr3 = dg3 = db3 = 0;
+                                       }
+#else
+                                       if ((y2 - y1) != 0) {
+                                               dx3 = GPU_FAST_DIV((x2 - x1) << FIXED_BITS, (y2 - y1));
+                                               dr3 = GPU_FAST_DIV((r2 - r1) << FIXED_BITS, (y2 - y1));
+                                               dg3 = GPU_FAST_DIV((g2 - g1) << FIXED_BITS, (y2 - y1));
+                                               db3 = GPU_FAST_DIV((b2 - b1) << FIXED_BITS, (y2 - y1));
+                                       } else {
+                                               dx3 = dr3 = dg3 = db3 = 0;
+                                       }
+#endif
+#endif
+                               }
+                       }
+
+                       s32 xmin, xmax, ymin, ymax;  // DrawingArea is read as [0]=xmin, [1]=ymin, [2]=xmax, [3]=ymax
+                       xmin = gpu_senquack.DrawingArea[0];  xmax = gpu_senquack.DrawingArea[2];
+                       ymin = gpu_senquack.DrawingArea[1];  ymax = gpu_senquack.DrawingArea[3];
+
+                       if ((ymin - ya) > 0) {  // section starts before ymin: advance every interpolant to row ymin
+                               x3 += (dx3 * (ymin - ya));
+                               x4 += (dx4 * (ymin - ya));
+                               r3 += (dr3 * (ymin - ya));
+                               g3 += (dg3 * (ymin - ya));
+                               b3 += (db3 * (ymin - ya));
+                               ya = ymin;
+                       }
+
+                       if (yb > ymax) yb = ymax;  // yb is exclusive: rows ya..yb-1 are visited
+
+                       int loop1 = yb - ya;  // number of rows left in this section after clipping
+                       if (loop1 <= 0)
+                               continue;  // nothing visible here; go on to the next loop0 iteration
+
+                       u16* PixelBase = &((u16*)gpu_senquack.vram)[FRAME_OFFSET(0, ya)];  // row pointer; advanced by FRAME_WIDTH per row below
+                       int li=gpu_senquack.ilace_mask;  // rows with (ya & li) != 0 are skipped
+                       int pi=(ProgressiveInterlaceEnabled()?(gpu_senquack.ilace_mask+1):0);  // rows with (ya & pi) == pif are skipped
+                       int pif=(ProgressiveInterlaceEnabled()?(gpu_senquack.prog_ilace_flag?(gpu_senquack.ilace_mask+1):0):1);  // when disabled: pi=0, pif=1, so that test never matches
+
+                       for (; loop1; --loop1, ++ya, PixelBase += FRAME_WIDTH,
+                                       x3 += dx3, x4 += dx4,
+                                       r3 += dr3, g3 += dg3, b3 += db3 )
+                       {
+                               if (ya&li) continue;  // the for-increment still runs, so the interpolants stay in step
+                               if ((ya&pi)==pif) continue;
+
+                               u32 r4, g4, b4;  // colour at the first pixel of this span
+
+                               xa = FixedCeilToInt(x3);  // first pixel column of the span
+                               xb = FixedCeilToInt(x4);  // one past the last pixel column
+                               r4 = r3;  g4 = g3;  b4 = b3;
+
+                               fixed itmp = i2x(xa) - x3;  // fixed-point distance from the x3 edge to column xa
+                               if (itmp != 0) {  // move the colour from the edge position to column xa
+                                       r4 += (dr4 * itmp) >> FIXED_BITS;
+                                       g4 += (dg4 * itmp) >> FIXED_BITS;
+                                       b4 += (db4 * itmp) >> FIXED_BITS;
+                               }
+
+                               r4 += fixed_HALF;  // NOTE(review): presumably a rounding bias -- confirm
+                               g4 += fixed_HALF;
+                               b4 += fixed_HALF;
+
+                               if ((xmin - xa) > 0) {  // span starts left of xmin: step the colour ahead and clamp
+                                       r4 += (dr4 * (xmin - xa));
+                                       g4 += (dg4 * (xmin - xa));
+                                       b4 += (db4 * (xmin - xa));
+                                       xa = xmin;
+                               }
+
+                               // Setup packed Gouraud color for inner driver
+                               gpu_senquack.gCol = gpuPackGouraudCol(r4, g4, b4);
+
+                               if (xb > xmax) xb = xmax;  // xb is exclusive: columns xa..xb-1 are drawn
+                               if ((xb - xa) > 0)
+                                       gpuPolySpanDriver(gpu_senquack, PixelBase + xa, (xb - xa));
+                       }
+               }
+       } while (++cur_pass < total_passes);
+}
+
+/*----------------------------------------------------------------------
+gpuDrawPolyGT - Gouraud-shaded, textured poly
+----------------------------------------------------------------------*/
+void gpuDrawPolyGT(const PtrUnion packet, const PP gpuPolySpanDriver, u32 is_quad)
+{
+       PolyVertex vbuf[4];
+       polyInitVertexBuffer(vbuf, packet, POLYTYPE_GT, is_quad);
+
+       int total_passes = is_quad ? 2 : 1;
+       int cur_pass = 0;
+       do
+       {
+               const PolyVertex* vptrs[3];
+               if (polyUseTriangle(vbuf, cur_pass, vptrs) == false)
+                       continue;
+
+               s32 xa, xb, ya, yb;
+               s32 x3, dx3, x4, dx4, dx;
+               s32 u3, du3, v3, dv3;
+               s32 r3, dr3, g3, dg3, b3, db3;
+               s32 x0, x1, x2, y0, y1, y2;
+               s32 u0, u1, u2, v0, v1, v2;
+               s32 r0, r1, r2, g0, g1, g2, b0, b1, b2;
+               s32 du4, dv4;
+               s32 dr4, dg4, db4;
+
+               x0 = vptrs[0]->x;      y0 = vptrs[0]->y;
+               u0 = vptrs[0]->tex.u;  v0 = vptrs[0]->tex.v;
+               r0 = vptrs[0]->col.r;  g0 = vptrs[0]->col.g;  b0 = vptrs[0]->col.b;
+               x1 = vptrs[1]->x;      y1 = vptrs[1]->y;
+               u1 = vptrs[1]->tex.u;  v1 = vptrs[1]->tex.v;
+               r1 = vptrs[1]->col.r;  g1 = vptrs[1]->col.g;  b1 = vptrs[1]->col.b;
+               x2 = vptrs[2]->x;      y2 = vptrs[2]->y;
+               u2 = vptrs[2]->tex.u;  v2 = vptrs[2]->tex.v;
+               r2 = vptrs[2]->col.r;  g2 = vptrs[2]->col.g;  b2 = vptrs[2]->col.b;
+
+               ya = y2 - y0;
+               yb = y2 - y1;
+               dx4 = (x2 - x1) * ya - (x2 - x0) * yb;
+               du4 = (u2 - u1) * ya - (u2 - u0) * yb;
+               dv4 = (v2 - v1) * ya - (v2 - v0) * yb;
+               dr4 = (r2 - r1) * ya - (r2 - r0) * yb;
+               dg4 = (g2 - g1) * ya - (g2 - g0) * yb;
+               db4 = (b2 - b1) * ya - (b2 - b0) * yb;
+               dx = dx4;
+               if (dx4 < 0) {
+                       dx4 = -dx4;
+                       du4 = -du4;
+                       dv4 = -dv4;
+                       dr4 = -dr4;
+                       dg4 = -dg4;
+                       db4 = -db4;
+               }
+
+#ifdef GPU_UNAI_USE_FLOATMATH
+#ifdef GPU_UNAI_USE_FLOAT_DIV_MULTINV
+               if (dx4 != 0) {
+                       float finv = FloatInv(dx4);
+                       du4 = (fixed)((du4 << FIXED_BITS) * finv);
+                       dv4 = (fixed)((dv4 << FIXED_BITS) * finv);
+                       dr4 = (fixed)((dr4 << FIXED_BITS) * finv);
+                       dg4 = (fixed)((dg4 << FIXED_BITS) * finv);
+                       db4 = (fixed)((db4 << FIXED_BITS) * finv);
+               } else {
+                       du4 = dv4 = dr4 = dg4 = db4 = 0;
+               }
+#else
+               if (dx4 != 0) {
+                       float fdiv = dx4;
+                       du4 = (fixed)((du4 << FIXED_BITS) / fdiv);
+                       dv4 = (fixed)((dv4 << FIXED_BITS) / fdiv);
+                       dr4 = (fixed)((dr4 << FIXED_BITS) / fdiv);
+                       dg4 = (fixed)((dg4 << FIXED_BITS) / fdiv);
+                       db4 = (fixed)((db4 << FIXED_BITS) / fdiv);
+               } else {
+                       du4 = dv4 = dr4 = dg4 = db4 = 0;
+               }
+#endif
+#else  // Integer Division:
+#ifdef GPU_UNAI_USE_INT_DIV_MULTINV
+               if (dx4 != 0) {
+                       int iF, iS;
+                       xInv(dx4, iF, iS);
+                       du4 = xInvMulx(du4, iF, iS);
+                       dv4 = xInvMulx(dv4, iF, iS);
+                       dr4 = xInvMulx(dr4, iF, iS);
+                       dg4 = xInvMulx(dg4, iF, iS);
+                       db4 = xInvMulx(db4, iF, iS);
+               } else {
+                       du4 = dv4 = dr4 = dg4 = db4 = 0;
+               }
+#else
+               if (dx4 != 0) {
+                       du4 = GPU_FAST_DIV(du4 << FIXED_BITS, dx4);
+                       dv4 = GPU_FAST_DIV(dv4 << FIXED_BITS, dx4);
+                       dr4 = GPU_FAST_DIV(dr4 << FIXED_BITS, dx4);
+                       dg4 = GPU_FAST_DIV(dg4 << FIXED_BITS, dx4);
+                       db4 = GPU_FAST_DIV(db4 << FIXED_BITS, dx4);
+               } else {
+                       du4 = dv4 = dr4 = dg4 = db4 = 0;
+               }
+#endif
+#endif
+               // Set u,v increments and packed Gouraud increment for inner driver
+               gpu_senquack.u_inc = du4;
+               gpu_senquack.v_inc = dv4;
+               gpu_senquack.gInc = gpuPackGouraudColInc(dr4, dg4, db4);
+
+               for (s32 loop0 = 2; loop0; loop0--) {
+                       if (loop0 == 2) {
+                               ya = y0;  yb = y1;
+                               x3 = x4 = i2x(x0);
+                               u3 = i2x(u0);  v3 = i2x(v0);
+                               r3 = i2x(r0);  g3 = i2x(g0);  b3 = i2x(b0);
+                               if (dx < 0) {
+#ifdef GPU_UNAI_USE_FLOATMATH
+#ifdef GPU_UNAI_USE_FLOAT_DIV_MULTINV
+                                       if ((y2 - y0) != 0) {
+                                               float finv = FloatInv(y2 - y0);
+                                               dx3 = (fixed)(((x2 - x0) << FIXED_BITS) * finv);
+                                               du3 = (fixed)(((u2 - u0) << FIXED_BITS) * finv);
+                                               dv3 = (fixed)(((v2 - v0) << FIXED_BITS) * finv);
+                                               dr3 = (fixed)(((r2 - r0) << FIXED_BITS) * finv);
+                                               dg3 = (fixed)(((g2 - g0) << FIXED_BITS) * finv);
+                                               db3 = (fixed)(((b2 - b0) << FIXED_BITS) * finv);
+                                       } else {
+                                               dx3 = du3 = dv3 = dr3 = dg3 = db3 = 0;
+                                       }
+                                       dx4 = ((y1 - y0) != 0) ? (fixed)(((x1 - x0) << FIXED_BITS) * FloatInv(y1 - y0)) : 0;
+#else
+                                       if ((y2 - y0) != 0) {
+                                               float fdiv = y2 - y0;
+                                               dx3 = (fixed)(((x2 - x0) << FIXED_BITS) / fdiv);
+                                               du3 = (fixed)(((u2 - u0) << FIXED_BITS) / fdiv);
+                                               dv3 = (fixed)(((v2 - v0) << FIXED_BITS) / fdiv);
+                                               dr3 = (fixed)(((r2 - r0) << FIXED_BITS) / fdiv);
+                                               dg3 = (fixed)(((g2 - g0) << FIXED_BITS) / fdiv);
+                                               db3 = (fixed)(((b2 - b0) << FIXED_BITS) / fdiv);
+                                       } else {
+                                               dx3 = du3 = dv3 = dr3 = dg3 = db3 = 0;
+                                       }
+                                       dx4 = ((y1 - y0) != 0) ? (fixed)(((x1 - x0) << FIXED_BITS) / (float)(y1 - y0)) : 0;
+#endif
+#else  // Integer Division:
+#ifdef GPU_UNAI_USE_INT_DIV_MULTINV
+                                       if ((y2 - y0) != 0) {
+                                               int iF, iS;
+                                               xInv((y2 - y0), iF, iS);
+                                               dx3 = xInvMulx((x2 - x0), iF, iS);
+                                               du3 = xInvMulx((u2 - u0), iF, iS);
+                                               dv3 = xInvMulx((v2 - v0), iF, iS);
+                                               dr3 = xInvMulx((r2 - r0), iF, iS);
+                                               dg3 = xInvMulx((g2 - g0), iF, iS);
+                                               db3 = xInvMulx((b2 - b0), iF, iS);
+                                       } else {
+                                               dx3 = du3 = dv3 = dr3 = dg3 = db3 = 0;
+                                       }
+                                       dx4 = ((y1 - y0) != 0) ? xLoDivx((x1 - x0), (y1 - y0)) : 0;
+#else
+                                       if ((y2 - y0) != 0) {
+                                               dx3 = GPU_FAST_DIV((x2 - x0) << FIXED_BITS, (y2 - y0));
+                                               du3 = GPU_FAST_DIV((u2 - u0) << FIXED_BITS, (y2 - y0));
+                                               dv3 = GPU_FAST_DIV((v2 - v0) << FIXED_BITS, (y2 - y0));
+                                               dr3 = GPU_FAST_DIV((r2 - r0) << FIXED_BITS, (y2 - y0));
+                                               dg3 = GPU_FAST_DIV((g2 - g0) << FIXED_BITS, (y2 - y0));
+                                               db3 = GPU_FAST_DIV((b2 - b0) << FIXED_BITS, (y2 - y0));
+                                       } else {
+                                               dx3 = du3 = dv3 = dr3 = dg3 = db3 = 0;
+                                       }
+                                       dx4 = ((y1 - y0) != 0) ? GPU_FAST_DIV((x1 - x0) << FIXED_BITS, (y1 - y0)) : 0;
+#endif
+#endif
+                               } else {
+#ifdef GPU_UNAI_USE_FLOATMATH
+#ifdef GPU_UNAI_USE_FLOAT_DIV_MULTINV
+                                       if ((y1 - y0) != 0) {
+                                               float finv = FloatInv(y1 - y0);
+                                               dx3 = (fixed)(((x1 - x0) << FIXED_BITS) * finv);
+                                               du3 = (fixed)(((u1 - u0) << FIXED_BITS) * finv);
+                                               dv3 = (fixed)(((v1 - v0) << FIXED_BITS) * finv);
+                                               dr3 = (fixed)(((r1 - r0) << FIXED_BITS) * finv);
+                                               dg3 = (fixed)(((g1 - g0) << FIXED_BITS) * finv);
+                                               db3 = (fixed)(((b1 - b0) << FIXED_BITS) * finv);
+                                       } else {
+                                               dx3 = du3 = dv3 = dr3 = dg3 = db3 = 0;
+                                       }
+                                       dx4 = ((y2 - y0) != 0) ? (fixed)(((x2 - x0) << FIXED_BITS) * FloatInv(y2 - y0)) : 0;
+#else
+                                       if ((y1 - y0) != 0) {
+                                               float fdiv = y1 - y0;
+                                               dx3 = (fixed)(((x1 - x0) << FIXED_BITS) / fdiv);
+                                               du3 = (fixed)(((u1 - u0) << FIXED_BITS) / fdiv);
+                                               dv3 = (fixed)(((v1 - v0) << FIXED_BITS) / fdiv);
+                                               dr3 = (fixed)(((r1 - r0) << FIXED_BITS) / fdiv);
+                                               dg3 = (fixed)(((g1 - g0) << FIXED_BITS) / fdiv);
+                                               db3 = (fixed)(((b1 - b0) << FIXED_BITS) / fdiv);
+                                       } else {
+                                               dx3 = du3 = dv3 = dr3 = dg3 = db3 = 0;
+                                       }
+                                       dx4 = ((y2 - y0) != 0) ? (fixed)(((x2 - x0) << FIXED_BITS) / float(y2 - y0)) : 0;
+#endif
+#else  // Integer Division:
+#ifdef GPU_UNAI_USE_INT_DIV_MULTINV
+                                       if ((y1 - y0) != 0) {
+                                               int iF, iS;
+                                               xInv((y1 - y0), iF, iS);
+                                               dx3 = xInvMulx((x1 - x0), iF, iS);
+                                               du3 = xInvMulx((u1 - u0), iF, iS);
+                                               dv3 = xInvMulx((v1 - v0), iF, iS);
+                                               dr3 = xInvMulx((r1 - r0), iF, iS);
+                                               dg3 = xInvMulx((g1 - g0), iF, iS);
+                                               db3 = xInvMulx((b1 - b0), iF, iS);
+                                       } else {
+                                               dx3 = du3 = dv3 = dr3 = dg3 = db3 = 0;
+                                       }
+                                       dx4 = ((y2 - y0) != 0) ? xLoDivx((x2 - x0), (y2 - y0)) : 0;
+#else
+                                       if ((y1 - y0) != 0) {
+                                               dx3 = GPU_FAST_DIV((x1 - x0) << FIXED_BITS, (y1 - y0));
+                                               du3 = GPU_FAST_DIV((u1 - u0) << FIXED_BITS, (y1 - y0));
+                                               dv3 = GPU_FAST_DIV((v1 - v0) << FIXED_BITS, (y1 - y0));
+                                               dr3 = GPU_FAST_DIV((r1 - r0) << FIXED_BITS, (y1 - y0));
+                                               dg3 = GPU_FAST_DIV((g1 - g0) << FIXED_BITS, (y1 - y0));
+                                               db3 = GPU_FAST_DIV((b1 - b0) << FIXED_BITS, (y1 - y0));
+                                       } else {
+                                               dx3 = du3 = dv3 = dr3 = dg3 = db3 = 0;
+                                       }
+                                       dx4 = ((y2 - y0) != 0) ? GPU_FAST_DIV((x2 - x0) << FIXED_BITS, (y2 - y0)) : 0;
+#endif
+#endif
+                               }
+                       } else {
+                               //senquack - break out of final loop if nothing to be drawn (1st loop
+                               //           must always be taken to setup dx3/dx4)
+                               if (y1 == y2) break;
+
+                               ya = y1;  yb = y2;
+
+                               if (dx < 0) {
+                                       x3 = i2x(x0);  x4 = i2x(x1);
+                                       u3 = i2x(u0);  v3 = i2x(v0);
+                                       r3 = i2x(r0);  g3 = i2x(g0);  b3 = i2x(b0);
+
+                                       if ((y1 - y0) != 0) {
+                                               x3 += (dx3 * (y1 - y0));
+                                               u3 += (du3 * (y1 - y0));
+                                               v3 += (dv3 * (y1 - y0));
+                                               r3 += (dr3 * (y1 - y0));
+                                               g3 += (dg3 * (y1 - y0));
+                                               b3 += (db3 * (y1 - y0));
+                                       }
+
+#ifdef GPU_UNAI_USE_FLOATMATH
+#ifdef GPU_UNAI_USE_FLOAT_DIV_MULTINV
+                                       dx4 = ((y2 - y1) != 0) ? (fixed)(((x2 - x1) << FIXED_BITS) * FloatInv(y2 - y1)) : 0;
+#else
+                                       dx4 = ((y2 - y1) != 0) ? (fixed)(((x2 - x1) << FIXED_BITS) / (float)(y2 - y1)) : 0;
+#endif
+#else  // Integer Division:
+#ifdef GPU_UNAI_USE_INT_DIV_MULTINV
+                                       dx4 = ((y2 - y1) != 0) ? xLoDivx((x2 - x1), (y2 - y1)) : 0;
+#else
+                                       dx4 = ((y2 - y1) != 0) ? GPU_FAST_DIV((x2 - x1) << FIXED_BITS, (y2 - y1)) : 0;
+#endif
+#endif
+                               } else {
+                                       x3 = i2x(x1);
+                                       x4 = i2x(x0) + (dx4 * (y1 - y0));
+
+                                       u3 = i2x(u1);  v3 = i2x(v1);
+                                       r3 = i2x(r1);  g3 = i2x(g1);  b3 = i2x(b1);
+#ifdef GPU_UNAI_USE_FLOATMATH
+#ifdef GPU_UNAI_USE_FLOAT_DIV_MULTINV
+                                       if ((y2 - y1) != 0) {
+                                               float finv = FloatInv(y2 - y1);
+                                               dx3 = (fixed)(((x2 - x1) << FIXED_BITS) * finv);
+                                               du3 = (fixed)(((u2 - u1) << FIXED_BITS) * finv);
+                                               dv3 = (fixed)(((v2 - v1) << FIXED_BITS) * finv);
+                                               dr3 = (fixed)(((r2 - r1) << FIXED_BITS) * finv);
+                                               dg3 = (fixed)(((g2 - g1) << FIXED_BITS) * finv);
+                                               db3 = (fixed)(((b2 - b1) << FIXED_BITS) * finv);
+                                       } else {
+                                               dx3 = du3 = dv3 = dr3 = dg3 = db3 = 0;
+                                       }
+#else
+                                       if ((y2 - y1) != 0) {
+                                               float fdiv = y2 - y1;
+                                               dx3 = (fixed)(((x2 - x1) << FIXED_BITS) / fdiv);
+                                               du3 = (fixed)(((u2 - u1) << FIXED_BITS) / fdiv);
+                                               dv3 = (fixed)(((v2 - v1) << FIXED_BITS) / fdiv);
+                                               dr3 = (fixed)(((r2 - r1) << FIXED_BITS) / fdiv);
+                                               dg3 = (fixed)(((g2 - g1) << FIXED_BITS) / fdiv);
+                                               db3 = (fixed)(((b2 - b1) << FIXED_BITS) / fdiv);
+                                       } else {
+                                               dx3 = du3 = dv3 = dr3 = dg3 = db3 = 0;
+                                       }
+#endif
+#else  // Integer Division:
+#ifdef GPU_UNAI_USE_INT_DIV_MULTINV
+                                       if ((y2 - y1) != 0) {
+                                               int iF, iS;
+                                               xInv((y2 - y1), iF, iS);
+                                               dx3 = xInvMulx((x2 - x1), iF, iS);
+                                               du3 = xInvMulx((u2 - u1), iF, iS);
+                                               dv3 = xInvMulx((v2 - v1), iF, iS);
+                                               dr3 = xInvMulx((r2 - r1), iF, iS);
+                                               dg3 = xInvMulx((g2 - g1), iF, iS);
+                                               db3 = xInvMulx((b2 - b1), iF, iS);
+                                       } else {
+                                               dx3 = du3 = dv3 = dr3 = dg3 = db3 = 0;
+                                       }
+#else
+                                       if ((y2 - y1) != 0) {
+                                               dx3 = GPU_FAST_DIV((x2 - x1) << FIXED_BITS, (y2 - y1));
+                                               du3 = GPU_FAST_DIV((u2 - u1) << FIXED_BITS, (y2 - y1));
+                                               dv3 = GPU_FAST_DIV((v2 - v1) << FIXED_BITS, (y2 - y1));
+                                               dr3 = GPU_FAST_DIV((r2 - r1) << FIXED_BITS, (y2 - y1));
+                                               dg3 = GPU_FAST_DIV((g2 - g1) << FIXED_BITS, (y2 - y1));
+                                               db3 = GPU_FAST_DIV((b2 - b1) << FIXED_BITS, (y2 - y1));
+                                       } else {
+                                               dx3 = du3 = dv3 = dr3 = dg3 = db3 = 0;
+                                       }
+#endif
+#endif
+                               }
+                       }
+
+                       s32 xmin, xmax, ymin, ymax;
+                       xmin = gpu_senquack.DrawingArea[0];  xmax = gpu_senquack.DrawingArea[2];
+                       ymin = gpu_senquack.DrawingArea[1];  ymax = gpu_senquack.DrawingArea[3];
+
+                       if ((ymin - ya) > 0) {
+                               x3 += (dx3 * (ymin - ya));
+                               x4 += (dx4 * (ymin - ya));
+                               u3 += (du3 * (ymin - ya));
+                               v3 += (dv3 * (ymin - ya));
+                               r3 += (dr3 * (ymin - ya));
+                               g3 += (dg3 * (ymin - ya));
+                               b3 += (db3 * (ymin - ya));
+                               ya = ymin;
+                       }
+
+                       if (yb > ymax) yb = ymax;
+
+                       int loop1 = yb - ya;
+                       if (loop1 <= 0)
+                               continue;
+
+                       u16* PixelBase = &((u16*)gpu_senquack.vram)[FRAME_OFFSET(0, ya)];
+                       int li=gpu_senquack.ilace_mask;
+                       int pi=(ProgressiveInterlaceEnabled()?(gpu_senquack.ilace_mask+1):0);
+                       int pif=(ProgressiveInterlaceEnabled()?(gpu_senquack.prog_ilace_flag?(gpu_senquack.ilace_mask+1):0):1);
+
+                       for (; loop1; --loop1, ++ya, PixelBase += FRAME_WIDTH,
+                                       x3 += dx3, x4 += dx4,
+                                       u3 += du3, v3 += dv3,
+                                       r3 += dr3, g3 += dg3, b3 += db3 )
+                       {
+                               if (ya&li) continue;
+                               if ((ya&pi)==pif) continue;
+
+                               u32 u4, v4;
+                               u32 r4, g4, b4;
+
+                               xa = FixedCeilToInt(x3);
+                               xb = FixedCeilToInt(x4);
+                               u4 = u3;  v4 = v3;
+                               r4 = r3;  g4 = g3;  b4 = b3;
+
+                               fixed itmp = i2x(xa) - x3;
+                               if (itmp != 0) {
+                                       u4 += (du4 * itmp) >> FIXED_BITS;
+                                       v4 += (dv4 * itmp) >> FIXED_BITS;
+                                       r4 += (dr4 * itmp) >> FIXED_BITS;
+                                       g4 += (dg4 * itmp) >> FIXED_BITS;
+                                       b4 += (db4 * itmp) >> FIXED_BITS;
+                               }
+
+                               u4 += fixed_HALF;
+                               v4 += fixed_HALF;
+                               r4 += fixed_HALF;
+                               g4 += fixed_HALF;
+                               b4 += fixed_HALF;
+
+                               if ((xmin - xa) > 0) {
+                                       u4 += du4 * (xmin - xa);
+                                       v4 += dv4 * (xmin - xa);
+                                       r4 += dr4 * (xmin - xa);
+                                       g4 += dg4 * (xmin - xa);
+                                       b4 += db4 * (xmin - xa);
+                                       xa = xmin;
+                               }
+
+                               // Set packed Gouraud color and u,v coords for inner driver
+                               gpu_senquack.u = u4;
+                               gpu_senquack.v = v4;
+                               gpu_senquack.gCol = gpuPackGouraudCol(r4, g4, b4);
+
+                               if (xb > xmax) xb = xmax;
+                               if ((xb - xa) > 0)
+                                       gpuPolySpanDriver(gpu_senquack, PixelBase + xa, (xb - xa));
+                       }
+               }
+       } while (++cur_pass < total_passes);
+}
+
+#endif /* __GPU_UNAI_GPU_RASTER_POLYGON_H__ */
diff --git a/plugins/gpu_senquack/gpu_raster_sprite.h b/plugins/gpu_senquack/gpu_raster_sprite.h

new file mode 100644 (file)

index 0000000..ddbad67
--- /dev/null
+++ b/plugins/gpu_senquack/gpu_raster_sprite.h
@@ -0,0 +1,170 @@
+/***************************************************************************
+*   Copyright (C) 2010 PCSX4ALL Team                                      *
+*   Copyright (C) 2010 Unai                                               *
+*                                                                         *
+*   This program is free software; you can redistribute it and/or modify  *
+*   it under the terms of the GNU General Public License as published by  *
+*   the Free Software Foundation; either version 2 of the License, or     *
+*   (at your option) any later version.                                   *
+*                                                                         *
+*   This program is distributed in the hope that it will be useful,       *
+*   but WITHOUT ANY WARRANTY; without even the implied warranty of        *
+*   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the         *
+*   GNU General Public License for more details.                          *
+*                                                                         *
+*   You should have received a copy of the GNU General Public License     *
+*   along with this program; if not, write to the                         *
+*   Free Software Foundation, Inc.,                                       *
+*   51 Franklin Street, Fifth Floor, Boston, MA 02111-1307 USA.           *
+***************************************************************************/
+
+#ifndef __GPU_UNAI_GPU_RASTER_SPRITE_H__
+#define __GPU_UNAI_GPU_RASTER_SPRITE_H__
+
+///////////////////////////////////////////////////////////////////////////////
+//  GPU internal sprite drawing functions
+
+// Draws a textured sprite (screen-aligned rectangle), one scanline at a time.
+//
+//  packet:              command data. Read here: U1[0..2] = color bytes,
+//                       S2[2..3] = X,Y position, U1[8..9] = texture U,V,
+//                       U2[6..7] = width,height.
+//  gpuSpriteSpanDriver: inner-loop function, invoked once per drawn scanline
+//                       as (dest pixel ptr, pixel count, texture row ptr, u).
+//
+// The rectangle is clipped to gpu_senquack.DrawingArea (right/bottom edges
+// are treated as exclusive) and U,V are advanced by whatever was clipped off
+// the left/top. When the clipped rectangle is empty nothing is drawn and the
+// r5/g5/b5 light values are left untouched.
+void gpuDrawS(PtrUnion packet, const PS gpuSpriteSpanDriver)
+{
+       const s32 clip_x0 = gpu_senquack.DrawingArea[0];
+       const s32 clip_y0 = gpu_senquack.DrawingArea[1];
+       const s32 clip_x1 = gpu_senquack.DrawingArea[2];
+       const s32 clip_y1 = gpu_senquack.DrawingArea[3];
+
+       //NOTE: Must 11-bit sign-extend the whole sum here, not just packet X/Y,
+       // or sprites in 1st level of SkullMonkeys disappear when walking right.
+       // This now matches behavior of Mednafen and PCSX Rearmed's gpu_neon:
+       s32 left = GPU_EXPANDSIGN(packet.S2[2] + gpu_senquack.DrawingOffset[0]);
+       s32 top  = GPU_EXPANDSIGN(packet.S2[3] + gpu_senquack.DrawingOffset[1]);
+
+       const u32 width  = packet.U2[6] & 0x3ff; // Max width is 1023
+       const u32 height = packet.U2[7] & 0x1ff; // Max height is 511
+       s32 right  = left + width;
+       s32 bottom = top + height;
+
+       u32 tex_u = packet.U1[8];
+       u32 tex_v = packet.U1[9];
+
+       // Vertical clip: rows cut from the top also skip texture rows
+       if (top < clip_y0) {
+               tex_v += clip_y0 - top;
+               top = clip_y0;
+       }
+       if (bottom > clip_y1) bottom = clip_y1;
+       if (bottom <= top) return;
+
+       // Horizontal clip: columns cut from the left also skip texels
+       if (left < clip_x0) {
+               tex_u += clip_x0 - left;
+               left = clip_x0;
+       }
+       if (right > clip_x1) right = clip_x1;
+       const s32 span = right - left;
+       if (span <= 0) return;
+
+       // 5-bit light values taken from the command's color bytes
+       gpu_senquack.r5 = packet.U1[0] >> 3;
+       gpu_senquack.g5 = packet.U1[1] >> 3;
+       gpu_senquack.b5 = packet.U1[2] >> 3;
+
+       // Line-skip masks: a scanline is skipped when (y & li) is non-zero or
+       // when (y & pi) == pif
+       const bool prog_ilace = ProgressiveInterlaceEnabled();
+       const int li  = gpu_senquack.ilace_mask;
+       const int pi  = prog_ilace ? (gpu_senquack.ilace_mask + 1) : 0;
+       const int pif = prog_ilace
+                     ? (gpu_senquack.prog_ilace_flag ? (gpu_senquack.ilace_mask + 1) : 0)
+                     : 1;
+
+       const unsigned int tmode = gpu_senquack.TEXT_MODE >> 5;
+       const u32 v_mask = gpu_senquack.TextureWindow[3];
+       u8 *tex_base = (u8*)gpu_senquack.TBA;
+
+       // Texture is accessed byte-wise, so adjust idx if 16bpp
+       if (tmode == 3) tex_u <<= 1;
+
+       u16 *dst = &((u16*)gpu_senquack.vram)[FRAME_OFFSET(left, top)];
+       for (s32 y = top; y < bottom; ++y, ++tex_v, dst += FRAME_WIDTH) {
+               if ((y & li) || (y & pi) == pif)
+                       continue;
+
+               // Each texture row is 2048 bytes apart; v wraps via the window mask
+               u8 *tex_row = tex_base + ((tex_v & v_mask) * 2048);
+               gpuSpriteSpanDriver(dst, span, tex_row, tex_u);
+       }
+}
+
+#ifdef __arm__
+#include "gpu_arm.h"
+
+/* Notaz 4bit sprites optimization */
+/* Notaz 4bit sprites optimization.
+ *
+ * Draws a 16x16 sprite through draw_spr16_full() when the sprite lies fully
+ * inside the drawing area horizontally, its U and V are both multiples of 16
+ * and bit 3 is set in both texture window masks. Every other case is handed
+ * to the generic gpuDrawS() with the packet's size word forced to 16x16.
+ *
+ * Vertical clipping treats DrawingArea[3] as exclusive, like gpuDrawS().
+ * NOTE(review): unlike gpuDrawS(), the fast path does not consult
+ * ilace_mask / progressive interlace -- confirm that this is intended.
+ */
+void gpuDrawS16(PtrUnion packet)
+{
+       s32 x0, y0, y1;
+       s32 u0, v0;
+       s32 xmin, xmax;
+       s32 ymin, ymax;
+
+       //NOTE: Must 11-bit sign-extend the whole sum here, not just packet X/Y,
+       // or sprites in 1st level of SkullMonkeys disappear when walking right.
+       // This now matches behavior of Mednafen and PCSX Rearmed's gpu_neon:
+       x0 = GPU_EXPANDSIGN(packet.S2[2] + gpu_senquack.DrawingOffset[0]);
+       y0 = GPU_EXPANDSIGN(packet.S2[3] + gpu_senquack.DrawingOffset[1]);
+
+       xmin = gpu_senquack.DrawingArea[0];     xmax = gpu_senquack.DrawingArea[2];
+       ymin = gpu_senquack.DrawingArea[1];     ymax = gpu_senquack.DrawingArea[3];
+       u0 = packet.U1[8];
+       v0 = packet.U1[9];
+
+       if (x0 > xmax - 16 || x0 < xmin ||
+           ((u0 | v0) & 15) || !(gpu_senquack.TextureWindow[2] & gpu_senquack.TextureWindow[3] & 8)) {
+               // send corner cases to general handler
+               packet.U4[3] = 0x00100010;
+               gpuDrawS(packet, gpuSpriteSpanFn<0x20>);
+               return;
+       }
+
+       // Clip vertically against the drawing area. Both edges are clamped
+       // independently, so a sprite that sticks out above AND below a drawing
+       // area shorter than 16 lines can no longer write past the bottom edge,
+       // and an empty/inverted drawing area draws nothing.
+       y1 = y0 + 16;
+       if (y0 < ymin) {
+               v0 += ymin - y0;   // skip the texture rows that were clipped off
+               y0 = ymin;
+       }
+       if (y1 > ymax)
+               y1 = ymax;
+       if (y1 <= y0)
+               return;
+
+       const u32 h = y1 - y0;     // number of rows left to draw (1..16)
+
+       // TBA is addressed in 16-bit units, 4 texels per unit, hence u0/4
+       draw_spr16_full(&gpu_senquack.vram[FRAME_OFFSET(x0, y0)], &gpu_senquack.TBA[FRAME_OFFSET(u0/4, v0)], gpu_senquack.CBA, h);
+}
+#endif // __arm__
+
+// Fills a flat-colored tile (screen-aligned rectangle), one scanline at a time.
+//
+//  packet:            command data. Read here: U4[0] = color (converted with
+//                     GPU_RGB16), S2[2..3] = X,Y position,
+//                     U2[4..5] = width,height.
+//  gpuTileSpanDriver: inner-loop function, invoked once per drawn scanline as
+//                     (dest pixel ptr, pixel count, 16-bit color).
+//
+// The rectangle is clipped to gpu_senquack.DrawingArea, with the right/bottom
+// edges treated as exclusive.
+void gpuDrawT(PtrUnion packet, const PT gpuTileSpanDriver)
+{
+       const s32 clip_x0 = gpu_senquack.DrawingArea[0];
+       const s32 clip_y0 = gpu_senquack.DrawingArea[1];
+       const s32 clip_x1 = gpu_senquack.DrawingArea[2];
+       const s32 clip_y1 = gpu_senquack.DrawingArea[3];
+
+       // The whole sum (packet X/Y + drawing offset) is 11-bit sign-extended.
+       // This now matches behavior of Mednafen and PCSX Rearmed's gpu_neon:
+       s32 left = GPU_EXPANDSIGN(packet.S2[2] + gpu_senquack.DrawingOffset[0]);
+       s32 top  = GPU_EXPANDSIGN(packet.S2[3] + gpu_senquack.DrawingOffset[1]);
+
+       const u32 width  = packet.U2[4] & 0x3ff; // Max width is 1023
+       const u32 height = packet.U2[5] & 0x1ff; // Max height is 511
+       s32 right  = left + width;
+       s32 bottom = top + height;
+
+       // Vertical clip
+       if (top < clip_y0) top = clip_y0;
+       if (bottom > clip_y1) bottom = clip_y1;
+       if (bottom <= top) return;
+
+       // Horizontal clip
+       if (left < clip_x0) left = clip_x0;
+       if (right > clip_x1) right = clip_x1;
+       const s32 span = right - left;
+       if (span <= 0) return;
+
+       const u16 color = GPU_RGB16(packet.U4[0]);
+
+       // Line-skip masks: a scanline is skipped when (y & li) is non-zero or
+       // when (y & pi) == pif
+       const bool prog_ilace = ProgressiveInterlaceEnabled();
+       const int li  = gpu_senquack.ilace_mask;
+       const int pi  = prog_ilace ? (gpu_senquack.ilace_mask + 1) : 0;
+       const int pif = prog_ilace
+                     ? (gpu_senquack.prog_ilace_flag ? (gpu_senquack.ilace_mask + 1) : 0)
+                     : 1;
+
+       u16 *dst = &((u16*)gpu_senquack.vram)[FRAME_OFFSET(left, top)];
+       for (s32 y = top; y < bottom; ++y, dst += FRAME_WIDTH) {
+               if ((y & li) || (y & pi) == pif)
+                       continue;
+               gpuTileSpanDriver(dst, span, color);
+       }
+}
+
+#endif /* __GPU_UNAI_GPU_RASTER_SPRITE_H__ */
diff --git a/plugins/gpu_senquack/gpu_senquack.h b/plugins/gpu_senquack/gpu_senquack.h

new file mode 100644 (file)

index 0000000..efbdd4c
--- /dev/null
+++ b/plugins/gpu_senquack/gpu_senquack.h
@@ -0,0 +1,316 @@
+/***************************************************************************
+*   Copyright (C) 2010 PCSX4ALL Team                                      *
+*   Copyright (C) 2010 Unai                                               *
+*   Copyright (C) 2016 Senquack (dansilsby <AT> gmail <DOT> com)          *
+*                                                                         *
+*   This program is free software; you can redistribute it and/or modify  *
+*   it under the terms of the GNU General Public License as published by  *
+*   the Free Software Foundation; either version 2 of the License, or     *
+*   (at your option) any later version.                                   *
+*                                                                         *
+*   This program is distributed in the hope that it will be useful,       *
+*   but WITHOUT ANY WARRANTY; without even the implied warranty of        *
+*   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the         *
+*   GNU General Public License for more details.                          *
+*                                                                         *
+*   You should have received a copy of the GNU General Public License     *
+*   along with this program; if not, write to the                         *
+*   Free Software Foundation, Inc.,                                       *
+*   51 Franklin Street, Fifth Floor, Boston, MA 02111-1307 USA.           *
+***************************************************************************/
+
+#ifndef GPU_UNAI_H
+#define GPU_UNAI_H
+
+#include "gpu.h"
+
+// Header shared between both standalone gpu_senquack (gpu.cpp) and new
+// gpulib-compatible gpu_senquack (gpulib_if.cpp)
+// -> Anything here should be for gpu_senquack's private use. <-
+
+///////////////////////////////////////////////////////////////////////////////
+//  Compile Options
+
+//#define ENABLE_GPU_NULL_SUPPORT   // Enables NullGPU support
+//#define ENABLE_GPU_LOG_SUPPORT    // Enables gpu logger, very slow only for windows debugging
+//#define ENABLE_GPU_ARMV7                     // Enables ARMv7 optimized assembly
+
+//Poly routine options (default is integer math and accurate division)
+//#define GPU_UNAI_USE_FLOATMATH         // Use float math in poly routines
+//#define GPU_UNAI_USE_FLOAT_DIV_MULTINV // If GPU_UNAI_USE_FLOATMATH is defined,
+                                         //  use multiply-by-inverse for division
+//#define GPU_UNAI_USE_INT_DIV_MULTINV   // If GPU_UNAI_USE_FLOATMATH is *not*
+                                         //  defined, use old inaccurate division
+
+
+#define GPU_INLINE static inline __attribute__((always_inline))
+#define INLINE     static inline __attribute__((always_inline))
+
+#define u8  uint8_t
+#define s8  int8_t
+#define u16 uint16_t
+#define s16 int16_t
+#define u32 uint32_t
+#define s32 int32_t
+#define s64 int64_t
+
+// One pointer, viewable as a pointer to 32/16/8-bit signed or unsigned
+// elements (or as an untyped pointer). All members alias the same address;
+// only the element size used for indexing differs, e.g. U2[6] and U1[12]
+// both start at byte offset 12.
+union PtrUnion
+{
+       u32  *U4;   // unsigned 32-bit elements
+       s32  *S4;   // signed 32-bit elements
+       u16  *U2;   // unsigned 16-bit elements
+       s16  *S2;   // signed 16-bit elements
+       u8   *U1;   // unsigned 8-bit elements
+       s8   *S1;   // signed 8-bit elements
+       void *ptr;  // untyped view of the same address
+};
+
+// 64 bytes of packet storage, viewable as 16 words, 32 halfwords or
+// 64 bytes, each either signed or unsigned. All members overlay the same
+// memory.
+union GPUPacket
+{
+       u32 U4[16];  // 16 x unsigned 32-bit
+       s32 S4[16];  // 16 x signed 32-bit
+       u16 U2[32];  // 32 x unsigned 16-bit
+       s16 S2[32];  // 32 x signed 16-bit
+       u8  U1[64];  // 64 x unsigned 8-bit
+       s8  S1[64];  // 64 x signed 8-bit
+};
+
+// Exchanges the values of x and y through a temporary copy.
+template<class T> static inline void SwapValues(T &x, T &y)
+{
+       const T saved(x);
+       x = y;
+       y = saved;
+}
+
+// Returns a when a < b, otherwise b.
+template<typename T>
+static inline T Min2 (const T a, const T b)
+{
+       if (a < b)
+               return a;
+       return b;
+}
+
+// Returns the smallest of a, b and c (pairwise via Min2).
+template<typename T>
+static inline T Min3 (const T a, const T b, const T c)
+{
+       const T smaller_ab = Min2(a, b);
+       return Min2(smaller_ab, c);
+}
+
+// Returns a when a > b, otherwise b.
+template<typename T>
+static inline T Max2 (const T a, const T b)
+{
+       if (a > b)
+               return a;
+       return b;
+}
+
+// Returns the largest of a, b and c (pairwise via Max2).
+template<typename T>
+static inline T Max3 (const T a, const T b, const T c)
+{
+       const T larger_ab = Max2(a, b);
+       return Max2(larger_ab, c);
+}
+
+
+///////////////////////////////////////////////////////////////////////////////
+//  GPU Raster Macros
+
+// Convert 24bpp color parameter of GPU command to 16bpp (15bpp + mask bit)
+#define        GPU_RGB16(rgb) ((((rgb)&0xF80000)>>9)|(((rgb)&0xF800)>>6)|(((rgb)&0xF8)>>3))
+
+// Sign-extend 11-bit coordinate command param: keeps the low 11 bits of x
+// and replicates bit 10 into the upper bits. The left shift is done on the
+// unsigned representation, because left-shifting a negative signed value
+// (any negative coordinate sum) is undefined behavior.
+#define GPU_EXPANDSIGN(x) ((s32)((u32)(x)<<(32-11))>>(32-11))
+
+// Max difference between any two X or Y primitive coordinates
+#define CHKMAX_X 1024
+#define CHKMAX_Y 512
+
+#define        FRAME_BUFFER_SIZE       (1024*512*2)
+#define        FRAME_WIDTH                       1024
+#define        FRAME_HEIGHT              512
+#define        FRAME_OFFSET(x,y)       (((y)<<10)+(x))
+#define FRAME_BYTE_STRIDE     2048
+#define FRAME_BYTES_PER_PIXEL 2
+
+// Integer division rs / rt that yields 0 instead of dividing by zero.
+static inline s32 GPU_DIV(s32 rs, s32 rt)
+{
+       if (rt == 0)
+               return 0;
+       return rs / rt;
+}
+
+// 'Unsafe' version of above that doesn't check for div-by-zero
+#define GPU_FAST_DIV(rs, rt) ((signed)(rs) / (signed)(rt))
+
+// Complete state of the renderer: packet buffer and VRAM pointer, draw
+// environment (texture window, drawing area/offset, texture/CLUT pointers),
+// parameters handed to the inner span loops, rendering options and lookup
+// tables. The members between '#ifndef USE_GPULIB' and '#endif' exist only
+// when USE_GPULIB is not defined.
+struct gpu_senquack_t {
+       u32 GPU_GP1;
+       GPUPacket PacketBuffer;
+       u16 *vram;             // Base of VRAM; indexed with FRAME_OFFSET(x,y)
+
+#ifndef USE_GPULIB
+       u32  GPU_GP0;
+       u32  tex_window;       // Current texture window vals (set by GP0(E2h) cmd)
+       s32  PacketCount;
+       s32  PacketIndex;
+       bool fb_dirty;         // Framebuffer is dirty (according to GPU)
+
+       //  Display status
+       //  NOTE: Standalone older gpu_senquack didn't care about horiz display range
+       u16  DisplayArea[6];   // [0] : Start of display area (in VRAM) X
+                              // [1] : Start of display area (in VRAM) Y
+                              // [2] : Display mode resolution HORIZONTAL
+                              // [3] : Display mode resolution VERTICAL
+                              // [4] : Vertical display range (on TV) START
+                              // [5] : Vertical display range (on TV) END
+
+       ////////////////////////////////////////////////////////////////////////////
+       //  Dma Transfers info
+       struct {
+               s32  px,py;
+               s32  x_end,y_end;
+               u16* pvram;
+               u32 *last_dma;     // Last dma pointer
+               bool FrameToRead;  // Load image in progress
+               bool FrameToWrite; // Store image in progress
+       } dma;
+
+       ////////////////////////////////////////////////////////////////////////////
+       //  Frameskip
+       struct {
+               int  skipCount;    // Frame skip (0,1,2,3...)
+               bool isSkip;       // Skip frame (according to GPU)
+               bool skipFrame;    // Skip this frame (according to frame skip)
+               bool wasSkip;      // Skip frame old value (according to GPU)
+               bool skipGPU;      // Skip GPU primitives
+       } frameskip;
+#endif
+       // END of standalone gpu_senquack variables
+       ////////////////////////////////////////////////////////////////////////////
+
+       u32 TextureWindowCur;  // Current setting from last GP0(0xE2) cmd (raw form)
+       u8  TextureWindow[4];  // [0] : Texture window offset X
+                              // [1] : Texture window offset Y
+                              // [2] : Texture window mask X
+                              // [3] : Texture window mask Y
+
+       u16 DrawingArea[4];    // [0] : Drawing area top left X
+                              // [1] : Drawing area top left Y
+                              // [2] : Drawing area bottom right X
+                              // [3] : Drawing area bottom right Y
+
+       s16 DrawingOffset[2];  // [0] : Drawing offset X (signed)
+                              // [1] : Drawing offset Y (signed)
+
+       u16* TBA;              // Ptr to current texture in VRAM
+       u16* CBA;              // Ptr to current CLUT in VRAM
+
+       ////////////////////////////////////////////////////////////////////////////
+       //  Inner Loop parameters
+
+       // 22.10 Fixed-pt texture coords, mask, scanline advance
+       // NOTE: U,V are no longer packed together into one u32, this proved to be
+       //  too imprecise, leading to pixel dropouts.  Example: NFS3's skybox.
+       u32 u, v;
+       u32 u_msk, v_msk;
+       s32 u_inc, v_inc;
+
+       // Color for Gouraud-shaded prims
+       // Packed fixed-pt 8.3:8.3:8.2 rgb triplet
+       //  layout:  rrrrrrrrXXXggggggggXXXbbbbbbbbXX
+       //           ^ bit 31                       ^ bit 0
+       u32 gCol;
+       u32 gInc;          // Increment along scanline for gCol
+
+       // Color for flat-shaded, texture-blended prims
+       u8  r5, g5, b5;    // 5-bit light for undithered prims
+       u8  r8, g8, b8;    // 8-bit light for dithered prims
+
+       // Color for flat-shaded, untextured prims
+       u16 PixelData;      // bgr555 color for untextured flat-shaded polys
+
+       // End of inner Loop parameters
+       ////////////////////////////////////////////////////////////////////////////
+
+
+       u8 blit_mask;           // Determines what pixels to skip when rendering.
+                               //  Only useful on low-resolution devices using
+                               //  a simple pixel-dropping downscaler for PS1
+                               //  high-res modes. See 'pixel_skip' option.
+
+       u8 ilace_mask;          // Determines what lines to skip when rendering.
+                               //  Normally 0 when PS1 240 vertical res is in
+                               //  use and ilace_force is 0. When running in
+                               //  PS1 480 vertical res on a low-resolution
+                               //  device (320x240), will usually be set to 1
+                               //  so odd lines are not rendered. (Unless future
+                               //  full-screen scaling option is in use ..TODO)
+
+       bool prog_ilace_flag;   // Tracks successive frames for 'prog_ilace' option
+
+       u8 BLEND_MODE;
+       u8 TEXT_MODE;           // gpuDrawS() reads (TEXT_MODE >> 5) and treats
+                               //  the value 3 as 16bpp texture mode
+       u8 Masking;
+
+       u16 PixelMSB;
+
+       gpu_senquack_config_t config;
+
+       u8  LightLUT[32*32];    // 5-bit lighting LUT (gpu_inner_light.h)
+       u32 DitherMatrix[64];   // Matrix of dither coefficients
+};
+
+// The renderer state instance. It is 'static' in a header, so every
+// translation unit that includes this file gets its own private copy.
+static gpu_senquack_t gpu_senquack;
+
+// Global config that frontend can alter.. Values are read in GPU_init().
+// TODO: if frontend menu modifies a setting, add a function that can notify
+// GPU plugin to use new setting.
+// NOTE(review): this is a non-static definition inside a header; including
+// the header from more than one translation unit of the same binary would
+// produce duplicate definitions -- confirm it is only ever included once.
+gpu_senquack_config_t gpu_senquack_config_ext;
+
+///////////////////////////////////////////////////////////////////////////////
+// Internal inline funcs to get option status: (Allows flexibility)
+// Returns the 'lighting' option from the active config.
+static inline bool LightingEnabled()
+{
+       return gpu_senquack.config.lighting;
+}
+
+// Returns the 'fast_lighting' option from the active config.
+static inline bool FastLightingEnabled()
+{
+       return gpu_senquack.config.fast_lighting;
+}
+
+// Returns the 'blending' option from the active config.
+static inline bool BlendingEnabled()
+{
+       return gpu_senquack.config.blending;
+}
+
+// Returns the 'dithering' option from the active config.
+static inline bool DitheringEnabled()
+{
+       return gpu_senquack.config.dithering;
+}
+
+// For now, this is just for development/experimentation purposes..
+// If modified to return true, it will allow ignoring the status register
+//  bit 9 setting (dither enable). It will still restrict dithering only
+//  to Gouraud-shaded or texture-blended polys.
+static inline bool ForcedDitheringEnabled()
+{
+       return false;
+}
+
+// Returns whether progressive interlace is active: always false when built
+// with USE_GPULIB, otherwise the 'prog_ilace' option from the active config.
+static inline bool ProgressiveInterlaceEnabled()
+{
+#ifdef USE_GPULIB
+       // Using this old option greatly decreases quality of image. Disabled
+       //  for now when using new gpulib, since it also adds more work in loops.
+       return false;
+#else
+       return gpu_senquack.config.prog_ilace;
+#endif
+}
+
+// For now, 320x240 output resolution is assumed, using simple line-skipping
+//  and pixel-skipping downscaler.
+// TODO: Flesh these out so they return useful values based on whether
+//       running on higher-res device or a resampling downscaler is enabled.
+// Pixel skipping is on when either 'pixel_skip' or 'scale_hires' is set.
+static inline bool PixelSkipEnabled()
+{
+       return gpu_senquack.config.pixel_skip || gpu_senquack.config.scale_hires;
+}
+
+// Line skipping is currently always reported as enabled.
+static inline bool LineSkipEnabled()
+{
+       return true;
+}
+
+#endif // GPU_UNAI_H
diff --git a/plugins/gpu_senquack/gpulib_if.cpp b/plugins/gpu_senquack/gpulib_if.cpp

new file mode 100644 (file)

index 0000000..c8452a3
--- /dev/null
+++ b/plugins/gpu_senquack/gpulib_if.cpp
@@ -0,0 +1,642 @@
+/***************************************************************************
+*   Copyright (C) 2010 PCSX4ALL Team                                      *
+*   Copyright (C) 2010 Unai                                               *
+*   Copyright (C) 2011 notaz                                              *
+*   Copyright (C) 2016 Senquack (dansilsby <AT> gmail <DOT> com)          *
+*                                                                         *
+*   This program is free software; you can redistribute it and/or modify  *
+*   it under the terms of the GNU General Public License as published by  *
+*   the Free Software Foundation; either version 2 of the License, or     *
+*   (at your option) any later version.                                   *
+*                                                                         *
+*   This program is distributed in the hope that it will be useful,       *
+*   but WITHOUT ANY WARRANTY; without even the implied warranty of        *
+*   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the         *
+*   GNU General Public License for more details.                          *
+*                                                                         *
+*   You should have received a copy of the GNU General Public License     *
+*   along with this program; if not, write to the                         *
+*   Free Software Foundation, Inc.,                                       *
+*   51 Franklin Street, Fifth Floor, Boston, MA 02111-1307 USA.           *
+***************************************************************************/
+
+#include <stddef.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include "../gpulib/gpu.h"
+
+//#include "port.h"
+#include "gpu_senquack.h"
+
+// GPU fixed point math
+#include "gpu_fixedpoint.h"
+
+// Inner loop driver instantiation file
+#include "gpu_inner.h"
+
+// GPU internal image drawing functions
+#include "gpu_raster_image.h"
+
+// GPU internal line drawing functions
+#include "gpu_raster_line.h"
+
+// GPU internal polygon drawing functions
+#include "gpu_raster_polygon.h"
+
+// GPU internal sprite drawing functions
+#include "gpu_raster_sprite.h"
+
+// GPU command buffer execution/store
+#include "gpu_command.h"
+
+/////////////////////////////////////////////////////////////////////////////
+
+int renderer_init(void)
+{
+  memset((void*)&gpu_senquack, 0, sizeof(gpu_senquack));
+  gpu_senquack.vram = (u16*)gpu.vram;
+
+  // Original standalone gpu_senquack initialized TextureWindow[]. I added the
+  //  same behavior here, since it seems unsafe to leave [2],[3] unset when
+  //  using HLE and Rearmed gpu_neon sets this similarly on init. -senquack
+  gpu_senquack.TextureWindow[0] = 0;
+  gpu_senquack.TextureWindow[1] = 0;
+  gpu_senquack.TextureWindow[2] = 255;
+  gpu_senquack.TextureWindow[3] = 255;
+  //senquack - new vars must be updated whenever texture window is changed:
+  //           (used for polygon-drawing in gpu_inner.h, gpu_raster_polygon.h)
+  const u32 fb = FIXED_BITS;  // # of fractional fixed-pt bits of u4/v4
+  gpu_senquack.u_msk = (((u32)gpu_senquack.TextureWindow[2]) << fb) | ((1 << fb) - 1);
+  gpu_senquack.v_msk = (((u32)gpu_senquack.TextureWindow[3]) << fb) | ((1 << fb) - 1);
+
+  // Configuration options
+  gpu_senquack.config = gpu_senquack_config_ext;
+  //senquack - disabled, not sure this is needed and would require modifying
+  // sprite-span functions, perhaps unnecessarily. No Abe Oddysey hack was
+  // present in latest PCSX4ALL sources we were using.
+  //gpu_senquack.config.enableAbbeyHack = gpu_senquack_config_ext.abe_hack;
+  gpu_senquack.ilace_mask = gpu_senquack.config.ilace_force;
+
+#ifdef GPU_UNAI_USE_INT_DIV_MULTINV
+  // s_invTable
+  for(int i=1;i<=(1<<TABLE_BITS);++i)
+  {
+    double v = 1.0 / double(i);
+#ifdef GPU_TABLE_10_BITS
+    v *= double(0xffffffff>>1);
+#else
+    v *= double(0x80000000);
+#endif
+    s_invTable[i-1]=s32(v);
+  }
+#endif
+
+  SetupLightLUT();
+  SetupDitheringConstants();
+
+  return 0;
+}
+
+// Renderer shutdown hook. The body is empty, so no teardown is performed
+// by this function.
+void renderer_finish(void)
+{
+}
+
+void renderer_notify_res_change(void)
+{
+  if (PixelSkipEnabled()) {
+    // Set blit_mask for high horizontal resolutions. This allows skipping
+    //  rendering pixels that would never get displayed on low-resolution
+    //  platforms that use simple pixel-dropping scaler.
+
+    switch (gpu.screen.hres)
+    {
+      case 512: gpu_senquack.blit_mask = 0xa4; break; // GPU_BlitWWSWWSWS
+      case 640: gpu_senquack.blit_mask = 0xaa; break; // GPU_BlitWS
+      default:  gpu_senquack.blit_mask = 0;    break;
+    }
+  } else {
+    gpu_senquack.blit_mask = 0;
+  }
+
+  if (LineSkipEnabled()) {
+    // Set rendering line-skip (only render every other line in high-res
+    //  480 vertical mode, or, optionally, force it for all video modes)
+
+    if (gpu.screen.vres == 480) {
+      if (gpu_senquack.config.ilace_force) {
+        gpu_senquack.ilace_mask = 3; // Only need 1/4 of lines
+      } else {
+        gpu_senquack.ilace_mask = 1; // Only need 1/2 of lines
+      }
+    } else {
+      // Vert resolution changed from 480 to lower one
+      gpu_senquack.ilace_mask = gpu_senquack.config.ilace_force;
+    }
+  } else {
+    gpu_senquack.ilace_mask = 0;
+  }
+
+  /*
+  printf("res change hres: %d   vres: %d   depth: %d   ilace_mask: %d\n",
+      gpu.screen.hres, gpu.screen.vres, gpu.status.rgb24 ? 24 : 15,
+      gpu_senquack.ilace_mask);
+  */
+}
+
+#ifdef USE_GPULIB
+// Handles GP0 draw settings commands 0xE1...0xE6
+static void gpuGP0Cmd_0xEx(gpu_senquack_t &gpu_senquack, u32 cmd_word)
+{
+  // Assume incoming GP0 command is 0xE1..0xE6, convert to 1..6
+  u8 num = (cmd_word >> 24) & 7;
+  gpu.ex_regs[num] = cmd_word; // Update gpulib register
+  switch (num) {
+    case 1: {
+      // GP0(E1h) - Draw Mode setting (aka "Texpage")
+      u32 cur_texpage = gpu_senquack.GPU_GP1 & 0x7FF;
+      u32 new_texpage = cmd_word & 0x7FF;
+      if (cur_texpage != new_texpage) {
+        gpu_senquack.GPU_GP1 = (gpu_senquack.GPU_GP1 & ~0x7FF) | new_texpage;
+        gpuSetTexture(gpu_senquack.GPU_GP1);
+      }
+    } break;
+
+    case 2: {
+      // GP0(E2h) - Texture Window setting
+      if (cmd_word != gpu_senquack.TextureWindowCur) {
+        static const u8 TextureMask[32] = {
+          255, 7, 15, 7, 31, 7, 15, 7, 63, 7, 15, 7, 31, 7, 15, 7,
+          127, 7, 15, 7, 31, 7, 15, 7, 63, 7, 15, 7, 31, 7, 15, 7
+        };
+        gpu_senquack.TextureWindowCur = cmd_word;
+        gpu_senquack.TextureWindow[0] = ((cmd_word >> 10) & 0x1F) << 3;
+        gpu_senquack.TextureWindow[1] = ((cmd_word >> 15) & 0x1F) << 3;
+        gpu_senquack.TextureWindow[2] = TextureMask[(cmd_word >> 0) & 0x1F];
+        gpu_senquack.TextureWindow[3] = TextureMask[(cmd_word >> 5) & 0x1F];
+        gpu_senquack.TextureWindow[0] &= ~gpu_senquack.TextureWindow[2];
+        gpu_senquack.TextureWindow[1] &= ~gpu_senquack.TextureWindow[3];
+
+        // Inner loop vars must be updated whenever texture window is changed:
+        const u32 fb = FIXED_BITS;  // # of fractional fixed-pt bits of u4/v4
+        gpu_senquack.u_msk = (((u32)gpu_senquack.TextureWindow[2]) << fb) | ((1 << fb) - 1);
+        gpu_senquack.v_msk = (((u32)gpu_senquack.TextureWindow[3]) << fb) | ((1 << fb) - 1);
+
+        gpuSetTexture(gpu_senquack.GPU_GP1);
+      }
+    } break;
+
+    case 3: {
+      // GP0(E3h) - Set Drawing Area top left (X1,Y1)
+      gpu_senquack.DrawingArea[0] = cmd_word         & 0x3FF;
+      gpu_senquack.DrawingArea[1] = (cmd_word >> 10) & 0x3FF;
+    } break;
+
+    case 4: {
+      // GP0(E4h) - Set Drawing Area bottom right (X2,Y2)
+      gpu_senquack.DrawingArea[2] = (cmd_word         & 0x3FF) + 1;
+      gpu_senquack.DrawingArea[3] = ((cmd_word >> 10) & 0x3FF) + 1;
+    } break;
+
+    case 5: {
+      // GP0(E5h) - Set Drawing Offset (X,Y)
+      gpu_senquack.DrawingOffset[0] = ((s32)cmd_word<<(32-11))>>(32-11);
+      gpu_senquack.DrawingOffset[1] = ((s32)cmd_word<<(32-22))>>(32-11);
+    } break;
+
+    case 6: {
+      // GP0(E6h) - Mask Bit Setting
+      gpu_senquack.Masking  = (cmd_word & 0x2) <<  1;
+      gpu_senquack.PixelMSB = (cmd_word & 0x1) <<  8;
+    } break;
+  }
+}
+#endif
+
+// Per-command length table, defined elsewhere. As used below, entry [cmd] is
+// the number of words that follow the first word of the packet.
+extern const unsigned char cmd_lengths[256];
+
+// Executes GP0 commands from a word list.
+//
+// Parameters:
+//   list     - first word of the command stream.
+//   list_len - number of words available at 'list'.
+//   last_cmd - out: the top byte of the last command looked at, or -1 when
+//              a command was found to be incomplete.
+//
+// Returns the number of words consumed (list - list_start at exit).
+//
+// For every command the packet is first copied into
+// gpu_senquack.PacketBuffer and then dispatched on its top byte. Line strips
+// (0x48..0x4F, 0x58..0x5F) consume a variable number of extra words until a
+// word matching (w & 0xf000f000) == 0x50005000 is seen. Unless TEST is
+// defined, commands 0xA0 and 0xC0 stop processing without being consumed.
+// On exit the low 9 bits of GPU_GP1 are written back into gpu.ex_regs[1].
+int do_cmd_list(u32 *list, int list_len, int *last_cmd)
+{
+  u32 cmd = 0, len, i;
+  u32 *list_start = list;
+  u32 *list_end = list + list_len;
+
+  //TODO: set ilace_mask when resolution changes instead of every time,
+  // eliminate #ifdef below.
+  gpu_senquack.ilace_mask = gpu_senquack.config.ilace_force;
+
+#ifdef HAVE_PRE_ARMV7 /* XXX */
+  gpu_senquack.ilace_mask |= gpu.status.interlace;
+#endif
+  if (gpu_senquack.config.scale_hires) {
+    gpu_senquack.ilace_mask |= gpu.status.interlace;
+  }
+
+  for (; list < list_end; list += 1 + len)
+  {
+    cmd = *list >> 24;
+    len = cmd_lengths[cmd];
+    // Stop if the whole packet (1 + len words) is not available yet.
+    if (list + 1 + len > list_end) {
+      cmd = -1;
+      break;
+    }
+
+    // PRIM is presumably consumed by the Blending/Lighting/Dithering style
+    // macros used in the driver-index expressions below (defined outside
+    // this file section) - TODO confirm.
+    #define PRIM cmd
+    // Work on a private copy of the packet; several handlers patch it.
+    gpu_senquack.PacketBuffer.U4[0] = list[0];
+    for (i = 1; i <= len; i++)
+      gpu_senquack.PacketBuffer.U4[i] = list[i];
+
+    PtrUnion packet = { .ptr = (void*)&gpu_senquack.PacketBuffer };
+
+    switch (cmd)
+    {
+      case 0x02:
+        gpuClearImage(packet);
+        break;
+
+      case 0x20:
+      case 0x21:
+      case 0x22:
+      case 0x23: {          // Monochrome 3-pt poly
+        PP driver = gpuPolySpanDrivers[
+          (gpu_senquack.blit_mask?1024:0) |
+          Blending_Mode |
+          gpu_senquack.Masking | Blending | gpu_senquack.PixelMSB
+        ];
+        gpuDrawPolyF(packet, driver, false);
+      } break;
+
+      case 0x24:
+      case 0x25:
+      case 0x26:
+      case 0x27: {          // Textured 3-pt poly
+        gpuSetCLUT   (gpu_senquack.PacketBuffer.U4[2] >> 16);
+        gpuSetTexture(gpu_senquack.PacketBuffer.U4[4] >> 16);
+
+        u32 driver_idx =
+          (gpu_senquack.blit_mask?1024:0) |
+          Dithering |
+          Blending_Mode | gpu_senquack.TEXT_MODE |
+          gpu_senquack.Masking | Blending | gpu_senquack.PixelMSB;
+
+        // With fast lighting, the Lighting bits are left out only when all
+        // three colour bytes of the packet are above 0x5F.
+        if (!FastLightingEnabled()) {
+          driver_idx |= Lighting;
+        } else {
+          if (!((gpu_senquack.PacketBuffer.U1[0]>0x5F) && (gpu_senquack.PacketBuffer.U1[1]>0x5F) && (gpu_senquack.PacketBuffer.U1[2]>0x5F)))
+            driver_idx |= Lighting;
+        }
+
+        PP driver = gpuPolySpanDrivers[driver_idx];
+        gpuDrawPolyFT(packet, driver, false);
+      } break;
+
+      case 0x28:
+      case 0x29:
+      case 0x2A:
+      case 0x2B: {          // Monochrome 4-pt poly
+        PP driver = gpuPolySpanDrivers[
+          (gpu_senquack.blit_mask?1024:0) |
+          Blending_Mode |
+          gpu_senquack.Masking | Blending | gpu_senquack.PixelMSB
+        ];
+        gpuDrawPolyF(packet, driver, true); // is_quad = true
+      } break;
+
+      case 0x2C:
+      case 0x2D:
+      case 0x2E:
+      case 0x2F: {          // Textured 4-pt poly
+        gpuSetCLUT   (gpu_senquack.PacketBuffer.U4[2] >> 16);
+        gpuSetTexture(gpu_senquack.PacketBuffer.U4[4] >> 16);
+
+        u32 driver_idx =
+          (gpu_senquack.blit_mask?1024:0) |
+          Dithering |
+          Blending_Mode | gpu_senquack.TEXT_MODE |
+          gpu_senquack.Masking | Blending | gpu_senquack.PixelMSB;
+
+        // Same fast-lighting rule as the textured 3-pt poly above.
+        if (!FastLightingEnabled()) {
+          driver_idx |= Lighting;
+        } else {
+          if (!((gpu_senquack.PacketBuffer.U1[0]>0x5F) && (gpu_senquack.PacketBuffer.U1[1]>0x5F) && (gpu_senquack.PacketBuffer.U1[2]>0x5F)))
+            driver_idx |= Lighting;
+        }
+
+        PP driver = gpuPolySpanDrivers[driver_idx];
+        gpuDrawPolyFT(packet, driver, true); // is_quad = true
+      } break;
+
+      case 0x30:
+      case 0x31:
+      case 0x32:
+      case 0x33: {          // Gouraud-shaded 3-pt poly
+        //NOTE: The '129' here is CF_GOURAUD | CF_LIGHT, however
+        // this is an untextured poly, so CF_LIGHT (texture blend)
+        // shouldn't apply. Until the original array of template
+        // instantiation ptrs is fixed, we're stuck with this. (TODO)
+        PP driver = gpuPolySpanDrivers[
+          (gpu_senquack.blit_mask?1024:0) |
+          Dithering |
+          Blending_Mode |
+          gpu_senquack.Masking | Blending | 129 | gpu_senquack.PixelMSB
+        ];
+        gpuDrawPolyG(packet, driver, false);
+      } break;
+
+      case 0x34:
+      case 0x35:
+      case 0x36:
+      case 0x37: {          // Gouraud-shaded, textured 3-pt poly
+        gpuSetCLUT    (gpu_senquack.PacketBuffer.U4[2] >> 16);
+        gpuSetTexture (gpu_senquack.PacketBuffer.U4[5] >> 16);
+        PP driver = gpuPolySpanDrivers[
+          (gpu_senquack.blit_mask?1024:0) |
+          Dithering |
+          Blending_Mode | gpu_senquack.TEXT_MODE |
+          gpu_senquack.Masking | Blending | ((Lighting)?129:0) | gpu_senquack.PixelMSB
+        ];
+        gpuDrawPolyGT(packet, driver, false);
+      } break;
+
+      case 0x38:
+      case 0x39:
+      case 0x3A:
+      case 0x3B: {          // Gouraud-shaded 4-pt poly
+        // See notes regarding '129' for 0x30..0x33 further above -senquack
+        PP driver = gpuPolySpanDrivers[
+          (gpu_senquack.blit_mask?1024:0) |
+          Dithering |
+          Blending_Mode |
+          gpu_senquack.Masking | Blending | 129 | gpu_senquack.PixelMSB
+        ];
+        gpuDrawPolyG(packet, driver, true); // is_quad = true
+      } break;
+
+      case 0x3C:
+      case 0x3D:
+      case 0x3E:
+      case 0x3F: {          // Gouraud-shaded, textured 4-pt poly
+        gpuSetCLUT    (gpu_senquack.PacketBuffer.U4[2] >> 16);
+        gpuSetTexture (gpu_senquack.PacketBuffer.U4[5] >> 16);
+        PP driver = gpuPolySpanDrivers[
+          (gpu_senquack.blit_mask?1024:0) |
+          Dithering |
+          Blending_Mode | gpu_senquack.TEXT_MODE |
+          gpu_senquack.Masking | Blending | ((Lighting)?129:0) | gpu_senquack.PixelMSB
+        ];
+        gpuDrawPolyGT(packet, driver, true); // is_quad = true
+      } break;
+
+      case 0x40:
+      case 0x41:
+      case 0x42:
+      case 0x43: {          // Monochrome line
+        // Shift index right by one, as untextured prims don't use lighting
+        u32 driver_idx = (Blending_Mode | gpu_senquack.Masking | Blending | (gpu_senquack.PixelMSB>>3)) >> 1;
+        PSD driver = gpuPixelSpanDrivers[driver_idx];
+        gpuDrawLineF(packet, driver);
+      } break;
+
+      case 0x48 ... 0x4F: { // Monochrome line strip
+        u32 num_vertexes = 1;
+        u32 *list_position = &(list[2]);
+
+        // Shift index right by one, as untextured prims don't use lighting
+        u32 driver_idx = (Blending_Mode | gpu_senquack.Masking | Blending | (gpu_senquack.PixelMSB>>3)) >> 1;
+        PSD driver = gpuPixelSpanDrivers[driver_idx];
+        gpuDrawLineF(packet, driver);
+
+        // Each pass shifts the previous end vertex into slot 1, loads one
+        // more word into slot 2 and draws that segment, until the
+        // terminator pattern is found or the list runs out.
+        while(1)
+        {
+          gpu_senquack.PacketBuffer.U4[1] = gpu_senquack.PacketBuffer.U4[2];
+          gpu_senquack.PacketBuffer.U4[2] = *list_position++;
+          gpuDrawLineF(packet, driver);
+
+          num_vertexes++;
+          if(list_position >= list_end) {
+            cmd = -1;
+            goto breakloop;
+          }
+          if((*list_position & 0xf000f000) == 0x50005000)
+            break;
+        }
+
+        // Extend the packet length by the extra vertex words consumed.
+        len += (num_vertexes - 2);
+      } break;
+
+      case 0x50:
+      case 0x51:
+      case 0x52:
+      case 0x53: {          // Gouraud-shaded line
+        // Shift index right by one, as untextured prims don't use lighting
+        u32 driver_idx = (Blending_Mode | gpu_senquack.Masking | Blending | (gpu_senquack.PixelMSB>>3)) >> 1;
+        // Index MSB selects Gouraud-shaded PixelSpanDriver:
+        driver_idx |= (1 << 5);
+        PSD driver = gpuPixelSpanDrivers[driver_idx];
+        gpuDrawLineG(packet, driver);
+      } break;
+
+      case 0x58 ... 0x5F: { // Gouraud-shaded line strip
+        u32 num_vertexes = 1;
+        u32 *list_position = &(list[2]);
+
+        // Shift index right by one, as untextured prims don't use lighting
+        u32 driver_idx = (Blending_Mode | gpu_senquack.Masking | Blending | (gpu_senquack.PixelMSB>>3)) >> 1;
+        // Index MSB selects Gouraud-shaded PixelSpanDriver:
+        driver_idx |= (1 << 5);
+        PSD driver = gpuPixelSpanDrivers[driver_idx];
+        gpuDrawLineG(packet, driver);
+
+        // Each pass moves slots 2/3 down to 0/1 and loads two more words.
+        // NOTE(review): two words are fetched per pass but list_end is only
+        // checked once afterwards; if exactly one non-terminator word is
+        // left, the second fetch reads list_end[0]. Confirm the caller
+        // guarantees slack after the buffer.
+        while(1)
+        {
+          gpu_senquack.PacketBuffer.U4[0] = gpu_senquack.PacketBuffer.U4[2];
+          gpu_senquack.PacketBuffer.U4[1] = gpu_senquack.PacketBuffer.U4[3];
+          gpu_senquack.PacketBuffer.U4[2] = *list_position++;
+          gpu_senquack.PacketBuffer.U4[3] = *list_position++;
+          gpuDrawLineG(packet, driver);
+
+          num_vertexes++;
+          if(list_position >= list_end) {
+            cmd = -1;
+            goto breakloop;
+          }
+          if((*list_position & 0xf000f000) == 0x50005000)
+            break;
+        }
+
+        // Two words (colour + vertex) per extra vertex consumed.
+        len += (num_vertexes - 2) * 2;
+      } break;
+
+      case 0x60:
+      case 0x61:
+      case 0x62:
+      case 0x63: {          // Monochrome rectangle (variable size)
+        PT driver = gpuTileSpanDrivers[(Blending_Mode | gpu_senquack.Masking | Blending | (gpu_senquack.PixelMSB>>3)) >> 1];
+        gpuDrawT(packet, driver);
+      } break;
+
+      case 0x64:
+      case 0x65:
+      case 0x66:
+      case 0x67: {          // Textured rectangle (variable size)
+        gpuSetCLUT    (gpu_senquack.PacketBuffer.U4[2] >> 16);
+        u32 driver_idx = Blending_Mode | gpu_senquack.TEXT_MODE | gpu_senquack.Masking | Blending | (gpu_senquack.PixelMSB>>1);
+
+        //senquack - Only color 808080h-878787h allows skipping lighting calculation:
+        // This fixes Silent Hill running animation on loading screens:
+        // (On PSX, color values 0x00-0x7F darken the source texture's color,
+        //  0x81-FF lighten textures (ultimately clamped to 0x1F),
+        //  0x80 leaves source texture color unchanged, HOWEVER,
+        //   gpu_senquack uses a simple lighting LUT whereby only the upper
+        //   5 bits of an 8-bit color are used, so 0x80-0x87 all behave as
+        //   0x80.)
+        //
+        // NOTE: I've changed all textured sprite draw commands here and
+        //  elsewhere to use proper behavior, but left poly commands
+        //  alone, I don't want to slow rendering down too much. (TODO)
+        //if ((gpu_senquack.PacketBuffer.U1[0]>0x5F) && (gpu_senquack.PacketBuffer.U1[1]>0x5F) && (gpu_senquack.PacketBuffer.U1[2]>0x5F))
+        // Strip lower 3 bits of each color and determine if lighting should be used:
+        if ((gpu_senquack.PacketBuffer.U4[0] & 0xF8F8F8) != 0x808080)
+          driver_idx |= Lighting;
+        PS driver = gpuSpriteSpanDrivers[driver_idx];
+        gpuDrawS(packet, driver);
+      } break;
+
+      case 0x68:
+      case 0x69:
+      case 0x6A:
+      case 0x6B: {          // Monochrome rectangle (1x1 dot)
+        // Patch the size word so the generic tile path can be reused.
+        gpu_senquack.PacketBuffer.U4[2] = 0x00010001;
+        PT driver = gpuTileSpanDrivers[(Blending_Mode | gpu_senquack.Masking | Blending | (gpu_senquack.PixelMSB>>3)) >> 1];
+        gpuDrawT(packet, driver);
+      } break;
+
+      case 0x70:
+      case 0x71:
+      case 0x72:
+      case 0x73: {          // Monochrome rectangle (8x8)
+        gpu_senquack.PacketBuffer.U4[2] = 0x00080008;
+        PT driver = gpuTileSpanDrivers[(Blending_Mode | gpu_senquack.Masking | Blending | (gpu_senquack.PixelMSB>>3)) >> 1];
+        gpuDrawT(packet, driver);
+      } break;
+
+      case 0x74:
+      case 0x75:
+      case 0x76:
+      case 0x77: {          // Textured rectangle (8x8)
+        gpu_senquack.PacketBuffer.U4[3] = 0x00080008;
+        gpuSetCLUT    (gpu_senquack.PacketBuffer.U4[2] >> 16);
+        u32 driver_idx = Blending_Mode | gpu_senquack.TEXT_MODE | gpu_senquack.Masking | Blending | (gpu_senquack.PixelMSB>>1);
+
+        //senquack - Only color 808080h-878787h allows skipping lighting calculation:
+        //if ((gpu_senquack.PacketBuffer.U1[0]>0x5F) && (gpu_senquack.PacketBuffer.U1[1]>0x5F) && (gpu_senquack.PacketBuffer.U1[2]>0x5F))
+        // Strip lower 3 bits of each color and determine if lighting should be used:
+        if ((gpu_senquack.PacketBuffer.U4[0] & 0xF8F8F8) != 0x808080)
+          driver_idx |= Lighting;
+        PS driver = gpuSpriteSpanDrivers[driver_idx];
+        gpuDrawS(packet, driver);
+      } break;
+
+      case 0x78:
+      case 0x79:
+      case 0x7A:
+      case 0x7B: {          // Monochrome rectangle (16x16)
+        gpu_senquack.PacketBuffer.U4[2] = 0x00100010;
+        PT driver = gpuTileSpanDrivers[(Blending_Mode | gpu_senquack.Masking | Blending | (gpu_senquack.PixelMSB>>3)) >> 1];
+        gpuDrawT(packet, driver);
+      } break;
+
+      case 0x7C:
+      case 0x7D:
+#ifdef __arm__
+        // ARM-only shortcut: taken when GPU_GP1 bits 7..8 are clear and
+        // neither Masking nor PixelMSB is set; otherwise falls through to
+        // the generic 16x16 path below.
+        if ((gpu_senquack.GPU_GP1 & 0x180) == 0 && (gpu_senquack.Masking | gpu_senquack.PixelMSB) == 0)
+        {
+          gpuSetCLUT    (gpu_senquack.PacketBuffer.U4[2] >> 16);
+          gpuDrawS16(packet);
+          break;
+        }
+        // fallthrough
+#endif
+      case 0x7E:
+      case 0x7F: {          // Textured rectangle (16x16)
+        gpu_senquack.PacketBuffer.U4[3] = 0x00100010;
+        gpuSetCLUT    (gpu_senquack.PacketBuffer.U4[2] >> 16);
+        u32 driver_idx = Blending_Mode | gpu_senquack.TEXT_MODE | gpu_senquack.Masking | Blending | (gpu_senquack.PixelMSB>>1);
+        //senquack - Only color 808080h-878787h allows skipping lighting calculation:
+        //if ((gpu_senquack.PacketBuffer.U1[0]>0x5F) && (gpu_senquack.PacketBuffer.U1[1]>0x5F) && (gpu_senquack.PacketBuffer.U1[2]>0x5F))
+        // Strip lower 3 bits of each color and determine if lighting should be used:
+        if ((gpu_senquack.PacketBuffer.U4[0] & 0xF8F8F8) != 0x808080)
+          driver_idx |= Lighting;
+        PS driver = gpuSpriteSpanDrivers[driver_idx];
+        gpuDrawS(packet, driver);
+      } break;
+
+      case 0x80:          //  vid -> vid
+        gpuMoveImage(packet);
+        break;
+
+#ifdef TEST
+      case 0xA0:          //  sys -> vid
+      {
+        // TEST builds only skip over the image payload (two pixels per word).
+        u32 load_width = list[2] & 0xffff;
+        u32 load_height = list[2] >> 16;
+        u32 load_size = load_width * load_height;
+
+        len += load_size / 2;
+      } break;
+
+      case 0xC0:
+        break;
+#else
+      case 0xA0:          //  sys ->vid
+      case 0xC0:          //  vid -> sys
+        // Handled by gpulib
+        goto breakloop;
+#endif
+      case 0xE1 ... 0xE6: { // Draw settings
+        gpuGP0Cmd_0xEx(gpu_senquack, gpu_senquack.PacketBuffer.U4[0]);
+      } break;
+    }
+  }
+
+breakloop:
+  // Publish the low 9 bits of the renderer's GPU_GP1 into gpulib's E1 register.
+  gpu.ex_regs[1] &= ~0x1ff;
+  gpu.ex_regs[1] |= gpu_senquack.GPU_GP1 & 0x1ff;
+
+  *last_cmd = cmd;
+  return list - list_start;
+}
+
+// Replays the six words ecmds[1]..ecmds[6] through do_cmd_list(); both the
+// reported last command and the consumed-word count are discarded.
+void renderer_sync_ecmds(uint32_t *ecmds)
+{
+  int last_cmd_unused;
+  (void)do_cmd_list(ecmds + 1, 6, &last_cmd_unused);
+}
+
+// No-op in this renderer: all four arguments are ignored.
+void renderer_update_caches(int x, int y, int w, int h)
+{
+}
+
+// No-op in this renderer: the body is empty.
+void renderer_flush_queues(void)
+{
+}
+
+// No-op in this renderer: 'enable' and 'is_odd' are ignored. The interlace
+// mask is instead derived in do_cmd_list() and renderer_notify_res_change().
+void renderer_set_interlace(int enable, int is_odd)
+{
+}
+
+#include "../../frontend/plugin_lib.h"
+// Pulls the gpu_senquack-specific settings out of the frontend callback
+// struct into the live renderer config, and re-attaches the renderer to
+// gpulib's VRAM pointer.
+void renderer_set_config(const struct rearmed_cbs *cbs)
+{
+  // Shading / colour options
+  gpu_senquack.config.dithering     = cbs->gpu_senquack.dithering;
+  gpu_senquack.config.blending      = cbs->gpu_senquack.blending;
+  gpu_senquack.config.fast_lighting = cbs->gpu_senquack.fast_lighting;
+  gpu_senquack.config.lighting      = cbs->gpu_senquack.lighting;
+
+  // Resolution / skipping options
+  gpu_senquack.config.scale_hires   = cbs->gpu_senquack.scale_hires;
+  gpu_senquack.config.pixel_skip    = cbs->gpu_senquack.pixel_skip;
+  gpu_senquack.config.ilace_force   = cbs->gpu_senquack.ilace_force;
+
+  gpu_senquack.vram = (u16*)gpu.vram;
+}
+
+// vim:shiftwidth=2:expandtab
diff --git a/plugins/gpu_senquack/port.h b/plugins/gpu_senquack/port.h

new file mode 100644 (file)

index 0000000..0a731f8
--- /dev/null
+++ b/plugins/gpu_senquack/port.h
@@ -0,0 +1,41 @@
+#ifndef __GPU_UNAI_GPU_PORT_H__
+#define __GPU_UNAI_GPU_PORT_H__
+
+#include <stddef.h>
+#include <string.h>
+
+#define INLINE static inline
+
+// Map the GPU_* names onto the GPU* (no underscore) entry-point names
+// declared below.
+#define GPU_init       GPUinit
+#define GPU_shutdown   GPUshutdown
+//#define GPU_freeze   GPUfreeze
+#define GPU_writeDataMem GPUwriteDataMem
+#define GPU_dmaChain   GPUdmaChain
+#define GPU_writeData  GPUwriteData
+#define GPU_readDataMem        GPUreadDataMem
+#define GPU_readData   GPUreadData
+#define GPU_readStatus GPUreadStatus
+#define GPU_writeStatus        GPUwriteStatus
+// NOTE(review): GPU_updateLace is aliased here, but no GPUupdateLace
+// prototype is declared in the extern "C" block below.
+#define GPU_updateLace GPUupdateLace
+
+// Entry points are declared with C linkage.
+extern "C" {
+
+// u32/s32 are defined as macros only for the duration of these prototypes
+// and are removed again with #undef further down.
+#define u32 unsigned int
+#define s32 signed int
+
+bool GPUinit(void);
+void GPUshutdown(void);
+void GPUwriteDataMem(u32* dmaAddress, s32 dmaCount);
+long GPUdmaChain(u32* baseAddr, u32 dmaVAddr);
+void GPUwriteData(u32 data);
+void GPUreadDataMem(u32* dmaAddress, s32 dmaCount);
+u32  GPUreadData(void);
+u32  GPUreadStatus(void);
+void GPUwriteStatus(u32 data);
+
+#undef u32
+#undef s32
+
+}
+
+#endif /* __GPU_UNAI_GPU_PORT_H__ */
diff --git a/plugins/gpu_senquack/profiler.h b/plugins/gpu_senquack/profiler.h

new file mode 100644 (file)

index 0000000..a23ee38
--- /dev/null
+++ b/plugins/gpu_senquack/profiler.h
@@ -0,0 +1,9 @@
+#ifndef __GPU_UNAI_GPU_PROFILER_H__
+#define __GPU_UNAI_GPU_PROFILER_H__
+
+// Profiler hooks stubbed out: each macro accepts any arguments and expands
+// to nothing.
+#define pcsx4all_prof_pause(...)
+#define pcsx4all_prof_start_with_pause(...)
+#define pcsx4all_prof_end_with_resume(...)
+#define pcsx4all_prof_resume(...)
+
+#endif /* __GPU_UNAI_GPU_PROFILER_H__ */
author	gameblabla <gameblabla@users.noreply.github.com>
	Fri, 29 Oct 2021 20:03:27 +0000 (20:03 +0000)
committer	GitHub <noreply@github.com>
	Fri, 29 Oct 2021 20:03:27 +0000 (23:03 +0300)
Makefile		patch \| blob \| blame \| history
configure		patch \| blob \| blame \| history
frontend/main.c		patch \| blob \| blame \| history
frontend/menu.c		patch \| blob \| blame \| history
frontend/plugin_lib.h		patch \| blob \| blame \| history
plugins/gpu_senquack/Makefile	[new file with mode: 0644]	patch \| blob
plugins/gpu_senquack/README_senquack.txt	[new file with mode: 0644]	patch \| blob
plugins/gpu_senquack/debug.h	[new file with mode: 0644]	patch \| blob
plugins/gpu_senquack/gpu.cpp	[new file with mode: 0644]	patch \| blob
plugins/gpu_senquack/gpu.h	[new file with mode: 0644]	patch \| blob
plugins/gpu_senquack/gpu_arm.S	[new file with mode: 0644]	patch \| blob
plugins/gpu_senquack/gpu_arm.h	[new file with mode: 0644]	patch \| blob
plugins/gpu_senquack/gpu_blit.h	[new file with mode: 0644]	patch \| blob
plugins/gpu_senquack/gpu_command.h	[new file with mode: 0644]	patch \| blob
plugins/gpu_senquack/gpu_fixedpoint.h	[new file with mode: 0644]	patch \| blob
plugins/gpu_senquack/gpu_inner.h	[new file with mode: 0644]	patch \| blob
plugins/gpu_senquack/gpu_inner_blend.h	[new file with mode: 0644]	patch \| blob
plugins/gpu_senquack/gpu_inner_blend_arm.h	[new file with mode: 0644]	patch \| blob
plugins/gpu_senquack/gpu_inner_blend_arm5.h	[new file with mode: 0644]	patch \| blob
plugins/gpu_senquack/gpu_inner_blend_arm7.h	[new file with mode: 0644]	patch \| blob
plugins/gpu_senquack/gpu_inner_light.h	[new file with mode: 0644]	patch \| blob
plugins/gpu_senquack/gpu_inner_light_arm.h	[new file with mode: 0644]	patch \| blob
plugins/gpu_senquack/gpu_inner_quantization.h	[new file with mode: 0644]	patch \| blob
plugins/gpu_senquack/gpu_raster_image.h	[new file with mode: 0644]	patch \| blob
plugins/gpu_senquack/gpu_raster_line.h	[new file with mode: 0644]	patch \| blob
plugins/gpu_senquack/gpu_raster_polygon.h	[new file with mode: 0644]	patch \| blob
plugins/gpu_senquack/gpu_raster_sprite.h	[new file with mode: 0644]	patch \| blob
plugins/gpu_senquack/gpu_senquack.h	[new file with mode: 0644]	patch \| blob
plugins/gpu_senquack/gpulib_if.cpp	[new file with mode: 0644]	patch \| blob
plugins/gpu_senquack/port.h	[new file with mode: 0644]	patch \| blob
plugins/gpu_senquack/profiler.h	[new file with mode: 0644]	patch \| blob