#include "gpu_inner_quantization.h"
#include "gpu_inner_light.h"
+#include "arm_features.h"
+#include "compiler_features.h"
#ifdef __arm__
+#include "gpu_arm.h"
#include "gpu_inner_blend_arm.h"
#include "gpu_inner_light_arm.h"
#define gpuBlending gpuBlendingARM
-#define gpuLightingRGB gpuLightingRGBARM
#define gpuLightingTXT gpuLightingTXTARM
-#define gpuLightingTXTGouraud gpuLightingTXTGouraudARM
-// Non-dithering lighting and blending functions preserve uSrc
-// MSB. This saves a few operations and useless load/stores.
-#define MSB_PRESERVED (!CF_DITHER)
#else
#define gpuBlending gpuBlendingGeneric
-#define gpuLightingRGB gpuLightingRGBGeneric
#define gpuLightingTXT gpuLightingTXTGeneric
-#define gpuLightingTXTGouraud gpuLightingTXTGouraudGeneric
-#define MSB_PRESERVED 0
#endif
+// Non-dithering lighting and blending functions preserve uSrc
+// MSB. This saves a few operations and useless load/stores.
+#define MSB_PRESERVED (!CF_DITHER)
// If defined, Gouraud colors are fixed-point 5.11, otherwise they are 8.16
// This is only for debugging/verification of low-precision colors in C.
// GPU Tiles innerloops generator
template<int CF>
-static void gpuTileSpanFn(le16_t *pDst, u32 count, u16 data)
+static inline void gpuTileSpanFn(le16_t *pDst, u16 data, u32 count)
{
le16_t ldata;
}
}
-static void TileNULL(le16_t *pDst, u32 count, u16 data)
+// Tile driver: renders a screen-aligned rectangle of 'count' pixels per line,
+// one scanline at a time, by invoking the per-span renderer gpuTileSpanFn<CF>.
+// Lines masked out by (progressive) interlacing are skipped but pDst still
+// advances by FRAME_WIDTH so the destination stays aligned with y0.
+template<int CF>
+static noinline void gpuTileDriverFn(le16_t *pDst, u16 data, u32 count,
+ const gpu_unai_inner_t &inn)
+{
+ // li: interlace line mask (0 = render every line).
+ // pi/pif: progressive-interlace field selection; when progressive interlace
+ // is disabled, pi=0 and pif=1 so the (y0&pi)!=pif test is always true.
+ // NOTE(review): reads gpu_unai.inn.* here but the inn parameter below —
+ // assumes callers always pass inn == gpu_unai.inn; confirm.
+ const int li=gpu_unai.inn.ilace_mask;
+ const int pi=(ProgressiveInterlaceEnabled()?(gpu_unai.inn.ilace_mask+1):0);
+ const int pif=(ProgressiveInterlaceEnabled()?(gpu_unai.prog_ilace_flag?(gpu_unai.inn.ilace_mask+1):0):1);
+ const int y1 = inn.y1;
+ int y0 = inn.y0;
+
+ for (; y0 < y1; ++y0) {
+ if (!(y0&li) && (y0&pi) != pif)
+ gpuTileSpanFn<CF>(pDst, data, count);
+ pDst += FRAME_WIDTH;
+ }
+}
+
+#ifdef __arm__
+
+// Dispatch tile rendering to hand-written ARM assembly for the hot
+// blend/semi-transparency variants (CF values listed below); any other CF
+// falls through to the generic C++ template driver. The 0x12 (st2) variant
+// needs ARMv6 instructions, so it is only dispatched when HAVE_ARMV6.
+// CF is a compile-time constant, so the switch collapses at instantiation.
+template<int CF>
+static void TileAsm(le16_t *pDst, u16 data, u32 count, const gpu_unai_inner_t &inn)
+{
+ switch (CF) {
+ case 0x02: tile_driver_st0_asm(pDst, data, count, &inn); return;
+ case 0x0a: tile_driver_st1_asm(pDst, data, count, &inn); return;
+ case 0x1a: tile_driver_st3_asm(pDst, data, count, &inn); return;
+#ifdef HAVE_ARMV6
+ case 0x12: tile_driver_st2_asm(pDst, data, count, &inn); return;
+#endif
+ }
+ // Fallback: generic templated C driver for CF combos without asm.
+ gpuTileDriverFn<CF>(pDst, data, count, inn);
+}
+
+#endif
+
+static void TileNULL(le16_t *pDst, u16 data, u32 count, const gpu_unai_inner_t &inn)
{
#ifdef ENABLE_GPU_LOG_SUPPORT
fprintf(stdout,"TileNULL()\n");
///////////////////////////////////////////////////////////////////////////////
// Tiles innerloops driver
-typedef void (*PT)(le16_t *pDst, u32 count, u16 data);
+typedef void (*PT)(le16_t *pDst, u16 data, u32 count, const gpu_unai_inner_t &inn);
// Template instantiation helper macros
-#define TI(cf) gpuTileSpanFn<(cf)>
+#define TI(cf) gpuTileDriverFn<(cf)>
#define TN TileNULL
+#ifdef __arm__
+#define TA(cf) TileAsm<(cf)>
+#else
+#define TA(cf) TI(cf)
+#endif
+#ifdef HAVE_ARMV6
+#define TA6(cf) TileAsm<(cf)>
+#else
+#define TA6(cf) TI(cf)
+#endif
#define TIBLOCK(ub) \
- TI((ub)|0x00), TI((ub)|0x02), TI((ub)|0x04), TI((ub)|0x06), \
- TN, TI((ub)|0x0a), TN, TI((ub)|0x0e), \
- TN, TI((ub)|0x12), TN, TI((ub)|0x16), \
- TN, TI((ub)|0x1a), TN, TI((ub)|0x1e)
+ TI((ub)|0x00), TA6((ub)|0x02), TI((ub)|0x04), TI((ub)|0x06), \
+ TN, TA ((ub)|0x0a), TN, TI((ub)|0x0e), \
+ TN, TA6((ub)|0x12), TN, TI((ub)|0x16), \
+ TN, TA ((ub)|0x1a), TN, TI((ub)|0x1e)
-const PT gpuTileSpanDrivers[32] = {
+const PT gpuTileDrivers[32] = {
TIBLOCK(0<<8), TIBLOCK(1<<8)
};
#undef TI
#undef TN
+#undef TA
+#undef TA6
#undef TIBLOCK
///////////////////////////////////////////////////////////////////////////////
// GPU Sprites innerloops generator
+typedef void (*PS)(le16_t *pPixel, u32 count, const u8 *pTxt,
+ const gpu_unai_inner_t &inn);
+
template<int CF>
-static void gpuSpriteSpanFn(le16_t *pDst, u32 count, u8* pTxt, u32 u0)
+static noinline void gpuSpriteDriverFn(le16_t *pPixel, u32 count, const u8 *pTxt_base,
+ const gpu_unai_inner_t &inn)
{
// Blend func can save an operation if it knows uSrc MSB is unset.
// Untextured prims can always skip (source color always comes with MSB=0).
uint_fast16_t uSrc, uDst, srcMSB;
bool should_blend;
- u32 u0_mask = gpu_unai.TextureWindow[2];
+ u32 u0_mask = inn.u_msk >> 10;
u8 r5, g5, b5;
if (CF_LIGHT) {
- r5 = gpu_unai.r5;
- g5 = gpu_unai.g5;
- b5 = gpu_unai.b5;
+ r5 = inn.r5;
+ g5 = inn.g5;
+ b5 = inn.b5;
}
+ const le16_t *CBA_; if (CF_TEXTMODE!=3) CBA_ = inn.CBA;
+ const u32 v0_mask = inn.v_msk >> 10;
+ s32 y0 = inn.y0, y1 = inn.y1, li = inn.ilace_mask;
+ u32 u0_ = inn.u, v0 = inn.v;
+
if (CF_TEXTMODE==3) {
- // Texture is accessed byte-wise, so adjust mask if 16bpp
+ // Texture is accessed byte-wise, so adjust to 16bpp
+ u0_ <<= 1;
u0_mask <<= 1;
}
- const le16_t *CBA_; if (CF_TEXTMODE!=3) CBA_ = gpu_unai.CBA;
-
- do
+ for (; y0 < y1; ++y0, pPixel += FRAME_WIDTH, ++v0)
{
+ if (y0 & li) continue;
+ const u8 *pTxt = pTxt_base + ((v0 & v0_mask) * 2048);
+ le16_t *pDst = pPixel;
+ u32 u0 = u0_;
+ u32 count1 = count;
+ do
+ {
if (CF_MASKCHECK || CF_BLEND) { uDst = le16_to_u16(*pDst); }
if (CF_MASKCHECK) if (uDst&0x8000) { goto endsprite; }
endsprite:
u0 += (CF_TEXTMODE==3) ? 2 : 1;
pDst++;
+ }
+ while (--count1);
}
- while (--count);
}
-static void SpriteNULL(le16_t *pDst, u32 count, u8* pTxt, u32 u0)
+#ifdef __arm__
+
+// Sprite dispatcher: picks an ARM assembly fast path when texture coordinate
+// wrapping can be proven not to occur over the sprite's extent, otherwise
+// falls back to the generic C++ driver.
+template<int CF>
+static void SpriteMaybeAsm(le16_t *pPixel, u32 count, const u8 *pTxt_base,
+ const gpu_unai_inner_t &inn)
+{
+#if 1
+ // u1m/v1m: last texel coordinate touched horizontally/vertically.
+ s32 lines = inn.y1 - inn.y0;
+ u32 u1m = inn.u + count - 1, v1m = inn.v + lines - 1;
+ // Case 1: neither U nor V wraps (stays within the texture window masks)
+ // -> use the simplest asm drivers which skip masking entirely.
+ if (u1m == (u1m & (inn.u_msk >> 10)) && v1m == (v1m & (inn.v_msk >> 10))) {
+ const u8 *pTxt = pTxt_base + inn.v * 2048;
+ switch (CF) {
+ case 0x20: sprite_driver_4bpp_asm (pPixel, pTxt + inn.u / 2, count, &inn); return;
+ case 0x40: sprite_driver_8bpp_asm (pPixel, pTxt + inn.u, count, &inn); return;
+ case 0x60: sprite_driver_16bpp_asm(pPixel, pTxt + inn.u * 2, count, &inn); return;
+ }
+ }
+ // Case 2: only V is wrap-free -> asm variants that still mask U.
+ // (l0/l1 = lighting off/on, st* = semi-transparency mode; the l1
+ // variants need ARMV6 instructions.)
+ if (v1m == (v1m & (inn.v_msk >> 10))) {
+ const u8 *pTxt = pTxt_base + inn.v * 2048;
+ switch (CF) {
+ case 0x20: sprite_driver_4bpp_l0_std_asm(pPixel, pTxt, count, &inn); return;
+ case 0x22: sprite_driver_4bpp_l0_st0_asm(pPixel, pTxt, count, &inn); return;
+ case 0x40: sprite_driver_8bpp_l0_std_asm(pPixel, pTxt, count, &inn); return;
+ case 0x42: sprite_driver_8bpp_l0_st0_asm(pPixel, pTxt, count, &inn); return;
+#ifdef HAVE_ARMV6
+ case 0x21: sprite_driver_4bpp_l1_std_asm(pPixel, pTxt, count, &inn); return;
+ case 0x23: sprite_driver_4bpp_l1_st0_asm(pPixel, pTxt, count, &inn); return;
+ case 0x2b: sprite_driver_4bpp_l1_st1_asm(pPixel, pTxt, count, &inn); return;
+ case 0x41: sprite_driver_8bpp_l1_std_asm(pPixel, pTxt, count, &inn); return;
+ case 0x43: sprite_driver_8bpp_l1_st0_asm(pPixel, pTxt, count, &inn); return;
+ case 0x4b: sprite_driver_8bpp_l1_st1_asm(pPixel, pTxt, count, &inn); return;
+#endif
+ }
+ }
+#endif
+ // Fallback: generic templated C driver handles all wrap cases.
+ gpuSpriteDriverFn<CF>(pPixel, count, pTxt_base, inn);
+}
+#endif // __arm__
+
+static void SpriteNULL(le16_t *pPixel, u32 count, const u8 *pTxt_base,
+ const gpu_unai_inner_t &inn)
{
#ifdef ENABLE_GPU_LOG_SUPPORT
fprintf(stdout,"SpriteNULL()\n");
///////////////////////////////////////////////////////////////////////////////
// Sprite innerloops driver
-typedef void (*PS)(le16_t *pDst, u32 count, u8* pTxt, u32 u0);
// Template instantiation helper macros
-#define TI(cf) gpuSpriteSpanFn<(cf)>
+#define TI(cf) gpuSpriteDriverFn<(cf)>
#define TN SpriteNULL
+#ifdef __arm__
+#define TA(cf) SpriteMaybeAsm<(cf)>
+#else
+#define TA(cf) TI(cf)
+#endif
+#ifdef HAVE_ARMV6
+#define TA6(cf) SpriteMaybeAsm<(cf)>
+#else
+#define TA6(cf) TI(cf)
+#endif
#define TIBLOCK(ub) \
TN, TN, TN, TN, TN, TN, TN, TN, \
TN, TN, TN, TN, TN, TN, TN, TN, \
TN, TN, TN, TN, TN, TN, TN, TN, \
TN, TN, TN, TN, TN, TN, TN, TN, \
- TI((ub)|0x20), TI((ub)|0x21), TI((ub)|0x22), TI((ub)|0x23), TI((ub)|0x24), TI((ub)|0x25), TI((ub)|0x26), TI((ub)|0x27), \
- TN, TN, TI((ub)|0x2a), TI((ub)|0x2b), TN, TN, TI((ub)|0x2e), TI((ub)|0x2f), \
+ TA((ub)|0x20), TA6((ub)|0x21),TA6((ub)|0x22),TA6((ub)|0x23),TI((ub)|0x24), TI((ub)|0x25), TI((ub)|0x26), TI((ub)|0x27), \
+ TN, TN, TI((ub)|0x2a), TA6((ub)|0x2b),TN, TN, TI((ub)|0x2e), TI((ub)|0x2f), \
TN, TN, TI((ub)|0x32), TI((ub)|0x33), TN, TN, TI((ub)|0x36), TI((ub)|0x37), \
TN, TN, TI((ub)|0x3a), TI((ub)|0x3b), TN, TN, TI((ub)|0x3e), TI((ub)|0x3f), \
- TI((ub)|0x40), TI((ub)|0x41), TI((ub)|0x42), TI((ub)|0x43), TI((ub)|0x44), TI((ub)|0x45), TI((ub)|0x46), TI((ub)|0x47), \
- TN, TN, TI((ub)|0x4a), TI((ub)|0x4b), TN, TN, TI((ub)|0x4e), TI((ub)|0x4f), \
+ TA((ub)|0x40), TA6((ub)|0x41),TA6((ub)|0x42),TA6((ub)|0x43),TI((ub)|0x44), TI((ub)|0x45), TI((ub)|0x46), TI((ub)|0x47), \
+ TN, TN, TI((ub)|0x4a), TA6((ub)|0x4b),TN, TN, TI((ub)|0x4e), TI((ub)|0x4f), \
TN, TN, TI((ub)|0x52), TI((ub)|0x53), TN, TN, TI((ub)|0x56), TI((ub)|0x57), \
TN, TN, TI((ub)|0x5a), TI((ub)|0x5b), TN, TN, TI((ub)|0x5e), TI((ub)|0x5f), \
- TI((ub)|0x60), TI((ub)|0x61), TI((ub)|0x62), TI((ub)|0x63), TI((ub)|0x64), TI((ub)|0x65), TI((ub)|0x66), TI((ub)|0x67), \
+ TA((ub)|0x60), TI((ub)|0x61), TI((ub)|0x62), TI((ub)|0x63), TI((ub)|0x64), TI((ub)|0x65), TI((ub)|0x66), TI((ub)|0x67), \
TN, TN, TI((ub)|0x6a), TI((ub)|0x6b), TN, TN, TI((ub)|0x6e), TI((ub)|0x6f), \
TN, TN, TI((ub)|0x72), TI((ub)|0x73), TN, TN, TI((ub)|0x76), TI((ub)|0x77), \
TN, TN, TI((ub)|0x7a), TI((ub)|0x7b), TN, TN, TI((ub)|0x7e), TI((ub)|0x7f)
-const PS gpuSpriteSpanDrivers[256] = {
+const PS gpuSpriteDrivers[256] = {
TIBLOCK(0<<8), TIBLOCK(1<<8)
};
#undef TI
#undef TN
#undef TIBLOCK
+#undef TA
+#undef TA6
///////////////////////////////////////////////////////////////////////////////
// GPU Polygon innerloops generator
// relevant blend/light headers.
// (see README_senquack.txt)
template<int CF>
-static void gpuPolySpanFn(const gpu_unai_t &gpu_unai, le16_t *pDst, u32 count)
+static noinline void gpuPolySpanFn(const gpu_unai_t &gpu_unai, le16_t *pDst, u32 count)
{
// Blend func can save an operation if it knows uSrc MSB is unset.
// Untextured prims can always skip this (src color MSB is always 0).
const bool skip_uSrc_mask = MSB_PRESERVED ? (!CF_TEXTMODE) : (!CF_TEXTMODE) || CF_LIGHT;
bool should_blend;
- u32 bMsk; if (CF_BLITMASK) bMsk = gpu_unai.blit_mask;
+ u32 bMsk; if (CF_BLITMASK) bMsk = gpu_unai.inn.blit_mask;
if (!CF_TEXTMODE)
{
if (!CF_GOURAUD)
{
// UNTEXTURED, NO GOURAUD
- const u16 pix15 = gpu_unai.PixelData;
+ const u16 pix15 = gpu_unai.inn.PixelData;
do {
uint_fast16_t uSrc, uDst;
else
{
// UNTEXTURED, GOURAUD
- u32 l_gCol = gpu_unai.gCol;
- u32 l_gInc = gpu_unai.gInc;
+ gcol_t l_gCol = gpu_unai.inn.gCol;
+ gcol_t l_gInc = gpu_unai.inn.gInc;
do {
uint_fast16_t uDst, uSrc;
endpolynotextgou:
pDst++;
- l_gCol += l_gInc;
+ l_gCol.raw += l_gInc.raw;
}
while (--count);
}
//senquack - note: original UNAI code had gpu_unai.{u4/v4} packed into
// one 32-bit unsigned int, but this proved to lose too much accuracy
// (pixel drouputs noticeable in NFS3 sky), so now are separate vars.
- u32 l_u_msk = gpu_unai.u_msk; u32 l_v_msk = gpu_unai.v_msk;
- u32 l_u = gpu_unai.u & l_u_msk; u32 l_v = gpu_unai.v & l_v_msk;
- s32 l_u_inc = gpu_unai.u_inc; s32 l_v_inc = gpu_unai.v_inc;
+ u32 l_u_msk = gpu_unai.inn.u_msk; u32 l_v_msk = gpu_unai.inn.v_msk;
+ u32 l_u = gpu_unai.inn.u & l_u_msk; u32 l_v = gpu_unai.inn.v & l_v_msk;
+ s32 l_u_inc = gpu_unai.inn.u_inc; s32 l_v_inc = gpu_unai.inn.v_inc;
+ l_v <<= 1;
+ l_v_inc <<= 1;
+ l_v_msk = (l_v_msk & (0xff<<10)) << 1;
- const le16_t* TBA_ = gpu_unai.TBA;
- const le16_t* CBA_; if (CF_TEXTMODE!=3) CBA_ = gpu_unai.CBA;
+ const le16_t* TBA_ = gpu_unai.inn.TBA;
+ const le16_t* CBA_; if (CF_TEXTMODE!=3) CBA_ = gpu_unai.inn.CBA;
u8 r5, g5, b5;
u8 r8, g8, b8;
- u32 l_gInc, l_gCol;
+ gcol_t l_gInc, l_gCol;
if (CF_LIGHT) {
if (CF_GOURAUD) {
- l_gInc = gpu_unai.gInc;
- l_gCol = gpu_unai.gCol;
+ l_gInc = gpu_unai.inn.gInc;
+ l_gCol = gpu_unai.inn.gCol;
} else {
if (CF_DITHER) {
- r8 = gpu_unai.r8;
- g8 = gpu_unai.g8;
- b8 = gpu_unai.b8;
+ r8 = gpu_unai.inn.r8;
+ g8 = gpu_unai.inn.g8;
+ b8 = gpu_unai.inn.b8;
} else {
- r5 = gpu_unai.r5;
- g5 = gpu_unai.g5;
- b5 = gpu_unai.b5;
+ r5 = gpu_unai.inn.r5;
+ g5 = gpu_unai.inn.g5;
+ b5 = gpu_unai.inn.b5;
}
}
}
// (UNAI originally used 16.16)
if (CF_TEXTMODE==1) { // 4bpp (CLUT)
u32 tu=(l_u>>10);
- u32 tv=(l_v<<1)&(0xff<<11);
+ u32 tv=l_v&l_v_msk;
u8 rgb=((u8*)TBA_)[tv+(tu>>1)];
uSrc=le16_to_u16(CBA_[(rgb>>((tu&1)<<2))&0xf]);
if (!uSrc) goto endpolytext;
}
if (CF_TEXTMODE==2) { // 8bpp (CLUT)
- uSrc = le16_to_u16(CBA_[(((u8*)TBA_)[(l_u>>10)+((l_v<<1)&(0xff<<11))])]);
+ u32 tv=l_v&l_v_msk;
+ uSrc = le16_to_u16(CBA_[((u8*)TBA_)[tv+(l_u>>10)]]);
if (!uSrc) goto endpolytext;
}
if (CF_TEXTMODE==3) { // 16bpp
- uSrc = le16_to_u16(TBA_[(l_u>>10)+((l_v)&(0xff<<10))]);
+ u32 tv=(l_v&l_v_msk)>>1;
+ uSrc = le16_to_u16(TBA_[tv+(l_u>>10)]);
if (!uSrc) goto endpolytext;
}
endpolytext:
pDst++;
l_u = (l_u + l_u_inc) & l_u_msk;
- l_v = (l_v + l_v_inc) & l_v_msk;
- if (CF_LIGHT && CF_GOURAUD) l_gCol += l_gInc;
+ l_v += l_v_inc;
+ if (CF_LIGHT && CF_GOURAUD)
+ l_gCol.raw += l_gInc.raw;
}
while (--count);
}
}
+#ifdef __arm__
+// Polygon span dispatcher: routes the common untextured/4bpp/8bpp span
+// variants to hand-written ARM assembly; every other CF combination falls
+// through to the templated C span function. ARMv6-only variants are gated
+// by HAVE_ARMV6. CF is a compile-time constant so the switch folds away.
+template<int CF>
+static void PolySpanMaybeAsm(const gpu_unai_t &gpu_unai, le16_t *pDst, u32 count)
+{
+ switch (CF) {
+ case 0x02: poly_untex_st0_asm (pDst, &gpu_unai.inn, count); break;
+ case 0x0a: poly_untex_st1_asm (pDst, &gpu_unai.inn, count); break;
+ case 0x1a: poly_untex_st3_asm (pDst, &gpu_unai.inn, count); break;
+ case 0x20: poly_4bpp_asm (pDst, &gpu_unai.inn, count); break;
+ case 0x22: poly_4bpp_l0_st0_asm(pDst, &gpu_unai.inn, count); break;
+ case 0x40: poly_8bpp_asm (pDst, &gpu_unai.inn, count); break;
+ case 0x42: poly_8bpp_l0_st0_asm(pDst, &gpu_unai.inn, count); break;
+#ifdef HAVE_ARMV6
+ case 0x12: poly_untex_st2_asm (pDst, &gpu_unai.inn, count); break;
+ case 0x21: poly_4bpp_l1_std_asm(pDst, &gpu_unai.inn, count); break;
+ case 0x23: poly_4bpp_l1_st0_asm(pDst, &gpu_unai.inn, count); break;
+ case 0x41: poly_8bpp_l1_std_asm(pDst, &gpu_unai.inn, count); break;
+ case 0x43: poly_8bpp_l1_st0_asm(pDst, &gpu_unai.inn, count); break;
+#endif
+ default: gpuPolySpanFn<CF>(gpu_unai, pDst, count);
+ }
+}
+#endif
+
static void PolyNULL(const gpu_unai_t &gpu_unai, le16_t *pDst, u32 count)
{
#ifdef ENABLE_GPU_LOG_SUPPORT
// Template instantiation helper macros
#define TI(cf) gpuPolySpanFn<(cf)>
#define TN PolyNULL
+#ifdef __arm__
+#define TA(cf) PolySpanMaybeAsm<(cf)>
+#else
+#define TA(cf) TI(cf)
+#endif
+#ifdef HAVE_ARMV6
+#define TA6(cf) PolySpanMaybeAsm<(cf)>
+#else
+#define TA6(cf) TI(cf)
+#endif
#define TIBLOCK(ub) \
- TI((ub)|0x00), TI((ub)|0x01), TI((ub)|0x02), TI((ub)|0x03), TI((ub)|0x04), TI((ub)|0x05), TI((ub)|0x06), TI((ub)|0x07), \
- TN, TN, TI((ub)|0x0a), TI((ub)|0x0b), TN, TN, TI((ub)|0x0e), TI((ub)|0x0f), \
- TN, TN, TI((ub)|0x12), TI((ub)|0x13), TN, TN, TI((ub)|0x16), TI((ub)|0x17), \
- TN, TN, TI((ub)|0x1a), TI((ub)|0x1b), TN, TN, TI((ub)|0x1e), TI((ub)|0x1f), \
- TI((ub)|0x20), TI((ub)|0x21), TI((ub)|0x22), TI((ub)|0x23), TI((ub)|0x24), TI((ub)|0x25), TI((ub)|0x26), TI((ub)|0x27), \
+ TI((ub)|0x00), TI((ub)|0x01), TA6((ub)|0x02),TI((ub)|0x03), TI((ub)|0x04), TI((ub)|0x05), TI((ub)|0x06), TI((ub)|0x07), \
+ TN, TN, TA((ub)|0x0a), TI((ub)|0x0b), TN, TN, TI((ub)|0x0e), TI((ub)|0x0f), \
+ TN, TN, TA6((ub)|0x12),TI((ub)|0x13), TN, TN, TI((ub)|0x16), TI((ub)|0x17), \
+ TN, TN, TA((ub)|0x1a), TI((ub)|0x1b), TN, TN, TI((ub)|0x1e), TI((ub)|0x1f), \
+ TA((ub)|0x20), TA6((ub)|0x21),TA6((ub)|0x22),TA6((ub)|0x23),TI((ub)|0x24), TI((ub)|0x25), TI((ub)|0x26), TI((ub)|0x27), \
TN, TN, TI((ub)|0x2a), TI((ub)|0x2b), TN, TN, TI((ub)|0x2e), TI((ub)|0x2f), \
TN, TN, TI((ub)|0x32), TI((ub)|0x33), TN, TN, TI((ub)|0x36), TI((ub)|0x37), \
TN, TN, TI((ub)|0x3a), TI((ub)|0x3b), TN, TN, TI((ub)|0x3e), TI((ub)|0x3f), \
- TI((ub)|0x40), TI((ub)|0x41), TI((ub)|0x42), TI((ub)|0x43), TI((ub)|0x44), TI((ub)|0x45), TI((ub)|0x46), TI((ub)|0x47), \
+ TA((ub)|0x40), TA6((ub)|0x41),TA6((ub)|0x42),TA6((ub)|0x43),TI((ub)|0x44), TI((ub)|0x45), TI((ub)|0x46), TI((ub)|0x47), \
TN, TN, TI((ub)|0x4a), TI((ub)|0x4b), TN, TN, TI((ub)|0x4e), TI((ub)|0x4f), \
TN, TN, TI((ub)|0x52), TI((ub)|0x53), TN, TN, TI((ub)|0x56), TI((ub)|0x57), \
TN, TN, TI((ub)|0x5a), TI((ub)|0x5b), TN, TN, TI((ub)|0x5e), TI((ub)|0x5f), \
#undef TI
#undef TN
#undef TIBLOCK
+#undef TA
+#undef TA6
#endif /* __GPU_UNAI_GPU_INNER_H__ */