+#define gvorr_n_u16(d, n) d.u16 |= n
+
+// fallbacks
+#if 1
+
+#ifndef gvaddhn_u32
+#define gvaddhn_u32(d, a, b) { \
+ gvreg tmp1_ = { .u32 = a.u32 + b.u32 }; \
+ gvmovn_top_u32(d, tmp1_); \
+}
+#endif
+#ifndef gvabsq_s32
+#define gvabsq_s32(d, s) { \
+ gvreg tmp1_ = { .s32 = (gvs32){} - s.s32 }; \
+ gvreg mask_ = { .s32 = s.s32 >> 31 }; \
+ gvbslq_(d, mask_, tmp1_, s); \
+}
+#endif
+#ifndef gvbit
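+// generic bit select: d takes bits from a where the selector s is set and from b elsewhere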
+#define gvbslq_(d, s, a, b) d.u32 = (a.u32 & s.u32) | (b.u32 & ~s.u32)
+#define gvbifq(d, a, b) gvbslq_(d, b, d, a)
+#define gvbit(d, a, b) gvbslq_(d, b, a, d)
+#endif
+#ifndef gvaddw_s32
+#define gvaddw_s32(d, a, b) {gvreg t_; gvmovl_s32(t_, b); d.s64 += t_.s64;}
+#endif
+#ifndef gvhaddq_u16
+// can do this because the caller needs the msb clear
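+// (with the msb set, e.g. 0x8000 + 0x8000 would wrap to 0 here instead of halving to 0x8000)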
+#define gvhaddq_u16(d, a, b) d.u16 = (a.u16 + b.u16) >> 1
+#endif
+#ifndef gvminq_u16
+#define gvminq_u16(d, a, b) { \
+ gvu16 t_ = a.u16 < b.u16; \
+ d.u16 = (a.u16 & t_) | (b.u16 & ~t_); \
+}
+#endif
+#ifndef gvmlsq_s32
+#define gvmlsq_s32(d, a, b) d.s32 -= a.s32 * b.s32
+#endif
+#ifndef gvmlsq_l_s32
+#define gvmlsq_l_s32(d, a, b, l){gvreg t_; gvdupq_l_u32(t_, b, l); d.s32 -= a.s32 * t_.s32;}
+#endif
+#ifndef gvmla_s32
+#define gvmla_s32(d, a, b) d.s32 += a.s32 * b.s32
+#endif
+#ifndef gvmla_u32
+#define gvmla_u32 gvmla_s32
+#endif
+#ifndef gvmlaq_s32
+#define gvmlaq_s32(d, a, b) d.s32 += a.s32 * b.s32
+#endif
+#ifndef gvmlaq_u32
+#define gvmlaq_u32 gvmlaq_s32
+#endif
+#ifndef gvmlal_u8
+#define gvmlal_u8(d, a, b) {gvreg t_; gvmull_u8(t_, a, b); d.u16 += t_.u16;}
+#endif
+#ifndef gvmlal_s32
+#define gvmlal_s32(d, a, b) {gvreg t_; gvmull_s32(t_, a, b); d.s64 += t_.s64;}
+#endif
+#ifndef gvmov_l_s32
+#define gvmov_l_s32(d, s, l) d.s32[l] = s
+#endif
+#ifndef gvmov_l_u32
+#define gvmov_l_u32(d, s, l) d.u32[l] = s
+#endif
+#ifndef gvmul_s32
+#define gvmul_s32(d, a, b) d.s32 = a.s32 * b.s32
+#endif
+#ifndef gvmull_u8
+#define gvmull_u8(d, a, b) { \
+ gvreg t1_, t2_; \
+ gvmovl_u8(t1_, a); \
+ gvmovl_u8(t2_, b); \
+ d.u16 = t1_.u16 * t2_.u16; \
+}
+#endif
+#ifndef gvmull_s32
+// note: compilers tend to use int regs here
+#define gvmull_s32(d, a, b) { \
+ d.s64[0] = (s64)a.s32[0] * b.s32[0]; \
+ d.s64[1] = (s64)a.s32[1] * b.s32[1]; \
+}
+#endif
+#ifndef gvneg_s32
+#define gvneg_s32(d, s) d.s32 = -s.s32
+#endif
+// x86 note: needs _mm_sllv_epi16 (avx512), else this sucks terribly
+#ifndef gvshl_u16
+#define gvshl_u16(d, a, b) d.u16 = a.u16 << b.u16
+#endif
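+// a possible shape for that x86 path (untested sketch; assumes <immintrin.h> and that the
+// gv vector members can be cast to/from __m128i, which gcc/clang allow for same-size vectors):
+//   #define gvshl_u16(d, a, b) d.u16 = (gvu16)_mm_sllv_epi16((__m128i)a.u16, (__m128i)b.u16)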
+// x86 note: needs _mm_sllv_* (avx2)
+#ifndef gvshlq_u64
+#define gvshlq_u64(d, a, b) d.u64 = a.u64 << b.u64
+#endif
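+// similar untested sketch for the avx2 case, reusing the same cast assumption:
+//   #define gvshlq_u64(d, a, b) d.u64 = (typeof(d.u64))_mm_sllv_epi64((__m128i)a.u64, (__m128i)b.u64)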
+#ifndef gvshll_n_s8
+#define gvshll_n_s8(d, s, n) {gvreg t_; gvmovl_s8(t_, s); gvshlq_n_u16(d, t_, n);}
+#endif
+#ifndef gvshll_n_u8
+#define gvshll_n_u8(d, s, n) {gvreg t_; gvmovl_u8(t_, s); gvshlq_n_u16(d, t_, n);}
+#endif
+#ifndef gvshr_n_u8
+#define gvshr_n_u8(d, s, n) d.u8 = s.u8 >> (n)
+#endif
+#ifndef gvshrq_n_s64
+#define gvshrq_n_s64(d, s, n) d.s64 = s.s64 >> (n)
+#endif
+#ifndef gvshrn_n_u16
+#define gvshrn_n_u16(d, s, n) {gvreg t_; gvshrq_n_u16(t_, s, n); gvmovn_u16(d, t_);}
+#endif
+#ifndef gvshrn_n_u32
+#define gvshrn_n_u32(d, s, n) {gvreg t_; gvshrq_n_u32(t_, s, n); gvmovn_u32(d, t_);}
+#endif
+#ifndef gvsli_n_u8
+#define gvsli_n_u8(d, s, n) d.u8 = (s.u8 << (n)) | (d.u8 & ((1u << (n)) - 1u))
+#endif
+#ifndef gvsri_n_u8
+#define gvsri_n_u8(d, s, n) d.u8 = (s.u8 >> (n)) | (d.u8 & ((0xff00u >> (n)) & 0xffu))
+#endif
+#ifndef gvtstq_u16
+#define gvtstq_u16(d, a, b) d.u16 = (a.u16 & b.u16) != 0
+#endif
+
+#ifndef gvld2_u8_dup
+#define gvld2_u8_dup(v0, v1, p) { \
+ gvdup_n_u8(v0, ((const u8 *)(p))[0]); \
+ gvdup_n_u8(v1, ((const u8 *)(p))[1]); \
+}
+#endif
+#ifndef gvst1_u8
+#define gvst1_u8(v, p) *(uint64_t_ua *)(p) = v.u64[0]
+#endif
+#ifndef gvst1q_u16
+#define gvst1q_u16(v, p) *(gvreg_ua *)(p) = v
+#endif
+#ifndef gvst1q_inc_u32
+#define gvst1q_inc_u32(v, p, i) {*(gvreg_ua *)(p) = v; p += (i) / sizeof(*p);}
+#endif
+#ifndef gvst1q_pi_u32
+#define gvst1q_pi_u32(v, p) gvst1q_inc_u32(v, p, sizeof(v))
+#endif
+#ifndef gvst1q_2_pi_u32
+#define gvst1q_2_pi_u32(v0, v1, p) { \
+ gvst1q_inc_u32(v0, p, sizeof(v0)); \
+ gvst1q_inc_u32(v1, p, sizeof(v1)); \
+}
+#endif
+#ifndef gvst2_u8
+#define gvst2_u8(v0, v1, p) {gvreg t_; gvzip_u8(t_, v0, v1); *(gvu8_ua *)(p) = t_.u8;}
+#endif
+#ifndef gvst2_u16
+#define gvst2_u16(v0, v1, p) {gvreg t_; gvzip_u16(t_, v0, v1); *(gvu16_ua *)(p) = t_.u16;}
+#endif
+
+// note: these shuffles assume sizeof(gvhreg) == 16 && sizeof(gvreg) == 16
+#ifndef __has_builtin
+#define __has_builtin(x) 0
+#endif
+
+// prefer __builtin_shuffle on gcc, since gcc's __builtin_shufflevector handles -1 (don't-care) indices poorly
+#if __has_builtin(__builtin_shufflevector) && !__has_builtin(__builtin_shuffle)
+
+#ifndef gvld2q_u8
+#define gvld2q_u8(v0, v1, p) { \
+ gvu8 v0_ = ((gvu8_ua *)(p))[0]; \
+ gvu8 v1_ = ((gvu8_ua *)(p))[1]; \
+ v0.u8 = __builtin_shufflevector(v0_, v1_, 0,2,4,6,8,10,12,14,16,18,20,22,24,26,28,30); \
+ v1.u8 = __builtin_shufflevector(v0_, v1_, 1,3,5,7,9,11,13,15,17,19,21,23,25,27,29,31); \
+}
+#endif
+#ifndef gvmovn_u16
+#define gvmovn_u16(d, s) \
+ d.u8 = __builtin_shufflevector(s.u8, s.u8, 0,2,4,6,8,10,12,14,-1,-1,-1,-1,-1,-1,-1,-1)
+#endif
+#ifndef gvmovn_u32
+#define gvmovn_u32(d, s) \
+ d.u16 = __builtin_shufflevector(s.u16, s.u16, 0,2,4,6,-1,-1,-1,-1)
+#endif
+#ifndef gvmovn_top_u32
+#define gvmovn_top_u32(d, s) \
+ d.u16 = __builtin_shufflevector(s.u16, s.u16, 1,3,5,7,-1,-1,-1,-1)
+#endif
+#ifndef gvzip_u8
+#define gvzip_u8(d, a, b) \
+ d.u8 = __builtin_shufflevector(a.u8, b.u8, 0,16,1,17,2,18,3,19,4,20,5,21,6,22,7,23)
+#endif
+#ifndef gvzip_u16
+#define gvzip_u16(d, a, b) \
+ d.u16 = __builtin_shufflevector(a.u16, b.u16, 0,8,1,9,2,10,3,11)
+#endif
+#ifndef gvzipq_u16
+#define gvzipq_u16(d0, d1, s0, s1) { \
+ gvu16 t_ = __builtin_shufflevector(s0.u16, s1.u16, 0, 8, 1, 9, 2, 10, 3, 11); \
+ d1.u16 = __builtin_shufflevector(s0.u16, s1.u16, 4,12, 5,13, 6, 14, 7, 15); \
+ d0.u16 = t_; \
+}
+#endif
+
+#else // !__has_builtin(__builtin_shufflevector)
+
+#ifndef gvld2q_u8
+#define gvld2q_u8(v0, v1, p) { \
+ gvu8 v0_ = ((gvu8_ua *)(p))[0]; \
+ gvu8 v1_ = ((gvu8_ua *)(p))[1]; \
+ v0.u8 = __builtin_shuffle(v0_, v1_, (gvu8){0,2,4,6,8,10,12,14,16,18,20,22,24,26,28,30}); \
+ v1.u8 = __builtin_shuffle(v0_, v1_, (gvu8){1,3,5,7,9,11,13,15,17,19,21,23,25,27,29,31}); \
+}
+#endif
+#ifndef gvmovn_u16
+#define gvmovn_u16(d, s) \
+ d.u8 = __builtin_shuffle(s.u8, (gvu8){0,2,4,6,8,10,12,14,0,2,4,6,8,10,12,14})
+#endif
+#ifndef gvmovn_u32
+#define gvmovn_u32(d, s) \
+ d.u16 = __builtin_shuffle(s.u16, (gvu16){0,2,4,6,0,2,4,6})
+#endif
+#ifndef gvmovn_top_u32
+#define gvmovn_top_u32(d, s) \
+ d.u16 = __builtin_shuffle(s.u16, (gvu16){1,3,5,7,1,3,5,7})
+#endif
+#ifndef gvtbl2_u8
+#define gvtbl2_u8(d, a, b) d.u8 = __builtin_shuffle(a.u8, b.u8)
+#endif
+#ifndef gvzip_u8
+#define gvzip_u8(d, a, b) \
+ d.u8 = __builtin_shuffle(a.u8, b.u8, (gvu8){0,16,1,17,2,18,3,19,4,20,5,21,6,22,7,23})
+#endif
+#ifndef gvzip_u16
+#define gvzip_u16(d, a, b) \
+ d.u16 = __builtin_shuffle(a.u16, b.u16, (gvu16){0,8,1,9,2,10,3,11})
+#endif
+#ifndef gvzipq_u16
+#define gvzipq_u16(d0, d1, s0, s1) { \
+ gvu16 t_ = __builtin_shuffle(s0.u16, s1.u16, (gvu16){0, 8, 1, 9, 2, 10, 3, 11}); \
+ d1.u16 = __builtin_shuffle(s0.u16, s1.u16, (gvu16){4,12, 5,13, 6, 14, 7, 15}); \
+ d0.u16 = t_; \
+}
+#endif
+
+#endif // __builtin_shufflevector || __builtin_shuffle
+
+#ifndef gvtbl2_u8
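+// plain scalar fallback; unlike NEON vtbl this assumes every index in b is < 16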
+#define gvtbl2_u8(d, a, b) { \
+ int i_; \
+ for (i_ = 0; i_ < 16; i_++) \
+ d.u8[i_] = a.u8[b.u8[i_]]; \
+}
+#endif
+
+#endif // fallbacks