shithub: amd64-simd

Download patch

ref: 675aa84403f98776a7d463e1cc5f9bd41cdbab92
parent: cc3307440e698d58843a5273519f4988c01937f1
author: rodri <[email protected]>
date: Sat Nov 25 05:34:41 EST 2023

cleaned things up and improved the organization a bit.

--- a/avx.h
+++ b/avx.h
@@ -45,6 +45,26 @@
 #define VMOVAPD_256rr(s, d)	VEX3(0,0,0,VEX_m_0F,0,0,VEX_L_256,VEX_p_66);		\
 			VOP(0x28, 0x3, (d), (s))
 
+/* VMOVDQA */
+#define VMOVDQA_128mr(off, s, d)	VEX3(0,0,0,VEX_m_0F,0,0,VEX_L_128,VEX_p_66);	\
+				VOPi(0x6F, 0x1, (d), (s), (off))
+#define VMOVDQA_128rm(s, d)	VEX3(0,0,0,VEX_m_0F,0,0,VEX_L_128,VEX_p_66);		\
+			VOP(0x7F, 0x3, (s), (d))
+#define VMOVDQA_256mr(off, s, d)	VEX3(0,0,0,VEX_m_0F,0,0,VEX_L_256,VEX_p_66);	\
+				VOPi(0x6F, 0x1, (d), (s), (off))
+#define VMOVDQA_256rm(s, d)	VEX3(0,0,0,VEX_m_0F,0,0,VEX_L_256,VEX_p_66);		\
+			VOP(0x7F, 0x3, (s), (d))
+
+/* VMOVDQU */
+#define VMOVDQU_128mr(off, s, d)	VEX3(0,0,0,VEX_m_0F,0,0,VEX_L_128,VEX_p_F3);	\
+				VOPi(0x6F, 0x1, (d), (s), (off))
+#define VMOVDQU_128rm(s, d)	VEX3(0,0,0,VEX_m_0F,0,0,VEX_L_128,VEX_p_F3);		\
+			VOP(0x7F, 0x3, (s), (d))
+#define VMOVDQU_256mr(off, s, d)	VEX3(0,0,0,VEX_m_0F,0,0,VEX_L_256,VEX_p_F3);	\
+				VOPi(0x6F, 0x1, (d), (s), (off))
+#define VMOVDQU_256rm(s, d)	VEX3(0,0,0,VEX_m_0F,0,0,VEX_L_256,VEX_p_F3);		\
+			VOP(0x7F, 0x3, (s), (d))
+
 /* VADDPD */
 #define VADDPD_128mr(off, s0, s1, d)	VEX3(0,0,0,VEX_m_0F,0,(s0),VEX_L_128,VEX_p_66);	\
 				VOPi(0x58, 0x1, (d), (s1), (off))
--- a/bench/main.c
+++ b/bench/main.c
@@ -2,15 +2,63 @@
 #include <libc.h>
 #include <thread.h>
 #include <geometry.h>
-#include "b.h"
+#include "../bench9/b.h"
 
-double dppd(Point2, Point2);
-double dppda(Point2, Point2);
-double dppd3(Point3, Point3);
-double dppd3a(Point3, Point3);
-Point3 xvec3(Point3, Point3);
+double min(double, double);
+double dotvec2_sse4(Point2, Point2);
+double dotvec2_avx(Point2, Point2);
+double dotvec3_sse4(Point3, Point3);
+double dotvec3_avx(Point3, Point3);
+Point2 Pt2b(double, double, double);
+Point3 crossvec3_sse(Point3, Point3);
+double hsubpd(double, double);
+double fma(double, double, double);
+Point2 addpt2_avx(Point2, Point2);
 
+double
+fmin(double a, double b)
+{
+	return a<b? a: b;
+}
+
+double
+madd(double a, double b, double c)
+{
+	return a + b*c;
+}
+
 static void
+bmin(int fd)
+{
+	Bgr g;
+	B *b0, *b1;
+	double a, b;
+	int i;
+
+	benchinitgr(&g, "min");
+	b0 = benchadd(&g, "fmin");
+	b1 = benchadd(&g, "fmin_sse");
+
+	while(b0->n > 0 || b1->n > 0){
+		a = truerand()*frand();
+		b = truerand()*frand();
+
+		benchin(b0);
+		for(i = 0; i < 1e6; i++)
+			fmin(a, b);
+		benchout(b0);
+
+		benchin(b1);
+		for(i = 0; i < 1e6; i++)
+			min(a, b);
+		benchout(b1);
+	}
+
+	benchprintgr(&g, fd);
+	benchfreegr(&g);
+}
+
+static void
 bdotvec2(int fd)
 {
 	Bgr g;
@@ -20,7 +68,7 @@
 
 	benchinitgr(&g, "2d dot product");
 	b0 = benchadd(&g, "dotvec2");
-	b1 = benchadd(&g, "dotvec2_simd");
+	b1 = benchadd(&g, "dotvec2_sse4");
 	b2 = benchadd(&g, "dotvec2_avx");
 
 	while(b0->n > 0 || b1->n > 0){
@@ -34,12 +82,12 @@
 
 		benchin(b1);
 		for(i = 0; i < 1e6; i++)
-			dppd(a, b);
+			dotvec2_sse4(a, b);
 		benchout(b1);
 
 		benchin(b2);
 		for(i = 0; i < 1e6; i++)
-			dppda(a, b);
+			dotvec2_avx(a, b);
 		benchout(b2);
 	}
 
@@ -57,7 +105,7 @@
 
 	benchinitgr(&g, "3d dot product");
 	b0 = benchadd(&g, "dotvec3");
-	b1 = benchadd(&g, "dotvec3_simd");
+	b1 = benchadd(&g, "dotvec3_sse4");
 	b2 = benchadd(&g, "dotvec3_avx");
 
 	while(b0->n > 0 || b1->n > 0){
@@ -71,12 +119,12 @@
 
 		benchin(b1);
 		for(i = 0; i < 1e6; i++)
-			dppd3(a, b);
+			dotvec3_sse4(a, b);
 		benchout(b1);
 
 		benchin(b2);
 		for(i = 0; i < 1e6; i++)
-			dppd3a(a, b);
+			dotvec3_avx(a, b);
 		benchout(b2);
 	}
 
@@ -94,7 +142,7 @@
 
 	benchinitgr(&g, "3d cross product");
 	b0 = benchadd(&g, "crossvec3");
-	b1 = benchadd(&g, "crossvec3_simd");
+	b1 = benchadd(&g, "crossvec3_sse");
 
 	while(b0->n > 0 || b1->n > 0){
 		a = Vec3(truerand()*frand(), truerand()*frand(), truerand()*frand());
@@ -107,7 +155,7 @@
 
 		benchin(b1);
 		for(i = 0; i < 1e6; i++)
-			xvec3(a, b);
+			crossvec3_sse(a, b);
 		benchout(b1);
 	}
 
@@ -115,6 +163,101 @@
 	benchfreegr(&g);
 }
 
+static void
+bPt2(int fd)
+{
+	Bgr g;
+	B *b0, *b1;
+	double x, y, w;
+	int i;
+
+	benchinitgr(&g, "Pt2");
+	b0 = benchadd(&g, "Pt2");
+	b1 = benchadd(&g, "Pt2b");
+
+	while(b0->n > 0 || b1->n > 0){
+		x = truerand()*frand();
+		y = truerand()*frand();
+		w = truerand()*frand();
+
+		benchin(b0);
+		for(i = 0; i < 1e6; i++)
+			Pt2(x, y, w);
+		benchout(b0);
+
+		benchin(b1);
+		for(i = 0; i < 1e6; i++)
+			Pt2b(x, y, w);
+		benchout(b1);
+	}
+
+	benchprintgr(&g, fd);
+	benchfreegr(&g);
+}
+
+static void
+bfma(int fd)
+{
+	Bgr g;
+	B *b0, *b1;
+	double a, b, c;
+	int i;
+
+	benchinitgr(&g, "multiply + add");
+	b0 = benchadd(&g, "madd");
+	b1 = benchadd(&g, "fma_avx");
+
+	while(b0->n > 0 || b1->n > 0){
+		a = truerand()*frand();
+		b = truerand()*frand();
+		c = truerand()*frand();
+
+		benchin(b0);
+		for(i = 0; i < 1e6; i++)
+			madd(a, b, c);
+		benchout(b0);
+
+		benchin(b1);
+		for(i = 0; i < 1e6; i++)
+			fma(a, b, c);
+		benchout(b1);
+	}
+
+	benchprintgr(&g, fd);
+	benchfreegr(&g);
+}
+
+static void
+baddpt2(int fd)
+{
+	Bgr g;
+	B *b0, *b1;
+	Point2 a, b;
+	int i;
+
+	benchinitgr(&g, "2d point sum");
+	b0 = benchadd(&g, "addpt2");
+	b1 = benchadd(&g, "addpt2_avx");
+
+	while(b0->n > 0 || b1->n > 0){
+		a = Pt2(truerand()*frand(), truerand()*frand(), truerand()*frand());
+		b = Pt2(truerand()*frand(), truerand()*frand(), truerand()*frand());
+
+		benchin(b0);
+		for(i = 0; i < 1e6; i++)
+			addpt2(a, b);
+		benchout(b0);
+
+		benchin(b1);
+		for(i = 0; i < 1e6; i++)
+			addpt2_avx(a, b);
+		benchout(b1);
+	}
+
+	benchprintgr(&g, fd);
+	benchfreegr(&g);
+}
+
 void
 threadmain(int argc, char **argv)
 {
@@ -124,11 +267,19 @@
 	if(benchwire(0) != 0)
 		fprint(2, "failed to wire: %r\n");
 
+	bmin(1);
+	bseparator(1);
 	bdotvec2(1);
 	bseparator(1);
 	bdotvec3(1);
 	bseparator(1);
 	bcrossvec3(1);
+	bseparator(1);
+	bPt2(1);
+	bseparator(1);
+	bfma(1);
+	bseparator(1);
+	baddpt2(1);
 
 	threadexitsall(nil);
 }
--- /dev/null
+++ b/bench/mkfile
@@ -1,0 +1,23 @@
+</$objtype/mkfile
+
+TARG=bench9
+BIN=/$objtype/bin
+arch=`{echo __^$objtype^__}
+CFLAGS=$CFLAGS -D$arch -p
+
+HFILES=\
+	../bench9/b.h\
+	../regs.h\
+	../sse.h\
+	../avx.h\
+
+OFILES=\
+	../bench9/b.$O\
+	../bench9/b_$objtype.$O\
+	../min.$O\
+	../dppd.$O\
+	main.$O\
+
+default:V: all
+
+</sys/src/cmd/mkone
--- a/dppd.s
+++ b/dppd.s
@@ -5,24 +5,21 @@
 DATA one(SB)/8,$1.0
 GLOBL one(SB), $8
 
-TEXT dppd(SB), 1, $0
+TEXT dotvec2_sse4(SB), 1, $0
 	MOVQ SP, AX
-	MOVLPD(8, rAX, rX0)	/* MOVLPD a+0(FP), X0 */
-	MOVHPD(16, rAX, rX0)	/* MOVHPD a+8(FP), X0 */
-	MOVLPD(32, rAX, rX1)	/* MOVLPD b+24(FP), X1 */
-	MOVHPD(40, rAX, rX1)	/* MOVHPD b+32(FP), X1*/
+	MOVDQU_mr(8, rAX, rX0)	/* MOVDQU a+0(FP), X0 */
+	MOVDQU_mr(32, rAX, rX1)	/* MOVDQU b+24(FP), X1 */
 	DPPD(rX1, rX0)		/* DPPD $0x31, X1, X0 */
 	RET
 
-TEXT dppda(SB), 1, $0
+TEXT dotvec2_avx(SB), 1, $0
 	MOVQ SP, AX
 	VMOVUPD_128mr(8, rAX, rX0)	/* VMOVUPD a+0(FP), X0 */
 	VMOVUPD_128mr(32, rAX, rX1)	/* VMOVUPD b+24(FP), X1 */
 	VDPPD(rX1, rX0, rX0)		/* VDPPD $0x31, X1, X0, X0 */
-	VZEROUPPER
 	RET
 
-TEXT dppd3(SB), 1, $0
+TEXT dotvec3_sse4(SB), 1, $0
 	MOVQ SP, AX
 	MOVLPD(8, rAX, rX0)	/* MOVLPD a+0(FP), X0 */
 	MOVHPD(16, rAX, rX0)	/* MOVHPD a+8(FP), X0 */
@@ -35,7 +32,7 @@
 	DPPD(rX1, rX0)		/* DPPD $0x31, X1, X0 */
 	RET
 
-TEXT dppd3a(SB), 1, $0
+TEXT dotvec3_avx(SB), 1, $0
 	MOVQ SP, AX
 	VMOVUPD_128mr(8, rAX, rX0)	/* VMOVUPD a+0(FP), X0 */
 	VMOVUPD_128mr(40, rAX, rX1)	/* VMOVUPD b+32(FP), X1 */
@@ -43,7 +40,6 @@
 	MOVSD a+16(FP), X1
 	MOVSD b+48(FP), X2
 	VFMADD231SD(rX1, rX2, rX0)
-	VZEROUPPER
 	RET
 
 TEXT Pt2b(SB), 1, $0
@@ -63,7 +59,7 @@
 	HSUBPD(rX0, rX0)	/* HSUBPD X0, X0 */
 	RET
 
-TEXT xvec3(SB), 1, $0
+TEXT crossvec3_sse(SB), 1, $0
 	MOVQ SP, AX
 	ADDQ $8, AX
 	MOVLPD(40, rAX, rX0)	/* MOVLPD b+32(FP), X0 */
@@ -91,7 +87,7 @@
 	MOVSD X0, 24(DI)
 	RET
 
-TEXT xvec3a(SB), 1, $0
+TEXT crossvec3_avx(SB), 1, $0
 	MOVQ SP, AX
 	ADDQ $8, AX
 	
@@ -101,5 +97,13 @@
 	MOVSD b+8(FP), X1
 	MOVSD c+16(FP), X2
 	VFMADD231SD(rX1, rX2, rX0)
-	VZEROUPPER
+	RET
+
+TEXT addpt2_avx(SB), 1, $0
+	MOVQ SP, AX
+	ADDQ $8, AX
+	VMOVDQU_256mr(8, rAX, rX0)
+	VMOVDQU_256mr(32, rAX, rX1)
+	VADDPD_256rr(rX1, rX0, rX0)
+	VMOVDQU_256rm(rX0, rAX)
 	RET
--- a/main.c
+++ b/main.c
@@ -2,16 +2,16 @@
 #include <libc.h>
 #include <geometry.h>
 
-uvlong nanosec(void);
 double min(double, double);
-double dppd(Point2, Point2);
-double dppda(Point2, Point2);
-double dppd3(Point3, Point3);
-double dppd3a(Point3, Point3);
+double dotvec2_sse4(Point2, Point2);
+double dotvec2_avx(Point2, Point2);
+double dotvec3_sse4(Point3, Point3);
+double dotvec3_avx(Point3, Point3);
 Point2 Pt2b(double, double, double);
-Point3 xvec3(Point3, Point3);
+Point3 crossvec3_sse(Point3, Point3);
 double hsubpd(double, double);
 double fma(double, double, double);
+Point2 addpt2_avx(Point2, Point2);
 
 double
 fmin(double a, double b)
@@ -19,13 +19,18 @@
 	return a<b? a: b;
 }
 
+double
+madd(double a, double b, double c)
+{
+	return a + b*c;
+}
+
 void
 main(int argc, char *argv[])
 {
-	uvlong t0, t1;
 	double a, b, r;
-	Point2 p0, p1;
-	Point3 p0t, p1t, pr;
+	Point2 p0, p1, pr;
+	Point3 p0t, p1t, prt;
 
 	GEOMfmtinstall();
 	ARGBEGIN{default:sysfatal("shit");}ARGEND
@@ -34,75 +39,62 @@
 	a = strtod(argv[0], nil);
 	b = strtod(argv[1], nil);
 
-	t0 = nanosec();
 	r = fmin(a, b);
-	t1 = nanosec();
-	print("fmin(%g, %g) = %g\ttook %lludns\n", a, b, r, t1-t0);
-	t0 = nanosec();
+	print("fmin(%g, %g) = %g\n", a, b, r);
 	r = min(a, b);
-	t1 = nanosec();
-	print("min(%g, %g) = %g\ttook %lludns\n", a, b, r, t1-t0);
+	print("min(%g, %g) = %g\n", a, b, r);
 
 	print("\n");
 
 	p0 = Pt2b(a, 1, 1);
 	p1 = Pt2b(b, 3, 1);
-	t0 = nanosec();
-	r = dppd(p0, p1);
-	t1 = nanosec();
-	print("dppd(%v, %v) = %g\ttook %lludns\n", p0, p1, r, t1-t0);
-	t0 = nanosec();
+	r = dotvec2_sse4(p0, p1);
+	print("dotvec2_sse4(%v, %v) = %g\n", p0, p1, r);
 	r = dotvec2(p0, p1);
-	t1 = nanosec();
-	print("dotvec2(%v, %v) = %g\ttook %lludns\n", p0, p1, r, t1-t0);
-	t0 = nanosec();
-	r = dppda(p0, p1);
-	t1 = nanosec();
-	print("dppda(%v, %v) = %g\ttook %lludns\n", p0, p1, r, t1-t0);
+	print("dotvec2(%v, %v) = %g\n", p0, p1, r);
+	r = dotvec2_avx(p0, p1);
+	print("dotvec2_avx(%v, %v) = %g\n", p0, p1, r);
 
 	print("\n");
 
 	p0t = Pt3(a, 1, 9, 1);
 	p1t = Pt3(b, 3, 4, 1);
-	t0 = nanosec();
-	r = dppd3(p0t, p1t);
-	t1 = nanosec();
-	print("dppd3(%V, %V) = %g\ttook %lludns\n", p0t, p1t, r, t1-t0);
-	t0 = nanosec();
+	r = dotvec3_sse4(p0t, p1t);
+	print("dotvec3_sse4(%V, %V) = %g\n", p0t, p1t, r);
 	r = dotvec3(p0t, p1t);
-	t1 = nanosec();
-	print("dotvec3(%V, %V) = %g\ttook %lludns\n", p0t, p1t, r, t1-t0);
-	t0 = nanosec();
-	r = dppd3a(p0t, p1t);
-	t1 = nanosec();
-	print("dppd3a(%V, %V) = %g\ttook %lludns\n", p0t, p1t, r, t1-t0);
+	print("dotvec3(%V, %V) = %g\n", p0t, p1t, r);
+	r = dotvec3_avx(p0t, p1t);
+	print("dotvec3_avx(%V, %V) = %g\n", p0t, p1t, r);
 
 	print("\n");
 
-	t0 = nanosec();
 	r = hsubpd(a, b);
-	t1 = nanosec();
-	print("hsubpd(%g, %g) = %g\ttook %lludns\n", a, b, r, t1-t0);
+	print("hsubpd(%g, %g) = %g\n", a, b, r);
 
 	print("\n");
 
 	p0t = Pt3(a, 1, 9, 1);
 	p1t = Pt3(b, 3, 4, 1);
-	t0 = nanosec();
-	pr = xvec3(p0t, p1t);
-	t1 = nanosec();
-	print("xvec3(%V, %V) = %V\ttook %lludns\n", p0t, p1t, pr, t1-t0);
-	t0 = nanosec();
-	pr = crossvec3(p0t, p1t);
-	t1 = nanosec();
-	print("crossvec3(%V, %V) = %V\ttook %lludns\n", p0t, p1t, pr, t1-t0);
+	prt = crossvec3_sse(p0t, p1t);
+	print("crossvec3_sse(%V, %V) = %V\n", p0t, p1t, prt);
+	prt = crossvec3(p0t, p1t);
+	print("crossvec3(%V, %V) = %V\n", p0t, p1t, prt);
 
 	print("\n");
 
-	t0 = nanosec();
+	r = madd(a, b, 21);
+	print("madd(%g, %g, 21) = %g\n", a, b, r);
 	r = fma(a, b, 21);
-	t1 = nanosec();
-	print("fma(%g, %g, 21) = %g\ttook %lludns\n", a, b, r, t1-t0);
+	print("fma(%g, %g, 21) = %g\n", a, b, r);
+
+	print("\n");
+
+	p0 = Pt2b(a, 1, 1);
+	p1 = Pt2b(b, 3, 1);
+	pr = addpt2(p0, p1);
+	print("addpt2(%v, %v) = %v\n", p0, p1, pr);
+	pr = addpt2_avx(p0, p1);
+	print("addpt2_avx(%v, %v) = %v\n", p0, p1, pr);
 
 	exits(nil);
 }
--- a/mkfile
+++ b/mkfile
@@ -6,7 +6,6 @@
 	main.$O\
 	min.$O\
 	dppd.$O\
-	nanosec.$O\
 
 HFILES=\
 	regs.h\
@@ -14,3 +13,7 @@
 	avx.h\
 
 </sys/src/cmd/mkone
+
+pulldeps:VQ:
+	git/clone git://shithub.us/sigrid/bench9 || \
+	git/clone https://git.sr.ht/~ft/bench9
--- a/nanosec.c
+++ /dev/null
@@ -1,109 +1,0 @@
-#include <u.h>
-#include <libc.h>
-#include <tos.h>
-
-/*
- * This code is a mixture of cpuid(1) and the nanosec() found in vmx,
- * in order to force the use of nsec(2) in case we are running in a
- * virtualized environment where the clock is mis-bhyve-ing.
- */
-
-typedef struct Res {
-	ulong ax, bx, cx, dx;
-} Res;
-
-static uchar _cpuid[] = {
-	0x5E,			/* POP SI (PC) */
-	0x5D,			/* POP BP (Res&) */
-	0x58,			/* POP AX */
-	0x59,			/* POP CX */
-
-	0x51,			/* PUSH CX */
-	0x50,			/* PUSH AX */
-	0x55,			/* PUSH BP */
-	0x56,			/* PUSH SI */
-
-	0x31, 0xDB,		/* XOR BX, BX */
-	0x31, 0xD2,		/* XOR DX, DX */
-
-	0x0F, 0xA2,		/* CPUID */
-
-	0x89, 0x45, 0x00,	/* MOV AX, 0(BP) */
-	0x89, 0x5d, 0x04,	/* MOV BX, 4(BP) */
-	0x89, 0x4d, 0x08,	/* MOV CX, 8(BP) */
-	0x89, 0x55, 0x0C,	/* MOV DX, 12(BP) */
-	0xC3,			/* RET */
-};
-
-static Res (*cpuid)(ulong ax, ulong cx) = (Res(*)(ulong, ulong)) _cpuid;
-
-/*
- * nsec() is wallclock and can be adjusted by timesync
- * so need to use cycles() instead, but fall back to
- * nsec() in case we can't
- */
-uvlong
-nanosec(void)
-{
-	static uvlong fasthz, xstart;
-	char buf[13], path[128];
-	ulong w;
-	uvlong x, div;
-	int fd;
-	Res r;
-
-	if(fasthz == ~0ULL)
-		return nsec() - xstart;
-
-	if(fasthz == 0){
-		/* first long in a.out header */
-		snprint(path, sizeof path, "/proc/%d/text", getpid());
-		fd = open(path, OREAD);
-		if(fd < 0)
-			goto Wallclock;
-		if(read(fd, buf, 4) != 4){
-			close(fd);
-			goto Wallclock;
-		}
-		close(fd);
-
-		w = ((ulong *) buf)[0];
-
-		switch(w){
-		default:
-			goto Wallclock;
-		case 0x978a0000:	/* amd64 */
-			/* patch out POP BP -> POP AX */
-			_cpuid[1] = 0x58;
-		case 0xeb010000:	/* 386 */
-			break;
-		}
-		segflush(_cpuid, sizeof(_cpuid));
-
-		r = cpuid(0x40000000, 0);
-		((ulong *) buf)[0] = r.bx;
-		((ulong *) buf)[1] = r.cx;
-		((ulong *) buf)[2] = r.dx;
-		buf[12] = 0;
-
-		if(strstr(buf, "bhyve") != nil)
-			goto Wallclock;
-
-		if(_tos->cyclefreq){
-			fasthz = _tos->cyclefreq;
-			cycles(&xstart);
-		} else {
-Wallclock:
-			fasthz = ~0ULL;
-			xstart = nsec();
-		}
-		return 0;
-	}
-	cycles(&x);
-	x -= xstart;
-
-	/* this is ugly */
-	for(div = 1000000000ULL; x < 0x1999999999999999ULL && div > 1 ; div /= 10ULL, x *= 10ULL);
-
-	return x / (fasthz / div);
-}
--- a/sse.h
+++ b/sse.h
@@ -6,7 +6,18 @@
 			BYTE $(((m)<<6)|((ro)<<3)|(rm))
 #define OP4i(o, m, ro, rm, i)	OP4((o), (m), (ro), (rm));	\
 			BYTE $(i)
+#define F3OP(o, m, ro, rm)	WORD $0x0FF3; BYTE $(o);	\
+			BYTE $(((m)<<6)|((ro)<<3)|(rm))
+#define F3OPi(o, m, ro, rm, i)	F3OP((o), (m), (ro), (rm));	\
+			BYTE $(i)
 
+/* MOVDQA */
+#define MOVDQA_mr(off, s, d) OPi(0x6F, 0x1, (d), (s), (off))
+#define MOVDQA_rm(off, s, d) OPi(0x7F, 0x1, (s), (d), (off))
+
+/* MOVDQU */
+#define MOVDQU_mr(off, s, d) F3OPi(0x6F, 0x1, (d), (s), (off))
+#define MOVDQU_rm(off, s, d) F3OPi(0x7F, 0x1, (s), (d), (off))
 
 /* MOVLPD */
 //opcode = 660F12