shithub: riscv

Download patch

ref: 213bf5089365d00d9d40635bcfe62e197d548c1b
parent: e3883b050e1784f97bd6474c6af73023fe3bbe44
author: jpathy <[email protected]>
date: Tue May 21 19:15:13 EDT 2013

add 6(a|l) sse support to 8(a|l)

--- a/sys/src/cmd/8a/a.y
+++ b/sys/src/cmd/8a/a.y
@@ -20,9 +20,9 @@
 %left	'+' '-'
 %left	'*' '/' '%'
 %token	<lval>	LTYPE0 LTYPE1 LTYPE2 LTYPE3 LTYPE4
-%token	<lval>	LTYPEC LTYPED LTYPEN LTYPER LTYPET LTYPES LTYPEM LTYPEI LTYPEG
+%token	<lval>	LTYPEC LTYPED LTYPEN LTYPER LTYPET LTYPES LTYPEM LTYPEI LTYPEG LTYPEXC LTYPEX
 %token	<lval>	LCONST LFP LPC LSB
-%token	<lval>	LBREG LLREG LSREG LFREG
+%token	<lval>	LBREG LLREG LSREG LFREG LMREG LXREG
 %token	<dval>	LFCONST
 %token	<sval>	LSCONST LSP
 %token	<sym>	LNAME LLAB LVAR
@@ -30,7 +30,7 @@
 %type	<con2>	con2
 %type	<gen>	mem imm imm2 reg nam rel rem rim rom omem nmem
 %type	<gen2>	nonnon nonrel nonrem rimnon rimrem remrim
-%type	<gen2>	spec1 spec2 spec3 spec4 spec5 spec6 spec7 spec8
+%type	<gen2>	spec1 spec2 spec3 spec4 spec5 spec6 spec7 spec8 spec9 spec10
 %%
 prog:
 |	prog line
@@ -79,6 +79,8 @@
 |	LTYPEM spec6	{ outcode($1, &$2); }
 |	LTYPEI spec7	{ outcode($1, &$2); }
 |	LTYPEG spec8	{ outcode($1, &$2); }
+|	LTYPEXC spec9	{ outcode($1, &$2); }
+|	LTYPEX spec10	{ outcode($1, &$2); }
 
 nonnon:
 	{
@@ -237,7 +239,24 @@
 		$$.from.scale = $3;
 		$$.to = $5;
 	}
+spec9:	/* CMPPS/CMPPD: first operand, second operand, comparison-predicate constant */
+	reg ',' rem ',' con
+	{
+		$$.from = $1;
+		$$.to = $3;
+		$$.to.offset = $5;	/* 8l's Zm_r_i_xm emits to.offset as the trailing immediate byte */
+	}
 
+spec10:	/* shufl: constant, rem, reg; the constant is carried to the linker in to.offset */
+	imm ',' rem ',' reg
+	{
+		$$.from = $3;
+		$$.to = $5;
+		if($1.type != D_CONST)
+			yyerror("illegal constant");
+		$$.to.offset = $1.offset;
+	}
+
 rem:
 	reg
 |	mem
@@ -301,6 +320,11 @@
 		$$ = nullgen;
 		$$.type = $1;
 	}
+|	LMREG
+	{
+		$$ = nullgen;
+		$$.type = $1;
+	}
 |	LSP
 	{
 		$$ = nullgen;
@@ -307,6 +331,11 @@
 		$$.type = D_SP;
 	}
 |	LSREG
+	{
+		$$ = nullgen;
+		$$.type = $1;
+	}
+|	LXREG
 	{
 		$$ = nullgen;
 		$$.type = $1;
--- a/sys/src/cmd/8a/lex.c
+++ b/sys/src/cmd/8a/lex.c
@@ -192,6 +192,24 @@
 	"F6",		LFREG,	D_F0+6,
 	"F7",		LFREG,	D_F0+7,
 
+	"M0",		LMREG,	D_M0+0,
+	"M1",		LMREG,	D_M0+1,
+	"M2",		LMREG,	D_M0+2,
+	"M3",		LMREG,	D_M0+3,
+	"M4",		LMREG,	D_M0+4,
+	"M5",		LMREG,	D_M0+5,
+	"M6",		LMREG,	D_M0+6,
+	"M7",		LMREG,	D_M0+7,
+
+	"X0",		LXREG,	D_X0+0,
+	"X1",		LXREG,	D_X0+1,
+	"X2",		LXREG,	D_X0+2,
+	"X3",		LXREG,	D_X0+3,
+	"X4",		LXREG,	D_X0+4,
+	"X5",		LXREG,	D_X0+5,
+	"X6",		LXREG,	D_X0+6,
+	"X7",		LXREG,	D_X0+7,
+
 	"CS",		LSREG,	D_CS,
 	"SS",		LSREG,	D_SS,
 	"DS",		LSREG,	D_DS,
@@ -277,6 +295,7 @@
 	"CMPXCHGB",	LTYPE3,	ACMPXCHGB,
 	"CMPXCHGL",	LTYPE3,	ACMPXCHGL,
 	"CMPXCHGW",	LTYPE3,	ACMPXCHGW,
+	"CPUID",	LTYPE0,	ACPUID,
 	"DAA",		LTYPE0,	ADAA,
 	"DAS",		LTYPE0,	ADAS,
 	"DATA",		LTYPED,	ADATA,
@@ -638,6 +657,188 @@
 	"FXTRACT",	LTYPE0, AFXTRACT,
 	"FYL2X",	LTYPE0, AFYL2X,
 	"FYL2XP1",	LTYPE0, AFYL2XP1,
+
+	"ADDPD",	LTYPE3,	AADDPD,
+	"ADDPS",	LTYPE3,	AADDPS,
+	"ADDSD",	LTYPE3,	AADDSD,
+	"ADDSS",	LTYPE3,	AADDSS,
+	"ANDNPD",	LTYPE3,	AANDNPD,
+	"ANDNPS",	LTYPE3,	AANDNPS,
+	"ANDPD",	LTYPE3,	AANDPD,
+	"ANDPS",	LTYPE3,	AANDPS,
+	"CMPPD",	LTYPEXC,ACMPPD,
+	"CMPPS",	LTYPEXC,ACMPPS,
+	"CMPSD",	LTYPEXC,ACMPSD,
+	"CMPSS",	LTYPEXC,ACMPSS,
+	"COMISD",	LTYPE3,	ACOMISD,
+	"COMISS",	LTYPE3,	ACOMISS,
+	"CVTPL2PD",	LTYPE3,	ACVTPL2PD,
+	"CVTPL2PS",	LTYPE3,	ACVTPL2PS,
+	"CVTPD2PL",	LTYPE3,	ACVTPD2PL,
+	"CVTPD2PS",	LTYPE3,	ACVTPD2PS,
+	"CVTPS2PL",	LTYPE3,	ACVTPS2PL,
+	"PF2IW",	LTYPE3,	APF2IW,
+	"PF2IL",	LTYPE3,	APF2IL,
+	"PF2ID",	LTYPE3,	APF2IL,	/* syn */
+	"PI2FL",	LTYPE3,	API2FL,
+	"PI2FD",	LTYPE3,	API2FL,	/* syn */
+	"PI2FW",	LTYPE3,	API2FW,
+	"CVTPS2PD",	LTYPE3,	ACVTPS2PD,
+	"CVTSD2SL",	LTYPE3,	ACVTSD2SL,
+	"CVTSD2SS",	LTYPE3,	ACVTSD2SS,
+	"CVTSL2SD",	LTYPE3,	ACVTSL2SD,
+	"CVTSL2SS",	LTYPE3,	ACVTSL2SS,
+	"CVTSS2SD",	LTYPE3,	ACVTSS2SD,
+	"CVTSS2SL",	LTYPE3,	ACVTSS2SL,
+	"CVTTPD2PL",	LTYPE3,	ACVTTPD2PL,
+	"CVTTPS2PL",	LTYPE3,	ACVTTPS2PL,
+	"CVTTSD2SL",	LTYPE3,	ACVTTSD2SL,
+	"CVTTSS2SL",	LTYPE3,	ACVTTSS2SL,
+	"DIVPD",	LTYPE3,	ADIVPD,
+	"DIVPS",	LTYPE3,	ADIVPS,
+	"DIVSD",	LTYPE3,	ADIVSD,
+	"DIVSS",	LTYPE3,	ADIVSS,
+	"FXRSTOR",	LTYPE2,	AFXRSTOR,
+	"FXSAVE",	LTYPE1,	AFXSAVE,
+	"LDMXCSR",	LTYPE2,	ALDMXCSR,
+	"MASKMOVOU",	LTYPE3,	AMASKMOVOU,
+	"MASKMOVDQU",	LTYPE3,	AMASKMOVOU,	/* syn */
+	"MAXPD",	LTYPE3,	AMAXPD,
+	"MAXPS",	LTYPE3,	AMAXPS,
+	"MAXSD",	LTYPE3,	AMAXSD,
+	"MAXSS",	LTYPE3,	AMAXSS,
+	"MINPD",	LTYPE3,	AMINPD,
+	"MINPS",	LTYPE3,	AMINPS,
+	"MINSD",	LTYPE3,	AMINSD,
+	"MINSS",	LTYPE3,	AMINSS,
+	"MOVAPD",	LTYPE3,	AMOVAPD,
+	"MOVAPS",	LTYPE3,	AMOVAPS,
+	"MOVO",		LTYPE3,	AMOVO,
+	"MOVOA",	LTYPE3,	AMOVO,	/* syn */
+	"MOVOU",	LTYPE3,	AMOVOU,
+	"MOVHLPS",	LTYPE3,	AMOVHLPS,
+	"MOVHPD",	LTYPE3,	AMOVHPD,
+	"MOVHPS",	LTYPE3,	AMOVHPS,
+	"MOVLHPS",	LTYPE3,	AMOVLHPS,
+	"MOVLPD",	LTYPE3,	AMOVLPD,
+	"MOVLPS",	LTYPE3,	AMOVLPS,
+	"MOVMSKPD",	LTYPE3,	AMOVMSKPD,
+	"MOVMSKPS",	LTYPE3,	AMOVMSKPS,
+	"MOVNTO",	LTYPE3,	AMOVNTO,
+	"MOVNTDQ",	LTYPE3,	AMOVNTO,	/* syn */
+	"MOVNTPD",	LTYPE3,	AMOVNTPD,
+	"MOVNTPS",	LTYPE3,	AMOVNTPS,
+	"MOVNTQ",	LTYPE3,	AMOVNTQ,
+	"MOVQOZX",	LTYPE3,	AMOVQOZX,
+	"MOVSD",	LTYPE3,	AMOVSD,
+	"MOVSS",	LTYPE3,	AMOVSS,
+	"MOVUPD",	LTYPE3,	AMOVUPD,
+	"MOVUPS",	LTYPE3,	AMOVUPS,
+	"MULPD",	LTYPE3,	AMULPD,
+	"MULPS",	LTYPE3,	AMULPS,
+	"MULSD",	LTYPE3,	AMULSD,
+	"MULSS",	LTYPE3,	AMULSS,
+	"ORPD",		LTYPE3,	AORPD,
+	"ORPS",		LTYPE3,	AORPS,
+	"PACKSSLW",	LTYPE3,	APACKSSLW,
+	"PACKSSWB",	LTYPE3,	APACKSSWB,
+	"PACKUSWB",	LTYPE3,	APACKUSWB,
+	"PADDB",	LTYPE3,	APADDB,
+	"PADDL",	LTYPE3,	APADDL,
+	"PADDQ",	LTYPE3,	APADDQ,
+	"PADDSB",	LTYPE3,	APADDSB,
+	"PADDSW",	LTYPE3,	APADDSW,
+	"PADDUSB",	LTYPE3,	APADDUSB,
+	"PADDUSW",	LTYPE3,	APADDUSW,
+	"PADDW",	LTYPE3,	APADDW,
+	"PAND",		LTYPE3, APAND,
+	"PANDB",	LTYPE3,	APANDB,
+	"PANDL",	LTYPE3,	APANDL,
+	"PANDSB",	LTYPE3,	APANDSB,
+	"PANDSW",	LTYPE3,	APANDSW,
+	"PANDUSB",	LTYPE3,	APANDUSB,
+	"PANDUSW",	LTYPE3,	APANDUSW,
+	"PANDW",	LTYPE3,	APANDW,
+	"PANDN",	LTYPE3, APANDN,
+	"PAVGB",	LTYPE3,	APAVGB,
+	"PAVGW",	LTYPE3,	APAVGW,
+	"PCMPEQB",	LTYPE3,	APCMPEQB,
+	"PCMPEQL",	LTYPE3,	APCMPEQL,
+	"PCMPEQW",	LTYPE3,	APCMPEQW,
+	"PCMPGTB",	LTYPE3,	APCMPGTB,
+	"PCMPGTL",	LTYPE3,	APCMPGTL,	
+	"PCMPGTW",	LTYPE3,	APCMPGTW,
+	"PEXTRW",	LTYPEX,	APEXTRW,
+	"PINSRW",	LTYPEX,	APINSRW,
+	"PMADDWL",	LTYPE3,	APMADDWL,
+	"PMAXSW",	LTYPE3,	APMAXSW,
+	"PMAXUB",	LTYPE3,	APMAXUB,
+	"PMINSW",	LTYPE3,	APMINSW,
+	"PMINUB",	LTYPE3,	APMINUB,
+	"PMOVMSKB",	LTYPE3,	APMOVMSKB,
+	"PMULHRW",	LTYPE3,	APMULHRW,
+	"PMULHUW",	LTYPE3,	APMULHUW,
+	"PMULHW",	LTYPE3,	APMULHW,
+	"PMULLW",	LTYPE3,	APMULLW,
+	"PMULULQ",	LTYPE3,	APMULULQ,
+	"POR",		LTYPE3,	APOR,
+	"PSADBW",	LTYPE3,	APSADBW,
+	"PSHUFHW",	LTYPEX,	APSHUFHW,
+	"PSHUFL",	LTYPEX,	APSHUFL,
+	"PSHUFLW",	LTYPEX,	APSHUFLW,
+	"PSHUFW",	LTYPEX, APSHUFW,
+	"PSLLO",	LTYPE3,	APSLLO,
+	"PSLLDQ",	LTYPE3,	APSLLO,	/* syn */
+	"PSLLL",	LTYPE3,	APSLLL,
+	"PSLLQ",	LTYPE3,	APSLLQ,
+	"PSLLW",	LTYPE3,	APSLLW,
+	"PSRAL",	LTYPE3,	APSRAL,
+	"PSRAW",	LTYPE3,	APSRAW,
+	"PSRLO",	LTYPE3,	APSRLO,
+	"PSRLDQ",	LTYPE3,	APSRLO,	/* syn */
+	"PSRLL",	LTYPE3,	APSRLL,
+	"PSRLQ",	LTYPE3,	APSRLQ,
+	"PSRLW",	LTYPE3,	APSRLW,
+	"PSUBB",	LTYPE3,	APSUBB,
+	"PSUBL",	LTYPE3,	APSUBL,
+	"PSUBQ",	LTYPE3,	APSUBQ,
+	"PSUBSB",	LTYPE3,	APSUBSB,
+	"PSUBSW",	LTYPE3,	APSUBSW,
+	"PSUBUSB",	LTYPE3,	APSUBUSB,
+	"PSUBUSW",	LTYPE3,	APSUBUSW,
+	"PSUBW",	LTYPE3,	APSUBW,
+	"PUNPCKHBW",	LTYPE3,	APUNPCKHBW,
+	"PUNPCKHLQ",	LTYPE3,	APUNPCKHLQ,
+	"PUNPCKHQDQ",	LTYPE3,	APUNPCKHQDQ,
+	"PUNPCKHWL",	LTYPE3,	APUNPCKHWL,
+	"PUNPCKLBW",	LTYPE3,	APUNPCKLBW,
+	"PUNPCKLLQ",	LTYPE3,	APUNPCKLLQ,
+	"PUNPCKLQDQ",	LTYPE3,	APUNPCKLQDQ,
+	"PUNPCKLWL",	LTYPE3,	APUNPCKLWL,
+	"PXOR",		LTYPE3,	APXOR,
+	"RCPPS",	LTYPE3,	ARCPPS,
+	"RCPSS",	LTYPE3,	ARCPSS,
+	"RSQRTPS",	LTYPE3,	ARSQRTPS,
+	"RSQRTSS",	LTYPE3,	ARSQRTSS,
+	"SHUFPD",	LTYPEX,	ASHUFPD,
+	"SHUFPS",	LTYPEX,	ASHUFPS,
+	"SQRTPD",	LTYPE3,	ASQRTPD,
+	"SQRTPS",	LTYPE3,	ASQRTPS,
+	"SQRTSD",	LTYPE3,	ASQRTSD,
+	"SQRTSS",	LTYPE3,	ASQRTSS,
+	"STMXCSR",	LTYPE1,	ASTMXCSR,
+	"SUBPD",	LTYPE3,	ASUBPD,
+	"SUBPS",	LTYPE3,	ASUBPS,
+	"SUBSD",	LTYPE3,	ASUBSD,
+	"SUBSS",	LTYPE3,	ASUBSS,
+	"UCOMISD",	LTYPE3,	AUCOMISD,
+	"UCOMISS",	LTYPE3,	AUCOMISS,
+	"UNPCKHPD",	LTYPE3,	AUNPCKHPD,
+	"UNPCKHPS",	LTYPE3,	AUNPCKHPS,
+	"UNPCKLPD",	LTYPE3,	AUNPCKLPD,
+	"UNPCKLPS",	LTYPE3,	AUNPCKLPS,
+	"XORPD",	LTYPE3,	AXORPD,
+	"XORPS",	LTYPE3,	AXORPS,
 
 	0
 };
--- a/sys/src/cmd/8c/8.out.h
+++ b/sys/src/cmd/8c/8.out.h
@@ -361,6 +361,7 @@
 	ACMPXCHGB,
 	ACMPXCHGL,
 	ACMPXCHGW,
+	ACPUID,
 
 	/* conditional move */
 	ACMOVLCC,
@@ -405,6 +406,185 @@
 	AFCMOVNU,
 	AFCMOVUN,
 
+	/* media */
+	AADDPD,
+	AADDPS,
+	AADDSD,
+	AADDSS,
+	AANDNPD,
+	AANDNPS,
+	AANDPD,
+	AANDPS,
+	ACMPPD,
+	ACMPPS,
+	ACMPSD,
+	ACMPSS,
+	ACOMISD,
+	ACOMISS,
+	ACVTPD2PL,
+	ACVTPD2PS,
+	ACVTPL2PD,
+	ACVTPL2PS,
+	ACVTPS2PD,
+	ACVTPS2PL,
+	ACVTSD2SL,
+	ACVTSD2SS,
+	ACVTSL2SD,
+	ACVTSL2SS,
+	ACVTSS2SD,
+	ACVTSS2SL,
+	ACVTTPD2PL,
+	ACVTTPS2PL,
+	ACVTTSD2SL,
+	ACVTTSS2SL,
+	ADIVPD,
+	ADIVPS,
+	ADIVSD,
+	ADIVSS,
+	AFXRSTOR,
+	AFXSAVE,
+	ALDMXCSR,
+	AMASKMOVOU,
+	AMASKMOVQ,
+	AMAXPD,
+	AMAXPS,
+	AMAXSD,
+	AMAXSS,
+	AMINPD,
+	AMINPS,
+	AMINSD,
+	AMINSS,
+	AMOVAPD,
+	AMOVAPS,
+	AMOVOU,
+	AMOVHLPS,
+	AMOVHPD,
+	AMOVHPS,
+	AMOVLHPS,
+	AMOVLPD,
+	AMOVLPS,
+	AMOVMSKPD,
+	AMOVMSKPS,
+	AMOVNTO,
+	AMOVNTPD,
+	AMOVNTPS,
+	AMOVNTQ,
+	AMOVO,
+	AMOVQOZX,
+	AMOVSD,
+	AMOVSS,
+	AMOVUPD,
+	AMOVUPS,
+	AMULPD,
+	AMULPS,
+	AMULSD,
+	AMULSS,
+	AORPD,
+	AORPS,
+	APACKSSLW,
+	APACKSSWB,
+	APACKUSWB,
+	APADDB,
+	APADDL,
+	APADDQ,
+	APADDSB,
+	APADDSW,
+	APADDUSB,
+	APADDUSW,
+	APADDW,
+	APANDB,
+	APANDL,
+	APANDSB,
+	APANDSW,
+	APANDUSB,
+	APANDUSW,
+	APANDW,
+	APAND,
+	APANDN,
+	APAVGB,
+	APAVGW,
+	APCMPEQB,
+	APCMPEQL,
+	APCMPEQW,
+	APCMPGTB,
+	APCMPGTL,
+	APCMPGTW,
+	APEXTRW,
+	APINSRW,
+	APMADDWL,
+	APMAXSW,
+	APMAXUB,
+	APMINSW,
+	APMINUB,
+	APMOVMSKB,
+	APMULHRW,
+	APMULHUW,
+	APMULHW,
+	APMULLW,
+	APMULULQ,
+	APOR,
+	APSADBW,
+	APSHUFHW,
+	APSHUFL,
+	APSHUFLW,
+	APSHUFW,
+	APSLLO,
+	APSLLL,
+	APSLLQ,
+	APSLLW,
+	APSRAL,
+	APSRAW,
+	APSRLO,
+	APSRLL,
+	APSRLQ,
+	APSRLW,
+	APSUBB,
+	APSUBL,
+	APSUBQ,
+	APSUBSB,
+	APSUBSW,
+	APSUBUSB,
+	APSUBUSW,
+	APSUBW,
+	APSWAPL,
+	APUNPCKHBW,
+	APUNPCKHLQ,
+	APUNPCKHQDQ,
+	APUNPCKHWL,
+	APUNPCKLBW,
+	APUNPCKLLQ,
+	APUNPCKLQDQ,
+	APUNPCKLWL,
+	APXOR,
+	ARCPPS,
+	ARCPSS,
+	ARSQRTPS,
+	ARSQRTSS,
+	ASHUFPD,
+	ASHUFPS,
+	ASQRTPD,
+	ASQRTPS,
+	ASQRTSD,
+	ASQRTSS,
+	ASTMXCSR,
+	ASUBPD,
+	ASUBPS,
+	ASUBSD,
+	ASUBSS,
+	AUCOMISD,
+	AUCOMISS,
+	AUNPCKHPD,
+	AUNPCKHPS,
+	AUNPCKLPD,
+	AUNPCKLPS,
+	AXORPD,
+	AXORPS,
+
+	APF2IW,
+	APF2IL,
+	API2FW,
+	API2FL,
+
 	/* add new operations here. nowhere else. here. */
 	ALAST
 };
@@ -470,6 +650,10 @@
 	D_CONST2 = D_INDIR+D_INDIR,
 
 	D_SIZE,	/* 8l internal */
+
+	D_M0,
+	D_X0		= D_M0 + 8,
+	D_XNONE		= D_X0 + 8,
 
 	T_TYPE		= 1<<0,
 	T_INDEX		= 1<<1,
--- a/sys/src/cmd/8c/enam.c
+++ b/sys/src/cmd/8c/enam.c
@@ -340,6 +340,7 @@
 	"CMPXCHGB",
 	"CMPXCHGL",
 	"CMPXCHGW",
+	"CPUID",
 	"CMOVLCC",
 	"CMOVLCS",
 	"CMOVLEQ",
@@ -380,5 +381,181 @@
 	"FCMOVNE",
 	"FCMOVNU",
 	"FCMOVUN",
+	"ADDPD",
+	"ADDPS",
+	"ADDSD",
+	"ADDSS",
+	"ANDNPD",
+	"ANDNPS",
+	"ANDPD",
+	"ANDPS",
+	"CMPPD",
+	"CMPPS",
+	"CMPSD",
+	"CMPSS",
+	"COMISD",
+	"COMISS",
+	"CVTPD2PL",
+	"CVTPD2PS",
+	"CVTPL2PD",
+	"CVTPL2PS",
+	"CVTPS2PD",
+	"CVTPS2PL",
+	"CVTSD2SL",
+	"CVTSD2SS",
+	"CVTSL2SD",
+	"CVTSL2SS",
+	"CVTSS2SD",
+	"CVTSS2SL",
+	"CVTTPD2PL",
+	"CVTTPS2PL",
+	"CVTTSD2SL",
+	"CVTTSS2SL",
+	"DIVPD",
+	"DIVPS",
+	"DIVSD",
+	"DIVSS",
+	"FXRSTOR",
+	"FXSAVE",
+	"LDMXCSR",
+	"MASKMOVOU",
+	"MASKMOVQ",
+	"MAXPD",
+	"MAXPS",
+	"MAXSD",
+	"MAXSS",
+	"MINPD",
+	"MINPS",
+	"MINSD",
+	"MINSS",
+	"MOVAPD",
+	"MOVAPS",
+	"MOVOU",
+	"MOVHLPS",
+	"MOVHPD",
+	"MOVHPS",
+	"MOVLHPS",
+	"MOVLPD",
+	"MOVLPS",
+	"MOVMSKPD",
+	"MOVMSKPS",
+	"MOVNTO",
+	"MOVNTPD",
+	"MOVNTPS",
+	"MOVNTQ",
+	"MOVO",
+	"MOVQOZX",
+	"MOVSD",
+	"MOVSS",
+	"MOVUPD",
+	"MOVUPS",
+	"MULPD",
+	"MULPS",
+	"MULSD",
+	"MULSS",
+	"ORPD",
+	"ORPS",
+	"PACKSSLW",
+	"PACKSSWB",
+	"PACKUSWB",
+	"PADDB",
+	"PADDL",
+	"PADDQ",
+	"PADDSB",
+	"PADDSW",
+	"PADDUSB",
+	"PADDUSW",
+	"PADDW",
+	"PANDB",
+	"PANDL",
+	"PANDSB",
+	"PANDSW",
+	"PANDUSB",
+	"PANDUSW",
+	"PANDW",
+	"PAND",
+	"PANDN",
+	"PAVGB",
+	"PAVGW",
+	"PCMPEQB",
+	"PCMPEQL",
+	"PCMPEQW",
+	"PCMPGTB",
+	"PCMPGTL",
+	"PCMPGTW",
+	"PEXTRW",
+	"PINSRW",
+	"PMADDWL",
+	"PMAXSW",
+	"PMAXUB",
+	"PMINSW",
+	"PMINUB",
+	"PMOVMSKB",
+	"PMULHRW",
+	"PMULHUW",
+	"PMULHW",
+	"PMULLW",
+	"PMULULQ",
+	"POR",
+	"PSADBW",
+	"PSHUFHW",
+	"PSHUFL",
+	"PSHUFLW",
+	"PSHUFW",
+	"PSLLO",
+	"PSLLL",
+	"PSLLQ",
+	"PSLLW",
+	"PSRAL",
+	"PSRAW",
+	"PSRLO",
+	"PSRLL",
+	"PSRLQ",
+	"PSRLW",
+	"PSUBB",
+	"PSUBL",
+	"PSUBQ",
+	"PSUBSB",
+	"PSUBSW",
+	"PSUBUSB",
+	"PSUBUSW",
+	"PSUBW",
+	"PSWAPL",
+	"PUNPCKHBW",
+	"PUNPCKHLQ",
+	"PUNPCKHQDQ",
+	"PUNPCKHWL",
+	"PUNPCKLBW",
+	"PUNPCKLLQ",
+	"PUNPCKLQDQ",
+	"PUNPCKLWL",
+	"PXOR",
+	"RCPPS",
+	"RCPSS",
+	"RSQRTPS",
+	"RSQRTSS",
+	"SHUFPD",
+	"SHUFPS",
+	"SQRTPD",
+	"SQRTPS",
+	"SQRTSD",
+	"SQRTSS",
+	"STMXCSR",
+	"SUBPD",
+	"SUBPS",
+	"SUBSD",
+	"SUBSS",
+	"UCOMISD",
+	"UCOMISS",
+	"UNPCKHPD",
+	"UNPCKHPS",
+	"UNPCKLPD",
+	"UNPCKLPS",
+	"XORPD",
+	"XORPS",
+	"PF2IW",
+	"PF2IL",
+	"PI2FW",
+	"PI2FL",
 	"LAST",
 };
--- a/sys/src/cmd/8l/l.h
+++ b/sys/src/cmd/8l/l.h
@@ -90,7 +90,7 @@
 	short	as;
 	uchar*	ytab;
 	uchar	prefix;
-	uchar	op[10];
+	uchar	op[20];
 };
 
 enum
@@ -142,6 +142,8 @@
 	Ycr0,	Ycr1,	Ycr2,	Ycr3,	Ycr4,	Ycr5,	Ycr6,	Ycr7,
 	Ydr0,	Ydr1,	Ydr2,	Ydr3,	Ydr4,	Ydr5,	Ydr6,	Ydr7,
 	Ytr0,	Ytr1,	Ytr2,	Ytr3,	Ytr4,	Ytr5,	Ytr6,	Ytr7,
+	Ymr, Ymm,
+	Yxr, Yxm,
 	Ymax,
 
 	Zxxx		= 0,
@@ -153,6 +155,7 @@
 	Zib_,
 	Zib_rp,
 	Zibo_m,
+	Zibo_m_xm,
 	Zil_,
 	Zil_rp,
 	Zilo_m,
@@ -160,10 +163,16 @@
 	Zloop,
 	Zm_o,
 	Zm_r,
+	Zm_r_xm,
+	Zm_r_i_xm,
+	Zm_r_3d,
+	Zibm_r, /* mmx1,mmx2/mem64,imm8 */
 	Zaut_r,
 	Zo_m,
 	Zpseudo,
 	Zr_m,
+	Zr_m_xm,
+	Zr_m_i_xm,
 	Zrp_,
 	Z_ib,
 	Z_il,
@@ -181,6 +190,8 @@
 	Pm		= 0x0f,	/* 2byte opcode escape */
 	Pq		= 0xff,	/* both escape */
 	Pb		= 0xfe,	/* byte operands */
+	Pf2		= 0xf2,	/* xmm escape 1 */
+	Pf3		= 0xf3,	/* xmm escape 2 */
 
 	Roffset	= 22,		/* no. bits for offset in relocation address */
 	Rindex	= 10,		/* no. bits for index in relocation address */
@@ -250,7 +261,7 @@
 EXTERN	char	ycover[Ymax*Ymax];
 EXTERN	uchar*	andptr;
 EXTERN	uchar	and[30];
-EXTERN	char	reg[D_NONE];
+EXTERN	char	reg[D_XNONE];
 EXTERN	Prog*	lastp;
 EXTERN	long	lcsize;
 EXTERN	int	nerrors;
@@ -279,6 +290,7 @@
 #define	UP	(&undefp)
 
 extern	Optab	optab[];
+extern	Optab*	opindex[];
 extern	char*	anames[];
 
 int	Aconv(Fmt*);
--- a/sys/src/cmd/8l/list.c
+++ b/sys/src/cmd/8l/list.c
@@ -61,7 +61,7 @@
 
 	a = va_arg(fp->args, Adr*);
 	i = a->type;
-	if(i >= D_INDIR) {
+	if(i >= D_INDIR && i < D_M0) {
 		if(a->offset)
 			snprint(str, sizeof(str), "%ld(%R)", a->offset, i-D_INDIR);
 		else
@@ -208,6 +208,24 @@
 	"TR7",
 
 	"NONE",		/* [D_NONE] */
+
+[D_M0]	"M0",		/* [D_M0] */
+		"M1",
+		"M2",
+		"M3",
+		"M4",
+		"M5",
+		"M6",
+		"M7",
+
+[D_X0]	"X0",		/* [D_X0] */
+		"X1",
+		"X2",
+		"X3",
+		"X4",
+		"X5",
+		"X6",
+		"X7",
 };
 
 int
@@ -217,7 +235,7 @@
 	int r;
 
 	r = va_arg(fp->args, int);
-	if(r >= D_AL && r <= D_NONE)
+	if((r >= D_AL && r <= D_NONE) || (r >= D_M0 && r <= D_X0+7))
 		snprint(str, sizeof(str), "%s", regstr[r-D_AL]);
 	else
 		snprint(str, sizeof(str), "gok(%d)", r);
--- a/sys/src/cmd/8l/obj.c
+++ b/sys/src/cmd/8l/obj.c
@@ -195,11 +195,14 @@
 		Bprint(&bso, "HEADER = -H0x%ld -T0x%lux -D0x%lux -R0x%lux\n",
 			HEADTYPE, INITTEXT, INITDAT, INITRND);
 	Bflush(&bso);
-	for(i=1; optab[i].as; i++)
-		if(i != optab[i].as) {
-			diag("phase error in optab: %d", i);
+	for(i=1; optab[i].as; i++) {
+		c = optab[i].as;
+		if(opindex[c] != nil) {
+			diag("phase error in optab: %d (%A)", i, c);
 			errorexit();
 		}
+		opindex[c] = &optab[i];
+	}
 
 	for(i=0; i<Ymax; i++)
 		ycover[i*Ymax + i] = 1;
@@ -240,7 +243,13 @@
 	ycover[Yrl*Ymax + Yml] = 1;
 	ycover[Ym*Ymax + Yml] = 1;
 
-	for(i=0; i<D_NONE; i++) {
+	ycover[Ym*Ymax + Ymm] = 1;
+	ycover[Ymr*Ymax + Ymm] = 1;
+
+	ycover[Ym*Ymax + Yxm] = 1;
+	ycover[Yxr*Ymax + Yxm] = 1;
+
+	for(i=0; i<D_XNONE; i++) {
 		reg[i] = -1;
 		if(i >= D_AL && i <= D_BH)
 			reg[i] = (i-D_AL) & 7;
@@ -248,6 +257,10 @@
 			reg[i] = (i-D_AX) & 7;
 		if(i >= D_F0 && i <= D_F0+7)
 			reg[i] = (i-D_F0) & 7;
+		if(i >= D_M0 && i <= D_M0+7)
+			reg[i] = (i-D_M0) & 7;
+		if(i >= D_X0 && i <= D_X0+7)
+			reg[i] = (i-D_X0) & 7;
 	}
 
 	zprg.link = P;
@@ -988,6 +1001,13 @@
 	case AFDIVRF:
 	case AFCOMF:
 	case AFCOMFP:
+	case AMOVSS:
+	case AADDSS:
+	case ASUBSS:
+	case AMULSS:
+	case ADIVSS:
+	case ACOMISS:
+	case AUCOMISS:
 		if(skip)
 			goto casdef;
 		if(p->from.type == D_FCONST) {
@@ -1026,6 +1046,13 @@
 	case AFDIVRD:
 	case AFCOMD:
 	case AFCOMDP:
+	case AMOVSD:
+	case AADDSD:
+	case ASUBSD:
+	case AMULSD:
+	case ADIVSD:
+	case ACOMISD:
+	case AUCOMISD:
 		if(skip)
 			goto casdef;
 		if(p->from.type == D_FCONST) {
--- a/sys/src/cmd/8l/optab.c
+++ b/sys/src/cmd/8l/optab.c
@@ -15,8 +15,10 @@
 	Ynone,	Ynone,	Zpseudo,1,
 	Ynone,	Yml,	Zpseudo,1,
 	Ynone,	Yrf,	Zpseudo,1,
+	Ynone,	Yxr,	Zpseudo,1,
 	Yml,	Ynone,	Zpseudo,1,
 	Yrf,	Ynone,	Zpseudo,1,
+	Yxr,	Ynone,	Zpseudo,1,
 	0
 };
 uchar	yxorb[] =
@@ -120,6 +122,10 @@
 //	Yi0,	Yml,	Zibo_m,	2,	// shorter but slower AND $0,dst
 	Yi32,	Yrl,	Zil_rp,	1,
 	Yi32,	Yml,	Zilo_m,	2,
+	Yml,	Ymr,	Zm_r_xm,	1,	// MMX MOVD
+	Ymr,	Yml,	Zr_m_xm,	1,	// MMX MOVD
+	Yml,	Yxr,	Zm_r_xm,	2,	// XMM MOVD (32 bit)
+	Yxr,	Yml,	Zr_m_xm,	2,	// XMM MOVD (32 bit)
 	Yiauto,	Yrl,	Zaut_r,	2,
 	0
 };
@@ -306,6 +312,134 @@
 	Ym,	Ynone,	Zm_o,	2,
 	0
 };
+uchar	ymm[] = 
+{
+	Ymm,	Ymr,	Zm_r_xm,	1,	/* M register or memory source, M register destination */
+	Yxm,	Yxr,	Zm_r_xm,	2,	/* X register or memory source, X register destination */
+	0
+};
+uchar	yxm[] = 
+{
+	Yxm,	Yxr,	Zm_r_xm,	1,
+	0
+};
+uchar	yxcvm1[] = 
+{
+	Yxm,	Yxr,	Zm_r_xm,	2,
+	0
+};
+uchar	yxcvm2[] =
+{
+	Yxm,	Yxr,	Zm_r_xm,	2,
+	0
+};
+uchar	yxmq[] = 
+{
+	Yxm,	Yxr,	Zm_r_xm,	2,
+	0
+};
+uchar	yxr[] = 
+{
+	Yxr,	Yxr,	Zm_r_xm,	1,
+	0
+};
+uchar	yxr_ml[] =
+{
+	Yxr,	Yml,	Zr_m_xm,	1,
+	0
+};
+uchar	ymr[] =
+{
+	Ymr,	Ymr,	Zm_r,	1,
+	0
+};
+uchar	ymr_ml[] =
+{
+	Ymr,	Yml,	Zr_m_xm,	1,
+	0
+};
+uchar	yxcmp[] =
+{
+	Yxm,	Yxr, Zm_r_xm,	1,
+	0
+};
+uchar	yxcmpi[] =
+{
+	Yxm,	Yxr, Zm_r_i_xm,	2,	/* Zm_r_i_xm appends p->to.offset as a trailing byte */
+	0
+};
+uchar	yxmov[] =
+{
+	Yxm,	Yxr,	Zm_r_xm,	1,
+	Yxr,	Yxm,	Zr_m_xm,	1,
+	0
+};
+uchar	yxcvfl[] = 
+{
+	Yxm,	Yrl,	Zm_r_xm,	1,
+	0
+};
+uchar	yxcvlf[] =
+{
+	Yml,	Yxr,	Zm_r_xm,	1,
+	0
+};
+uchar	yps[] = 
+{
+	Ymm,	Ymr,	Zm_r_xm,	1,	/* M register or memory source, M register destination */
+	Yi8,	Ymr,	Zibo_m_xm,	2,	/* Yi8 source; op[] byte after the opcode fills the modrm reg field */
+	Yxm,	Yxr,	Zm_r_xm,	2,	/* X register or memory source, X register destination */
+	Yi8,	Yxr,	Zibo_m_xm,	3,	/* Yi8 source, X register destination; same modrm scheme */
+	0
+};
+uchar	yxrrl[] =
+{
+	Yxr,	Yrl,	Zm_r,	1,
+	0
+};
+uchar	ymfp[] =
+{
+	Ymm,	Ymr,	Zm_r_3d,	1,	/* Zm_r_3d emits 0x0f 0x0f, the operand bytes, then the opcode last */
+	0,
+};
+uchar	ymrxr[] =
+{
+	Ymr,	Yxr,	Zm_r,	1,
+	Yxm,	Yxr,	Zm_r_xm,	1,
+	0
+};
+uchar	ymshuf[] =
+{
+	Ymm,	Ymr,	Zibm_r,	1,	/* Zibm_r: opcode, operand bytes, then p->to.offset as a trailing byte */
+	0
+};
+uchar	yxshuf[] =
+{
+	Yxm,	Yxr,	Zibm_r,	1,
+	0
+};
+uchar	yextrw[] =
+{
+	Yxr,	Yrl,	Zibm_r,	1,
+	0
+};
+uchar	ypsdq[] =
+{
+	Yi8,	Yxr,	Zibo_m,	2,
+	0
+};
+uchar	ymskb[] =
+{
+	Yxr,	Yrl,	Zm_r_xm,	2,
+	Ymr,	Yrl,	Zm_r_xm,	1,
+	0
+};
+uchar	yxaes[] =
+{
+	Yxm,	Yxr,	Zm_r_xm,	2,
+	Yxm,	Yxr,	Zm_r_i_xm,	2,
+	0
+};
 
 Optab optab[] =
 /*	as, ytab, andproto, opcode */
@@ -320,10 +454,18 @@
 	{ AADCW,	yxorl,	Pe, 0x83,(02),0x15,0x81,(02),0x11,0x13 },
 	{ AADDB,	yxorb,	Px, 0x04,0x80,(00),0x00,0x02 },
 	{ AADDL,	yaddl,	Px, 0x83,(00),0x05,0x81,(00),0x01,0x03 },
+	{ AADDPD,	yxm,	Pq, 0x58 },
+	{ AADDPS,	yxm,	Pm, 0x58 },
+	{ AADDSD,	yxm,	Pf2, 0x58 },
+	{ AADDSS,	yxm,	Pf3, 0x58 },
 	{ AADDW,	yaddl,	Pe, 0x83,(00),0x05,0x81,(00),0x01,0x03 },
 	{ AADJSP },
 	{ AANDB,	yxorb,	Pb, 0x24,0x80,(04),0x20,0x22 },
 	{ AANDL,	yxorl,	Px, 0x83,(04),0x25,0x81,(04),0x21,0x23 },
+	{ AANDNPD,	yxm,	Pq, 0x55 },
+	{ AANDNPS,	yxm,	Pm, 0x55 },
+	{ AANDPD,	yxm,	Pq, 0x54 },
+	{ AANDPS,	yxm,	Pm, 0x54 },	/* 0F 54; Pq would add the 0x66 prefix and encode ANDPD */
 	{ AANDW,	yxorl,	Pe, 0x83,(04),0x25,0x81,(04),0x21,0x23 },
 	{ AARPL,	yrl_ml,	Px, 0x63 },
 	{ ABOUNDL,	yrl_m,	Px, 0x62 },
@@ -349,9 +491,32 @@
 	{ ACMC,		ynone,	Px, 0xf5 },
 	{ ACMPB,	ycmpb,	Pb, 0x3c,0x80,(07),0x38,0x3a },
 	{ ACMPL,	ycmpl,	Px, 0x83,(07),0x3d,0x81,(07),0x39,0x3b },
+	{ ACMPPD,	yxcmpi,	Px, Pe,0xc2 },
+	{ ACMPPS,	yxcmpi,	Pm, 0xc2,0 },
 	{ ACMPW,	ycmpl,	Pe, 0x83,(07),0x3d,0x81,(07),0x39,0x3b },
+	{ ACOMISD,	yxcmp,	Pe, 0x2f },
+	{ ACOMISS,	yxcmp,	Pm, 0x2f },
+	{ ACPUID,	ynone,	Pm, 0xa2 },
+	{ ACVTPL2PD,	yxcvm2,	Px, Pf3,0xe6 },
+	{ ACVTPL2PS,	yxcvm2,	Pm, 0x5b },
+	{ ACVTPD2PL,	yxcvm1,	Px, Pf2,0xe6 },
+	{ ACVTPD2PS,	yxm,	Pe, 0x5a },
+	{ ACVTPS2PL,	yxcvm1, Px, Pe,0x5b },
+	{ ACVTPS2PD,	yxm,	Pm, 0x5a },
+	{ ACVTSD2SL,	yxcvfl, Pf2, 0x2d },
+	{ ACVTSD2SS,	yxm,	Pf2, 0x5a },
+	{ ACVTSL2SD,	yxcvlf, Pf2, 0x2a },
+	{ ACVTSL2SS,	yxcvlf, Pf3, 0x2a },
+	{ ACVTSS2SD,	yxm,	Pf3, 0x5a },
+	{ ACVTSS2SL,	yxcvfl, Pf3, 0x2d },
+	{ ACVTTPD2PL,	yxcvm1,	Px, Pe,0xe6 },
+	{ ACVTTPS2PL,	yxcvm1,	Px, Pf3,0x5b },
+	{ ACVTTSD2SL,	yxcvfl, Pf2, 0x2c },
+	{ ACVTTSS2SL,	yxcvfl,	Pf3, 0x2c },
 	{ ACMPSB,	ynone,	Pb, 0xa6 },
+	{ ACMPSD,	yxcmpi,	Px, Pf2,0xc2 },
 	{ ACMPSL,	ynone,	Px, 0xa7 },
+	{ ACMPSS,	yxcmpi,	Px, Pf3,0xc2 },
 	{ ACMPSW,	ynone,	Pe, 0xa7 },
 	{ ADAA,		ynone,	Px, 0x27 },
 	{ ADAS,		ynone,	Px, 0x2f },
@@ -361,8 +526,14 @@
 	{ ADECW,	yincl,	Pe, 0x48,0xff,(01) },
 	{ ADIVB,	ydivb,	Pb, 0xf6,(06) },
 	{ ADIVL,	ydivl,	Px, 0xf7,(06) },
+	{ ADIVPD,	yxm,	Pe, 0x5e },
+	{ ADIVPS,	yxm,	Pm, 0x5e },
+	{ ADIVSD,	yxm,	Pf2, 0x5e },
+	{ ADIVSS,	yxm,	Pf3, 0x5e },
 	{ ADIVW,	ydivl,	Pe, 0xf7,(06) },
 	{ AENTER },				/* botch */
+	{ AFXRSTOR,	ysvrs,	Pm, 0xae,(01),0xae,(01) },
+	{ AFXSAVE,	ysvrs,	Pm, 0xae,(00),0xae,(00) },
 	{ AGLOBL },
 	{ AGOK },
 	{ AHISTORY },
@@ -407,6 +578,7 @@
 	{ ALAHF,	ynone,	Px, 0x9f },
 	{ ALARL,	yml_rl,	Pm, 0x02 },
 	{ ALARW,	yml_rl,	Pq, 0x02 },
+	{ ALDMXCSR,	ysvrs,	Pm, 0xae,(02),0xae,(02) },
 	{ ALEAL,	ym_rl,	Px, 0x8d },
 	{ ALEAW,	ym_rl,	Pe, 0x8d },
 	{ ALEAVEL,	ynone,	Px, 0xc9 },
@@ -421,8 +593,20 @@
 	{ ALOOPNE,	yloop,	Px, 0xe0 },
 	{ ALSLL,	yml_rl,	Pm, 0x03  },
 	{ ALSLW,	yml_rl,	Pq, 0x03  },
+	{ AMASKMOVOU,	yxr,	Pe, 0xf7 },
+	{ AMASKMOVQ,	ymr,	Pm, 0xf7 },
+	{ AMAXPD,	yxm,	Pe, 0x5f },
+	{ AMAXPS,	yxm,	Pm, 0x5f },
+	{ AMAXSD,	yxm,	Pf2, 0x5f },
+	{ AMAXSS,	yxm,	Pf3, 0x5f },
+	{ AMINPD,	yxm,	Pe, 0x5d },
+	{ AMINPS,	yxm,	Pm, 0x5d },
+	{ AMINSD,	yxm,	Pf2, 0x5d },
+	{ AMINSS,	yxm,	Pf3, 0x5d },
+	{ AMOVAPD,	yxmov,	Pe, 0x28,0x29 },
+	{ AMOVAPS,	yxmov,	Pm, 0x28,0x29 },
 	{ AMOVB,	ymovb,	Pb, 0x88,0x8a,0xb0,0xc6,(00) },
-	{ AMOVL,	ymovl,	Px, 0x89,0x8b,0x31,0x83,(04),0xb8,0xc7,(00) },
+	{ AMOVL,	ymovl,	Px, 0x89,0x8b,0x31,0x83,(04),0xb8,0xc7,(00),0x6e,0x7e,Pe,0x6e,Pe,0x7e },
 	{ AMOVW,	ymovl,	Pe, 0x89,0x8b,0x31,0x83,(04),0xb8,0xc7,(00) },
 	{ AMOVBLSX,	ymb_rl,	Pm, 0xbe },
 	{ AMOVBLZX,	ymb_rl,	Pm, 0xb6 },
@@ -430,11 +614,34 @@
 	{ AMOVBWZX,	ymb_rl,	Pq, 0xb6 },
 	{ AMOVWLSX,	yml_rl,	Pm, 0xbf },
 	{ AMOVWLZX,	yml_rl,	Pm, 0xb7 },
+	{ AMOVO,	yxmov,	Pe, 0x6f,0x7f },
+	{ AMOVOU,	yxmov,	Pf3, 0x6f,0x7f },
+	{ AMOVHLPS,	yxr,	Pm, 0x12 },
+	{ AMOVHPD,	yxmov,	Pe, 0x16,0x17 },
+	{ AMOVHPS,	yxmov,	Pm, 0x16,0x17 },
+	{ AMOVLHPS,	yxr,	Pm, 0x16 },
+	{ AMOVLPD,	yxmov,	Pe, 0x12,0x13 },
+	{ AMOVLPS,	yxmov,	Pm, 0x12,0x13 },
+	{ AMOVMSKPD,	yxrrl,	Pq, 0x50 },
+	{ AMOVMSKPS,	yxrrl,	Pm, 0x50 },
+	{ AMOVNTO,	yxr_ml,	Pe, 0xe7 },
+	{ AMOVNTPD,	yxr_ml,	Pe, 0x2b },
+	{ AMOVNTPS,	yxr_ml,	Pm, 0x2b },
+	{ AMOVNTQ,	ymr_ml,	Pm, 0xe7 },
+	{ AMOVQOZX,	ymrxr,	Pf3, 0xd6,0x7e },
 	{ AMOVSB,	ynone,	Pb, 0xa4 },
+	{ AMOVSD,	yxmov,	Pf2, 0x10,0x11 },
 	{ AMOVSL,	ynone,	Px, 0xa5 },
+	{ AMOVSS,	yxmov,	Pf3, 0x10,0x11 },
 	{ AMOVSW,	ynone,	Pe, 0xa5 },
+	{ AMOVUPD,	yxmov,	Pe, 0x10,0x11 },
+	{ AMOVUPS,	yxmov,	Pm, 0x10,0x11 },
 	{ AMULB,	ydivb,	Pb, 0xf6,(04) },
 	{ AMULL,	ydivl,	Px, 0xf7,(04) },
+	{ AMULPD,	yxm,	Pe, 0x59 },
+	{ AMULPS,	yxm,	Pm, 0x59 },	/* prefix slot takes a P* escape, not the Ym operand class */
+	{ AMULSD,	yxm,	Pf2, 0x59 },
+	{ AMULSS,	yxm,	Pf3, 0x59 },
 	{ AMULW,	ydivl,	Pe, 0xf7,(04) },
 	{ ANAME },
 	{ ANEGB,	yscond,	Px, 0xf6,(03) },
@@ -446,6 +653,8 @@
 	{ ANOTW,	yscond,	Pe, 0xf7,(02) },
 	{ AORB,		yxorb,	Pb, 0x0c,0x80,(01),0x08,0x0a },
 	{ AORL,		yxorl,	Px, 0x83,(01),0x0d,0x81,(01),0x09,0x0b },
+	{ AORPD,	yxm,	Pq, 0x56 },
+	{ AORPS,	yxm,	Pm, 0x56 },
 	{ AORW,		yxorl,	Pe, 0x83,(01),0x0d,0x81,(01),0x09,0x0b },
 	{ AOUTB,	yin,	Pb, 0xe6,0xee },
 	{ AOUTL,	yin,	Px, 0xe7,0xef },
@@ -453,6 +662,44 @@
 	{ AOUTSB,	ynone,	Pb, 0x6e },
 	{ AOUTSL,	ynone,	Px, 0x6f },
 	{ AOUTSW,	ynone,	Pe, 0x6f },
+	{ APACKSSLW,	ymm,	Px, 0x6b,Pe,0x6b },
+	{ APACKSSWB,	ymm,	Px, 0x63,Pe,0x63 },
+	{ APACKUSWB,	ymm,	Px, 0x67,Pe,0x67 },
+	{ APADDB,	ymm,	Px, 0xfc,Pe,0xfc },
+	{ APADDL,	ymm,	Px, 0xfe,Pe,0xfe },
+	{ APADDQ,	yxm,	Pe, 0xd4 },
+	{ APADDSB,	ymm,	Px, 0xec,Pe,0xec },
+	{ APADDSW,	ymm,	Px, 0xed,Pe,0xed },
+	{ APADDUSB,	ymm,	Px, 0xdc,Pe,0xdc },
+	{ APADDUSW,	ymm,	Px, 0xdd,Pe,0xdd },
+	{ APADDW,	ymm,	Px, 0xfd,Pe,0xfd },
+	{ APAND,	ymm,	Px, 0xdb,Pe,0xdb },
+	{ APANDN,	ymm,	Px, 0xdf,Pe,0xdf },
+	{ APAVGB,	ymm,	Px, 0xe0,Pe,0xe0 },
+	{ APAVGW,	ymm,	Px, 0xe3,Pe,0xe3 },
+	{ APCMPEQB,	ymm,	Px, 0x74,Pe,0x74 },
+	{ APCMPEQL,	ymm,	Px, 0x76,Pe,0x76 },
+	{ APCMPEQW,	ymm,	Px, 0x75,Pe,0x75 },
+	{ APCMPGTB,	ymm,	Px, 0x64,Pe,0x64 },
+	{ APCMPGTL,	ymm,	Px, 0x66,Pe,0x66 },
+	{ APCMPGTW,	ymm,	Px, 0x65,Pe,0x65 },
+	{ APEXTRW,	yextrw,	Pq, 0xc5 },
+	{ APF2IL,	ymfp,	Px, 0x1d },
+	{ APF2IW,	ymfp,	Px, 0x1c },
+	{ API2FL,	ymfp,	Px, 0x0d },
+	{ API2FW,	ymfp,	Px, 0x0c },
+	{ APINSRW,	yextrw,	Pq, 0xc4 },
+	{ APMADDWL,	ymm,	Px, 0xf5,Pe,0xf5 },
+	{ APMAXSW,	yxm,	Pe, 0xee },
+	{ APMAXUB,	yxm,	Pe, 0xde },
+	{ APMINSW,	yxm,	Pe, 0xea },
+	{ APMINUB,	yxm,	Pe, 0xda },
+	{ APMOVMSKB,	ymskb,	Px, Pe,0xd7,0xd7 },
+	{ APMULHRW,	ymfp,	Px, 0xb7 },
+	{ APMULHUW,	ymm,	Px, 0xe4,Pe,0xe4 },
+	{ APMULHW,	ymm,	Px, 0xe5,Pe,0xe5 },
+	{ APMULLW,	ymm,	Px, 0xd5,Pe,0xd5 },
+	{ APMULULQ,	ymm,	Px, 0xf4,Pe,0xf4 },
 	{ APOPAL,	ynone,	Px, 0x61 },
 	{ APOPAW,	ynone,	Pe, 0x61 },
 	{ APOPFL,	ynone,	Px, 0x9d },
@@ -459,6 +706,38 @@
 	{ APOPFW,	ynone,	Pe, 0x9d },
 	{ APOPL,	ypopl,	Px, 0x58,0x8f,(00) },
 	{ APOPW,	ypopl,	Pe, 0x58,0x8f,(00) },
+	{ APOR,		ymm,	Px, 0xeb,Pe,0xeb },
+	{ APSADBW,	yxm,	Pq, 0xf6 },
+	{ APSHUFHW,	yxshuf,	Pf3, 0x70 },
+	{ APSHUFL,	yxshuf,	Pq, 0x70 },
+	{ APSHUFLW,	yxshuf,	Pf2, 0x70 },
+	{ APSHUFW,	ymshuf,	Pm, 0x70 },
+	{ APSLLO,	ypsdq,	Pq, 0x73,(07) },
+	{ APSLLL,	yps,	Px, 0xf2, 0x72,(06), Pe,0xf2, Pe,0x72,(06) },
+	{ APSLLQ,	yps,	Px, 0xf3, 0x73,(06), Pe,0xf3, Pe,0x73,(06) },	/* X imm8 form is 66 0F 73 /6 */
+	{ APSLLW,	yps,	Px, 0xf1, 0x71,(06), Pe,0xf1, Pe,0x71,(06) },
+	{ APSRAL,	yps,	Px, 0xe2, 0x72,(04), Pe,0xe2, Pe,0x72,(04) },
+	{ APSRAW,	yps,	Px, 0xe1, 0x71,(04), Pe,0xe1, Pe,0x71,(04) },
+	{ APSRLO,	ypsdq,	Pq, 0x73,(03) },
+	{ APSRLL,	yps,	Px, 0xd2, 0x72,(02), Pe,0xd2, Pe,0x72,(02) },
+	{ APSRLQ,	yps,	Px, 0xd3, 0x73,(02), Pe,0xd3, Pe,0x73,(02) },
+	{ APSRLW,	yps,	Px, 0xd1, 0x71,(02), Pe,0xd1, Pe,0x71,(02) },	/* X form is 66 0F D1; 0xe1 is PSRAW */
+	{ APSUBB,	yxm,	Pe, 0xf8 },
+	{ APSUBL,	yxm,	Pe, 0xfa },
+	{ APSUBQ,	yxm,	Pe, 0xfb },
+	{ APSUBSB,	yxm,	Pe, 0xe8 },
+	{ APSUBSW,	yxm,	Pe, 0xe9 },
+	{ APSUBUSB,	yxm,	Pe, 0xd8 },
+	{ APSUBUSW,	yxm,	Pe, 0xd9 },
+	{ APSUBW,	yxm,	Pe, 0xf9 },
+	{ APUNPCKHBW,	ymm,	Px, 0x68,Pe,0x68 },
+	{ APUNPCKHLQ,	ymm,	Px, 0x6a,Pe,0x6a },
+	{ APUNPCKHQDQ,	yxm,	Pe, 0x6d },
+	{ APUNPCKHWL,	ymm,	Px, 0x69,Pe,0x69 },
+	{ APUNPCKLBW,	ymm,	Px, 0x60,Pe,0x60 },
+	{ APUNPCKLLQ,	ymm,	Px, 0x62,Pe,0x62 },
+	{ APUNPCKLQDQ,	yxm,	Pe, 0x6c },
+	{ APUNPCKLWL,	ymm,	Px, 0x61,Pe,0x61 },
 	{ APUSHAL,	ynone,	Px, 0x60 },
 	{ APUSHAW,	ynone,	Pe, 0x60 },
 	{ APUSHFL,	ynone,	Px, 0x9c },
@@ -465,9 +744,12 @@
 	{ APUSHFW,	ynone,	Pe, 0x9c },
 	{ APUSHL,	ypushl,	Px, 0x50,0xff,(06),0x6a,0x68 },
 	{ APUSHW,	ypushl,	Pe, 0x50,0xff,(06),0x6a,0x68 },
+	{ APXOR,	ymm,	Px, 0xef,Pe,0xef },
 	{ ARCLB,	yshb,	Pb, 0xd0,(02),0xc0,(02),0xd2,(02) },
 	{ ARCLL,	yshl,	Px, 0xd1,(02),0xc1,(02),0xd3,(02),0xd3,(02) },
 	{ ARCLW,	yshl,	Pe, 0xd1,(02),0xc1,(02),0xd3,(02),0xd3,(02) },
+	{ ARCPPS,	yxm,	Pm, 0x53 },
+	{ ARCPSS,	yxm,	Pf3, 0x53 },
 	{ ARCRB,	yshb,	Pb, 0xd0,(03),0xc0,(03),0xd2,(03) },
 	{ ARCRL,	yshl,	Px, 0xd1,(03),0xc1,(03),0xd3,(03),0xd3,(03) },
 	{ ARCRW,	yshl,	Pe, 0xd1,(03),0xc1,(03),0xd3,(03),0xd3,(03) },
@@ -480,6 +762,8 @@
 	{ ARORB,	yshb,	Pb, 0xd0,(01),0xc0,(01),0xd2,(01) },
 	{ ARORL,	yshl,	Px, 0xd1,(01),0xc1,(01),0xd3,(01),0xd3,(01) },
 	{ ARORW,	yshl,	Pe, 0xd1,(01),0xc1,(01),0xd3,(01),0xd3,(01) },
+	{ ARSQRTPS,	yxm,	Pm, 0x52 },
+	{ ARSQRTSS,	yxm,	Pf3, 0x52 },
 	{ ASAHF,	ynone,	Px, 0x9e },
 	{ ASALB,	yshb,	Pb, 0xd0,(04),0xc0,(04),0xd2,(04) },
 	{ ASALL,	yshl,	Px, 0xd1,(04),0xc1,(04),0xd3,(04),0xd3,(04) },
@@ -517,14 +801,25 @@
 	{ ASHRB,	yshb,	Pb, 0xd0,(05),0xc0,(05),0xd2,(05) },
 	{ ASHRL,	yshl,	Px, 0xd1,(05),0xc1,(05),0xd3,(05),0xd3,(05) },
 	{ ASHRW,	yshl,	Pe, 0xd1,(05),0xc1,(05),0xd3,(05),0xd3,(05) },
+	{ ASHUFPD,	yxshuf,	Pq, 0xc6 },
+	{ ASHUFPS,	yxshuf,	Pm, 0xc6 },
+	{ ASQRTPD,	yxm,	Pe, 0x51 },
+	{ ASQRTPS,	yxm,	Pm, 0x51 },
+	{ ASQRTSD,	yxm,	Pf2, 0x51 },
+	{ ASQRTSS,	yxm,	Pf3, 0x51 },
 	{ ASTC,		ynone,	Px, 0xf9 },
 	{ ASTD,		ynone,	Px, 0xfd },
 	{ ASTI,		ynone,	Px, 0xfb },
+	{ ASTMXCSR,	ysvrs,	Pm, 0xae,(03),0xae,(03) },
 	{ ASTOSB,	ynone,	Pb, 0xaa },
 	{ ASTOSL,	ynone,	Px, 0xab },
 	{ ASTOSW,	ynone,	Pe, 0xab },
 	{ ASUBB,	yxorb,	Pb, 0x2c,0x80,(05),0x28,0x2a },
 	{ ASUBL,	yaddl,	Px, 0x83,(05),0x2d,0x81,(05),0x29,0x2b },
+	{ ASUBPD,	yxm,	Pe, 0x5c },
+	{ ASUBPS,	yxm,	Pm, 0x5c },
+	{ ASUBSD,	yxm,	Pf2, 0x5c },
+	{ ASUBSS,	yxm,	Pf3, 0x5c },
 	{ ASUBW,	yaddl,	Pe, 0x83,(05),0x2d,0x81,(05),0x29,0x2b },
 	{ ASYSCALL,	ynone,	Px, 0xcd,100 },
 	{ ATESTB,	ytestb,	Pb, 0xa8,0xf6,(00),0x84,0x84 },
@@ -531,6 +826,12 @@
 	{ ATESTL,	ytestl,	Px, 0xa9,0xf7,(00),0x85,0x85 },
 	{ ATESTW,	ytestl,	Pe, 0xa9,0xf7,(00),0x85,0x85 },
 	{ ATEXT,	ytext,	Px },
+	{ AUCOMISD,	yxcmp,	Pe, 0x2e },
+	{ AUCOMISS,	yxcmp,	Pm, 0x2e },
+	{ AUNPCKHPD,	yxm,	Pe, 0x15 },
+	{ AUNPCKHPS,	yxm,	Pm, 0x15 },
+	{ AUNPCKLPD,	yxm,	Pe, 0x14 },
+	{ AUNPCKLPS,	yxm,	Pm, 0x14 },
 	{ AVERR,	ydivl,	Pm, 0x00,(04) },
 	{ AVERW,	ydivl,	Pm, 0x00,(05) },
 	{ AWAIT,	ynone,	Px, 0x9b },
@@ -541,6 +842,8 @@
 	{ AXLAT,	ynone,	Px, 0xd7 },
 	{ AXORB,	yxorb,	Pb, 0x34,0x80,(06),0x30,0x32 },
 	{ AXORL,	yxorl,	Px, 0x83,(06),0x35,0x81,(06),0x31,0x33 },
+	{ AXORPD,	yxm,	Pe, 0x57 },
+	{ AXORPS,	yxm,	Pm, 0x57 },
 	{ AXORW,	yxorl,	Pe, 0x83,(06),0x35,0x81,(06),0x31,0x33 },
 
 	{ AFMOVB,	yfmvx,	Px, 0xdf,(04) },
@@ -649,6 +952,9 @@
 	{ AFXTRACT,	ynone,	Px, 0xd9, 0xf4 },
 	{ AFYL2X,	ynone,	Px, 0xd9, 0xf1 },
 	{ AFYL2XP1,	ynone,	Px, 0xd9, 0xf9 },
+
 	{ AEND },
 	0
 };
+
+Optab*	opindex[ALAST+1];
--- a/sys/src/cmd/8l/span.c
+++ b/sys/src/cmd/8l/span.c
@@ -326,7 +326,7 @@
 {
 	long v;
 
-	if(a->type >= D_INDIR || a->index != D_NONE) {
+	if((a->type >= D_INDIR && a->type < D_M0) || a->index != D_NONE) {
 		if(a->index != D_NONE && a->scale == 0) {
 			if(a->type == D_ADDR) {
 				switch(a->index) {
@@ -387,6 +387,26 @@
 	case D_F0+7:
 		return	Yrf;
 
+	case D_M0+0:
+	case D_M0+1:
+	case D_M0+2:
+	case D_M0+3:
+	case D_M0+4:
+	case D_M0+5:
+	case D_M0+6:
+	case D_M0+7:
+		return	Ymr;
+
+	case D_X0+0:
+	case D_X0+1:
+	case D_X0+2:
+	case D_X0+3:
+	case D_X0+4:
+	case D_X0+5:
+	case D_X0+6:
+	case D_X0+7:
+		return	Yxr;
+
 	case D_NONE:
 		return Ynone;
 
@@ -576,7 +596,7 @@
 	v = a->offset;
 	t = a->type;
 	if(a->index != D_NONE) {
-		if(t >= D_INDIR) {
+		if(t >= D_INDIR && t < D_M0) {
 			t -= D_INDIR;
 			if(t == D_NONE) {
 				*andptr++ = (0 << 6) | (4 << 0) | (r << 3);
@@ -624,7 +644,13 @@
 		*andptr++ = (3 << 6) | (reg[t] << 0) | (r << 3);
 		return;
 	}
-	if(t >= D_INDIR) {
+	if(t >= D_M0 && t <= D_X0+7) {
+		if(v)
+			goto bad;
+		*andptr++ = (3 << 6) | (reg[t] << 0) | (r << 3);
+		return;
+	}
+	if(t >= D_INDIR && t < D_M0) {
 		t -= D_INDIR;
 		if(t == D_NONE || D_CS <= t && t <= D_GS) {
 			*andptr++ = (0 << 6) | (5 << 0) | (r << 3);
@@ -835,6 +861,30 @@
 		print("%P\n", p);
 }
 
+static int	/* returns z, advanced by one when a prefix entry of op[] was consumed */
+mediaop(Optab *o, int op, int osize, int z)
+{
+	switch(op){
+	case Pm:
+	case Pe:
+	case Pf2:
+	case Pf3:
+		if(osize != 1){	/* op is a prefix byte; the opcode proper is the next op[] entry */
+			if(op != Pm)
+				*andptr++ = op;
+			*andptr++ = Pm;
+			op = o->op[++z];
+			break;
+		}
+	default:	/* also reached by fall-through when osize == 1 */
+		if(andptr == and || andptr[-1] != Pm)	/* add the Pm escape unless it was the last byte emitted */
+			*andptr++ = Pm;
+		break;
+	}
+	*andptr++ = op;
+	return z;
+}
+
 void
 doasm(Prog *p)
 {
@@ -851,7 +901,7 @@
 	if(pre)
 		*andptr++ = pre;
 
-	o = &optab[p->as];
+	o = opindex[p->as];
 	ft = oclass(&p->from) * Ymax;
 	tt = oclass(&p->to) * Ymax;
 	t = o->ytab;
@@ -872,6 +922,12 @@
 		*andptr++ = Pm;
 		break;
 
+	case Pf2:	/* xmm opcode escape */
+	case Pf3:
+		*andptr++ = o->prefix;
+		*andptr++ = Pm;
+		break;
+
 	case Pm:	/* opcode escape */
 		*andptr++ = Pm;
 		break;
@@ -903,6 +959,30 @@
 		asmand(&p->from, reg[p->to.type]);
 		break;
 
+	case Zm_r_xm:
+		mediaop(o, op, t[3], z);
+		asmand(&p->from, reg[p->to.type]);
+		break;
+
+	case Zm_r_i_xm:
+		mediaop(o, op, t[3], z);
+		asmand(&p->from, reg[p->to.type]);
+		*andptr++ = p->to.offset;
+		break;
+
+	case Zm_r_3d:
+		*andptr++ = 0x0f;
+		*andptr++ = 0x0f;
+		asmand(&p->from, reg[p->to.type]);
+		*andptr++ = op;
+		break;
+
+	case Zibm_r:
+		*andptr++ = op;
+		asmand(&p->from, reg[p->to.type]);
+		*andptr++ = p->to.offset;
+ 		break;
+
 	case Zaut_r:
 		*andptr++ = 0x8d;	/* leal */
 		if(p->from.type != D_ADDR)
@@ -924,6 +1004,17 @@
 		asmand(&p->to, reg[p->from.type]);
 		break;
 
+	case Zr_m_xm:
+		mediaop(o, op, t[3], z);
+		asmand(&p->to, reg[p->from.type]);
+		break;
+
+	case Zr_m_i_xm:
+		mediaop(o, op, t[3], z);
+		asmand(&p->to, reg[p->from.type]);
+		*andptr++ = p->from.offset;
+ 		break;
+
 	case Zo_m:
 		*andptr++ = op;
 		asmand(&p->to, o->op[z+1]);
@@ -941,6 +1032,12 @@
 		asmand(&p->to, o->op[z+1]);
 		*andptr++ = v;
 		break;
+
+	case Zibo_m_xm:
+		z = mediaop(o, op, t[3], z);
+		asmand(&p->to, o->op[z+1]);
+ 		*andptr++ = v;
+ 		break;
 
 	case Z_ib:
 		v = vaddr(&p->to);