shithub: riscv

Download patch

ref: 66b6185845e85258f1408271d5f705aacfa6ffdb
parent: 753a35b52ac098985aff5e22a069d30d16903385
author: Sigrid <[email protected]>
date: Sun Dec 6 13:48:32 EST 2020

amd64, vmx: support avx/avx2 for host/guest; use *noavx= in plan9.ini to disable

--- a/sys/man/8/plan9.ini
+++ b/sys/man/8/plan9.ini
@@ -898,6 +898,8 @@
 battery life (see
 .IR stats (8)).
 It is not on by default because it causes problems on some laptops.
+.SS \fL*noavx=\fP
+Disables AVX and AVX2 on AMD64 CPUs.
 .SS USB
 .SS \fL*nousbprobe=\fP
 Disable USB host controller detection.
--- a/sys/src/9/pc/cputemp.c
+++ b/sys/src/9/pc/cputemp.c
@@ -13,7 +13,7 @@
 
 	if(m->cpuiddx & Acpif)
 	if(strcmp(m->cpuidid, "GenuineIntel") == 0){
-		cpuid(6, regs);
+		cpuid(6, 0, regs);
 		return regs[0] & 1;
 	}
 	return 0;
@@ -28,7 +28,7 @@
 	ulong regs[4];
 	static ulong tj;
 
-	cpuid(6, regs);
+	cpuid(6, 0, regs);
 	if((regs[0] & 1) == 0)
 		goto unsup;
 	if(tj == 0){
--- a/sys/src/9/pc/dat.h
+++ b/sys/src/9/pc/dat.h
@@ -250,7 +250,7 @@
 	int	pdbfree;
 	
 	u32int	dr7;			/* shadow copy of dr7 */
-	
+	u32int	xcr0;
 	void*	vmx;
 
 	int	stack[1];
--- a/sys/src/9/pc/devarch.c
+++ b/sys/src/9/pc/devarch.c
@@ -18,11 +18,6 @@
 	Qmax = 32,
 };
 
-enum {
-	CR4Osfxsr = 1 << 9,
-	CR4Oxmmex = 1 << 10,
-};
-
 enum {				/* cpuid standard function codes */
 	Highstdfunc = 0,	/* also returns vendor string */
 	Procsig,
@@ -507,13 +502,13 @@
 	ulong regs[4];
 	vlong mca, mct, pat;
 
-	cpuid(Highstdfunc, regs);
+	cpuid(Highstdfunc, 0, regs);
 	memmove(m->cpuidid,   &regs[1], BY2WD);	/* bx */
 	memmove(m->cpuidid+4, &regs[3], BY2WD);	/* dx */
 	memmove(m->cpuidid+8, &regs[2], BY2WD);	/* cx */
 	m->cpuidid[12] = '\0';
 
-	cpuid(Procsig, regs);
+	cpuid(Procsig, 0, regs);
 	m->cpuidax = regs[0];
 	m->cpuidcx = regs[2];
 	m->cpuiddx = regs[3];
@@ -650,15 +645,6 @@
 	if(m->cpuiddx & Mtrr)
 		mtrrsync();
 
-	if((m->cpuiddx & (Sse|Fxsr)) == (Sse|Fxsr)){			/* have sse fp? */
-		fpsave = fpssesave;
-		fprestore = fpsserestore;
-		putcr4(getcr4() | CR4Osfxsr|CR4Oxmmex);
-	} else {
-		fpsave = fpx87save;
-		fprestore = fpx87restore;
-	}
-
 	if(strcmp(m->cpuidid, "GenuineIntel") == 0 && (m->cpuidcx & Rdrnd) != 0)
 		hwrandbuf = rdrandbuf;
 	else
@@ -669,9 +655,9 @@
 		m->havewatchpt8 = 1;
 
 		/* check and enable NX bit */
-		cpuid(Highextfunc, regs);
+		cpuid(Highextfunc, 0, regs);
 		if(regs[0] >= Procextfeat){
-			cpuid(Procextfeat, regs);
+			cpuid(Procextfeat, 0, regs);
 			if((regs[3] & (1<<20)) != 0){
 				vlong efer;
 
@@ -689,13 +675,15 @@
 		|| family == 6 && (model == 15 || model == 23 || model == 28))
 			m->havewatchpt8 = 1;
 		/* Intel SDM claims amd64 support implies 8-byte watchpoint support */
-		cpuid(Highextfunc, regs);
+		cpuid(Highextfunc, 0, regs);
 		if(regs[0] >= Procextfeat){
-			cpuid(Procextfeat, regs);
+			cpuid(Procextfeat, 0, regs);
 			if((regs[3] & 1<<29) != 0)
 				m->havewatchpt8 = 1;
 		}
 	}
+
+	fpuinit();
 
 	cputype = t;
 	return t->family;
--- a/sys/src/9/pc/devvmx.c
+++ b/sys/src/9/pc/devvmx.c
@@ -44,6 +44,7 @@
 	
 	PROCB_CTLS = 0x4002,
 	PROCB_IRQWIN = 1<<2,
+	PROCB_TSCOFFSET = 1<<3,
 	PROCB_EXITHLT = 1<<7,
 	PROCB_EXITINVLPG = 1<<9,
 	PROCB_EXITMWAIT = 1<<10,
@@ -100,6 +101,7 @@
 	VMENTRY_INTRCODE = 0x4018,
 	VMENTRY_INTRILEN = 0x401a,
 	
+	VMCS_TSC_OFFSET = 0x2010,
 	VMCS_LINK = 0x2800,
 	
 	GUEST_ES = 0x800,
@@ -264,7 +266,9 @@
 	int index, machno;
 	char errstr[ERRMAX];
 	Ureg ureg;
+	uvlong tscoffset;
 	uintptr cr2;
+	uintptr xcr0;
 	uintptr dr[8]; /* DR7 is also kept in VMCS */
 	u8int launched;
 	u8int vpid;
@@ -484,6 +488,13 @@
 }
 
 static int
+xcr0write(Vmx *vmx, char *s)	/* write handler for the "xcr0" entry of the register table */
+{
+	vmx->xcr0 = parseval(s) & 7;	/* keep bits 0-2 only; anything above is silently dropped */
+	return 0;
+}
+
+static int
 readonly(Vmx *, char *)
 {
 	return -1;
@@ -581,6 +592,7 @@
 	{VMXVAR(dr[2]), 0, "dr2"},
 	{VMXVAR(dr[3]), 0, "dr3"},
 	{VMXVAR(dr[6]), 0, "dr6", nil, dr6write},
+	{VMXVAR(xcr0), 0, "xcr0", nil, xcr0write},
 	{GUEST_DR7, 0, "dr7", nil, dr7write},
 	{VM_INSTRERR, 4, "instructionerror", nil, readonly},
 	{VM_EXREASON, 4, "exitreason", nil, readonly},
@@ -857,7 +869,7 @@
 	vlong msr;
 	int i;
 
-	cpuid(1, regs);
+	cpuid(1, 0, regs);
 	if((regs[2] & 1<<5) == 0) return;
 	/* check if disabled by BIOS */
 	if(rdmsr(0x3a, &msr) < 0) return;
@@ -945,8 +957,8 @@
 	
 	if(rdmsr(VMX_PROCB_CTLS_MSR, &msr) < 0) error("rdmsr(VMX_PROCB_CTLS_MSR failed");
 	x = (u32int)procb_ctls | 1<<1 | 7<<4 | 1<<8 | 1<<13 | 1<<14 | 1<<26; /* currently reserved default1 bits */
-	x |= PROCB_EXITHLT | PROCB_EXITMWAIT;
-	x |= PROCB_EXITMOVDR | PROCB_EXITIO | PROCB_EXITMONITOR | PROCB_MSRBITMAP;
+	x |= PROCB_TSCOFFSET | PROCB_EXITMWAIT | PROCB_EXITMONITOR | PROCB_EXITHLT;
+	x |= PROCB_EXITMOVDR | PROCB_EXITIO | PROCB_MSRBITMAP;
 	x |= PROCB_USECTLS2;
 	x &= msr >> 32;
 	vmcswrite(PROCB_CTLS, x);
@@ -1042,8 +1054,8 @@
 	
 	vmx->onentry = FLUSHVPID | FLUSHEPT;
 	fpinit();
-	fpsave(&vmx->fp);
-	
+	vmx->xcr0 = m->xcr0 & 1; /* x87 alone */
+
 	memset(vmx->msrbits, -1, 4096);
 	vmxtrapmsr(vmx, Efer, 0);
 	vmcswrite(VMENTRY_MSRLDADDR, PADDR(vmx->msrguest));
@@ -1051,6 +1063,9 @@
 	vmcswrite(VMEXIT_MSRLDADDR, PADDR(vmx->msrhost));
 	vmcswrite(MSR_BITMAP, PADDR(vmx->msrbits));
 	
+	cycles(&vmx->tscoffset);
+	vmcswrite(VMCS_TSC_OFFSET, vmx->tscoffset);
+
 	if(sizeof(uintptr) == 8){
 		vmxaddmsr(vmx, Star, 0);
 		vmxaddmsr(vmx, Lstar, 0);
@@ -1074,7 +1089,7 @@
 	uintptr cr;
 	vlong x;
 
-	putcr4(getcr4() | 0x2000); /* set VMXE */
+	putcr4(getcr4() | CR4VMXE);
 	putcr0(getcr0() | 0x20); /* set NE */
 	cr = getcr0();
 	if(rdmsr(VMX_CR0_FIXED0, &msr) < 0) error("rdmsr(VMX_CR0_FIXED0) failed");
@@ -1590,8 +1605,9 @@
 static void
 vmxproc(void *vmxp)
 {
-	int init, rc, x;
+	int init, rc, x, useend;
 	u32int procbctls, defprocbctls;
+	u64int start, end, adj;
 	vlong v;
 	Vmx *vmx;
 
@@ -1599,6 +1615,8 @@
 	procwired(up, vmx->machno);
 	sched();
 	init = 0;
+	useend = 0;
+	adj = 0;
 	defprocbctls = 0;
 	while(waserror()){
 		kstrcpy(vmx->errstr, up->errstr, ERRMAX);
@@ -1653,11 +1671,29 @@
 			}
 			if((vmx->dr[7] & ~0xd400) != 0)
 				putdr01236(vmx->dr);
-			fpsserestore(&vmx->fp);
-			putcr2(vmx->cr2);
+
+			fprestore(&vmx->fp);
+			if(m->xcr0 != 0 && vmx->xcr0 != m->xcr0)
+				putxcr0(vmx->xcr0);
+			if(vmx->cr2 != getcr2())
+				putcr2(vmx->cr2);
+			cycles(&start);
+			if(useend){
+				vmx->tscoffset -= end - start + adj;
+				vmcswrite(VMCS_TSC_OFFSET, vmx->tscoffset);
+			}
+			if(adj == 0){
+				cycles(&adj);
+				adj -= start;
+			}
 			rc = vmlaunch(&vmx->ureg, vmx->launched);
+			cycles(&end);
+			useend = 1;
 			vmx->cr2 = getcr2();
-			fpssesave(&vmx->fp);
+			if(m->xcr0 != 0 && vmx->xcr0 != m->xcr0)
+				putxcr0(m->xcr0);
+			fpsave(&vmx->fp);
+
 			splx(x);
 			if(rc < 0)
 				error("vmlaunch failed");
@@ -1799,6 +1835,7 @@
 		free(vmx);
 		nexterror();
 	}
+	memset(vmx, 0, sizeof(Vmx));
 	vmx->state = VMXINIT;
 	vmx->lastcmd = &vmx->firstcmd;
 	vmx->mem.next = &vmx->mem;
--- a/sys/src/9/pc/fns.h
+++ b/sys/src/9/pc/fns.h
@@ -15,7 +15,8 @@
 int	(*cmpswap)(long*, long, long);
 int	cmpswap486(long*, long, long);
 void	(*coherence)(void);
-void	cpuid(int, ulong regs[]);
+void	cpuid(int, int, ulong regs[]);
+void	fpuinit(void);
 int	cpuidentify(void);
 void	cpuidprint(void);
 void	(*cycles)(uvlong*);
@@ -138,6 +139,7 @@
 void	putcr2(ulong);
 void	putcr3(ulong);
 void	putcr4(ulong);
+void	putxcr0(ulong);
 void	putdr(u32int*);
 void	putdr01236(uintptr*);
 void	putdr6(u32int);
--- /dev/null
+++ b/sys/src/9/pc/fpu.c
@@ -1,0 +1,31 @@
+#include "u.h"
+#include "../port/lib.h"
+#include "mem.h"
+#include "dat.h"
+#include "fns.h"
+
+enum { /* CR4 bits that fpuinit ORs in when the cpu reports both Sse and Fxsr */
+	CR4Osfxsr  = 1 << 9,
+	CR4Oxmmex  = 1 << 10,
+};
+
+void
+putxcr0(ulong) /* empty stub: fpuinit in this file never sets m->xcr0, and devvmx.c only calls this when m->xcr0 != 0 */
+{
+}
+
+void
+fpuinit(void) /* choose the fpsave/fprestore pair for this cpu; called from cpuidentify */
+{
+	uintptr cr4;
+
+	if((m->cpuiddx & (Sse|Fxsr)) == (Sse|Fxsr)){ /* have sse fp? */
+		fpsave = fpssesave;
+		fprestore = fpsserestore;
+		cr4 = getcr4() | CR4Osfxsr|CR4Oxmmex;
+		putcr4(cr4);
+	} else { /* no sse: use the x87 save/restore routines and leave CR4 alone */
+		fpsave = fpx87save;
+		fprestore = fpx87restore;
+	}
+}
--- a/sys/src/9/pc/l.s
+++ b/sys/src/9/pc/l.s
@@ -520,7 +520,7 @@
  * a 386 (Ac bit can't be set). If it's not a 386 and the Id bit can't be
  * toggled then it's an older 486 of some kind.
  *
- *	cpuid(fun, regs[4]);
+ *	cpuid(fn, sublvl, regs[4]);
  */
 TEXT cpuid(SB), $0
 	MOVL	$0x240000, AX
@@ -539,6 +539,7 @@
 	TESTL	$0x200000, AX			/* Id */
 	JZ	_cpu486				/* can't toggle this bit on some 486 */
 	MOVL	fn+0(FP), AX
+	MOVL	sublvl+4(FP), CX
 	CPUID
 	JMP	_cpuid
 _cpu486:
@@ -555,7 +556,7 @@
 	XORL	CX, CX
 	XORL	DX, DX
 _cpuid:
-	MOVL	regs+4(FP), BP
+	MOVL	regs+8(FP), BP
 	MOVL	AX, 0(BP)
 	MOVL	BX, 4(BP)
 	MOVL	CX, 8(BP)
--- a/sys/src/9/pc/mkfile
+++ b/sys/src/9/pc/mkfile
@@ -49,6 +49,7 @@
 OBJ=\
 	l.$O\
 	cga.$O\
+	fpu.$O\
 	i8253.$O\
 	i8259.$O\
 	main.$O\
--- a/sys/src/9/pc/mtrr.c
+++ b/sys/src/9/pc/mtrr.c
@@ -289,9 +289,9 @@
 	ulong regs[4];
 	uvlong mask;
 
-	cpuid(Exthighfunc, regs);
+	cpuid(Exthighfunc, 0, regs);
 	if(regs[0] >= Extaddrsz) {			/* ax */
-		cpuid(Extaddrsz, regs);
+		cpuid(Extaddrsz, 0, regs);
 		mask = (1ULL << (regs[0] & 0xFF)) - 1;	/* ax */
 	} else {
 		mask = (1ULL << 36) - 1;
--- a/sys/src/9/pc64/dat.h
+++ b/sys/src/9/pc64/dat.h
@@ -2,6 +2,8 @@
 typedef struct BIOS32ci	BIOS32ci;
 typedef struct Conf	Conf;
 typedef struct Confmem	Confmem;
+typedef struct FPssestate	FPssestate;
+typedef struct FPavxstate	FPavxstate;
 typedef struct FPsave	FPsave;
 typedef struct PFPU	PFPU;
 typedef struct ISAConf	ISAConf;
@@ -49,7 +51,7 @@
 	uintptr	pc;
 };
 
-struct FPsave
+struct FPssestate
 {
 	u16int	fcw;			/* x87 control word */
 	u16int	fsw;			/* x87 status word */
@@ -65,6 +67,18 @@
 	uchar	ign[96];		/* reserved, ignored */
 };
 
+struct FPavxstate /* an FPssestate followed by the extra areas listed below */
+{
+	FPssestate; /* unnamed member: the FPssestate fields come first */
+	uchar	header[64];		/* XSAVE header */
+	uchar	ymm[256];		/* upper 128-bit regs (AVX) */
+};
+
+struct FPsave /* type taken by fpsave/fprestore; now wraps the larger FPavxstate */
+{
+	FPavxstate;
+};
+
 enum
 {
 	/* this is a state */
@@ -224,9 +238,12 @@
 	int	havewatchpt8;
 	int	havenx;
 	uvlong	tscticks;
-	
+
 	u64int	dr7;			/* shadow copy of dr7 */
-	
+	u64int	xcr0;
+	u32int	fpsavesz;
+	u32int	fpalign;
+
 	void*	vmx;
 
 	uintptr	stack[1];
@@ -270,8 +287,14 @@
 
 /* cpuid instruction result register bits */
 enum {
+	/* ax */
+	Xsaveopt = 1<<0,
+	Xsaves = 1<<3,
+
 	/* cx */
 	Monitor	= 1<<3,
+	Xsave = 1<<26,
+	Avx	= 1<<28,
 
 	/* dx */
 	Fpuonchip = 1<<0,
--- a/sys/src/9/pc64/fns.h
+++ b/sys/src/9/pc64/fns.h
@@ -15,7 +15,8 @@
 int	(*cmpswap)(long*, long, long);
 int	cmpswap486(long*, long, long);
 void	(*coherence)(void);
-void	cpuid(int, ulong regs[]);
+void	cpuid(int, int, ulong regs[]);
+void	fpuinit(void);
 int	cpuidentify(void);
 void	cpuidprint(void);
 void	(*cycles)(uvlong*);
@@ -40,6 +41,11 @@
 void	(*fpsave)(FPsave*);
 void	fpsserestore(FPsave*);
 void	fpssesave(FPsave*);
+void	fpxrestore(FPsave*);
+void	fpxrestores(FPsave*);
+void	fpxsave(FPsave*);
+void	fpxsaveopt(FPsave*);
+void	fpxsaves(FPsave*);
 void	fpx87restore(FPsave*);
 void	fpx87save(FPsave*);
 int	fpusave(void);
@@ -48,6 +54,7 @@
 u64int	getcr2(void);
 u64int	getcr3(void);
 u64int	getcr4(void);
+u64int	getxcr0(void);
 u64int	getdr6(void);
 char*	getconf(char*);
 void	guesscpuhz(int);
@@ -138,6 +145,7 @@
 void	putcr2(u64int);
 void	putcr3(u64int);
 void	putcr4(u64int);
+void	putxcr0(u64int);
 void	putdr(u64int*);
 void	putdr01236(u64int*);
 void	putdr6(u64int);
--- /dev/null
+++ b/sys/src/9/pc64/fpu.c
@@ -1,0 +1,51 @@
+#include "u.h"
+#include "../port/lib.h"
+#include "mem.h"
+#include "dat.h"
+#include "fns.h"
+
+enum { /* CR4 bits set by fpuinit */
+	CR4Osfxsr  = 1 << 9,
+	CR4Oxmmex  = 1 << 10,
+	CR4Oxsave  = 1 << 18, /* only set on the Xsave+Avx path */
+};
+
+void
+fpuinit(void) /* choose fpsave/fprestore and record the save-area size and alignment in m */
+{
+	uintptr cr4;
+	ulong regs[4];
+
+	m->fpsavesz = sizeof(FPssestate); /* defaults; replaced below when xsave is enabled */
+	m->fpalign = 16;
+	if((m->cpuiddx & (Sse|Fxsr)) == (Sse|Fxsr)){ /* have sse fp? */
+		cr4 = getcr4() | CR4Osfxsr|CR4Oxmmex;
+		putcr4(cr4);
+		fpsave = fpssesave;
+		fprestore = fpsserestore;
+
+		if((m->cpuidcx & (Xsave|Avx)) == (Xsave|Avx) && getconf("*noavx") == nil){ /* cpu reports both and *noavx= is not set */
+			cr4 |= CR4Oxsave;
+			putcr4(cr4);
+			m->xcr0 = 7; /* x87, sse, avx */
+			putxcr0(m->xcr0);
+			fpsave = fpxsave;
+			fprestore = fpxrestore;
+
+			cpuid(0xd, 0, regs);
+			m->fpsavesz = regs[1]; /* bx from cpuid 0xd, sub-level 0, read after xcr0 was written */
+			m->fpalign = 64;
+
+			cpuid(0xd, 1, regs); /* ax of sub-level 1 carries the Xsaveopt/Xsaves bits */
+			if(regs[0] & Xsaveopt)
+				fpsave = fpxsaveopt;
+			if(regs[0] & Xsaves){ /* tested last, so it overrides the Xsaveopt choice */
+				fpsave = fpxsaves;
+				fprestore = fpxrestores;
+			}
+		}
+	} else {
+		fpsave = fpx87save;
+		fprestore = fpx87restore;
+	}
+}
--- a/sys/src/9/pc64/l.s
+++ b/sys/src/9/pc64/l.s
@@ -249,9 +249,10 @@
  */
 TEXT cpuid(SB), $-4
 	MOVL	RARG, AX			/* function in AX */
+	MOVL	cx+8(FP), CX		/* sub-level in CX */
 	CPUID
 
-	MOVQ	info+8(FP), BP
+	MOVQ	info+16(FP), BP
 	MOVL	AX, 0(BP)
 	MOVL	BX, 4(BP)
 	MOVL	CX, 8(BP)
@@ -399,6 +400,21 @@
 	MOVQ	RARG, CR4
 	RET
 
+TEXT getxcr0(SB), 1, $-4			/* XCR0 - extended control */
+	XORQ CX, CX			/* CX = 0 before XGETBV */
+	WORD $0x010f; BYTE $0xd0	// XGETBV
+	SHLQ $32, DX			/* combine DX (high half) and AX (low half) into AX */
+	ORQ DX, AX
+	RET
+
+TEXT putxcr0(SB), 1, $-4			/* write the 64-bit argument to XCR0 */
+	XORQ CX, CX			/* CX = 0 before XSETBV */
+	MOVQ RARG, DX			/* must be a 64-bit move: MOVL would zero the upper half before the shift */
+	SHRQ $32, DX			/* DX = high 32 bits of the argument */
+	MOVL RARG, AX			/* AX = low 32 bits of the argument */
+	WORD $0x010f; BYTE $0xd1	// XSETBV
+	RET
+
 TEXT mb386(SB), 1, $-4				/* hack */
 TEXT mb586(SB), 1, $-4
 	XORL	AX, AX
@@ -624,6 +640,36 @@
 
 TEXT _fxsave(SB), 1, $-4
 	FXSAVE64 (RARG)
+	RET
+
+TEXT _xrstor(SB), 1, $-4
+	MOVL $7, AX			/* DX:AX = 7, the same value fpuinit stores in m->xcr0 */
+	XORL DX, DX
+	BYTE $0x48; BYTE $0x0f; BYTE $0xae; BYTE $0x6d; BYTE $0x00 // XRSTOR (RARG)
+	RET
+
+TEXT _xrstors(SB), 1, $-4
+	MOVL $7, AX			/* DX:AX = 7, the same value fpuinit stores in m->xcr0 */
+	XORL DX, DX
+	BYTE $0x48; BYTE $0x0f; BYTE $0xc7; BYTE $0x5d; BYTE $0x00 // XRSTORS (RARG)
+	RET
+
+TEXT _xsave(SB), 1, $-4
+	MOVL $7, AX			/* DX:AX = 7, the same value fpuinit stores in m->xcr0 */
+	XORL DX, DX
+	BYTE $0x48; BYTE $0x0f; BYTE $0xae; BYTE $0x65; BYTE $0x00 // XSAVE (RARG)
+	RET
+
+TEXT _xsaveopt(SB), 1, $-4
+	MOVL $7, AX			/* DX:AX = 7, the same value fpuinit stores in m->xcr0 */
+	XORL DX, DX
+	BYTE $0x48; BYTE $0x0f; BYTE $0xae; BYTE $0x75; BYTE $0x00 // XSAVEOPT (RARG)
+	RET
+
+TEXT _xsaves(SB), 1, $-4
+	MOVL $7, AX			/* DX:AX = 7, the same value fpuinit stores in m->xcr0 */
+	XORL DX, DX
+	BYTE $0x48; BYTE $0x0f; BYTE $0xc7; BYTE $0x6d; BYTE $0x00 // XSAVES (RARG)
 	RET
 
 TEXT _fwait(SB), 1, $-4
--- a/sys/src/9/pc64/main.c
+++ b/sys/src/9/pc64/main.c
@@ -304,6 +304,9 @@
 extern void _fninit(void);
 extern void _fxrstor(void*);
 extern void _fxsave(void*);
+extern void _xrstor(void*);
+extern void _xsave(void*);
+extern void _xsaveopt(void*);
 extern void _fwait(void);
 extern void _ldmxcsr(u32int);
 extern void _stts(void);
@@ -333,6 +336,39 @@
 	_fxrstor(s);
 }
 
+void
+fpxsave(FPsave *s) /* fpsave hook installed by fpuinit on the Xsave+Avx path */
+{
+	_xsave(s);
+	_stts(); /* save first, then _stts; fpxrestore does the reverse with _clts */
+}
+void
+fpxrestore(FPsave *s) /* fprestore hook installed by fpuinit on the Xsave+Avx path */
+{
+	_clts(); /* _clts before touching the state, mirroring the _stts in fpxsave */
+	_xrstor(s);
+}
+
+void
+fpxsaves(FPsave *s) /* fpsave hook installed by fpuinit when cpuid 0xd/1 reports Xsaves */
+{
+	_xsaveopt(s); /* NOTE(review): same body as fpxsaveopt; the _xsaves stub in l.s is not called here - confirm this is intended */
+	_stts();
+}
+void
+fpxrestores(FPsave *s) /* fprestore hook installed by fpuinit when cpuid 0xd/1 reports Xsaves */
+{
+	_clts();
+	_xrstor(s); /* NOTE(review): same body as fpxrestore; the _xrstors stub in l.s is not called here - confirm this is intended */
+}
+
+void
+fpxsaveopt(FPsave *s) /* fpsave hook installed by fpuinit when cpuid 0xd/1 reports Xsaveopt */
+{
+	_xsaveopt(s);
+	_stts();
+}
+
 static char* mathmsg[] =
 {
 	nil,	/* handled below */
@@ -452,7 +488,7 @@
 			up->fpstate |= FPkernel;
 		}
 		while(up->fpslot[index] == nil)
-			up->fpslot[index] = mallocalign(sizeof(FPsave), FPalign, 0, 0);
+			up->fpslot[index] = mallocalign(m->fpsavesz, m->fpalign, 0, 0);
 		up->fpsave = up->fpslot[index];
 		up->fpstate = FPactive | (up->fpstate & (FPnouser|FPkernel|FPindexm));
 		break;
@@ -538,8 +574,8 @@
 	case FPinactive	| FPpush:
 	case FPinactive:
 		while(p->fpslot[0] == nil)
-			p->fpslot[0] = mallocalign(sizeof(FPsave), FPalign, 0, 0);
-		memmove(p->fpsave = p->fpslot[0], up->fpslot[0], sizeof(FPsave));
+			p->fpslot[0] = mallocalign(m->fpsavesz, m->fpalign, 0, 0);
+		memmove(p->fpsave = p->fpslot[0], up->fpslot[0], m->fpsavesz);
 		p->fpstate = FPinactive;
 	}
 	splx(s);
--- a/sys/src/9/pc64/mem.h
+++ b/sys/src/9/pc64/mem.h
@@ -26,7 +26,6 @@
 #define	ROUND(s, sz)	(((s)+((sz)-1))&~((sz)-1))
 #define	PGROUND(s)	ROUND(s, BY2PG)
 #define	BLOCKALIGN	8
-#define	FPalign		16
 
 #define	MAXMACH		128			/* max # cpus system can run */
 
--- a/sys/src/9/pc64/mkfile
+++ b/sys/src/9/pc64/mkfile
@@ -47,6 +47,7 @@
 OBJ=\
 	l.$O\
 	cga.$O\
+	fpu.$O\
 	i8253.$O\
 	i8259.$O\
 	main.$O\
--- a/sys/src/cmd/vmx/exith.c
+++ b/sys/src/cmd/vmx/exith.c
@@ -1,9 +1,8 @@
 #include <u.h>
 #include <libc.h>
-#include <thread.h>
-#include <bio.h>
 #include "dat.h"
 #include "fns.h"
+#include "x86.h"
 
 int persist = 1;
 
@@ -118,109 +117,167 @@
 
 typedef struct CPUID CPUID;
 struct CPUID {
-	u32int idx;
 	u32int ax, bx, cx, dx;
 };
-static CPUID *cpuidf;
-static int ncpuidf;
+static u32int cpuidmax;
+static u32int cpuidmaxext;
+static CPUID leaf1;
+static struct {
+	uvlong miscen;
+}msr;
 
-static void
-auxcpuidproc(void *vpfd)
-{
-	int *pfd;
-	
-	pfd = vpfd;
-	close(pfd[1]);
-	close(0);
-	open("/dev/null", OREAD);
-	dup(pfd[0], 1);
-	close(pfd[0]);
-	procexecl(nil, "/bin/aux/cpuid", "cpuid", "-r", nil);
-	threadexits("exec: %r");
-}
+static uchar _cpuid[] = {
+	0x5E,			/* POP SI (PC) */
+	0x5D,			/* POP BP (CPUID&) */
+	0x58,			/* POP AX */
+	0x59,			/* POP CX */
 
+	0x51,			/* PUSH CX */
+	0x50,			/* PUSH AX */
+	0x55,			/* PUSH BP */
+	0x56,			/* PUSH SI */
+
+	0x31, 0xDB,		/* XOR BX, BX */
+	0x31, 0xD2,		/* XOR DX, DX */
+
+	0x0F, 0xA2,		/* CPUID */
+
+	0x89, 0x45, 0x00,	/* MOV AX, 0(BP) */
+	0x89, 0x5d, 0x04,	/* MOV BX, 4(BP) */
+	0x89, 0x4d, 0x08,	/* MOV CX, 8(BP) */
+	0x89, 0x55, 0x0C,	/* MOV DX, 12(BP) */
+	0xC3,			/* RET */
+};
+
+static CPUID (*getcpuid)(ulong ax, ulong cx) = (CPUID(*)(ulong, ulong)) _cpuid;
+
 void
 cpuidinit(void)
 {
-	int pfd[2];
-	Biobuf *bp;
-	char *l, *f[5];
-	CPUID *cp;
-	
-	pipe(pfd);
-	procrfork(auxcpuidproc, pfd, 4096, RFFDG);
-	close(pfd[0]);
-	bp = Bfdopen(pfd[1], OREAD);
-	if(bp == nil) sysfatal("Bopenfd: %r");
-	for(; l = Brdstr(bp, '\n', 1), l != nil; free(l)){
-		if(tokenize(l, f, 5) < 5) continue;
-		cpuidf = realloc(cpuidf, (ncpuidf + 1) * sizeof(CPUID));
-		cp = cpuidf + ncpuidf++;
-		cp->idx = strtoul(f[0], nil, 16);
-		cp->ax = strtoul(f[1], nil, 16);
-		cp->bx = strtoul(f[2], nil, 16);
-		cp->cx = strtoul(f[3], nil, 16);
-		cp->dx = strtoul(f[4], nil, 16);
+	CPUID r;
+	int f;
+
+	if(sizeof(uintptr) == 8) /* patch out POP BP -> POP AX */
+		_cpuid[1] = 0x58;
+	segflush(_cpuid, sizeof(_cpuid));
+
+	r = getcpuid(0, 0);
+	cpuidmax = r.ax;
+	r = getcpuid(0x80000000, 0);
+	cpuidmaxext = r.ax;
+	leaf1 = getcpuid(1, 0);
+
+	memset(&msr, 0, sizeof(msr));
+	if((f = open("/dev/msr", OREAD)) >= 0){
+		pread(f, &msr.miscen, 8, 0x1a0);
+		msr.miscen &= 1<<0; /* fast strings */
+		close(f);
 	}
-	Bterm(bp);
-	close(pfd[1]);
 }
 
-CPUID *
-getcpuid(ulong idx)
-{
-	CPUID *cp;
-	
-	for(cp = cpuidf; cp < cpuidf + ncpuidf; cp++)
-		if(cp->idx == idx)
-			return cp;
-	return nil;
-}
+static int xsavesz[] = { /* looked up by xcr0 value in the cpuid 0x0d handler; unlisted indices are 0 */
+	[1] = 512+64,
+	[3] = 512+64,
+	[7] = 512+64+256,
+};
 
-int maxcpuid = 7;
-
 static void
 cpuid(ExitInfo *ei)
 {
 	u32int ax, bx, cx, dx;
-	CPUID *cp;
-	static CPUID def;
-	
+	CPUID cp;
+
 	ax = rget(RAX);
-	cp = getcpuid(ax);
-	if(cp == nil) cp = &def;
+	cx = rget(RCX);
+	bx = dx = 0;
+	cp = getcpuid(ax, cx);
 	switch(ax){
-	case 0: /* highest register & GenuineIntel */
-		ax = maxcpuid;
-		bx = cp->bx;
-		dx = cp->dx;
-		cx = cp->cx;
+	case 0x00: /* highest register & GenuineIntel */
+		ax = MIN(cpuidmax, 0x18);
+		bx = cp.bx;
+		dx = cp.dx;
+		cx = cp.cx;
 		break;
-	case 1: /* features */
-		ax = cp->ax;
-		bx = cp->bx & 0xffff;
-		cx = cp->cx & 0x60de2203;
-		dx = cp->dx & 0x0782a179;
+	case 0x01: /* features */
+		ax = cp.ax;
+		bx = cp.bx & 0xffff;
+		/* some features removed, hypervisor added */
+		cx = cp.cx & 0x76de3217 | 0x80000000UL;
+		dx = cp.dx & 0x0f8aa579;
+		if(leaf1.cx & 1<<27){
+			if(rget("cr4real") & Cr4Osxsave)
+				cx |= 1<<27;
+		}else{
+			cx &= ~0x1c000000;
+		}
 		break;
-	case 2: goto literal; /* cache stuff */
-	case 3: goto zero; /* processor serial number */
-	case 4: goto zero; /* cache stuff */
-	case 5: goto zero; /* monitor/mwait */
-	case 6: goto zero; /* thermal management */
-	case 7: goto zero; /* more features */
-	case 10: goto zero; /* performance counters */
+	case 0x02: goto literal; /* cache stuff */
+	case 0x03: goto zero; /* processor serial number */
+	case 0x04: goto literal; /* cache stuff */
+	case 0x05: goto zero; /* monitor/mwait */
+	case 0x06: goto zero; /* thermal management */
+	case 0x07: /* more features */
+		if(cx == 0){
+			ax = 0;
+			bx = cp.bx & 0x2369;
+			cx = 0;
+			if((leaf1.cx & 1<<27) == 0)
+				bx &= ~0xdc230020;
+		}else{
+			goto zero;
+		}
+		break;
+	case 0x08: goto zero;
+	case 0x09: goto literal; /* direct cache access */
+	case 0x0a: goto zero; /* performance counters */
+	case 0x0b: goto zero; /* extended topology */
+	case 0x0c: goto zero;
+	case 0x0d: /* extended state */
+		if((leaf1.cx & 1<<27) == 0)
+			goto zero;
+		if(cx == 0){ /* main leaf */
+			ax = cp.ax & 7; /* x87, sse, avx */
+			bx = xsavesz[rget("xcr0")]; /* current xsave size */
+			cx = xsavesz[ax]; /* max xsave size */
+		}else if(cx == 1){ /* sub leaf */
+			ax = cp.ax & 7; /* xsaveopt, xsavec, xgetbv1 */
+			bx = xsavesz[rget("xcr0")];
+			cx = 0;
+		}else if(cx == 2){
+			ax = xsavesz[7] - xsavesz[3];
+			bx = xsavesz[3];
+			cx = 0;
+		}else{
+			goto zero;
+		}
+		break;
+	case 0x0f: goto zero; /* RDT */
+	case 0x10: goto zero; /* RDT */
+	case 0x12: goto zero; /* SGX */
+	case 0x14: goto zero; /* PT */
+	case 0x15: goto zero; /* TSC */
+	case 0x16: goto zero; /* cpu clock */
+	case 0x17: goto zero; /* SoC */
+	case 0x18: goto literal; /* pages, tlb */
+
+	case 0x40000000: /* hypervisor */
+		ax = 0;
+		bx = 0x4b4d564b; /* act as KVM */
+		cx = 0x564b4d56;
+		dx = 0x4d;
+		break;
+
 	case 0x80000000: /* highest register */
-		ax = 0x80000008;
-		bx = cx = dx = 0;
+		ax = MIN(cpuidmaxext, 0x80000008);
+		cx = 0;
 		break;
 	case 0x80000001: /* signature & ext features */
-		ax = cp->ax;
-		bx = 0;
-		cx = cp->cx & 0x121;
+		ax = cp.ax;
+		cx = cp.cx & 0x121;
 		if(sizeof(uintptr) == 8)
-			dx = cp->dx & 0x24100800;
+			dx = cp.dx & 0x24100800;
 		else
-			dx = cp->dx & 0x04100000;
+			dx = cp.dx & 0x04100000;
 		break;
 	case 0x80000002: goto literal; /* brand string */
 	case 0x80000003: goto literal; /* brand string */
@@ -230,18 +287,16 @@
 	case 0x80000007: goto zero; /* invariant tsc */
 	case 0x80000008: goto literal; /* address bits */
 	literal:
-		ax = cp->ax;
-		bx = cp->bx;
-		cx = cp->cx;
-		dx = cp->dx;
+		ax = cp.ax;
+		bx = cp.bx;
+		cx = cp.cx;
+		dx = cp.dx;
 		break;
 	default:
-		vmerror("unknown cpuid field eax=%#ux", ax);
+		if((ax & 0xf0000000) != 0x40000000)
+			vmerror("unknown cpuid field eax=%#ux", ax);
 	zero:
-		ax = 0;
-		bx = 0;
-		cx = 0;
-		dx = 0;
+		ax = cx = 0;
 		break;
 	}
 	rset(RAX, ax);
@@ -267,6 +322,9 @@
 		else rset("pat", val);
 		break;
 	case 0x8B: val = 0; break; /* microcode update */
+	case 0x1A0: /* IA32_MISC_ENABLE */
+		if(rd) val = msr.miscen;
+		break;
 	default:
 		if(rd){
 			vmerror("read from unknown MSR %#ux ignored", cx);
@@ -373,6 +431,26 @@
 	irqack(ei->qual);
 }
 
+static void
+xsetbv(ExitInfo *ei) /* handle a guest XSETBV exit: post an exception, or store the value as "xcr0" and skip the instruction */
+{
+	uvlong v;
+
+	/* this should also #ud if LOCK prefix is used */
+
+	v = rget(RAX)&0xffffffff | rget(RDX)<<32;
+	if((leaf1.cx & 1<<26) == 0 || (rget("cr4real") & Cr4Osxsave) == 0)
+		postexc("#ud", NOERRC); /* tested first: #ud takes priority over the #gp cases below */
+	else if(rget(RCX) & 0xffffffff)
+		postexc("#gp", 0); /* only register index 0 (xcr0) is accepted */
+	else if(v != 1 && v != 3 && v != 7)
+		postexc("#gp", 0); /* accepted values: x87, x87+sse, x87+sse+avx */
+	else{
+		rset("xcr0", v);
+		skipinstr(ei);
+	}
+}
+
 typedef struct ExitType ExitType;
 struct ExitType {
 	char *name;
@@ -389,6 +467,7 @@
 	{".movdr", movdr},
 	{"#db", dbgexc},
 	{"movcr", movcr},
+	{".xsetbv", xsetbv},
 };
 
 void
--- a/sys/src/cmd/vmx/fns.h
+++ b/sys/src/cmd/vmx/fns.h
@@ -1,3 +1,4 @@
+#define MIN(a,b) ((a)<(b)?(a):(b))
 void *emalloc(ulong);
 void loadkernel(char *);
 uvlong rget(char *);
--- a/sys/src/cmd/vmx/x86.h
+++ b/sys/src/cmd/vmx/x86.h
@@ -22,8 +22,9 @@
 enum {
 	Cr0Pg	= 1<<31,
 	
-	Cr4Pse	= 1<<4,
-	Cr4Pae	= 1<<5,
+	Cr4Pse		= 1<<4,
+	Cr4Pae		= 1<<5,
+	Cr4Osxsave	= 1<<18,
 	
 	EferLme	= 1<<8,
 };