shithub: riscv

--- a/sys/src/9/pc/fns.h

+++ b/sys/src/9/pc/fns.h

@@ -41,12 +41,6 @@

 void	fpoff(void);

 void	(*fprestore)(FPsave*);

 void	(*fpsave)(FPsave*);

-void	fpsserestore(FPsave*);

-void	fpssesave(FPsave*);

-void	fpx87restore(FPsave*);

-void	fpx87restore0(FPsave*);

-void	fpx87save(FPsave*);

-void	fpx87save0(FPsave*);

 ulong	getcr0(void);

 ulong	getcr2(void);

 ulong	getcr3(void);

--- a/sys/src/9/pc/fpu.c

+++ b/sys/src/9/pc/fpu.c

@@ -3,6 +3,8 @@

 #include "mem.h"

 #include "dat.h"

 #include "fns.h"

+#include "io.h"

+#include "ureg.h"

 enum {

 	CR4Osfxsr  = 1 << 9,

@@ -9,11 +11,286 @@

 	CR4Oxmmex  = 1 << 10,

};

+/* from l.s */

+extern void fpsserestore(FPsave*);

+extern void fpssesave(FPsave*);

+extern void fpx87restore0(FPsave*);

+extern void fpx87save0(FPsave*);

 void

 putxcr0(ulong)

+/*

+ * we keep FPsave structure in SSE format emulating FXSAVE / FXRSTOR

+ * instructions for legacy x87 fpu.

+ *

+ * fpx87save lets fpx87save0() store the x87 image into *fps and then

+ * rewrites it in place: the tag word is packed into a tag byte,

+ * status/opcode/pc/selector/operand/oselector are copied to

+ * fsw/fop/fpuip/cs/fpudp/ds, each 10-byte register in regs is moved

+ * to its 16-byte slot in xregs and the 6 bytes after it are cleared,

+ * and rsrvd1, rsrvd2, mxcsr and mxcsr_mask are zeroed.

+ *

+ * registers are moved highest first and, within a register, from

+ * the top bytes down.  NOTE(review): that order suggests regs and

+ * xregs overlap inside FPsave -- confirm against dat.h.

+ */

+static void

+fpx87save(FPsave *fps)

+{

+	ushort tag;

+	fpx87save0(fps);

+	/*

+	 * convert x87 tag word to fxsave tag byte:

+	 * 00, 01, 10 -> 1, 11 -> 0

+	 */

+	tag = ~fps->tag;	/* after inversion only an empty (11) pair is 00 */

+	tag = (tag | (tag >> 1)) & 0x5555;	/* one bit per pair, in the even positions */

+	tag = (tag | (tag >> 1)) & 0x3333;	/* pack the 8 bits: pairs of bits */

+	tag = (tag | (tag >> 2)) & 0x0F0F;	/* ... nibbles */

+	tag = (tag | (tag >> 4)) & 0x00FF;	/* ... all 8 in the low byte */

+	/* NOP fps->fcw = fps->control; */

+	fps->fsw = fps->status;

+	fps->ftw = tag;

+	fps->fop = fps->opcode;

+	fps->fpuip = fps->pc;

+	fps->cs = fps->selector;

+	fps->fpudp = fps->operand;

+	fps->ds = fps->oselector;

+#define MOVA(d,s) \

+	*((ushort*)(d+8)) = *((ushort*)(s+8)), \

+	*((ulong*)(d+4)) = *((ulong*)(s+4)), \

+	*((ulong*)(d)) = *((ulong*)(s))

+	MOVA(fps->xregs+0x70, fps->regs+70);

+	MOVA(fps->xregs+0x60, fps->regs+60);

+	MOVA(fps->xregs+0x50, fps->regs+50);

+	MOVA(fps->xregs+0x40, fps->regs+40);

+	MOVA(fps->xregs+0x30, fps->regs+30);

+	MOVA(fps->xregs+0x20, fps->regs+20);

+	MOVA(fps->xregs+0x10, fps->regs+10);

+	MOVA(fps->xregs+0x00, fps->regs+00);

+#undef MOVA

+#define CLR6(d)	\

+	*((ulong*)(d)) = 0, \

+	*((ushort*)(d+4)) = 0

+	CLR6(fps->xregs+0x70+10);	/* bytes 10..15 of each 16-byte slot */

+	CLR6(fps->xregs+0x60+10);

+	CLR6(fps->xregs+0x50+10);

+	CLR6(fps->xregs+0x40+10);

+	CLR6(fps->xregs+0x30+10);

+	CLR6(fps->xregs+0x20+10);

+	CLR6(fps->xregs+0x10+10);

+	CLR6(fps->xregs+0x00+10);

+#undef CLR6

+	fps->rsrvd1 = fps->rsrvd2 = fps->mxcsr = fps->mxcsr_mask = 0;

+}

+/*

+ * fpx87restore: counterpart of fpx87save.  rebuilds the x87 image

+ * inside *fps from the FXSAVE-style fields and then hands it to

+ * fpx87restore0(): the tag byte is expanded to a tag word, the eight

+ * registers are moved from their 16-byte xregs slots back to the

+ * 10-byte regs layout (lowest first), fsw/fop/fpuip/cs/fpudp/ds are

+ * copied to status/opcode/pc/selector/operand/oselector, and

+ * r1..r4 are zeroed.

+ */

+static void

+fpx87restore(FPsave *fps)

+{

+	ushort msk, tos, tag, *reg;

+	/*

+	 * convert fxsave tag byte to x87 tag word.

+	 * msk walks the tag byte from bit 7 down to bit 0 while tag is

+	 * shifted left two bits per step.  tos starts at 7 - TOP

+	 * (TOP = bits 11-13 of fsw) and selects the xregs slot that

+	 * is inspected for the current bit.  a clear bit gives 11

+	 * (empty); for a set bit the register contents decide:

+	 *	exponent 0, significand 0	01 zero

+	 *	exponent 0, significand != 0	10 special

+	 *	exponent all ones		10 special

+	 *	otherwise, integer bit set	00 valid

+	 *	otherwise, integer bit clear	10 special

+	 */

+	tag = 0;

+	tos = 7 - ((fps->fsw >> 11) & 7);

+	for(msk = 0x80; msk != 0; tos--, msk >>= 1){

+		tag <<= 2;

+		if((fps->ftw & msk) != 0){

+			reg = (ushort*)&fps->xregs[(tos & 7) << 4];

+			switch(reg[4] & 0x7fff){	/* exponent, sign masked off */

+			case 0x0000:

+				if((reg[0] | reg[1] | reg[2] | reg[3]) == 0){

+					tag |= 1;	/* 01 zero */

+					break;

+				}

+				/* no break */

+			case 0x7fff:

+				tag |= 2;		/* 10 special */

+				break;

+			default:

+				/*

+				 * bit 63 is the explicit integer bit: a

+				 * register with it set is valid, one with

+				 * it clear is special.  the test used to

+				 * be the other way round.

+				 */

+				if((reg[3] & 0x8000) != 0)

+					break;		/* 00 valid */

+				tag |= 2;		/* 10 special */

+				break;

+			}

+		}else{

+			tag |= 3;			/* 11 empty */

+		}

+	}

+#define MOVA(d,s) \

+	*((ulong*)(d)) = *((ulong*)(s)), \

+	*((ulong*)(d+4)) = *((ulong*)(s+4)), \

+	*((ushort*)(d+8)) = *((ushort*)(s+8))

+	MOVA(fps->regs+00, fps->xregs+0x00);

+	MOVA(fps->regs+10, fps->xregs+0x10);

+	MOVA(fps->regs+20, fps->xregs+0x20);

+	MOVA(fps->regs+30, fps->xregs+0x30);

+	MOVA(fps->regs+40, fps->xregs+0x40);

+	MOVA(fps->regs+50, fps->xregs+0x50);

+	MOVA(fps->regs+60, fps->xregs+0x60);

+	MOVA(fps->regs+70, fps->xregs+0x70);

+#undef MOVA

+	fps->oselector = fps->ds;

+	fps->operand = fps->fpudp;

+	fps->opcode = fps->fop & 0x7ff;

+	fps->selector = fps->cs;

+	fps->pc = fps->fpuip;

+	fps->tag = tag;

+	fps->status = fps->fsw;

+	/* NOP fps->control = fps->fcw;  */

+	fps->r1 = fps->r2 = fps->r3 = fps->r4 = 0;

+	fpx87restore0(fps);

+}

+static char* mathmsg[] =	/* indexed by status bit number, see mathnote */

+{

+	nil,	/* bit 0: handled separately in mathnote */

+	"denormalized operand",	/* bit 1 */

+	"division by zero",	/* bit 2 */

+	"numeric overflow",	/* bit 3 */

+	"numeric underflow",	/* bit 4 */

+	"precision loss",	/* bit 5 */

+};

+static void	/* post a "sys: fp: ..." note to up for the exception bits in status */

+mathnote(ulong status, ulong pc)

+{

+	char *msg, note[ERRMAX];

+	int i;

+	/*

+	 * Some attention should probably be paid here to the

+	 * exception masks and error summary.

+	 */

+	msg = "unknown exception";

+	for(i = 1; i <= 5; i++){	/* lowest set bit among bits 1..5 picks the text */

+		if(!((1<<i) & status))

+			continue;

+		msg = mathmsg[i];

+		break;

+	}

+	if(status & 0x01){	/* bit 0 overrides whatever the loop chose */

+		if(status & 0x40){	/* with bit 6 set it is a stack fault; bit 9 gives the direction */

+			if(status & 0x200)

+				msg = "stack overflow";

+			else

+				msg = "stack underflow";

+		}else

+			msg = "invalid operation";

+	}

+	snprint(note, sizeof note, "sys: fp: %s fppc=0x%lux status=0x%lux",

+		msg, pc, status);

+	postnote(up, 1, note, NDebug);

+}

+/*

+ *  math coprocessor error

+ *

+ *  installed by mathinit for VectorCERR and, on family 3 cpus,

+ *  for IrqIRQ13.  saves the fpu state into up->fpsave, marks the

+ *  process FPinactive and posts a note built from the saved

+ *  fsw and fpuip.

+ */

+static void

+matherror(Ureg*, void*)

+{

+	/*

+	 *  a write cycle to port 0xF0 clears the interrupt latch attached

+	 *  to the error# line from the 387

+	 */

+	if(!(m->cpuiddx & Fpuonchip))

+		outb(0xF0, 0xFF);

+	/*

+	 *  get floating point state to check out error

+	 */

+	fpsave(up->fpsave);

+	up->fpstate = FPinactive;

+	mathnote(up->fpsave->fsw, up->fpsave->fpuip);

+}

+/*

+ *  SIMD error

+ *

+ *  installed by mathinit for VectorSIMD.  saves the fpu state into

+ *  up->fpsave, marks the process FPinactive and posts a note for

+ *  the low six bits of the saved mxcsr, using the pc from ureg.

+ */

+static void

+simderror(Ureg *ureg, void*)

+{

+	fpsave(up->fpsave);

+	up->fpstate = FPinactive;

+	mathnote(up->fpsave->mxcsr & 0x3f, ureg->pc);

+}

+/*

+ *  math coprocessor emulation fault

+ *

+ *  installed by mathinit for VectorCNA.  with FPillegal set in

+ *  up->fpstate only a note is posted.  otherwise, by state:

+ *	FPinit		initialise the fpu, allocate up->fpsave if it

+ *			is still nil and become FPactive

+ *	FPinactive	post a note if the saved state has an exception

+ *			pending, else fprestore and become FPactive

+ *	FPactive	panic

+ */

+static void

+mathemu(Ureg *ureg, void*)

+{

+	ulong status, control;

+	if(up->fpstate & FPillegal){

+		/* someone did floating point in a note handler */

+		postnote(up, 1, "sys: floating point in note handler", NDebug);

+		return;

+	}

+	switch(up->fpstate){

+	case FPinit:

+		fpinit();

+		if(fpsave == fpssesave)

+			ldmxcsr(0x1f80);	/* no simd exceptions on 386 */

+		while(up->fpsave == nil)	/* retry until the allocation succeeds */

+			up->fpsave = mallocalign(sizeof(FPsave), FPalign, 0, 0);

+		up->fpstate = FPactive;

+		break;

+	case FPinactive:

+		/*

+		 * Before restoring the state, check for any pending

+		 * exceptions, there's no way to restore the state without

+		 * generating an unmasked exception.

+		 * More attention should probably be paid here to the

+		 * exception masks and error summary.

+		 */

+		status = up->fpsave->fsw;

+		control = up->fpsave->fcw;

+		if((status & ~control) & 0x07F){	/* a low-7 status bit set whose control bit is clear */

+			mathnote(status, up->fpsave->fpuip);

+			break;

+		}

+		fprestore(up->fpsave);

+		up->fpstate = FPactive;

+		break;

+	case FPactive:

+		panic("math emu pid %ld %s pc 0x%lux",

+			up->pid, up->text, ureg->pc);

+		break;

+	}

+}

+/*

+ *  math coprocessor segment overrun

+ *

+ *  installed by mathinit for VectorCSO; ends the current

+ *  process through pexit.

+ */

+static void

+mathover(Ureg*, void*)

+{

+	pexit("math overrun", 0);

+}

+void	/* install the fpu trap handlers defined in this file */

+mathinit(void)

+{

+	trapenable(VectorCERR, matherror, 0, "matherror");

+	if(m->cpuidfamily == 3)	/* family 3 also gets matherror on IRQ13 */

+		intrenable(IrqIRQ13, matherror, 0, BUSUNKNOWN, "matherror");

+	trapenable(VectorCNA, mathemu, 0, "mathemu");

+	trapenable(VectorCSO, mathover, 0, "mathover");

+	trapenable(VectorSIMD, simderror, 0, "simderror");

+}

+/*

+ * fpuinit(), called from cpuidentify() for each cpu.

+ */

 void

 fpuinit(void)

--- a/sys/src/9/pc/main.c

+++ b/sys/src/9/pc/main.c

@@ -234,272 +234,6 @@

/*

- * we keep FPsave structure in SSE format emulating FXSAVE / FXRSTOR

- * instructions for legacy x87 fpu.

- */

-void

-fpx87save(FPsave *fps)

-{

-	ushort tag;

-	fpx87save0(fps);

-	/*

-	 * convert x87 tag word to fxsave tag byte:

-	 * 00, 01, 10 -> 1, 11 -> 0

-	 */

-	tag = ~fps->tag;

-	tag = (tag | (tag >> 1)) & 0x5555;

-	tag = (tag | (tag >> 1)) & 0x3333;

-	tag = (tag | (tag >> 2)) & 0x0F0F;

-	tag = (tag | (tag >> 4)) & 0x00FF;

-	/* NOP fps->fcw = fps->control; */

-	fps->fsw = fps->status;

-	fps->ftw = tag;

-	fps->fop = fps->opcode;

-	fps->fpuip = fps->pc;

-	fps->cs = fps->selector;

-	fps->fpudp = fps->operand;

-	fps->ds = fps->oselector;

-#define MOVA(d,s) \

-	*((ushort*)(d+8)) = *((ushort*)(s+8)), \

-	*((ulong*)(d+4)) = *((ulong*)(s+4)), \

-	*((ulong*)(d)) = *((ulong*)(s))

-	MOVA(fps->xregs+0x70, fps->regs+70);

-	MOVA(fps->xregs+0x60, fps->regs+60);

-	MOVA(fps->xregs+0x50, fps->regs+50);

-	MOVA(fps->xregs+0x40, fps->regs+40);

-	MOVA(fps->xregs+0x30, fps->regs+30);

-	MOVA(fps->xregs+0x20, fps->regs+20);

-	MOVA(fps->xregs+0x10, fps->regs+10);

-	MOVA(fps->xregs+0x00, fps->regs+00);

-#undef MOVA

-#define CLR6(d)	\

-	*((ulong*)(d)) = 0, \

-	*((ushort*)(d+4)) = 0

-	CLR6(fps->xregs+0x70+10);

-	CLR6(fps->xregs+0x60+10);

-	CLR6(fps->xregs+0x50+10);

-	CLR6(fps->xregs+0x40+10);

-	CLR6(fps->xregs+0x30+10);

-	CLR6(fps->xregs+0x20+10);

-	CLR6(fps->xregs+0x10+10);

-	CLR6(fps->xregs+0x00+10);

-#undef CLR6

-	fps->rsrvd1 = fps->rsrvd2 = fps->mxcsr = fps->mxcsr_mask = 0;

-}

-void

-fpx87restore(FPsave *fps)

-{

-	ushort msk, tos, tag, *reg;

-	/* convert fxsave tag byte to x87 tag word */

-	tag = 0;

-	tos = 7 - ((fps->fsw >> 11) & 7);

-	for(msk = 0x80; msk != 0; tos--, msk >>= 1){

-		tag <<= 2;

-		if((fps->ftw & msk) != 0){

-			reg = (ushort*)&fps->xregs[(tos & 7) << 4];

-			switch(reg[4] & 0x7fff){

-			case 0x0000:

-				if((reg[0] | reg[1] | reg[2] | reg[3]) == 0){

-					tag |= 1;	/* 01 zero */

-					break;

-				}

-				/* no break */

-			case 0x7fff:

-				tag |= 2;		/* 10 special */

-				break;

-			default:

-				if((reg[3] & 0x8000) == 0)

-					break;		/* 00 valid */

-				tag |= 2;		/* 10 special */

-				break;

-			}

-		}else{

-			tag |= 3;			/* 11 empty */

-		}

-	}

-#define MOVA(d,s) \

-	*((ulong*)(d)) = *((ulong*)(s)), \

-	*((ulong*)(d+4)) = *((ulong*)(s+4)), \

-	*((ushort*)(d+8)) = *((ushort*)(s+8))

-	MOVA(fps->regs+00, fps->xregs+0x00);

-	MOVA(fps->regs+10, fps->xregs+0x10);

-	MOVA(fps->regs+20, fps->xregs+0x20);

-	MOVA(fps->regs+30, fps->xregs+0x30);

-	MOVA(fps->regs+40, fps->xregs+0x40);

-	MOVA(fps->regs+50, fps->xregs+0x50);

-	MOVA(fps->regs+60, fps->xregs+0x60);

-	MOVA(fps->regs+70, fps->xregs+0x70);

-#undef MOVA

-	fps->oselector = fps->ds;

-	fps->operand = fps->fpudp;

-	fps->opcode = fps->fop & 0x7ff;

-	fps->selector = fps->cs;

-	fps->pc = fps->fpuip;

-	fps->tag = tag;

-	fps->status = fps->fsw;

-	/* NOP fps->control = fps->fcw;  */

-	fps->r1 = fps->r2 = fps->r3 = fps->r4 = 0;

-	fpx87restore0(fps);

-}

-static char* mathmsg[] =

-{

-	nil,	/* handled below */

-	"denormalized operand",

-	"division by zero",

-	"numeric overflow",

-	"numeric underflow",

-	"precision loss",

-};

-static void

-mathnote(ulong status, ulong pc)

-{

-	char *msg, note[ERRMAX];

-	int i;

-	/*

-	 * Some attention should probably be paid here to the

-	 * exception masks and error summary.

-	 */

-	msg = "unknown exception";

-	for(i = 1; i <= 5; i++){

-		if(!((1<<i) & status))

-			continue;

-		msg = mathmsg[i];

-		break;

-	}

-	if(status & 0x01){

-		if(status & 0x40){

-			if(status & 0x200)

-				msg = "stack overflow";

-			else

-				msg = "stack underflow";

-		}else

-			msg = "invalid operation";

-	}

-	snprint(note, sizeof note, "sys: fp: %s fppc=0x%lux status=0x%lux",

-		msg, pc, status);

-	postnote(up, 1, note, NDebug);

-}

-/*

- *  math coprocessor error

- */

-static void

-matherror(Ureg*, void*)

-{

-	/*

-	 *  a write cycle to port 0xF0 clears the interrupt latch attached

-	 *  to the error# line from the 387

-	 */

-	if(!(m->cpuiddx & Fpuonchip))

-		outb(0xF0, 0xFF);

-	/*

-	 *  get floating point state to check out error

-	 */

-	fpsave(up->fpsave);

-	up->fpstate = FPinactive;

-	mathnote(up->fpsave->fsw, up->fpsave->fpuip);

-}

-/*

- *  SIMD error

- */

-static void

-simderror(Ureg *ureg, void*)

-{

-	fpsave(up->fpsave);

-	up->fpstate = FPinactive;

-	mathnote(up->fpsave->mxcsr & 0x3f, ureg->pc);

-}

-/*

- *  math coprocessor emulation fault

- */

-static void

-mathemu(Ureg *ureg, void*)

-{

-	ulong status, control;

-	if(up->fpstate & FPillegal){

-		/* someone did floating point in a note handler */

-		postnote(up, 1, "sys: floating point in note handler", NDebug);

-		return;

-	}

-	switch(up->fpstate){

-	case FPinit:

-		fpinit();

-		if(fpsave == fpssesave)

-			ldmxcsr(0x1f80);	/* no simd exceptions on 386 */

-		while(up->fpsave == nil)

-			up->fpsave = mallocalign(sizeof(FPsave), FPalign, 0, 0);

-		up->fpstate = FPactive;

-		break;

-	case FPinactive:

-		/*

-		 * Before restoring the state, check for any pending

-		 * exceptions, there's no way to restore the state without

-		 * generating an unmasked exception.

-		 * More attention should probably be paid here to the

-		 * exception masks and error summary.

-		 */

-		status = up->fpsave->fsw;

-		control = up->fpsave->fcw;

-		if((status & ~control) & 0x07F){

-			mathnote(status, up->fpsave->fpuip);

-			break;

-		}

-		fprestore(up->fpsave);

-		up->fpstate = FPactive;

-		break;

-	case FPactive:

-		panic("math emu pid %ld %s pc 0x%lux",

-			up->pid, up->text, ureg->pc);

-		break;

-	}

-}

-/*

- *  math coprocessor segment overrun

- */

-static void

-mathover(Ureg*, void*)

-{

-	pexit("math overrun", 0);

-}

-void

-mathinit(void)

-{

-	trapenable(VectorCERR, matherror, 0, "matherror");

-	if(m->cpuidfamily == 3)

-		intrenable(IrqIRQ13, matherror, 0, BUSUNKNOWN, "matherror");

-	trapenable(VectorCNA, mathemu, 0, "mathemu");

-	trapenable(VectorCSO, mathover, 0, "mathover");

-	trapenable(VectorSIMD, simderror, 0, "simderror");

-}

-/*

  *  set up floating point for a new process

*/

 void

--- a/sys/src/9/pc64/fns.h

+++ b/sys/src/9/pc64/fns.h

@@ -39,15 +39,10 @@

 void	fpinit(void);

 void	(*fprestore)(FPsave*);

 void	(*fpsave)(FPsave*);

-void	fpsserestore(FPsave*);

-void	fpssesave(FPsave*);

-void	fpxrestore(FPsave*);

-void	fpxrestores(FPsave*);

-void	fpxsave(FPsave*);

-void	fpxsaveopt(FPsave*);

-void	fpxsaves(FPsave*);

-void	fpx87restore(FPsave*);

-void	fpx87save(FPsave*);

+void	fpuprocsetup(Proc*);

+void	fpuprocfork(Proc*);

+void	fpuprocsave(Proc*);

+void	fpuprocrestore(Proc*);

 int	fpusave(void);

 void	fpurestore(int);

 u64int	getcr0(void);

--- a/sys/src/9/pc64/fpu.c

+++ b/sys/src/9/pc64/fpu.c

@@ -3,6 +3,8 @@

 #include "mem.h"

 #include "dat.h"

 #include "fns.h"

+#include "ureg.h"

+#include "io.h"

 enum {

 	CR4Osfxsr  = 1 << 9,

@@ -10,7 +12,253 @@

 	CR4Oxsave  = 1 << 18,

};

+/*

+ * SIMD Floating Point.

+ * Assembler support to get at the individual instructions

+ * is in l.s.

+ */

+extern void _clts(void);

+extern void _fldcw(u16int);

+extern void _fnclex(void);

+extern void _fninit(void);

+extern void _fxrstor(void*);

+extern void _fxsave(void*);

+extern void _xrstor(void*);

+extern void _xsave(void*);

+extern void _xsaveopt(void*);

+extern void _fwait(void);

+extern void _ldmxcsr(u32int);

+extern void _stts(void);

+/*

+ * not used, AMD64 mandated SSE

+ *

+ * empty stub: does nothing with its argument.  fpuinit still

+ * names it when assigning fpsave in one of its branches.

+ */

+static void

+fpx87save(FPsave*)

+{

+}

+static void	/* empty stub, counterpart of fpx87save; does nothing */

+fpx87restore(FPsave*)

+{

+}

+static void	/* store state into *s with _fxsave, then _stts() (l.s; presumably sets CR0.TS -- confirm) */

+fpssesave(FPsave *s)

+{

+	_fxsave(s);

+	_stts();

+}

+static void	/* _clts() first (l.s; presumably clears CR0.TS -- confirm), then load *s with _fxrstor */

+fpsserestore(FPsave *s)

+{

+	_clts();

+	_fxrstor(s);

+}

+static void	/* store state into *s with _xsave, then _stts() */

+fpxsave(FPsave *s)

+{

+	_xsave(s);

+	_stts();

+}

+static void	/* _clts(), then load *s with _xrstor */

+fpxrestore(FPsave *s)

+{

+	_clts();

+	_xrstor(s);

+}

+static void	/* as written, same body as fpxsaveopt: _xsaveopt then _stts() */

+fpxsaves(FPsave *s)

+{

+	_xsaveopt(s);

+	_stts();

+}

+static void	/* as written, same body as fpxrestore: _clts() then _xrstor */

+fpxrestores(FPsave *s)

+{

+	_clts();

+	_xrstor(s);

+}

+static void	/* store state into *s with _xsaveopt, then _stts() */

+fpxsaveopt(FPsave *s)

+{

+	_xsaveopt(s);

+	_stts();

+}

+static char* mathmsg[] =	/* indexed by status bit number, see mathnote */

+{

+	nil,	/* bit 0: handled separately in mathnote */

+	"denormalized operand",	/* bit 1 */

+	"division by zero",	/* bit 2 */

+	"numeric overflow",	/* bit 3 */

+	"numeric underflow",	/* bit 4 */

+	"precision loss",	/* bit 5 */

+};

+static void	/* post a "sys: fp: ..." note to up for the exception bits in status */

+mathnote(ulong status, uintptr pc)

+{

+	char *msg, note[ERRMAX];

+	int i;

+	/*

+	 * Some attention should probably be paid here to the

+	 * exception masks and error summary.

+	 */

+	msg = "unknown exception";

+	for(i = 1; i <= 5; i++){	/* lowest set bit among bits 1..5 picks the text */

+		if(!((1<<i) & status))

+			continue;

+		msg = mathmsg[i];

+		break;

+	}

+	if(status & 0x01){	/* bit 0 overrides whatever the loop chose */

+		if(status & 0x40){	/* with bit 6 set it is a stack fault; bit 9 gives the direction */

+			if(status & 0x200)

+				msg = "stack overflow";

+			else

+				msg = "stack underflow";

+		}else

+			msg = "invalid operation";

+	}

+	snprint(note, sizeof note, "sys: fp: %s fppc=%#p status=0x%lux",

+		msg, pc, status);

+	postnote(up, 1, note, NDebug);

+}

+/*

+ *  math coprocessor error

+ *

+ *  installed by mathinit for VectorCERR and, on family 3 cpus,

+ *  for IrqIRQ13.  the state becomes FPinactive with the FPnouser,

+ *  FPkernel and FPindexm bits of the old state kept, and a note

+ *  is posted from the saved fsw and rip.

+ */

+static void

+matherror(Ureg *, void*)

+{

+	/*

+	 * Save FPU state to check out the error.

+	 */

+	fpsave(up->fpsave);

+	up->fpstate = FPinactive | (up->fpstate & (FPnouser|FPkernel|FPindexm));

+	mathnote(up->fpsave->fsw, up->fpsave->rip);

+}

+/*

+ *  SIMD error

+ *

+ *  installed by mathinit for VectorSIMD.  saves the fpu state into

+ *  up->fpsave, sets the state to FPinactive keeping the FPnouser,

+ *  FPkernel and FPindexm bits, and posts a note for the low six

+ *  bits of the saved mxcsr, using the pc from ureg.

+ */

+static void

+simderror(Ureg *ureg, void*)

+{

+	fpsave(up->fpsave);

+	up->fpstate = FPinactive | (up->fpstate & (FPnouser|FPkernel|FPindexm));

+	mathnote(up->fpsave->mxcsr & 0x3f, ureg->pc);

+}

 void

+fpinit(void)

+{

+	/*

+	 * A process tries to use the FPU for the

+	 * first time and generates a 'device not available'

+	 * exception.

+	 * Turn the FPU on and initialise it for use.

+	 * Set the precision and mask the exceptions

+	 * we don't care about from the generic Mach value.

+	 *

+	 * Called from mathemu() in this file.  The two constants

+	 * below have the same three exception-mask bits set

+	 * (denormal, underflow, precision) in the x87 control

+	 * word and in MXCSR respectively; NOTE(review): bit

+	 * meanings taken from the Intel SDM, confirm there.

+	 */

+	_clts();

+	_fninit();

+	_fwait();

+	_fldcw(0x0232);	/* bits 1, 4, 5 and 9 set */

+	_ldmxcsr(0x1900);	/* bits 8, 11 and 12 set */

+}

+/*

+ *  math coprocessor emulation fault

+ *

+ *  installed by mathinit for VectorCNA.  with FPillegal set in

+ *  up->fpstate only a note is posted.  otherwise the state, with

+ *  the FPnouser, FPkernel and FPindexm bits masked off, selects:

+ *	FPactive|FPpush		save the live state into up->fpsave,

+ *				then continue as FPinactive|FPpush

+ *	FPinactive|FPpush	advance the slot index by FPindex1,

+ *				then continue as FPinit

+ *	FPinit|FPpush, FPinit	initialise the fpu, pick (and if

+ *				needed allocate) up->fpslot[index],

+ *				point up->fpsave at it, become FPactive

+ *	FPinactive		post a note if the saved state has an

+ *				exception pending, else fprestore and

+ *				become FPactive

+ *	FPactive		panic

+ *  a trap from user mode must find index 0; a trap from the kernel

+ *  sets FPkernel, and also FPnouser when the index is 0.

+ */

+static void

+mathemu(Ureg *ureg, void*)

+{

+	ulong status, control;

+	int index;

+	if(up->fpstate & FPillegal){

+		/* someone did floating point in a note handler */

+		postnote(up, 1, "sys: floating point in note handler", NDebug);

+		return;

+	}

+	switch(up->fpstate & ~(FPnouser|FPkernel|FPindexm)){

+	case FPactive	| FPpush:

+		_clts();

+		fpsave(up->fpsave);	/* no break: falls into the next case */

+	case FPinactive	| FPpush:

+		up->fpstate += FPindex1;	/* no break: falls into the next case */

+	case FPinit	| FPpush:

+	case FPinit:

+		fpinit();

+		index = up->fpstate >> FPindexs;

+		if(index < 0 || index > (FPindexm>>FPindexs))

+			panic("fpslot index overflow: %d", index);

+		if(userureg(ureg)){

+			if(index != 0)

+				panic("fpslot index %d != 0 for user", index);

+		} else {

+			if(index == 0)

+				up->fpstate |= FPnouser;

+			up->fpstate |= FPkernel;

+		}

+		while(up->fpslot[index] == nil)	/* retry until the allocation succeeds */

+			up->fpslot[index] = mallocalign(sizeof(FPsave), FPalign, 0, 0);

+		up->fpsave = up->fpslot[index];

+		up->fpstate = FPactive | (up->fpstate & (FPnouser|FPkernel|FPindexm));

+		break;

+	case FPinactive:

+		/*

+		 * Before restoring the state, check for any pending

+		 * exceptions, there's no way to restore the state without

+		 * generating an unmasked exception.

+		 * More attention should probably be paid here to the

+		 * exception masks and error summary.

+		 */

+		status = up->fpsave->fsw;

+		control = up->fpsave->fcw;

+		if((status & ~control) & 0x07F){	/* a low-7 status bit set whose control bit is clear */

+			mathnote(status, up->fpsave->rip);

+			break;

+		}

+		fprestore(up->fpsave);

+		up->fpstate = FPactive | (up->fpstate & (FPnouser|FPkernel|FPindexm));

+		break;

+	case FPactive:

+		panic("math emu pid %ld %s pc %#p",

+			up->pid, up->text, ureg->pc);

+		break;

+	}

+}

+/*

+ *  math coprocessor segment overrun

+ *

+ *  installed by mathinit for VectorCSO; ends the current

+ *  process through pexit.

+ */

+static void

+mathover(Ureg*, void*)

+{

+	pexit("math overrun", 0);

+}

+void	/* install the fpu trap handlers defined in this file */

+mathinit(void)

+{

+	trapenable(VectorCERR, matherror, 0, "matherror");

+	if(m->cpuidfamily == 3)	/* family 3 also gets matherror on IRQ13 */

+		intrenable(IrqIRQ13, matherror, 0, BUSUNKNOWN, "matherror");

+	trapenable(VectorCNA, mathemu, 0, "mathemu");

+	trapenable(VectorCSO, mathover, 0, "mathover");

+	trapenable(VectorSIMD, simderror, 0, "simderror");

+}

+/*

+ * fpuinit(), called from cpuidentify() for each cpu.

+ */

+void

 fpuinit(void)

 	uintptr cr4;

@@ -42,4 +290,101 @@

 		fpsave = fpx87save;

 		fprestore = fpx87restore;

+}

+void	/* reset p to FPinit and call _stts(); this patch has procsetup() call it */

+fpuprocsetup(Proc *p)

+{

+	p->fpstate = FPinit;

+	_stts();

+}

+void	/* give child p a copy of up's slot-0 fpu state; this patch has procfork() call it */

+fpuprocfork(Proc *p)

+{

+	int s;

+	/*

+	 * save floating point state

+	 *

+	 * runs at splhi.  only FPillegal is masked off for the

+	 * switch, so a state carrying any other flag combination

+	 * than the ones listed copies nothing and leaves

+	 * p->fpstate untouched.

+	 */

+	s = splhi();

+	switch(up->fpstate & ~FPillegal){

+	case FPactive	| FPpush:

+		_clts();	/* no break: falls into FPactive */

+	case FPactive:

+		fpsave(up->fpsave);

+		up->fpstate = FPinactive | (up->fpstate & FPpush);	/* no break: go on to copy */

+	case FPactive	| FPkernel:

+	case FPinactive	| FPkernel:

+	case FPinactive	| FPpush:

+	case FPinactive:

+		while(p->fpslot[0] == nil)	/* retry until the allocation succeeds */

+			p->fpslot[0] = mallocalign(sizeof(FPsave), FPalign, 0, 0);

+		memmove(p->fpsave = p->fpslot[0], up->fpslot[0], sizeof(FPsave));

+		p->fpstate = FPinactive;

+	}

+	splx(s);

+}

+void	/* put away p's live fpu state, if any; this patch has procsave() call it */

+fpuprocsave(Proc *p)

+{

+	switch(p->fpstate & ~(FPnouser|FPkernel|FPindexm)){

+	case FPactive	| FPpush:

+		_clts();	/* no break: falls into FPactive */

+	case FPactive:

+		if(p->state == Moribund){	/* exiting: nothing saved, just _fnclex() and _stts() */

+			_fnclex();

+			_stts();

+			break;

+		}

+		/*

+		 * Fpsave() stores without handling pending

+		 * unmasked exceptions. Postnote() can't be called

+		 * here as sleep() already has up->rlock, so

+		 * the handling of pending exceptions is delayed

+		 * until the process runs again and generates an

+		 * emulation fault to activate the FPU.

+		 */

+		fpsave(p->fpsave);

+		p->fpstate = FPinactive | (p->fpstate & ~FPactive);

+		break;

+	}

+}

+void	/* empty: nothing is restored here; this patch has procrestore() call it */

+fpuprocrestore(Proc*)

+{

+}

+/*

+ * Fpusave and fpurestore lazily save and restore FPU state across

+ * system calls and the pagefault handler so that we can take

+ * advantage of SSE instructions such as AES-NI in the kernel.

+ *

+ * Fpusave returns the previous up->fpstate, to be handed back to

+ * fpurestore.  It sets the new state to the old one with FPillegal

+ * cleared and FPpush set, and calls _stts() when the old state,

+ * ignoring FPnouser, FPkernel and FPindexm, was exactly FPactive.

+ */

+int

+fpusave(void)

+{

+	int ostate = up->fpstate;

+	if((ostate & ~(FPnouser|FPkernel|FPindexm)) == FPactive)

+		_stts();

+	up->fpstate = FPpush | (ostate & ~FPillegal);

+	return ostate;

+}

+void

+fpurestore(int ostate)

+{

+	int astate = up->fpstate;

+	if(astate == (FPpush | (ostate & ~FPillegal))){

+		if((ostate & ~(FPnouser|FPkernel|FPindexm)) == FPactive)

+			_clts();

+	} else {

+		if(astate == FPinit)	/* don't restore on procexec()/procsetup() */

+			return;

+		if((astate & ~(FPnouser|FPkernel|FPindexm)) == FPactive)

+			_stts();

+		up->fpsave = up->fpslot[ostate>>FPindexs];

+		if(ostate & FPactive)

+			ostate = FPinactive | (ostate & ~FPactive);

+	}

+	up->fpstate = ostate;

--- a/sys/src/9/pc64/main.c

+++ b/sys/src/9/pc64/main.c

@@ -293,254 +293,10 @@

 	rebootjump((uintptr)entry & (ulong)~0xF0000000UL, PADDR(code), size);

-/*

- * SIMD Floating Point.

- * Assembler support to get at the individual instructions

- * is in l.s.

- */

-extern void _clts(void);

-extern void _fldcw(u16int);

-extern void _fnclex(void);

-extern void _fninit(void);

-extern void _fxrstor(void*);

-extern void _fxsave(void*);

-extern void _xrstor(void*);

-extern void _xsave(void*);

-extern void _xsaveopt(void*);

-extern void _fwait(void);

-extern void _ldmxcsr(u32int);

-extern void _stts(void);

-/*

- * not used, AMD64 mandated SSE

- */

 void

-fpx87save(FPsave*)

-{

-}

-void

-fpx87restore(FPsave*)

-{

-}

-void

-fpssesave(FPsave *s)

-{

-	_fxsave(s);

-	_stts();

-}

-void

-fpsserestore(FPsave *s)

-{

-	_clts();

-	_fxrstor(s);

-}

-void

-fpxsave(FPsave *s)

-{

-	_xsave(s);

-	_stts();

-}

-void

-fpxrestore(FPsave *s)

-{

-	_clts();

-	_xrstor(s);

-}

-void

-fpxsaves(FPsave *s)

-{

-	_xsaveopt(s);

-	_stts();

-}

-void

-fpxrestores(FPsave *s)

-{

-	_clts();

-	_xrstor(s);

-}

-void

-fpxsaveopt(FPsave *s)

-{

-	_xsaveopt(s);

-	_stts();

-}

-static char* mathmsg[] =

-{

-	nil,	/* handled below */

-	"denormalized operand",

-	"division by zero",

-	"numeric overflow",

-	"numeric underflow",

-	"precision loss",

-};

-static void

-mathnote(ulong status, uintptr pc)

-{

-	char *msg, note[ERRMAX];

-	int i;

-	/*

-	 * Some attention should probably be paid here to the

-	 * exception masks and error summary.

-	 */

-	msg = "unknown exception";

-	for(i = 1; i <= 5; i++){

-		if(!((1<<i) & status))

-			continue;

-		msg = mathmsg[i];

-		break;

-	}

-	if(status & 0x01){

-		if(status & 0x40){

-			if(status & 0x200)

-				msg = "stack overflow";

-			else

-				msg = "stack underflow";

-		}else

-			msg = "invalid operation";

-	}

-	snprint(note, sizeof note, "sys: fp: %s fppc=%#p status=0x%lux",

-		msg, pc, status);

-	postnote(up, 1, note, NDebug);

-}

-/*

- *  math coprocessor error

- */

-static void

-matherror(Ureg *, void*)

-{

-	/*

-	 * Save FPU state to check out the error.

-	 */

-	fpsave(up->fpsave);

-	up->fpstate = FPinactive | (up->fpstate & (FPnouser|FPkernel|FPindexm));

-	mathnote(up->fpsave->fsw, up->fpsave->rip);

-}

-/*

- *  SIMD error

- */

-static void

-simderror(Ureg *ureg, void*)

-{

-	fpsave(up->fpsave);

-	up->fpstate = FPinactive | (up->fpstate & (FPnouser|FPkernel|FPindexm));

-	mathnote(up->fpsave->mxcsr & 0x3f, ureg->pc);

-}

-void

-fpinit(void)

-{

-	/*

-	 * A process tries to use the FPU for the

-	 * first time and generates a 'device not available'

-	 * exception.

-	 * Turn the FPU on and initialise it for use.

-	 * Set the precision and mask the exceptions

-	 * we don't care about from the generic Mach value.

-	 */

-	_clts();

-	_fninit();

-	_fwait();

-	_fldcw(0x0232);

-	_ldmxcsr(0x1900);

-}

-/*

- *  math coprocessor emulation fault

- */

-static void

-mathemu(Ureg *ureg, void*)

-{

-	ulong status, control;

-	int index;

-	if(up->fpstate & FPillegal){

-		/* someone did floating point in a note handler */

-		postnote(up, 1, "sys: floating point in note handler", NDebug);

-		return;

-	}

-	switch(up->fpstate & ~(FPnouser|FPkernel|FPindexm)){

-	case FPactive	| FPpush:

-		_clts();

-		fpsave(up->fpsave);

-	case FPinactive	| FPpush:

-		up->fpstate += FPindex1;

-	case FPinit	| FPpush:

-	case FPinit:

-		fpinit();

-		index = up->fpstate >> FPindexs;

-		if(index < 0 || index > (FPindexm>>FPindexs))

-			panic("fpslot index overflow: %d", index);

-		if(userureg(ureg)){

-			if(index != 0)

-				panic("fpslot index %d != 0 for user", index);

-		} else {

-			if(index == 0)

-				up->fpstate |= FPnouser;

-			up->fpstate |= FPkernel;

-		}

-		while(up->fpslot[index] == nil)

-			up->fpslot[index] = mallocalign(sizeof(FPsave), FPalign, 0, 0);

-		up->fpsave = up->fpslot[index];

-		up->fpstate = FPactive | (up->fpstate & (FPnouser|FPkernel|FPindexm));

-		break;

-	case FPinactive:

-		/*

-		 * Before restoring the state, check for any pending

-		 * exceptions, there's no way to restore the state without

-		 * generating an unmasked exception.

-		 * More attention should probably be paid here to the

-		 * exception masks and error summary.

-		 */

-		status = up->fpsave->fsw;

-		control = up->fpsave->fcw;

-		if((status & ~control) & 0x07F){

-			mathnote(status, up->fpsave->rip);

-			break;

-		}

-		fprestore(up->fpsave);

-		up->fpstate = FPactive | (up->fpstate & (FPnouser|FPkernel|FPindexm));

-		break;

-	case FPactive:

-		panic("math emu pid %ld %s pc %#p",

-			up->pid, up->text, ureg->pc);

-		break;

-	}

-}

-/*

- *  math coprocessor segment overrun

- */

-static void

-mathover(Ureg*, void*)

-{

-	pexit("math overrun", 0);

-}

-void

-mathinit(void)

-{

-	trapenable(VectorCERR, matherror, 0, "matherror");

-	if(m->cpuidfamily == 3)

-		intrenable(IrqIRQ13, matherror, 0, BUSUNKNOWN, "matherror");

-	trapenable(VectorCNA, mathemu, 0, "mathemu");

-	trapenable(VectorCSO, mathover, 0, "mathover");

-	trapenable(VectorSIMD, simderror, 0, "simderror");

-}

-void

 procsetup(Proc *p)

-	p->fpstate = FPinit;

-	_stts();

+	fpuprocsetup(p);

 	/* clear debug registers */

 	memset(p->dr, 0, sizeof(p->dr));

@@ -556,29 +312,10 @@

 void

 procfork(Proc *p)

-	int s;

 	p->kentry = up->kentry;

 	p->pcycles = -p->kentry;

-	/* save floating point state */

-	s = splhi();

-	switch(up->fpstate & ~FPillegal){

-	case FPactive	| FPpush:

-		_clts();

-	case FPactive:

-		fpsave(up->fpsave);

-		up->fpstate = FPinactive | (up->fpstate & FPpush);

-	case FPactive	| FPkernel:

-	case FPinactive	| FPkernel:

-	case FPinactive	| FPpush:

-	case FPinactive:

-		while(p->fpslot[0] == nil)

-			p->fpslot[0] = mallocalign(sizeof(FPsave), FPalign, 0, 0);

-		memmove(p->fpsave = p->fpslot[0], up->fpslot[0], sizeof(FPsave));

-		p->fpstate = FPinactive;

-	}

-	splx(s);

+	fpuprocfork(p);

 void

@@ -594,6 +331,8 @@

 	if(p->vmx != nil)

 		vmxprocrestore(p);

+	fpuprocrestore(p);

 	if(p->kp)

 		return;

@@ -618,27 +357,7 @@

 	if(p->state == Moribund)

 		p->dr[7] = 0;

-	switch(p->fpstate & ~(FPnouser|FPkernel|FPindexm)){

-	case FPactive	| FPpush:

-		_clts();

-	case FPactive:

-		if(p->state == Moribund){

-			_fnclex();

-			_stts();

-			break;

-		}

-		/*

-		 * Fpsave() stores without handling pending

-		 * unmasked exeptions. Postnote() can't be called

-		 * here as sleep() already has up->rlock, so

-		 * the handling of pending exceptions is delayed

-		 * until the process runs again and generates an

-		 * emulation fault to activate the FPU.

-		 */

-		fpsave(p->fpsave);

-		p->fpstate = FPinactive | (p->fpstate & ~FPactive);

-		break;

-	}

+	fpuprocsave(p);

/*

 	 * While this processor is in the scheduler, the process could run

@@ -652,37 +371,4 @@

 	 * especially on VMware, but it turns out not to matter.

*/

 	mmuflushtlb();

-}

-/*

- * Fpusave and fpurestore lazily save and restore FPU state across

- * system calls and the pagefault handler so that we can take

- * advantage of SSE instructions such as AES-NI in the kernel.

- */

-int

-fpusave(void)

-{

-	int ostate = up->fpstate;

-	if((ostate & ~(FPnouser|FPkernel|FPindexm)) == FPactive)

-		_stts();

-	up->fpstate = FPpush | (ostate & ~FPillegal);

-	return ostate;

-}

-void

-fpurestore(int ostate)

-{

-	int astate = up->fpstate;

-	if(astate == (FPpush | (ostate & ~FPillegal))){

-		if((ostate & ~(FPnouser|FPkernel|FPindexm)) == FPactive)

-			_clts();

-	} else {

-		if(astate == FPinit)	/* don't restore on procexec()/procsetup() */

-			return;

-		if((astate & ~(FPnouser|FPkernel|FPindexm)) == FPactive)

-			_stts();

-		up->fpsave = up->fpslot[ostate>>FPindexs];

-		if(ostate & FPactive)

-			ostate = FPinactive | (ostate & ~FPactive);

-	}

-	up->fpstate = ostate;