shithub: riscv

Download patch

ref: 8c1bde46f0aa97e9f018e7fb805f367e908fa379
parent: dbbae6d38405cdd817f84e2ace104bb27963a246
author: cinap_lenrek <[email protected]>
date: Sun Dec 6 16:07:30 EST 2020

pc, pc64: move all fpu specific code from main.c to fpu.c

--- a/sys/src/9/pc/fns.h
+++ b/sys/src/9/pc/fns.h
@@ -41,12 +41,6 @@
 void	fpoff(void);
 void	(*fprestore)(FPsave*);
 void	(*fpsave)(FPsave*);
-void	fpsserestore(FPsave*);
-void	fpssesave(FPsave*);
-void	fpx87restore(FPsave*);
-void	fpx87restore0(FPsave*);
-void	fpx87save(FPsave*);
-void	fpx87save0(FPsave*);
 ulong	getcr0(void);
 ulong	getcr2(void);
 ulong	getcr3(void);
--- a/sys/src/9/pc/fpu.c
+++ b/sys/src/9/pc/fpu.c
@@ -3,6 +3,8 @@
 #include "mem.h"
 #include "dat.h"
 #include "fns.h"
+#include "io.h"
+#include "ureg.h"
 
 enum {
 	CR4Osfxsr  = 1 << 9,
@@ -9,11 +11,286 @@
 	CR4Oxmmex  = 1 << 10,
 };
 
+/* from l.s */
+extern void fpsserestore(FPsave*);
+extern void fpssesave(FPsave*);
+extern void fpx87restore0(FPsave*);
+extern void fpx87save0(FPsave*);
+
 void
 putxcr0(ulong)
 {
 }
 
+/*
+ * FPsave is always kept in SSE (FXSAVE / FXRSTOR) layout; on a legacy x87
+ * fpu this routine emulates FXSAVE on top of the raw save done by fpx87save0().
+ */
+static void
+fpx87save(FPsave *fps)
+{
+	ushort tag;
+
+	fpx87save0(fps);	/* assembly helper from l.s */
+
+	/*
+	 * convert x87 tag word to fxsave tag byte:
+	 * 00, 01, 10 -> 1, 11 -> 0
+	 */
+	tag = ~fps->tag;			/* only an empty pair (11) becomes 00 */
+	tag = (tag | (tag >> 1)) & 0x5555;	/* low bit of each pair = register in use */
+	tag = (tag | (tag >> 1)) & 0x3333;	/* pack the flags: two per nibble */
+	tag = (tag | (tag >> 2)) & 0x0F0F;	/* four per byte */
+	tag = (tag | (tag >> 4)) & 0x00FF;	/* all eight in the low byte */
+
+	/* NOP fps->fcw = fps->control; */
+	fps->fsw = fps->status;
+	fps->ftw = tag;
+	fps->fop = fps->opcode;
+	fps->fpuip = fps->pc;
+	fps->cs = fps->selector;
+	fps->fpudp = fps->operand;
+	fps->ds = fps->oselector;
+
+#define MOVA(d,s) \
+	*((ushort*)(d+8)) = *((ushort*)(s+8)), \
+	*((ulong*)(d+4)) = *((ulong*)(s+4)), \
+	*((ulong*)(d)) = *((ulong*)(s))
+
+	MOVA(fps->xregs+0x70, fps->regs+70);	/* 10-byte stride -> 16-byte stride, last register first */
+	MOVA(fps->xregs+0x60, fps->regs+60);
+	MOVA(fps->xregs+0x50, fps->regs+50);
+	MOVA(fps->xregs+0x40, fps->regs+40);
+	MOVA(fps->xregs+0x30, fps->regs+30);
+	MOVA(fps->xregs+0x20, fps->regs+20);
+	MOVA(fps->xregs+0x10, fps->regs+10);
+	MOVA(fps->xregs+0x00, fps->regs+00);	/* NOTE(review): high-to-low order suggests regs and xregs overlap; confirm in dat.h */
+
+#undef MOVA
+
+#define CLR6(d)	\
+	*((ulong*)(d)) = 0, \
+	*((ushort*)(d+4)) = 0
+
+	CLR6(fps->xregs+0x70+10);	/* zero bytes 10..15 of every 16-byte slot */
+	CLR6(fps->xregs+0x60+10);
+	CLR6(fps->xregs+0x50+10);
+	CLR6(fps->xregs+0x40+10);
+	CLR6(fps->xregs+0x30+10);
+	CLR6(fps->xregs+0x20+10);
+	CLR6(fps->xregs+0x10+10);
+	CLR6(fps->xregs+0x00+10);
+
+#undef CLR6
+
+	fps->rsrvd1 = fps->rsrvd2 = fps->mxcsr = fps->mxcsr_mask = 0;	/* fields not derived from the x87 image */
+}
+
+static void
+fpx87restore(FPsave *fps)	/* inverse of fpx87save(): rebuild the x87 image inside fps, then load it */
+{
+	ushort msk, tos, tag, *reg;
+
+	/* convert fxsave tag byte to x87 tag word */
+	tag = 0;
+	tos = 7 - ((fps->fsw >> 11) & 7);	/* ST(i) index of physical register 7 (TOP = fsw bits 11-13) */
+	for(msk = 0x80; msk != 0; tos--, msk >>= 1){	/* physical register 7 down to 0 */
+		tag <<= 2;
+		if((fps->ftw & msk) != 0){
+			reg = (ushort*)&fps->xregs[(tos & 7) << 4];
+			switch(reg[4] & 0x7fff){	/* exponent field */
+			case 0x0000:
+				if((reg[0] | reg[1] | reg[2] | reg[3]) == 0){
+					tag |= 1;	/* 01 zero */
+					break;
+				}
+				/* no break */
+			case 0x7fff:
+				tag |= 2;		/* 10 special */
+				break;
+			default:
+				if((reg[3] & 0x8000) != 0)	/* integer (J) bit set: normal number */
+					break;		/* 00 valid */
+				tag |= 2;		/* 10 special */
+				break;
+			}
+		}else{
+			tag |= 3;			/* 11 empty */
+		}
+	}
+
+#define MOVA(d,s) \
+	*((ulong*)(d)) = *((ulong*)(s)), \
+	*((ulong*)(d+4)) = *((ulong*)(s+4)), \
+	*((ushort*)(d+8)) = *((ushort*)(s+8))
+
+	MOVA(fps->regs+00, fps->xregs+0x00);	/* 16-byte stride -> 10-byte stride, first register first */
+	MOVA(fps->regs+10, fps->xregs+0x10);
+	MOVA(fps->regs+20, fps->xregs+0x20);
+	MOVA(fps->regs+30, fps->xregs+0x30);
+	MOVA(fps->regs+40, fps->xregs+0x40);
+	MOVA(fps->regs+50, fps->xregs+0x50);
+	MOVA(fps->regs+60, fps->xregs+0x60);
+	MOVA(fps->regs+70, fps->xregs+0x70);
+
+#undef MOVA
+
+	fps->oselector = fps->ds;
+	fps->operand = fps->fpudp;
+	fps->opcode = fps->fop & 0x7ff;	/* only the low 11 bits are kept */
+	fps->selector = fps->cs;
+	fps->pc = fps->fpuip;
+	fps->tag = tag;
+	fps->status = fps->fsw;
+	/* NOP fps->control = fps->fcw;  */
+
+	fps->r1 = fps->r2 = fps->r3 = fps->r4 = 0;
+
+	fpx87restore0(fps);	/* assembly helper from l.s */
+}
+
+static char* mathmsg[] =	/* indexed by exception bit number; mathnote() looks up bits 1..5 */
+{
+	nil,	/* bit 0 is handled separately in mathnote() */
+	"denormalized operand",
+	"division by zero",
+	"numeric overflow",
+	"numeric underflow",
+	"precision loss",
+};
+
+static void
+mathnote(ulong status, ulong pc)	/* post a "sys: fp: ..." note to up describing the exception bits in status */
+{
+	char *msg, note[ERRMAX];
+	int i;
+
+	/*
+	 * Some attention should probably be paid here to the
+	 * exception masks and error summary.
+	 */
+	msg = "unknown exception";
+	for(i = 1; i <= 5; i++){	/* the lowest set bit among 1..5 picks the message */
+		if(!((1<<i) & status))
+			continue;
+		msg = mathmsg[i];
+		break;
+	}
+	if(status & 0x01){	/* bit 0 overrides whatever the loop chose */
+		if(status & 0x40){	/* with 0x40 set it is reported as a stack fault */
+			if(status & 0x200)	/* 0x200 selects overflow over underflow */
+				msg = "stack overflow";
+			else
+				msg = "stack underflow";
+		}else
+			msg = "invalid operation";
+	}
+	snprint(note, sizeof note, "sys: fp: %s fppc=0x%lux status=0x%lux",
+		msg, pc, status);
+	postnote(up, 1, note, NDebug);
+}
+
+/*
+ *  math coprocessor error: save the fpu state, mark it inactive and post a note
+ */
+static void
+matherror(Ureg*, void*)
+{
+	/*
+	 *  a write cycle to port 0xF0 clears the interrupt latch attached
+	 *  to the error# line from the 387
+	 */
+	if(!(m->cpuiddx & Fpuonchip))	/* only done when cpuid does not report an on-chip fpu */
+		outb(0xF0, 0xFF);
+
+	/*
+	 *  get floating point state to check out error
+	 */
+	fpsave(up->fpsave);
+	up->fpstate = FPinactive;
+	mathnote(up->fpsave->fsw, up->fpsave->fpuip);	/* report the saved status word and fpu instruction pointer */
+}
+
+/*
+ *  SIMD error: save the state and report the low six mxcsr bits via mathnote()
+ */
+static void
+simderror(Ureg *ureg, void*)
+{
+	fpsave(up->fpsave);
+	up->fpstate = FPinactive;
+	mathnote(up->fpsave->mxcsr & 0x3f, ureg->pc);	/* pc comes from the trap frame, not from the fpu image */
+}
+
+/*
+ *  math coprocessor emulation fault: activate the fpu for up according to up->fpstate
+ */
+static void
+mathemu(Ureg *ureg, void*)
+{
+	ulong status, control;
+
+	if(up->fpstate & FPillegal){
+		/* someone did floating point in a note handler */
+		postnote(up, 1, "sys: floating point in note handler", NDebug);
+		return;
+	}
+	switch(up->fpstate){
+	case FPinit:	/* initialise the fpu; allocate up->fpsave if it is still nil */
+		fpinit();
+		if(fpsave == fpssesave)
+			ldmxcsr(0x1f80);	/* no simd exceptions on 386 */
+		while(up->fpsave == nil)	/* retry until the allocation succeeds */
+			up->fpsave = mallocalign(sizeof(FPsave), FPalign, 0, 0);
+		up->fpstate = FPactive;
+		break;
+	case FPinactive:
+		/*
+		 * Before restoring the state, check for any pending
+		 * exceptions, there's no way to restore the state without
+		 * generating an unmasked exception.
+		 * More attention should probably be paid here to the
+		 * exception masks and error summary.
+		 */
+		status = up->fpsave->fsw;
+		control = up->fpsave->fcw;
+		if((status & ~control) & 0x07F){	/* a status bit is set whose control bit is clear */
+			mathnote(status, up->fpsave->fpuip);
+			break;	/* state stays FPinactive; nothing is restored */
+		}
+		fprestore(up->fpsave);
+		up->fpstate = FPactive;
+		break;
+	case FPactive:	/* fault while the state is already marked active: fatal */
+		panic("math emu pid %ld %s pc 0x%lux", 
+			up->pid, up->text, ureg->pc);
+		break;
+	}
+}
+
+/*
+ *  math coprocessor segment overrun: calls pexit() with "math overrun"
+ */
+static void
+mathover(Ureg*, void*)
+{
+	pexit("math overrun", 0);
+}
+
+void
+mathinit(void)	/* register the fpu trap handlers defined in this file */
+{
+	trapenable(VectorCERR, matherror, 0, "matherror");
+	if(m->cpuidfamily == 3)	/* family 3 additionally gets matherror hooked to IRQ13 */
+		intrenable(IrqIRQ13, matherror, 0, BUSUNKNOWN, "matherror");
+	trapenable(VectorCNA, mathemu, 0, "mathemu");
+	trapenable(VectorCSO, mathover, 0, "mathover");
+	trapenable(VectorSIMD, simderror, 0, "simderror");
+}
+
+/*
+ * fpuinit(), called from cpuidentify() for each cpu.
+ */
 void
 fpuinit(void)
 {
--- a/sys/src/9/pc/main.c
+++ b/sys/src/9/pc/main.c
@@ -234,272 +234,6 @@
 }
 
 /*
- * we keep FPsave structure in SSE format emulating FXSAVE / FXRSTOR
- * instructions for legacy x87 fpu.
- */
-void
-fpx87save(FPsave *fps)
-{
-	ushort tag;
-
-	fpx87save0(fps);
-
-	/*
-	 * convert x87 tag word to fxsave tag byte:
-	 * 00, 01, 10 -> 1, 11 -> 0
-	 */
-	tag = ~fps->tag;
-	tag = (tag | (tag >> 1)) & 0x5555;
-	tag = (tag | (tag >> 1)) & 0x3333;
-	tag = (tag | (tag >> 2)) & 0x0F0F;
-	tag = (tag | (tag >> 4)) & 0x00FF;
-
-	/* NOP fps->fcw = fps->control; */
-	fps->fsw = fps->status;
-	fps->ftw = tag;
-	fps->fop = fps->opcode;
-	fps->fpuip = fps->pc;
-	fps->cs = fps->selector;
-	fps->fpudp = fps->operand;
-	fps->ds = fps->oselector;
-
-#define MOVA(d,s) \
-	*((ushort*)(d+8)) = *((ushort*)(s+8)), \
-	*((ulong*)(d+4)) = *((ulong*)(s+4)), \
-	*((ulong*)(d)) = *((ulong*)(s))
-
-	MOVA(fps->xregs+0x70, fps->regs+70);
-	MOVA(fps->xregs+0x60, fps->regs+60);
-	MOVA(fps->xregs+0x50, fps->regs+50);
-	MOVA(fps->xregs+0x40, fps->regs+40);
-	MOVA(fps->xregs+0x30, fps->regs+30);
-	MOVA(fps->xregs+0x20, fps->regs+20);
-	MOVA(fps->xregs+0x10, fps->regs+10);
-	MOVA(fps->xregs+0x00, fps->regs+00);
-
-#undef MOVA
-
-#define CLR6(d)	\
-	*((ulong*)(d)) = 0, \
-	*((ushort*)(d+4)) = 0
-
-	CLR6(fps->xregs+0x70+10);
-	CLR6(fps->xregs+0x60+10);
-	CLR6(fps->xregs+0x50+10);
-	CLR6(fps->xregs+0x40+10);
-	CLR6(fps->xregs+0x30+10);
-	CLR6(fps->xregs+0x20+10);
-	CLR6(fps->xregs+0x10+10);
-	CLR6(fps->xregs+0x00+10);
-
-#undef CLR6
-
-	fps->rsrvd1 = fps->rsrvd2 = fps->mxcsr = fps->mxcsr_mask = 0;
-}
-
-void
-fpx87restore(FPsave *fps)
-{
-	ushort msk, tos, tag, *reg;
-
-	/* convert fxsave tag byte to x87 tag word */
-	tag = 0;
-	tos = 7 - ((fps->fsw >> 11) & 7);
-	for(msk = 0x80; msk != 0; tos--, msk >>= 1){
-		tag <<= 2;
-		if((fps->ftw & msk) != 0){
-			reg = (ushort*)&fps->xregs[(tos & 7) << 4];
-			switch(reg[4] & 0x7fff){
-			case 0x0000:
-				if((reg[0] | reg[1] | reg[2] | reg[3]) == 0){
-					tag |= 1;	/* 01 zero */
-					break;
-				}
-				/* no break */
-			case 0x7fff:
-				tag |= 2;		/* 10 special */
-				break;
-			default:
-				if((reg[3] & 0x8000) == 0)
-					break;		/* 00 valid */
-				tag |= 2;		/* 10 special */
-				break;
-			}
-		}else{
-			tag |= 3;			/* 11 empty */
-		}
-	}
-
-#define MOVA(d,s) \
-	*((ulong*)(d)) = *((ulong*)(s)), \
-	*((ulong*)(d+4)) = *((ulong*)(s+4)), \
-	*((ushort*)(d+8)) = *((ushort*)(s+8))
-
-	MOVA(fps->regs+00, fps->xregs+0x00);
-	MOVA(fps->regs+10, fps->xregs+0x10);
-	MOVA(fps->regs+20, fps->xregs+0x20);
-	MOVA(fps->regs+30, fps->xregs+0x30);
-	MOVA(fps->regs+40, fps->xregs+0x40);
-	MOVA(fps->regs+50, fps->xregs+0x50);
-	MOVA(fps->regs+60, fps->xregs+0x60);
-	MOVA(fps->regs+70, fps->xregs+0x70);
-
-#undef MOVA
-
-	fps->oselector = fps->ds;
-	fps->operand = fps->fpudp;
-	fps->opcode = fps->fop & 0x7ff;
-	fps->selector = fps->cs;
-	fps->pc = fps->fpuip;
-	fps->tag = tag;
-	fps->status = fps->fsw;
-	/* NOP fps->control = fps->fcw;  */
-
-	fps->r1 = fps->r2 = fps->r3 = fps->r4 = 0;
-
-	fpx87restore0(fps);
-}
-
-static char* mathmsg[] =
-{
-	nil,	/* handled below */
-	"denormalized operand",
-	"division by zero",
-	"numeric overflow",
-	"numeric underflow",
-	"precision loss",
-};
-
-static void
-mathnote(ulong status, ulong pc)
-{
-	char *msg, note[ERRMAX];
-	int i;
-
-	/*
-	 * Some attention should probably be paid here to the
-	 * exception masks and error summary.
-	 */
-	msg = "unknown exception";
-	for(i = 1; i <= 5; i++){
-		if(!((1<<i) & status))
-			continue;
-		msg = mathmsg[i];
-		break;
-	}
-	if(status & 0x01){
-		if(status & 0x40){
-			if(status & 0x200)
-				msg = "stack overflow";
-			else
-				msg = "stack underflow";
-		}else
-			msg = "invalid operation";
-	}
-	snprint(note, sizeof note, "sys: fp: %s fppc=0x%lux status=0x%lux",
-		msg, pc, status);
-	postnote(up, 1, note, NDebug);
-}
-
-/*
- *  math coprocessor error
- */
-static void
-matherror(Ureg*, void*)
-{
-	/*
-	 *  a write cycle to port 0xF0 clears the interrupt latch attached
-	 *  to the error# line from the 387
-	 */
-	if(!(m->cpuiddx & Fpuonchip))
-		outb(0xF0, 0xFF);
-
-	/*
-	 *  get floating point state to check out error
-	 */
-	fpsave(up->fpsave);
-	up->fpstate = FPinactive;
-	mathnote(up->fpsave->fsw, up->fpsave->fpuip);
-}
-
-/*
- *  SIMD error
- */
-static void
-simderror(Ureg *ureg, void*)
-{
-	fpsave(up->fpsave);
-	up->fpstate = FPinactive;
-	mathnote(up->fpsave->mxcsr & 0x3f, ureg->pc);
-}
-
-/*
- *  math coprocessor emulation fault
- */
-static void
-mathemu(Ureg *ureg, void*)
-{
-	ulong status, control;
-
-	if(up->fpstate & FPillegal){
-		/* someone did floating point in a note handler */
-		postnote(up, 1, "sys: floating point in note handler", NDebug);
-		return;
-	}
-	switch(up->fpstate){
-	case FPinit:
-		fpinit();
-		if(fpsave == fpssesave)
-			ldmxcsr(0x1f80);	/* no simd exceptions on 386 */
-		while(up->fpsave == nil)
-			up->fpsave = mallocalign(sizeof(FPsave), FPalign, 0, 0);
-		up->fpstate = FPactive;
-		break;
-	case FPinactive:
-		/*
-		 * Before restoring the state, check for any pending
-		 * exceptions, there's no way to restore the state without
-		 * generating an unmasked exception.
-		 * More attention should probably be paid here to the
-		 * exception masks and error summary.
-		 */
-		status = up->fpsave->fsw;
-		control = up->fpsave->fcw;
-		if((status & ~control) & 0x07F){
-			mathnote(status, up->fpsave->fpuip);
-			break;
-		}
-		fprestore(up->fpsave);
-		up->fpstate = FPactive;
-		break;
-	case FPactive:
-		panic("math emu pid %ld %s pc 0x%lux", 
-			up->pid, up->text, ureg->pc);
-		break;
-	}
-}
-
-/*
- *  math coprocessor segment overrun
- */
-static void
-mathover(Ureg*, void*)
-{
-	pexit("math overrun", 0);
-}
-
-void
-mathinit(void)
-{
-	trapenable(VectorCERR, matherror, 0, "matherror");
-	if(m->cpuidfamily == 3)
-		intrenable(IrqIRQ13, matherror, 0, BUSUNKNOWN, "matherror");
-	trapenable(VectorCNA, mathemu, 0, "mathemu");
-	trapenable(VectorCSO, mathover, 0, "mathover");
-	trapenable(VectorSIMD, simderror, 0, "simderror");
-}
-
-/*
  *  set up floating point for a new process
  */
 void
--- a/sys/src/9/pc64/fns.h
+++ b/sys/src/9/pc64/fns.h
@@ -39,15 +39,10 @@
 void	fpinit(void);
 void	(*fprestore)(FPsave*);
 void	(*fpsave)(FPsave*);
-void	fpsserestore(FPsave*);
-void	fpssesave(FPsave*);
-void	fpxrestore(FPsave*);
-void	fpxrestores(FPsave*);
-void	fpxsave(FPsave*);
-void	fpxsaveopt(FPsave*);
-void	fpxsaves(FPsave*);
-void	fpx87restore(FPsave*);
-void	fpx87save(FPsave*);
+void	fpuprocsetup(Proc*);
+void	fpuprocfork(Proc*);
+void	fpuprocsave(Proc*);
+void	fpuprocrestore(Proc*);
 int	fpusave(void);
 void	fpurestore(int);
 u64int	getcr0(void);
--- a/sys/src/9/pc64/fpu.c
+++ b/sys/src/9/pc64/fpu.c
@@ -3,6 +3,8 @@
 #include "mem.h"
 #include "dat.h"
 #include "fns.h"
+#include "ureg.h"
+#include "io.h"
 
 enum {
 	CR4Osfxsr  = 1 << 9,
@@ -10,7 +12,253 @@
 	CR4Oxsave  = 1 << 18,
 };
 
+/*
+ * SIMD Floating Point.
+ * Assembler support to get at the individual instructions
+ * is in l.s.
+ */
+extern void _clts(void);
+extern void _fldcw(u16int);
+extern void _fnclex(void);
+extern void _fninit(void);
+extern void _fxrstor(void*);
+extern void _fxsave(void*);
+extern void _xrstor(void*);
+extern void _xsave(void*);
+extern void _xsaveopt(void*);
+extern void _fwait(void);
+extern void _ldmxcsr(u32int);
+extern void _stts(void);
+
+/*
+ * x87-only save/restore are empty stubs: not used, AMD64 mandated SSE
+ */
+static void
+fpx87save(FPsave*)
+{
+}
+static void
+fpx87restore(FPsave*)	/* empty stub, like fpx87save() */
+{
+}
+
+static void
+fpssesave(FPsave *s)	/* fxsave flavour of fpsave: store the state into s, then _stts() */
+{
+	_fxsave(s);
+	_stts();	/* NOTE(review): presumably sets CR0.TS so the next fpu use traps; confirm in l.s */
+}
+static void
+fpsserestore(FPsave *s)	/* fxrstor flavour of fprestore: _clts(), then load the state from s */
+{
+	_clts();	/* NOTE(review): presumably clears CR0.TS so fpu instructions no longer trap; confirm in l.s */
+	_fxrstor(s);
+}
+
+static void
+fpxsave(FPsave *s)	/* xsave flavour of fpsave: store the state into s, then _stts() */
+{
+	_xsave(s);
+	_stts();
+}
+static void
+fpxrestore(FPsave *s)	/* xrstor flavour of fprestore: _clts(), then load the state from s */
+{
+	_clts();
+	_xrstor(s);
+}
+
+static void
+fpxsaves(FPsave *s)	/* NOTE(review): body is identical to fpxsaveopt(); it uses _xsaveopt, not a separate xsaves helper */
+{
+	_xsaveopt(s);
+	_stts();
+}
+static void
+fpxrestores(FPsave *s)	/* NOTE(review): body is identical to fpxrestore() */
+{
+	_clts();
+	_xrstor(s);
+}
+
+static void
+fpxsaveopt(FPsave *s)	/* xsaveopt flavour of fpsave: store the state into s, then _stts() */
+{
+	_xsaveopt(s);
+	_stts();
+}
+
+static char* mathmsg[] =	/* indexed by exception bit number; mathnote() looks up bits 1..5 */
+{
+	nil,	/* bit 0 is handled separately in mathnote() */
+	"denormalized operand",
+	"division by zero",
+	"numeric overflow",
+	"numeric underflow",
+	"precision loss",
+};
+
+static void
+mathnote(ulong status, uintptr pc)	/* post a "sys: fp: ..." note to up describing the exception bits in status */
+{
+	char *msg, note[ERRMAX];
+	int i;
+
+	/*
+	 * Some attention should probably be paid here to the
+	 * exception masks and error summary.
+	 */
+	msg = "unknown exception";
+	for(i = 1; i <= 5; i++){	/* the lowest set bit among 1..5 picks the message */
+		if(!((1<<i) & status))
+			continue;
+		msg = mathmsg[i];
+		break;
+	}
+	if(status & 0x01){	/* bit 0 overrides whatever the loop chose */
+		if(status & 0x40){	/* with 0x40 set it is reported as a stack fault */
+			if(status & 0x200)	/* 0x200 selects overflow over underflow */
+				msg = "stack overflow";
+			else
+				msg = "stack underflow";
+		}else
+			msg = "invalid operation";
+	}
+	snprint(note, sizeof note, "sys: fp: %s fppc=%#p status=0x%lux",
+		msg, pc, status);
+	postnote(up, 1, note, NDebug);
+}
+
+/*
+ *  math coprocessor error: save the fpu state, mark it inactive and post a note
+ */
+static void
+matherror(Ureg *, void*)
+{
+	/*
+	 * Save FPU state to check out the error.
+	 */
+	fpsave(up->fpsave);
+	up->fpstate = FPinactive | (up->fpstate & (FPnouser|FPkernel|FPindexm));	/* keep the flag and slot-index bits */
+	mathnote(up->fpsave->fsw, up->fpsave->rip);
+}
+
+/*
+ *  SIMD error: save the state and report the low six mxcsr bits via mathnote()
+ */
+static void
+simderror(Ureg *ureg, void*)
+{
+	fpsave(up->fpsave);
+	up->fpstate = FPinactive | (up->fpstate & (FPnouser|FPkernel|FPindexm));	/* keep the flag and slot-index bits */
+	mathnote(up->fpsave->mxcsr & 0x3f, ureg->pc);	/* pc comes from the trap frame, not from the fpu image */
+}
+
 void
+fpinit(void)
+{
+	/*
+	 * A process tries to use the FPU for the
+	 * first time and generates a 'device not available'
+	 * exception.
+	 * Turn the FPU on and initialise it for use.
+	 * Set the precision and mask the exceptions
+	 * we don't care about, using the fixed values below.
+	 */
+	_clts();
+	_fninit();
+	_fwait();
+	_fldcw(0x0232);	/* x87 control word: 53-bit precision; denormal, underflow and precision exceptions masked */
+	_ldmxcsr(0x1900);	/* mxcsr: denormal, underflow and precision exceptions masked */
+}
+
+/*
+ *  math coprocessor emulation fault: activate the fpu for up according to up->fpstate
+ */
+static void
+mathemu(Ureg *ureg, void*)
+{
+	ulong status, control;
+	int index;
+
+	if(up->fpstate & FPillegal){
+		/* someone did floating point in a note handler */
+		postnote(up, 1, "sys: floating point in note handler", NDebug);
+		return;
+	}
+	switch(up->fpstate & ~(FPnouser|FPkernel|FPindexm)){
+	case FPactive	| FPpush:
+		_clts();
+		fpsave(up->fpsave);	/* fall through */
+	case FPinactive	| FPpush:
+		up->fpstate += FPindex1;	/* advance to the next save slot; fall through */
+	case FPinit	| FPpush:
+	case FPinit:
+		fpinit();
+		index = up->fpstate >> FPindexs;	/* slot number taken from the bits above FPindexs */
+		if(index < 0 || index > (FPindexm>>FPindexs))
+			panic("fpslot index overflow: %d", index);
+		if(userureg(ureg)){	/* a fault from user mode must be using slot 0 */
+			if(index != 0)
+				panic("fpslot index %d != 0 for user", index);
+		} else {
+			if(index == 0)
+				up->fpstate |= FPnouser;
+			up->fpstate |= FPkernel;
+		}
+		while(up->fpslot[index] == nil)	/* retry until the allocation succeeds */
+			up->fpslot[index] = mallocalign(sizeof(FPsave), FPalign, 0, 0);
+		up->fpsave = up->fpslot[index];
+		up->fpstate = FPactive | (up->fpstate & (FPnouser|FPkernel|FPindexm));
+		break;
+	case FPinactive:
+		/*
+		 * Before restoring the state, check for any pending
+		 * exceptions, there's no way to restore the state without
+		 * generating an unmasked exception.
+		 * More attention should probably be paid here to the
+		 * exception masks and error summary.
+		 */
+		status = up->fpsave->fsw;
+		control = up->fpsave->fcw;
+		if((status & ~control) & 0x07F){	/* a status bit is set whose control bit is clear */
+			mathnote(status, up->fpsave->rip);
+			break;	/* state stays inactive; nothing is restored */
+		}
+		fprestore(up->fpsave);
+		up->fpstate = FPactive | (up->fpstate & (FPnouser|FPkernel|FPindexm));
+		break;
+	case FPactive:	/* fault while the state is already marked active: fatal */
+		panic("math emu pid %ld %s pc %#p", 
+			up->pid, up->text, ureg->pc);
+		break;
+	}
+}
+
+/*
+ *  math coprocessor segment overrun: calls pexit() with "math overrun"
+ */
+static void
+mathover(Ureg*, void*)
+{
+	pexit("math overrun", 0);
+}
+
+void
+mathinit(void)	/* register the fpu trap handlers defined in this file */
+{
+	trapenable(VectorCERR, matherror, 0, "matherror");
+	if(m->cpuidfamily == 3)	/* NOTE(review): family 3 is the 386; presumably never true on pc64, confirm */
+		intrenable(IrqIRQ13, matherror, 0, BUSUNKNOWN, "matherror");
+	trapenable(VectorCNA, mathemu, 0, "mathemu");
+	trapenable(VectorCSO, mathover, 0, "mathover");
+	trapenable(VectorSIMD, simderror, 0, "simderror");
+}
+
+/*
+ * fpuinit(), called from cpuidentify() for each cpu.
+ */
+void
 fpuinit(void)
 {
 	uintptr cr4;
@@ -42,4 +290,101 @@
 		fpsave = fpx87save;
 		fprestore = fpx87restore;
 	}
+}
+
+void
+fpuprocsetup(Proc *p)	/* called from procsetup(): reset p's fpu state to FPinit */
+{
+	p->fpstate = FPinit;
+	_stts();
+}
+
+void
+fpuprocfork(Proc *p)	/* called from procfork(): copy up's slot-0 fpu state into p */
+{
+	int s;
+
+	/* save floating point state */
+	s = splhi();
+	switch(up->fpstate & ~FPillegal){
+	case FPactive	| FPpush:
+		_clts();	/* fall through */
+	case FPactive:
+		fpsave(up->fpsave);
+		up->fpstate = FPinactive | (up->fpstate & FPpush);	/* fall through */
+	case FPactive	| FPkernel:
+	case FPinactive	| FPkernel:
+	case FPinactive	| FPpush:
+	case FPinactive:
+		while(p->fpslot[0] == nil)	/* retry until the allocation succeeds */
+			p->fpslot[0] = mallocalign(sizeof(FPsave), FPalign, 0, 0);
+		memmove(p->fpsave = p->fpslot[0], up->fpslot[0], sizeof(FPsave));
+		p->fpstate = FPinactive;
+	}	/* any other state: p->fpstate and p's slots are left untouched */
+	splx(s);
+}
+
+void
+fpuprocsave(Proc *p)	/* save p's live fpu state, or just clear exceptions when p is Moribund */
+{
+	switch(p->fpstate & ~(FPnouser|FPkernel|FPindexm)){
+	case FPactive	| FPpush:
+		_clts();	/* fall through */
+	case FPactive:
+		if(p->state == Moribund){	/* nothing is saved for a Moribund process */
+			_fnclex();
+			_stts();
+			break;
+		}
+		/*
+		 * Fpsave() stores without handling pending
+		 * unmasked exceptions. Postnote() can't be called
+		 * here as sleep() already has up->rlock, so
+		 * the handling of pending exceptions is delayed
+		 * until the process runs again and generates an
+		 * emulation fault to activate the FPU.
+		 */
+		fpsave(p->fpsave);
+		p->fpstate = FPinactive | (p->fpstate & ~FPactive);
+		break;
+	}
+}
+
+void
+fpuprocrestore(Proc*)	/* intentionally empty; the main.c hunk of this patch adds a call to it */
+{
+}
+
+
+/*
+ * Fpusave and fpurestore lazily save and restore FPU state across
+ * system calls and the pagefault handler so that we can take
+ * advantage of SSE instructions such as AES-NI in the kernel.
+ */
+int
+fpusave(void)
+{
+	int ostate = up->fpstate;	/* returned so the caller can hand it to fpurestore() */
+	if((ostate & ~(FPnouser|FPkernel|FPindexm)) == FPactive)
+		_stts();	/* nothing is stored here; mathemu() saves on an FPactive|FPpush fault */
+	up->fpstate = FPpush | (ostate & ~FPillegal);
+	return ostate;
+}
+void
+fpurestore(int ostate)	/* undo fpusave(); ostate is the value fpusave() returned */
+{
+	int astate = up->fpstate;
+	if(astate == (FPpush | (ostate & ~FPillegal))){	/* fpstate unchanged since fpusave() */
+		if((ostate & ~(FPnouser|FPkernel|FPindexm)) == FPactive)
+			_clts();
+	} else {
+		if(astate == FPinit)	/* don't restore on procexec()/procsetup() */
+			return;
+		if((astate & ~(FPnouser|FPkernel|FPindexm)) == FPactive)
+			_stts();
+		up->fpsave = up->fpslot[ostate>>FPindexs];	/* point back at the slot ostate was using */
+		if(ostate & FPactive)	/* a formerly active state is recorded as inactive instead */
+			ostate = FPinactive | (ostate & ~FPactive);
+	}
+	up->fpstate = ostate;
 }
--- a/sys/src/9/pc64/main.c
+++ b/sys/src/9/pc64/main.c
@@ -293,254 +293,10 @@
 	rebootjump((uintptr)entry & (ulong)~0xF0000000UL, PADDR(code), size);
 }
 
-/*
- * SIMD Floating Point.
- * Assembler support to get at the individual instructions
- * is in l.s.
- */
-extern void _clts(void);
-extern void _fldcw(u16int);
-extern void _fnclex(void);
-extern void _fninit(void);
-extern void _fxrstor(void*);
-extern void _fxsave(void*);
-extern void _xrstor(void*);
-extern void _xsave(void*);
-extern void _xsaveopt(void*);
-extern void _fwait(void);
-extern void _ldmxcsr(u32int);
-extern void _stts(void);
-
-/*
- * not used, AMD64 mandated SSE
- */
 void
-fpx87save(FPsave*)
-{
-}
-void
-fpx87restore(FPsave*)
-{
-}
-
-void
-fpssesave(FPsave *s)
-{
-	_fxsave(s);
-	_stts();
-}
-void
-fpsserestore(FPsave *s)
-{
-	_clts();
-	_fxrstor(s);
-}
-
-void
-fpxsave(FPsave *s)
-{
-	_xsave(s);
-	_stts();
-}
-void
-fpxrestore(FPsave *s)
-{
-	_clts();
-	_xrstor(s);
-}
-
-void
-fpxsaves(FPsave *s)
-{
-	_xsaveopt(s);
-	_stts();
-}
-void
-fpxrestores(FPsave *s)
-{
-	_clts();
-	_xrstor(s);
-}
-
-void
-fpxsaveopt(FPsave *s)
-{
-	_xsaveopt(s);
-	_stts();
-}
-
-static char* mathmsg[] =
-{
-	nil,	/* handled below */
-	"denormalized operand",
-	"division by zero",
-	"numeric overflow",
-	"numeric underflow",
-	"precision loss",
-};
-
-static void
-mathnote(ulong status, uintptr pc)
-{
-	char *msg, note[ERRMAX];
-	int i;
-
-	/*
-	 * Some attention should probably be paid here to the
-	 * exception masks and error summary.
-	 */
-	msg = "unknown exception";
-	for(i = 1; i <= 5; i++){
-		if(!((1<<i) & status))
-			continue;
-		msg = mathmsg[i];
-		break;
-	}
-	if(status & 0x01){
-		if(status & 0x40){
-			if(status & 0x200)
-				msg = "stack overflow";
-			else
-				msg = "stack underflow";
-		}else
-			msg = "invalid operation";
-	}
-	snprint(note, sizeof note, "sys: fp: %s fppc=%#p status=0x%lux",
-		msg, pc, status);
-	postnote(up, 1, note, NDebug);
-}
-
-/*
- *  math coprocessor error
- */
-static void
-matherror(Ureg *, void*)
-{
-	/*
-	 * Save FPU state to check out the error.
-	 */
-	fpsave(up->fpsave);
-	up->fpstate = FPinactive | (up->fpstate & (FPnouser|FPkernel|FPindexm));
-	mathnote(up->fpsave->fsw, up->fpsave->rip);
-}
-
-/*
- *  SIMD error
- */
-static void
-simderror(Ureg *ureg, void*)
-{
-	fpsave(up->fpsave);
-	up->fpstate = FPinactive | (up->fpstate & (FPnouser|FPkernel|FPindexm));
-	mathnote(up->fpsave->mxcsr & 0x3f, ureg->pc);
-}
-
-void
-fpinit(void)
-{
-	/*
-	 * A process tries to use the FPU for the
-	 * first time and generates a 'device not available'
-	 * exception.
-	 * Turn the FPU on and initialise it for use.
-	 * Set the precision and mask the exceptions
-	 * we don't care about from the generic Mach value.
-	 */
-	_clts();
-	_fninit();
-	_fwait();
-	_fldcw(0x0232);
-	_ldmxcsr(0x1900);
-}
-
-/*
- *  math coprocessor emulation fault
- */
-static void
-mathemu(Ureg *ureg, void*)
-{
-	ulong status, control;
-	int index;
-
-	if(up->fpstate & FPillegal){
-		/* someone did floating point in a note handler */
-		postnote(up, 1, "sys: floating point in note handler", NDebug);
-		return;
-	}
-	switch(up->fpstate & ~(FPnouser|FPkernel|FPindexm)){
-	case FPactive	| FPpush:
-		_clts();
-		fpsave(up->fpsave);
-	case FPinactive	| FPpush:
-		up->fpstate += FPindex1;
-	case FPinit	| FPpush:
-	case FPinit:
-		fpinit();
-		index = up->fpstate >> FPindexs;
-		if(index < 0 || index > (FPindexm>>FPindexs))
-			panic("fpslot index overflow: %d", index);
-		if(userureg(ureg)){
-			if(index != 0)
-				panic("fpslot index %d != 0 for user", index);
-		} else {
-			if(index == 0)
-				up->fpstate |= FPnouser;
-			up->fpstate |= FPkernel;
-		}
-		while(up->fpslot[index] == nil)
-			up->fpslot[index] = mallocalign(sizeof(FPsave), FPalign, 0, 0);
-		up->fpsave = up->fpslot[index];
-		up->fpstate = FPactive | (up->fpstate & (FPnouser|FPkernel|FPindexm));
-		break;
-	case FPinactive:
-		/*
-		 * Before restoring the state, check for any pending
-		 * exceptions, there's no way to restore the state without
-		 * generating an unmasked exception.
-		 * More attention should probably be paid here to the
-		 * exception masks and error summary.
-		 */
-		status = up->fpsave->fsw;
-		control = up->fpsave->fcw;
-		if((status & ~control) & 0x07F){
-			mathnote(status, up->fpsave->rip);
-			break;
-		}
-		fprestore(up->fpsave);
-		up->fpstate = FPactive | (up->fpstate & (FPnouser|FPkernel|FPindexm));
-		break;
-	case FPactive:
-		panic("math emu pid %ld %s pc %#p", 
-			up->pid, up->text, ureg->pc);
-		break;
-	}
-}
-
-/*
- *  math coprocessor segment overrun
- */
-static void
-mathover(Ureg*, void*)
-{
-	pexit("math overrun", 0);
-}
-
-void
-mathinit(void)
-{
-	trapenable(VectorCERR, matherror, 0, "matherror");
-	if(m->cpuidfamily == 3)
-		intrenable(IrqIRQ13, matherror, 0, BUSUNKNOWN, "matherror");
-	trapenable(VectorCNA, mathemu, 0, "mathemu");
-	trapenable(VectorCSO, mathover, 0, "mathover");
-	trapenable(VectorSIMD, simderror, 0, "simderror");
-}
-
-void
 procsetup(Proc *p)
 {
-	p->fpstate = FPinit;
-	_stts();
+	fpuprocsetup(p);
 
 	/* clear debug registers */
 	memset(p->dr, 0, sizeof(p->dr));
@@ -556,29 +312,10 @@
 void
 procfork(Proc *p)
 {
-	int s;
-
 	p->kentry = up->kentry;
 	p->pcycles = -p->kentry;
 
-	/* save floating point state */
-	s = splhi();
-	switch(up->fpstate & ~FPillegal){
-	case FPactive	| FPpush:
-		_clts();
-	case FPactive:
-		fpsave(up->fpsave);
-		up->fpstate = FPinactive | (up->fpstate & FPpush);
-	case FPactive	| FPkernel:
-	case FPinactive	| FPkernel:
-	case FPinactive	| FPpush:
-	case FPinactive:
-		while(p->fpslot[0] == nil)
-			p->fpslot[0] = mallocalign(sizeof(FPsave), FPalign, 0, 0);
-		memmove(p->fpsave = p->fpslot[0], up->fpslot[0], sizeof(FPsave));
-		p->fpstate = FPinactive;
-	}
-	splx(s);
+	fpuprocfork(p);
 }
 
 void
@@ -594,6 +331,8 @@
 	if(p->vmx != nil)
 		vmxprocrestore(p);
 
+	fpuprocrestore(p);
+
 	if(p->kp)
 		return;
 
@@ -618,27 +357,7 @@
 	if(p->state == Moribund)
 		p->dr[7] = 0;
 
-	switch(p->fpstate & ~(FPnouser|FPkernel|FPindexm)){
-	case FPactive	| FPpush:
-		_clts();
-	case FPactive:
-		if(p->state == Moribund){
-			_fnclex();
-			_stts();
-			break;
-		}
-		/*
-		 * Fpsave() stores without handling pending
-		 * unmasked exeptions. Postnote() can't be called
-		 * here as sleep() already has up->rlock, so
-		 * the handling of pending exceptions is delayed
-		 * until the process runs again and generates an
-		 * emulation fault to activate the FPU.
-		 */
-		fpsave(p->fpsave);
-		p->fpstate = FPinactive | (p->fpstate & ~FPactive);
-		break;
-	}
+	fpuprocsave(p);
 
 	/*
 	 * While this processor is in the scheduler, the process could run
@@ -652,37 +371,4 @@
 	 * especially on VMware, but it turns out not to matter.
 	 */
 	mmuflushtlb();
-}
-
-/*
- * Fpusave and fpurestore lazily save and restore FPU state across
- * system calls and the pagefault handler so that we can take
- * advantage of SSE instructions such as AES-NI in the kernel.
- */
-int
-fpusave(void)
-{
-	int ostate = up->fpstate;
-	if((ostate & ~(FPnouser|FPkernel|FPindexm)) == FPactive)
-		_stts();
-	up->fpstate = FPpush | (ostate & ~FPillegal);
-	return ostate;
-}
-void
-fpurestore(int ostate)
-{
-	int astate = up->fpstate;
-	if(astate == (FPpush | (ostate & ~FPillegal))){
-		if((ostate & ~(FPnouser|FPkernel|FPindexm)) == FPactive)
-			_clts();
-	} else {
-		if(astate == FPinit)	/* don't restore on procexec()/procsetup() */
-			return;
-		if((astate & ~(FPnouser|FPkernel|FPindexm)) == FPactive)
-			_stts();
-		up->fpsave = up->fpslot[ostate>>FPindexs];
-		if(ostate & FPactive)
-			ostate = FPinactive | (ostate & ~FPactive);
-	}
-	up->fpstate = ostate;
 }