shithub: riscv

Download patch

ref: 83e20b4df18d539db59c8e1090f77a6565df250e
parent: 796e5e6000677a39577d545e4603ce251e7cbfe9
author: cinap_lenrek <[email protected]>
date: Sat Oct 20 15:56:31 EDT 2018

bcm: import changes for raspi2/3 from richard miller

--- a/sys/src/9/bcm/archbcm.c
+++ b/sys/src/9/bcm/archbcm.c
@@ -1,5 +1,5 @@
 /*
- * bcm2835 (e.g. raspberry pi) architecture-specific stuff
+ * bcm2835 (e.g. original raspberry pi) architecture-specific stuff
  */
 
 #include "u.h"
@@ -13,8 +13,19 @@
 
 #define	POWERREGS	(VIRTIO+0x100000)
 
+Soc soc = {	/* bcm2835 address layout and dram pte attributes */
+	.dramsize	= 512*MiB,
+	.physio		= 0x20000000,
+	.busdram	= 0x40000000,
+	.busio		= 0x7E000000,
+	.armlocal	= 0,	/* archbcm2.c sets 0x40000000 here for bcm2836 */
+	.l1ptedramattrs = Cached | Buffered,	/* archbcm2.c adds wralloc and sharable bits */
+	.l2ptedramattrs = Cached | Buffered,
+};
+
 enum {
 	Wdogfreq	= 65536,
+	Wdogtime	= 10,	/* seconds, ≤ 15 */
 };
 
 /*
@@ -25,6 +36,7 @@
 		Password	= 0x5A<<24,
 		CfgMask		= 0x03<<4,
 		CfgReset	= 0x02<<4,
+	Rsts		= 0x20>>2,
 	Wdog		= 0x24>>2,
 };
 
@@ -48,13 +60,68 @@
 }
 
 void
+wdogfeed(void)	/* reload the watchdog; archbcmlink registers this with addclock0link */
+{
+	u32int *r;
+
+	r = (u32int*)POWERREGS;
+	r[Wdog] = Password | (Wdogtime * Wdogfreq);	/* Wdogtime seconds, presumably counted in 1/Wdogfreq sec ticks */
+	r[Rstc] = Password | (r[Rstc] & ~CfgMask) | CfgReset;	/* Cfg field := CfgReset, Password or'ed in */
+}
+
+void
+wdogoff(void)	/* clockshutdown calls this when not a cpuserver */
+{
+	u32int *r;
+
+	r = (u32int*)POWERREGS;
+	r[Rstc] = Password | (r[Rstc] & ~CfgMask);	/* zero the Cfg field, dropping the CfgReset that wdogfeed sets */
+}
+	
+char *
+cputype2name(char *buf, int size)	/* writes the cpu name into buf (size bytes) and returns buf */
+{
+	seprint(buf, buf + size, "1176JZF-S");	/* constant here; archbcm2.c decodes the main id register instead */
+	return buf;
+}
+
+void
 cpuidprint(void)
 {
-	print("cpu%d: %dMHz ARM1176JZF-S\n", m->machno, m->cpumhz);
+	char name[64];
+
+	cputype2name(name, sizeof name);	/* name now comes from cputype2name, not the format string */
+	delay(50);				/* let uart catch up */
+	print("cpu%d: %dMHz ARM %s\n", m->machno, m->cpumhz, name);
 }
 
+int
+getncpus(void)	/* number of cpus to use: always 1 in this bcm2835 file */
+{
+	return 1;
+}
+
+int
+startcpus(uint)	/* nothing to start here; the archbcm2.c version starts cpus 1..ncpu-1 */
+{
+	return 1;
+}
+
 void
 archbcmlink(void)
 {
+	addclock0link(wdogfeed, HZ);	/* presumably runs wdogfeed periodically off the clock -- confirm meaning of the HZ argument */
+}
+
+int
+l2ap(int ap)	/* replaces the pre-armv7 L2AP macro that this patch removes from arm.h */
+{
+	return (AP(3, (ap))|AP(2, (ap))|AP(1, (ap))|AP(0, (ap)));	/* same ap in all four AP fields */
+}
+
+int
+cmpswap(long *addr, long old, long new)	/* thin wrapper over cas32; armv7.s defines cmpswap itself with LDREX/STREX */
+{
+	return cas32(addr, old, new);
 }
 
--- /dev/null
+++ b/sys/src/9/bcm/archbcm2.c
@@ -1,0 +1,248 @@
+/*
+ * bcm2836 (e.g. raspberry pi 2) architecture-specific stuff
+ */
+
+#include "u.h"
+#include "../port/lib.h"
+#include "mem.h"
+#include "dat.h"
+#include "fns.h"
+#include "../port/error.h"
+#include "io.h"
+#include "arm.h"
+
+#include "../port/netif.h"
+
+typedef struct Mbox Mbox;
+typedef struct Mboxes Mboxes;
+
+#define	POWERREGS	(VIRTIO+0x100000)
+
+Soc soc = {	/* bcm2836 address layout and dram pte attributes */
+	.dramsize	= 0x3F000000, 	/* was 1024*MiB, but overlaps with physio */
+	.physio		= 0x3F000000,
+	.busdram	= 0xC0000000,
+	.busio		= 0x7E000000,
+	.armlocal	= 0x40000000,	/* presumably the physical base behind ARMLOCAL used below -- confirm in mem.h */
+	.l1ptedramattrs = Cached | Buffered | L1wralloc | L1sharable,	/* archbcm.c uses only Cached | Buffered */
+	.l2ptedramattrs = Cached | Buffered | L2wralloc | L2sharable,
+};
+
+enum {
+	Wdogfreq	= 65536,	/* presumably watchdog ticks per second -- confirm */
+	Wdogtime	= 10,	/* seconds, ≤ 15 */
+};
+
+/*
+ * Power management / watchdog registers
+ */
+enum {
+	Rstc		= 0x1c>>2,	/* offsets are >>2: used as u32int indices from POWERREGS */
+		Password	= 0x5A<<24,	/* or'ed into every Rstc and Wdog write below */
+		CfgMask		= 0x03<<4,
+		CfgReset	= 0x02<<4,
+	Rsts		= 0x20>>2,
+	Wdog		= 0x24>>2,
+};
+
+/*
+ * Arm local regs for smp
+ */
+struct Mbox {
+	u32int	doorbell;	/* startcpu writes 1 via set[]; cpustart writes 1 via clr[] */
+	u32int	mbox1;	/* wakecpu writes 1 via set[]; mboxclear writes 1 via clr[] */
+	u32int	mbox2;
+	u32int	startcpu;	/* startcpu stores PADDR(cpureset) here via set[] */
+};
+struct Mboxes {	/* overlaid on ARMLOCAL + Mboxregs */
+	Mbox	set[4];	/* indexed by cpu number */
+	Mbox	clr[4];	/* indexed by cpu number; also read to poll startcpu */
+};
+
+enum {
+	Mboxregs	= 0x80
+};
+
+static Lock startlock[MAXMACH + 1];
+
+void
+archreset(void)	/* also called by cpustart on each secondary cpu */
+{
+	fpon();
+}
+
+void
+archreboot(void)
+{
+	u32int *r;
+
+	r = (u32int*)POWERREGS;
+	r[Wdog] = Password | 1;	/* count of 1 instead of Wdogtime * Wdogfreq */
+	r[Rstc] = Password | (r[Rstc] & ~CfgMask) | CfgReset;	/* same Rstc write as wdogfeed */
+	coherence();
+	for(;;)	/* never returns; presumably the watchdog resets the chip while we spin */
+		;
+}
+
+void
+wdogfeed(void)	/* reload the watchdog; archbcm2link registers this with addclock0link */
+{
+	u32int *r;
+
+	r = (u32int*)POWERREGS;
+	r[Wdog] = Password | (Wdogtime * Wdogfreq);	/* Wdogtime seconds, presumably counted in 1/Wdogfreq sec ticks */
+	r[Rstc] = Password | (r[Rstc] & ~CfgMask) | CfgReset;	/* Cfg field := CfgReset, Password or'ed in */
+}
+
+void
+wdogoff(void)	/* clockshutdown calls this when not a cpuserver */
+{
+	u32int *r;
+
+	r = (u32int*)POWERREGS;
+	r[Rstc] = Password | (r[Rstc] & ~CfgMask);	/* zero the Cfg field, dropping the CfgReset that wdogfeed sets */
+}
+
+
+char *
+cputype2name(char *buf, int size)	/* formats cpu name plus rNpM revision into buf (size bytes); returns buf */
+{
+	u32int r;
+	uint part;
+	char *p;
+
+	r = cpidget();			/* main id register */
+	assert((r >> 24) == 'A');	/* top byte of the id must be 'A' */
+	part = (r >> 4) & MASK(12);	/* part number: bits 4-15 */
+	switch(part){
+	case 0xc07:
+		p = seprint(buf, buf + size, "Cortex-A7");
+		break;
+	case 0xd03:
+		p = seprint(buf, buf + size, "Cortex-A53");
+		break;
+	default:
+		p = seprint(buf, buf + size, "Unknown-%#x", part);
+		break;
+	}
+	seprint(p, buf + size, " r%ldp%ld",	/* append at p: r from bits 20-23, p from bits 0-3 */
+		(r >> 20) & MASK(4), r & MASK(4));
+	return buf;
+}
+
+void
+cpuidprint(void)	/* also called by cpustart on each secondary cpu */
+{
+	char name[64];
+
+	cputype2name(name, sizeof name);
+	delay(50);				/* let uart catch up */
+	print("cpu%d: %dMHz ARM %s\n", m->machno, m->cpumhz, name);
+}
+
+int
+getncpus(void)	/* cpus to use: the smallest of 4, MAXMACH and a positive "*ncpu" setting */
+{
+	int n, max;
+	char *p;
+
+	n = 4;
+	if(n > MAXMACH)
+		n = MAXMACH;
+	p = getconf("*ncpu");
+	if(p && (max = atoi(p)) > 0 && n > max)	/* the setting can only lower n; values <= 0 are ignored */
+		n = max;
+	return n;
+}
+
+static int
+startcpu(uint cpu)	/* returns -1 if cpu's startcpu mailbox is already nonzero, else 0 */
+{
+	Mboxes *mb;
+	int i;
+	void cpureset();	/* secondary cpu entry point, defined in armv7.s */
+
+	mb = (Mboxes*)(ARMLOCAL + Mboxregs);
+	if(mb->clr[cpu].startcpu)
+		return -1;
+	mb->set[cpu].startcpu = PADDR(cpureset);	/* post the physical entry address */
+	coherence();
+	sev();
+	for(i = 0; i < 1000; i++)	/* bounded poll for the mailbox to read back zero */
+		if(mb->clr[cpu].startcpu == 0)
+			return 0;
+	mb->clr[cpu].startcpu = PADDR(cpureset);	/* still set: presumably withdraws the request -- confirm clr semantics */
+	mb->set[cpu].doorbell = 1;	/* fall back to the doorbell; note this path also returns 0 */
+	return 0;
+}
+
+void
+mboxclear(uint cpu)	/* counterpart of wakecpu: writes 1 to cpu's mbox1 in the clr bank */
+{
+	Mboxes *mb;
+
+	mb = (Mboxes*)(ARMLOCAL + Mboxregs);
+	mb->clr[cpu].mbox1 = 1;
+}
+
+void
+wakecpu(uint cpu)	/* writes 1 to cpu's mbox1 in the set bank; mboxclear undoes it */
+{
+	Mboxes *mb;
+
+	mb = (Mboxes*)(ARMLOCAL + Mboxregs);
+	mb->set[cpu].mbox1 = 1;
+}
+
+int
+startcpus(uint ncpu)	/* returns ncpu, or the number of the first cpu that failed to start */
+{
+	int i, timeout;
+
+	for(i = 0; i < ncpu; i++)
+		lock(&startlock[i]);	/* cpustart(i) releases startlock[i] once cpu i is up */
+	cachedwbse(startlock, sizeof startlock);
+	for(i = 1; i < ncpu; i++){	/* cpus 1..ncpu-1 only; startlock[0] is left locked */
+		if(startcpu(i) < 0)
+			return i;
+		timeout = 10000000;
+		while(!canlock(&startlock[i]))	/* spin until cpu i unlocks, or give up */
+			if(--timeout == 0)
+				return i;
+		unlock(&startlock[i]);
+	}
+	return ncpu;
+}
+
+void
+archbcm2link(void)
+{
+	addclock0link(wdogfeed, HZ);	/* presumably runs wdogfeed periodically off the clock -- confirm meaning of the HZ argument */
+}
+
+int
+l2ap(int ap)	/* armv7 variant of the L2AP macro removed from arm.h */
+{
+	return (AP(0, (ap)));	/* only AP field 0; archbcm.c fills all four fields */
+}
+
+void
+cpustart(int cpu)	/* C entry for a secondary cpu; _startpg2 in armv7.s passes the masked cpu id */
+{
+	Mboxes *mb;
+
+	up = nil;
+	machinit();
+	mb = (Mboxes*)(ARMLOCAL + Mboxregs);
+	mb->clr[cpu].doorbell = 1;	/* counterpart of the doorbell write in startcpu */
+	trapinit();
+	clockinit();
+	mmuinit1();
+	timersinit();
+	cpuidprint();
+	archreset();
+	active.machs[m->machno] = 1;
+	unlock(&startlock[cpu]);	/* startcpus is spinning on this lock */
+	schedinit();
+	panic("schedinit returned");	/* reached only if schedinit returns */
+}
--- a/sys/src/9/bcm/arm.h
+++ b/sys/src/9/bcm/arm.h
@@ -1,5 +1,5 @@
 /*
- * arm-specific definitions for armv6
+ * arm-specific definitions for armv6 (arm11), armv7 (cortex-a8 and -a7)
  * these are used in C and assembler
  */
 
@@ -12,6 +12,7 @@
 #define PsrMsvc		0x00000013	/* `protected mode for OS' */
 #define PsrMmon		0x00000016	/* `secure monitor' (trustzone hyper) */
 #define PsrMabt		0x00000017
+#define PsrMhyp		0x0000001A
 #define PsrMund		0x0000001B
 #define PsrMsys		0x0000001F	/* `privileged user mode for OS' (trustzone) */
 #define PsrMask		0x0000001F
@@ -52,9 +53,19 @@
 #define CpTLD		10			/* TLB Lockdown, with op2 */
 #define CpVECS		12			/* vector bases, op1==0, Crm==0, op2s (cortex) */
 #define	CpPID		13			/* Process ID */
+#define	CpTIMER		14			/* Generic timer (cortex-a7) */
 #define CpSPM		15			/* system performance monitor (arm1176) */
 
 /*
+ * CpTIMER op1==0 Crm and opcode2 registers (cortex-a7)
+ */
+#define	CpTIMERcntfrq	0
+#define CpTIMERphys		2
+
+#define CpTIMERphysval	0
+#define CpTIMERphysctl	1
+
+/*
  * CpTTB op1==0, Crm==0 opcode2 values.
  */
 #define CpTTB0		0
@@ -71,6 +82,7 @@
  * CpID Secondary (CRm) registers.
  */
 #define CpIDidct	0
+#define	CpIDfeat	1
 
 /*
  * CpID op1==0 opcode2 fields.
@@ -80,6 +92,7 @@
 #define CpIDct		1			/* cache type */
 #define CpIDtlb		3			/* tlb type (cortex) */
 #define CpIDmpid	5			/* multiprocessor id (cortex) */
+#define	CpIDrevid	6			/* extra revision ID */
 
 /* CpIDid op1 values */
 #define CpIDcsize	1			/* cache size (cortex) */
@@ -133,6 +146,10 @@
 #define CpACasa			(1<<4)	/* enable speculative accesses */
 #define CpACl1pe		(1<<3)	/* l1 cache parity enable */
 #define CpACl2en		(1<<1)	/* l2 cache enable; default 1 */
+
+/* cortex-a7 and cortex-a9 */
+#define CpACsmp			(1<<6)	/* SMP l1 caches coherence; needed for ldrex/strex */
+#define CpACl1pctl		(3<<13)	/* l1 prefetch control */
 /*
  * CpCONTROL Secondary (CRm) registers and opcode2 fields.
  */
@@ -151,9 +168,9 @@
 #define CpCACHEinvd	6			/* data or unified */
 #define CpCACHEinvu	7			/* unified (not on cortex) */
 #define CpCACHEva2pa	8			/* va -> pa translation (cortex) */
-#define CpCACHEwb	10			/* writeback */
-#define CpCACHEinvdse	11			/* data or unified by mva */
-#define CpCACHEwbi	14			/* writeback+invalidate */
+#define CpCACHEwb	10			/* writeback to PoC */
+#define CpCACHEwbu	11			/* writeback to PoU */
+#define CpCACHEwbi	14			/* writeback+invalidate (to PoC) */
 
 #define CpCACHEall	0			/* entire (not for invd nor wb(i) on cortex) */
 #define CpCACHEse	1			/* single entry */
@@ -223,7 +240,7 @@
 #define CpVECSmon	1			/* secure monitor base addr */
 
 /*
- * CpSPM Secondary (CRm) registers and opcode2 fields.
+ * CpSPM Secondary (CRm) registers and opcode2 fields (armv6)
  */
 #define CpSPMperf	12			/* various counters */
 
@@ -239,6 +256,21 @@
 #define CpCACHERANGEdwbi	14		/* writeback+invalidate */
 
 /*
+ * CpTTB cache control bits
+ */
+#define CpTTBnos	(1<<5)	/* only Inner cache shareable */
+#define CpTTBinc	(0<<0|0<<6)	/* inner non-cacheable */
+#define CpTTBiwba	(0<<0|1<<6)	/* inner write-back write-allocate */
+#define CpTTBiwt	(1<<0|0<<6)	/* inner write-through */
+#define CpTTBiwb	(1<<0|1<<6)	/* inner write-back no write-allocate */
+#define CpTTBonc	(0<<3)	/* outer non-cacheable */
+#define CpTTBowba	(1<<3)	/* outer write-back write-allocate */
+#define CpTTBowt	(2<<3)	/* outer write-through */
+#define CpTTBowb	(3<<3)	/* outer write-back no write-allocate */
+#define CpTTBs	(1<<1)	/* page table in shareable memory */
+#define CpTTBbase	~0x7F		/* mask off control bits */
+
+/*
  * MMU page table entries.
  * Mbz (0x10) bit is implementation-defined and must be 0 on the cortex.
  */
@@ -256,6 +288,15 @@
 #define Cached		0x00000008		/* L[12] */
 #define Dom0		0
 
+#define L1wralloc	(1<<12)			/* L1 TEX */
+#define L1sharable	(1<<16)
+#define L2wralloc	(1<<6)			/* L2 TEX (small pages) */
+#define L2sharable	(1<<10)
+
+/* attributes for memory containing locks -- differs between armv6 and armv7 */
+//#define L1ptedramattrs	(Cached | Buffered | L1wralloc | L1sharable)
+//#define L2ptedramattrs	(Cached | Buffered | L2wralloc | L2sharable)
+
 #define Noaccess	0			/* AP, DAC */
 #define Krw		1			/* AP */
 /* armv7 deprecates AP[2] == 1 & AP[1:0] == 2 (Uro), prefers 3 (new in v7) */
@@ -267,7 +308,7 @@
 #define F(v, o, w)	(((v) & ((1<<(w))-1))<<(o))
 #define AP(n, v)	F((v), ((n)*2)+4, 2)
 #define L1AP(ap)	(AP(3, (ap)))
-#define L2AP(ap) (AP(3, (ap))|AP(2, (ap))|AP(1, (ap))|AP(0, (ap))) /* pre-armv7 */
+/* L2AP differs between armv6 and armv7 -- see l2ap in arch*.c */
 #define DAC(n, v)	F((v), (n)*2, 2)
 
 #define HVECTORS	0xffff0000
--- a/sys/src/9/bcm/arm.s
+++ b/sys/src/9/bcm/arm.s
@@ -1,5 +1,5 @@
 /*
- * armv6 machine assist, definitions
+ * armv6/v7 machine assist, definitions
  *
  * loader uses R11 as scratch.
  */
@@ -11,8 +11,6 @@
 
 #define L1X(va)		(((((va))>>20) & 0x0fff)<<2)
 
-#define PTEDRAM		(Dom0|L1AP(Krw)|Section|Cached|Buffered)
-
 /*
  * new instructions
  */
@@ -25,12 +23,32 @@
 	MOVW	$0, R0; \
 	MCR	CpSC, 0, R0, C(CpCACHE), C(CpCACHEwb), CpCACHEwait
 
-#define	BARRIERS	ISB; DSB
+#define	BARRIERS	DSB; ISB
 
 #define MCRR(coproc, op, rd, rn, crm) \
 	WORD $(0xec400000|(rn)<<16|(rd)<<12|(coproc)<<8|(op)<<4|(crm))
+#define MRRC(coproc, op, rd, rn, crm) \
+	WORD $(0xec500000|(rn)<<16|(rd)<<12|(coproc)<<8|(op)<<4|(crm))
+#define MSR(R, rn, m, m1) \
+	WORD $(0xe120f200|(R)<<22|(m1)<<16|(m)<<8|(rn))
 
+#define CPSIE	WORD	$0xf1080080	/* intr enable: zeroes I bit */
+#define CPSID	WORD	$0xf10c0080	/* intr disable: sets I bit */
+
 #define OKAY \
 	MOVW	$0x7E200028,R2; \
 	MOVW	$0x10000,R3; \
 	MOVW	R3,(R2)
+
+#define PUTC(s)
+
+/*
+ * get cpu id, or zero if armv6; final AND.S sets flags, so armstart can BNE when id != 0
+ */
+#define CPUID(r) \
+	MRC	CpSC, 0, r, C(CpID), C(CpIDfeat), 7; \
+	CMP	$0, r; \
+	B.EQ	2(PC); \
+	MRC	CpSC, 0, r, C(CpID), C(CpIDidct), CpIDmpid; \
+	AND.S	$(MAXMACH-1), r
+
--- /dev/null
+++ b/sys/src/9/bcm/armv6.s
@@ -1,0 +1,324 @@
+/*
+ * Broadcom bcm2835 SoC, as used in Raspberry Pi
+ * arm1176jzf-s processor (armv6)
+ */
+
+#include "arm.s"
+
+#define CACHELINESZ 32
+
+TEXT armstart(SB), 1, $-4
+
+	/*
+	 * SVC mode, interrupts disabled
+	 */
+	MOVW	$(PsrDirq|PsrDfiq|PsrMsvc), R1
+	MOVW	R1, CPSR
+
+	/*
+	 * disable the mmu and L1 caches
+	 * invalidate caches and tlb
+	 */
+	MRC	CpSC, 0, R1, C(CpCONTROL), C(0), CpMainctl
+	BIC	$(CpCdcache|CpCicache|CpCpredict|CpCmmu), R1
+	MCR	CpSC, 0, R1, C(CpCONTROL), C(0), CpMainctl
+	MCR	CpSC, 0, R0, C(CpCACHE), C(CpCACHEinvu), CpCACHEall
+	MCR	CpSC, 0, R0, C(CpTLB), C(CpTLBinvu), CpTLBinv
+	ISB
+
+	/*
+	 * clear mach and page tables
+	 */
+	MOVW	$PADDR(MACHADDR), R1
+	MOVW	$PADDR(KTZERO), R2
+_ramZ:
+	MOVW	R0, (R1)
+	ADD	$4, R1
+	CMP	R1, R2
+	BNE	_ramZ
+
+	/*
+	 * start stack at top of mach (physical addr)
+	 * set up page tables for kernel
+	 */
+	MOVW	$PADDR(MACHADDR+MACHSIZE-4), R13
+	MOVW	$PADDR(L1), R0
+	BL	,mmuinit(SB)
+
+	/*
+	 * set up domain access control and page table base
+	 */
+	MOVW	$Client, R1
+	MCR	CpSC, 0, R1, C(CpDAC), C(0)
+	MOVW	$PADDR(L1), R1
+	MCR	CpSC, 0, R1, C(CpTTB), C(0)
+
+	/*
+	 * enable caches, mmu, and high vectors
+	 */
+	MRC	CpSC, 0, R0, C(CpCONTROL), C(0), CpMainctl
+	ORR	$(CpChv|CpCdcache|CpCicache|CpCpredict|CpCmmu), R0
+	MCR	CpSC, 0, R0, C(CpCONTROL), C(0), CpMainctl
+	ISB
+
+	/*
+	 * switch SB, SP, and PC into KZERO space
+	 */
+	MOVW	$setR12(SB), R12
+	MOVW	$(MACHADDR+MACHSIZE-4), R13
+	MOVW	$_startpg(SB), R15
+
+TEXT _startpg(SB), 1, $-4
+
+	/*
+	 * enable cycle counter
+	 */
+	MOVW	$1, R1
+	MCR	CpSC, 0, R1, C(CpSPM), C(CpSPMperf), CpSPMctl
+
+	/*
+	 * call main and loop forever if it returns
+	 */
+	BL	,main(SB)
+	B	,0(PC)
+
+	BL	_div(SB)		/* hack to load _div, etc. */
+
+TEXT cpidget(SB), 1, $-4			/* main ID */
+	MRC	CpSC, 0, R0, C(CpID), C(0), CpIDid
+	RET
+
+TEXT fsrget(SB), 1, $-4				/* data fault status */
+	MRC	CpSC, 0, R0, C(CpFSR), C(0), CpFSRdata
+	RET
+
+TEXT ifsrget(SB), 1, $-4			/* instruction fault status */
+	MRC	CpSC, 0, R0, C(CpFSR), C(0), CpFSRinst
+	RET
+
+TEXT farget(SB), 1, $-4				/* fault address */
+	MRC	CpSC, 0, R0, C(CpFAR), C(0x0)
+	RET
+
+TEXT lcycles(SB), 1, $-4
+	MRC	CpSC, 0, R0, C(CpSPM), C(CpSPMperf), CpSPMcyc
+	RET
+
+TEXT splhi(SB), 1, $-4
+	MOVW	$(MACHADDR+4), R2		/* save caller pc in Mach */
+	MOVW	R14, 0(R2)
+
+	MOVW	CPSR, R0			/* turn off irqs (but not fiqs) */
+	ORR	$(PsrDirq), R0, R1
+	MOVW	R1, CPSR
+	RET
+
+TEXT splfhi(SB), 1, $-4
+	MOVW	$(MACHADDR+4), R2		/* save caller pc in Mach */
+	MOVW	R14, 0(R2)
+
+	MOVW	CPSR, R0			/* turn off irqs and fiqs */
+	ORR	$(PsrDirq|PsrDfiq), R0, R1
+	MOVW	R1, CPSR
+	RET
+
+TEXT splflo(SB), 1, $-4
+	MOVW	CPSR, R0			/* turn on fiqs */
+	BIC	$(PsrDfiq), R0, R1
+	MOVW	R1, CPSR
+	RET
+
+TEXT spllo(SB), 1, $-4
+	MOVW	CPSR, R0			/* turn on irqs and fiqs */
+	BIC	$(PsrDirq|PsrDfiq), R0, R1
+	MOVW	R1, CPSR
+	RET
+
+TEXT splx(SB), 1, $-4
+	MOVW	$(MACHADDR+0x04), R2		/* save caller pc in Mach */
+	MOVW	R14, 0(R2)
+
+	MOVW	R0, R1				/* reset interrupt level */
+	MOVW	CPSR, R0
+	MOVW	R1, CPSR
+	RET
+
+TEXT spldone(SB), 1, $0				/* end marker for devkprof.c */
+	RET
+
+TEXT islo(SB), 1, $-4
+	MOVW	CPSR, R0
+	AND	$(PsrDirq), R0
+	EOR	$(PsrDirq), R0
+	RET
+
+TEXT	tas(SB), $-4
+TEXT	_tas(SB), $-4	/* tas has no body of its own: both names share this code */
+	MOVW	R0,R1	/* R1 = address argument */
+	MOVW	$1,R0	/* value to swap in */
+	SWPW	R0,(R1)			/* fix: deprecated in armv6; armv7.s uses LDREX/STREX */
+	RET
+
+TEXT setlabel(SB), 1, $-4
+	MOVW	R13, 0(R0)		/* sp */
+	MOVW	R14, 4(R0)		/* pc */
+	MOVW	$0, R0
+	RET
+
+TEXT gotolabel(SB), 1, $-4
+	MOVW	0(R0), R13		/* sp */
+	MOVW	4(R0), R14		/* pc */
+	MOVW	$1, R0
+	RET
+
+TEXT getcallerpc(SB), 1, $-4
+	MOVW	0(R13), R0
+	RET
+
+TEXT idlehands(SB), $-4
+	MOVW	CPSR, R3
+	ORR	$(PsrDirq|PsrDfiq), R3, R1		/* splfhi */
+	MOVW	R1, CPSR
+
+	DSB
+	MOVW	nrdy(SB), R0
+	CMP	$0, R0
+	MCR.EQ	CpSC, 0, R0, C(CpCACHE), C(CpCACHEintr), CpCACHEwait
+	DSB
+
+	MOVW	R3, CPSR			/* splx */
+	RET
+
+
+TEXT coherence(SB), $-4
+	BARRIERS
+	RET
+
+/*
+ * invalidate tlb
+ */
+TEXT mmuinvalidate(SB), 1, $-4
+	MOVW	$0, R0
+	MCR	CpSC, 0, R0, C(CpTLB), C(CpTLBinvu), CpTLBinv
+	BARRIERS
+	MCR CpSC, 0, R0, C(CpCACHE), C(CpCACHEinvi), CpCACHEflushbtc
+	RET
+
+/*
+ * mmuinvalidateaddr(va)
+ *   invalidate tlb entry for virtual page address va, ASID 0
+ */
+TEXT mmuinvalidateaddr(SB), 1, $-4
+	MCR	CpSC, 0, R0, C(CpTLB), C(CpTLBinvu), CpTLBinvse
+	BARRIERS
+	RET
+
+/*
+ * drain write buffer
+ * writeback data cache
+ */
+TEXT cachedwb(SB), 1, $-4
+	DSB
+	MOVW	$0, R0
+	MCR	CpSC, 0, R0, C(CpCACHE), C(CpCACHEwb), CpCACHEall
+	RET
+
+/*
+ * drain write buffer
+ * writeback and invalidate data cache
+ */
+TEXT cachedwbinv(SB), 1, $-4
+	DSB
+	MOVW	$0, R0
+	MCR	CpSC, 0, R0, C(CpCACHE), C(CpCACHEwbi), CpCACHEall
+	RET
+
+/*
+ * cachedwbinvse(va, n)
+ *   drain write buffer
+ *   writeback and invalidate data cache range [va, va+n)
+ */
+TEXT cachedwbinvse(SB), 1, $-4
+	MOVW	R0, R1		/* DSB clears R0 */
+	DSB
+	MOVW	n+4(FP), R2
+	ADD	R1, R2
+	SUB	$1, R2
+	BIC	$(CACHELINESZ-1), R1
+	BIC	$(CACHELINESZ-1), R2
+	MCRR(CpSC, 0, 2, 1, CpCACHERANGEdwbi)
+	RET
+
+/*
+ * cachedwbse(va, n)
+ *   drain write buffer
+ *   writeback data cache range [va, va+n)
+ */
+TEXT cachedwbtlb(SB), 1, $-4	/* no body of its own: falls into cachedwbse */
+TEXT cachedwbse(SB), 1, $-4
+
+	MOVW	R0, R1		/* DSB clears R0 */
+	DSB
+	MOVW	n+4(FP), R2
+	ADD	R1, R2	/* NOTE(review): no SUB $1 here, unlike cachedwbinvse and cachedinvse -- confirm intended */
+	BIC	$(CACHELINESZ-1), R1
+	BIC	$(CACHELINESZ-1), R2
+	MCRR(CpSC, 0, 2, 1, CpCACHERANGEdwb)
+	RET
+
+/*
+ * cachedinvse(va, n)
+ *   drain write buffer
+ *   invalidate data cache range [va, va+n)
+ */
+TEXT cachedinvse(SB), 1, $-4
+	MOVW	R0, R1		/* DSB clears R0 */
+	DSB
+	MOVW	n+4(FP), R2
+	ADD	R1, R2
+	SUB	$1, R2
+	BIC	$(CACHELINESZ-1), R1
+	BIC	$(CACHELINESZ-1), R2
+	MCRR(CpSC, 0, 2, 1, CpCACHERANGEinvd)
+	RET
+
+/*
+ * drain write buffer and prefetch buffer
+ * writeback and invalidate data cache
+ * invalidate instruction cache
+ */
+TEXT cacheuwbinv(SB), 1, $-4
+	BARRIERS
+	MOVW	$0, R0
+	MCR	CpSC, 0, R0, C(CpCACHE), C(CpCACHEwbi), CpCACHEall
+	MCR	CpSC, 0, R0, C(CpCACHE), C(CpCACHEinvi), CpCACHEall
+	RET
+
+/*
+ * L2 cache is not enabled
+ */
+TEXT l2cacheuwbinv(SB), 1, $-4
+	RET
+
+/*
+ * invalidate instruction cache
+ */
+TEXT cacheiinv(SB), 1, $-4
+	MOVW	$0, R0
+	MCR	CpSC, 0, R0, C(CpCACHE), C(CpCACHEinvi), CpCACHEall
+	RET
+
+/*
+ * invalidate range of instruction cache
+ */
+TEXT cacheiinvse(SB), 1, $-4
+	MOVW	R0, R1		/* DSB clears R0 */
+	DSB
+	MOVW n+4(FP), R2
+	ADD	R1, R2
+	SUB	$1, R2
+	MCRR(CpSC, 0, 2, 1, CpCACHERANGEinvi)
+	MCR CpSC, 0, R0, C(CpCACHE), C(CpCACHEinvi), CpCACHEflushbtc
+	DSB
+	ISB
+	RET
--- /dev/null
+++ b/sys/src/9/bcm/armv7.s
@@ -1,0 +1,510 @@
+/*
+ * Broadcom bcm2836 SoC, as used in Raspberry Pi 2
+ * 4 x Cortex-A7 processor (armv7)
+ */
+
+#include "arm.s"
+
+#define CACHELINESZ 	64
+#define ICACHELINESZ	32
+
+#undef DSB
+#undef DMB
+#undef ISB
+#define DSB	WORD	$0xf57ff04f	/* data synch. barrier; last f = SY */
+#define DMB	WORD	$0xf57ff05f	/* data mem. barrier; last f = SY */
+#define ISB	WORD	$0xf57ff06f	/* instr. sync. barrier; last f = SY */
+#define WFI	WORD	$0xe320f003	/* wait for interrupt */
+#define WFI_EQ	WORD	$0x0320f003	/* wait for interrupt if eq */
+#define ERET	WORD	$0xe160006e	/* exception return from HYP */
+#define SEV	WORD	$0xe320f004	/* send event */
+
+/* tas/cas strex debugging limits; started at 10000 */
+#define MAXSC 1000000
+
+TEXT armstart(SB), 1, $-4
+
+	/*
+	 * if not cpu0, go to secondary startup
+	 */
+	CPUID(R1)
+	BNE	reset
+
+	/*
+	 * go to SVC mode, interrupts disabled
+	 */
+	BL	svcmode(SB)
+
+	/*
+	 * disable the mmu and caches
+	 */
+	MRC	CpSC, 0, R1, C(CpCONTROL), C(0), CpMainctl
+	BIC	$(CpCdcache|CpCicache|CpCmmu), R1
+	ORR	$(CpCsbo|CpCsw), R1
+	BIC	$CpCsbz, R1
+	MCR	CpSC, 0, R1, C(CpCONTROL), C(0), CpMainctl
+	BARRIERS
+
+	/*
+	 * clear mach and page tables
+	 */
+	MOVW	$PADDR(MACHADDR), R1
+	MOVW	$PADDR(KTZERO), R2
+_ramZ:
+	MOVW	R0, (R1)
+	ADD	$4, R1
+	CMP	R1, R2
+	BNE	_ramZ
+
+	/*
+	 * turn SMP on
+	 * invalidate tlb
+	 */
+	MRC	CpSC, 0, R1, C(CpCONTROL), C(0), CpAuxctl
+	ORR	$CpACsmp, R1		/* turn SMP on */
+	MCR	CpSC, 0, R1, C(CpCONTROL), C(0), CpAuxctl
+	BARRIERS
+	MCR	CpSC, 0, R0, C(CpTLB), C(CpTLBinvu), CpTLBinv
+	BARRIERS
+
+	/*
+	 * start stack at top of mach (physical addr)
+	 * set up page tables for kernel
+	 */
+	MOVW	$PADDR(MACHADDR+MACHSIZE-4), R13
+	MOVW	$PADDR(L1), R0
+	BL	mmuinit(SB)
+
+	/*
+	 * set up domain access control and page table base
+	 */
+	MOVW	$Client, R1
+	MCR	CpSC, 0, R1, C(CpDAC), C(0)
+	MOVW	$PADDR(L1), R1
+	ORR	$(CpTTBs|CpTTBowba|CpTTBiwba), R1
+	MCR	CpSC, 0, R1, C(CpTTB), C(0)
+	MCR	CpSC, 0, R1, C(CpTTB), C(0), CpTTB1	/* cortex has two */
+
+	/*
+	 * invalidate my caches before enabling
+	 */
+	BL	cachedinv(SB)
+	BL	cacheiinv(SB)
+	BL	l2cacheuinv(SB)
+	BARRIERS
+
+	/*
+	 * enable caches, mmu, and high vectors
+	 */
+
+	MRC	CpSC, 0, R0, C(CpCONTROL), C(0), CpMainctl
+	ORR	$(CpChv|CpCdcache|CpCicache|CpCmmu), R0
+	MCR	CpSC, 0, R0, C(CpCONTROL), C(0), CpMainctl
+	BARRIERS
+
+	/*
+	 * switch SB, SP, and PC into KZERO space
+	 */
+	MOVW	$setR12(SB), R12
+	MOVW	$(MACHADDR+MACHSIZE-4), R13
+	MOVW	$_startpg(SB), R15
+
+TEXT _startpg(SB), 1, $-4
+
+	/*
+	 * enable cycle counter
+	 */
+	MOVW	$(1<<31), R1
+	MCR	CpSC, 0, R1, C(CpCLD), C(CpCLDena), CpCLDenacyc
+	MOVW	$1, R1
+	MCR	CpSC, 0, R1, C(CpCLD), C(CpCLDena), CpCLDenapmnc
+
+	/*
+	 * call main and loop forever if it returns
+	 */
+	BL	,main(SB)
+	B	,0(PC)
+
+	BL	_div(SB)		/* hack to load _div, etc. */
+
+/*
+ * startup entry for cpu(s) other than 0
+ */
+TEXT cpureset(SB), 1, $-4
+reset:
+	/*
+	 * load physical base for SB addressing while mmu is off
+	 * keep a handy zero in R0 until first function call
+	 */
+	MOVW	$setR12(SB), R12
+	SUB	$KZERO, R12
+	ADD	$PHYSDRAM, R12
+	MOVW	$0, R0
+
+	/*
+	 * SVC mode, interrupts disabled
+	 */
+	BL	svcmode(SB)
+
+	/*
+	 * disable the mmu and caches
+	 */
+	MRC	CpSC, 0, R1, C(CpCONTROL), C(0), CpMainctl
+	BIC	$(CpCdcache|CpCicache|CpCmmu), R1
+	ORR	$(CpCsbo|CpCsw), R1
+	BIC	$CpCsbz, R1
+	MCR	CpSC, 0, R1, C(CpCONTROL), C(0), CpMainctl
+	BARRIERS
+
+	/*
+	 * turn SMP on
+	 * invalidate tlb
+	 */
+	MRC	CpSC, 0, R1, C(CpCONTROL), C(0), CpAuxctl
+	ORR	$CpACsmp, R1		/* turn SMP on */
+	MCR	CpSC, 0, R1, C(CpCONTROL), C(0), CpAuxctl
+	BARRIERS
+	MCR	CpSC, 0, R0, C(CpTLB), C(CpTLBinvu), CpTLBinv
+	BARRIERS
+
+	/*
+	 * find Mach for this cpu
+	 */
+	MRC	CpSC, 0, R2, C(CpID), C(CpIDidct), CpIDmpid
+	AND	$(MAXMACH-1), R2	/* mask out non-cpu-id bits */
+	SLL	$2, R2			/* convert to word index */
+	MOVW	$machaddr(SB), R0
+	ADD	R2, R0			/* R0 = &machaddr[cpuid] */
+	MOVW	(R0), R0		/* R0 = machaddr[cpuid] */
+	CMP	$0, R0
+	BEQ	0(PC)			/* must not be zero */
+	SUB	$KZERO, R0, R(MACH)	/* m = PADDR(machaddr[cpuid]) */
+
+	/*
+	 * start stack at top of local Mach
+	 */
+	ADD	$(MACHSIZE-4), R(MACH), R13
+
+	/*
+	 * set up domain access control and page table base
+	 */
+	MOVW	$Client, R1
+	MCR	CpSC, 0, R1, C(CpDAC), C(0)
+	MOVW	12(R(MACH)), R1	/* m->mmul1 */
+	SUB	$KZERO, R1		/* phys addr */
+	ORR	$(CpTTBs|CpTTBowba|CpTTBiwba), R1
+	MCR	CpSC, 0, R1, C(CpTTB), C(0)
+	MCR	CpSC, 0, R1, C(CpTTB), C(0), CpTTB1	/* cortex has two */
+
+	/*
+	 * invalidate my caches before enabling
+	 */
+	BL	cachedinv(SB)
+	BL	cacheiinv(SB)
+	BARRIERS
+
+	/*
+	 * enable caches, mmu, and high vectors
+	 */
+	MRC	CpSC, 0, R0, C(CpCONTROL), C(0), CpMainctl
+	ORR	$(CpChv|CpCdcache|CpCicache|CpCmmu), R0
+	MCR	CpSC, 0, R0, C(CpCONTROL), C(0), CpMainctl
+	BARRIERS
+
+	/*
+	 * switch MACH, SB, SP, and PC into KZERO space
+	 */
+	ADD	$KZERO, R(MACH)
+	MOVW	$setR12(SB), R12
+	ADD	$KZERO, R13
+	MOVW	$_startpg2(SB), R15
+
+TEXT _startpg2(SB), 1, $-4
+
+	/*
+	 * enable cycle counter
+	 */
+	MOVW	$(1<<31), R1
+	MCR	CpSC, 0, R1, C(CpCLD), C(CpCLDena), CpCLDenacyc
+	MOVW	$1, R1
+	MCR	CpSC, 0, R1, C(CpCLD), C(CpCLDena), CpCLDenapmnc
+
+	/*
+	 * call cpustart and loop forever if it returns
+	 */
+	MRC	CpSC, 0, R0, C(CpID), C(CpIDidct), CpIDmpid
+	AND	$(MAXMACH-1), R0		/* mask out non-cpu-id bits */
+	BL	,cpustart(SB)
+	B	,0(PC)
+
+/*
+ * get into SVC mode with interrupts disabled
+ * raspberry pi firmware since 29 Sept 2015 starts in HYP mode
+ */
+TEXT svcmode(SB), 1, $-4	/* overwrites R1 and R2 */
+	MOVW	CPSR, R1
+	AND	$PsrMask, R1
+	MOVW	$PsrMhyp, R2
+	CMP	R2, R1	/* is the current mode HYP? */
+	MOVW	$(PsrDirq|PsrDfiq|PsrMsvc), R1	/* new psr: SVC mode, irq and fiq disabled */
+	BNE	nothyp
+	MSR(1, 1, 1, 0xe)	/* MOVW	R1, SPSR_HYP */
+	MSR(0, 14, 1, 0xe)	/* MOVW	R14, ELR_HYP */
+	ERET	/* exception return from HYP: presumably resumes at R14 with the psr loaded above */
+nothyp:
+	MOVW	R1, CPSR
+	RET
+
+TEXT cpidget(SB), 1, $-4			/* main ID */
+	MRC	CpSC, 0, R0, C(CpID), C(0), CpIDid
+	RET
+
+TEXT fsrget(SB), 1, $-4				/* data fault status */
+	MRC	CpSC, 0, R0, C(CpFSR), C(0), CpFSRdata
+	RET
+
+TEXT ifsrget(SB), 1, $-4			/* instruction fault status */
+	MRC	CpSC, 0, R0, C(CpFSR), C(0), CpFSRinst
+	RET
+
+TEXT farget(SB), 1, $-4				/* fault address */
+	MRC	CpSC, 0, R0, C(CpFAR), C(0x0)
+	RET
+
+TEXT cpctget(SB), 1, $-4			/* cache type */
+	MRC	CpSC, 0, R0, C(CpID), C(CpIDidct), CpIDct
+	RET
+
+TEXT lcycles(SB), 1, $-4
+	MRC	CpSC, 0, R0, C(CpCLD), C(CpCLDcyc), 0
+	RET
+
+TEXT splhi(SB), 1, $-4
+	MOVW	R14, 4(R(MACH))		/* save caller pc in m->splpc */
+
+	MOVW	CPSR, R0			/* turn off irqs (but not fiqs) */
+	ORR	$(PsrDirq), R0, R1
+	MOVW	R1, CPSR
+	RET
+
+TEXT splfhi(SB), 1, $-4
+	MOVW	R14, 4(R(MACH))		/* save caller pc in m->splpc */
+
+	MOVW	CPSR, R0			/* turn off irqs and fiqs */
+	ORR	$(PsrDirq|PsrDfiq), R0, R1
+	MOVW	R1, CPSR
+	RET
+
+TEXT splflo(SB), 1, $-4
+	MOVW	CPSR, R0			/* turn on fiqs */
+	BIC	$(PsrDfiq), R0, R1
+	MOVW	R1, CPSR
+	RET
+
+TEXT spllo(SB), 1, $-4
+	MOVW	CPSR, R0			/* turn on irqs and fiqs */
+	MOVW	$0, R1
+	CMP.S	R1, R(MACH)	/* skip the store below if R(MACH) is zero */
+	MOVW.NE	R1, 4(R(MACH))			/* clear m->splpc */
+	BIC	$(PsrDirq|PsrDfiq), R0, R1
+	MOVW	R1, CPSR
+	RET	/* R0 still holds the CPSR read on entry */
+
+TEXT splx(SB), 1, $-4
+	MOVW	R14, 4(R(MACH))		/* save caller pc in m->splpc */
+
+	MOVW	R0, R1				/* reset interrupt level */
+	MOVW	CPSR, R0
+	MOVW	R1, CPSR
+	RET
+
+TEXT spldone(SB), 1, $0				/* end marker for devkprof.c */
+	RET
+
+TEXT islo(SB), 1, $-4
+	MOVW	CPSR, R0
+	AND	$(PsrDirq), R0
+	EOR	$(PsrDirq), R0
+	RET
+
+TEXT cas(SB), $0	/* R0 = address; returns 1 if (R0) equalled ov and nv was stored, else 0 */
+TEXT cmpswap(SB), $0
+	MOVW	ov+4(FP), R1
+	MOVW	nv+8(FP), R2
+spincas:
+	LDREX	(R0), R3
+	CMP.S	R3, R1
+	BNE	fail	/* current value differs from ov */
+	STREX	R2, (R0), R4
+	CMP.S	$0, R4
+	BNE	spincas	/* nonzero R4: strex failed, reload and retry */
+	MOVW	$1, R0
+	DMB
+	RET
+fail:
+	CLREX
+	MOVW	$0, R0
+	RET
+
+TEXT	tas(SB), $-4
+TEXT	_tas(SB), $-4			/* _tas(ulong *) */
+	/* returns old (R0) after modifying (R0) */
+	MOVW	R0,R5	/* R5 = address argument */
+	DMB
+
+	MOVW	$1,R2		/* new value of (R0) */
+	MOVW	$MAXSC, R8	/* retry budget, see MAXSC */
+tas1:
+	LDREX (R5), R7
+	CMP.S	$0, R7		/* old value non-zero (lock taken)? */
+	BNE	lockbusy	/* we lose */
+	SUB.S	$1, R8
+	BEQ	lockloop2	/* budget exhausted: abort */
+	STREX R2,(R5),R4
+	CMP.S	$0, R4
+	BNE	tas1		/* strex failed? try again */
+	DMB
+	B	tas0
+lockloop2:
+	BL	abort(SB)
+lockbusy:
+	CLREX
+tas0:
+	MOVW	R7, R0		/* return old value */
+	RET
+
+TEXT setlabel(SB), 1, $-4
+	MOVW	R13, 0(R0)		/* sp */
+	MOVW	R14, 4(R0)		/* pc */
+	MOVW	$0, R0
+	RET
+
+TEXT gotolabel(SB), 1, $-4
+	MOVW	0(R0), R13		/* sp */
+	MOVW	4(R0), R14		/* pc */
+	MOVW	$1, R0
+	RET
+
+TEXT getcallerpc(SB), 1, $-4
+	MOVW	0(R13), R0
+	RET
+
+TEXT idlehands(SB), $-4
+	MOVW	CPSR, R3	/* saved for the restore at the end */
+	ORR	$(PsrDirq|PsrDfiq), R3, R1		/* splfhi */
+	MOVW	R1, CPSR
+
+	DSB
+	MOVW	nrdy(SB), R0
+	CMP	$0, R0
+	WFI_EQ	/* wait for interrupt only when nrdy == 0 */
+	DSB
+
+	MOVW	R3, CPSR			/* splx */
+	RET
+
+
+TEXT coherence(SB), $-4
+	BARRIERS
+	RET
+
+TEXT sev(SB), $-4
+	SEV
+	RET
+
+/*
+ * invalidate tlb
+ */
+TEXT mmuinvalidate(SB), 1, $-4
+	DSB
+	MOVW	$0, R0
+	MCR	CpSC, 0, R0, C(CpTLB), C(CpTLBinvu), CpTLBinv
+	BARRIERS
+	RET
+
+/*
+ * mmuinvalidateaddr(va)
+ *   invalidate tlb entry for virtual page address va, ASID 0
+ */
+TEXT mmuinvalidateaddr(SB), 1, $-4
+	DSB
+	MCR	CpSC, 0, R0, C(CpTLB), C(CpTLBinvu), CpTLBinvse
+	BARRIERS
+	RET
+
+/*
+ * `single-element' cache operations.
+ * in arm arch v7, if effective to PoC, they operate on all cache levels, so separate
+ * l2 functions are unnecessary.
+ */
+
+TEXT cachedwbse(SB), $-4			/* D writeback SE */
+	MOVW	R0, R2
+
+	MOVW	CPSR, R3
+	CPSID					/* splhi */
+
+	BARRIERS			/* force outstanding stores to cache */
+	MOVW	R2, R0
+	MOVW	4(FP), R1
+	ADD	R0, R1				/* R1 is end address */
+	BIC	$(CACHELINESZ-1), R0		/* cache line start */
+_dwbse:
+	MCR	CpSC, 0, R0, C(CpCACHE), C(CpCACHEwb), CpCACHEse
+	/* can't have a BARRIER here since it zeroes R0 */
+	ADD	$CACHELINESZ, R0
+	CMP.S	R0, R1
+	BGT	_dwbse
+	B	_wait
+
+/*
+ * TLB on armv7 loads from cache, so no need for writeback
+ */
+TEXT cachedwbtlb(SB), $-4
+	DSB
+	ISB
+	RET
+
+TEXT cachedwbinvse(SB), $-4			/* D writeback+invalidate SE */
+	MOVW	R0, R2
+
+	MOVW	CPSR, R3
+	CPSID					/* splhi */
+
+	BARRIERS			/* force outstanding stores to cache */
+	MOVW	R2, R0
+	MOVW	4(FP), R1
+	ADD	R0, R1				/* R1 is end address */
+	BIC	$(CACHELINESZ-1), R0		/* cache line start */
+_dwbinvse:
+	MCR	CpSC, 0, R0, C(CpCACHE), C(CpCACHEwbi), CpCACHEse
+	/* can't have a BARRIER here since it zeroes R0 */
+	ADD	$CACHELINESZ, R0
+	CMP.S	R0, R1
+	BGT	_dwbinvse
+_wait:						/* drain write buffer */
+	BARRIERS
+
+	MOVW	R3, CPSR			/* splx */
+	RET
+
+TEXT cachedinvse(SB), $-4			/* D invalidate SE */
+	MOVW	R0, R2
+
+	MOVW	CPSR, R3
+	CPSID					/* splhi */
+
+	BARRIERS			/* force outstanding stores to cache */
+	MOVW	R2, R0
+	MOVW	4(FP), R1
+	ADD	R0, R1				/* R1 is end address */
+	BIC	$(CACHELINESZ-1), R0		/* cache line start */
+_dinvse:
+	MCR	CpSC, 0, R0, C(CpCACHE), C(CpCACHEinvd), CpCACHEse
+	/* can't have a BARRIER here since it zeroes R0 */
+	ADD	$CACHELINESZ, R0
+	CMP.S	R0, R1
+	BGT	_dinvse
+	B	_wait
+
+#include "cache.v7.s"
--- /dev/null
+++ b/sys/src/9/bcm/cache.v7.s
@@ -1,0 +1,220 @@
+/*
+ * cortex arm arch v7 cache flushing and invalidation
+ * shared by l.s and rebootcode.s
+ */
+
+#define	BPIALL	MCR CpSC, 0, R0, C(CpCACHE), C(5), 6	/* branch predictor invalidate all */
+
+TEXT cacheiinv(SB), $-4				/* I invalidate */
+	DSB
+	MOVW	$0, R0
+	MCR	CpSC, 0, R0, C(CpCACHE), C(CpCACHEinvi), CpCACHEall /* ok on cortex */
+	BPIALL	/* redundant? */
+	DSB
+	ISB
+	RET
+
+TEXT cacheiinvse(SB), $0			/* I invalidate SE */
+	MOVW 4(FP), R1
+	ADD	R0, R1
+	BIC $(ICACHELINESZ - 1), R0
+	DSB
+_iinvse:
+	MCR CpSC, 0, R0, C(CpCACHE), C(CpCACHEinvi), CpCACHEse
+	ADD $ICACHELINESZ, R0
+	CMP.S R0, R1
+	BGT _iinvse
+	BPIALL
+	DSB
+	ISB
+	RET
+
+/*
+ * set/way operators, passed a suitable set/way value in R0.
+ */
+TEXT cachedwb_sw(SB), $-4
+	MCR	CpSC, 0, R0, C(CpCACHE), C(CpCACHEwb), CpCACHEsi
+	RET
+
+TEXT cachedwbinv_sw(SB), $-4
+	MCR	CpSC, 0, R0, C(CpCACHE), C(CpCACHEwbi), CpCACHEsi
+	RET
+
+TEXT cachedinv_sw(SB), $-4
+	MCR	CpSC, 0, R0, C(CpCACHE), C(CpCACHEinvd), CpCACHEsi
+	RET
+
+	/* set cache size select */
+TEXT setcachelvl(SB), $-4
+	MCR	CpSC, CpIDcssel, R0, C(CpID), C(CpIDidct), 0
+	ISB
+	RET
+
+	/* return cache sizes */
+TEXT getwayssets(SB), $-4
+	MRC	CpSC, CpIDcsize, R0, C(CpID), C(CpIDidct), 0
+	RET
+
+/*
+ * l1 cache operations.
+ * l1 and l2 ops are intended to be called from C, thus need save no
+ * caller's regs, only those we need to preserve across calls.
+ */
+
+TEXT cachedwb(SB), $-4
+	MOVW.W	R14, -8(R13)
+	MOVW	$cachedwb_sw(SB), R0
+	MOVW	$1, R8
+	BL	wholecache(SB)
+	MOVW.P	8(R13), R15
+
+TEXT cachedwbinv(SB), $-4
+	MOVW.W	R14, -8(R13)
+	MOVW	$cachedwbinv_sw(SB), R0
+	MOVW	$1, R8
+	BL	wholecache(SB)
+	MOVW.P	8(R13), R15
+
+TEXT cachedinv(SB), $-4
+	MOVW.W	R14, -8(R13)
+	MOVW	$cachedinv_sw(SB), R0
+	MOVW	$1, R8
+	BL	wholecache(SB)
+	MOVW.P	8(R13), R15
+
+TEXT cacheuwbinv(SB), $-4
+	MOVM.DB.W [R14], (R13)	/* save lr on stack */
+	MOVW	CPSR, R1
+	CPSID			/* splhi */
+
+	MOVM.DB.W [R1], (R13)	/* save R1 on stack */
+
+	BL	cachedwbinv(SB)
+	BL	cacheiinv(SB)
+
+	MOVM.IA.W (R13), [R1]	/* restore R1 (saved CPSR) */
+	MOVW	R1, CPSR
+	MOVM.IA.W (R13), [R14]	/* restore lr */
+	RET
+
+/*
+ * l2 cache operations
+ */
+
+TEXT l2cacheuwb(SB), $-4
+	MOVW.W	R14, -8(R13)
+	MOVW	$cachedwb_sw(SB), R0
+	MOVW	$2, R8
+	BL	wholecache(SB)
+	MOVW.P	8(R13), R15
+
+TEXT l2cacheuwbinv(SB), $-4
+	MOVW.W	R14, -8(R13)
+	MOVW	CPSR, R1
+	CPSID			/* splhi */
+
+	MOVM.DB.W [R1], (R13)	/* save R1 on stack */
+
+	MOVW	$cachedwbinv_sw(SB), R0
+	MOVW	$2, R8
+	BL	wholecache(SB)
+	BL	l2cacheuinv(SB)
+
+	MOVM.IA.W (R13), [R1]	/* restore R1 (saved CPSR) */
+	MOVW	R1, CPSR
+	MOVW.P	8(R13), R15
+
+TEXT l2cacheuinv(SB), $-4
+	MOVW.W	R14, -8(R13)
+	MOVW	$cachedinv_sw(SB), R0
+	MOVW	$2, R8
+	BL	wholecache(SB)
+	MOVW.P	8(R13), R15
+
+/*
+ * these shift values are for the Cortex-A8 L1 cache (A=2, L=6) and
+ * the Cortex-A8 L2 cache (A=3, L=6); NOTE(review): confirm they also fit Cortex-A7/A53.
+ * A = log2(# of ways), L = log2(bytes per cache line).
+ * see armv7 arch ref p. 1403.
+ */
+#define L1WAYSH 30
+#define L1SETSH 6
+#define L2WAYSH 29
+#define L2SETSH 6
+
+/*
+ * callers are assumed to be the above l1 and l2 ops.
+ * R0 is the function to call in the innermost loop.
+ * R8 is the cache level (one-origin: 1 or 2).
+ *
+ * initial translation by 5c, then massaged by hand.
+ */
+TEXT wholecache+0(SB), $-4
+	MOVW	R0, R1		/* save argument for inner loop in R1 */
+	SUB	$1, R8		/* convert cache level to zero origin */
+
+	/* we may not have the MMU on yet, so map R1 to PC's space */
+	BIC	$KSEGM,	R1	/* strip segment from address */
+	MOVW	PC, R2		/* get PC's segment ... */
+	AND	$KSEGM, R2
+	ORR	R2, R1		/* combine them */
+
+	/* drain write buffers */
+	BARRIERS
+	MCR	CpSC, 0, R0, C(CpCACHE), C(CpCACHEwb), CpCACHEwait
+	ISB
+
+	MOVW	CPSR, R2
+	MOVM.DB.W [R2,R14], (SP) /* save regs on stack */
+	CPSID			/* splhi to make entire op atomic */
+
+	/* get cache sizes */
+	SLL	$1, R8, R0	/* R0 = (cache - 1) << 1 */
+	MCR	CpSC, CpIDcssel, R0, C(CpID), C(CpIDidct), 0 /* set cache size select */
+	ISB
+	MRC	CpSC, CpIDcsize, R0, C(CpID), C(CpIDidct), 0 /* get cache sizes */
+
+	/* compute # of ways and sets for this cache level */
+	SRA	$3, R0, R5	/* R5 (ways) = R0 >> 3 */
+	AND	$1023, R5	/* R5 = (R0 >> 3) & MASK(10) */
+	ADD	$1, R5		/* R5 (ways) = ((R0 >> 3) & MASK(10)) + 1 */
+
+	SRA	$13, R0, R2	/* R2 = R0 >> 13 */
+	AND	$32767, R2	/* R2 = (R0 >> 13) & MASK(15) */
+	ADD	$1, R2		/* R2 (sets) = ((R0 >> 13) & MASK(15)) + 1 */
+
+	/* precompute set/way shifts for inner loop */
+	CMP	$0, R8		/* cache == 1? */
+	MOVW.EQ	$L1WAYSH, R3 	/* yes */
+	MOVW.EQ	$L1SETSH, R4
+	MOVW.NE	$L2WAYSH, R3	/* no */
+	MOVW.NE	$L2SETSH, R4
+
+	/* iterate over ways */
+	MOVW	$0, R7		/* R7: way */
+outer:
+	/* iterate over sets */
+	MOVW	$0, R6		/* R6: set */
+inner:
+	/* compute set/way register contents */
+	SLL	R3, R7, R0 	/* R0 = way << R3 (L?WAYSH) */
+	ORR	R8<<1, R0	/* R0 = way << L?WAYSH | (cache - 1) << 1 */
+	ORR	R6<<R4, R0 	/* R0 = way<<L?WAYSH | (cache-1)<<1 |set<<R4 */
+
+	BL	(R1)		/* call set/way operation with R0 */
+
+	ADD	$1, R6		/* set++ */
+	CMP	R2, R6		/* set >= sets? */
+	BLT	inner		/* no, do next set */
+
+	ADD	$1, R7		/* way++ */
+	CMP	R5, R7		/* way >= ways? */
+	BLT	outer		/* no, do next way */
+
+	MOVM.IA.W (SP), [R2,R14] /* restore regs */
+	MOVW	R2, CPSR	/* splx */
+
+	/* drain write buffers */
+	MCR	CpSC, 0, R0, C(CpCACHE), C(CpCACHEwb), CpCACHEwait
+	ISB
+	RET
--- a/sys/src/9/bcm/clock.c
+++ b/sys/src/9/bcm/clock.c
@@ -1,11 +1,13 @@
 /*
- * bcm2835 timers
+ * bcm283[56] timers
  *	System timers run at 1MHz (timers 1 and 2 are used by GPU)
  *	ARM timer usually runs at 250MHz (may be slower in low power modes)
  *	Cycle counter runs at 700MHz (unless overclocked)
  *    All are free-running up-counters
+ *  Cortex-a7 has local generic timers per cpu (which we run at 1MHz)
  *
  * Use system timer 3 (64 bits) for hzclock interrupts and fastticks
+ *   For smp on bcm2836, use local generic timer for interrupts on cpu1-3
  * Use ARM timer (32 bits) for perfticks
  * Use ARM timer to force immediate interrupt
  * Use cycle counter for cycles()
@@ -17,14 +19,21 @@
 #include "dat.h"
 #include "fns.h"
 #include "io.h"
+#include "ureg.h"
+#include "arm.h"
 
 enum {
 	SYSTIMERS	= VIRTIO+0x3000,
 	ARMTIMER	= VIRTIO+0xB400,
 
+	Localctl	= 0x00,
+	Prescaler	= 0x08,
+	Localintpending	= 0x60,
+
 	SystimerFreq	= 1*Mhz,
 	MaxPeriod	= SystimerFreq / HZ,
-	MinPeriod	= SystimerFreq / (100*HZ),
+	MinPeriod	= 10,
+
 };
 
 typedef struct Systimers Systimers;
@@ -64,6 +73,11 @@
 	TmrPrescale256	= 0x02<<2,
 	CntWidth16	= 0<<1,
 	CntWidth32	= 1<<1,
+
+	/* generic timer (cortex-a7) */
+	Enable	= 1<<0,
+	Imask	= 1<<1,
+	Istatus = 1<<2,
 };
 
 static void
@@ -71,6 +85,8 @@
 {
 	Systimers *tn;
 
+	if(m->machno != 0)
+		panic("cpu%d: unexpected system timer interrupt", m->machno);
 	tn = (Systimers*)SYSTIMERS;
 	/* dismiss interrupt */
 	tn->cs = 1<<3;
@@ -77,6 +93,15 @@
 	timerintr(ureg, 0);
 }
 
+static void
+localclockintr(Ureg *ureg, void *)
+{
+	if(m->machno == 0)
+		panic("cpu0: Unexpected local generic timer interrupt");
+	cpwrsc(0, CpTIMER, CpTIMERphys, CpTIMERphysctl, Imask|Enable);
+	timerintr(ureg, 0);
+}
+
 void
 clockshutdown(void)
 {
@@ -84,6 +109,10 @@
 
 	tm = (Armtimer*)ARMTIMER;
 	tm->ctl = 0;
+	if(cpuserver)
+		wdogfeed();
+	else
+		wdogoff();
 }
 
 void
@@ -93,12 +122,16 @@
 	Armtimer *tm;
 	u32int t0, t1, tstart, tend;
 
-	tn = (Systimers*)SYSTIMERS;
-	tm = (Armtimer*)ARMTIMER;
-	tm->load = 0;
-	tm->ctl = TmrPrescale1|CntEnable|CntWidth32;
-	coherence();
+	if(((cprdsc(0, CpID, CpIDfeat, 1) >> 16) & 0xF) != 0) {
+		/* generic timer supported */
+		if(m->machno == 0){
+			*(ulong*)(ARMLOCAL + Localctl) = 0;				/* input clock is 19.2Mhz crystal */
+			*(ulong*)(ARMLOCAL + Prescaler) = 0x06aaaaab;	/* divide by (2^31/Prescaler) for 1Mhz */
+		}
+		cpwrsc(0, CpTIMER, CpTIMERphys, CpTIMERphysctl, Imask);
+	}
 
+	tn = (Systimers*)SYSTIMERS;
 	tstart = tn->clo;
 	do{
 		t0 = lcycles();
@@ -111,9 +144,14 @@
 	m->cpuhz = 100 * t1;
 	m->cpumhz = (m->cpuhz + Mhz/2 - 1) / Mhz;
 	m->cyclefreq = m->cpuhz;
-
-	tn->c3 = tn->clo - 1;
-	intrenable(IRQtimer3, clockintr, nil, 0, "clock");
+	if(m->machno == 0){
+		tn->c3 = tn->clo - 1;
+		tm = (Armtimer*)ARMTIMER;
+		tm->load = 0;
+		tm->ctl = TmrPrescale1|CntEnable|CntWidth32;
+		intrenable(IRQtimer3, clockintr, nil, 0, "clock");
+	}else
+		intrenable(IRQcntpns, localclockintr, nil, 0, "clock");
 }
 
 void
@@ -120,16 +158,22 @@
 timerset(uvlong next)
 {
 	Systimers *tn;
-	vlong now, period;
+	uvlong now;
+	long period;
 
-	tn = (Systimers*)SYSTIMERS;
 	now = fastticks(nil);
-	period = next - fastticks(nil);
+	period = next - now;
 	if(period < MinPeriod)
-		next = now + MinPeriod;
+		period = MinPeriod;
 	else if(period > MaxPeriod)
-		next = now + MaxPeriod;
-	tn->c3 = (ulong)next;
+		period = MaxPeriod;
+	if(m->machno > 0){
+		cpwrsc(0, CpTIMER, CpTIMERphys, CpTIMERphysval, period);
+		cpwrsc(0, CpTIMER, CpTIMERphys, CpTIMERphysctl, Enable);
+	}else{
+		tn = (Systimers*)SYSTIMERS;
+		tn->c3 = tn->clo + period;
+	}
 }
 
 uvlong
@@ -137,16 +181,17 @@
 {
 	Systimers *tn;
 	ulong lo, hi;
+	uvlong now;
 
-	tn = (Systimers*)SYSTIMERS;
 	if(hz)
 		*hz = SystimerFreq;
+	tn = (Systimers*)SYSTIMERS;
 	do{
 		hi = tn->chi;
 		lo = tn->clo;
 	}while(tn->chi != hi);
-	m->fastclock = (uvlong)hi<<32 | lo;
-	return m->fastclock;
+	now = (uvlong)hi<<32 | lo;
+	return now;
 }
 
 ulong
@@ -172,7 +217,6 @@
 		tm->ctl &= ~(TmrEnable|TmrIntEnable);
 		tm->irq = 1;
 	}
-	coherence();
 }
 
 ulong
@@ -180,7 +224,7 @@
 {
 	if(SystimerFreq != 1*Mhz)
 		return fastticks2us(fastticks(nil));
-	return fastticks(nil);
+	return ((Systimers*)SYSTIMERS)->clo;
 }
 
 void
@@ -189,8 +233,8 @@
 	Systimers *tn;
 	u32int now, diff;
 
-	tn = (Systimers*)SYSTIMERS;
 	diff = n + 1;
+	tn = (Systimers*)SYSTIMERS;
 	now = tn->clo;
 	while(tn->clo - now < diff)
 		;
--- a/sys/src/9/bcm/dat.h
+++ b/sys/src/9/bcm/dat.h
@@ -27,6 +27,7 @@
 typedef struct PMMU	PMMU;
 typedef struct Proc	Proc;
 typedef u32int		PTE;
+typedef struct Soc	Soc;
 typedef struct Uart	Uart;
 typedef struct Ureg	Ureg;
 typedef uvlong		Tval;
@@ -214,7 +215,7 @@
 typedef void		KMap;
 #define	VA(k)		((uintptr)(k))
 #define	kmap(p)		(KMap*)((p)->pa|kseg0)
-#define	kunmap(k)
+extern void kunmap(KMap*);
 
 struct
 {
@@ -279,3 +280,29 @@
 	Devport	*ports;			/* The ports themselves */
 };
 
+struct Soc {			/* SoC dependent configuration */
+	ulong	dramsize;
+	uintptr	physio;
+	uintptr	busdram;
+	uintptr	busio;
+	uintptr	armlocal;
+	u32int	l1ptedramattrs;
+	u32int	l2ptedramattrs;
+};
+extern Soc soc;
+
+#define BUSUNKNOWN (-1)	/* parenthesized so it is safe inside any expression */
+
+/*
+ * GPIO
+ */
+enum {
+	Input	= 0x0,
+	Output	= 0x1,
+	Alt0	= 0x4,
+	Alt1	= 0x5,
+	Alt2	= 0x6,
+	Alt3	= 0x7,
+	Alt4	= 0x3,
+	Alt5	= 0x2,
+};
--- a/sys/src/9/bcm/devarch.c
+++ b/sys/src/9/bcm/devarch.c
@@ -150,9 +150,10 @@
 static long
 cputyperead(Chan*, void *a, long n, vlong offset)
 {
-	char str[128];
+	char name[64], str[128];
 
-	snprint(str, sizeof str, "ARM11 %d\n", m->cpumhz);
+	cputype2name(name, sizeof name);
+	snprint(str, sizeof str, "ARM %s %d\n", name, m->cpumhz);
 	return readstr(offset, a, n, str);
 }
 
@@ -159,8 +160,9 @@
 static long
 cputempread(Chan*, void *a, long n, vlong offset)
 {
- 	char str[128];
- 	snprint(str, sizeof str, "%d±%d\n", gettemp(0) / 1000, 1);
+	char str[16];
+
+	snprint(str, sizeof str, "%ud\n", (getcputemp()+500)/1000);
 	return readstr(offset, a, n, str);
 }
 
--- a/sys/src/9/bcm/devgpio.c
+++ b/sys/src/9/bcm/devgpio.c
@@ -232,110 +232,7 @@
 	}
 }
 
-// stolen from uartmini.c
-#define GPIOREGS	(VIRTIO+0x200000)
-/* GPIO regs */
-enum {
-	Fsel0	= 0x00>>2,
-		FuncMask= 0x7,
-	Set0	= 0x1c>>2,
-	Clr0	= 0x28>>2,
-	Lev0	= 0x34>>2,
-	Evds0	= 0x40>>2,
-	Redge0	= 0x4C>>2,
-	Fedge0	= 0x58>>2,
-	Hpin0	= 0x64>>2,
-	Lpin0	= 0x70>>2,
-	ARedge0	= 0x7C>>2,
-	AFedge0	= 0x88>2,
-	PUD	= 0x94>>2,
-	PUDclk0	= 0x98>>2,
-	PUDclk1	= 0x9c>>2,
-};
-
 static void
-gpiofuncset(uint pin, int func)
-{	
-	u32int *gp, *fsel;
-	int off;
-
-	gp = (u32int*)GPIOREGS;
-	fsel = &gp[Fsel0 + pin/10];
-	off = (pin % 10) * 3;
-	*fsel = (*fsel & ~(FuncMask<<off)) | func<<off;
-}
-
-static int
-gpiofuncget(uint pin)
-{	
-	u32int *gp, *fsel;
-	int off;
-
-	gp = (u32int*)GPIOREGS;
-	fsel = &gp[Fsel0 + pin/10];
-	off = (pin % 10) * 3;
-	return ((*fsel >> off) & FuncMask);
-}
-
-static void
-gpiopullset(uint pin, int state)
-{
-	u32int *gp, *reg;
-	u32int mask;
-
-	gp = (u32int*)GPIOREGS;
-	reg = &gp[PUDclk0 + pin/32];
-	mask = 1 << (pin % 32);
-	gp[PUD] = state;
-	microdelay(1);
-	*reg = mask;
-	microdelay(1);
-	*reg = 0;
-}
-
-static void
-gpioout(uint pin, int set)
-{
-	u32int *gp;
-	int v;
-
-	gp = (u32int*)GPIOREGS;
-	v = set? Set0 : Clr0;
-	gp[v + pin/32] = 1 << (pin % 32);
-}
-
-static int
-gpioin(uint pin)
-{
-	u32int *gp;
-
-	gp = (u32int*)GPIOREGS;
-	return (gp[Lev0 + pin/32] & (1 << (pin % 32))) != 0;
-}
-
-static void
-gpioevent(uint pin, int event, int enable)
-{
-	u32int *gp, *field;
-	int reg = 0;
-	
-	switch(event)
-	{
-		case Erising:
-			reg = Redge0;
-			break;
-		case Efalling:
-			reg = Fedge0;
-			break;
-		default:
-			panic("gpio: unknown event type");
-	}
-	gp = (u32int*)GPIOREGS;
-	field = &gp[reg + pin/32];
-	SET_BIT(field, pin, enable);
-}
-
-static void
 mkdeventry(Chan *c, Qid qid, Dirtab *tab, Dir *db)
 {
 	mkqid(&qid, tab->qid.path, tab->qid.vers, tab->qid.type);
@@ -417,13 +314,8 @@
 interrupt(Ureg*, void *)
 {
 	
-	u32int *gp, *field;
-	char pin;
+	uint pin;
 	
-	gp = (u32int*)GPIOREGS;
-
-	int set;
-
 	coherence();
 	
 	eventvalue = 0;
@@ -430,14 +322,8 @@
 	
 	for(pin = 0; pin < PIN_TABLE_SIZE; pin++)
 	{
-		set = (gp[Evds0 + pin/32] & (1 << (pin % 32))) != 0;
-
-		if(set)
-		{
-			field = &gp[Evds0 + pin/32];
-			SET_BIT(field, pin, 1);
+		if(gpiogetevent(pin))
 			SET_BIT(&eventvalue, pin, 1);
-		}
 	}
 	coherence();
 
@@ -447,7 +333,8 @@
 static void
 gpioinit(void)
 {
-	boardrev = getrevision() & 0xff;
+	gpiomeminit();
+	boardrev = getboardrev() & 0xff;
 	pinscheme = Qboard;
 	intrenable(49, interrupt, nil, 0, "gpio1");
 }
@@ -676,7 +563,7 @@
 			{
 				if(strncmp(funcname[i], arg, strlen(funcname[i])) == 0)
 				{
-					gpiofuncset(pin, i);
+					gpiosel(pin, i);
 					break;
 				}
 			}
@@ -691,7 +578,7 @@
 			{
 				if(strncmp(pudname[i], arg, strlen(pudname[i])) == 0)
 				{
-					gpiopullset(pin, i);
+					gpiopull(pin, i);
 					break;
 				}
 			}
@@ -707,7 +594,7 @@
 			{
 				if(strncmp(evtypename[i], arg, strlen(evtypename[i])) == 0)
 				{
-					gpioevent(pin, i, (cb->f[2][0] == 'e'));
+					gpioselevent(pin, i, (cb->f[2][0] == 'e'));
 					break;
 				}
 			}
--- a/sys/src/9/bcm/dma.c
+++ b/sys/src/9/bcm/dma.c
@@ -25,7 +25,7 @@
 enum {
 	Nchan		= 7,		/* number of dma channels */
 	Regsize		= 0x100,	/* size of regs for each chan */
-	Cbalign		= 32,		/* control block byte alignment */
+	Cbalign		= 64,		/* control block byte alignment (allow for 64-byte cache on bcm2836) */
 	Dbg		= 0,
 	
 	/* registers for each dma controller */
@@ -97,6 +97,18 @@
 static Ctlr dma[Nchan];
 static u32int *dmaregs = (u32int*)DMAREGS;
 
+uintptr
+dmaaddr(void *va)
+{
+	return soc.busdram | (PTR2UINT(va) & ~KSEGM);
+}
+
+static uintptr
+dmaioaddr(void *va)
+{
+	return soc.busio | (PTR2UINT(va) & ~VIRTIO);
+}
+
 static void
 dump(char *msg, uchar *p, int n)
 {
@@ -146,7 +158,7 @@
 		ctlr->regs = (u32int*)(DMAREGS + chan*Regsize);
 		ctlr->cb = xspanalloc(sizeof(Cb), Cbalign, 0);
 		assert(ctlr->cb != nil);
-		dmaregs[Enable] |= 1 << chan;
+		dmaregs[Enable] |= 1<<chan;
 		ctlr->regs[Cs] = Reset;
 		while(ctlr->regs[Cs] & Reset)
 			;
@@ -156,26 +168,26 @@
 	ti = 0;
 	switch(dir){
 	case DmaD2M:
-		cachedwbinvse(dst, len);
+		cachedinvse(dst, len);
 		ti = Srcdreq | Destinc;
-		cb->sourcead = DMAIO(src);
-		cb->destad = DMAADDR(dst);
+		cb->sourcead = dmaioaddr(src);
+		cb->destad = dmaaddr(dst);
 		break;
 	case DmaM2D:
 		cachedwbse(src, len);
 		ti = Destdreq | Srcinc;
-		cb->sourcead = DMAADDR(src);
-		cb->destad = DMAIO(dst);
+		cb->sourcead = dmaaddr(src);
+		cb->destad = dmaioaddr(dst);
 		break;
 	case DmaM2M:
 		cachedwbse(src, len);
-		cachedwbinvse(dst, len);
+		cachedinvse(dst, len);
 		ti = Srcinc | Destinc;
-		cb->sourcead = DMAADDR(src);
-		cb->destad = DMAADDR(dst);
+		cb->sourcead = dmaaddr(src);
+		cb->destad = dmaaddr(dst);
 		break;
 	}
-	cb->ti = ti | dev << Permapshift | Inten;
+	cb->ti = ti | dev<<Permapshift | Inten;
 	cb->txfrlen = len;
 	cb->stride = 0;
 	cb->nextconbk = 0;
@@ -182,7 +194,7 @@
 	cachedwbse(cb, sizeof(Cb));
 	ctlr->regs[Cs] = 0;
 	microdelay(1);
-	ctlr->regs[Conblkad] = DMAADDR(cb);
+	ctlr->regs[Conblkad] = dmaaddr(cb);
 	DBG print("dma start: %ux %ux %ux %ux %ux %ux\n",
 		cb->ti, cb->sourcead, cb->destad, cb->txfrlen,
 		cb->stride, cb->nextconbk);
--- a/sys/src/9/bcm/dwcotg.h
+++ b/sys/src/9/bcm/dwcotg.h
@@ -434,8 +434,8 @@
 		Episo		= 1<<18,
 		Epbulk		= 2<<18,
 		Epintr		= 3<<18,
-	Multicnt	= 0x3<<20,	/* transactions per μframe or retries */
-					/* per periodic split */
+	Multicnt	= 0x3<<20,	/* transactions per μframe */
+					/* or retries per periodic split */
 		OMulticnt	= 20,
 	Devaddr		= 0x7f<<22,	/* device address */
 		ODevaddr	= 22,
--- a/sys/src/9/bcm/fns.h
+++ b/sys/src/9/bcm/fns.h
@@ -10,6 +10,9 @@
 extern void cachedwbinvse(void*, int);
 extern void cacheiinv(void);
 extern void cacheuwbinv(void);
+extern void cachedwbtlb(void*, int);
+extern void cacheiinvse(void*, int);
+extern void cachedinvse(void*, int);
 extern uintptr cankaddr(uintptr pa);
 extern int cas32(void*, u32int, u32int);
 extern void checkmmu(uintptr, uintptr);
@@ -20,11 +23,13 @@
 extern ulong cprd(int cp, int op1, int crn, int crm, int op2);
 extern ulong cprdsc(int op1, int crn, int crm, int op2);
 extern void cpuidprint(void);
+extern u32int cpidget(void);
 extern void cpwr(int cp, int op1, int crn, int crm, int op2, ulong val);
 extern void cpwrsc(int op1, int crn, int crm, int op2, ulong val);
 #define cycles(ip) *(ip) = lcycles()
 extern void dmastart(int, int, int, void*, void*, int);
 extern int dmawait(int);
+extern uintptr dmaaddr(void *va);
 extern int fbblank(int);
 extern void* fbinit(int, int*, int*, int*);
 extern u32int farget(void);
@@ -41,13 +46,26 @@
 extern uint getfirmware(void);
 extern int getpower(int);
 extern void getramsize(Confmem*);
+extern int getncpus(void);
+extern void gpiosel(uint, int);
+extern void gpiopull(uint, int);
+extern void gpiopullup(uint);
+extern void gpiopulloff(uint);
+extern void gpiopulldown(uint);
+extern void gpioout(uint, int);
+extern int gpioin(uint);
+extern void gpioselevent(uint, int, int);
+extern int gpiogetevent(uint);
+extern void gpiomeminit(void);
 extern u32int ifsrget(void);
+extern void intrcpushutdown(void);
 extern void irqenable(int, void (*)(Ureg*, void*), void*);
 #define intrenable(i, f, a, b, n) irqenable((i), (f), (a))
 extern void intrsoff(void);
 extern int isaconfig(char*, int, ISAConf*);
+extern void l2cacheuwbinv(void);
 extern void links(void);
-extern void mmuinit(void);
+extern void mmuinit(void*);
 extern void mmuinit1(void);
 extern void mmuinvalidate(void);
 extern void mmuinvalidateaddr(u32int);
@@ -58,8 +76,10 @@
 extern void procfork(Proc*);
 extern void procsetup(Proc*);
 extern void screeninit(void);
+extern void setclkrate(int, ulong);
 extern void setpower(int, int);
 extern void setr13(int, u32int*);
+extern int startcpus(uint);
 extern int splfhi(void);
 extern int splflo(void);
 extern int tas(void *);
@@ -68,9 +88,17 @@
 extern void uartconsinit(void);
 extern int userureg(Ureg*);
 extern void vectors(void);
+extern void vgpinit(void);
+extern void vgpset(uint, int);
 extern void vtable(void);
-extern uint gettemp(int);
-extern uint getrevision(void);
+extern void wdogoff(void);
+extern void wdogfeed(void);
+extern void vtable(void);
+extern int l2ap(int);
+extern uint getcputemp(void);
+extern char *cputype2name(char *buf, int size);
+extern uint getboardrev(void);
+extern void sev(void);
 
 /*
  * floating point emulation
--- /dev/null
+++ b/sys/src/9/bcm/gpio.c
@@ -1,0 +1,142 @@
+/*
+ * Raspberry Pi GPIO support
+ */
+
+#include "u.h"
+#include "../port/lib.h"
+#include "../port/error.h"
+#include "mem.h"
+#include "dat.h"
+#include "fns.h"
+#include "io.h"
+
+#define GPIOREGS	(VIRTIO+0x200000)
+
+/* GPIO regs */
+enum {
+	Fsel0	= 0x00>>2,
+		FuncMask= 0x7,
+	Set0	= 0x1c>>2,
+	Clr0	= 0x28>>2,
+	Lev0	= 0x34>>2,
+	Evds0	= 0x40>>2,
+	Redge0	= 0x4C>>2,
+	Fedge0	= 0x58>>2,
+	Hpin0	= 0x64>>2,
+	Lpin0	= 0x70>>2,
+	ARedge0	= 0x7C>>2,
+	AFedge0	= 0x88>>2,
+	PUD	= 0x94>>2,
+		Off	= 0x0,
+		Pulldown= 0x1,
+		Pullup	= 0x2,
+	PUDclk0	= 0x98>>2,
+	PUDclk1	= 0x9c>>2,
+};
+
+void
+gpiosel(uint pin, int func)
+{	
+	u32int *gp, *fsel;
+	int off;
+
+	gp = (u32int*)GPIOREGS;
+	fsel = &gp[Fsel0 + pin/10];
+	off = (pin % 10) * 3;
+	*fsel = (*fsel & ~(FuncMask<<off)) | func<<off;
+}
+
+void
+gpiopull(uint pin, int func)
+{
+	u32int *gp, *reg;
+	u32int mask;
+
+	gp = (u32int*)GPIOREGS;
+	reg = &gp[PUDclk0 + pin/32];
+	mask = 1 << (pin % 32);
+	gp[PUD] = func;
+	microdelay(1);
+	*reg = mask;
+	microdelay(1);
+	*reg = 0;
+}
+
+void
+gpiopulloff(uint pin)
+{
+	gpiopull(pin, Off);
+}
+
+void
+gpiopullup(uint pin)
+{
+	gpiopull(pin, Pullup);
+}
+
+void
+gpiopulldown(uint pin)
+{
+	gpiopull(pin, Pulldown);
+}
+
+void
+gpioout(uint pin, int set)
+{
+	u32int *gp;
+	int v;
+
+	gp = (u32int*)GPIOREGS;
+	v = set? Set0 : Clr0;
+	gp[v + pin/32] = 1 << (pin % 32);
+}
+
+int
+gpioin(uint pin)
+{
+	u32int *gp;
+
+	gp = (u32int*)GPIOREGS;
+	return (gp[Lev0 + pin/32] & (1 << (pin % 32))) != 0;
+}
+
+void
+gpioselevent(uint pin, int falling, int enable)
+{
+	u32int *gp, *field;
+	int reg;
+
+	enable = enable != 0;	/* normalize to 0 or 1 */
+	if(falling)
+		reg = Fedge0;
+	else
+		reg = Redge0;
+	gp = (u32int*)GPIOREGS;
+	field = &gp[reg + pin/32];	/* 32 pins per register */
+	*field = (*field & ~(1<<(pin%32))) | (enable<<(pin%32));	/* always clear the pin's bit, set it again only if enabling */
+}
+
+int
+gpiogetevent(uint pin)
+{
+	u32int *gp, *reg, val;
+
+	gp = (u32int*)GPIOREGS;
+	reg = &gp[Evds0 + pin/32];
+	val = *reg & (1 << (pin % 32));
+	*reg = val;	/* status bits are write-1-to-clear: ack this pin only, not every pending event in the bank */
+	return val != 0;
+}
+
+void
+gpiomeminit(void)
+{
+	Physseg seg;
+
+	memset(&seg, 0, sizeof seg);
+	seg.attr = SG_PHYSICAL;
+	seg.name = "gpio";
+	seg.pa = GPIOREGS;
+	seg.size = BY2PG;
+	addphysseg(&seg);
+}
--- a/sys/src/9/bcm/io.h
+++ b/sys/src/9/bcm/io.h
@@ -8,11 +8,23 @@
 	IRQdma0		= 16,
 #define IRQDMA(chan)	(IRQdma0+(chan))
 	IRQaux		= 29,
+	IRQi2c		= 53,
+	IRQspi		= 54,
+	IRQsdhost	= 56,
 	IRQmmc		= 62,
 
 	IRQbasic	= 64,
 	IRQtimerArm	= IRQbasic + 0,
 
+	IRQlocal	= 96,
+	IRQcntps	= IRQlocal + 0,
+	IRQcntpns	= IRQlocal + 1,
+	IRQmbox0	= IRQlocal + 4,
+	IRQmbox1	= IRQlocal + 5,
+	IRQmbox2	= IRQlocal + 6,
+	IRQmbox3	= IRQlocal + 7,
+	IRQlocaltmr	= IRQlocal + 11,
+
 	IRQfiq		= IRQusb,	/* only one source can be FIQ */
 
 	DmaD2M		= 0,		/* device to memory */
@@ -20,7 +32,14 @@
 	DmaM2M		= 2,		/* memory to memory */
 
 	DmaChanEmmc	= 4,		/* can only use 2-5, maybe 0 */
+	DmaChanSdhost	= 5,
+	DmaChanSpiTx= 2,
+	DmaChanSpiRx= 0,
+
+	DmaDevSpiTx	= 6,
+	DmaDevSpiRx	= 7,
 	DmaDevEmmc	= 11,
+	DmaDevSdhost	= 13,
 
 	PowerSd		= 0,
 	PowerUart0,
@@ -43,4 +62,3 @@
 	ClkPixel,
 	ClkPwm,
 };
-#define BUSUNKNOWN	(-1)
--- a/sys/src/9/bcm/l.s
+++ b/sys/src/9/bcm/l.s
@@ -1,10 +1,14 @@
 /*
- * Broadcom bcm2835 SoC, as used in Raspberry Pi
- * arm1176jzf-s processor (armv6)
+ * Common startup for armv6 and armv7
+ * The rest of l.s has been moved to armv[67].s
  */
 
 #include "arm.s"
 
+/*
+ * on bcm2836, only cpu0 starts here
+ * other cpus enter at cpureset in armv7.s
+ */
 TEXT _start(SB), 1, $-4
 	/*
 	 * load physical base for SB addressing while mmu is off
@@ -16,259 +20,14 @@
 	MOVW	$0, R0
 
 	/*
-	 * SVC mode, interrupts disabled
-	 */
-	MOVW	$(PsrDirq|PsrDfiq|PsrMsvc), R1
-	MOVW	R1, CPSR
-
-	/*
-	 * disable the mmu and L1 caches
-	 * invalidate caches and tlb
-	 */
-	MRC	CpSC, 0, R1, C(CpCONTROL), C(0), CpMainctl
-	BIC	$(CpCdcache|CpCicache|CpCpredict|CpCmmu), R1
-	MCR	CpSC, 0, R1, C(CpCONTROL), C(0), CpMainctl
-	MCR	CpSC, 0, R0, C(CpCACHE), C(CpCACHEinvu), CpCACHEall
-	MCR	CpSC, 0, R0, C(CpTLB), C(CpTLBinvu), CpTLBinv
-	ISB
-
-	/*
-	 * clear mach and page tables
-	 */
-	MOVW	$PADDR(MACHADDR), R1
-	MOVW	$PADDR(KTZERO), R2
-_ramZ:
-	MOVW	R0, (R1)
-	ADD	$4, R1
-	CMP	R1, R2
-	BNE	_ramZ
-
-	/*
 	 * start stack at top of mach (physical addr)
-	 * set up page tables for kernel
 	 */
 	MOVW	$PADDR(MACHADDR+MACHSIZE-4), R13
-	BL	,mmuinit(SB)
 
 	/*
-	 * set up domain access control and page table base
+	 * do arch-dependent startup (no return)
 	 */
-	MOVW	$Client, R1
-	MCR	CpSC, 0, R1, C(CpDAC), C(0)
-	MOVW	$PADDR(L1), R1
-	MCR	CpSC, 0, R1, C(CpTTB), C(0)
-
-	/*
-	 * enable caches, mmu, and high vectors
-	 */
-	MRC	CpSC, 0, R0, C(CpCONTROL), C(0), CpMainctl
-	ORR	$(CpChv|CpCdcache|CpCicache|CpCmmu), R0
-	MCR	CpSC, 0, R0, C(CpCONTROL), C(0), CpMainctl
-	ISB
-
-	/*
-	 * switch SB, SP, and PC into KZERO space
-	 */
-	MOVW	$setR12(SB), R12
-	MOVW	$(MACHADDR+MACHSIZE-4), R13
-	MOVW	$_startpg(SB), R15
-
-TEXT _startpg(SB), 1, $-4
-
-	/*
-	 * enable cycle counter
-	 */
-	MOVW	$1, R1
-	MCR	CpSC, 0, R1, C(CpSPM), C(CpSPMperf), CpSPMctl
-
-	/*
-	 * call main and loop forever if it returns
-	 */
-	BL	,main(SB)
+	BL	,armstart(SB)
 	B	,0(PC)
 
-	BL	_div(SB)		/* hack to load _div, etc. */
-
-TEXT fsrget(SB), 1, $-4				/* data fault status */
-	MRC	CpSC, 0, R0, C(CpFSR), C(0), CpFSRdata
-	RET
-
-TEXT ifsrget(SB), 1, $-4			/* instruction fault status */
-	MRC	CpSC, 0, R0, C(CpFSR), C(0), CpFSRinst
-	RET
-
-TEXT farget(SB), 1, $-4				/* fault address */
-	MRC	CpSC, 0, R0, C(CpFAR), C(0x0)
-	RET
-
-TEXT lcycles(SB), 1, $-4
-	MRC	CpSC, 0, R0, C(CpSPM), C(CpSPMperf), CpSPMcyc
-	RET
-
-TEXT splhi(SB), 1, $-4
-	MOVW	$(MACHADDR+4), R2		/* save caller pc in Mach */
-	MOVW	R14, 0(R2)
-
-	MOVW	CPSR, R0			/* turn off irqs (but not fiqs) */
-	ORR	$(PsrDirq), R0, R1
-	MOVW	R1, CPSR
-	RET
-
-TEXT splfhi(SB), 1, $-4
-	MOVW	$(MACHADDR+4), R2		/* save caller pc in Mach */
-	MOVW	R14, 0(R2)
-
-	MOVW	CPSR, R0			/* turn off irqs and fiqs */
-	ORR	$(PsrDirq|PsrDfiq), R0, R1
-	MOVW	R1, CPSR
-	RET
-
-TEXT splflo(SB), 1, $-4
-	MOVW	CPSR, R0			/* turn on fiqs */
-	BIC	$(PsrDfiq), R0, R1
-	MOVW	R1, CPSR
-	RET
-
-TEXT spllo(SB), 1, $-4
-	MOVW	CPSR, R0			/* turn on irqs and fiqs */
-	BIC	$(PsrDirq|PsrDfiq), R0, R1
-	MOVW	R1, CPSR
-	RET
-
-TEXT splx(SB), 1, $-4
-	MOVW	$(MACHADDR+0x04), R2		/* save caller pc in Mach */
-	MOVW	R14, 0(R2)
-
-	MOVW	R0, R1				/* reset interrupt level */
-	MOVW	CPSR, R0
-	MOVW	R1, CPSR
-	RET
-
-TEXT spldone(SB), 1, $0				/* end marker for devkprof.c */
-	RET
-
-TEXT islo(SB), 1, $-4
-	MOVW	CPSR, R0
-	AND	$(PsrDirq), R0
-	EOR	$(PsrDirq), R0
-	RET
-
-TEXT	tas(SB), $-4
-TEXT	_tas(SB), $-4
-	MOVW	R0,R1
-	MOVW	$1,R0
-	SWPW	R0,(R1)			/* fix: deprecated in armv6 */
-	RET
-
-TEXT setlabel(SB), 1, $-4
-	MOVW	R13, 0(R0)		/* sp */
-	MOVW	R14, 4(R0)		/* pc */
-	MOVW	$0, R0
-	RET
-
-TEXT gotolabel(SB), 1, $-4
-	MOVW	0(R0), R13		/* sp */
-	MOVW	4(R0), R14		/* pc */
-	MOVW	$1, R0
-	RET
-
-TEXT getcallerpc(SB), 1, $-4
-	MOVW	0(R13), R0
-	RET
-
-TEXT idlehands(SB), $-4
-	BARRIERS
-	MOVW	CPSR, R3
-	BIC	$(PsrDirq|PsrDfiq), R3, R1		/* spllo */
-	MOVW	R1, CPSR
-
-	MOVW	$0, R0				/* wait for interrupt */
-	MCR	CpSC, 0, R0, C(CpCACHE), C(CpCACHEintr), CpCACHEwait
-	ISB
-
-	MOVW	R3, CPSR			/* splx */
-	RET
-
-
-TEXT coherence(SB), $-4
-	BARRIERS
-	RET
-
-/*
- * invalidate tlb
- */
-TEXT mmuinvalidate(SB), 1, $-4
-	MOVW	$0, R0
-	MCR	CpSC, 0, R0, C(CpTLB), C(CpTLBinvu), CpTLBinv
-	BARRIERS
-	RET
-
-/*
- * mmuinvalidateaddr(va)
- *   invalidate tlb entry for virtual page address va, ASID 0
- */
-TEXT mmuinvalidateaddr(SB), 1, $-4
-	MCR	CpSC, 0, R0, C(CpTLB), C(CpTLBinvu), CpTLBinvse
-	BARRIERS
-	RET
-
-/*
- * drain write buffer
- * writeback and invalidate data cache
- */
-TEXT cachedwbinv(SB), 1, $-4
-	DSB
-	MOVW	$0, R0
-	MCR	CpSC, 0, R0, C(CpCACHE), C(CpCACHEwbi), CpCACHEall
-	RET
-
-/*
- * cachedwbinvse(va, n)
- *   drain write buffer
- *   writeback and invalidate data cache range [va, va+n)
- */
-TEXT cachedwbinvse(SB), 1, $-4
-	MOVW	R0, R1		/* DSB clears R0 */
-	DSB
-	MOVW	n+4(FP), R2
-	ADD	R1, R2
-	SUB	$1, R2
-	BIC	$(CACHELINESZ-1), R1
-	BIC	$(CACHELINESZ-1), R2
-	MCRR(CpSC, 0, 2, 1, CpCACHERANGEdwbi)
-	RET
-
-/*
- * cachedwbse(va, n)
- *   drain write buffer
- *   writeback data cache range [va, va+n)
- */
-TEXT cachedwbse(SB), 1, $-4
-	MOVW	R0, R1		/* DSB clears R0 */
-	DSB
-	MOVW	n+4(FP), R2
-	ADD	R1, R2
-	BIC	$(CACHELINESZ-1), R1
-	BIC	$(CACHELINESZ-1), R2
-	MCRR(CpSC, 0, 2, 1, CpCACHERANGEdwb)
-	RET
-
-/*
- * drain write buffer and prefetch buffer
- * writeback and invalidate data cache
- * invalidate instruction cache
- */
-TEXT cacheuwbinv(SB), 1, $-4
-	BARRIERS
-	MOVW	$0, R0
-	MCR	CpSC, 0, R0, C(CpCACHE), C(CpCACHEwbi), CpCACHEall
-	MCR	CpSC, 0, R0, C(CpCACHE), C(CpCACHEinvi), CpCACHEall
-	RET
-
-/*
- * invalidate instruction cache
- */
-TEXT cacheiinv(SB), 1, $-4
-	MOVW	$0, R0
-	MCR	CpSC, 0, R0, C(CpCACHE), C(CpCACHEinvi), CpCACHEall
 	RET
--- a/sys/src/9/bcm/lexception.s
+++ b/sys/src/9/bcm/lexception.s
@@ -27,6 +27,7 @@
 	WORD	$_vfiq(SB)		/* FIQ, switch to svc mode */
 
 TEXT _vsvc(SB), 1, $-4			/* SWI */
+	CLREX
 	MOVW.W	R14, -4(R13)		/* ureg->pc = interrupted PC */
 	MOVW	SPSR, R14		/* ureg->psr = SPSR */
 	MOVW.W	R14, -4(R13)		/* ... */
@@ -39,10 +40,17 @@
 
 	MOVW	$setR12(SB), R12	/* Make sure we've got the kernel's SB loaded */
 
-//	MOVW	$(KSEG0+16*KiB-MACHSIZE), R10	/* m */
-	MOVW	$(MACHADDR), R10	/* m */
-	MOVW	8(R10), R9		/* up */
+	/* get R(MACH) for this cpu */
+	CPUID(R1)
+	SLL	$2, R1			/* convert to word index */
+	MOVW	$machaddr(SB), R2
+	ADD	R1, R2
+	MOVW	(R2), R(MACH)		/* m = machaddr[cpuid] */
+	CMP	$0, R(MACH)
+	MOVW.EQ	$MACHADDR, R(MACH)		/* paranoia: use MACHADDR if 0 */
 
+	MOVW	8(R(MACH)), R(USER)		/* up */
+
 	MOVW	R13, R0			/* first arg is pointer to ureg */
 	SUB	$8, R13			/* space for argument+link */
 
@@ -81,6 +89,7 @@
 	 *  we'll switch to SVC mode and then call trap.
 	 */
 _vswitch:
+	CLREX
 	MOVW	SPSR, R1		/* save SPSR for ureg */
 	MOVW	R14, R2			/* save interrupted pc for ureg */
 	MOVW	R13, R3			/* save pointer to where the original [R0-R4] are */
@@ -119,7 +128,16 @@
 
 	BL	trap(SB)
 
+	MOVW	$setR12(SB), R12	/* reload kernel's SB (ORLY?) */
 	ADD	$(4*2+4*15), R13	/* make r13 point to ureg->type */
+	/*
+	 * if we interrupted a previous trap's handler and are now
+	 * returning to it, we need to propagate the current R(MACH) (R10)
+	 * by overriding the saved one on the stack, since we may have
+	 * been rescheduled and be on a different processor now than
+	 * at entry.
+	 */
+	MOVW	R(MACH), (-(15-MACH)*4)(R13) /* restore current cpu's MACH */
 	MOVW	8(R13), R14		/* restore link */
 	MOVW	4(R13), R0		/* restore SPSR */
 	MOVW	R0, SPSR		/* ... */
@@ -140,10 +158,17 @@
 
 	MOVW	$setR12(SB), R12	/* Make sure we've got the kernel's SB loaded */
 
-//	MOVW	$(KSEG0+16*KiB-MACHSIZE), R10	/* m */
-	MOVW	$(MACHADDR), R10	/* m */
-	MOVW	8(R10), R9		/* up */
+	/* get R(MACH) for this cpu */
+	CPUID(R1)
+	SLL	$2, R1			/* convert to word index */
+	MOVW	$machaddr(SB), R2
+	ADD	R1, R2
+	MOVW	(R2), R(MACH)		/* m = machaddr[cpuid] */
+	CMP	$0, R(MACH)
+	MOVW.EQ	$MACHADDR, R(MACH)		/* paranoia: use MACHADDR if 0 */
 
+	MOVW	8(R(MACH)), R(USER)		/* up */
+
 	MOVW	R13, R0			/* first arg is pointer to ureg */
 	SUB	$(4*2), R13		/* space for argument+link (for debugger) */
 
@@ -158,14 +183,24 @@
 	RFE				/* MOVM.IA.S.W (R13), [R15] */
 
 TEXT _vfiq(SB), 1, $-4			/* FIQ */
+	CLREX
 	MOVW	$PsrMfiq, R8		/* trap type */
 	MOVW	SPSR, R9		/* interrupted psr */
 	MOVW	R14, R10		/* interrupted pc */
 	MOVM.DB.W [R8-R10], (R13)	/* save in ureg */
-	MOVM.DB.W.S [R0-R14], (R13)	/* save interrupted regs */
+	MOVM.DB.S [R0-R14], (R13)	/* save interrupted regs */
+	SUB	$(15*4), R13
 	MOVW	$setR12(SB), R12	/* Make sure we've got the kernel's SB loaded */
-	MOVW	$(MACHADDR), R10	/* m */
-	MOVW	8(R10), R9		/* up */
+	/* get R(MACH) for this cpu */
+	CPUID(R1)
+	SLL	$2, R1			/* convert to word index */
+	MOVW	$machaddr(SB), R2
+	ADD	R1, R2
+	MOVW	(R2), R(MACH)		/* m = machaddr[cpuid] */
+	CMP	$0, R(MACH)
+	MOVW.EQ	$MACHADDR, R(MACH)		/* paranoia: use MACHADDR if 0 */
+
+	MOVW	8(R(MACH)), R(USER)		/* up */
 	MOVW	R13, R0			/* first arg is pointer to ureg */
 	SUB	$(4*2), R13		/* space for argument+link (for debugger) */
 
@@ -187,6 +222,7 @@
 
 	MOVW	CPSR, R2
 	BIC	$PsrMask, R2, R3
+	ORR	$(PsrDirq|PsrDfiq), R3
 	ORR	R0, R3
 	MOVW	R3, CPSR		/* switch to new mode */
 
--- a/sys/src/9/bcm/main.c
+++ b/sys/src/9/bcm/main.c
@@ -4,6 +4,7 @@
 #include "mem.h"
 #include "dat.h"
 #include "fns.h"
+#include "io.h"
 
 #include "init.h"
 #include <pool.h>
@@ -191,6 +192,21 @@
 void
 machinit(void)
 {
+	Mach *m0;
+
+	m->ticks = 1;
+	m->perf.period = 1;
+	m0 = MACHP(0);
+	if (m->machno != 0) {
+		/* synchronise with cpu 0 */
+		m->ticks = m0->ticks;
+	}
+}
+
+void
+mach0init(void)
+{
+	m->mmul1 = (PTE*)L1;
 	m->machno = 0;
 	machaddr[m->machno] = m;
 
@@ -197,8 +213,6 @@
 	m->ticks = 1;
 	m->perf.period = 1;
 
-	conf.nmach = 1;
-
 	active.machs[0] = 1;
 	active.exiting = 0;
 
@@ -206,6 +220,32 @@
 }
 
 static void
+launchinit(void)
+{
+	int mach;
+	Mach *mm;
+	PTE *l1;
+
+	for(mach = 1; mach < conf.nmach; mach++){
+		machaddr[mach] = mm = mallocalign(MACHSIZE, MACHSIZE, 0, 0);
+		l1 = mallocalign(L1SIZE, L1SIZE, 0, 0);
+		if(mm == nil || l1 == nil)
+			panic("launchinit");
+		memset(mm, 0, MACHSIZE);
+		mm->machno = mach;
+
+		memmove(l1, m->mmul1, L1SIZE);  /* clone cpu0's l1 table */
+		cachedwbse(l1, L1SIZE);
+		mm->mmul1 = l1;
+		cachedwbse(mm, MACHSIZE);
+
+	}
+	cachedwbse(machaddr, sizeof machaddr);
+	if((mach = startcpus(conf.nmach)) < conf.nmach)
+			print("only %d cpu%s started\n", mach, mach == 1? "" : "s");
+}
+
+static void
 optionsinit(char* s)
 {
 	strecpy(oargb, oargb+sizeof(oargb), s);
@@ -216,29 +256,14 @@
 }
 
 void
-gpiomeminit(void)
-{
-	Physseg seg;
-	memset(&seg, 0, sizeof seg);
-	seg.attr = SG_PHYSICAL;
-	seg.name = "gpio";
-	seg.pa = (VIRTIO+0x200000);
-	seg.size = BY2PG;
-	addphysseg(&seg);
-}
-
-
-void
 main(void)
 {
 	extern char edata[], end[];
-	uint rev;
+	uint fw, board;
 
-	okay(1);
 	m = (Mach*)MACHADDR;
 	memset(edata, 0, end - edata);	/* clear bss */
-	machinit();
-	mmuinit1();
+	mach0init();
 
 	optionsinit("/boot/boot boot");
 	quotefmtinstall();
@@ -250,14 +275,17 @@
 	screeninit();
 
 	print("\nPlan 9 from Bell Labs\n");
-	rev = getfirmware();
-	print("firmware: rev %d\n", rev);
-	if(rev < Minfirmrev){
-		print("Sorry, firmware (start.elf) must be at least rev %d (%s)\n",
-			Minfirmrev, Minfirmdate);
+	board = getboardrev();
+	fw = getfirmware();
+	print("board rev: %#ux firmware rev: %d\n", board, fw);
+	if(fw < Minfirmrev){
+		print("Sorry, firmware (start*.elf) must be at least rev %d"
+		      " or newer than %s\n", Minfirmrev, Minfirmdate);
 		for(;;)
 			;
 	}
+	/* set clock rate to arm_freq from config.txt (default pi1:700Mhz pi2:900MHz) */
+	setclkrate(ClkArm, 0);
 	trapinit();
 	clockinit();
 	printinit();
@@ -264,6 +292,7 @@
 	timersinit();
 	cpuidprint();
 	archreset();
+	vgpinit();
 
 	procinit0();
 	initseg();
@@ -271,7 +300,8 @@
 	chandevreset();			/* most devices are discovered here */
 	pageinit();
 	userinit();
-	gpiomeminit();
+	launchinit();
+	mmuinit1();
 	schedinit();
 	assert(0);			/* shouldn't have returned */
 }
@@ -484,8 +514,7 @@
 	conf.upages = conf.npage - kpages;
 	conf.ialloc = (kpages/2)*BY2PG;
 
-	/* only one processor */
-	conf.nmach = 1;
+	conf.nmach = getncpus();
 
 	/* set up other configuration parameters */
 	conf.nproc = 100 + ((conf.npage*BY2PG)/MB)*5;
@@ -497,7 +526,7 @@
 	conf.nswppo = 4096;
 	conf.nimage = 200;
 
-	conf.copymode = 0;		/* copy on write */
+	conf.copymode = conf.nmach > 1;
 
 	/*
 	 * Guess how much is taken by the large permanent
@@ -529,6 +558,14 @@
 {
 	cpushutdown();
 	splfhi();
+	if(m->machno != 0){
+		void (*f)(ulong, ulong, ulong) = (void*)REBOOTADDR;
+		intrsoff();
+		intrcpushutdown();
+		cacheuwbinv();
+		(*f)(0, 0, 0);
+		for(;;);
+	}
 	archreboot();
 }
 
@@ -536,11 +573,9 @@
  * stub for ../omap/devether.c
  */
 int
-isaconfig(char *class, int ctlrno, ISAConf *isa)
+isaconfig(char *, int, ISAConf *)
 {
-	USED(ctlrno);
-	USED(isa);
-	return strcmp(class, "ether") == 0;
+	return 0;
 }
 
 /*
@@ -553,12 +588,23 @@
 	void (*f)(ulong, ulong, ulong);
 
 	writeconf();
+	if (m->machno != 0) {
+		procwired(up, 0);
+		sched();
+	}
+
+	/* setup reboot trampoline function */
+	f = (void*)REBOOTADDR;
+	memmove(f, rebootcode, sizeof(rebootcode));
+	cachedwbse(f, sizeof(rebootcode));
+
 	cpushutdown();
+	delay(500);
 
+	splfhi();
+
 	/* turn off buffered serial console */
 	serialoq = nil;
-	kprintoq = nil;
-	screenputs = nil;
 
 	/* shutdown devices */
 	chandevshutdown();
@@ -565,23 +611,14 @@
 
 	/* stop the clock (and watchdog if any) */
 	clockshutdown();
-
-	splfhi();
 	intrsoff();
+	intrcpushutdown();
 
-	/* setup reboot trampoline function */
-	f = (void*)REBOOTADDR;
-	memmove(f, rebootcode, sizeof(rebootcode));
 	cacheuwbinv();
+	l2cacheuwbinv();
 
 	/* off we go - never to return */
 	(*f)(PADDR(entry), PADDR(code), size);
-}
-
-int
-cmpswap(long *addr, long old, long new)
-{
-	return cas32(addr, old, new);
 }
 
 void
--- a/sys/src/9/bcm/mem.h
+++ b/sys/src/9/bcm/mem.h
@@ -5,27 +5,31 @@
 #define MiB		1048576u		/* Mebi 0x0000000000100000 */
 #define GiB		1073741824u		/* Gibi 000000000040000000 */
 
-#define HOWMANY(x, y)	(((x)+((y)-1))/(y))
-#define ROUNDUP(x, y)	(HOWMANY((x), (y))*(y))	/* ceiling */
-#define ROUNDDN(x, y)	(((x)/(y))*(y))		/* floor */
-#define MIN(a, b)	((a) < (b)? (a): (b))
-#define MAX(a, b)	((a) > (b)? (a): (b))
-
 /*
  * Sizes
  */
 #define	BY2PG		(4*KiB)			/* bytes per page */
 #define	PGSHIFT		12			/* log(BY2PG) */
+#define	HOWMANY(x,y)	(((x)+((y)-1))/(y))
+#define	ROUNDUP(x,y)	(HOWMANY((x),(y))*(y))
 #define	PGROUND(s)	ROUNDUP(s, BY2PG)
 #define	ROUND(s, sz)	(((s)+(sz-1))&~(sz-1))
 
-#define	MAXMACH		1			/* max # cpus system can run */
+#define	MAXMACH		4			/* max # cpus system can run */
 #define	MACHSIZE	BY2PG
+#define L1SIZE		(4 * BY2PG)
 
 #define KSTKSIZE	(8*KiB)
 #define STACKALIGN(sp)	((sp) & ~3)		/* bug: assure with alloc */
 
 /*
+ * Magic registers
+ */
+
+#define	USER		9		/* R9 is up-> */
+#define	MACH		10		/* R10 is m-> */
+
+/*
  * Address spaces.
  * KTZERO is used by kprof and dumpstack (if any).
  *
@@ -36,8 +40,8 @@
  */
 
 #define	KSEG0		0x80000000		/* kernel segment */
-/* mask to check segment; good for 512MB dram */
-#define	KSEGM		0xE0000000
+/* mask to check segment; good for 1GB dram */
+#define	KSEGM		0xC0000000
 #define	KZERO		KSEG0			/* kernel address space */
 #define CONFADDR	(KZERO+0x100)		/* unparsed plan9.ini */
 #define	MACHADDR	(KZERO+0x2000)		/* Mach structure */
@@ -47,20 +51,24 @@
 #define	L1		(KZERO+0x4000)		/* tt ptes: 16KiB aligned */
 #define	KTZERO		(KZERO+0x8000)		/* kernel text start */
 #define VIRTIO		0x7E000000		/* i/o registers */
-#define	FRAMEBUFFER	0xA0000000		/* video framebuffer */
+#define	ARMLOCAL	(VIRTIO+IOSIZE)		/* armv7 only */
+#define	VGPIO		(ARMLOCAL+MiB)		/* virtual gpio for pi3 ACT LED */
+#define	FRAMEBUFFER	0xC0000000		/* video framebuffer */
 
 #define	UZERO		0			/* user segment */
 #define	UTZERO		(UZERO+BY2PG)		/* user text start */
-#define	USTKTOP		0x20000000		/* user segment end +1 */
+#define	USTKTOP		0x40000000		/* user segment end +1 */
 #define	USTKSIZE	(8*1024*1024)		/* user stack size */
+#define	TSTKTOP		(USTKTOP-USTKSIZE)	/* sysexec temporary stack */
+#define	TSTKSIZ	 	256
 
 /* address at which to copy and execute rebootcode */
-#define	REBOOTADDR	(KZERO+0x3400)
+#define	REBOOTADDR	(KZERO+0x1800)
 
 /*
  * Legacy...
  */
-#define BLOCKALIGN	32			/* only used in allocb.c */
+#define BLOCKALIGN	64			/* only used in allocb.c */
 #define KSTACK		KSTKSIZE
 
 /*
@@ -71,7 +79,6 @@
 #define BY2WD		4
 #define BY2V		8			/* only used in xalloc.c */
 
-#define CACHELINESZ	32
 #define	PTEMAPMEM	(1024*1024)
 #define	PTEPERTAB	(PTEMAPMEM/BY2PG)
 #define	SEGMAPSIZE	1984
@@ -93,8 +100,7 @@
  *	BUS  addresses as seen from the videocore gpu.
  */
 #define	PHYSDRAM	0
-#define BUSDRAM		0x40000000
-#define	DRAMSIZE	(512*MiB)
-#define	PHYSIO		0x20000000
-#define	BUSIO		0x7E000000
 #define	IOSIZE		(16*MiB)
+
+#define MIN(a, b)	((a) < (b)? (a): (b))
+#define MAX(a, b)	((a) > (b)? (a): (b))
--- a/sys/src/9/bcm/mkfile
+++ b/sys/src/9/bcm/mkfile
@@ -1,7 +1,7 @@
-CONF=pif
-CONFLIST=pif picpuf
+CONF=pi2
+CONFLIST=pi pi2
+CRAPLIST=pif picpuf
 EXTRACOPIES=
-#EXTRACOPIES=''piestand lookout boundary # bovril
 
 loadaddr=0x80008000
 
@@ -104,12 +104,13 @@
 arch.$O clock.$O fpiarm.$O main.$O mmu.$O screen.$O syscall.$O trap.$O: \
 	/$objtype/include/ureg.h
 
-archbcm.$O: ../port/flashif.h
 fpi.$O fpiarm.$O fpimem.$O: fpi.h
-l.$O lexception.$O lproc.$O mmu.$O: arm.s mem.h
+l.$O lexception.$O lproc.$O mmu.$O: mem.h
+l.$O lexception.$O lproc.$O armv6.$O armv7.$O: arm.s
+armv7.$O: cache.v7.s
 main.$O: errstr.h init.h reboot.h
 devmouse.$O mouse.$O screen.$O: screen.h
-devusb.$O: ../port/usb.h
+usbdwc.$O: dwcotg.h ../port/usb.h
 
 init.h:D:	../port/initcode.c init9.s
 	$CC ../port/initcode.c
@@ -123,11 +124,12 @@
 reboot.h:D:	rebootcode.s arm.s arm.h mem.h
 	$AS rebootcode.s
 	# -lc is only for memmove.  -T arg is PADDR(REBOOTADDR)
-	$LD -l -s -T0x3400 -R4 -o reboot.out rebootcode.$O -lc
+	$LD -l -s -T0x1800 -R4 -o reboot.out rebootcode.$O -lc
 	{echo 'uchar rebootcode[]={'
 	 xd -1x reboot.out |
 		sed -e '1,2d' -e 's/^[0-9a-f]+ //' -e 's/ ([0-9a-f][0-9a-f])/0x\1,/g'
 	 echo '};'} > reboot.h
+
 errstr.h:D:	../port/mkerrstr ../port/error.h
 	rc ../port/mkerrstr > errstr.h
 
--- a/sys/src/9/bcm/mmu.c
+++ b/sys/src/9/bcm/mmu.c
@@ -9,19 +9,26 @@
 #define FEXT(d, o, w)	(((d)>>(o)) & ((1<<(w))-1))
 #define L1X(va)		FEXT((va), 20, 12)
 #define L2X(va)		FEXT((va), 12, 8)
+#define L2AP(ap)	l2ap(ap)
+#define L1ptedramattrs	soc.l1ptedramattrs
+#define L2ptedramattrs	soc.l2ptedramattrs
 
 enum {
 	L1lo		= UZERO/MiB,		/* L1X(UZERO)? */
 	L1hi		= (USTKTOP+MiB-1)/MiB,	/* L1X(USTKTOP+MiB-1)? */
+	L2size		= 256*sizeof(PTE),
 };
 
+/*
+ * Set up initial PTEs for cpu0 (called with mmu off)
+ */
 void
-mmuinit(void)
+mmuinit(void *a)
 {
 	PTE *l1, *l2;
 	uintptr pa, va;
 
-	l1 = (PTE*)PADDR(L1);
+	l1 = (PTE*)a;
 	l2 = (PTE*)PADDR(L2);
 
 	/*
@@ -28,8 +35,8 @@
 	 * map all of ram at KZERO
 	 */
 	va = KZERO;
-	for(pa = PHYSDRAM; pa < PHYSDRAM+DRAMSIZE; pa += MiB){
-		l1[L1X(va)] = pa|Dom0|L1AP(Krw)|Section|Cached|Buffered;
+	for(pa = PHYSDRAM; pa < PHYSDRAM+soc.dramsize; pa += MiB){
+		l1[L1X(va)] = pa|Dom0|L1AP(Krw)|Section|L1ptedramattrs;
 		va += MiB;
 	}
 
@@ -36,39 +43,41 @@
 	/*
 	 * identity map first MB of ram so mmu can be enabled
 	 */
-	l1[L1X(PHYSDRAM)] = PHYSDRAM|Dom0|L1AP(Krw)|Section|Cached|Buffered;
+	l1[L1X(PHYSDRAM)] = PHYSDRAM|Dom0|L1AP(Krw)|Section|L1ptedramattrs;
 
 	/*
 	 * map i/o registers 
 	 */
 	va = VIRTIO;
-	for(pa = PHYSIO; pa < PHYSIO+IOSIZE; pa += MiB){
+	for(pa = soc.physio; pa < soc.physio+IOSIZE; pa += MiB){
 		l1[L1X(va)] = pa|Dom0|L1AP(Krw)|Section;
 		va += MiB;
 	}
-
+	pa = soc.armlocal;
+	if(pa)
+		l1[L1X(va)] = pa|Dom0|L1AP(Krw)|Section;
+	
 	/*
-	 * double map exception vectors at top of virtual memory
+	 * double map exception vectors near top of virtual memory
 	 */
 	va = HVECTORS;
 	l1[L1X(va)] = (uintptr)l2|Dom0|Coarse;
-	l2[L2X(va)] = PHYSDRAM|L2AP(Krw)|Small;
+	l2[L2X(va)] = PHYSDRAM|L2AP(Krw)|Small|L2ptedramattrs;
 }
 
 void
-mmuinit1(void)
+mmuinit1()
 {
 	PTE *l1;
 
-	l1 = (PTE*)L1;
-	m->mmul1 = l1;
+	l1 = m->mmul1;
 
 	/*
 	 * undo identity map of first MB of ram
 	 */
 	l1[L1X(PHYSDRAM)] = 0;
-	cachedwbse(&l1[L1X(PHYSDRAM)], sizeof(PTE));
-	mmuinvalidate();
+	cachedwbtlb(&l1[L1X(PHYSDRAM)], sizeof(PTE));
+	mmuinvalidateaddr(PHYSDRAM);
 }
 
 static void
@@ -81,10 +90,11 @@
 	l2 = &proc->mmul2;
 	for(page = *l2; page != nil; page = page->next){
 		if(clear)
-			memset(UINT2PTR(page->va), 0, BY2PG);
+			memset(UINT2PTR(page->va), 0, L2size);
 		l1[page->daddr] = Fault;
 		l2 = &page->next;
 	}
+	coherence();
 	*l2 = proc->mmul2cache;
 	proc->mmul2cache = proc->mmul2;
 	proc->mmul2 = nil;
@@ -93,29 +103,24 @@
 static void
 mmul1empty(void)
 {
-#ifdef notdef
-/* there's a bug in here */
 	PTE *l1;
 
 	/* clean out any user mappings still in l1 */
-	if(m->mmul1lo > L1lo){
+	if(m->mmul1lo > 0){
 		if(m->mmul1lo == 1)
 			m->mmul1[L1lo] = Fault;
 		else
 			memset(&m->mmul1[L1lo], 0, m->mmul1lo*sizeof(PTE));
-		m->mmul1lo = L1lo;
+		m->mmul1lo = 0;
 	}
-	if(m->mmul1hi < L1hi){
-		l1 = &m->mmul1[m->mmul1hi];
-		if((L1hi - m->mmul1hi) == 1)
+	if(m->mmul1hi > 0){
+		l1 = &m->mmul1[L1hi - m->mmul1hi];
+		if(m->mmul1hi == 1)
 			*l1 = Fault;
 		else
-			memset(l1, 0, (L1hi - m->mmul1hi)*sizeof(PTE));
-		m->mmul1hi = L1hi;
+			memset(l1, 0, m->mmul1hi*sizeof(PTE));
+		m->mmul1hi = 0;
 	}
-#else
-	memset(&m->mmul1[L1lo], 0, (L1hi - L1lo)*sizeof(PTE));
-#endif /* notdef */
 }
 
 void
@@ -125,15 +130,7 @@
 	PTE *l1;
 	Page *page;
 
-	/* do kprocs get here and if so, do they need to? */
-	if(m->mmupid == proc->pid && !proc->newtlb)
-		return;
-	m->mmupid = proc->pid;
-
-	/* write back dirty and invalidate l1 caches */
-	cacheuwbinv();
-
-	if(proc->newtlb){
+	if(proc != nil && proc->newtlb){
 		mmul2empty(proc, 1);
 		proc->newtlb = 0;
 	}
@@ -142,19 +139,21 @@
 
 	/* move in new map */
 	l1 = m->mmul1;
+	if(proc != nil)
 	for(page = proc->mmul2; page != nil; page = page->next){
 		x = page->daddr;
 		l1[x] = PPN(page->pa)|Dom0|Coarse;
-		/* know here that L1lo < x < L1hi */
-		if(x+1 - m->mmul1lo < m->mmul1hi - x)
-			m->mmul1lo = x+1;
-		else
-			m->mmul1hi = x;
+		if(x >= L1lo + m->mmul1lo && x < L1hi - m->mmul1hi){
+			if(x+1 - L1lo < L1hi - x)
+				m->mmul1lo = x+1 - L1lo;
+			else
+				m->mmul1hi = L1hi - x;
+		}
 	}
 
 	/* make sure map is in memory */
 	/* could be smarter about how much? */
-	cachedwbse(&l1[L1X(UZERO)], (L1hi - L1lo)*sizeof(PTE));
+	cachedwbtlb(&l1[L1X(UZERO)], (L1hi - L1lo)*sizeof(PTE));
 
 	/* lose any possible stale tlb entries */
 	mmuinvalidate();
@@ -176,9 +175,6 @@
 {
 	Page *page, *next;
 
-	/* write back dirty and invalidate l1 caches */
-	cacheuwbinv();
-
 	mmul2empty(proc, 0);
 	for(page = proc->mmul2cache; page != nil; page = next){
 		next = page->next;
@@ -194,7 +190,7 @@
 
 	/* make sure map is in memory */
 	/* could be smarter about how much? */
-	cachedwbse(&m->mmul1[L1X(UZERO)], (L1hi - L1lo)*sizeof(PTE));
+	cachedwbtlb(&m->mmul1[L1X(UZERO)], (L1hi - L1lo)*sizeof(PTE));
 
 	/* lose any possible stale tlb entries */
 	mmuinvalidate();
@@ -203,39 +199,45 @@
 void
 putmmu(uintptr va, uintptr pa, Page* page)
 {
-	int x;
+	int x, s;
 	Page *pg;
 	PTE *l1, *pte;
 
+	/*
+	 * disable interrupts to prevent flushmmu (called from hzclock)
+	 * from clearing page tables while we are setting them
+	 */
+	s = splhi();
 	x = L1X(va);
 	l1 = &m->mmul1[x];
 	if(*l1 == Fault){
-		/* wasteful - l2 pages only have 256 entries - fix */
+		/* l2 pages only have 256 entries - wastes 3K per 1M of address space */
 		if(up->mmul2cache == nil){
-			/* auxpg since we don't need much? memset if so */
+			spllo();
 			pg = newpage(1, 0, 0);
+			splhi();
+			/* if newpage slept, we might be on a different cpu */
+			l1 = &m->mmul1[x];
 			pg->va = VA(kmap(pg));
-		}
-		else{
+		}else{
 			pg = up->mmul2cache;
 			up->mmul2cache = pg->next;
-			memset(UINT2PTR(pg->va), 0, BY2PG);
 		}
 		pg->daddr = x;
 		pg->next = up->mmul2;
 		up->mmul2 = pg;
 
-		/* force l2 page to memory */
-		cachedwbse((void *)pg->va, BY2PG);
+		/* force l2 page to memory (armv6) */
+		cachedwbtlb((void *)pg->va, L2size);
 
 		*l1 = PPN(pg->pa)|Dom0|Coarse;
-		cachedwbse(l1, sizeof *l1);
+		cachedwbtlb(l1, sizeof *l1);
 
-		if(x >= m->mmul1lo && x < m->mmul1hi){
-			if(x+1 - m->mmul1lo < m->mmul1hi - x)
-				m->mmul1lo = x+1;
+		if(x >= L1lo + m->mmul1lo && x < L1hi - m->mmul1hi){
+			if(x+1 - L1lo < L1hi - x)
+				m->mmul1lo = x+1 - L1lo;
 			else
-				m->mmul1hi = x;
+				m->mmul1hi = L1hi - x;
 		}
 	}
 	pte = UINT2PTR(KADDR(PPN(*l1)));
@@ -247,31 +249,53 @@
 	 */
 	x = Small;
 	if(!(pa & PTEUNCACHED))
-		x |= Cached|Buffered;
+		x |= L2ptedramattrs;
 	if(pa & PTEWRITE)
 		x |= L2AP(Urw);
 	else
 		x |= L2AP(Uro);
 	pte[L2X(va)] = PPN(pa)|x;
-	cachedwbse(&pte[L2X(va)], sizeof pte[0]);
+	cachedwbtlb(&pte[L2X(va)], sizeof(PTE));
 
 	/* clear out the current entry */
 	mmuinvalidateaddr(PPN(va));
 
-	/*  write back dirty entries - we need this because the pio() in
-	 *  fault.c is writing via a different virt addr and won't clean
-	 *  its changes out of the dcache.  Page coloring doesn't work
-	 *  on this mmu because the virtual cache is set associative
-	 *  rather than direct mapped.
-	 */
-	cachedwbinv();
-	if(page->txtflush){
-		cacheiinv();
-		page->txtflush = 0;
+	if((page->txtflush & (1<<m->machno)) != 0){
+		/* pio() sets PG_TXTFLUSH whenever a text pg has been written */
+		cachedwbse((void*)(page->pa|KZERO), BY2PG);
+		cacheiinvse((void*)page->va, BY2PG);
+		page->txtflush &= ~(1<<m->machno);
 	}
-	checkmmu(va, PPN(pa));
+	//checkmmu(va, PPN(pa));
+	splx(s);
 }
 
+void*
+mmuuncache(void* v, usize size)
+{
+	int x;
+	PTE *pte;
+	uintptr va;
+
+	/*
+	 * Helper for ucalloc(): strip the dram cache attributes from the
+	 * 1MiB Section that maps v in this cpu's L1 table (m->mmul1).
+	 * Returns v, or nil if v is not currently mapped as a Section.
+	 */
+	va = PTR2UINT(v);
+	assert(!(va & (1*MiB-1)) && size == 1*MiB);	/* exactly one aligned Section */
+
+	x = L1X(va);
+	pte = &m->mmul1[x];
+	if((*pte & (Fine|Section|Coarse)) != Section)
+		return nil;
+	*pte &= ~L1ptedramattrs;
+	cachedwbinvse(pte, sizeof(PTE));	/* push the new PTE out first, as putmmu does... */
+	mmuinvalidateaddr(va);			/* ...then drop the stale tlb entry for va */
+
+	return v;
+}
+
 /*
  * Return the number of bytes that can be accessed via KADDR(pa).
  * If pa is not a valid argument to KADDR, return 0.
@@ -304,15 +328,31 @@
 		*pte++ = (pa+n)|Dom0|L1AP(Krw)|Section;
 		mmuinvalidateaddr(va+n);
 	}
-	cachedwbse(pte0, (uintptr)pte - (uintptr)pte0);
+	cachedwbtlb(pte0, (uintptr)pte - (uintptr)pte0);
 	return va + o;
 }
 
-
 void
 checkmmu(uintptr va, uintptr pa)
 {
-	USED(va);
-	USED(pa);
+	int x;
+	PTE *l1, *pte;
+
+	x = L1X(va);
+	l1 = &m->mmul1[x];		/* this cpu's L1 entry covering va */
+	if(*l1 == Fault){		/* no L2 table installed for va: report and stop */
+		iprint("checkmmu cpu%d va=%lux l1 %p=%ux\n", m->machno, va, l1, *l1);
+		return;
+	}
+	pte = KADDR(PPN(*l1));		/* kernel address of the L2 table named by the L1 entry */
+	pte += L2X(va);			/* step to va's own L2 entry */
+	if(pa == ~0 || (pa != 0 && PPN(*pte) != pa))	/* pa==~0 always prints, pa==0 never does, else print on mismatch */
+		iprint("checkmmu va=%lux pa=%lux l1 %p=%ux pte %p=%ux\n", va, pa, l1, *l1, pte, *pte);
 }
 
+void
+kunmap(KMap *k)
+{
+	USED(k);		/* the KMap argument is deliberately ignored */
+	coherence();		/* the only action taken is this coherence() call */
+}
--- /dev/null
+++ b/sys/src/9/bcm/pi
@@ -1,0 +1,54 @@
+dev
+	root
+	cons
+	swap
+	env
+	pipe
+	proc
+	mnt
+	srv
+	shr
+	dup
+	arch
+	ssl
+	tls
+	cap
+	fs
+	ip		arp chandial ip ipv6 ipaux iproute netlog nullmedium pktmedium ptclbsum inferno
+	draw	screen swcursor
+	mouse	mouse
+	uart	gpio
+	gpio	gpio
+	sd
+	usb
+
+link
+	loopbackmedium
+	ethermedium
+	archbcm
+	usbdwc
+
+ip
+	tcp
+	udp
+	ipifc
+	icmp
+	icmp6
+	ipmux
+
+misc
+	armv6
+	uartmini
+#	sdmmc	emmc
+	dma
+	vcore
+	vfp3	coproc
+
+port
+	int cpuserver = 0;
+
+bootdir
+	/$objtype/bin/paqfs
+	/$objtype/bin/auth/factotum
+	bootfs.paq
+	boot
--- /dev/null
+++ b/sys/src/9/bcm/pi2
@@ -1,0 +1,54 @@
+dev
+	root
+	cons
+	swap
+	env
+	pipe
+	proc
+	mnt
+	srv
+	shr
+	dup
+	arch
+	ssl
+	tls
+	cap
+	fs
+	ip		arp chandial ip ipv6 ipaux iproute netlog nullmedium pktmedium ptclbsum inferno
+	draw	screen swcursor
+	mouse	mouse
+	uart	gpio
+	gpio	gpio
+	sd
+	usb
+
+link
+	loopbackmedium
+	ethermedium
+	archbcm2
+	usbdwc
+
+ip
+	tcp
+	udp
+	ipifc
+	icmp
+	icmp6
+	ipmux
+
+misc
+	armv7
+	uartmini
+	sdmmc	emmc
+	dma
+	vcore
+	vfp3	coproc
+
+port
+	int cpuserver = 0;
+
+bootdir
+	/$objtype/bin/paqfs
+	/$objtype/bin/auth/factotum
+	bootfs.paq
+	boot
--- a/sys/src/9/bcm/pif
+++ /dev/null
@@ -1,56 +1,0 @@
-dev
-	root
-	cons
-	swap
-	env
-	pipe
-	proc
-	mnt
-	srv
-	shr
-	dup
-	arch
-	ssl
-	tls
-	cap
-	fs
-	ip		arp chandial ip ipv6 ipaux iproute netlog nullmedium pktmedium ptclbsum inferno
-	draw	screen swcursor
-	mouse	mouse
-	uart
-	gpio
-	sd
-	usb
-
-link
-	ethermedium
-	loopbackmedium
-	netdevmedium
-	archbcm
-	usbdwc
-
-ip
-	tcp
-	udp
-	ipifc
-	icmp
-	icmp6
-	ipmux
-	il
-
-misc
-	uartmini
-	sdmmc	emmc
-	dma
-	vcore
-	vfp3	coproc
-
-port
-	int cpuserver = 0;
-
-bootdir
-	/$objtype/bin/paqfs
-	/$objtype/bin/auth/factotum
-	bootfs.paq
-	boot
-
--- a/sys/src/9/bcm/rebootcode.s
+++ b/sys/src/9/bcm/rebootcode.s
@@ -1,8 +1,13 @@
 /*
- * armv6 reboot code
+ * armv6/armv7 reboot code
  */
 #include "arm.s"
 
+#define PTEDRAM		(Dom0|L1AP(Krw)|Section)
+
+#define WFI	WORD	$0xe320f003	/* wait for interrupt */
+#define WFE	WORD	$0xe320f002	/* wait for event */
+
 /*
  * Turn off MMU, then copy the new kernel to its correct location
  * in physical memory.  Then jump to the start of the kernel.
@@ -15,7 +20,7 @@
 	/* copy in arguments before stack gets unmapped */
 	MOVW	R0, R8			/* entry point */
 	MOVW	p2+4(FP), R9		/* source */
-	MOVW	n+8(FP), R10		/* byte count */
+	MOVW	n+8(FP), R6		/* byte count */
 
 	/* SVC mode, interrupts disabled */
 	MOVW	$(PsrDirq|PsrDfiq|PsrMsvc), R1
@@ -29,6 +34,28 @@
 	BIC	$CpCmmu, R1
 	MCR	CpSC, 0, R1, C(CpCONTROL), C(0), CpMainctl
 
+	/* continue with reboot only on cpu0 */
+	CPUID(R2)
+	BEQ	bootcpu
+
+	/* other cpus wait for inter processor interrupt from cpu0 */
+	/* turn icache back on */
+	MRC	CpSC, 0, R1, C(CpCONTROL), C(0), CpMainctl
+	ORR	$(CpCicache), R1
+	MCR	CpSC, 0, R1, C(CpCONTROL), C(0), CpMainctl
+	BARRIERS
+dowfi:
+	WFI
+	MOVW	$0x40000060, R1
+	ADD		R2<<2, R1
+	MOVW	0(R1), R0
+	AND		$0x10, R0
+	BEQ		dowfi
+	MOVW	$0x8000, R1
+	BL		(R1)
+	B		dowfi
+
+bootcpu:
 	/* set up a tiny stack for local vars and memmove args */
 	MOVW	R8, SP			/* stack top just before kernel dest */
 	SUB	$20, SP			/* allocate stack frame */
@@ -37,11 +64,12 @@
 	MOVW	R8, 16(SP)		/* save dest (entry point) */
 	MOVW	R8, R0			/* first arg is dest */
 	MOVW	R9, 8(SP)		/* push src */
-	MOVW	R10, 12(SP)		/* push size */
+	MOVW	R6, 12(SP)		/* push size */
 	BL	memmove(SB)
 	MOVW	16(SP), R8		/* restore entry point */
 
 	/* jump to kernel physical entry point */
+	ORR	R8,R8
 	B	(R8)
 	B	0(PC)
 
@@ -51,35 +79,31 @@
  * clobbers R0-R2, and returns with SP invalid.
  */
 TEXT cachesoff(SB), 1, $-4
+	MOVM.DB.W [R14,R1-R10], (R13)		/* save regs on stack */
 
-	/* write back and invalidate caches */
-	BARRIERS
-	MOVW	$0, R0
-	MCR	CpSC, 0, R0, C(CpCACHE), C(CpCACHEwbi), CpCACHEall
-	MCR	CpSC, 0, R0, C(CpCACHE), C(CpCACHEinvi), CpCACHEall
-
-	/* turn caches off */
+	/* turn caches off, invalidate icache */
 	MRC	CpSC, 0, R1, C(CpCONTROL), C(0), CpMainctl
 	BIC	$(CpCdcache|CpCicache|CpCpredict), R1
 	MCR	CpSC, 0, R1, C(CpCONTROL), C(0), CpMainctl
+	MOVW	$0, R0
+	MCR	CpSC, 0, R0, C(CpCACHE), C(CpCACHEinvi), CpCACHEall
 
 	/* invalidate stale TLBs before changing them */
 	BARRIERS
-	MOVW	$KZERO, R0			/* some valid virtual address */
+	MOVW	$0, R0
 	MCR	CpSC, 0, R0, C(CpTLB), C(CpTLBinvu), CpTLBinv
 	BARRIERS
 
-	/* from here on, R0 is base of physical memory */
-	MOVW	$PHYSDRAM, R0
-
 	/* redo double map of first MiB PHYSDRAM = KZERO */
-	MOVW	$(L1+L1X(PHYSDRAM)), R2		/* address of PHYSDRAM's PTE */
+	MOVW	12(R(MACH)), R2		/* m->mmul1 (virtual addr) */
 	MOVW	$PTEDRAM, R1			/* PTE bits */
-	ORR	R0, R1				/* dram base */
 	MOVW	R1, (R2)
+	DSB
+	MCR	CpSC, 0, R2, C(CpCACHE), C(CpCACHEwb), CpCACHEse
 
 	/* invalidate stale TLBs again */
 	BARRIERS
+	MOVW	$0, R0
 	MCR	CpSC, 0, R0, C(CpTLB), C(CpTLBinvu), CpTLBinv
 	BARRIERS
 
@@ -86,8 +110,9 @@
 	/* relocate SB and return address to PHYSDRAM addressing */
 	MOVW	$KSEGM, R1		/* clear segment bits */
 	BIC	R1, R12			/* adjust SB */
-	ORR	R0, R12
+	MOVM.IA.W (R13), [R14,R1-R10]		/* restore regs from stack */
+
+	MOVW	$KSEGM, R1		/* clear segment bits */
 	BIC	R1, R14			/* adjust return address */
-	ORR	R0, R14
 
 	RET
--- a/sys/src/9/bcm/trap.c
+++ b/sys/src/9/bcm/trap.c
@@ -13,6 +13,7 @@
 #include "arm.h"
 
 #define INTREGS		(VIRTIO+0xB200)
+#define	LOCALREGS	(VIRTIO+IOSIZE)
 
 typedef struct Intregs Intregs;
 typedef struct Vctl Vctl;
@@ -20,6 +21,10 @@
 enum {
 	Nvec = 8,		/* # of vectors at start of lexception.s */
 	Fiqenable = 1<<7,
+
+	Localtimerint	= 0x40,
+	Localmboxint	= 0x50,
+	Localintpending	= 0x60,
 };
 
 /*
@@ -46,6 +51,7 @@
 struct Vctl {
 	Vctl	*next;
 	int	irq;
+	int	cpu;
 	u32int	*reg;
 	u32int	mask;
 	void	(*f)(Ureg*, void*);
@@ -52,6 +58,7 @@
 	void	*a;
 };
 
+static Lock vctllock;
 static Vctl *vctl, *vfiq;
 
 static char *trapnames[PsrMask+1] = {
@@ -75,15 +82,17 @@
 {
 	Vpage0 *vpage0;
 
-	/* disable everything */
-	intrsoff();
+	if (m->machno == 0) {
+		/* disable everything */
+		intrsoff();
+		/* set up the exception vectors */
+		vpage0 = (Vpage0*)HVECTORS;
+		memmove(vpage0->vectors, vectors, sizeof(vpage0->vectors));
+		memmove(vpage0->vtable, vtable, sizeof(vpage0->vtable));
+		cacheuwbinv();
+		l2cacheuwbinv();
+	}
 
-	/* set up the exception vectors */
-	vpage0 = (Vpage0*)HVECTORS;
-	memmove(vpage0->vectors, vectors, sizeof(vpage0->vectors));
-	memmove(vpage0->vtable, vtable, sizeof(vpage0->vtable));
-	cacheuwbinv();
-
 	/* set up the stacks for the interrupt modes */
 	setr13(PsrMfiq, (u32int*)(FIQSTKTOP));
 	setr13(PsrMirq, m->sirq);
@@ -95,6 +104,21 @@
 }
 
 void
+intrcpushutdown(void)
+{
+	u32int *enable;
+
+	if(soc.armlocal == 0)		/* soc has no arm local register block: nothing to do */
+		return;
+	enable = (u32int*)(LOCALREGS + Localtimerint) + m->machno;	/* this cpu's word in the Localtimerint array */
+	*enable = 0;			/* clear every bit in that word */
+	if(m->machno){			/* cpus other than cpu0 only */
+		enable = (u32int*)(LOCALREGS + Localmboxint) + m->machno;	/* this cpu's word in the Localmboxint array */
+		*enable = 1;		/* leave only bit 0 set; presumably the mailbox 0 interrupt that rebootcode.s waits on - confirm */
+	}
+}
+
+void
 intrsoff(void)
 {
 	Intregs *ip;
@@ -120,11 +144,11 @@
 
 	clockintr = 0;
 	for(v = vctl; v; v = v->next)
-		if(*v->reg & v->mask){
+		if(v->cpu == m->machno && (*v->reg & v->mask) != 0){
 			coherence();
 			v->f(ureg, v->a);
 			coherence();
-			if(v->irq == IRQclock)
+			if(v->irq == IRQclock || v->irq == IRQcntps || v->irq == IRQcntpns)
 				clockintr = 1;
 		}
 	return clockintr;
@@ -140,7 +164,7 @@
 
 	v = vfiq;
 	if(v == nil)
-		panic("unexpected item in bagging area");
+		panic("cpu%d: unexpected item in bagging area", m->machno);
 	m->intr++;
 	ureg->pc -= 4;
 	coherence();
@@ -160,7 +184,16 @@
 	if(v == nil)
 		panic("irqenable: no mem");
 	v->irq = irq;
-	if(irq >= IRQbasic){
+	v->cpu = 0;
+	if(irq >= IRQlocal){
+		v->reg = (u32int*)(LOCALREGS + Localintpending) + m->machno;
+		if(irq >= IRQmbox0)
+			enable = (u32int*)(LOCALREGS + Localmboxint) + m->machno;
+		else
+			enable = (u32int*)(LOCALREGS + Localtimerint) + m->machno;
+		v->mask = 1 << (irq - IRQlocal);
+		v->cpu = m->machno;
+	}else if(irq >= IRQbasic){
 		enable = &ip->ARMenable;
 		v->reg = &ip->ARMpending;
 		v->mask = 1 << (irq - IRQbasic);
@@ -171,6 +204,7 @@
 	}
 	v->f = f;
 	v->a = a;
+	lock(&vctllock);
 	if(irq == IRQfiq){
 		assert((ip->FIQctl & Fiqenable) == 0);
 		assert((*enable & v->mask) == 0);
@@ -179,8 +213,15 @@
 	}else{
 		v->next = vctl;
 		vctl = v;
-		*enable = v->mask;
+		if(irq >= IRQmbox0){
+			if(irq <= IRQmbox3)
+				*enable |= 1 << (irq - IRQmbox0);
+		}else if(irq >= IRQlocal)
+			*enable |= 1 << (irq - IRQlocal);
+		else
+			*enable = v->mask;
 	}
+	unlock(&vctllock);
 }
 
 static char *
--- a/sys/src/9/bcm/uartmini.c
+++ b/sys/src/9/bcm/uartmini.c
@@ -10,35 +10,11 @@
 #include "fns.h"
 #include "io.h"
 
-#define GPIOREGS	(VIRTIO+0x200000)
 #define AUXREGS		(VIRTIO+0x215000)
 #define	OkLed		16
 #define	TxPin		14
 #define	RxPin		15
 
-/* GPIO regs */
-enum {
-	Fsel0	= 0x00>>2,
-		FuncMask= 0x7,
-		Input	= 0x0,
-		Output	= 0x1,
-		Alt0	= 0x4,
-		Alt1	= 0x5,
-		Alt2	= 0x6,
-		Alt3	= 0x7,
-		Alt4	= 0x3,
-		Alt5	= 0x2,
-	Set0	= 0x1c>>2,
-	Clr0	= 0x28>>2,
-	Lev0	= 0x34>>2,
-	PUD	= 0x94>>2,
-		Off	= 0x0,
-		Pulldown= 0x1,
-		Pullup	= 0x2,
-	PUDclk0	= 0x98>>2,
-	PUDclk1	= 0x9c>>2,
-};
-
 /* AUX regs */
 enum {
 	Irq	= 0x00>>2,
@@ -73,57 +49,12 @@
 	.regs	= (u32int*)AUXREGS,
 	.name	= "uart0",
 	.freq	= 250000000,
+	.baud	= 115200,
 	.phys	= &miniphysuart,
 };
 
-void
-gpiosel(uint pin, int func)
-{	
-	u32int *gp, *fsel;
-	int off;
+static int baud(Uart*, int);
 
-	gp = (u32int*)GPIOREGS;
-	fsel = &gp[Fsel0 + pin/10];
-	off = (pin % 10) * 3;
-	*fsel = (*fsel & ~(FuncMask << off)) | func << off;
-}
-
-void
-gpiopulloff(uint pin)
-{
-	u32int *gp, *reg;
-	u32int mask;
-
-	gp = (u32int*)GPIOREGS;
-	reg = &gp[PUDclk0 + pin/32];
-	mask = 1 << (pin % 32);
-	gp[PUD] = Off;
-	microdelay(1);
-	*reg = mask;
-	microdelay(1);
-	*reg = 0;
-}
-
-void
-gpioout(uint pin, int set)
-{
-	u32int *gp;
-	int v;
-
-	gp = (u32int*)GPIOREGS;
-	v = set? Set0: Clr0;
-	gp[v + pin/32] = 1 << (pin % 32);
-}
-
-int
-gpioin(uint pin)
-{
-	u32int *gp;
-
-	gp = (u32int*)GPIOREGS;
-	return (gp[Lev0 + pin/32] & (1 << (pin % 32))) != 0;
-}
-
 static void
 interrupt(Ureg*, void *arg)
 {
@@ -162,10 +93,12 @@
 	gpiosel(TxPin, Alt5);
 	gpiosel(RxPin, Alt5);
 	gpiopulloff(TxPin);
-	gpiopulloff(RxPin);
+	gpiopullup(RxPin);
 	ap[Enables] |= UartEn;
 	ap[MuIir] = 6;
+	ap[MuLcr] = Bits8;
 	ap[MuCntl] = TxEn|RxEn;
+	baud(uart, uart->baud);
 	if(ie){
 		intrenable(IRQaux, interrupt, uart, 0, "uart");
 		ap[MuIer] = RxIen|TxIen;
@@ -370,13 +303,12 @@
 		break;
 	}
 
-	uartctl(uart, "b9600 l8 pn s1");
+	if(!uart->enabled)
+		(*uart->phys->enable)(uart, 0);
+	uartctl(uart, "l8 pn s1");
 	if(*cmd != '\0')
 		uartctl(uart, cmd);
 
-	if(!uart->enabled)
-		(*uart->phys->enable)(uart, 0);
-
 	consuart = uart;
 	uart->console = 1;
 }
@@ -405,8 +337,26 @@
 okay(int on)
 {
 	static int first;
+	static int okled, polarity;
+	char *p;
 
-	if(!first++)
-		gpiosel(OkLed, Output);
-	gpioout(OkLed, !on);
+	if(!first++){
+		p = getconf("bcm2709.disk_led_gpio");
+		if(p == nil)
+			p = getconf("bcm2708.disk_led_gpio");
+		if(p != nil)
+			okled = strtol(p, 0, 0);
+		else
+			okled = 'v';
+		p = getconf("bcm2709.disk_led_active_low");
+		if(p == nil)
+			p = getconf("bcm2708.disk_led_active_low");
+		polarity = (p == nil || *p == '1');
+		if(okled != 'v')
+			gpiosel(okled, Output);
+	}
+	if(okled == 'v')
+		vgpset(0, on);
+	else if(okled != 0)
+		gpioout(okled, on^polarity);
 }
--- a/sys/src/9/bcm/usbdwc.c
+++ b/sys/src/9/bcm/usbdwc.c
@@ -33,6 +33,18 @@
 
 	Read		= 0,
 	Write		= 1,
+
+	/*
+	 * Workaround for an unexplained glitch where an Ack interrupt
+	 * is received without Chhltd, whereupon all channels remain
+	 * permanently busy and can't be halted.  This was only seen
+	 * when the controller is reading a sequence of bulk input
+	 * packets in DMA mode.  Setting Slowbulkin=1 will avoid the
+	 * lockup by reading packets individually with an interrupt
+	 * after each.  More recent chips don't seem to exhibit the
+	 * problem, so it's probably safe to leave this off now.
+	 */
+	Slowbulkin	= 0,
 };
 
 typedef struct Ctlr Ctlr;
@@ -39,10 +51,11 @@
 typedef struct Epio Epio;
 
 struct Ctlr {
+	Lock;
 	Dwcregs	*regs;		/* controller registers */
 	int	nchan;		/* number of host channels */
 	ulong	chanbusy;	/* bitmap of in-use channels */
-	QLock	chanlock;	/* serialise access to chanbusy */
+	Lock	chanlock;	/* serialise access to chanbusy */
 	QLock	split;		/* serialise split transactions */
 	int	splitretry;	/* count retries of Nyet */
 	int	sofchan;	/* bitmap of channels waiting for sof */
@@ -52,7 +65,11 @@
 };
 
 struct Epio {
-	QLock;
+	union {
+		QLock	rlock;
+		QLock	ctllock;
+	};
+	QLock	wlock;
 	Block	*cb;
 	ulong	lastpoll;
 };
@@ -61,29 +78,48 @@
 static int debug;
 
 static char Ebadlen[] = "bad usb request length";
-static char Enotconfig[] = "usb endpoint not configured";
 
 static void clog(Ep *ep, Hostchan *hc);
 static void logdump(Ep *ep);
 
+static void
+filock(Lock *l)
+{
+	int x;
+
+	x = splfhi();		/* raise priority with splfhi first (presumably masks fiq as well as irq - confirm); x is the prior level */
+	ilock(l);		/* then take the lock itself */
+	l->sr = x;		/* store the pre-splfhi level in the lock, presumably so iunlock restores it - confirm */
+}
+
+static void
+fiunlock(Lock *l)
+{
+	iunlock(l);		/* release counterpart of filock; a plain iunlock is all that is done here */
+}
+
 static Hostchan*
 chanalloc(Ep *ep)
 {
 	Ctlr *ctlr;
 	int bitmap, i;
+	static int first;
 
 	ctlr = ep->hp->aux;
-	qlock(&ctlr->chanlock);
+retry:
+	lock(&ctlr->chanlock);
 	bitmap = ctlr->chanbusy;
 	for(i = 0; i < ctlr->nchan; i++)
 		if((bitmap & (1<<i)) == 0){
 			ctlr->chanbusy = bitmap | 1<<i;
-			qunlock(&ctlr->chanlock);
+			unlock(&ctlr->chanlock);
 			return &ctlr->regs->hchan[i];
 		}
-	qunlock(&ctlr->chanlock);
-	panic("miller is a lazy git");
-	return nil;
+	unlock(&ctlr->chanlock);
+	if(!first++)
+		print("usbdwc: all host channels busy - retrying\n");
+	tsleep(&up->sleep, return0, 0, 1);
+	goto retry;
 }
 
 static void
@@ -94,9 +130,9 @@
 
 	ctlr = ep->hp->aux;
 	i = chan - ctlr->regs->hchan;
-	qlock(&ctlr->chanlock);
+	lock(&ctlr->chanlock);
 	ctlr->chanbusy &= ~(1<<i);
-	qunlock(&ctlr->chanlock);
+	unlock(&ctlr->chanlock);
 }
 
 static void
@@ -158,7 +194,7 @@
 	Dwcregs *r;
 
 	r = a;
-	return r->gintsts & Sofintr;
+	return (r->gintmsk & Sofintr) == 0;
 }
 
 static void
@@ -165,16 +201,15 @@
 sofwait(Ctlr *ctlr, int n)
 {
 	Dwcregs *r;
-	int x;
 
 	r = ctlr->regs;
 	do{
+		filock(ctlr);
 		r->gintsts = Sofintr;
-		x = splfhi();
 		ctlr->sofchan |= 1<<n;
 		r->gintmsk |= Sofintr;
+		fiunlock(ctlr);
 		sleep(&ctlr->chanintr[n], sofdone, r);
-		splx(x);
 	}while((r->hfnum & 7) == 6);
 }
 
@@ -192,7 +227,7 @@
 static int
 chanwait(Ep *ep, Ctlr *ctlr, Hostchan *hc, int mask)
 {
-	int intr, n, x, ointr;
+	int intr, n, ointr;
 	ulong start, now;
 	Dwcregs *r;
 
@@ -200,13 +235,14 @@
 	n = hc - r->hchan;
 	for(;;){
 restart:
-		x = splfhi();
+		filock(ctlr);
 		r->haintmsk |= 1<<n;
 		hc->hcintmsk = mask;
-		sleep(&ctlr->chanintr[n], chandone, hc);
+		fiunlock(ctlr);
+		tsleep(&ctlr->chanintr[n], chandone, hc, 1000);
+		if((intr = hc->hcint) == 0)
+			goto restart;
 		hc->hcintmsk = 0;
-		splx(x);
-		intr = hc->hcint;
 		if(intr & Chhltd)
 			return intr;
 		start = fastticks(0);
@@ -218,13 +254,14 @@
 				if((ointr != Ack && ointr != (Ack|Xfercomp)) ||
 				   intr != (Ack|Chhltd|Xfercomp) ||
 				   (now - start) > 60)
-					dprint("await %x after %ld %x -> %x\n",
+					dprint("await %x after %ldµs %x -> %x\n",
 						mask, now - start, ointr, intr);
 				return intr;
 			}
 			if((intr & mask) == 0){
-				dprint("ep%d.%d await %x intr %x -> %x\n",
-					ep->dev->nb, ep->nb, mask, ointr, intr);
+				if(intr != Nak)
+					dprint("ep%d.%d await %x after %ldµs intr %x -> %x\n",
+						ep->dev->nb, ep->nb, mask, now - start, ointr, intr);
 				goto restart;
 			}
 			now = fastticks(0);
@@ -254,6 +291,8 @@
 	int i;
 
 	hc = &ctlr->regs->hchan[n];
+	if((hc->hcint & hc->hcintmsk) == 0)
+		return 1;
 	if(ctlr->debugchan & (1<<n))
 		clog(nil, hc);
 	if((hc->hcsplt & Spltena) == 0)
@@ -347,7 +386,7 @@
 	else
 		n = len;
 	hc->hctsiz = n | npkt<<OPktcnt | pid;
-	hc->hcdma  = PADDR(a);
+	hc->hcdma  = dmaaddr(a);
 
 	nleft = len;
 	logstart(ep);
@@ -378,13 +417,19 @@
 		}
 		hc->hcchar = (hc->hcchar &~ Chdis) | Chen;
 		clog(ep, hc);
+wait:
 		if(ep->ttype == Tbulk && dir == Epin)
-			i = chanwait(ep, ctlr, hc, /* Ack| */ Chhltd);
+			i = chanwait(ep, ctlr, hc, Chhltd);
 		else if(ep->ttype == Tintr && (hc->hcsplt & Spltena))
 			i = chanwait(ep, ctlr, hc, Chhltd);
 		else
 			i = chanwait(ep, ctlr, hc, Chhltd|Nak);
 		clog(ep, hc);
+		if(hc->hcint != i){
+			dprint("chanwait intr %ux->%ux\n", i, hc->hcint);
+			if((i = hc->hcint) == 0)
+				goto wait;
+		}
 		hc->hcint = i;
 
 		if(hc->hcsplt & Spltena){
@@ -405,12 +450,12 @@
 				continue;
 			}
 			logdump(ep);
-			print("usbotg: ep%d.%d error intr %8.8ux\n",
+			print("usbdwc: ep%d.%d error intr %8.8ux\n",
 				ep->dev->nb, ep->nb, i);
 			if(i & ~(Chhltd|Ack))
 				error(Eio);
 			if(hc->hcdma != hcdma)
-				print("usbotg: weird hcdma %x->%x intr %x->%x\n",
+				print("usbdwc: weird hcdma %ux->%ux intr %ux->%ux\n",
 					hcdma, hc->hcdma, i, hc->hcint);
 		}
 		n = hc->hcdma - hcdma;
@@ -420,13 +465,13 @@
 			else
 				continue;
 		}
-		if(dir == Epin && ep->ttype == Tbulk && n == nleft){
+		if(dir == Epin && ep->ttype == Tbulk){
 			nt = (hctsiz & Xfersize) - (hc->hctsiz & Xfersize);
 			if(nt != n){
 				if(n == ROUND(nt, 4))
 					n = nt;
 				else
-					print("usbotg: intr %8.8ux "
+					print("usbdwc: intr %8.8ux "
 						"dma %8.8ux-%8.8ux "
 						"hctsiz %8.8ux-%8.ux\n",
 						i, hcdma, hc->hcdma, hctsiz,
@@ -491,7 +536,7 @@
 		nexterror();
 	}
 	chansetup(hc, ep);
-	if(rw == Read && ep->ttype == Tbulk)
+	if(Slowbulkin && rw == Read && ep->ttype == Tbulk)
 		n = multitrans(ep, hc, rw, a, n);
 	else{
 		n = chanio(ep, hc, rw == Read? Epin : Epout, ep->toggle[rw],
@@ -524,8 +569,8 @@
 		if(datalen <= 0 || datalen > Maxctllen)
 			error(Ebadlen);
 		/* XXX cache madness */
-		epio->cb = b = allocb(ROUND(datalen, ep->maxpkt) + CACHELINESZ);
-		b->wp = (uchar*)ROUND((uintptr)b->wp, CACHELINESZ);
+		epio->cb = b = allocb(ROUND(datalen, ep->maxpkt));
+		assert(((uintptr)b->wp & (BLOCKALIGN-1)) == 0);
 		memset(b->wp, 0x55, b->lim - b->wp);
 		cachedwbinvse(b->wp, b->lim - b->wp);
 		data = b->wp;
@@ -550,6 +595,7 @@
 		}else
 			b->wp += chanio(ep, hc, Epin, DATA1, data, datalen);
 		chanio(ep, hc, Epout, DATA1, nil, 0);
+		cachedinvse(b->rp, BLEN(b));
 		n = Rsetuplen;
 	}else{
 		if(datalen > 0)
@@ -627,7 +673,7 @@
 	greset(r, Rxfflsh);
 	r->grstctl = TXF_ALL;
 	greset(r, Txfflsh);
-	dprint("usbotg: FIFO depth %d sizes rx/nptx/ptx %8.8ux %8.8ux %8.8ux\n",
+	dprint("usbdwc: FIFO depth %d sizes rx/nptx/ptx %8.8ux %8.8ux %8.8ux\n",
 		n, r->grxfsiz, r->gnptxfsiz, r->hptxfsiz);
 
 	r->hport0 = Prtpwr|Prtconndet|Prtenchng|Prtovrcurrchng;
@@ -654,6 +700,7 @@
 	ctlr = hp->aux;
 	r = ctlr->regs;
 	wakechan = 0;
+	filock(ctlr);
 	intr = r->gintsts;
 	if(intr & Hcintr){
 		haint = r->haint & r->haintmsk;
@@ -679,6 +726,7 @@
 		ctlr->wakechan |= wakechan;
 		armtimerset(1);
 	}
+	fiunlock(ctlr);
 }
 
 static void
@@ -686,14 +734,14 @@
 {
 	Ctlr *ctlr;
 	uint wakechan;
-	int i, x;
+	int i;
 
 	ctlr = a;
-	x = splfhi();
+	filock(ctlr);
 	armtimerset(0);
 	wakechan = ctlr->wakechan;
 	ctlr->wakechan = 0;
-	splx(x);
+	fiunlock(ctlr);
 	for(i = 0; wakechan; i++){
 		if(wakechan & 1)
 			wakeup(&ctlr->chanintr[i]);
@@ -704,11 +752,12 @@
 static void
 epopen(Ep *ep)
 {
-	ddprint("usbotg: epopen ep%d.%d ttype %d\n",
+	ddprint("usbdwc: epopen ep%d.%d ttype %d\n",
 		ep->dev->nb, ep->nb, ep->ttype);
 	switch(ep->ttype){
-	case Tnone:
-		error(Enotconfig);
+	default:
+		error("endpoint type not supported");
+		return;
 	case Tintr:
 		assert(ep->pollival > 0);
 		/* fall through */
@@ -717,6 +766,8 @@
 			ep->toggle[Read] = DATA0;
 		if(ep->toggle[Write] == 0)
 			ep->toggle[Write] = DATA0;
+		/* fall through */
+	case Tctl:
 		break;
 	}
 	ep->aux = malloc(sizeof(Epio));
@@ -727,7 +778,7 @@
 static void
 epclose(Ep *ep)
 {
-	ddprint("usbotg: epclose ep%d.%d ttype %d\n",
+	ddprint("usbdwc: epclose ep%d.%d ttype %d\n",
 		ep->dev->nb, ep->nb, ep->ttype);
 	switch(ep->ttype){
 	case Tctl:
@@ -743,6 +794,7 @@
 epread(Ep *ep, void *a, long n)
 {
 	Epio *epio;
+	QLock *q;
 	Block *b;
 	uchar *p;
 	ulong elapsed;
@@ -750,10 +802,11 @@
 
 	ddprint("epread ep%d.%d %ld\n", ep->dev->nb, ep->nb, n);
 	epio = ep->aux;
+	q = ep->ttype == Tctl? &epio->ctllock : &epio->rlock;
 	b = nil;
-	qlock(epio);
+	qlock(q);
 	if(waserror()){
-		qunlock(epio);
+		qunlock(q);
 		if(b)
 			freeb(b);
 		nexterror();
@@ -763,7 +816,7 @@
 		error(Egreg);
 	case Tctl:
 		nr = ctldata(ep, a, n);
-		qunlock(epio);
+		qunlock(q);
 		poperror();
 		return nr;
 	case Tintr:
@@ -773,13 +826,15 @@
 		/* fall through */
 	case Tbulk:
 		/* XXX cache madness */
-		b = allocb(ROUND(n, ep->maxpkt) + CACHELINESZ);
-		p = (uchar*)ROUND((uintptr)b->base, CACHELINESZ);
-		cachedwbinvse(p, n);
+		b = allocb(ROUND(n, ep->maxpkt));
+		p = b->rp;
+		assert(((uintptr)p & (BLOCKALIGN-1)) == 0);
+		cachedinvse(p, n);
 		nr = eptrans(ep, Read, p, n);
+		cachedinvse(p, nr);
 		epio->lastpoll = TK2MS(m->ticks);
 		memmove(a, p, nr);
-		qunlock(epio);
+		qunlock(q);
 		freeb(b);
 		poperror();
 		return nr;
@@ -790,6 +845,7 @@
 epwrite(Ep *ep, void *a, long n)
 {
 	Epio *epio;
+	QLock *q;
 	Block *b;
 	uchar *p;
 	ulong elapsed;
@@ -796,10 +852,11 @@
 
 	ddprint("epwrite ep%d.%d %ld\n", ep->dev->nb, ep->nb, n);
 	epio = ep->aux;
+	q = ep->ttype == Tctl? &epio->ctllock : &epio->wlock;
 	b = nil;
-	qlock(epio);
+	qlock(q);
 	if(waserror()){
-		qunlock(epio);
+		qunlock(q);
 		if(b)
 			freeb(b);
 		nexterror();
@@ -815,8 +872,9 @@
 	case Tctl:
 	case Tbulk:
 		/* XXX cache madness */
-		b = allocb(n + CACHELINESZ);
-		p = (uchar*)ROUND((uintptr)b->base, CACHELINESZ);
+		b = allocb(n);
+		p = b->wp;
+		assert(((uintptr)p & (BLOCKALIGN-1)) == 0);
 		memmove(p, a, n);
 		cachedwbse(p, n);
 		if(ep->ttype == Tctl)
@@ -825,7 +883,7 @@
 			n = eptrans(ep, Write, p, n);
 			epio->lastpoll = TK2MS(m->ticks);
 		}
-		qunlock(epio);
+		qunlock(q);
 		freeb(b);
 		poperror();
 		return n;
@@ -847,11 +905,11 @@
 	assert(port == 1);
 	ctlr = hp->aux;
 	r = ctlr->regs;
-	dprint("usbotg enable=%d; sts %#x\n", on, r->hport0);
+	dprint("usbdwc enable=%d; sts %#x\n", on, r->hport0);
 	if(!on)
 		r->hport0 = Prtpwr | Prtena;
 	tsleep(&up->sleep, return0, 0, Enabledelay);
-	dprint("usbotg enable=%d; sts %#x\n", on, r->hport0);
+	dprint("usbdwc enable=%d; sts %#x\n", on, r->hport0);
 	return 0;
 }
 
@@ -865,7 +923,7 @@
 	assert(port == 1);
 	ctlr = hp->aux;
 	r = ctlr->regs;
-	dprint("usbotg reset=%d; sts %#x\n", on, r->hport0);
+	dprint("usbdwc reset=%d; sts %#x\n", on, r->hport0);
 	if(!on)
 		return 0;
 	r->hport0 = Prtpwr | Prtrst;
@@ -876,9 +934,9 @@
 	b = s & (Prtconndet|Prtenchng|Prtovrcurrchng);
 	if(b != 0)
 		r->hport0 = Prtpwr | b;
-	dprint("usbotg reset=%d; sts %#x\n", on, s);
+	dprint("usbdwc reset=%d; sts %#x\n", on, s);
 	if((s & Prtena) == 0)
-		print("usbotg: host port not enabled after reset");
+		print("usbdwc: host port not enabled after reset");
 	return 0;
 }
 
@@ -948,7 +1006,7 @@
 	id = ctlr->regs->gsnpsid;
 	if((id>>16) != ('O'<<8 | 'T'))
 		return -1;
-	dprint("usbotg: rev %d.%3.3x\n", (id>>12)&0xF, id&0xFFF);
+	dprint("usbdwc: rev %d.%3.3x\n", (id>>12)&0xF, id&0xFFF);
 
 	intrenable(IRQtimerArm, irqintr, ctlr, 0, "dwc");
 
--- a/sys/src/9/bcm/vcore.c
+++ b/sys/src/9/bcm/vcore.c
@@ -12,6 +12,7 @@
 
 typedef struct Prophdr Prophdr;
 typedef struct Fbinfo Fbinfo;
+typedef struct Vgpio Vgpio;
 
 enum {
 	Read		= 0x00>>2,
@@ -33,7 +34,7 @@
 	TagResp		= 1<<31,
 
 	TagGetfwrev	= 0x00000001,
-	TagGetbrdrev	= 0x00010002,
+	TagGetrev	= 0x00010002,
 	TagGetmac	= 0x00010003,
 	TagGetram	= 0x00010005,
 	TagGetpower	= 0x00020001,
@@ -40,6 +41,9 @@
 	TagSetpower	= 0x00028001,
 		Powerwait	= 1<<1,
 	TagGetclkspd= 0x00030002,
+	TagGetclkmax= 0x00030004,
+	TagSetclkspd= 0x00038002,
+	TagGettemp	= 0x00030006,
 	TagFballoc	= 0x00040001,
 	TagFbfree	= 0x00048001,
 	TagFbblank	= 0x00040002,
@@ -49,8 +53,11 @@
 	TagSetvres	= 0x00048004,
 	TagGetdepth	= 0x00040005,
 	TagSetdepth	= 0x00048005,
-	TagGetrgb	= 0x00044006,
+	TagGetrgb	= 0x00040006,
 	TagSetrgb	= 0x00048006,
+	TagGetGpio	= 0x00040010,
+
+	Nvgpio		= 2,
 };
 
 struct Fbinfo {
@@ -76,6 +83,15 @@
 	u32int	data[1];
 };
 
+struct Vgpio {
+	u32int	*counts;
+	u16int	incs;
+	u16int	decs;
+	int	ison;
+};
+
+static Vgpio vgpio;
+
 static void
 vcwrite(uint chan, int val)
 {
@@ -115,7 +131,8 @@
 	uintptr r;
 	int n;
 	Prophdr *prop;
-	static uintptr base = BUSDRAM;
+	uintptr aprop;
+	static int busaddr = 1;
 
 	if(rsplen < vallen)
 		rsplen = vallen;
@@ -132,15 +149,18 @@
 		memmove(prop->data, buf, vallen);
 	cachedwbinvse(prop, prop->len);
 	for(;;){
-		vcwrite(ChanProps, PADDR(prop) + base);
+		aprop = busaddr? dmaaddr(prop) : PTR2UINT(prop);
+		vcwrite(ChanProps, aprop);
 		r = vcread(ChanProps);
-		if(r == PADDR(prop) + base)
+		if(r == aprop)
 			break;
-		if(base == 0)
+		if(!busaddr)
 			return -1;
-		base = 0;
+		busaddr = 0;
 	}
-	if(prop->req == RspOk && prop->tag == tag && prop->taglen & TagResp) {
+	if(prop->req == RspOk &&
+	   prop->tag == tag &&
+	   (prop->taglen&TagResp)) {
 		if((n = prop->taglen & ~TagResp) < rsplen)
 			rsplen = n;
 		memmove(buf, prop->data, rsplen);
@@ -158,6 +178,7 @@
 fbdefault(int *width, int *height, int *depth)
 {
 	u32int buf[3];
+	char *p;
 
 	if(vcreq(TagGetres, &buf[0], 0, 2*4) != 2*4 ||
 	   vcreq(TagGetdepth, &buf[2], 0, 4) != 4)
@@ -164,7 +185,10 @@
 		return -1;
 	*width = buf[0];
 	*height = buf[1];
-	*depth = buf[2];
+	if((p = getconf("bcm2708_fb.fbdepth")) != nil)
+		*depth = atoi(p);
+	else
+		*depth = buf[2];
 	return 0;
 }
 
@@ -184,7 +208,7 @@
 	fi->yres = fi->yresvirtual = *height;
 	fi->bpp = *depth;
 	cachedwbinvse(fi, sizeof(*fi));
-	vcwrite(ChanFb, DMAADDR(fi));
+	vcwrite(ChanFb, dmaaddr(fi));
 	if(vcread(ChanFb) != 0)
 		return 0;
 	va = mmukmap(FRAMEBUFFER, PADDR(fi->base), fi->screensize);
@@ -213,7 +237,7 @@
 	u32int buf[2];
 
 	buf[0] = dev;
-	buf[1] = Powerwait | (on? 1: 0);
+	buf[1] = Powerwait | (on? 1 : 0);
 	vcreq(TagSetpower, buf, sizeof buf, sizeof buf);
 }
 
@@ -250,23 +274,27 @@
 }
 
 /*
- * Get firmware revision
+ * Get board revision
  */
 uint
-getfirmware(void)
+getboardrev(void)
 {
 	u32int buf[1];
 
-	if(vcreq(TagGetfwrev, buf, 0, sizeof buf) != sizeof buf)
+	if(vcreq(TagGetrev, buf, 0, sizeof buf) != sizeof buf)
 		return 0;
 	return buf[0];
 }
 
+/*
+ * Get firmware revision
+ */
 uint
-getrevision(void)
+getfirmware(void)
 {
 	u32int buf[1];
-	if(vcreq(TagGetbrdrev, buf, 0, sizeof buf) != sizeof buf)
+
+	if(vcreq(TagGetfwrev, buf, 0, sizeof buf) != sizeof buf)
 		return 0;
 	return buf[0];
 }
@@ -299,13 +327,63 @@
 	return buf[1];
 }
 
+/*
+ * Set clock rate to hz (or max speed if hz == 0)
+ */
+void
+setclkrate(int clkid, ulong hz)
+{
+	u32int buf[2];
+
+	buf[0] = clkid;
+	if(hz != 0)
+		buf[1] = hz;
+	else if(vcreq(TagGetclkmax, buf, sizeof(buf[0]), sizeof(buf)) != sizeof buf)
+		return;
+	vcreq(TagSetclkspd, buf, sizeof(buf), sizeof(buf));
+}
+
+/*
+ * Get cpu temperature
+ */
 uint
-gettemp(int tempid)
+getcputemp(void)
 {
 	u32int buf[2];
-	buf[0] = tempid;
-	if(vcreq(0x00030006, buf, sizeof(buf[0]), sizeof(buf)) != sizeof buf)
-		return 0;
 
+	buf[0] = 0;
+	if(vcreq(TagGettemp, buf, sizeof(buf[0]), sizeof buf) != sizeof buf)
+		return 0;
 	return buf[1];
+}
+
+/*
+ * Virtual GPIO - used for ACT LED on pi3
+ */
+void
+vgpinit(void)
+{
+	u32int buf[1];
+	uintptr va;
+
+	buf[0] = 0;
+	if(vcreq(TagGetGpio, buf, 0, sizeof(buf)) != sizeof buf || buf[0] == 0)
+		return;
+	va = mmukmap(VGPIO, buf[0] & ~0xC0000000, BY2PG);
+	if(va == 0)
+		return;
+	vgpio.counts = (u32int*)va;
+}
+
+void
+vgpset(uint port, int on)
+{
+	if(vgpio.counts == nil || port >= Nvgpio || on == vgpio.ison)
+		return;
+	if(on)
+		vgpio.incs++;
+	else
+		vgpio.decs++;
+	vgpio.counts[port] = (vgpio.incs << 16) | vgpio.decs;
+	vgpio.ison = on;
 }
--- a/sys/src/9/bcm/vfp3.c
+++ b/sys/src/9/bcm/vfp3.c
@@ -163,7 +163,10 @@
 	static int printed;
 
 	/* clear pending exceptions; no traps in vfp3; all v7 ops are scalar */
-	m->fpscr = Dn | Fz | FPRNR | (FPINVAL | FPZDIV | FPOVFL) & ~Alltraps;
+	m->fpscr = Dn | FPRNR | (FPINVAL | FPZDIV | FPOVFL) & ~Alltraps;
+	/* VFPv2 needs software support for underflows, so force them to zero */
+	if(m->havefp == VFPv2)
+		m->fpscr |= Fz;
 	fpwr(Fpscr, m->fpscr);
 	m->fpconfiged = 1;
 
@@ -278,7 +281,7 @@
 {
 	if(p->fpstate == FPactive){
 		if(p->state == Moribund)
-			fpclear();
+			fpoff();
 		else{
 			/*
 			 * Fpsave() stores without handling pending
@@ -371,8 +374,6 @@
 static void
 mathemu(Ureg *)
 {
-	if(m->havefp == VFPv3 && !(fprd(Fpexc) & (Fpex|Fpdex)))
-		iprint("mathemu: not an FP exception but an unknown FP opcode\n");
 	switch(up->fpstate){
 	case FPemu:
 		error("illegal instruction: VFP opcode in emulated mode");
@@ -472,6 +473,7 @@
 {
 	int s, nfp, cop, op;
 	uintptr pc;
+	static int already;
 
 	if(waserror()){
 		postnote(up, 1, up->errstr, NDebug);
@@ -484,16 +486,14 @@
 	nfp = 0;
 	pc = ureg->pc;
 	validaddr(pc, 4, 0);
-	if(!condok(ureg->psr, *(ulong*)pc >> 28))
-		iprint("fpuemu: conditional instr shouldn't have got here\n");
 	op  = (*(ulong *)pc >> 24) & MASK(4);
 	cop = (*(ulong *)pc >>  8) & MASK(4);
 	if(m->fpon)
 		fpstuck(pc);		/* debugging; could move down 1 line */
 	if (ISFPAOP(cop, op)) {		/* old arm 7500 fpa opcode? */
-//		iprint("fpuemu: fpa instr %#8.8lux at %#p\n", *(ulong *)pc, pc);
-//		error("illegal instruction: old arm 7500 fpa opcode");
 		s = spllo();
+		if(!already++)
+			pprint("warning: emulated arm7500 fpa instr %#8.8lux at %#p\n", *(ulong *)pc, pc);
 		if(waserror()){
 			splx(s);
 			nexterror();
@@ -503,7 +503,7 @@
 			m->fppc = m->fpcnt = 0;
 		splx(s);
 		poperror();
-	} else if (ISVFPOP(cop, op)) {	/* if vfp, fpu must be off */
+	} else if (ISVFPOP(cop, op)) {	/* if vfp, fpu off or unsupported instruction */
 		mathemu(ureg);		/* enable fpu & retry */
 		nfp = 1;
 	}