shithub: riscv

Download patch

ref: 4cdd7049a6159f8d2ee863c08346fb08f51852b4
parent: c09cd2882c7c15600c74a9c12f104b80203f562c
author: cinap_lenrek <[email protected]>
date: Thu Nov 30 16:30:03 EST 2017

libsec: unroll portable sha1block function

The unrolled portable version is just 6-10% slower than most assembly versions,
and 20% faster on zynq.

--- a/sys/src/libsec/port/sha1block.c
+++ b/sys/src/libsec/port/sha1block.c
@@ -1,12 +1,17 @@
 #include "os.h"
 
+#define ROTL(x,n)	(((x)<<n)|((x)>>32-n))	/* rotate x left by n bits, assuming x is a 32-bit unsigned value; n is expanded unparenthesized, so pass a plain constant (uses here pass 1, 5 and 30) */
+
+#define F0(x,y,z)	(0x5a827999 + ((z) ^ ((x) & ((y) ^ (z)))))	/* per-bit select (y where x is set, else z), plus the constant used for steps 0-19 */
+#define F1(x,y,z)	(0x6ed9eba1 + ((x) ^ (y) ^ (z)))	/* xor of the three words, plus the constant used for steps 20-39 */
+#define F2(x,y,z)	(0x8f1bbcdc + (((x) & (y)) | (((x) | (y)) & (z))))	/* per-bit majority of the three words, plus the constant used for steps 40-59 */
+#define F3(x,y,z)	(0xca62c1d6 + ((x) ^ (y) ^ (z)))	/* same xor as F1 with a different constant, used for steps 60-79 */
+
 void
 _sha1block(uchar *p, ulong len, u32int *s)
 {
-	u32int a, b, c, d, e, x;
+	u32int w[16], a, b, c, d, e;
 	uchar *end;
-	u32int *wp, *wend;
-	u32int w[80];
 
 	/* at this point, we have a multiple of 64 bytes */
 	for(end = p+len; p < end;){
@@ -16,168 +21,113 @@
 		d = s[3];
 		e = s[4];
 
-		wend = w + 15;
-		for(wp = w; wp < wend; wp += 5){
-			wp[0] = (p[0]<<24) | (p[1]<<16) | (p[2]<<8) | p[3];
-			e += ((a<<5) | (a>>27)) + wp[0];
-			e += 0x5a827999 + (((c^d)&b)^d);
-			b = (b<<30)|(b>>2);
+#define STEP(a,b,c,d,e,f,i) /* one step i (invoked below with 0..79); f is one of F0..F3; expands to several statements with no do/while wrapper, so use it only as a standalone statement */ \
+	if(i < 16) {\
+		w[i] = p[0]<<24 | p[1]<<16 | p[2]<<8 | p[3]; /* first 16 steps: read the next 4 input bytes as a big-endian word */ \
+		p += 4; /* side effect: consumes the input pointer */ \
+	} else { \
+		u32int x = w[i-3&15] ^ w[i-8&15] ^ w[i-14&15] ^ w[i-16&15]; /* later steps: mix earlier words; w[] holds only 16 words, indexed modulo 16 */ \
+		w[i&15] = ROTL(x, 1); /* overwrites the slot that held word i-16 */ \
+	} \
+	e += ROTL(a, 5) + w[i&15] + f(b,c,d); /* f already includes its additive constant */ \
+	b = ROTL(b, 30); /* callers rotate the argument order between steps instead of shuffling a..e */
 
-			wp[1] = (p[4]<<24) | (p[5]<<16) | (p[6]<<8) | p[7];
-			d += ((e<<5) | (e>>27)) + wp[1];
-			d += 0x5a827999 + (((b^c)&a)^c);
-			a = (a<<30)|(a>>2);
+		STEP(a,b,c,d,e,F0,0);
+		STEP(e,a,b,c,d,F0,1);
+		STEP(d,e,a,b,c,F0,2);
+		STEP(c,d,e,a,b,F0,3);
+		STEP(b,c,d,e,a,F0,4);
+	
+		STEP(a,b,c,d,e,F0,5);
+		STEP(e,a,b,c,d,F0,6);
+		STEP(d,e,a,b,c,F0,7);
+		STEP(c,d,e,a,b,F0,8);
+		STEP(b,c,d,e,a,F0,9);
+	
+		STEP(a,b,c,d,e,F0,10);
+		STEP(e,a,b,c,d,F0,11);
+		STEP(d,e,a,b,c,F0,12);
+		STEP(c,d,e,a,b,F0,13);
+		STEP(b,c,d,e,a,F0,14);
+	
+		STEP(a,b,c,d,e,F0,15);
+		STEP(e,a,b,c,d,F0,16);
+		STEP(d,e,a,b,c,F0,17);
+		STEP(c,d,e,a,b,F0,18);
+		STEP(b,c,d,e,a,F0,19);
+	
+		STEP(a,b,c,d,e,F1,20);
+		STEP(e,a,b,c,d,F1,21);
+		STEP(d,e,a,b,c,F1,22);
+		STEP(c,d,e,a,b,F1,23);
+		STEP(b,c,d,e,a,F1,24);
+	
+		STEP(a,b,c,d,e,F1,25);
+		STEP(e,a,b,c,d,F1,26);
+		STEP(d,e,a,b,c,F1,27);
+		STEP(c,d,e,a,b,F1,28);
+		STEP(b,c,d,e,a,F1,29);
+	
+		STEP(a,b,c,d,e,F1,30);
+		STEP(e,a,b,c,d,F1,31);
+		STEP(d,e,a,b,c,F1,32);
+		STEP(c,d,e,a,b,F1,33);
+		STEP(b,c,d,e,a,F1,34);
+	
+		STEP(a,b,c,d,e,F1,35);
+		STEP(e,a,b,c,d,F1,36);
+		STEP(d,e,a,b,c,F1,37);
+		STEP(c,d,e,a,b,F1,38);
+		STEP(b,c,d,e,a,F1,39);
+	
+		STEP(a,b,c,d,e,F2,40);
+		STEP(e,a,b,c,d,F2,41);
+		STEP(d,e,a,b,c,F2,42);
+		STEP(c,d,e,a,b,F2,43);
+		STEP(b,c,d,e,a,F2,44);
+	
+		STEP(a,b,c,d,e,F2,45);
+		STEP(e,a,b,c,d,F2,46);
+		STEP(d,e,a,b,c,F2,47);
+		STEP(c,d,e,a,b,F2,48);
+		STEP(b,c,d,e,a,F2,49);
+	
+		STEP(a,b,c,d,e,F2,50);
+		STEP(e,a,b,c,d,F2,51);
+		STEP(d,e,a,b,c,F2,52);
+		STEP(c,d,e,a,b,F2,53);
+		STEP(b,c,d,e,a,F2,54);
+	
+		STEP(a,b,c,d,e,F2,55);
+		STEP(e,a,b,c,d,F2,56);
+		STEP(d,e,a,b,c,F2,57);
+		STEP(c,d,e,a,b,F2,58);
+		STEP(b,c,d,e,a,F2,59);
+	
+		STEP(a,b,c,d,e,F3,60);
+		STEP(e,a,b,c,d,F3,61);
+		STEP(d,e,a,b,c,F3,62);
+		STEP(c,d,e,a,b,F3,63);
+		STEP(b,c,d,e,a,F3,64);
+	
+		STEP(a,b,c,d,e,F3,65);
+		STEP(e,a,b,c,d,F3,66);
+		STEP(d,e,a,b,c,F3,67);
+		STEP(c,d,e,a,b,F3,68);
+		STEP(b,c,d,e,a,F3,69);
+	
+		STEP(a,b,c,d,e,F3,70);
+		STEP(e,a,b,c,d,F3,71);
+		STEP(d,e,a,b,c,F3,72);
+		STEP(c,d,e,a,b,F3,73);
+		STEP(b,c,d,e,a,F3,74);
+	
+		STEP(a,b,c,d,e,F3,75);
+		STEP(e,a,b,c,d,F3,76);
+		STEP(d,e,a,b,c,F3,77);
+		STEP(c,d,e,a,b,F3,78);
+		STEP(b,c,d,e,a,F3,79);
 
-			wp[2] = (p[8]<<24) | (p[9]<<16) | (p[10]<<8) | p[11];
-			c += ((d<<5) | (d>>27)) + wp[2];
-			c += 0x5a827999 + (((a^b)&e)^b);
-			e = (e<<30)|(e>>2);
-
-			wp[3] = (p[12]<<24) | (p[13]<<16) | (p[14]<<8) | p[15];
-			b += ((c<<5) | (c>>27)) + wp[3];
-			b += 0x5a827999 + (((e^a)&d)^a);
-			d = (d<<30)|(d>>2);
-
-			wp[4] = (p[16]<<24) | (p[17]<<16) | (p[18]<<8) | p[19];
-			a += ((b<<5) | (b>>27)) + wp[4];
-			a += 0x5a827999 + (((d^e)&c)^e);
-			c = (c<<30)|(c>>2);
-			
-			p += 20;
-		}
-
-		wp[0] = (p[0]<<24) | (p[1]<<16) | (p[2]<<8) | p[3];
-		e += ((a<<5) | (a>>27)) + wp[0];
-		e += 0x5a827999 + (((c^d)&b)^d);
-		b = (b<<30)|(b>>2);
-
-		x = wp[-2] ^ wp[-7] ^ wp[-13] ^ wp[-15];
-		wp[1] = (x<<1) | (x>>31);
-		d += ((e<<5) | (e>>27)) + wp[1];
-		d += 0x5a827999 + (((b^c)&a)^c);
-		a = (a<<30)|(a>>2);
-
-		x = wp[-1] ^ wp[-6] ^ wp[-12] ^ wp[-14];
-		wp[2] = (x<<1) | (x>>31);
-		c += ((d<<5) | (d>>27)) + wp[2];
-		c += 0x5a827999 + (((a^b)&e)^b);
-		e = (e<<30)|(e>>2);
-
-		x = wp[0] ^ wp[-5] ^ wp[-11] ^ wp[-13];
-		wp[3] = (x<<1) | (x>>31);
-		b += ((c<<5) | (c>>27)) + wp[3];
-		b += 0x5a827999 + (((e^a)&d)^a);
-		d = (d<<30)|(d>>2);
-
-		x = wp[1] ^ wp[-4] ^ wp[-10] ^ wp[-12];
-		wp[4] = (x<<1) | (x>>31);
-		a += ((b<<5) | (b>>27)) + wp[4];
-		a += 0x5a827999 + (((d^e)&c)^e);
-		c = (c<<30)|(c>>2);
-
-		wp += 5;
-		p += 4;
-
-		wend = w + 40;
-		for(; wp < wend; wp += 5){
-			x = wp[-3] ^ wp[-8] ^ wp[-14] ^ wp[-16];
-			wp[0] = (x<<1) | (x>>31);
-			e += ((a<<5) | (a>>27)) + wp[0];
-			e += 0x6ed9eba1 + (b^c^d);
-			b = (b<<30)|(b>>2);
-
-			x = wp[-2] ^ wp[-7] ^ wp[-13] ^ wp[-15];
-			wp[1] = (x<<1) | (x>>31);
-			d += ((e<<5) | (e>>27)) + wp[1];
-			d += 0x6ed9eba1 + (a^b^c);
-			a = (a<<30)|(a>>2);
-
-			x = wp[-1] ^ wp[-6] ^ wp[-12] ^ wp[-14];
-			wp[2] = (x<<1) | (x>>31);
-			c += ((d<<5) | (d>>27)) + wp[2];
-			c += 0x6ed9eba1 + (e^a^b);
-			e = (e<<30)|(e>>2);
-
-			x = wp[0] ^ wp[-5] ^ wp[-11] ^ wp[-13];
-			wp[3] = (x<<1) | (x>>31);
-			b += ((c<<5) | (c>>27)) + wp[3];
-			b += 0x6ed9eba1 + (d^e^a);
-			d = (d<<30)|(d>>2);
-
-			x = wp[1] ^ wp[-4] ^ wp[-10] ^ wp[-12];
-			wp[4] = (x<<1) | (x>>31);
-			a += ((b<<5) | (b>>27)) + wp[4];
-			a += 0x6ed9eba1 + (c^d^e);
-			c = (c<<30)|(c>>2);
-		}
-
-		wend = w + 60;
-		for(; wp < wend; wp += 5){
-			x = wp[-3] ^ wp[-8] ^ wp[-14] ^ wp[-16];
-			wp[0] = (x<<1) | (x>>31);
-			e += ((a<<5) | (a>>27)) + wp[0];
-			e += 0x8f1bbcdc + ((b&c)|((b|c)&d));
-			b = (b<<30)|(b>>2);
-
-			x = wp[-2] ^ wp[-7] ^ wp[-13] ^ wp[-15];
-			wp[1] = (x<<1) | (x>>31);
-			d += ((e<<5) | (e>>27)) + wp[1];
-			d += 0x8f1bbcdc + ((a&b)|((a|b)&c));
-			a = (a<<30)|(a>>2);
-
-			x = wp[-1] ^ wp[-6] ^ wp[-12] ^ wp[-14];
-			wp[2] = (x<<1) | (x>>31);
-			c += ((d<<5) | (d>>27)) + wp[2];
-			c += 0x8f1bbcdc + ((e&a)|((e|a)&b));
-			e = (e<<30)|(e>>2);
-
-			x = wp[0] ^ wp[-5] ^ wp[-11] ^ wp[-13];
-			wp[3] = (x<<1) | (x>>31);
-			b += ((c<<5) | (c>>27)) + wp[3];
-			b += 0x8f1bbcdc + ((d&e)|((d|e)&a));
-			d = (d<<30)|(d>>2);
-
-			x = wp[1] ^ wp[-4] ^ wp[-10] ^ wp[-12];
-			wp[4] = (x<<1) | (x>>31);
-			a += ((b<<5) | (b>>27)) + wp[4];
-			a += 0x8f1bbcdc + ((c&d)|((c|d)&e));
-			c = (c<<30)|(c>>2);
-		}
-
-		wend = w + 80;
-		for(; wp < wend; wp += 5){
-			x = wp[-3] ^ wp[-8] ^ wp[-14] ^ wp[-16];
-			wp[0] = (x<<1) | (x>>31);
-			e += ((a<<5) | (a>>27)) + wp[0];
-			e += 0xca62c1d6 + (b^c^d);
-			b = (b<<30)|(b>>2);
-
-			x = wp[-2] ^ wp[-7] ^ wp[-13] ^ wp[-15];
-			wp[1] = (x<<1) | (x>>31);
-			d += ((e<<5) | (e>>27)) + wp[1];
-			d += 0xca62c1d6 + (a^b^c);
-			a = (a<<30)|(a>>2);
-
-			x = wp[-1] ^ wp[-6] ^ wp[-12] ^ wp[-14];
-			wp[2] = (x<<1) | (x>>31);
-			c += ((d<<5) | (d>>27)) + wp[2];
-			c += 0xca62c1d6 + (e^a^b);
-			e = (e<<30)|(e>>2);
-
-			x = wp[0] ^ wp[-5] ^ wp[-11] ^ wp[-13];
-			wp[3] = (x<<1) | (x>>31);
-			b += ((c<<5) | (c>>27)) + wp[3];
-			b += 0xca62c1d6 + (d^e^a);
-			d = (d<<30)|(d>>2);
-
-			x = wp[1] ^ wp[-4] ^ wp[-10] ^ wp[-12];
-			wp[4] = (x<<1) | (x>>31);
-			a += ((b<<5) | (b>>27)) + wp[4];
-			a += 0xca62c1d6 + (c^d^e);
-			c = (c<<30)|(c>>2);
-		}
-
-		/* save state */
 		s[0] += a;
 		s[1] += b;
 		s[2] += c;