shithub: riscv

Download patch

ref: 077e719dfbf9bf2582bed80026251cc0d108c16e
parent: 1eb373945455f1ba03fa1b221529d74ca2a778ad
author: cinap_lenrek <[email protected]>
date: Sun Nov 19 19:10:35 EST 2017

libsec: write optimized _chachablock() function for amd64 / sse2

doing 4 quarterrounds in parallel using 128-bit
vector registers. for second round shuffle the columns and
then shuffle back.

code is rather obvious. only trick here is for the first
quarterround PSHUFLW/PSHUFHW is used to swap the halfwords
for the <<<16 rotation.

--- a/sys/src/ape/lib/sec/amd64/mkfile
+++ b/sys/src/ape/lib/sec/amd64/mkfile
@@ -3,6 +3,7 @@
 
 LIB=/$objtype/lib/ape/libsec.a
 FILES=\
+	chachablock\
 	md5block\
 	sha1block\
 	aesni\
--- a/sys/src/ape/lib/sec/port/mkfile
+++ b/sys/src/ape/lib/sec/port/mkfile
@@ -11,7 +11,7 @@
 	sha1pickle.c md5pickle.c\
 	poly1305.c\
 	rc4.c\
-	chacha.c\
+	chacha.c chachablock.c\
 	salsa.c\
 	genrandom.c prng.c fastrand.c nfastrand.c\
 	probably_prime.c smallprimetest.c genprime.c dsaprimes.c\
--- /dev/null
+++ b/sys/src/libsec/amd64/chachablock.s
@@ -1,0 +1,74 @@
+#define ROTATE(n, v1, v2) /* v2 = every 32-bit lane of v1 rotated left by n bits; v1 is clobbered */ \
+	MOVO	v1, v2; /* keep a second copy so both shifted parts can be built */ \
+	PSLLL	$(n), v1; /* v1 = lanes shifted left by n */ \
+	PSRLL	$(32-n), v2; /* v2 = the n bits that wrapped around, now at the bottom of each lane */ \
+	POR	v1, v2 /* merge both parts; the result is left in v2 */
+
+TEXT _chachablock(SB), 0, $0	/* void _chachablock(u32int x[16], int rounds): runs the ChaCha rounds over x in place */
+	MOVOU	 0(RARG), X0	/* X0 = x[0..3]  (the "a" word of each column) */
+	MOVOU	16(RARG), X1	/* X1 = x[4..7]  (the "b" words) */
+	MOVOU	32(RARG), X2	/* X2 = x[8..11] (the "c" words) */
+	MOVOU	48(RARG), X3	/* X3 = x[12..15] (the "d" words) */
+
+	MOVL	rounds+8(FP), CX
+	SHRL	$1, CX	/* CX = rounds/2: every loop pass performs two rounds */
+
+_loop:	/* first half: four column quarter rounds, one per 32-bit lane */
+	PADDL	X1, X0	/* a += b */
+	PXOR	X0, X3	/* d ^= a */
+	/* ROTATE(16, X3, X3): swapping the two halfwords of every lane is d <<<= 16 */
+	PSHUFLW $(1<<0 | 0<<2 | 3<<4 | 2<<6), X3, X3
+	PSHUFHW $(1<<0 | 0<<2 | 3<<4 | 2<<6), X3, X3
+
+	PADDL	X3, X2	/* c += d */
+	MOVO	X1, X4
+	PXOR	X2, X4	/* X4 = b ^ c */
+	ROTATE(12, X4, X1)	/* b = (b ^ c) <<< 12 */
+	
+	PADDL	X1, X0	/* a += b */
+	MOVO	X0, X4
+	PXOR	X3, X4	/* X4 = d ^ a */
+	ROTATE(8, X4, X3)	/* d = (d ^ a) <<< 8 */
+
+	PADDL	X3, X2	/* c += d */
+	MOVO	X1, X4
+	PXOR	X2, X4	/* X4 = b ^ c */
+	ROTATE(7, X4, X1)	/* b = (b ^ c) <<< 7 */
+
+	PSHUFL $(1<<0 | 2<<2 | 3<<4 | 0<<6), X1, X1	/* rotate the b lanes by 1 so each lane now holds a diagonal */
+	PSHUFL $(2<<0 | 3<<2 | 0<<4 | 1<<6), X2, X2	/* rotate the c lanes by 2 */
+	PSHUFL $(3<<0 | 0<<2 | 1<<4 | 2<<6), X3, X3	/* rotate the d lanes by 3 */
+
+	PADDL	X1, X0	/* second half: same quarter round steps, now acting on the diagonals */
+	PXOR	X0, X3	/* d ^= a */
+	/* ROTATE(16, X3, X3): halfword swap again stands in for d <<<= 16 */
+	PSHUFLW $(1<<0 | 0<<2 | 3<<4 | 2<<6), X3, X3
+	PSHUFHW $(1<<0 | 0<<2 | 3<<4 | 2<<6), X3, X3
+
+	PADDL	X3, X2	/* c += d */
+	MOVO	X1, X4
+	PXOR	X2, X4	/* X4 = b ^ c */
+	ROTATE(12, X4, X1)	/* b = (b ^ c) <<< 12 */
+	
+	PADDL	X1, X0	/* a += b */
+	MOVO	X0, X4
+	PXOR	X3, X4	/* X4 = d ^ a */
+	ROTATE(8, X4, X3)	/* d = (d ^ a) <<< 8 */
+
+	PADDL	X3, X2	/* c += d */
+	MOVO	X1, X4
+	PXOR	X2, X4	/* X4 = b ^ c */
+	ROTATE(7, X4, X1)	/* b = (b ^ c) <<< 7 */
+
+	PSHUFL $(3<<0 | 0<<2 | 1<<4 | 2<<6), X1, X1	/* undo the lane rotation of b: back to column order */
+	PSHUFL $(2<<0 | 3<<2 | 0<<4 | 1<<6), X2, X2	/* undo the lane rotation of c */
+	PSHUFL $(1<<0 | 2<<2 | 3<<4 | 0<<6), X3, X3	/* undo the lane rotation of d */
+
+	DECL CX	/* NOTE(review): no guard for rounds < 2 (CX would start at 0 and wrap), unlike the C loop's rounds > 0 test; confirm callers always pass an even count >= 2 */
+	JNE _loop
+
+	MOVOU	X0, 0(RARG)	/* store the four rows back over x[0..15] */
+	MOVOU	X1, 16(RARG)
+	MOVOU	X2, 32(RARG)
+	MOVOU	X3, 48(RARG)
+	RET
--- a/sys/src/libsec/amd64/mkfile
+++ b/sys/src/libsec/amd64/mkfile
@@ -3,6 +3,7 @@
 
 LIB=/$objtype/lib/libsec.a
 FILES=\
+	chachablock\
 	md5block\
 	sha1block\
 	aesni\
--- a/sys/src/libsec/port/chacha.c
+++ b/sys/src/libsec/port/chacha.c
@@ -10,26 +10,13 @@
 #include "os.h"
 #include <libsec.h>
 
-enum{
-	Blockwords=	ChachaBsize/sizeof(u32int)
-};
+/* from chachablock.$O */
+extern void _chachablock(u32int x[16], int rounds);
 
 /* little-endian data order */
 #define	GET4(p)		((p)[0]|((p)[1]<<8)|((p)[2]<<16)|((p)[3]<<24))
 #define	PUT4(p,v)	(p)[0]=(v);(p)[1]=(v)>>8;(p)[2]=(v)>>16;(p)[3]=(v)>>24
 
-#define ROTATE(v,c) ((u32int)((v) << (c)) | ((v) >> (32 - (c))))
-
-#define QUARTERROUND(ia,ib,ic,id) { \
-	u32int a, b, c, d, t; \
-	a = x[ia]; b = x[ib]; c = x[ic]; d = x[id]; \
-	a += b; t = d^a; d = ROTATE(t,16); \
-	c += d; t = b^c; b = ROTATE(t,12); \
-	a += b; t = d^a; d = ROTATE(t, 8); \
-	c += d; t = b^c; b = ROTATE(t, 7); \
-	x[ia] = a; x[ib] = b; x[ic] = c; x[id] = d; \
-}
-
 #define ENCRYPT(s, x, y, d) {\
 	u32int v; \
 	v = GET4(s); \
@@ -88,22 +75,6 @@
 }
 
 static void
-dorounds(u32int x[Blockwords], int rounds)
-{
-	for(; rounds > 0; rounds -= 2) {
-		QUARTERROUND(0, 4, 8,12)
-		QUARTERROUND(1, 5, 9,13)
-		QUARTERROUND(2, 6,10,14)
-		QUARTERROUND(3, 7,11,15)
-
-		QUARTERROUND(0, 5,10,15)
-		QUARTERROUND(1, 6,11,12)
-		QUARTERROUND(2, 7, 8,13)
-		QUARTERROUND(3, 4, 9,14)
-	}
-}
-
-static void
 hchachablock(uchar h[32], Chachastate *s)
 {
 	u32int x[16];
@@ -125,7 +96,7 @@
 	x[14] = s->input[14];
 	x[15] = s->input[15];
 
-	dorounds(x, s->rounds);
+	_chachablock(x, s->rounds);
 
 	PUT4(h+0*4, x[0]);
 	PUT4(h+1*4, x[1]);
@@ -183,7 +154,7 @@
 static void
 encryptblock(Chachastate *s, uchar *src, uchar *dst)
 {
-	u32int x[Blockwords];
+	u32int x[16];
 	int i;
 
 	x[0] = s->input[0];
@@ -202,7 +173,7 @@
 	x[13] = s->input[13];
 	x[14] = s->input[14];
 	x[15] = s->input[15];
-	dorounds(x, s->rounds);
+	_chachablock(x, s->rounds);
 
 	for(i=0; i<nelem(x); i+=4){
 		ENCRYPT(src, x[i], s->input[i], dst);
--- /dev/null
+++ b/sys/src/libsec/port/chachablock.c
@@ -1,0 +1,29 @@
+#include "os.h"
+
+#define ROTATE(v,c) ((u32int)((v) << (c)) | ((v) >> (32 - (c))))	/* rotate v left by c bits within 32 bits; used below with a u32int v and c of 16, 12, 8 or 7 */
+
+#define QUARTERROUND(ia,ib,ic,id) { /* one quarter round on x[ia], x[ib], x[ic], x[id], updated in place; needs an array named x in scope */ \
+	u32int a, b, c, d, t; \
+	a = x[ia]; b = x[ib]; c = x[ic]; d = x[id]; /* work on local copies of the four words */ \
+	a += b; t = d^a; d = ROTATE(t,16); /* t holds the xor so ROTATE sees a plain variable */ \
+	c += d; t = b^c; b = ROTATE(t,12); \
+	a += b; t = d^a; d = ROTATE(t, 8); \
+	c += d; t = b^c; b = ROTATE(t, 7); \
+	x[ia] = a; x[ib] = b; x[ic] = c; x[id] = d; /* write the mixed words back */ \
+}
+
+void
+_chachablock(u32int x[16], int rounds)	/* portable C version: applies the rounds to the 16-word state x in place */
+{
+	for(; rounds > 0; rounds -= 2) {	/* each pass is two rounds; nothing is done when rounds <= 0 */
+		QUARTERROUND(0, 4, 8,12)	/* first four: the columns of the 4x4 word matrix */
+		QUARTERROUND(1, 5, 9,13)
+		QUARTERROUND(2, 6,10,14)
+		QUARTERROUND(3, 7,11,15)
+
+		QUARTERROUND(0, 5,10,15)	/* last four: the diagonals */
+		QUARTERROUND(1, 6,11,12)
+		QUARTERROUND(2, 7, 8,13)
+		QUARTERROUND(3, 4, 9,14)
+	}
+}
--- a/sys/src/libsec/port/mkfile
+++ b/sys/src/libsec/port/mkfile
@@ -10,7 +10,7 @@
 	sha1pickle.c md5pickle.c\
 	poly1305.c\
 	rc4.c\
-	chacha.c\
+	chacha.c chachablock.c\
 	salsa.c\
 	genrandom.c prng.c fastrand.c nfastrand.c\
 	probably_prime.c smallprimetest.c genprime.c dsaprimes.c\