shithub: tlsclient

ref: bfe9da47cfa2f51ebef78c59332b5ba2deba306d
parent: 98c1cd7ae022efe276123898af6b892eade0732c
author: grobe0ba <[email protected]>
date: Sat Apr 13 10:04:39 EDT 2024

use generated build files for boringssl

also, let's use some assembly where we've got it
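
For reference, a condensed sketch of the arch-detection pattern the new Makefile uses (assuming GNU make, a gcc/clang-style $(CC) that understands -dumpmachine, and the per-arch source lists defined in eureka.mk); any target that does not match a known linux triple falls back to the portable C path with OPENSSL_NO_ASM:

    # target triple, e.g. x86_64-pc-linux-gnu (empty if $(CC) lacks -dumpmachine)
    MACH := $(shell $(CC) -dumpmachine 2>/dev/null)
    USE_ASM := false

    ifneq (,$(findstring linux,$(MACH)))
    ifneq (,$(findstring x86_64,$(MACH)))
    # pull in the generated .S objects for this arch from eureka.mk
    CRYPTO_objs += $(linux_x86_64_sources:.S=.o)
    USE_ASM := true
    endif
    endif

    ifeq (false,$(USE_ASM))
    # no matching assembly: build only the C implementations
    CFLAGS += -DOPENSSL_NO_ASM
    endif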

(The diff viewer could not render the newly added directories under third_party/boringssl/: linux-aarch64, linux-arm, linux-ppc64le, linux-x86, and linux-x86_64, each containing generated crypto/chacha, crypto/cipher_extra, crypto/fipsmodule, and crypto/test sources as applicable.)
--- a/third_party/boringssl/Makefile
+++ b/third_party/boringssl/Makefile
@@ -1,278 +1,39 @@
 .PHONY: all clean
 
-CRYPTO_srcs= err_data.c \
-		src/crypto/asn1/a_bitstr.c \
-		src/crypto/asn1/a_bool.c \
-		src/crypto/asn1/a_d2i_fp.c \
-		src/crypto/asn1/a_dup.c \
-		src/crypto/asn1/a_enum.c \
-		src/crypto/asn1/a_gentm.c \
-		src/crypto/asn1/a_i2d_fp.c \
-		src/crypto/asn1/a_int.c \
-		src/crypto/asn1/a_mbstr.c \
-		src/crypto/asn1/a_object.c \
-		src/crypto/asn1/a_octet.c \
-		src/crypto/asn1/a_print.c \
-		src/crypto/asn1/a_strex.c \
-		src/crypto/asn1/a_strnid.c \
-		src/crypto/asn1/a_time.c \
-		src/crypto/asn1/a_type.c \
-		src/crypto/asn1/a_utctm.c \
-		src/crypto/asn1/a_utf8.c \
-		src/crypto/asn1/asn1_lib.c \
-		src/crypto/asn1/asn1_par.c \
-		src/crypto/asn1/asn_pack.c \
-		src/crypto/asn1/f_int.c \
-		src/crypto/asn1/f_string.c \
-		src/crypto/asn1/tasn_dec.c \
-		src/crypto/asn1/tasn_enc.c \
-		src/crypto/asn1/tasn_fre.c \
-		src/crypto/asn1/tasn_new.c \
-		src/crypto/asn1/tasn_typ.c \
-		src/crypto/asn1/tasn_utl.c \
-		src/crypto/asn1/time_support.c \
-		src/crypto/base64/base64.c \
-		src/crypto/bio/bio.c \
-		src/crypto/bio/bio_mem.c \
-		src/crypto/bio/connect.c \
-		src/crypto/bio/fd.c \
-		src/crypto/bio/file.c \
-		src/crypto/bio/hexdump.c \
-		src/crypto/bio/pair.c \
-		src/crypto/bio/printf.c \
-		src/crypto/bio/socket.c \
-		src/crypto/bio/socket_helper.c \
-		src/crypto/blake2/blake2.c \
-		src/crypto/bn_extra/bn_asn1.c \
-		src/crypto/bn_extra/convert.c \
-		src/crypto/buf/buf.c \
-		src/crypto/bytestring/asn1_compat.c \
-		src/crypto/bytestring/ber.c \
-		src/crypto/bytestring/cbb.c \
-		src/crypto/bytestring/cbs.c \
-		src/crypto/bytestring/unicode.c \
-		src/crypto/chacha/chacha.c \
-		src/crypto/cipher_extra/cipher_extra.c \
-		src/crypto/cipher_extra/derive_key.c \
-		src/crypto/cipher_extra/e_aesccm.c \
-		src/crypto/cipher_extra/e_aesctrhmac.c \
-		src/crypto/cipher_extra/e_aesgcmsiv.c \
-		src/crypto/cipher_extra/e_chacha20poly1305.c \
-		src/crypto/cipher_extra/e_null.c \
-		src/crypto/cipher_extra/e_rc2.c \
-		src/crypto/cipher_extra/e_rc4.c \
-		src/crypto/cipher_extra/e_tls.c \
-		src/crypto/cipher_extra/tls_cbc.c \
-		src/crypto/cmac/cmac.c \
-		src/crypto/conf/conf.c \
-		src/crypto/cpu-aarch64-fuchsia.c \
-		src/crypto/cpu-aarch64-linux.c \
-		src/crypto/cpu-aarch64-win.c \
-		src/crypto/cpu-arm-linux.c \
-		src/crypto/cpu-arm.c \
-		src/crypto/cpu-intel.c \
-		src/crypto/cpu-ppc64le.c \
-		src/crypto/crypto.c \
-		src/crypto/curve25519/curve25519.c \
-		src/crypto/curve25519/spake25519.c \
-		src/crypto/dh_extra/dh_asn1.c \
-		src/crypto/dh_extra/params.c \
-		src/crypto/digest_extra/digest_extra.c \
-		src/crypto/dsa/dsa.c \
-		src/crypto/dsa/dsa_asn1.c \
-		src/crypto/ec_extra/ec_asn1.c \
-		src/crypto/ec_extra/ec_derive.c \
-		src/crypto/ec_extra/hash_to_curve.c \
-		src/crypto/ecdh_extra/ecdh_extra.c \
-		src/crypto/ecdsa_extra/ecdsa_asn1.c \
-		src/crypto/engine/engine.c \
-		src/crypto/err/err.c \
-		src/crypto/evp/digestsign.c \
-		src/crypto/evp/evp.c \
-		src/crypto/evp/evp_asn1.c \
-		src/crypto/evp/evp_ctx.c \
-		src/crypto/evp/p_dsa_asn1.c \
-		src/crypto/evp/p_ec.c \
-		src/crypto/evp/p_ec_asn1.c \
-		src/crypto/evp/p_ed25519.c \
-		src/crypto/evp/p_ed25519_asn1.c \
-		src/crypto/evp/p_rsa.c \
-		src/crypto/evp/p_rsa_asn1.c \
-		src/crypto/evp/p_x25519.c \
-		src/crypto/evp/p_x25519_asn1.c \
-		src/crypto/evp/pbkdf.c \
-		src/crypto/evp/print.c \
-		src/crypto/evp/scrypt.c \
-		src/crypto/evp/sign.c \
-		src/crypto/ex_data.c \
-		src/crypto/fipsmodule/bcm.c \
-		src/crypto/fipsmodule/fips_shared_support.c \
-		src/crypto/hkdf/hkdf.c \
-		src/crypto/hpke/hpke.c \
-		src/crypto/hrss/hrss.c \
-		src/crypto/lhash/lhash.c \
-		src/crypto/mem.c \
-		src/crypto/obj/obj.c \
-		src/crypto/obj/obj_xref.c \
-		src/crypto/pem/pem_all.c \
-		src/crypto/pem/pem_info.c \
-		src/crypto/pem/pem_lib.c \
-		src/crypto/pem/pem_oth.c \
-		src/crypto/pem/pem_pk8.c \
-		src/crypto/pem/pem_pkey.c \
-		src/crypto/pem/pem_x509.c \
-		src/crypto/pem/pem_xaux.c \
-		src/crypto/pkcs7/pkcs7.c \
-		src/crypto/pkcs7/pkcs7_x509.c \
-		src/crypto/pkcs8/p5_pbev2.c \
-		src/crypto/pkcs8/pkcs8.c \
-		src/crypto/pkcs8/pkcs8_x509.c \
-		src/crypto/poly1305/poly1305.c \
-		src/crypto/poly1305/poly1305_arm.c \
-		src/crypto/poly1305/poly1305_vec.c \
-		src/crypto/pool/pool.c \
-		src/crypto/rand_extra/deterministic.c \
-		src/crypto/rand_extra/forkunsafe.c \
-		src/crypto/rand_extra/fuchsia.c \
-		src/crypto/rand_extra/passive.c \
-		src/crypto/rand_extra/rand_extra.c \
-		src/crypto/rand_extra/windows.c \
-		src/crypto/rc4/rc4.c \
-		src/crypto/refcount_c11.c \
-		src/crypto/refcount_lock.c \
-		src/crypto/rsa_extra/rsa_asn1.c \
-		src/crypto/rsa_extra/rsa_print.c \
-		src/crypto/siphash/siphash.c \
-		src/crypto/stack/stack.c \
-		src/crypto/thread.c \
-		src/crypto/thread_none.c \
-		src/crypto/thread_pthread.c \
-		src/crypto/thread_win.c \
-		src/crypto/trust_token/pmbtoken.c \
-		src/crypto/trust_token/trust_token.c \
-		src/crypto/trust_token/voprf.c \
-		src/crypto/x509/a_digest.c \
-		src/crypto/x509/a_sign.c \
-		src/crypto/x509/a_verify.c \
-		src/crypto/x509/algorithm.c \
-		src/crypto/x509/asn1_gen.c \
-		src/crypto/x509/by_dir.c \
-		src/crypto/x509/by_file.c \
-		src/crypto/x509/i2d_pr.c \
-		src/crypto/x509/name_print.c \
-		src/crypto/x509/rsa_pss.c \
-		src/crypto/x509/t_crl.c \
-		src/crypto/x509/t_req.c \
-		src/crypto/x509/t_x509.c \
-		src/crypto/x509/t_x509a.c \
-		src/crypto/x509/x509.c \
-		src/crypto/x509/x509_att.c \
-		src/crypto/x509/x509_cmp.c \
-		src/crypto/x509/x509_d2.c \
-		src/crypto/x509/x509_def.c \
-		src/crypto/x509/x509_ext.c \
-		src/crypto/x509/x509_lu.c \
-		src/crypto/x509/x509_obj.c \
-		src/crypto/x509/x509_req.c \
-		src/crypto/x509/x509_set.c \
-		src/crypto/x509/x509_trs.c \
-		src/crypto/x509/x509_txt.c \
-		src/crypto/x509/x509_v3.c \
-		src/crypto/x509/x509_vfy.c \
-		src/crypto/x509/x509_vpm.c \
-		src/crypto/x509/x509cset.c \
-		src/crypto/x509/x509name.c \
-		src/crypto/x509/x509rset.c \
-		src/crypto/x509/x509spki.c \
-		src/crypto/x509/x_algor.c \
-		src/crypto/x509/x_all.c \
-		src/crypto/x509/x_attrib.c \
-		src/crypto/x509/x_crl.c \
-		src/crypto/x509/x_exten.c \
-		src/crypto/x509/x_info.c \
-		src/crypto/x509/x_name.c \
-		src/crypto/x509/x_pkey.c \
-		src/crypto/x509/x_pubkey.c \
-		src/crypto/x509/x_req.c \
-		src/crypto/x509/x_sig.c \
-		src/crypto/x509/x_spki.c \
-		src/crypto/x509/x_val.c \
-		src/crypto/x509/x_x509.c \
-		src/crypto/x509/x_x509a.c \
-		src/crypto/x509v3/pcy_cache.c \
-		src/crypto/x509v3/pcy_data.c \
-		src/crypto/x509v3/pcy_lib.c \
-		src/crypto/x509v3/pcy_map.c \
-		src/crypto/x509v3/pcy_node.c \
-		src/crypto/x509v3/pcy_tree.c \
-		src/crypto/x509v3/v3_akey.c \
-		src/crypto/x509v3/v3_akeya.c \
-		src/crypto/x509v3/v3_alt.c \
-		src/crypto/x509v3/v3_bcons.c \
-		src/crypto/x509v3/v3_bitst.c \
-		src/crypto/x509v3/v3_conf.c \
-		src/crypto/x509v3/v3_cpols.c \
-		src/crypto/x509v3/v3_crld.c \
-		src/crypto/x509v3/v3_enum.c \
-		src/crypto/x509v3/v3_extku.c \
-		src/crypto/x509v3/v3_genn.c \
-		src/crypto/x509v3/v3_ia5.c \
-		src/crypto/x509v3/v3_info.c \
-		src/crypto/x509v3/v3_int.c \
-		src/crypto/x509v3/v3_lib.c \
-		src/crypto/x509v3/v3_ncons.c \
-		src/crypto/x509v3/v3_ocsp.c \
-		src/crypto/x509v3/v3_pci.c \
-		src/crypto/x509v3/v3_pcia.c \
-		src/crypto/x509v3/v3_pcons.c \
-		src/crypto/x509v3/v3_pmaps.c \
-		src/crypto/x509v3/v3_prn.c \
-		src/crypto/x509v3/v3_purp.c \
-		src/crypto/x509v3/v3_skey.c \
-		src/crypto/x509v3/v3_utl.c
+include eureka.mk
 
-SSL_srcs= src/ssl/bio_ssl.cc \
-		src/ssl/d1_both.cc \
-		src/ssl/d1_lib.cc \
-		src/ssl/d1_pkt.cc \
-		src/ssl/d1_srtp.cc \
-		src/ssl/dtls_method.cc \
-		src/ssl/dtls_record.cc \
-		src/ssl/encrypted_client_hello.cc \
-		src/ssl/extensions.cc \
-		src/ssl/handoff.cc \
-		src/ssl/handshake.cc \
-		src/ssl/handshake_client.cc \
-		src/ssl/handshake_server.cc \
-		src/ssl/s3_both.cc \
-		src/ssl/s3_lib.cc \
-		src/ssl/s3_pkt.cc \
-		src/ssl/ssl_aead_ctx.cc \
-		src/ssl/ssl_asn1.cc \
-		src/ssl/ssl_buffer.cc \
-		src/ssl/ssl_cert.cc \
-		src/ssl/ssl_cipher.cc \
-		src/ssl/ssl_file.cc \
-		src/ssl/ssl_key_share.cc \
-		src/ssl/ssl_lib.cc \
-		src/ssl/ssl_privkey.cc \
-		src/ssl/ssl_session.cc \
-		src/ssl/ssl_stat.cc \
-		src/ssl/ssl_transcript.cc \
-		src/ssl/ssl_versions.cc \
-		src/ssl/ssl_x509.cc \
-		src/ssl/t1_enc.cc \
-		src/ssl/tls13_both.cc \
-		src/ssl/tls13_client.cc \
-		src/ssl/tls13_enc.cc \
-		src/ssl/tls13_server.cc \
-		src/ssl/tls_method.cc \
-		src/ssl/tls_record.cc \
+CFLAGS += -Isrc/include
 
-CRYPTO_objs= $(CRYPTO_srcs:.c=.o)
-SSL_objs= $(SSL_srcs:.cc=.o)
+CRYPTO_objs= $(crypto_sources:.c=.o)
+SSL_objs= $(ssl_sources:.cc=.o)
 
+MACH := $(shell $(CC) -dumpmachine 2>/dev/null)
 
+USE_ASM := false
+
+ifneq (,$(findstring linux,$(MACH)))
+	ifneq (,$(findstring x86_64,$(MACH)))
+		CRYPTO_objs += $(linux_x86_64_sources:.S=.o)
+		USE_ASM := true
+	endif
+	ifneq (,$(findstring i686,$(MACH)))
+		CRYPTO_objs += $(linux_x86_sources:.S=.o)
+		USE_ASM := true
+	endif
+	ifneq (,$(findstring aarch64,$(MACH)))
+		CRYPTO_objs += $(linux_aarch64_sources:.S=.o)
+		USE_ASM := true
+	endif
+	ifneq (,$(findstring ppc64le,$(MACH)))
+		CRYPTO_objs += $(linux_ppc64le_sources:.S=.o)
+		USE_ASM := true
+	endif
+endif
+
+ifeq (,$(findstring true,$(USE_ASM)))
+	CFLAGS += -DOPENSSL_NO_ASM
+endif
+
 all: libssl.a libcrypto.a
 
 libcrypto.a: $(CRYPTO_objs)
@@ -284,10 +45,13 @@
 .SUFFIXES: .c .o
 
 %.o: %.c
-	$(CC) -DOPENSSL_NO_ASM -Isrc/include -c -o $@ $<
+	$(CC) $(CFLAGS) -c -o $@ $<
 
 %.o: %.cc
-	$(CXX) -DOPENSSL_NO_ASM -Isrc/include -c -o $@ $<
+	$(CXX) $(CFLAGS) -c -o $@ $<
+
+%.o: %.S
+	$(CC) $(CFLAGS) -c -o $@ $<
 
 clean:
 	rm -f $(CRYPTO_objs)
--- /dev/null
+++ b/third_party/boringssl/eureka.mk
@@ -1,0 +1,375 @@
+# Copyright (C) 2017 The Android Open Source Project
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# This file is created by generate_build_files.py. Do not edit manually.
+
+crypto_sources := \
+  err_data.c\
+  src/crypto/asn1/a_bitstr.c\
+  src/crypto/asn1/a_bool.c\
+  src/crypto/asn1/a_d2i_fp.c\
+  src/crypto/asn1/a_dup.c\
+  src/crypto/asn1/a_enum.c\
+  src/crypto/asn1/a_gentm.c\
+  src/crypto/asn1/a_i2d_fp.c\
+  src/crypto/asn1/a_int.c\
+  src/crypto/asn1/a_mbstr.c\
+  src/crypto/asn1/a_object.c\
+  src/crypto/asn1/a_octet.c\
+  src/crypto/asn1/a_print.c\
+  src/crypto/asn1/a_strex.c\
+  src/crypto/asn1/a_strnid.c\
+  src/crypto/asn1/a_time.c\
+  src/crypto/asn1/a_type.c\
+  src/crypto/asn1/a_utctm.c\
+  src/crypto/asn1/a_utf8.c\
+  src/crypto/asn1/asn1_lib.c\
+  src/crypto/asn1/asn1_par.c\
+  src/crypto/asn1/asn_pack.c\
+  src/crypto/asn1/f_int.c\
+  src/crypto/asn1/f_string.c\
+  src/crypto/asn1/tasn_dec.c\
+  src/crypto/asn1/tasn_enc.c\
+  src/crypto/asn1/tasn_fre.c\
+  src/crypto/asn1/tasn_new.c\
+  src/crypto/asn1/tasn_typ.c\
+  src/crypto/asn1/tasn_utl.c\
+  src/crypto/asn1/time_support.c\
+  src/crypto/base64/base64.c\
+  src/crypto/bio/bio.c\
+  src/crypto/bio/bio_mem.c\
+  src/crypto/bio/connect.c\
+  src/crypto/bio/fd.c\
+  src/crypto/bio/file.c\
+  src/crypto/bio/hexdump.c\
+  src/crypto/bio/pair.c\
+  src/crypto/bio/printf.c\
+  src/crypto/bio/socket.c\
+  src/crypto/bio/socket_helper.c\
+  src/crypto/blake2/blake2.c\
+  src/crypto/bn_extra/bn_asn1.c\
+  src/crypto/bn_extra/convert.c\
+  src/crypto/buf/buf.c\
+  src/crypto/bytestring/asn1_compat.c\
+  src/crypto/bytestring/ber.c\
+  src/crypto/bytestring/cbb.c\
+  src/crypto/bytestring/cbs.c\
+  src/crypto/bytestring/unicode.c\
+  src/crypto/chacha/chacha.c\
+  src/crypto/cipher_extra/cipher_extra.c\
+  src/crypto/cipher_extra/derive_key.c\
+  src/crypto/cipher_extra/e_aesccm.c\
+  src/crypto/cipher_extra/e_aesctrhmac.c\
+  src/crypto/cipher_extra/e_aesgcmsiv.c\
+  src/crypto/cipher_extra/e_chacha20poly1305.c\
+  src/crypto/cipher_extra/e_null.c\
+  src/crypto/cipher_extra/e_rc2.c\
+  src/crypto/cipher_extra/e_rc4.c\
+  src/crypto/cipher_extra/e_tls.c\
+  src/crypto/cipher_extra/tls_cbc.c\
+  src/crypto/cmac/cmac.c\
+  src/crypto/conf/conf.c\
+  src/crypto/cpu-aarch64-fuchsia.c\
+  src/crypto/cpu-aarch64-linux.c\
+  src/crypto/cpu-aarch64-win.c\
+  src/crypto/cpu-arm-linux.c\
+  src/crypto/cpu-arm.c\
+  src/crypto/cpu-intel.c\
+  src/crypto/cpu-ppc64le.c\
+  src/crypto/crypto.c\
+  src/crypto/curve25519/curve25519.c\
+  src/crypto/curve25519/spake25519.c\
+  src/crypto/dh_extra/dh_asn1.c\
+  src/crypto/dh_extra/params.c\
+  src/crypto/digest_extra/digest_extra.c\
+  src/crypto/dsa/dsa.c\
+  src/crypto/dsa/dsa_asn1.c\
+  src/crypto/ec_extra/ec_asn1.c\
+  src/crypto/ec_extra/ec_derive.c\
+  src/crypto/ec_extra/hash_to_curve.c\
+  src/crypto/ecdh_extra/ecdh_extra.c\
+  src/crypto/ecdsa_extra/ecdsa_asn1.c\
+  src/crypto/engine/engine.c\
+  src/crypto/err/err.c\
+  src/crypto/evp/digestsign.c\
+  src/crypto/evp/evp.c\
+  src/crypto/evp/evp_asn1.c\
+  src/crypto/evp/evp_ctx.c\
+  src/crypto/evp/p_dsa_asn1.c\
+  src/crypto/evp/p_ec.c\
+  src/crypto/evp/p_ec_asn1.c\
+  src/crypto/evp/p_ed25519.c\
+  src/crypto/evp/p_ed25519_asn1.c\
+  src/crypto/evp/p_rsa.c\
+  src/crypto/evp/p_rsa_asn1.c\
+  src/crypto/evp/p_x25519.c\
+  src/crypto/evp/p_x25519_asn1.c\
+  src/crypto/evp/pbkdf.c\
+  src/crypto/evp/print.c\
+  src/crypto/evp/scrypt.c\
+  src/crypto/evp/sign.c\
+  src/crypto/ex_data.c\
+  src/crypto/fipsmodule/bcm.c\
+  src/crypto/fipsmodule/fips_shared_support.c\
+  src/crypto/hkdf/hkdf.c\
+  src/crypto/hpke/hpke.c\
+  src/crypto/hrss/hrss.c\
+  src/crypto/lhash/lhash.c\
+  src/crypto/mem.c\
+  src/crypto/obj/obj.c\
+  src/crypto/obj/obj_xref.c\
+  src/crypto/pem/pem_all.c\
+  src/crypto/pem/pem_info.c\
+  src/crypto/pem/pem_lib.c\
+  src/crypto/pem/pem_oth.c\
+  src/crypto/pem/pem_pk8.c\
+  src/crypto/pem/pem_pkey.c\
+  src/crypto/pem/pem_x509.c\
+  src/crypto/pem/pem_xaux.c\
+  src/crypto/pkcs7/pkcs7.c\
+  src/crypto/pkcs7/pkcs7_x509.c\
+  src/crypto/pkcs8/p5_pbev2.c\
+  src/crypto/pkcs8/pkcs8.c\
+  src/crypto/pkcs8/pkcs8_x509.c\
+  src/crypto/poly1305/poly1305.c\
+  src/crypto/poly1305/poly1305_arm.c\
+  src/crypto/poly1305/poly1305_vec.c\
+  src/crypto/pool/pool.c\
+  src/crypto/rand_extra/deterministic.c\
+  src/crypto/rand_extra/forkunsafe.c\
+  src/crypto/rand_extra/fuchsia.c\
+  src/crypto/rand_extra/passive.c\
+  src/crypto/rand_extra/rand_extra.c\
+  src/crypto/rand_extra/windows.c\
+  src/crypto/rc4/rc4.c\
+  src/crypto/refcount_c11.c\
+  src/crypto/refcount_lock.c\
+  src/crypto/rsa_extra/rsa_asn1.c\
+  src/crypto/rsa_extra/rsa_print.c\
+  src/crypto/siphash/siphash.c\
+  src/crypto/stack/stack.c\
+  src/crypto/thread.c\
+  src/crypto/thread_none.c\
+  src/crypto/thread_pthread.c\
+  src/crypto/thread_win.c\
+  src/crypto/trust_token/pmbtoken.c\
+  src/crypto/trust_token/trust_token.c\
+  src/crypto/trust_token/voprf.c\
+  src/crypto/x509/a_digest.c\
+  src/crypto/x509/a_sign.c\
+  src/crypto/x509/a_verify.c\
+  src/crypto/x509/algorithm.c\
+  src/crypto/x509/asn1_gen.c\
+  src/crypto/x509/by_dir.c\
+  src/crypto/x509/by_file.c\
+  src/crypto/x509/i2d_pr.c\
+  src/crypto/x509/name_print.c\
+  src/crypto/x509/rsa_pss.c\
+  src/crypto/x509/t_crl.c\
+  src/crypto/x509/t_req.c\
+  src/crypto/x509/t_x509.c\
+  src/crypto/x509/t_x509a.c\
+  src/crypto/x509/x509.c\
+  src/crypto/x509/x509_att.c\
+  src/crypto/x509/x509_cmp.c\
+  src/crypto/x509/x509_d2.c\
+  src/crypto/x509/x509_def.c\
+  src/crypto/x509/x509_ext.c\
+  src/crypto/x509/x509_lu.c\
+  src/crypto/x509/x509_obj.c\
+  src/crypto/x509/x509_req.c\
+  src/crypto/x509/x509_set.c\
+  src/crypto/x509/x509_trs.c\
+  src/crypto/x509/x509_txt.c\
+  src/crypto/x509/x509_v3.c\
+  src/crypto/x509/x509_vfy.c\
+  src/crypto/x509/x509_vpm.c\
+  src/crypto/x509/x509cset.c\
+  src/crypto/x509/x509name.c\
+  src/crypto/x509/x509rset.c\
+  src/crypto/x509/x509spki.c\
+  src/crypto/x509/x_algor.c\
+  src/crypto/x509/x_all.c\
+  src/crypto/x509/x_attrib.c\
+  src/crypto/x509/x_crl.c\
+  src/crypto/x509/x_exten.c\
+  src/crypto/x509/x_info.c\
+  src/crypto/x509/x_name.c\
+  src/crypto/x509/x_pkey.c\
+  src/crypto/x509/x_pubkey.c\
+  src/crypto/x509/x_req.c\
+  src/crypto/x509/x_sig.c\
+  src/crypto/x509/x_spki.c\
+  src/crypto/x509/x_val.c\
+  src/crypto/x509/x_x509.c\
+  src/crypto/x509/x_x509a.c\
+  src/crypto/x509v3/pcy_cache.c\
+  src/crypto/x509v3/pcy_data.c\
+  src/crypto/x509v3/pcy_lib.c\
+  src/crypto/x509v3/pcy_map.c\
+  src/crypto/x509v3/pcy_node.c\
+  src/crypto/x509v3/pcy_tree.c\
+  src/crypto/x509v3/v3_akey.c\
+  src/crypto/x509v3/v3_akeya.c\
+  src/crypto/x509v3/v3_alt.c\
+  src/crypto/x509v3/v3_bcons.c\
+  src/crypto/x509v3/v3_bitst.c\
+  src/crypto/x509v3/v3_conf.c\
+  src/crypto/x509v3/v3_cpols.c\
+  src/crypto/x509v3/v3_crld.c\
+  src/crypto/x509v3/v3_enum.c\
+  src/crypto/x509v3/v3_extku.c\
+  src/crypto/x509v3/v3_genn.c\
+  src/crypto/x509v3/v3_ia5.c\
+  src/crypto/x509v3/v3_info.c\
+  src/crypto/x509v3/v3_int.c\
+  src/crypto/x509v3/v3_lib.c\
+  src/crypto/x509v3/v3_ncons.c\
+  src/crypto/x509v3/v3_ocsp.c\
+  src/crypto/x509v3/v3_pci.c\
+  src/crypto/x509v3/v3_pcia.c\
+  src/crypto/x509v3/v3_pcons.c\
+  src/crypto/x509v3/v3_pmaps.c\
+  src/crypto/x509v3/v3_prn.c\
+  src/crypto/x509v3/v3_purp.c\
+  src/crypto/x509v3/v3_skey.c\
+  src/crypto/x509v3/v3_utl.c\
+
+ssl_sources := \
+  src/ssl/bio_ssl.cc\
+  src/ssl/d1_both.cc\
+  src/ssl/d1_lib.cc\
+  src/ssl/d1_pkt.cc\
+  src/ssl/d1_srtp.cc\
+  src/ssl/dtls_method.cc\
+  src/ssl/dtls_record.cc\
+  src/ssl/encrypted_client_hello.cc\
+  src/ssl/extensions.cc\
+  src/ssl/handoff.cc\
+  src/ssl/handshake.cc\
+  src/ssl/handshake_client.cc\
+  src/ssl/handshake_server.cc\
+  src/ssl/s3_both.cc\
+  src/ssl/s3_lib.cc\
+  src/ssl/s3_pkt.cc\
+  src/ssl/ssl_aead_ctx.cc\
+  src/ssl/ssl_asn1.cc\
+  src/ssl/ssl_buffer.cc\
+  src/ssl/ssl_cert.cc\
+  src/ssl/ssl_cipher.cc\
+  src/ssl/ssl_file.cc\
+  src/ssl/ssl_key_share.cc\
+  src/ssl/ssl_lib.cc\
+  src/ssl/ssl_privkey.cc\
+  src/ssl/ssl_session.cc\
+  src/ssl/ssl_stat.cc\
+  src/ssl/ssl_transcript.cc\
+  src/ssl/ssl_versions.cc\
+  src/ssl/ssl_x509.cc\
+  src/ssl/t1_enc.cc\
+  src/ssl/tls13_both.cc\
+  src/ssl/tls13_client.cc\
+  src/ssl/tls13_enc.cc\
+  src/ssl/tls13_server.cc\
+  src/ssl/tls_method.cc\
+  src/ssl/tls_record.cc\
+
+tool_sources := \
+  src/tool/args.cc\
+  src/tool/ciphers.cc\
+  src/tool/client.cc\
+  src/tool/const.cc\
+  src/tool/digest.cc\
+  src/tool/fd.cc\
+  src/tool/file.cc\
+  src/tool/generate_ech.cc\
+  src/tool/generate_ed25519.cc\
+  src/tool/genrsa.cc\
+  src/tool/pkcs12.cc\
+  src/tool/rand.cc\
+  src/tool/server.cc\
+  src/tool/sign.cc\
+  src/tool/speed.cc\
+  src/tool/tool.cc\
+  src/tool/transport_common.cc\
+
+linux_aarch64_sources := \
+  linux-aarch64/crypto/chacha/chacha-armv8.S\
+  linux-aarch64/crypto/fipsmodule/aesv8-armx64.S\
+  linux-aarch64/crypto/fipsmodule/armv8-mont.S\
+  linux-aarch64/crypto/fipsmodule/ghash-neon-armv8.S\
+  linux-aarch64/crypto/fipsmodule/ghashv8-armx64.S\
+  linux-aarch64/crypto/fipsmodule/sha1-armv8.S\
+  linux-aarch64/crypto/fipsmodule/sha256-armv8.S\
+  linux-aarch64/crypto/fipsmodule/sha512-armv8.S\
+  linux-aarch64/crypto/fipsmodule/vpaes-armv8.S\
+  linux-aarch64/crypto/test/trampoline-armv8.S\
+
+linux_arm_sources := \
+  linux-arm/crypto/chacha/chacha-armv4.S\
+  linux-arm/crypto/fipsmodule/aesv8-armx32.S\
+  linux-arm/crypto/fipsmodule/armv4-mont.S\
+  linux-arm/crypto/fipsmodule/bsaes-armv7.S\
+  linux-arm/crypto/fipsmodule/ghash-armv4.S\
+  linux-arm/crypto/fipsmodule/ghashv8-armx32.S\
+  linux-arm/crypto/fipsmodule/sha1-armv4-large.S\
+  linux-arm/crypto/fipsmodule/sha256-armv4.S\
+  linux-arm/crypto/fipsmodule/sha512-armv4.S\
+  linux-arm/crypto/fipsmodule/vpaes-armv7.S\
+  linux-arm/crypto/test/trampoline-armv4.S\
+  src/crypto/curve25519/asm/x25519-asm-arm.S\
+  src/crypto/poly1305/poly1305_arm_asm.S\
+
+linux_ppc64le_sources := \
+  linux-ppc64le/crypto/fipsmodule/aesp8-ppc.S\
+  linux-ppc64le/crypto/fipsmodule/ghashp8-ppc.S\
+  linux-ppc64le/crypto/test/trampoline-ppc.S\
+
+linux_x86_sources := \
+  linux-x86/crypto/chacha/chacha-x86.S\
+  linux-x86/crypto/fipsmodule/aesni-x86.S\
+  linux-x86/crypto/fipsmodule/bn-586.S\
+  linux-x86/crypto/fipsmodule/co-586.S\
+  linux-x86/crypto/fipsmodule/ghash-ssse3-x86.S\
+  linux-x86/crypto/fipsmodule/ghash-x86.S\
+  linux-x86/crypto/fipsmodule/md5-586.S\
+  linux-x86/crypto/fipsmodule/sha1-586.S\
+  linux-x86/crypto/fipsmodule/sha256-586.S\
+  linux-x86/crypto/fipsmodule/sha512-586.S\
+  linux-x86/crypto/fipsmodule/vpaes-x86.S\
+  linux-x86/crypto/fipsmodule/x86-mont.S\
+  linux-x86/crypto/test/trampoline-x86.S\
+
+linux_x86_64_sources := \
+  linux-x86_64/crypto/chacha/chacha-x86_64.S\
+  linux-x86_64/crypto/cipher_extra/aes128gcmsiv-x86_64.S\
+  linux-x86_64/crypto/cipher_extra/chacha20_poly1305_x86_64.S\
+  linux-x86_64/crypto/fipsmodule/aesni-gcm-x86_64.S\
+  linux-x86_64/crypto/fipsmodule/aesni-x86_64.S\
+  linux-x86_64/crypto/fipsmodule/ghash-ssse3-x86_64.S\
+  linux-x86_64/crypto/fipsmodule/ghash-x86_64.S\
+  linux-x86_64/crypto/fipsmodule/md5-x86_64.S\
+  linux-x86_64/crypto/fipsmodule/p256-x86_64-asm.S\
+  linux-x86_64/crypto/fipsmodule/p256_beeu-x86_64-asm.S\
+  linux-x86_64/crypto/fipsmodule/rdrand-x86_64.S\
+  linux-x86_64/crypto/fipsmodule/rsaz-avx2.S\
+  linux-x86_64/crypto/fipsmodule/sha1-x86_64.S\
+  linux-x86_64/crypto/fipsmodule/sha256-x86_64.S\
+  linux-x86_64/crypto/fipsmodule/sha512-x86_64.S\
+  linux-x86_64/crypto/fipsmodule/vpaes-x86_64.S\
+  linux-x86_64/crypto/fipsmodule/x86_64-mont.S\
+  linux-x86_64/crypto/fipsmodule/x86_64-mont5.S\
+  linux-x86_64/crypto/test/trampoline-x86_64.S\
+  src/crypto/hrss/asm/poly_rq_mul.S\
+
--- /dev/null
+++ b/third_party/boringssl/linux-aarch64/crypto/chacha/chacha-armv8.S
@@ -1,0 +1,1995 @@
+// This file is generated from a similarly-named Perl script in the BoringSSL
+// source tree. Do not edit by hand.
+
+#if !defined(__has_feature)
+#define __has_feature(x) 0
+#endif
+#if __has_feature(memory_sanitizer) && !defined(OPENSSL_NO_ASM)
+#define OPENSSL_NO_ASM
+#endif
+
+#if !defined(OPENSSL_NO_ASM)
+#if defined(__aarch64__)
+#if defined(BORINGSSL_PREFIX)
+#include <boringssl_prefix_symbols_asm.h>
+#endif
+#include <openssl/arm_arch.h>
+
+
+.hidden	OPENSSL_armcap_P
+
+.section	.rodata
+
+.align	5
+.Lsigma:
+.quad	0x3320646e61707865,0x6b20657479622d32		// endian-neutral
+.Lone:
+.long	1,0,0,0
+.byte	67,104,97,67,104,97,50,48,32,102,111,114,32,65,82,77,118,56,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0
+.align	2
+
+.text
+
+.globl	ChaCha20_ctr32
+.hidden	ChaCha20_ctr32
+.type	ChaCha20_ctr32,%function
+.align	5
+ChaCha20_ctr32:
+	AARCH64_VALID_CALL_TARGET
+	cbz	x2,.Labort
+#if __has_feature(hwaddress_sanitizer) && __clang_major__ >= 10
+	adrp	x5,:pg_hi21_nc:OPENSSL_armcap_P
+#else
+	adrp	x5,OPENSSL_armcap_P
+#endif
+	cmp	x2,#192
+	b.lo	.Lshort
+	ldr	w17,[x5,:lo12:OPENSSL_armcap_P]
+	tst	w17,#ARMV7_NEON
+	b.ne	ChaCha20_neon
+
+.Lshort:
+	AARCH64_SIGN_LINK_REGISTER
+	stp	x29,x30,[sp,#-96]!
+	add	x29,sp,#0
+
+	adrp	x5,.Lsigma
+	add	x5,x5,:lo12:.Lsigma
+	stp	x19,x20,[sp,#16]
+	stp	x21,x22,[sp,#32]
+	stp	x23,x24,[sp,#48]
+	stp	x25,x26,[sp,#64]
+	stp	x27,x28,[sp,#80]
+	sub	sp,sp,#64
+
+	ldp	x22,x23,[x5]		// load sigma
+	ldp	x24,x25,[x3]		// load key
+	ldp	x26,x27,[x3,#16]
+	ldp	x28,x30,[x4]		// load counter
+#ifdef	__ARMEB__
+	ror	x24,x24,#32
+	ror	x25,x25,#32
+	ror	x26,x26,#32
+	ror	x27,x27,#32
+	ror	x28,x28,#32
+	ror	x30,x30,#32
+#endif
+
+.Loop_outer:
+	mov	w5,w22			// unpack key block
+	lsr	x6,x22,#32
+	mov	w7,w23
+	lsr	x8,x23,#32
+	mov	w9,w24
+	lsr	x10,x24,#32
+	mov	w11,w25
+	lsr	x12,x25,#32
+	mov	w13,w26
+	lsr	x14,x26,#32
+	mov	w15,w27
+	lsr	x16,x27,#32
+	mov	w17,w28
+	lsr	x19,x28,#32
+	mov	w20,w30
+	lsr	x21,x30,#32
+
+	mov	x4,#10
+	subs	x2,x2,#64
+.Loop:
+	sub	x4,x4,#1
+	add	w5,w5,w9
+	add	w6,w6,w10
+	add	w7,w7,w11
+	add	w8,w8,w12
+	eor	w17,w17,w5
+	eor	w19,w19,w6
+	eor	w20,w20,w7
+	eor	w21,w21,w8
+	ror	w17,w17,#16
+	ror	w19,w19,#16
+	ror	w20,w20,#16
+	ror	w21,w21,#16
+	add	w13,w13,w17
+	add	w14,w14,w19
+	add	w15,w15,w20
+	add	w16,w16,w21
+	eor	w9,w9,w13
+	eor	w10,w10,w14
+	eor	w11,w11,w15
+	eor	w12,w12,w16
+	ror	w9,w9,#20
+	ror	w10,w10,#20
+	ror	w11,w11,#20
+	ror	w12,w12,#20
+	add	w5,w5,w9
+	add	w6,w6,w10
+	add	w7,w7,w11
+	add	w8,w8,w12
+	eor	w17,w17,w5
+	eor	w19,w19,w6
+	eor	w20,w20,w7
+	eor	w21,w21,w8
+	ror	w17,w17,#24
+	ror	w19,w19,#24
+	ror	w20,w20,#24
+	ror	w21,w21,#24
+	add	w13,w13,w17
+	add	w14,w14,w19
+	add	w15,w15,w20
+	add	w16,w16,w21
+	eor	w9,w9,w13
+	eor	w10,w10,w14
+	eor	w11,w11,w15
+	eor	w12,w12,w16
+	ror	w9,w9,#25
+	ror	w10,w10,#25
+	ror	w11,w11,#25
+	ror	w12,w12,#25
+	add	w5,w5,w10
+	add	w6,w6,w11
+	add	w7,w7,w12
+	add	w8,w8,w9
+	eor	w21,w21,w5
+	eor	w17,w17,w6
+	eor	w19,w19,w7
+	eor	w20,w20,w8
+	ror	w21,w21,#16
+	ror	w17,w17,#16
+	ror	w19,w19,#16
+	ror	w20,w20,#16
+	add	w15,w15,w21
+	add	w16,w16,w17
+	add	w13,w13,w19
+	add	w14,w14,w20
+	eor	w10,w10,w15
+	eor	w11,w11,w16
+	eor	w12,w12,w13
+	eor	w9,w9,w14
+	ror	w10,w10,#20
+	ror	w11,w11,#20
+	ror	w12,w12,#20
+	ror	w9,w9,#20
+	add	w5,w5,w10
+	add	w6,w6,w11
+	add	w7,w7,w12
+	add	w8,w8,w9
+	eor	w21,w21,w5
+	eor	w17,w17,w6
+	eor	w19,w19,w7
+	eor	w20,w20,w8
+	ror	w21,w21,#24
+	ror	w17,w17,#24
+	ror	w19,w19,#24
+	ror	w20,w20,#24
+	add	w15,w15,w21
+	add	w16,w16,w17
+	add	w13,w13,w19
+	add	w14,w14,w20
+	eor	w10,w10,w15
+	eor	w11,w11,w16
+	eor	w12,w12,w13
+	eor	w9,w9,w14
+	ror	w10,w10,#25
+	ror	w11,w11,#25
+	ror	w12,w12,#25
+	ror	w9,w9,#25
+	cbnz	x4,.Loop
+
+	add	w5,w5,w22		// accumulate key block
+	add	x6,x6,x22,lsr#32
+	add	w7,w7,w23
+	add	x8,x8,x23,lsr#32
+	add	w9,w9,w24
+	add	x10,x10,x24,lsr#32
+	add	w11,w11,w25
+	add	x12,x12,x25,lsr#32
+	add	w13,w13,w26
+	add	x14,x14,x26,lsr#32
+	add	w15,w15,w27
+	add	x16,x16,x27,lsr#32
+	add	w17,w17,w28
+	add	x19,x19,x28,lsr#32
+	add	w20,w20,w30
+	add	x21,x21,x30,lsr#32
+
+	b.lo	.Ltail
+
+	add	x5,x5,x6,lsl#32	// pack
+	add	x7,x7,x8,lsl#32
+	ldp	x6,x8,[x1,#0]		// load input
+	add	x9,x9,x10,lsl#32
+	add	x11,x11,x12,lsl#32
+	ldp	x10,x12,[x1,#16]
+	add	x13,x13,x14,lsl#32
+	add	x15,x15,x16,lsl#32
+	ldp	x14,x16,[x1,#32]
+	add	x17,x17,x19,lsl#32
+	add	x20,x20,x21,lsl#32
+	ldp	x19,x21,[x1,#48]
+	add	x1,x1,#64
+#ifdef	__ARMEB__
+	rev	x5,x5
+	rev	x7,x7
+	rev	x9,x9
+	rev	x11,x11
+	rev	x13,x13
+	rev	x15,x15
+	rev	x17,x17
+	rev	x20,x20
+#endif
+	eor	x5,x5,x6
+	eor	x7,x7,x8
+	eor	x9,x9,x10
+	eor	x11,x11,x12
+	eor	x13,x13,x14
+	eor	x15,x15,x16
+	eor	x17,x17,x19
+	eor	x20,x20,x21
+
+	stp	x5,x7,[x0,#0]		// store output
+	add	x28,x28,#1			// increment counter
+	stp	x9,x11,[x0,#16]
+	stp	x13,x15,[x0,#32]
+	stp	x17,x20,[x0,#48]
+	add	x0,x0,#64
+
+	b.hi	.Loop_outer
+
+	ldp	x19,x20,[x29,#16]
+	add	sp,sp,#64
+	ldp	x21,x22,[x29,#32]
+	ldp	x23,x24,[x29,#48]
+	ldp	x25,x26,[x29,#64]
+	ldp	x27,x28,[x29,#80]
+	ldp	x29,x30,[sp],#96
+	AARCH64_VALIDATE_LINK_REGISTER
+.Labort:
+	ret
+
+.align	4
+.Ltail:
+	add	x2,x2,#64
+.Less_than_64:
+	sub	x0,x0,#1
+	add	x1,x1,x2
+	add	x0,x0,x2
+	add	x4,sp,x2
+	neg	x2,x2
+
+	add	x5,x5,x6,lsl#32	// pack
+	add	x7,x7,x8,lsl#32
+	add	x9,x9,x10,lsl#32
+	add	x11,x11,x12,lsl#32
+	add	x13,x13,x14,lsl#32
+	add	x15,x15,x16,lsl#32
+	add	x17,x17,x19,lsl#32
+	add	x20,x20,x21,lsl#32
+#ifdef	__ARMEB__
+	rev	x5,x5
+	rev	x7,x7
+	rev	x9,x9
+	rev	x11,x11
+	rev	x13,x13
+	rev	x15,x15
+	rev	x17,x17
+	rev	x20,x20
+#endif
+	stp	x5,x7,[sp,#0]
+	stp	x9,x11,[sp,#16]
+	stp	x13,x15,[sp,#32]
+	stp	x17,x20,[sp,#48]
+
+.Loop_tail:
+	ldrb	w10,[x1,x2]
+	ldrb	w11,[x4,x2]
+	add	x2,x2,#1
+	eor	w10,w10,w11
+	strb	w10,[x0,x2]
+	cbnz	x2,.Loop_tail
+
+	stp	xzr,xzr,[sp,#0]
+	stp	xzr,xzr,[sp,#16]
+	stp	xzr,xzr,[sp,#32]
+	stp	xzr,xzr,[sp,#48]
+
+	ldp	x19,x20,[x29,#16]
+	add	sp,sp,#64
+	ldp	x21,x22,[x29,#32]
+	ldp	x23,x24,[x29,#48]
+	ldp	x25,x26,[x29,#64]
+	ldp	x27,x28,[x29,#80]
+	ldp	x29,x30,[sp],#96
+	AARCH64_VALIDATE_LINK_REGISTER
+	ret
+.size	ChaCha20_ctr32,.-ChaCha20_ctr32
+
+.type	ChaCha20_neon,%function
+.align	5
+ChaCha20_neon:
+	AARCH64_SIGN_LINK_REGISTER
+	stp	x29,x30,[sp,#-96]!
+	add	x29,sp,#0
+
+	adrp	x5,.Lsigma
+	add	x5,x5,:lo12:.Lsigma
+	stp	x19,x20,[sp,#16]
+	stp	x21,x22,[sp,#32]
+	stp	x23,x24,[sp,#48]
+	stp	x25,x26,[sp,#64]
+	stp	x27,x28,[sp,#80]
+	cmp	x2,#512
+	b.hs	.L512_or_more_neon
+
+	sub	sp,sp,#64
+
+	ldp	x22,x23,[x5]		// load sigma
+	ld1	{v24.4s},[x5],#16
+	ldp	x24,x25,[x3]		// load key
+	ldp	x26,x27,[x3,#16]
+	ld1	{v25.4s,v26.4s},[x3]
+	ldp	x28,x30,[x4]		// load counter
+	ld1	{v27.4s},[x4]
+	ld1	{v31.4s},[x5]
+#ifdef	__ARMEB__
+	rev64	v24.4s,v24.4s
+	ror	x24,x24,#32
+	ror	x25,x25,#32
+	ror	x26,x26,#32
+	ror	x27,x27,#32
+	ror	x28,x28,#32
+	ror	x30,x30,#32
+#endif
+	add	v27.4s,v27.4s,v31.4s		// += 1
+	add	v28.4s,v27.4s,v31.4s
+	add	v29.4s,v28.4s,v31.4s
+	shl	v31.4s,v31.4s,#2			// 1 -> 4
+
+.Loop_outer_neon:
+	mov	w5,w22			// unpack key block
+	lsr	x6,x22,#32
+	mov	v0.16b,v24.16b
+	mov	w7,w23
+	lsr	x8,x23,#32
+	mov	v4.16b,v24.16b
+	mov	w9,w24
+	lsr	x10,x24,#32
+	mov	v16.16b,v24.16b
+	mov	w11,w25
+	mov	v1.16b,v25.16b
+	lsr	x12,x25,#32
+	mov	v5.16b,v25.16b
+	mov	w13,w26
+	mov	v17.16b,v25.16b
+	lsr	x14,x26,#32
+	mov	v3.16b,v27.16b
+	mov	w15,w27
+	mov	v7.16b,v28.16b
+	lsr	x16,x27,#32
+	mov	v19.16b,v29.16b
+	mov	w17,w28
+	mov	v2.16b,v26.16b
+	lsr	x19,x28,#32
+	mov	v6.16b,v26.16b
+	mov	w20,w30
+	mov	v18.16b,v26.16b
+	lsr	x21,x30,#32
+
+	mov	x4,#10
+	subs	x2,x2,#256
+.Loop_neon:
+	sub	x4,x4,#1
+	add	v0.4s,v0.4s,v1.4s
+	add	w5,w5,w9
+	add	v4.4s,v4.4s,v5.4s
+	add	w6,w6,w10
+	add	v16.4s,v16.4s,v17.4s
+	add	w7,w7,w11
+	eor	v3.16b,v3.16b,v0.16b
+	add	w8,w8,w12
+	eor	v7.16b,v7.16b,v4.16b
+	eor	w17,w17,w5
+	eor	v19.16b,v19.16b,v16.16b
+	eor	w19,w19,w6
+	rev32	v3.8h,v3.8h
+	eor	w20,w20,w7
+	rev32	v7.8h,v7.8h
+	eor	w21,w21,w8
+	rev32	v19.8h,v19.8h
+	ror	w17,w17,#16
+	add	v2.4s,v2.4s,v3.4s
+	ror	w19,w19,#16
+	add	v6.4s,v6.4s,v7.4s
+	ror	w20,w20,#16
+	add	v18.4s,v18.4s,v19.4s
+	ror	w21,w21,#16
+	eor	v20.16b,v1.16b,v2.16b
+	add	w13,w13,w17
+	eor	v21.16b,v5.16b,v6.16b
+	add	w14,w14,w19
+	eor	v22.16b,v17.16b,v18.16b
+	add	w15,w15,w20
+	ushr	v1.4s,v20.4s,#20
+	add	w16,w16,w21
+	ushr	v5.4s,v21.4s,#20
+	eor	w9,w9,w13
+	ushr	v17.4s,v22.4s,#20
+	eor	w10,w10,w14
+	sli	v1.4s,v20.4s,#12
+	eor	w11,w11,w15
+	sli	v5.4s,v21.4s,#12
+	eor	w12,w12,w16
+	sli	v17.4s,v22.4s,#12
+	ror	w9,w9,#20
+	add	v0.4s,v0.4s,v1.4s
+	ror	w10,w10,#20
+	add	v4.4s,v4.4s,v5.4s
+	ror	w11,w11,#20
+	add	v16.4s,v16.4s,v17.4s
+	ror	w12,w12,#20
+	eor	v20.16b,v3.16b,v0.16b
+	add	w5,w5,w9
+	eor	v21.16b,v7.16b,v4.16b
+	add	w6,w6,w10
+	eor	v22.16b,v19.16b,v16.16b
+	add	w7,w7,w11
+	ushr	v3.4s,v20.4s,#24
+	add	w8,w8,w12
+	ushr	v7.4s,v21.4s,#24
+	eor	w17,w17,w5
+	ushr	v19.4s,v22.4s,#24
+	eor	w19,w19,w6
+	sli	v3.4s,v20.4s,#8
+	eor	w20,w20,w7
+	sli	v7.4s,v21.4s,#8
+	eor	w21,w21,w8
+	sli	v19.4s,v22.4s,#8
+	ror	w17,w17,#24
+	add	v2.4s,v2.4s,v3.4s
+	ror	w19,w19,#24
+	add	v6.4s,v6.4s,v7.4s
+	ror	w20,w20,#24
+	add	v18.4s,v18.4s,v19.4s
+	ror	w21,w21,#24
+	eor	v20.16b,v1.16b,v2.16b
+	add	w13,w13,w17
+	eor	v21.16b,v5.16b,v6.16b
+	add	w14,w14,w19
+	eor	v22.16b,v17.16b,v18.16b
+	add	w15,w15,w20
+	ushr	v1.4s,v20.4s,#25
+	add	w16,w16,w21
+	ushr	v5.4s,v21.4s,#25
+	eor	w9,w9,w13
+	ushr	v17.4s,v22.4s,#25
+	eor	w10,w10,w14
+	sli	v1.4s,v20.4s,#7
+	eor	w11,w11,w15
+	sli	v5.4s,v21.4s,#7
+	eor	w12,w12,w16
+	sli	v17.4s,v22.4s,#7
+	ror	w9,w9,#25
+	ext	v2.16b,v2.16b,v2.16b,#8
+	ror	w10,w10,#25
+	ext	v6.16b,v6.16b,v6.16b,#8
+	ror	w11,w11,#25
+	ext	v18.16b,v18.16b,v18.16b,#8
+	ror	w12,w12,#25
+	ext	v3.16b,v3.16b,v3.16b,#12
+	ext	v7.16b,v7.16b,v7.16b,#12
+	ext	v19.16b,v19.16b,v19.16b,#12
+	ext	v1.16b,v1.16b,v1.16b,#4
+	ext	v5.16b,v5.16b,v5.16b,#4
+	ext	v17.16b,v17.16b,v17.16b,#4
+	add	v0.4s,v0.4s,v1.4s
+	add	w5,w5,w10
+	add	v4.4s,v4.4s,v5.4s
+	add	w6,w6,w11
+	add	v16.4s,v16.4s,v17.4s
+	add	w7,w7,w12
+	eor	v3.16b,v3.16b,v0.16b
+	add	w8,w8,w9
+	eor	v7.16b,v7.16b,v4.16b
+	eor	w21,w21,w5
+	eor	v19.16b,v19.16b,v16.16b
+	eor	w17,w17,w6
+	rev32	v3.8h,v3.8h
+	eor	w19,w19,w7
+	rev32	v7.8h,v7.8h
+	eor	w20,w20,w8
+	rev32	v19.8h,v19.8h
+	ror	w21,w21,#16
+	add	v2.4s,v2.4s,v3.4s
+	ror	w17,w17,#16
+	add	v6.4s,v6.4s,v7.4s
+	ror	w19,w19,#16
+	add	v18.4s,v18.4s,v19.4s
+	ror	w20,w20,#16
+	eor	v20.16b,v1.16b,v2.16b
+	add	w15,w15,w21
+	eor	v21.16b,v5.16b,v6.16b
+	add	w16,w16,w17
+	eor	v22.16b,v17.16b,v18.16b
+	add	w13,w13,w19
+	ushr	v1.4s,v20.4s,#20
+	add	w14,w14,w20
+	ushr	v5.4s,v21.4s,#20
+	eor	w10,w10,w15
+	ushr	v17.4s,v22.4s,#20
+	eor	w11,w11,w16
+	sli	v1.4s,v20.4s,#12
+	eor	w12,w12,w13
+	sli	v5.4s,v21.4s,#12
+	eor	w9,w9,w14
+	sli	v17.4s,v22.4s,#12
+	ror	w10,w10,#20
+	add	v0.4s,v0.4s,v1.4s
+	ror	w11,w11,#20
+	add	v4.4s,v4.4s,v5.4s
+	ror	w12,w12,#20
+	add	v16.4s,v16.4s,v17.4s
+	ror	w9,w9,#20
+	eor	v20.16b,v3.16b,v0.16b
+	add	w5,w5,w10
+	eor	v21.16b,v7.16b,v4.16b
+	add	w6,w6,w11
+	eor	v22.16b,v19.16b,v16.16b
+	add	w7,w7,w12
+	ushr	v3.4s,v20.4s,#24
+	add	w8,w8,w9
+	ushr	v7.4s,v21.4s,#24
+	eor	w21,w21,w5
+	ushr	v19.4s,v22.4s,#24
+	eor	w17,w17,w6
+	sli	v3.4s,v20.4s,#8
+	eor	w19,w19,w7
+	sli	v7.4s,v21.4s,#8
+	eor	w20,w20,w8
+	sli	v19.4s,v22.4s,#8
+	ror	w21,w21,#24
+	add	v2.4s,v2.4s,v3.4s
+	ror	w17,w17,#24
+	add	v6.4s,v6.4s,v7.4s
+	ror	w19,w19,#24
+	add	v18.4s,v18.4s,v19.4s
+	ror	w20,w20,#24
+	eor	v20.16b,v1.16b,v2.16b
+	add	w15,w15,w21
+	eor	v21.16b,v5.16b,v6.16b
+	add	w16,w16,w17
+	eor	v22.16b,v17.16b,v18.16b
+	add	w13,w13,w19
+	ushr	v1.4s,v20.4s,#25
+	add	w14,w14,w20
+	ushr	v5.4s,v21.4s,#25
+	eor	w10,w10,w15
+	ushr	v17.4s,v22.4s,#25
+	eor	w11,w11,w16
+	sli	v1.4s,v20.4s,#7
+	eor	w12,w12,w13
+	sli	v5.4s,v21.4s,#7
+	eor	w9,w9,w14
+	sli	v17.4s,v22.4s,#7
+	ror	w10,w10,#25
+	ext	v2.16b,v2.16b,v2.16b,#8
+	ror	w11,w11,#25
+	ext	v6.16b,v6.16b,v6.16b,#8
+	ror	w12,w12,#25
+	ext	v18.16b,v18.16b,v18.16b,#8
+	ror	w9,w9,#25
+	ext	v3.16b,v3.16b,v3.16b,#4
+	ext	v7.16b,v7.16b,v7.16b,#4
+	ext	v19.16b,v19.16b,v19.16b,#4
+	ext	v1.16b,v1.16b,v1.16b,#12
+	ext	v5.16b,v5.16b,v5.16b,#12
+	ext	v17.16b,v17.16b,v17.16b,#12
+	cbnz	x4,.Loop_neon
+
+	add	w5,w5,w22		// accumulate key block
+	add	v0.4s,v0.4s,v24.4s
+	add	x6,x6,x22,lsr#32
+	add	v4.4s,v4.4s,v24.4s
+	add	w7,w7,w23
+	add	v16.4s,v16.4s,v24.4s
+	add	x8,x8,x23,lsr#32
+	add	v2.4s,v2.4s,v26.4s
+	add	w9,w9,w24
+	add	v6.4s,v6.4s,v26.4s
+	add	x10,x10,x24,lsr#32
+	add	v18.4s,v18.4s,v26.4s
+	add	w11,w11,w25
+	add	v3.4s,v3.4s,v27.4s
+	add	x12,x12,x25,lsr#32
+	add	w13,w13,w26
+	add	v7.4s,v7.4s,v28.4s
+	add	x14,x14,x26,lsr#32
+	add	w15,w15,w27
+	add	v19.4s,v19.4s,v29.4s
+	add	x16,x16,x27,lsr#32
+	add	w17,w17,w28
+	add	v1.4s,v1.4s,v25.4s
+	add	x19,x19,x28,lsr#32
+	add	w20,w20,w30
+	add	v5.4s,v5.4s,v25.4s
+	add	x21,x21,x30,lsr#32
+	add	v17.4s,v17.4s,v25.4s
+
+	b.lo	.Ltail_neon
+
+	add	x5,x5,x6,lsl#32	// pack
+	add	x7,x7,x8,lsl#32
+	ldp	x6,x8,[x1,#0]		// load input
+	add	x9,x9,x10,lsl#32
+	add	x11,x11,x12,lsl#32
+	ldp	x10,x12,[x1,#16]
+	add	x13,x13,x14,lsl#32
+	add	x15,x15,x16,lsl#32
+	ldp	x14,x16,[x1,#32]
+	add	x17,x17,x19,lsl#32
+	add	x20,x20,x21,lsl#32
+	ldp	x19,x21,[x1,#48]
+	add	x1,x1,#64
+#ifdef	__ARMEB__
+	rev	x5,x5
+	rev	x7,x7
+	rev	x9,x9
+	rev	x11,x11
+	rev	x13,x13
+	rev	x15,x15
+	rev	x17,x17
+	rev	x20,x20
+#endif
+	ld1	{v20.16b,v21.16b,v22.16b,v23.16b},[x1],#64
+	eor	x5,x5,x6
+	eor	x7,x7,x8
+	eor	x9,x9,x10
+	eor	x11,x11,x12
+	eor	x13,x13,x14
+	eor	v0.16b,v0.16b,v20.16b
+	eor	x15,x15,x16
+	eor	v1.16b,v1.16b,v21.16b
+	eor	x17,x17,x19
+	eor	v2.16b,v2.16b,v22.16b
+	eor	x20,x20,x21
+	eor	v3.16b,v3.16b,v23.16b
+	ld1	{v20.16b,v21.16b,v22.16b,v23.16b},[x1],#64
+
+	stp	x5,x7,[x0,#0]		// store output
+	add	x28,x28,#4			// increment counter
+	stp	x9,x11,[x0,#16]
+	add	v27.4s,v27.4s,v31.4s		// += 4
+	stp	x13,x15,[x0,#32]
+	add	v28.4s,v28.4s,v31.4s
+	stp	x17,x20,[x0,#48]
+	add	v29.4s,v29.4s,v31.4s
+	add	x0,x0,#64
+
+	st1	{v0.16b,v1.16b,v2.16b,v3.16b},[x0],#64
+	ld1	{v0.16b,v1.16b,v2.16b,v3.16b},[x1],#64
+
+	eor	v4.16b,v4.16b,v20.16b
+	eor	v5.16b,v5.16b,v21.16b
+	eor	v6.16b,v6.16b,v22.16b
+	eor	v7.16b,v7.16b,v23.16b
+	st1	{v4.16b,v5.16b,v6.16b,v7.16b},[x0],#64
+
+	eor	v16.16b,v16.16b,v0.16b
+	eor	v17.16b,v17.16b,v1.16b
+	eor	v18.16b,v18.16b,v2.16b
+	eor	v19.16b,v19.16b,v3.16b
+	st1	{v16.16b,v17.16b,v18.16b,v19.16b},[x0],#64
+
+	b.hi	.Loop_outer_neon
+
+	ldp	x19,x20,[x29,#16]
+	add	sp,sp,#64
+	ldp	x21,x22,[x29,#32]
+	ldp	x23,x24,[x29,#48]
+	ldp	x25,x26,[x29,#64]
+	ldp	x27,x28,[x29,#80]
+	ldp	x29,x30,[sp],#96
+	AARCH64_VALIDATE_LINK_REGISTER
+	ret
+
+.Ltail_neon:
+	add	x2,x2,#256
+	cmp	x2,#64
+	b.lo	.Less_than_64
+
+	add	x5,x5,x6,lsl#32	// pack
+	add	x7,x7,x8,lsl#32
+	ldp	x6,x8,[x1,#0]		// load input
+	add	x9,x9,x10,lsl#32
+	add	x11,x11,x12,lsl#32
+	ldp	x10,x12,[x1,#16]
+	add	x13,x13,x14,lsl#32
+	add	x15,x15,x16,lsl#32
+	ldp	x14,x16,[x1,#32]
+	add	x17,x17,x19,lsl#32
+	add	x20,x20,x21,lsl#32
+	ldp	x19,x21,[x1,#48]
+	add	x1,x1,#64
+#ifdef	__ARMEB__
+	rev	x5,x5
+	rev	x7,x7
+	rev	x9,x9
+	rev	x11,x11
+	rev	x13,x13
+	rev	x15,x15
+	rev	x17,x17
+	rev	x20,x20
+#endif
+	eor	x5,x5,x6
+	eor	x7,x7,x8
+	eor	x9,x9,x10
+	eor	x11,x11,x12
+	eor	x13,x13,x14
+	eor	x15,x15,x16
+	eor	x17,x17,x19
+	eor	x20,x20,x21
+
+	stp	x5,x7,[x0,#0]		// store output
+	add	x28,x28,#4			// increment counter
+	stp	x9,x11,[x0,#16]
+	stp	x13,x15,[x0,#32]
+	stp	x17,x20,[x0,#48]
+	add	x0,x0,#64
+	b.eq	.Ldone_neon
+	sub	x2,x2,#64
+	cmp	x2,#64
+	b.lo	.Less_than_128
+
+	ld1	{v20.16b,v21.16b,v22.16b,v23.16b},[x1],#64
+	eor	v0.16b,v0.16b,v20.16b
+	eor	v1.16b,v1.16b,v21.16b
+	eor	v2.16b,v2.16b,v22.16b
+	eor	v3.16b,v3.16b,v23.16b
+	st1	{v0.16b,v1.16b,v2.16b,v3.16b},[x0],#64
+	b.eq	.Ldone_neon
+	sub	x2,x2,#64
+	cmp	x2,#64
+	b.lo	.Less_than_192
+
+	ld1	{v20.16b,v21.16b,v22.16b,v23.16b},[x1],#64
+	eor	v4.16b,v4.16b,v20.16b
+	eor	v5.16b,v5.16b,v21.16b
+	eor	v6.16b,v6.16b,v22.16b
+	eor	v7.16b,v7.16b,v23.16b
+	st1	{v4.16b,v5.16b,v6.16b,v7.16b},[x0],#64
+	b.eq	.Ldone_neon
+	sub	x2,x2,#64
+
+	st1	{v16.16b,v17.16b,v18.16b,v19.16b},[sp]
+	b	.Last_neon
+
+.Less_than_128:
+	st1	{v0.16b,v1.16b,v2.16b,v3.16b},[sp]
+	b	.Last_neon
+.Less_than_192:
+	st1	{v4.16b,v5.16b,v6.16b,v7.16b},[sp]
+	b	.Last_neon
+
+.align	4
+.Last_neon:
+	sub	x0,x0,#1
+	add	x1,x1,x2
+	add	x0,x0,x2
+	add	x4,sp,x2
+	neg	x2,x2
+
+.Loop_tail_neon:
+	ldrb	w10,[x1,x2]
+	ldrb	w11,[x4,x2]
+	add	x2,x2,#1
+	eor	w10,w10,w11
+	strb	w10,[x0,x2]
+	cbnz	x2,.Loop_tail_neon
+
+	stp	xzr,xzr,[sp,#0]
+	stp	xzr,xzr,[sp,#16]
+	stp	xzr,xzr,[sp,#32]
+	stp	xzr,xzr,[sp,#48]
+
+.Ldone_neon:
+	ldp	x19,x20,[x29,#16]
+	add	sp,sp,#64
+	ldp	x21,x22,[x29,#32]
+	ldp	x23,x24,[x29,#48]
+	ldp	x25,x26,[x29,#64]
+	ldp	x27,x28,[x29,#80]
+	ldp	x29,x30,[sp],#96
+	AARCH64_VALIDATE_LINK_REGISTER
+	ret
+.size	ChaCha20_neon,.-ChaCha20_neon
+.type	ChaCha20_512_neon,%function
+.align	5
+ChaCha20_512_neon:
+	AARCH64_SIGN_LINK_REGISTER
+	stp	x29,x30,[sp,#-96]!
+	add	x29,sp,#0
+
+	adrp	x5,.Lsigma
+	add	x5,x5,:lo12:.Lsigma
+	stp	x19,x20,[sp,#16]
+	stp	x21,x22,[sp,#32]
+	stp	x23,x24,[sp,#48]
+	stp	x25,x26,[sp,#64]
+	stp	x27,x28,[sp,#80]
+
+.L512_or_more_neon:
+	sub	sp,sp,#128+64
+
+	ldp	x22,x23,[x5]		// load sigma
+	ld1	{v24.4s},[x5],#16
+	ldp	x24,x25,[x3]		// load key
+	ldp	x26,x27,[x3,#16]
+	ld1	{v25.4s,v26.4s},[x3]
+	ldp	x28,x30,[x4]		// load counter
+	ld1	{v27.4s},[x4]
+	ld1	{v31.4s},[x5]
+#ifdef	__ARMEB__
+	rev64	v24.4s,v24.4s
+	ror	x24,x24,#32
+	ror	x25,x25,#32
+	ror	x26,x26,#32
+	ror	x27,x27,#32
+	ror	x28,x28,#32
+	ror	x30,x30,#32
+#endif
+	add	v27.4s,v27.4s,v31.4s		// += 1
+	stp	q24,q25,[sp,#0]		// off-load key block, invariant part
+	add	v27.4s,v27.4s,v31.4s		// not typo
+	str	q26,[sp,#32]
+	add	v28.4s,v27.4s,v31.4s
+	add	v29.4s,v28.4s,v31.4s
+	add	v30.4s,v29.4s,v31.4s
+	shl	v31.4s,v31.4s,#2			// 1 -> 4
+
+	stp	d8,d9,[sp,#128+0]		// meet ABI requirements
+	stp	d10,d11,[sp,#128+16]
+	stp	d12,d13,[sp,#128+32]
+	stp	d14,d15,[sp,#128+48]
+
+	sub	x2,x2,#512			// not typo
+
+.Loop_outer_512_neon:
+	mov	v0.16b,v24.16b
+	mov	v4.16b,v24.16b
+	mov	v8.16b,v24.16b
+	mov	v12.16b,v24.16b
+	mov	v16.16b,v24.16b
+	mov	v20.16b,v24.16b
+	mov	v1.16b,v25.16b
+	mov	w5,w22			// unpack key block
+	mov	v5.16b,v25.16b
+	lsr	x6,x22,#32
+	mov	v9.16b,v25.16b
+	mov	w7,w23
+	mov	v13.16b,v25.16b
+	lsr	x8,x23,#32
+	mov	v17.16b,v25.16b
+	mov	w9,w24
+	mov	v21.16b,v25.16b
+	lsr	x10,x24,#32
+	mov	v3.16b,v27.16b
+	mov	w11,w25
+	mov	v7.16b,v28.16b
+	lsr	x12,x25,#32
+	mov	v11.16b,v29.16b
+	mov	w13,w26
+	mov	v15.16b,v30.16b
+	lsr	x14,x26,#32
+	mov	v2.16b,v26.16b
+	mov	w15,w27
+	mov	v6.16b,v26.16b
+	lsr	x16,x27,#32
+	add	v19.4s,v3.4s,v31.4s			// +4
+	mov	w17,w28
+	add	v23.4s,v7.4s,v31.4s			// +4
+	lsr	x19,x28,#32
+	mov	v10.16b,v26.16b
+	mov	w20,w30
+	mov	v14.16b,v26.16b
+	lsr	x21,x30,#32
+	mov	v18.16b,v26.16b
+	stp	q27,q28,[sp,#48]		// off-load key block, variable part
+	mov	v22.16b,v26.16b
+	str	q29,[sp,#80]
+
+	mov	x4,#5
+	subs	x2,x2,#512
+.Loop_upper_neon:
+	sub	x4,x4,#1
+	add	v0.4s,v0.4s,v1.4s
+	add	w5,w5,w9
+	add	v4.4s,v4.4s,v5.4s
+	add	w6,w6,w10
+	add	v8.4s,v8.4s,v9.4s
+	add	w7,w7,w11
+	add	v12.4s,v12.4s,v13.4s
+	add	w8,w8,w12
+	add	v16.4s,v16.4s,v17.4s
+	eor	w17,w17,w5
+	add	v20.4s,v20.4s,v21.4s
+	eor	w19,w19,w6
+	eor	v3.16b,v3.16b,v0.16b
+	eor	w20,w20,w7
+	eor	v7.16b,v7.16b,v4.16b
+	eor	w21,w21,w8
+	eor	v11.16b,v11.16b,v8.16b
+	ror	w17,w17,#16
+	eor	v15.16b,v15.16b,v12.16b
+	ror	w19,w19,#16
+	eor	v19.16b,v19.16b,v16.16b
+	ror	w20,w20,#16
+	eor	v23.16b,v23.16b,v20.16b
+	ror	w21,w21,#16
+	rev32	v3.8h,v3.8h
+	add	w13,w13,w17
+	rev32	v7.8h,v7.8h
+	add	w14,w14,w19
+	rev32	v11.8h,v11.8h
+	add	w15,w15,w20
+	rev32	v15.8h,v15.8h
+	add	w16,w16,w21
+	rev32	v19.8h,v19.8h
+	eor	w9,w9,w13
+	rev32	v23.8h,v23.8h
+	eor	w10,w10,w14
+	add	v2.4s,v2.4s,v3.4s
+	eor	w11,w11,w15
+	add	v6.4s,v6.4s,v7.4s
+	eor	w12,w12,w16
+	add	v10.4s,v10.4s,v11.4s
+	ror	w9,w9,#20
+	add	v14.4s,v14.4s,v15.4s
+	ror	w10,w10,#20
+	add	v18.4s,v18.4s,v19.4s
+	ror	w11,w11,#20
+	add	v22.4s,v22.4s,v23.4s
+	ror	w12,w12,#20
+	eor	v24.16b,v1.16b,v2.16b
+	add	w5,w5,w9
+	eor	v25.16b,v5.16b,v6.16b
+	add	w6,w6,w10
+	eor	v26.16b,v9.16b,v10.16b
+	add	w7,w7,w11
+	eor	v27.16b,v13.16b,v14.16b
+	add	w8,w8,w12
+	eor	v28.16b,v17.16b,v18.16b
+	eor	w17,w17,w5
+	eor	v29.16b,v21.16b,v22.16b
+	eor	w19,w19,w6
+	ushr	v1.4s,v24.4s,#20
+	eor	w20,w20,w7
+	ushr	v5.4s,v25.4s,#20
+	eor	w21,w21,w8
+	ushr	v9.4s,v26.4s,#20
+	ror	w17,w17,#24
+	ushr	v13.4s,v27.4s,#20
+	ror	w19,w19,#24
+	ushr	v17.4s,v28.4s,#20
+	ror	w20,w20,#24
+	ushr	v21.4s,v29.4s,#20
+	ror	w21,w21,#24
+	sli	v1.4s,v24.4s,#12
+	add	w13,w13,w17
+	sli	v5.4s,v25.4s,#12
+	add	w14,w14,w19
+	sli	v9.4s,v26.4s,#12
+	add	w15,w15,w20
+	sli	v13.4s,v27.4s,#12
+	add	w16,w16,w21
+	sli	v17.4s,v28.4s,#12
+	eor	w9,w9,w13
+	sli	v21.4s,v29.4s,#12
+	eor	w10,w10,w14
+	add	v0.4s,v0.4s,v1.4s
+	eor	w11,w11,w15
+	add	v4.4s,v4.4s,v5.4s
+	eor	w12,w12,w16
+	add	v8.4s,v8.4s,v9.4s
+	ror	w9,w9,#25
+	add	v12.4s,v12.4s,v13.4s
+	ror	w10,w10,#25
+	add	v16.4s,v16.4s,v17.4s
+	ror	w11,w11,#25
+	add	v20.4s,v20.4s,v21.4s
+	ror	w12,w12,#25
+	eor	v24.16b,v3.16b,v0.16b
+	add	w5,w5,w10
+	eor	v25.16b,v7.16b,v4.16b
+	add	w6,w6,w11
+	eor	v26.16b,v11.16b,v8.16b
+	add	w7,w7,w12
+	eor	v27.16b,v15.16b,v12.16b
+	add	w8,w8,w9
+	eor	v28.16b,v19.16b,v16.16b
+	eor	w21,w21,w5
+	eor	v29.16b,v23.16b,v20.16b
+	eor	w17,w17,w6
+	ushr	v3.4s,v24.4s,#24
+	eor	w19,w19,w7
+	ushr	v7.4s,v25.4s,#24
+	eor	w20,w20,w8
+	ushr	v11.4s,v26.4s,#24
+	ror	w21,w21,#16
+	ushr	v15.4s,v27.4s,#24
+	ror	w17,w17,#16
+	ushr	v19.4s,v28.4s,#24
+	ror	w19,w19,#16
+	ushr	v23.4s,v29.4s,#24
+	ror	w20,w20,#16
+	sli	v3.4s,v24.4s,#8
+	add	w15,w15,w21
+	sli	v7.4s,v25.4s,#8
+	add	w16,w16,w17
+	sli	v11.4s,v26.4s,#8
+	add	w13,w13,w19
+	sli	v15.4s,v27.4s,#8
+	add	w14,w14,w20
+	sli	v19.4s,v28.4s,#8
+	eor	w10,w10,w15
+	sli	v23.4s,v29.4s,#8
+	eor	w11,w11,w16
+	add	v2.4s,v2.4s,v3.4s
+	eor	w12,w12,w13
+	add	v6.4s,v6.4s,v7.4s
+	eor	w9,w9,w14
+	add	v10.4s,v10.4s,v11.4s
+	ror	w10,w10,#20
+	add	v14.4s,v14.4s,v15.4s
+	ror	w11,w11,#20
+	add	v18.4s,v18.4s,v19.4s
+	ror	w12,w12,#20
+	add	v22.4s,v22.4s,v23.4s
+	ror	w9,w9,#20
+	eor	v24.16b,v1.16b,v2.16b
+	add	w5,w5,w10
+	eor	v25.16b,v5.16b,v6.16b
+	add	w6,w6,w11
+	eor	v26.16b,v9.16b,v10.16b
+	add	w7,w7,w12
+	eor	v27.16b,v13.16b,v14.16b
+	add	w8,w8,w9
+	eor	v28.16b,v17.16b,v18.16b
+	eor	w21,w21,w5
+	eor	v29.16b,v21.16b,v22.16b
+	eor	w17,w17,w6
+	ushr	v1.4s,v24.4s,#25
+	eor	w19,w19,w7
+	ushr	v5.4s,v25.4s,#25
+	eor	w20,w20,w8
+	ushr	v9.4s,v26.4s,#25
+	ror	w21,w21,#24
+	ushr	v13.4s,v27.4s,#25
+	ror	w17,w17,#24
+	ushr	v17.4s,v28.4s,#25
+	ror	w19,w19,#24
+	ushr	v21.4s,v29.4s,#25
+	ror	w20,w20,#24
+	sli	v1.4s,v24.4s,#7
+	add	w15,w15,w21
+	sli	v5.4s,v25.4s,#7
+	add	w16,w16,w17
+	sli	v9.4s,v26.4s,#7
+	add	w13,w13,w19
+	sli	v13.4s,v27.4s,#7
+	add	w14,w14,w20
+	sli	v17.4s,v28.4s,#7
+	eor	w10,w10,w15
+	sli	v21.4s,v29.4s,#7
+	eor	w11,w11,w16
+	ext	v2.16b,v2.16b,v2.16b,#8
+	eor	w12,w12,w13
+	ext	v6.16b,v6.16b,v6.16b,#8
+	eor	w9,w9,w14
+	ext	v10.16b,v10.16b,v10.16b,#8
+	ror	w10,w10,#25
+	ext	v14.16b,v14.16b,v14.16b,#8
+	ror	w11,w11,#25
+	ext	v18.16b,v18.16b,v18.16b,#8
+	ror	w12,w12,#25
+	ext	v22.16b,v22.16b,v22.16b,#8
+	ror	w9,w9,#25
+	ext	v3.16b,v3.16b,v3.16b,#12
+	ext	v7.16b,v7.16b,v7.16b,#12
+	ext	v11.16b,v11.16b,v11.16b,#12
+	ext	v15.16b,v15.16b,v15.16b,#12
+	ext	v19.16b,v19.16b,v19.16b,#12
+	ext	v23.16b,v23.16b,v23.16b,#12
+	ext	v1.16b,v1.16b,v1.16b,#4
+	ext	v5.16b,v5.16b,v5.16b,#4
+	ext	v9.16b,v9.16b,v9.16b,#4
+	ext	v13.16b,v13.16b,v13.16b,#4
+	ext	v17.16b,v17.16b,v17.16b,#4
+	ext	v21.16b,v21.16b,v21.16b,#4
+	add	v0.4s,v0.4s,v1.4s
+	add	w5,w5,w9
+	add	v4.4s,v4.4s,v5.4s
+	add	w6,w6,w10
+	add	v8.4s,v8.4s,v9.4s
+	add	w7,w7,w11
+	add	v12.4s,v12.4s,v13.4s
+	add	w8,w8,w12
+	add	v16.4s,v16.4s,v17.4s
+	eor	w17,w17,w5
+	add	v20.4s,v20.4s,v21.4s
+	eor	w19,w19,w6
+	eor	v3.16b,v3.16b,v0.16b
+	eor	w20,w20,w7
+	eor	v7.16b,v7.16b,v4.16b
+	eor	w21,w21,w8
+	eor	v11.16b,v11.16b,v8.16b
+	ror	w17,w17,#16
+	eor	v15.16b,v15.16b,v12.16b
+	ror	w19,w19,#16
+	eor	v19.16b,v19.16b,v16.16b
+	ror	w20,w20,#16
+	eor	v23.16b,v23.16b,v20.16b
+	ror	w21,w21,#16
+	rev32	v3.8h,v3.8h
+	add	w13,w13,w17
+	rev32	v7.8h,v7.8h
+	add	w14,w14,w19
+	rev32	v11.8h,v11.8h
+	add	w15,w15,w20
+	rev32	v15.8h,v15.8h
+	add	w16,w16,w21
+	rev32	v19.8h,v19.8h
+	eor	w9,w9,w13
+	rev32	v23.8h,v23.8h
+	eor	w10,w10,w14
+	add	v2.4s,v2.4s,v3.4s
+	eor	w11,w11,w15
+	add	v6.4s,v6.4s,v7.4s
+	eor	w12,w12,w16
+	add	v10.4s,v10.4s,v11.4s
+	ror	w9,w9,#20
+	add	v14.4s,v14.4s,v15.4s
+	ror	w10,w10,#20
+	add	v18.4s,v18.4s,v19.4s
+	ror	w11,w11,#20
+	add	v22.4s,v22.4s,v23.4s
+	ror	w12,w12,#20
+	eor	v24.16b,v1.16b,v2.16b
+	add	w5,w5,w9
+	eor	v25.16b,v5.16b,v6.16b
+	add	w6,w6,w10
+	eor	v26.16b,v9.16b,v10.16b
+	add	w7,w7,w11
+	eor	v27.16b,v13.16b,v14.16b
+	add	w8,w8,w12
+	eor	v28.16b,v17.16b,v18.16b
+	eor	w17,w17,w5
+	eor	v29.16b,v21.16b,v22.16b
+	eor	w19,w19,w6
+	ushr	v1.4s,v24.4s,#20
+	eor	w20,w20,w7
+	ushr	v5.4s,v25.4s,#20
+	eor	w21,w21,w8
+	ushr	v9.4s,v26.4s,#20
+	ror	w17,w17,#24
+	ushr	v13.4s,v27.4s,#20
+	ror	w19,w19,#24
+	ushr	v17.4s,v28.4s,#20
+	ror	w20,w20,#24
+	ushr	v21.4s,v29.4s,#20
+	ror	w21,w21,#24
+	sli	v1.4s,v24.4s,#12
+	add	w13,w13,w17
+	sli	v5.4s,v25.4s,#12
+	add	w14,w14,w19
+	sli	v9.4s,v26.4s,#12
+	add	w15,w15,w20
+	sli	v13.4s,v27.4s,#12
+	add	w16,w16,w21
+	sli	v17.4s,v28.4s,#12
+	eor	w9,w9,w13
+	sli	v21.4s,v29.4s,#12
+	eor	w10,w10,w14
+	add	v0.4s,v0.4s,v1.4s
+	eor	w11,w11,w15
+	add	v4.4s,v4.4s,v5.4s
+	eor	w12,w12,w16
+	add	v8.4s,v8.4s,v9.4s
+	ror	w9,w9,#25
+	add	v12.4s,v12.4s,v13.4s
+	ror	w10,w10,#25
+	add	v16.4s,v16.4s,v17.4s
+	ror	w11,w11,#25
+	add	v20.4s,v20.4s,v21.4s
+	ror	w12,w12,#25
+	eor	v24.16b,v3.16b,v0.16b
+	add	w5,w5,w10
+	eor	v25.16b,v7.16b,v4.16b
+	add	w6,w6,w11
+	eor	v26.16b,v11.16b,v8.16b
+	add	w7,w7,w12
+	eor	v27.16b,v15.16b,v12.16b
+	add	w8,w8,w9
+	eor	v28.16b,v19.16b,v16.16b
+	eor	w21,w21,w5
+	eor	v29.16b,v23.16b,v20.16b
+	eor	w17,w17,w6
+	ushr	v3.4s,v24.4s,#24
+	eor	w19,w19,w7
+	ushr	v7.4s,v25.4s,#24
+	eor	w20,w20,w8
+	ushr	v11.4s,v26.4s,#24
+	ror	w21,w21,#16
+	ushr	v15.4s,v27.4s,#24
+	ror	w17,w17,#16
+	ushr	v19.4s,v28.4s,#24
+	ror	w19,w19,#16
+	ushr	v23.4s,v29.4s,#24
+	ror	w20,w20,#16
+	sli	v3.4s,v24.4s,#8
+	add	w15,w15,w21
+	sli	v7.4s,v25.4s,#8
+	add	w16,w16,w17
+	sli	v11.4s,v26.4s,#8
+	add	w13,w13,w19
+	sli	v15.4s,v27.4s,#8
+	add	w14,w14,w20
+	sli	v19.4s,v28.4s,#8
+	eor	w10,w10,w15
+	sli	v23.4s,v29.4s,#8
+	eor	w11,w11,w16
+	add	v2.4s,v2.4s,v3.4s
+	eor	w12,w12,w13
+	add	v6.4s,v6.4s,v7.4s
+	eor	w9,w9,w14
+	add	v10.4s,v10.4s,v11.4s
+	ror	w10,w10,#20
+	add	v14.4s,v14.4s,v15.4s
+	ror	w11,w11,#20
+	add	v18.4s,v18.4s,v19.4s
+	ror	w12,w12,#20
+	add	v22.4s,v22.4s,v23.4s
+	ror	w9,w9,#20
+	eor	v24.16b,v1.16b,v2.16b
+	add	w5,w5,w10
+	eor	v25.16b,v5.16b,v6.16b
+	add	w6,w6,w11
+	eor	v26.16b,v9.16b,v10.16b
+	add	w7,w7,w12
+	eor	v27.16b,v13.16b,v14.16b
+	add	w8,w8,w9
+	eor	v28.16b,v17.16b,v18.16b
+	eor	w21,w21,w5
+	eor	v29.16b,v21.16b,v22.16b
+	eor	w17,w17,w6
+	ushr	v1.4s,v24.4s,#25
+	eor	w19,w19,w7
+	ushr	v5.4s,v25.4s,#25
+	eor	w20,w20,w8
+	ushr	v9.4s,v26.4s,#25
+	ror	w21,w21,#24
+	ushr	v13.4s,v27.4s,#25
+	ror	w17,w17,#24
+	ushr	v17.4s,v28.4s,#25
+	ror	w19,w19,#24
+	ushr	v21.4s,v29.4s,#25
+	ror	w20,w20,#24
+	sli	v1.4s,v24.4s,#7
+	add	w15,w15,w21
+	sli	v5.4s,v25.4s,#7
+	add	w16,w16,w17
+	sli	v9.4s,v26.4s,#7
+	add	w13,w13,w19
+	sli	v13.4s,v27.4s,#7
+	add	w14,w14,w20
+	sli	v17.4s,v28.4s,#7
+	eor	w10,w10,w15
+	sli	v21.4s,v29.4s,#7
+	eor	w11,w11,w16
+	ext	v2.16b,v2.16b,v2.16b,#8
+	eor	w12,w12,w13
+	ext	v6.16b,v6.16b,v6.16b,#8
+	eor	w9,w9,w14
+	ext	v10.16b,v10.16b,v10.16b,#8
+	ror	w10,w10,#25
+	ext	v14.16b,v14.16b,v14.16b,#8
+	ror	w11,w11,#25
+	ext	v18.16b,v18.16b,v18.16b,#8
+	ror	w12,w12,#25
+	ext	v22.16b,v22.16b,v22.16b,#8
+	ror	w9,w9,#25
+	ext	v3.16b,v3.16b,v3.16b,#4
+	ext	v7.16b,v7.16b,v7.16b,#4
+	ext	v11.16b,v11.16b,v11.16b,#4
+	ext	v15.16b,v15.16b,v15.16b,#4
+	ext	v19.16b,v19.16b,v19.16b,#4
+	ext	v23.16b,v23.16b,v23.16b,#4
+	ext	v1.16b,v1.16b,v1.16b,#12
+	ext	v5.16b,v5.16b,v5.16b,#12
+	ext	v9.16b,v9.16b,v9.16b,#12
+	ext	v13.16b,v13.16b,v13.16b,#12
+	ext	v17.16b,v17.16b,v17.16b,#12
+	ext	v21.16b,v21.16b,v21.16b,#12
+	cbnz	x4,.Loop_upper_neon
+
+	add	w5,w5,w22		// accumulate key block
+	add	x6,x6,x22,lsr#32
+	add	w7,w7,w23
+	add	x8,x8,x23,lsr#32
+	add	w9,w9,w24
+	add	x10,x10,x24,lsr#32
+	add	w11,w11,w25
+	add	x12,x12,x25,lsr#32
+	add	w13,w13,w26
+	add	x14,x14,x26,lsr#32
+	add	w15,w15,w27
+	add	x16,x16,x27,lsr#32
+	add	w17,w17,w28
+	add	x19,x19,x28,lsr#32
+	add	w20,w20,w30
+	add	x21,x21,x30,lsr#32
+
+	add	x5,x5,x6,lsl#32	// pack
+	add	x7,x7,x8,lsl#32
+	ldp	x6,x8,[x1,#0]		// load input
+	add	x9,x9,x10,lsl#32
+	add	x11,x11,x12,lsl#32
+	ldp	x10,x12,[x1,#16]
+	add	x13,x13,x14,lsl#32
+	add	x15,x15,x16,lsl#32
+	ldp	x14,x16,[x1,#32]
+	add	x17,x17,x19,lsl#32
+	add	x20,x20,x21,lsl#32
+	ldp	x19,x21,[x1,#48]
+	add	x1,x1,#64
+#ifdef	__ARMEB__
+	rev	x5,x5
+	rev	x7,x7
+	rev	x9,x9
+	rev	x11,x11
+	rev	x13,x13
+	rev	x15,x15
+	rev	x17,x17
+	rev	x20,x20
+#endif
+	eor	x5,x5,x6
+	eor	x7,x7,x8
+	eor	x9,x9,x10
+	eor	x11,x11,x12
+	eor	x13,x13,x14
+	eor	x15,x15,x16
+	eor	x17,x17,x19
+	eor	x20,x20,x21
+
+	stp	x5,x7,[x0,#0]		// store output
+	add	x28,x28,#1			// increment counter
+	mov	w5,w22			// unpack key block
+	lsr	x6,x22,#32
+	stp	x9,x11,[x0,#16]
+	mov	w7,w23
+	lsr	x8,x23,#32
+	stp	x13,x15,[x0,#32]
+	mov	w9,w24
+	lsr	x10,x24,#32
+	stp	x17,x20,[x0,#48]
+	add	x0,x0,#64
+	mov	w11,w25
+	lsr	x12,x25,#32
+	mov	w13,w26
+	lsr	x14,x26,#32
+	mov	w15,w27
+	lsr	x16,x27,#32
+	mov	w17,w28
+	lsr	x19,x28,#32
+	mov	w20,w30
+	lsr	x21,x30,#32
+
+	mov	x4,#5
+.Loop_lower_neon:
+	sub	x4,x4,#1
+	add	v0.4s,v0.4s,v1.4s
+	add	w5,w5,w9
+	add	v4.4s,v4.4s,v5.4s
+	add	w6,w6,w10
+	add	v8.4s,v8.4s,v9.4s
+	add	w7,w7,w11
+	add	v12.4s,v12.4s,v13.4s
+	add	w8,w8,w12
+	add	v16.4s,v16.4s,v17.4s
+	eor	w17,w17,w5
+	add	v20.4s,v20.4s,v21.4s
+	eor	w19,w19,w6
+	eor	v3.16b,v3.16b,v0.16b
+	eor	w20,w20,w7
+	eor	v7.16b,v7.16b,v4.16b
+	eor	w21,w21,w8
+	eor	v11.16b,v11.16b,v8.16b
+	ror	w17,w17,#16
+	eor	v15.16b,v15.16b,v12.16b
+	ror	w19,w19,#16
+	eor	v19.16b,v19.16b,v16.16b
+	ror	w20,w20,#16
+	eor	v23.16b,v23.16b,v20.16b
+	ror	w21,w21,#16
+	rev32	v3.8h,v3.8h
+	add	w13,w13,w17
+	rev32	v7.8h,v7.8h
+	add	w14,w14,w19
+	rev32	v11.8h,v11.8h
+	add	w15,w15,w20
+	rev32	v15.8h,v15.8h
+	add	w16,w16,w21
+	rev32	v19.8h,v19.8h
+	eor	w9,w9,w13
+	rev32	v23.8h,v23.8h
+	eor	w10,w10,w14
+	add	v2.4s,v2.4s,v3.4s
+	eor	w11,w11,w15
+	add	v6.4s,v6.4s,v7.4s
+	eor	w12,w12,w16
+	add	v10.4s,v10.4s,v11.4s
+	ror	w9,w9,#20
+	add	v14.4s,v14.4s,v15.4s
+	ror	w10,w10,#20
+	add	v18.4s,v18.4s,v19.4s
+	ror	w11,w11,#20
+	add	v22.4s,v22.4s,v23.4s
+	ror	w12,w12,#20
+	eor	v24.16b,v1.16b,v2.16b
+	add	w5,w5,w9
+	eor	v25.16b,v5.16b,v6.16b
+	add	w6,w6,w10
+	eor	v26.16b,v9.16b,v10.16b
+	add	w7,w7,w11
+	eor	v27.16b,v13.16b,v14.16b
+	add	w8,w8,w12
+	eor	v28.16b,v17.16b,v18.16b
+	eor	w17,w17,w5
+	eor	v29.16b,v21.16b,v22.16b
+	eor	w19,w19,w6
+	ushr	v1.4s,v24.4s,#20
+	eor	w20,w20,w7
+	ushr	v5.4s,v25.4s,#20
+	eor	w21,w21,w8
+	ushr	v9.4s,v26.4s,#20
+	ror	w17,w17,#24
+	ushr	v13.4s,v27.4s,#20
+	ror	w19,w19,#24
+	ushr	v17.4s,v28.4s,#20
+	ror	w20,w20,#24
+	ushr	v21.4s,v29.4s,#20
+	ror	w21,w21,#24
+	sli	v1.4s,v24.4s,#12
+	add	w13,w13,w17
+	sli	v5.4s,v25.4s,#12
+	add	w14,w14,w19
+	sli	v9.4s,v26.4s,#12
+	add	w15,w15,w20
+	sli	v13.4s,v27.4s,#12
+	add	w16,w16,w21
+	sli	v17.4s,v28.4s,#12
+	eor	w9,w9,w13
+	sli	v21.4s,v29.4s,#12
+	eor	w10,w10,w14
+	add	v0.4s,v0.4s,v1.4s
+	eor	w11,w11,w15
+	add	v4.4s,v4.4s,v5.4s
+	eor	w12,w12,w16
+	add	v8.4s,v8.4s,v9.4s
+	ror	w9,w9,#25
+	add	v12.4s,v12.4s,v13.4s
+	ror	w10,w10,#25
+	add	v16.4s,v16.4s,v17.4s
+	ror	w11,w11,#25
+	add	v20.4s,v20.4s,v21.4s
+	ror	w12,w12,#25
+	eor	v24.16b,v3.16b,v0.16b
+	add	w5,w5,w10
+	eor	v25.16b,v7.16b,v4.16b
+	add	w6,w6,w11
+	eor	v26.16b,v11.16b,v8.16b
+	add	w7,w7,w12
+	eor	v27.16b,v15.16b,v12.16b
+	add	w8,w8,w9
+	eor	v28.16b,v19.16b,v16.16b
+	eor	w21,w21,w5
+	eor	v29.16b,v23.16b,v20.16b
+	eor	w17,w17,w6
+	ushr	v3.4s,v24.4s,#24
+	eor	w19,w19,w7
+	ushr	v7.4s,v25.4s,#24
+	eor	w20,w20,w8
+	ushr	v11.4s,v26.4s,#24
+	ror	w21,w21,#16
+	ushr	v15.4s,v27.4s,#24
+	ror	w17,w17,#16
+	ushr	v19.4s,v28.4s,#24
+	ror	w19,w19,#16
+	ushr	v23.4s,v29.4s,#24
+	ror	w20,w20,#16
+	sli	v3.4s,v24.4s,#8
+	add	w15,w15,w21
+	sli	v7.4s,v25.4s,#8
+	add	w16,w16,w17
+	sli	v11.4s,v26.4s,#8
+	add	w13,w13,w19
+	sli	v15.4s,v27.4s,#8
+	add	w14,w14,w20
+	sli	v19.4s,v28.4s,#8
+	eor	w10,w10,w15
+	sli	v23.4s,v29.4s,#8
+	eor	w11,w11,w16
+	add	v2.4s,v2.4s,v3.4s
+	eor	w12,w12,w13
+	add	v6.4s,v6.4s,v7.4s
+	eor	w9,w9,w14
+	add	v10.4s,v10.4s,v11.4s
+	ror	w10,w10,#20
+	add	v14.4s,v14.4s,v15.4s
+	ror	w11,w11,#20
+	add	v18.4s,v18.4s,v19.4s
+	ror	w12,w12,#20
+	add	v22.4s,v22.4s,v23.4s
+	ror	w9,w9,#20
+	eor	v24.16b,v1.16b,v2.16b
+	add	w5,w5,w10
+	eor	v25.16b,v5.16b,v6.16b
+	add	w6,w6,w11
+	eor	v26.16b,v9.16b,v10.16b
+	add	w7,w7,w12
+	eor	v27.16b,v13.16b,v14.16b
+	add	w8,w8,w9
+	eor	v28.16b,v17.16b,v18.16b
+	eor	w21,w21,w5
+	eor	v29.16b,v21.16b,v22.16b
+	eor	w17,w17,w6
+	ushr	v1.4s,v24.4s,#25
+	eor	w19,w19,w7
+	ushr	v5.4s,v25.4s,#25
+	eor	w20,w20,w8
+	ushr	v9.4s,v26.4s,#25
+	ror	w21,w21,#24
+	ushr	v13.4s,v27.4s,#25
+	ror	w17,w17,#24
+	ushr	v17.4s,v28.4s,#25
+	ror	w19,w19,#24
+	ushr	v21.4s,v29.4s,#25
+	ror	w20,w20,#24
+	sli	v1.4s,v24.4s,#7
+	add	w15,w15,w21
+	sli	v5.4s,v25.4s,#7
+	add	w16,w16,w17
+	sli	v9.4s,v26.4s,#7
+	add	w13,w13,w19
+	sli	v13.4s,v27.4s,#7
+	add	w14,w14,w20
+	sli	v17.4s,v28.4s,#7
+	eor	w10,w10,w15
+	sli	v21.4s,v29.4s,#7
+	eor	w11,w11,w16
+	ext	v2.16b,v2.16b,v2.16b,#8
+	eor	w12,w12,w13
+	ext	v6.16b,v6.16b,v6.16b,#8
+	eor	w9,w9,w14
+	ext	v10.16b,v10.16b,v10.16b,#8
+	ror	w10,w10,#25
+	ext	v14.16b,v14.16b,v14.16b,#8
+	ror	w11,w11,#25
+	ext	v18.16b,v18.16b,v18.16b,#8
+	ror	w12,w12,#25
+	ext	v22.16b,v22.16b,v22.16b,#8
+	ror	w9,w9,#25
+	ext	v3.16b,v3.16b,v3.16b,#12
+	ext	v7.16b,v7.16b,v7.16b,#12
+	ext	v11.16b,v11.16b,v11.16b,#12
+	ext	v15.16b,v15.16b,v15.16b,#12
+	ext	v19.16b,v19.16b,v19.16b,#12
+	ext	v23.16b,v23.16b,v23.16b,#12
+	ext	v1.16b,v1.16b,v1.16b,#4
+	ext	v5.16b,v5.16b,v5.16b,#4
+	ext	v9.16b,v9.16b,v9.16b,#4
+	ext	v13.16b,v13.16b,v13.16b,#4
+	ext	v17.16b,v17.16b,v17.16b,#4
+	ext	v21.16b,v21.16b,v21.16b,#4
+	add	v0.4s,v0.4s,v1.4s
+	add	w5,w5,w9
+	add	v4.4s,v4.4s,v5.4s
+	add	w6,w6,w10
+	add	v8.4s,v8.4s,v9.4s
+	add	w7,w7,w11
+	add	v12.4s,v12.4s,v13.4s
+	add	w8,w8,w12
+	add	v16.4s,v16.4s,v17.4s
+	eor	w17,w17,w5
+	add	v20.4s,v20.4s,v21.4s
+	eor	w19,w19,w6
+	eor	v3.16b,v3.16b,v0.16b
+	eor	w20,w20,w7
+	eor	v7.16b,v7.16b,v4.16b
+	eor	w21,w21,w8
+	eor	v11.16b,v11.16b,v8.16b
+	ror	w17,w17,#16
+	eor	v15.16b,v15.16b,v12.16b
+	ror	w19,w19,#16
+	eor	v19.16b,v19.16b,v16.16b
+	ror	w20,w20,#16
+	eor	v23.16b,v23.16b,v20.16b
+	ror	w21,w21,#16
+	rev32	v3.8h,v3.8h
+	add	w13,w13,w17
+	rev32	v7.8h,v7.8h
+	add	w14,w14,w19
+	rev32	v11.8h,v11.8h
+	add	w15,w15,w20
+	rev32	v15.8h,v15.8h
+	add	w16,w16,w21
+	rev32	v19.8h,v19.8h
+	eor	w9,w9,w13
+	rev32	v23.8h,v23.8h
+	eor	w10,w10,w14
+	add	v2.4s,v2.4s,v3.4s
+	eor	w11,w11,w15
+	add	v6.4s,v6.4s,v7.4s
+	eor	w12,w12,w16
+	add	v10.4s,v10.4s,v11.4s
+	ror	w9,w9,#20
+	add	v14.4s,v14.4s,v15.4s
+	ror	w10,w10,#20
+	add	v18.4s,v18.4s,v19.4s
+	ror	w11,w11,#20
+	add	v22.4s,v22.4s,v23.4s
+	ror	w12,w12,#20
+	eor	v24.16b,v1.16b,v2.16b
+	add	w5,w5,w9
+	eor	v25.16b,v5.16b,v6.16b
+	add	w6,w6,w10
+	eor	v26.16b,v9.16b,v10.16b
+	add	w7,w7,w11
+	eor	v27.16b,v13.16b,v14.16b
+	add	w8,w8,w12
+	eor	v28.16b,v17.16b,v18.16b
+	eor	w17,w17,w5
+	eor	v29.16b,v21.16b,v22.16b
+	eor	w19,w19,w6
+	ushr	v1.4s,v24.4s,#20
+	eor	w20,w20,w7
+	ushr	v5.4s,v25.4s,#20
+	eor	w21,w21,w8
+	ushr	v9.4s,v26.4s,#20
+	ror	w17,w17,#24
+	ushr	v13.4s,v27.4s,#20
+	ror	w19,w19,#24
+	ushr	v17.4s,v28.4s,#20
+	ror	w20,w20,#24
+	ushr	v21.4s,v29.4s,#20
+	ror	w21,w21,#24
+	sli	v1.4s,v24.4s,#12
+	add	w13,w13,w17
+	sli	v5.4s,v25.4s,#12
+	add	w14,w14,w19
+	sli	v9.4s,v26.4s,#12
+	add	w15,w15,w20
+	sli	v13.4s,v27.4s,#12
+	add	w16,w16,w21
+	sli	v17.4s,v28.4s,#12
+	eor	w9,w9,w13
+	sli	v21.4s,v29.4s,#12
+	eor	w10,w10,w14
+	add	v0.4s,v0.4s,v1.4s
+	eor	w11,w11,w15
+	add	v4.4s,v4.4s,v5.4s
+	eor	w12,w12,w16
+	add	v8.4s,v8.4s,v9.4s
+	ror	w9,w9,#25
+	add	v12.4s,v12.4s,v13.4s
+	ror	w10,w10,#25
+	add	v16.4s,v16.4s,v17.4s
+	ror	w11,w11,#25
+	add	v20.4s,v20.4s,v21.4s
+	ror	w12,w12,#25
+	eor	v24.16b,v3.16b,v0.16b
+	add	w5,w5,w10
+	eor	v25.16b,v7.16b,v4.16b
+	add	w6,w6,w11
+	eor	v26.16b,v11.16b,v8.16b
+	add	w7,w7,w12
+	eor	v27.16b,v15.16b,v12.16b
+	add	w8,w8,w9
+	eor	v28.16b,v19.16b,v16.16b
+	eor	w21,w21,w5
+	eor	v29.16b,v23.16b,v20.16b
+	eor	w17,w17,w6
+	ushr	v3.4s,v24.4s,#24
+	eor	w19,w19,w7
+	ushr	v7.4s,v25.4s,#24
+	eor	w20,w20,w8
+	ushr	v11.4s,v26.4s,#24
+	ror	w21,w21,#16
+	ushr	v15.4s,v27.4s,#24
+	ror	w17,w17,#16
+	ushr	v19.4s,v28.4s,#24
+	ror	w19,w19,#16
+	ushr	v23.4s,v29.4s,#24
+	ror	w20,w20,#16
+	sli	v3.4s,v24.4s,#8
+	add	w15,w15,w21
+	sli	v7.4s,v25.4s,#8
+	add	w16,w16,w17
+	sli	v11.4s,v26.4s,#8
+	add	w13,w13,w19
+	sli	v15.4s,v27.4s,#8
+	add	w14,w14,w20
+	sli	v19.4s,v28.4s,#8
+	eor	w10,w10,w15
+	sli	v23.4s,v29.4s,#8
+	eor	w11,w11,w16
+	add	v2.4s,v2.4s,v3.4s
+	eor	w12,w12,w13
+	add	v6.4s,v6.4s,v7.4s
+	eor	w9,w9,w14
+	add	v10.4s,v10.4s,v11.4s
+	ror	w10,w10,#20
+	add	v14.4s,v14.4s,v15.4s
+	ror	w11,w11,#20
+	add	v18.4s,v18.4s,v19.4s
+	ror	w12,w12,#20
+	add	v22.4s,v22.4s,v23.4s
+	ror	w9,w9,#20
+	eor	v24.16b,v1.16b,v2.16b
+	add	w5,w5,w10
+	eor	v25.16b,v5.16b,v6.16b
+	add	w6,w6,w11
+	eor	v26.16b,v9.16b,v10.16b
+	add	w7,w7,w12
+	eor	v27.16b,v13.16b,v14.16b
+	add	w8,w8,w9
+	eor	v28.16b,v17.16b,v18.16b
+	eor	w21,w21,w5
+	eor	v29.16b,v21.16b,v22.16b
+	eor	w17,w17,w6
+	ushr	v1.4s,v24.4s,#25
+	eor	w19,w19,w7
+	ushr	v5.4s,v25.4s,#25
+	eor	w20,w20,w8
+	ushr	v9.4s,v26.4s,#25
+	ror	w21,w21,#24
+	ushr	v13.4s,v27.4s,#25
+	ror	w17,w17,#24
+	ushr	v17.4s,v28.4s,#25
+	ror	w19,w19,#24
+	ushr	v21.4s,v29.4s,#25
+	ror	w20,w20,#24
+	sli	v1.4s,v24.4s,#7
+	add	w15,w15,w21
+	sli	v5.4s,v25.4s,#7
+	add	w16,w16,w17
+	sli	v9.4s,v26.4s,#7
+	add	w13,w13,w19
+	sli	v13.4s,v27.4s,#7
+	add	w14,w14,w20
+	sli	v17.4s,v28.4s,#7
+	eor	w10,w10,w15
+	sli	v21.4s,v29.4s,#7
+	eor	w11,w11,w16
+	ext	v2.16b,v2.16b,v2.16b,#8
+	eor	w12,w12,w13
+	ext	v6.16b,v6.16b,v6.16b,#8
+	eor	w9,w9,w14
+	ext	v10.16b,v10.16b,v10.16b,#8
+	ror	w10,w10,#25
+	ext	v14.16b,v14.16b,v14.16b,#8
+	ror	w11,w11,#25
+	ext	v18.16b,v18.16b,v18.16b,#8
+	ror	w12,w12,#25
+	ext	v22.16b,v22.16b,v22.16b,#8
+	ror	w9,w9,#25
+	ext	v3.16b,v3.16b,v3.16b,#4
+	ext	v7.16b,v7.16b,v7.16b,#4
+	ext	v11.16b,v11.16b,v11.16b,#4
+	ext	v15.16b,v15.16b,v15.16b,#4
+	ext	v19.16b,v19.16b,v19.16b,#4
+	ext	v23.16b,v23.16b,v23.16b,#4
+	ext	v1.16b,v1.16b,v1.16b,#12
+	ext	v5.16b,v5.16b,v5.16b,#12
+	ext	v9.16b,v9.16b,v9.16b,#12
+	ext	v13.16b,v13.16b,v13.16b,#12
+	ext	v17.16b,v17.16b,v17.16b,#12
+	ext	v21.16b,v21.16b,v21.16b,#12
+	cbnz	x4,.Loop_lower_neon
+
+	add	w5,w5,w22		// accumulate key block
+	ldp	q24,q25,[sp,#0]
+	add	x6,x6,x22,lsr#32
+	ldp	q26,q27,[sp,#32]
+	add	w7,w7,w23
+	ldp	q28,q29,[sp,#64]
+	add	x8,x8,x23,lsr#32
+	add	v0.4s,v0.4s,v24.4s
+	add	w9,w9,w24
+	add	v4.4s,v4.4s,v24.4s
+	add	x10,x10,x24,lsr#32
+	add	v8.4s,v8.4s,v24.4s
+	add	w11,w11,w25
+	add	v12.4s,v12.4s,v24.4s
+	add	x12,x12,x25,lsr#32
+	add	v16.4s,v16.4s,v24.4s
+	add	w13,w13,w26
+	add	v20.4s,v20.4s,v24.4s
+	add	x14,x14,x26,lsr#32
+	add	v2.4s,v2.4s,v26.4s
+	add	w15,w15,w27
+	add	v6.4s,v6.4s,v26.4s
+	add	x16,x16,x27,lsr#32
+	add	v10.4s,v10.4s,v26.4s
+	add	w17,w17,w28
+	add	v14.4s,v14.4s,v26.4s
+	add	x19,x19,x28,lsr#32
+	add	v18.4s,v18.4s,v26.4s
+	add	w20,w20,w30
+	add	v22.4s,v22.4s,v26.4s
+	add	x21,x21,x30,lsr#32
+	add	v19.4s,v19.4s,v31.4s			// +4
+	add	x5,x5,x6,lsl#32	// pack
+	add	v23.4s,v23.4s,v31.4s			// +4
+	add	x7,x7,x8,lsl#32
+	add	v3.4s,v3.4s,v27.4s
+	ldp	x6,x8,[x1,#0]		// load input
+	add	v7.4s,v7.4s,v28.4s
+	add	x9,x9,x10,lsl#32
+	add	v11.4s,v11.4s,v29.4s
+	add	x11,x11,x12,lsl#32
+	add	v15.4s,v15.4s,v30.4s
+	ldp	x10,x12,[x1,#16]
+	add	v19.4s,v19.4s,v27.4s
+	add	x13,x13,x14,lsl#32
+	add	v23.4s,v23.4s,v28.4s
+	add	x15,x15,x16,lsl#32
+	add	v1.4s,v1.4s,v25.4s
+	ldp	x14,x16,[x1,#32]
+	add	v5.4s,v5.4s,v25.4s
+	add	x17,x17,x19,lsl#32
+	add	v9.4s,v9.4s,v25.4s
+	add	x20,x20,x21,lsl#32
+	add	v13.4s,v13.4s,v25.4s
+	ldp	x19,x21,[x1,#48]
+	add	v17.4s,v17.4s,v25.4s
+	add	x1,x1,#64
+	add	v21.4s,v21.4s,v25.4s
+
+#ifdef	__ARMEB__
+	rev	x5,x5
+	rev	x7,x7
+	rev	x9,x9
+	rev	x11,x11
+	rev	x13,x13
+	rev	x15,x15
+	rev	x17,x17
+	rev	x20,x20
+#endif
+	ld1	{v24.16b,v25.16b,v26.16b,v27.16b},[x1],#64
+	eor	x5,x5,x6
+	eor	x7,x7,x8
+	eor	x9,x9,x10
+	eor	x11,x11,x12
+	eor	x13,x13,x14
+	eor	v0.16b,v0.16b,v24.16b
+	eor	x15,x15,x16
+	eor	v1.16b,v1.16b,v25.16b
+	eor	x17,x17,x19
+	eor	v2.16b,v2.16b,v26.16b
+	eor	x20,x20,x21
+	eor	v3.16b,v3.16b,v27.16b
+	ld1	{v24.16b,v25.16b,v26.16b,v27.16b},[x1],#64
+
+	stp	x5,x7,[x0,#0]		// store output
+	add	x28,x28,#7			// increment counter
+	stp	x9,x11,[x0,#16]
+	stp	x13,x15,[x0,#32]
+	stp	x17,x20,[x0,#48]
+	add	x0,x0,#64
+	st1	{v0.16b,v1.16b,v2.16b,v3.16b},[x0],#64
+
+	ld1	{v0.16b,v1.16b,v2.16b,v3.16b},[x1],#64
+	eor	v4.16b,v4.16b,v24.16b
+	eor	v5.16b,v5.16b,v25.16b
+	eor	v6.16b,v6.16b,v26.16b
+	eor	v7.16b,v7.16b,v27.16b
+	st1	{v4.16b,v5.16b,v6.16b,v7.16b},[x0],#64
+
+	ld1	{v4.16b,v5.16b,v6.16b,v7.16b},[x1],#64
+	eor	v8.16b,v8.16b,v0.16b
+	ldp	q24,q25,[sp,#0]
+	eor	v9.16b,v9.16b,v1.16b
+	ldp	q26,q27,[sp,#32]
+	eor	v10.16b,v10.16b,v2.16b
+	eor	v11.16b,v11.16b,v3.16b
+	st1	{v8.16b,v9.16b,v10.16b,v11.16b},[x0],#64
+
+	ld1	{v8.16b,v9.16b,v10.16b,v11.16b},[x1],#64
+	eor	v12.16b,v12.16b,v4.16b
+	eor	v13.16b,v13.16b,v5.16b
+	eor	v14.16b,v14.16b,v6.16b
+	eor	v15.16b,v15.16b,v7.16b
+	st1	{v12.16b,v13.16b,v14.16b,v15.16b},[x0],#64
+
+	ld1	{v12.16b,v13.16b,v14.16b,v15.16b},[x1],#64
+	eor	v16.16b,v16.16b,v8.16b
+	eor	v17.16b,v17.16b,v9.16b
+	eor	v18.16b,v18.16b,v10.16b
+	eor	v19.16b,v19.16b,v11.16b
+	st1	{v16.16b,v17.16b,v18.16b,v19.16b},[x0],#64
+
+	shl	v0.4s,v31.4s,#1			// 4 -> 8
+	eor	v20.16b,v20.16b,v12.16b
+	eor	v21.16b,v21.16b,v13.16b
+	eor	v22.16b,v22.16b,v14.16b
+	eor	v23.16b,v23.16b,v15.16b
+	st1	{v20.16b,v21.16b,v22.16b,v23.16b},[x0],#64
+
+	add	v27.4s,v27.4s,v0.4s			// += 8
+	add	v28.4s,v28.4s,v0.4s
+	add	v29.4s,v29.4s,v0.4s
+	add	v30.4s,v30.4s,v0.4s
+
+	b.hs	.Loop_outer_512_neon
+
+	adds	x2,x2,#512
+	ushr	v0.4s,v31.4s,#2			// 4 -> 1
+
+	ldp	d8,d9,[sp,#128+0]		// meet ABI requirements
+	ldp	d10,d11,[sp,#128+16]
+	ldp	d12,d13,[sp,#128+32]
+	ldp	d14,d15,[sp,#128+48]
+
+	stp	q24,q31,[sp,#0]		// wipe off-load area
+	stp	q24,q31,[sp,#32]
+	stp	q24,q31,[sp,#64]
+
+	b.eq	.Ldone_512_neon
+
+	cmp	x2,#192
+	sub	v27.4s,v27.4s,v0.4s			// -= 1
+	sub	v28.4s,v28.4s,v0.4s
+	sub	v29.4s,v29.4s,v0.4s
+	add	sp,sp,#128
+	b.hs	.Loop_outer_neon
+
+	eor	v25.16b,v25.16b,v25.16b
+	eor	v26.16b,v26.16b,v26.16b
+	eor	v27.16b,v27.16b,v27.16b
+	eor	v28.16b,v28.16b,v28.16b
+	eor	v29.16b,v29.16b,v29.16b
+	eor	v30.16b,v30.16b,v30.16b
+	b	.Loop_outer
+
+.Ldone_512_neon:
+	ldp	x19,x20,[x29,#16]
+	add	sp,sp,#128+64
+	ldp	x21,x22,[x29,#32]
+	ldp	x23,x24,[x29,#48]
+	ldp	x25,x26,[x29,#64]
+	ldp	x27,x28,[x29,#80]
+	ldp	x29,x30,[sp],#96
+	AARCH64_VALIDATE_LINK_REGISTER
+	ret
+.size	ChaCha20_512_neon,.-ChaCha20_512_neon
+#endif
+#endif  // !OPENSSL_NO_ASM
+.section	.note.GNU-stack,"",%progbits
--- /dev/null
+++ b/third_party/boringssl/linux-aarch64/crypto/fipsmodule/aesv8-armx64.S
@@ -1,0 +1,802 @@
+// This file is generated from a similarly-named Perl script in the BoringSSL
+// source tree. Do not edit by hand.
+
+#if !defined(__has_feature)
+#define __has_feature(x) 0
+#endif
+#if __has_feature(memory_sanitizer) && !defined(OPENSSL_NO_ASM)
+#define OPENSSL_NO_ASM
+#endif
+
+#if !defined(OPENSSL_NO_ASM)
+#if defined(__aarch64__)
+#if defined(BORINGSSL_PREFIX)
+#include <boringssl_prefix_symbols_asm.h>
+#endif
+#include <openssl/arm_arch.h>
+
+#if __ARM_MAX_ARCH__>=7
+.text
+.arch	armv8-a+crypto
+.section	.rodata
+.align	5
+.Lrcon:
+.long	0x01,0x01,0x01,0x01
+.long	0x0c0f0e0d,0x0c0f0e0d,0x0c0f0e0d,0x0c0f0e0d	// rotate-n-splat
+.long	0x1b,0x1b,0x1b,0x1b
+
+.text
+
+.globl	aes_hw_set_encrypt_key
+.hidden	aes_hw_set_encrypt_key
+.type	aes_hw_set_encrypt_key,%function
+.align	5
+aes_hw_set_encrypt_key:
+.Lenc_key:
+	// Armv8.3-A PAuth: even though x30 is pushed to stack it is not popped later.
+	AARCH64_VALID_CALL_TARGET
+	stp	x29,x30,[sp,#-16]!
+	add	x29,sp,#0
+	mov	x3,#-1
+	cmp	x0,#0
+	b.eq	.Lenc_key_abort
+	cmp	x2,#0
+	b.eq	.Lenc_key_abort
+	mov	x3,#-2
+	cmp	w1,#128
+	b.lt	.Lenc_key_abort
+	cmp	w1,#256
+	b.gt	.Lenc_key_abort
+	tst	w1,#0x3f
+	b.ne	.Lenc_key_abort
+
+	adrp	x3,.Lrcon
+	add	x3,x3,:lo12:.Lrcon
+	cmp	w1,#192
+
+	eor	v0.16b,v0.16b,v0.16b
+	ld1	{v3.16b},[x0],#16
+	mov	w1,#8		// reuse w1
+	ld1	{v1.4s,v2.4s},[x3],#32
+
+	b.lt	.Loop128
+	b.eq	.L192
+	b	.L256
+
+.align	4
+.Loop128:
+	tbl	v6.16b,{v3.16b},v2.16b
+	ext	v5.16b,v0.16b,v3.16b,#12
+	st1	{v3.4s},[x2],#16
+	aese	v6.16b,v0.16b
+	subs	w1,w1,#1
+
+	eor	v3.16b,v3.16b,v5.16b
+	ext	v5.16b,v0.16b,v5.16b,#12
+	eor	v3.16b,v3.16b,v5.16b
+	ext	v5.16b,v0.16b,v5.16b,#12
+	eor	v6.16b,v6.16b,v1.16b
+	eor	v3.16b,v3.16b,v5.16b
+	shl	v1.16b,v1.16b,#1
+	eor	v3.16b,v3.16b,v6.16b
+	b.ne	.Loop128
+
+	ld1	{v1.4s},[x3]
+
+	tbl	v6.16b,{v3.16b},v2.16b
+	ext	v5.16b,v0.16b,v3.16b,#12
+	st1	{v3.4s},[x2],#16
+	aese	v6.16b,v0.16b
+
+	eor	v3.16b,v3.16b,v5.16b
+	ext	v5.16b,v0.16b,v5.16b,#12
+	eor	v3.16b,v3.16b,v5.16b
+	ext	v5.16b,v0.16b,v5.16b,#12
+	eor	v6.16b,v6.16b,v1.16b
+	eor	v3.16b,v3.16b,v5.16b
+	shl	v1.16b,v1.16b,#1
+	eor	v3.16b,v3.16b,v6.16b
+
+	tbl	v6.16b,{v3.16b},v2.16b
+	ext	v5.16b,v0.16b,v3.16b,#12
+	st1	{v3.4s},[x2],#16
+	aese	v6.16b,v0.16b
+
+	eor	v3.16b,v3.16b,v5.16b
+	ext	v5.16b,v0.16b,v5.16b,#12
+	eor	v3.16b,v3.16b,v5.16b
+	ext	v5.16b,v0.16b,v5.16b,#12
+	eor	v6.16b,v6.16b,v1.16b
+	eor	v3.16b,v3.16b,v5.16b
+	eor	v3.16b,v3.16b,v6.16b
+	st1	{v3.4s},[x2]
+	add	x2,x2,#0x50
+
+	mov	w12,#10
+	b	.Ldone
+
+.align	4
+.L192:
+	ld1	{v4.8b},[x0],#8
+	movi	v6.16b,#8			// borrow v6.16b
+	st1	{v3.4s},[x2],#16
+	sub	v2.16b,v2.16b,v6.16b	// adjust the mask
+
+.Loop192:
+	tbl	v6.16b,{v4.16b},v2.16b
+	ext	v5.16b,v0.16b,v3.16b,#12
+	st1	{v4.8b},[x2],#8
+	aese	v6.16b,v0.16b
+	subs	w1,w1,#1
+
+	eor	v3.16b,v3.16b,v5.16b
+	ext	v5.16b,v0.16b,v5.16b,#12
+	eor	v3.16b,v3.16b,v5.16b
+	ext	v5.16b,v0.16b,v5.16b,#12
+	eor	v3.16b,v3.16b,v5.16b
+
+	dup	v5.4s,v3.s[3]
+	eor	v5.16b,v5.16b,v4.16b
+	eor	v6.16b,v6.16b,v1.16b
+	ext	v4.16b,v0.16b,v4.16b,#12
+	shl	v1.16b,v1.16b,#1
+	eor	v4.16b,v4.16b,v5.16b
+	eor	v3.16b,v3.16b,v6.16b
+	eor	v4.16b,v4.16b,v6.16b
+	st1	{v3.4s},[x2],#16
+	b.ne	.Loop192
+
+	mov	w12,#12
+	add	x2,x2,#0x20
+	b	.Ldone
+
+.align	4
+.L256:
+	ld1	{v4.16b},[x0]
+	mov	w1,#7
+	mov	w12,#14
+	st1	{v3.4s},[x2],#16
+
+.Loop256:
+	tbl	v6.16b,{v4.16b},v2.16b
+	ext	v5.16b,v0.16b,v3.16b,#12
+	st1	{v4.4s},[x2],#16
+	aese	v6.16b,v0.16b
+	subs	w1,w1,#1
+
+	eor	v3.16b,v3.16b,v5.16b
+	ext	v5.16b,v0.16b,v5.16b,#12
+	eor	v3.16b,v3.16b,v5.16b
+	ext	v5.16b,v0.16b,v5.16b,#12
+	eor	v6.16b,v6.16b,v1.16b
+	eor	v3.16b,v3.16b,v5.16b
+	shl	v1.16b,v1.16b,#1
+	eor	v3.16b,v3.16b,v6.16b
+	st1	{v3.4s},[x2],#16
+	b.eq	.Ldone
+
+	dup	v6.4s,v3.s[3]		// just splat
+	ext	v5.16b,v0.16b,v4.16b,#12
+	aese	v6.16b,v0.16b
+
+	eor	v4.16b,v4.16b,v5.16b
+	ext	v5.16b,v0.16b,v5.16b,#12
+	eor	v4.16b,v4.16b,v5.16b
+	ext	v5.16b,v0.16b,v5.16b,#12
+	eor	v4.16b,v4.16b,v5.16b
+
+	eor	v4.16b,v4.16b,v6.16b
+	b	.Loop256
+
+.Ldone:
+	str	w12,[x2]
+	mov	x3,#0
+
+.Lenc_key_abort:
+	mov	x0,x3			// return value
+	ldr	x29,[sp],#16
+	ret
+.size	aes_hw_set_encrypt_key,.-aes_hw_set_encrypt_key
+
+.globl	aes_hw_set_decrypt_key
+.hidden	aes_hw_set_decrypt_key
+.type	aes_hw_set_decrypt_key,%function
+.align	5
+aes_hw_set_decrypt_key:
+	AARCH64_SIGN_LINK_REGISTER
+	stp	x29,x30,[sp,#-16]!
+	add	x29,sp,#0
+	bl	.Lenc_key
+
+	cmp	x0,#0
+	b.ne	.Ldec_key_abort
+
+	sub	x2,x2,#240		// restore original x2
+	mov	x4,#-16
+	add	x0,x2,x12,lsl#4	// end of key schedule
+
+	ld1	{v0.4s},[x2]
+	ld1	{v1.4s},[x0]
+	st1	{v0.4s},[x0],x4
+	st1	{v1.4s},[x2],#16
+
+.Loop_imc:
+	ld1	{v0.4s},[x2]
+	ld1	{v1.4s},[x0]
+	aesimc	v0.16b,v0.16b
+	aesimc	v1.16b,v1.16b
+	st1	{v0.4s},[x0],x4
+	st1	{v1.4s},[x2],#16
+	cmp	x0,x2
+	b.hi	.Loop_imc
+
+	ld1	{v0.4s},[x2]
+	aesimc	v0.16b,v0.16b
+	st1	{v0.4s},[x0]
+
+	eor	x0,x0,x0		// return value
+.Ldec_key_abort:
+	ldp	x29,x30,[sp],#16
+	AARCH64_VALIDATE_LINK_REGISTER
+	ret
+.size	aes_hw_set_decrypt_key,.-aes_hw_set_decrypt_key
+.globl	aes_hw_encrypt
+.hidden	aes_hw_encrypt
+.type	aes_hw_encrypt,%function
+.align	5
+aes_hw_encrypt:
+	AARCH64_VALID_CALL_TARGET
+	ldr	w3,[x2,#240]
+	ld1	{v0.4s},[x2],#16
+	ld1	{v2.16b},[x0]
+	sub	w3,w3,#2
+	ld1	{v1.4s},[x2],#16
+
+.Loop_enc:
+	aese	v2.16b,v0.16b
+	aesmc	v2.16b,v2.16b
+	ld1	{v0.4s},[x2],#16
+	subs	w3,w3,#2
+	aese	v2.16b,v1.16b
+	aesmc	v2.16b,v2.16b
+	ld1	{v1.4s},[x2],#16
+	b.gt	.Loop_enc
+
+	aese	v2.16b,v0.16b
+	aesmc	v2.16b,v2.16b
+	ld1	{v0.4s},[x2]
+	aese	v2.16b,v1.16b
+	eor	v2.16b,v2.16b,v0.16b
+
+	st1	{v2.16b},[x1]
+	ret
+.size	aes_hw_encrypt,.-aes_hw_encrypt
+.globl	aes_hw_decrypt
+.hidden	aes_hw_decrypt
+.type	aes_hw_decrypt,%function
+.align	5
+aes_hw_decrypt:
+	AARCH64_VALID_CALL_TARGET
+	ldr	w3,[x2,#240]
+	ld1	{v0.4s},[x2],#16
+	ld1	{v2.16b},[x0]
+	sub	w3,w3,#2
+	ld1	{v1.4s},[x2],#16
+
+.Loop_dec:
+	aesd	v2.16b,v0.16b
+	aesimc	v2.16b,v2.16b
+	ld1	{v0.4s},[x2],#16
+	subs	w3,w3,#2
+	aesd	v2.16b,v1.16b
+	aesimc	v2.16b,v2.16b
+	ld1	{v1.4s},[x2],#16
+	b.gt	.Loop_dec
+
+	aesd	v2.16b,v0.16b
+	aesimc	v2.16b,v2.16b
+	ld1	{v0.4s},[x2]
+	aesd	v2.16b,v1.16b
+	eor	v2.16b,v2.16b,v0.16b
+
+	st1	{v2.16b},[x1]
+	ret
+.size	aes_hw_decrypt,.-aes_hw_decrypt
+.globl	aes_hw_cbc_encrypt
+.hidden	aes_hw_cbc_encrypt
+.type	aes_hw_cbc_encrypt,%function
+.align	5
+aes_hw_cbc_encrypt:
+	// Armv8.3-A PAuth: even though x30 is pushed to stack it is not popped later.
+	AARCH64_VALID_CALL_TARGET
+	stp	x29,x30,[sp,#-16]!
+	add	x29,sp,#0
+	subs	x2,x2,#16
+	mov	x8,#16
+	b.lo	.Lcbc_abort
+	csel	x8,xzr,x8,eq
+
+	cmp	w5,#0			// en- or decrypting?
+	ldr	w5,[x3,#240]
+	and	x2,x2,#-16
+	ld1	{v6.16b},[x4]
+	ld1	{v0.16b},[x0],x8
+
+	ld1	{v16.4s,v17.4s},[x3]		// load key schedule...
+	sub	w5,w5,#6
+	add	x7,x3,x5,lsl#4	// pointer to last 7 round keys
+	sub	w5,w5,#2
+	ld1	{v18.4s,v19.4s},[x7],#32
+	ld1	{v20.4s,v21.4s},[x7],#32
+	ld1	{v22.4s,v23.4s},[x7],#32
+	ld1	{v7.4s},[x7]
+
+	add	x7,x3,#32
+	mov	w6,w5
+	b.eq	.Lcbc_dec
+
+	cmp	w5,#2
+	eor	v0.16b,v0.16b,v6.16b
+	eor	v5.16b,v16.16b,v7.16b
+	b.eq	.Lcbc_enc128
+
+	ld1	{v2.4s,v3.4s},[x7]
+	add	x7,x3,#16
+	add	x6,x3,#16*4
+	add	x12,x3,#16*5
+	aese	v0.16b,v16.16b
+	aesmc	v0.16b,v0.16b
+	add	x14,x3,#16*6
+	add	x3,x3,#16*7
+	b	.Lenter_cbc_enc
+
+.align	4
+.Loop_cbc_enc:
+	aese	v0.16b,v16.16b
+	aesmc	v0.16b,v0.16b
+	st1	{v6.16b},[x1],#16
+.Lenter_cbc_enc:
+	aese	v0.16b,v17.16b
+	aesmc	v0.16b,v0.16b
+	aese	v0.16b,v2.16b
+	aesmc	v0.16b,v0.16b
+	ld1	{v16.4s},[x6]
+	cmp	w5,#4
+	aese	v0.16b,v3.16b
+	aesmc	v0.16b,v0.16b
+	ld1	{v17.4s},[x12]
+	b.eq	.Lcbc_enc192
+
+	aese	v0.16b,v16.16b
+	aesmc	v0.16b,v0.16b
+	ld1	{v16.4s},[x14]
+	aese	v0.16b,v17.16b
+	aesmc	v0.16b,v0.16b
+	ld1	{v17.4s},[x3]
+	nop
+
+.Lcbc_enc192:
+	aese	v0.16b,v16.16b
+	aesmc	v0.16b,v0.16b
+	subs	x2,x2,#16
+	aese	v0.16b,v17.16b
+	aesmc	v0.16b,v0.16b
+	csel	x8,xzr,x8,eq
+	aese	v0.16b,v18.16b
+	aesmc	v0.16b,v0.16b
+	aese	v0.16b,v19.16b
+	aesmc	v0.16b,v0.16b
+	ld1	{v16.16b},[x0],x8
+	aese	v0.16b,v20.16b
+	aesmc	v0.16b,v0.16b
+	eor	v16.16b,v16.16b,v5.16b
+	aese	v0.16b,v21.16b
+	aesmc	v0.16b,v0.16b
+	ld1	{v17.4s},[x7]		// re-pre-load rndkey[1]
+	aese	v0.16b,v22.16b
+	aesmc	v0.16b,v0.16b
+	aese	v0.16b,v23.16b
+	eor	v6.16b,v0.16b,v7.16b
+	b.hs	.Loop_cbc_enc
+
+	st1	{v6.16b},[x1],#16
+	b	.Lcbc_done
+
+.align	5
+.Lcbc_enc128:
+	ld1	{v2.4s,v3.4s},[x7]
+	aese	v0.16b,v16.16b
+	aesmc	v0.16b,v0.16b
+	b	.Lenter_cbc_enc128
+.Loop_cbc_enc128:
+	aese	v0.16b,v16.16b
+	aesmc	v0.16b,v0.16b
+	st1	{v6.16b},[x1],#16
+.Lenter_cbc_enc128:
+	aese	v0.16b,v17.16b
+	aesmc	v0.16b,v0.16b
+	subs	x2,x2,#16
+	aese	v0.16b,v2.16b
+	aesmc	v0.16b,v0.16b
+	csel	x8,xzr,x8,eq
+	aese	v0.16b,v3.16b
+	aesmc	v0.16b,v0.16b
+	aese	v0.16b,v18.16b
+	aesmc	v0.16b,v0.16b
+	aese	v0.16b,v19.16b
+	aesmc	v0.16b,v0.16b
+	ld1	{v16.16b},[x0],x8
+	aese	v0.16b,v20.16b
+	aesmc	v0.16b,v0.16b
+	aese	v0.16b,v21.16b
+	aesmc	v0.16b,v0.16b
+	aese	v0.16b,v22.16b
+	aesmc	v0.16b,v0.16b
+	eor	v16.16b,v16.16b,v5.16b
+	aese	v0.16b,v23.16b
+	eor	v6.16b,v0.16b,v7.16b
+	b.hs	.Loop_cbc_enc128
+
+	st1	{v6.16b},[x1],#16
+	b	.Lcbc_done
+.align	5
+.Lcbc_dec:
+	ld1	{v18.16b},[x0],#16
+	subs	x2,x2,#32		// bias
+	add	w6,w5,#2
+	orr	v3.16b,v0.16b,v0.16b
+	orr	v1.16b,v0.16b,v0.16b
+	orr	v19.16b,v18.16b,v18.16b
+	b.lo	.Lcbc_dec_tail
+
+	orr	v1.16b,v18.16b,v18.16b
+	ld1	{v18.16b},[x0],#16
+	orr	v2.16b,v0.16b,v0.16b
+	orr	v3.16b,v1.16b,v1.16b
+	orr	v19.16b,v18.16b,v18.16b
+
+.Loop3x_cbc_dec:
+	aesd	v0.16b,v16.16b
+	aesimc	v0.16b,v0.16b
+	aesd	v1.16b,v16.16b
+	aesimc	v1.16b,v1.16b
+	aesd	v18.16b,v16.16b
+	aesimc	v18.16b,v18.16b
+	ld1	{v16.4s},[x7],#16
+	subs	w6,w6,#2
+	aesd	v0.16b,v17.16b
+	aesimc	v0.16b,v0.16b
+	aesd	v1.16b,v17.16b
+	aesimc	v1.16b,v1.16b
+	aesd	v18.16b,v17.16b
+	aesimc	v18.16b,v18.16b
+	ld1	{v17.4s},[x7],#16
+	b.gt	.Loop3x_cbc_dec
+
+	aesd	v0.16b,v16.16b
+	aesimc	v0.16b,v0.16b
+	aesd	v1.16b,v16.16b
+	aesimc	v1.16b,v1.16b
+	aesd	v18.16b,v16.16b
+	aesimc	v18.16b,v18.16b
+	eor	v4.16b,v6.16b,v7.16b
+	subs	x2,x2,#0x30
+	eor	v5.16b,v2.16b,v7.16b
+	csel	x6,x2,x6,lo			// x6, w6, is zero at this point
+	aesd	v0.16b,v17.16b
+	aesimc	v0.16b,v0.16b
+	aesd	v1.16b,v17.16b
+	aesimc	v1.16b,v1.16b
+	aesd	v18.16b,v17.16b
+	aesimc	v18.16b,v18.16b
+	eor	v17.16b,v3.16b,v7.16b
+	add	x0,x0,x6		// x0 is adjusted in such a way that
+					// at exit from the loop v1.16b-v18.16b
+					// are loaded with the last "words"
+	orr	v6.16b,v19.16b,v19.16b
+	mov	x7,x3
+	aesd	v0.16b,v20.16b
+	aesimc	v0.16b,v0.16b
+	aesd	v1.16b,v20.16b
+	aesimc	v1.16b,v1.16b
+	aesd	v18.16b,v20.16b
+	aesimc	v18.16b,v18.16b
+	ld1	{v2.16b},[x0],#16
+	aesd	v0.16b,v21.16b
+	aesimc	v0.16b,v0.16b
+	aesd	v1.16b,v21.16b
+	aesimc	v1.16b,v1.16b
+	aesd	v18.16b,v21.16b
+	aesimc	v18.16b,v18.16b
+	ld1	{v3.16b},[x0],#16
+	aesd	v0.16b,v22.16b
+	aesimc	v0.16b,v0.16b
+	aesd	v1.16b,v22.16b
+	aesimc	v1.16b,v1.16b
+	aesd	v18.16b,v22.16b
+	aesimc	v18.16b,v18.16b
+	ld1	{v19.16b},[x0],#16
+	aesd	v0.16b,v23.16b
+	aesd	v1.16b,v23.16b
+	aesd	v18.16b,v23.16b
+	ld1	{v16.4s},[x7],#16	// re-pre-load rndkey[0]
+	add	w6,w5,#2
+	eor	v4.16b,v4.16b,v0.16b
+	eor	v5.16b,v5.16b,v1.16b
+	eor	v18.16b,v18.16b,v17.16b
+	ld1	{v17.4s},[x7],#16	// re-pre-load rndkey[1]
+	st1	{v4.16b},[x1],#16
+	orr	v0.16b,v2.16b,v2.16b
+	st1	{v5.16b},[x1],#16
+	orr	v1.16b,v3.16b,v3.16b
+	st1	{v18.16b},[x1],#16
+	orr	v18.16b,v19.16b,v19.16b
+	b.hs	.Loop3x_cbc_dec
+
+	cmn	x2,#0x30
+	b.eq	.Lcbc_done
+	nop
+
+.Lcbc_dec_tail:
+	aesd	v1.16b,v16.16b
+	aesimc	v1.16b,v1.16b
+	aesd	v18.16b,v16.16b
+	aesimc	v18.16b,v18.16b
+	ld1	{v16.4s},[x7],#16
+	subs	w6,w6,#2
+	aesd	v1.16b,v17.16b
+	aesimc	v1.16b,v1.16b
+	aesd	v18.16b,v17.16b
+	aesimc	v18.16b,v18.16b
+	ld1	{v17.4s},[x7],#16
+	b.gt	.Lcbc_dec_tail
+
+	aesd	v1.16b,v16.16b
+	aesimc	v1.16b,v1.16b
+	aesd	v18.16b,v16.16b
+	aesimc	v18.16b,v18.16b
+	aesd	v1.16b,v17.16b
+	aesimc	v1.16b,v1.16b
+	aesd	v18.16b,v17.16b
+	aesimc	v18.16b,v18.16b
+	aesd	v1.16b,v20.16b
+	aesimc	v1.16b,v1.16b
+	aesd	v18.16b,v20.16b
+	aesimc	v18.16b,v18.16b
+	cmn	x2,#0x20
+	aesd	v1.16b,v21.16b
+	aesimc	v1.16b,v1.16b
+	aesd	v18.16b,v21.16b
+	aesimc	v18.16b,v18.16b
+	eor	v5.16b,v6.16b,v7.16b
+	aesd	v1.16b,v22.16b
+	aesimc	v1.16b,v1.16b
+	aesd	v18.16b,v22.16b
+	aesimc	v18.16b,v18.16b
+	eor	v17.16b,v3.16b,v7.16b
+	aesd	v1.16b,v23.16b
+	aesd	v18.16b,v23.16b
+	b.eq	.Lcbc_dec_one
+	eor	v5.16b,v5.16b,v1.16b
+	eor	v17.16b,v17.16b,v18.16b
+	orr	v6.16b,v19.16b,v19.16b
+	st1	{v5.16b},[x1],#16
+	st1	{v17.16b},[x1],#16
+	b	.Lcbc_done
+
+.Lcbc_dec_one:
+	eor	v5.16b,v5.16b,v18.16b
+	orr	v6.16b,v19.16b,v19.16b
+	st1	{v5.16b},[x1],#16
+
+.Lcbc_done:
+	st1	{v6.16b},[x4]
+.Lcbc_abort:
+	ldr	x29,[sp],#16
+	ret
+.size	aes_hw_cbc_encrypt,.-aes_hw_cbc_encrypt
+.globl	aes_hw_ctr32_encrypt_blocks
+.hidden	aes_hw_ctr32_encrypt_blocks
+.type	aes_hw_ctr32_encrypt_blocks,%function
+.align	5
+aes_hw_ctr32_encrypt_blocks:
+	// Armv8.3-A PAuth: even though x30 is pushed to stack it is not popped later.
+	AARCH64_VALID_CALL_TARGET
+	stp	x29,x30,[sp,#-16]!
+	add	x29,sp,#0
+	ldr	w5,[x3,#240]
+
+	ldr	w8, [x4, #12]
+	ld1	{v0.4s},[x4]
+
+	ld1	{v16.4s,v17.4s},[x3]		// load key schedule...
+	sub	w5,w5,#4
+	mov	x12,#16
+	cmp	x2,#2
+	add	x7,x3,x5,lsl#4	// pointer to last 5 round keys
+	sub	w5,w5,#2
+	ld1	{v20.4s,v21.4s},[x7],#32
+	ld1	{v22.4s,v23.4s},[x7],#32
+	ld1	{v7.4s},[x7]
+	add	x7,x3,#32
+	mov	w6,w5
+	csel	x12,xzr,x12,lo
+
+	// ARM Cortex-A57 and Cortex-A72 cores running in 32-bit mode are
+	// affected by silicon errata #1742098 [0] and #1655431 [1],
+	// respectively, where the second instruction of an aese/aesmc
+	// instruction pair may execute twice if an interrupt is taken right
+	// after the first instruction consumes an input register of which a
+	// single 32-bit lane has been updated the last time it was modified.
+	//
+	// This function uses a counter in one 32-bit lane. The vmov lines
+	// could write to v1.16b and v18.16b directly, but that trips these bugs.
+	// We write to v6.16b and copy to the final register as a workaround.
+	//
+	// [0] ARM-EPM-049219 v23 Cortex-A57 MPCore Software Developers Errata Notice
+	// [1] ARM-EPM-012079 v11.0 Cortex-A72 MPCore Software Developers Errata Notice
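+	// For illustration, rather than writing the incremented counter
+	// lane straight into the target register (e.g. "mov v1.s[3],w10"),
+	// the code below does "mov v6.s[3],w10" followed by
+	// "orr v1.16b,v6.16b,v6.16b", so the register consumed by
+	// aese/aesmc is always produced by a full-register write.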
+#ifndef __ARMEB__
+	rev	w8, w8
+#endif
+	add	w10, w8, #1
+	orr	v6.16b,v0.16b,v0.16b
+	rev	w10, w10
+	mov	v6.s[3],w10
+	add	w8, w8, #2
+	orr	v1.16b,v6.16b,v6.16b
+	b.ls	.Lctr32_tail
+	rev	w12, w8
+	mov	v6.s[3],w12
+	sub	x2,x2,#3		// bias
+	orr	v18.16b,v6.16b,v6.16b
+	b	.Loop3x_ctr32
+
+.align	4
+.Loop3x_ctr32:
+	aese	v0.16b,v16.16b
+	aesmc	v0.16b,v0.16b
+	aese	v1.16b,v16.16b
+	aesmc	v1.16b,v1.16b
+	aese	v18.16b,v16.16b
+	aesmc	v18.16b,v18.16b
+	ld1	{v16.4s},[x7],#16
+	subs	w6,w6,#2
+	aese	v0.16b,v17.16b
+	aesmc	v0.16b,v0.16b
+	aese	v1.16b,v17.16b
+	aesmc	v1.16b,v1.16b
+	aese	v18.16b,v17.16b
+	aesmc	v18.16b,v18.16b
+	ld1	{v17.4s},[x7],#16
+	b.gt	.Loop3x_ctr32
+
+	aese	v0.16b,v16.16b
+	aesmc	v4.16b,v0.16b
+	aese	v1.16b,v16.16b
+	aesmc	v5.16b,v1.16b
+	ld1	{v2.16b},[x0],#16
+	add	w9,w8,#1
+	aese	v18.16b,v16.16b
+	aesmc	v18.16b,v18.16b
+	ld1	{v3.16b},[x0],#16
+	rev	w9,w9
+	aese	v4.16b,v17.16b
+	aesmc	v4.16b,v4.16b
+	aese	v5.16b,v17.16b
+	aesmc	v5.16b,v5.16b
+	ld1	{v19.16b},[x0],#16
+	mov	x7,x3
+	aese	v18.16b,v17.16b
+	aesmc	v17.16b,v18.16b
+	aese	v4.16b,v20.16b
+	aesmc	v4.16b,v4.16b
+	aese	v5.16b,v20.16b
+	aesmc	v5.16b,v5.16b
+	eor	v2.16b,v2.16b,v7.16b
+	add	w10,w8,#2
+	aese	v17.16b,v20.16b
+	aesmc	v17.16b,v17.16b
+	eor	v3.16b,v3.16b,v7.16b
+	add	w8,w8,#3
+	aese	v4.16b,v21.16b
+	aesmc	v4.16b,v4.16b
+	aese	v5.16b,v21.16b
+	aesmc	v5.16b,v5.16b
+	 // Note the logic to update v0.16b, v1.16b, and v18.16b is written to work
+	 // around a bug in ARM Cortex-A57 and Cortex-A72 cores running in
+	 // 32-bit mode. See the comment above.
+	eor	v19.16b,v19.16b,v7.16b
+	mov	v6.s[3], w9
+	aese	v17.16b,v21.16b
+	aesmc	v17.16b,v17.16b
+	orr	v0.16b,v6.16b,v6.16b
+	rev	w10,w10
+	aese	v4.16b,v22.16b
+	aesmc	v4.16b,v4.16b
+	mov	v6.s[3], w10
+	rev	w12,w8
+	aese	v5.16b,v22.16b
+	aesmc	v5.16b,v5.16b
+	orr	v1.16b,v6.16b,v6.16b
+	mov	v6.s[3], w12
+	aese	v17.16b,v22.16b
+	aesmc	v17.16b,v17.16b
+	orr	v18.16b,v6.16b,v6.16b
+	subs	x2,x2,#3
+	aese	v4.16b,v23.16b
+	aese	v5.16b,v23.16b
+	aese	v17.16b,v23.16b
+
+	eor	v2.16b,v2.16b,v4.16b
+	ld1	{v16.4s},[x7],#16	// re-pre-load rndkey[0]
+	st1	{v2.16b},[x1],#16
+	eor	v3.16b,v3.16b,v5.16b
+	mov	w6,w5
+	st1	{v3.16b},[x1],#16
+	eor	v19.16b,v19.16b,v17.16b
+	ld1	{v17.4s},[x7],#16	// re-pre-load rndkey[1]
+	st1	{v19.16b},[x1],#16
+	b.hs	.Loop3x_ctr32
+
+	adds	x2,x2,#3
+	b.eq	.Lctr32_done
+	cmp	x2,#1
+	mov	x12,#16
+	csel	x12,xzr,x12,eq
+
+.Lctr32_tail:
+	aese	v0.16b,v16.16b
+	aesmc	v0.16b,v0.16b
+	aese	v1.16b,v16.16b
+	aesmc	v1.16b,v1.16b
+	ld1	{v16.4s},[x7],#16
+	subs	w6,w6,#2
+	aese	v0.16b,v17.16b
+	aesmc	v0.16b,v0.16b
+	aese	v1.16b,v17.16b
+	aesmc	v1.16b,v1.16b
+	ld1	{v17.4s},[x7],#16
+	b.gt	.Lctr32_tail
+
+	aese	v0.16b,v16.16b
+	aesmc	v0.16b,v0.16b
+	aese	v1.16b,v16.16b
+	aesmc	v1.16b,v1.16b
+	aese	v0.16b,v17.16b
+	aesmc	v0.16b,v0.16b
+	aese	v1.16b,v17.16b
+	aesmc	v1.16b,v1.16b
+	ld1	{v2.16b},[x0],x12
+	aese	v0.16b,v20.16b
+	aesmc	v0.16b,v0.16b
+	aese	v1.16b,v20.16b
+	aesmc	v1.16b,v1.16b
+	ld1	{v3.16b},[x0]
+	aese	v0.16b,v21.16b
+	aesmc	v0.16b,v0.16b
+	aese	v1.16b,v21.16b
+	aesmc	v1.16b,v1.16b
+	eor	v2.16b,v2.16b,v7.16b
+	aese	v0.16b,v22.16b
+	aesmc	v0.16b,v0.16b
+	aese	v1.16b,v22.16b
+	aesmc	v1.16b,v1.16b
+	eor	v3.16b,v3.16b,v7.16b
+	aese	v0.16b,v23.16b
+	aese	v1.16b,v23.16b
+
+	cmp	x2,#1
+	eor	v2.16b,v2.16b,v0.16b
+	eor	v3.16b,v3.16b,v1.16b
+	st1	{v2.16b},[x1],#16
+	b.eq	.Lctr32_done
+	st1	{v3.16b},[x1]
+
+.Lctr32_done:
+	ldr	x29,[sp],#16
+	ret
+.size	aes_hw_ctr32_encrypt_blocks,.-aes_hw_ctr32_encrypt_blocks
+#endif
+#endif
+#endif  // !OPENSSL_NO_ASM
+.section	.note.GNU-stack,"",%progbits
--- /dev/null
+++ b/third_party/boringssl/linux-aarch64/crypto/fipsmodule/armv8-mont.S
@@ -1,0 +1,1436 @@
+// This file is generated from a similarly-named Perl script in the BoringSSL
+// source tree. Do not edit by hand.
+
+#if !defined(__has_feature)
+#define __has_feature(x) 0
+#endif
+#if __has_feature(memory_sanitizer) && !defined(OPENSSL_NO_ASM)
+#define OPENSSL_NO_ASM
+#endif
+
+#if !defined(OPENSSL_NO_ASM)
+#if defined(__aarch64__)
+#if defined(BORINGSSL_PREFIX)
+#include <boringssl_prefix_symbols_asm.h>
+#endif
+#include <openssl/arm_arch.h>
+
+.text
+
+.globl	bn_mul_mont
+.hidden	bn_mul_mont
+.type	bn_mul_mont,%function
+.align	5
+bn_mul_mont:
+	AARCH64_SIGN_LINK_REGISTER
+	tst	x5,#7
+	b.eq	__bn_sqr8x_mont
+	tst	x5,#3
+	b.eq	__bn_mul4x_mont
+.Lmul_mont:
+	stp	x29,x30,[sp,#-64]!
+	add	x29,sp,#0
+	stp	x19,x20,[sp,#16]
+	stp	x21,x22,[sp,#32]
+	stp	x23,x24,[sp,#48]
+
+	ldr	x9,[x2],#8		// bp[0]
+	sub	x22,sp,x5,lsl#3
+	ldp	x7,x8,[x1],#16	// ap[0..1]
+	lsl	x5,x5,#3
+	ldr	x4,[x4]		// *n0
+	and	x22,x22,#-16		// ABI says so
+	ldp	x13,x14,[x3],#16	// np[0..1]
+
+	mul	x6,x7,x9		// ap[0]*bp[0]
+	sub	x21,x5,#16		// j=num-2
+	umulh	x7,x7,x9
+	mul	x10,x8,x9		// ap[1]*bp[0]
+	umulh	x11,x8,x9
+
+	mul	x15,x6,x4		// "tp[0]"*n0
+	mov	sp,x22			// alloca
+
+	// (*)	mul	x12,x13,x15	// np[0]*m1
+	umulh	x13,x13,x15
+	mul	x16,x14,x15		// np[1]*m1
+	// (*)	adds	x12,x12,x6	// discarded
+	// (*)	On the removal of the first multiplication and addition
+	//	instructions: the outcome of the first addition is
+	//	guaranteed to be zero, which leaves two computationally
+	//	significant outcomes: it either carries or it does not.
+	//	The question, then, is when does it carry, and is there
+	//	another way to deduce it? If you follow the operations,
+	//	you can observe that the condition for a carry is quite
+	//	simple: x6 being non-zero. So the carry can be calculated
+	//	by adding -1 to x6, which is what the next instruction does.
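+	//	Worked example: with x6 = 5, lo(np[0]*m1) must be 2^64-5,
+	//	so the discarded addition wraps to 0 with carry set, and
+	//	"subs xzr,x6,#1" computes 5-1 without borrow, i.e. C=1.
+	//	With x6 = 0 the sum is 0 without carry, and 0-1 borrows,
+	//	i.e. C=0. Either way the carry flag comes out the same.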
+	subs	xzr,x6,#1		// (*)
+	umulh	x17,x14,x15
+	adc	x13,x13,xzr
+	cbz	x21,.L1st_skip
+
+.L1st:
+	ldr	x8,[x1],#8
+	adds	x6,x10,x7
+	sub	x21,x21,#8		// j--
+	adc	x7,x11,xzr
+
+	ldr	x14,[x3],#8
+	adds	x12,x16,x13
+	mul	x10,x8,x9		// ap[j]*bp[0]
+	adc	x13,x17,xzr
+	umulh	x11,x8,x9
+
+	adds	x12,x12,x6
+	mul	x16,x14,x15		// np[j]*m1
+	adc	x13,x13,xzr
+	umulh	x17,x14,x15
+	str	x12,[x22],#8		// tp[j-1]
+	cbnz	x21,.L1st
+
+.L1st_skip:
+	adds	x6,x10,x7
+	sub	x1,x1,x5		// rewind x1
+	adc	x7,x11,xzr
+
+	adds	x12,x16,x13
+	sub	x3,x3,x5		// rewind x3
+	adc	x13,x17,xzr
+
+	adds	x12,x12,x6
+	sub	x20,x5,#8		// i=num-1
+	adcs	x13,x13,x7
+
+	adc	x19,xzr,xzr		// upmost overflow bit
+	stp	x12,x13,[x22]
+
+.Louter:
+	ldr	x9,[x2],#8		// bp[i]
+	ldp	x7,x8,[x1],#16
+	ldr	x23,[sp]		// tp[0]
+	add	x22,sp,#8
+
+	mul	x6,x7,x9		// ap[0]*bp[i]
+	sub	x21,x5,#16		// j=num-2
+	umulh	x7,x7,x9
+	ldp	x13,x14,[x3],#16
+	mul	x10,x8,x9		// ap[1]*bp[i]
+	adds	x6,x6,x23
+	umulh	x11,x8,x9
+	adc	x7,x7,xzr
+
+	mul	x15,x6,x4
+	sub	x20,x20,#8		// i--
+
+	// (*)	mul	x12,x13,x15	// np[0]*m1
+	umulh	x13,x13,x15
+	mul	x16,x14,x15		// np[1]*m1
+	// (*)	adds	x12,x12,x6
+	subs	xzr,x6,#1		// (*)
+	umulh	x17,x14,x15
+	cbz	x21,.Linner_skip
+
+.Linner:
+	ldr	x8,[x1],#8
+	adc	x13,x13,xzr
+	ldr	x23,[x22],#8		// tp[j]
+	adds	x6,x10,x7
+	sub	x21,x21,#8		// j--
+	adc	x7,x11,xzr
+
+	adds	x12,x16,x13
+	ldr	x14,[x3],#8
+	adc	x13,x17,xzr
+
+	mul	x10,x8,x9		// ap[j]*bp[i]
+	adds	x6,x6,x23
+	umulh	x11,x8,x9
+	adc	x7,x7,xzr
+
+	mul	x16,x14,x15		// np[j]*m1
+	adds	x12,x12,x6
+	umulh	x17,x14,x15
+	str	x12,[x22,#-16]		// tp[j-1]
+	cbnz	x21,.Linner
+
+.Linner_skip:
+	ldr	x23,[x22],#8		// tp[j]
+	adc	x13,x13,xzr
+	adds	x6,x10,x7
+	sub	x1,x1,x5		// rewind x1
+	adc	x7,x11,xzr
+
+	adds	x12,x16,x13
+	sub	x3,x3,x5		// rewind x3
+	adcs	x13,x17,x19
+	adc	x19,xzr,xzr
+
+	adds	x6,x6,x23
+	adc	x7,x7,xzr
+
+	adds	x12,x12,x6
+	adcs	x13,x13,x7
+	adc	x19,x19,xzr		// upmost overflow bit
+	stp	x12,x13,[x22,#-16]
+
+	cbnz	x20,.Louter
+
+	// Final step. We see if result is larger than modulus, and
+	// if it is, subtract the modulus. But comparison implies
+	// subtraction. So we subtract modulus, see if it borrowed,
+	// and conditionally copy original value.
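+	// In other words: if the final subtraction borrows, the result
+	// was already smaller than the modulus, so .Lcond_copy below
+	// restores the original tp[] words; otherwise the freshly
+	// computed rp[j]=tp[j]-np[j] values are kept.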
+	ldr	x23,[sp]		// tp[0]
+	add	x22,sp,#8
+	ldr	x14,[x3],#8		// np[0]
+	subs	x21,x5,#8		// j=num-1 and clear borrow
+	mov	x1,x0
+.Lsub:
+	sbcs	x8,x23,x14		// tp[j]-np[j]
+	ldr	x23,[x22],#8
+	sub	x21,x21,#8		// j--
+	ldr	x14,[x3],#8
+	str	x8,[x1],#8		// rp[j]=tp[j]-np[j]
+	cbnz	x21,.Lsub
+
+	sbcs	x8,x23,x14
+	sbcs	x19,x19,xzr		// did it borrow?
+	str	x8,[x1],#8		// rp[num-1]
+
+	ldr	x23,[sp]		// tp[0]
+	add	x22,sp,#8
+	ldr	x8,[x0],#8		// rp[0]
+	sub	x5,x5,#8		// num--
+	nop
+.Lcond_copy:
+	sub	x5,x5,#8		// num--
+	csel	x14,x23,x8,lo		// did it borrow?
+	ldr	x23,[x22],#8
+	ldr	x8,[x0],#8
+	str	xzr,[x22,#-16]		// wipe tp
+	str	x14,[x0,#-16]
+	cbnz	x5,.Lcond_copy
+
+	csel	x14,x23,x8,lo
+	str	xzr,[x22,#-8]		// wipe tp
+	str	x14,[x0,#-8]
+
+	ldp	x19,x20,[x29,#16]
+	mov	sp,x29
+	ldp	x21,x22,[x29,#32]
+	mov	x0,#1
+	ldp	x23,x24,[x29,#48]
+	ldr	x29,[sp],#64
+	AARCH64_VALIDATE_LINK_REGISTER
+	ret
+.size	bn_mul_mont,.-bn_mul_mont
+.type	__bn_sqr8x_mont,%function
+.align	5
+__bn_sqr8x_mont:
+	// Not adding AARCH64_SIGN_LINK_REGISTER here because __bn_sqr8x_mont is jumped to
+	// only from bn_mul_mont which has already signed the return address.
+	cmp	x1,x2
+	b.ne	__bn_mul4x_mont
+.Lsqr8x_mont:
+	stp	x29,x30,[sp,#-128]!
+	add	x29,sp,#0
+	stp	x19,x20,[sp,#16]
+	stp	x21,x22,[sp,#32]
+	stp	x23,x24,[sp,#48]
+	stp	x25,x26,[sp,#64]
+	stp	x27,x28,[sp,#80]
+	stp	x0,x3,[sp,#96]	// offload rp and np
+
+	ldp	x6,x7,[x1,#8*0]
+	ldp	x8,x9,[x1,#8*2]
+	ldp	x10,x11,[x1,#8*4]
+	ldp	x12,x13,[x1,#8*6]
+
+	sub	x2,sp,x5,lsl#4
+	lsl	x5,x5,#3
+	ldr	x4,[x4]		// *n0
+	mov	sp,x2			// alloca
+	sub	x27,x5,#8*8
+	b	.Lsqr8x_zero_start
+
+.Lsqr8x_zero:
+	sub	x27,x27,#8*8
+	stp	xzr,xzr,[x2,#8*0]
+	stp	xzr,xzr,[x2,#8*2]
+	stp	xzr,xzr,[x2,#8*4]
+	stp	xzr,xzr,[x2,#8*6]
+.Lsqr8x_zero_start:
+	stp	xzr,xzr,[x2,#8*8]
+	stp	xzr,xzr,[x2,#8*10]
+	stp	xzr,xzr,[x2,#8*12]
+	stp	xzr,xzr,[x2,#8*14]
+	add	x2,x2,#8*16
+	cbnz	x27,.Lsqr8x_zero
+
+	add	x3,x1,x5
+	add	x1,x1,#8*8
+	mov	x19,xzr
+	mov	x20,xzr
+	mov	x21,xzr
+	mov	x22,xzr
+	mov	x23,xzr
+	mov	x24,xzr
+	mov	x25,xzr
+	mov	x26,xzr
+	mov	x2,sp
+	str	x4,[x29,#112]		// offload n0
+
+	// Multiply everything but a[i]*a[i]
+.align	4
+.Lsqr8x_outer_loop:
+        //                                                 a[1]a[0]	(i)
+        //                                             a[2]a[0]
+        //                                         a[3]a[0]
+        //                                     a[4]a[0]
+        //                                 a[5]a[0]
+        //                             a[6]a[0]
+        //                         a[7]a[0]
+        //                                         a[2]a[1]		(ii)
+        //                                     a[3]a[1]
+        //                                 a[4]a[1]
+        //                             a[5]a[1]
+        //                         a[6]a[1]
+        //                     a[7]a[1]
+        //                                 a[3]a[2]			(iii)
+        //                             a[4]a[2]
+        //                         a[5]a[2]
+        //                     a[6]a[2]
+        //                 a[7]a[2]
+        //                         a[4]a[3]				(iv)
+        //                     a[5]a[3]
+        //                 a[6]a[3]
+        //             a[7]a[3]
+        //                 a[5]a[4]					(v)
+        //             a[6]a[4]
+        //         a[7]a[4]
+        //         a[6]a[5]						(vi)
+        //     a[7]a[5]
+        // a[7]a[6]							(vii)
+
+	mul	x14,x7,x6		// lo(a[1..7]*a[0])		(i)
+	mul	x15,x8,x6
+	mul	x16,x9,x6
+	mul	x17,x10,x6
+	adds	x20,x20,x14		// t[1]+lo(a[1]*a[0])
+	mul	x14,x11,x6
+	adcs	x21,x21,x15
+	mul	x15,x12,x6
+	adcs	x22,x22,x16
+	mul	x16,x13,x6
+	adcs	x23,x23,x17
+	umulh	x17,x7,x6		// hi(a[1..7]*a[0])
+	adcs	x24,x24,x14
+	umulh	x14,x8,x6
+	adcs	x25,x25,x15
+	umulh	x15,x9,x6
+	adcs	x26,x26,x16
+	umulh	x16,x10,x6
+	stp	x19,x20,[x2],#8*2	// t[0..1]
+	adc	x19,xzr,xzr		// t[8]
+	adds	x21,x21,x17		// t[2]+lo(a[1]*a[0])
+	umulh	x17,x11,x6
+	adcs	x22,x22,x14
+	umulh	x14,x12,x6
+	adcs	x23,x23,x15
+	umulh	x15,x13,x6
+	adcs	x24,x24,x16
+	mul	x16,x8,x7		// lo(a[2..7]*a[1])		(ii)
+	adcs	x25,x25,x17
+	mul	x17,x9,x7
+	adcs	x26,x26,x14
+	mul	x14,x10,x7
+	adc	x19,x19,x15
+
+	mul	x15,x11,x7
+	adds	x22,x22,x16
+	mul	x16,x12,x7
+	adcs	x23,x23,x17
+	mul	x17,x13,x7
+	adcs	x24,x24,x14
+	umulh	x14,x8,x7		// hi(a[2..7]*a[1])
+	adcs	x25,x25,x15
+	umulh	x15,x9,x7
+	adcs	x26,x26,x16
+	umulh	x16,x10,x7
+	adcs	x19,x19,x17
+	umulh	x17,x11,x7
+	stp	x21,x22,[x2],#8*2	// t[2..3]
+	adc	x20,xzr,xzr		// t[9]
+	adds	x23,x23,x14
+	umulh	x14,x12,x7
+	adcs	x24,x24,x15
+	umulh	x15,x13,x7
+	adcs	x25,x25,x16
+	mul	x16,x9,x8		// lo(a[3..7]*a[2])		(iii)
+	adcs	x26,x26,x17
+	mul	x17,x10,x8
+	adcs	x19,x19,x14
+	mul	x14,x11,x8
+	adc	x20,x20,x15
+
+	mul	x15,x12,x8
+	adds	x24,x24,x16
+	mul	x16,x13,x8
+	adcs	x25,x25,x17
+	umulh	x17,x9,x8		// hi(a[3..7]*a[2])
+	adcs	x26,x26,x14
+	umulh	x14,x10,x8
+	adcs	x19,x19,x15
+	umulh	x15,x11,x8
+	adcs	x20,x20,x16
+	umulh	x16,x12,x8
+	stp	x23,x24,[x2],#8*2	// t[4..5]
+	adc	x21,xzr,xzr		// t[10]
+	adds	x25,x25,x17
+	umulh	x17,x13,x8
+	adcs	x26,x26,x14
+	mul	x14,x10,x9		// lo(a[4..7]*a[3])		(iv)
+	adcs	x19,x19,x15
+	mul	x15,x11,x9
+	adcs	x20,x20,x16
+	mul	x16,x12,x9
+	adc	x21,x21,x17
+
+	mul	x17,x13,x9
+	adds	x26,x26,x14
+	umulh	x14,x10,x9		// hi(a[4..7]*a[3])
+	adcs	x19,x19,x15
+	umulh	x15,x11,x9
+	adcs	x20,x20,x16
+	umulh	x16,x12,x9
+	adcs	x21,x21,x17
+	umulh	x17,x13,x9
+	stp	x25,x26,[x2],#8*2	// t[6..7]
+	adc	x22,xzr,xzr		// t[11]
+	adds	x19,x19,x14
+	mul	x14,x11,x10		// lo(a[5..7]*a[4])		(v)
+	adcs	x20,x20,x15
+	mul	x15,x12,x10
+	adcs	x21,x21,x16
+	mul	x16,x13,x10
+	adc	x22,x22,x17
+
+	umulh	x17,x11,x10		// hi(a[5..7]*a[4])
+	adds	x20,x20,x14
+	umulh	x14,x12,x10
+	adcs	x21,x21,x15
+	umulh	x15,x13,x10
+	adcs	x22,x22,x16
+	mul	x16,x12,x11		// lo(a[6..7]*a[5])		(vi)
+	adc	x23,xzr,xzr		// t[12]
+	adds	x21,x21,x17
+	mul	x17,x13,x11
+	adcs	x22,x22,x14
+	umulh	x14,x12,x11		// hi(a[6..7]*a[5])
+	adc	x23,x23,x15
+
+	umulh	x15,x13,x11
+	adds	x22,x22,x16
+	mul	x16,x13,x12		// lo(a[7]*a[6])		(vii)
+	adcs	x23,x23,x17
+	umulh	x17,x13,x12		// hi(a[7]*a[6])
+	adc	x24,xzr,xzr		// t[13]
+	adds	x23,x23,x14
+	sub	x27,x3,x1	// done yet?
+	adc	x24,x24,x15
+
+	adds	x24,x24,x16
+	sub	x14,x3,x5	// rewound ap
+	adc	x25,xzr,xzr		// t[14]
+	add	x25,x25,x17
+
+	cbz	x27,.Lsqr8x_outer_break
+
+	mov	x4,x6
+	ldp	x6,x7,[x2,#8*0]
+	ldp	x8,x9,[x2,#8*2]
+	ldp	x10,x11,[x2,#8*4]
+	ldp	x12,x13,[x2,#8*6]
+	adds	x19,x19,x6
+	adcs	x20,x20,x7
+	ldp	x6,x7,[x1,#8*0]
+	adcs	x21,x21,x8
+	adcs	x22,x22,x9
+	ldp	x8,x9,[x1,#8*2]
+	adcs	x23,x23,x10
+	adcs	x24,x24,x11
+	ldp	x10,x11,[x1,#8*4]
+	adcs	x25,x25,x12
+	mov	x0,x1
+	adcs	x26,xzr,x13
+	ldp	x12,x13,[x1,#8*6]
+	add	x1,x1,#8*8
+	//adc	x28,xzr,xzr		// moved below
+	mov	x27,#-8*8
+
+	//                                                         a[8]a[0]
+	//                                                     a[9]a[0]
+	//                                                 a[a]a[0]
+	//                                             a[b]a[0]
+	//                                         a[c]a[0]
+	//                                     a[d]a[0]
+	//                                 a[e]a[0]
+	//                             a[f]a[0]
+	//                                                     a[8]a[1]
+	//                         a[f]a[1]........................
+	//                                                 a[8]a[2]
+	//                     a[f]a[2]........................
+	//                                             a[8]a[3]
+	//                 a[f]a[3]........................
+	//                                         a[8]a[4]
+	//             a[f]a[4]........................
+	//                                     a[8]a[5]
+	//         a[f]a[5]........................
+	//                                 a[8]a[6]
+	//     a[f]a[6]........................
+	//                             a[8]a[7]
+	// a[f]a[7]........................
+.Lsqr8x_mul:
+	mul	x14,x6,x4
+	adc	x28,xzr,xzr		// carry bit, modulo-scheduled
+	mul	x15,x7,x4
+	add	x27,x27,#8
+	mul	x16,x8,x4
+	mul	x17,x9,x4
+	adds	x19,x19,x14
+	mul	x14,x10,x4
+	adcs	x20,x20,x15
+	mul	x15,x11,x4
+	adcs	x21,x21,x16
+	mul	x16,x12,x4
+	adcs	x22,x22,x17
+	mul	x17,x13,x4
+	adcs	x23,x23,x14
+	umulh	x14,x6,x4
+	adcs	x24,x24,x15
+	umulh	x15,x7,x4
+	adcs	x25,x25,x16
+	umulh	x16,x8,x4
+	adcs	x26,x26,x17
+	umulh	x17,x9,x4
+	adc	x28,x28,xzr
+	str	x19,[x2],#8
+	adds	x19,x20,x14
+	umulh	x14,x10,x4
+	adcs	x20,x21,x15
+	umulh	x15,x11,x4
+	adcs	x21,x22,x16
+	umulh	x16,x12,x4
+	adcs	x22,x23,x17
+	umulh	x17,x13,x4
+	ldr	x4,[x0,x27]
+	adcs	x23,x24,x14
+	adcs	x24,x25,x15
+	adcs	x25,x26,x16
+	adcs	x26,x28,x17
+	//adc	x28,xzr,xzr		// moved above
+	cbnz	x27,.Lsqr8x_mul
+					// note that carry flag is guaranteed
+					// to be zero at this point
+	cmp	x1,x3		// done yet?
+	b.eq	.Lsqr8x_break
+
+	ldp	x6,x7,[x2,#8*0]
+	ldp	x8,x9,[x2,#8*2]
+	ldp	x10,x11,[x2,#8*4]
+	ldp	x12,x13,[x2,#8*6]
+	adds	x19,x19,x6
+	ldr	x4,[x0,#-8*8]
+	adcs	x20,x20,x7
+	ldp	x6,x7,[x1,#8*0]
+	adcs	x21,x21,x8
+	adcs	x22,x22,x9
+	ldp	x8,x9,[x1,#8*2]
+	adcs	x23,x23,x10
+	adcs	x24,x24,x11
+	ldp	x10,x11,[x1,#8*4]
+	adcs	x25,x25,x12
+	mov	x27,#-8*8
+	adcs	x26,x26,x13
+	ldp	x12,x13,[x1,#8*6]
+	add	x1,x1,#8*8
+	//adc	x28,xzr,xzr		// moved above
+	b	.Lsqr8x_mul
+
+.align	4
+.Lsqr8x_break:
+	ldp	x6,x7,[x0,#8*0]
+	add	x1,x0,#8*8
+	ldp	x8,x9,[x0,#8*2]
+	sub	x14,x3,x1		// is it last iteration?
+	ldp	x10,x11,[x0,#8*4]
+	sub	x15,x2,x14
+	ldp	x12,x13,[x0,#8*6]
+	cbz	x14,.Lsqr8x_outer_loop
+
+	stp	x19,x20,[x2,#8*0]
+	ldp	x19,x20,[x15,#8*0]
+	stp	x21,x22,[x2,#8*2]
+	ldp	x21,x22,[x15,#8*2]
+	stp	x23,x24,[x2,#8*4]
+	ldp	x23,x24,[x15,#8*4]
+	stp	x25,x26,[x2,#8*6]
+	mov	x2,x15
+	ldp	x25,x26,[x15,#8*6]
+	b	.Lsqr8x_outer_loop
+
+.align	4
+.Lsqr8x_outer_break:
+	// Now multiply above result by 2 and add a[n-1]*a[n-1]|...|a[0]*a[0]
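+	// That is, writing B = 2^64 for the word base,
+	//   (sum_i a[i]*B^i)^2 = sum_i a[i]^2*B^(2i)
+	//                        + 2*sum_{i<j} a[i]*a[j]*B^(i+j),
+	// so the cross products accumulated above only need a 1-bit
+	// left shift (the lsl#1/extr#63 below) before the a[i]*a[i]
+	// terms are added in.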
+	ldp	x7,x9,[x14,#8*0]	// recall that x14 is &a[0]
+	ldp	x15,x16,[sp,#8*1]
+	ldp	x11,x13,[x14,#8*2]
+	add	x1,x14,#8*4
+	ldp	x17,x14,[sp,#8*3]
+
+	stp	x19,x20,[x2,#8*0]
+	mul	x19,x7,x7
+	stp	x21,x22,[x2,#8*2]
+	umulh	x7,x7,x7
+	stp	x23,x24,[x2,#8*4]
+	mul	x8,x9,x9
+	stp	x25,x26,[x2,#8*6]
+	mov	x2,sp
+	umulh	x9,x9,x9
+	adds	x20,x7,x15,lsl#1
+	extr	x15,x16,x15,#63
+	sub	x27,x5,#8*4
+
+.Lsqr4x_shift_n_add:
+	adcs	x21,x8,x15
+	extr	x16,x17,x16,#63
+	sub	x27,x27,#8*4
+	adcs	x22,x9,x16
+	ldp	x15,x16,[x2,#8*5]
+	mul	x10,x11,x11
+	ldp	x7,x9,[x1],#8*2
+	umulh	x11,x11,x11
+	mul	x12,x13,x13
+	umulh	x13,x13,x13
+	extr	x17,x14,x17,#63
+	stp	x19,x20,[x2,#8*0]
+	adcs	x23,x10,x17
+	extr	x14,x15,x14,#63
+	stp	x21,x22,[x2,#8*2]
+	adcs	x24,x11,x14
+	ldp	x17,x14,[x2,#8*7]
+	extr	x15,x16,x15,#63
+	adcs	x25,x12,x15
+	extr	x16,x17,x16,#63
+	adcs	x26,x13,x16
+	ldp	x15,x16,[x2,#8*9]
+	mul	x6,x7,x7
+	ldp	x11,x13,[x1],#8*2
+	umulh	x7,x7,x7
+	mul	x8,x9,x9
+	umulh	x9,x9,x9
+	stp	x23,x24,[x2,#8*4]
+	extr	x17,x14,x17,#63
+	stp	x25,x26,[x2,#8*6]
+	add	x2,x2,#8*8
+	adcs	x19,x6,x17
+	extr	x14,x15,x14,#63
+	adcs	x20,x7,x14
+	ldp	x17,x14,[x2,#8*3]
+	extr	x15,x16,x15,#63
+	cbnz	x27,.Lsqr4x_shift_n_add
+	ldp	x1,x4,[x29,#104]	// pull np and n0
+
+	adcs	x21,x8,x15
+	extr	x16,x17,x16,#63
+	adcs	x22,x9,x16
+	ldp	x15,x16,[x2,#8*5]
+	mul	x10,x11,x11
+	umulh	x11,x11,x11
+	stp	x19,x20,[x2,#8*0]
+	mul	x12,x13,x13
+	umulh	x13,x13,x13
+	stp	x21,x22,[x2,#8*2]
+	extr	x17,x14,x17,#63
+	adcs	x23,x10,x17
+	extr	x14,x15,x14,#63
+	ldp	x19,x20,[sp,#8*0]
+	adcs	x24,x11,x14
+	extr	x15,x16,x15,#63
+	ldp	x6,x7,[x1,#8*0]
+	adcs	x25,x12,x15
+	extr	x16,xzr,x16,#63
+	ldp	x8,x9,[x1,#8*2]
+	adc	x26,x13,x16
+	ldp	x10,x11,[x1,#8*4]
+
+	// Reduce by 512 bits per iteration
+	mul	x28,x4,x19		// t[0]*n0
+	ldp	x12,x13,[x1,#8*6]
+	add	x3,x1,x5
+	ldp	x21,x22,[sp,#8*2]
+	stp	x23,x24,[x2,#8*4]
+	ldp	x23,x24,[sp,#8*4]
+	stp	x25,x26,[x2,#8*6]
+	ldp	x25,x26,[sp,#8*6]
+	add	x1,x1,#8*8
+	mov	x30,xzr		// initial top-most carry
+	mov	x2,sp
+	mov	x27,#8
+
+.Lsqr8x_reduction:
+	// (*)	mul	x14,x6,x28	// lo(n[0-7])*lo(t[0]*n0)
+	mul	x15,x7,x28
+	sub	x27,x27,#1
+	mul	x16,x8,x28
+	str	x28,[x2],#8		// put aside t[0]*n0 for tail processing
+	mul	x17,x9,x28
+	// (*)	adds	xzr,x19,x14
+	subs	xzr,x19,#1		// (*)
+	mul	x14,x10,x28
+	adcs	x19,x20,x15
+	mul	x15,x11,x28
+	adcs	x20,x21,x16
+	mul	x16,x12,x28
+	adcs	x21,x22,x17
+	mul	x17,x13,x28
+	adcs	x22,x23,x14
+	umulh	x14,x6,x28		// hi(n[0-7])*lo(t[0]*n0)
+	adcs	x23,x24,x15
+	umulh	x15,x7,x28
+	adcs	x24,x25,x16
+	umulh	x16,x8,x28
+	adcs	x25,x26,x17
+	umulh	x17,x9,x28
+	adc	x26,xzr,xzr
+	adds	x19,x19,x14
+	umulh	x14,x10,x28
+	adcs	x20,x20,x15
+	umulh	x15,x11,x28
+	adcs	x21,x21,x16
+	umulh	x16,x12,x28
+	adcs	x22,x22,x17
+	umulh	x17,x13,x28
+	mul	x28,x4,x19		// next t[0]*n0
+	adcs	x23,x23,x14
+	adcs	x24,x24,x15
+	adcs	x25,x25,x16
+	adc	x26,x26,x17
+	cbnz	x27,.Lsqr8x_reduction
+
+	ldp	x14,x15,[x2,#8*0]
+	ldp	x16,x17,[x2,#8*2]
+	mov	x0,x2
+	sub	x27,x3,x1	// done yet?
+	adds	x19,x19,x14
+	adcs	x20,x20,x15
+	ldp	x14,x15,[x2,#8*4]
+	adcs	x21,x21,x16
+	adcs	x22,x22,x17
+	ldp	x16,x17,[x2,#8*6]
+	adcs	x23,x23,x14
+	adcs	x24,x24,x15
+	adcs	x25,x25,x16
+	adcs	x26,x26,x17
+	//adc	x28,xzr,xzr		// moved below
+	cbz	x27,.Lsqr8x8_post_condition
+
+	ldr	x4,[x2,#-8*8]
+	ldp	x6,x7,[x1,#8*0]
+	ldp	x8,x9,[x1,#8*2]
+	ldp	x10,x11,[x1,#8*4]
+	mov	x27,#-8*8
+	ldp	x12,x13,[x1,#8*6]
+	add	x1,x1,#8*8
+
+.Lsqr8x_tail:
+	mul	x14,x6,x4
+	adc	x28,xzr,xzr		// carry bit, modulo-scheduled
+	mul	x15,x7,x4
+	add	x27,x27,#8
+	mul	x16,x8,x4
+	mul	x17,x9,x4
+	adds	x19,x19,x14
+	mul	x14,x10,x4
+	adcs	x20,x20,x15
+	mul	x15,x11,x4
+	adcs	x21,x21,x16
+	mul	x16,x12,x4
+	adcs	x22,x22,x17
+	mul	x17,x13,x4
+	adcs	x23,x23,x14
+	umulh	x14,x6,x4
+	adcs	x24,x24,x15
+	umulh	x15,x7,x4
+	adcs	x25,x25,x16
+	umulh	x16,x8,x4
+	adcs	x26,x26,x17
+	umulh	x17,x9,x4
+	adc	x28,x28,xzr
+	str	x19,[x2],#8
+	adds	x19,x20,x14
+	umulh	x14,x10,x4
+	adcs	x20,x21,x15
+	umulh	x15,x11,x4
+	adcs	x21,x22,x16
+	umulh	x16,x12,x4
+	adcs	x22,x23,x17
+	umulh	x17,x13,x4
+	ldr	x4,[x0,x27]
+	adcs	x23,x24,x14
+	adcs	x24,x25,x15
+	adcs	x25,x26,x16
+	adcs	x26,x28,x17
+	//adc	x28,xzr,xzr		// moved above
+	cbnz	x27,.Lsqr8x_tail
+					// note that carry flag is guaranteed
+					// to be zero at this point
+	ldp	x6,x7,[x2,#8*0]
+	sub	x27,x3,x1	// done yet?
+	sub	x16,x3,x5	// rewound np
+	ldp	x8,x9,[x2,#8*2]
+	ldp	x10,x11,[x2,#8*4]
+	ldp	x12,x13,[x2,#8*6]
+	cbz	x27,.Lsqr8x_tail_break
+
+	ldr	x4,[x0,#-8*8]
+	adds	x19,x19,x6
+	adcs	x20,x20,x7
+	ldp	x6,x7,[x1,#8*0]
+	adcs	x21,x21,x8
+	adcs	x22,x22,x9
+	ldp	x8,x9,[x1,#8*2]
+	adcs	x23,x23,x10
+	adcs	x24,x24,x11
+	ldp	x10,x11,[x1,#8*4]
+	adcs	x25,x25,x12
+	mov	x27,#-8*8
+	adcs	x26,x26,x13
+	ldp	x12,x13,[x1,#8*6]
+	add	x1,x1,#8*8
+	//adc	x28,xzr,xzr		// moved above
+	b	.Lsqr8x_tail
+
+.align	4
+.Lsqr8x_tail_break:
+	ldr	x4,[x29,#112]		// pull n0
+	add	x27,x2,#8*8		// end of current t[num] window
+
+	subs	xzr,x30,#1		// "move" top-most carry to carry bit
+	adcs	x14,x19,x6
+	adcs	x15,x20,x7
+	ldp	x19,x20,[x0,#8*0]
+	adcs	x21,x21,x8
+	ldp	x6,x7,[x16,#8*0]	// recall that x16 is &n[0]
+	adcs	x22,x22,x9
+	ldp	x8,x9,[x16,#8*2]
+	adcs	x23,x23,x10
+	adcs	x24,x24,x11
+	ldp	x10,x11,[x16,#8*4]
+	adcs	x25,x25,x12
+	adcs	x26,x26,x13
+	ldp	x12,x13,[x16,#8*6]
+	add	x1,x16,#8*8
+	adc	x30,xzr,xzr	// top-most carry
+	mul	x28,x4,x19
+	stp	x14,x15,[x2,#8*0]
+	stp	x21,x22,[x2,#8*2]
+	ldp	x21,x22,[x0,#8*2]
+	stp	x23,x24,[x2,#8*4]
+	ldp	x23,x24,[x0,#8*4]
+	cmp	x27,x29		// did we hit the bottom?
+	stp	x25,x26,[x2,#8*6]
+	mov	x2,x0			// slide the window
+	ldp	x25,x26,[x0,#8*6]
+	mov	x27,#8
+	b.ne	.Lsqr8x_reduction
+
+	// Final step. We see if result is larger than modulus, and
+	// if it is, subtract the modulus. But comparison implies
+	// subtraction. So we subtract modulus, see if it borrowed,
+	// and conditionally copy original value.
+	ldr	x0,[x29,#96]		// pull rp
+	add	x2,x2,#8*8
+	subs	x14,x19,x6
+	sbcs	x15,x20,x7
+	sub	x27,x5,#8*8
+	mov	x3,x0		// x0 copy
+
+.Lsqr8x_sub:
+	sbcs	x16,x21,x8
+	ldp	x6,x7,[x1,#8*0]
+	sbcs	x17,x22,x9
+	stp	x14,x15,[x0,#8*0]
+	sbcs	x14,x23,x10
+	ldp	x8,x9,[x1,#8*2]
+	sbcs	x15,x24,x11
+	stp	x16,x17,[x0,#8*2]
+	sbcs	x16,x25,x12
+	ldp	x10,x11,[x1,#8*4]
+	sbcs	x17,x26,x13
+	ldp	x12,x13,[x1,#8*6]
+	add	x1,x1,#8*8
+	ldp	x19,x20,[x2,#8*0]
+	sub	x27,x27,#8*8
+	ldp	x21,x22,[x2,#8*2]
+	ldp	x23,x24,[x2,#8*4]
+	ldp	x25,x26,[x2,#8*6]
+	add	x2,x2,#8*8
+	stp	x14,x15,[x0,#8*4]
+	sbcs	x14,x19,x6
+	stp	x16,x17,[x0,#8*6]
+	add	x0,x0,#8*8
+	sbcs	x15,x20,x7
+	cbnz	x27,.Lsqr8x_sub
+
+	sbcs	x16,x21,x8
+	mov	x2,sp
+	add	x1,sp,x5
+	ldp	x6,x7,[x3,#8*0]
+	sbcs	x17,x22,x9
+	stp	x14,x15,[x0,#8*0]
+	sbcs	x14,x23,x10
+	ldp	x8,x9,[x3,#8*2]
+	sbcs	x15,x24,x11
+	stp	x16,x17,[x0,#8*2]
+	sbcs	x16,x25,x12
+	ldp	x19,x20,[x1,#8*0]
+	sbcs	x17,x26,x13
+	ldp	x21,x22,[x1,#8*2]
+	sbcs	xzr,x30,xzr	// did it borrow?
+	ldr	x30,[x29,#8]		// pull return address
+	stp	x14,x15,[x0,#8*4]
+	stp	x16,x17,[x0,#8*6]
+
+	sub	x27,x5,#8*4
+.Lsqr4x_cond_copy:
+	sub	x27,x27,#8*4
+	csel	x14,x19,x6,lo
+	stp	xzr,xzr,[x2,#8*0]
+	csel	x15,x20,x7,lo
+	ldp	x6,x7,[x3,#8*4]
+	ldp	x19,x20,[x1,#8*4]
+	csel	x16,x21,x8,lo
+	stp	xzr,xzr,[x2,#8*2]
+	add	x2,x2,#8*4
+	csel	x17,x22,x9,lo
+	ldp	x8,x9,[x3,#8*6]
+	ldp	x21,x22,[x1,#8*6]
+	add	x1,x1,#8*4
+	stp	x14,x15,[x3,#8*0]
+	stp	x16,x17,[x3,#8*2]
+	add	x3,x3,#8*4
+	stp	xzr,xzr,[x1,#8*0]
+	stp	xzr,xzr,[x1,#8*2]
+	cbnz	x27,.Lsqr4x_cond_copy
+
+	csel	x14,x19,x6,lo
+	stp	xzr,xzr,[x2,#8*0]
+	csel	x15,x20,x7,lo
+	stp	xzr,xzr,[x2,#8*2]
+	csel	x16,x21,x8,lo
+	csel	x17,x22,x9,lo
+	stp	x14,x15,[x3,#8*0]
+	stp	x16,x17,[x3,#8*2]
+
+	b	.Lsqr8x_done
+
+.align	4
+.Lsqr8x8_post_condition:
+	adc	x28,xzr,xzr
+	ldr	x30,[x29,#8]		// pull return address
+	// x19-x26,x28 hold result, x6-x13 hold modulus
+	subs	x6,x19,x6
+	ldr	x1,[x29,#96]		// pull rp
+	sbcs	x7,x20,x7
+	stp	xzr,xzr,[sp,#8*0]
+	sbcs	x8,x21,x8
+	stp	xzr,xzr,[sp,#8*2]
+	sbcs	x9,x22,x9
+	stp	xzr,xzr,[sp,#8*4]
+	sbcs	x10,x23,x10
+	stp	xzr,xzr,[sp,#8*6]
+	sbcs	x11,x24,x11
+	stp	xzr,xzr,[sp,#8*8]
+	sbcs	x12,x25,x12
+	stp	xzr,xzr,[sp,#8*10]
+	sbcs	x13,x26,x13
+	stp	xzr,xzr,[sp,#8*12]
+	sbcs	x28,x28,xzr	// did it borrow?
+	stp	xzr,xzr,[sp,#8*14]
+
+	// x6-7 hold result-modulus
+	csel	x6,x19,x6,lo
+	csel	x7,x20,x7,lo
+	csel	x8,x21,x8,lo
+	csel	x9,x22,x9,lo
+	stp	x6,x7,[x1,#8*0]
+	csel	x10,x23,x10,lo
+	csel	x11,x24,x11,lo
+	stp	x8,x9,[x1,#8*2]
+	csel	x12,x25,x12,lo
+	csel	x13,x26,x13,lo
+	stp	x10,x11,[x1,#8*4]
+	stp	x12,x13,[x1,#8*6]
+
+.Lsqr8x_done:
+	ldp	x19,x20,[x29,#16]
+	mov	sp,x29
+	ldp	x21,x22,[x29,#32]
+	mov	x0,#1
+	ldp	x23,x24,[x29,#48]
+	ldp	x25,x26,[x29,#64]
+	ldp	x27,x28,[x29,#80]
+	ldr	x29,[sp],#128
+	// x30 is popped earlier
+	AARCH64_VALIDATE_LINK_REGISTER
+	ret
+.size	__bn_sqr8x_mont,.-__bn_sqr8x_mont
+.type	__bn_mul4x_mont,%function
+.align	5
+__bn_mul4x_mont:
+	// Not adding AARCH64_SIGN_LINK_REGISTER here because __bn_mul4x_mont is jumped to
+	// only from bn_mul_mont or __bn_mul8x_mont which have already signed the
+	// return address.
+	stp	x29,x30,[sp,#-128]!
+	add	x29,sp,#0
+	stp	x19,x20,[sp,#16]
+	stp	x21,x22,[sp,#32]
+	stp	x23,x24,[sp,#48]
+	stp	x25,x26,[sp,#64]
+	stp	x27,x28,[sp,#80]
+
+	sub	x26,sp,x5,lsl#3
+	lsl	x5,x5,#3
+	ldr	x4,[x4]		// *n0
+	sub	sp,x26,#8*4		// alloca
+
+	add	x10,x2,x5
+	add	x27,x1,x5
+	stp	x0,x10,[x29,#96]	// offload rp and &b[num]
+
+	ldr	x24,[x2,#8*0]		// b[0]
+	ldp	x6,x7,[x1,#8*0]	// a[0..3]
+	ldp	x8,x9,[x1,#8*2]
+	add	x1,x1,#8*4
+	mov	x19,xzr
+	mov	x20,xzr
+	mov	x21,xzr
+	mov	x22,xzr
+	ldp	x14,x15,[x3,#8*0]	// n[0..3]
+	ldp	x16,x17,[x3,#8*2]
+	adds	x3,x3,#8*4		// clear carry bit
+	mov	x0,xzr
+	mov	x28,#0
+	mov	x26,sp
+
+.Loop_mul4x_1st_reduction:
+	mul	x10,x6,x24		// lo(a[0..3]*b[0])
+	adc	x0,x0,xzr	// modulo-scheduled
+	mul	x11,x7,x24
+	add	x28,x28,#8
+	mul	x12,x8,x24
+	and	x28,x28,#31
+	mul	x13,x9,x24
+	adds	x19,x19,x10
+	umulh	x10,x6,x24		// hi(a[0..3]*b[0])
+	adcs	x20,x20,x11
+	mul	x25,x19,x4		// t[0]*n0
+	adcs	x21,x21,x12
+	umulh	x11,x7,x24
+	adcs	x22,x22,x13
+	umulh	x12,x8,x24
+	adc	x23,xzr,xzr
+	umulh	x13,x9,x24
+	ldr	x24,[x2,x28]		// next b[i] (or b[0])
+	adds	x20,x20,x10
+	// (*)	mul	x10,x14,x25	// lo(n[0..3]*t[0]*n0)
+	str	x25,[x26],#8		// put aside t[0]*n0 for tail processing
+	adcs	x21,x21,x11
+	mul	x11,x15,x25
+	adcs	x22,x22,x12
+	mul	x12,x16,x25
+	adc	x23,x23,x13		// can't overflow
+	mul	x13,x17,x25
+	// (*)	adds	xzr,x19,x10
+	subs	xzr,x19,#1		// (*)
+	umulh	x10,x14,x25		// hi(n[0..3]*t[0]*n0)
+	adcs	x19,x20,x11
+	umulh	x11,x15,x25
+	adcs	x20,x21,x12
+	umulh	x12,x16,x25
+	adcs	x21,x22,x13
+	umulh	x13,x17,x25
+	adcs	x22,x23,x0
+	adc	x0,xzr,xzr
+	adds	x19,x19,x10
+	sub	x10,x27,x1
+	adcs	x20,x20,x11
+	adcs	x21,x21,x12
+	adcs	x22,x22,x13
+	//adc	x0,x0,xzr
+	cbnz	x28,.Loop_mul4x_1st_reduction
+
+	cbz	x10,.Lmul4x4_post_condition
+
+	ldp	x6,x7,[x1,#8*0]	// a[4..7]
+	ldp	x8,x9,[x1,#8*2]
+	add	x1,x1,#8*4
+	ldr	x25,[sp]		// a[0]*n0
+	ldp	x14,x15,[x3,#8*0]	// n[4..7]
+	ldp	x16,x17,[x3,#8*2]
+	add	x3,x3,#8*4
+
+.Loop_mul4x_1st_tail:
+	mul	x10,x6,x24		// lo(a[4..7]*b[i])
+	adc	x0,x0,xzr	// modulo-scheduled
+	mul	x11,x7,x24
+	add	x28,x28,#8
+	mul	x12,x8,x24
+	and	x28,x28,#31
+	mul	x13,x9,x24
+	adds	x19,x19,x10
+	umulh	x10,x6,x24		// hi(a[4..7]*b[i])
+	adcs	x20,x20,x11
+	umulh	x11,x7,x24
+	adcs	x21,x21,x12
+	umulh	x12,x8,x24
+	adcs	x22,x22,x13
+	umulh	x13,x9,x24
+	adc	x23,xzr,xzr
+	ldr	x24,[x2,x28]		// next b[i] (or b[0])
+	adds	x20,x20,x10
+	mul	x10,x14,x25		// lo(n[4..7]*a[0]*n0)
+	adcs	x21,x21,x11
+	mul	x11,x15,x25
+	adcs	x22,x22,x12
+	mul	x12,x16,x25
+	adc	x23,x23,x13		// can't overflow
+	mul	x13,x17,x25
+	adds	x19,x19,x10
+	umulh	x10,x14,x25		// hi(n[4..7]*a[0]*n0)
+	adcs	x20,x20,x11
+	umulh	x11,x15,x25
+	adcs	x21,x21,x12
+	umulh	x12,x16,x25
+	adcs	x22,x22,x13
+	adcs	x23,x23,x0
+	umulh	x13,x17,x25
+	adc	x0,xzr,xzr
+	ldr	x25,[sp,x28]		// next t[0]*n0
+	str	x19,[x26],#8		// result!!!
+	adds	x19,x20,x10
+	sub	x10,x27,x1		// done yet?
+	adcs	x20,x21,x11
+	adcs	x21,x22,x12
+	adcs	x22,x23,x13
+	//adc	x0,x0,xzr
+	cbnz	x28,.Loop_mul4x_1st_tail
+
+	sub	x11,x27,x5	// rewound x1
+	cbz	x10,.Lmul4x_proceed
+
+	ldp	x6,x7,[x1,#8*0]
+	ldp	x8,x9,[x1,#8*2]
+	add	x1,x1,#8*4
+	ldp	x14,x15,[x3,#8*0]
+	ldp	x16,x17,[x3,#8*2]
+	add	x3,x3,#8*4
+	b	.Loop_mul4x_1st_tail
+
+.align	5
+.Lmul4x_proceed:
+	ldr	x24,[x2,#8*4]!		// *++b
+	adc	x30,x0,xzr
+	ldp	x6,x7,[x11,#8*0]	// a[0..3]
+	sub	x3,x3,x5		// rewind np
+	ldp	x8,x9,[x11,#8*2]
+	add	x1,x11,#8*4
+
+	stp	x19,x20,[x26,#8*0]	// result!!!
+	ldp	x19,x20,[sp,#8*4]	// t[0..3]
+	stp	x21,x22,[x26,#8*2]	// result!!!
+	ldp	x21,x22,[sp,#8*6]
+
+	ldp	x14,x15,[x3,#8*0]	// n[0..3]
+	mov	x26,sp
+	ldp	x16,x17,[x3,#8*2]
+	adds	x3,x3,#8*4		// clear carry bit
+	mov	x0,xzr
+
+.align	4
+.Loop_mul4x_reduction:
+	mul	x10,x6,x24		// lo(a[0..3]*b[4])
+	adc	x0,x0,xzr	// modulo-scheduled
+	mul	x11,x7,x24
+	add	x28,x28,#8
+	mul	x12,x8,x24
+	and	x28,x28,#31
+	mul	x13,x9,x24
+	adds	x19,x19,x10
+	umulh	x10,x6,x24		// hi(a[0..3]*b[4])
+	adcs	x20,x20,x11
+	mul	x25,x19,x4		// t[0]*n0
+	adcs	x21,x21,x12
+	umulh	x11,x7,x24
+	adcs	x22,x22,x13
+	umulh	x12,x8,x24
+	adc	x23,xzr,xzr
+	umulh	x13,x9,x24
+	ldr	x24,[x2,x28]		// next b[i]
+	adds	x20,x20,x10
+	// (*)	mul	x10,x14,x25
+	str	x25,[x26],#8		// put aside t[0]*n0 for tail processing
+	adcs	x21,x21,x11
+	mul	x11,x15,x25		// lo(n[0..3]*t[0]*n0)
+	adcs	x22,x22,x12
+	mul	x12,x16,x25
+	adc	x23,x23,x13		// can't overflow
+	mul	x13,x17,x25
+	// (*)	adds	xzr,x19,x10
+	subs	xzr,x19,#1		// (*)
+	umulh	x10,x14,x25		// hi(n[0..3]*t[0]*n0)
+	adcs	x19,x20,x11
+	umulh	x11,x15,x25
+	adcs	x20,x21,x12
+	umulh	x12,x16,x25
+	adcs	x21,x22,x13
+	umulh	x13,x17,x25
+	adcs	x22,x23,x0
+	adc	x0,xzr,xzr
+	adds	x19,x19,x10
+	adcs	x20,x20,x11
+	adcs	x21,x21,x12
+	adcs	x22,x22,x13
+	//adc	x0,x0,xzr
+	cbnz	x28,.Loop_mul4x_reduction
+
+	adc	x0,x0,xzr
+	ldp	x10,x11,[x26,#8*4]	// t[4..7]
+	ldp	x12,x13,[x26,#8*6]
+	ldp	x6,x7,[x1,#8*0]	// a[4..7]
+	ldp	x8,x9,[x1,#8*2]
+	add	x1,x1,#8*4
+	adds	x19,x19,x10
+	adcs	x20,x20,x11
+	adcs	x21,x21,x12
+	adcs	x22,x22,x13
+	//adc	x0,x0,xzr
+
+	ldr	x25,[sp]		// t[0]*n0
+	ldp	x14,x15,[x3,#8*0]	// n[4..7]
+	ldp	x16,x17,[x3,#8*2]
+	add	x3,x3,#8*4
+
+.align	4
+.Loop_mul4x_tail:
+	mul	x10,x6,x24		// lo(a[4..7]*b[4])
+	adc	x0,x0,xzr	// modulo-scheduled
+	mul	x11,x7,x24
+	add	x28,x28,#8
+	mul	x12,x8,x24
+	and	x28,x28,#31
+	mul	x13,x9,x24
+	adds	x19,x19,x10
+	umulh	x10,x6,x24		// hi(a[4..7]*b[4])
+	adcs	x20,x20,x11
+	umulh	x11,x7,x24
+	adcs	x21,x21,x12
+	umulh	x12,x8,x24
+	adcs	x22,x22,x13
+	umulh	x13,x9,x24
+	adc	x23,xzr,xzr
+	ldr	x24,[x2,x28]		// next b[i]
+	adds	x20,x20,x10
+	mul	x10,x14,x25		// lo(n[4..7]*t[0]*n0)
+	adcs	x21,x21,x11
+	mul	x11,x15,x25
+	adcs	x22,x22,x12
+	mul	x12,x16,x25
+	adc	x23,x23,x13		// can't overflow
+	mul	x13,x17,x25
+	adds	x19,x19,x10
+	umulh	x10,x14,x25		// hi(n[4..7]*t[0]*n0)
+	adcs	x20,x20,x11
+	umulh	x11,x15,x25
+	adcs	x21,x21,x12
+	umulh	x12,x16,x25
+	adcs	x22,x22,x13
+	umulh	x13,x17,x25
+	adcs	x23,x23,x0
+	ldr	x25,[sp,x28]		// next a[0]*n0
+	adc	x0,xzr,xzr
+	str	x19,[x26],#8		// result!!!
+	adds	x19,x20,x10
+	sub	x10,x27,x1		// done yet?
+	adcs	x20,x21,x11
+	adcs	x21,x22,x12
+	adcs	x22,x23,x13
+	//adc	x0,x0,xzr
+	cbnz	x28,.Loop_mul4x_tail
+
+	sub	x11,x3,x5		// rewound np?
+	adc	x0,x0,xzr
+	cbz	x10,.Loop_mul4x_break
+
+	ldp	x10,x11,[x26,#8*4]
+	ldp	x12,x13,[x26,#8*6]
+	ldp	x6,x7,[x1,#8*0]
+	ldp	x8,x9,[x1,#8*2]
+	add	x1,x1,#8*4
+	adds	x19,x19,x10
+	adcs	x20,x20,x11
+	adcs	x21,x21,x12
+	adcs	x22,x22,x13
+	//adc	x0,x0,xzr
+	ldp	x14,x15,[x3,#8*0]
+	ldp	x16,x17,[x3,#8*2]
+	add	x3,x3,#8*4
+	b	.Loop_mul4x_tail
+
+.align	4
+.Loop_mul4x_break:
+	ldp	x12,x13,[x29,#96]	// pull rp and &b[num]
+	adds	x19,x19,x30
+	add	x2,x2,#8*4		// bp++
+	adcs	x20,x20,xzr
+	sub	x1,x1,x5		// rewind ap
+	adcs	x21,x21,xzr
+	stp	x19,x20,[x26,#8*0]	// result!!!
+	adcs	x22,x22,xzr
+	ldp	x19,x20,[sp,#8*4]	// t[0..3]
+	adc	x30,x0,xzr
+	stp	x21,x22,[x26,#8*2]	// result!!!
+	cmp	x2,x13			// done yet?
+	ldp	x21,x22,[sp,#8*6]
+	ldp	x14,x15,[x11,#8*0]	// n[0..3]
+	ldp	x16,x17,[x11,#8*2]
+	add	x3,x11,#8*4
+	b.eq	.Lmul4x_post
+
+	ldr	x24,[x2]
+	ldp	x6,x7,[x1,#8*0]	// a[0..3]
+	ldp	x8,x9,[x1,#8*2]
+	adds	x1,x1,#8*4		// clear carry bit
+	mov	x0,xzr
+	mov	x26,sp
+	b	.Loop_mul4x_reduction
+
+.align	4
+.Lmul4x_post:
+	// Final step. We see if result is larger than modulus, and
+	// if it is, subtract the modulus. But comparison implies
+	// subtraction. So we subtract modulus, see if it borrowed,
+	// and conditionally copy original value.
+	mov	x0,x12
+	mov	x27,x12		// x0 copy
+	subs	x10,x19,x14
+	add	x26,sp,#8*8
+	sbcs	x11,x20,x15
+	sub	x28,x5,#8*4
+
+.Lmul4x_sub:
+	sbcs	x12,x21,x16
+	ldp	x14,x15,[x3,#8*0]
+	sub	x28,x28,#8*4
+	ldp	x19,x20,[x26,#8*0]
+	sbcs	x13,x22,x17
+	ldp	x16,x17,[x3,#8*2]
+	add	x3,x3,#8*4
+	ldp	x21,x22,[x26,#8*2]
+	add	x26,x26,#8*4
+	stp	x10,x11,[x0,#8*0]
+	sbcs	x10,x19,x14
+	stp	x12,x13,[x0,#8*2]
+	add	x0,x0,#8*4
+	sbcs	x11,x20,x15
+	cbnz	x28,.Lmul4x_sub
+
+	sbcs	x12,x21,x16
+	mov	x26,sp
+	add	x1,sp,#8*4
+	ldp	x6,x7,[x27,#8*0]
+	sbcs	x13,x22,x17
+	stp	x10,x11,[x0,#8*0]
+	ldp	x8,x9,[x27,#8*2]
+	stp	x12,x13,[x0,#8*2]
+	ldp	x19,x20,[x1,#8*0]
+	ldp	x21,x22,[x1,#8*2]
+	sbcs	xzr,x30,xzr	// did it borrow?
+	ldr	x30,[x29,#8]		// pull return address
+
+	sub	x28,x5,#8*4
+.Lmul4x_cond_copy:
+	sub	x28,x28,#8*4
+	csel	x10,x19,x6,lo
+	stp	xzr,xzr,[x26,#8*0]
+	csel	x11,x20,x7,lo
+	ldp	x6,x7,[x27,#8*4]
+	ldp	x19,x20,[x1,#8*4]
+	csel	x12,x21,x8,lo
+	stp	xzr,xzr,[x26,#8*2]
+	add	x26,x26,#8*4
+	csel	x13,x22,x9,lo
+	ldp	x8,x9,[x27,#8*6]
+	ldp	x21,x22,[x1,#8*6]
+	add	x1,x1,#8*4
+	stp	x10,x11,[x27,#8*0]
+	stp	x12,x13,[x27,#8*2]
+	add	x27,x27,#8*4
+	cbnz	x28,.Lmul4x_cond_copy
+
+	csel	x10,x19,x6,lo
+	stp	xzr,xzr,[x26,#8*0]
+	csel	x11,x20,x7,lo
+	stp	xzr,xzr,[x26,#8*2]
+	csel	x12,x21,x8,lo
+	stp	xzr,xzr,[x26,#8*3]
+	csel	x13,x22,x9,lo
+	stp	xzr,xzr,[x26,#8*4]
+	stp	x10,x11,[x27,#8*0]
+	stp	x12,x13,[x27,#8*2]
+
+	b	.Lmul4x_done
+
+.align	4
+.Lmul4x4_post_condition:
+	adc	x0,x0,xzr
+	ldr	x1,[x29,#96]		// pull rp
+	// x19-x22,x0 hold result, x14-x17 hold modulus
+	subs	x6,x19,x14
+	ldr	x30,[x29,#8]		// pull return address
+	sbcs	x7,x20,x15
+	stp	xzr,xzr,[sp,#8*0]
+	sbcs	x8,x21,x16
+	stp	xzr,xzr,[sp,#8*2]
+	sbcs	x9,x22,x17
+	stp	xzr,xzr,[sp,#8*4]
+	sbcs	xzr,x0,xzr		// did it borrow?
+	stp	xzr,xzr,[sp,#8*6]
+
+	// x6-x9 hold result-modulus
+	csel	x6,x19,x6,lo
+	csel	x7,x20,x7,lo
+	csel	x8,x21,x8,lo
+	csel	x9,x22,x9,lo
+	stp	x6,x7,[x1,#8*0]
+	stp	x8,x9,[x1,#8*2]
+
+.Lmul4x_done:
+	ldp	x19,x20,[x29,#16]
+	mov	sp,x29
+	ldp	x21,x22,[x29,#32]
+	mov	x0,#1
+	ldp	x23,x24,[x29,#48]
+	ldp	x25,x26,[x29,#64]
+	ldp	x27,x28,[x29,#80]
+	ldr	x29,[sp],#128
+	// x30 is popped earlier
+	AARCH64_VALIDATE_LINK_REGISTER
+	ret
+.size	__bn_mul4x_mont,.-__bn_mul4x_mont
+.byte	77,111,110,116,103,111,109,101,114,121,32,77,117,108,116,105,112,108,105,99,97,116,105,111,110,32,102,111,114,32,65,82,77,118,56,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0
+.align	2
+.align	4
+#endif
+#endif  // !OPENSSL_NO_ASM
+.section	.note.GNU-stack,"",%progbits
--- /dev/null
+++ b/third_party/boringssl/linux-aarch64/crypto/fipsmodule/ghash-neon-armv8.S
@@ -1,0 +1,346 @@
+// This file is generated from a similarly-named Perl script in the BoringSSL
+// source tree. Do not edit by hand.
+
+#if !defined(__has_feature)
+#define __has_feature(x) 0
+#endif
+#if __has_feature(memory_sanitizer) && !defined(OPENSSL_NO_ASM)
+#define OPENSSL_NO_ASM
+#endif
+
+#if !defined(OPENSSL_NO_ASM)
+#if defined(__aarch64__)
+#if defined(BORINGSSL_PREFIX)
+#include <boringssl_prefix_symbols_asm.h>
+#endif
+#include <openssl/arm_arch.h>
+
+.text
+
+.globl	gcm_init_neon
+.hidden	gcm_init_neon
+.type	gcm_init_neon,%function
+.align	4
+gcm_init_neon:
+	AARCH64_VALID_CALL_TARGET
+	// This function is adapted from gcm_init_v8. xC2 is t3.
+	ld1	{v17.2d}, [x1]			// load H
+	movi	v19.16b, #0xe1
+	shl	v19.2d, v19.2d, #57		// 0xc2.0
+	ext	v3.16b, v17.16b, v17.16b, #8
+	ushr	v18.2d, v19.2d, #63
+	dup	v17.4s, v17.s[1]
+	ext	v16.16b, v18.16b, v19.16b, #8	// t0=0xc2....01
+	ushr	v18.2d, v3.2d, #63
+	sshr	v17.4s, v17.4s, #31		// broadcast carry bit
+	and	v18.16b, v18.16b, v16.16b
+	shl	v3.2d, v3.2d, #1
+	ext	v18.16b, v18.16b, v18.16b, #8
+	and	v16.16b, v16.16b, v17.16b
+	orr	v3.16b, v3.16b, v18.16b	// H<<<=1
+	eor	v5.16b, v3.16b, v16.16b	// twisted H
+	st1	{v5.2d}, [x0]			// store Htable[0]
+	ret
+.size	gcm_init_neon,.-gcm_init_neon
+
+.globl	gcm_gmult_neon
+.hidden	gcm_gmult_neon
+.type	gcm_gmult_neon,%function
+.align	4
+gcm_gmult_neon:
+	AARCH64_VALID_CALL_TARGET
+	ld1	{v3.16b}, [x0]		// load Xi
+	ld1	{v5.1d}, [x1], #8		// load twisted H
+	ld1	{v6.1d}, [x1]
+	adrp	x9, .Lmasks		// load constants
+	add	x9, x9, :lo12:.Lmasks
+	ld1	{v24.2d, v25.2d}, [x9]
+	rev64	v3.16b, v3.16b		// byteswap Xi
+	ext	v3.16b, v3.16b, v3.16b, #8
+	eor	v7.8b, v5.8b, v6.8b	// Karatsuba pre-processing
+
+	mov	x3, #16
+	b	.Lgmult_neon
+.size	gcm_gmult_neon,.-gcm_gmult_neon
+
+.globl	gcm_ghash_neon
+.hidden	gcm_ghash_neon
+.type	gcm_ghash_neon,%function
+.align	4
+gcm_ghash_neon:
+	AARCH64_VALID_CALL_TARGET
+	ld1	{v0.16b}, [x0]		// load Xi
+	ld1	{v5.1d}, [x1], #8		// load twisted H
+	ld1	{v6.1d}, [x1]
+	adrp	x9, .Lmasks		// load constants
+	add	x9, x9, :lo12:.Lmasks
+	ld1	{v24.2d, v25.2d}, [x9]
+	rev64	v0.16b, v0.16b		// byteswap Xi
+	ext	v0.16b, v0.16b, v0.16b, #8
+	eor	v7.8b, v5.8b, v6.8b	// Karatsuba pre-processing
+
+.Loop_neon:
+	ld1	{v3.16b}, [x2], #16	// load inp
+	rev64	v3.16b, v3.16b		// byteswap inp
+	ext	v3.16b, v3.16b, v3.16b, #8
+	eor	v3.16b, v3.16b, v0.16b	// inp ^= Xi
+
+.Lgmult_neon:
+	// Split the input into v3 and v4. (The upper halves are unused,
+	// so it is okay to leave them alone.)
+	ins	v4.d[0], v3.d[1]
+	ext	v16.8b, v5.8b, v5.8b, #1	// A1
+	pmull	v16.8h, v16.8b, v3.8b		// F = A1*B
+	ext	v0.8b, v3.8b, v3.8b, #1		// B1
+	pmull	v0.8h, v5.8b, v0.8b		// E = A*B1
+	ext	v17.8b, v5.8b, v5.8b, #2	// A2
+	pmull	v17.8h, v17.8b, v3.8b		// H = A2*B
+	ext	v19.8b, v3.8b, v3.8b, #2	// B2
+	pmull	v19.8h, v5.8b, v19.8b		// G = A*B2
+	ext	v18.8b, v5.8b, v5.8b, #3	// A3
+	eor	v16.16b, v16.16b, v0.16b	// L = E + F
+	pmull	v18.8h, v18.8b, v3.8b		// J = A3*B
+	ext	v0.8b, v3.8b, v3.8b, #3		// B3
+	eor	v17.16b, v17.16b, v19.16b	// M = G + H
+	pmull	v0.8h, v5.8b, v0.8b		// I = A*B3
+
+	// Here we diverge from the 32-bit version. It computes the following
+	// (instructions reordered for clarity):
+	//
+	//     veor	$t0#lo, $t0#lo, $t0#hi	@ t0 = P0 + P1 (L)
+	//     vand	$t0#hi, $t0#hi, $k48
+	//     veor	$t0#lo, $t0#lo, $t0#hi
+	//
+	//     veor	$t1#lo, $t1#lo, $t1#hi	@ t1 = P2 + P3 (M)
+	//     vand	$t1#hi, $t1#hi, $k32
+	//     veor	$t1#lo, $t1#lo, $t1#hi
+	//
+	//     veor	$t2#lo, $t2#lo, $t2#hi	@ t2 = P4 + P5 (N)
+	//     vand	$t2#hi, $t2#hi, $k16
+	//     veor	$t2#lo, $t2#lo, $t2#hi
+	//
+	//     veor	$t3#lo, $t3#lo, $t3#hi	@ t3 = P6 + P7 (K)
+	//     vmov.i64	$t3#hi, #0
+	//
+	// $kN is a mask with the bottom N bits set. AArch64 cannot compute on
+	// upper halves of SIMD registers, so we must split each half into
+	// separate registers. To compensate, we pair computations up and
+	// parallelize.
+
+	ext	v19.8b, v3.8b, v3.8b, #4	// B4
+	eor	v18.16b, v18.16b, v0.16b	// N = I + J
+	pmull	v19.8h, v5.8b, v19.8b		// K = A*B4
+
+	// This can probably be scheduled more efficiently. For now, we just
+	// pair up independent instructions.
+	zip1	v20.2d, v16.2d, v17.2d
+	zip1	v22.2d, v18.2d, v19.2d
+	zip2	v21.2d, v16.2d, v17.2d
+	zip2	v23.2d, v18.2d, v19.2d
+	eor	v20.16b, v20.16b, v21.16b
+	eor	v22.16b, v22.16b, v23.16b
+	and	v21.16b, v21.16b, v24.16b
+	and	v23.16b, v23.16b, v25.16b
+	eor	v20.16b, v20.16b, v21.16b
+	eor	v22.16b, v22.16b, v23.16b
+	zip1	v16.2d, v20.2d, v21.2d
+	zip1	v18.2d, v22.2d, v23.2d
+	zip2	v17.2d, v20.2d, v21.2d
+	zip2	v19.2d, v22.2d, v23.2d
+
+	ext	v16.16b, v16.16b, v16.16b, #15	// t0 = t0 << 8
+	ext	v17.16b, v17.16b, v17.16b, #14	// t1 = t1 << 16
+	pmull	v0.8h, v5.8b, v3.8b		// D = A*B
+	ext	v19.16b, v19.16b, v19.16b, #12	// t3 = t3 << 32
+	ext	v18.16b, v18.16b, v18.16b, #13	// t2 = t2 << 24
+	eor	v16.16b, v16.16b, v17.16b
+	eor	v18.16b, v18.16b, v19.16b
+	eor	v0.16b, v0.16b, v16.16b
+	eor	v0.16b, v0.16b, v18.16b
+	eor	v3.8b, v3.8b, v4.8b	// Karatsuba pre-processing
+	ext	v16.8b, v7.8b, v7.8b, #1	// A1
+	pmull	v16.8h, v16.8b, v3.8b		// F = A1*B
+	ext	v1.8b, v3.8b, v3.8b, #1		// B1
+	pmull	v1.8h, v7.8b, v1.8b		// E = A*B1
+	ext	v17.8b, v7.8b, v7.8b, #2	// A2
+	pmull	v17.8h, v17.8b, v3.8b		// H = A2*B
+	ext	v19.8b, v3.8b, v3.8b, #2	// B2
+	pmull	v19.8h, v7.8b, v19.8b		// G = A*B2
+	ext	v18.8b, v7.8b, v7.8b, #3	// A3
+	eor	v16.16b, v16.16b, v1.16b	// L = E + F
+	pmull	v18.8h, v18.8b, v3.8b		// J = A3*B
+	ext	v1.8b, v3.8b, v3.8b, #3		// B3
+	eor	v17.16b, v17.16b, v19.16b	// M = G + H
+	pmull	v1.8h, v7.8b, v1.8b		// I = A*B3
+
+	// Here we diverge from the 32-bit version. It computes the following
+	// (instructions reordered for clarity):
+	//
+	//     veor	$t0#lo, $t0#lo, $t0#hi	@ t0 = P0 + P1 (L)
+	//     vand	$t0#hi, $t0#hi, $k48
+	//     veor	$t0#lo, $t0#lo, $t0#hi
+	//
+	//     veor	$t1#lo, $t1#lo, $t1#hi	@ t1 = P2 + P3 (M)
+	//     vand	$t1#hi, $t1#hi, $k32
+	//     veor	$t1#lo, $t1#lo, $t1#hi
+	//
+	//     veor	$t2#lo, $t2#lo, $t2#hi	@ t2 = P4 + P5 (N)
+	//     vand	$t2#hi, $t2#hi, $k16
+	//     veor	$t2#lo, $t2#lo, $t2#hi
+	//
+	//     veor	$t3#lo, $t3#lo, $t3#hi	@ t3 = P6 + P7 (K)
+	//     vmov.i64	$t3#hi, #0
+	//
+	// $kN is a mask with the bottom N bits set. AArch64 cannot compute on
+	// upper halves of SIMD registers, so we must split each half into
+	// separate registers. To compensate, we pair computations up and
+	// parallelize.
+
+	ext	v19.8b, v3.8b, v3.8b, #4	// B4
+	eor	v18.16b, v18.16b, v1.16b	// N = I + J
+	pmull	v19.8h, v7.8b, v19.8b		// K = A*B4
+
+	// This can probably be scheduled more efficiently. For now, we just
+	// pair up independent instructions.
+	zip1	v20.2d, v16.2d, v17.2d
+	zip1	v22.2d, v18.2d, v19.2d
+	zip2	v21.2d, v16.2d, v17.2d
+	zip2	v23.2d, v18.2d, v19.2d
+	eor	v20.16b, v20.16b, v21.16b
+	eor	v22.16b, v22.16b, v23.16b
+	and	v21.16b, v21.16b, v24.16b
+	and	v23.16b, v23.16b, v25.16b
+	eor	v20.16b, v20.16b, v21.16b
+	eor	v22.16b, v22.16b, v23.16b
+	zip1	v16.2d, v20.2d, v21.2d
+	zip1	v18.2d, v22.2d, v23.2d
+	zip2	v17.2d, v20.2d, v21.2d
+	zip2	v19.2d, v22.2d, v23.2d
+
+	ext	v16.16b, v16.16b, v16.16b, #15	// t0 = t0 << 8
+	ext	v17.16b, v17.16b, v17.16b, #14	// t1 = t1 << 16
+	pmull	v1.8h, v7.8b, v3.8b		// D = A*B
+	ext	v19.16b, v19.16b, v19.16b, #12	// t3 = t3 << 32
+	ext	v18.16b, v18.16b, v18.16b, #13	// t2 = t2 << 24
+	eor	v16.16b, v16.16b, v17.16b
+	eor	v18.16b, v18.16b, v19.16b
+	eor	v1.16b, v1.16b, v16.16b
+	eor	v1.16b, v1.16b, v18.16b
+	ext	v16.8b, v6.8b, v6.8b, #1	// A1
+	pmull	v16.8h, v16.8b, v4.8b		// F = A1*B
+	ext	v2.8b, v4.8b, v4.8b, #1		// B1
+	pmull	v2.8h, v6.8b, v2.8b		// E = A*B1
+	ext	v17.8b, v6.8b, v6.8b, #2	// A2
+	pmull	v17.8h, v17.8b, v4.8b		// H = A2*B
+	ext	v19.8b, v4.8b, v4.8b, #2	// B2
+	pmull	v19.8h, v6.8b, v19.8b		// G = A*B2
+	ext	v18.8b, v6.8b, v6.8b, #3	// A3
+	eor	v16.16b, v16.16b, v2.16b	// L = E + F
+	pmull	v18.8h, v18.8b, v4.8b		// J = A3*B
+	ext	v2.8b, v4.8b, v4.8b, #3		// B3
+	eor	v17.16b, v17.16b, v19.16b	// M = G + H
+	pmull	v2.8h, v6.8b, v2.8b		// I = A*B3
+
+	// Here we diverge from the 32-bit version. It computes the following
+	// (instructions reordered for clarity):
+	//
+	//     veor	$t0#lo, $t0#lo, $t0#hi	@ t0 = P0 + P1 (L)
+	//     vand	$t0#hi, $t0#hi, $k48
+	//     veor	$t0#lo, $t0#lo, $t0#hi
+	//
+	//     veor	$t1#lo, $t1#lo, $t1#hi	@ t1 = P2 + P3 (M)
+	//     vand	$t1#hi, $t1#hi, $k32
+	//     veor	$t1#lo, $t1#lo, $t1#hi
+	//
+	//     veor	$t2#lo, $t2#lo, $t2#hi	@ t2 = P4 + P5 (N)
+	//     vand	$t2#hi, $t2#hi, $k16
+	//     veor	$t2#lo, $t2#lo, $t2#hi
+	//
+	//     veor	$t3#lo, $t3#lo, $t3#hi	@ t3 = P6 + P7 (K)
+	//     vmov.i64	$t3#hi, #0
+	//
+	// $kN is a mask with the bottom N bits set. AArch64 cannot compute on
+	// upper halves of SIMD registers, so we must split each half into
+	// separate registers. To compensate, we pair computations up and
+	// parallelize.
+
+	ext	v19.8b, v4.8b, v4.8b, #4	// B4
+	eor	v18.16b, v18.16b, v2.16b	// N = I + J
+	pmull	v19.8h, v6.8b, v19.8b		// K = A*B4
+
+	// This can probably be scheduled more efficiently. For now, we just
+	// pair up independent instructions.
+	zip1	v20.2d, v16.2d, v17.2d
+	zip1	v22.2d, v18.2d, v19.2d
+	zip2	v21.2d, v16.2d, v17.2d
+	zip2	v23.2d, v18.2d, v19.2d
+	eor	v20.16b, v20.16b, v21.16b
+	eor	v22.16b, v22.16b, v23.16b
+	and	v21.16b, v21.16b, v24.16b
+	and	v23.16b, v23.16b, v25.16b
+	eor	v20.16b, v20.16b, v21.16b
+	eor	v22.16b, v22.16b, v23.16b
+	zip1	v16.2d, v20.2d, v21.2d
+	zip1	v18.2d, v22.2d, v23.2d
+	zip2	v17.2d, v20.2d, v21.2d
+	zip2	v19.2d, v22.2d, v23.2d
+
+	ext	v16.16b, v16.16b, v16.16b, #15	// t0 = t0 << 8
+	ext	v17.16b, v17.16b, v17.16b, #14	// t1 = t1 << 16
+	pmull	v2.8h, v6.8b, v4.8b		// D = A*B
+	ext	v19.16b, v19.16b, v19.16b, #12	// t3 = t3 << 32
+	ext	v18.16b, v18.16b, v18.16b, #13	// t2 = t2 << 24
+	eor	v16.16b, v16.16b, v17.16b
+	eor	v18.16b, v18.16b, v19.16b
+	eor	v2.16b, v2.16b, v16.16b
+	eor	v2.16b, v2.16b, v18.16b
+	ext	v16.16b, v0.16b, v2.16b, #8
+	eor	v1.16b, v1.16b, v0.16b	// Karatsuba post-processing
+	eor	v1.16b, v1.16b, v2.16b
+	eor	v1.16b, v1.16b, v16.16b	// Xm overlaps Xh.lo and Xl.hi
+	ins	v0.d[1], v1.d[0]		// Xh|Xl - 256-bit result
+	// This is a no-op due to the ins instruction below.
+	// ins	v2.d[0], v1.d[1]
+
+	// equivalent of reduction_avx from ghash-x86_64.pl
+	shl	v17.2d, v0.2d, #57		// 1st phase
+	shl	v18.2d, v0.2d, #62
+	eor	v18.16b, v18.16b, v17.16b	//
+	shl	v17.2d, v0.2d, #63
+	eor	v18.16b, v18.16b, v17.16b	//
+	// Note Xm contains {Xl.d[1], Xh.d[0]}.
+	eor	v18.16b, v18.16b, v1.16b
+	ins	v0.d[1], v18.d[0]		// Xl.d[1] ^= t2.d[0]
+	ins	v2.d[0], v18.d[1]		// Xh.d[0] ^= t2.d[1]
+
+	ushr	v18.2d, v0.2d, #1		// 2nd phase
+	eor	v2.16b, v2.16b,v0.16b
+	eor	v0.16b, v0.16b,v18.16b	//
+	ushr	v18.2d, v18.2d, #6
+	ushr	v0.2d, v0.2d, #1		//
+	eor	v0.16b, v0.16b, v2.16b	//
+	eor	v0.16b, v0.16b, v18.16b	//
+
+	subs	x3, x3, #16
+	bne	.Loop_neon
+
+	rev64	v0.16b, v0.16b		// byteswap Xi and write
+	ext	v0.16b, v0.16b, v0.16b, #8
+	st1	{v0.16b}, [x0]
+
+	ret
+.size	gcm_ghash_neon,.-gcm_ghash_neon
+
+.section	.rodata
+.align	4
+.Lmasks:
+.quad	0x0000ffffffffffff	// k48
+.quad	0x00000000ffffffff	// k32
+.quad	0x000000000000ffff	// k16
+.quad	0x0000000000000000	// k0
+.byte	71,72,65,83,72,32,102,111,114,32,65,82,77,118,56,44,32,100,101,114,105,118,101,100,32,102,114,111,109,32,65,82,77,118,52,32,118,101,114,115,105,111,110,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0
+.align	2
+.align	2
+#endif
+#endif  // !OPENSSL_NO_ASM
+.section	.note.GNU-stack,"",%progbits
--- /dev/null
+++ b/third_party/boringssl/linux-aarch64/crypto/fipsmodule/ghashv8-armx64.S
@@ -1,0 +1,576 @@
+// This file is generated from a similarly-named Perl script in the BoringSSL
+// source tree. Do not edit by hand.
+
+#if !defined(__has_feature)
+#define __has_feature(x) 0
+#endif
+#if __has_feature(memory_sanitizer) && !defined(OPENSSL_NO_ASM)
+#define OPENSSL_NO_ASM
+#endif
+
+#if !defined(OPENSSL_NO_ASM)
+#if defined(__aarch64__)
+#if defined(BORINGSSL_PREFIX)
+#include <boringssl_prefix_symbols_asm.h>
+#endif
+#include <openssl/arm_arch.h>
+
+#if __ARM_MAX_ARCH__>=7
+.text
+.arch	armv8-a+crypto
+.globl	gcm_init_v8
+.hidden	gcm_init_v8
+.type	gcm_init_v8,%function
+.align	4
+gcm_init_v8:
+	AARCH64_VALID_CALL_TARGET
+	ld1	{v17.2d},[x1]		//load input H
+	movi	v19.16b,#0xe1
+	shl	v19.2d,v19.2d,#57		//0xc2.0
+	ext	v3.16b,v17.16b,v17.16b,#8
+	ushr	v18.2d,v19.2d,#63
+	dup	v17.4s,v17.s[1]
+	ext	v16.16b,v18.16b,v19.16b,#8		//t0=0xc2....01
+	ushr	v18.2d,v3.2d,#63
+	sshr	v17.4s,v17.4s,#31		//broadcast carry bit
+	and	v18.16b,v18.16b,v16.16b
+	shl	v3.2d,v3.2d,#1
+	ext	v18.16b,v18.16b,v18.16b,#8
+	and	v16.16b,v16.16b,v17.16b
+	orr	v3.16b,v3.16b,v18.16b		//H<<<=1
+	eor	v20.16b,v3.16b,v16.16b		//twisted H
+	st1	{v20.2d},[x0],#16		//store Htable[0]
+
+	//calculate H^2
+	ext	v16.16b,v20.16b,v20.16b,#8		//Karatsuba pre-processing
+	pmull	v0.1q,v20.1d,v20.1d
+	eor	v16.16b,v16.16b,v20.16b
+	pmull2	v2.1q,v20.2d,v20.2d
+	pmull	v1.1q,v16.1d,v16.1d
+
+	ext	v17.16b,v0.16b,v2.16b,#8		//Karatsuba post-processing
+	eor	v18.16b,v0.16b,v2.16b
+	eor	v1.16b,v1.16b,v17.16b
+	eor	v1.16b,v1.16b,v18.16b
+	pmull	v18.1q,v0.1d,v19.1d		//1st phase
+
+	ins	v2.d[0],v1.d[1]
+	ins	v1.d[1],v0.d[0]
+	eor	v0.16b,v1.16b,v18.16b
+
+	ext	v18.16b,v0.16b,v0.16b,#8		//2nd phase
+	pmull	v0.1q,v0.1d,v19.1d
+	eor	v18.16b,v18.16b,v2.16b
+	eor	v22.16b,v0.16b,v18.16b
+
+	ext	v17.16b,v22.16b,v22.16b,#8		//Karatsuba pre-processing
+	eor	v17.16b,v17.16b,v22.16b
+	ext	v21.16b,v16.16b,v17.16b,#8		//pack Karatsuba pre-processed
+	st1	{v21.2d,v22.2d},[x0],#32	//store Htable[1..2]
+	//calculate H^3 and H^4
+	pmull	v0.1q,v20.1d, v22.1d
+	pmull	v5.1q,v22.1d,v22.1d
+	pmull2	v2.1q,v20.2d, v22.2d
+	pmull2	v7.1q,v22.2d,v22.2d
+	pmull	v1.1q,v16.1d,v17.1d
+	pmull	v6.1q,v17.1d,v17.1d
+
+	ext	v16.16b,v0.16b,v2.16b,#8		//Karatsuba post-processing
+	ext	v17.16b,v5.16b,v7.16b,#8
+	eor	v18.16b,v0.16b,v2.16b
+	eor	v1.16b,v1.16b,v16.16b
+	eor	v4.16b,v5.16b,v7.16b
+	eor	v6.16b,v6.16b,v17.16b
+	eor	v1.16b,v1.16b,v18.16b
+	pmull	v18.1q,v0.1d,v19.1d		//1st phase
+	eor	v6.16b,v6.16b,v4.16b
+	pmull	v4.1q,v5.1d,v19.1d
+
+	ins	v2.d[0],v1.d[1]
+	ins	v7.d[0],v6.d[1]
+	ins	v1.d[1],v0.d[0]
+	ins	v6.d[1],v5.d[0]
+	eor	v0.16b,v1.16b,v18.16b
+	eor	v5.16b,v6.16b,v4.16b
+
+	ext	v18.16b,v0.16b,v0.16b,#8		//2nd phase
+	ext	v4.16b,v5.16b,v5.16b,#8
+	pmull	v0.1q,v0.1d,v19.1d
+	pmull	v5.1q,v5.1d,v19.1d
+	eor	v18.16b,v18.16b,v2.16b
+	eor	v4.16b,v4.16b,v7.16b
+	eor	v20.16b, v0.16b,v18.16b		//H^3
+	eor	v22.16b,v5.16b,v4.16b		//H^4
+
+	ext	v16.16b,v20.16b, v20.16b,#8		//Karatsuba pre-processing
+	ext	v17.16b,v22.16b,v22.16b,#8
+	eor	v16.16b,v16.16b,v20.16b
+	eor	v17.16b,v17.16b,v22.16b
+	ext	v21.16b,v16.16b,v17.16b,#8		//pack Karatsuba pre-processed
+	st1	{v20.2d,v21.2d,v22.2d},[x0]		//store Htable[3..5]
+	ret
+.size	gcm_init_v8,.-gcm_init_v8
+.globl	gcm_gmult_v8
+.hidden	gcm_gmult_v8
+.type	gcm_gmult_v8,%function
+.align	4
+gcm_gmult_v8:
+	AARCH64_VALID_CALL_TARGET
+	ld1	{v17.2d},[x0]		//load Xi
+	movi	v19.16b,#0xe1
+	ld1	{v20.2d,v21.2d},[x1]	//load twisted H, ...
+	shl	v19.2d,v19.2d,#57
+#ifndef __ARMEB__
+	rev64	v17.16b,v17.16b
+#endif
+	ext	v3.16b,v17.16b,v17.16b,#8
+
+	pmull	v0.1q,v20.1d,v3.1d		//H.lo·Xi.lo
+	eor	v17.16b,v17.16b,v3.16b		//Karatsuba pre-processing
+	pmull2	v2.1q,v20.2d,v3.2d		//H.hi·Xi.hi
+	pmull	v1.1q,v21.1d,v17.1d		//(H.lo+H.hi)·(Xi.lo+Xi.hi)
+
+	ext	v17.16b,v0.16b,v2.16b,#8		//Karatsuba post-processing
+	eor	v18.16b,v0.16b,v2.16b
+	eor	v1.16b,v1.16b,v17.16b
+	eor	v1.16b,v1.16b,v18.16b
+	pmull	v18.1q,v0.1d,v19.1d		//1st phase of reduction
+
+	ins	v2.d[0],v1.d[1]
+	ins	v1.d[1],v0.d[0]
+	eor	v0.16b,v1.16b,v18.16b
+
+	ext	v18.16b,v0.16b,v0.16b,#8		//2nd phase of reduction
+	pmull	v0.1q,v0.1d,v19.1d
+	eor	v18.16b,v18.16b,v2.16b
+	eor	v0.16b,v0.16b,v18.16b
+
+#ifndef __ARMEB__
+	rev64	v0.16b,v0.16b
+#endif
+	ext	v0.16b,v0.16b,v0.16b,#8
+	st1	{v0.2d},[x0]		//write out Xi
+
+	ret
+.size	gcm_gmult_v8,.-gcm_gmult_v8
+.globl	gcm_ghash_v8
+.hidden	gcm_ghash_v8
+.type	gcm_ghash_v8,%function
+.align	4
+gcm_ghash_v8:
+	AARCH64_VALID_CALL_TARGET
+	cmp	x3,#64
+	b.hs	.Lgcm_ghash_v8_4x
+	ld1	{v0.2d},[x0]		//load [rotated] Xi
+						//"[rotated]" means that
+						//loaded value would have
+						//to be rotated in order to
+						//make it appear as in
+						//algorithm specification
+	subs	x3,x3,#32		//see if x3 is 32 or larger
+	mov	x12,#16		//x12 is used as post-
+						//increment for input pointer;
+						//as loop is modulo-scheduled
+						//x12 is zeroed just in time
+						//to preclude overstepping
+						//inp[len], which means that
+						//last block[s] are actually
+						//loaded twice, but last
+						//copy is not processed
+	ld1	{v20.2d,v21.2d},[x1],#32	//load twisted H, ..., H^2
+	movi	v19.16b,#0xe1
+	ld1	{v22.2d},[x1]
+	csel	x12,xzr,x12,eq			//is it time to zero x12?
+	ext	v0.16b,v0.16b,v0.16b,#8		//rotate Xi
+	ld1	{v16.2d},[x2],#16	//load [rotated] I[0]
+	shl	v19.2d,v19.2d,#57		//compose 0xc2.0 constant
+#ifndef __ARMEB__
+	rev64	v16.16b,v16.16b
+	rev64	v0.16b,v0.16b
+#endif
+	ext	v3.16b,v16.16b,v16.16b,#8		//rotate I[0]
+	b.lo	.Lodd_tail_v8		//x3 was less than 32
+	ld1	{v17.2d},[x2],x12	//load [rotated] I[1]
+#ifndef __ARMEB__
+	rev64	v17.16b,v17.16b
+#endif
+	ext	v7.16b,v17.16b,v17.16b,#8
+	eor	v3.16b,v3.16b,v0.16b		//I[i]^=Xi
+	pmull	v4.1q,v20.1d,v7.1d		//H·Ii+1
+	eor	v17.16b,v17.16b,v7.16b		//Karatsuba pre-processing
+	pmull2	v6.1q,v20.2d,v7.2d
+	b	.Loop_mod2x_v8
+
+.align	4
+.Loop_mod2x_v8:
+	ext	v18.16b,v3.16b,v3.16b,#8
+	subs	x3,x3,#32		//is there more data?
+	pmull	v0.1q,v22.1d,v3.1d		//H^2.lo·Xi.lo
+	csel	x12,xzr,x12,lo			//is it time to zero x12?
+
+	pmull	v5.1q,v21.1d,v17.1d
+	eor	v18.16b,v18.16b,v3.16b		//Karatsuba pre-processing
+	pmull2	v2.1q,v22.2d,v3.2d		//H^2.hi·Xi.hi
+	eor	v0.16b,v0.16b,v4.16b		//accumulate
+	pmull2	v1.1q,v21.2d,v18.2d		//(H^2.lo+H^2.hi)·(Xi.lo+Xi.hi)
+	ld1	{v16.2d},[x2],x12	//load [rotated] I[i+2]
+
+	eor	v2.16b,v2.16b,v6.16b
+	csel	x12,xzr,x12,eq			//is it time to zero x12?
+	eor	v1.16b,v1.16b,v5.16b
+
+	ext	v17.16b,v0.16b,v2.16b,#8		//Karatsuba post-processing
+	eor	v18.16b,v0.16b,v2.16b
+	eor	v1.16b,v1.16b,v17.16b
+	ld1	{v17.2d},[x2],x12	//load [rotated] I[i+3]
+#ifndef __ARMEB__
+	rev64	v16.16b,v16.16b
+#endif
+	eor	v1.16b,v1.16b,v18.16b
+	pmull	v18.1q,v0.1d,v19.1d		//1st phase of reduction
+
+#ifndef __ARMEB__
+	rev64	v17.16b,v17.16b
+#endif
+	ins	v2.d[0],v1.d[1]
+	ins	v1.d[1],v0.d[0]
+	ext	v7.16b,v17.16b,v17.16b,#8
+	ext	v3.16b,v16.16b,v16.16b,#8
+	eor	v0.16b,v1.16b,v18.16b
+	pmull	v4.1q,v20.1d,v7.1d		//H·Ii+1
+	eor	v3.16b,v3.16b,v2.16b		//accumulate v3.16b early
+
+	ext	v18.16b,v0.16b,v0.16b,#8		//2nd phase of reduction
+	pmull	v0.1q,v0.1d,v19.1d
+	eor	v3.16b,v3.16b,v18.16b
+	eor	v17.16b,v17.16b,v7.16b		//Karatsuba pre-processing
+	eor	v3.16b,v3.16b,v0.16b
+	pmull2	v6.1q,v20.2d,v7.2d
+	b.hs	.Loop_mod2x_v8		//there was at least 32 more bytes
+
+	eor	v2.16b,v2.16b,v18.16b
+	ext	v3.16b,v16.16b,v16.16b,#8		//re-construct v3.16b
+	adds	x3,x3,#32		//re-construct x3
+	eor	v0.16b,v0.16b,v2.16b		//re-construct v0.16b
+	b.eq	.Ldone_v8		//is x3 zero?
+.Lodd_tail_v8:
+	ext	v18.16b,v0.16b,v0.16b,#8
+	eor	v3.16b,v3.16b,v0.16b		//inp^=Xi
+	eor	v17.16b,v16.16b,v18.16b		//v17.16b is rotated inp^Xi
+
+	pmull	v0.1q,v20.1d,v3.1d		//H.lo·Xi.lo
+	eor	v17.16b,v17.16b,v3.16b		//Karatsuba pre-processing
+	pmull2	v2.1q,v20.2d,v3.2d		//H.hi·Xi.hi
+	pmull	v1.1q,v21.1d,v17.1d		//(H.lo+H.hi)·(Xi.lo+Xi.hi)
+
+	ext	v17.16b,v0.16b,v2.16b,#8		//Karatsuba post-processing
+	eor	v18.16b,v0.16b,v2.16b
+	eor	v1.16b,v1.16b,v17.16b
+	eor	v1.16b,v1.16b,v18.16b
+	pmull	v18.1q,v0.1d,v19.1d		//1st phase of reduction
+
+	ins	v2.d[0],v1.d[1]
+	ins	v1.d[1],v0.d[0]
+	eor	v0.16b,v1.16b,v18.16b
+
+	ext	v18.16b,v0.16b,v0.16b,#8		//2nd phase of reduction
+	pmull	v0.1q,v0.1d,v19.1d
+	eor	v18.16b,v18.16b,v2.16b
+	eor	v0.16b,v0.16b,v18.16b
+
+.Ldone_v8:
+#ifndef __ARMEB__
+	rev64	v0.16b,v0.16b
+#endif
+	ext	v0.16b,v0.16b,v0.16b,#8
+	st1	{v0.2d},[x0]		//write out Xi
+
+	ret
+.size	gcm_ghash_v8,.-gcm_ghash_v8
+.type	gcm_ghash_v8_4x,%function
+.align	4
+gcm_ghash_v8_4x:
+.Lgcm_ghash_v8_4x:
+	ld1	{v0.2d},[x0]		//load [rotated] Xi
+	ld1	{v20.2d,v21.2d,v22.2d},[x1],#48	//load twisted H, ..., H^2
+	movi	v19.16b,#0xe1
+	ld1	{v26.2d,v27.2d,v28.2d},[x1]	//load twisted H^3, ..., H^4
+	shl	v19.2d,v19.2d,#57		//compose 0xc2.0 constant
+
+	ld1	{v4.2d,v5.2d,v6.2d,v7.2d},[x2],#64
+#ifndef __ARMEB__
+	rev64	v0.16b,v0.16b
+	rev64	v5.16b,v5.16b
+	rev64	v6.16b,v6.16b
+	rev64	v7.16b,v7.16b
+	rev64	v4.16b,v4.16b
+#endif
+	ext	v25.16b,v7.16b,v7.16b,#8
+	ext	v24.16b,v6.16b,v6.16b,#8
+	ext	v23.16b,v5.16b,v5.16b,#8
+
+	pmull	v29.1q,v20.1d,v25.1d		//H·Ii+3
+	eor	v7.16b,v7.16b,v25.16b
+	pmull2	v31.1q,v20.2d,v25.2d
+	pmull	v30.1q,v21.1d,v7.1d
+
+	pmull	v16.1q,v22.1d,v24.1d		//H^2·Ii+2
+	eor	v6.16b,v6.16b,v24.16b
+	pmull2	v24.1q,v22.2d,v24.2d
+	pmull2	v6.1q,v21.2d,v6.2d
+
+	eor	v29.16b,v29.16b,v16.16b
+	eor	v31.16b,v31.16b,v24.16b
+	eor	v30.16b,v30.16b,v6.16b
+
+	pmull	v7.1q,v26.1d,v23.1d		//H^3·Ii+1
+	eor	v5.16b,v5.16b,v23.16b
+	pmull2	v23.1q,v26.2d,v23.2d
+	pmull	v5.1q,v27.1d,v5.1d
+
+	eor	v29.16b,v29.16b,v7.16b
+	eor	v31.16b,v31.16b,v23.16b
+	eor	v30.16b,v30.16b,v5.16b
+
+	subs	x3,x3,#128
+	b.lo	.Ltail4x
+
+	b	.Loop4x
+
+.align	4
+.Loop4x:
+	eor	v16.16b,v4.16b,v0.16b
+	ld1	{v4.2d,v5.2d,v6.2d,v7.2d},[x2],#64
+	ext	v3.16b,v16.16b,v16.16b,#8
+#ifndef __ARMEB__
+	rev64	v5.16b,v5.16b
+	rev64	v6.16b,v6.16b
+	rev64	v7.16b,v7.16b
+	rev64	v4.16b,v4.16b
+#endif
+
+	pmull	v0.1q,v28.1d,v3.1d		//H^4·(Xi+Ii)
+	eor	v16.16b,v16.16b,v3.16b
+	pmull2	v2.1q,v28.2d,v3.2d
+	ext	v25.16b,v7.16b,v7.16b,#8
+	pmull2	v1.1q,v27.2d,v16.2d
+
+	eor	v0.16b,v0.16b,v29.16b
+	eor	v2.16b,v2.16b,v31.16b
+	ext	v24.16b,v6.16b,v6.16b,#8
+	eor	v1.16b,v1.16b,v30.16b
+	ext	v23.16b,v5.16b,v5.16b,#8
+
+	ext	v17.16b,v0.16b,v2.16b,#8		//Karatsuba post-processing
+	eor	v18.16b,v0.16b,v2.16b
+	pmull	v29.1q,v20.1d,v25.1d		//H·Ii+3
+	eor	v7.16b,v7.16b,v25.16b
+	eor	v1.16b,v1.16b,v17.16b
+	pmull2	v31.1q,v20.2d,v25.2d
+	eor	v1.16b,v1.16b,v18.16b
+	pmull	v30.1q,v21.1d,v7.1d
+
+	pmull	v18.1q,v0.1d,v19.1d		//1st phase of reduction
+	ins	v2.d[0],v1.d[1]
+	ins	v1.d[1],v0.d[0]
+	pmull	v16.1q,v22.1d,v24.1d		//H^2·Ii+2
+	eor	v6.16b,v6.16b,v24.16b
+	pmull2	v24.1q,v22.2d,v24.2d
+	eor	v0.16b,v1.16b,v18.16b
+	pmull2	v6.1q,v21.2d,v6.2d
+
+	eor	v29.16b,v29.16b,v16.16b
+	eor	v31.16b,v31.16b,v24.16b
+	eor	v30.16b,v30.16b,v6.16b
+
+	ext	v18.16b,v0.16b,v0.16b,#8		//2nd phase of reduction
+	pmull	v0.1q,v0.1d,v19.1d
+	pmull	v7.1q,v26.1d,v23.1d		//H^3·Ii+1
+	eor	v5.16b,v5.16b,v23.16b
+	eor	v18.16b,v18.16b,v2.16b
+	pmull2	v23.1q,v26.2d,v23.2d
+	pmull	v5.1q,v27.1d,v5.1d
+
+	eor	v0.16b,v0.16b,v18.16b
+	eor	v29.16b,v29.16b,v7.16b
+	eor	v31.16b,v31.16b,v23.16b
+	ext	v0.16b,v0.16b,v0.16b,#8
+	eor	v30.16b,v30.16b,v5.16b
+
+	subs	x3,x3,#64
+	b.hs	.Loop4x
+
+.Ltail4x:
+	eor	v16.16b,v4.16b,v0.16b
+	ext	v3.16b,v16.16b,v16.16b,#8
+
+	pmull	v0.1q,v28.1d,v3.1d		//H^4·(Xi+Ii)
+	eor	v16.16b,v16.16b,v3.16b
+	pmull2	v2.1q,v28.2d,v3.2d
+	pmull2	v1.1q,v27.2d,v16.2d
+
+	eor	v0.16b,v0.16b,v29.16b
+	eor	v2.16b,v2.16b,v31.16b
+	eor	v1.16b,v1.16b,v30.16b
+
+	adds	x3,x3,#64
+	b.eq	.Ldone4x
+
+	cmp	x3,#32
+	b.lo	.Lone
+	b.eq	.Ltwo
+.Lthree:
+	ext	v17.16b,v0.16b,v2.16b,#8		//Karatsuba post-processing
+	eor	v18.16b,v0.16b,v2.16b
+	eor	v1.16b,v1.16b,v17.16b
+	ld1	{v4.2d,v5.2d,v6.2d},[x2]
+	eor	v1.16b,v1.16b,v18.16b
+#ifndef	__ARMEB__
+	rev64	v5.16b,v5.16b
+	rev64	v6.16b,v6.16b
+	rev64	v4.16b,v4.16b
+#endif
+
+	pmull	v18.1q,v0.1d,v19.1d		//1st phase of reduction
+	ins	v2.d[0],v1.d[1]
+	ins	v1.d[1],v0.d[0]
+	ext	v24.16b,v6.16b,v6.16b,#8
+	ext	v23.16b,v5.16b,v5.16b,#8
+	eor	v0.16b,v1.16b,v18.16b
+
+	pmull	v29.1q,v20.1d,v24.1d		//H·Ii+2
+	eor	v6.16b,v6.16b,v24.16b
+
+	ext	v18.16b,v0.16b,v0.16b,#8		//2nd phase of reduction
+	pmull	v0.1q,v0.1d,v19.1d
+	eor	v18.16b,v18.16b,v2.16b
+	pmull2	v31.1q,v20.2d,v24.2d
+	pmull	v30.1q,v21.1d,v6.1d
+	eor	v0.16b,v0.16b,v18.16b
+	pmull	v7.1q,v22.1d,v23.1d		//H^2·Ii+1
+	eor	v5.16b,v5.16b,v23.16b
+	ext	v0.16b,v0.16b,v0.16b,#8
+
+	pmull2	v23.1q,v22.2d,v23.2d
+	eor	v16.16b,v4.16b,v0.16b
+	pmull2	v5.1q,v21.2d,v5.2d
+	ext	v3.16b,v16.16b,v16.16b,#8
+
+	eor	v29.16b,v29.16b,v7.16b
+	eor	v31.16b,v31.16b,v23.16b
+	eor	v30.16b,v30.16b,v5.16b
+
+	pmull	v0.1q,v26.1d,v3.1d		//H^3·(Xi+Ii)
+	eor	v16.16b,v16.16b,v3.16b
+	pmull2	v2.1q,v26.2d,v3.2d
+	pmull	v1.1q,v27.1d,v16.1d
+
+	eor	v0.16b,v0.16b,v29.16b
+	eor	v2.16b,v2.16b,v31.16b
+	eor	v1.16b,v1.16b,v30.16b
+	b	.Ldone4x
+
+.align	4
+.Ltwo:
+	ext	v17.16b,v0.16b,v2.16b,#8		//Karatsuba post-processing
+	eor	v18.16b,v0.16b,v2.16b
+	eor	v1.16b,v1.16b,v17.16b
+	ld1	{v4.2d,v5.2d},[x2]
+	eor	v1.16b,v1.16b,v18.16b
+#ifndef	__ARMEB__
+	rev64	v5.16b,v5.16b
+	rev64	v4.16b,v4.16b
+#endif
+
+	pmull	v18.1q,v0.1d,v19.1d		//1st phase of reduction
+	ins	v2.d[0],v1.d[1]
+	ins	v1.d[1],v0.d[0]
+	ext	v23.16b,v5.16b,v5.16b,#8
+	eor	v0.16b,v1.16b,v18.16b
+
+	ext	v18.16b,v0.16b,v0.16b,#8		//2nd phase of reduction
+	pmull	v0.1q,v0.1d,v19.1d
+	eor	v18.16b,v18.16b,v2.16b
+	eor	v0.16b,v0.16b,v18.16b
+	ext	v0.16b,v0.16b,v0.16b,#8
+
+	pmull	v29.1q,v20.1d,v23.1d		//H·Ii+1
+	eor	v5.16b,v5.16b,v23.16b
+
+	eor	v16.16b,v4.16b,v0.16b
+	ext	v3.16b,v16.16b,v16.16b,#8
+
+	pmull2	v31.1q,v20.2d,v23.2d
+	pmull	v30.1q,v21.1d,v5.1d
+
+	pmull	v0.1q,v22.1d,v3.1d		//H^2·(Xi+Ii)
+	eor	v16.16b,v16.16b,v3.16b
+	pmull2	v2.1q,v22.2d,v3.2d
+	pmull2	v1.1q,v21.2d,v16.2d
+
+	eor	v0.16b,v0.16b,v29.16b
+	eor	v2.16b,v2.16b,v31.16b
+	eor	v1.16b,v1.16b,v30.16b
+	b	.Ldone4x
+
+.align	4
+.Lone:
+	ext	v17.16b,v0.16b,v2.16b,#8		//Karatsuba post-processing
+	eor	v18.16b,v0.16b,v2.16b
+	eor	v1.16b,v1.16b,v17.16b
+	ld1	{v4.2d},[x2]
+	eor	v1.16b,v1.16b,v18.16b
+#ifndef	__ARMEB__
+	rev64	v4.16b,v4.16b
+#endif
+
+	pmull	v18.1q,v0.1d,v19.1d		//1st phase of reduction
+	ins	v2.d[0],v1.d[1]
+	ins	v1.d[1],v0.d[0]
+	eor	v0.16b,v1.16b,v18.16b
+
+	ext	v18.16b,v0.16b,v0.16b,#8		//2nd phase of reduction
+	pmull	v0.1q,v0.1d,v19.1d
+	eor	v18.16b,v18.16b,v2.16b
+	eor	v0.16b,v0.16b,v18.16b
+	ext	v0.16b,v0.16b,v0.16b,#8
+
+	eor	v16.16b,v4.16b,v0.16b
+	ext	v3.16b,v16.16b,v16.16b,#8
+
+	pmull	v0.1q,v20.1d,v3.1d
+	eor	v16.16b,v16.16b,v3.16b
+	pmull2	v2.1q,v20.2d,v3.2d
+	pmull	v1.1q,v21.1d,v16.1d
+
+.Ldone4x:
+	ext	v17.16b,v0.16b,v2.16b,#8		//Karatsuba post-processing
+	eor	v18.16b,v0.16b,v2.16b
+	eor	v1.16b,v1.16b,v17.16b
+	eor	v1.16b,v1.16b,v18.16b
+
+	pmull	v18.1q,v0.1d,v19.1d		//1st phase of reduction
+	ins	v2.d[0],v1.d[1]
+	ins	v1.d[1],v0.d[0]
+	eor	v0.16b,v1.16b,v18.16b
+
+	ext	v18.16b,v0.16b,v0.16b,#8		//2nd phase of reduction
+	pmull	v0.1q,v0.1d,v19.1d
+	eor	v18.16b,v18.16b,v2.16b
+	eor	v0.16b,v0.16b,v18.16b
+	ext	v0.16b,v0.16b,v0.16b,#8
+
+#ifndef __ARMEB__
+	rev64	v0.16b,v0.16b
+#endif
+	st1	{v0.2d},[x0]		//write out Xi
+
+	ret
+.size	gcm_ghash_v8_4x,.-gcm_ghash_v8_4x
+.byte	71,72,65,83,72,32,102,111,114,32,65,82,77,118,56,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0
+.align	2
+.align	2
+#endif
+#endif
+#endif  // !OPENSSL_NO_ASM
+.section	.note.GNU-stack,"",%progbits
--- /dev/null
+++ b/third_party/boringssl/linux-aarch64/crypto/fipsmodule/sha1-armv8.S
@@ -1,0 +1,1238 @@
+// This file is generated from a similarly-named Perl script in the BoringSSL
+// source tree. Do not edit by hand.
+
+#if !defined(__has_feature)
+#define __has_feature(x) 0
+#endif
+#if __has_feature(memory_sanitizer) && !defined(OPENSSL_NO_ASM)
+#define OPENSSL_NO_ASM
+#endif
+
+#if !defined(OPENSSL_NO_ASM)
+#if defined(__aarch64__)
+#if defined(BORINGSSL_PREFIX)
+#include <boringssl_prefix_symbols_asm.h>
+#endif
+#include <openssl/arm_arch.h>
+
+.text
+
+
+.hidden	OPENSSL_armcap_P
+.globl	sha1_block_data_order
+.hidden	sha1_block_data_order
+.type	sha1_block_data_order,%function
+.align	6
+sha1_block_data_order:
+	// Armv8.3-A PAuth: even though x30 is pushed to stack it is not popped later.
+	AARCH64_VALID_CALL_TARGET
+#if __has_feature(hwaddress_sanitizer) && __clang_major__ >= 10
+	adrp	x16,:pg_hi21_nc:OPENSSL_armcap_P
+#else
+	adrp	x16,OPENSSL_armcap_P
+#endif
+	ldr	w16,[x16,:lo12:OPENSSL_armcap_P]
+	tst	w16,#ARMV8_SHA1
+	b.ne	.Lv8_entry
+
+	stp	x29,x30,[sp,#-96]!
+	add	x29,sp,#0
+	stp	x19,x20,[sp,#16]
+	stp	x21,x22,[sp,#32]
+	stp	x23,x24,[sp,#48]
+	stp	x25,x26,[sp,#64]
+	stp	x27,x28,[sp,#80]
+
+	ldp	w20,w21,[x0]
+	ldp	w22,w23,[x0,#8]
+	ldr	w24,[x0,#16]
+
+.Loop:
+	ldr	x3,[x1],#64
+	movz	w28,#0x7999
+	sub	x2,x2,#1
+	movk	w28,#0x5a82,lsl#16
+#ifdef	__ARMEB__
+	ror	x3,x3,#32
+#else
+	rev32	x3,x3
+#endif
+	add	w24,w24,w28		// warm it up
+	add	w24,w24,w3
+	lsr	x4,x3,#32
+	ldr	x5,[x1,#-56]
+	bic	w25,w23,w21
+	and	w26,w22,w21
+	ror	w27,w20,#27
+	add	w23,w23,w28		// future e+=K
+	orr	w25,w25,w26
+	add	w24,w24,w27		// e+=rot(a,5)
+	ror	w21,w21,#2
+	add	w23,w23,w4	// future e+=X[i]
+	add	w24,w24,w25		// e+=F(b,c,d)
+#ifdef	__ARMEB__
+	ror	x5,x5,#32
+#else
+	rev32	x5,x5
+#endif
+	bic	w25,w22,w20
+	and	w26,w21,w20
+	ror	w27,w24,#27
+	add	w22,w22,w28		// future e+=K
+	orr	w25,w25,w26
+	add	w23,w23,w27		// e+=rot(a,5)
+	ror	w20,w20,#2
+	add	w22,w22,w5	// future e+=X[i]
+	add	w23,w23,w25		// e+=F(b,c,d)
+	lsr	x6,x5,#32
+	ldr	x7,[x1,#-48]
+	bic	w25,w21,w24
+	and	w26,w20,w24
+	ror	w27,w23,#27
+	add	w21,w21,w28		// future e+=K
+	orr	w25,w25,w26
+	add	w22,w22,w27		// e+=rot(a,5)
+	ror	w24,w24,#2
+	add	w21,w21,w6	// future e+=X[i]
+	add	w22,w22,w25		// e+=F(b,c,d)
+#ifdef	__ARMEB__
+	ror	x7,x7,#32
+#else
+	rev32	x7,x7
+#endif
+	bic	w25,w20,w23
+	and	w26,w24,w23
+	ror	w27,w22,#27
+	add	w20,w20,w28		// future e+=K
+	orr	w25,w25,w26
+	add	w21,w21,w27		// e+=rot(a,5)
+	ror	w23,w23,#2
+	add	w20,w20,w7	// future e+=X[i]
+	add	w21,w21,w25		// e+=F(b,c,d)
+	lsr	x8,x7,#32
+	ldr	x9,[x1,#-40]
+	bic	w25,w24,w22
+	and	w26,w23,w22
+	ror	w27,w21,#27
+	add	w24,w24,w28		// future e+=K
+	orr	w25,w25,w26
+	add	w20,w20,w27		// e+=rot(a,5)
+	ror	w22,w22,#2
+	add	w24,w24,w8	// future e+=X[i]
+	add	w20,w20,w25		// e+=F(b,c,d)
+#ifdef	__ARMEB__
+	ror	x9,x9,#32
+#else
+	rev32	x9,x9
+#endif
+	bic	w25,w23,w21
+	and	w26,w22,w21
+	ror	w27,w20,#27
+	add	w23,w23,w28		// future e+=K
+	orr	w25,w25,w26
+	add	w24,w24,w27		// e+=rot(a,5)
+	ror	w21,w21,#2
+	add	w23,w23,w9	// future e+=X[i]
+	add	w24,w24,w25		// e+=F(b,c,d)
+	lsr	x10,x9,#32
+	ldr	x11,[x1,#-32]
+	bic	w25,w22,w20
+	and	w26,w21,w20
+	ror	w27,w24,#27
+	add	w22,w22,w28		// future e+=K
+	orr	w25,w25,w26
+	add	w23,w23,w27		// e+=rot(a,5)
+	ror	w20,w20,#2
+	add	w22,w22,w10	// future e+=X[i]
+	add	w23,w23,w25		// e+=F(b,c,d)
+#ifdef	__ARMEB__
+	ror	x11,x11,#32
+#else
+	rev32	x11,x11
+#endif
+	bic	w25,w21,w24
+	and	w26,w20,w24
+	ror	w27,w23,#27
+	add	w21,w21,w28		// future e+=K
+	orr	w25,w25,w26
+	add	w22,w22,w27		// e+=rot(a,5)
+	ror	w24,w24,#2
+	add	w21,w21,w11	// future e+=X[i]
+	add	w22,w22,w25		// e+=F(b,c,d)
+	lsr	x12,x11,#32
+	ldr	x13,[x1,#-24]
+	bic	w25,w20,w23
+	and	w26,w24,w23
+	ror	w27,w22,#27
+	add	w20,w20,w28		// future e+=K
+	orr	w25,w25,w26
+	add	w21,w21,w27		// e+=rot(a,5)
+	ror	w23,w23,#2
+	add	w20,w20,w12	// future e+=X[i]
+	add	w21,w21,w25		// e+=F(b,c,d)
+#ifdef	__ARMEB__
+	ror	x13,x13,#32
+#else
+	rev32	x13,x13
+#endif
+	bic	w25,w24,w22
+	and	w26,w23,w22
+	ror	w27,w21,#27
+	add	w24,w24,w28		// future e+=K
+	orr	w25,w25,w26
+	add	w20,w20,w27		// e+=rot(a,5)
+	ror	w22,w22,#2
+	add	w24,w24,w13	// future e+=X[i]
+	add	w20,w20,w25		// e+=F(b,c,d)
+	lsr	x14,x13,#32
+	ldr	x15,[x1,#-16]
+	bic	w25,w23,w21
+	and	w26,w22,w21
+	ror	w27,w20,#27
+	add	w23,w23,w28		// future e+=K
+	orr	w25,w25,w26
+	add	w24,w24,w27		// e+=rot(a,5)
+	ror	w21,w21,#2
+	add	w23,w23,w14	// future e+=X[i]
+	add	w24,w24,w25		// e+=F(b,c,d)
+#ifdef	__ARMEB__
+	ror	x15,x15,#32
+#else
+	rev32	x15,x15
+#endif
+	bic	w25,w22,w20
+	and	w26,w21,w20
+	ror	w27,w24,#27
+	add	w22,w22,w28		// future e+=K
+	orr	w25,w25,w26
+	add	w23,w23,w27		// e+=rot(a,5)
+	ror	w20,w20,#2
+	add	w22,w22,w15	// future e+=X[i]
+	add	w23,w23,w25		// e+=F(b,c,d)
+	lsr	x16,x15,#32
+	ldr	x17,[x1,#-8]
+	bic	w25,w21,w24
+	and	w26,w20,w24
+	ror	w27,w23,#27
+	add	w21,w21,w28		// future e+=K
+	orr	w25,w25,w26
+	add	w22,w22,w27		// e+=rot(a,5)
+	ror	w24,w24,#2
+	add	w21,w21,w16	// future e+=X[i]
+	add	w22,w22,w25		// e+=F(b,c,d)
+#ifdef	__ARMEB__
+	ror	x17,x17,#32
+#else
+	rev32	x17,x17
+#endif
+	bic	w25,w20,w23
+	and	w26,w24,w23
+	ror	w27,w22,#27
+	add	w20,w20,w28		// future e+=K
+	orr	w25,w25,w26
+	add	w21,w21,w27		// e+=rot(a,5)
+	ror	w23,w23,#2
+	add	w20,w20,w17	// future e+=X[i]
+	add	w21,w21,w25		// e+=F(b,c,d)
+	lsr	x19,x17,#32
+	eor	w3,w3,w5
+	bic	w25,w24,w22
+	and	w26,w23,w22
+	ror	w27,w21,#27
+	eor	w3,w3,w11
+	add	w24,w24,w28		// future e+=K
+	orr	w25,w25,w26
+	add	w20,w20,w27		// e+=rot(a,5)
+	eor	w3,w3,w16
+	ror	w22,w22,#2
+	add	w24,w24,w19	// future e+=X[i]
+	add	w20,w20,w25		// e+=F(b,c,d)
+	ror	w3,w3,#31
+	eor	w4,w4,w6
+	bic	w25,w23,w21
+	and	w26,w22,w21
+	ror	w27,w20,#27
+	eor	w4,w4,w12
+	add	w23,w23,w28		// future e+=K
+	orr	w25,w25,w26
+	add	w24,w24,w27		// e+=rot(a,5)
+	eor	w4,w4,w17
+	ror	w21,w21,#2
+	add	w23,w23,w3	// future e+=X[i]
+	add	w24,w24,w25		// e+=F(b,c,d)
+	ror	w4,w4,#31
+	eor	w5,w5,w7
+	bic	w25,w22,w20
+	and	w26,w21,w20
+	ror	w27,w24,#27
+	eor	w5,w5,w13
+	add	w22,w22,w28		// future e+=K
+	orr	w25,w25,w26
+	add	w23,w23,w27		// e+=rot(a,5)
+	eor	w5,w5,w19
+	ror	w20,w20,#2
+	add	w22,w22,w4	// future e+=X[i]
+	add	w23,w23,w25		// e+=F(b,c,d)
+	ror	w5,w5,#31
+	eor	w6,w6,w8
+	bic	w25,w21,w24
+	and	w26,w20,w24
+	ror	w27,w23,#27
+	eor	w6,w6,w14
+	add	w21,w21,w28		// future e+=K
+	orr	w25,w25,w26
+	add	w22,w22,w27		// e+=rot(a,5)
+	eor	w6,w6,w3
+	ror	w24,w24,#2
+	add	w21,w21,w5	// future e+=X[i]
+	add	w22,w22,w25		// e+=F(b,c,d)
+	ror	w6,w6,#31
+	eor	w7,w7,w9
+	bic	w25,w20,w23
+	and	w26,w24,w23
+	ror	w27,w22,#27
+	eor	w7,w7,w15
+	add	w20,w20,w28		// future e+=K
+	orr	w25,w25,w26
+	add	w21,w21,w27		// e+=rot(a,5)
+	eor	w7,w7,w4
+	ror	w23,w23,#2
+	add	w20,w20,w6	// future e+=X[i]
+	add	w21,w21,w25		// e+=F(b,c,d)
+	ror	w7,w7,#31
+	movz	w28,#0xeba1
+	movk	w28,#0x6ed9,lsl#16
+	eor	w8,w8,w10
+	bic	w25,w24,w22
+	and	w26,w23,w22
+	ror	w27,w21,#27
+	eor	w8,w8,w16
+	add	w24,w24,w28		// future e+=K
+	orr	w25,w25,w26
+	add	w20,w20,w27		// e+=rot(a,5)
+	eor	w8,w8,w5
+	ror	w22,w22,#2
+	add	w24,w24,w7	// future e+=X[i]
+	add	w20,w20,w25		// e+=F(b,c,d)
+	ror	w8,w8,#31
+	eor	w9,w9,w11
+	eor	w25,w23,w21
+	ror	w27,w20,#27
+	add	w23,w23,w28		// future e+=K
+	eor	w9,w9,w17
+	eor	w25,w25,w22
+	add	w24,w24,w27		// e+=rot(a,5)
+	ror	w21,w21,#2
+	eor	w9,w9,w6
+	add	w23,w23,w8	// future e+=X[i]
+	add	w24,w24,w25		// e+=F(b,c,d)
+	ror	w9,w9,#31
+	eor	w10,w10,w12
+	eor	w25,w22,w20
+	ror	w27,w24,#27
+	add	w22,w22,w28		// future e+=K
+	eor	w10,w10,w19
+	eor	w25,w25,w21
+	add	w23,w23,w27		// e+=rot(a,5)
+	ror	w20,w20,#2
+	eor	w10,w10,w7
+	add	w22,w22,w9	// future e+=X[i]
+	add	w23,w23,w25		// e+=F(b,c,d)
+	ror	w10,w10,#31
+	eor	w11,w11,w13
+	eor	w25,w21,w24
+	ror	w27,w23,#27
+	add	w21,w21,w28		// future e+=K
+	eor	w11,w11,w3
+	eor	w25,w25,w20
+	add	w22,w22,w27		// e+=rot(a,5)
+	ror	w24,w24,#2
+	eor	w11,w11,w8
+	add	w21,w21,w10	// future e+=X[i]
+	add	w22,w22,w25		// e+=F(b,c,d)
+	ror	w11,w11,#31
+	eor	w12,w12,w14
+	eor	w25,w20,w23
+	ror	w27,w22,#27
+	add	w20,w20,w28		// future e+=K
+	eor	w12,w12,w4
+	eor	w25,w25,w24
+	add	w21,w21,w27		// e+=rot(a,5)
+	ror	w23,w23,#2
+	eor	w12,w12,w9
+	add	w20,w20,w11	// future e+=X[i]
+	add	w21,w21,w25		// e+=F(b,c,d)
+	ror	w12,w12,#31
+	eor	w13,w13,w15
+	eor	w25,w24,w22
+	ror	w27,w21,#27
+	add	w24,w24,w28		// future e+=K
+	eor	w13,w13,w5
+	eor	w25,w25,w23
+	add	w20,w20,w27		// e+=rot(a,5)
+	ror	w22,w22,#2
+	eor	w13,w13,w10
+	add	w24,w24,w12	// future e+=X[i]
+	add	w20,w20,w25		// e+=F(b,c,d)
+	ror	w13,w13,#31
+	eor	w14,w14,w16
+	eor	w25,w23,w21
+	ror	w27,w20,#27
+	add	w23,w23,w28		// future e+=K
+	eor	w14,w14,w6
+	eor	w25,w25,w22
+	add	w24,w24,w27		// e+=rot(a,5)
+	ror	w21,w21,#2
+	eor	w14,w14,w11
+	add	w23,w23,w13	// future e+=X[i]
+	add	w24,w24,w25		// e+=F(b,c,d)
+	ror	w14,w14,#31
+	eor	w15,w15,w17
+	eor	w25,w22,w20
+	ror	w27,w24,#27
+	add	w22,w22,w28		// future e+=K
+	eor	w15,w15,w7
+	eor	w25,w25,w21
+	add	w23,w23,w27		// e+=rot(a,5)
+	ror	w20,w20,#2
+	eor	w15,w15,w12
+	add	w22,w22,w14	// future e+=X[i]
+	add	w23,w23,w25		// e+=F(b,c,d)
+	ror	w15,w15,#31
+	eor	w16,w16,w19
+	eor	w25,w21,w24
+	ror	w27,w23,#27
+	add	w21,w21,w28		// future e+=K
+	eor	w16,w16,w8
+	eor	w25,w25,w20
+	add	w22,w22,w27		// e+=rot(a,5)
+	ror	w24,w24,#2
+	eor	w16,w16,w13
+	add	w21,w21,w15	// future e+=X[i]
+	add	w22,w22,w25		// e+=F(b,c,d)
+	ror	w16,w16,#31
+	eor	w17,w17,w3
+	eor	w25,w20,w23
+	ror	w27,w22,#27
+	add	w20,w20,w28		// future e+=K
+	eor	w17,w17,w9
+	eor	w25,w25,w24
+	add	w21,w21,w27		// e+=rot(a,5)
+	ror	w23,w23,#2
+	eor	w17,w17,w14
+	add	w20,w20,w16	// future e+=X[i]
+	add	w21,w21,w25		// e+=F(b,c,d)
+	ror	w17,w17,#31
+	eor	w19,w19,w4
+	eor	w25,w24,w22
+	ror	w27,w21,#27
+	add	w24,w24,w28		// future e+=K
+	eor	w19,w19,w10
+	eor	w25,w25,w23
+	add	w20,w20,w27		// e+=rot(a,5)
+	ror	w22,w22,#2
+	eor	w19,w19,w15
+	add	w24,w24,w17	// future e+=X[i]
+	add	w20,w20,w25		// e+=F(b,c,d)
+	ror	w19,w19,#31
+	eor	w3,w3,w5
+	eor	w25,w23,w21
+	ror	w27,w20,#27
+	add	w23,w23,w28		// future e+=K
+	eor	w3,w3,w11
+	eor	w25,w25,w22
+	add	w24,w24,w27		// e+=rot(a,5)
+	ror	w21,w21,#2
+	eor	w3,w3,w16
+	add	w23,w23,w19	// future e+=X[i]
+	add	w24,w24,w25		// e+=F(b,c,d)
+	ror	w3,w3,#31
+	eor	w4,w4,w6
+	eor	w25,w22,w20
+	ror	w27,w24,#27
+	add	w22,w22,w28		// future e+=K
+	eor	w4,w4,w12
+	eor	w25,w25,w21
+	add	w23,w23,w27		// e+=rot(a,5)
+	ror	w20,w20,#2
+	eor	w4,w4,w17
+	add	w22,w22,w3	// future e+=X[i]
+	add	w23,w23,w25		// e+=F(b,c,d)
+	ror	w4,w4,#31
+	eor	w5,w5,w7
+	eor	w25,w21,w24
+	ror	w27,w23,#27
+	add	w21,w21,w28		// future e+=K
+	eor	w5,w5,w13
+	eor	w25,w25,w20
+	add	w22,w22,w27		// e+=rot(a,5)
+	ror	w24,w24,#2
+	eor	w5,w5,w19
+	add	w21,w21,w4	// future e+=X[i]
+	add	w22,w22,w25		// e+=F(b,c,d)
+	ror	w5,w5,#31
+	eor	w6,w6,w8
+	eor	w25,w20,w23
+	ror	w27,w22,#27
+	add	w20,w20,w28		// future e+=K
+	eor	w6,w6,w14
+	eor	w25,w25,w24
+	add	w21,w21,w27		// e+=rot(a,5)
+	ror	w23,w23,#2
+	eor	w6,w6,w3
+	add	w20,w20,w5	// future e+=X[i]
+	add	w21,w21,w25		// e+=F(b,c,d)
+	ror	w6,w6,#31
+	eor	w7,w7,w9
+	eor	w25,w24,w22
+	ror	w27,w21,#27
+	add	w24,w24,w28		// future e+=K
+	eor	w7,w7,w15
+	eor	w25,w25,w23
+	add	w20,w20,w27		// e+=rot(a,5)
+	ror	w22,w22,#2
+	eor	w7,w7,w4
+	add	w24,w24,w6	// future e+=X[i]
+	add	w20,w20,w25		// e+=F(b,c,d)
+	ror	w7,w7,#31
+	eor	w8,w8,w10
+	eor	w25,w23,w21
+	ror	w27,w20,#27
+	add	w23,w23,w28		// future e+=K
+	eor	w8,w8,w16
+	eor	w25,w25,w22
+	add	w24,w24,w27		// e+=rot(a,5)
+	ror	w21,w21,#2
+	eor	w8,w8,w5
+	add	w23,w23,w7	// future e+=X[i]
+	add	w24,w24,w25		// e+=F(b,c,d)
+	ror	w8,w8,#31
+	eor	w9,w9,w11
+	eor	w25,w22,w20
+	ror	w27,w24,#27
+	add	w22,w22,w28		// future e+=K
+	eor	w9,w9,w17
+	eor	w25,w25,w21
+	add	w23,w23,w27		// e+=rot(a,5)
+	ror	w20,w20,#2
+	eor	w9,w9,w6
+	add	w22,w22,w8	// future e+=X[i]
+	add	w23,w23,w25		// e+=F(b,c,d)
+	ror	w9,w9,#31
+	eor	w10,w10,w12
+	eor	w25,w21,w24
+	ror	w27,w23,#27
+	add	w21,w21,w28		// future e+=K
+	eor	w10,w10,w19
+	eor	w25,w25,w20
+	add	w22,w22,w27		// e+=rot(a,5)
+	ror	w24,w24,#2
+	eor	w10,w10,w7
+	add	w21,w21,w9	// future e+=X[i]
+	add	w22,w22,w25		// e+=F(b,c,d)
+	ror	w10,w10,#31
+	eor	w11,w11,w13
+	eor	w25,w20,w23
+	ror	w27,w22,#27
+	add	w20,w20,w28		// future e+=K
+	eor	w11,w11,w3
+	eor	w25,w25,w24
+	add	w21,w21,w27		// e+=rot(a,5)
+	ror	w23,w23,#2
+	eor	w11,w11,w8
+	add	w20,w20,w10	// future e+=X[i]
+	add	w21,w21,w25		// e+=F(b,c,d)
+	ror	w11,w11,#31
+	movz	w28,#0xbcdc
+	movk	w28,#0x8f1b,lsl#16
+	eor	w12,w12,w14
+	eor	w25,w24,w22
+	ror	w27,w21,#27
+	add	w24,w24,w28		// future e+=K
+	eor	w12,w12,w4
+	eor	w25,w25,w23
+	add	w20,w20,w27		// e+=rot(a,5)
+	ror	w22,w22,#2
+	eor	w12,w12,w9
+	add	w24,w24,w11	// future e+=X[i]
+	add	w20,w20,w25		// e+=F(b,c,d)
+	ror	w12,w12,#31
+	orr	w25,w21,w22
+	and	w26,w21,w22
+	eor	w13,w13,w15
+	ror	w27,w20,#27
+	and	w25,w25,w23
+	add	w23,w23,w28		// future e+=K
+	eor	w13,w13,w5
+	add	w24,w24,w27		// e+=rot(a,5)
+	orr	w25,w25,w26
+	ror	w21,w21,#2
+	eor	w13,w13,w10
+	add	w23,w23,w12	// future e+=X[i]
+	add	w24,w24,w25		// e+=F(b,c,d)
+	ror	w13,w13,#31
+	orr	w25,w20,w21
+	and	w26,w20,w21
+	eor	w14,w14,w16
+	ror	w27,w24,#27
+	and	w25,w25,w22
+	add	w22,w22,w28		// future e+=K
+	eor	w14,w14,w6
+	add	w23,w23,w27		// e+=rot(a,5)
+	orr	w25,w25,w26
+	ror	w20,w20,#2
+	eor	w14,w14,w11
+	add	w22,w22,w13	// future e+=X[i]
+	add	w23,w23,w25		// e+=F(b,c,d)
+	ror	w14,w14,#31
+	orr	w25,w24,w20
+	and	w26,w24,w20
+	eor	w15,w15,w17
+	ror	w27,w23,#27
+	and	w25,w25,w21
+	add	w21,w21,w28		// future e+=K
+	eor	w15,w15,w7
+	add	w22,w22,w27		// e+=rot(a,5)
+	orr	w25,w25,w26
+	ror	w24,w24,#2
+	eor	w15,w15,w12
+	add	w21,w21,w14	// future e+=X[i]
+	add	w22,w22,w25		// e+=F(b,c,d)
+	ror	w15,w15,#31
+	orr	w25,w23,w24
+	and	w26,w23,w24
+	eor	w16,w16,w19
+	ror	w27,w22,#27
+	and	w25,w25,w20
+	add	w20,w20,w28		// future e+=K
+	eor	w16,w16,w8
+	add	w21,w21,w27		// e+=rot(a,5)
+	orr	w25,w25,w26
+	ror	w23,w23,#2
+	eor	w16,w16,w13
+	add	w20,w20,w15	// future e+=X[i]
+	add	w21,w21,w25		// e+=F(b,c,d)
+	ror	w16,w16,#31
+	orr	w25,w22,w23
+	and	w26,w22,w23
+	eor	w17,w17,w3
+	ror	w27,w21,#27
+	and	w25,w25,w24
+	add	w24,w24,w28		// future e+=K
+	eor	w17,w17,w9
+	add	w20,w20,w27		// e+=rot(a,5)
+	orr	w25,w25,w26
+	ror	w22,w22,#2
+	eor	w17,w17,w14
+	add	w24,w24,w16	// future e+=X[i]
+	add	w20,w20,w25		// e+=F(b,c,d)
+	ror	w17,w17,#31
+	orr	w25,w21,w22
+	and	w26,w21,w22
+	eor	w19,w19,w4
+	ror	w27,w20,#27
+	and	w25,w25,w23
+	add	w23,w23,w28		// future e+=K
+	eor	w19,w19,w10
+	add	w24,w24,w27		// e+=rot(a,5)
+	orr	w25,w25,w26
+	ror	w21,w21,#2
+	eor	w19,w19,w15
+	add	w23,w23,w17	// future e+=X[i]
+	add	w24,w24,w25		// e+=F(b,c,d)
+	ror	w19,w19,#31
+	orr	w25,w20,w21
+	and	w26,w20,w21
+	eor	w3,w3,w5
+	ror	w27,w24,#27
+	and	w25,w25,w22
+	add	w22,w22,w28		// future e+=K
+	eor	w3,w3,w11
+	add	w23,w23,w27		// e+=rot(a,5)
+	orr	w25,w25,w26
+	ror	w20,w20,#2
+	eor	w3,w3,w16
+	add	w22,w22,w19	// future e+=X[i]
+	add	w23,w23,w25		// e+=F(b,c,d)
+	ror	w3,w3,#31
+	orr	w25,w24,w20
+	and	w26,w24,w20
+	eor	w4,w4,w6
+	ror	w27,w23,#27
+	and	w25,w25,w21
+	add	w21,w21,w28		// future e+=K
+	eor	w4,w4,w12
+	add	w22,w22,w27		// e+=rot(a,5)
+	orr	w25,w25,w26
+	ror	w24,w24,#2
+	eor	w4,w4,w17
+	add	w21,w21,w3	// future e+=X[i]
+	add	w22,w22,w25		// e+=F(b,c,d)
+	ror	w4,w4,#31
+	orr	w25,w23,w24
+	and	w26,w23,w24
+	eor	w5,w5,w7
+	ror	w27,w22,#27
+	and	w25,w25,w20
+	add	w20,w20,w28		// future e+=K
+	eor	w5,w5,w13
+	add	w21,w21,w27		// e+=rot(a,5)
+	orr	w25,w25,w26
+	ror	w23,w23,#2
+	eor	w5,w5,w19
+	add	w20,w20,w4	// future e+=X[i]
+	add	w21,w21,w25		// e+=F(b,c,d)
+	ror	w5,w5,#31
+	orr	w25,w22,w23
+	and	w26,w22,w23
+	eor	w6,w6,w8
+	ror	w27,w21,#27
+	and	w25,w25,w24
+	add	w24,w24,w28		// future e+=K
+	eor	w6,w6,w14
+	add	w20,w20,w27		// e+=rot(a,5)
+	orr	w25,w25,w26
+	ror	w22,w22,#2
+	eor	w6,w6,w3
+	add	w24,w24,w5	// future e+=X[i]
+	add	w20,w20,w25		// e+=F(b,c,d)
+	ror	w6,w6,#31
+	orr	w25,w21,w22
+	and	w26,w21,w22
+	eor	w7,w7,w9
+	ror	w27,w20,#27
+	and	w25,w25,w23
+	add	w23,w23,w28		// future e+=K
+	eor	w7,w7,w15
+	add	w24,w24,w27		// e+=rot(a,5)
+	orr	w25,w25,w26
+	ror	w21,w21,#2
+	eor	w7,w7,w4
+	add	w23,w23,w6	// future e+=X[i]
+	add	w24,w24,w25		// e+=F(b,c,d)
+	ror	w7,w7,#31
+	orr	w25,w20,w21
+	and	w26,w20,w21
+	eor	w8,w8,w10
+	ror	w27,w24,#27
+	and	w25,w25,w22
+	add	w22,w22,w28		// future e+=K
+	eor	w8,w8,w16
+	add	w23,w23,w27		// e+=rot(a,5)
+	orr	w25,w25,w26
+	ror	w20,w20,#2
+	eor	w8,w8,w5
+	add	w22,w22,w7	// future e+=X[i]
+	add	w23,w23,w25		// e+=F(b,c,d)
+	ror	w8,w8,#31
+	orr	w25,w24,w20
+	and	w26,w24,w20
+	eor	w9,w9,w11
+	ror	w27,w23,#27
+	and	w25,w25,w21
+	add	w21,w21,w28		// future e+=K
+	eor	w9,w9,w17
+	add	w22,w22,w27		// e+=rot(a,5)
+	orr	w25,w25,w26
+	ror	w24,w24,#2
+	eor	w9,w9,w6
+	add	w21,w21,w8	// future e+=X[i]
+	add	w22,w22,w25		// e+=F(b,c,d)
+	ror	w9,w9,#31
+	orr	w25,w23,w24
+	and	w26,w23,w24
+	eor	w10,w10,w12
+	ror	w27,w22,#27
+	and	w25,w25,w20
+	add	w20,w20,w28		// future e+=K
+	eor	w10,w10,w19
+	add	w21,w21,w27		// e+=rot(a,5)
+	orr	w25,w25,w26
+	ror	w23,w23,#2
+	eor	w10,w10,w7
+	add	w20,w20,w9	// future e+=X[i]
+	add	w21,w21,w25		// e+=F(b,c,d)
+	ror	w10,w10,#31
+	orr	w25,w22,w23
+	and	w26,w22,w23
+	eor	w11,w11,w13
+	ror	w27,w21,#27
+	and	w25,w25,w24
+	add	w24,w24,w28		// future e+=K
+	eor	w11,w11,w3
+	add	w20,w20,w27		// e+=rot(a,5)
+	orr	w25,w25,w26
+	ror	w22,w22,#2
+	eor	w11,w11,w8
+	add	w24,w24,w10	// future e+=X[i]
+	add	w20,w20,w25		// e+=F(b,c,d)
+	ror	w11,w11,#31
+	orr	w25,w21,w22
+	and	w26,w21,w22
+	eor	w12,w12,w14
+	ror	w27,w20,#27
+	and	w25,w25,w23
+	add	w23,w23,w28		// future e+=K
+	eor	w12,w12,w4
+	add	w24,w24,w27		// e+=rot(a,5)
+	orr	w25,w25,w26
+	ror	w21,w21,#2
+	eor	w12,w12,w9
+	add	w23,w23,w11	// future e+=X[i]
+	add	w24,w24,w25		// e+=F(b,c,d)
+	ror	w12,w12,#31
+	orr	w25,w20,w21
+	and	w26,w20,w21
+	eor	w13,w13,w15
+	ror	w27,w24,#27
+	and	w25,w25,w22
+	add	w22,w22,w28		// future e+=K
+	eor	w13,w13,w5
+	add	w23,w23,w27		// e+=rot(a,5)
+	orr	w25,w25,w26
+	ror	w20,w20,#2
+	eor	w13,w13,w10
+	add	w22,w22,w12	// future e+=X[i]
+	add	w23,w23,w25		// e+=F(b,c,d)
+	ror	w13,w13,#31
+	orr	w25,w24,w20
+	and	w26,w24,w20
+	eor	w14,w14,w16
+	ror	w27,w23,#27
+	and	w25,w25,w21
+	add	w21,w21,w28		// future e+=K
+	eor	w14,w14,w6
+	add	w22,w22,w27		// e+=rot(a,5)
+	orr	w25,w25,w26
+	ror	w24,w24,#2
+	eor	w14,w14,w11
+	add	w21,w21,w13	// future e+=X[i]
+	add	w22,w22,w25		// e+=F(b,c,d)
+	ror	w14,w14,#31
+	orr	w25,w23,w24
+	and	w26,w23,w24
+	eor	w15,w15,w17
+	ror	w27,w22,#27
+	and	w25,w25,w20
+	add	w20,w20,w28		// future e+=K
+	eor	w15,w15,w7
+	add	w21,w21,w27		// e+=rot(a,5)
+	orr	w25,w25,w26
+	ror	w23,w23,#2
+	eor	w15,w15,w12
+	add	w20,w20,w14	// future e+=X[i]
+	add	w21,w21,w25		// e+=F(b,c,d)
+	ror	w15,w15,#31
+	movz	w28,#0xc1d6
+	movk	w28,#0xca62,lsl#16
+	orr	w25,w22,w23
+	and	w26,w22,w23
+	eor	w16,w16,w19
+	ror	w27,w21,#27
+	and	w25,w25,w24
+	add	w24,w24,w28		// future e+=K
+	eor	w16,w16,w8
+	add	w20,w20,w27		// e+=rot(a,5)
+	orr	w25,w25,w26
+	ror	w22,w22,#2
+	eor	w16,w16,w13
+	add	w24,w24,w15	// future e+=X[i]
+	add	w20,w20,w25		// e+=F(b,c,d)
+	ror	w16,w16,#31
+	eor	w17,w17,w3
+	eor	w25,w23,w21
+	ror	w27,w20,#27
+	add	w23,w23,w28		// future e+=K
+	eor	w17,w17,w9
+	eor	w25,w25,w22
+	add	w24,w24,w27		// e+=rot(a,5)
+	ror	w21,w21,#2
+	eor	w17,w17,w14
+	add	w23,w23,w16	// future e+=X[i]
+	add	w24,w24,w25		// e+=F(b,c,d)
+	ror	w17,w17,#31
+	eor	w19,w19,w4
+	eor	w25,w22,w20
+	ror	w27,w24,#27
+	add	w22,w22,w28		// future e+=K
+	eor	w19,w19,w10
+	eor	w25,w25,w21
+	add	w23,w23,w27		// e+=rot(a,5)
+	ror	w20,w20,#2
+	eor	w19,w19,w15
+	add	w22,w22,w17	// future e+=X[i]
+	add	w23,w23,w25		// e+=F(b,c,d)
+	ror	w19,w19,#31
+	eor	w3,w3,w5
+	eor	w25,w21,w24
+	ror	w27,w23,#27
+	add	w21,w21,w28		// future e+=K
+	eor	w3,w3,w11
+	eor	w25,w25,w20
+	add	w22,w22,w27		// e+=rot(a,5)
+	ror	w24,w24,#2
+	eor	w3,w3,w16
+	add	w21,w21,w19	// future e+=X[i]
+	add	w22,w22,w25		// e+=F(b,c,d)
+	ror	w3,w3,#31
+	eor	w4,w4,w6
+	eor	w25,w20,w23
+	ror	w27,w22,#27
+	add	w20,w20,w28		// future e+=K
+	eor	w4,w4,w12
+	eor	w25,w25,w24
+	add	w21,w21,w27		// e+=rot(a,5)
+	ror	w23,w23,#2
+	eor	w4,w4,w17
+	add	w20,w20,w3	// future e+=X[i]
+	add	w21,w21,w25		// e+=F(b,c,d)
+	ror	w4,w4,#31
+	eor	w5,w5,w7
+	eor	w25,w24,w22
+	ror	w27,w21,#27
+	add	w24,w24,w28		// future e+=K
+	eor	w5,w5,w13
+	eor	w25,w25,w23
+	add	w20,w20,w27		// e+=rot(a,5)
+	ror	w22,w22,#2
+	eor	w5,w5,w19
+	add	w24,w24,w4	// future e+=X[i]
+	add	w20,w20,w25		// e+=F(b,c,d)
+	ror	w5,w5,#31
+	eor	w6,w6,w8
+	eor	w25,w23,w21
+	ror	w27,w20,#27
+	add	w23,w23,w28		// future e+=K
+	eor	w6,w6,w14
+	eor	w25,w25,w22
+	add	w24,w24,w27		// e+=rot(a,5)
+	ror	w21,w21,#2
+	eor	w6,w6,w3
+	add	w23,w23,w5	// future e+=X[i]
+	add	w24,w24,w25		// e+=F(b,c,d)
+	ror	w6,w6,#31
+	eor	w7,w7,w9
+	eor	w25,w22,w20
+	ror	w27,w24,#27
+	add	w22,w22,w28		// future e+=K
+	eor	w7,w7,w15
+	eor	w25,w25,w21
+	add	w23,w23,w27		// e+=rot(a,5)
+	ror	w20,w20,#2
+	eor	w7,w7,w4
+	add	w22,w22,w6	// future e+=X[i]
+	add	w23,w23,w25		// e+=F(b,c,d)
+	ror	w7,w7,#31
+	eor	w8,w8,w10
+	eor	w25,w21,w24
+	ror	w27,w23,#27
+	add	w21,w21,w28		// future e+=K
+	eor	w8,w8,w16
+	eor	w25,w25,w20
+	add	w22,w22,w27		// e+=rot(a,5)
+	ror	w24,w24,#2
+	eor	w8,w8,w5
+	add	w21,w21,w7	// future e+=X[i]
+	add	w22,w22,w25		// e+=F(b,c,d)
+	ror	w8,w8,#31
+	eor	w9,w9,w11
+	eor	w25,w20,w23
+	ror	w27,w22,#27
+	add	w20,w20,w28		// future e+=K
+	eor	w9,w9,w17
+	eor	w25,w25,w24
+	add	w21,w21,w27		// e+=rot(a,5)
+	ror	w23,w23,#2
+	eor	w9,w9,w6
+	add	w20,w20,w8	// future e+=X[i]
+	add	w21,w21,w25		// e+=F(b,c,d)
+	ror	w9,w9,#31
+	eor	w10,w10,w12
+	eor	w25,w24,w22
+	ror	w27,w21,#27
+	add	w24,w24,w28		// future e+=K
+	eor	w10,w10,w19
+	eor	w25,w25,w23
+	add	w20,w20,w27		// e+=rot(a,5)
+	ror	w22,w22,#2
+	eor	w10,w10,w7
+	add	w24,w24,w9	// future e+=X[i]
+	add	w20,w20,w25		// e+=F(b,c,d)
+	ror	w10,w10,#31
+	eor	w11,w11,w13
+	eor	w25,w23,w21
+	ror	w27,w20,#27
+	add	w23,w23,w28		// future e+=K
+	eor	w11,w11,w3
+	eor	w25,w25,w22
+	add	w24,w24,w27		// e+=rot(a,5)
+	ror	w21,w21,#2
+	eor	w11,w11,w8
+	add	w23,w23,w10	// future e+=X[i]
+	add	w24,w24,w25		// e+=F(b,c,d)
+	ror	w11,w11,#31
+	eor	w12,w12,w14
+	eor	w25,w22,w20
+	ror	w27,w24,#27
+	add	w22,w22,w28		// future e+=K
+	eor	w12,w12,w4
+	eor	w25,w25,w21
+	add	w23,w23,w27		// e+=rot(a,5)
+	ror	w20,w20,#2
+	eor	w12,w12,w9
+	add	w22,w22,w11	// future e+=X[i]
+	add	w23,w23,w25		// e+=F(b,c,d)
+	ror	w12,w12,#31
+	eor	w13,w13,w15
+	eor	w25,w21,w24
+	ror	w27,w23,#27
+	add	w21,w21,w28		// future e+=K
+	eor	w13,w13,w5
+	eor	w25,w25,w20
+	add	w22,w22,w27		// e+=rot(a,5)
+	ror	w24,w24,#2
+	eor	w13,w13,w10
+	add	w21,w21,w12	// future e+=X[i]
+	add	w22,w22,w25		// e+=F(b,c,d)
+	ror	w13,w13,#31
+	eor	w14,w14,w16
+	eor	w25,w20,w23
+	ror	w27,w22,#27
+	add	w20,w20,w28		// future e+=K
+	eor	w14,w14,w6
+	eor	w25,w25,w24
+	add	w21,w21,w27		// e+=rot(a,5)
+	ror	w23,w23,#2
+	eor	w14,w14,w11
+	add	w20,w20,w13	// future e+=X[i]
+	add	w21,w21,w25		// e+=F(b,c,d)
+	ror	w14,w14,#31
+	eor	w15,w15,w17
+	eor	w25,w24,w22
+	ror	w27,w21,#27
+	add	w24,w24,w28		// future e+=K
+	eor	w15,w15,w7
+	eor	w25,w25,w23
+	add	w20,w20,w27		// e+=rot(a,5)
+	ror	w22,w22,#2
+	eor	w15,w15,w12
+	add	w24,w24,w14	// future e+=X[i]
+	add	w20,w20,w25		// e+=F(b,c,d)
+	ror	w15,w15,#31
+	eor	w16,w16,w19
+	eor	w25,w23,w21
+	ror	w27,w20,#27
+	add	w23,w23,w28		// future e+=K
+	eor	w16,w16,w8
+	eor	w25,w25,w22
+	add	w24,w24,w27		// e+=rot(a,5)
+	ror	w21,w21,#2
+	eor	w16,w16,w13
+	add	w23,w23,w15	// future e+=X[i]
+	add	w24,w24,w25		// e+=F(b,c,d)
+	ror	w16,w16,#31
+	eor	w17,w17,w3
+	eor	w25,w22,w20
+	ror	w27,w24,#27
+	add	w22,w22,w28		// future e+=K
+	eor	w17,w17,w9
+	eor	w25,w25,w21
+	add	w23,w23,w27		// e+=rot(a,5)
+	ror	w20,w20,#2
+	eor	w17,w17,w14
+	add	w22,w22,w16	// future e+=X[i]
+	add	w23,w23,w25		// e+=F(b,c,d)
+	ror	w17,w17,#31
+	eor	w19,w19,w4
+	eor	w25,w21,w24
+	ror	w27,w23,#27
+	add	w21,w21,w28		// future e+=K
+	eor	w19,w19,w10
+	eor	w25,w25,w20
+	add	w22,w22,w27		// e+=rot(a,5)
+	ror	w24,w24,#2
+	eor	w19,w19,w15
+	add	w21,w21,w17	// future e+=X[i]
+	add	w22,w22,w25		// e+=F(b,c,d)
+	ror	w19,w19,#31
+	ldp	w4,w5,[x0]
+	eor	w25,w20,w23
+	ror	w27,w22,#27
+	add	w20,w20,w28		// future e+=K
+	eor	w25,w25,w24
+	add	w21,w21,w27		// e+=rot(a,5)
+	ror	w23,w23,#2
+	add	w20,w20,w19	// future e+=X[i]
+	add	w21,w21,w25		// e+=F(b,c,d)
+	ldp	w6,w7,[x0,#8]
+	eor	w25,w24,w22
+	ror	w27,w21,#27
+	eor	w25,w25,w23
+	add	w20,w20,w27		// e+=rot(a,5)
+	ror	w22,w22,#2
+	ldr	w8,[x0,#16]
+	add	w20,w20,w25		// e+=F(b,c,d)
+	add	w21,w21,w5
+	add	w22,w22,w6
+	add	w20,w20,w4
+	add	w23,w23,w7
+	add	w24,w24,w8
+	stp	w20,w21,[x0]
+	stp	w22,w23,[x0,#8]
+	str	w24,[x0,#16]
+	cbnz	x2,.Loop
+
+	ldp	x19,x20,[sp,#16]
+	ldp	x21,x22,[sp,#32]
+	ldp	x23,x24,[sp,#48]
+	ldp	x25,x26,[sp,#64]
+	ldp	x27,x28,[sp,#80]
+	ldr	x29,[sp],#96
+	ret
+.size	sha1_block_data_order,.-sha1_block_data_order
+.type	sha1_block_armv8,%function
+.align	6
+sha1_block_armv8:
+	// Armv8.3-A PAuth: even though x30 is pushed to the stack, it is not popped later.
+	AARCH64_VALID_CALL_TARGET
+.Lv8_entry:
+	stp	x29,x30,[sp,#-16]!
+	add	x29,sp,#0
+
+	adrp	x4,.Lconst
+	add	x4,x4,:lo12:.Lconst
+	eor	v1.16b,v1.16b,v1.16b
+	ld1	{v0.4s},[x0],#16
+	ld1	{v1.s}[0],[x0]
+	sub	x0,x0,#16
+	ld1	{v16.4s,v17.4s,v18.4s,v19.4s},[x4]
+
+.Loop_hw:
+	ld1	{v4.16b,v5.16b,v6.16b,v7.16b},[x1],#64
+	sub	x2,x2,#1
+	rev32	v4.16b,v4.16b
+	rev32	v5.16b,v5.16b
+
+	add	v20.4s,v16.4s,v4.4s
+	rev32	v6.16b,v6.16b
+	orr	v22.16b,v0.16b,v0.16b	// offload
+
+	add	v21.4s,v16.4s,v5.4s
+	rev32	v7.16b,v7.16b
+.inst	0x5e280803	//sha1h v3.16b,v0.16b
+.inst	0x5e140020	//sha1c v0.16b,v1.16b,v20.4s		// 0
+	add	v20.4s,v16.4s,v6.4s
+.inst	0x5e0630a4	//sha1su0 v4.16b,v5.16b,v6.16b
+.inst	0x5e280802	//sha1h v2.16b,v0.16b		// 1
+.inst	0x5e150060	//sha1c v0.16b,v3.16b,v21.4s
+	add	v21.4s,v16.4s,v7.4s
+.inst	0x5e2818e4	//sha1su1 v4.16b,v7.16b
+.inst	0x5e0730c5	//sha1su0 v5.16b,v6.16b,v7.16b
+.inst	0x5e280803	//sha1h v3.16b,v0.16b		// 2
+.inst	0x5e140040	//sha1c v0.16b,v2.16b,v20.4s
+	add	v20.4s,v16.4s,v4.4s
+.inst	0x5e281885	//sha1su1 v5.16b,v4.16b
+.inst	0x5e0430e6	//sha1su0 v6.16b,v7.16b,v4.16b
+.inst	0x5e280802	//sha1h v2.16b,v0.16b		// 3
+.inst	0x5e150060	//sha1c v0.16b,v3.16b,v21.4s
+	add	v21.4s,v17.4s,v5.4s
+.inst	0x5e2818a6	//sha1su1 v6.16b,v5.16b
+.inst	0x5e053087	//sha1su0 v7.16b,v4.16b,v5.16b
+.inst	0x5e280803	//sha1h v3.16b,v0.16b		// 4
+.inst	0x5e140040	//sha1c v0.16b,v2.16b,v20.4s
+	add	v20.4s,v17.4s,v6.4s
+.inst	0x5e2818c7	//sha1su1 v7.16b,v6.16b
+.inst	0x5e0630a4	//sha1su0 v4.16b,v5.16b,v6.16b
+.inst	0x5e280802	//sha1h v2.16b,v0.16b		// 5
+.inst	0x5e151060	//sha1p v0.16b,v3.16b,v21.4s
+	add	v21.4s,v17.4s,v7.4s
+.inst	0x5e2818e4	//sha1su1 v4.16b,v7.16b
+.inst	0x5e0730c5	//sha1su0 v5.16b,v6.16b,v7.16b
+.inst	0x5e280803	//sha1h v3.16b,v0.16b		// 6
+.inst	0x5e141040	//sha1p v0.16b,v2.16b,v20.4s
+	add	v20.4s,v17.4s,v4.4s
+.inst	0x5e281885	//sha1su1 v5.16b,v4.16b
+.inst	0x5e0430e6	//sha1su0 v6.16b,v7.16b,v4.16b
+.inst	0x5e280802	//sha1h v2.16b,v0.16b		// 7
+.inst	0x5e151060	//sha1p v0.16b,v3.16b,v21.4s
+	add	v21.4s,v17.4s,v5.4s
+.inst	0x5e2818a6	//sha1su1 v6.16b,v5.16b
+.inst	0x5e053087	//sha1su0 v7.16b,v4.16b,v5.16b
+.inst	0x5e280803	//sha1h v3.16b,v0.16b		// 8
+.inst	0x5e141040	//sha1p v0.16b,v2.16b,v20.4s
+	add	v20.4s,v18.4s,v6.4s
+.inst	0x5e2818c7	//sha1su1 v7.16b,v6.16b
+.inst	0x5e0630a4	//sha1su0 v4.16b,v5.16b,v6.16b
+.inst	0x5e280802	//sha1h v2.16b,v0.16b		// 9
+.inst	0x5e151060	//sha1p v0.16b,v3.16b,v21.4s
+	add	v21.4s,v18.4s,v7.4s
+.inst	0x5e2818e4	//sha1su1 v4.16b,v7.16b
+.inst	0x5e0730c5	//sha1su0 v5.16b,v6.16b,v7.16b
+.inst	0x5e280803	//sha1h v3.16b,v0.16b		// 10
+.inst	0x5e142040	//sha1m v0.16b,v2.16b,v20.4s
+	add	v20.4s,v18.4s,v4.4s
+.inst	0x5e281885	//sha1su1 v5.16b,v4.16b
+.inst	0x5e0430e6	//sha1su0 v6.16b,v7.16b,v4.16b
+.inst	0x5e280802	//sha1h v2.16b,v0.16b		// 11
+.inst	0x5e152060	//sha1m v0.16b,v3.16b,v21.4s
+	add	v21.4s,v18.4s,v5.4s
+.inst	0x5e2818a6	//sha1su1 v6.16b,v5.16b
+.inst	0x5e053087	//sha1su0 v7.16b,v4.16b,v5.16b
+.inst	0x5e280803	//sha1h v3.16b,v0.16b		// 12
+.inst	0x5e142040	//sha1m v0.16b,v2.16b,v20.4s
+	add	v20.4s,v18.4s,v6.4s
+.inst	0x5e2818c7	//sha1su1 v7.16b,v6.16b
+.inst	0x5e0630a4	//sha1su0 v4.16b,v5.16b,v6.16b
+.inst	0x5e280802	//sha1h v2.16b,v0.16b		// 13
+.inst	0x5e152060	//sha1m v0.16b,v3.16b,v21.4s
+	add	v21.4s,v19.4s,v7.4s
+.inst	0x5e2818e4	//sha1su1 v4.16b,v7.16b
+.inst	0x5e0730c5	//sha1su0 v5.16b,v6.16b,v7.16b
+.inst	0x5e280803	//sha1h v3.16b,v0.16b		// 14
+.inst	0x5e142040	//sha1m v0.16b,v2.16b,v20.4s
+	add	v20.4s,v19.4s,v4.4s
+.inst	0x5e281885	//sha1su1 v5.16b,v4.16b
+.inst	0x5e0430e6	//sha1su0 v6.16b,v7.16b,v4.16b
+.inst	0x5e280802	//sha1h v2.16b,v0.16b		// 15
+.inst	0x5e151060	//sha1p v0.16b,v3.16b,v21.4s
+	add	v21.4s,v19.4s,v5.4s
+.inst	0x5e2818a6	//sha1su1 v6.16b,v5.16b
+.inst	0x5e053087	//sha1su0 v7.16b,v4.16b,v5.16b
+.inst	0x5e280803	//sha1h v3.16b,v0.16b		// 16
+.inst	0x5e141040	//sha1p v0.16b,v2.16b,v20.4s
+	add	v20.4s,v19.4s,v6.4s
+.inst	0x5e2818c7	//sha1su1 v7.16b,v6.16b
+.inst	0x5e280802	//sha1h v2.16b,v0.16b		// 17
+.inst	0x5e151060	//sha1p v0.16b,v3.16b,v21.4s
+	add	v21.4s,v19.4s,v7.4s
+
+.inst	0x5e280803	//sha1h v3.16b,v0.16b		// 18
+.inst	0x5e141040	//sha1p v0.16b,v2.16b,v20.4s
+
+.inst	0x5e280802	//sha1h v2.16b,v0.16b		// 19
+.inst	0x5e151060	//sha1p v0.16b,v3.16b,v21.4s
+
+	add	v1.4s,v1.4s,v2.4s
+	add	v0.4s,v0.4s,v22.4s
+
+	cbnz	x2,.Loop_hw
+
+	st1	{v0.4s},[x0],#16
+	st1	{v1.s}[0],[x0]
+
+	ldr	x29,[sp],#16
+	ret
+.size	sha1_block_armv8,.-sha1_block_armv8
+.section	.rodata
+.align	6
+.Lconst:
+.long	0x5a827999,0x5a827999,0x5a827999,0x5a827999	//K_00_19
+.long	0x6ed9eba1,0x6ed9eba1,0x6ed9eba1,0x6ed9eba1	//K_20_39
+.long	0x8f1bbcdc,0x8f1bbcdc,0x8f1bbcdc,0x8f1bbcdc	//K_40_59
+.long	0xca62c1d6,0xca62c1d6,0xca62c1d6,0xca62c1d6	//K_60_79
+.byte	83,72,65,49,32,98,108,111,99,107,32,116,114,97,110,115,102,111,114,109,32,102,111,114,32,65,82,77,118,56,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0
+.align	2
+.align	2
+#endif
+#endif  // !OPENSSL_NO_ASM
+.section	.note.GNU-stack,"",%progbits
--- /dev/null
+++ b/third_party/boringssl/linux-aarch64/crypto/fipsmodule/sha256-armv8.S
@@ -1,0 +1,1214 @@
+// This file is generated from a similarly-named Perl script in the BoringSSL
+// source tree. Do not edit by hand.
+
+#if !defined(__has_feature)
+#define __has_feature(x) 0
+#endif
+#if __has_feature(memory_sanitizer) && !defined(OPENSSL_NO_ASM)
+#define OPENSSL_NO_ASM
+#endif
+
+#if !defined(OPENSSL_NO_ASM)
+#if defined(__aarch64__)
+#if defined(BORINGSSL_PREFIX)
+#include <boringssl_prefix_symbols_asm.h>
+#endif
+// Copyright 2014-2016 The OpenSSL Project Authors. All Rights Reserved.
+//
+// Licensed under the OpenSSL license (the "License").  You may not use
+// this file except in compliance with the License.  You can obtain a copy
+// in the file LICENSE in the source distribution or at
+// https://www.openssl.org/source/license.html
+
+// ====================================================================
+// Written by Andy Polyakov <[email protected]> for the OpenSSL
+// project. The module is, however, dual licensed under OpenSSL and
+// CRYPTOGAMS licenses depending on where you obtain it. For further
+// details see http://www.openssl.org/~appro/cryptogams/.
+//
+// Permission to use under GPLv2 terms is granted.
+// ====================================================================
+//
+// SHA256/512 for ARMv8.
+//
+// Performance in cycles per processed byte and improvement coefficient
+// over code generated with "default" compiler:
+//
+//		SHA256-hw	SHA256(*)	SHA512
+// Apple A7	1.97		10.5 (+33%)	6.73 (-1%(**))
+// Cortex-A53	2.38		15.5 (+115%)	10.0 (+150%(***))
+// Cortex-A57	2.31		11.6 (+86%)	7.51 (+260%(***))
+// Denver	2.01		10.5 (+26%)	6.70 (+8%)
+// X-Gene			20.0 (+100%)	12.8 (+300%(***))
+// Mongoose	2.36		13.0 (+50%)	8.36 (+33%)
+//
+// (*)	Software SHA256 results are of lesser relevance, presented
+//	mostly for informational purposes.
+// (**)	The result is a trade-off: it's possible to improve it by
+//	10% (or by 1 cycle per round), but at the cost of 20% loss
+//	on Cortex-A53 (or by 4 cycles per round).
+// (***)	Super-impressive coefficients over gcc-generated code are
+//	an indication of some compiler "pathology": most notably, code
+//	generated with -mgeneral-regs-only is significantly faster
+//	and the gap is then only 40-90%.
+
+#ifndef	__KERNEL__
+# include <openssl/arm_arch.h>
+#endif
+
+.text
+
+
+.hidden	OPENSSL_armcap_P
+.globl	sha256_block_data_order
+.hidden	sha256_block_data_order
+.type	sha256_block_data_order,%function
+.align	6
+sha256_block_data_order:
+	AARCH64_VALID_CALL_TARGET
+#ifndef	__KERNEL__
+#if __has_feature(hwaddress_sanitizer) && __clang_major__ >= 10
+	adrp	x16,:pg_hi21_nc:OPENSSL_armcap_P
+#else
+	adrp	x16,OPENSSL_armcap_P
+#endif
+	ldr	w16,[x16,:lo12:OPENSSL_armcap_P]
+	tst	w16,#ARMV8_SHA256
+	b.ne	.Lv8_entry
+#endif
+	AARCH64_SIGN_LINK_REGISTER
+	stp	x29,x30,[sp,#-128]!
+	add	x29,sp,#0
+
+	stp	x19,x20,[sp,#16]
+	stp	x21,x22,[sp,#32]
+	stp	x23,x24,[sp,#48]
+	stp	x25,x26,[sp,#64]
+	stp	x27,x28,[sp,#80]
+	sub	sp,sp,#4*4
+
+	ldp	w20,w21,[x0]				// load context
+	ldp	w22,w23,[x0,#2*4]
+	ldp	w24,w25,[x0,#4*4]
+	add	x2,x1,x2,lsl#6	// end of input
+	ldp	w26,w27,[x0,#6*4]
+	adrp	x30,.LK256
+	add	x30,x30,:lo12:.LK256
+	stp	x0,x2,[x29,#96]
+
+.Loop:
+	ldp	w3,w4,[x1],#2*4
+	ldr	w19,[x30],#4			// *K++
+	eor	w28,w21,w22				// magic seed
+	str	x1,[x29,#112]
+#ifndef	__ARMEB__
+	rev	w3,w3			// 0
+#endif
+	ror	w16,w24,#6
+	add	w27,w27,w19			// h+=K[i]
+	eor	w6,w24,w24,ror#14
+	and	w17,w25,w24
+	bic	w19,w26,w24
+	add	w27,w27,w3			// h+=X[i]
+	orr	w17,w17,w19			// Ch(e,f,g)
+	eor	w19,w20,w21			// a^b, b^c in next round
+	eor	w16,w16,w6,ror#11	// Sigma1(e)
+	ror	w6,w20,#2
+	add	w27,w27,w17			// h+=Ch(e,f,g)
+	eor	w17,w20,w20,ror#9
+	add	w27,w27,w16			// h+=Sigma1(e)
+	and	w28,w28,w19			// (b^c)&=(a^b)
+	add	w23,w23,w27			// d+=h
+	eor	w28,w28,w21			// Maj(a,b,c)
+	eor	w17,w6,w17,ror#13	// Sigma0(a)
+	add	w27,w27,w28			// h+=Maj(a,b,c)
+	ldr	w28,[x30],#4		// *K++, w19 in next round
+	//add	w27,w27,w17			// h+=Sigma0(a)
+#ifndef	__ARMEB__
+	rev	w4,w4			// 1
+#endif
+	ldp	w5,w6,[x1],#2*4
+	add	w27,w27,w17			// h+=Sigma0(a)
+	ror	w16,w23,#6
+	add	w26,w26,w28			// h+=K[i]
+	eor	w7,w23,w23,ror#14
+	and	w17,w24,w23
+	bic	w28,w25,w23
+	add	w26,w26,w4			// h+=X[i]
+	orr	w17,w17,w28			// Ch(e,f,g)
+	eor	w28,w27,w20			// a^b, b^c in next round
+	eor	w16,w16,w7,ror#11	// Sigma1(e)
+	ror	w7,w27,#2
+	add	w26,w26,w17			// h+=Ch(e,f,g)
+	eor	w17,w27,w27,ror#9
+	add	w26,w26,w16			// h+=Sigma1(e)
+	and	w19,w19,w28			// (b^c)&=(a^b)
+	add	w22,w22,w26			// d+=h
+	eor	w19,w19,w20			// Maj(a,b,c)
+	eor	w17,w7,w17,ror#13	// Sigma0(a)
+	add	w26,w26,w19			// h+=Maj(a,b,c)
+	ldr	w19,[x30],#4		// *K++, w28 in next round
+	//add	w26,w26,w17			// h+=Sigma0(a)
+#ifndef	__ARMEB__
+	rev	w5,w5			// 2
+#endif
+	add	w26,w26,w17			// h+=Sigma0(a)
+	ror	w16,w22,#6
+	add	w25,w25,w19			// h+=K[i]
+	eor	w8,w22,w22,ror#14
+	and	w17,w23,w22
+	bic	w19,w24,w22
+	add	w25,w25,w5			// h+=X[i]
+	orr	w17,w17,w19			// Ch(e,f,g)
+	eor	w19,w26,w27			// a^b, b^c in next round
+	eor	w16,w16,w8,ror#11	// Sigma1(e)
+	ror	w8,w26,#2
+	add	w25,w25,w17			// h+=Ch(e,f,g)
+	eor	w17,w26,w26,ror#9
+	add	w25,w25,w16			// h+=Sigma1(e)
+	and	w28,w28,w19			// (b^c)&=(a^b)
+	add	w21,w21,w25			// d+=h
+	eor	w28,w28,w27			// Maj(a,b,c)
+	eor	w17,w8,w17,ror#13	// Sigma0(a)
+	add	w25,w25,w28			// h+=Maj(a,b,c)
+	ldr	w28,[x30],#4		// *K++, w19 in next round
+	//add	w25,w25,w17			// h+=Sigma0(a)
+#ifndef	__ARMEB__
+	rev	w6,w6			// 3
+#endif
+	ldp	w7,w8,[x1],#2*4
+	add	w25,w25,w17			// h+=Sigma0(a)
+	ror	w16,w21,#6
+	add	w24,w24,w28			// h+=K[i]
+	eor	w9,w21,w21,ror#14
+	and	w17,w22,w21
+	bic	w28,w23,w21
+	add	w24,w24,w6			// h+=X[i]
+	orr	w17,w17,w28			// Ch(e,f,g)
+	eor	w28,w25,w26			// a^b, b^c in next round
+	eor	w16,w16,w9,ror#11	// Sigma1(e)
+	ror	w9,w25,#2
+	add	w24,w24,w17			// h+=Ch(e,f,g)
+	eor	w17,w25,w25,ror#9
+	add	w24,w24,w16			// h+=Sigma1(e)
+	and	w19,w19,w28			// (b^c)&=(a^b)
+	add	w20,w20,w24			// d+=h
+	eor	w19,w19,w26			// Maj(a,b,c)
+	eor	w17,w9,w17,ror#13	// Sigma0(a)
+	add	w24,w24,w19			// h+=Maj(a,b,c)
+	ldr	w19,[x30],#4		// *K++, w28 in next round
+	//add	w24,w24,w17			// h+=Sigma0(a)
+#ifndef	__ARMEB__
+	rev	w7,w7			// 4
+#endif
+	add	w24,w24,w17			// h+=Sigma0(a)
+	ror	w16,w20,#6
+	add	w23,w23,w19			// h+=K[i]
+	eor	w10,w20,w20,ror#14
+	and	w17,w21,w20
+	bic	w19,w22,w20
+	add	w23,w23,w7			// h+=X[i]
+	orr	w17,w17,w19			// Ch(e,f,g)
+	eor	w19,w24,w25			// a^b, b^c in next round
+	eor	w16,w16,w10,ror#11	// Sigma1(e)
+	ror	w10,w24,#2
+	add	w23,w23,w17			// h+=Ch(e,f,g)
+	eor	w17,w24,w24,ror#9
+	add	w23,w23,w16			// h+=Sigma1(e)
+	and	w28,w28,w19			// (b^c)&=(a^b)
+	add	w27,w27,w23			// d+=h
+	eor	w28,w28,w25			// Maj(a,b,c)
+	eor	w17,w10,w17,ror#13	// Sigma0(a)
+	add	w23,w23,w28			// h+=Maj(a,b,c)
+	ldr	w28,[x30],#4		// *K++, w19 in next round
+	//add	w23,w23,w17			// h+=Sigma0(a)
+#ifndef	__ARMEB__
+	rev	w8,w8			// 5
+#endif
+	ldp	w9,w10,[x1],#2*4
+	add	w23,w23,w17			// h+=Sigma0(a)
+	ror	w16,w27,#6
+	add	w22,w22,w28			// h+=K[i]
+	eor	w11,w27,w27,ror#14
+	and	w17,w20,w27
+	bic	w28,w21,w27
+	add	w22,w22,w8			// h+=X[i]
+	orr	w17,w17,w28			// Ch(e,f,g)
+	eor	w28,w23,w24			// a^b, b^c in next round
+	eor	w16,w16,w11,ror#11	// Sigma1(e)
+	ror	w11,w23,#2
+	add	w22,w22,w17			// h+=Ch(e,f,g)
+	eor	w17,w23,w23,ror#9
+	add	w22,w22,w16			// h+=Sigma1(e)
+	and	w19,w19,w28			// (b^c)&=(a^b)
+	add	w26,w26,w22			// d+=h
+	eor	w19,w19,w24			// Maj(a,b,c)
+	eor	w17,w11,w17,ror#13	// Sigma0(a)
+	add	w22,w22,w19			// h+=Maj(a,b,c)
+	ldr	w19,[x30],#4		// *K++, w28 in next round
+	//add	w22,w22,w17			// h+=Sigma0(a)
+#ifndef	__ARMEB__
+	rev	w9,w9			// 6
+#endif
+	add	w22,w22,w17			// h+=Sigma0(a)
+	ror	w16,w26,#6
+	add	w21,w21,w19			// h+=K[i]
+	eor	w12,w26,w26,ror#14
+	and	w17,w27,w26
+	bic	w19,w20,w26
+	add	w21,w21,w9			// h+=X[i]
+	orr	w17,w17,w19			// Ch(e,f,g)
+	eor	w19,w22,w23			// a^b, b^c in next round
+	eor	w16,w16,w12,ror#11	// Sigma1(e)
+	ror	w12,w22,#2
+	add	w21,w21,w17			// h+=Ch(e,f,g)
+	eor	w17,w22,w22,ror#9
+	add	w21,w21,w16			// h+=Sigma1(e)
+	and	w28,w28,w19			// (b^c)&=(a^b)
+	add	w25,w25,w21			// d+=h
+	eor	w28,w28,w23			// Maj(a,b,c)
+	eor	w17,w12,w17,ror#13	// Sigma0(a)
+	add	w21,w21,w28			// h+=Maj(a,b,c)
+	ldr	w28,[x30],#4		// *K++, w19 in next round
+	//add	w21,w21,w17			// h+=Sigma0(a)
+#ifndef	__ARMEB__
+	rev	w10,w10			// 7
+#endif
+	ldp	w11,w12,[x1],#2*4
+	add	w21,w21,w17			// h+=Sigma0(a)
+	ror	w16,w25,#6
+	add	w20,w20,w28			// h+=K[i]
+	eor	w13,w25,w25,ror#14
+	and	w17,w26,w25
+	bic	w28,w27,w25
+	add	w20,w20,w10			// h+=X[i]
+	orr	w17,w17,w28			// Ch(e,f,g)
+	eor	w28,w21,w22			// a^b, b^c in next round
+	eor	w16,w16,w13,ror#11	// Sigma1(e)
+	ror	w13,w21,#2
+	add	w20,w20,w17			// h+=Ch(e,f,g)
+	eor	w17,w21,w21,ror#9
+	add	w20,w20,w16			// h+=Sigma1(e)
+	and	w19,w19,w28			// (b^c)&=(a^b)
+	add	w24,w24,w20			// d+=h
+	eor	w19,w19,w22			// Maj(a,b,c)
+	eor	w17,w13,w17,ror#13	// Sigma0(a)
+	add	w20,w20,w19			// h+=Maj(a,b,c)
+	ldr	w19,[x30],#4		// *K++, w28 in next round
+	//add	w20,w20,w17			// h+=Sigma0(a)
+#ifndef	__ARMEB__
+	rev	w11,w11			// 8
+#endif
+	add	w20,w20,w17			// h+=Sigma0(a)
+	ror	w16,w24,#6
+	add	w27,w27,w19			// h+=K[i]
+	eor	w14,w24,w24,ror#14
+	and	w17,w25,w24
+	bic	w19,w26,w24
+	add	w27,w27,w11			// h+=X[i]
+	orr	w17,w17,w19			// Ch(e,f,g)
+	eor	w19,w20,w21			// a^b, b^c in next round
+	eor	w16,w16,w14,ror#11	// Sigma1(e)
+	ror	w14,w20,#2
+	add	w27,w27,w17			// h+=Ch(e,f,g)
+	eor	w17,w20,w20,ror#9
+	add	w27,w27,w16			// h+=Sigma1(e)
+	and	w28,w28,w19			// (b^c)&=(a^b)
+	add	w23,w23,w27			// d+=h
+	eor	w28,w28,w21			// Maj(a,b,c)
+	eor	w17,w14,w17,ror#13	// Sigma0(a)
+	add	w27,w27,w28			// h+=Maj(a,b,c)
+	ldr	w28,[x30],#4		// *K++, w19 in next round
+	//add	w27,w27,w17			// h+=Sigma0(a)
+#ifndef	__ARMEB__
+	rev	w12,w12			// 9
+#endif
+	ldp	w13,w14,[x1],#2*4
+	add	w27,w27,w17			// h+=Sigma0(a)
+	ror	w16,w23,#6
+	add	w26,w26,w28			// h+=K[i]
+	eor	w15,w23,w23,ror#14
+	and	w17,w24,w23
+	bic	w28,w25,w23
+	add	w26,w26,w12			// h+=X[i]
+	orr	w17,w17,w28			// Ch(e,f,g)
+	eor	w28,w27,w20			// a^b, b^c in next round
+	eor	w16,w16,w15,ror#11	// Sigma1(e)
+	ror	w15,w27,#2
+	add	w26,w26,w17			// h+=Ch(e,f,g)
+	eor	w17,w27,w27,ror#9
+	add	w26,w26,w16			// h+=Sigma1(e)
+	and	w19,w19,w28			// (b^c)&=(a^b)
+	add	w22,w22,w26			// d+=h
+	eor	w19,w19,w20			// Maj(a,b,c)
+	eor	w17,w15,w17,ror#13	// Sigma0(a)
+	add	w26,w26,w19			// h+=Maj(a,b,c)
+	ldr	w19,[x30],#4		// *K++, w28 in next round
+	//add	w26,w26,w17			// h+=Sigma0(a)
+#ifndef	__ARMEB__
+	rev	w13,w13			// 10
+#endif
+	add	w26,w26,w17			// h+=Sigma0(a)
+	ror	w16,w22,#6
+	add	w25,w25,w19			// h+=K[i]
+	eor	w0,w22,w22,ror#14
+	and	w17,w23,w22
+	bic	w19,w24,w22
+	add	w25,w25,w13			// h+=X[i]
+	orr	w17,w17,w19			// Ch(e,f,g)
+	eor	w19,w26,w27			// a^b, b^c in next round
+	eor	w16,w16,w0,ror#11	// Sigma1(e)
+	ror	w0,w26,#2
+	add	w25,w25,w17			// h+=Ch(e,f,g)
+	eor	w17,w26,w26,ror#9
+	add	w25,w25,w16			// h+=Sigma1(e)
+	and	w28,w28,w19			// (b^c)&=(a^b)
+	add	w21,w21,w25			// d+=h
+	eor	w28,w28,w27			// Maj(a,b,c)
+	eor	w17,w0,w17,ror#13	// Sigma0(a)
+	add	w25,w25,w28			// h+=Maj(a,b,c)
+	ldr	w28,[x30],#4		// *K++, w19 in next round
+	//add	w25,w25,w17			// h+=Sigma0(a)
+#ifndef	__ARMEB__
+	rev	w14,w14			// 11
+#endif
+	ldp	w15,w0,[x1],#2*4
+	add	w25,w25,w17			// h+=Sigma0(a)
+	str	w6,[sp,#12]
+	ror	w16,w21,#6
+	add	w24,w24,w28			// h+=K[i]
+	eor	w6,w21,w21,ror#14
+	and	w17,w22,w21
+	bic	w28,w23,w21
+	add	w24,w24,w14			// h+=X[i]
+	orr	w17,w17,w28			// Ch(e,f,g)
+	eor	w28,w25,w26			// a^b, b^c in next round
+	eor	w16,w16,w6,ror#11	// Sigma1(e)
+	ror	w6,w25,#2
+	add	w24,w24,w17			// h+=Ch(e,f,g)
+	eor	w17,w25,w25,ror#9
+	add	w24,w24,w16			// h+=Sigma1(e)
+	and	w19,w19,w28			// (b^c)&=(a^b)
+	add	w20,w20,w24			// d+=h
+	eor	w19,w19,w26			// Maj(a,b,c)
+	eor	w17,w6,w17,ror#13	// Sigma0(a)
+	add	w24,w24,w19			// h+=Maj(a,b,c)
+	ldr	w19,[x30],#4		// *K++, w28 in next round
+	//add	w24,w24,w17			// h+=Sigma0(a)
+#ifndef	__ARMEB__
+	rev	w15,w15			// 12
+#endif
+	add	w24,w24,w17			// h+=Sigma0(a)
+	str	w7,[sp,#0]
+	ror	w16,w20,#6
+	add	w23,w23,w19			// h+=K[i]
+	eor	w7,w20,w20,ror#14
+	and	w17,w21,w20
+	bic	w19,w22,w20
+	add	w23,w23,w15			// h+=X[i]
+	orr	w17,w17,w19			// Ch(e,f,g)
+	eor	w19,w24,w25			// a^b, b^c in next round
+	eor	w16,w16,w7,ror#11	// Sigma1(e)
+	ror	w7,w24,#2
+	add	w23,w23,w17			// h+=Ch(e,f,g)
+	eor	w17,w24,w24,ror#9
+	add	w23,w23,w16			// h+=Sigma1(e)
+	and	w28,w28,w19			// (b^c)&=(a^b)
+	add	w27,w27,w23			// d+=h
+	eor	w28,w28,w25			// Maj(a,b,c)
+	eor	w17,w7,w17,ror#13	// Sigma0(a)
+	add	w23,w23,w28			// h+=Maj(a,b,c)
+	ldr	w28,[x30],#4		// *K++, w19 in next round
+	//add	w23,w23,w17			// h+=Sigma0(a)
+#ifndef	__ARMEB__
+	rev	w0,w0			// 13
+#endif
+	ldp	w1,w2,[x1]
+	add	w23,w23,w17			// h+=Sigma0(a)
+	str	w8,[sp,#4]
+	ror	w16,w27,#6
+	add	w22,w22,w28			// h+=K[i]
+	eor	w8,w27,w27,ror#14
+	and	w17,w20,w27
+	bic	w28,w21,w27
+	add	w22,w22,w0			// h+=X[i]
+	orr	w17,w17,w28			// Ch(e,f,g)
+	eor	w28,w23,w24			// a^b, b^c in next round
+	eor	w16,w16,w8,ror#11	// Sigma1(e)
+	ror	w8,w23,#2
+	add	w22,w22,w17			// h+=Ch(e,f,g)
+	eor	w17,w23,w23,ror#9
+	add	w22,w22,w16			// h+=Sigma1(e)
+	and	w19,w19,w28			// (b^c)&=(a^b)
+	add	w26,w26,w22			// d+=h
+	eor	w19,w19,w24			// Maj(a,b,c)
+	eor	w17,w8,w17,ror#13	// Sigma0(a)
+	add	w22,w22,w19			// h+=Maj(a,b,c)
+	ldr	w19,[x30],#4		// *K++, w28 in next round
+	//add	w22,w22,w17			// h+=Sigma0(a)
+#ifndef	__ARMEB__
+	rev	w1,w1			// 14
+#endif
+	ldr	w6,[sp,#12]
+	add	w22,w22,w17			// h+=Sigma0(a)
+	str	w9,[sp,#8]
+	ror	w16,w26,#6
+	add	w21,w21,w19			// h+=K[i]
+	eor	w9,w26,w26,ror#14
+	and	w17,w27,w26
+	bic	w19,w20,w26
+	add	w21,w21,w1			// h+=X[i]
+	orr	w17,w17,w19			// Ch(e,f,g)
+	eor	w19,w22,w23			// a^b, b^c in next round
+	eor	w16,w16,w9,ror#11	// Sigma1(e)
+	ror	w9,w22,#2
+	add	w21,w21,w17			// h+=Ch(e,f,g)
+	eor	w17,w22,w22,ror#9
+	add	w21,w21,w16			// h+=Sigma1(e)
+	and	w28,w28,w19			// (b^c)&=(a^b)
+	add	w25,w25,w21			// d+=h
+	eor	w28,w28,w23			// Maj(a,b,c)
+	eor	w17,w9,w17,ror#13	// Sigma0(a)
+	add	w21,w21,w28			// h+=Maj(a,b,c)
+	ldr	w28,[x30],#4		// *K++, w19 in next round
+	//add	w21,w21,w17			// h+=Sigma0(a)
+#ifndef	__ARMEB__
+	rev	w2,w2			// 15
+#endif
+	ldr	w7,[sp,#0]
+	add	w21,w21,w17			// h+=Sigma0(a)
+	str	w10,[sp,#12]
+	ror	w16,w25,#6
+	add	w20,w20,w28			// h+=K[i]
+	ror	w9,w4,#7
+	and	w17,w26,w25
+	ror	w8,w1,#17
+	bic	w28,w27,w25
+	ror	w10,w21,#2
+	add	w20,w20,w2			// h+=X[i]
+	eor	w16,w16,w25,ror#11
+	eor	w9,w9,w4,ror#18
+	orr	w17,w17,w28			// Ch(e,f,g)
+	eor	w28,w21,w22			// a^b, b^c in next round
+	eor	w16,w16,w25,ror#25	// Sigma1(e)
+	eor	w10,w10,w21,ror#13
+	add	w20,w20,w17			// h+=Ch(e,f,g)
+	and	w19,w19,w28			// (b^c)&=(a^b)
+	eor	w8,w8,w1,ror#19
+	eor	w9,w9,w4,lsr#3	// sigma0(X[i+1])
+	add	w20,w20,w16			// h+=Sigma1(e)
+	eor	w19,w19,w22			// Maj(a,b,c)
+	eor	w17,w10,w21,ror#22	// Sigma0(a)
+	eor	w8,w8,w1,lsr#10	// sigma1(X[i+14])
+	add	w3,w3,w12
+	add	w24,w24,w20			// d+=h
+	add	w20,w20,w19			// h+=Maj(a,b,c)
+	ldr	w19,[x30],#4		// *K++, w28 in next round
+	add	w3,w3,w9
+	add	w20,w20,w17			// h+=Sigma0(a)
+	add	w3,w3,w8
+.Loop_16_xx:
+	ldr	w8,[sp,#4]
+	str	w11,[sp,#0]
+	ror	w16,w24,#6
+	add	w27,w27,w19			// h+=K[i]
+	ror	w10,w5,#7
+	and	w17,w25,w24
+	ror	w9,w2,#17
+	bic	w19,w26,w24
+	ror	w11,w20,#2
+	add	w27,w27,w3			// h+=X[i]
+	eor	w16,w16,w24,ror#11
+	eor	w10,w10,w5,ror#18
+	orr	w17,w17,w19			// Ch(e,f,g)
+	eor	w19,w20,w21			// a^b, b^c in next round
+	eor	w16,w16,w24,ror#25	// Sigma1(e)
+	eor	w11,w11,w20,ror#13
+	add	w27,w27,w17			// h+=Ch(e,f,g)
+	and	w28,w28,w19			// (b^c)&=(a^b)
+	eor	w9,w9,w2,ror#19
+	eor	w10,w10,w5,lsr#3	// sigma0(X[i+1])
+	add	w27,w27,w16			// h+=Sigma1(e)
+	eor	w28,w28,w21			// Maj(a,b,c)
+	eor	w17,w11,w20,ror#22	// Sigma0(a)
+	eor	w9,w9,w2,lsr#10	// sigma1(X[i+14])
+	add	w4,w4,w13
+	add	w23,w23,w27			// d+=h
+	add	w27,w27,w28			// h+=Maj(a,b,c)
+	ldr	w28,[x30],#4		// *K++, w19 in next round
+	add	w4,w4,w10
+	add	w27,w27,w17			// h+=Sigma0(a)
+	add	w4,w4,w9
+	ldr	w9,[sp,#8]
+	str	w12,[sp,#4]
+	ror	w16,w23,#6
+	add	w26,w26,w28			// h+=K[i]
+	ror	w11,w6,#7
+	and	w17,w24,w23
+	ror	w10,w3,#17
+	bic	w28,w25,w23
+	ror	w12,w27,#2
+	add	w26,w26,w4			// h+=X[i]
+	eor	w16,w16,w23,ror#11
+	eor	w11,w11,w6,ror#18
+	orr	w17,w17,w28			// Ch(e,f,g)
+	eor	w28,w27,w20			// a^b, b^c in next round
+	eor	w16,w16,w23,ror#25	// Sigma1(e)
+	eor	w12,w12,w27,ror#13
+	add	w26,w26,w17			// h+=Ch(e,f,g)
+	and	w19,w19,w28			// (b^c)&=(a^b)
+	eor	w10,w10,w3,ror#19
+	eor	w11,w11,w6,lsr#3	// sigma0(X[i+1])
+	add	w26,w26,w16			// h+=Sigma1(e)
+	eor	w19,w19,w20			// Maj(a,b,c)
+	eor	w17,w12,w27,ror#22	// Sigma0(a)
+	eor	w10,w10,w3,lsr#10	// sigma1(X[i+14])
+	add	w5,w5,w14
+	add	w22,w22,w26			// d+=h
+	add	w26,w26,w19			// h+=Maj(a,b,c)
+	ldr	w19,[x30],#4		// *K++, w28 in next round
+	add	w5,w5,w11
+	add	w26,w26,w17			// h+=Sigma0(a)
+	add	w5,w5,w10
+	ldr	w10,[sp,#12]
+	str	w13,[sp,#8]
+	ror	w16,w22,#6
+	add	w25,w25,w19			// h+=K[i]
+	ror	w12,w7,#7
+	and	w17,w23,w22
+	ror	w11,w4,#17
+	bic	w19,w24,w22
+	ror	w13,w26,#2
+	add	w25,w25,w5			// h+=X[i]
+	eor	w16,w16,w22,ror#11
+	eor	w12,w12,w7,ror#18
+	orr	w17,w17,w19			// Ch(e,f,g)
+	eor	w19,w26,w27			// a^b, b^c in next round
+	eor	w16,w16,w22,ror#25	// Sigma1(e)
+	eor	w13,w13,w26,ror#13
+	add	w25,w25,w17			// h+=Ch(e,f,g)
+	and	w28,w28,w19			// (b^c)&=(a^b)
+	eor	w11,w11,w4,ror#19
+	eor	w12,w12,w7,lsr#3	// sigma0(X[i+1])
+	add	w25,w25,w16			// h+=Sigma1(e)
+	eor	w28,w28,w27			// Maj(a,b,c)
+	eor	w17,w13,w26,ror#22	// Sigma0(a)
+	eor	w11,w11,w4,lsr#10	// sigma1(X[i+14])
+	add	w6,w6,w15
+	add	w21,w21,w25			// d+=h
+	add	w25,w25,w28			// h+=Maj(a,b,c)
+	ldr	w28,[x30],#4		// *K++, w19 in next round
+	add	w6,w6,w12
+	add	w25,w25,w17			// h+=Sigma0(a)
+	add	w6,w6,w11
+	ldr	w11,[sp,#0]
+	str	w14,[sp,#12]
+	ror	w16,w21,#6
+	add	w24,w24,w28			// h+=K[i]
+	ror	w13,w8,#7
+	and	w17,w22,w21
+	ror	w12,w5,#17
+	bic	w28,w23,w21
+	ror	w14,w25,#2
+	add	w24,w24,w6			// h+=X[i]
+	eor	w16,w16,w21,ror#11
+	eor	w13,w13,w8,ror#18
+	orr	w17,w17,w28			// Ch(e,f,g)
+	eor	w28,w25,w26			// a^b, b^c in next round
+	eor	w16,w16,w21,ror#25	// Sigma1(e)
+	eor	w14,w14,w25,ror#13
+	add	w24,w24,w17			// h+=Ch(e,f,g)
+	and	w19,w19,w28			// (b^c)&=(a^b)
+	eor	w12,w12,w5,ror#19
+	eor	w13,w13,w8,lsr#3	// sigma0(X[i+1])
+	add	w24,w24,w16			// h+=Sigma1(e)
+	eor	w19,w19,w26			// Maj(a,b,c)
+	eor	w17,w14,w25,ror#22	// Sigma0(a)
+	eor	w12,w12,w5,lsr#10	// sigma1(X[i+14])
+	add	w7,w7,w0
+	add	w20,w20,w24			// d+=h
+	add	w24,w24,w19			// h+=Maj(a,b,c)
+	ldr	w19,[x30],#4		// *K++, w28 in next round
+	add	w7,w7,w13
+	add	w24,w24,w17			// h+=Sigma0(a)
+	add	w7,w7,w12
+	ldr	w12,[sp,#4]
+	str	w15,[sp,#0]
+	ror	w16,w20,#6
+	add	w23,w23,w19			// h+=K[i]
+	ror	w14,w9,#7
+	and	w17,w21,w20
+	ror	w13,w6,#17
+	bic	w19,w22,w20
+	ror	w15,w24,#2
+	add	w23,w23,w7			// h+=X[i]
+	eor	w16,w16,w20,ror#11
+	eor	w14,w14,w9,ror#18
+	orr	w17,w17,w19			// Ch(e,f,g)
+	eor	w19,w24,w25			// a^b, b^c in next round
+	eor	w16,w16,w20,ror#25	// Sigma1(e)
+	eor	w15,w15,w24,ror#13
+	add	w23,w23,w17			// h+=Ch(e,f,g)
+	and	w28,w28,w19			// (b^c)&=(a^b)
+	eor	w13,w13,w6,ror#19
+	eor	w14,w14,w9,lsr#3	// sigma0(X[i+1])
+	add	w23,w23,w16			// h+=Sigma1(e)
+	eor	w28,w28,w25			// Maj(a,b,c)
+	eor	w17,w15,w24,ror#22	// Sigma0(a)
+	eor	w13,w13,w6,lsr#10	// sigma1(X[i+14])
+	add	w8,w8,w1
+	add	w27,w27,w23			// d+=h
+	add	w23,w23,w28			// h+=Maj(a,b,c)
+	ldr	w28,[x30],#4		// *K++, w19 in next round
+	add	w8,w8,w14
+	add	w23,w23,w17			// h+=Sigma0(a)
+	add	w8,w8,w13
+	ldr	w13,[sp,#8]
+	str	w0,[sp,#4]
+	ror	w16,w27,#6
+	add	w22,w22,w28			// h+=K[i]
+	ror	w15,w10,#7
+	and	w17,w20,w27
+	ror	w14,w7,#17
+	bic	w28,w21,w27
+	ror	w0,w23,#2
+	add	w22,w22,w8			// h+=X[i]
+	eor	w16,w16,w27,ror#11
+	eor	w15,w15,w10,ror#18
+	orr	w17,w17,w28			// Ch(e,f,g)
+	eor	w28,w23,w24			// a^b, b^c in next round
+	eor	w16,w16,w27,ror#25	// Sigma1(e)
+	eor	w0,w0,w23,ror#13
+	add	w22,w22,w17			// h+=Ch(e,f,g)
+	and	w19,w19,w28			// (b^c)&=(a^b)
+	eor	w14,w14,w7,ror#19
+	eor	w15,w15,w10,lsr#3	// sigma0(X[i+1])
+	add	w22,w22,w16			// h+=Sigma1(e)
+	eor	w19,w19,w24			// Maj(a,b,c)
+	eor	w17,w0,w23,ror#22	// Sigma0(a)
+	eor	w14,w14,w7,lsr#10	// sigma1(X[i+14])
+	add	w9,w9,w2
+	add	w26,w26,w22			// d+=h
+	add	w22,w22,w19			// h+=Maj(a,b,c)
+	ldr	w19,[x30],#4		// *K++, w28 in next round
+	add	w9,w9,w15
+	add	w22,w22,w17			// h+=Sigma0(a)
+	add	w9,w9,w14
+	ldr	w14,[sp,#12]
+	str	w1,[sp,#8]
+	ror	w16,w26,#6
+	add	w21,w21,w19			// h+=K[i]
+	ror	w0,w11,#7
+	and	w17,w27,w26
+	ror	w15,w8,#17
+	bic	w19,w20,w26
+	ror	w1,w22,#2
+	add	w21,w21,w9			// h+=X[i]
+	eor	w16,w16,w26,ror#11
+	eor	w0,w0,w11,ror#18
+	orr	w17,w17,w19			// Ch(e,f,g)
+	eor	w19,w22,w23			// a^b, b^c in next round
+	eor	w16,w16,w26,ror#25	// Sigma1(e)
+	eor	w1,w1,w22,ror#13
+	add	w21,w21,w17			// h+=Ch(e,f,g)
+	and	w28,w28,w19			// (b^c)&=(a^b)
+	eor	w15,w15,w8,ror#19
+	eor	w0,w0,w11,lsr#3	// sigma0(X[i+1])
+	add	w21,w21,w16			// h+=Sigma1(e)
+	eor	w28,w28,w23			// Maj(a,b,c)
+	eor	w17,w1,w22,ror#22	// Sigma0(a)
+	eor	w15,w15,w8,lsr#10	// sigma1(X[i+14])
+	add	w10,w10,w3
+	add	w25,w25,w21			// d+=h
+	add	w21,w21,w28			// h+=Maj(a,b,c)
+	ldr	w28,[x30],#4		// *K++, w19 in next round
+	add	w10,w10,w0
+	add	w21,w21,w17			// h+=Sigma0(a)
+	add	w10,w10,w15
+	ldr	w15,[sp,#0]
+	str	w2,[sp,#12]
+	ror	w16,w25,#6
+	add	w20,w20,w28			// h+=K[i]
+	ror	w1,w12,#7
+	and	w17,w26,w25
+	ror	w0,w9,#17
+	bic	w28,w27,w25
+	ror	w2,w21,#2
+	add	w20,w20,w10			// h+=X[i]
+	eor	w16,w16,w25,ror#11
+	eor	w1,w1,w12,ror#18
+	orr	w17,w17,w28			// Ch(e,f,g)
+	eor	w28,w21,w22			// a^b, b^c in next round
+	eor	w16,w16,w25,ror#25	// Sigma1(e)
+	eor	w2,w2,w21,ror#13
+	add	w20,w20,w17			// h+=Ch(e,f,g)
+	and	w19,w19,w28			// (b^c)&=(a^b)
+	eor	w0,w0,w9,ror#19
+	eor	w1,w1,w12,lsr#3	// sigma0(X[i+1])
+	add	w20,w20,w16			// h+=Sigma1(e)
+	eor	w19,w19,w22			// Maj(a,b,c)
+	eor	w17,w2,w21,ror#22	// Sigma0(a)
+	eor	w0,w0,w9,lsr#10	// sigma1(X[i+14])
+	add	w11,w11,w4
+	add	w24,w24,w20			// d+=h
+	add	w20,w20,w19			// h+=Maj(a,b,c)
+	ldr	w19,[x30],#4		// *K++, w28 in next round
+	add	w11,w11,w1
+	add	w20,w20,w17			// h+=Sigma0(a)
+	add	w11,w11,w0
+	ldr	w0,[sp,#4]
+	str	w3,[sp,#0]
+	ror	w16,w24,#6
+	add	w27,w27,w19			// h+=K[i]
+	ror	w2,w13,#7
+	and	w17,w25,w24
+	ror	w1,w10,#17
+	bic	w19,w26,w24
+	ror	w3,w20,#2
+	add	w27,w27,w11			// h+=X[i]
+	eor	w16,w16,w24,ror#11
+	eor	w2,w2,w13,ror#18
+	orr	w17,w17,w19			// Ch(e,f,g)
+	eor	w19,w20,w21			// a^b, b^c in next round
+	eor	w16,w16,w24,ror#25	// Sigma1(e)
+	eor	w3,w3,w20,ror#13
+	add	w27,w27,w17			// h+=Ch(e,f,g)
+	and	w28,w28,w19			// (b^c)&=(a^b)
+	eor	w1,w1,w10,ror#19
+	eor	w2,w2,w13,lsr#3	// sigma0(X[i+1])
+	add	w27,w27,w16			// h+=Sigma1(e)
+	eor	w28,w28,w21			// Maj(a,b,c)
+	eor	w17,w3,w20,ror#22	// Sigma0(a)
+	eor	w1,w1,w10,lsr#10	// sigma1(X[i+14])
+	add	w12,w12,w5
+	add	w23,w23,w27			// d+=h
+	add	w27,w27,w28			// h+=Maj(a,b,c)
+	ldr	w28,[x30],#4		// *K++, w19 in next round
+	add	w12,w12,w2
+	add	w27,w27,w17			// h+=Sigma0(a)
+	add	w12,w12,w1
+	ldr	w1,[sp,#8]
+	str	w4,[sp,#4]
+	ror	w16,w23,#6
+	add	w26,w26,w28			// h+=K[i]
+	ror	w3,w14,#7
+	and	w17,w24,w23
+	ror	w2,w11,#17
+	bic	w28,w25,w23
+	ror	w4,w27,#2
+	add	w26,w26,w12			// h+=X[i]
+	eor	w16,w16,w23,ror#11
+	eor	w3,w3,w14,ror#18
+	orr	w17,w17,w28			// Ch(e,f,g)
+	eor	w28,w27,w20			// a^b, b^c in next round
+	eor	w16,w16,w23,ror#25	// Sigma1(e)
+	eor	w4,w4,w27,ror#13
+	add	w26,w26,w17			// h+=Ch(e,f,g)
+	and	w19,w19,w28			// (b^c)&=(a^b)
+	eor	w2,w2,w11,ror#19
+	eor	w3,w3,w14,lsr#3	// sigma0(X[i+1])
+	add	w26,w26,w16			// h+=Sigma1(e)
+	eor	w19,w19,w20			// Maj(a,b,c)
+	eor	w17,w4,w27,ror#22	// Sigma0(a)
+	eor	w2,w2,w11,lsr#10	// sigma1(X[i+14])
+	add	w13,w13,w6
+	add	w22,w22,w26			// d+=h
+	add	w26,w26,w19			// h+=Maj(a,b,c)
+	ldr	w19,[x30],#4		// *K++, w28 in next round
+	add	w13,w13,w3
+	add	w26,w26,w17			// h+=Sigma0(a)
+	add	w13,w13,w2
+	ldr	w2,[sp,#12]
+	str	w5,[sp,#8]
+	ror	w16,w22,#6
+	add	w25,w25,w19			// h+=K[i]
+	ror	w4,w15,#7
+	and	w17,w23,w22
+	ror	w3,w12,#17
+	bic	w19,w24,w22
+	ror	w5,w26,#2
+	add	w25,w25,w13			// h+=X[i]
+	eor	w16,w16,w22,ror#11
+	eor	w4,w4,w15,ror#18
+	orr	w17,w17,w19			// Ch(e,f,g)
+	eor	w19,w26,w27			// a^b, b^c in next round
+	eor	w16,w16,w22,ror#25	// Sigma1(e)
+	eor	w5,w5,w26,ror#13
+	add	w25,w25,w17			// h+=Ch(e,f,g)
+	and	w28,w28,w19			// (b^c)&=(a^b)
+	eor	w3,w3,w12,ror#19
+	eor	w4,w4,w15,lsr#3	// sigma0(X[i+1])
+	add	w25,w25,w16			// h+=Sigma1(e)
+	eor	w28,w28,w27			// Maj(a,b,c)
+	eor	w17,w5,w26,ror#22	// Sigma0(a)
+	eor	w3,w3,w12,lsr#10	// sigma1(X[i+14])
+	add	w14,w14,w7
+	add	w21,w21,w25			// d+=h
+	add	w25,w25,w28			// h+=Maj(a,b,c)
+	ldr	w28,[x30],#4		// *K++, w19 in next round
+	add	w14,w14,w4
+	add	w25,w25,w17			// h+=Sigma0(a)
+	add	w14,w14,w3
+	ldr	w3,[sp,#0]
+	str	w6,[sp,#12]
+	ror	w16,w21,#6
+	add	w24,w24,w28			// h+=K[i]
+	ror	w5,w0,#7
+	and	w17,w22,w21
+	ror	w4,w13,#17
+	bic	w28,w23,w21
+	ror	w6,w25,#2
+	add	w24,w24,w14			// h+=X[i]
+	eor	w16,w16,w21,ror#11
+	eor	w5,w5,w0,ror#18
+	orr	w17,w17,w28			// Ch(e,f,g)
+	eor	w28,w25,w26			// a^b, b^c in next round
+	eor	w16,w16,w21,ror#25	// Sigma1(e)
+	eor	w6,w6,w25,ror#13
+	add	w24,w24,w17			// h+=Ch(e,f,g)
+	and	w19,w19,w28			// (b^c)&=(a^b)
+	eor	w4,w4,w13,ror#19
+	eor	w5,w5,w0,lsr#3	// sigma0(X[i+1])
+	add	w24,w24,w16			// h+=Sigma1(e)
+	eor	w19,w19,w26			// Maj(a,b,c)
+	eor	w17,w6,w25,ror#22	// Sigma0(a)
+	eor	w4,w4,w13,lsr#10	// sigma1(X[i+14])
+	add	w15,w15,w8
+	add	w20,w20,w24			// d+=h
+	add	w24,w24,w19			// h+=Maj(a,b,c)
+	ldr	w19,[x30],#4		// *K++, w28 in next round
+	add	w15,w15,w5
+	add	w24,w24,w17			// h+=Sigma0(a)
+	add	w15,w15,w4
+	ldr	w4,[sp,#4]
+	str	w7,[sp,#0]
+	ror	w16,w20,#6
+	add	w23,w23,w19			// h+=K[i]
+	ror	w6,w1,#7
+	and	w17,w21,w20
+	ror	w5,w14,#17
+	bic	w19,w22,w20
+	ror	w7,w24,#2
+	add	w23,w23,w15			// h+=X[i]
+	eor	w16,w16,w20,ror#11
+	eor	w6,w6,w1,ror#18
+	orr	w17,w17,w19			// Ch(e,f,g)
+	eor	w19,w24,w25			// a^b, b^c in next round
+	eor	w16,w16,w20,ror#25	// Sigma1(e)
+	eor	w7,w7,w24,ror#13
+	add	w23,w23,w17			// h+=Ch(e,f,g)
+	and	w28,w28,w19			// (b^c)&=(a^b)
+	eor	w5,w5,w14,ror#19
+	eor	w6,w6,w1,lsr#3	// sigma0(X[i+1])
+	add	w23,w23,w16			// h+=Sigma1(e)
+	eor	w28,w28,w25			// Maj(a,b,c)
+	eor	w17,w7,w24,ror#22	// Sigma0(a)
+	eor	w5,w5,w14,lsr#10	// sigma1(X[i+14])
+	add	w0,w0,w9
+	add	w27,w27,w23			// d+=h
+	add	w23,w23,w28			// h+=Maj(a,b,c)
+	ldr	w28,[x30],#4		// *K++, w19 in next round
+	add	w0,w0,w6
+	add	w23,w23,w17			// h+=Sigma0(a)
+	add	w0,w0,w5
+	ldr	w5,[sp,#8]
+	str	w8,[sp,#4]
+	ror	w16,w27,#6
+	add	w22,w22,w28			// h+=K[i]
+	ror	w7,w2,#7
+	and	w17,w20,w27
+	ror	w6,w15,#17
+	bic	w28,w21,w27
+	ror	w8,w23,#2
+	add	w22,w22,w0			// h+=X[i]
+	eor	w16,w16,w27,ror#11
+	eor	w7,w7,w2,ror#18
+	orr	w17,w17,w28			// Ch(e,f,g)
+	eor	w28,w23,w24			// a^b, b^c in next round
+	eor	w16,w16,w27,ror#25	// Sigma1(e)
+	eor	w8,w8,w23,ror#13
+	add	w22,w22,w17			// h+=Ch(e,f,g)
+	and	w19,w19,w28			// (b^c)&=(a^b)
+	eor	w6,w6,w15,ror#19
+	eor	w7,w7,w2,lsr#3	// sigma0(X[i+1])
+	add	w22,w22,w16			// h+=Sigma1(e)
+	eor	w19,w19,w24			// Maj(a,b,c)
+	eor	w17,w8,w23,ror#22	// Sigma0(a)
+	eor	w6,w6,w15,lsr#10	// sigma1(X[i+14])
+	add	w1,w1,w10
+	add	w26,w26,w22			// d+=h
+	add	w22,w22,w19			// h+=Maj(a,b,c)
+	ldr	w19,[x30],#4		// *K++, w28 in next round
+	add	w1,w1,w7
+	add	w22,w22,w17			// h+=Sigma0(a)
+	add	w1,w1,w6
+	ldr	w6,[sp,#12]
+	str	w9,[sp,#8]
+	ror	w16,w26,#6
+	add	w21,w21,w19			// h+=K[i]
+	ror	w8,w3,#7
+	and	w17,w27,w26
+	ror	w7,w0,#17
+	bic	w19,w20,w26
+	ror	w9,w22,#2
+	add	w21,w21,w1			// h+=X[i]
+	eor	w16,w16,w26,ror#11
+	eor	w8,w8,w3,ror#18
+	orr	w17,w17,w19			// Ch(e,f,g)
+	eor	w19,w22,w23			// a^b, b^c in next round
+	eor	w16,w16,w26,ror#25	// Sigma1(e)
+	eor	w9,w9,w22,ror#13
+	add	w21,w21,w17			// h+=Ch(e,f,g)
+	and	w28,w28,w19			// (b^c)&=(a^b)
+	eor	w7,w7,w0,ror#19
+	eor	w8,w8,w3,lsr#3	// sigma0(X[i+1])
+	add	w21,w21,w16			// h+=Sigma1(e)
+	eor	w28,w28,w23			// Maj(a,b,c)
+	eor	w17,w9,w22,ror#22	// Sigma0(a)
+	eor	w7,w7,w0,lsr#10	// sigma1(X[i+14])
+	add	w2,w2,w11
+	add	w25,w25,w21			// d+=h
+	add	w21,w21,w28			// h+=Maj(a,b,c)
+	ldr	w28,[x30],#4		// *K++, w19 in next round
+	add	w2,w2,w8
+	add	w21,w21,w17			// h+=Sigma0(a)
+	add	w2,w2,w7
+	ldr	w7,[sp,#0]
+	str	w10,[sp,#12]
+	ror	w16,w25,#6
+	add	w20,w20,w28			// h+=K[i]
+	ror	w9,w4,#7
+	and	w17,w26,w25
+	ror	w8,w1,#17
+	bic	w28,w27,w25
+	ror	w10,w21,#2
+	add	w20,w20,w2			// h+=X[i]
+	eor	w16,w16,w25,ror#11
+	eor	w9,w9,w4,ror#18
+	orr	w17,w17,w28			// Ch(e,f,g)
+	eor	w28,w21,w22			// a^b, b^c in next round
+	eor	w16,w16,w25,ror#25	// Sigma1(e)
+	eor	w10,w10,w21,ror#13
+	add	w20,w20,w17			// h+=Ch(e,f,g)
+	and	w19,w19,w28			// (b^c)&=(a^b)
+	eor	w8,w8,w1,ror#19
+	eor	w9,w9,w4,lsr#3	// sigma0(X[i+1])
+	add	w20,w20,w16			// h+=Sigma1(e)
+	eor	w19,w19,w22			// Maj(a,b,c)
+	eor	w17,w10,w21,ror#22	// Sigma0(a)
+	eor	w8,w8,w1,lsr#10	// sigma1(X[i+14])
+	add	w3,w3,w12
+	add	w24,w24,w20			// d+=h
+	add	w20,w20,w19			// h+=Maj(a,b,c)
+	ldr	w19,[x30],#4		// *K++, w28 in next round
+	add	w3,w3,w9
+	add	w20,w20,w17			// h+=Sigma0(a)
+	add	w3,w3,w8
+	cbnz	w19,.Loop_16_xx
+
+	ldp	x0,x2,[x29,#96]
+	ldr	x1,[x29,#112]
+	sub	x30,x30,#260		// rewind
+
+	ldp	w3,w4,[x0]
+	ldp	w5,w6,[x0,#2*4]
+	add	x1,x1,#14*4			// advance input pointer
+	ldp	w7,w8,[x0,#4*4]
+	add	w20,w20,w3
+	ldp	w9,w10,[x0,#6*4]
+	add	w21,w21,w4
+	add	w22,w22,w5
+	add	w23,w23,w6
+	stp	w20,w21,[x0]
+	add	w24,w24,w7
+	add	w25,w25,w8
+	stp	w22,w23,[x0,#2*4]
+	add	w26,w26,w9
+	add	w27,w27,w10
+	cmp	x1,x2
+	stp	w24,w25,[x0,#4*4]
+	stp	w26,w27,[x0,#6*4]
+	b.ne	.Loop
+
+	ldp	x19,x20,[x29,#16]
+	add	sp,sp,#4*4
+	ldp	x21,x22,[x29,#32]
+	ldp	x23,x24,[x29,#48]
+	ldp	x25,x26,[x29,#64]
+	ldp	x27,x28,[x29,#80]
+	ldp	x29,x30,[sp],#128
+	AARCH64_VALIDATE_LINK_REGISTER
+	ret
+.size	sha256_block_data_order,.-sha256_block_data_order
+
+.section	.rodata
+.align	6
+.type	.LK256,%object
+.LK256:
+.long	0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5
+.long	0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5
+.long	0xd807aa98,0x12835b01,0x243185be,0x550c7dc3
+.long	0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174
+.long	0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc
+.long	0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da
+.long	0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7
+.long	0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967
+.long	0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13
+.long	0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85
+.long	0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3
+.long	0xd192e819,0xd6990624,0xf40e3585,0x106aa070
+.long	0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5
+.long	0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3
+.long	0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208
+.long	0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2
+.long	0	//terminator
+.size	.LK256,.-.LK256
+.byte	83,72,65,50,53,54,32,98,108,111,99,107,32,116,114,97,110,115,102,111,114,109,32,102,111,114,32,65,82,77,118,56,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0
+.align	2
+.align	2
+.text
+#ifndef	__KERNEL__
+.type	sha256_block_armv8,%function
+.align	6
+sha256_block_armv8:
+.Lv8_entry:
+	// Armv8.3-A PAuth: even though x30 is pushed to the stack, it is not popped later.
+	stp	x29,x30,[sp,#-16]!
+	add	x29,sp,#0
+
+	ld1	{v0.4s,v1.4s},[x0]
+	adrp	x3,.LK256
+	add	x3,x3,:lo12:.LK256
+
+.Loop_hw:
+	ld1	{v4.16b,v5.16b,v6.16b,v7.16b},[x1],#64
+	sub	x2,x2,#1
+	ld1	{v16.4s},[x3],#16
+	rev32	v4.16b,v4.16b
+	rev32	v5.16b,v5.16b
+	rev32	v6.16b,v6.16b
+	rev32	v7.16b,v7.16b
+	orr	v18.16b,v0.16b,v0.16b		// offload
+	orr	v19.16b,v1.16b,v1.16b
+	ld1	{v17.4s},[x3],#16
+	add	v16.4s,v16.4s,v4.4s
+.inst	0x5e2828a4	//sha256su0 v4.16b,v5.16b
+	orr	v2.16b,v0.16b,v0.16b
+.inst	0x5e104020	//sha256h v0.16b,v1.16b,v16.4s
+.inst	0x5e105041	//sha256h2 v1.16b,v2.16b,v16.4s
+.inst	0x5e0760c4	//sha256su1 v4.16b,v6.16b,v7.16b
+	ld1	{v16.4s},[x3],#16
+	add	v17.4s,v17.4s,v5.4s
+.inst	0x5e2828c5	//sha256su0 v5.16b,v6.16b
+	orr	v2.16b,v0.16b,v0.16b
+.inst	0x5e114020	//sha256h v0.16b,v1.16b,v17.4s
+.inst	0x5e115041	//sha256h2 v1.16b,v2.16b,v17.4s
+.inst	0x5e0460e5	//sha256su1 v5.16b,v7.16b,v4.16b
+	ld1	{v17.4s},[x3],#16
+	add	v16.4s,v16.4s,v6.4s
+.inst	0x5e2828e6	//sha256su0 v6.16b,v7.16b
+	orr	v2.16b,v0.16b,v0.16b
+.inst	0x5e104020	//sha256h v0.16b,v1.16b,v16.4s
+.inst	0x5e105041	//sha256h2 v1.16b,v2.16b,v16.4s
+.inst	0x5e056086	//sha256su1 v6.16b,v4.16b,v5.16b
+	ld1	{v16.4s},[x3],#16
+	add	v17.4s,v17.4s,v7.4s
+.inst	0x5e282887	//sha256su0 v7.16b,v4.16b
+	orr	v2.16b,v0.16b,v0.16b
+.inst	0x5e114020	//sha256h v0.16b,v1.16b,v17.4s
+.inst	0x5e115041	//sha256h2 v1.16b,v2.16b,v17.4s
+.inst	0x5e0660a7	//sha256su1 v7.16b,v5.16b,v6.16b
+	ld1	{v17.4s},[x3],#16
+	add	v16.4s,v16.4s,v4.4s
+.inst	0x5e2828a4	//sha256su0 v4.16b,v5.16b
+	orr	v2.16b,v0.16b,v0.16b
+.inst	0x5e104020	//sha256h v0.16b,v1.16b,v16.4s
+.inst	0x5e105041	//sha256h2 v1.16b,v2.16b,v16.4s
+.inst	0x5e0760c4	//sha256su1 v4.16b,v6.16b,v7.16b
+	ld1	{v16.4s},[x3],#16
+	add	v17.4s,v17.4s,v5.4s
+.inst	0x5e2828c5	//sha256su0 v5.16b,v6.16b
+	orr	v2.16b,v0.16b,v0.16b
+.inst	0x5e114020	//sha256h v0.16b,v1.16b,v17.4s
+.inst	0x5e115041	//sha256h2 v1.16b,v2.16b,v17.4s
+.inst	0x5e0460e5	//sha256su1 v5.16b,v7.16b,v4.16b
+	ld1	{v17.4s},[x3],#16
+	add	v16.4s,v16.4s,v6.4s
+.inst	0x5e2828e6	//sha256su0 v6.16b,v7.16b
+	orr	v2.16b,v0.16b,v0.16b
+.inst	0x5e104020	//sha256h v0.16b,v1.16b,v16.4s
+.inst	0x5e105041	//sha256h2 v1.16b,v2.16b,v16.4s
+.inst	0x5e056086	//sha256su1 v6.16b,v4.16b,v5.16b
+	ld1	{v16.4s},[x3],#16
+	add	v17.4s,v17.4s,v7.4s
+.inst	0x5e282887	//sha256su0 v7.16b,v4.16b
+	orr	v2.16b,v0.16b,v0.16b
+.inst	0x5e114020	//sha256h v0.16b,v1.16b,v17.4s
+.inst	0x5e115041	//sha256h2 v1.16b,v2.16b,v17.4s
+.inst	0x5e0660a7	//sha256su1 v7.16b,v5.16b,v6.16b
+	ld1	{v17.4s},[x3],#16
+	add	v16.4s,v16.4s,v4.4s
+.inst	0x5e2828a4	//sha256su0 v4.16b,v5.16b
+	orr	v2.16b,v0.16b,v0.16b
+.inst	0x5e104020	//sha256h v0.16b,v1.16b,v16.4s
+.inst	0x5e105041	//sha256h2 v1.16b,v2.16b,v16.4s
+.inst	0x5e0760c4	//sha256su1 v4.16b,v6.16b,v7.16b
+	ld1	{v16.4s},[x3],#16
+	add	v17.4s,v17.4s,v5.4s
+.inst	0x5e2828c5	//sha256su0 v5.16b,v6.16b
+	orr	v2.16b,v0.16b,v0.16b
+.inst	0x5e114020	//sha256h v0.16b,v1.16b,v17.4s
+.inst	0x5e115041	//sha256h2 v1.16b,v2.16b,v17.4s
+.inst	0x5e0460e5	//sha256su1 v5.16b,v7.16b,v4.16b
+	ld1	{v17.4s},[x3],#16
+	add	v16.4s,v16.4s,v6.4s
+.inst	0x5e2828e6	//sha256su0 v6.16b,v7.16b
+	orr	v2.16b,v0.16b,v0.16b
+.inst	0x5e104020	//sha256h v0.16b,v1.16b,v16.4s
+.inst	0x5e105041	//sha256h2 v1.16b,v2.16b,v16.4s
+.inst	0x5e056086	//sha256su1 v6.16b,v4.16b,v5.16b
+	ld1	{v16.4s},[x3],#16
+	add	v17.4s,v17.4s,v7.4s
+.inst	0x5e282887	//sha256su0 v7.16b,v4.16b
+	orr	v2.16b,v0.16b,v0.16b
+.inst	0x5e114020	//sha256h v0.16b,v1.16b,v17.4s
+.inst	0x5e115041	//sha256h2 v1.16b,v2.16b,v17.4s
+.inst	0x5e0660a7	//sha256su1 v7.16b,v5.16b,v6.16b
+	ld1	{v17.4s},[x3],#16
+	add	v16.4s,v16.4s,v4.4s
+	orr	v2.16b,v0.16b,v0.16b
+.inst	0x5e104020	//sha256h v0.16b,v1.16b,v16.4s
+.inst	0x5e105041	//sha256h2 v1.16b,v2.16b,v16.4s
+
+	ld1	{v16.4s},[x3],#16
+	add	v17.4s,v17.4s,v5.4s
+	orr	v2.16b,v0.16b,v0.16b
+.inst	0x5e114020	//sha256h v0.16b,v1.16b,v17.4s
+.inst	0x5e115041	//sha256h2 v1.16b,v2.16b,v17.4s
+
+	ld1	{v17.4s},[x3]
+	add	v16.4s,v16.4s,v6.4s
+	sub	x3,x3,#64*4-16	// rewind
+	orr	v2.16b,v0.16b,v0.16b
+.inst	0x5e104020	//sha256h v0.16b,v1.16b,v16.4s
+.inst	0x5e105041	//sha256h2 v1.16b,v2.16b,v16.4s
+
+	add	v17.4s,v17.4s,v7.4s
+	orr	v2.16b,v0.16b,v0.16b
+.inst	0x5e114020	//sha256h v0.16b,v1.16b,v17.4s
+.inst	0x5e115041	//sha256h2 v1.16b,v2.16b,v17.4s
+
+	add	v0.4s,v0.4s,v18.4s
+	add	v1.4s,v1.4s,v19.4s
+
+	cbnz	x2,.Loop_hw
+
+	st1	{v0.4s,v1.4s},[x0]
+
+	ldr	x29,[sp],#16
+	ret
+.size	sha256_block_armv8,.-sha256_block_armv8
+#endif
+#endif
+#endif  // !OPENSSL_NO_ASM
+.section	.note.GNU-stack,"",%progbits
--- /dev/null
+++ b/third_party/boringssl/linux-aarch64/crypto/fipsmodule/sha512-armv8.S
@@ -1,0 +1,1084 @@
+// This file is generated from a similarly-named Perl script in the BoringSSL
+// source tree. Do not edit by hand.
+
+#if !defined(__has_feature)
+#define __has_feature(x) 0
+#endif
+#if __has_feature(memory_sanitizer) && !defined(OPENSSL_NO_ASM)
+#define OPENSSL_NO_ASM
+#endif
+
+#if !defined(OPENSSL_NO_ASM)
+#if defined(__aarch64__)
+#if defined(BORINGSSL_PREFIX)
+#include <boringssl_prefix_symbols_asm.h>
+#endif
+// Copyright 2014-2016 The OpenSSL Project Authors. All Rights Reserved.
+//
+// Licensed under the OpenSSL license (the "License").  You may not use
+// this file except in compliance with the License.  You can obtain a copy
+// in the file LICENSE in the source distribution or at
+// https://www.openssl.org/source/license.html
+
+// ====================================================================
+// Written by Andy Polyakov <[email protected]> for the OpenSSL
+// project. The module is, however, dual licensed under OpenSSL and
+// CRYPTOGAMS licenses depending on where you obtain it. For further
+// details see http://www.openssl.org/~appro/cryptogams/.
+//
+// Permission to use under GPLv2 terms is granted.
+// ====================================================================
+//
+// SHA256/512 for ARMv8.
+//
+// Performance in cycles per processed byte and improvement coefficient
+// over code generated with "default" compiler:
+//
+//		SHA256-hw	SHA256(*)	SHA512
+// Apple A7	1.97		10.5 (+33%)	6.73 (-1%(**))
+// Cortex-A53	2.38		15.5 (+115%)	10.0 (+150%(***))
+// Cortex-A57	2.31		11.6 (+86%)	7.51 (+260%(***))
+// Denver	2.01		10.5 (+26%)	6.70 (+8%)
+// X-Gene			20.0 (+100%)	12.8 (+300%(***))
+// Mongoose	2.36		13.0 (+50%)	8.36 (+33%)
+//
+// (*)	Software SHA256 results are of lesser relevance, presented
+//	mostly for informational purposes.
+// (**)	The result is a trade-off: it's possible to improve it by
+//	10% (or by 1 cycle per round), but at the cost of 20% loss
+//	on Cortex-A53 (or by 4 cycles per round).
+// (***)	Super-impressive coefficients over gcc-generated code are
+//	an indication of some compiler "pathology": most notably, code
+//	generated with -mgeneral-regs-only is significantly faster
+//	and the gap is then only 40-90%.
+
+#ifndef	__KERNEL__
+# include <openssl/arm_arch.h>
+#endif
+
+.text
+
+
+.hidden	OPENSSL_armcap_P
+.globl	sha512_block_data_order
+.hidden	sha512_block_data_order
+.type	sha512_block_data_order,%function
+.align	6
+sha512_block_data_order:
+	AARCH64_SIGN_LINK_REGISTER
+	stp	x29,x30,[sp,#-128]!
+	add	x29,sp,#0
+
+	stp	x19,x20,[sp,#16]
+	stp	x21,x22,[sp,#32]
+	stp	x23,x24,[sp,#48]
+	stp	x25,x26,[sp,#64]
+	stp	x27,x28,[sp,#80]
+	sub	sp,sp,#4*8
+
+	ldp	x20,x21,[x0]				// load context
+	ldp	x22,x23,[x0,#2*8]
+	ldp	x24,x25,[x0,#4*8]
+	add	x2,x1,x2,lsl#7	// end of input
+	ldp	x26,x27,[x0,#6*8]
+	adrp	x30,.LK512
+	add	x30,x30,:lo12:.LK512
+	stp	x0,x2,[x29,#96]
+
+.Loop:
+	ldp	x3,x4,[x1],#2*8
+	ldr	x19,[x30],#8			// *K++
+	eor	x28,x21,x22				// magic seed
+	str	x1,[x29,#112]
+#ifndef	__ARMEB__
+	rev	x3,x3			// 0
+#endif
+	ror	x16,x24,#14
+	add	x27,x27,x19			// h+=K[i]
+	eor	x6,x24,x24,ror#23
+	and	x17,x25,x24
+	bic	x19,x26,x24
+	add	x27,x27,x3			// h+=X[i]
+	orr	x17,x17,x19			// Ch(e,f,g)
+	eor	x19,x20,x21			// a^b, b^c in next round
+	eor	x16,x16,x6,ror#18	// Sigma1(e)
+	ror	x6,x20,#28
+	add	x27,x27,x17			// h+=Ch(e,f,g)
+	eor	x17,x20,x20,ror#5
+	add	x27,x27,x16			// h+=Sigma1(e)
+	and	x28,x28,x19			// (b^c)&=(a^b)
+	add	x23,x23,x27			// d+=h
+	eor	x28,x28,x21			// Maj(a,b,c)
+	eor	x17,x6,x17,ror#34	// Sigma0(a)
+	add	x27,x27,x28			// h+=Maj(a,b,c)
+	ldr	x28,[x30],#8		// *K++, x19 in next round
+	//add	x27,x27,x17			// h+=Sigma0(a)
+#ifndef	__ARMEB__
+	rev	x4,x4			// 1
+#endif
+	ldp	x5,x6,[x1],#2*8
+	add	x27,x27,x17			// h+=Sigma0(a)
+	ror	x16,x23,#14
+	add	x26,x26,x28			// h+=K[i]
+	eor	x7,x23,x23,ror#23
+	and	x17,x24,x23
+	bic	x28,x25,x23
+	add	x26,x26,x4			// h+=X[i]
+	orr	x17,x17,x28			// Ch(e,f,g)
+	eor	x28,x27,x20			// a^b, b^c in next round
+	eor	x16,x16,x7,ror#18	// Sigma1(e)
+	ror	x7,x27,#28
+	add	x26,x26,x17			// h+=Ch(e,f,g)
+	eor	x17,x27,x27,ror#5
+	add	x26,x26,x16			// h+=Sigma1(e)
+	and	x19,x19,x28			// (b^c)&=(a^b)
+	add	x22,x22,x26			// d+=h
+	eor	x19,x19,x20			// Maj(a,b,c)
+	eor	x17,x7,x17,ror#34	// Sigma0(a)
+	add	x26,x26,x19			// h+=Maj(a,b,c)
+	ldr	x19,[x30],#8		// *K++, x28 in next round
+	//add	x26,x26,x17			// h+=Sigma0(a)
+#ifndef	__ARMEB__
+	rev	x5,x5			// 2
+#endif
+	add	x26,x26,x17			// h+=Sigma0(a)
+	ror	x16,x22,#14
+	add	x25,x25,x19			// h+=K[i]
+	eor	x8,x22,x22,ror#23
+	and	x17,x23,x22
+	bic	x19,x24,x22
+	add	x25,x25,x5			// h+=X[i]
+	orr	x17,x17,x19			// Ch(e,f,g)
+	eor	x19,x26,x27			// a^b, b^c in next round
+	eor	x16,x16,x8,ror#18	// Sigma1(e)
+	ror	x8,x26,#28
+	add	x25,x25,x17			// h+=Ch(e,f,g)
+	eor	x17,x26,x26,ror#5
+	add	x25,x25,x16			// h+=Sigma1(e)
+	and	x28,x28,x19			// (b^c)&=(a^b)
+	add	x21,x21,x25			// d+=h
+	eor	x28,x28,x27			// Maj(a,b,c)
+	eor	x17,x8,x17,ror#34	// Sigma0(a)
+	add	x25,x25,x28			// h+=Maj(a,b,c)
+	ldr	x28,[x30],#8		// *K++, x19 in next round
+	//add	x25,x25,x17			// h+=Sigma0(a)
+#ifndef	__ARMEB__
+	rev	x6,x6			// 3
+#endif
+	ldp	x7,x8,[x1],#2*8
+	add	x25,x25,x17			// h+=Sigma0(a)
+	ror	x16,x21,#14
+	add	x24,x24,x28			// h+=K[i]
+	eor	x9,x21,x21,ror#23
+	and	x17,x22,x21
+	bic	x28,x23,x21
+	add	x24,x24,x6			// h+=X[i]
+	orr	x17,x17,x28			// Ch(e,f,g)
+	eor	x28,x25,x26			// a^b, b^c in next round
+	eor	x16,x16,x9,ror#18	// Sigma1(e)
+	ror	x9,x25,#28
+	add	x24,x24,x17			// h+=Ch(e,f,g)
+	eor	x17,x25,x25,ror#5
+	add	x24,x24,x16			// h+=Sigma1(e)
+	and	x19,x19,x28			// (b^c)&=(a^b)
+	add	x20,x20,x24			// d+=h
+	eor	x19,x19,x26			// Maj(a,b,c)
+	eor	x17,x9,x17,ror#34	// Sigma0(a)
+	add	x24,x24,x19			// h+=Maj(a,b,c)
+	ldr	x19,[x30],#8		// *K++, x28 in next round
+	//add	x24,x24,x17			// h+=Sigma0(a)
+#ifndef	__ARMEB__
+	rev	x7,x7			// 4
+#endif
+	add	x24,x24,x17			// h+=Sigma0(a)
+	ror	x16,x20,#14
+	add	x23,x23,x19			// h+=K[i]
+	eor	x10,x20,x20,ror#23
+	and	x17,x21,x20
+	bic	x19,x22,x20
+	add	x23,x23,x7			// h+=X[i]
+	orr	x17,x17,x19			// Ch(e,f,g)
+	eor	x19,x24,x25			// a^b, b^c in next round
+	eor	x16,x16,x10,ror#18	// Sigma1(e)
+	ror	x10,x24,#28
+	add	x23,x23,x17			// h+=Ch(e,f,g)
+	eor	x17,x24,x24,ror#5
+	add	x23,x23,x16			// h+=Sigma1(e)
+	and	x28,x28,x19			// (b^c)&=(a^b)
+	add	x27,x27,x23			// d+=h
+	eor	x28,x28,x25			// Maj(a,b,c)
+	eor	x17,x10,x17,ror#34	// Sigma0(a)
+	add	x23,x23,x28			// h+=Maj(a,b,c)
+	ldr	x28,[x30],#8		// *K++, x19 in next round
+	//add	x23,x23,x17			// h+=Sigma0(a)
+#ifndef	__ARMEB__
+	rev	x8,x8			// 5
+#endif
+	ldp	x9,x10,[x1],#2*8
+	add	x23,x23,x17			// h+=Sigma0(a)
+	ror	x16,x27,#14
+	add	x22,x22,x28			// h+=K[i]
+	eor	x11,x27,x27,ror#23
+	and	x17,x20,x27
+	bic	x28,x21,x27
+	add	x22,x22,x8			// h+=X[i]
+	orr	x17,x17,x28			// Ch(e,f,g)
+	eor	x28,x23,x24			// a^b, b^c in next round
+	eor	x16,x16,x11,ror#18	// Sigma1(e)
+	ror	x11,x23,#28
+	add	x22,x22,x17			// h+=Ch(e,f,g)
+	eor	x17,x23,x23,ror#5
+	add	x22,x22,x16			// h+=Sigma1(e)
+	and	x19,x19,x28			// (b^c)&=(a^b)
+	add	x26,x26,x22			// d+=h
+	eor	x19,x19,x24			// Maj(a,b,c)
+	eor	x17,x11,x17,ror#34	// Sigma0(a)
+	add	x22,x22,x19			// h+=Maj(a,b,c)
+	ldr	x19,[x30],#8		// *K++, x28 in next round
+	//add	x22,x22,x17			// h+=Sigma0(a)
+#ifndef	__ARMEB__
+	rev	x9,x9			// 6
+#endif
+	add	x22,x22,x17			// h+=Sigma0(a)
+	ror	x16,x26,#14
+	add	x21,x21,x19			// h+=K[i]
+	eor	x12,x26,x26,ror#23
+	and	x17,x27,x26
+	bic	x19,x20,x26
+	add	x21,x21,x9			// h+=X[i]
+	orr	x17,x17,x19			// Ch(e,f,g)
+	eor	x19,x22,x23			// a^b, b^c in next round
+	eor	x16,x16,x12,ror#18	// Sigma1(e)
+	ror	x12,x22,#28
+	add	x21,x21,x17			// h+=Ch(e,f,g)
+	eor	x17,x22,x22,ror#5
+	add	x21,x21,x16			// h+=Sigma1(e)
+	and	x28,x28,x19			// (b^c)&=(a^b)
+	add	x25,x25,x21			// d+=h
+	eor	x28,x28,x23			// Maj(a,b,c)
+	eor	x17,x12,x17,ror#34	// Sigma0(a)
+	add	x21,x21,x28			// h+=Maj(a,b,c)
+	ldr	x28,[x30],#8		// *K++, x19 in next round
+	//add	x21,x21,x17			// h+=Sigma0(a)
+#ifndef	__ARMEB__
+	rev	x10,x10			// 7
+#endif
+	ldp	x11,x12,[x1],#2*8
+	add	x21,x21,x17			// h+=Sigma0(a)
+	ror	x16,x25,#14
+	add	x20,x20,x28			// h+=K[i]
+	eor	x13,x25,x25,ror#23
+	and	x17,x26,x25
+	bic	x28,x27,x25
+	add	x20,x20,x10			// h+=X[i]
+	orr	x17,x17,x28			// Ch(e,f,g)
+	eor	x28,x21,x22			// a^b, b^c in next round
+	eor	x16,x16,x13,ror#18	// Sigma1(e)
+	ror	x13,x21,#28
+	add	x20,x20,x17			// h+=Ch(e,f,g)
+	eor	x17,x21,x21,ror#5
+	add	x20,x20,x16			// h+=Sigma1(e)
+	and	x19,x19,x28			// (b^c)&=(a^b)
+	add	x24,x24,x20			// d+=h
+	eor	x19,x19,x22			// Maj(a,b,c)
+	eor	x17,x13,x17,ror#34	// Sigma0(a)
+	add	x20,x20,x19			// h+=Maj(a,b,c)
+	ldr	x19,[x30],#8		// *K++, x28 in next round
+	//add	x20,x20,x17			// h+=Sigma0(a)
+#ifndef	__ARMEB__
+	rev	x11,x11			// 8
+#endif
+	add	x20,x20,x17			// h+=Sigma0(a)
+	ror	x16,x24,#14
+	add	x27,x27,x19			// h+=K[i]
+	eor	x14,x24,x24,ror#23
+	and	x17,x25,x24
+	bic	x19,x26,x24
+	add	x27,x27,x11			// h+=X[i]
+	orr	x17,x17,x19			// Ch(e,f,g)
+	eor	x19,x20,x21			// a^b, b^c in next round
+	eor	x16,x16,x14,ror#18	// Sigma1(e)
+	ror	x14,x20,#28
+	add	x27,x27,x17			// h+=Ch(e,f,g)
+	eor	x17,x20,x20,ror#5
+	add	x27,x27,x16			// h+=Sigma1(e)
+	and	x28,x28,x19			// (b^c)&=(a^b)
+	add	x23,x23,x27			// d+=h
+	eor	x28,x28,x21			// Maj(a,b,c)
+	eor	x17,x14,x17,ror#34	// Sigma0(a)
+	add	x27,x27,x28			// h+=Maj(a,b,c)
+	ldr	x28,[x30],#8		// *K++, x19 in next round
+	//add	x27,x27,x17			// h+=Sigma0(a)
+#ifndef	__ARMEB__
+	rev	x12,x12			// 9
+#endif
+	ldp	x13,x14,[x1],#2*8
+	add	x27,x27,x17			// h+=Sigma0(a)
+	ror	x16,x23,#14
+	add	x26,x26,x28			// h+=K[i]
+	eor	x15,x23,x23,ror#23
+	and	x17,x24,x23
+	bic	x28,x25,x23
+	add	x26,x26,x12			// h+=X[i]
+	orr	x17,x17,x28			// Ch(e,f,g)
+	eor	x28,x27,x20			// a^b, b^c in next round
+	eor	x16,x16,x15,ror#18	// Sigma1(e)
+	ror	x15,x27,#28
+	add	x26,x26,x17			// h+=Ch(e,f,g)
+	eor	x17,x27,x27,ror#5
+	add	x26,x26,x16			// h+=Sigma1(e)
+	and	x19,x19,x28			// (b^c)&=(a^b)
+	add	x22,x22,x26			// d+=h
+	eor	x19,x19,x20			// Maj(a,b,c)
+	eor	x17,x15,x17,ror#34	// Sigma0(a)
+	add	x26,x26,x19			// h+=Maj(a,b,c)
+	ldr	x19,[x30],#8		// *K++, x28 in next round
+	//add	x26,x26,x17			// h+=Sigma0(a)
+#ifndef	__ARMEB__
+	rev	x13,x13			// 10
+#endif
+	add	x26,x26,x17			// h+=Sigma0(a)
+	ror	x16,x22,#14
+	add	x25,x25,x19			// h+=K[i]
+	eor	x0,x22,x22,ror#23
+	and	x17,x23,x22
+	bic	x19,x24,x22
+	add	x25,x25,x13			// h+=X[i]
+	orr	x17,x17,x19			// Ch(e,f,g)
+	eor	x19,x26,x27			// a^b, b^c in next round
+	eor	x16,x16,x0,ror#18	// Sigma1(e)
+	ror	x0,x26,#28
+	add	x25,x25,x17			// h+=Ch(e,f,g)
+	eor	x17,x26,x26,ror#5
+	add	x25,x25,x16			// h+=Sigma1(e)
+	and	x28,x28,x19			// (b^c)&=(a^b)
+	add	x21,x21,x25			// d+=h
+	eor	x28,x28,x27			// Maj(a,b,c)
+	eor	x17,x0,x17,ror#34	// Sigma0(a)
+	add	x25,x25,x28			// h+=Maj(a,b,c)
+	ldr	x28,[x30],#8		// *K++, x19 in next round
+	//add	x25,x25,x17			// h+=Sigma0(a)
+#ifndef	__ARMEB__
+	rev	x14,x14			// 11
+#endif
+	ldp	x15,x0,[x1],#2*8
+	add	x25,x25,x17			// h+=Sigma0(a)
+	str	x6,[sp,#24]
+	ror	x16,x21,#14
+	add	x24,x24,x28			// h+=K[i]
+	eor	x6,x21,x21,ror#23
+	and	x17,x22,x21
+	bic	x28,x23,x21
+	add	x24,x24,x14			// h+=X[i]
+	orr	x17,x17,x28			// Ch(e,f,g)
+	eor	x28,x25,x26			// a^b, b^c in next round
+	eor	x16,x16,x6,ror#18	// Sigma1(e)
+	ror	x6,x25,#28
+	add	x24,x24,x17			// h+=Ch(e,f,g)
+	eor	x17,x25,x25,ror#5
+	add	x24,x24,x16			// h+=Sigma1(e)
+	and	x19,x19,x28			// (b^c)&=(a^b)
+	add	x20,x20,x24			// d+=h
+	eor	x19,x19,x26			// Maj(a,b,c)
+	eor	x17,x6,x17,ror#34	// Sigma0(a)
+	add	x24,x24,x19			// h+=Maj(a,b,c)
+	ldr	x19,[x30],#8		// *K++, x28 in next round
+	//add	x24,x24,x17			// h+=Sigma0(a)
+#ifndef	__ARMEB__
+	rev	x15,x15			// 12
+#endif
+	add	x24,x24,x17			// h+=Sigma0(a)
+	str	x7,[sp,#0]
+	ror	x16,x20,#14
+	add	x23,x23,x19			// h+=K[i]
+	eor	x7,x20,x20,ror#23
+	and	x17,x21,x20
+	bic	x19,x22,x20
+	add	x23,x23,x15			// h+=X[i]
+	orr	x17,x17,x19			// Ch(e,f,g)
+	eor	x19,x24,x25			// a^b, b^c in next round
+	eor	x16,x16,x7,ror#18	// Sigma1(e)
+	ror	x7,x24,#28
+	add	x23,x23,x17			// h+=Ch(e,f,g)
+	eor	x17,x24,x24,ror#5
+	add	x23,x23,x16			// h+=Sigma1(e)
+	and	x28,x28,x19			// (b^c)&=(a^b)
+	add	x27,x27,x23			// d+=h
+	eor	x28,x28,x25			// Maj(a,b,c)
+	eor	x17,x7,x17,ror#34	// Sigma0(a)
+	add	x23,x23,x28			// h+=Maj(a,b,c)
+	ldr	x28,[x30],#8		// *K++, x19 in next round
+	//add	x23,x23,x17			// h+=Sigma0(a)
+#ifndef	__ARMEB__
+	rev	x0,x0			// 13
+#endif
+	ldp	x1,x2,[x1]
+	add	x23,x23,x17			// h+=Sigma0(a)
+	str	x8,[sp,#8]
+	ror	x16,x27,#14
+	add	x22,x22,x28			// h+=K[i]
+	eor	x8,x27,x27,ror#23
+	and	x17,x20,x27
+	bic	x28,x21,x27
+	add	x22,x22,x0			// h+=X[i]
+	orr	x17,x17,x28			// Ch(e,f,g)
+	eor	x28,x23,x24			// a^b, b^c in next round
+	eor	x16,x16,x8,ror#18	// Sigma1(e)
+	ror	x8,x23,#28
+	add	x22,x22,x17			// h+=Ch(e,f,g)
+	eor	x17,x23,x23,ror#5
+	add	x22,x22,x16			// h+=Sigma1(e)
+	and	x19,x19,x28			// (b^c)&=(a^b)
+	add	x26,x26,x22			// d+=h
+	eor	x19,x19,x24			// Maj(a,b,c)
+	eor	x17,x8,x17,ror#34	// Sigma0(a)
+	add	x22,x22,x19			// h+=Maj(a,b,c)
+	ldr	x19,[x30],#8		// *K++, x28 in next round
+	//add	x22,x22,x17			// h+=Sigma0(a)
+#ifndef	__ARMEB__
+	rev	x1,x1			// 14
+#endif
+	ldr	x6,[sp,#24]
+	add	x22,x22,x17			// h+=Sigma0(a)
+	str	x9,[sp,#16]
+	ror	x16,x26,#14
+	add	x21,x21,x19			// h+=K[i]
+	eor	x9,x26,x26,ror#23
+	and	x17,x27,x26
+	bic	x19,x20,x26
+	add	x21,x21,x1			// h+=X[i]
+	orr	x17,x17,x19			// Ch(e,f,g)
+	eor	x19,x22,x23			// a^b, b^c in next round
+	eor	x16,x16,x9,ror#18	// Sigma1(e)
+	ror	x9,x22,#28
+	add	x21,x21,x17			// h+=Ch(e,f,g)
+	eor	x17,x22,x22,ror#5
+	add	x21,x21,x16			// h+=Sigma1(e)
+	and	x28,x28,x19			// (b^c)&=(a^b)
+	add	x25,x25,x21			// d+=h
+	eor	x28,x28,x23			// Maj(a,b,c)
+	eor	x17,x9,x17,ror#34	// Sigma0(a)
+	add	x21,x21,x28			// h+=Maj(a,b,c)
+	ldr	x28,[x30],#8		// *K++, x19 in next round
+	//add	x21,x21,x17			// h+=Sigma0(a)
+#ifndef	__ARMEB__
+	rev	x2,x2			// 15
+#endif
+	ldr	x7,[sp,#0]
+	add	x21,x21,x17			// h+=Sigma0(a)
+	str	x10,[sp,#24]
+	ror	x16,x25,#14
+	add	x20,x20,x28			// h+=K[i]
+	ror	x9,x4,#1
+	and	x17,x26,x25
+	ror	x8,x1,#19
+	bic	x28,x27,x25
+	ror	x10,x21,#28
+	add	x20,x20,x2			// h+=X[i]
+	eor	x16,x16,x25,ror#18
+	eor	x9,x9,x4,ror#8
+	orr	x17,x17,x28			// Ch(e,f,g)
+	eor	x28,x21,x22			// a^b, b^c in next round
+	eor	x16,x16,x25,ror#41	// Sigma1(e)
+	eor	x10,x10,x21,ror#34
+	add	x20,x20,x17			// h+=Ch(e,f,g)
+	and	x19,x19,x28			// (b^c)&=(a^b)
+	eor	x8,x8,x1,ror#61
+	eor	x9,x9,x4,lsr#7	// sigma0(X[i+1])
+	add	x20,x20,x16			// h+=Sigma1(e)
+	eor	x19,x19,x22			// Maj(a,b,c)
+	eor	x17,x10,x21,ror#39	// Sigma0(a)
+	eor	x8,x8,x1,lsr#6	// sigma1(X[i+14])
+	add	x3,x3,x12
+	add	x24,x24,x20			// d+=h
+	add	x20,x20,x19			// h+=Maj(a,b,c)
+	ldr	x19,[x30],#8		// *K++, x28 in next round
+	add	x3,x3,x9
+	add	x20,x20,x17			// h+=Sigma0(a)
+	add	x3,x3,x8
+.Loop_16_xx:
+	ldr	x8,[sp,#8]
+	str	x11,[sp,#0]
+	ror	x16,x24,#14
+	add	x27,x27,x19			// h+=K[i]
+	ror	x10,x5,#1
+	and	x17,x25,x24
+	ror	x9,x2,#19
+	bic	x19,x26,x24
+	ror	x11,x20,#28
+	add	x27,x27,x3			// h+=X[i]
+	eor	x16,x16,x24,ror#18
+	eor	x10,x10,x5,ror#8
+	orr	x17,x17,x19			// Ch(e,f,g)
+	eor	x19,x20,x21			// a^b, b^c in next round
+	eor	x16,x16,x24,ror#41	// Sigma1(e)
+	eor	x11,x11,x20,ror#34
+	add	x27,x27,x17			// h+=Ch(e,f,g)
+	and	x28,x28,x19			// (b^c)&=(a^b)
+	eor	x9,x9,x2,ror#61
+	eor	x10,x10,x5,lsr#7	// sigma0(X[i+1])
+	add	x27,x27,x16			// h+=Sigma1(e)
+	eor	x28,x28,x21			// Maj(a,b,c)
+	eor	x17,x11,x20,ror#39	// Sigma0(a)
+	eor	x9,x9,x2,lsr#6	// sigma1(X[i+14])
+	add	x4,x4,x13
+	add	x23,x23,x27			// d+=h
+	add	x27,x27,x28			// h+=Maj(a,b,c)
+	ldr	x28,[x30],#8		// *K++, x19 in next round
+	add	x4,x4,x10
+	add	x27,x27,x17			// h+=Sigma0(a)
+	add	x4,x4,x9
+	ldr	x9,[sp,#16]
+	str	x12,[sp,#8]
+	ror	x16,x23,#14
+	add	x26,x26,x28			// h+=K[i]
+	ror	x11,x6,#1
+	and	x17,x24,x23
+	ror	x10,x3,#19
+	bic	x28,x25,x23
+	ror	x12,x27,#28
+	add	x26,x26,x4			// h+=X[i]
+	eor	x16,x16,x23,ror#18
+	eor	x11,x11,x6,ror#8
+	orr	x17,x17,x28			// Ch(e,f,g)
+	eor	x28,x27,x20			// a^b, b^c in next round
+	eor	x16,x16,x23,ror#41	// Sigma1(e)
+	eor	x12,x12,x27,ror#34
+	add	x26,x26,x17			// h+=Ch(e,f,g)
+	and	x19,x19,x28			// (b^c)&=(a^b)
+	eor	x10,x10,x3,ror#61
+	eor	x11,x11,x6,lsr#7	// sigma0(X[i+1])
+	add	x26,x26,x16			// h+=Sigma1(e)
+	eor	x19,x19,x20			// Maj(a,b,c)
+	eor	x17,x12,x27,ror#39	// Sigma0(a)
+	eor	x10,x10,x3,lsr#6	// sigma1(X[i+14])
+	add	x5,x5,x14
+	add	x22,x22,x26			// d+=h
+	add	x26,x26,x19			// h+=Maj(a,b,c)
+	ldr	x19,[x30],#8		// *K++, x28 in next round
+	add	x5,x5,x11
+	add	x26,x26,x17			// h+=Sigma0(a)
+	add	x5,x5,x10
+	ldr	x10,[sp,#24]
+	str	x13,[sp,#16]
+	ror	x16,x22,#14
+	add	x25,x25,x19			// h+=K[i]
+	ror	x12,x7,#1
+	and	x17,x23,x22
+	ror	x11,x4,#19
+	bic	x19,x24,x22
+	ror	x13,x26,#28
+	add	x25,x25,x5			// h+=X[i]
+	eor	x16,x16,x22,ror#18
+	eor	x12,x12,x7,ror#8
+	orr	x17,x17,x19			// Ch(e,f,g)
+	eor	x19,x26,x27			// a^b, b^c in next round
+	eor	x16,x16,x22,ror#41	// Sigma1(e)
+	eor	x13,x13,x26,ror#34
+	add	x25,x25,x17			// h+=Ch(e,f,g)
+	and	x28,x28,x19			// (b^c)&=(a^b)
+	eor	x11,x11,x4,ror#61
+	eor	x12,x12,x7,lsr#7	// sigma0(X[i+1])
+	add	x25,x25,x16			// h+=Sigma1(e)
+	eor	x28,x28,x27			// Maj(a,b,c)
+	eor	x17,x13,x26,ror#39	// Sigma0(a)
+	eor	x11,x11,x4,lsr#6	// sigma1(X[i+14])
+	add	x6,x6,x15
+	add	x21,x21,x25			// d+=h
+	add	x25,x25,x28			// h+=Maj(a,b,c)
+	ldr	x28,[x30],#8		// *K++, x19 in next round
+	add	x6,x6,x12
+	add	x25,x25,x17			// h+=Sigma0(a)
+	add	x6,x6,x11
+	ldr	x11,[sp,#0]
+	str	x14,[sp,#24]
+	ror	x16,x21,#14
+	add	x24,x24,x28			// h+=K[i]
+	ror	x13,x8,#1
+	and	x17,x22,x21
+	ror	x12,x5,#19
+	bic	x28,x23,x21
+	ror	x14,x25,#28
+	add	x24,x24,x6			// h+=X[i]
+	eor	x16,x16,x21,ror#18
+	eor	x13,x13,x8,ror#8
+	orr	x17,x17,x28			// Ch(e,f,g)
+	eor	x28,x25,x26			// a^b, b^c in next round
+	eor	x16,x16,x21,ror#41	// Sigma1(e)
+	eor	x14,x14,x25,ror#34
+	add	x24,x24,x17			// h+=Ch(e,f,g)
+	and	x19,x19,x28			// (b^c)&=(a^b)
+	eor	x12,x12,x5,ror#61
+	eor	x13,x13,x8,lsr#7	// sigma0(X[i+1])
+	add	x24,x24,x16			// h+=Sigma1(e)
+	eor	x19,x19,x26			// Maj(a,b,c)
+	eor	x17,x14,x25,ror#39	// Sigma0(a)
+	eor	x12,x12,x5,lsr#6	// sigma1(X[i+14])
+	add	x7,x7,x0
+	add	x20,x20,x24			// d+=h
+	add	x24,x24,x19			// h+=Maj(a,b,c)
+	ldr	x19,[x30],#8		// *K++, x28 in next round
+	add	x7,x7,x13
+	add	x24,x24,x17			// h+=Sigma0(a)
+	add	x7,x7,x12
+	ldr	x12,[sp,#8]
+	str	x15,[sp,#0]
+	ror	x16,x20,#14
+	add	x23,x23,x19			// h+=K[i]
+	ror	x14,x9,#1
+	and	x17,x21,x20
+	ror	x13,x6,#19
+	bic	x19,x22,x20
+	ror	x15,x24,#28
+	add	x23,x23,x7			// h+=X[i]
+	eor	x16,x16,x20,ror#18
+	eor	x14,x14,x9,ror#8
+	orr	x17,x17,x19			// Ch(e,f,g)
+	eor	x19,x24,x25			// a^b, b^c in next round
+	eor	x16,x16,x20,ror#41	// Sigma1(e)
+	eor	x15,x15,x24,ror#34
+	add	x23,x23,x17			// h+=Ch(e,f,g)
+	and	x28,x28,x19			// (b^c)&=(a^b)
+	eor	x13,x13,x6,ror#61
+	eor	x14,x14,x9,lsr#7	// sigma0(X[i+1])
+	add	x23,x23,x16			// h+=Sigma1(e)
+	eor	x28,x28,x25			// Maj(a,b,c)
+	eor	x17,x15,x24,ror#39	// Sigma0(a)
+	eor	x13,x13,x6,lsr#6	// sigma1(X[i+14])
+	add	x8,x8,x1
+	add	x27,x27,x23			// d+=h
+	add	x23,x23,x28			// h+=Maj(a,b,c)
+	ldr	x28,[x30],#8		// *K++, x19 in next round
+	add	x8,x8,x14
+	add	x23,x23,x17			// h+=Sigma0(a)
+	add	x8,x8,x13
+	ldr	x13,[sp,#16]
+	str	x0,[sp,#8]
+	ror	x16,x27,#14
+	add	x22,x22,x28			// h+=K[i]
+	ror	x15,x10,#1
+	and	x17,x20,x27
+	ror	x14,x7,#19
+	bic	x28,x21,x27
+	ror	x0,x23,#28
+	add	x22,x22,x8			// h+=X[i]
+	eor	x16,x16,x27,ror#18
+	eor	x15,x15,x10,ror#8
+	orr	x17,x17,x28			// Ch(e,f,g)
+	eor	x28,x23,x24			// a^b, b^c in next round
+	eor	x16,x16,x27,ror#41	// Sigma1(e)
+	eor	x0,x0,x23,ror#34
+	add	x22,x22,x17			// h+=Ch(e,f,g)
+	and	x19,x19,x28			// (b^c)&=(a^b)
+	eor	x14,x14,x7,ror#61
+	eor	x15,x15,x10,lsr#7	// sigma0(X[i+1])
+	add	x22,x22,x16			// h+=Sigma1(e)
+	eor	x19,x19,x24			// Maj(a,b,c)
+	eor	x17,x0,x23,ror#39	// Sigma0(a)
+	eor	x14,x14,x7,lsr#6	// sigma1(X[i+14])
+	add	x9,x9,x2
+	add	x26,x26,x22			// d+=h
+	add	x22,x22,x19			// h+=Maj(a,b,c)
+	ldr	x19,[x30],#8		// *K++, x28 in next round
+	add	x9,x9,x15
+	add	x22,x22,x17			// h+=Sigma0(a)
+	add	x9,x9,x14
+	ldr	x14,[sp,#24]
+	str	x1,[sp,#16]
+	ror	x16,x26,#14
+	add	x21,x21,x19			// h+=K[i]
+	ror	x0,x11,#1
+	and	x17,x27,x26
+	ror	x15,x8,#19
+	bic	x19,x20,x26
+	ror	x1,x22,#28
+	add	x21,x21,x9			// h+=X[i]
+	eor	x16,x16,x26,ror#18
+	eor	x0,x0,x11,ror#8
+	orr	x17,x17,x19			// Ch(e,f,g)
+	eor	x19,x22,x23			// a^b, b^c in next round
+	eor	x16,x16,x26,ror#41	// Sigma1(e)
+	eor	x1,x1,x22,ror#34
+	add	x21,x21,x17			// h+=Ch(e,f,g)
+	and	x28,x28,x19			// (b^c)&=(a^b)
+	eor	x15,x15,x8,ror#61
+	eor	x0,x0,x11,lsr#7	// sigma0(X[i+1])
+	add	x21,x21,x16			// h+=Sigma1(e)
+	eor	x28,x28,x23			// Maj(a,b,c)
+	eor	x17,x1,x22,ror#39	// Sigma0(a)
+	eor	x15,x15,x8,lsr#6	// sigma1(X[i+14])
+	add	x10,x10,x3
+	add	x25,x25,x21			// d+=h
+	add	x21,x21,x28			// h+=Maj(a,b,c)
+	ldr	x28,[x30],#8		// *K++, x19 in next round
+	add	x10,x10,x0
+	add	x21,x21,x17			// h+=Sigma0(a)
+	add	x10,x10,x15
+	ldr	x15,[sp,#0]
+	str	x2,[sp,#24]
+	ror	x16,x25,#14
+	add	x20,x20,x28			// h+=K[i]
+	ror	x1,x12,#1
+	and	x17,x26,x25
+	ror	x0,x9,#19
+	bic	x28,x27,x25
+	ror	x2,x21,#28
+	add	x20,x20,x10			// h+=X[i]
+	eor	x16,x16,x25,ror#18
+	eor	x1,x1,x12,ror#8
+	orr	x17,x17,x28			// Ch(e,f,g)
+	eor	x28,x21,x22			// a^b, b^c in next round
+	eor	x16,x16,x25,ror#41	// Sigma1(e)
+	eor	x2,x2,x21,ror#34
+	add	x20,x20,x17			// h+=Ch(e,f,g)
+	and	x19,x19,x28			// (b^c)&=(a^b)
+	eor	x0,x0,x9,ror#61
+	eor	x1,x1,x12,lsr#7	// sigma0(X[i+1])
+	add	x20,x20,x16			// h+=Sigma1(e)
+	eor	x19,x19,x22			// Maj(a,b,c)
+	eor	x17,x2,x21,ror#39	// Sigma0(a)
+	eor	x0,x0,x9,lsr#6	// sigma1(X[i+14])
+	add	x11,x11,x4
+	add	x24,x24,x20			// d+=h
+	add	x20,x20,x19			// h+=Maj(a,b,c)
+	ldr	x19,[x30],#8		// *K++, x28 in next round
+	add	x11,x11,x1
+	add	x20,x20,x17			// h+=Sigma0(a)
+	add	x11,x11,x0
+	ldr	x0,[sp,#8]
+	str	x3,[sp,#0]
+	ror	x16,x24,#14
+	add	x27,x27,x19			// h+=K[i]
+	ror	x2,x13,#1
+	and	x17,x25,x24
+	ror	x1,x10,#19
+	bic	x19,x26,x24
+	ror	x3,x20,#28
+	add	x27,x27,x11			// h+=X[i]
+	eor	x16,x16,x24,ror#18
+	eor	x2,x2,x13,ror#8
+	orr	x17,x17,x19			// Ch(e,f,g)
+	eor	x19,x20,x21			// a^b, b^c in next round
+	eor	x16,x16,x24,ror#41	// Sigma1(e)
+	eor	x3,x3,x20,ror#34
+	add	x27,x27,x17			// h+=Ch(e,f,g)
+	and	x28,x28,x19			// (b^c)&=(a^b)
+	eor	x1,x1,x10,ror#61
+	eor	x2,x2,x13,lsr#7	// sigma0(X[i+1])
+	add	x27,x27,x16			// h+=Sigma1(e)
+	eor	x28,x28,x21			// Maj(a,b,c)
+	eor	x17,x3,x20,ror#39	// Sigma0(a)
+	eor	x1,x1,x10,lsr#6	// sigma1(X[i+14])
+	add	x12,x12,x5
+	add	x23,x23,x27			// d+=h
+	add	x27,x27,x28			// h+=Maj(a,b,c)
+	ldr	x28,[x30],#8		// *K++, x19 in next round
+	add	x12,x12,x2
+	add	x27,x27,x17			// h+=Sigma0(a)
+	add	x12,x12,x1
+	ldr	x1,[sp,#16]
+	str	x4,[sp,#8]
+	ror	x16,x23,#14
+	add	x26,x26,x28			// h+=K[i]
+	ror	x3,x14,#1
+	and	x17,x24,x23
+	ror	x2,x11,#19
+	bic	x28,x25,x23
+	ror	x4,x27,#28
+	add	x26,x26,x12			// h+=X[i]
+	eor	x16,x16,x23,ror#18
+	eor	x3,x3,x14,ror#8
+	orr	x17,x17,x28			// Ch(e,f,g)
+	eor	x28,x27,x20			// a^b, b^c in next round
+	eor	x16,x16,x23,ror#41	// Sigma1(e)
+	eor	x4,x4,x27,ror#34
+	add	x26,x26,x17			// h+=Ch(e,f,g)
+	and	x19,x19,x28			// (b^c)&=(a^b)
+	eor	x2,x2,x11,ror#61
+	eor	x3,x3,x14,lsr#7	// sigma0(X[i+1])
+	add	x26,x26,x16			// h+=Sigma1(e)
+	eor	x19,x19,x20			// Maj(a,b,c)
+	eor	x17,x4,x27,ror#39	// Sigma0(a)
+	eor	x2,x2,x11,lsr#6	// sigma1(X[i+14])
+	add	x13,x13,x6
+	add	x22,x22,x26			// d+=h
+	add	x26,x26,x19			// h+=Maj(a,b,c)
+	ldr	x19,[x30],#8		// *K++, x28 in next round
+	add	x13,x13,x3
+	add	x26,x26,x17			// h+=Sigma0(a)
+	add	x13,x13,x2
+	ldr	x2,[sp,#24]
+	str	x5,[sp,#16]
+	ror	x16,x22,#14
+	add	x25,x25,x19			// h+=K[i]
+	ror	x4,x15,#1
+	and	x17,x23,x22
+	ror	x3,x12,#19
+	bic	x19,x24,x22
+	ror	x5,x26,#28
+	add	x25,x25,x13			// h+=X[i]
+	eor	x16,x16,x22,ror#18
+	eor	x4,x4,x15,ror#8
+	orr	x17,x17,x19			// Ch(e,f,g)
+	eor	x19,x26,x27			// a^b, b^c in next round
+	eor	x16,x16,x22,ror#41	// Sigma1(e)
+	eor	x5,x5,x26,ror#34
+	add	x25,x25,x17			// h+=Ch(e,f,g)
+	and	x28,x28,x19			// (b^c)&=(a^b)
+	eor	x3,x3,x12,ror#61
+	eor	x4,x4,x15,lsr#7	// sigma0(X[i+1])
+	add	x25,x25,x16			// h+=Sigma1(e)
+	eor	x28,x28,x27			// Maj(a,b,c)
+	eor	x17,x5,x26,ror#39	// Sigma0(a)
+	eor	x3,x3,x12,lsr#6	// sigma1(X[i+14])
+	add	x14,x14,x7
+	add	x21,x21,x25			// d+=h
+	add	x25,x25,x28			// h+=Maj(a,b,c)
+	ldr	x28,[x30],#8		// *K++, x19 in next round
+	add	x14,x14,x4
+	add	x25,x25,x17			// h+=Sigma0(a)
+	add	x14,x14,x3
+	ldr	x3,[sp,#0]
+	str	x6,[sp,#24]
+	ror	x16,x21,#14
+	add	x24,x24,x28			// h+=K[i]
+	ror	x5,x0,#1
+	and	x17,x22,x21
+	ror	x4,x13,#19
+	bic	x28,x23,x21
+	ror	x6,x25,#28
+	add	x24,x24,x14			// h+=X[i]
+	eor	x16,x16,x21,ror#18
+	eor	x5,x5,x0,ror#8
+	orr	x17,x17,x28			// Ch(e,f,g)
+	eor	x28,x25,x26			// a^b, b^c in next round
+	eor	x16,x16,x21,ror#41	// Sigma1(e)
+	eor	x6,x6,x25,ror#34
+	add	x24,x24,x17			// h+=Ch(e,f,g)
+	and	x19,x19,x28			// (b^c)&=(a^b)
+	eor	x4,x4,x13,ror#61
+	eor	x5,x5,x0,lsr#7	// sigma0(X[i+1])
+	add	x24,x24,x16			// h+=Sigma1(e)
+	eor	x19,x19,x26			// Maj(a,b,c)
+	eor	x17,x6,x25,ror#39	// Sigma0(a)
+	eor	x4,x4,x13,lsr#6	// sigma1(X[i+14])
+	add	x15,x15,x8
+	add	x20,x20,x24			// d+=h
+	add	x24,x24,x19			// h+=Maj(a,b,c)
+	ldr	x19,[x30],#8		// *K++, x28 in next round
+	add	x15,x15,x5
+	add	x24,x24,x17			// h+=Sigma0(a)
+	add	x15,x15,x4
+	ldr	x4,[sp,#8]
+	str	x7,[sp,#0]
+	ror	x16,x20,#14
+	add	x23,x23,x19			// h+=K[i]
+	ror	x6,x1,#1
+	and	x17,x21,x20
+	ror	x5,x14,#19
+	bic	x19,x22,x20
+	ror	x7,x24,#28
+	add	x23,x23,x15			// h+=X[i]
+	eor	x16,x16,x20,ror#18
+	eor	x6,x6,x1,ror#8
+	orr	x17,x17,x19			// Ch(e,f,g)
+	eor	x19,x24,x25			// a^b, b^c in next round
+	eor	x16,x16,x20,ror#41	// Sigma1(e)
+	eor	x7,x7,x24,ror#34
+	add	x23,x23,x17			// h+=Ch(e,f,g)
+	and	x28,x28,x19			// (b^c)&=(a^b)
+	eor	x5,x5,x14,ror#61
+	eor	x6,x6,x1,lsr#7	// sigma0(X[i+1])
+	add	x23,x23,x16			// h+=Sigma1(e)
+	eor	x28,x28,x25			// Maj(a,b,c)
+	eor	x17,x7,x24,ror#39	// Sigma0(a)
+	eor	x5,x5,x14,lsr#6	// sigma1(X[i+14])
+	add	x0,x0,x9
+	add	x27,x27,x23			// d+=h
+	add	x23,x23,x28			// h+=Maj(a,b,c)
+	ldr	x28,[x30],#8		// *K++, x19 in next round
+	add	x0,x0,x6
+	add	x23,x23,x17			// h+=Sigma0(a)
+	add	x0,x0,x5
+	ldr	x5,[sp,#16]
+	str	x8,[sp,#8]
+	ror	x16,x27,#14
+	add	x22,x22,x28			// h+=K[i]
+	ror	x7,x2,#1
+	and	x17,x20,x27
+	ror	x6,x15,#19
+	bic	x28,x21,x27
+	ror	x8,x23,#28
+	add	x22,x22,x0			// h+=X[i]
+	eor	x16,x16,x27,ror#18
+	eor	x7,x7,x2,ror#8
+	orr	x17,x17,x28			// Ch(e,f,g)
+	eor	x28,x23,x24			// a^b, b^c in next round
+	eor	x16,x16,x27,ror#41	// Sigma1(e)
+	eor	x8,x8,x23,ror#34
+	add	x22,x22,x17			// h+=Ch(e,f,g)
+	and	x19,x19,x28			// (b^c)&=(a^b)
+	eor	x6,x6,x15,ror#61
+	eor	x7,x7,x2,lsr#7	// sigma0(X[i+1])
+	add	x22,x22,x16			// h+=Sigma1(e)
+	eor	x19,x19,x24			// Maj(a,b,c)
+	eor	x17,x8,x23,ror#39	// Sigma0(a)
+	eor	x6,x6,x15,lsr#6	// sigma1(X[i+14])
+	add	x1,x1,x10
+	add	x26,x26,x22			// d+=h
+	add	x22,x22,x19			// h+=Maj(a,b,c)
+	ldr	x19,[x30],#8		// *K++, x28 in next round
+	add	x1,x1,x7
+	add	x22,x22,x17			// h+=Sigma0(a)
+	add	x1,x1,x6
+	ldr	x6,[sp,#24]
+	str	x9,[sp,#16]
+	ror	x16,x26,#14
+	add	x21,x21,x19			// h+=K[i]
+	ror	x8,x3,#1
+	and	x17,x27,x26
+	ror	x7,x0,#19
+	bic	x19,x20,x26
+	ror	x9,x22,#28
+	add	x21,x21,x1			// h+=X[i]
+	eor	x16,x16,x26,ror#18
+	eor	x8,x8,x3,ror#8
+	orr	x17,x17,x19			// Ch(e,f,g)
+	eor	x19,x22,x23			// a^b, b^c in next round
+	eor	x16,x16,x26,ror#41	// Sigma1(e)
+	eor	x9,x9,x22,ror#34
+	add	x21,x21,x17			// h+=Ch(e,f,g)
+	and	x28,x28,x19			// (b^c)&=(a^b)
+	eor	x7,x7,x0,ror#61
+	eor	x8,x8,x3,lsr#7	// sigma0(X[i+1])
+	add	x21,x21,x16			// h+=Sigma1(e)
+	eor	x28,x28,x23			// Maj(a,b,c)
+	eor	x17,x9,x22,ror#39	// Sigma0(a)
+	eor	x7,x7,x0,lsr#6	// sigma1(X[i+14])
+	add	x2,x2,x11
+	add	x25,x25,x21			// d+=h
+	add	x21,x21,x28			// h+=Maj(a,b,c)
+	ldr	x28,[x30],#8		// *K++, x19 in next round
+	add	x2,x2,x8
+	add	x21,x21,x17			// h+=Sigma0(a)
+	add	x2,x2,x7
+	ldr	x7,[sp,#0]
+	str	x10,[sp,#24]
+	ror	x16,x25,#14
+	add	x20,x20,x28			// h+=K[i]
+	ror	x9,x4,#1
+	and	x17,x26,x25
+	ror	x8,x1,#19
+	bic	x28,x27,x25
+	ror	x10,x21,#28
+	add	x20,x20,x2			// h+=X[i]
+	eor	x16,x16,x25,ror#18
+	eor	x9,x9,x4,ror#8
+	orr	x17,x17,x28			// Ch(e,f,g)
+	eor	x28,x21,x22			// a^b, b^c in next round
+	eor	x16,x16,x25,ror#41	// Sigma1(e)
+	eor	x10,x10,x21,ror#34
+	add	x20,x20,x17			// h+=Ch(e,f,g)
+	and	x19,x19,x28			// (b^c)&=(a^b)
+	eor	x8,x8,x1,ror#61
+	eor	x9,x9,x4,lsr#7	// sigma0(X[i+1])
+	add	x20,x20,x16			// h+=Sigma1(e)
+	eor	x19,x19,x22			// Maj(a,b,c)
+	eor	x17,x10,x21,ror#39	// Sigma0(a)
+	eor	x8,x8,x1,lsr#6	// sigma1(X[i+14])
+	add	x3,x3,x12
+	add	x24,x24,x20			// d+=h
+	add	x20,x20,x19			// h+=Maj(a,b,c)
+	ldr	x19,[x30],#8		// *K++, x28 in next round
+	add	x3,x3,x9
+	add	x20,x20,x17			// h+=Sigma0(a)
+	add	x3,x3,x8
+	cbnz	x19,.Loop_16_xx
+
+	ldp	x0,x2,[x29,#96]
+	ldr	x1,[x29,#112]
+	sub	x30,x30,#648		// rewind
+
+	ldp	x3,x4,[x0]
+	ldp	x5,x6,[x0,#2*8]
+	add	x1,x1,#14*8			// advance input pointer
+	ldp	x7,x8,[x0,#4*8]
+	add	x20,x20,x3
+	ldp	x9,x10,[x0,#6*8]
+	add	x21,x21,x4
+	add	x22,x22,x5
+	add	x23,x23,x6
+	stp	x20,x21,[x0]
+	add	x24,x24,x7
+	add	x25,x25,x8
+	stp	x22,x23,[x0,#2*8]
+	add	x26,x26,x9
+	add	x27,x27,x10
+	cmp	x1,x2
+	stp	x24,x25,[x0,#4*8]
+	stp	x26,x27,[x0,#6*8]
+	b.ne	.Loop
+
+	ldp	x19,x20,[x29,#16]
+	add	sp,sp,#4*8
+	ldp	x21,x22,[x29,#32]
+	ldp	x23,x24,[x29,#48]
+	ldp	x25,x26,[x29,#64]
+	ldp	x27,x28,[x29,#80]
+	ldp	x29,x30,[sp],#128
+	AARCH64_VALIDATE_LINK_REGISTER
+	ret
+.size	sha512_block_data_order,.-sha512_block_data_order
+
+.section	.rodata
+.align	6
+.type	.LK512,%object
+.LK512:
+.quad	0x428a2f98d728ae22,0x7137449123ef65cd
+.quad	0xb5c0fbcfec4d3b2f,0xe9b5dba58189dbbc
+.quad	0x3956c25bf348b538,0x59f111f1b605d019
+.quad	0x923f82a4af194f9b,0xab1c5ed5da6d8118
+.quad	0xd807aa98a3030242,0x12835b0145706fbe
+.quad	0x243185be4ee4b28c,0x550c7dc3d5ffb4e2
+.quad	0x72be5d74f27b896f,0x80deb1fe3b1696b1
+.quad	0x9bdc06a725c71235,0xc19bf174cf692694
+.quad	0xe49b69c19ef14ad2,0xefbe4786384f25e3
+.quad	0x0fc19dc68b8cd5b5,0x240ca1cc77ac9c65
+.quad	0x2de92c6f592b0275,0x4a7484aa6ea6e483
+.quad	0x5cb0a9dcbd41fbd4,0x76f988da831153b5
+.quad	0x983e5152ee66dfab,0xa831c66d2db43210
+.quad	0xb00327c898fb213f,0xbf597fc7beef0ee4
+.quad	0xc6e00bf33da88fc2,0xd5a79147930aa725
+.quad	0x06ca6351e003826f,0x142929670a0e6e70
+.quad	0x27b70a8546d22ffc,0x2e1b21385c26c926
+.quad	0x4d2c6dfc5ac42aed,0x53380d139d95b3df
+.quad	0x650a73548baf63de,0x766a0abb3c77b2a8
+.quad	0x81c2c92e47edaee6,0x92722c851482353b
+.quad	0xa2bfe8a14cf10364,0xa81a664bbc423001
+.quad	0xc24b8b70d0f89791,0xc76c51a30654be30
+.quad	0xd192e819d6ef5218,0xd69906245565a910
+.quad	0xf40e35855771202a,0x106aa07032bbd1b8
+.quad	0x19a4c116b8d2d0c8,0x1e376c085141ab53
+.quad	0x2748774cdf8eeb99,0x34b0bcb5e19b48a8
+.quad	0x391c0cb3c5c95a63,0x4ed8aa4ae3418acb
+.quad	0x5b9cca4f7763e373,0x682e6ff3d6b2b8a3
+.quad	0x748f82ee5defb2fc,0x78a5636f43172f60
+.quad	0x84c87814a1f0ab72,0x8cc702081a6439ec
+.quad	0x90befffa23631e28,0xa4506cebde82bde9
+.quad	0xbef9a3f7b2c67915,0xc67178f2e372532b
+.quad	0xca273eceea26619c,0xd186b8c721c0c207
+.quad	0xeada7dd6cde0eb1e,0xf57d4f7fee6ed178
+.quad	0x06f067aa72176fba,0x0a637dc5a2c898a6
+.quad	0x113f9804bef90dae,0x1b710b35131c471b
+.quad	0x28db77f523047d84,0x32caab7b40c72493
+.quad	0x3c9ebe0a15c9bebc,0x431d67c49c100d4c
+.quad	0x4cc5d4becb3e42b6,0x597f299cfc657e2a
+.quad	0x5fcb6fab3ad6faec,0x6c44198c4a475817
+.quad	0	// terminator
+.size	.LK512,.-.LK512
+.byte	83,72,65,53,49,50,32,98,108,111,99,107,32,116,114,97,110,115,102,111,114,109,32,102,111,114,32,65,82,77,118,56,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0
+.align	2
+.align	2
+#endif
+#endif  // !OPENSSL_NO_ASM
+.section	.note.GNU-stack,"",%progbits
--- /dev/null
+++ b/third_party/boringssl/linux-aarch64/crypto/fipsmodule/vpaes-armv8.S
@@ -1,0 +1,1235 @@
+// This file is generated from a similarly-named Perl script in the BoringSSL
+// source tree. Do not edit by hand.
+
+#if !defined(__has_feature)
+#define __has_feature(x) 0
+#endif
+#if __has_feature(memory_sanitizer) && !defined(OPENSSL_NO_ASM)
+#define OPENSSL_NO_ASM
+#endif
+
+#if !defined(OPENSSL_NO_ASM)
+#if defined(__aarch64__)
+#if defined(BORINGSSL_PREFIX)
+#include <boringssl_prefix_symbols_asm.h>
+#endif
+#include <openssl/arm_arch.h>
+
+.section	.rodata
+
+.type	_vpaes_consts,%object
+.align	7	// totally strategic alignment
+_vpaes_consts:
+.Lk_mc_forward:	//	mc_forward
+.quad	0x0407060500030201, 0x0C0F0E0D080B0A09
+.quad	0x080B0A0904070605, 0x000302010C0F0E0D
+.quad	0x0C0F0E0D080B0A09, 0x0407060500030201
+.quad	0x000302010C0F0E0D, 0x080B0A0904070605
+.Lk_mc_backward:	//	mc_backward
+.quad	0x0605040702010003, 0x0E0D0C0F0A09080B
+.quad	0x020100030E0D0C0F, 0x0A09080B06050407
+.quad	0x0E0D0C0F0A09080B, 0x0605040702010003
+.quad	0x0A09080B06050407, 0x020100030E0D0C0F
+.Lk_sr:	//	sr
+.quad	0x0706050403020100, 0x0F0E0D0C0B0A0908
+.quad	0x030E09040F0A0500, 0x0B06010C07020D08
+.quad	0x0F060D040B020900, 0x070E050C030A0108
+.quad	0x0B0E0104070A0D00, 0x0306090C0F020508
+
+//
+// "Hot" constants
+//
+.Lk_inv:	//	inv, inva
+.quad	0x0E05060F0D080180, 0x040703090A0B0C02
+.quad	0x01040A060F0B0780, 0x030D0E0C02050809
+.Lk_ipt:	//	input transform (lo, hi)
+.quad	0xC2B2E8985A2A7000, 0xCABAE09052227808
+.quad	0x4C01307D317C4D00, 0xCD80B1FCB0FDCC81
+.Lk_sbo:	//	sbou, sbot
+.quad	0xD0D26D176FBDC700, 0x15AABF7AC502A878
+.quad	0xCFE474A55FBB6A00, 0x8E1E90D1412B35FA
+.Lk_sb1:	//	sb1u, sb1t
+.quad	0x3618D415FAE22300, 0x3BF7CCC10D2ED9EF
+.quad	0xB19BE18FCB503E00, 0xA5DF7A6E142AF544
+.Lk_sb2:	//	sb2u, sb2t
+.quad	0x69EB88400AE12900, 0xC2A163C8AB82234A
+.quad	0xE27A93C60B712400, 0x5EB7E955BC982FCD
+
+//
+//  Decryption stuff
+//
+.Lk_dipt:	//	decryption input transform
+.quad	0x0F505B040B545F00, 0x154A411E114E451A
+.quad	0x86E383E660056500, 0x12771772F491F194
+.Lk_dsbo:	//	decryption sbox final output
+.quad	0x1387EA537EF94000, 0xC7AA6DB9D4943E2D
+.quad	0x12D7560F93441D00, 0xCA4B8159D8C58E9C
+.Lk_dsb9:	//	decryption sbox output *9*u, *9*t
+.quad	0x851C03539A86D600, 0xCAD51F504F994CC9
+.quad	0xC03B1789ECD74900, 0x725E2C9EB2FBA565
+.Lk_dsbd:	//	decryption sbox output *D*u, *D*t
+.quad	0x7D57CCDFE6B1A200, 0xF56E9B13882A4439
+.quad	0x3CE2FAF724C6CB00, 0x2931180D15DEEFD3
+.Lk_dsbb:	//	decryption sbox output *B*u, *B*t
+.quad	0xD022649296B44200, 0x602646F6B0F2D404
+.quad	0xC19498A6CD596700, 0xF3FF0C3E3255AA6B
+.Lk_dsbe:	//	decryption sbox output *E*u, *E*t
+.quad	0x46F2929626D4D000, 0x2242600464B4F6B0
+.quad	0x0C55A6CDFFAAC100, 0x9467F36B98593E32
+
+//
+//  Key schedule constants
+//
+.Lk_dksd:	//	decryption key schedule: invskew x*D
+.quad	0xFEB91A5DA3E44700, 0x0740E3A45A1DBEF9
+.quad	0x41C277F4B5368300, 0x5FDC69EAAB289D1E
+.Lk_dksb:	//	decryption key schedule: invskew x*B
+.quad	0x9A4FCA1F8550D500, 0x03D653861CC94C99
+.quad	0x115BEDA7B6FC4A00, 0xD993256F7E3482C8
+.Lk_dkse:	//	decryption key schedule: invskew x*E + 0x63
+.quad	0xD5031CCA1FC9D600, 0x53859A4C994F5086
+.quad	0xA23196054FDC7BE8, 0xCD5EF96A20B31487
+.Lk_dks9:	//	decryption key schedule: invskew x*9
+.quad	0xB6116FC87ED9A700, 0x4AED933482255BFC
+.quad	0x4576516227143300, 0x8BB89FACE9DAFDCE
+
+.Lk_rcon:	//	rcon
+.quad	0x1F8391B9AF9DEEB6, 0x702A98084D7C7D81
+
+.Lk_opt:	//	output transform
+.quad	0xFF9F4929D6B66000, 0xF7974121DEBE6808
+.quad	0x01EDBD5150BCEC00, 0xE10D5DB1B05C0CE0
+.Lk_deskew:	//	deskew tables: inverts the sbox's "skew"
+.quad	0x07E4A34047A4E300, 0x1DFEB95A5DBEF91A
+.quad	0x5F36B5DC83EA6900, 0x2841C2ABF49D1E77
+
+.byte	86,101,99,116,111,114,32,80,101,114,109,117,116,97,116,105,111,110,32,65,69,83,32,102,111,114,32,65,82,77,118,56,44,32,77,105,107,101,32,72,97,109,98,117,114,103,32,40,83,116,97,110,102,111,114,100,32,85,110,105,118,101,114,115,105,116,121,41,0
+.align	2
+.size	_vpaes_consts,.-_vpaes_consts
+.align	6
+
+.text
+##
+##  _aes_preheat
+##
+##  Fills register %r10 -> .aes_consts (so you can -fPIC)
+##  and %xmm9-%xmm15 as specified below.
+##
+.type	_vpaes_encrypt_preheat,%function
+.align	4
+_vpaes_encrypt_preheat:
+	adrp	x10, .Lk_inv
+	add	x10, x10, :lo12:.Lk_inv
+	movi	v17.16b, #0x0f
+	ld1	{v18.2d,v19.2d}, [x10],#32	// .Lk_inv
+	ld1	{v20.2d,v21.2d,v22.2d,v23.2d}, [x10],#64	// .Lk_ipt, .Lk_sbo
+	ld1	{v24.2d,v25.2d,v26.2d,v27.2d}, [x10]		// .Lk_sb1, .Lk_sb2
+	ret
+.size	_vpaes_encrypt_preheat,.-_vpaes_encrypt_preheat
+
+##
+##  _aes_encrypt_core
+##
+##  AES-encrypt %xmm0.
+##
+##  Inputs:
+##     %xmm0 = input
+##     %xmm9-%xmm15 as in _vpaes_preheat
+##    (%rdx) = scheduled keys
+##
+##  Output in %xmm0
+##  Clobbers  %xmm1-%xmm5, %r9, %r10, %r11, %rax
+##  Preserves %xmm6 - %xmm8 so you get some local vectors
+##
+##
+.type	_vpaes_encrypt_core,%function
+.align	4
+_vpaes_encrypt_core:
+	mov	x9, x2
+	ldr	w8, [x2,#240]			// pull rounds
+	adrp	x11, .Lk_mc_forward+16
+	add	x11, x11, :lo12:.Lk_mc_forward+16
+						// vmovdqa	.Lk_ipt(%rip),	%xmm2	# iptlo
+	ld1	{v16.2d}, [x9], #16		// vmovdqu	(%r9),	%xmm5		# round0 key
+	and	v1.16b, v7.16b, v17.16b		// vpand	%xmm9,	%xmm0,	%xmm1
+	ushr	v0.16b, v7.16b, #4		// vpsrlb	$4,	%xmm0,	%xmm0
+	tbl	v1.16b, {v20.16b}, v1.16b	// vpshufb	%xmm1,	%xmm2,	%xmm1
+						// vmovdqa	.Lk_ipt+16(%rip), %xmm3	# ipthi
+	tbl	v2.16b, {v21.16b}, v0.16b	// vpshufb	%xmm0,	%xmm3,	%xmm2
+	eor	v0.16b, v1.16b, v16.16b		// vpxor	%xmm5,	%xmm1,	%xmm0
+	eor	v0.16b, v0.16b, v2.16b		// vpxor	%xmm2,	%xmm0,	%xmm0
+	b	.Lenc_entry
+
+.align	4
+.Lenc_loop:
+	// middle of middle round
+	add	x10, x11, #0x40
+	tbl	v4.16b, {v25.16b}, v2.16b		// vpshufb	%xmm2,	%xmm13,	%xmm4	# 4 = sb1u
+	ld1	{v1.2d}, [x11], #16		// vmovdqa	-0x40(%r11,%r10), %xmm1	# .Lk_mc_forward[]
+	tbl	v0.16b, {v24.16b}, v3.16b		// vpshufb	%xmm3,	%xmm12,	%xmm0	# 0 = sb1t
+	eor	v4.16b, v4.16b, v16.16b		// vpxor	%xmm5,	%xmm4,	%xmm4	# 4 = sb1u + k
+	tbl	v5.16b,	{v27.16b}, v2.16b		// vpshufb	%xmm2,	%xmm15,	%xmm5	# 4 = sb2u
+	eor	v0.16b, v0.16b, v4.16b		// vpxor	%xmm4,	%xmm0,	%xmm0	# 0 = A
+	tbl	v2.16b, {v26.16b}, v3.16b		// vpshufb	%xmm3,	%xmm14,	%xmm2	# 2 = sb2t
+	ld1	{v4.2d}, [x10]			// vmovdqa	(%r11,%r10), %xmm4	# .Lk_mc_backward[]
+	tbl	v3.16b, {v0.16b}, v1.16b	// vpshufb	%xmm1,	%xmm0,	%xmm3	# 0 = B
+	eor	v2.16b, v2.16b, v5.16b		// vpxor	%xmm5,	%xmm2,	%xmm2	# 2 = 2A
+	tbl	v0.16b, {v0.16b}, v4.16b	// vpshufb	%xmm4,	%xmm0,	%xmm0	# 3 = D
+	eor	v3.16b, v3.16b, v2.16b		// vpxor	%xmm2,	%xmm3,	%xmm3	# 0 = 2A+B
+	tbl	v4.16b, {v3.16b}, v1.16b	// vpshufb	%xmm1,	%xmm3,	%xmm4	# 0 = 2B+C
+	eor	v0.16b, v0.16b, v3.16b		// vpxor	%xmm3,	%xmm0,	%xmm0	# 3 = 2A+B+D
+	and	x11, x11, #~(1<<6)		// and		$0x30,	%r11		# ... mod 4
+	eor	v0.16b, v0.16b, v4.16b		// vpxor	%xmm4,	%xmm0, %xmm0	# 0 = 2A+3B+C+D
+	sub	w8, w8, #1			// nr--
+
+.Lenc_entry:
+	// top of round
+	and	v1.16b, v0.16b, v17.16b		// vpand	%xmm0,	%xmm9,	%xmm1   # 0 = k
+	ushr	v0.16b, v0.16b, #4		// vpsrlb	$4,	%xmm0,	%xmm0	# 1 = i
+	tbl	v5.16b, {v19.16b}, v1.16b	// vpshufb	%xmm1,	%xmm11,	%xmm5	# 2 = a/k
+	eor	v1.16b, v1.16b, v0.16b		// vpxor	%xmm0,	%xmm1,	%xmm1	# 0 = j
+	tbl	v3.16b, {v18.16b}, v0.16b	// vpshufb	%xmm0, 	%xmm10,	%xmm3  	# 3 = 1/i
+	tbl	v4.16b, {v18.16b}, v1.16b	// vpshufb	%xmm1, 	%xmm10,	%xmm4  	# 4 = 1/j
+	eor	v3.16b, v3.16b, v5.16b		// vpxor	%xmm5,	%xmm3,	%xmm3	# 3 = iak = 1/i + a/k
+	eor	v4.16b, v4.16b, v5.16b		// vpxor	%xmm5,	%xmm4,	%xmm4  	# 4 = jak = 1/j + a/k
+	tbl	v2.16b, {v18.16b}, v3.16b	// vpshufb	%xmm3,	%xmm10,	%xmm2  	# 2 = 1/iak
+	tbl	v3.16b, {v18.16b}, v4.16b	// vpshufb	%xmm4,	%xmm10,	%xmm3	# 3 = 1/jak
+	eor	v2.16b, v2.16b, v1.16b		// vpxor	%xmm1,	%xmm2,	%xmm2  	# 2 = io
+	eor	v3.16b, v3.16b, v0.16b		// vpxor	%xmm0,	%xmm3,	%xmm3	# 3 = jo
+	ld1	{v16.2d}, [x9],#16		// vmovdqu	(%r9),	%xmm5
+	cbnz	w8, .Lenc_loop
+
+	// middle of last round
+	add	x10, x11, #0x80
+						// vmovdqa	-0x60(%r10), %xmm4	# 3 : sbou	.Lk_sbo
+						// vmovdqa	-0x50(%r10), %xmm0	# 0 : sbot	.Lk_sbo+16
+	tbl	v4.16b, {v22.16b}, v2.16b		// vpshufb	%xmm2,	%xmm4,	%xmm4	# 4 = sbou
+	ld1	{v1.2d}, [x10]			// vmovdqa	0x40(%r11,%r10), %xmm1	# .Lk_sr[]
+	tbl	v0.16b, {v23.16b}, v3.16b		// vpshufb	%xmm3,	%xmm0,	%xmm0	# 0 = sb1t
+	eor	v4.16b, v4.16b, v16.16b		// vpxor	%xmm5,	%xmm4,	%xmm4	# 4 = sb1u + k
+	eor	v0.16b, v0.16b, v4.16b		// vpxor	%xmm4,	%xmm0,	%xmm0	# 0 = A
+	tbl	v0.16b, {v0.16b}, v1.16b	// vpshufb	%xmm1,	%xmm0,	%xmm0
+	ret
+.size	_vpaes_encrypt_core,.-_vpaes_encrypt_core
+
+.globl	vpaes_encrypt
+.hidden	vpaes_encrypt
+.type	vpaes_encrypt,%function
+.align	4
+vpaes_encrypt:
+	AARCH64_SIGN_LINK_REGISTER
+	stp	x29,x30,[sp,#-16]!
+	add	x29,sp,#0
+
+	ld1	{v7.16b}, [x0]
+	bl	_vpaes_encrypt_preheat
+	bl	_vpaes_encrypt_core
+	st1	{v0.16b}, [x1]
+
+	ldp	x29,x30,[sp],#16
+	AARCH64_VALIDATE_LINK_REGISTER
+	ret
+.size	vpaes_encrypt,.-vpaes_encrypt
+
+.type	_vpaes_encrypt_2x,%function
+.align	4
+_vpaes_encrypt_2x:
+	mov	x9, x2
+	ldr	w8, [x2,#240]			// pull rounds
+	adrp	x11, .Lk_mc_forward+16
+	add	x11, x11, :lo12:.Lk_mc_forward+16
+						// vmovdqa	.Lk_ipt(%rip),	%xmm2	# iptlo
+	ld1	{v16.2d}, [x9], #16		// vmovdqu	(%r9),	%xmm5		# round0 key
+	and	v1.16b,  v14.16b,  v17.16b	// vpand	%xmm9,	%xmm0,	%xmm1
+	ushr	v0.16b,  v14.16b,  #4		// vpsrlb	$4,	%xmm0,	%xmm0
+	and	v9.16b,  v15.16b,  v17.16b
+	ushr	v8.16b,  v15.16b,  #4
+	tbl	v1.16b,  {v20.16b}, v1.16b	// vpshufb	%xmm1,	%xmm2,	%xmm1
+	tbl	v9.16b,  {v20.16b}, v9.16b
+						// vmovdqa	.Lk_ipt+16(%rip), %xmm3	# ipthi
+	tbl	v2.16b,  {v21.16b}, v0.16b	// vpshufb	%xmm0,	%xmm3,	%xmm2
+	tbl	v10.16b, {v21.16b}, v8.16b
+	eor	v0.16b,  v1.16b,   v16.16b	// vpxor	%xmm5,	%xmm1,	%xmm0
+	eor	v8.16b,  v9.16b,   v16.16b
+	eor	v0.16b,  v0.16b,   v2.16b	// vpxor	%xmm2,	%xmm0,	%xmm0
+	eor	v8.16b,  v8.16b,   v10.16b
+	b	.Lenc_2x_entry
+
+.align	4
+.Lenc_2x_loop:
+	// middle of middle round
+	add	x10, x11, #0x40
+	tbl	v4.16b,  {v25.16b}, v2.16b	// vpshufb	%xmm2,	%xmm13,	%xmm4	# 4 = sb1u
+	tbl	v12.16b, {v25.16b}, v10.16b
+	ld1	{v1.2d}, [x11], #16		// vmovdqa	-0x40(%r11,%r10), %xmm1	# .Lk_mc_forward[]
+	tbl	v0.16b,  {v24.16b}, v3.16b	// vpshufb	%xmm3,	%xmm12,	%xmm0	# 0 = sb1t
+	tbl	v8.16b,  {v24.16b}, v11.16b
+	eor	v4.16b,  v4.16b,  v16.16b	// vpxor	%xmm5,	%xmm4,	%xmm4	# 4 = sb1u + k
+	eor	v12.16b, v12.16b, v16.16b
+	tbl	v5.16b,	 {v27.16b}, v2.16b	// vpshufb	%xmm2,	%xmm15,	%xmm5	# 4 = sb2u
+	tbl	v13.16b, {v27.16b}, v10.16b
+	eor	v0.16b,  v0.16b,  v4.16b	// vpxor	%xmm4,	%xmm0,	%xmm0	# 0 = A
+	eor	v8.16b,  v8.16b,  v12.16b
+	tbl	v2.16b,  {v26.16b}, v3.16b	// vpshufb	%xmm3,	%xmm14,	%xmm2	# 2 = sb2t
+	tbl	v10.16b, {v26.16b}, v11.16b
+	ld1	{v4.2d}, [x10]			// vmovdqa	(%r11,%r10), %xmm4	# .Lk_mc_backward[]
+	tbl	v3.16b,  {v0.16b}, v1.16b	// vpshufb	%xmm1,	%xmm0,	%xmm3	# 0 = B
+	tbl	v11.16b, {v8.16b}, v1.16b
+	eor	v2.16b,  v2.16b,  v5.16b	// vpxor	%xmm5,	%xmm2,	%xmm2	# 2 = 2A
+	eor	v10.16b, v10.16b, v13.16b
+	tbl	v0.16b,  {v0.16b}, v4.16b	// vpshufb	%xmm4,	%xmm0,	%xmm0	# 3 = D
+	tbl	v8.16b,  {v8.16b}, v4.16b
+	eor	v3.16b,  v3.16b,  v2.16b	// vpxor	%xmm2,	%xmm3,	%xmm3	# 0 = 2A+B
+	eor	v11.16b, v11.16b, v10.16b
+	tbl	v4.16b,  {v3.16b}, v1.16b	// vpshufb	%xmm1,	%xmm3,	%xmm4	# 0 = 2B+C
+	tbl	v12.16b, {v11.16b},v1.16b
+	eor	v0.16b,  v0.16b,  v3.16b	// vpxor	%xmm3,	%xmm0,	%xmm0	# 3 = 2A+B+D
+	eor	v8.16b,  v8.16b,  v11.16b
+	and	x11, x11, #~(1<<6)		// and		$0x30,	%r11		# ... mod 4
+	eor	v0.16b,  v0.16b,  v4.16b	// vpxor	%xmm4,	%xmm0, %xmm0	# 0 = 2A+3B+C+D
+	eor	v8.16b,  v8.16b,  v12.16b
+	sub	w8, w8, #1			// nr--
+
+.Lenc_2x_entry:
+	// top of round
+	and	v1.16b,  v0.16b, v17.16b	// vpand	%xmm0,	%xmm9,	%xmm1   # 0 = k
+	ushr	v0.16b,  v0.16b, #4		// vpsrlb	$4,	%xmm0,	%xmm0	# 1 = i
+	and	v9.16b,  v8.16b, v17.16b
+	ushr	v8.16b,  v8.16b, #4
+	tbl	v5.16b,  {v19.16b},v1.16b	// vpshufb	%xmm1,	%xmm11,	%xmm5	# 2 = a/k
+	tbl	v13.16b, {v19.16b},v9.16b
+	eor	v1.16b,  v1.16b,  v0.16b	// vpxor	%xmm0,	%xmm1,	%xmm1	# 0 = j
+	eor	v9.16b,  v9.16b,  v8.16b
+	tbl	v3.16b,  {v18.16b},v0.16b	// vpshufb	%xmm0, 	%xmm10,	%xmm3  	# 3 = 1/i
+	tbl	v11.16b, {v18.16b},v8.16b
+	tbl	v4.16b,  {v18.16b},v1.16b	// vpshufb	%xmm1, 	%xmm10,	%xmm4  	# 4 = 1/j
+	tbl	v12.16b, {v18.16b},v9.16b
+	eor	v3.16b,  v3.16b,  v5.16b	// vpxor	%xmm5,	%xmm3,	%xmm3	# 3 = iak = 1/i + a/k
+	eor	v11.16b, v11.16b, v13.16b
+	eor	v4.16b,  v4.16b,  v5.16b	// vpxor	%xmm5,	%xmm4,	%xmm4  	# 4 = jak = 1/j + a/k
+	eor	v12.16b, v12.16b, v13.16b
+	tbl	v2.16b,  {v18.16b},v3.16b	// vpshufb	%xmm3,	%xmm10,	%xmm2  	# 2 = 1/iak
+	tbl	v10.16b, {v18.16b},v11.16b
+	tbl	v3.16b,  {v18.16b},v4.16b	// vpshufb	%xmm4,	%xmm10,	%xmm3	# 3 = 1/jak
+	tbl	v11.16b, {v18.16b},v12.16b
+	eor	v2.16b,  v2.16b,  v1.16b	// vpxor	%xmm1,	%xmm2,	%xmm2  	# 2 = io
+	eor	v10.16b, v10.16b, v9.16b
+	eor	v3.16b,  v3.16b,  v0.16b	// vpxor	%xmm0,	%xmm3,	%xmm3	# 3 = jo
+	eor	v11.16b, v11.16b, v8.16b
+	ld1	{v16.2d}, [x9],#16		// vmovdqu	(%r9),	%xmm5
+	cbnz	w8, .Lenc_2x_loop
+
+	// middle of last round
+	add	x10, x11, #0x80
+						// vmovdqa	-0x60(%r10), %xmm4	# 3 : sbou	.Lk_sbo
+						// vmovdqa	-0x50(%r10), %xmm0	# 0 : sbot	.Lk_sbo+16
+	tbl	v4.16b,  {v22.16b}, v2.16b	// vpshufb	%xmm2,	%xmm4,	%xmm4	# 4 = sbou
+	tbl	v12.16b, {v22.16b}, v10.16b
+	ld1	{v1.2d}, [x10]			// vmovdqa	0x40(%r11,%r10), %xmm1	# .Lk_sr[]
+	tbl	v0.16b,  {v23.16b}, v3.16b	// vpshufb	%xmm3,	%xmm0,	%xmm0	# 0 = sb1t
+	tbl	v8.16b,  {v23.16b}, v11.16b
+	eor	v4.16b,  v4.16b,  v16.16b	// vpxor	%xmm5,	%xmm4,	%xmm4	# 4 = sb1u + k
+	eor	v12.16b, v12.16b, v16.16b
+	eor	v0.16b,  v0.16b,  v4.16b	// vpxor	%xmm4,	%xmm0,	%xmm0	# 0 = A
+	eor	v8.16b,  v8.16b,  v12.16b
+	tbl	v0.16b,  {v0.16b},v1.16b	// vpshufb	%xmm1,	%xmm0,	%xmm0
+	tbl	v1.16b,  {v8.16b},v1.16b
+	ret
+.size	_vpaes_encrypt_2x,.-_vpaes_encrypt_2x
+
+.type	_vpaes_decrypt_preheat,%function
+.align	4
+_vpaes_decrypt_preheat:
+	adrp	x10, .Lk_inv
+	add	x10, x10, :lo12:.Lk_inv
+	movi	v17.16b, #0x0f
+	adrp	x11, .Lk_dipt
+	add	x11, x11, :lo12:.Lk_dipt
+	ld1	{v18.2d,v19.2d}, [x10],#32	// .Lk_inv
+	ld1	{v20.2d,v21.2d,v22.2d,v23.2d}, [x11],#64	// .Lk_dipt, .Lk_dsbo
+	ld1	{v24.2d,v25.2d,v26.2d,v27.2d}, [x11],#64	// .Lk_dsb9, .Lk_dsbd
+	ld1	{v28.2d,v29.2d,v30.2d,v31.2d}, [x11]		// .Lk_dsbb, .Lk_dsbe
+	ret
+.size	_vpaes_decrypt_preheat,.-_vpaes_decrypt_preheat
+
+##
+##  Decryption core
+##
+##  Same API as encryption core.
+##
+.type	_vpaes_decrypt_core,%function
+.align	4
+_vpaes_decrypt_core:
+	mov	x9, x2
+	ldr	w8, [x2,#240]			// pull rounds
+
+						// vmovdqa	.Lk_dipt(%rip), %xmm2	# iptlo
+	lsl	x11, x8, #4			// mov	%rax,	%r11;	shl	$4, %r11
+	eor	x11, x11, #0x30			// xor		$0x30,	%r11
+	adrp	x10, .Lk_sr
+	add	x10, x10, :lo12:.Lk_sr
+	and	x11, x11, #0x30			// and		$0x30,	%r11
+	add	x11, x11, x10
+	adrp	x10, .Lk_mc_forward+48
+	add	x10, x10, :lo12:.Lk_mc_forward+48
+
+	ld1	{v16.2d}, [x9],#16		// vmovdqu	(%r9),	%xmm4		# round0 key
+	and	v1.16b, v7.16b, v17.16b		// vpand	%xmm9,	%xmm0,	%xmm1
+	ushr	v0.16b, v7.16b, #4		// vpsrlb	$4,	%xmm0,	%xmm0
+	tbl	v2.16b, {v20.16b}, v1.16b	// vpshufb	%xmm1,	%xmm2,	%xmm2
+	ld1	{v5.2d}, [x10]			// vmovdqa	.Lk_mc_forward+48(%rip), %xmm5
+						// vmovdqa	.Lk_dipt+16(%rip), %xmm1 # ipthi
+	tbl	v0.16b, {v21.16b}, v0.16b	// vpshufb	%xmm0,	%xmm1,	%xmm0
+	eor	v2.16b, v2.16b, v16.16b		// vpxor	%xmm4,	%xmm2,	%xmm2
+	eor	v0.16b, v0.16b, v2.16b		// vpxor	%xmm2,	%xmm0,	%xmm0
+	b	.Ldec_entry
+
+.align	4
+.Ldec_loop:
+//
+//  Inverse mix columns
+//
+						// vmovdqa	-0x20(%r10),%xmm4		# 4 : sb9u
+						// vmovdqa	-0x10(%r10),%xmm1		# 0 : sb9t
+	tbl	v4.16b, {v24.16b}, v2.16b		// vpshufb	%xmm2,	%xmm4,	%xmm4		# 4 = sb9u
+	tbl	v1.16b, {v25.16b}, v3.16b		// vpshufb	%xmm3,	%xmm1,	%xmm1		# 0 = sb9t
+	eor	v0.16b, v4.16b, v16.16b		// vpxor	%xmm4,	%xmm0,	%xmm0
+						// vmovdqa	0x00(%r10),%xmm4		# 4 : sbdu
+	eor	v0.16b, v0.16b, v1.16b		// vpxor	%xmm1,	%xmm0,	%xmm0		# 0 = ch
+						// vmovdqa	0x10(%r10),%xmm1		# 0 : sbdt
+
+	tbl	v4.16b, {v26.16b}, v2.16b		// vpshufb	%xmm2,	%xmm4,	%xmm4		# 4 = sbdu
+	tbl	v0.16b, {v0.16b}, v5.16b	// vpshufb	%xmm5,	%xmm0,	%xmm0		# MC ch
+	tbl	v1.16b, {v27.16b}, v3.16b		// vpshufb	%xmm3,	%xmm1,	%xmm1		# 0 = sbdt
+	eor	v0.16b, v0.16b, v4.16b		// vpxor	%xmm4,	%xmm0,	%xmm0		# 4 = ch
+						// vmovdqa	0x20(%r10),	%xmm4		# 4 : sbbu
+	eor	v0.16b, v0.16b, v1.16b		// vpxor	%xmm1,	%xmm0,	%xmm0		# 0 = ch
+						// vmovdqa	0x30(%r10),	%xmm1		# 0 : sbbt
+
+	tbl	v4.16b, {v28.16b}, v2.16b		// vpshufb	%xmm2,	%xmm4,	%xmm4		# 4 = sbbu
+	tbl	v0.16b, {v0.16b}, v5.16b	// vpshufb	%xmm5,	%xmm0,	%xmm0		# MC ch
+	tbl	v1.16b, {v29.16b}, v3.16b		// vpshufb	%xmm3,	%xmm1,	%xmm1		# 0 = sbbt
+	eor	v0.16b, v0.16b, v4.16b		// vpxor	%xmm4,	%xmm0,	%xmm0		# 4 = ch
+						// vmovdqa	0x40(%r10),	%xmm4		# 4 : sbeu
+	eor	v0.16b, v0.16b, v1.16b		// vpxor	%xmm1,	%xmm0,	%xmm0		# 0 = ch
+						// vmovdqa	0x50(%r10),	%xmm1		# 0 : sbet
+
+	tbl	v4.16b, {v30.16b}, v2.16b		// vpshufb	%xmm2,	%xmm4,	%xmm4		# 4 = sbeu
+	tbl	v0.16b, {v0.16b}, v5.16b	// vpshufb	%xmm5,	%xmm0,	%xmm0		# MC ch
+	tbl	v1.16b, {v31.16b}, v3.16b		// vpshufb	%xmm3,	%xmm1,	%xmm1		# 0 = sbet
+	eor	v0.16b, v0.16b, v4.16b		// vpxor	%xmm4,	%xmm0,	%xmm0		# 4 = ch
+	ext	v5.16b, v5.16b, v5.16b, #12	// vpalignr $12,	%xmm5,	%xmm5,	%xmm5
+	eor	v0.16b, v0.16b, v1.16b		// vpxor	%xmm1,	%xmm0,	%xmm0		# 0 = ch
+	sub	w8, w8, #1			// sub		$1,%rax			# nr--
+
+.Ldec_entry:
+	// top of round
+	and	v1.16b, v0.16b, v17.16b		// vpand	%xmm9,	%xmm0,	%xmm1	# 0 = k
+	ushr	v0.16b, v0.16b, #4		// vpsrlb	$4,	%xmm0,	%xmm0	# 1 = i
+	tbl	v2.16b, {v19.16b}, v1.16b	// vpshufb	%xmm1,	%xmm11,	%xmm2	# 2 = a/k
+	eor	v1.16b,	v1.16b, v0.16b		// vpxor	%xmm0,	%xmm1,	%xmm1	# 0 = j
+	tbl	v3.16b, {v18.16b}, v0.16b	// vpshufb	%xmm0, 	%xmm10,	%xmm3	# 3 = 1/i
+	tbl	v4.16b, {v18.16b}, v1.16b	// vpshufb	%xmm1,	%xmm10,	%xmm4	# 4 = 1/j
+	eor	v3.16b, v3.16b, v2.16b		// vpxor	%xmm2,	%xmm3,	%xmm3	# 3 = iak = 1/i + a/k
+	eor	v4.16b, v4.16b, v2.16b		// vpxor	%xmm2, 	%xmm4,	%xmm4	# 4 = jak = 1/j + a/k
+	tbl	v2.16b, {v18.16b}, v3.16b	// vpshufb	%xmm3,	%xmm10,	%xmm2	# 2 = 1/iak
+	tbl	v3.16b, {v18.16b}, v4.16b	// vpshufb	%xmm4,  %xmm10,	%xmm3	# 3 = 1/jak
+	eor	v2.16b, v2.16b, v1.16b		// vpxor	%xmm1,	%xmm2,	%xmm2	# 2 = io
+	eor	v3.16b, v3.16b, v0.16b		// vpxor	%xmm0,  %xmm3,	%xmm3	# 3 = jo
+	ld1	{v16.2d}, [x9],#16		// vmovdqu	(%r9),	%xmm0
+	cbnz	w8, .Ldec_loop
+
+	// middle of last round
+						// vmovdqa	0x60(%r10),	%xmm4	# 3 : sbou
+	tbl	v4.16b, {v22.16b}, v2.16b		// vpshufb	%xmm2,	%xmm4,	%xmm4	# 4 = sbou
+						// vmovdqa	0x70(%r10),	%xmm1	# 0 : sbot
+	ld1	{v2.2d}, [x11]			// vmovdqa	-0x160(%r11),	%xmm2	# .Lk_sr-.Lk_dsbd=-0x160
+	tbl	v1.16b, {v23.16b}, v3.16b		// vpshufb	%xmm3,	%xmm1,	%xmm1	# 0 = sb1t
+	eor	v4.16b, v4.16b, v16.16b		// vpxor	%xmm0,	%xmm4,	%xmm4	# 4 = sb1u + k
+	eor	v0.16b, v1.16b, v4.16b		// vpxor	%xmm4,	%xmm1,	%xmm0	# 0 = A
+	tbl	v0.16b, {v0.16b}, v2.16b	// vpshufb	%xmm2,	%xmm0,	%xmm0
+	ret
+.size	_vpaes_decrypt_core,.-_vpaes_decrypt_core
+
+.globl	vpaes_decrypt
+.hidden	vpaes_decrypt
+.type	vpaes_decrypt,%function
+.align	4
+vpaes_decrypt:
+	AARCH64_SIGN_LINK_REGISTER
+	stp	x29,x30,[sp,#-16]!
+	add	x29,sp,#0
+
+	ld1	{v7.16b}, [x0]
+	bl	_vpaes_decrypt_preheat
+	bl	_vpaes_decrypt_core
+	st1	{v0.16b}, [x1]
+
+	ldp	x29,x30,[sp],#16
+	AARCH64_VALIDATE_LINK_REGISTER
+	ret
+.size	vpaes_decrypt,.-vpaes_decrypt
+
+// v14-v15 input, v0-v1 output
+.type	_vpaes_decrypt_2x,%function
+.align	4
+_vpaes_decrypt_2x:
+	mov	x9, x2
+	ldr	w8, [x2,#240]			// pull rounds
+
+						// vmovdqa	.Lk_dipt(%rip), %xmm2	# iptlo
+	lsl	x11, x8, #4			// mov	%rax,	%r11;	shl	$4, %r11
+	eor	x11, x11, #0x30			// xor		$0x30,	%r11
+	adrp	x10, .Lk_sr
+	add	x10, x10, :lo12:.Lk_sr
+	and	x11, x11, #0x30			// and		$0x30,	%r11
+	add	x11, x11, x10
+	adrp	x10, .Lk_mc_forward+48
+	add	x10, x10, :lo12:.Lk_mc_forward+48
+
+	ld1	{v16.2d}, [x9],#16		// vmovdqu	(%r9),	%xmm4		# round0 key
+	and	v1.16b,  v14.16b, v17.16b	// vpand	%xmm9,	%xmm0,	%xmm1
+	ushr	v0.16b,  v14.16b, #4		// vpsrlb	$4,	%xmm0,	%xmm0
+	and	v9.16b,  v15.16b, v17.16b
+	ushr	v8.16b,  v15.16b, #4
+	tbl	v2.16b,  {v20.16b},v1.16b	// vpshufb	%xmm1,	%xmm2,	%xmm2
+	tbl	v10.16b, {v20.16b},v9.16b
+	ld1	{v5.2d}, [x10]			// vmovdqa	.Lk_mc_forward+48(%rip), %xmm5
+						// vmovdqa	.Lk_dipt+16(%rip), %xmm1 # ipthi
+	tbl	v0.16b,  {v21.16b},v0.16b	// vpshufb	%xmm0,	%xmm1,	%xmm0
+	tbl	v8.16b,  {v21.16b},v8.16b
+	eor	v2.16b,  v2.16b,  v16.16b	// vpxor	%xmm4,	%xmm2,	%xmm2
+	eor	v10.16b, v10.16b, v16.16b
+	eor	v0.16b,  v0.16b,  v2.16b	// vpxor	%xmm2,	%xmm0,	%xmm0
+	eor	v8.16b,  v8.16b,  v10.16b
+	b	.Ldec_2x_entry
+
+.align	4
+.Ldec_2x_loop:
+//
+//  Inverse mix columns
+//
+						// vmovdqa	-0x20(%r10),%xmm4		# 4 : sb9u
+						// vmovdqa	-0x10(%r10),%xmm1		# 0 : sb9t
+	tbl	v4.16b,  {v24.16b}, v2.16b	// vpshufb	%xmm2,	%xmm4,	%xmm4		# 4 = sb9u
+	tbl	v12.16b, {v24.16b}, v10.16b
+	tbl	v1.16b,  {v25.16b}, v3.16b	// vpshufb	%xmm3,	%xmm1,	%xmm1		# 0 = sb9t
+	tbl	v9.16b,  {v25.16b}, v11.16b
+	eor	v0.16b,  v4.16b,  v16.16b	// vpxor	%xmm4,	%xmm0,	%xmm0
+	eor	v8.16b,  v12.16b, v16.16b
+						// vmovdqa	0x00(%r10),%xmm4		# 4 : sbdu
+	eor	v0.16b,  v0.16b,  v1.16b	// vpxor	%xmm1,	%xmm0,	%xmm0		# 0 = ch
+	eor	v8.16b,  v8.16b,  v9.16b	// vpxor	%xmm1,	%xmm0,	%xmm0		# 0 = ch
+						// vmovdqa	0x10(%r10),%xmm1		# 0 : sbdt
+
+	tbl	v4.16b,  {v26.16b}, v2.16b	// vpshufb	%xmm2,	%xmm4,	%xmm4		# 4 = sbdu
+	tbl	v12.16b, {v26.16b}, v10.16b
+	tbl	v0.16b,  {v0.16b},v5.16b	// vpshufb	%xmm5,	%xmm0,	%xmm0		# MC ch
+	tbl	v8.16b,  {v8.16b},v5.16b
+	tbl	v1.16b,  {v27.16b}, v3.16b	// vpshufb	%xmm3,	%xmm1,	%xmm1		# 0 = sbdt
+	tbl	v9.16b,  {v27.16b}, v11.16b
+	eor	v0.16b,  v0.16b,  v4.16b	// vpxor	%xmm4,	%xmm0,	%xmm0		# 4 = ch
+	eor	v8.16b,  v8.16b,  v12.16b
+						// vmovdqa	0x20(%r10),	%xmm4		# 4 : sbbu
+	eor	v0.16b,  v0.16b,  v1.16b	// vpxor	%xmm1,	%xmm0,	%xmm0		# 0 = ch
+	eor	v8.16b,  v8.16b,  v9.16b
+						// vmovdqa	0x30(%r10),	%xmm1		# 0 : sbbt
+
+	tbl	v4.16b,  {v28.16b}, v2.16b	// vpshufb	%xmm2,	%xmm4,	%xmm4		# 4 = sbbu
+	tbl	v12.16b, {v28.16b}, v10.16b
+	tbl	v0.16b,  {v0.16b},v5.16b	// vpshufb	%xmm5,	%xmm0,	%xmm0		# MC ch
+	tbl	v8.16b,  {v8.16b},v5.16b
+	tbl	v1.16b,  {v29.16b}, v3.16b	// vpshufb	%xmm3,	%xmm1,	%xmm1		# 0 = sbbt
+	tbl	v9.16b,  {v29.16b}, v11.16b
+	eor	v0.16b,  v0.16b,  v4.16b	// vpxor	%xmm4,	%xmm0,	%xmm0		# 4 = ch
+	eor	v8.16b,  v8.16b,  v12.16b
+						// vmovdqa	0x40(%r10),	%xmm4		# 4 : sbeu
+	eor	v0.16b,  v0.16b,  v1.16b	// vpxor	%xmm1,	%xmm0,	%xmm0		# 0 = ch
+	eor	v8.16b,  v8.16b,  v9.16b
+						// vmovdqa	0x50(%r10),	%xmm1		# 0 : sbet
+
+	tbl	v4.16b,  {v30.16b}, v2.16b	// vpshufb	%xmm2,	%xmm4,	%xmm4		# 4 = sbeu
+	tbl	v12.16b, {v30.16b}, v10.16b
+	tbl	v0.16b,  {v0.16b},v5.16b	// vpshufb	%xmm5,	%xmm0,	%xmm0		# MC ch
+	tbl	v8.16b,  {v8.16b},v5.16b
+	tbl	v1.16b,  {v31.16b}, v3.16b	// vpshufb	%xmm3,	%xmm1,	%xmm1		# 0 = sbet
+	tbl	v9.16b,  {v31.16b}, v11.16b
+	eor	v0.16b,  v0.16b,  v4.16b	// vpxor	%xmm4,	%xmm0,	%xmm0		# 4 = ch
+	eor	v8.16b,  v8.16b,  v12.16b
+	ext	v5.16b,  v5.16b,  v5.16b, #12	// vpalignr $12,	%xmm5,	%xmm5,	%xmm5
+	eor	v0.16b,  v0.16b,  v1.16b	// vpxor	%xmm1,	%xmm0,	%xmm0		# 0 = ch
+	eor	v8.16b,  v8.16b,  v9.16b
+	sub	w8, w8, #1			// sub		$1,%rax			# nr--
+
+.Ldec_2x_entry:
+	// top of round
+	and	v1.16b,  v0.16b,  v17.16b	// vpand	%xmm9,	%xmm0,	%xmm1	# 0 = k
+	ushr	v0.16b,  v0.16b,  #4		// vpsrlb	$4,	%xmm0,	%xmm0	# 1 = i
+	and	v9.16b,  v8.16b,  v17.16b
+	ushr	v8.16b,  v8.16b,  #4
+	tbl	v2.16b,  {v19.16b},v1.16b	// vpshufb	%xmm1,	%xmm11,	%xmm2	# 2 = a/k
+	tbl	v10.16b, {v19.16b},v9.16b
+	eor	v1.16b,	 v1.16b,  v0.16b	// vpxor	%xmm0,	%xmm1,	%xmm1	# 0 = j
+	eor	v9.16b,	 v9.16b,  v8.16b
+	tbl	v3.16b,  {v18.16b},v0.16b	// vpshufb	%xmm0, 	%xmm10,	%xmm3	# 3 = 1/i
+	tbl	v11.16b, {v18.16b},v8.16b
+	tbl	v4.16b,  {v18.16b},v1.16b	// vpshufb	%xmm1,	%xmm10,	%xmm4	# 4 = 1/j
+	tbl	v12.16b, {v18.16b},v9.16b
+	eor	v3.16b,  v3.16b,  v2.16b	// vpxor	%xmm2,	%xmm3,	%xmm3	# 3 = iak = 1/i + a/k
+	eor	v11.16b, v11.16b, v10.16b
+	eor	v4.16b,  v4.16b,  v2.16b	// vpxor	%xmm2, 	%xmm4,	%xmm4	# 4 = jak = 1/j + a/k
+	eor	v12.16b, v12.16b, v10.16b
+	tbl	v2.16b,  {v18.16b},v3.16b	// vpshufb	%xmm3,	%xmm10,	%xmm2	# 2 = 1/iak
+	tbl	v10.16b, {v18.16b},v11.16b
+	tbl	v3.16b,  {v18.16b},v4.16b	// vpshufb	%xmm4,  %xmm10,	%xmm3	# 3 = 1/jak
+	tbl	v11.16b, {v18.16b},v12.16b
+	eor	v2.16b,  v2.16b,  v1.16b	// vpxor	%xmm1,	%xmm2,	%xmm2	# 2 = io
+	eor	v10.16b, v10.16b, v9.16b
+	eor	v3.16b,  v3.16b,  v0.16b	// vpxor	%xmm0,  %xmm3,	%xmm3	# 3 = jo
+	eor	v11.16b, v11.16b, v8.16b
+	ld1	{v16.2d}, [x9],#16		// vmovdqu	(%r9),	%xmm0
+	cbnz	w8, .Ldec_2x_loop
+
+	// middle of last round
+						// vmovdqa	0x60(%r10),	%xmm4	# 3 : sbou
+	tbl	v4.16b,  {v22.16b}, v2.16b	// vpshufb	%xmm2,	%xmm4,	%xmm4	# 4 = sbou
+	tbl	v12.16b, {v22.16b}, v10.16b
+						// vmovdqa	0x70(%r10),	%xmm1	# 0 : sbot
+	tbl	v1.16b,  {v23.16b}, v3.16b	// vpshufb	%xmm3,	%xmm1,	%xmm1	# 0 = sb1t
+	tbl	v9.16b,  {v23.16b}, v11.16b
+	ld1	{v2.2d}, [x11]			// vmovdqa	-0x160(%r11),	%xmm2	# .Lk_sr-.Lk_dsbd=-0x160
+	eor	v4.16b,  v4.16b,  v16.16b	// vpxor	%xmm0,	%xmm4,	%xmm4	# 4 = sb1u + k
+	eor	v12.16b, v12.16b, v16.16b
+	eor	v0.16b,  v1.16b,  v4.16b	// vpxor	%xmm4,	%xmm1,	%xmm0	# 0 = A
+	eor	v8.16b,  v9.16b,  v12.16b
+	tbl	v0.16b,  {v0.16b},v2.16b	// vpshufb	%xmm2,	%xmm0,	%xmm0
+	tbl	v1.16b,  {v8.16b},v2.16b
+	ret
+.size	_vpaes_decrypt_2x,.-_vpaes_decrypt_2x
+########################################################
+##                                                    ##
+##                  AES key schedule                  ##
+##                                                    ##
+########################################################
+.type	_vpaes_key_preheat,%function
+.align	4
+_vpaes_key_preheat:
+	adrp	x10, .Lk_inv
+	add	x10, x10, :lo12:.Lk_inv
+	movi	v16.16b, #0x5b			// .Lk_s63
+	adrp	x11, .Lk_sb1
+	add	x11, x11, :lo12:.Lk_sb1
+	movi	v17.16b, #0x0f			// .Lk_s0F
+	ld1	{v18.2d,v19.2d,v20.2d,v21.2d}, [x10]		// .Lk_inv, .Lk_ipt
+	adrp	x10, .Lk_dksd
+	add	x10, x10, :lo12:.Lk_dksd
+	ld1	{v22.2d,v23.2d}, [x11]		// .Lk_sb1
+	adrp	x11, .Lk_mc_forward
+	add	x11, x11, :lo12:.Lk_mc_forward
+	ld1	{v24.2d,v25.2d,v26.2d,v27.2d}, [x10],#64	// .Lk_dksd, .Lk_dksb
+	ld1	{v28.2d,v29.2d,v30.2d,v31.2d}, [x10],#64	// .Lk_dkse, .Lk_dks9
+	ld1	{v8.2d}, [x10]			// .Lk_rcon
+	ld1	{v9.2d}, [x11]			// .Lk_mc_forward[0]
+	ret
+.size	_vpaes_key_preheat,.-_vpaes_key_preheat
+
+.type	_vpaes_schedule_core,%function
+.align	4
+_vpaes_schedule_core:
+	AARCH64_SIGN_LINK_REGISTER
+	stp	x29, x30, [sp,#-16]!
+	add	x29,sp,#0
+
+	bl	_vpaes_key_preheat		// load the tables
+
+	ld1	{v0.16b}, [x0],#16		// vmovdqu	(%rdi),	%xmm0		# load key (unaligned)
+
+	// input transform
+	mov	v3.16b, v0.16b			// vmovdqa	%xmm0,	%xmm3
+	bl	_vpaes_schedule_transform
+	mov	v7.16b, v0.16b			// vmovdqa	%xmm0,	%xmm7
+
+	adrp	x10, .Lk_sr		// lea	.Lk_sr(%rip),%r10
+	add	x10, x10, :lo12:.Lk_sr
+
+	add	x8, x8, x10
+	cbnz	w3, .Lschedule_am_decrypting
+
+	// encrypting, output zeroth round key after transform
+	st1	{v0.2d}, [x2]			// vmovdqu	%xmm0,	(%rdx)
+	b	.Lschedule_go
+
+.Lschedule_am_decrypting:
+	// decrypting, output zeroth round key after shiftrows
+	ld1	{v1.2d}, [x8]			// vmovdqa	(%r8,%r10),	%xmm1
+	tbl	v3.16b, {v3.16b}, v1.16b	// vpshufb  %xmm1,	%xmm3,	%xmm3
+	st1	{v3.2d}, [x2]			// vmovdqu	%xmm3,	(%rdx)
+	eor	x8, x8, #0x30			// xor	$0x30, %r8
+
+.Lschedule_go:
+	cmp	w1, #192			// cmp	$192,	%esi
+	b.hi	.Lschedule_256
+	b.eq	.Lschedule_192
+	// 128: fall through
+
+##
+##  .schedule_128
+##
+##  128-bit specific part of key schedule.
+##
+##  This schedule is really simple, because all its parts
+##  are accomplished by the subroutines.
+##
+.Lschedule_128:
+	mov	x0, #10			// mov	$10, %esi
+
+.Loop_schedule_128:
+	sub	x0, x0, #1			// dec	%esi
+	bl	_vpaes_schedule_round
+	cbz	x0, .Lschedule_mangle_last
+	bl	_vpaes_schedule_mangle		// write output
+	b	.Loop_schedule_128
+
+##
+##  .aes_schedule_192
+##
+##  192-bit specific part of key schedule.
+##
+##  The main body of this schedule is the same as the 128-bit
+##  schedule, but with more smearing.  The long, high side is
+##  stored in %xmm7 as before, and the short, low side is in
+##  the high bits of %xmm6.
+##
+##  This schedule is somewhat nastier, however, because each
+##  round produces 192 bits of key material, or 1.5 round keys.
+##  Therefore, on each cycle we do 2 rounds and produce 3 round
+##  keys.
+##
+.align	4
+.Lschedule_192:
+	sub	x0, x0, #8
+	ld1	{v0.16b}, [x0]		// vmovdqu	8(%rdi),%xmm0		# load key part 2 (very unaligned)
+	bl	_vpaes_schedule_transform	// input transform
+	mov	v6.16b, v0.16b			// vmovdqa	%xmm0,	%xmm6		# save short part
+	eor	v4.16b, v4.16b, v4.16b		// vpxor	%xmm4,	%xmm4, %xmm4	# clear 4
+	ins	v6.d[0], v4.d[0]		// vmovhlps	%xmm4,	%xmm6,	%xmm6		# clobber low side with zeros
+	mov	x0, #4			// mov	$4,	%esi
+
+.Loop_schedule_192:
+	sub	x0, x0, #1			// dec	%esi
+	bl	_vpaes_schedule_round
+	ext	v0.16b, v6.16b, v0.16b, #8	// vpalignr	$8,%xmm6,%xmm0,%xmm0
+	bl	_vpaes_schedule_mangle		// save key n
+	bl	_vpaes_schedule_192_smear
+	bl	_vpaes_schedule_mangle		// save key n+1
+	bl	_vpaes_schedule_round
+	cbz	x0, .Lschedule_mangle_last
+	bl	_vpaes_schedule_mangle		// save key n+2
+	bl	_vpaes_schedule_192_smear
+	b	.Loop_schedule_192
+
+##
+##  .aes_schedule_256
+##
+##  256-bit specific part of key schedule.
+##
+##  The structure here is very similar to the 128-bit
+##  schedule, but with an additional "low side" in
+##  %xmm6.  The low side's rounds are the same as the
+##  high side's, except no rcon and no rotation.
+##
+.align	4
+.Lschedule_256:
+	ld1	{v0.16b}, [x0]		// vmovdqu	16(%rdi),%xmm0		# load key part 2 (unaligned)
+	bl	_vpaes_schedule_transform	// input transform
+	mov	x0, #7			// mov	$7, %esi
+
+.Loop_schedule_256:
+	sub	x0, x0, #1			// dec	%esi
+	bl	_vpaes_schedule_mangle		// output low result
+	mov	v6.16b, v0.16b			// vmovdqa	%xmm0,	%xmm6		# save cur_lo in xmm6
+
+	// high round
+	bl	_vpaes_schedule_round
+	cbz	x0, .Lschedule_mangle_last
+	bl	_vpaes_schedule_mangle
+
+	// low round. swap xmm7 and xmm6
+	dup	v0.4s, v0.s[3]			// vpshufd	$0xFF,	%xmm0,	%xmm0
+	movi	v4.16b, #0
+	mov	v5.16b, v7.16b			// vmovdqa	%xmm7,	%xmm5
+	mov	v7.16b, v6.16b			// vmovdqa	%xmm6,	%xmm7
+	bl	_vpaes_schedule_low_round
+	mov	v7.16b, v5.16b			// vmovdqa	%xmm5,	%xmm7
+
+	b	.Loop_schedule_256
+
+##
+##  .aes_schedule_mangle_last
+##
+##  Mangler for last round of key schedule
+##  Mangles %xmm0
+##    when encrypting, outputs out(%xmm0) ^ 63
+##    when decrypting, outputs unskew(%xmm0)
+##
+##  Always called right before return... jumps to cleanup and exits
+##
+.align	4
+.Lschedule_mangle_last:
+	// schedule last round key from xmm0
+	adrp	x11, .Lk_deskew	// lea	.Lk_deskew(%rip),%r11	# prepare to deskew
+	add	x11, x11, :lo12:.Lk_deskew
+
+	cbnz	w3, .Lschedule_mangle_last_dec
+
+	// encrypting
+	ld1	{v1.2d}, [x8]			// vmovdqa	(%r8,%r10),%xmm1
+	adrp	x11, .Lk_opt		// lea	.Lk_opt(%rip),	%r11		# prepare to output transform
+	add	x11, x11, :lo12:.Lk_opt
+	add	x2, x2, #32			// add	$32,	%rdx
+	tbl	v0.16b, {v0.16b}, v1.16b	// vpshufb	%xmm1,	%xmm0,	%xmm0		# output permute
+
+.Lschedule_mangle_last_dec:
+	ld1	{v20.2d,v21.2d}, [x11]		// reload constants
+	sub	x2, x2, #16			// add	$-16,	%rdx
+	eor	v0.16b, v0.16b, v16.16b		// vpxor	.Lk_s63(%rip),	%xmm0,	%xmm0
+	bl	_vpaes_schedule_transform	// output transform
+	st1	{v0.2d}, [x2]			// vmovdqu	%xmm0,	(%rdx)		# save last key
+
+	// cleanup
+	eor	v0.16b, v0.16b, v0.16b		// vpxor	%xmm0,	%xmm0,	%xmm0
+	eor	v1.16b, v1.16b, v1.16b		// vpxor	%xmm1,	%xmm1,	%xmm1
+	eor	v2.16b, v2.16b, v2.16b		// vpxor	%xmm2,	%xmm2,	%xmm2
+	eor	v3.16b, v3.16b, v3.16b		// vpxor	%xmm3,	%xmm3,	%xmm3
+	eor	v4.16b, v4.16b, v4.16b		// vpxor	%xmm4,	%xmm4,	%xmm4
+	eor	v5.16b, v5.16b, v5.16b		// vpxor	%xmm5,	%xmm5,	%xmm5
+	eor	v6.16b, v6.16b, v6.16b		// vpxor	%xmm6,	%xmm6,	%xmm6
+	eor	v7.16b, v7.16b, v7.16b		// vpxor	%xmm7,	%xmm7,	%xmm7
+	ldp	x29, x30, [sp],#16
+	AARCH64_VALIDATE_LINK_REGISTER
+	ret
+.size	_vpaes_schedule_core,.-_vpaes_schedule_core
+
+##
+##  .aes_schedule_192_smear
+##
+##  Smear the short, low side in the 192-bit key schedule.
+##
+##  Inputs:
+##    %xmm7: high side, b  a  x  y
+##    %xmm6:  low side, d  c  0  0
+##    %xmm13: 0
+##
+##  Outputs:
+##    %xmm6: b+c+d  b+c  0  0
+##    %xmm0: b+c+d  b+c  b  a
+##
+.type	_vpaes_schedule_192_smear,%function
+.align	4
+_vpaes_schedule_192_smear:
+	movi	v1.16b, #0
+	dup	v0.4s, v7.s[3]
+	ins	v1.s[3], v6.s[2]	// vpshufd	$0x80,	%xmm6,	%xmm1	# d c 0 0 -> c 0 0 0
+	ins	v0.s[0], v7.s[2]	// vpshufd	$0xFE,	%xmm7,	%xmm0	# b a _ _ -> b b b a
+	eor	v6.16b, v6.16b, v1.16b	// vpxor	%xmm1,	%xmm6,	%xmm6	# -> c+d c 0 0
+	eor	v1.16b, v1.16b, v1.16b	// vpxor	%xmm1,	%xmm1,	%xmm1
+	eor	v6.16b, v6.16b, v0.16b	// vpxor	%xmm0,	%xmm6,	%xmm6	# -> b+c+d b+c b a
+	mov	v0.16b, v6.16b		// vmovdqa	%xmm6,	%xmm0
+	ins	v6.d[0], v1.d[0]	// vmovhlps	%xmm1,	%xmm6,	%xmm6	# clobber low side with zeros
+	ret
+.size	_vpaes_schedule_192_smear,.-_vpaes_schedule_192_smear
+
+##
+##  .aes_schedule_round
+##
+##  Runs one main round of the key schedule on %xmm0, %xmm7
+##
+##  Specifically, runs subbytes on the high dword of %xmm0
+##  then rotates it by one byte and xors into the low dword of
+##  %xmm7.
+##
+##  Adds rcon from low byte of %xmm8, then rotates %xmm8 for
+##  next rcon.
+##
+##  Smears the dwords of %xmm7 by xoring the low into the
+##  second low, result into third, result into highest.
+##
+##  Returns results in %xmm7 = %xmm0.
+##  Clobbers %xmm1-%xmm4, %r11.
+##
+.type	_vpaes_schedule_round,%function
+.align	4
+_vpaes_schedule_round:
+	// extract rcon from xmm8
+	movi	v4.16b, #0			// vpxor	%xmm4,	%xmm4,	%xmm4
+	ext	v1.16b, v8.16b, v4.16b, #15	// vpalignr	$15,	%xmm8,	%xmm4,	%xmm1
+	ext	v8.16b, v8.16b, v8.16b, #15	// vpalignr	$15,	%xmm8,	%xmm8,	%xmm8
+	eor	v7.16b, v7.16b, v1.16b		// vpxor	%xmm1,	%xmm7,	%xmm7
+
+	// rotate
+	dup	v0.4s, v0.s[3]			// vpshufd	$0xFF,	%xmm0,	%xmm0
+	ext	v0.16b, v0.16b, v0.16b, #1	// vpalignr	$1,	%xmm0,	%xmm0,	%xmm0
+
+	// fall through...
+
+	// low round: same as high round, but no rotation and no rcon.
+_vpaes_schedule_low_round:
+	// smear xmm7
+	ext	v1.16b, v4.16b, v7.16b, #12	// vpslldq	$4,	%xmm7,	%xmm1
+	eor	v7.16b, v7.16b, v1.16b		// vpxor	%xmm1,	%xmm7,	%xmm7
+	ext	v4.16b, v4.16b, v7.16b, #8	// vpslldq	$8,	%xmm7,	%xmm4
+
+	// subbytes
+	and	v1.16b, v0.16b, v17.16b		// vpand	%xmm9,	%xmm0,	%xmm1		# 0 = k
+	ushr	v0.16b, v0.16b, #4		// vpsrlb	$4,	%xmm0,	%xmm0		# 1 = i
+	eor	v7.16b, v7.16b, v4.16b		// vpxor	%xmm4,	%xmm7,	%xmm7
+	tbl	v2.16b, {v19.16b}, v1.16b	// vpshufb	%xmm1,	%xmm11,	%xmm2		# 2 = a/k
+	eor	v1.16b, v1.16b, v0.16b		// vpxor	%xmm0,	%xmm1,	%xmm1		# 0 = j
+	tbl	v3.16b, {v18.16b}, v0.16b	// vpshufb	%xmm0, 	%xmm10,	%xmm3		# 3 = 1/i
+	eor	v3.16b, v3.16b, v2.16b		// vpxor	%xmm2,	%xmm3,	%xmm3		# 3 = iak = 1/i + a/k
+	tbl	v4.16b, {v18.16b}, v1.16b	// vpshufb	%xmm1,	%xmm10,	%xmm4		# 4 = 1/j
+	eor	v7.16b, v7.16b, v16.16b		// vpxor	.Lk_s63(%rip),	%xmm7,	%xmm7
+	tbl	v3.16b, {v18.16b}, v3.16b	// vpshufb	%xmm3,	%xmm10,	%xmm3		# 2 = 1/iak
+	eor	v4.16b, v4.16b, v2.16b		// vpxor	%xmm2,	%xmm4,	%xmm4		# 4 = jak = 1/j + a/k
+	tbl	v2.16b, {v18.16b}, v4.16b	// vpshufb	%xmm4,	%xmm10,	%xmm2		# 3 = 1/jak
+	eor	v3.16b, v3.16b, v1.16b		// vpxor	%xmm1,	%xmm3,	%xmm3		# 2 = io
+	eor	v2.16b, v2.16b, v0.16b		// vpxor	%xmm0,	%xmm2,	%xmm2		# 3 = jo
+	tbl	v4.16b, {v23.16b}, v3.16b	// vpshufb	%xmm3,	%xmm13,	%xmm4		# 4 = sbou
+	tbl	v1.16b, {v22.16b}, v2.16b	// vpshufb	%xmm2,	%xmm12,	%xmm1		# 0 = sb1t
+	eor	v1.16b, v1.16b, v4.16b		// vpxor	%xmm4,	%xmm1,	%xmm1		# 0 = sbox output
+
+	// add in smeared stuff
+	eor	v0.16b, v1.16b, v7.16b		// vpxor	%xmm7,	%xmm1,	%xmm0
+	eor	v7.16b, v1.16b, v7.16b		// vmovdqa	%xmm0,	%xmm7
+	ret
+.size	_vpaes_schedule_round,.-_vpaes_schedule_round
+
+##
+##  .aes_schedule_transform
+##
+##  Linear-transform %xmm0 according to tables at (%r11)
+##
+##  Requires that %xmm9 = 0x0F0F... as in preheat
+##  Output in %xmm0
+##  Clobbers %xmm1, %xmm2
+##
+.type	_vpaes_schedule_transform,%function
+.align	4
+_vpaes_schedule_transform:
+	and	v1.16b, v0.16b, v17.16b		// vpand	%xmm9,	%xmm0,	%xmm1
+	ushr	v0.16b, v0.16b, #4		// vpsrlb	$4,	%xmm0,	%xmm0
+						// vmovdqa	(%r11),	%xmm2 	# lo
+	tbl	v2.16b, {v20.16b}, v1.16b	// vpshufb	%xmm1,	%xmm2,	%xmm2
+						// vmovdqa	16(%r11),	%xmm1 # hi
+	tbl	v0.16b, {v21.16b}, v0.16b	// vpshufb	%xmm0,	%xmm1,	%xmm0
+	eor	v0.16b, v0.16b, v2.16b		// vpxor	%xmm2,	%xmm0,	%xmm0
+	ret
+.size	_vpaes_schedule_transform,.-_vpaes_schedule_transform
+
+##
+##  .aes_schedule_mangle
+##
+##  Mangle xmm0 from (basis-transformed) standard version
+##  to our version.
+##
+##  On encrypt,
+##    xor with 0x63
+##    multiply by circulant 0,1,1,1
+##    apply shiftrows transform
+##
+##  On decrypt,
+##    xor with 0x63
+##    multiply by "inverse mixcolumns" circulant E,B,D,9
+##    deskew
+##    apply shiftrows transform
+##
+##
+##  Writes out to (%rdx), and increments or decrements it
+##  Keeps track of round number mod 4 in %r8
+##  Preserves xmm0
+##  Clobbers xmm1-xmm5
+##
+.type	_vpaes_schedule_mangle,%function
+.align	4
+_vpaes_schedule_mangle:
+	mov	v4.16b, v0.16b			// vmovdqa	%xmm0,	%xmm4	# save xmm0 for later
+						// vmovdqa	.Lk_mc_forward(%rip),%xmm5
+	cbnz	w3, .Lschedule_mangle_dec
+
+	// encrypting
+	eor	v4.16b, v0.16b, v16.16b		// vpxor	.Lk_s63(%rip),	%xmm0,	%xmm4
+	add	x2, x2, #16			// add	$16,	%rdx
+	tbl	v4.16b, {v4.16b}, v9.16b	// vpshufb	%xmm5,	%xmm4,	%xmm4
+	tbl	v1.16b, {v4.16b}, v9.16b	// vpshufb	%xmm5,	%xmm4,	%xmm1
+	tbl	v3.16b, {v1.16b}, v9.16b	// vpshufb	%xmm5,	%xmm1,	%xmm3
+	eor	v4.16b, v4.16b, v1.16b		// vpxor	%xmm1,	%xmm4,	%xmm4
+	ld1	{v1.2d}, [x8]			// vmovdqa	(%r8,%r10),	%xmm1
+	eor	v3.16b, v3.16b, v4.16b		// vpxor	%xmm4,	%xmm3,	%xmm3
+
+	b	.Lschedule_mangle_both
+.align	4
+.Lschedule_mangle_dec:
+	// inverse mix columns
+						// lea	.Lk_dksd(%rip),%r11
+	ushr	v1.16b, v4.16b, #4		// vpsrlb	$4,	%xmm4,	%xmm1	# 1 = hi
+	and	v4.16b, v4.16b, v17.16b		// vpand	%xmm9,	%xmm4,	%xmm4	# 4 = lo
+
+						// vmovdqa	0x00(%r11),	%xmm2
+	tbl	v2.16b, {v24.16b}, v4.16b	// vpshufb	%xmm4,	%xmm2,	%xmm2
+						// vmovdqa	0x10(%r11),	%xmm3
+	tbl	v3.16b,	{v25.16b}, v1.16b	// vpshufb	%xmm1,	%xmm3,	%xmm3
+	eor	v3.16b, v3.16b, v2.16b		// vpxor	%xmm2,	%xmm3,	%xmm3
+	tbl	v3.16b, {v3.16b}, v9.16b	// vpshufb	%xmm5,	%xmm3,	%xmm3
+
+						// vmovdqa	0x20(%r11),	%xmm2
+	tbl	v2.16b, {v26.16b}, v4.16b	// vpshufb	%xmm4,	%xmm2,	%xmm2
+	eor	v2.16b, v2.16b, v3.16b		// vpxor	%xmm3,	%xmm2,	%xmm2
+						// vmovdqa	0x30(%r11),	%xmm3
+	tbl	v3.16b, {v27.16b}, v1.16b	// vpshufb	%xmm1,	%xmm3,	%xmm3
+	eor	v3.16b, v3.16b, v2.16b		// vpxor	%xmm2,	%xmm3,	%xmm3
+	tbl	v3.16b, {v3.16b}, v9.16b	// vpshufb	%xmm5,	%xmm3,	%xmm3
+
+						// vmovdqa	0x40(%r11),	%xmm2
+	tbl	v2.16b, {v28.16b}, v4.16b	// vpshufb	%xmm4,	%xmm2,	%xmm2
+	eor	v2.16b, v2.16b, v3.16b		// vpxor	%xmm3,	%xmm2,	%xmm2
+						// vmovdqa	0x50(%r11),	%xmm3
+	tbl	v3.16b, {v29.16b}, v1.16b	// vpshufb	%xmm1,	%xmm3,	%xmm3
+	eor	v3.16b, v3.16b, v2.16b		// vpxor	%xmm2,	%xmm3,	%xmm3
+
+						// vmovdqa	0x60(%r11),	%xmm2
+	tbl	v2.16b, {v30.16b}, v4.16b	// vpshufb	%xmm4,	%xmm2,	%xmm2
+	tbl	v3.16b, {v3.16b}, v9.16b	// vpshufb	%xmm5,	%xmm3,	%xmm3
+						// vmovdqa	0x70(%r11),	%xmm4
+	tbl	v4.16b, {v31.16b}, v1.16b	// vpshufb	%xmm1,	%xmm4,	%xmm4
+	ld1	{v1.2d}, [x8]			// vmovdqa	(%r8,%r10),	%xmm1
+	eor	v2.16b, v2.16b, v3.16b		// vpxor	%xmm3,	%xmm2,	%xmm2
+	eor	v3.16b, v4.16b, v2.16b		// vpxor	%xmm2,	%xmm4,	%xmm3
+
+	sub	x2, x2, #16			// add	$-16,	%rdx
+
+.Lschedule_mangle_both:
+	tbl	v3.16b, {v3.16b}, v1.16b	// vpshufb	%xmm1,	%xmm3,	%xmm3
+	add	x8, x8, #48			// add	$-16,	%r8
+	and	x8, x8, #~(1<<6)		// and	$0x30,	%r8
+	st1	{v3.2d}, [x2]			// vmovdqu	%xmm3,	(%rdx)
+	ret
+.size	_vpaes_schedule_mangle,.-_vpaes_schedule_mangle
+
+.globl	vpaes_set_encrypt_key
+.hidden	vpaes_set_encrypt_key
+.type	vpaes_set_encrypt_key,%function
+.align	4
+vpaes_set_encrypt_key:
+	AARCH64_SIGN_LINK_REGISTER
+	stp	x29,x30,[sp,#-16]!
+	add	x29,sp,#0
+	stp	d8,d9,[sp,#-16]!	// ABI spec says so
+
+	lsr	w9, w1, #5		// shr	$5,%eax
+	add	w9, w9, #5		// $5,%eax
+	str	w9, [x2,#240]		// mov	%eax,240(%rdx)	# AES_KEY->rounds = nbits/32+5;
+
+	mov	w3, #0		// mov	$0,%ecx
+	mov	x8, #0x30		// mov	$0x30,%r8d
+	bl	_vpaes_schedule_core
+	eor	x0, x0, x0
+
+	ldp	d8,d9,[sp],#16
+	ldp	x29,x30,[sp],#16
+	AARCH64_VALIDATE_LINK_REGISTER
+	ret
+.size	vpaes_set_encrypt_key,.-vpaes_set_encrypt_key
+
+.globl	vpaes_set_decrypt_key
+.hidden	vpaes_set_decrypt_key
+.type	vpaes_set_decrypt_key,%function
+.align	4
+vpaes_set_decrypt_key:
+	AARCH64_SIGN_LINK_REGISTER
+	stp	x29,x30,[sp,#-16]!
+	add	x29,sp,#0
+	stp	d8,d9,[sp,#-16]!	// ABI spec says so
+
+	lsr	w9, w1, #5		// shr	$5,%eax
+	add	w9, w9, #5		// $5,%eax
+	str	w9, [x2,#240]		// mov	%eax,240(%rdx)	# AES_KEY->rounds = nbits/32+5;
+	lsl	w9, w9, #4		// shl	$4,%eax
+	add	x2, x2, #16		// lea	16(%rdx,%rax),%rdx
+	add	x2, x2, x9
+
+	mov	w3, #1		// mov	$1,%ecx
+	lsr	w8, w1, #1		// shr	$1,%r8d
+	and	x8, x8, #32		// and	$32,%r8d
+	eor	x8, x8, #32		// xor	$32,%r8d	# nbits==192?0:32
+	bl	_vpaes_schedule_core
+
+	ldp	d8,d9,[sp],#16
+	ldp	x29,x30,[sp],#16
+	AARCH64_VALIDATE_LINK_REGISTER
+	ret
+.size	vpaes_set_decrypt_key,.-vpaes_set_decrypt_key
+.globl	vpaes_cbc_encrypt
+.hidden	vpaes_cbc_encrypt
+.type	vpaes_cbc_encrypt,%function
+.align	4
+vpaes_cbc_encrypt:
+	AARCH64_SIGN_LINK_REGISTER
+	cbz	x2, .Lcbc_abort
+	cmp	w5, #0			// check direction
+	b.eq	vpaes_cbc_decrypt
+
+	stp	x29,x30,[sp,#-16]!
+	add	x29,sp,#0
+
+	mov	x17, x2		// reassign
+	mov	x2,  x3		// reassign
+
+	ld1	{v0.16b}, [x4]	// load ivec
+	bl	_vpaes_encrypt_preheat
+	b	.Lcbc_enc_loop
+
+.align	4
+.Lcbc_enc_loop:
+	ld1	{v7.16b}, [x0],#16	// load input
+	eor	v7.16b, v7.16b, v0.16b	// xor with ivec
+	bl	_vpaes_encrypt_core
+	st1	{v0.16b}, [x1],#16	// save output
+	subs	x17, x17, #16
+	b.hi	.Lcbc_enc_loop
+
+	st1	{v0.16b}, [x4]	// write ivec
+
+	ldp	x29,x30,[sp],#16
+.Lcbc_abort:
+	AARCH64_VALIDATE_LINK_REGISTER
+	ret
+.size	vpaes_cbc_encrypt,.-vpaes_cbc_encrypt
+
+.type	vpaes_cbc_decrypt,%function
+.align	4
+vpaes_cbc_decrypt:
+	// Not adding AARCH64_SIGN_LINK_REGISTER here because vpaes_cbc_decrypt is jumped to
+	// only from vpaes_cbc_encrypt which has already signed the return address.
+	stp	x29,x30,[sp,#-16]!
+	add	x29,sp,#0
+	stp	d8,d9,[sp,#-16]!	// ABI spec says so
+	stp	d10,d11,[sp,#-16]!
+	stp	d12,d13,[sp,#-16]!
+	stp	d14,d15,[sp,#-16]!
+
+	mov	x17, x2		// reassign
+	mov	x2,  x3		// reassign
+	ld1	{v6.16b}, [x4]	// load ivec
+	bl	_vpaes_decrypt_preheat
+	tst	x17, #16
+	b.eq	.Lcbc_dec_loop2x
+
+	ld1	{v7.16b}, [x0], #16	// load input
+	bl	_vpaes_decrypt_core
+	eor	v0.16b, v0.16b, v6.16b	// xor with ivec
+	orr	v6.16b, v7.16b, v7.16b	// next ivec value
+	st1	{v0.16b}, [x1], #16
+	subs	x17, x17, #16
+	b.ls	.Lcbc_dec_done
+
+.align	4
+.Lcbc_dec_loop2x:
+	ld1	{v14.16b,v15.16b}, [x0], #32
+	bl	_vpaes_decrypt_2x
+	eor	v0.16b, v0.16b, v6.16b	// xor with ivec
+	eor	v1.16b, v1.16b, v14.16b
+	orr	v6.16b, v15.16b, v15.16b
+	st1	{v0.16b,v1.16b}, [x1], #32
+	subs	x17, x17, #32
+	b.hi	.Lcbc_dec_loop2x
+
+.Lcbc_dec_done:
+	st1	{v6.16b}, [x4]
+
+	ldp	d14,d15,[sp],#16
+	ldp	d12,d13,[sp],#16
+	ldp	d10,d11,[sp],#16
+	ldp	d8,d9,[sp],#16
+	ldp	x29,x30,[sp],#16
+	AARCH64_VALIDATE_LINK_REGISTER
+	ret
+.size	vpaes_cbc_decrypt,.-vpaes_cbc_decrypt
+.globl	vpaes_ctr32_encrypt_blocks
+.hidden	vpaes_ctr32_encrypt_blocks
+.type	vpaes_ctr32_encrypt_blocks,%function
+.align	4
+vpaes_ctr32_encrypt_blocks:
+	AARCH64_SIGN_LINK_REGISTER
+	stp	x29,x30,[sp,#-16]!
+	add	x29,sp,#0
+	stp	d8,d9,[sp,#-16]!	// ABI spec says so
+	stp	d10,d11,[sp,#-16]!
+	stp	d12,d13,[sp,#-16]!
+	stp	d14,d15,[sp,#-16]!
+
+	cbz	x2, .Lctr32_done
+
+	// Note, unlike the other functions, x2 here is measured in blocks,
+	// not bytes.
+	mov	x17, x2
+	mov	x2,  x3
+
+	// Load the IV and counter portion.
+	ldr	w6, [x4, #12]
+	ld1	{v7.16b}, [x4]
+
+	bl	_vpaes_encrypt_preheat
+	tst	x17, #1
+	rev	w6, w6		// The counter is big-endian.
+	b.eq	.Lctr32_prep_loop
+
+	// Handle one block so the remaining block count is even for
+	// _vpaes_encrypt_2x.
+	ld1	{v6.16b}, [x0], #16	// .Load input ahead of time
+	bl	_vpaes_encrypt_core
+	eor	v0.16b, v0.16b, v6.16b	// XOR input and result
+	st1	{v0.16b}, [x1], #16
+	subs	x17, x17, #1
+	// Update the counter.
+	add	w6, w6, #1
+	rev	w7, w6
+	mov	v7.s[3], w7
+	b.ls	.Lctr32_done
+
+.Lctr32_prep_loop:
+	// _vpaes_encrypt_core takes its input from v7, while _vpaes_encrypt_2x
+	// uses v14 and v15.
+	mov	v15.16b, v7.16b
+	mov	v14.16b, v7.16b
+	add	w6, w6, #1
+	rev	w7, w6
+	mov	v15.s[3], w7
+
+.Lctr32_loop:
+	ld1	{v6.16b,v7.16b}, [x0], #32	// .Load input ahead of time
+	bl	_vpaes_encrypt_2x
+	eor	v0.16b, v0.16b, v6.16b		// XOR input and result
+	eor	v1.16b, v1.16b, v7.16b		// XOR input and result (#2)
+	st1	{v0.16b,v1.16b}, [x1], #32
+	subs	x17, x17, #2
+	// Update the counter.
+	add	w7, w6, #1
+	add	w6, w6, #2
+	rev	w7, w7
+	mov	v14.s[3], w7
+	rev	w7, w6
+	mov	v15.s[3], w7
+	b.hi	.Lctr32_loop
+
+.Lctr32_done:
+	ldp	d14,d15,[sp],#16
+	ldp	d12,d13,[sp],#16
+	ldp	d10,d11,[sp],#16
+	ldp	d8,d9,[sp],#16
+	ldp	x29,x30,[sp],#16
+	AARCH64_VALIDATE_LINK_REGISTER
+	ret
+.size	vpaes_ctr32_encrypt_blocks,.-vpaes_ctr32_encrypt_blocks
+#endif
+#endif  // !OPENSSL_NO_ASM
+.section	.note.GNU-stack,"",%progbits
--- /dev/null
+++ b/third_party/boringssl/linux-aarch64/crypto/test/trampoline-armv8.S
@@ -1,0 +1,761 @@
+// This file is generated from a similarly-named Perl script in the BoringSSL
+// source tree. Do not edit by hand.
+
+#if !defined(__has_feature)
+#define __has_feature(x) 0
+#endif
+#if __has_feature(memory_sanitizer) && !defined(OPENSSL_NO_ASM)
+#define OPENSSL_NO_ASM
+#endif
+
+#if !defined(OPENSSL_NO_ASM)
+#if defined(__aarch64__)
+#if defined(BORINGSSL_PREFIX)
+#include <boringssl_prefix_symbols_asm.h>
+#endif
+#include <openssl/arm_arch.h>
+
+.text
+
+// abi_test_trampoline loads callee-saved registers from |state|, calls |func|
+// with |argv|, then saves the callee-saved registers into |state|. It returns
+// the result of |func|. The |unwind| argument is unused.
+// uint64_t abi_test_trampoline(void (*func)(...), CallerState *state,
+//                              const uint64_t *argv, size_t argc,
+//                              uint64_t unwind);
+.type	abi_test_trampoline, %function
+.globl	abi_test_trampoline
+.hidden	abi_test_trampoline
+.align	4
+abi_test_trampoline:
+.Labi_test_trampoline_begin:
+	AARCH64_SIGN_LINK_REGISTER
+	// Stack layout (low to high addresses)
+	//   x29,x30 (16 bytes)
+	//    d8-d15 (64 bytes)
+	//   x19-x28 (80 bytes)
+	//    x1 (8 bytes)
+	//   padding (8 bytes)
+	stp	x29, x30, [sp, #-176]!
+	mov	x29, sp
+
+	// Saved callee-saved registers and |state|.
+	stp	d8, d9, [sp, #16]
+	stp	d10, d11, [sp, #32]
+	stp	d12, d13, [sp, #48]
+	stp	d14, d15, [sp, #64]
+	stp	x19, x20, [sp, #80]
+	stp	x21, x22, [sp, #96]
+	stp	x23, x24, [sp, #112]
+	stp	x25, x26, [sp, #128]
+	stp	x27, x28, [sp, #144]
+	str	x1, [sp, #160]
+
+	// Load registers from |state|, with the exception of x29. x29 is the
+	// frame pointer and also callee-saved, but AAPCS64 allows platforms to
+	// mandate that x29 always point to a frame. iOS64 does so, which means
+	// we cannot fill x29 with entropy without violating ABI rules
+	// ourselves. x29 is tested separately below.
+	ldp	d8, d9, [x1], #16
+	ldp	d10, d11, [x1], #16
+	ldp	d12, d13, [x1], #16
+	ldp	d14, d15, [x1], #16
+	ldp	x19, x20, [x1], #16
+	ldp	x21, x22, [x1], #16
+	ldp	x23, x24, [x1], #16
+	ldp	x25, x26, [x1], #16
+	ldp	x27, x28, [x1], #16
+
+	// Move parameters into temporary registers.
+	mov	x9, x0
+	mov	x10, x2
+	mov	x11, x3
+
+	// Load parameters into registers.
+	cbz	x11, .Largs_done
+	ldr	x0, [x10], #8
+	subs	x11, x11, #1
+	b.eq	.Largs_done
+	ldr	x1, [x10], #8
+	subs	x11, x11, #1
+	b.eq	.Largs_done
+	ldr	x2, [x10], #8
+	subs	x11, x11, #1
+	b.eq	.Largs_done
+	ldr	x3, [x10], #8
+	subs	x11, x11, #1
+	b.eq	.Largs_done
+	ldr	x4, [x10], #8
+	subs	x11, x11, #1
+	b.eq	.Largs_done
+	ldr	x5, [x10], #8
+	subs	x11, x11, #1
+	b.eq	.Largs_done
+	ldr	x6, [x10], #8
+	subs	x11, x11, #1
+	b.eq	.Largs_done
+	ldr	x7, [x10], #8
+
+.Largs_done:
+	blr	x9
+
+	// Reload |state| and store registers.
+	ldr	x1, [sp, #160]
+	stp	d8, d9, [x1], #16
+	stp	d10, d11, [x1], #16
+	stp	d12, d13, [x1], #16
+	stp	d14, d15, [x1], #16
+	stp	x19, x20, [x1], #16
+	stp	x21, x22, [x1], #16
+	stp	x23, x24, [x1], #16
+	stp	x25, x26, [x1], #16
+	stp	x27, x28, [x1], #16
+
+	// |func| is required to preserve x29, the frame pointer. We cannot load
+	// random values into x29 (see comment above), so compare it against the
+	// expected value and zero the field of |state| if corrupted.
+	mov	x9, sp
+	cmp	x29, x9
+	b.eq	.Lx29_ok
+	str	xzr, [x1]
+
+.Lx29_ok:
+	// Restore callee-saved registers.
+	ldp	d8, d9, [sp, #16]
+	ldp	d10, d11, [sp, #32]
+	ldp	d12, d13, [sp, #48]
+	ldp	d14, d15, [sp, #64]
+	ldp	x19, x20, [sp, #80]
+	ldp	x21, x22, [sp, #96]
+	ldp	x23, x24, [sp, #112]
+	ldp	x25, x26, [sp, #128]
+	ldp	x27, x28, [sp, #144]
+
+	ldp	x29, x30, [sp], #176
+	AARCH64_VALIDATE_LINK_REGISTER
+	ret
+.size	abi_test_trampoline,.-abi_test_trampoline
+.type	abi_test_clobber_x0, %function
+.globl	abi_test_clobber_x0
+.hidden	abi_test_clobber_x0
+.align	4
+abi_test_clobber_x0:
+	AARCH64_VALID_CALL_TARGET
+	mov	x0, xzr
+	ret
+.size	abi_test_clobber_x0,.-abi_test_clobber_x0
+.type	abi_test_clobber_x1, %function
+.globl	abi_test_clobber_x1
+.hidden	abi_test_clobber_x1
+.align	4
+abi_test_clobber_x1:
+	AARCH64_VALID_CALL_TARGET
+	mov	x1, xzr
+	ret
+.size	abi_test_clobber_x1,.-abi_test_clobber_x1
+.type	abi_test_clobber_x2, %function
+.globl	abi_test_clobber_x2
+.hidden	abi_test_clobber_x2
+.align	4
+abi_test_clobber_x2:
+	AARCH64_VALID_CALL_TARGET
+	mov	x2, xzr
+	ret
+.size	abi_test_clobber_x2,.-abi_test_clobber_x2
+.type	abi_test_clobber_x3, %function
+.globl	abi_test_clobber_x3
+.hidden	abi_test_clobber_x3
+.align	4
+abi_test_clobber_x3:
+	AARCH64_VALID_CALL_TARGET
+	mov	x3, xzr
+	ret
+.size	abi_test_clobber_x3,.-abi_test_clobber_x3
+.type	abi_test_clobber_x4, %function
+.globl	abi_test_clobber_x4
+.hidden	abi_test_clobber_x4
+.align	4
+abi_test_clobber_x4:
+	AARCH64_VALID_CALL_TARGET
+	mov	x4, xzr
+	ret
+.size	abi_test_clobber_x4,.-abi_test_clobber_x4
+.type	abi_test_clobber_x5, %function
+.globl	abi_test_clobber_x5
+.hidden	abi_test_clobber_x5
+.align	4
+abi_test_clobber_x5:
+	AARCH64_VALID_CALL_TARGET
+	mov	x5, xzr
+	ret
+.size	abi_test_clobber_x5,.-abi_test_clobber_x5
+.type	abi_test_clobber_x6, %function
+.globl	abi_test_clobber_x6
+.hidden	abi_test_clobber_x6
+.align	4
+abi_test_clobber_x6:
+	AARCH64_VALID_CALL_TARGET
+	mov	x6, xzr
+	ret
+.size	abi_test_clobber_x6,.-abi_test_clobber_x6
+.type	abi_test_clobber_x7, %function
+.globl	abi_test_clobber_x7
+.hidden	abi_test_clobber_x7
+.align	4
+abi_test_clobber_x7:
+	AARCH64_VALID_CALL_TARGET
+	mov	x7, xzr
+	ret
+.size	abi_test_clobber_x7,.-abi_test_clobber_x7
+.type	abi_test_clobber_x8, %function
+.globl	abi_test_clobber_x8
+.hidden	abi_test_clobber_x8
+.align	4
+abi_test_clobber_x8:
+	AARCH64_VALID_CALL_TARGET
+	mov	x8, xzr
+	ret
+.size	abi_test_clobber_x8,.-abi_test_clobber_x8
+.type	abi_test_clobber_x9, %function
+.globl	abi_test_clobber_x9
+.hidden	abi_test_clobber_x9
+.align	4
+abi_test_clobber_x9:
+	AARCH64_VALID_CALL_TARGET
+	mov	x9, xzr
+	ret
+.size	abi_test_clobber_x9,.-abi_test_clobber_x9
+.type	abi_test_clobber_x10, %function
+.globl	abi_test_clobber_x10
+.hidden	abi_test_clobber_x10
+.align	4
+abi_test_clobber_x10:
+	AARCH64_VALID_CALL_TARGET
+	mov	x10, xzr
+	ret
+.size	abi_test_clobber_x10,.-abi_test_clobber_x10
+.type	abi_test_clobber_x11, %function
+.globl	abi_test_clobber_x11
+.hidden	abi_test_clobber_x11
+.align	4
+abi_test_clobber_x11:
+	AARCH64_VALID_CALL_TARGET
+	mov	x11, xzr
+	ret
+.size	abi_test_clobber_x11,.-abi_test_clobber_x11
+.type	abi_test_clobber_x12, %function
+.globl	abi_test_clobber_x12
+.hidden	abi_test_clobber_x12
+.align	4
+abi_test_clobber_x12:
+	AARCH64_VALID_CALL_TARGET
+	mov	x12, xzr
+	ret
+.size	abi_test_clobber_x12,.-abi_test_clobber_x12
+.type	abi_test_clobber_x13, %function
+.globl	abi_test_clobber_x13
+.hidden	abi_test_clobber_x13
+.align	4
+abi_test_clobber_x13:
+	AARCH64_VALID_CALL_TARGET
+	mov	x13, xzr
+	ret
+.size	abi_test_clobber_x13,.-abi_test_clobber_x13
+.type	abi_test_clobber_x14, %function
+.globl	abi_test_clobber_x14
+.hidden	abi_test_clobber_x14
+.align	4
+abi_test_clobber_x14:
+	AARCH64_VALID_CALL_TARGET
+	mov	x14, xzr
+	ret
+.size	abi_test_clobber_x14,.-abi_test_clobber_x14
+.type	abi_test_clobber_x15, %function
+.globl	abi_test_clobber_x15
+.hidden	abi_test_clobber_x15
+.align	4
+abi_test_clobber_x15:
+	AARCH64_VALID_CALL_TARGET
+	mov	x15, xzr
+	ret
+.size	abi_test_clobber_x15,.-abi_test_clobber_x15
+.type	abi_test_clobber_x16, %function
+.globl	abi_test_clobber_x16
+.hidden	abi_test_clobber_x16
+.align	4
+abi_test_clobber_x16:
+	AARCH64_VALID_CALL_TARGET
+	mov	x16, xzr
+	ret
+.size	abi_test_clobber_x16,.-abi_test_clobber_x16
+.type	abi_test_clobber_x17, %function
+.globl	abi_test_clobber_x17
+.hidden	abi_test_clobber_x17
+.align	4
+abi_test_clobber_x17:
+	AARCH64_VALID_CALL_TARGET
+	mov	x17, xzr
+	ret
+.size	abi_test_clobber_x17,.-abi_test_clobber_x17
+.type	abi_test_clobber_x19, %function
+.globl	abi_test_clobber_x19
+.hidden	abi_test_clobber_x19
+.align	4
+abi_test_clobber_x19:
+	AARCH64_VALID_CALL_TARGET
+	mov	x19, xzr
+	ret
+.size	abi_test_clobber_x19,.-abi_test_clobber_x19
+.type	abi_test_clobber_x20, %function
+.globl	abi_test_clobber_x20
+.hidden	abi_test_clobber_x20
+.align	4
+abi_test_clobber_x20:
+	AARCH64_VALID_CALL_TARGET
+	mov	x20, xzr
+	ret
+.size	abi_test_clobber_x20,.-abi_test_clobber_x20
+.type	abi_test_clobber_x21, %function
+.globl	abi_test_clobber_x21
+.hidden	abi_test_clobber_x21
+.align	4
+abi_test_clobber_x21:
+	AARCH64_VALID_CALL_TARGET
+	mov	x21, xzr
+	ret
+.size	abi_test_clobber_x21,.-abi_test_clobber_x21
+.type	abi_test_clobber_x22, %function
+.globl	abi_test_clobber_x22
+.hidden	abi_test_clobber_x22
+.align	4
+abi_test_clobber_x22:
+	AARCH64_VALID_CALL_TARGET
+	mov	x22, xzr
+	ret
+.size	abi_test_clobber_x22,.-abi_test_clobber_x22
+.type	abi_test_clobber_x23, %function
+.globl	abi_test_clobber_x23
+.hidden	abi_test_clobber_x23
+.align	4
+abi_test_clobber_x23:
+	AARCH64_VALID_CALL_TARGET
+	mov	x23, xzr
+	ret
+.size	abi_test_clobber_x23,.-abi_test_clobber_x23
+.type	abi_test_clobber_x24, %function
+.globl	abi_test_clobber_x24
+.hidden	abi_test_clobber_x24
+.align	4
+abi_test_clobber_x24:
+	AARCH64_VALID_CALL_TARGET
+	mov	x24, xzr
+	ret
+.size	abi_test_clobber_x24,.-abi_test_clobber_x24
+.type	abi_test_clobber_x25, %function
+.globl	abi_test_clobber_x25
+.hidden	abi_test_clobber_x25
+.align	4
+abi_test_clobber_x25:
+	AARCH64_VALID_CALL_TARGET
+	mov	x25, xzr
+	ret
+.size	abi_test_clobber_x25,.-abi_test_clobber_x25
+.type	abi_test_clobber_x26, %function
+.globl	abi_test_clobber_x26
+.hidden	abi_test_clobber_x26
+.align	4
+abi_test_clobber_x26:
+	AARCH64_VALID_CALL_TARGET
+	mov	x26, xzr
+	ret
+.size	abi_test_clobber_x26,.-abi_test_clobber_x26
+.type	abi_test_clobber_x27, %function
+.globl	abi_test_clobber_x27
+.hidden	abi_test_clobber_x27
+.align	4
+abi_test_clobber_x27:
+	AARCH64_VALID_CALL_TARGET
+	mov	x27, xzr
+	ret
+.size	abi_test_clobber_x27,.-abi_test_clobber_x27
+.type	abi_test_clobber_x28, %function
+.globl	abi_test_clobber_x28
+.hidden	abi_test_clobber_x28
+.align	4
+abi_test_clobber_x28:
+	AARCH64_VALID_CALL_TARGET
+	mov	x28, xzr
+	ret
+.size	abi_test_clobber_x28,.-abi_test_clobber_x28
+.type	abi_test_clobber_x29, %function
+.globl	abi_test_clobber_x29
+.hidden	abi_test_clobber_x29
+.align	4
+abi_test_clobber_x29:
+	AARCH64_VALID_CALL_TARGET
+	mov	x29, xzr
+	ret
+.size	abi_test_clobber_x29,.-abi_test_clobber_x29
+.type	abi_test_clobber_d0, %function
+.globl	abi_test_clobber_d0
+.hidden	abi_test_clobber_d0
+.align	4
+abi_test_clobber_d0:
+	AARCH64_VALID_CALL_TARGET
+	fmov	d0, xzr
+	ret
+.size	abi_test_clobber_d0,.-abi_test_clobber_d0
+.type	abi_test_clobber_d1, %function
+.globl	abi_test_clobber_d1
+.hidden	abi_test_clobber_d1
+.align	4
+abi_test_clobber_d1:
+	AARCH64_VALID_CALL_TARGET
+	fmov	d1, xzr
+	ret
+.size	abi_test_clobber_d1,.-abi_test_clobber_d1
+.type	abi_test_clobber_d2, %function
+.globl	abi_test_clobber_d2
+.hidden	abi_test_clobber_d2
+.align	4
+abi_test_clobber_d2:
+	AARCH64_VALID_CALL_TARGET
+	fmov	d2, xzr
+	ret
+.size	abi_test_clobber_d2,.-abi_test_clobber_d2
+.type	abi_test_clobber_d3, %function
+.globl	abi_test_clobber_d3
+.hidden	abi_test_clobber_d3
+.align	4
+abi_test_clobber_d3:
+	AARCH64_VALID_CALL_TARGET
+	fmov	d3, xzr
+	ret
+.size	abi_test_clobber_d3,.-abi_test_clobber_d3
+.type	abi_test_clobber_d4, %function
+.globl	abi_test_clobber_d4
+.hidden	abi_test_clobber_d4
+.align	4
+abi_test_clobber_d4:
+	AARCH64_VALID_CALL_TARGET
+	fmov	d4, xzr
+	ret
+.size	abi_test_clobber_d4,.-abi_test_clobber_d4
+.type	abi_test_clobber_d5, %function
+.globl	abi_test_clobber_d5
+.hidden	abi_test_clobber_d5
+.align	4
+abi_test_clobber_d5:
+	AARCH64_VALID_CALL_TARGET
+	fmov	d5, xzr
+	ret
+.size	abi_test_clobber_d5,.-abi_test_clobber_d5
+.type	abi_test_clobber_d6, %function
+.globl	abi_test_clobber_d6
+.hidden	abi_test_clobber_d6
+.align	4
+abi_test_clobber_d6:
+	AARCH64_VALID_CALL_TARGET
+	fmov	d6, xzr
+	ret
+.size	abi_test_clobber_d6,.-abi_test_clobber_d6
+.type	abi_test_clobber_d7, %function
+.globl	abi_test_clobber_d7
+.hidden	abi_test_clobber_d7
+.align	4
+abi_test_clobber_d7:
+	AARCH64_VALID_CALL_TARGET
+	fmov	d7, xzr
+	ret
+.size	abi_test_clobber_d7,.-abi_test_clobber_d7
+.type	abi_test_clobber_d8, %function
+.globl	abi_test_clobber_d8
+.hidden	abi_test_clobber_d8
+.align	4
+abi_test_clobber_d8:
+	AARCH64_VALID_CALL_TARGET
+	fmov	d8, xzr
+	ret
+.size	abi_test_clobber_d8,.-abi_test_clobber_d8
+.type	abi_test_clobber_d9, %function
+.globl	abi_test_clobber_d9
+.hidden	abi_test_clobber_d9
+.align	4
+abi_test_clobber_d9:
+	AARCH64_VALID_CALL_TARGET
+	fmov	d9, xzr
+	ret
+.size	abi_test_clobber_d9,.-abi_test_clobber_d9
+.type	abi_test_clobber_d10, %function
+.globl	abi_test_clobber_d10
+.hidden	abi_test_clobber_d10
+.align	4
+abi_test_clobber_d10:
+	AARCH64_VALID_CALL_TARGET
+	fmov	d10, xzr
+	ret
+.size	abi_test_clobber_d10,.-abi_test_clobber_d10
+.type	abi_test_clobber_d11, %function
+.globl	abi_test_clobber_d11
+.hidden	abi_test_clobber_d11
+.align	4
+abi_test_clobber_d11:
+	AARCH64_VALID_CALL_TARGET
+	fmov	d11, xzr
+	ret
+.size	abi_test_clobber_d11,.-abi_test_clobber_d11
+.type	abi_test_clobber_d12, %function
+.globl	abi_test_clobber_d12
+.hidden	abi_test_clobber_d12
+.align	4
+abi_test_clobber_d12:
+	AARCH64_VALID_CALL_TARGET
+	fmov	d12, xzr
+	ret
+.size	abi_test_clobber_d12,.-abi_test_clobber_d12
+.type	abi_test_clobber_d13, %function
+.globl	abi_test_clobber_d13
+.hidden	abi_test_clobber_d13
+.align	4
+abi_test_clobber_d13:
+	AARCH64_VALID_CALL_TARGET
+	fmov	d13, xzr
+	ret
+.size	abi_test_clobber_d13,.-abi_test_clobber_d13
+.type	abi_test_clobber_d14, %function
+.globl	abi_test_clobber_d14
+.hidden	abi_test_clobber_d14
+.align	4
+abi_test_clobber_d14:
+	AARCH64_VALID_CALL_TARGET
+	fmov	d14, xzr
+	ret
+.size	abi_test_clobber_d14,.-abi_test_clobber_d14
+.type	abi_test_clobber_d15, %function
+.globl	abi_test_clobber_d15
+.hidden	abi_test_clobber_d15
+.align	4
+abi_test_clobber_d15:
+	AARCH64_VALID_CALL_TARGET
+	fmov	d15, xzr
+	ret
+.size	abi_test_clobber_d15,.-abi_test_clobber_d15
+.type	abi_test_clobber_d16, %function
+.globl	abi_test_clobber_d16
+.hidden	abi_test_clobber_d16
+.align	4
+abi_test_clobber_d16:
+	AARCH64_VALID_CALL_TARGET
+	fmov	d16, xzr
+	ret
+.size	abi_test_clobber_d16,.-abi_test_clobber_d16
+.type	abi_test_clobber_d17, %function
+.globl	abi_test_clobber_d17
+.hidden	abi_test_clobber_d17
+.align	4
+abi_test_clobber_d17:
+	AARCH64_VALID_CALL_TARGET
+	fmov	d17, xzr
+	ret
+.size	abi_test_clobber_d17,.-abi_test_clobber_d17
+.type	abi_test_clobber_d18, %function
+.globl	abi_test_clobber_d18
+.hidden	abi_test_clobber_d18
+.align	4
+abi_test_clobber_d18:
+	AARCH64_VALID_CALL_TARGET
+	fmov	d18, xzr
+	ret
+.size	abi_test_clobber_d18,.-abi_test_clobber_d18
+.type	abi_test_clobber_d19, %function
+.globl	abi_test_clobber_d19
+.hidden	abi_test_clobber_d19
+.align	4
+abi_test_clobber_d19:
+	AARCH64_VALID_CALL_TARGET
+	fmov	d19, xzr
+	ret
+.size	abi_test_clobber_d19,.-abi_test_clobber_d19
+.type	abi_test_clobber_d20, %function
+.globl	abi_test_clobber_d20
+.hidden	abi_test_clobber_d20
+.align	4
+abi_test_clobber_d20:
+	AARCH64_VALID_CALL_TARGET
+	fmov	d20, xzr
+	ret
+.size	abi_test_clobber_d20,.-abi_test_clobber_d20
+.type	abi_test_clobber_d21, %function
+.globl	abi_test_clobber_d21
+.hidden	abi_test_clobber_d21
+.align	4
+abi_test_clobber_d21:
+	AARCH64_VALID_CALL_TARGET
+	fmov	d21, xzr
+	ret
+.size	abi_test_clobber_d21,.-abi_test_clobber_d21
+.type	abi_test_clobber_d22, %function
+.globl	abi_test_clobber_d22
+.hidden	abi_test_clobber_d22
+.align	4
+abi_test_clobber_d22:
+	AARCH64_VALID_CALL_TARGET
+	fmov	d22, xzr
+	ret
+.size	abi_test_clobber_d22,.-abi_test_clobber_d22
+.type	abi_test_clobber_d23, %function
+.globl	abi_test_clobber_d23
+.hidden	abi_test_clobber_d23
+.align	4
+abi_test_clobber_d23:
+	AARCH64_VALID_CALL_TARGET
+	fmov	d23, xzr
+	ret
+.size	abi_test_clobber_d23,.-abi_test_clobber_d23
+.type	abi_test_clobber_d24, %function
+.globl	abi_test_clobber_d24
+.hidden	abi_test_clobber_d24
+.align	4
+abi_test_clobber_d24:
+	AARCH64_VALID_CALL_TARGET
+	fmov	d24, xzr
+	ret
+.size	abi_test_clobber_d24,.-abi_test_clobber_d24
+.type	abi_test_clobber_d25, %function
+.globl	abi_test_clobber_d25
+.hidden	abi_test_clobber_d25
+.align	4
+abi_test_clobber_d25:
+	AARCH64_VALID_CALL_TARGET
+	fmov	d25, xzr
+	ret
+.size	abi_test_clobber_d25,.-abi_test_clobber_d25
+.type	abi_test_clobber_d26, %function
+.globl	abi_test_clobber_d26
+.hidden	abi_test_clobber_d26
+.align	4
+abi_test_clobber_d26:
+	AARCH64_VALID_CALL_TARGET
+	fmov	d26, xzr
+	ret
+.size	abi_test_clobber_d26,.-abi_test_clobber_d26
+.type	abi_test_clobber_d27, %function
+.globl	abi_test_clobber_d27
+.hidden	abi_test_clobber_d27
+.align	4
+abi_test_clobber_d27:
+	AARCH64_VALID_CALL_TARGET
+	fmov	d27, xzr
+	ret
+.size	abi_test_clobber_d27,.-abi_test_clobber_d27
+.type	abi_test_clobber_d28, %function
+.globl	abi_test_clobber_d28
+.hidden	abi_test_clobber_d28
+.align	4
+abi_test_clobber_d28:
+	AARCH64_VALID_CALL_TARGET
+	fmov	d28, xzr
+	ret
+.size	abi_test_clobber_d28,.-abi_test_clobber_d28
+.type	abi_test_clobber_d29, %function
+.globl	abi_test_clobber_d29
+.hidden	abi_test_clobber_d29
+.align	4
+abi_test_clobber_d29:
+	AARCH64_VALID_CALL_TARGET
+	fmov	d29, xzr
+	ret
+.size	abi_test_clobber_d29,.-abi_test_clobber_d29
+.type	abi_test_clobber_d30, %function
+.globl	abi_test_clobber_d30
+.hidden	abi_test_clobber_d30
+.align	4
+abi_test_clobber_d30:
+	AARCH64_VALID_CALL_TARGET
+	fmov	d30, xzr
+	ret
+.size	abi_test_clobber_d30,.-abi_test_clobber_d30
+.type	abi_test_clobber_d31, %function
+.globl	abi_test_clobber_d31
+.hidden	abi_test_clobber_d31
+.align	4
+abi_test_clobber_d31:
+	AARCH64_VALID_CALL_TARGET
+	fmov	d31, xzr
+	ret
+.size	abi_test_clobber_d31,.-abi_test_clobber_d31
+.type	abi_test_clobber_v8_upper, %function
+.globl	abi_test_clobber_v8_upper
+.hidden	abi_test_clobber_v8_upper
+.align	4
+abi_test_clobber_v8_upper:
+	AARCH64_VALID_CALL_TARGET
+	fmov	v8.d[1], xzr
+	ret
+.size	abi_test_clobber_v8_upper,.-abi_test_clobber_v8_upper
+.type	abi_test_clobber_v9_upper, %function
+.globl	abi_test_clobber_v9_upper
+.hidden	abi_test_clobber_v9_upper
+.align	4
+abi_test_clobber_v9_upper:
+	AARCH64_VALID_CALL_TARGET
+	fmov	v9.d[1], xzr
+	ret
+.size	abi_test_clobber_v9_upper,.-abi_test_clobber_v9_upper
+.type	abi_test_clobber_v10_upper, %function
+.globl	abi_test_clobber_v10_upper
+.hidden	abi_test_clobber_v10_upper
+.align	4
+abi_test_clobber_v10_upper:
+	AARCH64_VALID_CALL_TARGET
+	fmov	v10.d[1], xzr
+	ret
+.size	abi_test_clobber_v10_upper,.-abi_test_clobber_v10_upper
+.type	abi_test_clobber_v11_upper, %function
+.globl	abi_test_clobber_v11_upper
+.hidden	abi_test_clobber_v11_upper
+.align	4
+abi_test_clobber_v11_upper:
+	AARCH64_VALID_CALL_TARGET
+	fmov	v11.d[1], xzr
+	ret
+.size	abi_test_clobber_v11_upper,.-abi_test_clobber_v11_upper
+.type	abi_test_clobber_v12_upper, %function
+.globl	abi_test_clobber_v12_upper
+.hidden	abi_test_clobber_v12_upper
+.align	4
+abi_test_clobber_v12_upper:
+	AARCH64_VALID_CALL_TARGET
+	fmov	v12.d[1], xzr
+	ret
+.size	abi_test_clobber_v12_upper,.-abi_test_clobber_v12_upper
+.type	abi_test_clobber_v13_upper, %function
+.globl	abi_test_clobber_v13_upper
+.hidden	abi_test_clobber_v13_upper
+.align	4
+abi_test_clobber_v13_upper:
+	AARCH64_VALID_CALL_TARGET
+	fmov	v13.d[1], xzr
+	ret
+.size	abi_test_clobber_v13_upper,.-abi_test_clobber_v13_upper
+.type	abi_test_clobber_v14_upper, %function
+.globl	abi_test_clobber_v14_upper
+.hidden	abi_test_clobber_v14_upper
+.align	4
+abi_test_clobber_v14_upper:
+	AARCH64_VALID_CALL_TARGET
+	fmov	v14.d[1], xzr
+	ret
+.size	abi_test_clobber_v14_upper,.-abi_test_clobber_v14_upper
+.type	abi_test_clobber_v15_upper, %function
+.globl	abi_test_clobber_v15_upper
+.hidden	abi_test_clobber_v15_upper
+.align	4
+abi_test_clobber_v15_upper:
+	AARCH64_VALID_CALL_TARGET
+	fmov	v15.d[1], xzr
+	ret
+.size	abi_test_clobber_v15_upper,.-abi_test_clobber_v15_upper
+#endif
+#endif  // !OPENSSL_NO_ASM
+.section	.note.GNU-stack,"",%progbits
--- /dev/null
+++ b/third_party/boringssl/linux-arm/crypto/chacha/chacha-armv4.S
@@ -1,0 +1,1493 @@
+// This file is generated from a similarly-named Perl script in the BoringSSL
+// source tree. Do not edit by hand.
+
+#if !defined(__has_feature)
+#define __has_feature(x) 0
+#endif
+#if __has_feature(memory_sanitizer) && !defined(OPENSSL_NO_ASM)
+#define OPENSSL_NO_ASM
+#endif
+
+#if !defined(OPENSSL_NO_ASM)
+#if defined(__arm__)
+#if defined(BORINGSSL_PREFIX)
+#include <boringssl_prefix_symbols_asm.h>
+#endif
+#include <openssl/arm_arch.h>
+
+@ Silence ARMv8 deprecated IT instruction warnings. This file is used by both
+@ ARMv7 and ARMv8 processors and does not use ARMv8 instructions.
+.arch	armv7-a
+
+.text
+#if defined(__thumb2__) || defined(__clang__)
+.syntax	unified
+#endif
+#if defined(__thumb2__)
+.thumb
+#else
+.code	32
+#endif
+
+#if defined(__thumb2__) || defined(__clang__)
+#define ldrhsb	ldrbhs
+#endif
+
+.align	5
+.Lsigma:
+.long	0x61707865,0x3320646e,0x79622d32,0x6b206574	@ endian-neutral
+.Lone:
+.long	1,0,0,0
+#if __ARM_MAX_ARCH__>=7
+.LOPENSSL_armcap:
+.word	OPENSSL_armcap_P-.LChaCha20_ctr32
+#else
+.word	-1
+#endif
+
+.globl	ChaCha20_ctr32
+.hidden	ChaCha20_ctr32
+.type	ChaCha20_ctr32,%function
+.align	5
+ChaCha20_ctr32:
+.LChaCha20_ctr32:
+	ldr	r12,[sp,#0]		@ pull pointer to counter and nonce
+	stmdb	sp!,{r0,r1,r2,r4-r11,lr}
+#if __ARM_ARCH__<7 && !defined(__thumb2__)
+	sub	r14,pc,#16		@ ChaCha20_ctr32
+#else
+	adr	r14,.LChaCha20_ctr32
+#endif
+	cmp	r2,#0			@ len==0?
+#ifdef	__thumb2__
+	itt	eq
+#endif
+	addeq	sp,sp,#4*3
+	beq	.Lno_data
+#if __ARM_MAX_ARCH__>=7
+	cmp	r2,#192			@ test len
+	bls	.Lshort
+	ldr	r4,[r14,#-32]
+	ldr	r4,[r14,r4]
+# ifdef	__APPLE__
+	ldr	r4,[r4]
+# endif
+	tst	r4,#ARMV7_NEON
+	bne	.LChaCha20_neon
+.Lshort:
+#endif
+	ldmia	r12,{r4,r5,r6,r7}		@ load counter and nonce
+	sub	sp,sp,#4*(16)		@ off-load area
+	sub	r14,r14,#64		@ .Lsigma
+	stmdb	sp!,{r4,r5,r6,r7}		@ copy counter and nonce
+	ldmia	r3,{r4,r5,r6,r7,r8,r9,r10,r11}		@ load key
+	ldmia	r14,{r0,r1,r2,r3}		@ load sigma
+	stmdb	sp!,{r4,r5,r6,r7,r8,r9,r10,r11}		@ copy key
+	stmdb	sp!,{r0,r1,r2,r3}		@ copy sigma
+	str	r10,[sp,#4*(16+10)]	@ off-load "rx"
+	str	r11,[sp,#4*(16+11)]	@ off-load "rx"
+	b	.Loop_outer_enter
+
+.align	4
+.Loop_outer:
+	ldmia	sp,{r0,r1,r2,r3,r4,r5,r6,r7,r8,r9}		@ load key material
+	str	r11,[sp,#4*(32+2)]	@ save len
+	str	r12,  [sp,#4*(32+1)]	@ save inp
+	str	r14,  [sp,#4*(32+0)]	@ save out
+.Loop_outer_enter:
+	ldr	r11, [sp,#4*(15)]
+	ldr	r12,[sp,#4*(12)]	@ modulo-scheduled load
+	ldr	r10, [sp,#4*(13)]
+	ldr	r14,[sp,#4*(14)]
+	str	r11, [sp,#4*(16+15)]
+	mov	r11,#10
+	b	.Loop
+
+.align	4
+.Loop:
+	subs	r11,r11,#1
+	add	r0,r0,r4
+	mov	r12,r12,ror#16
+	add	r1,r1,r5
+	mov	r10,r10,ror#16
+	eor	r12,r12,r0,ror#16
+	eor	r10,r10,r1,ror#16
+	add	r8,r8,r12
+	mov	r4,r4,ror#20
+	add	r9,r9,r10
+	mov	r5,r5,ror#20
+	eor	r4,r4,r8,ror#20
+	eor	r5,r5,r9,ror#20
+	add	r0,r0,r4
+	mov	r12,r12,ror#24
+	add	r1,r1,r5
+	mov	r10,r10,ror#24
+	eor	r12,r12,r0,ror#24
+	eor	r10,r10,r1,ror#24
+	add	r8,r8,r12
+	mov	r4,r4,ror#25
+	add	r9,r9,r10
+	mov	r5,r5,ror#25
+	str	r10,[sp,#4*(16+13)]
+	ldr	r10,[sp,#4*(16+15)]
+	eor	r4,r4,r8,ror#25
+	eor	r5,r5,r9,ror#25
+	str	r8,[sp,#4*(16+8)]
+	ldr	r8,[sp,#4*(16+10)]
+	add	r2,r2,r6
+	mov	r14,r14,ror#16
+	str	r9,[sp,#4*(16+9)]
+	ldr	r9,[sp,#4*(16+11)]
+	add	r3,r3,r7
+	mov	r10,r10,ror#16
+	eor	r14,r14,r2,ror#16
+	eor	r10,r10,r3,ror#16
+	add	r8,r8,r14
+	mov	r6,r6,ror#20
+	add	r9,r9,r10
+	mov	r7,r7,ror#20
+	eor	r6,r6,r8,ror#20
+	eor	r7,r7,r9,ror#20
+	add	r2,r2,r6
+	mov	r14,r14,ror#24
+	add	r3,r3,r7
+	mov	r10,r10,ror#24
+	eor	r14,r14,r2,ror#24
+	eor	r10,r10,r3,ror#24
+	add	r8,r8,r14
+	mov	r6,r6,ror#25
+	add	r9,r9,r10
+	mov	r7,r7,ror#25
+	eor	r6,r6,r8,ror#25
+	eor	r7,r7,r9,ror#25
+	add	r0,r0,r5
+	mov	r10,r10,ror#16
+	add	r1,r1,r6
+	mov	r12,r12,ror#16
+	eor	r10,r10,r0,ror#16
+	eor	r12,r12,r1,ror#16
+	add	r8,r8,r10
+	mov	r5,r5,ror#20
+	add	r9,r9,r12
+	mov	r6,r6,ror#20
+	eor	r5,r5,r8,ror#20
+	eor	r6,r6,r9,ror#20
+	add	r0,r0,r5
+	mov	r10,r10,ror#24
+	add	r1,r1,r6
+	mov	r12,r12,ror#24
+	eor	r10,r10,r0,ror#24
+	eor	r12,r12,r1,ror#24
+	add	r8,r8,r10
+	mov	r5,r5,ror#25
+	str	r10,[sp,#4*(16+15)]
+	ldr	r10,[sp,#4*(16+13)]
+	add	r9,r9,r12
+	mov	r6,r6,ror#25
+	eor	r5,r5,r8,ror#25
+	eor	r6,r6,r9,ror#25
+	str	r8,[sp,#4*(16+10)]
+	ldr	r8,[sp,#4*(16+8)]
+	add	r2,r2,r7
+	mov	r10,r10,ror#16
+	str	r9,[sp,#4*(16+11)]
+	ldr	r9,[sp,#4*(16+9)]
+	add	r3,r3,r4
+	mov	r14,r14,ror#16
+	eor	r10,r10,r2,ror#16
+	eor	r14,r14,r3,ror#16
+	add	r8,r8,r10
+	mov	r7,r7,ror#20
+	add	r9,r9,r14
+	mov	r4,r4,ror#20
+	eor	r7,r7,r8,ror#20
+	eor	r4,r4,r9,ror#20
+	add	r2,r2,r7
+	mov	r10,r10,ror#24
+	add	r3,r3,r4
+	mov	r14,r14,ror#24
+	eor	r10,r10,r2,ror#24
+	eor	r14,r14,r3,ror#24
+	add	r8,r8,r10
+	mov	r7,r7,ror#25
+	add	r9,r9,r14
+	mov	r4,r4,ror#25
+	eor	r7,r7,r8,ror#25
+	eor	r4,r4,r9,ror#25
+	bne	.Loop
+
+	ldr	r11,[sp,#4*(32+2)]	@ load len
+
+	str	r8, [sp,#4*(16+8)]	@ modulo-scheduled store
+	str	r9, [sp,#4*(16+9)]
+	str	r12,[sp,#4*(16+12)]
+	str	r10, [sp,#4*(16+13)]
+	str	r14,[sp,#4*(16+14)]
+
+	@ at this point we have first half of 512-bit result in
+	@ rx and second half at sp+4*(16+8)
+
+	cmp	r11,#64		@ done yet?
+#ifdef	__thumb2__
+	itete	lo
+#endif
+	addlo	r12,sp,#4*(0)		@ shortcut or ...
+	ldrhs	r12,[sp,#4*(32+1)]	@ ... load inp
+	addlo	r14,sp,#4*(0)		@ shortcut or ...
+	ldrhs	r14,[sp,#4*(32+0)]	@ ... load out
+
+	ldr	r8,[sp,#4*(0)]	@ load key material
+	ldr	r9,[sp,#4*(1)]
+
+#if __ARM_ARCH__>=6 || !defined(__ARMEB__)
+# if __ARM_ARCH__<7
+	orr	r10,r12,r14
+	tst	r10,#3		@ are input and output aligned?
+	ldr	r10,[sp,#4*(2)]
+	bne	.Lunaligned
+	cmp	r11,#64		@ restore flags
+# else
+	ldr	r10,[sp,#4*(2)]
+# endif
+	ldr	r11,[sp,#4*(3)]
+
+	add	r0,r0,r8	@ accumulate key material
+	add	r1,r1,r9
+# ifdef	__thumb2__
+	itt	hs
+# endif
+	ldrhs	r8,[r12],#16		@ load input
+	ldrhs	r9,[r12,#-12]
+
+	add	r2,r2,r10
+	add	r3,r3,r11
+# ifdef	__thumb2__
+	itt	hs
+# endif
+	ldrhs	r10,[r12,#-8]
+	ldrhs	r11,[r12,#-4]
+# if __ARM_ARCH__>=6 && defined(__ARMEB__)
+	rev	r0,r0
+	rev	r1,r1
+	rev	r2,r2
+	rev	r3,r3
+# endif
+# ifdef	__thumb2__
+	itt	hs
+# endif
+	eorhs	r0,r0,r8	@ xor with input
+	eorhs	r1,r1,r9
+	add	r8,sp,#4*(4)
+	str	r0,[r14],#16		@ store output
+# ifdef	__thumb2__
+	itt	hs
+# endif
+	eorhs	r2,r2,r10
+	eorhs	r3,r3,r11
+	ldmia	r8,{r8,r9,r10,r11}	@ load key material
+	str	r1,[r14,#-12]
+	str	r2,[r14,#-8]
+	str	r3,[r14,#-4]
+
+	add	r4,r4,r8	@ accumulate key material
+	add	r5,r5,r9
+# ifdef	__thumb2__
+	itt	hs
+# endif
+	ldrhs	r8,[r12],#16		@ load input
+	ldrhs	r9,[r12,#-12]
+	add	r6,r6,r10
+	add	r7,r7,r11
+# ifdef	__thumb2__
+	itt	hs
+# endif
+	ldrhs	r10,[r12,#-8]
+	ldrhs	r11,[r12,#-4]
+# if __ARM_ARCH__>=6 && defined(__ARMEB__)
+	rev	r4,r4
+	rev	r5,r5
+	rev	r6,r6
+	rev	r7,r7
+# endif
+# ifdef	__thumb2__
+	itt	hs
+# endif
+	eorhs	r4,r4,r8
+	eorhs	r5,r5,r9
+	add	r8,sp,#4*(8)
+	str	r4,[r14],#16		@ store output
+# ifdef	__thumb2__
+	itt	hs
+# endif
+	eorhs	r6,r6,r10
+	eorhs	r7,r7,r11
+	str	r5,[r14,#-12]
+	ldmia	r8,{r8,r9,r10,r11}	@ load key material
+	str	r6,[r14,#-8]
+	add	r0,sp,#4*(16+8)
+	str	r7,[r14,#-4]
+
+	ldmia	r0,{r0,r1,r2,r3,r4,r5,r6,r7}	@ load second half
+
+	add	r0,r0,r8	@ accumulate key material
+	add	r1,r1,r9
+# ifdef	__thumb2__
+	itt	hs
+# endif
+	ldrhs	r8,[r12],#16		@ load input
+	ldrhs	r9,[r12,#-12]
+# ifdef	__thumb2__
+	itt	hi
+# endif
+	strhi	r10,[sp,#4*(16+10)]	@ copy "rx" while at it
+	strhi	r11,[sp,#4*(16+11)]	@ copy "rx" while at it
+	add	r2,r2,r10
+	add	r3,r3,r11
+# ifdef	__thumb2__
+	itt	hs
+# endif
+	ldrhs	r10,[r12,#-8]
+	ldrhs	r11,[r12,#-4]
+# if __ARM_ARCH__>=6 && defined(__ARMEB__)
+	rev	r0,r0
+	rev	r1,r1
+	rev	r2,r2
+	rev	r3,r3
+# endif
+# ifdef	__thumb2__
+	itt	hs
+# endif
+	eorhs	r0,r0,r8
+	eorhs	r1,r1,r9
+	add	r8,sp,#4*(12)
+	str	r0,[r14],#16		@ store output
+# ifdef	__thumb2__
+	itt	hs
+# endif
+	eorhs	r2,r2,r10
+	eorhs	r3,r3,r11
+	str	r1,[r14,#-12]
+	ldmia	r8,{r8,r9,r10,r11}	@ load key material
+	str	r2,[r14,#-8]
+	str	r3,[r14,#-4]
+
+	add	r4,r4,r8	@ accumulate key material
+	add	r5,r5,r9
+# ifdef	__thumb2__
+	itt	hi
+# endif
+	addhi	r8,r8,#1		@ next counter value
+	strhi	r8,[sp,#4*(12)]	@ save next counter value
+# ifdef	__thumb2__
+	itt	hs
+# endif
+	ldrhs	r8,[r12],#16		@ load input
+	ldrhs	r9,[r12,#-12]
+	add	r6,r6,r10
+	add	r7,r7,r11
+# ifdef	__thumb2__
+	itt	hs
+# endif
+	ldrhs	r10,[r12,#-8]
+	ldrhs	r11,[r12,#-4]
+# if __ARM_ARCH__>=6 && defined(__ARMEB__)
+	rev	r4,r4
+	rev	r5,r5
+	rev	r6,r6
+	rev	r7,r7
+# endif
+# ifdef	__thumb2__
+	itt	hs
+# endif
+	eorhs	r4,r4,r8
+	eorhs	r5,r5,r9
+# ifdef	__thumb2__
+	it	ne
+# endif
+	ldrne	r8,[sp,#4*(32+2)]	@ re-load len
+# ifdef	__thumb2__
+	itt	hs
+# endif
+	eorhs	r6,r6,r10
+	eorhs	r7,r7,r11
+	str	r4,[r14],#16		@ store output
+	str	r5,[r14,#-12]
+# ifdef	__thumb2__
+	it	hs
+# endif
+	subhs	r11,r8,#64		@ len-=64
+	str	r6,[r14,#-8]
+	str	r7,[r14,#-4]
+	bhi	.Loop_outer
+
+	beq	.Ldone
+# if __ARM_ARCH__<7
+	b	.Ltail
+
+.align	4
+.Lunaligned:@ unaligned endian-neutral path
+	cmp	r11,#64		@ restore flags
+# endif
+#endif
+#if __ARM_ARCH__<7
+	ldr	r11,[sp,#4*(3)]
+	add	r0,r0,r8		@ accumulate key material
+	add	r1,r1,r9
+	add	r2,r2,r10
+# ifdef	__thumb2__
+	itete	lo
+# endif
+	eorlo	r8,r8,r8		@ zero or ...
+	ldrhsb	r8,[r12],#16			@ ... load input
+	eorlo	r9,r9,r9
+	ldrhsb	r9,[r12,#-12]
+
+	add	r3,r3,r11
+# ifdef	__thumb2__
+	itete	lo
+# endif
+	eorlo	r10,r10,r10
+	ldrhsb	r10,[r12,#-8]
+	eorlo	r11,r11,r11
+	ldrhsb	r11,[r12,#-4]
+
+	eor	r0,r8,r0		@ xor with input (or zero)
+	eor	r1,r9,r1
+# ifdef	__thumb2__
+	itt	hs
+# endif
+	ldrhsb	r8,[r12,#-15]		@ load more input
+	ldrhsb	r9,[r12,#-11]
+	eor	r2,r10,r2
+	strb	r0,[r14],#16		@ store output
+	eor	r3,r11,r3
+# ifdef	__thumb2__
+	itt	hs
+# endif
+	ldrhsb	r10,[r12,#-7]
+	ldrhsb	r11,[r12,#-3]
+	strb	r1,[r14,#-12]
+	eor	r0,r8,r0,lsr#8
+	strb	r2,[r14,#-8]
+	eor	r1,r9,r1,lsr#8
+# ifdef	__thumb2__
+	itt	hs
+# endif
+	ldrhsb	r8,[r12,#-14]		@ load more input
+	ldrhsb	r9,[r12,#-10]
+	strb	r3,[r14,#-4]
+	eor	r2,r10,r2,lsr#8
+	strb	r0,[r14,#-15]
+	eor	r3,r11,r3,lsr#8
+# ifdef	__thumb2__
+	itt	hs
+# endif
+	ldrhsb	r10,[r12,#-6]
+	ldrhsb	r11,[r12,#-2]
+	strb	r1,[r14,#-11]
+	eor	r0,r8,r0,lsr#8
+	strb	r2,[r14,#-7]
+	eor	r1,r9,r1,lsr#8
+# ifdef	__thumb2__
+	itt	hs
+# endif
+	ldrhsb	r8,[r12,#-13]		@ load more input
+	ldrhsb	r9,[r12,#-9]
+	strb	r3,[r14,#-3]
+	eor	r2,r10,r2,lsr#8
+	strb	r0,[r14,#-14]
+	eor	r3,r11,r3,lsr#8
+# ifdef	__thumb2__
+	itt	hs
+# endif
+	ldrhsb	r10,[r12,#-5]
+	ldrhsb	r11,[r12,#-1]
+	strb	r1,[r14,#-10]
+	strb	r2,[r14,#-6]
+	eor	r0,r8,r0,lsr#8
+	strb	r3,[r14,#-2]
+	eor	r1,r9,r1,lsr#8
+	strb	r0,[r14,#-13]
+	eor	r2,r10,r2,lsr#8
+	strb	r1,[r14,#-9]
+	eor	r3,r11,r3,lsr#8
+	strb	r2,[r14,#-5]
+	strb	r3,[r14,#-1]
+	add	r8,sp,#4*(4+0)
+	ldmia	r8,{r8,r9,r10,r11}		@ load key material
+	add	r0,sp,#4*(16+8)
+	add	r4,r4,r8		@ accumulate key material
+	add	r5,r5,r9
+	add	r6,r6,r10
+# ifdef	__thumb2__
+	itete	lo
+# endif
+	eorlo	r8,r8,r8		@ zero or ...
+	ldrhsb	r8,[r12],#16			@ ... load input
+	eorlo	r9,r9,r9
+	ldrhsb	r9,[r12,#-12]
+
+	add	r7,r7,r11
+# ifdef	__thumb2__
+	itete	lo
+# endif
+	eorlo	r10,r10,r10
+	ldrhsb	r10,[r12,#-8]
+	eorlo	r11,r11,r11
+	ldrhsb	r11,[r12,#-4]
+
+	eor	r4,r8,r4		@ xor with input (or zero)
+	eor	r5,r9,r5
+# ifdef	__thumb2__
+	itt	hs
+# endif
+	ldrhsb	r8,[r12,#-15]		@ load more input
+	ldrhsb	r9,[r12,#-11]
+	eor	r6,r10,r6
+	strb	r4,[r14],#16		@ store output
+	eor	r7,r11,r7
+# ifdef	__thumb2__
+	itt	hs
+# endif
+	ldrhsb	r10,[r12,#-7]
+	ldrhsb	r11,[r12,#-3]
+	strb	r5,[r14,#-12]
+	eor	r4,r8,r4,lsr#8
+	strb	r6,[r14,#-8]
+	eor	r5,r9,r5,lsr#8
+# ifdef	__thumb2__
+	itt	hs
+# endif
+	ldrhsb	r8,[r12,#-14]		@ load more input
+	ldrhsb	r9,[r12,#-10]
+	strb	r7,[r14,#-4]
+	eor	r6,r10,r6,lsr#8
+	strb	r4,[r14,#-15]
+	eor	r7,r11,r7,lsr#8
+# ifdef	__thumb2__
+	itt	hs
+# endif
+	ldrhsb	r10,[r12,#-6]
+	ldrhsb	r11,[r12,#-2]
+	strb	r5,[r14,#-11]
+	eor	r4,r8,r4,lsr#8
+	strb	r6,[r14,#-7]
+	eor	r5,r9,r5,lsr#8
+# ifdef	__thumb2__
+	itt	hs
+# endif
+	ldrhsb	r8,[r12,#-13]		@ load more input
+	ldrhsb	r9,[r12,#-9]
+	strb	r7,[r14,#-3]
+	eor	r6,r10,r6,lsr#8
+	strb	r4,[r14,#-14]
+	eor	r7,r11,r7,lsr#8
+# ifdef	__thumb2__
+	itt	hs
+# endif
+	ldrhsb	r10,[r12,#-5]
+	ldrhsb	r11,[r12,#-1]
+	strb	r5,[r14,#-10]
+	strb	r6,[r14,#-6]
+	eor	r4,r8,r4,lsr#8
+	strb	r7,[r14,#-2]
+	eor	r5,r9,r5,lsr#8
+	strb	r4,[r14,#-13]
+	eor	r6,r10,r6,lsr#8
+	strb	r5,[r14,#-9]
+	eor	r7,r11,r7,lsr#8
+	strb	r6,[r14,#-5]
+	strb	r7,[r14,#-1]
+	add	r8,sp,#4*(4+4)
+	ldmia	r8,{r8,r9,r10,r11}		@ load key material
+	ldmia	r0,{r0,r1,r2,r3,r4,r5,r6,r7}		@ load second half
+# ifdef	__thumb2__
+	itt	hi
+# endif
+	strhi	r10,[sp,#4*(16+10)]		@ copy "rx"
+	strhi	r11,[sp,#4*(16+11)]		@ copy "rx"
+	add	r0,r0,r8		@ accumulate key material
+	add	r1,r1,r9
+	add	r2,r2,r10
+# ifdef	__thumb2__
+	itete	lo
+# endif
+	eorlo	r8,r8,r8		@ zero or ...
+	ldrhsb	r8,[r12],#16			@ ... load input
+	eorlo	r9,r9,r9
+	ldrhsb	r9,[r12,#-12]
+
+	add	r3,r3,r11
+# ifdef	__thumb2__
+	itete	lo
+# endif
+	eorlo	r10,r10,r10
+	ldrhsb	r10,[r12,#-8]
+	eorlo	r11,r11,r11
+	ldrhsb	r11,[r12,#-4]
+
+	eor	r0,r8,r0		@ xor with input (or zero)
+	eor	r1,r9,r1
+# ifdef	__thumb2__
+	itt	hs
+# endif
+	ldrhsb	r8,[r12,#-15]		@ load more input
+	ldrhsb	r9,[r12,#-11]
+	eor	r2,r10,r2
+	strb	r0,[r14],#16		@ store output
+	eor	r3,r11,r3
+# ifdef	__thumb2__
+	itt	hs
+# endif
+	ldrhsb	r10,[r12,#-7]
+	ldrhsb	r11,[r12,#-3]
+	strb	r1,[r14,#-12]
+	eor	r0,r8,r0,lsr#8
+	strb	r2,[r14,#-8]
+	eor	r1,r9,r1,lsr#8
+# ifdef	__thumb2__
+	itt	hs
+# endif
+	ldrhsb	r8,[r12,#-14]		@ load more input
+	ldrhsb	r9,[r12,#-10]
+	strb	r3,[r14,#-4]
+	eor	r2,r10,r2,lsr#8
+	strb	r0,[r14,#-15]
+	eor	r3,r11,r3,lsr#8
+# ifdef	__thumb2__
+	itt	hs
+# endif
+	ldrhsb	r10,[r12,#-6]
+	ldrhsb	r11,[r12,#-2]
+	strb	r1,[r14,#-11]
+	eor	r0,r8,r0,lsr#8
+	strb	r2,[r14,#-7]
+	eor	r1,r9,r1,lsr#8
+# ifdef	__thumb2__
+	itt	hs
+# endif
+	ldrhsb	r8,[r12,#-13]		@ load more input
+	ldrhsb	r9,[r12,#-9]
+	strb	r3,[r14,#-3]
+	eor	r2,r10,r2,lsr#8
+	strb	r0,[r14,#-14]
+	eor	r3,r11,r3,lsr#8
+# ifdef	__thumb2__
+	itt	hs
+# endif
+	ldrhsb	r10,[r12,#-5]
+	ldrhsb	r11,[r12,#-1]
+	strb	r1,[r14,#-10]
+	strb	r2,[r14,#-6]
+	eor	r0,r8,r0,lsr#8
+	strb	r3,[r14,#-2]
+	eor	r1,r9,r1,lsr#8
+	strb	r0,[r14,#-13]
+	eor	r2,r10,r2,lsr#8
+	strb	r1,[r14,#-9]
+	eor	r3,r11,r3,lsr#8
+	strb	r2,[r14,#-5]
+	strb	r3,[r14,#-1]
+	add	r8,sp,#4*(4+8)
+	ldmia	r8,{r8,r9,r10,r11}		@ load key material
+	add	r4,r4,r8		@ accumulate key material
+# ifdef	__thumb2__
+	itt	hi
+# endif
+	addhi	r8,r8,#1			@ next counter value
+	strhi	r8,[sp,#4*(12)]		@ save next counter value
+	add	r5,r5,r9
+	add	r6,r6,r10
+# ifdef	__thumb2__
+	itete	lo
+# endif
+	eorlo	r8,r8,r8		@ zero or ...
+	ldrhsb	r8,[r12],#16			@ ... load input
+	eorlo	r9,r9,r9
+	ldrhsb	r9,[r12,#-12]
+
+	add	r7,r7,r11
+# ifdef	__thumb2__
+	itete	lo
+# endif
+	eorlo	r10,r10,r10
+	ldrhsb	r10,[r12,#-8]
+	eorlo	r11,r11,r11
+	ldrhsb	r11,[r12,#-4]
+
+	eor	r4,r8,r4		@ xor with input (or zero)
+	eor	r5,r9,r5
+# ifdef	__thumb2__
+	itt	hs
+# endif
+	ldrhsb	r8,[r12,#-15]		@ load more input
+	ldrhsb	r9,[r12,#-11]
+	eor	r6,r10,r6
+	strb	r4,[r14],#16		@ store output
+	eor	r7,r11,r7
+# ifdef	__thumb2__
+	itt	hs
+# endif
+	ldrhsb	r10,[r12,#-7]
+	ldrhsb	r11,[r12,#-3]
+	strb	r5,[r14,#-12]
+	eor	r4,r8,r4,lsr#8
+	strb	r6,[r14,#-8]
+	eor	r5,r9,r5,lsr#8
+# ifdef	__thumb2__
+	itt	hs
+# endif
+	ldrhsb	r8,[r12,#-14]		@ load more input
+	ldrhsb	r9,[r12,#-10]
+	strb	r7,[r14,#-4]
+	eor	r6,r10,r6,lsr#8
+	strb	r4,[r14,#-15]
+	eor	r7,r11,r7,lsr#8
+# ifdef	__thumb2__
+	itt	hs
+# endif
+	ldrhsb	r10,[r12,#-6]
+	ldrhsb	r11,[r12,#-2]
+	strb	r5,[r14,#-11]
+	eor	r4,r8,r4,lsr#8
+	strb	r6,[r14,#-7]
+	eor	r5,r9,r5,lsr#8
+# ifdef	__thumb2__
+	itt	hs
+# endif
+	ldrhsb	r8,[r12,#-13]		@ load more input
+	ldrhsb	r9,[r12,#-9]
+	strb	r7,[r14,#-3]
+	eor	r6,r10,r6,lsr#8
+	strb	r4,[r14,#-14]
+	eor	r7,r11,r7,lsr#8
+# ifdef	__thumb2__
+	itt	hs
+# endif
+	ldrhsb	r10,[r12,#-5]
+	ldrhsb	r11,[r12,#-1]
+	strb	r5,[r14,#-10]
+	strb	r6,[r14,#-6]
+	eor	r4,r8,r4,lsr#8
+	strb	r7,[r14,#-2]
+	eor	r5,r9,r5,lsr#8
+	strb	r4,[r14,#-13]
+	eor	r6,r10,r6,lsr#8
+	strb	r5,[r14,#-9]
+	eor	r7,r11,r7,lsr#8
+	strb	r6,[r14,#-5]
+	strb	r7,[r14,#-1]
+# ifdef	__thumb2__
+	it	ne
+# endif
+	ldrne	r8,[sp,#4*(32+2)]		@ re-load len
+# ifdef	__thumb2__
+	it	hs
+# endif
+	subhs	r11,r8,#64			@ len-=64
+	bhi	.Loop_outer
+
+	beq	.Ldone
+#endif
+
+.Ltail:
+	ldr	r12,[sp,#4*(32+1)]	@ load inp
+	add	r9,sp,#4*(0)
+	ldr	r14,[sp,#4*(32+0)]	@ load out
+
+.Loop_tail:
+	ldrb	r10,[r9],#1	@ read buffer on stack
+	ldrb	r11,[r12],#1		@ read input
+	subs	r8,r8,#1
+	eor	r11,r11,r10
+	strb	r11,[r14],#1		@ store output
+	bne	.Loop_tail
+
+.Ldone:
+	add	sp,sp,#4*(32+3)
+.Lno_data:
+	ldmia	sp!,{r4,r5,r6,r7,r8,r9,r10,r11,pc}
+.size	ChaCha20_ctr32,.-ChaCha20_ctr32
+#if __ARM_MAX_ARCH__>=7
+.arch	armv7-a
+.fpu	neon
+
+.type	ChaCha20_neon,%function
+.align	5
+ChaCha20_neon:
+	ldr	r12,[sp,#0]		@ pull pointer to counter and nonce
+	stmdb	sp!,{r0,r1,r2,r4-r11,lr}
+.LChaCha20_neon:
+	adr	r14,.Lsigma
+	vstmdb	sp!,{d8,d9,d10,d11,d12,d13,d14,d15}		@ ABI spec says so
+	stmdb	sp!,{r0,r1,r2,r3}
+
+	vld1.32	{q1,q2},[r3]		@ load key
+	ldmia	r3,{r4,r5,r6,r7,r8,r9,r10,r11}		@ load key
+
+	sub	sp,sp,#4*(16+16)
+	vld1.32	{q3},[r12]		@ load counter and nonce
+	add	r12,sp,#4*8
+	ldmia	r14,{r0,r1,r2,r3}		@ load sigma
+	vld1.32	{q0},[r14]!		@ load sigma
+	vld1.32	{q12},[r14]		@ one
+	vst1.32	{q2,q3},[r12]		@ copy 1/2key|counter|nonce
+	vst1.32	{q0,q1},[sp]		@ copy sigma|1/2key
+
+	str	r10,[sp,#4*(16+10)]	@ off-load "rx"
+	str	r11,[sp,#4*(16+11)]	@ off-load "rx"
+	vshl.i32	d26,d24,#1	@ two
+	vstr	d24,[sp,#4*(16+0)]
+	vshl.i32	d28,d24,#2	@ four
+	vstr	d26,[sp,#4*(16+2)]
+	vmov	q4,q0
+	vstr	d28,[sp,#4*(16+4)]
+	vmov	q8,q0
+	vmov	q5,q1
+	vmov	q9,q1
+	b	.Loop_neon_enter
+
+.align	4
+.Loop_neon_outer:
+	ldmia	sp,{r0,r1,r2,r3,r4,r5,r6,r7,r8,r9}		@ load key material
+	cmp	r11,#64*2		@ if len<=64*2
+	bls	.Lbreak_neon		@ switch to integer-only
+	vmov	q4,q0
+	str	r11,[sp,#4*(32+2)]	@ save len
+	vmov	q8,q0
+	str	r12,  [sp,#4*(32+1)]	@ save inp
+	vmov	q5,q1
+	str	r14,  [sp,#4*(32+0)]	@ save out
+	vmov	q9,q1
+.Loop_neon_enter:
+	ldr	r11, [sp,#4*(15)]
+	vadd.i32	q7,q3,q12		@ counter+1
+	ldr	r12,[sp,#4*(12)]	@ modulo-scheduled load
+	vmov	q6,q2
+	ldr	r10, [sp,#4*(13)]
+	vmov	q10,q2
+	ldr	r14,[sp,#4*(14)]
+	vadd.i32	q11,q7,q12		@ counter+2
+	str	r11, [sp,#4*(16+15)]
+	mov	r11,#10
+	add	r12,r12,#3	@ counter+3
+	b	.Loop_neon
+
+.align	4
+.Loop_neon:
+	subs	r11,r11,#1
+	vadd.i32	q0,q0,q1
+	add	r0,r0,r4
+	vadd.i32	q4,q4,q5
+	mov	r12,r12,ror#16
+	vadd.i32	q8,q8,q9
+	add	r1,r1,r5
+	veor	q3,q3,q0
+	mov	r10,r10,ror#16
+	veor	q7,q7,q4
+	eor	r12,r12,r0,ror#16
+	veor	q11,q11,q8
+	eor	r10,r10,r1,ror#16
+	vrev32.16	q3,q3
+	add	r8,r8,r12
+	vrev32.16	q7,q7
+	mov	r4,r4,ror#20
+	vrev32.16	q11,q11
+	add	r9,r9,r10
+	vadd.i32	q2,q2,q3
+	mov	r5,r5,ror#20
+	vadd.i32	q6,q6,q7
+	eor	r4,r4,r8,ror#20
+	vadd.i32	q10,q10,q11
+	eor	r5,r5,r9,ror#20
+	veor	q12,q1,q2
+	add	r0,r0,r4
+	veor	q13,q5,q6
+	mov	r12,r12,ror#24
+	veor	q14,q9,q10
+	add	r1,r1,r5
+	vshr.u32	q1,q12,#20
+	mov	r10,r10,ror#24
+	vshr.u32	q5,q13,#20
+	eor	r12,r12,r0,ror#24
+	vshr.u32	q9,q14,#20
+	eor	r10,r10,r1,ror#24
+	vsli.32	q1,q12,#12
+	add	r8,r8,r12
+	vsli.32	q5,q13,#12
+	mov	r4,r4,ror#25
+	vsli.32	q9,q14,#12
+	add	r9,r9,r10
+	vadd.i32	q0,q0,q1
+	mov	r5,r5,ror#25
+	vadd.i32	q4,q4,q5
+	str	r10,[sp,#4*(16+13)]
+	vadd.i32	q8,q8,q9
+	ldr	r10,[sp,#4*(16+15)]
+	veor	q12,q3,q0
+	eor	r4,r4,r8,ror#25
+	veor	q13,q7,q4
+	eor	r5,r5,r9,ror#25
+	veor	q14,q11,q8
+	str	r8,[sp,#4*(16+8)]
+	vshr.u32	q3,q12,#24
+	ldr	r8,[sp,#4*(16+10)]
+	vshr.u32	q7,q13,#24
+	add	r2,r2,r6
+	vshr.u32	q11,q14,#24
+	mov	r14,r14,ror#16
+	vsli.32	q3,q12,#8
+	str	r9,[sp,#4*(16+9)]
+	vsli.32	q7,q13,#8
+	ldr	r9,[sp,#4*(16+11)]
+	vsli.32	q11,q14,#8
+	add	r3,r3,r7
+	vadd.i32	q2,q2,q3
+	mov	r10,r10,ror#16
+	vadd.i32	q6,q6,q7
+	eor	r14,r14,r2,ror#16
+	vadd.i32	q10,q10,q11
+	eor	r10,r10,r3,ror#16
+	veor	q12,q1,q2
+	add	r8,r8,r14
+	veor	q13,q5,q6
+	mov	r6,r6,ror#20
+	veor	q14,q9,q10
+	add	r9,r9,r10
+	vshr.u32	q1,q12,#25
+	mov	r7,r7,ror#20
+	vshr.u32	q5,q13,#25
+	eor	r6,r6,r8,ror#20
+	vshr.u32	q9,q14,#25
+	eor	r7,r7,r9,ror#20
+	vsli.32	q1,q12,#7
+	add	r2,r2,r6
+	vsli.32	q5,q13,#7
+	mov	r14,r14,ror#24
+	vsli.32	q9,q14,#7
+	add	r3,r3,r7
+	vext.8	q2,q2,q2,#8
+	mov	r10,r10,ror#24
+	vext.8	q6,q6,q6,#8
+	eor	r14,r14,r2,ror#24
+	vext.8	q10,q10,q10,#8
+	eor	r10,r10,r3,ror#24
+	vext.8	q1,q1,q1,#4
+	add	r8,r8,r14
+	vext.8	q5,q5,q5,#4
+	mov	r6,r6,ror#25
+	vext.8	q9,q9,q9,#4
+	add	r9,r9,r10
+	vext.8	q3,q3,q3,#12
+	mov	r7,r7,ror#25
+	vext.8	q7,q7,q7,#12
+	eor	r6,r6,r8,ror#25
+	vext.8	q11,q11,q11,#12
+	eor	r7,r7,r9,ror#25
+	vadd.i32	q0,q0,q1
+	add	r0,r0,r5
+	vadd.i32	q4,q4,q5
+	mov	r10,r10,ror#16
+	vadd.i32	q8,q8,q9
+	add	r1,r1,r6
+	veor	q3,q3,q0
+	mov	r12,r12,ror#16
+	veor	q7,q7,q4
+	eor	r10,r10,r0,ror#16
+	veor	q11,q11,q8
+	eor	r12,r12,r1,ror#16
+	vrev32.16	q3,q3
+	add	r8,r8,r10
+	vrev32.16	q7,q7
+	mov	r5,r5,ror#20
+	vrev32.16	q11,q11
+	add	r9,r9,r12
+	vadd.i32	q2,q2,q3
+	mov	r6,r6,ror#20
+	vadd.i32	q6,q6,q7
+	eor	r5,r5,r8,ror#20
+	vadd.i32	q10,q10,q11
+	eor	r6,r6,r9,ror#20
+	veor	q12,q1,q2
+	add	r0,r0,r5
+	veor	q13,q5,q6
+	mov	r10,r10,ror#24
+	veor	q14,q9,q10
+	add	r1,r1,r6
+	vshr.u32	q1,q12,#20
+	mov	r12,r12,ror#24
+	vshr.u32	q5,q13,#20
+	eor	r10,r10,r0,ror#24
+	vshr.u32	q9,q14,#20
+	eor	r12,r12,r1,ror#24
+	vsli.32	q1,q12,#12
+	add	r8,r8,r10
+	vsli.32	q5,q13,#12
+	mov	r5,r5,ror#25
+	vsli.32	q9,q14,#12
+	str	r10,[sp,#4*(16+15)]
+	vadd.i32	q0,q0,q1
+	ldr	r10,[sp,#4*(16+13)]
+	vadd.i32	q4,q4,q5
+	add	r9,r9,r12
+	vadd.i32	q8,q8,q9
+	mov	r6,r6,ror#25
+	veor	q12,q3,q0
+	eor	r5,r5,r8,ror#25
+	veor	q13,q7,q4
+	eor	r6,r6,r9,ror#25
+	veor	q14,q11,q8
+	str	r8,[sp,#4*(16+10)]
+	vshr.u32	q3,q12,#24
+	ldr	r8,[sp,#4*(16+8)]
+	vshr.u32	q7,q13,#24
+	add	r2,r2,r7
+	vshr.u32	q11,q14,#24
+	mov	r10,r10,ror#16
+	vsli.32	q3,q12,#8
+	str	r9,[sp,#4*(16+11)]
+	vsli.32	q7,q13,#8
+	ldr	r9,[sp,#4*(16+9)]
+	vsli.32	q11,q14,#8
+	add	r3,r3,r4
+	vadd.i32	q2,q2,q3
+	mov	r14,r14,ror#16
+	vadd.i32	q6,q6,q7
+	eor	r10,r10,r2,ror#16
+	vadd.i32	q10,q10,q11
+	eor	r14,r14,r3,ror#16
+	veor	q12,q1,q2
+	add	r8,r8,r10
+	veor	q13,q5,q6
+	mov	r7,r7,ror#20
+	veor	q14,q9,q10
+	add	r9,r9,r14
+	vshr.u32	q1,q12,#25
+	mov	r4,r4,ror#20
+	vshr.u32	q5,q13,#25
+	eor	r7,r7,r8,ror#20
+	vshr.u32	q9,q14,#25
+	eor	r4,r4,r9,ror#20
+	vsli.32	q1,q12,#7
+	add	r2,r2,r7
+	vsli.32	q5,q13,#7
+	mov	r10,r10,ror#24
+	vsli.32	q9,q14,#7
+	add	r3,r3,r4
+	vext.8	q2,q2,q2,#8
+	mov	r14,r14,ror#24
+	vext.8	q6,q6,q6,#8
+	eor	r10,r10,r2,ror#24
+	vext.8	q10,q10,q10,#8
+	eor	r14,r14,r3,ror#24
+	vext.8	q1,q1,q1,#12
+	add	r8,r8,r10
+	vext.8	q5,q5,q5,#12
+	mov	r7,r7,ror#25
+	vext.8	q9,q9,q9,#12
+	add	r9,r9,r14
+	vext.8	q3,q3,q3,#4
+	mov	r4,r4,ror#25
+	vext.8	q7,q7,q7,#4
+	eor	r7,r7,r8,ror#25
+	vext.8	q11,q11,q11,#4
+	eor	r4,r4,r9,ror#25
+	bne	.Loop_neon
+
+	add	r11,sp,#32
+	vld1.32	{q12,q13},[sp]		@ load key material
+	vld1.32	{q14,q15},[r11]
+
+	ldr	r11,[sp,#4*(32+2)]	@ load len
+
+	str	r8, [sp,#4*(16+8)]	@ modulo-scheduled store
+	str	r9, [sp,#4*(16+9)]
+	str	r12,[sp,#4*(16+12)]
+	str	r10, [sp,#4*(16+13)]
+	str	r14,[sp,#4*(16+14)]
+
+	@ at this point we have first half of 512-bit result in
+	@ rx and second half at sp+4*(16+8)
+
+	ldr	r12,[sp,#4*(32+1)]	@ load inp
+	ldr	r14,[sp,#4*(32+0)]	@ load out
+
+	vadd.i32	q0,q0,q12		@ accumulate key material
+	vadd.i32	q4,q4,q12
+	vadd.i32	q8,q8,q12
+	vldr	d24,[sp,#4*(16+0)]	@ one
+
+	vadd.i32	q1,q1,q13
+	vadd.i32	q5,q5,q13
+	vadd.i32	q9,q9,q13
+	vldr	d26,[sp,#4*(16+2)]	@ two
+
+	vadd.i32	q2,q2,q14
+	vadd.i32	q6,q6,q14
+	vadd.i32	q10,q10,q14
+	vadd.i32	d14,d14,d24	@ counter+1
+	vadd.i32	d22,d22,d26	@ counter+2
+
+	vadd.i32	q3,q3,q15
+	vadd.i32	q7,q7,q15
+	vadd.i32	q11,q11,q15
+
+	cmp	r11,#64*4
+	blo	.Ltail_neon
+
+	vld1.8	{q12,q13},[r12]!	@ load input
+	mov	r11,sp
+	vld1.8	{q14,q15},[r12]!
+	veor	q0,q0,q12		@ xor with input
+	veor	q1,q1,q13
+	vld1.8	{q12,q13},[r12]!
+	veor	q2,q2,q14
+	veor	q3,q3,q15
+	vld1.8	{q14,q15},[r12]!
+
+	veor	q4,q4,q12
+	vst1.8	{q0,q1},[r14]!	@ store output
+	veor	q5,q5,q13
+	vld1.8	{q12,q13},[r12]!
+	veor	q6,q6,q14
+	vst1.8	{q2,q3},[r14]!
+	veor	q7,q7,q15
+	vld1.8	{q14,q15},[r12]!
+
+	veor	q8,q8,q12
+	vld1.32	{q0,q1},[r11]!	@ load for next iteration
+	veor	d25,d25,d25
+	vldr	d24,[sp,#4*(16+4)]	@ four
+	veor	q9,q9,q13
+	vld1.32	{q2,q3},[r11]
+	veor	q10,q10,q14
+	vst1.8	{q4,q5},[r14]!
+	veor	q11,q11,q15
+	vst1.8	{q6,q7},[r14]!
+
+	vadd.i32	d6,d6,d24	@ next counter value
+	vldr	d24,[sp,#4*(16+0)]	@ one
+
+	ldmia	sp,{r8,r9,r10,r11}	@ load key material
+	add	r0,r0,r8	@ accumulate key material
+	ldr	r8,[r12],#16		@ load input
+	vst1.8	{q8,q9},[r14]!
+	add	r1,r1,r9
+	ldr	r9,[r12,#-12]
+	vst1.8	{q10,q11},[r14]!
+	add	r2,r2,r10
+	ldr	r10,[r12,#-8]
+	add	r3,r3,r11
+	ldr	r11,[r12,#-4]
+# ifdef	__ARMEB__
+	rev	r0,r0
+	rev	r1,r1
+	rev	r2,r2
+	rev	r3,r3
+# endif
+	eor	r0,r0,r8	@ xor with input
+	add	r8,sp,#4*(4)
+	eor	r1,r1,r9
+	str	r0,[r14],#16		@ store output
+	eor	r2,r2,r10
+	str	r1,[r14,#-12]
+	eor	r3,r3,r11
+	ldmia	r8,{r8,r9,r10,r11}	@ load key material
+	str	r2,[r14,#-8]
+	str	r3,[r14,#-4]
+
+	add	r4,r4,r8	@ accumulate key material
+	ldr	r8,[r12],#16		@ load input
+	add	r5,r5,r9
+	ldr	r9,[r12,#-12]
+	add	r6,r6,r10
+	ldr	r10,[r12,#-8]
+	add	r7,r7,r11
+	ldr	r11,[r12,#-4]
+# ifdef	__ARMEB__
+	rev	r4,r4
+	rev	r5,r5
+	rev	r6,r6
+	rev	r7,r7
+# endif
+	eor	r4,r4,r8
+	add	r8,sp,#4*(8)
+	eor	r5,r5,r9
+	str	r4,[r14],#16		@ store output
+	eor	r6,r6,r10
+	str	r5,[r14,#-12]
+	eor	r7,r7,r11
+	ldmia	r8,{r8,r9,r10,r11}	@ load key material
+	str	r6,[r14,#-8]
+	add	r0,sp,#4*(16+8)
+	str	r7,[r14,#-4]
+
+	ldmia	r0,{r0,r1,r2,r3,r4,r5,r6,r7}	@ load second half
+
+	add	r0,r0,r8	@ accumulate key material
+	ldr	r8,[r12],#16		@ load input
+	add	r1,r1,r9
+	ldr	r9,[r12,#-12]
+# ifdef	__thumb2__
+	it	hi
+# endif
+	strhi	r10,[sp,#4*(16+10)]	@ copy "rx" while at it
+	add	r2,r2,r10
+	ldr	r10,[r12,#-8]
+# ifdef	__thumb2__
+	it	hi
+# endif
+	strhi	r11,[sp,#4*(16+11)]	@ copy "rx" while at it
+	add	r3,r3,r11
+	ldr	r11,[r12,#-4]
+# ifdef	__ARMEB__
+	rev	r0,r0
+	rev	r1,r1
+	rev	r2,r2
+	rev	r3,r3
+# endif
+	eor	r0,r0,r8
+	add	r8,sp,#4*(12)
+	eor	r1,r1,r9
+	str	r0,[r14],#16		@ store output
+	eor	r2,r2,r10
+	str	r1,[r14,#-12]
+	eor	r3,r3,r11
+	ldmia	r8,{r8,r9,r10,r11}	@ load key material
+	str	r2,[r14,#-8]
+	str	r3,[r14,#-4]
+
+	add	r4,r4,r8	@ accumulate key material
+	add	r8,r8,#4		@ next counter value
+	add	r5,r5,r9
+	str	r8,[sp,#4*(12)]	@ save next counter value
+	ldr	r8,[r12],#16		@ load input
+	add	r6,r6,r10
+	add	r4,r4,#3		@ counter+3
+	ldr	r9,[r12,#-12]
+	add	r7,r7,r11
+	ldr	r10,[r12,#-8]
+	ldr	r11,[r12,#-4]
+# ifdef	__ARMEB__
+	rev	r4,r4
+	rev	r5,r5
+	rev	r6,r6
+	rev	r7,r7
+# endif
+	eor	r4,r4,r8
+# ifdef	__thumb2__
+	it	hi
+# endif
+	ldrhi	r8,[sp,#4*(32+2)]	@ re-load len
+	eor	r5,r5,r9
+	eor	r6,r6,r10
+	str	r4,[r14],#16		@ store output
+	eor	r7,r7,r11
+	str	r5,[r14,#-12]
+	sub	r11,r8,#64*4	@ len-=64*4
+	str	r6,[r14,#-8]
+	str	r7,[r14,#-4]
+	bhi	.Loop_neon_outer
+
+	b	.Ldone_neon
+
+.align	4
+.Lbreak_neon:
+	@ harmonize NEON and integer-only stack frames: load data
+	@ from NEON frame, but save to integer-only one; distance
+	@ between the two is 4*(32+4+16-32)=4*(20).
+
+	str	r11, [sp,#4*(20+32+2)]	@ save len
+	add	r11,sp,#4*(32+4)
+	str	r12,   [sp,#4*(20+32+1)]	@ save inp
+	str	r14,   [sp,#4*(20+32+0)]	@ save out
+
+	ldr	r12,[sp,#4*(16+10)]
+	ldr	r14,[sp,#4*(16+11)]
+	vldmia	r11,{d8,d9,d10,d11,d12,d13,d14,d15}			@ fulfill ABI requirement
+	str	r12,[sp,#4*(20+16+10)]	@ copy "rx"
+	str	r14,[sp,#4*(20+16+11)]	@ copy "rx"
+
+	ldr	r11, [sp,#4*(15)]
+	ldr	r12,[sp,#4*(12)]		@ modulo-scheduled load
+	ldr	r10, [sp,#4*(13)]
+	ldr	r14,[sp,#4*(14)]
+	str	r11, [sp,#4*(20+16+15)]
+	add	r11,sp,#4*(20)
+	vst1.32	{q0,q1},[r11]!		@ copy key
+	add	sp,sp,#4*(20)			@ switch frame
+	vst1.32	{q2,q3},[r11]
+	mov	r11,#10
+	b	.Loop				@ go integer-only
+
+.align	4
+.Ltail_neon:
+	cmp	r11,#64*3
+	bhs	.L192_or_more_neon
+	cmp	r11,#64*2
+	bhs	.L128_or_more_neon
+	cmp	r11,#64*1
+	bhs	.L64_or_more_neon
+
+	add	r8,sp,#4*(8)
+	vst1.8	{q0,q1},[sp]
+	add	r10,sp,#4*(0)
+	vst1.8	{q2,q3},[r8]
+	b	.Loop_tail_neon
+
+.align	4
+.L64_or_more_neon:
+	vld1.8	{q12,q13},[r12]!
+	vld1.8	{q14,q15},[r12]!
+	veor	q0,q0,q12
+	veor	q1,q1,q13
+	veor	q2,q2,q14
+	veor	q3,q3,q15
+	vst1.8	{q0,q1},[r14]!
+	vst1.8	{q2,q3},[r14]!
+
+	beq	.Ldone_neon
+
+	add	r8,sp,#4*(8)
+	vst1.8	{q4,q5},[sp]
+	add	r10,sp,#4*(0)
+	vst1.8	{q6,q7},[r8]
+	sub	r11,r11,#64*1	@ len-=64*1
+	b	.Loop_tail_neon
+
+.align	4
+.L128_or_more_neon:
+	vld1.8	{q12,q13},[r12]!
+	vld1.8	{q14,q15},[r12]!
+	veor	q0,q0,q12
+	veor	q1,q1,q13
+	vld1.8	{q12,q13},[r12]!
+	veor	q2,q2,q14
+	veor	q3,q3,q15
+	vld1.8	{q14,q15},[r12]!
+
+	veor	q4,q4,q12
+	veor	q5,q5,q13
+	vst1.8	{q0,q1},[r14]!
+	veor	q6,q6,q14
+	vst1.8	{q2,q3},[r14]!
+	veor	q7,q7,q15
+	vst1.8	{q4,q5},[r14]!
+	vst1.8	{q6,q7},[r14]!
+
+	beq	.Ldone_neon
+
+	add	r8,sp,#4*(8)
+	vst1.8	{q8,q9},[sp]
+	add	r10,sp,#4*(0)
+	vst1.8	{q10,q11},[r8]
+	sub	r11,r11,#64*2	@ len-=64*2
+	b	.Loop_tail_neon
+
+.align	4
+.L192_or_more_neon:
+	vld1.8	{q12,q13},[r12]!
+	vld1.8	{q14,q15},[r12]!
+	veor	q0,q0,q12
+	veor	q1,q1,q13
+	vld1.8	{q12,q13},[r12]!
+	veor	q2,q2,q14
+	veor	q3,q3,q15
+	vld1.8	{q14,q15},[r12]!
+
+	veor	q4,q4,q12
+	veor	q5,q5,q13
+	vld1.8	{q12,q13},[r12]!
+	veor	q6,q6,q14
+	vst1.8	{q0,q1},[r14]!
+	veor	q7,q7,q15
+	vld1.8	{q14,q15},[r12]!
+
+	veor	q8,q8,q12
+	vst1.8	{q2,q3},[r14]!
+	veor	q9,q9,q13
+	vst1.8	{q4,q5},[r14]!
+	veor	q10,q10,q14
+	vst1.8	{q6,q7},[r14]!
+	veor	q11,q11,q15
+	vst1.8	{q8,q9},[r14]!
+	vst1.8	{q10,q11},[r14]!
+
+	beq	.Ldone_neon
+
+	ldmia	sp,{r8,r9,r10,r11}	@ load key material
+	add	r0,r0,r8	@ accumulate key material
+	add	r8,sp,#4*(4)
+	add	r1,r1,r9
+	add	r2,r2,r10
+	add	r3,r3,r11
+	ldmia	r8,{r8,r9,r10,r11}	@ load key material
+
+	add	r4,r4,r8	@ accumulate key material
+	add	r8,sp,#4*(8)
+	add	r5,r5,r9
+	add	r6,r6,r10
+	add	r7,r7,r11
+	ldmia	r8,{r8,r9,r10,r11}	@ load key material
+# ifdef	__ARMEB__
+	rev	r0,r0
+	rev	r1,r1
+	rev	r2,r2
+	rev	r3,r3
+	rev	r4,r4
+	rev	r5,r5
+	rev	r6,r6
+	rev	r7,r7
+# endif
+	stmia	sp,{r0,r1,r2,r3,r4,r5,r6,r7}
+	add	r0,sp,#4*(16+8)
+
+	ldmia	r0,{r0,r1,r2,r3,r4,r5,r6,r7}	@ load second half
+
+	add	r0,r0,r8	@ accumulate key material
+	add	r8,sp,#4*(12)
+	add	r1,r1,r9
+	add	r2,r2,r10
+	add	r3,r3,r11
+	ldmia	r8,{r8,r9,r10,r11}	@ load key material
+
+	add	r4,r4,r8	@ accumulate key material
+	add	r8,sp,#4*(8)
+	add	r5,r5,r9
+	add	r4,r4,#3		@ counter+3
+	add	r6,r6,r10
+	add	r7,r7,r11
+	ldr	r11,[sp,#4*(32+2)]	@ re-load len
+# ifdef	__ARMEB__
+	rev	r0,r0
+	rev	r1,r1
+	rev	r2,r2
+	rev	r3,r3
+	rev	r4,r4
+	rev	r5,r5
+	rev	r6,r6
+	rev	r7,r7
+# endif
+	stmia	r8,{r0,r1,r2,r3,r4,r5,r6,r7}
+	add	r10,sp,#4*(0)
+	sub	r11,r11,#64*3	@ len-=64*3
+
+.Loop_tail_neon:
+	ldrb	r8,[r10],#1	@ read buffer on stack
+	ldrb	r9,[r12],#1		@ read input
+	subs	r11,r11,#1
+	eor	r8,r8,r9
+	strb	r8,[r14],#1		@ store output
+	bne	.Loop_tail_neon
+
+.Ldone_neon:
+	add	sp,sp,#4*(32+4)
+	vldmia	sp,{d8,d9,d10,d11,d12,d13,d14,d15}
+	add	sp,sp,#4*(16+3)
+	ldmia	sp!,{r4,r5,r6,r7,r8,r9,r10,r11,pc}
+.size	ChaCha20_neon,.-ChaCha20_neon
+.comm	OPENSSL_armcap_P,4,4
+#endif
+#endif
+#endif  // !OPENSSL_NO_ASM
+.section	.note.GNU-stack,"",%progbits
--- /dev/null
+++ b/third_party/boringssl/linux-arm/crypto/fipsmodule/aesv8-armx32.S
@@ -1,0 +1,800 @@
+// This file is generated from a similarly-named Perl script in the BoringSSL
+// source tree. Do not edit by hand.
+
+#if !defined(__has_feature)
+#define __has_feature(x) 0
+#endif
+#if __has_feature(memory_sanitizer) && !defined(OPENSSL_NO_ASM)
+#define OPENSSL_NO_ASM
+#endif
+
+#if !defined(OPENSSL_NO_ASM)
+#if defined(__arm__)
+#if defined(BORINGSSL_PREFIX)
+#include <boringssl_prefix_symbols_asm.h>
+#endif
+#include <openssl/arm_arch.h>
+
+#if __ARM_MAX_ARCH__>=7
+.text
+.arch	armv7-a	@ don't confuse not-so-latest binutils with argv8 :-)
+.fpu	neon
+.code	32
+#undef	__thumb2__
+.align	5
+.Lrcon:
+.long	0x01,0x01,0x01,0x01
+.long	0x0c0f0e0d,0x0c0f0e0d,0x0c0f0e0d,0x0c0f0e0d	@ rotate-n-splat
+.long	0x1b,0x1b,0x1b,0x1b
+
+.text
+
+.globl	aes_hw_set_encrypt_key
+.hidden	aes_hw_set_encrypt_key
+.type	aes_hw_set_encrypt_key,%function
+.align	5
+aes_hw_set_encrypt_key:
+.Lenc_key:
+	mov	r3,#-1
+	cmp	r0,#0
+	beq	.Lenc_key_abort
+	cmp	r2,#0
+	beq	.Lenc_key_abort
+	mov	r3,#-2
+	cmp	r1,#128
+	blt	.Lenc_key_abort
+	cmp	r1,#256
+	bgt	.Lenc_key_abort
+	tst	r1,#0x3f
+	bne	.Lenc_key_abort
+
+	adr	r3,.Lrcon
+	cmp	r1,#192
+
+	veor	q0,q0,q0
+	vld1.8	{q3},[r0]!
+	mov	r1,#8		@ reuse r1
+	vld1.32	{q1,q2},[r3]!
+
+	blt	.Loop128
+	beq	.L192
+	b	.L256
+
+.align	4
+.Loop128:
+	vtbl.8	d20,{q3},d4
+	vtbl.8	d21,{q3},d5
+	vext.8	q9,q0,q3,#12
+	vst1.32	{q3},[r2]!
+.byte	0x00,0x43,0xf0,0xf3	@ aese q10,q0
+	subs	r1,r1,#1
+
+	veor	q3,q3,q9
+	vext.8	q9,q0,q9,#12
+	veor	q3,q3,q9
+	vext.8	q9,q0,q9,#12
+	veor	q10,q10,q1
+	veor	q3,q3,q9
+	vshl.u8	q1,q1,#1
+	veor	q3,q3,q10
+	bne	.Loop128
+
+	vld1.32	{q1},[r3]
+
+	vtbl.8	d20,{q3},d4
+	vtbl.8	d21,{q3},d5
+	vext.8	q9,q0,q3,#12
+	vst1.32	{q3},[r2]!
+.byte	0x00,0x43,0xf0,0xf3	@ aese q10,q0
+
+	veor	q3,q3,q9
+	vext.8	q9,q0,q9,#12
+	veor	q3,q3,q9
+	vext.8	q9,q0,q9,#12
+	veor	q10,q10,q1
+	veor	q3,q3,q9
+	vshl.u8	q1,q1,#1
+	veor	q3,q3,q10
+
+	vtbl.8	d20,{q3},d4
+	vtbl.8	d21,{q3},d5
+	vext.8	q9,q0,q3,#12
+	vst1.32	{q3},[r2]!
+.byte	0x00,0x43,0xf0,0xf3	@ aese q10,q0
+
+	veor	q3,q3,q9
+	vext.8	q9,q0,q9,#12
+	veor	q3,q3,q9
+	vext.8	q9,q0,q9,#12
+	veor	q10,q10,q1
+	veor	q3,q3,q9
+	veor	q3,q3,q10
+	vst1.32	{q3},[r2]
+	add	r2,r2,#0x50
+
+	mov	r12,#10
+	b	.Ldone
+
+.align	4
+.L192:
+	vld1.8	{d16},[r0]!
+	vmov.i8	q10,#8			@ borrow q10
+	vst1.32	{q3},[r2]!
+	vsub.i8	q2,q2,q10	@ adjust the mask
+
+.Loop192:
+	vtbl.8	d20,{q8},d4
+	vtbl.8	d21,{q8},d5
+	vext.8	q9,q0,q3,#12
+	vst1.32	{d16},[r2]!
+.byte	0x00,0x43,0xf0,0xf3	@ aese q10,q0
+	subs	r1,r1,#1
+
+	veor	q3,q3,q9
+	vext.8	q9,q0,q9,#12
+	veor	q3,q3,q9
+	vext.8	q9,q0,q9,#12
+	veor	q3,q3,q9
+
+	vdup.32	q9,d7[1]
+	veor	q9,q9,q8
+	veor	q10,q10,q1
+	vext.8	q8,q0,q8,#12
+	vshl.u8	q1,q1,#1
+	veor	q8,q8,q9
+	veor	q3,q3,q10
+	veor	q8,q8,q10
+	vst1.32	{q3},[r2]!
+	bne	.Loop192
+
+	mov	r12,#12
+	add	r2,r2,#0x20
+	b	.Ldone
+
+.align	4
+.L256:
+	vld1.8	{q8},[r0]
+	mov	r1,#7
+	mov	r12,#14
+	vst1.32	{q3},[r2]!
+
+.Loop256:
+	vtbl.8	d20,{q8},d4
+	vtbl.8	d21,{q8},d5
+	vext.8	q9,q0,q3,#12
+	vst1.32	{q8},[r2]!
+.byte	0x00,0x43,0xf0,0xf3	@ aese q10,q0
+	subs	r1,r1,#1
+
+	veor	q3,q3,q9
+	vext.8	q9,q0,q9,#12
+	veor	q3,q3,q9
+	vext.8	q9,q0,q9,#12
+	veor	q10,q10,q1
+	veor	q3,q3,q9
+	vshl.u8	q1,q1,#1
+	veor	q3,q3,q10
+	vst1.32	{q3},[r2]!
+	beq	.Ldone
+
+	vdup.32	q10,d7[1]
+	vext.8	q9,q0,q8,#12
+.byte	0x00,0x43,0xf0,0xf3	@ aese q10,q0
+
+	veor	q8,q8,q9
+	vext.8	q9,q0,q9,#12
+	veor	q8,q8,q9
+	vext.8	q9,q0,q9,#12
+	veor	q8,q8,q9
+
+	veor	q8,q8,q10
+	b	.Loop256
+
+.Ldone:
+	str	r12,[r2]
+	mov	r3,#0
+
+.Lenc_key_abort:
+	mov	r0,r3			@ return value
+
+	bx	lr
+.size	aes_hw_set_encrypt_key,.-aes_hw_set_encrypt_key
+
+.globl	aes_hw_set_decrypt_key
+.hidden	aes_hw_set_decrypt_key
+.type	aes_hw_set_decrypt_key,%function
+.align	5
+aes_hw_set_decrypt_key:
+	stmdb	sp!,{r4,lr}
+	bl	.Lenc_key
+
+	cmp	r0,#0
+	bne	.Ldec_key_abort
+
+	sub	r2,r2,#240		@ restore original r2
+	mov	r4,#-16
+	add	r0,r2,r12,lsl#4	@ end of key schedule
+
+	vld1.32	{q0},[r2]
+	vld1.32	{q1},[r0]
+	vst1.32	{q0},[r0],r4
+	vst1.32	{q1},[r2]!
+
+.Loop_imc:
+	vld1.32	{q0},[r2]
+	vld1.32	{q1},[r0]
+.byte	0xc0,0x03,0xb0,0xf3	@ aesimc q0,q0
+.byte	0xc2,0x23,0xb0,0xf3	@ aesimc q1,q1
+	vst1.32	{q0},[r0],r4
+	vst1.32	{q1},[r2]!
+	cmp	r0,r2
+	bhi	.Loop_imc
+
+	vld1.32	{q0},[r2]
+.byte	0xc0,0x03,0xb0,0xf3	@ aesimc q0,q0
+	vst1.32	{q0},[r0]
+
+	eor	r0,r0,r0		@ return value
+.Ldec_key_abort:
+	ldmia	sp!,{r4,pc}
+.size	aes_hw_set_decrypt_key,.-aes_hw_set_decrypt_key
+.globl	aes_hw_encrypt
+.hidden	aes_hw_encrypt
+.type	aes_hw_encrypt,%function
+.align	5
+aes_hw_encrypt:
+	AARCH64_VALID_CALL_TARGET
+	ldr	r3,[r2,#240]
+	vld1.32	{q0},[r2]!
+	vld1.8	{q2},[r0]
+	sub	r3,r3,#2
+	vld1.32	{q1},[r2]!
+
+.Loop_enc:
+.byte	0x00,0x43,0xb0,0xf3	@ aese q2,q0
+.byte	0x84,0x43,0xb0,0xf3	@ aesmc q2,q2
+	vld1.32	{q0},[r2]!
+	subs	r3,r3,#2
+.byte	0x02,0x43,0xb0,0xf3	@ aese q2,q1
+.byte	0x84,0x43,0xb0,0xf3	@ aesmc q2,q2
+	vld1.32	{q1},[r2]!
+	bgt	.Loop_enc
+
+.byte	0x00,0x43,0xb0,0xf3	@ aese q2,q0
+.byte	0x84,0x43,0xb0,0xf3	@ aesmc q2,q2
+	vld1.32	{q0},[r2]
+.byte	0x02,0x43,0xb0,0xf3	@ aese q2,q1
+	veor	q2,q2,q0
+
+	vst1.8	{q2},[r1]
+	bx	lr
+.size	aes_hw_encrypt,.-aes_hw_encrypt
+.globl	aes_hw_decrypt
+.hidden	aes_hw_decrypt
+.type	aes_hw_decrypt,%function
+.align	5
+aes_hw_decrypt:
+	AARCH64_VALID_CALL_TARGET
+	ldr	r3,[r2,#240]
+	vld1.32	{q0},[r2]!
+	vld1.8	{q2},[r0]
+	sub	r3,r3,#2
+	vld1.32	{q1},[r2]!
+
+.Loop_dec:
+.byte	0x40,0x43,0xb0,0xf3	@ aesd q2,q0
+.byte	0xc4,0x43,0xb0,0xf3	@ aesimc q2,q2
+	vld1.32	{q0},[r2]!
+	subs	r3,r3,#2
+.byte	0x42,0x43,0xb0,0xf3	@ aesd q2,q1
+.byte	0xc4,0x43,0xb0,0xf3	@ aesimc q2,q2
+	vld1.32	{q1},[r2]!
+	bgt	.Loop_dec
+
+.byte	0x40,0x43,0xb0,0xf3	@ aesd q2,q0
+.byte	0xc4,0x43,0xb0,0xf3	@ aesimc q2,q2
+	vld1.32	{q0},[r2]
+.byte	0x42,0x43,0xb0,0xf3	@ aesd q2,q1
+	veor	q2,q2,q0
+
+	vst1.8	{q2},[r1]
+	bx	lr
+.size	aes_hw_decrypt,.-aes_hw_decrypt
+.globl	aes_hw_cbc_encrypt
+.hidden	aes_hw_cbc_encrypt
+.type	aes_hw_cbc_encrypt,%function
+.align	5
+aes_hw_cbc_encrypt:
+	mov	ip,sp
+	stmdb	sp!,{r4,r5,r6,r7,r8,lr}
+	vstmdb	sp!,{d8,d9,d10,d11,d12,d13,d14,d15}            @ ABI specification says so
+	ldmia	ip,{r4,r5}		@ load remaining args
+	subs	r2,r2,#16
+	mov	r8,#16
+	blo	.Lcbc_abort
+	moveq	r8,#0
+
+	cmp	r5,#0			@ en- or decrypting?
+	ldr	r5,[r3,#240]
+	and	r2,r2,#-16
+	vld1.8	{q6},[r4]
+	vld1.8	{q0},[r0],r8
+
+	vld1.32	{q8,q9},[r3]		@ load key schedule...
+	sub	r5,r5,#6
+	add	r7,r3,r5,lsl#4	@ pointer to last 7 round keys
+	sub	r5,r5,#2
+	vld1.32	{q10,q11},[r7]!
+	vld1.32	{q12,q13},[r7]!
+	vld1.32	{q14,q15},[r7]!
+	vld1.32	{q7},[r7]
+
+	add	r7,r3,#32
+	mov	r6,r5
+	beq	.Lcbc_dec
+
+	cmp	r5,#2
+	veor	q0,q0,q6
+	veor	q5,q8,q7
+	beq	.Lcbc_enc128
+
+	vld1.32	{q2,q3},[r7]
+	add	r7,r3,#16
+	add	r6,r3,#16*4
+	add	r12,r3,#16*5
+.byte	0x20,0x03,0xb0,0xf3	@ aese q0,q8
+.byte	0x80,0x03,0xb0,0xf3	@ aesmc q0,q0
+	add	r14,r3,#16*6
+	add	r3,r3,#16*7
+	b	.Lenter_cbc_enc
+
+.align	4
+.Loop_cbc_enc:
+.byte	0x20,0x03,0xb0,0xf3	@ aese q0,q8
+.byte	0x80,0x03,0xb0,0xf3	@ aesmc q0,q0
+	vst1.8	{q6},[r1]!
+.Lenter_cbc_enc:
+.byte	0x22,0x03,0xb0,0xf3	@ aese q0,q9
+.byte	0x80,0x03,0xb0,0xf3	@ aesmc q0,q0
+.byte	0x04,0x03,0xb0,0xf3	@ aese q0,q2
+.byte	0x80,0x03,0xb0,0xf3	@ aesmc q0,q0
+	vld1.32	{q8},[r6]
+	cmp	r5,#4
+.byte	0x06,0x03,0xb0,0xf3	@ aese q0,q3
+.byte	0x80,0x03,0xb0,0xf3	@ aesmc q0,q0
+	vld1.32	{q9},[r12]
+	beq	.Lcbc_enc192
+
+.byte	0x20,0x03,0xb0,0xf3	@ aese q0,q8
+.byte	0x80,0x03,0xb0,0xf3	@ aesmc q0,q0
+	vld1.32	{q8},[r14]
+.byte	0x22,0x03,0xb0,0xf3	@ aese q0,q9
+.byte	0x80,0x03,0xb0,0xf3	@ aesmc q0,q0
+	vld1.32	{q9},[r3]
+	nop
+
+.Lcbc_enc192:
+.byte	0x20,0x03,0xb0,0xf3	@ aese q0,q8
+.byte	0x80,0x03,0xb0,0xf3	@ aesmc q0,q0
+	subs	r2,r2,#16
+.byte	0x22,0x03,0xb0,0xf3	@ aese q0,q9
+.byte	0x80,0x03,0xb0,0xf3	@ aesmc q0,q0
+	moveq	r8,#0
+.byte	0x24,0x03,0xb0,0xf3	@ aese q0,q10
+.byte	0x80,0x03,0xb0,0xf3	@ aesmc q0,q0
+.byte	0x26,0x03,0xb0,0xf3	@ aese q0,q11
+.byte	0x80,0x03,0xb0,0xf3	@ aesmc q0,q0
+	vld1.8	{q8},[r0],r8
+.byte	0x28,0x03,0xb0,0xf3	@ aese q0,q12
+.byte	0x80,0x03,0xb0,0xf3	@ aesmc q0,q0
+	veor	q8,q8,q5
+.byte	0x2a,0x03,0xb0,0xf3	@ aese q0,q13
+.byte	0x80,0x03,0xb0,0xf3	@ aesmc q0,q0
+	vld1.32	{q9},[r7]		@ re-pre-load rndkey[1]
+.byte	0x2c,0x03,0xb0,0xf3	@ aese q0,q14
+.byte	0x80,0x03,0xb0,0xf3	@ aesmc q0,q0
+.byte	0x2e,0x03,0xb0,0xf3	@ aese q0,q15
+	veor	q6,q0,q7
+	bhs	.Loop_cbc_enc
+
+	vst1.8	{q6},[r1]!
+	b	.Lcbc_done
+
+.align	5
+.Lcbc_enc128:
+	vld1.32	{q2,q3},[r7]
+.byte	0x20,0x03,0xb0,0xf3	@ aese q0,q8
+.byte	0x80,0x03,0xb0,0xf3	@ aesmc q0,q0
+	b	.Lenter_cbc_enc128
+.Loop_cbc_enc128:
+.byte	0x20,0x03,0xb0,0xf3	@ aese q0,q8
+.byte	0x80,0x03,0xb0,0xf3	@ aesmc q0,q0
+	vst1.8	{q6},[r1]!
+.Lenter_cbc_enc128:
+.byte	0x22,0x03,0xb0,0xf3	@ aese q0,q9
+.byte	0x80,0x03,0xb0,0xf3	@ aesmc q0,q0
+	subs	r2,r2,#16
+.byte	0x04,0x03,0xb0,0xf3	@ aese q0,q2
+.byte	0x80,0x03,0xb0,0xf3	@ aesmc q0,q0
+	moveq	r8,#0
+.byte	0x06,0x03,0xb0,0xf3	@ aese q0,q3
+.byte	0x80,0x03,0xb0,0xf3	@ aesmc q0,q0
+.byte	0x24,0x03,0xb0,0xf3	@ aese q0,q10
+.byte	0x80,0x03,0xb0,0xf3	@ aesmc q0,q0
+.byte	0x26,0x03,0xb0,0xf3	@ aese q0,q11
+.byte	0x80,0x03,0xb0,0xf3	@ aesmc q0,q0
+	vld1.8	{q8},[r0],r8
+.byte	0x28,0x03,0xb0,0xf3	@ aese q0,q12
+.byte	0x80,0x03,0xb0,0xf3	@ aesmc q0,q0
+.byte	0x2a,0x03,0xb0,0xf3	@ aese q0,q13
+.byte	0x80,0x03,0xb0,0xf3	@ aesmc q0,q0
+.byte	0x2c,0x03,0xb0,0xf3	@ aese q0,q14
+.byte	0x80,0x03,0xb0,0xf3	@ aesmc q0,q0
+	veor	q8,q8,q5
+.byte	0x2e,0x03,0xb0,0xf3	@ aese q0,q15
+	veor	q6,q0,q7
+	bhs	.Loop_cbc_enc128
+
+	vst1.8	{q6},[r1]!
+	b	.Lcbc_done
+.align	5
+.Lcbc_dec:
+	vld1.8	{q10},[r0]!
+	subs	r2,r2,#32		@ bias
+	add	r6,r5,#2
+	vorr	q3,q0,q0
+	vorr	q1,q0,q0
+	vorr	q11,q10,q10
+	blo	.Lcbc_dec_tail
+
+	vorr	q1,q10,q10
+	vld1.8	{q10},[r0]!
+	vorr	q2,q0,q0
+	vorr	q3,q1,q1
+	vorr	q11,q10,q10
+
+.Loop3x_cbc_dec:
+.byte	0x60,0x03,0xb0,0xf3	@ aesd q0,q8
+.byte	0xc0,0x03,0xb0,0xf3	@ aesimc q0,q0
+.byte	0x60,0x23,0xb0,0xf3	@ aesd q1,q8
+.byte	0xc2,0x23,0xb0,0xf3	@ aesimc q1,q1
+.byte	0x60,0x43,0xf0,0xf3	@ aesd q10,q8
+.byte	0xe4,0x43,0xf0,0xf3	@ aesimc q10,q10
+	vld1.32	{q8},[r7]!
+	subs	r6,r6,#2
+.byte	0x62,0x03,0xb0,0xf3	@ aesd q0,q9
+.byte	0xc0,0x03,0xb0,0xf3	@ aesimc q0,q0
+.byte	0x62,0x23,0xb0,0xf3	@ aesd q1,q9
+.byte	0xc2,0x23,0xb0,0xf3	@ aesimc q1,q1
+.byte	0x62,0x43,0xf0,0xf3	@ aesd q10,q9
+.byte	0xe4,0x43,0xf0,0xf3	@ aesimc q10,q10
+	vld1.32	{q9},[r7]!
+	bgt	.Loop3x_cbc_dec
+
+.byte	0x60,0x03,0xb0,0xf3	@ aesd q0,q8
+.byte	0xc0,0x03,0xb0,0xf3	@ aesimc q0,q0
+.byte	0x60,0x23,0xb0,0xf3	@ aesd q1,q8
+.byte	0xc2,0x23,0xb0,0xf3	@ aesimc q1,q1
+.byte	0x60,0x43,0xf0,0xf3	@ aesd q10,q8
+.byte	0xe4,0x43,0xf0,0xf3	@ aesimc q10,q10
+	veor	q4,q6,q7
+	subs	r2,r2,#0x30
+	veor	q5,q2,q7
+	movlo	r6,r2			@ r6 is zero at this point
+.byte	0x62,0x03,0xb0,0xf3	@ aesd q0,q9
+.byte	0xc0,0x03,0xb0,0xf3	@ aesimc q0,q0
+.byte	0x62,0x23,0xb0,0xf3	@ aesd q1,q9
+.byte	0xc2,0x23,0xb0,0xf3	@ aesimc q1,q1
+.byte	0x62,0x43,0xf0,0xf3	@ aesd q10,q9
+.byte	0xe4,0x43,0xf0,0xf3	@ aesimc q10,q10
+	veor	q9,q3,q7
+	add	r0,r0,r6		@ r0 is adjusted in such a way that
+					@ at exit from the loop q1-q10
+					@ are loaded with last "words"
+	vorr	q6,q11,q11
+	mov	r7,r3
+.byte	0x68,0x03,0xb0,0xf3	@ aesd q0,q12
+.byte	0xc0,0x03,0xb0,0xf3	@ aesimc q0,q0
+.byte	0x68,0x23,0xb0,0xf3	@ aesd q1,q12
+.byte	0xc2,0x23,0xb0,0xf3	@ aesimc q1,q1
+.byte	0x68,0x43,0xf0,0xf3	@ aesd q10,q12
+.byte	0xe4,0x43,0xf0,0xf3	@ aesimc q10,q10
+	vld1.8	{q2},[r0]!
+.byte	0x6a,0x03,0xb0,0xf3	@ aesd q0,q13
+.byte	0xc0,0x03,0xb0,0xf3	@ aesimc q0,q0
+.byte	0x6a,0x23,0xb0,0xf3	@ aesd q1,q13
+.byte	0xc2,0x23,0xb0,0xf3	@ aesimc q1,q1
+.byte	0x6a,0x43,0xf0,0xf3	@ aesd q10,q13
+.byte	0xe4,0x43,0xf0,0xf3	@ aesimc q10,q10
+	vld1.8	{q3},[r0]!
+.byte	0x6c,0x03,0xb0,0xf3	@ aesd q0,q14
+.byte	0xc0,0x03,0xb0,0xf3	@ aesimc q0,q0
+.byte	0x6c,0x23,0xb0,0xf3	@ aesd q1,q14
+.byte	0xc2,0x23,0xb0,0xf3	@ aesimc q1,q1
+.byte	0x6c,0x43,0xf0,0xf3	@ aesd q10,q14
+.byte	0xe4,0x43,0xf0,0xf3	@ aesimc q10,q10
+	vld1.8	{q11},[r0]!
+.byte	0x6e,0x03,0xb0,0xf3	@ aesd q0,q15
+.byte	0x6e,0x23,0xb0,0xf3	@ aesd q1,q15
+.byte	0x6e,0x43,0xf0,0xf3	@ aesd q10,q15
+	vld1.32	{q8},[r7]!	@ re-pre-load rndkey[0]
+	add	r6,r5,#2
+	veor	q4,q4,q0
+	veor	q5,q5,q1
+	veor	q10,q10,q9
+	vld1.32	{q9},[r7]!	@ re-pre-load rndkey[1]
+	vst1.8	{q4},[r1]!
+	vorr	q0,q2,q2
+	vst1.8	{q5},[r1]!
+	vorr	q1,q3,q3
+	vst1.8	{q10},[r1]!
+	vorr	q10,q11,q11
+	bhs	.Loop3x_cbc_dec
+
+	cmn	r2,#0x30
+	beq	.Lcbc_done
+	nop
+
+.Lcbc_dec_tail:
+.byte	0x60,0x23,0xb0,0xf3	@ aesd q1,q8
+.byte	0xc2,0x23,0xb0,0xf3	@ aesimc q1,q1
+.byte	0x60,0x43,0xf0,0xf3	@ aesd q10,q8
+.byte	0xe4,0x43,0xf0,0xf3	@ aesimc q10,q10
+	vld1.32	{q8},[r7]!
+	subs	r6,r6,#2
+.byte	0x62,0x23,0xb0,0xf3	@ aesd q1,q9
+.byte	0xc2,0x23,0xb0,0xf3	@ aesimc q1,q1
+.byte	0x62,0x43,0xf0,0xf3	@ aesd q10,q9
+.byte	0xe4,0x43,0xf0,0xf3	@ aesimc q10,q10
+	vld1.32	{q9},[r7]!
+	bgt	.Lcbc_dec_tail
+
+.byte	0x60,0x23,0xb0,0xf3	@ aesd q1,q8
+.byte	0xc2,0x23,0xb0,0xf3	@ aesimc q1,q1
+.byte	0x60,0x43,0xf0,0xf3	@ aesd q10,q8
+.byte	0xe4,0x43,0xf0,0xf3	@ aesimc q10,q10
+.byte	0x62,0x23,0xb0,0xf3	@ aesd q1,q9
+.byte	0xc2,0x23,0xb0,0xf3	@ aesimc q1,q1
+.byte	0x62,0x43,0xf0,0xf3	@ aesd q10,q9
+.byte	0xe4,0x43,0xf0,0xf3	@ aesimc q10,q10
+.byte	0x68,0x23,0xb0,0xf3	@ aesd q1,q12
+.byte	0xc2,0x23,0xb0,0xf3	@ aesimc q1,q1
+.byte	0x68,0x43,0xf0,0xf3	@ aesd q10,q12
+.byte	0xe4,0x43,0xf0,0xf3	@ aesimc q10,q10
+	cmn	r2,#0x20
+.byte	0x6a,0x23,0xb0,0xf3	@ aesd q1,q13
+.byte	0xc2,0x23,0xb0,0xf3	@ aesimc q1,q1
+.byte	0x6a,0x43,0xf0,0xf3	@ aesd q10,q13
+.byte	0xe4,0x43,0xf0,0xf3	@ aesimc q10,q10
+	veor	q5,q6,q7
+.byte	0x6c,0x23,0xb0,0xf3	@ aesd q1,q14
+.byte	0xc2,0x23,0xb0,0xf3	@ aesimc q1,q1
+.byte	0x6c,0x43,0xf0,0xf3	@ aesd q10,q14
+.byte	0xe4,0x43,0xf0,0xf3	@ aesimc q10,q10
+	veor	q9,q3,q7
+.byte	0x6e,0x23,0xb0,0xf3	@ aesd q1,q15
+.byte	0x6e,0x43,0xf0,0xf3	@ aesd q10,q15
+	beq	.Lcbc_dec_one
+	veor	q5,q5,q1
+	veor	q9,q9,q10
+	vorr	q6,q11,q11
+	vst1.8	{q5},[r1]!
+	vst1.8	{q9},[r1]!
+	b	.Lcbc_done
+
+.Lcbc_dec_one:
+	veor	q5,q5,q10
+	vorr	q6,q11,q11
+	vst1.8	{q5},[r1]!
+
+.Lcbc_done:
+	vst1.8	{q6},[r4]
+.Lcbc_abort:
+	vldmia	sp!,{d8,d9,d10,d11,d12,d13,d14,d15}
+	ldmia	sp!,{r4,r5,r6,r7,r8,pc}
+.size	aes_hw_cbc_encrypt,.-aes_hw_cbc_encrypt
+.globl	aes_hw_ctr32_encrypt_blocks
+.hidden	aes_hw_ctr32_encrypt_blocks
+.type	aes_hw_ctr32_encrypt_blocks,%function
+.align	5
+aes_hw_ctr32_encrypt_blocks:
+	mov	ip,sp
+	stmdb	sp!,{r4,r5,r6,r7,r8,r9,r10,lr}
+	vstmdb	sp!,{d8,d9,d10,d11,d12,d13,d14,d15}            @ ABI specification says so
+	ldr	r4, [ip]		@ load remaining arg
+	ldr	r5,[r3,#240]
+
+	ldr	r8, [r4, #12]
+	vld1.32	{q0},[r4]
+
+	vld1.32	{q8,q9},[r3]		@ load key schedule...
+	sub	r5,r5,#4
+	mov	r12,#16
+	cmp	r2,#2
+	add	r7,r3,r5,lsl#4	@ pointer to last 5 round keys
+	sub	r5,r5,#2
+	vld1.32	{q12,q13},[r7]!
+	vld1.32	{q14,q15},[r7]!
+	vld1.32	{q7},[r7]
+	add	r7,r3,#32
+	mov	r6,r5
+	movlo	r12,#0
+
+	@ ARM Cortex-A57 and Cortex-A72 cores running in 32-bit mode are
+	@ affected by silicon errata #1742098 [0] and #1655431 [1],
+	@ respectively, where the second instruction of an aese/aesmc
+	@ instruction pair may execute twice if an interrupt is taken right
+	@ after the first instruction consumes an input register of which a
+	@ single 32-bit lane has been updated the last time it was modified.
+	@
+	@ This function uses a counter in one 32-bit lane. The vmov.32
+	@ instructions could write to q1 and q10 directly, but that trips
+	@ these bugs. We write to q6 and copy to the final register as a
+	@ workaround.
+	@
+	@ [0] ARM-EPM-049219 v23 Cortex-A57 MPCore Software Developers Errata Notice
+	@ [1] ARM-EPM-012079 v11.0 Cortex-A72 MPCore Software Developers Errata Notice
+#ifndef __ARMEB__
+	rev	r8, r8
+#endif
+	add	r10, r8, #1
+	vorr	q6,q0,q0
+	rev	r10, r10
+	vmov.32	d13[1],r10
+	add	r8, r8, #2
+	vorr	q1,q6,q6
+	bls	.Lctr32_tail
+	rev	r12, r8
+	vmov.32	d13[1],r12
+	sub	r2,r2,#3		@ bias
+	vorr	q10,q6,q6
+	b	.Loop3x_ctr32
+
+.align	4
+.Loop3x_ctr32:
+.byte	0x20,0x03,0xb0,0xf3	@ aese q0,q8
+.byte	0x80,0x03,0xb0,0xf3	@ aesmc q0,q0
+.byte	0x20,0x23,0xb0,0xf3	@ aese q1,q8
+.byte	0x82,0x23,0xb0,0xf3	@ aesmc q1,q1
+.byte	0x20,0x43,0xf0,0xf3	@ aese q10,q8
+.byte	0xa4,0x43,0xf0,0xf3	@ aesmc q10,q10
+	vld1.32	{q8},[r7]!
+	subs	r6,r6,#2
+.byte	0x22,0x03,0xb0,0xf3	@ aese q0,q9
+.byte	0x80,0x03,0xb0,0xf3	@ aesmc q0,q0
+.byte	0x22,0x23,0xb0,0xf3	@ aese q1,q9
+.byte	0x82,0x23,0xb0,0xf3	@ aesmc q1,q1
+.byte	0x22,0x43,0xf0,0xf3	@ aese q10,q9
+.byte	0xa4,0x43,0xf0,0xf3	@ aesmc q10,q10
+	vld1.32	{q9},[r7]!
+	bgt	.Loop3x_ctr32
+
+.byte	0x20,0x03,0xb0,0xf3	@ aese q0,q8
+.byte	0x80,0x83,0xb0,0xf3	@ aesmc q4,q0
+.byte	0x20,0x23,0xb0,0xf3	@ aese q1,q8
+.byte	0x82,0xa3,0xb0,0xf3	@ aesmc q5,q1
+	vld1.8	{q2},[r0]!
+	add	r9,r8,#1
+.byte	0x20,0x43,0xf0,0xf3	@ aese q10,q8
+.byte	0xa4,0x43,0xf0,0xf3	@ aesmc q10,q10
+	vld1.8	{q3},[r0]!
+	rev	r9,r9
+.byte	0x22,0x83,0xb0,0xf3	@ aese q4,q9
+.byte	0x88,0x83,0xb0,0xf3	@ aesmc q4,q4
+.byte	0x22,0xa3,0xb0,0xf3	@ aese q5,q9
+.byte	0x8a,0xa3,0xb0,0xf3	@ aesmc q5,q5
+	vld1.8	{q11},[r0]!
+	mov	r7,r3
+.byte	0x22,0x43,0xf0,0xf3	@ aese q10,q9
+.byte	0xa4,0x23,0xf0,0xf3	@ aesmc q9,q10
+.byte	0x28,0x83,0xb0,0xf3	@ aese q4,q12
+.byte	0x88,0x83,0xb0,0xf3	@ aesmc q4,q4
+.byte	0x28,0xa3,0xb0,0xf3	@ aese q5,q12
+.byte	0x8a,0xa3,0xb0,0xf3	@ aesmc q5,q5
+	veor	q2,q2,q7
+	add	r10,r8,#2
+.byte	0x28,0x23,0xf0,0xf3	@ aese q9,q12
+.byte	0xa2,0x23,0xf0,0xf3	@ aesmc q9,q9
+	veor	q3,q3,q7
+	add	r8,r8,#3
+.byte	0x2a,0x83,0xb0,0xf3	@ aese q4,q13
+.byte	0x88,0x83,0xb0,0xf3	@ aesmc q4,q4
+.byte	0x2a,0xa3,0xb0,0xf3	@ aese q5,q13
+.byte	0x8a,0xa3,0xb0,0xf3	@ aesmc q5,q5
+	 @ Note the logic to update q0, q1, and q10 is written to work
+	 @ around a bug in ARM Cortex-A57 and Cortex-A72 cores running in
+	 @ 32-bit mode. See the comment above.
+	veor	q11,q11,q7
+	vmov.32	d13[1], r9
+.byte	0x2a,0x23,0xf0,0xf3	@ aese q9,q13
+.byte	0xa2,0x23,0xf0,0xf3	@ aesmc q9,q9
+	vorr	q0,q6,q6
+	rev	r10,r10
+.byte	0x2c,0x83,0xb0,0xf3	@ aese q4,q14
+.byte	0x88,0x83,0xb0,0xf3	@ aesmc q4,q4
+	vmov.32	d13[1], r10
+	rev	r12,r8
+.byte	0x2c,0xa3,0xb0,0xf3	@ aese q5,q14
+.byte	0x8a,0xa3,0xb0,0xf3	@ aesmc q5,q5
+	vorr	q1,q6,q6
+	vmov.32	d13[1], r12
+.byte	0x2c,0x23,0xf0,0xf3	@ aese q9,q14
+.byte	0xa2,0x23,0xf0,0xf3	@ aesmc q9,q9
+	vorr	q10,q6,q6
+	subs	r2,r2,#3
+.byte	0x2e,0x83,0xb0,0xf3	@ aese q4,q15
+.byte	0x2e,0xa3,0xb0,0xf3	@ aese q5,q15
+.byte	0x2e,0x23,0xf0,0xf3	@ aese q9,q15
+
+	veor	q2,q2,q4
+	vld1.32	{q8},[r7]!	@ re-pre-load rndkey[0]
+	vst1.8	{q2},[r1]!
+	veor	q3,q3,q5
+	mov	r6,r5
+	vst1.8	{q3},[r1]!
+	veor	q11,q11,q9
+	vld1.32	{q9},[r7]!	@ re-pre-load rndkey[1]
+	vst1.8	{q11},[r1]!
+	bhs	.Loop3x_ctr32
+
+	adds	r2,r2,#3
+	beq	.Lctr32_done
+	cmp	r2,#1
+	mov	r12,#16
+	moveq	r12,#0
+
+.Lctr32_tail:
+.byte	0x20,0x03,0xb0,0xf3	@ aese q0,q8
+.byte	0x80,0x03,0xb0,0xf3	@ aesmc q0,q0
+.byte	0x20,0x23,0xb0,0xf3	@ aese q1,q8
+.byte	0x82,0x23,0xb0,0xf3	@ aesmc q1,q1
+	vld1.32	{q8},[r7]!
+	subs	r6,r6,#2
+.byte	0x22,0x03,0xb0,0xf3	@ aese q0,q9
+.byte	0x80,0x03,0xb0,0xf3	@ aesmc q0,q0
+.byte	0x22,0x23,0xb0,0xf3	@ aese q1,q9
+.byte	0x82,0x23,0xb0,0xf3	@ aesmc q1,q1
+	vld1.32	{q9},[r7]!
+	bgt	.Lctr32_tail
+
+.byte	0x20,0x03,0xb0,0xf3	@ aese q0,q8
+.byte	0x80,0x03,0xb0,0xf3	@ aesmc q0,q0
+.byte	0x20,0x23,0xb0,0xf3	@ aese q1,q8
+.byte	0x82,0x23,0xb0,0xf3	@ aesmc q1,q1
+.byte	0x22,0x03,0xb0,0xf3	@ aese q0,q9
+.byte	0x80,0x03,0xb0,0xf3	@ aesmc q0,q0
+.byte	0x22,0x23,0xb0,0xf3	@ aese q1,q9
+.byte	0x82,0x23,0xb0,0xf3	@ aesmc q1,q1
+	vld1.8	{q2},[r0],r12
+.byte	0x28,0x03,0xb0,0xf3	@ aese q0,q12
+.byte	0x80,0x03,0xb0,0xf3	@ aesmc q0,q0
+.byte	0x28,0x23,0xb0,0xf3	@ aese q1,q12
+.byte	0x82,0x23,0xb0,0xf3	@ aesmc q1,q1
+	vld1.8	{q3},[r0]
+.byte	0x2a,0x03,0xb0,0xf3	@ aese q0,q13
+.byte	0x80,0x03,0xb0,0xf3	@ aesmc q0,q0
+.byte	0x2a,0x23,0xb0,0xf3	@ aese q1,q13
+.byte	0x82,0x23,0xb0,0xf3	@ aesmc q1,q1
+	veor	q2,q2,q7
+.byte	0x2c,0x03,0xb0,0xf3	@ aese q0,q14
+.byte	0x80,0x03,0xb0,0xf3	@ aesmc q0,q0
+.byte	0x2c,0x23,0xb0,0xf3	@ aese q1,q14
+.byte	0x82,0x23,0xb0,0xf3	@ aesmc q1,q1
+	veor	q3,q3,q7
+.byte	0x2e,0x03,0xb0,0xf3	@ aese q0,q15
+.byte	0x2e,0x23,0xb0,0xf3	@ aese q1,q15
+
+	cmp	r2,#1
+	veor	q2,q2,q0
+	veor	q3,q3,q1
+	vst1.8	{q2},[r1]!
+	beq	.Lctr32_done
+	vst1.8	{q3},[r1]
+
+.Lctr32_done:
+	vldmia	sp!,{d8,d9,d10,d11,d12,d13,d14,d15}
+	ldmia	sp!,{r4,r5,r6,r7,r8,r9,r10,pc}
+.size	aes_hw_ctr32_encrypt_blocks,.-aes_hw_ctr32_encrypt_blocks
+#endif
+#endif
+#endif  // !OPENSSL_NO_ASM
+.section	.note.GNU-stack,"",%progbits
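aes_hw_ctr32_encrypt_blocks above treats the last 32-bit word of the 16-byte IV as a big-endian block counter (ldr r8,[r4,#12] plus rev), bumps it once per block, and splices it back into the NEON counter blocks through q6 to sidestep the Cortex-A57/A72 errata noted in the comments. A rough C model of just that counter bookkeeping follows; aes_block_encrypt() is a hypothetical single-block placeholder, not a real BoringSSL function, and the whole routine is a reading aid rather than the actual interface.

#include <stddef.h>
#include <stdint.h>
#include <string.h>

static uint32_t load_be32(const uint8_t *p)
{
	return ((uint32_t)p[0] << 24) | ((uint32_t)p[1] << 16) |
	       ((uint32_t)p[2] << 8) | (uint32_t)p[3];
}

static void store_be32(uint8_t *p, uint32_t v)
{
	p[0] = (uint8_t)(v >> 24); p[1] = (uint8_t)(v >> 16);
	p[2] = (uint8_t)(v >> 8);  p[3] = (uint8_t)v;
}

/* Hypothetical single-block AES primitive standing in for the key schedule
 * loads and the aese/aesmc rounds performed in NEON registers above. */
void aes_block_encrypt(const uint8_t in[16], uint8_t out[16], const void *key);

static void ctr32_encrypt(const uint8_t *in, uint8_t *out, size_t blocks,
                          const void *key, const uint8_t ivec[16])
{
	uint8_t counter_block[16], keystream[16];

	memcpy(counter_block, ivec, 16);
	uint32_t ctr = load_be32(counter_block + 12);	/* ldr r8,[r4,#12] + rev */

	for (size_t i = 0; i < blocks; i++) {
		aes_block_encrypt(counter_block, keystream, key);
		for (size_t j = 0; j < 16; j++)		/* veor + vst1.8 */
			out[16 * i + j] = in[16 * i + j] ^ keystream[j];
		store_be32(counter_block + 12, ++ctr);	/* add/rev/vmov.32 d13[1] */
	}
}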
--- /dev/null
+++ b/third_party/boringssl/linux-arm/crypto/fipsmodule/armv4-mont.S
@@ -1,0 +1,977 @@
+// This file is generated from a similarly-named Perl script in the BoringSSL
+// source tree. Do not edit by hand.
+
+#if !defined(__has_feature)
+#define __has_feature(x) 0
+#endif
+#if __has_feature(memory_sanitizer) && !defined(OPENSSL_NO_ASM)
+#define OPENSSL_NO_ASM
+#endif
+
+#if !defined(OPENSSL_NO_ASM)
+#if defined(__arm__)
+#if defined(BORINGSSL_PREFIX)
+#include <boringssl_prefix_symbols_asm.h>
+#endif
+#include <openssl/arm_arch.h>
+
+@ Silence ARMv8 deprecated IT instruction warnings. This file is used by both
+@ ARMv7 and ARMv8 processors and does not use ARMv8 instructions.
+.arch	armv7-a
+
+.text
+#if defined(__thumb2__)
+.syntax	unified
+.thumb
+#else
+.code	32
+#endif
+
+#if __ARM_MAX_ARCH__>=7
+.align	5
+.LOPENSSL_armcap:
+.word	OPENSSL_armcap_P-.Lbn_mul_mont
+#endif
+
+.globl	bn_mul_mont
+.hidden	bn_mul_mont
+.type	bn_mul_mont,%function
+
+.align	5
+bn_mul_mont:
+.Lbn_mul_mont:
+	ldr	ip,[sp,#4]		@ load num
+	stmdb	sp!,{r0,r2}		@ sp points at argument block
+#if __ARM_MAX_ARCH__>=7
+	tst	ip,#7
+	bne	.Lialu
+	adr	r0,.Lbn_mul_mont
+	ldr	r2,.LOPENSSL_armcap
+	ldr	r0,[r0,r2]
+#ifdef	__APPLE__
+	ldr	r0,[r0]
+#endif
+	tst	r0,#ARMV7_NEON		@ NEON available?
+	ldmia	sp, {r0,r2}
+	beq	.Lialu
+	add	sp,sp,#8
+	b	bn_mul8x_mont_neon
+.align	4
+.Lialu:
+#endif
+	cmp	ip,#2
+	mov	r0,ip			@ load num
+#ifdef	__thumb2__
+	ittt	lt
+#endif
+	movlt	r0,#0
+	addlt	sp,sp,#2*4
+	blt	.Labrt
+
+	stmdb	sp!,{r4,r5,r6,r7,r8,r9,r10,r11,r12,lr}		@ save 10 registers
+
+	mov	r0,r0,lsl#2		@ rescale r0 for byte count
+	sub	sp,sp,r0		@ alloca(4*num)
+	sub	sp,sp,#4		@ +extra dword
+	sub	r0,r0,#4		@ "num=num-1"
+	add	r4,r2,r0		@ &bp[num-1]
+
+	add	r0,sp,r0		@ r0 to point at &tp[num-1]
+	ldr	r8,[r0,#14*4]		@ &n0
+	ldr	r2,[r2]		@ bp[0]
+	ldr	r5,[r1],#4		@ ap[0],ap++
+	ldr	r6,[r3],#4		@ np[0],np++
+	ldr	r8,[r8]		@ *n0
+	str	r4,[r0,#15*4]		@ save &bp[num]
+
+	umull	r10,r11,r5,r2	@ ap[0]*bp[0]
+	str	r8,[r0,#14*4]		@ save n0 value
+	mul	r8,r10,r8		@ "tp[0]"*n0
+	mov	r12,#0
+	umlal	r10,r12,r6,r8	@ np[0]*n0+"t[0]"
+	mov	r4,sp
+
+.L1st:
+	ldr	r5,[r1],#4		@ ap[j],ap++
+	mov	r10,r11
+	ldr	r6,[r3],#4		@ np[j],np++
+	mov	r11,#0
+	umlal	r10,r11,r5,r2	@ ap[j]*bp[0]
+	mov	r14,#0
+	umlal	r12,r14,r6,r8	@ np[j]*n0
+	adds	r12,r12,r10
+	str	r12,[r4],#4		@ tp[j-1]=,tp++
+	adc	r12,r14,#0
+	cmp	r4,r0
+	bne	.L1st
+
+	adds	r12,r12,r11
+	ldr	r4,[r0,#13*4]		@ restore bp
+	mov	r14,#0
+	ldr	r8,[r0,#14*4]		@ restore n0
+	adc	r14,r14,#0
+	str	r12,[r0]		@ tp[num-1]=
+	mov	r7,sp
+	str	r14,[r0,#4]		@ tp[num]=
+
+.Louter:
+	sub	r7,r0,r7		@ "original" r0-1 value
+	sub	r1,r1,r7		@ "rewind" ap to &ap[1]
+	ldr	r2,[r4,#4]!		@ *(++bp)
+	sub	r3,r3,r7		@ "rewind" np to &np[1]
+	ldr	r5,[r1,#-4]		@ ap[0]
+	ldr	r10,[sp]		@ tp[0]
+	ldr	r6,[r3,#-4]		@ np[0]
+	ldr	r7,[sp,#4]		@ tp[1]
+
+	mov	r11,#0
+	umlal	r10,r11,r5,r2	@ ap[0]*bp[i]+tp[0]
+	str	r4,[r0,#13*4]		@ save bp
+	mul	r8,r10,r8
+	mov	r12,#0
+	umlal	r10,r12,r6,r8	@ np[0]*n0+"tp[0]"
+	mov	r4,sp
+
+.Linner:
+	ldr	r5,[r1],#4		@ ap[j],ap++
+	adds	r10,r11,r7		@ +=tp[j]
+	ldr	r6,[r3],#4		@ np[j],np++
+	mov	r11,#0
+	umlal	r10,r11,r5,r2	@ ap[j]*bp[i]
+	mov	r14,#0
+	umlal	r12,r14,r6,r8	@ np[j]*n0
+	adc	r11,r11,#0
+	ldr	r7,[r4,#8]		@ tp[j+1]
+	adds	r12,r12,r10
+	str	r12,[r4],#4		@ tp[j-1]=,tp++
+	adc	r12,r14,#0
+	cmp	r4,r0
+	bne	.Linner
+
+	adds	r12,r12,r11
+	mov	r14,#0
+	ldr	r4,[r0,#13*4]		@ restore bp
+	adc	r14,r14,#0
+	ldr	r8,[r0,#14*4]		@ restore n0
+	adds	r12,r12,r7
+	ldr	r7,[r0,#15*4]		@ restore &bp[num]
+	adc	r14,r14,#0
+	str	r12,[r0]		@ tp[num-1]=
+	str	r14,[r0,#4]		@ tp[num]=
+
+	cmp	r4,r7
+#ifdef	__thumb2__
+	itt	ne
+#endif
+	movne	r7,sp
+	bne	.Louter
+
+	ldr	r2,[r0,#12*4]		@ pull rp
+	mov	r5,sp
+	add	r0,r0,#4		@ r0 to point at &tp[num]
+	sub	r5,r0,r5		@ "original" num value
+	mov	r4,sp			@ "rewind" r4
+	mov	r1,r4			@ "borrow" r1
+	sub	r3,r3,r5		@ "rewind" r3 to &np[0]
+
+	subs	r7,r7,r7		@ "clear" carry flag
+.Lsub:	ldr	r7,[r4],#4
+	ldr	r6,[r3],#4
+	sbcs	r7,r7,r6		@ tp[j]-np[j]
+	str	r7,[r2],#4		@ rp[j]=
+	teq	r4,r0		@ preserve carry
+	bne	.Lsub
+	sbcs	r14,r14,#0		@ upmost carry
+	mov	r4,sp			@ "rewind" r4
+	sub	r2,r2,r5		@ "rewind" r2
+
+.Lcopy:	ldr	r7,[r4]		@ conditional copy
+	ldr	r5,[r2]
+	str	sp,[r4],#4		@ zap tp
+#ifdef	__thumb2__
+	it	cc
+#endif
+	movcc	r5,r7
+	str	r5,[r2],#4
+	teq	r4,r0		@ preserve carry
+	bne	.Lcopy
+
+	mov	sp,r0
+	add	sp,sp,#4		@ skip over tp[num+1]
+	ldmia	sp!,{r4,r5,r6,r7,r8,r9,r10,r11,r12,lr}		@ restore registers
+	add	sp,sp,#2*4		@ skip over {r0,r2}
+	mov	r0,#1
+.Labrt:
+#if __ARM_ARCH__>=5
+	bx	lr				@ bx lr
+#else
+	tst	lr,#1
+	moveq	pc,lr			@ be binary compatible with V4, yet
+.word	0xe12fff1e			@ interoperable with Thumb ISA:-)
+#endif
+.size	bn_mul_mont,.-bn_mul_mont
+#if __ARM_MAX_ARCH__>=7
+.arch	armv7-a
+.fpu	neon
+
+.type	bn_mul8x_mont_neon,%function
+.align	5
+bn_mul8x_mont_neon:
+	mov	ip,sp
+	stmdb	sp!,{r4,r5,r6,r7,r8,r9,r10,r11}
+	vstmdb	sp!,{d8,d9,d10,d11,d12,d13,d14,d15}		@ ABI specification says so
+	ldmia	ip,{r4,r5}		@ load rest of parameter block
+	mov	ip,sp
+
+	cmp	r5,#8
+	bhi	.LNEON_8n
+
+	@ special case for r5==8, everything is in register bank...
+
+	vld1.32	{d28[0]}, [r2,:32]!
+	veor	d8,d8,d8
+	sub	r7,sp,r5,lsl#4
+	vld1.32	{d0,d1,d2,d3},  [r1]!		@ can't specify :32 :-(
+	and	r7,r7,#-64
+	vld1.32	{d30[0]}, [r4,:32]
+	mov	sp,r7			@ alloca
+	vzip.16	d28,d8
+
+	vmull.u32	q6,d28,d0[0]
+	vmull.u32	q7,d28,d0[1]
+	vmull.u32	q8,d28,d1[0]
+	vshl.i64	d29,d13,#16
+	vmull.u32	q9,d28,d1[1]
+
+	vadd.u64	d29,d29,d12
+	veor	d8,d8,d8
+	vmul.u32	d29,d29,d30
+
+	vmull.u32	q10,d28,d2[0]
+	vld1.32	{d4,d5,d6,d7}, [r3]!
+	vmull.u32	q11,d28,d2[1]
+	vmull.u32	q12,d28,d3[0]
+	vzip.16	d29,d8
+	vmull.u32	q13,d28,d3[1]
+
+	vmlal.u32	q6,d29,d4[0]
+	sub	r9,r5,#1
+	vmlal.u32	q7,d29,d4[1]
+	vmlal.u32	q8,d29,d5[0]
+	vmlal.u32	q9,d29,d5[1]
+
+	vmlal.u32	q10,d29,d6[0]
+	vmov	q5,q6
+	vmlal.u32	q11,d29,d6[1]
+	vmov	q6,q7
+	vmlal.u32	q12,d29,d7[0]
+	vmov	q7,q8
+	vmlal.u32	q13,d29,d7[1]
+	vmov	q8,q9
+	vmov	q9,q10
+	vshr.u64	d10,d10,#16
+	vmov	q10,q11
+	vmov	q11,q12
+	vadd.u64	d10,d10,d11
+	vmov	q12,q13
+	veor	q13,q13
+	vshr.u64	d10,d10,#16
+
+	b	.LNEON_outer8
+
+.align	4
+.LNEON_outer8:
+	vld1.32	{d28[0]}, [r2,:32]!
+	veor	d8,d8,d8
+	vzip.16	d28,d8
+	vadd.u64	d12,d12,d10
+
+	vmlal.u32	q6,d28,d0[0]
+	vmlal.u32	q7,d28,d0[1]
+	vmlal.u32	q8,d28,d1[0]
+	vshl.i64	d29,d13,#16
+	vmlal.u32	q9,d28,d1[1]
+
+	vadd.u64	d29,d29,d12
+	veor	d8,d8,d8
+	subs	r9,r9,#1
+	vmul.u32	d29,d29,d30
+
+	vmlal.u32	q10,d28,d2[0]
+	vmlal.u32	q11,d28,d2[1]
+	vmlal.u32	q12,d28,d3[0]
+	vzip.16	d29,d8
+	vmlal.u32	q13,d28,d3[1]
+
+	vmlal.u32	q6,d29,d4[0]
+	vmlal.u32	q7,d29,d4[1]
+	vmlal.u32	q8,d29,d5[0]
+	vmlal.u32	q9,d29,d5[1]
+
+	vmlal.u32	q10,d29,d6[0]
+	vmov	q5,q6
+	vmlal.u32	q11,d29,d6[1]
+	vmov	q6,q7
+	vmlal.u32	q12,d29,d7[0]
+	vmov	q7,q8
+	vmlal.u32	q13,d29,d7[1]
+	vmov	q8,q9
+	vmov	q9,q10
+	vshr.u64	d10,d10,#16
+	vmov	q10,q11
+	vmov	q11,q12
+	vadd.u64	d10,d10,d11
+	vmov	q12,q13
+	veor	q13,q13
+	vshr.u64	d10,d10,#16
+
+	bne	.LNEON_outer8
+
+	vadd.u64	d12,d12,d10
+	mov	r7,sp
+	vshr.u64	d10,d12,#16
+	mov	r8,r5
+	vadd.u64	d13,d13,d10
+	add	r6,sp,#96
+	vshr.u64	d10,d13,#16
+	vzip.16	d12,d13
+
+	b	.LNEON_tail_entry
+
+.align	4
+.LNEON_8n:
+	veor	q6,q6,q6
+	sub	r7,sp,#128
+	veor	q7,q7,q7
+	sub	r7,r7,r5,lsl#4
+	veor	q8,q8,q8
+	and	r7,r7,#-64
+	veor	q9,q9,q9
+	mov	sp,r7			@ alloca
+	veor	q10,q10,q10
+	add	r7,r7,#256
+	veor	q11,q11,q11
+	sub	r8,r5,#8
+	veor	q12,q12,q12
+	veor	q13,q13,q13
+
+.LNEON_8n_init:
+	vst1.64	{q6,q7},[r7,:256]!
+	subs	r8,r8,#8
+	vst1.64	{q8,q9},[r7,:256]!
+	vst1.64	{q10,q11},[r7,:256]!
+	vst1.64	{q12,q13},[r7,:256]!
+	bne	.LNEON_8n_init
+
+	add	r6,sp,#256
+	vld1.32	{d0,d1,d2,d3},[r1]!
+	add	r10,sp,#8
+	vld1.32	{d30[0]},[r4,:32]
+	mov	r9,r5
+	b	.LNEON_8n_outer
+
+.align	4
+.LNEON_8n_outer:
+	vld1.32	{d28[0]},[r2,:32]!	@ *b++
+	veor	d8,d8,d8
+	vzip.16	d28,d8
+	add	r7,sp,#128
+	vld1.32	{d4,d5,d6,d7},[r3]!
+
+	vmlal.u32	q6,d28,d0[0]
+	vmlal.u32	q7,d28,d0[1]
+	veor	d8,d8,d8
+	vmlal.u32	q8,d28,d1[0]
+	vshl.i64	d29,d13,#16
+	vmlal.u32	q9,d28,d1[1]
+	vadd.u64	d29,d29,d12
+	vmlal.u32	q10,d28,d2[0]
+	vmul.u32	d29,d29,d30
+	vmlal.u32	q11,d28,d2[1]
+	vst1.32	{d28},[sp,:64]		@ put aside smashed b[8*i+0]
+	vmlal.u32	q12,d28,d3[0]
+	vzip.16	d29,d8
+	vmlal.u32	q13,d28,d3[1]
+	vld1.32	{d28[0]},[r2,:32]!	@ *b++
+	vmlal.u32	q6,d29,d4[0]
+	veor	d10,d10,d10
+	vmlal.u32	q7,d29,d4[1]
+	vzip.16	d28,d10
+	vmlal.u32	q8,d29,d5[0]
+	vshr.u64	d12,d12,#16
+	vmlal.u32	q9,d29,d5[1]
+	vmlal.u32	q10,d29,d6[0]
+	vadd.u64	d12,d12,d13
+	vmlal.u32	q11,d29,d6[1]
+	vshr.u64	d12,d12,#16
+	vmlal.u32	q12,d29,d7[0]
+	vmlal.u32	q13,d29,d7[1]
+	vadd.u64	d14,d14,d12
+	vst1.32	{d29},[r10,:64]!	@ put aside smashed m[8*i+0]
+	vmlal.u32	q7,d28,d0[0]
+	vld1.64	{q6},[r6,:128]!
+	vmlal.u32	q8,d28,d0[1]
+	veor	d8,d8,d8
+	vmlal.u32	q9,d28,d1[0]
+	vshl.i64	d29,d15,#16
+	vmlal.u32	q10,d28,d1[1]
+	vadd.u64	d29,d29,d14
+	vmlal.u32	q11,d28,d2[0]
+	vmul.u32	d29,d29,d30
+	vmlal.u32	q12,d28,d2[1]
+	vst1.32	{d28},[r10,:64]!	@ put aside smashed b[8*i+1]
+	vmlal.u32	q13,d28,d3[0]
+	vzip.16	d29,d8
+	vmlal.u32	q6,d28,d3[1]
+	vld1.32	{d28[0]},[r2,:32]!	@ *b++
+	vmlal.u32	q7,d29,d4[0]
+	veor	d10,d10,d10
+	vmlal.u32	q8,d29,d4[1]
+	vzip.16	d28,d10
+	vmlal.u32	q9,d29,d5[0]
+	vshr.u64	d14,d14,#16
+	vmlal.u32	q10,d29,d5[1]
+	vmlal.u32	q11,d29,d6[0]
+	vadd.u64	d14,d14,d15
+	vmlal.u32	q12,d29,d6[1]
+	vshr.u64	d14,d14,#16
+	vmlal.u32	q13,d29,d7[0]
+	vmlal.u32	q6,d29,d7[1]
+	vadd.u64	d16,d16,d14
+	vst1.32	{d29},[r10,:64]!	@ put aside smashed m[8*i+1]
+	vmlal.u32	q8,d28,d0[0]
+	vld1.64	{q7},[r6,:128]!
+	vmlal.u32	q9,d28,d0[1]
+	veor	d8,d8,d8
+	vmlal.u32	q10,d28,d1[0]
+	vshl.i64	d29,d17,#16
+	vmlal.u32	q11,d28,d1[1]
+	vadd.u64	d29,d29,d16
+	vmlal.u32	q12,d28,d2[0]
+	vmul.u32	d29,d29,d30
+	vmlal.u32	q13,d28,d2[1]
+	vst1.32	{d28},[r10,:64]!	@ put aside smashed b[8*i+2]
+	vmlal.u32	q6,d28,d3[0]
+	vzip.16	d29,d8
+	vmlal.u32	q7,d28,d3[1]
+	vld1.32	{d28[0]},[r2,:32]!	@ *b++
+	vmlal.u32	q8,d29,d4[0]
+	veor	d10,d10,d10
+	vmlal.u32	q9,d29,d4[1]
+	vzip.16	d28,d10
+	vmlal.u32	q10,d29,d5[0]
+	vshr.u64	d16,d16,#16
+	vmlal.u32	q11,d29,d5[1]
+	vmlal.u32	q12,d29,d6[0]
+	vadd.u64	d16,d16,d17
+	vmlal.u32	q13,d29,d6[1]
+	vshr.u64	d16,d16,#16
+	vmlal.u32	q6,d29,d7[0]
+	vmlal.u32	q7,d29,d7[1]
+	vadd.u64	d18,d18,d16
+	vst1.32	{d29},[r10,:64]!	@ put aside smashed m[8*i+2]
+	vmlal.u32	q9,d28,d0[0]
+	vld1.64	{q8},[r6,:128]!
+	vmlal.u32	q10,d28,d0[1]
+	veor	d8,d8,d8
+	vmlal.u32	q11,d28,d1[0]
+	vshl.i64	d29,d19,#16
+	vmlal.u32	q12,d28,d1[1]
+	vadd.u64	d29,d29,d18
+	vmlal.u32	q13,d28,d2[0]
+	vmul.u32	d29,d29,d30
+	vmlal.u32	q6,d28,d2[1]
+	vst1.32	{d28},[r10,:64]!	@ put aside smashed b[8*i+3]
+	vmlal.u32	q7,d28,d3[0]
+	vzip.16	d29,d8
+	vmlal.u32	q8,d28,d3[1]
+	vld1.32	{d28[0]},[r2,:32]!	@ *b++
+	vmlal.u32	q9,d29,d4[0]
+	veor	d10,d10,d10
+	vmlal.u32	q10,d29,d4[1]
+	vzip.16	d28,d10
+	vmlal.u32	q11,d29,d5[0]
+	vshr.u64	d18,d18,#16
+	vmlal.u32	q12,d29,d5[1]
+	vmlal.u32	q13,d29,d6[0]
+	vadd.u64	d18,d18,d19
+	vmlal.u32	q6,d29,d6[1]
+	vshr.u64	d18,d18,#16
+	vmlal.u32	q7,d29,d7[0]
+	vmlal.u32	q8,d29,d7[1]
+	vadd.u64	d20,d20,d18
+	vst1.32	{d29},[r10,:64]!	@ put aside smashed m[8*i+3]
+	vmlal.u32	q10,d28,d0[0]
+	vld1.64	{q9},[r6,:128]!
+	vmlal.u32	q11,d28,d0[1]
+	veor	d8,d8,d8
+	vmlal.u32	q12,d28,d1[0]
+	vshl.i64	d29,d21,#16
+	vmlal.u32	q13,d28,d1[1]
+	vadd.u64	d29,d29,d20
+	vmlal.u32	q6,d28,d2[0]
+	vmul.u32	d29,d29,d30
+	vmlal.u32	q7,d28,d2[1]
+	vst1.32	{d28},[r10,:64]!	@ put aside smashed b[8*i+4]
+	vmlal.u32	q8,d28,d3[0]
+	vzip.16	d29,d8
+	vmlal.u32	q9,d28,d3[1]
+	vld1.32	{d28[0]},[r2,:32]!	@ *b++
+	vmlal.u32	q10,d29,d4[0]
+	veor	d10,d10,d10
+	vmlal.u32	q11,d29,d4[1]
+	vzip.16	d28,d10
+	vmlal.u32	q12,d29,d5[0]
+	vshr.u64	d20,d20,#16
+	vmlal.u32	q13,d29,d5[1]
+	vmlal.u32	q6,d29,d6[0]
+	vadd.u64	d20,d20,d21
+	vmlal.u32	q7,d29,d6[1]
+	vshr.u64	d20,d20,#16
+	vmlal.u32	q8,d29,d7[0]
+	vmlal.u32	q9,d29,d7[1]
+	vadd.u64	d22,d22,d20
+	vst1.32	{d29},[r10,:64]!	@ put aside smashed m[8*i+4]
+	vmlal.u32	q11,d28,d0[0]
+	vld1.64	{q10},[r6,:128]!
+	vmlal.u32	q12,d28,d0[1]
+	veor	d8,d8,d8
+	vmlal.u32	q13,d28,d1[0]
+	vshl.i64	d29,d23,#16
+	vmlal.u32	q6,d28,d1[1]
+	vadd.u64	d29,d29,d22
+	vmlal.u32	q7,d28,d2[0]
+	vmul.u32	d29,d29,d30
+	vmlal.u32	q8,d28,d2[1]
+	vst1.32	{d28},[r10,:64]!	@ put aside smashed b[8*i+5]
+	vmlal.u32	q9,d28,d3[0]
+	vzip.16	d29,d8
+	vmlal.u32	q10,d28,d3[1]
+	vld1.32	{d28[0]},[r2,:32]!	@ *b++
+	vmlal.u32	q11,d29,d4[0]
+	veor	d10,d10,d10
+	vmlal.u32	q12,d29,d4[1]
+	vzip.16	d28,d10
+	vmlal.u32	q13,d29,d5[0]
+	vshr.u64	d22,d22,#16
+	vmlal.u32	q6,d29,d5[1]
+	vmlal.u32	q7,d29,d6[0]
+	vadd.u64	d22,d22,d23
+	vmlal.u32	q8,d29,d6[1]
+	vshr.u64	d22,d22,#16
+	vmlal.u32	q9,d29,d7[0]
+	vmlal.u32	q10,d29,d7[1]
+	vadd.u64	d24,d24,d22
+	vst1.32	{d29},[r10,:64]!	@ put aside smashed m[8*i+5]
+	vmlal.u32	q12,d28,d0[0]
+	vld1.64	{q11},[r6,:128]!
+	vmlal.u32	q13,d28,d0[1]
+	veor	d8,d8,d8
+	vmlal.u32	q6,d28,d1[0]
+	vshl.i64	d29,d25,#16
+	vmlal.u32	q7,d28,d1[1]
+	vadd.u64	d29,d29,d24
+	vmlal.u32	q8,d28,d2[0]
+	vmul.u32	d29,d29,d30
+	vmlal.u32	q9,d28,d2[1]
+	vst1.32	{d28},[r10,:64]!	@ put aside smashed b[8*i+6]
+	vmlal.u32	q10,d28,d3[0]
+	vzip.16	d29,d8
+	vmlal.u32	q11,d28,d3[1]
+	vld1.32	{d28[0]},[r2,:32]!	@ *b++
+	vmlal.u32	q12,d29,d4[0]
+	veor	d10,d10,d10
+	vmlal.u32	q13,d29,d4[1]
+	vzip.16	d28,d10
+	vmlal.u32	q6,d29,d5[0]
+	vshr.u64	d24,d24,#16
+	vmlal.u32	q7,d29,d5[1]
+	vmlal.u32	q8,d29,d6[0]
+	vadd.u64	d24,d24,d25
+	vmlal.u32	q9,d29,d6[1]
+	vshr.u64	d24,d24,#16
+	vmlal.u32	q10,d29,d7[0]
+	vmlal.u32	q11,d29,d7[1]
+	vadd.u64	d26,d26,d24
+	vst1.32	{d29},[r10,:64]!	@ put aside smashed m[8*i+6]
+	vmlal.u32	q13,d28,d0[0]
+	vld1.64	{q12},[r6,:128]!
+	vmlal.u32	q6,d28,d0[1]
+	veor	d8,d8,d8
+	vmlal.u32	q7,d28,d1[0]
+	vshl.i64	d29,d27,#16
+	vmlal.u32	q8,d28,d1[1]
+	vadd.u64	d29,d29,d26
+	vmlal.u32	q9,d28,d2[0]
+	vmul.u32	d29,d29,d30
+	vmlal.u32	q10,d28,d2[1]
+	vst1.32	{d28},[r10,:64]!	@ put aside smashed b[8*i+7]
+	vmlal.u32	q11,d28,d3[0]
+	vzip.16	d29,d8
+	vmlal.u32	q12,d28,d3[1]
+	vld1.32	{d28},[sp,:64]		@ pull smashed b[8*i+0]
+	vmlal.u32	q13,d29,d4[0]
+	vld1.32	{d0,d1,d2,d3},[r1]!
+	vmlal.u32	q6,d29,d4[1]
+	vmlal.u32	q7,d29,d5[0]
+	vshr.u64	d26,d26,#16
+	vmlal.u32	q8,d29,d5[1]
+	vmlal.u32	q9,d29,d6[0]
+	vadd.u64	d26,d26,d27
+	vmlal.u32	q10,d29,d6[1]
+	vshr.u64	d26,d26,#16
+	vmlal.u32	q11,d29,d7[0]
+	vmlal.u32	q12,d29,d7[1]
+	vadd.u64	d12,d12,d26
+	vst1.32	{d29},[r10,:64]	@ put aside smashed m[8*i+7]
+	add	r10,sp,#8		@ rewind
+	sub	r8,r5,#8
+	b	.LNEON_8n_inner
+
+.align	4
+.LNEON_8n_inner:
+	subs	r8,r8,#8
+	vmlal.u32	q6,d28,d0[0]
+	vld1.64	{q13},[r6,:128]
+	vmlal.u32	q7,d28,d0[1]
+	vld1.32	{d29},[r10,:64]!	@ pull smashed m[8*i+0]
+	vmlal.u32	q8,d28,d1[0]
+	vld1.32	{d4,d5,d6,d7},[r3]!
+	vmlal.u32	q9,d28,d1[1]
+	it	ne
+	addne	r6,r6,#16	@ don't advance in last iteration
+	vmlal.u32	q10,d28,d2[0]
+	vmlal.u32	q11,d28,d2[1]
+	vmlal.u32	q12,d28,d3[0]
+	vmlal.u32	q13,d28,d3[1]
+	vld1.32	{d28},[r10,:64]!	@ pull smashed b[8*i+1]
+	vmlal.u32	q6,d29,d4[0]
+	vmlal.u32	q7,d29,d4[1]
+	vmlal.u32	q8,d29,d5[0]
+	vmlal.u32	q9,d29,d5[1]
+	vmlal.u32	q10,d29,d6[0]
+	vmlal.u32	q11,d29,d6[1]
+	vmlal.u32	q12,d29,d7[0]
+	vmlal.u32	q13,d29,d7[1]
+	vst1.64	{q6},[r7,:128]!
+	vmlal.u32	q7,d28,d0[0]
+	vld1.64	{q6},[r6,:128]
+	vmlal.u32	q8,d28,d0[1]
+	vld1.32	{d29},[r10,:64]!	@ pull smashed m[8*i+1]
+	vmlal.u32	q9,d28,d1[0]
+	it	ne
+	addne	r6,r6,#16	@ don't advance in last iteration
+	vmlal.u32	q10,d28,d1[1]
+	vmlal.u32	q11,d28,d2[0]
+	vmlal.u32	q12,d28,d2[1]
+	vmlal.u32	q13,d28,d3[0]
+	vmlal.u32	q6,d28,d3[1]
+	vld1.32	{d28},[r10,:64]!	@ pull smashed b[8*i+2]
+	vmlal.u32	q7,d29,d4[0]
+	vmlal.u32	q8,d29,d4[1]
+	vmlal.u32	q9,d29,d5[0]
+	vmlal.u32	q10,d29,d5[1]
+	vmlal.u32	q11,d29,d6[0]
+	vmlal.u32	q12,d29,d6[1]
+	vmlal.u32	q13,d29,d7[0]
+	vmlal.u32	q6,d29,d7[1]
+	vst1.64	{q7},[r7,:128]!
+	vmlal.u32	q8,d28,d0[0]
+	vld1.64	{q7},[r6,:128]
+	vmlal.u32	q9,d28,d0[1]
+	vld1.32	{d29},[r10,:64]!	@ pull smashed m[8*i+2]
+	vmlal.u32	q10,d28,d1[0]
+	it	ne
+	addne	r6,r6,#16	@ don't advance in last iteration
+	vmlal.u32	q11,d28,d1[1]
+	vmlal.u32	q12,d28,d2[0]
+	vmlal.u32	q13,d28,d2[1]
+	vmlal.u32	q6,d28,d3[0]
+	vmlal.u32	q7,d28,d3[1]
+	vld1.32	{d28},[r10,:64]!	@ pull smashed b[8*i+3]
+	vmlal.u32	q8,d29,d4[0]
+	vmlal.u32	q9,d29,d4[1]
+	vmlal.u32	q10,d29,d5[0]
+	vmlal.u32	q11,d29,d5[1]
+	vmlal.u32	q12,d29,d6[0]
+	vmlal.u32	q13,d29,d6[1]
+	vmlal.u32	q6,d29,d7[0]
+	vmlal.u32	q7,d29,d7[1]
+	vst1.64	{q8},[r7,:128]!
+	vmlal.u32	q9,d28,d0[0]
+	vld1.64	{q8},[r6,:128]
+	vmlal.u32	q10,d28,d0[1]
+	vld1.32	{d29},[r10,:64]!	@ pull smashed m[8*i+3]
+	vmlal.u32	q11,d28,d1[0]
+	it	ne
+	addne	r6,r6,#16	@ don't advance in last iteration
+	vmlal.u32	q12,d28,d1[1]
+	vmlal.u32	q13,d28,d2[0]
+	vmlal.u32	q6,d28,d2[1]
+	vmlal.u32	q7,d28,d3[0]
+	vmlal.u32	q8,d28,d3[1]
+	vld1.32	{d28},[r10,:64]!	@ pull smashed b[8*i+4]
+	vmlal.u32	q9,d29,d4[0]
+	vmlal.u32	q10,d29,d4[1]
+	vmlal.u32	q11,d29,d5[0]
+	vmlal.u32	q12,d29,d5[1]
+	vmlal.u32	q13,d29,d6[0]
+	vmlal.u32	q6,d29,d6[1]
+	vmlal.u32	q7,d29,d7[0]
+	vmlal.u32	q8,d29,d7[1]
+	vst1.64	{q9},[r7,:128]!
+	vmlal.u32	q10,d28,d0[0]
+	vld1.64	{q9},[r6,:128]
+	vmlal.u32	q11,d28,d0[1]
+	vld1.32	{d29},[r10,:64]!	@ pull smashed m[8*i+4]
+	vmlal.u32	q12,d28,d1[0]
+	it	ne
+	addne	r6,r6,#16	@ don't advance in last iteration
+	vmlal.u32	q13,d28,d1[1]
+	vmlal.u32	q6,d28,d2[0]
+	vmlal.u32	q7,d28,d2[1]
+	vmlal.u32	q8,d28,d3[0]
+	vmlal.u32	q9,d28,d3[1]
+	vld1.32	{d28},[r10,:64]!	@ pull smashed b[8*i+5]
+	vmlal.u32	q10,d29,d4[0]
+	vmlal.u32	q11,d29,d4[1]
+	vmlal.u32	q12,d29,d5[0]
+	vmlal.u32	q13,d29,d5[1]
+	vmlal.u32	q6,d29,d6[0]
+	vmlal.u32	q7,d29,d6[1]
+	vmlal.u32	q8,d29,d7[0]
+	vmlal.u32	q9,d29,d7[1]
+	vst1.64	{q10},[r7,:128]!
+	vmlal.u32	q11,d28,d0[0]
+	vld1.64	{q10},[r6,:128]
+	vmlal.u32	q12,d28,d0[1]
+	vld1.32	{d29},[r10,:64]!	@ pull smashed m[8*i+5]
+	vmlal.u32	q13,d28,d1[0]
+	it	ne
+	addne	r6,r6,#16	@ don't advance in last iteration
+	vmlal.u32	q6,d28,d1[1]
+	vmlal.u32	q7,d28,d2[0]
+	vmlal.u32	q8,d28,d2[1]
+	vmlal.u32	q9,d28,d3[0]
+	vmlal.u32	q10,d28,d3[1]
+	vld1.32	{d28},[r10,:64]!	@ pull smashed b[8*i+6]
+	vmlal.u32	q11,d29,d4[0]
+	vmlal.u32	q12,d29,d4[1]
+	vmlal.u32	q13,d29,d5[0]
+	vmlal.u32	q6,d29,d5[1]
+	vmlal.u32	q7,d29,d6[0]
+	vmlal.u32	q8,d29,d6[1]
+	vmlal.u32	q9,d29,d7[0]
+	vmlal.u32	q10,d29,d7[1]
+	vst1.64	{q11},[r7,:128]!
+	vmlal.u32	q12,d28,d0[0]
+	vld1.64	{q11},[r6,:128]
+	vmlal.u32	q13,d28,d0[1]
+	vld1.32	{d29},[r10,:64]!	@ pull smashed m[8*i+6]
+	vmlal.u32	q6,d28,d1[0]
+	it	ne
+	addne	r6,r6,#16	@ don't advance in last iteration
+	vmlal.u32	q7,d28,d1[1]
+	vmlal.u32	q8,d28,d2[0]
+	vmlal.u32	q9,d28,d2[1]
+	vmlal.u32	q10,d28,d3[0]
+	vmlal.u32	q11,d28,d3[1]
+	vld1.32	{d28},[r10,:64]!	@ pull smashed b[8*i+7]
+	vmlal.u32	q12,d29,d4[0]
+	vmlal.u32	q13,d29,d4[1]
+	vmlal.u32	q6,d29,d5[0]
+	vmlal.u32	q7,d29,d5[1]
+	vmlal.u32	q8,d29,d6[0]
+	vmlal.u32	q9,d29,d6[1]
+	vmlal.u32	q10,d29,d7[0]
+	vmlal.u32	q11,d29,d7[1]
+	vst1.64	{q12},[r7,:128]!
+	vmlal.u32	q13,d28,d0[0]
+	vld1.64	{q12},[r6,:128]
+	vmlal.u32	q6,d28,d0[1]
+	vld1.32	{d29},[r10,:64]!	@ pull smashed m[8*i+7]
+	vmlal.u32	q7,d28,d1[0]
+	it	ne
+	addne	r6,r6,#16	@ don't advance in last iteration
+	vmlal.u32	q8,d28,d1[1]
+	vmlal.u32	q9,d28,d2[0]
+	vmlal.u32	q10,d28,d2[1]
+	vmlal.u32	q11,d28,d3[0]
+	vmlal.u32	q12,d28,d3[1]
+	it	eq
+	subeq	r1,r1,r5,lsl#2	@ rewind
+	vmlal.u32	q13,d29,d4[0]
+	vld1.32	{d28},[sp,:64]		@ pull smashed b[8*i+0]
+	vmlal.u32	q6,d29,d4[1]
+	vld1.32	{d0,d1,d2,d3},[r1]!
+	vmlal.u32	q7,d29,d5[0]
+	add	r10,sp,#8		@ rewind
+	vmlal.u32	q8,d29,d5[1]
+	vmlal.u32	q9,d29,d6[0]
+	vmlal.u32	q10,d29,d6[1]
+	vmlal.u32	q11,d29,d7[0]
+	vst1.64	{q13},[r7,:128]!
+	vmlal.u32	q12,d29,d7[1]
+
+	bne	.LNEON_8n_inner
+	add	r6,sp,#128
+	vst1.64	{q6,q7},[r7,:256]!
+	veor	q2,q2,q2		@ d4-d5
+	vst1.64	{q8,q9},[r7,:256]!
+	veor	q3,q3,q3		@ d6-d7
+	vst1.64	{q10,q11},[r7,:256]!
+	vst1.64	{q12},[r7,:128]
+
+	subs	r9,r9,#8
+	vld1.64	{q6,q7},[r6,:256]!
+	vld1.64	{q8,q9},[r6,:256]!
+	vld1.64	{q10,q11},[r6,:256]!
+	vld1.64	{q12,q13},[r6,:256]!
+
+	itt	ne
+	subne	r3,r3,r5,lsl#2	@ rewind
+	bne	.LNEON_8n_outer
+
+	add	r7,sp,#128
+	vst1.64	{q2,q3}, [sp,:256]!	@ start wiping stack frame
+	vshr.u64	d10,d12,#16
+	vst1.64	{q2,q3},[sp,:256]!
+	vadd.u64	d13,d13,d10
+	vst1.64	{q2,q3}, [sp,:256]!
+	vshr.u64	d10,d13,#16
+	vst1.64	{q2,q3}, [sp,:256]!
+	vzip.16	d12,d13
+
+	mov	r8,r5
+	b	.LNEON_tail_entry
+
+.align	4
+.LNEON_tail:
+	vadd.u64	d12,d12,d10
+	vshr.u64	d10,d12,#16
+	vld1.64	{q8,q9}, [r6, :256]!
+	vadd.u64	d13,d13,d10
+	vld1.64	{q10,q11}, [r6, :256]!
+	vshr.u64	d10,d13,#16
+	vld1.64	{q12,q13}, [r6, :256]!
+	vzip.16	d12,d13
+
+.LNEON_tail_entry:
+	vadd.u64	d14,d14,d10
+	vst1.32	{d12[0]}, [r7, :32]!
+	vshr.u64	d10,d14,#16
+	vadd.u64	d15,d15,d10
+	vshr.u64	d10,d15,#16
+	vzip.16	d14,d15
+	vadd.u64	d16,d16,d10
+	vst1.32	{d14[0]}, [r7, :32]!
+	vshr.u64	d10,d16,#16
+	vadd.u64	d17,d17,d10
+	vshr.u64	d10,d17,#16
+	vzip.16	d16,d17
+	vadd.u64	d18,d18,d10
+	vst1.32	{d16[0]}, [r7, :32]!
+	vshr.u64	d10,d18,#16
+	vadd.u64	d19,d19,d10
+	vshr.u64	d10,d19,#16
+	vzip.16	d18,d19
+	vadd.u64	d20,d20,d10
+	vst1.32	{d18[0]}, [r7, :32]!
+	vshr.u64	d10,d20,#16
+	vadd.u64	d21,d21,d10
+	vshr.u64	d10,d21,#16
+	vzip.16	d20,d21
+	vadd.u64	d22,d22,d10
+	vst1.32	{d20[0]}, [r7, :32]!
+	vshr.u64	d10,d22,#16
+	vadd.u64	d23,d23,d10
+	vshr.u64	d10,d23,#16
+	vzip.16	d22,d23
+	vadd.u64	d24,d24,d10
+	vst1.32	{d22[0]}, [r7, :32]!
+	vshr.u64	d10,d24,#16
+	vadd.u64	d25,d25,d10
+	vshr.u64	d10,d25,#16
+	vzip.16	d24,d25
+	vadd.u64	d26,d26,d10
+	vst1.32	{d24[0]}, [r7, :32]!
+	vshr.u64	d10,d26,#16
+	vadd.u64	d27,d27,d10
+	vshr.u64	d10,d27,#16
+	vzip.16	d26,d27
+	vld1.64	{q6,q7}, [r6, :256]!
+	subs	r8,r8,#8
+	vst1.32	{d26[0]},   [r7, :32]!
+	bne	.LNEON_tail
+
+	vst1.32	{d10[0]}, [r7, :32]		@ top-most bit
+	sub	r3,r3,r5,lsl#2			@ rewind r3
+	subs	r1,sp,#0				@ clear carry flag
+	add	r2,sp,r5,lsl#2
+
+.LNEON_sub:
+	ldmia	r1!, {r4,r5,r6,r7}
+	ldmia	r3!, {r8,r9,r10,r11}
+	sbcs	r8, r4,r8
+	sbcs	r9, r5,r9
+	sbcs	r10,r6,r10
+	sbcs	r11,r7,r11
+	teq	r1,r2				@ preserves carry
+	stmia	r0!, {r8,r9,r10,r11}
+	bne	.LNEON_sub
+
+	ldr	r10, [r1]				@ load top-most bit
+	mov	r11,sp
+	veor	q0,q0,q0
+	sub	r11,r2,r11				@ this is num*4
+	veor	q1,q1,q1
+	mov	r1,sp
+	sub	r0,r0,r11				@ rewind r0
+	mov	r3,r2				@ second 3/4th of frame
+	sbcs	r10,r10,#0				@ result is carry flag
+
+.LNEON_copy_n_zap:
+	ldmia	r1!, {r4,r5,r6,r7}
+	ldmia	r0,  {r8,r9,r10,r11}
+	it	cc
+	movcc	r8, r4
+	vst1.64	{q0,q1}, [r3,:256]!			@ wipe
+	itt	cc
+	movcc	r9, r5
+	movcc	r10,r6
+	vst1.64	{q0,q1}, [r3,:256]!			@ wipe
+	it	cc
+	movcc	r11,r7
+	ldmia	r1, {r4,r5,r6,r7}
+	stmia	r0!, {r8,r9,r10,r11}
+	sub	r1,r1,#16
+	ldmia	r0, {r8,r9,r10,r11}
+	it	cc
+	movcc	r8, r4
+	vst1.64	{q0,q1}, [r1,:256]!			@ wipe
+	itt	cc
+	movcc	r9, r5
+	movcc	r10,r6
+	vst1.64	{q0,q1}, [r3,:256]!			@ wipe
+	it	cc
+	movcc	r11,r7
+	teq	r1,r2				@ preserves carry
+	stmia	r0!, {r8,r9,r10,r11}
+	bne	.LNEON_copy_n_zap
+
+	mov	sp,ip
+	vldmia	sp!,{d8,d9,d10,d11,d12,d13,d14,d15}
+	ldmia	sp!,{r4,r5,r6,r7,r8,r9,r10,r11}
+	bx	lr						@ bx lr
+.size	bn_mul8x_mont_neon,.-bn_mul8x_mont_neon
+#endif
+.byte	77,111,110,116,103,111,109,101,114,121,32,109,117,108,116,105,112,108,105,99,97,116,105,111,110,32,102,111,114,32,65,82,77,118,52,47,78,69,79,78,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0
+.align	2
+.align	2
+#if __ARM_MAX_ARCH__>=7
+.comm	OPENSSL_armcap_P,4,4
+.hidden	OPENSSL_armcap_P
+#endif
+#endif
+#endif  // !OPENSSL_NO_ASM
+.section	.note.GNU-stack,"",%progbits
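bn_mul_mont above is the classic word-serial Montgomery multiplication: .L1st and .Linner accumulate ap[j]*bp[i] together with np[j]*n0 into the scratch array tp[], and .Lsub/.Lcopy finish with a conditional subtraction of the modulus. The C sketch below mirrors that structure purely as a reading aid for the register comments; mont_mul_sketch and its flat tp[] scratch are illustrative, do not reflect the BoringSSL entry point, and assume 32-bit words with num >= 1.

#include <stddef.h>
#include <stdint.h>

/* rp = ap*bp*R^-1 mod np, with R = 2^(32*num) and n0 = -np[0]^-1 mod 2^32. */
static void mont_mul_sketch(uint32_t *rp, const uint32_t *ap,
                            const uint32_t *bp, const uint32_t *np,
                            uint32_t n0, size_t num)
{
	uint32_t tp[num + 2];			/* scratch, like alloca(4*num) above */
	for (size_t j = 0; j < num + 2; j++)
		tp[j] = 0;

	for (size_t i = 0; i < num; i++) {
		uint64_t c = 0;
		for (size_t j = 0; j < num; j++) {	/* tp += ap[]*bp[i]  (umlal) */
			c += (uint64_t)ap[j] * bp[i] + tp[j];
			tp[j] = (uint32_t)c;
			c >>= 32;
		}
		c += tp[num];
		tp[num] = (uint32_t)c;
		tp[num + 1] = (uint32_t)(c >> 32);

		uint32_t m = (uint32_t)(tp[0] * (uint64_t)n0);	/* "tp[0]"*n0 */
		c = ((uint64_t)m * np[0] + tp[0]) >> 32;	/* low word becomes 0 */
		for (size_t j = 1; j < num; j++) {	/* tp = (tp + m*np[]) >> 32 */
			c += (uint64_t)m * np[j] + tp[j];
			tp[j - 1] = (uint32_t)c;
			c >>= 32;
		}
		c += tp[num];
		tp[num - 1] = (uint32_t)c;
		tp[num] = tp[num + 1] + (uint32_t)(c >> 32);
	}

	/* Conditional subtraction, as in .Lsub/.Lcopy: rp = tp - np if tp >= np. */
	uint64_t borrow = 0;
	for (size_t j = 0; j < num; j++) {
		uint64_t d = (uint64_t)tp[j] - np[j] - borrow;
		rp[j] = (uint32_t)d;
		borrow = (d >> 32) & 1;		/* 1 iff the subtraction underflowed */
	}
	if (tp[num] < borrow) {			/* tp < np: keep tp (movcc above) */
		for (size_t j = 0; j < num; j++)
			rp[j] = tp[j];
	}
}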
--- /dev/null
+++ b/third_party/boringssl/linux-arm/crypto/fipsmodule/bsaes-armv7.S
@@ -1,0 +1,1529 @@
+// This file is generated from a similarly-named Perl script in the BoringSSL
+// source tree. Do not edit by hand.
+
+#if !defined(__has_feature)
+#define __has_feature(x) 0
+#endif
+#if __has_feature(memory_sanitizer) && !defined(OPENSSL_NO_ASM)
+#define OPENSSL_NO_ASM
+#endif
+
+#if !defined(OPENSSL_NO_ASM)
+#if defined(__arm__)
+#if defined(BORINGSSL_PREFIX)
+#include <boringssl_prefix_symbols_asm.h>
+#endif
+@ Copyright 2012-2016 The OpenSSL Project Authors. All Rights Reserved.
+@
+@ Licensed under the OpenSSL license (the "License").  You may not use
+@ this file except in compliance with the License.  You can obtain a copy
+@ in the file LICENSE in the source distribution or at
+@ https://www.openssl.org/source/license.html
+
+
+@ ====================================================================
+@ Written by Andy Polyakov <[email protected]> for the OpenSSL
+@ project. The module is, however, dual licensed under OpenSSL and
+@ CRYPTOGAMS licenses depending on where you obtain it. For further
+@ details see http://www.openssl.org/~appro/cryptogams/.
+@
+@ Specific modes and adaptation for Linux kernel by Ard Biesheuvel
+@ of Linaro. Permission to use under GPL terms is granted.
+@ ====================================================================
+
+@ Bit-sliced AES for ARM NEON
+@
+@ February 2012.
+@
+@ This implementation is direct adaptation of bsaes-x86_64 module for
+@ ARM NEON. Except that this module is endian-neutral [in sense that
+@ it can be compiled for either endianness] by courtesy of vld1.8's
+@ neutrality. Initial version doesn't implement interface to OpenSSL,
+@ only low-level primitives and unsupported entry points, just enough
+@ to collect performance results, which for Cortex-A8 core are:
+@
+@ encrypt	19.5 cycles per byte processed with 128-bit key
+@ decrypt	22.1 cycles per byte processed with 128-bit key
+@ key conv.	440  cycles per 128-bit key/0.18 of 8x block
+@
+@ Snapdragon S4 encrypts byte in 17.6 cycles and decrypts in 19.7,
+@ which is [much] worse than anticipated (for further details see
+@ http://www.openssl.org/~appro/Snapdragon-S4.html).
+@
+@ Cortex-A15 manages in 14.2/16.1 cycles [when integer-only code
+@ manages in 20.0 cycles].
+@
+@ When comparing to x86_64 results keep in mind that NEON unit is
+@ [mostly] single-issue and thus can't [fully] benefit from
+@ instruction-level parallelism. And when comparing to aes-armv4
+@ results keep in mind key schedule conversion overhead (see
+@ bsaes-x86_64.pl for further details)...
+@
+@						<[email protected]>
+
+@ April-August 2013
+@ Add CBC, CTR and XTS subroutines and adapt for kernel use; courtesy of Ard.
+
+#ifndef __KERNEL__
+# include <openssl/arm_arch.h>
+
+# define VFP_ABI_PUSH	vstmdb	sp!,{d8-d15}
+# define VFP_ABI_POP	vldmia	sp!,{d8-d15}
+# define VFP_ABI_FRAME	0x40
+#else
+# define VFP_ABI_PUSH
+# define VFP_ABI_POP
+# define VFP_ABI_FRAME	0
+# define BSAES_ASM_EXTENDED_KEY
+# define XTS_CHAIN_TWEAK
+# define __ARM_ARCH__ __LINUX_ARM_ARCH__
+# define __ARM_MAX_ARCH__ 7
+#endif
+
+#ifdef __thumb__
+# define adrl adr
+#endif
+
+#if __ARM_MAX_ARCH__>=7
+.arch	armv7-a
+.fpu	neon
+
+.text
+.syntax	unified 	@ ARMv7-capable assembler is expected to handle this
+#if defined(__thumb2__) && !defined(__APPLE__)
+.thumb
+#else
+.code	32
+# undef __thumb2__
+#endif
+
+.type	_bsaes_decrypt8,%function
+.align	4
+_bsaes_decrypt8:
+	adr	r6,.
+	vldmia	r4!, {q9}		@ round 0 key
+#if defined(__thumb2__) || defined(__APPLE__)
+	adr	r6,.LM0ISR
+#else
+	add	r6,r6,#.LM0ISR-_bsaes_decrypt8
+#endif
+
+	vldmia	r6!, {q8}		@ .LM0ISR
+	veor	q10, q0, q9	@ xor with round0 key
+	veor	q11, q1, q9
+	vtbl.8	d0, {q10}, d16
+	vtbl.8	d1, {q10}, d17
+	veor	q12, q2, q9
+	vtbl.8	d2, {q11}, d16
+	vtbl.8	d3, {q11}, d17
+	veor	q13, q3, q9
+	vtbl.8	d4, {q12}, d16
+	vtbl.8	d5, {q12}, d17
+	veor	q14, q4, q9
+	vtbl.8	d6, {q13}, d16
+	vtbl.8	d7, {q13}, d17
+	veor	q15, q5, q9
+	vtbl.8	d8, {q14}, d16
+	vtbl.8	d9, {q14}, d17
+	veor	q10, q6, q9
+	vtbl.8	d10, {q15}, d16
+	vtbl.8	d11, {q15}, d17
+	veor	q11, q7, q9
+	vtbl.8	d12, {q10}, d16
+	vtbl.8	d13, {q10}, d17
+	vtbl.8	d14, {q11}, d16
+	vtbl.8	d15, {q11}, d17
+	vmov.i8	q8,#0x55			@ compose .LBS0
+	vmov.i8	q9,#0x33			@ compose .LBS1
+	vshr.u64	q10, q6, #1
+	vshr.u64	q11, q4, #1
+	veor	q10, q10, q7
+	veor	q11, q11, q5
+	vand	q10, q10, q8
+	vand	q11, q11, q8
+	veor	q7, q7, q10
+	vshl.u64	q10, q10, #1
+	veor	q5, q5, q11
+	vshl.u64	q11, q11, #1
+	veor	q6, q6, q10
+	veor	q4, q4, q11
+	vshr.u64	q10, q2, #1
+	vshr.u64	q11, q0, #1
+	veor	q10, q10, q3
+	veor	q11, q11, q1
+	vand	q10, q10, q8
+	vand	q11, q11, q8
+	veor	q3, q3, q10
+	vshl.u64	q10, q10, #1
+	veor	q1, q1, q11
+	vshl.u64	q11, q11, #1
+	veor	q2, q2, q10
+	veor	q0, q0, q11
+	vmov.i8	q8,#0x0f			@ compose .LBS2
+	vshr.u64	q10, q5, #2
+	vshr.u64	q11, q4, #2
+	veor	q10, q10, q7
+	veor	q11, q11, q6
+	vand	q10, q10, q9
+	vand	q11, q11, q9
+	veor	q7, q7, q10
+	vshl.u64	q10, q10, #2
+	veor	q6, q6, q11
+	vshl.u64	q11, q11, #2
+	veor	q5, q5, q10
+	veor	q4, q4, q11
+	vshr.u64	q10, q1, #2
+	vshr.u64	q11, q0, #2
+	veor	q10, q10, q3
+	veor	q11, q11, q2
+	vand	q10, q10, q9
+	vand	q11, q11, q9
+	veor	q3, q3, q10
+	vshl.u64	q10, q10, #2
+	veor	q2, q2, q11
+	vshl.u64	q11, q11, #2
+	veor	q1, q1, q10
+	veor	q0, q0, q11
+	vshr.u64	q10, q3, #4
+	vshr.u64	q11, q2, #4
+	veor	q10, q10, q7
+	veor	q11, q11, q6
+	vand	q10, q10, q8
+	vand	q11, q11, q8
+	veor	q7, q7, q10
+	vshl.u64	q10, q10, #4
+	veor	q6, q6, q11
+	vshl.u64	q11, q11, #4
+	veor	q3, q3, q10
+	veor	q2, q2, q11
+	vshr.u64	q10, q1, #4
+	vshr.u64	q11, q0, #4
+	veor	q10, q10, q5
+	veor	q11, q11, q4
+	vand	q10, q10, q8
+	vand	q11, q11, q8
+	veor	q5, q5, q10
+	vshl.u64	q10, q10, #4
+	veor	q4, q4, q11
+	vshl.u64	q11, q11, #4
+	veor	q1, q1, q10
+	veor	q0, q0, q11
+	sub	r5,r5,#1
+	b	.Ldec_sbox
+.align	4
+.Ldec_loop:
+	vldmia	r4!, {q8,q9,q10,q11}
+	veor	q8, q8, q0
+	veor	q9, q9, q1
+	vtbl.8	d0, {q8}, d24
+	vtbl.8	d1, {q8}, d25
+	vldmia	r4!, {q8}
+	veor	q10, q10, q2
+	vtbl.8	d2, {q9}, d24
+	vtbl.8	d3, {q9}, d25
+	vldmia	r4!, {q9}
+	veor	q11, q11, q3
+	vtbl.8	d4, {q10}, d24
+	vtbl.8	d5, {q10}, d25
+	vldmia	r4!, {q10}
+	vtbl.8	d6, {q11}, d24
+	vtbl.8	d7, {q11}, d25
+	vldmia	r4!, {q11}
+	veor	q8, q8, q4
+	veor	q9, q9, q5
+	vtbl.8	d8, {q8}, d24
+	vtbl.8	d9, {q8}, d25
+	veor	q10, q10, q6
+	vtbl.8	d10, {q9}, d24
+	vtbl.8	d11, {q9}, d25
+	veor	q11, q11, q7
+	vtbl.8	d12, {q10}, d24
+	vtbl.8	d13, {q10}, d25
+	vtbl.8	d14, {q11}, d24
+	vtbl.8	d15, {q11}, d25
+.Ldec_sbox:
+	veor	q1, q1, q4
+	veor	q3, q3, q4
+
+	veor	q4, q4, q7
+	veor	q1, q1, q6
+	veor	q2, q2, q7
+	veor	q6, q6, q4
+
+	veor	q0, q0, q1
+	veor	q2, q2, q5
+	veor	q7, q7, q6
+	veor	q3, q3, q0
+	veor	q5, q5, q0
+	veor	q1, q1, q3
+	veor	q11, q3, q0
+	veor	q10, q7, q4
+	veor	q9, q1, q6
+	veor	q13, q4, q0
+	vmov	q8, q10
+	veor	q12, q5, q2
+
+	vorr	q10, q10, q9
+	veor	q15, q11, q8
+	vand	q14, q11, q12
+	vorr	q11, q11, q12
+	veor	q12, q12, q9
+	vand	q8, q8, q9
+	veor	q9, q6, q2
+	vand	q15, q15, q12
+	vand	q13, q13, q9
+	veor	q9, q3, q7
+	veor	q12, q1, q5
+	veor	q11, q11, q13
+	veor	q10, q10, q13
+	vand	q13, q9, q12
+	vorr	q9, q9, q12
+	veor	q11, q11, q15
+	veor	q8, q8, q13
+	veor	q10, q10, q14
+	veor	q9, q9, q15
+	veor	q8, q8, q14
+	vand	q12, q4, q6
+	veor	q9, q9, q14
+	vand	q13, q0, q2
+	vand	q14, q7, q1
+	vorr	q15, q3, q5
+	veor	q11, q11, q12
+	veor	q9, q9, q14
+	veor	q8, q8, q15
+	veor	q10, q10, q13
+
+	@ Inv_GF16 	0, 	1, 	2, 	3, s0, s1, s2, s3
+
+	@ new smaller inversion
+
+	vand	q14, q11, q9
+	vmov	q12, q8
+
+	veor	q13, q10, q14
+	veor	q15, q8, q14
+	veor	q14, q8, q14	@ q14=q15
+
+	vbsl	q13, q9, q8
+	vbsl	q15, q11, q10
+	veor	q11, q11, q10
+
+	vbsl	q12, q13, q14
+	vbsl	q8, q14, q13
+
+	vand	q14, q12, q15
+	veor	q9, q9, q8
+
+	veor	q14, q14, q11
+	veor	q12, q5, q2
+	veor	q8, q1, q6
+	veor	q10, q15, q14
+	vand	q10, q10, q5
+	veor	q5, q5, q1
+	vand	q11, q1, q15
+	vand	q5, q5, q14
+	veor	q1, q11, q10
+	veor	q5, q5, q11
+	veor	q15, q15, q13
+	veor	q14, q14, q9
+	veor	q11, q15, q14
+	veor	q10, q13, q9
+	vand	q11, q11, q12
+	vand	q10, q10, q2
+	veor	q12, q12, q8
+	veor	q2, q2, q6
+	vand	q8, q8, q15
+	vand	q6, q6, q13
+	vand	q12, q12, q14
+	vand	q2, q2, q9
+	veor	q8, q8, q12
+	veor	q2, q2, q6
+	veor	q12, q12, q11
+	veor	q6, q6, q10
+	veor	q5, q5, q12
+	veor	q2, q2, q12
+	veor	q1, q1, q8
+	veor	q6, q6, q8
+
+	veor	q12, q3, q0
+	veor	q8, q7, q4
+	veor	q11, q15, q14
+	veor	q10, q13, q9
+	vand	q11, q11, q12
+	vand	q10, q10, q0
+	veor	q12, q12, q8
+	veor	q0, q0, q4
+	vand	q8, q8, q15
+	vand	q4, q4, q13
+	vand	q12, q12, q14
+	vand	q0, q0, q9
+	veor	q8, q8, q12
+	veor	q0, q0, q4
+	veor	q12, q12, q11
+	veor	q4, q4, q10
+	veor	q15, q15, q13
+	veor	q14, q14, q9
+	veor	q10, q15, q14
+	vand	q10, q10, q3
+	veor	q3, q3, q7
+	vand	q11, q7, q15
+	vand	q3, q3, q14
+	veor	q7, q11, q10
+	veor	q3, q3, q11
+	veor	q3, q3, q12
+	veor	q0, q0, q12
+	veor	q7, q7, q8
+	veor	q4, q4, q8
+	veor	q1, q1, q7
+	veor	q6, q6, q5
+
+	veor	q4, q4, q1
+	veor	q2, q2, q7
+	veor	q5, q5, q7
+	veor	q4, q4, q2
+	veor	q7, q7, q0
+	veor	q4, q4, q5
+	veor	q3, q3, q6
+	veor	q6, q6, q1
+	veor	q3, q3, q4
+
+	veor	q4, q4, q0
+	veor	q7, q7, q3
+	subs	r5,r5,#1
+	bcc	.Ldec_done
+	@ multiplication by 0x05-0x00-0x04-0x00
+	vext.8	q8, q0, q0, #8
+	vext.8	q14, q3, q3, #8
+	vext.8	q15, q5, q5, #8
+	veor	q8, q8, q0
+	vext.8	q9, q1, q1, #8
+	veor	q14, q14, q3
+	vext.8	q10, q6, q6, #8
+	veor	q15, q15, q5
+	vext.8	q11, q4, q4, #8
+	veor	q9, q9, q1
+	vext.8	q12, q2, q2, #8
+	veor	q10, q10, q6
+	vext.8	q13, q7, q7, #8
+	veor	q11, q11, q4
+	veor	q12, q12, q2
+	veor	q13, q13, q7
+
+	veor	q0, q0, q14
+	veor	q1, q1, q14
+	veor	q6, q6, q8
+	veor	q2, q2, q10
+	veor	q4, q4, q9
+	veor	q1, q1, q15
+	veor	q6, q6, q15
+	veor	q2, q2, q14
+	veor	q7, q7, q11
+	veor	q4, q4, q14
+	veor	q3, q3, q12
+	veor	q2, q2, q15
+	veor	q7, q7, q15
+	veor	q5, q5, q13
+	vext.8	q8, q0, q0, #12	@ x0 <<< 32
+	vext.8	q9, q1, q1, #12
+	veor	q0, q0, q8		@ x0 ^ (x0 <<< 32)
+	vext.8	q10, q6, q6, #12
+	veor	q1, q1, q9
+	vext.8	q11, q4, q4, #12
+	veor	q6, q6, q10
+	vext.8	q12, q2, q2, #12
+	veor	q4, q4, q11
+	vext.8	q13, q7, q7, #12
+	veor	q2, q2, q12
+	vext.8	q14, q3, q3, #12
+	veor	q7, q7, q13
+	vext.8	q15, q5, q5, #12
+	veor	q3, q3, q14
+
+	veor	q9, q9, q0
+	veor	q5, q5, q15
+	vext.8	q0, q0, q0, #8		@ (x0 ^ (x0 <<< 32)) <<< 64)
+	veor	q10, q10, q1
+	veor	q8, q8, q5
+	veor	q9, q9, q5
+	vext.8	q1, q1, q1, #8
+	veor	q13, q13, q2
+	veor	q0, q0, q8
+	veor	q14, q14, q7
+	veor	q1, q1, q9
+	vext.8	q8, q2, q2, #8
+	veor	q12, q12, q4
+	vext.8	q9, q7, q7, #8
+	veor	q15, q15, q3
+	vext.8	q2, q4, q4, #8
+	veor	q11, q11, q6
+	vext.8	q7, q5, q5, #8
+	veor	q12, q12, q5
+	vext.8	q4, q3, q3, #8
+	veor	q11, q11, q5
+	vext.8	q3, q6, q6, #8
+	veor	q5, q9, q13
+	veor	q11, q11, q2
+	veor	q7, q7, q15
+	veor	q6, q4, q14
+	veor	q4, q8, q12
+	veor	q2, q3, q10
+	vmov	q3, q11
+	 @ vmov	q5, q9
+	vldmia	r6, {q12}		@ .LISR
+	ite	eq				@ Thumb2 thing, sanity check in ARM
+	addeq	r6,r6,#0x10
+	bne	.Ldec_loop
+	vldmia	r6, {q12}		@ .LISRM0
+	b	.Ldec_loop
+.align	4
+.Ldec_done:
+	vmov.i8	q8,#0x55			@ compose .LBS0
+	vmov.i8	q9,#0x33			@ compose .LBS1
+	vshr.u64	q10, q3, #1
+	vshr.u64	q11, q2, #1
+	veor	q10, q10, q5
+	veor	q11, q11, q7
+	vand	q10, q10, q8
+	vand	q11, q11, q8
+	veor	q5, q5, q10
+	vshl.u64	q10, q10, #1
+	veor	q7, q7, q11
+	vshl.u64	q11, q11, #1
+	veor	q3, q3, q10
+	veor	q2, q2, q11
+	vshr.u64	q10, q6, #1
+	vshr.u64	q11, q0, #1
+	veor	q10, q10, q4
+	veor	q11, q11, q1
+	vand	q10, q10, q8
+	vand	q11, q11, q8
+	veor	q4, q4, q10
+	vshl.u64	q10, q10, #1
+	veor	q1, q1, q11
+	vshl.u64	q11, q11, #1
+	veor	q6, q6, q10
+	veor	q0, q0, q11
+	vmov.i8	q8,#0x0f			@ compose .LBS2
+	vshr.u64	q10, q7, #2
+	vshr.u64	q11, q2, #2
+	veor	q10, q10, q5
+	veor	q11, q11, q3
+	vand	q10, q10, q9
+	vand	q11, q11, q9
+	veor	q5, q5, q10
+	vshl.u64	q10, q10, #2
+	veor	q3, q3, q11
+	vshl.u64	q11, q11, #2
+	veor	q7, q7, q10
+	veor	q2, q2, q11
+	vshr.u64	q10, q1, #2
+	vshr.u64	q11, q0, #2
+	veor	q10, q10, q4
+	veor	q11, q11, q6
+	vand	q10, q10, q9
+	vand	q11, q11, q9
+	veor	q4, q4, q10
+	vshl.u64	q10, q10, #2
+	veor	q6, q6, q11
+	vshl.u64	q11, q11, #2
+	veor	q1, q1, q10
+	veor	q0, q0, q11
+	vshr.u64	q10, q4, #4
+	vshr.u64	q11, q6, #4
+	veor	q10, q10, q5
+	veor	q11, q11, q3
+	vand	q10, q10, q8
+	vand	q11, q11, q8
+	veor	q5, q5, q10
+	vshl.u64	q10, q10, #4
+	veor	q3, q3, q11
+	vshl.u64	q11, q11, #4
+	veor	q4, q4, q10
+	veor	q6, q6, q11
+	vshr.u64	q10, q1, #4
+	vshr.u64	q11, q0, #4
+	veor	q10, q10, q7
+	veor	q11, q11, q2
+	vand	q10, q10, q8
+	vand	q11, q11, q8
+	veor	q7, q7, q10
+	vshl.u64	q10, q10, #4
+	veor	q2, q2, q11
+	vshl.u64	q11, q11, #4
+	veor	q1, q1, q10
+	veor	q0, q0, q11
+	vldmia	r4, {q8}			@ last round key
+	veor	q6, q6, q8
+	veor	q4, q4, q8
+	veor	q2, q2, q8
+	veor	q7, q7, q8
+	veor	q3, q3, q8
+	veor	q5, q5, q8
+	veor	q0, q0, q8
+	veor	q1, q1, q8
+	bx	lr
+.size	_bsaes_decrypt8,.-_bsaes_decrypt8
+
+.type	_bsaes_const,%object
+.align	6
+_bsaes_const:
+.LM0ISR:@ InvShiftRows constants
+.quad	0x0a0e0206070b0f03, 0x0004080c0d010509
+.LISR:
+.quad	0x0504070602010003, 0x0f0e0d0c080b0a09
+.LISRM0:
+.quad	0x01040b0e0205080f, 0x0306090c00070a0d
+.LM0SR:@ ShiftRows constants
+.quad	0x0a0e02060f03070b, 0x0004080c05090d01
+.LSR:
+.quad	0x0504070600030201, 0x0f0e0d0c0a09080b
+.LSRM0:
+.quad	0x0304090e00050a0f, 0x01060b0c0207080d
+.LM0:
+.quad	0x02060a0e03070b0f, 0x0004080c0105090d
+.LREVM0SR:
+.quad	0x090d01050c000408, 0x03070b0f060a0e02
+.byte	66,105,116,45,115,108,105,99,101,100,32,65,69,83,32,102,111,114,32,78,69,79,78,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0
+.align	2
+.align	6
+.size	_bsaes_const,.-_bsaes_const
+
+.type	_bsaes_encrypt8,%function
+.align	4
+_bsaes_encrypt8:
+	adr	r6,.
+	vldmia	r4!, {q9}		@ round 0 key
+#if defined(__thumb2__) || defined(__APPLE__)
+	adr	r6,.LM0SR
+#else
+	sub	r6,r6,#_bsaes_encrypt8-.LM0SR
+#endif
+
+	vldmia	r6!, {q8}		@ .LM0SR
+_bsaes_encrypt8_alt:
+	veor	q10, q0, q9	@ xor with round0 key
+	veor	q11, q1, q9
+	vtbl.8	d0, {q10}, d16
+	vtbl.8	d1, {q10}, d17
+	veor	q12, q2, q9
+	vtbl.8	d2, {q11}, d16
+	vtbl.8	d3, {q11}, d17
+	veor	q13, q3, q9
+	vtbl.8	d4, {q12}, d16
+	vtbl.8	d5, {q12}, d17
+	veor	q14, q4, q9
+	vtbl.8	d6, {q13}, d16
+	vtbl.8	d7, {q13}, d17
+	veor	q15, q5, q9
+	vtbl.8	d8, {q14}, d16
+	vtbl.8	d9, {q14}, d17
+	veor	q10, q6, q9
+	vtbl.8	d10, {q15}, d16
+	vtbl.8	d11, {q15}, d17
+	veor	q11, q7, q9
+	vtbl.8	d12, {q10}, d16
+	vtbl.8	d13, {q10}, d17
+	vtbl.8	d14, {q11}, d16
+	vtbl.8	d15, {q11}, d17
+_bsaes_encrypt8_bitslice:
+	vmov.i8	q8,#0x55			@ compose .LBS0
+	vmov.i8	q9,#0x33			@ compose .LBS1
+	vshr.u64	q10, q6, #1
+	vshr.u64	q11, q4, #1
+	veor	q10, q10, q7
+	veor	q11, q11, q5
+	vand	q10, q10, q8
+	vand	q11, q11, q8
+	veor	q7, q7, q10
+	vshl.u64	q10, q10, #1
+	veor	q5, q5, q11
+	vshl.u64	q11, q11, #1
+	veor	q6, q6, q10
+	veor	q4, q4, q11
+	vshr.u64	q10, q2, #1
+	vshr.u64	q11, q0, #1
+	veor	q10, q10, q3
+	veor	q11, q11, q1
+	vand	q10, q10, q8
+	vand	q11, q11, q8
+	veor	q3, q3, q10
+	vshl.u64	q10, q10, #1
+	veor	q1, q1, q11
+	vshl.u64	q11, q11, #1
+	veor	q2, q2, q10
+	veor	q0, q0, q11
+	vmov.i8	q8,#0x0f			@ compose .LBS2
+	vshr.u64	q10, q5, #2
+	vshr.u64	q11, q4, #2
+	veor	q10, q10, q7
+	veor	q11, q11, q6
+	vand	q10, q10, q9
+	vand	q11, q11, q9
+	veor	q7, q7, q10
+	vshl.u64	q10, q10, #2
+	veor	q6, q6, q11
+	vshl.u64	q11, q11, #2
+	veor	q5, q5, q10
+	veor	q4, q4, q11
+	vshr.u64	q10, q1, #2
+	vshr.u64	q11, q0, #2
+	veor	q10, q10, q3
+	veor	q11, q11, q2
+	vand	q10, q10, q9
+	vand	q11, q11, q9
+	veor	q3, q3, q10
+	vshl.u64	q10, q10, #2
+	veor	q2, q2, q11
+	vshl.u64	q11, q11, #2
+	veor	q1, q1, q10
+	veor	q0, q0, q11
+	vshr.u64	q10, q3, #4
+	vshr.u64	q11, q2, #4
+	veor	q10, q10, q7
+	veor	q11, q11, q6
+	vand	q10, q10, q8
+	vand	q11, q11, q8
+	veor	q7, q7, q10
+	vshl.u64	q10, q10, #4
+	veor	q6, q6, q11
+	vshl.u64	q11, q11, #4
+	veor	q3, q3, q10
+	veor	q2, q2, q11
+	vshr.u64	q10, q1, #4
+	vshr.u64	q11, q0, #4
+	veor	q10, q10, q5
+	veor	q11, q11, q4
+	vand	q10, q10, q8
+	vand	q11, q11, q8
+	veor	q5, q5, q10
+	vshl.u64	q10, q10, #4
+	veor	q4, q4, q11
+	vshl.u64	q11, q11, #4
+	veor	q1, q1, q10
+	veor	q0, q0, q11
+	sub	r5,r5,#1
+	b	.Lenc_sbox
+.align	4
+.Lenc_loop:
+	vldmia	r4!, {q8,q9,q10,q11}
+	veor	q8, q8, q0
+	veor	q9, q9, q1
+	vtbl.8	d0, {q8}, d24
+	vtbl.8	d1, {q8}, d25
+	vldmia	r4!, {q8}
+	veor	q10, q10, q2
+	vtbl.8	d2, {q9}, d24
+	vtbl.8	d3, {q9}, d25
+	vldmia	r4!, {q9}
+	veor	q11, q11, q3
+	vtbl.8	d4, {q10}, d24
+	vtbl.8	d5, {q10}, d25
+	vldmia	r4!, {q10}
+	vtbl.8	d6, {q11}, d24
+	vtbl.8	d7, {q11}, d25
+	vldmia	r4!, {q11}
+	veor	q8, q8, q4
+	veor	q9, q9, q5
+	vtbl.8	d8, {q8}, d24
+	vtbl.8	d9, {q8}, d25
+	veor	q10, q10, q6
+	vtbl.8	d10, {q9}, d24
+	vtbl.8	d11, {q9}, d25
+	veor	q11, q11, q7
+	vtbl.8	d12, {q10}, d24
+	vtbl.8	d13, {q10}, d25
+	vtbl.8	d14, {q11}, d24
+	vtbl.8	d15, {q11}, d25
+.Lenc_sbox:
+	veor	q2, q2, q1
+	veor	q5, q5, q6
+	veor	q3, q3, q0
+	veor	q6, q6, q2
+	veor	q5, q5, q0
+
+	veor	q6, q6, q3
+	veor	q3, q3, q7
+	veor	q7, q7, q5
+	veor	q3, q3, q4
+	veor	q4, q4, q5
+
+	veor	q2, q2, q7
+	veor	q3, q3, q1
+	veor	q1, q1, q5
+	veor	q11, q7, q4
+	veor	q10, q1, q2
+	veor	q9, q5, q3
+	veor	q13, q2, q4
+	vmov	q8, q10
+	veor	q12, q6, q0
+
+	vorr	q10, q10, q9
+	veor	q15, q11, q8
+	vand	q14, q11, q12
+	vorr	q11, q11, q12
+	veor	q12, q12, q9
+	vand	q8, q8, q9
+	veor	q9, q3, q0
+	vand	q15, q15, q12
+	vand	q13, q13, q9
+	veor	q9, q7, q1
+	veor	q12, q5, q6
+	veor	q11, q11, q13
+	veor	q10, q10, q13
+	vand	q13, q9, q12
+	vorr	q9, q9, q12
+	veor	q11, q11, q15
+	veor	q8, q8, q13
+	veor	q10, q10, q14
+	veor	q9, q9, q15
+	veor	q8, q8, q14
+	vand	q12, q2, q3
+	veor	q9, q9, q14
+	vand	q13, q4, q0
+	vand	q14, q1, q5
+	vorr	q15, q7, q6
+	veor	q11, q11, q12
+	veor	q9, q9, q14
+	veor	q8, q8, q15
+	veor	q10, q10, q13
+
+	@ Inv_GF16 	0, 	1, 	2, 	3, s0, s1, s2, s3
+
+	@ new smaller inversion
+
+	vand	q14, q11, q9
+	vmov	q12, q8
+
+	veor	q13, q10, q14
+	veor	q15, q8, q14
+	veor	q14, q8, q14	@ q14=q15
+
+	vbsl	q13, q9, q8
+	vbsl	q15, q11, q10
+	veor	q11, q11, q10
+
+	vbsl	q12, q13, q14
+	vbsl	q8, q14, q13
+
+	vand	q14, q12, q15
+	veor	q9, q9, q8
+
+	veor	q14, q14, q11
+	veor	q12, q6, q0
+	veor	q8, q5, q3
+	veor	q10, q15, q14
+	vand	q10, q10, q6
+	veor	q6, q6, q5
+	vand	q11, q5, q15
+	vand	q6, q6, q14
+	veor	q5, q11, q10
+	veor	q6, q6, q11
+	veor	q15, q15, q13
+	veor	q14, q14, q9
+	veor	q11, q15, q14
+	veor	q10, q13, q9
+	vand	q11, q11, q12
+	vand	q10, q10, q0
+	veor	q12, q12, q8
+	veor	q0, q0, q3
+	vand	q8, q8, q15
+	vand	q3, q3, q13
+	vand	q12, q12, q14
+	vand	q0, q0, q9
+	veor	q8, q8, q12
+	veor	q0, q0, q3
+	veor	q12, q12, q11
+	veor	q3, q3, q10
+	veor	q6, q6, q12
+	veor	q0, q0, q12
+	veor	q5, q5, q8
+	veor	q3, q3, q8
+
+	veor	q12, q7, q4
+	veor	q8, q1, q2
+	veor	q11, q15, q14
+	veor	q10, q13, q9
+	vand	q11, q11, q12
+	vand	q10, q10, q4
+	veor	q12, q12, q8
+	veor	q4, q4, q2
+	vand	q8, q8, q15
+	vand	q2, q2, q13
+	vand	q12, q12, q14
+	vand	q4, q4, q9
+	veor	q8, q8, q12
+	veor	q4, q4, q2
+	veor	q12, q12, q11
+	veor	q2, q2, q10
+	veor	q15, q15, q13
+	veor	q14, q14, q9
+	veor	q10, q15, q14
+	vand	q10, q10, q7
+	veor	q7, q7, q1
+	vand	q11, q1, q15
+	vand	q7, q7, q14
+	veor	q1, q11, q10
+	veor	q7, q7, q11
+	veor	q7, q7, q12
+	veor	q4, q4, q12
+	veor	q1, q1, q8
+	veor	q2, q2, q8
+	veor	q7, q7, q0
+	veor	q1, q1, q6
+	veor	q6, q6, q0
+	veor	q4, q4, q7
+	veor	q0, q0, q1
+
+	veor	q1, q1, q5
+	veor	q5, q5, q2
+	veor	q2, q2, q3
+	veor	q3, q3, q5
+	veor	q4, q4, q5
+
+	veor	q6, q6, q3
+	subs	r5,r5,#1
+	bcc	.Lenc_done
+	vext.8	q8, q0, q0, #12	@ x0 <<< 32
+	vext.8	q9, q1, q1, #12
+	veor	q0, q0, q8		@ x0 ^ (x0 <<< 32)
+	vext.8	q10, q4, q4, #12
+	veor	q1, q1, q9
+	vext.8	q11, q6, q6, #12
+	veor	q4, q4, q10
+	vext.8	q12, q3, q3, #12
+	veor	q6, q6, q11
+	vext.8	q13, q7, q7, #12
+	veor	q3, q3, q12
+	vext.8	q14, q2, q2, #12
+	veor	q7, q7, q13
+	vext.8	q15, q5, q5, #12
+	veor	q2, q2, q14
+
+	veor	q9, q9, q0
+	veor	q5, q5, q15
+	vext.8	q0, q0, q0, #8		@ (x0 ^ (x0 <<< 32)) <<< 64
+	veor	q10, q10, q1
+	veor	q8, q8, q5
+	veor	q9, q9, q5
+	vext.8	q1, q1, q1, #8
+	veor	q13, q13, q3
+	veor	q0, q0, q8
+	veor	q14, q14, q7
+	veor	q1, q1, q9
+	vext.8	q8, q3, q3, #8
+	veor	q12, q12, q6
+	vext.8	q9, q7, q7, #8
+	veor	q15, q15, q2
+	vext.8	q3, q6, q6, #8
+	veor	q11, q11, q4
+	vext.8	q7, q5, q5, #8
+	veor	q12, q12, q5
+	vext.8	q6, q2, q2, #8
+	veor	q11, q11, q5
+	vext.8	q2, q4, q4, #8
+	veor	q5, q9, q13
+	veor	q4, q8, q12
+	veor	q3, q3, q11
+	veor	q7, q7, q15
+	veor	q6, q6, q14
+	 @ vmov	q4, q8
+	veor	q2, q2, q10
+	 @ vmov	q5, q9
+	vldmia	r6, {q12}		@ .LSR
+	ite	eq				@ Thumb2 thing, sanity check in ARM
+	addeq	r6,r6,#0x10
+	bne	.Lenc_loop
+	vldmia	r6, {q12}		@ .LSRM0
+	b	.Lenc_loop
+.align	4
+.Lenc_done:
+	vmov.i8	q8,#0x55			@ compose .LBS0
+	vmov.i8	q9,#0x33			@ compose .LBS1
+	vshr.u64	q10, q2, #1
+	vshr.u64	q11, q3, #1
+	veor	q10, q10, q5
+	veor	q11, q11, q7
+	vand	q10, q10, q8
+	vand	q11, q11, q8
+	veor	q5, q5, q10
+	vshl.u64	q10, q10, #1
+	veor	q7, q7, q11
+	vshl.u64	q11, q11, #1
+	veor	q2, q2, q10
+	veor	q3, q3, q11
+	vshr.u64	q10, q4, #1
+	vshr.u64	q11, q0, #1
+	veor	q10, q10, q6
+	veor	q11, q11, q1
+	vand	q10, q10, q8
+	vand	q11, q11, q8
+	veor	q6, q6, q10
+	vshl.u64	q10, q10, #1
+	veor	q1, q1, q11
+	vshl.u64	q11, q11, #1
+	veor	q4, q4, q10
+	veor	q0, q0, q11
+	vmov.i8	q8,#0x0f			@ compose .LBS2
+	vshr.u64	q10, q7, #2
+	vshr.u64	q11, q3, #2
+	veor	q10, q10, q5
+	veor	q11, q11, q2
+	vand	q10, q10, q9
+	vand	q11, q11, q9
+	veor	q5, q5, q10
+	vshl.u64	q10, q10, #2
+	veor	q2, q2, q11
+	vshl.u64	q11, q11, #2
+	veor	q7, q7, q10
+	veor	q3, q3, q11
+	vshr.u64	q10, q1, #2
+	vshr.u64	q11, q0, #2
+	veor	q10, q10, q6
+	veor	q11, q11, q4
+	vand	q10, q10, q9
+	vand	q11, q11, q9
+	veor	q6, q6, q10
+	vshl.u64	q10, q10, #2
+	veor	q4, q4, q11
+	vshl.u64	q11, q11, #2
+	veor	q1, q1, q10
+	veor	q0, q0, q11
+	vshr.u64	q10, q6, #4
+	vshr.u64	q11, q4, #4
+	veor	q10, q10, q5
+	veor	q11, q11, q2
+	vand	q10, q10, q8
+	vand	q11, q11, q8
+	veor	q5, q5, q10
+	vshl.u64	q10, q10, #4
+	veor	q2, q2, q11
+	vshl.u64	q11, q11, #4
+	veor	q6, q6, q10
+	veor	q4, q4, q11
+	vshr.u64	q10, q1, #4
+	vshr.u64	q11, q0, #4
+	veor	q10, q10, q7
+	veor	q11, q11, q3
+	vand	q10, q10, q8
+	vand	q11, q11, q8
+	veor	q7, q7, q10
+	vshl.u64	q10, q10, #4
+	veor	q3, q3, q11
+	vshl.u64	q11, q11, #4
+	veor	q1, q1, q10
+	veor	q0, q0, q11
+	vldmia	r4, {q8}			@ last round key
+	veor	q4, q4, q8
+	veor	q6, q6, q8
+	veor	q3, q3, q8
+	veor	q7, q7, q8
+	veor	q2, q2, q8
+	veor	q5, q5, q8
+	veor	q0, q0, q8
+	veor	q1, q1, q8
+	bx	lr
+.size	_bsaes_encrypt8,.-_bsaes_encrypt8
+.type	_bsaes_key_convert,%function
+.align	4
+_bsaes_key_convert:
+	adr	r6,.
+	vld1.8	{q7},  [r4]!		@ load round 0 key
+#if defined(__thumb2__) || defined(__APPLE__)
+	adr	r6,.LM0
+#else
+	sub	r6,r6,#_bsaes_key_convert-.LM0
+#endif
+	vld1.8	{q15}, [r4]!		@ load round 1 key
+
+	vmov.i8	q8,  #0x01			@ bit masks
+	vmov.i8	q9,  #0x02
+	vmov.i8	q10, #0x04
+	vmov.i8	q11, #0x08
+	vmov.i8	q12, #0x10
+	vmov.i8	q13, #0x20
+	vldmia	r6, {q14}		@ .LM0
+
+#ifdef __ARMEL__
+	vrev32.8	q7,  q7
+	vrev32.8	q15, q15
+#endif
+	sub	r5,r5,#1
+	vstmia	r12!, {q7}		@ save round 0 key
+	b	.Lkey_loop
+
+.align	4
+.Lkey_loop:
+	vtbl.8	d14,{q15},d28
+	vtbl.8	d15,{q15},d29
+	vmov.i8	q6,  #0x40
+	vmov.i8	q15, #0x80
+
+	vtst.8	q0, q7, q8
+	vtst.8	q1, q7, q9
+	vtst.8	q2, q7, q10
+	vtst.8	q3, q7, q11
+	vtst.8	q4, q7, q12
+	vtst.8	q5, q7, q13
+	vtst.8	q6, q7, q6
+	vtst.8	q7, q7, q15
+	vld1.8	{q15}, [r4]!		@ load next round key
+	vmvn	q0, q0		@ "pnot"
+	vmvn	q1, q1
+	vmvn	q5, q5
+	vmvn	q6, q6
+#ifdef __ARMEL__
+	vrev32.8	q15, q15
+#endif
+	subs	r5,r5,#1
+	vstmia	r12!,{q0,q1,q2,q3,q4,q5,q6,q7}		@ write bit-sliced round key
+	bne	.Lkey_loop
+
+	vmov.i8	q7,#0x63			@ compose .L63
+	@ don't save last round key
+	bx	lr
+.size	_bsaes_key_convert,.-_bsaes_key_convert
+.globl	bsaes_cbc_encrypt
+.hidden	bsaes_cbc_encrypt
+.type	bsaes_cbc_encrypt,%function
+.align	5
+bsaes_cbc_encrypt:
+	@ In OpenSSL, this function had a fallback to aes_nohw_cbc_encrypt for
+	@ short inputs. We patch this out, using bsaes for all input sizes.
+
+	@ it is up to the caller to make sure we are called with enc == 0
+
+	mov	ip, sp
+	stmdb	sp!, {r4,r5,r6,r7,r8,r9,r10, lr}
+	VFP_ABI_PUSH
+	ldr	r8, [ip]			@ IV is 1st arg on the stack
+	mov	r2, r2, lsr#4		@ len in 16 byte blocks
+	sub	sp, #0x10			@ scratch space to carry over the IV
+	mov	r9, sp				@ save sp
+
+	ldr	r10, [r3, #240]		@ get # of rounds
+#ifndef	BSAES_ASM_EXTENDED_KEY
+	@ allocate the key schedule on the stack
+	sub	r12, sp, r10, lsl#7		@ 128 bytes per inner round key
+	add	r12, #96			@ size of bit-sliced key schedule
+
+	@ populate the key schedule
+	mov	r4, r3			@ pass key
+	mov	r5, r10			@ pass # of rounds
+	mov	sp, r12				@ sp points at key schedule
+	bl	_bsaes_key_convert
+	vldmia	sp, {q6}
+	vstmia	r12,  {q15}		@ save last round key
+	veor	q7, q7, q6	@ fix up round 0 key
+	vstmia	sp, {q7}
+#else
+	ldr	r12, [r3, #244]
+	eors	r12, #1
+	beq	0f
+
+	@ populate the key schedule
+	str	r12, [r3, #244]
+	mov	r4, r3			@ pass key
+	mov	r5, r10			@ pass # of rounds
+	add	r12, r3, #248			@ pass key schedule
+	bl	_bsaes_key_convert
+	add	r4, r3, #248
+	vldmia	r4, {q6}
+	vstmia	r12, {q15}			@ save last round key
+	veor	q7, q7, q6	@ fix up round 0 key
+	vstmia	r4, {q7}
+
+.align	2
+
+#endif
+
+	vld1.8	{q15}, [r8]		@ load IV
+	b	.Lcbc_dec_loop
+
+.align	4
+.Lcbc_dec_loop:
+	subs	r2, r2, #0x8
+	bmi	.Lcbc_dec_loop_finish
+
+	vld1.8	{q0,q1}, [r0]!	@ load input
+	vld1.8	{q2,q3}, [r0]!
+#ifndef	BSAES_ASM_EXTENDED_KEY
+	mov	r4, sp			@ pass the key
+#else
+	add	r4, r3, #248
+#endif
+	vld1.8	{q4,q5}, [r0]!
+	mov	r5, r10
+	vld1.8	{q6,q7}, [r0]
+	sub	r0, r0, #0x60
+	vstmia	r9, {q15}			@ put aside IV
+
+	bl	_bsaes_decrypt8
+
+	vldmia	r9, {q14}			@ reload IV
+	vld1.8	{q8,q9}, [r0]!	@ reload input
+	veor	q0, q0, q14	@ ^= IV
+	vld1.8	{q10,q11}, [r0]!
+	veor	q1, q1, q8
+	veor	q6, q6, q9
+	vld1.8	{q12,q13}, [r0]!
+	veor	q4, q4, q10
+	veor	q2, q2, q11
+	vld1.8	{q14,q15}, [r0]!
+	veor	q7, q7, q12
+	vst1.8	{q0,q1}, [r1]!	@ write output
+	veor	q3, q3, q13
+	vst1.8	{q6}, [r1]!
+	veor	q5, q5, q14
+	vst1.8	{q4}, [r1]!
+	vst1.8	{q2}, [r1]!
+	vst1.8	{q7}, [r1]!
+	vst1.8	{q3}, [r1]!
+	vst1.8	{q5}, [r1]!
+
+	b	.Lcbc_dec_loop
+
+.Lcbc_dec_loop_finish:
+	adds	r2, r2, #8
+	beq	.Lcbc_dec_done
+
+	@ Set up most parameters for the _bsaes_decrypt8 call.
+#ifndef	BSAES_ASM_EXTENDED_KEY
+	mov	r4, sp			@ pass the key
+#else
+	add	r4, r3, #248
+#endif
+	mov	r5, r10
+	vstmia	r9, {q15}			@ put aside IV
+
+	vld1.8	{q0}, [r0]!		@ load input
+	cmp	r2, #2
+	blo	.Lcbc_dec_one
+	vld1.8	{q1}, [r0]!
+	beq	.Lcbc_dec_two
+	vld1.8	{q2}, [r0]!
+	cmp	r2, #4
+	blo	.Lcbc_dec_three
+	vld1.8	{q3}, [r0]!
+	beq	.Lcbc_dec_four
+	vld1.8	{q4}, [r0]!
+	cmp	r2, #6
+	blo	.Lcbc_dec_five
+	vld1.8	{q5}, [r0]!
+	beq	.Lcbc_dec_six
+	vld1.8	{q6}, [r0]!
+	sub	r0, r0, #0x70
+
+	bl	_bsaes_decrypt8
+
+	vldmia	r9, {q14}			@ reload IV
+	vld1.8	{q8,q9}, [r0]!	@ reload input
+	veor	q0, q0, q14	@ ^= IV
+	vld1.8	{q10,q11}, [r0]!
+	veor	q1, q1, q8
+	veor	q6, q6, q9
+	vld1.8	{q12,q13}, [r0]!
+	veor	q4, q4, q10
+	veor	q2, q2, q11
+	vld1.8	{q15}, [r0]!
+	veor	q7, q7, q12
+	vst1.8	{q0,q1}, [r1]!	@ write output
+	veor	q3, q3, q13
+	vst1.8	{q6}, [r1]!
+	vst1.8	{q4}, [r1]!
+	vst1.8	{q2}, [r1]!
+	vst1.8	{q7}, [r1]!
+	vst1.8	{q3}, [r1]!
+	b	.Lcbc_dec_done
+.align	4
+.Lcbc_dec_six:
+	sub	r0, r0, #0x60
+	bl	_bsaes_decrypt8
+	vldmia	r9,{q14}			@ reload IV
+	vld1.8	{q8,q9}, [r0]!	@ reload input
+	veor	q0, q0, q14	@ ^= IV
+	vld1.8	{q10,q11}, [r0]!
+	veor	q1, q1, q8
+	veor	q6, q6, q9
+	vld1.8	{q12}, [r0]!
+	veor	q4, q4, q10
+	veor	q2, q2, q11
+	vld1.8	{q15}, [r0]!
+	veor	q7, q7, q12
+	vst1.8	{q0,q1}, [r1]!	@ write output
+	vst1.8	{q6}, [r1]!
+	vst1.8	{q4}, [r1]!
+	vst1.8	{q2}, [r1]!
+	vst1.8	{q7}, [r1]!
+	b	.Lcbc_dec_done
+.align	4
+.Lcbc_dec_five:
+	sub	r0, r0, #0x50
+	bl	_bsaes_decrypt8
+	vldmia	r9, {q14}			@ reload IV
+	vld1.8	{q8,q9}, [r0]!	@ reload input
+	veor	q0, q0, q14	@ ^= IV
+	vld1.8	{q10,q11}, [r0]!
+	veor	q1, q1, q8
+	veor	q6, q6, q9
+	vld1.8	{q15}, [r0]!
+	veor	q4, q4, q10
+	vst1.8	{q0,q1}, [r1]!	@ write output
+	veor	q2, q2, q11
+	vst1.8	{q6}, [r1]!
+	vst1.8	{q4}, [r1]!
+	vst1.8	{q2}, [r1]!
+	b	.Lcbc_dec_done
+.align	4
+.Lcbc_dec_four:
+	sub	r0, r0, #0x40
+	bl	_bsaes_decrypt8
+	vldmia	r9, {q14}			@ reload IV
+	vld1.8	{q8,q9}, [r0]!	@ reload input
+	veor	q0, q0, q14	@ ^= IV
+	vld1.8	{q10}, [r0]!
+	veor	q1, q1, q8
+	veor	q6, q6, q9
+	vld1.8	{q15}, [r0]!
+	veor	q4, q4, q10
+	vst1.8	{q0,q1}, [r1]!	@ write output
+	vst1.8	{q6}, [r1]!
+	vst1.8	{q4}, [r1]!
+	b	.Lcbc_dec_done
+.align	4
+.Lcbc_dec_three:
+	sub	r0, r0, #0x30
+	bl	_bsaes_decrypt8
+	vldmia	r9, {q14}			@ reload IV
+	vld1.8	{q8,q9}, [r0]!	@ reload input
+	veor	q0, q0, q14	@ ^= IV
+	vld1.8	{q15}, [r0]!
+	veor	q1, q1, q8
+	veor	q6, q6, q9
+	vst1.8	{q0,q1}, [r1]!	@ write output
+	vst1.8	{q6}, [r1]!
+	b	.Lcbc_dec_done
+.align	4
+.Lcbc_dec_two:
+	sub	r0, r0, #0x20
+	bl	_bsaes_decrypt8
+	vldmia	r9, {q14}			@ reload IV
+	vld1.8	{q8}, [r0]!		@ reload input
+	veor	q0, q0, q14	@ ^= IV
+	vld1.8	{q15}, [r0]!		@ reload input
+	veor	q1, q1, q8
+	vst1.8	{q0,q1}, [r1]!	@ write output
+	b	.Lcbc_dec_done
+.align	4
+.Lcbc_dec_one:
+	sub	r0, r0, #0x10
+	bl	_bsaes_decrypt8
+	vldmia	r9, {q14}			@ reload IV
+	vld1.8	{q15}, [r0]!		@ reload input
+	veor	q0, q0, q14	@ ^= IV
+	vst1.8	{q0}, [r1]!		@ write output
+
+.Lcbc_dec_done:
+#ifndef	BSAES_ASM_EXTENDED_KEY
+	vmov.i32	q0, #0
+	vmov.i32	q1, #0
+.Lcbc_dec_bzero:@ wipe key schedule [if any]
+	vstmia	sp!, {q0,q1}
+	cmp	sp, r9
+	bne	.Lcbc_dec_bzero
+#endif
+
+	mov	sp, r9
+	add	sp, #0x10			@ add sp,r9,#0x10 is no good for thumb
+	vst1.8	{q15}, [r8]		@ return IV
+	VFP_ABI_POP
+	ldmia	sp!, {r4,r5,r6,r7,r8,r9,r10, pc}
+.size	bsaes_cbc_encrypt,.-bsaes_cbc_encrypt
+.globl	bsaes_ctr32_encrypt_blocks
+.hidden	bsaes_ctr32_encrypt_blocks
+.type	bsaes_ctr32_encrypt_blocks,%function
+.align	5
+bsaes_ctr32_encrypt_blocks:
+	@ In OpenSSL, short inputs fall back to aes_nohw_* here. We patch this
+	@ out to retain a constant-time implementation.
+	mov	ip, sp
+	stmdb	sp!, {r4,r5,r6,r7,r8,r9,r10, lr}
+	VFP_ABI_PUSH
+	ldr	r8, [ip]			@ ctr is 1st arg on the stack
+	sub	sp, sp, #0x10			@ scratch space to carry over the ctr
+	mov	r9, sp				@ save sp
+
+	ldr	r10, [r3, #240]		@ get # of rounds
+#ifndef	BSAES_ASM_EXTENDED_KEY
+	@ allocate the key schedule on the stack
+	sub	r12, sp, r10, lsl#7		@ 128 bytes per inner round key
+	add	r12, #96			@ size of bit-sliced key schedule
+
+	@ populate the key schedule
+	mov	r4, r3			@ pass key
+	mov	r5, r10			@ pass # of rounds
+	mov	sp, r12				@ sp points at key schedule
+	bl	_bsaes_key_convert
+	veor	q7,q7,q15	@ fix up last round key
+	vstmia	r12, {q7}			@ save last round key
+
+	vld1.8	{q0}, [r8]		@ load counter
+#ifdef	__APPLE__
+	mov	r8, #:lower16:(.LREVM0SR-.LM0)
+	add	r8, r6, r8
+#else
+	add	r8, r6, #.LREVM0SR-.LM0	@ borrow r8
+#endif
+	vldmia	sp, {q4}		@ load round0 key
+#else
+	ldr	r12, [r3, #244]
+	eors	r12, #1
+	beq	0f
+
+	@ populate the key schedule
+	str	r12, [r3, #244]
+	mov	r4, r3			@ pass key
+	mov	r5, r10			@ pass # of rounds
+	add	r12, r3, #248			@ pass key schedule
+	bl	_bsaes_key_convert
+	veor	q7,q7,q15	@ fix up last round key
+	vstmia	r12, {q7}			@ save last round key
+
+.align	2
+	add	r12, r3, #248
+	vld1.8	{q0}, [r8]		@ load counter
+	adrl	r8, .LREVM0SR			@ borrow r8
+	vldmia	r12, {q4}			@ load round0 key
+	sub	sp, #0x10			@ place for adjusted round0 key
+#endif
+
+	vmov.i32	q8,#1		@ compose 1<<96
+	veor	q9,q9,q9
+	vrev32.8	q0,q0
+	vext.8	q8,q9,q8,#4
+	vrev32.8	q4,q4
+	vadd.u32	q9,q8,q8	@ compose 2<<96
+	vstmia	sp, {q4}		@ save adjusted round0 key
+	b	.Lctr_enc_loop
+
+.align	4
+.Lctr_enc_loop:
+	vadd.u32	q10, q8, q9	@ compose 3<<96
+	vadd.u32	q1, q0, q8	@ +1
+	vadd.u32	q2, q0, q9	@ +2
+	vadd.u32	q3, q0, q10	@ +3
+	vadd.u32	q4, q1, q10
+	vadd.u32	q5, q2, q10
+	vadd.u32	q6, q3, q10
+	vadd.u32	q7, q4, q10
+	vadd.u32	q10, q5, q10	@ next counter
+
+	@ Borrow prologue from _bsaes_encrypt8 to use the opportunity
+	@ to flip byte order in 32-bit counter
+
+	vldmia	sp, {q9}		@ load round0 key
+#ifndef	BSAES_ASM_EXTENDED_KEY
+	add	r4, sp, #0x10		@ pass next round key
+#else
+	add	r4, r3, #264
+#endif
+	vldmia	r8, {q8}			@ .LREVM0SR
+	mov	r5, r10			@ pass rounds
+	vstmia	r9, {q10}			@ save next counter
+#ifdef	__APPLE__
+	mov	r6, #:lower16:(.LREVM0SR-.LSR)
+	sub	r6, r8, r6
+#else
+	sub	r6, r8, #.LREVM0SR-.LSR	@ pass constants
+#endif
+
+	bl	_bsaes_encrypt8_alt
+
+	subs	r2, r2, #8
+	blo	.Lctr_enc_loop_done
+
+	vld1.8	{q8,q9}, [r0]!	@ load input
+	vld1.8	{q10,q11}, [r0]!
+	veor	q0, q8
+	veor	q1, q9
+	vld1.8	{q12,q13}, [r0]!
+	veor	q4, q10
+	veor	q6, q11
+	vld1.8	{q14,q15}, [r0]!
+	veor	q3, q12
+	vst1.8	{q0,q1}, [r1]!	@ write output
+	veor	q7, q13
+	veor	q2, q14
+	vst1.8	{q4}, [r1]!
+	veor	q5, q15
+	vst1.8	{q6}, [r1]!
+	vmov.i32	q8, #1			@ compose 1<<96
+	vst1.8	{q3}, [r1]!
+	veor	q9, q9, q9
+	vst1.8	{q7}, [r1]!
+	vext.8	q8, q9, q8, #4
+	vst1.8	{q2}, [r1]!
+	vadd.u32	q9,q8,q8		@ compose 2<<96
+	vst1.8	{q5}, [r1]!
+	vldmia	r9, {q0}			@ load counter
+
+	bne	.Lctr_enc_loop
+	b	.Lctr_enc_done
+
+.align	4
+.Lctr_enc_loop_done:
+	add	r2, r2, #8
+	vld1.8	{q8}, [r0]!	@ load input
+	veor	q0, q8
+	vst1.8	{q0}, [r1]!	@ write output
+	cmp	r2, #2
+	blo	.Lctr_enc_done
+	vld1.8	{q9}, [r0]!
+	veor	q1, q9
+	vst1.8	{q1}, [r1]!
+	beq	.Lctr_enc_done
+	vld1.8	{q10}, [r0]!
+	veor	q4, q10
+	vst1.8	{q4}, [r1]!
+	cmp	r2, #4
+	blo	.Lctr_enc_done
+	vld1.8	{q11}, [r0]!
+	veor	q6, q11
+	vst1.8	{q6}, [r1]!
+	beq	.Lctr_enc_done
+	vld1.8	{q12}, [r0]!
+	veor	q3, q12
+	vst1.8	{q3}, [r1]!
+	cmp	r2, #6
+	blo	.Lctr_enc_done
+	vld1.8	{q13}, [r0]!
+	veor	q7, q13
+	vst1.8	{q7}, [r1]!
+	beq	.Lctr_enc_done
+	vld1.8	{q14}, [r0]
+	veor	q2, q14
+	vst1.8	{q2}, [r1]!
+
+.Lctr_enc_done:
+	vmov.i32	q0, #0
+	vmov.i32	q1, #0
+#ifndef	BSAES_ASM_EXTENDED_KEY
+.Lctr_enc_bzero:@ wipe key schedule [if any]
+	vstmia	sp!, {q0,q1}
+	cmp	sp, r9
+	bne	.Lctr_enc_bzero
+#else
+	vstmia	sp, {q0,q1}
+#endif
+
+	mov	sp, r9
+	add	sp, #0x10		@ add sp,r9,#0x10 is no good for thumb
+	VFP_ABI_POP
+	ldmia	sp!, {r4,r5,r6,r7,r8,r9,r10, pc}	@ return
+
+	@ OpenSSL contains aes_nohw_* fallback code here. We patch this
+	@ out to retain a constant-time implementation.
+.size	bsaes_ctr32_encrypt_blocks,.-bsaes_ctr32_encrypt_blocks
+#endif
+#endif
+#endif  // !OPENSSL_NO_ASM
+.section	.note.GNU-stack,"",%progbits
--- /dev/null
+++ b/third_party/boringssl/linux-arm/crypto/fipsmodule/ghash-armv4.S
@@ -1,0 +1,255 @@
+// This file is generated from a similarly-named Perl script in the BoringSSL
+// source tree. Do not edit by hand.
+
+#if !defined(__has_feature)
+#define __has_feature(x) 0
+#endif
+#if __has_feature(memory_sanitizer) && !defined(OPENSSL_NO_ASM)
+#define OPENSSL_NO_ASM
+#endif
+
+#if !defined(OPENSSL_NO_ASM)
+#if defined(__arm__)
+#if defined(BORINGSSL_PREFIX)
+#include <boringssl_prefix_symbols_asm.h>
+#endif
+#include <openssl/arm_arch.h>
+
+@ Silence ARMv8 deprecated IT instruction warnings. This file is used by both
+@ ARMv7 and ARMv8 processors and does not use ARMv8 instructions. (ARMv8 PMULL
+@ instructions are in aesv8-armx.pl.)
+.arch	armv7-a
+
+.text
+#if defined(__thumb2__) || defined(__clang__)
+.syntax	unified
+#define ldrplb  ldrbpl
+#define ldrneb  ldrbne
+#endif
+#if defined(__thumb2__)
+.thumb
+#else
+.code	32
+#endif
+#if __ARM_MAX_ARCH__>=7
+.arch	armv7-a
+.fpu	neon
+
+.globl	gcm_init_neon
+.hidden	gcm_init_neon
+.type	gcm_init_neon,%function
+.align	4
+gcm_init_neon:
+	vld1.64	d7,[r1]!		@ load H
+	vmov.i8	q8,#0xe1
+	vld1.64	d6,[r1]
+	vshl.i64	d17,#57
+	vshr.u64	d16,#63		@ t0=0xc2....01
+	vdup.8	q9,d7[7]
+	vshr.u64	d26,d6,#63
+	vshr.s8	q9,#7			@ broadcast carry bit
+	vshl.i64	q3,q3,#1
+	vand	q8,q8,q9
+	vorr	d7,d26		@ H<<<=1
+	veor	q3,q3,q8		@ twisted H
+	vstmia	r0,{q3}
+
+	bx	lr					@ bx lr
+.size	gcm_init_neon,.-gcm_init_neon
+
+.globl	gcm_gmult_neon
+.hidden	gcm_gmult_neon
+.type	gcm_gmult_neon,%function
+.align	4
+gcm_gmult_neon:
+	vld1.64	d7,[r0]!		@ load Xi
+	vld1.64	d6,[r0]!
+	vmov.i64	d29,#0x0000ffffffffffff
+	vldmia	r1,{d26,d27}	@ load twisted H
+	vmov.i64	d30,#0x00000000ffffffff
+#ifdef __ARMEL__
+	vrev64.8	q3,q3
+#endif
+	vmov.i64	d31,#0x000000000000ffff
+	veor	d28,d26,d27		@ Karatsuba pre-processing
+	mov	r3,#16
+	b	.Lgmult_neon
+.size	gcm_gmult_neon,.-gcm_gmult_neon
+
+.globl	gcm_ghash_neon
+.hidden	gcm_ghash_neon
+.type	gcm_ghash_neon,%function
+.align	4
+gcm_ghash_neon:
+	vld1.64	d1,[r0]!		@ load Xi
+	vld1.64	d0,[r0]!
+	vmov.i64	d29,#0x0000ffffffffffff
+	vldmia	r1,{d26,d27}	@ load twisted H
+	vmov.i64	d30,#0x00000000ffffffff
+#ifdef __ARMEL__
+	vrev64.8	q0,q0
+#endif
+	vmov.i64	d31,#0x000000000000ffff
+	veor	d28,d26,d27		@ Karatsuba pre-processing
+
+.Loop_neon:
+	vld1.64	d7,[r2]!		@ load inp
+	vld1.64	d6,[r2]!
+#ifdef __ARMEL__
+	vrev64.8	q3,q3
+#endif
+	veor	q3,q0			@ inp^=Xi
+.Lgmult_neon:
+	vext.8	d16, d26, d26, #1	@ A1
+	vmull.p8	q8, d16, d6		@ F = A1*B
+	vext.8	d0, d6, d6, #1	@ B1
+	vmull.p8	q0, d26, d0		@ E = A*B1
+	vext.8	d18, d26, d26, #2	@ A2
+	vmull.p8	q9, d18, d6		@ H = A2*B
+	vext.8	d22, d6, d6, #2	@ B2
+	vmull.p8	q11, d26, d22		@ G = A*B2
+	vext.8	d20, d26, d26, #3	@ A3
+	veor	q8, q8, q0		@ L = E + F
+	vmull.p8	q10, d20, d6		@ J = A3*B
+	vext.8	d0, d6, d6, #3	@ B3
+	veor	q9, q9, q11		@ M = G + H
+	vmull.p8	q0, d26, d0		@ I = A*B3
+	veor	d16, d16, d17	@ t0 = (L) (P0 + P1) << 8
+	vand	d17, d17, d29
+	vext.8	d22, d6, d6, #4	@ B4
+	veor	d18, d18, d19	@ t1 = (M) (P2 + P3) << 16
+	vand	d19, d19, d30
+	vmull.p8	q11, d26, d22		@ K = A*B4
+	veor	q10, q10, q0		@ N = I + J
+	veor	d16, d16, d17
+	veor	d18, d18, d19
+	veor	d20, d20, d21	@ t2 = (N) (P4 + P5) << 24
+	vand	d21, d21, d31
+	vext.8	q8, q8, q8, #15
+	veor	d22, d22, d23	@ t3 = (K) (P6 + P7) << 32
+	vmov.i64	d23, #0
+	vext.8	q9, q9, q9, #14
+	veor	d20, d20, d21
+	vmull.p8	q0, d26, d6		@ D = A*B
+	vext.8	q11, q11, q11, #12
+	vext.8	q10, q10, q10, #13
+	veor	q8, q8, q9
+	veor	q10, q10, q11
+	veor	q0, q0, q8
+	veor	q0, q0, q10
+	veor	d6,d6,d7	@ Karatsuba pre-processing
+	vext.8	d16, d28, d28, #1	@ A1
+	vmull.p8	q8, d16, d6		@ F = A1*B
+	vext.8	d2, d6, d6, #1	@ B1
+	vmull.p8	q1, d28, d2		@ E = A*B1
+	vext.8	d18, d28, d28, #2	@ A2
+	vmull.p8	q9, d18, d6		@ H = A2*B
+	vext.8	d22, d6, d6, #2	@ B2
+	vmull.p8	q11, d28, d22		@ G = A*B2
+	vext.8	d20, d28, d28, #3	@ A3
+	veor	q8, q8, q1		@ L = E + F
+	vmull.p8	q10, d20, d6		@ J = A3*B
+	vext.8	d2, d6, d6, #3	@ B3
+	veor	q9, q9, q11		@ M = G + H
+	vmull.p8	q1, d28, d2		@ I = A*B3
+	veor	d16, d16, d17	@ t0 = (L) (P0 + P1) << 8
+	vand	d17, d17, d29
+	vext.8	d22, d6, d6, #4	@ B4
+	veor	d18, d18, d19	@ t1 = (M) (P2 + P3) << 16
+	vand	d19, d19, d30
+	vmull.p8	q11, d28, d22		@ K = A*B4
+	veor	q10, q10, q1		@ N = I + J
+	veor	d16, d16, d17
+	veor	d18, d18, d19
+	veor	d20, d20, d21	@ t2 = (N) (P4 + P5) << 24
+	vand	d21, d21, d31
+	vext.8	q8, q8, q8, #15
+	veor	d22, d22, d23	@ t3 = (K) (P6 + P7) << 32
+	vmov.i64	d23, #0
+	vext.8	q9, q9, q9, #14
+	veor	d20, d20, d21
+	vmull.p8	q1, d28, d6		@ D = A*B
+	vext.8	q11, q11, q11, #12
+	vext.8	q10, q10, q10, #13
+	veor	q8, q8, q9
+	veor	q10, q10, q11
+	veor	q1, q1, q8
+	veor	q1, q1, q10
+	vext.8	d16, d27, d27, #1	@ A1
+	vmull.p8	q8, d16, d7		@ F = A1*B
+	vext.8	d4, d7, d7, #1	@ B1
+	vmull.p8	q2, d27, d4		@ E = A*B1
+	vext.8	d18, d27, d27, #2	@ A2
+	vmull.p8	q9, d18, d7		@ H = A2*B
+	vext.8	d22, d7, d7, #2	@ B2
+	vmull.p8	q11, d27, d22		@ G = A*B2
+	vext.8	d20, d27, d27, #3	@ A3
+	veor	q8, q8, q2		@ L = E + F
+	vmull.p8	q10, d20, d7		@ J = A3*B
+	vext.8	d4, d7, d7, #3	@ B3
+	veor	q9, q9, q11		@ M = G + H
+	vmull.p8	q2, d27, d4		@ I = A*B3
+	veor	d16, d16, d17	@ t0 = (L) (P0 + P1) << 8
+	vand	d17, d17, d29
+	vext.8	d22, d7, d7, #4	@ B4
+	veor	d18, d18, d19	@ t1 = (M) (P2 + P3) << 16
+	vand	d19, d19, d30
+	vmull.p8	q11, d27, d22		@ K = A*B4
+	veor	q10, q10, q2		@ N = I + J
+	veor	d16, d16, d17
+	veor	d18, d18, d19
+	veor	d20, d20, d21	@ t2 = (N) (P4 + P5) << 24
+	vand	d21, d21, d31
+	vext.8	q8, q8, q8, #15
+	veor	d22, d22, d23	@ t3 = (K) (P6 + P7) << 32
+	vmov.i64	d23, #0
+	vext.8	q9, q9, q9, #14
+	veor	d20, d20, d21
+	vmull.p8	q2, d27, d7		@ D = A*B
+	vext.8	q11, q11, q11, #12
+	vext.8	q10, q10, q10, #13
+	veor	q8, q8, q9
+	veor	q10, q10, q11
+	veor	q2, q2, q8
+	veor	q2, q2, q10
+	veor	q1,q1,q0		@ Karatsuba post-processing
+	veor	q1,q1,q2
+	veor	d1,d1,d2
+	veor	d4,d4,d3	@ Xh|Xl - 256-bit result
+
+	@ equivalent of reduction_avx from ghash-x86_64.pl
+	vshl.i64	q9,q0,#57		@ 1st phase
+	vshl.i64	q10,q0,#62
+	veor	q10,q10,q9		@
+	vshl.i64	q9,q0,#63
+	veor	q10, q10, q9		@
+	veor	d1,d1,d20	@
+	veor	d4,d4,d21
+
+	vshr.u64	q10,q0,#1		@ 2nd phase
+	veor	q2,q2,q0
+	veor	q0,q0,q10		@
+	vshr.u64	q10,q10,#6
+	vshr.u64	q0,q0,#1		@
+	veor	q0,q0,q2		@
+	veor	q0,q0,q10		@
+
+	subs	r3,#16
+	bne	.Loop_neon
+
+#ifdef __ARMEL__
+	vrev64.8	q0,q0
+#endif
+	sub	r0,#16
+	vst1.64	d1,[r0]!		@ write out Xi
+	vst1.64	d0,[r0]
+
+	bx	lr					@ bx lr
+.size	gcm_ghash_neon,.-gcm_ghash_neon
+#endif
+.byte	71,72,65,83,72,32,102,111,114,32,65,82,77,118,52,47,78,69,79,78,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0
+.align	2
+.align	2
+#endif
+#endif  // !OPENSSL_NO_ASM
+.section	.note.GNU-stack,"",%progbits
--- /dev/null
+++ b/third_party/boringssl/linux-arm/crypto/fipsmodule/ghashv8-armx32.S
@@ -1,0 +1,257 @@
+// This file is generated from a similarly-named Perl script in the BoringSSL
+// source tree. Do not edit by hand.
+
+#if !defined(__has_feature)
+#define __has_feature(x) 0
+#endif
+#if __has_feature(memory_sanitizer) && !defined(OPENSSL_NO_ASM)
+#define OPENSSL_NO_ASM
+#endif
+
+#if !defined(OPENSSL_NO_ASM)
+#if defined(__arm__)
+#if defined(BORINGSSL_PREFIX)
+#include <boringssl_prefix_symbols_asm.h>
+#endif
+#include <openssl/arm_arch.h>
+
+#if __ARM_MAX_ARCH__>=7
+.text
+.fpu	neon
+.code	32
+#undef	__thumb2__
+.globl	gcm_init_v8
+.hidden	gcm_init_v8
+.type	gcm_init_v8,%function
+.align	4
+gcm_init_v8:
+	AARCH64_VALID_CALL_TARGET
+	vld1.64	{q9},[r1]		@ load input H
+	vmov.i8	q11,#0xe1
+	vshl.i64	q11,q11,#57		@ 0xc2.0
+	vext.8	q3,q9,q9,#8
+	vshr.u64	q10,q11,#63
+	vdup.32	q9,d18[1]
+	vext.8	q8,q10,q11,#8		@ t0=0xc2....01
+	vshr.u64	q10,q3,#63
+	vshr.s32	q9,q9,#31		@ broadcast carry bit
+	vand	q10,q10,q8
+	vshl.i64	q3,q3,#1
+	vext.8	q10,q10,q10,#8
+	vand	q8,q8,q9
+	vorr	q3,q3,q10		@ H<<<=1
+	veor	q12,q3,q8		@ twisted H
+	vst1.64	{q12},[r0]!		@ store Htable[0]
+
+	@ calculate H^2
+	vext.8	q8,q12,q12,#8		@ Karatsuba pre-processing
+.byte	0xa8,0x0e,0xa8,0xf2	@ pmull q0,q12,q12
+	veor	q8,q8,q12
+.byte	0xa9,0x4e,0xa9,0xf2	@ pmull2 q2,q12,q12
+.byte	0xa0,0x2e,0xa0,0xf2	@ pmull q1,q8,q8
+
+	vext.8	q9,q0,q2,#8		@ Karatsuba post-processing
+	veor	q10,q0,q2
+	veor	q1,q1,q9
+	veor	q1,q1,q10
+.byte	0x26,0x4e,0xe0,0xf2	@ pmull q10,q0,q11		@ 1st phase
+
+	vmov	d4,d3		@ Xh|Xm - 256-bit result
+	vmov	d3,d0		@ Xm is rotated Xl
+	veor	q0,q1,q10
+
+	vext.8	q10,q0,q0,#8		@ 2nd phase
+.byte	0x26,0x0e,0xa0,0xf2	@ pmull q0,q0,q11
+	veor	q10,q10,q2
+	veor	q14,q0,q10
+
+	vext.8	q9,q14,q14,#8		@ Karatsuba pre-processing
+	veor	q9,q9,q14
+	vext.8	q13,q8,q9,#8		@ pack Karatsuba pre-processed
+	vst1.64	{q13,q14},[r0]!	@ store Htable[1..2]
+	bx	lr
+.size	gcm_init_v8,.-gcm_init_v8
+.globl	gcm_gmult_v8
+.hidden	gcm_gmult_v8
+.type	gcm_gmult_v8,%function
+.align	4
+gcm_gmult_v8:
+	AARCH64_VALID_CALL_TARGET
+	vld1.64	{q9},[r0]		@ load Xi
+	vmov.i8	q11,#0xe1
+	vld1.64	{q12,q13},[r1]	@ load twisted H, ...
+	vshl.u64	q11,q11,#57
+#ifndef __ARMEB__
+	vrev64.8	q9,q9
+#endif
+	vext.8	q3,q9,q9,#8
+
+.byte	0x86,0x0e,0xa8,0xf2	@ pmull q0,q12,q3		@ H.lo·Xi.lo
+	veor	q9,q9,q3		@ Karatsuba pre-processing
+.byte	0x87,0x4e,0xa9,0xf2	@ pmull2 q2,q12,q3		@ H.hi·Xi.hi
+.byte	0xa2,0x2e,0xaa,0xf2	@ pmull q1,q13,q9		@ (H.lo+H.hi)·(Xi.lo+Xi.hi)
+
+	vext.8	q9,q0,q2,#8		@ Karatsuba post-processing
+	veor	q10,q0,q2
+	veor	q1,q1,q9
+	veor	q1,q1,q10
+.byte	0x26,0x4e,0xe0,0xf2	@ pmull q10,q0,q11		@ 1st phase of reduction
+
+	vmov	d4,d3		@ Xh|Xm - 256-bit result
+	vmov	d3,d0		@ Xm is rotated Xl
+	veor	q0,q1,q10
+
+	vext.8	q10,q0,q0,#8		@ 2nd phase of reduction
+.byte	0x26,0x0e,0xa0,0xf2	@ pmull q0,q0,q11
+	veor	q10,q10,q2
+	veor	q0,q0,q10
+
+#ifndef __ARMEB__
+	vrev64.8	q0,q0
+#endif
+	vext.8	q0,q0,q0,#8
+	vst1.64	{q0},[r0]		@ write out Xi
+
+	bx	lr
+.size	gcm_gmult_v8,.-gcm_gmult_v8
+.globl	gcm_ghash_v8
+.hidden	gcm_ghash_v8
+.type	gcm_ghash_v8,%function
+.align	4
+gcm_ghash_v8:
+	AARCH64_VALID_CALL_TARGET
+	vstmdb	sp!,{d8,d9,d10,d11,d12,d13,d14,d15}		@ 32-bit ABI says so
+	vld1.64	{q0},[r0]		@ load [rotated] Xi
+						@ "[rotated]" means that
+						@ loaded value would have
+						@ to be rotated in order to
+						@ make it appear as in
+						@ algorithm specification
+	subs	r3,r3,#32		@ see if r3 is 32 or larger
+	mov	r12,#16		@ r12 is used as post-
+						@ increment for input pointer;
+						@ as loop is modulo-scheduled
+						@ r12 is zeroed just in time
+						@ to preclude overstepping
+						@ inp[len], which means that
+						@ last block[s] are actually
+						@ loaded twice, but last
+						@ copy is not processed
+	vld1.64	{q12,q13},[r1]!	@ load twisted H, ..., H^2
+	vmov.i8	q11,#0xe1
+	vld1.64	{q14},[r1]
+	moveq	r12,#0			@ is it time to zero r12?
+	vext.8	q0,q0,q0,#8		@ rotate Xi
+	vld1.64	{q8},[r2]!	@ load [rotated] I[0]
+	vshl.u64	q11,q11,#57		@ compose 0xc2.0 constant
+#ifndef __ARMEB__
+	vrev64.8	q8,q8
+	vrev64.8	q0,q0
+#endif
+	vext.8	q3,q8,q8,#8		@ rotate I[0]
+	blo	.Lodd_tail_v8		@ r3 was less than 32
+	vld1.64	{q9},[r2],r12	@ load [rotated] I[1]
+#ifndef __ARMEB__
+	vrev64.8	q9,q9
+#endif
+	vext.8	q7,q9,q9,#8
+	veor	q3,q3,q0		@ I[i]^=Xi
+.byte	0x8e,0x8e,0xa8,0xf2	@ pmull q4,q12,q7		@ H·Ii+1
+	veor	q9,q9,q7		@ Karatsuba pre-processing
+.byte	0x8f,0xce,0xa9,0xf2	@ pmull2 q6,q12,q7
+	b	.Loop_mod2x_v8
+
+.align	4
+.Loop_mod2x_v8:
+	vext.8	q10,q3,q3,#8
+	subs	r3,r3,#32		@ is there more data?
+.byte	0x86,0x0e,0xac,0xf2	@ pmull q0,q14,q3		@ H^2.lo·Xi.lo
+	movlo	r12,#0			@ is it time to zero r12?
+
+.byte	0xa2,0xae,0xaa,0xf2	@ pmull q5,q13,q9
+	veor	q10,q10,q3		@ Karatsuba pre-processing
+.byte	0x87,0x4e,0xad,0xf2	@ pmull2 q2,q14,q3		@ H^2.hi·Xi.hi
+	veor	q0,q0,q4		@ accumulate
+.byte	0xa5,0x2e,0xab,0xf2	@ pmull2 q1,q13,q10		@ (H^2.lo+H^2.hi)·(Xi.lo+Xi.hi)
+	vld1.64	{q8},[r2],r12	@ load [rotated] I[i+2]
+
+	veor	q2,q2,q6
+	moveq	r12,#0			@ is it time to zero r12?
+	veor	q1,q1,q5
+
+	vext.8	q9,q0,q2,#8		@ Karatsuba post-processing
+	veor	q10,q0,q2
+	veor	q1,q1,q9
+	vld1.64	{q9},[r2],r12	@ load [rotated] I[i+3]
+#ifndef __ARMEB__
+	vrev64.8	q8,q8
+#endif
+	veor	q1,q1,q10
+.byte	0x26,0x4e,0xe0,0xf2	@ pmull q10,q0,q11		@ 1st phase of reduction
+
+#ifndef __ARMEB__
+	vrev64.8	q9,q9
+#endif
+	vmov	d4,d3		@ Xh|Xm - 256-bit result
+	vmov	d3,d0		@ Xm is rotated Xl
+	vext.8	q7,q9,q9,#8
+	vext.8	q3,q8,q8,#8
+	veor	q0,q1,q10
+.byte	0x8e,0x8e,0xa8,0xf2	@ pmull q4,q12,q7		@ H·Ii+1
+	veor	q3,q3,q2		@ accumulate q3 early
+
+	vext.8	q10,q0,q0,#8		@ 2nd phase of reduction
+.byte	0x26,0x0e,0xa0,0xf2	@ pmull q0,q0,q11
+	veor	q3,q3,q10
+	veor	q9,q9,q7		@ Karatsuba pre-processing
+	veor	q3,q3,q0
+.byte	0x8f,0xce,0xa9,0xf2	@ pmull2 q6,q12,q7
+	bhs	.Loop_mod2x_v8		@ there was at least 32 more bytes
+
+	veor	q2,q2,q10
+	vext.8	q3,q8,q8,#8		@ re-construct q3
+	adds	r3,r3,#32		@ re-construct r3
+	veor	q0,q0,q2		@ re-construct q0
+	beq	.Ldone_v8		@ is r3 zero?
+.Lodd_tail_v8:
+	vext.8	q10,q0,q0,#8
+	veor	q3,q3,q0		@ inp^=Xi
+	veor	q9,q8,q10		@ q9 is rotated inp^Xi
+
+.byte	0x86,0x0e,0xa8,0xf2	@ pmull q0,q12,q3		@ H.lo·Xi.lo
+	veor	q9,q9,q3		@ Karatsuba pre-processing
+.byte	0x87,0x4e,0xa9,0xf2	@ pmull2 q2,q12,q3		@ H.hi·Xi.hi
+.byte	0xa2,0x2e,0xaa,0xf2	@ pmull q1,q13,q9		@ (H.lo+H.hi)·(Xi.lo+Xi.hi)
+
+	vext.8	q9,q0,q2,#8		@ Karatsuba post-processing
+	veor	q10,q0,q2
+	veor	q1,q1,q9
+	veor	q1,q1,q10
+.byte	0x26,0x4e,0xe0,0xf2	@ pmull q10,q0,q11		@ 1st phase of reduction
+
+	vmov	d4,d3		@ Xh|Xm - 256-bit result
+	vmov	d3,d0		@ Xm is rotated Xl
+	veor	q0,q1,q10
+
+	vext.8	q10,q0,q0,#8		@ 2nd phase of reduction
+.byte	0x26,0x0e,0xa0,0xf2	@ pmull q0,q0,q11
+	veor	q10,q10,q2
+	veor	q0,q0,q10
+
+.Ldone_v8:
+#ifndef __ARMEB__
+	vrev64.8	q0,q0
+#endif
+	vext.8	q0,q0,q0,#8
+	vst1.64	{q0},[r0]		@ write out Xi
+
+	vldmia	sp!,{d8,d9,d10,d11,d12,d13,d14,d15}		@ 32-bit ABI says so
+	bx	lr
+.size	gcm_ghash_v8,.-gcm_ghash_v8
+.byte	71,72,65,83,72,32,102,111,114,32,65,82,77,118,56,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0
+.align	2
+.align	2
+#endif
+#endif
+#endif  // !OPENSSL_NO_ASM
+.section	.note.GNU-stack,"",%progbits
--- /dev/null
+++ b/third_party/boringssl/linux-arm/crypto/fipsmodule/sha1-armv4-large.S
@@ -1,0 +1,1511 @@
+// This file is generated from a similarly-named Perl script in the BoringSSL
+// source tree. Do not edit by hand.
+
+#if !defined(__has_feature)
+#define __has_feature(x) 0
+#endif
+#if __has_feature(memory_sanitizer) && !defined(OPENSSL_NO_ASM)
+#define OPENSSL_NO_ASM
+#endif
+
+#if !defined(OPENSSL_NO_ASM)
+#if defined(__arm__)
+#if defined(BORINGSSL_PREFIX)
+#include <boringssl_prefix_symbols_asm.h>
+#endif
+#include <openssl/arm_arch.h>
+
+.text
+#if defined(__thumb2__)
+.syntax	unified
+.thumb
+#else
+.code	32
+#endif
+
+.globl	sha1_block_data_order
+.hidden	sha1_block_data_order
+.type	sha1_block_data_order,%function
+
+.align	5
+sha1_block_data_order:
+#if __ARM_MAX_ARCH__>=7
+.Lsha1_block:
+	adr	r3,.Lsha1_block
+	ldr	r12,.LOPENSSL_armcap
+	ldr	r12,[r3,r12]		@ OPENSSL_armcap_P
+#ifdef	__APPLE__
+	ldr	r12,[r12]
+#endif
+	tst	r12,#ARMV8_SHA1
+	bne	.LARMv8
+	tst	r12,#ARMV7_NEON
+	bne	.LNEON
+#endif
+	stmdb	sp!,{r4,r5,r6,r7,r8,r9,r10,r11,r12,lr}
+	add	r2,r1,r2,lsl#6	@ r2 to point at the end of r1
+	ldmia	r0,{r3,r4,r5,r6,r7}
+.Lloop:
+	ldr	r8,.LK_00_19
+	mov	r14,sp
+	sub	sp,sp,#15*4
+	mov	r5,r5,ror#30
+	mov	r6,r6,ror#30
+	mov	r7,r7,ror#30		@ [6]
+.L_00_15:
+#if __ARM_ARCH__<7
+	ldrb	r10,[r1,#2]
+	ldrb	r9,[r1,#3]
+	ldrb	r11,[r1,#1]
+	add	r7,r8,r7,ror#2			@ E+=K_00_19
+	ldrb	r12,[r1],#4
+	orr	r9,r9,r10,lsl#8
+	eor	r10,r5,r6			@ F_xx_xx
+	orr	r9,r9,r11,lsl#16
+	add	r7,r7,r3,ror#27			@ E+=ROR(A,27)
+	orr	r9,r9,r12,lsl#24
+#else
+	ldr	r9,[r1],#4			@ handles unaligned
+	add	r7,r8,r7,ror#2			@ E+=K_00_19
+	eor	r10,r5,r6			@ F_xx_xx
+	add	r7,r7,r3,ror#27			@ E+=ROR(A,27)
+#ifdef __ARMEL__
+	rev	r9,r9				@ byte swap
+#endif
+#endif
+	and	r10,r4,r10,ror#2
+	add	r7,r7,r9			@ E+=X[i]
+	eor	r10,r10,r6,ror#2		@ F_00_19(B,C,D)
+	str	r9,[r14,#-4]!
+	add	r7,r7,r10			@ E+=F_00_19(B,C,D)
+#if __ARM_ARCH__<7
+	ldrb	r10,[r1,#2]
+	ldrb	r9,[r1,#3]
+	ldrb	r11,[r1,#1]
+	add	r6,r8,r6,ror#2			@ E+=K_00_19
+	ldrb	r12,[r1],#4
+	orr	r9,r9,r10,lsl#8
+	eor	r10,r4,r5			@ F_xx_xx
+	orr	r9,r9,r11,lsl#16
+	add	r6,r6,r7,ror#27			@ E+=ROR(A,27)
+	orr	r9,r9,r12,lsl#24
+#else
+	ldr	r9,[r1],#4			@ handles unaligned
+	add	r6,r8,r6,ror#2			@ E+=K_00_19
+	eor	r10,r4,r5			@ F_xx_xx
+	add	r6,r6,r7,ror#27			@ E+=ROR(A,27)
+#ifdef __ARMEL__
+	rev	r9,r9				@ byte swap
+#endif
+#endif
+	and	r10,r3,r10,ror#2
+	add	r6,r6,r9			@ E+=X[i]
+	eor	r10,r10,r5,ror#2		@ F_00_19(B,C,D)
+	str	r9,[r14,#-4]!
+	add	r6,r6,r10			@ E+=F_00_19(B,C,D)
+#if __ARM_ARCH__<7
+	ldrb	r10,[r1,#2]
+	ldrb	r9,[r1,#3]
+	ldrb	r11,[r1,#1]
+	add	r5,r8,r5,ror#2			@ E+=K_00_19
+	ldrb	r12,[r1],#4
+	orr	r9,r9,r10,lsl#8
+	eor	r10,r3,r4			@ F_xx_xx
+	orr	r9,r9,r11,lsl#16
+	add	r5,r5,r6,ror#27			@ E+=ROR(A,27)
+	orr	r9,r9,r12,lsl#24
+#else
+	ldr	r9,[r1],#4			@ handles unaligned
+	add	r5,r8,r5,ror#2			@ E+=K_00_19
+	eor	r10,r3,r4			@ F_xx_xx
+	add	r5,r5,r6,ror#27			@ E+=ROR(A,27)
+#ifdef __ARMEL__
+	rev	r9,r9				@ byte swap
+#endif
+#endif
+	and	r10,r7,r10,ror#2
+	add	r5,r5,r9			@ E+=X[i]
+	eor	r10,r10,r4,ror#2		@ F_00_19(B,C,D)
+	str	r9,[r14,#-4]!
+	add	r5,r5,r10			@ E+=F_00_19(B,C,D)
+#if __ARM_ARCH__<7
+	ldrb	r10,[r1,#2]
+	ldrb	r9,[r1,#3]
+	ldrb	r11,[r1,#1]
+	add	r4,r8,r4,ror#2			@ E+=K_00_19
+	ldrb	r12,[r1],#4
+	orr	r9,r9,r10,lsl#8
+	eor	r10,r7,r3			@ F_xx_xx
+	orr	r9,r9,r11,lsl#16
+	add	r4,r4,r5,ror#27			@ E+=ROR(A,27)
+	orr	r9,r9,r12,lsl#24
+#else
+	ldr	r9,[r1],#4			@ handles unaligned
+	add	r4,r8,r4,ror#2			@ E+=K_00_19
+	eor	r10,r7,r3			@ F_xx_xx
+	add	r4,r4,r5,ror#27			@ E+=ROR(A,27)
+#ifdef __ARMEL__
+	rev	r9,r9				@ byte swap
+#endif
+#endif
+	and	r10,r6,r10,ror#2
+	add	r4,r4,r9			@ E+=X[i]
+	eor	r10,r10,r3,ror#2		@ F_00_19(B,C,D)
+	str	r9,[r14,#-4]!
+	add	r4,r4,r10			@ E+=F_00_19(B,C,D)
+#if __ARM_ARCH__<7
+	ldrb	r10,[r1,#2]
+	ldrb	r9,[r1,#3]
+	ldrb	r11,[r1,#1]
+	add	r3,r8,r3,ror#2			@ E+=K_00_19
+	ldrb	r12,[r1],#4
+	orr	r9,r9,r10,lsl#8
+	eor	r10,r6,r7			@ F_xx_xx
+	orr	r9,r9,r11,lsl#16
+	add	r3,r3,r4,ror#27			@ E+=ROR(A,27)
+	orr	r9,r9,r12,lsl#24
+#else
+	ldr	r9,[r1],#4			@ handles unaligned
+	add	r3,r8,r3,ror#2			@ E+=K_00_19
+	eor	r10,r6,r7			@ F_xx_xx
+	add	r3,r3,r4,ror#27			@ E+=ROR(A,27)
+#ifdef __ARMEL__
+	rev	r9,r9				@ byte swap
+#endif
+#endif
+	and	r10,r5,r10,ror#2
+	add	r3,r3,r9			@ E+=X[i]
+	eor	r10,r10,r7,ror#2		@ F_00_19(B,C,D)
+	str	r9,[r14,#-4]!
+	add	r3,r3,r10			@ E+=F_00_19(B,C,D)
+#if defined(__thumb2__)
+	mov	r12,sp
+	teq	r14,r12
+#else
+	teq	r14,sp
+#endif
+	bne	.L_00_15		@ [((11+4)*5+2)*3]
+	sub	sp,sp,#25*4
+#if __ARM_ARCH__<7
+	ldrb	r10,[r1,#2]
+	ldrb	r9,[r1,#3]
+	ldrb	r11,[r1,#1]
+	add	r7,r8,r7,ror#2			@ E+=K_00_19
+	ldrb	r12,[r1],#4
+	orr	r9,r9,r10,lsl#8
+	eor	r10,r5,r6			@ F_xx_xx
+	orr	r9,r9,r11,lsl#16
+	add	r7,r7,r3,ror#27			@ E+=ROR(A,27)
+	orr	r9,r9,r12,lsl#24
+#else
+	ldr	r9,[r1],#4			@ handles unaligned
+	add	r7,r8,r7,ror#2			@ E+=K_00_19
+	eor	r10,r5,r6			@ F_xx_xx
+	add	r7,r7,r3,ror#27			@ E+=ROR(A,27)
+#ifdef __ARMEL__
+	rev	r9,r9				@ byte swap
+#endif
+#endif
+	and	r10,r4,r10,ror#2
+	add	r7,r7,r9			@ E+=X[i]
+	eor	r10,r10,r6,ror#2		@ F_00_19(B,C,D)
+	str	r9,[r14,#-4]!
+	add	r7,r7,r10			@ E+=F_00_19(B,C,D)
+	ldr	r9,[r14,#15*4]
+	ldr	r10,[r14,#13*4]
+	ldr	r11,[r14,#7*4]
+	add	r6,r8,r6,ror#2			@ E+=K_xx_xx
+	ldr	r12,[r14,#2*4]
+	eor	r9,r9,r10
+	eor	r11,r11,r12			@ 1 cycle stall
+	eor	r10,r4,r5			@ F_xx_xx
+	mov	r9,r9,ror#31
+	add	r6,r6,r7,ror#27			@ E+=ROR(A,27)
+	eor	r9,r9,r11,ror#31
+	str	r9,[r14,#-4]!
+	and	r10,r3,r10,ror#2					@ F_xx_xx
+						@ F_xx_xx
+	add	r6,r6,r9			@ E+=X[i]
+	eor	r10,r10,r5,ror#2		@ F_00_19(B,C,D)
+	add	r6,r6,r10			@ E+=F_00_19(B,C,D)
+	ldr	r9,[r14,#15*4]
+	ldr	r10,[r14,#13*4]
+	ldr	r11,[r14,#7*4]
+	add	r5,r8,r5,ror#2			@ E+=K_xx_xx
+	ldr	r12,[r14,#2*4]
+	eor	r9,r9,r10
+	eor	r11,r11,r12			@ 1 cycle stall
+	eor	r10,r3,r4			@ F_xx_xx
+	mov	r9,r9,ror#31
+	add	r5,r5,r6,ror#27			@ E+=ROR(A,27)
+	eor	r9,r9,r11,ror#31
+	str	r9,[r14,#-4]!
+	and	r10,r7,r10,ror#2					@ F_xx_xx
+						@ F_xx_xx
+	add	r5,r5,r9			@ E+=X[i]
+	eor	r10,r10,r4,ror#2		@ F_00_19(B,C,D)
+	add	r5,r5,r10			@ E+=F_00_19(B,C,D)
+	ldr	r9,[r14,#15*4]
+	ldr	r10,[r14,#13*4]
+	ldr	r11,[r14,#7*4]
+	add	r4,r8,r4,ror#2			@ E+=K_xx_xx
+	ldr	r12,[r14,#2*4]
+	eor	r9,r9,r10
+	eor	r11,r11,r12			@ 1 cycle stall
+	eor	r10,r7,r3			@ F_xx_xx
+	mov	r9,r9,ror#31
+	add	r4,r4,r5,ror#27			@ E+=ROR(A,27)
+	eor	r9,r9,r11,ror#31
+	str	r9,[r14,#-4]!
+	and	r10,r6,r10,ror#2					@ F_xx_xx
+						@ F_xx_xx
+	add	r4,r4,r9			@ E+=X[i]
+	eor	r10,r10,r3,ror#2		@ F_00_19(B,C,D)
+	add	r4,r4,r10			@ E+=F_00_19(B,C,D)
+	ldr	r9,[r14,#15*4]
+	ldr	r10,[r14,#13*4]
+	ldr	r11,[r14,#7*4]
+	add	r3,r8,r3,ror#2			@ E+=K_xx_xx
+	ldr	r12,[r14,#2*4]
+	eor	r9,r9,r10
+	eor	r11,r11,r12			@ 1 cycle stall
+	eor	r10,r6,r7			@ F_xx_xx
+	mov	r9,r9,ror#31
+	add	r3,r3,r4,ror#27			@ E+=ROR(A,27)
+	eor	r9,r9,r11,ror#31
+	str	r9,[r14,#-4]!
+	and	r10,r5,r10,ror#2					@ F_xx_xx
+						@ F_xx_xx
+	add	r3,r3,r9			@ E+=X[i]
+	eor	r10,r10,r7,ror#2		@ F_00_19(B,C,D)
+	add	r3,r3,r10			@ E+=F_00_19(B,C,D)
+
+	ldr	r8,.LK_20_39		@ [+15+16*4]
+	cmn	sp,#0			@ [+3], clear carry to denote 20_39
+.L_20_39_or_60_79:
+	ldr	r9,[r14,#15*4]
+	ldr	r10,[r14,#13*4]
+	ldr	r11,[r14,#7*4]
+	add	r7,r8,r7,ror#2			@ E+=K_xx_xx
+	ldr	r12,[r14,#2*4]
+	eor	r9,r9,r10
+	eor	r11,r11,r12			@ 1 cycle stall
+	eor	r10,r5,r6			@ F_xx_xx
+	mov	r9,r9,ror#31
+	add	r7,r7,r3,ror#27			@ E+=ROR(A,27)
+	eor	r9,r9,r11,ror#31
+	str	r9,[r14,#-4]!
+	eor	r10,r4,r10,ror#2					@ F_xx_xx
+						@ F_xx_xx
+	add	r7,r7,r9			@ E+=X[i]
+	add	r7,r7,r10			@ E+=F_20_39(B,C,D)
+	ldr	r9,[r14,#15*4]
+	ldr	r10,[r14,#13*4]
+	ldr	r11,[r14,#7*4]
+	add	r6,r8,r6,ror#2			@ E+=K_xx_xx
+	ldr	r12,[r14,#2*4]
+	eor	r9,r9,r10
+	eor	r11,r11,r12			@ 1 cycle stall
+	eor	r10,r4,r5			@ F_xx_xx
+	mov	r9,r9,ror#31
+	add	r6,r6,r7,ror#27			@ E+=ROR(A,27)
+	eor	r9,r9,r11,ror#31
+	str	r9,[r14,#-4]!
+	eor	r10,r3,r10,ror#2					@ F_xx_xx
+						@ F_xx_xx
+	add	r6,r6,r9			@ E+=X[i]
+	add	r6,r6,r10			@ E+=F_20_39(B,C,D)
+	ldr	r9,[r14,#15*4]
+	ldr	r10,[r14,#13*4]
+	ldr	r11,[r14,#7*4]
+	add	r5,r8,r5,ror#2			@ E+=K_xx_xx
+	ldr	r12,[r14,#2*4]
+	eor	r9,r9,r10
+	eor	r11,r11,r12			@ 1 cycle stall
+	eor	r10,r3,r4			@ F_xx_xx
+	mov	r9,r9,ror#31
+	add	r5,r5,r6,ror#27			@ E+=ROR(A,27)
+	eor	r9,r9,r11,ror#31
+	str	r9,[r14,#-4]!
+	eor	r10,r7,r10,ror#2					@ F_xx_xx
+						@ F_xx_xx
+	add	r5,r5,r9			@ E+=X[i]
+	add	r5,r5,r10			@ E+=F_20_39(B,C,D)
+	ldr	r9,[r14,#15*4]
+	ldr	r10,[r14,#13*4]
+	ldr	r11,[r14,#7*4]
+	add	r4,r8,r4,ror#2			@ E+=K_xx_xx
+	ldr	r12,[r14,#2*4]
+	eor	r9,r9,r10
+	eor	r11,r11,r12			@ 1 cycle stall
+	eor	r10,r7,r3			@ F_xx_xx
+	mov	r9,r9,ror#31
+	add	r4,r4,r5,ror#27			@ E+=ROR(A,27)
+	eor	r9,r9,r11,ror#31
+	str	r9,[r14,#-4]!
+	eor	r10,r6,r10,ror#2					@ F_xx_xx
+						@ F_xx_xx
+	add	r4,r4,r9			@ E+=X[i]
+	add	r4,r4,r10			@ E+=F_20_39(B,C,D)
+	ldr	r9,[r14,#15*4]
+	ldr	r10,[r14,#13*4]
+	ldr	r11,[r14,#7*4]
+	add	r3,r8,r3,ror#2			@ E+=K_xx_xx
+	ldr	r12,[r14,#2*4]
+	eor	r9,r9,r10
+	eor	r11,r11,r12			@ 1 cycle stall
+	eor	r10,r6,r7			@ F_xx_xx
+	mov	r9,r9,ror#31
+	add	r3,r3,r4,ror#27			@ E+=ROR(A,27)
+	eor	r9,r9,r11,ror#31
+	str	r9,[r14,#-4]!
+	eor	r10,r5,r10,ror#2					@ F_xx_xx
+						@ F_xx_xx
+	add	r3,r3,r9			@ E+=X[i]
+	add	r3,r3,r10			@ E+=F_20_39(B,C,D)
+#if defined(__thumb2__)
+	mov	r12,sp
+	teq	r14,r12
+#else
+	teq	r14,sp			@ preserve carry
+#endif
+	bne	.L_20_39_or_60_79	@ [+((12+3)*5+2)*4]
+	bcs	.L_done			@ [+((12+3)*5+2)*4], spare 300 bytes
+
+	ldr	r8,.LK_40_59
+	sub	sp,sp,#20*4		@ [+2]
+.L_40_59:
+	ldr	r9,[r14,#15*4]
+	ldr	r10,[r14,#13*4]
+	ldr	r11,[r14,#7*4]
+	add	r7,r8,r7,ror#2			@ E+=K_xx_xx
+	ldr	r12,[r14,#2*4]
+	eor	r9,r9,r10
+	eor	r11,r11,r12			@ 1 cycle stall
+	eor	r10,r5,r6			@ F_xx_xx
+	mov	r9,r9,ror#31
+	add	r7,r7,r3,ror#27			@ E+=ROR(A,27)
+	eor	r9,r9,r11,ror#31
+	str	r9,[r14,#-4]!
+	and	r10,r4,r10,ror#2					@ F_xx_xx
+	and	r11,r5,r6					@ F_xx_xx
+	add	r7,r7,r9			@ E+=X[i]
+	add	r7,r7,r10			@ E+=F_40_59(B,C,D)
+	add	r7,r7,r11,ror#2
+	ldr	r9,[r14,#15*4]
+	ldr	r10,[r14,#13*4]
+	ldr	r11,[r14,#7*4]
+	add	r6,r8,r6,ror#2			@ E+=K_xx_xx
+	ldr	r12,[r14,#2*4]
+	eor	r9,r9,r10
+	eor	r11,r11,r12			@ 1 cycle stall
+	eor	r10,r4,r5			@ F_xx_xx
+	mov	r9,r9,ror#31
+	add	r6,r6,r7,ror#27			@ E+=ROR(A,27)
+	eor	r9,r9,r11,ror#31
+	str	r9,[r14,#-4]!
+	and	r10,r3,r10,ror#2					@ F_xx_xx
+	and	r11,r4,r5					@ F_xx_xx
+	add	r6,r6,r9			@ E+=X[i]
+	add	r6,r6,r10			@ E+=F_40_59(B,C,D)
+	add	r6,r6,r11,ror#2
+	ldr	r9,[r14,#15*4]
+	ldr	r10,[r14,#13*4]
+	ldr	r11,[r14,#7*4]
+	add	r5,r8,r5,ror#2			@ E+=K_xx_xx
+	ldr	r12,[r14,#2*4]
+	eor	r9,r9,r10
+	eor	r11,r11,r12			@ 1 cycle stall
+	eor	r10,r3,r4			@ F_xx_xx
+	mov	r9,r9,ror#31
+	add	r5,r5,r6,ror#27			@ E+=ROR(A,27)
+	eor	r9,r9,r11,ror#31
+	str	r9,[r14,#-4]!
+	and	r10,r7,r10,ror#2					@ F_xx_xx
+	and	r11,r3,r4					@ F_xx_xx
+	add	r5,r5,r9			@ E+=X[i]
+	add	r5,r5,r10			@ E+=F_40_59(B,C,D)
+	add	r5,r5,r11,ror#2
+	ldr	r9,[r14,#15*4]
+	ldr	r10,[r14,#13*4]
+	ldr	r11,[r14,#7*4]
+	add	r4,r8,r4,ror#2			@ E+=K_xx_xx
+	ldr	r12,[r14,#2*4]
+	eor	r9,r9,r10
+	eor	r11,r11,r12			@ 1 cycle stall
+	eor	r10,r7,r3			@ F_xx_xx
+	mov	r9,r9,ror#31
+	add	r4,r4,r5,ror#27			@ E+=ROR(A,27)
+	eor	r9,r9,r11,ror#31
+	str	r9,[r14,#-4]!
+	and	r10,r6,r10,ror#2					@ F_xx_xx
+	and	r11,r7,r3					@ F_xx_xx
+	add	r4,r4,r9			@ E+=X[i]
+	add	r4,r4,r10			@ E+=F_40_59(B,C,D)
+	add	r4,r4,r11,ror#2
+	ldr	r9,[r14,#15*4]
+	ldr	r10,[r14,#13*4]
+	ldr	r11,[r14,#7*4]
+	add	r3,r8,r3,ror#2			@ E+=K_xx_xx
+	ldr	r12,[r14,#2*4]
+	eor	r9,r9,r10
+	eor	r11,r11,r12			@ 1 cycle stall
+	eor	r10,r6,r7			@ F_xx_xx
+	mov	r9,r9,ror#31
+	add	r3,r3,r4,ror#27			@ E+=ROR(A,27)
+	eor	r9,r9,r11,ror#31
+	str	r9,[r14,#-4]!
+	and	r10,r5,r10,ror#2					@ F_xx_xx
+	and	r11,r6,r7					@ F_xx_xx
+	add	r3,r3,r9			@ E+=X[i]
+	add	r3,r3,r10			@ E+=F_40_59(B,C,D)
+	add	r3,r3,r11,ror#2
+#if defined(__thumb2__)
+	mov	r12,sp
+	teq	r14,r12
+#else
+	teq	r14,sp
+#endif
+	bne	.L_40_59		@ [+((12+5)*5+2)*4]
+
+	ldr	r8,.LK_60_79
+	sub	sp,sp,#20*4
+	cmp	sp,#0			@ set carry to denote 60_79
+	b	.L_20_39_or_60_79	@ [+4], spare 300 bytes
+.L_done:
+	add	sp,sp,#80*4		@ "deallocate" stack frame
+	ldmia	r0,{r8,r9,r10,r11,r12}
+	add	r3,r8,r3
+	add	r4,r9,r4
+	add	r5,r10,r5,ror#2
+	add	r6,r11,r6,ror#2
+	add	r7,r12,r7,ror#2
+	stmia	r0,{r3,r4,r5,r6,r7}
+	teq	r1,r2
+	bne	.Lloop			@ [+18], total 1307
+
+#if __ARM_ARCH__>=5
+	ldmia	sp!,{r4,r5,r6,r7,r8,r9,r10,r11,r12,pc}
+#else
+	ldmia	sp!,{r4,r5,r6,r7,r8,r9,r10,r11,r12,lr}
+	tst	lr,#1
+	moveq	pc,lr			@ be binary compatible with V4, yet
+.word	0xe12fff1e			@ interoperable with Thumb ISA:-)
+#endif
+.size	sha1_block_data_order,.-sha1_block_data_order
+
+.align	5
+.LK_00_19:.word	0x5a827999
+.LK_20_39:.word	0x6ed9eba1
+.LK_40_59:.word	0x8f1bbcdc
+.LK_60_79:.word	0xca62c1d6
+#if __ARM_MAX_ARCH__>=7
+.LOPENSSL_armcap:
+.word	OPENSSL_armcap_P-.Lsha1_block
+#endif
+.byte	83,72,65,49,32,98,108,111,99,107,32,116,114,97,110,115,102,111,114,109,32,102,111,114,32,65,82,77,118,52,47,78,69,79,78,47,65,82,77,118,56,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0
+.align	2
+.align	5
+#if __ARM_MAX_ARCH__>=7
+.arch	armv7-a
+.fpu	neon
+
+.type	sha1_block_data_order_neon,%function
+.align	4
+sha1_block_data_order_neon:
+.LNEON:
+	stmdb	sp!,{r4,r5,r6,r7,r8,r9,r10,r11,r12,lr}
+	add	r2,r1,r2,lsl#6	@ r2 to point at the end of r1
+	@ dmb				@ errata #451034 on early Cortex A8
+	@ vstmdb	sp!,{d8-d15}	@ ABI specification says so
+	mov	r14,sp
+	sub	r12,sp,#64
+	adr	r8,.LK_00_19
+	bic	r12,r12,#15		@ align for 128-bit stores
+
+	ldmia	r0,{r3,r4,r5,r6,r7}	@ load context
+	mov	sp,r12		@ alloca
+
+	vld1.8	{q0,q1},[r1]!	@ handles unaligned
+	veor	q15,q15,q15
+	vld1.8	{q2,q3},[r1]!
+	vld1.32	{d28[],d29[]},[r8,:32]!	@ load K_00_19
+	vrev32.8	q0,q0		@ yes, even on
+	vrev32.8	q1,q1		@ big-endian...
+	vrev32.8	q2,q2
+	vadd.i32	q8,q0,q14
+	vrev32.8	q3,q3
+	vadd.i32	q9,q1,q14
+	vst1.32	{q8},[r12,:128]!
+	vadd.i32	q10,q2,q14
+	vst1.32	{q9},[r12,:128]!
+	vst1.32	{q10},[r12,:128]!
+	ldr	r9,[sp]			@ big RAW stall
+
+.Loop_neon:
+	vext.8	q8,q0,q1,#8
+	bic	r10,r6,r4
+	add	r7,r7,r9
+	and	r11,r5,r4
+	vadd.i32	q13,q3,q14
+	ldr	r9,[sp,#4]
+	add	r7,r7,r3,ror#27
+	vext.8	q12,q3,q15,#4
+	eor	r11,r11,r10
+	mov	r4,r4,ror#2
+	add	r7,r7,r11
+	veor	q8,q8,q0
+	bic	r10,r5,r3
+	add	r6,r6,r9
+	veor	q12,q12,q2
+	and	r11,r4,r3
+	ldr	r9,[sp,#8]
+	veor	q12,q12,q8
+	add	r6,r6,r7,ror#27
+	eor	r11,r11,r10
+	vst1.32	{q13},[r12,:128]!
+	sub	r12,r12,#64
+	mov	r3,r3,ror#2
+	add	r6,r6,r11
+	vext.8	q13,q15,q12,#4
+	bic	r10,r4,r7
+	add	r5,r5,r9
+	vadd.i32	q8,q12,q12
+	and	r11,r3,r7
+	ldr	r9,[sp,#12]
+	vsri.32	q8,q12,#31
+	add	r5,r5,r6,ror#27
+	eor	r11,r11,r10
+	mov	r7,r7,ror#2
+	vshr.u32	q12,q13,#30
+	add	r5,r5,r11
+	bic	r10,r3,r6
+	vshl.u32	q13,q13,#2
+	add	r4,r4,r9
+	and	r11,r7,r6
+	veor	q8,q8,q12
+	ldr	r9,[sp,#16]
+	add	r4,r4,r5,ror#27
+	veor	q8,q8,q13
+	eor	r11,r11,r10
+	mov	r6,r6,ror#2
+	add	r4,r4,r11
+	vext.8	q9,q1,q2,#8
+	bic	r10,r7,r5
+	add	r3,r3,r9
+	and	r11,r6,r5
+	vadd.i32	q13,q8,q14
+	ldr	r9,[sp,#20]
+	vld1.32	{d28[],d29[]},[r8,:32]!
+	add	r3,r3,r4,ror#27
+	vext.8	q12,q8,q15,#4
+	eor	r11,r11,r10
+	mov	r5,r5,ror#2
+	add	r3,r3,r11
+	veor	q9,q9,q1
+	bic	r10,r6,r4
+	add	r7,r7,r9
+	veor	q12,q12,q3
+	and	r11,r5,r4
+	ldr	r9,[sp,#24]
+	veor	q12,q12,q9
+	add	r7,r7,r3,ror#27
+	eor	r11,r11,r10
+	vst1.32	{q13},[r12,:128]!
+	mov	r4,r4,ror#2
+	add	r7,r7,r11
+	vext.8	q13,q15,q12,#4
+	bic	r10,r5,r3
+	add	r6,r6,r9
+	vadd.i32	q9,q12,q12
+	and	r11,r4,r3
+	ldr	r9,[sp,#28]
+	vsri.32	q9,q12,#31
+	add	r6,r6,r7,ror#27
+	eor	r11,r11,r10
+	mov	r3,r3,ror#2
+	vshr.u32	q12,q13,#30
+	add	r6,r6,r11
+	bic	r10,r4,r7
+	vshl.u32	q13,q13,#2
+	add	r5,r5,r9
+	and	r11,r3,r7
+	veor	q9,q9,q12
+	ldr	r9,[sp,#32]
+	add	r5,r5,r6,ror#27
+	veor	q9,q9,q13
+	eor	r11,r11,r10
+	mov	r7,r7,ror#2
+	add	r5,r5,r11
+	vext.8	q10,q2,q3,#8
+	bic	r10,r3,r6
+	add	r4,r4,r9
+	and	r11,r7,r6
+	vadd.i32	q13,q9,q14
+	ldr	r9,[sp,#36]
+	add	r4,r4,r5,ror#27
+	vext.8	q12,q9,q15,#4
+	eor	r11,r11,r10
+	mov	r6,r6,ror#2
+	add	r4,r4,r11
+	veor	q10,q10,q2
+	bic	r10,r7,r5
+	add	r3,r3,r9
+	veor	q12,q12,q8
+	and	r11,r6,r5
+	ldr	r9,[sp,#40]
+	veor	q12,q12,q10
+	add	r3,r3,r4,ror#27
+	eor	r11,r11,r10
+	vst1.32	{q13},[r12,:128]!
+	mov	r5,r5,ror#2
+	add	r3,r3,r11
+	vext.8	q13,q15,q12,#4
+	bic	r10,r6,r4
+	add	r7,r7,r9
+	vadd.i32	q10,q12,q12
+	and	r11,r5,r4
+	ldr	r9,[sp,#44]
+	vsri.32	q10,q12,#31
+	add	r7,r7,r3,ror#27
+	eor	r11,r11,r10
+	mov	r4,r4,ror#2
+	vshr.u32	q12,q13,#30
+	add	r7,r7,r11
+	bic	r10,r5,r3
+	vshl.u32	q13,q13,#2
+	add	r6,r6,r9
+	and	r11,r4,r3
+	veor	q10,q10,q12
+	ldr	r9,[sp,#48]
+	add	r6,r6,r7,ror#27
+	veor	q10,q10,q13
+	eor	r11,r11,r10
+	mov	r3,r3,ror#2
+	add	r6,r6,r11
+	vext.8	q11,q3,q8,#8
+	bic	r10,r4,r7
+	add	r5,r5,r9
+	and	r11,r3,r7
+	vadd.i32	q13,q10,q14
+	ldr	r9,[sp,#52]
+	add	r5,r5,r6,ror#27
+	vext.8	q12,q10,q15,#4
+	eor	r11,r11,r10
+	mov	r7,r7,ror#2
+	add	r5,r5,r11
+	veor	q11,q11,q3
+	bic	r10,r3,r6
+	add	r4,r4,r9
+	veor	q12,q12,q9
+	and	r11,r7,r6
+	ldr	r9,[sp,#56]
+	veor	q12,q12,q11
+	add	r4,r4,r5,ror#27
+	eor	r11,r11,r10
+	vst1.32	{q13},[r12,:128]!
+	mov	r6,r6,ror#2
+	add	r4,r4,r11
+	vext.8	q13,q15,q12,#4
+	bic	r10,r7,r5
+	add	r3,r3,r9
+	vadd.i32	q11,q12,q12
+	and	r11,r6,r5
+	ldr	r9,[sp,#60]
+	vsri.32	q11,q12,#31
+	add	r3,r3,r4,ror#27
+	eor	r11,r11,r10
+	mov	r5,r5,ror#2
+	vshr.u32	q12,q13,#30
+	add	r3,r3,r11
+	bic	r10,r6,r4
+	vshl.u32	q13,q13,#2
+	add	r7,r7,r9
+	and	r11,r5,r4
+	veor	q11,q11,q12
+	ldr	r9,[sp,#0]
+	add	r7,r7,r3,ror#27
+	veor	q11,q11,q13
+	eor	r11,r11,r10
+	mov	r4,r4,ror#2
+	add	r7,r7,r11
+	vext.8	q12,q10,q11,#8
+	bic	r10,r5,r3
+	add	r6,r6,r9
+	and	r11,r4,r3
+	veor	q0,q0,q8
+	ldr	r9,[sp,#4]
+	add	r6,r6,r7,ror#27
+	veor	q0,q0,q1
+	eor	r11,r11,r10
+	mov	r3,r3,ror#2
+	vadd.i32	q13,q11,q14
+	add	r6,r6,r11
+	bic	r10,r4,r7
+	veor	q12,q12,q0
+	add	r5,r5,r9
+	and	r11,r3,r7
+	vshr.u32	q0,q12,#30
+	ldr	r9,[sp,#8]
+	add	r5,r5,r6,ror#27
+	vst1.32	{q13},[r12,:128]!
+	sub	r12,r12,#64
+	eor	r11,r11,r10
+	mov	r7,r7,ror#2
+	vsli.32	q0,q12,#2
+	add	r5,r5,r11
+	bic	r10,r3,r6
+	add	r4,r4,r9
+	and	r11,r7,r6
+	ldr	r9,[sp,#12]
+	add	r4,r4,r5,ror#27
+	eor	r11,r11,r10
+	mov	r6,r6,ror#2
+	add	r4,r4,r11
+	bic	r10,r7,r5
+	add	r3,r3,r9
+	and	r11,r6,r5
+	ldr	r9,[sp,#16]
+	add	r3,r3,r4,ror#27
+	eor	r11,r11,r10
+	mov	r5,r5,ror#2
+	add	r3,r3,r11
+	vext.8	q12,q11,q0,#8
+	eor	r10,r4,r6
+	add	r7,r7,r9
+	ldr	r9,[sp,#20]
+	veor	q1,q1,q9
+	eor	r11,r10,r5
+	add	r7,r7,r3,ror#27
+	veor	q1,q1,q2
+	mov	r4,r4,ror#2
+	add	r7,r7,r11
+	vadd.i32	q13,q0,q14
+	eor	r10,r3,r5
+	add	r6,r6,r9
+	veor	q12,q12,q1
+	ldr	r9,[sp,#24]
+	eor	r11,r10,r4
+	vshr.u32	q1,q12,#30
+	add	r6,r6,r7,ror#27
+	mov	r3,r3,ror#2
+	vst1.32	{q13},[r12,:128]!
+	add	r6,r6,r11
+	eor	r10,r7,r4
+	vsli.32	q1,q12,#2
+	add	r5,r5,r9
+	ldr	r9,[sp,#28]
+	eor	r11,r10,r3
+	add	r5,r5,r6,ror#27
+	mov	r7,r7,ror#2
+	add	r5,r5,r11
+	eor	r10,r6,r3
+	add	r4,r4,r9
+	ldr	r9,[sp,#32]
+	eor	r11,r10,r7
+	add	r4,r4,r5,ror#27
+	mov	r6,r6,ror#2
+	add	r4,r4,r11
+	vext.8	q12,q0,q1,#8
+	eor	r10,r5,r7
+	add	r3,r3,r9
+	ldr	r9,[sp,#36]
+	veor	q2,q2,q10
+	eor	r11,r10,r6
+	add	r3,r3,r4,ror#27
+	veor	q2,q2,q3
+	mov	r5,r5,ror#2
+	add	r3,r3,r11
+	vadd.i32	q13,q1,q14
+	eor	r10,r4,r6
+	vld1.32	{d28[],d29[]},[r8,:32]!
+	add	r7,r7,r9
+	veor	q12,q12,q2
+	ldr	r9,[sp,#40]
+	eor	r11,r10,r5
+	vshr.u32	q2,q12,#30
+	add	r7,r7,r3,ror#27
+	mov	r4,r4,ror#2
+	vst1.32	{q13},[r12,:128]!
+	add	r7,r7,r11
+	eor	r10,r3,r5
+	vsli.32	q2,q12,#2
+	add	r6,r6,r9
+	ldr	r9,[sp,#44]
+	eor	r11,r10,r4
+	add	r6,r6,r7,ror#27
+	mov	r3,r3,ror#2
+	add	r6,r6,r11
+	eor	r10,r7,r4
+	add	r5,r5,r9
+	ldr	r9,[sp,#48]
+	eor	r11,r10,r3
+	add	r5,r5,r6,ror#27
+	mov	r7,r7,ror#2
+	add	r5,r5,r11
+	vext.8	q12,q1,q2,#8
+	eor	r10,r6,r3
+	add	r4,r4,r9
+	ldr	r9,[sp,#52]
+	veor	q3,q3,q11
+	eor	r11,r10,r7
+	add	r4,r4,r5,ror#27
+	veor	q3,q3,q8
+	mov	r6,r6,ror#2
+	add	r4,r4,r11
+	vadd.i32	q13,q2,q14
+	eor	r10,r5,r7
+	add	r3,r3,r9
+	veor	q12,q12,q3
+	ldr	r9,[sp,#56]
+	eor	r11,r10,r6
+	vshr.u32	q3,q12,#30
+	add	r3,r3,r4,ror#27
+	mov	r5,r5,ror#2
+	vst1.32	{q13},[r12,:128]!
+	add	r3,r3,r11
+	eor	r10,r4,r6
+	vsli.32	q3,q12,#2
+	add	r7,r7,r9
+	ldr	r9,[sp,#60]
+	eor	r11,r10,r5
+	add	r7,r7,r3,ror#27
+	mov	r4,r4,ror#2
+	add	r7,r7,r11
+	eor	r10,r3,r5
+	add	r6,r6,r9
+	ldr	r9,[sp,#0]
+	eor	r11,r10,r4
+	add	r6,r6,r7,ror#27
+	mov	r3,r3,ror#2
+	add	r6,r6,r11
+	vext.8	q12,q2,q3,#8
+	eor	r10,r7,r4
+	add	r5,r5,r9
+	ldr	r9,[sp,#4]
+	veor	q8,q8,q0
+	eor	r11,r10,r3
+	add	r5,r5,r6,ror#27
+	veor	q8,q8,q9
+	mov	r7,r7,ror#2
+	add	r5,r5,r11
+	vadd.i32	q13,q3,q14
+	eor	r10,r6,r3
+	add	r4,r4,r9
+	veor	q12,q12,q8
+	ldr	r9,[sp,#8]
+	eor	r11,r10,r7
+	vshr.u32	q8,q12,#30
+	add	r4,r4,r5,ror#27
+	mov	r6,r6,ror#2
+	vst1.32	{q13},[r12,:128]!
+	sub	r12,r12,#64
+	add	r4,r4,r11
+	eor	r10,r5,r7
+	vsli.32	q8,q12,#2
+	add	r3,r3,r9
+	ldr	r9,[sp,#12]
+	eor	r11,r10,r6
+	add	r3,r3,r4,ror#27
+	mov	r5,r5,ror#2
+	add	r3,r3,r11
+	eor	r10,r4,r6
+	add	r7,r7,r9
+	ldr	r9,[sp,#16]
+	eor	r11,r10,r5
+	add	r7,r7,r3,ror#27
+	mov	r4,r4,ror#2
+	add	r7,r7,r11
+	vext.8	q12,q3,q8,#8
+	eor	r10,r3,r5
+	add	r6,r6,r9
+	ldr	r9,[sp,#20]
+	veor	q9,q9,q1
+	eor	r11,r10,r4
+	add	r6,r6,r7,ror#27
+	veor	q9,q9,q10
+	mov	r3,r3,ror#2
+	add	r6,r6,r11
+	vadd.i32	q13,q8,q14
+	eor	r10,r7,r4
+	add	r5,r5,r9
+	veor	q12,q12,q9
+	ldr	r9,[sp,#24]
+	eor	r11,r10,r3
+	vshr.u32	q9,q12,#30
+	add	r5,r5,r6,ror#27
+	mov	r7,r7,ror#2
+	vst1.32	{q13},[r12,:128]!
+	add	r5,r5,r11
+	eor	r10,r6,r3
+	vsli.32	q9,q12,#2
+	add	r4,r4,r9
+	ldr	r9,[sp,#28]
+	eor	r11,r10,r7
+	add	r4,r4,r5,ror#27
+	mov	r6,r6,ror#2
+	add	r4,r4,r11
+	eor	r10,r5,r7
+	add	r3,r3,r9
+	ldr	r9,[sp,#32]
+	eor	r11,r10,r6
+	add	r3,r3,r4,ror#27
+	mov	r5,r5,ror#2
+	add	r3,r3,r11
+	vext.8	q12,q8,q9,#8
+	add	r7,r7,r9
+	and	r10,r5,r6
+	ldr	r9,[sp,#36]
+	veor	q10,q10,q2
+	add	r7,r7,r3,ror#27
+	eor	r11,r5,r6
+	veor	q10,q10,q11
+	add	r7,r7,r10
+	and	r11,r11,r4
+	vadd.i32	q13,q9,q14
+	mov	r4,r4,ror#2
+	add	r7,r7,r11
+	veor	q12,q12,q10
+	add	r6,r6,r9
+	and	r10,r4,r5
+	vshr.u32	q10,q12,#30
+	ldr	r9,[sp,#40]
+	add	r6,r6,r7,ror#27
+	vst1.32	{q13},[r12,:128]!
+	eor	r11,r4,r5
+	add	r6,r6,r10
+	vsli.32	q10,q12,#2
+	and	r11,r11,r3
+	mov	r3,r3,ror#2
+	add	r6,r6,r11
+	add	r5,r5,r9
+	and	r10,r3,r4
+	ldr	r9,[sp,#44]
+	add	r5,r5,r6,ror#27
+	eor	r11,r3,r4
+	add	r5,r5,r10
+	and	r11,r11,r7
+	mov	r7,r7,ror#2
+	add	r5,r5,r11
+	add	r4,r4,r9
+	and	r10,r7,r3
+	ldr	r9,[sp,#48]
+	add	r4,r4,r5,ror#27
+	eor	r11,r7,r3
+	add	r4,r4,r10
+	and	r11,r11,r6
+	mov	r6,r6,ror#2
+	add	r4,r4,r11
+	vext.8	q12,q9,q10,#8
+	add	r3,r3,r9
+	and	r10,r6,r7
+	ldr	r9,[sp,#52]
+	veor	q11,q11,q3
+	add	r3,r3,r4,ror#27
+	eor	r11,r6,r7
+	veor	q11,q11,q0
+	add	r3,r3,r10
+	and	r11,r11,r5
+	vadd.i32	q13,q10,q14
+	mov	r5,r5,ror#2
+	vld1.32	{d28[],d29[]},[r8,:32]!
+	add	r3,r3,r11
+	veor	q12,q12,q11
+	add	r7,r7,r9
+	and	r10,r5,r6
+	vshr.u32	q11,q12,#30
+	ldr	r9,[sp,#56]
+	add	r7,r7,r3,ror#27
+	vst1.32	{q13},[r12,:128]!
+	eor	r11,r5,r6
+	add	r7,r7,r10
+	vsli.32	q11,q12,#2
+	and	r11,r11,r4
+	mov	r4,r4,ror#2
+	add	r7,r7,r11
+	add	r6,r6,r9
+	and	r10,r4,r5
+	ldr	r9,[sp,#60]
+	add	r6,r6,r7,ror#27
+	eor	r11,r4,r5
+	add	r6,r6,r10
+	and	r11,r11,r3
+	mov	r3,r3,ror#2
+	add	r6,r6,r11
+	add	r5,r5,r9
+	and	r10,r3,r4
+	ldr	r9,[sp,#0]
+	add	r5,r5,r6,ror#27
+	eor	r11,r3,r4
+	add	r5,r5,r10
+	and	r11,r11,r7
+	mov	r7,r7,ror#2
+	add	r5,r5,r11
+	vext.8	q12,q10,q11,#8
+	add	r4,r4,r9
+	and	r10,r7,r3
+	ldr	r9,[sp,#4]
+	veor	q0,q0,q8
+	add	r4,r4,r5,ror#27
+	eor	r11,r7,r3
+	veor	q0,q0,q1
+	add	r4,r4,r10
+	and	r11,r11,r6
+	vadd.i32	q13,q11,q14
+	mov	r6,r6,ror#2
+	add	r4,r4,r11
+	veor	q12,q12,q0
+	add	r3,r3,r9
+	and	r10,r6,r7
+	vshr.u32	q0,q12,#30
+	ldr	r9,[sp,#8]
+	add	r3,r3,r4,ror#27
+	vst1.32	{q13},[r12,:128]!
+	sub	r12,r12,#64
+	eor	r11,r6,r7
+	add	r3,r3,r10
+	vsli.32	q0,q12,#2
+	and	r11,r11,r5
+	mov	r5,r5,ror#2
+	add	r3,r3,r11
+	add	r7,r7,r9
+	and	r10,r5,r6
+	ldr	r9,[sp,#12]
+	add	r7,r7,r3,ror#27
+	eor	r11,r5,r6
+	add	r7,r7,r10
+	and	r11,r11,r4
+	mov	r4,r4,ror#2
+	add	r7,r7,r11
+	add	r6,r6,r9
+	and	r10,r4,r5
+	ldr	r9,[sp,#16]
+	add	r6,r6,r7,ror#27
+	eor	r11,r4,r5
+	add	r6,r6,r10
+	and	r11,r11,r3
+	mov	r3,r3,ror#2
+	add	r6,r6,r11
+	vext.8	q12,q11,q0,#8
+	add	r5,r5,r9
+	and	r10,r3,r4
+	ldr	r9,[sp,#20]
+	veor	q1,q1,q9
+	add	r5,r5,r6,ror#27
+	eor	r11,r3,r4
+	veor	q1,q1,q2
+	add	r5,r5,r10
+	and	r11,r11,r7
+	vadd.i32	q13,q0,q14
+	mov	r7,r7,ror#2
+	add	r5,r5,r11
+	veor	q12,q12,q1
+	add	r4,r4,r9
+	and	r10,r7,r3
+	vshr.u32	q1,q12,#30
+	ldr	r9,[sp,#24]
+	add	r4,r4,r5,ror#27
+	vst1.32	{q13},[r12,:128]!
+	eor	r11,r7,r3
+	add	r4,r4,r10
+	vsli.32	q1,q12,#2
+	and	r11,r11,r6
+	mov	r6,r6,ror#2
+	add	r4,r4,r11
+	add	r3,r3,r9
+	and	r10,r6,r7
+	ldr	r9,[sp,#28]
+	add	r3,r3,r4,ror#27
+	eor	r11,r6,r7
+	add	r3,r3,r10
+	and	r11,r11,r5
+	mov	r5,r5,ror#2
+	add	r3,r3,r11
+	add	r7,r7,r9
+	and	r10,r5,r6
+	ldr	r9,[sp,#32]
+	add	r7,r7,r3,ror#27
+	eor	r11,r5,r6
+	add	r7,r7,r10
+	and	r11,r11,r4
+	mov	r4,r4,ror#2
+	add	r7,r7,r11
+	vext.8	q12,q0,q1,#8
+	add	r6,r6,r9
+	and	r10,r4,r5
+	ldr	r9,[sp,#36]
+	veor	q2,q2,q10
+	add	r6,r6,r7,ror#27
+	eor	r11,r4,r5
+	veor	q2,q2,q3
+	add	r6,r6,r10
+	and	r11,r11,r3
+	vadd.i32	q13,q1,q14
+	mov	r3,r3,ror#2
+	add	r6,r6,r11
+	veor	q12,q12,q2
+	add	r5,r5,r9
+	and	r10,r3,r4
+	vshr.u32	q2,q12,#30
+	ldr	r9,[sp,#40]
+	add	r5,r5,r6,ror#27
+	vst1.32	{q13},[r12,:128]!
+	eor	r11,r3,r4
+	add	r5,r5,r10
+	vsli.32	q2,q12,#2
+	and	r11,r11,r7
+	mov	r7,r7,ror#2
+	add	r5,r5,r11
+	add	r4,r4,r9
+	and	r10,r7,r3
+	ldr	r9,[sp,#44]
+	add	r4,r4,r5,ror#27
+	eor	r11,r7,r3
+	add	r4,r4,r10
+	and	r11,r11,r6
+	mov	r6,r6,ror#2
+	add	r4,r4,r11
+	add	r3,r3,r9
+	and	r10,r6,r7
+	ldr	r9,[sp,#48]
+	add	r3,r3,r4,ror#27
+	eor	r11,r6,r7
+	add	r3,r3,r10
+	and	r11,r11,r5
+	mov	r5,r5,ror#2
+	add	r3,r3,r11
+	vext.8	q12,q1,q2,#8
+	eor	r10,r4,r6
+	add	r7,r7,r9
+	ldr	r9,[sp,#52]
+	veor	q3,q3,q11
+	eor	r11,r10,r5
+	add	r7,r7,r3,ror#27
+	veor	q3,q3,q8
+	mov	r4,r4,ror#2
+	add	r7,r7,r11
+	vadd.i32	q13,q2,q14
+	eor	r10,r3,r5
+	add	r6,r6,r9
+	veor	q12,q12,q3
+	ldr	r9,[sp,#56]
+	eor	r11,r10,r4
+	vshr.u32	q3,q12,#30
+	add	r6,r6,r7,ror#27
+	mov	r3,r3,ror#2
+	vst1.32	{q13},[r12,:128]!
+	add	r6,r6,r11
+	eor	r10,r7,r4
+	vsli.32	q3,q12,#2
+	add	r5,r5,r9
+	ldr	r9,[sp,#60]
+	eor	r11,r10,r3
+	add	r5,r5,r6,ror#27
+	mov	r7,r7,ror#2
+	add	r5,r5,r11
+	eor	r10,r6,r3
+	add	r4,r4,r9
+	ldr	r9,[sp,#0]
+	eor	r11,r10,r7
+	add	r4,r4,r5,ror#27
+	mov	r6,r6,ror#2
+	add	r4,r4,r11
+	vadd.i32	q13,q3,q14
+	eor	r10,r5,r7
+	add	r3,r3,r9
+	vst1.32	{q13},[r12,:128]!
+	sub	r12,r12,#64
+	teq	r1,r2
+	sub	r8,r8,#16
+	it	eq
+	subeq	r1,r1,#64
+	vld1.8	{q0,q1},[r1]!
+	ldr	r9,[sp,#4]
+	eor	r11,r10,r6
+	vld1.8	{q2,q3},[r1]!
+	add	r3,r3,r4,ror#27
+	mov	r5,r5,ror#2
+	vld1.32	{d28[],d29[]},[r8,:32]!
+	add	r3,r3,r11
+	eor	r10,r4,r6
+	vrev32.8	q0,q0
+	add	r7,r7,r9
+	ldr	r9,[sp,#8]
+	eor	r11,r10,r5
+	add	r7,r7,r3,ror#27
+	mov	r4,r4,ror#2
+	add	r7,r7,r11
+	eor	r10,r3,r5
+	add	r6,r6,r9
+	ldr	r9,[sp,#12]
+	eor	r11,r10,r4
+	add	r6,r6,r7,ror#27
+	mov	r3,r3,ror#2
+	add	r6,r6,r11
+	eor	r10,r7,r4
+	add	r5,r5,r9
+	ldr	r9,[sp,#16]
+	eor	r11,r10,r3
+	add	r5,r5,r6,ror#27
+	mov	r7,r7,ror#2
+	add	r5,r5,r11
+	vrev32.8	q1,q1
+	eor	r10,r6,r3
+	add	r4,r4,r9
+	vadd.i32	q8,q0,q14
+	ldr	r9,[sp,#20]
+	eor	r11,r10,r7
+	vst1.32	{q8},[r12,:128]!
+	add	r4,r4,r5,ror#27
+	mov	r6,r6,ror#2
+	add	r4,r4,r11
+	eor	r10,r5,r7
+	add	r3,r3,r9
+	ldr	r9,[sp,#24]
+	eor	r11,r10,r6
+	add	r3,r3,r4,ror#27
+	mov	r5,r5,ror#2
+	add	r3,r3,r11
+	eor	r10,r4,r6
+	add	r7,r7,r9
+	ldr	r9,[sp,#28]
+	eor	r11,r10,r5
+	add	r7,r7,r3,ror#27
+	mov	r4,r4,ror#2
+	add	r7,r7,r11
+	eor	r10,r3,r5
+	add	r6,r6,r9
+	ldr	r9,[sp,#32]
+	eor	r11,r10,r4
+	add	r6,r6,r7,ror#27
+	mov	r3,r3,ror#2
+	add	r6,r6,r11
+	vrev32.8	q2,q2
+	eor	r10,r7,r4
+	add	r5,r5,r9
+	vadd.i32	q9,q1,q14
+	ldr	r9,[sp,#36]
+	eor	r11,r10,r3
+	vst1.32	{q9},[r12,:128]!
+	add	r5,r5,r6,ror#27
+	mov	r7,r7,ror#2
+	add	r5,r5,r11
+	eor	r10,r6,r3
+	add	r4,r4,r9
+	ldr	r9,[sp,#40]
+	eor	r11,r10,r7
+	add	r4,r4,r5,ror#27
+	mov	r6,r6,ror#2
+	add	r4,r4,r11
+	eor	r10,r5,r7
+	add	r3,r3,r9
+	ldr	r9,[sp,#44]
+	eor	r11,r10,r6
+	add	r3,r3,r4,ror#27
+	mov	r5,r5,ror#2
+	add	r3,r3,r11
+	eor	r10,r4,r6
+	add	r7,r7,r9
+	ldr	r9,[sp,#48]
+	eor	r11,r10,r5
+	add	r7,r7,r3,ror#27
+	mov	r4,r4,ror#2
+	add	r7,r7,r11
+	vrev32.8	q3,q3
+	eor	r10,r3,r5
+	add	r6,r6,r9
+	vadd.i32	q10,q2,q14
+	ldr	r9,[sp,#52]
+	eor	r11,r10,r4
+	vst1.32	{q10},[r12,:128]!
+	add	r6,r6,r7,ror#27
+	mov	r3,r3,ror#2
+	add	r6,r6,r11
+	eor	r10,r7,r4
+	add	r5,r5,r9
+	ldr	r9,[sp,#56]
+	eor	r11,r10,r3
+	add	r5,r5,r6,ror#27
+	mov	r7,r7,ror#2
+	add	r5,r5,r11
+	eor	r10,r6,r3
+	add	r4,r4,r9
+	ldr	r9,[sp,#60]
+	eor	r11,r10,r7
+	add	r4,r4,r5,ror#27
+	mov	r6,r6,ror#2
+	add	r4,r4,r11
+	eor	r10,r5,r7
+	add	r3,r3,r9
+	eor	r11,r10,r6
+	add	r3,r3,r4,ror#27
+	mov	r5,r5,ror#2
+	add	r3,r3,r11
+	ldmia	r0,{r9,r10,r11,r12}	@ accumulate context
+	add	r3,r3,r9
+	ldr	r9,[r0,#16]
+	add	r4,r4,r10
+	add	r5,r5,r11
+	add	r6,r6,r12
+	it	eq
+	moveq	sp,r14
+	add	r7,r7,r9
+	it	ne
+	ldrne	r9,[sp]
+	stmia	r0,{r3,r4,r5,r6,r7}
+	itt	ne
+	addne	r12,sp,#3*16
+	bne	.Loop_neon
+
+	@ vldmia	sp!,{d8-d15}
+	ldmia	sp!,{r4,r5,r6,r7,r8,r9,r10,r11,r12,pc}
+.size	sha1_block_data_order_neon,.-sha1_block_data_order_neon
+#endif
+#if __ARM_MAX_ARCH__>=7
+
+# if defined(__thumb2__)
+#  define INST(a,b,c,d)	.byte	c,d|0xf,a,b
+# else
+#  define INST(a,b,c,d)	.byte	a,b,c,d|0x10
+# endif
+
+.type	sha1_block_data_order_armv8,%function
+.align	5
+sha1_block_data_order_armv8:
+.LARMv8:
+	vstmdb	sp!,{d8,d9,d10,d11,d12,d13,d14,d15}		@ ABI specification says so
+
+	veor	q1,q1,q1
+	adr	r3,.LK_00_19
+	vld1.32	{q0},[r0]!
+	vld1.32	{d2[0]},[r0]
+	sub	r0,r0,#16
+	vld1.32	{d16[],d17[]},[r3,:32]!
+	vld1.32	{d18[],d19[]},[r3,:32]!
+	vld1.32	{d20[],d21[]},[r3,:32]!
+	vld1.32	{d22[],d23[]},[r3,:32]
+
+.Loop_v8:
+	vld1.8	{q4,q5},[r1]!
+	vld1.8	{q6,q7},[r1]!
+	vrev32.8	q4,q4
+	vrev32.8	q5,q5
+
+	vadd.i32	q12,q8,q4
+	vrev32.8	q6,q6
+	vmov	q14,q0	@ offload
+	subs	r2,r2,#1
+
+	vadd.i32	q13,q8,q5
+	vrev32.8	q7,q7
+	INST(0xc0,0x62,0xb9,0xf3)	@ sha1h q3,q0		@ 0
+	INST(0x68,0x0c,0x02,0xe2)	@ sha1c q0,q1,q12
+	vadd.i32	q12,q8,q6
+	INST(0x4c,0x8c,0x3a,0xe2)	@ sha1su0 q4,q5,q6
+	INST(0xc0,0x42,0xb9,0xf3)	@ sha1h q2,q0		@ 1
+	INST(0x6a,0x0c,0x06,0xe2)	@ sha1c q0,q3,q13
+	vadd.i32	q13,q8,q7
+	INST(0x8e,0x83,0xba,0xf3)	@ sha1su1 q4,q7
+	INST(0x4e,0xac,0x3c,0xe2)	@ sha1su0 q5,q6,q7
+	INST(0xc0,0x62,0xb9,0xf3)	@ sha1h q3,q0		@ 2
+	INST(0x68,0x0c,0x04,0xe2)	@ sha1c q0,q2,q12
+	vadd.i32	q12,q8,q4
+	INST(0x88,0xa3,0xba,0xf3)	@ sha1su1 q5,q4
+	INST(0x48,0xcc,0x3e,0xe2)	@ sha1su0 q6,q7,q4
+	INST(0xc0,0x42,0xb9,0xf3)	@ sha1h q2,q0		@ 3
+	INST(0x6a,0x0c,0x06,0xe2)	@ sha1c q0,q3,q13
+	vadd.i32	q13,q9,q5
+	INST(0x8a,0xc3,0xba,0xf3)	@ sha1su1 q6,q5
+	INST(0x4a,0xec,0x38,0xe2)	@ sha1su0 q7,q4,q5
+	INST(0xc0,0x62,0xb9,0xf3)	@ sha1h q3,q0		@ 4
+	INST(0x68,0x0c,0x04,0xe2)	@ sha1c q0,q2,q12
+	vadd.i32	q12,q9,q6
+	INST(0x8c,0xe3,0xba,0xf3)	@ sha1su1 q7,q6
+	INST(0x4c,0x8c,0x3a,0xe2)	@ sha1su0 q4,q5,q6
+	INST(0xc0,0x42,0xb9,0xf3)	@ sha1h q2,q0		@ 5
+	INST(0x6a,0x0c,0x16,0xe2)	@ sha1p q0,q3,q13
+	vadd.i32	q13,q9,q7
+	INST(0x8e,0x83,0xba,0xf3)	@ sha1su1 q4,q7
+	INST(0x4e,0xac,0x3c,0xe2)	@ sha1su0 q5,q6,q7
+	INST(0xc0,0x62,0xb9,0xf3)	@ sha1h q3,q0		@ 6
+	INST(0x68,0x0c,0x14,0xe2)	@ sha1p q0,q2,q12
+	vadd.i32	q12,q9,q4
+	INST(0x88,0xa3,0xba,0xf3)	@ sha1su1 q5,q4
+	INST(0x48,0xcc,0x3e,0xe2)	@ sha1su0 q6,q7,q4
+	INST(0xc0,0x42,0xb9,0xf3)	@ sha1h q2,q0		@ 7
+	INST(0x6a,0x0c,0x16,0xe2)	@ sha1p q0,q3,q13
+	vadd.i32	q13,q9,q5
+	INST(0x8a,0xc3,0xba,0xf3)	@ sha1su1 q6,q5
+	INST(0x4a,0xec,0x38,0xe2)	@ sha1su0 q7,q4,q5
+	INST(0xc0,0x62,0xb9,0xf3)	@ sha1h q3,q0		@ 8
+	INST(0x68,0x0c,0x14,0xe2)	@ sha1p q0,q2,q12
+	vadd.i32	q12,q10,q6
+	INST(0x8c,0xe3,0xba,0xf3)	@ sha1su1 q7,q6
+	INST(0x4c,0x8c,0x3a,0xe2)	@ sha1su0 q4,q5,q6
+	INST(0xc0,0x42,0xb9,0xf3)	@ sha1h q2,q0		@ 9
+	INST(0x6a,0x0c,0x16,0xe2)	@ sha1p q0,q3,q13
+	vadd.i32	q13,q10,q7
+	INST(0x8e,0x83,0xba,0xf3)	@ sha1su1 q4,q7
+	INST(0x4e,0xac,0x3c,0xe2)	@ sha1su0 q5,q6,q7
+	INST(0xc0,0x62,0xb9,0xf3)	@ sha1h q3,q0		@ 10
+	INST(0x68,0x0c,0x24,0xe2)	@ sha1m q0,q2,q12
+	vadd.i32	q12,q10,q4
+	INST(0x88,0xa3,0xba,0xf3)	@ sha1su1 q5,q4
+	INST(0x48,0xcc,0x3e,0xe2)	@ sha1su0 q6,q7,q4
+	INST(0xc0,0x42,0xb9,0xf3)	@ sha1h q2,q0		@ 11
+	INST(0x6a,0x0c,0x26,0xe2)	@ sha1m q0,q3,q13
+	vadd.i32	q13,q10,q5
+	INST(0x8a,0xc3,0xba,0xf3)	@ sha1su1 q6,q5
+	INST(0x4a,0xec,0x38,0xe2)	@ sha1su0 q7,q4,q5
+	INST(0xc0,0x62,0xb9,0xf3)	@ sha1h q3,q0		@ 12
+	INST(0x68,0x0c,0x24,0xe2)	@ sha1m q0,q2,q12
+	vadd.i32	q12,q10,q6
+	INST(0x8c,0xe3,0xba,0xf3)	@ sha1su1 q7,q6
+	INST(0x4c,0x8c,0x3a,0xe2)	@ sha1su0 q4,q5,q6
+	INST(0xc0,0x42,0xb9,0xf3)	@ sha1h q2,q0		@ 13
+	INST(0x6a,0x0c,0x26,0xe2)	@ sha1m q0,q3,q13
+	vadd.i32	q13,q11,q7
+	INST(0x8e,0x83,0xba,0xf3)	@ sha1su1 q4,q7
+	INST(0x4e,0xac,0x3c,0xe2)	@ sha1su0 q5,q6,q7
+	INST(0xc0,0x62,0xb9,0xf3)	@ sha1h q3,q0		@ 14
+	INST(0x68,0x0c,0x24,0xe2)	@ sha1m q0,q2,q12
+	vadd.i32	q12,q11,q4
+	INST(0x88,0xa3,0xba,0xf3)	@ sha1su1 q5,q4
+	INST(0x48,0xcc,0x3e,0xe2)	@ sha1su0 q6,q7,q4
+	INST(0xc0,0x42,0xb9,0xf3)	@ sha1h q2,q0		@ 15
+	INST(0x6a,0x0c,0x16,0xe2)	@ sha1p q0,q3,q13
+	vadd.i32	q13,q11,q5
+	INST(0x8a,0xc3,0xba,0xf3)	@ sha1su1 q6,q5
+	INST(0x4a,0xec,0x38,0xe2)	@ sha1su0 q7,q4,q5
+	INST(0xc0,0x62,0xb9,0xf3)	@ sha1h q3,q0		@ 16
+	INST(0x68,0x0c,0x14,0xe2)	@ sha1p q0,q2,q12
+	vadd.i32	q12,q11,q6
+	INST(0x8c,0xe3,0xba,0xf3)	@ sha1su1 q7,q6
+	INST(0xc0,0x42,0xb9,0xf3)	@ sha1h q2,q0		@ 17
+	INST(0x6a,0x0c,0x16,0xe2)	@ sha1p q0,q3,q13
+	vadd.i32	q13,q11,q7
+
+	INST(0xc0,0x62,0xb9,0xf3)	@ sha1h q3,q0		@ 18
+	INST(0x68,0x0c,0x14,0xe2)	@ sha1p q0,q2,q12
+
+	INST(0xc0,0x42,0xb9,0xf3)	@ sha1h q2,q0		@ 19
+	INST(0x6a,0x0c,0x16,0xe2)	@ sha1p q0,q3,q13
+
+	vadd.i32	q1,q1,q2
+	vadd.i32	q0,q0,q14
+	bne	.Loop_v8
+
+	vst1.32	{q0},[r0]!
+	vst1.32	{d2[0]},[r0]
+
+	vldmia	sp!,{d8,d9,d10,d11,d12,d13,d14,d15}
+	bx	lr					@ bx lr
+.size	sha1_block_data_order_armv8,.-sha1_block_data_order_armv8
+#endif
+#if __ARM_MAX_ARCH__>=7
+.comm	OPENSSL_armcap_P,4,4
+.hidden	OPENSSL_armcap_P
+#endif
+#endif
+#endif  // !OPENSSL_NO_ASM
+.section	.note.GNU-stack,"",%progbits
--- /dev/null
+++ b/third_party/boringssl/linux-arm/crypto/fipsmodule/sha256-armv4.S
@@ -1,0 +1,2839 @@
+// This file is generated from a similarly-named Perl script in the BoringSSL
+// source tree. Do not edit by hand.
+
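The two-line note above is the only hand-written text in this file; everything that follows is emitted by BoringSSL's perlasm tooling and is vendored here verbatim. For orientation only, here is a minimal sketch of how such a pre-generated file is typically reproduced; the script path, output path, helper name, and the "linux32" flavour are assumptions based on BoringSSL's usual source layout, not something this patch defines.

import subprocess

# Illustrative only: BoringSSL's perlasm scripts are run as
# "perl <script> <flavour> <output>"; "linux32" targets 32-bit ARM Linux.
# All paths below are assumed to match the tree this patch adds files to.
def regen_sha256_armv4(src="third_party/boringssl/src",
                       out="third_party/boringssl/linux-arm/crypto/fipsmodule/sha256-armv4.S"):
    perlasm = src + "/crypto/fipsmodule/sha/asm/sha256-armv4.pl"
    subprocess.check_call(["perl", perlasm, "linux32", out])

if __name__ == "__main__":
    regen_sha256_armv4()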
+#if !defined(__has_feature)
+#define __has_feature(x) 0
+#endif
+#if __has_feature(memory_sanitizer) && !defined(OPENSSL_NO_ASM)
+#define OPENSSL_NO_ASM
+#endif
+
+#if !defined(OPENSSL_NO_ASM)
+#if defined(__arm__)
+#if defined(BORINGSSL_PREFIX)
+#include <boringssl_prefix_symbols_asm.h>
+#endif
+@ Copyright 2007-2016 The OpenSSL Project Authors. All Rights Reserved.
+@
+@ Licensed under the OpenSSL license (the "License").  You may not use
+@ this file except in compliance with the License.  You can obtain a copy
+@ in the file LICENSE in the source distribution or at
+@ https://www.openssl.org/source/license.html
+
+
+@ ====================================================================
+@ Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
+@ project. The module is, however, dual licensed under OpenSSL and
+@ CRYPTOGAMS licenses depending on where you obtain it. For further
+@ details see http://www.openssl.org/~appro/cryptogams/.
+@
+@ Permission to use under GPL terms is granted.
+@ ====================================================================
+
+@ SHA256 block procedure for ARMv4. May 2007.
+
+@ Performance is ~2x better than gcc 3.4 generated code and in "abso-
+@ lute" terms is ~2250 cycles per 64-byte block or ~35 cycles per
+@ byte [on single-issue Xscale PXA250 core].
+
+@ July 2010.
+@
+@ Rescheduling for dual-issue pipeline resulted in 22% improvement on
+@ Cortex A8 core and ~20 cycles per processed byte.
+
+@ February 2011.
+@
+@ Profiler-assisted and platform-specific optimization resulted in 16%
+@ improvement on Cortex A8 core and ~15.4 cycles per processed byte.
+
+@ September 2013.
+@
+@ Add NEON implementation. On Cortex A8 it was measured to process one
+@ byte in 12.5 cycles or 23% faster than integer-only code. Snapdragon
+@ S4 does it in 12.5 cycles too, but it's 50% faster than integer-only
+@ code (meaning that latter performs sub-optimally, nothing was done
+@ about it).
+
+@ May 2014.
+@
+@ Add ARMv8 code path performing at 2.0 cpb on Apple A7.
+
+#ifndef __KERNEL__
+# include <openssl/arm_arch.h>
+#else
+# define __ARM_ARCH__ __LINUX_ARM_ARCH__
+# define __ARM_MAX_ARCH__ 7
+#endif
+
+@ Silence ARMv8 deprecated IT instruction warnings. This file is used by both
+@ ARMv7 and ARMv8 processors. It does have ARMv8-only code, but those
+@ instructions are manually-encoded. (See unsha256.)
+.arch	armv7-a
+
+.text
+#if defined(__thumb2__)
+.syntax	unified
+.thumb
+#else
+.code	32
+#endif
+
+.type	K256,%object
+.align	5
+K256:
+.word	0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5
+.word	0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5
+.word	0xd807aa98,0x12835b01,0x243185be,0x550c7dc3
+.word	0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174
+.word	0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc
+.word	0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da
+.word	0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7
+.word	0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967
+.word	0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13
+.word	0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85
+.word	0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3
+.word	0xd192e819,0xd6990624,0xf40e3585,0x106aa070
+.word	0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5
+.word	0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3
+.word	0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208
+.word	0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2
+.size	K256,.-K256
+.word	0				@ terminator
+#if __ARM_MAX_ARCH__>=7 && !defined(__KERNEL__)
+.LOPENSSL_armcap:
+.word	OPENSSL_armcap_P-.Lsha256_block_data_order
+#endif
+.align	5
+
+.globl	sha256_block_data_order
+.hidden	sha256_block_data_order
+.type	sha256_block_data_order,%function
+sha256_block_data_order:
+.Lsha256_block_data_order:
+#if __ARM_ARCH__<7 && !defined(__thumb2__)
+	sub	r3,pc,#8		@ sha256_block_data_order
+#else
+	adr	r3,.Lsha256_block_data_order
+#endif
+#if __ARM_MAX_ARCH__>=7 && !defined(__KERNEL__)
+	ldr	r12,.LOPENSSL_armcap
+	ldr	r12,[r3,r12]		@ OPENSSL_armcap_P
+#ifdef	__APPLE__
+	ldr	r12,[r12]
+#endif
+	tst	r12,#ARMV8_SHA256
+	bne	.LARMv8
+	tst	r12,#ARMV7_NEON
+	bne	.LNEON
+#endif
+	add	r2,r1,r2,lsl#6	@ len to point at the end of inp
+	stmdb	sp!,{r0,r1,r2,r4-r11,lr}
+	ldmia	r0,{r4,r5,r6,r7,r8,r9,r10,r11}
+	sub	r14,r3,#256+32	@ K256
+	sub	sp,sp,#16*4		@ alloca(X[16])
+.Loop:
+# if __ARM_ARCH__>=7
+	ldr	r2,[r1],#4
+# else
+	ldrb	r2,[r1,#3]
+# endif
+	eor	r3,r5,r6		@ magic
+	eor	r12,r12,r12
+#if __ARM_ARCH__>=7
+	@ ldr	r2,[r1],#4			@ 0
+# if 0==15
+	str	r1,[sp,#17*4]			@ make room for r1
+# endif
+	eor	r0,r8,r8,ror#5
+	add	r4,r4,r12			@ h+=Maj(a,b,c) from the past
+	eor	r0,r0,r8,ror#19	@ Sigma1(e)
+# ifndef __ARMEB__
+	rev	r2,r2
+# endif
+#else
+	@ ldrb	r2,[r1,#3]			@ 0
+	add	r4,r4,r12			@ h+=Maj(a,b,c) from the past
+	ldrb	r12,[r1,#2]
+	ldrb	r0,[r1,#1]
+	orr	r2,r2,r12,lsl#8
+	ldrb	r12,[r1],#4
+	orr	r2,r2,r0,lsl#16
+# if 0==15
+	str	r1,[sp,#17*4]			@ make room for r1
+# endif
+	eor	r0,r8,r8,ror#5
+	orr	r2,r2,r12,lsl#24
+	eor	r0,r0,r8,ror#19	@ Sigma1(e)
+#endif
+	ldr	r12,[r14],#4			@ *K256++
+	add	r11,r11,r2			@ h+=X[i]
+	str	r2,[sp,#0*4]
+	eor	r2,r9,r10
+	add	r11,r11,r0,ror#6	@ h+=Sigma1(e)
+	and	r2,r2,r8
+	add	r11,r11,r12			@ h+=K256[i]
+	eor	r2,r2,r10			@ Ch(e,f,g)
+	eor	r0,r4,r4,ror#11
+	add	r11,r11,r2			@ h+=Ch(e,f,g)
+#if 0==31
+	and	r12,r12,#0xff
+	cmp	r12,#0xf2			@ done?
+#endif
+#if 0<15
+# if __ARM_ARCH__>=7
+	ldr	r2,[r1],#4			@ prefetch
+# else
+	ldrb	r2,[r1,#3]
+# endif
+	eor	r12,r4,r5			@ a^b, b^c in next round
+#else
+	ldr	r2,[sp,#2*4]		@ from future BODY_16_xx
+	eor	r12,r4,r5			@ a^b, b^c in next round
+	ldr	r1,[sp,#15*4]	@ from future BODY_16_xx
+#endif
+	eor	r0,r0,r4,ror#20	@ Sigma0(a)
+	and	r3,r3,r12			@ (b^c)&=(a^b)
+	add	r7,r7,r11			@ d+=h
+	eor	r3,r3,r5			@ Maj(a,b,c)
+	add	r11,r11,r0,ror#2	@ h+=Sigma0(a)
+	@ add	r11,r11,r3			@ h+=Maj(a,b,c)
+#if __ARM_ARCH__>=7
+	@ ldr	r2,[r1],#4			@ 1
+# if 1==15
+	str	r1,[sp,#17*4]			@ make room for r1
+# endif
+	eor	r0,r7,r7,ror#5
+	add	r11,r11,r3			@ h+=Maj(a,b,c) from the past
+	eor	r0,r0,r7,ror#19	@ Sigma1(e)
+# ifndef __ARMEB__
+	rev	r2,r2
+# endif
+#else
+	@ ldrb	r2,[r1,#3]			@ 1
+	add	r11,r11,r3			@ h+=Maj(a,b,c) from the past
+	ldrb	r3,[r1,#2]
+	ldrb	r0,[r1,#1]
+	orr	r2,r2,r3,lsl#8
+	ldrb	r3,[r1],#4
+	orr	r2,r2,r0,lsl#16
+# if 1==15
+	str	r1,[sp,#17*4]			@ make room for r1
+# endif
+	eor	r0,r7,r7,ror#5
+	orr	r2,r2,r3,lsl#24
+	eor	r0,r0,r7,ror#19	@ Sigma1(e)
+#endif
+	ldr	r3,[r14],#4			@ *K256++
+	add	r10,r10,r2			@ h+=X[i]
+	str	r2,[sp,#1*4]
+	eor	r2,r8,r9
+	add	r10,r10,r0,ror#6	@ h+=Sigma1(e)
+	and	r2,r2,r7
+	add	r10,r10,r3			@ h+=K256[i]
+	eor	r2,r2,r9			@ Ch(e,f,g)
+	eor	r0,r11,r11,ror#11
+	add	r10,r10,r2			@ h+=Ch(e,f,g)
+#if 1==31
+	and	r3,r3,#0xff
+	cmp	r3,#0xf2			@ done?
+#endif
+#if 1<15
+# if __ARM_ARCH__>=7
+	ldr	r2,[r1],#4			@ prefetch
+# else
+	ldrb	r2,[r1,#3]
+# endif
+	eor	r3,r11,r4			@ a^b, b^c in next round
+#else
+	ldr	r2,[sp,#3*4]		@ from future BODY_16_xx
+	eor	r3,r11,r4			@ a^b, b^c in next round
+	ldr	r1,[sp,#0*4]	@ from future BODY_16_xx
+#endif
+	eor	r0,r0,r11,ror#20	@ Sigma0(a)
+	and	r12,r12,r3			@ (b^c)&=(a^b)
+	add	r6,r6,r10			@ d+=h
+	eor	r12,r12,r4			@ Maj(a,b,c)
+	add	r10,r10,r0,ror#2	@ h+=Sigma0(a)
+	@ add	r10,r10,r12			@ h+=Maj(a,b,c)
+#if __ARM_ARCH__>=7
+	@ ldr	r2,[r1],#4			@ 2
+# if 2==15
+	str	r1,[sp,#17*4]			@ make room for r1
+# endif
+	eor	r0,r6,r6,ror#5
+	add	r10,r10,r12			@ h+=Maj(a,b,c) from the past
+	eor	r0,r0,r6,ror#19	@ Sigma1(e)
+# ifndef __ARMEB__
+	rev	r2,r2
+# endif
+#else
+	@ ldrb	r2,[r1,#3]			@ 2
+	add	r10,r10,r12			@ h+=Maj(a,b,c) from the past
+	ldrb	r12,[r1,#2]
+	ldrb	r0,[r1,#1]
+	orr	r2,r2,r12,lsl#8
+	ldrb	r12,[r1],#4
+	orr	r2,r2,r0,lsl#16
+# if 2==15
+	str	r1,[sp,#17*4]			@ make room for r1
+# endif
+	eor	r0,r6,r6,ror#5
+	orr	r2,r2,r12,lsl#24
+	eor	r0,r0,r6,ror#19	@ Sigma1(e)
+#endif
+	ldr	r12,[r14],#4			@ *K256++
+	add	r9,r9,r2			@ h+=X[i]
+	str	r2,[sp,#2*4]
+	eor	r2,r7,r8
+	add	r9,r9,r0,ror#6	@ h+=Sigma1(e)
+	and	r2,r2,r6
+	add	r9,r9,r12			@ h+=K256[i]
+	eor	r2,r2,r8			@ Ch(e,f,g)
+	eor	r0,r10,r10,ror#11
+	add	r9,r9,r2			@ h+=Ch(e,f,g)
+#if 2==31
+	and	r12,r12,#0xff
+	cmp	r12,#0xf2			@ done?
+#endif
+#if 2<15
+# if __ARM_ARCH__>=7
+	ldr	r2,[r1],#4			@ prefetch
+# else
+	ldrb	r2,[r1,#3]
+# endif
+	eor	r12,r10,r11			@ a^b, b^c in next round
+#else
+	ldr	r2,[sp,#4*4]		@ from future BODY_16_xx
+	eor	r12,r10,r11			@ a^b, b^c in next round
+	ldr	r1,[sp,#1*4]	@ from future BODY_16_xx
+#endif
+	eor	r0,r0,r10,ror#20	@ Sigma0(a)
+	and	r3,r3,r12			@ (b^c)&=(a^b)
+	add	r5,r5,r9			@ d+=h
+	eor	r3,r3,r11			@ Maj(a,b,c)
+	add	r9,r9,r0,ror#2	@ h+=Sigma0(a)
+	@ add	r9,r9,r3			@ h+=Maj(a,b,c)
+#if __ARM_ARCH__>=7
+	@ ldr	r2,[r1],#4			@ 3
+# if 3==15
+	str	r1,[sp,#17*4]			@ make room for r1
+# endif
+	eor	r0,r5,r5,ror#5
+	add	r9,r9,r3			@ h+=Maj(a,b,c) from the past
+	eor	r0,r0,r5,ror#19	@ Sigma1(e)
+# ifndef __ARMEB__
+	rev	r2,r2
+# endif
+#else
+	@ ldrb	r2,[r1,#3]			@ 3
+	add	r9,r9,r3			@ h+=Maj(a,b,c) from the past
+	ldrb	r3,[r1,#2]
+	ldrb	r0,[r1,#1]
+	orr	r2,r2,r3,lsl#8
+	ldrb	r3,[r1],#4
+	orr	r2,r2,r0,lsl#16
+# if 3==15
+	str	r1,[sp,#17*4]			@ make room for r1
+# endif
+	eor	r0,r5,r5,ror#5
+	orr	r2,r2,r3,lsl#24
+	eor	r0,r0,r5,ror#19	@ Sigma1(e)
+#endif
+	ldr	r3,[r14],#4			@ *K256++
+	add	r8,r8,r2			@ h+=X[i]
+	str	r2,[sp,#3*4]
+	eor	r2,r6,r7
+	add	r8,r8,r0,ror#6	@ h+=Sigma1(e)
+	and	r2,r2,r5
+	add	r8,r8,r3			@ h+=K256[i]
+	eor	r2,r2,r7			@ Ch(e,f,g)
+	eor	r0,r9,r9,ror#11
+	add	r8,r8,r2			@ h+=Ch(e,f,g)
+#if 3==31
+	and	r3,r3,#0xff
+	cmp	r3,#0xf2			@ done?
+#endif
+#if 3<15
+# if __ARM_ARCH__>=7
+	ldr	r2,[r1],#4			@ prefetch
+# else
+	ldrb	r2,[r1,#3]
+# endif
+	eor	r3,r9,r10			@ a^b, b^c in next round
+#else
+	ldr	r2,[sp,#5*4]		@ from future BODY_16_xx
+	eor	r3,r9,r10			@ a^b, b^c in next round
+	ldr	r1,[sp,#2*4]	@ from future BODY_16_xx
+#endif
+	eor	r0,r0,r9,ror#20	@ Sigma0(a)
+	and	r12,r12,r3			@ (b^c)&=(a^b)
+	add	r4,r4,r8			@ d+=h
+	eor	r12,r12,r10			@ Maj(a,b,c)
+	add	r8,r8,r0,ror#2	@ h+=Sigma0(a)
+	@ add	r8,r8,r12			@ h+=Maj(a,b,c)
+#if __ARM_ARCH__>=7
+	@ ldr	r2,[r1],#4			@ 4
+# if 4==15
+	str	r1,[sp,#17*4]			@ make room for r1
+# endif
+	eor	r0,r4,r4,ror#5
+	add	r8,r8,r12			@ h+=Maj(a,b,c) from the past
+	eor	r0,r0,r4,ror#19	@ Sigma1(e)
+# ifndef __ARMEB__
+	rev	r2,r2
+# endif
+#else
+	@ ldrb	r2,[r1,#3]			@ 4
+	add	r8,r8,r12			@ h+=Maj(a,b,c) from the past
+	ldrb	r12,[r1,#2]
+	ldrb	r0,[r1,#1]
+	orr	r2,r2,r12,lsl#8
+	ldrb	r12,[r1],#4
+	orr	r2,r2,r0,lsl#16
+# if 4==15
+	str	r1,[sp,#17*4]			@ make room for r1
+# endif
+	eor	r0,r4,r4,ror#5
+	orr	r2,r2,r12,lsl#24
+	eor	r0,r0,r4,ror#19	@ Sigma1(e)
+#endif
+	ldr	r12,[r14],#4			@ *K256++
+	add	r7,r7,r2			@ h+=X[i]
+	str	r2,[sp,#4*4]
+	eor	r2,r5,r6
+	add	r7,r7,r0,ror#6	@ h+=Sigma1(e)
+	and	r2,r2,r4
+	add	r7,r7,r12			@ h+=K256[i]
+	eor	r2,r2,r6			@ Ch(e,f,g)
+	eor	r0,r8,r8,ror#11
+	add	r7,r7,r2			@ h+=Ch(e,f,g)
+#if 4==31
+	and	r12,r12,#0xff
+	cmp	r12,#0xf2			@ done?
+#endif
+#if 4<15
+# if __ARM_ARCH__>=7
+	ldr	r2,[r1],#4			@ prefetch
+# else
+	ldrb	r2,[r1,#3]
+# endif
+	eor	r12,r8,r9			@ a^b, b^c in next round
+#else
+	ldr	r2,[sp,#6*4]		@ from future BODY_16_xx
+	eor	r12,r8,r9			@ a^b, b^c in next round
+	ldr	r1,[sp,#3*4]	@ from future BODY_16_xx
+#endif
+	eor	r0,r0,r8,ror#20	@ Sigma0(a)
+	and	r3,r3,r12			@ (b^c)&=(a^b)
+	add	r11,r11,r7			@ d+=h
+	eor	r3,r3,r9			@ Maj(a,b,c)
+	add	r7,r7,r0,ror#2	@ h+=Sigma0(a)
+	@ add	r7,r7,r3			@ h+=Maj(a,b,c)
+#if __ARM_ARCH__>=7
+	@ ldr	r2,[r1],#4			@ 5
+# if 5==15
+	str	r1,[sp,#17*4]			@ make room for r1
+# endif
+	eor	r0,r11,r11,ror#5
+	add	r7,r7,r3			@ h+=Maj(a,b,c) from the past
+	eor	r0,r0,r11,ror#19	@ Sigma1(e)
+# ifndef __ARMEB__
+	rev	r2,r2
+# endif
+#else
+	@ ldrb	r2,[r1,#3]			@ 5
+	add	r7,r7,r3			@ h+=Maj(a,b,c) from the past
+	ldrb	r3,[r1,#2]
+	ldrb	r0,[r1,#1]
+	orr	r2,r2,r3,lsl#8
+	ldrb	r3,[r1],#4
+	orr	r2,r2,r0,lsl#16
+# if 5==15
+	str	r1,[sp,#17*4]			@ make room for r1
+# endif
+	eor	r0,r11,r11,ror#5
+	orr	r2,r2,r3,lsl#24
+	eor	r0,r0,r11,ror#19	@ Sigma1(e)
+#endif
+	ldr	r3,[r14],#4			@ *K256++
+	add	r6,r6,r2			@ h+=X[i]
+	str	r2,[sp,#5*4]
+	eor	r2,r4,r5
+	add	r6,r6,r0,ror#6	@ h+=Sigma1(e)
+	and	r2,r2,r11
+	add	r6,r6,r3			@ h+=K256[i]
+	eor	r2,r2,r5			@ Ch(e,f,g)
+	eor	r0,r7,r7,ror#11
+	add	r6,r6,r2			@ h+=Ch(e,f,g)
+#if 5==31
+	and	r3,r3,#0xff
+	cmp	r3,#0xf2			@ done?
+#endif
+#if 5<15
+# if __ARM_ARCH__>=7
+	ldr	r2,[r1],#4			@ prefetch
+# else
+	ldrb	r2,[r1,#3]
+# endif
+	eor	r3,r7,r8			@ a^b, b^c in next round
+#else
+	ldr	r2,[sp,#7*4]		@ from future BODY_16_xx
+	eor	r3,r7,r8			@ a^b, b^c in next round
+	ldr	r1,[sp,#4*4]	@ from future BODY_16_xx
+#endif
+	eor	r0,r0,r7,ror#20	@ Sigma0(a)
+	and	r12,r12,r3			@ (b^c)&=(a^b)
+	add	r10,r10,r6			@ d+=h
+	eor	r12,r12,r8			@ Maj(a,b,c)
+	add	r6,r6,r0,ror#2	@ h+=Sigma0(a)
+	@ add	r6,r6,r12			@ h+=Maj(a,b,c)
+#if __ARM_ARCH__>=7
+	@ ldr	r2,[r1],#4			@ 6
+# if 6==15
+	str	r1,[sp,#17*4]			@ make room for r1
+# endif
+	eor	r0,r10,r10,ror#5
+	add	r6,r6,r12			@ h+=Maj(a,b,c) from the past
+	eor	r0,r0,r10,ror#19	@ Sigma1(e)
+# ifndef __ARMEB__
+	rev	r2,r2
+# endif
+#else
+	@ ldrb	r2,[r1,#3]			@ 6
+	add	r6,r6,r12			@ h+=Maj(a,b,c) from the past
+	ldrb	r12,[r1,#2]
+	ldrb	r0,[r1,#1]
+	orr	r2,r2,r12,lsl#8
+	ldrb	r12,[r1],#4
+	orr	r2,r2,r0,lsl#16
+# if 6==15
+	str	r1,[sp,#17*4]			@ make room for r1
+# endif
+	eor	r0,r10,r10,ror#5
+	orr	r2,r2,r12,lsl#24
+	eor	r0,r0,r10,ror#19	@ Sigma1(e)
+#endif
+	ldr	r12,[r14],#4			@ *K256++
+	add	r5,r5,r2			@ h+=X[i]
+	str	r2,[sp,#6*4]
+	eor	r2,r11,r4
+	add	r5,r5,r0,ror#6	@ h+=Sigma1(e)
+	and	r2,r2,r10
+	add	r5,r5,r12			@ h+=K256[i]
+	eor	r2,r2,r4			@ Ch(e,f,g)
+	eor	r0,r6,r6,ror#11
+	add	r5,r5,r2			@ h+=Ch(e,f,g)
+#if 6==31
+	and	r12,r12,#0xff
+	cmp	r12,#0xf2			@ done?
+#endif
+#if 6<15
+# if __ARM_ARCH__>=7
+	ldr	r2,[r1],#4			@ prefetch
+# else
+	ldrb	r2,[r1,#3]
+# endif
+	eor	r12,r6,r7			@ a^b, b^c in next round
+#else
+	ldr	r2,[sp,#8*4]		@ from future BODY_16_xx
+	eor	r12,r6,r7			@ a^b, b^c in next round
+	ldr	r1,[sp,#5*4]	@ from future BODY_16_xx
+#endif
+	eor	r0,r0,r6,ror#20	@ Sigma0(a)
+	and	r3,r3,r12			@ (b^c)&=(a^b)
+	add	r9,r9,r5			@ d+=h
+	eor	r3,r3,r7			@ Maj(a,b,c)
+	add	r5,r5,r0,ror#2	@ h+=Sigma0(a)
+	@ add	r5,r5,r3			@ h+=Maj(a,b,c)
+#if __ARM_ARCH__>=7
+	@ ldr	r2,[r1],#4			@ 7
+# if 7==15
+	str	r1,[sp,#17*4]			@ make room for r1
+# endif
+	eor	r0,r9,r9,ror#5
+	add	r5,r5,r3			@ h+=Maj(a,b,c) from the past
+	eor	r0,r0,r9,ror#19	@ Sigma1(e)
+# ifndef __ARMEB__
+	rev	r2,r2
+# endif
+#else
+	@ ldrb	r2,[r1,#3]			@ 7
+	add	r5,r5,r3			@ h+=Maj(a,b,c) from the past
+	ldrb	r3,[r1,#2]
+	ldrb	r0,[r1,#1]
+	orr	r2,r2,r3,lsl#8
+	ldrb	r3,[r1],#4
+	orr	r2,r2,r0,lsl#16
+# if 7==15
+	str	r1,[sp,#17*4]			@ make room for r1
+# endif
+	eor	r0,r9,r9,ror#5
+	orr	r2,r2,r3,lsl#24
+	eor	r0,r0,r9,ror#19	@ Sigma1(e)
+#endif
+	ldr	r3,[r14],#4			@ *K256++
+	add	r4,r4,r2			@ h+=X[i]
+	str	r2,[sp,#7*4]
+	eor	r2,r10,r11
+	add	r4,r4,r0,ror#6	@ h+=Sigma1(e)
+	and	r2,r2,r9
+	add	r4,r4,r3			@ h+=K256[i]
+	eor	r2,r2,r11			@ Ch(e,f,g)
+	eor	r0,r5,r5,ror#11
+	add	r4,r4,r2			@ h+=Ch(e,f,g)
+#if 7==31
+	and	r3,r3,#0xff
+	cmp	r3,#0xf2			@ done?
+#endif
+#if 7<15
+# if __ARM_ARCH__>=7
+	ldr	r2,[r1],#4			@ prefetch
+# else
+	ldrb	r2,[r1,#3]
+# endif
+	eor	r3,r5,r6			@ a^b, b^c in next round
+#else
+	ldr	r2,[sp,#9*4]		@ from future BODY_16_xx
+	eor	r3,r5,r6			@ a^b, b^c in next round
+	ldr	r1,[sp,#6*4]	@ from future BODY_16_xx
+#endif
+	eor	r0,r0,r5,ror#20	@ Sigma0(a)
+	and	r12,r12,r3			@ (b^c)&=(a^b)
+	add	r8,r8,r4			@ d+=h
+	eor	r12,r12,r6			@ Maj(a,b,c)
+	add	r4,r4,r0,ror#2	@ h+=Sigma0(a)
+	@ add	r4,r4,r12			@ h+=Maj(a,b,c)
+#if __ARM_ARCH__>=7
+	@ ldr	r2,[r1],#4			@ 8
+# if 8==15
+	str	r1,[sp,#17*4]			@ make room for r1
+# endif
+	eor	r0,r8,r8,ror#5
+	add	r4,r4,r12			@ h+=Maj(a,b,c) from the past
+	eor	r0,r0,r8,ror#19	@ Sigma1(e)
+# ifndef __ARMEB__
+	rev	r2,r2
+# endif
+#else
+	@ ldrb	r2,[r1,#3]			@ 8
+	add	r4,r4,r12			@ h+=Maj(a,b,c) from the past
+	ldrb	r12,[r1,#2]
+	ldrb	r0,[r1,#1]
+	orr	r2,r2,r12,lsl#8
+	ldrb	r12,[r1],#4
+	orr	r2,r2,r0,lsl#16
+# if 8==15
+	str	r1,[sp,#17*4]			@ make room for r1
+# endif
+	eor	r0,r8,r8,ror#5
+	orr	r2,r2,r12,lsl#24
+	eor	r0,r0,r8,ror#19	@ Sigma1(e)
+#endif
+	ldr	r12,[r14],#4			@ *K256++
+	add	r11,r11,r2			@ h+=X[i]
+	str	r2,[sp,#8*4]
+	eor	r2,r9,r10
+	add	r11,r11,r0,ror#6	@ h+=Sigma1(e)
+	and	r2,r2,r8
+	add	r11,r11,r12			@ h+=K256[i]
+	eor	r2,r2,r10			@ Ch(e,f,g)
+	eor	r0,r4,r4,ror#11
+	add	r11,r11,r2			@ h+=Ch(e,f,g)
+#if 8==31
+	and	r12,r12,#0xff
+	cmp	r12,#0xf2			@ done?
+#endif
+#if 8<15
+# if __ARM_ARCH__>=7
+	ldr	r2,[r1],#4			@ prefetch
+# else
+	ldrb	r2,[r1,#3]
+# endif
+	eor	r12,r4,r5			@ a^b, b^c in next round
+#else
+	ldr	r2,[sp,#10*4]		@ from future BODY_16_xx
+	eor	r12,r4,r5			@ a^b, b^c in next round
+	ldr	r1,[sp,#7*4]	@ from future BODY_16_xx
+#endif
+	eor	r0,r0,r4,ror#20	@ Sigma0(a)
+	and	r3,r3,r12			@ (b^c)&=(a^b)
+	add	r7,r7,r11			@ d+=h
+	eor	r3,r3,r5			@ Maj(a,b,c)
+	add	r11,r11,r0,ror#2	@ h+=Sigma0(a)
+	@ add	r11,r11,r3			@ h+=Maj(a,b,c)
+#if __ARM_ARCH__>=7
+	@ ldr	r2,[r1],#4			@ 9
+# if 9==15
+	str	r1,[sp,#17*4]			@ make room for r1
+# endif
+	eor	r0,r7,r7,ror#5
+	add	r11,r11,r3			@ h+=Maj(a,b,c) from the past
+	eor	r0,r0,r7,ror#19	@ Sigma1(e)
+# ifndef __ARMEB__
+	rev	r2,r2
+# endif
+#else
+	@ ldrb	r2,[r1,#3]			@ 9
+	add	r11,r11,r3			@ h+=Maj(a,b,c) from the past
+	ldrb	r3,[r1,#2]
+	ldrb	r0,[r1,#1]
+	orr	r2,r2,r3,lsl#8
+	ldrb	r3,[r1],#4
+	orr	r2,r2,r0,lsl#16
+# if 9==15
+	str	r1,[sp,#17*4]			@ make room for r1
+# endif
+	eor	r0,r7,r7,ror#5
+	orr	r2,r2,r3,lsl#24
+	eor	r0,r0,r7,ror#19	@ Sigma1(e)
+#endif
+	ldr	r3,[r14],#4			@ *K256++
+	add	r10,r10,r2			@ h+=X[i]
+	str	r2,[sp,#9*4]
+	eor	r2,r8,r9
+	add	r10,r10,r0,ror#6	@ h+=Sigma1(e)
+	and	r2,r2,r7
+	add	r10,r10,r3			@ h+=K256[i]
+	eor	r2,r2,r9			@ Ch(e,f,g)
+	eor	r0,r11,r11,ror#11
+	add	r10,r10,r2			@ h+=Ch(e,f,g)
+#if 9==31
+	and	r3,r3,#0xff
+	cmp	r3,#0xf2			@ done?
+#endif
+#if 9<15
+# if __ARM_ARCH__>=7
+	ldr	r2,[r1],#4			@ prefetch
+# else
+	ldrb	r2,[r1,#3]
+# endif
+	eor	r3,r11,r4			@ a^b, b^c in next round
+#else
+	ldr	r2,[sp,#11*4]		@ from future BODY_16_xx
+	eor	r3,r11,r4			@ a^b, b^c in next round
+	ldr	r1,[sp,#8*4]	@ from future BODY_16_xx
+#endif
+	eor	r0,r0,r11,ror#20	@ Sigma0(a)
+	and	r12,r12,r3			@ (b^c)&=(a^b)
+	add	r6,r6,r10			@ d+=h
+	eor	r12,r12,r4			@ Maj(a,b,c)
+	add	r10,r10,r0,ror#2	@ h+=Sigma0(a)
+	@ add	r10,r10,r12			@ h+=Maj(a,b,c)
+#if __ARM_ARCH__>=7
+	@ ldr	r2,[r1],#4			@ 10
+# if 10==15
+	str	r1,[sp,#17*4]			@ make room for r1
+# endif
+	eor	r0,r6,r6,ror#5
+	add	r10,r10,r12			@ h+=Maj(a,b,c) from the past
+	eor	r0,r0,r6,ror#19	@ Sigma1(e)
+# ifndef __ARMEB__
+	rev	r2,r2
+# endif
+#else
+	@ ldrb	r2,[r1,#3]			@ 10
+	add	r10,r10,r12			@ h+=Maj(a,b,c) from the past
+	ldrb	r12,[r1,#2]
+	ldrb	r0,[r1,#1]
+	orr	r2,r2,r12,lsl#8
+	ldrb	r12,[r1],#4
+	orr	r2,r2,r0,lsl#16
+# if 10==15
+	str	r1,[sp,#17*4]			@ make room for r1
+# endif
+	eor	r0,r6,r6,ror#5
+	orr	r2,r2,r12,lsl#24
+	eor	r0,r0,r6,ror#19	@ Sigma1(e)
+#endif
+	ldr	r12,[r14],#4			@ *K256++
+	add	r9,r9,r2			@ h+=X[i]
+	str	r2,[sp,#10*4]
+	eor	r2,r7,r8
+	add	r9,r9,r0,ror#6	@ h+=Sigma1(e)
+	and	r2,r2,r6
+	add	r9,r9,r12			@ h+=K256[i]
+	eor	r2,r2,r8			@ Ch(e,f,g)
+	eor	r0,r10,r10,ror#11
+	add	r9,r9,r2			@ h+=Ch(e,f,g)
+#if 10==31
+	and	r12,r12,#0xff
+	cmp	r12,#0xf2			@ done?
+#endif
+#if 10<15
+# if __ARM_ARCH__>=7
+	ldr	r2,[r1],#4			@ prefetch
+# else
+	ldrb	r2,[r1,#3]
+# endif
+	eor	r12,r10,r11			@ a^b, b^c in next round
+#else
+	ldr	r2,[sp,#12*4]		@ from future BODY_16_xx
+	eor	r12,r10,r11			@ a^b, b^c in next round
+	ldr	r1,[sp,#9*4]	@ from future BODY_16_xx
+#endif
+	eor	r0,r0,r10,ror#20	@ Sigma0(a)
+	and	r3,r3,r12			@ (b^c)&=(a^b)
+	add	r5,r5,r9			@ d+=h
+	eor	r3,r3,r11			@ Maj(a,b,c)
+	add	r9,r9,r0,ror#2	@ h+=Sigma0(a)
+	@ add	r9,r9,r3			@ h+=Maj(a,b,c)
+#if __ARM_ARCH__>=7
+	@ ldr	r2,[r1],#4			@ 11
+# if 11==15
+	str	r1,[sp,#17*4]			@ make room for r1
+# endif
+	eor	r0,r5,r5,ror#5
+	add	r9,r9,r3			@ h+=Maj(a,b,c) from the past
+	eor	r0,r0,r5,ror#19	@ Sigma1(e)
+# ifndef __ARMEB__
+	rev	r2,r2
+# endif
+#else
+	@ ldrb	r2,[r1,#3]			@ 11
+	add	r9,r9,r3			@ h+=Maj(a,b,c) from the past
+	ldrb	r3,[r1,#2]
+	ldrb	r0,[r1,#1]
+	orr	r2,r2,r3,lsl#8
+	ldrb	r3,[r1],#4
+	orr	r2,r2,r0,lsl#16
+# if 11==15
+	str	r1,[sp,#17*4]			@ make room for r1
+# endif
+	eor	r0,r5,r5,ror#5
+	orr	r2,r2,r3,lsl#24
+	eor	r0,r0,r5,ror#19	@ Sigma1(e)
+#endif
+	ldr	r3,[r14],#4			@ *K256++
+	add	r8,r8,r2			@ h+=X[i]
+	str	r2,[sp,#11*4]
+	eor	r2,r6,r7
+	add	r8,r8,r0,ror#6	@ h+=Sigma1(e)
+	and	r2,r2,r5
+	add	r8,r8,r3			@ h+=K256[i]
+	eor	r2,r2,r7			@ Ch(e,f,g)
+	eor	r0,r9,r9,ror#11
+	add	r8,r8,r2			@ h+=Ch(e,f,g)
+#if 11==31
+	and	r3,r3,#0xff
+	cmp	r3,#0xf2			@ done?
+#endif
+#if 11<15
+# if __ARM_ARCH__>=7
+	ldr	r2,[r1],#4			@ prefetch
+# else
+	ldrb	r2,[r1,#3]
+# endif
+	eor	r3,r9,r10			@ a^b, b^c in next round
+#else
+	ldr	r2,[sp,#13*4]		@ from future BODY_16_xx
+	eor	r3,r9,r10			@ a^b, b^c in next round
+	ldr	r1,[sp,#10*4]	@ from future BODY_16_xx
+#endif
+	eor	r0,r0,r9,ror#20	@ Sigma0(a)
+	and	r12,r12,r3			@ (b^c)&=(a^b)
+	add	r4,r4,r8			@ d+=h
+	eor	r12,r12,r10			@ Maj(a,b,c)
+	add	r8,r8,r0,ror#2	@ h+=Sigma0(a)
+	@ add	r8,r8,r12			@ h+=Maj(a,b,c)
+#if __ARM_ARCH__>=7
+	@ ldr	r2,[r1],#4			@ 12
+# if 12==15
+	str	r1,[sp,#17*4]			@ make room for r1
+# endif
+	eor	r0,r4,r4,ror#5
+	add	r8,r8,r12			@ h+=Maj(a,b,c) from the past
+	eor	r0,r0,r4,ror#19	@ Sigma1(e)
+# ifndef __ARMEB__
+	rev	r2,r2
+# endif
+#else
+	@ ldrb	r2,[r1,#3]			@ 12
+	add	r8,r8,r12			@ h+=Maj(a,b,c) from the past
+	ldrb	r12,[r1,#2]
+	ldrb	r0,[r1,#1]
+	orr	r2,r2,r12,lsl#8
+	ldrb	r12,[r1],#4
+	orr	r2,r2,r0,lsl#16
+# if 12==15
+	str	r1,[sp,#17*4]			@ make room for r1
+# endif
+	eor	r0,r4,r4,ror#5
+	orr	r2,r2,r12,lsl#24
+	eor	r0,r0,r4,ror#19	@ Sigma1(e)
+#endif
+	ldr	r12,[r14],#4			@ *K256++
+	add	r7,r7,r2			@ h+=X[i]
+	str	r2,[sp,#12*4]
+	eor	r2,r5,r6
+	add	r7,r7,r0,ror#6	@ h+=Sigma1(e)
+	and	r2,r2,r4
+	add	r7,r7,r12			@ h+=K256[i]
+	eor	r2,r2,r6			@ Ch(e,f,g)
+	eor	r0,r8,r8,ror#11
+	add	r7,r7,r2			@ h+=Ch(e,f,g)
+#if 12==31
+	and	r12,r12,#0xff
+	cmp	r12,#0xf2			@ done?
+#endif
+#if 12<15
+# if __ARM_ARCH__>=7
+	ldr	r2,[r1],#4			@ prefetch
+# else
+	ldrb	r2,[r1,#3]
+# endif
+	eor	r12,r8,r9			@ a^b, b^c in next round
+#else
+	ldr	r2,[sp,#14*4]		@ from future BODY_16_xx
+	eor	r12,r8,r9			@ a^b, b^c in next round
+	ldr	r1,[sp,#11*4]	@ from future BODY_16_xx
+#endif
+	eor	r0,r0,r8,ror#20	@ Sigma0(a)
+	and	r3,r3,r12			@ (b^c)&=(a^b)
+	add	r11,r11,r7			@ d+=h
+	eor	r3,r3,r9			@ Maj(a,b,c)
+	add	r7,r7,r0,ror#2	@ h+=Sigma0(a)
+	@ add	r7,r7,r3			@ h+=Maj(a,b,c)
+#if __ARM_ARCH__>=7
+	@ ldr	r2,[r1],#4			@ 13
+# if 13==15
+	str	r1,[sp,#17*4]			@ make room for r1
+# endif
+	eor	r0,r11,r11,ror#5
+	add	r7,r7,r3			@ h+=Maj(a,b,c) from the past
+	eor	r0,r0,r11,ror#19	@ Sigma1(e)
+# ifndef __ARMEB__
+	rev	r2,r2
+# endif
+#else
+	@ ldrb	r2,[r1,#3]			@ 13
+	add	r7,r7,r3			@ h+=Maj(a,b,c) from the past
+	ldrb	r3,[r1,#2]
+	ldrb	r0,[r1,#1]
+	orr	r2,r2,r3,lsl#8
+	ldrb	r3,[r1],#4
+	orr	r2,r2,r0,lsl#16
+# if 13==15
+	str	r1,[sp,#17*4]			@ make room for r1
+# endif
+	eor	r0,r11,r11,ror#5
+	orr	r2,r2,r3,lsl#24
+	eor	r0,r0,r11,ror#19	@ Sigma1(e)
+#endif
+	ldr	r3,[r14],#4			@ *K256++
+	add	r6,r6,r2			@ h+=X[i]
+	str	r2,[sp,#13*4]
+	eor	r2,r4,r5
+	add	r6,r6,r0,ror#6	@ h+=Sigma1(e)
+	and	r2,r2,r11
+	add	r6,r6,r3			@ h+=K256[i]
+	eor	r2,r2,r5			@ Ch(e,f,g)
+	eor	r0,r7,r7,ror#11
+	add	r6,r6,r2			@ h+=Ch(e,f,g)
+#if 13==31
+	and	r3,r3,#0xff
+	cmp	r3,#0xf2			@ done?
+#endif
+#if 13<15
+# if __ARM_ARCH__>=7
+	ldr	r2,[r1],#4			@ prefetch
+# else
+	ldrb	r2,[r1,#3]
+# endif
+	eor	r3,r7,r8			@ a^b, b^c in next round
+#else
+	ldr	r2,[sp,#15*4]		@ from future BODY_16_xx
+	eor	r3,r7,r8			@ a^b, b^c in next round
+	ldr	r1,[sp,#12*4]	@ from future BODY_16_xx
+#endif
+	eor	r0,r0,r7,ror#20	@ Sigma0(a)
+	and	r12,r12,r3			@ (b^c)&=(a^b)
+	add	r10,r10,r6			@ d+=h
+	eor	r12,r12,r8			@ Maj(a,b,c)
+	add	r6,r6,r0,ror#2	@ h+=Sigma0(a)
+	@ add	r6,r6,r12			@ h+=Maj(a,b,c)
+#if __ARM_ARCH__>=7
+	@ ldr	r2,[r1],#4			@ 14
+# if 14==15
+	str	r1,[sp,#17*4]			@ make room for r1
+# endif
+	eor	r0,r10,r10,ror#5
+	add	r6,r6,r12			@ h+=Maj(a,b,c) from the past
+	eor	r0,r0,r10,ror#19	@ Sigma1(e)
+# ifndef __ARMEB__
+	rev	r2,r2
+# endif
+#else
+	@ ldrb	r2,[r1,#3]			@ 14
+	add	r6,r6,r12			@ h+=Maj(a,b,c) from the past
+	ldrb	r12,[r1,#2]
+	ldrb	r0,[r1,#1]
+	orr	r2,r2,r12,lsl#8
+	ldrb	r12,[r1],#4
+	orr	r2,r2,r0,lsl#16
+# if 14==15
+	str	r1,[sp,#17*4]			@ make room for r1
+# endif
+	eor	r0,r10,r10,ror#5
+	orr	r2,r2,r12,lsl#24
+	eor	r0,r0,r10,ror#19	@ Sigma1(e)
+#endif
+	ldr	r12,[r14],#4			@ *K256++
+	add	r5,r5,r2			@ h+=X[i]
+	str	r2,[sp,#14*4]
+	eor	r2,r11,r4
+	add	r5,r5,r0,ror#6	@ h+=Sigma1(e)
+	and	r2,r2,r10
+	add	r5,r5,r12			@ h+=K256[i]
+	eor	r2,r2,r4			@ Ch(e,f,g)
+	eor	r0,r6,r6,ror#11
+	add	r5,r5,r2			@ h+=Ch(e,f,g)
+#if 14==31
+	and	r12,r12,#0xff
+	cmp	r12,#0xf2			@ done?
+#endif
+#if 14<15
+# if __ARM_ARCH__>=7
+	ldr	r2,[r1],#4			@ prefetch
+# else
+	ldrb	r2,[r1,#3]
+# endif
+	eor	r12,r6,r7			@ a^b, b^c in next round
+#else
+	ldr	r2,[sp,#0*4]		@ from future BODY_16_xx
+	eor	r12,r6,r7			@ a^b, b^c in next round
+	ldr	r1,[sp,#13*4]	@ from future BODY_16_xx
+#endif
+	eor	r0,r0,r6,ror#20	@ Sigma0(a)
+	and	r3,r3,r12			@ (b^c)&=(a^b)
+	add	r9,r9,r5			@ d+=h
+	eor	r3,r3,r7			@ Maj(a,b,c)
+	add	r5,r5,r0,ror#2	@ h+=Sigma0(a)
+	@ add	r5,r5,r3			@ h+=Maj(a,b,c)
+#if __ARM_ARCH__>=7
+	@ ldr	r2,[r1],#4			@ 15
+# if 15==15
+	str	r1,[sp,#17*4]			@ make room for r1
+# endif
+	eor	r0,r9,r9,ror#5
+	add	r5,r5,r3			@ h+=Maj(a,b,c) from the past
+	eor	r0,r0,r9,ror#19	@ Sigma1(e)
+# ifndef __ARMEB__
+	rev	r2,r2
+# endif
+#else
+	@ ldrb	r2,[r1,#3]			@ 15
+	add	r5,r5,r3			@ h+=Maj(a,b,c) from the past
+	ldrb	r3,[r1,#2]
+	ldrb	r0,[r1,#1]
+	orr	r2,r2,r3,lsl#8
+	ldrb	r3,[r1],#4
+	orr	r2,r2,r0,lsl#16
+# if 15==15
+	str	r1,[sp,#17*4]			@ make room for r1
+# endif
+	eor	r0,r9,r9,ror#5
+	orr	r2,r2,r3,lsl#24
+	eor	r0,r0,r9,ror#19	@ Sigma1(e)
+#endif
+	ldr	r3,[r14],#4			@ *K256++
+	add	r4,r4,r2			@ h+=X[i]
+	str	r2,[sp,#15*4]
+	eor	r2,r10,r11
+	add	r4,r4,r0,ror#6	@ h+=Sigma1(e)
+	and	r2,r2,r9
+	add	r4,r4,r3			@ h+=K256[i]
+	eor	r2,r2,r11			@ Ch(e,f,g)
+	eor	r0,r5,r5,ror#11
+	add	r4,r4,r2			@ h+=Ch(e,f,g)
+#if 15==31
+	and	r3,r3,#0xff
+	cmp	r3,#0xf2			@ done?
+#endif
+#if 15<15
+# if __ARM_ARCH__>=7
+	ldr	r2,[r1],#4			@ prefetch
+# else
+	ldrb	r2,[r1,#3]
+# endif
+	eor	r3,r5,r6			@ a^b, b^c in next round
+#else
+	ldr	r2,[sp,#1*4]		@ from future BODY_16_xx
+	eor	r3,r5,r6			@ a^b, b^c in next round
+	ldr	r1,[sp,#14*4]	@ from future BODY_16_xx
+#endif
+	eor	r0,r0,r5,ror#20	@ Sigma0(a)
+	and	r12,r12,r3			@ (b^c)&=(a^b)
+	add	r8,r8,r4			@ d+=h
+	eor	r12,r12,r6			@ Maj(a,b,c)
+	add	r4,r4,r0,ror#2	@ h+=Sigma0(a)
+	@ add	r4,r4,r12			@ h+=Maj(a,b,c)
+.Lrounds_16_xx:
+	@ ldr	r2,[sp,#1*4]		@ 16
+	@ ldr	r1,[sp,#14*4]
+	mov	r0,r2,ror#7
+	add	r4,r4,r12			@ h+=Maj(a,b,c) from the past
+	mov	r12,r1,ror#17
+	eor	r0,r0,r2,ror#18
+	eor	r12,r12,r1,ror#19
+	eor	r0,r0,r2,lsr#3	@ sigma0(X[i+1])
+	ldr	r2,[sp,#0*4]
+	eor	r12,r12,r1,lsr#10	@ sigma1(X[i+14])
+	ldr	r1,[sp,#9*4]
+
+	add	r12,r12,r0
+	eor	r0,r8,r8,ror#5	@ from BODY_00_15
+	add	r2,r2,r12
+	eor	r0,r0,r8,ror#19	@ Sigma1(e)
+	add	r2,r2,r1			@ X[i]
+	ldr	r12,[r14],#4			@ *K256++
+	add	r11,r11,r2			@ h+=X[i]
+	str	r2,[sp,#0*4]
+	eor	r2,r9,r10
+	add	r11,r11,r0,ror#6	@ h+=Sigma1(e)
+	and	r2,r2,r8
+	add	r11,r11,r12			@ h+=K256[i]
+	eor	r2,r2,r10			@ Ch(e,f,g)
+	eor	r0,r4,r4,ror#11
+	add	r11,r11,r2			@ h+=Ch(e,f,g)
+#if 16==31
+	and	r12,r12,#0xff
+	cmp	r12,#0xf2			@ done?
+#endif
+#if 16<15
+# if __ARM_ARCH__>=7
+	ldr	r2,[r1],#4			@ prefetch
+# else
+	ldrb	r2,[r1,#3]
+# endif
+	eor	r12,r4,r5			@ a^b, b^c in next round
+#else
+	ldr	r2,[sp,#2*4]		@ from future BODY_16_xx
+	eor	r12,r4,r5			@ a^b, b^c in next round
+	ldr	r1,[sp,#15*4]	@ from future BODY_16_xx
+#endif
+	eor	r0,r0,r4,ror#20	@ Sigma0(a)
+	and	r3,r3,r12			@ (b^c)&=(a^b)
+	add	r7,r7,r11			@ d+=h
+	eor	r3,r3,r5			@ Maj(a,b,c)
+	add	r11,r11,r0,ror#2	@ h+=Sigma0(a)
+	@ add	r11,r11,r3			@ h+=Maj(a,b,c)
+	@ ldr	r2,[sp,#2*4]		@ 17
+	@ ldr	r1,[sp,#15*4]
+	mov	r0,r2,ror#7
+	add	r11,r11,r3			@ h+=Maj(a,b,c) from the past
+	mov	r3,r1,ror#17
+	eor	r0,r0,r2,ror#18
+	eor	r3,r3,r1,ror#19
+	eor	r0,r0,r2,lsr#3	@ sigma0(X[i+1])
+	ldr	r2,[sp,#1*4]
+	eor	r3,r3,r1,lsr#10	@ sigma1(X[i+14])
+	ldr	r1,[sp,#10*4]
+
+	add	r3,r3,r0
+	eor	r0,r7,r7,ror#5	@ from BODY_00_15
+	add	r2,r2,r3
+	eor	r0,r0,r7,ror#19	@ Sigma1(e)
+	add	r2,r2,r1			@ X[i]
+	ldr	r3,[r14],#4			@ *K256++
+	add	r10,r10,r2			@ h+=X[i]
+	str	r2,[sp,#1*4]
+	eor	r2,r8,r9
+	add	r10,r10,r0,ror#6	@ h+=Sigma1(e)
+	and	r2,r2,r7
+	add	r10,r10,r3			@ h+=K256[i]
+	eor	r2,r2,r9			@ Ch(e,f,g)
+	eor	r0,r11,r11,ror#11
+	add	r10,r10,r2			@ h+=Ch(e,f,g)
+#if 17==31
+	and	r3,r3,#0xff
+	cmp	r3,#0xf2			@ done?
+#endif
+#if 17<15
+# if __ARM_ARCH__>=7
+	ldr	r2,[r1],#4			@ prefetch
+# else
+	ldrb	r2,[r1,#3]
+# endif
+	eor	r3,r11,r4			@ a^b, b^c in next round
+#else
+	ldr	r2,[sp,#3*4]		@ from future BODY_16_xx
+	eor	r3,r11,r4			@ a^b, b^c in next round
+	ldr	r1,[sp,#0*4]	@ from future BODY_16_xx
+#endif
+	eor	r0,r0,r11,ror#20	@ Sigma0(a)
+	and	r12,r12,r3			@ (b^c)&=(a^b)
+	add	r6,r6,r10			@ d+=h
+	eor	r12,r12,r4			@ Maj(a,b,c)
+	add	r10,r10,r0,ror#2	@ h+=Sigma0(a)
+	@ add	r10,r10,r12			@ h+=Maj(a,b,c)
+	@ ldr	r2,[sp,#3*4]		@ 18
+	@ ldr	r1,[sp,#0*4]
+	mov	r0,r2,ror#7
+	add	r10,r10,r12			@ h+=Maj(a,b,c) from the past
+	mov	r12,r1,ror#17
+	eor	r0,r0,r2,ror#18
+	eor	r12,r12,r1,ror#19
+	eor	r0,r0,r2,lsr#3	@ sigma0(X[i+1])
+	ldr	r2,[sp,#2*4]
+	eor	r12,r12,r1,lsr#10	@ sigma1(X[i+14])
+	ldr	r1,[sp,#11*4]
+
+	add	r12,r12,r0
+	eor	r0,r6,r6,ror#5	@ from BODY_00_15
+	add	r2,r2,r12
+	eor	r0,r0,r6,ror#19	@ Sigma1(e)
+	add	r2,r2,r1			@ X[i]
+	ldr	r12,[r14],#4			@ *K256++
+	add	r9,r9,r2			@ h+=X[i]
+	str	r2,[sp,#2*4]
+	eor	r2,r7,r8
+	add	r9,r9,r0,ror#6	@ h+=Sigma1(e)
+	and	r2,r2,r6
+	add	r9,r9,r12			@ h+=K256[i]
+	eor	r2,r2,r8			@ Ch(e,f,g)
+	eor	r0,r10,r10,ror#11
+	add	r9,r9,r2			@ h+=Ch(e,f,g)
+#if 18==31
+	and	r12,r12,#0xff
+	cmp	r12,#0xf2			@ done?
+#endif
+#if 18<15
+# if __ARM_ARCH__>=7
+	ldr	r2,[r1],#4			@ prefetch
+# else
+	ldrb	r2,[r1,#3]
+# endif
+	eor	r12,r10,r11			@ a^b, b^c in next round
+#else
+	ldr	r2,[sp,#4*4]		@ from future BODY_16_xx
+	eor	r12,r10,r11			@ a^b, b^c in next round
+	ldr	r1,[sp,#1*4]	@ from future BODY_16_xx
+#endif
+	eor	r0,r0,r10,ror#20	@ Sigma0(a)
+	and	r3,r3,r12			@ (b^c)&=(a^b)
+	add	r5,r5,r9			@ d+=h
+	eor	r3,r3,r11			@ Maj(a,b,c)
+	add	r9,r9,r0,ror#2	@ h+=Sigma0(a)
+	@ add	r9,r9,r3			@ h+=Maj(a,b,c)
+	@ ldr	r2,[sp,#4*4]		@ 19
+	@ ldr	r1,[sp,#1*4]
+	mov	r0,r2,ror#7
+	add	r9,r9,r3			@ h+=Maj(a,b,c) from the past
+	mov	r3,r1,ror#17
+	eor	r0,r0,r2,ror#18
+	eor	r3,r3,r1,ror#19
+	eor	r0,r0,r2,lsr#3	@ sigma0(X[i+1])
+	ldr	r2,[sp,#3*4]
+	eor	r3,r3,r1,lsr#10	@ sigma1(X[i+14])
+	ldr	r1,[sp,#12*4]
+
+	add	r3,r3,r0
+	eor	r0,r5,r5,ror#5	@ from BODY_00_15
+	add	r2,r2,r3
+	eor	r0,r0,r5,ror#19	@ Sigma1(e)
+	add	r2,r2,r1			@ X[i]
+	ldr	r3,[r14],#4			@ *K256++
+	add	r8,r8,r2			@ h+=X[i]
+	str	r2,[sp,#3*4]
+	eor	r2,r6,r7
+	add	r8,r8,r0,ror#6	@ h+=Sigma1(e)
+	and	r2,r2,r5
+	add	r8,r8,r3			@ h+=K256[i]
+	eor	r2,r2,r7			@ Ch(e,f,g)
+	eor	r0,r9,r9,ror#11
+	add	r8,r8,r2			@ h+=Ch(e,f,g)
+#if 19==31
+	and	r3,r3,#0xff
+	cmp	r3,#0xf2			@ done?
+#endif
+#if 19<15
+# if __ARM_ARCH__>=7
+	ldr	r2,[r1],#4			@ prefetch
+# else
+	ldrb	r2,[r1,#3]
+# endif
+	eor	r3,r9,r10			@ a^b, b^c in next round
+#else
+	ldr	r2,[sp,#5*4]		@ from future BODY_16_xx
+	eor	r3,r9,r10			@ a^b, b^c in next round
+	ldr	r1,[sp,#2*4]	@ from future BODY_16_xx
+#endif
+	eor	r0,r0,r9,ror#20	@ Sigma0(a)
+	and	r12,r12,r3			@ (b^c)&=(a^b)
+	add	r4,r4,r8			@ d+=h
+	eor	r12,r12,r10			@ Maj(a,b,c)
+	add	r8,r8,r0,ror#2	@ h+=Sigma0(a)
+	@ add	r8,r8,r12			@ h+=Maj(a,b,c)
+	@ ldr	r2,[sp,#5*4]		@ 20
+	@ ldr	r1,[sp,#2*4]
+	mov	r0,r2,ror#7
+	add	r8,r8,r12			@ h+=Maj(a,b,c) from the past
+	mov	r12,r1,ror#17
+	eor	r0,r0,r2,ror#18
+	eor	r12,r12,r1,ror#19
+	eor	r0,r0,r2,lsr#3	@ sigma0(X[i+1])
+	ldr	r2,[sp,#4*4]
+	eor	r12,r12,r1,lsr#10	@ sigma1(X[i+14])
+	ldr	r1,[sp,#13*4]
+
+	add	r12,r12,r0
+	eor	r0,r4,r4,ror#5	@ from BODY_00_15
+	add	r2,r2,r12
+	eor	r0,r0,r4,ror#19	@ Sigma1(e)
+	add	r2,r2,r1			@ X[i]
+	ldr	r12,[r14],#4			@ *K256++
+	add	r7,r7,r2			@ h+=X[i]
+	str	r2,[sp,#4*4]
+	eor	r2,r5,r6
+	add	r7,r7,r0,ror#6	@ h+=Sigma1(e)
+	and	r2,r2,r4
+	add	r7,r7,r12			@ h+=K256[i]
+	eor	r2,r2,r6			@ Ch(e,f,g)
+	eor	r0,r8,r8,ror#11
+	add	r7,r7,r2			@ h+=Ch(e,f,g)
+#if 20==31
+	and	r12,r12,#0xff
+	cmp	r12,#0xf2			@ done?
+#endif
+#if 20<15
+# if __ARM_ARCH__>=7
+	ldr	r2,[r1],#4			@ prefetch
+# else
+	ldrb	r2,[r1,#3]
+# endif
+	eor	r12,r8,r9			@ a^b, b^c in next round
+#else
+	ldr	r2,[sp,#6*4]		@ from future BODY_16_xx
+	eor	r12,r8,r9			@ a^b, b^c in next round
+	ldr	r1,[sp,#3*4]	@ from future BODY_16_xx
+#endif
+	eor	r0,r0,r8,ror#20	@ Sigma0(a)
+	and	r3,r3,r12			@ (b^c)&=(a^b)
+	add	r11,r11,r7			@ d+=h
+	eor	r3,r3,r9			@ Maj(a,b,c)
+	add	r7,r7,r0,ror#2	@ h+=Sigma0(a)
+	@ add	r7,r7,r3			@ h+=Maj(a,b,c)
+	@ ldr	r2,[sp,#6*4]		@ 21
+	@ ldr	r1,[sp,#3*4]
+	mov	r0,r2,ror#7
+	add	r7,r7,r3			@ h+=Maj(a,b,c) from the past
+	mov	r3,r1,ror#17
+	eor	r0,r0,r2,ror#18
+	eor	r3,r3,r1,ror#19
+	eor	r0,r0,r2,lsr#3	@ sigma0(X[i+1])
+	ldr	r2,[sp,#5*4]
+	eor	r3,r3,r1,lsr#10	@ sigma1(X[i+14])
+	ldr	r1,[sp,#14*4]
+
+	add	r3,r3,r0
+	eor	r0,r11,r11,ror#5	@ from BODY_00_15
+	add	r2,r2,r3
+	eor	r0,r0,r11,ror#19	@ Sigma1(e)
+	add	r2,r2,r1			@ X[i]
+	ldr	r3,[r14],#4			@ *K256++
+	add	r6,r6,r2			@ h+=X[i]
+	str	r2,[sp,#5*4]
+	eor	r2,r4,r5
+	add	r6,r6,r0,ror#6	@ h+=Sigma1(e)
+	and	r2,r2,r11
+	add	r6,r6,r3			@ h+=K256[i]
+	eor	r2,r2,r5			@ Ch(e,f,g)
+	eor	r0,r7,r7,ror#11
+	add	r6,r6,r2			@ h+=Ch(e,f,g)
+#if 21==31
+	and	r3,r3,#0xff
+	cmp	r3,#0xf2			@ done?
+#endif
+#if 21<15
+# if __ARM_ARCH__>=7
+	ldr	r2,[r1],#4			@ prefetch
+# else
+	ldrb	r2,[r1,#3]
+# endif
+	eor	r3,r7,r8			@ a^b, b^c in next round
+#else
+	ldr	r2,[sp,#7*4]		@ from future BODY_16_xx
+	eor	r3,r7,r8			@ a^b, b^c in next round
+	ldr	r1,[sp,#4*4]	@ from future BODY_16_xx
+#endif
+	eor	r0,r0,r7,ror#20	@ Sigma0(a)
+	and	r12,r12,r3			@ (b^c)&=(a^b)
+	add	r10,r10,r6			@ d+=h
+	eor	r12,r12,r8			@ Maj(a,b,c)
+	add	r6,r6,r0,ror#2	@ h+=Sigma0(a)
+	@ add	r6,r6,r12			@ h+=Maj(a,b,c)
+	@ ldr	r2,[sp,#7*4]		@ 22
+	@ ldr	r1,[sp,#4*4]
+	mov	r0,r2,ror#7
+	add	r6,r6,r12			@ h+=Maj(a,b,c) from the past
+	mov	r12,r1,ror#17
+	eor	r0,r0,r2,ror#18
+	eor	r12,r12,r1,ror#19
+	eor	r0,r0,r2,lsr#3	@ sigma0(X[i+1])
+	ldr	r2,[sp,#6*4]
+	eor	r12,r12,r1,lsr#10	@ sigma1(X[i+14])
+	ldr	r1,[sp,#15*4]
+
+	add	r12,r12,r0
+	eor	r0,r10,r10,ror#5	@ from BODY_00_15
+	add	r2,r2,r12
+	eor	r0,r0,r10,ror#19	@ Sigma1(e)
+	add	r2,r2,r1			@ X[i]
+	ldr	r12,[r14],#4			@ *K256++
+	add	r5,r5,r2			@ h+=X[i]
+	str	r2,[sp,#6*4]
+	eor	r2,r11,r4
+	add	r5,r5,r0,ror#6	@ h+=Sigma1(e)
+	and	r2,r2,r10
+	add	r5,r5,r12			@ h+=K256[i]
+	eor	r2,r2,r4			@ Ch(e,f,g)
+	eor	r0,r6,r6,ror#11
+	add	r5,r5,r2			@ h+=Ch(e,f,g)
+#if 22==31
+	and	r12,r12,#0xff
+	cmp	r12,#0xf2			@ done?
+#endif
+#if 22<15
+# if __ARM_ARCH__>=7
+	ldr	r2,[r1],#4			@ prefetch
+# else
+	ldrb	r2,[r1,#3]
+# endif
+	eor	r12,r6,r7			@ a^b, b^c in next round
+#else
+	ldr	r2,[sp,#8*4]		@ from future BODY_16_xx
+	eor	r12,r6,r7			@ a^b, b^c in next round
+	ldr	r1,[sp,#5*4]	@ from future BODY_16_xx
+#endif
+	eor	r0,r0,r6,ror#20	@ Sigma0(a)
+	and	r3,r3,r12			@ (b^c)&=(a^b)
+	add	r9,r9,r5			@ d+=h
+	eor	r3,r3,r7			@ Maj(a,b,c)
+	add	r5,r5,r0,ror#2	@ h+=Sigma0(a)
+	@ add	r5,r5,r3			@ h+=Maj(a,b,c)
+	@ ldr	r2,[sp,#8*4]		@ 23
+	@ ldr	r1,[sp,#5*4]
+	mov	r0,r2,ror#7
+	add	r5,r5,r3			@ h+=Maj(a,b,c) from the past
+	mov	r3,r1,ror#17
+	eor	r0,r0,r2,ror#18
+	eor	r3,r3,r1,ror#19
+	eor	r0,r0,r2,lsr#3	@ sigma0(X[i+1])
+	ldr	r2,[sp,#7*4]
+	eor	r3,r3,r1,lsr#10	@ sigma1(X[i+14])
+	ldr	r1,[sp,#0*4]
+
+	add	r3,r3,r0
+	eor	r0,r9,r9,ror#5	@ from BODY_00_15
+	add	r2,r2,r3
+	eor	r0,r0,r9,ror#19	@ Sigma1(e)
+	add	r2,r2,r1			@ X[i]
+	ldr	r3,[r14],#4			@ *K256++
+	add	r4,r4,r2			@ h+=X[i]
+	str	r2,[sp,#7*4]
+	eor	r2,r10,r11
+	add	r4,r4,r0,ror#6	@ h+=Sigma1(e)
+	and	r2,r2,r9
+	add	r4,r4,r3			@ h+=K256[i]
+	eor	r2,r2,r11			@ Ch(e,f,g)
+	eor	r0,r5,r5,ror#11
+	add	r4,r4,r2			@ h+=Ch(e,f,g)
+#if 23==31
+	and	r3,r3,#0xff
+	cmp	r3,#0xf2			@ done?
+#endif
+#if 23<15
+# if __ARM_ARCH__>=7
+	ldr	r2,[r1],#4			@ prefetch
+# else
+	ldrb	r2,[r1,#3]
+# endif
+	eor	r3,r5,r6			@ a^b, b^c in next round
+#else
+	ldr	r2,[sp,#9*4]		@ from future BODY_16_xx
+	eor	r3,r5,r6			@ a^b, b^c in next round
+	ldr	r1,[sp,#6*4]	@ from future BODY_16_xx
+#endif
+	eor	r0,r0,r5,ror#20	@ Sigma0(a)
+	and	r12,r12,r3			@ (b^c)&=(a^b)
+	add	r8,r8,r4			@ d+=h
+	eor	r12,r12,r6			@ Maj(a,b,c)
+	add	r4,r4,r0,ror#2	@ h+=Sigma0(a)
+	@ add	r4,r4,r12			@ h+=Maj(a,b,c)
+	@ ldr	r2,[sp,#9*4]		@ 24
+	@ ldr	r1,[sp,#6*4]
+	mov	r0,r2,ror#7
+	add	r4,r4,r12			@ h+=Maj(a,b,c) from the past
+	mov	r12,r1,ror#17
+	eor	r0,r0,r2,ror#18
+	eor	r12,r12,r1,ror#19
+	eor	r0,r0,r2,lsr#3	@ sigma0(X[i+1])
+	ldr	r2,[sp,#8*4]
+	eor	r12,r12,r1,lsr#10	@ sigma1(X[i+14])
+	ldr	r1,[sp,#1*4]
+
+	add	r12,r12,r0
+	eor	r0,r8,r8,ror#5	@ from BODY_00_15
+	add	r2,r2,r12
+	eor	r0,r0,r8,ror#19	@ Sigma1(e)
+	add	r2,r2,r1			@ X[i]
+	ldr	r12,[r14],#4			@ *K256++
+	add	r11,r11,r2			@ h+=X[i]
+	str	r2,[sp,#8*4]
+	eor	r2,r9,r10
+	add	r11,r11,r0,ror#6	@ h+=Sigma1(e)
+	and	r2,r2,r8
+	add	r11,r11,r12			@ h+=K256[i]
+	eor	r2,r2,r10			@ Ch(e,f,g)
+	eor	r0,r4,r4,ror#11
+	add	r11,r11,r2			@ h+=Ch(e,f,g)
+#if 24==31
+	and	r12,r12,#0xff
+	cmp	r12,#0xf2			@ done?
+#endif
+#if 24<15
+# if __ARM_ARCH__>=7
+	ldr	r2,[r1],#4			@ prefetch
+# else
+	ldrb	r2,[r1,#3]
+# endif
+	eor	r12,r4,r5			@ a^b, b^c in next round
+#else
+	ldr	r2,[sp,#10*4]		@ from future BODY_16_xx
+	eor	r12,r4,r5			@ a^b, b^c in next round
+	ldr	r1,[sp,#7*4]	@ from future BODY_16_xx
+#endif
+	eor	r0,r0,r4,ror#20	@ Sigma0(a)
+	and	r3,r3,r12			@ (b^c)&=(a^b)
+	add	r7,r7,r11			@ d+=h
+	eor	r3,r3,r5			@ Maj(a,b,c)
+	add	r11,r11,r0,ror#2	@ h+=Sigma0(a)
+	@ add	r11,r11,r3			@ h+=Maj(a,b,c)
+	@ ldr	r2,[sp,#10*4]		@ 25
+	@ ldr	r1,[sp,#7*4]
+	mov	r0,r2,ror#7
+	add	r11,r11,r3			@ h+=Maj(a,b,c) from the past
+	mov	r3,r1,ror#17
+	eor	r0,r0,r2,ror#18
+	eor	r3,r3,r1,ror#19
+	eor	r0,r0,r2,lsr#3	@ sigma0(X[i+1])
+	ldr	r2,[sp,#9*4]
+	eor	r3,r3,r1,lsr#10	@ sigma1(X[i+14])
+	ldr	r1,[sp,#2*4]
+
+	add	r3,r3,r0
+	eor	r0,r7,r7,ror#5	@ from BODY_00_15
+	add	r2,r2,r3
+	eor	r0,r0,r7,ror#19	@ Sigma1(e)
+	add	r2,r2,r1			@ X[i]
+	ldr	r3,[r14],#4			@ *K256++
+	add	r10,r10,r2			@ h+=X[i]
+	str	r2,[sp,#9*4]
+	eor	r2,r8,r9
+	add	r10,r10,r0,ror#6	@ h+=Sigma1(e)
+	and	r2,r2,r7
+	add	r10,r10,r3			@ h+=K256[i]
+	eor	r2,r2,r9			@ Ch(e,f,g)
+	eor	r0,r11,r11,ror#11
+	add	r10,r10,r2			@ h+=Ch(e,f,g)
+#if 25==31
+	and	r3,r3,#0xff
+	cmp	r3,#0xf2			@ done?
+#endif
+#if 25<15
+# if __ARM_ARCH__>=7
+	ldr	r2,[r1],#4			@ prefetch
+# else
+	ldrb	r2,[r1,#3]
+# endif
+	eor	r3,r11,r4			@ a^b, b^c in next round
+#else
+	ldr	r2,[sp,#11*4]		@ from future BODY_16_xx
+	eor	r3,r11,r4			@ a^b, b^c in next round
+	ldr	r1,[sp,#8*4]	@ from future BODY_16_xx
+#endif
+	eor	r0,r0,r11,ror#20	@ Sigma0(a)
+	and	r12,r12,r3			@ (b^c)&=(a^b)
+	add	r6,r6,r10			@ d+=h
+	eor	r12,r12,r4			@ Maj(a,b,c)
+	add	r10,r10,r0,ror#2	@ h+=Sigma0(a)
+	@ add	r10,r10,r12			@ h+=Maj(a,b,c)
+	@ ldr	r2,[sp,#11*4]		@ 26
+	@ ldr	r1,[sp,#8*4]
+	mov	r0,r2,ror#7
+	add	r10,r10,r12			@ h+=Maj(a,b,c) from the past
+	mov	r12,r1,ror#17
+	eor	r0,r0,r2,ror#18
+	eor	r12,r12,r1,ror#19
+	eor	r0,r0,r2,lsr#3	@ sigma0(X[i+1])
+	ldr	r2,[sp,#10*4]
+	eor	r12,r12,r1,lsr#10	@ sigma1(X[i+14])
+	ldr	r1,[sp,#3*4]
+
+	add	r12,r12,r0
+	eor	r0,r6,r6,ror#5	@ from BODY_00_15
+	add	r2,r2,r12
+	eor	r0,r0,r6,ror#19	@ Sigma1(e)
+	add	r2,r2,r1			@ X[i]
+	ldr	r12,[r14],#4			@ *K256++
+	add	r9,r9,r2			@ h+=X[i]
+	str	r2,[sp,#10*4]
+	eor	r2,r7,r8
+	add	r9,r9,r0,ror#6	@ h+=Sigma1(e)
+	and	r2,r2,r6
+	add	r9,r9,r12			@ h+=K256[i]
+	eor	r2,r2,r8			@ Ch(e,f,g)
+	eor	r0,r10,r10,ror#11
+	add	r9,r9,r2			@ h+=Ch(e,f,g)
+#if 26==31
+	and	r12,r12,#0xff
+	cmp	r12,#0xf2			@ done?
+#endif
+#if 26<15
+# if __ARM_ARCH__>=7
+	ldr	r2,[r1],#4			@ prefetch
+# else
+	ldrb	r2,[r1,#3]
+# endif
+	eor	r12,r10,r11			@ a^b, b^c in next round
+#else
+	ldr	r2,[sp,#12*4]		@ from future BODY_16_xx
+	eor	r12,r10,r11			@ a^b, b^c in next round
+	ldr	r1,[sp,#9*4]	@ from future BODY_16_xx
+#endif
+	eor	r0,r0,r10,ror#20	@ Sigma0(a)
+	and	r3,r3,r12			@ (b^c)&=(a^b)
+	add	r5,r5,r9			@ d+=h
+	eor	r3,r3,r11			@ Maj(a,b,c)
+	add	r9,r9,r0,ror#2	@ h+=Sigma0(a)
+	@ add	r9,r9,r3			@ h+=Maj(a,b,c)
+	@ ldr	r2,[sp,#12*4]		@ 27
+	@ ldr	r1,[sp,#9*4]
+	mov	r0,r2,ror#7
+	add	r9,r9,r3			@ h+=Maj(a,b,c) from the past
+	mov	r3,r1,ror#17
+	eor	r0,r0,r2,ror#18
+	eor	r3,r3,r1,ror#19
+	eor	r0,r0,r2,lsr#3	@ sigma0(X[i+1])
+	ldr	r2,[sp,#11*4]
+	eor	r3,r3,r1,lsr#10	@ sigma1(X[i+14])
+	ldr	r1,[sp,#4*4]
+
+	add	r3,r3,r0
+	eor	r0,r5,r5,ror#5	@ from BODY_00_15
+	add	r2,r2,r3
+	eor	r0,r0,r5,ror#19	@ Sigma1(e)
+	add	r2,r2,r1			@ X[i]
+	ldr	r3,[r14],#4			@ *K256++
+	add	r8,r8,r2			@ h+=X[i]
+	str	r2,[sp,#11*4]
+	eor	r2,r6,r7
+	add	r8,r8,r0,ror#6	@ h+=Sigma1(e)
+	and	r2,r2,r5
+	add	r8,r8,r3			@ h+=K256[i]
+	eor	r2,r2,r7			@ Ch(e,f,g)
+	eor	r0,r9,r9,ror#11
+	add	r8,r8,r2			@ h+=Ch(e,f,g)
+#if 27==31
+	and	r3,r3,#0xff
+	cmp	r3,#0xf2			@ done?
+#endif
+#if 27<15
+# if __ARM_ARCH__>=7
+	ldr	r2,[r1],#4			@ prefetch
+# else
+	ldrb	r2,[r1,#3]
+# endif
+	eor	r3,r9,r10			@ a^b, b^c in next round
+#else
+	ldr	r2,[sp,#13*4]		@ from future BODY_16_xx
+	eor	r3,r9,r10			@ a^b, b^c in next round
+	ldr	r1,[sp,#10*4]	@ from future BODY_16_xx
+#endif
+	eor	r0,r0,r9,ror#20	@ Sigma0(a)
+	and	r12,r12,r3			@ (b^c)&=(a^b)
+	add	r4,r4,r8			@ d+=h
+	eor	r12,r12,r10			@ Maj(a,b,c)
+	add	r8,r8,r0,ror#2	@ h+=Sigma0(a)
+	@ add	r8,r8,r12			@ h+=Maj(a,b,c)
+	@ ldr	r2,[sp,#13*4]		@ 28
+	@ ldr	r1,[sp,#10*4]
+	mov	r0,r2,ror#7
+	add	r8,r8,r12			@ h+=Maj(a,b,c) from the past
+	mov	r12,r1,ror#17
+	eor	r0,r0,r2,ror#18
+	eor	r12,r12,r1,ror#19
+	eor	r0,r0,r2,lsr#3	@ sigma0(X[i+1])
+	ldr	r2,[sp,#12*4]
+	eor	r12,r12,r1,lsr#10	@ sigma1(X[i+14])
+	ldr	r1,[sp,#5*4]
+
+	add	r12,r12,r0
+	eor	r0,r4,r4,ror#5	@ from BODY_00_15
+	add	r2,r2,r12
+	eor	r0,r0,r4,ror#19	@ Sigma1(e)
+	add	r2,r2,r1			@ X[i]
+	ldr	r12,[r14],#4			@ *K256++
+	add	r7,r7,r2			@ h+=X[i]
+	str	r2,[sp,#12*4]
+	eor	r2,r5,r6
+	add	r7,r7,r0,ror#6	@ h+=Sigma1(e)
+	and	r2,r2,r4
+	add	r7,r7,r12			@ h+=K256[i]
+	eor	r2,r2,r6			@ Ch(e,f,g)
+	eor	r0,r8,r8,ror#11
+	add	r7,r7,r2			@ h+=Ch(e,f,g)
+#if 28==31
+	and	r12,r12,#0xff
+	cmp	r12,#0xf2			@ done?
+#endif
+#if 28<15
+# if __ARM_ARCH__>=7
+	ldr	r2,[r1],#4			@ prefetch
+# else
+	ldrb	r2,[r1,#3]
+# endif
+	eor	r12,r8,r9			@ a^b, b^c in next round
+#else
+	ldr	r2,[sp,#14*4]		@ from future BODY_16_xx
+	eor	r12,r8,r9			@ a^b, b^c in next round
+	ldr	r1,[sp,#11*4]	@ from future BODY_16_xx
+#endif
+	eor	r0,r0,r8,ror#20	@ Sigma0(a)
+	and	r3,r3,r12			@ (b^c)&=(a^b)
+	add	r11,r11,r7			@ d+=h
+	eor	r3,r3,r9			@ Maj(a,b,c)
+	add	r7,r7,r0,ror#2	@ h+=Sigma0(a)
+	@ add	r7,r7,r3			@ h+=Maj(a,b,c)
+	@ ldr	r2,[sp,#14*4]		@ 29
+	@ ldr	r1,[sp,#11*4]
+	mov	r0,r2,ror#7
+	add	r7,r7,r3			@ h+=Maj(a,b,c) from the past
+	mov	r3,r1,ror#17
+	eor	r0,r0,r2,ror#18
+	eor	r3,r3,r1,ror#19
+	eor	r0,r0,r2,lsr#3	@ sigma0(X[i+1])
+	ldr	r2,[sp,#13*4]
+	eor	r3,r3,r1,lsr#10	@ sigma1(X[i+14])
+	ldr	r1,[sp,#6*4]
+
+	add	r3,r3,r0
+	eor	r0,r11,r11,ror#5	@ from BODY_00_15
+	add	r2,r2,r3
+	eor	r0,r0,r11,ror#19	@ Sigma1(e)
+	add	r2,r2,r1			@ X[i]
+	ldr	r3,[r14],#4			@ *K256++
+	add	r6,r6,r2			@ h+=X[i]
+	str	r2,[sp,#13*4]
+	eor	r2,r4,r5
+	add	r6,r6,r0,ror#6	@ h+=Sigma1(e)
+	and	r2,r2,r11
+	add	r6,r6,r3			@ h+=K256[i]
+	eor	r2,r2,r5			@ Ch(e,f,g)
+	eor	r0,r7,r7,ror#11
+	add	r6,r6,r2			@ h+=Ch(e,f,g)
+#if 29==31
+	and	r3,r3,#0xff
+	cmp	r3,#0xf2			@ done?
+#endif
+#if 29<15
+# if __ARM_ARCH__>=7
+	ldr	r2,[r1],#4			@ prefetch
+# else
+	ldrb	r2,[r1,#3]
+# endif
+	eor	r3,r7,r8			@ a^b, b^c in next round
+#else
+	ldr	r2,[sp,#15*4]		@ from future BODY_16_xx
+	eor	r3,r7,r8			@ a^b, b^c in next round
+	ldr	r1,[sp,#12*4]	@ from future BODY_16_xx
+#endif
+	eor	r0,r0,r7,ror#20	@ Sigma0(a)
+	and	r12,r12,r3			@ (b^c)&=(a^b)
+	add	r10,r10,r6			@ d+=h
+	eor	r12,r12,r8			@ Maj(a,b,c)
+	add	r6,r6,r0,ror#2	@ h+=Sigma0(a)
+	@ add	r6,r6,r12			@ h+=Maj(a,b,c)
+	@ ldr	r2,[sp,#15*4]		@ 30
+	@ ldr	r1,[sp,#12*4]
+	mov	r0,r2,ror#7
+	add	r6,r6,r12			@ h+=Maj(a,b,c) from the past
+	mov	r12,r1,ror#17
+	eor	r0,r0,r2,ror#18
+	eor	r12,r12,r1,ror#19
+	eor	r0,r0,r2,lsr#3	@ sigma0(X[i+1])
+	ldr	r2,[sp,#14*4]
+	eor	r12,r12,r1,lsr#10	@ sigma1(X[i+14])
+	ldr	r1,[sp,#7*4]
+
+	add	r12,r12,r0
+	eor	r0,r10,r10,ror#5	@ from BODY_00_15
+	add	r2,r2,r12
+	eor	r0,r0,r10,ror#19	@ Sigma1(e)
+	add	r2,r2,r1			@ X[i]
+	ldr	r12,[r14],#4			@ *K256++
+	add	r5,r5,r2			@ h+=X[i]
+	str	r2,[sp,#14*4]
+	eor	r2,r11,r4
+	add	r5,r5,r0,ror#6	@ h+=Sigma1(e)
+	and	r2,r2,r10
+	add	r5,r5,r12			@ h+=K256[i]
+	eor	r2,r2,r4			@ Ch(e,f,g)
+	eor	r0,r6,r6,ror#11
+	add	r5,r5,r2			@ h+=Ch(e,f,g)
+#if 30==31
+	and	r12,r12,#0xff
+	cmp	r12,#0xf2			@ done?
+#endif
+#if 30<15
+# if __ARM_ARCH__>=7
+	ldr	r2,[r1],#4			@ prefetch
+# else
+	ldrb	r2,[r1,#3]
+# endif
+	eor	r12,r6,r7			@ a^b, b^c in next round
+#else
+	ldr	r2,[sp,#0*4]		@ from future BODY_16_xx
+	eor	r12,r6,r7			@ a^b, b^c in next round
+	ldr	r1,[sp,#13*4]	@ from future BODY_16_xx
+#endif
+	eor	r0,r0,r6,ror#20	@ Sigma0(a)
+	and	r3,r3,r12			@ (b^c)&=(a^b)
+	add	r9,r9,r5			@ d+=h
+	eor	r3,r3,r7			@ Maj(a,b,c)
+	add	r5,r5,r0,ror#2	@ h+=Sigma0(a)
+	@ add	r5,r5,r3			@ h+=Maj(a,b,c)
+	@ ldr	r2,[sp,#0*4]		@ 31
+	@ ldr	r1,[sp,#13*4]
+	mov	r0,r2,ror#7
+	add	r5,r5,r3			@ h+=Maj(a,b,c) from the past
+	mov	r3,r1,ror#17
+	eor	r0,r0,r2,ror#18
+	eor	r3,r3,r1,ror#19
+	eor	r0,r0,r2,lsr#3	@ sigma0(X[i+1])
+	ldr	r2,[sp,#15*4]
+	eor	r3,r3,r1,lsr#10	@ sigma1(X[i+14])
+	ldr	r1,[sp,#8*4]
+
+	add	r3,r3,r0
+	eor	r0,r9,r9,ror#5	@ from BODY_00_15
+	add	r2,r2,r3
+	eor	r0,r0,r9,ror#19	@ Sigma1(e)
+	add	r2,r2,r1			@ X[i]
+	ldr	r3,[r14],#4			@ *K256++
+	add	r4,r4,r2			@ h+=X[i]
+	str	r2,[sp,#15*4]
+	eor	r2,r10,r11
+	add	r4,r4,r0,ror#6	@ h+=Sigma1(e)
+	and	r2,r2,r9
+	add	r4,r4,r3			@ h+=K256[i]
+	eor	r2,r2,r11			@ Ch(e,f,g)
+	eor	r0,r5,r5,ror#11
+	add	r4,r4,r2			@ h+=Ch(e,f,g)
+#if 31==31
+	and	r3,r3,#0xff
+	cmp	r3,#0xf2			@ done?
+#endif
+#if 31<15
+# if __ARM_ARCH__>=7
+	ldr	r2,[r1],#4			@ prefetch
+# else
+	ldrb	r2,[r1,#3]
+# endif
+	eor	r3,r5,r6			@ a^b, b^c in next round
+#else
+	ldr	r2,[sp,#1*4]		@ from future BODY_16_xx
+	eor	r3,r5,r6			@ a^b, b^c in next round
+	ldr	r1,[sp,#14*4]	@ from future BODY_16_xx
+#endif
+	eor	r0,r0,r5,ror#20	@ Sigma0(a)
+	and	r12,r12,r3			@ (b^c)&=(a^b)
+	add	r8,r8,r4			@ d+=h
+	eor	r12,r12,r6			@ Maj(a,b,c)
+	add	r4,r4,r0,ror#2	@ h+=Sigma0(a)
+	@ add	r4,r4,r12			@ h+=Maj(a,b,c)
+#if __ARM_ARCH__>=7
+	ite	eq			@ Thumb2 thing, sanity check in ARM
+#endif
+	ldreq	r3,[sp,#16*4]		@ pull ctx
+	bne	.Lrounds_16_xx
+
+	add	r4,r4,r12		@ h+=Maj(a,b,c) from the past
+	ldr	r0,[r3,#0]
+	ldr	r2,[r3,#4]
+	ldr	r12,[r3,#8]
+	add	r4,r4,r0
+	ldr	r0,[r3,#12]
+	add	r5,r5,r2
+	ldr	r2,[r3,#16]
+	add	r6,r6,r12
+	ldr	r12,[r3,#20]
+	add	r7,r7,r0
+	ldr	r0,[r3,#24]
+	add	r8,r8,r2
+	ldr	r2,[r3,#28]
+	add	r9,r9,r12
+	ldr	r1,[sp,#17*4]		@ pull inp
+	ldr	r12,[sp,#18*4]		@ pull inp+len
+	add	r10,r10,r0
+	add	r11,r11,r2
+	stmia	r3,{r4,r5,r6,r7,r8,r9,r10,r11}
+	cmp	r1,r12
+	sub	r14,r14,#256	@ rewind Ktbl
+	bne	.Loop
+
+	add	sp,sp,#19*4	@ destroy frame
+#if __ARM_ARCH__>=5
+	ldmia	sp!,{r4,r5,r6,r7,r8,r9,r10,r11,pc}
+#else
+	ldmia	sp!,{r4,r5,r6,r7,r8,r9,r10,r11,lr}
+	tst	lr,#1
+	moveq	pc,lr			@ be binary compatible with V4, yet
+.word	0xe12fff1e			@ interoperable with Thumb ISA:-)
+#endif
+.size	sha256_block_data_order,.-sha256_block_data_order
+#if __ARM_MAX_ARCH__>=7
+.arch	armv7-a
+.fpu	neon
+
+.globl	sha256_block_data_order_neon
+.hidden	sha256_block_data_order_neon
+.type	sha256_block_data_order_neon,%function
+.align	5
+.skip	16
+sha256_block_data_order_neon:
+.LNEON:
+	stmdb	sp!,{r4,r5,r6,r7,r8,r9,r10,r11,r12,lr}
+
+	sub	r11,sp,#16*4+16
+	adr	r14,K256
+	bic	r11,r11,#15		@ align for 128-bit stores
+	mov	r12,sp
+	mov	sp,r11			@ alloca
+	add	r2,r1,r2,lsl#6	@ len to point at the end of inp
+
+	vld1.8	{q0},[r1]!
+	vld1.8	{q1},[r1]!
+	vld1.8	{q2},[r1]!
+	vld1.8	{q3},[r1]!
+	vld1.32	{q8},[r14,:128]!
+	vld1.32	{q9},[r14,:128]!
+	vld1.32	{q10},[r14,:128]!
+	vld1.32	{q11},[r14,:128]!
+	vrev32.8	q0,q0		@ yes, even on
+	str	r0,[sp,#64]
+	vrev32.8	q1,q1		@ big-endian
+	str	r1,[sp,#68]
+	mov	r1,sp
+	vrev32.8	q2,q2
+	str	r2,[sp,#72]
+	vrev32.8	q3,q3
+	str	r12,[sp,#76]		@ save original sp
+	vadd.i32	q8,q8,q0
+	vadd.i32	q9,q9,q1
+	vst1.32	{q8},[r1,:128]!
+	vadd.i32	q10,q10,q2
+	vst1.32	{q9},[r1,:128]!
+	vadd.i32	q11,q11,q3
+	vst1.32	{q10},[r1,:128]!
+	vst1.32	{q11},[r1,:128]!
+
+	ldmia	r0,{r4,r5,r6,r7,r8,r9,r10,r11}
+	sub	r1,r1,#64
+	ldr	r2,[sp,#0]
+	eor	r12,r12,r12
+	eor	r3,r5,r6
+	b	.L_00_48
+
+.align	4
+.L_00_48:
+	vext.8	q8,q0,q1,#4
+	add	r11,r11,r2
+	eor	r2,r9,r10
+	eor	r0,r8,r8,ror#5
+	vext.8	q9,q2,q3,#4
+	add	r4,r4,r12
+	and	r2,r2,r8
+	eor	r12,r0,r8,ror#19
+	vshr.u32	q10,q8,#7
+	eor	r0,r4,r4,ror#11
+	eor	r2,r2,r10
+	vadd.i32	q0,q0,q9
+	add	r11,r11,r12,ror#6
+	eor	r12,r4,r5
+	vshr.u32	q9,q8,#3
+	eor	r0,r0,r4,ror#20
+	add	r11,r11,r2
+	vsli.32	q10,q8,#25
+	ldr	r2,[sp,#4]
+	and	r3,r3,r12
+	vshr.u32	q11,q8,#18
+	add	r7,r7,r11
+	add	r11,r11,r0,ror#2
+	eor	r3,r3,r5
+	veor	q9,q9,q10
+	add	r10,r10,r2
+	vsli.32	q11,q8,#14
+	eor	r2,r8,r9
+	eor	r0,r7,r7,ror#5
+	vshr.u32	d24,d7,#17
+	add	r11,r11,r3
+	and	r2,r2,r7
+	veor	q9,q9,q11
+	eor	r3,r0,r7,ror#19
+	eor	r0,r11,r11,ror#11
+	vsli.32	d24,d7,#15
+	eor	r2,r2,r9
+	add	r10,r10,r3,ror#6
+	vshr.u32	d25,d7,#10
+	eor	r3,r11,r4
+	eor	r0,r0,r11,ror#20
+	vadd.i32	q0,q0,q9
+	add	r10,r10,r2
+	ldr	r2,[sp,#8]
+	veor	d25,d25,d24
+	and	r12,r12,r3
+	add	r6,r6,r10
+	vshr.u32	d24,d7,#19
+	add	r10,r10,r0,ror#2
+	eor	r12,r12,r4
+	vsli.32	d24,d7,#13
+	add	r9,r9,r2
+	eor	r2,r7,r8
+	veor	d25,d25,d24
+	eor	r0,r6,r6,ror#5
+	add	r10,r10,r12
+	vadd.i32	d0,d0,d25
+	and	r2,r2,r6
+	eor	r12,r0,r6,ror#19
+	vshr.u32	d24,d0,#17
+	eor	r0,r10,r10,ror#11
+	eor	r2,r2,r8
+	vsli.32	d24,d0,#15
+	add	r9,r9,r12,ror#6
+	eor	r12,r10,r11
+	vshr.u32	d25,d0,#10
+	eor	r0,r0,r10,ror#20
+	add	r9,r9,r2
+	veor	d25,d25,d24
+	ldr	r2,[sp,#12]
+	and	r3,r3,r12
+	vshr.u32	d24,d0,#19
+	add	r5,r5,r9
+	add	r9,r9,r0,ror#2
+	eor	r3,r3,r11
+	vld1.32	{q8},[r14,:128]!
+	add	r8,r8,r2
+	vsli.32	d24,d0,#13
+	eor	r2,r6,r7
+	eor	r0,r5,r5,ror#5
+	veor	d25,d25,d24
+	add	r9,r9,r3
+	and	r2,r2,r5
+	vadd.i32	d1,d1,d25
+	eor	r3,r0,r5,ror#19
+	eor	r0,r9,r9,ror#11
+	vadd.i32	q8,q8,q0
+	eor	r2,r2,r7
+	add	r8,r8,r3,ror#6
+	eor	r3,r9,r10
+	eor	r0,r0,r9,ror#20
+	add	r8,r8,r2
+	ldr	r2,[sp,#16]
+	and	r12,r12,r3
+	add	r4,r4,r8
+	vst1.32	{q8},[r1,:128]!
+	add	r8,r8,r0,ror#2
+	eor	r12,r12,r10
+	vext.8	q8,q1,q2,#4
+	add	r7,r7,r2
+	eor	r2,r5,r6
+	eor	r0,r4,r4,ror#5
+	vext.8	q9,q3,q0,#4
+	add	r8,r8,r12
+	and	r2,r2,r4
+	eor	r12,r0,r4,ror#19
+	vshr.u32	q10,q8,#7
+	eor	r0,r8,r8,ror#11
+	eor	r2,r2,r6
+	vadd.i32	q1,q1,q9
+	add	r7,r7,r12,ror#6
+	eor	r12,r8,r9
+	vshr.u32	q9,q8,#3
+	eor	r0,r0,r8,ror#20
+	add	r7,r7,r2
+	vsli.32	q10,q8,#25
+	ldr	r2,[sp,#20]
+	and	r3,r3,r12
+	vshr.u32	q11,q8,#18
+	add	r11,r11,r7
+	add	r7,r7,r0,ror#2
+	eor	r3,r3,r9
+	veor	q9,q9,q10
+	add	r6,r6,r2
+	vsli.32	q11,q8,#14
+	eor	r2,r4,r5
+	eor	r0,r11,r11,ror#5
+	vshr.u32	d24,d1,#17
+	add	r7,r7,r3
+	and	r2,r2,r11
+	veor	q9,q9,q11
+	eor	r3,r0,r11,ror#19
+	eor	r0,r7,r7,ror#11
+	vsli.32	d24,d1,#15
+	eor	r2,r2,r5
+	add	r6,r6,r3,ror#6
+	vshr.u32	d25,d1,#10
+	eor	r3,r7,r8
+	eor	r0,r0,r7,ror#20
+	vadd.i32	q1,q1,q9
+	add	r6,r6,r2
+	ldr	r2,[sp,#24]
+	veor	d25,d25,d24
+	and	r12,r12,r3
+	add	r10,r10,r6
+	vshr.u32	d24,d1,#19
+	add	r6,r6,r0,ror#2
+	eor	r12,r12,r8
+	vsli.32	d24,d1,#13
+	add	r5,r5,r2
+	eor	r2,r11,r4
+	veor	d25,d25,d24
+	eor	r0,r10,r10,ror#5
+	add	r6,r6,r12
+	vadd.i32	d2,d2,d25
+	and	r2,r2,r10
+	eor	r12,r0,r10,ror#19
+	vshr.u32	d24,d2,#17
+	eor	r0,r6,r6,ror#11
+	eor	r2,r2,r4
+	vsli.32	d24,d2,#15
+	add	r5,r5,r12,ror#6
+	eor	r12,r6,r7
+	vshr.u32	d25,d2,#10
+	eor	r0,r0,r6,ror#20
+	add	r5,r5,r2
+	veor	d25,d25,d24
+	ldr	r2,[sp,#28]
+	and	r3,r3,r12
+	vshr.u32	d24,d2,#19
+	add	r9,r9,r5
+	add	r5,r5,r0,ror#2
+	eor	r3,r3,r7
+	vld1.32	{q8},[r14,:128]!
+	add	r4,r4,r2
+	vsli.32	d24,d2,#13
+	eor	r2,r10,r11
+	eor	r0,r9,r9,ror#5
+	veor	d25,d25,d24
+	add	r5,r5,r3
+	and	r2,r2,r9
+	vadd.i32	d3,d3,d25
+	eor	r3,r0,r9,ror#19
+	eor	r0,r5,r5,ror#11
+	vadd.i32	q8,q8,q1
+	eor	r2,r2,r11
+	add	r4,r4,r3,ror#6
+	eor	r3,r5,r6
+	eor	r0,r0,r5,ror#20
+	add	r4,r4,r2
+	ldr	r2,[sp,#32]
+	and	r12,r12,r3
+	add	r8,r8,r4
+	vst1.32	{q8},[r1,:128]!
+	add	r4,r4,r0,ror#2
+	eor	r12,r12,r6
+	vext.8	q8,q2,q3,#4
+	add	r11,r11,r2
+	eor	r2,r9,r10
+	eor	r0,r8,r8,ror#5
+	vext.8	q9,q0,q1,#4
+	add	r4,r4,r12
+	and	r2,r2,r8
+	eor	r12,r0,r8,ror#19
+	vshr.u32	q10,q8,#7
+	eor	r0,r4,r4,ror#11
+	eor	r2,r2,r10
+	vadd.i32	q2,q2,q9
+	add	r11,r11,r12,ror#6
+	eor	r12,r4,r5
+	vshr.u32	q9,q8,#3
+	eor	r0,r0,r4,ror#20
+	add	r11,r11,r2
+	vsli.32	q10,q8,#25
+	ldr	r2,[sp,#36]
+	and	r3,r3,r12
+	vshr.u32	q11,q8,#18
+	add	r7,r7,r11
+	add	r11,r11,r0,ror#2
+	eor	r3,r3,r5
+	veor	q9,q9,q10
+	add	r10,r10,r2
+	vsli.32	q11,q8,#14
+	eor	r2,r8,r9
+	eor	r0,r7,r7,ror#5
+	vshr.u32	d24,d3,#17
+	add	r11,r11,r3
+	and	r2,r2,r7
+	veor	q9,q9,q11
+	eor	r3,r0,r7,ror#19
+	eor	r0,r11,r11,ror#11
+	vsli.32	d24,d3,#15
+	eor	r2,r2,r9
+	add	r10,r10,r3,ror#6
+	vshr.u32	d25,d3,#10
+	eor	r3,r11,r4
+	eor	r0,r0,r11,ror#20
+	vadd.i32	q2,q2,q9
+	add	r10,r10,r2
+	ldr	r2,[sp,#40]
+	veor	d25,d25,d24
+	and	r12,r12,r3
+	add	r6,r6,r10
+	vshr.u32	d24,d3,#19
+	add	r10,r10,r0,ror#2
+	eor	r12,r12,r4
+	vsli.32	d24,d3,#13
+	add	r9,r9,r2
+	eor	r2,r7,r8
+	veor	d25,d25,d24
+	eor	r0,r6,r6,ror#5
+	add	r10,r10,r12
+	vadd.i32	d4,d4,d25
+	and	r2,r2,r6
+	eor	r12,r0,r6,ror#19
+	vshr.u32	d24,d4,#17
+	eor	r0,r10,r10,ror#11
+	eor	r2,r2,r8
+	vsli.32	d24,d4,#15
+	add	r9,r9,r12,ror#6
+	eor	r12,r10,r11
+	vshr.u32	d25,d4,#10
+	eor	r0,r0,r10,ror#20
+	add	r9,r9,r2
+	veor	d25,d25,d24
+	ldr	r2,[sp,#44]
+	and	r3,r3,r12
+	vshr.u32	d24,d4,#19
+	add	r5,r5,r9
+	add	r9,r9,r0,ror#2
+	eor	r3,r3,r11
+	vld1.32	{q8},[r14,:128]!
+	add	r8,r8,r2
+	vsli.32	d24,d4,#13
+	eor	r2,r6,r7
+	eor	r0,r5,r5,ror#5
+	veor	d25,d25,d24
+	add	r9,r9,r3
+	and	r2,r2,r5
+	vadd.i32	d5,d5,d25
+	eor	r3,r0,r5,ror#19
+	eor	r0,r9,r9,ror#11
+	vadd.i32	q8,q8,q2
+	eor	r2,r2,r7
+	add	r8,r8,r3,ror#6
+	eor	r3,r9,r10
+	eor	r0,r0,r9,ror#20
+	add	r8,r8,r2
+	ldr	r2,[sp,#48]
+	and	r12,r12,r3
+	add	r4,r4,r8
+	vst1.32	{q8},[r1,:128]!
+	add	r8,r8,r0,ror#2
+	eor	r12,r12,r10
+	vext.8	q8,q3,q0,#4
+	add	r7,r7,r2
+	eor	r2,r5,r6
+	eor	r0,r4,r4,ror#5
+	vext.8	q9,q1,q2,#4
+	add	r8,r8,r12
+	and	r2,r2,r4
+	eor	r12,r0,r4,ror#19
+	vshr.u32	q10,q8,#7
+	eor	r0,r8,r8,ror#11
+	eor	r2,r2,r6
+	vadd.i32	q3,q3,q9
+	add	r7,r7,r12,ror#6
+	eor	r12,r8,r9
+	vshr.u32	q9,q8,#3
+	eor	r0,r0,r8,ror#20
+	add	r7,r7,r2
+	vsli.32	q10,q8,#25
+	ldr	r2,[sp,#52]
+	and	r3,r3,r12
+	vshr.u32	q11,q8,#18
+	add	r11,r11,r7
+	add	r7,r7,r0,ror#2
+	eor	r3,r3,r9
+	veor	q9,q9,q10
+	add	r6,r6,r2
+	vsli.32	q11,q8,#14
+	eor	r2,r4,r5
+	eor	r0,r11,r11,ror#5
+	vshr.u32	d24,d5,#17
+	add	r7,r7,r3
+	and	r2,r2,r11
+	veor	q9,q9,q11
+	eor	r3,r0,r11,ror#19
+	eor	r0,r7,r7,ror#11
+	vsli.32	d24,d5,#15
+	eor	r2,r2,r5
+	add	r6,r6,r3,ror#6
+	vshr.u32	d25,d5,#10
+	eor	r3,r7,r8
+	eor	r0,r0,r7,ror#20
+	vadd.i32	q3,q3,q9
+	add	r6,r6,r2
+	ldr	r2,[sp,#56]
+	veor	d25,d25,d24
+	and	r12,r12,r3
+	add	r10,r10,r6
+	vshr.u32	d24,d5,#19
+	add	r6,r6,r0,ror#2
+	eor	r12,r12,r8
+	vsli.32	d24,d5,#13
+	add	r5,r5,r2
+	eor	r2,r11,r4
+	veor	d25,d25,d24
+	eor	r0,r10,r10,ror#5
+	add	r6,r6,r12
+	vadd.i32	d6,d6,d25
+	and	r2,r2,r10
+	eor	r12,r0,r10,ror#19
+	vshr.u32	d24,d6,#17
+	eor	r0,r6,r6,ror#11
+	eor	r2,r2,r4
+	vsli.32	d24,d6,#15
+	add	r5,r5,r12,ror#6
+	eor	r12,r6,r7
+	vshr.u32	d25,d6,#10
+	eor	r0,r0,r6,ror#20
+	add	r5,r5,r2
+	veor	d25,d25,d24
+	ldr	r2,[sp,#60]
+	and	r3,r3,r12
+	vshr.u32	d24,d6,#19
+	add	r9,r9,r5
+	add	r5,r5,r0,ror#2
+	eor	r3,r3,r7
+	vld1.32	{q8},[r14,:128]!
+	add	r4,r4,r2
+	vsli.32	d24,d6,#13
+	eor	r2,r10,r11
+	eor	r0,r9,r9,ror#5
+	veor	d25,d25,d24
+	add	r5,r5,r3
+	and	r2,r2,r9
+	vadd.i32	d7,d7,d25
+	eor	r3,r0,r9,ror#19
+	eor	r0,r5,r5,ror#11
+	vadd.i32	q8,q8,q3
+	eor	r2,r2,r11
+	add	r4,r4,r3,ror#6
+	eor	r3,r5,r6
+	eor	r0,r0,r5,ror#20
+	add	r4,r4,r2
+	ldr	r2,[r14]
+	and	r12,r12,r3
+	add	r8,r8,r4
+	vst1.32	{q8},[r1,:128]!
+	add	r4,r4,r0,ror#2
+	eor	r12,r12,r6
+	teq	r2,#0				@ check for K256 terminator
+	ldr	r2,[sp,#0]
+	sub	r1,r1,#64
+	bne	.L_00_48
+
+	ldr	r1,[sp,#68]
+	ldr	r0,[sp,#72]
+	sub	r14,r14,#256	@ rewind r14
+	teq	r1,r0
+	it	eq
+	subeq	r1,r1,#64		@ avoid SEGV
+	vld1.8	{q0},[r1]!		@ load next input block
+	vld1.8	{q1},[r1]!
+	vld1.8	{q2},[r1]!
+	vld1.8	{q3},[r1]!
+	it	ne
+	strne	r1,[sp,#68]
+	mov	r1,sp
+	add	r11,r11,r2
+	eor	r2,r9,r10
+	eor	r0,r8,r8,ror#5
+	add	r4,r4,r12
+	vld1.32	{q8},[r14,:128]!
+	and	r2,r2,r8
+	eor	r12,r0,r8,ror#19
+	eor	r0,r4,r4,ror#11
+	eor	r2,r2,r10
+	vrev32.8	q0,q0
+	add	r11,r11,r12,ror#6
+	eor	r12,r4,r5
+	eor	r0,r0,r4,ror#20
+	add	r11,r11,r2
+	vadd.i32	q8,q8,q0
+	ldr	r2,[sp,#4]
+	and	r3,r3,r12
+	add	r7,r7,r11
+	add	r11,r11,r0,ror#2
+	eor	r3,r3,r5
+	add	r10,r10,r2
+	eor	r2,r8,r9
+	eor	r0,r7,r7,ror#5
+	add	r11,r11,r3
+	and	r2,r2,r7
+	eor	r3,r0,r7,ror#19
+	eor	r0,r11,r11,ror#11
+	eor	r2,r2,r9
+	add	r10,r10,r3,ror#6
+	eor	r3,r11,r4
+	eor	r0,r0,r11,ror#20
+	add	r10,r10,r2
+	ldr	r2,[sp,#8]
+	and	r12,r12,r3
+	add	r6,r6,r10
+	add	r10,r10,r0,ror#2
+	eor	r12,r12,r4
+	add	r9,r9,r2
+	eor	r2,r7,r8
+	eor	r0,r6,r6,ror#5
+	add	r10,r10,r12
+	and	r2,r2,r6
+	eor	r12,r0,r6,ror#19
+	eor	r0,r10,r10,ror#11
+	eor	r2,r2,r8
+	add	r9,r9,r12,ror#6
+	eor	r12,r10,r11
+	eor	r0,r0,r10,ror#20
+	add	r9,r9,r2
+	ldr	r2,[sp,#12]
+	and	r3,r3,r12
+	add	r5,r5,r9
+	add	r9,r9,r0,ror#2
+	eor	r3,r3,r11
+	add	r8,r8,r2
+	eor	r2,r6,r7
+	eor	r0,r5,r5,ror#5
+	add	r9,r9,r3
+	and	r2,r2,r5
+	eor	r3,r0,r5,ror#19
+	eor	r0,r9,r9,ror#11
+	eor	r2,r2,r7
+	add	r8,r8,r3,ror#6
+	eor	r3,r9,r10
+	eor	r0,r0,r9,ror#20
+	add	r8,r8,r2
+	ldr	r2,[sp,#16]
+	and	r12,r12,r3
+	add	r4,r4,r8
+	add	r8,r8,r0,ror#2
+	eor	r12,r12,r10
+	vst1.32	{q8},[r1,:128]!
+	add	r7,r7,r2
+	eor	r2,r5,r6
+	eor	r0,r4,r4,ror#5
+	add	r8,r8,r12
+	vld1.32	{q8},[r14,:128]!
+	and	r2,r2,r4
+	eor	r12,r0,r4,ror#19
+	eor	r0,r8,r8,ror#11
+	eor	r2,r2,r6
+	vrev32.8	q1,q1
+	add	r7,r7,r12,ror#6
+	eor	r12,r8,r9
+	eor	r0,r0,r8,ror#20
+	add	r7,r7,r2
+	vadd.i32	q8,q8,q1
+	ldr	r2,[sp,#20]
+	and	r3,r3,r12
+	add	r11,r11,r7
+	add	r7,r7,r0,ror#2
+	eor	r3,r3,r9
+	add	r6,r6,r2
+	eor	r2,r4,r5
+	eor	r0,r11,r11,ror#5
+	add	r7,r7,r3
+	and	r2,r2,r11
+	eor	r3,r0,r11,ror#19
+	eor	r0,r7,r7,ror#11
+	eor	r2,r2,r5
+	add	r6,r6,r3,ror#6
+	eor	r3,r7,r8
+	eor	r0,r0,r7,ror#20
+	add	r6,r6,r2
+	ldr	r2,[sp,#24]
+	and	r12,r12,r3
+	add	r10,r10,r6
+	add	r6,r6,r0,ror#2
+	eor	r12,r12,r8
+	add	r5,r5,r2
+	eor	r2,r11,r4
+	eor	r0,r10,r10,ror#5
+	add	r6,r6,r12
+	and	r2,r2,r10
+	eor	r12,r0,r10,ror#19
+	eor	r0,r6,r6,ror#11
+	eor	r2,r2,r4
+	add	r5,r5,r12,ror#6
+	eor	r12,r6,r7
+	eor	r0,r0,r6,ror#20
+	add	r5,r5,r2
+	ldr	r2,[sp,#28]
+	and	r3,r3,r12
+	add	r9,r9,r5
+	add	r5,r5,r0,ror#2
+	eor	r3,r3,r7
+	add	r4,r4,r2
+	eor	r2,r10,r11
+	eor	r0,r9,r9,ror#5
+	add	r5,r5,r3
+	and	r2,r2,r9
+	eor	r3,r0,r9,ror#19
+	eor	r0,r5,r5,ror#11
+	eor	r2,r2,r11
+	add	r4,r4,r3,ror#6
+	eor	r3,r5,r6
+	eor	r0,r0,r5,ror#20
+	add	r4,r4,r2
+	ldr	r2,[sp,#32]
+	and	r12,r12,r3
+	add	r8,r8,r4
+	add	r4,r4,r0,ror#2
+	eor	r12,r12,r6
+	vst1.32	{q8},[r1,:128]!
+	add	r11,r11,r2
+	eor	r2,r9,r10
+	eor	r0,r8,r8,ror#5
+	add	r4,r4,r12
+	vld1.32	{q8},[r14,:128]!
+	and	r2,r2,r8
+	eor	r12,r0,r8,ror#19
+	eor	r0,r4,r4,ror#11
+	eor	r2,r2,r10
+	vrev32.8	q2,q2
+	add	r11,r11,r12,ror#6
+	eor	r12,r4,r5
+	eor	r0,r0,r4,ror#20
+	add	r11,r11,r2
+	vadd.i32	q8,q8,q2
+	ldr	r2,[sp,#36]
+	and	r3,r3,r12
+	add	r7,r7,r11
+	add	r11,r11,r0,ror#2
+	eor	r3,r3,r5
+	add	r10,r10,r2
+	eor	r2,r8,r9
+	eor	r0,r7,r7,ror#5
+	add	r11,r11,r3
+	and	r2,r2,r7
+	eor	r3,r0,r7,ror#19
+	eor	r0,r11,r11,ror#11
+	eor	r2,r2,r9
+	add	r10,r10,r3,ror#6
+	eor	r3,r11,r4
+	eor	r0,r0,r11,ror#20
+	add	r10,r10,r2
+	ldr	r2,[sp,#40]
+	and	r12,r12,r3
+	add	r6,r6,r10
+	add	r10,r10,r0,ror#2
+	eor	r12,r12,r4
+	add	r9,r9,r2
+	eor	r2,r7,r8
+	eor	r0,r6,r6,ror#5
+	add	r10,r10,r12
+	and	r2,r2,r6
+	eor	r12,r0,r6,ror#19
+	eor	r0,r10,r10,ror#11
+	eor	r2,r2,r8
+	add	r9,r9,r12,ror#6
+	eor	r12,r10,r11
+	eor	r0,r0,r10,ror#20
+	add	r9,r9,r2
+	ldr	r2,[sp,#44]
+	and	r3,r3,r12
+	add	r5,r5,r9
+	add	r9,r9,r0,ror#2
+	eor	r3,r3,r11
+	add	r8,r8,r2
+	eor	r2,r6,r7
+	eor	r0,r5,r5,ror#5
+	add	r9,r9,r3
+	and	r2,r2,r5
+	eor	r3,r0,r5,ror#19
+	eor	r0,r9,r9,ror#11
+	eor	r2,r2,r7
+	add	r8,r8,r3,ror#6
+	eor	r3,r9,r10
+	eor	r0,r0,r9,ror#20
+	add	r8,r8,r2
+	ldr	r2,[sp,#48]
+	and	r12,r12,r3
+	add	r4,r4,r8
+	add	r8,r8,r0,ror#2
+	eor	r12,r12,r10
+	vst1.32	{q8},[r1,:128]!
+	add	r7,r7,r2
+	eor	r2,r5,r6
+	eor	r0,r4,r4,ror#5
+	add	r8,r8,r12
+	vld1.32	{q8},[r14,:128]!
+	and	r2,r2,r4
+	eor	r12,r0,r4,ror#19
+	eor	r0,r8,r8,ror#11
+	eor	r2,r2,r6
+	vrev32.8	q3,q3
+	add	r7,r7,r12,ror#6
+	eor	r12,r8,r9
+	eor	r0,r0,r8,ror#20
+	add	r7,r7,r2
+	vadd.i32	q8,q8,q3
+	ldr	r2,[sp,#52]
+	and	r3,r3,r12
+	add	r11,r11,r7
+	add	r7,r7,r0,ror#2
+	eor	r3,r3,r9
+	add	r6,r6,r2
+	eor	r2,r4,r5
+	eor	r0,r11,r11,ror#5
+	add	r7,r7,r3
+	and	r2,r2,r11
+	eor	r3,r0,r11,ror#19
+	eor	r0,r7,r7,ror#11
+	eor	r2,r2,r5
+	add	r6,r6,r3,ror#6
+	eor	r3,r7,r8
+	eor	r0,r0,r7,ror#20
+	add	r6,r6,r2
+	ldr	r2,[sp,#56]
+	and	r12,r12,r3
+	add	r10,r10,r6
+	add	r6,r6,r0,ror#2
+	eor	r12,r12,r8
+	add	r5,r5,r2
+	eor	r2,r11,r4
+	eor	r0,r10,r10,ror#5
+	add	r6,r6,r12
+	and	r2,r2,r10
+	eor	r12,r0,r10,ror#19
+	eor	r0,r6,r6,ror#11
+	eor	r2,r2,r4
+	add	r5,r5,r12,ror#6
+	eor	r12,r6,r7
+	eor	r0,r0,r6,ror#20
+	add	r5,r5,r2
+	ldr	r2,[sp,#60]
+	and	r3,r3,r12
+	add	r9,r9,r5
+	add	r5,r5,r0,ror#2
+	eor	r3,r3,r7
+	add	r4,r4,r2
+	eor	r2,r10,r11
+	eor	r0,r9,r9,ror#5
+	add	r5,r5,r3
+	and	r2,r2,r9
+	eor	r3,r0,r9,ror#19
+	eor	r0,r5,r5,ror#11
+	eor	r2,r2,r11
+	add	r4,r4,r3,ror#6
+	eor	r3,r5,r6
+	eor	r0,r0,r5,ror#20
+	add	r4,r4,r2
+	ldr	r2,[sp,#64]
+	and	r12,r12,r3
+	add	r8,r8,r4
+	add	r4,r4,r0,ror#2
+	eor	r12,r12,r6
+	vst1.32	{q8},[r1,:128]!
+	ldr	r0,[r2,#0]
+	add	r4,r4,r12			@ h+=Maj(a,b,c) from the past
+	ldr	r12,[r2,#4]
+	ldr	r3,[r2,#8]
+	ldr	r1,[r2,#12]
+	add	r4,r4,r0			@ accumulate
+	ldr	r0,[r2,#16]
+	add	r5,r5,r12
+	ldr	r12,[r2,#20]
+	add	r6,r6,r3
+	ldr	r3,[r2,#24]
+	add	r7,r7,r1
+	ldr	r1,[r2,#28]
+	add	r8,r8,r0
+	str	r4,[r2],#4
+	add	r9,r9,r12
+	str	r5,[r2],#4
+	add	r10,r10,r3
+	str	r6,[r2],#4
+	add	r11,r11,r1
+	str	r7,[r2],#4
+	stmia	r2,{r8,r9,r10,r11}
+
+	ittte	ne
+	movne	r1,sp
+	ldrne	r2,[sp,#0]
+	eorne	r12,r12,r12
+	ldreq	sp,[sp,#76]			@ restore original sp
+	itt	ne
+	eorne	r3,r5,r6
+	bne	.L_00_48
+
+	ldmia	sp!,{r4,r5,r6,r7,r8,r9,r10,r11,r12,pc}
+.size	sha256_block_data_order_neon,.-sha256_block_data_order_neon
+#endif
+#if __ARM_MAX_ARCH__>=7 && !defined(__KERNEL__)
+
+# if defined(__thumb2__)
+#  define INST(a,b,c,d)	.byte	c,d|0xc,a,b
+# else
+#  define INST(a,b,c,d)	.byte	a,b,c,d
+# endif
+
+.type	sha256_block_data_order_armv8,%function
+.align	5
+sha256_block_data_order_armv8:
+.LARMv8:
+	vld1.32	{q0,q1},[r0]
+	sub	r3,r3,#256+32
+	add	r2,r1,r2,lsl#6	@ len to point at the end of inp
+	b	.Loop_v8
+
+.align	4
+.Loop_v8:
+	vld1.8	{q8,q9},[r1]!
+	vld1.8	{q10,q11},[r1]!
+	vld1.32	{q12},[r3]!
+	vrev32.8	q8,q8
+	vrev32.8	q9,q9
+	vrev32.8	q10,q10
+	vrev32.8	q11,q11
+	vmov	q14,q0	@ offload
+	vmov	q15,q1
+	teq	r1,r2
+	vld1.32	{q13},[r3]!
+	vadd.i32	q12,q12,q8
+	INST(0xe2,0x03,0xfa,0xf3)	@ sha256su0 q8,q9
+	vmov	q2,q0
+	INST(0x68,0x0c,0x02,0xf3)	@ sha256h q0,q1,q12
+	INST(0x68,0x2c,0x14,0xf3)	@ sha256h2 q1,q2,q12
+	INST(0xe6,0x0c,0x64,0xf3)	@ sha256su1 q8,q10,q11
+	vld1.32	{q12},[r3]!
+	vadd.i32	q13,q13,q9
+	INST(0xe4,0x23,0xfa,0xf3)	@ sha256su0 q9,q10
+	vmov	q2,q0
+	INST(0x6a,0x0c,0x02,0xf3)	@ sha256h q0,q1,q13
+	INST(0x6a,0x2c,0x14,0xf3)	@ sha256h2 q1,q2,q13
+	INST(0xe0,0x2c,0x66,0xf3)	@ sha256su1 q9,q11,q8
+	vld1.32	{q13},[r3]!
+	vadd.i32	q12,q12,q10
+	INST(0xe6,0x43,0xfa,0xf3)	@ sha256su0 q10,q11
+	vmov	q2,q0
+	INST(0x68,0x0c,0x02,0xf3)	@ sha256h q0,q1,q12
+	INST(0x68,0x2c,0x14,0xf3)	@ sha256h2 q1,q2,q12
+	INST(0xe2,0x4c,0x60,0xf3)	@ sha256su1 q10,q8,q9
+	vld1.32	{q12},[r3]!
+	vadd.i32	q13,q13,q11
+	INST(0xe0,0x63,0xfa,0xf3)	@ sha256su0 q11,q8
+	vmov	q2,q0
+	INST(0x6a,0x0c,0x02,0xf3)	@ sha256h q0,q1,q13
+	INST(0x6a,0x2c,0x14,0xf3)	@ sha256h2 q1,q2,q13
+	INST(0xe4,0x6c,0x62,0xf3)	@ sha256su1 q11,q9,q10
+	vld1.32	{q13},[r3]!
+	vadd.i32	q12,q12,q8
+	INST(0xe2,0x03,0xfa,0xf3)	@ sha256su0 q8,q9
+	vmov	q2,q0
+	INST(0x68,0x0c,0x02,0xf3)	@ sha256h q0,q1,q12
+	INST(0x68,0x2c,0x14,0xf3)	@ sha256h2 q1,q2,q12
+	INST(0xe6,0x0c,0x64,0xf3)	@ sha256su1 q8,q10,q11
+	vld1.32	{q12},[r3]!
+	vadd.i32	q13,q13,q9
+	INST(0xe4,0x23,0xfa,0xf3)	@ sha256su0 q9,q10
+	vmov	q2,q0
+	INST(0x6a,0x0c,0x02,0xf3)	@ sha256h q0,q1,q13
+	INST(0x6a,0x2c,0x14,0xf3)	@ sha256h2 q1,q2,q13
+	INST(0xe0,0x2c,0x66,0xf3)	@ sha256su1 q9,q11,q8
+	vld1.32	{q13},[r3]!
+	vadd.i32	q12,q12,q10
+	INST(0xe6,0x43,0xfa,0xf3)	@ sha256su0 q10,q11
+	vmov	q2,q0
+	INST(0x68,0x0c,0x02,0xf3)	@ sha256h q0,q1,q12
+	INST(0x68,0x2c,0x14,0xf3)	@ sha256h2 q1,q2,q12
+	INST(0xe2,0x4c,0x60,0xf3)	@ sha256su1 q10,q8,q9
+	vld1.32	{q12},[r3]!
+	vadd.i32	q13,q13,q11
+	INST(0xe0,0x63,0xfa,0xf3)	@ sha256su0 q11,q8
+	vmov	q2,q0
+	INST(0x6a,0x0c,0x02,0xf3)	@ sha256h q0,q1,q13
+	INST(0x6a,0x2c,0x14,0xf3)	@ sha256h2 q1,q2,q13
+	INST(0xe4,0x6c,0x62,0xf3)	@ sha256su1 q11,q9,q10
+	vld1.32	{q13},[r3]!
+	vadd.i32	q12,q12,q8
+	INST(0xe2,0x03,0xfa,0xf3)	@ sha256su0 q8,q9
+	vmov	q2,q0
+	INST(0x68,0x0c,0x02,0xf3)	@ sha256h q0,q1,q12
+	INST(0x68,0x2c,0x14,0xf3)	@ sha256h2 q1,q2,q12
+	INST(0xe6,0x0c,0x64,0xf3)	@ sha256su1 q8,q10,q11
+	vld1.32	{q12},[r3]!
+	vadd.i32	q13,q13,q9
+	INST(0xe4,0x23,0xfa,0xf3)	@ sha256su0 q9,q10
+	vmov	q2,q0
+	INST(0x6a,0x0c,0x02,0xf3)	@ sha256h q0,q1,q13
+	INST(0x6a,0x2c,0x14,0xf3)	@ sha256h2 q1,q2,q13
+	INST(0xe0,0x2c,0x66,0xf3)	@ sha256su1 q9,q11,q8
+	vld1.32	{q13},[r3]!
+	vadd.i32	q12,q12,q10
+	INST(0xe6,0x43,0xfa,0xf3)	@ sha256su0 q10,q11
+	vmov	q2,q0
+	INST(0x68,0x0c,0x02,0xf3)	@ sha256h q0,q1,q12
+	INST(0x68,0x2c,0x14,0xf3)	@ sha256h2 q1,q2,q12
+	INST(0xe2,0x4c,0x60,0xf3)	@ sha256su1 q10,q8,q9
+	vld1.32	{q12},[r3]!
+	vadd.i32	q13,q13,q11
+	INST(0xe0,0x63,0xfa,0xf3)	@ sha256su0 q11,q8
+	vmov	q2,q0
+	INST(0x6a,0x0c,0x02,0xf3)	@ sha256h q0,q1,q13
+	INST(0x6a,0x2c,0x14,0xf3)	@ sha256h2 q1,q2,q13
+	INST(0xe4,0x6c,0x62,0xf3)	@ sha256su1 q11,q9,q10
+	vld1.32	{q13},[r3]!
+	vadd.i32	q12,q12,q8
+	vmov	q2,q0
+	INST(0x68,0x0c,0x02,0xf3)	@ sha256h q0,q1,q12
+	INST(0x68,0x2c,0x14,0xf3)	@ sha256h2 q1,q2,q12
+
+	vld1.32	{q12},[r3]!
+	vadd.i32	q13,q13,q9
+	vmov	q2,q0
+	INST(0x6a,0x0c,0x02,0xf3)	@ sha256h q0,q1,q13
+	INST(0x6a,0x2c,0x14,0xf3)	@ sha256h2 q1,q2,q13
+
+	vld1.32	{q13},[r3]
+	vadd.i32	q12,q12,q10
+	sub	r3,r3,#256-16	@ rewind
+	vmov	q2,q0
+	INST(0x68,0x0c,0x02,0xf3)	@ sha256h q0,q1,q12
+	INST(0x68,0x2c,0x14,0xf3)	@ sha256h2 q1,q2,q12
+
+	vadd.i32	q13,q13,q11
+	vmov	q2,q0
+	INST(0x6a,0x0c,0x02,0xf3)	@ sha256h q0,q1,q13
+	INST(0x6a,0x2c,0x14,0xf3)	@ sha256h2 q1,q2,q13
+
+	vadd.i32	q0,q0,q14
+	vadd.i32	q1,q1,q15
+	it	ne
+	bne	.Loop_v8
+
+	vst1.32	{q0,q1},[r0]
+
+	bx	lr		@ bx lr
+.size	sha256_block_data_order_armv8,.-sha256_block_data_order_armv8
+#endif
+.byte	83,72,65,50,53,54,32,98,108,111,99,107,32,116,114,97,110,115,102,111,114,109,32,102,111,114,32,65,82,77,118,52,47,78,69,79,78,47,65,82,77,118,56,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0
+.align	2
+.align	2
+#if __ARM_MAX_ARCH__>=7 && !defined(__KERNEL__)
+.comm	OPENSSL_armcap_P,4,4
+.hidden	OPENSSL_armcap_P
+#endif
+#endif
+#endif  // !OPENSSL_NO_ASM
+.section	.note.GNU-stack,"",%progbits
--- /dev/null
+++ b/third_party/boringssl/linux-arm/crypto/fipsmodule/sha512-armv4.S
@@ -1,0 +1,1894 @@
+// This file is generated from a similarly-named Perl script in the BoringSSL
+// source tree. Do not edit by hand.
+
+#if !defined(__has_feature)
+#define __has_feature(x) 0
+#endif
+#if __has_feature(memory_sanitizer) && !defined(OPENSSL_NO_ASM)
+#define OPENSSL_NO_ASM
+#endif
+
+#if !defined(OPENSSL_NO_ASM)
+#if defined(__arm__)
+#if defined(BORINGSSL_PREFIX)
+#include <boringssl_prefix_symbols_asm.h>
+#endif
+@ Copyright 2007-2016 The OpenSSL Project Authors. All Rights Reserved.
+@
+@ Licensed under the OpenSSL license (the "License").  You may not use
+@ this file except in compliance with the License.  You can obtain a copy
+@ in the file LICENSE in the source distribution or at
+@ https://www.openssl.org/source/license.html
+
+
+@ ====================================================================
+@ Written by Andy Polyakov <[email protected]> for the OpenSSL
+@ project. The module is, however, dual licensed under OpenSSL and
+@ CRYPTOGAMS licenses depending on where you obtain it. For further
+@ details see http://www.openssl.org/~appro/cryptogams/.
+@
+@ Permission to use under GPL terms is granted.
+@ ====================================================================
+
+@ SHA512 block procedure for ARMv4. September 2007.
+
+@ This code is ~4.5 (four and a half) times faster than code generated
+@ by gcc 3.4 and it spends ~72 clock cycles per byte [on single-issue
+@ Xscale PXA250 core].
+@
+@ July 2010.
+@
+@ Rescheduling for dual-issue pipeline resulted in 6% improvement on
+@ Cortex A8 core and ~40 cycles per processed byte.
+
+@ February 2011.
+@
+@ Profiler-assisted and platform-specific optimization resulted in 7%
+@ improvement on Coxtex A8 core and ~38 cycles per byte.
+
+@ March 2011.
+@
+@ Add NEON implementation. On Cortex A8 it was measured to process
+@ one byte in 23.3 cycles or ~60% faster than integer-only code.
+
+@ August 2012.
+@
+@ Improve NEON performance by 12% on Snapdragon S4. In absolute
+@ terms it's 22.6 cycles per byte, which is disappointing result.
+@ Technical writers asserted that 3-way S4 pipeline can sustain
+@ multiple NEON instructions per cycle, but dual NEON issue could
+@ not be observed, see http://www.openssl.org/~appro/Snapdragon-S4.html
+@ for further details. On side note Cortex-A15 processes one byte in
+@ 16 cycles.
+
+@ Byte order [in]dependence. =========================================
+@
+@ Originally caller was expected to maintain specific *dword* order in
+@ h[0-7], namely with most significant dword at *lower* address, which
+@ was reflected in below two parameters as 0 and 4. Now caller is
+@ expected to maintain native byte order for whole 64-bit values.
+#ifndef __KERNEL__
+# include <openssl/arm_arch.h>
+# define VFP_ABI_PUSH	vstmdb	sp!,{d8-d15}
+# define VFP_ABI_POP	vldmia	sp!,{d8-d15}
+#else
+# define __ARM_ARCH__ __LINUX_ARM_ARCH__
+# define __ARM_MAX_ARCH__ 7
+# define VFP_ABI_PUSH
+# define VFP_ABI_POP
+#endif
+
+@ Silence ARMv8 deprecated IT instruction warnings. This file is used by both
+@ ARMv7 and ARMv8 processors and does not use ARMv8 instructions.
+.arch	armv7-a
+
+#ifdef __ARMEL__
+# define LO 0
+# define HI 4
+# define WORD64(hi0,lo0,hi1,lo1)	.word	lo0,hi0, lo1,hi1
+#else
+# define HI 0
+# define LO 4
+# define WORD64(hi0,lo0,hi1,lo1)	.word	hi0,lo0, hi1,lo1
+#endif
+
+.text
+#if defined(__thumb2__)
+.syntax	unified
+.thumb
+# define adrl adr
+#else
+.code	32
+#endif
+
+.type	K512,%object
+.align	5
+K512:
+	WORD64(0x428a2f98,0xd728ae22,	0x71374491,0x23ef65cd)
+	WORD64(0xb5c0fbcf,0xec4d3b2f,	0xe9b5dba5,0x8189dbbc)
+	WORD64(0x3956c25b,0xf348b538,	0x59f111f1,0xb605d019)
+	WORD64(0x923f82a4,0xaf194f9b,	0xab1c5ed5,0xda6d8118)
+	WORD64(0xd807aa98,0xa3030242,	0x12835b01,0x45706fbe)
+	WORD64(0x243185be,0x4ee4b28c,	0x550c7dc3,0xd5ffb4e2)
+	WORD64(0x72be5d74,0xf27b896f,	0x80deb1fe,0x3b1696b1)
+	WORD64(0x9bdc06a7,0x25c71235,	0xc19bf174,0xcf692694)
+	WORD64(0xe49b69c1,0x9ef14ad2,	0xefbe4786,0x384f25e3)
+	WORD64(0x0fc19dc6,0x8b8cd5b5,	0x240ca1cc,0x77ac9c65)
+	WORD64(0x2de92c6f,0x592b0275,	0x4a7484aa,0x6ea6e483)
+	WORD64(0x5cb0a9dc,0xbd41fbd4,	0x76f988da,0x831153b5)
+	WORD64(0x983e5152,0xee66dfab,	0xa831c66d,0x2db43210)
+	WORD64(0xb00327c8,0x98fb213f,	0xbf597fc7,0xbeef0ee4)
+	WORD64(0xc6e00bf3,0x3da88fc2,	0xd5a79147,0x930aa725)
+	WORD64(0x06ca6351,0xe003826f,	0x14292967,0x0a0e6e70)
+	WORD64(0x27b70a85,0x46d22ffc,	0x2e1b2138,0x5c26c926)
+	WORD64(0x4d2c6dfc,0x5ac42aed,	0x53380d13,0x9d95b3df)
+	WORD64(0x650a7354,0x8baf63de,	0x766a0abb,0x3c77b2a8)
+	WORD64(0x81c2c92e,0x47edaee6,	0x92722c85,0x1482353b)
+	WORD64(0xa2bfe8a1,0x4cf10364,	0xa81a664b,0xbc423001)
+	WORD64(0xc24b8b70,0xd0f89791,	0xc76c51a3,0x0654be30)
+	WORD64(0xd192e819,0xd6ef5218,	0xd6990624,0x5565a910)
+	WORD64(0xf40e3585,0x5771202a,	0x106aa070,0x32bbd1b8)
+	WORD64(0x19a4c116,0xb8d2d0c8,	0x1e376c08,0x5141ab53)
+	WORD64(0x2748774c,0xdf8eeb99,	0x34b0bcb5,0xe19b48a8)
+	WORD64(0x391c0cb3,0xc5c95a63,	0x4ed8aa4a,0xe3418acb)
+	WORD64(0x5b9cca4f,0x7763e373,	0x682e6ff3,0xd6b2b8a3)
+	WORD64(0x748f82ee,0x5defb2fc,	0x78a5636f,0x43172f60)
+	WORD64(0x84c87814,0xa1f0ab72,	0x8cc70208,0x1a6439ec)
+	WORD64(0x90befffa,0x23631e28,	0xa4506ceb,0xde82bde9)
+	WORD64(0xbef9a3f7,0xb2c67915,	0xc67178f2,0xe372532b)
+	WORD64(0xca273ece,0xea26619c,	0xd186b8c7,0x21c0c207)
+	WORD64(0xeada7dd6,0xcde0eb1e,	0xf57d4f7f,0xee6ed178)
+	WORD64(0x06f067aa,0x72176fba,	0x0a637dc5,0xa2c898a6)
+	WORD64(0x113f9804,0xbef90dae,	0x1b710b35,0x131c471b)
+	WORD64(0x28db77f5,0x23047d84,	0x32caab7b,0x40c72493)
+	WORD64(0x3c9ebe0a,0x15c9bebc,	0x431d67c4,0x9c100d4c)
+	WORD64(0x4cc5d4be,0xcb3e42b6,	0x597f299c,0xfc657e2a)
+	WORD64(0x5fcb6fab,0x3ad6faec,	0x6c44198c,0x4a475817)
+.size	K512,.-K512
+#if __ARM_MAX_ARCH__>=7 && !defined(__KERNEL__)
+.LOPENSSL_armcap:
+.word	OPENSSL_armcap_P-.Lsha512_block_data_order
+.skip	32-4
+#else
+.skip	32
+#endif
+
+.globl	sha512_block_data_order
+.hidden	sha512_block_data_order
+.type	sha512_block_data_order,%function
+sha512_block_data_order:
+.Lsha512_block_data_order:
+#if __ARM_ARCH__<7 && !defined(__thumb2__)
+	sub	r3,pc,#8		@ sha512_block_data_order
+#else
+	adr	r3,.Lsha512_block_data_order
+#endif
+#if __ARM_MAX_ARCH__>=7 && !defined(__KERNEL__)
+	ldr	r12,.LOPENSSL_armcap
+	ldr	r12,[r3,r12]		@ OPENSSL_armcap_P
+#ifdef	__APPLE__
+	ldr	r12,[r12]
+#endif
+	tst	r12,#ARMV7_NEON
+	bne	.LNEON
+#endif
+	add	r2,r1,r2,lsl#7	@ len to point at the end of inp
+	stmdb	sp!,{r4,r5,r6,r7,r8,r9,r10,r11,r12,lr}
+	sub	r14,r3,#672		@ K512
+	sub	sp,sp,#9*8
+
+	ldr	r7,[r0,#32+LO]
+	ldr	r8,[r0,#32+HI]
+	ldr	r9, [r0,#48+LO]
+	ldr	r10, [r0,#48+HI]
+	ldr	r11, [r0,#56+LO]
+	ldr	r12, [r0,#56+HI]
+.Loop:
+	str	r9, [sp,#48+0]
+	str	r10, [sp,#48+4]
+	str	r11, [sp,#56+0]
+	str	r12, [sp,#56+4]
+	ldr	r5,[r0,#0+LO]
+	ldr	r6,[r0,#0+HI]
+	ldr	r3,[r0,#8+LO]
+	ldr	r4,[r0,#8+HI]
+	ldr	r9, [r0,#16+LO]
+	ldr	r10, [r0,#16+HI]
+	ldr	r11, [r0,#24+LO]
+	ldr	r12, [r0,#24+HI]
+	str	r3,[sp,#8+0]
+	str	r4,[sp,#8+4]
+	str	r9, [sp,#16+0]
+	str	r10, [sp,#16+4]
+	str	r11, [sp,#24+0]
+	str	r12, [sp,#24+4]
+	ldr	r3,[r0,#40+LO]
+	ldr	r4,[r0,#40+HI]
+	str	r3,[sp,#40+0]
+	str	r4,[sp,#40+4]
+
+.L00_15:
+#if __ARM_ARCH__<7
+	ldrb	r3,[r1,#7]
+	ldrb	r9, [r1,#6]
+	ldrb	r10, [r1,#5]
+	ldrb	r11, [r1,#4]
+	ldrb	r4,[r1,#3]
+	ldrb	r12, [r1,#2]
+	orr	r3,r3,r9,lsl#8
+	ldrb	r9, [r1,#1]
+	orr	r3,r3,r10,lsl#16
+	ldrb	r10, [r1],#8
+	orr	r3,r3,r11,lsl#24
+	orr	r4,r4,r12,lsl#8
+	orr	r4,r4,r9,lsl#16
+	orr	r4,r4,r10,lsl#24
+#else
+	ldr	r3,[r1,#4]
+	ldr	r4,[r1],#8
+#ifdef __ARMEL__
+	rev	r3,r3
+	rev	r4,r4
+#endif
+#endif
+	@ Sigma1(x)	(ROTR((x),14) ^ ROTR((x),18)  ^ ROTR((x),41))
+	@ LO		lo>>14^hi<<18 ^ lo>>18^hi<<14 ^ hi>>9^lo<<23
+	@ HI		hi>>14^lo<<18 ^ hi>>18^lo<<14 ^ lo>>9^hi<<23
+	mov	r9,r7,lsr#14
+	str	r3,[sp,#64+0]
+	mov	r10,r8,lsr#14
+	str	r4,[sp,#64+4]
+	eor	r9,r9,r8,lsl#18
+	ldr	r11,[sp,#56+0]	@ h.lo
+	eor	r10,r10,r7,lsl#18
+	ldr	r12,[sp,#56+4]	@ h.hi
+	eor	r9,r9,r7,lsr#18
+	eor	r10,r10,r8,lsr#18
+	eor	r9,r9,r8,lsl#14
+	eor	r10,r10,r7,lsl#14
+	eor	r9,r9,r8,lsr#9
+	eor	r10,r10,r7,lsr#9
+	eor	r9,r9,r7,lsl#23
+	eor	r10,r10,r8,lsl#23	@ Sigma1(e)
+	adds	r3,r3,r9
+	ldr	r9,[sp,#40+0]	@ f.lo
+	adc	r4,r4,r10		@ T += Sigma1(e)
+	ldr	r10,[sp,#40+4]	@ f.hi
+	adds	r3,r3,r11
+	ldr	r11,[sp,#48+0]	@ g.lo
+	adc	r4,r4,r12		@ T += h
+	ldr	r12,[sp,#48+4]	@ g.hi
+
+	eor	r9,r9,r11
+	str	r7,[sp,#32+0]
+	eor	r10,r10,r12
+	str	r8,[sp,#32+4]
+	and	r9,r9,r7
+	str	r5,[sp,#0+0]
+	and	r10,r10,r8
+	str	r6,[sp,#0+4]
+	eor	r9,r9,r11
+	ldr	r11,[r14,#LO]	@ K[i].lo
+	eor	r10,r10,r12		@ Ch(e,f,g)
+	ldr	r12,[r14,#HI]	@ K[i].hi
+
+	adds	r3,r3,r9
+	ldr	r7,[sp,#24+0]	@ d.lo
+	adc	r4,r4,r10		@ T += Ch(e,f,g)
+	ldr	r8,[sp,#24+4]	@ d.hi
+	adds	r3,r3,r11
+	and	r9,r11,#0xff
+	adc	r4,r4,r12		@ T += K[i]
+	adds	r7,r7,r3
+	ldr	r11,[sp,#8+0]	@ b.lo
+	adc	r8,r8,r4		@ d += T
+	teq	r9,#148
+
+	ldr	r12,[sp,#16+0]	@ c.lo
+#if __ARM_ARCH__>=7
+	it	eq			@ Thumb2 thing, sanity check in ARM
+#endif
+	orreq	r14,r14,#1
+	@ Sigma0(x)	(ROTR((x),28) ^ ROTR((x),34) ^ ROTR((x),39))
+	@ LO		lo>>28^hi<<4  ^ hi>>2^lo<<30 ^ hi>>7^lo<<25
+	@ HI		hi>>28^lo<<4  ^ lo>>2^hi<<30 ^ lo>>7^hi<<25
+	mov	r9,r5,lsr#28
+	mov	r10,r6,lsr#28
+	eor	r9,r9,r6,lsl#4
+	eor	r10,r10,r5,lsl#4
+	eor	r9,r9,r6,lsr#2
+	eor	r10,r10,r5,lsr#2
+	eor	r9,r9,r5,lsl#30
+	eor	r10,r10,r6,lsl#30
+	eor	r9,r9,r6,lsr#7
+	eor	r10,r10,r5,lsr#7
+	eor	r9,r9,r5,lsl#25
+	eor	r10,r10,r6,lsl#25	@ Sigma0(a)
+	adds	r3,r3,r9
+	and	r9,r5,r11
+	adc	r4,r4,r10		@ T += Sigma0(a)
+
+	ldr	r10,[sp,#8+4]	@ b.hi
+	orr	r5,r5,r11
+	ldr	r11,[sp,#16+4]	@ c.hi
+	and	r5,r5,r12
+	and	r12,r6,r10
+	orr	r6,r6,r10
+	orr	r5,r5,r9		@ Maj(a,b,c).lo
+	and	r6,r6,r11
+	adds	r5,r5,r3
+	orr	r6,r6,r12		@ Maj(a,b,c).hi
+	sub	sp,sp,#8
+	adc	r6,r6,r4		@ h += T
+	tst	r14,#1
+	add	r14,r14,#8
+	tst	r14,#1
+	beq	.L00_15
+	ldr	r9,[sp,#184+0]
+	ldr	r10,[sp,#184+4]
+	bic	r14,r14,#1
+.L16_79:
+	@ sigma0(x)	(ROTR((x),1)  ^ ROTR((x),8)  ^ ((x)>>7))
+	@ LO		lo>>1^hi<<31  ^ lo>>8^hi<<24 ^ lo>>7^hi<<25
+	@ HI		hi>>1^lo<<31  ^ hi>>8^lo<<24 ^ hi>>7
+	mov	r3,r9,lsr#1
+	ldr	r11,[sp,#80+0]
+	mov	r4,r10,lsr#1
+	ldr	r12,[sp,#80+4]
+	eor	r3,r3,r10,lsl#31
+	eor	r4,r4,r9,lsl#31
+	eor	r3,r3,r9,lsr#8
+	eor	r4,r4,r10,lsr#8
+	eor	r3,r3,r10,lsl#24
+	eor	r4,r4,r9,lsl#24
+	eor	r3,r3,r9,lsr#7
+	eor	r4,r4,r10,lsr#7
+	eor	r3,r3,r10,lsl#25
+
+	@ sigma1(x)	(ROTR((x),19) ^ ROTR((x),61) ^ ((x)>>6))
+	@ LO		lo>>19^hi<<13 ^ hi>>29^lo<<3 ^ lo>>6^hi<<26
+	@ HI		hi>>19^lo<<13 ^ lo>>29^hi<<3 ^ hi>>6
+	mov	r9,r11,lsr#19
+	mov	r10,r12,lsr#19
+	eor	r9,r9,r12,lsl#13
+	eor	r10,r10,r11,lsl#13
+	eor	r9,r9,r12,lsr#29
+	eor	r10,r10,r11,lsr#29
+	eor	r9,r9,r11,lsl#3
+	eor	r10,r10,r12,lsl#3
+	eor	r9,r9,r11,lsr#6
+	eor	r10,r10,r12,lsr#6
+	ldr	r11,[sp,#120+0]
+	eor	r9,r9,r12,lsl#26
+
+	ldr	r12,[sp,#120+4]
+	adds	r3,r3,r9
+	ldr	r9,[sp,#192+0]
+	adc	r4,r4,r10
+
+	ldr	r10,[sp,#192+4]
+	adds	r3,r3,r11
+	adc	r4,r4,r12
+	adds	r3,r3,r9
+	adc	r4,r4,r10
+	@ Sigma1(x)	(ROTR((x),14) ^ ROTR((x),18)  ^ ROTR((x),41))
+	@ LO		lo>>14^hi<<18 ^ lo>>18^hi<<14 ^ hi>>9^lo<<23
+	@ HI		hi>>14^lo<<18 ^ hi>>18^lo<<14 ^ lo>>9^hi<<23
+	mov	r9,r7,lsr#14
+	str	r3,[sp,#64+0]
+	mov	r10,r8,lsr#14
+	str	r4,[sp,#64+4]
+	eor	r9,r9,r8,lsl#18
+	ldr	r11,[sp,#56+0]	@ h.lo
+	eor	r10,r10,r7,lsl#18
+	ldr	r12,[sp,#56+4]	@ h.hi
+	eor	r9,r9,r7,lsr#18
+	eor	r10,r10,r8,lsr#18
+	eor	r9,r9,r8,lsl#14
+	eor	r10,r10,r7,lsl#14
+	eor	r9,r9,r8,lsr#9
+	eor	r10,r10,r7,lsr#9
+	eor	r9,r9,r7,lsl#23
+	eor	r10,r10,r8,lsl#23	@ Sigma1(e)
+	adds	r3,r3,r9
+	ldr	r9,[sp,#40+0]	@ f.lo
+	adc	r4,r4,r10		@ T += Sigma1(e)
+	ldr	r10,[sp,#40+4]	@ f.hi
+	adds	r3,r3,r11
+	ldr	r11,[sp,#48+0]	@ g.lo
+	adc	r4,r4,r12		@ T += h
+	ldr	r12,[sp,#48+4]	@ g.hi
+
+	eor	r9,r9,r11
+	str	r7,[sp,#32+0]
+	eor	r10,r10,r12
+	str	r8,[sp,#32+4]
+	and	r9,r9,r7
+	str	r5,[sp,#0+0]
+	and	r10,r10,r8
+	str	r6,[sp,#0+4]
+	eor	r9,r9,r11
+	ldr	r11,[r14,#LO]	@ K[i].lo
+	eor	r10,r10,r12		@ Ch(e,f,g)
+	ldr	r12,[r14,#HI]	@ K[i].hi
+
+	adds	r3,r3,r9
+	ldr	r7,[sp,#24+0]	@ d.lo
+	adc	r4,r4,r10		@ T += Ch(e,f,g)
+	ldr	r8,[sp,#24+4]	@ d.hi
+	adds	r3,r3,r11
+	and	r9,r11,#0xff
+	adc	r4,r4,r12		@ T += K[i]
+	adds	r7,r7,r3
+	ldr	r11,[sp,#8+0]	@ b.lo
+	adc	r8,r8,r4		@ d += T
+	teq	r9,#23
+
+	ldr	r12,[sp,#16+0]	@ c.lo
+#if __ARM_ARCH__>=7
+	it	eq			@ Thumb2 thing, sanity check in ARM
+#endif
+	orreq	r14,r14,#1
+	@ Sigma0(x)	(ROTR((x),28) ^ ROTR((x),34) ^ ROTR((x),39))
+	@ LO		lo>>28^hi<<4  ^ hi>>2^lo<<30 ^ hi>>7^lo<<25
+	@ HI		hi>>28^lo<<4  ^ lo>>2^hi<<30 ^ lo>>7^hi<<25
+	mov	r9,r5,lsr#28
+	mov	r10,r6,lsr#28
+	eor	r9,r9,r6,lsl#4
+	eor	r10,r10,r5,lsl#4
+	eor	r9,r9,r6,lsr#2
+	eor	r10,r10,r5,lsr#2
+	eor	r9,r9,r5,lsl#30
+	eor	r10,r10,r6,lsl#30
+	eor	r9,r9,r6,lsr#7
+	eor	r10,r10,r5,lsr#7
+	eor	r9,r9,r5,lsl#25
+	eor	r10,r10,r6,lsl#25	@ Sigma0(a)
+	adds	r3,r3,r9
+	and	r9,r5,r11
+	adc	r4,r4,r10		@ T += Sigma0(a)
+
+	ldr	r10,[sp,#8+4]	@ b.hi
+	orr	r5,r5,r11
+	ldr	r11,[sp,#16+4]	@ c.hi
+	and	r5,r5,r12
+	and	r12,r6,r10
+	orr	r6,r6,r10
+	orr	r5,r5,r9		@ Maj(a,b,c).lo
+	and	r6,r6,r11
+	adds	r5,r5,r3
+	orr	r6,r6,r12		@ Maj(a,b,c).hi
+	sub	sp,sp,#8
+	adc	r6,r6,r4		@ h += T
+	tst	r14,#1
+	add	r14,r14,#8
+#if __ARM_ARCH__>=7
+	ittt	eq			@ Thumb2 thing, sanity check in ARM
+#endif
+	ldreq	r9,[sp,#184+0]
+	ldreq	r10,[sp,#184+4]
+	beq	.L16_79
+	bic	r14,r14,#1
+
+	ldr	r3,[sp,#8+0]
+	ldr	r4,[sp,#8+4]
+	ldr	r9, [r0,#0+LO]
+	ldr	r10, [r0,#0+HI]
+	ldr	r11, [r0,#8+LO]
+	ldr	r12, [r0,#8+HI]
+	adds	r9,r5,r9
+	str	r9, [r0,#0+LO]
+	adc	r10,r6,r10
+	str	r10, [r0,#0+HI]
+	adds	r11,r3,r11
+	str	r11, [r0,#8+LO]
+	adc	r12,r4,r12
+	str	r12, [r0,#8+HI]
+
+	ldr	r5,[sp,#16+0]
+	ldr	r6,[sp,#16+4]
+	ldr	r3,[sp,#24+0]
+	ldr	r4,[sp,#24+4]
+	ldr	r9, [r0,#16+LO]
+	ldr	r10, [r0,#16+HI]
+	ldr	r11, [r0,#24+LO]
+	ldr	r12, [r0,#24+HI]
+	adds	r9,r5,r9
+	str	r9, [r0,#16+LO]
+	adc	r10,r6,r10
+	str	r10, [r0,#16+HI]
+	adds	r11,r3,r11
+	str	r11, [r0,#24+LO]
+	adc	r12,r4,r12
+	str	r12, [r0,#24+HI]
+
+	ldr	r3,[sp,#40+0]
+	ldr	r4,[sp,#40+4]
+	ldr	r9, [r0,#32+LO]
+	ldr	r10, [r0,#32+HI]
+	ldr	r11, [r0,#40+LO]
+	ldr	r12, [r0,#40+HI]
+	adds	r7,r7,r9
+	str	r7,[r0,#32+LO]
+	adc	r8,r8,r10
+	str	r8,[r0,#32+HI]
+	adds	r11,r3,r11
+	str	r11, [r0,#40+LO]
+	adc	r12,r4,r12
+	str	r12, [r0,#40+HI]
+
+	ldr	r5,[sp,#48+0]
+	ldr	r6,[sp,#48+4]
+	ldr	r3,[sp,#56+0]
+	ldr	r4,[sp,#56+4]
+	ldr	r9, [r0,#48+LO]
+	ldr	r10, [r0,#48+HI]
+	ldr	r11, [r0,#56+LO]
+	ldr	r12, [r0,#56+HI]
+	adds	r9,r5,r9
+	str	r9, [r0,#48+LO]
+	adc	r10,r6,r10
+	str	r10, [r0,#48+HI]
+	adds	r11,r3,r11
+	str	r11, [r0,#56+LO]
+	adc	r12,r4,r12
+	str	r12, [r0,#56+HI]
+
+	add	sp,sp,#640
+	sub	r14,r14,#640
+
+	teq	r1,r2
+	bne	.Loop
+
+	add	sp,sp,#8*9		@ destroy frame
+#if __ARM_ARCH__>=5
+	ldmia	sp!,{r4,r5,r6,r7,r8,r9,r10,r11,r12,pc}
+#else
+	ldmia	sp!,{r4,r5,r6,r7,r8,r9,r10,r11,r12,lr}
+	tst	lr,#1
+	moveq	pc,lr			@ be binary compatible with V4, yet
+.word	0xe12fff1e			@ interoperable with Thumb ISA:-)
+#endif
+.size	sha512_block_data_order,.-sha512_block_data_order
+#if __ARM_MAX_ARCH__>=7
+.arch	armv7-a
+.fpu	neon
+
+.globl	sha512_block_data_order_neon
+.hidden	sha512_block_data_order_neon
+.type	sha512_block_data_order_neon,%function
+.align	4
+sha512_block_data_order_neon:
+.LNEON:
+	dmb	@ errata #451034 on early Cortex A8
+	add	r2,r1,r2,lsl#7	@ len to point at the end of inp
+	adr	r3,K512
+	VFP_ABI_PUSH
+	vldmia	r0,{d16,d17,d18,d19,d20,d21,d22,d23}		@ load context
+.Loop_neon:
+	vshr.u64	d24,d20,#14	@ 0
+#if 0<16
+	vld1.64	{d0},[r1]!	@ handles unaligned
+#endif
+	vshr.u64	d25,d20,#18
+#if 0>0
+	vadd.i64	d16,d30			@ h+=Maj from the past
+#endif
+	vshr.u64	d26,d20,#41
+	vld1.64	{d28},[r3,:64]!	@ K[i++]
+	vsli.64	d24,d20,#50
+	vsli.64	d25,d20,#46
+	vmov	d29,d20
+	vsli.64	d26,d20,#23
+#if 0<16 && defined(__ARMEL__)
+	vrev64.8	d0,d0
+#endif
+	veor	d25,d24
+	vbsl	d29,d21,d22		@ Ch(e,f,g)
+	vshr.u64	d24,d16,#28
+	veor	d26,d25			@ Sigma1(e)
+	vadd.i64	d27,d29,d23
+	vshr.u64	d25,d16,#34
+	vsli.64	d24,d16,#36
+	vadd.i64	d27,d26
+	vshr.u64	d26,d16,#39
+	vadd.i64	d28,d0
+	vsli.64	d25,d16,#30
+	veor	d30,d16,d17
+	vsli.64	d26,d16,#25
+	veor	d23,d24,d25
+	vadd.i64	d27,d28
+	vbsl	d30,d18,d17		@ Maj(a,b,c)
+	veor	d23,d26			@ Sigma0(a)
+	vadd.i64	d19,d27
+	vadd.i64	d30,d27
+	@ vadd.i64	d23,d30
+	vshr.u64	d24,d19,#14	@ 1
+#if 1<16
+	vld1.64	{d1},[r1]!	@ handles unaligned
+#endif
+	vshr.u64	d25,d19,#18
+#if 1>0
+	vadd.i64	d23,d30			@ h+=Maj from the past
+#endif
+	vshr.u64	d26,d19,#41
+	vld1.64	{d28},[r3,:64]!	@ K[i++]
+	vsli.64	d24,d19,#50
+	vsli.64	d25,d19,#46
+	vmov	d29,d19
+	vsli.64	d26,d19,#23
+#if 1<16 && defined(__ARMEL__)
+	vrev64.8	d1,d1
+#endif
+	veor	d25,d24
+	vbsl	d29,d20,d21		@ Ch(e,f,g)
+	vshr.u64	d24,d23,#28
+	veor	d26,d25			@ Sigma1(e)
+	vadd.i64	d27,d29,d22
+	vshr.u64	d25,d23,#34
+	vsli.64	d24,d23,#36
+	vadd.i64	d27,d26
+	vshr.u64	d26,d23,#39
+	vadd.i64	d28,d1
+	vsli.64	d25,d23,#30
+	veor	d30,d23,d16
+	vsli.64	d26,d23,#25
+	veor	d22,d24,d25
+	vadd.i64	d27,d28
+	vbsl	d30,d17,d16		@ Maj(a,b,c)
+	veor	d22,d26			@ Sigma0(a)
+	vadd.i64	d18,d27
+	vadd.i64	d30,d27
+	@ vadd.i64	d22,d30
+	vshr.u64	d24,d18,#14	@ 2
+#if 2<16
+	vld1.64	{d2},[r1]!	@ handles unaligned
+#endif
+	vshr.u64	d25,d18,#18
+#if 2>0
+	vadd.i64	d22,d30			@ h+=Maj from the past
+#endif
+	vshr.u64	d26,d18,#41
+	vld1.64	{d28},[r3,:64]!	@ K[i++]
+	vsli.64	d24,d18,#50
+	vsli.64	d25,d18,#46
+	vmov	d29,d18
+	vsli.64	d26,d18,#23
+#if 2<16 && defined(__ARMEL__)
+	vrev64.8	d2,d2
+#endif
+	veor	d25,d24
+	vbsl	d29,d19,d20		@ Ch(e,f,g)
+	vshr.u64	d24,d22,#28
+	veor	d26,d25			@ Sigma1(e)
+	vadd.i64	d27,d29,d21
+	vshr.u64	d25,d22,#34
+	vsli.64	d24,d22,#36
+	vadd.i64	d27,d26
+	vshr.u64	d26,d22,#39
+	vadd.i64	d28,d2
+	vsli.64	d25,d22,#30
+	veor	d30,d22,d23
+	vsli.64	d26,d22,#25
+	veor	d21,d24,d25
+	vadd.i64	d27,d28
+	vbsl	d30,d16,d23		@ Maj(a,b,c)
+	veor	d21,d26			@ Sigma0(a)
+	vadd.i64	d17,d27
+	vadd.i64	d30,d27
+	@ vadd.i64	d21,d30
+	vshr.u64	d24,d17,#14	@ 3
+#if 3<16
+	vld1.64	{d3},[r1]!	@ handles unaligned
+#endif
+	vshr.u64	d25,d17,#18
+#if 3>0
+	vadd.i64	d21,d30			@ h+=Maj from the past
+#endif
+	vshr.u64	d26,d17,#41
+	vld1.64	{d28},[r3,:64]!	@ K[i++]
+	vsli.64	d24,d17,#50
+	vsli.64	d25,d17,#46
+	vmov	d29,d17
+	vsli.64	d26,d17,#23
+#if 3<16 && defined(__ARMEL__)
+	vrev64.8	d3,d3
+#endif
+	veor	d25,d24
+	vbsl	d29,d18,d19		@ Ch(e,f,g)
+	vshr.u64	d24,d21,#28
+	veor	d26,d25			@ Sigma1(e)
+	vadd.i64	d27,d29,d20
+	vshr.u64	d25,d21,#34
+	vsli.64	d24,d21,#36
+	vadd.i64	d27,d26
+	vshr.u64	d26,d21,#39
+	vadd.i64	d28,d3
+	vsli.64	d25,d21,#30
+	veor	d30,d21,d22
+	vsli.64	d26,d21,#25
+	veor	d20,d24,d25
+	vadd.i64	d27,d28
+	vbsl	d30,d23,d22		@ Maj(a,b,c)
+	veor	d20,d26			@ Sigma0(a)
+	vadd.i64	d16,d27
+	vadd.i64	d30,d27
+	@ vadd.i64	d20,d30
+	vshr.u64	d24,d16,#14	@ 4
+#if 4<16
+	vld1.64	{d4},[r1]!	@ handles unaligned
+#endif
+	vshr.u64	d25,d16,#18
+#if 4>0
+	vadd.i64	d20,d30			@ h+=Maj from the past
+#endif
+	vshr.u64	d26,d16,#41
+	vld1.64	{d28},[r3,:64]!	@ K[i++]
+	vsli.64	d24,d16,#50
+	vsli.64	d25,d16,#46
+	vmov	d29,d16
+	vsli.64	d26,d16,#23
+#if 4<16 && defined(__ARMEL__)
+	vrev64.8	d4,d4
+#endif
+	veor	d25,d24
+	vbsl	d29,d17,d18		@ Ch(e,f,g)
+	vshr.u64	d24,d20,#28
+	veor	d26,d25			@ Sigma1(e)
+	vadd.i64	d27,d29,d19
+	vshr.u64	d25,d20,#34
+	vsli.64	d24,d20,#36
+	vadd.i64	d27,d26
+	vshr.u64	d26,d20,#39
+	vadd.i64	d28,d4
+	vsli.64	d25,d20,#30
+	veor	d30,d20,d21
+	vsli.64	d26,d20,#25
+	veor	d19,d24,d25
+	vadd.i64	d27,d28
+	vbsl	d30,d22,d21		@ Maj(a,b,c)
+	veor	d19,d26			@ Sigma0(a)
+	vadd.i64	d23,d27
+	vadd.i64	d30,d27
+	@ vadd.i64	d19,d30
+	vshr.u64	d24,d23,#14	@ 5
+#if 5<16
+	vld1.64	{d5},[r1]!	@ handles unaligned
+#endif
+	vshr.u64	d25,d23,#18
+#if 5>0
+	vadd.i64	d19,d30			@ h+=Maj from the past
+#endif
+	vshr.u64	d26,d23,#41
+	vld1.64	{d28},[r3,:64]!	@ K[i++]
+	vsli.64	d24,d23,#50
+	vsli.64	d25,d23,#46
+	vmov	d29,d23
+	vsli.64	d26,d23,#23
+#if 5<16 && defined(__ARMEL__)
+	vrev64.8	d5,d5
+#endif
+	veor	d25,d24
+	vbsl	d29,d16,d17		@ Ch(e,f,g)
+	vshr.u64	d24,d19,#28
+	veor	d26,d25			@ Sigma1(e)
+	vadd.i64	d27,d29,d18
+	vshr.u64	d25,d19,#34
+	vsli.64	d24,d19,#36
+	vadd.i64	d27,d26
+	vshr.u64	d26,d19,#39
+	vadd.i64	d28,d5
+	vsli.64	d25,d19,#30
+	veor	d30,d19,d20
+	vsli.64	d26,d19,#25
+	veor	d18,d24,d25
+	vadd.i64	d27,d28
+	vbsl	d30,d21,d20		@ Maj(a,b,c)
+	veor	d18,d26			@ Sigma0(a)
+	vadd.i64	d22,d27
+	vadd.i64	d30,d27
+	@ vadd.i64	d18,d30
+	vshr.u64	d24,d22,#14	@ 6
+#if 6<16
+	vld1.64	{d6},[r1]!	@ handles unaligned
+#endif
+	vshr.u64	d25,d22,#18
+#if 6>0
+	vadd.i64	d18,d30			@ h+=Maj from the past
+#endif
+	vshr.u64	d26,d22,#41
+	vld1.64	{d28},[r3,:64]!	@ K[i++]
+	vsli.64	d24,d22,#50
+	vsli.64	d25,d22,#46
+	vmov	d29,d22
+	vsli.64	d26,d22,#23
+#if 6<16 && defined(__ARMEL__)
+	vrev64.8	d6,d6
+#endif
+	veor	d25,d24
+	vbsl	d29,d23,d16		@ Ch(e,f,g)
+	vshr.u64	d24,d18,#28
+	veor	d26,d25			@ Sigma1(e)
+	vadd.i64	d27,d29,d17
+	vshr.u64	d25,d18,#34
+	vsli.64	d24,d18,#36
+	vadd.i64	d27,d26
+	vshr.u64	d26,d18,#39
+	vadd.i64	d28,d6
+	vsli.64	d25,d18,#30
+	veor	d30,d18,d19
+	vsli.64	d26,d18,#25
+	veor	d17,d24,d25
+	vadd.i64	d27,d28
+	vbsl	d30,d20,d19		@ Maj(a,b,c)
+	veor	d17,d26			@ Sigma0(a)
+	vadd.i64	d21,d27
+	vadd.i64	d30,d27
+	@ vadd.i64	d17,d30
+	vshr.u64	d24,d21,#14	@ 7
+#if 7<16
+	vld1.64	{d7},[r1]!	@ handles unaligned
+#endif
+	vshr.u64	d25,d21,#18
+#if 7>0
+	vadd.i64	d17,d30			@ h+=Maj from the past
+#endif
+	vshr.u64	d26,d21,#41
+	vld1.64	{d28},[r3,:64]!	@ K[i++]
+	vsli.64	d24,d21,#50
+	vsli.64	d25,d21,#46
+	vmov	d29,d21
+	vsli.64	d26,d21,#23
+#if 7<16 && defined(__ARMEL__)
+	vrev64.8	d7,d7
+#endif
+	veor	d25,d24
+	vbsl	d29,d22,d23		@ Ch(e,f,g)
+	vshr.u64	d24,d17,#28
+	veor	d26,d25			@ Sigma1(e)
+	vadd.i64	d27,d29,d16
+	vshr.u64	d25,d17,#34
+	vsli.64	d24,d17,#36
+	vadd.i64	d27,d26
+	vshr.u64	d26,d17,#39
+	vadd.i64	d28,d7
+	vsli.64	d25,d17,#30
+	veor	d30,d17,d18
+	vsli.64	d26,d17,#25
+	veor	d16,d24,d25
+	vadd.i64	d27,d28
+	vbsl	d30,d19,d18		@ Maj(a,b,c)
+	veor	d16,d26			@ Sigma0(a)
+	vadd.i64	d20,d27
+	vadd.i64	d30,d27
+	@ vadd.i64	d16,d30
+	vshr.u64	d24,d20,#14	@ 8
+#if 8<16
+	vld1.64	{d8},[r1]!	@ handles unaligned
+#endif
+	vshr.u64	d25,d20,#18
+#if 8>0
+	vadd.i64	d16,d30			@ h+=Maj from the past
+#endif
+	vshr.u64	d26,d20,#41
+	vld1.64	{d28},[r3,:64]!	@ K[i++]
+	vsli.64	d24,d20,#50
+	vsli.64	d25,d20,#46
+	vmov	d29,d20
+	vsli.64	d26,d20,#23
+#if 8<16 && defined(__ARMEL__)
+	vrev64.8	d8,d8
+#endif
+	veor	d25,d24
+	vbsl	d29,d21,d22		@ Ch(e,f,g)
+	vshr.u64	d24,d16,#28
+	veor	d26,d25			@ Sigma1(e)
+	vadd.i64	d27,d29,d23
+	vshr.u64	d25,d16,#34
+	vsli.64	d24,d16,#36
+	vadd.i64	d27,d26
+	vshr.u64	d26,d16,#39
+	vadd.i64	d28,d8
+	vsli.64	d25,d16,#30
+	veor	d30,d16,d17
+	vsli.64	d26,d16,#25
+	veor	d23,d24,d25
+	vadd.i64	d27,d28
+	vbsl	d30,d18,d17		@ Maj(a,b,c)
+	veor	d23,d26			@ Sigma0(a)
+	vadd.i64	d19,d27
+	vadd.i64	d30,d27
+	@ vadd.i64	d23,d30
+	vshr.u64	d24,d19,#14	@ 9
+#if 9<16
+	vld1.64	{d9},[r1]!	@ handles unaligned
+#endif
+	vshr.u64	d25,d19,#18
+#if 9>0
+	vadd.i64	d23,d30			@ h+=Maj from the past
+#endif
+	vshr.u64	d26,d19,#41
+	vld1.64	{d28},[r3,:64]!	@ K[i++]
+	vsli.64	d24,d19,#50
+	vsli.64	d25,d19,#46
+	vmov	d29,d19
+	vsli.64	d26,d19,#23
+#if 9<16 && defined(__ARMEL__)
+	vrev64.8	d9,d9
+#endif
+	veor	d25,d24
+	vbsl	d29,d20,d21		@ Ch(e,f,g)
+	vshr.u64	d24,d23,#28
+	veor	d26,d25			@ Sigma1(e)
+	vadd.i64	d27,d29,d22
+	vshr.u64	d25,d23,#34
+	vsli.64	d24,d23,#36
+	vadd.i64	d27,d26
+	vshr.u64	d26,d23,#39
+	vadd.i64	d28,d9
+	vsli.64	d25,d23,#30
+	veor	d30,d23,d16
+	vsli.64	d26,d23,#25
+	veor	d22,d24,d25
+	vadd.i64	d27,d28
+	vbsl	d30,d17,d16		@ Maj(a,b,c)
+	veor	d22,d26			@ Sigma0(a)
+	vadd.i64	d18,d27
+	vadd.i64	d30,d27
+	@ vadd.i64	d22,d30
+	vshr.u64	d24,d18,#14	@ 10
+#if 10<16
+	vld1.64	{d10},[r1]!	@ handles unaligned
+#endif
+	vshr.u64	d25,d18,#18
+#if 10>0
+	vadd.i64	d22,d30			@ h+=Maj from the past
+#endif
+	vshr.u64	d26,d18,#41
+	vld1.64	{d28},[r3,:64]!	@ K[i++]
+	vsli.64	d24,d18,#50
+	vsli.64	d25,d18,#46
+	vmov	d29,d18
+	vsli.64	d26,d18,#23
+#if 10<16 && defined(__ARMEL__)
+	vrev64.8	d10,d10
+#endif
+	veor	d25,d24
+	vbsl	d29,d19,d20		@ Ch(e,f,g)
+	vshr.u64	d24,d22,#28
+	veor	d26,d25			@ Sigma1(e)
+	vadd.i64	d27,d29,d21
+	vshr.u64	d25,d22,#34
+	vsli.64	d24,d22,#36
+	vadd.i64	d27,d26
+	vshr.u64	d26,d22,#39
+	vadd.i64	d28,d10
+	vsli.64	d25,d22,#30
+	veor	d30,d22,d23
+	vsli.64	d26,d22,#25
+	veor	d21,d24,d25
+	vadd.i64	d27,d28
+	vbsl	d30,d16,d23		@ Maj(a,b,c)
+	veor	d21,d26			@ Sigma0(a)
+	vadd.i64	d17,d27
+	vadd.i64	d30,d27
+	@ vadd.i64	d21,d30
+	vshr.u64	d24,d17,#14	@ 11
+#if 11<16
+	vld1.64	{d11},[r1]!	@ handles unaligned
+#endif
+	vshr.u64	d25,d17,#18
+#if 11>0
+	vadd.i64	d21,d30			@ h+=Maj from the past
+#endif
+	vshr.u64	d26,d17,#41
+	vld1.64	{d28},[r3,:64]!	@ K[i++]
+	vsli.64	d24,d17,#50
+	vsli.64	d25,d17,#46
+	vmov	d29,d17
+	vsli.64	d26,d17,#23
+#if 11<16 && defined(__ARMEL__)
+	vrev64.8	d11,d11
+#endif
+	veor	d25,d24
+	vbsl	d29,d18,d19		@ Ch(e,f,g)
+	vshr.u64	d24,d21,#28
+	veor	d26,d25			@ Sigma1(e)
+	vadd.i64	d27,d29,d20
+	vshr.u64	d25,d21,#34
+	vsli.64	d24,d21,#36
+	vadd.i64	d27,d26
+	vshr.u64	d26,d21,#39
+	vadd.i64	d28,d11
+	vsli.64	d25,d21,#30
+	veor	d30,d21,d22
+	vsli.64	d26,d21,#25
+	veor	d20,d24,d25
+	vadd.i64	d27,d28
+	vbsl	d30,d23,d22		@ Maj(a,b,c)
+	veor	d20,d26			@ Sigma0(a)
+	vadd.i64	d16,d27
+	vadd.i64	d30,d27
+	@ vadd.i64	d20,d30
+	vshr.u64	d24,d16,#14	@ 12
+#if 12<16
+	vld1.64	{d12},[r1]!	@ handles unaligned
+#endif
+	vshr.u64	d25,d16,#18
+#if 12>0
+	vadd.i64	d20,d30			@ h+=Maj from the past
+#endif
+	vshr.u64	d26,d16,#41
+	vld1.64	{d28},[r3,:64]!	@ K[i++]
+	vsli.64	d24,d16,#50
+	vsli.64	d25,d16,#46
+	vmov	d29,d16
+	vsli.64	d26,d16,#23
+#if 12<16 && defined(__ARMEL__)
+	vrev64.8	d12,d12
+#endif
+	veor	d25,d24
+	vbsl	d29,d17,d18		@ Ch(e,f,g)
+	vshr.u64	d24,d20,#28
+	veor	d26,d25			@ Sigma1(e)
+	vadd.i64	d27,d29,d19
+	vshr.u64	d25,d20,#34
+	vsli.64	d24,d20,#36
+	vadd.i64	d27,d26
+	vshr.u64	d26,d20,#39
+	vadd.i64	d28,d12
+	vsli.64	d25,d20,#30
+	veor	d30,d20,d21
+	vsli.64	d26,d20,#25
+	veor	d19,d24,d25
+	vadd.i64	d27,d28
+	vbsl	d30,d22,d21		@ Maj(a,b,c)
+	veor	d19,d26			@ Sigma0(a)
+	vadd.i64	d23,d27
+	vadd.i64	d30,d27
+	@ vadd.i64	d19,d30
+	vshr.u64	d24,d23,#14	@ 13
+#if 13<16
+	vld1.64	{d13},[r1]!	@ handles unaligned
+#endif
+	vshr.u64	d25,d23,#18
+#if 13>0
+	vadd.i64	d19,d30			@ h+=Maj from the past
+#endif
+	vshr.u64	d26,d23,#41
+	vld1.64	{d28},[r3,:64]!	@ K[i++]
+	vsli.64	d24,d23,#50
+	vsli.64	d25,d23,#46
+	vmov	d29,d23
+	vsli.64	d26,d23,#23
+#if 13<16 && defined(__ARMEL__)
+	vrev64.8	d13,d13
+#endif
+	veor	d25,d24
+	vbsl	d29,d16,d17		@ Ch(e,f,g)
+	vshr.u64	d24,d19,#28
+	veor	d26,d25			@ Sigma1(e)
+	vadd.i64	d27,d29,d18
+	vshr.u64	d25,d19,#34
+	vsli.64	d24,d19,#36
+	vadd.i64	d27,d26
+	vshr.u64	d26,d19,#39
+	vadd.i64	d28,d13
+	vsli.64	d25,d19,#30
+	veor	d30,d19,d20
+	vsli.64	d26,d19,#25
+	veor	d18,d24,d25
+	vadd.i64	d27,d28
+	vbsl	d30,d21,d20		@ Maj(a,b,c)
+	veor	d18,d26			@ Sigma0(a)
+	vadd.i64	d22,d27
+	vadd.i64	d30,d27
+	@ vadd.i64	d18,d30
+	vshr.u64	d24,d22,#14	@ 14
+#if 14<16
+	vld1.64	{d14},[r1]!	@ handles unaligned
+#endif
+	vshr.u64	d25,d22,#18
+#if 14>0
+	vadd.i64	d18,d30			@ h+=Maj from the past
+#endif
+	vshr.u64	d26,d22,#41
+	vld1.64	{d28},[r3,:64]!	@ K[i++]
+	vsli.64	d24,d22,#50
+	vsli.64	d25,d22,#46
+	vmov	d29,d22
+	vsli.64	d26,d22,#23
+#if 14<16 && defined(__ARMEL__)
+	vrev64.8	d14,d14
+#endif
+	veor	d25,d24
+	vbsl	d29,d23,d16		@ Ch(e,f,g)
+	vshr.u64	d24,d18,#28
+	veor	d26,d25			@ Sigma1(e)
+	vadd.i64	d27,d29,d17
+	vshr.u64	d25,d18,#34
+	vsli.64	d24,d18,#36
+	vadd.i64	d27,d26
+	vshr.u64	d26,d18,#39
+	vadd.i64	d28,d14
+	vsli.64	d25,d18,#30
+	veor	d30,d18,d19
+	vsli.64	d26,d18,#25
+	veor	d17,d24,d25
+	vadd.i64	d27,d28
+	vbsl	d30,d20,d19		@ Maj(a,b,c)
+	veor	d17,d26			@ Sigma0(a)
+	vadd.i64	d21,d27
+	vadd.i64	d30,d27
+	@ vadd.i64	d17,d30
+	vshr.u64	d24,d21,#14	@ 15
+#if 15<16
+	vld1.64	{d15},[r1]!	@ handles unaligned
+#endif
+	vshr.u64	d25,d21,#18
+#if 15>0
+	vadd.i64	d17,d30			@ h+=Maj from the past
+#endif
+	vshr.u64	d26,d21,#41
+	vld1.64	{d28},[r3,:64]!	@ K[i++]
+	vsli.64	d24,d21,#50
+	vsli.64	d25,d21,#46
+	vmov	d29,d21
+	vsli.64	d26,d21,#23
+#if 15<16 && defined(__ARMEL__)
+	vrev64.8	d15,d15
+#endif
+	veor	d25,d24
+	vbsl	d29,d22,d23		@ Ch(e,f,g)
+	vshr.u64	d24,d17,#28
+	veor	d26,d25			@ Sigma1(e)
+	vadd.i64	d27,d29,d16
+	vshr.u64	d25,d17,#34
+	vsli.64	d24,d17,#36
+	vadd.i64	d27,d26
+	vshr.u64	d26,d17,#39
+	vadd.i64	d28,d15
+	vsli.64	d25,d17,#30
+	veor	d30,d17,d18
+	vsli.64	d26,d17,#25
+	veor	d16,d24,d25
+	vadd.i64	d27,d28
+	vbsl	d30,d19,d18		@ Maj(a,b,c)
+	veor	d16,d26			@ Sigma0(a)
+	vadd.i64	d20,d27
+	vadd.i64	d30,d27
+	@ vadd.i64	d16,d30
+	mov	r12,#4
+.L16_79_neon:
+	subs	r12,#1
+	vshr.u64	q12,q7,#19
+	vshr.u64	q13,q7,#61
+	vadd.i64	d16,d30			@ h+=Maj from the past
+	vshr.u64	q15,q7,#6
+	vsli.64	q12,q7,#45
+	vext.8	q14,q0,q1,#8	@ X[i+1]
+	vsli.64	q13,q7,#3
+	veor	q15,q12
+	vshr.u64	q12,q14,#1
+	veor	q15,q13				@ sigma1(X[i+14])
+	vshr.u64	q13,q14,#8
+	vadd.i64	q0,q15
+	vshr.u64	q15,q14,#7
+	vsli.64	q12,q14,#63
+	vsli.64	q13,q14,#56
+	vext.8	q14,q4,q5,#8	@ X[i+9]
+	veor	q15,q12
+	vshr.u64	d24,d20,#14		@ from NEON_00_15
+	vadd.i64	q0,q14
+	vshr.u64	d25,d20,#18		@ from NEON_00_15
+	veor	q15,q13				@ sigma0(X[i+1])
+	vshr.u64	d26,d20,#41		@ from NEON_00_15
+	vadd.i64	q0,q15
+	vld1.64	{d28},[r3,:64]!	@ K[i++]
+	vsli.64	d24,d20,#50
+	vsli.64	d25,d20,#46
+	vmov	d29,d20
+	vsli.64	d26,d20,#23
+#if 16<16 && defined(__ARMEL__)
+	vrev64.8	,
+#endif
+	veor	d25,d24
+	vbsl	d29,d21,d22		@ Ch(e,f,g)
+	vshr.u64	d24,d16,#28
+	veor	d26,d25			@ Sigma1(e)
+	vadd.i64	d27,d29,d23
+	vshr.u64	d25,d16,#34
+	vsli.64	d24,d16,#36
+	vadd.i64	d27,d26
+	vshr.u64	d26,d16,#39
+	vadd.i64	d28,d0
+	vsli.64	d25,d16,#30
+	veor	d30,d16,d17
+	vsli.64	d26,d16,#25
+	veor	d23,d24,d25
+	vadd.i64	d27,d28
+	vbsl	d30,d18,d17		@ Maj(a,b,c)
+	veor	d23,d26			@ Sigma0(a)
+	vadd.i64	d19,d27
+	vadd.i64	d30,d27
+	@ vadd.i64	d23,d30
+	vshr.u64	d24,d19,#14	@ 17
+#if 17<16
+	vld1.64	{d1},[r1]!	@ handles unaligned
+#endif
+	vshr.u64	d25,d19,#18
+#if 17>0
+	vadd.i64	d23,d30			@ h+=Maj from the past
+#endif
+	vshr.u64	d26,d19,#41
+	vld1.64	{d28},[r3,:64]!	@ K[i++]
+	vsli.64	d24,d19,#50
+	vsli.64	d25,d19,#46
+	vmov	d29,d19
+	vsli.64	d26,d19,#23
+#if 17<16 && defined(__ARMEL__)
+	vrev64.8	,
+#endif
+	veor	d25,d24
+	vbsl	d29,d20,d21		@ Ch(e,f,g)
+	vshr.u64	d24,d23,#28
+	veor	d26,d25			@ Sigma1(e)
+	vadd.i64	d27,d29,d22
+	vshr.u64	d25,d23,#34
+	vsli.64	d24,d23,#36
+	vadd.i64	d27,d26
+	vshr.u64	d26,d23,#39
+	vadd.i64	d28,d1
+	vsli.64	d25,d23,#30
+	veor	d30,d23,d16
+	vsli.64	d26,d23,#25
+	veor	d22,d24,d25
+	vadd.i64	d27,d28
+	vbsl	d30,d17,d16		@ Maj(a,b,c)
+	veor	d22,d26			@ Sigma0(a)
+	vadd.i64	d18,d27
+	vadd.i64	d30,d27
+	@ vadd.i64	d22,d30
+	vshr.u64	q12,q0,#19
+	vshr.u64	q13,q0,#61
+	vadd.i64	d22,d30			@ h+=Maj from the past
+	vshr.u64	q15,q0,#6
+	vsli.64	q12,q0,#45
+	vext.8	q14,q1,q2,#8	@ X[i+1]
+	vsli.64	q13,q0,#3
+	veor	q15,q12
+	vshr.u64	q12,q14,#1
+	veor	q15,q13				@ sigma1(X[i+14])
+	vshr.u64	q13,q14,#8
+	vadd.i64	q1,q15
+	vshr.u64	q15,q14,#7
+	vsli.64	q12,q14,#63
+	vsli.64	q13,q14,#56
+	vext.8	q14,q5,q6,#8	@ X[i+9]
+	veor	q15,q12
+	vshr.u64	d24,d18,#14		@ from NEON_00_15
+	vadd.i64	q1,q14
+	vshr.u64	d25,d18,#18		@ from NEON_00_15
+	veor	q15,q13				@ sigma0(X[i+1])
+	vshr.u64	d26,d18,#41		@ from NEON_00_15
+	vadd.i64	q1,q15
+	vld1.64	{d28},[r3,:64]!	@ K[i++]
+	vsli.64	d24,d18,#50
+	vsli.64	d25,d18,#46
+	vmov	d29,d18
+	vsli.64	d26,d18,#23
+#if 18<16 && defined(__ARMEL__)
+	vrev64.8	,
+#endif
+	veor	d25,d24
+	vbsl	d29,d19,d20		@ Ch(e,f,g)
+	vshr.u64	d24,d22,#28
+	veor	d26,d25			@ Sigma1(e)
+	vadd.i64	d27,d29,d21
+	vshr.u64	d25,d22,#34
+	vsli.64	d24,d22,#36
+	vadd.i64	d27,d26
+	vshr.u64	d26,d22,#39
+	vadd.i64	d28,d2
+	vsli.64	d25,d22,#30
+	veor	d30,d22,d23
+	vsli.64	d26,d22,#25
+	veor	d21,d24,d25
+	vadd.i64	d27,d28
+	vbsl	d30,d16,d23		@ Maj(a,b,c)
+	veor	d21,d26			@ Sigma0(a)
+	vadd.i64	d17,d27
+	vadd.i64	d30,d27
+	@ vadd.i64	d21,d30
+	vshr.u64	d24,d17,#14	@ 19
+#if 19<16
+	vld1.64	{d3},[r1]!	@ handles unaligned
+#endif
+	vshr.u64	d25,d17,#18
+#if 19>0
+	vadd.i64	d21,d30			@ h+=Maj from the past
+#endif
+	vshr.u64	d26,d17,#41
+	vld1.64	{d28},[r3,:64]!	@ K[i++]
+	vsli.64	d24,d17,#50
+	vsli.64	d25,d17,#46
+	vmov	d29,d17
+	vsli.64	d26,d17,#23
+#if 19<16 && defined(__ARMEL__)
+	vrev64.8	,
+#endif
+	veor	d25,d24
+	vbsl	d29,d18,d19		@ Ch(e,f,g)
+	vshr.u64	d24,d21,#28
+	veor	d26,d25			@ Sigma1(e)
+	vadd.i64	d27,d29,d20
+	vshr.u64	d25,d21,#34
+	vsli.64	d24,d21,#36
+	vadd.i64	d27,d26
+	vshr.u64	d26,d21,#39
+	vadd.i64	d28,d3
+	vsli.64	d25,d21,#30
+	veor	d30,d21,d22
+	vsli.64	d26,d21,#25
+	veor	d20,d24,d25
+	vadd.i64	d27,d28
+	vbsl	d30,d23,d22		@ Maj(a,b,c)
+	veor	d20,d26			@ Sigma0(a)
+	vadd.i64	d16,d27
+	vadd.i64	d30,d27
+	@ vadd.i64	d20,d30
+	vshr.u64	q12,q1,#19
+	vshr.u64	q13,q1,#61
+	vadd.i64	d20,d30			@ h+=Maj from the past
+	vshr.u64	q15,q1,#6
+	vsli.64	q12,q1,#45
+	vext.8	q14,q2,q3,#8	@ X[i+1]
+	vsli.64	q13,q1,#3
+	veor	q15,q12
+	vshr.u64	q12,q14,#1
+	veor	q15,q13				@ sigma1(X[i+14])
+	vshr.u64	q13,q14,#8
+	vadd.i64	q2,q15
+	vshr.u64	q15,q14,#7
+	vsli.64	q12,q14,#63
+	vsli.64	q13,q14,#56
+	vext.8	q14,q6,q7,#8	@ X[i+9]
+	veor	q15,q12
+	vshr.u64	d24,d16,#14		@ from NEON_00_15
+	vadd.i64	q2,q14
+	vshr.u64	d25,d16,#18		@ from NEON_00_15
+	veor	q15,q13				@ sigma0(X[i+1])
+	vshr.u64	d26,d16,#41		@ from NEON_00_15
+	vadd.i64	q2,q15
+	vld1.64	{d28},[r3,:64]!	@ K[i++]
+	vsli.64	d24,d16,#50
+	vsli.64	d25,d16,#46
+	vmov	d29,d16
+	vsli.64	d26,d16,#23
+#if 20<16 && defined(__ARMEL__)
+	vrev64.8	,
+#endif
+	veor	d25,d24
+	vbsl	d29,d17,d18		@ Ch(e,f,g)
+	vshr.u64	d24,d20,#28
+	veor	d26,d25			@ Sigma1(e)
+	vadd.i64	d27,d29,d19
+	vshr.u64	d25,d20,#34
+	vsli.64	d24,d20,#36
+	vadd.i64	d27,d26
+	vshr.u64	d26,d20,#39
+	vadd.i64	d28,d4
+	vsli.64	d25,d20,#30
+	veor	d30,d20,d21
+	vsli.64	d26,d20,#25
+	veor	d19,d24,d25
+	vadd.i64	d27,d28
+	vbsl	d30,d22,d21		@ Maj(a,b,c)
+	veor	d19,d26			@ Sigma0(a)
+	vadd.i64	d23,d27
+	vadd.i64	d30,d27
+	@ vadd.i64	d19,d30
+	vshr.u64	d24,d23,#14	@ 21
+#if 21<16
+	vld1.64	{d5},[r1]!	@ handles unaligned
+#endif
+	vshr.u64	d25,d23,#18
+#if 21>0
+	vadd.i64	d19,d30			@ h+=Maj from the past
+#endif
+	vshr.u64	d26,d23,#41
+	vld1.64	{d28},[r3,:64]!	@ K[i++]
+	vsli.64	d24,d23,#50
+	vsli.64	d25,d23,#46
+	vmov	d29,d23
+	vsli.64	d26,d23,#23
+#if 21<16 && defined(__ARMEL__)
+	vrev64.8	,
+#endif
+	veor	d25,d24
+	vbsl	d29,d16,d17		@ Ch(e,f,g)
+	vshr.u64	d24,d19,#28
+	veor	d26,d25			@ Sigma1(e)
+	vadd.i64	d27,d29,d18
+	vshr.u64	d25,d19,#34
+	vsli.64	d24,d19,#36
+	vadd.i64	d27,d26
+	vshr.u64	d26,d19,#39
+	vadd.i64	d28,d5
+	vsli.64	d25,d19,#30
+	veor	d30,d19,d20
+	vsli.64	d26,d19,#25
+	veor	d18,d24,d25
+	vadd.i64	d27,d28
+	vbsl	d30,d21,d20		@ Maj(a,b,c)
+	veor	d18,d26			@ Sigma0(a)
+	vadd.i64	d22,d27
+	vadd.i64	d30,d27
+	@ vadd.i64	d18,d30
+	vshr.u64	q12,q2,#19
+	vshr.u64	q13,q2,#61
+	vadd.i64	d18,d30			@ h+=Maj from the past
+	vshr.u64	q15,q2,#6
+	vsli.64	q12,q2,#45
+	vext.8	q14,q3,q4,#8	@ X[i+1]
+	vsli.64	q13,q2,#3
+	veor	q15,q12
+	vshr.u64	q12,q14,#1
+	veor	q15,q13				@ sigma1(X[i+14])
+	vshr.u64	q13,q14,#8
+	vadd.i64	q3,q15
+	vshr.u64	q15,q14,#7
+	vsli.64	q12,q14,#63
+	vsli.64	q13,q14,#56
+	vext.8	q14,q7,q0,#8	@ X[i+9]
+	veor	q15,q12
+	vshr.u64	d24,d22,#14		@ from NEON_00_15
+	vadd.i64	q3,q14
+	vshr.u64	d25,d22,#18		@ from NEON_00_15
+	veor	q15,q13				@ sigma0(X[i+1])
+	vshr.u64	d26,d22,#41		@ from NEON_00_15
+	vadd.i64	q3,q15
+	vld1.64	{d28},[r3,:64]!	@ K[i++]
+	vsli.64	d24,d22,#50
+	vsli.64	d25,d22,#46
+	vmov	d29,d22
+	vsli.64	d26,d22,#23
+#if 22<16 && defined(__ARMEL__)
+	vrev64.8	,
+#endif
+	veor	d25,d24
+	vbsl	d29,d23,d16		@ Ch(e,f,g)
+	vshr.u64	d24,d18,#28
+	veor	d26,d25			@ Sigma1(e)
+	vadd.i64	d27,d29,d17
+	vshr.u64	d25,d18,#34
+	vsli.64	d24,d18,#36
+	vadd.i64	d27,d26
+	vshr.u64	d26,d18,#39
+	vadd.i64	d28,d6
+	vsli.64	d25,d18,#30
+	veor	d30,d18,d19
+	vsli.64	d26,d18,#25
+	veor	d17,d24,d25
+	vadd.i64	d27,d28
+	vbsl	d30,d20,d19		@ Maj(a,b,c)
+	veor	d17,d26			@ Sigma0(a)
+	vadd.i64	d21,d27
+	vadd.i64	d30,d27
+	@ vadd.i64	d17,d30
+	vshr.u64	d24,d21,#14	@ 23
+#if 23<16
+	vld1.64	{d7},[r1]!	@ handles unaligned
+#endif
+	vshr.u64	d25,d21,#18
+#if 23>0
+	vadd.i64	d17,d30			@ h+=Maj from the past
+#endif
+	vshr.u64	d26,d21,#41
+	vld1.64	{d28},[r3,:64]!	@ K[i++]
+	vsli.64	d24,d21,#50
+	vsli.64	d25,d21,#46
+	vmov	d29,d21
+	vsli.64	d26,d21,#23
+#if 23<16 && defined(__ARMEL__)
+	vrev64.8	,
+#endif
+	veor	d25,d24
+	vbsl	d29,d22,d23		@ Ch(e,f,g)
+	vshr.u64	d24,d17,#28
+	veor	d26,d25			@ Sigma1(e)
+	vadd.i64	d27,d29,d16
+	vshr.u64	d25,d17,#34
+	vsli.64	d24,d17,#36
+	vadd.i64	d27,d26
+	vshr.u64	d26,d17,#39
+	vadd.i64	d28,d7
+	vsli.64	d25,d17,#30
+	veor	d30,d17,d18
+	vsli.64	d26,d17,#25
+	veor	d16,d24,d25
+	vadd.i64	d27,d28
+	vbsl	d30,d19,d18		@ Maj(a,b,c)
+	veor	d16,d26			@ Sigma0(a)
+	vadd.i64	d20,d27
+	vadd.i64	d30,d27
+	@ vadd.i64	d16,d30
+	vshr.u64	q12,q3,#19
+	vshr.u64	q13,q3,#61
+	vadd.i64	d16,d30			@ h+=Maj from the past
+	vshr.u64	q15,q3,#6
+	vsli.64	q12,q3,#45
+	vext.8	q14,q4,q5,#8	@ X[i+1]
+	vsli.64	q13,q3,#3
+	veor	q15,q12
+	vshr.u64	q12,q14,#1
+	veor	q15,q13				@ sigma1(X[i+14])
+	vshr.u64	q13,q14,#8
+	vadd.i64	q4,q15
+	vshr.u64	q15,q14,#7
+	vsli.64	q12,q14,#63
+	vsli.64	q13,q14,#56
+	vext.8	q14,q0,q1,#8	@ X[i+9]
+	veor	q15,q12
+	vshr.u64	d24,d20,#14		@ from NEON_00_15
+	vadd.i64	q4,q14
+	vshr.u64	d25,d20,#18		@ from NEON_00_15
+	veor	q15,q13				@ sigma0(X[i+1])
+	vshr.u64	d26,d20,#41		@ from NEON_00_15
+	vadd.i64	q4,q15
+	vld1.64	{d28},[r3,:64]!	@ K[i++]
+	vsli.64	d24,d20,#50
+	vsli.64	d25,d20,#46
+	vmov	d29,d20
+	vsli.64	d26,d20,#23
+#if 24<16 && defined(__ARMEL__)
+	vrev64.8	,
+#endif
+	veor	d25,d24
+	vbsl	d29,d21,d22		@ Ch(e,f,g)
+	vshr.u64	d24,d16,#28
+	veor	d26,d25			@ Sigma1(e)
+	vadd.i64	d27,d29,d23
+	vshr.u64	d25,d16,#34
+	vsli.64	d24,d16,#36
+	vadd.i64	d27,d26
+	vshr.u64	d26,d16,#39
+	vadd.i64	d28,d8
+	vsli.64	d25,d16,#30
+	veor	d30,d16,d17
+	vsli.64	d26,d16,#25
+	veor	d23,d24,d25
+	vadd.i64	d27,d28
+	vbsl	d30,d18,d17		@ Maj(a,b,c)
+	veor	d23,d26			@ Sigma0(a)
+	vadd.i64	d19,d27
+	vadd.i64	d30,d27
+	@ vadd.i64	d23,d30
+	vshr.u64	d24,d19,#14	@ 25
+#if 25<16
+	vld1.64	{d9},[r1]!	@ handles unaligned
+#endif
+	vshr.u64	d25,d19,#18
+#if 25>0
+	vadd.i64	d23,d30			@ h+=Maj from the past
+#endif
+	vshr.u64	d26,d19,#41
+	vld1.64	{d28},[r3,:64]!	@ K[i++]
+	vsli.64	d24,d19,#50
+	vsli.64	d25,d19,#46
+	vmov	d29,d19
+	vsli.64	d26,d19,#23
+#if 25<16 && defined(__ARMEL__)
+	vrev64.8	,
+#endif
+	veor	d25,d24
+	vbsl	d29,d20,d21		@ Ch(e,f,g)
+	vshr.u64	d24,d23,#28
+	veor	d26,d25			@ Sigma1(e)
+	vadd.i64	d27,d29,d22
+	vshr.u64	d25,d23,#34
+	vsli.64	d24,d23,#36
+	vadd.i64	d27,d26
+	vshr.u64	d26,d23,#39
+	vadd.i64	d28,d9
+	vsli.64	d25,d23,#30
+	veor	d30,d23,d16
+	vsli.64	d26,d23,#25
+	veor	d22,d24,d25
+	vadd.i64	d27,d28
+	vbsl	d30,d17,d16		@ Maj(a,b,c)
+	veor	d22,d26			@ Sigma0(a)
+	vadd.i64	d18,d27
+	vadd.i64	d30,d27
+	@ vadd.i64	d22,d30
+	vshr.u64	q12,q4,#19
+	vshr.u64	q13,q4,#61
+	vadd.i64	d22,d30			@ h+=Maj from the past
+	vshr.u64	q15,q4,#6
+	vsli.64	q12,q4,#45
+	vext.8	q14,q5,q6,#8	@ X[i+1]
+	vsli.64	q13,q4,#3
+	veor	q15,q12
+	vshr.u64	q12,q14,#1
+	veor	q15,q13				@ sigma1(X[i+14])
+	vshr.u64	q13,q14,#8
+	vadd.i64	q5,q15
+	vshr.u64	q15,q14,#7
+	vsli.64	q12,q14,#63
+	vsli.64	q13,q14,#56
+	vext.8	q14,q1,q2,#8	@ X[i+9]
+	veor	q15,q12
+	vshr.u64	d24,d18,#14		@ from NEON_00_15
+	vadd.i64	q5,q14
+	vshr.u64	d25,d18,#18		@ from NEON_00_15
+	veor	q15,q13				@ sigma0(X[i+1])
+	vshr.u64	d26,d18,#41		@ from NEON_00_15
+	vadd.i64	q5,q15
+	vld1.64	{d28},[r3,:64]!	@ K[i++]
+	vsli.64	d24,d18,#50
+	vsli.64	d25,d18,#46
+	vmov	d29,d18
+	vsli.64	d26,d18,#23
+#if 26<16 && defined(__ARMEL__)
+	vrev64.8	,
+#endif
+	veor	d25,d24
+	vbsl	d29,d19,d20		@ Ch(e,f,g)
+	vshr.u64	d24,d22,#28
+	veor	d26,d25			@ Sigma1(e)
+	vadd.i64	d27,d29,d21
+	vshr.u64	d25,d22,#34
+	vsli.64	d24,d22,#36
+	vadd.i64	d27,d26
+	vshr.u64	d26,d22,#39
+	vadd.i64	d28,d10
+	vsli.64	d25,d22,#30
+	veor	d30,d22,d23
+	vsli.64	d26,d22,#25
+	veor	d21,d24,d25
+	vadd.i64	d27,d28
+	vbsl	d30,d16,d23		@ Maj(a,b,c)
+	veor	d21,d26			@ Sigma0(a)
+	vadd.i64	d17,d27
+	vadd.i64	d30,d27
+	@ vadd.i64	d21,d30
+	vshr.u64	d24,d17,#14	@ 27
+#if 27<16
+	vld1.64	{d11},[r1]!	@ handles unaligned
+#endif
+	vshr.u64	d25,d17,#18
+#if 27>0
+	vadd.i64	d21,d30			@ h+=Maj from the past
+#endif
+	vshr.u64	d26,d17,#41
+	vld1.64	{d28},[r3,:64]!	@ K[i++]
+	vsli.64	d24,d17,#50
+	vsli.64	d25,d17,#46
+	vmov	d29,d17
+	vsli.64	d26,d17,#23
+#if 27<16 && defined(__ARMEL__)
+	vrev64.8	,
+#endif
+	veor	d25,d24
+	vbsl	d29,d18,d19		@ Ch(e,f,g)
+	vshr.u64	d24,d21,#28
+	veor	d26,d25			@ Sigma1(e)
+	vadd.i64	d27,d29,d20
+	vshr.u64	d25,d21,#34
+	vsli.64	d24,d21,#36
+	vadd.i64	d27,d26
+	vshr.u64	d26,d21,#39
+	vadd.i64	d28,d11
+	vsli.64	d25,d21,#30
+	veor	d30,d21,d22
+	vsli.64	d26,d21,#25
+	veor	d20,d24,d25
+	vadd.i64	d27,d28
+	vbsl	d30,d23,d22		@ Maj(a,b,c)
+	veor	d20,d26			@ Sigma0(a)
+	vadd.i64	d16,d27
+	vadd.i64	d30,d27
+	@ vadd.i64	d20,d30
+	vshr.u64	q12,q5,#19
+	vshr.u64	q13,q5,#61
+	vadd.i64	d20,d30			@ h+=Maj from the past
+	vshr.u64	q15,q5,#6
+	vsli.64	q12,q5,#45
+	vext.8	q14,q6,q7,#8	@ X[i+1]
+	vsli.64	q13,q5,#3
+	veor	q15,q12
+	vshr.u64	q12,q14,#1
+	veor	q15,q13				@ sigma1(X[i+14])
+	vshr.u64	q13,q14,#8
+	vadd.i64	q6,q15
+	vshr.u64	q15,q14,#7
+	vsli.64	q12,q14,#63
+	vsli.64	q13,q14,#56
+	vext.8	q14,q2,q3,#8	@ X[i+9]
+	veor	q15,q12
+	vshr.u64	d24,d16,#14		@ from NEON_00_15
+	vadd.i64	q6,q14
+	vshr.u64	d25,d16,#18		@ from NEON_00_15
+	veor	q15,q13				@ sigma0(X[i+1])
+	vshr.u64	d26,d16,#41		@ from NEON_00_15
+	vadd.i64	q6,q15
+	vld1.64	{d28},[r3,:64]!	@ K[i++]
+	vsli.64	d24,d16,#50
+	vsli.64	d25,d16,#46
+	vmov	d29,d16
+	vsli.64	d26,d16,#23
+#if 28<16 && defined(__ARMEL__)
+	vrev64.8	,
+#endif
+	veor	d25,d24
+	vbsl	d29,d17,d18		@ Ch(e,f,g)
+	vshr.u64	d24,d20,#28
+	veor	d26,d25			@ Sigma1(e)
+	vadd.i64	d27,d29,d19
+	vshr.u64	d25,d20,#34
+	vsli.64	d24,d20,#36
+	vadd.i64	d27,d26
+	vshr.u64	d26,d20,#39
+	vadd.i64	d28,d12
+	vsli.64	d25,d20,#30
+	veor	d30,d20,d21
+	vsli.64	d26,d20,#25
+	veor	d19,d24,d25
+	vadd.i64	d27,d28
+	vbsl	d30,d22,d21		@ Maj(a,b,c)
+	veor	d19,d26			@ Sigma0(a)
+	vadd.i64	d23,d27
+	vadd.i64	d30,d27
+	@ vadd.i64	d19,d30
+	vshr.u64	d24,d23,#14	@ 29
+#if 29<16
+	vld1.64	{d13},[r1]!	@ handles unaligned
+#endif
+	vshr.u64	d25,d23,#18
+#if 29>0
+	vadd.i64	d19,d30			@ h+=Maj from the past
+#endif
+	vshr.u64	d26,d23,#41
+	vld1.64	{d28},[r3,:64]!	@ K[i++]
+	vsli.64	d24,d23,#50
+	vsli.64	d25,d23,#46
+	vmov	d29,d23
+	vsli.64	d26,d23,#23
+#if 29<16 && defined(__ARMEL__)
+	vrev64.8	,
+#endif
+	veor	d25,d24
+	vbsl	d29,d16,d17		@ Ch(e,f,g)
+	vshr.u64	d24,d19,#28
+	veor	d26,d25			@ Sigma1(e)
+	vadd.i64	d27,d29,d18
+	vshr.u64	d25,d19,#34
+	vsli.64	d24,d19,#36
+	vadd.i64	d27,d26
+	vshr.u64	d26,d19,#39
+	vadd.i64	d28,d13
+	vsli.64	d25,d19,#30
+	veor	d30,d19,d20
+	vsli.64	d26,d19,#25
+	veor	d18,d24,d25
+	vadd.i64	d27,d28
+	vbsl	d30,d21,d20		@ Maj(a,b,c)
+	veor	d18,d26			@ Sigma0(a)
+	vadd.i64	d22,d27
+	vadd.i64	d30,d27
+	@ vadd.i64	d18,d30
+	vshr.u64	q12,q6,#19
+	vshr.u64	q13,q6,#61
+	vadd.i64	d18,d30			@ h+=Maj from the past
+	vshr.u64	q15,q6,#6
+	vsli.64	q12,q6,#45
+	vext.8	q14,q7,q0,#8	@ X[i+1]
+	vsli.64	q13,q6,#3
+	veor	q15,q12
+	vshr.u64	q12,q14,#1
+	veor	q15,q13				@ sigma1(X[i+14])
+	vshr.u64	q13,q14,#8
+	vadd.i64	q7,q15
+	vshr.u64	q15,q14,#7
+	vsli.64	q12,q14,#63
+	vsli.64	q13,q14,#56
+	vext.8	q14,q3,q4,#8	@ X[i+9]
+	veor	q15,q12
+	vshr.u64	d24,d22,#14		@ from NEON_00_15
+	vadd.i64	q7,q14
+	vshr.u64	d25,d22,#18		@ from NEON_00_15
+	veor	q15,q13				@ sigma0(X[i+1])
+	vshr.u64	d26,d22,#41		@ from NEON_00_15
+	vadd.i64	q7,q15
+	vld1.64	{d28},[r3,:64]!	@ K[i++]
+	vsli.64	d24,d22,#50
+	vsli.64	d25,d22,#46
+	vmov	d29,d22
+	vsli.64	d26,d22,#23
+#if 30<16 && defined(__ARMEL__)
+	vrev64.8	,
+#endif
+	veor	d25,d24
+	vbsl	d29,d23,d16		@ Ch(e,f,g)
+	vshr.u64	d24,d18,#28
+	veor	d26,d25			@ Sigma1(e)
+	vadd.i64	d27,d29,d17
+	vshr.u64	d25,d18,#34
+	vsli.64	d24,d18,#36
+	vadd.i64	d27,d26
+	vshr.u64	d26,d18,#39
+	vadd.i64	d28,d14
+	vsli.64	d25,d18,#30
+	veor	d30,d18,d19
+	vsli.64	d26,d18,#25
+	veor	d17,d24,d25
+	vadd.i64	d27,d28
+	vbsl	d30,d20,d19		@ Maj(a,b,c)
+	veor	d17,d26			@ Sigma0(a)
+	vadd.i64	d21,d27
+	vadd.i64	d30,d27
+	@ vadd.i64	d17,d30
+	vshr.u64	d24,d21,#14	@ 31
+#if 31<16
+	vld1.64	{d15},[r1]!	@ handles unaligned
+#endif
+	vshr.u64	d25,d21,#18
+#if 31>0
+	vadd.i64	d17,d30			@ h+=Maj from the past
+#endif
+	vshr.u64	d26,d21,#41
+	vld1.64	{d28},[r3,:64]!	@ K[i++]
+	vsli.64	d24,d21,#50
+	vsli.64	d25,d21,#46
+	vmov	d29,d21
+	vsli.64	d26,d21,#23
+#if 31<16 && defined(__ARMEL__)
+	vrev64.8	,
+#endif
+	veor	d25,d24
+	vbsl	d29,d22,d23		@ Ch(e,f,g)
+	vshr.u64	d24,d17,#28
+	veor	d26,d25			@ Sigma1(e)
+	vadd.i64	d27,d29,d16
+	vshr.u64	d25,d17,#34
+	vsli.64	d24,d17,#36
+	vadd.i64	d27,d26
+	vshr.u64	d26,d17,#39
+	vadd.i64	d28,d15
+	vsli.64	d25,d17,#30
+	veor	d30,d17,d18
+	vsli.64	d26,d17,#25
+	veor	d16,d24,d25
+	vadd.i64	d27,d28
+	vbsl	d30,d19,d18		@ Maj(a,b,c)
+	veor	d16,d26			@ Sigma0(a)
+	vadd.i64	d20,d27
+	vadd.i64	d30,d27
+	@ vadd.i64	d16,d30
+	bne	.L16_79_neon
+
+	vadd.i64	d16,d30		@ h+=Maj from the past
+	vldmia	r0,{d24,d25,d26,d27,d28,d29,d30,d31}	@ load context to temp
+	vadd.i64	q8,q12		@ vectorized accumulate
+	vadd.i64	q9,q13
+	vadd.i64	q10,q14
+	vadd.i64	q11,q15
+	vstmia	r0,{d16,d17,d18,d19,d20,d21,d22,d23}	@ save context
+	teq	r1,r2
+	sub	r3,#640	@ rewind K512
+	bne	.Loop_neon
+
+	VFP_ABI_POP
+	bx	lr				@ .word	0xe12fff1e
+.size	sha512_block_data_order_neon,.-sha512_block_data_order_neon
+#endif
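+@ The Sigma/sigma/Ch/Maj tags in the round comments above refer to the usual
+@ SHA-512 round functions (FIPS 180-4). A minimal reference sketch, assuming
+@ 64-bit words:
+@
+@   MASK = (1 << 64) - 1
+@   def ror64(x, n):
+@       return ((x >> n) | (x << (64 - n))) & MASK
+@   def Sigma0(a): return ror64(a, 28) ^ ror64(a, 34) ^ ror64(a, 39)
+@   def Sigma1(e): return ror64(e, 14) ^ ror64(e, 18) ^ ror64(e, 41)
+@   def sigma0(x): return ror64(x, 1) ^ ror64(x, 8) ^ (x >> 7)
+@   def sigma1(x): return ror64(x, 19) ^ ror64(x, 61) ^ (x >> 6)
+@   def Ch(e, f, g): return ((e & f) ^ (~e & g)) & MASK
+@   def Maj(a, b, c): return (a & b) ^ (a & c) ^ (b & c)
+@
+@ Each vshr.u64/vsli.64 pair above builds one of these rotates (vshr #n plus
+@ vsli #(64-n) is a 64-bit rotate right by n), and the "h+=Maj from the past"
+@ lines fold the previous round's Maj term into the following round.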
+.byte	83,72,65,53,49,50,32,98,108,111,99,107,32,116,114,97,110,115,102,111,114,109,32,102,111,114,32,65,82,77,118,52,47,78,69,79,78,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0
+.align	2
+.align	2
+#if __ARM_MAX_ARCH__>=7 && !defined(__KERNEL__)
+.comm	OPENSSL_armcap_P,4,4
+.hidden	OPENSSL_armcap_P
+#endif
+#endif
+#endif  // !OPENSSL_NO_ASM
+.section	.note.GNU-stack,"",%progbits
--- /dev/null
+++ b/third_party/boringssl/linux-arm/crypto/fipsmodule/vpaes-armv7.S
@@ -1,0 +1,1236 @@
+// This file is generated from a similarly-named Perl script in the BoringSSL
+// source tree. Do not edit by hand.
+
+#if !defined(__has_feature)
+#define __has_feature(x) 0
+#endif
+#if __has_feature(memory_sanitizer) && !defined(OPENSSL_NO_ASM)
+#define OPENSSL_NO_ASM
+#endif
+
+#if !defined(OPENSSL_NO_ASM)
+#if defined(__arm__)
+#if defined(BORINGSSL_PREFIX)
+#include <boringssl_prefix_symbols_asm.h>
+#endif
+.syntax	unified
+
+.arch	armv7-a
+.fpu	neon
+
+#if defined(__thumb2__)
+.thumb
+#else
+.code	32
+#endif
+
+.text
+
+.type	_vpaes_consts,%object
+.align	7	@ totally strategic alignment
+_vpaes_consts:
+.Lk_mc_forward:@ mc_forward
+.quad	0x0407060500030201, 0x0C0F0E0D080B0A09
+.quad	0x080B0A0904070605, 0x000302010C0F0E0D
+.quad	0x0C0F0E0D080B0A09, 0x0407060500030201
+.quad	0x000302010C0F0E0D, 0x080B0A0904070605
+.Lk_mc_backward:@ mc_backward
+.quad	0x0605040702010003, 0x0E0D0C0F0A09080B
+.quad	0x020100030E0D0C0F, 0x0A09080B06050407
+.quad	0x0E0D0C0F0A09080B, 0x0605040702010003
+.quad	0x0A09080B06050407, 0x020100030E0D0C0F
+.Lk_sr:@ sr
+.quad	0x0706050403020100, 0x0F0E0D0C0B0A0908
+.quad	0x030E09040F0A0500, 0x0B06010C07020D08
+.quad	0x0F060D040B020900, 0x070E050C030A0108
+.quad	0x0B0E0104070A0D00, 0x0306090C0F020508
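+@ Each 16-byte row of .Lk_sr above is the byte permutation i -> ((4*j+1)*i)
+@ mod 16 for row j; row 1 is the usual AES ShiftRows permutation on a
+@ column-major state. As an illustrative cross-check, in the style of the
+@ table-generation script further below:
+@
+@   for j in range(4):
+@       t = bytes((i * (4 * j + 1)) % 16 for i in range(16))
+@       print("	.quad	0x%016X, 0x%016X" % (
+@           int.from_bytes(t[:8], "little"), int.from_bytes(t[8:], "little")))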
+
+@
+@ "Hot" constants
+@
+.Lk_inv:@ inv, inva
+.quad	0x0E05060F0D080180, 0x040703090A0B0C02
+.quad	0x01040A060F0B0780, 0x030D0E0C02050809
+.Lk_ipt:@ input transform (lo, hi)
+.quad	0xC2B2E8985A2A7000, 0xCABAE09052227808
+.quad	0x4C01307D317C4D00, 0xCD80B1FCB0FDCC81
+.Lk_sbo:@ sbou, sbot
+.quad	0xD0D26D176FBDC700, 0x15AABF7AC502A878
+.quad	0xCFE474A55FBB6A00, 0x8E1E90D1412B35FA
+.Lk_sb1:@ sb1u, sb1t
+.quad	0x3618D415FAE22300, 0x3BF7CCC10D2ED9EF
+.quad	0xB19BE18FCB503E00, 0xA5DF7A6E142AF544
+.Lk_sb2:@ sb2u, sb2t
+.quad	0x69EB88400AE12900, 0xC2A163C8AB82234A
+.quad	0xE27A93C60B712400, 0x5EB7E955BC982FCD
+
+.byte	86,101,99,116,111,114,32,80,101,114,109,117,116,97,116,105,111,110,32,65,69,83,32,102,111,114,32,65,82,77,118,55,32,78,69,79,78,44,32,77,105,107,101,32,72,97,109,98,117,114,103,32,40,83,116,97,110,102,111,114,100,32,85,110,105,118,101,114,115,105,116,121,41,0
+.align	2
+.size	_vpaes_consts,.-_vpaes_consts
+.align	6
+@@
+@@  _aes_preheat
+@@
+@@  Fills q9-q15 as specified below.
+@@
+.type	_vpaes_preheat,%function
+.align	4
+_vpaes_preheat:
+	adr	r10, .Lk_inv
+	vmov.i8	q9, #0x0f		@ .Lk_s0F
+	vld1.64	{q10,q11}, [r10]!	@ .Lk_inv
+	add	r10, r10, #64		@ Skip .Lk_ipt, .Lk_sbo
+	vld1.64	{q12,q13}, [r10]!	@ .Lk_sb1
+	vld1.64	{q14,q15}, [r10]	@ .Lk_sb2
+	bx	lr
+
+@@
+@@  _aes_encrypt_core
+@@
+@@  AES-encrypt q0.
+@@
+@@  Inputs:
+@@     q0 = input
+@@     q9-q15 as in _vpaes_preheat
+@@    [r2] = scheduled keys
+@@
+@@  Output in q0
+@@  Clobbers  q1-q5, r8-r11
+@@  Preserves q6-q8 so you get some local vectors
+@@
+@@
+.type	_vpaes_encrypt_core,%function
+.align	4
+_vpaes_encrypt_core:
+	mov	r9, r2
+	ldr	r8, [r2,#240]		@ pull rounds
+	adr	r11, .Lk_ipt
+	@ vmovdqa	.Lk_ipt(%rip),	%xmm2	# iptlo
+	@ vmovdqa	.Lk_ipt+16(%rip), %xmm3	# ipthi
+	vld1.64	{q2, q3}, [r11]
+	adr	r11, .Lk_mc_forward+16
+	vld1.64	{q5}, [r9]!		@ vmovdqu	(%r9),	%xmm5		# round0 key
+	vand	q1, q0, q9		@ vpand	%xmm9,	%xmm0,	%xmm1
+	vshr.u8	q0, q0, #4		@ vpsrlb	$4,	%xmm0,	%xmm0
+	vtbl.8	d2, {q2}, d2	@ vpshufb	%xmm1,	%xmm2,	%xmm1
+	vtbl.8	d3, {q2}, d3
+	vtbl.8	d4, {q3}, d0	@ vpshufb	%xmm0,	%xmm3,	%xmm2
+	vtbl.8	d5, {q3}, d1
+	veor	q0, q1, q5		@ vpxor	%xmm5,	%xmm1,	%xmm0
+	veor	q0, q0, q2		@ vpxor	%xmm2,	%xmm0,	%xmm0
+
+	@ .Lenc_entry ends with a bne instruction which is normally paired with
+	@ subs in .Lenc_loop.
+	tst	r8, r8
+	b	.Lenc_entry
+
+.align	4
+.Lenc_loop:
+	@ middle of middle round
+	add	r10, r11, #0x40
+	vtbl.8	d8, {q13}, d4	@ vpshufb	%xmm2,	%xmm13,	%xmm4	# 4 = sb1u
+	vtbl.8	d9, {q13}, d5
+	vld1.64	{q1}, [r11]!		@ vmovdqa	-0x40(%r11,%r10), %xmm1	# .Lk_mc_forward[]
+	vtbl.8	d0, {q12}, d6	@ vpshufb	%xmm3,	%xmm12,	%xmm0	# 0 = sb1t
+	vtbl.8	d1, {q12}, d7
+	veor	q4, q4, q5		@ vpxor		%xmm5,	%xmm4,	%xmm4	# 4 = sb1u + k
+	vtbl.8	d10, {q15}, d4	@ vpshufb	%xmm2,	%xmm15,	%xmm5	# 4 = sb2u
+	vtbl.8	d11, {q15}, d5
+	veor	q0, q0, q4		@ vpxor		%xmm4,	%xmm0,	%xmm0	# 0 = A
+	vtbl.8	d4, {q14}, d6	@ vpshufb	%xmm3,	%xmm14,	%xmm2	# 2 = sb2t
+	vtbl.8	d5, {q14}, d7
+	vld1.64	{q4}, [r10]		@ vmovdqa	(%r11,%r10), %xmm4	# .Lk_mc_backward[]
+	vtbl.8	d6, {q0}, d2	@ vpshufb	%xmm1,	%xmm0,	%xmm3	# 0 = B
+	vtbl.8	d7, {q0}, d3
+	veor	q2, q2, q5		@ vpxor		%xmm5,	%xmm2,	%xmm2	# 2 = 2A
+	@ Write to q5 instead of q0, so the table and destination registers do
+	@ not overlap.
+	vtbl.8	d10, {q0}, d8	@ vpshufb	%xmm4,	%xmm0,	%xmm0	# 3 = D
+	vtbl.8	d11, {q0}, d9
+	veor	q3, q3, q2		@ vpxor		%xmm2,	%xmm3,	%xmm3	# 0 = 2A+B
+	vtbl.8	d8, {q3}, d2	@ vpshufb	%xmm1,	%xmm3,	%xmm4	# 0 = 2B+C
+	vtbl.8	d9, {q3}, d3
+	@ Here we restore the original q0/q5 usage.
+	veor	q0, q5, q3		@ vpxor		%xmm3,	%xmm0,	%xmm0	# 3 = 2A+B+D
+	and	r11, r11, #~(1<<6)	@ and		$0x30,	%r11		# ... mod 4
+	veor	q0, q0, q4		@ vpxor		%xmm4,	%xmm0, %xmm0	# 0 = 2A+3B+C+D
+	subs	r8, r8, #1		@ nr--
+
+.Lenc_entry:
+	@ top of round
+	vand	q1, q0, q9		@ vpand		%xmm0,	%xmm9,	%xmm1   # 0 = k
+	vshr.u8	q0, q0, #4		@ vpsrlb	$4,	%xmm0,	%xmm0	# 1 = i
+	vtbl.8	d10, {q11}, d2	@ vpshufb	%xmm1,	%xmm11,	%xmm5	# 2 = a/k
+	vtbl.8	d11, {q11}, d3
+	veor	q1, q1, q0		@ vpxor		%xmm0,	%xmm1,	%xmm1	# 0 = j
+	vtbl.8	d6, {q10}, d0	@ vpshufb	%xmm0, 	%xmm10,	%xmm3  	# 3 = 1/i
+	vtbl.8	d7, {q10}, d1
+	vtbl.8	d8, {q10}, d2	@ vpshufb	%xmm1, 	%xmm10,	%xmm4  	# 4 = 1/j
+	vtbl.8	d9, {q10}, d3
+	veor	q3, q3, q5		@ vpxor		%xmm5,	%xmm3,	%xmm3	# 3 = iak = 1/i + a/k
+	veor	q4, q4, q5		@ vpxor		%xmm5,	%xmm4,	%xmm4  	# 4 = jak = 1/j + a/k
+	vtbl.8	d4, {q10}, d6	@ vpshufb	%xmm3,	%xmm10,	%xmm2  	# 2 = 1/iak
+	vtbl.8	d5, {q10}, d7
+	vtbl.8	d6, {q10}, d8	@ vpshufb	%xmm4,	%xmm10,	%xmm3	# 3 = 1/jak
+	vtbl.8	d7, {q10}, d9
+	veor	q2, q2, q1		@ vpxor		%xmm1,	%xmm2,	%xmm2  	# 2 = io
+	veor	q3, q3, q0		@ vpxor		%xmm0,	%xmm3,	%xmm3	# 3 = jo
+	vld1.64	{q5}, [r9]!		@ vmovdqu	(%r9),	%xmm5
+	bne	.Lenc_loop
+
+	@ middle of last round
+	add	r10, r11, #0x80
+
+	adr	r11, .Lk_sbo
+	@ Read to q1 instead of q4, so the vtbl.8 instruction below does not
+	@ overlap table and destination registers.
+	vld1.64	{q1}, [r11]!		@ vmovdqa	-0x60(%r10), %xmm4	# 3 : sbou
+	vld1.64	{q0}, [r11]		@ vmovdqa	-0x50(%r10), %xmm0	# 0 : sbot	.Lk_sbo+16
+	vtbl.8	d8, {q1}, d4	@ vpshufb	%xmm2,	%xmm4,	%xmm4	# 4 = sbou
+	vtbl.8	d9, {q1}, d5
+	vld1.64	{q1}, [r10]		@ vmovdqa	0x40(%r11,%r10), %xmm1	# .Lk_sr[]
+	@ Write to q2 instead of q0 below, to avoid overlapping table and
+	@ destination registers.
+	vtbl.8	d4, {q0}, d6	@ vpshufb	%xmm3,	%xmm0,	%xmm0	# 0 = sb1t
+	vtbl.8	d5, {q0}, d7
+	veor	q4, q4, q5		@ vpxor	%xmm5,	%xmm4,	%xmm4	# 4 = sb1u + k
+	veor	q2, q2, q4		@ vpxor	%xmm4,	%xmm0,	%xmm0	# 0 = A
+	@ Here we restore the original q0/q2 usage.
+	vtbl.8	d0, {q2}, d2	@ vpshufb	%xmm1,	%xmm0,	%xmm0
+	vtbl.8	d1, {q2}, d3
+	bx	lr
+.size	_vpaes_encrypt_core,.-_vpaes_encrypt_core
+
+.globl	vpaes_encrypt
+.hidden	vpaes_encrypt
+.type	vpaes_encrypt,%function
+.align	4
+vpaes_encrypt:
+	@ _vpaes_encrypt_core uses r8-r11. Round up to r7-r11 to maintain stack
+	@ alignment.
+	stmdb	sp!, {r7,r8,r9,r10,r11,lr}
+	@ _vpaes_encrypt_core uses q4-q5 (d8-d11), which are callee-saved.
+	vstmdb	sp!, {d8,d9,d10,d11}
+
+	vld1.64	{q0}, [r0]
+	bl	_vpaes_preheat
+	bl	_vpaes_encrypt_core
+	vst1.64	{q0}, [r1]
+
+	vldmia	sp!, {d8,d9,d10,d11}
+	ldmia	sp!, {r7,r8,r9,r10,r11, pc}	@ return
+.size	vpaes_encrypt,.-vpaes_encrypt
+
+@
+@  Decryption stuff
+@
+.type	_vpaes_decrypt_consts,%object
+.align	4
+_vpaes_decrypt_consts:
+.Lk_dipt:@ decryption input transform
+.quad	0x0F505B040B545F00, 0x154A411E114E451A
+.quad	0x86E383E660056500, 0x12771772F491F194
+.Lk_dsbo:@ decryption sbox final output
+.quad	0x1387EA537EF94000, 0xC7AA6DB9D4943E2D
+.quad	0x12D7560F93441D00, 0xCA4B8159D8C58E9C
+.Lk_dsb9:@ decryption sbox output *9*u, *9*t
+.quad	0x851C03539A86D600, 0xCAD51F504F994CC9
+.quad	0xC03B1789ECD74900, 0x725E2C9EB2FBA565
+.Lk_dsbd:@ decryption sbox output *D*u, *D*t
+.quad	0x7D57CCDFE6B1A200, 0xF56E9B13882A4439
+.quad	0x3CE2FAF724C6CB00, 0x2931180D15DEEFD3
+.Lk_dsbb:@ decryption sbox output *B*u, *B*t
+.quad	0xD022649296B44200, 0x602646F6B0F2D404
+.quad	0xC19498A6CD596700, 0xF3FF0C3E3255AA6B
+.Lk_dsbe:@ decryption sbox output *E*u, *E*t
+.quad	0x46F2929626D4D000, 0x2242600464B4F6B0
+.quad	0x0C55A6CDFFAAC100, 0x9467F36B98593E32
+.size	_vpaes_decrypt_consts,.-_vpaes_decrypt_consts
+
+@@
+@@  Decryption core
+@@
+@@  Same API as encryption core, except it clobbers q12-q15 rather than using
+@@  the values from _vpaes_preheat. q9-q11 must still be set from
+@@  _vpaes_preheat.
+@@
+.type	_vpaes_decrypt_core,%function
+.align	4
+_vpaes_decrypt_core:
+	mov	r9, r2
+	ldr	r8, [r2,#240]		@ pull rounds
+
+	@ This function performs shuffles with various constants. The x86_64
+	@ version loads them on-demand into %xmm0-%xmm5. This does not work well
+	@ for ARMv7 because those registers are shuffle destinations. The ARMv8
+	@ version preloads those constants into registers, but ARMv7 has half
+	@ the registers to work with. Instead, we load them on-demand into
+	@ q12-q15, registers normally used for preloaded constants. This is fine
+	@ because decryption doesn't use those constants. The values are
+	@ constant, so this does not interfere with potential 2x optimizations.
+	adr	r7, .Lk_dipt
+
+	vld1.64	{q12,q13}, [r7]		@ vmovdqa	.Lk_dipt(%rip), %xmm2	# iptlo
+	lsl	r11, r8, #4		@ mov		%rax,	%r11;	shl	$4, %r11
+	eor	r11, r11, #0x30		@ xor		$0x30,	%r11
+	adr	r10, .Lk_sr
+	and	r11, r11, #0x30		@ and		$0x30,	%r11
+	add	r11, r11, r10
+	adr	r10, .Lk_mc_forward+48
+
+	vld1.64	{q4}, [r9]!		@ vmovdqu	(%r9),	%xmm4		# round0 key
+	vand	q1, q0, q9		@ vpand		%xmm9,	%xmm0,	%xmm1
+	vshr.u8	q0, q0, #4		@ vpsrlb	$4,	%xmm0,	%xmm0
+	vtbl.8	d4, {q12}, d2	@ vpshufb	%xmm1,	%xmm2,	%xmm2
+	vtbl.8	d5, {q12}, d3
+	vld1.64	{q5}, [r10]		@ vmovdqa	.Lk_mc_forward+48(%rip), %xmm5
+					@ vmovdqa	.Lk_dipt+16(%rip), %xmm1 # ipthi
+	vtbl.8	d0, {q13}, d0	@ vpshufb	%xmm0,	%xmm1,	%xmm0
+	vtbl.8	d1, {q13}, d1
+	veor	q2, q2, q4		@ vpxor		%xmm4,	%xmm2,	%xmm2
+	veor	q0, q0, q2		@ vpxor		%xmm2,	%xmm0,	%xmm0
+
+	@ .Ldec_entry ends with a bne instruction which is normally paired with
+	@ subs in .Ldec_loop.
+	tst	r8, r8
+	b	.Ldec_entry
+
+.align	4
+.Ldec_loop:
+@
+@  Inverse mix columns
+@
+
+	@ We load .Lk_dsb* into q12-q15 on-demand. See the comment at the top of
+	@ the function.
+	adr	r10, .Lk_dsb9
+	vld1.64	{q12,q13}, [r10]!	@ vmovdqa	-0x20(%r10),%xmm4		# 4 : sb9u
+					@ vmovdqa	-0x10(%r10),%xmm1		# 0 : sb9t
+	@ Load sbd* ahead of time.
+	vld1.64	{q14,q15}, [r10]!	@ vmovdqa	0x00(%r10),%xmm4		# 4 : sbdu
+					@ vmovdqa	0x10(%r10),%xmm1		# 0 : sbdt
+	vtbl.8	d8, {q12}, d4	@ vpshufb	%xmm2,	%xmm4,	%xmm4		# 4 = sb9u
+	vtbl.8	d9, {q12}, d5
+	vtbl.8	d2, {q13}, d6	@ vpshufb	%xmm3,	%xmm1,	%xmm1		# 0 = sb9t
+	vtbl.8	d3, {q13}, d7
+	veor	q0, q4, q0		@ vpxor		%xmm4,	%xmm0,	%xmm0
+
+	veor	q0, q0, q1		@ vpxor		%xmm1,	%xmm0,	%xmm0		# 0 = ch
+
+	@ Load sbb* ahead of time.
+	vld1.64	{q12,q13}, [r10]!	@ vmovdqa	0x20(%r10),%xmm4		# 4 : sbbu
+					@ vmovdqa	0x30(%r10),%xmm1		# 0 : sbbt
+
+	vtbl.8	d8, {q14}, d4	@ vpshufb	%xmm2,	%xmm4,	%xmm4		# 4 = sbdu
+	vtbl.8	d9, {q14}, d5
+	@ Write to q1 instead of q0, so the table and destination registers do
+	@ not overlap.
+	vtbl.8	d2, {q0}, d10	@ vpshufb	%xmm5,	%xmm0,	%xmm0		# MC ch
+	vtbl.8	d3, {q0}, d11
+	@ Here we restore the original q0/q1 usage. This instruction is
+	@ reordered from the ARMv8 version so we do not clobber the vtbl.8
+	@ below.
+	veor	q0, q1, q4		@ vpxor		%xmm4,	%xmm0,	%xmm0		# 4 = ch
+	vtbl.8	d2, {q15}, d6	@ vpshufb	%xmm3,	%xmm1,	%xmm1		# 0 = sbdt
+	vtbl.8	d3, {q15}, d7
+					@ vmovdqa	0x20(%r10),	%xmm4		# 4 : sbbu
+	veor	q0, q0, q1		@ vpxor		%xmm1,	%xmm0,	%xmm0		# 0 = ch
+					@ vmovdqa	0x30(%r10),	%xmm1		# 0 : sbbt
+
+	@ Load sbe* ahead of time.
+	vld1.64	{q14,q15}, [r10]!	@ vmovdqa	0x40(%r10),%xmm4		# 4 : sbeu
+					@ vmovdqa	0x50(%r10),%xmm1		# 0 : sbet
+
+	vtbl.8	d8, {q12}, d4	@ vpshufb	%xmm2,	%xmm4,	%xmm4		# 4 = sbbu
+	vtbl.8	d9, {q12}, d5
+	@ Write to q1 instead of q0, so the table and destination registers do
+	@ not overlap.
+	vtbl.8	d2, {q0}, d10	@ vpshufb	%xmm5,	%xmm0,	%xmm0		# MC ch
+	vtbl.8	d3, {q0}, d11
+	@ Here we restore the original q0/q1 usage. This instruction is
+	@ reordered from the ARMv8 version so we do not clobber the vtbl.8
+	@ below.
+	veor	q0, q1, q4		@ vpxor		%xmm4,	%xmm0,	%xmm0		# 4 = ch
+	vtbl.8	d2, {q13}, d6	@ vpshufb	%xmm3,	%xmm1,	%xmm1		# 0 = sbbt
+	vtbl.8	d3, {q13}, d7
+	veor	q0, q0, q1		@ vpxor		%xmm1,	%xmm0,	%xmm0		# 0 = ch
+
+	vtbl.8	d8, {q14}, d4	@ vpshufb	%xmm2,	%xmm4,	%xmm4		# 4 = sbeu
+	vtbl.8	d9, {q14}, d5
+	@ Write to q1 instead of q0, so the table and destination registers do
+	@ not overlap.
+	vtbl.8	d2, {q0}, d10	@ vpshufb	%xmm5,	%xmm0,	%xmm0		# MC ch
+	vtbl.8	d3, {q0}, d11
+	@ Here we restore the original q0/q1 usage. This instruction is
+	@ reordered from the ARMv8 version so we do not clobber the vtbl.8
+	@ below.
+	veor	q0, q1, q4		@ vpxor		%xmm4,	%xmm0,	%xmm0		# 4 = ch
+	vtbl.8	d2, {q15}, d6	@ vpshufb	%xmm3,	%xmm1,	%xmm1		# 0 = sbet
+	vtbl.8	d3, {q15}, d7
+	vext.8	q5, q5, q5, #12		@ vpalignr 	$12,	%xmm5,	%xmm5,	%xmm5
+	veor	q0, q0, q1		@ vpxor		%xmm1,	%xmm0,	%xmm0		# 0 = ch
+	subs	r8, r8, #1		@ sub		$1,%rax			# nr--
+
+.Ldec_entry:
+	@ top of round
+	vand	q1, q0, q9		@ vpand		%xmm9,	%xmm0,	%xmm1	# 0 = k
+	vshr.u8	q0, q0, #4		@ vpsrlb	$4,	%xmm0,	%xmm0	# 1 = i
+	vtbl.8	d4, {q11}, d2	@ vpshufb	%xmm1,	%xmm11,	%xmm2	# 2 = a/k
+	vtbl.8	d5, {q11}, d3
+	veor	q1, q1, q0		@ vpxor		%xmm0,	%xmm1,	%xmm1	# 0 = j
+	vtbl.8	d6, {q10}, d0	@ vpshufb	%xmm0, 	%xmm10,	%xmm3	# 3 = 1/i
+	vtbl.8	d7, {q10}, d1
+	vtbl.8	d8, {q10}, d2	@ vpshufb	%xmm1,	%xmm10,	%xmm4	# 4 = 1/j
+	vtbl.8	d9, {q10}, d3
+	veor	q3, q3, q2		@ vpxor		%xmm2,	%xmm3,	%xmm3	# 3 = iak = 1/i + a/k
+	veor	q4, q4, q2		@ vpxor		%xmm2, 	%xmm4,	%xmm4	# 4 = jak = 1/j + a/k
+	vtbl.8	d4, {q10}, d6	@ vpshufb	%xmm3,	%xmm10,	%xmm2	# 2 = 1/iak
+	vtbl.8	d5, {q10}, d7
+	vtbl.8	d6, {q10}, d8	@ vpshufb	%xmm4,  %xmm10,	%xmm3	# 3 = 1/jak
+	vtbl.8	d7, {q10}, d9
+	veor	q2, q2, q1		@ vpxor		%xmm1,	%xmm2,	%xmm2	# 2 = io
+	veor	q3, q3, q0		@ vpxor		%xmm0,  %xmm3,	%xmm3	# 3 = jo
+	vld1.64	{q0}, [r9]!		@ vmovdqu	(%r9),	%xmm0
+	bne	.Ldec_loop
+
+	@ middle of last round
+
+	adr	r10, .Lk_dsbo
+
+	@ Write to q1 rather than q4 to avoid overlapping table and destination.
+	vld1.64	{q1}, [r10]!		@ vmovdqa	0x60(%r10),	%xmm4	# 3 : sbou
+	vtbl.8	d8, {q1}, d4	@ vpshufb	%xmm2,	%xmm4,	%xmm4	# 4 = sbou
+	vtbl.8	d9, {q1}, d5
+	@ Write to q2 rather than q1 to avoid overlapping table and destination.
+	vld1.64	{q2}, [r10]		@ vmovdqa	0x70(%r10),	%xmm1	# 0 : sbot
+	vtbl.8	d2, {q2}, d6	@ vpshufb	%xmm3,	%xmm1,	%xmm1	# 0 = sb1t
+	vtbl.8	d3, {q2}, d7
+	vld1.64	{q2}, [r11]		@ vmovdqa	-0x160(%r11),	%xmm2	# .Lk_sr-.Lk_dsbd=-0x160
+	veor	q4, q4, q0		@ vpxor		%xmm0,	%xmm4,	%xmm4	# 4 = sb1u + k
+	@ Write to q1 rather than q0 so the table and destination registers
+	@ below do not overlap.
+	veor	q1, q1, q4		@ vpxor		%xmm4,	%xmm1,	%xmm0	# 0 = A
+	vtbl.8	d0, {q1}, d4	@ vpshufb	%xmm2,	%xmm0,	%xmm0
+	vtbl.8	d1, {q1}, d5
+	bx	lr
+.size	_vpaes_decrypt_core,.-_vpaes_decrypt_core
+
+.globl	vpaes_decrypt
+.hidden	vpaes_decrypt
+.type	vpaes_decrypt,%function
+.align	4
+vpaes_decrypt:
+	@ _vpaes_decrypt_core uses r7-r11.
+	stmdb	sp!, {r7,r8,r9,r10,r11,lr}
+	@ _vpaes_decrypt_core uses q4-q5 (d8-d11), which are callee-saved.
+	vstmdb	sp!, {d8,d9,d10,d11}
+
+	vld1.64	{q0}, [r0]
+	bl	_vpaes_preheat
+	bl	_vpaes_decrypt_core
+	vst1.64	{q0}, [r1]
+
+	vldmia	sp!, {d8,d9,d10,d11}
+	ldmia	sp!, {r7,r8,r9,r10,r11, pc}	@ return
+.size	vpaes_decrypt,.-vpaes_decrypt
+@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
+@@                                                    @@
+@@                  AES key schedule                  @@
+@@                                                    @@
+@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
+
+@ This function diverges from both x86_64 and aarch64 in which constants are
+@ pinned. x86_64 has a common preheat function for all operations. aarch64
+@ separates them because it has enough registers to pin nearly all constants.
+@ armv7 does not have enough registers, but needing explicit loads and stores
+@ also complicates using x86_64's register allocation directly.
+@
+@ We pin some constants for convenience and leave q14 and q15 free to load
+@ others on demand.
+
+@
+@  Key schedule constants
+@
+.type	_vpaes_key_consts,%object
+.align	4
+_vpaes_key_consts:
+.Lk_dksd:@ decryption key schedule: invskew x*D
+.quad	0xFEB91A5DA3E44700, 0x0740E3A45A1DBEF9
+.quad	0x41C277F4B5368300, 0x5FDC69EAAB289D1E
+.Lk_dksb:@ decryption key schedule: invskew x*B
+.quad	0x9A4FCA1F8550D500, 0x03D653861CC94C99
+.quad	0x115BEDA7B6FC4A00, 0xD993256F7E3482C8
+.Lk_dkse:@ decryption key schedule: invskew x*E + 0x63
+.quad	0xD5031CCA1FC9D600, 0x53859A4C994F5086
+.quad	0xA23196054FDC7BE8, 0xCD5EF96A20B31487
+.Lk_dks9:@ decryption key schedule: invskew x*9
+.quad	0xB6116FC87ED9A700, 0x4AED933482255BFC
+.quad	0x4576516227143300, 0x8BB89FACE9DAFDCE
+
+.Lk_rcon:@ rcon
+.quad	0x1F8391B9AF9DEEB6, 0x702A98084D7C7D81
+
+.Lk_opt:@ output transform
+.quad	0xFF9F4929D6B66000, 0xF7974121DEBE6808
+.quad	0x01EDBD5150BCEC00, 0xE10D5DB1B05C0CE0
+.Lk_deskew:@ deskew tables: inverts the sbox's "skew"
+.quad	0x07E4A34047A4E300, 0x1DFEB95A5DBEF91A
+.quad	0x5F36B5DC83EA6900, 0x2841C2ABF49D1E77
+.size	_vpaes_key_consts,.-_vpaes_key_consts
+
+.type	_vpaes_key_preheat,%function
+.align	4
+_vpaes_key_preheat:
+	adr	r11, .Lk_rcon
+	vmov.i8	q12, #0x5b			@ .Lk_s63
+	adr	r10, .Lk_inv			@ Must be aligned to 8 mod 16.
+	vmov.i8	q9, #0x0f			@ .Lk_s0F
+	vld1.64	{q10,q11}, [r10]		@ .Lk_inv
+	vld1.64	{q8}, [r11]			@ .Lk_rcon
+	bx	lr
+.size	_vpaes_key_preheat,.-_vpaes_key_preheat
+
+.type	_vpaes_schedule_core,%function
+.align	4
+_vpaes_schedule_core:
+	@ We only need to save lr, but ARM requires an 8-byte stack alignment,
+	@ so save an extra register.
+	stmdb	sp!, {r3,lr}
+
+	bl	_vpaes_key_preheat	@ load the tables
+
+	adr	r11, .Lk_ipt		@ Must be aligned to 8 mod 16.
+	vld1.64	{q0}, [r0]!		@ vmovdqu	(%rdi),	%xmm0		# load key (unaligned)
+
+	@ input transform
+	@ Use q4 here rather than q3 so .Lschedule_am_decrypting does not
+	@ overlap table and destination.
+	vmov	q4, q0			@ vmovdqa	%xmm0,	%xmm3
+	bl	_vpaes_schedule_transform
+	adr	r10, .Lk_sr		@ Must be aligned to 8 mod 16.
+	vmov	q7, q0			@ vmovdqa	%xmm0,	%xmm7
+
+	add	r8, r8, r10
+	tst	r3, r3
+	bne	.Lschedule_am_decrypting
+
+	@ encrypting, output zeroth round key after transform
+	vst1.64	{q0}, [r2]		@ vmovdqu	%xmm0,	(%rdx)
+	b	.Lschedule_go
+
+.Lschedule_am_decrypting:
+	@ decrypting, output zeroth round key after shiftrows
+	vld1.64	{q1}, [r8]		@ vmovdqa	(%r8,%r10),	%xmm1
+	vtbl.8	d6, {q4}, d2	@ vpshufb  	%xmm1,	%xmm3,	%xmm3
+	vtbl.8	d7, {q4}, d3
+	vst1.64	{q3}, [r2]		@ vmovdqu	%xmm3,	(%rdx)
+	eor	r8, r8, #0x30		@ xor	$0x30, %r8
+
+.Lschedule_go:
+	cmp	r1, #192		@ cmp	$192,	%esi
+	bhi	.Lschedule_256
+	beq	.Lschedule_192
+	@ 128: fall through
+
+@@
+@@  .schedule_128
+@@
+@@  128-bit specific part of key schedule.
+@@
+@@  This schedule is really simple, because all its parts
+@@  are accomplished by the subroutines.
+@@
+.Lschedule_128:
+	mov	r0, #10		@ mov	$10, %esi
+
+.Loop_schedule_128:
+	bl	_vpaes_schedule_round
+	subs	r0, r0, #1		@ dec	%esi
+	beq	.Lschedule_mangle_last
+	bl	_vpaes_schedule_mangle	@ write output
+	b	.Loop_schedule_128
+
+@@
+@@  .aes_schedule_192
+@@
+@@  192-bit specific part of key schedule.
+@@
+@@  The main body of this schedule is the same as the 128-bit
+@@  schedule, but with more smearing.  The long, high side is
+@@  stored in q7 as before, and the short, low side is in
+@@  the high bits of q6.
+@@
+@@  This schedule is somewhat nastier, however, because each
+@@  round produces 192 bits of key material, or 1.5 round keys.
+@@  Therefore, on each cycle we do 2 rounds and produce 3 round
+@@  keys.
+@@
+.align	4
+.Lschedule_192:
+	sub	r0, r0, #8
+	vld1.64	{q0}, [r0]			@ vmovdqu	8(%rdi),%xmm0		# load key part 2 (very unaligned)
+	bl	_vpaes_schedule_transform	@ input transform
+	vmov	q6, q0				@ vmovdqa	%xmm0,	%xmm6		# save short part
+	vmov.i8	d12, #0			@ vpxor	%xmm4,	%xmm4, %xmm4	# clear 4
+						@ vmovhlps	%xmm4,	%xmm6,	%xmm6		# clobber low side with zeros
+	mov	r0, #4			@ mov	$4,	%esi
+
+.Loop_schedule_192:
+	bl	_vpaes_schedule_round
+	vext.8	q0, q6, q0, #8			@ vpalignr	$8,%xmm6,%xmm0,%xmm0
+	bl	_vpaes_schedule_mangle		@ save key n
+	bl	_vpaes_schedule_192_smear
+	bl	_vpaes_schedule_mangle		@ save key n+1
+	bl	_vpaes_schedule_round
+	subs	r0, r0, #1			@ dec	%esi
+	beq	.Lschedule_mangle_last
+	bl	_vpaes_schedule_mangle		@ save key n+2
+	bl	_vpaes_schedule_192_smear
+	b	.Loop_schedule_192
+
+@@
+@@  .aes_schedule_256
+@@
+@@  256-bit specific part of key schedule.
+@@
+@@  The structure here is very similar to the 128-bit
+@@  schedule, but with an additional "low side" in
+@@  q6.  The low side's rounds are the same as the
+@@  high side's, except no rcon and no rotation.
+@@
+.align	4
+.Lschedule_256:
+	vld1.64	{q0}, [r0]			@ vmovdqu	16(%rdi),%xmm0		# load key part 2 (unaligned)
+	bl	_vpaes_schedule_transform	@ input transform
+	mov	r0, #7			@ mov	$7, %esi
+
+.Loop_schedule_256:
+	bl	_vpaes_schedule_mangle		@ output low result
+	vmov	q6, q0				@ vmovdqa	%xmm0,	%xmm6		# save cur_lo in xmm6
+
+	@ high round
+	bl	_vpaes_schedule_round
+	subs	r0, r0, #1			@ dec	%esi
+	beq	.Lschedule_mangle_last
+	bl	_vpaes_schedule_mangle
+
+	@ low round. swap xmm7 and xmm6
+	vdup.32	q0, d1[1]		@ vpshufd	$0xFF,	%xmm0,	%xmm0
+	vmov.i8	q4, #0
+	vmov	q5, q7			@ vmovdqa	%xmm7,	%xmm5
+	vmov	q7, q6			@ vmovdqa	%xmm6,	%xmm7
+	bl	_vpaes_schedule_low_round
+	vmov	q7, q5			@ vmovdqa	%xmm5,	%xmm7
+
+	b	.Loop_schedule_256
+
+@@
+@@  .aes_schedule_mangle_last
+@@
+@@  Mangler for last round of key schedule
+@@  Mangles q0
+@@    when encrypting, outputs out(q0) ^ 63
+@@    when decrypting, outputs unskew(q0)
+@@
+@@  Always called right before return... jumps to cleanup and exits
+@@
+.align	4
+.Lschedule_mangle_last:
+	@ schedule last round key from xmm0
+	adr	r11, .Lk_deskew			@ lea	.Lk_deskew(%rip),%r11	# prepare to deskew
+	tst	r3, r3
+	bne	.Lschedule_mangle_last_dec
+
+	@ encrypting
+	vld1.64	{q1}, [r8]		@ vmovdqa	(%r8,%r10),%xmm1
+	adr	r11, .Lk_opt		@ lea		.Lk_opt(%rip),	%r11		# prepare to output transform
+	add	r2, r2, #32		@ add		$32,	%rdx
+	vmov	q2, q0
+	vtbl.8	d0, {q2}, d2	@ vpshufb	%xmm1,	%xmm0,	%xmm0		# output permute
+	vtbl.8	d1, {q2}, d3
+
+.Lschedule_mangle_last_dec:
+	sub	r2, r2, #16			@ add	$-16,	%rdx
+	veor	q0, q0, q12			@ vpxor	.Lk_s63(%rip),	%xmm0,	%xmm0
+	bl	_vpaes_schedule_transform	@ output transform
+	vst1.64	{q0}, [r2]			@ vmovdqu	%xmm0,	(%rdx)		# save last key
+
+	@ cleanup
+	veor	q0, q0, q0		@ vpxor	%xmm0,	%xmm0,	%xmm0
+	veor	q1, q1, q1		@ vpxor	%xmm1,	%xmm1,	%xmm1
+	veor	q2, q2, q2		@ vpxor	%xmm2,	%xmm2,	%xmm2
+	veor	q3, q3, q3		@ vpxor	%xmm3,	%xmm3,	%xmm3
+	veor	q4, q4, q4		@ vpxor	%xmm4,	%xmm4,	%xmm4
+	veor	q5, q5, q5		@ vpxor	%xmm5,	%xmm5,	%xmm5
+	veor	q6, q6, q6		@ vpxor	%xmm6,	%xmm6,	%xmm6
+	veor	q7, q7, q7		@ vpxor	%xmm7,	%xmm7,	%xmm7
+	ldmia	sp!, {r3,pc}		@ return
+.size	_vpaes_schedule_core,.-_vpaes_schedule_core
+
+@@
+@@  .aes_schedule_192_smear
+@@
+@@  Smear the short, low side in the 192-bit key schedule.
+@@
+@@  Inputs:
+@@    q7: high side, b  a  x  y
+@@    q6:  low side, d  c  0  0
+@@
+@@  Outputs:
+@@    q6: b+c+d  b+c  0  0
+@@    q0: b+c+d  b+c  b  a
+@@
+.type	_vpaes_schedule_192_smear,%function
+.align	4
+_vpaes_schedule_192_smear:
+	vmov.i8	q1, #0
+	vdup.32	q0, d15[1]
+	vshl.i64	q1, q6, #32		@ vpshufd	$0x80,	%xmm6,	%xmm1	# d c 0 0 -> c 0 0 0
+	vmov	d0, d15		@ vpshufd	$0xFE,	%xmm7,	%xmm0	# b a _ _ -> b b b a
+	veor	q6, q6, q1		@ vpxor	%xmm1,	%xmm6,	%xmm6	# -> c+d c 0 0
+	veor	q1, q1, q1		@ vpxor	%xmm1,	%xmm1,	%xmm1
+	veor	q6, q6, q0		@ vpxor	%xmm0,	%xmm6,	%xmm6	# -> b+c+d b+c b a
+	vmov	q0, q6			@ vmovdqa	%xmm6,	%xmm0
+	vmov	d12, d2		@ vmovhlps	%xmm1,	%xmm6,	%xmm6	# clobber low side with zeros
+	bx	lr
+.size	_vpaes_schedule_192_smear,.-_vpaes_schedule_192_smear
+
+@@
+@@  .aes_schedule_round
+@@
+@@  Runs one main round of the key schedule on q0, q7
+@@
+@@  Specifically, runs subbytes on the high dword of q0
+@@  then rotates it by one byte and xors into the low dword of
+@@  q7.
+@@
+@@  Adds rcon from low byte of q8, then rotates q8 for
+@@  next rcon.
+@@
+@@  Smears the dwords of q7 by xoring the low into the
+@@  second low, result into third, result into highest.
+@@
+@@  Returns results in q7 = q0.
+@@  Clobbers q1-q4, r11.
+@@
+.type	_vpaes_schedule_round,%function
+.align	4
+_vpaes_schedule_round:
+	@ extract rcon from xmm8
+	vmov.i8	q4, #0				@ vpxor		%xmm4,	%xmm4,	%xmm4
+	vext.8	q1, q8, q4, #15		@ vpalignr	$15,	%xmm8,	%xmm4,	%xmm1
+	vext.8	q8, q8, q8, #15	@ vpalignr	$15,	%xmm8,	%xmm8,	%xmm8
+	veor	q7, q7, q1			@ vpxor		%xmm1,	%xmm7,	%xmm7
+
+	@ rotate
+	vdup.32	q0, d1[1]			@ vpshufd	$0xFF,	%xmm0,	%xmm0
+	vext.8	q0, q0, q0, #1			@ vpalignr	$1,	%xmm0,	%xmm0,	%xmm0
+
+	@ fall through...
+
+	@ low round: same as high round, but no rotation and no rcon.
+_vpaes_schedule_low_round:
+	@ The x86_64 version pins .Lk_sb1 in %xmm13 and .Lk_sb1+16 in %xmm12.
+	@ We pin other values in _vpaes_key_preheat, so load them now.
+	adr	r11, .Lk_sb1
+	vld1.64	{q14,q15}, [r11]
+
+	@ smear xmm7
+	vext.8	q1, q4, q7, #12			@ vpslldq	$4,	%xmm7,	%xmm1
+	veor	q7, q7, q1			@ vpxor	%xmm1,	%xmm7,	%xmm7
+	vext.8	q4, q4, q7, #8			@ vpslldq	$8,	%xmm7,	%xmm4
+
+	@ subbytes
+	vand	q1, q0, q9			@ vpand		%xmm9,	%xmm0,	%xmm1		# 0 = k
+	vshr.u8	q0, q0, #4			@ vpsrlb	$4,	%xmm0,	%xmm0		# 1 = i
+	veor	q7, q7, q4			@ vpxor		%xmm4,	%xmm7,	%xmm7
+	vtbl.8	d4, {q11}, d2		@ vpshufb	%xmm1,	%xmm11,	%xmm2		# 2 = a/k
+	vtbl.8	d5, {q11}, d3
+	veor	q1, q1, q0			@ vpxor		%xmm0,	%xmm1,	%xmm1		# 0 = j
+	vtbl.8	d6, {q10}, d0		@ vpshufb	%xmm0, 	%xmm10,	%xmm3		# 3 = 1/i
+	vtbl.8	d7, {q10}, d1
+	veor	q3, q3, q2			@ vpxor		%xmm2,	%xmm3,	%xmm3		# 3 = iak = 1/i + a/k
+	vtbl.8	d8, {q10}, d2		@ vpshufb	%xmm1,	%xmm10,	%xmm4		# 4 = 1/j
+	vtbl.8	d9, {q10}, d3
+	veor	q7, q7, q12			@ vpxor		.Lk_s63(%rip),	%xmm7,	%xmm7
+	vtbl.8	d6, {q10}, d6		@ vpshufb	%xmm3,	%xmm10,	%xmm3		# 2 = 1/iak
+	vtbl.8	d7, {q10}, d7
+	veor	q4, q4, q2			@ vpxor		%xmm2,	%xmm4,	%xmm4		# 4 = jak = 1/j + a/k
+	vtbl.8	d4, {q10}, d8		@ vpshufb	%xmm4,	%xmm10,	%xmm2		# 3 = 1/jak
+	vtbl.8	d5, {q10}, d9
+	veor	q3, q3, q1			@ vpxor		%xmm1,	%xmm3,	%xmm3		# 2 = io
+	veor	q2, q2, q0			@ vpxor		%xmm0,	%xmm2,	%xmm2		# 3 = jo
+	vtbl.8	d8, {q15}, d6		@ vpshufb	%xmm3,	%xmm13,	%xmm4		# 4 = sbou
+	vtbl.8	d9, {q15}, d7
+	vtbl.8	d2, {q14}, d4		@ vpshufb	%xmm2,	%xmm12,	%xmm1		# 0 = sb1t
+	vtbl.8	d3, {q14}, d5
+	veor	q1, q1, q4			@ vpxor		%xmm4,	%xmm1,	%xmm1		# 0 = sbox output
+
+	@ add in smeared stuff
+	veor	q0, q1, q7			@ vpxor	%xmm7,	%xmm1,	%xmm0
+	veor	q7, q1, q7			@ vmovdqa	%xmm0,	%xmm7
+	bx	lr
+.size	_vpaes_schedule_round,.-_vpaes_schedule_round
+
+@@
+@@  .aes_schedule_transform
+@@
+@@  Linear-transform q0 according to tables at [r11]
+@@
+@@  Requires that q9 = 0x0F0F... as in preheat
+@@  Output in q0
+@@  Clobbers q1, q2, q14, q15
+@@
+.type	_vpaes_schedule_transform,%function
+.align	4
+_vpaes_schedule_transform:
+	vld1.64	{q14,q15}, [r11]	@ vmovdqa	(%r11),	%xmm2 	# lo
+					@ vmovdqa	16(%r11),	%xmm1 # hi
+	vand	q1, q0, q9		@ vpand	%xmm9,	%xmm0,	%xmm1
+	vshr.u8	q0, q0, #4		@ vpsrlb	$4,	%xmm0,	%xmm0
+	vtbl.8	d4, {q14}, d2	@ vpshufb	%xmm1,	%xmm2,	%xmm2
+	vtbl.8	d5, {q14}, d3
+	vtbl.8	d0, {q15}, d0	@ vpshufb	%xmm0,	%xmm1,	%xmm0
+	vtbl.8	d1, {q15}, d1
+	veor	q0, q0, q2		@ vpxor	%xmm2,	%xmm0,	%xmm0
+	bx	lr
+.size	_vpaes_schedule_transform,.-_vpaes_schedule_transform
+
+@@
+@@  .aes_schedule_mangle
+@@
+@@  Mangles q0 from (basis-transformed) standard version
+@@  to our version.
+@@
+@@  On encrypt,
+@@    xor with 0x63
+@@    multiply by circulant 0,1,1,1
+@@    apply shiftrows transform
+@@
+@@  On decrypt,
+@@    xor with 0x63
+@@    multiply by "inverse mixcolumns" circulant E,B,D,9
+@@    deskew
+@@    apply shiftrows transform
+@@
+@@
+@@  Writes out to [r2], and increments or decrements it
+@@  Keeps track of round number mod 4 in r8
+@@  Preserves q0
+@@  Clobbers q1-q5
+@@
+.type	_vpaes_schedule_mangle,%function
+.align	4
+_vpaes_schedule_mangle:
+	tst	r3, r3
+	vmov	q4, q0			@ vmovdqa	%xmm0,	%xmm4	# save xmm0 for later
+	adr	r11, .Lk_mc_forward	@ Must be aligned to 8 mod 16.
+	vld1.64	{q5}, [r11]		@ vmovdqa	.Lk_mc_forward(%rip),%xmm5
+	bne	.Lschedule_mangle_dec
+
+	@ encrypting
+	@ Write to q2 so we do not overlap table and destination below.
+	veor	q2, q0, q12		@ vpxor		.Lk_s63(%rip),	%xmm0,	%xmm4
+	add	r2, r2, #16		@ add		$16,	%rdx
+	vtbl.8	d8, {q2}, d10	@ vpshufb	%xmm5,	%xmm4,	%xmm4
+	vtbl.8	d9, {q2}, d11
+	vtbl.8	d2, {q4}, d10	@ vpshufb	%xmm5,	%xmm4,	%xmm1
+	vtbl.8	d3, {q4}, d11
+	vtbl.8	d6, {q1}, d10	@ vpshufb	%xmm5,	%xmm1,	%xmm3
+	vtbl.8	d7, {q1}, d11
+	veor	q4, q4, q1		@ vpxor		%xmm1,	%xmm4,	%xmm4
+	vld1.64	{q1}, [r8]		@ vmovdqa	(%r8,%r10),	%xmm1
+	veor	q3, q3, q4		@ vpxor		%xmm4,	%xmm3,	%xmm3
+
+	b	.Lschedule_mangle_both
+.align	4
+.Lschedule_mangle_dec:
+	@ inverse mix columns
+	adr	r11, .Lk_dksd 		@ lea		.Lk_dksd(%rip),%r11
+	vshr.u8	q1, q4, #4		@ vpsrlb	$4,	%xmm4,	%xmm1	# 1 = hi
+	vand	q4, q4, q9		@ vpand		%xmm9,	%xmm4,	%xmm4	# 4 = lo
+
+	vld1.64	{q14,q15}, [r11]! 	@ vmovdqa	0x00(%r11),	%xmm2
+					@ vmovdqa	0x10(%r11),	%xmm3
+	vtbl.8	d4, {q14}, d8	@ vpshufb	%xmm4,	%xmm2,	%xmm2
+	vtbl.8	d5, {q14}, d9
+	vtbl.8	d6, {q15}, d2	@ vpshufb	%xmm1,	%xmm3,	%xmm3
+	vtbl.8	d7, {q15}, d3
+	@ Load .Lk_dksb ahead of time.
+	vld1.64	{q14,q15}, [r11]! 	@ vmovdqa	0x20(%r11),	%xmm2
+					@ vmovdqa	0x30(%r11),	%xmm3
+	@ Write to q13 so we do not overlap table and destination.
+	veor	q13, q3, q2		@ vpxor		%xmm2,	%xmm3,	%xmm3
+	vtbl.8	d6, {q13}, d10	@ vpshufb	%xmm5,	%xmm3,	%xmm3
+	vtbl.8	d7, {q13}, d11
+
+	vtbl.8	d4, {q14}, d8	@ vpshufb	%xmm4,	%xmm2,	%xmm2
+	vtbl.8	d5, {q14}, d9
+	veor	q2, q2, q3		@ vpxor		%xmm3,	%xmm2,	%xmm2
+	vtbl.8	d6, {q15}, d2	@ vpshufb	%xmm1,	%xmm3,	%xmm3
+	vtbl.8	d7, {q15}, d3
+	@ Load .Lk_dkse ahead of time.
+	vld1.64	{q14,q15}, [r11]! 	@ vmovdqa	0x40(%r11),	%xmm2
+					@ vmovdqa	0x50(%r11),	%xmm3
+	@ Write to q13 so we do not overlap table and destination.
+	veor	q13, q3, q2		@ vpxor		%xmm2,	%xmm3,	%xmm3
+	vtbl.8	d6, {q13}, d10	@ vpshufb	%xmm5,	%xmm3,	%xmm3
+	vtbl.8	d7, {q13}, d11
+
+	vtbl.8	d4, {q14}, d8	@ vpshufb	%xmm4,	%xmm2,	%xmm2
+	vtbl.8	d5, {q14}, d9
+	veor	q2, q2, q3		@ vpxor		%xmm3,	%xmm2,	%xmm2
+	vtbl.8	d6, {q15}, d2	@ vpshufb	%xmm1,	%xmm3,	%xmm3
+	vtbl.8	d7, {q15}, d3
+	@ Load .Lk_dks9 ahead of time.
+	vld1.64	{q14,q15}, [r11]! 	@ vmovdqa	0x60(%r11),	%xmm2
+					@ vmovdqa	0x70(%r11),	%xmm4
+	@ Write to q13 so we do not overlap table and destination.
+	veor	q13, q3, q2		@ vpxor		%xmm2,	%xmm3,	%xmm3
+
+	vtbl.8	d4, {q14}, d8	@ vpshufb	%xmm4,	%xmm2,	%xmm2
+	vtbl.8	d5, {q14}, d9
+	vtbl.8	d6, {q13}, d10	@ vpshufb	%xmm5,	%xmm3,	%xmm3
+	vtbl.8	d7, {q13}, d11
+	vtbl.8	d8, {q15}, d2	@ vpshufb	%xmm1,	%xmm4,	%xmm4
+	vtbl.8	d9, {q15}, d3
+	vld1.64	{q1}, [r8]		@ vmovdqa	(%r8,%r10),	%xmm1
+	veor	q2, q2, q3		@ vpxor	%xmm3,	%xmm2,	%xmm2
+	veor	q3, q4, q2		@ vpxor	%xmm2,	%xmm4,	%xmm3
+
+	sub	r2, r2, #16		@ add	$-16,	%rdx
+
+.Lschedule_mangle_both:
+	@ Write to q2 so table and destination do not overlap.
+	vtbl.8	d4, {q3}, d2	@ vpshufb	%xmm1,	%xmm3,	%xmm3
+	vtbl.8	d5, {q3}, d3
+	add	r8, r8, #64-16		@ add	$-16,	%r8
+	and	r8, r8, #~(1<<6)	@ and	$0x30,	%r8
+	vst1.64	{q2}, [r2]		@ vmovdqu	%xmm3,	(%rdx)
+	bx	lr
+.size	_vpaes_schedule_mangle,.-_vpaes_schedule_mangle
+
+.globl	vpaes_set_encrypt_key
+.hidden	vpaes_set_encrypt_key
+.type	vpaes_set_encrypt_key,%function
+.align	4
+vpaes_set_encrypt_key:
+	stmdb	sp!, {r7,r8,r9,r10,r11, lr}
+	vstmdb	sp!, {d8,d9,d10,d11,d12,d13,d14,d15}
+
+	lsr	r9, r1, #5		@ shr	$5,%eax
+	add	r9, r9, #5		@ $5,%eax
+	str	r9, [r2,#240]		@ mov	%eax,240(%rdx)	# AES_KEY->rounds = nbits/32+5;
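+	@ A quick sketch of the round-count convention: vpaes stores one fewer
+	@ round than bsaes (which uses the standard 10/12/14), and the
+	@ *_key_to_bsaes converters below add the one back:
+	@
+	@   for nbits in (128, 192, 256):
+	@       vpaes_rounds = nbits // 32 + 5   # 9, 11, 13
+	@       bsaes_rounds = vpaes_rounds + 1  # 10, 12, 14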
+
+	mov	r3, #0		@ mov	$0,%ecx
+	mov	r8, #0x30		@ mov	$0x30,%r8d
+	bl	_vpaes_schedule_core
+	eor	r0, r0, r0
+
+	vldmia	sp!, {d8,d9,d10,d11,d12,d13,d14,d15}
+	ldmia	sp!, {r7,r8,r9,r10,r11, pc}	@ return
+.size	vpaes_set_encrypt_key,.-vpaes_set_encrypt_key
+
+.globl	vpaes_set_decrypt_key
+.hidden	vpaes_set_decrypt_key
+.type	vpaes_set_decrypt_key,%function
+.align	4
+vpaes_set_decrypt_key:
+	stmdb	sp!, {r7,r8,r9,r10,r11, lr}
+	vstmdb	sp!, {d8,d9,d10,d11,d12,d13,d14,d15}
+
+	lsr	r9, r1, #5		@ shr	$5,%eax
+	add	r9, r9, #5		@ $5,%eax
+	str	r9, [r2,#240]		@ mov	%eax,240(%rdx)	# AES_KEY->rounds = nbits/32+5;
+	lsl	r9, r9, #4		@ shl	$4,%eax
+	add	r2, r2, #16		@ lea	16(%rdx,%rax),%rdx
+	add	r2, r2, r9
+
+	mov	r3, #1		@ mov	$1,%ecx
+	lsr	r8, r1, #1		@ shr	$1,%r8d
+	and	r8, r8, #32		@ and	$32,%r8d
+	eor	r8, r8, #32		@ xor	$32,%r8d	# nbits==192?0:32
+	bl	_vpaes_schedule_core
+
+	vldmia	sp!, {d8,d9,d10,d11,d12,d13,d14,d15}
+	ldmia	sp!, {r7,r8,r9,r10,r11, pc}	@ return
+.size	vpaes_set_decrypt_key,.-vpaes_set_decrypt_key
+
+@ Additional constants for converting to bsaes.
+.type	_vpaes_convert_consts,%object
+.align	4
+_vpaes_convert_consts:
+@ .Lk_opt_then_skew applies skew(opt(x)) XOR 0x63, where skew is the linear
+@ transform in the AES S-box. 0x63 is incorporated into the low half of the
+@ table. This was computed with the following script:
+@
+@   def u64s_to_u128(x, y):
+@       return x | (y << 64)
+@   def u128_to_u64s(w):
+@       return w & ((1<<64)-1), w >> 64
+@   def get_byte(w, i):
+@       return (w >> (i*8)) & 0xff
+@   def apply_table(table, b):
+@       lo = b & 0xf
+@       hi = b >> 4
+@       return get_byte(table[0], lo) ^ get_byte(table[1], hi)
+@   def opt(b):
+@       table = [
+@           u64s_to_u128(0xFF9F4929D6B66000, 0xF7974121DEBE6808),
+@           u64s_to_u128(0x01EDBD5150BCEC00, 0xE10D5DB1B05C0CE0),
+@       ]
+@       return apply_table(table, b)
+@   def rot_byte(b, n):
+@       return 0xff & ((b << n) | (b >> (8-n)))
+@   def skew(x):
+@       return (x ^ rot_byte(x, 1) ^ rot_byte(x, 2) ^ rot_byte(x, 3) ^
+@               rot_byte(x, 4))
+@   table = [0, 0]
+@   for i in range(16):
+@       table[0] |= (skew(opt(i)) ^ 0x63) << (i*8)
+@       table[1] |= skew(opt(i<<4)) << (i*8)
+@   print("	.quad	0x%016x, 0x%016x" % u128_to_u64s(table[0]))
+@   print("	.quad	0x%016x, 0x%016x" % u128_to_u64s(table[1]))
+.Lk_opt_then_skew:
+.quad	0x9cb8436798bc4763, 0x6440bb9f6044bf9b
+.quad	0x1f30062936192f00, 0xb49bad829db284ab
+
+@ .Lk_decrypt_transform is a permutation which performs an 8-bit left-rotation
+@ followed by a byte-swap on each 32-bit word of a vector. E.g., 0x11223344
+@ becomes 0x22334411 and then 0x11443322.
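+@ A table of this shape can be reproduced with a short script in the same
+@ style as the one above (an illustrative sketch; the generated .quad values
+@ below are authoritative):
+@
+@   def rol32(v, n):
+@       return ((v << n) | (v >> (32 - n))) & 0xffffffff
+@   def bswap32(v):
+@       return int.from_bytes(v.to_bytes(4, "little"), "big")
+@   # Transform a marker word whose little-endian bytes are 0,1,2,3 and read
+@   # off where each source byte lands; those are the per-word VTBL indices.
+@   perm = bswap32(rol32(0x03020100, 8)).to_bytes(4, "little")   # 2, 1, 0, 3
+@   table = bytes(4 * w + p for w in range(4) for p in perm)
+@   print("	.quad	0x%016x, 0x%016x" % (
+@       int.from_bytes(table[:8], "little"), int.from_bytes(table[8:], "little")))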
+.Lk_decrypt_transform:
+.quad	0x0704050603000102, 0x0f0c0d0e0b08090a
+.size	_vpaes_convert_consts,.-_vpaes_convert_consts
+
+@ void vpaes_encrypt_key_to_bsaes(AES_KEY *bsaes, const AES_KEY *vpaes);
+.globl	vpaes_encrypt_key_to_bsaes
+.hidden	vpaes_encrypt_key_to_bsaes
+.type	vpaes_encrypt_key_to_bsaes,%function
+.align	4
+vpaes_encrypt_key_to_bsaes:
+	stmdb	sp!, {r11, lr}
+
+	@ See _vpaes_schedule_core for the key schedule logic. In particular,
+	@ _vpaes_schedule_transform(.Lk_ipt) (section 2.2 of the paper),
+	@ _vpaes_schedule_mangle (section 4.3), and .Lschedule_mangle_last
+	@ contain the transformations not in the bsaes representation. This
+	@ function inverts those transforms.
+	@
+	@ Note also that bsaes-armv7.pl expects aes-armv4.pl's key
+	@ representation, which does not match the other aes_nohw_*
+	@ implementations. The ARM aes_nohw_* stores each 32-bit word
+	@ byteswapped, as a convenience for (unsupported) big-endian ARM, at the
+	@ cost of extra REV and VREV32 operations in little-endian ARM.
+
+	vmov.i8	q9, #0x0f		@ Required by _vpaes_schedule_transform
+	adr	r2, .Lk_mc_forward	@ Must be aligned to 8 mod 16.
+	add	r3, r2, 0x90		@ .Lk_sr+0x10-.Lk_mc_forward = 0x90 (Apple's toolchain doesn't support the expression)
+
+	vld1.64	{q12}, [r2]
+	vmov.i8	q10, #0x5b		@ .Lk_s63 from vpaes-x86_64
+	adr	r11, .Lk_opt		@ Must be aligned to 8 mod 16.
+	vmov.i8	q11, #0x63		@ .Lk_s63 without .Lk_ipt applied
+
+	@ vpaes stores one fewer round count than bsaes, but the number of keys
+	@ is the same.
+	ldr	r2, [r1,#240]
+	add	r2, r2, #1
+	str	r2, [r0,#240]
+
+	@ The first key is transformed with _vpaes_schedule_transform(.Lk_ipt).
+	@ Invert this with .Lk_opt.
+	vld1.64	{q0}, [r1]!
+	bl	_vpaes_schedule_transform
+	vrev32.8	q0, q0
+	vst1.64	{q0}, [r0]!
+
+	@ The middle keys have _vpaes_schedule_transform(.Lk_ipt) applied,
+	@ followed by _vpaes_schedule_mangle. _vpaes_schedule_mangle XORs 0x63,
+	@ multiplies by the circulant 0,1,1,1, then applies ShiftRows.
+.Loop_enc_key_to_bsaes:
+	vld1.64	{q0}, [r1]!
+
+	@ Invert the ShiftRows step (see .Lschedule_mangle_both). Note we cycle
+	@ r3 in the opposite direction and start at .Lk_sr+0x10 instead of 0x30.
+	@ We use r3 rather than r8 to avoid a callee-saved register.
+	vld1.64	{q1}, [r3]
+	vtbl.8	d4, {q0}, d2
+	vtbl.8	d5, {q0}, d3
+	add	r3, r3, #16
+	and	r3, r3, #~(1<<6)
+	vmov	q0, q2
+
+	@ Handle the last key differently.
+	subs	r2, r2, #1
+	beq	.Loop_enc_key_to_bsaes_last
+
+	@ Multiply by the circulant. This is its own inverse.
+	vtbl.8	d2, {q0}, d24
+	vtbl.8	d3, {q0}, d25
+	vmov	q0, q1
+	vtbl.8	d4, {q1}, d24
+	vtbl.8	d5, {q1}, d25
+	veor	q0, q0, q2
+	vtbl.8	d2, {q2}, d24
+	vtbl.8	d3, {q2}, d25
+	veor	q0, q0, q1
+
+	@ XOR and finish.
+	veor	q0, q0, q10
+	bl	_vpaes_schedule_transform
+	vrev32.8	q0, q0
+	vst1.64	{q0}, [r0]!
+	b	.Loop_enc_key_to_bsaes
+
+.Loop_enc_key_to_bsaes_last:
+	@ The final key does not have a basis transform (note
+	@ .Lschedule_mangle_last inverts the original transform). It only XORs
+	@ 0x63 and applies ShiftRows. The latter was already inverted in the
+	@ loop. Note that, because we act on the original representation, we use
+	@ q11, not q10.
+	veor	q0, q0, q11
+	vrev32.8	q0, q0
+	vst1.64	{q0}, [r0]
+
+	@ Wipe registers which contained key material.
+	veor	q0, q0, q0
+	veor	q1, q1, q1
+	veor	q2, q2, q2
+
+	ldmia	sp!, {r11, pc}	@ return
+.size	vpaes_encrypt_key_to_bsaes,.-vpaes_encrypt_key_to_bsaes
+
+@ void vpaes_decrypt_key_to_bsaes(AES_KEY *bsaes, const AES_KEY *vpaes);
+.globl	vpaes_decrypt_key_to_bsaes
+.hidden	vpaes_decrypt_key_to_bsaes
+.type	vpaes_decrypt_key_to_bsaes,%function
+.align	4
+vpaes_decrypt_key_to_bsaes:
+	stmdb	sp!, {r11, lr}
+
+	@ See _vpaes_schedule_core for the key schedule logic. Note vpaes
+	@ computes the decryption key schedule in reverse. Additionally,
+	@ aes-x86_64.pl shares some transformations, so we must only partially
+	@ invert vpaes's transformations. In general, vpaes computes in a
+	@ different basis (.Lk_ipt and .Lk_opt) and applies the inverses of
+	@ MixColumns, ShiftRows, and the affine part of the AES S-box (which is
+	@ split into a linear skew and XOR of 0x63). We undo all but MixColumns.
+	@
+	@ Note also that bsaes-armv7.pl expects aes-armv4.pl's key
+	@ representation, which does not match the other aes_nohw_*
+	@ implementations. The ARM aes_nohw_* stores each 32-bit word
+	@ byteswapped, as a convenience for (unsupported) big-endian ARM, at the
+	@ cost of extra REV and VREV32 operations in little-endian ARM.
+
+	adr	r2, .Lk_decrypt_transform
+	adr	r3, .Lk_sr+0x30
+	adr	r11, .Lk_opt_then_skew	@ Input to _vpaes_schedule_transform.
+	vld1.64	{q12}, [r2]	@ Reuse q12 from encryption.
+	vmov.i8	q9, #0x0f		@ Required by _vpaes_schedule_transform
+
+	@ vpaes stores one fewer round count than bsaes, but the number of keys
+	@ is the same.
+	ldr	r2, [r1,#240]
+	add	r2, r2, #1
+	str	r2, [r0,#240]
+
+	@ Undo the basis change and reapply the S-box affine transform. See
+	@ .Lschedule_mangle_last.
+	vld1.64	{q0}, [r1]!
+	bl	_vpaes_schedule_transform
+	vrev32.8	q0, q0
+	vst1.64	{q0}, [r0]!
+
+	@ See _vpaes_schedule_mangle for the transform on the middle keys. Note
+	@ it simultaneously inverts MixColumns and the S-box affine transform.
+	@ See .Lk_dksd through .Lk_dks9.
+.Loop_dec_key_to_bsaes:
+	vld1.64	{q0}, [r1]!
+
+	@ Invert the ShiftRows step (see .Lschedule_mangle_both). Note going
+	@ forwards cancels inverting for which direction we cycle r3. We use r3
+	@ rather than r8 to avoid a callee-saved register.
+	vld1.64	{q1}, [r3]
+	vtbl.8	d4, {q0}, d2
+	vtbl.8	d5, {q0}, d3
+	add	r3, r3, #64-16
+	and	r3, r3, #~(1<<6)
+	vmov	q0, q2
+
+	@ Handle the last key differently.
+	subs	r2, r2, #1
+	beq	.Loop_dec_key_to_bsaes_last
+
+	@ Undo the basis change and reapply the S-box affine transform.
+	bl	_vpaes_schedule_transform
+
+	@ Rotate each word by 8 bytes (cycle the rows) and then byte-swap. We
+	@ combine the two operations in .Lk_decrypt_transform.
+	@
+	@ TODO(davidben): Where does the rotation come from?
+	vtbl.8	d2, {q0}, d24
+	vtbl.8	d3, {q0}, d25
+
+	vst1.64	{q1}, [r0]!
+	b	.Loop_dec_key_to_bsaes
+
+.Loop_dec_key_to_bsaes_last:
+	@ The final key only inverts ShiftRows (already done in the loop). See
+	@ .Lschedule_am_decrypting. Its basis is not transformed.
+	vrev32.8	q0, q0
+	vst1.64	{q0}, [r0]!
+
+	@ Wipe registers which contained key material.
+	veor	q0, q0, q0
+	veor	q1, q1, q1
+	veor	q2, q2, q2
+
+	ldmia	sp!, {r11, pc}	@ return
+.size	vpaes_decrypt_key_to_bsaes,.-vpaes_decrypt_key_to_bsaes
+.globl	vpaes_ctr32_encrypt_blocks
+.hidden	vpaes_ctr32_encrypt_blocks
+.type	vpaes_ctr32_encrypt_blocks,%function
+.align	4
+vpaes_ctr32_encrypt_blocks:
+	mov	ip, sp
+	stmdb	sp!, {r7,r8,r9,r10,r11, lr}
+	@ This function uses q4-q7 (d8-d15), which are callee-saved.
+	vstmdb	sp!, {d8,d9,d10,d11,d12,d13,d14,d15}
+
+	cmp	r2, #0
+	@ r8 is passed on the stack.
+	ldr	r8, [ip]
+	beq	.Lctr32_done
+
+	@ _vpaes_encrypt_core expects the key in r2, so swap r2 and r3.
+	mov	r9, r3
+	mov	r3, r2
+	mov	r2, r9
+
+	@ Load the IV and counter portion.
+	ldr	r7, [r8, #12]
+	vld1.8	{q7}, [r8]
+
+	bl	_vpaes_preheat
+	rev	r7, r7		@ The counter is big-endian.
+
+.Lctr32_loop:
+	vmov	q0, q7
+	vld1.8	{q6}, [r0]!		@ Load input ahead of time
+	bl	_vpaes_encrypt_core
+	veor	q0, q0, q6		@ XOR input and result
+	vst1.8	{q0}, [r1]!
+	subs	r3, r3, #1
+	@ Update the counter.
+	add	r7, r7, #1
+	rev	r9, r7
+	vmov.32	d15[1], r9
+	bne	.Lctr32_loop
+
+.Lctr32_done:
+	vldmia	sp!, {d8,d9,d10,d11,d12,d13,d14,d15}
+	ldmia	sp!, {r7,r8,r9,r10,r11, pc}	@ return
+.size	vpaes_ctr32_encrypt_blocks,.-vpaes_ctr32_encrypt_blocks
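+@ Conceptually, the loop above is CTR mode with a 32-bit big-endian counter
+@ in the last word of the IV. A rough sketch (the names and the
+@ block-encryption callback are illustrative, not part of this file):
+@
+@   def ctr32_encrypt_blocks(inp, key, iv, encrypt_block):
+@       ctr = int.from_bytes(iv[12:16], "big")
+@       out = bytearray()
+@       for i in range(len(inp) // 16):
+@           block = iv[:12] + ((ctr + i) & 0xffffffff).to_bytes(4, "big")
+@           ks = encrypt_block(key, block)           # one AES block encryption
+@           out += bytes(a ^ b for a, b in zip(ks, inp[16 * i:16 * (i + 1)]))
+@       return bytes(out)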
+#endif
+#endif  // !OPENSSL_NO_ASM
+.section	.note.GNU-stack,"",%progbits
--- /dev/null
+++ b/third_party/boringssl/linux-arm/crypto/test/trampoline-armv4.S
@@ -1,0 +1,379 @@
+// This file is generated from a similarly-named Perl script in the BoringSSL
+// source tree. Do not edit by hand.
+
+#if !defined(__has_feature)
+#define __has_feature(x) 0
+#endif
+#if __has_feature(memory_sanitizer) && !defined(OPENSSL_NO_ASM)
+#define OPENSSL_NO_ASM
+#endif
+
+#if !defined(OPENSSL_NO_ASM)
+#if defined(__arm__)
+#if defined(BORINGSSL_PREFIX)
+#include <boringssl_prefix_symbols_asm.h>
+#endif
+.syntax	unified
+
+.arch	armv7-a
+.fpu	vfp
+
+.text
+
+@ abi_test_trampoline loads callee-saved registers from |state|, calls |func|
+@ with |argv|, then saves the callee-saved registers into |state|. It returns
+@ the result of |func|. The |unwind| argument is unused.
+@ uint32_t abi_test_trampoline(void (*func)(...), CallerState *state,
+@                              const uint32_t *argv, size_t argc,
+@                              int unwind);
+.type	abi_test_trampoline, %function
+.globl	abi_test_trampoline
+.hidden	abi_test_trampoline
+.align	4
+abi_test_trampoline:
+	@ Save parameters and all callee-saved registers. For convenience, we
+	@ save r9 on iOS even though it's volatile.
+	vstmdb	sp!, {d8,d9,d10,d11,d12,d13,d14,d15}
+	stmdb	sp!, {r0,r1,r2,r3,r4,r5,r6,r7,r8,r9,r10,r11,lr}
+
+	@ Reserve stack space for six (10-4) stack parameters, plus an extra 4
+	@ bytes to keep it 8-byte-aligned (see AAPCS, section 5.3).
+	sub	sp, sp, #28
+
+	@ Every register in AAPCS is either non-volatile or a parameter (except
+	@ r9 on iOS), so this code, by the actual call, loses all its scratch
+	@ registers. First fill in stack parameters while there are registers
+	@ to spare.
+	cmp	r3, #4
+	bls	.Lstack_args_done
+	mov	r4, sp				@ r4 is the output pointer.
+	add	r5, r2, r3, lsl #2	@ Set r5 to the end of argv.
+	add	r2, r2, #16		@ Skip four arguments.
+.Lstack_args_loop:
+	ldr	r6, [r2], #4
+	cmp	r2, r5
+	str	r6, [r4], #4
+	bne	.Lstack_args_loop
+
+.Lstack_args_done:
+	@ Load registers from |r1|.
+	vldmia	r1!, {d8,d9,d10,d11,d12,d13,d14,d15}
+#if defined(__APPLE__)
+	@ r9 is not volatile on iOS.
+	ldmia	r1!, {r4,r5,r6,r7,r8,r10-r11}
+#else
+	ldmia	r1!, {r4,r5,r6,r7,r8,r9,r10,r11}
+#endif
+
+	@ Load register parameters. This uses up our remaining registers, so we
+	@ repurpose lr as scratch space.
+	ldr	r3, [sp, #40]	@ Reload argc.
+	ldr	lr, [sp, #36]		@ Load argv into lr.
+	cmp	r3, #3
+	bhi	.Larg_r3
+	beq	.Larg_r2
+	cmp	r3, #1
+	bhi	.Larg_r1
+	beq	.Larg_r0
+	b	.Largs_done
+
+.Larg_r3:
+	ldr	r3, [lr, #12]	@ argv[3]
+.Larg_r2:
+	ldr	r2, [lr, #8]	@ argv[2]
+.Larg_r1:
+	ldr	r1, [lr, #4]	@ argv[1]
+.Larg_r0:
+	ldr	r0, [lr]	@ argv[0]
+.Largs_done:
+
+	@ With every other register in use, load the function pointer into lr
+	@ and call the function.
+	ldr	lr, [sp, #28]
+	blx	lr
+
+	@ r1-r3 are free for use again. The trampoline only supports
+	@ single-return functions. Pass r4-r11 to the caller.
+	ldr	r1, [sp, #32]
+	vstmia	r1!, {d8,d9,d10,d11,d12,d13,d14,d15}
+#if defined(__APPLE__)
+	@ r9 is not volatile on iOS.
+	stmia	r1!, {r4,r5,r6,r7,r8,r10-r11}
+#else
+	stmia	r1!, {r4,r5,r6,r7,r8,r9,r10,r11}
+#endif
+
+	@ Unwind the stack and restore registers.
+	add	sp, sp, #44		@ 44 = 28+16
+	ldmia	sp!, {r4,r5,r6,r7,r8,r9,r10,r11,lr}	@ Skip r0-r3 (see +16 above).
+	vldmia	sp!, {d8,d9,d10,d11,d12,d13,d14,d15}
+
+	bx	lr
+.size	abi_test_trampoline,.-abi_test_trampoline
+.type	abi_test_clobber_r0, %function
+.globl	abi_test_clobber_r0
+.hidden	abi_test_clobber_r0
+.align	4
+abi_test_clobber_r0:
+	mov	r0, #0
+	bx	lr
+.size	abi_test_clobber_r0,.-abi_test_clobber_r0
+.type	abi_test_clobber_r1, %function
+.globl	abi_test_clobber_r1
+.hidden	abi_test_clobber_r1
+.align	4
+abi_test_clobber_r1:
+	mov	r1, #0
+	bx	lr
+.size	abi_test_clobber_r1,.-abi_test_clobber_r1
+.type	abi_test_clobber_r2, %function
+.globl	abi_test_clobber_r2
+.hidden	abi_test_clobber_r2
+.align	4
+abi_test_clobber_r2:
+	mov	r2, #0
+	bx	lr
+.size	abi_test_clobber_r2,.-abi_test_clobber_r2
+.type	abi_test_clobber_r3, %function
+.globl	abi_test_clobber_r3
+.hidden	abi_test_clobber_r3
+.align	4
+abi_test_clobber_r3:
+	mov	r3, #0
+	bx	lr
+.size	abi_test_clobber_r3,.-abi_test_clobber_r3
+.type	abi_test_clobber_r4, %function
+.globl	abi_test_clobber_r4
+.hidden	abi_test_clobber_r4
+.align	4
+abi_test_clobber_r4:
+	mov	r4, #0
+	bx	lr
+.size	abi_test_clobber_r4,.-abi_test_clobber_r4
+.type	abi_test_clobber_r5, %function
+.globl	abi_test_clobber_r5
+.hidden	abi_test_clobber_r5
+.align	4
+abi_test_clobber_r5:
+	mov	r5, #0
+	bx	lr
+.size	abi_test_clobber_r5,.-abi_test_clobber_r5
+.type	abi_test_clobber_r6, %function
+.globl	abi_test_clobber_r6
+.hidden	abi_test_clobber_r6
+.align	4
+abi_test_clobber_r6:
+	mov	r6, #0
+	bx	lr
+.size	abi_test_clobber_r6,.-abi_test_clobber_r6
+.type	abi_test_clobber_r7, %function
+.globl	abi_test_clobber_r7
+.hidden	abi_test_clobber_r7
+.align	4
+abi_test_clobber_r7:
+	mov	r7, #0
+	bx	lr
+.size	abi_test_clobber_r7,.-abi_test_clobber_r7
+.type	abi_test_clobber_r8, %function
+.globl	abi_test_clobber_r8
+.hidden	abi_test_clobber_r8
+.align	4
+abi_test_clobber_r8:
+	mov	r8, #0
+	bx	lr
+.size	abi_test_clobber_r8,.-abi_test_clobber_r8
+.type	abi_test_clobber_r9, %function
+.globl	abi_test_clobber_r9
+.hidden	abi_test_clobber_r9
+.align	4
+abi_test_clobber_r9:
+	mov	r9, #0
+	bx	lr
+.size	abi_test_clobber_r9,.-abi_test_clobber_r9
+.type	abi_test_clobber_r10, %function
+.globl	abi_test_clobber_r10
+.hidden	abi_test_clobber_r10
+.align	4
+abi_test_clobber_r10:
+	mov	r10, #0
+	bx	lr
+.size	abi_test_clobber_r10,.-abi_test_clobber_r10
+.type	abi_test_clobber_r11, %function
+.globl	abi_test_clobber_r11
+.hidden	abi_test_clobber_r11
+.align	4
+abi_test_clobber_r11:
+	mov	r11, #0
+	bx	lr
+.size	abi_test_clobber_r11,.-abi_test_clobber_r11
+.type	abi_test_clobber_r12, %function
+.globl	abi_test_clobber_r12
+.hidden	abi_test_clobber_r12
+.align	4
+abi_test_clobber_r12:
+	mov	r12, #0
+	bx	lr
+.size	abi_test_clobber_r12,.-abi_test_clobber_r12
+.type	abi_test_clobber_d0, %function
+.globl	abi_test_clobber_d0
+.hidden	abi_test_clobber_d0
+.align	4
+abi_test_clobber_d0:
+	mov	r0, #0
+	vmov	s0, r0
+	vmov	s1, r0
+	bx	lr
+.size	abi_test_clobber_d0,.-abi_test_clobber_d0
+.type	abi_test_clobber_d1, %function
+.globl	abi_test_clobber_d1
+.hidden	abi_test_clobber_d1
+.align	4
+abi_test_clobber_d1:
+	mov	r0, #0
+	vmov	s2, r0
+	vmov	s3, r0
+	bx	lr
+.size	abi_test_clobber_d1,.-abi_test_clobber_d1
+.type	abi_test_clobber_d2, %function
+.globl	abi_test_clobber_d2
+.hidden	abi_test_clobber_d2
+.align	4
+abi_test_clobber_d2:
+	mov	r0, #0
+	vmov	s4, r0
+	vmov	s5, r0
+	bx	lr
+.size	abi_test_clobber_d2,.-abi_test_clobber_d2
+.type	abi_test_clobber_d3, %function
+.globl	abi_test_clobber_d3
+.hidden	abi_test_clobber_d3
+.align	4
+abi_test_clobber_d3:
+	mov	r0, #0
+	vmov	s6, r0
+	vmov	s7, r0
+	bx	lr
+.size	abi_test_clobber_d3,.-abi_test_clobber_d3
+.type	abi_test_clobber_d4, %function
+.globl	abi_test_clobber_d4
+.hidden	abi_test_clobber_d4
+.align	4
+abi_test_clobber_d4:
+	mov	r0, #0
+	vmov	s8, r0
+	vmov	s9, r0
+	bx	lr
+.size	abi_test_clobber_d4,.-abi_test_clobber_d4
+.type	abi_test_clobber_d5, %function
+.globl	abi_test_clobber_d5
+.hidden	abi_test_clobber_d5
+.align	4
+abi_test_clobber_d5:
+	mov	r0, #0
+	vmov	s10, r0
+	vmov	s11, r0
+	bx	lr
+.size	abi_test_clobber_d5,.-abi_test_clobber_d5
+.type	abi_test_clobber_d6, %function
+.globl	abi_test_clobber_d6
+.hidden	abi_test_clobber_d6
+.align	4
+abi_test_clobber_d6:
+	mov	r0, #0
+	vmov	s12, r0
+	vmov	s13, r0
+	bx	lr
+.size	abi_test_clobber_d6,.-abi_test_clobber_d6
+.type	abi_test_clobber_d7, %function
+.globl	abi_test_clobber_d7
+.hidden	abi_test_clobber_d7
+.align	4
+abi_test_clobber_d7:
+	mov	r0, #0
+	vmov	s14, r0
+	vmov	s15, r0
+	bx	lr
+.size	abi_test_clobber_d7,.-abi_test_clobber_d7
+.type	abi_test_clobber_d8, %function
+.globl	abi_test_clobber_d8
+.hidden	abi_test_clobber_d8
+.align	4
+abi_test_clobber_d8:
+	mov	r0, #0
+	vmov	s16, r0
+	vmov	s17, r0
+	bx	lr
+.size	abi_test_clobber_d8,.-abi_test_clobber_d8
+.type	abi_test_clobber_d9, %function
+.globl	abi_test_clobber_d9
+.hidden	abi_test_clobber_d9
+.align	4
+abi_test_clobber_d9:
+	mov	r0, #0
+	vmov	s18, r0
+	vmov	s19, r0
+	bx	lr
+.size	abi_test_clobber_d9,.-abi_test_clobber_d9
+.type	abi_test_clobber_d10, %function
+.globl	abi_test_clobber_d10
+.hidden	abi_test_clobber_d10
+.align	4
+abi_test_clobber_d10:
+	mov	r0, #0
+	vmov	s20, r0
+	vmov	s21, r0
+	bx	lr
+.size	abi_test_clobber_d10,.-abi_test_clobber_d10
+.type	abi_test_clobber_d11, %function
+.globl	abi_test_clobber_d11
+.hidden	abi_test_clobber_d11
+.align	4
+abi_test_clobber_d11:
+	mov	r0, #0
+	vmov	s22, r0
+	vmov	s23, r0
+	bx	lr
+.size	abi_test_clobber_d11,.-abi_test_clobber_d11
+.type	abi_test_clobber_d12, %function
+.globl	abi_test_clobber_d12
+.hidden	abi_test_clobber_d12
+.align	4
+abi_test_clobber_d12:
+	mov	r0, #0
+	vmov	s24, r0
+	vmov	s25, r0
+	bx	lr
+.size	abi_test_clobber_d12,.-abi_test_clobber_d12
+.type	abi_test_clobber_d13, %function
+.globl	abi_test_clobber_d13
+.hidden	abi_test_clobber_d13
+.align	4
+abi_test_clobber_d13:
+	mov	r0, #0
+	vmov	s26, r0
+	vmov	s27, r0
+	bx	lr
+.size	abi_test_clobber_d13,.-abi_test_clobber_d13
+.type	abi_test_clobber_d14, %function
+.globl	abi_test_clobber_d14
+.hidden	abi_test_clobber_d14
+.align	4
+abi_test_clobber_d14:
+	mov	r0, #0
+	vmov	s28, r0
+	vmov	s29, r0
+	bx	lr
+.size	abi_test_clobber_d14,.-abi_test_clobber_d14
+.type	abi_test_clobber_d15, %function
+.globl	abi_test_clobber_d15
+.hidden	abi_test_clobber_d15
+.align	4
+abi_test_clobber_d15:
+	mov	r0, #0
+	vmov	s30, r0
+	vmov	s31, r0
+	bx	lr
+.size	abi_test_clobber_d15,.-abi_test_clobber_d15
+#endif
+#endif  // !OPENSSL_NO_ASM
+.section	.note.GNU-stack,"",%progbits
--- /dev/null
+++ b/third_party/boringssl/linux-ppc64le/crypto/fipsmodule/aesp8-ppc.S
@@ -1,0 +1,3670 @@
+// This file is generated from a similarly-named Perl script in the BoringSSL
+// source tree. Do not edit by hand.
+
+#if defined(__has_feature)
+#if __has_feature(memory_sanitizer) && !defined(OPENSSL_NO_ASM)
+#define OPENSSL_NO_ASM
+#endif
+#endif
+
+#if !defined(OPENSSL_NO_ASM) && defined(__powerpc64__)
+.machine	"any"
+
+.abiversion	2
+.text
+
+.align	7
+.Lrcon:
+.byte	0x00,0x00,0x00,0x01,0x00,0x00,0x00,0x01,0x00,0x00,0x00,0x01,0x00,0x00,0x00,0x01
+.byte	0x00,0x00,0x00,0x1b,0x00,0x00,0x00,0x1b,0x00,0x00,0x00,0x1b,0x00,0x00,0x00,0x1b
+.byte	0x0c,0x0f,0x0e,0x0d,0x0c,0x0f,0x0e,0x0d,0x0c,0x0f,0x0e,0x0d,0x0c,0x0f,0x0e,0x0d
+.byte	0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00
+.Lconsts:
+	mflr	0
+	bcl	20,31,$+4
+	mflr	6
+	addi	6,6,-0x48
+	mtlr	0
+	blr	
+.long	0
+.byte	0,12,0x14,0,0,0,0,0
+.byte	65,69,83,32,102,111,114,32,80,111,119,101,114,73,83,65,32,50,46,48,55,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0
+.align	2
+
+.globl	aes_hw_set_encrypt_key
+.type	aes_hw_set_encrypt_key,@function
+.align	5
+aes_hw_set_encrypt_key:
+.localentry	aes_hw_set_encrypt_key,0
+
+.Lset_encrypt_key:
+	mflr	11
+	std	11,16(1)
+
+	li	6,-1
+	cmpldi	3,0
+	beq-	.Lenc_key_abort
+	cmpldi	5,0
+	beq-	.Lenc_key_abort
+	li	6,-2
+	cmpwi	4,128
+	blt-	.Lenc_key_abort
+	cmpwi	4,256
+	bgt-	.Lenc_key_abort
+	andi.	0,4,0x3f
+	bne-	.Lenc_key_abort
+
+	lis	0,0xfff0
+	li	12,-1
+	or	0,0,0
+
+	bl	.Lconsts
+	mtlr	11
+
+	neg	9,3
+	lvx	1,0,3
+	addi	3,3,15
+	lvsr	3,0,9
+	li	8,0x20
+	cmpwi	4,192
+	lvx	2,0,3
+	vspltisb	5,0x0f
+	lvx	4,0,6
+	vxor	3,3,5
+	lvx	5,8,6
+	addi	6,6,0x10
+	vperm	1,1,2,3
+	li	7,8
+	vxor	0,0,0
+	mtctr	7
+
+	lvsl	8,0,5
+	vspltisb	9,-1
+	lvx	10,0,5
+	vperm	9,9,0,8
+
+	blt	.Loop128
+	addi	3,3,8
+	beq	.L192
+	addi	3,3,8
+	b	.L256
+
+.align	4
+.Loop128:
+	vperm	3,1,1,5
+	vsldoi	6,0,1,12
+	vperm	11,1,1,8
+	vsel	7,10,11,9
+	vor	10,11,11
+	.long	0x10632509
+	stvx	7,0,5
+	addi	5,5,16
+
+	vxor	1,1,6
+	vsldoi	6,0,6,12
+	vxor	1,1,6
+	vsldoi	6,0,6,12
+	vxor	1,1,6
+	vadduwm	4,4,4
+	vxor	1,1,3
+	bdnz	.Loop128
+
+	lvx	4,0,6
+
+	vperm	3,1,1,5
+	vsldoi	6,0,1,12
+	vperm	11,1,1,8
+	vsel	7,10,11,9
+	vor	10,11,11
+	.long	0x10632509
+	stvx	7,0,5
+	addi	5,5,16
+
+	vxor	1,1,6
+	vsldoi	6,0,6,12
+	vxor	1,1,6
+	vsldoi	6,0,6,12
+	vxor	1,1,6
+	vadduwm	4,4,4
+	vxor	1,1,3
+
+	vperm	3,1,1,5
+	vsldoi	6,0,1,12
+	vperm	11,1,1,8
+	vsel	7,10,11,9
+	vor	10,11,11
+	.long	0x10632509
+	stvx	7,0,5
+	addi	5,5,16
+
+	vxor	1,1,6
+	vsldoi	6,0,6,12
+	vxor	1,1,6
+	vsldoi	6,0,6,12
+	vxor	1,1,6
+	vxor	1,1,3
+	vperm	11,1,1,8
+	vsel	7,10,11,9
+	vor	10,11,11
+	stvx	7,0,5
+
+	addi	3,5,15
+	addi	5,5,0x50
+
+	li	8,10
+	b	.Ldone
+
+.align	4
+.L192:
+	lvx	6,0,3
+	li	7,4
+	vperm	11,1,1,8
+	vsel	7,10,11,9
+	vor	10,11,11
+	stvx	7,0,5
+	addi	5,5,16
+	vperm	2,2,6,3
+	vspltisb	3,8
+	mtctr	7
+	vsububm	5,5,3
+
+.Loop192:
+	vperm	3,2,2,5
+	vsldoi	6,0,1,12
+	.long	0x10632509
+
+	vxor	1,1,6
+	vsldoi	6,0,6,12
+	vxor	1,1,6
+	vsldoi	6,0,6,12
+	vxor	1,1,6
+
+	vsldoi	7,0,2,8
+	vspltw	6,1,3
+	vxor	6,6,2
+	vsldoi	2,0,2,12
+	vadduwm	4,4,4
+	vxor	2,2,6
+	vxor	1,1,3
+	vxor	2,2,3
+	vsldoi	7,7,1,8
+
+	vperm	3,2,2,5
+	vsldoi	6,0,1,12
+	vperm	11,7,7,8
+	vsel	7,10,11,9
+	vor	10,11,11
+	.long	0x10632509
+	stvx	7,0,5
+	addi	5,5,16
+
+	vsldoi	7,1,2,8
+	vxor	1,1,6
+	vsldoi	6,0,6,12
+	vperm	11,7,7,8
+	vsel	7,10,11,9
+	vor	10,11,11
+	vxor	1,1,6
+	vsldoi	6,0,6,12
+	vxor	1,1,6
+	stvx	7,0,5
+	addi	5,5,16
+
+	vspltw	6,1,3
+	vxor	6,6,2
+	vsldoi	2,0,2,12
+	vadduwm	4,4,4
+	vxor	2,2,6
+	vxor	1,1,3
+	vxor	2,2,3
+	vperm	11,1,1,8
+	vsel	7,10,11,9
+	vor	10,11,11
+	stvx	7,0,5
+	addi	3,5,15
+	addi	5,5,16
+	bdnz	.Loop192
+
+	li	8,12
+	addi	5,5,0x20
+	b	.Ldone
+
+.align	4
+.L256:
+	lvx	6,0,3
+	li	7,7
+	li	8,14
+	vperm	11,1,1,8
+	vsel	7,10,11,9
+	vor	10,11,11
+	stvx	7,0,5
+	addi	5,5,16
+	vperm	2,2,6,3
+	mtctr	7
+
+.Loop256:
+	vperm	3,2,2,5
+	vsldoi	6,0,1,12
+	vperm	11,2,2,8
+	vsel	7,10,11,9
+	vor	10,11,11
+	.long	0x10632509
+	stvx	7,0,5
+	addi	5,5,16
+
+	vxor	1,1,6
+	vsldoi	6,0,6,12
+	vxor	1,1,6
+	vsldoi	6,0,6,12
+	vxor	1,1,6
+	vadduwm	4,4,4
+	vxor	1,1,3
+	vperm	11,1,1,8
+	vsel	7,10,11,9
+	vor	10,11,11
+	stvx	7,0,5
+	addi	3,5,15
+	addi	5,5,16
+	bdz	.Ldone
+
+	vspltw	3,1,3
+	vsldoi	6,0,2,12
+	.long	0x106305C8
+
+	vxor	2,2,6
+	vsldoi	6,0,6,12
+	vxor	2,2,6
+	vsldoi	6,0,6,12
+	vxor	2,2,6
+
+	vxor	2,2,3
+	b	.Loop256
+
+.align	4
+.Ldone:
+	lvx	2,0,3
+	vsel	2,10,2,9
+	stvx	2,0,3
+	li	6,0
+	or	12,12,12
+	stw	8,0(5)
+
+.Lenc_key_abort:
+	mr	3,6
+	blr	
+.long	0
+.byte	0,12,0x14,1,0,0,3,0
+.long	0
+.size	aes_hw_set_encrypt_key,.-aes_hw_set_encrypt_key
+
+.globl	aes_hw_set_decrypt_key
+.type	aes_hw_set_decrypt_key,@function
+.align	5
+aes_hw_set_decrypt_key:
+.localentry	aes_hw_set_decrypt_key,0
+
+	stdu	1,-64(1)
+	mflr	10
+	std	10,80(1)
+	bl	.Lset_encrypt_key
+	mtlr	10
+
+	cmpwi	3,0
+	bne-	.Ldec_key_abort
+
+	slwi	7,8,4
+	subi	3,5,240
+	srwi	8,8,1
+	add	5,3,7
+	mtctr	8
+
+.Ldeckey:
+	lwz	0, 0(3)
+	lwz	6, 4(3)
+	lwz	7, 8(3)
+	lwz	8, 12(3)
+	addi	3,3,16
+	lwz	9, 0(5)
+	lwz	10,4(5)
+	lwz	11,8(5)
+	lwz	12,12(5)
+	stw	0, 0(5)
+	stw	6, 4(5)
+	stw	7, 8(5)
+	stw	8, 12(5)
+	subi	5,5,16
+	stw	9, -16(3)
+	stw	10,-12(3)
+	stw	11,-8(3)
+	stw	12,-4(3)
+	bdnz	.Ldeckey
+
+	xor	3,3,3
+.Ldec_key_abort:
+	addi	1,1,64
+	blr	
+.long	0
+.byte	0,12,4,1,0x80,0,3,0
+.long	0
+.size	aes_hw_set_decrypt_key,.-aes_hw_set_decrypt_key
+.globl	aes_hw_encrypt
+.type	aes_hw_encrypt,@function
+.align	5
+aes_hw_encrypt:
+.localentry	aes_hw_encrypt,0
+
+	lwz	6,240(5)
+	lis	0,0xfc00
+	li	12,-1
+	li	7,15
+	or	0,0,0
+
+	lvx	0,0,3
+	neg	11,4
+	lvx	1,7,3
+	lvsl	2,0,3
+	vspltisb	4,0x0f
+	lvsr	3,0,11
+	vxor	2,2,4
+	li	7,16
+	vperm	0,0,1,2
+	lvx	1,0,5
+	lvsr	5,0,5
+	srwi	6,6,1
+	lvx	2,7,5
+	addi	7,7,16
+	subi	6,6,1
+	vperm	1,2,1,5
+
+	vxor	0,0,1
+	lvx	1,7,5
+	addi	7,7,16
+	mtctr	6
+
+.Loop_enc:
+	vperm	2,1,2,5
+	.long	0x10001508
+	lvx	2,7,5
+	addi	7,7,16
+	vperm	1,2,1,5
+	.long	0x10000D08
+	lvx	1,7,5
+	addi	7,7,16
+	bdnz	.Loop_enc
+
+	vperm	2,1,2,5
+	.long	0x10001508
+	lvx	2,7,5
+	vperm	1,2,1,5
+	.long	0x10000D09
+
+	vspltisb	2,-1
+	vxor	1,1,1
+	li	7,15
+	vperm	2,2,1,3
+	vxor	3,3,4
+	lvx	1,0,4
+	vperm	0,0,0,3
+	vsel	1,1,0,2
+	lvx	4,7,4
+	stvx	1,0,4
+	vsel	0,0,4,2
+	stvx	0,7,4
+
+	or	12,12,12
+	blr	
+.long	0
+.byte	0,12,0x14,0,0,0,3,0
+.long	0
+.size	aes_hw_encrypt,.-aes_hw_encrypt
+.globl	aes_hw_decrypt
+.type	aes_hw_decrypt,@function
+.align	5
+aes_hw_decrypt:
+.localentry	aes_hw_decrypt,0
+
+	lwz	6,240(5)
+	lis	0,0xfc00
+	li	12,-1
+	li	7,15
+	or	0,0,0
+
+	lvx	0,0,3
+	neg	11,4
+	lvx	1,7,3
+	lvsl	2,0,3
+	vspltisb	4,0x0f
+	lvsr	3,0,11
+	vxor	2,2,4
+	li	7,16
+	vperm	0,0,1,2
+	lvx	1,0,5
+	lvsr	5,0,5
+	srwi	6,6,1
+	lvx	2,7,5
+	addi	7,7,16
+	subi	6,6,1
+	vperm	1,2,1,5
+
+	vxor	0,0,1
+	lvx	1,7,5
+	addi	7,7,16
+	mtctr	6
+
+.Loop_dec:
+	vperm	2,1,2,5
+	.long	0x10001548
+	lvx	2,7,5
+	addi	7,7,16
+	vperm	1,2,1,5
+	.long	0x10000D48
+	lvx	1,7,5
+	addi	7,7,16
+	bdnz	.Loop_dec
+
+	vperm	2,1,2,5
+	.long	0x10001548
+	lvx	2,7,5
+	vperm	1,2,1,5
+	.long	0x10000D49
+
+	vspltisb	2,-1
+	vxor	1,1,1
+	li	7,15
+	vperm	2,2,1,3
+	vxor	3,3,4
+	lvx	1,0,4
+	vperm	0,0,0,3
+	vsel	1,1,0,2
+	lvx	4,7,4
+	stvx	1,0,4
+	vsel	0,0,4,2
+	stvx	0,7,4
+
+	or	12,12,12
+	blr	
+.long	0
+.byte	0,12,0x14,0,0,0,3,0
+.long	0
+.size	aes_hw_decrypt,.-aes_hw_decrypt
+.globl	aes_hw_cbc_encrypt
+.type	aes_hw_cbc_encrypt,@function
+.align	5
+aes_hw_cbc_encrypt:
+.localentry	aes_hw_cbc_encrypt,0
+
+	cmpldi	5,16
+	.long	0x4dc00020
+
+	cmpwi	8,0
+	lis	0,0xffe0
+	li	12,-1
+	or	0,0,0
+
+	li	10,15
+	vxor	0,0,0
+	vspltisb	3,0x0f
+
+	lvx	4,0,7
+	lvsl	6,0,7
+	lvx	5,10,7
+	vxor	6,6,3
+	vperm	4,4,5,6
+
+	neg	11,3
+	lvsr	10,0,6
+	lwz	9,240(6)
+
+	lvsr	6,0,11
+	lvx	5,0,3
+	addi	3,3,15
+	vxor	6,6,3
+
+	lvsl	8,0,4
+	vspltisb	9,-1
+	lvx	7,0,4
+	vperm	9,9,0,8
+	vxor	8,8,3
+
+	srwi	9,9,1
+	li	10,16
+	subi	9,9,1
+	beq	.Lcbc_dec
+
+.Lcbc_enc:
+	vor	2,5,5
+	lvx	5,0,3
+	addi	3,3,16
+	mtctr	9
+	subi	5,5,16
+
+	lvx	0,0,6
+	vperm	2,2,5,6
+	lvx	1,10,6
+	addi	10,10,16
+	vperm	0,1,0,10
+	vxor	2,2,0
+	lvx	0,10,6
+	addi	10,10,16
+	vxor	2,2,4
+
+.Loop_cbc_enc:
+	vperm	1,0,1,10
+	.long	0x10420D08
+	lvx	1,10,6
+	addi	10,10,16
+	vperm	0,1,0,10
+	.long	0x10420508
+	lvx	0,10,6
+	addi	10,10,16
+	bdnz	.Loop_cbc_enc
+
+	vperm	1,0,1,10
+	.long	0x10420D08
+	lvx	1,10,6
+	li	10,16
+	vperm	0,1,0,10
+	.long	0x10820509
+	cmpldi	5,16
+
+	vperm	3,4,4,8
+	vsel	2,7,3,9
+	vor	7,3,3
+	stvx	2,0,4
+	addi	4,4,16
+	bge	.Lcbc_enc
+
+	b	.Lcbc_done
+
+.align	4
+.Lcbc_dec:
+	cmpldi	5,128
+	bge	_aesp8_cbc_decrypt8x
+	vor	3,5,5
+	lvx	5,0,3
+	addi	3,3,16
+	mtctr	9
+	subi	5,5,16
+
+	lvx	0,0,6
+	vperm	3,3,5,6
+	lvx	1,10,6
+	addi	10,10,16
+	vperm	0,1,0,10
+	vxor	2,3,0
+	lvx	0,10,6
+	addi	10,10,16
+
+.Loop_cbc_dec:
+	vperm	1,0,1,10
+	.long	0x10420D48
+	lvx	1,10,6
+	addi	10,10,16
+	vperm	0,1,0,10
+	.long	0x10420548
+	lvx	0,10,6
+	addi	10,10,16
+	bdnz	.Loop_cbc_dec
+
+	vperm	1,0,1,10
+	.long	0x10420D48
+	lvx	1,10,6
+	li	10,16
+	vperm	0,1,0,10
+	.long	0x10420549
+	cmpldi	5,16
+
+	vxor	2,2,4
+	vor	4,3,3
+	vperm	3,2,2,8
+	vsel	2,7,3,9
+	vor	7,3,3
+	stvx	2,0,4
+	addi	4,4,16
+	bge	.Lcbc_dec
+
+.Lcbc_done:
+	addi	4,4,-1
+	lvx	2,0,4
+	vsel	2,7,2,9
+	stvx	2,0,4
+
+	neg	8,7
+	li	10,15
+	vxor	0,0,0
+	vspltisb	9,-1
+	vspltisb	3,0x0f
+	lvsr	8,0,8
+	vperm	9,9,0,8
+	vxor	8,8,3
+	lvx	7,0,7
+	vperm	4,4,4,8
+	vsel	2,7,4,9
+	lvx	5,10,7
+	stvx	2,0,7
+	vsel	2,4,5,9
+	stvx	2,10,7
+
+	or	12,12,12
+	blr	
+.long	0
+.byte	0,12,0x14,0,0,0,6,0
+.long	0
+.align	5
+_aesp8_cbc_decrypt8x:
+	stdu	1,-448(1)
+	li	10,207
+	li	11,223
+	stvx	20,10,1
+	addi	10,10,32
+	stvx	21,11,1
+	addi	11,11,32
+	stvx	22,10,1
+	addi	10,10,32
+	stvx	23,11,1
+	addi	11,11,32
+	stvx	24,10,1
+	addi	10,10,32
+	stvx	25,11,1
+	addi	11,11,32
+	stvx	26,10,1
+	addi	10,10,32
+	stvx	27,11,1
+	addi	11,11,32
+	stvx	28,10,1
+	addi	10,10,32
+	stvx	29,11,1
+	addi	11,11,32
+	stvx	30,10,1
+	stvx	31,11,1
+	li	0,-1
+	stw	12,396(1)
+	li	8,0x10
+	std	26,400(1)
+	li	26,0x20
+	std	27,408(1)
+	li	27,0x30
+	std	28,416(1)
+	li	28,0x40
+	std	29,424(1)
+	li	29,0x50
+	std	30,432(1)
+	li	30,0x60
+	std	31,440(1)
+	li	31,0x70
+	or	0,0,0
+
+	subi	9,9,3
+	subi	5,5,128
+
+	lvx	23,0,6
+	lvx	30,8,6
+	addi	6,6,0x20
+	lvx	31,0,6
+	vperm	23,30,23,10
+	addi	11,1,79
+	mtctr	9
+
+.Load_cbc_dec_key:
+	vperm	24,31,30,10
+	lvx	30,8,6
+	addi	6,6,0x20
+	stvx	24,0,11
+	vperm	25,30,31,10
+	lvx	31,0,6
+	stvx	25,8,11
+	addi	11,11,0x20
+	bdnz	.Load_cbc_dec_key
+
+	lvx	26,8,6
+	vperm	24,31,30,10
+	lvx	27,26,6
+	stvx	24,0,11
+	vperm	25,26,31,10
+	lvx	28,27,6
+	stvx	25,8,11
+	addi	11,1,79
+	vperm	26,27,26,10
+	lvx	29,28,6
+	vperm	27,28,27,10
+	lvx	30,29,6
+	vperm	28,29,28,10
+	lvx	31,30,6
+	vperm	29,30,29,10
+	lvx	14,31,6
+	vperm	30,31,30,10
+	lvx	24,0,11
+	vperm	31,14,31,10
+	lvx	25,8,11
+
+
+
+	subi	3,3,15
+
+	li	10,8
+	.long	0x7C001E99
+	lvsl	6,0,10
+	vspltisb	3,0x0f
+	.long	0x7C281E99
+	vxor	6,6,3
+	.long	0x7C5A1E99
+	vperm	0,0,0,6
+	.long	0x7C7B1E99
+	vperm	1,1,1,6
+	.long	0x7D5C1E99
+	vperm	2,2,2,6
+	vxor	14,0,23
+	.long	0x7D7D1E99
+	vperm	3,3,3,6
+	vxor	15,1,23
+	.long	0x7D9E1E99
+	vperm	10,10,10,6
+	vxor	16,2,23
+	.long	0x7DBF1E99
+	addi	3,3,0x80
+	vperm	11,11,11,6
+	vxor	17,3,23
+	vperm	12,12,12,6
+	vxor	18,10,23
+	vperm	13,13,13,6
+	vxor	19,11,23
+	vxor	20,12,23
+	vxor	21,13,23
+
+	mtctr	9
+	b	.Loop_cbc_dec8x
+.align	5
+.Loop_cbc_dec8x:
+	.long	0x11CEC548
+	.long	0x11EFC548
+	.long	0x1210C548
+	.long	0x1231C548
+	.long	0x1252C548
+	.long	0x1273C548
+	.long	0x1294C548
+	.long	0x12B5C548
+	lvx	24,26,11
+	addi	11,11,0x20
+
+	.long	0x11CECD48
+	.long	0x11EFCD48
+	.long	0x1210CD48
+	.long	0x1231CD48
+	.long	0x1252CD48
+	.long	0x1273CD48
+	.long	0x1294CD48
+	.long	0x12B5CD48
+	lvx	25,8,11
+	bdnz	.Loop_cbc_dec8x
+
+	subic	5,5,128
+	.long	0x11CEC548
+	.long	0x11EFC548
+	.long	0x1210C548
+	.long	0x1231C548
+	.long	0x1252C548
+	.long	0x1273C548
+	.long	0x1294C548
+	.long	0x12B5C548
+
+	subfe.	0,0,0
+	.long	0x11CECD48
+	.long	0x11EFCD48
+	.long	0x1210CD48
+	.long	0x1231CD48
+	.long	0x1252CD48
+	.long	0x1273CD48
+	.long	0x1294CD48
+	.long	0x12B5CD48
+
+	and	0,0,5
+	.long	0x11CED548
+	.long	0x11EFD548
+	.long	0x1210D548
+	.long	0x1231D548
+	.long	0x1252D548
+	.long	0x1273D548
+	.long	0x1294D548
+	.long	0x12B5D548
+
+	add	3,3,0
+
+
+
+	.long	0x11CEDD48
+	.long	0x11EFDD48
+	.long	0x1210DD48
+	.long	0x1231DD48
+	.long	0x1252DD48
+	.long	0x1273DD48
+	.long	0x1294DD48
+	.long	0x12B5DD48
+
+	addi	11,1,79
+	.long	0x11CEE548
+	.long	0x11EFE548
+	.long	0x1210E548
+	.long	0x1231E548
+	.long	0x1252E548
+	.long	0x1273E548
+	.long	0x1294E548
+	.long	0x12B5E548
+	lvx	24,0,11
+
+	.long	0x11CEED48
+	.long	0x11EFED48
+	.long	0x1210ED48
+	.long	0x1231ED48
+	.long	0x1252ED48
+	.long	0x1273ED48
+	.long	0x1294ED48
+	.long	0x12B5ED48
+	lvx	25,8,11
+
+	.long	0x11CEF548
+	vxor	4,4,31
+	.long	0x11EFF548
+	vxor	0,0,31
+	.long	0x1210F548
+	vxor	1,1,31
+	.long	0x1231F548
+	vxor	2,2,31
+	.long	0x1252F548
+	vxor	3,3,31
+	.long	0x1273F548
+	vxor	10,10,31
+	.long	0x1294F548
+	vxor	11,11,31
+	.long	0x12B5F548
+	vxor	12,12,31
+
+	.long	0x11CE2549
+	.long	0x11EF0549
+	.long	0x7C001E99
+	.long	0x12100D49
+	.long	0x7C281E99
+	.long	0x12311549
+	vperm	0,0,0,6
+	.long	0x7C5A1E99
+	.long	0x12521D49
+	vperm	1,1,1,6
+	.long	0x7C7B1E99
+	.long	0x12735549
+	vperm	2,2,2,6
+	.long	0x7D5C1E99
+	.long	0x12945D49
+	vperm	3,3,3,6
+	.long	0x7D7D1E99
+	.long	0x12B56549
+	vperm	10,10,10,6
+	.long	0x7D9E1E99
+	vor	4,13,13
+	vperm	11,11,11,6
+	.long	0x7DBF1E99
+	addi	3,3,0x80
+
+	vperm	14,14,14,6
+	vperm	15,15,15,6
+	.long	0x7DC02799
+	vperm	12,12,12,6
+	vxor	14,0,23
+	vperm	16,16,16,6
+	.long	0x7DE82799
+	vperm	13,13,13,6
+	vxor	15,1,23
+	vperm	17,17,17,6
+	.long	0x7E1A2799
+	vxor	16,2,23
+	vperm	18,18,18,6
+	.long	0x7E3B2799
+	vxor	17,3,23
+	vperm	19,19,19,6
+	.long	0x7E5C2799
+	vxor	18,10,23
+	vperm	20,20,20,6
+	.long	0x7E7D2799
+	vxor	19,11,23
+	vperm	21,21,21,6
+	.long	0x7E9E2799
+	vxor	20,12,23
+	.long	0x7EBF2799
+	addi	4,4,0x80
+	vxor	21,13,23
+
+	mtctr	9
+	beq	.Loop_cbc_dec8x
+
+	addic.	5,5,128
+	beq	.Lcbc_dec8x_done
+	nop	
+	nop	
+
+.Loop_cbc_dec8x_tail:
+	.long	0x11EFC548
+	.long	0x1210C548
+	.long	0x1231C548
+	.long	0x1252C548
+	.long	0x1273C548
+	.long	0x1294C548
+	.long	0x12B5C548
+	lvx	24,26,11
+	addi	11,11,0x20
+
+	.long	0x11EFCD48
+	.long	0x1210CD48
+	.long	0x1231CD48
+	.long	0x1252CD48
+	.long	0x1273CD48
+	.long	0x1294CD48
+	.long	0x12B5CD48
+	lvx	25,8,11
+	bdnz	.Loop_cbc_dec8x_tail
+
+	.long	0x11EFC548
+	.long	0x1210C548
+	.long	0x1231C548
+	.long	0x1252C548
+	.long	0x1273C548
+	.long	0x1294C548
+	.long	0x12B5C548
+
+	.long	0x11EFCD48
+	.long	0x1210CD48
+	.long	0x1231CD48
+	.long	0x1252CD48
+	.long	0x1273CD48
+	.long	0x1294CD48
+	.long	0x12B5CD48
+
+	.long	0x11EFD548
+	.long	0x1210D548
+	.long	0x1231D548
+	.long	0x1252D548
+	.long	0x1273D548
+	.long	0x1294D548
+	.long	0x12B5D548
+
+	.long	0x11EFDD48
+	.long	0x1210DD48
+	.long	0x1231DD48
+	.long	0x1252DD48
+	.long	0x1273DD48
+	.long	0x1294DD48
+	.long	0x12B5DD48
+
+	.long	0x11EFE548
+	.long	0x1210E548
+	.long	0x1231E548
+	.long	0x1252E548
+	.long	0x1273E548
+	.long	0x1294E548
+	.long	0x12B5E548
+
+	.long	0x11EFED48
+	.long	0x1210ED48
+	.long	0x1231ED48
+	.long	0x1252ED48
+	.long	0x1273ED48
+	.long	0x1294ED48
+	.long	0x12B5ED48
+
+	.long	0x11EFF548
+	vxor	4,4,31
+	.long	0x1210F548
+	vxor	1,1,31
+	.long	0x1231F548
+	vxor	2,2,31
+	.long	0x1252F548
+	vxor	3,3,31
+	.long	0x1273F548
+	vxor	10,10,31
+	.long	0x1294F548
+	vxor	11,11,31
+	.long	0x12B5F548
+	vxor	12,12,31
+
+	cmplwi	5,32
+	blt	.Lcbc_dec8x_one
+	nop	
+	beq	.Lcbc_dec8x_two
+	cmplwi	5,64
+	blt	.Lcbc_dec8x_three
+	nop	
+	beq	.Lcbc_dec8x_four
+	cmplwi	5,96
+	blt	.Lcbc_dec8x_five
+	nop	
+	beq	.Lcbc_dec8x_six
+
+.Lcbc_dec8x_seven:
+	.long	0x11EF2549
+	.long	0x12100D49
+	.long	0x12311549
+	.long	0x12521D49
+	.long	0x12735549
+	.long	0x12945D49
+	.long	0x12B56549
+	vor	4,13,13
+
+	vperm	15,15,15,6
+	vperm	16,16,16,6
+	.long	0x7DE02799
+	vperm	17,17,17,6
+	.long	0x7E082799
+	vperm	18,18,18,6
+	.long	0x7E3A2799
+	vperm	19,19,19,6
+	.long	0x7E5B2799
+	vperm	20,20,20,6
+	.long	0x7E7C2799
+	vperm	21,21,21,6
+	.long	0x7E9D2799
+	.long	0x7EBE2799
+	addi	4,4,0x70
+	b	.Lcbc_dec8x_done
+
+.align	5
+.Lcbc_dec8x_six:
+	.long	0x12102549
+	.long	0x12311549
+	.long	0x12521D49
+	.long	0x12735549
+	.long	0x12945D49
+	.long	0x12B56549
+	vor	4,13,13
+
+	vperm	16,16,16,6
+	vperm	17,17,17,6
+	.long	0x7E002799
+	vperm	18,18,18,6
+	.long	0x7E282799
+	vperm	19,19,19,6
+	.long	0x7E5A2799
+	vperm	20,20,20,6
+	.long	0x7E7B2799
+	vperm	21,21,21,6
+	.long	0x7E9C2799
+	.long	0x7EBD2799
+	addi	4,4,0x60
+	b	.Lcbc_dec8x_done
+
+.align	5
+.Lcbc_dec8x_five:
+	.long	0x12312549
+	.long	0x12521D49
+	.long	0x12735549
+	.long	0x12945D49
+	.long	0x12B56549
+	vor	4,13,13
+
+	vperm	17,17,17,6
+	vperm	18,18,18,6
+	.long	0x7E202799
+	vperm	19,19,19,6
+	.long	0x7E482799
+	vperm	20,20,20,6
+	.long	0x7E7A2799
+	vperm	21,21,21,6
+	.long	0x7E9B2799
+	.long	0x7EBC2799
+	addi	4,4,0x50
+	b	.Lcbc_dec8x_done
+
+.align	5
+.Lcbc_dec8x_four:
+	.long	0x12522549
+	.long	0x12735549
+	.long	0x12945D49
+	.long	0x12B56549
+	vor	4,13,13
+
+	vperm	18,18,18,6
+	vperm	19,19,19,6
+	.long	0x7E402799
+	vperm	20,20,20,6
+	.long	0x7E682799
+	vperm	21,21,21,6
+	.long	0x7E9A2799
+	.long	0x7EBB2799
+	addi	4,4,0x40
+	b	.Lcbc_dec8x_done
+
+.align	5
+.Lcbc_dec8x_three:
+	.long	0x12732549
+	.long	0x12945D49
+	.long	0x12B56549
+	vor	4,13,13
+
+	vperm	19,19,19,6
+	vperm	20,20,20,6
+	.long	0x7E602799
+	vperm	21,21,21,6
+	.long	0x7E882799
+	.long	0x7EBA2799
+	addi	4,4,0x30
+	b	.Lcbc_dec8x_done
+
+.align	5
+.Lcbc_dec8x_two:
+	.long	0x12942549
+	.long	0x12B56549
+	vor	4,13,13
+
+	vperm	20,20,20,6
+	vperm	21,21,21,6
+	.long	0x7E802799
+	.long	0x7EA82799
+	addi	4,4,0x20
+	b	.Lcbc_dec8x_done
+
+.align	5
+.Lcbc_dec8x_one:
+	.long	0x12B52549
+	vor	4,13,13
+
+	vperm	21,21,21,6
+	.long	0x7EA02799
+	addi	4,4,0x10
+
+.Lcbc_dec8x_done:
+	vperm	4,4,4,6
+	.long	0x7C803F99
+
+	li	10,79
+	li	11,95
+	stvx	6,10,1
+	addi	10,10,32
+	stvx	6,11,1
+	addi	11,11,32
+	stvx	6,10,1
+	addi	10,10,32
+	stvx	6,11,1
+	addi	11,11,32
+	stvx	6,10,1
+	addi	10,10,32
+	stvx	6,11,1
+	addi	11,11,32
+	stvx	6,10,1
+	addi	10,10,32
+	stvx	6,11,1
+	addi	11,11,32
+
+	or	12,12,12
+	lvx	20,10,1
+	addi	10,10,32
+	lvx	21,11,1
+	addi	11,11,32
+	lvx	22,10,1
+	addi	10,10,32
+	lvx	23,11,1
+	addi	11,11,32
+	lvx	24,10,1
+	addi	10,10,32
+	lvx	25,11,1
+	addi	11,11,32
+	lvx	26,10,1
+	addi	10,10,32
+	lvx	27,11,1
+	addi	11,11,32
+	lvx	28,10,1
+	addi	10,10,32
+	lvx	29,11,1
+	addi	11,11,32
+	lvx	30,10,1
+	lvx	31,11,1
+	ld	26,400(1)
+	ld	27,408(1)
+	ld	28,416(1)
+	ld	29,424(1)
+	ld	30,432(1)
+	ld	31,440(1)
+	addi	1,1,448
+	blr	
+.long	0
+.byte	0,12,0x04,0,0x80,6,6,0
+.long	0
+.size	aes_hw_cbc_encrypt,.-aes_hw_cbc_encrypt
+.globl	aes_hw_ctr32_encrypt_blocks
+.type	aes_hw_ctr32_encrypt_blocks,@function
+.align	5
+aes_hw_ctr32_encrypt_blocks:
+.localentry	aes_hw_ctr32_encrypt_blocks,0
+
+	cmpldi	5,1
+	.long	0x4dc00020
+
+	lis	0,0xfff0
+	li	12,-1
+	or	0,0,0
+
+	li	10,15
+	vxor	0,0,0
+	vspltisb	3,0x0f
+
+	lvx	4,0,7
+	lvsl	6,0,7
+	lvx	5,10,7
+	vspltisb	11,1
+	vxor	6,6,3
+	vperm	4,4,5,6
+	vsldoi	11,0,11,1
+
+	neg	11,3
+	lvsr	10,0,6
+	lwz	9,240(6)
+
+	lvsr	6,0,11
+	lvx	5,0,3
+	addi	3,3,15
+	vxor	6,6,3
+
+	srwi	9,9,1
+	li	10,16
+	subi	9,9,1
+
+	cmpldi	5,8
+	bge	_aesp8_ctr32_encrypt8x
+
+	lvsl	8,0,4
+	vspltisb	9,-1
+	lvx	7,0,4
+	vperm	9,9,0,8
+	vxor	8,8,3
+
+	lvx	0,0,6
+	mtctr	9
+	lvx	1,10,6
+	addi	10,10,16
+	vperm	0,1,0,10
+	vxor	2,4,0
+	lvx	0,10,6
+	addi	10,10,16
+	b	.Loop_ctr32_enc
+
+.align	5
+.Loop_ctr32_enc:
+	vperm	1,0,1,10
+	.long	0x10420D08
+	lvx	1,10,6
+	addi	10,10,16
+	vperm	0,1,0,10
+	.long	0x10420508
+	lvx	0,10,6
+	addi	10,10,16
+	bdnz	.Loop_ctr32_enc
+
+	vadduwm	4,4,11
+	vor	3,5,5
+	lvx	5,0,3
+	addi	3,3,16
+	subic.	5,5,1
+
+	vperm	1,0,1,10
+	.long	0x10420D08
+	lvx	1,10,6
+	vperm	3,3,5,6
+	li	10,16
+	vperm	1,1,0,10
+	lvx	0,0,6
+	vxor	3,3,1
+	.long	0x10421D09
+
+	lvx	1,10,6
+	addi	10,10,16
+	vperm	2,2,2,8
+	vsel	3,7,2,9
+	mtctr	9
+	vperm	0,1,0,10
+	vor	7,2,2
+	vxor	2,4,0
+	lvx	0,10,6
+	addi	10,10,16
+	stvx	3,0,4
+	addi	4,4,16
+	bne	.Loop_ctr32_enc
+
+	addi	4,4,-1
+	lvx	2,0,4
+	vsel	2,7,2,9
+	stvx	2,0,4
+
+	or	12,12,12
+	blr	
+.long	0
+.byte	0,12,0x14,0,0,0,6,0
+.long	0
+.align	5
+_aesp8_ctr32_encrypt8x:
+	stdu	1,-448(1)
+	li	10,207
+	li	11,223
+	stvx	20,10,1
+	addi	10,10,32
+	stvx	21,11,1
+	addi	11,11,32
+	stvx	22,10,1
+	addi	10,10,32
+	stvx	23,11,1
+	addi	11,11,32
+	stvx	24,10,1
+	addi	10,10,32
+	stvx	25,11,1
+	addi	11,11,32
+	stvx	26,10,1
+	addi	10,10,32
+	stvx	27,11,1
+	addi	11,11,32
+	stvx	28,10,1
+	addi	10,10,32
+	stvx	29,11,1
+	addi	11,11,32
+	stvx	30,10,1
+	stvx	31,11,1
+	li	0,-1
+	stw	12,396(1)
+	li	8,0x10
+	std	26,400(1)
+	li	26,0x20
+	std	27,408(1)
+	li	27,0x30
+	std	28,416(1)
+	li	28,0x40
+	std	29,424(1)
+	li	29,0x50
+	std	30,432(1)
+	li	30,0x60
+	std	31,440(1)
+	li	31,0x70
+	or	0,0,0
+
+	subi	9,9,3
+
+	lvx	23,0,6
+	lvx	30,8,6
+	addi	6,6,0x20
+	lvx	31,0,6
+	vperm	23,30,23,10
+	addi	11,1,79
+	mtctr	9
+
+.Load_ctr32_enc_key:
+	vperm	24,31,30,10
+	lvx	30,8,6
+	addi	6,6,0x20
+	stvx	24,0,11
+	vperm	25,30,31,10
+	lvx	31,0,6
+	stvx	25,8,11
+	addi	11,11,0x20
+	bdnz	.Load_ctr32_enc_key
+
+	lvx	26,8,6
+	vperm	24,31,30,10
+	lvx	27,26,6
+	stvx	24,0,11
+	vperm	25,26,31,10
+	lvx	28,27,6
+	stvx	25,8,11
+	addi	11,1,79
+	vperm	26,27,26,10
+	lvx	29,28,6
+	vperm	27,28,27,10
+	lvx	30,29,6
+	vperm	28,29,28,10
+	lvx	31,30,6
+	vperm	29,30,29,10
+	lvx	15,31,6
+	vperm	30,31,30,10
+	lvx	24,0,11
+	vperm	31,15,31,10
+	lvx	25,8,11
+
+	vadduwm	7,11,11
+	subi	3,3,15
+	sldi	5,5,4
+
+	vadduwm	16,4,11
+	vadduwm	17,4,7
+	vxor	15,4,23
+	li	10,8
+	vadduwm	18,16,7
+	vxor	16,16,23
+	lvsl	6,0,10
+	vadduwm	19,17,7
+	vxor	17,17,23
+	vspltisb	3,0x0f
+	vadduwm	20,18,7
+	vxor	18,18,23
+	vxor	6,6,3
+	vadduwm	21,19,7
+	vxor	19,19,23
+	vadduwm	22,20,7
+	vxor	20,20,23
+	vadduwm	4,21,7
+	vxor	21,21,23
+	vxor	22,22,23
+
+	mtctr	9
+	b	.Loop_ctr32_enc8x
+.align	5
+.Loop_ctr32_enc8x:
+	.long	0x11EFC508
+	.long	0x1210C508
+	.long	0x1231C508
+	.long	0x1252C508
+	.long	0x1273C508
+	.long	0x1294C508
+	.long	0x12B5C508
+	.long	0x12D6C508
+.Loop_ctr32_enc8x_middle:
+	lvx	24,26,11
+	addi	11,11,0x20
+
+	.long	0x11EFCD08
+	.long	0x1210CD08
+	.long	0x1231CD08
+	.long	0x1252CD08
+	.long	0x1273CD08
+	.long	0x1294CD08
+	.long	0x12B5CD08
+	.long	0x12D6CD08
+	lvx	25,8,11
+	bdnz	.Loop_ctr32_enc8x
+
+	subic	11,5,256
+	.long	0x11EFC508
+	.long	0x1210C508
+	.long	0x1231C508
+	.long	0x1252C508
+	.long	0x1273C508
+	.long	0x1294C508
+	.long	0x12B5C508
+	.long	0x12D6C508
+
+	subfe	0,0,0
+	.long	0x11EFCD08
+	.long	0x1210CD08
+	.long	0x1231CD08
+	.long	0x1252CD08
+	.long	0x1273CD08
+	.long	0x1294CD08
+	.long	0x12B5CD08
+	.long	0x12D6CD08
+
+	and	0,0,11
+	addi	11,1,79
+	.long	0x11EFD508
+	.long	0x1210D508
+	.long	0x1231D508
+	.long	0x1252D508
+	.long	0x1273D508
+	.long	0x1294D508
+	.long	0x12B5D508
+	.long	0x12D6D508
+	lvx	24,0,11
+
+	subic	5,5,129
+	.long	0x11EFDD08
+	addi	5,5,1
+	.long	0x1210DD08
+	.long	0x1231DD08
+	.long	0x1252DD08
+	.long	0x1273DD08
+	.long	0x1294DD08
+	.long	0x12B5DD08
+	.long	0x12D6DD08
+	lvx	25,8,11
+
+	.long	0x11EFE508
+	.long	0x7C001E99
+	.long	0x1210E508
+	.long	0x7C281E99
+	.long	0x1231E508
+	.long	0x7C5A1E99
+	.long	0x1252E508
+	.long	0x7C7B1E99
+	.long	0x1273E508
+	.long	0x7D5C1E99
+	.long	0x1294E508
+	.long	0x7D9D1E99
+	.long	0x12B5E508
+	.long	0x7DBE1E99
+	.long	0x12D6E508
+	.long	0x7DDF1E99
+	addi	3,3,0x80
+
+	.long	0x11EFED08
+	vperm	0,0,0,6
+	.long	0x1210ED08
+	vperm	1,1,1,6
+	.long	0x1231ED08
+	vperm	2,2,2,6
+	.long	0x1252ED08
+	vperm	3,3,3,6
+	.long	0x1273ED08
+	vperm	10,10,10,6
+	.long	0x1294ED08
+	vperm	12,12,12,6
+	.long	0x12B5ED08
+	vperm	13,13,13,6
+	.long	0x12D6ED08
+	vperm	14,14,14,6
+
+	add	3,3,0
+
+
+
+	subfe.	0,0,0
+	.long	0x11EFF508
+	vxor	0,0,31
+	.long	0x1210F508
+	vxor	1,1,31
+	.long	0x1231F508
+	vxor	2,2,31
+	.long	0x1252F508
+	vxor	3,3,31
+	.long	0x1273F508
+	vxor	10,10,31
+	.long	0x1294F508
+	vxor	12,12,31
+	.long	0x12B5F508
+	vxor	13,13,31
+	.long	0x12D6F508
+	vxor	14,14,31
+
+	bne	.Lctr32_enc8x_break
+
+	.long	0x100F0509
+	.long	0x10300D09
+	vadduwm	16,4,11
+	.long	0x10511509
+	vadduwm	17,4,7
+	vxor	15,4,23
+	.long	0x10721D09
+	vadduwm	18,16,7
+	vxor	16,16,23
+	.long	0x11535509
+	vadduwm	19,17,7
+	vxor	17,17,23
+	.long	0x11946509
+	vadduwm	20,18,7
+	vxor	18,18,23
+	.long	0x11B56D09
+	vadduwm	21,19,7
+	vxor	19,19,23
+	.long	0x11D67509
+	vadduwm	22,20,7
+	vxor	20,20,23
+	vperm	0,0,0,6
+	vadduwm	4,21,7
+	vxor	21,21,23
+	vperm	1,1,1,6
+	vxor	22,22,23
+	mtctr	9
+
+	.long	0x11EFC508
+	.long	0x7C002799
+	vperm	2,2,2,6
+	.long	0x1210C508
+	.long	0x7C282799
+	vperm	3,3,3,6
+	.long	0x1231C508
+	.long	0x7C5A2799
+	vperm	10,10,10,6
+	.long	0x1252C508
+	.long	0x7C7B2799
+	vperm	12,12,12,6
+	.long	0x1273C508
+	.long	0x7D5C2799
+	vperm	13,13,13,6
+	.long	0x1294C508
+	.long	0x7D9D2799
+	vperm	14,14,14,6
+	.long	0x12B5C508
+	.long	0x7DBE2799
+	.long	0x12D6C508
+	.long	0x7DDF2799
+	addi	4,4,0x80
+
+	b	.Loop_ctr32_enc8x_middle
+
+.align	5
+.Lctr32_enc8x_break:
+	cmpwi	5,-0x60
+	blt	.Lctr32_enc8x_one
+	nop	
+	beq	.Lctr32_enc8x_two
+	cmpwi	5,-0x40
+	blt	.Lctr32_enc8x_three
+	nop	
+	beq	.Lctr32_enc8x_four
+	cmpwi	5,-0x20
+	blt	.Lctr32_enc8x_five
+	nop	
+	beq	.Lctr32_enc8x_six
+	cmpwi	5,0x00
+	blt	.Lctr32_enc8x_seven
+
+.Lctr32_enc8x_eight:
+	.long	0x11EF0509
+	.long	0x12100D09
+	.long	0x12311509
+	.long	0x12521D09
+	.long	0x12735509
+	.long	0x12946509
+	.long	0x12B56D09
+	.long	0x12D67509
+
+	vperm	15,15,15,6
+	vperm	16,16,16,6
+	.long	0x7DE02799
+	vperm	17,17,17,6
+	.long	0x7E082799
+	vperm	18,18,18,6
+	.long	0x7E3A2799
+	vperm	19,19,19,6
+	.long	0x7E5B2799
+	vperm	20,20,20,6
+	.long	0x7E7C2799
+	vperm	21,21,21,6
+	.long	0x7E9D2799
+	vperm	22,22,22,6
+	.long	0x7EBE2799
+	.long	0x7EDF2799
+	addi	4,4,0x80
+	b	.Lctr32_enc8x_done
+
+.align	5
+.Lctr32_enc8x_seven:
+	.long	0x11EF0D09
+	.long	0x12101509
+	.long	0x12311D09
+	.long	0x12525509
+	.long	0x12736509
+	.long	0x12946D09
+	.long	0x12B57509
+
+	vperm	15,15,15,6
+	vperm	16,16,16,6
+	.long	0x7DE02799
+	vperm	17,17,17,6
+	.long	0x7E082799
+	vperm	18,18,18,6
+	.long	0x7E3A2799
+	vperm	19,19,19,6
+	.long	0x7E5B2799
+	vperm	20,20,20,6
+	.long	0x7E7C2799
+	vperm	21,21,21,6
+	.long	0x7E9D2799
+	.long	0x7EBE2799
+	addi	4,4,0x70
+	b	.Lctr32_enc8x_done
+
+.align	5
+.Lctr32_enc8x_six:
+	.long	0x11EF1509
+	.long	0x12101D09
+	.long	0x12315509
+	.long	0x12526509
+	.long	0x12736D09
+	.long	0x12947509
+
+	vperm	15,15,15,6
+	vperm	16,16,16,6
+	.long	0x7DE02799
+	vperm	17,17,17,6
+	.long	0x7E082799
+	vperm	18,18,18,6
+	.long	0x7E3A2799
+	vperm	19,19,19,6
+	.long	0x7E5B2799
+	vperm	20,20,20,6
+	.long	0x7E7C2799
+	.long	0x7E9D2799
+	addi	4,4,0x60
+	b	.Lctr32_enc8x_done
+
+.align	5
+.Lctr32_enc8x_five:
+	.long	0x11EF1D09
+	.long	0x12105509
+	.long	0x12316509
+	.long	0x12526D09
+	.long	0x12737509
+
+	vperm	15,15,15,6
+	vperm	16,16,16,6
+	.long	0x7DE02799
+	vperm	17,17,17,6
+	.long	0x7E082799
+	vperm	18,18,18,6
+	.long	0x7E3A2799
+	vperm	19,19,19,6
+	.long	0x7E5B2799
+	.long	0x7E7C2799
+	addi	4,4,0x50
+	b	.Lctr32_enc8x_done
+
+.align	5
+.Lctr32_enc8x_four:
+	.long	0x11EF5509
+	.long	0x12106509
+	.long	0x12316D09
+	.long	0x12527509
+
+	vperm	15,15,15,6
+	vperm	16,16,16,6
+	.long	0x7DE02799
+	vperm	17,17,17,6
+	.long	0x7E082799
+	vperm	18,18,18,6
+	.long	0x7E3A2799
+	.long	0x7E5B2799
+	addi	4,4,0x40
+	b	.Lctr32_enc8x_done
+
+.align	5
+.Lctr32_enc8x_three:
+	.long	0x11EF6509
+	.long	0x12106D09
+	.long	0x12317509
+
+	vperm	15,15,15,6
+	vperm	16,16,16,6
+	.long	0x7DE02799
+	vperm	17,17,17,6
+	.long	0x7E082799
+	.long	0x7E3A2799
+	addi	4,4,0x30
+	b	.Lctr32_enc8x_done
+
+.align	5
+.Lctr32_enc8x_two:
+	.long	0x11EF6D09
+	.long	0x12107509
+
+	vperm	15,15,15,6
+	vperm	16,16,16,6
+	.long	0x7DE02799
+	.long	0x7E082799
+	addi	4,4,0x20
+	b	.Lctr32_enc8x_done
+
+.align	5
+.Lctr32_enc8x_one:
+	.long	0x11EF7509
+
+	vperm	15,15,15,6
+	.long	0x7DE02799
+	addi	4,4,0x10
+
+.Lctr32_enc8x_done:
+	li	10,79
+	li	11,95
+	stvx	6,10,1
+	addi	10,10,32
+	stvx	6,11,1
+	addi	11,11,32
+	stvx	6,10,1
+	addi	10,10,32
+	stvx	6,11,1
+	addi	11,11,32
+	stvx	6,10,1
+	addi	10,10,32
+	stvx	6,11,1
+	addi	11,11,32
+	stvx	6,10,1
+	addi	10,10,32
+	stvx	6,11,1
+	addi	11,11,32
+
+	or	12,12,12
+	lvx	20,10,1
+	addi	10,10,32
+	lvx	21,11,1
+	addi	11,11,32
+	lvx	22,10,1
+	addi	10,10,32
+	lvx	23,11,1
+	addi	11,11,32
+	lvx	24,10,1
+	addi	10,10,32
+	lvx	25,11,1
+	addi	11,11,32
+	lvx	26,10,1
+	addi	10,10,32
+	lvx	27,11,1
+	addi	11,11,32
+	lvx	28,10,1
+	addi	10,10,32
+	lvx	29,11,1
+	addi	11,11,32
+	lvx	30,10,1
+	lvx	31,11,1
+	ld	26,400(1)
+	ld	27,408(1)
+	ld	28,416(1)
+	ld	29,424(1)
+	ld	30,432(1)
+	ld	31,440(1)
+	addi	1,1,448
+	blr	
+.long	0
+.byte	0,12,0x04,0,0x80,6,6,0
+.long	0
+.size	aes_hw_ctr32_encrypt_blocks,.-aes_hw_ctr32_encrypt_blocks
+.globl	aes_hw_xts_encrypt
+.type	aes_hw_xts_encrypt,@function
+.align	5
+aes_hw_xts_encrypt:
+.localentry	aes_hw_xts_encrypt,0
+
+	mr	10,3
+	li	3,-1
+	cmpldi	5,16
+	.long	0x4dc00020
+
+	lis	0,0xfff0
+	li	12,-1
+	li	11,0
+	or	0,0,0
+
+	vspltisb	9,0x07
+	lvsl	6,11,11
+	vspltisb	11,0x0f
+	vxor	6,6,9
+
+	li	3,15
+	lvx	8,0,8
+	lvsl	5,0,8
+	lvx	4,3,8
+	vxor	5,5,11
+	vperm	8,8,4,5
+
+	neg	11,10
+	lvsr	5,0,11
+	lvx	2,0,10
+	addi	10,10,15
+	vxor	5,5,11
+
+	cmpldi	7,0
+	beq	.Lxts_enc_no_key2
+
+	lvsr	7,0,7
+	lwz	9,240(7)
+	srwi	9,9,1
+	subi	9,9,1
+	li	3,16
+
+	lvx	0,0,7
+	lvx	1,3,7
+	addi	3,3,16
+	vperm	0,1,0,7
+	vxor	8,8,0
+	lvx	0,3,7
+	addi	3,3,16
+	mtctr	9
+
+.Ltweak_xts_enc:
+	vperm	1,0,1,7
+	.long	0x11080D08
+	lvx	1,3,7
+	addi	3,3,16
+	vperm	0,1,0,7
+	.long	0x11080508
+	lvx	0,3,7
+	addi	3,3,16
+	bdnz	.Ltweak_xts_enc
+
+	vperm	1,0,1,7
+	.long	0x11080D08
+	lvx	1,3,7
+	vperm	0,1,0,7
+	.long	0x11080509
+
+	li	8,0
+	b	.Lxts_enc
+
+.Lxts_enc_no_key2:
+	li	3,-16
+	and	5,5,3
+
+
+.Lxts_enc:
+	lvx	4,0,10
+	addi	10,10,16
+
+	lvsr	7,0,6
+	lwz	9,240(6)
+	srwi	9,9,1
+	subi	9,9,1
+	li	3,16
+
+	vslb	10,9,9
+	vor	10,10,9
+	vspltisb	11,1
+	vsldoi	10,10,11,15
+
+	cmpldi	5,96
+	bge	_aesp8_xts_encrypt6x
+
+	andi.	7,5,15
+	subic	0,5,32
+	subi	7,7,16
+	subfe	0,0,0
+	and	0,0,7
+	add	10,10,0
+
+	lvx	0,0,6
+	lvx	1,3,6
+	addi	3,3,16
+	vperm	2,2,4,5
+	vperm	0,1,0,7
+	vxor	2,2,8
+	vxor	2,2,0
+	lvx	0,3,6
+	addi	3,3,16
+	mtctr	9
+	b	.Loop_xts_enc
+
+.align	5
+.Loop_xts_enc:
+	vperm	1,0,1,7
+	.long	0x10420D08
+	lvx	1,3,6
+	addi	3,3,16
+	vperm	0,1,0,7
+	.long	0x10420508
+	lvx	0,3,6
+	addi	3,3,16
+	bdnz	.Loop_xts_enc
+
+	vperm	1,0,1,7
+	.long	0x10420D08
+	lvx	1,3,6
+	li	3,16
+	vperm	0,1,0,7
+	vxor	0,0,8
+	.long	0x10620509
+
+	vperm	11,3,3,6
+
+	.long	0x7D602799
+
+	addi	4,4,16
+
+	subic.	5,5,16
+	beq	.Lxts_enc_done
+
+	vor	2,4,4
+	lvx	4,0,10
+	addi	10,10,16
+	lvx	0,0,6
+	lvx	1,3,6
+	addi	3,3,16
+
+	subic	0,5,32
+	subfe	0,0,0
+	and	0,0,7
+	add	10,10,0
+
+	vsrab	11,8,9
+	vaddubm	8,8,8
+	vsldoi	11,11,11,15
+	vand	11,11,10
+	vxor	8,8,11
+
+	vperm	2,2,4,5
+	vperm	0,1,0,7
+	vxor	2,2,8
+	vxor	3,3,0
+	vxor	2,2,0
+	lvx	0,3,6
+	addi	3,3,16
+
+	mtctr	9
+	cmpldi	5,16
+	bge	.Loop_xts_enc
+
+	vxor	3,3,8
+	lvsr	5,0,5
+	vxor	4,4,4
+	vspltisb	11,-1
+	vperm	4,4,11,5
+	vsel	2,2,3,4
+
+	subi	11,4,17
+	subi	4,4,16
+	mtctr	5
+	li	5,16
+.Loop_xts_enc_steal:
+	lbzu	0,1(11)
+	stb	0,16(11)
+	bdnz	.Loop_xts_enc_steal
+
+	mtctr	9
+	b	.Loop_xts_enc
+
+.Lxts_enc_done:
+	cmpldi	8,0
+	beq	.Lxts_enc_ret
+
+	vsrab	11,8,9
+	vaddubm	8,8,8
+	vsldoi	11,11,11,15
+	vand	11,11,10
+	vxor	8,8,11
+
+	vperm	8,8,8,6
+	.long	0x7D004799
+
+.Lxts_enc_ret:
+	or	12,12,12
+	li	3,0
+	blr	
+.long	0
+.byte	0,12,0x04,0,0x80,6,6,0
+.long	0
+.size	aes_hw_xts_encrypt,.-aes_hw_xts_encrypt
+
+.globl	aes_hw_xts_decrypt
+.type	aes_hw_xts_decrypt,@function
+.align	5
+aes_hw_xts_decrypt:
+.localentry	aes_hw_xts_decrypt,0
+
+	mr	10,3
+	li	3,-1
+	cmpldi	5,16
+	.long	0x4dc00020
+
+	lis	0,0xfff8
+	li	12,-1
+	li	11,0
+	or	0,0,0
+
+	andi.	0,5,15
+	neg	0,0
+	andi.	0,0,16
+	sub	5,5,0
+
+	vspltisb	9,0x07
+	lvsl	6,11,11
+	vspltisb	11,0x0f
+	vxor	6,6,9
+
+	li	3,15
+	lvx	8,0,8
+	lvsl	5,0,8
+	lvx	4,3,8
+	vxor	5,5,11
+	vperm	8,8,4,5
+
+	neg	11,10
+	lvsr	5,0,11
+	lvx	2,0,10
+	addi	10,10,15
+	vxor	5,5,11
+
+	cmpldi	7,0
+	beq	.Lxts_dec_no_key2
+
+	lvsr	7,0,7
+	lwz	9,240(7)
+	srwi	9,9,1
+	subi	9,9,1
+	li	3,16
+
+	lvx	0,0,7
+	lvx	1,3,7
+	addi	3,3,16
+	vperm	0,1,0,7
+	vxor	8,8,0
+	lvx	0,3,7
+	addi	3,3,16
+	mtctr	9
+
+.Ltweak_xts_dec:
+	vperm	1,0,1,7
+	.long	0x11080D08
+	lvx	1,3,7
+	addi	3,3,16
+	vperm	0,1,0,7
+	.long	0x11080508
+	lvx	0,3,7
+	addi	3,3,16
+	bdnz	.Ltweak_xts_dec
+
+	vperm	1,0,1,7
+	.long	0x11080D08
+	lvx	1,3,7
+	vperm	0,1,0,7
+	.long	0x11080509
+
+	li	8,0
+	b	.Lxts_dec
+
+.Lxts_dec_no_key2:
+	neg	3,5
+	andi.	3,3,15
+	add	5,5,3
+
+
+.Lxts_dec:
+	lvx	4,0,10
+	addi	10,10,16
+
+	lvsr	7,0,6
+	lwz	9,240(6)
+	srwi	9,9,1
+	subi	9,9,1
+	li	3,16
+
+	vslb	10,9,9
+	vor	10,10,9
+	vspltisb	11,1
+	vsldoi	10,10,11,15
+
+	cmpldi	5,96
+	bge	_aesp8_xts_decrypt6x
+
+	lvx	0,0,6
+	lvx	1,3,6
+	addi	3,3,16
+	vperm	2,2,4,5
+	vperm	0,1,0,7
+	vxor	2,2,8
+	vxor	2,2,0
+	lvx	0,3,6
+	addi	3,3,16
+	mtctr	9
+
+	cmpldi	5,16
+	blt	.Ltail_xts_dec
+
+
+.align	5
+.Loop_xts_dec:
+	vperm	1,0,1,7
+	.long	0x10420D48
+	lvx	1,3,6
+	addi	3,3,16
+	vperm	0,1,0,7
+	.long	0x10420548
+	lvx	0,3,6
+	addi	3,3,16
+	bdnz	.Loop_xts_dec
+
+	vperm	1,0,1,7
+	.long	0x10420D48
+	lvx	1,3,6
+	li	3,16
+	vperm	0,1,0,7
+	vxor	0,0,8
+	.long	0x10620549
+
+	vperm	11,3,3,6
+
+	.long	0x7D602799
+
+	addi	4,4,16
+
+	subic.	5,5,16
+	beq	.Lxts_dec_done
+
+	vor	2,4,4
+	lvx	4,0,10
+	addi	10,10,16
+	lvx	0,0,6
+	lvx	1,3,6
+	addi	3,3,16
+
+	vsrab	11,8,9
+	vaddubm	8,8,8
+	vsldoi	11,11,11,15
+	vand	11,11,10
+	vxor	8,8,11
+
+	vperm	2,2,4,5
+	vperm	0,1,0,7
+	vxor	2,2,8
+	vxor	2,2,0
+	lvx	0,3,6
+	addi	3,3,16
+
+	mtctr	9
+	cmpldi	5,16
+	bge	.Loop_xts_dec
+
+.Ltail_xts_dec:
+	vsrab	11,8,9
+	vaddubm	12,8,8
+	vsldoi	11,11,11,15
+	vand	11,11,10
+	vxor	12,12,11
+
+	subi	10,10,16
+	add	10,10,5
+
+	vxor	2,2,8
+	vxor	2,2,12
+
+.Loop_xts_dec_short:
+	vperm	1,0,1,7
+	.long	0x10420D48
+	lvx	1,3,6
+	addi	3,3,16
+	vperm	0,1,0,7
+	.long	0x10420548
+	lvx	0,3,6
+	addi	3,3,16
+	bdnz	.Loop_xts_dec_short
+
+	vperm	1,0,1,7
+	.long	0x10420D48
+	lvx	1,3,6
+	li	3,16
+	vperm	0,1,0,7
+	vxor	0,0,12
+	.long	0x10620549
+
+	vperm	11,3,3,6
+
+	.long	0x7D602799
+
+
+	vor	2,4,4
+	lvx	4,0,10
+
+	lvx	0,0,6
+	lvx	1,3,6
+	addi	3,3,16
+	vperm	2,2,4,5
+	vperm	0,1,0,7
+
+	lvsr	5,0,5
+	vxor	4,4,4
+	vspltisb	11,-1
+	vperm	4,4,11,5
+	vsel	2,2,3,4
+
+	vxor	0,0,8
+	vxor	2,2,0
+	lvx	0,3,6
+	addi	3,3,16
+
+	subi	11,4,1
+	mtctr	5
+	li	5,16
+.Loop_xts_dec_steal:
+	lbzu	0,1(11)
+	stb	0,16(11)
+	bdnz	.Loop_xts_dec_steal
+
+	mtctr	9
+	b	.Loop_xts_dec
+
+.Lxts_dec_done:
+	cmpldi	8,0
+	beq	.Lxts_dec_ret
+
+	vsrab	11,8,9
+	vaddubm	8,8,8
+	vsldoi	11,11,11,15
+	vand	11,11,10
+	vxor	8,8,11
+
+	vperm	8,8,8,6
+	.long	0x7D004799
+
+.Lxts_dec_ret:
+	or	12,12,12
+	li	3,0
+	blr	
+.long	0
+.byte	0,12,0x04,0,0x80,6,6,0
+.long	0
+.size	aes_hw_xts_decrypt,.-aes_hw_xts_decrypt
+.align	5
+_aesp8_xts_encrypt6x:
+	stdu	1,-448(1)
+	mflr	11
+	li	7,207
+	li	3,223
+	std	11,464(1)
+	stvx	20,7,1
+	addi	7,7,32
+	stvx	21,3,1
+	addi	3,3,32
+	stvx	22,7,1
+	addi	7,7,32
+	stvx	23,3,1
+	addi	3,3,32
+	stvx	24,7,1
+	addi	7,7,32
+	stvx	25,3,1
+	addi	3,3,32
+	stvx	26,7,1
+	addi	7,7,32
+	stvx	27,3,1
+	addi	3,3,32
+	stvx	28,7,1
+	addi	7,7,32
+	stvx	29,3,1
+	addi	3,3,32
+	stvx	30,7,1
+	stvx	31,3,1
+	li	0,-1
+	stw	12,396(1)
+	li	3,0x10
+	std	26,400(1)
+	li	26,0x20
+	std	27,408(1)
+	li	27,0x30
+	std	28,416(1)
+	li	28,0x40
+	std	29,424(1)
+	li	29,0x50
+	std	30,432(1)
+	li	30,0x60
+	std	31,440(1)
+	li	31,0x70
+	or	0,0,0
+
+	subi	9,9,3
+
+	lvx	23,0,6
+	lvx	30,3,6
+	addi	6,6,0x20
+	lvx	31,0,6
+	vperm	23,30,23,7
+	addi	7,1,79
+	mtctr	9
+
+.Load_xts_enc_key:
+	vperm	24,31,30,7
+	lvx	30,3,6
+	addi	6,6,0x20
+	stvx	24,0,7
+	vperm	25,30,31,7
+	lvx	31,0,6
+	stvx	25,3,7
+	addi	7,7,0x20
+	bdnz	.Load_xts_enc_key
+
+	lvx	26,3,6
+	vperm	24,31,30,7
+	lvx	27,26,6
+	stvx	24,0,7
+	vperm	25,26,31,7
+	lvx	28,27,6
+	stvx	25,3,7
+	addi	7,1,79
+	vperm	26,27,26,7
+	lvx	29,28,6
+	vperm	27,28,27,7
+	lvx	30,29,6
+	vperm	28,29,28,7
+	lvx	31,30,6
+	vperm	29,30,29,7
+	lvx	22,31,6
+	vperm	30,31,30,7
+	lvx	24,0,7
+	vperm	31,22,31,7
+	lvx	25,3,7
+
+	vperm	0,2,4,5
+	subi	10,10,31
+	vxor	17,8,23
+	vsrab	11,8,9
+	vaddubm	8,8,8
+	vsldoi	11,11,11,15
+	vand	11,11,10
+	vxor	7,0,17
+	vxor	8,8,11
+
+	.long	0x7C235699
+	vxor	18,8,23
+	vsrab	11,8,9
+	vaddubm	8,8,8
+	vsldoi	11,11,11,15
+	vperm	1,1,1,6
+	vand	11,11,10
+	vxor	12,1,18
+	vxor	8,8,11
+
+	.long	0x7C5A5699
+	andi.	31,5,15
+	vxor	19,8,23
+	vsrab	11,8,9
+	vaddubm	8,8,8
+	vsldoi	11,11,11,15
+	vperm	2,2,2,6
+	vand	11,11,10
+	vxor	13,2,19
+	vxor	8,8,11
+
+	.long	0x7C7B5699
+	sub	5,5,31
+	vxor	20,8,23
+	vsrab	11,8,9
+	vaddubm	8,8,8
+	vsldoi	11,11,11,15
+	vperm	3,3,3,6
+	vand	11,11,10
+	vxor	14,3,20
+	vxor	8,8,11
+
+	.long	0x7C9C5699
+	subi	5,5,0x60
+	vxor	21,8,23
+	vsrab	11,8,9
+	vaddubm	8,8,8
+	vsldoi	11,11,11,15
+	vperm	4,4,4,6
+	vand	11,11,10
+	vxor	15,4,21
+	vxor	8,8,11
+
+	.long	0x7CBD5699
+	addi	10,10,0x60
+	vxor	22,8,23
+	vsrab	11,8,9
+	vaddubm	8,8,8
+	vsldoi	11,11,11,15
+	vperm	5,5,5,6
+	vand	11,11,10
+	vxor	16,5,22
+	vxor	8,8,11
+
+	vxor	31,31,23
+	mtctr	9
+	b	.Loop_xts_enc6x
+
+.align	5
+.Loop_xts_enc6x:
+	.long	0x10E7C508
+	.long	0x118CC508
+	.long	0x11ADC508
+	.long	0x11CEC508
+	.long	0x11EFC508
+	.long	0x1210C508
+	lvx	24,26,7
+	addi	7,7,0x20
+
+	.long	0x10E7CD08
+	.long	0x118CCD08
+	.long	0x11ADCD08
+	.long	0x11CECD08
+	.long	0x11EFCD08
+	.long	0x1210CD08
+	lvx	25,3,7
+	bdnz	.Loop_xts_enc6x
+
+	subic	5,5,96
+	vxor	0,17,31
+	.long	0x10E7C508
+	.long	0x118CC508
+	vsrab	11,8,9
+	vxor	17,8,23
+	vaddubm	8,8,8
+	.long	0x11ADC508
+	.long	0x11CEC508
+	vsldoi	11,11,11,15
+	.long	0x11EFC508
+	.long	0x1210C508
+
+	subfe.	0,0,0
+	vand	11,11,10
+	.long	0x10E7CD08
+	.long	0x118CCD08
+	vxor	8,8,11
+	.long	0x11ADCD08
+	.long	0x11CECD08
+	vxor	1,18,31
+	vsrab	11,8,9
+	vxor	18,8,23
+	.long	0x11EFCD08
+	.long	0x1210CD08
+
+	and	0,0,5
+	vaddubm	8,8,8
+	vsldoi	11,11,11,15
+	.long	0x10E7D508
+	.long	0x118CD508
+	vand	11,11,10
+	.long	0x11ADD508
+	.long	0x11CED508
+	vxor	8,8,11
+	.long	0x11EFD508
+	.long	0x1210D508
+
+	add	10,10,0
+
+
+
+	vxor	2,19,31
+	vsrab	11,8,9
+	vxor	19,8,23
+	vaddubm	8,8,8
+	.long	0x10E7DD08
+	.long	0x118CDD08
+	vsldoi	11,11,11,15
+	.long	0x11ADDD08
+	.long	0x11CEDD08
+	vand	11,11,10
+	.long	0x11EFDD08
+	.long	0x1210DD08
+
+	addi	7,1,79
+	vxor	8,8,11
+	.long	0x10E7E508
+	.long	0x118CE508
+	vxor	3,20,31
+	vsrab	11,8,9
+	vxor	20,8,23
+	.long	0x11ADE508
+	.long	0x11CEE508
+	vaddubm	8,8,8
+	vsldoi	11,11,11,15
+	.long	0x11EFE508
+	.long	0x1210E508
+	lvx	24,0,7
+	vand	11,11,10
+
+	.long	0x10E7ED08
+	.long	0x118CED08
+	vxor	8,8,11
+	.long	0x11ADED08
+	.long	0x11CEED08
+	vxor	4,21,31
+	vsrab	11,8,9
+	vxor	21,8,23
+	.long	0x11EFED08
+	.long	0x1210ED08
+	lvx	25,3,7
+	vaddubm	8,8,8
+	vsldoi	11,11,11,15
+
+	.long	0x10E7F508
+	.long	0x118CF508
+	vand	11,11,10
+	.long	0x11ADF508
+	.long	0x11CEF508
+	vxor	8,8,11
+	.long	0x11EFF508
+	.long	0x1210F508
+	vxor	5,22,31
+	vsrab	11,8,9
+	vxor	22,8,23
+
+	.long	0x10E70509
+	.long	0x7C005699
+	vaddubm	8,8,8
+	vsldoi	11,11,11,15
+	.long	0x118C0D09
+	.long	0x7C235699
+	.long	0x11AD1509
+	vperm	0,0,0,6
+	.long	0x7C5A5699
+	vand	11,11,10
+	.long	0x11CE1D09
+	vperm	1,1,1,6
+	.long	0x7C7B5699
+	.long	0x11EF2509
+	vperm	2,2,2,6
+	.long	0x7C9C5699
+	vxor	8,8,11
+	.long	0x11702D09
+
+	vperm	3,3,3,6
+	.long	0x7CBD5699
+	addi	10,10,0x60
+	vperm	4,4,4,6
+	vperm	5,5,5,6
+
+	vperm	7,7,7,6
+	vperm	12,12,12,6
+	.long	0x7CE02799
+	vxor	7,0,17
+	vperm	13,13,13,6
+	.long	0x7D832799
+	vxor	12,1,18
+	vperm	14,14,14,6
+	.long	0x7DBA2799
+	vxor	13,2,19
+	vperm	15,15,15,6
+	.long	0x7DDB2799
+	vxor	14,3,20
+	vperm	16,11,11,6
+	.long	0x7DFC2799
+	vxor	15,4,21
+	.long	0x7E1D2799
+
+	vxor	16,5,22
+	addi	4,4,0x60
+
+	mtctr	9
+	beq	.Loop_xts_enc6x
+
+	addic.	5,5,0x60
+	beq	.Lxts_enc6x_zero
+	cmpwi	5,0x20
+	blt	.Lxts_enc6x_one
+	nop	
+	beq	.Lxts_enc6x_two
+	cmpwi	5,0x40
+	blt	.Lxts_enc6x_three
+	nop	
+	beq	.Lxts_enc6x_four
+
+.Lxts_enc6x_five:
+	vxor	7,1,17
+	vxor	12,2,18
+	vxor	13,3,19
+	vxor	14,4,20
+	vxor	15,5,21
+
+	bl	_aesp8_xts_enc5x
+
+	vperm	7,7,7,6
+	vor	17,22,22
+	vperm	12,12,12,6
+	.long	0x7CE02799
+	vperm	13,13,13,6
+	.long	0x7D832799
+	vperm	14,14,14,6
+	.long	0x7DBA2799
+	vxor	11,15,22
+	vperm	15,15,15,6
+	.long	0x7DDB2799
+	.long	0x7DFC2799
+	addi	4,4,0x50
+	bne	.Lxts_enc6x_steal
+	b	.Lxts_enc6x_done
+
+.align	4
+.Lxts_enc6x_four:
+	vxor	7,2,17
+	vxor	12,3,18
+	vxor	13,4,19
+	vxor	14,5,20
+	vxor	15,15,15
+
+	bl	_aesp8_xts_enc5x
+
+	vperm	7,7,7,6
+	vor	17,21,21
+	vperm	12,12,12,6
+	.long	0x7CE02799
+	vperm	13,13,13,6
+	.long	0x7D832799
+	vxor	11,14,21
+	vperm	14,14,14,6
+	.long	0x7DBA2799
+	.long	0x7DDB2799
+	addi	4,4,0x40
+	bne	.Lxts_enc6x_steal
+	b	.Lxts_enc6x_done
+
+.align	4
+.Lxts_enc6x_three:
+	vxor	7,3,17
+	vxor	12,4,18
+	vxor	13,5,19
+	vxor	14,14,14
+	vxor	15,15,15
+
+	bl	_aesp8_xts_enc5x
+
+	vperm	7,7,7,6
+	vor	17,20,20
+	vperm	12,12,12,6
+	.long	0x7CE02799
+	vxor	11,13,20
+	vperm	13,13,13,6
+	.long	0x7D832799
+	.long	0x7DBA2799
+	addi	4,4,0x30
+	bne	.Lxts_enc6x_steal
+	b	.Lxts_enc6x_done
+
+.align	4
+.Lxts_enc6x_two:
+	vxor	7,4,17
+	vxor	12,5,18
+	vxor	13,13,13
+	vxor	14,14,14
+	vxor	15,15,15
+
+	bl	_aesp8_xts_enc5x
+
+	vperm	7,7,7,6
+	vor	17,19,19
+	vxor	11,12,19
+	vperm	12,12,12,6
+	.long	0x7CE02799
+	.long	0x7D832799
+	addi	4,4,0x20
+	bne	.Lxts_enc6x_steal
+	b	.Lxts_enc6x_done
+
+.align	4
+.Lxts_enc6x_one:
+	vxor	7,5,17
+	nop	
+.Loop_xts_enc1x:
+	.long	0x10E7C508
+	lvx	24,26,7
+	addi	7,7,0x20
+
+	.long	0x10E7CD08
+	lvx	25,3,7
+	bdnz	.Loop_xts_enc1x
+
+	add	10,10,31
+	cmpwi	31,0
+	.long	0x10E7C508
+
+	subi	10,10,16
+	.long	0x10E7CD08
+
+	lvsr	5,0,31
+	.long	0x10E7D508
+
+	.long	0x7C005699
+	.long	0x10E7DD08
+
+	addi	7,1,79
+	.long	0x10E7E508
+	lvx	24,0,7
+
+	.long	0x10E7ED08
+	lvx	25,3,7
+	vxor	17,17,31
+
+	vperm	0,0,0,6
+	.long	0x10E7F508
+
+	vperm	0,0,0,5
+	.long	0x10E78D09
+
+	vor	17,18,18
+	vxor	11,7,18
+	vperm	7,7,7,6
+	.long	0x7CE02799
+	addi	4,4,0x10
+	bne	.Lxts_enc6x_steal
+	b	.Lxts_enc6x_done
+
+.align	4
+.Lxts_enc6x_zero:
+	cmpwi	31,0
+	beq	.Lxts_enc6x_done
+
+	add	10,10,31
+	subi	10,10,16
+	.long	0x7C005699
+	lvsr	5,0,31
+	vperm	0,0,0,6
+	vperm	0,0,0,5
+	vxor	11,11,17
+.Lxts_enc6x_steal:
+	vxor	0,0,17
+	vxor	7,7,7
+	vspltisb	12,-1
+	vperm	7,7,12,5
+	vsel	7,0,11,7
+
+	subi	30,4,17
+	subi	4,4,16
+	mtctr	31
+.Loop_xts_enc6x_steal:
+	lbzu	0,1(30)
+	stb	0,16(30)
+	bdnz	.Loop_xts_enc6x_steal
+
+	li	31,0
+	mtctr	9
+	b	.Loop_xts_enc1x
+
+.align	4
+.Lxts_enc6x_done:
+	cmpldi	8,0
+	beq	.Lxts_enc6x_ret
+
+	vxor	8,17,23
+	vperm	8,8,8,6
+	.long	0x7D004799
+
+.Lxts_enc6x_ret:
+	mtlr	11
+	li	10,79
+	li	11,95
+	stvx	9,10,1
+	addi	10,10,32
+	stvx	9,11,1
+	addi	11,11,32
+	stvx	9,10,1
+	addi	10,10,32
+	stvx	9,11,1
+	addi	11,11,32
+	stvx	9,10,1
+	addi	10,10,32
+	stvx	9,11,1
+	addi	11,11,32
+	stvx	9,10,1
+	addi	10,10,32
+	stvx	9,11,1
+	addi	11,11,32
+
+	or	12,12,12
+	lvx	20,10,1
+	addi	10,10,32
+	lvx	21,11,1
+	addi	11,11,32
+	lvx	22,10,1
+	addi	10,10,32
+	lvx	23,11,1
+	addi	11,11,32
+	lvx	24,10,1
+	addi	10,10,32
+	lvx	25,11,1
+	addi	11,11,32
+	lvx	26,10,1
+	addi	10,10,32
+	lvx	27,11,1
+	addi	11,11,32
+	lvx	28,10,1
+	addi	10,10,32
+	lvx	29,11,1
+	addi	11,11,32
+	lvx	30,10,1
+	lvx	31,11,1
+	ld	26,400(1)
+	ld	27,408(1)
+	ld	28,416(1)
+	ld	29,424(1)
+	ld	30,432(1)
+	ld	31,440(1)
+	addi	1,1,448
+	blr	
+.long	0
+.byte	0,12,0x04,1,0x80,6,6,0
+.long	0
+
+.align	5
+_aesp8_xts_enc5x:
+	.long	0x10E7C508
+	.long	0x118CC508
+	.long	0x11ADC508
+	.long	0x11CEC508
+	.long	0x11EFC508
+	lvx	24,26,7
+	addi	7,7,0x20
+
+	.long	0x10E7CD08
+	.long	0x118CCD08
+	.long	0x11ADCD08
+	.long	0x11CECD08
+	.long	0x11EFCD08
+	lvx	25,3,7
+	bdnz	_aesp8_xts_enc5x
+
+	add	10,10,31
+	cmpwi	31,0
+	.long	0x10E7C508
+	.long	0x118CC508
+	.long	0x11ADC508
+	.long	0x11CEC508
+	.long	0x11EFC508
+
+	subi	10,10,16
+	.long	0x10E7CD08
+	.long	0x118CCD08
+	.long	0x11ADCD08
+	.long	0x11CECD08
+	.long	0x11EFCD08
+	vxor	17,17,31
+
+	.long	0x10E7D508
+	lvsr	5,0,31
+	.long	0x118CD508
+	.long	0x11ADD508
+	.long	0x11CED508
+	.long	0x11EFD508
+	vxor	1,18,31
+
+	.long	0x10E7DD08
+	.long	0x7C005699
+	.long	0x118CDD08
+	.long	0x11ADDD08
+	.long	0x11CEDD08
+	.long	0x11EFDD08
+	vxor	2,19,31
+
+	addi	7,1,79
+	.long	0x10E7E508
+	.long	0x118CE508
+	.long	0x11ADE508
+	.long	0x11CEE508
+	.long	0x11EFE508
+	lvx	24,0,7
+	vxor	3,20,31
+
+	.long	0x10E7ED08
+	vperm	0,0,0,6
+	.long	0x118CED08
+	.long	0x11ADED08
+	.long	0x11CEED08
+	.long	0x11EFED08
+	lvx	25,3,7
+	vxor	4,21,31
+
+	.long	0x10E7F508
+	vperm	0,0,0,5
+	.long	0x118CF508
+	.long	0x11ADF508
+	.long	0x11CEF508
+	.long	0x11EFF508
+
+	.long	0x10E78D09
+	.long	0x118C0D09
+	.long	0x11AD1509
+	.long	0x11CE1D09
+	.long	0x11EF2509
+	blr	
+.long	0
+.byte	0,12,0x14,0,0,0,0,0
+
+.align	5
+_aesp8_xts_decrypt6x:
+	stdu	1,-448(1)
+	mflr	11
+	li	7,207
+	li	3,223
+	std	11,464(1)
+	stvx	20,7,1
+	addi	7,7,32
+	stvx	21,3,1
+	addi	3,3,32
+	stvx	22,7,1
+	addi	7,7,32
+	stvx	23,3,1
+	addi	3,3,32
+	stvx	24,7,1
+	addi	7,7,32
+	stvx	25,3,1
+	addi	3,3,32
+	stvx	26,7,1
+	addi	7,7,32
+	stvx	27,3,1
+	addi	3,3,32
+	stvx	28,7,1
+	addi	7,7,32
+	stvx	29,3,1
+	addi	3,3,32
+	stvx	30,7,1
+	stvx	31,3,1
+	li	0,-1
+	stw	12,396(1)
+	li	3,0x10
+	std	26,400(1)
+	li	26,0x20
+	std	27,408(1)
+	li	27,0x30
+	std	28,416(1)
+	li	28,0x40
+	std	29,424(1)
+	li	29,0x50
+	std	30,432(1)
+	li	30,0x60
+	std	31,440(1)
+	li	31,0x70
+	or	0,0,0
+
+	subi	9,9,3
+
+	lvx	23,0,6
+	lvx	30,3,6
+	addi	6,6,0x20
+	lvx	31,0,6
+	vperm	23,30,23,7
+	addi	7,1,79
+	mtctr	9
+
+.Load_xts_dec_key:
+	vperm	24,31,30,7
+	lvx	30,3,6
+	addi	6,6,0x20
+	stvx	24,0,7
+	vperm	25,30,31,7
+	lvx	31,0,6
+	stvx	25,3,7
+	addi	7,7,0x20
+	bdnz	.Load_xts_dec_key
+
+	lvx	26,3,6
+	vperm	24,31,30,7
+	lvx	27,26,6
+	stvx	24,0,7
+	vperm	25,26,31,7
+	lvx	28,27,6
+	stvx	25,3,7
+	addi	7,1,79
+	vperm	26,27,26,7
+	lvx	29,28,6
+	vperm	27,28,27,7
+	lvx	30,29,6
+	vperm	28,29,28,7
+	lvx	31,30,6
+	vperm	29,30,29,7
+	lvx	22,31,6
+	vperm	30,31,30,7
+	lvx	24,0,7
+	vperm	31,22,31,7
+	lvx	25,3,7
+
+	vperm	0,2,4,5
+	subi	10,10,31
+	vxor	17,8,23
+	vsrab	11,8,9
+	vaddubm	8,8,8
+	vsldoi	11,11,11,15
+	vand	11,11,10
+	vxor	7,0,17
+	vxor	8,8,11
+
+	.long	0x7C235699
+	vxor	18,8,23
+	vsrab	11,8,9
+	vaddubm	8,8,8
+	vsldoi	11,11,11,15
+	vperm	1,1,1,6
+	vand	11,11,10
+	vxor	12,1,18
+	vxor	8,8,11
+
+	.long	0x7C5A5699
+	andi.	31,5,15
+	vxor	19,8,23
+	vsrab	11,8,9
+	vaddubm	8,8,8
+	vsldoi	11,11,11,15
+	vperm	2,2,2,6
+	vand	11,11,10
+	vxor	13,2,19
+	vxor	8,8,11
+
+	.long	0x7C7B5699
+	sub	5,5,31
+	vxor	20,8,23
+	vsrab	11,8,9
+	vaddubm	8,8,8
+	vsldoi	11,11,11,15
+	vperm	3,3,3,6
+	vand	11,11,10
+	vxor	14,3,20
+	vxor	8,8,11
+
+	.long	0x7C9C5699
+	subi	5,5,0x60
+	vxor	21,8,23
+	vsrab	11,8,9
+	vaddubm	8,8,8
+	vsldoi	11,11,11,15
+	vperm	4,4,4,6
+	vand	11,11,10
+	vxor	15,4,21
+	vxor	8,8,11
+
+	.long	0x7CBD5699
+	addi	10,10,0x60
+	vxor	22,8,23
+	vsrab	11,8,9
+	vaddubm	8,8,8
+	vsldoi	11,11,11,15
+	vperm	5,5,5,6
+	vand	11,11,10
+	vxor	16,5,22
+	vxor	8,8,11
+
+	vxor	31,31,23
+	mtctr	9
+	b	.Loop_xts_dec6x
+
+.align	5
+.Loop_xts_dec6x:
+	.long	0x10E7C548
+	.long	0x118CC548
+	.long	0x11ADC548
+	.long	0x11CEC548
+	.long	0x11EFC548
+	.long	0x1210C548
+	lvx	24,26,7
+	addi	7,7,0x20
+
+	.long	0x10E7CD48
+	.long	0x118CCD48
+	.long	0x11ADCD48
+	.long	0x11CECD48
+	.long	0x11EFCD48
+	.long	0x1210CD48
+	lvx	25,3,7
+	bdnz	.Loop_xts_dec6x
+
+	subic	5,5,96
+	vxor	0,17,31
+	.long	0x10E7C548
+	.long	0x118CC548
+	vsrab	11,8,9
+	vxor	17,8,23
+	vaddubm	8,8,8
+	.long	0x11ADC548
+	.long	0x11CEC548
+	vsldoi	11,11,11,15
+	.long	0x11EFC548
+	.long	0x1210C548
+
+	subfe.	0,0,0
+	vand	11,11,10
+	.long	0x10E7CD48
+	.long	0x118CCD48
+	vxor	8,8,11
+	.long	0x11ADCD48
+	.long	0x11CECD48
+	vxor	1,18,31
+	vsrab	11,8,9
+	vxor	18,8,23
+	.long	0x11EFCD48
+	.long	0x1210CD48
+
+	and	0,0,5
+	vaddubm	8,8,8
+	vsldoi	11,11,11,15
+	.long	0x10E7D548
+	.long	0x118CD548
+	vand	11,11,10
+	.long	0x11ADD548
+	.long	0x11CED548
+	vxor	8,8,11
+	.long	0x11EFD548
+	.long	0x1210D548
+
+	add	10,10,0
+
+
+
+	vxor	2,19,31
+	vsrab	11,8,9
+	vxor	19,8,23
+	vaddubm	8,8,8
+	.long	0x10E7DD48
+	.long	0x118CDD48
+	vsldoi	11,11,11,15
+	.long	0x11ADDD48
+	.long	0x11CEDD48
+	vand	11,11,10
+	.long	0x11EFDD48
+	.long	0x1210DD48
+
+	addi	7,1,79
+	vxor	8,8,11
+	.long	0x10E7E548
+	.long	0x118CE548
+	vxor	3,20,31
+	vsrab	11,8,9
+	vxor	20,8,23
+	.long	0x11ADE548
+	.long	0x11CEE548
+	vaddubm	8,8,8
+	vsldoi	11,11,11,15
+	.long	0x11EFE548
+	.long	0x1210E548
+	lvx	24,0,7
+	vand	11,11,10
+
+	.long	0x10E7ED48
+	.long	0x118CED48
+	vxor	8,8,11
+	.long	0x11ADED48
+	.long	0x11CEED48
+	vxor	4,21,31
+	vsrab	11,8,9
+	vxor	21,8,23
+	.long	0x11EFED48
+	.long	0x1210ED48
+	lvx	25,3,7
+	vaddubm	8,8,8
+	vsldoi	11,11,11,15
+
+	.long	0x10E7F548
+	.long	0x118CF548
+	vand	11,11,10
+	.long	0x11ADF548
+	.long	0x11CEF548
+	vxor	8,8,11
+	.long	0x11EFF548
+	.long	0x1210F548
+	vxor	5,22,31
+	vsrab	11,8,9
+	vxor	22,8,23
+
+	.long	0x10E70549
+	.long	0x7C005699
+	vaddubm	8,8,8
+	vsldoi	11,11,11,15
+	.long	0x118C0D49
+	.long	0x7C235699
+	.long	0x11AD1549
+	vperm	0,0,0,6
+	.long	0x7C5A5699
+	vand	11,11,10
+	.long	0x11CE1D49
+	vperm	1,1,1,6
+	.long	0x7C7B5699
+	.long	0x11EF2549
+	vperm	2,2,2,6
+	.long	0x7C9C5699
+	vxor	8,8,11
+	.long	0x12102D49
+	vperm	3,3,3,6
+	.long	0x7CBD5699
+	addi	10,10,0x60
+	vperm	4,4,4,6
+	vperm	5,5,5,6
+
+	vperm	7,7,7,6
+	vperm	12,12,12,6
+	.long	0x7CE02799
+	vxor	7,0,17
+	vperm	13,13,13,6
+	.long	0x7D832799
+	vxor	12,1,18
+	vperm	14,14,14,6
+	.long	0x7DBA2799
+	vxor	13,2,19
+	vperm	15,15,15,6
+	.long	0x7DDB2799
+	vxor	14,3,20
+	vperm	16,16,16,6
+	.long	0x7DFC2799
+	vxor	15,4,21
+	.long	0x7E1D2799
+	vxor	16,5,22
+	addi	4,4,0x60
+
+	mtctr	9
+	beq	.Loop_xts_dec6x
+
+	addic.	5,5,0x60
+	beq	.Lxts_dec6x_zero
+	cmpwi	5,0x20
+	blt	.Lxts_dec6x_one
+	nop	
+	beq	.Lxts_dec6x_two
+	cmpwi	5,0x40
+	blt	.Lxts_dec6x_three
+	nop	
+	beq	.Lxts_dec6x_four
+
+.Lxts_dec6x_five:
+	vxor	7,1,17
+	vxor	12,2,18
+	vxor	13,3,19
+	vxor	14,4,20
+	vxor	15,5,21
+
+	bl	_aesp8_xts_dec5x
+
+	vperm	7,7,7,6
+	vor	17,22,22
+	vxor	18,8,23
+	vperm	12,12,12,6
+	.long	0x7CE02799
+	vxor	7,0,18
+	vperm	13,13,13,6
+	.long	0x7D832799
+	vperm	14,14,14,6
+	.long	0x7DBA2799
+	vperm	15,15,15,6
+	.long	0x7DDB2799
+	.long	0x7DFC2799
+	addi	4,4,0x50
+	bne	.Lxts_dec6x_steal
+	b	.Lxts_dec6x_done
+
+.align	4
+.Lxts_dec6x_four:
+	vxor	7,2,17
+	vxor	12,3,18
+	vxor	13,4,19
+	vxor	14,5,20
+	vxor	15,15,15
+
+	bl	_aesp8_xts_dec5x
+
+	vperm	7,7,7,6
+	vor	17,21,21
+	vor	18,22,22
+	vperm	12,12,12,6
+	.long	0x7CE02799
+	vxor	7,0,22
+	vperm	13,13,13,6
+	.long	0x7D832799
+	vperm	14,14,14,6
+	.long	0x7DBA2799
+	.long	0x7DDB2799
+	addi	4,4,0x40
+	bne	.Lxts_dec6x_steal
+	b	.Lxts_dec6x_done
+
+.align	4
+.Lxts_dec6x_three:
+	vxor	7,3,17
+	vxor	12,4,18
+	vxor	13,5,19
+	vxor	14,14,14
+	vxor	15,15,15
+
+	bl	_aesp8_xts_dec5x
+
+	vperm	7,7,7,6
+	vor	17,20,20
+	vor	18,21,21
+	vperm	12,12,12,6
+	.long	0x7CE02799
+	vxor	7,0,21
+	vperm	13,13,13,6
+	.long	0x7D832799
+	.long	0x7DBA2799
+	addi	4,4,0x30
+	bne	.Lxts_dec6x_steal
+	b	.Lxts_dec6x_done
+
+.align	4
+.Lxts_dec6x_two:
+	vxor	7,4,17
+	vxor	12,5,18
+	vxor	13,13,13
+	vxor	14,14,14
+	vxor	15,15,15
+
+	bl	_aesp8_xts_dec5x
+
+	vperm	7,7,7,6
+	vor	17,19,19
+	vor	18,20,20
+	vperm	12,12,12,6
+	.long	0x7CE02799
+	vxor	7,0,20
+	.long	0x7D832799
+	addi	4,4,0x20
+	bne	.Lxts_dec6x_steal
+	b	.Lxts_dec6x_done
+
+.align	4
+.Lxts_dec6x_one:
+	vxor	7,5,17
+	nop	
+.Loop_xts_dec1x:
+	.long	0x10E7C548
+	lvx	24,26,7
+	addi	7,7,0x20
+
+	.long	0x10E7CD48
+	lvx	25,3,7
+	bdnz	.Loop_xts_dec1x
+
+	subi	0,31,1
+	.long	0x10E7C548
+
+	andi.	0,0,16
+	cmpwi	31,0
+	.long	0x10E7CD48
+
+	sub	10,10,0
+	.long	0x10E7D548
+
+	.long	0x7C005699
+	.long	0x10E7DD48
+
+	addi	7,1,79
+	.long	0x10E7E548
+	lvx	24,0,7
+
+	.long	0x10E7ED48
+	lvx	25,3,7
+	vxor	17,17,31
+
+	vperm	0,0,0,6
+	.long	0x10E7F548
+
+	mtctr	9
+	.long	0x10E78D49
+
+	vor	17,18,18
+	vor	18,19,19
+	vperm	7,7,7,6
+	.long	0x7CE02799
+	addi	4,4,0x10
+	vxor	7,0,19
+	bne	.Lxts_dec6x_steal
+	b	.Lxts_dec6x_done
+
+.align	4
+.Lxts_dec6x_zero:
+	cmpwi	31,0
+	beq	.Lxts_dec6x_done
+
+	.long	0x7C005699
+	vperm	0,0,0,6
+	vxor	7,0,18
+.Lxts_dec6x_steal:
+	.long	0x10E7C548
+	lvx	24,26,7
+	addi	7,7,0x20
+
+	.long	0x10E7CD48
+	lvx	25,3,7
+	bdnz	.Lxts_dec6x_steal
+
+	add	10,10,31
+	.long	0x10E7C548
+
+	cmpwi	31,0
+	.long	0x10E7CD48
+
+	.long	0x7C005699
+	.long	0x10E7D548
+
+	lvsr	5,0,31
+	.long	0x10E7DD48
+
+	addi	7,1,79
+	.long	0x10E7E548
+	lvx	24,0,7
+
+	.long	0x10E7ED48
+	lvx	25,3,7
+	vxor	18,18,31
+
+	vperm	0,0,0,6
+	.long	0x10E7F548
+
+	vperm	0,0,0,5
+	.long	0x11679549
+
+	vperm	7,11,11,6
+	.long	0x7CE02799
+
+
+	vxor	7,7,7
+	vspltisb	12,-1
+	vperm	7,7,12,5
+	vsel	7,0,11,7
+	vxor	7,7,17
+
+	subi	30,4,1
+	mtctr	31
+.Loop_xts_dec6x_steal:
+	lbzu	0,1(30)
+	stb	0,16(30)
+	bdnz	.Loop_xts_dec6x_steal
+
+	li	31,0
+	mtctr	9
+	b	.Loop_xts_dec1x
+
+.align	4
+.Lxts_dec6x_done:
+	cmpldi	8,0
+	beq	.Lxts_dec6x_ret
+
+	vxor	8,17,23
+	vperm	8,8,8,6
+	.long	0x7D004799
+
+.Lxts_dec6x_ret:
+	mtlr	11
+	li	10,79
+	li	11,95
+	stvx	9,10,1
+	addi	10,10,32
+	stvx	9,11,1
+	addi	11,11,32
+	stvx	9,10,1
+	addi	10,10,32
+	stvx	9,11,1
+	addi	11,11,32
+	stvx	9,10,1
+	addi	10,10,32
+	stvx	9,11,1
+	addi	11,11,32
+	stvx	9,10,1
+	addi	10,10,32
+	stvx	9,11,1
+	addi	11,11,32
+
+	or	12,12,12
+	lvx	20,10,1
+	addi	10,10,32
+	lvx	21,11,1
+	addi	11,11,32
+	lvx	22,10,1
+	addi	10,10,32
+	lvx	23,11,1
+	addi	11,11,32
+	lvx	24,10,1
+	addi	10,10,32
+	lvx	25,11,1
+	addi	11,11,32
+	lvx	26,10,1
+	addi	10,10,32
+	lvx	27,11,1
+	addi	11,11,32
+	lvx	28,10,1
+	addi	10,10,32
+	lvx	29,11,1
+	addi	11,11,32
+	lvx	30,10,1
+	lvx	31,11,1
+	ld	26,400(1)
+	ld	27,408(1)
+	ld	28,416(1)
+	ld	29,424(1)
+	ld	30,432(1)
+	ld	31,440(1)
+	addi	1,1,448
+	blr	
+.long	0
+.byte	0,12,0x04,1,0x80,6,6,0
+.long	0
+
+.align	5
+_aesp8_xts_dec5x:
+	.long	0x10E7C548
+	.long	0x118CC548
+	.long	0x11ADC548
+	.long	0x11CEC548
+	.long	0x11EFC548
+	lvx	24,26,7
+	addi	7,7,0x20
+
+	.long	0x10E7CD48
+	.long	0x118CCD48
+	.long	0x11ADCD48
+	.long	0x11CECD48
+	.long	0x11EFCD48
+	lvx	25,3,7
+	bdnz	_aesp8_xts_dec5x
+
+	subi	0,31,1
+	.long	0x10E7C548
+	.long	0x118CC548
+	.long	0x11ADC548
+	.long	0x11CEC548
+	.long	0x11EFC548
+
+	andi.	0,0,16
+	cmpwi	31,0
+	.long	0x10E7CD48
+	.long	0x118CCD48
+	.long	0x11ADCD48
+	.long	0x11CECD48
+	.long	0x11EFCD48
+	vxor	17,17,31
+
+	sub	10,10,0
+	.long	0x10E7D548
+	.long	0x118CD548
+	.long	0x11ADD548
+	.long	0x11CED548
+	.long	0x11EFD548
+	vxor	1,18,31
+
+	.long	0x10E7DD48
+	.long	0x7C005699
+	.long	0x118CDD48
+	.long	0x11ADDD48
+	.long	0x11CEDD48
+	.long	0x11EFDD48
+	vxor	2,19,31
+
+	addi	7,1,79
+	.long	0x10E7E548
+	.long	0x118CE548
+	.long	0x11ADE548
+	.long	0x11CEE548
+	.long	0x11EFE548
+	lvx	24,0,7
+	vxor	3,20,31
+
+	.long	0x10E7ED48
+	vperm	0,0,0,6
+	.long	0x118CED48
+	.long	0x11ADED48
+	.long	0x11CEED48
+	.long	0x11EFED48
+	lvx	25,3,7
+	vxor	4,21,31
+
+	.long	0x10E7F548
+	.long	0x118CF548
+	.long	0x11ADF548
+	.long	0x11CEF548
+	.long	0x11EFF548
+
+	.long	0x10E78D49
+	.long	0x118C0D49
+	.long	0x11AD1549
+	.long	0x11CE1D49
+	.long	0x11EF2549
+	mtctr	9
+	blr	
+.long	0
+.byte	0,12,0x14,0,0,0,0,0
+#endif  // !OPENSSL_NO_ASM && __powerpc64__
+.section	.note.GNU-stack,"",@progbits
--- /dev/null
+++ b/third_party/boringssl/linux-ppc64le/crypto/fipsmodule/ghashp8-ppc.S
@@ -1,0 +1,587 @@
+// This file is generated from a similarly-named Perl script in the BoringSSL
+// source tree. Do not edit by hand.
+
+#if defined(__has_feature)
+#if __has_feature(memory_sanitizer) && !defined(OPENSSL_NO_ASM)
+#define OPENSSL_NO_ASM
+#endif
+#endif
+
+#if !defined(OPENSSL_NO_ASM) && defined(__powerpc64__)
+.machine	"any"
+
+.abiversion	2
+.text
+
+.globl	gcm_init_p8
+.type	gcm_init_p8,@function
+.align	5
+gcm_init_p8:
+.localentry	gcm_init_p8,0
+
+	li	0,-4096
+	li	8,0x10
+	li	12,-1
+	li	9,0x20
+	or	0,0,0
+	li	10,0x30
+	.long	0x7D202699
+
+	vspltisb	8,-16
+	vspltisb	5,1
+	vaddubm	8,8,8
+	vxor	4,4,4
+	vor	8,8,5
+	vsldoi	8,8,4,15
+	vsldoi	6,4,5,1
+	vaddubm	8,8,8
+	vspltisb	7,7
+	vor	8,8,6
+	vspltb	6,9,0
+	vsl	9,9,5
+	vsrab	6,6,7
+	vand	6,6,8
+	vxor	3,9,6
+
+	vsldoi	9,3,3,8
+	vsldoi	8,4,8,8
+	vsldoi	11,4,9,8
+	vsldoi	10,9,4,8
+
+	.long	0x7D001F99
+	.long	0x7D681F99
+	li	8,0x40
+	.long	0x7D291F99
+	li	9,0x50
+	.long	0x7D4A1F99
+	li	10,0x60
+
+	.long	0x10035CC8
+	.long	0x10234CC8
+	.long	0x104354C8
+
+	.long	0x10E044C8
+
+	vsldoi	5,1,4,8
+	vsldoi	6,4,1,8
+	vxor	0,0,5
+	vxor	2,2,6
+
+	vsldoi	0,0,0,8
+	vxor	0,0,7
+
+	vsldoi	6,0,0,8
+	.long	0x100044C8
+	vxor	6,6,2
+	vxor	16,0,6
+
+	vsldoi	17,16,16,8
+	vsldoi	19,4,17,8
+	vsldoi	18,17,4,8
+
+	.long	0x7E681F99
+	li	8,0x70
+	.long	0x7E291F99
+	li	9,0x80
+	.long	0x7E4A1F99
+	li	10,0x90
+	.long	0x10039CC8
+	.long	0x11B09CC8
+	.long	0x10238CC8
+	.long	0x11D08CC8
+	.long	0x104394C8
+	.long	0x11F094C8
+
+	.long	0x10E044C8
+	.long	0x114D44C8
+
+	vsldoi	5,1,4,8
+	vsldoi	6,4,1,8
+	vsldoi	11,14,4,8
+	vsldoi	9,4,14,8
+	vxor	0,0,5
+	vxor	2,2,6
+	vxor	13,13,11
+	vxor	15,15,9
+
+	vsldoi	0,0,0,8
+	vsldoi	13,13,13,8
+	vxor	0,0,7
+	vxor	13,13,10
+
+	vsldoi	6,0,0,8
+	vsldoi	9,13,13,8
+	.long	0x100044C8
+	.long	0x11AD44C8
+	vxor	6,6,2
+	vxor	9,9,15
+	vxor	0,0,6
+	vxor	13,13,9
+
+	vsldoi	9,0,0,8
+	vsldoi	17,13,13,8
+	vsldoi	11,4,9,8
+	vsldoi	10,9,4,8
+	vsldoi	19,4,17,8
+	vsldoi	18,17,4,8
+
+	.long	0x7D681F99
+	li	8,0xa0
+	.long	0x7D291F99
+	li	9,0xb0
+	.long	0x7D4A1F99
+	li	10,0xc0
+	.long	0x7E681F99
+	.long	0x7E291F99
+	.long	0x7E4A1F99
+
+	or	12,12,12
+	blr	
+.long	0
+.byte	0,12,0x14,0,0,0,2,0
+.long	0
+.size	gcm_init_p8,.-gcm_init_p8
+.globl	gcm_gmult_p8
+.type	gcm_gmult_p8,@function
+.align	5
+gcm_gmult_p8:
+.localentry	gcm_gmult_p8,0
+
+	lis	0,0xfff8
+	li	8,0x10
+	li	12,-1
+	li	9,0x20
+	or	0,0,0
+	li	10,0x30
+	.long	0x7C601E99
+
+	.long	0x7D682699
+	lvsl	12,0,0
+	.long	0x7D292699
+	vspltisb	5,0x07
+	.long	0x7D4A2699
+	vxor	12,12,5
+	.long	0x7D002699
+	vperm	3,3,3,12
+	vxor	4,4,4
+
+	.long	0x10035CC8
+	.long	0x10234CC8
+	.long	0x104354C8
+
+	.long	0x10E044C8
+
+	vsldoi	5,1,4,8
+	vsldoi	6,4,1,8
+	vxor	0,0,5
+	vxor	2,2,6
+
+	vsldoi	0,0,0,8
+	vxor	0,0,7
+
+	vsldoi	6,0,0,8
+	.long	0x100044C8
+	vxor	6,6,2
+	vxor	0,0,6
+
+	vperm	0,0,0,12
+	.long	0x7C001F99
+
+	or	12,12,12
+	blr	
+.long	0
+.byte	0,12,0x14,0,0,0,2,0
+.long	0
+.size	gcm_gmult_p8,.-gcm_gmult_p8
+
+.globl	gcm_ghash_p8
+.type	gcm_ghash_p8,@function
+.align	5
+gcm_ghash_p8:
+.localentry	gcm_ghash_p8,0
+
+	li	0,-4096
+	li	8,0x10
+	li	12,-1
+	li	9,0x20
+	or	0,0,0
+	li	10,0x30
+	.long	0x7C001E99
+
+	.long	0x7D682699
+	li	8,0x40
+	lvsl	12,0,0
+	.long	0x7D292699
+	li	9,0x50
+	vspltisb	5,0x07
+	.long	0x7D4A2699
+	li	10,0x60
+	vxor	12,12,5
+	.long	0x7D002699
+	vperm	0,0,0,12
+	vxor	4,4,4
+
+	cmpldi	6,64
+	bge	.Lgcm_ghash_p8_4x
+
+	.long	0x7C602E99
+	addi	5,5,16
+	subic.	6,6,16
+	vperm	3,3,3,12
+	vxor	3,3,0
+	beq	.Lshort
+
+	.long	0x7E682699
+	li	8,16
+	.long	0x7E292699
+	add	9,5,6
+	.long	0x7E4A2699
+
+
+.align	5
+.Loop_2x:
+	.long	0x7E002E99
+	vperm	16,16,16,12
+
+	subic	6,6,32
+	.long	0x10039CC8
+	.long	0x11B05CC8
+	subfe	0,0,0
+	.long	0x10238CC8
+	.long	0x11D04CC8
+	and	0,0,6
+	.long	0x104394C8
+	.long	0x11F054C8
+	add	5,5,0
+
+	vxor	0,0,13
+	vxor	1,1,14
+
+	.long	0x10E044C8
+
+	vsldoi	5,1,4,8
+	vsldoi	6,4,1,8
+	vxor	2,2,15
+	vxor	0,0,5
+	vxor	2,2,6
+
+	vsldoi	0,0,0,8
+	vxor	0,0,7
+	.long	0x7C682E99
+	addi	5,5,32
+
+	vsldoi	6,0,0,8
+	.long	0x100044C8
+	vperm	3,3,3,12
+	vxor	6,6,2
+	vxor	3,3,6
+	vxor	3,3,0
+	cmpld	9,5
+	bgt	.Loop_2x
+
+	cmplwi	6,0
+	bne	.Leven
+
+.Lshort:
+	.long	0x10035CC8
+	.long	0x10234CC8
+	.long	0x104354C8
+
+	.long	0x10E044C8
+
+	vsldoi	5,1,4,8
+	vsldoi	6,4,1,8
+	vxor	0,0,5
+	vxor	2,2,6
+
+	vsldoi	0,0,0,8
+	vxor	0,0,7
+
+	vsldoi	6,0,0,8
+	.long	0x100044C8
+	vxor	6,6,2
+
+.Leven:
+	vxor	0,0,6
+	vperm	0,0,0,12
+	.long	0x7C001F99
+
+	or	12,12,12
+	blr	
+.long	0
+.byte	0,12,0x14,0,0,0,4,0
+.long	0
+.align	5
+.gcm_ghash_p8_4x:
+.Lgcm_ghash_p8_4x:
+	stdu	1,-256(1)
+	li	10,63
+	li	11,79
+	stvx	20,10,1
+	addi	10,10,32
+	stvx	21,11,1
+	addi	11,11,32
+	stvx	22,10,1
+	addi	10,10,32
+	stvx	23,11,1
+	addi	11,11,32
+	stvx	24,10,1
+	addi	10,10,32
+	stvx	25,11,1
+	addi	11,11,32
+	stvx	26,10,1
+	addi	10,10,32
+	stvx	27,11,1
+	addi	11,11,32
+	stvx	28,10,1
+	addi	10,10,32
+	stvx	29,11,1
+	addi	11,11,32
+	stvx	30,10,1
+	li	10,0x60
+	stvx	31,11,1
+	li	0,-1
+	stw	12,252(1)
+	or	0,0,0
+
+	lvsl	5,0,8
+
+	li	8,0x70
+	.long	0x7E292699
+	li	9,0x80
+	vspltisb	6,8
+
+	li	10,0x90
+	.long	0x7EE82699
+	li	8,0xa0
+	.long	0x7F092699
+	li	9,0xb0
+	.long	0x7F2A2699
+	li	10,0xc0
+	.long	0x7FA82699
+	li	8,0x10
+	.long	0x7FC92699
+	li	9,0x20
+	.long	0x7FEA2699
+	li	10,0x30
+
+	vsldoi	7,4,6,8
+	vaddubm	18,5,7
+	vaddubm	19,6,18
+
+	srdi	6,6,4
+
+	.long	0x7C602E99
+	.long	0x7E082E99
+	subic.	6,6,8
+	.long	0x7EC92E99
+	.long	0x7F8A2E99
+	addi	5,5,0x40
+	vperm	3,3,3,12
+	vperm	16,16,16,12
+	vperm	22,22,22,12
+	vperm	28,28,28,12
+
+	vxor	2,3,0
+
+	.long	0x11B0BCC8
+	.long	0x11D0C4C8
+	.long	0x11F0CCC8
+
+	vperm	11,17,9,18
+	vperm	5,22,28,19
+	vperm	10,17,9,19
+	vperm	6,22,28,18
+	.long	0x12B68CC8
+	.long	0x12855CC8
+	.long	0x137C4CC8
+	.long	0x134654C8
+
+	vxor	21,21,14
+	vxor	20,20,13
+	vxor	27,27,21
+	vxor	26,26,15
+
+	blt	.Ltail_4x
+
+.Loop_4x:
+	.long	0x7C602E99
+	.long	0x7E082E99
+	subic.	6,6,4
+	.long	0x7EC92E99
+	.long	0x7F8A2E99
+	addi	5,5,0x40
+	vperm	16,16,16,12
+	vperm	22,22,22,12
+	vperm	28,28,28,12
+	vperm	3,3,3,12
+
+	.long	0x1002ECC8
+	.long	0x1022F4C8
+	.long	0x1042FCC8
+	.long	0x11B0BCC8
+	.long	0x11D0C4C8
+	.long	0x11F0CCC8
+
+	vxor	0,0,20
+	vxor	1,1,27
+	vxor	2,2,26
+	vperm	5,22,28,19
+	vperm	6,22,28,18
+
+	.long	0x10E044C8
+	.long	0x12855CC8
+	.long	0x134654C8
+
+	vsldoi	5,1,4,8
+	vsldoi	6,4,1,8
+	vxor	0,0,5
+	vxor	2,2,6
+
+	vsldoi	0,0,0,8
+	vxor	0,0,7
+
+	vsldoi	6,0,0,8
+	.long	0x12B68CC8
+	.long	0x137C4CC8
+	.long	0x100044C8
+
+	vxor	20,20,13
+	vxor	26,26,15
+	vxor	2,2,3
+	vxor	21,21,14
+	vxor	2,2,6
+	vxor	27,27,21
+	vxor	2,2,0
+	bge	.Loop_4x
+
+.Ltail_4x:
+	.long	0x1002ECC8
+	.long	0x1022F4C8
+	.long	0x1042FCC8
+
+	vxor	0,0,20
+	vxor	1,1,27
+
+	.long	0x10E044C8
+
+	vsldoi	5,1,4,8
+	vsldoi	6,4,1,8
+	vxor	2,2,26
+	vxor	0,0,5
+	vxor	2,2,6
+
+	vsldoi	0,0,0,8
+	vxor	0,0,7
+
+	vsldoi	6,0,0,8
+	.long	0x100044C8
+	vxor	6,6,2
+	vxor	0,0,6
+
+	addic.	6,6,4
+	beq	.Ldone_4x
+
+	.long	0x7C602E99
+	cmpldi	6,2
+	li	6,-4
+	blt	.Lone
+	.long	0x7E082E99
+	beq	.Ltwo
+
+.Lthree:
+	.long	0x7EC92E99
+	vperm	3,3,3,12
+	vperm	16,16,16,12
+	vperm	22,22,22,12
+
+	vxor	2,3,0
+	vor	29,23,23
+	vor	30, 24, 24
+	vor	31,25,25
+
+	vperm	5,16,22,19
+	vperm	6,16,22,18
+	.long	0x12B08CC8
+	.long	0x13764CC8
+	.long	0x12855CC8
+	.long	0x134654C8
+
+	vxor	27,27,21
+	b	.Ltail_4x
+
+.align	4
+.Ltwo:
+	vperm	3,3,3,12
+	vperm	16,16,16,12
+
+	vxor	2,3,0
+	vperm	5,4,16,19
+	vperm	6,4,16,18
+
+	vsldoi	29,4,17,8
+	vor	30, 17, 17
+	vsldoi	31,17,4,8
+
+	.long	0x12855CC8
+	.long	0x13704CC8
+	.long	0x134654C8
+
+	b	.Ltail_4x
+
+.align	4
+.Lone:
+	vperm	3,3,3,12
+
+	vsldoi	29,4,9,8
+	vor	30, 9, 9
+	vsldoi	31,9,4,8
+
+	vxor	2,3,0
+	vxor	20,20,20
+	vxor	27,27,27
+	vxor	26,26,26
+
+	b	.Ltail_4x
+
+.Ldone_4x:
+	vperm	0,0,0,12
+	.long	0x7C001F99
+
+	li	10,63
+	li	11,79
+	or	12,12,12
+	lvx	20,10,1
+	addi	10,10,32
+	lvx	21,11,1
+	addi	11,11,32
+	lvx	22,10,1
+	addi	10,10,32
+	lvx	23,11,1
+	addi	11,11,32
+	lvx	24,10,1
+	addi	10,10,32
+	lvx	25,11,1
+	addi	11,11,32
+	lvx	26,10,1
+	addi	10,10,32
+	lvx	27,11,1
+	addi	11,11,32
+	lvx	28,10,1
+	addi	10,10,32
+	lvx	29,11,1
+	addi	11,11,32
+	lvx	30,10,1
+	lvx	31,11,1
+	addi	1,1,256
+	blr	
+.long	0
+.byte	0,12,0x04,0,0x80,0,4,0
+.long	0
+.size	gcm_ghash_p8,.-gcm_ghash_p8
+
+.byte	71,72,65,83,72,32,102,111,114,32,80,111,119,101,114,73,83,65,32,50,46,48,55,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0
+.align	2
+.align	2
+#endif  // !OPENSSL_NO_ASM && __powerpc64__
+.section	.note.GNU-stack,"",@progbits
--- /dev/null
+++ b/third_party/boringssl/linux-ppc64le/crypto/test/trampoline-ppc.S
@@ -1,0 +1,1410 @@
+// This file is generated from a similarly-named Perl script in the BoringSSL
+// source tree. Do not edit by hand.
+
+#if defined(__has_feature)
+#if __has_feature(memory_sanitizer) && !defined(OPENSSL_NO_ASM)
+#define OPENSSL_NO_ASM
+#endif
+#endif
+
+#if !defined(OPENSSL_NO_ASM) && defined(__powerpc64__)
+.machine	"any"
+.abiversion	2
+.text
+
+
+
+
+
+
+
+.globl	abi_test_trampoline
+.type	abi_test_trampoline,@function
+.align	5
+abi_test_trampoline:
+.localentry	abi_test_trampoline,0
+
+
+	mflr	0
+	std	0, 16(1)
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+	stdu	1, -528(1)
+
+	mfcr	0
+	std	0, 8(1)
+	std	2, 24(1)
+	std	4, 32(1)
+	li	11, 48
+	stvx	20, 11, 1
+	li	11, 64
+	stvx	21, 11, 1
+	li	11, 80
+	stvx	22, 11, 1
+	li	11, 96
+	stvx	23, 11, 1
+	li	11, 112
+	stvx	24, 11, 1
+	li	11, 128
+	stvx	25, 11, 1
+	li	11, 144
+	stvx	26, 11, 1
+	li	11, 160
+	stvx	27, 11, 1
+	li	11, 176
+	stvx	28, 11, 1
+	li	11, 192
+	stvx	29, 11, 1
+	li	11, 208
+	stvx	30, 11, 1
+	li	11, 224
+	stvx	31, 11, 1
+	std	14, 240(1)
+	std	15, 248(1)
+	std	16, 256(1)
+	std	17, 264(1)
+	std	18, 272(1)
+	std	19, 280(1)
+	std	20, 288(1)
+	std	21, 296(1)
+	std	22, 304(1)
+	std	23, 312(1)
+	std	24, 320(1)
+	std	25, 328(1)
+	std	26, 336(1)
+	std	27, 344(1)
+	std	28, 352(1)
+	std	29, 360(1)
+	std	30, 368(1)
+	std	31, 376(1)
+	stfd	14, 384(1)
+	stfd	15, 392(1)
+	stfd	16, 400(1)
+	stfd	17, 408(1)
+	stfd	18, 416(1)
+	stfd	19, 424(1)
+	stfd	20, 432(1)
+	stfd	21, 440(1)
+	stfd	22, 448(1)
+	stfd	23, 456(1)
+	stfd	24, 464(1)
+	stfd	25, 472(1)
+	stfd	26, 480(1)
+	stfd	27, 488(1)
+	stfd	28, 496(1)
+	stfd	29, 504(1)
+	stfd	30, 512(1)
+	stfd	31, 520(1)
+	li	11, 0
+	lvx	20, 11, 4
+	li	11, 16
+	lvx	21, 11, 4
+	li	11, 32
+	lvx	22, 11, 4
+	li	11, 48
+	lvx	23, 11, 4
+	li	11, 64
+	lvx	24, 11, 4
+	li	11, 80
+	lvx	25, 11, 4
+	li	11, 96
+	lvx	26, 11, 4
+	li	11, 112
+	lvx	27, 11, 4
+	li	11, 128
+	lvx	28, 11, 4
+	li	11, 144
+	lvx	29, 11, 4
+	li	11, 160
+	lvx	30, 11, 4
+	li	11, 176
+	lvx	31, 11, 4
+	ld	14, 192(4)
+	ld	15, 200(4)
+	ld	16, 208(4)
+	ld	17, 216(4)
+	ld	18, 224(4)
+	ld	19, 232(4)
+	ld	20, 240(4)
+	ld	21, 248(4)
+	ld	22, 256(4)
+	ld	23, 264(4)
+	ld	24, 272(4)
+	ld	25, 280(4)
+	ld	26, 288(4)
+	ld	27, 296(4)
+	ld	28, 304(4)
+	ld	29, 312(4)
+	ld	30, 320(4)
+	ld	31, 328(4)
+	lfd	14, 336(4)
+	lfd	15, 344(4)
+	lfd	16, 352(4)
+	lfd	17, 360(4)
+	lfd	18, 368(4)
+	lfd	19, 376(4)
+	lfd	20, 384(4)
+	lfd	21, 392(4)
+	lfd	22, 400(4)
+	lfd	23, 408(4)
+	lfd	24, 416(4)
+	lfd	25, 424(4)
+	lfd	26, 432(4)
+	lfd	27, 440(4)
+	lfd	28, 448(4)
+	lfd	29, 456(4)
+	lfd	30, 464(4)
+	lfd	31, 472(4)
+
+	ld	0, 480(4)
+	mtcr	0
+
+
+	addi	11, 5, -8
+	mr	12, 3
+
+
+	cmpdi	6, 0
+	beq	.Largs_done
+	mtctr	6
+	ldu	3, 8(11)
+	bdz	.Largs_done
+	ldu	4, 8(11)
+	bdz	.Largs_done
+	ldu	5, 8(11)
+	bdz	.Largs_done
+	ldu	6, 8(11)
+	bdz	.Largs_done
+	ldu	7, 8(11)
+	bdz	.Largs_done
+	ldu	8, 8(11)
+	bdz	.Largs_done
+	ldu	9, 8(11)
+	bdz	.Largs_done
+	ldu	10, 8(11)
+
+.Largs_done:
+	li	2, 0
+	mtctr	12
+	bctrl	
+	ld	2, 24(1)
+
+	ld	4, 32(1)
+	li	11, 0
+	stvx	20, 11, 4
+	li	11, 16
+	stvx	21, 11, 4
+	li	11, 32
+	stvx	22, 11, 4
+	li	11, 48
+	stvx	23, 11, 4
+	li	11, 64
+	stvx	24, 11, 4
+	li	11, 80
+	stvx	25, 11, 4
+	li	11, 96
+	stvx	26, 11, 4
+	li	11, 112
+	stvx	27, 11, 4
+	li	11, 128
+	stvx	28, 11, 4
+	li	11, 144
+	stvx	29, 11, 4
+	li	11, 160
+	stvx	30, 11, 4
+	li	11, 176
+	stvx	31, 11, 4
+	std	14, 192(4)
+	std	15, 200(4)
+	std	16, 208(4)
+	std	17, 216(4)
+	std	18, 224(4)
+	std	19, 232(4)
+	std	20, 240(4)
+	std	21, 248(4)
+	std	22, 256(4)
+	std	23, 264(4)
+	std	24, 272(4)
+	std	25, 280(4)
+	std	26, 288(4)
+	std	27, 296(4)
+	std	28, 304(4)
+	std	29, 312(4)
+	std	30, 320(4)
+	std	31, 328(4)
+	stfd	14, 336(4)
+	stfd	15, 344(4)
+	stfd	16, 352(4)
+	stfd	17, 360(4)
+	stfd	18, 368(4)
+	stfd	19, 376(4)
+	stfd	20, 384(4)
+	stfd	21, 392(4)
+	stfd	22, 400(4)
+	stfd	23, 408(4)
+	stfd	24, 416(4)
+	stfd	25, 424(4)
+	stfd	26, 432(4)
+	stfd	27, 440(4)
+	stfd	28, 448(4)
+	stfd	29, 456(4)
+	stfd	30, 464(4)
+	stfd	31, 472(4)
+	li	11, 48
+	lvx	20, 11, 1
+	li	11, 64
+	lvx	21, 11, 1
+	li	11, 80
+	lvx	22, 11, 1
+	li	11, 96
+	lvx	23, 11, 1
+	li	11, 112
+	lvx	24, 11, 1
+	li	11, 128
+	lvx	25, 11, 1
+	li	11, 144
+	lvx	26, 11, 1
+	li	11, 160
+	lvx	27, 11, 1
+	li	11, 176
+	lvx	28, 11, 1
+	li	11, 192
+	lvx	29, 11, 1
+	li	11, 208
+	lvx	30, 11, 1
+	li	11, 224
+	lvx	31, 11, 1
+	ld	14, 240(1)
+	ld	15, 248(1)
+	ld	16, 256(1)
+	ld	17, 264(1)
+	ld	18, 272(1)
+	ld	19, 280(1)
+	ld	20, 288(1)
+	ld	21, 296(1)
+	ld	22, 304(1)
+	ld	23, 312(1)
+	ld	24, 320(1)
+	ld	25, 328(1)
+	ld	26, 336(1)
+	ld	27, 344(1)
+	ld	28, 352(1)
+	ld	29, 360(1)
+	ld	30, 368(1)
+	ld	31, 376(1)
+	lfd	14, 384(1)
+	lfd	15, 392(1)
+	lfd	16, 400(1)
+	lfd	17, 408(1)
+	lfd	18, 416(1)
+	lfd	19, 424(1)
+	lfd	20, 432(1)
+	lfd	21, 440(1)
+	lfd	22, 448(1)
+	lfd	23, 456(1)
+	lfd	24, 464(1)
+	lfd	25, 472(1)
+	lfd	26, 480(1)
+	lfd	27, 488(1)
+	lfd	28, 496(1)
+	lfd	29, 504(1)
+	lfd	30, 512(1)
+	lfd	31, 520(1)
+	mfcr	0
+	std	0, 480(4)
+	ld	0, 8(1)
+	mtcrf	0b00111000, 0
+	addi	1, 1, 528
+	ld	0, 16(1)
+	mtlr	0
+	blr	
+.size	abi_test_trampoline,.-abi_test_trampoline
+.globl	abi_test_clobber_r0
+.type	abi_test_clobber_r0,@function
+.align	5
+abi_test_clobber_r0:
+.localentry	abi_test_clobber_r0,0
+
+	li	0, 0
+	blr	
+.size	abi_test_clobber_r0,.-abi_test_clobber_r0
+.globl	abi_test_clobber_r2
+.type	abi_test_clobber_r2,@function
+.align	5
+abi_test_clobber_r2:
+.localentry	abi_test_clobber_r2,0
+
+	li	2, 0
+	blr	
+.size	abi_test_clobber_r2,.-abi_test_clobber_r2
+.globl	abi_test_clobber_r3
+.type	abi_test_clobber_r3,@function
+.align	5
+abi_test_clobber_r3:
+.localentry	abi_test_clobber_r3,0
+
+	li	3, 0
+	blr	
+.size	abi_test_clobber_r3,.-abi_test_clobber_r3
+.globl	abi_test_clobber_r4
+.type	abi_test_clobber_r4,@function
+.align	5
+abi_test_clobber_r4:
+.localentry	abi_test_clobber_r4,0
+
+	li	4, 0
+	blr	
+.size	abi_test_clobber_r4,.-abi_test_clobber_r4
+.globl	abi_test_clobber_r5
+.type	abi_test_clobber_r5,@function
+.align	5
+abi_test_clobber_r5:
+.localentry	abi_test_clobber_r5,0
+
+	li	5, 0
+	blr	
+.size	abi_test_clobber_r5,.-abi_test_clobber_r5
+.globl	abi_test_clobber_r6
+.type	abi_test_clobber_r6,@function
+.align	5
+abi_test_clobber_r6:
+.localentry	abi_test_clobber_r6,0
+
+	li	6, 0
+	blr	
+.size	abi_test_clobber_r6,.-abi_test_clobber_r6
+.globl	abi_test_clobber_r7
+.type	abi_test_clobber_r7,@function
+.align	5
+abi_test_clobber_r7:
+.localentry	abi_test_clobber_r7,0
+
+	li	7, 0
+	blr	
+.size	abi_test_clobber_r7,.-abi_test_clobber_r7
+.globl	abi_test_clobber_r8
+.type	abi_test_clobber_r8,@function
+.align	5
+abi_test_clobber_r8:
+.localentry	abi_test_clobber_r8,0
+
+	li	8, 0
+	blr	
+.size	abi_test_clobber_r8,.-abi_test_clobber_r8
+.globl	abi_test_clobber_r9
+.type	abi_test_clobber_r9,@function
+.align	5
+abi_test_clobber_r9:
+.localentry	abi_test_clobber_r9,0
+
+	li	9, 0
+	blr	
+.size	abi_test_clobber_r9,.-abi_test_clobber_r9
+.globl	abi_test_clobber_r10
+.type	abi_test_clobber_r10,@function
+.align	5
+abi_test_clobber_r10:
+.localentry	abi_test_clobber_r10,0
+
+	li	10, 0
+	blr	
+.size	abi_test_clobber_r10,.-abi_test_clobber_r10
+.globl	abi_test_clobber_r11
+.type	abi_test_clobber_r11,@function
+.align	5
+abi_test_clobber_r11:
+.localentry	abi_test_clobber_r11,0
+
+	li	11, 0
+	blr	
+.size	abi_test_clobber_r11,.-abi_test_clobber_r11
+.globl	abi_test_clobber_r12
+.type	abi_test_clobber_r12,@function
+.align	5
+abi_test_clobber_r12:
+.localentry	abi_test_clobber_r12,0
+
+	li	12, 0
+	blr	
+.size	abi_test_clobber_r12,.-abi_test_clobber_r12
+.globl	abi_test_clobber_r14
+.type	abi_test_clobber_r14,@function
+.align	5
+abi_test_clobber_r14:
+.localentry	abi_test_clobber_r14,0
+
+	li	14, 0
+	blr	
+.size	abi_test_clobber_r14,.-abi_test_clobber_r14
+.globl	abi_test_clobber_r15
+.type	abi_test_clobber_r15,@function
+.align	5
+abi_test_clobber_r15:
+.localentry	abi_test_clobber_r15,0
+
+	li	15, 0
+	blr	
+.size	abi_test_clobber_r15,.-abi_test_clobber_r15
+.globl	abi_test_clobber_r16
+.type	abi_test_clobber_r16,@function
+.align	5
+abi_test_clobber_r16:
+.localentry	abi_test_clobber_r16,0
+
+	li	16, 0
+	blr	
+.size	abi_test_clobber_r16,.-abi_test_clobber_r16
+.globl	abi_test_clobber_r17
+.type	abi_test_clobber_r17,@function
+.align	5
+abi_test_clobber_r17:
+.localentry	abi_test_clobber_r17,0
+
+	li	17, 0
+	blr	
+.size	abi_test_clobber_r17,.-abi_test_clobber_r17
+.globl	abi_test_clobber_r18
+.type	abi_test_clobber_r18,@function
+.align	5
+abi_test_clobber_r18:
+.localentry	abi_test_clobber_r18,0
+
+	li	18, 0
+	blr	
+.size	abi_test_clobber_r18,.-abi_test_clobber_r18
+.globl	abi_test_clobber_r19
+.type	abi_test_clobber_r19,@function
+.align	5
+abi_test_clobber_r19:
+.localentry	abi_test_clobber_r19,0
+
+	li	19, 0
+	blr	
+.size	abi_test_clobber_r19,.-abi_test_clobber_r19
+.globl	abi_test_clobber_r20
+.type	abi_test_clobber_r20,@function
+.align	5
+abi_test_clobber_r20:
+.localentry	abi_test_clobber_r20,0
+
+	li	20, 0
+	blr	
+.size	abi_test_clobber_r20,.-abi_test_clobber_r20
+.globl	abi_test_clobber_r21
+.type	abi_test_clobber_r21,@function
+.align	5
+abi_test_clobber_r21:
+.localentry	abi_test_clobber_r21,0
+
+	li	21, 0
+	blr	
+.size	abi_test_clobber_r21,.-abi_test_clobber_r21
+.globl	abi_test_clobber_r22
+.type	abi_test_clobber_r22,@function
+.align	5
+abi_test_clobber_r22:
+.localentry	abi_test_clobber_r22,0
+
+	li	22, 0
+	blr	
+.size	abi_test_clobber_r22,.-abi_test_clobber_r22
+.globl	abi_test_clobber_r23
+.type	abi_test_clobber_r23,@function
+.align	5
+abi_test_clobber_r23:
+.localentry	abi_test_clobber_r23,0
+
+	li	23, 0
+	blr	
+.size	abi_test_clobber_r23,.-abi_test_clobber_r23
+.globl	abi_test_clobber_r24
+.type	abi_test_clobber_r24,@function
+.align	5
+abi_test_clobber_r24:
+.localentry	abi_test_clobber_r24,0
+
+	li	24, 0
+	blr	
+.size	abi_test_clobber_r24,.-abi_test_clobber_r24
+.globl	abi_test_clobber_r25
+.type	abi_test_clobber_r25,@function
+.align	5
+abi_test_clobber_r25:
+.localentry	abi_test_clobber_r25,0
+
+	li	25, 0
+	blr	
+.size	abi_test_clobber_r25,.-abi_test_clobber_r25
+.globl	abi_test_clobber_r26
+.type	abi_test_clobber_r26,@function
+.align	5
+abi_test_clobber_r26:
+.localentry	abi_test_clobber_r26,0
+
+	li	26, 0
+	blr	
+.size	abi_test_clobber_r26,.-abi_test_clobber_r26
+.globl	abi_test_clobber_r27
+.type	abi_test_clobber_r27,@function
+.align	5
+abi_test_clobber_r27:
+.localentry	abi_test_clobber_r27,0
+
+	li	27, 0
+	blr	
+.size	abi_test_clobber_r27,.-abi_test_clobber_r27
+.globl	abi_test_clobber_r28
+.type	abi_test_clobber_r28,@function
+.align	5
+abi_test_clobber_r28:
+.localentry	abi_test_clobber_r28,0
+
+	li	28, 0
+	blr	
+.size	abi_test_clobber_r28,.-abi_test_clobber_r28
+.globl	abi_test_clobber_r29
+.type	abi_test_clobber_r29,@function
+.align	5
+abi_test_clobber_r29:
+.localentry	abi_test_clobber_r29,0
+
+	li	29, 0
+	blr	
+.size	abi_test_clobber_r29,.-abi_test_clobber_r29
+.globl	abi_test_clobber_r30
+.type	abi_test_clobber_r30,@function
+.align	5
+abi_test_clobber_r30:
+.localentry	abi_test_clobber_r30,0
+
+	li	30, 0
+	blr	
+.size	abi_test_clobber_r30,.-abi_test_clobber_r30
+.globl	abi_test_clobber_r31
+.type	abi_test_clobber_r31,@function
+.align	5
+abi_test_clobber_r31:
+.localentry	abi_test_clobber_r31,0
+
+	li	31, 0
+	blr	
+.size	abi_test_clobber_r31,.-abi_test_clobber_r31
+.globl	abi_test_clobber_f0
+.type	abi_test_clobber_f0,@function
+.align	4
+abi_test_clobber_f0:
+.localentry	abi_test_clobber_f0,0
+
+	li	0, 0
+
+	std	0, -8(1)
+	lfd	0, -8(1)
+	blr	
+.size	abi_test_clobber_f0,.-abi_test_clobber_f0
+.globl	abi_test_clobber_f1
+.type	abi_test_clobber_f1,@function
+.align	4
+abi_test_clobber_f1:
+.localentry	abi_test_clobber_f1,0
+
+	li	0, 0
+
+	std	0, -8(1)
+	lfd	1, -8(1)
+	blr	
+.size	abi_test_clobber_f1,.-abi_test_clobber_f1
+.globl	abi_test_clobber_f2
+.type	abi_test_clobber_f2,@function
+.align	4
+abi_test_clobber_f2:
+.localentry	abi_test_clobber_f2,0
+
+	li	0, 0
+
+	std	0, -8(1)
+	lfd	2, -8(1)
+	blr	
+.size	abi_test_clobber_f2,.-abi_test_clobber_f2
+.globl	abi_test_clobber_f3
+.type	abi_test_clobber_f3,@function
+.align	4
+abi_test_clobber_f3:
+.localentry	abi_test_clobber_f3,0
+
+	li	0, 0
+
+	std	0, -8(1)
+	lfd	3, -8(1)
+	blr	
+.size	abi_test_clobber_f3,.-abi_test_clobber_f3
+.globl	abi_test_clobber_f4
+.type	abi_test_clobber_f4,@function
+.align	4
+abi_test_clobber_f4:
+.localentry	abi_test_clobber_f4,0
+
+	li	0, 0
+
+	std	0, -8(1)
+	lfd	4, -8(1)
+	blr	
+.size	abi_test_clobber_f4,.-abi_test_clobber_f4
+.globl	abi_test_clobber_f5
+.type	abi_test_clobber_f5,@function
+.align	4
+abi_test_clobber_f5:
+.localentry	abi_test_clobber_f5,0
+
+	li	0, 0
+
+	std	0, -8(1)
+	lfd	5, -8(1)
+	blr	
+.size	abi_test_clobber_f5,.-abi_test_clobber_f5
+.globl	abi_test_clobber_f6
+.type	abi_test_clobber_f6,@function
+.align	4
+abi_test_clobber_f6:
+.localentry	abi_test_clobber_f6,0
+
+	li	0, 0
+
+	std	0, -8(1)
+	lfd	6, -8(1)
+	blr	
+.size	abi_test_clobber_f6,.-abi_test_clobber_f6
+.globl	abi_test_clobber_f7
+.type	abi_test_clobber_f7,@function
+.align	4
+abi_test_clobber_f7:
+.localentry	abi_test_clobber_f7,0
+
+	li	0, 0
+
+	std	0, -8(1)
+	lfd	7, -8(1)
+	blr	
+.size	abi_test_clobber_f7,.-abi_test_clobber_f7
+.globl	abi_test_clobber_f8
+.type	abi_test_clobber_f8,@function
+.align	4
+abi_test_clobber_f8:
+.localentry	abi_test_clobber_f8,0
+
+	li	0, 0
+
+	std	0, -8(1)
+	lfd	8, -8(1)
+	blr	
+.size	abi_test_clobber_f8,.-abi_test_clobber_f8
+.globl	abi_test_clobber_f9
+.type	abi_test_clobber_f9,@function
+.align	4
+abi_test_clobber_f9:
+.localentry	abi_test_clobber_f9,0
+
+	li	0, 0
+
+	std	0, -8(1)
+	lfd	9, -8(1)
+	blr	
+.size	abi_test_clobber_f9,.-abi_test_clobber_f9
+.globl	abi_test_clobber_f10
+.type	abi_test_clobber_f10,@function
+.align	4
+abi_test_clobber_f10:
+.localentry	abi_test_clobber_f10,0
+
+	li	0, 0
+
+	std	0, -8(1)
+	lfd	10, -8(1)
+	blr	
+.size	abi_test_clobber_f10,.-abi_test_clobber_f10
+.globl	abi_test_clobber_f11
+.type	abi_test_clobber_f11,@function
+.align	4
+abi_test_clobber_f11:
+.localentry	abi_test_clobber_f11,0
+
+	li	0, 0
+
+	std	0, -8(1)
+	lfd	11, -8(1)
+	blr	
+.size	abi_test_clobber_f11,.-abi_test_clobber_f11
+.globl	abi_test_clobber_f12
+.type	abi_test_clobber_f12,@function
+.align	4
+abi_test_clobber_f12:
+.localentry	abi_test_clobber_f12,0
+
+	li	0, 0
+
+	std	0, -8(1)
+	lfd	12, -8(1)
+	blr	
+.size	abi_test_clobber_f12,.-abi_test_clobber_f12
+.globl	abi_test_clobber_f13
+.type	abi_test_clobber_f13,@function
+.align	4
+abi_test_clobber_f13:
+.localentry	abi_test_clobber_f13,0
+
+	li	0, 0
+
+	std	0, -8(1)
+	lfd	13, -8(1)
+	blr	
+.size	abi_test_clobber_f13,.-abi_test_clobber_f13
+.globl	abi_test_clobber_f14
+.type	abi_test_clobber_f14,@function
+.align	4
+abi_test_clobber_f14:
+.localentry	abi_test_clobber_f14,0
+
+	li	0, 0
+
+	std	0, -8(1)
+	lfd	14, -8(1)
+	blr	
+.size	abi_test_clobber_f14,.-abi_test_clobber_f14
+.globl	abi_test_clobber_f15
+.type	abi_test_clobber_f15,@function
+.align	4
+abi_test_clobber_f15:
+.localentry	abi_test_clobber_f15,0
+
+	li	0, 0
+
+	std	0, -8(1)
+	lfd	15, -8(1)
+	blr	
+.size	abi_test_clobber_f15,.-abi_test_clobber_f15
+.globl	abi_test_clobber_f16
+.type	abi_test_clobber_f16,@function
+.align	4
+abi_test_clobber_f16:
+.localentry	abi_test_clobber_f16,0
+
+	li	0, 0
+
+	std	0, -8(1)
+	lfd	16, -8(1)
+	blr	
+.size	abi_test_clobber_f16,.-abi_test_clobber_f16
+.globl	abi_test_clobber_f17
+.type	abi_test_clobber_f17,@function
+.align	4
+abi_test_clobber_f17:
+.localentry	abi_test_clobber_f17,0
+
+	li	0, 0
+
+	std	0, -8(1)
+	lfd	17, -8(1)
+	blr	
+.size	abi_test_clobber_f17,.-abi_test_clobber_f17
+.globl	abi_test_clobber_f18
+.type	abi_test_clobber_f18,@function
+.align	4
+abi_test_clobber_f18:
+.localentry	abi_test_clobber_f18,0
+
+	li	0, 0
+
+	std	0, -8(1)
+	lfd	18, -8(1)
+	blr	
+.size	abi_test_clobber_f18,.-abi_test_clobber_f18
+.globl	abi_test_clobber_f19
+.type	abi_test_clobber_f19,@function
+.align	4
+abi_test_clobber_f19:
+.localentry	abi_test_clobber_f19,0
+
+	li	0, 0
+
+	std	0, -8(1)
+	lfd	19, -8(1)
+	blr	
+.size	abi_test_clobber_f19,.-abi_test_clobber_f19
+.globl	abi_test_clobber_f20
+.type	abi_test_clobber_f20,@function
+.align	4
+abi_test_clobber_f20:
+.localentry	abi_test_clobber_f20,0
+
+	li	0, 0
+
+	std	0, -8(1)
+	lfd	20, -8(1)
+	blr	
+.size	abi_test_clobber_f20,.-abi_test_clobber_f20
+.globl	abi_test_clobber_f21
+.type	abi_test_clobber_f21,@function
+.align	4
+abi_test_clobber_f21:
+.localentry	abi_test_clobber_f21,0
+
+	li	0, 0
+
+	std	0, -8(1)
+	lfd	21, -8(1)
+	blr	
+.size	abi_test_clobber_f21,.-abi_test_clobber_f21
+.globl	abi_test_clobber_f22
+.type	abi_test_clobber_f22,@function
+.align	4
+abi_test_clobber_f22:
+.localentry	abi_test_clobber_f22,0
+
+	li	0, 0
+
+	std	0, -8(1)
+	lfd	22, -8(1)
+	blr	
+.size	abi_test_clobber_f22,.-abi_test_clobber_f22
+.globl	abi_test_clobber_f23
+.type	abi_test_clobber_f23,@function
+.align	4
+abi_test_clobber_f23:
+.localentry	abi_test_clobber_f23,0
+
+	li	0, 0
+
+	std	0, -8(1)
+	lfd	23, -8(1)
+	blr	
+.size	abi_test_clobber_f23,.-abi_test_clobber_f23
+.globl	abi_test_clobber_f24
+.type	abi_test_clobber_f24,@function
+.align	4
+abi_test_clobber_f24:
+.localentry	abi_test_clobber_f24,0
+
+	li	0, 0
+
+	std	0, -8(1)
+	lfd	24, -8(1)
+	blr	
+.size	abi_test_clobber_f24,.-abi_test_clobber_f24
+.globl	abi_test_clobber_f25
+.type	abi_test_clobber_f25,@function
+.align	4
+abi_test_clobber_f25:
+.localentry	abi_test_clobber_f25,0
+
+	li	0, 0
+
+	std	0, -8(1)
+	lfd	25, -8(1)
+	blr	
+.size	abi_test_clobber_f25,.-abi_test_clobber_f25
+.globl	abi_test_clobber_f26
+.type	abi_test_clobber_f26,@function
+.align	4
+abi_test_clobber_f26:
+.localentry	abi_test_clobber_f26,0
+
+	li	0, 0
+
+	std	0, -8(1)
+	lfd	26, -8(1)
+	blr	
+.size	abi_test_clobber_f26,.-abi_test_clobber_f26
+.globl	abi_test_clobber_f27
+.type	abi_test_clobber_f27,@function
+.align	4
+abi_test_clobber_f27:
+.localentry	abi_test_clobber_f27,0
+
+	li	0, 0
+
+	std	0, -8(1)
+	lfd	27, -8(1)
+	blr	
+.size	abi_test_clobber_f27,.-abi_test_clobber_f27
+.globl	abi_test_clobber_f28
+.type	abi_test_clobber_f28,@function
+.align	4
+abi_test_clobber_f28:
+.localentry	abi_test_clobber_f28,0
+
+	li	0, 0
+
+	std	0, -8(1)
+	lfd	28, -8(1)
+	blr	
+.size	abi_test_clobber_f28,.-abi_test_clobber_f28
+.globl	abi_test_clobber_f29
+.type	abi_test_clobber_f29,@function
+.align	4
+abi_test_clobber_f29:
+.localentry	abi_test_clobber_f29,0
+
+	li	0, 0
+
+	std	0, -8(1)
+	lfd	29, -8(1)
+	blr	
+.size	abi_test_clobber_f29,.-abi_test_clobber_f29
+.globl	abi_test_clobber_f30
+.type	abi_test_clobber_f30,@function
+.align	4
+abi_test_clobber_f30:
+.localentry	abi_test_clobber_f30,0
+
+	li	0, 0
+
+	std	0, -8(1)
+	lfd	30, -8(1)
+	blr	
+.size	abi_test_clobber_f30,.-abi_test_clobber_f30
+.globl	abi_test_clobber_f31
+.type	abi_test_clobber_f31,@function
+.align	4
+abi_test_clobber_f31:
+.localentry	abi_test_clobber_f31,0
+
+	li	0, 0
+
+	std	0, -8(1)
+	lfd	31, -8(1)
+	blr	
+.size	abi_test_clobber_f31,.-abi_test_clobber_f31
+.globl	abi_test_clobber_v0
+.type	abi_test_clobber_v0,@function
+.align	4
+abi_test_clobber_v0:
+.localentry	abi_test_clobber_v0,0
+
+	vxor	0, 0, 0
+	blr	
+.size	abi_test_clobber_v0,.-abi_test_clobber_v0
+.globl	abi_test_clobber_v1
+.type	abi_test_clobber_v1,@function
+.align	4
+abi_test_clobber_v1:
+.localentry	abi_test_clobber_v1,0
+
+	vxor	1, 1, 1
+	blr	
+.size	abi_test_clobber_v1,.-abi_test_clobber_v1
+.globl	abi_test_clobber_v2
+.type	abi_test_clobber_v2,@function
+.align	4
+abi_test_clobber_v2:
+.localentry	abi_test_clobber_v2,0
+
+	vxor	2, 2, 2
+	blr	
+.size	abi_test_clobber_v2,.-abi_test_clobber_v2
+.globl	abi_test_clobber_v3
+.type	abi_test_clobber_v3,@function
+.align	4
+abi_test_clobber_v3:
+.localentry	abi_test_clobber_v3,0
+
+	vxor	3, 3, 3
+	blr	
+.size	abi_test_clobber_v3,.-abi_test_clobber_v3
+.globl	abi_test_clobber_v4
+.type	abi_test_clobber_v4,@function
+.align	4
+abi_test_clobber_v4:
+.localentry	abi_test_clobber_v4,0
+
+	vxor	4, 4, 4
+	blr	
+.size	abi_test_clobber_v4,.-abi_test_clobber_v4
+.globl	abi_test_clobber_v5
+.type	abi_test_clobber_v5,@function
+.align	4
+abi_test_clobber_v5:
+.localentry	abi_test_clobber_v5,0
+
+	vxor	5, 5, 5
+	blr	
+.size	abi_test_clobber_v5,.-abi_test_clobber_v5
+.globl	abi_test_clobber_v6
+.type	abi_test_clobber_v6,@function
+.align	4
+abi_test_clobber_v6:
+.localentry	abi_test_clobber_v6,0
+
+	vxor	6, 6, 6
+	blr	
+.size	abi_test_clobber_v6,.-abi_test_clobber_v6
+.globl	abi_test_clobber_v7
+.type	abi_test_clobber_v7,@function
+.align	4
+abi_test_clobber_v7:
+.localentry	abi_test_clobber_v7,0
+
+	vxor	7, 7, 7
+	blr	
+.size	abi_test_clobber_v7,.-abi_test_clobber_v7
+.globl	abi_test_clobber_v8
+.type	abi_test_clobber_v8,@function
+.align	4
+abi_test_clobber_v8:
+.localentry	abi_test_clobber_v8,0
+
+	vxor	8, 8, 8
+	blr	
+.size	abi_test_clobber_v8,.-abi_test_clobber_v8
+.globl	abi_test_clobber_v9
+.type	abi_test_clobber_v9,@function
+.align	4
+abi_test_clobber_v9:
+.localentry	abi_test_clobber_v9,0
+
+	vxor	9, 9, 9
+	blr	
+.size	abi_test_clobber_v9,.-abi_test_clobber_v9
+.globl	abi_test_clobber_v10
+.type	abi_test_clobber_v10,@function
+.align	4
+abi_test_clobber_v10:
+.localentry	abi_test_clobber_v10,0
+
+	vxor	10, 10, 10
+	blr	
+.size	abi_test_clobber_v10,.-abi_test_clobber_v10
+.globl	abi_test_clobber_v11
+.type	abi_test_clobber_v11,@function
+.align	4
+abi_test_clobber_v11:
+.localentry	abi_test_clobber_v11,0
+
+	vxor	11, 11, 11
+	blr	
+.size	abi_test_clobber_v11,.-abi_test_clobber_v11
+.globl	abi_test_clobber_v12
+.type	abi_test_clobber_v12,@function
+.align	4
+abi_test_clobber_v12:
+.localentry	abi_test_clobber_v12,0
+
+	vxor	12, 12, 12
+	blr	
+.size	abi_test_clobber_v12,.-abi_test_clobber_v12
+.globl	abi_test_clobber_v13
+.type	abi_test_clobber_v13,@function
+.align	4
+abi_test_clobber_v13:
+.localentry	abi_test_clobber_v13,0
+
+	vxor	13, 13, 13
+	blr	
+.size	abi_test_clobber_v13,.-abi_test_clobber_v13
+.globl	abi_test_clobber_v14
+.type	abi_test_clobber_v14,@function
+.align	4
+abi_test_clobber_v14:
+.localentry	abi_test_clobber_v14,0
+
+	vxor	14, 14, 14
+	blr	
+.size	abi_test_clobber_v14,.-abi_test_clobber_v14
+.globl	abi_test_clobber_v15
+.type	abi_test_clobber_v15,@function
+.align	4
+abi_test_clobber_v15:
+.localentry	abi_test_clobber_v15,0
+
+	vxor	15, 15, 15
+	blr	
+.size	abi_test_clobber_v15,.-abi_test_clobber_v15
+.globl	abi_test_clobber_v16
+.type	abi_test_clobber_v16,@function
+.align	4
+abi_test_clobber_v16:
+.localentry	abi_test_clobber_v16,0
+
+	vxor	16, 16, 16
+	blr	
+.size	abi_test_clobber_v16,.-abi_test_clobber_v16
+.globl	abi_test_clobber_v17
+.type	abi_test_clobber_v17,@function
+.align	4
+abi_test_clobber_v17:
+.localentry	abi_test_clobber_v17,0
+
+	vxor	17, 17, 17
+	blr	
+.size	abi_test_clobber_v17,.-abi_test_clobber_v17
+.globl	abi_test_clobber_v18
+.type	abi_test_clobber_v18,@function
+.align	4
+abi_test_clobber_v18:
+.localentry	abi_test_clobber_v18,0
+
+	vxor	18, 18, 18
+	blr	
+.size	abi_test_clobber_v18,.-abi_test_clobber_v18
+.globl	abi_test_clobber_v19
+.type	abi_test_clobber_v19,@function
+.align	4
+abi_test_clobber_v19:
+.localentry	abi_test_clobber_v19,0
+
+	vxor	19, 19, 19
+	blr	
+.size	abi_test_clobber_v19,.-abi_test_clobber_v19
+.globl	abi_test_clobber_v20
+.type	abi_test_clobber_v20,@function
+.align	4
+abi_test_clobber_v20:
+.localentry	abi_test_clobber_v20,0
+
+	vxor	20, 20, 20
+	blr	
+.size	abi_test_clobber_v20,.-abi_test_clobber_v20
+.globl	abi_test_clobber_v21
+.type	abi_test_clobber_v21,@function
+.align	4
+abi_test_clobber_v21:
+.localentry	abi_test_clobber_v21,0
+
+	vxor	21, 21, 21
+	blr	
+.size	abi_test_clobber_v21,.-abi_test_clobber_v21
+.globl	abi_test_clobber_v22
+.type	abi_test_clobber_v22,@function
+.align	4
+abi_test_clobber_v22:
+.localentry	abi_test_clobber_v22,0
+
+	vxor	22, 22, 22
+	blr	
+.size	abi_test_clobber_v22,.-abi_test_clobber_v22
+.globl	abi_test_clobber_v23
+.type	abi_test_clobber_v23,@function
+.align	4
+abi_test_clobber_v23:
+.localentry	abi_test_clobber_v23,0
+
+	vxor	23, 23, 23
+	blr	
+.size	abi_test_clobber_v23,.-abi_test_clobber_v23
+.globl	abi_test_clobber_v24
+.type	abi_test_clobber_v24,@function
+.align	4
+abi_test_clobber_v24:
+.localentry	abi_test_clobber_v24,0
+
+	vxor	24, 24, 24
+	blr	
+.size	abi_test_clobber_v24,.-abi_test_clobber_v24
+.globl	abi_test_clobber_v25
+.type	abi_test_clobber_v25,@function
+.align	4
+abi_test_clobber_v25:
+.localentry	abi_test_clobber_v25,0
+
+	vxor	25, 25, 25
+	blr	
+.size	abi_test_clobber_v25,.-abi_test_clobber_v25
+.globl	abi_test_clobber_v26
+.type	abi_test_clobber_v26,@function
+.align	4
+abi_test_clobber_v26:
+.localentry	abi_test_clobber_v26,0
+
+	vxor	26, 26, 26
+	blr	
+.size	abi_test_clobber_v26,.-abi_test_clobber_v26
+.globl	abi_test_clobber_v27
+.type	abi_test_clobber_v27,@function
+.align	4
+abi_test_clobber_v27:
+.localentry	abi_test_clobber_v27,0
+
+	vxor	27, 27, 27
+	blr	
+.size	abi_test_clobber_v27,.-abi_test_clobber_v27
+.globl	abi_test_clobber_v28
+.type	abi_test_clobber_v28,@function
+.align	4
+abi_test_clobber_v28:
+.localentry	abi_test_clobber_v28,0
+
+	vxor	28, 28, 28
+	blr	
+.size	abi_test_clobber_v28,.-abi_test_clobber_v28
+.globl	abi_test_clobber_v29
+.type	abi_test_clobber_v29,@function
+.align	4
+abi_test_clobber_v29:
+.localentry	abi_test_clobber_v29,0
+
+	vxor	29, 29, 29
+	blr	
+.size	abi_test_clobber_v29,.-abi_test_clobber_v29
+.globl	abi_test_clobber_v30
+.type	abi_test_clobber_v30,@function
+.align	4
+abi_test_clobber_v30:
+.localentry	abi_test_clobber_v30,0
+
+	vxor	30, 30, 30
+	blr	
+.size	abi_test_clobber_v30,.-abi_test_clobber_v30
+.globl	abi_test_clobber_v31
+.type	abi_test_clobber_v31,@function
+.align	4
+abi_test_clobber_v31:
+.localentry	abi_test_clobber_v31,0
+
+	vxor	31, 31, 31
+	blr	
+.size	abi_test_clobber_v31,.-abi_test_clobber_v31
+.globl	abi_test_clobber_cr0
+.type	abi_test_clobber_cr0,@function
+.align	4
+abi_test_clobber_cr0:
+.localentry	abi_test_clobber_cr0,0
+
+
+
+	mfcr	0
+	not	0, 0
+	mtcrf	128, 0
+	blr	
+.size	abi_test_clobber_cr0,.-abi_test_clobber_cr0
+.globl	abi_test_clobber_cr1
+.type	abi_test_clobber_cr1,@function
+.align	4
+abi_test_clobber_cr1:
+.localentry	abi_test_clobber_cr1,0
+
+
+
+	mfcr	0
+	not	0, 0
+	mtcrf	64, 0
+	blr	
+.size	abi_test_clobber_cr1,.-abi_test_clobber_cr1
+.globl	abi_test_clobber_cr2
+.type	abi_test_clobber_cr2,@function
+.align	4
+abi_test_clobber_cr2:
+.localentry	abi_test_clobber_cr2,0
+
+
+
+	mfcr	0
+	not	0, 0
+	mtcrf	32, 0
+	blr	
+.size	abi_test_clobber_cr2,.-abi_test_clobber_cr2
+.globl	abi_test_clobber_cr3
+.type	abi_test_clobber_cr3,@function
+.align	4
+abi_test_clobber_cr3:
+.localentry	abi_test_clobber_cr3,0
+
+
+
+	mfcr	0
+	not	0, 0
+	mtcrf	16, 0
+	blr	
+.size	abi_test_clobber_cr3,.-abi_test_clobber_cr3
+.globl	abi_test_clobber_cr4
+.type	abi_test_clobber_cr4,@function
+.align	4
+abi_test_clobber_cr4:
+.localentry	abi_test_clobber_cr4,0
+
+
+
+	mfcr	0
+	not	0, 0
+	mtcrf	8, 0
+	blr	
+.size	abi_test_clobber_cr4,.-abi_test_clobber_cr4
+.globl	abi_test_clobber_cr5
+.type	abi_test_clobber_cr5,@function
+.align	4
+abi_test_clobber_cr5:
+.localentry	abi_test_clobber_cr5,0
+
+
+
+	mfcr	0
+	not	0, 0
+	mtcrf	4, 0
+	blr	
+.size	abi_test_clobber_cr5,.-abi_test_clobber_cr5
+.globl	abi_test_clobber_cr6
+.type	abi_test_clobber_cr6,@function
+.align	4
+abi_test_clobber_cr6:
+.localentry	abi_test_clobber_cr6,0
+
+
+
+	mfcr	0
+	not	0, 0
+	mtcrf	2, 0
+	blr	
+.size	abi_test_clobber_cr6,.-abi_test_clobber_cr6
+.globl	abi_test_clobber_cr7
+.type	abi_test_clobber_cr7,@function
+.align	4
+abi_test_clobber_cr7:
+.localentry	abi_test_clobber_cr7,0
+
+
+
+	mfcr	0
+	not	0, 0
+	mtcrf	1, 0
+	blr	
+.size	abi_test_clobber_cr7,.-abi_test_clobber_cr7
+.globl	abi_test_clobber_ctr
+.type	abi_test_clobber_ctr,@function
+.align	4
+abi_test_clobber_ctr:
+.localentry	abi_test_clobber_ctr,0
+
+	li	0, 0
+	mtctr	0
+	blr	
+.size	abi_test_clobber_ctr,.-abi_test_clobber_ctr
+
+.globl	abi_test_clobber_lr
+.type	abi_test_clobber_lr,@function
+.align	4
+abi_test_clobber_lr:
+.localentry	abi_test_clobber_lr,0
+
+	mflr	0
+	mtctr	0
+	li	0, 0
+	mtlr	0
+	bctr	
+.size	abi_test_clobber_lr,.-abi_test_clobber_lr
+
+#endif  // !OPENSSL_NO_ASM && __powerpc64__
+.section	.note.GNU-stack,"",@progbits
--- /dev/null
+++ b/third_party/boringssl/linux-x86/crypto/chacha/chacha-x86.S
@@ -1,0 +1,975 @@
+// This file is generated from a similarly-named Perl script in the BoringSSL
+// source tree. Do not edit by hand.
+
+#if defined(__i386__)
+#if defined(BORINGSSL_PREFIX)
+#include <boringssl_prefix_symbols_asm.h>
+#endif
+.text
+.globl	ChaCha20_ctr32
+.hidden	ChaCha20_ctr32
+.type	ChaCha20_ctr32,@function
+.align	16
+ChaCha20_ctr32:
+.L_ChaCha20_ctr32_begin:
+	pushl	%ebp
+	pushl	%ebx
+	pushl	%esi
+	pushl	%edi
+	xorl	%eax,%eax
+	cmpl	28(%esp),%eax
+	je	.L000no_data
+	call	.Lpic_point
+.Lpic_point:
+	popl	%eax
+	leal	OPENSSL_ia32cap_P-.Lpic_point(%eax),%ebp
+	testl	$16777216,(%ebp)
+	jz	.L001x86
+	testl	$512,4(%ebp)
+	jz	.L001x86
+	jmp	.Lssse3_shortcut
+.L001x86:
+	movl	32(%esp),%esi
+	movl	36(%esp),%edi
+	subl	$132,%esp
+	movl	(%esi),%eax
+	movl	4(%esi),%ebx
+	movl	8(%esi),%ecx
+	movl	12(%esi),%edx
+	movl	%eax,80(%esp)
+	movl	%ebx,84(%esp)
+	movl	%ecx,88(%esp)
+	movl	%edx,92(%esp)
+	movl	16(%esi),%eax
+	movl	20(%esi),%ebx
+	movl	24(%esi),%ecx
+	movl	28(%esi),%edx
+	movl	%eax,96(%esp)
+	movl	%ebx,100(%esp)
+	movl	%ecx,104(%esp)
+	movl	%edx,108(%esp)
+	movl	(%edi),%eax
+	movl	4(%edi),%ebx
+	movl	8(%edi),%ecx
+	movl	12(%edi),%edx
+	subl	$1,%eax
+	movl	%eax,112(%esp)
+	movl	%ebx,116(%esp)
+	movl	%ecx,120(%esp)
+	movl	%edx,124(%esp)
+	jmp	.L002entry
+.align	16
+.L003outer_loop:
+	movl	%ebx,156(%esp)
+	movl	%eax,152(%esp)
+	movl	%ecx,160(%esp)
+.L002entry:
+	movl	$1634760805,%eax
+	movl	$857760878,4(%esp)
+	movl	$2036477234,8(%esp)
+	movl	$1797285236,12(%esp)
+	movl	84(%esp),%ebx
+	movl	88(%esp),%ebp
+	movl	104(%esp),%ecx
+	movl	108(%esp),%esi
+	movl	116(%esp),%edx
+	movl	120(%esp),%edi
+	movl	%ebx,20(%esp)
+	movl	%ebp,24(%esp)
+	movl	%ecx,40(%esp)
+	movl	%esi,44(%esp)
+	movl	%edx,52(%esp)
+	movl	%edi,56(%esp)
+	movl	92(%esp),%ebx
+	movl	124(%esp),%edi
+	movl	112(%esp),%edx
+	movl	80(%esp),%ebp
+	movl	96(%esp),%ecx
+	movl	100(%esp),%esi
+	addl	$1,%edx
+	movl	%ebx,28(%esp)
+	movl	%edi,60(%esp)
+	movl	%edx,112(%esp)
+	movl	$10,%ebx
+	jmp	.L004loop
+.align	16
+.L004loop:
+	addl	%ebp,%eax
+	movl	%ebx,128(%esp)
+	movl	%ebp,%ebx
+	xorl	%eax,%edx
+	roll	$16,%edx
+	addl	%edx,%ecx
+	xorl	%ecx,%ebx
+	movl	52(%esp),%edi
+	roll	$12,%ebx
+	movl	20(%esp),%ebp
+	addl	%ebx,%eax
+	xorl	%eax,%edx
+	movl	%eax,(%esp)
+	roll	$8,%edx
+	movl	4(%esp),%eax
+	addl	%edx,%ecx
+	movl	%edx,48(%esp)
+	xorl	%ecx,%ebx
+	addl	%ebp,%eax
+	roll	$7,%ebx
+	xorl	%eax,%edi
+	movl	%ecx,32(%esp)
+	roll	$16,%edi
+	movl	%ebx,16(%esp)
+	addl	%edi,%esi
+	movl	40(%esp),%ecx
+	xorl	%esi,%ebp
+	movl	56(%esp),%edx
+	roll	$12,%ebp
+	movl	24(%esp),%ebx
+	addl	%ebp,%eax
+	xorl	%eax,%edi
+	movl	%eax,4(%esp)
+	roll	$8,%edi
+	movl	8(%esp),%eax
+	addl	%edi,%esi
+	movl	%edi,52(%esp)
+	xorl	%esi,%ebp
+	addl	%ebx,%eax
+	roll	$7,%ebp
+	xorl	%eax,%edx
+	movl	%esi,36(%esp)
+	roll	$16,%edx
+	movl	%ebp,20(%esp)
+	addl	%edx,%ecx
+	movl	44(%esp),%esi
+	xorl	%ecx,%ebx
+	movl	60(%esp),%edi
+	roll	$12,%ebx
+	movl	28(%esp),%ebp
+	addl	%ebx,%eax
+	xorl	%eax,%edx
+	movl	%eax,8(%esp)
+	roll	$8,%edx
+	movl	12(%esp),%eax
+	addl	%edx,%ecx
+	movl	%edx,56(%esp)
+	xorl	%ecx,%ebx
+	addl	%ebp,%eax
+	roll	$7,%ebx
+	xorl	%eax,%edi
+	roll	$16,%edi
+	movl	%ebx,24(%esp)
+	addl	%edi,%esi
+	xorl	%esi,%ebp
+	roll	$12,%ebp
+	movl	20(%esp),%ebx
+	addl	%ebp,%eax
+	xorl	%eax,%edi
+	movl	%eax,12(%esp)
+	roll	$8,%edi
+	movl	(%esp),%eax
+	addl	%edi,%esi
+	movl	%edi,%edx
+	xorl	%esi,%ebp
+	addl	%ebx,%eax
+	roll	$7,%ebp
+	xorl	%eax,%edx
+	roll	$16,%edx
+	movl	%ebp,28(%esp)
+	addl	%edx,%ecx
+	xorl	%ecx,%ebx
+	movl	48(%esp),%edi
+	roll	$12,%ebx
+	movl	24(%esp),%ebp
+	addl	%ebx,%eax
+	xorl	%eax,%edx
+	movl	%eax,(%esp)
+	roll	$8,%edx
+	movl	4(%esp),%eax
+	addl	%edx,%ecx
+	movl	%edx,60(%esp)
+	xorl	%ecx,%ebx
+	addl	%ebp,%eax
+	roll	$7,%ebx
+	xorl	%eax,%edi
+	movl	%ecx,40(%esp)
+	roll	$16,%edi
+	movl	%ebx,20(%esp)
+	addl	%edi,%esi
+	movl	32(%esp),%ecx
+	xorl	%esi,%ebp
+	movl	52(%esp),%edx
+	roll	$12,%ebp
+	movl	28(%esp),%ebx
+	addl	%ebp,%eax
+	xorl	%eax,%edi
+	movl	%eax,4(%esp)
+	roll	$8,%edi
+	movl	8(%esp),%eax
+	addl	%edi,%esi
+	movl	%edi,48(%esp)
+	xorl	%esi,%ebp
+	addl	%ebx,%eax
+	roll	$7,%ebp
+	xorl	%eax,%edx
+	movl	%esi,44(%esp)
+	roll	$16,%edx
+	movl	%ebp,24(%esp)
+	addl	%edx,%ecx
+	movl	36(%esp),%esi
+	xorl	%ecx,%ebx
+	movl	56(%esp),%edi
+	roll	$12,%ebx
+	movl	16(%esp),%ebp
+	addl	%ebx,%eax
+	xorl	%eax,%edx
+	movl	%eax,8(%esp)
+	roll	$8,%edx
+	movl	12(%esp),%eax
+	addl	%edx,%ecx
+	movl	%edx,52(%esp)
+	xorl	%ecx,%ebx
+	addl	%ebp,%eax
+	roll	$7,%ebx
+	xorl	%eax,%edi
+	roll	$16,%edi
+	movl	%ebx,28(%esp)
+	addl	%edi,%esi
+	xorl	%esi,%ebp
+	movl	48(%esp),%edx
+	roll	$12,%ebp
+	movl	128(%esp),%ebx
+	addl	%ebp,%eax
+	xorl	%eax,%edi
+	movl	%eax,12(%esp)
+	roll	$8,%edi
+	movl	(%esp),%eax
+	addl	%edi,%esi
+	movl	%edi,56(%esp)
+	xorl	%esi,%ebp
+	roll	$7,%ebp
+	decl	%ebx
+	jnz	.L004loop
+	movl	160(%esp),%ebx
+	addl	$1634760805,%eax
+	addl	80(%esp),%ebp
+	addl	96(%esp),%ecx
+	addl	100(%esp),%esi
+	cmpl	$64,%ebx
+	jb	.L005tail
+	movl	156(%esp),%ebx
+	addl	112(%esp),%edx
+	addl	120(%esp),%edi
+	xorl	(%ebx),%eax
+	xorl	16(%ebx),%ebp
+	movl	%eax,(%esp)
+	movl	152(%esp),%eax
+	xorl	32(%ebx),%ecx
+	xorl	36(%ebx),%esi
+	xorl	48(%ebx),%edx
+	xorl	56(%ebx),%edi
+	movl	%ebp,16(%eax)
+	movl	%ecx,32(%eax)
+	movl	%esi,36(%eax)
+	movl	%edx,48(%eax)
+	movl	%edi,56(%eax)
+	movl	4(%esp),%ebp
+	movl	8(%esp),%ecx
+	movl	12(%esp),%esi
+	movl	20(%esp),%edx
+	movl	24(%esp),%edi
+	addl	$857760878,%ebp
+	addl	$2036477234,%ecx
+	addl	$1797285236,%esi
+	addl	84(%esp),%edx
+	addl	88(%esp),%edi
+	xorl	4(%ebx),%ebp
+	xorl	8(%ebx),%ecx
+	xorl	12(%ebx),%esi
+	xorl	20(%ebx),%edx
+	xorl	24(%ebx),%edi
+	movl	%ebp,4(%eax)
+	movl	%ecx,8(%eax)
+	movl	%esi,12(%eax)
+	movl	%edx,20(%eax)
+	movl	%edi,24(%eax)
+	movl	28(%esp),%ebp
+	movl	40(%esp),%ecx
+	movl	44(%esp),%esi
+	movl	52(%esp),%edx
+	movl	60(%esp),%edi
+	addl	92(%esp),%ebp
+	addl	104(%esp),%ecx
+	addl	108(%esp),%esi
+	addl	116(%esp),%edx
+	addl	124(%esp),%edi
+	xorl	28(%ebx),%ebp
+	xorl	40(%ebx),%ecx
+	xorl	44(%ebx),%esi
+	xorl	52(%ebx),%edx
+	xorl	60(%ebx),%edi
+	leal	64(%ebx),%ebx
+	movl	%ebp,28(%eax)
+	movl	(%esp),%ebp
+	movl	%ecx,40(%eax)
+	movl	160(%esp),%ecx
+	movl	%esi,44(%eax)
+	movl	%edx,52(%eax)
+	movl	%edi,60(%eax)
+	movl	%ebp,(%eax)
+	leal	64(%eax),%eax
+	subl	$64,%ecx
+	jnz	.L003outer_loop
+	jmp	.L006done
+.L005tail:
+	addl	112(%esp),%edx
+	addl	120(%esp),%edi
+	movl	%eax,(%esp)
+	movl	%ebp,16(%esp)
+	movl	%ecx,32(%esp)
+	movl	%esi,36(%esp)
+	movl	%edx,48(%esp)
+	movl	%edi,56(%esp)
+	movl	4(%esp),%ebp
+	movl	8(%esp),%ecx
+	movl	12(%esp),%esi
+	movl	20(%esp),%edx
+	movl	24(%esp),%edi
+	addl	$857760878,%ebp
+	addl	$2036477234,%ecx
+	addl	$1797285236,%esi
+	addl	84(%esp),%edx
+	addl	88(%esp),%edi
+	movl	%ebp,4(%esp)
+	movl	%ecx,8(%esp)
+	movl	%esi,12(%esp)
+	movl	%edx,20(%esp)
+	movl	%edi,24(%esp)
+	movl	28(%esp),%ebp
+	movl	40(%esp),%ecx
+	movl	44(%esp),%esi
+	movl	52(%esp),%edx
+	movl	60(%esp),%edi
+	addl	92(%esp),%ebp
+	addl	104(%esp),%ecx
+	addl	108(%esp),%esi
+	addl	116(%esp),%edx
+	addl	124(%esp),%edi
+	movl	%ebp,28(%esp)
+	movl	156(%esp),%ebp
+	movl	%ecx,40(%esp)
+	movl	152(%esp),%ecx
+	movl	%esi,44(%esp)
+	xorl	%esi,%esi
+	movl	%edx,52(%esp)
+	movl	%edi,60(%esp)
+	xorl	%eax,%eax
+	xorl	%edx,%edx
+.L007tail_loop:
+	movb	(%esi,%ebp,1),%al
+	movb	(%esp,%esi,1),%dl
+	leal	1(%esi),%esi
+	xorb	%dl,%al
+	movb	%al,-1(%ecx,%esi,1)
+	decl	%ebx
+	jnz	.L007tail_loop
+.L006done:
+	addl	$132,%esp
+.L000no_data:
+	popl	%edi
+	popl	%esi
+	popl	%ebx
+	popl	%ebp
+	ret
+.size	ChaCha20_ctr32,.-.L_ChaCha20_ctr32_begin
+.globl	ChaCha20_ssse3
+.hidden	ChaCha20_ssse3
+.type	ChaCha20_ssse3,@function
+.align	16
+ChaCha20_ssse3:
+.L_ChaCha20_ssse3_begin:
+	pushl	%ebp
+	pushl	%ebx
+	pushl	%esi
+	pushl	%edi
+.Lssse3_shortcut:
+	movl	20(%esp),%edi
+	movl	24(%esp),%esi
+	movl	28(%esp),%ecx
+	movl	32(%esp),%edx
+	movl	36(%esp),%ebx
+	movl	%esp,%ebp
+	subl	$524,%esp
+	andl	$-64,%esp
+	movl	%ebp,512(%esp)
+	leal	.Lssse3_data-.Lpic_point(%eax),%eax
+	movdqu	(%ebx),%xmm3
+	cmpl	$256,%ecx
+	jb	.L0081x
+	movl	%edx,516(%esp)
+	movl	%ebx,520(%esp)
+	subl	$256,%ecx
+	leal	384(%esp),%ebp
+	movdqu	(%edx),%xmm7
+	pshufd	$0,%xmm3,%xmm0
+	pshufd	$85,%xmm3,%xmm1
+	pshufd	$170,%xmm3,%xmm2
+	pshufd	$255,%xmm3,%xmm3
+	paddd	48(%eax),%xmm0
+	pshufd	$0,%xmm7,%xmm4
+	pshufd	$85,%xmm7,%xmm5
+	psubd	64(%eax),%xmm0
+	pshufd	$170,%xmm7,%xmm6
+	pshufd	$255,%xmm7,%xmm7
+	movdqa	%xmm0,64(%ebp)
+	movdqa	%xmm1,80(%ebp)
+	movdqa	%xmm2,96(%ebp)
+	movdqa	%xmm3,112(%ebp)
+	movdqu	16(%edx),%xmm3
+	movdqa	%xmm4,-64(%ebp)
+	movdqa	%xmm5,-48(%ebp)
+	movdqa	%xmm6,-32(%ebp)
+	movdqa	%xmm7,-16(%ebp)
+	movdqa	32(%eax),%xmm7
+	leal	128(%esp),%ebx
+	pshufd	$0,%xmm3,%xmm0
+	pshufd	$85,%xmm3,%xmm1
+	pshufd	$170,%xmm3,%xmm2
+	pshufd	$255,%xmm3,%xmm3
+	pshufd	$0,%xmm7,%xmm4
+	pshufd	$85,%xmm7,%xmm5
+	pshufd	$170,%xmm7,%xmm6
+	pshufd	$255,%xmm7,%xmm7
+	movdqa	%xmm0,(%ebp)
+	movdqa	%xmm1,16(%ebp)
+	movdqa	%xmm2,32(%ebp)
+	movdqa	%xmm3,48(%ebp)
+	movdqa	%xmm4,-128(%ebp)
+	movdqa	%xmm5,-112(%ebp)
+	movdqa	%xmm6,-96(%ebp)
+	movdqa	%xmm7,-80(%ebp)
+	leal	128(%esi),%esi
+	leal	128(%edi),%edi
+	jmp	.L009outer_loop
+.align	16
+.L009outer_loop:
+	movdqa	-112(%ebp),%xmm1
+	movdqa	-96(%ebp),%xmm2
+	movdqa	-80(%ebp),%xmm3
+	movdqa	-48(%ebp),%xmm5
+	movdqa	-32(%ebp),%xmm6
+	movdqa	-16(%ebp),%xmm7
+	movdqa	%xmm1,-112(%ebx)
+	movdqa	%xmm2,-96(%ebx)
+	movdqa	%xmm3,-80(%ebx)
+	movdqa	%xmm5,-48(%ebx)
+	movdqa	%xmm6,-32(%ebx)
+	movdqa	%xmm7,-16(%ebx)
+	movdqa	32(%ebp),%xmm2
+	movdqa	48(%ebp),%xmm3
+	movdqa	64(%ebp),%xmm4
+	movdqa	80(%ebp),%xmm5
+	movdqa	96(%ebp),%xmm6
+	movdqa	112(%ebp),%xmm7
+	paddd	64(%eax),%xmm4
+	movdqa	%xmm2,32(%ebx)
+	movdqa	%xmm3,48(%ebx)
+	movdqa	%xmm4,64(%ebx)
+	movdqa	%xmm5,80(%ebx)
+	movdqa	%xmm6,96(%ebx)
+	movdqa	%xmm7,112(%ebx)
+	movdqa	%xmm4,64(%ebp)
+	movdqa	-128(%ebp),%xmm0
+	movdqa	%xmm4,%xmm6
+	movdqa	-64(%ebp),%xmm3
+	movdqa	(%ebp),%xmm4
+	movdqa	16(%ebp),%xmm5
+	movl	$10,%edx
+	nop
+.align	16
+.L010loop:
+	paddd	%xmm3,%xmm0
+	movdqa	%xmm3,%xmm2
+	pxor	%xmm0,%xmm6
+	pshufb	(%eax),%xmm6
+	paddd	%xmm6,%xmm4
+	pxor	%xmm4,%xmm2
+	movdqa	-48(%ebx),%xmm3
+	movdqa	%xmm2,%xmm1
+	pslld	$12,%xmm2
+	psrld	$20,%xmm1
+	por	%xmm1,%xmm2
+	movdqa	-112(%ebx),%xmm1
+	paddd	%xmm2,%xmm0
+	movdqa	80(%ebx),%xmm7
+	pxor	%xmm0,%xmm6
+	movdqa	%xmm0,-128(%ebx)
+	pshufb	16(%eax),%xmm6
+	paddd	%xmm6,%xmm4
+	movdqa	%xmm6,64(%ebx)
+	pxor	%xmm4,%xmm2
+	paddd	%xmm3,%xmm1
+	movdqa	%xmm2,%xmm0
+	pslld	$7,%xmm2
+	psrld	$25,%xmm0
+	pxor	%xmm1,%xmm7
+	por	%xmm0,%xmm2
+	movdqa	%xmm4,(%ebx)
+	pshufb	(%eax),%xmm7
+	movdqa	%xmm2,-64(%ebx)
+	paddd	%xmm7,%xmm5
+	movdqa	32(%ebx),%xmm4
+	pxor	%xmm5,%xmm3
+	movdqa	-32(%ebx),%xmm2
+	movdqa	%xmm3,%xmm0
+	pslld	$12,%xmm3
+	psrld	$20,%xmm0
+	por	%xmm0,%xmm3
+	movdqa	-96(%ebx),%xmm0
+	paddd	%xmm3,%xmm1
+	movdqa	96(%ebx),%xmm6
+	pxor	%xmm1,%xmm7
+	movdqa	%xmm1,-112(%ebx)
+	pshufb	16(%eax),%xmm7
+	paddd	%xmm7,%xmm5
+	movdqa	%xmm7,80(%ebx)
+	pxor	%xmm5,%xmm3
+	paddd	%xmm2,%xmm0
+	movdqa	%xmm3,%xmm1
+	pslld	$7,%xmm3
+	psrld	$25,%xmm1
+	pxor	%xmm0,%xmm6
+	por	%xmm1,%xmm3
+	movdqa	%xmm5,16(%ebx)
+	pshufb	(%eax),%xmm6
+	movdqa	%xmm3,-48(%ebx)
+	paddd	%xmm6,%xmm4
+	movdqa	48(%ebx),%xmm5
+	pxor	%xmm4,%xmm2
+	movdqa	-16(%ebx),%xmm3
+	movdqa	%xmm2,%xmm1
+	pslld	$12,%xmm2
+	psrld	$20,%xmm1
+	por	%xmm1,%xmm2
+	movdqa	-80(%ebx),%xmm1
+	paddd	%xmm2,%xmm0
+	movdqa	112(%ebx),%xmm7
+	pxor	%xmm0,%xmm6
+	movdqa	%xmm0,-96(%ebx)
+	pshufb	16(%eax),%xmm6
+	paddd	%xmm6,%xmm4
+	movdqa	%xmm6,96(%ebx)
+	pxor	%xmm4,%xmm2
+	paddd	%xmm3,%xmm1
+	movdqa	%xmm2,%xmm0
+	pslld	$7,%xmm2
+	psrld	$25,%xmm0
+	pxor	%xmm1,%xmm7
+	por	%xmm0,%xmm2
+	pshufb	(%eax),%xmm7
+	movdqa	%xmm2,-32(%ebx)
+	paddd	%xmm7,%xmm5
+	pxor	%xmm5,%xmm3
+	movdqa	-48(%ebx),%xmm2
+	movdqa	%xmm3,%xmm0
+	pslld	$12,%xmm3
+	psrld	$20,%xmm0
+	por	%xmm0,%xmm3
+	movdqa	-128(%ebx),%xmm0
+	paddd	%xmm3,%xmm1
+	pxor	%xmm1,%xmm7
+	movdqa	%xmm1,-80(%ebx)
+	pshufb	16(%eax),%xmm7
+	paddd	%xmm7,%xmm5
+	movdqa	%xmm7,%xmm6
+	pxor	%xmm5,%xmm3
+	paddd	%xmm2,%xmm0
+	movdqa	%xmm3,%xmm1
+	pslld	$7,%xmm3
+	psrld	$25,%xmm1
+	pxor	%xmm0,%xmm6
+	por	%xmm1,%xmm3
+	pshufb	(%eax),%xmm6
+	movdqa	%xmm3,-16(%ebx)
+	paddd	%xmm6,%xmm4
+	pxor	%xmm4,%xmm2
+	movdqa	-32(%ebx),%xmm3
+	movdqa	%xmm2,%xmm1
+	pslld	$12,%xmm2
+	psrld	$20,%xmm1
+	por	%xmm1,%xmm2
+	movdqa	-112(%ebx),%xmm1
+	paddd	%xmm2,%xmm0
+	movdqa	64(%ebx),%xmm7
+	pxor	%xmm0,%xmm6
+	movdqa	%xmm0,-128(%ebx)
+	pshufb	16(%eax),%xmm6
+	paddd	%xmm6,%xmm4
+	movdqa	%xmm6,112(%ebx)
+	pxor	%xmm4,%xmm2
+	paddd	%xmm3,%xmm1
+	movdqa	%xmm2,%xmm0
+	pslld	$7,%xmm2
+	psrld	$25,%xmm0
+	pxor	%xmm1,%xmm7
+	por	%xmm0,%xmm2
+	movdqa	%xmm4,32(%ebx)
+	pshufb	(%eax),%xmm7
+	movdqa	%xmm2,-48(%ebx)
+	paddd	%xmm7,%xmm5
+	movdqa	(%ebx),%xmm4
+	pxor	%xmm5,%xmm3
+	movdqa	-16(%ebx),%xmm2
+	movdqa	%xmm3,%xmm0
+	pslld	$12,%xmm3
+	psrld	$20,%xmm0
+	por	%xmm0,%xmm3
+	movdqa	-96(%ebx),%xmm0
+	paddd	%xmm3,%xmm1
+	movdqa	80(%ebx),%xmm6
+	pxor	%xmm1,%xmm7
+	movdqa	%xmm1,-112(%ebx)
+	pshufb	16(%eax),%xmm7
+	paddd	%xmm7,%xmm5
+	movdqa	%xmm7,64(%ebx)
+	pxor	%xmm5,%xmm3
+	paddd	%xmm2,%xmm0
+	movdqa	%xmm3,%xmm1
+	pslld	$7,%xmm3
+	psrld	$25,%xmm1
+	pxor	%xmm0,%xmm6
+	por	%xmm1,%xmm3
+	movdqa	%xmm5,48(%ebx)
+	pshufb	(%eax),%xmm6
+	movdqa	%xmm3,-32(%ebx)
+	paddd	%xmm6,%xmm4
+	movdqa	16(%ebx),%xmm5
+	pxor	%xmm4,%xmm2
+	movdqa	-64(%ebx),%xmm3
+	movdqa	%xmm2,%xmm1
+	pslld	$12,%xmm2
+	psrld	$20,%xmm1
+	por	%xmm1,%xmm2
+	movdqa	-80(%ebx),%xmm1
+	paddd	%xmm2,%xmm0
+	movdqa	96(%ebx),%xmm7
+	pxor	%xmm0,%xmm6
+	movdqa	%xmm0,-96(%ebx)
+	pshufb	16(%eax),%xmm6
+	paddd	%xmm6,%xmm4
+	movdqa	%xmm6,80(%ebx)
+	pxor	%xmm4,%xmm2
+	paddd	%xmm3,%xmm1
+	movdqa	%xmm2,%xmm0
+	pslld	$7,%xmm2
+	psrld	$25,%xmm0
+	pxor	%xmm1,%xmm7
+	por	%xmm0,%xmm2
+	pshufb	(%eax),%xmm7
+	movdqa	%xmm2,-16(%ebx)
+	paddd	%xmm7,%xmm5
+	pxor	%xmm5,%xmm3
+	movdqa	%xmm3,%xmm0
+	pslld	$12,%xmm3
+	psrld	$20,%xmm0
+	por	%xmm0,%xmm3
+	movdqa	-128(%ebx),%xmm0
+	paddd	%xmm3,%xmm1
+	movdqa	64(%ebx),%xmm6
+	pxor	%xmm1,%xmm7
+	movdqa	%xmm1,-80(%ebx)
+	pshufb	16(%eax),%xmm7
+	paddd	%xmm7,%xmm5
+	movdqa	%xmm7,96(%ebx)
+	pxor	%xmm5,%xmm3
+	movdqa	%xmm3,%xmm1
+	pslld	$7,%xmm3
+	psrld	$25,%xmm1
+	por	%xmm1,%xmm3
+	decl	%edx
+	jnz	.L010loop
+	movdqa	%xmm3,-64(%ebx)
+	movdqa	%xmm4,(%ebx)
+	movdqa	%xmm5,16(%ebx)
+	movdqa	%xmm6,64(%ebx)
+	movdqa	%xmm7,96(%ebx)
+	movdqa	-112(%ebx),%xmm1
+	movdqa	-96(%ebx),%xmm2
+	movdqa	-80(%ebx),%xmm3
+	paddd	-128(%ebp),%xmm0
+	paddd	-112(%ebp),%xmm1
+	paddd	-96(%ebp),%xmm2
+	paddd	-80(%ebp),%xmm3
+	movdqa	%xmm0,%xmm6
+	punpckldq	%xmm1,%xmm0
+	movdqa	%xmm2,%xmm7
+	punpckldq	%xmm3,%xmm2
+	punpckhdq	%xmm1,%xmm6
+	punpckhdq	%xmm3,%xmm7
+	movdqa	%xmm0,%xmm1
+	punpcklqdq	%xmm2,%xmm0
+	movdqa	%xmm6,%xmm3
+	punpcklqdq	%xmm7,%xmm6
+	punpckhqdq	%xmm2,%xmm1
+	punpckhqdq	%xmm7,%xmm3
+	movdqu	-128(%esi),%xmm4
+	movdqu	-64(%esi),%xmm5
+	movdqu	(%esi),%xmm2
+	movdqu	64(%esi),%xmm7
+	leal	16(%esi),%esi
+	pxor	%xmm0,%xmm4
+	movdqa	-64(%ebx),%xmm0
+	pxor	%xmm1,%xmm5
+	movdqa	-48(%ebx),%xmm1
+	pxor	%xmm2,%xmm6
+	movdqa	-32(%ebx),%xmm2
+	pxor	%xmm3,%xmm7
+	movdqa	-16(%ebx),%xmm3
+	movdqu	%xmm4,-128(%edi)
+	movdqu	%xmm5,-64(%edi)
+	movdqu	%xmm6,(%edi)
+	movdqu	%xmm7,64(%edi)
+	leal	16(%edi),%edi
+	paddd	-64(%ebp),%xmm0
+	paddd	-48(%ebp),%xmm1
+	paddd	-32(%ebp),%xmm2
+	paddd	-16(%ebp),%xmm3
+	movdqa	%xmm0,%xmm6
+	punpckldq	%xmm1,%xmm0
+	movdqa	%xmm2,%xmm7
+	punpckldq	%xmm3,%xmm2
+	punpckhdq	%xmm1,%xmm6
+	punpckhdq	%xmm3,%xmm7
+	movdqa	%xmm0,%xmm1
+	punpcklqdq	%xmm2,%xmm0
+	movdqa	%xmm6,%xmm3
+	punpcklqdq	%xmm7,%xmm6
+	punpckhqdq	%xmm2,%xmm1
+	punpckhqdq	%xmm7,%xmm3
+	movdqu	-128(%esi),%xmm4
+	movdqu	-64(%esi),%xmm5
+	movdqu	(%esi),%xmm2
+	movdqu	64(%esi),%xmm7
+	leal	16(%esi),%esi
+	pxor	%xmm0,%xmm4
+	movdqa	(%ebx),%xmm0
+	pxor	%xmm1,%xmm5
+	movdqa	16(%ebx),%xmm1
+	pxor	%xmm2,%xmm6
+	movdqa	32(%ebx),%xmm2
+	pxor	%xmm3,%xmm7
+	movdqa	48(%ebx),%xmm3
+	movdqu	%xmm4,-128(%edi)
+	movdqu	%xmm5,-64(%edi)
+	movdqu	%xmm6,(%edi)
+	movdqu	%xmm7,64(%edi)
+	leal	16(%edi),%edi
+	paddd	(%ebp),%xmm0
+	paddd	16(%ebp),%xmm1
+	paddd	32(%ebp),%xmm2
+	paddd	48(%ebp),%xmm3
+	movdqa	%xmm0,%xmm6
+	punpckldq	%xmm1,%xmm0
+	movdqa	%xmm2,%xmm7
+	punpckldq	%xmm3,%xmm2
+	punpckhdq	%xmm1,%xmm6
+	punpckhdq	%xmm3,%xmm7
+	movdqa	%xmm0,%xmm1
+	punpcklqdq	%xmm2,%xmm0
+	movdqa	%xmm6,%xmm3
+	punpcklqdq	%xmm7,%xmm6
+	punpckhqdq	%xmm2,%xmm1
+	punpckhqdq	%xmm7,%xmm3
+	movdqu	-128(%esi),%xmm4
+	movdqu	-64(%esi),%xmm5
+	movdqu	(%esi),%xmm2
+	movdqu	64(%esi),%xmm7
+	leal	16(%esi),%esi
+	pxor	%xmm0,%xmm4
+	movdqa	64(%ebx),%xmm0
+	pxor	%xmm1,%xmm5
+	movdqa	80(%ebx),%xmm1
+	pxor	%xmm2,%xmm6
+	movdqa	96(%ebx),%xmm2
+	pxor	%xmm3,%xmm7
+	movdqa	112(%ebx),%xmm3
+	movdqu	%xmm4,-128(%edi)
+	movdqu	%xmm5,-64(%edi)
+	movdqu	%xmm6,(%edi)
+	movdqu	%xmm7,64(%edi)
+	leal	16(%edi),%edi
+	paddd	64(%ebp),%xmm0
+	paddd	80(%ebp),%xmm1
+	paddd	96(%ebp),%xmm2
+	paddd	112(%ebp),%xmm3
+	movdqa	%xmm0,%xmm6
+	punpckldq	%xmm1,%xmm0
+	movdqa	%xmm2,%xmm7
+	punpckldq	%xmm3,%xmm2
+	punpckhdq	%xmm1,%xmm6
+	punpckhdq	%xmm3,%xmm7
+	movdqa	%xmm0,%xmm1
+	punpcklqdq	%xmm2,%xmm0
+	movdqa	%xmm6,%xmm3
+	punpcklqdq	%xmm7,%xmm6
+	punpckhqdq	%xmm2,%xmm1
+	punpckhqdq	%xmm7,%xmm3
+	movdqu	-128(%esi),%xmm4
+	movdqu	-64(%esi),%xmm5
+	movdqu	(%esi),%xmm2
+	movdqu	64(%esi),%xmm7
+	leal	208(%esi),%esi
+	pxor	%xmm0,%xmm4
+	pxor	%xmm1,%xmm5
+	pxor	%xmm2,%xmm6
+	pxor	%xmm3,%xmm7
+	movdqu	%xmm4,-128(%edi)
+	movdqu	%xmm5,-64(%edi)
+	movdqu	%xmm6,(%edi)
+	movdqu	%xmm7,64(%edi)
+	leal	208(%edi),%edi
+	subl	$256,%ecx
+	jnc	.L009outer_loop
+	addl	$256,%ecx
+	jz	.L011done
+	movl	520(%esp),%ebx
+	leal	-128(%esi),%esi
+	movl	516(%esp),%edx
+	leal	-128(%edi),%edi
+	movd	64(%ebp),%xmm2
+	movdqu	(%ebx),%xmm3
+	paddd	96(%eax),%xmm2
+	pand	112(%eax),%xmm3
+	por	%xmm2,%xmm3
+.L0081x:
+	movdqa	32(%eax),%xmm0
+	movdqu	(%edx),%xmm1
+	movdqu	16(%edx),%xmm2
+	movdqa	(%eax),%xmm6
+	movdqa	16(%eax),%xmm7
+	movl	%ebp,48(%esp)
+	movdqa	%xmm0,(%esp)
+	movdqa	%xmm1,16(%esp)
+	movdqa	%xmm2,32(%esp)
+	movdqa	%xmm3,48(%esp)
+	movl	$10,%edx
+	jmp	.L012loop1x
+.align	16
+.L013outer1x:
+	movdqa	80(%eax),%xmm3
+	movdqa	(%esp),%xmm0
+	movdqa	16(%esp),%xmm1
+	movdqa	32(%esp),%xmm2
+	paddd	48(%esp),%xmm3
+	movl	$10,%edx
+	movdqa	%xmm3,48(%esp)
+	jmp	.L012loop1x
+.align	16
+.L012loop1x:
+	paddd	%xmm1,%xmm0
+	pxor	%xmm0,%xmm3
+.byte	102,15,56,0,222
+	paddd	%xmm3,%xmm2
+	pxor	%xmm2,%xmm1
+	movdqa	%xmm1,%xmm4
+	psrld	$20,%xmm1
+	pslld	$12,%xmm4
+	por	%xmm4,%xmm1
+	paddd	%xmm1,%xmm0
+	pxor	%xmm0,%xmm3
+.byte	102,15,56,0,223
+	paddd	%xmm3,%xmm2
+	pxor	%xmm2,%xmm1
+	movdqa	%xmm1,%xmm4
+	psrld	$25,%xmm1
+	pslld	$7,%xmm4
+	por	%xmm4,%xmm1
+	pshufd	$78,%xmm2,%xmm2
+	pshufd	$57,%xmm1,%xmm1
+	pshufd	$147,%xmm3,%xmm3
+	nop
+	paddd	%xmm1,%xmm0
+	pxor	%xmm0,%xmm3
+.byte	102,15,56,0,222
+	paddd	%xmm3,%xmm2
+	pxor	%xmm2,%xmm1
+	movdqa	%xmm1,%xmm4
+	psrld	$20,%xmm1
+	pslld	$12,%xmm4
+	por	%xmm4,%xmm1
+	paddd	%xmm1,%xmm0
+	pxor	%xmm0,%xmm3
+.byte	102,15,56,0,223
+	paddd	%xmm3,%xmm2
+	pxor	%xmm2,%xmm1
+	movdqa	%xmm1,%xmm4
+	psrld	$25,%xmm1
+	pslld	$7,%xmm4
+	por	%xmm4,%xmm1
+	pshufd	$78,%xmm2,%xmm2
+	pshufd	$147,%xmm1,%xmm1
+	pshufd	$57,%xmm3,%xmm3
+	decl	%edx
+	jnz	.L012loop1x
+	paddd	(%esp),%xmm0
+	paddd	16(%esp),%xmm1
+	paddd	32(%esp),%xmm2
+	paddd	48(%esp),%xmm3
+	cmpl	$64,%ecx
+	jb	.L014tail
+	movdqu	(%esi),%xmm4
+	movdqu	16(%esi),%xmm5
+	pxor	%xmm4,%xmm0
+	movdqu	32(%esi),%xmm4
+	pxor	%xmm5,%xmm1
+	movdqu	48(%esi),%xmm5
+	pxor	%xmm4,%xmm2
+	pxor	%xmm5,%xmm3
+	leal	64(%esi),%esi
+	movdqu	%xmm0,(%edi)
+	movdqu	%xmm1,16(%edi)
+	movdqu	%xmm2,32(%edi)
+	movdqu	%xmm3,48(%edi)
+	leal	64(%edi),%edi
+	subl	$64,%ecx
+	jnz	.L013outer1x
+	jmp	.L011done
+.L014tail:
+	movdqa	%xmm0,(%esp)
+	movdqa	%xmm1,16(%esp)
+	movdqa	%xmm2,32(%esp)
+	movdqa	%xmm3,48(%esp)
+	xorl	%eax,%eax
+	xorl	%edx,%edx
+	xorl	%ebp,%ebp
+.L015tail_loop:
+	movb	(%esp,%ebp,1),%al
+	movb	(%esi,%ebp,1),%dl
+	leal	1(%ebp),%ebp
+	xorb	%dl,%al
+	movb	%al,-1(%edi,%ebp,1)
+	decl	%ecx
+	jnz	.L015tail_loop
+.L011done:
+	movl	512(%esp),%esp
+	popl	%edi
+	popl	%esi
+	popl	%ebx
+	popl	%ebp
+	ret
+.size	ChaCha20_ssse3,.-.L_ChaCha20_ssse3_begin
+.align	64
+.Lssse3_data:
+.byte	2,3,0,1,6,7,4,5,10,11,8,9,14,15,12,13
+.byte	3,0,1,2,7,4,5,6,11,8,9,10,15,12,13,14
+.long	1634760805,857760878,2036477234,1797285236
+.long	0,1,2,3
+.long	4,4,4,4
+.long	1,0,0,0
+.long	4,0,0,0
+.long	0,-1,-1,-1
+.align	64
+.byte	67,104,97,67,104,97,50,48,32,102,111,114,32,120,56,54
+.byte	44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32
+.byte	60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111
+.byte	114,103,62,0
+#endif
+.section	.note.GNU-stack,"",@progbits
--- /dev/null
+++ b/third_party/boringssl/linux-x86/crypto/fipsmodule/aesni-x86.S
@@ -1,0 +1,2513 @@
+// This file is generated from a similarly-named Perl script in the BoringSSL
+// source tree. Do not edit by hand.
+
+#if defined(__i386__)
+#if defined(BORINGSSL_PREFIX)
+#include <boringssl_prefix_symbols_asm.h>
+#endif
+.text
+#ifdef BORINGSSL_DISPATCH_TEST
+#endif
+.globl	aes_hw_encrypt
+.hidden	aes_hw_encrypt
+.type	aes_hw_encrypt,@function
+.align	16
+aes_hw_encrypt:
+.L_aes_hw_encrypt_begin:
+#ifdef BORINGSSL_DISPATCH_TEST
+	pushl	%ebx
+	pushl	%edx
+	call	.L000pic
+.L000pic:
+	popl	%ebx
+	leal	BORINGSSL_function_hit+1-.L000pic(%ebx),%ebx
+	movl	$1,%edx
+	movb	%dl,(%ebx)
+	popl	%edx
+	popl	%ebx
+#endif
+	movl	4(%esp),%eax
+	movl	12(%esp),%edx
+	movups	(%eax),%xmm2
+	movl	240(%edx),%ecx
+	movl	8(%esp),%eax
+	movups	(%edx),%xmm0
+	movups	16(%edx),%xmm1
+	leal	32(%edx),%edx
+	xorps	%xmm0,%xmm2
+.L001enc1_loop_1:
+.byte	102,15,56,220,209
+	decl	%ecx
+	movups	(%edx),%xmm1
+	leal	16(%edx),%edx
+	jnz	.L001enc1_loop_1
+.byte	102,15,56,221,209
+	pxor	%xmm0,%xmm0
+	pxor	%xmm1,%xmm1
+	movups	%xmm2,(%eax)
+	pxor	%xmm2,%xmm2
+	ret
+.size	aes_hw_encrypt,.-.L_aes_hw_encrypt_begin
+.globl	aes_hw_decrypt
+.hidden	aes_hw_decrypt
+.type	aes_hw_decrypt,@function
+.align	16
+aes_hw_decrypt:
+.L_aes_hw_decrypt_begin:
+	movl	4(%esp),%eax
+	movl	12(%esp),%edx
+	movups	(%eax),%xmm2
+	movl	240(%edx),%ecx
+	movl	8(%esp),%eax
+	movups	(%edx),%xmm0
+	movups	16(%edx),%xmm1
+	leal	32(%edx),%edx
+	xorps	%xmm0,%xmm2
+.L002dec1_loop_2:
+.byte	102,15,56,222,209
+	decl	%ecx
+	movups	(%edx),%xmm1
+	leal	16(%edx),%edx
+	jnz	.L002dec1_loop_2
+.byte	102,15,56,223,209
+	pxor	%xmm0,%xmm0
+	pxor	%xmm1,%xmm1
+	movups	%xmm2,(%eax)
+	pxor	%xmm2,%xmm2
+	ret
+.size	aes_hw_decrypt,.-.L_aes_hw_decrypt_begin
+.hidden	_aesni_encrypt2
+.type	_aesni_encrypt2,@function
+.align	16
+_aesni_encrypt2:
+	movups	(%edx),%xmm0
+	shll	$4,%ecx
+	movups	16(%edx),%xmm1
+	xorps	%xmm0,%xmm2
+	pxor	%xmm0,%xmm3
+	movups	32(%edx),%xmm0
+	leal	32(%edx,%ecx,1),%edx
+	negl	%ecx
+	addl	$16,%ecx
+.L003enc2_loop:
+.byte	102,15,56,220,209
+.byte	102,15,56,220,217
+	movups	(%edx,%ecx,1),%xmm1
+	addl	$32,%ecx
+.byte	102,15,56,220,208
+.byte	102,15,56,220,216
+	movups	-16(%edx,%ecx,1),%xmm0
+	jnz	.L003enc2_loop
+.byte	102,15,56,220,209
+.byte	102,15,56,220,217
+.byte	102,15,56,221,208
+.byte	102,15,56,221,216
+	ret
+.size	_aesni_encrypt2,.-_aesni_encrypt2
+.hidden	_aesni_decrypt2
+.type	_aesni_decrypt2,@function
+.align	16
+_aesni_decrypt2:
+	movups	(%edx),%xmm0
+	shll	$4,%ecx
+	movups	16(%edx),%xmm1
+	xorps	%xmm0,%xmm2
+	pxor	%xmm0,%xmm3
+	movups	32(%edx),%xmm0
+	leal	32(%edx,%ecx,1),%edx
+	negl	%ecx
+	addl	$16,%ecx
+.L004dec2_loop:
+.byte	102,15,56,222,209
+.byte	102,15,56,222,217
+	movups	(%edx,%ecx,1),%xmm1
+	addl	$32,%ecx
+.byte	102,15,56,222,208
+.byte	102,15,56,222,216
+	movups	-16(%edx,%ecx,1),%xmm0
+	jnz	.L004dec2_loop
+.byte	102,15,56,222,209
+.byte	102,15,56,222,217
+.byte	102,15,56,223,208
+.byte	102,15,56,223,216
+	ret
+.size	_aesni_decrypt2,.-_aesni_decrypt2
+.hidden	_aesni_encrypt3
+.type	_aesni_encrypt3,@function
+.align	16
+_aesni_encrypt3:
+	movups	(%edx),%xmm0
+	shll	$4,%ecx
+	movups	16(%edx),%xmm1
+	xorps	%xmm0,%xmm2
+	pxor	%xmm0,%xmm3
+	pxor	%xmm0,%xmm4
+	movups	32(%edx),%xmm0
+	leal	32(%edx,%ecx,1),%edx
+	negl	%ecx
+	addl	$16,%ecx
+.L005enc3_loop:
+.byte	102,15,56,220,209
+.byte	102,15,56,220,217
+.byte	102,15,56,220,225
+	movups	(%edx,%ecx,1),%xmm1
+	addl	$32,%ecx
+.byte	102,15,56,220,208
+.byte	102,15,56,220,216
+.byte	102,15,56,220,224
+	movups	-16(%edx,%ecx,1),%xmm0
+	jnz	.L005enc3_loop
+.byte	102,15,56,220,209
+.byte	102,15,56,220,217
+.byte	102,15,56,220,225
+.byte	102,15,56,221,208
+.byte	102,15,56,221,216
+.byte	102,15,56,221,224
+	ret
+.size	_aesni_encrypt3,.-_aesni_encrypt3
+.hidden	_aesni_decrypt3
+.type	_aesni_decrypt3,@function
+.align	16
+_aesni_decrypt3:
+	movups	(%edx),%xmm0
+	shll	$4,%ecx
+	movups	16(%edx),%xmm1
+	xorps	%xmm0,%xmm2
+	pxor	%xmm0,%xmm3
+	pxor	%xmm0,%xmm4
+	movups	32(%edx),%xmm0
+	leal	32(%edx,%ecx,1),%edx
+	negl	%ecx
+	addl	$16,%ecx
+.L006dec3_loop:
+.byte	102,15,56,222,209
+.byte	102,15,56,222,217
+.byte	102,15,56,222,225
+	movups	(%edx,%ecx,1),%xmm1
+	addl	$32,%ecx
+.byte	102,15,56,222,208
+.byte	102,15,56,222,216
+.byte	102,15,56,222,224
+	movups	-16(%edx,%ecx,1),%xmm0
+	jnz	.L006dec3_loop
+.byte	102,15,56,222,209
+.byte	102,15,56,222,217
+.byte	102,15,56,222,225
+.byte	102,15,56,223,208
+.byte	102,15,56,223,216
+.byte	102,15,56,223,224
+	ret
+.size	_aesni_decrypt3,.-_aesni_decrypt3
+.hidden	_aesni_encrypt4
+.type	_aesni_encrypt4,@function
+.align	16
+_aesni_encrypt4:
+	movups	(%edx),%xmm0
+	movups	16(%edx),%xmm1
+	shll	$4,%ecx
+	xorps	%xmm0,%xmm2
+	pxor	%xmm0,%xmm3
+	pxor	%xmm0,%xmm4
+	pxor	%xmm0,%xmm5
+	movups	32(%edx),%xmm0
+	leal	32(%edx,%ecx,1),%edx
+	negl	%ecx
+.byte	15,31,64,0
+	addl	$16,%ecx
+.L007enc4_loop:
+.byte	102,15,56,220,209
+.byte	102,15,56,220,217
+.byte	102,15,56,220,225
+.byte	102,15,56,220,233
+	movups	(%edx,%ecx,1),%xmm1
+	addl	$32,%ecx
+.byte	102,15,56,220,208
+.byte	102,15,56,220,216
+.byte	102,15,56,220,224
+.byte	102,15,56,220,232
+	movups	-16(%edx,%ecx,1),%xmm0
+	jnz	.L007enc4_loop
+.byte	102,15,56,220,209
+.byte	102,15,56,220,217
+.byte	102,15,56,220,225
+.byte	102,15,56,220,233
+.byte	102,15,56,221,208
+.byte	102,15,56,221,216
+.byte	102,15,56,221,224
+.byte	102,15,56,221,232
+	ret
+.size	_aesni_encrypt4,.-_aesni_encrypt4
+.hidden	_aesni_decrypt4
+.type	_aesni_decrypt4,@function
+.align	16
+_aesni_decrypt4:
+	movups	(%edx),%xmm0
+	movups	16(%edx),%xmm1
+	shll	$4,%ecx
+	xorps	%xmm0,%xmm2
+	pxor	%xmm0,%xmm3
+	pxor	%xmm0,%xmm4
+	pxor	%xmm0,%xmm5
+	movups	32(%edx),%xmm0
+	leal	32(%edx,%ecx,1),%edx
+	negl	%ecx
+.byte	15,31,64,0
+	addl	$16,%ecx
+.L008dec4_loop:
+.byte	102,15,56,222,209
+.byte	102,15,56,222,217
+.byte	102,15,56,222,225
+.byte	102,15,56,222,233
+	movups	(%edx,%ecx,1),%xmm1
+	addl	$32,%ecx
+.byte	102,15,56,222,208
+.byte	102,15,56,222,216
+.byte	102,15,56,222,224
+.byte	102,15,56,222,232
+	movups	-16(%edx,%ecx,1),%xmm0
+	jnz	.L008dec4_loop
+.byte	102,15,56,222,209
+.byte	102,15,56,222,217
+.byte	102,15,56,222,225
+.byte	102,15,56,222,233
+.byte	102,15,56,223,208
+.byte	102,15,56,223,216
+.byte	102,15,56,223,224
+.byte	102,15,56,223,232
+	ret
+.size	_aesni_decrypt4,.-_aesni_decrypt4
+.hidden	_aesni_encrypt6
+.type	_aesni_encrypt6,@function
+.align	16
+_aesni_encrypt6:
+	movups	(%edx),%xmm0
+	shll	$4,%ecx
+	movups	16(%edx),%xmm1
+	xorps	%xmm0,%xmm2
+	pxor	%xmm0,%xmm3
+	pxor	%xmm0,%xmm4
+.byte	102,15,56,220,209
+	pxor	%xmm0,%xmm5
+	pxor	%xmm0,%xmm6
+.byte	102,15,56,220,217
+	leal	32(%edx,%ecx,1),%edx
+	negl	%ecx
+.byte	102,15,56,220,225
+	pxor	%xmm0,%xmm7
+	movups	(%edx,%ecx,1),%xmm0
+	addl	$16,%ecx
+	jmp	.L009_aesni_encrypt6_inner
+.align	16
+.L010enc6_loop:
+.byte	102,15,56,220,209
+.byte	102,15,56,220,217
+.byte	102,15,56,220,225
+.L009_aesni_encrypt6_inner:
+.byte	102,15,56,220,233
+.byte	102,15,56,220,241
+.byte	102,15,56,220,249
+.L_aesni_encrypt6_enter:
+	movups	(%edx,%ecx,1),%xmm1
+	addl	$32,%ecx
+.byte	102,15,56,220,208
+.byte	102,15,56,220,216
+.byte	102,15,56,220,224
+.byte	102,15,56,220,232
+.byte	102,15,56,220,240
+.byte	102,15,56,220,248
+	movups	-16(%edx,%ecx,1),%xmm0
+	jnz	.L010enc6_loop
+.byte	102,15,56,220,209
+.byte	102,15,56,220,217
+.byte	102,15,56,220,225
+.byte	102,15,56,220,233
+.byte	102,15,56,220,241
+.byte	102,15,56,220,249
+.byte	102,15,56,221,208
+.byte	102,15,56,221,216
+.byte	102,15,56,221,224
+.byte	102,15,56,221,232
+.byte	102,15,56,221,240
+.byte	102,15,56,221,248
+	ret
+.size	_aesni_encrypt6,.-_aesni_encrypt6
+.hidden	_aesni_decrypt6
+.type	_aesni_decrypt6,@function
+.align	16
+_aesni_decrypt6:
+	movups	(%edx),%xmm0
+	shll	$4,%ecx
+	movups	16(%edx),%xmm1
+	xorps	%xmm0,%xmm2
+	pxor	%xmm0,%xmm3
+	pxor	%xmm0,%xmm4
+.byte	102,15,56,222,209
+	pxor	%xmm0,%xmm5
+	pxor	%xmm0,%xmm6
+.byte	102,15,56,222,217
+	leal	32(%edx,%ecx,1),%edx
+	negl	%ecx
+.byte	102,15,56,222,225
+	pxor	%xmm0,%xmm7
+	movups	(%edx,%ecx,1),%xmm0
+	addl	$16,%ecx
+	jmp	.L011_aesni_decrypt6_inner
+.align	16
+.L012dec6_loop:
+.byte	102,15,56,222,209
+.byte	102,15,56,222,217
+.byte	102,15,56,222,225
+.L011_aesni_decrypt6_inner:
+.byte	102,15,56,222,233
+.byte	102,15,56,222,241
+.byte	102,15,56,222,249
+.L_aesni_decrypt6_enter:
+	movups	(%edx,%ecx,1),%xmm1
+	addl	$32,%ecx
+.byte	102,15,56,222,208
+.byte	102,15,56,222,216
+.byte	102,15,56,222,224
+.byte	102,15,56,222,232
+.byte	102,15,56,222,240
+.byte	102,15,56,222,248
+	movups	-16(%edx,%ecx,1),%xmm0
+	jnz	.L012dec6_loop
+.byte	102,15,56,222,209
+.byte	102,15,56,222,217
+.byte	102,15,56,222,225
+.byte	102,15,56,222,233
+.byte	102,15,56,222,241
+.byte	102,15,56,222,249
+.byte	102,15,56,223,208
+.byte	102,15,56,223,216
+.byte	102,15,56,223,224
+.byte	102,15,56,223,232
+.byte	102,15,56,223,240
+.byte	102,15,56,223,248
+	ret
+.size	_aesni_decrypt6,.-_aesni_decrypt6
+.globl	aes_hw_ecb_encrypt
+.hidden	aes_hw_ecb_encrypt
+.type	aes_hw_ecb_encrypt,@function
+.align	16
+aes_hw_ecb_encrypt:
+.L_aes_hw_ecb_encrypt_begin:
+	pushl	%ebp
+	pushl	%ebx
+	pushl	%esi
+	pushl	%edi
+	movl	20(%esp),%esi
+	movl	24(%esp),%edi
+	movl	28(%esp),%eax
+	movl	32(%esp),%edx
+	movl	36(%esp),%ebx
+	andl	$-16,%eax
+	jz	.L013ecb_ret
+	movl	240(%edx),%ecx
+	testl	%ebx,%ebx
+	jz	.L014ecb_decrypt
+	movl	%edx,%ebp
+	movl	%ecx,%ebx
+	cmpl	$96,%eax
+	jb	.L015ecb_enc_tail
+	movdqu	(%esi),%xmm2
+	movdqu	16(%esi),%xmm3
+	movdqu	32(%esi),%xmm4
+	movdqu	48(%esi),%xmm5
+	movdqu	64(%esi),%xmm6
+	movdqu	80(%esi),%xmm7
+	leal	96(%esi),%esi
+	subl	$96,%eax
+	jmp	.L016ecb_enc_loop6_enter
+.align	16
+.L017ecb_enc_loop6:
+	movups	%xmm2,(%edi)
+	movdqu	(%esi),%xmm2
+	movups	%xmm3,16(%edi)
+	movdqu	16(%esi),%xmm3
+	movups	%xmm4,32(%edi)
+	movdqu	32(%esi),%xmm4
+	movups	%xmm5,48(%edi)
+	movdqu	48(%esi),%xmm5
+	movups	%xmm6,64(%edi)
+	movdqu	64(%esi),%xmm6
+	movups	%xmm7,80(%edi)
+	leal	96(%edi),%edi
+	movdqu	80(%esi),%xmm7
+	leal	96(%esi),%esi
+.L016ecb_enc_loop6_enter:
+	call	_aesni_encrypt6
+	movl	%ebp,%edx
+	movl	%ebx,%ecx
+	subl	$96,%eax
+	jnc	.L017ecb_enc_loop6
+	movups	%xmm2,(%edi)
+	movups	%xmm3,16(%edi)
+	movups	%xmm4,32(%edi)
+	movups	%xmm5,48(%edi)
+	movups	%xmm6,64(%edi)
+	movups	%xmm7,80(%edi)
+	leal	96(%edi),%edi
+	addl	$96,%eax
+	jz	.L013ecb_ret
+.L015ecb_enc_tail:
+	movups	(%esi),%xmm2
+	cmpl	$32,%eax
+	jb	.L018ecb_enc_one
+	movups	16(%esi),%xmm3
+	je	.L019ecb_enc_two
+	movups	32(%esi),%xmm4
+	cmpl	$64,%eax
+	jb	.L020ecb_enc_three
+	movups	48(%esi),%xmm5
+	je	.L021ecb_enc_four
+	movups	64(%esi),%xmm6
+	xorps	%xmm7,%xmm7
+	call	_aesni_encrypt6
+	movups	%xmm2,(%edi)
+	movups	%xmm3,16(%edi)
+	movups	%xmm4,32(%edi)
+	movups	%xmm5,48(%edi)
+	movups	%xmm6,64(%edi)
+	jmp	.L013ecb_ret
+.align	16
+.L018ecb_enc_one:
+	movups	(%edx),%xmm0
+	movups	16(%edx),%xmm1
+	leal	32(%edx),%edx
+	xorps	%xmm0,%xmm2
+.L022enc1_loop_3:
+.byte	102,15,56,220,209
+	decl	%ecx
+	movups	(%edx),%xmm1
+	leal	16(%edx),%edx
+	jnz	.L022enc1_loop_3
+.byte	102,15,56,221,209
+	movups	%xmm2,(%edi)
+	jmp	.L013ecb_ret
+.align	16
+.L019ecb_enc_two:
+	call	_aesni_encrypt2
+	movups	%xmm2,(%edi)
+	movups	%xmm3,16(%edi)
+	jmp	.L013ecb_ret
+.align	16
+.L020ecb_enc_three:
+	call	_aesni_encrypt3
+	movups	%xmm2,(%edi)
+	movups	%xmm3,16(%edi)
+	movups	%xmm4,32(%edi)
+	jmp	.L013ecb_ret
+.align	16
+.L021ecb_enc_four:
+	call	_aesni_encrypt4
+	movups	%xmm2,(%edi)
+	movups	%xmm3,16(%edi)
+	movups	%xmm4,32(%edi)
+	movups	%xmm5,48(%edi)
+	jmp	.L013ecb_ret
+.align	16
+.L014ecb_decrypt:
+	movl	%edx,%ebp
+	movl	%ecx,%ebx
+	cmpl	$96,%eax
+	jb	.L023ecb_dec_tail
+	movdqu	(%esi),%xmm2
+	movdqu	16(%esi),%xmm3
+	movdqu	32(%esi),%xmm4
+	movdqu	48(%esi),%xmm5
+	movdqu	64(%esi),%xmm6
+	movdqu	80(%esi),%xmm7
+	leal	96(%esi),%esi
+	subl	$96,%eax
+	jmp	.L024ecb_dec_loop6_enter
+.align	16
+.L025ecb_dec_loop6:
+	movups	%xmm2,(%edi)
+	movdqu	(%esi),%xmm2
+	movups	%xmm3,16(%edi)
+	movdqu	16(%esi),%xmm3
+	movups	%xmm4,32(%edi)
+	movdqu	32(%esi),%xmm4
+	movups	%xmm5,48(%edi)
+	movdqu	48(%esi),%xmm5
+	movups	%xmm6,64(%edi)
+	movdqu	64(%esi),%xmm6
+	movups	%xmm7,80(%edi)
+	leal	96(%edi),%edi
+	movdqu	80(%esi),%xmm7
+	leal	96(%esi),%esi
+.L024ecb_dec_loop6_enter:
+	call	_aesni_decrypt6
+	movl	%ebp,%edx
+	movl	%ebx,%ecx
+	subl	$96,%eax
+	jnc	.L025ecb_dec_loop6
+	movups	%xmm2,(%edi)
+	movups	%xmm3,16(%edi)
+	movups	%xmm4,32(%edi)
+	movups	%xmm5,48(%edi)
+	movups	%xmm6,64(%edi)
+	movups	%xmm7,80(%edi)
+	leal	96(%edi),%edi
+	addl	$96,%eax
+	jz	.L013ecb_ret
+.L023ecb_dec_tail:
+	movups	(%esi),%xmm2
+	cmpl	$32,%eax
+	jb	.L026ecb_dec_one
+	movups	16(%esi),%xmm3
+	je	.L027ecb_dec_two
+	movups	32(%esi),%xmm4
+	cmpl	$64,%eax
+	jb	.L028ecb_dec_three
+	movups	48(%esi),%xmm5
+	je	.L029ecb_dec_four
+	movups	64(%esi),%xmm6
+	xorps	%xmm7,%xmm7
+	call	_aesni_decrypt6
+	movups	%xmm2,(%edi)
+	movups	%xmm3,16(%edi)
+	movups	%xmm4,32(%edi)
+	movups	%xmm5,48(%edi)
+	movups	%xmm6,64(%edi)
+	jmp	.L013ecb_ret
+.align	16
+.L026ecb_dec_one:
+	movups	(%edx),%xmm0
+	movups	16(%edx),%xmm1
+	leal	32(%edx),%edx
+	xorps	%xmm0,%xmm2
+.L030dec1_loop_4:
+.byte	102,15,56,222,209
+	decl	%ecx
+	movups	(%edx),%xmm1
+	leal	16(%edx),%edx
+	jnz	.L030dec1_loop_4
+.byte	102,15,56,223,209
+	movups	%xmm2,(%edi)
+	jmp	.L013ecb_ret
+.align	16
+.L027ecb_dec_two:
+	call	_aesni_decrypt2
+	movups	%xmm2,(%edi)
+	movups	%xmm3,16(%edi)
+	jmp	.L013ecb_ret
+.align	16
+.L028ecb_dec_three:
+	call	_aesni_decrypt3
+	movups	%xmm2,(%edi)
+	movups	%xmm3,16(%edi)
+	movups	%xmm4,32(%edi)
+	jmp	.L013ecb_ret
+.align	16
+.L029ecb_dec_four:
+	call	_aesni_decrypt4
+	movups	%xmm2,(%edi)
+	movups	%xmm3,16(%edi)
+	movups	%xmm4,32(%edi)
+	movups	%xmm5,48(%edi)
+.L013ecb_ret:
+	pxor	%xmm0,%xmm0
+	pxor	%xmm1,%xmm1
+	pxor	%xmm2,%xmm2
+	pxor	%xmm3,%xmm3
+	pxor	%xmm4,%xmm4
+	pxor	%xmm5,%xmm5
+	pxor	%xmm6,%xmm6
+	pxor	%xmm7,%xmm7
+	popl	%edi
+	popl	%esi
+	popl	%ebx
+	popl	%ebp
+	ret
+.size	aes_hw_ecb_encrypt,.-.L_aes_hw_ecb_encrypt_begin
+.globl	aes_hw_ccm64_encrypt_blocks
+.hidden	aes_hw_ccm64_encrypt_blocks
+.type	aes_hw_ccm64_encrypt_blocks,@function
+.align	16
+aes_hw_ccm64_encrypt_blocks:
+.L_aes_hw_ccm64_encrypt_blocks_begin:
+	pushl	%ebp
+	pushl	%ebx
+	pushl	%esi
+	pushl	%edi
+	movl	20(%esp),%esi
+	movl	24(%esp),%edi
+	movl	28(%esp),%eax
+	movl	32(%esp),%edx
+	movl	36(%esp),%ebx
+	movl	40(%esp),%ecx
+	movl	%esp,%ebp
+	subl	$60,%esp
+	andl	$-16,%esp
+	movl	%ebp,48(%esp)
+	movdqu	(%ebx),%xmm7
+	movdqu	(%ecx),%xmm3
+	movl	240(%edx),%ecx
+	movl	$202182159,(%esp)
+	movl	$134810123,4(%esp)
+	movl	$67438087,8(%esp)
+	movl	$66051,12(%esp)
+	movl	$1,%ebx
+	xorl	%ebp,%ebp
+	movl	%ebx,16(%esp)
+	movl	%ebp,20(%esp)
+	movl	%ebp,24(%esp)
+	movl	%ebp,28(%esp)
+	shll	$4,%ecx
+	movl	$16,%ebx
+	leal	(%edx),%ebp
+	movdqa	(%esp),%xmm5
+	movdqa	%xmm7,%xmm2
+	leal	32(%edx,%ecx,1),%edx
+	subl	%ecx,%ebx
+.byte	102,15,56,0,253
+.L031ccm64_enc_outer:
+	movups	(%ebp),%xmm0
+	movl	%ebx,%ecx
+	movups	(%esi),%xmm6
+	xorps	%xmm0,%xmm2
+	movups	16(%ebp),%xmm1
+	xorps	%xmm6,%xmm0
+	xorps	%xmm0,%xmm3
+	movups	32(%ebp),%xmm0
+.L032ccm64_enc2_loop:
+.byte	102,15,56,220,209
+.byte	102,15,56,220,217
+	movups	(%edx,%ecx,1),%xmm1
+	addl	$32,%ecx
+.byte	102,15,56,220,208
+.byte	102,15,56,220,216
+	movups	-16(%edx,%ecx,1),%xmm0
+	jnz	.L032ccm64_enc2_loop
+.byte	102,15,56,220,209
+.byte	102,15,56,220,217
+	paddq	16(%esp),%xmm7
+	decl	%eax
+.byte	102,15,56,221,208
+.byte	102,15,56,221,216
+	leal	16(%esi),%esi
+	xorps	%xmm2,%xmm6
+	movdqa	%xmm7,%xmm2
+	movups	%xmm6,(%edi)
+.byte	102,15,56,0,213
+	leal	16(%edi),%edi
+	jnz	.L031ccm64_enc_outer
+	movl	48(%esp),%esp
+	movl	40(%esp),%edi
+	movups	%xmm3,(%edi)
+	pxor	%xmm0,%xmm0
+	pxor	%xmm1,%xmm1
+	pxor	%xmm2,%xmm2
+	pxor	%xmm3,%xmm3
+	pxor	%xmm4,%xmm4
+	pxor	%xmm5,%xmm5
+	pxor	%xmm6,%xmm6
+	pxor	%xmm7,%xmm7
+	popl	%edi
+	popl	%esi
+	popl	%ebx
+	popl	%ebp
+	ret
+.size	aes_hw_ccm64_encrypt_blocks,.-.L_aes_hw_ccm64_encrypt_blocks_begin
+.globl	aes_hw_ccm64_decrypt_blocks
+.hidden	aes_hw_ccm64_decrypt_blocks
+.type	aes_hw_ccm64_decrypt_blocks,@function
+.align	16
+aes_hw_ccm64_decrypt_blocks:
+.L_aes_hw_ccm64_decrypt_blocks_begin:
+	pushl	%ebp
+	pushl	%ebx
+	pushl	%esi
+	pushl	%edi
+	movl	20(%esp),%esi
+	movl	24(%esp),%edi
+	movl	28(%esp),%eax
+	movl	32(%esp),%edx
+	movl	36(%esp),%ebx
+	movl	40(%esp),%ecx
+	movl	%esp,%ebp
+	subl	$60,%esp
+	andl	$-16,%esp
+	movl	%ebp,48(%esp)
+	movdqu	(%ebx),%xmm7
+	movdqu	(%ecx),%xmm3
+	movl	240(%edx),%ecx
+	movl	$202182159,(%esp)
+	movl	$134810123,4(%esp)
+	movl	$67438087,8(%esp)
+	movl	$66051,12(%esp)
+	movl	$1,%ebx
+	xorl	%ebp,%ebp
+	movl	%ebx,16(%esp)
+	movl	%ebp,20(%esp)
+	movl	%ebp,24(%esp)
+	movl	%ebp,28(%esp)
+	movdqa	(%esp),%xmm5
+	movdqa	%xmm7,%xmm2
+	movl	%edx,%ebp
+	movl	%ecx,%ebx
+.byte	102,15,56,0,253
+	movups	(%edx),%xmm0
+	movups	16(%edx),%xmm1
+	leal	32(%edx),%edx
+	xorps	%xmm0,%xmm2
+.L033enc1_loop_5:
+.byte	102,15,56,220,209
+	decl	%ecx
+	movups	(%edx),%xmm1
+	leal	16(%edx),%edx
+	jnz	.L033enc1_loop_5
+.byte	102,15,56,221,209
+	shll	$4,%ebx
+	movl	$16,%ecx
+	movups	(%esi),%xmm6
+	paddq	16(%esp),%xmm7
+	leal	16(%esi),%esi
+	subl	%ebx,%ecx
+	leal	32(%ebp,%ebx,1),%edx
+	movl	%ecx,%ebx
+	jmp	.L034ccm64_dec_outer
+.align	16
+.L034ccm64_dec_outer:
+	xorps	%xmm2,%xmm6
+	movdqa	%xmm7,%xmm2
+	movups	%xmm6,(%edi)
+	leal	16(%edi),%edi
+.byte	102,15,56,0,213
+	subl	$1,%eax
+	jz	.L035ccm64_dec_break
+	movups	(%ebp),%xmm0
+	movl	%ebx,%ecx
+	movups	16(%ebp),%xmm1
+	xorps	%xmm0,%xmm6
+	xorps	%xmm0,%xmm2
+	xorps	%xmm6,%xmm3
+	movups	32(%ebp),%xmm0
+.L036ccm64_dec2_loop:
+.byte	102,15,56,220,209
+.byte	102,15,56,220,217
+	movups	(%edx,%ecx,1),%xmm1
+	addl	$32,%ecx
+.byte	102,15,56,220,208
+.byte	102,15,56,220,216
+	movups	-16(%edx,%ecx,1),%xmm0
+	jnz	.L036ccm64_dec2_loop
+	movups	(%esi),%xmm6
+	paddq	16(%esp),%xmm7
+.byte	102,15,56,220,209
+.byte	102,15,56,220,217
+.byte	102,15,56,221,208
+.byte	102,15,56,221,216
+	leal	16(%esi),%esi
+	jmp	.L034ccm64_dec_outer
+.align	16
+.L035ccm64_dec_break:
+	movl	240(%ebp),%ecx
+	movl	%ebp,%edx
+	movups	(%edx),%xmm0
+	movups	16(%edx),%xmm1
+	xorps	%xmm0,%xmm6
+	leal	32(%edx),%edx
+	xorps	%xmm6,%xmm3
+.L037enc1_loop_6:
+.byte	102,15,56,220,217
+	decl	%ecx
+	movups	(%edx),%xmm1
+	leal	16(%edx),%edx
+	jnz	.L037enc1_loop_6
+.byte	102,15,56,221,217
+	movl	48(%esp),%esp
+	movl	40(%esp),%edi
+	movups	%xmm3,(%edi)
+	pxor	%xmm0,%xmm0
+	pxor	%xmm1,%xmm1
+	pxor	%xmm2,%xmm2
+	pxor	%xmm3,%xmm3
+	pxor	%xmm4,%xmm4
+	pxor	%xmm5,%xmm5
+	pxor	%xmm6,%xmm6
+	pxor	%xmm7,%xmm7
+	popl	%edi
+	popl	%esi
+	popl	%ebx
+	popl	%ebp
+	ret
+.size	aes_hw_ccm64_decrypt_blocks,.-.L_aes_hw_ccm64_decrypt_blocks_begin
+.globl	aes_hw_ctr32_encrypt_blocks
+.hidden	aes_hw_ctr32_encrypt_blocks
+.type	aes_hw_ctr32_encrypt_blocks,@function
+.align	16
+aes_hw_ctr32_encrypt_blocks:
+.L_aes_hw_ctr32_encrypt_blocks_begin:
+	pushl	%ebp
+	pushl	%ebx
+	pushl	%esi
+	pushl	%edi
+#ifdef BORINGSSL_DISPATCH_TEST
+	pushl	%ebx
+	pushl	%edx
+	call	.L038pic
+.L038pic:
+	popl	%ebx
+	leal	BORINGSSL_function_hit+0-.L038pic(%ebx),%ebx
+	movl	$1,%edx
+	movb	%dl,(%ebx)
+	popl	%edx
+	popl	%ebx
+#endif
+	movl	20(%esp),%esi
+	movl	24(%esp),%edi
+	movl	28(%esp),%eax
+	movl	32(%esp),%edx
+	movl	36(%esp),%ebx
+	movl	%esp,%ebp
+	subl	$88,%esp
+	andl	$-16,%esp
+	movl	%ebp,80(%esp)
+	cmpl	$1,%eax
+	je	.L039ctr32_one_shortcut
+	movdqu	(%ebx),%xmm7
+	movl	$202182159,(%esp)
+	movl	$134810123,4(%esp)
+	movl	$67438087,8(%esp)
+	movl	$66051,12(%esp)
+	movl	$6,%ecx
+	xorl	%ebp,%ebp
+	movl	%ecx,16(%esp)
+	movl	%ecx,20(%esp)
+	movl	%ecx,24(%esp)
+	movl	%ebp,28(%esp)
+.byte	102,15,58,22,251,3
+.byte	102,15,58,34,253,3
+	movl	240(%edx),%ecx
+	bswap	%ebx
+	pxor	%xmm0,%xmm0
+	pxor	%xmm1,%xmm1
+	movdqa	(%esp),%xmm2
+.byte	102,15,58,34,195,0
+	leal	3(%ebx),%ebp
+.byte	102,15,58,34,205,0
+	incl	%ebx
+.byte	102,15,58,34,195,1
+	incl	%ebp
+.byte	102,15,58,34,205,1
+	incl	%ebx
+.byte	102,15,58,34,195,2
+	incl	%ebp
+.byte	102,15,58,34,205,2
+	movdqa	%xmm0,48(%esp)
+.byte	102,15,56,0,194
+	movdqu	(%edx),%xmm6
+	movdqa	%xmm1,64(%esp)
+.byte	102,15,56,0,202
+	pshufd	$192,%xmm0,%xmm2
+	pshufd	$128,%xmm0,%xmm3
+	cmpl	$6,%eax
+	jb	.L040ctr32_tail
+	pxor	%xmm6,%xmm7
+	shll	$4,%ecx
+	movl	$16,%ebx
+	movdqa	%xmm7,32(%esp)
+	movl	%edx,%ebp
+	subl	%ecx,%ebx
+	leal	32(%edx,%ecx,1),%edx
+	subl	$6,%eax
+	jmp	.L041ctr32_loop6
+.align	16
+.L041ctr32_loop6:
+	pshufd	$64,%xmm0,%xmm4
+	movdqa	32(%esp),%xmm0
+	pshufd	$192,%xmm1,%xmm5
+	pxor	%xmm0,%xmm2
+	pshufd	$128,%xmm1,%xmm6
+	pxor	%xmm0,%xmm3
+	pshufd	$64,%xmm1,%xmm7
+	movups	16(%ebp),%xmm1
+	pxor	%xmm0,%xmm4
+	pxor	%xmm0,%xmm5
+.byte	102,15,56,220,209
+	pxor	%xmm0,%xmm6
+	pxor	%xmm0,%xmm7
+.byte	102,15,56,220,217
+	movups	32(%ebp),%xmm0
+	movl	%ebx,%ecx
+.byte	102,15,56,220,225
+.byte	102,15,56,220,233
+.byte	102,15,56,220,241
+.byte	102,15,56,220,249
+	call	.L_aesni_encrypt6_enter
+	movups	(%esi),%xmm1
+	movups	16(%esi),%xmm0
+	xorps	%xmm1,%xmm2
+	movups	32(%esi),%xmm1
+	xorps	%xmm0,%xmm3
+	movups	%xmm2,(%edi)
+	movdqa	16(%esp),%xmm0
+	xorps	%xmm1,%xmm4
+	movdqa	64(%esp),%xmm1
+	movups	%xmm3,16(%edi)
+	movups	%xmm4,32(%edi)
+	paddd	%xmm0,%xmm1
+	paddd	48(%esp),%xmm0
+	movdqa	(%esp),%xmm2
+	movups	48(%esi),%xmm3
+	movups	64(%esi),%xmm4
+	xorps	%xmm3,%xmm5
+	movups	80(%esi),%xmm3
+	leal	96(%esi),%esi
+	movdqa	%xmm0,48(%esp)
+.byte	102,15,56,0,194
+	xorps	%xmm4,%xmm6
+	movups	%xmm5,48(%edi)
+	xorps	%xmm3,%xmm7
+	movdqa	%xmm1,64(%esp)
+.byte	102,15,56,0,202
+	movups	%xmm6,64(%edi)
+	pshufd	$192,%xmm0,%xmm2
+	movups	%xmm7,80(%edi)
+	leal	96(%edi),%edi
+	pshufd	$128,%xmm0,%xmm3
+	subl	$6,%eax
+	jnc	.L041ctr32_loop6
+	addl	$6,%eax
+	jz	.L042ctr32_ret
+	movdqu	(%ebp),%xmm7
+	movl	%ebp,%edx
+	pxor	32(%esp),%xmm7
+	movl	240(%ebp),%ecx
+.L040ctr32_tail:
+	por	%xmm7,%xmm2
+	cmpl	$2,%eax
+	jb	.L043ctr32_one
+	pshufd	$64,%xmm0,%xmm4
+	por	%xmm7,%xmm3
+	je	.L044ctr32_two
+	pshufd	$192,%xmm1,%xmm5
+	por	%xmm7,%xmm4
+	cmpl	$4,%eax
+	jb	.L045ctr32_three
+	pshufd	$128,%xmm1,%xmm6
+	por	%xmm7,%xmm5
+	je	.L046ctr32_four
+	por	%xmm7,%xmm6
+	call	_aesni_encrypt6
+	movups	(%esi),%xmm1
+	movups	16(%esi),%xmm0
+	xorps	%xmm1,%xmm2
+	movups	32(%esi),%xmm1
+	xorps	%xmm0,%xmm3
+	movups	48(%esi),%xmm0
+	xorps	%xmm1,%xmm4
+	movups	64(%esi),%xmm1
+	xorps	%xmm0,%xmm5
+	movups	%xmm2,(%edi)
+	xorps	%xmm1,%xmm6
+	movups	%xmm3,16(%edi)
+	movups	%xmm4,32(%edi)
+	movups	%xmm5,48(%edi)
+	movups	%xmm6,64(%edi)
+	jmp	.L042ctr32_ret
+.align	16
+.L039ctr32_one_shortcut:
+	movups	(%ebx),%xmm2
+	movl	240(%edx),%ecx
+.L043ctr32_one:
+	movups	(%edx),%xmm0
+	movups	16(%edx),%xmm1
+	leal	32(%edx),%edx
+	xorps	%xmm0,%xmm2
+.L047enc1_loop_7:
+.byte	102,15,56,220,209
+	decl	%ecx
+	movups	(%edx),%xmm1
+	leal	16(%edx),%edx
+	jnz	.L047enc1_loop_7
+.byte	102,15,56,221,209
+	movups	(%esi),%xmm6
+	xorps	%xmm2,%xmm6
+	movups	%xmm6,(%edi)
+	jmp	.L042ctr32_ret
+.align	16
+.L044ctr32_two:
+	call	_aesni_encrypt2
+	movups	(%esi),%xmm5
+	movups	16(%esi),%xmm6
+	xorps	%xmm5,%xmm2
+	xorps	%xmm6,%xmm3
+	movups	%xmm2,(%edi)
+	movups	%xmm3,16(%edi)
+	jmp	.L042ctr32_ret
+.align	16
+.L045ctr32_three:
+	call	_aesni_encrypt3
+	movups	(%esi),%xmm5
+	movups	16(%esi),%xmm6
+	xorps	%xmm5,%xmm2
+	movups	32(%esi),%xmm7
+	xorps	%xmm6,%xmm3
+	movups	%xmm2,(%edi)
+	xorps	%xmm7,%xmm4
+	movups	%xmm3,16(%edi)
+	movups	%xmm4,32(%edi)
+	jmp	.L042ctr32_ret
+.align	16
+.L046ctr32_four:
+	call	_aesni_encrypt4
+	movups	(%esi),%xmm6
+	movups	16(%esi),%xmm7
+	movups	32(%esi),%xmm1
+	xorps	%xmm6,%xmm2
+	movups	48(%esi),%xmm0
+	xorps	%xmm7,%xmm3
+	movups	%xmm2,(%edi)
+	xorps	%xmm1,%xmm4
+	movups	%xmm3,16(%edi)
+	xorps	%xmm0,%xmm5
+	movups	%xmm4,32(%edi)
+	movups	%xmm5,48(%edi)
+.L042ctr32_ret:
+	pxor	%xmm0,%xmm0
+	pxor	%xmm1,%xmm1
+	pxor	%xmm2,%xmm2
+	pxor	%xmm3,%xmm3
+	pxor	%xmm4,%xmm4
+	movdqa	%xmm0,32(%esp)
+	pxor	%xmm5,%xmm5
+	movdqa	%xmm0,48(%esp)
+	pxor	%xmm6,%xmm6
+	movdqa	%xmm0,64(%esp)
+	pxor	%xmm7,%xmm7
+	movl	80(%esp),%esp
+	popl	%edi
+	popl	%esi
+	popl	%ebx
+	popl	%ebp
+	ret
+.size	aes_hw_ctr32_encrypt_blocks,.-.L_aes_hw_ctr32_encrypt_blocks_begin
+.globl	aes_hw_xts_encrypt
+.hidden	aes_hw_xts_encrypt
+.type	aes_hw_xts_encrypt,@function
+.align	16
+aes_hw_xts_encrypt:
+.L_aes_hw_xts_encrypt_begin:
+	pushl	%ebp
+	pushl	%ebx
+	pushl	%esi
+	pushl	%edi
+	movl	36(%esp),%edx
+	movl	40(%esp),%esi
+	movl	240(%edx),%ecx
+	movups	(%esi),%xmm2
+	movups	(%edx),%xmm0
+	movups	16(%edx),%xmm1
+	leal	32(%edx),%edx
+	xorps	%xmm0,%xmm2
+.L048enc1_loop_8:
+.byte	102,15,56,220,209
+	decl	%ecx
+	movups	(%edx),%xmm1
+	leal	16(%edx),%edx
+	jnz	.L048enc1_loop_8
+.byte	102,15,56,221,209
+	movl	20(%esp),%esi
+	movl	24(%esp),%edi
+	movl	28(%esp),%eax
+	movl	32(%esp),%edx
+	movl	%esp,%ebp
+	subl	$120,%esp
+	movl	240(%edx),%ecx
+	andl	$-16,%esp
+	movl	$135,96(%esp)
+	movl	$0,100(%esp)
+	movl	$1,104(%esp)
+	movl	$0,108(%esp)
+	movl	%eax,112(%esp)
+	movl	%ebp,116(%esp)
+	movdqa	%xmm2,%xmm1
+	pxor	%xmm0,%xmm0
+	movdqa	96(%esp),%xmm3
+	pcmpgtd	%xmm1,%xmm0
+	andl	$-16,%eax
+	movl	%edx,%ebp
+	movl	%ecx,%ebx
+	subl	$96,%eax
+	jc	.L049xts_enc_short
+	shll	$4,%ecx
+	movl	$16,%ebx
+	subl	%ecx,%ebx
+	leal	32(%edx,%ecx,1),%edx
+	jmp	.L050xts_enc_loop6
+.align	16
+.L050xts_enc_loop6:
+	pshufd	$19,%xmm0,%xmm2
+	pxor	%xmm0,%xmm0
+	movdqa	%xmm1,(%esp)
+	paddq	%xmm1,%xmm1
+	pand	%xmm3,%xmm2
+	pcmpgtd	%xmm1,%xmm0
+	pxor	%xmm2,%xmm1
+	pshufd	$19,%xmm0,%xmm2
+	pxor	%xmm0,%xmm0
+	movdqa	%xmm1,16(%esp)
+	paddq	%xmm1,%xmm1
+	pand	%xmm3,%xmm2
+	pcmpgtd	%xmm1,%xmm0
+	pxor	%xmm2,%xmm1
+	pshufd	$19,%xmm0,%xmm2
+	pxor	%xmm0,%xmm0
+	movdqa	%xmm1,32(%esp)
+	paddq	%xmm1,%xmm1
+	pand	%xmm3,%xmm2
+	pcmpgtd	%xmm1,%xmm0
+	pxor	%xmm2,%xmm1
+	pshufd	$19,%xmm0,%xmm2
+	pxor	%xmm0,%xmm0
+	movdqa	%xmm1,48(%esp)
+	paddq	%xmm1,%xmm1
+	pand	%xmm3,%xmm2
+	pcmpgtd	%xmm1,%xmm0
+	pxor	%xmm2,%xmm1
+	pshufd	$19,%xmm0,%xmm7
+	movdqa	%xmm1,64(%esp)
+	paddq	%xmm1,%xmm1
+	movups	(%ebp),%xmm0
+	pand	%xmm3,%xmm7
+	movups	(%esi),%xmm2
+	pxor	%xmm1,%xmm7
+	movl	%ebx,%ecx
+	movdqu	16(%esi),%xmm3
+	xorps	%xmm0,%xmm2
+	movdqu	32(%esi),%xmm4
+	pxor	%xmm0,%xmm3
+	movdqu	48(%esi),%xmm5
+	pxor	%xmm0,%xmm4
+	movdqu	64(%esi),%xmm6
+	pxor	%xmm0,%xmm5
+	movdqu	80(%esi),%xmm1
+	pxor	%xmm0,%xmm6
+	leal	96(%esi),%esi
+	pxor	(%esp),%xmm2
+	movdqa	%xmm7,80(%esp)
+	pxor	%xmm1,%xmm7
+	movups	16(%ebp),%xmm1
+	pxor	16(%esp),%xmm3
+	pxor	32(%esp),%xmm4
+.byte	102,15,56,220,209
+	pxor	48(%esp),%xmm5
+	pxor	64(%esp),%xmm6
+.byte	102,15,56,220,217
+	pxor	%xmm0,%xmm7
+	movups	32(%ebp),%xmm0
+.byte	102,15,56,220,225
+.byte	102,15,56,220,233
+.byte	102,15,56,220,241
+.byte	102,15,56,220,249
+	call	.L_aesni_encrypt6_enter
+	movdqa	80(%esp),%xmm1
+	pxor	%xmm0,%xmm0
+	xorps	(%esp),%xmm2
+	pcmpgtd	%xmm1,%xmm0
+	xorps	16(%esp),%xmm3
+	movups	%xmm2,(%edi)
+	xorps	32(%esp),%xmm4
+	movups	%xmm3,16(%edi)
+	xorps	48(%esp),%xmm5
+	movups	%xmm4,32(%edi)
+	xorps	64(%esp),%xmm6
+	movups	%xmm5,48(%edi)
+	xorps	%xmm1,%xmm7
+	movups	%xmm6,64(%edi)
+	pshufd	$19,%xmm0,%xmm2
+	movups	%xmm7,80(%edi)
+	leal	96(%edi),%edi
+	movdqa	96(%esp),%xmm3
+	pxor	%xmm0,%xmm0
+	paddq	%xmm1,%xmm1
+	pand	%xmm3,%xmm2
+	pcmpgtd	%xmm1,%xmm0
+	pxor	%xmm2,%xmm1
+	subl	$96,%eax
+	jnc	.L050xts_enc_loop6
+	movl	240(%ebp),%ecx
+	movl	%ebp,%edx
+	movl	%ecx,%ebx
+.L049xts_enc_short:
+	addl	$96,%eax
+	jz	.L051xts_enc_done6x
+	movdqa	%xmm1,%xmm5
+	cmpl	$32,%eax
+	jb	.L052xts_enc_one
+	pshufd	$19,%xmm0,%xmm2
+	pxor	%xmm0,%xmm0
+	paddq	%xmm1,%xmm1
+	pand	%xmm3,%xmm2
+	pcmpgtd	%xmm1,%xmm0
+	pxor	%xmm2,%xmm1
+	je	.L053xts_enc_two
+	pshufd	$19,%xmm0,%xmm2
+	pxor	%xmm0,%xmm0
+	movdqa	%xmm1,%xmm6
+	paddq	%xmm1,%xmm1
+	pand	%xmm3,%xmm2
+	pcmpgtd	%xmm1,%xmm0
+	pxor	%xmm2,%xmm1
+	cmpl	$64,%eax
+	jb	.L054xts_enc_three
+	pshufd	$19,%xmm0,%xmm2
+	pxor	%xmm0,%xmm0
+	movdqa	%xmm1,%xmm7
+	paddq	%xmm1,%xmm1
+	pand	%xmm3,%xmm2
+	pcmpgtd	%xmm1,%xmm0
+	pxor	%xmm2,%xmm1
+	movdqa	%xmm5,(%esp)
+	movdqa	%xmm6,16(%esp)
+	je	.L055xts_enc_four
+	movdqa	%xmm7,32(%esp)
+	pshufd	$19,%xmm0,%xmm7
+	movdqa	%xmm1,48(%esp)
+	paddq	%xmm1,%xmm1
+	pand	%xmm3,%xmm7
+	pxor	%xmm1,%xmm7
+	movdqu	(%esi),%xmm2
+	movdqu	16(%esi),%xmm3
+	movdqu	32(%esi),%xmm4
+	pxor	(%esp),%xmm2
+	movdqu	48(%esi),%xmm5
+	pxor	16(%esp),%xmm3
+	movdqu	64(%esi),%xmm6
+	pxor	32(%esp),%xmm4
+	leal	80(%esi),%esi
+	pxor	48(%esp),%xmm5
+	movdqa	%xmm7,64(%esp)
+	pxor	%xmm7,%xmm6
+	call	_aesni_encrypt6
+	movaps	64(%esp),%xmm1
+	xorps	(%esp),%xmm2
+	xorps	16(%esp),%xmm3
+	xorps	32(%esp),%xmm4
+	movups	%xmm2,(%edi)
+	xorps	48(%esp),%xmm5
+	movups	%xmm3,16(%edi)
+	xorps	%xmm1,%xmm6
+	movups	%xmm4,32(%edi)
+	movups	%xmm5,48(%edi)
+	movups	%xmm6,64(%edi)
+	leal	80(%edi),%edi
+	jmp	.L056xts_enc_done
+.align	16
+.L052xts_enc_one:
+	movups	(%esi),%xmm2
+	leal	16(%esi),%esi
+	xorps	%xmm5,%xmm2
+	movups	(%edx),%xmm0
+	movups	16(%edx),%xmm1
+	leal	32(%edx),%edx
+	xorps	%xmm0,%xmm2
+.L057enc1_loop_9:
+.byte	102,15,56,220,209
+	decl	%ecx
+	movups	(%edx),%xmm1
+	leal	16(%edx),%edx
+	jnz	.L057enc1_loop_9
+.byte	102,15,56,221,209
+	xorps	%xmm5,%xmm2
+	movups	%xmm2,(%edi)
+	leal	16(%edi),%edi
+	movdqa	%xmm5,%xmm1
+	jmp	.L056xts_enc_done
+.align	16
+.L053xts_enc_two:
+	movaps	%xmm1,%xmm6
+	movups	(%esi),%xmm2
+	movups	16(%esi),%xmm3
+	leal	32(%esi),%esi
+	xorps	%xmm5,%xmm2
+	xorps	%xmm6,%xmm3
+	call	_aesni_encrypt2
+	xorps	%xmm5,%xmm2
+	xorps	%xmm6,%xmm3
+	movups	%xmm2,(%edi)
+	movups	%xmm3,16(%edi)
+	leal	32(%edi),%edi
+	movdqa	%xmm6,%xmm1
+	jmp	.L056xts_enc_done
+.align	16
+.L054xts_enc_three:
+	movaps	%xmm1,%xmm7
+	movups	(%esi),%xmm2
+	movups	16(%esi),%xmm3
+	movups	32(%esi),%xmm4
+	leal	48(%esi),%esi
+	xorps	%xmm5,%xmm2
+	xorps	%xmm6,%xmm3
+	xorps	%xmm7,%xmm4
+	call	_aesni_encrypt3
+	xorps	%xmm5,%xmm2
+	xorps	%xmm6,%xmm3
+	xorps	%xmm7,%xmm4
+	movups	%xmm2,(%edi)
+	movups	%xmm3,16(%edi)
+	movups	%xmm4,32(%edi)
+	leal	48(%edi),%edi
+	movdqa	%xmm7,%xmm1
+	jmp	.L056xts_enc_done
+.align	16
+.L055xts_enc_four:
+	movaps	%xmm1,%xmm6
+	movups	(%esi),%xmm2
+	movups	16(%esi),%xmm3
+	movups	32(%esi),%xmm4
+	xorps	(%esp),%xmm2
+	movups	48(%esi),%xmm5
+	leal	64(%esi),%esi
+	xorps	16(%esp),%xmm3
+	xorps	%xmm7,%xmm4
+	xorps	%xmm6,%xmm5
+	call	_aesni_encrypt4
+	xorps	(%esp),%xmm2
+	xorps	16(%esp),%xmm3
+	xorps	%xmm7,%xmm4
+	movups	%xmm2,(%edi)
+	xorps	%xmm6,%xmm5
+	movups	%xmm3,16(%edi)
+	movups	%xmm4,32(%edi)
+	movups	%xmm5,48(%edi)
+	leal	64(%edi),%edi
+	movdqa	%xmm6,%xmm1
+	jmp	.L056xts_enc_done
+.align	16
+.L051xts_enc_done6x:
+	movl	112(%esp),%eax
+	andl	$15,%eax
+	jz	.L058xts_enc_ret
+	movdqa	%xmm1,%xmm5
+	movl	%eax,112(%esp)
+	jmp	.L059xts_enc_steal
+.align	16
+.L056xts_enc_done:
+	movl	112(%esp),%eax
+	pxor	%xmm0,%xmm0
+	andl	$15,%eax
+	jz	.L058xts_enc_ret
+	pcmpgtd	%xmm1,%xmm0
+	movl	%eax,112(%esp)
+	pshufd	$19,%xmm0,%xmm5
+	paddq	%xmm1,%xmm1
+	pand	96(%esp),%xmm5
+	pxor	%xmm1,%xmm5
+.L059xts_enc_steal:
+	movzbl	(%esi),%ecx
+	movzbl	-16(%edi),%edx
+	leal	1(%esi),%esi
+	movb	%cl,-16(%edi)
+	movb	%dl,(%edi)
+	leal	1(%edi),%edi
+	subl	$1,%eax
+	jnz	.L059xts_enc_steal
+	subl	112(%esp),%edi
+	movl	%ebp,%edx
+	movl	%ebx,%ecx
+	movups	-16(%edi),%xmm2
+	xorps	%xmm5,%xmm2
+	movups	(%edx),%xmm0
+	movups	16(%edx),%xmm1
+	leal	32(%edx),%edx
+	xorps	%xmm0,%xmm2
+.L060enc1_loop_10:
+.byte	102,15,56,220,209
+	decl	%ecx
+	movups	(%edx),%xmm1
+	leal	16(%edx),%edx
+	jnz	.L060enc1_loop_10
+.byte	102,15,56,221,209
+	xorps	%xmm5,%xmm2
+	movups	%xmm2,-16(%edi)
+.L058xts_enc_ret:
+	pxor	%xmm0,%xmm0
+	pxor	%xmm1,%xmm1
+	pxor	%xmm2,%xmm2
+	movdqa	%xmm0,(%esp)
+	pxor	%xmm3,%xmm3
+	movdqa	%xmm0,16(%esp)
+	pxor	%xmm4,%xmm4
+	movdqa	%xmm0,32(%esp)
+	pxor	%xmm5,%xmm5
+	movdqa	%xmm0,48(%esp)
+	pxor	%xmm6,%xmm6
+	movdqa	%xmm0,64(%esp)
+	pxor	%xmm7,%xmm7
+	movdqa	%xmm0,80(%esp)
+	movl	116(%esp),%esp
+	popl	%edi
+	popl	%esi
+	popl	%ebx
+	popl	%ebp
+	ret
+.size	aes_hw_xts_encrypt,.-.L_aes_hw_xts_encrypt_begin
+.globl	aes_hw_xts_decrypt
+.hidden	aes_hw_xts_decrypt
+.type	aes_hw_xts_decrypt,@function
+.align	16
+aes_hw_xts_decrypt:
+.L_aes_hw_xts_decrypt_begin:
+	pushl	%ebp
+	pushl	%ebx
+	pushl	%esi
+	pushl	%edi
+	movl	36(%esp),%edx
+	movl	40(%esp),%esi
+	movl	240(%edx),%ecx
+	movups	(%esi),%xmm2
+	movups	(%edx),%xmm0
+	movups	16(%edx),%xmm1
+	leal	32(%edx),%edx
+	xorps	%xmm0,%xmm2
+.L061enc1_loop_11:
+.byte	102,15,56,220,209
+	decl	%ecx
+	movups	(%edx),%xmm1
+	leal	16(%edx),%edx
+	jnz	.L061enc1_loop_11
+.byte	102,15,56,221,209
+	movl	20(%esp),%esi
+	movl	24(%esp),%edi
+	movl	28(%esp),%eax
+	movl	32(%esp),%edx
+	movl	%esp,%ebp
+	subl	$120,%esp
+	andl	$-16,%esp
+	xorl	%ebx,%ebx
+	testl	$15,%eax
+	setnz	%bl
+	shll	$4,%ebx
+	subl	%ebx,%eax
+	movl	$135,96(%esp)
+	movl	$0,100(%esp)
+	movl	$1,104(%esp)
+	movl	$0,108(%esp)
+	movl	%eax,112(%esp)
+	movl	%ebp,116(%esp)
+	movl	240(%edx),%ecx
+	movl	%edx,%ebp
+	movl	%ecx,%ebx
+	movdqa	%xmm2,%xmm1
+	pxor	%xmm0,%xmm0
+	movdqa	96(%esp),%xmm3
+	pcmpgtd	%xmm1,%xmm0
+	andl	$-16,%eax
+	subl	$96,%eax
+	jc	.L062xts_dec_short
+	shll	$4,%ecx
+	movl	$16,%ebx
+	subl	%ecx,%ebx
+	leal	32(%edx,%ecx,1),%edx
+	jmp	.L063xts_dec_loop6
+.align	16
+.L063xts_dec_loop6:
+	pshufd	$19,%xmm0,%xmm2
+	pxor	%xmm0,%xmm0
+	movdqa	%xmm1,(%esp)
+	paddq	%xmm1,%xmm1
+	pand	%xmm3,%xmm2
+	pcmpgtd	%xmm1,%xmm0
+	pxor	%xmm2,%xmm1
+	pshufd	$19,%xmm0,%xmm2
+	pxor	%xmm0,%xmm0
+	movdqa	%xmm1,16(%esp)
+	paddq	%xmm1,%xmm1
+	pand	%xmm3,%xmm2
+	pcmpgtd	%xmm1,%xmm0
+	pxor	%xmm2,%xmm1
+	pshufd	$19,%xmm0,%xmm2
+	pxor	%xmm0,%xmm0
+	movdqa	%xmm1,32(%esp)
+	paddq	%xmm1,%xmm1
+	pand	%xmm3,%xmm2
+	pcmpgtd	%xmm1,%xmm0
+	pxor	%xmm2,%xmm1
+	pshufd	$19,%xmm0,%xmm2
+	pxor	%xmm0,%xmm0
+	movdqa	%xmm1,48(%esp)
+	paddq	%xmm1,%xmm1
+	pand	%xmm3,%xmm2
+	pcmpgtd	%xmm1,%xmm0
+	pxor	%xmm2,%xmm1
+	pshufd	$19,%xmm0,%xmm7
+	movdqa	%xmm1,64(%esp)
+	paddq	%xmm1,%xmm1
+	movups	(%ebp),%xmm0
+	pand	%xmm3,%xmm7
+	movups	(%esi),%xmm2
+	pxor	%xmm1,%xmm7
+	movl	%ebx,%ecx
+	movdqu	16(%esi),%xmm3
+	xorps	%xmm0,%xmm2
+	movdqu	32(%esi),%xmm4
+	pxor	%xmm0,%xmm3
+	movdqu	48(%esi),%xmm5
+	pxor	%xmm0,%xmm4
+	movdqu	64(%esi),%xmm6
+	pxor	%xmm0,%xmm5
+	movdqu	80(%esi),%xmm1
+	pxor	%xmm0,%xmm6
+	leal	96(%esi),%esi
+	pxor	(%esp),%xmm2
+	movdqa	%xmm7,80(%esp)
+	pxor	%xmm1,%xmm7
+	movups	16(%ebp),%xmm1
+	pxor	16(%esp),%xmm3
+	pxor	32(%esp),%xmm4
+.byte	102,15,56,222,209
+	pxor	48(%esp),%xmm5
+	pxor	64(%esp),%xmm6
+.byte	102,15,56,222,217
+	pxor	%xmm0,%xmm7
+	movups	32(%ebp),%xmm0
+.byte	102,15,56,222,225
+.byte	102,15,56,222,233
+.byte	102,15,56,222,241
+.byte	102,15,56,222,249
+	call	.L_aesni_decrypt6_enter
+	movdqa	80(%esp),%xmm1
+	pxor	%xmm0,%xmm0
+	xorps	(%esp),%xmm2
+	pcmpgtd	%xmm1,%xmm0
+	xorps	16(%esp),%xmm3
+	movups	%xmm2,(%edi)
+	xorps	32(%esp),%xmm4
+	movups	%xmm3,16(%edi)
+	xorps	48(%esp),%xmm5
+	movups	%xmm4,32(%edi)
+	xorps	64(%esp),%xmm6
+	movups	%xmm5,48(%edi)
+	xorps	%xmm1,%xmm7
+	movups	%xmm6,64(%edi)
+	pshufd	$19,%xmm0,%xmm2
+	movups	%xmm7,80(%edi)
+	leal	96(%edi),%edi
+	movdqa	96(%esp),%xmm3
+	pxor	%xmm0,%xmm0
+	paddq	%xmm1,%xmm1
+	pand	%xmm3,%xmm2
+	pcmpgtd	%xmm1,%xmm0
+	pxor	%xmm2,%xmm1
+	subl	$96,%eax
+	jnc	.L063xts_dec_loop6
+	movl	240(%ebp),%ecx
+	movl	%ebp,%edx
+	movl	%ecx,%ebx
+.L062xts_dec_short:
+	addl	$96,%eax
+	jz	.L064xts_dec_done6x
+	movdqa	%xmm1,%xmm5
+	cmpl	$32,%eax
+	jb	.L065xts_dec_one
+	pshufd	$19,%xmm0,%xmm2
+	pxor	%xmm0,%xmm0
+	paddq	%xmm1,%xmm1
+	pand	%xmm3,%xmm2
+	pcmpgtd	%xmm1,%xmm0
+	pxor	%xmm2,%xmm1
+	je	.L066xts_dec_two
+	pshufd	$19,%xmm0,%xmm2
+	pxor	%xmm0,%xmm0
+	movdqa	%xmm1,%xmm6
+	paddq	%xmm1,%xmm1
+	pand	%xmm3,%xmm2
+	pcmpgtd	%xmm1,%xmm0
+	pxor	%xmm2,%xmm1
+	cmpl	$64,%eax
+	jb	.L067xts_dec_three
+	pshufd	$19,%xmm0,%xmm2
+	pxor	%xmm0,%xmm0
+	movdqa	%xmm1,%xmm7
+	paddq	%xmm1,%xmm1
+	pand	%xmm3,%xmm2
+	pcmpgtd	%xmm1,%xmm0
+	pxor	%xmm2,%xmm1
+	movdqa	%xmm5,(%esp)
+	movdqa	%xmm6,16(%esp)
+	je	.L068xts_dec_four
+	movdqa	%xmm7,32(%esp)
+	pshufd	$19,%xmm0,%xmm7
+	movdqa	%xmm1,48(%esp)
+	paddq	%xmm1,%xmm1
+	pand	%xmm3,%xmm7
+	pxor	%xmm1,%xmm7
+	movdqu	(%esi),%xmm2
+	movdqu	16(%esi),%xmm3
+	movdqu	32(%esi),%xmm4
+	pxor	(%esp),%xmm2
+	movdqu	48(%esi),%xmm5
+	pxor	16(%esp),%xmm3
+	movdqu	64(%esi),%xmm6
+	pxor	32(%esp),%xmm4
+	leal	80(%esi),%esi
+	pxor	48(%esp),%xmm5
+	movdqa	%xmm7,64(%esp)
+	pxor	%xmm7,%xmm6
+	call	_aesni_decrypt6
+	movaps	64(%esp),%xmm1
+	xorps	(%esp),%xmm2
+	xorps	16(%esp),%xmm3
+	xorps	32(%esp),%xmm4
+	movups	%xmm2,(%edi)
+	xorps	48(%esp),%xmm5
+	movups	%xmm3,16(%edi)
+	xorps	%xmm1,%xmm6
+	movups	%xmm4,32(%edi)
+	movups	%xmm5,48(%edi)
+	movups	%xmm6,64(%edi)
+	leal	80(%edi),%edi
+	jmp	.L069xts_dec_done
+.align	16
+.L065xts_dec_one:
+	movups	(%esi),%xmm2
+	leal	16(%esi),%esi
+	xorps	%xmm5,%xmm2
+	movups	(%edx),%xmm0
+	movups	16(%edx),%xmm1
+	leal	32(%edx),%edx
+	xorps	%xmm0,%xmm2
+.L070dec1_loop_12:
+.byte	102,15,56,222,209
+	decl	%ecx
+	movups	(%edx),%xmm1
+	leal	16(%edx),%edx
+	jnz	.L070dec1_loop_12
+.byte	102,15,56,223,209
+	xorps	%xmm5,%xmm2
+	movups	%xmm2,(%edi)
+	leal	16(%edi),%edi
+	movdqa	%xmm5,%xmm1
+	jmp	.L069xts_dec_done
+.align	16
+.L066xts_dec_two:
+	movaps	%xmm1,%xmm6
+	movups	(%esi),%xmm2
+	movups	16(%esi),%xmm3
+	leal	32(%esi),%esi
+	xorps	%xmm5,%xmm2
+	xorps	%xmm6,%xmm3
+	call	_aesni_decrypt2
+	xorps	%xmm5,%xmm2
+	xorps	%xmm6,%xmm3
+	movups	%xmm2,(%edi)
+	movups	%xmm3,16(%edi)
+	leal	32(%edi),%edi
+	movdqa	%xmm6,%xmm1
+	jmp	.L069xts_dec_done
+.align	16
+.L067xts_dec_three:
+	movaps	%xmm1,%xmm7
+	movups	(%esi),%xmm2
+	movups	16(%esi),%xmm3
+	movups	32(%esi),%xmm4
+	leal	48(%esi),%esi
+	xorps	%xmm5,%xmm2
+	xorps	%xmm6,%xmm3
+	xorps	%xmm7,%xmm4
+	call	_aesni_decrypt3
+	xorps	%xmm5,%xmm2
+	xorps	%xmm6,%xmm3
+	xorps	%xmm7,%xmm4
+	movups	%xmm2,(%edi)
+	movups	%xmm3,16(%edi)
+	movups	%xmm4,32(%edi)
+	leal	48(%edi),%edi
+	movdqa	%xmm7,%xmm1
+	jmp	.L069xts_dec_done
+.align	16
+.L068xts_dec_four:
+	movaps	%xmm1,%xmm6
+	movups	(%esi),%xmm2
+	movups	16(%esi),%xmm3
+	movups	32(%esi),%xmm4
+	xorps	(%esp),%xmm2
+	movups	48(%esi),%xmm5
+	leal	64(%esi),%esi
+	xorps	16(%esp),%xmm3
+	xorps	%xmm7,%xmm4
+	xorps	%xmm6,%xmm5
+	call	_aesni_decrypt4
+	xorps	(%esp),%xmm2
+	xorps	16(%esp),%xmm3
+	xorps	%xmm7,%xmm4
+	movups	%xmm2,(%edi)
+	xorps	%xmm6,%xmm5
+	movups	%xmm3,16(%edi)
+	movups	%xmm4,32(%edi)
+	movups	%xmm5,48(%edi)
+	leal	64(%edi),%edi
+	movdqa	%xmm6,%xmm1
+	jmp	.L069xts_dec_done
+.align	16
+.L064xts_dec_done6x:
+	movl	112(%esp),%eax
+	andl	$15,%eax
+	jz	.L071xts_dec_ret
+	movl	%eax,112(%esp)
+	jmp	.L072xts_dec_only_one_more
+.align	16
+.L069xts_dec_done:
+	movl	112(%esp),%eax
+	pxor	%xmm0,%xmm0
+	andl	$15,%eax
+	jz	.L071xts_dec_ret
+	pcmpgtd	%xmm1,%xmm0
+	movl	%eax,112(%esp)
+	pshufd	$19,%xmm0,%xmm2
+	pxor	%xmm0,%xmm0
+	movdqa	96(%esp),%xmm3
+	paddq	%xmm1,%xmm1
+	pand	%xmm3,%xmm2
+	pcmpgtd	%xmm1,%xmm0
+	pxor	%xmm2,%xmm1
+.L072xts_dec_only_one_more:
+	pshufd	$19,%xmm0,%xmm5
+	movdqa	%xmm1,%xmm6
+	paddq	%xmm1,%xmm1
+	pand	%xmm3,%xmm5
+	pxor	%xmm1,%xmm5
+	movl	%ebp,%edx
+	movl	%ebx,%ecx
+	movups	(%esi),%xmm2
+	xorps	%xmm5,%xmm2
+	movups	(%edx),%xmm0
+	movups	16(%edx),%xmm1
+	leal	32(%edx),%edx
+	xorps	%xmm0,%xmm2
+.L073dec1_loop_13:
+.byte	102,15,56,222,209
+	decl	%ecx
+	movups	(%edx),%xmm1
+	leal	16(%edx),%edx
+	jnz	.L073dec1_loop_13
+.byte	102,15,56,223,209
+	xorps	%xmm5,%xmm2
+	movups	%xmm2,(%edi)
+.L074xts_dec_steal:
+	movzbl	16(%esi),%ecx
+	movzbl	(%edi),%edx
+	leal	1(%esi),%esi
+	movb	%cl,(%edi)
+	movb	%dl,16(%edi)
+	leal	1(%edi),%edi
+	subl	$1,%eax
+	jnz	.L074xts_dec_steal
+	subl	112(%esp),%edi
+	movl	%ebp,%edx
+	movl	%ebx,%ecx
+	movups	(%edi),%xmm2
+	xorps	%xmm6,%xmm2
+	movups	(%edx),%xmm0
+	movups	16(%edx),%xmm1
+	leal	32(%edx),%edx
+	xorps	%xmm0,%xmm2
+.L075dec1_loop_14:
+.byte	102,15,56,222,209
+	decl	%ecx
+	movups	(%edx),%xmm1
+	leal	16(%edx),%edx
+	jnz	.L075dec1_loop_14
+.byte	102,15,56,223,209
+	xorps	%xmm6,%xmm2
+	movups	%xmm2,(%edi)
+.L071xts_dec_ret:
+	pxor	%xmm0,%xmm0
+	pxor	%xmm1,%xmm1
+	pxor	%xmm2,%xmm2
+	movdqa	%xmm0,(%esp)
+	pxor	%xmm3,%xmm3
+	movdqa	%xmm0,16(%esp)
+	pxor	%xmm4,%xmm4
+	movdqa	%xmm0,32(%esp)
+	pxor	%xmm5,%xmm5
+	movdqa	%xmm0,48(%esp)
+	pxor	%xmm6,%xmm6
+	movdqa	%xmm0,64(%esp)
+	pxor	%xmm7,%xmm7
+	movdqa	%xmm0,80(%esp)
+	movl	116(%esp),%esp
+	popl	%edi
+	popl	%esi
+	popl	%ebx
+	popl	%ebp
+	ret
+.size	aes_hw_xts_decrypt,.-.L_aes_hw_xts_decrypt_begin
+.globl	aes_hw_cbc_encrypt
+.hidden	aes_hw_cbc_encrypt
+.type	aes_hw_cbc_encrypt,@function
+.align	16
+aes_hw_cbc_encrypt:
+.L_aes_hw_cbc_encrypt_begin:
+	pushl	%ebp
+	pushl	%ebx
+	pushl	%esi
+	pushl	%edi
+	movl	20(%esp),%esi
+	movl	%esp,%ebx
+	movl	24(%esp),%edi
+	subl	$24,%ebx
+	movl	28(%esp),%eax
+	andl	$-16,%ebx
+	movl	32(%esp),%edx
+	movl	36(%esp),%ebp
+	testl	%eax,%eax
+	jz	.L076cbc_abort
+	cmpl	$0,40(%esp)
+	xchgl	%esp,%ebx
+	movups	(%ebp),%xmm7
+	movl	240(%edx),%ecx
+	movl	%edx,%ebp
+	movl	%ebx,16(%esp)
+	movl	%ecx,%ebx
+	je	.L077cbc_decrypt
+	movaps	%xmm7,%xmm2
+	cmpl	$16,%eax
+	jb	.L078cbc_enc_tail
+	subl	$16,%eax
+	jmp	.L079cbc_enc_loop
+.align	16
+.L079cbc_enc_loop:
+	movups	(%esi),%xmm7
+	leal	16(%esi),%esi
+	movups	(%edx),%xmm0
+	movups	16(%edx),%xmm1
+	xorps	%xmm0,%xmm7
+	leal	32(%edx),%edx
+	xorps	%xmm7,%xmm2
+.L080enc1_loop_15:
+.byte	102,15,56,220,209
+	decl	%ecx
+	movups	(%edx),%xmm1
+	leal	16(%edx),%edx
+	jnz	.L080enc1_loop_15
+.byte	102,15,56,221,209
+	movl	%ebx,%ecx
+	movl	%ebp,%edx
+	movups	%xmm2,(%edi)
+	leal	16(%edi),%edi
+	subl	$16,%eax
+	jnc	.L079cbc_enc_loop
+	addl	$16,%eax
+	jnz	.L078cbc_enc_tail
+	movaps	%xmm2,%xmm7
+	pxor	%xmm2,%xmm2
+	jmp	.L081cbc_ret
+.L078cbc_enc_tail:
+	movl	%eax,%ecx
+.long	2767451785
+	movl	$16,%ecx
+	subl	%eax,%ecx
+	xorl	%eax,%eax
+.long	2868115081
+	leal	-16(%edi),%edi
+	movl	%ebx,%ecx
+	movl	%edi,%esi
+	movl	%ebp,%edx
+	jmp	.L079cbc_enc_loop
+.align	16
+.L077cbc_decrypt:
+	cmpl	$80,%eax
+	jbe	.L082cbc_dec_tail
+	movaps	%xmm7,(%esp)
+	subl	$80,%eax
+	jmp	.L083cbc_dec_loop6_enter
+.align	16
+.L084cbc_dec_loop6:
+	movaps	%xmm0,(%esp)
+	movups	%xmm7,(%edi)
+	leal	16(%edi),%edi
+.L083cbc_dec_loop6_enter:
+	movdqu	(%esi),%xmm2
+	movdqu	16(%esi),%xmm3
+	movdqu	32(%esi),%xmm4
+	movdqu	48(%esi),%xmm5
+	movdqu	64(%esi),%xmm6
+	movdqu	80(%esi),%xmm7
+	call	_aesni_decrypt6
+	movups	(%esi),%xmm1
+	movups	16(%esi),%xmm0
+	xorps	(%esp),%xmm2
+	xorps	%xmm1,%xmm3
+	movups	32(%esi),%xmm1
+	xorps	%xmm0,%xmm4
+	movups	48(%esi),%xmm0
+	xorps	%xmm1,%xmm5
+	movups	64(%esi),%xmm1
+	xorps	%xmm0,%xmm6
+	movups	80(%esi),%xmm0
+	xorps	%xmm1,%xmm7
+	movups	%xmm2,(%edi)
+	movups	%xmm3,16(%edi)
+	leal	96(%esi),%esi
+	movups	%xmm4,32(%edi)
+	movl	%ebx,%ecx
+	movups	%xmm5,48(%edi)
+	movl	%ebp,%edx
+	movups	%xmm6,64(%edi)
+	leal	80(%edi),%edi
+	subl	$96,%eax
+	ja	.L084cbc_dec_loop6
+	movaps	%xmm7,%xmm2
+	movaps	%xmm0,%xmm7
+	addl	$80,%eax
+	jle	.L085cbc_dec_clear_tail_collected
+	movups	%xmm2,(%edi)
+	leal	16(%edi),%edi
+.L082cbc_dec_tail:
+	movups	(%esi),%xmm2
+	movaps	%xmm2,%xmm6
+	cmpl	$16,%eax
+	jbe	.L086cbc_dec_one
+	movups	16(%esi),%xmm3
+	movaps	%xmm3,%xmm5
+	cmpl	$32,%eax
+	jbe	.L087cbc_dec_two
+	movups	32(%esi),%xmm4
+	cmpl	$48,%eax
+	jbe	.L088cbc_dec_three
+	movups	48(%esi),%xmm5
+	cmpl	$64,%eax
+	jbe	.L089cbc_dec_four
+	movups	64(%esi),%xmm6
+	movaps	%xmm7,(%esp)
+	movups	(%esi),%xmm2
+	xorps	%xmm7,%xmm7
+	call	_aesni_decrypt6
+	movups	(%esi),%xmm1
+	movups	16(%esi),%xmm0
+	xorps	(%esp),%xmm2
+	xorps	%xmm1,%xmm3
+	movups	32(%esi),%xmm1
+	xorps	%xmm0,%xmm4
+	movups	48(%esi),%xmm0
+	xorps	%xmm1,%xmm5
+	movups	64(%esi),%xmm7
+	xorps	%xmm0,%xmm6
+	movups	%xmm2,(%edi)
+	movups	%xmm3,16(%edi)
+	pxor	%xmm3,%xmm3
+	movups	%xmm4,32(%edi)
+	pxor	%xmm4,%xmm4
+	movups	%xmm5,48(%edi)
+	pxor	%xmm5,%xmm5
+	leal	64(%edi),%edi
+	movaps	%xmm6,%xmm2
+	pxor	%xmm6,%xmm6
+	subl	$80,%eax
+	jmp	.L090cbc_dec_tail_collected
+.align	16
+.L086cbc_dec_one:
+	movups	(%edx),%xmm0
+	movups	16(%edx),%xmm1
+	leal	32(%edx),%edx
+	xorps	%xmm0,%xmm2
+.L091dec1_loop_16:
+.byte	102,15,56,222,209
+	decl	%ecx
+	movups	(%edx),%xmm1
+	leal	16(%edx),%edx
+	jnz	.L091dec1_loop_16
+.byte	102,15,56,223,209
+	xorps	%xmm7,%xmm2
+	movaps	%xmm6,%xmm7
+	subl	$16,%eax
+	jmp	.L090cbc_dec_tail_collected
+.align	16
+.L087cbc_dec_two:
+	call	_aesni_decrypt2
+	xorps	%xmm7,%xmm2
+	xorps	%xmm6,%xmm3
+	movups	%xmm2,(%edi)
+	movaps	%xmm3,%xmm2
+	pxor	%xmm3,%xmm3
+	leal	16(%edi),%edi
+	movaps	%xmm5,%xmm7
+	subl	$32,%eax
+	jmp	.L090cbc_dec_tail_collected
+.align	16
+.L088cbc_dec_three:
+	call	_aesni_decrypt3
+	xorps	%xmm7,%xmm2
+	xorps	%xmm6,%xmm3
+	xorps	%xmm5,%xmm4
+	movups	%xmm2,(%edi)
+	movaps	%xmm4,%xmm2
+	pxor	%xmm4,%xmm4
+	movups	%xmm3,16(%edi)
+	pxor	%xmm3,%xmm3
+	leal	32(%edi),%edi
+	movups	32(%esi),%xmm7
+	subl	$48,%eax
+	jmp	.L090cbc_dec_tail_collected
+.align	16
+.L089cbc_dec_four:
+	call	_aesni_decrypt4
+	movups	16(%esi),%xmm1
+	movups	32(%esi),%xmm0
+	xorps	%xmm7,%xmm2
+	movups	48(%esi),%xmm7
+	xorps	%xmm6,%xmm3
+	movups	%xmm2,(%edi)
+	xorps	%xmm1,%xmm4
+	movups	%xmm3,16(%edi)
+	pxor	%xmm3,%xmm3
+	xorps	%xmm0,%xmm5
+	movups	%xmm4,32(%edi)
+	pxor	%xmm4,%xmm4
+	leal	48(%edi),%edi
+	movaps	%xmm5,%xmm2
+	pxor	%xmm5,%xmm5
+	subl	$64,%eax
+	jmp	.L090cbc_dec_tail_collected
+.align	16
+.L085cbc_dec_clear_tail_collected:
+	pxor	%xmm3,%xmm3
+	pxor	%xmm4,%xmm4
+	pxor	%xmm5,%xmm5
+	pxor	%xmm6,%xmm6
+.L090cbc_dec_tail_collected:
+	andl	$15,%eax
+	jnz	.L092cbc_dec_tail_partial
+	movups	%xmm2,(%edi)
+	pxor	%xmm0,%xmm0
+	jmp	.L081cbc_ret
+.align	16
+.L092cbc_dec_tail_partial:
+	movaps	%xmm2,(%esp)
+	pxor	%xmm0,%xmm0
+	movl	$16,%ecx
+	movl	%esp,%esi
+	subl	%eax,%ecx
+.long	2767451785
+	movdqa	%xmm2,(%esp)
+.L081cbc_ret:
+	movl	16(%esp),%esp
+	movl	36(%esp),%ebp
+	pxor	%xmm2,%xmm2
+	pxor	%xmm1,%xmm1
+	movups	%xmm7,(%ebp)
+	pxor	%xmm7,%xmm7
+.L076cbc_abort:
+	popl	%edi
+	popl	%esi
+	popl	%ebx
+	popl	%ebp
+	ret
+.size	aes_hw_cbc_encrypt,.-.L_aes_hw_cbc_encrypt_begin
+.hidden	_aesni_set_encrypt_key
+.type	_aesni_set_encrypt_key,@function
+.align	16
+_aesni_set_encrypt_key:
+	pushl	%ebp
+	pushl	%ebx
+	testl	%eax,%eax
+	jz	.L093bad_pointer
+	testl	%edx,%edx
+	jz	.L093bad_pointer
+	call	.L094pic
+.L094pic:
+	popl	%ebx
+	leal	.Lkey_const-.L094pic(%ebx),%ebx
+	leal	OPENSSL_ia32cap_P-.Lkey_const(%ebx),%ebp
+	movups	(%eax),%xmm0
+	xorps	%xmm4,%xmm4
+	movl	4(%ebp),%ebp
+	leal	16(%edx),%edx
+	andl	$268437504,%ebp
+	cmpl	$256,%ecx
+	je	.L09514rounds
+	cmpl	$192,%ecx
+	je	.L09612rounds
+	cmpl	$128,%ecx
+	jne	.L097bad_keybits
+.align	16
+.L09810rounds:
+	cmpl	$268435456,%ebp
+	je	.L09910rounds_alt
+	movl	$9,%ecx
+	movups	%xmm0,-16(%edx)
+.byte	102,15,58,223,200,1
+	call	.L100key_128_cold
+.byte	102,15,58,223,200,2
+	call	.L101key_128
+.byte	102,15,58,223,200,4
+	call	.L101key_128
+.byte	102,15,58,223,200,8
+	call	.L101key_128
+.byte	102,15,58,223,200,16
+	call	.L101key_128
+.byte	102,15,58,223,200,32
+	call	.L101key_128
+.byte	102,15,58,223,200,64
+	call	.L101key_128
+.byte	102,15,58,223,200,128
+	call	.L101key_128
+.byte	102,15,58,223,200,27
+	call	.L101key_128
+.byte	102,15,58,223,200,54
+	call	.L101key_128
+	movups	%xmm0,(%edx)
+	movl	%ecx,80(%edx)
+	jmp	.L102good_key
+.align	16
+.L101key_128:
+	movups	%xmm0,(%edx)
+	leal	16(%edx),%edx
+.L100key_128_cold:
+	shufps	$16,%xmm0,%xmm4
+	xorps	%xmm4,%xmm0
+	shufps	$140,%xmm0,%xmm4
+	xorps	%xmm4,%xmm0
+	shufps	$255,%xmm1,%xmm1
+	xorps	%xmm1,%xmm0
+	ret
+.align	16
+.L09910rounds_alt:
+	movdqa	(%ebx),%xmm5
+	movl	$8,%ecx
+	movdqa	32(%ebx),%xmm4
+	movdqa	%xmm0,%xmm2
+	movdqu	%xmm0,-16(%edx)
+.L103loop_key128:
+.byte	102,15,56,0,197
+.byte	102,15,56,221,196
+	pslld	$1,%xmm4
+	leal	16(%edx),%edx
+	movdqa	%xmm2,%xmm3
+	pslldq	$4,%xmm2
+	pxor	%xmm2,%xmm3
+	pslldq	$4,%xmm2
+	pxor	%xmm2,%xmm3
+	pslldq	$4,%xmm2
+	pxor	%xmm3,%xmm2
+	pxor	%xmm2,%xmm0
+	movdqu	%xmm0,-16(%edx)
+	movdqa	%xmm0,%xmm2
+	decl	%ecx
+	jnz	.L103loop_key128
+	movdqa	48(%ebx),%xmm4
+.byte	102,15,56,0,197
+.byte	102,15,56,221,196
+	pslld	$1,%xmm4
+	movdqa	%xmm2,%xmm3
+	pslldq	$4,%xmm2
+	pxor	%xmm2,%xmm3
+	pslldq	$4,%xmm2
+	pxor	%xmm2,%xmm3
+	pslldq	$4,%xmm2
+	pxor	%xmm3,%xmm2
+	pxor	%xmm2,%xmm0
+	movdqu	%xmm0,(%edx)
+	movdqa	%xmm0,%xmm2
+.byte	102,15,56,0,197
+.byte	102,15,56,221,196
+	movdqa	%xmm2,%xmm3
+	pslldq	$4,%xmm2
+	pxor	%xmm2,%xmm3
+	pslldq	$4,%xmm2
+	pxor	%xmm2,%xmm3
+	pslldq	$4,%xmm2
+	pxor	%xmm3,%xmm2
+	pxor	%xmm2,%xmm0
+	movdqu	%xmm0,16(%edx)
+	movl	$9,%ecx
+	movl	%ecx,96(%edx)
+	jmp	.L102good_key
+.align	16
+.L09612rounds:
+	movq	16(%eax),%xmm2
+	cmpl	$268435456,%ebp
+	je	.L10412rounds_alt
+	movl	$11,%ecx
+	movups	%xmm0,-16(%edx)
+.byte	102,15,58,223,202,1
+	call	.L105key_192a_cold
+.byte	102,15,58,223,202,2
+	call	.L106key_192b
+.byte	102,15,58,223,202,4
+	call	.L107key_192a
+.byte	102,15,58,223,202,8
+	call	.L106key_192b
+.byte	102,15,58,223,202,16
+	call	.L107key_192a
+.byte	102,15,58,223,202,32
+	call	.L106key_192b
+.byte	102,15,58,223,202,64
+	call	.L107key_192a
+.byte	102,15,58,223,202,128
+	call	.L106key_192b
+	movups	%xmm0,(%edx)
+	movl	%ecx,48(%edx)
+	jmp	.L102good_key
+.align	16
+.L107key_192a:
+	movups	%xmm0,(%edx)
+	leal	16(%edx),%edx
+.align	16
+.L105key_192a_cold:
+	movaps	%xmm2,%xmm5
+.L108key_192b_warm:
+	shufps	$16,%xmm0,%xmm4
+	movdqa	%xmm2,%xmm3
+	xorps	%xmm4,%xmm0
+	shufps	$140,%xmm0,%xmm4
+	pslldq	$4,%xmm3
+	xorps	%xmm4,%xmm0
+	pshufd	$85,%xmm1,%xmm1
+	pxor	%xmm3,%xmm2
+	pxor	%xmm1,%xmm0
+	pshufd	$255,%xmm0,%xmm3
+	pxor	%xmm3,%xmm2
+	ret
+.align	16
+.L106key_192b:
+	movaps	%xmm0,%xmm3
+	shufps	$68,%xmm0,%xmm5
+	movups	%xmm5,(%edx)
+	shufps	$78,%xmm2,%xmm3
+	movups	%xmm3,16(%edx)
+	leal	32(%edx),%edx
+	jmp	.L108key_192b_warm
+.align	16
+.L10412rounds_alt:
+	movdqa	16(%ebx),%xmm5
+	movdqa	32(%ebx),%xmm4
+	movl	$8,%ecx
+	movdqu	%xmm0,-16(%edx)
+.L109loop_key192:
+	movq	%xmm2,(%edx)
+	movdqa	%xmm2,%xmm1
+.byte	102,15,56,0,213
+.byte	102,15,56,221,212
+	pslld	$1,%xmm4
+	leal	24(%edx),%edx
+	movdqa	%xmm0,%xmm3
+	pslldq	$4,%xmm0
+	pxor	%xmm0,%xmm3
+	pslldq	$4,%xmm0
+	pxor	%xmm0,%xmm3
+	pslldq	$4,%xmm0
+	pxor	%xmm3,%xmm0
+	pshufd	$255,%xmm0,%xmm3
+	pxor	%xmm1,%xmm3
+	pslldq	$4,%xmm1
+	pxor	%xmm1,%xmm3
+	pxor	%xmm2,%xmm0
+	pxor	%xmm3,%xmm2
+	movdqu	%xmm0,-16(%edx)
+	decl	%ecx
+	jnz	.L109loop_key192
+	movl	$11,%ecx
+	movl	%ecx,32(%edx)
+	jmp	.L102good_key
+.align	16
+.L09514rounds:
+	movups	16(%eax),%xmm2
+	leal	16(%edx),%edx
+	cmpl	$268435456,%ebp
+	je	.L11014rounds_alt
+	movl	$13,%ecx
+	movups	%xmm0,-32(%edx)
+	movups	%xmm2,-16(%edx)
+.byte	102,15,58,223,202,1
+	call	.L111key_256a_cold
+.byte	102,15,58,223,200,1
+	call	.L112key_256b
+.byte	102,15,58,223,202,2
+	call	.L113key_256a
+.byte	102,15,58,223,200,2
+	call	.L112key_256b
+.byte	102,15,58,223,202,4
+	call	.L113key_256a
+.byte	102,15,58,223,200,4
+	call	.L112key_256b
+.byte	102,15,58,223,202,8
+	call	.L113key_256a
+.byte	102,15,58,223,200,8
+	call	.L112key_256b
+.byte	102,15,58,223,202,16
+	call	.L113key_256a
+.byte	102,15,58,223,200,16
+	call	.L112key_256b
+.byte	102,15,58,223,202,32
+	call	.L113key_256a
+.byte	102,15,58,223,200,32
+	call	.L112key_256b
+.byte	102,15,58,223,202,64
+	call	.L113key_256a
+	movups	%xmm0,(%edx)
+	movl	%ecx,16(%edx)
+	xorl	%eax,%eax
+	jmp	.L102good_key
+.align	16
+.L113key_256a:
+	movups	%xmm2,(%edx)
+	leal	16(%edx),%edx
+.L111key_256a_cold:
+	shufps	$16,%xmm0,%xmm4
+	xorps	%xmm4,%xmm0
+	shufps	$140,%xmm0,%xmm4
+	xorps	%xmm4,%xmm0
+	shufps	$255,%xmm1,%xmm1
+	xorps	%xmm1,%xmm0
+	ret
+.align	16
+.L112key_256b:
+	movups	%xmm0,(%edx)
+	leal	16(%edx),%edx
+	shufps	$16,%xmm2,%xmm4
+	xorps	%xmm4,%xmm2
+	shufps	$140,%xmm2,%xmm4
+	xorps	%xmm4,%xmm2
+	shufps	$170,%xmm1,%xmm1
+	xorps	%xmm1,%xmm2
+	ret
+.align	16
+.L11014rounds_alt:
+	movdqa	(%ebx),%xmm5
+	movdqa	32(%ebx),%xmm4
+	movl	$7,%ecx
+	movdqu	%xmm0,-32(%edx)
+	movdqa	%xmm2,%xmm1
+	movdqu	%xmm2,-16(%edx)
+.L114loop_key256:
+.byte	102,15,56,0,213
+.byte	102,15,56,221,212
+	movdqa	%xmm0,%xmm3
+	pslldq	$4,%xmm0
+	pxor	%xmm0,%xmm3
+	pslldq	$4,%xmm0
+	pxor	%xmm0,%xmm3
+	pslldq	$4,%xmm0
+	pxor	%xmm3,%xmm0
+	pslld	$1,%xmm4
+	pxor	%xmm2,%xmm0
+	movdqu	%xmm0,(%edx)
+	decl	%ecx
+	jz	.L115done_key256
+	pshufd	$255,%xmm0,%xmm2
+	pxor	%xmm3,%xmm3
+.byte	102,15,56,221,211
+	movdqa	%xmm1,%xmm3
+	pslldq	$4,%xmm1
+	pxor	%xmm1,%xmm3
+	pslldq	$4,%xmm1
+	pxor	%xmm1,%xmm3
+	pslldq	$4,%xmm1
+	pxor	%xmm3,%xmm1
+	pxor	%xmm1,%xmm2
+	movdqu	%xmm2,16(%edx)
+	leal	32(%edx),%edx
+	movdqa	%xmm2,%xmm1
+	jmp	.L114loop_key256
+.L115done_key256:
+	movl	$13,%ecx
+	movl	%ecx,16(%edx)
+.L102good_key:
+	pxor	%xmm0,%xmm0
+	pxor	%xmm1,%xmm1
+	pxor	%xmm2,%xmm2
+	pxor	%xmm3,%xmm3
+	pxor	%xmm4,%xmm4
+	pxor	%xmm5,%xmm5
+	xorl	%eax,%eax
+	popl	%ebx
+	popl	%ebp
+	ret
+.align	4
+.L093bad_pointer:
+	movl	$-1,%eax
+	popl	%ebx
+	popl	%ebp
+	ret
+.align	4
+.L097bad_keybits:
+	pxor	%xmm0,%xmm0
+	movl	$-2,%eax
+	popl	%ebx
+	popl	%ebp
+	ret
+.size	_aesni_set_encrypt_key,.-_aesni_set_encrypt_key
+.globl	aes_hw_set_encrypt_key
+.hidden	aes_hw_set_encrypt_key
+.type	aes_hw_set_encrypt_key,@function
+.align	16
+aes_hw_set_encrypt_key:
+.L_aes_hw_set_encrypt_key_begin:
+#ifdef BORINGSSL_DISPATCH_TEST
+	pushl	%ebx
+	pushl	%edx
+	call	.L116pic
+.L116pic:
+	popl	%ebx
+	leal	BORINGSSL_function_hit+3-.L116pic(%ebx),%ebx
+	movl	$1,%edx
+	movb	%dl,(%ebx)
+	popl	%edx
+	popl	%ebx
+#endif
+	movl	4(%esp),%eax
+	movl	8(%esp),%ecx
+	movl	12(%esp),%edx
+	call	_aesni_set_encrypt_key
+	ret
+.size	aes_hw_set_encrypt_key,.-.L_aes_hw_set_encrypt_key_begin
+.globl	aes_hw_set_decrypt_key
+.hidden	aes_hw_set_decrypt_key
+.type	aes_hw_set_decrypt_key,@function
+.align	16
+aes_hw_set_decrypt_key:
+.L_aes_hw_set_decrypt_key_begin:
+	movl	4(%esp),%eax
+	movl	8(%esp),%ecx
+	movl	12(%esp),%edx
+	call	_aesni_set_encrypt_key
+	movl	12(%esp),%edx
+	shll	$4,%ecx
+	testl	%eax,%eax
+	jnz	.L117dec_key_ret
+	leal	16(%edx,%ecx,1),%eax
+	movups	(%edx),%xmm0
+	movups	(%eax),%xmm1
+	movups	%xmm0,(%eax)
+	movups	%xmm1,(%edx)
+	leal	16(%edx),%edx
+	leal	-16(%eax),%eax
+.L118dec_key_inverse:
+	movups	(%edx),%xmm0
+	movups	(%eax),%xmm1
+.byte	102,15,56,219,192
+.byte	102,15,56,219,201
+	leal	16(%edx),%edx
+	leal	-16(%eax),%eax
+	movups	%xmm0,16(%eax)
+	movups	%xmm1,-16(%edx)
+	cmpl	%edx,%eax
+	ja	.L118dec_key_inverse
+	movups	(%edx),%xmm0
+.byte	102,15,56,219,192
+	movups	%xmm0,(%edx)
+	pxor	%xmm0,%xmm0
+	pxor	%xmm1,%xmm1
+	xorl	%eax,%eax
+.L117dec_key_ret:
+	ret
+.size	aes_hw_set_decrypt_key,.-.L_aes_hw_set_decrypt_key_begin
+.align	64
+.Lkey_const:
+.long	202313229,202313229,202313229,202313229
+.long	67569157,67569157,67569157,67569157
+.long	1,1,1,1
+.long	27,27,27,27
+.byte	65,69,83,32,102,111,114,32,73,110,116,101,108,32,65,69
+.byte	83,45,78,73,44,32,67,82,89,80,84,79,71,65,77,83
+.byte	32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115
+.byte	115,108,46,111,114,103,62,0
+#endif
+.section	.note.GNU-stack,"",@progbits
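
In the hunk above, aes_hw_set_decrypt_key derives the decryption schedule by calling _aesni_set_encrypt_key, swapping the outermost round keys, and running every interior round key through AESIMC (the .byte 102,15,58,... and 102,15,56,219,... encodings are AESKEYGENASSIST and AESIMC). A minimal C-intrinsics sketch of that last transformation, assuming the schedule is stored as rounds+1 contiguous 16-byte round keys; the helper name is illustrative, not BoringSSL's, and this is a readability aid rather than the library's code (compile with -maes):

#include <stdint.h>
#include <wmmintrin.h>   /* AES-NI intrinsics: _mm_aesimc_si128 */

/* Sketch: turn an encryption key schedule (rounds+1 round keys of 16 bytes)
 * into a decryption schedule the way the assembly does it: reverse the order
 * of the round keys, leave the two outermost ones untouched, and apply
 * AESIMC (InvMixColumns) to every interior round key. */
static void invert_key_schedule(__m128i *rk, int rounds) {
    for (int i = 0, j = rounds; i < j; i++, j--) {
        __m128i t = rk[i];
        rk[i] = rk[j];
        rk[j] = t;
    }
    for (int i = 1; i < rounds; i++) {
        rk[i] = _mm_aesimc_si128(rk[i]);
    }
}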
--- /dev/null
+++ b/third_party/boringssl/linux-x86/crypto/fipsmodule/bn-586.S
@@ -1,0 +1,997 @@
+// This file is generated from a similarly-named Perl script in the BoringSSL
+// source tree. Do not edit by hand.
+
+#if defined(__i386__)
+#if defined(BORINGSSL_PREFIX)
+#include <boringssl_prefix_symbols_asm.h>
+#endif
+.text
+.globl	bn_mul_add_words
+.hidden	bn_mul_add_words
+.type	bn_mul_add_words,@function
+.align	16
+bn_mul_add_words:
+.L_bn_mul_add_words_begin:
+	call	.L000PIC_me_up
+.L000PIC_me_up:
+	popl	%eax
+	leal	OPENSSL_ia32cap_P-.L000PIC_me_up(%eax),%eax
+	btl	$26,(%eax)
+	jnc	.L001maw_non_sse2
+	movl	4(%esp),%eax
+	movl	8(%esp),%edx
+	movl	12(%esp),%ecx
+	movd	16(%esp),%mm0
+	pxor	%mm1,%mm1
+	jmp	.L002maw_sse2_entry
+.align	16
+.L003maw_sse2_unrolled:
+	movd	(%eax),%mm3
+	paddq	%mm3,%mm1
+	movd	(%edx),%mm2
+	pmuludq	%mm0,%mm2
+	movd	4(%edx),%mm4
+	pmuludq	%mm0,%mm4
+	movd	8(%edx),%mm6
+	pmuludq	%mm0,%mm6
+	movd	12(%edx),%mm7
+	pmuludq	%mm0,%mm7
+	paddq	%mm2,%mm1
+	movd	4(%eax),%mm3
+	paddq	%mm4,%mm3
+	movd	8(%eax),%mm5
+	paddq	%mm6,%mm5
+	movd	12(%eax),%mm4
+	paddq	%mm4,%mm7
+	movd	%mm1,(%eax)
+	movd	16(%edx),%mm2
+	pmuludq	%mm0,%mm2
+	psrlq	$32,%mm1
+	movd	20(%edx),%mm4
+	pmuludq	%mm0,%mm4
+	paddq	%mm3,%mm1
+	movd	24(%edx),%mm6
+	pmuludq	%mm0,%mm6
+	movd	%mm1,4(%eax)
+	psrlq	$32,%mm1
+	movd	28(%edx),%mm3
+	addl	$32,%edx
+	pmuludq	%mm0,%mm3
+	paddq	%mm5,%mm1
+	movd	16(%eax),%mm5
+	paddq	%mm5,%mm2
+	movd	%mm1,8(%eax)
+	psrlq	$32,%mm1
+	paddq	%mm7,%mm1
+	movd	20(%eax),%mm5
+	paddq	%mm5,%mm4
+	movd	%mm1,12(%eax)
+	psrlq	$32,%mm1
+	paddq	%mm2,%mm1
+	movd	24(%eax),%mm5
+	paddq	%mm5,%mm6
+	movd	%mm1,16(%eax)
+	psrlq	$32,%mm1
+	paddq	%mm4,%mm1
+	movd	28(%eax),%mm5
+	paddq	%mm5,%mm3
+	movd	%mm1,20(%eax)
+	psrlq	$32,%mm1
+	paddq	%mm6,%mm1
+	movd	%mm1,24(%eax)
+	psrlq	$32,%mm1
+	paddq	%mm3,%mm1
+	movd	%mm1,28(%eax)
+	leal	32(%eax),%eax
+	psrlq	$32,%mm1
+	subl	$8,%ecx
+	jz	.L004maw_sse2_exit
+.L002maw_sse2_entry:
+	testl	$4294967288,%ecx
+	jnz	.L003maw_sse2_unrolled
+.align	4
+.L005maw_sse2_loop:
+	movd	(%edx),%mm2
+	movd	(%eax),%mm3
+	pmuludq	%mm0,%mm2
+	leal	4(%edx),%edx
+	paddq	%mm3,%mm1
+	paddq	%mm2,%mm1
+	movd	%mm1,(%eax)
+	subl	$1,%ecx
+	psrlq	$32,%mm1
+	leal	4(%eax),%eax
+	jnz	.L005maw_sse2_loop
+.L004maw_sse2_exit:
+	movd	%mm1,%eax
+	emms
+	ret
+.align	16
+.L001maw_non_sse2:
+	pushl	%ebp
+	pushl	%ebx
+	pushl	%esi
+	pushl	%edi
+
+	xorl	%esi,%esi
+	movl	20(%esp),%edi
+	movl	28(%esp),%ecx
+	movl	24(%esp),%ebx
+	andl	$4294967288,%ecx
+	movl	32(%esp),%ebp
+	pushl	%ecx
+	jz	.L006maw_finish
+.align	16
+.L007maw_loop:
+
+	movl	(%ebx),%eax
+	mull	%ebp
+	addl	%esi,%eax
+	adcl	$0,%edx
+	addl	(%edi),%eax
+	adcl	$0,%edx
+	movl	%eax,(%edi)
+	movl	%edx,%esi
+
+	movl	4(%ebx),%eax
+	mull	%ebp
+	addl	%esi,%eax
+	adcl	$0,%edx
+	addl	4(%edi),%eax
+	adcl	$0,%edx
+	movl	%eax,4(%edi)
+	movl	%edx,%esi
+
+	movl	8(%ebx),%eax
+	mull	%ebp
+	addl	%esi,%eax
+	adcl	$0,%edx
+	addl	8(%edi),%eax
+	adcl	$0,%edx
+	movl	%eax,8(%edi)
+	movl	%edx,%esi
+
+	movl	12(%ebx),%eax
+	mull	%ebp
+	addl	%esi,%eax
+	adcl	$0,%edx
+	addl	12(%edi),%eax
+	adcl	$0,%edx
+	movl	%eax,12(%edi)
+	movl	%edx,%esi
+
+	movl	16(%ebx),%eax
+	mull	%ebp
+	addl	%esi,%eax
+	adcl	$0,%edx
+	addl	16(%edi),%eax
+	adcl	$0,%edx
+	movl	%eax,16(%edi)
+	movl	%edx,%esi
+
+	movl	20(%ebx),%eax
+	mull	%ebp
+	addl	%esi,%eax
+	adcl	$0,%edx
+	addl	20(%edi),%eax
+	adcl	$0,%edx
+	movl	%eax,20(%edi)
+	movl	%edx,%esi
+
+	movl	24(%ebx),%eax
+	mull	%ebp
+	addl	%esi,%eax
+	adcl	$0,%edx
+	addl	24(%edi),%eax
+	adcl	$0,%edx
+	movl	%eax,24(%edi)
+	movl	%edx,%esi
+
+	movl	28(%ebx),%eax
+	mull	%ebp
+	addl	%esi,%eax
+	adcl	$0,%edx
+	addl	28(%edi),%eax
+	adcl	$0,%edx
+	movl	%eax,28(%edi)
+	movl	%edx,%esi
+
+	subl	$8,%ecx
+	leal	32(%ebx),%ebx
+	leal	32(%edi),%edi
+	jnz	.L007maw_loop
+.L006maw_finish:
+	movl	32(%esp),%ecx
+	andl	$7,%ecx
+	jnz	.L008maw_finish2
+	jmp	.L009maw_end
+.L008maw_finish2:
+
+	movl	(%ebx),%eax
+	mull	%ebp
+	addl	%esi,%eax
+	adcl	$0,%edx
+	addl	(%edi),%eax
+	adcl	$0,%edx
+	decl	%ecx
+	movl	%eax,(%edi)
+	movl	%edx,%esi
+	jz	.L009maw_end
+
+	movl	4(%ebx),%eax
+	mull	%ebp
+	addl	%esi,%eax
+	adcl	$0,%edx
+	addl	4(%edi),%eax
+	adcl	$0,%edx
+	decl	%ecx
+	movl	%eax,4(%edi)
+	movl	%edx,%esi
+	jz	.L009maw_end
+
+	movl	8(%ebx),%eax
+	mull	%ebp
+	addl	%esi,%eax
+	adcl	$0,%edx
+	addl	8(%edi),%eax
+	adcl	$0,%edx
+	decl	%ecx
+	movl	%eax,8(%edi)
+	movl	%edx,%esi
+	jz	.L009maw_end
+
+	movl	12(%ebx),%eax
+	mull	%ebp
+	addl	%esi,%eax
+	adcl	$0,%edx
+	addl	12(%edi),%eax
+	adcl	$0,%edx
+	decl	%ecx
+	movl	%eax,12(%edi)
+	movl	%edx,%esi
+	jz	.L009maw_end
+
+	movl	16(%ebx),%eax
+	mull	%ebp
+	addl	%esi,%eax
+	adcl	$0,%edx
+	addl	16(%edi),%eax
+	adcl	$0,%edx
+	decl	%ecx
+	movl	%eax,16(%edi)
+	movl	%edx,%esi
+	jz	.L009maw_end
+
+	movl	20(%ebx),%eax
+	mull	%ebp
+	addl	%esi,%eax
+	adcl	$0,%edx
+	addl	20(%edi),%eax
+	adcl	$0,%edx
+	decl	%ecx
+	movl	%eax,20(%edi)
+	movl	%edx,%esi
+	jz	.L009maw_end
+
+	movl	24(%ebx),%eax
+	mull	%ebp
+	addl	%esi,%eax
+	adcl	$0,%edx
+	addl	24(%edi),%eax
+	adcl	$0,%edx
+	movl	%eax,24(%edi)
+	movl	%edx,%esi
+.L009maw_end:
+	movl	%esi,%eax
+	popl	%ecx
+	popl	%edi
+	popl	%esi
+	popl	%ebx
+	popl	%ebp
+	ret
+.size	bn_mul_add_words,.-.L_bn_mul_add_words_begin
+.globl	bn_mul_words
+.hidden	bn_mul_words
+.type	bn_mul_words,@function
+.align	16
+bn_mul_words:
+.L_bn_mul_words_begin:
+	call	.L010PIC_me_up
+.L010PIC_me_up:
+	popl	%eax
+	leal	OPENSSL_ia32cap_P-.L010PIC_me_up(%eax),%eax
+	btl	$26,(%eax)
+	jnc	.L011mw_non_sse2
+	movl	4(%esp),%eax
+	movl	8(%esp),%edx
+	movl	12(%esp),%ecx
+	movd	16(%esp),%mm0
+	pxor	%mm1,%mm1
+.align	16
+.L012mw_sse2_loop:
+	movd	(%edx),%mm2
+	pmuludq	%mm0,%mm2
+	leal	4(%edx),%edx
+	paddq	%mm2,%mm1
+	movd	%mm1,(%eax)
+	subl	$1,%ecx
+	psrlq	$32,%mm1
+	leal	4(%eax),%eax
+	jnz	.L012mw_sse2_loop
+	movd	%mm1,%eax
+	emms
+	ret
+.align	16
+.L011mw_non_sse2:
+	pushl	%ebp
+	pushl	%ebx
+	pushl	%esi
+	pushl	%edi
+
+	xorl	%esi,%esi
+	movl	20(%esp),%edi
+	movl	24(%esp),%ebx
+	movl	28(%esp),%ebp
+	movl	32(%esp),%ecx
+	andl	$4294967288,%ebp
+	jz	.L013mw_finish
+.L014mw_loop:
+
+	movl	(%ebx),%eax
+	mull	%ecx
+	addl	%esi,%eax
+	adcl	$0,%edx
+	movl	%eax,(%edi)
+	movl	%edx,%esi
+
+	movl	4(%ebx),%eax
+	mull	%ecx
+	addl	%esi,%eax
+	adcl	$0,%edx
+	movl	%eax,4(%edi)
+	movl	%edx,%esi
+
+	movl	8(%ebx),%eax
+	mull	%ecx
+	addl	%esi,%eax
+	adcl	$0,%edx
+	movl	%eax,8(%edi)
+	movl	%edx,%esi
+
+	movl	12(%ebx),%eax
+	mull	%ecx
+	addl	%esi,%eax
+	adcl	$0,%edx
+	movl	%eax,12(%edi)
+	movl	%edx,%esi
+
+	movl	16(%ebx),%eax
+	mull	%ecx
+	addl	%esi,%eax
+	adcl	$0,%edx
+	movl	%eax,16(%edi)
+	movl	%edx,%esi
+
+	movl	20(%ebx),%eax
+	mull	%ecx
+	addl	%esi,%eax
+	adcl	$0,%edx
+	movl	%eax,20(%edi)
+	movl	%edx,%esi
+
+	movl	24(%ebx),%eax
+	mull	%ecx
+	addl	%esi,%eax
+	adcl	$0,%edx
+	movl	%eax,24(%edi)
+	movl	%edx,%esi
+
+	movl	28(%ebx),%eax
+	mull	%ecx
+	addl	%esi,%eax
+	adcl	$0,%edx
+	movl	%eax,28(%edi)
+	movl	%edx,%esi
+
+	addl	$32,%ebx
+	addl	$32,%edi
+	subl	$8,%ebp
+	jz	.L013mw_finish
+	jmp	.L014mw_loop
+.L013mw_finish:
+	movl	28(%esp),%ebp
+	andl	$7,%ebp
+	jnz	.L015mw_finish2
+	jmp	.L016mw_end
+.L015mw_finish2:
+
+	movl	(%ebx),%eax
+	mull	%ecx
+	addl	%esi,%eax
+	adcl	$0,%edx
+	movl	%eax,(%edi)
+	movl	%edx,%esi
+	decl	%ebp
+	jz	.L016mw_end
+
+	movl	4(%ebx),%eax
+	mull	%ecx
+	addl	%esi,%eax
+	adcl	$0,%edx
+	movl	%eax,4(%edi)
+	movl	%edx,%esi
+	decl	%ebp
+	jz	.L016mw_end
+
+	movl	8(%ebx),%eax
+	mull	%ecx
+	addl	%esi,%eax
+	adcl	$0,%edx
+	movl	%eax,8(%edi)
+	movl	%edx,%esi
+	decl	%ebp
+	jz	.L016mw_end
+
+	movl	12(%ebx),%eax
+	mull	%ecx
+	addl	%esi,%eax
+	adcl	$0,%edx
+	movl	%eax,12(%edi)
+	movl	%edx,%esi
+	decl	%ebp
+	jz	.L016mw_end
+
+	movl	16(%ebx),%eax
+	mull	%ecx
+	addl	%esi,%eax
+	adcl	$0,%edx
+	movl	%eax,16(%edi)
+	movl	%edx,%esi
+	decl	%ebp
+	jz	.L016mw_end
+
+	movl	20(%ebx),%eax
+	mull	%ecx
+	addl	%esi,%eax
+	adcl	$0,%edx
+	movl	%eax,20(%edi)
+	movl	%edx,%esi
+	decl	%ebp
+	jz	.L016mw_end
+
+	movl	24(%ebx),%eax
+	mull	%ecx
+	addl	%esi,%eax
+	adcl	$0,%edx
+	movl	%eax,24(%edi)
+	movl	%edx,%esi
+.L016mw_end:
+	movl	%esi,%eax
+	popl	%edi
+	popl	%esi
+	popl	%ebx
+	popl	%ebp
+	ret
+.size	bn_mul_words,.-.L_bn_mul_words_begin
+.globl	bn_sqr_words
+.hidden	bn_sqr_words
+.type	bn_sqr_words,@function
+.align	16
+bn_sqr_words:
+.L_bn_sqr_words_begin:
+	call	.L017PIC_me_up
+.L017PIC_me_up:
+	popl	%eax
+	leal	OPENSSL_ia32cap_P-.L017PIC_me_up(%eax),%eax
+	btl	$26,(%eax)
+	jnc	.L018sqr_non_sse2
+	movl	4(%esp),%eax
+	movl	8(%esp),%edx
+	movl	12(%esp),%ecx
+.align	16
+.L019sqr_sse2_loop:
+	movd	(%edx),%mm0
+	pmuludq	%mm0,%mm0
+	leal	4(%edx),%edx
+	movq	%mm0,(%eax)
+	subl	$1,%ecx
+	leal	8(%eax),%eax
+	jnz	.L019sqr_sse2_loop
+	emms
+	ret
+.align	16
+.L018sqr_non_sse2:
+	pushl	%ebp
+	pushl	%ebx
+	pushl	%esi
+	pushl	%edi
+
+	movl	20(%esp),%esi
+	movl	24(%esp),%edi
+	movl	28(%esp),%ebx
+	andl	$4294967288,%ebx
+	jz	.L020sw_finish
+.L021sw_loop:
+
+	movl	(%edi),%eax
+	mull	%eax
+	movl	%eax,(%esi)
+	movl	%edx,4(%esi)
+
+	movl	4(%edi),%eax
+	mull	%eax
+	movl	%eax,8(%esi)
+	movl	%edx,12(%esi)
+
+	movl	8(%edi),%eax
+	mull	%eax
+	movl	%eax,16(%esi)
+	movl	%edx,20(%esi)
+
+	movl	12(%edi),%eax
+	mull	%eax
+	movl	%eax,24(%esi)
+	movl	%edx,28(%esi)
+
+	movl	16(%edi),%eax
+	mull	%eax
+	movl	%eax,32(%esi)
+	movl	%edx,36(%esi)
+
+	movl	20(%edi),%eax
+	mull	%eax
+	movl	%eax,40(%esi)
+	movl	%edx,44(%esi)
+
+	movl	24(%edi),%eax
+	mull	%eax
+	movl	%eax,48(%esi)
+	movl	%edx,52(%esi)
+
+	movl	28(%edi),%eax
+	mull	%eax
+	movl	%eax,56(%esi)
+	movl	%edx,60(%esi)
+
+	addl	$32,%edi
+	addl	$64,%esi
+	subl	$8,%ebx
+	jnz	.L021sw_loop
+.L020sw_finish:
+	movl	28(%esp),%ebx
+	andl	$7,%ebx
+	jz	.L022sw_end
+
+	movl	(%edi),%eax
+	mull	%eax
+	movl	%eax,(%esi)
+	decl	%ebx
+	movl	%edx,4(%esi)
+	jz	.L022sw_end
+
+	movl	4(%edi),%eax
+	mull	%eax
+	movl	%eax,8(%esi)
+	decl	%ebx
+	movl	%edx,12(%esi)
+	jz	.L022sw_end
+
+	movl	8(%edi),%eax
+	mull	%eax
+	movl	%eax,16(%esi)
+	decl	%ebx
+	movl	%edx,20(%esi)
+	jz	.L022sw_end
+
+	movl	12(%edi),%eax
+	mull	%eax
+	movl	%eax,24(%esi)
+	decl	%ebx
+	movl	%edx,28(%esi)
+	jz	.L022sw_end
+
+	movl	16(%edi),%eax
+	mull	%eax
+	movl	%eax,32(%esi)
+	decl	%ebx
+	movl	%edx,36(%esi)
+	jz	.L022sw_end
+
+	movl	20(%edi),%eax
+	mull	%eax
+	movl	%eax,40(%esi)
+	decl	%ebx
+	movl	%edx,44(%esi)
+	jz	.L022sw_end
+
+	movl	24(%edi),%eax
+	mull	%eax
+	movl	%eax,48(%esi)
+	movl	%edx,52(%esi)
+.L022sw_end:
+	popl	%edi
+	popl	%esi
+	popl	%ebx
+	popl	%ebp
+	ret
+.size	bn_sqr_words,.-.L_bn_sqr_words_begin
+.globl	bn_div_words
+.hidden	bn_div_words
+.type	bn_div_words,@function
+.align	16
+bn_div_words:
+.L_bn_div_words_begin:
+	movl	4(%esp),%edx
+	movl	8(%esp),%eax
+	movl	12(%esp),%ecx
+	divl	%ecx
+	ret
+.size	bn_div_words,.-.L_bn_div_words_begin
+.globl	bn_add_words
+.hidden	bn_add_words
+.type	bn_add_words,@function
+.align	16
+bn_add_words:
+.L_bn_add_words_begin:
+	pushl	%ebp
+	pushl	%ebx
+	pushl	%esi
+	pushl	%edi
+
+	movl	20(%esp),%ebx
+	movl	24(%esp),%esi
+	movl	28(%esp),%edi
+	movl	32(%esp),%ebp
+	xorl	%eax,%eax
+	andl	$4294967288,%ebp
+	jz	.L023aw_finish
+.L024aw_loop:
+
+	movl	(%esi),%ecx
+	movl	(%edi),%edx
+	addl	%eax,%ecx
+	movl	$0,%eax
+	adcl	%eax,%eax
+	addl	%edx,%ecx
+	adcl	$0,%eax
+	movl	%ecx,(%ebx)
+
+	movl	4(%esi),%ecx
+	movl	4(%edi),%edx
+	addl	%eax,%ecx
+	movl	$0,%eax
+	adcl	%eax,%eax
+	addl	%edx,%ecx
+	adcl	$0,%eax
+	movl	%ecx,4(%ebx)
+
+	movl	8(%esi),%ecx
+	movl	8(%edi),%edx
+	addl	%eax,%ecx
+	movl	$0,%eax
+	adcl	%eax,%eax
+	addl	%edx,%ecx
+	adcl	$0,%eax
+	movl	%ecx,8(%ebx)
+
+	movl	12(%esi),%ecx
+	movl	12(%edi),%edx
+	addl	%eax,%ecx
+	movl	$0,%eax
+	adcl	%eax,%eax
+	addl	%edx,%ecx
+	adcl	$0,%eax
+	movl	%ecx,12(%ebx)
+
+	movl	16(%esi),%ecx
+	movl	16(%edi),%edx
+	addl	%eax,%ecx
+	movl	$0,%eax
+	adcl	%eax,%eax
+	addl	%edx,%ecx
+	adcl	$0,%eax
+	movl	%ecx,16(%ebx)
+
+	movl	20(%esi),%ecx
+	movl	20(%edi),%edx
+	addl	%eax,%ecx
+	movl	$0,%eax
+	adcl	%eax,%eax
+	addl	%edx,%ecx
+	adcl	$0,%eax
+	movl	%ecx,20(%ebx)
+
+	movl	24(%esi),%ecx
+	movl	24(%edi),%edx
+	addl	%eax,%ecx
+	movl	$0,%eax
+	adcl	%eax,%eax
+	addl	%edx,%ecx
+	adcl	$0,%eax
+	movl	%ecx,24(%ebx)
+
+	movl	28(%esi),%ecx
+	movl	28(%edi),%edx
+	addl	%eax,%ecx
+	movl	$0,%eax
+	adcl	%eax,%eax
+	addl	%edx,%ecx
+	adcl	$0,%eax
+	movl	%ecx,28(%ebx)
+
+	addl	$32,%esi
+	addl	$32,%edi
+	addl	$32,%ebx
+	subl	$8,%ebp
+	jnz	.L024aw_loop
+.L023aw_finish:
+	movl	32(%esp),%ebp
+	andl	$7,%ebp
+	jz	.L025aw_end
+
+	movl	(%esi),%ecx
+	movl	(%edi),%edx
+	addl	%eax,%ecx
+	movl	$0,%eax
+	adcl	%eax,%eax
+	addl	%edx,%ecx
+	adcl	$0,%eax
+	decl	%ebp
+	movl	%ecx,(%ebx)
+	jz	.L025aw_end
+
+	movl	4(%esi),%ecx
+	movl	4(%edi),%edx
+	addl	%eax,%ecx
+	movl	$0,%eax
+	adcl	%eax,%eax
+	addl	%edx,%ecx
+	adcl	$0,%eax
+	decl	%ebp
+	movl	%ecx,4(%ebx)
+	jz	.L025aw_end
+
+	movl	8(%esi),%ecx
+	movl	8(%edi),%edx
+	addl	%eax,%ecx
+	movl	$0,%eax
+	adcl	%eax,%eax
+	addl	%edx,%ecx
+	adcl	$0,%eax
+	decl	%ebp
+	movl	%ecx,8(%ebx)
+	jz	.L025aw_end
+
+	movl	12(%esi),%ecx
+	movl	12(%edi),%edx
+	addl	%eax,%ecx
+	movl	$0,%eax
+	adcl	%eax,%eax
+	addl	%edx,%ecx
+	adcl	$0,%eax
+	decl	%ebp
+	movl	%ecx,12(%ebx)
+	jz	.L025aw_end
+
+	movl	16(%esi),%ecx
+	movl	16(%edi),%edx
+	addl	%eax,%ecx
+	movl	$0,%eax
+	adcl	%eax,%eax
+	addl	%edx,%ecx
+	adcl	$0,%eax
+	decl	%ebp
+	movl	%ecx,16(%ebx)
+	jz	.L025aw_end
+
+	movl	20(%esi),%ecx
+	movl	20(%edi),%edx
+	addl	%eax,%ecx
+	movl	$0,%eax
+	adcl	%eax,%eax
+	addl	%edx,%ecx
+	adcl	$0,%eax
+	decl	%ebp
+	movl	%ecx,20(%ebx)
+	jz	.L025aw_end
+
+	movl	24(%esi),%ecx
+	movl	24(%edi),%edx
+	addl	%eax,%ecx
+	movl	$0,%eax
+	adcl	%eax,%eax
+	addl	%edx,%ecx
+	adcl	$0,%eax
+	movl	%ecx,24(%ebx)
+.L025aw_end:
+	popl	%edi
+	popl	%esi
+	popl	%ebx
+	popl	%ebp
+	ret
+.size	bn_add_words,.-.L_bn_add_words_begin
+.globl	bn_sub_words
+.hidden	bn_sub_words
+.type	bn_sub_words,@function
+.align	16
+bn_sub_words:
+.L_bn_sub_words_begin:
+	pushl	%ebp
+	pushl	%ebx
+	pushl	%esi
+	pushl	%edi
+
+	movl	20(%esp),%ebx
+	movl	24(%esp),%esi
+	movl	28(%esp),%edi
+	movl	32(%esp),%ebp
+	xorl	%eax,%eax
+	andl	$4294967288,%ebp
+	jz	.L026aw_finish
+.L027aw_loop:
+
+	movl	(%esi),%ecx
+	movl	(%edi),%edx
+	subl	%eax,%ecx
+	movl	$0,%eax
+	adcl	%eax,%eax
+	subl	%edx,%ecx
+	adcl	$0,%eax
+	movl	%ecx,(%ebx)
+
+	movl	4(%esi),%ecx
+	movl	4(%edi),%edx
+	subl	%eax,%ecx
+	movl	$0,%eax
+	adcl	%eax,%eax
+	subl	%edx,%ecx
+	adcl	$0,%eax
+	movl	%ecx,4(%ebx)
+
+	movl	8(%esi),%ecx
+	movl	8(%edi),%edx
+	subl	%eax,%ecx
+	movl	$0,%eax
+	adcl	%eax,%eax
+	subl	%edx,%ecx
+	adcl	$0,%eax
+	movl	%ecx,8(%ebx)
+
+	movl	12(%esi),%ecx
+	movl	12(%edi),%edx
+	subl	%eax,%ecx
+	movl	$0,%eax
+	adcl	%eax,%eax
+	subl	%edx,%ecx
+	adcl	$0,%eax
+	movl	%ecx,12(%ebx)
+
+	movl	16(%esi),%ecx
+	movl	16(%edi),%edx
+	subl	%eax,%ecx
+	movl	$0,%eax
+	adcl	%eax,%eax
+	subl	%edx,%ecx
+	adcl	$0,%eax
+	movl	%ecx,16(%ebx)
+
+	movl	20(%esi),%ecx
+	movl	20(%edi),%edx
+	subl	%eax,%ecx
+	movl	$0,%eax
+	adcl	%eax,%eax
+	subl	%edx,%ecx
+	adcl	$0,%eax
+	movl	%ecx,20(%ebx)
+
+	movl	24(%esi),%ecx
+	movl	24(%edi),%edx
+	subl	%eax,%ecx
+	movl	$0,%eax
+	adcl	%eax,%eax
+	subl	%edx,%ecx
+	adcl	$0,%eax
+	movl	%ecx,24(%ebx)
+
+	movl	28(%esi),%ecx
+	movl	28(%edi),%edx
+	subl	%eax,%ecx
+	movl	$0,%eax
+	adcl	%eax,%eax
+	subl	%edx,%ecx
+	adcl	$0,%eax
+	movl	%ecx,28(%ebx)
+
+	addl	$32,%esi
+	addl	$32,%edi
+	addl	$32,%ebx
+	subl	$8,%ebp
+	jnz	.L027aw_loop
+.L026aw_finish:
+	movl	32(%esp),%ebp
+	andl	$7,%ebp
+	jz	.L028aw_end
+
+	movl	(%esi),%ecx
+	movl	(%edi),%edx
+	subl	%eax,%ecx
+	movl	$0,%eax
+	adcl	%eax,%eax
+	subl	%edx,%ecx
+	adcl	$0,%eax
+	decl	%ebp
+	movl	%ecx,(%ebx)
+	jz	.L028aw_end
+
+	movl	4(%esi),%ecx
+	movl	4(%edi),%edx
+	subl	%eax,%ecx
+	movl	$0,%eax
+	adcl	%eax,%eax
+	subl	%edx,%ecx
+	adcl	$0,%eax
+	decl	%ebp
+	movl	%ecx,4(%ebx)
+	jz	.L028aw_end
+
+	movl	8(%esi),%ecx
+	movl	8(%edi),%edx
+	subl	%eax,%ecx
+	movl	$0,%eax
+	adcl	%eax,%eax
+	subl	%edx,%ecx
+	adcl	$0,%eax
+	decl	%ebp
+	movl	%ecx,8(%ebx)
+	jz	.L028aw_end
+
+	movl	12(%esi),%ecx
+	movl	12(%edi),%edx
+	subl	%eax,%ecx
+	movl	$0,%eax
+	adcl	%eax,%eax
+	subl	%edx,%ecx
+	adcl	$0,%eax
+	decl	%ebp
+	movl	%ecx,12(%ebx)
+	jz	.L028aw_end
+
+	movl	16(%esi),%ecx
+	movl	16(%edi),%edx
+	subl	%eax,%ecx
+	movl	$0,%eax
+	adcl	%eax,%eax
+	subl	%edx,%ecx
+	adcl	$0,%eax
+	decl	%ebp
+	movl	%ecx,16(%ebx)
+	jz	.L028aw_end
+
+	movl	20(%esi),%ecx
+	movl	20(%edi),%edx
+	subl	%eax,%ecx
+	movl	$0,%eax
+	adcl	%eax,%eax
+	subl	%edx,%ecx
+	adcl	$0,%eax
+	decl	%ebp
+	movl	%ecx,20(%ebx)
+	jz	.L028aw_end
+
+	movl	24(%esi),%ecx
+	movl	24(%edi),%edx
+	subl	%eax,%ecx
+	movl	$0,%eax
+	adcl	%eax,%eax
+	subl	%edx,%ecx
+	adcl	$0,%eax
+	movl	%ecx,24(%ebx)
+.L028aw_end:
+	popl	%edi
+	popl	%esi
+	popl	%ebx
+	popl	%ebp
+	ret
+.size	bn_sub_words,.-.L_bn_sub_words_begin
+#endif
+.section	.note.GNU-stack,"",@progbits
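
bn-586.S above is the 32-bit bignum word-primitive back end. Reading the stack loads, bn_mul_add_words(rp, ap, num, w) accumulates rp[i] += ap[i]*w with a propagated carry word (the SSE2 path does the same with pmuludq/paddq), and bn_add_words/bn_sub_words are the carry- and borrow-propagating vector add and subtract. A plain-C model of those contracts, not BoringSSL's actual C fallback, with 64-bit intermediates standing in for the mull/adcl chains and illustrative helper names:

#include <stddef.h>
#include <stdint.h>

/* rp[i] += ap[i] * w, carrying between words; the final carry is returned. */
static uint32_t mul_add_words(uint32_t *rp, const uint32_t *ap,
                              size_t num, uint32_t w) {
    uint64_t carry = 0;
    for (size_t i = 0; i < num; i++) {
        uint64_t t = (uint64_t)ap[i] * w + rp[i] + carry;
        rp[i] = (uint32_t)t;   /* low 32 bits stay in the result word */
        carry = t >> 32;       /* high 32 bits ride into the next word */
    }
    return (uint32_t)carry;
}

/* bn_add_words-style contract: rp = ap + bp, returning the final carry bit. */
static uint32_t add_words(uint32_t *rp, const uint32_t *ap,
                          const uint32_t *bp, size_t num) {
    uint64_t carry = 0;
    for (size_t i = 0; i < num; i++) {
        uint64_t t = (uint64_t)ap[i] + bp[i] + carry;
        rp[i] = (uint32_t)t;
        carry = t >> 32;
    }
    return (uint32_t)carry;
}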
--- /dev/null
+++ b/third_party/boringssl/linux-x86/crypto/fipsmodule/co-586.S
@@ -1,0 +1,1266 @@
+// This file is generated from a similarly-named Perl script in the BoringSSL
+// source tree. Do not edit by hand.
+
+#if defined(__i386__)
+#if defined(BORINGSSL_PREFIX)
+#include <boringssl_prefix_symbols_asm.h>
+#endif
+.text
+.globl	bn_mul_comba8
+.hidden	bn_mul_comba8
+.type	bn_mul_comba8,@function
+.align	16
+bn_mul_comba8:
+.L_bn_mul_comba8_begin:
+	pushl	%esi
+	movl	12(%esp),%esi
+	pushl	%edi
+	movl	20(%esp),%edi
+	pushl	%ebp
+	pushl	%ebx
+	xorl	%ebx,%ebx
+	movl	(%esi),%eax
+	xorl	%ecx,%ecx
+	movl	(%edi),%edx
+
+	xorl	%ebp,%ebp
+
+	mull	%edx
+	addl	%eax,%ebx
+	movl	20(%esp),%eax
+	adcl	%edx,%ecx
+	movl	(%edi),%edx
+	adcl	$0,%ebp
+	movl	%ebx,(%eax)
+	movl	4(%esi),%eax
+
+
+	xorl	%ebx,%ebx
+
+	mull	%edx
+	addl	%eax,%ecx
+	movl	(%esi),%eax
+	adcl	%edx,%ebp
+	movl	4(%edi),%edx
+	adcl	$0,%ebx
+
+	mull	%edx
+	addl	%eax,%ecx
+	movl	20(%esp),%eax
+	adcl	%edx,%ebp
+	movl	(%edi),%edx
+	adcl	$0,%ebx
+	movl	%ecx,4(%eax)
+	movl	8(%esi),%eax
+
+
+	xorl	%ecx,%ecx
+
+	mull	%edx
+	addl	%eax,%ebp
+	movl	4(%esi),%eax
+	adcl	%edx,%ebx
+	movl	4(%edi),%edx
+	adcl	$0,%ecx
+
+	mull	%edx
+	addl	%eax,%ebp
+	movl	(%esi),%eax
+	adcl	%edx,%ebx
+	movl	8(%edi),%edx
+	adcl	$0,%ecx
+
+	mull	%edx
+	addl	%eax,%ebp
+	movl	20(%esp),%eax
+	adcl	%edx,%ebx
+	movl	(%edi),%edx
+	adcl	$0,%ecx
+	movl	%ebp,8(%eax)
+	movl	12(%esi),%eax
+
+
+	xorl	%ebp,%ebp
+
+	mull	%edx
+	addl	%eax,%ebx
+	movl	8(%esi),%eax
+	adcl	%edx,%ecx
+	movl	4(%edi),%edx
+	adcl	$0,%ebp
+
+	mull	%edx
+	addl	%eax,%ebx
+	movl	4(%esi),%eax
+	adcl	%edx,%ecx
+	movl	8(%edi),%edx
+	adcl	$0,%ebp
+
+	mull	%edx
+	addl	%eax,%ebx
+	movl	(%esi),%eax
+	adcl	%edx,%ecx
+	movl	12(%edi),%edx
+	adcl	$0,%ebp
+
+	mull	%edx
+	addl	%eax,%ebx
+	movl	20(%esp),%eax
+	adcl	%edx,%ecx
+	movl	(%edi),%edx
+	adcl	$0,%ebp
+	movl	%ebx,12(%eax)
+	movl	16(%esi),%eax
+
+
+	xorl	%ebx,%ebx
+
+	mull	%edx
+	addl	%eax,%ecx
+	movl	12(%esi),%eax
+	adcl	%edx,%ebp
+	movl	4(%edi),%edx
+	adcl	$0,%ebx
+
+	mull	%edx
+	addl	%eax,%ecx
+	movl	8(%esi),%eax
+	adcl	%edx,%ebp
+	movl	8(%edi),%edx
+	adcl	$0,%ebx
+
+	mull	%edx
+	addl	%eax,%ecx
+	movl	4(%esi),%eax
+	adcl	%edx,%ebp
+	movl	12(%edi),%edx
+	adcl	$0,%ebx
+
+	mull	%edx
+	addl	%eax,%ecx
+	movl	(%esi),%eax
+	adcl	%edx,%ebp
+	movl	16(%edi),%edx
+	adcl	$0,%ebx
+
+	mull	%edx
+	addl	%eax,%ecx
+	movl	20(%esp),%eax
+	adcl	%edx,%ebp
+	movl	(%edi),%edx
+	adcl	$0,%ebx
+	movl	%ecx,16(%eax)
+	movl	20(%esi),%eax
+
+
+	xorl	%ecx,%ecx
+
+	mull	%edx
+	addl	%eax,%ebp
+	movl	16(%esi),%eax
+	adcl	%edx,%ebx
+	movl	4(%edi),%edx
+	adcl	$0,%ecx
+
+	mull	%edx
+	addl	%eax,%ebp
+	movl	12(%esi),%eax
+	adcl	%edx,%ebx
+	movl	8(%edi),%edx
+	adcl	$0,%ecx
+
+	mull	%edx
+	addl	%eax,%ebp
+	movl	8(%esi),%eax
+	adcl	%edx,%ebx
+	movl	12(%edi),%edx
+	adcl	$0,%ecx
+
+	mull	%edx
+	addl	%eax,%ebp
+	movl	4(%esi),%eax
+	adcl	%edx,%ebx
+	movl	16(%edi),%edx
+	adcl	$0,%ecx
+
+	mull	%edx
+	addl	%eax,%ebp
+	movl	(%esi),%eax
+	adcl	%edx,%ebx
+	movl	20(%edi),%edx
+	adcl	$0,%ecx
+
+	mull	%edx
+	addl	%eax,%ebp
+	movl	20(%esp),%eax
+	adcl	%edx,%ebx
+	movl	(%edi),%edx
+	adcl	$0,%ecx
+	movl	%ebp,20(%eax)
+	movl	24(%esi),%eax
+
+
+	xorl	%ebp,%ebp
+
+	mull	%edx
+	addl	%eax,%ebx
+	movl	20(%esi),%eax
+	adcl	%edx,%ecx
+	movl	4(%edi),%edx
+	adcl	$0,%ebp
+
+	mull	%edx
+	addl	%eax,%ebx
+	movl	16(%esi),%eax
+	adcl	%edx,%ecx
+	movl	8(%edi),%edx
+	adcl	$0,%ebp
+
+	mull	%edx
+	addl	%eax,%ebx
+	movl	12(%esi),%eax
+	adcl	%edx,%ecx
+	movl	12(%edi),%edx
+	adcl	$0,%ebp
+
+	mull	%edx
+	addl	%eax,%ebx
+	movl	8(%esi),%eax
+	adcl	%edx,%ecx
+	movl	16(%edi),%edx
+	adcl	$0,%ebp
+
+	mull	%edx
+	addl	%eax,%ebx
+	movl	4(%esi),%eax
+	adcl	%edx,%ecx
+	movl	20(%edi),%edx
+	adcl	$0,%ebp
+
+	mull	%edx
+	addl	%eax,%ebx
+	movl	(%esi),%eax
+	adcl	%edx,%ecx
+	movl	24(%edi),%edx
+	adcl	$0,%ebp
+
+	mull	%edx
+	addl	%eax,%ebx
+	movl	20(%esp),%eax
+	adcl	%edx,%ecx
+	movl	(%edi),%edx
+	adcl	$0,%ebp
+	movl	%ebx,24(%eax)
+	movl	28(%esi),%eax
+
+
+	xorl	%ebx,%ebx
+
+	mull	%edx
+	addl	%eax,%ecx
+	movl	24(%esi),%eax
+	adcl	%edx,%ebp
+	movl	4(%edi),%edx
+	adcl	$0,%ebx
+
+	mull	%edx
+	addl	%eax,%ecx
+	movl	20(%esi),%eax
+	adcl	%edx,%ebp
+	movl	8(%edi),%edx
+	adcl	$0,%ebx
+
+	mull	%edx
+	addl	%eax,%ecx
+	movl	16(%esi),%eax
+	adcl	%edx,%ebp
+	movl	12(%edi),%edx
+	adcl	$0,%ebx
+
+	mull	%edx
+	addl	%eax,%ecx
+	movl	12(%esi),%eax
+	adcl	%edx,%ebp
+	movl	16(%edi),%edx
+	adcl	$0,%ebx
+
+	mull	%edx
+	addl	%eax,%ecx
+	movl	8(%esi),%eax
+	adcl	%edx,%ebp
+	movl	20(%edi),%edx
+	adcl	$0,%ebx
+
+	mull	%edx
+	addl	%eax,%ecx
+	movl	4(%esi),%eax
+	adcl	%edx,%ebp
+	movl	24(%edi),%edx
+	adcl	$0,%ebx
+
+	mull	%edx
+	addl	%eax,%ecx
+	movl	(%esi),%eax
+	adcl	%edx,%ebp
+	movl	28(%edi),%edx
+	adcl	$0,%ebx
+
+	mull	%edx
+	addl	%eax,%ecx
+	movl	20(%esp),%eax
+	adcl	%edx,%ebp
+	movl	4(%edi),%edx
+	adcl	$0,%ebx
+	movl	%ecx,28(%eax)
+	movl	28(%esi),%eax
+
+
+	xorl	%ecx,%ecx
+
+	mull	%edx
+	addl	%eax,%ebp
+	movl	24(%esi),%eax
+	adcl	%edx,%ebx
+	movl	8(%edi),%edx
+	adcl	$0,%ecx
+
+	mull	%edx
+	addl	%eax,%ebp
+	movl	20(%esi),%eax
+	adcl	%edx,%ebx
+	movl	12(%edi),%edx
+	adcl	$0,%ecx
+
+	mull	%edx
+	addl	%eax,%ebp
+	movl	16(%esi),%eax
+	adcl	%edx,%ebx
+	movl	16(%edi),%edx
+	adcl	$0,%ecx
+
+	mull	%edx
+	addl	%eax,%ebp
+	movl	12(%esi),%eax
+	adcl	%edx,%ebx
+	movl	20(%edi),%edx
+	adcl	$0,%ecx
+
+	mull	%edx
+	addl	%eax,%ebp
+	movl	8(%esi),%eax
+	adcl	%edx,%ebx
+	movl	24(%edi),%edx
+	adcl	$0,%ecx
+
+	mull	%edx
+	addl	%eax,%ebp
+	movl	4(%esi),%eax
+	adcl	%edx,%ebx
+	movl	28(%edi),%edx
+	adcl	$0,%ecx
+
+	mull	%edx
+	addl	%eax,%ebp
+	movl	20(%esp),%eax
+	adcl	%edx,%ebx
+	movl	8(%edi),%edx
+	adcl	$0,%ecx
+	movl	%ebp,32(%eax)
+	movl	28(%esi),%eax
+
+
+	xorl	%ebp,%ebp
+
+	mull	%edx
+	addl	%eax,%ebx
+	movl	24(%esi),%eax
+	adcl	%edx,%ecx
+	movl	12(%edi),%edx
+	adcl	$0,%ebp
+
+	mull	%edx
+	addl	%eax,%ebx
+	movl	20(%esi),%eax
+	adcl	%edx,%ecx
+	movl	16(%edi),%edx
+	adcl	$0,%ebp
+
+	mull	%edx
+	addl	%eax,%ebx
+	movl	16(%esi),%eax
+	adcl	%edx,%ecx
+	movl	20(%edi),%edx
+	adcl	$0,%ebp
+
+	mull	%edx
+	addl	%eax,%ebx
+	movl	12(%esi),%eax
+	adcl	%edx,%ecx
+	movl	24(%edi),%edx
+	adcl	$0,%ebp
+
+	mull	%edx
+	addl	%eax,%ebx
+	movl	8(%esi),%eax
+	adcl	%edx,%ecx
+	movl	28(%edi),%edx
+	adcl	$0,%ebp
+
+	mull	%edx
+	addl	%eax,%ebx
+	movl	20(%esp),%eax
+	adcl	%edx,%ecx
+	movl	12(%edi),%edx
+	adcl	$0,%ebp
+	movl	%ebx,36(%eax)
+	movl	28(%esi),%eax
+
+
+	xorl	%ebx,%ebx
+
+	mull	%edx
+	addl	%eax,%ecx
+	movl	24(%esi),%eax
+	adcl	%edx,%ebp
+	movl	16(%edi),%edx
+	adcl	$0,%ebx
+
+	mull	%edx
+	addl	%eax,%ecx
+	movl	20(%esi),%eax
+	adcl	%edx,%ebp
+	movl	20(%edi),%edx
+	adcl	$0,%ebx
+
+	mull	%edx
+	addl	%eax,%ecx
+	movl	16(%esi),%eax
+	adcl	%edx,%ebp
+	movl	24(%edi),%edx
+	adcl	$0,%ebx
+
+	mull	%edx
+	addl	%eax,%ecx
+	movl	12(%esi),%eax
+	adcl	%edx,%ebp
+	movl	28(%edi),%edx
+	adcl	$0,%ebx
+
+	mull	%edx
+	addl	%eax,%ecx
+	movl	20(%esp),%eax
+	adcl	%edx,%ebp
+	movl	16(%edi),%edx
+	adcl	$0,%ebx
+	movl	%ecx,40(%eax)
+	movl	28(%esi),%eax
+
+
+	xorl	%ecx,%ecx
+
+	mull	%edx
+	addl	%eax,%ebp
+	movl	24(%esi),%eax
+	adcl	%edx,%ebx
+	movl	20(%edi),%edx
+	adcl	$0,%ecx
+
+	mull	%edx
+	addl	%eax,%ebp
+	movl	20(%esi),%eax
+	adcl	%edx,%ebx
+	movl	24(%edi),%edx
+	adcl	$0,%ecx
+
+	mull	%edx
+	addl	%eax,%ebp
+	movl	16(%esi),%eax
+	adcl	%edx,%ebx
+	movl	28(%edi),%edx
+	adcl	$0,%ecx
+
+	mull	%edx
+	addl	%eax,%ebp
+	movl	20(%esp),%eax
+	adcl	%edx,%ebx
+	movl	20(%edi),%edx
+	adcl	$0,%ecx
+	movl	%ebp,44(%eax)
+	movl	28(%esi),%eax
+
+
+	xorl	%ebp,%ebp
+
+	mull	%edx
+	addl	%eax,%ebx
+	movl	24(%esi),%eax
+	adcl	%edx,%ecx
+	movl	24(%edi),%edx
+	adcl	$0,%ebp
+
+	mull	%edx
+	addl	%eax,%ebx
+	movl	20(%esi),%eax
+	adcl	%edx,%ecx
+	movl	28(%edi),%edx
+	adcl	$0,%ebp
+
+	mull	%edx
+	addl	%eax,%ebx
+	movl	20(%esp),%eax
+	adcl	%edx,%ecx
+	movl	24(%edi),%edx
+	adcl	$0,%ebp
+	movl	%ebx,48(%eax)
+	movl	28(%esi),%eax
+
+
+	xorl	%ebx,%ebx
+
+	mull	%edx
+	addl	%eax,%ecx
+	movl	24(%esi),%eax
+	adcl	%edx,%ebp
+	movl	28(%edi),%edx
+	adcl	$0,%ebx
+
+	mull	%edx
+	addl	%eax,%ecx
+	movl	20(%esp),%eax
+	adcl	%edx,%ebp
+	movl	28(%edi),%edx
+	adcl	$0,%ebx
+	movl	%ecx,52(%eax)
+	movl	28(%esi),%eax
+
+
+	xorl	%ecx,%ecx
+
+	mull	%edx
+	addl	%eax,%ebp
+	movl	20(%esp),%eax
+	adcl	%edx,%ebx
+	adcl	$0,%ecx
+	movl	%ebp,56(%eax)
+
+
+	movl	%ebx,60(%eax)
+	popl	%ebx
+	popl	%ebp
+	popl	%edi
+	popl	%esi
+	ret
+.size	bn_mul_comba8,.-.L_bn_mul_comba8_begin
+.globl	bn_mul_comba4
+.hidden	bn_mul_comba4
+.type	bn_mul_comba4,@function
+.align	16
+bn_mul_comba4:
+.L_bn_mul_comba4_begin:
+	pushl	%esi
+	movl	12(%esp),%esi
+	pushl	%edi
+	movl	20(%esp),%edi
+	pushl	%ebp
+	pushl	%ebx
+	xorl	%ebx,%ebx
+	movl	(%esi),%eax
+	xorl	%ecx,%ecx
+	movl	(%edi),%edx
+
+	xorl	%ebp,%ebp
+
+	mull	%edx
+	addl	%eax,%ebx
+	movl	20(%esp),%eax
+	adcl	%edx,%ecx
+	movl	(%edi),%edx
+	adcl	$0,%ebp
+	movl	%ebx,(%eax)
+	movl	4(%esi),%eax
+
+
+	xorl	%ebx,%ebx
+
+	mull	%edx
+	addl	%eax,%ecx
+	movl	(%esi),%eax
+	adcl	%edx,%ebp
+	movl	4(%edi),%edx
+	adcl	$0,%ebx
+
+	mull	%edx
+	addl	%eax,%ecx
+	movl	20(%esp),%eax
+	adcl	%edx,%ebp
+	movl	(%edi),%edx
+	adcl	$0,%ebx
+	movl	%ecx,4(%eax)
+	movl	8(%esi),%eax
+
+
+	xorl	%ecx,%ecx
+
+	mull	%edx
+	addl	%eax,%ebp
+	movl	4(%esi),%eax
+	adcl	%edx,%ebx
+	movl	4(%edi),%edx
+	adcl	$0,%ecx
+
+	mull	%edx
+	addl	%eax,%ebp
+	movl	(%esi),%eax
+	adcl	%edx,%ebx
+	movl	8(%edi),%edx
+	adcl	$0,%ecx
+
+	mull	%edx
+	addl	%eax,%ebp
+	movl	20(%esp),%eax
+	adcl	%edx,%ebx
+	movl	(%edi),%edx
+	adcl	$0,%ecx
+	movl	%ebp,8(%eax)
+	movl	12(%esi),%eax
+
+
+	xorl	%ebp,%ebp
+
+	mull	%edx
+	addl	%eax,%ebx
+	movl	8(%esi),%eax
+	adcl	%edx,%ecx
+	movl	4(%edi),%edx
+	adcl	$0,%ebp
+
+	mull	%edx
+	addl	%eax,%ebx
+	movl	4(%esi),%eax
+	adcl	%edx,%ecx
+	movl	8(%edi),%edx
+	adcl	$0,%ebp
+
+	mull	%edx
+	addl	%eax,%ebx
+	movl	(%esi),%eax
+	adcl	%edx,%ecx
+	movl	12(%edi),%edx
+	adcl	$0,%ebp
+
+	mull	%edx
+	addl	%eax,%ebx
+	movl	20(%esp),%eax
+	adcl	%edx,%ecx
+	movl	4(%edi),%edx
+	adcl	$0,%ebp
+	movl	%ebx,12(%eax)
+	movl	12(%esi),%eax
+
+
+	xorl	%ebx,%ebx
+
+	mull	%edx
+	addl	%eax,%ecx
+	movl	8(%esi),%eax
+	adcl	%edx,%ebp
+	movl	8(%edi),%edx
+	adcl	$0,%ebx
+
+	mull	%edx
+	addl	%eax,%ecx
+	movl	4(%esi),%eax
+	adcl	%edx,%ebp
+	movl	12(%edi),%edx
+	adcl	$0,%ebx
+
+	mull	%edx
+	addl	%eax,%ecx
+	movl	20(%esp),%eax
+	adcl	%edx,%ebp
+	movl	8(%edi),%edx
+	adcl	$0,%ebx
+	movl	%ecx,16(%eax)
+	movl	12(%esi),%eax
+
+
+	xorl	%ecx,%ecx
+
+	mull	%edx
+	addl	%eax,%ebp
+	movl	8(%esi),%eax
+	adcl	%edx,%ebx
+	movl	12(%edi),%edx
+	adcl	$0,%ecx
+
+	mull	%edx
+	addl	%eax,%ebp
+	movl	20(%esp),%eax
+	adcl	%edx,%ebx
+	movl	12(%edi),%edx
+	adcl	$0,%ecx
+	movl	%ebp,20(%eax)
+	movl	12(%esi),%eax
+
+
+	xorl	%ebp,%ebp
+
+	mull	%edx
+	addl	%eax,%ebx
+	movl	20(%esp),%eax
+	adcl	%edx,%ecx
+	adcl	$0,%ebp
+	movl	%ebx,24(%eax)
+
+
+	movl	%ecx,28(%eax)
+	popl	%ebx
+	popl	%ebp
+	popl	%edi
+	popl	%esi
+	ret
+.size	bn_mul_comba4,.-.L_bn_mul_comba4_begin
+.globl	bn_sqr_comba8
+.hidden	bn_sqr_comba8
+.type	bn_sqr_comba8,@function
+.align	16
+bn_sqr_comba8:
+.L_bn_sqr_comba8_begin:
+	pushl	%esi
+	pushl	%edi
+	pushl	%ebp
+	pushl	%ebx
+	movl	20(%esp),%edi
+	movl	24(%esp),%esi
+	xorl	%ebx,%ebx
+	xorl	%ecx,%ecx
+	movl	(%esi),%eax
+
+	xorl	%ebp,%ebp
+
+	mull	%eax
+	addl	%eax,%ebx
+	adcl	%edx,%ecx
+	movl	(%esi),%edx
+	adcl	$0,%ebp
+	movl	%ebx,(%edi)
+	movl	4(%esi),%eax
+
+
+	xorl	%ebx,%ebx
+
+	mull	%edx
+	addl	%eax,%eax
+	adcl	%edx,%edx
+	adcl	$0,%ebx
+	addl	%eax,%ecx
+	adcl	%edx,%ebp
+	movl	8(%esi),%eax
+	adcl	$0,%ebx
+	movl	%ecx,4(%edi)
+	movl	(%esi),%edx
+
+
+	xorl	%ecx,%ecx
+
+	mull	%edx
+	addl	%eax,%eax
+	adcl	%edx,%edx
+	adcl	$0,%ecx
+	addl	%eax,%ebp
+	adcl	%edx,%ebx
+	movl	4(%esi),%eax
+	adcl	$0,%ecx
+
+	mull	%eax
+	addl	%eax,%ebp
+	adcl	%edx,%ebx
+	movl	(%esi),%edx
+	adcl	$0,%ecx
+	movl	%ebp,8(%edi)
+	movl	12(%esi),%eax
+
+
+	xorl	%ebp,%ebp
+
+	mull	%edx
+	addl	%eax,%eax
+	adcl	%edx,%edx
+	adcl	$0,%ebp
+	addl	%eax,%ebx
+	adcl	%edx,%ecx
+	movl	8(%esi),%eax
+	adcl	$0,%ebp
+	movl	4(%esi),%edx
+
+	mull	%edx
+	addl	%eax,%eax
+	adcl	%edx,%edx
+	adcl	$0,%ebp
+	addl	%eax,%ebx
+	adcl	%edx,%ecx
+	movl	16(%esi),%eax
+	adcl	$0,%ebp
+	movl	%ebx,12(%edi)
+	movl	(%esi),%edx
+
+
+	xorl	%ebx,%ebx
+
+	mull	%edx
+	addl	%eax,%eax
+	adcl	%edx,%edx
+	adcl	$0,%ebx
+	addl	%eax,%ecx
+	adcl	%edx,%ebp
+	movl	12(%esi),%eax
+	adcl	$0,%ebx
+	movl	4(%esi),%edx
+
+	mull	%edx
+	addl	%eax,%eax
+	adcl	%edx,%edx
+	adcl	$0,%ebx
+	addl	%eax,%ecx
+	adcl	%edx,%ebp
+	movl	8(%esi),%eax
+	adcl	$0,%ebx
+
+	mull	%eax
+	addl	%eax,%ecx
+	adcl	%edx,%ebp
+	movl	(%esi),%edx
+	adcl	$0,%ebx
+	movl	%ecx,16(%edi)
+	movl	20(%esi),%eax
+
+
+	xorl	%ecx,%ecx
+
+	mull	%edx
+	addl	%eax,%eax
+	adcl	%edx,%edx
+	adcl	$0,%ecx
+	addl	%eax,%ebp
+	adcl	%edx,%ebx
+	movl	16(%esi),%eax
+	adcl	$0,%ecx
+	movl	4(%esi),%edx
+
+	mull	%edx
+	addl	%eax,%eax
+	adcl	%edx,%edx
+	adcl	$0,%ecx
+	addl	%eax,%ebp
+	adcl	%edx,%ebx
+	movl	12(%esi),%eax
+	adcl	$0,%ecx
+	movl	8(%esi),%edx
+
+	mull	%edx
+	addl	%eax,%eax
+	adcl	%edx,%edx
+	adcl	$0,%ecx
+	addl	%eax,%ebp
+	adcl	%edx,%ebx
+	movl	24(%esi),%eax
+	adcl	$0,%ecx
+	movl	%ebp,20(%edi)
+	movl	(%esi),%edx
+
+
+	xorl	%ebp,%ebp
+
+	mull	%edx
+	addl	%eax,%eax
+	adcl	%edx,%edx
+	adcl	$0,%ebp
+	addl	%eax,%ebx
+	adcl	%edx,%ecx
+	movl	20(%esi),%eax
+	adcl	$0,%ebp
+	movl	4(%esi),%edx
+
+	mull	%edx
+	addl	%eax,%eax
+	adcl	%edx,%edx
+	adcl	$0,%ebp
+	addl	%eax,%ebx
+	adcl	%edx,%ecx
+	movl	16(%esi),%eax
+	adcl	$0,%ebp
+	movl	8(%esi),%edx
+
+	mull	%edx
+	addl	%eax,%eax
+	adcl	%edx,%edx
+	adcl	$0,%ebp
+	addl	%eax,%ebx
+	adcl	%edx,%ecx
+	movl	12(%esi),%eax
+	adcl	$0,%ebp
+
+	mull	%eax
+	addl	%eax,%ebx
+	adcl	%edx,%ecx
+	movl	(%esi),%edx
+	adcl	$0,%ebp
+	movl	%ebx,24(%edi)
+	movl	28(%esi),%eax
+
+
+	xorl	%ebx,%ebx
+
+	mull	%edx
+	addl	%eax,%eax
+	adcl	%edx,%edx
+	adcl	$0,%ebx
+	addl	%eax,%ecx
+	adcl	%edx,%ebp
+	movl	24(%esi),%eax
+	adcl	$0,%ebx
+	movl	4(%esi),%edx
+
+	mull	%edx
+	addl	%eax,%eax
+	adcl	%edx,%edx
+	adcl	$0,%ebx
+	addl	%eax,%ecx
+	adcl	%edx,%ebp
+	movl	20(%esi),%eax
+	adcl	$0,%ebx
+	movl	8(%esi),%edx
+
+	mull	%edx
+	addl	%eax,%eax
+	adcl	%edx,%edx
+	adcl	$0,%ebx
+	addl	%eax,%ecx
+	adcl	%edx,%ebp
+	movl	16(%esi),%eax
+	adcl	$0,%ebx
+	movl	12(%esi),%edx
+
+	mull	%edx
+	addl	%eax,%eax
+	adcl	%edx,%edx
+	adcl	$0,%ebx
+	addl	%eax,%ecx
+	adcl	%edx,%ebp
+	movl	28(%esi),%eax
+	adcl	$0,%ebx
+	movl	%ecx,28(%edi)
+	movl	4(%esi),%edx
+
+
+	xorl	%ecx,%ecx
+
+	mull	%edx
+	addl	%eax,%eax
+	adcl	%edx,%edx
+	adcl	$0,%ecx
+	addl	%eax,%ebp
+	adcl	%edx,%ebx
+	movl	24(%esi),%eax
+	adcl	$0,%ecx
+	movl	8(%esi),%edx
+
+	mull	%edx
+	addl	%eax,%eax
+	adcl	%edx,%edx
+	adcl	$0,%ecx
+	addl	%eax,%ebp
+	adcl	%edx,%ebx
+	movl	20(%esi),%eax
+	adcl	$0,%ecx
+	movl	12(%esi),%edx
+
+	mull	%edx
+	addl	%eax,%eax
+	adcl	%edx,%edx
+	adcl	$0,%ecx
+	addl	%eax,%ebp
+	adcl	%edx,%ebx
+	movl	16(%esi),%eax
+	adcl	$0,%ecx
+
+	mull	%eax
+	addl	%eax,%ebp
+	adcl	%edx,%ebx
+	movl	8(%esi),%edx
+	adcl	$0,%ecx
+	movl	%ebp,32(%edi)
+	movl	28(%esi),%eax
+
+
+	xorl	%ebp,%ebp
+
+	mull	%edx
+	addl	%eax,%eax
+	adcl	%edx,%edx
+	adcl	$0,%ebp
+	addl	%eax,%ebx
+	adcl	%edx,%ecx
+	movl	24(%esi),%eax
+	adcl	$0,%ebp
+	movl	12(%esi),%edx
+
+	mull	%edx
+	addl	%eax,%eax
+	adcl	%edx,%edx
+	adcl	$0,%ebp
+	addl	%eax,%ebx
+	adcl	%edx,%ecx
+	movl	20(%esi),%eax
+	adcl	$0,%ebp
+	movl	16(%esi),%edx
+
+	mull	%edx
+	addl	%eax,%eax
+	adcl	%edx,%edx
+	adcl	$0,%ebp
+	addl	%eax,%ebx
+	adcl	%edx,%ecx
+	movl	28(%esi),%eax
+	adcl	$0,%ebp
+	movl	%ebx,36(%edi)
+	movl	12(%esi),%edx
+
+
+	xorl	%ebx,%ebx
+
+	mull	%edx
+	addl	%eax,%eax
+	adcl	%edx,%edx
+	adcl	$0,%ebx
+	addl	%eax,%ecx
+	adcl	%edx,%ebp
+	movl	24(%esi),%eax
+	adcl	$0,%ebx
+	movl	16(%esi),%edx
+
+	mull	%edx
+	addl	%eax,%eax
+	adcl	%edx,%edx
+	adcl	$0,%ebx
+	addl	%eax,%ecx
+	adcl	%edx,%ebp
+	movl	20(%esi),%eax
+	adcl	$0,%ebx
+
+	mull	%eax
+	addl	%eax,%ecx
+	adcl	%edx,%ebp
+	movl	16(%esi),%edx
+	adcl	$0,%ebx
+	movl	%ecx,40(%edi)
+	movl	28(%esi),%eax
+
+
+	xorl	%ecx,%ecx
+
+	mull	%edx
+	addl	%eax,%eax
+	adcl	%edx,%edx
+	adcl	$0,%ecx
+	addl	%eax,%ebp
+	adcl	%edx,%ebx
+	movl	24(%esi),%eax
+	adcl	$0,%ecx
+	movl	20(%esi),%edx
+
+	mull	%edx
+	addl	%eax,%eax
+	adcl	%edx,%edx
+	adcl	$0,%ecx
+	addl	%eax,%ebp
+	adcl	%edx,%ebx
+	movl	28(%esi),%eax
+	adcl	$0,%ecx
+	movl	%ebp,44(%edi)
+	movl	20(%esi),%edx
+
+
+	xorl	%ebp,%ebp
+
+	mull	%edx
+	addl	%eax,%eax
+	adcl	%edx,%edx
+	adcl	$0,%ebp
+	addl	%eax,%ebx
+	adcl	%edx,%ecx
+	movl	24(%esi),%eax
+	adcl	$0,%ebp
+
+	mull	%eax
+	addl	%eax,%ebx
+	adcl	%edx,%ecx
+	movl	24(%esi),%edx
+	adcl	$0,%ebp
+	movl	%ebx,48(%edi)
+	movl	28(%esi),%eax
+
+
+	xorl	%ebx,%ebx
+
+	mull	%edx
+	addl	%eax,%eax
+	adcl	%edx,%edx
+	adcl	$0,%ebx
+	addl	%eax,%ecx
+	adcl	%edx,%ebp
+	movl	28(%esi),%eax
+	adcl	$0,%ebx
+	movl	%ecx,52(%edi)
+
+
+	xorl	%ecx,%ecx
+
+	mull	%eax
+	addl	%eax,%ebp
+	adcl	%edx,%ebx
+	adcl	$0,%ecx
+	movl	%ebp,56(%edi)
+
+	movl	%ebx,60(%edi)
+	popl	%ebx
+	popl	%ebp
+	popl	%edi
+	popl	%esi
+	ret
+.size	bn_sqr_comba8,.-.L_bn_sqr_comba8_begin
+.globl	bn_sqr_comba4
+.hidden	bn_sqr_comba4
+.type	bn_sqr_comba4,@function
+.align	16
+bn_sqr_comba4:
+.L_bn_sqr_comba4_begin:
+	pushl	%esi
+	pushl	%edi
+	pushl	%ebp
+	pushl	%ebx
+	movl	20(%esp),%edi
+	movl	24(%esp),%esi
+	xorl	%ebx,%ebx
+	xorl	%ecx,%ecx
+	movl	(%esi),%eax
+
+	xorl	%ebp,%ebp
+
+	mull	%eax
+	addl	%eax,%ebx
+	adcl	%edx,%ecx
+	movl	(%esi),%edx
+	adcl	$0,%ebp
+	movl	%ebx,(%edi)
+	movl	4(%esi),%eax
+
+
+	xorl	%ebx,%ebx
+
+	mull	%edx
+	addl	%eax,%eax
+	adcl	%edx,%edx
+	adcl	$0,%ebx
+	addl	%eax,%ecx
+	adcl	%edx,%ebp
+	movl	8(%esi),%eax
+	adcl	$0,%ebx
+	movl	%ecx,4(%edi)
+	movl	(%esi),%edx
+
+
+	xorl	%ecx,%ecx
+
+	mull	%edx
+	addl	%eax,%eax
+	adcl	%edx,%edx
+	adcl	$0,%ecx
+	addl	%eax,%ebp
+	adcl	%edx,%ebx
+	movl	4(%esi),%eax
+	adcl	$0,%ecx
+
+	mull	%eax
+	addl	%eax,%ebp
+	adcl	%edx,%ebx
+	movl	(%esi),%edx
+	adcl	$0,%ecx
+	movl	%ebp,8(%edi)
+	movl	12(%esi),%eax
+
+
+	xorl	%ebp,%ebp
+
+	mull	%edx
+	addl	%eax,%eax
+	adcl	%edx,%edx
+	adcl	$0,%ebp
+	addl	%eax,%ebx
+	adcl	%edx,%ecx
+	movl	8(%esi),%eax
+	adcl	$0,%ebp
+	movl	4(%esi),%edx
+
+	mull	%edx
+	addl	%eax,%eax
+	adcl	%edx,%edx
+	adcl	$0,%ebp
+	addl	%eax,%ebx
+	adcl	%edx,%ecx
+	movl	12(%esi),%eax
+	adcl	$0,%ebp
+	movl	%ebx,12(%edi)
+	movl	4(%esi),%edx
+
+
+	xorl	%ebx,%ebx
+
+	mull	%edx
+	addl	%eax,%eax
+	adcl	%edx,%edx
+	adcl	$0,%ebx
+	addl	%eax,%ecx
+	adcl	%edx,%ebp
+	movl	8(%esi),%eax
+	adcl	$0,%ebx
+
+	mull	%eax
+	addl	%eax,%ecx
+	adcl	%edx,%ebp
+	movl	8(%esi),%edx
+	adcl	$0,%ebx
+	movl	%ecx,16(%edi)
+	movl	12(%esi),%eax
+
+
+	xorl	%ecx,%ecx
+
+	mull	%edx
+	addl	%eax,%eax
+	adcl	%edx,%edx
+	adcl	$0,%ecx
+	addl	%eax,%ebp
+	adcl	%edx,%ebx
+	movl	12(%esi),%eax
+	adcl	$0,%ecx
+	movl	%ebp,20(%edi)
+
+
+	xorl	%ebp,%ebp
+
+	mull	%eax
+	addl	%eax,%ebx
+	adcl	%edx,%ecx
+	adcl	$0,%ebp
+	movl	%ebx,24(%edi)
+
+	movl	%ecx,28(%edi)
+	popl	%ebx
+	popl	%ebp
+	popl	%edi
+	popl	%esi
+	ret
+.size	bn_sqr_comba4,.-.L_bn_sqr_comba4_begin
+#endif
+.section	.note.GNU-stack,"",@progbits
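
co-586.S above is the Comba multiplication and squaring back end: each repeated mull/addl/adcl/adcl group accumulates every partial product that lands in one output column into a three-word accumulator before that column is written out, which is why the register roles rotate through ebx/ecx/ebp between columns. A compact C model of the 8x8-word case, illustrative only and not the library's fallback:

#include <stdint.h>

/* Column-wise (Comba) 8x8 -> 16 word multiply: for each output column k,
 * sum all a[i]*b[k-i], emit the low word, and carry the rest forward. */
static void mul_comba8(uint32_t r[16], const uint32_t a[8], const uint32_t b[8]) {
    uint64_t acc = 0;    /* low 64 bits of the running column sum */
    uint32_t over = 0;   /* carries out of the 64-bit accumulator */
    for (int k = 0; k < 15; k++) {
        int lo = k < 8 ? 0 : k - 7;
        int hi = k < 8 ? k : 7;
        for (int i = lo; i <= hi; i++) {
            uint64_t prev = acc;
            acc += (uint64_t)a[i] * b[k - i];
            over += (acc < prev);           /* unsigned overflow check */
        }
        r[k] = (uint32_t)acc;               /* emit one result word */
        acc = (acc >> 32) | ((uint64_t)over << 32);
        over = 0;
    }
    r[15] = (uint32_t)acc;                  /* final carry word */
}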
--- /dev/null
+++ b/third_party/boringssl/linux-x86/crypto/fipsmodule/ghash-ssse3-x86.S
@@ -1,0 +1,294 @@
+// This file is generated from a similarly-named Perl script in the BoringSSL
+// source tree. Do not edit by hand.
+
+#if defined(__i386__)
+#if defined(BORINGSSL_PREFIX)
+#include <boringssl_prefix_symbols_asm.h>
+#endif
+.text
+.globl	gcm_gmult_ssse3
+.hidden	gcm_gmult_ssse3
+.type	gcm_gmult_ssse3,@function
+.align	16
+gcm_gmult_ssse3:
+.L_gcm_gmult_ssse3_begin:
+	pushl	%ebp
+	pushl	%ebx
+	pushl	%esi
+	pushl	%edi
+	movl	20(%esp),%edi
+	movl	24(%esp),%esi
+	movdqu	(%edi),%xmm0
+	call	.L000pic_point
+.L000pic_point:
+	popl	%eax
+	movdqa	.Lreverse_bytes-.L000pic_point(%eax),%xmm7
+	movdqa	.Llow4_mask-.L000pic_point(%eax),%xmm2
+.byte	102,15,56,0,199
+	movdqa	%xmm2,%xmm1
+	pandn	%xmm0,%xmm1
+	psrld	$4,%xmm1
+	pand	%xmm2,%xmm0
+	pxor	%xmm2,%xmm2
+	pxor	%xmm3,%xmm3
+	movl	$5,%eax
+.L001loop_row_1:
+	movdqa	(%esi),%xmm4
+	leal	16(%esi),%esi
+	movdqa	%xmm2,%xmm6
+.byte	102,15,58,15,243,1
+	movdqa	%xmm6,%xmm3
+	psrldq	$1,%xmm2
+	movdqa	%xmm4,%xmm5
+.byte	102,15,56,0,224
+.byte	102,15,56,0,233
+	pxor	%xmm5,%xmm2
+	movdqa	%xmm4,%xmm5
+	psllq	$60,%xmm5
+	movdqa	%xmm5,%xmm6
+	pslldq	$8,%xmm6
+	pxor	%xmm6,%xmm3
+	psrldq	$8,%xmm5
+	pxor	%xmm5,%xmm2
+	psrlq	$4,%xmm4
+	pxor	%xmm4,%xmm2
+	subl	$1,%eax
+	jnz	.L001loop_row_1
+	pxor	%xmm3,%xmm2
+	psrlq	$1,%xmm3
+	pxor	%xmm3,%xmm2
+	psrlq	$1,%xmm3
+	pxor	%xmm3,%xmm2
+	psrlq	$5,%xmm3
+	pxor	%xmm3,%xmm2
+	pxor	%xmm3,%xmm3
+	movl	$5,%eax
+.L002loop_row_2:
+	movdqa	(%esi),%xmm4
+	leal	16(%esi),%esi
+	movdqa	%xmm2,%xmm6
+.byte	102,15,58,15,243,1
+	movdqa	%xmm6,%xmm3
+	psrldq	$1,%xmm2
+	movdqa	%xmm4,%xmm5
+.byte	102,15,56,0,224
+.byte	102,15,56,0,233
+	pxor	%xmm5,%xmm2
+	movdqa	%xmm4,%xmm5
+	psllq	$60,%xmm5
+	movdqa	%xmm5,%xmm6
+	pslldq	$8,%xmm6
+	pxor	%xmm6,%xmm3
+	psrldq	$8,%xmm5
+	pxor	%xmm5,%xmm2
+	psrlq	$4,%xmm4
+	pxor	%xmm4,%xmm2
+	subl	$1,%eax
+	jnz	.L002loop_row_2
+	pxor	%xmm3,%xmm2
+	psrlq	$1,%xmm3
+	pxor	%xmm3,%xmm2
+	psrlq	$1,%xmm3
+	pxor	%xmm3,%xmm2
+	psrlq	$5,%xmm3
+	pxor	%xmm3,%xmm2
+	pxor	%xmm3,%xmm3
+	movl	$6,%eax
+.L003loop_row_3:
+	movdqa	(%esi),%xmm4
+	leal	16(%esi),%esi
+	movdqa	%xmm2,%xmm6
+.byte	102,15,58,15,243,1
+	movdqa	%xmm6,%xmm3
+	psrldq	$1,%xmm2
+	movdqa	%xmm4,%xmm5
+.byte	102,15,56,0,224
+.byte	102,15,56,0,233
+	pxor	%xmm5,%xmm2
+	movdqa	%xmm4,%xmm5
+	psllq	$60,%xmm5
+	movdqa	%xmm5,%xmm6
+	pslldq	$8,%xmm6
+	pxor	%xmm6,%xmm3
+	psrldq	$8,%xmm5
+	pxor	%xmm5,%xmm2
+	psrlq	$4,%xmm4
+	pxor	%xmm4,%xmm2
+	subl	$1,%eax
+	jnz	.L003loop_row_3
+	pxor	%xmm3,%xmm2
+	psrlq	$1,%xmm3
+	pxor	%xmm3,%xmm2
+	psrlq	$1,%xmm3
+	pxor	%xmm3,%xmm2
+	psrlq	$5,%xmm3
+	pxor	%xmm3,%xmm2
+	pxor	%xmm3,%xmm3
+.byte	102,15,56,0,215
+	movdqu	%xmm2,(%edi)
+	pxor	%xmm0,%xmm0
+	pxor	%xmm1,%xmm1
+	pxor	%xmm2,%xmm2
+	pxor	%xmm3,%xmm3
+	pxor	%xmm4,%xmm4
+	pxor	%xmm5,%xmm5
+	pxor	%xmm6,%xmm6
+	popl	%edi
+	popl	%esi
+	popl	%ebx
+	popl	%ebp
+	ret
+.size	gcm_gmult_ssse3,.-.L_gcm_gmult_ssse3_begin
+.globl	gcm_ghash_ssse3
+.hidden	gcm_ghash_ssse3
+.type	gcm_ghash_ssse3,@function
+.align	16
+gcm_ghash_ssse3:
+.L_gcm_ghash_ssse3_begin:
+	pushl	%ebp
+	pushl	%ebx
+	pushl	%esi
+	pushl	%edi
+	movl	20(%esp),%edi
+	movl	24(%esp),%esi
+	movl	28(%esp),%edx
+	movl	32(%esp),%ecx
+	movdqu	(%edi),%xmm0
+	call	.L004pic_point
+.L004pic_point:
+	popl	%ebx
+	movdqa	.Lreverse_bytes-.L004pic_point(%ebx),%xmm7
+	andl	$-16,%ecx
+.byte	102,15,56,0,199
+	pxor	%xmm3,%xmm3
+.L005loop_ghash:
+	movdqa	.Llow4_mask-.L004pic_point(%ebx),%xmm2
+	movdqu	(%edx),%xmm1
+.byte	102,15,56,0,207
+	pxor	%xmm1,%xmm0
+	movdqa	%xmm2,%xmm1
+	pandn	%xmm0,%xmm1
+	psrld	$4,%xmm1
+	pand	%xmm2,%xmm0
+	pxor	%xmm2,%xmm2
+	movl	$5,%eax
+.L006loop_row_4:
+	movdqa	(%esi),%xmm4
+	leal	16(%esi),%esi
+	movdqa	%xmm2,%xmm6
+.byte	102,15,58,15,243,1
+	movdqa	%xmm6,%xmm3
+	psrldq	$1,%xmm2
+	movdqa	%xmm4,%xmm5
+.byte	102,15,56,0,224
+.byte	102,15,56,0,233
+	pxor	%xmm5,%xmm2
+	movdqa	%xmm4,%xmm5
+	psllq	$60,%xmm5
+	movdqa	%xmm5,%xmm6
+	pslldq	$8,%xmm6
+	pxor	%xmm6,%xmm3
+	psrldq	$8,%xmm5
+	pxor	%xmm5,%xmm2
+	psrlq	$4,%xmm4
+	pxor	%xmm4,%xmm2
+	subl	$1,%eax
+	jnz	.L006loop_row_4
+	pxor	%xmm3,%xmm2
+	psrlq	$1,%xmm3
+	pxor	%xmm3,%xmm2
+	psrlq	$1,%xmm3
+	pxor	%xmm3,%xmm2
+	psrlq	$5,%xmm3
+	pxor	%xmm3,%xmm2
+	pxor	%xmm3,%xmm3
+	movl	$5,%eax
+.L007loop_row_5:
+	movdqa	(%esi),%xmm4
+	leal	16(%esi),%esi
+	movdqa	%xmm2,%xmm6
+.byte	102,15,58,15,243,1
+	movdqa	%xmm6,%xmm3
+	psrldq	$1,%xmm2
+	movdqa	%xmm4,%xmm5
+.byte	102,15,56,0,224
+.byte	102,15,56,0,233
+	pxor	%xmm5,%xmm2
+	movdqa	%xmm4,%xmm5
+	psllq	$60,%xmm5
+	movdqa	%xmm5,%xmm6
+	pslldq	$8,%xmm6
+	pxor	%xmm6,%xmm3
+	psrldq	$8,%xmm5
+	pxor	%xmm5,%xmm2
+	psrlq	$4,%xmm4
+	pxor	%xmm4,%xmm2
+	subl	$1,%eax
+	jnz	.L007loop_row_5
+	pxor	%xmm3,%xmm2
+	psrlq	$1,%xmm3
+	pxor	%xmm3,%xmm2
+	psrlq	$1,%xmm3
+	pxor	%xmm3,%xmm2
+	psrlq	$5,%xmm3
+	pxor	%xmm3,%xmm2
+	pxor	%xmm3,%xmm3
+	movl	$6,%eax
+.L008loop_row_6:
+	movdqa	(%esi),%xmm4
+	leal	16(%esi),%esi
+	movdqa	%xmm2,%xmm6
+.byte	102,15,58,15,243,1
+	movdqa	%xmm6,%xmm3
+	psrldq	$1,%xmm2
+	movdqa	%xmm4,%xmm5
+.byte	102,15,56,0,224
+.byte	102,15,56,0,233
+	pxor	%xmm5,%xmm2
+	movdqa	%xmm4,%xmm5
+	psllq	$60,%xmm5
+	movdqa	%xmm5,%xmm6
+	pslldq	$8,%xmm6
+	pxor	%xmm6,%xmm3
+	psrldq	$8,%xmm5
+	pxor	%xmm5,%xmm2
+	psrlq	$4,%xmm4
+	pxor	%xmm4,%xmm2
+	subl	$1,%eax
+	jnz	.L008loop_row_6
+	pxor	%xmm3,%xmm2
+	psrlq	$1,%xmm3
+	pxor	%xmm3,%xmm2
+	psrlq	$1,%xmm3
+	pxor	%xmm3,%xmm2
+	psrlq	$5,%xmm3
+	pxor	%xmm3,%xmm2
+	pxor	%xmm3,%xmm3
+	movdqa	%xmm2,%xmm0
+	leal	-256(%esi),%esi
+	leal	16(%edx),%edx
+	subl	$16,%ecx
+	jnz	.L005loop_ghash
+.byte	102,15,56,0,199
+	movdqu	%xmm0,(%edi)
+	pxor	%xmm0,%xmm0
+	pxor	%xmm1,%xmm1
+	pxor	%xmm2,%xmm2
+	pxor	%xmm3,%xmm3
+	pxor	%xmm4,%xmm4
+	pxor	%xmm5,%xmm5
+	pxor	%xmm6,%xmm6
+	popl	%edi
+	popl	%esi
+	popl	%ebx
+	popl	%ebp
+	ret
+.size	gcm_ghash_ssse3,.-.L_gcm_ghash_ssse3_begin
+.align	16
+.Lreverse_bytes:
+.byte	15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0
+.align	16
+.Llow4_mask:
+.long	252645135,252645135,252645135,252645135
+#endif
+.section	.note.GNU-stack,"",@progbits
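
Both ghash-ssse3-x86.S above and ghash-x86.S below accelerate the same operation: multiplying the GHASH state by the hash key H in GF(2^128) under GCM's reduction polynomial, using a 256-byte nibble table here (note the leal -256(%esi),%esi rewind per block) and PCLMULQDQ below. As a readability aid, the bit-by-bit reference multiply from SP 800-38D looks roughly like this; it is slow and illustrative only, not what either routine executes:

#include <stdint.h>
#include <string.h>

/* Z = X * Y in GF(2^128) with polynomial x^128 + x^7 + x^2 + x + 1,
 * blocks treated MSB-first as in the GCM specification. */
static void gf128_mul(uint8_t z[16], const uint8_t x[16], const uint8_t y[16]) {
    uint8_t v[16], acc[16] = {0};
    memcpy(v, y, 16);
    for (int i = 0; i < 128; i++) {
        if (x[i / 8] & (0x80u >> (i % 8))) {       /* bit i of x, MSB-first */
            for (int j = 0; j < 16; j++) acc[j] ^= v[j];
        }
        int lsb = v[15] & 1;                        /* bit that falls off */
        for (int j = 15; j > 0; j--)
            v[j] = (uint8_t)((v[j] >> 1) | (v[j - 1] << 7));
        v[0] >>= 1;
        if (lsb) v[0] ^= 0xe1;                      /* fold in the reduction */
    }
    memcpy(z, acc, 16);
}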
--- /dev/null
+++ b/third_party/boringssl/linux-x86/crypto/fipsmodule/ghash-x86.S
@@ -1,0 +1,330 @@
+// This file is generated from a similarly-named Perl script in the BoringSSL
+// source tree. Do not edit by hand.
+
+#if defined(__i386__)
+#if defined(BORINGSSL_PREFIX)
+#include <boringssl_prefix_symbols_asm.h>
+#endif
+.text
+.globl	gcm_init_clmul
+.hidden	gcm_init_clmul
+.type	gcm_init_clmul,@function
+.align	16
+gcm_init_clmul:
+.L_gcm_init_clmul_begin:
+	movl	4(%esp),%edx
+	movl	8(%esp),%eax
+	call	.L000pic
+.L000pic:
+	popl	%ecx
+	leal	.Lbswap-.L000pic(%ecx),%ecx
+	movdqu	(%eax),%xmm2
+	pshufd	$78,%xmm2,%xmm2
+	pshufd	$255,%xmm2,%xmm4
+	movdqa	%xmm2,%xmm3
+	psllq	$1,%xmm2
+	pxor	%xmm5,%xmm5
+	psrlq	$63,%xmm3
+	pcmpgtd	%xmm4,%xmm5
+	pslldq	$8,%xmm3
+	por	%xmm3,%xmm2
+	pand	16(%ecx),%xmm5
+	pxor	%xmm5,%xmm2
+	movdqa	%xmm2,%xmm0
+	movdqa	%xmm0,%xmm1
+	pshufd	$78,%xmm0,%xmm3
+	pshufd	$78,%xmm2,%xmm4
+	pxor	%xmm0,%xmm3
+	pxor	%xmm2,%xmm4
+.byte	102,15,58,68,194,0
+.byte	102,15,58,68,202,17
+.byte	102,15,58,68,220,0
+	xorps	%xmm0,%xmm3
+	xorps	%xmm1,%xmm3
+	movdqa	%xmm3,%xmm4
+	psrldq	$8,%xmm3
+	pslldq	$8,%xmm4
+	pxor	%xmm3,%xmm1
+	pxor	%xmm4,%xmm0
+	movdqa	%xmm0,%xmm4
+	movdqa	%xmm0,%xmm3
+	psllq	$5,%xmm0
+	pxor	%xmm0,%xmm3
+	psllq	$1,%xmm0
+	pxor	%xmm3,%xmm0
+	psllq	$57,%xmm0
+	movdqa	%xmm0,%xmm3
+	pslldq	$8,%xmm0
+	psrldq	$8,%xmm3
+	pxor	%xmm4,%xmm0
+	pxor	%xmm3,%xmm1
+	movdqa	%xmm0,%xmm4
+	psrlq	$1,%xmm0
+	pxor	%xmm4,%xmm1
+	pxor	%xmm0,%xmm4
+	psrlq	$5,%xmm0
+	pxor	%xmm4,%xmm0
+	psrlq	$1,%xmm0
+	pxor	%xmm1,%xmm0
+	pshufd	$78,%xmm2,%xmm3
+	pshufd	$78,%xmm0,%xmm4
+	pxor	%xmm2,%xmm3
+	movdqu	%xmm2,(%edx)
+	pxor	%xmm0,%xmm4
+	movdqu	%xmm0,16(%edx)
+.byte	102,15,58,15,227,8
+	movdqu	%xmm4,32(%edx)
+	ret
+.size	gcm_init_clmul,.-.L_gcm_init_clmul_begin
+.globl	gcm_gmult_clmul
+.hidden	gcm_gmult_clmul
+.type	gcm_gmult_clmul,@function
+.align	16
+gcm_gmult_clmul:
+.L_gcm_gmult_clmul_begin:
+	movl	4(%esp),%eax
+	movl	8(%esp),%edx
+	call	.L001pic
+.L001pic:
+	popl	%ecx
+	leal	.Lbswap-.L001pic(%ecx),%ecx
+	movdqu	(%eax),%xmm0
+	movdqa	(%ecx),%xmm5
+	movups	(%edx),%xmm2
+.byte	102,15,56,0,197
+	movups	32(%edx),%xmm4
+	movdqa	%xmm0,%xmm1
+	pshufd	$78,%xmm0,%xmm3
+	pxor	%xmm0,%xmm3
+.byte	102,15,58,68,194,0
+.byte	102,15,58,68,202,17
+.byte	102,15,58,68,220,0
+	xorps	%xmm0,%xmm3
+	xorps	%xmm1,%xmm3
+	movdqa	%xmm3,%xmm4
+	psrldq	$8,%xmm3
+	pslldq	$8,%xmm4
+	pxor	%xmm3,%xmm1
+	pxor	%xmm4,%xmm0
+	movdqa	%xmm0,%xmm4
+	movdqa	%xmm0,%xmm3
+	psllq	$5,%xmm0
+	pxor	%xmm0,%xmm3
+	psllq	$1,%xmm0
+	pxor	%xmm3,%xmm0
+	psllq	$57,%xmm0
+	movdqa	%xmm0,%xmm3
+	pslldq	$8,%xmm0
+	psrldq	$8,%xmm3
+	pxor	%xmm4,%xmm0
+	pxor	%xmm3,%xmm1
+	movdqa	%xmm0,%xmm4
+	psrlq	$1,%xmm0
+	pxor	%xmm4,%xmm1
+	pxor	%xmm0,%xmm4
+	psrlq	$5,%xmm0
+	pxor	%xmm4,%xmm0
+	psrlq	$1,%xmm0
+	pxor	%xmm1,%xmm0
+.byte	102,15,56,0,197
+	movdqu	%xmm0,(%eax)
+	ret
+.size	gcm_gmult_clmul,.-.L_gcm_gmult_clmul_begin
+.globl	gcm_ghash_clmul
+.hidden	gcm_ghash_clmul
+.type	gcm_ghash_clmul,@function
+.align	16
+gcm_ghash_clmul:
+.L_gcm_ghash_clmul_begin:
+	pushl	%ebp
+	pushl	%ebx
+	pushl	%esi
+	pushl	%edi
+	movl	20(%esp),%eax
+	movl	24(%esp),%edx
+	movl	28(%esp),%esi
+	movl	32(%esp),%ebx
+	call	.L002pic
+.L002pic:
+	popl	%ecx
+	leal	.Lbswap-.L002pic(%ecx),%ecx
+	movdqu	(%eax),%xmm0
+	movdqa	(%ecx),%xmm5
+	movdqu	(%edx),%xmm2
+.byte	102,15,56,0,197
+	subl	$16,%ebx
+	jz	.L003odd_tail
+	movdqu	(%esi),%xmm3
+	movdqu	16(%esi),%xmm6
+.byte	102,15,56,0,221
+.byte	102,15,56,0,245
+	movdqu	32(%edx),%xmm5
+	pxor	%xmm3,%xmm0
+	pshufd	$78,%xmm6,%xmm3
+	movdqa	%xmm6,%xmm7
+	pxor	%xmm6,%xmm3
+	leal	32(%esi),%esi
+.byte	102,15,58,68,242,0
+.byte	102,15,58,68,250,17
+.byte	102,15,58,68,221,0
+	movups	16(%edx),%xmm2
+	nop
+	subl	$32,%ebx
+	jbe	.L004even_tail
+	jmp	.L005mod_loop
+.align	32
+.L005mod_loop:
+	pshufd	$78,%xmm0,%xmm4
+	movdqa	%xmm0,%xmm1
+	pxor	%xmm0,%xmm4
+	nop
+.byte	102,15,58,68,194,0
+.byte	102,15,58,68,202,17
+.byte	102,15,58,68,229,16
+	movups	(%edx),%xmm2
+	xorps	%xmm6,%xmm0
+	movdqa	(%ecx),%xmm5
+	xorps	%xmm7,%xmm1
+	movdqu	(%esi),%xmm7
+	pxor	%xmm0,%xmm3
+	movdqu	16(%esi),%xmm6
+	pxor	%xmm1,%xmm3
+.byte	102,15,56,0,253
+	pxor	%xmm3,%xmm4
+	movdqa	%xmm4,%xmm3
+	psrldq	$8,%xmm4
+	pslldq	$8,%xmm3
+	pxor	%xmm4,%xmm1
+	pxor	%xmm3,%xmm0
+.byte	102,15,56,0,245
+	pxor	%xmm7,%xmm1
+	movdqa	%xmm6,%xmm7
+	movdqa	%xmm0,%xmm4
+	movdqa	%xmm0,%xmm3
+	psllq	$5,%xmm0
+	pxor	%xmm0,%xmm3
+	psllq	$1,%xmm0
+	pxor	%xmm3,%xmm0
+.byte	102,15,58,68,242,0
+	movups	32(%edx),%xmm5
+	psllq	$57,%xmm0
+	movdqa	%xmm0,%xmm3
+	pslldq	$8,%xmm0
+	psrldq	$8,%xmm3
+	pxor	%xmm4,%xmm0
+	pxor	%xmm3,%xmm1
+	pshufd	$78,%xmm7,%xmm3
+	movdqa	%xmm0,%xmm4
+	psrlq	$1,%xmm0
+	pxor	%xmm7,%xmm3
+	pxor	%xmm4,%xmm1
+.byte	102,15,58,68,250,17
+	movups	16(%edx),%xmm2
+	pxor	%xmm0,%xmm4
+	psrlq	$5,%xmm0
+	pxor	%xmm4,%xmm0
+	psrlq	$1,%xmm0
+	pxor	%xmm1,%xmm0
+.byte	102,15,58,68,221,0
+	leal	32(%esi),%esi
+	subl	$32,%ebx
+	ja	.L005mod_loop
+.L004even_tail:
+	pshufd	$78,%xmm0,%xmm4
+	movdqa	%xmm0,%xmm1
+	pxor	%xmm0,%xmm4
+.byte	102,15,58,68,194,0
+.byte	102,15,58,68,202,17
+.byte	102,15,58,68,229,16
+	movdqa	(%ecx),%xmm5
+	xorps	%xmm6,%xmm0
+	xorps	%xmm7,%xmm1
+	pxor	%xmm0,%xmm3
+	pxor	%xmm1,%xmm3
+	pxor	%xmm3,%xmm4
+	movdqa	%xmm4,%xmm3
+	psrldq	$8,%xmm4
+	pslldq	$8,%xmm3
+	pxor	%xmm4,%xmm1
+	pxor	%xmm3,%xmm0
+	movdqa	%xmm0,%xmm4
+	movdqa	%xmm0,%xmm3
+	psllq	$5,%xmm0
+	pxor	%xmm0,%xmm3
+	psllq	$1,%xmm0
+	pxor	%xmm3,%xmm0
+	psllq	$57,%xmm0
+	movdqa	%xmm0,%xmm3
+	pslldq	$8,%xmm0
+	psrldq	$8,%xmm3
+	pxor	%xmm4,%xmm0
+	pxor	%xmm3,%xmm1
+	movdqa	%xmm0,%xmm4
+	psrlq	$1,%xmm0
+	pxor	%xmm4,%xmm1
+	pxor	%xmm0,%xmm4
+	psrlq	$5,%xmm0
+	pxor	%xmm4,%xmm0
+	psrlq	$1,%xmm0
+	pxor	%xmm1,%xmm0
+	testl	%ebx,%ebx
+	jnz	.L006done
+	movups	(%edx),%xmm2
+.L003odd_tail:
+	movdqu	(%esi),%xmm3
+.byte	102,15,56,0,221
+	pxor	%xmm3,%xmm0
+	movdqa	%xmm0,%xmm1
+	pshufd	$78,%xmm0,%xmm3
+	pshufd	$78,%xmm2,%xmm4
+	pxor	%xmm0,%xmm3
+	pxor	%xmm2,%xmm4
+.byte	102,15,58,68,194,0
+.byte	102,15,58,68,202,17
+.byte	102,15,58,68,220,0
+	xorps	%xmm0,%xmm3
+	xorps	%xmm1,%xmm3
+	movdqa	%xmm3,%xmm4
+	psrldq	$8,%xmm3
+	pslldq	$8,%xmm4
+	pxor	%xmm3,%xmm1
+	pxor	%xmm4,%xmm0
+	movdqa	%xmm0,%xmm4
+	movdqa	%xmm0,%xmm3
+	psllq	$5,%xmm0
+	pxor	%xmm0,%xmm3
+	psllq	$1,%xmm0
+	pxor	%xmm3,%xmm0
+	psllq	$57,%xmm0
+	movdqa	%xmm0,%xmm3
+	pslldq	$8,%xmm0
+	psrldq	$8,%xmm3
+	pxor	%xmm4,%xmm0
+	pxor	%xmm3,%xmm1
+	movdqa	%xmm0,%xmm4
+	psrlq	$1,%xmm0
+	pxor	%xmm4,%xmm1
+	pxor	%xmm0,%xmm4
+	psrlq	$5,%xmm0
+	pxor	%xmm4,%xmm0
+	psrlq	$1,%xmm0
+	pxor	%xmm1,%xmm0
+.L006done:
+.byte	102,15,56,0,197
+	movdqu	%xmm0,(%eax)
+	popl	%edi
+	popl	%esi
+	popl	%ebx
+	popl	%ebp
+	ret
+.size	gcm_ghash_clmul,.-.L_gcm_ghash_clmul_begin
+.align	64
+.Lbswap:
+.byte	15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0
+.byte	1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,194
+.byte	71,72,65,83,72,32,102,111,114,32,120,56,54,44,32,67
+.byte	82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112
+.byte	112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62
+.byte	0
+#endif
+.section	.note.GNU-stack,"",@progbits
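
The .byte 102,15,58,68,... encodings in the clmul routines above are pclmulqdq, the 64x64 carry-less multiply. The assembly folds in a precomputed xor of the halves of H (the value stored at 32(%edx) by gcm_init_clmul), but the core step is a Karatsuba-style 128x128 carry-less multiply built from three pclmulqdq results followed by the polynomial reduction. A sketch of that building block with intrinsics; names and layout are illustrative, and reduction is omitted (compile with -mpclmul):

#include <emmintrin.h>
#include <wmmintrin.h>   /* _mm_clmulepi64_si128 */

/* 128x128 -> 256-bit carry-less multiply via one Karatsuba step:
 * three 64x64 clmuls instead of four, result returned as lo/hi halves. */
static void clmul128(__m128i a, __m128i b, __m128i *lo, __m128i *hi) {
    __m128i t0 = _mm_clmulepi64_si128(a, b, 0x00);            /* a_lo * b_lo */
    __m128i t1 = _mm_clmulepi64_si128(a, b, 0x11);            /* a_hi * b_hi */
    __m128i t2 = _mm_clmulepi64_si128(
        _mm_xor_si128(a, _mm_shuffle_epi32(a, 0x4e)),
        _mm_xor_si128(b, _mm_shuffle_epi32(b, 0x4e)), 0x00);  /* (a_lo^a_hi)*(b_lo^b_hi) */
    t2 = _mm_xor_si128(t2, _mm_xor_si128(t0, t1));            /* cross terms */
    *lo = _mm_xor_si128(t0, _mm_slli_si128(t2, 8));
    *hi = _mm_xor_si128(t1, _mm_srli_si128(t2, 8));
}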
--- /dev/null
+++ b/third_party/boringssl/linux-x86/crypto/fipsmodule/md5-586.S
@@ -1,0 +1,688 @@
+// This file is generated from a similarly-named Perl script in the BoringSSL
+// source tree. Do not edit by hand.
+
+#if defined(__i386__)
+#if defined(BORINGSSL_PREFIX)
+#include <boringssl_prefix_symbols_asm.h>
+#endif
+.text
+.globl	md5_block_asm_data_order
+.hidden	md5_block_asm_data_order
+.type	md5_block_asm_data_order,@function
+.align	16
+md5_block_asm_data_order:
+.L_md5_block_asm_data_order_begin:
+	pushl	%esi
+	pushl	%edi
+	movl	12(%esp),%edi
+	movl	16(%esp),%esi
+	movl	20(%esp),%ecx
+	pushl	%ebp
+	shll	$6,%ecx
+	pushl	%ebx
+	addl	%esi,%ecx
+	subl	$64,%ecx
+	movl	(%edi),%eax
+	pushl	%ecx
+	movl	4(%edi),%ebx
+	movl	8(%edi),%ecx
+	movl	12(%edi),%edx
+.L000start:
+
+
+	movl	%ecx,%edi
+	movl	(%esi),%ebp
+
+	xorl	%edx,%edi
+	andl	%ebx,%edi
+	leal	3614090360(%eax,%ebp,1),%eax
+	xorl	%edx,%edi
+	addl	%edi,%eax
+	movl	%ebx,%edi
+	roll	$7,%eax
+	movl	4(%esi),%ebp
+	addl	%ebx,%eax
+
+	xorl	%ecx,%edi
+	andl	%eax,%edi
+	leal	3905402710(%edx,%ebp,1),%edx
+	xorl	%ecx,%edi
+	addl	%edi,%edx
+	movl	%eax,%edi
+	roll	$12,%edx
+	movl	8(%esi),%ebp
+	addl	%eax,%edx
+
+	xorl	%ebx,%edi
+	andl	%edx,%edi
+	leal	606105819(%ecx,%ebp,1),%ecx
+	xorl	%ebx,%edi
+	addl	%edi,%ecx
+	movl	%edx,%edi
+	roll	$17,%ecx
+	movl	12(%esi),%ebp
+	addl	%edx,%ecx
+
+	xorl	%eax,%edi
+	andl	%ecx,%edi
+	leal	3250441966(%ebx,%ebp,1),%ebx
+	xorl	%eax,%edi
+	addl	%edi,%ebx
+	movl	%ecx,%edi
+	roll	$22,%ebx
+	movl	16(%esi),%ebp
+	addl	%ecx,%ebx
+
+	xorl	%edx,%edi
+	andl	%ebx,%edi
+	leal	4118548399(%eax,%ebp,1),%eax
+	xorl	%edx,%edi
+	addl	%edi,%eax
+	movl	%ebx,%edi
+	roll	$7,%eax
+	movl	20(%esi),%ebp
+	addl	%ebx,%eax
+
+	xorl	%ecx,%edi
+	andl	%eax,%edi
+	leal	1200080426(%edx,%ebp,1),%edx
+	xorl	%ecx,%edi
+	addl	%edi,%edx
+	movl	%eax,%edi
+	roll	$12,%edx
+	movl	24(%esi),%ebp
+	addl	%eax,%edx
+
+	xorl	%ebx,%edi
+	andl	%edx,%edi
+	leal	2821735955(%ecx,%ebp,1),%ecx
+	xorl	%ebx,%edi
+	addl	%edi,%ecx
+	movl	%edx,%edi
+	roll	$17,%ecx
+	movl	28(%esi),%ebp
+	addl	%edx,%ecx
+
+	xorl	%eax,%edi
+	andl	%ecx,%edi
+	leal	4249261313(%ebx,%ebp,1),%ebx
+	xorl	%eax,%edi
+	addl	%edi,%ebx
+	movl	%ecx,%edi
+	roll	$22,%ebx
+	movl	32(%esi),%ebp
+	addl	%ecx,%ebx
+
+	xorl	%edx,%edi
+	andl	%ebx,%edi
+	leal	1770035416(%eax,%ebp,1),%eax
+	xorl	%edx,%edi
+	addl	%edi,%eax
+	movl	%ebx,%edi
+	roll	$7,%eax
+	movl	36(%esi),%ebp
+	addl	%ebx,%eax
+
+	xorl	%ecx,%edi
+	andl	%eax,%edi
+	leal	2336552879(%edx,%ebp,1),%edx
+	xorl	%ecx,%edi
+	addl	%edi,%edx
+	movl	%eax,%edi
+	roll	$12,%edx
+	movl	40(%esi),%ebp
+	addl	%eax,%edx
+
+	xorl	%ebx,%edi
+	andl	%edx,%edi
+	leal	4294925233(%ecx,%ebp,1),%ecx
+	xorl	%ebx,%edi
+	addl	%edi,%ecx
+	movl	%edx,%edi
+	roll	$17,%ecx
+	movl	44(%esi),%ebp
+	addl	%edx,%ecx
+
+	xorl	%eax,%edi
+	andl	%ecx,%edi
+	leal	2304563134(%ebx,%ebp,1),%ebx
+	xorl	%eax,%edi
+	addl	%edi,%ebx
+	movl	%ecx,%edi
+	roll	$22,%ebx
+	movl	48(%esi),%ebp
+	addl	%ecx,%ebx
+
+	xorl	%edx,%edi
+	andl	%ebx,%edi
+	leal	1804603682(%eax,%ebp,1),%eax
+	xorl	%edx,%edi
+	addl	%edi,%eax
+	movl	%ebx,%edi
+	roll	$7,%eax
+	movl	52(%esi),%ebp
+	addl	%ebx,%eax
+
+	xorl	%ecx,%edi
+	andl	%eax,%edi
+	leal	4254626195(%edx,%ebp,1),%edx
+	xorl	%ecx,%edi
+	addl	%edi,%edx
+	movl	%eax,%edi
+	roll	$12,%edx
+	movl	56(%esi),%ebp
+	addl	%eax,%edx
+
+	xorl	%ebx,%edi
+	andl	%edx,%edi
+	leal	2792965006(%ecx,%ebp,1),%ecx
+	xorl	%ebx,%edi
+	addl	%edi,%ecx
+	movl	%edx,%edi
+	roll	$17,%ecx
+	movl	60(%esi),%ebp
+	addl	%edx,%ecx
+
+	xorl	%eax,%edi
+	andl	%ecx,%edi
+	leal	1236535329(%ebx,%ebp,1),%ebx
+	xorl	%eax,%edi
+	addl	%edi,%ebx
+	movl	%ecx,%edi
+	roll	$22,%ebx
+	movl	4(%esi),%ebp
+	addl	%ecx,%ebx
+
+
+
+	leal	4129170786(%eax,%ebp,1),%eax
+	xorl	%ebx,%edi
+	andl	%edx,%edi
+	movl	24(%esi),%ebp
+	xorl	%ecx,%edi
+	addl	%edi,%eax
+	movl	%ebx,%edi
+	roll	$5,%eax
+	addl	%ebx,%eax
+
+	leal	3225465664(%edx,%ebp,1),%edx
+	xorl	%eax,%edi
+	andl	%ecx,%edi
+	movl	44(%esi),%ebp
+	xorl	%ebx,%edi
+	addl	%edi,%edx
+	movl	%eax,%edi
+	roll	$9,%edx
+	addl	%eax,%edx
+
+	leal	643717713(%ecx,%ebp,1),%ecx
+	xorl	%edx,%edi
+	andl	%ebx,%edi
+	movl	(%esi),%ebp
+	xorl	%eax,%edi
+	addl	%edi,%ecx
+	movl	%edx,%edi
+	roll	$14,%ecx
+	addl	%edx,%ecx
+
+	leal	3921069994(%ebx,%ebp,1),%ebx
+	xorl	%ecx,%edi
+	andl	%eax,%edi
+	movl	20(%esi),%ebp
+	xorl	%edx,%edi
+	addl	%edi,%ebx
+	movl	%ecx,%edi
+	roll	$20,%ebx
+	addl	%ecx,%ebx
+
+	leal	3593408605(%eax,%ebp,1),%eax
+	xorl	%ebx,%edi
+	andl	%edx,%edi
+	movl	40(%esi),%ebp
+	xorl	%ecx,%edi
+	addl	%edi,%eax
+	movl	%ebx,%edi
+	roll	$5,%eax
+	addl	%ebx,%eax
+
+	leal	38016083(%edx,%ebp,1),%edx
+	xorl	%eax,%edi
+	andl	%ecx,%edi
+	movl	60(%esi),%ebp
+	xorl	%ebx,%edi
+	addl	%edi,%edx
+	movl	%eax,%edi
+	roll	$9,%edx
+	addl	%eax,%edx
+
+	leal	3634488961(%ecx,%ebp,1),%ecx
+	xorl	%edx,%edi
+	andl	%ebx,%edi
+	movl	16(%esi),%ebp
+	xorl	%eax,%edi
+	addl	%edi,%ecx
+	movl	%edx,%edi
+	roll	$14,%ecx
+	addl	%edx,%ecx
+
+	leal	3889429448(%ebx,%ebp,1),%ebx
+	xorl	%ecx,%edi
+	andl	%eax,%edi
+	movl	36(%esi),%ebp
+	xorl	%edx,%edi
+	addl	%edi,%ebx
+	movl	%ecx,%edi
+	roll	$20,%ebx
+	addl	%ecx,%ebx
+
+	leal	568446438(%eax,%ebp,1),%eax
+	xorl	%ebx,%edi
+	andl	%edx,%edi
+	movl	56(%esi),%ebp
+	xorl	%ecx,%edi
+	addl	%edi,%eax
+	movl	%ebx,%edi
+	roll	$5,%eax
+	addl	%ebx,%eax
+
+	leal	3275163606(%edx,%ebp,1),%edx
+	xorl	%eax,%edi
+	andl	%ecx,%edi
+	movl	12(%esi),%ebp
+	xorl	%ebx,%edi
+	addl	%edi,%edx
+	movl	%eax,%edi
+	roll	$9,%edx
+	addl	%eax,%edx
+
+	leal	4107603335(%ecx,%ebp,1),%ecx
+	xorl	%edx,%edi
+	andl	%ebx,%edi
+	movl	32(%esi),%ebp
+	xorl	%eax,%edi
+	addl	%edi,%ecx
+	movl	%edx,%edi
+	roll	$14,%ecx
+	addl	%edx,%ecx
+
+	leal	1163531501(%ebx,%ebp,1),%ebx
+	xorl	%ecx,%edi
+	andl	%eax,%edi
+	movl	52(%esi),%ebp
+	xorl	%edx,%edi
+	addl	%edi,%ebx
+	movl	%ecx,%edi
+	roll	$20,%ebx
+	addl	%ecx,%ebx
+
+	leal	2850285829(%eax,%ebp,1),%eax
+	xorl	%ebx,%edi
+	andl	%edx,%edi
+	movl	8(%esi),%ebp
+	xorl	%ecx,%edi
+	addl	%edi,%eax
+	movl	%ebx,%edi
+	roll	$5,%eax
+	addl	%ebx,%eax
+
+	leal	4243563512(%edx,%ebp,1),%edx
+	xorl	%eax,%edi
+	andl	%ecx,%edi
+	movl	28(%esi),%ebp
+	xorl	%ebx,%edi
+	addl	%edi,%edx
+	movl	%eax,%edi
+	roll	$9,%edx
+	addl	%eax,%edx
+
+	leal	1735328473(%ecx,%ebp,1),%ecx
+	xorl	%edx,%edi
+	andl	%ebx,%edi
+	movl	48(%esi),%ebp
+	xorl	%eax,%edi
+	addl	%edi,%ecx
+	movl	%edx,%edi
+	roll	$14,%ecx
+	addl	%edx,%ecx
+
+	leal	2368359562(%ebx,%ebp,1),%ebx
+	xorl	%ecx,%edi
+	andl	%eax,%edi
+	movl	20(%esi),%ebp
+	xorl	%edx,%edi
+	addl	%edi,%ebx
+	movl	%ecx,%edi
+	roll	$20,%ebx
+	addl	%ecx,%ebx
+
+
+
+	xorl	%edx,%edi
+	xorl	%ebx,%edi
+	leal	4294588738(%eax,%ebp,1),%eax
+	addl	%edi,%eax
+	roll	$4,%eax
+	movl	32(%esi),%ebp
+	movl	%ebx,%edi
+
+	leal	2272392833(%edx,%ebp,1),%edx
+	addl	%ebx,%eax
+	xorl	%ecx,%edi
+	xorl	%eax,%edi
+	movl	44(%esi),%ebp
+	addl	%edi,%edx
+	movl	%eax,%edi
+	roll	$11,%edx
+	addl	%eax,%edx
+
+	xorl	%ebx,%edi
+	xorl	%edx,%edi
+	leal	1839030562(%ecx,%ebp,1),%ecx
+	addl	%edi,%ecx
+	roll	$16,%ecx
+	movl	56(%esi),%ebp
+	movl	%edx,%edi
+
+	leal	4259657740(%ebx,%ebp,1),%ebx
+	addl	%edx,%ecx
+	xorl	%eax,%edi
+	xorl	%ecx,%edi
+	movl	4(%esi),%ebp
+	addl	%edi,%ebx
+	movl	%ecx,%edi
+	roll	$23,%ebx
+	addl	%ecx,%ebx
+
+	xorl	%edx,%edi
+	xorl	%ebx,%edi
+	leal	2763975236(%eax,%ebp,1),%eax
+	addl	%edi,%eax
+	roll	$4,%eax
+	movl	16(%esi),%ebp
+	movl	%ebx,%edi
+
+	leal	1272893353(%edx,%ebp,1),%edx
+	addl	%ebx,%eax
+	xorl	%ecx,%edi
+	xorl	%eax,%edi
+	movl	28(%esi),%ebp
+	addl	%edi,%edx
+	movl	%eax,%edi
+	roll	$11,%edx
+	addl	%eax,%edx
+
+	xorl	%ebx,%edi
+	xorl	%edx,%edi
+	leal	4139469664(%ecx,%ebp,1),%ecx
+	addl	%edi,%ecx
+	roll	$16,%ecx
+	movl	40(%esi),%ebp
+	movl	%edx,%edi
+
+	leal	3200236656(%ebx,%ebp,1),%ebx
+	addl	%edx,%ecx
+	xorl	%eax,%edi
+	xorl	%ecx,%edi
+	movl	52(%esi),%ebp
+	addl	%edi,%ebx
+	movl	%ecx,%edi
+	roll	$23,%ebx
+	addl	%ecx,%ebx
+
+	xorl	%edx,%edi
+	xorl	%ebx,%edi
+	leal	681279174(%eax,%ebp,1),%eax
+	addl	%edi,%eax
+	roll	$4,%eax
+	movl	(%esi),%ebp
+	movl	%ebx,%edi
+
+	leal	3936430074(%edx,%ebp,1),%edx
+	addl	%ebx,%eax
+	xorl	%ecx,%edi
+	xorl	%eax,%edi
+	movl	12(%esi),%ebp
+	addl	%edi,%edx
+	movl	%eax,%edi
+	roll	$11,%edx
+	addl	%eax,%edx
+
+	xorl	%ebx,%edi
+	xorl	%edx,%edi
+	leal	3572445317(%ecx,%ebp,1),%ecx
+	addl	%edi,%ecx
+	roll	$16,%ecx
+	movl	24(%esi),%ebp
+	movl	%edx,%edi
+
+	leal	76029189(%ebx,%ebp,1),%ebx
+	addl	%edx,%ecx
+	xorl	%eax,%edi
+	xorl	%ecx,%edi
+	movl	36(%esi),%ebp
+	addl	%edi,%ebx
+	movl	%ecx,%edi
+	roll	$23,%ebx
+	addl	%ecx,%ebx
+
+	xorl	%edx,%edi
+	xorl	%ebx,%edi
+	leal	3654602809(%eax,%ebp,1),%eax
+	addl	%edi,%eax
+	roll	$4,%eax
+	movl	48(%esi),%ebp
+	movl	%ebx,%edi
+
+	leal	3873151461(%edx,%ebp,1),%edx
+	addl	%ebx,%eax
+	xorl	%ecx,%edi
+	xorl	%eax,%edi
+	movl	60(%esi),%ebp
+	addl	%edi,%edx
+	movl	%eax,%edi
+	roll	$11,%edx
+	addl	%eax,%edx
+
+	xorl	%ebx,%edi
+	xorl	%edx,%edi
+	leal	530742520(%ecx,%ebp,1),%ecx
+	addl	%edi,%ecx
+	roll	$16,%ecx
+	movl	8(%esi),%ebp
+	movl	%edx,%edi
+
+	leal	3299628645(%ebx,%ebp,1),%ebx
+	addl	%edx,%ecx
+	xorl	%eax,%edi
+	xorl	%ecx,%edi
+	movl	(%esi),%ebp
+	addl	%edi,%ebx
+	movl	$-1,%edi
+	roll	$23,%ebx
+	addl	%ecx,%ebx
+
+
+
+	xorl	%edx,%edi
+	orl	%ebx,%edi
+	leal	4096336452(%eax,%ebp,1),%eax
+	xorl	%ecx,%edi
+	movl	28(%esi),%ebp
+	addl	%edi,%eax
+	movl	$-1,%edi
+	roll	$6,%eax
+	xorl	%ecx,%edi
+	addl	%ebx,%eax
+
+	orl	%eax,%edi
+	leal	1126891415(%edx,%ebp,1),%edx
+	xorl	%ebx,%edi
+	movl	56(%esi),%ebp
+	addl	%edi,%edx
+	movl	$-1,%edi
+	roll	$10,%edx
+	xorl	%ebx,%edi
+	addl	%eax,%edx
+
+	orl	%edx,%edi
+	leal	2878612391(%ecx,%ebp,1),%ecx
+	xorl	%eax,%edi
+	movl	20(%esi),%ebp
+	addl	%edi,%ecx
+	movl	$-1,%edi
+	roll	$15,%ecx
+	xorl	%eax,%edi
+	addl	%edx,%ecx
+
+	orl	%ecx,%edi
+	leal	4237533241(%ebx,%ebp,1),%ebx
+	xorl	%edx,%edi
+	movl	48(%esi),%ebp
+	addl	%edi,%ebx
+	movl	$-1,%edi
+	roll	$21,%ebx
+	xorl	%edx,%edi
+	addl	%ecx,%ebx
+
+	orl	%ebx,%edi
+	leal	1700485571(%eax,%ebp,1),%eax
+	xorl	%ecx,%edi
+	movl	12(%esi),%ebp
+	addl	%edi,%eax
+	movl	$-1,%edi
+	roll	$6,%eax
+	xorl	%ecx,%edi
+	addl	%ebx,%eax
+
+	orl	%eax,%edi
+	leal	2399980690(%edx,%ebp,1),%edx
+	xorl	%ebx,%edi
+	movl	40(%esi),%ebp
+	addl	%edi,%edx
+	movl	$-1,%edi
+	roll	$10,%edx
+	xorl	%ebx,%edi
+	addl	%eax,%edx
+
+	orl	%edx,%edi
+	leal	4293915773(%ecx,%ebp,1),%ecx
+	xorl	%eax,%edi
+	movl	4(%esi),%ebp
+	addl	%edi,%ecx
+	movl	$-1,%edi
+	roll	$15,%ecx
+	xorl	%eax,%edi
+	addl	%edx,%ecx
+
+	orl	%ecx,%edi
+	leal	2240044497(%ebx,%ebp,1),%ebx
+	xorl	%edx,%edi
+	movl	32(%esi),%ebp
+	addl	%edi,%ebx
+	movl	$-1,%edi
+	roll	$21,%ebx
+	xorl	%edx,%edi
+	addl	%ecx,%ebx
+
+	orl	%ebx,%edi
+	leal	1873313359(%eax,%ebp,1),%eax
+	xorl	%ecx,%edi
+	movl	60(%esi),%ebp
+	addl	%edi,%eax
+	movl	$-1,%edi
+	roll	$6,%eax
+	xorl	%ecx,%edi
+	addl	%ebx,%eax
+
+	orl	%eax,%edi
+	leal	4264355552(%edx,%ebp,1),%edx
+	xorl	%ebx,%edi
+	movl	24(%esi),%ebp
+	addl	%edi,%edx
+	movl	$-1,%edi
+	roll	$10,%edx
+	xorl	%ebx,%edi
+	addl	%eax,%edx
+
+	orl	%edx,%edi
+	leal	2734768916(%ecx,%ebp,1),%ecx
+	xorl	%eax,%edi
+	movl	52(%esi),%ebp
+	addl	%edi,%ecx
+	movl	$-1,%edi
+	roll	$15,%ecx
+	xorl	%eax,%edi
+	addl	%edx,%ecx
+
+	orl	%ecx,%edi
+	leal	1309151649(%ebx,%ebp,1),%ebx
+	xorl	%edx,%edi
+	movl	16(%esi),%ebp
+	addl	%edi,%ebx
+	movl	$-1,%edi
+	roll	$21,%ebx
+	xorl	%edx,%edi
+	addl	%ecx,%ebx
+
+	orl	%ebx,%edi
+	leal	4149444226(%eax,%ebp,1),%eax
+	xorl	%ecx,%edi
+	movl	44(%esi),%ebp
+	addl	%edi,%eax
+	movl	$-1,%edi
+	roll	$6,%eax
+	xorl	%ecx,%edi
+	addl	%ebx,%eax
+
+	orl	%eax,%edi
+	leal	3174756917(%edx,%ebp,1),%edx
+	xorl	%ebx,%edi
+	movl	8(%esi),%ebp
+	addl	%edi,%edx
+	movl	$-1,%edi
+	roll	$10,%edx
+	xorl	%ebx,%edi
+	addl	%eax,%edx
+
+	orl	%edx,%edi
+	leal	718787259(%ecx,%ebp,1),%ecx
+	xorl	%eax,%edi
+	movl	36(%esi),%ebp
+	addl	%edi,%ecx
+	movl	$-1,%edi
+	roll	$15,%ecx
+	xorl	%eax,%edi
+	addl	%edx,%ecx
+
+	orl	%ecx,%edi
+	leal	3951481745(%ebx,%ebp,1),%ebx
+	xorl	%edx,%edi
+	movl	24(%esp),%ebp
+	addl	%edi,%ebx
+	addl	$64,%esi
+	roll	$21,%ebx
+	movl	(%ebp),%edi
+	addl	%ecx,%ebx
+	addl	%edi,%eax
+	movl	4(%ebp),%edi
+	addl	%edi,%ebx
+	movl	8(%ebp),%edi
+	addl	%edi,%ecx
+	movl	12(%ebp),%edi
+	addl	%edi,%edx
+	movl	%eax,(%ebp)
+	movl	%ebx,4(%ebp)
+	movl	(%esp),%edi
+	movl	%ecx,8(%ebp)
+	movl	%edx,12(%ebp)
+	cmpl	%esi,%edi
+	jae	.L000start
+	popl	%eax
+	popl	%ebx
+	popl	%ebp
+	popl	%edi
+	popl	%esi
+	ret
+.size	md5_block_asm_data_order,.-.L_md5_block_asm_data_order_begin
+#endif
+.section	.note.GNU-stack,"",@progbits
--- /dev/null
+++ b/third_party/boringssl/linux-x86/crypto/fipsmodule/sha1-586.S
@@ -1,0 +1,3808 @@
+// This file is generated from a similarly-named Perl script in the BoringSSL
+// source tree. Do not edit by hand.
+
+#if defined(__i386__)
+#if defined(BORINGSSL_PREFIX)
+#include <boringssl_prefix_symbols_asm.h>
+#endif
+.text
+.globl	sha1_block_data_order
+.hidden	sha1_block_data_order
+.type	sha1_block_data_order,@function
+.align	16
+sha1_block_data_order:
+.L_sha1_block_data_order_begin:
+	pushl	%ebp
+	pushl	%ebx
+	pushl	%esi
+	pushl	%edi
+	call	.L000pic_point
+.L000pic_point:
+	popl	%ebp
+	leal	OPENSSL_ia32cap_P-.L000pic_point(%ebp),%esi
+	leal	.LK_XX_XX-.L000pic_point(%ebp),%ebp
+	movl	(%esi),%eax
+	movl	4(%esi),%edx
+	testl	$512,%edx
+	jz	.L001x86
+	movl	8(%esi),%ecx
+	testl	$16777216,%eax
+	jz	.L001x86
+	andl	$268435456,%edx
+	andl	$1073741824,%eax
+	orl	%edx,%eax
+	cmpl	$1342177280,%eax
+	je	.Lavx_shortcut
+	jmp	.Lssse3_shortcut
+.align	16
+.L001x86:
+	movl	20(%esp),%ebp
+	movl	24(%esp),%esi
+	movl	28(%esp),%eax
+	subl	$76,%esp
+	shll	$6,%eax
+	addl	%esi,%eax
+	movl	%eax,104(%esp)
+	movl	16(%ebp),%edi
+	jmp	.L002loop
+.align	16
+.L002loop:
+	movl	(%esi),%eax
+	movl	4(%esi),%ebx
+	movl	8(%esi),%ecx
+	movl	12(%esi),%edx
+	bswap	%eax
+	bswap	%ebx
+	bswap	%ecx
+	bswap	%edx
+	movl	%eax,(%esp)
+	movl	%ebx,4(%esp)
+	movl	%ecx,8(%esp)
+	movl	%edx,12(%esp)
+	movl	16(%esi),%eax
+	movl	20(%esi),%ebx
+	movl	24(%esi),%ecx
+	movl	28(%esi),%edx
+	bswap	%eax
+	bswap	%ebx
+	bswap	%ecx
+	bswap	%edx
+	movl	%eax,16(%esp)
+	movl	%ebx,20(%esp)
+	movl	%ecx,24(%esp)
+	movl	%edx,28(%esp)
+	movl	32(%esi),%eax
+	movl	36(%esi),%ebx
+	movl	40(%esi),%ecx
+	movl	44(%esi),%edx
+	bswap	%eax
+	bswap	%ebx
+	bswap	%ecx
+	bswap	%edx
+	movl	%eax,32(%esp)
+	movl	%ebx,36(%esp)
+	movl	%ecx,40(%esp)
+	movl	%edx,44(%esp)
+	movl	48(%esi),%eax
+	movl	52(%esi),%ebx
+	movl	56(%esi),%ecx
+	movl	60(%esi),%edx
+	bswap	%eax
+	bswap	%ebx
+	bswap	%ecx
+	bswap	%edx
+	movl	%eax,48(%esp)
+	movl	%ebx,52(%esp)
+	movl	%ecx,56(%esp)
+	movl	%edx,60(%esp)
+	movl	%esi,100(%esp)
+	movl	(%ebp),%eax
+	movl	4(%ebp),%ebx
+	movl	8(%ebp),%ecx
+	movl	12(%ebp),%edx
+
+	movl	%ecx,%esi
+	movl	%eax,%ebp
+	roll	$5,%ebp
+	xorl	%edx,%esi
+	addl	%edi,%ebp
+	movl	(%esp),%edi
+	andl	%ebx,%esi
+	rorl	$2,%ebx
+	xorl	%edx,%esi
+	leal	1518500249(%ebp,%edi,1),%ebp
+	addl	%esi,%ebp
+
+	movl	%ebx,%edi
+	movl	%ebp,%esi
+	roll	$5,%ebp
+	xorl	%ecx,%edi
+	addl	%edx,%ebp
+	movl	4(%esp),%edx
+	andl	%eax,%edi
+	rorl	$2,%eax
+	xorl	%ecx,%edi
+	leal	1518500249(%ebp,%edx,1),%ebp
+	addl	%edi,%ebp
+
+	movl	%eax,%edx
+	movl	%ebp,%edi
+	roll	$5,%ebp
+	xorl	%ebx,%edx
+	addl	%ecx,%ebp
+	movl	8(%esp),%ecx
+	andl	%esi,%edx
+	rorl	$2,%esi
+	xorl	%ebx,%edx
+	leal	1518500249(%ebp,%ecx,1),%ebp
+	addl	%edx,%ebp
+
+	movl	%esi,%ecx
+	movl	%ebp,%edx
+	roll	$5,%ebp
+	xorl	%eax,%ecx
+	addl	%ebx,%ebp
+	movl	12(%esp),%ebx
+	andl	%edi,%ecx
+	rorl	$2,%edi
+	xorl	%eax,%ecx
+	leal	1518500249(%ebp,%ebx,1),%ebp
+	addl	%ecx,%ebp
+
+	movl	%edi,%ebx
+	movl	%ebp,%ecx
+	roll	$5,%ebp
+	xorl	%esi,%ebx
+	addl	%eax,%ebp
+	movl	16(%esp),%eax
+	andl	%edx,%ebx
+	rorl	$2,%edx
+	xorl	%esi,%ebx
+	leal	1518500249(%ebp,%eax,1),%ebp
+	addl	%ebx,%ebp
+
+	movl	%edx,%eax
+	movl	%ebp,%ebx
+	roll	$5,%ebp
+	xorl	%edi,%eax
+	addl	%esi,%ebp
+	movl	20(%esp),%esi
+	andl	%ecx,%eax
+	rorl	$2,%ecx
+	xorl	%edi,%eax
+	leal	1518500249(%ebp,%esi,1),%ebp
+	addl	%eax,%ebp
+
+	movl	%ecx,%esi
+	movl	%ebp,%eax
+	roll	$5,%ebp
+	xorl	%edx,%esi
+	addl	%edi,%ebp
+	movl	24(%esp),%edi
+	andl	%ebx,%esi
+	rorl	$2,%ebx
+	xorl	%edx,%esi
+	leal	1518500249(%ebp,%edi,1),%ebp
+	addl	%esi,%ebp
+
+	movl	%ebx,%edi
+	movl	%ebp,%esi
+	roll	$5,%ebp
+	xorl	%ecx,%edi
+	addl	%edx,%ebp
+	movl	28(%esp),%edx
+	andl	%eax,%edi
+	rorl	$2,%eax
+	xorl	%ecx,%edi
+	leal	1518500249(%ebp,%edx,1),%ebp
+	addl	%edi,%ebp
+
+	movl	%eax,%edx
+	movl	%ebp,%edi
+	roll	$5,%ebp
+	xorl	%ebx,%edx
+	addl	%ecx,%ebp
+	movl	32(%esp),%ecx
+	andl	%esi,%edx
+	rorl	$2,%esi
+	xorl	%ebx,%edx
+	leal	1518500249(%ebp,%ecx,1),%ebp
+	addl	%edx,%ebp
+
+	movl	%esi,%ecx
+	movl	%ebp,%edx
+	roll	$5,%ebp
+	xorl	%eax,%ecx
+	addl	%ebx,%ebp
+	movl	36(%esp),%ebx
+	andl	%edi,%ecx
+	rorl	$2,%edi
+	xorl	%eax,%ecx
+	leal	1518500249(%ebp,%ebx,1),%ebp
+	addl	%ecx,%ebp
+
+	movl	%edi,%ebx
+	movl	%ebp,%ecx
+	roll	$5,%ebp
+	xorl	%esi,%ebx
+	addl	%eax,%ebp
+	movl	40(%esp),%eax
+	andl	%edx,%ebx
+	rorl	$2,%edx
+	xorl	%esi,%ebx
+	leal	1518500249(%ebp,%eax,1),%ebp
+	addl	%ebx,%ebp
+
+	movl	%edx,%eax
+	movl	%ebp,%ebx
+	roll	$5,%ebp
+	xorl	%edi,%eax
+	addl	%esi,%ebp
+	movl	44(%esp),%esi
+	andl	%ecx,%eax
+	rorl	$2,%ecx
+	xorl	%edi,%eax
+	leal	1518500249(%ebp,%esi,1),%ebp
+	addl	%eax,%ebp
+
+	movl	%ecx,%esi
+	movl	%ebp,%eax
+	roll	$5,%ebp
+	xorl	%edx,%esi
+	addl	%edi,%ebp
+	movl	48(%esp),%edi
+	andl	%ebx,%esi
+	rorl	$2,%ebx
+	xorl	%edx,%esi
+	leal	1518500249(%ebp,%edi,1),%ebp
+	addl	%esi,%ebp
+
+	movl	%ebx,%edi
+	movl	%ebp,%esi
+	roll	$5,%ebp
+	xorl	%ecx,%edi
+	addl	%edx,%ebp
+	movl	52(%esp),%edx
+	andl	%eax,%edi
+	rorl	$2,%eax
+	xorl	%ecx,%edi
+	leal	1518500249(%ebp,%edx,1),%ebp
+	addl	%edi,%ebp
+
+	movl	%eax,%edx
+	movl	%ebp,%edi
+	roll	$5,%ebp
+	xorl	%ebx,%edx
+	addl	%ecx,%ebp
+	movl	56(%esp),%ecx
+	andl	%esi,%edx
+	rorl	$2,%esi
+	xorl	%ebx,%edx
+	leal	1518500249(%ebp,%ecx,1),%ebp
+	addl	%edx,%ebp
+
+	movl	%esi,%ecx
+	movl	%ebp,%edx
+	roll	$5,%ebp
+	xorl	%eax,%ecx
+	addl	%ebx,%ebp
+	movl	60(%esp),%ebx
+	andl	%edi,%ecx
+	rorl	$2,%edi
+	xorl	%eax,%ecx
+	leal	1518500249(%ebp,%ebx,1),%ebp
+	movl	(%esp),%ebx
+	addl	%ebp,%ecx
+
+	movl	%edi,%ebp
+	xorl	8(%esp),%ebx
+	xorl	%esi,%ebp
+	xorl	32(%esp),%ebx
+	andl	%edx,%ebp
+	xorl	52(%esp),%ebx
+	roll	$1,%ebx
+	xorl	%esi,%ebp
+	addl	%ebp,%eax
+	movl	%ecx,%ebp
+	rorl	$2,%edx
+	movl	%ebx,(%esp)
+	roll	$5,%ebp
+	leal	1518500249(%ebx,%eax,1),%ebx
+	movl	4(%esp),%eax
+	addl	%ebp,%ebx
+
+	movl	%edx,%ebp
+	xorl	12(%esp),%eax
+	xorl	%edi,%ebp
+	xorl	36(%esp),%eax
+	andl	%ecx,%ebp
+	xorl	56(%esp),%eax
+	roll	$1,%eax
+	xorl	%edi,%ebp
+	addl	%ebp,%esi
+	movl	%ebx,%ebp
+	rorl	$2,%ecx
+	movl	%eax,4(%esp)
+	roll	$5,%ebp
+	leal	1518500249(%eax,%esi,1),%eax
+	movl	8(%esp),%esi
+	addl	%ebp,%eax
+
+	movl	%ecx,%ebp
+	xorl	16(%esp),%esi
+	xorl	%edx,%ebp
+	xorl	40(%esp),%esi
+	andl	%ebx,%ebp
+	xorl	60(%esp),%esi
+	roll	$1,%esi
+	xorl	%edx,%ebp
+	addl	%ebp,%edi
+	movl	%eax,%ebp
+	rorl	$2,%ebx
+	movl	%esi,8(%esp)
+	roll	$5,%ebp
+	leal	1518500249(%esi,%edi,1),%esi
+	movl	12(%esp),%edi
+	addl	%ebp,%esi
+
+	movl	%ebx,%ebp
+	xorl	20(%esp),%edi
+	xorl	%ecx,%ebp
+	xorl	44(%esp),%edi
+	andl	%eax,%ebp
+	xorl	(%esp),%edi
+	roll	$1,%edi
+	xorl	%ecx,%ebp
+	addl	%ebp,%edx
+	movl	%esi,%ebp
+	rorl	$2,%eax
+	movl	%edi,12(%esp)
+	roll	$5,%ebp
+	leal	1518500249(%edi,%edx,1),%edi
+	movl	16(%esp),%edx
+	addl	%ebp,%edi
+
+	movl	%esi,%ebp
+	xorl	24(%esp),%edx
+	xorl	%eax,%ebp
+	xorl	48(%esp),%edx
+	xorl	%ebx,%ebp
+	xorl	4(%esp),%edx
+	roll	$1,%edx
+	addl	%ebp,%ecx
+	rorl	$2,%esi
+	movl	%edi,%ebp
+	roll	$5,%ebp
+	movl	%edx,16(%esp)
+	leal	1859775393(%edx,%ecx,1),%edx
+	movl	20(%esp),%ecx
+	addl	%ebp,%edx
+
+	movl	%edi,%ebp
+	xorl	28(%esp),%ecx
+	xorl	%esi,%ebp
+	xorl	52(%esp),%ecx
+	xorl	%eax,%ebp
+	xorl	8(%esp),%ecx
+	roll	$1,%ecx
+	addl	%ebp,%ebx
+	rorl	$2,%edi
+	movl	%edx,%ebp
+	roll	$5,%ebp
+	movl	%ecx,20(%esp)
+	leal	1859775393(%ecx,%ebx,1),%ecx
+	movl	24(%esp),%ebx
+	addl	%ebp,%ecx
+
+	movl	%edx,%ebp
+	xorl	32(%esp),%ebx
+	xorl	%edi,%ebp
+	xorl	56(%esp),%ebx
+	xorl	%esi,%ebp
+	xorl	12(%esp),%ebx
+	roll	$1,%ebx
+	addl	%ebp,%eax
+	rorl	$2,%edx
+	movl	%ecx,%ebp
+	roll	$5,%ebp
+	movl	%ebx,24(%esp)
+	leal	1859775393(%ebx,%eax,1),%ebx
+	movl	28(%esp),%eax
+	addl	%ebp,%ebx
+
+	movl	%ecx,%ebp
+	xorl	36(%esp),%eax
+	xorl	%edx,%ebp
+	xorl	60(%esp),%eax
+	xorl	%edi,%ebp
+	xorl	16(%esp),%eax
+	roll	$1,%eax
+	addl	%ebp,%esi
+	rorl	$2,%ecx
+	movl	%ebx,%ebp
+	roll	$5,%ebp
+	movl	%eax,28(%esp)
+	leal	1859775393(%eax,%esi,1),%eax
+	movl	32(%esp),%esi
+	addl	%ebp,%eax
+
+	movl	%ebx,%ebp
+	xorl	40(%esp),%esi
+	xorl	%ecx,%ebp
+	xorl	(%esp),%esi
+	xorl	%edx,%ebp
+	xorl	20(%esp),%esi
+	roll	$1,%esi
+	addl	%ebp,%edi
+	rorl	$2,%ebx
+	movl	%eax,%ebp
+	roll	$5,%ebp
+	movl	%esi,32(%esp)
+	leal	1859775393(%esi,%edi,1),%esi
+	movl	36(%esp),%edi
+	addl	%ebp,%esi
+
+	movl	%eax,%ebp
+	xorl	44(%esp),%edi
+	xorl	%ebx,%ebp
+	xorl	4(%esp),%edi
+	xorl	%ecx,%ebp
+	xorl	24(%esp),%edi
+	roll	$1,%edi
+	addl	%ebp,%edx
+	rorl	$2,%eax
+	movl	%esi,%ebp
+	roll	$5,%ebp
+	movl	%edi,36(%esp)
+	leal	1859775393(%edi,%edx,1),%edi
+	movl	40(%esp),%edx
+	addl	%ebp,%edi
+
+	movl	%esi,%ebp
+	xorl	48(%esp),%edx
+	xorl	%eax,%ebp
+	xorl	8(%esp),%edx
+	xorl	%ebx,%ebp
+	xorl	28(%esp),%edx
+	roll	$1,%edx
+	addl	%ebp,%ecx
+	rorl	$2,%esi
+	movl	%edi,%ebp
+	roll	$5,%ebp
+	movl	%edx,40(%esp)
+	leal	1859775393(%edx,%ecx,1),%edx
+	movl	44(%esp),%ecx
+	addl	%ebp,%edx
+
+	movl	%edi,%ebp
+	xorl	52(%esp),%ecx
+	xorl	%esi,%ebp
+	xorl	12(%esp),%ecx
+	xorl	%eax,%ebp
+	xorl	32(%esp),%ecx
+	roll	$1,%ecx
+	addl	%ebp,%ebx
+	rorl	$2,%edi
+	movl	%edx,%ebp
+	roll	$5,%ebp
+	movl	%ecx,44(%esp)
+	leal	1859775393(%ecx,%ebx,1),%ecx
+	movl	48(%esp),%ebx
+	addl	%ebp,%ecx
+
+	movl	%edx,%ebp
+	xorl	56(%esp),%ebx
+	xorl	%edi,%ebp
+	xorl	16(%esp),%ebx
+	xorl	%esi,%ebp
+	xorl	36(%esp),%ebx
+	roll	$1,%ebx
+	addl	%ebp,%eax
+	rorl	$2,%edx
+	movl	%ecx,%ebp
+	roll	$5,%ebp
+	movl	%ebx,48(%esp)
+	leal	1859775393(%ebx,%eax,1),%ebx
+	movl	52(%esp),%eax
+	addl	%ebp,%ebx
+
+	movl	%ecx,%ebp
+	xorl	60(%esp),%eax
+	xorl	%edx,%ebp
+	xorl	20(%esp),%eax
+	xorl	%edi,%ebp
+	xorl	40(%esp),%eax
+	roll	$1,%eax
+	addl	%ebp,%esi
+	rorl	$2,%ecx
+	movl	%ebx,%ebp
+	roll	$5,%ebp
+	movl	%eax,52(%esp)
+	leal	1859775393(%eax,%esi,1),%eax
+	movl	56(%esp),%esi
+	addl	%ebp,%eax
+
+	movl	%ebx,%ebp
+	xorl	(%esp),%esi
+	xorl	%ecx,%ebp
+	xorl	24(%esp),%esi
+	xorl	%edx,%ebp
+	xorl	44(%esp),%esi
+	roll	$1,%esi
+	addl	%ebp,%edi
+	rorl	$2,%ebx
+	movl	%eax,%ebp
+	roll	$5,%ebp
+	movl	%esi,56(%esp)
+	leal	1859775393(%esi,%edi,1),%esi
+	movl	60(%esp),%edi
+	addl	%ebp,%esi
+
+	movl	%eax,%ebp
+	xorl	4(%esp),%edi
+	xorl	%ebx,%ebp
+	xorl	28(%esp),%edi
+	xorl	%ecx,%ebp
+	xorl	48(%esp),%edi
+	roll	$1,%edi
+	addl	%ebp,%edx
+	rorl	$2,%eax
+	movl	%esi,%ebp
+	roll	$5,%ebp
+	movl	%edi,60(%esp)
+	leal	1859775393(%edi,%edx,1),%edi
+	movl	(%esp),%edx
+	addl	%ebp,%edi
+
+	movl	%esi,%ebp
+	xorl	8(%esp),%edx
+	xorl	%eax,%ebp
+	xorl	32(%esp),%edx
+	xorl	%ebx,%ebp
+	xorl	52(%esp),%edx
+	roll	$1,%edx
+	addl	%ebp,%ecx
+	rorl	$2,%esi
+	movl	%edi,%ebp
+	roll	$5,%ebp
+	movl	%edx,(%esp)
+	leal	1859775393(%edx,%ecx,1),%edx
+	movl	4(%esp),%ecx
+	addl	%ebp,%edx
+
+	movl	%edi,%ebp
+	xorl	12(%esp),%ecx
+	xorl	%esi,%ebp
+	xorl	36(%esp),%ecx
+	xorl	%eax,%ebp
+	xorl	56(%esp),%ecx
+	roll	$1,%ecx
+	addl	%ebp,%ebx
+	rorl	$2,%edi
+	movl	%edx,%ebp
+	roll	$5,%ebp
+	movl	%ecx,4(%esp)
+	leal	1859775393(%ecx,%ebx,1),%ecx
+	movl	8(%esp),%ebx
+	addl	%ebp,%ecx
+
+	movl	%edx,%ebp
+	xorl	16(%esp),%ebx
+	xorl	%edi,%ebp
+	xorl	40(%esp),%ebx
+	xorl	%esi,%ebp
+	xorl	60(%esp),%ebx
+	roll	$1,%ebx
+	addl	%ebp,%eax
+	rorl	$2,%edx
+	movl	%ecx,%ebp
+	roll	$5,%ebp
+	movl	%ebx,8(%esp)
+	leal	1859775393(%ebx,%eax,1),%ebx
+	movl	12(%esp),%eax
+	addl	%ebp,%ebx
+
+	movl	%ecx,%ebp
+	xorl	20(%esp),%eax
+	xorl	%edx,%ebp
+	xorl	44(%esp),%eax
+	xorl	%edi,%ebp
+	xorl	(%esp),%eax
+	roll	$1,%eax
+	addl	%ebp,%esi
+	rorl	$2,%ecx
+	movl	%ebx,%ebp
+	roll	$5,%ebp
+	movl	%eax,12(%esp)
+	leal	1859775393(%eax,%esi,1),%eax
+	movl	16(%esp),%esi
+	addl	%ebp,%eax
+
+	movl	%ebx,%ebp
+	xorl	24(%esp),%esi
+	xorl	%ecx,%ebp
+	xorl	48(%esp),%esi
+	xorl	%edx,%ebp
+	xorl	4(%esp),%esi
+	roll	$1,%esi
+	addl	%ebp,%edi
+	rorl	$2,%ebx
+	movl	%eax,%ebp
+	roll	$5,%ebp
+	movl	%esi,16(%esp)
+	leal	1859775393(%esi,%edi,1),%esi
+	movl	20(%esp),%edi
+	addl	%ebp,%esi
+
+	movl	%eax,%ebp
+	xorl	28(%esp),%edi
+	xorl	%ebx,%ebp
+	xorl	52(%esp),%edi
+	xorl	%ecx,%ebp
+	xorl	8(%esp),%edi
+	roll	$1,%edi
+	addl	%ebp,%edx
+	rorl	$2,%eax
+	movl	%esi,%ebp
+	roll	$5,%ebp
+	movl	%edi,20(%esp)
+	leal	1859775393(%edi,%edx,1),%edi
+	movl	24(%esp),%edx
+	addl	%ebp,%edi
+
+	movl	%esi,%ebp
+	xorl	32(%esp),%edx
+	xorl	%eax,%ebp
+	xorl	56(%esp),%edx
+	xorl	%ebx,%ebp
+	xorl	12(%esp),%edx
+	roll	$1,%edx
+	addl	%ebp,%ecx
+	rorl	$2,%esi
+	movl	%edi,%ebp
+	roll	$5,%ebp
+	movl	%edx,24(%esp)
+	leal	1859775393(%edx,%ecx,1),%edx
+	movl	28(%esp),%ecx
+	addl	%ebp,%edx
+
+	movl	%edi,%ebp
+	xorl	36(%esp),%ecx
+	xorl	%esi,%ebp
+	xorl	60(%esp),%ecx
+	xorl	%eax,%ebp
+	xorl	16(%esp),%ecx
+	roll	$1,%ecx
+	addl	%ebp,%ebx
+	rorl	$2,%edi
+	movl	%edx,%ebp
+	roll	$5,%ebp
+	movl	%ecx,28(%esp)
+	leal	1859775393(%ecx,%ebx,1),%ecx
+	movl	32(%esp),%ebx
+	addl	%ebp,%ecx
+
+	movl	%edi,%ebp
+	xorl	40(%esp),%ebx
+	xorl	%esi,%ebp
+	xorl	(%esp),%ebx
+	andl	%edx,%ebp
+	xorl	20(%esp),%ebx
+	roll	$1,%ebx
+	addl	%eax,%ebp
+	rorl	$2,%edx
+	movl	%ecx,%eax
+	roll	$5,%eax
+	movl	%ebx,32(%esp)
+	leal	2400959708(%ebx,%ebp,1),%ebx
+	movl	%edi,%ebp
+	addl	%eax,%ebx
+	andl	%esi,%ebp
+	movl	36(%esp),%eax
+	addl	%ebp,%ebx
+
+	movl	%edx,%ebp
+	xorl	44(%esp),%eax
+	xorl	%edi,%ebp
+	xorl	4(%esp),%eax
+	andl	%ecx,%ebp
+	xorl	24(%esp),%eax
+	roll	$1,%eax
+	addl	%esi,%ebp
+	rorl	$2,%ecx
+	movl	%ebx,%esi
+	roll	$5,%esi
+	movl	%eax,36(%esp)
+	leal	2400959708(%eax,%ebp,1),%eax
+	movl	%edx,%ebp
+	addl	%esi,%eax
+	andl	%edi,%ebp
+	movl	40(%esp),%esi
+	addl	%ebp,%eax
+
+	movl	%ecx,%ebp
+	xorl	48(%esp),%esi
+	xorl	%edx,%ebp
+	xorl	8(%esp),%esi
+	andl	%ebx,%ebp
+	xorl	28(%esp),%esi
+	roll	$1,%esi
+	addl	%edi,%ebp
+	rorl	$2,%ebx
+	movl	%eax,%edi
+	roll	$5,%edi
+	movl	%esi,40(%esp)
+	leal	2400959708(%esi,%ebp,1),%esi
+	movl	%ecx,%ebp
+	addl	%edi,%esi
+	andl	%edx,%ebp
+	movl	44(%esp),%edi
+	addl	%ebp,%esi
+
+	movl	%ebx,%ebp
+	xorl	52(%esp),%edi
+	xorl	%ecx,%ebp
+	xorl	12(%esp),%edi
+	andl	%eax,%ebp
+	xorl	32(%esp),%edi
+	roll	$1,%edi
+	addl	%edx,%ebp
+	rorl	$2,%eax
+	movl	%esi,%edx
+	roll	$5,%edx
+	movl	%edi,44(%esp)
+	leal	2400959708(%edi,%ebp,1),%edi
+	movl	%ebx,%ebp
+	addl	%edx,%edi
+	andl	%ecx,%ebp
+	movl	48(%esp),%edx
+	addl	%ebp,%edi
+
+	movl	%eax,%ebp
+	xorl	56(%esp),%edx
+	xorl	%ebx,%ebp
+	xorl	16(%esp),%edx
+	andl	%esi,%ebp
+	xorl	36(%esp),%edx
+	roll	$1,%edx
+	addl	%ecx,%ebp
+	rorl	$2,%esi
+	movl	%edi,%ecx
+	roll	$5,%ecx
+	movl	%edx,48(%esp)
+	leal	2400959708(%edx,%ebp,1),%edx
+	movl	%eax,%ebp
+	addl	%ecx,%edx
+	andl	%ebx,%ebp
+	movl	52(%esp),%ecx
+	addl	%ebp,%edx
+
+	movl	%esi,%ebp
+	xorl	60(%esp),%ecx
+	xorl	%eax,%ebp
+	xorl	20(%esp),%ecx
+	andl	%edi,%ebp
+	xorl	40(%esp),%ecx
+	roll	$1,%ecx
+	addl	%ebx,%ebp
+	rorl	$2,%edi
+	movl	%edx,%ebx
+	roll	$5,%ebx
+	movl	%ecx,52(%esp)
+	leal	2400959708(%ecx,%ebp,1),%ecx
+	movl	%esi,%ebp
+	addl	%ebx,%ecx
+	andl	%eax,%ebp
+	movl	56(%esp),%ebx
+	addl	%ebp,%ecx
+
+	movl	%edi,%ebp
+	xorl	(%esp),%ebx
+	xorl	%esi,%ebp
+	xorl	24(%esp),%ebx
+	andl	%edx,%ebp
+	xorl	44(%esp),%ebx
+	roll	$1,%ebx
+	addl	%eax,%ebp
+	rorl	$2,%edx
+	movl	%ecx,%eax
+	roll	$5,%eax
+	movl	%ebx,56(%esp)
+	leal	2400959708(%ebx,%ebp,1),%ebx
+	movl	%edi,%ebp
+	addl	%eax,%ebx
+	andl	%esi,%ebp
+	movl	60(%esp),%eax
+	addl	%ebp,%ebx
+
+	movl	%edx,%ebp
+	xorl	4(%esp),%eax
+	xorl	%edi,%ebp
+	xorl	28(%esp),%eax
+	andl	%ecx,%ebp
+	xorl	48(%esp),%eax
+	roll	$1,%eax
+	addl	%esi,%ebp
+	rorl	$2,%ecx
+	movl	%ebx,%esi
+	roll	$5,%esi
+	movl	%eax,60(%esp)
+	leal	2400959708(%eax,%ebp,1),%eax
+	movl	%edx,%ebp
+	addl	%esi,%eax
+	andl	%edi,%ebp
+	movl	(%esp),%esi
+	addl	%ebp,%eax
+
+	movl	%ecx,%ebp
+	xorl	8(%esp),%esi
+	xorl	%edx,%ebp
+	xorl	32(%esp),%esi
+	andl	%ebx,%ebp
+	xorl	52(%esp),%esi
+	roll	$1,%esi
+	addl	%edi,%ebp
+	rorl	$2,%ebx
+	movl	%eax,%edi
+	roll	$5,%edi
+	movl	%esi,(%esp)
+	leal	2400959708(%esi,%ebp,1),%esi
+	movl	%ecx,%ebp
+	addl	%edi,%esi
+	andl	%edx,%ebp
+	movl	4(%esp),%edi
+	addl	%ebp,%esi
+
+	movl	%ebx,%ebp
+	xorl	12(%esp),%edi
+	xorl	%ecx,%ebp
+	xorl	36(%esp),%edi
+	andl	%eax,%ebp
+	xorl	56(%esp),%edi
+	roll	$1,%edi
+	addl	%edx,%ebp
+	rorl	$2,%eax
+	movl	%esi,%edx
+	roll	$5,%edx
+	movl	%edi,4(%esp)
+	leal	2400959708(%edi,%ebp,1),%edi
+	movl	%ebx,%ebp
+	addl	%edx,%edi
+	andl	%ecx,%ebp
+	movl	8(%esp),%edx
+	addl	%ebp,%edi
+
+	movl	%eax,%ebp
+	xorl	16(%esp),%edx
+	xorl	%ebx,%ebp
+	xorl	40(%esp),%edx
+	andl	%esi,%ebp
+	xorl	60(%esp),%edx
+	roll	$1,%edx
+	addl	%ecx,%ebp
+	rorl	$2,%esi
+	movl	%edi,%ecx
+	roll	$5,%ecx
+	movl	%edx,8(%esp)
+	leal	2400959708(%edx,%ebp,1),%edx
+	movl	%eax,%ebp
+	addl	%ecx,%edx
+	andl	%ebx,%ebp
+	movl	12(%esp),%ecx
+	addl	%ebp,%edx
+
+	movl	%esi,%ebp
+	xorl	20(%esp),%ecx
+	xorl	%eax,%ebp
+	xorl	44(%esp),%ecx
+	andl	%edi,%ebp
+	xorl	(%esp),%ecx
+	roll	$1,%ecx
+	addl	%ebx,%ebp
+	rorl	$2,%edi
+	movl	%edx,%ebx
+	roll	$5,%ebx
+	movl	%ecx,12(%esp)
+	leal	2400959708(%ecx,%ebp,1),%ecx
+	movl	%esi,%ebp
+	addl	%ebx,%ecx
+	andl	%eax,%ebp
+	movl	16(%esp),%ebx
+	addl	%ebp,%ecx
+
+	movl	%edi,%ebp
+	xorl	24(%esp),%ebx
+	xorl	%esi,%ebp
+	xorl	48(%esp),%ebx
+	andl	%edx,%ebp
+	xorl	4(%esp),%ebx
+	roll	$1,%ebx
+	addl	%eax,%ebp
+	rorl	$2,%edx
+	movl	%ecx,%eax
+	roll	$5,%eax
+	movl	%ebx,16(%esp)
+	leal	2400959708(%ebx,%ebp,1),%ebx
+	movl	%edi,%ebp
+	addl	%eax,%ebx
+	andl	%esi,%ebp
+	movl	20(%esp),%eax
+	addl	%ebp,%ebx
+
+	movl	%edx,%ebp
+	xorl	28(%esp),%eax
+	xorl	%edi,%ebp
+	xorl	52(%esp),%eax
+	andl	%ecx,%ebp
+	xorl	8(%esp),%eax
+	roll	$1,%eax
+	addl	%esi,%ebp
+	rorl	$2,%ecx
+	movl	%ebx,%esi
+	roll	$5,%esi
+	movl	%eax,20(%esp)
+	leal	2400959708(%eax,%ebp,1),%eax
+	movl	%edx,%ebp
+	addl	%esi,%eax
+	andl	%edi,%ebp
+	movl	24(%esp),%esi
+	addl	%ebp,%eax
+
+	movl	%ecx,%ebp
+	xorl	32(%esp),%esi
+	xorl	%edx,%ebp
+	xorl	56(%esp),%esi
+	andl	%ebx,%ebp
+	xorl	12(%esp),%esi
+	roll	$1,%esi
+	addl	%edi,%ebp
+	rorl	$2,%ebx
+	movl	%eax,%edi
+	roll	$5,%edi
+	movl	%esi,24(%esp)
+	leal	2400959708(%esi,%ebp,1),%esi
+	movl	%ecx,%ebp
+	addl	%edi,%esi
+	andl	%edx,%ebp
+	movl	28(%esp),%edi
+	addl	%ebp,%esi
+
+	movl	%ebx,%ebp
+	xorl	36(%esp),%edi
+	xorl	%ecx,%ebp
+	xorl	60(%esp),%edi
+	andl	%eax,%ebp
+	xorl	16(%esp),%edi
+	roll	$1,%edi
+	addl	%edx,%ebp
+	rorl	$2,%eax
+	movl	%esi,%edx
+	roll	$5,%edx
+	movl	%edi,28(%esp)
+	leal	2400959708(%edi,%ebp,1),%edi
+	movl	%ebx,%ebp
+	addl	%edx,%edi
+	andl	%ecx,%ebp
+	movl	32(%esp),%edx
+	addl	%ebp,%edi
+
+	movl	%eax,%ebp
+	xorl	40(%esp),%edx
+	xorl	%ebx,%ebp
+	xorl	(%esp),%edx
+	andl	%esi,%ebp
+	xorl	20(%esp),%edx
+	roll	$1,%edx
+	addl	%ecx,%ebp
+	rorl	$2,%esi
+	movl	%edi,%ecx
+	roll	$5,%ecx
+	movl	%edx,32(%esp)
+	leal	2400959708(%edx,%ebp,1),%edx
+	movl	%eax,%ebp
+	addl	%ecx,%edx
+	andl	%ebx,%ebp
+	movl	36(%esp),%ecx
+	addl	%ebp,%edx
+
+	movl	%esi,%ebp
+	xorl	44(%esp),%ecx
+	xorl	%eax,%ebp
+	xorl	4(%esp),%ecx
+	andl	%edi,%ebp
+	xorl	24(%esp),%ecx
+	roll	$1,%ecx
+	addl	%ebx,%ebp
+	rorl	$2,%edi
+	movl	%edx,%ebx
+	roll	$5,%ebx
+	movl	%ecx,36(%esp)
+	leal	2400959708(%ecx,%ebp,1),%ecx
+	movl	%esi,%ebp
+	addl	%ebx,%ecx
+	andl	%eax,%ebp
+	movl	40(%esp),%ebx
+	addl	%ebp,%ecx
+
+	movl	%edi,%ebp
+	xorl	48(%esp),%ebx
+	xorl	%esi,%ebp
+	xorl	8(%esp),%ebx
+	andl	%edx,%ebp
+	xorl	28(%esp),%ebx
+	roll	$1,%ebx
+	addl	%eax,%ebp
+	rorl	$2,%edx
+	movl	%ecx,%eax
+	roll	$5,%eax
+	movl	%ebx,40(%esp)
+	leal	2400959708(%ebx,%ebp,1),%ebx
+	movl	%edi,%ebp
+	addl	%eax,%ebx
+	andl	%esi,%ebp
+	movl	44(%esp),%eax
+	addl	%ebp,%ebx
+
+	movl	%edx,%ebp
+	xorl	52(%esp),%eax
+	xorl	%edi,%ebp
+	xorl	12(%esp),%eax
+	andl	%ecx,%ebp
+	xorl	32(%esp),%eax
+	roll	$1,%eax
+	addl	%esi,%ebp
+	rorl	$2,%ecx
+	movl	%ebx,%esi
+	roll	$5,%esi
+	movl	%eax,44(%esp)
+	leal	2400959708(%eax,%ebp,1),%eax
+	movl	%edx,%ebp
+	addl	%esi,%eax
+	andl	%edi,%ebp
+	movl	48(%esp),%esi
+	addl	%ebp,%eax
+
+	movl	%ebx,%ebp
+	xorl	56(%esp),%esi
+	xorl	%ecx,%ebp
+	xorl	16(%esp),%esi
+	xorl	%edx,%ebp
+	xorl	36(%esp),%esi
+	roll	$1,%esi
+	addl	%ebp,%edi
+	rorl	$2,%ebx
+	movl	%eax,%ebp
+	roll	$5,%ebp
+	movl	%esi,48(%esp)
+	leal	3395469782(%esi,%edi,1),%esi
+	movl	52(%esp),%edi
+	addl	%ebp,%esi
+
+	movl	%eax,%ebp
+	xorl	60(%esp),%edi
+	xorl	%ebx,%ebp
+	xorl	20(%esp),%edi
+	xorl	%ecx,%ebp
+	xorl	40(%esp),%edi
+	roll	$1,%edi
+	addl	%ebp,%edx
+	rorl	$2,%eax
+	movl	%esi,%ebp
+	roll	$5,%ebp
+	movl	%edi,52(%esp)
+	leal	3395469782(%edi,%edx,1),%edi
+	movl	56(%esp),%edx
+	addl	%ebp,%edi
+
+	movl	%esi,%ebp
+	xorl	(%esp),%edx
+	xorl	%eax,%ebp
+	xorl	24(%esp),%edx
+	xorl	%ebx,%ebp
+	xorl	44(%esp),%edx
+	roll	$1,%edx
+	addl	%ebp,%ecx
+	rorl	$2,%esi
+	movl	%edi,%ebp
+	roll	$5,%ebp
+	movl	%edx,56(%esp)
+	leal	3395469782(%edx,%ecx,1),%edx
+	movl	60(%esp),%ecx
+	addl	%ebp,%edx
+
+	movl	%edi,%ebp
+	xorl	4(%esp),%ecx
+	xorl	%esi,%ebp
+	xorl	28(%esp),%ecx
+	xorl	%eax,%ebp
+	xorl	48(%esp),%ecx
+	roll	$1,%ecx
+	addl	%ebp,%ebx
+	rorl	$2,%edi
+	movl	%edx,%ebp
+	roll	$5,%ebp
+	movl	%ecx,60(%esp)
+	leal	3395469782(%ecx,%ebx,1),%ecx
+	movl	(%esp),%ebx
+	addl	%ebp,%ecx
+
+	movl	%edx,%ebp
+	xorl	8(%esp),%ebx
+	xorl	%edi,%ebp
+	xorl	32(%esp),%ebx
+	xorl	%esi,%ebp
+	xorl	52(%esp),%ebx
+	roll	$1,%ebx
+	addl	%ebp,%eax
+	rorl	$2,%edx
+	movl	%ecx,%ebp
+	roll	$5,%ebp
+	movl	%ebx,(%esp)
+	leal	3395469782(%ebx,%eax,1),%ebx
+	movl	4(%esp),%eax
+	addl	%ebp,%ebx
+
+	movl	%ecx,%ebp
+	xorl	12(%esp),%eax
+	xorl	%edx,%ebp
+	xorl	36(%esp),%eax
+	xorl	%edi,%ebp
+	xorl	56(%esp),%eax
+	roll	$1,%eax
+	addl	%ebp,%esi
+	rorl	$2,%ecx
+	movl	%ebx,%ebp
+	roll	$5,%ebp
+	movl	%eax,4(%esp)
+	leal	3395469782(%eax,%esi,1),%eax
+	movl	8(%esp),%esi
+	addl	%ebp,%eax
+
+	movl	%ebx,%ebp
+	xorl	16(%esp),%esi
+	xorl	%ecx,%ebp
+	xorl	40(%esp),%esi
+	xorl	%edx,%ebp
+	xorl	60(%esp),%esi
+	roll	$1,%esi
+	addl	%ebp,%edi
+	rorl	$2,%ebx
+	movl	%eax,%ebp
+	roll	$5,%ebp
+	movl	%esi,8(%esp)
+	leal	3395469782(%esi,%edi,1),%esi
+	movl	12(%esp),%edi
+	addl	%ebp,%esi
+
+	movl	%eax,%ebp
+	xorl	20(%esp),%edi
+	xorl	%ebx,%ebp
+	xorl	44(%esp),%edi
+	xorl	%ecx,%ebp
+	xorl	(%esp),%edi
+	roll	$1,%edi
+	addl	%ebp,%edx
+	rorl	$2,%eax
+	movl	%esi,%ebp
+	roll	$5,%ebp
+	movl	%edi,12(%esp)
+	leal	3395469782(%edi,%edx,1),%edi
+	movl	16(%esp),%edx
+	addl	%ebp,%edi
+
+	movl	%esi,%ebp
+	xorl	24(%esp),%edx
+	xorl	%eax,%ebp
+	xorl	48(%esp),%edx
+	xorl	%ebx,%ebp
+	xorl	4(%esp),%edx
+	roll	$1,%edx
+	addl	%ebp,%ecx
+	rorl	$2,%esi
+	movl	%edi,%ebp
+	roll	$5,%ebp
+	movl	%edx,16(%esp)
+	leal	3395469782(%edx,%ecx,1),%edx
+	movl	20(%esp),%ecx
+	addl	%ebp,%edx
+
+	movl	%edi,%ebp
+	xorl	28(%esp),%ecx
+	xorl	%esi,%ebp
+	xorl	52(%esp),%ecx
+	xorl	%eax,%ebp
+	xorl	8(%esp),%ecx
+	roll	$1,%ecx
+	addl	%ebp,%ebx
+	rorl	$2,%edi
+	movl	%edx,%ebp
+	roll	$5,%ebp
+	movl	%ecx,20(%esp)
+	leal	3395469782(%ecx,%ebx,1),%ecx
+	movl	24(%esp),%ebx
+	addl	%ebp,%ecx
+
+	movl	%edx,%ebp
+	xorl	32(%esp),%ebx
+	xorl	%edi,%ebp
+	xorl	56(%esp),%ebx
+	xorl	%esi,%ebp
+	xorl	12(%esp),%ebx
+	roll	$1,%ebx
+	addl	%ebp,%eax
+	rorl	$2,%edx
+	movl	%ecx,%ebp
+	roll	$5,%ebp
+	movl	%ebx,24(%esp)
+	leal	3395469782(%ebx,%eax,1),%ebx
+	movl	28(%esp),%eax
+	addl	%ebp,%ebx
+
+	movl	%ecx,%ebp
+	xorl	36(%esp),%eax
+	xorl	%edx,%ebp
+	xorl	60(%esp),%eax
+	xorl	%edi,%ebp
+	xorl	16(%esp),%eax
+	roll	$1,%eax
+	addl	%ebp,%esi
+	rorl	$2,%ecx
+	movl	%ebx,%ebp
+	roll	$5,%ebp
+	movl	%eax,28(%esp)
+	leal	3395469782(%eax,%esi,1),%eax
+	movl	32(%esp),%esi
+	addl	%ebp,%eax
+
+	movl	%ebx,%ebp
+	xorl	40(%esp),%esi
+	xorl	%ecx,%ebp
+	xorl	(%esp),%esi
+	xorl	%edx,%ebp
+	xorl	20(%esp),%esi
+	roll	$1,%esi
+	addl	%ebp,%edi
+	rorl	$2,%ebx
+	movl	%eax,%ebp
+	roll	$5,%ebp
+	movl	%esi,32(%esp)
+	leal	3395469782(%esi,%edi,1),%esi
+	movl	36(%esp),%edi
+	addl	%ebp,%esi
+
+	movl	%eax,%ebp
+	xorl	44(%esp),%edi
+	xorl	%ebx,%ebp
+	xorl	4(%esp),%edi
+	xorl	%ecx,%ebp
+	xorl	24(%esp),%edi
+	roll	$1,%edi
+	addl	%ebp,%edx
+	rorl	$2,%eax
+	movl	%esi,%ebp
+	roll	$5,%ebp
+	movl	%edi,36(%esp)
+	leal	3395469782(%edi,%edx,1),%edi
+	movl	40(%esp),%edx
+	addl	%ebp,%edi
+
+	movl	%esi,%ebp
+	xorl	48(%esp),%edx
+	xorl	%eax,%ebp
+	xorl	8(%esp),%edx
+	xorl	%ebx,%ebp
+	xorl	28(%esp),%edx
+	roll	$1,%edx
+	addl	%ebp,%ecx
+	rorl	$2,%esi
+	movl	%edi,%ebp
+	roll	$5,%ebp
+	movl	%edx,40(%esp)
+	leal	3395469782(%edx,%ecx,1),%edx
+	movl	44(%esp),%ecx
+	addl	%ebp,%edx
+
+	movl	%edi,%ebp
+	xorl	52(%esp),%ecx
+	xorl	%esi,%ebp
+	xorl	12(%esp),%ecx
+	xorl	%eax,%ebp
+	xorl	32(%esp),%ecx
+	roll	$1,%ecx
+	addl	%ebp,%ebx
+	rorl	$2,%edi
+	movl	%edx,%ebp
+	roll	$5,%ebp
+	movl	%ecx,44(%esp)
+	leal	3395469782(%ecx,%ebx,1),%ecx
+	movl	48(%esp),%ebx
+	addl	%ebp,%ecx
+
+	movl	%edx,%ebp
+	xorl	56(%esp),%ebx
+	xorl	%edi,%ebp
+	xorl	16(%esp),%ebx
+	xorl	%esi,%ebp
+	xorl	36(%esp),%ebx
+	roll	$1,%ebx
+	addl	%ebp,%eax
+	rorl	$2,%edx
+	movl	%ecx,%ebp
+	roll	$5,%ebp
+	movl	%ebx,48(%esp)
+	leal	3395469782(%ebx,%eax,1),%ebx
+	movl	52(%esp),%eax
+	addl	%ebp,%ebx
+
+	movl	%ecx,%ebp
+	xorl	60(%esp),%eax
+	xorl	%edx,%ebp
+	xorl	20(%esp),%eax
+	xorl	%edi,%ebp
+	xorl	40(%esp),%eax
+	roll	$1,%eax
+	addl	%ebp,%esi
+	rorl	$2,%ecx
+	movl	%ebx,%ebp
+	roll	$5,%ebp
+	leal	3395469782(%eax,%esi,1),%eax
+	movl	56(%esp),%esi
+	addl	%ebp,%eax
+
+	movl	%ebx,%ebp
+	xorl	(%esp),%esi
+	xorl	%ecx,%ebp
+	xorl	24(%esp),%esi
+	xorl	%edx,%ebp
+	xorl	44(%esp),%esi
+	roll	$1,%esi
+	addl	%ebp,%edi
+	rorl	$2,%ebx
+	movl	%eax,%ebp
+	roll	$5,%ebp
+	leal	3395469782(%esi,%edi,1),%esi
+	movl	60(%esp),%edi
+	addl	%ebp,%esi
+
+	movl	%eax,%ebp
+	xorl	4(%esp),%edi
+	xorl	%ebx,%ebp
+	xorl	28(%esp),%edi
+	xorl	%ecx,%ebp
+	xorl	48(%esp),%edi
+	roll	$1,%edi
+	addl	%ebp,%edx
+	rorl	$2,%eax
+	movl	%esi,%ebp
+	roll	$5,%ebp
+	leal	3395469782(%edi,%edx,1),%edi
+	addl	%ebp,%edi
+	movl	96(%esp),%ebp
+	movl	100(%esp),%edx
+	addl	(%ebp),%edi
+	addl	4(%ebp),%esi
+	addl	8(%ebp),%eax
+	addl	12(%ebp),%ebx
+	addl	16(%ebp),%ecx
+	movl	%edi,(%ebp)
+	addl	$64,%edx
+	movl	%esi,4(%ebp)
+	cmpl	104(%esp),%edx
+	movl	%eax,8(%ebp)
+	movl	%ecx,%edi
+	movl	%ebx,12(%ebp)
+	movl	%edx,%esi
+	movl	%ecx,16(%ebp)
+	jb	.L002loop
+	addl	$76,%esp
+	popl	%edi
+	popl	%esi
+	popl	%ebx
+	popl	%ebp
+	ret
+.size	sha1_block_data_order,.-.L_sha1_block_data_order_begin
+.hidden	_sha1_block_data_order_ssse3
+.type	_sha1_block_data_order_ssse3,@function
+.align	16
+_sha1_block_data_order_ssse3:
+	pushl	%ebp
+	pushl	%ebx
+	pushl	%esi
+	pushl	%edi
+	call	.L003pic_point
+.L003pic_point:
+	popl	%ebp
+	leal	.LK_XX_XX-.L003pic_point(%ebp),%ebp
+.Lssse3_shortcut:
+	movdqa	(%ebp),%xmm7
+	movdqa	16(%ebp),%xmm0
+	movdqa	32(%ebp),%xmm1
+	movdqa	48(%ebp),%xmm2
+	movdqa	64(%ebp),%xmm6
+	movl	20(%esp),%edi
+	movl	24(%esp),%ebp
+	movl	28(%esp),%edx
+	movl	%esp,%esi
+	subl	$208,%esp
+	andl	$-64,%esp
+	movdqa	%xmm0,112(%esp)
+	movdqa	%xmm1,128(%esp)
+	movdqa	%xmm2,144(%esp)
+	shll	$6,%edx
+	movdqa	%xmm7,160(%esp)
+	addl	%ebp,%edx
+	movdqa	%xmm6,176(%esp)
+	addl	$64,%ebp
+	movl	%edi,192(%esp)
+	movl	%ebp,196(%esp)
+	movl	%edx,200(%esp)
+	movl	%esi,204(%esp)
+	movl	(%edi),%eax
+	movl	4(%edi),%ebx
+	movl	8(%edi),%ecx
+	movl	12(%edi),%edx
+	movl	16(%edi),%edi
+	movl	%ebx,%esi
+	movdqu	-64(%ebp),%xmm0
+	movdqu	-48(%ebp),%xmm1
+	movdqu	-32(%ebp),%xmm2
+	movdqu	-16(%ebp),%xmm3
+.byte	102,15,56,0,198
+.byte	102,15,56,0,206
+.byte	102,15,56,0,214
+	movdqa	%xmm7,96(%esp)
+.byte	102,15,56,0,222
+	paddd	%xmm7,%xmm0
+	paddd	%xmm7,%xmm1
+	paddd	%xmm7,%xmm2
+	movdqa	%xmm0,(%esp)
+	psubd	%xmm7,%xmm0
+	movdqa	%xmm1,16(%esp)
+	psubd	%xmm7,%xmm1
+	movdqa	%xmm2,32(%esp)
+	movl	%ecx,%ebp
+	psubd	%xmm7,%xmm2
+	xorl	%edx,%ebp
+	pshufd	$238,%xmm0,%xmm4
+	andl	%ebp,%esi
+	jmp	.L004loop
+.align	16
+.L004loop:
+	rorl	$2,%ebx
+	xorl	%edx,%esi
+	movl	%eax,%ebp
+	punpcklqdq	%xmm1,%xmm4
+	movdqa	%xmm3,%xmm6
+	addl	(%esp),%edi
+	xorl	%ecx,%ebx
+	paddd	%xmm3,%xmm7
+	movdqa	%xmm0,64(%esp)
+	roll	$5,%eax
+	addl	%esi,%edi
+	psrldq	$4,%xmm6
+	andl	%ebx,%ebp
+	xorl	%ecx,%ebx
+	pxor	%xmm0,%xmm4
+	addl	%eax,%edi
+	rorl	$7,%eax
+	pxor	%xmm2,%xmm6
+	xorl	%ecx,%ebp
+	movl	%edi,%esi
+	addl	4(%esp),%edx
+	pxor	%xmm6,%xmm4
+	xorl	%ebx,%eax
+	roll	$5,%edi
+	movdqa	%xmm7,48(%esp)
+	addl	%ebp,%edx
+	andl	%eax,%esi
+	movdqa	%xmm4,%xmm0
+	xorl	%ebx,%eax
+	addl	%edi,%edx
+	rorl	$7,%edi
+	movdqa	%xmm4,%xmm6
+	xorl	%ebx,%esi
+	pslldq	$12,%xmm0
+	paddd	%xmm4,%xmm4
+	movl	%edx,%ebp
+	addl	8(%esp),%ecx
+	psrld	$31,%xmm6
+	xorl	%eax,%edi
+	roll	$5,%edx
+	movdqa	%xmm0,%xmm7
+	addl	%esi,%ecx
+	andl	%edi,%ebp
+	xorl	%eax,%edi
+	psrld	$30,%xmm0
+	addl	%edx,%ecx
+	rorl	$7,%edx
+	por	%xmm6,%xmm4
+	xorl	%eax,%ebp
+	movl	%ecx,%esi
+	addl	12(%esp),%ebx
+	pslld	$2,%xmm7
+	xorl	%edi,%edx
+	roll	$5,%ecx
+	pxor	%xmm0,%xmm4
+	movdqa	96(%esp),%xmm0
+	addl	%ebp,%ebx
+	andl	%edx,%esi
+	pxor	%xmm7,%xmm4
+	pshufd	$238,%xmm1,%xmm5
+	xorl	%edi,%edx
+	addl	%ecx,%ebx
+	rorl	$7,%ecx
+	xorl	%edi,%esi
+	movl	%ebx,%ebp
+	punpcklqdq	%xmm2,%xmm5
+	movdqa	%xmm4,%xmm7
+	addl	16(%esp),%eax
+	xorl	%edx,%ecx
+	paddd	%xmm4,%xmm0
+	movdqa	%xmm1,80(%esp)
+	roll	$5,%ebx
+	addl	%esi,%eax
+	psrldq	$4,%xmm7
+	andl	%ecx,%ebp
+	xorl	%edx,%ecx
+	pxor	%xmm1,%xmm5
+	addl	%ebx,%eax
+	rorl	$7,%ebx
+	pxor	%xmm3,%xmm7
+	xorl	%edx,%ebp
+	movl	%eax,%esi
+	addl	20(%esp),%edi
+	pxor	%xmm7,%xmm5
+	xorl	%ecx,%ebx
+	roll	$5,%eax
+	movdqa	%xmm0,(%esp)
+	addl	%ebp,%edi
+	andl	%ebx,%esi
+	movdqa	%xmm5,%xmm1
+	xorl	%ecx,%ebx
+	addl	%eax,%edi
+	rorl	$7,%eax
+	movdqa	%xmm5,%xmm7
+	xorl	%ecx,%esi
+	pslldq	$12,%xmm1
+	paddd	%xmm5,%xmm5
+	movl	%edi,%ebp
+	addl	24(%esp),%edx
+	psrld	$31,%xmm7
+	xorl	%ebx,%eax
+	roll	$5,%edi
+	movdqa	%xmm1,%xmm0
+	addl	%esi,%edx
+	andl	%eax,%ebp
+	xorl	%ebx,%eax
+	psrld	$30,%xmm1
+	addl	%edi,%edx
+	rorl	$7,%edi
+	por	%xmm7,%xmm5
+	xorl	%ebx,%ebp
+	movl	%edx,%esi
+	addl	28(%esp),%ecx
+	pslld	$2,%xmm0
+	xorl	%eax,%edi
+	roll	$5,%edx
+	pxor	%xmm1,%xmm5
+	movdqa	112(%esp),%xmm1
+	addl	%ebp,%ecx
+	andl	%edi,%esi
+	pxor	%xmm0,%xmm5
+	pshufd	$238,%xmm2,%xmm6
+	xorl	%eax,%edi
+	addl	%edx,%ecx
+	rorl	$7,%edx
+	xorl	%eax,%esi
+	movl	%ecx,%ebp
+	punpcklqdq	%xmm3,%xmm6
+	movdqa	%xmm5,%xmm0
+	addl	32(%esp),%ebx
+	xorl	%edi,%edx
+	paddd	%xmm5,%xmm1
+	movdqa	%xmm2,96(%esp)
+	roll	$5,%ecx
+	addl	%esi,%ebx
+	psrldq	$4,%xmm0
+	andl	%edx,%ebp
+	xorl	%edi,%edx
+	pxor	%xmm2,%xmm6
+	addl	%ecx,%ebx
+	rorl	$7,%ecx
+	pxor	%xmm4,%xmm0
+	xorl	%edi,%ebp
+	movl	%ebx,%esi
+	addl	36(%esp),%eax
+	pxor	%xmm0,%xmm6
+	xorl	%edx,%ecx
+	roll	$5,%ebx
+	movdqa	%xmm1,16(%esp)
+	addl	%ebp,%eax
+	andl	%ecx,%esi
+	movdqa	%xmm6,%xmm2
+	xorl	%edx,%ecx
+	addl	%ebx,%eax
+	rorl	$7,%ebx
+	movdqa	%xmm6,%xmm0
+	xorl	%edx,%esi
+	pslldq	$12,%xmm2
+	paddd	%xmm6,%xmm6
+	movl	%eax,%ebp
+	addl	40(%esp),%edi
+	psrld	$31,%xmm0
+	xorl	%ecx,%ebx
+	roll	$5,%eax
+	movdqa	%xmm2,%xmm1
+	addl	%esi,%edi
+	andl	%ebx,%ebp
+	xorl	%ecx,%ebx
+	psrld	$30,%xmm2
+	addl	%eax,%edi
+	rorl	$7,%eax
+	por	%xmm0,%xmm6
+	xorl	%ecx,%ebp
+	movdqa	64(%esp),%xmm0
+	movl	%edi,%esi
+	addl	44(%esp),%edx
+	pslld	$2,%xmm1
+	xorl	%ebx,%eax
+	roll	$5,%edi
+	pxor	%xmm2,%xmm6
+	movdqa	112(%esp),%xmm2
+	addl	%ebp,%edx
+	andl	%eax,%esi
+	pxor	%xmm1,%xmm6
+	pshufd	$238,%xmm3,%xmm7
+	xorl	%ebx,%eax
+	addl	%edi,%edx
+	rorl	$7,%edi
+	xorl	%ebx,%esi
+	movl	%edx,%ebp
+	punpcklqdq	%xmm4,%xmm7
+	movdqa	%xmm6,%xmm1
+	addl	48(%esp),%ecx
+	xorl	%eax,%edi
+	paddd	%xmm6,%xmm2
+	movdqa	%xmm3,64(%esp)
+	roll	$5,%edx
+	addl	%esi,%ecx
+	psrldq	$4,%xmm1
+	andl	%edi,%ebp
+	xorl	%eax,%edi
+	pxor	%xmm3,%xmm7
+	addl	%edx,%ecx
+	rorl	$7,%edx
+	pxor	%xmm5,%xmm1
+	xorl	%eax,%ebp
+	movl	%ecx,%esi
+	addl	52(%esp),%ebx
+	pxor	%xmm1,%xmm7
+	xorl	%edi,%edx
+	roll	$5,%ecx
+	movdqa	%xmm2,32(%esp)
+	addl	%ebp,%ebx
+	andl	%edx,%esi
+	movdqa	%xmm7,%xmm3
+	xorl	%edi,%edx
+	addl	%ecx,%ebx
+	rorl	$7,%ecx
+	movdqa	%xmm7,%xmm1
+	xorl	%edi,%esi
+	pslldq	$12,%xmm3
+	paddd	%xmm7,%xmm7
+	movl	%ebx,%ebp
+	addl	56(%esp),%eax
+	psrld	$31,%xmm1
+	xorl	%edx,%ecx
+	roll	$5,%ebx
+	movdqa	%xmm3,%xmm2
+	addl	%esi,%eax
+	andl	%ecx,%ebp
+	xorl	%edx,%ecx
+	psrld	$30,%xmm3
+	addl	%ebx,%eax
+	rorl	$7,%ebx
+	por	%xmm1,%xmm7
+	xorl	%edx,%ebp
+	movdqa	80(%esp),%xmm1
+	movl	%eax,%esi
+	addl	60(%esp),%edi
+	pslld	$2,%xmm2
+	xorl	%ecx,%ebx
+	roll	$5,%eax
+	pxor	%xmm3,%xmm7
+	movdqa	112(%esp),%xmm3
+	addl	%ebp,%edi
+	andl	%ebx,%esi
+	pxor	%xmm2,%xmm7
+	pshufd	$238,%xmm6,%xmm2
+	xorl	%ecx,%ebx
+	addl	%eax,%edi
+	rorl	$7,%eax
+	pxor	%xmm4,%xmm0
+	punpcklqdq	%xmm7,%xmm2
+	xorl	%ecx,%esi
+	movl	%edi,%ebp
+	addl	(%esp),%edx
+	pxor	%xmm1,%xmm0
+	movdqa	%xmm4,80(%esp)
+	xorl	%ebx,%eax
+	roll	$5,%edi
+	movdqa	%xmm3,%xmm4
+	addl	%esi,%edx
+	paddd	%xmm7,%xmm3
+	andl	%eax,%ebp
+	pxor	%xmm2,%xmm0
+	xorl	%ebx,%eax
+	addl	%edi,%edx
+	rorl	$7,%edi
+	xorl	%ebx,%ebp
+	movdqa	%xmm0,%xmm2
+	movdqa	%xmm3,48(%esp)
+	movl	%edx,%esi
+	addl	4(%esp),%ecx
+	xorl	%eax,%edi
+	roll	$5,%edx
+	pslld	$2,%xmm0
+	addl	%ebp,%ecx
+	andl	%edi,%esi
+	psrld	$30,%xmm2
+	xorl	%eax,%edi
+	addl	%edx,%ecx
+	rorl	$7,%edx
+	xorl	%eax,%esi
+	movl	%ecx,%ebp
+	addl	8(%esp),%ebx
+	xorl	%edi,%edx
+	roll	$5,%ecx
+	por	%xmm2,%xmm0
+	addl	%esi,%ebx
+	andl	%edx,%ebp
+	movdqa	96(%esp),%xmm2
+	xorl	%edi,%edx
+	addl	%ecx,%ebx
+	addl	12(%esp),%eax
+	xorl	%edi,%ebp
+	movl	%ebx,%esi
+	pshufd	$238,%xmm7,%xmm3
+	roll	$5,%ebx
+	addl	%ebp,%eax
+	xorl	%edx,%esi
+	rorl	$7,%ecx
+	addl	%ebx,%eax
+	addl	16(%esp),%edi
+	pxor	%xmm5,%xmm1
+	punpcklqdq	%xmm0,%xmm3
+	xorl	%ecx,%esi
+	movl	%eax,%ebp
+	roll	$5,%eax
+	pxor	%xmm2,%xmm1
+	movdqa	%xmm5,96(%esp)
+	addl	%esi,%edi
+	xorl	%ecx,%ebp
+	movdqa	%xmm4,%xmm5
+	rorl	$7,%ebx
+	paddd	%xmm0,%xmm4
+	addl	%eax,%edi
+	pxor	%xmm3,%xmm1
+	addl	20(%esp),%edx
+	xorl	%ebx,%ebp
+	movl	%edi,%esi
+	roll	$5,%edi
+	movdqa	%xmm1,%xmm3
+	movdqa	%xmm4,(%esp)
+	addl	%ebp,%edx
+	xorl	%ebx,%esi
+	rorl	$7,%eax
+	addl	%edi,%edx
+	pslld	$2,%xmm1
+	addl	24(%esp),%ecx
+	xorl	%eax,%esi
+	psrld	$30,%xmm3
+	movl	%edx,%ebp
+	roll	$5,%edx
+	addl	%esi,%ecx
+	xorl	%eax,%ebp
+	rorl	$7,%edi
+	addl	%edx,%ecx
+	por	%xmm3,%xmm1
+	addl	28(%esp),%ebx
+	xorl	%edi,%ebp
+	movdqa	64(%esp),%xmm3
+	movl	%ecx,%esi
+	roll	$5,%ecx
+	addl	%ebp,%ebx
+	xorl	%edi,%esi
+	rorl	$7,%edx
+	pshufd	$238,%xmm0,%xmm4
+	addl	%ecx,%ebx
+	addl	32(%esp),%eax
+	pxor	%xmm6,%xmm2
+	punpcklqdq	%xmm1,%xmm4
+	xorl	%edx,%esi
+	movl	%ebx,%ebp
+	roll	$5,%ebx
+	pxor	%xmm3,%xmm2
+	movdqa	%xmm6,64(%esp)
+	addl	%esi,%eax
+	xorl	%edx,%ebp
+	movdqa	128(%esp),%xmm6
+	rorl	$7,%ecx
+	paddd	%xmm1,%xmm5
+	addl	%ebx,%eax
+	pxor	%xmm4,%xmm2
+	addl	36(%esp),%edi
+	xorl	%ecx,%ebp
+	movl	%eax,%esi
+	roll	$5,%eax
+	movdqa	%xmm2,%xmm4
+	movdqa	%xmm5,16(%esp)
+	addl	%ebp,%edi
+	xorl	%ecx,%esi
+	rorl	$7,%ebx
+	addl	%eax,%edi
+	pslld	$2,%xmm2
+	addl	40(%esp),%edx
+	xorl	%ebx,%esi
+	psrld	$30,%xmm4
+	movl	%edi,%ebp
+	roll	$5,%edi
+	addl	%esi,%edx
+	xorl	%ebx,%ebp
+	rorl	$7,%eax
+	addl	%edi,%edx
+	por	%xmm4,%xmm2
+	addl	44(%esp),%ecx
+	xorl	%eax,%ebp
+	movdqa	80(%esp),%xmm4
+	movl	%edx,%esi
+	roll	$5,%edx
+	addl	%ebp,%ecx
+	xorl	%eax,%esi
+	rorl	$7,%edi
+	pshufd	$238,%xmm1,%xmm5
+	addl	%edx,%ecx
+	addl	48(%esp),%ebx
+	pxor	%xmm7,%xmm3
+	punpcklqdq	%xmm2,%xmm5
+	xorl	%edi,%esi
+	movl	%ecx,%ebp
+	roll	$5,%ecx
+	pxor	%xmm4,%xmm3
+	movdqa	%xmm7,80(%esp)
+	addl	%esi,%ebx
+	xorl	%edi,%ebp
+	movdqa	%xmm6,%xmm7
+	rorl	$7,%edx
+	paddd	%xmm2,%xmm6
+	addl	%ecx,%ebx
+	pxor	%xmm5,%xmm3
+	addl	52(%esp),%eax
+	xorl	%edx,%ebp
+	movl	%ebx,%esi
+	roll	$5,%ebx
+	movdqa	%xmm3,%xmm5
+	movdqa	%xmm6,32(%esp)
+	addl	%ebp,%eax
+	xorl	%edx,%esi
+	rorl	$7,%ecx
+	addl	%ebx,%eax
+	pslld	$2,%xmm3
+	addl	56(%esp),%edi
+	xorl	%ecx,%esi
+	psrld	$30,%xmm5
+	movl	%eax,%ebp
+	roll	$5,%eax
+	addl	%esi,%edi
+	xorl	%ecx,%ebp
+	rorl	$7,%ebx
+	addl	%eax,%edi
+	por	%xmm5,%xmm3
+	addl	60(%esp),%edx
+	xorl	%ebx,%ebp
+	movdqa	96(%esp),%xmm5
+	movl	%edi,%esi
+	roll	$5,%edi
+	addl	%ebp,%edx
+	xorl	%ebx,%esi
+	rorl	$7,%eax
+	pshufd	$238,%xmm2,%xmm6
+	addl	%edi,%edx
+	addl	(%esp),%ecx
+	pxor	%xmm0,%xmm4
+	punpcklqdq	%xmm3,%xmm6
+	xorl	%eax,%esi
+	movl	%edx,%ebp
+	roll	$5,%edx
+	pxor	%xmm5,%xmm4
+	movdqa	%xmm0,96(%esp)
+	addl	%esi,%ecx
+	xorl	%eax,%ebp
+	movdqa	%xmm7,%xmm0
+	rorl	$7,%edi
+	paddd	%xmm3,%xmm7
+	addl	%edx,%ecx
+	pxor	%xmm6,%xmm4
+	addl	4(%esp),%ebx
+	xorl	%edi,%ebp
+	movl	%ecx,%esi
+	roll	$5,%ecx
+	movdqa	%xmm4,%xmm6
+	movdqa	%xmm7,48(%esp)
+	addl	%ebp,%ebx
+	xorl	%edi,%esi
+	rorl	$7,%edx
+	addl	%ecx,%ebx
+	pslld	$2,%xmm4
+	addl	8(%esp),%eax
+	xorl	%edx,%esi
+	psrld	$30,%xmm6
+	movl	%ebx,%ebp
+	roll	$5,%ebx
+	addl	%esi,%eax
+	xorl	%edx,%ebp
+	rorl	$7,%ecx
+	addl	%ebx,%eax
+	por	%xmm6,%xmm4
+	addl	12(%esp),%edi
+	xorl	%ecx,%ebp
+	movdqa	64(%esp),%xmm6
+	movl	%eax,%esi
+	roll	$5,%eax
+	addl	%ebp,%edi
+	xorl	%ecx,%esi
+	rorl	$7,%ebx
+	pshufd	$238,%xmm3,%xmm7
+	addl	%eax,%edi
+	addl	16(%esp),%edx
+	pxor	%xmm1,%xmm5
+	punpcklqdq	%xmm4,%xmm7
+	xorl	%ebx,%esi
+	movl	%edi,%ebp
+	roll	$5,%edi
+	pxor	%xmm6,%xmm5
+	movdqa	%xmm1,64(%esp)
+	addl	%esi,%edx
+	xorl	%ebx,%ebp
+	movdqa	%xmm0,%xmm1
+	rorl	$7,%eax
+	paddd	%xmm4,%xmm0
+	addl	%edi,%edx
+	pxor	%xmm7,%xmm5
+	addl	20(%esp),%ecx
+	xorl	%eax,%ebp
+	movl	%edx,%esi
+	roll	$5,%edx
+	movdqa	%xmm5,%xmm7
+	movdqa	%xmm0,(%esp)
+	addl	%ebp,%ecx
+	xorl	%eax,%esi
+	rorl	$7,%edi
+	addl	%edx,%ecx
+	pslld	$2,%xmm5
+	addl	24(%esp),%ebx
+	xorl	%edi,%esi
+	psrld	$30,%xmm7
+	movl	%ecx,%ebp
+	roll	$5,%ecx
+	addl	%esi,%ebx
+	xorl	%edi,%ebp
+	rorl	$7,%edx
+	addl	%ecx,%ebx
+	por	%xmm7,%xmm5
+	addl	28(%esp),%eax
+	movdqa	80(%esp),%xmm7
+	rorl	$7,%ecx
+	movl	%ebx,%esi
+	xorl	%edx,%ebp
+	roll	$5,%ebx
+	pshufd	$238,%xmm4,%xmm0
+	addl	%ebp,%eax
+	xorl	%ecx,%esi
+	xorl	%edx,%ecx
+	addl	%ebx,%eax
+	addl	32(%esp),%edi
+	pxor	%xmm2,%xmm6
+	punpcklqdq	%xmm5,%xmm0
+	andl	%ecx,%esi
+	xorl	%edx,%ecx
+	rorl	$7,%ebx
+	pxor	%xmm7,%xmm6
+	movdqa	%xmm2,80(%esp)
+	movl	%eax,%ebp
+	xorl	%ecx,%esi
+	roll	$5,%eax
+	movdqa	%xmm1,%xmm2
+	addl	%esi,%edi
+	paddd	%xmm5,%xmm1
+	xorl	%ebx,%ebp
+	pxor	%xmm0,%xmm6
+	xorl	%ecx,%ebx
+	addl	%eax,%edi
+	addl	36(%esp),%edx
+	andl	%ebx,%ebp
+	movdqa	%xmm6,%xmm0
+	movdqa	%xmm1,16(%esp)
+	xorl	%ecx,%ebx
+	rorl	$7,%eax
+	movl	%edi,%esi
+	xorl	%ebx,%ebp
+	roll	$5,%edi
+	pslld	$2,%xmm6
+	addl	%ebp,%edx
+	xorl	%eax,%esi
+	psrld	$30,%xmm0
+	xorl	%ebx,%eax
+	addl	%edi,%edx
+	addl	40(%esp),%ecx
+	andl	%eax,%esi
+	xorl	%ebx,%eax
+	rorl	$7,%edi
+	por	%xmm0,%xmm6
+	movl	%edx,%ebp
+	xorl	%eax,%esi
+	movdqa	96(%esp),%xmm0
+	roll	$5,%edx
+	addl	%esi,%ecx
+	xorl	%edi,%ebp
+	xorl	%eax,%edi
+	addl	%edx,%ecx
+	pshufd	$238,%xmm5,%xmm1
+	addl	44(%esp),%ebx
+	andl	%edi,%ebp
+	xorl	%eax,%edi
+	rorl	$7,%edx
+	movl	%ecx,%esi
+	xorl	%edi,%ebp
+	roll	$5,%ecx
+	addl	%ebp,%ebx
+	xorl	%edx,%esi
+	xorl	%edi,%edx
+	addl	%ecx,%ebx
+	addl	48(%esp),%eax
+	pxor	%xmm3,%xmm7
+	punpcklqdq	%xmm6,%xmm1
+	andl	%edx,%esi
+	xorl	%edi,%edx
+	rorl	$7,%ecx
+	pxor	%xmm0,%xmm7
+	movdqa	%xmm3,96(%esp)
+	movl	%ebx,%ebp
+	xorl	%edx,%esi
+	roll	$5,%ebx
+	movdqa	144(%esp),%xmm3
+	addl	%esi,%eax
+	paddd	%xmm6,%xmm2
+	xorl	%ecx,%ebp
+	pxor	%xmm1,%xmm7
+	xorl	%edx,%ecx
+	addl	%ebx,%eax
+	addl	52(%esp),%edi
+	andl	%ecx,%ebp
+	movdqa	%xmm7,%xmm1
+	movdqa	%xmm2,32(%esp)
+	xorl	%edx,%ecx
+	rorl	$7,%ebx
+	movl	%eax,%esi
+	xorl	%ecx,%ebp
+	roll	$5,%eax
+	pslld	$2,%xmm7
+	addl	%ebp,%edi
+	xorl	%ebx,%esi
+	psrld	$30,%xmm1
+	xorl	%ecx,%ebx
+	addl	%eax,%edi
+	addl	56(%esp),%edx
+	andl	%ebx,%esi
+	xorl	%ecx,%ebx
+	rorl	$7,%eax
+	por	%xmm1,%xmm7
+	movl	%edi,%ebp
+	xorl	%ebx,%esi
+	movdqa	64(%esp),%xmm1
+	roll	$5,%edi
+	addl	%esi,%edx
+	xorl	%eax,%ebp
+	xorl	%ebx,%eax
+	addl	%edi,%edx
+	pshufd	$238,%xmm6,%xmm2
+	addl	60(%esp),%ecx
+	andl	%eax,%ebp
+	xorl	%ebx,%eax
+	rorl	$7,%edi
+	movl	%edx,%esi
+	xorl	%eax,%ebp
+	roll	$5,%edx
+	addl	%ebp,%ecx
+	xorl	%edi,%esi
+	xorl	%eax,%edi
+	addl	%edx,%ecx
+	addl	(%esp),%ebx
+	pxor	%xmm4,%xmm0
+	punpcklqdq	%xmm7,%xmm2
+	andl	%edi,%esi
+	xorl	%eax,%edi
+	rorl	$7,%edx
+	pxor	%xmm1,%xmm0
+	movdqa	%xmm4,64(%esp)
+	movl	%ecx,%ebp
+	xorl	%edi,%esi
+	roll	$5,%ecx
+	movdqa	%xmm3,%xmm4
+	addl	%esi,%ebx
+	paddd	%xmm7,%xmm3
+	xorl	%edx,%ebp
+	pxor	%xmm2,%xmm0
+	xorl	%edi,%edx
+	addl	%ecx,%ebx
+	addl	4(%esp),%eax
+	andl	%edx,%ebp
+	movdqa	%xmm0,%xmm2
+	movdqa	%xmm3,48(%esp)
+	xorl	%edi,%edx
+	rorl	$7,%ecx
+	movl	%ebx,%esi
+	xorl	%edx,%ebp
+	roll	$5,%ebx
+	pslld	$2,%xmm0
+	addl	%ebp,%eax
+	xorl	%ecx,%esi
+	psrld	$30,%xmm2
+	xorl	%edx,%ecx
+	addl	%ebx,%eax
+	addl	8(%esp),%edi
+	andl	%ecx,%esi
+	xorl	%edx,%ecx
+	rorl	$7,%ebx
+	por	%xmm2,%xmm0
+	movl	%eax,%ebp
+	xorl	%ecx,%esi
+	movdqa	80(%esp),%xmm2
+	roll	$5,%eax
+	addl	%esi,%edi
+	xorl	%ebx,%ebp
+	xorl	%ecx,%ebx
+	addl	%eax,%edi
+	pshufd	$238,%xmm7,%xmm3
+	addl	12(%esp),%edx
+	andl	%ebx,%ebp
+	xorl	%ecx,%ebx
+	rorl	$7,%eax
+	movl	%edi,%esi
+	xorl	%ebx,%ebp
+	roll	$5,%edi
+	addl	%ebp,%edx
+	xorl	%eax,%esi
+	xorl	%ebx,%eax
+	addl	%edi,%edx
+	addl	16(%esp),%ecx
+	pxor	%xmm5,%xmm1
+	punpcklqdq	%xmm0,%xmm3
+	andl	%eax,%esi
+	xorl	%ebx,%eax
+	rorl	$7,%edi
+	pxor	%xmm2,%xmm1
+	movdqa	%xmm5,80(%esp)
+	movl	%edx,%ebp
+	xorl	%eax,%esi
+	roll	$5,%edx
+	movdqa	%xmm4,%xmm5
+	addl	%esi,%ecx
+	paddd	%xmm0,%xmm4
+	xorl	%edi,%ebp
+	pxor	%xmm3,%xmm1
+	xorl	%eax,%edi
+	addl	%edx,%ecx
+	addl	20(%esp),%ebx
+	andl	%edi,%ebp
+	movdqa	%xmm1,%xmm3
+	movdqa	%xmm4,(%esp)
+	xorl	%eax,%edi
+	rorl	$7,%edx
+	movl	%ecx,%esi
+	xorl	%edi,%ebp
+	roll	$5,%ecx
+	pslld	$2,%xmm1
+	addl	%ebp,%ebx
+	xorl	%edx,%esi
+	psrld	$30,%xmm3
+	xorl	%edi,%edx
+	addl	%ecx,%ebx
+	addl	24(%esp),%eax
+	andl	%edx,%esi
+	xorl	%edi,%edx
+	rorl	$7,%ecx
+	por	%xmm3,%xmm1
+	movl	%ebx,%ebp
+	xorl	%edx,%esi
+	movdqa	96(%esp),%xmm3
+	roll	$5,%ebx
+	addl	%esi,%eax
+	xorl	%ecx,%ebp
+	xorl	%edx,%ecx
+	addl	%ebx,%eax
+	pshufd	$238,%xmm0,%xmm4
+	addl	28(%esp),%edi
+	andl	%ecx,%ebp
+	xorl	%edx,%ecx
+	rorl	$7,%ebx
+	movl	%eax,%esi
+	xorl	%ecx,%ebp
+	roll	$5,%eax
+	addl	%ebp,%edi
+	xorl	%ebx,%esi
+	xorl	%ecx,%ebx
+	addl	%eax,%edi
+	addl	32(%esp),%edx
+	pxor	%xmm6,%xmm2
+	punpcklqdq	%xmm1,%xmm4
+	andl	%ebx,%esi
+	xorl	%ecx,%ebx
+	rorl	$7,%eax
+	pxor	%xmm3,%xmm2
+	movdqa	%xmm6,96(%esp)
+	movl	%edi,%ebp
+	xorl	%ebx,%esi
+	roll	$5,%edi
+	movdqa	%xmm5,%xmm6
+	addl	%esi,%edx
+	paddd	%xmm1,%xmm5
+	xorl	%eax,%ebp
+	pxor	%xmm4,%xmm2
+	xorl	%ebx,%eax
+	addl	%edi,%edx
+	addl	36(%esp),%ecx
+	andl	%eax,%ebp
+	movdqa	%xmm2,%xmm4
+	movdqa	%xmm5,16(%esp)
+	xorl	%ebx,%eax
+	rorl	$7,%edi
+	movl	%edx,%esi
+	xorl	%eax,%ebp
+	roll	$5,%edx
+	pslld	$2,%xmm2
+	addl	%ebp,%ecx
+	xorl	%edi,%esi
+	psrld	$30,%xmm4
+	xorl	%eax,%edi
+	addl	%edx,%ecx
+	addl	40(%esp),%ebx
+	andl	%edi,%esi
+	xorl	%eax,%edi
+	rorl	$7,%edx
+	por	%xmm4,%xmm2
+	movl	%ecx,%ebp
+	xorl	%edi,%esi
+	movdqa	64(%esp),%xmm4
+	roll	$5,%ecx
+	addl	%esi,%ebx
+	xorl	%edx,%ebp
+	xorl	%edi,%edx
+	addl	%ecx,%ebx
+	pshufd	$238,%xmm1,%xmm5
+	addl	44(%esp),%eax
+	andl	%edx,%ebp
+	xorl	%edi,%edx
+	rorl	$7,%ecx
+	movl	%ebx,%esi
+	xorl	%edx,%ebp
+	roll	$5,%ebx
+	addl	%ebp,%eax
+	xorl	%edx,%esi
+	addl	%ebx,%eax
+	addl	48(%esp),%edi
+	pxor	%xmm7,%xmm3
+	punpcklqdq	%xmm2,%xmm5
+	xorl	%ecx,%esi
+	movl	%eax,%ebp
+	roll	$5,%eax
+	pxor	%xmm4,%xmm3
+	movdqa	%xmm7,64(%esp)
+	addl	%esi,%edi
+	xorl	%ecx,%ebp
+	movdqa	%xmm6,%xmm7
+	rorl	$7,%ebx
+	paddd	%xmm2,%xmm6
+	addl	%eax,%edi
+	pxor	%xmm5,%xmm3
+	addl	52(%esp),%edx
+	xorl	%ebx,%ebp
+	movl	%edi,%esi
+	roll	$5,%edi
+	movdqa	%xmm3,%xmm5
+	movdqa	%xmm6,32(%esp)
+	addl	%ebp,%edx
+	xorl	%ebx,%esi
+	rorl	$7,%eax
+	addl	%edi,%edx
+	pslld	$2,%xmm3
+	addl	56(%esp),%ecx
+	xorl	%eax,%esi
+	psrld	$30,%xmm5
+	movl	%edx,%ebp
+	roll	$5,%edx
+	addl	%esi,%ecx
+	xorl	%eax,%ebp
+	rorl	$7,%edi
+	addl	%edx,%ecx
+	por	%xmm5,%xmm3
+	addl	60(%esp),%ebx
+	xorl	%edi,%ebp
+	movl	%ecx,%esi
+	roll	$5,%ecx
+	addl	%ebp,%ebx
+	xorl	%edi,%esi
+	rorl	$7,%edx
+	addl	%ecx,%ebx
+	addl	(%esp),%eax
+	xorl	%edx,%esi
+	movl	%ebx,%ebp
+	roll	$5,%ebx
+	addl	%esi,%eax
+	xorl	%edx,%ebp
+	rorl	$7,%ecx
+	paddd	%xmm3,%xmm7
+	addl	%ebx,%eax
+	addl	4(%esp),%edi
+	xorl	%ecx,%ebp
+	movl	%eax,%esi
+	movdqa	%xmm7,48(%esp)
+	roll	$5,%eax
+	addl	%ebp,%edi
+	xorl	%ecx,%esi
+	rorl	$7,%ebx
+	addl	%eax,%edi
+	addl	8(%esp),%edx
+	xorl	%ebx,%esi
+	movl	%edi,%ebp
+	roll	$5,%edi
+	addl	%esi,%edx
+	xorl	%ebx,%ebp
+	rorl	$7,%eax
+	addl	%edi,%edx
+	addl	12(%esp),%ecx
+	xorl	%eax,%ebp
+	movl	%edx,%esi
+	roll	$5,%edx
+	addl	%ebp,%ecx
+	xorl	%eax,%esi
+	rorl	$7,%edi
+	addl	%edx,%ecx
+	movl	196(%esp),%ebp
+	cmpl	200(%esp),%ebp
+	je	.L005done
+	movdqa	160(%esp),%xmm7
+	movdqa	176(%esp),%xmm6
+	movdqu	(%ebp),%xmm0
+	movdqu	16(%ebp),%xmm1
+	movdqu	32(%ebp),%xmm2
+	movdqu	48(%ebp),%xmm3
+	addl	$64,%ebp
+.byte	102,15,56,0,198
+	movl	%ebp,196(%esp)
+	movdqa	%xmm7,96(%esp)
+	addl	16(%esp),%ebx
+	xorl	%edi,%esi
+	movl	%ecx,%ebp
+	roll	$5,%ecx
+	addl	%esi,%ebx
+	xorl	%edi,%ebp
+	rorl	$7,%edx
+.byte	102,15,56,0,206
+	addl	%ecx,%ebx
+	addl	20(%esp),%eax
+	xorl	%edx,%ebp
+	movl	%ebx,%esi
+	paddd	%xmm7,%xmm0
+	roll	$5,%ebx
+	addl	%ebp,%eax
+	xorl	%edx,%esi
+	rorl	$7,%ecx
+	movdqa	%xmm0,(%esp)
+	addl	%ebx,%eax
+	addl	24(%esp),%edi
+	xorl	%ecx,%esi
+	movl	%eax,%ebp
+	psubd	%xmm7,%xmm0
+	roll	$5,%eax
+	addl	%esi,%edi
+	xorl	%ecx,%ebp
+	rorl	$7,%ebx
+	addl	%eax,%edi
+	addl	28(%esp),%edx
+	xorl	%ebx,%ebp
+	movl	%edi,%esi
+	roll	$5,%edi
+	addl	%ebp,%edx
+	xorl	%ebx,%esi
+	rorl	$7,%eax
+	addl	%edi,%edx
+	addl	32(%esp),%ecx
+	xorl	%eax,%esi
+	movl	%edx,%ebp
+	roll	$5,%edx
+	addl	%esi,%ecx
+	xorl	%eax,%ebp
+	rorl	$7,%edi
+.byte	102,15,56,0,214
+	addl	%edx,%ecx
+	addl	36(%esp),%ebx
+	xorl	%edi,%ebp
+	movl	%ecx,%esi
+	paddd	%xmm7,%xmm1
+	roll	$5,%ecx
+	addl	%ebp,%ebx
+	xorl	%edi,%esi
+	rorl	$7,%edx
+	movdqa	%xmm1,16(%esp)
+	addl	%ecx,%ebx
+	addl	40(%esp),%eax
+	xorl	%edx,%esi
+	movl	%ebx,%ebp
+	psubd	%xmm7,%xmm1
+	roll	$5,%ebx
+	addl	%esi,%eax
+	xorl	%edx,%ebp
+	rorl	$7,%ecx
+	addl	%ebx,%eax
+	addl	44(%esp),%edi
+	xorl	%ecx,%ebp
+	movl	%eax,%esi
+	roll	$5,%eax
+	addl	%ebp,%edi
+	xorl	%ecx,%esi
+	rorl	$7,%ebx
+	addl	%eax,%edi
+	addl	48(%esp),%edx
+	xorl	%ebx,%esi
+	movl	%edi,%ebp
+	roll	$5,%edi
+	addl	%esi,%edx
+	xorl	%ebx,%ebp
+	rorl	$7,%eax
+.byte	102,15,56,0,222
+	addl	%edi,%edx
+	addl	52(%esp),%ecx
+	xorl	%eax,%ebp
+	movl	%edx,%esi
+	paddd	%xmm7,%xmm2
+	roll	$5,%edx
+	addl	%ebp,%ecx
+	xorl	%eax,%esi
+	rorl	$7,%edi
+	movdqa	%xmm2,32(%esp)
+	addl	%edx,%ecx
+	addl	56(%esp),%ebx
+	xorl	%edi,%esi
+	movl	%ecx,%ebp
+	psubd	%xmm7,%xmm2
+	roll	$5,%ecx
+	addl	%esi,%ebx
+	xorl	%edi,%ebp
+	rorl	$7,%edx
+	addl	%ecx,%ebx
+	addl	60(%esp),%eax
+	xorl	%edx,%ebp
+	movl	%ebx,%esi
+	roll	$5,%ebx
+	addl	%ebp,%eax
+	rorl	$7,%ecx
+	addl	%ebx,%eax
+	movl	192(%esp),%ebp
+	addl	(%ebp),%eax
+	addl	4(%ebp),%esi
+	addl	8(%ebp),%ecx
+	movl	%eax,(%ebp)
+	addl	12(%ebp),%edx
+	movl	%esi,4(%ebp)
+	addl	16(%ebp),%edi
+	movl	%ecx,8(%ebp)
+	movl	%ecx,%ebx
+	movl	%edx,12(%ebp)
+	xorl	%edx,%ebx
+	movl	%edi,16(%ebp)
+	movl	%esi,%ebp
+	pshufd	$238,%xmm0,%xmm4
+	andl	%ebx,%esi
+	movl	%ebp,%ebx
+	jmp	.L004loop
+.align	16
+.L005done:
+	addl	16(%esp),%ebx
+	xorl	%edi,%esi
+	movl	%ecx,%ebp
+	roll	$5,%ecx
+	addl	%esi,%ebx
+	xorl	%edi,%ebp
+	rorl	$7,%edx
+	addl	%ecx,%ebx
+	addl	20(%esp),%eax
+	xorl	%edx,%ebp
+	movl	%ebx,%esi
+	roll	$5,%ebx
+	addl	%ebp,%eax
+	xorl	%edx,%esi
+	rorl	$7,%ecx
+	addl	%ebx,%eax
+	addl	24(%esp),%edi
+	xorl	%ecx,%esi
+	movl	%eax,%ebp
+	roll	$5,%eax
+	addl	%esi,%edi
+	xorl	%ecx,%ebp
+	rorl	$7,%ebx
+	addl	%eax,%edi
+	addl	28(%esp),%edx
+	xorl	%ebx,%ebp
+	movl	%edi,%esi
+	roll	$5,%edi
+	addl	%ebp,%edx
+	xorl	%ebx,%esi
+	rorl	$7,%eax
+	addl	%edi,%edx
+	addl	32(%esp),%ecx
+	xorl	%eax,%esi
+	movl	%edx,%ebp
+	roll	$5,%edx
+	addl	%esi,%ecx
+	xorl	%eax,%ebp
+	rorl	$7,%edi
+	addl	%edx,%ecx
+	addl	36(%esp),%ebx
+	xorl	%edi,%ebp
+	movl	%ecx,%esi
+	roll	$5,%ecx
+	addl	%ebp,%ebx
+	xorl	%edi,%esi
+	rorl	$7,%edx
+	addl	%ecx,%ebx
+	addl	40(%esp),%eax
+	xorl	%edx,%esi
+	movl	%ebx,%ebp
+	roll	$5,%ebx
+	addl	%esi,%eax
+	xorl	%edx,%ebp
+	rorl	$7,%ecx
+	addl	%ebx,%eax
+	addl	44(%esp),%edi
+	xorl	%ecx,%ebp
+	movl	%eax,%esi
+	roll	$5,%eax
+	addl	%ebp,%edi
+	xorl	%ecx,%esi
+	rorl	$7,%ebx
+	addl	%eax,%edi
+	addl	48(%esp),%edx
+	xorl	%ebx,%esi
+	movl	%edi,%ebp
+	roll	$5,%edi
+	addl	%esi,%edx
+	xorl	%ebx,%ebp
+	rorl	$7,%eax
+	addl	%edi,%edx
+	addl	52(%esp),%ecx
+	xorl	%eax,%ebp
+	movl	%edx,%esi
+	roll	$5,%edx
+	addl	%ebp,%ecx
+	xorl	%eax,%esi
+	rorl	$7,%edi
+	addl	%edx,%ecx
+	addl	56(%esp),%ebx
+	xorl	%edi,%esi
+	movl	%ecx,%ebp
+	roll	$5,%ecx
+	addl	%esi,%ebx
+	xorl	%edi,%ebp
+	rorl	$7,%edx
+	addl	%ecx,%ebx
+	addl	60(%esp),%eax
+	xorl	%edx,%ebp
+	movl	%ebx,%esi
+	roll	$5,%ebx
+	addl	%ebp,%eax
+	rorl	$7,%ecx
+	addl	%ebx,%eax
+	movl	192(%esp),%ebp
+	addl	(%ebp),%eax
+	movl	204(%esp),%esp
+	addl	4(%ebp),%esi
+	addl	8(%ebp),%ecx
+	movl	%eax,(%ebp)
+	addl	12(%ebp),%edx
+	movl	%esi,4(%ebp)
+	addl	16(%ebp),%edi
+	movl	%ecx,8(%ebp)
+	movl	%edx,12(%ebp)
+	movl	%edi,16(%ebp)
+	popl	%edi
+	popl	%esi
+	popl	%ebx
+	popl	%ebp
+	ret
+.size	_sha1_block_data_order_ssse3,.-_sha1_block_data_order_ssse3
+.hidden	_sha1_block_data_order_avx
+.type	_sha1_block_data_order_avx,@function
+.align	16
+_sha1_block_data_order_avx:
+	pushl	%ebp
+	pushl	%ebx
+	pushl	%esi
+	pushl	%edi
+	call	.L006pic_point
+.L006pic_point:
+	popl	%ebp
+	leal	.LK_XX_XX-.L006pic_point(%ebp),%ebp
+.Lavx_shortcut:
+	vzeroall
+	vmovdqa	(%ebp),%xmm7
+	vmovdqa	16(%ebp),%xmm0
+	vmovdqa	32(%ebp),%xmm1
+	vmovdqa	48(%ebp),%xmm2
+	vmovdqa	64(%ebp),%xmm6
+	movl	20(%esp),%edi
+	movl	24(%esp),%ebp
+	movl	28(%esp),%edx
+	movl	%esp,%esi
+	subl	$208,%esp
+	andl	$-64,%esp
+	vmovdqa	%xmm0,112(%esp)
+	vmovdqa	%xmm1,128(%esp)
+	vmovdqa	%xmm2,144(%esp)
+	shll	$6,%edx
+	vmovdqa	%xmm7,160(%esp)
+	addl	%ebp,%edx
+	vmovdqa	%xmm6,176(%esp)
+	addl	$64,%ebp
+	movl	%edi,192(%esp)
+	movl	%ebp,196(%esp)
+	movl	%edx,200(%esp)
+	movl	%esi,204(%esp)
+	movl	(%edi),%eax
+	movl	4(%edi),%ebx
+	movl	8(%edi),%ecx
+	movl	12(%edi),%edx
+	movl	16(%edi),%edi
+	movl	%ebx,%esi
+	vmovdqu	-64(%ebp),%xmm0
+	vmovdqu	-48(%ebp),%xmm1
+	vmovdqu	-32(%ebp),%xmm2
+	vmovdqu	-16(%ebp),%xmm3
+	vpshufb	%xmm6,%xmm0,%xmm0
+	vpshufb	%xmm6,%xmm1,%xmm1
+	vpshufb	%xmm6,%xmm2,%xmm2
+	vmovdqa	%xmm7,96(%esp)
+	vpshufb	%xmm6,%xmm3,%xmm3
+	vpaddd	%xmm7,%xmm0,%xmm4
+	vpaddd	%xmm7,%xmm1,%xmm5
+	vpaddd	%xmm7,%xmm2,%xmm6
+	vmovdqa	%xmm4,(%esp)
+	movl	%ecx,%ebp
+	vmovdqa	%xmm5,16(%esp)
+	xorl	%edx,%ebp
+	vmovdqa	%xmm6,32(%esp)
+	andl	%ebp,%esi
+	jmp	.L007loop
+.align	16
+.L007loop:
+	shrdl	$2,%ebx,%ebx
+	xorl	%edx,%esi
+	vpalignr	$8,%xmm0,%xmm1,%xmm4
+	movl	%eax,%ebp
+	addl	(%esp),%edi
+	vpaddd	%xmm3,%xmm7,%xmm7
+	vmovdqa	%xmm0,64(%esp)
+	xorl	%ecx,%ebx
+	shldl	$5,%eax,%eax
+	vpsrldq	$4,%xmm3,%xmm6
+	addl	%esi,%edi
+	andl	%ebx,%ebp
+	vpxor	%xmm0,%xmm4,%xmm4
+	xorl	%ecx,%ebx
+	addl	%eax,%edi
+	vpxor	%xmm2,%xmm6,%xmm6
+	shrdl	$7,%eax,%eax
+	xorl	%ecx,%ebp
+	vmovdqa	%xmm7,48(%esp)
+	movl	%edi,%esi
+	addl	4(%esp),%edx
+	vpxor	%xmm6,%xmm4,%xmm4
+	xorl	%ebx,%eax
+	shldl	$5,%edi,%edi
+	addl	%ebp,%edx
+	andl	%eax,%esi
+	vpsrld	$31,%xmm4,%xmm6
+	xorl	%ebx,%eax
+	addl	%edi,%edx
+	shrdl	$7,%edi,%edi
+	xorl	%ebx,%esi
+	vpslldq	$12,%xmm4,%xmm0
+	vpaddd	%xmm4,%xmm4,%xmm4
+	movl	%edx,%ebp
+	addl	8(%esp),%ecx
+	xorl	%eax,%edi
+	shldl	$5,%edx,%edx
+	vpsrld	$30,%xmm0,%xmm7
+	vpor	%xmm6,%xmm4,%xmm4
+	addl	%esi,%ecx
+	andl	%edi,%ebp
+	xorl	%eax,%edi
+	addl	%edx,%ecx
+	vpslld	$2,%xmm0,%xmm0
+	shrdl	$7,%edx,%edx
+	xorl	%eax,%ebp
+	vpxor	%xmm7,%xmm4,%xmm4
+	movl	%ecx,%esi
+	addl	12(%esp),%ebx
+	xorl	%edi,%edx
+	shldl	$5,%ecx,%ecx
+	vpxor	%xmm0,%xmm4,%xmm4
+	addl	%ebp,%ebx
+	andl	%edx,%esi
+	vmovdqa	96(%esp),%xmm0
+	xorl	%edi,%edx
+	addl	%ecx,%ebx
+	shrdl	$7,%ecx,%ecx
+	xorl	%edi,%esi
+	vpalignr	$8,%xmm1,%xmm2,%xmm5
+	movl	%ebx,%ebp
+	addl	16(%esp),%eax
+	vpaddd	%xmm4,%xmm0,%xmm0
+	vmovdqa	%xmm1,80(%esp)
+	xorl	%edx,%ecx
+	shldl	$5,%ebx,%ebx
+	vpsrldq	$4,%xmm4,%xmm7
+	addl	%esi,%eax
+	andl	%ecx,%ebp
+	vpxor	%xmm1,%xmm5,%xmm5
+	xorl	%edx,%ecx
+	addl	%ebx,%eax
+	vpxor	%xmm3,%xmm7,%xmm7
+	shrdl	$7,%ebx,%ebx
+	xorl	%edx,%ebp
+	vmovdqa	%xmm0,(%esp)
+	movl	%eax,%esi
+	addl	20(%esp),%edi
+	vpxor	%xmm7,%xmm5,%xmm5
+	xorl	%ecx,%ebx
+	shldl	$5,%eax,%eax
+	addl	%ebp,%edi
+	andl	%ebx,%esi
+	vpsrld	$31,%xmm5,%xmm7
+	xorl	%ecx,%ebx
+	addl	%eax,%edi
+	shrdl	$7,%eax,%eax
+	xorl	%ecx,%esi
+	vpslldq	$12,%xmm5,%xmm1
+	vpaddd	%xmm5,%xmm5,%xmm5
+	movl	%edi,%ebp
+	addl	24(%esp),%edx
+	xorl	%ebx,%eax
+	shldl	$5,%edi,%edi
+	vpsrld	$30,%xmm1,%xmm0
+	vpor	%xmm7,%xmm5,%xmm5
+	addl	%esi,%edx
+	andl	%eax,%ebp
+	xorl	%ebx,%eax
+	addl	%edi,%edx
+	vpslld	$2,%xmm1,%xmm1
+	shrdl	$7,%edi,%edi
+	xorl	%ebx,%ebp
+	vpxor	%xmm0,%xmm5,%xmm5
+	movl	%edx,%esi
+	addl	28(%esp),%ecx
+	xorl	%eax,%edi
+	shldl	$5,%edx,%edx
+	vpxor	%xmm1,%xmm5,%xmm5
+	addl	%ebp,%ecx
+	andl	%edi,%esi
+	vmovdqa	112(%esp),%xmm1
+	xorl	%eax,%edi
+	addl	%edx,%ecx
+	shrdl	$7,%edx,%edx
+	xorl	%eax,%esi
+	vpalignr	$8,%xmm2,%xmm3,%xmm6
+	movl	%ecx,%ebp
+	addl	32(%esp),%ebx
+	vpaddd	%xmm5,%xmm1,%xmm1
+	vmovdqa	%xmm2,96(%esp)
+	xorl	%edi,%edx
+	shldl	$5,%ecx,%ecx
+	vpsrldq	$4,%xmm5,%xmm0
+	addl	%esi,%ebx
+	andl	%edx,%ebp
+	vpxor	%xmm2,%xmm6,%xmm6
+	xorl	%edi,%edx
+	addl	%ecx,%ebx
+	vpxor	%xmm4,%xmm0,%xmm0
+	shrdl	$7,%ecx,%ecx
+	xorl	%edi,%ebp
+	vmovdqa	%xmm1,16(%esp)
+	movl	%ebx,%esi
+	addl	36(%esp),%eax
+	vpxor	%xmm0,%xmm6,%xmm6
+	xorl	%edx,%ecx
+	shldl	$5,%ebx,%ebx
+	addl	%ebp,%eax
+	andl	%ecx,%esi
+	vpsrld	$31,%xmm6,%xmm0
+	xorl	%edx,%ecx
+	addl	%ebx,%eax
+	shrdl	$7,%ebx,%ebx
+	xorl	%edx,%esi
+	vpslldq	$12,%xmm6,%xmm2
+	vpaddd	%xmm6,%xmm6,%xmm6
+	movl	%eax,%ebp
+	addl	40(%esp),%edi
+	xorl	%ecx,%ebx
+	shldl	$5,%eax,%eax
+	vpsrld	$30,%xmm2,%xmm1
+	vpor	%xmm0,%xmm6,%xmm6
+	addl	%esi,%edi
+	andl	%ebx,%ebp
+	xorl	%ecx,%ebx
+	addl	%eax,%edi
+	vpslld	$2,%xmm2,%xmm2
+	vmovdqa	64(%esp),%xmm0
+	shrdl	$7,%eax,%eax
+	xorl	%ecx,%ebp
+	vpxor	%xmm1,%xmm6,%xmm6
+	movl	%edi,%esi
+	addl	44(%esp),%edx
+	xorl	%ebx,%eax
+	shldl	$5,%edi,%edi
+	vpxor	%xmm2,%xmm6,%xmm6
+	addl	%ebp,%edx
+	andl	%eax,%esi
+	vmovdqa	112(%esp),%xmm2
+	xorl	%ebx,%eax
+	addl	%edi,%edx
+	shrdl	$7,%edi,%edi
+	xorl	%ebx,%esi
+	vpalignr	$8,%xmm3,%xmm4,%xmm7
+	movl	%edx,%ebp
+	addl	48(%esp),%ecx
+	vpaddd	%xmm6,%xmm2,%xmm2
+	vmovdqa	%xmm3,64(%esp)
+	xorl	%eax,%edi
+	shldl	$5,%edx,%edx
+	vpsrldq	$4,%xmm6,%xmm1
+	addl	%esi,%ecx
+	andl	%edi,%ebp
+	vpxor	%xmm3,%xmm7,%xmm7
+	xorl	%eax,%edi
+	addl	%edx,%ecx
+	vpxor	%xmm5,%xmm1,%xmm1
+	shrdl	$7,%edx,%edx
+	xorl	%eax,%ebp
+	vmovdqa	%xmm2,32(%esp)
+	movl	%ecx,%esi
+	addl	52(%esp),%ebx
+	vpxor	%xmm1,%xmm7,%xmm7
+	xorl	%edi,%edx
+	shldl	$5,%ecx,%ecx
+	addl	%ebp,%ebx
+	andl	%edx,%esi
+	vpsrld	$31,%xmm7,%xmm1
+	xorl	%edi,%edx
+	addl	%ecx,%ebx
+	shrdl	$7,%ecx,%ecx
+	xorl	%edi,%esi
+	vpslldq	$12,%xmm7,%xmm3
+	vpaddd	%xmm7,%xmm7,%xmm7
+	movl	%ebx,%ebp
+	addl	56(%esp),%eax
+	xorl	%edx,%ecx
+	shldl	$5,%ebx,%ebx
+	vpsrld	$30,%xmm3,%xmm2
+	vpor	%xmm1,%xmm7,%xmm7
+	addl	%esi,%eax
+	andl	%ecx,%ebp
+	xorl	%edx,%ecx
+	addl	%ebx,%eax
+	vpslld	$2,%xmm3,%xmm3
+	vmovdqa	80(%esp),%xmm1
+	shrdl	$7,%ebx,%ebx
+	xorl	%edx,%ebp
+	vpxor	%xmm2,%xmm7,%xmm7
+	movl	%eax,%esi
+	addl	60(%esp),%edi
+	xorl	%ecx,%ebx
+	shldl	$5,%eax,%eax
+	vpxor	%xmm3,%xmm7,%xmm7
+	addl	%ebp,%edi
+	andl	%ebx,%esi
+	vmovdqa	112(%esp),%xmm3
+	xorl	%ecx,%ebx
+	addl	%eax,%edi
+	vpalignr	$8,%xmm6,%xmm7,%xmm2
+	vpxor	%xmm4,%xmm0,%xmm0
+	shrdl	$7,%eax,%eax
+	xorl	%ecx,%esi
+	movl	%edi,%ebp
+	addl	(%esp),%edx
+	vpxor	%xmm1,%xmm0,%xmm0
+	vmovdqa	%xmm4,80(%esp)
+	xorl	%ebx,%eax
+	shldl	$5,%edi,%edi
+	vmovdqa	%xmm3,%xmm4
+	vpaddd	%xmm7,%xmm3,%xmm3
+	addl	%esi,%edx
+	andl	%eax,%ebp
+	vpxor	%xmm2,%xmm0,%xmm0
+	xorl	%ebx,%eax
+	addl	%edi,%edx
+	shrdl	$7,%edi,%edi
+	xorl	%ebx,%ebp
+	vpsrld	$30,%xmm0,%xmm2
+	vmovdqa	%xmm3,48(%esp)
+	movl	%edx,%esi
+	addl	4(%esp),%ecx
+	xorl	%eax,%edi
+	shldl	$5,%edx,%edx
+	vpslld	$2,%xmm0,%xmm0
+	addl	%ebp,%ecx
+	andl	%edi,%esi
+	xorl	%eax,%edi
+	addl	%edx,%ecx
+	shrdl	$7,%edx,%edx
+	xorl	%eax,%esi
+	movl	%ecx,%ebp
+	addl	8(%esp),%ebx
+	vpor	%xmm2,%xmm0,%xmm0
+	xorl	%edi,%edx
+	shldl	$5,%ecx,%ecx
+	vmovdqa	96(%esp),%xmm2
+	addl	%esi,%ebx
+	andl	%edx,%ebp
+	xorl	%edi,%edx
+	addl	%ecx,%ebx
+	addl	12(%esp),%eax
+	xorl	%edi,%ebp
+	movl	%ebx,%esi
+	shldl	$5,%ebx,%ebx
+	addl	%ebp,%eax
+	xorl	%edx,%esi
+	shrdl	$7,%ecx,%ecx
+	addl	%ebx,%eax
+	vpalignr	$8,%xmm7,%xmm0,%xmm3
+	vpxor	%xmm5,%xmm1,%xmm1
+	addl	16(%esp),%edi
+	xorl	%ecx,%esi
+	movl	%eax,%ebp
+	shldl	$5,%eax,%eax
+	vpxor	%xmm2,%xmm1,%xmm1
+	vmovdqa	%xmm5,96(%esp)
+	addl	%esi,%edi
+	xorl	%ecx,%ebp
+	vmovdqa	%xmm4,%xmm5
+	vpaddd	%xmm0,%xmm4,%xmm4
+	shrdl	$7,%ebx,%ebx
+	addl	%eax,%edi
+	vpxor	%xmm3,%xmm1,%xmm1
+	addl	20(%esp),%edx
+	xorl	%ebx,%ebp
+	movl	%edi,%esi
+	shldl	$5,%edi,%edi
+	vpsrld	$30,%xmm1,%xmm3
+	vmovdqa	%xmm4,(%esp)
+	addl	%ebp,%edx
+	xorl	%ebx,%esi
+	shrdl	$7,%eax,%eax
+	addl	%edi,%edx
+	vpslld	$2,%xmm1,%xmm1
+	addl	24(%esp),%ecx
+	xorl	%eax,%esi
+	movl	%edx,%ebp
+	shldl	$5,%edx,%edx
+	addl	%esi,%ecx
+	xorl	%eax,%ebp
+	shrdl	$7,%edi,%edi
+	addl	%edx,%ecx
+	vpor	%xmm3,%xmm1,%xmm1
+	addl	28(%esp),%ebx
+	xorl	%edi,%ebp
+	vmovdqa	64(%esp),%xmm3
+	movl	%ecx,%esi
+	shldl	$5,%ecx,%ecx
+	addl	%ebp,%ebx
+	xorl	%edi,%esi
+	shrdl	$7,%edx,%edx
+	addl	%ecx,%ebx
+	vpalignr	$8,%xmm0,%xmm1,%xmm4
+	vpxor	%xmm6,%xmm2,%xmm2
+	addl	32(%esp),%eax
+	xorl	%edx,%esi
+	movl	%ebx,%ebp
+	shldl	$5,%ebx,%ebx
+	vpxor	%xmm3,%xmm2,%xmm2
+	vmovdqa	%xmm6,64(%esp)
+	addl	%esi,%eax
+	xorl	%edx,%ebp
+	vmovdqa	128(%esp),%xmm6
+	vpaddd	%xmm1,%xmm5,%xmm5
+	shrdl	$7,%ecx,%ecx
+	addl	%ebx,%eax
+	vpxor	%xmm4,%xmm2,%xmm2
+	addl	36(%esp),%edi
+	xorl	%ecx,%ebp
+	movl	%eax,%esi
+	shldl	$5,%eax,%eax
+	vpsrld	$30,%xmm2,%xmm4
+	vmovdqa	%xmm5,16(%esp)
+	addl	%ebp,%edi
+	xorl	%ecx,%esi
+	shrdl	$7,%ebx,%ebx
+	addl	%eax,%edi
+	vpslld	$2,%xmm2,%xmm2
+	addl	40(%esp),%edx
+	xorl	%ebx,%esi
+	movl	%edi,%ebp
+	shldl	$5,%edi,%edi
+	addl	%esi,%edx
+	xorl	%ebx,%ebp
+	shrdl	$7,%eax,%eax
+	addl	%edi,%edx
+	vpor	%xmm4,%xmm2,%xmm2
+	addl	44(%esp),%ecx
+	xorl	%eax,%ebp
+	vmovdqa	80(%esp),%xmm4
+	movl	%edx,%esi
+	shldl	$5,%edx,%edx
+	addl	%ebp,%ecx
+	xorl	%eax,%esi
+	shrdl	$7,%edi,%edi
+	addl	%edx,%ecx
+	vpalignr	$8,%xmm1,%xmm2,%xmm5
+	vpxor	%xmm7,%xmm3,%xmm3
+	addl	48(%esp),%ebx
+	xorl	%edi,%esi
+	movl	%ecx,%ebp
+	shldl	$5,%ecx,%ecx
+	vpxor	%xmm4,%xmm3,%xmm3
+	vmovdqa	%xmm7,80(%esp)
+	addl	%esi,%ebx
+	xorl	%edi,%ebp
+	vmovdqa	%xmm6,%xmm7
+	vpaddd	%xmm2,%xmm6,%xmm6
+	shrdl	$7,%edx,%edx
+	addl	%ecx,%ebx
+	vpxor	%xmm5,%xmm3,%xmm3
+	addl	52(%esp),%eax
+	xorl	%edx,%ebp
+	movl	%ebx,%esi
+	shldl	$5,%ebx,%ebx
+	vpsrld	$30,%xmm3,%xmm5
+	vmovdqa	%xmm6,32(%esp)
+	addl	%ebp,%eax
+	xorl	%edx,%esi
+	shrdl	$7,%ecx,%ecx
+	addl	%ebx,%eax
+	vpslld	$2,%xmm3,%xmm3
+	addl	56(%esp),%edi
+	xorl	%ecx,%esi
+	movl	%eax,%ebp
+	shldl	$5,%eax,%eax
+	addl	%esi,%edi
+	xorl	%ecx,%ebp
+	shrdl	$7,%ebx,%ebx
+	addl	%eax,%edi
+	vpor	%xmm5,%xmm3,%xmm3
+	addl	60(%esp),%edx
+	xorl	%ebx,%ebp
+	vmovdqa	96(%esp),%xmm5
+	movl	%edi,%esi
+	shldl	$5,%edi,%edi
+	addl	%ebp,%edx
+	xorl	%ebx,%esi
+	shrdl	$7,%eax,%eax
+	addl	%edi,%edx
+	vpalignr	$8,%xmm2,%xmm3,%xmm6
+	vpxor	%xmm0,%xmm4,%xmm4
+	addl	(%esp),%ecx
+	xorl	%eax,%esi
+	movl	%edx,%ebp
+	shldl	$5,%edx,%edx
+	vpxor	%xmm5,%xmm4,%xmm4
+	vmovdqa	%xmm0,96(%esp)
+	addl	%esi,%ecx
+	xorl	%eax,%ebp
+	vmovdqa	%xmm7,%xmm0
+	vpaddd	%xmm3,%xmm7,%xmm7
+	shrdl	$7,%edi,%edi
+	addl	%edx,%ecx
+	vpxor	%xmm6,%xmm4,%xmm4
+	addl	4(%esp),%ebx
+	xorl	%edi,%ebp
+	movl	%ecx,%esi
+	shldl	$5,%ecx,%ecx
+	vpsrld	$30,%xmm4,%xmm6
+	vmovdqa	%xmm7,48(%esp)
+	addl	%ebp,%ebx
+	xorl	%edi,%esi
+	shrdl	$7,%edx,%edx
+	addl	%ecx,%ebx
+	vpslld	$2,%xmm4,%xmm4
+	addl	8(%esp),%eax
+	xorl	%edx,%esi
+	movl	%ebx,%ebp
+	shldl	$5,%ebx,%ebx
+	addl	%esi,%eax
+	xorl	%edx,%ebp
+	shrdl	$7,%ecx,%ecx
+	addl	%ebx,%eax
+	vpor	%xmm6,%xmm4,%xmm4
+	addl	12(%esp),%edi
+	xorl	%ecx,%ebp
+	vmovdqa	64(%esp),%xmm6
+	movl	%eax,%esi
+	shldl	$5,%eax,%eax
+	addl	%ebp,%edi
+	xorl	%ecx,%esi
+	shrdl	$7,%ebx,%ebx
+	addl	%eax,%edi
+	vpalignr	$8,%xmm3,%xmm4,%xmm7
+	vpxor	%xmm1,%xmm5,%xmm5
+	addl	16(%esp),%edx
+	xorl	%ebx,%esi
+	movl	%edi,%ebp
+	shldl	$5,%edi,%edi
+	vpxor	%xmm6,%xmm5,%xmm5
+	vmovdqa	%xmm1,64(%esp)
+	addl	%esi,%edx
+	xorl	%ebx,%ebp
+	vmovdqa	%xmm0,%xmm1
+	vpaddd	%xmm4,%xmm0,%xmm0
+	shrdl	$7,%eax,%eax
+	addl	%edi,%edx
+	vpxor	%xmm7,%xmm5,%xmm5
+	addl	20(%esp),%ecx
+	xorl	%eax,%ebp
+	movl	%edx,%esi
+	shldl	$5,%edx,%edx
+	vpsrld	$30,%xmm5,%xmm7
+	vmovdqa	%xmm0,(%esp)
+	addl	%ebp,%ecx
+	xorl	%eax,%esi
+	shrdl	$7,%edi,%edi
+	addl	%edx,%ecx
+	vpslld	$2,%xmm5,%xmm5
+	addl	24(%esp),%ebx
+	xorl	%edi,%esi
+	movl	%ecx,%ebp
+	shldl	$5,%ecx,%ecx
+	addl	%esi,%ebx
+	xorl	%edi,%ebp
+	shrdl	$7,%edx,%edx
+	addl	%ecx,%ebx
+	vpor	%xmm7,%xmm5,%xmm5
+	addl	28(%esp),%eax
+	vmovdqa	80(%esp),%xmm7
+	shrdl	$7,%ecx,%ecx
+	movl	%ebx,%esi
+	xorl	%edx,%ebp
+	shldl	$5,%ebx,%ebx
+	addl	%ebp,%eax
+	xorl	%ecx,%esi
+	xorl	%edx,%ecx
+	addl	%ebx,%eax
+	vpalignr	$8,%xmm4,%xmm5,%xmm0
+	vpxor	%xmm2,%xmm6,%xmm6
+	addl	32(%esp),%edi
+	andl	%ecx,%esi
+	xorl	%edx,%ecx
+	shrdl	$7,%ebx,%ebx
+	vpxor	%xmm7,%xmm6,%xmm6
+	vmovdqa	%xmm2,80(%esp)
+	movl	%eax,%ebp
+	xorl	%ecx,%esi
+	vmovdqa	%xmm1,%xmm2
+	vpaddd	%xmm5,%xmm1,%xmm1
+	shldl	$5,%eax,%eax
+	addl	%esi,%edi
+	vpxor	%xmm0,%xmm6,%xmm6
+	xorl	%ebx,%ebp
+	xorl	%ecx,%ebx
+	addl	%eax,%edi
+	addl	36(%esp),%edx
+	vpsrld	$30,%xmm6,%xmm0
+	vmovdqa	%xmm1,16(%esp)
+	andl	%ebx,%ebp
+	xorl	%ecx,%ebx
+	shrdl	$7,%eax,%eax
+	movl	%edi,%esi
+	vpslld	$2,%xmm6,%xmm6
+	xorl	%ebx,%ebp
+	shldl	$5,%edi,%edi
+	addl	%ebp,%edx
+	xorl	%eax,%esi
+	xorl	%ebx,%eax
+	addl	%edi,%edx
+	addl	40(%esp),%ecx
+	andl	%eax,%esi
+	vpor	%xmm0,%xmm6,%xmm6
+	xorl	%ebx,%eax
+	shrdl	$7,%edi,%edi
+	vmovdqa	96(%esp),%xmm0
+	movl	%edx,%ebp
+	xorl	%eax,%esi
+	shldl	$5,%edx,%edx
+	addl	%esi,%ecx
+	xorl	%edi,%ebp
+	xorl	%eax,%edi
+	addl	%edx,%ecx
+	addl	44(%esp),%ebx
+	andl	%edi,%ebp
+	xorl	%eax,%edi
+	shrdl	$7,%edx,%edx
+	movl	%ecx,%esi
+	xorl	%edi,%ebp
+	shldl	$5,%ecx,%ecx
+	addl	%ebp,%ebx
+	xorl	%edx,%esi
+	xorl	%edi,%edx
+	addl	%ecx,%ebx
+	vpalignr	$8,%xmm5,%xmm6,%xmm1
+	vpxor	%xmm3,%xmm7,%xmm7
+	addl	48(%esp),%eax
+	andl	%edx,%esi
+	xorl	%edi,%edx
+	shrdl	$7,%ecx,%ecx
+	vpxor	%xmm0,%xmm7,%xmm7
+	vmovdqa	%xmm3,96(%esp)
+	movl	%ebx,%ebp
+	xorl	%edx,%esi
+	vmovdqa	144(%esp),%xmm3
+	vpaddd	%xmm6,%xmm2,%xmm2
+	shldl	$5,%ebx,%ebx
+	addl	%esi,%eax
+	vpxor	%xmm1,%xmm7,%xmm7
+	xorl	%ecx,%ebp
+	xorl	%edx,%ecx
+	addl	%ebx,%eax
+	addl	52(%esp),%edi
+	vpsrld	$30,%xmm7,%xmm1
+	vmovdqa	%xmm2,32(%esp)
+	andl	%ecx,%ebp
+	xorl	%edx,%ecx
+	shrdl	$7,%ebx,%ebx
+	movl	%eax,%esi
+	vpslld	$2,%xmm7,%xmm7
+	xorl	%ecx,%ebp
+	shldl	$5,%eax,%eax
+	addl	%ebp,%edi
+	xorl	%ebx,%esi
+	xorl	%ecx,%ebx
+	addl	%eax,%edi
+	addl	56(%esp),%edx
+	andl	%ebx,%esi
+	vpor	%xmm1,%xmm7,%xmm7
+	xorl	%ecx,%ebx
+	shrdl	$7,%eax,%eax
+	vmovdqa	64(%esp),%xmm1
+	movl	%edi,%ebp
+	xorl	%ebx,%esi
+	shldl	$5,%edi,%edi
+	addl	%esi,%edx
+	xorl	%eax,%ebp
+	xorl	%ebx,%eax
+	addl	%edi,%edx
+	addl	60(%esp),%ecx
+	andl	%eax,%ebp
+	xorl	%ebx,%eax
+	shrdl	$7,%edi,%edi
+	movl	%edx,%esi
+	xorl	%eax,%ebp
+	shldl	$5,%edx,%edx
+	addl	%ebp,%ecx
+	xorl	%edi,%esi
+	xorl	%eax,%edi
+	addl	%edx,%ecx
+	vpalignr	$8,%xmm6,%xmm7,%xmm2
+	vpxor	%xmm4,%xmm0,%xmm0
+	addl	(%esp),%ebx
+	andl	%edi,%esi
+	xorl	%eax,%edi
+	shrdl	$7,%edx,%edx
+	vpxor	%xmm1,%xmm0,%xmm0
+	vmovdqa	%xmm4,64(%esp)
+	movl	%ecx,%ebp
+	xorl	%edi,%esi
+	vmovdqa	%xmm3,%xmm4
+	vpaddd	%xmm7,%xmm3,%xmm3
+	shldl	$5,%ecx,%ecx
+	addl	%esi,%ebx
+	vpxor	%xmm2,%xmm0,%xmm0
+	xorl	%edx,%ebp
+	xorl	%edi,%edx
+	addl	%ecx,%ebx
+	addl	4(%esp),%eax
+	vpsrld	$30,%xmm0,%xmm2
+	vmovdqa	%xmm3,48(%esp)
+	andl	%edx,%ebp
+	xorl	%edi,%edx
+	shrdl	$7,%ecx,%ecx
+	movl	%ebx,%esi
+	vpslld	$2,%xmm0,%xmm0
+	xorl	%edx,%ebp
+	shldl	$5,%ebx,%ebx
+	addl	%ebp,%eax
+	xorl	%ecx,%esi
+	xorl	%edx,%ecx
+	addl	%ebx,%eax
+	addl	8(%esp),%edi
+	andl	%ecx,%esi
+	vpor	%xmm2,%xmm0,%xmm0
+	xorl	%edx,%ecx
+	shrdl	$7,%ebx,%ebx
+	vmovdqa	80(%esp),%xmm2
+	movl	%eax,%ebp
+	xorl	%ecx,%esi
+	shldl	$5,%eax,%eax
+	addl	%esi,%edi
+	xorl	%ebx,%ebp
+	xorl	%ecx,%ebx
+	addl	%eax,%edi
+	addl	12(%esp),%edx
+	andl	%ebx,%ebp
+	xorl	%ecx,%ebx
+	shrdl	$7,%eax,%eax
+	movl	%edi,%esi
+	xorl	%ebx,%ebp
+	shldl	$5,%edi,%edi
+	addl	%ebp,%edx
+	xorl	%eax,%esi
+	xorl	%ebx,%eax
+	addl	%edi,%edx
+	vpalignr	$8,%xmm7,%xmm0,%xmm3
+	vpxor	%xmm5,%xmm1,%xmm1
+	addl	16(%esp),%ecx
+	andl	%eax,%esi
+	xorl	%ebx,%eax
+	shrdl	$7,%edi,%edi
+	vpxor	%xmm2,%xmm1,%xmm1
+	vmovdqa	%xmm5,80(%esp)
+	movl	%edx,%ebp
+	xorl	%eax,%esi
+	vmovdqa	%xmm4,%xmm5
+	vpaddd	%xmm0,%xmm4,%xmm4
+	shldl	$5,%edx,%edx
+	addl	%esi,%ecx
+	vpxor	%xmm3,%xmm1,%xmm1
+	xorl	%edi,%ebp
+	xorl	%eax,%edi
+	addl	%edx,%ecx
+	addl	20(%esp),%ebx
+	vpsrld	$30,%xmm1,%xmm3
+	vmovdqa	%xmm4,(%esp)
+	andl	%edi,%ebp
+	xorl	%eax,%edi
+	shrdl	$7,%edx,%edx
+	movl	%ecx,%esi
+	vpslld	$2,%xmm1,%xmm1
+	xorl	%edi,%ebp
+	shldl	$5,%ecx,%ecx
+	addl	%ebp,%ebx
+	xorl	%edx,%esi
+	xorl	%edi,%edx
+	addl	%ecx,%ebx
+	addl	24(%esp),%eax
+	andl	%edx,%esi
+	vpor	%xmm3,%xmm1,%xmm1
+	xorl	%edi,%edx
+	shrdl	$7,%ecx,%ecx
+	vmovdqa	96(%esp),%xmm3
+	movl	%ebx,%ebp
+	xorl	%edx,%esi
+	shldl	$5,%ebx,%ebx
+	addl	%esi,%eax
+	xorl	%ecx,%ebp
+	xorl	%edx,%ecx
+	addl	%ebx,%eax
+	addl	28(%esp),%edi
+	andl	%ecx,%ebp
+	xorl	%edx,%ecx
+	shrdl	$7,%ebx,%ebx
+	movl	%eax,%esi
+	xorl	%ecx,%ebp
+	shldl	$5,%eax,%eax
+	addl	%ebp,%edi
+	xorl	%ebx,%esi
+	xorl	%ecx,%ebx
+	addl	%eax,%edi
+	vpalignr	$8,%xmm0,%xmm1,%xmm4
+	vpxor	%xmm6,%xmm2,%xmm2
+	addl	32(%esp),%edx
+	andl	%ebx,%esi
+	xorl	%ecx,%ebx
+	shrdl	$7,%eax,%eax
+	vpxor	%xmm3,%xmm2,%xmm2
+	vmovdqa	%xmm6,96(%esp)
+	movl	%edi,%ebp
+	xorl	%ebx,%esi
+	vmovdqa	%xmm5,%xmm6
+	vpaddd	%xmm1,%xmm5,%xmm5
+	shldl	$5,%edi,%edi
+	addl	%esi,%edx
+	vpxor	%xmm4,%xmm2,%xmm2
+	xorl	%eax,%ebp
+	xorl	%ebx,%eax
+	addl	%edi,%edx
+	addl	36(%esp),%ecx
+	vpsrld	$30,%xmm2,%xmm4
+	vmovdqa	%xmm5,16(%esp)
+	andl	%eax,%ebp
+	xorl	%ebx,%eax
+	shrdl	$7,%edi,%edi
+	movl	%edx,%esi
+	vpslld	$2,%xmm2,%xmm2
+	xorl	%eax,%ebp
+	shldl	$5,%edx,%edx
+	addl	%ebp,%ecx
+	xorl	%edi,%esi
+	xorl	%eax,%edi
+	addl	%edx,%ecx
+	addl	40(%esp),%ebx
+	andl	%edi,%esi
+	vpor	%xmm4,%xmm2,%xmm2
+	xorl	%eax,%edi
+	shrdl	$7,%edx,%edx
+	vmovdqa	64(%esp),%xmm4
+	movl	%ecx,%ebp
+	xorl	%edi,%esi
+	shldl	$5,%ecx,%ecx
+	addl	%esi,%ebx
+	xorl	%edx,%ebp
+	xorl	%edi,%edx
+	addl	%ecx,%ebx
+	addl	44(%esp),%eax
+	andl	%edx,%ebp
+	xorl	%edi,%edx
+	shrdl	$7,%ecx,%ecx
+	movl	%ebx,%esi
+	xorl	%edx,%ebp
+	shldl	$5,%ebx,%ebx
+	addl	%ebp,%eax
+	xorl	%edx,%esi
+	addl	%ebx,%eax
+	vpalignr	$8,%xmm1,%xmm2,%xmm5
+	vpxor	%xmm7,%xmm3,%xmm3
+	addl	48(%esp),%edi
+	xorl	%ecx,%esi
+	movl	%eax,%ebp
+	shldl	$5,%eax,%eax
+	vpxor	%xmm4,%xmm3,%xmm3
+	vmovdqa	%xmm7,64(%esp)
+	addl	%esi,%edi
+	xorl	%ecx,%ebp
+	vmovdqa	%xmm6,%xmm7
+	vpaddd	%xmm2,%xmm6,%xmm6
+	shrdl	$7,%ebx,%ebx
+	addl	%eax,%edi
+	vpxor	%xmm5,%xmm3,%xmm3
+	addl	52(%esp),%edx
+	xorl	%ebx,%ebp
+	movl	%edi,%esi
+	shldl	$5,%edi,%edi
+	vpsrld	$30,%xmm3,%xmm5
+	vmovdqa	%xmm6,32(%esp)
+	addl	%ebp,%edx
+	xorl	%ebx,%esi
+	shrdl	$7,%eax,%eax
+	addl	%edi,%edx
+	vpslld	$2,%xmm3,%xmm3
+	addl	56(%esp),%ecx
+	xorl	%eax,%esi
+	movl	%edx,%ebp
+	shldl	$5,%edx,%edx
+	addl	%esi,%ecx
+	xorl	%eax,%ebp
+	shrdl	$7,%edi,%edi
+	addl	%edx,%ecx
+	vpor	%xmm5,%xmm3,%xmm3
+	addl	60(%esp),%ebx
+	xorl	%edi,%ebp
+	movl	%ecx,%esi
+	shldl	$5,%ecx,%ecx
+	addl	%ebp,%ebx
+	xorl	%edi,%esi
+	shrdl	$7,%edx,%edx
+	addl	%ecx,%ebx
+	addl	(%esp),%eax
+	vpaddd	%xmm3,%xmm7,%xmm7
+	xorl	%edx,%esi
+	movl	%ebx,%ebp
+	shldl	$5,%ebx,%ebx
+	addl	%esi,%eax
+	vmovdqa	%xmm7,48(%esp)
+	xorl	%edx,%ebp
+	shrdl	$7,%ecx,%ecx
+	addl	%ebx,%eax
+	addl	4(%esp),%edi
+	xorl	%ecx,%ebp
+	movl	%eax,%esi
+	shldl	$5,%eax,%eax
+	addl	%ebp,%edi
+	xorl	%ecx,%esi
+	shrdl	$7,%ebx,%ebx
+	addl	%eax,%edi
+	addl	8(%esp),%edx
+	xorl	%ebx,%esi
+	movl	%edi,%ebp
+	shldl	$5,%edi,%edi
+	addl	%esi,%edx
+	xorl	%ebx,%ebp
+	shrdl	$7,%eax,%eax
+	addl	%edi,%edx
+	addl	12(%esp),%ecx
+	xorl	%eax,%ebp
+	movl	%edx,%esi
+	shldl	$5,%edx,%edx
+	addl	%ebp,%ecx
+	xorl	%eax,%esi
+	shrdl	$7,%edi,%edi
+	addl	%edx,%ecx
+	movl	196(%esp),%ebp
+	cmpl	200(%esp),%ebp
+	je	.L008done
+	vmovdqa	160(%esp),%xmm7
+	vmovdqa	176(%esp),%xmm6
+	vmovdqu	(%ebp),%xmm0
+	vmovdqu	16(%ebp),%xmm1
+	vmovdqu	32(%ebp),%xmm2
+	vmovdqu	48(%ebp),%xmm3
+	addl	$64,%ebp
+	vpshufb	%xmm6,%xmm0,%xmm0
+	movl	%ebp,196(%esp)
+	vmovdqa	%xmm7,96(%esp)
+	addl	16(%esp),%ebx
+	xorl	%edi,%esi
+	vpshufb	%xmm6,%xmm1,%xmm1
+	movl	%ecx,%ebp
+	shldl	$5,%ecx,%ecx
+	vpaddd	%xmm7,%xmm0,%xmm4
+	addl	%esi,%ebx
+	xorl	%edi,%ebp
+	shrdl	$7,%edx,%edx
+	addl	%ecx,%ebx
+	vmovdqa	%xmm4,(%esp)
+	addl	20(%esp),%eax
+	xorl	%edx,%ebp
+	movl	%ebx,%esi
+	shldl	$5,%ebx,%ebx
+	addl	%ebp,%eax
+	xorl	%edx,%esi
+	shrdl	$7,%ecx,%ecx
+	addl	%ebx,%eax
+	addl	24(%esp),%edi
+	xorl	%ecx,%esi
+	movl	%eax,%ebp
+	shldl	$5,%eax,%eax
+	addl	%esi,%edi
+	xorl	%ecx,%ebp
+	shrdl	$7,%ebx,%ebx
+	addl	%eax,%edi
+	addl	28(%esp),%edx
+	xorl	%ebx,%ebp
+	movl	%edi,%esi
+	shldl	$5,%edi,%edi
+	addl	%ebp,%edx
+	xorl	%ebx,%esi
+	shrdl	$7,%eax,%eax
+	addl	%edi,%edx
+	addl	32(%esp),%ecx
+	xorl	%eax,%esi
+	vpshufb	%xmm6,%xmm2,%xmm2
+	movl	%edx,%ebp
+	shldl	$5,%edx,%edx
+	vpaddd	%xmm7,%xmm1,%xmm5
+	addl	%esi,%ecx
+	xorl	%eax,%ebp
+	shrdl	$7,%edi,%edi
+	addl	%edx,%ecx
+	vmovdqa	%xmm5,16(%esp)
+	addl	36(%esp),%ebx
+	xorl	%edi,%ebp
+	movl	%ecx,%esi
+	shldl	$5,%ecx,%ecx
+	addl	%ebp,%ebx
+	xorl	%edi,%esi
+	shrdl	$7,%edx,%edx
+	addl	%ecx,%ebx
+	addl	40(%esp),%eax
+	xorl	%edx,%esi
+	movl	%ebx,%ebp
+	shldl	$5,%ebx,%ebx
+	addl	%esi,%eax
+	xorl	%edx,%ebp
+	shrdl	$7,%ecx,%ecx
+	addl	%ebx,%eax
+	addl	44(%esp),%edi
+	xorl	%ecx,%ebp
+	movl	%eax,%esi
+	shldl	$5,%eax,%eax
+	addl	%ebp,%edi
+	xorl	%ecx,%esi
+	shrdl	$7,%ebx,%ebx
+	addl	%eax,%edi
+	addl	48(%esp),%edx
+	xorl	%ebx,%esi
+	vpshufb	%xmm6,%xmm3,%xmm3
+	movl	%edi,%ebp
+	shldl	$5,%edi,%edi
+	vpaddd	%xmm7,%xmm2,%xmm6
+	addl	%esi,%edx
+	xorl	%ebx,%ebp
+	shrdl	$7,%eax,%eax
+	addl	%edi,%edx
+	vmovdqa	%xmm6,32(%esp)
+	addl	52(%esp),%ecx
+	xorl	%eax,%ebp
+	movl	%edx,%esi
+	shldl	$5,%edx,%edx
+	addl	%ebp,%ecx
+	xorl	%eax,%esi
+	shrdl	$7,%edi,%edi
+	addl	%edx,%ecx
+	addl	56(%esp),%ebx
+	xorl	%edi,%esi
+	movl	%ecx,%ebp
+	shldl	$5,%ecx,%ecx
+	addl	%esi,%ebx
+	xorl	%edi,%ebp
+	shrdl	$7,%edx,%edx
+	addl	%ecx,%ebx
+	addl	60(%esp),%eax
+	xorl	%edx,%ebp
+	movl	%ebx,%esi
+	shldl	$5,%ebx,%ebx
+	addl	%ebp,%eax
+	shrdl	$7,%ecx,%ecx
+	addl	%ebx,%eax
+	movl	192(%esp),%ebp
+	addl	(%ebp),%eax
+	addl	4(%ebp),%esi
+	addl	8(%ebp),%ecx
+	movl	%eax,(%ebp)
+	addl	12(%ebp),%edx
+	movl	%esi,4(%ebp)
+	addl	16(%ebp),%edi
+	movl	%ecx,%ebx
+	movl	%ecx,8(%ebp)
+	xorl	%edx,%ebx
+	movl	%edx,12(%ebp)
+	movl	%edi,16(%ebp)
+	movl	%esi,%ebp
+	andl	%ebx,%esi
+	movl	%ebp,%ebx
+	jmp	.L007loop
+.align	16
+.L008done:
+	addl	16(%esp),%ebx
+	xorl	%edi,%esi
+	movl	%ecx,%ebp
+	shldl	$5,%ecx,%ecx
+	addl	%esi,%ebx
+	xorl	%edi,%ebp
+	shrdl	$7,%edx,%edx
+	addl	%ecx,%ebx
+	addl	20(%esp),%eax
+	xorl	%edx,%ebp
+	movl	%ebx,%esi
+	shldl	$5,%ebx,%ebx
+	addl	%ebp,%eax
+	xorl	%edx,%esi
+	shrdl	$7,%ecx,%ecx
+	addl	%ebx,%eax
+	addl	24(%esp),%edi
+	xorl	%ecx,%esi
+	movl	%eax,%ebp
+	shldl	$5,%eax,%eax
+	addl	%esi,%edi
+	xorl	%ecx,%ebp
+	shrdl	$7,%ebx,%ebx
+	addl	%eax,%edi
+	addl	28(%esp),%edx
+	xorl	%ebx,%ebp
+	movl	%edi,%esi
+	shldl	$5,%edi,%edi
+	addl	%ebp,%edx
+	xorl	%ebx,%esi
+	shrdl	$7,%eax,%eax
+	addl	%edi,%edx
+	addl	32(%esp),%ecx
+	xorl	%eax,%esi
+	movl	%edx,%ebp
+	shldl	$5,%edx,%edx
+	addl	%esi,%ecx
+	xorl	%eax,%ebp
+	shrdl	$7,%edi,%edi
+	addl	%edx,%ecx
+	addl	36(%esp),%ebx
+	xorl	%edi,%ebp
+	movl	%ecx,%esi
+	shldl	$5,%ecx,%ecx
+	addl	%ebp,%ebx
+	xorl	%edi,%esi
+	shrdl	$7,%edx,%edx
+	addl	%ecx,%ebx
+	addl	40(%esp),%eax
+	xorl	%edx,%esi
+	movl	%ebx,%ebp
+	shldl	$5,%ebx,%ebx
+	addl	%esi,%eax
+	xorl	%edx,%ebp
+	shrdl	$7,%ecx,%ecx
+	addl	%ebx,%eax
+	addl	44(%esp),%edi
+	xorl	%ecx,%ebp
+	movl	%eax,%esi
+	shldl	$5,%eax,%eax
+	addl	%ebp,%edi
+	xorl	%ecx,%esi
+	shrdl	$7,%ebx,%ebx
+	addl	%eax,%edi
+	addl	48(%esp),%edx
+	xorl	%ebx,%esi
+	movl	%edi,%ebp
+	shldl	$5,%edi,%edi
+	addl	%esi,%edx
+	xorl	%ebx,%ebp
+	shrdl	$7,%eax,%eax
+	addl	%edi,%edx
+	addl	52(%esp),%ecx
+	xorl	%eax,%ebp
+	movl	%edx,%esi
+	shldl	$5,%edx,%edx
+	addl	%ebp,%ecx
+	xorl	%eax,%esi
+	shrdl	$7,%edi,%edi
+	addl	%edx,%ecx
+	addl	56(%esp),%ebx
+	xorl	%edi,%esi
+	movl	%ecx,%ebp
+	shldl	$5,%ecx,%ecx
+	addl	%esi,%ebx
+	xorl	%edi,%ebp
+	shrdl	$7,%edx,%edx
+	addl	%ecx,%ebx
+	addl	60(%esp),%eax
+	xorl	%edx,%ebp
+	movl	%ebx,%esi
+	shldl	$5,%ebx,%ebx
+	addl	%ebp,%eax
+	shrdl	$7,%ecx,%ecx
+	addl	%ebx,%eax
+	vzeroall
+	movl	192(%esp),%ebp
+	addl	(%ebp),%eax
+	movl	204(%esp),%esp
+	addl	4(%ebp),%esi
+	addl	8(%ebp),%ecx
+	movl	%eax,(%ebp)
+	addl	12(%ebp),%edx
+	movl	%esi,4(%ebp)
+	addl	16(%ebp),%edi
+	movl	%ecx,8(%ebp)
+	movl	%edx,12(%ebp)
+	movl	%edi,16(%ebp)
+	popl	%edi
+	popl	%esi
+	popl	%ebx
+	popl	%ebp
+	ret
+.size	_sha1_block_data_order_avx,.-_sha1_block_data_order_avx
+.align	64
+.LK_XX_XX:
+.long	1518500249,1518500249,1518500249,1518500249
+.long	1859775393,1859775393,1859775393,1859775393
+.long	2400959708,2400959708,2400959708,2400959708
+.long	3395469782,3395469782,3395469782,3395469782
+.long	66051,67438087,134810123,202182159
+.byte	15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0
+.byte	83,72,65,49,32,98,108,111,99,107,32,116,114,97,110,115
+.byte	102,111,114,109,32,102,111,114,32,120,56,54,44,32,67,82
+.byte	89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112
+.byte	114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0
+#endif
+.section	.note.GNU-stack,"",@progbits
--- /dev/null
+++ b/third_party/boringssl/linux-x86/crypto/fipsmodule/sha256-586.S
@@ -1,0 +1,5567 @@
+// This file is generated from a similarly-named Perl script in the BoringSSL
+// source tree. Do not edit by hand.
+
+#if defined(__i386__)
+#if defined(BORINGSSL_PREFIX)
+#include <boringssl_prefix_symbols_asm.h>
+#endif
+.text
+.globl	sha256_block_data_order
+.hidden	sha256_block_data_order
+.type	sha256_block_data_order,@function
+.align	16
+sha256_block_data_order:
+.L_sha256_block_data_order_begin:
+	pushl	%ebp
+	pushl	%ebx
+	pushl	%esi
+	pushl	%edi
+	movl	20(%esp),%esi
+	movl	24(%esp),%edi
+	movl	28(%esp),%eax
+	movl	%esp,%ebx
+	call	.L000pic_point
+.L000pic_point:
+	popl	%ebp
+	leal	.L001K256-.L000pic_point(%ebp),%ebp
+	subl	$16,%esp
+	andl	$-64,%esp
+	shll	$6,%eax
+	addl	%edi,%eax
+	movl	%esi,(%esp)
+	movl	%edi,4(%esp)
+	movl	%eax,8(%esp)
+	movl	%ebx,12(%esp)
+	leal	OPENSSL_ia32cap_P-.L001K256(%ebp),%edx
+	movl	(%edx),%ecx
+	movl	4(%edx),%ebx
+	testl	$1048576,%ecx
+	jnz	.L002loop
+	movl	8(%edx),%edx
+	testl	$16777216,%ecx
+	jz	.L003no_xmm
+	andl	$1073741824,%ecx
+	andl	$268435968,%ebx
+	orl	%ebx,%ecx
+	andl	$1342177280,%ecx
+	cmpl	$1342177280,%ecx
+	je	.L004AVX
+	testl	$512,%ebx
+	jnz	.L005SSSE3
+.L003no_xmm:
+	subl	%edi,%eax
+	cmpl	$256,%eax
+	jae	.L006unrolled
+	jmp	.L002loop
+.align	16
+.L002loop:
+	movl	(%edi),%eax
+	movl	4(%edi),%ebx
+	movl	8(%edi),%ecx
+	bswap	%eax
+	movl	12(%edi),%edx
+	bswap	%ebx
+	pushl	%eax
+	bswap	%ecx
+	pushl	%ebx
+	bswap	%edx
+	pushl	%ecx
+	pushl	%edx
+	movl	16(%edi),%eax
+	movl	20(%edi),%ebx
+	movl	24(%edi),%ecx
+	bswap	%eax
+	movl	28(%edi),%edx
+	bswap	%ebx
+	pushl	%eax
+	bswap	%ecx
+	pushl	%ebx
+	bswap	%edx
+	pushl	%ecx
+	pushl	%edx
+	movl	32(%edi),%eax
+	movl	36(%edi),%ebx
+	movl	40(%edi),%ecx
+	bswap	%eax
+	movl	44(%edi),%edx
+	bswap	%ebx
+	pushl	%eax
+	bswap	%ecx
+	pushl	%ebx
+	bswap	%edx
+	pushl	%ecx
+	pushl	%edx
+	movl	48(%edi),%eax
+	movl	52(%edi),%ebx
+	movl	56(%edi),%ecx
+	bswap	%eax
+	movl	60(%edi),%edx
+	bswap	%ebx
+	pushl	%eax
+	bswap	%ecx
+	pushl	%ebx
+	bswap	%edx
+	pushl	%ecx
+	pushl	%edx
+	addl	$64,%edi
+	leal	-36(%esp),%esp
+	movl	%edi,104(%esp)
+	movl	(%esi),%eax
+	movl	4(%esi),%ebx
+	movl	8(%esi),%ecx
+	movl	12(%esi),%edi
+	movl	%ebx,8(%esp)
+	xorl	%ecx,%ebx
+	movl	%ecx,12(%esp)
+	movl	%edi,16(%esp)
+	movl	%ebx,(%esp)
+	movl	16(%esi),%edx
+	movl	20(%esi),%ebx
+	movl	24(%esi),%ecx
+	movl	28(%esi),%edi
+	movl	%ebx,24(%esp)
+	movl	%ecx,28(%esp)
+	movl	%edi,32(%esp)
+.align	16
+.L00700_15:
+	movl	%edx,%ecx
+	movl	24(%esp),%esi
+	rorl	$14,%ecx
+	movl	28(%esp),%edi
+	xorl	%edx,%ecx
+	xorl	%edi,%esi
+	movl	96(%esp),%ebx
+	rorl	$5,%ecx
+	andl	%edx,%esi
+	movl	%edx,20(%esp)
+	xorl	%ecx,%edx
+	addl	32(%esp),%ebx
+	xorl	%edi,%esi
+	rorl	$6,%edx
+	movl	%eax,%ecx
+	addl	%esi,%ebx
+	rorl	$9,%ecx
+	addl	%edx,%ebx
+	movl	8(%esp),%edi
+	xorl	%eax,%ecx
+	movl	%eax,4(%esp)
+	leal	-4(%esp),%esp
+	rorl	$11,%ecx
+	movl	(%ebp),%esi
+	xorl	%eax,%ecx
+	movl	20(%esp),%edx
+	xorl	%edi,%eax
+	rorl	$2,%ecx
+	addl	%esi,%ebx
+	movl	%eax,(%esp)
+	addl	%ebx,%edx
+	andl	4(%esp),%eax
+	addl	%ecx,%ebx
+	xorl	%edi,%eax
+	addl	$4,%ebp
+	addl	%ebx,%eax
+	cmpl	$3248222580,%esi
+	jne	.L00700_15
+	movl	156(%esp),%ecx
+	jmp	.L00816_63
+.align	16
+.L00816_63:
+	movl	%ecx,%ebx
+	movl	104(%esp),%esi
+	rorl	$11,%ecx
+	movl	%esi,%edi
+	rorl	$2,%esi
+	xorl	%ebx,%ecx
+	shrl	$3,%ebx
+	rorl	$7,%ecx
+	xorl	%edi,%esi
+	xorl	%ecx,%ebx
+	rorl	$17,%esi
+	addl	160(%esp),%ebx
+	shrl	$10,%edi
+	addl	124(%esp),%ebx
+	movl	%edx,%ecx
+	xorl	%esi,%edi
+	movl	24(%esp),%esi
+	rorl	$14,%ecx
+	addl	%edi,%ebx
+	movl	28(%esp),%edi
+	xorl	%edx,%ecx
+	xorl	%edi,%esi
+	movl	%ebx,96(%esp)
+	rorl	$5,%ecx
+	andl	%edx,%esi
+	movl	%edx,20(%esp)
+	xorl	%ecx,%edx
+	addl	32(%esp),%ebx
+	xorl	%edi,%esi
+	rorl	$6,%edx
+	movl	%eax,%ecx
+	addl	%esi,%ebx
+	rorl	$9,%ecx
+	addl	%edx,%ebx
+	movl	8(%esp),%edi
+	xorl	%eax,%ecx
+	movl	%eax,4(%esp)
+	leal	-4(%esp),%esp
+	rorl	$11,%ecx
+	movl	(%ebp),%esi
+	xorl	%eax,%ecx
+	movl	20(%esp),%edx
+	xorl	%edi,%eax
+	rorl	$2,%ecx
+	addl	%esi,%ebx
+	movl	%eax,(%esp)
+	addl	%ebx,%edx
+	andl	4(%esp),%eax
+	addl	%ecx,%ebx
+	xorl	%edi,%eax
+	movl	156(%esp),%ecx
+	addl	$4,%ebp
+	addl	%ebx,%eax
+	cmpl	$3329325298,%esi
+	jne	.L00816_63
+	movl	356(%esp),%esi
+	movl	8(%esp),%ebx
+	movl	16(%esp),%ecx
+	addl	(%esi),%eax
+	addl	4(%esi),%ebx
+	addl	8(%esi),%edi
+	addl	12(%esi),%ecx
+	movl	%eax,(%esi)
+	movl	%ebx,4(%esi)
+	movl	%edi,8(%esi)
+	movl	%ecx,12(%esi)
+	movl	24(%esp),%eax
+	movl	28(%esp),%ebx
+	movl	32(%esp),%ecx
+	movl	360(%esp),%edi
+	addl	16(%esi),%edx
+	addl	20(%esi),%eax
+	addl	24(%esi),%ebx
+	addl	28(%esi),%ecx
+	movl	%edx,16(%esi)
+	movl	%eax,20(%esi)
+	movl	%ebx,24(%esi)
+	movl	%ecx,28(%esi)
+	leal	356(%esp),%esp
+	subl	$256,%ebp
+	cmpl	8(%esp),%edi
+	jb	.L002loop
+	movl	12(%esp),%esp
+	popl	%edi
+	popl	%esi
+	popl	%ebx
+	popl	%ebp
+	ret
+.align	64
+.L001K256:
+.long	1116352408,1899447441,3049323471,3921009573,961987163,1508970993,2453635748,2870763221,3624381080,310598401,607225278,1426881987,1925078388,2162078206,2614888103,3248222580,3835390401,4022224774,264347078,604807628,770255983,1249150122,1555081692,1996064986,2554220882,2821834349,2952996808,3210313671,3336571891,3584528711,113926993,338241895,666307205,773529912,1294757372,1396182291,1695183700,1986661051,2177026350,2456956037,2730485921,2820302411,3259730800,3345764771,3516065817,3600352804,4094571909,275423344,430227734,506948616,659060556,883997877,958139571,1322822218,1537002063,1747873779,1955562222,2024104815,2227730452,2361852424,2428436474,2756734187,3204031479,3329325298
+.long	66051,67438087,134810123,202182159
+.byte	83,72,65,50,53,54,32,98,108,111,99,107,32,116,114,97
+.byte	110,115,102,111,114,109,32,102,111,114,32,120,56,54,44,32
+.byte	67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97
+.byte	112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103
+.byte	62,0
+.align	16
+.L006unrolled:
+	leal	-96(%esp),%esp
+	movl	(%esi),%eax
+	movl	4(%esi),%ebp
+	movl	8(%esi),%ecx
+	movl	12(%esi),%ebx
+	movl	%ebp,4(%esp)
+	xorl	%ecx,%ebp
+	movl	%ecx,8(%esp)
+	movl	%ebx,12(%esp)
+	movl	16(%esi),%edx
+	movl	20(%esi),%ebx
+	movl	24(%esi),%ecx
+	movl	28(%esi),%esi
+	movl	%ebx,20(%esp)
+	movl	%ecx,24(%esp)
+	movl	%esi,28(%esp)
+	jmp	.L009grand_loop
+.align	16
+.L009grand_loop:
+	movl	(%edi),%ebx
+	movl	4(%edi),%ecx
+	bswap	%ebx
+	movl	8(%edi),%esi
+	bswap	%ecx
+	movl	%ebx,32(%esp)
+	bswap	%esi
+	movl	%ecx,36(%esp)
+	movl	%esi,40(%esp)
+	movl	12(%edi),%ebx
+	movl	16(%edi),%ecx
+	bswap	%ebx
+	movl	20(%edi),%esi
+	bswap	%ecx
+	movl	%ebx,44(%esp)
+	bswap	%esi
+	movl	%ecx,48(%esp)
+	movl	%esi,52(%esp)
+	movl	24(%edi),%ebx
+	movl	28(%edi),%ecx
+	bswap	%ebx
+	movl	32(%edi),%esi
+	bswap	%ecx
+	movl	%ebx,56(%esp)
+	bswap	%esi
+	movl	%ecx,60(%esp)
+	movl	%esi,64(%esp)
+	movl	36(%edi),%ebx
+	movl	40(%edi),%ecx
+	bswap	%ebx
+	movl	44(%edi),%esi
+	bswap	%ecx
+	movl	%ebx,68(%esp)
+	bswap	%esi
+	movl	%ecx,72(%esp)
+	movl	%esi,76(%esp)
+	movl	48(%edi),%ebx
+	movl	52(%edi),%ecx
+	bswap	%ebx
+	movl	56(%edi),%esi
+	bswap	%ecx
+	movl	%ebx,80(%esp)
+	bswap	%esi
+	movl	%ecx,84(%esp)
+	movl	%esi,88(%esp)
+	movl	60(%edi),%ebx
+	addl	$64,%edi
+	bswap	%ebx
+	movl	%edi,100(%esp)
+	movl	%ebx,92(%esp)
+	movl	%edx,%ecx
+	movl	20(%esp),%esi
+	rorl	$14,%edx
+	movl	24(%esp),%edi
+	xorl	%ecx,%edx
+	movl	32(%esp),%ebx
+	xorl	%edi,%esi
+	rorl	$5,%edx
+	andl	%ecx,%esi
+	movl	%ecx,16(%esp)
+	xorl	%ecx,%edx
+	addl	28(%esp),%ebx
+	xorl	%esi,%edi
+	rorl	$6,%edx
+	movl	%eax,%ecx
+	addl	%edi,%ebx
+	rorl	$9,%ecx
+	movl	%eax,%esi
+	movl	4(%esp),%edi
+	xorl	%eax,%ecx
+	movl	%eax,(%esp)
+	xorl	%edi,%eax
+	rorl	$11,%ecx
+	andl	%eax,%ebp
+	leal	1116352408(%ebx,%edx,1),%edx
+	xorl	%esi,%ecx
+	xorl	%edi,%ebp
+	rorl	$2,%ecx
+	addl	%edx,%ebp
+	addl	12(%esp),%edx
+	addl	%ecx,%ebp
+	movl	%edx,%esi
+	movl	16(%esp),%ecx
+	rorl	$14,%edx
+	movl	20(%esp),%edi
+	xorl	%esi,%edx
+	movl	36(%esp),%ebx
+	xorl	%edi,%ecx
+	rorl	$5,%edx
+	andl	%esi,%ecx
+	movl	%esi,12(%esp)
+	xorl	%esi,%edx
+	addl	24(%esp),%ebx
+	xorl	%ecx,%edi
+	rorl	$6,%edx
+	movl	%ebp,%esi
+	addl	%edi,%ebx
+	rorl	$9,%esi
+	movl	%ebp,%ecx
+	movl	(%esp),%edi
+	xorl	%ebp,%esi
+	movl	%ebp,28(%esp)
+	xorl	%edi,%ebp
+	rorl	$11,%esi
+	andl	%ebp,%eax
+	leal	1899447441(%ebx,%edx,1),%edx
+	xorl	%ecx,%esi
+	xorl	%edi,%eax
+	rorl	$2,%esi
+	addl	%edx,%eax
+	addl	8(%esp),%edx
+	addl	%esi,%eax
+	movl	%edx,%ecx
+	movl	12(%esp),%esi
+	rorl	$14,%edx
+	movl	16(%esp),%edi
+	xorl	%ecx,%edx
+	movl	40(%esp),%ebx
+	xorl	%edi,%esi
+	rorl	$5,%edx
+	andl	%ecx,%esi
+	movl	%ecx,8(%esp)
+	xorl	%ecx,%edx
+	addl	20(%esp),%ebx
+	xorl	%esi,%edi
+	rorl	$6,%edx
+	movl	%eax,%ecx
+	addl	%edi,%ebx
+	rorl	$9,%ecx
+	movl	%eax,%esi
+	movl	28(%esp),%edi
+	xorl	%eax,%ecx
+	movl	%eax,24(%esp)
+	xorl	%edi,%eax
+	rorl	$11,%ecx
+	andl	%eax,%ebp
+	leal	3049323471(%ebx,%edx,1),%edx
+	xorl	%esi,%ecx
+	xorl	%edi,%ebp
+	rorl	$2,%ecx
+	addl	%edx,%ebp
+	addl	4(%esp),%edx
+	addl	%ecx,%ebp
+	movl	%edx,%esi
+	movl	8(%esp),%ecx
+	rorl	$14,%edx
+	movl	12(%esp),%edi
+	xorl	%esi,%edx
+	movl	44(%esp),%ebx
+	xorl	%edi,%ecx
+	rorl	$5,%edx
+	andl	%esi,%ecx
+	movl	%esi,4(%esp)
+	xorl	%esi,%edx
+	addl	16(%esp),%ebx
+	xorl	%ecx,%edi
+	rorl	$6,%edx
+	movl	%ebp,%esi
+	addl	%edi,%ebx
+	rorl	$9,%esi
+	movl	%ebp,%ecx
+	movl	24(%esp),%edi
+	xorl	%ebp,%esi
+	movl	%ebp,20(%esp)
+	xorl	%edi,%ebp
+	rorl	$11,%esi
+	andl	%ebp,%eax
+	leal	3921009573(%ebx,%edx,1),%edx
+	xorl	%ecx,%esi
+	xorl	%edi,%eax
+	rorl	$2,%esi
+	addl	%edx,%eax
+	addl	(%esp),%edx
+	addl	%esi,%eax
+	movl	%edx,%ecx
+	movl	4(%esp),%esi
+	rorl	$14,%edx
+	movl	8(%esp),%edi
+	xorl	%ecx,%edx
+	movl	48(%esp),%ebx
+	xorl	%edi,%esi
+	rorl	$5,%edx
+	andl	%ecx,%esi
+	movl	%ecx,(%esp)
+	xorl	%ecx,%edx
+	addl	12(%esp),%ebx
+	xorl	%esi,%edi
+	rorl	$6,%edx
+	movl	%eax,%ecx
+	addl	%edi,%ebx
+	rorl	$9,%ecx
+	movl	%eax,%esi
+	movl	20(%esp),%edi
+	xorl	%eax,%ecx
+	movl	%eax,16(%esp)
+	xorl	%edi,%eax
+	rorl	$11,%ecx
+	andl	%eax,%ebp
+	leal	961987163(%ebx,%edx,1),%edx
+	xorl	%esi,%ecx
+	xorl	%edi,%ebp
+	rorl	$2,%ecx
+	addl	%edx,%ebp
+	addl	28(%esp),%edx
+	addl	%ecx,%ebp
+	movl	%edx,%esi
+	movl	(%esp),%ecx
+	rorl	$14,%edx
+	movl	4(%esp),%edi
+	xorl	%esi,%edx
+	movl	52(%esp),%ebx
+	xorl	%edi,%ecx
+	rorl	$5,%edx
+	andl	%esi,%ecx
+	movl	%esi,28(%esp)
+	xorl	%esi,%edx
+	addl	8(%esp),%ebx
+	xorl	%ecx,%edi
+	rorl	$6,%edx
+	movl	%ebp,%esi
+	addl	%edi,%ebx
+	rorl	$9,%esi
+	movl	%ebp,%ecx
+	movl	16(%esp),%edi
+	xorl	%ebp,%esi
+	movl	%ebp,12(%esp)
+	xorl	%edi,%ebp
+	rorl	$11,%esi
+	andl	%ebp,%eax
+	leal	1508970993(%ebx,%edx,1),%edx
+	xorl	%ecx,%esi
+	xorl	%edi,%eax
+	rorl	$2,%esi
+	addl	%edx,%eax
+	addl	24(%esp),%edx
+	addl	%esi,%eax
+	movl	%edx,%ecx
+	movl	28(%esp),%esi
+	rorl	$14,%edx
+	movl	(%esp),%edi
+	xorl	%ecx,%edx
+	movl	56(%esp),%ebx
+	xorl	%edi,%esi
+	rorl	$5,%edx
+	andl	%ecx,%esi
+	movl	%ecx,24(%esp)
+	xorl	%ecx,%edx
+	addl	4(%esp),%ebx
+	xorl	%esi,%edi
+	rorl	$6,%edx
+	movl	%eax,%ecx
+	addl	%edi,%ebx
+	rorl	$9,%ecx
+	movl	%eax,%esi
+	movl	12(%esp),%edi
+	xorl	%eax,%ecx
+	movl	%eax,8(%esp)
+	xorl	%edi,%eax
+	rorl	$11,%ecx
+	andl	%eax,%ebp
+	leal	2453635748(%ebx,%edx,1),%edx
+	xorl	%esi,%ecx
+	xorl	%edi,%ebp
+	rorl	$2,%ecx
+	addl	%edx,%ebp
+	addl	20(%esp),%edx
+	addl	%ecx,%ebp
+	movl	%edx,%esi
+	movl	24(%esp),%ecx
+	rorl	$14,%edx
+	movl	28(%esp),%edi
+	xorl	%esi,%edx
+	movl	60(%esp),%ebx
+	xorl	%edi,%ecx
+	rorl	$5,%edx
+	andl	%esi,%ecx
+	movl	%esi,20(%esp)
+	xorl	%esi,%edx
+	addl	(%esp),%ebx
+	xorl	%ecx,%edi
+	rorl	$6,%edx
+	movl	%ebp,%esi
+	addl	%edi,%ebx
+	rorl	$9,%esi
+	movl	%ebp,%ecx
+	movl	8(%esp),%edi
+	xorl	%ebp,%esi
+	movl	%ebp,4(%esp)
+	xorl	%edi,%ebp
+	rorl	$11,%esi
+	andl	%ebp,%eax
+	leal	2870763221(%ebx,%edx,1),%edx
+	xorl	%ecx,%esi
+	xorl	%edi,%eax
+	rorl	$2,%esi
+	addl	%edx,%eax
+	addl	16(%esp),%edx
+	addl	%esi,%eax
+	movl	%edx,%ecx
+	movl	20(%esp),%esi
+	rorl	$14,%edx
+	movl	24(%esp),%edi
+	xorl	%ecx,%edx
+	movl	64(%esp),%ebx
+	xorl	%edi,%esi
+	rorl	$5,%edx
+	andl	%ecx,%esi
+	movl	%ecx,16(%esp)
+	xorl	%ecx,%edx
+	addl	28(%esp),%ebx
+	xorl	%esi,%edi
+	rorl	$6,%edx
+	movl	%eax,%ecx
+	addl	%edi,%ebx
+	rorl	$9,%ecx
+	movl	%eax,%esi
+	movl	4(%esp),%edi
+	xorl	%eax,%ecx
+	movl	%eax,(%esp)
+	xorl	%edi,%eax
+	rorl	$11,%ecx
+	andl	%eax,%ebp
+	leal	3624381080(%ebx,%edx,1),%edx
+	xorl	%esi,%ecx
+	xorl	%edi,%ebp
+	rorl	$2,%ecx
+	addl	%edx,%ebp
+	addl	12(%esp),%edx
+	addl	%ecx,%ebp
+	movl	%edx,%esi
+	movl	16(%esp),%ecx
+	rorl	$14,%edx
+	movl	20(%esp),%edi
+	xorl	%esi,%edx
+	movl	68(%esp),%ebx
+	xorl	%edi,%ecx
+	rorl	$5,%edx
+	andl	%esi,%ecx
+	movl	%esi,12(%esp)
+	xorl	%esi,%edx
+	addl	24(%esp),%ebx
+	xorl	%ecx,%edi
+	rorl	$6,%edx
+	movl	%ebp,%esi
+	addl	%edi,%ebx
+	rorl	$9,%esi
+	movl	%ebp,%ecx
+	movl	(%esp),%edi
+	xorl	%ebp,%esi
+	movl	%ebp,28(%esp)
+	xorl	%edi,%ebp
+	rorl	$11,%esi
+	andl	%ebp,%eax
+	leal	310598401(%ebx,%edx,1),%edx
+	xorl	%ecx,%esi
+	xorl	%edi,%eax
+	rorl	$2,%esi
+	addl	%edx,%eax
+	addl	8(%esp),%edx
+	addl	%esi,%eax
+	movl	%edx,%ecx
+	movl	12(%esp),%esi
+	rorl	$14,%edx
+	movl	16(%esp),%edi
+	xorl	%ecx,%edx
+	movl	72(%esp),%ebx
+	xorl	%edi,%esi
+	rorl	$5,%edx
+	andl	%ecx,%esi
+	movl	%ecx,8(%esp)
+	xorl	%ecx,%edx
+	addl	20(%esp),%ebx
+	xorl	%esi,%edi
+	rorl	$6,%edx
+	movl	%eax,%ecx
+	addl	%edi,%ebx
+	rorl	$9,%ecx
+	movl	%eax,%esi
+	movl	28(%esp),%edi
+	xorl	%eax,%ecx
+	movl	%eax,24(%esp)
+	xorl	%edi,%eax
+	rorl	$11,%ecx
+	andl	%eax,%ebp
+	leal	607225278(%ebx,%edx,1),%edx
+	xorl	%esi,%ecx
+	xorl	%edi,%ebp
+	rorl	$2,%ecx
+	addl	%edx,%ebp
+	addl	4(%esp),%edx
+	addl	%ecx,%ebp
+	movl	%edx,%esi
+	movl	8(%esp),%ecx
+	rorl	$14,%edx
+	movl	12(%esp),%edi
+	xorl	%esi,%edx
+	movl	76(%esp),%ebx
+	xorl	%edi,%ecx
+	rorl	$5,%edx
+	andl	%esi,%ecx
+	movl	%esi,4(%esp)
+	xorl	%esi,%edx
+	addl	16(%esp),%ebx
+	xorl	%ecx,%edi
+	rorl	$6,%edx
+	movl	%ebp,%esi
+	addl	%edi,%ebx
+	rorl	$9,%esi
+	movl	%ebp,%ecx
+	movl	24(%esp),%edi
+	xorl	%ebp,%esi
+	movl	%ebp,20(%esp)
+	xorl	%edi,%ebp
+	rorl	$11,%esi
+	andl	%ebp,%eax
+	leal	1426881987(%ebx,%edx,1),%edx
+	xorl	%ecx,%esi
+	xorl	%edi,%eax
+	rorl	$2,%esi
+	addl	%edx,%eax
+	addl	(%esp),%edx
+	addl	%esi,%eax
+	movl	%edx,%ecx
+	movl	4(%esp),%esi
+	rorl	$14,%edx
+	movl	8(%esp),%edi
+	xorl	%ecx,%edx
+	movl	80(%esp),%ebx
+	xorl	%edi,%esi
+	rorl	$5,%edx
+	andl	%ecx,%esi
+	movl	%ecx,(%esp)
+	xorl	%ecx,%edx
+	addl	12(%esp),%ebx
+	xorl	%esi,%edi
+	rorl	$6,%edx
+	movl	%eax,%ecx
+	addl	%edi,%ebx
+	rorl	$9,%ecx
+	movl	%eax,%esi
+	movl	20(%esp),%edi
+	xorl	%eax,%ecx
+	movl	%eax,16(%esp)
+	xorl	%edi,%eax
+	rorl	$11,%ecx
+	andl	%eax,%ebp
+	leal	1925078388(%ebx,%edx,1),%edx
+	xorl	%esi,%ecx
+	xorl	%edi,%ebp
+	rorl	$2,%ecx
+	addl	%edx,%ebp
+	addl	28(%esp),%edx
+	addl	%ecx,%ebp
+	movl	%edx,%esi
+	movl	(%esp),%ecx
+	rorl	$14,%edx
+	movl	4(%esp),%edi
+	xorl	%esi,%edx
+	movl	84(%esp),%ebx
+	xorl	%edi,%ecx
+	rorl	$5,%edx
+	andl	%esi,%ecx
+	movl	%esi,28(%esp)
+	xorl	%esi,%edx
+	addl	8(%esp),%ebx
+	xorl	%ecx,%edi
+	rorl	$6,%edx
+	movl	%ebp,%esi
+	addl	%edi,%ebx
+	rorl	$9,%esi
+	movl	%ebp,%ecx
+	movl	16(%esp),%edi
+	xorl	%ebp,%esi
+	movl	%ebp,12(%esp)
+	xorl	%edi,%ebp
+	rorl	$11,%esi
+	andl	%ebp,%eax
+	leal	2162078206(%ebx,%edx,1),%edx
+	xorl	%ecx,%esi
+	xorl	%edi,%eax
+	rorl	$2,%esi
+	addl	%edx,%eax
+	addl	24(%esp),%edx
+	addl	%esi,%eax
+	movl	%edx,%ecx
+	movl	28(%esp),%esi
+	rorl	$14,%edx
+	movl	(%esp),%edi
+	xorl	%ecx,%edx
+	movl	88(%esp),%ebx
+	xorl	%edi,%esi
+	rorl	$5,%edx
+	andl	%ecx,%esi
+	movl	%ecx,24(%esp)
+	xorl	%ecx,%edx
+	addl	4(%esp),%ebx
+	xorl	%esi,%edi
+	rorl	$6,%edx
+	movl	%eax,%ecx
+	addl	%edi,%ebx
+	rorl	$9,%ecx
+	movl	%eax,%esi
+	movl	12(%esp),%edi
+	xorl	%eax,%ecx
+	movl	%eax,8(%esp)
+	xorl	%edi,%eax
+	rorl	$11,%ecx
+	andl	%eax,%ebp
+	leal	2614888103(%ebx,%edx,1),%edx
+	xorl	%esi,%ecx
+	xorl	%edi,%ebp
+	rorl	$2,%ecx
+	addl	%edx,%ebp
+	addl	20(%esp),%edx
+	addl	%ecx,%ebp
+	movl	%edx,%esi
+	movl	24(%esp),%ecx
+	rorl	$14,%edx
+	movl	28(%esp),%edi
+	xorl	%esi,%edx
+	movl	92(%esp),%ebx
+	xorl	%edi,%ecx
+	rorl	$5,%edx
+	andl	%esi,%ecx
+	movl	%esi,20(%esp)
+	xorl	%esi,%edx
+	addl	(%esp),%ebx
+	xorl	%ecx,%edi
+	rorl	$6,%edx
+	movl	%ebp,%esi
+	addl	%edi,%ebx
+	rorl	$9,%esi
+	movl	%ebp,%ecx
+	movl	8(%esp),%edi
+	xorl	%ebp,%esi
+	movl	%ebp,4(%esp)
+	xorl	%edi,%ebp
+	rorl	$11,%esi
+	andl	%ebp,%eax
+	leal	3248222580(%ebx,%edx,1),%edx
+	xorl	%ecx,%esi
+	xorl	%edi,%eax
+	movl	36(%esp),%ecx
+	rorl	$2,%esi
+	addl	%edx,%eax
+	addl	16(%esp),%edx
+	addl	%esi,%eax
+	movl	88(%esp),%esi
+	movl	%ecx,%ebx
+	rorl	$11,%ecx
+	movl	%esi,%edi
+	rorl	$2,%esi
+	xorl	%ebx,%ecx
+	shrl	$3,%ebx
+	rorl	$7,%ecx
+	xorl	%edi,%esi
+	xorl	%ecx,%ebx
+	rorl	$17,%esi
+	addl	32(%esp),%ebx
+	shrl	$10,%edi
+	addl	68(%esp),%ebx
+	movl	%edx,%ecx
+	xorl	%esi,%edi
+	movl	20(%esp),%esi
+	rorl	$14,%edx
+	addl	%edi,%ebx
+	movl	24(%esp),%edi
+	xorl	%ecx,%edx
+	movl	%ebx,32(%esp)
+	xorl	%edi,%esi
+	rorl	$5,%edx
+	andl	%ecx,%esi
+	movl	%ecx,16(%esp)
+	xorl	%ecx,%edx
+	addl	28(%esp),%ebx
+	xorl	%esi,%edi
+	rorl	$6,%edx
+	movl	%eax,%ecx
+	addl	%edi,%ebx
+	rorl	$9,%ecx
+	movl	%eax,%esi
+	movl	4(%esp),%edi
+	xorl	%eax,%ecx
+	movl	%eax,(%esp)
+	xorl	%edi,%eax
+	rorl	$11,%ecx
+	andl	%eax,%ebp
+	leal	3835390401(%ebx,%edx,1),%edx
+	xorl	%esi,%ecx
+	xorl	%edi,%ebp
+	movl	40(%esp),%esi
+	rorl	$2,%ecx
+	addl	%edx,%ebp
+	addl	12(%esp),%edx
+	addl	%ecx,%ebp
+	movl	92(%esp),%ecx
+	movl	%esi,%ebx
+	rorl	$11,%esi
+	movl	%ecx,%edi
+	rorl	$2,%ecx
+	xorl	%ebx,%esi
+	shrl	$3,%ebx
+	rorl	$7,%esi
+	xorl	%edi,%ecx
+	xorl	%esi,%ebx
+	rorl	$17,%ecx
+	addl	36(%esp),%ebx
+	shrl	$10,%edi
+	addl	72(%esp),%ebx
+	movl	%edx,%esi
+	xorl	%ecx,%edi
+	movl	16(%esp),%ecx
+	rorl	$14,%edx
+	addl	%edi,%ebx
+	movl	20(%esp),%edi
+	xorl	%esi,%edx
+	movl	%ebx,36(%esp)
+	xorl	%edi,%ecx
+	rorl	$5,%edx
+	andl	%esi,%ecx
+	movl	%esi,12(%esp)
+	xorl	%esi,%edx
+	addl	24(%esp),%ebx
+	xorl	%ecx,%edi
+	rorl	$6,%edx
+	movl	%ebp,%esi
+	addl	%edi,%ebx
+	rorl	$9,%esi
+	movl	%ebp,%ecx
+	movl	(%esp),%edi
+	xorl	%ebp,%esi
+	movl	%ebp,28(%esp)
+	xorl	%edi,%ebp
+	rorl	$11,%esi
+	andl	%ebp,%eax
+	leal	4022224774(%ebx,%edx,1),%edx
+	xorl	%ecx,%esi
+	xorl	%edi,%eax
+	movl	44(%esp),%ecx
+	rorl	$2,%esi
+	addl	%edx,%eax
+	addl	8(%esp),%edx
+	addl	%esi,%eax
+	movl	32(%esp),%esi
+	movl	%ecx,%ebx
+	rorl	$11,%ecx
+	movl	%esi,%edi
+	rorl	$2,%esi
+	xorl	%ebx,%ecx
+	shrl	$3,%ebx
+	rorl	$7,%ecx
+	xorl	%edi,%esi
+	xorl	%ecx,%ebx
+	rorl	$17,%esi
+	addl	40(%esp),%ebx
+	shrl	$10,%edi
+	addl	76(%esp),%ebx
+	movl	%edx,%ecx
+	xorl	%esi,%edi
+	movl	12(%esp),%esi
+	rorl	$14,%edx
+	addl	%edi,%ebx
+	movl	16(%esp),%edi
+	xorl	%ecx,%edx
+	movl	%ebx,40(%esp)
+	xorl	%edi,%esi
+	rorl	$5,%edx
+	andl	%ecx,%esi
+	movl	%ecx,8(%esp)
+	xorl	%ecx,%edx
+	addl	20(%esp),%ebx
+	xorl	%esi,%edi
+	rorl	$6,%edx
+	movl	%eax,%ecx
+	addl	%edi,%ebx
+	rorl	$9,%ecx
+	movl	%eax,%esi
+	movl	28(%esp),%edi
+	xorl	%eax,%ecx
+	movl	%eax,24(%esp)
+	xorl	%edi,%eax
+	rorl	$11,%ecx
+	andl	%eax,%ebp
+	leal	264347078(%ebx,%edx,1),%edx
+	xorl	%esi,%ecx
+	xorl	%edi,%ebp
+	movl	48(%esp),%esi
+	rorl	$2,%ecx
+	addl	%edx,%ebp
+	addl	4(%esp),%edx
+	addl	%ecx,%ebp
+	movl	36(%esp),%ecx
+	movl	%esi,%ebx
+	rorl	$11,%esi
+	movl	%ecx,%edi
+	rorl	$2,%ecx
+	xorl	%ebx,%esi
+	shrl	$3,%ebx
+	rorl	$7,%esi
+	xorl	%edi,%ecx
+	xorl	%esi,%ebx
+	rorl	$17,%ecx
+	addl	44(%esp),%ebx
+	shrl	$10,%edi
+	addl	80(%esp),%ebx
+	movl	%edx,%esi
+	xorl	%ecx,%edi
+	movl	8(%esp),%ecx
+	rorl	$14,%edx
+	addl	%edi,%ebx
+	movl	12(%esp),%edi
+	xorl	%esi,%edx
+	movl	%ebx,44(%esp)
+	xorl	%edi,%ecx
+	rorl	$5,%edx
+	andl	%esi,%ecx
+	movl	%esi,4(%esp)
+	xorl	%esi,%edx
+	addl	16(%esp),%ebx
+	xorl	%ecx,%edi
+	rorl	$6,%edx
+	movl	%ebp,%esi
+	addl	%edi,%ebx
+	rorl	$9,%esi
+	movl	%ebp,%ecx
+	movl	24(%esp),%edi
+	xorl	%ebp,%esi
+	movl	%ebp,20(%esp)
+	xorl	%edi,%ebp
+	rorl	$11,%esi
+	andl	%ebp,%eax
+	leal	604807628(%ebx,%edx,1),%edx
+	xorl	%ecx,%esi
+	xorl	%edi,%eax
+	movl	52(%esp),%ecx
+	rorl	$2,%esi
+	addl	%edx,%eax
+	addl	(%esp),%edx
+	addl	%esi,%eax
+	movl	40(%esp),%esi
+	movl	%ecx,%ebx
+	rorl	$11,%ecx
+	movl	%esi,%edi
+	rorl	$2,%esi
+	xorl	%ebx,%ecx
+	shrl	$3,%ebx
+	rorl	$7,%ecx
+	xorl	%edi,%esi
+	xorl	%ecx,%ebx
+	rorl	$17,%esi
+	addl	48(%esp),%ebx
+	shrl	$10,%edi
+	addl	84(%esp),%ebx
+	movl	%edx,%ecx
+	xorl	%esi,%edi
+	movl	4(%esp),%esi
+	rorl	$14,%edx
+	addl	%edi,%ebx
+	movl	8(%esp),%edi
+	xorl	%ecx,%edx
+	movl	%ebx,48(%esp)
+	xorl	%edi,%esi
+	rorl	$5,%edx
+	andl	%ecx,%esi
+	movl	%ecx,(%esp)
+	xorl	%ecx,%edx
+	addl	12(%esp),%ebx
+	xorl	%esi,%edi
+	rorl	$6,%edx
+	movl	%eax,%ecx
+	addl	%edi,%ebx
+	rorl	$9,%ecx
+	movl	%eax,%esi
+	movl	20(%esp),%edi
+	xorl	%eax,%ecx
+	movl	%eax,16(%esp)
+	xorl	%edi,%eax
+	rorl	$11,%ecx
+	andl	%eax,%ebp
+	leal	770255983(%ebx,%edx,1),%edx
+	xorl	%esi,%ecx
+	xorl	%edi,%ebp
+	movl	56(%esp),%esi
+	rorl	$2,%ecx
+	addl	%edx,%ebp
+	addl	28(%esp),%edx
+	addl	%ecx,%ebp
+	movl	44(%esp),%ecx
+	movl	%esi,%ebx
+	rorl	$11,%esi
+	movl	%ecx,%edi
+	rorl	$2,%ecx
+	xorl	%ebx,%esi
+	shrl	$3,%ebx
+	rorl	$7,%esi
+	xorl	%edi,%ecx
+	xorl	%esi,%ebx
+	rorl	$17,%ecx
+	addl	52(%esp),%ebx
+	shrl	$10,%edi
+	addl	88(%esp),%ebx
+	movl	%edx,%esi
+	xorl	%ecx,%edi
+	movl	(%esp),%ecx
+	rorl	$14,%edx
+	addl	%edi,%ebx
+	movl	4(%esp),%edi
+	xorl	%esi,%edx
+	movl	%ebx,52(%esp)
+	xorl	%edi,%ecx
+	rorl	$5,%edx
+	andl	%esi,%ecx
+	movl	%esi,28(%esp)
+	xorl	%esi,%edx
+	addl	8(%esp),%ebx
+	xorl	%ecx,%edi
+	rorl	$6,%edx
+	movl	%ebp,%esi
+	addl	%edi,%ebx
+	rorl	$9,%esi
+	movl	%ebp,%ecx
+	movl	16(%esp),%edi
+	xorl	%ebp,%esi
+	movl	%ebp,12(%esp)
+	xorl	%edi,%ebp
+	rorl	$11,%esi
+	andl	%ebp,%eax
+	leal	1249150122(%ebx,%edx,1),%edx
+	xorl	%ecx,%esi
+	xorl	%edi,%eax
+	movl	60(%esp),%ecx
+	rorl	$2,%esi
+	addl	%edx,%eax
+	addl	24(%esp),%edx
+	addl	%esi,%eax
+	movl	48(%esp),%esi
+	movl	%ecx,%ebx
+	rorl	$11,%ecx
+	movl	%esi,%edi
+	rorl	$2,%esi
+	xorl	%ebx,%ecx
+	shrl	$3,%ebx
+	rorl	$7,%ecx
+	xorl	%edi,%esi
+	xorl	%ecx,%ebx
+	rorl	$17,%esi
+	addl	56(%esp),%ebx
+	shrl	$10,%edi
+	addl	92(%esp),%ebx
+	movl	%edx,%ecx
+	xorl	%esi,%edi
+	movl	28(%esp),%esi
+	rorl	$14,%edx
+	addl	%edi,%ebx
+	movl	(%esp),%edi
+	xorl	%ecx,%edx
+	movl	%ebx,56(%esp)
+	xorl	%edi,%esi
+	rorl	$5,%edx
+	andl	%ecx,%esi
+	movl	%ecx,24(%esp)
+	xorl	%ecx,%edx
+	addl	4(%esp),%ebx
+	xorl	%esi,%edi
+	rorl	$6,%edx
+	movl	%eax,%ecx
+	addl	%edi,%ebx
+	rorl	$9,%ecx
+	movl	%eax,%esi
+	movl	12(%esp),%edi
+	xorl	%eax,%ecx
+	movl	%eax,8(%esp)
+	xorl	%edi,%eax
+	rorl	$11,%ecx
+	andl	%eax,%ebp
+	leal	1555081692(%ebx,%edx,1),%edx
+	xorl	%esi,%ecx
+	xorl	%edi,%ebp
+	movl	64(%esp),%esi
+	rorl	$2,%ecx
+	addl	%edx,%ebp
+	addl	20(%esp),%edx
+	addl	%ecx,%ebp
+	movl	52(%esp),%ecx
+	movl	%esi,%ebx
+	rorl	$11,%esi
+	movl	%ecx,%edi
+	rorl	$2,%ecx
+	xorl	%ebx,%esi
+	shrl	$3,%ebx
+	rorl	$7,%esi
+	xorl	%edi,%ecx
+	xorl	%esi,%ebx
+	rorl	$17,%ecx
+	addl	60(%esp),%ebx
+	shrl	$10,%edi
+	addl	32(%esp),%ebx
+	movl	%edx,%esi
+	xorl	%ecx,%edi
+	movl	24(%esp),%ecx
+	rorl	$14,%edx
+	addl	%edi,%ebx
+	movl	28(%esp),%edi
+	xorl	%esi,%edx
+	movl	%ebx,60(%esp)
+	xorl	%edi,%ecx
+	rorl	$5,%edx
+	andl	%esi,%ecx
+	movl	%esi,20(%esp)
+	xorl	%esi,%edx
+	addl	(%esp),%ebx
+	xorl	%ecx,%edi
+	rorl	$6,%edx
+	movl	%ebp,%esi
+	addl	%edi,%ebx
+	rorl	$9,%esi
+	movl	%ebp,%ecx
+	movl	8(%esp),%edi
+	xorl	%ebp,%esi
+	movl	%ebp,4(%esp)
+	xorl	%edi,%ebp
+	rorl	$11,%esi
+	andl	%ebp,%eax
+	leal	1996064986(%ebx,%edx,1),%edx
+	xorl	%ecx,%esi
+	xorl	%edi,%eax
+	movl	68(%esp),%ecx
+	rorl	$2,%esi
+	addl	%edx,%eax
+	addl	16(%esp),%edx
+	addl	%esi,%eax
+	movl	56(%esp),%esi
+	movl	%ecx,%ebx
+	rorl	$11,%ecx
+	movl	%esi,%edi
+	rorl	$2,%esi
+	xorl	%ebx,%ecx
+	shrl	$3,%ebx
+	rorl	$7,%ecx
+	xorl	%edi,%esi
+	xorl	%ecx,%ebx
+	rorl	$17,%esi
+	addl	64(%esp),%ebx
+	shrl	$10,%edi
+	addl	36(%esp),%ebx
+	movl	%edx,%ecx
+	xorl	%esi,%edi
+	movl	20(%esp),%esi
+	rorl	$14,%edx
+	addl	%edi,%ebx
+	movl	24(%esp),%edi
+	xorl	%ecx,%edx
+	movl	%ebx,64(%esp)
+	xorl	%edi,%esi
+	rorl	$5,%edx
+	andl	%ecx,%esi
+	movl	%ecx,16(%esp)
+	xorl	%ecx,%edx
+	addl	28(%esp),%ebx
+	xorl	%esi,%edi
+	rorl	$6,%edx
+	movl	%eax,%ecx
+	addl	%edi,%ebx
+	rorl	$9,%ecx
+	movl	%eax,%esi
+	movl	4(%esp),%edi
+	xorl	%eax,%ecx
+	movl	%eax,(%esp)
+	xorl	%edi,%eax
+	rorl	$11,%ecx
+	andl	%eax,%ebp
+	leal	2554220882(%ebx,%edx,1),%edx
+	xorl	%esi,%ecx
+	xorl	%edi,%ebp
+	movl	72(%esp),%esi
+	rorl	$2,%ecx
+	addl	%edx,%ebp
+	addl	12(%esp),%edx
+	addl	%ecx,%ebp
+	movl	60(%esp),%ecx
+	movl	%esi,%ebx
+	rorl	$11,%esi
+	movl	%ecx,%edi
+	rorl	$2,%ecx
+	xorl	%ebx,%esi
+	shrl	$3,%ebx
+	rorl	$7,%esi
+	xorl	%edi,%ecx
+	xorl	%esi,%ebx
+	rorl	$17,%ecx
+	addl	68(%esp),%ebx
+	shrl	$10,%edi
+	addl	40(%esp),%ebx
+	movl	%edx,%esi
+	xorl	%ecx,%edi
+	movl	16(%esp),%ecx
+	rorl	$14,%edx
+	addl	%edi,%ebx
+	movl	20(%esp),%edi
+	xorl	%esi,%edx
+	movl	%ebx,68(%esp)
+	xorl	%edi,%ecx
+	rorl	$5,%edx
+	andl	%esi,%ecx
+	movl	%esi,12(%esp)
+	xorl	%esi,%edx
+	addl	24(%esp),%ebx
+	xorl	%ecx,%edi
+	rorl	$6,%edx
+	movl	%ebp,%esi
+	addl	%edi,%ebx
+	rorl	$9,%esi
+	movl	%ebp,%ecx
+	movl	(%esp),%edi
+	xorl	%ebp,%esi
+	movl	%ebp,28(%esp)
+	xorl	%edi,%ebp
+	rorl	$11,%esi
+	andl	%ebp,%eax
+	leal	2821834349(%ebx,%edx,1),%edx
+	xorl	%ecx,%esi
+	xorl	%edi,%eax
+	movl	76(%esp),%ecx
+	rorl	$2,%esi
+	addl	%edx,%eax
+	addl	8(%esp),%edx
+	addl	%esi,%eax
+	movl	64(%esp),%esi
+	movl	%ecx,%ebx
+	rorl	$11,%ecx
+	movl	%esi,%edi
+	rorl	$2,%esi
+	xorl	%ebx,%ecx
+	shrl	$3,%ebx
+	rorl	$7,%ecx
+	xorl	%edi,%esi
+	xorl	%ecx,%ebx
+	rorl	$17,%esi
+	addl	72(%esp),%ebx
+	shrl	$10,%edi
+	addl	44(%esp),%ebx
+	movl	%edx,%ecx
+	xorl	%esi,%edi
+	movl	12(%esp),%esi
+	rorl	$14,%edx
+	addl	%edi,%ebx
+	movl	16(%esp),%edi
+	xorl	%ecx,%edx
+	movl	%ebx,72(%esp)
+	xorl	%edi,%esi
+	rorl	$5,%edx
+	andl	%ecx,%esi
+	movl	%ecx,8(%esp)
+	xorl	%ecx,%edx
+	addl	20(%esp),%ebx
+	xorl	%esi,%edi
+	rorl	$6,%edx
+	movl	%eax,%ecx
+	addl	%edi,%ebx
+	rorl	$9,%ecx
+	movl	%eax,%esi
+	movl	28(%esp),%edi
+	xorl	%eax,%ecx
+	movl	%eax,24(%esp)
+	xorl	%edi,%eax
+	rorl	$11,%ecx
+	andl	%eax,%ebp
+	leal	2952996808(%ebx,%edx,1),%edx
+	xorl	%esi,%ecx
+	xorl	%edi,%ebp
+	movl	80(%esp),%esi
+	rorl	$2,%ecx
+	addl	%edx,%ebp
+	addl	4(%esp),%edx
+	addl	%ecx,%ebp
+	movl	68(%esp),%ecx
+	movl	%esi,%ebx
+	rorl	$11,%esi
+	movl	%ecx,%edi
+	rorl	$2,%ecx
+	xorl	%ebx,%esi
+	shrl	$3,%ebx
+	rorl	$7,%esi
+	xorl	%edi,%ecx
+	xorl	%esi,%ebx
+	rorl	$17,%ecx
+	addl	76(%esp),%ebx
+	shrl	$10,%edi
+	addl	48(%esp),%ebx
+	movl	%edx,%esi
+	xorl	%ecx,%edi
+	movl	8(%esp),%ecx
+	rorl	$14,%edx
+	addl	%edi,%ebx
+	movl	12(%esp),%edi
+	xorl	%esi,%edx
+	movl	%ebx,76(%esp)
+	xorl	%edi,%ecx
+	rorl	$5,%edx
+	andl	%esi,%ecx
+	movl	%esi,4(%esp)
+	xorl	%esi,%edx
+	addl	16(%esp),%ebx
+	xorl	%ecx,%edi
+	rorl	$6,%edx
+	movl	%ebp,%esi
+	addl	%edi,%ebx
+	rorl	$9,%esi
+	movl	%ebp,%ecx
+	movl	24(%esp),%edi
+	xorl	%ebp,%esi
+	movl	%ebp,20(%esp)
+	xorl	%edi,%ebp
+	rorl	$11,%esi
+	andl	%ebp,%eax
+	leal	3210313671(%ebx,%edx,1),%edx
+	xorl	%ecx,%esi
+	xorl	%edi,%eax
+	movl	84(%esp),%ecx
+	rorl	$2,%esi
+	addl	%edx,%eax
+	addl	(%esp),%edx
+	addl	%esi,%eax
+	movl	72(%esp),%esi
+	movl	%ecx,%ebx
+	rorl	$11,%ecx
+	movl	%esi,%edi
+	rorl	$2,%esi
+	xorl	%ebx,%ecx
+	shrl	$3,%ebx
+	rorl	$7,%ecx
+	xorl	%edi,%esi
+	xorl	%ecx,%ebx
+	rorl	$17,%esi
+	addl	80(%esp),%ebx
+	shrl	$10,%edi
+	addl	52(%esp),%ebx
+	movl	%edx,%ecx
+	xorl	%esi,%edi
+	movl	4(%esp),%esi
+	rorl	$14,%edx
+	addl	%edi,%ebx
+	movl	8(%esp),%edi
+	xorl	%ecx,%edx
+	movl	%ebx,80(%esp)
+	xorl	%edi,%esi
+	rorl	$5,%edx
+	andl	%ecx,%esi
+	movl	%ecx,(%esp)
+	xorl	%ecx,%edx
+	addl	12(%esp),%ebx
+	xorl	%esi,%edi
+	rorl	$6,%edx
+	movl	%eax,%ecx
+	addl	%edi,%ebx
+	rorl	$9,%ecx
+	movl	%eax,%esi
+	movl	20(%esp),%edi
+	xorl	%eax,%ecx
+	movl	%eax,16(%esp)
+	xorl	%edi,%eax
+	rorl	$11,%ecx
+	andl	%eax,%ebp
+	leal	3336571891(%ebx,%edx,1),%edx
+	xorl	%esi,%ecx
+	xorl	%edi,%ebp
+	movl	88(%esp),%esi
+	rorl	$2,%ecx
+	addl	%edx,%ebp
+	addl	28(%esp),%edx
+	addl	%ecx,%ebp
+	movl	76(%esp),%ecx
+	movl	%esi,%ebx
+	rorl	$11,%esi
+	movl	%ecx,%edi
+	rorl	$2,%ecx
+	xorl	%ebx,%esi
+	shrl	$3,%ebx
+	rorl	$7,%esi
+	xorl	%edi,%ecx
+	xorl	%esi,%ebx
+	rorl	$17,%ecx
+	addl	84(%esp),%ebx
+	shrl	$10,%edi
+	addl	56(%esp),%ebx
+	movl	%edx,%esi
+	xorl	%ecx,%edi
+	movl	(%esp),%ecx
+	rorl	$14,%edx
+	addl	%edi,%ebx
+	movl	4(%esp),%edi
+	xorl	%esi,%edx
+	movl	%ebx,84(%esp)
+	xorl	%edi,%ecx
+	rorl	$5,%edx
+	andl	%esi,%ecx
+	movl	%esi,28(%esp)
+	xorl	%esi,%edx
+	addl	8(%esp),%ebx
+	xorl	%ecx,%edi
+	rorl	$6,%edx
+	movl	%ebp,%esi
+	addl	%edi,%ebx
+	rorl	$9,%esi
+	movl	%ebp,%ecx
+	movl	16(%esp),%edi
+	xorl	%ebp,%esi
+	movl	%ebp,12(%esp)
+	xorl	%edi,%ebp
+	rorl	$11,%esi
+	andl	%ebp,%eax
+	leal	3584528711(%ebx,%edx,1),%edx
+	xorl	%ecx,%esi
+	xorl	%edi,%eax
+	movl	92(%esp),%ecx
+	rorl	$2,%esi
+	addl	%edx,%eax
+	addl	24(%esp),%edx
+	addl	%esi,%eax
+	movl	80(%esp),%esi
+	movl	%ecx,%ebx
+	rorl	$11,%ecx
+	movl	%esi,%edi
+	rorl	$2,%esi
+	xorl	%ebx,%ecx
+	shrl	$3,%ebx
+	rorl	$7,%ecx
+	xorl	%edi,%esi
+	xorl	%ecx,%ebx
+	rorl	$17,%esi
+	addl	88(%esp),%ebx
+	shrl	$10,%edi
+	addl	60(%esp),%ebx
+	movl	%edx,%ecx
+	xorl	%esi,%edi
+	movl	28(%esp),%esi
+	rorl	$14,%edx
+	addl	%edi,%ebx
+	movl	(%esp),%edi
+	xorl	%ecx,%edx
+	movl	%ebx,88(%esp)
+	xorl	%edi,%esi
+	rorl	$5,%edx
+	andl	%ecx,%esi
+	movl	%ecx,24(%esp)
+	xorl	%ecx,%edx
+	addl	4(%esp),%ebx
+	xorl	%esi,%edi
+	rorl	$6,%edx
+	movl	%eax,%ecx
+	addl	%edi,%ebx
+	rorl	$9,%ecx
+	movl	%eax,%esi
+	movl	12(%esp),%edi
+	xorl	%eax,%ecx
+	movl	%eax,8(%esp)
+	xorl	%edi,%eax
+	rorl	$11,%ecx
+	andl	%eax,%ebp
+	leal	113926993(%ebx,%edx,1),%edx
+	xorl	%esi,%ecx
+	xorl	%edi,%ebp
+	movl	32(%esp),%esi
+	rorl	$2,%ecx
+	addl	%edx,%ebp
+	addl	20(%esp),%edx
+	addl	%ecx,%ebp
+	movl	84(%esp),%ecx
+	movl	%esi,%ebx
+	rorl	$11,%esi
+	movl	%ecx,%edi
+	rorl	$2,%ecx
+	xorl	%ebx,%esi
+	shrl	$3,%ebx
+	rorl	$7,%esi
+	xorl	%edi,%ecx
+	xorl	%esi,%ebx
+	rorl	$17,%ecx
+	addl	92(%esp),%ebx
+	shrl	$10,%edi
+	addl	64(%esp),%ebx
+	movl	%edx,%esi
+	xorl	%ecx,%edi
+	movl	24(%esp),%ecx
+	rorl	$14,%edx
+	addl	%edi,%ebx
+	movl	28(%esp),%edi
+	xorl	%esi,%edx
+	movl	%ebx,92(%esp)
+	xorl	%edi,%ecx
+	rorl	$5,%edx
+	andl	%esi,%ecx
+	movl	%esi,20(%esp)
+	xorl	%esi,%edx
+	addl	(%esp),%ebx
+	xorl	%ecx,%edi
+	rorl	$6,%edx
+	movl	%ebp,%esi
+	addl	%edi,%ebx
+	rorl	$9,%esi
+	movl	%ebp,%ecx
+	movl	8(%esp),%edi
+	xorl	%ebp,%esi
+	movl	%ebp,4(%esp)
+	xorl	%edi,%ebp
+	rorl	$11,%esi
+	andl	%ebp,%eax
+	leal	338241895(%ebx,%edx,1),%edx
+	xorl	%ecx,%esi
+	xorl	%edi,%eax
+	movl	36(%esp),%ecx
+	rorl	$2,%esi
+	addl	%edx,%eax
+	addl	16(%esp),%edx
+	addl	%esi,%eax
+	movl	88(%esp),%esi
+	movl	%ecx,%ebx
+	rorl	$11,%ecx
+	movl	%esi,%edi
+	rorl	$2,%esi
+	xorl	%ebx,%ecx
+	shrl	$3,%ebx
+	rorl	$7,%ecx
+	xorl	%edi,%esi
+	xorl	%ecx,%ebx
+	rorl	$17,%esi
+	addl	32(%esp),%ebx
+	shrl	$10,%edi
+	addl	68(%esp),%ebx
+	movl	%edx,%ecx
+	xorl	%esi,%edi
+	movl	20(%esp),%esi
+	rorl	$14,%edx
+	addl	%edi,%ebx
+	movl	24(%esp),%edi
+	xorl	%ecx,%edx
+	movl	%ebx,32(%esp)
+	xorl	%edi,%esi
+	rorl	$5,%edx
+	andl	%ecx,%esi
+	movl	%ecx,16(%esp)
+	xorl	%ecx,%edx
+	addl	28(%esp),%ebx
+	xorl	%esi,%edi
+	rorl	$6,%edx
+	movl	%eax,%ecx
+	addl	%edi,%ebx
+	rorl	$9,%ecx
+	movl	%eax,%esi
+	movl	4(%esp),%edi
+	xorl	%eax,%ecx
+	movl	%eax,(%esp)
+	xorl	%edi,%eax
+	rorl	$11,%ecx
+	andl	%eax,%ebp
+	leal	666307205(%ebx,%edx,1),%edx
+	xorl	%esi,%ecx
+	xorl	%edi,%ebp
+	movl	40(%esp),%esi
+	rorl	$2,%ecx
+	addl	%edx,%ebp
+	addl	12(%esp),%edx
+	addl	%ecx,%ebp
+	movl	92(%esp),%ecx
+	movl	%esi,%ebx
+	rorl	$11,%esi
+	movl	%ecx,%edi
+	rorl	$2,%ecx
+	xorl	%ebx,%esi
+	shrl	$3,%ebx
+	rorl	$7,%esi
+	xorl	%edi,%ecx
+	xorl	%esi,%ebx
+	rorl	$17,%ecx
+	addl	36(%esp),%ebx
+	shrl	$10,%edi
+	addl	72(%esp),%ebx
+	movl	%edx,%esi
+	xorl	%ecx,%edi
+	movl	16(%esp),%ecx
+	rorl	$14,%edx
+	addl	%edi,%ebx
+	movl	20(%esp),%edi
+	xorl	%esi,%edx
+	movl	%ebx,36(%esp)
+	xorl	%edi,%ecx
+	rorl	$5,%edx
+	andl	%esi,%ecx
+	movl	%esi,12(%esp)
+	xorl	%esi,%edx
+	addl	24(%esp),%ebx
+	xorl	%ecx,%edi
+	rorl	$6,%edx
+	movl	%ebp,%esi
+	addl	%edi,%ebx
+	rorl	$9,%esi
+	movl	%ebp,%ecx
+	movl	(%esp),%edi
+	xorl	%ebp,%esi
+	movl	%ebp,28(%esp)
+	xorl	%edi,%ebp
+	rorl	$11,%esi
+	andl	%ebp,%eax
+	leal	773529912(%ebx,%edx,1),%edx
+	xorl	%ecx,%esi
+	xorl	%edi,%eax
+	movl	44(%esp),%ecx
+	rorl	$2,%esi
+	addl	%edx,%eax
+	addl	8(%esp),%edx
+	addl	%esi,%eax
+	movl	32(%esp),%esi
+	movl	%ecx,%ebx
+	rorl	$11,%ecx
+	movl	%esi,%edi
+	rorl	$2,%esi
+	xorl	%ebx,%ecx
+	shrl	$3,%ebx
+	rorl	$7,%ecx
+	xorl	%edi,%esi
+	xorl	%ecx,%ebx
+	rorl	$17,%esi
+	addl	40(%esp),%ebx
+	shrl	$10,%edi
+	addl	76(%esp),%ebx
+	movl	%edx,%ecx
+	xorl	%esi,%edi
+	movl	12(%esp),%esi
+	rorl	$14,%edx
+	addl	%edi,%ebx
+	movl	16(%esp),%edi
+	xorl	%ecx,%edx
+	movl	%ebx,40(%esp)
+	xorl	%edi,%esi
+	rorl	$5,%edx
+	andl	%ecx,%esi
+	movl	%ecx,8(%esp)
+	xorl	%ecx,%edx
+	addl	20(%esp),%ebx
+	xorl	%esi,%edi
+	rorl	$6,%edx
+	movl	%eax,%ecx
+	addl	%edi,%ebx
+	rorl	$9,%ecx
+	movl	%eax,%esi
+	movl	28(%esp),%edi
+	xorl	%eax,%ecx
+	movl	%eax,24(%esp)
+	xorl	%edi,%eax
+	rorl	$11,%ecx
+	andl	%eax,%ebp
+	leal	1294757372(%ebx,%edx,1),%edx
+	xorl	%esi,%ecx
+	xorl	%edi,%ebp
+	movl	48(%esp),%esi
+	rorl	$2,%ecx
+	addl	%edx,%ebp
+	addl	4(%esp),%edx
+	addl	%ecx,%ebp
+	movl	36(%esp),%ecx
+	movl	%esi,%ebx
+	rorl	$11,%esi
+	movl	%ecx,%edi
+	rorl	$2,%ecx
+	xorl	%ebx,%esi
+	shrl	$3,%ebx
+	rorl	$7,%esi
+	xorl	%edi,%ecx
+	xorl	%esi,%ebx
+	rorl	$17,%ecx
+	addl	44(%esp),%ebx
+	shrl	$10,%edi
+	addl	80(%esp),%ebx
+	movl	%edx,%esi
+	xorl	%ecx,%edi
+	movl	8(%esp),%ecx
+	rorl	$14,%edx
+	addl	%edi,%ebx
+	movl	12(%esp),%edi
+	xorl	%esi,%edx
+	movl	%ebx,44(%esp)
+	xorl	%edi,%ecx
+	rorl	$5,%edx
+	andl	%esi,%ecx
+	movl	%esi,4(%esp)
+	xorl	%esi,%edx
+	addl	16(%esp),%ebx
+	xorl	%ecx,%edi
+	rorl	$6,%edx
+	movl	%ebp,%esi
+	addl	%edi,%ebx
+	rorl	$9,%esi
+	movl	%ebp,%ecx
+	movl	24(%esp),%edi
+	xorl	%ebp,%esi
+	movl	%ebp,20(%esp)
+	xorl	%edi,%ebp
+	rorl	$11,%esi
+	andl	%ebp,%eax
+	leal	1396182291(%ebx,%edx,1),%edx
+	xorl	%ecx,%esi
+	xorl	%edi,%eax
+	movl	52(%esp),%ecx
+	rorl	$2,%esi
+	addl	%edx,%eax
+	addl	(%esp),%edx
+	addl	%esi,%eax
+	movl	40(%esp),%esi
+	movl	%ecx,%ebx
+	rorl	$11,%ecx
+	movl	%esi,%edi
+	rorl	$2,%esi
+	xorl	%ebx,%ecx
+	shrl	$3,%ebx
+	rorl	$7,%ecx
+	xorl	%edi,%esi
+	xorl	%ecx,%ebx
+	rorl	$17,%esi
+	addl	48(%esp),%ebx
+	shrl	$10,%edi
+	addl	84(%esp),%ebx
+	movl	%edx,%ecx
+	xorl	%esi,%edi
+	movl	4(%esp),%esi
+	rorl	$14,%edx
+	addl	%edi,%ebx
+	movl	8(%esp),%edi
+	xorl	%ecx,%edx
+	movl	%ebx,48(%esp)
+	xorl	%edi,%esi
+	rorl	$5,%edx
+	andl	%ecx,%esi
+	movl	%ecx,(%esp)
+	xorl	%ecx,%edx
+	addl	12(%esp),%ebx
+	xorl	%esi,%edi
+	rorl	$6,%edx
+	movl	%eax,%ecx
+	addl	%edi,%ebx
+	rorl	$9,%ecx
+	movl	%eax,%esi
+	movl	20(%esp),%edi
+	xorl	%eax,%ecx
+	movl	%eax,16(%esp)
+	xorl	%edi,%eax
+	rorl	$11,%ecx
+	andl	%eax,%ebp
+	leal	1695183700(%ebx,%edx,1),%edx
+	xorl	%esi,%ecx
+	xorl	%edi,%ebp
+	movl	56(%esp),%esi
+	rorl	$2,%ecx
+	addl	%edx,%ebp
+	addl	28(%esp),%edx
+	addl	%ecx,%ebp
+	movl	44(%esp),%ecx
+	movl	%esi,%ebx
+	rorl	$11,%esi
+	movl	%ecx,%edi
+	rorl	$2,%ecx
+	xorl	%ebx,%esi
+	shrl	$3,%ebx
+	rorl	$7,%esi
+	xorl	%edi,%ecx
+	xorl	%esi,%ebx
+	rorl	$17,%ecx
+	addl	52(%esp),%ebx
+	shrl	$10,%edi
+	addl	88(%esp),%ebx
+	movl	%edx,%esi
+	xorl	%ecx,%edi
+	movl	(%esp),%ecx
+	rorl	$14,%edx
+	addl	%edi,%ebx
+	movl	4(%esp),%edi
+	xorl	%esi,%edx
+	movl	%ebx,52(%esp)
+	xorl	%edi,%ecx
+	rorl	$5,%edx
+	andl	%esi,%ecx
+	movl	%esi,28(%esp)
+	xorl	%esi,%edx
+	addl	8(%esp),%ebx
+	xorl	%ecx,%edi
+	rorl	$6,%edx
+	movl	%ebp,%esi
+	addl	%edi,%ebx
+	rorl	$9,%esi
+	movl	%ebp,%ecx
+	movl	16(%esp),%edi
+	xorl	%ebp,%esi
+	movl	%ebp,12(%esp)
+	xorl	%edi,%ebp
+	rorl	$11,%esi
+	andl	%ebp,%eax
+	leal	1986661051(%ebx,%edx,1),%edx
+	xorl	%ecx,%esi
+	xorl	%edi,%eax
+	movl	60(%esp),%ecx
+	rorl	$2,%esi
+	addl	%edx,%eax
+	addl	24(%esp),%edx
+	addl	%esi,%eax
+	movl	48(%esp),%esi
+	movl	%ecx,%ebx
+	rorl	$11,%ecx
+	movl	%esi,%edi
+	rorl	$2,%esi
+	xorl	%ebx,%ecx
+	shrl	$3,%ebx
+	rorl	$7,%ecx
+	xorl	%edi,%esi
+	xorl	%ecx,%ebx
+	rorl	$17,%esi
+	addl	56(%esp),%ebx
+	shrl	$10,%edi
+	addl	92(%esp),%ebx
+	movl	%edx,%ecx
+	xorl	%esi,%edi
+	movl	28(%esp),%esi
+	rorl	$14,%edx
+	addl	%edi,%ebx
+	movl	(%esp),%edi
+	xorl	%ecx,%edx
+	movl	%ebx,56(%esp)
+	xorl	%edi,%esi
+	rorl	$5,%edx
+	andl	%ecx,%esi
+	movl	%ecx,24(%esp)
+	xorl	%ecx,%edx
+	addl	4(%esp),%ebx
+	xorl	%esi,%edi
+	rorl	$6,%edx
+	movl	%eax,%ecx
+	addl	%edi,%ebx
+	rorl	$9,%ecx
+	movl	%eax,%esi
+	movl	12(%esp),%edi
+	xorl	%eax,%ecx
+	movl	%eax,8(%esp)
+	xorl	%edi,%eax
+	rorl	$11,%ecx
+	andl	%eax,%ebp
+	leal	2177026350(%ebx,%edx,1),%edx
+	xorl	%esi,%ecx
+	xorl	%edi,%ebp
+	movl	64(%esp),%esi
+	rorl	$2,%ecx
+	addl	%edx,%ebp
+	addl	20(%esp),%edx
+	addl	%ecx,%ebp
+	movl	52(%esp),%ecx
+	movl	%esi,%ebx
+	rorl	$11,%esi
+	movl	%ecx,%edi
+	rorl	$2,%ecx
+	xorl	%ebx,%esi
+	shrl	$3,%ebx
+	rorl	$7,%esi
+	xorl	%edi,%ecx
+	xorl	%esi,%ebx
+	rorl	$17,%ecx
+	addl	60(%esp),%ebx
+	shrl	$10,%edi
+	addl	32(%esp),%ebx
+	movl	%edx,%esi
+	xorl	%ecx,%edi
+	movl	24(%esp),%ecx
+	rorl	$14,%edx
+	addl	%edi,%ebx
+	movl	28(%esp),%edi
+	xorl	%esi,%edx
+	movl	%ebx,60(%esp)
+	xorl	%edi,%ecx
+	rorl	$5,%edx
+	andl	%esi,%ecx
+	movl	%esi,20(%esp)
+	xorl	%esi,%edx
+	addl	(%esp),%ebx
+	xorl	%ecx,%edi
+	rorl	$6,%edx
+	movl	%ebp,%esi
+	addl	%edi,%ebx
+	rorl	$9,%esi
+	movl	%ebp,%ecx
+	movl	8(%esp),%edi
+	xorl	%ebp,%esi
+	movl	%ebp,4(%esp)
+	xorl	%edi,%ebp
+	rorl	$11,%esi
+	andl	%ebp,%eax
+	leal	2456956037(%ebx,%edx,1),%edx
+	xorl	%ecx,%esi
+	xorl	%edi,%eax
+	movl	68(%esp),%ecx
+	rorl	$2,%esi
+	addl	%edx,%eax
+	addl	16(%esp),%edx
+	addl	%esi,%eax
+	movl	56(%esp),%esi
+	movl	%ecx,%ebx
+	rorl	$11,%ecx
+	movl	%esi,%edi
+	rorl	$2,%esi
+	xorl	%ebx,%ecx
+	shrl	$3,%ebx
+	rorl	$7,%ecx
+	xorl	%edi,%esi
+	xorl	%ecx,%ebx
+	rorl	$17,%esi
+	addl	64(%esp),%ebx
+	shrl	$10,%edi
+	addl	36(%esp),%ebx
+	movl	%edx,%ecx
+	xorl	%esi,%edi
+	movl	20(%esp),%esi
+	rorl	$14,%edx
+	addl	%edi,%ebx
+	movl	24(%esp),%edi
+	xorl	%ecx,%edx
+	movl	%ebx,64(%esp)
+	xorl	%edi,%esi
+	rorl	$5,%edx
+	andl	%ecx,%esi
+	movl	%ecx,16(%esp)
+	xorl	%ecx,%edx
+	addl	28(%esp),%ebx
+	xorl	%esi,%edi
+	rorl	$6,%edx
+	movl	%eax,%ecx
+	addl	%edi,%ebx
+	rorl	$9,%ecx
+	movl	%eax,%esi
+	movl	4(%esp),%edi
+	xorl	%eax,%ecx
+	movl	%eax,(%esp)
+	xorl	%edi,%eax
+	rorl	$11,%ecx
+	andl	%eax,%ebp
+	leal	2730485921(%ebx,%edx,1),%edx
+	xorl	%esi,%ecx
+	xorl	%edi,%ebp
+	movl	72(%esp),%esi
+	rorl	$2,%ecx
+	addl	%edx,%ebp
+	addl	12(%esp),%edx
+	addl	%ecx,%ebp
+	movl	60(%esp),%ecx
+	movl	%esi,%ebx
+	rorl	$11,%esi
+	movl	%ecx,%edi
+	rorl	$2,%ecx
+	xorl	%ebx,%esi
+	shrl	$3,%ebx
+	rorl	$7,%esi
+	xorl	%edi,%ecx
+	xorl	%esi,%ebx
+	rorl	$17,%ecx
+	addl	68(%esp),%ebx
+	shrl	$10,%edi
+	addl	40(%esp),%ebx
+	movl	%edx,%esi
+	xorl	%ecx,%edi
+	movl	16(%esp),%ecx
+	rorl	$14,%edx
+	addl	%edi,%ebx
+	movl	20(%esp),%edi
+	xorl	%esi,%edx
+	movl	%ebx,68(%esp)
+	xorl	%edi,%ecx
+	rorl	$5,%edx
+	andl	%esi,%ecx
+	movl	%esi,12(%esp)
+	xorl	%esi,%edx
+	addl	24(%esp),%ebx
+	xorl	%ecx,%edi
+	rorl	$6,%edx
+	movl	%ebp,%esi
+	addl	%edi,%ebx
+	rorl	$9,%esi
+	movl	%ebp,%ecx
+	movl	(%esp),%edi
+	xorl	%ebp,%esi
+	movl	%ebp,28(%esp)
+	xorl	%edi,%ebp
+	rorl	$11,%esi
+	andl	%ebp,%eax
+	leal	2820302411(%ebx,%edx,1),%edx
+	xorl	%ecx,%esi
+	xorl	%edi,%eax
+	movl	76(%esp),%ecx
+	rorl	$2,%esi
+	addl	%edx,%eax
+	addl	8(%esp),%edx
+	addl	%esi,%eax
+	movl	64(%esp),%esi
+	movl	%ecx,%ebx
+	rorl	$11,%ecx
+	movl	%esi,%edi
+	rorl	$2,%esi
+	xorl	%ebx,%ecx
+	shrl	$3,%ebx
+	rorl	$7,%ecx
+	xorl	%edi,%esi
+	xorl	%ecx,%ebx
+	rorl	$17,%esi
+	addl	72(%esp),%ebx
+	shrl	$10,%edi
+	addl	44(%esp),%ebx
+	movl	%edx,%ecx
+	xorl	%esi,%edi
+	movl	12(%esp),%esi
+	rorl	$14,%edx
+	addl	%edi,%ebx
+	movl	16(%esp),%edi
+	xorl	%ecx,%edx
+	movl	%ebx,72(%esp)
+	xorl	%edi,%esi
+	rorl	$5,%edx
+	andl	%ecx,%esi
+	movl	%ecx,8(%esp)
+	xorl	%ecx,%edx
+	addl	20(%esp),%ebx
+	xorl	%esi,%edi
+	rorl	$6,%edx
+	movl	%eax,%ecx
+	addl	%edi,%ebx
+	rorl	$9,%ecx
+	movl	%eax,%esi
+	movl	28(%esp),%edi
+	xorl	%eax,%ecx
+	movl	%eax,24(%esp)
+	xorl	%edi,%eax
+	rorl	$11,%ecx
+	andl	%eax,%ebp
+	leal	3259730800(%ebx,%edx,1),%edx
+	xorl	%esi,%ecx
+	xorl	%edi,%ebp
+	movl	80(%esp),%esi
+	rorl	$2,%ecx
+	addl	%edx,%ebp
+	addl	4(%esp),%edx
+	addl	%ecx,%ebp
+	movl	68(%esp),%ecx
+	movl	%esi,%ebx
+	rorl	$11,%esi
+	movl	%ecx,%edi
+	rorl	$2,%ecx
+	xorl	%ebx,%esi
+	shrl	$3,%ebx
+	rorl	$7,%esi
+	xorl	%edi,%ecx
+	xorl	%esi,%ebx
+	rorl	$17,%ecx
+	addl	76(%esp),%ebx
+	shrl	$10,%edi
+	addl	48(%esp),%ebx
+	movl	%edx,%esi
+	xorl	%ecx,%edi
+	movl	8(%esp),%ecx
+	rorl	$14,%edx
+	addl	%edi,%ebx
+	movl	12(%esp),%edi
+	xorl	%esi,%edx
+	movl	%ebx,76(%esp)
+	xorl	%edi,%ecx
+	rorl	$5,%edx
+	andl	%esi,%ecx
+	movl	%esi,4(%esp)
+	xorl	%esi,%edx
+	addl	16(%esp),%ebx
+	xorl	%ecx,%edi
+	rorl	$6,%edx
+	movl	%ebp,%esi
+	addl	%edi,%ebx
+	rorl	$9,%esi
+	movl	%ebp,%ecx
+	movl	24(%esp),%edi
+	xorl	%ebp,%esi
+	movl	%ebp,20(%esp)
+	xorl	%edi,%ebp
+	rorl	$11,%esi
+	andl	%ebp,%eax
+	leal	3345764771(%ebx,%edx,1),%edx
+	xorl	%ecx,%esi
+	xorl	%edi,%eax
+	movl	84(%esp),%ecx
+	rorl	$2,%esi
+	addl	%edx,%eax
+	addl	(%esp),%edx
+	addl	%esi,%eax
+	movl	72(%esp),%esi
+	movl	%ecx,%ebx
+	rorl	$11,%ecx
+	movl	%esi,%edi
+	rorl	$2,%esi
+	xorl	%ebx,%ecx
+	shrl	$3,%ebx
+	rorl	$7,%ecx
+	xorl	%edi,%esi
+	xorl	%ecx,%ebx
+	rorl	$17,%esi
+	addl	80(%esp),%ebx
+	shrl	$10,%edi
+	addl	52(%esp),%ebx
+	movl	%edx,%ecx
+	xorl	%esi,%edi
+	movl	4(%esp),%esi
+	rorl	$14,%edx
+	addl	%edi,%ebx
+	movl	8(%esp),%edi
+	xorl	%ecx,%edx
+	movl	%ebx,80(%esp)
+	xorl	%edi,%esi
+	rorl	$5,%edx
+	andl	%ecx,%esi
+	movl	%ecx,(%esp)
+	xorl	%ecx,%edx
+	addl	12(%esp),%ebx
+	xorl	%esi,%edi
+	rorl	$6,%edx
+	movl	%eax,%ecx
+	addl	%edi,%ebx
+	rorl	$9,%ecx
+	movl	%eax,%esi
+	movl	20(%esp),%edi
+	xorl	%eax,%ecx
+	movl	%eax,16(%esp)
+	xorl	%edi,%eax
+	rorl	$11,%ecx
+	andl	%eax,%ebp
+	leal	3516065817(%ebx,%edx,1),%edx
+	xorl	%esi,%ecx
+	xorl	%edi,%ebp
+	movl	88(%esp),%esi
+	rorl	$2,%ecx
+	addl	%edx,%ebp
+	addl	28(%esp),%edx
+	addl	%ecx,%ebp
+	movl	76(%esp),%ecx
+	movl	%esi,%ebx
+	rorl	$11,%esi
+	movl	%ecx,%edi
+	rorl	$2,%ecx
+	xorl	%ebx,%esi
+	shrl	$3,%ebx
+	rorl	$7,%esi
+	xorl	%edi,%ecx
+	xorl	%esi,%ebx
+	rorl	$17,%ecx
+	addl	84(%esp),%ebx
+	shrl	$10,%edi
+	addl	56(%esp),%ebx
+	movl	%edx,%esi
+	xorl	%ecx,%edi
+	movl	(%esp),%ecx
+	rorl	$14,%edx
+	addl	%edi,%ebx
+	movl	4(%esp),%edi
+	xorl	%esi,%edx
+	movl	%ebx,84(%esp)
+	xorl	%edi,%ecx
+	rorl	$5,%edx
+	andl	%esi,%ecx
+	movl	%esi,28(%esp)
+	xorl	%esi,%edx
+	addl	8(%esp),%ebx
+	xorl	%ecx,%edi
+	rorl	$6,%edx
+	movl	%ebp,%esi
+	addl	%edi,%ebx
+	rorl	$9,%esi
+	movl	%ebp,%ecx
+	movl	16(%esp),%edi
+	xorl	%ebp,%esi
+	movl	%ebp,12(%esp)
+	xorl	%edi,%ebp
+	rorl	$11,%esi
+	andl	%ebp,%eax
+	leal	3600352804(%ebx,%edx,1),%edx
+	xorl	%ecx,%esi
+	xorl	%edi,%eax
+	movl	92(%esp),%ecx
+	rorl	$2,%esi
+	addl	%edx,%eax
+	addl	24(%esp),%edx
+	addl	%esi,%eax
+	movl	80(%esp),%esi
+	movl	%ecx,%ebx
+	rorl	$11,%ecx
+	movl	%esi,%edi
+	rorl	$2,%esi
+	xorl	%ebx,%ecx
+	shrl	$3,%ebx
+	rorl	$7,%ecx
+	xorl	%edi,%esi
+	xorl	%ecx,%ebx
+	rorl	$17,%esi
+	addl	88(%esp),%ebx
+	shrl	$10,%edi
+	addl	60(%esp),%ebx
+	movl	%edx,%ecx
+	xorl	%esi,%edi
+	movl	28(%esp),%esi
+	rorl	$14,%edx
+	addl	%edi,%ebx
+	movl	(%esp),%edi
+	xorl	%ecx,%edx
+	movl	%ebx,88(%esp)
+	xorl	%edi,%esi
+	rorl	$5,%edx
+	andl	%ecx,%esi
+	movl	%ecx,24(%esp)
+	xorl	%ecx,%edx
+	addl	4(%esp),%ebx
+	xorl	%esi,%edi
+	rorl	$6,%edx
+	movl	%eax,%ecx
+	addl	%edi,%ebx
+	rorl	$9,%ecx
+	movl	%eax,%esi
+	movl	12(%esp),%edi
+	xorl	%eax,%ecx
+	movl	%eax,8(%esp)
+	xorl	%edi,%eax
+	rorl	$11,%ecx
+	andl	%eax,%ebp
+	leal	4094571909(%ebx,%edx,1),%edx
+	xorl	%esi,%ecx
+	xorl	%edi,%ebp
+	movl	32(%esp),%esi
+	rorl	$2,%ecx
+	addl	%edx,%ebp
+	addl	20(%esp),%edx
+	addl	%ecx,%ebp
+	movl	84(%esp),%ecx
+	movl	%esi,%ebx
+	rorl	$11,%esi
+	movl	%ecx,%edi
+	rorl	$2,%ecx
+	xorl	%ebx,%esi
+	shrl	$3,%ebx
+	rorl	$7,%esi
+	xorl	%edi,%ecx
+	xorl	%esi,%ebx
+	rorl	$17,%ecx
+	addl	92(%esp),%ebx
+	shrl	$10,%edi
+	addl	64(%esp),%ebx
+	movl	%edx,%esi
+	xorl	%ecx,%edi
+	movl	24(%esp),%ecx
+	rorl	$14,%edx
+	addl	%edi,%ebx
+	movl	28(%esp),%edi
+	xorl	%esi,%edx
+	movl	%ebx,92(%esp)
+	xorl	%edi,%ecx
+	rorl	$5,%edx
+	andl	%esi,%ecx
+	movl	%esi,20(%esp)
+	xorl	%esi,%edx
+	addl	(%esp),%ebx
+	xorl	%ecx,%edi
+	rorl	$6,%edx
+	movl	%ebp,%esi
+	addl	%edi,%ebx
+	rorl	$9,%esi
+	movl	%ebp,%ecx
+	movl	8(%esp),%edi
+	xorl	%ebp,%esi
+	movl	%ebp,4(%esp)
+	xorl	%edi,%ebp
+	rorl	$11,%esi
+	andl	%ebp,%eax
+	leal	275423344(%ebx,%edx,1),%edx
+	xorl	%ecx,%esi
+	xorl	%edi,%eax
+	movl	36(%esp),%ecx
+	rorl	$2,%esi
+	addl	%edx,%eax
+	addl	16(%esp),%edx
+	addl	%esi,%eax
+	movl	88(%esp),%esi
+	movl	%ecx,%ebx
+	rorl	$11,%ecx
+	movl	%esi,%edi
+	rorl	$2,%esi
+	xorl	%ebx,%ecx
+	shrl	$3,%ebx
+	rorl	$7,%ecx
+	xorl	%edi,%esi
+	xorl	%ecx,%ebx
+	rorl	$17,%esi
+	addl	32(%esp),%ebx
+	shrl	$10,%edi
+	addl	68(%esp),%ebx
+	movl	%edx,%ecx
+	xorl	%esi,%edi
+	movl	20(%esp),%esi
+	rorl	$14,%edx
+	addl	%edi,%ebx
+	movl	24(%esp),%edi
+	xorl	%ecx,%edx
+	movl	%ebx,32(%esp)
+	xorl	%edi,%esi
+	rorl	$5,%edx
+	andl	%ecx,%esi
+	movl	%ecx,16(%esp)
+	xorl	%ecx,%edx
+	addl	28(%esp),%ebx
+	xorl	%esi,%edi
+	rorl	$6,%edx
+	movl	%eax,%ecx
+	addl	%edi,%ebx
+	rorl	$9,%ecx
+	movl	%eax,%esi
+	movl	4(%esp),%edi
+	xorl	%eax,%ecx
+	movl	%eax,(%esp)
+	xorl	%edi,%eax
+	rorl	$11,%ecx
+	andl	%eax,%ebp
+	leal	430227734(%ebx,%edx,1),%edx
+	xorl	%esi,%ecx
+	xorl	%edi,%ebp
+	movl	40(%esp),%esi
+	rorl	$2,%ecx
+	addl	%edx,%ebp
+	addl	12(%esp),%edx
+	addl	%ecx,%ebp
+	movl	92(%esp),%ecx
+	movl	%esi,%ebx
+	rorl	$11,%esi
+	movl	%ecx,%edi
+	rorl	$2,%ecx
+	xorl	%ebx,%esi
+	shrl	$3,%ebx
+	rorl	$7,%esi
+	xorl	%edi,%ecx
+	xorl	%esi,%ebx
+	rorl	$17,%ecx
+	addl	36(%esp),%ebx
+	shrl	$10,%edi
+	addl	72(%esp),%ebx
+	movl	%edx,%esi
+	xorl	%ecx,%edi
+	movl	16(%esp),%ecx
+	rorl	$14,%edx
+	addl	%edi,%ebx
+	movl	20(%esp),%edi
+	xorl	%esi,%edx
+	movl	%ebx,36(%esp)
+	xorl	%edi,%ecx
+	rorl	$5,%edx
+	andl	%esi,%ecx
+	movl	%esi,12(%esp)
+	xorl	%esi,%edx
+	addl	24(%esp),%ebx
+	xorl	%ecx,%edi
+	rorl	$6,%edx
+	movl	%ebp,%esi
+	addl	%edi,%ebx
+	rorl	$9,%esi
+	movl	%ebp,%ecx
+	movl	(%esp),%edi
+	xorl	%ebp,%esi
+	movl	%ebp,28(%esp)
+	xorl	%edi,%ebp
+	rorl	$11,%esi
+	andl	%ebp,%eax
+	leal	506948616(%ebx,%edx,1),%edx
+	xorl	%ecx,%esi
+	xorl	%edi,%eax
+	movl	44(%esp),%ecx
+	rorl	$2,%esi
+	addl	%edx,%eax
+	addl	8(%esp),%edx
+	addl	%esi,%eax
+	movl	32(%esp),%esi
+	movl	%ecx,%ebx
+	rorl	$11,%ecx
+	movl	%esi,%edi
+	rorl	$2,%esi
+	xorl	%ebx,%ecx
+	shrl	$3,%ebx
+	rorl	$7,%ecx
+	xorl	%edi,%esi
+	xorl	%ecx,%ebx
+	rorl	$17,%esi
+	addl	40(%esp),%ebx
+	shrl	$10,%edi
+	addl	76(%esp),%ebx
+	movl	%edx,%ecx
+	xorl	%esi,%edi
+	movl	12(%esp),%esi
+	rorl	$14,%edx
+	addl	%edi,%ebx
+	movl	16(%esp),%edi
+	xorl	%ecx,%edx
+	movl	%ebx,40(%esp)
+	xorl	%edi,%esi
+	rorl	$5,%edx
+	andl	%ecx,%esi
+	movl	%ecx,8(%esp)
+	xorl	%ecx,%edx
+	addl	20(%esp),%ebx
+	xorl	%esi,%edi
+	rorl	$6,%edx
+	movl	%eax,%ecx
+	addl	%edi,%ebx
+	rorl	$9,%ecx
+	movl	%eax,%esi
+	movl	28(%esp),%edi
+	xorl	%eax,%ecx
+	movl	%eax,24(%esp)
+	xorl	%edi,%eax
+	rorl	$11,%ecx
+	andl	%eax,%ebp
+	leal	659060556(%ebx,%edx,1),%edx
+	xorl	%esi,%ecx
+	xorl	%edi,%ebp
+	movl	48(%esp),%esi
+	rorl	$2,%ecx
+	addl	%edx,%ebp
+	addl	4(%esp),%edx
+	addl	%ecx,%ebp
+	movl	36(%esp),%ecx
+	movl	%esi,%ebx
+	rorl	$11,%esi
+	movl	%ecx,%edi
+	rorl	$2,%ecx
+	xorl	%ebx,%esi
+	shrl	$3,%ebx
+	rorl	$7,%esi
+	xorl	%edi,%ecx
+	xorl	%esi,%ebx
+	rorl	$17,%ecx
+	addl	44(%esp),%ebx
+	shrl	$10,%edi
+	addl	80(%esp),%ebx
+	movl	%edx,%esi
+	xorl	%ecx,%edi
+	movl	8(%esp),%ecx
+	rorl	$14,%edx
+	addl	%edi,%ebx
+	movl	12(%esp),%edi
+	xorl	%esi,%edx
+	movl	%ebx,44(%esp)
+	xorl	%edi,%ecx
+	rorl	$5,%edx
+	andl	%esi,%ecx
+	movl	%esi,4(%esp)
+	xorl	%esi,%edx
+	addl	16(%esp),%ebx
+	xorl	%ecx,%edi
+	rorl	$6,%edx
+	movl	%ebp,%esi
+	addl	%edi,%ebx
+	rorl	$9,%esi
+	movl	%ebp,%ecx
+	movl	24(%esp),%edi
+	xorl	%ebp,%esi
+	movl	%ebp,20(%esp)
+	xorl	%edi,%ebp
+	rorl	$11,%esi
+	andl	%ebp,%eax
+	leal	883997877(%ebx,%edx,1),%edx
+	xorl	%ecx,%esi
+	xorl	%edi,%eax
+	movl	52(%esp),%ecx
+	rorl	$2,%esi
+	addl	%edx,%eax
+	addl	(%esp),%edx
+	addl	%esi,%eax
+	movl	40(%esp),%esi
+	movl	%ecx,%ebx
+	rorl	$11,%ecx
+	movl	%esi,%edi
+	rorl	$2,%esi
+	xorl	%ebx,%ecx
+	shrl	$3,%ebx
+	rorl	$7,%ecx
+	xorl	%edi,%esi
+	xorl	%ecx,%ebx
+	rorl	$17,%esi
+	addl	48(%esp),%ebx
+	shrl	$10,%edi
+	addl	84(%esp),%ebx
+	movl	%edx,%ecx
+	xorl	%esi,%edi
+	movl	4(%esp),%esi
+	rorl	$14,%edx
+	addl	%edi,%ebx
+	movl	8(%esp),%edi
+	xorl	%ecx,%edx
+	movl	%ebx,48(%esp)
+	xorl	%edi,%esi
+	rorl	$5,%edx
+	andl	%ecx,%esi
+	movl	%ecx,(%esp)
+	xorl	%ecx,%edx
+	addl	12(%esp),%ebx
+	xorl	%esi,%edi
+	rorl	$6,%edx
+	movl	%eax,%ecx
+	addl	%edi,%ebx
+	rorl	$9,%ecx
+	movl	%eax,%esi
+	movl	20(%esp),%edi
+	xorl	%eax,%ecx
+	movl	%eax,16(%esp)
+	xorl	%edi,%eax
+	rorl	$11,%ecx
+	andl	%eax,%ebp
+	leal	958139571(%ebx,%edx,1),%edx
+	xorl	%esi,%ecx
+	xorl	%edi,%ebp
+	movl	56(%esp),%esi
+	rorl	$2,%ecx
+	addl	%edx,%ebp
+	addl	28(%esp),%edx
+	addl	%ecx,%ebp
+	movl	44(%esp),%ecx
+	movl	%esi,%ebx
+	rorl	$11,%esi
+	movl	%ecx,%edi
+	rorl	$2,%ecx
+	xorl	%ebx,%esi
+	shrl	$3,%ebx
+	rorl	$7,%esi
+	xorl	%edi,%ecx
+	xorl	%esi,%ebx
+	rorl	$17,%ecx
+	addl	52(%esp),%ebx
+	shrl	$10,%edi
+	addl	88(%esp),%ebx
+	movl	%edx,%esi
+	xorl	%ecx,%edi
+	movl	(%esp),%ecx
+	rorl	$14,%edx
+	addl	%edi,%ebx
+	movl	4(%esp),%edi
+	xorl	%esi,%edx
+	movl	%ebx,52(%esp)
+	xorl	%edi,%ecx
+	rorl	$5,%edx
+	andl	%esi,%ecx
+	movl	%esi,28(%esp)
+	xorl	%esi,%edx
+	addl	8(%esp),%ebx
+	xorl	%ecx,%edi
+	rorl	$6,%edx
+	movl	%ebp,%esi
+	addl	%edi,%ebx
+	rorl	$9,%esi
+	movl	%ebp,%ecx
+	movl	16(%esp),%edi
+	xorl	%ebp,%esi
+	movl	%ebp,12(%esp)
+	xorl	%edi,%ebp
+	rorl	$11,%esi
+	andl	%ebp,%eax
+	leal	1322822218(%ebx,%edx,1),%edx
+	xorl	%ecx,%esi
+	xorl	%edi,%eax
+	movl	60(%esp),%ecx
+	rorl	$2,%esi
+	addl	%edx,%eax
+	addl	24(%esp),%edx
+	addl	%esi,%eax
+	movl	48(%esp),%esi
+	movl	%ecx,%ebx
+	rorl	$11,%ecx
+	movl	%esi,%edi
+	rorl	$2,%esi
+	xorl	%ebx,%ecx
+	shrl	$3,%ebx
+	rorl	$7,%ecx
+	xorl	%edi,%esi
+	xorl	%ecx,%ebx
+	rorl	$17,%esi
+	addl	56(%esp),%ebx
+	shrl	$10,%edi
+	addl	92(%esp),%ebx
+	movl	%edx,%ecx
+	xorl	%esi,%edi
+	movl	28(%esp),%esi
+	rorl	$14,%edx
+	addl	%edi,%ebx
+	movl	(%esp),%edi
+	xorl	%ecx,%edx
+	movl	%ebx,56(%esp)
+	xorl	%edi,%esi
+	rorl	$5,%edx
+	andl	%ecx,%esi
+	movl	%ecx,24(%esp)
+	xorl	%ecx,%edx
+	addl	4(%esp),%ebx
+	xorl	%esi,%edi
+	rorl	$6,%edx
+	movl	%eax,%ecx
+	addl	%edi,%ebx
+	rorl	$9,%ecx
+	movl	%eax,%esi
+	movl	12(%esp),%edi
+	xorl	%eax,%ecx
+	movl	%eax,8(%esp)
+	xorl	%edi,%eax
+	rorl	$11,%ecx
+	andl	%eax,%ebp
+	leal	1537002063(%ebx,%edx,1),%edx
+	xorl	%esi,%ecx
+	xorl	%edi,%ebp
+	movl	64(%esp),%esi
+	rorl	$2,%ecx
+	addl	%edx,%ebp
+	addl	20(%esp),%edx
+	addl	%ecx,%ebp
+	movl	52(%esp),%ecx
+	movl	%esi,%ebx
+	rorl	$11,%esi
+	movl	%ecx,%edi
+	rorl	$2,%ecx
+	xorl	%ebx,%esi
+	shrl	$3,%ebx
+	rorl	$7,%esi
+	xorl	%edi,%ecx
+	xorl	%esi,%ebx
+	rorl	$17,%ecx
+	addl	60(%esp),%ebx
+	shrl	$10,%edi
+	addl	32(%esp),%ebx
+	movl	%edx,%esi
+	xorl	%ecx,%edi
+	movl	24(%esp),%ecx
+	rorl	$14,%edx
+	addl	%edi,%ebx
+	movl	28(%esp),%edi
+	xorl	%esi,%edx
+	movl	%ebx,60(%esp)
+	xorl	%edi,%ecx
+	rorl	$5,%edx
+	andl	%esi,%ecx
+	movl	%esi,20(%esp)
+	xorl	%esi,%edx
+	addl	(%esp),%ebx
+	xorl	%ecx,%edi
+	rorl	$6,%edx
+	movl	%ebp,%esi
+	addl	%edi,%ebx
+	rorl	$9,%esi
+	movl	%ebp,%ecx
+	movl	8(%esp),%edi
+	xorl	%ebp,%esi
+	movl	%ebp,4(%esp)
+	xorl	%edi,%ebp
+	rorl	$11,%esi
+	andl	%ebp,%eax
+	leal	1747873779(%ebx,%edx,1),%edx
+	xorl	%ecx,%esi
+	xorl	%edi,%eax
+	movl	68(%esp),%ecx
+	rorl	$2,%esi
+	addl	%edx,%eax
+	addl	16(%esp),%edx
+	addl	%esi,%eax
+	movl	56(%esp),%esi
+	movl	%ecx,%ebx
+	rorl	$11,%ecx
+	movl	%esi,%edi
+	rorl	$2,%esi
+	xorl	%ebx,%ecx
+	shrl	$3,%ebx
+	rorl	$7,%ecx
+	xorl	%edi,%esi
+	xorl	%ecx,%ebx
+	rorl	$17,%esi
+	addl	64(%esp),%ebx
+	shrl	$10,%edi
+	addl	36(%esp),%ebx
+	movl	%edx,%ecx
+	xorl	%esi,%edi
+	movl	20(%esp),%esi
+	rorl	$14,%edx
+	addl	%edi,%ebx
+	movl	24(%esp),%edi
+	xorl	%ecx,%edx
+	movl	%ebx,64(%esp)
+	xorl	%edi,%esi
+	rorl	$5,%edx
+	andl	%ecx,%esi
+	movl	%ecx,16(%esp)
+	xorl	%ecx,%edx
+	addl	28(%esp),%ebx
+	xorl	%esi,%edi
+	rorl	$6,%edx
+	movl	%eax,%ecx
+	addl	%edi,%ebx
+	rorl	$9,%ecx
+	movl	%eax,%esi
+	movl	4(%esp),%edi
+	xorl	%eax,%ecx
+	movl	%eax,(%esp)
+	xorl	%edi,%eax
+	rorl	$11,%ecx
+	andl	%eax,%ebp
+	leal	1955562222(%ebx,%edx,1),%edx
+	xorl	%esi,%ecx
+	xorl	%edi,%ebp
+	movl	72(%esp),%esi
+	rorl	$2,%ecx
+	addl	%edx,%ebp
+	addl	12(%esp),%edx
+	addl	%ecx,%ebp
+	movl	60(%esp),%ecx
+	movl	%esi,%ebx
+	rorl	$11,%esi
+	movl	%ecx,%edi
+	rorl	$2,%ecx
+	xorl	%ebx,%esi
+	shrl	$3,%ebx
+	rorl	$7,%esi
+	xorl	%edi,%ecx
+	xorl	%esi,%ebx
+	rorl	$17,%ecx
+	addl	68(%esp),%ebx
+	shrl	$10,%edi
+	addl	40(%esp),%ebx
+	movl	%edx,%esi
+	xorl	%ecx,%edi
+	movl	16(%esp),%ecx
+	rorl	$14,%edx
+	addl	%edi,%ebx
+	movl	20(%esp),%edi
+	xorl	%esi,%edx
+	movl	%ebx,68(%esp)
+	xorl	%edi,%ecx
+	rorl	$5,%edx
+	andl	%esi,%ecx
+	movl	%esi,12(%esp)
+	xorl	%esi,%edx
+	addl	24(%esp),%ebx
+	xorl	%ecx,%edi
+	rorl	$6,%edx
+	movl	%ebp,%esi
+	addl	%edi,%ebx
+	rorl	$9,%esi
+	movl	%ebp,%ecx
+	movl	(%esp),%edi
+	xorl	%ebp,%esi
+	movl	%ebp,28(%esp)
+	xorl	%edi,%ebp
+	rorl	$11,%esi
+	andl	%ebp,%eax
+	leal	2024104815(%ebx,%edx,1),%edx
+	xorl	%ecx,%esi
+	xorl	%edi,%eax
+	movl	76(%esp),%ecx
+	rorl	$2,%esi
+	addl	%edx,%eax
+	addl	8(%esp),%edx
+	addl	%esi,%eax
+	movl	64(%esp),%esi
+	movl	%ecx,%ebx
+	rorl	$11,%ecx
+	movl	%esi,%edi
+	rorl	$2,%esi
+	xorl	%ebx,%ecx
+	shrl	$3,%ebx
+	rorl	$7,%ecx
+	xorl	%edi,%esi
+	xorl	%ecx,%ebx
+	rorl	$17,%esi
+	addl	72(%esp),%ebx
+	shrl	$10,%edi
+	addl	44(%esp),%ebx
+	movl	%edx,%ecx
+	xorl	%esi,%edi
+	movl	12(%esp),%esi
+	rorl	$14,%edx
+	addl	%edi,%ebx
+	movl	16(%esp),%edi
+	xorl	%ecx,%edx
+	movl	%ebx,72(%esp)
+	xorl	%edi,%esi
+	rorl	$5,%edx
+	andl	%ecx,%esi
+	movl	%ecx,8(%esp)
+	xorl	%ecx,%edx
+	addl	20(%esp),%ebx
+	xorl	%esi,%edi
+	rorl	$6,%edx
+	movl	%eax,%ecx
+	addl	%edi,%ebx
+	rorl	$9,%ecx
+	movl	%eax,%esi
+	movl	28(%esp),%edi
+	xorl	%eax,%ecx
+	movl	%eax,24(%esp)
+	xorl	%edi,%eax
+	rorl	$11,%ecx
+	andl	%eax,%ebp
+	leal	2227730452(%ebx,%edx,1),%edx
+	xorl	%esi,%ecx
+	xorl	%edi,%ebp
+	movl	80(%esp),%esi
+	rorl	$2,%ecx
+	addl	%edx,%ebp
+	addl	4(%esp),%edx
+	addl	%ecx,%ebp
+	movl	68(%esp),%ecx
+	movl	%esi,%ebx
+	rorl	$11,%esi
+	movl	%ecx,%edi
+	rorl	$2,%ecx
+	xorl	%ebx,%esi
+	shrl	$3,%ebx
+	rorl	$7,%esi
+	xorl	%edi,%ecx
+	xorl	%esi,%ebx
+	rorl	$17,%ecx
+	addl	76(%esp),%ebx
+	shrl	$10,%edi
+	addl	48(%esp),%ebx
+	movl	%edx,%esi
+	xorl	%ecx,%edi
+	movl	8(%esp),%ecx
+	rorl	$14,%edx
+	addl	%edi,%ebx
+	movl	12(%esp),%edi
+	xorl	%esi,%edx
+	movl	%ebx,76(%esp)
+	xorl	%edi,%ecx
+	rorl	$5,%edx
+	andl	%esi,%ecx
+	movl	%esi,4(%esp)
+	xorl	%esi,%edx
+	addl	16(%esp),%ebx
+	xorl	%ecx,%edi
+	rorl	$6,%edx
+	movl	%ebp,%esi
+	addl	%edi,%ebx
+	rorl	$9,%esi
+	movl	%ebp,%ecx
+	movl	24(%esp),%edi
+	xorl	%ebp,%esi
+	movl	%ebp,20(%esp)
+	xorl	%edi,%ebp
+	rorl	$11,%esi
+	andl	%ebp,%eax
+	leal	2361852424(%ebx,%edx,1),%edx
+	xorl	%ecx,%esi
+	xorl	%edi,%eax
+	movl	84(%esp),%ecx
+	rorl	$2,%esi
+	addl	%edx,%eax
+	addl	(%esp),%edx
+	addl	%esi,%eax
+	movl	72(%esp),%esi
+	movl	%ecx,%ebx
+	rorl	$11,%ecx
+	movl	%esi,%edi
+	rorl	$2,%esi
+	xorl	%ebx,%ecx
+	shrl	$3,%ebx
+	rorl	$7,%ecx
+	xorl	%edi,%esi
+	xorl	%ecx,%ebx
+	rorl	$17,%esi
+	addl	80(%esp),%ebx
+	shrl	$10,%edi
+	addl	52(%esp),%ebx
+	movl	%edx,%ecx
+	xorl	%esi,%edi
+	movl	4(%esp),%esi
+	rorl	$14,%edx
+	addl	%edi,%ebx
+	movl	8(%esp),%edi
+	xorl	%ecx,%edx
+	movl	%ebx,80(%esp)
+	xorl	%edi,%esi
+	rorl	$5,%edx
+	andl	%ecx,%esi
+	movl	%ecx,(%esp)
+	xorl	%ecx,%edx
+	addl	12(%esp),%ebx
+	xorl	%esi,%edi
+	rorl	$6,%edx
+	movl	%eax,%ecx
+	addl	%edi,%ebx
+	rorl	$9,%ecx
+	movl	%eax,%esi
+	movl	20(%esp),%edi
+	xorl	%eax,%ecx
+	movl	%eax,16(%esp)
+	xorl	%edi,%eax
+	rorl	$11,%ecx
+	andl	%eax,%ebp
+	leal	2428436474(%ebx,%edx,1),%edx
+	xorl	%esi,%ecx
+	xorl	%edi,%ebp
+	movl	88(%esp),%esi
+	rorl	$2,%ecx
+	addl	%edx,%ebp
+	addl	28(%esp),%edx
+	addl	%ecx,%ebp
+	movl	76(%esp),%ecx
+	movl	%esi,%ebx
+	rorl	$11,%esi
+	movl	%ecx,%edi
+	rorl	$2,%ecx
+	xorl	%ebx,%esi
+	shrl	$3,%ebx
+	rorl	$7,%esi
+	xorl	%edi,%ecx
+	xorl	%esi,%ebx
+	rorl	$17,%ecx
+	addl	84(%esp),%ebx
+	shrl	$10,%edi
+	addl	56(%esp),%ebx
+	movl	%edx,%esi
+	xorl	%ecx,%edi
+	movl	(%esp),%ecx
+	rorl	$14,%edx
+	addl	%edi,%ebx
+	movl	4(%esp),%edi
+	xorl	%esi,%edx
+	movl	%ebx,84(%esp)
+	xorl	%edi,%ecx
+	rorl	$5,%edx
+	andl	%esi,%ecx
+	movl	%esi,28(%esp)
+	xorl	%esi,%edx
+	addl	8(%esp),%ebx
+	xorl	%ecx,%edi
+	rorl	$6,%edx
+	movl	%ebp,%esi
+	addl	%edi,%ebx
+	rorl	$9,%esi
+	movl	%ebp,%ecx
+	movl	16(%esp),%edi
+	xorl	%ebp,%esi
+	movl	%ebp,12(%esp)
+	xorl	%edi,%ebp
+	rorl	$11,%esi
+	andl	%ebp,%eax
+	leal	2756734187(%ebx,%edx,1),%edx
+	xorl	%ecx,%esi
+	xorl	%edi,%eax
+	movl	92(%esp),%ecx
+	rorl	$2,%esi
+	addl	%edx,%eax
+	addl	24(%esp),%edx
+	addl	%esi,%eax
+	movl	80(%esp),%esi
+	movl	%ecx,%ebx
+	rorl	$11,%ecx
+	movl	%esi,%edi
+	rorl	$2,%esi
+	xorl	%ebx,%ecx
+	shrl	$3,%ebx
+	rorl	$7,%ecx
+	xorl	%edi,%esi
+	xorl	%ecx,%ebx
+	rorl	$17,%esi
+	addl	88(%esp),%ebx
+	shrl	$10,%edi
+	addl	60(%esp),%ebx
+	movl	%edx,%ecx
+	xorl	%esi,%edi
+	movl	28(%esp),%esi
+	rorl	$14,%edx
+	addl	%edi,%ebx
+	movl	(%esp),%edi
+	xorl	%ecx,%edx
+	xorl	%edi,%esi
+	rorl	$5,%edx
+	andl	%ecx,%esi
+	movl	%ecx,24(%esp)
+	xorl	%ecx,%edx
+	addl	4(%esp),%ebx
+	xorl	%esi,%edi
+	rorl	$6,%edx
+	movl	%eax,%ecx
+	addl	%edi,%ebx
+	rorl	$9,%ecx
+	movl	%eax,%esi
+	movl	12(%esp),%edi
+	xorl	%eax,%ecx
+	movl	%eax,8(%esp)
+	xorl	%edi,%eax
+	rorl	$11,%ecx
+	andl	%eax,%ebp
+	leal	3204031479(%ebx,%edx,1),%edx
+	xorl	%esi,%ecx
+	xorl	%edi,%ebp
+	movl	32(%esp),%esi
+	rorl	$2,%ecx
+	addl	%edx,%ebp
+	addl	20(%esp),%edx
+	addl	%ecx,%ebp
+	movl	84(%esp),%ecx
+	movl	%esi,%ebx
+	rorl	$11,%esi
+	movl	%ecx,%edi
+	rorl	$2,%ecx
+	xorl	%ebx,%esi
+	shrl	$3,%ebx
+	rorl	$7,%esi
+	xorl	%edi,%ecx
+	xorl	%esi,%ebx
+	rorl	$17,%ecx
+	addl	92(%esp),%ebx
+	shrl	$10,%edi
+	addl	64(%esp),%ebx
+	movl	%edx,%esi
+	xorl	%ecx,%edi
+	movl	24(%esp),%ecx
+	rorl	$14,%edx
+	addl	%edi,%ebx
+	movl	28(%esp),%edi
+	xorl	%esi,%edx
+	xorl	%edi,%ecx
+	rorl	$5,%edx
+	andl	%esi,%ecx
+	movl	%esi,20(%esp)
+	xorl	%esi,%edx
+	addl	(%esp),%ebx
+	xorl	%ecx,%edi
+	rorl	$6,%edx
+	movl	%ebp,%esi
+	addl	%edi,%ebx
+	rorl	$9,%esi
+	movl	%ebp,%ecx
+	movl	8(%esp),%edi
+	xorl	%ebp,%esi
+	movl	%ebp,4(%esp)
+	xorl	%edi,%ebp
+	rorl	$11,%esi
+	andl	%ebp,%eax
+	leal	3329325298(%ebx,%edx,1),%edx
+	xorl	%ecx,%esi
+	xorl	%edi,%eax
+	rorl	$2,%esi
+	addl	%edx,%eax
+	addl	16(%esp),%edx
+	addl	%esi,%eax
+	movl	96(%esp),%esi
+	xorl	%edi,%ebp
+	movl	12(%esp),%ecx
+	addl	(%esi),%eax
+	addl	4(%esi),%ebp
+	addl	8(%esi),%edi
+	addl	12(%esi),%ecx
+	movl	%eax,(%esi)
+	movl	%ebp,4(%esi)
+	movl	%edi,8(%esi)
+	movl	%ecx,12(%esi)
+	movl	%ebp,4(%esp)
+	xorl	%edi,%ebp
+	movl	%edi,8(%esp)
+	movl	%ecx,12(%esp)
+	movl	20(%esp),%edi
+	movl	24(%esp),%ebx
+	movl	28(%esp),%ecx
+	addl	16(%esi),%edx
+	addl	20(%esi),%edi
+	addl	24(%esi),%ebx
+	addl	28(%esi),%ecx
+	movl	%edx,16(%esi)
+	movl	%edi,20(%esi)
+	movl	%ebx,24(%esi)
+	movl	%ecx,28(%esi)
+	movl	%edi,20(%esp)
+	movl	100(%esp),%edi
+	movl	%ebx,24(%esp)
+	movl	%ecx,28(%esp)
+	cmpl	104(%esp),%edi
+	jb	.L009grand_loop
+	movl	108(%esp),%esp
+	popl	%edi
+	popl	%esi
+	popl	%ebx
+	popl	%ebp
+	ret
+.align	32
+.L005SSSE3:
+	leal	-96(%esp),%esp
+	movl	(%esi),%eax
+	movl	4(%esi),%ebx
+	movl	8(%esi),%ecx
+	movl	12(%esi),%edi
+	movl	%ebx,4(%esp)
+	xorl	%ecx,%ebx
+	movl	%ecx,8(%esp)
+	movl	%edi,12(%esp)
+	movl	16(%esi),%edx
+	movl	20(%esi),%edi
+	movl	24(%esi),%ecx
+	movl	28(%esi),%esi
+	movl	%edi,20(%esp)
+	movl	100(%esp),%edi
+	movl	%ecx,24(%esp)
+	movl	%esi,28(%esp)
+	movdqa	256(%ebp),%xmm7
+	jmp	.L010grand_ssse3
+.align	16
+.L010grand_ssse3:
+	movdqu	(%edi),%xmm0
+	movdqu	16(%edi),%xmm1
+	movdqu	32(%edi),%xmm2
+	movdqu	48(%edi),%xmm3
+	addl	$64,%edi
+.byte	102,15,56,0,199
+	movl	%edi,100(%esp)
+.byte	102,15,56,0,207
+	movdqa	(%ebp),%xmm4
+.byte	102,15,56,0,215
+	movdqa	16(%ebp),%xmm5
+	paddd	%xmm0,%xmm4
+.byte	102,15,56,0,223
+	movdqa	32(%ebp),%xmm6
+	paddd	%xmm1,%xmm5
+	movdqa	48(%ebp),%xmm7
+	movdqa	%xmm4,32(%esp)
+	paddd	%xmm2,%xmm6
+	movdqa	%xmm5,48(%esp)
+	paddd	%xmm3,%xmm7
+	movdqa	%xmm6,64(%esp)
+	movdqa	%xmm7,80(%esp)
+	jmp	.L011ssse3_00_47
+.align	16
+.L011ssse3_00_47:
+	addl	$64,%ebp
+	movl	%edx,%ecx
+	movdqa	%xmm1,%xmm4
+	rorl	$14,%edx
+	movl	20(%esp),%esi
+	movdqa	%xmm3,%xmm7
+	xorl	%ecx,%edx
+	movl	24(%esp),%edi
+.byte	102,15,58,15,224,4
+	xorl	%edi,%esi
+	rorl	$5,%edx
+	andl	%ecx,%esi
+.byte	102,15,58,15,250,4
+	movl	%ecx,16(%esp)
+	xorl	%ecx,%edx
+	xorl	%esi,%edi
+	movdqa	%xmm4,%xmm5
+	rorl	$6,%edx
+	movl	%eax,%ecx
+	movdqa	%xmm4,%xmm6
+	addl	%edi,%edx
+	movl	4(%esp),%edi
+	psrld	$3,%xmm4
+	movl	%eax,%esi
+	rorl	$9,%ecx
+	paddd	%xmm7,%xmm0
+	movl	%eax,(%esp)
+	xorl	%eax,%ecx
+	psrld	$7,%xmm6
+	xorl	%edi,%eax
+	addl	28(%esp),%edx
+	rorl	$11,%ecx
+	andl	%eax,%ebx
+	pshufd	$250,%xmm3,%xmm7
+	xorl	%esi,%ecx
+	addl	32(%esp),%edx
+	pslld	$14,%xmm5
+	xorl	%edi,%ebx
+	rorl	$2,%ecx
+	pxor	%xmm6,%xmm4
+	addl	%edx,%ebx
+	addl	12(%esp),%edx
+	psrld	$11,%xmm6
+	addl	%ecx,%ebx
+	movl	%edx,%ecx
+	rorl	$14,%edx
+	pxor	%xmm5,%xmm4
+	movl	16(%esp),%esi
+	xorl	%ecx,%edx
+	pslld	$11,%xmm5
+	movl	20(%esp),%edi
+	xorl	%edi,%esi
+	rorl	$5,%edx
+	pxor	%xmm6,%xmm4
+	andl	%ecx,%esi
+	movl	%ecx,12(%esp)
+	movdqa	%xmm7,%xmm6
+	xorl	%ecx,%edx
+	xorl	%esi,%edi
+	rorl	$6,%edx
+	pxor	%xmm5,%xmm4
+	movl	%ebx,%ecx
+	addl	%edi,%edx
+	psrld	$10,%xmm7
+	movl	(%esp),%edi
+	movl	%ebx,%esi
+	rorl	$9,%ecx
+	paddd	%xmm4,%xmm0
+	movl	%ebx,28(%esp)
+	xorl	%ebx,%ecx
+	psrlq	$17,%xmm6
+	xorl	%edi,%ebx
+	addl	24(%esp),%edx
+	rorl	$11,%ecx
+	pxor	%xmm6,%xmm7
+	andl	%ebx,%eax
+	xorl	%esi,%ecx
+	psrlq	$2,%xmm6
+	addl	36(%esp),%edx
+	xorl	%edi,%eax
+	rorl	$2,%ecx
+	pxor	%xmm6,%xmm7
+	addl	%edx,%eax
+	addl	8(%esp),%edx
+	pshufd	$128,%xmm7,%xmm7
+	addl	%ecx,%eax
+	movl	%edx,%ecx
+	rorl	$14,%edx
+	movl	12(%esp),%esi
+	xorl	%ecx,%edx
+	movl	16(%esp),%edi
+	xorl	%edi,%esi
+	rorl	$5,%edx
+	andl	%ecx,%esi
+	psrldq	$8,%xmm7
+	movl	%ecx,8(%esp)
+	xorl	%ecx,%edx
+	xorl	%esi,%edi
+	paddd	%xmm7,%xmm0
+	rorl	$6,%edx
+	movl	%eax,%ecx
+	addl	%edi,%edx
+	movl	28(%esp),%edi
+	movl	%eax,%esi
+	rorl	$9,%ecx
+	movl	%eax,24(%esp)
+	pshufd	$80,%xmm0,%xmm7
+	xorl	%eax,%ecx
+	xorl	%edi,%eax
+	addl	20(%esp),%edx
+	movdqa	%xmm7,%xmm6
+	rorl	$11,%ecx
+	psrld	$10,%xmm7
+	andl	%eax,%ebx
+	psrlq	$17,%xmm6
+	xorl	%esi,%ecx
+	addl	40(%esp),%edx
+	xorl	%edi,%ebx
+	rorl	$2,%ecx
+	pxor	%xmm6,%xmm7
+	addl	%edx,%ebx
+	addl	4(%esp),%edx
+	psrlq	$2,%xmm6
+	addl	%ecx,%ebx
+	movl	%edx,%ecx
+	rorl	$14,%edx
+	pxor	%xmm6,%xmm7
+	movl	8(%esp),%esi
+	xorl	%ecx,%edx
+	movl	12(%esp),%edi
+	pshufd	$8,%xmm7,%xmm7
+	xorl	%edi,%esi
+	rorl	$5,%edx
+	movdqa	(%ebp),%xmm6
+	andl	%ecx,%esi
+	movl	%ecx,4(%esp)
+	pslldq	$8,%xmm7
+	xorl	%ecx,%edx
+	xorl	%esi,%edi
+	rorl	$6,%edx
+	movl	%ebx,%ecx
+	addl	%edi,%edx
+	movl	24(%esp),%edi
+	movl	%ebx,%esi
+	rorl	$9,%ecx
+	paddd	%xmm7,%xmm0
+	movl	%ebx,20(%esp)
+	xorl	%ebx,%ecx
+	xorl	%edi,%ebx
+	addl	16(%esp),%edx
+	paddd	%xmm0,%xmm6
+	rorl	$11,%ecx
+	andl	%ebx,%eax
+	xorl	%esi,%ecx
+	addl	44(%esp),%edx
+	xorl	%edi,%eax
+	rorl	$2,%ecx
+	addl	%edx,%eax
+	addl	(%esp),%edx
+	addl	%ecx,%eax
+	movdqa	%xmm6,32(%esp)
+	movl	%edx,%ecx
+	movdqa	%xmm2,%xmm4
+	rorl	$14,%edx
+	movl	4(%esp),%esi
+	movdqa	%xmm0,%xmm7
+	xorl	%ecx,%edx
+	movl	8(%esp),%edi
+.byte	102,15,58,15,225,4
+	xorl	%edi,%esi
+	rorl	$5,%edx
+	andl	%ecx,%esi
+.byte	102,15,58,15,251,4
+	movl	%ecx,(%esp)
+	xorl	%ecx,%edx
+	xorl	%esi,%edi
+	movdqa	%xmm4,%xmm5
+	rorl	$6,%edx
+	movl	%eax,%ecx
+	movdqa	%xmm4,%xmm6
+	addl	%edi,%edx
+	movl	20(%esp),%edi
+	psrld	$3,%xmm4
+	movl	%eax,%esi
+	rorl	$9,%ecx
+	paddd	%xmm7,%xmm1
+	movl	%eax,16(%esp)
+	xorl	%eax,%ecx
+	psrld	$7,%xmm6
+	xorl	%edi,%eax
+	addl	12(%esp),%edx
+	rorl	$11,%ecx
+	andl	%eax,%ebx
+	pshufd	$250,%xmm0,%xmm7
+	xorl	%esi,%ecx
+	addl	48(%esp),%edx
+	pslld	$14,%xmm5
+	xorl	%edi,%ebx
+	rorl	$2,%ecx
+	pxor	%xmm6,%xmm4
+	addl	%edx,%ebx
+	addl	28(%esp),%edx
+	psrld	$11,%xmm6
+	addl	%ecx,%ebx
+	movl	%edx,%ecx
+	rorl	$14,%edx
+	pxor	%xmm5,%xmm4
+	movl	(%esp),%esi
+	xorl	%ecx,%edx
+	pslld	$11,%xmm5
+	movl	4(%esp),%edi
+	xorl	%edi,%esi
+	rorl	$5,%edx
+	pxor	%xmm6,%xmm4
+	andl	%ecx,%esi
+	movl	%ecx,28(%esp)
+	movdqa	%xmm7,%xmm6
+	xorl	%ecx,%edx
+	xorl	%esi,%edi
+	rorl	$6,%edx
+	pxor	%xmm5,%xmm4
+	movl	%ebx,%ecx
+	addl	%edi,%edx
+	psrld	$10,%xmm7
+	movl	16(%esp),%edi
+	movl	%ebx,%esi
+	rorl	$9,%ecx
+	paddd	%xmm4,%xmm1
+	movl	%ebx,12(%esp)
+	xorl	%ebx,%ecx
+	psrlq	$17,%xmm6
+	xorl	%edi,%ebx
+	addl	8(%esp),%edx
+	rorl	$11,%ecx
+	pxor	%xmm6,%xmm7
+	andl	%ebx,%eax
+	xorl	%esi,%ecx
+	psrlq	$2,%xmm6
+	addl	52(%esp),%edx
+	xorl	%edi,%eax
+	rorl	$2,%ecx
+	pxor	%xmm6,%xmm7
+	addl	%edx,%eax
+	addl	24(%esp),%edx
+	pshufd	$128,%xmm7,%xmm7
+	addl	%ecx,%eax
+	movl	%edx,%ecx
+	rorl	$14,%edx
+	movl	28(%esp),%esi
+	xorl	%ecx,%edx
+	movl	(%esp),%edi
+	xorl	%edi,%esi
+	rorl	$5,%edx
+	andl	%ecx,%esi
+	psrldq	$8,%xmm7
+	movl	%ecx,24(%esp)
+	xorl	%ecx,%edx
+	xorl	%esi,%edi
+	paddd	%xmm7,%xmm1
+	rorl	$6,%edx
+	movl	%eax,%ecx
+	addl	%edi,%edx
+	movl	12(%esp),%edi
+	movl	%eax,%esi
+	rorl	$9,%ecx
+	movl	%eax,8(%esp)
+	pshufd	$80,%xmm1,%xmm7
+	xorl	%eax,%ecx
+	xorl	%edi,%eax
+	addl	4(%esp),%edx
+	movdqa	%xmm7,%xmm6
+	rorl	$11,%ecx
+	psrld	$10,%xmm7
+	andl	%eax,%ebx
+	psrlq	$17,%xmm6
+	xorl	%esi,%ecx
+	addl	56(%esp),%edx
+	xorl	%edi,%ebx
+	rorl	$2,%ecx
+	pxor	%xmm6,%xmm7
+	addl	%edx,%ebx
+	addl	20(%esp),%edx
+	psrlq	$2,%xmm6
+	addl	%ecx,%ebx
+	movl	%edx,%ecx
+	rorl	$14,%edx
+	pxor	%xmm6,%xmm7
+	movl	24(%esp),%esi
+	xorl	%ecx,%edx
+	movl	28(%esp),%edi
+	pshufd	$8,%xmm7,%xmm7
+	xorl	%edi,%esi
+	rorl	$5,%edx
+	movdqa	16(%ebp),%xmm6
+	andl	%ecx,%esi
+	movl	%ecx,20(%esp)
+	pslldq	$8,%xmm7
+	xorl	%ecx,%edx
+	xorl	%esi,%edi
+	rorl	$6,%edx
+	movl	%ebx,%ecx
+	addl	%edi,%edx
+	movl	8(%esp),%edi
+	movl	%ebx,%esi
+	rorl	$9,%ecx
+	paddd	%xmm7,%xmm1
+	movl	%ebx,4(%esp)
+	xorl	%ebx,%ecx
+	xorl	%edi,%ebx
+	addl	(%esp),%edx
+	paddd	%xmm1,%xmm6
+	rorl	$11,%ecx
+	andl	%ebx,%eax
+	xorl	%esi,%ecx
+	addl	60(%esp),%edx
+	xorl	%edi,%eax
+	rorl	$2,%ecx
+	addl	%edx,%eax
+	addl	16(%esp),%edx
+	addl	%ecx,%eax
+	movdqa	%xmm6,48(%esp)
+	movl	%edx,%ecx
+	movdqa	%xmm3,%xmm4
+	rorl	$14,%edx
+	movl	20(%esp),%esi
+	movdqa	%xmm1,%xmm7
+	xorl	%ecx,%edx
+	movl	24(%esp),%edi
+.byte	102,15,58,15,226,4
+	xorl	%edi,%esi
+	rorl	$5,%edx
+	andl	%ecx,%esi
+.byte	102,15,58,15,248,4
+	movl	%ecx,16(%esp)
+	xorl	%ecx,%edx
+	xorl	%esi,%edi
+	movdqa	%xmm4,%xmm5
+	rorl	$6,%edx
+	movl	%eax,%ecx
+	movdqa	%xmm4,%xmm6
+	addl	%edi,%edx
+	movl	4(%esp),%edi
+	psrld	$3,%xmm4
+	movl	%eax,%esi
+	rorl	$9,%ecx
+	paddd	%xmm7,%xmm2
+	movl	%eax,(%esp)
+	xorl	%eax,%ecx
+	psrld	$7,%xmm6
+	xorl	%edi,%eax
+	addl	28(%esp),%edx
+	rorl	$11,%ecx
+	andl	%eax,%ebx
+	pshufd	$250,%xmm1,%xmm7
+	xorl	%esi,%ecx
+	addl	64(%esp),%edx
+	pslld	$14,%xmm5
+	xorl	%edi,%ebx
+	rorl	$2,%ecx
+	pxor	%xmm6,%xmm4
+	addl	%edx,%ebx
+	addl	12(%esp),%edx
+	psrld	$11,%xmm6
+	addl	%ecx,%ebx
+	movl	%edx,%ecx
+	rorl	$14,%edx
+	pxor	%xmm5,%xmm4
+	movl	16(%esp),%esi
+	xorl	%ecx,%edx
+	pslld	$11,%xmm5
+	movl	20(%esp),%edi
+	xorl	%edi,%esi
+	rorl	$5,%edx
+	pxor	%xmm6,%xmm4
+	andl	%ecx,%esi
+	movl	%ecx,12(%esp)
+	movdqa	%xmm7,%xmm6
+	xorl	%ecx,%edx
+	xorl	%esi,%edi
+	rorl	$6,%edx
+	pxor	%xmm5,%xmm4
+	movl	%ebx,%ecx
+	addl	%edi,%edx
+	psrld	$10,%xmm7
+	movl	(%esp),%edi
+	movl	%ebx,%esi
+	rorl	$9,%ecx
+	paddd	%xmm4,%xmm2
+	movl	%ebx,28(%esp)
+	xorl	%ebx,%ecx
+	psrlq	$17,%xmm6
+	xorl	%edi,%ebx
+	addl	24(%esp),%edx
+	rorl	$11,%ecx
+	pxor	%xmm6,%xmm7
+	andl	%ebx,%eax
+	xorl	%esi,%ecx
+	psrlq	$2,%xmm6
+	addl	68(%esp),%edx
+	xorl	%edi,%eax
+	rorl	$2,%ecx
+	pxor	%xmm6,%xmm7
+	addl	%edx,%eax
+	addl	8(%esp),%edx
+	pshufd	$128,%xmm7,%xmm7
+	addl	%ecx,%eax
+	movl	%edx,%ecx
+	rorl	$14,%edx
+	movl	12(%esp),%esi
+	xorl	%ecx,%edx
+	movl	16(%esp),%edi
+	xorl	%edi,%esi
+	rorl	$5,%edx
+	andl	%ecx,%esi
+	psrldq	$8,%xmm7
+	movl	%ecx,8(%esp)
+	xorl	%ecx,%edx
+	xorl	%esi,%edi
+	paddd	%xmm7,%xmm2
+	rorl	$6,%edx
+	movl	%eax,%ecx
+	addl	%edi,%edx
+	movl	28(%esp),%edi
+	movl	%eax,%esi
+	rorl	$9,%ecx
+	movl	%eax,24(%esp)
+	pshufd	$80,%xmm2,%xmm7
+	xorl	%eax,%ecx
+	xorl	%edi,%eax
+	addl	20(%esp),%edx
+	movdqa	%xmm7,%xmm6
+	rorl	$11,%ecx
+	psrld	$10,%xmm7
+	andl	%eax,%ebx
+	psrlq	$17,%xmm6
+	xorl	%esi,%ecx
+	addl	72(%esp),%edx
+	xorl	%edi,%ebx
+	rorl	$2,%ecx
+	pxor	%xmm6,%xmm7
+	addl	%edx,%ebx
+	addl	4(%esp),%edx
+	psrlq	$2,%xmm6
+	addl	%ecx,%ebx
+	movl	%edx,%ecx
+	rorl	$14,%edx
+	pxor	%xmm6,%xmm7
+	movl	8(%esp),%esi
+	xorl	%ecx,%edx
+	movl	12(%esp),%edi
+	pshufd	$8,%xmm7,%xmm7
+	xorl	%edi,%esi
+	rorl	$5,%edx
+	movdqa	32(%ebp),%xmm6
+	andl	%ecx,%esi
+	movl	%ecx,4(%esp)
+	pslldq	$8,%xmm7
+	xorl	%ecx,%edx
+	xorl	%esi,%edi
+	rorl	$6,%edx
+	movl	%ebx,%ecx
+	addl	%edi,%edx
+	movl	24(%esp),%edi
+	movl	%ebx,%esi
+	rorl	$9,%ecx
+	paddd	%xmm7,%xmm2
+	movl	%ebx,20(%esp)
+	xorl	%ebx,%ecx
+	xorl	%edi,%ebx
+	addl	16(%esp),%edx
+	paddd	%xmm2,%xmm6
+	rorl	$11,%ecx
+	andl	%ebx,%eax
+	xorl	%esi,%ecx
+	addl	76(%esp),%edx
+	xorl	%edi,%eax
+	rorl	$2,%ecx
+	addl	%edx,%eax
+	addl	(%esp),%edx
+	addl	%ecx,%eax
+	movdqa	%xmm6,64(%esp)
+	movl	%edx,%ecx
+	movdqa	%xmm0,%xmm4
+	rorl	$14,%edx
+	movl	4(%esp),%esi
+	movdqa	%xmm2,%xmm7
+	xorl	%ecx,%edx
+	movl	8(%esp),%edi
+.byte	102,15,58,15,227,4
+	xorl	%edi,%esi
+	rorl	$5,%edx
+	andl	%ecx,%esi
+.byte	102,15,58,15,249,4
+	movl	%ecx,(%esp)
+	xorl	%ecx,%edx
+	xorl	%esi,%edi
+	movdqa	%xmm4,%xmm5
+	rorl	$6,%edx
+	movl	%eax,%ecx
+	movdqa	%xmm4,%xmm6
+	addl	%edi,%edx
+	movl	20(%esp),%edi
+	psrld	$3,%xmm4
+	movl	%eax,%esi
+	rorl	$9,%ecx
+	paddd	%xmm7,%xmm3
+	movl	%eax,16(%esp)
+	xorl	%eax,%ecx
+	psrld	$7,%xmm6
+	xorl	%edi,%eax
+	addl	12(%esp),%edx
+	rorl	$11,%ecx
+	andl	%eax,%ebx
+	pshufd	$250,%xmm2,%xmm7
+	xorl	%esi,%ecx
+	addl	80(%esp),%edx
+	pslld	$14,%xmm5
+	xorl	%edi,%ebx
+	rorl	$2,%ecx
+	pxor	%xmm6,%xmm4
+	addl	%edx,%ebx
+	addl	28(%esp),%edx
+	psrld	$11,%xmm6
+	addl	%ecx,%ebx
+	movl	%edx,%ecx
+	rorl	$14,%edx
+	pxor	%xmm5,%xmm4
+	movl	(%esp),%esi
+	xorl	%ecx,%edx
+	pslld	$11,%xmm5
+	movl	4(%esp),%edi
+	xorl	%edi,%esi
+	rorl	$5,%edx
+	pxor	%xmm6,%xmm4
+	andl	%ecx,%esi
+	movl	%ecx,28(%esp)
+	movdqa	%xmm7,%xmm6
+	xorl	%ecx,%edx
+	xorl	%esi,%edi
+	rorl	$6,%edx
+	pxor	%xmm5,%xmm4
+	movl	%ebx,%ecx
+	addl	%edi,%edx
+	psrld	$10,%xmm7
+	movl	16(%esp),%edi
+	movl	%ebx,%esi
+	rorl	$9,%ecx
+	paddd	%xmm4,%xmm3
+	movl	%ebx,12(%esp)
+	xorl	%ebx,%ecx
+	psrlq	$17,%xmm6
+	xorl	%edi,%ebx
+	addl	8(%esp),%edx
+	rorl	$11,%ecx
+	pxor	%xmm6,%xmm7
+	andl	%ebx,%eax
+	xorl	%esi,%ecx
+	psrlq	$2,%xmm6
+	addl	84(%esp),%edx
+	xorl	%edi,%eax
+	rorl	$2,%ecx
+	pxor	%xmm6,%xmm7
+	addl	%edx,%eax
+	addl	24(%esp),%edx
+	pshufd	$128,%xmm7,%xmm7
+	addl	%ecx,%eax
+	movl	%edx,%ecx
+	rorl	$14,%edx
+	movl	28(%esp),%esi
+	xorl	%ecx,%edx
+	movl	(%esp),%edi
+	xorl	%edi,%esi
+	rorl	$5,%edx
+	andl	%ecx,%esi
+	psrldq	$8,%xmm7
+	movl	%ecx,24(%esp)
+	xorl	%ecx,%edx
+	xorl	%esi,%edi
+	paddd	%xmm7,%xmm3
+	rorl	$6,%edx
+	movl	%eax,%ecx
+	addl	%edi,%edx
+	movl	12(%esp),%edi
+	movl	%eax,%esi
+	rorl	$9,%ecx
+	movl	%eax,8(%esp)
+	pshufd	$80,%xmm3,%xmm7
+	xorl	%eax,%ecx
+	xorl	%edi,%eax
+	addl	4(%esp),%edx
+	movdqa	%xmm7,%xmm6
+	rorl	$11,%ecx
+	psrld	$10,%xmm7
+	andl	%eax,%ebx
+	psrlq	$17,%xmm6
+	xorl	%esi,%ecx
+	addl	88(%esp),%edx
+	xorl	%edi,%ebx
+	rorl	$2,%ecx
+	pxor	%xmm6,%xmm7
+	addl	%edx,%ebx
+	addl	20(%esp),%edx
+	psrlq	$2,%xmm6
+	addl	%ecx,%ebx
+	movl	%edx,%ecx
+	rorl	$14,%edx
+	pxor	%xmm6,%xmm7
+	movl	24(%esp),%esi
+	xorl	%ecx,%edx
+	movl	28(%esp),%edi
+	pshufd	$8,%xmm7,%xmm7
+	xorl	%edi,%esi
+	rorl	$5,%edx
+	movdqa	48(%ebp),%xmm6
+	andl	%ecx,%esi
+	movl	%ecx,20(%esp)
+	pslldq	$8,%xmm7
+	xorl	%ecx,%edx
+	xorl	%esi,%edi
+	rorl	$6,%edx
+	movl	%ebx,%ecx
+	addl	%edi,%edx
+	movl	8(%esp),%edi
+	movl	%ebx,%esi
+	rorl	$9,%ecx
+	paddd	%xmm7,%xmm3
+	movl	%ebx,4(%esp)
+	xorl	%ebx,%ecx
+	xorl	%edi,%ebx
+	addl	(%esp),%edx
+	paddd	%xmm3,%xmm6
+	rorl	$11,%ecx
+	andl	%ebx,%eax
+	xorl	%esi,%ecx
+	addl	92(%esp),%edx
+	xorl	%edi,%eax
+	rorl	$2,%ecx
+	addl	%edx,%eax
+	addl	16(%esp),%edx
+	addl	%ecx,%eax
+	movdqa	%xmm6,80(%esp)
+	cmpl	$66051,64(%ebp)
+	jne	.L011ssse3_00_47
+	movl	%edx,%ecx
+	rorl	$14,%edx
+	movl	20(%esp),%esi
+	xorl	%ecx,%edx
+	movl	24(%esp),%edi
+	xorl	%edi,%esi
+	rorl	$5,%edx
+	andl	%ecx,%esi
+	movl	%ecx,16(%esp)
+	xorl	%ecx,%edx
+	xorl	%esi,%edi
+	rorl	$6,%edx
+	movl	%eax,%ecx
+	addl	%edi,%edx
+	movl	4(%esp),%edi
+	movl	%eax,%esi
+	rorl	$9,%ecx
+	movl	%eax,(%esp)
+	xorl	%eax,%ecx
+	xorl	%edi,%eax
+	addl	28(%esp),%edx
+	rorl	$11,%ecx
+	andl	%eax,%ebx
+	xorl	%esi,%ecx
+	addl	32(%esp),%edx
+	xorl	%edi,%ebx
+	rorl	$2,%ecx
+	addl	%edx,%ebx
+	addl	12(%esp),%edx
+	addl	%ecx,%ebx
+	movl	%edx,%ecx
+	rorl	$14,%edx
+	movl	16(%esp),%esi
+	xorl	%ecx,%edx
+	movl	20(%esp),%edi
+	xorl	%edi,%esi
+	rorl	$5,%edx
+	andl	%ecx,%esi
+	movl	%ecx,12(%esp)
+	xorl	%ecx,%edx
+	xorl	%esi,%edi
+	rorl	$6,%edx
+	movl	%ebx,%ecx
+	addl	%edi,%edx
+	movl	(%esp),%edi
+	movl	%ebx,%esi
+	rorl	$9,%ecx
+	movl	%ebx,28(%esp)
+	xorl	%ebx,%ecx
+	xorl	%edi,%ebx
+	addl	24(%esp),%edx
+	rorl	$11,%ecx
+	andl	%ebx,%eax
+	xorl	%esi,%ecx
+	addl	36(%esp),%edx
+	xorl	%edi,%eax
+	rorl	$2,%ecx
+	addl	%edx,%eax
+	addl	8(%esp),%edx
+	addl	%ecx,%eax
+	movl	%edx,%ecx
+	rorl	$14,%edx
+	movl	12(%esp),%esi
+	xorl	%ecx,%edx
+	movl	16(%esp),%edi
+	xorl	%edi,%esi
+	rorl	$5,%edx
+	andl	%ecx,%esi
+	movl	%ecx,8(%esp)
+	xorl	%ecx,%edx
+	xorl	%esi,%edi
+	rorl	$6,%edx
+	movl	%eax,%ecx
+	addl	%edi,%edx
+	movl	28(%esp),%edi
+	movl	%eax,%esi
+	rorl	$9,%ecx
+	movl	%eax,24(%esp)
+	xorl	%eax,%ecx
+	xorl	%edi,%eax
+	addl	20(%esp),%edx
+	rorl	$11,%ecx
+	andl	%eax,%ebx
+	xorl	%esi,%ecx
+	addl	40(%esp),%edx
+	xorl	%edi,%ebx
+	rorl	$2,%ecx
+	addl	%edx,%ebx
+	addl	4(%esp),%edx
+	addl	%ecx,%ebx
+	movl	%edx,%ecx
+	rorl	$14,%edx
+	movl	8(%esp),%esi
+	xorl	%ecx,%edx
+	movl	12(%esp),%edi
+	xorl	%edi,%esi
+	rorl	$5,%edx
+	andl	%ecx,%esi
+	movl	%ecx,4(%esp)
+	xorl	%ecx,%edx
+	xorl	%esi,%edi
+	rorl	$6,%edx
+	movl	%ebx,%ecx
+	addl	%edi,%edx
+	movl	24(%esp),%edi
+	movl	%ebx,%esi
+	rorl	$9,%ecx
+	movl	%ebx,20(%esp)
+	xorl	%ebx,%ecx
+	xorl	%edi,%ebx
+	addl	16(%esp),%edx
+	rorl	$11,%ecx
+	andl	%ebx,%eax
+	xorl	%esi,%ecx
+	addl	44(%esp),%edx
+	xorl	%edi,%eax
+	rorl	$2,%ecx
+	addl	%edx,%eax
+	addl	(%esp),%edx
+	addl	%ecx,%eax
+	movl	%edx,%ecx
+	rorl	$14,%edx
+	movl	4(%esp),%esi
+	xorl	%ecx,%edx
+	movl	8(%esp),%edi
+	xorl	%edi,%esi
+	rorl	$5,%edx
+	andl	%ecx,%esi
+	movl	%ecx,(%esp)
+	xorl	%ecx,%edx
+	xorl	%esi,%edi
+	rorl	$6,%edx
+	movl	%eax,%ecx
+	addl	%edi,%edx
+	movl	20(%esp),%edi
+	movl	%eax,%esi
+	rorl	$9,%ecx
+	movl	%eax,16(%esp)
+	xorl	%eax,%ecx
+	xorl	%edi,%eax
+	addl	12(%esp),%edx
+	rorl	$11,%ecx
+	andl	%eax,%ebx
+	xorl	%esi,%ecx
+	addl	48(%esp),%edx
+	xorl	%edi,%ebx
+	rorl	$2,%ecx
+	addl	%edx,%ebx
+	addl	28(%esp),%edx
+	addl	%ecx,%ebx
+	movl	%edx,%ecx
+	rorl	$14,%edx
+	movl	(%esp),%esi
+	xorl	%ecx,%edx
+	movl	4(%esp),%edi
+	xorl	%edi,%esi
+	rorl	$5,%edx
+	andl	%ecx,%esi
+	movl	%ecx,28(%esp)
+	xorl	%ecx,%edx
+	xorl	%esi,%edi
+	rorl	$6,%edx
+	movl	%ebx,%ecx
+	addl	%edi,%edx
+	movl	16(%esp),%edi
+	movl	%ebx,%esi
+	rorl	$9,%ecx
+	movl	%ebx,12(%esp)
+	xorl	%ebx,%ecx
+	xorl	%edi,%ebx
+	addl	8(%esp),%edx
+	rorl	$11,%ecx
+	andl	%ebx,%eax
+	xorl	%esi,%ecx
+	addl	52(%esp),%edx
+	xorl	%edi,%eax
+	rorl	$2,%ecx
+	addl	%edx,%eax
+	addl	24(%esp),%edx
+	addl	%ecx,%eax
+	movl	%edx,%ecx
+	rorl	$14,%edx
+	movl	28(%esp),%esi
+	xorl	%ecx,%edx
+	movl	(%esp),%edi
+	xorl	%edi,%esi
+	rorl	$5,%edx
+	andl	%ecx,%esi
+	movl	%ecx,24(%esp)
+	xorl	%ecx,%edx
+	xorl	%esi,%edi
+	rorl	$6,%edx
+	movl	%eax,%ecx
+	addl	%edi,%edx
+	movl	12(%esp),%edi
+	movl	%eax,%esi
+	rorl	$9,%ecx
+	movl	%eax,8(%esp)
+	xorl	%eax,%ecx
+	xorl	%edi,%eax
+	addl	4(%esp),%edx
+	rorl	$11,%ecx
+	andl	%eax,%ebx
+	xorl	%esi,%ecx
+	addl	56(%esp),%edx
+	xorl	%edi,%ebx
+	rorl	$2,%ecx
+	addl	%edx,%ebx
+	addl	20(%esp),%edx
+	addl	%ecx,%ebx
+	movl	%edx,%ecx
+	rorl	$14,%edx
+	movl	24(%esp),%esi
+	xorl	%ecx,%edx
+	movl	28(%esp),%edi
+	xorl	%edi,%esi
+	rorl	$5,%edx
+	andl	%ecx,%esi
+	movl	%ecx,20(%esp)
+	xorl	%ecx,%edx
+	xorl	%esi,%edi
+	rorl	$6,%edx
+	movl	%ebx,%ecx
+	addl	%edi,%edx
+	movl	8(%esp),%edi
+	movl	%ebx,%esi
+	rorl	$9,%ecx
+	movl	%ebx,4(%esp)
+	xorl	%ebx,%ecx
+	xorl	%edi,%ebx
+	addl	(%esp),%edx
+	rorl	$11,%ecx
+	andl	%ebx,%eax
+	xorl	%esi,%ecx
+	addl	60(%esp),%edx
+	xorl	%edi,%eax
+	rorl	$2,%ecx
+	addl	%edx,%eax
+	addl	16(%esp),%edx
+	addl	%ecx,%eax
+	movl	%edx,%ecx
+	rorl	$14,%edx
+	movl	20(%esp),%esi
+	xorl	%ecx,%edx
+	movl	24(%esp),%edi
+	xorl	%edi,%esi
+	rorl	$5,%edx
+	andl	%ecx,%esi
+	movl	%ecx,16(%esp)
+	xorl	%ecx,%edx
+	xorl	%esi,%edi
+	rorl	$6,%edx
+	movl	%eax,%ecx
+	addl	%edi,%edx
+	movl	4(%esp),%edi
+	movl	%eax,%esi
+	rorl	$9,%ecx
+	movl	%eax,(%esp)
+	xorl	%eax,%ecx
+	xorl	%edi,%eax
+	addl	28(%esp),%edx
+	rorl	$11,%ecx
+	andl	%eax,%ebx
+	xorl	%esi,%ecx
+	addl	64(%esp),%edx
+	xorl	%edi,%ebx
+	rorl	$2,%ecx
+	addl	%edx,%ebx
+	addl	12(%esp),%edx
+	addl	%ecx,%ebx
+	movl	%edx,%ecx
+	rorl	$14,%edx
+	movl	16(%esp),%esi
+	xorl	%ecx,%edx
+	movl	20(%esp),%edi
+	xorl	%edi,%esi
+	rorl	$5,%edx
+	andl	%ecx,%esi
+	movl	%ecx,12(%esp)
+	xorl	%ecx,%edx
+	xorl	%esi,%edi
+	rorl	$6,%edx
+	movl	%ebx,%ecx
+	addl	%edi,%edx
+	movl	(%esp),%edi
+	movl	%ebx,%esi
+	rorl	$9,%ecx
+	movl	%ebx,28(%esp)
+	xorl	%ebx,%ecx
+	xorl	%edi,%ebx
+	addl	24(%esp),%edx
+	rorl	$11,%ecx
+	andl	%ebx,%eax
+	xorl	%esi,%ecx
+	addl	68(%esp),%edx
+	xorl	%edi,%eax
+	rorl	$2,%ecx
+	addl	%edx,%eax
+	addl	8(%esp),%edx
+	addl	%ecx,%eax
+	movl	%edx,%ecx
+	rorl	$14,%edx
+	movl	12(%esp),%esi
+	xorl	%ecx,%edx
+	movl	16(%esp),%edi
+	xorl	%edi,%esi
+	rorl	$5,%edx
+	andl	%ecx,%esi
+	movl	%ecx,8(%esp)
+	xorl	%ecx,%edx
+	xorl	%esi,%edi
+	rorl	$6,%edx
+	movl	%eax,%ecx
+	addl	%edi,%edx
+	movl	28(%esp),%edi
+	movl	%eax,%esi
+	rorl	$9,%ecx
+	movl	%eax,24(%esp)
+	xorl	%eax,%ecx
+	xorl	%edi,%eax
+	addl	20(%esp),%edx
+	rorl	$11,%ecx
+	andl	%eax,%ebx
+	xorl	%esi,%ecx
+	addl	72(%esp),%edx
+	xorl	%edi,%ebx
+	rorl	$2,%ecx
+	addl	%edx,%ebx
+	addl	4(%esp),%edx
+	addl	%ecx,%ebx
+	movl	%edx,%ecx
+	rorl	$14,%edx
+	movl	8(%esp),%esi
+	xorl	%ecx,%edx
+	movl	12(%esp),%edi
+	xorl	%edi,%esi
+	rorl	$5,%edx
+	andl	%ecx,%esi
+	movl	%ecx,4(%esp)
+	xorl	%ecx,%edx
+	xorl	%esi,%edi
+	rorl	$6,%edx
+	movl	%ebx,%ecx
+	addl	%edi,%edx
+	movl	24(%esp),%edi
+	movl	%ebx,%esi
+	rorl	$9,%ecx
+	movl	%ebx,20(%esp)
+	xorl	%ebx,%ecx
+	xorl	%edi,%ebx
+	addl	16(%esp),%edx
+	rorl	$11,%ecx
+	andl	%ebx,%eax
+	xorl	%esi,%ecx
+	addl	76(%esp),%edx
+	xorl	%edi,%eax
+	rorl	$2,%ecx
+	addl	%edx,%eax
+	addl	(%esp),%edx
+	addl	%ecx,%eax
+	movl	%edx,%ecx
+	rorl	$14,%edx
+	movl	4(%esp),%esi
+	xorl	%ecx,%edx
+	movl	8(%esp),%edi
+	xorl	%edi,%esi
+	rorl	$5,%edx
+	andl	%ecx,%esi
+	movl	%ecx,(%esp)
+	xorl	%ecx,%edx
+	xorl	%esi,%edi
+	rorl	$6,%edx
+	movl	%eax,%ecx
+	addl	%edi,%edx
+	movl	20(%esp),%edi
+	movl	%eax,%esi
+	rorl	$9,%ecx
+	movl	%eax,16(%esp)
+	xorl	%eax,%ecx
+	xorl	%edi,%eax
+	addl	12(%esp),%edx
+	rorl	$11,%ecx
+	andl	%eax,%ebx
+	xorl	%esi,%ecx
+	addl	80(%esp),%edx
+	xorl	%edi,%ebx
+	rorl	$2,%ecx
+	addl	%edx,%ebx
+	addl	28(%esp),%edx
+	addl	%ecx,%ebx
+	movl	%edx,%ecx
+	rorl	$14,%edx
+	movl	(%esp),%esi
+	xorl	%ecx,%edx
+	movl	4(%esp),%edi
+	xorl	%edi,%esi
+	rorl	$5,%edx
+	andl	%ecx,%esi
+	movl	%ecx,28(%esp)
+	xorl	%ecx,%edx
+	xorl	%esi,%edi
+	rorl	$6,%edx
+	movl	%ebx,%ecx
+	addl	%edi,%edx
+	movl	16(%esp),%edi
+	movl	%ebx,%esi
+	rorl	$9,%ecx
+	movl	%ebx,12(%esp)
+	xorl	%ebx,%ecx
+	xorl	%edi,%ebx
+	addl	8(%esp),%edx
+	rorl	$11,%ecx
+	andl	%ebx,%eax
+	xorl	%esi,%ecx
+	addl	84(%esp),%edx
+	xorl	%edi,%eax
+	rorl	$2,%ecx
+	addl	%edx,%eax
+	addl	24(%esp),%edx
+	addl	%ecx,%eax
+	movl	%edx,%ecx
+	rorl	$14,%edx
+	movl	28(%esp),%esi
+	xorl	%ecx,%edx
+	movl	(%esp),%edi
+	xorl	%edi,%esi
+	rorl	$5,%edx
+	andl	%ecx,%esi
+	movl	%ecx,24(%esp)
+	xorl	%ecx,%edx
+	xorl	%esi,%edi
+	rorl	$6,%edx
+	movl	%eax,%ecx
+	addl	%edi,%edx
+	movl	12(%esp),%edi
+	movl	%eax,%esi
+	rorl	$9,%ecx
+	movl	%eax,8(%esp)
+	xorl	%eax,%ecx
+	xorl	%edi,%eax
+	addl	4(%esp),%edx
+	rorl	$11,%ecx
+	andl	%eax,%ebx
+	xorl	%esi,%ecx
+	addl	88(%esp),%edx
+	xorl	%edi,%ebx
+	rorl	$2,%ecx
+	addl	%edx,%ebx
+	addl	20(%esp),%edx
+	addl	%ecx,%ebx
+	movl	%edx,%ecx
+	rorl	$14,%edx
+	movl	24(%esp),%esi
+	xorl	%ecx,%edx
+	movl	28(%esp),%edi
+	xorl	%edi,%esi
+	rorl	$5,%edx
+	andl	%ecx,%esi
+	movl	%ecx,20(%esp)
+	xorl	%ecx,%edx
+	xorl	%esi,%edi
+	rorl	$6,%edx
+	movl	%ebx,%ecx
+	addl	%edi,%edx
+	movl	8(%esp),%edi
+	movl	%ebx,%esi
+	rorl	$9,%ecx
+	movl	%ebx,4(%esp)
+	xorl	%ebx,%ecx
+	xorl	%edi,%ebx
+	addl	(%esp),%edx
+	rorl	$11,%ecx
+	andl	%ebx,%eax
+	xorl	%esi,%ecx
+	addl	92(%esp),%edx
+	xorl	%edi,%eax
+	rorl	$2,%ecx
+	addl	%edx,%eax
+	addl	16(%esp),%edx
+	addl	%ecx,%eax
+	movl	96(%esp),%esi
+	xorl	%edi,%ebx
+	movl	12(%esp),%ecx
+	addl	(%esi),%eax
+	addl	4(%esi),%ebx
+	addl	8(%esi),%edi
+	addl	12(%esi),%ecx
+	movl	%eax,(%esi)
+	movl	%ebx,4(%esi)
+	movl	%edi,8(%esi)
+	movl	%ecx,12(%esi)
+	movl	%ebx,4(%esp)
+	xorl	%edi,%ebx
+	movl	%edi,8(%esp)
+	movl	%ecx,12(%esp)
+	movl	20(%esp),%edi
+	movl	24(%esp),%ecx
+	addl	16(%esi),%edx
+	addl	20(%esi),%edi
+	addl	24(%esi),%ecx
+	movl	%edx,16(%esi)
+	movl	%edi,20(%esi)
+	movl	%edi,20(%esp)
+	movl	28(%esp),%edi
+	movl	%ecx,24(%esi)
+	addl	28(%esi),%edi
+	movl	%ecx,24(%esp)
+	movl	%edi,28(%esi)
+	movl	%edi,28(%esp)
+	movl	100(%esp),%edi
+	movdqa	64(%ebp),%xmm7
+	subl	$192,%ebp
+	cmpl	104(%esp),%edi
+	jb	.L010grand_ssse3
+	movl	108(%esp),%esp
+	popl	%edi
+	popl	%esi
+	popl	%ebx
+	popl	%ebp
+	ret
+.align	32
+.L004AVX:
+	leal	-96(%esp),%esp
+	vzeroall
+	movl	(%esi),%eax
+	movl	4(%esi),%ebx
+	movl	8(%esi),%ecx
+	movl	12(%esi),%edi
+	movl	%ebx,4(%esp)
+	xorl	%ecx,%ebx
+	movl	%ecx,8(%esp)
+	movl	%edi,12(%esp)
+	movl	16(%esi),%edx
+	movl	20(%esi),%edi
+	movl	24(%esi),%ecx
+	movl	28(%esi),%esi
+	movl	%edi,20(%esp)
+	movl	100(%esp),%edi
+	movl	%ecx,24(%esp)
+	movl	%esi,28(%esp)
+	vmovdqa	256(%ebp),%xmm7
+	jmp	.L012grand_avx
+.align	32
+.L012grand_avx:
+	vmovdqu	(%edi),%xmm0
+	vmovdqu	16(%edi),%xmm1
+	vmovdqu	32(%edi),%xmm2
+	vmovdqu	48(%edi),%xmm3
+	addl	$64,%edi
+	vpshufb	%xmm7,%xmm0,%xmm0
+	movl	%edi,100(%esp)
+	vpshufb	%xmm7,%xmm1,%xmm1
+	vpshufb	%xmm7,%xmm2,%xmm2
+	vpaddd	(%ebp),%xmm0,%xmm4
+	vpshufb	%xmm7,%xmm3,%xmm3
+	vpaddd	16(%ebp),%xmm1,%xmm5
+	vpaddd	32(%ebp),%xmm2,%xmm6
+	vpaddd	48(%ebp),%xmm3,%xmm7
+	vmovdqa	%xmm4,32(%esp)
+	vmovdqa	%xmm5,48(%esp)
+	vmovdqa	%xmm6,64(%esp)
+	vmovdqa	%xmm7,80(%esp)
+	jmp	.L013avx_00_47
+.align	16
+.L013avx_00_47:
+	addl	$64,%ebp
+	vpalignr	$4,%xmm0,%xmm1,%xmm4
+	movl	%edx,%ecx
+	shrdl	$14,%edx,%edx
+	movl	20(%esp),%esi
+	vpalignr	$4,%xmm2,%xmm3,%xmm7
+	xorl	%ecx,%edx
+	movl	24(%esp),%edi
+	xorl	%edi,%esi
+	vpsrld	$7,%xmm4,%xmm6
+	shrdl	$5,%edx,%edx
+	andl	%ecx,%esi
+	movl	%ecx,16(%esp)
+	vpaddd	%xmm7,%xmm0,%xmm0
+	xorl	%ecx,%edx
+	xorl	%esi,%edi
+	shrdl	$6,%edx,%edx
+	vpsrld	$3,%xmm4,%xmm7
+	movl	%eax,%ecx
+	addl	%edi,%edx
+	movl	4(%esp),%edi
+	vpslld	$14,%xmm4,%xmm5
+	movl	%eax,%esi
+	shrdl	$9,%ecx,%ecx
+	movl	%eax,(%esp)
+	vpxor	%xmm6,%xmm7,%xmm4
+	xorl	%eax,%ecx
+	xorl	%edi,%eax
+	addl	28(%esp),%edx
+	vpshufd	$250,%xmm3,%xmm7
+	shrdl	$11,%ecx,%ecx
+	andl	%eax,%ebx
+	xorl	%esi,%ecx
+	vpsrld	$11,%xmm6,%xmm6
+	addl	32(%esp),%edx
+	xorl	%edi,%ebx
+	shrdl	$2,%ecx,%ecx
+	vpxor	%xmm5,%xmm4,%xmm4
+	addl	%edx,%ebx
+	addl	12(%esp),%edx
+	addl	%ecx,%ebx
+	vpslld	$11,%xmm5,%xmm5
+	movl	%edx,%ecx
+	shrdl	$14,%edx,%edx
+	movl	16(%esp),%esi
+	vpxor	%xmm6,%xmm4,%xmm4
+	xorl	%ecx,%edx
+	movl	20(%esp),%edi
+	xorl	%edi,%esi
+	vpsrld	$10,%xmm7,%xmm6
+	shrdl	$5,%edx,%edx
+	andl	%ecx,%esi
+	movl	%ecx,12(%esp)
+	vpxor	%xmm5,%xmm4,%xmm4
+	xorl	%ecx,%edx
+	xorl	%esi,%edi
+	shrdl	$6,%edx,%edx
+	vpsrlq	$17,%xmm7,%xmm5
+	movl	%ebx,%ecx
+	addl	%edi,%edx
+	movl	(%esp),%edi
+	vpaddd	%xmm4,%xmm0,%xmm0
+	movl	%ebx,%esi
+	shrdl	$9,%ecx,%ecx
+	movl	%ebx,28(%esp)
+	vpxor	%xmm5,%xmm6,%xmm6
+	xorl	%ebx,%ecx
+	xorl	%edi,%ebx
+	addl	24(%esp),%edx
+	vpsrlq	$19,%xmm7,%xmm7
+	shrdl	$11,%ecx,%ecx
+	andl	%ebx,%eax
+	xorl	%esi,%ecx
+	vpxor	%xmm7,%xmm6,%xmm6
+	addl	36(%esp),%edx
+	xorl	%edi,%eax
+	shrdl	$2,%ecx,%ecx
+	vpshufd	$132,%xmm6,%xmm7
+	addl	%edx,%eax
+	addl	8(%esp),%edx
+	addl	%ecx,%eax
+	vpsrldq	$8,%xmm7,%xmm7
+	movl	%edx,%ecx
+	shrdl	$14,%edx,%edx
+	movl	12(%esp),%esi
+	vpaddd	%xmm7,%xmm0,%xmm0
+	xorl	%ecx,%edx
+	movl	16(%esp),%edi
+	xorl	%edi,%esi
+	vpshufd	$80,%xmm0,%xmm7
+	shrdl	$5,%edx,%edx
+	andl	%ecx,%esi
+	movl	%ecx,8(%esp)
+	vpsrld	$10,%xmm7,%xmm6
+	xorl	%ecx,%edx
+	xorl	%esi,%edi
+	shrdl	$6,%edx,%edx
+	vpsrlq	$17,%xmm7,%xmm5
+	movl	%eax,%ecx
+	addl	%edi,%edx
+	movl	28(%esp),%edi
+	vpxor	%xmm5,%xmm6,%xmm6
+	movl	%eax,%esi
+	shrdl	$9,%ecx,%ecx
+	movl	%eax,24(%esp)
+	vpsrlq	$19,%xmm7,%xmm7
+	xorl	%eax,%ecx
+	xorl	%edi,%eax
+	addl	20(%esp),%edx
+	vpxor	%xmm7,%xmm6,%xmm6
+	shrdl	$11,%ecx,%ecx
+	andl	%eax,%ebx
+	xorl	%esi,%ecx
+	vpshufd	$232,%xmm6,%xmm7
+	addl	40(%esp),%edx
+	xorl	%edi,%ebx
+	shrdl	$2,%ecx,%ecx
+	vpslldq	$8,%xmm7,%xmm7
+	addl	%edx,%ebx
+	addl	4(%esp),%edx
+	addl	%ecx,%ebx
+	vpaddd	%xmm7,%xmm0,%xmm0
+	movl	%edx,%ecx
+	shrdl	$14,%edx,%edx
+	movl	8(%esp),%esi
+	vpaddd	(%ebp),%xmm0,%xmm6
+	xorl	%ecx,%edx
+	movl	12(%esp),%edi
+	xorl	%edi,%esi
+	shrdl	$5,%edx,%edx
+	andl	%ecx,%esi
+	movl	%ecx,4(%esp)
+	xorl	%ecx,%edx
+	xorl	%esi,%edi
+	shrdl	$6,%edx,%edx
+	movl	%ebx,%ecx
+	addl	%edi,%edx
+	movl	24(%esp),%edi
+	movl	%ebx,%esi
+	shrdl	$9,%ecx,%ecx
+	movl	%ebx,20(%esp)
+	xorl	%ebx,%ecx
+	xorl	%edi,%ebx
+	addl	16(%esp),%edx
+	shrdl	$11,%ecx,%ecx
+	andl	%ebx,%eax
+	xorl	%esi,%ecx
+	addl	44(%esp),%edx
+	xorl	%edi,%eax
+	shrdl	$2,%ecx,%ecx
+	addl	%edx,%eax
+	addl	(%esp),%edx
+	addl	%ecx,%eax
+	vmovdqa	%xmm6,32(%esp)
+	vpalignr	$4,%xmm1,%xmm2,%xmm4
+	movl	%edx,%ecx
+	shrdl	$14,%edx,%edx
+	movl	4(%esp),%esi
+	vpalignr	$4,%xmm3,%xmm0,%xmm7
+	xorl	%ecx,%edx
+	movl	8(%esp),%edi
+	xorl	%edi,%esi
+	vpsrld	$7,%xmm4,%xmm6
+	shrdl	$5,%edx,%edx
+	andl	%ecx,%esi
+	movl	%ecx,(%esp)
+	vpaddd	%xmm7,%xmm1,%xmm1
+	xorl	%ecx,%edx
+	xorl	%esi,%edi
+	shrdl	$6,%edx,%edx
+	vpsrld	$3,%xmm4,%xmm7
+	movl	%eax,%ecx
+	addl	%edi,%edx
+	movl	20(%esp),%edi
+	vpslld	$14,%xmm4,%xmm5
+	movl	%eax,%esi
+	shrdl	$9,%ecx,%ecx
+	movl	%eax,16(%esp)
+	vpxor	%xmm6,%xmm7,%xmm4
+	xorl	%eax,%ecx
+	xorl	%edi,%eax
+	addl	12(%esp),%edx
+	vpshufd	$250,%xmm0,%xmm7
+	shrdl	$11,%ecx,%ecx
+	andl	%eax,%ebx
+	xorl	%esi,%ecx
+	vpsrld	$11,%xmm6,%xmm6
+	addl	48(%esp),%edx
+	xorl	%edi,%ebx
+	shrdl	$2,%ecx,%ecx
+	vpxor	%xmm5,%xmm4,%xmm4
+	addl	%edx,%ebx
+	addl	28(%esp),%edx
+	addl	%ecx,%ebx
+	vpslld	$11,%xmm5,%xmm5
+	movl	%edx,%ecx
+	shrdl	$14,%edx,%edx
+	movl	(%esp),%esi
+	vpxor	%xmm6,%xmm4,%xmm4
+	xorl	%ecx,%edx
+	movl	4(%esp),%edi
+	xorl	%edi,%esi
+	vpsrld	$10,%xmm7,%xmm6
+	shrdl	$5,%edx,%edx
+	andl	%ecx,%esi
+	movl	%ecx,28(%esp)
+	vpxor	%xmm5,%xmm4,%xmm4
+	xorl	%ecx,%edx
+	xorl	%esi,%edi
+	shrdl	$6,%edx,%edx
+	vpsrlq	$17,%xmm7,%xmm5
+	movl	%ebx,%ecx
+	addl	%edi,%edx
+	movl	16(%esp),%edi
+	vpaddd	%xmm4,%xmm1,%xmm1
+	movl	%ebx,%esi
+	shrdl	$9,%ecx,%ecx
+	movl	%ebx,12(%esp)
+	vpxor	%xmm5,%xmm6,%xmm6
+	xorl	%ebx,%ecx
+	xorl	%edi,%ebx
+	addl	8(%esp),%edx
+	vpsrlq	$19,%xmm7,%xmm7
+	shrdl	$11,%ecx,%ecx
+	andl	%ebx,%eax
+	xorl	%esi,%ecx
+	vpxor	%xmm7,%xmm6,%xmm6
+	addl	52(%esp),%edx
+	xorl	%edi,%eax
+	shrdl	$2,%ecx,%ecx
+	vpshufd	$132,%xmm6,%xmm7
+	addl	%edx,%eax
+	addl	24(%esp),%edx
+	addl	%ecx,%eax
+	vpsrldq	$8,%xmm7,%xmm7
+	movl	%edx,%ecx
+	shrdl	$14,%edx,%edx
+	movl	28(%esp),%esi
+	vpaddd	%xmm7,%xmm1,%xmm1
+	xorl	%ecx,%edx
+	movl	(%esp),%edi
+	xorl	%edi,%esi
+	vpshufd	$80,%xmm1,%xmm7
+	shrdl	$5,%edx,%edx
+	andl	%ecx,%esi
+	movl	%ecx,24(%esp)
+	vpsrld	$10,%xmm7,%xmm6
+	xorl	%ecx,%edx
+	xorl	%esi,%edi
+	shrdl	$6,%edx,%edx
+	vpsrlq	$17,%xmm7,%xmm5
+	movl	%eax,%ecx
+	addl	%edi,%edx
+	movl	12(%esp),%edi
+	vpxor	%xmm5,%xmm6,%xmm6
+	movl	%eax,%esi
+	shrdl	$9,%ecx,%ecx
+	movl	%eax,8(%esp)
+	vpsrlq	$19,%xmm7,%xmm7
+	xorl	%eax,%ecx
+	xorl	%edi,%eax
+	addl	4(%esp),%edx
+	vpxor	%xmm7,%xmm6,%xmm6
+	shrdl	$11,%ecx,%ecx
+	andl	%eax,%ebx
+	xorl	%esi,%ecx
+	vpshufd	$232,%xmm6,%xmm7
+	addl	56(%esp),%edx
+	xorl	%edi,%ebx
+	shrdl	$2,%ecx,%ecx
+	vpslldq	$8,%xmm7,%xmm7
+	addl	%edx,%ebx
+	addl	20(%esp),%edx
+	addl	%ecx,%ebx
+	vpaddd	%xmm7,%xmm1,%xmm1
+	movl	%edx,%ecx
+	shrdl	$14,%edx,%edx
+	movl	24(%esp),%esi
+	vpaddd	16(%ebp),%xmm1,%xmm6
+	xorl	%ecx,%edx
+	movl	28(%esp),%edi
+	xorl	%edi,%esi
+	shrdl	$5,%edx,%edx
+	andl	%ecx,%esi
+	movl	%ecx,20(%esp)
+	xorl	%ecx,%edx
+	xorl	%esi,%edi
+	shrdl	$6,%edx,%edx
+	movl	%ebx,%ecx
+	addl	%edi,%edx
+	movl	8(%esp),%edi
+	movl	%ebx,%esi
+	shrdl	$9,%ecx,%ecx
+	movl	%ebx,4(%esp)
+	xorl	%ebx,%ecx
+	xorl	%edi,%ebx
+	addl	(%esp),%edx
+	shrdl	$11,%ecx,%ecx
+	andl	%ebx,%eax
+	xorl	%esi,%ecx
+	addl	60(%esp),%edx
+	xorl	%edi,%eax
+	shrdl	$2,%ecx,%ecx
+	addl	%edx,%eax
+	addl	16(%esp),%edx
+	addl	%ecx,%eax
+	vmovdqa	%xmm6,48(%esp)
+	vpalignr	$4,%xmm2,%xmm3,%xmm4
+	movl	%edx,%ecx
+	shrdl	$14,%edx,%edx
+	movl	20(%esp),%esi
+	vpalignr	$4,%xmm0,%xmm1,%xmm7
+	xorl	%ecx,%edx
+	movl	24(%esp),%edi
+	xorl	%edi,%esi
+	vpsrld	$7,%xmm4,%xmm6
+	shrdl	$5,%edx,%edx
+	andl	%ecx,%esi
+	movl	%ecx,16(%esp)
+	vpaddd	%xmm7,%xmm2,%xmm2
+	xorl	%ecx,%edx
+	xorl	%esi,%edi
+	shrdl	$6,%edx,%edx
+	vpsrld	$3,%xmm4,%xmm7
+	movl	%eax,%ecx
+	addl	%edi,%edx
+	movl	4(%esp),%edi
+	vpslld	$14,%xmm4,%xmm5
+	movl	%eax,%esi
+	shrdl	$9,%ecx,%ecx
+	movl	%eax,(%esp)
+	vpxor	%xmm6,%xmm7,%xmm4
+	xorl	%eax,%ecx
+	xorl	%edi,%eax
+	addl	28(%esp),%edx
+	vpshufd	$250,%xmm1,%xmm7
+	shrdl	$11,%ecx,%ecx
+	andl	%eax,%ebx
+	xorl	%esi,%ecx
+	vpsrld	$11,%xmm6,%xmm6
+	addl	64(%esp),%edx
+	xorl	%edi,%ebx
+	shrdl	$2,%ecx,%ecx
+	vpxor	%xmm5,%xmm4,%xmm4
+	addl	%edx,%ebx
+	addl	12(%esp),%edx
+	addl	%ecx,%ebx
+	vpslld	$11,%xmm5,%xmm5
+	movl	%edx,%ecx
+	shrdl	$14,%edx,%edx
+	movl	16(%esp),%esi
+	vpxor	%xmm6,%xmm4,%xmm4
+	xorl	%ecx,%edx
+	movl	20(%esp),%edi
+	xorl	%edi,%esi
+	vpsrld	$10,%xmm7,%xmm6
+	shrdl	$5,%edx,%edx
+	andl	%ecx,%esi
+	movl	%ecx,12(%esp)
+	vpxor	%xmm5,%xmm4,%xmm4
+	xorl	%ecx,%edx
+	xorl	%esi,%edi
+	shrdl	$6,%edx,%edx
+	vpsrlq	$17,%xmm7,%xmm5
+	movl	%ebx,%ecx
+	addl	%edi,%edx
+	movl	(%esp),%edi
+	vpaddd	%xmm4,%xmm2,%xmm2
+	movl	%ebx,%esi
+	shrdl	$9,%ecx,%ecx
+	movl	%ebx,28(%esp)
+	vpxor	%xmm5,%xmm6,%xmm6
+	xorl	%ebx,%ecx
+	xorl	%edi,%ebx
+	addl	24(%esp),%edx
+	vpsrlq	$19,%xmm7,%xmm7
+	shrdl	$11,%ecx,%ecx
+	andl	%ebx,%eax
+	xorl	%esi,%ecx
+	vpxor	%xmm7,%xmm6,%xmm6
+	addl	68(%esp),%edx
+	xorl	%edi,%eax
+	shrdl	$2,%ecx,%ecx
+	vpshufd	$132,%xmm6,%xmm7
+	addl	%edx,%eax
+	addl	8(%esp),%edx
+	addl	%ecx,%eax
+	vpsrldq	$8,%xmm7,%xmm7
+	movl	%edx,%ecx
+	shrdl	$14,%edx,%edx
+	movl	12(%esp),%esi
+	vpaddd	%xmm7,%xmm2,%xmm2
+	xorl	%ecx,%edx
+	movl	16(%esp),%edi
+	xorl	%edi,%esi
+	vpshufd	$80,%xmm2,%xmm7
+	shrdl	$5,%edx,%edx
+	andl	%ecx,%esi
+	movl	%ecx,8(%esp)
+	vpsrld	$10,%xmm7,%xmm6
+	xorl	%ecx,%edx
+	xorl	%esi,%edi
+	shrdl	$6,%edx,%edx
+	vpsrlq	$17,%xmm7,%xmm5
+	movl	%eax,%ecx
+	addl	%edi,%edx
+	movl	28(%esp),%edi
+	vpxor	%xmm5,%xmm6,%xmm6
+	movl	%eax,%esi
+	shrdl	$9,%ecx,%ecx
+	movl	%eax,24(%esp)
+	vpsrlq	$19,%xmm7,%xmm7
+	xorl	%eax,%ecx
+	xorl	%edi,%eax
+	addl	20(%esp),%edx
+	vpxor	%xmm7,%xmm6,%xmm6
+	shrdl	$11,%ecx,%ecx
+	andl	%eax,%ebx
+	xorl	%esi,%ecx
+	vpshufd	$232,%xmm6,%xmm7
+	addl	72(%esp),%edx
+	xorl	%edi,%ebx
+	shrdl	$2,%ecx,%ecx
+	vpslldq	$8,%xmm7,%xmm7
+	addl	%edx,%ebx
+	addl	4(%esp),%edx
+	addl	%ecx,%ebx
+	vpaddd	%xmm7,%xmm2,%xmm2
+	movl	%edx,%ecx
+	shrdl	$14,%edx,%edx
+	movl	8(%esp),%esi
+	vpaddd	32(%ebp),%xmm2,%xmm6
+	xorl	%ecx,%edx
+	movl	12(%esp),%edi
+	xorl	%edi,%esi
+	shrdl	$5,%edx,%edx
+	andl	%ecx,%esi
+	movl	%ecx,4(%esp)
+	xorl	%ecx,%edx
+	xorl	%esi,%edi
+	shrdl	$6,%edx,%edx
+	movl	%ebx,%ecx
+	addl	%edi,%edx
+	movl	24(%esp),%edi
+	movl	%ebx,%esi
+	shrdl	$9,%ecx,%ecx
+	movl	%ebx,20(%esp)
+	xorl	%ebx,%ecx
+	xorl	%edi,%ebx
+	addl	16(%esp),%edx
+	shrdl	$11,%ecx,%ecx
+	andl	%ebx,%eax
+	xorl	%esi,%ecx
+	addl	76(%esp),%edx
+	xorl	%edi,%eax
+	shrdl	$2,%ecx,%ecx
+	addl	%edx,%eax
+	addl	(%esp),%edx
+	addl	%ecx,%eax
+	vmovdqa	%xmm6,64(%esp)
+	vpalignr	$4,%xmm3,%xmm0,%xmm4
+	movl	%edx,%ecx
+	shrdl	$14,%edx,%edx
+	movl	4(%esp),%esi
+	vpalignr	$4,%xmm1,%xmm2,%xmm7
+	xorl	%ecx,%edx
+	movl	8(%esp),%edi
+	xorl	%edi,%esi
+	vpsrld	$7,%xmm4,%xmm6
+	shrdl	$5,%edx,%edx
+	andl	%ecx,%esi
+	movl	%ecx,(%esp)
+	vpaddd	%xmm7,%xmm3,%xmm3
+	xorl	%ecx,%edx
+	xorl	%esi,%edi
+	shrdl	$6,%edx,%edx
+	vpsrld	$3,%xmm4,%xmm7
+	movl	%eax,%ecx
+	addl	%edi,%edx
+	movl	20(%esp),%edi
+	vpslld	$14,%xmm4,%xmm5
+	movl	%eax,%esi
+	shrdl	$9,%ecx,%ecx
+	movl	%eax,16(%esp)
+	vpxor	%xmm6,%xmm7,%xmm4
+	xorl	%eax,%ecx
+	xorl	%edi,%eax
+	addl	12(%esp),%edx
+	vpshufd	$250,%xmm2,%xmm7
+	shrdl	$11,%ecx,%ecx
+	andl	%eax,%ebx
+	xorl	%esi,%ecx
+	vpsrld	$11,%xmm6,%xmm6
+	addl	80(%esp),%edx
+	xorl	%edi,%ebx
+	shrdl	$2,%ecx,%ecx
+	vpxor	%xmm5,%xmm4,%xmm4
+	addl	%edx,%ebx
+	addl	28(%esp),%edx
+	addl	%ecx,%ebx
+	vpslld	$11,%xmm5,%xmm5
+	movl	%edx,%ecx
+	shrdl	$14,%edx,%edx
+	movl	(%esp),%esi
+	vpxor	%xmm6,%xmm4,%xmm4
+	xorl	%ecx,%edx
+	movl	4(%esp),%edi
+	xorl	%edi,%esi
+	vpsrld	$10,%xmm7,%xmm6
+	shrdl	$5,%edx,%edx
+	andl	%ecx,%esi
+	movl	%ecx,28(%esp)
+	vpxor	%xmm5,%xmm4,%xmm4
+	xorl	%ecx,%edx
+	xorl	%esi,%edi
+	shrdl	$6,%edx,%edx
+	vpsrlq	$17,%xmm7,%xmm5
+	movl	%ebx,%ecx
+	addl	%edi,%edx
+	movl	16(%esp),%edi
+	vpaddd	%xmm4,%xmm3,%xmm3
+	movl	%ebx,%esi
+	shrdl	$9,%ecx,%ecx
+	movl	%ebx,12(%esp)
+	vpxor	%xmm5,%xmm6,%xmm6
+	xorl	%ebx,%ecx
+	xorl	%edi,%ebx
+	addl	8(%esp),%edx
+	vpsrlq	$19,%xmm7,%xmm7
+	shrdl	$11,%ecx,%ecx
+	andl	%ebx,%eax
+	xorl	%esi,%ecx
+	vpxor	%xmm7,%xmm6,%xmm6
+	addl	84(%esp),%edx
+	xorl	%edi,%eax
+	shrdl	$2,%ecx,%ecx
+	vpshufd	$132,%xmm6,%xmm7
+	addl	%edx,%eax
+	addl	24(%esp),%edx
+	addl	%ecx,%eax
+	vpsrldq	$8,%xmm7,%xmm7
+	movl	%edx,%ecx
+	shrdl	$14,%edx,%edx
+	movl	28(%esp),%esi
+	vpaddd	%xmm7,%xmm3,%xmm3
+	xorl	%ecx,%edx
+	movl	(%esp),%edi
+	xorl	%edi,%esi
+	vpshufd	$80,%xmm3,%xmm7
+	shrdl	$5,%edx,%edx
+	andl	%ecx,%esi
+	movl	%ecx,24(%esp)
+	vpsrld	$10,%xmm7,%xmm6
+	xorl	%ecx,%edx
+	xorl	%esi,%edi
+	shrdl	$6,%edx,%edx
+	vpsrlq	$17,%xmm7,%xmm5
+	movl	%eax,%ecx
+	addl	%edi,%edx
+	movl	12(%esp),%edi
+	vpxor	%xmm5,%xmm6,%xmm6
+	movl	%eax,%esi
+	shrdl	$9,%ecx,%ecx
+	movl	%eax,8(%esp)
+	vpsrlq	$19,%xmm7,%xmm7
+	xorl	%eax,%ecx
+	xorl	%edi,%eax
+	addl	4(%esp),%edx
+	vpxor	%xmm7,%xmm6,%xmm6
+	shrdl	$11,%ecx,%ecx
+	andl	%eax,%ebx
+	xorl	%esi,%ecx
+	vpshufd	$232,%xmm6,%xmm7
+	addl	88(%esp),%edx
+	xorl	%edi,%ebx
+	shrdl	$2,%ecx,%ecx
+	vpslldq	$8,%xmm7,%xmm7
+	addl	%edx,%ebx
+	addl	20(%esp),%edx
+	addl	%ecx,%ebx
+	vpaddd	%xmm7,%xmm3,%xmm3
+	movl	%edx,%ecx
+	shrdl	$14,%edx,%edx
+	movl	24(%esp),%esi
+	vpaddd	48(%ebp),%xmm3,%xmm6
+	xorl	%ecx,%edx
+	movl	28(%esp),%edi
+	xorl	%edi,%esi
+	shrdl	$5,%edx,%edx
+	andl	%ecx,%esi
+	movl	%ecx,20(%esp)
+	xorl	%ecx,%edx
+	xorl	%esi,%edi
+	shrdl	$6,%edx,%edx
+	movl	%ebx,%ecx
+	addl	%edi,%edx
+	movl	8(%esp),%edi
+	movl	%ebx,%esi
+	shrdl	$9,%ecx,%ecx
+	movl	%ebx,4(%esp)
+	xorl	%ebx,%ecx
+	xorl	%edi,%ebx
+	addl	(%esp),%edx
+	shrdl	$11,%ecx,%ecx
+	andl	%ebx,%eax
+	xorl	%esi,%ecx
+	addl	92(%esp),%edx
+	xorl	%edi,%eax
+	shrdl	$2,%ecx,%ecx
+	addl	%edx,%eax
+	addl	16(%esp),%edx
+	addl	%ecx,%eax
+	vmovdqa	%xmm6,80(%esp)
+	cmpl	$66051,64(%ebp)
+	jne	.L013avx_00_47
+	movl	%edx,%ecx
+	shrdl	$14,%edx,%edx
+	movl	20(%esp),%esi
+	xorl	%ecx,%edx
+	movl	24(%esp),%edi
+	xorl	%edi,%esi
+	shrdl	$5,%edx,%edx
+	andl	%ecx,%esi
+	movl	%ecx,16(%esp)
+	xorl	%ecx,%edx
+	xorl	%esi,%edi
+	shrdl	$6,%edx,%edx
+	movl	%eax,%ecx
+	addl	%edi,%edx
+	movl	4(%esp),%edi
+	movl	%eax,%esi
+	shrdl	$9,%ecx,%ecx
+	movl	%eax,(%esp)
+	xorl	%eax,%ecx
+	xorl	%edi,%eax
+	addl	28(%esp),%edx
+	shrdl	$11,%ecx,%ecx
+	andl	%eax,%ebx
+	xorl	%esi,%ecx
+	addl	32(%esp),%edx
+	xorl	%edi,%ebx
+	shrdl	$2,%ecx,%ecx
+	addl	%edx,%ebx
+	addl	12(%esp),%edx
+	addl	%ecx,%ebx
+	movl	%edx,%ecx
+	shrdl	$14,%edx,%edx
+	movl	16(%esp),%esi
+	xorl	%ecx,%edx
+	movl	20(%esp),%edi
+	xorl	%edi,%esi
+	shrdl	$5,%edx,%edx
+	andl	%ecx,%esi
+	movl	%ecx,12(%esp)
+	xorl	%ecx,%edx
+	xorl	%esi,%edi
+	shrdl	$6,%edx,%edx
+	movl	%ebx,%ecx
+	addl	%edi,%edx
+	movl	(%esp),%edi
+	movl	%ebx,%esi
+	shrdl	$9,%ecx,%ecx
+	movl	%ebx,28(%esp)
+	xorl	%ebx,%ecx
+	xorl	%edi,%ebx
+	addl	24(%esp),%edx
+	shrdl	$11,%ecx,%ecx
+	andl	%ebx,%eax
+	xorl	%esi,%ecx
+	addl	36(%esp),%edx
+	xorl	%edi,%eax
+	shrdl	$2,%ecx,%ecx
+	addl	%edx,%eax
+	addl	8(%esp),%edx
+	addl	%ecx,%eax
+	movl	%edx,%ecx
+	shrdl	$14,%edx,%edx
+	movl	12(%esp),%esi
+	xorl	%ecx,%edx
+	movl	16(%esp),%edi
+	xorl	%edi,%esi
+	shrdl	$5,%edx,%edx
+	andl	%ecx,%esi
+	movl	%ecx,8(%esp)
+	xorl	%ecx,%edx
+	xorl	%esi,%edi
+	shrdl	$6,%edx,%edx
+	movl	%eax,%ecx
+	addl	%edi,%edx
+	movl	28(%esp),%edi
+	movl	%eax,%esi
+	shrdl	$9,%ecx,%ecx
+	movl	%eax,24(%esp)
+	xorl	%eax,%ecx
+	xorl	%edi,%eax
+	addl	20(%esp),%edx
+	shrdl	$11,%ecx,%ecx
+	andl	%eax,%ebx
+	xorl	%esi,%ecx
+	addl	40(%esp),%edx
+	xorl	%edi,%ebx
+	shrdl	$2,%ecx,%ecx
+	addl	%edx,%ebx
+	addl	4(%esp),%edx
+	addl	%ecx,%ebx
+	movl	%edx,%ecx
+	shrdl	$14,%edx,%edx
+	movl	8(%esp),%esi
+	xorl	%ecx,%edx
+	movl	12(%esp),%edi
+	xorl	%edi,%esi
+	shrdl	$5,%edx,%edx
+	andl	%ecx,%esi
+	movl	%ecx,4(%esp)
+	xorl	%ecx,%edx
+	xorl	%esi,%edi
+	shrdl	$6,%edx,%edx
+	movl	%ebx,%ecx
+	addl	%edi,%edx
+	movl	24(%esp),%edi
+	movl	%ebx,%esi
+	shrdl	$9,%ecx,%ecx
+	movl	%ebx,20(%esp)
+	xorl	%ebx,%ecx
+	xorl	%edi,%ebx
+	addl	16(%esp),%edx
+	shrdl	$11,%ecx,%ecx
+	andl	%ebx,%eax
+	xorl	%esi,%ecx
+	addl	44(%esp),%edx
+	xorl	%edi,%eax
+	shrdl	$2,%ecx,%ecx
+	addl	%edx,%eax
+	addl	(%esp),%edx
+	addl	%ecx,%eax
+	movl	%edx,%ecx
+	shrdl	$14,%edx,%edx
+	movl	4(%esp),%esi
+	xorl	%ecx,%edx
+	movl	8(%esp),%edi
+	xorl	%edi,%esi
+	shrdl	$5,%edx,%edx
+	andl	%ecx,%esi
+	movl	%ecx,(%esp)
+	xorl	%ecx,%edx
+	xorl	%esi,%edi
+	shrdl	$6,%edx,%edx
+	movl	%eax,%ecx
+	addl	%edi,%edx
+	movl	20(%esp),%edi
+	movl	%eax,%esi
+	shrdl	$9,%ecx,%ecx
+	movl	%eax,16(%esp)
+	xorl	%eax,%ecx
+	xorl	%edi,%eax
+	addl	12(%esp),%edx
+	shrdl	$11,%ecx,%ecx
+	andl	%eax,%ebx
+	xorl	%esi,%ecx
+	addl	48(%esp),%edx
+	xorl	%edi,%ebx
+	shrdl	$2,%ecx,%ecx
+	addl	%edx,%ebx
+	addl	28(%esp),%edx
+	addl	%ecx,%ebx
+	movl	%edx,%ecx
+	shrdl	$14,%edx,%edx
+	movl	(%esp),%esi
+	xorl	%ecx,%edx
+	movl	4(%esp),%edi
+	xorl	%edi,%esi
+	shrdl	$5,%edx,%edx
+	andl	%ecx,%esi
+	movl	%ecx,28(%esp)
+	xorl	%ecx,%edx
+	xorl	%esi,%edi
+	shrdl	$6,%edx,%edx
+	movl	%ebx,%ecx
+	addl	%edi,%edx
+	movl	16(%esp),%edi
+	movl	%ebx,%esi
+	shrdl	$9,%ecx,%ecx
+	movl	%ebx,12(%esp)
+	xorl	%ebx,%ecx
+	xorl	%edi,%ebx
+	addl	8(%esp),%edx
+	shrdl	$11,%ecx,%ecx
+	andl	%ebx,%eax
+	xorl	%esi,%ecx
+	addl	52(%esp),%edx
+	xorl	%edi,%eax
+	shrdl	$2,%ecx,%ecx
+	addl	%edx,%eax
+	addl	24(%esp),%edx
+	addl	%ecx,%eax
+	movl	%edx,%ecx
+	shrdl	$14,%edx,%edx
+	movl	28(%esp),%esi
+	xorl	%ecx,%edx
+	movl	(%esp),%edi
+	xorl	%edi,%esi
+	shrdl	$5,%edx,%edx
+	andl	%ecx,%esi
+	movl	%ecx,24(%esp)
+	xorl	%ecx,%edx
+	xorl	%esi,%edi
+	shrdl	$6,%edx,%edx
+	movl	%eax,%ecx
+	addl	%edi,%edx
+	movl	12(%esp),%edi
+	movl	%eax,%esi
+	shrdl	$9,%ecx,%ecx
+	movl	%eax,8(%esp)
+	xorl	%eax,%ecx
+	xorl	%edi,%eax
+	addl	4(%esp),%edx
+	shrdl	$11,%ecx,%ecx
+	andl	%eax,%ebx
+	xorl	%esi,%ecx
+	addl	56(%esp),%edx
+	xorl	%edi,%ebx
+	shrdl	$2,%ecx,%ecx
+	addl	%edx,%ebx
+	addl	20(%esp),%edx
+	addl	%ecx,%ebx
+	movl	%edx,%ecx
+	shrdl	$14,%edx,%edx
+	movl	24(%esp),%esi
+	xorl	%ecx,%edx
+	movl	28(%esp),%edi
+	xorl	%edi,%esi
+	shrdl	$5,%edx,%edx
+	andl	%ecx,%esi
+	movl	%ecx,20(%esp)
+	xorl	%ecx,%edx
+	xorl	%esi,%edi
+	shrdl	$6,%edx,%edx
+	movl	%ebx,%ecx
+	addl	%edi,%edx
+	movl	8(%esp),%edi
+	movl	%ebx,%esi
+	shrdl	$9,%ecx,%ecx
+	movl	%ebx,4(%esp)
+	xorl	%ebx,%ecx
+	xorl	%edi,%ebx
+	addl	(%esp),%edx
+	shrdl	$11,%ecx,%ecx
+	andl	%ebx,%eax
+	xorl	%esi,%ecx
+	addl	60(%esp),%edx
+	xorl	%edi,%eax
+	shrdl	$2,%ecx,%ecx
+	addl	%edx,%eax
+	addl	16(%esp),%edx
+	addl	%ecx,%eax
+	movl	%edx,%ecx
+	shrdl	$14,%edx,%edx
+	movl	20(%esp),%esi
+	xorl	%ecx,%edx
+	movl	24(%esp),%edi
+	xorl	%edi,%esi
+	shrdl	$5,%edx,%edx
+	andl	%ecx,%esi
+	movl	%ecx,16(%esp)
+	xorl	%ecx,%edx
+	xorl	%esi,%edi
+	shrdl	$6,%edx,%edx
+	movl	%eax,%ecx
+	addl	%edi,%edx
+	movl	4(%esp),%edi
+	movl	%eax,%esi
+	shrdl	$9,%ecx,%ecx
+	movl	%eax,(%esp)
+	xorl	%eax,%ecx
+	xorl	%edi,%eax
+	addl	28(%esp),%edx
+	shrdl	$11,%ecx,%ecx
+	andl	%eax,%ebx
+	xorl	%esi,%ecx
+	addl	64(%esp),%edx
+	xorl	%edi,%ebx
+	shrdl	$2,%ecx,%ecx
+	addl	%edx,%ebx
+	addl	12(%esp),%edx
+	addl	%ecx,%ebx
+	movl	%edx,%ecx
+	shrdl	$14,%edx,%edx
+	movl	16(%esp),%esi
+	xorl	%ecx,%edx
+	movl	20(%esp),%edi
+	xorl	%edi,%esi
+	shrdl	$5,%edx,%edx
+	andl	%ecx,%esi
+	movl	%ecx,12(%esp)
+	xorl	%ecx,%edx
+	xorl	%esi,%edi
+	shrdl	$6,%edx,%edx
+	movl	%ebx,%ecx
+	addl	%edi,%edx
+	movl	(%esp),%edi
+	movl	%ebx,%esi
+	shrdl	$9,%ecx,%ecx
+	movl	%ebx,28(%esp)
+	xorl	%ebx,%ecx
+	xorl	%edi,%ebx
+	addl	24(%esp),%edx
+	shrdl	$11,%ecx,%ecx
+	andl	%ebx,%eax
+	xorl	%esi,%ecx
+	addl	68(%esp),%edx
+	xorl	%edi,%eax
+	shrdl	$2,%ecx,%ecx
+	addl	%edx,%eax
+	addl	8(%esp),%edx
+	addl	%ecx,%eax
+	movl	%edx,%ecx
+	shrdl	$14,%edx,%edx
+	movl	12(%esp),%esi
+	xorl	%ecx,%edx
+	movl	16(%esp),%edi
+	xorl	%edi,%esi
+	shrdl	$5,%edx,%edx
+	andl	%ecx,%esi
+	movl	%ecx,8(%esp)
+	xorl	%ecx,%edx
+	xorl	%esi,%edi
+	shrdl	$6,%edx,%edx
+	movl	%eax,%ecx
+	addl	%edi,%edx
+	movl	28(%esp),%edi
+	movl	%eax,%esi
+	shrdl	$9,%ecx,%ecx
+	movl	%eax,24(%esp)
+	xorl	%eax,%ecx
+	xorl	%edi,%eax
+	addl	20(%esp),%edx
+	shrdl	$11,%ecx,%ecx
+	andl	%eax,%ebx
+	xorl	%esi,%ecx
+	addl	72(%esp),%edx
+	xorl	%edi,%ebx
+	shrdl	$2,%ecx,%ecx
+	addl	%edx,%ebx
+	addl	4(%esp),%edx
+	addl	%ecx,%ebx
+	movl	%edx,%ecx
+	shrdl	$14,%edx,%edx
+	movl	8(%esp),%esi
+	xorl	%ecx,%edx
+	movl	12(%esp),%edi
+	xorl	%edi,%esi
+	shrdl	$5,%edx,%edx
+	andl	%ecx,%esi
+	movl	%ecx,4(%esp)
+	xorl	%ecx,%edx
+	xorl	%esi,%edi
+	shrdl	$6,%edx,%edx
+	movl	%ebx,%ecx
+	addl	%edi,%edx
+	movl	24(%esp),%edi
+	movl	%ebx,%esi
+	shrdl	$9,%ecx,%ecx
+	movl	%ebx,20(%esp)
+	xorl	%ebx,%ecx
+	xorl	%edi,%ebx
+	addl	16(%esp),%edx
+	shrdl	$11,%ecx,%ecx
+	andl	%ebx,%eax
+	xorl	%esi,%ecx
+	addl	76(%esp),%edx
+	xorl	%edi,%eax
+	shrdl	$2,%ecx,%ecx
+	addl	%edx,%eax
+	addl	(%esp),%edx
+	addl	%ecx,%eax
+	movl	%edx,%ecx
+	shrdl	$14,%edx,%edx
+	movl	4(%esp),%esi
+	xorl	%ecx,%edx
+	movl	8(%esp),%edi
+	xorl	%edi,%esi
+	shrdl	$5,%edx,%edx
+	andl	%ecx,%esi
+	movl	%ecx,(%esp)
+	xorl	%ecx,%edx
+	xorl	%esi,%edi
+	shrdl	$6,%edx,%edx
+	movl	%eax,%ecx
+	addl	%edi,%edx
+	movl	20(%esp),%edi
+	movl	%eax,%esi
+	shrdl	$9,%ecx,%ecx
+	movl	%eax,16(%esp)
+	xorl	%eax,%ecx
+	xorl	%edi,%eax
+	addl	12(%esp),%edx
+	shrdl	$11,%ecx,%ecx
+	andl	%eax,%ebx
+	xorl	%esi,%ecx
+	addl	80(%esp),%edx
+	xorl	%edi,%ebx
+	shrdl	$2,%ecx,%ecx
+	addl	%edx,%ebx
+	addl	28(%esp),%edx
+	addl	%ecx,%ebx
+	movl	%edx,%ecx
+	shrdl	$14,%edx,%edx
+	movl	(%esp),%esi
+	xorl	%ecx,%edx
+	movl	4(%esp),%edi
+	xorl	%edi,%esi
+	shrdl	$5,%edx,%edx
+	andl	%ecx,%esi
+	movl	%ecx,28(%esp)
+	xorl	%ecx,%edx
+	xorl	%esi,%edi
+	shrdl	$6,%edx,%edx
+	movl	%ebx,%ecx
+	addl	%edi,%edx
+	movl	16(%esp),%edi
+	movl	%ebx,%esi
+	shrdl	$9,%ecx,%ecx
+	movl	%ebx,12(%esp)
+	xorl	%ebx,%ecx
+	xorl	%edi,%ebx
+	addl	8(%esp),%edx
+	shrdl	$11,%ecx,%ecx
+	andl	%ebx,%eax
+	xorl	%esi,%ecx
+	addl	84(%esp),%edx
+	xorl	%edi,%eax
+	shrdl	$2,%ecx,%ecx
+	addl	%edx,%eax
+	addl	24(%esp),%edx
+	addl	%ecx,%eax
+	movl	%edx,%ecx
+	shrdl	$14,%edx,%edx
+	movl	28(%esp),%esi
+	xorl	%ecx,%edx
+	movl	(%esp),%edi
+	xorl	%edi,%esi
+	shrdl	$5,%edx,%edx
+	andl	%ecx,%esi
+	movl	%ecx,24(%esp)
+	xorl	%ecx,%edx
+	xorl	%esi,%edi
+	shrdl	$6,%edx,%edx
+	movl	%eax,%ecx
+	addl	%edi,%edx
+	movl	12(%esp),%edi
+	movl	%eax,%esi
+	shrdl	$9,%ecx,%ecx
+	movl	%eax,8(%esp)
+	xorl	%eax,%ecx
+	xorl	%edi,%eax
+	addl	4(%esp),%edx
+	shrdl	$11,%ecx,%ecx
+	andl	%eax,%ebx
+	xorl	%esi,%ecx
+	addl	88(%esp),%edx
+	xorl	%edi,%ebx
+	shrdl	$2,%ecx,%ecx
+	addl	%edx,%ebx
+	addl	20(%esp),%edx
+	addl	%ecx,%ebx
+	movl	%edx,%ecx
+	shrdl	$14,%edx,%edx
+	movl	24(%esp),%esi
+	xorl	%ecx,%edx
+	movl	28(%esp),%edi
+	xorl	%edi,%esi
+	shrdl	$5,%edx,%edx
+	andl	%ecx,%esi
+	movl	%ecx,20(%esp)
+	xorl	%ecx,%edx
+	xorl	%esi,%edi
+	shrdl	$6,%edx,%edx
+	movl	%ebx,%ecx
+	addl	%edi,%edx
+	movl	8(%esp),%edi
+	movl	%ebx,%esi
+	shrdl	$9,%ecx,%ecx
+	movl	%ebx,4(%esp)
+	xorl	%ebx,%ecx
+	xorl	%edi,%ebx
+	addl	(%esp),%edx
+	shrdl	$11,%ecx,%ecx
+	andl	%ebx,%eax
+	xorl	%esi,%ecx
+	addl	92(%esp),%edx
+	xorl	%edi,%eax
+	shrdl	$2,%ecx,%ecx
+	addl	%edx,%eax
+	addl	16(%esp),%edx
+	addl	%ecx,%eax
+	movl	96(%esp),%esi
+	xorl	%edi,%ebx
+	movl	12(%esp),%ecx
+	addl	(%esi),%eax
+	addl	4(%esi),%ebx
+	addl	8(%esi),%edi
+	addl	12(%esi),%ecx
+	movl	%eax,(%esi)
+	movl	%ebx,4(%esi)
+	movl	%edi,8(%esi)
+	movl	%ecx,12(%esi)
+	movl	%ebx,4(%esp)
+	xorl	%edi,%ebx
+	movl	%edi,8(%esp)
+	movl	%ecx,12(%esp)
+	movl	20(%esp),%edi
+	movl	24(%esp),%ecx
+	addl	16(%esi),%edx
+	addl	20(%esi),%edi
+	addl	24(%esi),%ecx
+	movl	%edx,16(%esi)
+	movl	%edi,20(%esi)
+	movl	%edi,20(%esp)
+	movl	28(%esp),%edi
+	movl	%ecx,24(%esi)
+	addl	28(%esi),%edi
+	movl	%ecx,24(%esp)
+	movl	%edi,28(%esi)
+	movl	%edi,28(%esp)
+	movl	100(%esp),%edi
+	vmovdqa	64(%ebp),%xmm7
+	subl	$192,%ebp
+	cmpl	104(%esp),%edi
+	jb	.L012grand_avx
+	movl	108(%esp),%esp
+	vzeroall
+	popl	%edi
+	popl	%esi
+	popl	%ebx
+	popl	%ebp
+	ret
+.size	sha256_block_data_order,.-.L_sha256_block_data_order_begin
+#endif
+.section	.note.GNU-stack,"",@progbits
--- /dev/null
+++ b/third_party/boringssl/linux-x86/crypto/fipsmodule/sha512-586.S
@@ -1,0 +1,2837 @@
+// This file is generated from a similarly-named Perl script in the BoringSSL
+// source tree. Do not edit by hand.
+
+#if defined(__i386__)
+#if defined(BORINGSSL_PREFIX)
+#include <boringssl_prefix_symbols_asm.h>
+#endif
+.text
+.globl	sha512_block_data_order
+.hidden	sha512_block_data_order
+.type	sha512_block_data_order,@function
+.align	16
+sha512_block_data_order:
+.L_sha512_block_data_order_begin:
+	pushl	%ebp
+	pushl	%ebx
+	pushl	%esi
+	pushl	%edi
+	movl	20(%esp),%esi
+	movl	24(%esp),%edi
+	movl	28(%esp),%eax
+	movl	%esp,%ebx
+	call	.L000pic_point
+.L000pic_point:
+	popl	%ebp
+	leal	.L001K512-.L000pic_point(%ebp),%ebp
+	subl	$16,%esp
+	andl	$-64,%esp
+	shll	$7,%eax
+	addl	%edi,%eax
+	movl	%esi,(%esp)
+	movl	%edi,4(%esp)
+	movl	%eax,8(%esp)
+	movl	%ebx,12(%esp)
+	leal	OPENSSL_ia32cap_P-.L001K512(%ebp),%edx
+	movl	(%edx),%ecx
+	testl	$67108864,%ecx
+	jz	.L002loop_x86
+	movl	4(%edx),%edx
+	movq	(%esi),%mm0
+	andl	$16777216,%ecx
+	movq	8(%esi),%mm1
+	andl	$512,%edx
+	movq	16(%esi),%mm2
+	orl	%edx,%ecx
+	movq	24(%esi),%mm3
+	movq	32(%esi),%mm4
+	movq	40(%esi),%mm5
+	movq	48(%esi),%mm6
+	movq	56(%esi),%mm7
+	cmpl	$16777728,%ecx
+	je	.L003SSSE3
+	subl	$80,%esp
+	jmp	.L004loop_sse2
+.align	16
+.L004loop_sse2:
+	movq	%mm1,8(%esp)
+	movq	%mm2,16(%esp)
+	movq	%mm3,24(%esp)
+	movq	%mm5,40(%esp)
+	movq	%mm6,48(%esp)
+	pxor	%mm1,%mm2
+	movq	%mm7,56(%esp)
+	movq	%mm0,%mm3
+	movl	(%edi),%eax
+	movl	4(%edi),%ebx
+	addl	$8,%edi
+	movl	$15,%edx
+	bswap	%eax
+	bswap	%ebx
+	jmp	.L00500_14_sse2
+.align	16
+.L00500_14_sse2:
+	movd	%eax,%mm1
+	movl	(%edi),%eax
+	movd	%ebx,%mm7
+	movl	4(%edi),%ebx
+	addl	$8,%edi
+	bswap	%eax
+	bswap	%ebx
+	punpckldq	%mm1,%mm7
+	movq	%mm4,%mm1
+	pxor	%mm6,%mm5
+	psrlq	$14,%mm1
+	movq	%mm4,32(%esp)
+	pand	%mm4,%mm5
+	psllq	$23,%mm4
+	movq	%mm3,%mm0
+	movq	%mm7,72(%esp)
+	movq	%mm1,%mm3
+	psrlq	$4,%mm1
+	pxor	%mm6,%mm5
+	pxor	%mm4,%mm3
+	psllq	$23,%mm4
+	pxor	%mm1,%mm3
+	movq	%mm0,(%esp)
+	paddq	%mm5,%mm7
+	pxor	%mm4,%mm3
+	psrlq	$23,%mm1
+	paddq	56(%esp),%mm7
+	pxor	%mm1,%mm3
+	psllq	$4,%mm4
+	paddq	(%ebp),%mm7
+	pxor	%mm4,%mm3
+	movq	24(%esp),%mm4
+	paddq	%mm7,%mm3
+	movq	%mm0,%mm5
+	psrlq	$28,%mm5
+	paddq	%mm3,%mm4
+	movq	%mm0,%mm6
+	movq	%mm5,%mm7
+	psllq	$25,%mm6
+	movq	8(%esp),%mm1
+	psrlq	$6,%mm5
+	pxor	%mm6,%mm7
+	subl	$8,%esp
+	psllq	$5,%mm6
+	pxor	%mm5,%mm7
+	pxor	%mm1,%mm0
+	psrlq	$5,%mm5
+	pxor	%mm6,%mm7
+	pand	%mm0,%mm2
+	psllq	$6,%mm6
+	pxor	%mm5,%mm7
+	pxor	%mm1,%mm2
+	pxor	%mm7,%mm6
+	movq	40(%esp),%mm5
+	paddq	%mm2,%mm3
+	movq	%mm0,%mm2
+	addl	$8,%ebp
+	paddq	%mm6,%mm3
+	movq	48(%esp),%mm6
+	decl	%edx
+	jnz	.L00500_14_sse2
+	movd	%eax,%mm1
+	movd	%ebx,%mm7
+	punpckldq	%mm1,%mm7
+	movq	%mm4,%mm1
+	pxor	%mm6,%mm5
+	psrlq	$14,%mm1
+	movq	%mm4,32(%esp)
+	pand	%mm4,%mm5
+	psllq	$23,%mm4
+	movq	%mm3,%mm0
+	movq	%mm7,72(%esp)
+	movq	%mm1,%mm3
+	psrlq	$4,%mm1
+	pxor	%mm6,%mm5
+	pxor	%mm4,%mm3
+	psllq	$23,%mm4
+	pxor	%mm1,%mm3
+	movq	%mm0,(%esp)
+	paddq	%mm5,%mm7
+	pxor	%mm4,%mm3
+	psrlq	$23,%mm1
+	paddq	56(%esp),%mm7
+	pxor	%mm1,%mm3
+	psllq	$4,%mm4
+	paddq	(%ebp),%mm7
+	pxor	%mm4,%mm3
+	movq	24(%esp),%mm4
+	paddq	%mm7,%mm3
+	movq	%mm0,%mm5
+	psrlq	$28,%mm5
+	paddq	%mm3,%mm4
+	movq	%mm0,%mm6
+	movq	%mm5,%mm7
+	psllq	$25,%mm6
+	movq	8(%esp),%mm1
+	psrlq	$6,%mm5
+	pxor	%mm6,%mm7
+	subl	$8,%esp
+	psllq	$5,%mm6
+	pxor	%mm5,%mm7
+	pxor	%mm1,%mm0
+	psrlq	$5,%mm5
+	pxor	%mm6,%mm7
+	pand	%mm0,%mm2
+	psllq	$6,%mm6
+	pxor	%mm5,%mm7
+	pxor	%mm1,%mm2
+	pxor	%mm7,%mm6
+	movq	192(%esp),%mm7
+	paddq	%mm2,%mm3
+	movq	%mm0,%mm2
+	addl	$8,%ebp
+	paddq	%mm6,%mm3
+	pxor	%mm0,%mm0
+	movl	$32,%edx
+	jmp	.L00616_79_sse2
+.align	16
+.L00616_79_sse2:
+	movq	88(%esp),%mm5
+	movq	%mm7,%mm1
+	psrlq	$1,%mm7
+	movq	%mm5,%mm6
+	psrlq	$6,%mm5
+	psllq	$56,%mm1
+	paddq	%mm3,%mm0
+	movq	%mm7,%mm3
+	psrlq	$6,%mm7
+	pxor	%mm1,%mm3
+	psllq	$7,%mm1
+	pxor	%mm7,%mm3
+	psrlq	$1,%mm7
+	pxor	%mm1,%mm3
+	movq	%mm5,%mm1
+	psrlq	$13,%mm5
+	pxor	%mm3,%mm7
+	psllq	$3,%mm6
+	pxor	%mm5,%mm1
+	paddq	200(%esp),%mm7
+	pxor	%mm6,%mm1
+	psrlq	$42,%mm5
+	paddq	128(%esp),%mm7
+	pxor	%mm5,%mm1
+	psllq	$42,%mm6
+	movq	40(%esp),%mm5
+	pxor	%mm6,%mm1
+	movq	48(%esp),%mm6
+	paddq	%mm1,%mm7
+	movq	%mm4,%mm1
+	pxor	%mm6,%mm5
+	psrlq	$14,%mm1
+	movq	%mm4,32(%esp)
+	pand	%mm4,%mm5
+	psllq	$23,%mm4
+	movq	%mm7,72(%esp)
+	movq	%mm1,%mm3
+	psrlq	$4,%mm1
+	pxor	%mm6,%mm5
+	pxor	%mm4,%mm3
+	psllq	$23,%mm4
+	pxor	%mm1,%mm3
+	movq	%mm0,(%esp)
+	paddq	%mm5,%mm7
+	pxor	%mm4,%mm3
+	psrlq	$23,%mm1
+	paddq	56(%esp),%mm7
+	pxor	%mm1,%mm3
+	psllq	$4,%mm4
+	paddq	(%ebp),%mm7
+	pxor	%mm4,%mm3
+	movq	24(%esp),%mm4
+	paddq	%mm7,%mm3
+	movq	%mm0,%mm5
+	psrlq	$28,%mm5
+	paddq	%mm3,%mm4
+	movq	%mm0,%mm6
+	movq	%mm5,%mm7
+	psllq	$25,%mm6
+	movq	8(%esp),%mm1
+	psrlq	$6,%mm5
+	pxor	%mm6,%mm7
+	subl	$8,%esp
+	psllq	$5,%mm6
+	pxor	%mm5,%mm7
+	pxor	%mm1,%mm0
+	psrlq	$5,%mm5
+	pxor	%mm6,%mm7
+	pand	%mm0,%mm2
+	psllq	$6,%mm6
+	pxor	%mm5,%mm7
+	pxor	%mm1,%mm2
+	pxor	%mm7,%mm6
+	movq	192(%esp),%mm7
+	paddq	%mm6,%mm2
+	addl	$8,%ebp
+	movq	88(%esp),%mm5
+	movq	%mm7,%mm1
+	psrlq	$1,%mm7
+	movq	%mm5,%mm6
+	psrlq	$6,%mm5
+	psllq	$56,%mm1
+	paddq	%mm3,%mm2
+	movq	%mm7,%mm3
+	psrlq	$6,%mm7
+	pxor	%mm1,%mm3
+	psllq	$7,%mm1
+	pxor	%mm7,%mm3
+	psrlq	$1,%mm7
+	pxor	%mm1,%mm3
+	movq	%mm5,%mm1
+	psrlq	$13,%mm5
+	pxor	%mm3,%mm7
+	psllq	$3,%mm6
+	pxor	%mm5,%mm1
+	paddq	200(%esp),%mm7
+	pxor	%mm6,%mm1
+	psrlq	$42,%mm5
+	paddq	128(%esp),%mm7
+	pxor	%mm5,%mm1
+	psllq	$42,%mm6
+	movq	40(%esp),%mm5
+	pxor	%mm6,%mm1
+	movq	48(%esp),%mm6
+	paddq	%mm1,%mm7
+	movq	%mm4,%mm1
+	pxor	%mm6,%mm5
+	psrlq	$14,%mm1
+	movq	%mm4,32(%esp)
+	pand	%mm4,%mm5
+	psllq	$23,%mm4
+	movq	%mm7,72(%esp)
+	movq	%mm1,%mm3
+	psrlq	$4,%mm1
+	pxor	%mm6,%mm5
+	pxor	%mm4,%mm3
+	psllq	$23,%mm4
+	pxor	%mm1,%mm3
+	movq	%mm2,(%esp)
+	paddq	%mm5,%mm7
+	pxor	%mm4,%mm3
+	psrlq	$23,%mm1
+	paddq	56(%esp),%mm7
+	pxor	%mm1,%mm3
+	psllq	$4,%mm4
+	paddq	(%ebp),%mm7
+	pxor	%mm4,%mm3
+	movq	24(%esp),%mm4
+	paddq	%mm7,%mm3
+	movq	%mm2,%mm5
+	psrlq	$28,%mm5
+	paddq	%mm3,%mm4
+	movq	%mm2,%mm6
+	movq	%mm5,%mm7
+	psllq	$25,%mm6
+	movq	8(%esp),%mm1
+	psrlq	$6,%mm5
+	pxor	%mm6,%mm7
+	subl	$8,%esp
+	psllq	$5,%mm6
+	pxor	%mm5,%mm7
+	pxor	%mm1,%mm2
+	psrlq	$5,%mm5
+	pxor	%mm6,%mm7
+	pand	%mm2,%mm0
+	psllq	$6,%mm6
+	pxor	%mm5,%mm7
+	pxor	%mm1,%mm0
+	pxor	%mm7,%mm6
+	movq	192(%esp),%mm7
+	paddq	%mm6,%mm0
+	addl	$8,%ebp
+	decl	%edx
+	jnz	.L00616_79_sse2
+	paddq	%mm3,%mm0
+	movq	8(%esp),%mm1
+	movq	24(%esp),%mm3
+	movq	40(%esp),%mm5
+	movq	48(%esp),%mm6
+	movq	56(%esp),%mm7
+	pxor	%mm1,%mm2
+	paddq	(%esi),%mm0
+	paddq	8(%esi),%mm1
+	paddq	16(%esi),%mm2
+	paddq	24(%esi),%mm3
+	paddq	32(%esi),%mm4
+	paddq	40(%esi),%mm5
+	paddq	48(%esi),%mm6
+	paddq	56(%esi),%mm7
+	movl	$640,%eax
+	movq	%mm0,(%esi)
+	movq	%mm1,8(%esi)
+	movq	%mm2,16(%esi)
+	movq	%mm3,24(%esi)
+	movq	%mm4,32(%esi)
+	movq	%mm5,40(%esi)
+	movq	%mm6,48(%esi)
+	movq	%mm7,56(%esi)
+	leal	(%esp,%eax,1),%esp
+	subl	%eax,%ebp
+	cmpl	88(%esp),%edi
+	jb	.L004loop_sse2
+	movl	92(%esp),%esp
+	emms
+	popl	%edi
+	popl	%esi
+	popl	%ebx
+	popl	%ebp
+	ret
+.align	32
+.L003SSSE3:
+	leal	-64(%esp),%edx
+	subl	$256,%esp
+	movdqa	640(%ebp),%xmm1
+	movdqu	(%edi),%xmm0
+.byte	102,15,56,0,193
+	movdqa	(%ebp),%xmm3
+	movdqa	%xmm1,%xmm2
+	movdqu	16(%edi),%xmm1
+	paddq	%xmm0,%xmm3
+.byte	102,15,56,0,202
+	movdqa	%xmm3,-128(%edx)
+	movdqa	16(%ebp),%xmm4
+	movdqa	%xmm2,%xmm3
+	movdqu	32(%edi),%xmm2
+	paddq	%xmm1,%xmm4
+.byte	102,15,56,0,211
+	movdqa	%xmm4,-112(%edx)
+	movdqa	32(%ebp),%xmm5
+	movdqa	%xmm3,%xmm4
+	movdqu	48(%edi),%xmm3
+	paddq	%xmm2,%xmm5
+.byte	102,15,56,0,220
+	movdqa	%xmm5,-96(%edx)
+	movdqa	48(%ebp),%xmm6
+	movdqa	%xmm4,%xmm5
+	movdqu	64(%edi),%xmm4
+	paddq	%xmm3,%xmm6
+.byte	102,15,56,0,229
+	movdqa	%xmm6,-80(%edx)
+	movdqa	64(%ebp),%xmm7
+	movdqa	%xmm5,%xmm6
+	movdqu	80(%edi),%xmm5
+	paddq	%xmm4,%xmm7
+.byte	102,15,56,0,238
+	movdqa	%xmm7,-64(%edx)
+	movdqa	%xmm0,(%edx)
+	movdqa	80(%ebp),%xmm0
+	movdqa	%xmm6,%xmm7
+	movdqu	96(%edi),%xmm6
+	paddq	%xmm5,%xmm0
+.byte	102,15,56,0,247
+	movdqa	%xmm0,-48(%edx)
+	movdqa	%xmm1,16(%edx)
+	movdqa	96(%ebp),%xmm1
+	movdqa	%xmm7,%xmm0
+	movdqu	112(%edi),%xmm7
+	paddq	%xmm6,%xmm1
+.byte	102,15,56,0,248
+	movdqa	%xmm1,-32(%edx)
+	movdqa	%xmm2,32(%edx)
+	movdqa	112(%ebp),%xmm2
+	movdqa	(%edx),%xmm0
+	paddq	%xmm7,%xmm2
+	movdqa	%xmm2,-16(%edx)
+	nop
+.align	32
+.L007loop_ssse3:
+	movdqa	16(%edx),%xmm2
+	movdqa	%xmm3,48(%edx)
+	leal	128(%ebp),%ebp
+	movq	%mm1,8(%esp)
+	movl	%edi,%ebx
+	movq	%mm2,16(%esp)
+	leal	128(%edi),%edi
+	movq	%mm3,24(%esp)
+	cmpl	%eax,%edi
+	movq	%mm5,40(%esp)
+	cmovbl	%edi,%ebx
+	movq	%mm6,48(%esp)
+	movl	$4,%ecx
+	pxor	%mm1,%mm2
+	movq	%mm7,56(%esp)
+	pxor	%mm3,%mm3
+	jmp	.L00800_47_ssse3
+.align	32
+.L00800_47_ssse3:
+	movdqa	%xmm5,%xmm3
+	movdqa	%xmm2,%xmm1
+.byte	102,15,58,15,208,8
+	movdqa	%xmm4,(%edx)
+.byte	102,15,58,15,220,8
+	movdqa	%xmm2,%xmm4
+	psrlq	$7,%xmm2
+	paddq	%xmm3,%xmm0
+	movdqa	%xmm4,%xmm3
+	psrlq	$1,%xmm4
+	psllq	$56,%xmm3
+	pxor	%xmm4,%xmm2
+	psrlq	$7,%xmm4
+	pxor	%xmm3,%xmm2
+	psllq	$7,%xmm3
+	pxor	%xmm4,%xmm2
+	movdqa	%xmm7,%xmm4
+	pxor	%xmm3,%xmm2
+	movdqa	%xmm7,%xmm3
+	psrlq	$6,%xmm4
+	paddq	%xmm2,%xmm0
+	movdqa	%xmm7,%xmm2
+	psrlq	$19,%xmm3
+	psllq	$3,%xmm2
+	pxor	%xmm3,%xmm4
+	psrlq	$42,%xmm3
+	pxor	%xmm2,%xmm4
+	psllq	$42,%xmm2
+	pxor	%xmm3,%xmm4
+	movdqa	32(%edx),%xmm3
+	pxor	%xmm2,%xmm4
+	movdqa	(%ebp),%xmm2
+	movq	%mm4,%mm1
+	paddq	%xmm4,%xmm0
+	movq	-128(%edx),%mm7
+	pxor	%mm6,%mm5
+	psrlq	$14,%mm1
+	movq	%mm4,32(%esp)
+	paddq	%xmm0,%xmm2
+	pand	%mm4,%mm5
+	psllq	$23,%mm4
+	paddq	%mm3,%mm0
+	movq	%mm1,%mm3
+	psrlq	$4,%mm1
+	pxor	%mm6,%mm5
+	pxor	%mm4,%mm3
+	psllq	$23,%mm4
+	pxor	%mm1,%mm3
+	movq	%mm0,(%esp)
+	paddq	%mm5,%mm7
+	pxor	%mm4,%mm3
+	psrlq	$23,%mm1
+	paddq	56(%esp),%mm7
+	pxor	%mm1,%mm3
+	psllq	$4,%mm4
+	pxor	%mm4,%mm3
+	movq	24(%esp),%mm4
+	paddq	%mm7,%mm3
+	movq	%mm0,%mm5
+	psrlq	$28,%mm5
+	paddq	%mm3,%mm4
+	movq	%mm0,%mm6
+	movq	%mm5,%mm7
+	psllq	$25,%mm6
+	movq	8(%esp),%mm1
+	psrlq	$6,%mm5
+	pxor	%mm6,%mm7
+	psllq	$5,%mm6
+	pxor	%mm5,%mm7
+	pxor	%mm1,%mm0
+	psrlq	$5,%mm5
+	pxor	%mm6,%mm7
+	pand	%mm0,%mm2
+	psllq	$6,%mm6
+	pxor	%mm5,%mm7
+	pxor	%mm1,%mm2
+	pxor	%mm7,%mm6
+	movq	32(%esp),%mm5
+	paddq	%mm6,%mm2
+	movq	40(%esp),%mm6
+	movq	%mm4,%mm1
+	movq	-120(%edx),%mm7
+	pxor	%mm6,%mm5
+	psrlq	$14,%mm1
+	movq	%mm4,24(%esp)
+	pand	%mm4,%mm5
+	psllq	$23,%mm4
+	paddq	%mm3,%mm2
+	movq	%mm1,%mm3
+	psrlq	$4,%mm1
+	pxor	%mm6,%mm5
+	pxor	%mm4,%mm3
+	psllq	$23,%mm4
+	pxor	%mm1,%mm3
+	movq	%mm2,56(%esp)
+	paddq	%mm5,%mm7
+	pxor	%mm4,%mm3
+	psrlq	$23,%mm1
+	paddq	48(%esp),%mm7
+	pxor	%mm1,%mm3
+	psllq	$4,%mm4
+	pxor	%mm4,%mm3
+	movq	16(%esp),%mm4
+	paddq	%mm7,%mm3
+	movq	%mm2,%mm5
+	psrlq	$28,%mm5
+	paddq	%mm3,%mm4
+	movq	%mm2,%mm6
+	movq	%mm5,%mm7
+	psllq	$25,%mm6
+	movq	(%esp),%mm1
+	psrlq	$6,%mm5
+	pxor	%mm6,%mm7
+	psllq	$5,%mm6
+	pxor	%mm5,%mm7
+	pxor	%mm1,%mm2
+	psrlq	$5,%mm5
+	pxor	%mm6,%mm7
+	pand	%mm2,%mm0
+	psllq	$6,%mm6
+	pxor	%mm5,%mm7
+	pxor	%mm1,%mm0
+	pxor	%mm7,%mm6
+	movq	24(%esp),%mm5
+	paddq	%mm6,%mm0
+	movq	32(%esp),%mm6
+	movdqa	%xmm2,-128(%edx)
+	movdqa	%xmm6,%xmm4
+	movdqa	%xmm3,%xmm2
+.byte	102,15,58,15,217,8
+	movdqa	%xmm5,16(%edx)
+.byte	102,15,58,15,229,8
+	movdqa	%xmm3,%xmm5
+	psrlq	$7,%xmm3
+	paddq	%xmm4,%xmm1
+	movdqa	%xmm5,%xmm4
+	psrlq	$1,%xmm5
+	psllq	$56,%xmm4
+	pxor	%xmm5,%xmm3
+	psrlq	$7,%xmm5
+	pxor	%xmm4,%xmm3
+	psllq	$7,%xmm4
+	pxor	%xmm5,%xmm3
+	movdqa	%xmm0,%xmm5
+	pxor	%xmm4,%xmm3
+	movdqa	%xmm0,%xmm4
+	psrlq	$6,%xmm5
+	paddq	%xmm3,%xmm1
+	movdqa	%xmm0,%xmm3
+	psrlq	$19,%xmm4
+	psllq	$3,%xmm3
+	pxor	%xmm4,%xmm5
+	psrlq	$42,%xmm4
+	pxor	%xmm3,%xmm5
+	psllq	$42,%xmm3
+	pxor	%xmm4,%xmm5
+	movdqa	48(%edx),%xmm4
+	pxor	%xmm3,%xmm5
+	movdqa	16(%ebp),%xmm3
+	movq	%mm4,%mm1
+	paddq	%xmm5,%xmm1
+	movq	-112(%edx),%mm7
+	pxor	%mm6,%mm5
+	psrlq	$14,%mm1
+	movq	%mm4,16(%esp)
+	paddq	%xmm1,%xmm3
+	pand	%mm4,%mm5
+	psllq	$23,%mm4
+	paddq	%mm3,%mm0
+	movq	%mm1,%mm3
+	psrlq	$4,%mm1
+	pxor	%mm6,%mm5
+	pxor	%mm4,%mm3
+	psllq	$23,%mm4
+	pxor	%mm1,%mm3
+	movq	%mm0,48(%esp)
+	paddq	%mm5,%mm7
+	pxor	%mm4,%mm3
+	psrlq	$23,%mm1
+	paddq	40(%esp),%mm7
+	pxor	%mm1,%mm3
+	psllq	$4,%mm4
+	pxor	%mm4,%mm3
+	movq	8(%esp),%mm4
+	paddq	%mm7,%mm3
+	movq	%mm0,%mm5
+	psrlq	$28,%mm5
+	paddq	%mm3,%mm4
+	movq	%mm0,%mm6
+	movq	%mm5,%mm7
+	psllq	$25,%mm6
+	movq	56(%esp),%mm1
+	psrlq	$6,%mm5
+	pxor	%mm6,%mm7
+	psllq	$5,%mm6
+	pxor	%mm5,%mm7
+	pxor	%mm1,%mm0
+	psrlq	$5,%mm5
+	pxor	%mm6,%mm7
+	pand	%mm0,%mm2
+	psllq	$6,%mm6
+	pxor	%mm5,%mm7
+	pxor	%mm1,%mm2
+	pxor	%mm7,%mm6
+	movq	16(%esp),%mm5
+	paddq	%mm6,%mm2
+	movq	24(%esp),%mm6
+	movq	%mm4,%mm1
+	movq	-104(%edx),%mm7
+	pxor	%mm6,%mm5
+	psrlq	$14,%mm1
+	movq	%mm4,8(%esp)
+	pand	%mm4,%mm5
+	psllq	$23,%mm4
+	paddq	%mm3,%mm2
+	movq	%mm1,%mm3
+	psrlq	$4,%mm1
+	pxor	%mm6,%mm5
+	pxor	%mm4,%mm3
+	psllq	$23,%mm4
+	pxor	%mm1,%mm3
+	movq	%mm2,40(%esp)
+	paddq	%mm5,%mm7
+	pxor	%mm4,%mm3
+	psrlq	$23,%mm1
+	paddq	32(%esp),%mm7
+	pxor	%mm1,%mm3
+	psllq	$4,%mm4
+	pxor	%mm4,%mm3
+	movq	(%esp),%mm4
+	paddq	%mm7,%mm3
+	movq	%mm2,%mm5
+	psrlq	$28,%mm5
+	paddq	%mm3,%mm4
+	movq	%mm2,%mm6
+	movq	%mm5,%mm7
+	psllq	$25,%mm6
+	movq	48(%esp),%mm1
+	psrlq	$6,%mm5
+	pxor	%mm6,%mm7
+	psllq	$5,%mm6
+	pxor	%mm5,%mm7
+	pxor	%mm1,%mm2
+	psrlq	$5,%mm5
+	pxor	%mm6,%mm7
+	pand	%mm2,%mm0
+	psllq	$6,%mm6
+	pxor	%mm5,%mm7
+	pxor	%mm1,%mm0
+	pxor	%mm7,%mm6
+	movq	8(%esp),%mm5
+	paddq	%mm6,%mm0
+	movq	16(%esp),%mm6
+	movdqa	%xmm3,-112(%edx)
+	movdqa	%xmm7,%xmm5
+	movdqa	%xmm4,%xmm3
+.byte	102,15,58,15,226,8
+	movdqa	%xmm6,32(%edx)
+.byte	102,15,58,15,238,8
+	movdqa	%xmm4,%xmm6
+	psrlq	$7,%xmm4
+	paddq	%xmm5,%xmm2
+	movdqa	%xmm6,%xmm5
+	psrlq	$1,%xmm6
+	psllq	$56,%xmm5
+	pxor	%xmm6,%xmm4
+	psrlq	$7,%xmm6
+	pxor	%xmm5,%xmm4
+	psllq	$7,%xmm5
+	pxor	%xmm6,%xmm4
+	movdqa	%xmm1,%xmm6
+	pxor	%xmm5,%xmm4
+	movdqa	%xmm1,%xmm5
+	psrlq	$6,%xmm6
+	paddq	%xmm4,%xmm2
+	movdqa	%xmm1,%xmm4
+	psrlq	$19,%xmm5
+	psllq	$3,%xmm4
+	pxor	%xmm5,%xmm6
+	psrlq	$42,%xmm5
+	pxor	%xmm4,%xmm6
+	psllq	$42,%xmm4
+	pxor	%xmm5,%xmm6
+	movdqa	(%edx),%xmm5
+	pxor	%xmm4,%xmm6
+	movdqa	32(%ebp),%xmm4
+	movq	%mm4,%mm1
+	paddq	%xmm6,%xmm2
+	movq	-96(%edx),%mm7
+	pxor	%mm6,%mm5
+	psrlq	$14,%mm1
+	movq	%mm4,(%esp)
+	paddq	%xmm2,%xmm4
+	pand	%mm4,%mm5
+	psllq	$23,%mm4
+	paddq	%mm3,%mm0
+	movq	%mm1,%mm3
+	psrlq	$4,%mm1
+	pxor	%mm6,%mm5
+	pxor	%mm4,%mm3
+	psllq	$23,%mm4
+	pxor	%mm1,%mm3
+	movq	%mm0,32(%esp)
+	paddq	%mm5,%mm7
+	pxor	%mm4,%mm3
+	psrlq	$23,%mm1
+	paddq	24(%esp),%mm7
+	pxor	%mm1,%mm3
+	psllq	$4,%mm4
+	pxor	%mm4,%mm3
+	movq	56(%esp),%mm4
+	paddq	%mm7,%mm3
+	movq	%mm0,%mm5
+	psrlq	$28,%mm5
+	paddq	%mm3,%mm4
+	movq	%mm0,%mm6
+	movq	%mm5,%mm7
+	psllq	$25,%mm6
+	movq	40(%esp),%mm1
+	psrlq	$6,%mm5
+	pxor	%mm6,%mm7
+	psllq	$5,%mm6
+	pxor	%mm5,%mm7
+	pxor	%mm1,%mm0
+	psrlq	$5,%mm5
+	pxor	%mm6,%mm7
+	pand	%mm0,%mm2
+	psllq	$6,%mm6
+	pxor	%mm5,%mm7
+	pxor	%mm1,%mm2
+	pxor	%mm7,%mm6
+	movq	(%esp),%mm5
+	paddq	%mm6,%mm2
+	movq	8(%esp),%mm6
+	movq	%mm4,%mm1
+	movq	-88(%edx),%mm7
+	pxor	%mm6,%mm5
+	psrlq	$14,%mm1
+	movq	%mm4,56(%esp)
+	pand	%mm4,%mm5
+	psllq	$23,%mm4
+	paddq	%mm3,%mm2
+	movq	%mm1,%mm3
+	psrlq	$4,%mm1
+	pxor	%mm6,%mm5
+	pxor	%mm4,%mm3
+	psllq	$23,%mm4
+	pxor	%mm1,%mm3
+	movq	%mm2,24(%esp)
+	paddq	%mm5,%mm7
+	pxor	%mm4,%mm3
+	psrlq	$23,%mm1
+	paddq	16(%esp),%mm7
+	pxor	%mm1,%mm3
+	psllq	$4,%mm4
+	pxor	%mm4,%mm3
+	movq	48(%esp),%mm4
+	paddq	%mm7,%mm3
+	movq	%mm2,%mm5
+	psrlq	$28,%mm5
+	paddq	%mm3,%mm4
+	movq	%mm2,%mm6
+	movq	%mm5,%mm7
+	psllq	$25,%mm6
+	movq	32(%esp),%mm1
+	psrlq	$6,%mm5
+	pxor	%mm6,%mm7
+	psllq	$5,%mm6
+	pxor	%mm5,%mm7
+	pxor	%mm1,%mm2
+	psrlq	$5,%mm5
+	pxor	%mm6,%mm7
+	pand	%mm2,%mm0
+	psllq	$6,%mm6
+	pxor	%mm5,%mm7
+	pxor	%mm1,%mm0
+	pxor	%mm7,%mm6
+	movq	56(%esp),%mm5
+	paddq	%mm6,%mm0
+	movq	(%esp),%mm6
+	movdqa	%xmm4,-96(%edx)
+	movdqa	%xmm0,%xmm6
+	movdqa	%xmm5,%xmm4
+.byte	102,15,58,15,235,8
+	movdqa	%xmm7,48(%edx)
+.byte	102,15,58,15,247,8
+	movdqa	%xmm5,%xmm7
+	psrlq	$7,%xmm5
+	paddq	%xmm6,%xmm3
+	movdqa	%xmm7,%xmm6
+	psrlq	$1,%xmm7
+	psllq	$56,%xmm6
+	pxor	%xmm7,%xmm5
+	psrlq	$7,%xmm7
+	pxor	%xmm6,%xmm5
+	psllq	$7,%xmm6
+	pxor	%xmm7,%xmm5
+	movdqa	%xmm2,%xmm7
+	pxor	%xmm6,%xmm5
+	movdqa	%xmm2,%xmm6
+	psrlq	$6,%xmm7
+	paddq	%xmm5,%xmm3
+	movdqa	%xmm2,%xmm5
+	psrlq	$19,%xmm6
+	psllq	$3,%xmm5
+	pxor	%xmm6,%xmm7
+	psrlq	$42,%xmm6
+	pxor	%xmm5,%xmm7
+	psllq	$42,%xmm5
+	pxor	%xmm6,%xmm7
+	movdqa	16(%edx),%xmm6
+	pxor	%xmm5,%xmm7
+	movdqa	48(%ebp),%xmm5
+	movq	%mm4,%mm1
+	paddq	%xmm7,%xmm3
+	movq	-80(%edx),%mm7
+	pxor	%mm6,%mm5
+	psrlq	$14,%mm1
+	movq	%mm4,48(%esp)
+	paddq	%xmm3,%xmm5
+	pand	%mm4,%mm5
+	psllq	$23,%mm4
+	paddq	%mm3,%mm0
+	movq	%mm1,%mm3
+	psrlq	$4,%mm1
+	pxor	%mm6,%mm5
+	pxor	%mm4,%mm3
+	psllq	$23,%mm4
+	pxor	%mm1,%mm3
+	movq	%mm0,16(%esp)
+	paddq	%mm5,%mm7
+	pxor	%mm4,%mm3
+	psrlq	$23,%mm1
+	paddq	8(%esp),%mm7
+	pxor	%mm1,%mm3
+	psllq	$4,%mm4
+	pxor	%mm4,%mm3
+	movq	40(%esp),%mm4
+	paddq	%mm7,%mm3
+	movq	%mm0,%mm5
+	psrlq	$28,%mm5
+	paddq	%mm3,%mm4
+	movq	%mm0,%mm6
+	movq	%mm5,%mm7
+	psllq	$25,%mm6
+	movq	24(%esp),%mm1
+	psrlq	$6,%mm5
+	pxor	%mm6,%mm7
+	psllq	$5,%mm6
+	pxor	%mm5,%mm7
+	pxor	%mm1,%mm0
+	psrlq	$5,%mm5
+	pxor	%mm6,%mm7
+	pand	%mm0,%mm2
+	psllq	$6,%mm6
+	pxor	%mm5,%mm7
+	pxor	%mm1,%mm2
+	pxor	%mm7,%mm6
+	movq	48(%esp),%mm5
+	paddq	%mm6,%mm2
+	movq	56(%esp),%mm6
+	movq	%mm4,%mm1
+	movq	-72(%edx),%mm7
+	pxor	%mm6,%mm5
+	psrlq	$14,%mm1
+	movq	%mm4,40(%esp)
+	pand	%mm4,%mm5
+	psllq	$23,%mm4
+	paddq	%mm3,%mm2
+	movq	%mm1,%mm3
+	psrlq	$4,%mm1
+	pxor	%mm6,%mm5
+	pxor	%mm4,%mm3
+	psllq	$23,%mm4
+	pxor	%mm1,%mm3
+	movq	%mm2,8(%esp)
+	paddq	%mm5,%mm7
+	pxor	%mm4,%mm3
+	psrlq	$23,%mm1
+	paddq	(%esp),%mm7
+	pxor	%mm1,%mm3
+	psllq	$4,%mm4
+	pxor	%mm4,%mm3
+	movq	32(%esp),%mm4
+	paddq	%mm7,%mm3
+	movq	%mm2,%mm5
+	psrlq	$28,%mm5
+	paddq	%mm3,%mm4
+	movq	%mm2,%mm6
+	movq	%mm5,%mm7
+	psllq	$25,%mm6
+	movq	16(%esp),%mm1
+	psrlq	$6,%mm5
+	pxor	%mm6,%mm7
+	psllq	$5,%mm6
+	pxor	%mm5,%mm7
+	pxor	%mm1,%mm2
+	psrlq	$5,%mm5
+	pxor	%mm6,%mm7
+	pand	%mm2,%mm0
+	psllq	$6,%mm6
+	pxor	%mm5,%mm7
+	pxor	%mm1,%mm0
+	pxor	%mm7,%mm6
+	movq	40(%esp),%mm5
+	paddq	%mm6,%mm0
+	movq	48(%esp),%mm6
+	movdqa	%xmm5,-80(%edx)
+	movdqa	%xmm1,%xmm7
+	movdqa	%xmm6,%xmm5
+.byte	102,15,58,15,244,8
+	movdqa	%xmm0,(%edx)
+.byte	102,15,58,15,248,8
+	movdqa	%xmm6,%xmm0
+	psrlq	$7,%xmm6
+	paddq	%xmm7,%xmm4
+	movdqa	%xmm0,%xmm7
+	psrlq	$1,%xmm0
+	psllq	$56,%xmm7
+	pxor	%xmm0,%xmm6
+	psrlq	$7,%xmm0
+	pxor	%xmm7,%xmm6
+	psllq	$7,%xmm7
+	pxor	%xmm0,%xmm6
+	movdqa	%xmm3,%xmm0
+	pxor	%xmm7,%xmm6
+	movdqa	%xmm3,%xmm7
+	psrlq	$6,%xmm0
+	paddq	%xmm6,%xmm4
+	movdqa	%xmm3,%xmm6
+	psrlq	$19,%xmm7
+	psllq	$3,%xmm6
+	pxor	%xmm7,%xmm0
+	psrlq	$42,%xmm7
+	pxor	%xmm6,%xmm0
+	psllq	$42,%xmm6
+	pxor	%xmm7,%xmm0
+	movdqa	32(%edx),%xmm7
+	pxor	%xmm6,%xmm0
+	movdqa	64(%ebp),%xmm6
+	movq	%mm4,%mm1
+	paddq	%xmm0,%xmm4
+	movq	-64(%edx),%mm7
+	pxor	%mm6,%mm5
+	psrlq	$14,%mm1
+	movq	%mm4,32(%esp)
+	paddq	%xmm4,%xmm6
+	pand	%mm4,%mm5
+	psllq	$23,%mm4
+	paddq	%mm3,%mm0
+	movq	%mm1,%mm3
+	psrlq	$4,%mm1
+	pxor	%mm6,%mm5
+	pxor	%mm4,%mm3
+	psllq	$23,%mm4
+	pxor	%mm1,%mm3
+	movq	%mm0,(%esp)
+	paddq	%mm5,%mm7
+	pxor	%mm4,%mm3
+	psrlq	$23,%mm1
+	paddq	56(%esp),%mm7
+	pxor	%mm1,%mm3
+	psllq	$4,%mm4
+	pxor	%mm4,%mm3
+	movq	24(%esp),%mm4
+	paddq	%mm7,%mm3
+	movq	%mm0,%mm5
+	psrlq	$28,%mm5
+	paddq	%mm3,%mm4
+	movq	%mm0,%mm6
+	movq	%mm5,%mm7
+	psllq	$25,%mm6
+	movq	8(%esp),%mm1
+	psrlq	$6,%mm5
+	pxor	%mm6,%mm7
+	psllq	$5,%mm6
+	pxor	%mm5,%mm7
+	pxor	%mm1,%mm0
+	psrlq	$5,%mm5
+	pxor	%mm6,%mm7
+	pand	%mm0,%mm2
+	psllq	$6,%mm6
+	pxor	%mm5,%mm7
+	pxor	%mm1,%mm2
+	pxor	%mm7,%mm6
+	movq	32(%esp),%mm5
+	paddq	%mm6,%mm2
+	movq	40(%esp),%mm6
+	movq	%mm4,%mm1
+	movq	-56(%edx),%mm7
+	pxor	%mm6,%mm5
+	psrlq	$14,%mm1
+	movq	%mm4,24(%esp)
+	pand	%mm4,%mm5
+	psllq	$23,%mm4
+	paddq	%mm3,%mm2
+	movq	%mm1,%mm3
+	psrlq	$4,%mm1
+	pxor	%mm6,%mm5
+	pxor	%mm4,%mm3
+	psllq	$23,%mm4
+	pxor	%mm1,%mm3
+	movq	%mm2,56(%esp)
+	paddq	%mm5,%mm7
+	pxor	%mm4,%mm3
+	psrlq	$23,%mm1
+	paddq	48(%esp),%mm7
+	pxor	%mm1,%mm3
+	psllq	$4,%mm4
+	pxor	%mm4,%mm3
+	movq	16(%esp),%mm4
+	paddq	%mm7,%mm3
+	movq	%mm2,%mm5
+	psrlq	$28,%mm5
+	paddq	%mm3,%mm4
+	movq	%mm2,%mm6
+	movq	%mm5,%mm7
+	psllq	$25,%mm6
+	movq	(%esp),%mm1
+	psrlq	$6,%mm5
+	pxor	%mm6,%mm7
+	psllq	$5,%mm6
+	pxor	%mm5,%mm7
+	pxor	%mm1,%mm2
+	psrlq	$5,%mm5
+	pxor	%mm6,%mm7
+	pand	%mm2,%mm0
+	psllq	$6,%mm6
+	pxor	%mm5,%mm7
+	pxor	%mm1,%mm0
+	pxor	%mm7,%mm6
+	movq	24(%esp),%mm5
+	paddq	%mm6,%mm0
+	movq	32(%esp),%mm6
+	movdqa	%xmm6,-64(%edx)
+	movdqa	%xmm2,%xmm0
+	movdqa	%xmm7,%xmm6
+.byte	102,15,58,15,253,8
+	movdqa	%xmm1,16(%edx)
+.byte	102,15,58,15,193,8
+	movdqa	%xmm7,%xmm1
+	psrlq	$7,%xmm7
+	paddq	%xmm0,%xmm5
+	movdqa	%xmm1,%xmm0
+	psrlq	$1,%xmm1
+	psllq	$56,%xmm0
+	pxor	%xmm1,%xmm7
+	psrlq	$7,%xmm1
+	pxor	%xmm0,%xmm7
+	psllq	$7,%xmm0
+	pxor	%xmm1,%xmm7
+	movdqa	%xmm4,%xmm1
+	pxor	%xmm0,%xmm7
+	movdqa	%xmm4,%xmm0
+	psrlq	$6,%xmm1
+	paddq	%xmm7,%xmm5
+	movdqa	%xmm4,%xmm7
+	psrlq	$19,%xmm0
+	psllq	$3,%xmm7
+	pxor	%xmm0,%xmm1
+	psrlq	$42,%xmm0
+	pxor	%xmm7,%xmm1
+	psllq	$42,%xmm7
+	pxor	%xmm0,%xmm1
+	movdqa	48(%edx),%xmm0
+	pxor	%xmm7,%xmm1
+	movdqa	80(%ebp),%xmm7
+	movq	%mm4,%mm1
+	paddq	%xmm1,%xmm5
+	movq	-48(%edx),%mm7
+	pxor	%mm6,%mm5
+	psrlq	$14,%mm1
+	movq	%mm4,16(%esp)
+	paddq	%xmm5,%xmm7
+	pand	%mm4,%mm5
+	psllq	$23,%mm4
+	paddq	%mm3,%mm0
+	movq	%mm1,%mm3
+	psrlq	$4,%mm1
+	pxor	%mm6,%mm5
+	pxor	%mm4,%mm3
+	psllq	$23,%mm4
+	pxor	%mm1,%mm3
+	movq	%mm0,48(%esp)
+	paddq	%mm5,%mm7
+	pxor	%mm4,%mm3
+	psrlq	$23,%mm1
+	paddq	40(%esp),%mm7
+	pxor	%mm1,%mm3
+	psllq	$4,%mm4
+	pxor	%mm4,%mm3
+	movq	8(%esp),%mm4
+	paddq	%mm7,%mm3
+	movq	%mm0,%mm5
+	psrlq	$28,%mm5
+	paddq	%mm3,%mm4
+	movq	%mm0,%mm6
+	movq	%mm5,%mm7
+	psllq	$25,%mm6
+	movq	56(%esp),%mm1
+	psrlq	$6,%mm5
+	pxor	%mm6,%mm7
+	psllq	$5,%mm6
+	pxor	%mm5,%mm7
+	pxor	%mm1,%mm0
+	psrlq	$5,%mm5
+	pxor	%mm6,%mm7
+	pand	%mm0,%mm2
+	psllq	$6,%mm6
+	pxor	%mm5,%mm7
+	pxor	%mm1,%mm2
+	pxor	%mm7,%mm6
+	movq	16(%esp),%mm5
+	paddq	%mm6,%mm2
+	movq	24(%esp),%mm6
+	movq	%mm4,%mm1
+	movq	-40(%edx),%mm7
+	pxor	%mm6,%mm5
+	psrlq	$14,%mm1
+	movq	%mm4,8(%esp)
+	pand	%mm4,%mm5
+	psllq	$23,%mm4
+	paddq	%mm3,%mm2
+	movq	%mm1,%mm3
+	psrlq	$4,%mm1
+	pxor	%mm6,%mm5
+	pxor	%mm4,%mm3
+	psllq	$23,%mm4
+	pxor	%mm1,%mm3
+	movq	%mm2,40(%esp)
+	paddq	%mm5,%mm7
+	pxor	%mm4,%mm3
+	psrlq	$23,%mm1
+	paddq	32(%esp),%mm7
+	pxor	%mm1,%mm3
+	psllq	$4,%mm4
+	pxor	%mm4,%mm3
+	movq	(%esp),%mm4
+	paddq	%mm7,%mm3
+	movq	%mm2,%mm5
+	psrlq	$28,%mm5
+	paddq	%mm3,%mm4
+	movq	%mm2,%mm6
+	movq	%mm5,%mm7
+	psllq	$25,%mm6
+	movq	48(%esp),%mm1
+	psrlq	$6,%mm5
+	pxor	%mm6,%mm7
+	psllq	$5,%mm6
+	pxor	%mm5,%mm7
+	pxor	%mm1,%mm2
+	psrlq	$5,%mm5
+	pxor	%mm6,%mm7
+	pand	%mm2,%mm0
+	psllq	$6,%mm6
+	pxor	%mm5,%mm7
+	pxor	%mm1,%mm0
+	pxor	%mm7,%mm6
+	movq	8(%esp),%mm5
+	paddq	%mm6,%mm0
+	movq	16(%esp),%mm6
+	movdqa	%xmm7,-48(%edx)
+	movdqa	%xmm3,%xmm1
+	movdqa	%xmm0,%xmm7
+.byte	102,15,58,15,198,8
+	movdqa	%xmm2,32(%edx)
+.byte	102,15,58,15,202,8
+	movdqa	%xmm0,%xmm2
+	psrlq	$7,%xmm0
+	paddq	%xmm1,%xmm6
+	movdqa	%xmm2,%xmm1
+	psrlq	$1,%xmm2
+	psllq	$56,%xmm1
+	pxor	%xmm2,%xmm0
+	psrlq	$7,%xmm2
+	pxor	%xmm1,%xmm0
+	psllq	$7,%xmm1
+	pxor	%xmm2,%xmm0
+	movdqa	%xmm5,%xmm2
+	pxor	%xmm1,%xmm0
+	movdqa	%xmm5,%xmm1
+	psrlq	$6,%xmm2
+	paddq	%xmm0,%xmm6
+	movdqa	%xmm5,%xmm0
+	psrlq	$19,%xmm1
+	psllq	$3,%xmm0
+	pxor	%xmm1,%xmm2
+	psrlq	$42,%xmm1
+	pxor	%xmm0,%xmm2
+	psllq	$42,%xmm0
+	pxor	%xmm1,%xmm2
+	movdqa	(%edx),%xmm1
+	pxor	%xmm0,%xmm2
+	movdqa	96(%ebp),%xmm0
+	movq	%mm4,%mm1
+	paddq	%xmm2,%xmm6
+	movq	-32(%edx),%mm7
+	pxor	%mm6,%mm5
+	psrlq	$14,%mm1
+	movq	%mm4,(%esp)
+	paddq	%xmm6,%xmm0
+	pand	%mm4,%mm5
+	psllq	$23,%mm4
+	paddq	%mm3,%mm0
+	movq	%mm1,%mm3
+	psrlq	$4,%mm1
+	pxor	%mm6,%mm5
+	pxor	%mm4,%mm3
+	psllq	$23,%mm4
+	pxor	%mm1,%mm3
+	movq	%mm0,32(%esp)
+	paddq	%mm5,%mm7
+	pxor	%mm4,%mm3
+	psrlq	$23,%mm1
+	paddq	24(%esp),%mm7
+	pxor	%mm1,%mm3
+	psllq	$4,%mm4
+	pxor	%mm4,%mm3
+	movq	56(%esp),%mm4
+	paddq	%mm7,%mm3
+	movq	%mm0,%mm5
+	psrlq	$28,%mm5
+	paddq	%mm3,%mm4
+	movq	%mm0,%mm6
+	movq	%mm5,%mm7
+	psllq	$25,%mm6
+	movq	40(%esp),%mm1
+	psrlq	$6,%mm5
+	pxor	%mm6,%mm7
+	psllq	$5,%mm6
+	pxor	%mm5,%mm7
+	pxor	%mm1,%mm0
+	psrlq	$5,%mm5
+	pxor	%mm6,%mm7
+	pand	%mm0,%mm2
+	psllq	$6,%mm6
+	pxor	%mm5,%mm7
+	pxor	%mm1,%mm2
+	pxor	%mm7,%mm6
+	movq	(%esp),%mm5
+	paddq	%mm6,%mm2
+	movq	8(%esp),%mm6
+	movq	%mm4,%mm1
+	movq	-24(%edx),%mm7
+	pxor	%mm6,%mm5
+	psrlq	$14,%mm1
+	movq	%mm4,56(%esp)
+	pand	%mm4,%mm5
+	psllq	$23,%mm4
+	paddq	%mm3,%mm2
+	movq	%mm1,%mm3
+	psrlq	$4,%mm1
+	pxor	%mm6,%mm5
+	pxor	%mm4,%mm3
+	psllq	$23,%mm4
+	pxor	%mm1,%mm3
+	movq	%mm2,24(%esp)
+	paddq	%mm5,%mm7
+	pxor	%mm4,%mm3
+	psrlq	$23,%mm1
+	paddq	16(%esp),%mm7
+	pxor	%mm1,%mm3
+	psllq	$4,%mm4
+	pxor	%mm4,%mm3
+	movq	48(%esp),%mm4
+	paddq	%mm7,%mm3
+	movq	%mm2,%mm5
+	psrlq	$28,%mm5
+	paddq	%mm3,%mm4
+	movq	%mm2,%mm6
+	movq	%mm5,%mm7
+	psllq	$25,%mm6
+	movq	32(%esp),%mm1
+	psrlq	$6,%mm5
+	pxor	%mm6,%mm7
+	psllq	$5,%mm6
+	pxor	%mm5,%mm7
+	pxor	%mm1,%mm2
+	psrlq	$5,%mm5
+	pxor	%mm6,%mm7
+	pand	%mm2,%mm0
+	psllq	$6,%mm6
+	pxor	%mm5,%mm7
+	pxor	%mm1,%mm0
+	pxor	%mm7,%mm6
+	movq	56(%esp),%mm5
+	paddq	%mm6,%mm0
+	movq	(%esp),%mm6
+	movdqa	%xmm0,-32(%edx)
+	movdqa	%xmm4,%xmm2
+	movdqa	%xmm1,%xmm0
+.byte	102,15,58,15,207,8
+	movdqa	%xmm3,48(%edx)
+.byte	102,15,58,15,211,8
+	movdqa	%xmm1,%xmm3
+	psrlq	$7,%xmm1
+	paddq	%xmm2,%xmm7
+	movdqa	%xmm3,%xmm2
+	psrlq	$1,%xmm3
+	psllq	$56,%xmm2
+	pxor	%xmm3,%xmm1
+	psrlq	$7,%xmm3
+	pxor	%xmm2,%xmm1
+	psllq	$7,%xmm2
+	pxor	%xmm3,%xmm1
+	movdqa	%xmm6,%xmm3
+	pxor	%xmm2,%xmm1
+	movdqa	%xmm6,%xmm2
+	psrlq	$6,%xmm3
+	paddq	%xmm1,%xmm7
+	movdqa	%xmm6,%xmm1
+	psrlq	$19,%xmm2
+	psllq	$3,%xmm1
+	pxor	%xmm2,%xmm3
+	psrlq	$42,%xmm2
+	pxor	%xmm1,%xmm3
+	psllq	$42,%xmm1
+	pxor	%xmm2,%xmm3
+	movdqa	16(%edx),%xmm2
+	pxor	%xmm1,%xmm3
+	movdqa	112(%ebp),%xmm1
+	movq	%mm4,%mm1
+	paddq	%xmm3,%xmm7
+	movq	-16(%edx),%mm7
+	pxor	%mm6,%mm5
+	psrlq	$14,%mm1
+	movq	%mm4,48(%esp)
+	paddq	%xmm7,%xmm1
+	pand	%mm4,%mm5
+	psllq	$23,%mm4
+	paddq	%mm3,%mm0
+	movq	%mm1,%mm3
+	psrlq	$4,%mm1
+	pxor	%mm6,%mm5
+	pxor	%mm4,%mm3
+	psllq	$23,%mm4
+	pxor	%mm1,%mm3
+	movq	%mm0,16(%esp)
+	paddq	%mm5,%mm7
+	pxor	%mm4,%mm3
+	psrlq	$23,%mm1
+	paddq	8(%esp),%mm7
+	pxor	%mm1,%mm3
+	psllq	$4,%mm4
+	pxor	%mm4,%mm3
+	movq	40(%esp),%mm4
+	paddq	%mm7,%mm3
+	movq	%mm0,%mm5
+	psrlq	$28,%mm5
+	paddq	%mm3,%mm4
+	movq	%mm0,%mm6
+	movq	%mm5,%mm7
+	psllq	$25,%mm6
+	movq	24(%esp),%mm1
+	psrlq	$6,%mm5
+	pxor	%mm6,%mm7
+	psllq	$5,%mm6
+	pxor	%mm5,%mm7
+	pxor	%mm1,%mm0
+	psrlq	$5,%mm5
+	pxor	%mm6,%mm7
+	pand	%mm0,%mm2
+	psllq	$6,%mm6
+	pxor	%mm5,%mm7
+	pxor	%mm1,%mm2
+	pxor	%mm7,%mm6
+	movq	48(%esp),%mm5
+	paddq	%mm6,%mm2
+	movq	56(%esp),%mm6
+	movq	%mm4,%mm1
+	movq	-8(%edx),%mm7
+	pxor	%mm6,%mm5
+	psrlq	$14,%mm1
+	movq	%mm4,40(%esp)
+	pand	%mm4,%mm5
+	psllq	$23,%mm4
+	paddq	%mm3,%mm2
+	movq	%mm1,%mm3
+	psrlq	$4,%mm1
+	pxor	%mm6,%mm5
+	pxor	%mm4,%mm3
+	psllq	$23,%mm4
+	pxor	%mm1,%mm3
+	movq	%mm2,8(%esp)
+	paddq	%mm5,%mm7
+	pxor	%mm4,%mm3
+	psrlq	$23,%mm1
+	paddq	(%esp),%mm7
+	pxor	%mm1,%mm3
+	psllq	$4,%mm4
+	pxor	%mm4,%mm3
+	movq	32(%esp),%mm4
+	paddq	%mm7,%mm3
+	movq	%mm2,%mm5
+	psrlq	$28,%mm5
+	paddq	%mm3,%mm4
+	movq	%mm2,%mm6
+	movq	%mm5,%mm7
+	psllq	$25,%mm6
+	movq	16(%esp),%mm1
+	psrlq	$6,%mm5
+	pxor	%mm6,%mm7
+	psllq	$5,%mm6
+	pxor	%mm5,%mm7
+	pxor	%mm1,%mm2
+	psrlq	$5,%mm5
+	pxor	%mm6,%mm7
+	pand	%mm2,%mm0
+	psllq	$6,%mm6
+	pxor	%mm5,%mm7
+	pxor	%mm1,%mm0
+	pxor	%mm7,%mm6
+	movq	40(%esp),%mm5
+	paddq	%mm6,%mm0
+	movq	48(%esp),%mm6
+	movdqa	%xmm1,-16(%edx)
+	leal	128(%ebp),%ebp
+	decl	%ecx
+	jnz	.L00800_47_ssse3
+	movdqa	(%ebp),%xmm1
+	leal	-640(%ebp),%ebp
+	movdqu	(%ebx),%xmm0
+.byte	102,15,56,0,193
+	movdqa	(%ebp),%xmm3
+	movdqa	%xmm1,%xmm2
+	movdqu	16(%ebx),%xmm1
+	paddq	%xmm0,%xmm3
+.byte	102,15,56,0,202
+	movq	%mm4,%mm1
+	movq	-128(%edx),%mm7
+	pxor	%mm6,%mm5
+	psrlq	$14,%mm1
+	movq	%mm4,32(%esp)
+	pand	%mm4,%mm5
+	psllq	$23,%mm4
+	paddq	%mm3,%mm0
+	movq	%mm1,%mm3
+	psrlq	$4,%mm1
+	pxor	%mm6,%mm5
+	pxor	%mm4,%mm3
+	psllq	$23,%mm4
+	pxor	%mm1,%mm3
+	movq	%mm0,(%esp)
+	paddq	%mm5,%mm7
+	pxor	%mm4,%mm3
+	psrlq	$23,%mm1
+	paddq	56(%esp),%mm7
+	pxor	%mm1,%mm3
+	psllq	$4,%mm4
+	pxor	%mm4,%mm3
+	movq	24(%esp),%mm4
+	paddq	%mm7,%mm3
+	movq	%mm0,%mm5
+	psrlq	$28,%mm5
+	paddq	%mm3,%mm4
+	movq	%mm0,%mm6
+	movq	%mm5,%mm7
+	psllq	$25,%mm6
+	movq	8(%esp),%mm1
+	psrlq	$6,%mm5
+	pxor	%mm6,%mm7
+	psllq	$5,%mm6
+	pxor	%mm5,%mm7
+	pxor	%mm1,%mm0
+	psrlq	$5,%mm5
+	pxor	%mm6,%mm7
+	pand	%mm0,%mm2
+	psllq	$6,%mm6
+	pxor	%mm5,%mm7
+	pxor	%mm1,%mm2
+	pxor	%mm7,%mm6
+	movq	32(%esp),%mm5
+	paddq	%mm6,%mm2
+	movq	40(%esp),%mm6
+	movq	%mm4,%mm1
+	movq	-120(%edx),%mm7
+	pxor	%mm6,%mm5
+	psrlq	$14,%mm1
+	movq	%mm4,24(%esp)
+	pand	%mm4,%mm5
+	psllq	$23,%mm4
+	paddq	%mm3,%mm2
+	movq	%mm1,%mm3
+	psrlq	$4,%mm1
+	pxor	%mm6,%mm5
+	pxor	%mm4,%mm3
+	psllq	$23,%mm4
+	pxor	%mm1,%mm3
+	movq	%mm2,56(%esp)
+	paddq	%mm5,%mm7
+	pxor	%mm4,%mm3
+	psrlq	$23,%mm1
+	paddq	48(%esp),%mm7
+	pxor	%mm1,%mm3
+	psllq	$4,%mm4
+	pxor	%mm4,%mm3
+	movq	16(%esp),%mm4
+	paddq	%mm7,%mm3
+	movq	%mm2,%mm5
+	psrlq	$28,%mm5
+	paddq	%mm3,%mm4
+	movq	%mm2,%mm6
+	movq	%mm5,%mm7
+	psllq	$25,%mm6
+	movq	(%esp),%mm1
+	psrlq	$6,%mm5
+	pxor	%mm6,%mm7
+	psllq	$5,%mm6
+	pxor	%mm5,%mm7
+	pxor	%mm1,%mm2
+	psrlq	$5,%mm5
+	pxor	%mm6,%mm7
+	pand	%mm2,%mm0
+	psllq	$6,%mm6
+	pxor	%mm5,%mm7
+	pxor	%mm1,%mm0
+	pxor	%mm7,%mm6
+	movq	24(%esp),%mm5
+	paddq	%mm6,%mm0
+	movq	32(%esp),%mm6
+	movdqa	%xmm3,-128(%edx)
+	movdqa	16(%ebp),%xmm4
+	movdqa	%xmm2,%xmm3
+	movdqu	32(%ebx),%xmm2
+	paddq	%xmm1,%xmm4
+.byte	102,15,56,0,211
+	movq	%mm4,%mm1
+	movq	-112(%edx),%mm7
+	pxor	%mm6,%mm5
+	psrlq	$14,%mm1
+	movq	%mm4,16(%esp)
+	pand	%mm4,%mm5
+	psllq	$23,%mm4
+	paddq	%mm3,%mm0
+	movq	%mm1,%mm3
+	psrlq	$4,%mm1
+	pxor	%mm6,%mm5
+	pxor	%mm4,%mm3
+	psllq	$23,%mm4
+	pxor	%mm1,%mm3
+	movq	%mm0,48(%esp)
+	paddq	%mm5,%mm7
+	pxor	%mm4,%mm3
+	psrlq	$23,%mm1
+	paddq	40(%esp),%mm7
+	pxor	%mm1,%mm3
+	psllq	$4,%mm4
+	pxor	%mm4,%mm3
+	movq	8(%esp),%mm4
+	paddq	%mm7,%mm3
+	movq	%mm0,%mm5
+	psrlq	$28,%mm5
+	paddq	%mm3,%mm4
+	movq	%mm0,%mm6
+	movq	%mm5,%mm7
+	psllq	$25,%mm6
+	movq	56(%esp),%mm1
+	psrlq	$6,%mm5
+	pxor	%mm6,%mm7
+	psllq	$5,%mm6
+	pxor	%mm5,%mm7
+	pxor	%mm1,%mm0
+	psrlq	$5,%mm5
+	pxor	%mm6,%mm7
+	pand	%mm0,%mm2
+	psllq	$6,%mm6
+	pxor	%mm5,%mm7
+	pxor	%mm1,%mm2
+	pxor	%mm7,%mm6
+	movq	16(%esp),%mm5
+	paddq	%mm6,%mm2
+	movq	24(%esp),%mm6
+	movq	%mm4,%mm1
+	movq	-104(%edx),%mm7
+	pxor	%mm6,%mm5
+	psrlq	$14,%mm1
+	movq	%mm4,8(%esp)
+	pand	%mm4,%mm5
+	psllq	$23,%mm4
+	paddq	%mm3,%mm2
+	movq	%mm1,%mm3
+	psrlq	$4,%mm1
+	pxor	%mm6,%mm5
+	pxor	%mm4,%mm3
+	psllq	$23,%mm4
+	pxor	%mm1,%mm3
+	movq	%mm2,40(%esp)
+	paddq	%mm5,%mm7
+	pxor	%mm4,%mm3
+	psrlq	$23,%mm1
+	paddq	32(%esp),%mm7
+	pxor	%mm1,%mm3
+	psllq	$4,%mm4
+	pxor	%mm4,%mm3
+	movq	(%esp),%mm4
+	paddq	%mm7,%mm3
+	movq	%mm2,%mm5
+	psrlq	$28,%mm5
+	paddq	%mm3,%mm4
+	movq	%mm2,%mm6
+	movq	%mm5,%mm7
+	psllq	$25,%mm6
+	movq	48(%esp),%mm1
+	psrlq	$6,%mm5
+	pxor	%mm6,%mm7
+	psllq	$5,%mm6
+	pxor	%mm5,%mm7
+	pxor	%mm1,%mm2
+	psrlq	$5,%mm5
+	pxor	%mm6,%mm7
+	pand	%mm2,%mm0
+	psllq	$6,%mm6
+	pxor	%mm5,%mm7
+	pxor	%mm1,%mm0
+	pxor	%mm7,%mm6
+	movq	8(%esp),%mm5
+	paddq	%mm6,%mm0
+	movq	16(%esp),%mm6
+	movdqa	%xmm4,-112(%edx)
+	movdqa	32(%ebp),%xmm5
+	movdqa	%xmm3,%xmm4
+	movdqu	48(%ebx),%xmm3
+	paddq	%xmm2,%xmm5
+.byte	102,15,56,0,220
+	movq	%mm4,%mm1
+	movq	-96(%edx),%mm7
+	pxor	%mm6,%mm5
+	psrlq	$14,%mm1
+	movq	%mm4,(%esp)
+	pand	%mm4,%mm5
+	psllq	$23,%mm4
+	paddq	%mm3,%mm0
+	movq	%mm1,%mm3
+	psrlq	$4,%mm1
+	pxor	%mm6,%mm5
+	pxor	%mm4,%mm3
+	psllq	$23,%mm4
+	pxor	%mm1,%mm3
+	movq	%mm0,32(%esp)
+	paddq	%mm5,%mm7
+	pxor	%mm4,%mm3
+	psrlq	$23,%mm1
+	paddq	24(%esp),%mm7
+	pxor	%mm1,%mm3
+	psllq	$4,%mm4
+	pxor	%mm4,%mm3
+	movq	56(%esp),%mm4
+	paddq	%mm7,%mm3
+	movq	%mm0,%mm5
+	psrlq	$28,%mm5
+	paddq	%mm3,%mm4
+	movq	%mm0,%mm6
+	movq	%mm5,%mm7
+	psllq	$25,%mm6
+	movq	40(%esp),%mm1
+	psrlq	$6,%mm5
+	pxor	%mm6,%mm7
+	psllq	$5,%mm6
+	pxor	%mm5,%mm7
+	pxor	%mm1,%mm0
+	psrlq	$5,%mm5
+	pxor	%mm6,%mm7
+	pand	%mm0,%mm2
+	psllq	$6,%mm6
+	pxor	%mm5,%mm7
+	pxor	%mm1,%mm2
+	pxor	%mm7,%mm6
+	movq	(%esp),%mm5
+	paddq	%mm6,%mm2
+	movq	8(%esp),%mm6
+	movq	%mm4,%mm1
+	movq	-88(%edx),%mm7
+	pxor	%mm6,%mm5
+	psrlq	$14,%mm1
+	movq	%mm4,56(%esp)
+	pand	%mm4,%mm5
+	psllq	$23,%mm4
+	paddq	%mm3,%mm2
+	movq	%mm1,%mm3
+	psrlq	$4,%mm1
+	pxor	%mm6,%mm5
+	pxor	%mm4,%mm3
+	psllq	$23,%mm4
+	pxor	%mm1,%mm3
+	movq	%mm2,24(%esp)
+	paddq	%mm5,%mm7
+	pxor	%mm4,%mm3
+	psrlq	$23,%mm1
+	paddq	16(%esp),%mm7
+	pxor	%mm1,%mm3
+	psllq	$4,%mm4
+	pxor	%mm4,%mm3
+	movq	48(%esp),%mm4
+	paddq	%mm7,%mm3
+	movq	%mm2,%mm5
+	psrlq	$28,%mm5
+	paddq	%mm3,%mm4
+	movq	%mm2,%mm6
+	movq	%mm5,%mm7
+	psllq	$25,%mm6
+	movq	32(%esp),%mm1
+	psrlq	$6,%mm5
+	pxor	%mm6,%mm7
+	psllq	$5,%mm6
+	pxor	%mm5,%mm7
+	pxor	%mm1,%mm2
+	psrlq	$5,%mm5
+	pxor	%mm6,%mm7
+	pand	%mm2,%mm0
+	psllq	$6,%mm6
+	pxor	%mm5,%mm7
+	pxor	%mm1,%mm0
+	pxor	%mm7,%mm6
+	movq	56(%esp),%mm5
+	paddq	%mm6,%mm0
+	movq	(%esp),%mm6
+	movdqa	%xmm5,-96(%edx)
+	movdqa	48(%ebp),%xmm6
+	movdqa	%xmm4,%xmm5
+	movdqu	64(%ebx),%xmm4
+	paddq	%xmm3,%xmm6
+.byte	102,15,56,0,229
+	movq	%mm4,%mm1
+	movq	-80(%edx),%mm7
+	pxor	%mm6,%mm5
+	psrlq	$14,%mm1
+	movq	%mm4,48(%esp)
+	pand	%mm4,%mm5
+	psllq	$23,%mm4
+	paddq	%mm3,%mm0
+	movq	%mm1,%mm3
+	psrlq	$4,%mm1
+	pxor	%mm6,%mm5
+	pxor	%mm4,%mm3
+	psllq	$23,%mm4
+	pxor	%mm1,%mm3
+	movq	%mm0,16(%esp)
+	paddq	%mm5,%mm7
+	pxor	%mm4,%mm3
+	psrlq	$23,%mm1
+	paddq	8(%esp),%mm7
+	pxor	%mm1,%mm3
+	psllq	$4,%mm4
+	pxor	%mm4,%mm3
+	movq	40(%esp),%mm4
+	paddq	%mm7,%mm3
+	movq	%mm0,%mm5
+	psrlq	$28,%mm5
+	paddq	%mm3,%mm4
+	movq	%mm0,%mm6
+	movq	%mm5,%mm7
+	psllq	$25,%mm6
+	movq	24(%esp),%mm1
+	psrlq	$6,%mm5
+	pxor	%mm6,%mm7
+	psllq	$5,%mm6
+	pxor	%mm5,%mm7
+	pxor	%mm1,%mm0
+	psrlq	$5,%mm5
+	pxor	%mm6,%mm7
+	pand	%mm0,%mm2
+	psllq	$6,%mm6
+	pxor	%mm5,%mm7
+	pxor	%mm1,%mm2
+	pxor	%mm7,%mm6
+	movq	48(%esp),%mm5
+	paddq	%mm6,%mm2
+	movq	56(%esp),%mm6
+	movq	%mm4,%mm1
+	movq	-72(%edx),%mm7
+	pxor	%mm6,%mm5
+	psrlq	$14,%mm1
+	movq	%mm4,40(%esp)
+	pand	%mm4,%mm5
+	psllq	$23,%mm4
+	paddq	%mm3,%mm2
+	movq	%mm1,%mm3
+	psrlq	$4,%mm1
+	pxor	%mm6,%mm5
+	pxor	%mm4,%mm3
+	psllq	$23,%mm4
+	pxor	%mm1,%mm3
+	movq	%mm2,8(%esp)
+	paddq	%mm5,%mm7
+	pxor	%mm4,%mm3
+	psrlq	$23,%mm1
+	paddq	(%esp),%mm7
+	pxor	%mm1,%mm3
+	psllq	$4,%mm4
+	pxor	%mm4,%mm3
+	movq	32(%esp),%mm4
+	paddq	%mm7,%mm3
+	movq	%mm2,%mm5
+	psrlq	$28,%mm5
+	paddq	%mm3,%mm4
+	movq	%mm2,%mm6
+	movq	%mm5,%mm7
+	psllq	$25,%mm6
+	movq	16(%esp),%mm1
+	psrlq	$6,%mm5
+	pxor	%mm6,%mm7
+	psllq	$5,%mm6
+	pxor	%mm5,%mm7
+	pxor	%mm1,%mm2
+	psrlq	$5,%mm5
+	pxor	%mm6,%mm7
+	pand	%mm2,%mm0
+	psllq	$6,%mm6
+	pxor	%mm5,%mm7
+	pxor	%mm1,%mm0
+	pxor	%mm7,%mm6
+	movq	40(%esp),%mm5
+	paddq	%mm6,%mm0
+	movq	48(%esp),%mm6
+	movdqa	%xmm6,-80(%edx)
+	movdqa	64(%ebp),%xmm7
+	movdqa	%xmm5,%xmm6
+	movdqu	80(%ebx),%xmm5
+	paddq	%xmm4,%xmm7
+.byte	102,15,56,0,238
+	movq	%mm4,%mm1
+	movq	-64(%edx),%mm7
+	pxor	%mm6,%mm5
+	psrlq	$14,%mm1
+	movq	%mm4,32(%esp)
+	pand	%mm4,%mm5
+	psllq	$23,%mm4
+	paddq	%mm3,%mm0
+	movq	%mm1,%mm3
+	psrlq	$4,%mm1
+	pxor	%mm6,%mm5
+	pxor	%mm4,%mm3
+	psllq	$23,%mm4
+	pxor	%mm1,%mm3
+	movq	%mm0,(%esp)
+	paddq	%mm5,%mm7
+	pxor	%mm4,%mm3
+	psrlq	$23,%mm1
+	paddq	56(%esp),%mm7
+	pxor	%mm1,%mm3
+	psllq	$4,%mm4
+	pxor	%mm4,%mm3
+	movq	24(%esp),%mm4
+	paddq	%mm7,%mm3
+	movq	%mm0,%mm5
+	psrlq	$28,%mm5
+	paddq	%mm3,%mm4
+	movq	%mm0,%mm6
+	movq	%mm5,%mm7
+	psllq	$25,%mm6
+	movq	8(%esp),%mm1
+	psrlq	$6,%mm5
+	pxor	%mm6,%mm7
+	psllq	$5,%mm6
+	pxor	%mm5,%mm7
+	pxor	%mm1,%mm0
+	psrlq	$5,%mm5
+	pxor	%mm6,%mm7
+	pand	%mm0,%mm2
+	psllq	$6,%mm6
+	pxor	%mm5,%mm7
+	pxor	%mm1,%mm2
+	pxor	%mm7,%mm6
+	movq	32(%esp),%mm5
+	paddq	%mm6,%mm2
+	movq	40(%esp),%mm6
+	movq	%mm4,%mm1
+	movq	-56(%edx),%mm7
+	pxor	%mm6,%mm5
+	psrlq	$14,%mm1
+	movq	%mm4,24(%esp)
+	pand	%mm4,%mm5
+	psllq	$23,%mm4
+	paddq	%mm3,%mm2
+	movq	%mm1,%mm3
+	psrlq	$4,%mm1
+	pxor	%mm6,%mm5
+	pxor	%mm4,%mm3
+	psllq	$23,%mm4
+	pxor	%mm1,%mm3
+	movq	%mm2,56(%esp)
+	paddq	%mm5,%mm7
+	pxor	%mm4,%mm3
+	psrlq	$23,%mm1
+	paddq	48(%esp),%mm7
+	pxor	%mm1,%mm3
+	psllq	$4,%mm4
+	pxor	%mm4,%mm3
+	movq	16(%esp),%mm4
+	paddq	%mm7,%mm3
+	movq	%mm2,%mm5
+	psrlq	$28,%mm5
+	paddq	%mm3,%mm4
+	movq	%mm2,%mm6
+	movq	%mm5,%mm7
+	psllq	$25,%mm6
+	movq	(%esp),%mm1
+	psrlq	$6,%mm5
+	pxor	%mm6,%mm7
+	psllq	$5,%mm6
+	pxor	%mm5,%mm7
+	pxor	%mm1,%mm2
+	psrlq	$5,%mm5
+	pxor	%mm6,%mm7
+	pand	%mm2,%mm0
+	psllq	$6,%mm6
+	pxor	%mm5,%mm7
+	pxor	%mm1,%mm0
+	pxor	%mm7,%mm6
+	movq	24(%esp),%mm5
+	paddq	%mm6,%mm0
+	movq	32(%esp),%mm6
+	movdqa	%xmm7,-64(%edx)
+	movdqa	%xmm0,(%edx)
+	movdqa	80(%ebp),%xmm0
+	movdqa	%xmm6,%xmm7
+	movdqu	96(%ebx),%xmm6
+	paddq	%xmm5,%xmm0
+.byte	102,15,56,0,247
+	movq	%mm4,%mm1
+	movq	-48(%edx),%mm7
+	pxor	%mm6,%mm5
+	psrlq	$14,%mm1
+	movq	%mm4,16(%esp)
+	pand	%mm4,%mm5
+	psllq	$23,%mm4
+	paddq	%mm3,%mm0
+	movq	%mm1,%mm3
+	psrlq	$4,%mm1
+	pxor	%mm6,%mm5
+	pxor	%mm4,%mm3
+	psllq	$23,%mm4
+	pxor	%mm1,%mm3
+	movq	%mm0,48(%esp)
+	paddq	%mm5,%mm7
+	pxor	%mm4,%mm3
+	psrlq	$23,%mm1
+	paddq	40(%esp),%mm7
+	pxor	%mm1,%mm3
+	psllq	$4,%mm4
+	pxor	%mm4,%mm3
+	movq	8(%esp),%mm4
+	paddq	%mm7,%mm3
+	movq	%mm0,%mm5
+	psrlq	$28,%mm5
+	paddq	%mm3,%mm4
+	movq	%mm0,%mm6
+	movq	%mm5,%mm7
+	psllq	$25,%mm6
+	movq	56(%esp),%mm1
+	psrlq	$6,%mm5
+	pxor	%mm6,%mm7
+	psllq	$5,%mm6
+	pxor	%mm5,%mm7
+	pxor	%mm1,%mm0
+	psrlq	$5,%mm5
+	pxor	%mm6,%mm7
+	pand	%mm0,%mm2
+	psllq	$6,%mm6
+	pxor	%mm5,%mm7
+	pxor	%mm1,%mm2
+	pxor	%mm7,%mm6
+	movq	16(%esp),%mm5
+	paddq	%mm6,%mm2
+	movq	24(%esp),%mm6
+	movq	%mm4,%mm1
+	movq	-40(%edx),%mm7
+	pxor	%mm6,%mm5
+	psrlq	$14,%mm1
+	movq	%mm4,8(%esp)
+	pand	%mm4,%mm5
+	psllq	$23,%mm4
+	paddq	%mm3,%mm2
+	movq	%mm1,%mm3
+	psrlq	$4,%mm1
+	pxor	%mm6,%mm5
+	pxor	%mm4,%mm3
+	psllq	$23,%mm4
+	pxor	%mm1,%mm3
+	movq	%mm2,40(%esp)
+	paddq	%mm5,%mm7
+	pxor	%mm4,%mm3
+	psrlq	$23,%mm1
+	paddq	32(%esp),%mm7
+	pxor	%mm1,%mm3
+	psllq	$4,%mm4
+	pxor	%mm4,%mm3
+	movq	(%esp),%mm4
+	paddq	%mm7,%mm3
+	movq	%mm2,%mm5
+	psrlq	$28,%mm5
+	paddq	%mm3,%mm4
+	movq	%mm2,%mm6
+	movq	%mm5,%mm7
+	psllq	$25,%mm6
+	movq	48(%esp),%mm1
+	psrlq	$6,%mm5
+	pxor	%mm6,%mm7
+	psllq	$5,%mm6
+	pxor	%mm5,%mm7
+	pxor	%mm1,%mm2
+	psrlq	$5,%mm5
+	pxor	%mm6,%mm7
+	pand	%mm2,%mm0
+	psllq	$6,%mm6
+	pxor	%mm5,%mm7
+	pxor	%mm1,%mm0
+	pxor	%mm7,%mm6
+	movq	8(%esp),%mm5
+	paddq	%mm6,%mm0
+	movq	16(%esp),%mm6
+	movdqa	%xmm0,-48(%edx)
+	movdqa	%xmm1,16(%edx)
+	movdqa	96(%ebp),%xmm1
+	movdqa	%xmm7,%xmm0
+	movdqu	112(%ebx),%xmm7
+	paddq	%xmm6,%xmm1
+.byte	102,15,56,0,248
+	movq	%mm4,%mm1
+	movq	-32(%edx),%mm7
+	pxor	%mm6,%mm5
+	psrlq	$14,%mm1
+	movq	%mm4,(%esp)
+	pand	%mm4,%mm5
+	psllq	$23,%mm4
+	paddq	%mm3,%mm0
+	movq	%mm1,%mm3
+	psrlq	$4,%mm1
+	pxor	%mm6,%mm5
+	pxor	%mm4,%mm3
+	psllq	$23,%mm4
+	pxor	%mm1,%mm3
+	movq	%mm0,32(%esp)
+	paddq	%mm5,%mm7
+	pxor	%mm4,%mm3
+	psrlq	$23,%mm1
+	paddq	24(%esp),%mm7
+	pxor	%mm1,%mm3
+	psllq	$4,%mm4
+	pxor	%mm4,%mm3
+	movq	56(%esp),%mm4
+	paddq	%mm7,%mm3
+	movq	%mm0,%mm5
+	psrlq	$28,%mm5
+	paddq	%mm3,%mm4
+	movq	%mm0,%mm6
+	movq	%mm5,%mm7
+	psllq	$25,%mm6
+	movq	40(%esp),%mm1
+	psrlq	$6,%mm5
+	pxor	%mm6,%mm7
+	psllq	$5,%mm6
+	pxor	%mm5,%mm7
+	pxor	%mm1,%mm0
+	psrlq	$5,%mm5
+	pxor	%mm6,%mm7
+	pand	%mm0,%mm2
+	psllq	$6,%mm6
+	pxor	%mm5,%mm7
+	pxor	%mm1,%mm2
+	pxor	%mm7,%mm6
+	movq	(%esp),%mm5
+	paddq	%mm6,%mm2
+	movq	8(%esp),%mm6
+	movq	%mm4,%mm1
+	movq	-24(%edx),%mm7
+	pxor	%mm6,%mm5
+	psrlq	$14,%mm1
+	movq	%mm4,56(%esp)
+	pand	%mm4,%mm5
+	psllq	$23,%mm4
+	paddq	%mm3,%mm2
+	movq	%mm1,%mm3
+	psrlq	$4,%mm1
+	pxor	%mm6,%mm5
+	pxor	%mm4,%mm3
+	psllq	$23,%mm4
+	pxor	%mm1,%mm3
+	movq	%mm2,24(%esp)
+	paddq	%mm5,%mm7
+	pxor	%mm4,%mm3
+	psrlq	$23,%mm1
+	paddq	16(%esp),%mm7
+	pxor	%mm1,%mm3
+	psllq	$4,%mm4
+	pxor	%mm4,%mm3
+	movq	48(%esp),%mm4
+	paddq	%mm7,%mm3
+	movq	%mm2,%mm5
+	psrlq	$28,%mm5
+	paddq	%mm3,%mm4
+	movq	%mm2,%mm6
+	movq	%mm5,%mm7
+	psllq	$25,%mm6
+	movq	32(%esp),%mm1
+	psrlq	$6,%mm5
+	pxor	%mm6,%mm7
+	psllq	$5,%mm6
+	pxor	%mm5,%mm7
+	pxor	%mm1,%mm2
+	psrlq	$5,%mm5
+	pxor	%mm6,%mm7
+	pand	%mm2,%mm0
+	psllq	$6,%mm6
+	pxor	%mm5,%mm7
+	pxor	%mm1,%mm0
+	pxor	%mm7,%mm6
+	movq	56(%esp),%mm5
+	paddq	%mm6,%mm0
+	movq	(%esp),%mm6
+	movdqa	%xmm1,-32(%edx)
+	movdqa	%xmm2,32(%edx)
+	movdqa	112(%ebp),%xmm2
+	movdqa	(%edx),%xmm0
+	paddq	%xmm7,%xmm2
+	movq	%mm4,%mm1
+	movq	-16(%edx),%mm7
+	pxor	%mm6,%mm5
+	psrlq	$14,%mm1
+	movq	%mm4,48(%esp)
+	pand	%mm4,%mm5
+	psllq	$23,%mm4
+	paddq	%mm3,%mm0
+	movq	%mm1,%mm3
+	psrlq	$4,%mm1
+	pxor	%mm6,%mm5
+	pxor	%mm4,%mm3
+	psllq	$23,%mm4
+	pxor	%mm1,%mm3
+	movq	%mm0,16(%esp)
+	paddq	%mm5,%mm7
+	pxor	%mm4,%mm3
+	psrlq	$23,%mm1
+	paddq	8(%esp),%mm7
+	pxor	%mm1,%mm3
+	psllq	$4,%mm4
+	pxor	%mm4,%mm3
+	movq	40(%esp),%mm4
+	paddq	%mm7,%mm3
+	movq	%mm0,%mm5
+	psrlq	$28,%mm5
+	paddq	%mm3,%mm4
+	movq	%mm0,%mm6
+	movq	%mm5,%mm7
+	psllq	$25,%mm6
+	movq	24(%esp),%mm1
+	psrlq	$6,%mm5
+	pxor	%mm6,%mm7
+	psllq	$5,%mm6
+	pxor	%mm5,%mm7
+	pxor	%mm1,%mm0
+	psrlq	$5,%mm5
+	pxor	%mm6,%mm7
+	pand	%mm0,%mm2
+	psllq	$6,%mm6
+	pxor	%mm5,%mm7
+	pxor	%mm1,%mm2
+	pxor	%mm7,%mm6
+	movq	48(%esp),%mm5
+	paddq	%mm6,%mm2
+	movq	56(%esp),%mm6
+	movq	%mm4,%mm1
+	movq	-8(%edx),%mm7
+	pxor	%mm6,%mm5
+	psrlq	$14,%mm1
+	movq	%mm4,40(%esp)
+	pand	%mm4,%mm5
+	psllq	$23,%mm4
+	paddq	%mm3,%mm2
+	movq	%mm1,%mm3
+	psrlq	$4,%mm1
+	pxor	%mm6,%mm5
+	pxor	%mm4,%mm3
+	psllq	$23,%mm4
+	pxor	%mm1,%mm3
+	movq	%mm2,8(%esp)
+	paddq	%mm5,%mm7
+	pxor	%mm4,%mm3
+	psrlq	$23,%mm1
+	paddq	(%esp),%mm7
+	pxor	%mm1,%mm3
+	psllq	$4,%mm4
+	pxor	%mm4,%mm3
+	movq	32(%esp),%mm4
+	paddq	%mm7,%mm3
+	movq	%mm2,%mm5
+	psrlq	$28,%mm5
+	paddq	%mm3,%mm4
+	movq	%mm2,%mm6
+	movq	%mm5,%mm7
+	psllq	$25,%mm6
+	movq	16(%esp),%mm1
+	psrlq	$6,%mm5
+	pxor	%mm6,%mm7
+	psllq	$5,%mm6
+	pxor	%mm5,%mm7
+	pxor	%mm1,%mm2
+	psrlq	$5,%mm5
+	pxor	%mm6,%mm7
+	pand	%mm2,%mm0
+	psllq	$6,%mm6
+	pxor	%mm5,%mm7
+	pxor	%mm1,%mm0
+	pxor	%mm7,%mm6
+	movq	40(%esp),%mm5
+	paddq	%mm6,%mm0
+	movq	48(%esp),%mm6
+	movdqa	%xmm2,-16(%edx)
+	movq	8(%esp),%mm1
+	paddq	%mm3,%mm0
+	movq	24(%esp),%mm3
+	movq	56(%esp),%mm7
+	pxor	%mm1,%mm2
+	paddq	(%esi),%mm0
+	paddq	8(%esi),%mm1
+	paddq	16(%esi),%mm2
+	paddq	24(%esi),%mm3
+	paddq	32(%esi),%mm4
+	paddq	40(%esi),%mm5
+	paddq	48(%esi),%mm6
+	paddq	56(%esi),%mm7
+	movq	%mm0,(%esi)
+	movq	%mm1,8(%esi)
+	movq	%mm2,16(%esi)
+	movq	%mm3,24(%esi)
+	movq	%mm4,32(%esi)
+	movq	%mm5,40(%esi)
+	movq	%mm6,48(%esi)
+	movq	%mm7,56(%esi)
+	cmpl	%eax,%edi
+	jb	.L007loop_ssse3
+	movl	76(%edx),%esp
+	emms
+	popl	%edi
+	popl	%esi
+	popl	%ebx
+	popl	%ebp
+	ret
+.align	16
+.L002loop_x86:
+	movl	(%edi),%eax
+	movl	4(%edi),%ebx
+	movl	8(%edi),%ecx
+	movl	12(%edi),%edx
+	bswap	%eax
+	bswap	%ebx
+	bswap	%ecx
+	bswap	%edx
+	pushl	%eax
+	pushl	%ebx
+	pushl	%ecx
+	pushl	%edx
+	movl	16(%edi),%eax
+	movl	20(%edi),%ebx
+	movl	24(%edi),%ecx
+	movl	28(%edi),%edx
+	bswap	%eax
+	bswap	%ebx
+	bswap	%ecx
+	bswap	%edx
+	pushl	%eax
+	pushl	%ebx
+	pushl	%ecx
+	pushl	%edx
+	movl	32(%edi),%eax
+	movl	36(%edi),%ebx
+	movl	40(%edi),%ecx
+	movl	44(%edi),%edx
+	bswap	%eax
+	bswap	%ebx
+	bswap	%ecx
+	bswap	%edx
+	pushl	%eax
+	pushl	%ebx
+	pushl	%ecx
+	pushl	%edx
+	movl	48(%edi),%eax
+	movl	52(%edi),%ebx
+	movl	56(%edi),%ecx
+	movl	60(%edi),%edx
+	bswap	%eax
+	bswap	%ebx
+	bswap	%ecx
+	bswap	%edx
+	pushl	%eax
+	pushl	%ebx
+	pushl	%ecx
+	pushl	%edx
+	movl	64(%edi),%eax
+	movl	68(%edi),%ebx
+	movl	72(%edi),%ecx
+	movl	76(%edi),%edx
+	bswap	%eax
+	bswap	%ebx
+	bswap	%ecx
+	bswap	%edx
+	pushl	%eax
+	pushl	%ebx
+	pushl	%ecx
+	pushl	%edx
+	movl	80(%edi),%eax
+	movl	84(%edi),%ebx
+	movl	88(%edi),%ecx
+	movl	92(%edi),%edx
+	bswap	%eax
+	bswap	%ebx
+	bswap	%ecx
+	bswap	%edx
+	pushl	%eax
+	pushl	%ebx
+	pushl	%ecx
+	pushl	%edx
+	movl	96(%edi),%eax
+	movl	100(%edi),%ebx
+	movl	104(%edi),%ecx
+	movl	108(%edi),%edx
+	bswap	%eax
+	bswap	%ebx
+	bswap	%ecx
+	bswap	%edx
+	pushl	%eax
+	pushl	%ebx
+	pushl	%ecx
+	pushl	%edx
+	movl	112(%edi),%eax
+	movl	116(%edi),%ebx
+	movl	120(%edi),%ecx
+	movl	124(%edi),%edx
+	bswap	%eax
+	bswap	%ebx
+	bswap	%ecx
+	bswap	%edx
+	pushl	%eax
+	pushl	%ebx
+	pushl	%ecx
+	pushl	%edx
+	addl	$128,%edi
+	subl	$72,%esp
+	movl	%edi,204(%esp)
+	leal	8(%esp),%edi
+	movl	$16,%ecx
+.long	2784229001
+.align	16
+.L00900_15_x86:
+	movl	40(%esp),%ecx
+	movl	44(%esp),%edx
+	movl	%ecx,%esi
+	shrl	$9,%ecx
+	movl	%edx,%edi
+	shrl	$9,%edx
+	movl	%ecx,%ebx
+	shll	$14,%esi
+	movl	%edx,%eax
+	shll	$14,%edi
+	xorl	%esi,%ebx
+	shrl	$5,%ecx
+	xorl	%edi,%eax
+	shrl	$5,%edx
+	xorl	%ecx,%eax
+	shll	$4,%esi
+	xorl	%edx,%ebx
+	shll	$4,%edi
+	xorl	%esi,%ebx
+	shrl	$4,%ecx
+	xorl	%edi,%eax
+	shrl	$4,%edx
+	xorl	%ecx,%eax
+	shll	$5,%esi
+	xorl	%edx,%ebx
+	shll	$5,%edi
+	xorl	%esi,%eax
+	xorl	%edi,%ebx
+	movl	48(%esp),%ecx
+	movl	52(%esp),%edx
+	movl	56(%esp),%esi
+	movl	60(%esp),%edi
+	addl	64(%esp),%eax
+	adcl	68(%esp),%ebx
+	xorl	%esi,%ecx
+	xorl	%edi,%edx
+	andl	40(%esp),%ecx
+	andl	44(%esp),%edx
+	addl	192(%esp),%eax
+	adcl	196(%esp),%ebx
+	xorl	%esi,%ecx
+	xorl	%edi,%edx
+	movl	(%ebp),%esi
+	movl	4(%ebp),%edi
+	addl	%ecx,%eax
+	adcl	%edx,%ebx
+	movl	32(%esp),%ecx
+	movl	36(%esp),%edx
+	addl	%esi,%eax
+	adcl	%edi,%ebx
+	movl	%eax,(%esp)
+	movl	%ebx,4(%esp)
+	addl	%ecx,%eax
+	adcl	%edx,%ebx
+	movl	8(%esp),%ecx
+	movl	12(%esp),%edx
+	movl	%eax,32(%esp)
+	movl	%ebx,36(%esp)
+	movl	%ecx,%esi
+	shrl	$2,%ecx
+	movl	%edx,%edi
+	shrl	$2,%edx
+	movl	%ecx,%ebx
+	shll	$4,%esi
+	movl	%edx,%eax
+	shll	$4,%edi
+	xorl	%esi,%ebx
+	shrl	$5,%ecx
+	xorl	%edi,%eax
+	shrl	$5,%edx
+	xorl	%ecx,%ebx
+	shll	$21,%esi
+	xorl	%edx,%eax
+	shll	$21,%edi
+	xorl	%esi,%eax
+	shrl	$21,%ecx
+	xorl	%edi,%ebx
+	shrl	$21,%edx
+	xorl	%ecx,%eax
+	shll	$5,%esi
+	xorl	%edx,%ebx
+	shll	$5,%edi
+	xorl	%esi,%eax
+	xorl	%edi,%ebx
+	movl	8(%esp),%ecx
+	movl	12(%esp),%edx
+	movl	16(%esp),%esi
+	movl	20(%esp),%edi
+	addl	(%esp),%eax
+	adcl	4(%esp),%ebx
+	orl	%esi,%ecx
+	orl	%edi,%edx
+	andl	24(%esp),%ecx
+	andl	28(%esp),%edx
+	andl	8(%esp),%esi
+	andl	12(%esp),%edi
+	orl	%esi,%ecx
+	orl	%edi,%edx
+	addl	%ecx,%eax
+	adcl	%edx,%ebx
+	movl	%eax,(%esp)
+	movl	%ebx,4(%esp)
+	movb	(%ebp),%dl
+	subl	$8,%esp
+	leal	8(%ebp),%ebp
+	cmpb	$148,%dl
+	jne	.L00900_15_x86
+.align	16
+.L01016_79_x86:
+	movl	312(%esp),%ecx
+	movl	316(%esp),%edx
+	movl	%ecx,%esi
+	shrl	$1,%ecx
+	movl	%edx,%edi
+	shrl	$1,%edx
+	movl	%ecx,%eax
+	shll	$24,%esi
+	movl	%edx,%ebx
+	shll	$24,%edi
+	xorl	%esi,%ebx
+	shrl	$6,%ecx
+	xorl	%edi,%eax
+	shrl	$6,%edx
+	xorl	%ecx,%eax
+	shll	$7,%esi
+	xorl	%edx,%ebx
+	shll	$1,%edi
+	xorl	%esi,%ebx
+	shrl	$1,%ecx
+	xorl	%edi,%eax
+	shrl	$1,%edx
+	xorl	%ecx,%eax
+	shll	$6,%edi
+	xorl	%edx,%ebx
+	xorl	%edi,%eax
+	movl	%eax,(%esp)
+	movl	%ebx,4(%esp)
+	movl	208(%esp),%ecx
+	movl	212(%esp),%edx
+	movl	%ecx,%esi
+	shrl	$6,%ecx
+	movl	%edx,%edi
+	shrl	$6,%edx
+	movl	%ecx,%eax
+	shll	$3,%esi
+	movl	%edx,%ebx
+	shll	$3,%edi
+	xorl	%esi,%eax
+	shrl	$13,%ecx
+	xorl	%edi,%ebx
+	shrl	$13,%edx
+	xorl	%ecx,%eax
+	shll	$10,%esi
+	xorl	%edx,%ebx
+	shll	$10,%edi
+	xorl	%esi,%ebx
+	shrl	$10,%ecx
+	xorl	%edi,%eax
+	shrl	$10,%edx
+	xorl	%ecx,%ebx
+	shll	$13,%edi
+	xorl	%edx,%eax
+	xorl	%edi,%eax
+	movl	320(%esp),%ecx
+	movl	324(%esp),%edx
+	addl	(%esp),%eax
+	adcl	4(%esp),%ebx
+	movl	248(%esp),%esi
+	movl	252(%esp),%edi
+	addl	%ecx,%eax
+	adcl	%edx,%ebx
+	addl	%esi,%eax
+	adcl	%edi,%ebx
+	movl	%eax,192(%esp)
+	movl	%ebx,196(%esp)
+	movl	40(%esp),%ecx
+	movl	44(%esp),%edx
+	movl	%ecx,%esi
+	shrl	$9,%ecx
+	movl	%edx,%edi
+	shrl	$9,%edx
+	movl	%ecx,%ebx
+	shll	$14,%esi
+	movl	%edx,%eax
+	shll	$14,%edi
+	xorl	%esi,%ebx
+	shrl	$5,%ecx
+	xorl	%edi,%eax
+	shrl	$5,%edx
+	xorl	%ecx,%eax
+	shll	$4,%esi
+	xorl	%edx,%ebx
+	shll	$4,%edi
+	xorl	%esi,%ebx
+	shrl	$4,%ecx
+	xorl	%edi,%eax
+	shrl	$4,%edx
+	xorl	%ecx,%eax
+	shll	$5,%esi
+	xorl	%edx,%ebx
+	shll	$5,%edi
+	xorl	%esi,%eax
+	xorl	%edi,%ebx
+	movl	48(%esp),%ecx
+	movl	52(%esp),%edx
+	movl	56(%esp),%esi
+	movl	60(%esp),%edi
+	addl	64(%esp),%eax
+	adcl	68(%esp),%ebx
+	xorl	%esi,%ecx
+	xorl	%edi,%edx
+	andl	40(%esp),%ecx
+	andl	44(%esp),%edx
+	addl	192(%esp),%eax
+	adcl	196(%esp),%ebx
+	xorl	%esi,%ecx
+	xorl	%edi,%edx
+	movl	(%ebp),%esi
+	movl	4(%ebp),%edi
+	addl	%ecx,%eax
+	adcl	%edx,%ebx
+	movl	32(%esp),%ecx
+	movl	36(%esp),%edx
+	addl	%esi,%eax
+	adcl	%edi,%ebx
+	movl	%eax,(%esp)
+	movl	%ebx,4(%esp)
+	addl	%ecx,%eax
+	adcl	%edx,%ebx
+	movl	8(%esp),%ecx
+	movl	12(%esp),%edx
+	movl	%eax,32(%esp)
+	movl	%ebx,36(%esp)
+	movl	%ecx,%esi
+	shrl	$2,%ecx
+	movl	%edx,%edi
+	shrl	$2,%edx
+	movl	%ecx,%ebx
+	shll	$4,%esi
+	movl	%edx,%eax
+	shll	$4,%edi
+	xorl	%esi,%ebx
+	shrl	$5,%ecx
+	xorl	%edi,%eax
+	shrl	$5,%edx
+	xorl	%ecx,%ebx
+	shll	$21,%esi
+	xorl	%edx,%eax
+	shll	$21,%edi
+	xorl	%esi,%eax
+	shrl	$21,%ecx
+	xorl	%edi,%ebx
+	shrl	$21,%edx
+	xorl	%ecx,%eax
+	shll	$5,%esi
+	xorl	%edx,%ebx
+	shll	$5,%edi
+	xorl	%esi,%eax
+	xorl	%edi,%ebx
+	movl	8(%esp),%ecx
+	movl	12(%esp),%edx
+	movl	16(%esp),%esi
+	movl	20(%esp),%edi
+	addl	(%esp),%eax
+	adcl	4(%esp),%ebx
+	orl	%esi,%ecx
+	orl	%edi,%edx
+	andl	24(%esp),%ecx
+	andl	28(%esp),%edx
+	andl	8(%esp),%esi
+	andl	12(%esp),%edi
+	orl	%esi,%ecx
+	orl	%edi,%edx
+	addl	%ecx,%eax
+	adcl	%edx,%ebx
+	movl	%eax,(%esp)
+	movl	%ebx,4(%esp)
+	movb	(%ebp),%dl
+	subl	$8,%esp
+	leal	8(%ebp),%ebp
+	cmpb	$23,%dl
+	jne	.L01016_79_x86
+	movl	840(%esp),%esi
+	movl	844(%esp),%edi
+	movl	(%esi),%eax
+	movl	4(%esi),%ebx
+	movl	8(%esi),%ecx
+	movl	12(%esi),%edx
+	addl	8(%esp),%eax
+	adcl	12(%esp),%ebx
+	movl	%eax,(%esi)
+	movl	%ebx,4(%esi)
+	addl	16(%esp),%ecx
+	adcl	20(%esp),%edx
+	movl	%ecx,8(%esi)
+	movl	%edx,12(%esi)
+	movl	16(%esi),%eax
+	movl	20(%esi),%ebx
+	movl	24(%esi),%ecx
+	movl	28(%esi),%edx
+	addl	24(%esp),%eax
+	adcl	28(%esp),%ebx
+	movl	%eax,16(%esi)
+	movl	%ebx,20(%esi)
+	addl	32(%esp),%ecx
+	adcl	36(%esp),%edx
+	movl	%ecx,24(%esi)
+	movl	%edx,28(%esi)
+	movl	32(%esi),%eax
+	movl	36(%esi),%ebx
+	movl	40(%esi),%ecx
+	movl	44(%esi),%edx
+	addl	40(%esp),%eax
+	adcl	44(%esp),%ebx
+	movl	%eax,32(%esi)
+	movl	%ebx,36(%esi)
+	addl	48(%esp),%ecx
+	adcl	52(%esp),%edx
+	movl	%ecx,40(%esi)
+	movl	%edx,44(%esi)
+	movl	48(%esi),%eax
+	movl	52(%esi),%ebx
+	movl	56(%esi),%ecx
+	movl	60(%esi),%edx
+	addl	56(%esp),%eax
+	adcl	60(%esp),%ebx
+	movl	%eax,48(%esi)
+	movl	%ebx,52(%esi)
+	addl	64(%esp),%ecx
+	adcl	68(%esp),%edx
+	movl	%ecx,56(%esi)
+	movl	%edx,60(%esi)
+	addl	$840,%esp
+	subl	$640,%ebp
+	cmpl	8(%esp),%edi
+	jb	.L002loop_x86
+	movl	12(%esp),%esp
+	popl	%edi
+	popl	%esi
+	popl	%ebx
+	popl	%ebp
+	ret
+.align	64
+.L001K512:
+.long	3609767458,1116352408
+.long	602891725,1899447441
+.long	3964484399,3049323471
+.long	2173295548,3921009573
+.long	4081628472,961987163
+.long	3053834265,1508970993
+.long	2937671579,2453635748
+.long	3664609560,2870763221
+.long	2734883394,3624381080
+.long	1164996542,310598401
+.long	1323610764,607225278
+.long	3590304994,1426881987
+.long	4068182383,1925078388
+.long	991336113,2162078206
+.long	633803317,2614888103
+.long	3479774868,3248222580
+.long	2666613458,3835390401
+.long	944711139,4022224774
+.long	2341262773,264347078
+.long	2007800933,604807628
+.long	1495990901,770255983
+.long	1856431235,1249150122
+.long	3175218132,1555081692
+.long	2198950837,1996064986
+.long	3999719339,2554220882
+.long	766784016,2821834349
+.long	2566594879,2952996808
+.long	3203337956,3210313671
+.long	1034457026,3336571891
+.long	2466948901,3584528711
+.long	3758326383,113926993
+.long	168717936,338241895
+.long	1188179964,666307205
+.long	1546045734,773529912
+.long	1522805485,1294757372
+.long	2643833823,1396182291
+.long	2343527390,1695183700
+.long	1014477480,1986661051
+.long	1206759142,2177026350
+.long	344077627,2456956037
+.long	1290863460,2730485921
+.long	3158454273,2820302411
+.long	3505952657,3259730800
+.long	106217008,3345764771
+.long	3606008344,3516065817
+.long	1432725776,3600352804
+.long	1467031594,4094571909
+.long	851169720,275423344
+.long	3100823752,430227734
+.long	1363258195,506948616
+.long	3750685593,659060556
+.long	3785050280,883997877
+.long	3318307427,958139571
+.long	3812723403,1322822218
+.long	2003034995,1537002063
+.long	3602036899,1747873779
+.long	1575990012,1955562222
+.long	1125592928,2024104815
+.long	2716904306,2227730452
+.long	442776044,2361852424
+.long	593698344,2428436474
+.long	3733110249,2756734187
+.long	2999351573,3204031479
+.long	3815920427,3329325298
+.long	3928383900,3391569614
+.long	566280711,3515267271
+.long	3454069534,3940187606
+.long	4000239992,4118630271
+.long	1914138554,116418474
+.long	2731055270,174292421
+.long	3203993006,289380356
+.long	320620315,460393269
+.long	587496836,685471733
+.long	1086792851,852142971
+.long	365543100,1017036298
+.long	2618297676,1126000580
+.long	3409855158,1288033470
+.long	4234509866,1501505948
+.long	987167468,1607167915
+.long	1246189591,1816402316
+.long	67438087,66051
+.long	202182159,134810123
+.size	sha512_block_data_order,.-.L_sha512_block_data_order_begin
+.byte	83,72,65,53,49,50,32,98,108,111,99,107,32,116,114,97
+.byte	110,115,102,111,114,109,32,102,111,114,32,120,56,54,44,32
+.byte	67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97
+.byte	112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103
+.byte	62,0
+#endif
+.section	.note.GNU-stack,"",@progbits
--- /dev/null
+++ b/third_party/boringssl/linux-x86/crypto/fipsmodule/vpaes-x86.S
@@ -1,0 +1,708 @@
+// This file is generated from a similarly-named Perl script in the BoringSSL
+// source tree. Do not edit by hand.
+
+#if defined(__i386__)
+#if defined(BORINGSSL_PREFIX)
+#include <boringssl_prefix_symbols_asm.h>
+#endif
+.text
+#ifdef BORINGSSL_DISPATCH_TEST
+#endif
+.align	64
+.L_vpaes_consts:
+.long	218628480,235210255,168496130,67568393
+.long	252381056,17041926,33884169,51187212
+.long	252645135,252645135,252645135,252645135
+.long	1512730624,3266504856,1377990664,3401244816
+.long	830229760,1275146365,2969422977,3447763452
+.long	3411033600,2979783055,338359620,2782886510
+.long	4209124096,907596821,221174255,1006095553
+.long	191964160,3799684038,3164090317,1589111125
+.long	182528256,1777043520,2877432650,3265356744
+.long	1874708224,3503451415,3305285752,363511674
+.long	1606117888,3487855781,1093350906,2384367825
+.long	197121,67569157,134941193,202313229
+.long	67569157,134941193,202313229,197121
+.long	134941193,202313229,197121,67569157
+.long	202313229,197121,67569157,134941193
+.long	33619971,100992007,168364043,235736079
+.long	235736079,33619971,100992007,168364043
+.long	168364043,235736079,33619971,100992007
+.long	100992007,168364043,235736079,33619971
+.long	50462976,117835012,185207048,252579084
+.long	252314880,51251460,117574920,184942860
+.long	184682752,252054788,50987272,118359308
+.long	118099200,185467140,251790600,50727180
+.long	2946363062,528716217,1300004225,1881839624
+.long	1532713819,1532713819,1532713819,1532713819
+.long	3602276352,4288629033,3737020424,4153884961
+.long	1354558464,32357713,2958822624,3775749553
+.long	1201988352,132424512,1572796698,503232858
+.long	2213177600,1597421020,4103937655,675398315
+.long	2749646592,4273543773,1511898873,121693092
+.long	3040248576,1103263732,2871565598,1608280554
+.long	2236667136,2588920351,482954393,64377734
+.long	3069987328,291237287,2117370568,3650299247
+.long	533321216,3573750986,2572112006,1401264716
+.long	1339849704,2721158661,548607111,3445553514
+.long	2128193280,3054596040,2183486460,1257083700
+.long	655635200,1165381986,3923443150,2344132524
+.long	190078720,256924420,290342170,357187870
+.long	1610966272,2263057382,4103205268,309794674
+.long	2592527872,2233205587,1335446729,3402964816
+.long	3973531904,3225098121,3002836325,1918774430
+.long	3870401024,2102906079,2284471353,4117666579
+.long	617007872,1021508343,366931923,691083277
+.long	2528395776,3491914898,2968704004,1613121270
+.long	3445188352,3247741094,844474987,4093578302
+.long	651481088,1190302358,1689581232,574775300
+.long	4289380608,206939853,2555985458,2489840491
+.long	2130264064,327674451,3566485037,3349835193
+.long	2470714624,316102159,3636825756,3393945945
+.byte	86,101,99,116,111,114,32,80,101,114,109,117,116,97,116,105
+.byte	111,110,32,65,69,83,32,102,111,114,32,120,56,54,47,83
+.byte	83,83,69,51,44,32,77,105,107,101,32,72,97,109,98,117
+.byte	114,103,32,40,83,116,97,110,102,111,114,100,32,85,110,105
+.byte	118,101,114,115,105,116,121,41,0
+.align	64
+.hidden	_vpaes_preheat
+.type	_vpaes_preheat,@function
+.align	16
+_vpaes_preheat:
+	addl	(%esp),%ebp
+	movdqa	-48(%ebp),%xmm7
+	movdqa	-16(%ebp),%xmm6
+	ret
+.size	_vpaes_preheat,.-_vpaes_preheat
+.hidden	_vpaes_encrypt_core
+.type	_vpaes_encrypt_core,@function
+.align	16
+_vpaes_encrypt_core:
+	movl	$16,%ecx
+	movl	240(%edx),%eax
+	movdqa	%xmm6,%xmm1
+	movdqa	(%ebp),%xmm2
+	pandn	%xmm0,%xmm1
+	pand	%xmm6,%xmm0
+	movdqu	(%edx),%xmm5
+.byte	102,15,56,0,208
+	movdqa	16(%ebp),%xmm0
+	pxor	%xmm5,%xmm2
+	psrld	$4,%xmm1
+	addl	$16,%edx
+.byte	102,15,56,0,193
+	leal	192(%ebp),%ebx
+	pxor	%xmm2,%xmm0
+	jmp	.L000enc_entry
+.align	16
+.L001enc_loop:
+	movdqa	32(%ebp),%xmm4
+	movdqa	48(%ebp),%xmm0
+.byte	102,15,56,0,226
+.byte	102,15,56,0,195
+	pxor	%xmm5,%xmm4
+	movdqa	64(%ebp),%xmm5
+	pxor	%xmm4,%xmm0
+	movdqa	-64(%ebx,%ecx,1),%xmm1
+.byte	102,15,56,0,234
+	movdqa	80(%ebp),%xmm2
+	movdqa	(%ebx,%ecx,1),%xmm4
+.byte	102,15,56,0,211
+	movdqa	%xmm0,%xmm3
+	pxor	%xmm5,%xmm2
+.byte	102,15,56,0,193
+	addl	$16,%edx
+	pxor	%xmm2,%xmm0
+.byte	102,15,56,0,220
+	addl	$16,%ecx
+	pxor	%xmm0,%xmm3
+.byte	102,15,56,0,193
+	andl	$48,%ecx
+	subl	$1,%eax
+	pxor	%xmm3,%xmm0
+.L000enc_entry:
+	movdqa	%xmm6,%xmm1
+	movdqa	-32(%ebp),%xmm5
+	pandn	%xmm0,%xmm1
+	psrld	$4,%xmm1
+	pand	%xmm6,%xmm0
+.byte	102,15,56,0,232
+	movdqa	%xmm7,%xmm3
+	pxor	%xmm1,%xmm0
+.byte	102,15,56,0,217
+	movdqa	%xmm7,%xmm4
+	pxor	%xmm5,%xmm3
+.byte	102,15,56,0,224
+	movdqa	%xmm7,%xmm2
+	pxor	%xmm5,%xmm4
+.byte	102,15,56,0,211
+	movdqa	%xmm7,%xmm3
+	pxor	%xmm0,%xmm2
+.byte	102,15,56,0,220
+	movdqu	(%edx),%xmm5
+	pxor	%xmm1,%xmm3
+	jnz	.L001enc_loop
+	movdqa	96(%ebp),%xmm4
+	movdqa	112(%ebp),%xmm0
+.byte	102,15,56,0,226
+	pxor	%xmm5,%xmm4
+.byte	102,15,56,0,195
+	movdqa	64(%ebx,%ecx,1),%xmm1
+	pxor	%xmm4,%xmm0
+.byte	102,15,56,0,193
+	ret
+.size	_vpaes_encrypt_core,.-_vpaes_encrypt_core
+.hidden	_vpaes_decrypt_core
+.type	_vpaes_decrypt_core,@function
+.align	16
+_vpaes_decrypt_core:
+	leal	608(%ebp),%ebx
+	movl	240(%edx),%eax
+	movdqa	%xmm6,%xmm1
+	movdqa	-64(%ebx),%xmm2
+	pandn	%xmm0,%xmm1
+	movl	%eax,%ecx
+	psrld	$4,%xmm1
+	movdqu	(%edx),%xmm5
+	shll	$4,%ecx
+	pand	%xmm6,%xmm0
+.byte	102,15,56,0,208
+	movdqa	-48(%ebx),%xmm0
+	xorl	$48,%ecx
+.byte	102,15,56,0,193
+	andl	$48,%ecx
+	pxor	%xmm5,%xmm2
+	movdqa	176(%ebp),%xmm5
+	pxor	%xmm2,%xmm0
+	addl	$16,%edx
+	leal	-352(%ebx,%ecx,1),%ecx
+	jmp	.L002dec_entry
+.align	16
+.L003dec_loop:
+	movdqa	-32(%ebx),%xmm4
+	movdqa	-16(%ebx),%xmm1
+.byte	102,15,56,0,226
+.byte	102,15,56,0,203
+	pxor	%xmm4,%xmm0
+	movdqa	(%ebx),%xmm4
+	pxor	%xmm1,%xmm0
+	movdqa	16(%ebx),%xmm1
+.byte	102,15,56,0,226
+.byte	102,15,56,0,197
+.byte	102,15,56,0,203
+	pxor	%xmm4,%xmm0
+	movdqa	32(%ebx),%xmm4
+	pxor	%xmm1,%xmm0
+	movdqa	48(%ebx),%xmm1
+.byte	102,15,56,0,226
+.byte	102,15,56,0,197
+.byte	102,15,56,0,203
+	pxor	%xmm4,%xmm0
+	movdqa	64(%ebx),%xmm4
+	pxor	%xmm1,%xmm0
+	movdqa	80(%ebx),%xmm1
+.byte	102,15,56,0,226
+.byte	102,15,56,0,197
+.byte	102,15,56,0,203
+	pxor	%xmm4,%xmm0
+	addl	$16,%edx
+.byte	102,15,58,15,237,12
+	pxor	%xmm1,%xmm0
+	subl	$1,%eax
+.L002dec_entry:
+	movdqa	%xmm6,%xmm1
+	movdqa	-32(%ebp),%xmm2
+	pandn	%xmm0,%xmm1
+	pand	%xmm6,%xmm0
+	psrld	$4,%xmm1
+.byte	102,15,56,0,208
+	movdqa	%xmm7,%xmm3
+	pxor	%xmm1,%xmm0
+.byte	102,15,56,0,217
+	movdqa	%xmm7,%xmm4
+	pxor	%xmm2,%xmm3
+.byte	102,15,56,0,224
+	pxor	%xmm2,%xmm4
+	movdqa	%xmm7,%xmm2
+.byte	102,15,56,0,211
+	movdqa	%xmm7,%xmm3
+	pxor	%xmm0,%xmm2
+.byte	102,15,56,0,220
+	movdqu	(%edx),%xmm0
+	pxor	%xmm1,%xmm3
+	jnz	.L003dec_loop
+	movdqa	96(%ebx),%xmm4
+.byte	102,15,56,0,226
+	pxor	%xmm0,%xmm4
+	movdqa	112(%ebx),%xmm0
+	movdqa	(%ecx),%xmm2
+.byte	102,15,56,0,195
+	pxor	%xmm4,%xmm0
+.byte	102,15,56,0,194
+	ret
+.size	_vpaes_decrypt_core,.-_vpaes_decrypt_core
+.hidden	_vpaes_schedule_core
+.type	_vpaes_schedule_core,@function
+.align	16
+_vpaes_schedule_core:
+	addl	(%esp),%ebp
+	movdqu	(%esi),%xmm0
+	movdqa	320(%ebp),%xmm2
+	movdqa	%xmm0,%xmm3
+	leal	(%ebp),%ebx
+	movdqa	%xmm2,4(%esp)
+	call	_vpaes_schedule_transform
+	movdqa	%xmm0,%xmm7
+	testl	%edi,%edi
+	jnz	.L004schedule_am_decrypting
+	movdqu	%xmm0,(%edx)
+	jmp	.L005schedule_go
+.L004schedule_am_decrypting:
+	movdqa	256(%ebp,%ecx,1),%xmm1
+.byte	102,15,56,0,217
+	movdqu	%xmm3,(%edx)
+	xorl	$48,%ecx
+.L005schedule_go:
+	cmpl	$192,%eax
+	ja	.L006schedule_256
+	je	.L007schedule_192
+.L008schedule_128:
+	movl	$10,%eax
+.L009loop_schedule_128:
+	call	_vpaes_schedule_round
+	decl	%eax
+	jz	.L010schedule_mangle_last
+	call	_vpaes_schedule_mangle
+	jmp	.L009loop_schedule_128
+.align	16
+.L007schedule_192:
+	movdqu	8(%esi),%xmm0
+	call	_vpaes_schedule_transform
+	movdqa	%xmm0,%xmm6
+	pxor	%xmm4,%xmm4
+	movhlps	%xmm4,%xmm6
+	movl	$4,%eax
+.L011loop_schedule_192:
+	call	_vpaes_schedule_round
+.byte	102,15,58,15,198,8
+	call	_vpaes_schedule_mangle
+	call	_vpaes_schedule_192_smear
+	call	_vpaes_schedule_mangle
+	call	_vpaes_schedule_round
+	decl	%eax
+	jz	.L010schedule_mangle_last
+	call	_vpaes_schedule_mangle
+	call	_vpaes_schedule_192_smear
+	jmp	.L011loop_schedule_192
+.align	16
+.L006schedule_256:
+	movdqu	16(%esi),%xmm0
+	call	_vpaes_schedule_transform
+	movl	$7,%eax
+.L012loop_schedule_256:
+	call	_vpaes_schedule_mangle
+	movdqa	%xmm0,%xmm6
+	call	_vpaes_schedule_round
+	decl	%eax
+	jz	.L010schedule_mangle_last
+	call	_vpaes_schedule_mangle
+	pshufd	$255,%xmm0,%xmm0
+	movdqa	%xmm7,20(%esp)
+	movdqa	%xmm6,%xmm7
+	call	.L_vpaes_schedule_low_round
+	movdqa	20(%esp),%xmm7
+	jmp	.L012loop_schedule_256
+.align	16
+.L010schedule_mangle_last:
+	leal	384(%ebp),%ebx
+	testl	%edi,%edi
+	jnz	.L013schedule_mangle_last_dec
+	movdqa	256(%ebp,%ecx,1),%xmm1
+.byte	102,15,56,0,193
+	leal	352(%ebp),%ebx
+	addl	$32,%edx
+.L013schedule_mangle_last_dec:
+	addl	$-16,%edx
+	pxor	336(%ebp),%xmm0
+	call	_vpaes_schedule_transform
+	movdqu	%xmm0,(%edx)
+	pxor	%xmm0,%xmm0
+	pxor	%xmm1,%xmm1
+	pxor	%xmm2,%xmm2
+	pxor	%xmm3,%xmm3
+	pxor	%xmm4,%xmm4
+	pxor	%xmm5,%xmm5
+	pxor	%xmm6,%xmm6
+	pxor	%xmm7,%xmm7
+	ret
+.size	_vpaes_schedule_core,.-_vpaes_schedule_core
+.hidden	_vpaes_schedule_192_smear
+.type	_vpaes_schedule_192_smear,@function
+.align	16
+_vpaes_schedule_192_smear:
+	pshufd	$128,%xmm6,%xmm1
+	pshufd	$254,%xmm7,%xmm0
+	pxor	%xmm1,%xmm6
+	pxor	%xmm1,%xmm1
+	pxor	%xmm0,%xmm6
+	movdqa	%xmm6,%xmm0
+	movhlps	%xmm1,%xmm6
+	ret
+.size	_vpaes_schedule_192_smear,.-_vpaes_schedule_192_smear
+.hidden	_vpaes_schedule_round
+.type	_vpaes_schedule_round,@function
+.align	16
+_vpaes_schedule_round:
+	movdqa	8(%esp),%xmm2
+	pxor	%xmm1,%xmm1
+.byte	102,15,58,15,202,15
+.byte	102,15,58,15,210,15
+	pxor	%xmm1,%xmm7
+	pshufd	$255,%xmm0,%xmm0
+.byte	102,15,58,15,192,1
+	movdqa	%xmm2,8(%esp)
+.L_vpaes_schedule_low_round:
+	movdqa	%xmm7,%xmm1
+	pslldq	$4,%xmm7
+	pxor	%xmm1,%xmm7
+	movdqa	%xmm7,%xmm1
+	pslldq	$8,%xmm7
+	pxor	%xmm1,%xmm7
+	pxor	336(%ebp),%xmm7
+	movdqa	-16(%ebp),%xmm4
+	movdqa	-48(%ebp),%xmm5
+	movdqa	%xmm4,%xmm1
+	pandn	%xmm0,%xmm1
+	psrld	$4,%xmm1
+	pand	%xmm4,%xmm0
+	movdqa	-32(%ebp),%xmm2
+.byte	102,15,56,0,208
+	pxor	%xmm1,%xmm0
+	movdqa	%xmm5,%xmm3
+.byte	102,15,56,0,217
+	pxor	%xmm2,%xmm3
+	movdqa	%xmm5,%xmm4
+.byte	102,15,56,0,224
+	pxor	%xmm2,%xmm4
+	movdqa	%xmm5,%xmm2
+.byte	102,15,56,0,211
+	pxor	%xmm0,%xmm2
+	movdqa	%xmm5,%xmm3
+.byte	102,15,56,0,220
+	pxor	%xmm1,%xmm3
+	movdqa	32(%ebp),%xmm4
+.byte	102,15,56,0,226
+	movdqa	48(%ebp),%xmm0
+.byte	102,15,56,0,195
+	pxor	%xmm4,%xmm0
+	pxor	%xmm7,%xmm0
+	movdqa	%xmm0,%xmm7
+	ret
+.size	_vpaes_schedule_round,.-_vpaes_schedule_round
+.hidden	_vpaes_schedule_transform
+.type	_vpaes_schedule_transform,@function
+.align	16
+_vpaes_schedule_transform:
+	movdqa	-16(%ebp),%xmm2
+	movdqa	%xmm2,%xmm1
+	pandn	%xmm0,%xmm1
+	psrld	$4,%xmm1
+	pand	%xmm2,%xmm0
+	movdqa	(%ebx),%xmm2
+.byte	102,15,56,0,208
+	movdqa	16(%ebx),%xmm0
+.byte	102,15,56,0,193
+	pxor	%xmm2,%xmm0
+	ret
+.size	_vpaes_schedule_transform,.-_vpaes_schedule_transform
+.hidden	_vpaes_schedule_mangle
+.type	_vpaes_schedule_mangle,@function
+.align	16
+_vpaes_schedule_mangle:
+	movdqa	%xmm0,%xmm4
+	movdqa	128(%ebp),%xmm5
+	testl	%edi,%edi
+	jnz	.L014schedule_mangle_dec
+	addl	$16,%edx
+	pxor	336(%ebp),%xmm4
+.byte	102,15,56,0,229
+	movdqa	%xmm4,%xmm3
+.byte	102,15,56,0,229
+	pxor	%xmm4,%xmm3
+.byte	102,15,56,0,229
+	pxor	%xmm4,%xmm3
+	jmp	.L015schedule_mangle_both
+.align	16
+.L014schedule_mangle_dec:
+	movdqa	-16(%ebp),%xmm2
+	leal	416(%ebp),%esi
+	movdqa	%xmm2,%xmm1
+	pandn	%xmm4,%xmm1
+	psrld	$4,%xmm1
+	pand	%xmm2,%xmm4
+	movdqa	(%esi),%xmm2
+.byte	102,15,56,0,212
+	movdqa	16(%esi),%xmm3
+.byte	102,15,56,0,217
+	pxor	%xmm2,%xmm3
+.byte	102,15,56,0,221
+	movdqa	32(%esi),%xmm2
+.byte	102,15,56,0,212
+	pxor	%xmm3,%xmm2
+	movdqa	48(%esi),%xmm3
+.byte	102,15,56,0,217
+	pxor	%xmm2,%xmm3
+.byte	102,15,56,0,221
+	movdqa	64(%esi),%xmm2
+.byte	102,15,56,0,212
+	pxor	%xmm3,%xmm2
+	movdqa	80(%esi),%xmm3
+.byte	102,15,56,0,217
+	pxor	%xmm2,%xmm3
+.byte	102,15,56,0,221
+	movdqa	96(%esi),%xmm2
+.byte	102,15,56,0,212
+	pxor	%xmm3,%xmm2
+	movdqa	112(%esi),%xmm3
+.byte	102,15,56,0,217
+	pxor	%xmm2,%xmm3
+	addl	$-16,%edx
+.L015schedule_mangle_both:
+	movdqa	256(%ebp,%ecx,1),%xmm1
+.byte	102,15,56,0,217
+	addl	$-16,%ecx
+	andl	$48,%ecx
+	movdqu	%xmm3,(%edx)
+	ret
+.size	_vpaes_schedule_mangle,.-_vpaes_schedule_mangle
+.globl	vpaes_set_encrypt_key
+.hidden	vpaes_set_encrypt_key
+.type	vpaes_set_encrypt_key,@function
+.align	16
+vpaes_set_encrypt_key:
+.L_vpaes_set_encrypt_key_begin:
+	pushl	%ebp
+	pushl	%ebx
+	pushl	%esi
+	pushl	%edi
+#ifdef BORINGSSL_DISPATCH_TEST
+	pushl	%ebx
+	pushl	%edx
+	call	.L016pic
+.L016pic:
+	popl	%ebx
+	leal	BORINGSSL_function_hit+5-.L016pic(%ebx),%ebx
+	movl	$1,%edx
+	movb	%dl,(%ebx)
+	popl	%edx
+	popl	%ebx
+#endif
+	movl	20(%esp),%esi
+	leal	-56(%esp),%ebx
+	movl	24(%esp),%eax
+	andl	$-16,%ebx
+	movl	28(%esp),%edx
+	xchgl	%esp,%ebx
+	movl	%ebx,48(%esp)
+	movl	%eax,%ebx
+	shrl	$5,%ebx
+	addl	$5,%ebx
+	movl	%ebx,240(%edx)
+	movl	$48,%ecx
+	movl	$0,%edi
+	leal	.L_vpaes_consts+0x30-.L017pic_point,%ebp
+	call	_vpaes_schedule_core
+.L017pic_point:
+	movl	48(%esp),%esp
+	xorl	%eax,%eax
+	popl	%edi
+	popl	%esi
+	popl	%ebx
+	popl	%ebp
+	ret
+.size	vpaes_set_encrypt_key,.-.L_vpaes_set_encrypt_key_begin
+.globl	vpaes_set_decrypt_key
+.hidden	vpaes_set_decrypt_key
+.type	vpaes_set_decrypt_key,@function
+.align	16
+vpaes_set_decrypt_key:
+.L_vpaes_set_decrypt_key_begin:
+	pushl	%ebp
+	pushl	%ebx
+	pushl	%esi
+	pushl	%edi
+	movl	20(%esp),%esi
+	leal	-56(%esp),%ebx
+	movl	24(%esp),%eax
+	andl	$-16,%ebx
+	movl	28(%esp),%edx
+	xchgl	%esp,%ebx
+	movl	%ebx,48(%esp)
+	movl	%eax,%ebx
+	shrl	$5,%ebx
+	addl	$5,%ebx
+	movl	%ebx,240(%edx)
+	shll	$4,%ebx
+	leal	16(%edx,%ebx,1),%edx
+	movl	$1,%edi
+	movl	%eax,%ecx
+	shrl	$1,%ecx
+	andl	$32,%ecx
+	xorl	$32,%ecx
+	leal	.L_vpaes_consts+0x30-.L018pic_point,%ebp
+	call	_vpaes_schedule_core
+.L018pic_point:
+	movl	48(%esp),%esp
+	xorl	%eax,%eax
+	popl	%edi
+	popl	%esi
+	popl	%ebx
+	popl	%ebp
+	ret
+.size	vpaes_set_decrypt_key,.-.L_vpaes_set_decrypt_key_begin
+.globl	vpaes_encrypt
+.hidden	vpaes_encrypt
+.type	vpaes_encrypt,@function
+.align	16
+vpaes_encrypt:
+.L_vpaes_encrypt_begin:
+	pushl	%ebp
+	pushl	%ebx
+	pushl	%esi
+	pushl	%edi
+#ifdef BORINGSSL_DISPATCH_TEST
+	pushl	%ebx
+	pushl	%edx
+	call	.L019pic
+.L019pic:
+	popl	%ebx
+	leal	BORINGSSL_function_hit+4-.L019pic(%ebx),%ebx
+	movl	$1,%edx
+	movb	%dl,(%ebx)
+	popl	%edx
+	popl	%ebx
+#endif
+	leal	.L_vpaes_consts+0x30-.L020pic_point,%ebp
+	call	_vpaes_preheat
+.L020pic_point:
+	movl	20(%esp),%esi
+	leal	-56(%esp),%ebx
+	movl	24(%esp),%edi
+	andl	$-16,%ebx
+	movl	28(%esp),%edx
+	xchgl	%esp,%ebx
+	movl	%ebx,48(%esp)
+	movdqu	(%esi),%xmm0
+	call	_vpaes_encrypt_core
+	movdqu	%xmm0,(%edi)
+	movl	48(%esp),%esp
+	popl	%edi
+	popl	%esi
+	popl	%ebx
+	popl	%ebp
+	ret
+.size	vpaes_encrypt,.-.L_vpaes_encrypt_begin
+.globl	vpaes_decrypt
+.hidden	vpaes_decrypt
+.type	vpaes_decrypt,@function
+.align	16
+vpaes_decrypt:
+.L_vpaes_decrypt_begin:
+	pushl	%ebp
+	pushl	%ebx
+	pushl	%esi
+	pushl	%edi
+	leal	.L_vpaes_consts+0x30-.L021pic_point,%ebp
+	call	_vpaes_preheat
+.L021pic_point:
+	movl	20(%esp),%esi
+	leal	-56(%esp),%ebx
+	movl	24(%esp),%edi
+	andl	$-16,%ebx
+	movl	28(%esp),%edx
+	xchgl	%esp,%ebx
+	movl	%ebx,48(%esp)
+	movdqu	(%esi),%xmm0
+	call	_vpaes_decrypt_core
+	movdqu	%xmm0,(%edi)
+	movl	48(%esp),%esp
+	popl	%edi
+	popl	%esi
+	popl	%ebx
+	popl	%ebp
+	ret
+.size	vpaes_decrypt,.-.L_vpaes_decrypt_begin
+.globl	vpaes_cbc_encrypt
+.hidden	vpaes_cbc_encrypt
+.type	vpaes_cbc_encrypt,@function
+.align	16
+vpaes_cbc_encrypt:
+.L_vpaes_cbc_encrypt_begin:
+	pushl	%ebp
+	pushl	%ebx
+	pushl	%esi
+	pushl	%edi
+	movl	20(%esp),%esi
+	movl	24(%esp),%edi
+	movl	28(%esp),%eax
+	movl	32(%esp),%edx
+	subl	$16,%eax
+	jc	.L022cbc_abort
+	leal	-56(%esp),%ebx
+	movl	36(%esp),%ebp
+	andl	$-16,%ebx
+	movl	40(%esp),%ecx
+	xchgl	%esp,%ebx
+	movdqu	(%ebp),%xmm1
+	subl	%esi,%edi
+	movl	%ebx,48(%esp)
+	movl	%edi,(%esp)
+	movl	%edx,4(%esp)
+	movl	%ebp,8(%esp)
+	movl	%eax,%edi
+	leal	.L_vpaes_consts+0x30-.L023pic_point,%ebp
+	call	_vpaes_preheat
+.L023pic_point:
+	cmpl	$0,%ecx
+	je	.L024cbc_dec_loop
+	jmp	.L025cbc_enc_loop
+.align	16
+.L025cbc_enc_loop:
+	movdqu	(%esi),%xmm0
+	pxor	%xmm1,%xmm0
+	call	_vpaes_encrypt_core
+	movl	(%esp),%ebx
+	movl	4(%esp),%edx
+	movdqa	%xmm0,%xmm1
+	movdqu	%xmm0,(%ebx,%esi,1)
+	leal	16(%esi),%esi
+	subl	$16,%edi
+	jnc	.L025cbc_enc_loop
+	jmp	.L026cbc_done
+.align	16
+.L024cbc_dec_loop:
+	movdqu	(%esi),%xmm0
+	movdqa	%xmm1,16(%esp)
+	movdqa	%xmm0,32(%esp)
+	call	_vpaes_decrypt_core
+	movl	(%esp),%ebx
+	movl	4(%esp),%edx
+	pxor	16(%esp),%xmm0
+	movdqa	32(%esp),%xmm1
+	movdqu	%xmm0,(%ebx,%esi,1)
+	leal	16(%esi),%esi
+	subl	$16,%edi
+	jnc	.L024cbc_dec_loop
+.L026cbc_done:
+	movl	8(%esp),%ebx
+	movl	48(%esp),%esp
+	movdqu	%xmm1,(%ebx)
+.L022cbc_abort:
+	popl	%edi
+	popl	%esi
+	popl	%ebx
+	popl	%ebp
+	ret
+.size	vpaes_cbc_encrypt,.-.L_vpaes_cbc_encrypt_begin
+#endif
+.section	.note.GNU-stack,"",@progbits
--- /dev/null
+++ b/third_party/boringssl/linux-x86/crypto/fipsmodule/x86-mont.S
@@ -1,0 +1,484 @@
+// This file is generated from a similarly-named Perl script in the BoringSSL
+// source tree. Do not edit by hand.
+
+#if defined(__i386__)
+#if defined(BORINGSSL_PREFIX)
+#include <boringssl_prefix_symbols_asm.h>
+#endif
+.text
+.globl	bn_mul_mont
+.hidden	bn_mul_mont
+.type	bn_mul_mont,@function
+.align	16
+bn_mul_mont:
+.L_bn_mul_mont_begin:
+	pushl	%ebp
+	pushl	%ebx
+	pushl	%esi
+	pushl	%edi
+	xorl	%eax,%eax
+	movl	40(%esp),%edi
+	cmpl	$4,%edi
+	jl	.L000just_leave
+	leal	20(%esp),%esi
+	leal	24(%esp),%edx
+	addl	$2,%edi
+	negl	%edi
+	leal	-32(%esp,%edi,4),%ebp
+	negl	%edi
+	movl	%ebp,%eax
+	subl	%edx,%eax
+	andl	$2047,%eax
+	subl	%eax,%ebp
+	xorl	%ebp,%edx
+	andl	$2048,%edx
+	xorl	$2048,%edx
+	subl	%edx,%ebp
+	andl	$-64,%ebp
+	movl	%esp,%eax
+	subl	%ebp,%eax
+	andl	$-4096,%eax
+	movl	%esp,%edx
+	leal	(%ebp,%eax,1),%esp
+	movl	(%esp),%eax
+	cmpl	%ebp,%esp
+	ja	.L001page_walk
+	jmp	.L002page_walk_done
+.align	16
+.L001page_walk:
+	leal	-4096(%esp),%esp
+	movl	(%esp),%eax
+	cmpl	%ebp,%esp
+	ja	.L001page_walk
+.L002page_walk_done:
+	movl	(%esi),%eax
+	movl	4(%esi),%ebx
+	movl	8(%esi),%ecx
+	movl	12(%esi),%ebp
+	movl	16(%esi),%esi
+	movl	(%esi),%esi
+	movl	%eax,4(%esp)
+	movl	%ebx,8(%esp)
+	movl	%ecx,12(%esp)
+	movl	%ebp,16(%esp)
+	movl	%esi,20(%esp)
+	leal	-3(%edi),%ebx
+	movl	%edx,24(%esp)
+	call	.L003PIC_me_up
+.L003PIC_me_up:
+	popl	%eax
+	leal	OPENSSL_ia32cap_P-.L003PIC_me_up(%eax),%eax
+	btl	$26,(%eax)
+	jnc	.L004non_sse2
+	movl	$-1,%eax
+	movd	%eax,%mm7
+	movl	8(%esp),%esi
+	movl	12(%esp),%edi
+	movl	16(%esp),%ebp
+	xorl	%edx,%edx
+	xorl	%ecx,%ecx
+	movd	(%edi),%mm4
+	movd	(%esi),%mm5
+	movd	(%ebp),%mm3
+	pmuludq	%mm4,%mm5
+	movq	%mm5,%mm2
+	movq	%mm5,%mm0
+	pand	%mm7,%mm0
+	pmuludq	20(%esp),%mm5
+	pmuludq	%mm5,%mm3
+	paddq	%mm0,%mm3
+	movd	4(%ebp),%mm1
+	movd	4(%esi),%mm0
+	psrlq	$32,%mm2
+	psrlq	$32,%mm3
+	incl	%ecx
+.align	16
+.L0051st:
+	pmuludq	%mm4,%mm0
+	pmuludq	%mm5,%mm1
+	paddq	%mm0,%mm2
+	paddq	%mm1,%mm3
+	movq	%mm2,%mm0
+	pand	%mm7,%mm0
+	movd	4(%ebp,%ecx,4),%mm1
+	paddq	%mm0,%mm3
+	movd	4(%esi,%ecx,4),%mm0
+	psrlq	$32,%mm2
+	movd	%mm3,28(%esp,%ecx,4)
+	psrlq	$32,%mm3
+	leal	1(%ecx),%ecx
+	cmpl	%ebx,%ecx
+	jl	.L0051st
+	pmuludq	%mm4,%mm0
+	pmuludq	%mm5,%mm1
+	paddq	%mm0,%mm2
+	paddq	%mm1,%mm3
+	movq	%mm2,%mm0
+	pand	%mm7,%mm0
+	paddq	%mm0,%mm3
+	movd	%mm3,28(%esp,%ecx,4)
+	psrlq	$32,%mm2
+	psrlq	$32,%mm3
+	paddq	%mm2,%mm3
+	movq	%mm3,32(%esp,%ebx,4)
+	incl	%edx
+.L006outer:
+	xorl	%ecx,%ecx
+	movd	(%edi,%edx,4),%mm4
+	movd	(%esi),%mm5
+	movd	32(%esp),%mm6
+	movd	(%ebp),%mm3
+	pmuludq	%mm4,%mm5
+	paddq	%mm6,%mm5
+	movq	%mm5,%mm0
+	movq	%mm5,%mm2
+	pand	%mm7,%mm0
+	pmuludq	20(%esp),%mm5
+	pmuludq	%mm5,%mm3
+	paddq	%mm0,%mm3
+	movd	36(%esp),%mm6
+	movd	4(%ebp),%mm1
+	movd	4(%esi),%mm0
+	psrlq	$32,%mm2
+	psrlq	$32,%mm3
+	paddq	%mm6,%mm2
+	incl	%ecx
+	decl	%ebx
+.L007inner:
+	pmuludq	%mm4,%mm0
+	pmuludq	%mm5,%mm1
+	paddq	%mm0,%mm2
+	paddq	%mm1,%mm3
+	movq	%mm2,%mm0
+	movd	36(%esp,%ecx,4),%mm6
+	pand	%mm7,%mm0
+	movd	4(%ebp,%ecx,4),%mm1
+	paddq	%mm0,%mm3
+	movd	4(%esi,%ecx,4),%mm0
+	psrlq	$32,%mm2
+	movd	%mm3,28(%esp,%ecx,4)
+	psrlq	$32,%mm3
+	paddq	%mm6,%mm2
+	decl	%ebx
+	leal	1(%ecx),%ecx
+	jnz	.L007inner
+	movl	%ecx,%ebx
+	pmuludq	%mm4,%mm0
+	pmuludq	%mm5,%mm1
+	paddq	%mm0,%mm2
+	paddq	%mm1,%mm3
+	movq	%mm2,%mm0
+	pand	%mm7,%mm0
+	paddq	%mm0,%mm3
+	movd	%mm3,28(%esp,%ecx,4)
+	psrlq	$32,%mm2
+	psrlq	$32,%mm3
+	movd	36(%esp,%ebx,4),%mm6
+	paddq	%mm2,%mm3
+	paddq	%mm6,%mm3
+	movq	%mm3,32(%esp,%ebx,4)
+	leal	1(%edx),%edx
+	cmpl	%ebx,%edx
+	jle	.L006outer
+	emms
+	jmp	.L008common_tail
+.align	16
+.L004non_sse2:
+	movl	8(%esp),%esi
+	leal	1(%ebx),%ebp
+	movl	12(%esp),%edi
+	xorl	%ecx,%ecx
+	movl	%esi,%edx
+	andl	$1,%ebp
+	subl	%edi,%edx
+	leal	4(%edi,%ebx,4),%eax
+	orl	%edx,%ebp
+	movl	(%edi),%edi
+	jz	.L009bn_sqr_mont
+	movl	%eax,28(%esp)
+	movl	(%esi),%eax
+	xorl	%edx,%edx
+.align	16
+.L010mull:
+	movl	%edx,%ebp
+	mull	%edi
+	addl	%eax,%ebp
+	leal	1(%ecx),%ecx
+	adcl	$0,%edx
+	movl	(%esi,%ecx,4),%eax
+	cmpl	%ebx,%ecx
+	movl	%ebp,28(%esp,%ecx,4)
+	jl	.L010mull
+	movl	%edx,%ebp
+	mull	%edi
+	movl	20(%esp),%edi
+	addl	%ebp,%eax
+	movl	16(%esp),%esi
+	adcl	$0,%edx
+	imull	32(%esp),%edi
+	movl	%eax,32(%esp,%ebx,4)
+	xorl	%ecx,%ecx
+	movl	%edx,36(%esp,%ebx,4)
+	movl	%ecx,40(%esp,%ebx,4)
+	movl	(%esi),%eax
+	mull	%edi
+	addl	32(%esp),%eax
+	movl	4(%esi),%eax
+	adcl	$0,%edx
+	incl	%ecx
+	jmp	.L0112ndmadd
+.align	16
+.L0121stmadd:
+	movl	%edx,%ebp
+	mull	%edi
+	addl	32(%esp,%ecx,4),%ebp
+	leal	1(%ecx),%ecx
+	adcl	$0,%edx
+	addl	%eax,%ebp
+	movl	(%esi,%ecx,4),%eax
+	adcl	$0,%edx
+	cmpl	%ebx,%ecx
+	movl	%ebp,28(%esp,%ecx,4)
+	jl	.L0121stmadd
+	movl	%edx,%ebp
+	mull	%edi
+	addl	32(%esp,%ebx,4),%eax
+	movl	20(%esp),%edi
+	adcl	$0,%edx
+	movl	16(%esp),%esi
+	addl	%eax,%ebp
+	adcl	$0,%edx
+	imull	32(%esp),%edi
+	xorl	%ecx,%ecx
+	addl	36(%esp,%ebx,4),%edx
+	movl	%ebp,32(%esp,%ebx,4)
+	adcl	$0,%ecx
+	movl	(%esi),%eax
+	movl	%edx,36(%esp,%ebx,4)
+	movl	%ecx,40(%esp,%ebx,4)
+	mull	%edi
+	addl	32(%esp),%eax
+	movl	4(%esi),%eax
+	adcl	$0,%edx
+	movl	$1,%ecx
+.align	16
+.L0112ndmadd:
+	movl	%edx,%ebp
+	mull	%edi
+	addl	32(%esp,%ecx,4),%ebp
+	leal	1(%ecx),%ecx
+	adcl	$0,%edx
+	addl	%eax,%ebp
+	movl	(%esi,%ecx,4),%eax
+	adcl	$0,%edx
+	cmpl	%ebx,%ecx
+	movl	%ebp,24(%esp,%ecx,4)
+	jl	.L0112ndmadd
+	movl	%edx,%ebp
+	mull	%edi
+	addl	32(%esp,%ebx,4),%ebp
+	adcl	$0,%edx
+	addl	%eax,%ebp
+	adcl	$0,%edx
+	movl	%ebp,28(%esp,%ebx,4)
+	xorl	%eax,%eax
+	movl	12(%esp),%ecx
+	addl	36(%esp,%ebx,4),%edx
+	adcl	40(%esp,%ebx,4),%eax
+	leal	4(%ecx),%ecx
+	movl	%edx,32(%esp,%ebx,4)
+	cmpl	28(%esp),%ecx
+	movl	%eax,36(%esp,%ebx,4)
+	je	.L008common_tail
+	movl	(%ecx),%edi
+	movl	8(%esp),%esi
+	movl	%ecx,12(%esp)
+	xorl	%ecx,%ecx
+	xorl	%edx,%edx
+	movl	(%esi),%eax
+	jmp	.L0121stmadd
+.align	16
+.L009bn_sqr_mont:
+	movl	%ebx,(%esp)
+	movl	%ecx,12(%esp)
+	movl	%edi,%eax
+	mull	%edi
+	movl	%eax,32(%esp)
+	movl	%edx,%ebx
+	shrl	$1,%edx
+	andl	$1,%ebx
+	incl	%ecx
+.align	16
+.L013sqr:
+	movl	(%esi,%ecx,4),%eax
+	movl	%edx,%ebp
+	mull	%edi
+	addl	%ebp,%eax
+	leal	1(%ecx),%ecx
+	adcl	$0,%edx
+	leal	(%ebx,%eax,2),%ebp
+	shrl	$31,%eax
+	cmpl	(%esp),%ecx
+	movl	%eax,%ebx
+	movl	%ebp,28(%esp,%ecx,4)
+	jl	.L013sqr
+	movl	(%esi,%ecx,4),%eax
+	movl	%edx,%ebp
+	mull	%edi
+	addl	%ebp,%eax
+	movl	20(%esp),%edi
+	adcl	$0,%edx
+	movl	16(%esp),%esi
+	leal	(%ebx,%eax,2),%ebp
+	imull	32(%esp),%edi
+	shrl	$31,%eax
+	movl	%ebp,32(%esp,%ecx,4)
+	leal	(%eax,%edx,2),%ebp
+	movl	(%esi),%eax
+	shrl	$31,%edx
+	movl	%ebp,36(%esp,%ecx,4)
+	movl	%edx,40(%esp,%ecx,4)
+	mull	%edi
+	addl	32(%esp),%eax
+	movl	%ecx,%ebx
+	adcl	$0,%edx
+	movl	4(%esi),%eax
+	movl	$1,%ecx
+.align	16
+.L0143rdmadd:
+	movl	%edx,%ebp
+	mull	%edi
+	addl	32(%esp,%ecx,4),%ebp
+	adcl	$0,%edx
+	addl	%eax,%ebp
+	movl	4(%esi,%ecx,4),%eax
+	adcl	$0,%edx
+	movl	%ebp,28(%esp,%ecx,4)
+	movl	%edx,%ebp
+	mull	%edi
+	addl	36(%esp,%ecx,4),%ebp
+	leal	2(%ecx),%ecx
+	adcl	$0,%edx
+	addl	%eax,%ebp
+	movl	(%esi,%ecx,4),%eax
+	adcl	$0,%edx
+	cmpl	%ebx,%ecx
+	movl	%ebp,24(%esp,%ecx,4)
+	jl	.L0143rdmadd
+	movl	%edx,%ebp
+	mull	%edi
+	addl	32(%esp,%ebx,4),%ebp
+	adcl	$0,%edx
+	addl	%eax,%ebp
+	adcl	$0,%edx
+	movl	%ebp,28(%esp,%ebx,4)
+	movl	12(%esp),%ecx
+	xorl	%eax,%eax
+	movl	8(%esp),%esi
+	addl	36(%esp,%ebx,4),%edx
+	adcl	40(%esp,%ebx,4),%eax
+	movl	%edx,32(%esp,%ebx,4)
+	cmpl	%ebx,%ecx
+	movl	%eax,36(%esp,%ebx,4)
+	je	.L008common_tail
+	movl	4(%esi,%ecx,4),%edi
+	leal	1(%ecx),%ecx
+	movl	%edi,%eax
+	movl	%ecx,12(%esp)
+	mull	%edi
+	addl	32(%esp,%ecx,4),%eax
+	adcl	$0,%edx
+	movl	%eax,32(%esp,%ecx,4)
+	xorl	%ebp,%ebp
+	cmpl	%ebx,%ecx
+	leal	1(%ecx),%ecx
+	je	.L015sqrlast
+	movl	%edx,%ebx
+	shrl	$1,%edx
+	andl	$1,%ebx
+.align	16
+.L016sqradd:
+	movl	(%esi,%ecx,4),%eax
+	movl	%edx,%ebp
+	mull	%edi
+	addl	%ebp,%eax
+	leal	(%eax,%eax,1),%ebp
+	adcl	$0,%edx
+	shrl	$31,%eax
+	addl	32(%esp,%ecx,4),%ebp
+	leal	1(%ecx),%ecx
+	adcl	$0,%eax
+	addl	%ebx,%ebp
+	adcl	$0,%eax
+	cmpl	(%esp),%ecx
+	movl	%ebp,28(%esp,%ecx,4)
+	movl	%eax,%ebx
+	jle	.L016sqradd
+	movl	%edx,%ebp
+	addl	%edx,%edx
+	shrl	$31,%ebp
+	addl	%ebx,%edx
+	adcl	$0,%ebp
+.L015sqrlast:
+	movl	20(%esp),%edi
+	movl	16(%esp),%esi
+	imull	32(%esp),%edi
+	addl	32(%esp,%ecx,4),%edx
+	movl	(%esi),%eax
+	adcl	$0,%ebp
+	movl	%edx,32(%esp,%ecx,4)
+	movl	%ebp,36(%esp,%ecx,4)
+	mull	%edi
+	addl	32(%esp),%eax
+	leal	-1(%ecx),%ebx
+	adcl	$0,%edx
+	movl	$1,%ecx
+	movl	4(%esi),%eax
+	jmp	.L0143rdmadd
+.align	16
+.L008common_tail:
+	movl	16(%esp),%ebp
+	movl	4(%esp),%edi
+	leal	32(%esp),%esi
+	movl	(%esi),%eax
+	movl	%ebx,%ecx
+	xorl	%edx,%edx
+.align	16
+.L017sub:
+	sbbl	(%ebp,%edx,4),%eax
+	movl	%eax,(%edi,%edx,4)
+	decl	%ecx
+	movl	4(%esi,%edx,4),%eax
+	leal	1(%edx),%edx
+	jge	.L017sub
+	sbbl	$0,%eax
+	movl	$-1,%edx
+	xorl	%eax,%edx
+	jmp	.L018copy
+.align	16
+.L018copy:
+	movl	32(%esp,%ebx,4),%esi
+	movl	(%edi,%ebx,4),%ebp
+	movl	%ecx,32(%esp,%ebx,4)
+	andl	%eax,%esi
+	andl	%edx,%ebp
+	orl	%esi,%ebp
+	movl	%ebp,(%edi,%ebx,4)
+	decl	%ebx
+	jge	.L018copy
+	movl	24(%esp),%esp
+	movl	$1,%eax
+.L000just_leave:
+	popl	%edi
+	popl	%esi
+	popl	%ebx
+	popl	%ebp
+	ret
+.size	bn_mul_mont,.-.L_bn_mul_mont_begin
+.byte	77,111,110,116,103,111,109,101,114,121,32,77,117,108,116,105
+.byte	112,108,105,99,97,116,105,111,110,32,102,111,114,32,120,56
+.byte	54,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121
+.byte	32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46
+.byte	111,114,103,62,0
+#endif
+.section	.note.GNU-stack,"",@progbits
--- /dev/null
+++ b/third_party/boringssl/linux-x86/crypto/test/trampoline-x86.S
@@ -1,0 +1,206 @@
+// This file is generated from a similarly-named Perl script in the BoringSSL
+// source tree. Do not edit by hand.
+
+#if defined(__i386__)
+#if defined(BORINGSSL_PREFIX)
+#include <boringssl_prefix_symbols_asm.h>
+#endif
+.text
+.globl	abi_test_trampoline
+.hidden	abi_test_trampoline
+.type	abi_test_trampoline,@function
+.align	16
+abi_test_trampoline:
+.L_abi_test_trampoline_begin:
+	pushl	%ebp
+	pushl	%ebx
+	pushl	%esi
+	pushl	%edi
+	movl	24(%esp),%ecx
+	movl	(%ecx),%esi
+	movl	4(%ecx),%edi
+	movl	8(%ecx),%ebx
+	movl	12(%ecx),%ebp
+	subl	$44,%esp
+	movl	72(%esp),%eax
+	xorl	%ecx,%ecx
+.L000loop:
+	cmpl	76(%esp),%ecx
+	jae	.L001loop_done
+	movl	(%eax,%ecx,4),%edx
+	movl	%edx,(%esp,%ecx,4)
+	addl	$1,%ecx
+	jmp	.L000loop
+.L001loop_done:
+	call	*64(%esp)
+	addl	$44,%esp
+	movl	24(%esp),%ecx
+	movl	%esi,(%ecx)
+	movl	%edi,4(%ecx)
+	movl	%ebx,8(%ecx)
+	movl	%ebp,12(%ecx)
+	popl	%edi
+	popl	%esi
+	popl	%ebx
+	popl	%ebp
+	ret
+.size	abi_test_trampoline,.-.L_abi_test_trampoline_begin
+.globl	abi_test_get_and_clear_direction_flag
+.hidden	abi_test_get_and_clear_direction_flag
+.type	abi_test_get_and_clear_direction_flag,@function
+.align	16
+abi_test_get_and_clear_direction_flag:
+.L_abi_test_get_and_clear_direction_flag_begin:
+	pushfl
+	popl	%eax
+	andl	$1024,%eax
+	shrl	$10,%eax
+	cld
+	ret
+.size	abi_test_get_and_clear_direction_flag,.-.L_abi_test_get_and_clear_direction_flag_begin
+.globl	abi_test_set_direction_flag
+.hidden	abi_test_set_direction_flag
+.type	abi_test_set_direction_flag,@function
+.align	16
+abi_test_set_direction_flag:
+.L_abi_test_set_direction_flag_begin:
+	std
+	ret
+.size	abi_test_set_direction_flag,.-.L_abi_test_set_direction_flag_begin
+.globl	abi_test_clobber_eax
+.hidden	abi_test_clobber_eax
+.type	abi_test_clobber_eax,@function
+.align	16
+abi_test_clobber_eax:
+.L_abi_test_clobber_eax_begin:
+	xorl	%eax,%eax
+	ret
+.size	abi_test_clobber_eax,.-.L_abi_test_clobber_eax_begin
+.globl	abi_test_clobber_ebx
+.hidden	abi_test_clobber_ebx
+.type	abi_test_clobber_ebx,@function
+.align	16
+abi_test_clobber_ebx:
+.L_abi_test_clobber_ebx_begin:
+	xorl	%ebx,%ebx
+	ret
+.size	abi_test_clobber_ebx,.-.L_abi_test_clobber_ebx_begin
+.globl	abi_test_clobber_ecx
+.hidden	abi_test_clobber_ecx
+.type	abi_test_clobber_ecx,@function
+.align	16
+abi_test_clobber_ecx:
+.L_abi_test_clobber_ecx_begin:
+	xorl	%ecx,%ecx
+	ret
+.size	abi_test_clobber_ecx,.-.L_abi_test_clobber_ecx_begin
+.globl	abi_test_clobber_edx
+.hidden	abi_test_clobber_edx
+.type	abi_test_clobber_edx,@function
+.align	16
+abi_test_clobber_edx:
+.L_abi_test_clobber_edx_begin:
+	xorl	%edx,%edx
+	ret
+.size	abi_test_clobber_edx,.-.L_abi_test_clobber_edx_begin
+.globl	abi_test_clobber_edi
+.hidden	abi_test_clobber_edi
+.type	abi_test_clobber_edi,@function
+.align	16
+abi_test_clobber_edi:
+.L_abi_test_clobber_edi_begin:
+	xorl	%edi,%edi
+	ret
+.size	abi_test_clobber_edi,.-.L_abi_test_clobber_edi_begin
+.globl	abi_test_clobber_esi
+.hidden	abi_test_clobber_esi
+.type	abi_test_clobber_esi,@function
+.align	16
+abi_test_clobber_esi:
+.L_abi_test_clobber_esi_begin:
+	xorl	%esi,%esi
+	ret
+.size	abi_test_clobber_esi,.-.L_abi_test_clobber_esi_begin
+.globl	abi_test_clobber_ebp
+.hidden	abi_test_clobber_ebp
+.type	abi_test_clobber_ebp,@function
+.align	16
+abi_test_clobber_ebp:
+.L_abi_test_clobber_ebp_begin:
+	xorl	%ebp,%ebp
+	ret
+.size	abi_test_clobber_ebp,.-.L_abi_test_clobber_ebp_begin
+.globl	abi_test_clobber_xmm0
+.hidden	abi_test_clobber_xmm0
+.type	abi_test_clobber_xmm0,@function
+.align	16
+abi_test_clobber_xmm0:
+.L_abi_test_clobber_xmm0_begin:
+	pxor	%xmm0,%xmm0
+	ret
+.size	abi_test_clobber_xmm0,.-.L_abi_test_clobber_xmm0_begin
+.globl	abi_test_clobber_xmm1
+.hidden	abi_test_clobber_xmm1
+.type	abi_test_clobber_xmm1,@function
+.align	16
+abi_test_clobber_xmm1:
+.L_abi_test_clobber_xmm1_begin:
+	pxor	%xmm1,%xmm1
+	ret
+.size	abi_test_clobber_xmm1,.-.L_abi_test_clobber_xmm1_begin
+.globl	abi_test_clobber_xmm2
+.hidden	abi_test_clobber_xmm2
+.type	abi_test_clobber_xmm2,@function
+.align	16
+abi_test_clobber_xmm2:
+.L_abi_test_clobber_xmm2_begin:
+	pxor	%xmm2,%xmm2
+	ret
+.size	abi_test_clobber_xmm2,.-.L_abi_test_clobber_xmm2_begin
+.globl	abi_test_clobber_xmm3
+.hidden	abi_test_clobber_xmm3
+.type	abi_test_clobber_xmm3,@function
+.align	16
+abi_test_clobber_xmm3:
+.L_abi_test_clobber_xmm3_begin:
+	pxor	%xmm3,%xmm3
+	ret
+.size	abi_test_clobber_xmm3,.-.L_abi_test_clobber_xmm3_begin
+.globl	abi_test_clobber_xmm4
+.hidden	abi_test_clobber_xmm4
+.type	abi_test_clobber_xmm4,@function
+.align	16
+abi_test_clobber_xmm4:
+.L_abi_test_clobber_xmm4_begin:
+	pxor	%xmm4,%xmm4
+	ret
+.size	abi_test_clobber_xmm4,.-.L_abi_test_clobber_xmm4_begin
+.globl	abi_test_clobber_xmm5
+.hidden	abi_test_clobber_xmm5
+.type	abi_test_clobber_xmm5,@function
+.align	16
+abi_test_clobber_xmm5:
+.L_abi_test_clobber_xmm5_begin:
+	pxor	%xmm5,%xmm5
+	ret
+.size	abi_test_clobber_xmm5,.-.L_abi_test_clobber_xmm5_begin
+.globl	abi_test_clobber_xmm6
+.hidden	abi_test_clobber_xmm6
+.type	abi_test_clobber_xmm6,@function
+.align	16
+abi_test_clobber_xmm6:
+.L_abi_test_clobber_xmm6_begin:
+	pxor	%xmm6,%xmm6
+	ret
+.size	abi_test_clobber_xmm6,.-.L_abi_test_clobber_xmm6_begin
+.globl	abi_test_clobber_xmm7
+.hidden	abi_test_clobber_xmm7
+.type	abi_test_clobber_xmm7,@function
+.align	16
+abi_test_clobber_xmm7:
+.L_abi_test_clobber_xmm7_begin:
+	pxor	%xmm7,%xmm7
+	ret
+.size	abi_test_clobber_xmm7,.-.L_abi_test_clobber_xmm7_begin
+#endif
+.section	.note.GNU-stack,"",@progbits
--- /dev/null
+++ b/third_party/boringssl/linux-x86_64/crypto/chacha/chacha-x86_64.S
@@ -1,0 +1,1633 @@
+// This file is generated from a similarly-named Perl script in the BoringSSL
+// source tree. Do not edit by hand.
+
+#if defined(__has_feature)
+#if __has_feature(memory_sanitizer) && !defined(OPENSSL_NO_ASM)
+#define OPENSSL_NO_ASM
+#endif
+#endif
+
+#if defined(__x86_64__) && !defined(OPENSSL_NO_ASM)
+#if defined(BORINGSSL_PREFIX)
+#include <boringssl_prefix_symbols_asm.h>
+#endif
+.text	
+
+.extern	OPENSSL_ia32cap_P
+.hidden OPENSSL_ia32cap_P
+
+.align	64
+.Lzero:
+.long	0,0,0,0
+.Lone:
+.long	1,0,0,0
+.Linc:
+.long	0,1,2,3
+.Lfour:
+.long	4,4,4,4
+.Lincy:
+.long	0,2,4,6,1,3,5,7
+.Leight:
+.long	8,8,8,8,8,8,8,8
+.Lrot16:
+.byte	0x2,0x3,0x0,0x1, 0x6,0x7,0x4,0x5, 0xa,0xb,0x8,0x9, 0xe,0xf,0xc,0xd
+.Lrot24:
+.byte	0x3,0x0,0x1,0x2, 0x7,0x4,0x5,0x6, 0xb,0x8,0x9,0xa, 0xf,0xc,0xd,0xe
+.Lsigma:
+.byte	101,120,112,97,110,100,32,51,50,45,98,121,116,101,32,107,0
+.align	64
+.Lzeroz:
+.long	0,0,0,0, 1,0,0,0, 2,0,0,0, 3,0,0,0
+.Lfourz:
+.long	4,0,0,0, 4,0,0,0, 4,0,0,0, 4,0,0,0
+.Lincz:
+.long	0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15
+.Lsixteen:
+.long	16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16
+.byte	67,104,97,67,104,97,50,48,32,102,111,114,32,120,56,54,95,54,52,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0
+.globl	ChaCha20_ctr32
+.hidden ChaCha20_ctr32
+.type	ChaCha20_ctr32,@function
+.align	64
+ChaCha20_ctr32:
+.cfi_startproc	
+	cmpq	$0,%rdx
+	je	.Lno_data
+	movq	OPENSSL_ia32cap_P+4(%rip),%r10
+	testl	$512,%r10d
+	jnz	.LChaCha20_ssse3
+
+	pushq	%rbx
+.cfi_adjust_cfa_offset	8
+.cfi_offset	rbx,-16
+	pushq	%rbp
+.cfi_adjust_cfa_offset	8
+.cfi_offset	rbp,-24
+	pushq	%r12
+.cfi_adjust_cfa_offset	8
+.cfi_offset	r12,-32
+	pushq	%r13
+.cfi_adjust_cfa_offset	8
+.cfi_offset	r13,-40
+	pushq	%r14
+.cfi_adjust_cfa_offset	8
+.cfi_offset	r14,-48
+	pushq	%r15
+.cfi_adjust_cfa_offset	8
+.cfi_offset	r15,-56
+	subq	$64+24,%rsp
+.cfi_adjust_cfa_offset	88
+.Lctr32_body:
+
+
+	movdqu	(%rcx),%xmm1
+	movdqu	16(%rcx),%xmm2
+	movdqu	(%r8),%xmm3
+	movdqa	.Lone(%rip),%xmm4
+
+
+	movdqa	%xmm1,16(%rsp)
+	movdqa	%xmm2,32(%rsp)
+	movdqa	%xmm3,48(%rsp)
+	movq	%rdx,%rbp
+	jmp	.Loop_outer
+
+.align	32
+.Loop_outer:
+	movl	$0x61707865,%eax
+	movl	$0x3320646e,%ebx
+	movl	$0x79622d32,%ecx
+	movl	$0x6b206574,%edx
+	movl	16(%rsp),%r8d
+	movl	20(%rsp),%r9d
+	movl	24(%rsp),%r10d
+	movl	28(%rsp),%r11d
+	movd	%xmm3,%r12d
+	movl	52(%rsp),%r13d
+	movl	56(%rsp),%r14d
+	movl	60(%rsp),%r15d
+
+	movq	%rbp,64+0(%rsp)
+	movl	$10,%ebp
+	movq	%rsi,64+8(%rsp)
+.byte	102,72,15,126,214
+	movq	%rdi,64+16(%rsp)
+	movq	%rsi,%rdi
+	shrq	$32,%rdi
+	jmp	.Loop
+
+.align	32
+.Loop:
+	addl	%r8d,%eax
+	xorl	%eax,%r12d
+	roll	$16,%r12d
+	addl	%r9d,%ebx
+	xorl	%ebx,%r13d
+	roll	$16,%r13d
+	addl	%r12d,%esi
+	xorl	%esi,%r8d
+	roll	$12,%r8d
+	addl	%r13d,%edi
+	xorl	%edi,%r9d
+	roll	$12,%r9d
+	addl	%r8d,%eax
+	xorl	%eax,%r12d
+	roll	$8,%r12d
+	addl	%r9d,%ebx
+	xorl	%ebx,%r13d
+	roll	$8,%r13d
+	addl	%r12d,%esi
+	xorl	%esi,%r8d
+	roll	$7,%r8d
+	addl	%r13d,%edi
+	xorl	%edi,%r9d
+	roll	$7,%r9d
+	movl	%esi,32(%rsp)
+	movl	%edi,36(%rsp)
+	movl	40(%rsp),%esi
+	movl	44(%rsp),%edi
+	addl	%r10d,%ecx
+	xorl	%ecx,%r14d
+	roll	$16,%r14d
+	addl	%r11d,%edx
+	xorl	%edx,%r15d
+	roll	$16,%r15d
+	addl	%r14d,%esi
+	xorl	%esi,%r10d
+	roll	$12,%r10d
+	addl	%r15d,%edi
+	xorl	%edi,%r11d
+	roll	$12,%r11d
+	addl	%r10d,%ecx
+	xorl	%ecx,%r14d
+	roll	$8,%r14d
+	addl	%r11d,%edx
+	xorl	%edx,%r15d
+	roll	$8,%r15d
+	addl	%r14d,%esi
+	xorl	%esi,%r10d
+	roll	$7,%r10d
+	addl	%r15d,%edi
+	xorl	%edi,%r11d
+	roll	$7,%r11d
+	addl	%r9d,%eax
+	xorl	%eax,%r15d
+	roll	$16,%r15d
+	addl	%r10d,%ebx
+	xorl	%ebx,%r12d
+	roll	$16,%r12d
+	addl	%r15d,%esi
+	xorl	%esi,%r9d
+	roll	$12,%r9d
+	addl	%r12d,%edi
+	xorl	%edi,%r10d
+	roll	$12,%r10d
+	addl	%r9d,%eax
+	xorl	%eax,%r15d
+	roll	$8,%r15d
+	addl	%r10d,%ebx
+	xorl	%ebx,%r12d
+	roll	$8,%r12d
+	addl	%r15d,%esi
+	xorl	%esi,%r9d
+	roll	$7,%r9d
+	addl	%r12d,%edi
+	xorl	%edi,%r10d
+	roll	$7,%r10d
+	movl	%esi,40(%rsp)
+	movl	%edi,44(%rsp)
+	movl	32(%rsp),%esi
+	movl	36(%rsp),%edi
+	addl	%r11d,%ecx
+	xorl	%ecx,%r13d
+	roll	$16,%r13d
+	addl	%r8d,%edx
+	xorl	%edx,%r14d
+	roll	$16,%r14d
+	addl	%r13d,%esi
+	xorl	%esi,%r11d
+	roll	$12,%r11d
+	addl	%r14d,%edi
+	xorl	%edi,%r8d
+	roll	$12,%r8d
+	addl	%r11d,%ecx
+	xorl	%ecx,%r13d
+	roll	$8,%r13d
+	addl	%r8d,%edx
+	xorl	%edx,%r14d
+	roll	$8,%r14d
+	addl	%r13d,%esi
+	xorl	%esi,%r11d
+	roll	$7,%r11d
+	addl	%r14d,%edi
+	xorl	%edi,%r8d
+	roll	$7,%r8d
+	decl	%ebp
+	jnz	.Loop
+	movl	%edi,36(%rsp)
+	movl	%esi,32(%rsp)
+	movq	64(%rsp),%rbp
+	movdqa	%xmm2,%xmm1
+	movq	64+8(%rsp),%rsi
+	paddd	%xmm4,%xmm3
+	movq	64+16(%rsp),%rdi
+
+	addl	$0x61707865,%eax
+	addl	$0x3320646e,%ebx
+	addl	$0x79622d32,%ecx
+	addl	$0x6b206574,%edx
+	addl	16(%rsp),%r8d
+	addl	20(%rsp),%r9d
+	addl	24(%rsp),%r10d
+	addl	28(%rsp),%r11d
+	addl	48(%rsp),%r12d
+	addl	52(%rsp),%r13d
+	addl	56(%rsp),%r14d
+	addl	60(%rsp),%r15d
+	paddd	32(%rsp),%xmm1
+
+	cmpq	$64,%rbp
+	jb	.Ltail
+
+	xorl	0(%rsi),%eax
+	xorl	4(%rsi),%ebx
+	xorl	8(%rsi),%ecx
+	xorl	12(%rsi),%edx
+	xorl	16(%rsi),%r8d
+	xorl	20(%rsi),%r9d
+	xorl	24(%rsi),%r10d
+	xorl	28(%rsi),%r11d
+	movdqu	32(%rsi),%xmm0
+	xorl	48(%rsi),%r12d
+	xorl	52(%rsi),%r13d
+	xorl	56(%rsi),%r14d
+	xorl	60(%rsi),%r15d
+	leaq	64(%rsi),%rsi
+	pxor	%xmm1,%xmm0
+
+	movdqa	%xmm2,32(%rsp)
+	movd	%xmm3,48(%rsp)
+
+	movl	%eax,0(%rdi)
+	movl	%ebx,4(%rdi)
+	movl	%ecx,8(%rdi)
+	movl	%edx,12(%rdi)
+	movl	%r8d,16(%rdi)
+	movl	%r9d,20(%rdi)
+	movl	%r10d,24(%rdi)
+	movl	%r11d,28(%rdi)
+	movdqu	%xmm0,32(%rdi)
+	movl	%r12d,48(%rdi)
+	movl	%r13d,52(%rdi)
+	movl	%r14d,56(%rdi)
+	movl	%r15d,60(%rdi)
+	leaq	64(%rdi),%rdi
+
+	subq	$64,%rbp
+	jnz	.Loop_outer
+
+	jmp	.Ldone
+
+.align	16
+.Ltail:
+	movl	%eax,0(%rsp)
+	movl	%ebx,4(%rsp)
+	xorq	%rbx,%rbx
+	movl	%ecx,8(%rsp)
+	movl	%edx,12(%rsp)
+	movl	%r8d,16(%rsp)
+	movl	%r9d,20(%rsp)
+	movl	%r10d,24(%rsp)
+	movl	%r11d,28(%rsp)
+	movdqa	%xmm1,32(%rsp)
+	movl	%r12d,48(%rsp)
+	movl	%r13d,52(%rsp)
+	movl	%r14d,56(%rsp)
+	movl	%r15d,60(%rsp)
+
+.Loop_tail:
+	movzbl	(%rsi,%rbx,1),%eax
+	movzbl	(%rsp,%rbx,1),%edx
+	leaq	1(%rbx),%rbx
+	xorl	%edx,%eax
+	movb	%al,-1(%rdi,%rbx,1)
+	decq	%rbp
+	jnz	.Loop_tail
+
+.Ldone:
+	leaq	64+24+48(%rsp),%rsi
+	movq	-48(%rsi),%r15
+.cfi_restore	r15
+	movq	-40(%rsi),%r14
+.cfi_restore	r14
+	movq	-32(%rsi),%r13
+.cfi_restore	r13
+	movq	-24(%rsi),%r12
+.cfi_restore	r12
+	movq	-16(%rsi),%rbp
+.cfi_restore	rbp
+	movq	-8(%rsi),%rbx
+.cfi_restore	rbx
+	leaq	(%rsi),%rsp
+.cfi_adjust_cfa_offset	-136
+.Lno_data:
+	.byte	0xf3,0xc3
+.cfi_endproc	
+.size	ChaCha20_ctr32,.-ChaCha20_ctr32
+.type	ChaCha20_ssse3,@function
+.align	32
+ChaCha20_ssse3:
+.LChaCha20_ssse3:
+.cfi_startproc	
+	movq	%rsp,%r9
+.cfi_def_cfa_register	r9
+	cmpq	$128,%rdx
+	ja	.LChaCha20_4x
+
+.Ldo_sse3_after_all:
+	subq	$64+8,%rsp
+	movdqa	.Lsigma(%rip),%xmm0
+	movdqu	(%rcx),%xmm1
+	movdqu	16(%rcx),%xmm2
+	movdqu	(%r8),%xmm3
+	movdqa	.Lrot16(%rip),%xmm6
+	movdqa	.Lrot24(%rip),%xmm7
+
+	movdqa	%xmm0,0(%rsp)
+	movdqa	%xmm1,16(%rsp)
+	movdqa	%xmm2,32(%rsp)
+	movdqa	%xmm3,48(%rsp)
+	movq	$10,%r8
+	jmp	.Loop_ssse3
+
+.align	32
+.Loop_outer_ssse3:
+	movdqa	.Lone(%rip),%xmm3
+	movdqa	0(%rsp),%xmm0
+	movdqa	16(%rsp),%xmm1
+	movdqa	32(%rsp),%xmm2
+	paddd	48(%rsp),%xmm3
+	movq	$10,%r8
+	movdqa	%xmm3,48(%rsp)
+	jmp	.Loop_ssse3
+
+.align	32
+.Loop_ssse3:
+	paddd	%xmm1,%xmm0
+	pxor	%xmm0,%xmm3
+.byte	102,15,56,0,222
+	paddd	%xmm3,%xmm2
+	pxor	%xmm2,%xmm1
+	movdqa	%xmm1,%xmm4
+	psrld	$20,%xmm1
+	pslld	$12,%xmm4
+	por	%xmm4,%xmm1
+	paddd	%xmm1,%xmm0
+	pxor	%xmm0,%xmm3
+.byte	102,15,56,0,223
+	paddd	%xmm3,%xmm2
+	pxor	%xmm2,%xmm1
+	movdqa	%xmm1,%xmm4
+	psrld	$25,%xmm1
+	pslld	$7,%xmm4
+	por	%xmm4,%xmm1
+	pshufd	$78,%xmm2,%xmm2
+	pshufd	$57,%xmm1,%xmm1
+	pshufd	$147,%xmm3,%xmm3
+	nop
+	paddd	%xmm1,%xmm0
+	pxor	%xmm0,%xmm3
+.byte	102,15,56,0,222
+	paddd	%xmm3,%xmm2
+	pxor	%xmm2,%xmm1
+	movdqa	%xmm1,%xmm4
+	psrld	$20,%xmm1
+	pslld	$12,%xmm4
+	por	%xmm4,%xmm1
+	paddd	%xmm1,%xmm0
+	pxor	%xmm0,%xmm3
+.byte	102,15,56,0,223
+	paddd	%xmm3,%xmm2
+	pxor	%xmm2,%xmm1
+	movdqa	%xmm1,%xmm4
+	psrld	$25,%xmm1
+	pslld	$7,%xmm4
+	por	%xmm4,%xmm1
+	pshufd	$78,%xmm2,%xmm2
+	pshufd	$147,%xmm1,%xmm1
+	pshufd	$57,%xmm3,%xmm3
+	decq	%r8
+	jnz	.Loop_ssse3
+	paddd	0(%rsp),%xmm0
+	paddd	16(%rsp),%xmm1
+	paddd	32(%rsp),%xmm2
+	paddd	48(%rsp),%xmm3
+
+	cmpq	$64,%rdx
+	jb	.Ltail_ssse3
+
+	movdqu	0(%rsi),%xmm4
+	movdqu	16(%rsi),%xmm5
+	pxor	%xmm4,%xmm0
+	movdqu	32(%rsi),%xmm4
+	pxor	%xmm5,%xmm1
+	movdqu	48(%rsi),%xmm5
+	leaq	64(%rsi),%rsi
+	pxor	%xmm4,%xmm2
+	pxor	%xmm5,%xmm3
+
+	movdqu	%xmm0,0(%rdi)
+	movdqu	%xmm1,16(%rdi)
+	movdqu	%xmm2,32(%rdi)
+	movdqu	%xmm3,48(%rdi)
+	leaq	64(%rdi),%rdi
+
+	subq	$64,%rdx
+	jnz	.Loop_outer_ssse3
+
+	jmp	.Ldone_ssse3
+
+.align	16
+.Ltail_ssse3:
+	movdqa	%xmm0,0(%rsp)
+	movdqa	%xmm1,16(%rsp)
+	movdqa	%xmm2,32(%rsp)
+	movdqa	%xmm3,48(%rsp)
+	xorq	%r8,%r8
+
+.Loop_tail_ssse3:
+	movzbl	(%rsi,%r8,1),%eax
+	movzbl	(%rsp,%r8,1),%ecx
+	leaq	1(%r8),%r8
+	xorl	%ecx,%eax
+	movb	%al,-1(%rdi,%r8,1)
+	decq	%rdx
+	jnz	.Loop_tail_ssse3
+
+.Ldone_ssse3:
+	leaq	(%r9),%rsp
+.cfi_def_cfa_register	rsp
+.Lssse3_epilogue:
+	.byte	0xf3,0xc3
+.cfi_endproc	
+.size	ChaCha20_ssse3,.-ChaCha20_ssse3
+.type	ChaCha20_4x,@function
+.align	32
+ChaCha20_4x:
+.LChaCha20_4x:
+.cfi_startproc	
+	movq	%rsp,%r9
+.cfi_def_cfa_register	r9
+	movq	%r10,%r11
+	shrq	$32,%r10
+	testq	$32,%r10
+	jnz	.LChaCha20_8x
+	cmpq	$192,%rdx
+	ja	.Lproceed4x
+
+	andq	$71303168,%r11
+	cmpq	$4194304,%r11
+	je	.Ldo_sse3_after_all
+
+.Lproceed4x:
+	subq	$0x140+8,%rsp
+	movdqa	.Lsigma(%rip),%xmm11
+	movdqu	(%rcx),%xmm15
+	movdqu	16(%rcx),%xmm7
+	movdqu	(%r8),%xmm3
+	leaq	256(%rsp),%rcx
+	leaq	.Lrot16(%rip),%r10
+	leaq	.Lrot24(%rip),%r11
+
+	pshufd	$0x00,%xmm11,%xmm8
+	pshufd	$0x55,%xmm11,%xmm9
+	movdqa	%xmm8,64(%rsp)
+	pshufd	$0xaa,%xmm11,%xmm10
+	movdqa	%xmm9,80(%rsp)
+	pshufd	$0xff,%xmm11,%xmm11
+	movdqa	%xmm10,96(%rsp)
+	movdqa	%xmm11,112(%rsp)
+
+	pshufd	$0x00,%xmm15,%xmm12
+	pshufd	$0x55,%xmm15,%xmm13
+	movdqa	%xmm12,128-256(%rcx)
+	pshufd	$0xaa,%xmm15,%xmm14
+	movdqa	%xmm13,144-256(%rcx)
+	pshufd	$0xff,%xmm15,%xmm15
+	movdqa	%xmm14,160-256(%rcx)
+	movdqa	%xmm15,176-256(%rcx)
+
+	pshufd	$0x00,%xmm7,%xmm4
+	pshufd	$0x55,%xmm7,%xmm5
+	movdqa	%xmm4,192-256(%rcx)
+	pshufd	$0xaa,%xmm7,%xmm6
+	movdqa	%xmm5,208-256(%rcx)
+	pshufd	$0xff,%xmm7,%xmm7
+	movdqa	%xmm6,224-256(%rcx)
+	movdqa	%xmm7,240-256(%rcx)
+
+	pshufd	$0x00,%xmm3,%xmm0
+	pshufd	$0x55,%xmm3,%xmm1
+	paddd	.Linc(%rip),%xmm0
+	pshufd	$0xaa,%xmm3,%xmm2
+	movdqa	%xmm1,272-256(%rcx)
+	pshufd	$0xff,%xmm3,%xmm3
+	movdqa	%xmm2,288-256(%rcx)
+	movdqa	%xmm3,304-256(%rcx)
+
+	jmp	.Loop_enter4x
+
+.align	32
+.Loop_outer4x:
+	movdqa	64(%rsp),%xmm8
+	movdqa	80(%rsp),%xmm9
+	movdqa	96(%rsp),%xmm10
+	movdqa	112(%rsp),%xmm11
+	movdqa	128-256(%rcx),%xmm12
+	movdqa	144-256(%rcx),%xmm13
+	movdqa	160-256(%rcx),%xmm14
+	movdqa	176-256(%rcx),%xmm15
+	movdqa	192-256(%rcx),%xmm4
+	movdqa	208-256(%rcx),%xmm5
+	movdqa	224-256(%rcx),%xmm6
+	movdqa	240-256(%rcx),%xmm7
+	movdqa	256-256(%rcx),%xmm0
+	movdqa	272-256(%rcx),%xmm1
+	movdqa	288-256(%rcx),%xmm2
+	movdqa	304-256(%rcx),%xmm3
+	paddd	.Lfour(%rip),%xmm0
+
+.Loop_enter4x:
+	movdqa	%xmm6,32(%rsp)
+	movdqa	%xmm7,48(%rsp)
+	movdqa	(%r10),%xmm7
+	movl	$10,%eax
+	movdqa	%xmm0,256-256(%rcx)
+	jmp	.Loop4x
+
+.align	32
+.Loop4x:
+	paddd	%xmm12,%xmm8
+	paddd	%xmm13,%xmm9
+	pxor	%xmm8,%xmm0
+	pxor	%xmm9,%xmm1
+.byte	102,15,56,0,199
+.byte	102,15,56,0,207
+	paddd	%xmm0,%xmm4
+	paddd	%xmm1,%xmm5
+	pxor	%xmm4,%xmm12
+	pxor	%xmm5,%xmm13
+	movdqa	%xmm12,%xmm6
+	pslld	$12,%xmm12
+	psrld	$20,%xmm6
+	movdqa	%xmm13,%xmm7
+	pslld	$12,%xmm13
+	por	%xmm6,%xmm12
+	psrld	$20,%xmm7
+	movdqa	(%r11),%xmm6
+	por	%xmm7,%xmm13
+	paddd	%xmm12,%xmm8
+	paddd	%xmm13,%xmm9
+	pxor	%xmm8,%xmm0
+	pxor	%xmm9,%xmm1
+.byte	102,15,56,0,198
+.byte	102,15,56,0,206
+	paddd	%xmm0,%xmm4
+	paddd	%xmm1,%xmm5
+	pxor	%xmm4,%xmm12
+	pxor	%xmm5,%xmm13
+	movdqa	%xmm12,%xmm7
+	pslld	$7,%xmm12
+	psrld	$25,%xmm7
+	movdqa	%xmm13,%xmm6
+	pslld	$7,%xmm13
+	por	%xmm7,%xmm12
+	psrld	$25,%xmm6
+	movdqa	(%r10),%xmm7
+	por	%xmm6,%xmm13
+	movdqa	%xmm4,0(%rsp)
+	movdqa	%xmm5,16(%rsp)
+	movdqa	32(%rsp),%xmm4
+	movdqa	48(%rsp),%xmm5
+	paddd	%xmm14,%xmm10
+	paddd	%xmm15,%xmm11
+	pxor	%xmm10,%xmm2
+	pxor	%xmm11,%xmm3
+.byte	102,15,56,0,215
+.byte	102,15,56,0,223
+	paddd	%xmm2,%xmm4
+	paddd	%xmm3,%xmm5
+	pxor	%xmm4,%xmm14
+	pxor	%xmm5,%xmm15
+	movdqa	%xmm14,%xmm6
+	pslld	$12,%xmm14
+	psrld	$20,%xmm6
+	movdqa	%xmm15,%xmm7
+	pslld	$12,%xmm15
+	por	%xmm6,%xmm14
+	psrld	$20,%xmm7
+	movdqa	(%r11),%xmm6
+	por	%xmm7,%xmm15
+	paddd	%xmm14,%xmm10
+	paddd	%xmm15,%xmm11
+	pxor	%xmm10,%xmm2
+	pxor	%xmm11,%xmm3
+.byte	102,15,56,0,214
+.byte	102,15,56,0,222
+	paddd	%xmm2,%xmm4
+	paddd	%xmm3,%xmm5
+	pxor	%xmm4,%xmm14
+	pxor	%xmm5,%xmm15
+	movdqa	%xmm14,%xmm7
+	pslld	$7,%xmm14
+	psrld	$25,%xmm7
+	movdqa	%xmm15,%xmm6
+	pslld	$7,%xmm15
+	por	%xmm7,%xmm14
+	psrld	$25,%xmm6
+	movdqa	(%r10),%xmm7
+	por	%xmm6,%xmm15
+	paddd	%xmm13,%xmm8
+	paddd	%xmm14,%xmm9
+	pxor	%xmm8,%xmm3
+	pxor	%xmm9,%xmm0
+.byte	102,15,56,0,223
+.byte	102,15,56,0,199
+	paddd	%xmm3,%xmm4
+	paddd	%xmm0,%xmm5
+	pxor	%xmm4,%xmm13
+	pxor	%xmm5,%xmm14
+	movdqa	%xmm13,%xmm6
+	pslld	$12,%xmm13
+	psrld	$20,%xmm6
+	movdqa	%xmm14,%xmm7
+	pslld	$12,%xmm14
+	por	%xmm6,%xmm13
+	psrld	$20,%xmm7
+	movdqa	(%r11),%xmm6
+	por	%xmm7,%xmm14
+	paddd	%xmm13,%xmm8
+	paddd	%xmm14,%xmm9
+	pxor	%xmm8,%xmm3
+	pxor	%xmm9,%xmm0
+.byte	102,15,56,0,222
+.byte	102,15,56,0,198
+	paddd	%xmm3,%xmm4
+	paddd	%xmm0,%xmm5
+	pxor	%xmm4,%xmm13
+	pxor	%xmm5,%xmm14
+	movdqa	%xmm13,%xmm7
+	pslld	$7,%xmm13
+	psrld	$25,%xmm7
+	movdqa	%xmm14,%xmm6
+	pslld	$7,%xmm14
+	por	%xmm7,%xmm13
+	psrld	$25,%xmm6
+	movdqa	(%r10),%xmm7
+	por	%xmm6,%xmm14
+	movdqa	%xmm4,32(%rsp)
+	movdqa	%xmm5,48(%rsp)
+	movdqa	0(%rsp),%xmm4
+	movdqa	16(%rsp),%xmm5
+	paddd	%xmm15,%xmm10
+	paddd	%xmm12,%xmm11
+	pxor	%xmm10,%xmm1
+	pxor	%xmm11,%xmm2
+.byte	102,15,56,0,207
+.byte	102,15,56,0,215
+	paddd	%xmm1,%xmm4
+	paddd	%xmm2,%xmm5
+	pxor	%xmm4,%xmm15
+	pxor	%xmm5,%xmm12
+	movdqa	%xmm15,%xmm6
+	pslld	$12,%xmm15
+	psrld	$20,%xmm6
+	movdqa	%xmm12,%xmm7
+	pslld	$12,%xmm12
+	por	%xmm6,%xmm15
+	psrld	$20,%xmm7
+	movdqa	(%r11),%xmm6
+	por	%xmm7,%xmm12
+	paddd	%xmm15,%xmm10
+	paddd	%xmm12,%xmm11
+	pxor	%xmm10,%xmm1
+	pxor	%xmm11,%xmm2
+.byte	102,15,56,0,206
+.byte	102,15,56,0,214
+	paddd	%xmm1,%xmm4
+	paddd	%xmm2,%xmm5
+	pxor	%xmm4,%xmm15
+	pxor	%xmm5,%xmm12
+	movdqa	%xmm15,%xmm7
+	pslld	$7,%xmm15
+	psrld	$25,%xmm7
+	movdqa	%xmm12,%xmm6
+	pslld	$7,%xmm12
+	por	%xmm7,%xmm15
+	psrld	$25,%xmm6
+	movdqa	(%r10),%xmm7
+	por	%xmm6,%xmm12
+	decl	%eax
+	jnz	.Loop4x
+
+	paddd	64(%rsp),%xmm8
+	paddd	80(%rsp),%xmm9
+	paddd	96(%rsp),%xmm10
+	paddd	112(%rsp),%xmm11
+
+	movdqa	%xmm8,%xmm6
+	punpckldq	%xmm9,%xmm8
+	movdqa	%xmm10,%xmm7
+	punpckldq	%xmm11,%xmm10
+	punpckhdq	%xmm9,%xmm6
+	punpckhdq	%xmm11,%xmm7
+	movdqa	%xmm8,%xmm9
+	punpcklqdq	%xmm10,%xmm8
+	movdqa	%xmm6,%xmm11
+	punpcklqdq	%xmm7,%xmm6
+	punpckhqdq	%xmm10,%xmm9
+	punpckhqdq	%xmm7,%xmm11
+	paddd	128-256(%rcx),%xmm12
+	paddd	144-256(%rcx),%xmm13
+	paddd	160-256(%rcx),%xmm14
+	paddd	176-256(%rcx),%xmm15
+
+	movdqa	%xmm8,0(%rsp)
+	movdqa	%xmm9,16(%rsp)
+	movdqa	32(%rsp),%xmm8
+	movdqa	48(%rsp),%xmm9
+
+	movdqa	%xmm12,%xmm10
+	punpckldq	%xmm13,%xmm12
+	movdqa	%xmm14,%xmm7
+	punpckldq	%xmm15,%xmm14
+	punpckhdq	%xmm13,%xmm10
+	punpckhdq	%xmm15,%xmm7
+	movdqa	%xmm12,%xmm13
+	punpcklqdq	%xmm14,%xmm12
+	movdqa	%xmm10,%xmm15
+	punpcklqdq	%xmm7,%xmm10
+	punpckhqdq	%xmm14,%xmm13
+	punpckhqdq	%xmm7,%xmm15
+	paddd	192-256(%rcx),%xmm4
+	paddd	208-256(%rcx),%xmm5
+	paddd	224-256(%rcx),%xmm8
+	paddd	240-256(%rcx),%xmm9
+
+	movdqa	%xmm6,32(%rsp)
+	movdqa	%xmm11,48(%rsp)
+
+	movdqa	%xmm4,%xmm14
+	punpckldq	%xmm5,%xmm4
+	movdqa	%xmm8,%xmm7
+	punpckldq	%xmm9,%xmm8
+	punpckhdq	%xmm5,%xmm14
+	punpckhdq	%xmm9,%xmm7
+	movdqa	%xmm4,%xmm5
+	punpcklqdq	%xmm8,%xmm4
+	movdqa	%xmm14,%xmm9
+	punpcklqdq	%xmm7,%xmm14
+	punpckhqdq	%xmm8,%xmm5
+	punpckhqdq	%xmm7,%xmm9
+	paddd	256-256(%rcx),%xmm0
+	paddd	272-256(%rcx),%xmm1
+	paddd	288-256(%rcx),%xmm2
+	paddd	304-256(%rcx),%xmm3
+
+	movdqa	%xmm0,%xmm8
+	punpckldq	%xmm1,%xmm0
+	movdqa	%xmm2,%xmm7
+	punpckldq	%xmm3,%xmm2
+	punpckhdq	%xmm1,%xmm8
+	punpckhdq	%xmm3,%xmm7
+	movdqa	%xmm0,%xmm1
+	punpcklqdq	%xmm2,%xmm0
+	movdqa	%xmm8,%xmm3
+	punpcklqdq	%xmm7,%xmm8
+	punpckhqdq	%xmm2,%xmm1
+	punpckhqdq	%xmm7,%xmm3
+	cmpq	$256,%rdx
+	jb	.Ltail4x
+
+	movdqu	0(%rsi),%xmm6
+	movdqu	16(%rsi),%xmm11
+	movdqu	32(%rsi),%xmm2
+	movdqu	48(%rsi),%xmm7
+	pxor	0(%rsp),%xmm6
+	pxor	%xmm12,%xmm11
+	pxor	%xmm4,%xmm2
+	pxor	%xmm0,%xmm7
+
+	movdqu	%xmm6,0(%rdi)
+	movdqu	64(%rsi),%xmm6
+	movdqu	%xmm11,16(%rdi)
+	movdqu	80(%rsi),%xmm11
+	movdqu	%xmm2,32(%rdi)
+	movdqu	96(%rsi),%xmm2
+	movdqu	%xmm7,48(%rdi)
+	movdqu	112(%rsi),%xmm7
+	leaq	128(%rsi),%rsi
+	pxor	16(%rsp),%xmm6
+	pxor	%xmm13,%xmm11
+	pxor	%xmm5,%xmm2
+	pxor	%xmm1,%xmm7
+
+	movdqu	%xmm6,64(%rdi)
+	movdqu	0(%rsi),%xmm6
+	movdqu	%xmm11,80(%rdi)
+	movdqu	16(%rsi),%xmm11
+	movdqu	%xmm2,96(%rdi)
+	movdqu	32(%rsi),%xmm2
+	movdqu	%xmm7,112(%rdi)
+	leaq	128(%rdi),%rdi
+	movdqu	48(%rsi),%xmm7
+	pxor	32(%rsp),%xmm6
+	pxor	%xmm10,%xmm11
+	pxor	%xmm14,%xmm2
+	pxor	%xmm8,%xmm7
+
+	movdqu	%xmm6,0(%rdi)
+	movdqu	64(%rsi),%xmm6
+	movdqu	%xmm11,16(%rdi)
+	movdqu	80(%rsi),%xmm11
+	movdqu	%xmm2,32(%rdi)
+	movdqu	96(%rsi),%xmm2
+	movdqu	%xmm7,48(%rdi)
+	movdqu	112(%rsi),%xmm7
+	leaq	128(%rsi),%rsi
+	pxor	48(%rsp),%xmm6
+	pxor	%xmm15,%xmm11
+	pxor	%xmm9,%xmm2
+	pxor	%xmm3,%xmm7
+	movdqu	%xmm6,64(%rdi)
+	movdqu	%xmm11,80(%rdi)
+	movdqu	%xmm2,96(%rdi)
+	movdqu	%xmm7,112(%rdi)
+	leaq	128(%rdi),%rdi
+
+	subq	$256,%rdx
+	jnz	.Loop_outer4x
+
+	jmp	.Ldone4x
+
+.Ltail4x:
+	cmpq	$192,%rdx
+	jae	.L192_or_more4x
+	cmpq	$128,%rdx
+	jae	.L128_or_more4x
+	cmpq	$64,%rdx
+	jae	.L64_or_more4x
+
+
+	xorq	%r10,%r10
+
+	movdqa	%xmm12,16(%rsp)
+	movdqa	%xmm4,32(%rsp)
+	movdqa	%xmm0,48(%rsp)
+	jmp	.Loop_tail4x
+
+.align	32
+.L64_or_more4x:
+	movdqu	0(%rsi),%xmm6
+	movdqu	16(%rsi),%xmm11
+	movdqu	32(%rsi),%xmm2
+	movdqu	48(%rsi),%xmm7
+	pxor	0(%rsp),%xmm6
+	pxor	%xmm12,%xmm11
+	pxor	%xmm4,%xmm2
+	pxor	%xmm0,%xmm7
+	movdqu	%xmm6,0(%rdi)
+	movdqu	%xmm11,16(%rdi)
+	movdqu	%xmm2,32(%rdi)
+	movdqu	%xmm7,48(%rdi)
+	je	.Ldone4x
+
+	movdqa	16(%rsp),%xmm6
+	leaq	64(%rsi),%rsi
+	xorq	%r10,%r10
+	movdqa	%xmm6,0(%rsp)
+	movdqa	%xmm13,16(%rsp)
+	leaq	64(%rdi),%rdi
+	movdqa	%xmm5,32(%rsp)
+	subq	$64,%rdx
+	movdqa	%xmm1,48(%rsp)
+	jmp	.Loop_tail4x
+
+.align	32
+.L128_or_more4x:
+	movdqu	0(%rsi),%xmm6
+	movdqu	16(%rsi),%xmm11
+	movdqu	32(%rsi),%xmm2
+	movdqu	48(%rsi),%xmm7
+	pxor	0(%rsp),%xmm6
+	pxor	%xmm12,%xmm11
+	pxor	%xmm4,%xmm2
+	pxor	%xmm0,%xmm7
+
+	movdqu	%xmm6,0(%rdi)
+	movdqu	64(%rsi),%xmm6
+	movdqu	%xmm11,16(%rdi)
+	movdqu	80(%rsi),%xmm11
+	movdqu	%xmm2,32(%rdi)
+	movdqu	96(%rsi),%xmm2
+	movdqu	%xmm7,48(%rdi)
+	movdqu	112(%rsi),%xmm7
+	pxor	16(%rsp),%xmm6
+	pxor	%xmm13,%xmm11
+	pxor	%xmm5,%xmm2
+	pxor	%xmm1,%xmm7
+	movdqu	%xmm6,64(%rdi)
+	movdqu	%xmm11,80(%rdi)
+	movdqu	%xmm2,96(%rdi)
+	movdqu	%xmm7,112(%rdi)
+	je	.Ldone4x
+
+	movdqa	32(%rsp),%xmm6
+	leaq	128(%rsi),%rsi
+	xorq	%r10,%r10
+	movdqa	%xmm6,0(%rsp)
+	movdqa	%xmm10,16(%rsp)
+	leaq	128(%rdi),%rdi
+	movdqa	%xmm14,32(%rsp)
+	subq	$128,%rdx
+	movdqa	%xmm8,48(%rsp)
+	jmp	.Loop_tail4x
+
+.align	32
+.L192_or_more4x:
+	movdqu	0(%rsi),%xmm6
+	movdqu	16(%rsi),%xmm11
+	movdqu	32(%rsi),%xmm2
+	movdqu	48(%rsi),%xmm7
+	pxor	0(%rsp),%xmm6
+	pxor	%xmm12,%xmm11
+	pxor	%xmm4,%xmm2
+	pxor	%xmm0,%xmm7
+
+	movdqu	%xmm6,0(%rdi)
+	movdqu	64(%rsi),%xmm6
+	movdqu	%xmm11,16(%rdi)
+	movdqu	80(%rsi),%xmm11
+	movdqu	%xmm2,32(%rdi)
+	movdqu	96(%rsi),%xmm2
+	movdqu	%xmm7,48(%rdi)
+	movdqu	112(%rsi),%xmm7
+	leaq	128(%rsi),%rsi
+	pxor	16(%rsp),%xmm6
+	pxor	%xmm13,%xmm11
+	pxor	%xmm5,%xmm2
+	pxor	%xmm1,%xmm7
+
+	movdqu	%xmm6,64(%rdi)
+	movdqu	0(%rsi),%xmm6
+	movdqu	%xmm11,80(%rdi)
+	movdqu	16(%rsi),%xmm11
+	movdqu	%xmm2,96(%rdi)
+	movdqu	32(%rsi),%xmm2
+	movdqu	%xmm7,112(%rdi)
+	leaq	128(%rdi),%rdi
+	movdqu	48(%rsi),%xmm7
+	pxor	32(%rsp),%xmm6
+	pxor	%xmm10,%xmm11
+	pxor	%xmm14,%xmm2
+	pxor	%xmm8,%xmm7
+	movdqu	%xmm6,0(%rdi)
+	movdqu	%xmm11,16(%rdi)
+	movdqu	%xmm2,32(%rdi)
+	movdqu	%xmm7,48(%rdi)
+	je	.Ldone4x
+
+	movdqa	48(%rsp),%xmm6
+	leaq	64(%rsi),%rsi
+	xorq	%r10,%r10
+	movdqa	%xmm6,0(%rsp)
+	movdqa	%xmm15,16(%rsp)
+	leaq	64(%rdi),%rdi
+	movdqa	%xmm9,32(%rsp)
+	subq	$192,%rdx
+	movdqa	%xmm3,48(%rsp)
+
+.Loop_tail4x:
+	movzbl	(%rsi,%r10,1),%eax
+	movzbl	(%rsp,%r10,1),%ecx
+	leaq	1(%r10),%r10
+	xorl	%ecx,%eax
+	movb	%al,-1(%rdi,%r10,1)
+	decq	%rdx
+	jnz	.Loop_tail4x
+
+.Ldone4x:
+	leaq	(%r9),%rsp
+.cfi_def_cfa_register	rsp
+.L4x_epilogue:
+	.byte	0xf3,0xc3
+.cfi_endproc	
+.size	ChaCha20_4x,.-ChaCha20_4x
+.type	ChaCha20_8x,@function
+.align	32
+ChaCha20_8x:
+.LChaCha20_8x:
+.cfi_startproc	
+	movq	%rsp,%r9
+.cfi_def_cfa_register	r9
+	subq	$0x280+8,%rsp
+	andq	$-32,%rsp
+	vzeroupper
+
+
+
+
+
+
+
+
+
+
+	vbroadcasti128	.Lsigma(%rip),%ymm11
+	vbroadcasti128	(%rcx),%ymm3
+	vbroadcasti128	16(%rcx),%ymm15
+	vbroadcasti128	(%r8),%ymm7
+	leaq	256(%rsp),%rcx
+	leaq	512(%rsp),%rax
+	leaq	.Lrot16(%rip),%r10
+	leaq	.Lrot24(%rip),%r11
+
+	vpshufd	$0x00,%ymm11,%ymm8
+	vpshufd	$0x55,%ymm11,%ymm9
+	vmovdqa	%ymm8,128-256(%rcx)
+	vpshufd	$0xaa,%ymm11,%ymm10
+	vmovdqa	%ymm9,160-256(%rcx)
+	vpshufd	$0xff,%ymm11,%ymm11
+	vmovdqa	%ymm10,192-256(%rcx)
+	vmovdqa	%ymm11,224-256(%rcx)
+
+	vpshufd	$0x00,%ymm3,%ymm0
+	vpshufd	$0x55,%ymm3,%ymm1
+	vmovdqa	%ymm0,256-256(%rcx)
+	vpshufd	$0xaa,%ymm3,%ymm2
+	vmovdqa	%ymm1,288-256(%rcx)
+	vpshufd	$0xff,%ymm3,%ymm3
+	vmovdqa	%ymm2,320-256(%rcx)
+	vmovdqa	%ymm3,352-256(%rcx)
+
+	vpshufd	$0x00,%ymm15,%ymm12
+	vpshufd	$0x55,%ymm15,%ymm13
+	vmovdqa	%ymm12,384-512(%rax)
+	vpshufd	$0xaa,%ymm15,%ymm14
+	vmovdqa	%ymm13,416-512(%rax)
+	vpshufd	$0xff,%ymm15,%ymm15
+	vmovdqa	%ymm14,448-512(%rax)
+	vmovdqa	%ymm15,480-512(%rax)
+
+	vpshufd	$0x00,%ymm7,%ymm4
+	vpshufd	$0x55,%ymm7,%ymm5
+	vpaddd	.Lincy(%rip),%ymm4,%ymm4
+	vpshufd	$0xaa,%ymm7,%ymm6
+	vmovdqa	%ymm5,544-512(%rax)
+	vpshufd	$0xff,%ymm7,%ymm7
+	vmovdqa	%ymm6,576-512(%rax)
+	vmovdqa	%ymm7,608-512(%rax)
+
+	jmp	.Loop_enter8x
+
+.align	32
+.Loop_outer8x:
+	vmovdqa	128-256(%rcx),%ymm8
+	vmovdqa	160-256(%rcx),%ymm9
+	vmovdqa	192-256(%rcx),%ymm10
+	vmovdqa	224-256(%rcx),%ymm11
+	vmovdqa	256-256(%rcx),%ymm0
+	vmovdqa	288-256(%rcx),%ymm1
+	vmovdqa	320-256(%rcx),%ymm2
+	vmovdqa	352-256(%rcx),%ymm3
+	vmovdqa	384-512(%rax),%ymm12
+	vmovdqa	416-512(%rax),%ymm13
+	vmovdqa	448-512(%rax),%ymm14
+	vmovdqa	480-512(%rax),%ymm15
+	vmovdqa	512-512(%rax),%ymm4
+	vmovdqa	544-512(%rax),%ymm5
+	vmovdqa	576-512(%rax),%ymm6
+	vmovdqa	608-512(%rax),%ymm7
+	vpaddd	.Leight(%rip),%ymm4,%ymm4
+
+.Loop_enter8x:
+	vmovdqa	%ymm14,64(%rsp)
+	vmovdqa	%ymm15,96(%rsp)
+	vbroadcasti128	(%r10),%ymm15
+	vmovdqa	%ymm4,512-512(%rax)
+	movl	$10,%eax
+	jmp	.Loop8x
+
+.align	32
+.Loop8x:
+	vpaddd	%ymm0,%ymm8,%ymm8
+	vpxor	%ymm4,%ymm8,%ymm4
+	vpshufb	%ymm15,%ymm4,%ymm4
+	vpaddd	%ymm1,%ymm9,%ymm9
+	vpxor	%ymm5,%ymm9,%ymm5
+	vpshufb	%ymm15,%ymm5,%ymm5
+	vpaddd	%ymm4,%ymm12,%ymm12
+	vpxor	%ymm0,%ymm12,%ymm0
+	vpslld	$12,%ymm0,%ymm14
+	vpsrld	$20,%ymm0,%ymm0
+	vpor	%ymm0,%ymm14,%ymm0
+	vbroadcasti128	(%r11),%ymm14
+	vpaddd	%ymm5,%ymm13,%ymm13
+	vpxor	%ymm1,%ymm13,%ymm1
+	vpslld	$12,%ymm1,%ymm15
+	vpsrld	$20,%ymm1,%ymm1
+	vpor	%ymm1,%ymm15,%ymm1
+	vpaddd	%ymm0,%ymm8,%ymm8
+	vpxor	%ymm4,%ymm8,%ymm4
+	vpshufb	%ymm14,%ymm4,%ymm4
+	vpaddd	%ymm1,%ymm9,%ymm9
+	vpxor	%ymm5,%ymm9,%ymm5
+	vpshufb	%ymm14,%ymm5,%ymm5
+	vpaddd	%ymm4,%ymm12,%ymm12
+	vpxor	%ymm0,%ymm12,%ymm0
+	vpslld	$7,%ymm0,%ymm15
+	vpsrld	$25,%ymm0,%ymm0
+	vpor	%ymm0,%ymm15,%ymm0
+	vbroadcasti128	(%r10),%ymm15
+	vpaddd	%ymm5,%ymm13,%ymm13
+	vpxor	%ymm1,%ymm13,%ymm1
+	vpslld	$7,%ymm1,%ymm14
+	vpsrld	$25,%ymm1,%ymm1
+	vpor	%ymm1,%ymm14,%ymm1
+	vmovdqa	%ymm12,0(%rsp)
+	vmovdqa	%ymm13,32(%rsp)
+	vmovdqa	64(%rsp),%ymm12
+	vmovdqa	96(%rsp),%ymm13
+	vpaddd	%ymm2,%ymm10,%ymm10
+	vpxor	%ymm6,%ymm10,%ymm6
+	vpshufb	%ymm15,%ymm6,%ymm6
+	vpaddd	%ymm3,%ymm11,%ymm11
+	vpxor	%ymm7,%ymm11,%ymm7
+	vpshufb	%ymm15,%ymm7,%ymm7
+	vpaddd	%ymm6,%ymm12,%ymm12
+	vpxor	%ymm2,%ymm12,%ymm2
+	vpslld	$12,%ymm2,%ymm14
+	vpsrld	$20,%ymm2,%ymm2
+	vpor	%ymm2,%ymm14,%ymm2
+	vbroadcasti128	(%r11),%ymm14
+	vpaddd	%ymm7,%ymm13,%ymm13
+	vpxor	%ymm3,%ymm13,%ymm3
+	vpslld	$12,%ymm3,%ymm15
+	vpsrld	$20,%ymm3,%ymm3
+	vpor	%ymm3,%ymm15,%ymm3
+	vpaddd	%ymm2,%ymm10,%ymm10
+	vpxor	%ymm6,%ymm10,%ymm6
+	vpshufb	%ymm14,%ymm6,%ymm6
+	vpaddd	%ymm3,%ymm11,%ymm11
+	vpxor	%ymm7,%ymm11,%ymm7
+	vpshufb	%ymm14,%ymm7,%ymm7
+	vpaddd	%ymm6,%ymm12,%ymm12
+	vpxor	%ymm2,%ymm12,%ymm2
+	vpslld	$7,%ymm2,%ymm15
+	vpsrld	$25,%ymm2,%ymm2
+	vpor	%ymm2,%ymm15,%ymm2
+	vbroadcasti128	(%r10),%ymm15
+	vpaddd	%ymm7,%ymm13,%ymm13
+	vpxor	%ymm3,%ymm13,%ymm3
+	vpslld	$7,%ymm3,%ymm14
+	vpsrld	$25,%ymm3,%ymm3
+	vpor	%ymm3,%ymm14,%ymm3
+	vpaddd	%ymm1,%ymm8,%ymm8
+	vpxor	%ymm7,%ymm8,%ymm7
+	vpshufb	%ymm15,%ymm7,%ymm7
+	vpaddd	%ymm2,%ymm9,%ymm9
+	vpxor	%ymm4,%ymm9,%ymm4
+	vpshufb	%ymm15,%ymm4,%ymm4
+	vpaddd	%ymm7,%ymm12,%ymm12
+	vpxor	%ymm1,%ymm12,%ymm1
+	vpslld	$12,%ymm1,%ymm14
+	vpsrld	$20,%ymm1,%ymm1
+	vpor	%ymm1,%ymm14,%ymm1
+	vbroadcasti128	(%r11),%ymm14
+	vpaddd	%ymm4,%ymm13,%ymm13
+	vpxor	%ymm2,%ymm13,%ymm2
+	vpslld	$12,%ymm2,%ymm15
+	vpsrld	$20,%ymm2,%ymm2
+	vpor	%ymm2,%ymm15,%ymm2
+	vpaddd	%ymm1,%ymm8,%ymm8
+	vpxor	%ymm7,%ymm8,%ymm7
+	vpshufb	%ymm14,%ymm7,%ymm7
+	vpaddd	%ymm2,%ymm9,%ymm9
+	vpxor	%ymm4,%ymm9,%ymm4
+	vpshufb	%ymm14,%ymm4,%ymm4
+	vpaddd	%ymm7,%ymm12,%ymm12
+	vpxor	%ymm1,%ymm12,%ymm1
+	vpslld	$7,%ymm1,%ymm15
+	vpsrld	$25,%ymm1,%ymm1
+	vpor	%ymm1,%ymm15,%ymm1
+	vbroadcasti128	(%r10),%ymm15
+	vpaddd	%ymm4,%ymm13,%ymm13
+	vpxor	%ymm2,%ymm13,%ymm2
+	vpslld	$7,%ymm2,%ymm14
+	vpsrld	$25,%ymm2,%ymm2
+	vpor	%ymm2,%ymm14,%ymm2
+	vmovdqa	%ymm12,64(%rsp)
+	vmovdqa	%ymm13,96(%rsp)
+	vmovdqa	0(%rsp),%ymm12
+	vmovdqa	32(%rsp),%ymm13
+	vpaddd	%ymm3,%ymm10,%ymm10
+	vpxor	%ymm5,%ymm10,%ymm5
+	vpshufb	%ymm15,%ymm5,%ymm5
+	vpaddd	%ymm0,%ymm11,%ymm11
+	vpxor	%ymm6,%ymm11,%ymm6
+	vpshufb	%ymm15,%ymm6,%ymm6
+	vpaddd	%ymm5,%ymm12,%ymm12
+	vpxor	%ymm3,%ymm12,%ymm3
+	vpslld	$12,%ymm3,%ymm14
+	vpsrld	$20,%ymm3,%ymm3
+	vpor	%ymm3,%ymm14,%ymm3
+	vbroadcasti128	(%r11),%ymm14
+	vpaddd	%ymm6,%ymm13,%ymm13
+	vpxor	%ymm0,%ymm13,%ymm0
+	vpslld	$12,%ymm0,%ymm15
+	vpsrld	$20,%ymm0,%ymm0
+	vpor	%ymm0,%ymm15,%ymm0
+	vpaddd	%ymm3,%ymm10,%ymm10
+	vpxor	%ymm5,%ymm10,%ymm5
+	vpshufb	%ymm14,%ymm5,%ymm5
+	vpaddd	%ymm0,%ymm11,%ymm11
+	vpxor	%ymm6,%ymm11,%ymm6
+	vpshufb	%ymm14,%ymm6,%ymm6
+	vpaddd	%ymm5,%ymm12,%ymm12
+	vpxor	%ymm3,%ymm12,%ymm3
+	vpslld	$7,%ymm3,%ymm15
+	vpsrld	$25,%ymm3,%ymm3
+	vpor	%ymm3,%ymm15,%ymm3
+	vbroadcasti128	(%r10),%ymm15
+	vpaddd	%ymm6,%ymm13,%ymm13
+	vpxor	%ymm0,%ymm13,%ymm0
+	vpslld	$7,%ymm0,%ymm14
+	vpsrld	$25,%ymm0,%ymm0
+	vpor	%ymm0,%ymm14,%ymm0
+	decl	%eax
+	jnz	.Loop8x
+
+	leaq	512(%rsp),%rax
+	vpaddd	128-256(%rcx),%ymm8,%ymm8
+	vpaddd	160-256(%rcx),%ymm9,%ymm9
+	vpaddd	192-256(%rcx),%ymm10,%ymm10
+	vpaddd	224-256(%rcx),%ymm11,%ymm11
+
+	vpunpckldq	%ymm9,%ymm8,%ymm14
+	vpunpckldq	%ymm11,%ymm10,%ymm15
+	vpunpckhdq	%ymm9,%ymm8,%ymm8
+	vpunpckhdq	%ymm11,%ymm10,%ymm10
+	vpunpcklqdq	%ymm15,%ymm14,%ymm9
+	vpunpckhqdq	%ymm15,%ymm14,%ymm14
+	vpunpcklqdq	%ymm10,%ymm8,%ymm11
+	vpunpckhqdq	%ymm10,%ymm8,%ymm8
+	vpaddd	256-256(%rcx),%ymm0,%ymm0
+	vpaddd	288-256(%rcx),%ymm1,%ymm1
+	vpaddd	320-256(%rcx),%ymm2,%ymm2
+	vpaddd	352-256(%rcx),%ymm3,%ymm3
+
+	vpunpckldq	%ymm1,%ymm0,%ymm10
+	vpunpckldq	%ymm3,%ymm2,%ymm15
+	vpunpckhdq	%ymm1,%ymm0,%ymm0
+	vpunpckhdq	%ymm3,%ymm2,%ymm2
+	vpunpcklqdq	%ymm15,%ymm10,%ymm1
+	vpunpckhqdq	%ymm15,%ymm10,%ymm10
+	vpunpcklqdq	%ymm2,%ymm0,%ymm3
+	vpunpckhqdq	%ymm2,%ymm0,%ymm0
+	vperm2i128	$0x20,%ymm1,%ymm9,%ymm15
+	vperm2i128	$0x31,%ymm1,%ymm9,%ymm1
+	vperm2i128	$0x20,%ymm10,%ymm14,%ymm9
+	vperm2i128	$0x31,%ymm10,%ymm14,%ymm10
+	vperm2i128	$0x20,%ymm3,%ymm11,%ymm14
+	vperm2i128	$0x31,%ymm3,%ymm11,%ymm3
+	vperm2i128	$0x20,%ymm0,%ymm8,%ymm11
+	vperm2i128	$0x31,%ymm0,%ymm8,%ymm0
+	vmovdqa	%ymm15,0(%rsp)
+	vmovdqa	%ymm9,32(%rsp)
+	vmovdqa	64(%rsp),%ymm15
+	vmovdqa	96(%rsp),%ymm9
+
+	vpaddd	384-512(%rax),%ymm12,%ymm12
+	vpaddd	416-512(%rax),%ymm13,%ymm13
+	vpaddd	448-512(%rax),%ymm15,%ymm15
+	vpaddd	480-512(%rax),%ymm9,%ymm9
+
+	vpunpckldq	%ymm13,%ymm12,%ymm2
+	vpunpckldq	%ymm9,%ymm15,%ymm8
+	vpunpckhdq	%ymm13,%ymm12,%ymm12
+	vpunpckhdq	%ymm9,%ymm15,%ymm15
+	vpunpcklqdq	%ymm8,%ymm2,%ymm13
+	vpunpckhqdq	%ymm8,%ymm2,%ymm2
+	vpunpcklqdq	%ymm15,%ymm12,%ymm9
+	vpunpckhqdq	%ymm15,%ymm12,%ymm12
+	vpaddd	512-512(%rax),%ymm4,%ymm4
+	vpaddd	544-512(%rax),%ymm5,%ymm5
+	vpaddd	576-512(%rax),%ymm6,%ymm6
+	vpaddd	608-512(%rax),%ymm7,%ymm7
+
+	vpunpckldq	%ymm5,%ymm4,%ymm15
+	vpunpckldq	%ymm7,%ymm6,%ymm8
+	vpunpckhdq	%ymm5,%ymm4,%ymm4
+	vpunpckhdq	%ymm7,%ymm6,%ymm6
+	vpunpcklqdq	%ymm8,%ymm15,%ymm5
+	vpunpckhqdq	%ymm8,%ymm15,%ymm15
+	vpunpcklqdq	%ymm6,%ymm4,%ymm7
+	vpunpckhqdq	%ymm6,%ymm4,%ymm4
+	vperm2i128	$0x20,%ymm5,%ymm13,%ymm8
+	vperm2i128	$0x31,%ymm5,%ymm13,%ymm5
+	vperm2i128	$0x20,%ymm15,%ymm2,%ymm13
+	vperm2i128	$0x31,%ymm15,%ymm2,%ymm15
+	vperm2i128	$0x20,%ymm7,%ymm9,%ymm2
+	vperm2i128	$0x31,%ymm7,%ymm9,%ymm7
+	vperm2i128	$0x20,%ymm4,%ymm12,%ymm9
+	vperm2i128	$0x31,%ymm4,%ymm12,%ymm4
+	vmovdqa	0(%rsp),%ymm6
+	vmovdqa	32(%rsp),%ymm12
+
+	cmpq	$512,%rdx
+	jb	.Ltail8x
+
+	vpxor	0(%rsi),%ymm6,%ymm6
+	vpxor	32(%rsi),%ymm8,%ymm8
+	vpxor	64(%rsi),%ymm1,%ymm1
+	vpxor	96(%rsi),%ymm5,%ymm5
+	leaq	128(%rsi),%rsi
+	vmovdqu	%ymm6,0(%rdi)
+	vmovdqu	%ymm8,32(%rdi)
+	vmovdqu	%ymm1,64(%rdi)
+	vmovdqu	%ymm5,96(%rdi)
+	leaq	128(%rdi),%rdi
+
+	vpxor	0(%rsi),%ymm12,%ymm12
+	vpxor	32(%rsi),%ymm13,%ymm13
+	vpxor	64(%rsi),%ymm10,%ymm10
+	vpxor	96(%rsi),%ymm15,%ymm15
+	leaq	128(%rsi),%rsi
+	vmovdqu	%ymm12,0(%rdi)
+	vmovdqu	%ymm13,32(%rdi)
+	vmovdqu	%ymm10,64(%rdi)
+	vmovdqu	%ymm15,96(%rdi)
+	leaq	128(%rdi),%rdi
+
+	vpxor	0(%rsi),%ymm14,%ymm14
+	vpxor	32(%rsi),%ymm2,%ymm2
+	vpxor	64(%rsi),%ymm3,%ymm3
+	vpxor	96(%rsi),%ymm7,%ymm7
+	leaq	128(%rsi),%rsi
+	vmovdqu	%ymm14,0(%rdi)
+	vmovdqu	%ymm2,32(%rdi)
+	vmovdqu	%ymm3,64(%rdi)
+	vmovdqu	%ymm7,96(%rdi)
+	leaq	128(%rdi),%rdi
+
+	vpxor	0(%rsi),%ymm11,%ymm11
+	vpxor	32(%rsi),%ymm9,%ymm9
+	vpxor	64(%rsi),%ymm0,%ymm0
+	vpxor	96(%rsi),%ymm4,%ymm4
+	leaq	128(%rsi),%rsi
+	vmovdqu	%ymm11,0(%rdi)
+	vmovdqu	%ymm9,32(%rdi)
+	vmovdqu	%ymm0,64(%rdi)
+	vmovdqu	%ymm4,96(%rdi)
+	leaq	128(%rdi),%rdi
+
+	subq	$512,%rdx
+	jnz	.Loop_outer8x
+
+	jmp	.Ldone8x
+
+.Ltail8x:
+	cmpq	$448,%rdx
+	jae	.L448_or_more8x
+	cmpq	$384,%rdx
+	jae	.L384_or_more8x
+	cmpq	$320,%rdx
+	jae	.L320_or_more8x
+	cmpq	$256,%rdx
+	jae	.L256_or_more8x
+	cmpq	$192,%rdx
+	jae	.L192_or_more8x
+	cmpq	$128,%rdx
+	jae	.L128_or_more8x
+	cmpq	$64,%rdx
+	jae	.L64_or_more8x
+
+	xorq	%r10,%r10
+	vmovdqa	%ymm6,0(%rsp)
+	vmovdqa	%ymm8,32(%rsp)
+	jmp	.Loop_tail8x
+
+.align	32
+.L64_or_more8x:
+	vpxor	0(%rsi),%ymm6,%ymm6
+	vpxor	32(%rsi),%ymm8,%ymm8
+	vmovdqu	%ymm6,0(%rdi)
+	vmovdqu	%ymm8,32(%rdi)
+	je	.Ldone8x
+
+	leaq	64(%rsi),%rsi
+	xorq	%r10,%r10
+	vmovdqa	%ymm1,0(%rsp)
+	leaq	64(%rdi),%rdi
+	subq	$64,%rdx
+	vmovdqa	%ymm5,32(%rsp)
+	jmp	.Loop_tail8x
+
+.align	32
+.L128_or_more8x:
+	vpxor	0(%rsi),%ymm6,%ymm6
+	vpxor	32(%rsi),%ymm8,%ymm8
+	vpxor	64(%rsi),%ymm1,%ymm1
+	vpxor	96(%rsi),%ymm5,%ymm5
+	vmovdqu	%ymm6,0(%rdi)
+	vmovdqu	%ymm8,32(%rdi)
+	vmovdqu	%ymm1,64(%rdi)
+	vmovdqu	%ymm5,96(%rdi)
+	je	.Ldone8x
+
+	leaq	128(%rsi),%rsi
+	xorq	%r10,%r10
+	vmovdqa	%ymm12,0(%rsp)
+	leaq	128(%rdi),%rdi
+	subq	$128,%rdx
+	vmovdqa	%ymm13,32(%rsp)
+	jmp	.Loop_tail8x
+
+.align	32
+.L192_or_more8x:
+	vpxor	0(%rsi),%ymm6,%ymm6
+	vpxor	32(%rsi),%ymm8,%ymm8
+	vpxor	64(%rsi),%ymm1,%ymm1
+	vpxor	96(%rsi),%ymm5,%ymm5
+	vpxor	128(%rsi),%ymm12,%ymm12
+	vpxor	160(%rsi),%ymm13,%ymm13
+	vmovdqu	%ymm6,0(%rdi)
+	vmovdqu	%ymm8,32(%rdi)
+	vmovdqu	%ymm1,64(%rdi)
+	vmovdqu	%ymm5,96(%rdi)
+	vmovdqu	%ymm12,128(%rdi)
+	vmovdqu	%ymm13,160(%rdi)
+	je	.Ldone8x
+
+	leaq	192(%rsi),%rsi
+	xorq	%r10,%r10
+	vmovdqa	%ymm10,0(%rsp)
+	leaq	192(%rdi),%rdi
+	subq	$192,%rdx
+	vmovdqa	%ymm15,32(%rsp)
+	jmp	.Loop_tail8x
+
+.align	32
+.L256_or_more8x:
+	vpxor	0(%rsi),%ymm6,%ymm6
+	vpxor	32(%rsi),%ymm8,%ymm8
+	vpxor	64(%rsi),%ymm1,%ymm1
+	vpxor	96(%rsi),%ymm5,%ymm5
+	vpxor	128(%rsi),%ymm12,%ymm12
+	vpxor	160(%rsi),%ymm13,%ymm13
+	vpxor	192(%rsi),%ymm10,%ymm10
+	vpxor	224(%rsi),%ymm15,%ymm15
+	vmovdqu	%ymm6,0(%rdi)
+	vmovdqu	%ymm8,32(%rdi)
+	vmovdqu	%ymm1,64(%rdi)
+	vmovdqu	%ymm5,96(%rdi)
+	vmovdqu	%ymm12,128(%rdi)
+	vmovdqu	%ymm13,160(%rdi)
+	vmovdqu	%ymm10,192(%rdi)
+	vmovdqu	%ymm15,224(%rdi)
+	je	.Ldone8x
+
+	leaq	256(%rsi),%rsi
+	xorq	%r10,%r10
+	vmovdqa	%ymm14,0(%rsp)
+	leaq	256(%rdi),%rdi
+	subq	$256,%rdx
+	vmovdqa	%ymm2,32(%rsp)
+	jmp	.Loop_tail8x
+
+.align	32
+.L320_or_more8x:
+	vpxor	0(%rsi),%ymm6,%ymm6
+	vpxor	32(%rsi),%ymm8,%ymm8
+	vpxor	64(%rsi),%ymm1,%ymm1
+	vpxor	96(%rsi),%ymm5,%ymm5
+	vpxor	128(%rsi),%ymm12,%ymm12
+	vpxor	160(%rsi),%ymm13,%ymm13
+	vpxor	192(%rsi),%ymm10,%ymm10
+	vpxor	224(%rsi),%ymm15,%ymm15
+	vpxor	256(%rsi),%ymm14,%ymm14
+	vpxor	288(%rsi),%ymm2,%ymm2
+	vmovdqu	%ymm6,0(%rdi)
+	vmovdqu	%ymm8,32(%rdi)
+	vmovdqu	%ymm1,64(%rdi)
+	vmovdqu	%ymm5,96(%rdi)
+	vmovdqu	%ymm12,128(%rdi)
+	vmovdqu	%ymm13,160(%rdi)
+	vmovdqu	%ymm10,192(%rdi)
+	vmovdqu	%ymm15,224(%rdi)
+	vmovdqu	%ymm14,256(%rdi)
+	vmovdqu	%ymm2,288(%rdi)
+	je	.Ldone8x
+
+	leaq	320(%rsi),%rsi
+	xorq	%r10,%r10
+	vmovdqa	%ymm3,0(%rsp)
+	leaq	320(%rdi),%rdi
+	subq	$320,%rdx
+	vmovdqa	%ymm7,32(%rsp)
+	jmp	.Loop_tail8x
+
+.align	32
+.L384_or_more8x:
+	vpxor	0(%rsi),%ymm6,%ymm6
+	vpxor	32(%rsi),%ymm8,%ymm8
+	vpxor	64(%rsi),%ymm1,%ymm1
+	vpxor	96(%rsi),%ymm5,%ymm5
+	vpxor	128(%rsi),%ymm12,%ymm12
+	vpxor	160(%rsi),%ymm13,%ymm13
+	vpxor	192(%rsi),%ymm10,%ymm10
+	vpxor	224(%rsi),%ymm15,%ymm15
+	vpxor	256(%rsi),%ymm14,%ymm14
+	vpxor	288(%rsi),%ymm2,%ymm2
+	vpxor	320(%rsi),%ymm3,%ymm3
+	vpxor	352(%rsi),%ymm7,%ymm7
+	vmovdqu	%ymm6,0(%rdi)
+	vmovdqu	%ymm8,32(%rdi)
+	vmovdqu	%ymm1,64(%rdi)
+	vmovdqu	%ymm5,96(%rdi)
+	vmovdqu	%ymm12,128(%rdi)
+	vmovdqu	%ymm13,160(%rdi)
+	vmovdqu	%ymm10,192(%rdi)
+	vmovdqu	%ymm15,224(%rdi)
+	vmovdqu	%ymm14,256(%rdi)
+	vmovdqu	%ymm2,288(%rdi)
+	vmovdqu	%ymm3,320(%rdi)
+	vmovdqu	%ymm7,352(%rdi)
+	je	.Ldone8x
+
+	leaq	384(%rsi),%rsi
+	xorq	%r10,%r10
+	vmovdqa	%ymm11,0(%rsp)
+	leaq	384(%rdi),%rdi
+	subq	$384,%rdx
+	vmovdqa	%ymm9,32(%rsp)
+	jmp	.Loop_tail8x
+
+.align	32
+.L448_or_more8x:
+	vpxor	0(%rsi),%ymm6,%ymm6
+	vpxor	32(%rsi),%ymm8,%ymm8
+	vpxor	64(%rsi),%ymm1,%ymm1
+	vpxor	96(%rsi),%ymm5,%ymm5
+	vpxor	128(%rsi),%ymm12,%ymm12
+	vpxor	160(%rsi),%ymm13,%ymm13
+	vpxor	192(%rsi),%ymm10,%ymm10
+	vpxor	224(%rsi),%ymm15,%ymm15
+	vpxor	256(%rsi),%ymm14,%ymm14
+	vpxor	288(%rsi),%ymm2,%ymm2
+	vpxor	320(%rsi),%ymm3,%ymm3
+	vpxor	352(%rsi),%ymm7,%ymm7
+	vpxor	384(%rsi),%ymm11,%ymm11
+	vpxor	416(%rsi),%ymm9,%ymm9
+	vmovdqu	%ymm6,0(%rdi)
+	vmovdqu	%ymm8,32(%rdi)
+	vmovdqu	%ymm1,64(%rdi)
+	vmovdqu	%ymm5,96(%rdi)
+	vmovdqu	%ymm12,128(%rdi)
+	vmovdqu	%ymm13,160(%rdi)
+	vmovdqu	%ymm10,192(%rdi)
+	vmovdqu	%ymm15,224(%rdi)
+	vmovdqu	%ymm14,256(%rdi)
+	vmovdqu	%ymm2,288(%rdi)
+	vmovdqu	%ymm3,320(%rdi)
+	vmovdqu	%ymm7,352(%rdi)
+	vmovdqu	%ymm11,384(%rdi)
+	vmovdqu	%ymm9,416(%rdi)
+	je	.Ldone8x
+
+	leaq	448(%rsi),%rsi
+	xorq	%r10,%r10
+	vmovdqa	%ymm0,0(%rsp)
+	leaq	448(%rdi),%rdi
+	subq	$448,%rdx
+	vmovdqa	%ymm4,32(%rsp)
+
+.Loop_tail8x:
+	movzbl	(%rsi,%r10,1),%eax
+	movzbl	(%rsp,%r10,1),%ecx
+	leaq	1(%r10),%r10
+	xorl	%ecx,%eax
+	movb	%al,-1(%rdi,%r10,1)
+	decq	%rdx
+	jnz	.Loop_tail8x
+
+.Ldone8x:
+	vzeroall
+	leaq	(%r9),%rsp
+.cfi_def_cfa_register	rsp
+.L8x_epilogue:
+	.byte	0xf3,0xc3
+.cfi_endproc	
+.size	ChaCha20_8x,.-ChaCha20_8x
+#endif
+.section	.note.GNU-stack,"",@progbits
--- /dev/null
+++ b/third_party/boringssl/linux-x86_64/crypto/cipher_extra/aes128gcmsiv-x86_64.S
@@ -1,0 +1,3079 @@
+// This file is generated from a similarly-named Perl script in the BoringSSL
+// source tree. Do not edit by hand.
+
+#if defined(__has_feature)
+#if __has_feature(memory_sanitizer) && !defined(OPENSSL_NO_ASM)
+#define OPENSSL_NO_ASM
+#endif
+#endif
+
+#if defined(__x86_64__) && !defined(OPENSSL_NO_ASM)
+#if defined(BORINGSSL_PREFIX)
+#include <boringssl_prefix_symbols_asm.h>
+#endif
+.data	
+
+.align	16
+one:
+.quad	1,0
+two:
+.quad	2,0
+three:
+.quad	3,0
+four:
+.quad	4,0
+five:
+.quad	5,0
+six:
+.quad	6,0
+seven:
+.quad	7,0
+eight:
+.quad	8,0
+
+OR_MASK:
+.long	0x00000000,0x00000000,0x00000000,0x80000000
+poly:
+.quad	0x1, 0xc200000000000000
+mask:
+.long	0x0c0f0e0d,0x0c0f0e0d,0x0c0f0e0d,0x0c0f0e0d
+con1:
+.long	1,1,1,1
+con2:
+.long	0x1b,0x1b,0x1b,0x1b
+con3:
+.byte	-1,-1,-1,-1,-1,-1,-1,-1,4,5,6,7,4,5,6,7
+and_mask:
+.long	0,0xffffffff, 0xffffffff, 0xffffffff
+.text	
+.type	GFMUL,@function
+.align	16
+GFMUL:
+.cfi_startproc	
+	vpclmulqdq	$0x00,%xmm1,%xmm0,%xmm2
+	vpclmulqdq	$0x11,%xmm1,%xmm0,%xmm5
+	vpclmulqdq	$0x10,%xmm1,%xmm0,%xmm3
+	vpclmulqdq	$0x01,%xmm1,%xmm0,%xmm4
+	vpxor	%xmm4,%xmm3,%xmm3
+	vpslldq	$8,%xmm3,%xmm4
+	vpsrldq	$8,%xmm3,%xmm3
+	vpxor	%xmm4,%xmm2,%xmm2
+	vpxor	%xmm3,%xmm5,%xmm5
+
+	vpclmulqdq	$0x10,poly(%rip),%xmm2,%xmm3
+	vpshufd	$78,%xmm2,%xmm4
+	vpxor	%xmm4,%xmm3,%xmm2
+
+	vpclmulqdq	$0x10,poly(%rip),%xmm2,%xmm3
+	vpshufd	$78,%xmm2,%xmm4
+	vpxor	%xmm4,%xmm3,%xmm2
+
+	vpxor	%xmm5,%xmm2,%xmm0
+	.byte	0xf3,0xc3
+.cfi_endproc	
+.size	GFMUL, .-GFMUL
+.globl	aesgcmsiv_htable_init
+.hidden aesgcmsiv_htable_init
+.type	aesgcmsiv_htable_init,@function
+.align	16
+aesgcmsiv_htable_init:
+.cfi_startproc	
+	vmovdqa	(%rsi),%xmm0
+	vmovdqa	%xmm0,%xmm1
+	vmovdqa	%xmm0,(%rdi)
+	call	GFMUL
+	vmovdqa	%xmm0,16(%rdi)
+	call	GFMUL
+	vmovdqa	%xmm0,32(%rdi)
+	call	GFMUL
+	vmovdqa	%xmm0,48(%rdi)
+	call	GFMUL
+	vmovdqa	%xmm0,64(%rdi)
+	call	GFMUL
+	vmovdqa	%xmm0,80(%rdi)
+	call	GFMUL
+	vmovdqa	%xmm0,96(%rdi)
+	call	GFMUL
+	vmovdqa	%xmm0,112(%rdi)
+	.byte	0xf3,0xc3
+.cfi_endproc	
+.size	aesgcmsiv_htable_init, .-aesgcmsiv_htable_init
+.globl	aesgcmsiv_htable6_init
+.hidden aesgcmsiv_htable6_init
+.type	aesgcmsiv_htable6_init,@function
+.align	16
+aesgcmsiv_htable6_init:
+.cfi_startproc	
+	vmovdqa	(%rsi),%xmm0
+	vmovdqa	%xmm0,%xmm1
+	vmovdqa	%xmm0,(%rdi)
+	call	GFMUL
+	vmovdqa	%xmm0,16(%rdi)
+	call	GFMUL
+	vmovdqa	%xmm0,32(%rdi)
+	call	GFMUL
+	vmovdqa	%xmm0,48(%rdi)
+	call	GFMUL
+	vmovdqa	%xmm0,64(%rdi)
+	call	GFMUL
+	vmovdqa	%xmm0,80(%rdi)
+	.byte	0xf3,0xc3
+.cfi_endproc	
+.size	aesgcmsiv_htable6_init, .-aesgcmsiv_htable6_init
+.globl	aesgcmsiv_htable_polyval
+.hidden aesgcmsiv_htable_polyval
+.type	aesgcmsiv_htable_polyval,@function
+.align	16
+aesgcmsiv_htable_polyval:
+.cfi_startproc	
+	testq	%rdx,%rdx
+	jnz	.Lhtable_polyval_start
+	.byte	0xf3,0xc3
+
+.Lhtable_polyval_start:
+	vzeroall
+
+
+
+	movq	%rdx,%r11
+	andq	$127,%r11
+
+	jz	.Lhtable_polyval_no_prefix
+
+	vpxor	%xmm9,%xmm9,%xmm9
+	vmovdqa	(%rcx),%xmm1
+	subq	%r11,%rdx
+
+	subq	$16,%r11
+
+
+	vmovdqu	(%rsi),%xmm0
+	vpxor	%xmm1,%xmm0,%xmm0
+
+	vpclmulqdq	$0x01,(%rdi,%r11,1),%xmm0,%xmm5
+	vpclmulqdq	$0x00,(%rdi,%r11,1),%xmm0,%xmm3
+	vpclmulqdq	$0x11,(%rdi,%r11,1),%xmm0,%xmm4
+	vpclmulqdq	$0x10,(%rdi,%r11,1),%xmm0,%xmm6
+	vpxor	%xmm6,%xmm5,%xmm5
+
+	leaq	16(%rsi),%rsi
+	testq	%r11,%r11
+	jnz	.Lhtable_polyval_prefix_loop
+	jmp	.Lhtable_polyval_prefix_complete
+
+
+.align	64
+.Lhtable_polyval_prefix_loop:
+	subq	$16,%r11
+
+	vmovdqu	(%rsi),%xmm0
+
+	vpclmulqdq	$0x00,(%rdi,%r11,1),%xmm0,%xmm6
+	vpxor	%xmm6,%xmm3,%xmm3
+	vpclmulqdq	$0x11,(%rdi,%r11,1),%xmm0,%xmm6
+	vpxor	%xmm6,%xmm4,%xmm4
+	vpclmulqdq	$0x01,(%rdi,%r11,1),%xmm0,%xmm6
+	vpxor	%xmm6,%xmm5,%xmm5
+	vpclmulqdq	$0x10,(%rdi,%r11,1),%xmm0,%xmm6
+	vpxor	%xmm6,%xmm5,%xmm5
+
+	testq	%r11,%r11
+
+	leaq	16(%rsi),%rsi
+
+	jnz	.Lhtable_polyval_prefix_loop
+
+.Lhtable_polyval_prefix_complete:
+	vpsrldq	$8,%xmm5,%xmm6
+	vpslldq	$8,%xmm5,%xmm5
+
+	vpxor	%xmm6,%xmm4,%xmm9
+	vpxor	%xmm5,%xmm3,%xmm1
+
+	jmp	.Lhtable_polyval_main_loop
+
+.Lhtable_polyval_no_prefix:
+
+
+
+
+	vpxor	%xmm1,%xmm1,%xmm1
+	vmovdqa	(%rcx),%xmm9
+
+.align	64
+.Lhtable_polyval_main_loop:
+	subq	$0x80,%rdx
+	jb	.Lhtable_polyval_out
+
+	vmovdqu	112(%rsi),%xmm0
+
+	vpclmulqdq	$0x01,(%rdi),%xmm0,%xmm5
+	vpclmulqdq	$0x00,(%rdi),%xmm0,%xmm3
+	vpclmulqdq	$0x11,(%rdi),%xmm0,%xmm4
+	vpclmulqdq	$0x10,(%rdi),%xmm0,%xmm6
+	vpxor	%xmm6,%xmm5,%xmm5
+
+
+	vmovdqu	96(%rsi),%xmm0
+	vpclmulqdq	$0x01,16(%rdi),%xmm0,%xmm6
+	vpxor	%xmm6,%xmm5,%xmm5
+	vpclmulqdq	$0x00,16(%rdi),%xmm0,%xmm6
+	vpxor	%xmm6,%xmm3,%xmm3
+	vpclmulqdq	$0x11,16(%rdi),%xmm0,%xmm6
+	vpxor	%xmm6,%xmm4,%xmm4
+	vpclmulqdq	$0x10,16(%rdi),%xmm0,%xmm6
+	vpxor	%xmm6,%xmm5,%xmm5
+
+
+
+	vmovdqu	80(%rsi),%xmm0
+
+	vpclmulqdq	$0x10,poly(%rip),%xmm1,%xmm7
+	vpalignr	$8,%xmm1,%xmm1,%xmm1
+
+	vpclmulqdq	$0x01,32(%rdi),%xmm0,%xmm6
+	vpxor	%xmm6,%xmm5,%xmm5
+	vpclmulqdq	$0x00,32(%rdi),%xmm0,%xmm6
+	vpxor	%xmm6,%xmm3,%xmm3
+	vpclmulqdq	$0x11,32(%rdi),%xmm0,%xmm6
+	vpxor	%xmm6,%xmm4,%xmm4
+	vpclmulqdq	$0x10,32(%rdi),%xmm0,%xmm6
+	vpxor	%xmm6,%xmm5,%xmm5
+
+
+	vpxor	%xmm7,%xmm1,%xmm1
+
+	vmovdqu	64(%rsi),%xmm0
+
+	vpclmulqdq	$0x01,48(%rdi),%xmm0,%xmm6
+	vpxor	%xmm6,%xmm5,%xmm5
+	vpclmulqdq	$0x00,48(%rdi),%xmm0,%xmm6
+	vpxor	%xmm6,%xmm3,%xmm3
+	vpclmulqdq	$0x11,48(%rdi),%xmm0,%xmm6
+	vpxor	%xmm6,%xmm4,%xmm4
+	vpclmulqdq	$0x10,48(%rdi),%xmm0,%xmm6
+	vpxor	%xmm6,%xmm5,%xmm5
+
+
+	vmovdqu	48(%rsi),%xmm0
+
+	vpclmulqdq	$0x10,poly(%rip),%xmm1,%xmm7
+	vpalignr	$8,%xmm1,%xmm1,%xmm1
+
+	vpclmulqdq	$0x01,64(%rdi),%xmm0,%xmm6
+	vpxor	%xmm6,%xmm5,%xmm5
+	vpclmulqdq	$0x00,64(%rdi),%xmm0,%xmm6
+	vpxor	%xmm6,%xmm3,%xmm3
+	vpclmulqdq	$0x11,64(%rdi),%xmm0,%xmm6
+	vpxor	%xmm6,%xmm4,%xmm4
+	vpclmulqdq	$0x10,64(%rdi),%xmm0,%xmm6
+	vpxor	%xmm6,%xmm5,%xmm5
+
+
+	vpxor	%xmm7,%xmm1,%xmm1
+
+	vmovdqu	32(%rsi),%xmm0
+
+	vpclmulqdq	$0x01,80(%rdi),%xmm0,%xmm6
+	vpxor	%xmm6,%xmm5,%xmm5
+	vpclmulqdq	$0x00,80(%rdi),%xmm0,%xmm6
+	vpxor	%xmm6,%xmm3,%xmm3
+	vpclmulqdq	$0x11,80(%rdi),%xmm0,%xmm6
+	vpxor	%xmm6,%xmm4,%xmm4
+	vpclmulqdq	$0x10,80(%rdi),%xmm0,%xmm6
+	vpxor	%xmm6,%xmm5,%xmm5
+
+
+	vpxor	%xmm9,%xmm1,%xmm1
+
+	vmovdqu	16(%rsi),%xmm0
+
+	vpclmulqdq	$0x01,96(%rdi),%xmm0,%xmm6
+	vpxor	%xmm6,%xmm5,%xmm5
+	vpclmulqdq	$0x00,96(%rdi),%xmm0,%xmm6
+	vpxor	%xmm6,%xmm3,%xmm3
+	vpclmulqdq	$0x11,96(%rdi),%xmm0,%xmm6
+	vpxor	%xmm6,%xmm4,%xmm4
+	vpclmulqdq	$0x10,96(%rdi),%xmm0,%xmm6
+	vpxor	%xmm6,%xmm5,%xmm5
+
+
+	vmovdqu	0(%rsi),%xmm0
+	vpxor	%xmm1,%xmm0,%xmm0
+
+	vpclmulqdq	$0x01,112(%rdi),%xmm0,%xmm6
+	vpxor	%xmm6,%xmm5,%xmm5
+	vpclmulqdq	$0x00,112(%rdi),%xmm0,%xmm6
+	vpxor	%xmm6,%xmm3,%xmm3
+	vpclmulqdq	$0x11,112(%rdi),%xmm0,%xmm6
+	vpxor	%xmm6,%xmm4,%xmm4
+	vpclmulqdq	$0x10,112(%rdi),%xmm0,%xmm6
+	vpxor	%xmm6,%xmm5,%xmm5
+
+
+	vpsrldq	$8,%xmm5,%xmm6
+	vpslldq	$8,%xmm5,%xmm5
+
+	vpxor	%xmm6,%xmm4,%xmm9
+	vpxor	%xmm5,%xmm3,%xmm1
+
+	leaq	128(%rsi),%rsi
+	jmp	.Lhtable_polyval_main_loop
+
+
+
+.Lhtable_polyval_out:
+	vpclmulqdq	$0x10,poly(%rip),%xmm1,%xmm6
+	vpalignr	$8,%xmm1,%xmm1,%xmm1
+	vpxor	%xmm6,%xmm1,%xmm1
+
+	vpclmulqdq	$0x10,poly(%rip),%xmm1,%xmm6
+	vpalignr	$8,%xmm1,%xmm1,%xmm1
+	vpxor	%xmm6,%xmm1,%xmm1
+	vpxor	%xmm9,%xmm1,%xmm1
+
+	vmovdqu	%xmm1,(%rcx)
+	vzeroupper
+	.byte	0xf3,0xc3
+.cfi_endproc	
+.size	aesgcmsiv_htable_polyval,.-aesgcmsiv_htable_polyval
+.globl	aesgcmsiv_polyval_horner
+.hidden aesgcmsiv_polyval_horner
+.type	aesgcmsiv_polyval_horner,@function
+.align	16
+aesgcmsiv_polyval_horner:
+.cfi_startproc	
+	testq	%rcx,%rcx
+	jnz	.Lpolyval_horner_start
+	.byte	0xf3,0xc3
+
+.Lpolyval_horner_start:
+
+
+
+	xorq	%r10,%r10
+	shlq	$4,%rcx
+
+	vmovdqa	(%rsi),%xmm1
+	vmovdqa	(%rdi),%xmm0
+
+.Lpolyval_horner_loop:
+	vpxor	(%rdx,%r10,1),%xmm0,%xmm0
+	call	GFMUL
+
+	addq	$16,%r10
+	cmpq	%r10,%rcx
+	jne	.Lpolyval_horner_loop
+
+
+	vmovdqa	%xmm0,(%rdi)
+	.byte	0xf3,0xc3
+.cfi_endproc	
+.size	aesgcmsiv_polyval_horner,.-aesgcmsiv_polyval_horner
+.globl	aes128gcmsiv_aes_ks
+.hidden aes128gcmsiv_aes_ks
+.type	aes128gcmsiv_aes_ks,@function
+.align	16
+aes128gcmsiv_aes_ks:
+.cfi_startproc	
+	vmovdqu	(%rdi),%xmm1
+	vmovdqa	%xmm1,(%rsi)
+
+	vmovdqa	con1(%rip),%xmm0
+	vmovdqa	mask(%rip),%xmm15
+
+	movq	$8,%rax
+
+.Lks128_loop:
+	addq	$16,%rsi
+	subq	$1,%rax
+	vpshufb	%xmm15,%xmm1,%xmm2
+	vaesenclast	%xmm0,%xmm2,%xmm2
+	vpslld	$1,%xmm0,%xmm0
+	vpslldq	$4,%xmm1,%xmm3
+	vpxor	%xmm3,%xmm1,%xmm1
+	vpslldq	$4,%xmm3,%xmm3
+	vpxor	%xmm3,%xmm1,%xmm1
+	vpslldq	$4,%xmm3,%xmm3
+	vpxor	%xmm3,%xmm1,%xmm1
+	vpxor	%xmm2,%xmm1,%xmm1
+	vmovdqa	%xmm1,(%rsi)
+	jne	.Lks128_loop
+
+	vmovdqa	con2(%rip),%xmm0
+	vpshufb	%xmm15,%xmm1,%xmm2
+	vaesenclast	%xmm0,%xmm2,%xmm2
+	vpslld	$1,%xmm0,%xmm0
+	vpslldq	$4,%xmm1,%xmm3
+	vpxor	%xmm3,%xmm1,%xmm1
+	vpslldq	$4,%xmm3,%xmm3
+	vpxor	%xmm3,%xmm1,%xmm1
+	vpslldq	$4,%xmm3,%xmm3
+	vpxor	%xmm3,%xmm1,%xmm1
+	vpxor	%xmm2,%xmm1,%xmm1
+	vmovdqa	%xmm1,16(%rsi)
+
+	vpshufb	%xmm15,%xmm1,%xmm2
+	vaesenclast	%xmm0,%xmm2,%xmm2
+	vpslldq	$4,%xmm1,%xmm3
+	vpxor	%xmm3,%xmm1,%xmm1
+	vpslldq	$4,%xmm3,%xmm3
+	vpxor	%xmm3,%xmm1,%xmm1
+	vpslldq	$4,%xmm3,%xmm3
+	vpxor	%xmm3,%xmm1,%xmm1
+	vpxor	%xmm2,%xmm1,%xmm1
+	vmovdqa	%xmm1,32(%rsi)
+	.byte	0xf3,0xc3
+.cfi_endproc	
+.size	aes128gcmsiv_aes_ks,.-aes128gcmsiv_aes_ks
+.globl	aes256gcmsiv_aes_ks
+.hidden aes256gcmsiv_aes_ks
+.type	aes256gcmsiv_aes_ks,@function
+.align	16
+aes256gcmsiv_aes_ks:
+.cfi_startproc	
+	vmovdqu	(%rdi),%xmm1
+	vmovdqu	16(%rdi),%xmm3
+	vmovdqa	%xmm1,(%rsi)
+	vmovdqa	%xmm3,16(%rsi)
+	vmovdqa	con1(%rip),%xmm0
+	vmovdqa	mask(%rip),%xmm15
+	vpxor	%xmm14,%xmm14,%xmm14
+	movq	$6,%rax
+
+.Lks256_loop:
+	addq	$32,%rsi
+	subq	$1,%rax
+	vpshufb	%xmm15,%xmm3,%xmm2
+	vaesenclast	%xmm0,%xmm2,%xmm2
+	vpslld	$1,%xmm0,%xmm0
+	vpsllq	$32,%xmm1,%xmm4
+	vpxor	%xmm4,%xmm1,%xmm1
+	vpshufb	con3(%rip),%xmm1,%xmm4
+	vpxor	%xmm4,%xmm1,%xmm1
+	vpxor	%xmm2,%xmm1,%xmm1
+	vmovdqa	%xmm1,(%rsi)
+	vpshufd	$0xff,%xmm1,%xmm2
+	vaesenclast	%xmm14,%xmm2,%xmm2
+	vpsllq	$32,%xmm3,%xmm4
+	vpxor	%xmm4,%xmm3,%xmm3
+	vpshufb	con3(%rip),%xmm3,%xmm4
+	vpxor	%xmm4,%xmm3,%xmm3
+	vpxor	%xmm2,%xmm3,%xmm3
+	vmovdqa	%xmm3,16(%rsi)
+	jne	.Lks256_loop
+
+	vpshufb	%xmm15,%xmm3,%xmm2
+	vaesenclast	%xmm0,%xmm2,%xmm2
+	vpsllq	$32,%xmm1,%xmm4
+	vpxor	%xmm4,%xmm1,%xmm1
+	vpshufb	con3(%rip),%xmm1,%xmm4
+	vpxor	%xmm4,%xmm1,%xmm1
+	vpxor	%xmm2,%xmm1,%xmm1
+	vmovdqa	%xmm1,32(%rsi)
+	.byte	0xf3,0xc3
+.cfi_endproc	
+.globl	aes128gcmsiv_aes_ks_enc_x1
+.hidden aes128gcmsiv_aes_ks_enc_x1
+.type	aes128gcmsiv_aes_ks_enc_x1,@function
+.align	16
+aes128gcmsiv_aes_ks_enc_x1:
+.cfi_startproc	
+	vmovdqa	(%rcx),%xmm1
+	vmovdqa	0(%rdi),%xmm4
+
+	vmovdqa	%xmm1,(%rdx)
+	vpxor	%xmm1,%xmm4,%xmm4
+
+	vmovdqa	con1(%rip),%xmm0
+	vmovdqa	mask(%rip),%xmm15
+
+	vpshufb	%xmm15,%xmm1,%xmm2
+	vaesenclast	%xmm0,%xmm2,%xmm2
+	vpslld	$1,%xmm0,%xmm0
+	vpsllq	$32,%xmm1,%xmm3
+	vpxor	%xmm3,%xmm1,%xmm1
+	vpshufb	con3(%rip),%xmm1,%xmm3
+	vpxor	%xmm3,%xmm1,%xmm1
+	vpxor	%xmm2,%xmm1,%xmm1
+
+	vaesenc	%xmm1,%xmm4,%xmm4
+	vmovdqa	%xmm1,16(%rdx)
+
+	vpshufb	%xmm15,%xmm1,%xmm2
+	vaesenclast	%xmm0,%xmm2,%xmm2
+	vpslld	$1,%xmm0,%xmm0
+	vpsllq	$32,%xmm1,%xmm3
+	vpxor	%xmm3,%xmm1,%xmm1
+	vpshufb	con3(%rip),%xmm1,%xmm3
+	vpxor	%xmm3,%xmm1,%xmm1
+	vpxor	%xmm2,%xmm1,%xmm1
+
+	vaesenc	%xmm1,%xmm4,%xmm4
+	vmovdqa	%xmm1,32(%rdx)
+
+	vpshufb	%xmm15,%xmm1,%xmm2
+	vaesenclast	%xmm0,%xmm2,%xmm2
+	vpslld	$1,%xmm0,%xmm0
+	vpsllq	$32,%xmm1,%xmm3
+	vpxor	%xmm3,%xmm1,%xmm1
+	vpshufb	con3(%rip),%xmm1,%xmm3
+	vpxor	%xmm3,%xmm1,%xmm1
+	vpxor	%xmm2,%xmm1,%xmm1
+
+	vaesenc	%xmm1,%xmm4,%xmm4
+	vmovdqa	%xmm1,48(%rdx)
+
+	vpshufb	%xmm15,%xmm1,%xmm2
+	vaesenclast	%xmm0,%xmm2,%xmm2
+	vpslld	$1,%xmm0,%xmm0
+	vpsllq	$32,%xmm1,%xmm3
+	vpxor	%xmm3,%xmm1,%xmm1
+	vpshufb	con3(%rip),%xmm1,%xmm3
+	vpxor	%xmm3,%xmm1,%xmm1
+	vpxor	%xmm2,%xmm1,%xmm1
+
+	vaesenc	%xmm1,%xmm4,%xmm4
+	vmovdqa	%xmm1,64(%rdx)
+
+	vpshufb	%xmm15,%xmm1,%xmm2
+	vaesenclast	%xmm0,%xmm2,%xmm2
+	vpslld	$1,%xmm0,%xmm0
+	vpsllq	$32,%xmm1,%xmm3
+	vpxor	%xmm3,%xmm1,%xmm1
+	vpshufb	con3(%rip),%xmm1,%xmm3
+	vpxor	%xmm3,%xmm1,%xmm1
+	vpxor	%xmm2,%xmm1,%xmm1
+
+	vaesenc	%xmm1,%xmm4,%xmm4
+	vmovdqa	%xmm1,80(%rdx)
+
+	vpshufb	%xmm15,%xmm1,%xmm2
+	vaesenclast	%xmm0,%xmm2,%xmm2
+	vpslld	$1,%xmm0,%xmm0
+	vpsllq	$32,%xmm1,%xmm3
+	vpxor	%xmm3,%xmm1,%xmm1
+	vpshufb	con3(%rip),%xmm1,%xmm3
+	vpxor	%xmm3,%xmm1,%xmm1
+	vpxor	%xmm2,%xmm1,%xmm1
+
+	vaesenc	%xmm1,%xmm4,%xmm4
+	vmovdqa	%xmm1,96(%rdx)
+
+	vpshufb	%xmm15,%xmm1,%xmm2
+	vaesenclast	%xmm0,%xmm2,%xmm2
+	vpslld	$1,%xmm0,%xmm0
+	vpsllq	$32,%xmm1,%xmm3
+	vpxor	%xmm3,%xmm1,%xmm1
+	vpshufb	con3(%rip),%xmm1,%xmm3
+	vpxor	%xmm3,%xmm1,%xmm1
+	vpxor	%xmm2,%xmm1,%xmm1
+
+	vaesenc	%xmm1,%xmm4,%xmm4
+	vmovdqa	%xmm1,112(%rdx)
+
+	vpshufb	%xmm15,%xmm1,%xmm2
+	vaesenclast	%xmm0,%xmm2,%xmm2
+	vpslld	$1,%xmm0,%xmm0
+	vpsllq	$32,%xmm1,%xmm3
+	vpxor	%xmm3,%xmm1,%xmm1
+	vpshufb	con3(%rip),%xmm1,%xmm3
+	vpxor	%xmm3,%xmm1,%xmm1
+	vpxor	%xmm2,%xmm1,%xmm1
+
+	vaesenc	%xmm1,%xmm4,%xmm4
+	vmovdqa	%xmm1,128(%rdx)
+
+
+	vmovdqa	con2(%rip),%xmm0
+
+	vpshufb	%xmm15,%xmm1,%xmm2
+	vaesenclast	%xmm0,%xmm2,%xmm2
+	vpslld	$1,%xmm0,%xmm0
+	vpsllq	$32,%xmm1,%xmm3
+	vpxor	%xmm3,%xmm1,%xmm1
+	vpshufb	con3(%rip),%xmm1,%xmm3
+	vpxor	%xmm3,%xmm1,%xmm1
+	vpxor	%xmm2,%xmm1,%xmm1
+
+	vaesenc	%xmm1,%xmm4,%xmm4
+	vmovdqa	%xmm1,144(%rdx)
+
+	vpshufb	%xmm15,%xmm1,%xmm2
+	vaesenclast	%xmm0,%xmm2,%xmm2
+	vpsllq	$32,%xmm1,%xmm3
+	vpxor	%xmm3,%xmm1,%xmm1
+	vpshufb	con3(%rip),%xmm1,%xmm3
+	vpxor	%xmm3,%xmm1,%xmm1
+	vpxor	%xmm2,%xmm1,%xmm1
+
+	vaesenclast	%xmm1,%xmm4,%xmm4
+	vmovdqa	%xmm1,160(%rdx)
+
+
+	vmovdqa	%xmm4,0(%rsi)
+	.byte	0xf3,0xc3
+.cfi_endproc	
+.size	aes128gcmsiv_aes_ks_enc_x1,.-aes128gcmsiv_aes_ks_enc_x1
+.globl	aes128gcmsiv_kdf
+.hidden aes128gcmsiv_kdf
+.type	aes128gcmsiv_kdf,@function
+.align	16
+aes128gcmsiv_kdf:
+.cfi_startproc	
+
+
+
+
+	vmovdqa	(%rdx),%xmm1
+	vmovdqa	0(%rdi),%xmm9
+	vmovdqa	and_mask(%rip),%xmm12
+	vmovdqa	one(%rip),%xmm13
+	vpshufd	$0x90,%xmm9,%xmm9
+	vpand	%xmm12,%xmm9,%xmm9
+	vpaddd	%xmm13,%xmm9,%xmm10
+	vpaddd	%xmm13,%xmm10,%xmm11
+	vpaddd	%xmm13,%xmm11,%xmm12
+
+	vpxor	%xmm1,%xmm9,%xmm9
+	vpxor	%xmm1,%xmm10,%xmm10
+	vpxor	%xmm1,%xmm11,%xmm11
+	vpxor	%xmm1,%xmm12,%xmm12
+
+	vmovdqa	16(%rdx),%xmm1
+	vaesenc	%xmm1,%xmm9,%xmm9
+	vaesenc	%xmm1,%xmm10,%xmm10
+	vaesenc	%xmm1,%xmm11,%xmm11
+	vaesenc	%xmm1,%xmm12,%xmm12
+
+	vmovdqa	32(%rdx),%xmm2
+	vaesenc	%xmm2,%xmm9,%xmm9
+	vaesenc	%xmm2,%xmm10,%xmm10
+	vaesenc	%xmm2,%xmm11,%xmm11
+	vaesenc	%xmm2,%xmm12,%xmm12
+
+	vmovdqa	48(%rdx),%xmm1
+	vaesenc	%xmm1,%xmm9,%xmm9
+	vaesenc	%xmm1,%xmm10,%xmm10
+	vaesenc	%xmm1,%xmm11,%xmm11
+	vaesenc	%xmm1,%xmm12,%xmm12
+
+	vmovdqa	64(%rdx),%xmm2
+	vaesenc	%xmm2,%xmm9,%xmm9
+	vaesenc	%xmm2,%xmm10,%xmm10
+	vaesenc	%xmm2,%xmm11,%xmm11
+	vaesenc	%xmm2,%xmm12,%xmm12
+
+	vmovdqa	80(%rdx),%xmm1
+	vaesenc	%xmm1,%xmm9,%xmm9
+	vaesenc	%xmm1,%xmm10,%xmm10
+	vaesenc	%xmm1,%xmm11,%xmm11
+	vaesenc	%xmm1,%xmm12,%xmm12
+
+	vmovdqa	96(%rdx),%xmm2
+	vaesenc	%xmm2,%xmm9,%xmm9
+	vaesenc	%xmm2,%xmm10,%xmm10
+	vaesenc	%xmm2,%xmm11,%xmm11
+	vaesenc	%xmm2,%xmm12,%xmm12
+
+	vmovdqa	112(%rdx),%xmm1
+	vaesenc	%xmm1,%xmm9,%xmm9
+	vaesenc	%xmm1,%xmm10,%xmm10
+	vaesenc	%xmm1,%xmm11,%xmm11
+	vaesenc	%xmm1,%xmm12,%xmm12
+
+	vmovdqa	128(%rdx),%xmm2
+	vaesenc	%xmm2,%xmm9,%xmm9
+	vaesenc	%xmm2,%xmm10,%xmm10
+	vaesenc	%xmm2,%xmm11,%xmm11
+	vaesenc	%xmm2,%xmm12,%xmm12
+
+	vmovdqa	144(%rdx),%xmm1
+	vaesenc	%xmm1,%xmm9,%xmm9
+	vaesenc	%xmm1,%xmm10,%xmm10
+	vaesenc	%xmm1,%xmm11,%xmm11
+	vaesenc	%xmm1,%xmm12,%xmm12
+
+	vmovdqa	160(%rdx),%xmm2
+	vaesenclast	%xmm2,%xmm9,%xmm9
+	vaesenclast	%xmm2,%xmm10,%xmm10
+	vaesenclast	%xmm2,%xmm11,%xmm11
+	vaesenclast	%xmm2,%xmm12,%xmm12
+
+
+	vmovdqa	%xmm9,0(%rsi)
+	vmovdqa	%xmm10,16(%rsi)
+	vmovdqa	%xmm11,32(%rsi)
+	vmovdqa	%xmm12,48(%rsi)
+	.byte	0xf3,0xc3
+.cfi_endproc	
+.size	aes128gcmsiv_kdf,.-aes128gcmsiv_kdf
+.globl	aes128gcmsiv_enc_msg_x4
+.hidden aes128gcmsiv_enc_msg_x4
+.type	aes128gcmsiv_enc_msg_x4,@function
+.align	16
+aes128gcmsiv_enc_msg_x4:
+.cfi_startproc	
+	testq	%r8,%r8
+	jnz	.L128_enc_msg_x4_start
+	.byte	0xf3,0xc3
+
+.L128_enc_msg_x4_start:
+	pushq	%r12
+.cfi_adjust_cfa_offset	8
+.cfi_offset	%r12,-16
+	pushq	%r13
+.cfi_adjust_cfa_offset	8
+.cfi_offset	%r13,-24
+
+	shrq	$4,%r8
+	movq	%r8,%r10
+	shlq	$62,%r10
+	shrq	$62,%r10
+
+
+	vmovdqa	(%rdx),%xmm15
+	vpor	OR_MASK(%rip),%xmm15,%xmm15
+
+	vmovdqu	four(%rip),%xmm4
+	vmovdqa	%xmm15,%xmm0
+	vpaddd	one(%rip),%xmm15,%xmm1
+	vpaddd	two(%rip),%xmm15,%xmm2
+	vpaddd	three(%rip),%xmm15,%xmm3
+
+	shrq	$2,%r8
+	je	.L128_enc_msg_x4_check_remainder
+
+	subq	$64,%rsi
+	subq	$64,%rdi
+
+.L128_enc_msg_x4_loop1:
+	addq	$64,%rsi
+	addq	$64,%rdi
+
+	vmovdqa	%xmm0,%xmm5
+	vmovdqa	%xmm1,%xmm6
+	vmovdqa	%xmm2,%xmm7
+	vmovdqa	%xmm3,%xmm8
+
+	vpxor	(%rcx),%xmm5,%xmm5
+	vpxor	(%rcx),%xmm6,%xmm6
+	vpxor	(%rcx),%xmm7,%xmm7
+	vpxor	(%rcx),%xmm8,%xmm8
+
+	vmovdqu	16(%rcx),%xmm12
+	vaesenc	%xmm12,%xmm5,%xmm5
+	vaesenc	%xmm12,%xmm6,%xmm6
+	vaesenc	%xmm12,%xmm7,%xmm7
+	vaesenc	%xmm12,%xmm8,%xmm8
+
+	vpaddd	%xmm4,%xmm0,%xmm0
+	vmovdqu	32(%rcx),%xmm12
+	vaesenc	%xmm12,%xmm5,%xmm5
+	vaesenc	%xmm12,%xmm6,%xmm6
+	vaesenc	%xmm12,%xmm7,%xmm7
+	vaesenc	%xmm12,%xmm8,%xmm8
+
+	vpaddd	%xmm4,%xmm1,%xmm1
+	vmovdqu	48(%rcx),%xmm12
+	vaesenc	%xmm12,%xmm5,%xmm5
+	vaesenc	%xmm12,%xmm6,%xmm6
+	vaesenc	%xmm12,%xmm7,%xmm7
+	vaesenc	%xmm12,%xmm8,%xmm8
+
+	vpaddd	%xmm4,%xmm2,%xmm2
+	vmovdqu	64(%rcx),%xmm12
+	vaesenc	%xmm12,%xmm5,%xmm5
+	vaesenc	%xmm12,%xmm6,%xmm6
+	vaesenc	%xmm12,%xmm7,%xmm7
+	vaesenc	%xmm12,%xmm8,%xmm8
+
+	vpaddd	%xmm4,%xmm3,%xmm3
+
+	vmovdqu	80(%rcx),%xmm12
+	vaesenc	%xmm12,%xmm5,%xmm5
+	vaesenc	%xmm12,%xmm6,%xmm6
+	vaesenc	%xmm12,%xmm7,%xmm7
+	vaesenc	%xmm12,%xmm8,%xmm8
+
+	vmovdqu	96(%rcx),%xmm12
+	vaesenc	%xmm12,%xmm5,%xmm5
+	vaesenc	%xmm12,%xmm6,%xmm6
+	vaesenc	%xmm12,%xmm7,%xmm7
+	vaesenc	%xmm12,%xmm8,%xmm8
+
+	vmovdqu	112(%rcx),%xmm12
+	vaesenc	%xmm12,%xmm5,%xmm5
+	vaesenc	%xmm12,%xmm6,%xmm6
+	vaesenc	%xmm12,%xmm7,%xmm7
+	vaesenc	%xmm12,%xmm8,%xmm8
+
+	vmovdqu	128(%rcx),%xmm12
+	vaesenc	%xmm12,%xmm5,%xmm5
+	vaesenc	%xmm12,%xmm6,%xmm6
+	vaesenc	%xmm12,%xmm7,%xmm7
+	vaesenc	%xmm12,%xmm8,%xmm8
+
+	vmovdqu	144(%rcx),%xmm12
+	vaesenc	%xmm12,%xmm5,%xmm5
+	vaesenc	%xmm12,%xmm6,%xmm6
+	vaesenc	%xmm12,%xmm7,%xmm7
+	vaesenc	%xmm12,%xmm8,%xmm8
+
+	vmovdqu	160(%rcx),%xmm12
+	vaesenclast	%xmm12,%xmm5,%xmm5
+	vaesenclast	%xmm12,%xmm6,%xmm6
+	vaesenclast	%xmm12,%xmm7,%xmm7
+	vaesenclast	%xmm12,%xmm8,%xmm8
+
+
+
+	vpxor	0(%rdi),%xmm5,%xmm5
+	vpxor	16(%rdi),%xmm6,%xmm6
+	vpxor	32(%rdi),%xmm7,%xmm7
+	vpxor	48(%rdi),%xmm8,%xmm8
+
+	subq	$1,%r8
+
+	vmovdqu	%xmm5,0(%rsi)
+	vmovdqu	%xmm6,16(%rsi)
+	vmovdqu	%xmm7,32(%rsi)
+	vmovdqu	%xmm8,48(%rsi)
+
+	jne	.L128_enc_msg_x4_loop1
+
+	addq	$64,%rsi
+	addq	$64,%rdi
+
+.L128_enc_msg_x4_check_remainder:
+	cmpq	$0,%r10
+	je	.L128_enc_msg_x4_out
+
+.L128_enc_msg_x4_loop2:
+
+
+	vmovdqa	%xmm0,%xmm5
+	vpaddd	one(%rip),%xmm0,%xmm0
+
+	vpxor	(%rcx),%xmm5,%xmm5
+	vaesenc	16(%rcx),%xmm5,%xmm5
+	vaesenc	32(%rcx),%xmm5,%xmm5
+	vaesenc	48(%rcx),%xmm5,%xmm5
+	vaesenc	64(%rcx),%xmm5,%xmm5
+	vaesenc	80(%rcx),%xmm5,%xmm5
+	vaesenc	96(%rcx),%xmm5,%xmm5
+	vaesenc	112(%rcx),%xmm5,%xmm5
+	vaesenc	128(%rcx),%xmm5,%xmm5
+	vaesenc	144(%rcx),%xmm5,%xmm5
+	vaesenclast	160(%rcx),%xmm5,%xmm5
+
+
+	vpxor	(%rdi),%xmm5,%xmm5
+	vmovdqu	%xmm5,(%rsi)
+
+	addq	$16,%rdi
+	addq	$16,%rsi
+
+	subq	$1,%r10
+	jne	.L128_enc_msg_x4_loop2
+
+.L128_enc_msg_x4_out:
+	popq	%r13
+.cfi_adjust_cfa_offset	-8
+.cfi_restore	%r13
+	popq	%r12
+.cfi_adjust_cfa_offset	-8
+.cfi_restore	%r12
+	.byte	0xf3,0xc3
+.cfi_endproc	
+.size	aes128gcmsiv_enc_msg_x4,.-aes128gcmsiv_enc_msg_x4
+.globl	aes128gcmsiv_enc_msg_x8
+.hidden aes128gcmsiv_enc_msg_x8
+.type	aes128gcmsiv_enc_msg_x8,@function
+.align	16
+aes128gcmsiv_enc_msg_x8:
+.cfi_startproc	
+	testq	%r8,%r8
+	jnz	.L128_enc_msg_x8_start
+	.byte	0xf3,0xc3
+
+.L128_enc_msg_x8_start:
+	pushq	%r12
+.cfi_adjust_cfa_offset	8
+.cfi_offset	%r12,-16
+	pushq	%r13
+.cfi_adjust_cfa_offset	8
+.cfi_offset	%r13,-24
+	pushq	%rbp
+.cfi_adjust_cfa_offset	8
+.cfi_offset	%rbp,-32
+	movq	%rsp,%rbp
+.cfi_def_cfa_register	rbp
+
+
+	subq	$128,%rsp
+	andq	$-64,%rsp
+
+	shrq	$4,%r8
+	movq	%r8,%r10
+	shlq	$61,%r10
+	shrq	$61,%r10
+
+
+	vmovdqu	(%rdx),%xmm1
+	vpor	OR_MASK(%rip),%xmm1,%xmm1
+
+
+	vpaddd	seven(%rip),%xmm1,%xmm0
+	vmovdqu	%xmm0,(%rsp)
+	vpaddd	one(%rip),%xmm1,%xmm9
+	vpaddd	two(%rip),%xmm1,%xmm10
+	vpaddd	three(%rip),%xmm1,%xmm11
+	vpaddd	four(%rip),%xmm1,%xmm12
+	vpaddd	five(%rip),%xmm1,%xmm13
+	vpaddd	six(%rip),%xmm1,%xmm14
+	vmovdqa	%xmm1,%xmm0
+
+	shrq	$3,%r8
+	je	.L128_enc_msg_x8_check_remainder
+
+	subq	$128,%rsi
+	subq	$128,%rdi
+
+.L128_enc_msg_x8_loop1:
+	addq	$128,%rsi
+	addq	$128,%rdi
+
+	vmovdqa	%xmm0,%xmm1
+	vmovdqa	%xmm9,%xmm2
+	vmovdqa	%xmm10,%xmm3
+	vmovdqa	%xmm11,%xmm4
+	vmovdqa	%xmm12,%xmm5
+	vmovdqa	%xmm13,%xmm6
+	vmovdqa	%xmm14,%xmm7
+
+	vmovdqu	(%rsp),%xmm8
+
+	vpxor	(%rcx),%xmm1,%xmm1
+	vpxor	(%rcx),%xmm2,%xmm2
+	vpxor	(%rcx),%xmm3,%xmm3
+	vpxor	(%rcx),%xmm4,%xmm4
+	vpxor	(%rcx),%xmm5,%xmm5
+	vpxor	(%rcx),%xmm6,%xmm6
+	vpxor	(%rcx),%xmm7,%xmm7
+	vpxor	(%rcx),%xmm8,%xmm8
+
+	vmovdqu	16(%rcx),%xmm15
+	vaesenc	%xmm15,%xmm1,%xmm1
+	vaesenc	%xmm15,%xmm2,%xmm2
+	vaesenc	%xmm15,%xmm3,%xmm3
+	vaesenc	%xmm15,%xmm4,%xmm4
+	vaesenc	%xmm15,%xmm5,%xmm5
+	vaesenc	%xmm15,%xmm6,%xmm6
+	vaesenc	%xmm15,%xmm7,%xmm7
+	vaesenc	%xmm15,%xmm8,%xmm8
+
+	vmovdqu	(%rsp),%xmm14
+	vpaddd	eight(%rip),%xmm14,%xmm14
+	vmovdqu	%xmm14,(%rsp)
+	vmovdqu	32(%rcx),%xmm15
+	vaesenc	%xmm15,%xmm1,%xmm1
+	vaesenc	%xmm15,%xmm2,%xmm2
+	vaesenc	%xmm15,%xmm3,%xmm3
+	vaesenc	%xmm15,%xmm4,%xmm4
+	vaesenc	%xmm15,%xmm5,%xmm5
+	vaesenc	%xmm15,%xmm6,%xmm6
+	vaesenc	%xmm15,%xmm7,%xmm7
+	vaesenc	%xmm15,%xmm8,%xmm8
+
+	vpsubd	one(%rip),%xmm14,%xmm14
+	vmovdqu	48(%rcx),%xmm15
+	vaesenc	%xmm15,%xmm1,%xmm1
+	vaesenc	%xmm15,%xmm2,%xmm2
+	vaesenc	%xmm15,%xmm3,%xmm3
+	vaesenc	%xmm15,%xmm4,%xmm4
+	vaesenc	%xmm15,%xmm5,%xmm5
+	vaesenc	%xmm15,%xmm6,%xmm6
+	vaesenc	%xmm15,%xmm7,%xmm7
+	vaesenc	%xmm15,%xmm8,%xmm8
+
+	vpaddd	eight(%rip),%xmm0,%xmm0
+	vmovdqu	64(%rcx),%xmm15
+	vaesenc	%xmm15,%xmm1,%xmm1
+	vaesenc	%xmm15,%xmm2,%xmm2
+	vaesenc	%xmm15,%xmm3,%xmm3
+	vaesenc	%xmm15,%xmm4,%xmm4
+	vaesenc	%xmm15,%xmm5,%xmm5
+	vaesenc	%xmm15,%xmm6,%xmm6
+	vaesenc	%xmm15,%xmm7,%xmm7
+	vaesenc	%xmm15,%xmm8,%xmm8
+
+	vpaddd	eight(%rip),%xmm9,%xmm9
+	vmovdqu	80(%rcx),%xmm15
+	vaesenc	%xmm15,%xmm1,%xmm1
+	vaesenc	%xmm15,%xmm2,%xmm2
+	vaesenc	%xmm15,%xmm3,%xmm3
+	vaesenc	%xmm15,%xmm4,%xmm4
+	vaesenc	%xmm15,%xmm5,%xmm5
+	vaesenc	%xmm15,%xmm6,%xmm6
+	vaesenc	%xmm15,%xmm7,%xmm7
+	vaesenc	%xmm15,%xmm8,%xmm8
+
+	vpaddd	eight(%rip),%xmm10,%xmm10
+	vmovdqu	96(%rcx),%xmm15
+	vaesenc	%xmm15,%xmm1,%xmm1
+	vaesenc	%xmm15,%xmm2,%xmm2
+	vaesenc	%xmm15,%xmm3,%xmm3
+	vaesenc	%xmm15,%xmm4,%xmm4
+	vaesenc	%xmm15,%xmm5,%xmm5
+	vaesenc	%xmm15,%xmm6,%xmm6
+	vaesenc	%xmm15,%xmm7,%xmm7
+	vaesenc	%xmm15,%xmm8,%xmm8
+
+	vpaddd	eight(%rip),%xmm11,%xmm11
+	vmovdqu	112(%rcx),%xmm15
+	vaesenc	%xmm15,%xmm1,%xmm1
+	vaesenc	%xmm15,%xmm2,%xmm2
+	vaesenc	%xmm15,%xmm3,%xmm3
+	vaesenc	%xmm15,%xmm4,%xmm4
+	vaesenc	%xmm15,%xmm5,%xmm5
+	vaesenc	%xmm15,%xmm6,%xmm6
+	vaesenc	%xmm15,%xmm7,%xmm7
+	vaesenc	%xmm15,%xmm8,%xmm8
+
+	vpaddd	eight(%rip),%xmm12,%xmm12
+	vmovdqu	128(%rcx),%xmm15
+	vaesenc	%xmm15,%xmm1,%xmm1
+	vaesenc	%xmm15,%xmm2,%xmm2
+	vaesenc	%xmm15,%xmm3,%xmm3
+	vaesenc	%xmm15,%xmm4,%xmm4
+	vaesenc	%xmm15,%xmm5,%xmm5
+	vaesenc	%xmm15,%xmm6,%xmm6
+	vaesenc	%xmm15,%xmm7,%xmm7
+	vaesenc	%xmm15,%xmm8,%xmm8
+
+	vpaddd	eight(%rip),%xmm13,%xmm13
+	vmovdqu	144(%rcx),%xmm15
+	vaesenc	%xmm15,%xmm1,%xmm1
+	vaesenc	%xmm15,%xmm2,%xmm2
+	vaesenc	%xmm15,%xmm3,%xmm3
+	vaesenc	%xmm15,%xmm4,%xmm4
+	vaesenc	%xmm15,%xmm5,%xmm5
+	vaesenc	%xmm15,%xmm6,%xmm6
+	vaesenc	%xmm15,%xmm7,%xmm7
+	vaesenc	%xmm15,%xmm8,%xmm8
+
+	vmovdqu	160(%rcx),%xmm15
+	vaesenclast	%xmm15,%xmm1,%xmm1
+	vaesenclast	%xmm15,%xmm2,%xmm2
+	vaesenclast	%xmm15,%xmm3,%xmm3
+	vaesenclast	%xmm15,%xmm4,%xmm4
+	vaesenclast	%xmm15,%xmm5,%xmm5
+	vaesenclast	%xmm15,%xmm6,%xmm6
+	vaesenclast	%xmm15,%xmm7,%xmm7
+	vaesenclast	%xmm15,%xmm8,%xmm8
+
+
+
+	vpxor	0(%rdi),%xmm1,%xmm1
+	vpxor	16(%rdi),%xmm2,%xmm2
+	vpxor	32(%rdi),%xmm3,%xmm3
+	vpxor	48(%rdi),%xmm4,%xmm4
+	vpxor	64(%rdi),%xmm5,%xmm5
+	vpxor	80(%rdi),%xmm6,%xmm6
+	vpxor	96(%rdi),%xmm7,%xmm7
+	vpxor	112(%rdi),%xmm8,%xmm8
+
+	decq	%r8
+
+	vmovdqu	%xmm1,0(%rsi)
+	vmovdqu	%xmm2,16(%rsi)
+	vmovdqu	%xmm3,32(%rsi)
+	vmovdqu	%xmm4,48(%rsi)
+	vmovdqu	%xmm5,64(%rsi)
+	vmovdqu	%xmm6,80(%rsi)
+	vmovdqu	%xmm7,96(%rsi)
+	vmovdqu	%xmm8,112(%rsi)
+
+	jne	.L128_enc_msg_x8_loop1
+
+	addq	$128,%rsi
+	addq	$128,%rdi
+
+.L128_enc_msg_x8_check_remainder:
+	cmpq	$0,%r10
+	je	.L128_enc_msg_x8_out
+
+.L128_enc_msg_x8_loop2:
+
+
+	vmovdqa	%xmm0,%xmm1
+	vpaddd	one(%rip),%xmm0,%xmm0
+
+	vpxor	(%rcx),%xmm1,%xmm1
+	vaesenc	16(%rcx),%xmm1,%xmm1
+	vaesenc	32(%rcx),%xmm1,%xmm1
+	vaesenc	48(%rcx),%xmm1,%xmm1
+	vaesenc	64(%rcx),%xmm1,%xmm1
+	vaesenc	80(%rcx),%xmm1,%xmm1
+	vaesenc	96(%rcx),%xmm1,%xmm1
+	vaesenc	112(%rcx),%xmm1,%xmm1
+	vaesenc	128(%rcx),%xmm1,%xmm1
+	vaesenc	144(%rcx),%xmm1,%xmm1
+	vaesenclast	160(%rcx),%xmm1,%xmm1
+
+
+	vpxor	(%rdi),%xmm1,%xmm1
+
+	vmovdqu	%xmm1,(%rsi)
+
+	addq	$16,%rdi
+	addq	$16,%rsi
+
+	decq	%r10
+	jne	.L128_enc_msg_x8_loop2
+
+.L128_enc_msg_x8_out:
+	movq	%rbp,%rsp
+.cfi_def_cfa_register	%rsp
+	popq	%rbp
+.cfi_adjust_cfa_offset	-8
+.cfi_restore	%rbp
+	popq	%r13
+.cfi_adjust_cfa_offset	-8
+.cfi_restore	%r13
+	popq	%r12
+.cfi_adjust_cfa_offset	-8
+.cfi_restore	%r12
+	.byte	0xf3,0xc3
+.cfi_endproc	
+.size	aes128gcmsiv_enc_msg_x8,.-aes128gcmsiv_enc_msg_x8
+.globl	aes128gcmsiv_dec
+.hidden aes128gcmsiv_dec
+.type	aes128gcmsiv_dec,@function
+.align	16
+aes128gcmsiv_dec:
+.cfi_startproc	
+	testq	$~15,%r9
+	jnz	.L128_dec_start
+	.byte	0xf3,0xc3
+
+.L128_dec_start:
+	vzeroupper
+	vmovdqa	(%rdx),%xmm0
+	movq	%rdx,%rax
+
+	leaq	32(%rax),%rax
+	leaq	32(%rcx),%rcx
+
+
+	vmovdqu	(%rdi,%r9,1),%xmm15
+	vpor	OR_MASK(%rip),%xmm15,%xmm15
+	andq	$~15,%r9
+
+
+	cmpq	$96,%r9
+	jb	.L128_dec_loop2
+
+
+	subq	$96,%r9
+	vmovdqa	%xmm15,%xmm7
+	vpaddd	one(%rip),%xmm7,%xmm8
+	vpaddd	two(%rip),%xmm7,%xmm9
+	vpaddd	one(%rip),%xmm9,%xmm10
+	vpaddd	two(%rip),%xmm9,%xmm11
+	vpaddd	one(%rip),%xmm11,%xmm12
+	vpaddd	two(%rip),%xmm11,%xmm15
+
+	vpxor	(%r8),%xmm7,%xmm7
+	vpxor	(%r8),%xmm8,%xmm8
+	vpxor	(%r8),%xmm9,%xmm9
+	vpxor	(%r8),%xmm10,%xmm10
+	vpxor	(%r8),%xmm11,%xmm11
+	vpxor	(%r8),%xmm12,%xmm12
+
+	vmovdqu	16(%r8),%xmm4
+	vaesenc	%xmm4,%xmm7,%xmm7
+	vaesenc	%xmm4,%xmm8,%xmm8
+	vaesenc	%xmm4,%xmm9,%xmm9
+	vaesenc	%xmm4,%xmm10,%xmm10
+	vaesenc	%xmm4,%xmm11,%xmm11
+	vaesenc	%xmm4,%xmm12,%xmm12
+
+	vmovdqu	32(%r8),%xmm4
+	vaesenc	%xmm4,%xmm7,%xmm7
+	vaesenc	%xmm4,%xmm8,%xmm8
+	vaesenc	%xmm4,%xmm9,%xmm9
+	vaesenc	%xmm4,%xmm10,%xmm10
+	vaesenc	%xmm4,%xmm11,%xmm11
+	vaesenc	%xmm4,%xmm12,%xmm12
+
+	vmovdqu	48(%r8),%xmm4
+	vaesenc	%xmm4,%xmm7,%xmm7
+	vaesenc	%xmm4,%xmm8,%xmm8
+	vaesenc	%xmm4,%xmm9,%xmm9
+	vaesenc	%xmm4,%xmm10,%xmm10
+	vaesenc	%xmm4,%xmm11,%xmm11
+	vaesenc	%xmm4,%xmm12,%xmm12
+
+	vmovdqu	64(%r8),%xmm4
+	vaesenc	%xmm4,%xmm7,%xmm7
+	vaesenc	%xmm4,%xmm8,%xmm8
+	vaesenc	%xmm4,%xmm9,%xmm9
+	vaesenc	%xmm4,%xmm10,%xmm10
+	vaesenc	%xmm4,%xmm11,%xmm11
+	vaesenc	%xmm4,%xmm12,%xmm12
+
+	vmovdqu	80(%r8),%xmm4
+	vaesenc	%xmm4,%xmm7,%xmm7
+	vaesenc	%xmm4,%xmm8,%xmm8
+	vaesenc	%xmm4,%xmm9,%xmm9
+	vaesenc	%xmm4,%xmm10,%xmm10
+	vaesenc	%xmm4,%xmm11,%xmm11
+	vaesenc	%xmm4,%xmm12,%xmm12
+
+	vmovdqu	96(%r8),%xmm4
+	vaesenc	%xmm4,%xmm7,%xmm7
+	vaesenc	%xmm4,%xmm8,%xmm8
+	vaesenc	%xmm4,%xmm9,%xmm9
+	vaesenc	%xmm4,%xmm10,%xmm10
+	vaesenc	%xmm4,%xmm11,%xmm11
+	vaesenc	%xmm4,%xmm12,%xmm12
+
+	vmovdqu	112(%r8),%xmm4
+	vaesenc	%xmm4,%xmm7,%xmm7
+	vaesenc	%xmm4,%xmm8,%xmm8
+	vaesenc	%xmm4,%xmm9,%xmm9
+	vaesenc	%xmm4,%xmm10,%xmm10
+	vaesenc	%xmm4,%xmm11,%xmm11
+	vaesenc	%xmm4,%xmm12,%xmm12
+
+	vmovdqu	128(%r8),%xmm4
+	vaesenc	%xmm4,%xmm7,%xmm7
+	vaesenc	%xmm4,%xmm8,%xmm8
+	vaesenc	%xmm4,%xmm9,%xmm9
+	vaesenc	%xmm4,%xmm10,%xmm10
+	vaesenc	%xmm4,%xmm11,%xmm11
+	vaesenc	%xmm4,%xmm12,%xmm12
+
+	vmovdqu	144(%r8),%xmm4
+	vaesenc	%xmm4,%xmm7,%xmm7
+	vaesenc	%xmm4,%xmm8,%xmm8
+	vaesenc	%xmm4,%xmm9,%xmm9
+	vaesenc	%xmm4,%xmm10,%xmm10
+	vaesenc	%xmm4,%xmm11,%xmm11
+	vaesenc	%xmm4,%xmm12,%xmm12
+
+	vmovdqu	160(%r8),%xmm4
+	vaesenclast	%xmm4,%xmm7,%xmm7
+	vaesenclast	%xmm4,%xmm8,%xmm8
+	vaesenclast	%xmm4,%xmm9,%xmm9
+	vaesenclast	%xmm4,%xmm10,%xmm10
+	vaesenclast	%xmm4,%xmm11,%xmm11
+	vaesenclast	%xmm4,%xmm12,%xmm12
+
+
+	vpxor	0(%rdi),%xmm7,%xmm7
+	vpxor	16(%rdi),%xmm8,%xmm8
+	vpxor	32(%rdi),%xmm9,%xmm9
+	vpxor	48(%rdi),%xmm10,%xmm10
+	vpxor	64(%rdi),%xmm11,%xmm11
+	vpxor	80(%rdi),%xmm12,%xmm12
+
+	vmovdqu	%xmm7,0(%rsi)
+	vmovdqu	%xmm8,16(%rsi)
+	vmovdqu	%xmm9,32(%rsi)
+	vmovdqu	%xmm10,48(%rsi)
+	vmovdqu	%xmm11,64(%rsi)
+	vmovdqu	%xmm12,80(%rsi)
+
+	addq	$96,%rdi
+	addq	$96,%rsi
+	jmp	.L128_dec_loop1
+
+
+.align	64
+.L128_dec_loop1:
+	cmpq	$96,%r9
+	jb	.L128_dec_finish_96
+	subq	$96,%r9
+
+	vmovdqa	%xmm12,%xmm6
+	vmovdqa	%xmm11,16-32(%rax)
+	vmovdqa	%xmm10,32-32(%rax)
+	vmovdqa	%xmm9,48-32(%rax)
+	vmovdqa	%xmm8,64-32(%rax)
+	vmovdqa	%xmm7,80-32(%rax)
+
+	vmovdqa	%xmm15,%xmm7
+	vpaddd	one(%rip),%xmm7,%xmm8
+	vpaddd	two(%rip),%xmm7,%xmm9
+	vpaddd	one(%rip),%xmm9,%xmm10
+	vpaddd	two(%rip),%xmm9,%xmm11
+	vpaddd	one(%rip),%xmm11,%xmm12
+	vpaddd	two(%rip),%xmm11,%xmm15
+
+	vmovdqa	(%r8),%xmm4
+	vpxor	%xmm4,%xmm7,%xmm7
+	vpxor	%xmm4,%xmm8,%xmm8
+	vpxor	%xmm4,%xmm9,%xmm9
+	vpxor	%xmm4,%xmm10,%xmm10
+	vpxor	%xmm4,%xmm11,%xmm11
+	vpxor	%xmm4,%xmm12,%xmm12
+
+	vmovdqu	0-32(%rcx),%xmm4
+	vpclmulqdq	$0x11,%xmm4,%xmm6,%xmm2
+	vpclmulqdq	$0x00,%xmm4,%xmm6,%xmm3
+	vpclmulqdq	$0x01,%xmm4,%xmm6,%xmm1
+	vpclmulqdq	$0x10,%xmm4,%xmm6,%xmm4
+	vpxor	%xmm4,%xmm1,%xmm1
+
+	vmovdqu	16(%r8),%xmm4
+	vaesenc	%xmm4,%xmm7,%xmm7
+	vaesenc	%xmm4,%xmm8,%xmm8
+	vaesenc	%xmm4,%xmm9,%xmm9
+	vaesenc	%xmm4,%xmm10,%xmm10
+	vaesenc	%xmm4,%xmm11,%xmm11
+	vaesenc	%xmm4,%xmm12,%xmm12
+
+	vmovdqu	-16(%rax),%xmm6
+	vmovdqu	-16(%rcx),%xmm13
+
+	vpclmulqdq	$0x10,%xmm13,%xmm6,%xmm4
+	vpxor	%xmm4,%xmm1,%xmm1
+	vpclmulqdq	$0x11,%xmm13,%xmm6,%xmm4
+	vpxor	%xmm4,%xmm2,%xmm2
+	vpclmulqdq	$0x00,%xmm13,%xmm6,%xmm4
+	vpxor	%xmm4,%xmm3,%xmm3
+	vpclmulqdq	$0x01,%xmm13,%xmm6,%xmm4
+	vpxor	%xmm4,%xmm1,%xmm1
+
+
+	vmovdqu	32(%r8),%xmm4
+	vaesenc	%xmm4,%xmm7,%xmm7
+	vaesenc	%xmm4,%xmm8,%xmm8
+	vaesenc	%xmm4,%xmm9,%xmm9
+	vaesenc	%xmm4,%xmm10,%xmm10
+	vaesenc	%xmm4,%xmm11,%xmm11
+	vaesenc	%xmm4,%xmm12,%xmm12
+
+	vmovdqu	0(%rax),%xmm6
+	vmovdqu	0(%rcx),%xmm13
+
+	vpclmulqdq	$0x10,%xmm13,%xmm6,%xmm4
+	vpxor	%xmm4,%xmm1,%xmm1
+	vpclmulqdq	$0x11,%xmm13,%xmm6,%xmm4
+	vpxor	%xmm4,%xmm2,%xmm2
+	vpclmulqdq	$0x00,%xmm13,%xmm6,%xmm4
+	vpxor	%xmm4,%xmm3,%xmm3
+	vpclmulqdq	$0x01,%xmm13,%xmm6,%xmm4
+	vpxor	%xmm4,%xmm1,%xmm1
+
+
+	vmovdqu	48(%r8),%xmm4
+	vaesenc	%xmm4,%xmm7,%xmm7
+	vaesenc	%xmm4,%xmm8,%xmm8
+	vaesenc	%xmm4,%xmm9,%xmm9
+	vaesenc	%xmm4,%xmm10,%xmm10
+	vaesenc	%xmm4,%xmm11,%xmm11
+	vaesenc	%xmm4,%xmm12,%xmm12
+
+	vmovdqu	16(%rax),%xmm6
+	vmovdqu	16(%rcx),%xmm13
+
+	vpclmulqdq	$0x10,%xmm13,%xmm6,%xmm4
+	vpxor	%xmm4,%xmm1,%xmm1
+	vpclmulqdq	$0x11,%xmm13,%xmm6,%xmm4
+	vpxor	%xmm4,%xmm2,%xmm2
+	vpclmulqdq	$0x00,%xmm13,%xmm6,%xmm4
+	vpxor	%xmm4,%xmm3,%xmm3
+	vpclmulqdq	$0x01,%xmm13,%xmm6,%xmm4
+	vpxor	%xmm4,%xmm1,%xmm1
+
+
+	vmovdqu	64(%r8),%xmm4
+	vaesenc	%xmm4,%xmm7,%xmm7
+	vaesenc	%xmm4,%xmm8,%xmm8
+	vaesenc	%xmm4,%xmm9,%xmm9
+	vaesenc	%xmm4,%xmm10,%xmm10
+	vaesenc	%xmm4,%xmm11,%xmm11
+	vaesenc	%xmm4,%xmm12,%xmm12
+
+	vmovdqu	32(%rax),%xmm6
+	vmovdqu	32(%rcx),%xmm13
+
+	vpclmulqdq	$0x10,%xmm13,%xmm6,%xmm4
+	vpxor	%xmm4,%xmm1,%xmm1
+	vpclmulqdq	$0x11,%xmm13,%xmm6,%xmm4
+	vpxor	%xmm4,%xmm2,%xmm2
+	vpclmulqdq	$0x00,%xmm13,%xmm6,%xmm4
+	vpxor	%xmm4,%xmm3,%xmm3
+	vpclmulqdq	$0x01,%xmm13,%xmm6,%xmm4
+	vpxor	%xmm4,%xmm1,%xmm1
+
+
+	vmovdqu	80(%r8),%xmm4
+	vaesenc	%xmm4,%xmm7,%xmm7
+	vaesenc	%xmm4,%xmm8,%xmm8
+	vaesenc	%xmm4,%xmm9,%xmm9
+	vaesenc	%xmm4,%xmm10,%xmm10
+	vaesenc	%xmm4,%xmm11,%xmm11
+	vaesenc	%xmm4,%xmm12,%xmm12
+
+	vmovdqu	96(%r8),%xmm4
+	vaesenc	%xmm4,%xmm7,%xmm7
+	vaesenc	%xmm4,%xmm8,%xmm8
+	vaesenc	%xmm4,%xmm9,%xmm9
+	vaesenc	%xmm4,%xmm10,%xmm10
+	vaesenc	%xmm4,%xmm11,%xmm11
+	vaesenc	%xmm4,%xmm12,%xmm12
+
+	vmovdqu	112(%r8),%xmm4
+	vaesenc	%xmm4,%xmm7,%xmm7
+	vaesenc	%xmm4,%xmm8,%xmm8
+	vaesenc	%xmm4,%xmm9,%xmm9
+	vaesenc	%xmm4,%xmm10,%xmm10
+	vaesenc	%xmm4,%xmm11,%xmm11
+	vaesenc	%xmm4,%xmm12,%xmm12
+
+
+	vmovdqa	80-32(%rax),%xmm6
+	vpxor	%xmm0,%xmm6,%xmm6
+	vmovdqu	80-32(%rcx),%xmm5
+
+	vpclmulqdq	$0x01,%xmm5,%xmm6,%xmm4
+	vpxor	%xmm4,%xmm1,%xmm1
+	vpclmulqdq	$0x11,%xmm5,%xmm6,%xmm4
+	vpxor	%xmm4,%xmm2,%xmm2
+	vpclmulqdq	$0x00,%xmm5,%xmm6,%xmm4
+	vpxor	%xmm4,%xmm3,%xmm3
+	vpclmulqdq	$0x10,%xmm5,%xmm6,%xmm4
+	vpxor	%xmm4,%xmm1,%xmm1
+
+	vmovdqu	128(%r8),%xmm4
+	vaesenc	%xmm4,%xmm7,%xmm7
+	vaesenc	%xmm4,%xmm8,%xmm8
+	vaesenc	%xmm4,%xmm9,%xmm9
+	vaesenc	%xmm4,%xmm10,%xmm10
+	vaesenc	%xmm4,%xmm11,%xmm11
+	vaesenc	%xmm4,%xmm12,%xmm12
+
+
+	vpsrldq	$8,%xmm1,%xmm4
+	vpxor	%xmm4,%xmm2,%xmm5
+	vpslldq	$8,%xmm1,%xmm4
+	vpxor	%xmm4,%xmm3,%xmm0
+
+	vmovdqa	poly(%rip),%xmm3
+
+	vmovdqu	144(%r8),%xmm4
+	vaesenc	%xmm4,%xmm7,%xmm7
+	vaesenc	%xmm4,%xmm8,%xmm8
+	vaesenc	%xmm4,%xmm9,%xmm9
+	vaesenc	%xmm4,%xmm10,%xmm10
+	vaesenc	%xmm4,%xmm11,%xmm11
+	vaesenc	%xmm4,%xmm12,%xmm12
+
+	vmovdqu	160(%r8),%xmm6
+	vpalignr	$8,%xmm0,%xmm0,%xmm2
+	vpclmulqdq	$0x10,%xmm3,%xmm0,%xmm0
+	vpxor	%xmm0,%xmm2,%xmm0
+
+	vpxor	0(%rdi),%xmm6,%xmm4
+	vaesenclast	%xmm4,%xmm7,%xmm7
+	vpxor	16(%rdi),%xmm6,%xmm4
+	vaesenclast	%xmm4,%xmm8,%xmm8
+	vpxor	32(%rdi),%xmm6,%xmm4
+	vaesenclast	%xmm4,%xmm9,%xmm9
+	vpxor	48(%rdi),%xmm6,%xmm4
+	vaesenclast	%xmm4,%xmm10,%xmm10
+	vpxor	64(%rdi),%xmm6,%xmm4
+	vaesenclast	%xmm4,%xmm11,%xmm11
+	vpxor	80(%rdi),%xmm6,%xmm4
+	vaesenclast	%xmm4,%xmm12,%xmm12
+
+	vpalignr	$8,%xmm0,%xmm0,%xmm2
+	vpclmulqdq	$0x10,%xmm3,%xmm0,%xmm0
+	vpxor	%xmm0,%xmm2,%xmm0
+
+	vmovdqu	%xmm7,0(%rsi)
+	vmovdqu	%xmm8,16(%rsi)
+	vmovdqu	%xmm9,32(%rsi)
+	vmovdqu	%xmm10,48(%rsi)
+	vmovdqu	%xmm11,64(%rsi)
+	vmovdqu	%xmm12,80(%rsi)
+
+	vpxor	%xmm5,%xmm0,%xmm0
+
+	leaq	96(%rdi),%rdi
+	leaq	96(%rsi),%rsi
+	jmp	.L128_dec_loop1
+
+.L128_dec_finish_96:
+	vmovdqa	%xmm12,%xmm6
+	vmovdqa	%xmm11,16-32(%rax)
+	vmovdqa	%xmm10,32-32(%rax)
+	vmovdqa	%xmm9,48-32(%rax)
+	vmovdqa	%xmm8,64-32(%rax)
+	vmovdqa	%xmm7,80-32(%rax)
+
+	vmovdqu	0-32(%rcx),%xmm4
+	vpclmulqdq	$0x10,%xmm4,%xmm6,%xmm1
+	vpclmulqdq	$0x11,%xmm4,%xmm6,%xmm2
+	vpclmulqdq	$0x00,%xmm4,%xmm6,%xmm3
+	vpclmulqdq	$0x01,%xmm4,%xmm6,%xmm4
+	vpxor	%xmm4,%xmm1,%xmm1
+
+	vmovdqu	-16(%rax),%xmm6
+	vmovdqu	-16(%rcx),%xmm13
+
+	vpclmulqdq	$0x10,%xmm13,%xmm6,%xmm4
+	vpxor	%xmm4,%xmm1,%xmm1
+	vpclmulqdq	$0x11,%xmm13,%xmm6,%xmm4
+	vpxor	%xmm4,%xmm2,%xmm2
+	vpclmulqdq	$0x00,%xmm13,%xmm6,%xmm4
+	vpxor	%xmm4,%xmm3,%xmm3
+	vpclmulqdq	$0x01,%xmm13,%xmm6,%xmm4
+	vpxor	%xmm4,%xmm1,%xmm1
+
+	vmovdqu	0(%rax),%xmm6
+	vmovdqu	0(%rcx),%xmm13
+
+	vpclmulqdq	$0x10,%xmm13,%xmm6,%xmm4
+	vpxor	%xmm4,%xmm1,%xmm1
+	vpclmulqdq	$0x11,%xmm13,%xmm6,%xmm4
+	vpxor	%xmm4,%xmm2,%xmm2
+	vpclmulqdq	$0x00,%xmm13,%xmm6,%xmm4
+	vpxor	%xmm4,%xmm3,%xmm3
+	vpclmulqdq	$0x01,%xmm13,%xmm6,%xmm4
+	vpxor	%xmm4,%xmm1,%xmm1
+
+	vmovdqu	16(%rax),%xmm6
+	vmovdqu	16(%rcx),%xmm13
+
+	vpclmulqdq	$0x10,%xmm13,%xmm6,%xmm4
+	vpxor	%xmm4,%xmm1,%xmm1
+	vpclmulqdq	$0x11,%xmm13,%xmm6,%xmm4
+	vpxor	%xmm4,%xmm2,%xmm2
+	vpclmulqdq	$0x00,%xmm13,%xmm6,%xmm4
+	vpxor	%xmm4,%xmm3,%xmm3
+	vpclmulqdq	$0x01,%xmm13,%xmm6,%xmm4
+	vpxor	%xmm4,%xmm1,%xmm1
+
+	vmovdqu	32(%rax),%xmm6
+	vmovdqu	32(%rcx),%xmm13
+
+	vpclmulqdq	$0x10,%xmm13,%xmm6,%xmm4
+	vpxor	%xmm4,%xmm1,%xmm1
+	vpclmulqdq	$0x11,%xmm13,%xmm6,%xmm4
+	vpxor	%xmm4,%xmm2,%xmm2
+	vpclmulqdq	$0x00,%xmm13,%xmm6,%xmm4
+	vpxor	%xmm4,%xmm3,%xmm3
+	vpclmulqdq	$0x01,%xmm13,%xmm6,%xmm4
+	vpxor	%xmm4,%xmm1,%xmm1
+
+
+	vmovdqu	80-32(%rax),%xmm6
+	vpxor	%xmm0,%xmm6,%xmm6
+	vmovdqu	80-32(%rcx),%xmm5
+	vpclmulqdq	$0x11,%xmm5,%xmm6,%xmm4
+	vpxor	%xmm4,%xmm2,%xmm2
+	vpclmulqdq	$0x00,%xmm5,%xmm6,%xmm4
+	vpxor	%xmm4,%xmm3,%xmm3
+	vpclmulqdq	$0x10,%xmm5,%xmm6,%xmm4
+	vpxor	%xmm4,%xmm1,%xmm1
+	vpclmulqdq	$0x01,%xmm5,%xmm6,%xmm4
+	vpxor	%xmm4,%xmm1,%xmm1
+
+	vpsrldq	$8,%xmm1,%xmm4
+	vpxor	%xmm4,%xmm2,%xmm5
+	vpslldq	$8,%xmm1,%xmm4
+	vpxor	%xmm4,%xmm3,%xmm0
+
+	vmovdqa	poly(%rip),%xmm3
+
+	vpalignr	$8,%xmm0,%xmm0,%xmm2
+	vpclmulqdq	$0x10,%xmm3,%xmm0,%xmm0
+	vpxor	%xmm0,%xmm2,%xmm0
+
+	vpalignr	$8,%xmm0,%xmm0,%xmm2
+	vpclmulqdq	$0x10,%xmm3,%xmm0,%xmm0
+	vpxor	%xmm0,%xmm2,%xmm0
+
+	vpxor	%xmm5,%xmm0,%xmm0
+
+.L128_dec_loop2:
+
+
+
+	cmpq	$16,%r9
+	jb	.L128_dec_out
+	subq	$16,%r9
+
+	vmovdqa	%xmm15,%xmm2
+	vpaddd	one(%rip),%xmm15,%xmm15
+
+	vpxor	0(%r8),%xmm2,%xmm2
+	vaesenc	16(%r8),%xmm2,%xmm2
+	vaesenc	32(%r8),%xmm2,%xmm2
+	vaesenc	48(%r8),%xmm2,%xmm2
+	vaesenc	64(%r8),%xmm2,%xmm2
+	vaesenc	80(%r8),%xmm2,%xmm2
+	vaesenc	96(%r8),%xmm2,%xmm2
+	vaesenc	112(%r8),%xmm2,%xmm2
+	vaesenc	128(%r8),%xmm2,%xmm2
+	vaesenc	144(%r8),%xmm2,%xmm2
+	vaesenclast	160(%r8),%xmm2,%xmm2
+	vpxor	(%rdi),%xmm2,%xmm2
+	vmovdqu	%xmm2,(%rsi)
+	addq	$16,%rdi
+	addq	$16,%rsi
+
+	vpxor	%xmm2,%xmm0,%xmm0
+	vmovdqa	-32(%rcx),%xmm1
+	call	GFMUL
+
+	jmp	.L128_dec_loop2
+
+.L128_dec_out:
+	vmovdqu	%xmm0,(%rdx)
+	.byte	0xf3,0xc3
+.cfi_endproc	
+.size	aes128gcmsiv_dec, .-aes128gcmsiv_dec
+.globl	aes128gcmsiv_ecb_enc_block
+.hidden aes128gcmsiv_ecb_enc_block
+.type	aes128gcmsiv_ecb_enc_block,@function
+.align	16
+aes128gcmsiv_ecb_enc_block:
+.cfi_startproc	
+	vmovdqa	(%rdi),%xmm1
+
+	vpxor	(%rdx),%xmm1,%xmm1
+	vaesenc	16(%rdx),%xmm1,%xmm1
+	vaesenc	32(%rdx),%xmm1,%xmm1
+	vaesenc	48(%rdx),%xmm1,%xmm1
+	vaesenc	64(%rdx),%xmm1,%xmm1
+	vaesenc	80(%rdx),%xmm1,%xmm1
+	vaesenc	96(%rdx),%xmm1,%xmm1
+	vaesenc	112(%rdx),%xmm1,%xmm1
+	vaesenc	128(%rdx),%xmm1,%xmm1
+	vaesenc	144(%rdx),%xmm1,%xmm1
+	vaesenclast	160(%rdx),%xmm1,%xmm1
+
+	vmovdqa	%xmm1,(%rsi)
+
+	.byte	0xf3,0xc3
+.cfi_endproc	
+.size	aes128gcmsiv_ecb_enc_block,.-aes128gcmsiv_ecb_enc_block
+.globl	aes256gcmsiv_aes_ks_enc_x1
+.hidden aes256gcmsiv_aes_ks_enc_x1
+.type	aes256gcmsiv_aes_ks_enc_x1,@function
+.align	16
+aes256gcmsiv_aes_ks_enc_x1:
+.cfi_startproc	
+	vmovdqa	con1(%rip),%xmm0
+	vmovdqa	mask(%rip),%xmm15
+	vmovdqa	(%rdi),%xmm8
+	vmovdqa	(%rcx),%xmm1
+	vmovdqa	16(%rcx),%xmm3
+	vpxor	%xmm1,%xmm8,%xmm8
+	vaesenc	%xmm3,%xmm8,%xmm8
+	vmovdqu	%xmm1,(%rdx)
+	vmovdqu	%xmm3,16(%rdx)
+	vpxor	%xmm14,%xmm14,%xmm14
+
+	vpshufb	%xmm15,%xmm3,%xmm2
+	vaesenclast	%xmm0,%xmm2,%xmm2
+	vpslld	$1,%xmm0,%xmm0
+	vpslldq	$4,%xmm1,%xmm4
+	vpxor	%xmm4,%xmm1,%xmm1
+	vpslldq	$4,%xmm4,%xmm4
+	vpxor	%xmm4,%xmm1,%xmm1
+	vpslldq	$4,%xmm4,%xmm4
+	vpxor	%xmm4,%xmm1,%xmm1
+	vpxor	%xmm2,%xmm1,%xmm1
+	vaesenc	%xmm1,%xmm8,%xmm8
+	vmovdqu	%xmm1,32(%rdx)
+
+	vpshufd	$0xff,%xmm1,%xmm2
+	vaesenclast	%xmm14,%xmm2,%xmm2
+	vpslldq	$4,%xmm3,%xmm4
+	vpxor	%xmm4,%xmm3,%xmm3
+	vpslldq	$4,%xmm4,%xmm4
+	vpxor	%xmm4,%xmm3,%xmm3
+	vpslldq	$4,%xmm4,%xmm4
+	vpxor	%xmm4,%xmm3,%xmm3
+	vpxor	%xmm2,%xmm3,%xmm3
+	vaesenc	%xmm3,%xmm8,%xmm8
+	vmovdqu	%xmm3,48(%rdx)
+
+	vpshufb	%xmm15,%xmm3,%xmm2
+	vaesenclast	%xmm0,%xmm2,%xmm2
+	vpslld	$1,%xmm0,%xmm0
+	vpslldq	$4,%xmm1,%xmm4
+	vpxor	%xmm4,%xmm1,%xmm1
+	vpslldq	$4,%xmm4,%xmm4
+	vpxor	%xmm4,%xmm1,%xmm1
+	vpslldq	$4,%xmm4,%xmm4
+	vpxor	%xmm4,%xmm1,%xmm1
+	vpxor	%xmm2,%xmm1,%xmm1
+	vaesenc	%xmm1,%xmm8,%xmm8
+	vmovdqu	%xmm1,64(%rdx)
+
+	vpshufd	$0xff,%xmm1,%xmm2
+	vaesenclast	%xmm14,%xmm2,%xmm2
+	vpslldq	$4,%xmm3,%xmm4
+	vpxor	%xmm4,%xmm3,%xmm3
+	vpslldq	$4,%xmm4,%xmm4
+	vpxor	%xmm4,%xmm3,%xmm3
+	vpslldq	$4,%xmm4,%xmm4
+	vpxor	%xmm4,%xmm3,%xmm3
+	vpxor	%xmm2,%xmm3,%xmm3
+	vaesenc	%xmm3,%xmm8,%xmm8
+	vmovdqu	%xmm3,80(%rdx)
+
+	vpshufb	%xmm15,%xmm3,%xmm2
+	vaesenclast	%xmm0,%xmm2,%xmm2
+	vpslld	$1,%xmm0,%xmm0
+	vpslldq	$4,%xmm1,%xmm4
+	vpxor	%xmm4,%xmm1,%xmm1
+	vpslldq	$4,%xmm4,%xmm4
+	vpxor	%xmm4,%xmm1,%xmm1
+	vpslldq	$4,%xmm4,%xmm4
+	vpxor	%xmm4,%xmm1,%xmm1
+	vpxor	%xmm2,%xmm1,%xmm1
+	vaesenc	%xmm1,%xmm8,%xmm8
+	vmovdqu	%xmm1,96(%rdx)
+
+	vpshufd	$0xff,%xmm1,%xmm2
+	vaesenclast	%xmm14,%xmm2,%xmm2
+	vpslldq	$4,%xmm3,%xmm4
+	vpxor	%xmm4,%xmm3,%xmm3
+	vpslldq	$4,%xmm4,%xmm4
+	vpxor	%xmm4,%xmm3,%xmm3
+	vpslldq	$4,%xmm4,%xmm4
+	vpxor	%xmm4,%xmm3,%xmm3
+	vpxor	%xmm2,%xmm3,%xmm3
+	vaesenc	%xmm3,%xmm8,%xmm8
+	vmovdqu	%xmm3,112(%rdx)
+
+	vpshufb	%xmm15,%xmm3,%xmm2
+	vaesenclast	%xmm0,%xmm2,%xmm2
+	vpslld	$1,%xmm0,%xmm0
+	vpslldq	$4,%xmm1,%xmm4
+	vpxor	%xmm4,%xmm1,%xmm1
+	vpslldq	$4,%xmm4,%xmm4
+	vpxor	%xmm4,%xmm1,%xmm1
+	vpslldq	$4,%xmm4,%xmm4
+	vpxor	%xmm4,%xmm1,%xmm1
+	vpxor	%xmm2,%xmm1,%xmm1
+	vaesenc	%xmm1,%xmm8,%xmm8
+	vmovdqu	%xmm1,128(%rdx)
+
+	vpshufd	$0xff,%xmm1,%xmm2
+	vaesenclast	%xmm14,%xmm2,%xmm2
+	vpslldq	$4,%xmm3,%xmm4
+	vpxor	%xmm4,%xmm3,%xmm3
+	vpslldq	$4,%xmm4,%xmm4
+	vpxor	%xmm4,%xmm3,%xmm3
+	vpslldq	$4,%xmm4,%xmm4
+	vpxor	%xmm4,%xmm3,%xmm3
+	vpxor	%xmm2,%xmm3,%xmm3
+	vaesenc	%xmm3,%xmm8,%xmm8
+	vmovdqu	%xmm3,144(%rdx)
+
+	vpshufb	%xmm15,%xmm3,%xmm2
+	vaesenclast	%xmm0,%xmm2,%xmm2
+	vpslld	$1,%xmm0,%xmm0
+	vpslldq	$4,%xmm1,%xmm4
+	vpxor	%xmm4,%xmm1,%xmm1
+	vpslldq	$4,%xmm4,%xmm4
+	vpxor	%xmm4,%xmm1,%xmm1
+	vpslldq	$4,%xmm4,%xmm4
+	vpxor	%xmm4,%xmm1,%xmm1
+	vpxor	%xmm2,%xmm1,%xmm1
+	vaesenc	%xmm1,%xmm8,%xmm8
+	vmovdqu	%xmm1,160(%rdx)
+
+	vpshufd	$0xff,%xmm1,%xmm2
+	vaesenclast	%xmm14,%xmm2,%xmm2
+	vpslldq	$4,%xmm3,%xmm4
+	vpxor	%xmm4,%xmm3,%xmm3
+	vpslldq	$4,%xmm4,%xmm4
+	vpxor	%xmm4,%xmm3,%xmm3
+	vpslldq	$4,%xmm4,%xmm4
+	vpxor	%xmm4,%xmm3,%xmm3
+	vpxor	%xmm2,%xmm3,%xmm3
+	vaesenc	%xmm3,%xmm8,%xmm8
+	vmovdqu	%xmm3,176(%rdx)
+
+	vpshufb	%xmm15,%xmm3,%xmm2
+	vaesenclast	%xmm0,%xmm2,%xmm2
+	vpslld	$1,%xmm0,%xmm0
+	vpslldq	$4,%xmm1,%xmm4
+	vpxor	%xmm4,%xmm1,%xmm1
+	vpslldq	$4,%xmm4,%xmm4
+	vpxor	%xmm4,%xmm1,%xmm1
+	vpslldq	$4,%xmm4,%xmm4
+	vpxor	%xmm4,%xmm1,%xmm1
+	vpxor	%xmm2,%xmm1,%xmm1
+	vaesenc	%xmm1,%xmm8,%xmm8
+	vmovdqu	%xmm1,192(%rdx)
+
+	vpshufd	$0xff,%xmm1,%xmm2
+	vaesenclast	%xmm14,%xmm2,%xmm2
+	vpslldq	$4,%xmm3,%xmm4
+	vpxor	%xmm4,%xmm3,%xmm3
+	vpslldq	$4,%xmm4,%xmm4
+	vpxor	%xmm4,%xmm3,%xmm3
+	vpslldq	$4,%xmm4,%xmm4
+	vpxor	%xmm4,%xmm3,%xmm3
+	vpxor	%xmm2,%xmm3,%xmm3
+	vaesenc	%xmm3,%xmm8,%xmm8
+	vmovdqu	%xmm3,208(%rdx)
+
+	vpshufb	%xmm15,%xmm3,%xmm2
+	vaesenclast	%xmm0,%xmm2,%xmm2
+	vpslldq	$4,%xmm1,%xmm4
+	vpxor	%xmm4,%xmm1,%xmm1
+	vpslldq	$4,%xmm4,%xmm4
+	vpxor	%xmm4,%xmm1,%xmm1
+	vpslldq	$4,%xmm4,%xmm4
+	vpxor	%xmm4,%xmm1,%xmm1
+	vpxor	%xmm2,%xmm1,%xmm1
+	vaesenclast	%xmm1,%xmm8,%xmm8
+	vmovdqu	%xmm1,224(%rdx)
+
+	vmovdqa	%xmm8,(%rsi)
+	.byte	0xf3,0xc3
+.cfi_endproc	
+.size	aes256gcmsiv_aes_ks_enc_x1,.-aes256gcmsiv_aes_ks_enc_x1
+.globl	aes256gcmsiv_ecb_enc_block
+.hidden aes256gcmsiv_ecb_enc_block
+.type	aes256gcmsiv_ecb_enc_block,@function
+.align	16
+aes256gcmsiv_ecb_enc_block:
+.cfi_startproc	
+	vmovdqa	(%rdi),%xmm1
+	vpxor	(%rdx),%xmm1,%xmm1
+	vaesenc	16(%rdx),%xmm1,%xmm1
+	vaesenc	32(%rdx),%xmm1,%xmm1
+	vaesenc	48(%rdx),%xmm1,%xmm1
+	vaesenc	64(%rdx),%xmm1,%xmm1
+	vaesenc	80(%rdx),%xmm1,%xmm1
+	vaesenc	96(%rdx),%xmm1,%xmm1
+	vaesenc	112(%rdx),%xmm1,%xmm1
+	vaesenc	128(%rdx),%xmm1,%xmm1
+	vaesenc	144(%rdx),%xmm1,%xmm1
+	vaesenc	160(%rdx),%xmm1,%xmm1
+	vaesenc	176(%rdx),%xmm1,%xmm1
+	vaesenc	192(%rdx),%xmm1,%xmm1
+	vaesenc	208(%rdx),%xmm1,%xmm1
+	vaesenclast	224(%rdx),%xmm1,%xmm1
+	vmovdqa	%xmm1,(%rsi)
+	.byte	0xf3,0xc3
+.cfi_endproc	
+.size	aes256gcmsiv_ecb_enc_block,.-aes256gcmsiv_ecb_enc_block
+.globl	aes256gcmsiv_enc_msg_x4
+.hidden aes256gcmsiv_enc_msg_x4
+.type	aes256gcmsiv_enc_msg_x4,@function
+.align	16
+aes256gcmsiv_enc_msg_x4:
+.cfi_startproc	
+	testq	%r8,%r8
+	jnz	.L256_enc_msg_x4_start
+	.byte	0xf3,0xc3
+
+.L256_enc_msg_x4_start:
+	movq	%r8,%r10
+	shrq	$4,%r8
+	shlq	$60,%r10
+	jz	.L256_enc_msg_x4_start2
+	addq	$1,%r8
+
+.L256_enc_msg_x4_start2:
+	movq	%r8,%r10
+	shlq	$62,%r10
+	shrq	$62,%r10
+
+
+	vmovdqa	(%rdx),%xmm15
+	vpor	OR_MASK(%rip),%xmm15,%xmm15
+
+	vmovdqa	four(%rip),%xmm4
+	vmovdqa	%xmm15,%xmm0
+	vpaddd	one(%rip),%xmm15,%xmm1
+	vpaddd	two(%rip),%xmm15,%xmm2
+	vpaddd	three(%rip),%xmm15,%xmm3
+
+	shrq	$2,%r8
+	je	.L256_enc_msg_x4_check_remainder
+
+	subq	$64,%rsi
+	subq	$64,%rdi
+
+.L256_enc_msg_x4_loop1:
+	addq	$64,%rsi
+	addq	$64,%rdi
+
+	vmovdqa	%xmm0,%xmm5
+	vmovdqa	%xmm1,%xmm6
+	vmovdqa	%xmm2,%xmm7
+	vmovdqa	%xmm3,%xmm8
+
+	vpxor	(%rcx),%xmm5,%xmm5
+	vpxor	(%rcx),%xmm6,%xmm6
+	vpxor	(%rcx),%xmm7,%xmm7
+	vpxor	(%rcx),%xmm8,%xmm8
+
+	vmovdqu	16(%rcx),%xmm12
+	vaesenc	%xmm12,%xmm5,%xmm5
+	vaesenc	%xmm12,%xmm6,%xmm6
+	vaesenc	%xmm12,%xmm7,%xmm7
+	vaesenc	%xmm12,%xmm8,%xmm8
+
+	vpaddd	%xmm4,%xmm0,%xmm0
+	vmovdqu	32(%rcx),%xmm12
+	vaesenc	%xmm12,%xmm5,%xmm5
+	vaesenc	%xmm12,%xmm6,%xmm6
+	vaesenc	%xmm12,%xmm7,%xmm7
+	vaesenc	%xmm12,%xmm8,%xmm8
+
+	vpaddd	%xmm4,%xmm1,%xmm1
+	vmovdqu	48(%rcx),%xmm12
+	vaesenc	%xmm12,%xmm5,%xmm5
+	vaesenc	%xmm12,%xmm6,%xmm6
+	vaesenc	%xmm12,%xmm7,%xmm7
+	vaesenc	%xmm12,%xmm8,%xmm8
+
+	vpaddd	%xmm4,%xmm2,%xmm2
+	vmovdqu	64(%rcx),%xmm12
+	vaesenc	%xmm12,%xmm5,%xmm5
+	vaesenc	%xmm12,%xmm6,%xmm6
+	vaesenc	%xmm12,%xmm7,%xmm7
+	vaesenc	%xmm12,%xmm8,%xmm8
+
+	vpaddd	%xmm4,%xmm3,%xmm3
+
+	vmovdqu	80(%rcx),%xmm12
+	vaesenc	%xmm12,%xmm5,%xmm5
+	vaesenc	%xmm12,%xmm6,%xmm6
+	vaesenc	%xmm12,%xmm7,%xmm7
+	vaesenc	%xmm12,%xmm8,%xmm8
+
+	vmovdqu	96(%rcx),%xmm12
+	vaesenc	%xmm12,%xmm5,%xmm5
+	vaesenc	%xmm12,%xmm6,%xmm6
+	vaesenc	%xmm12,%xmm7,%xmm7
+	vaesenc	%xmm12,%xmm8,%xmm8
+
+	vmovdqu	112(%rcx),%xmm12
+	vaesenc	%xmm12,%xmm5,%xmm5
+	vaesenc	%xmm12,%xmm6,%xmm6
+	vaesenc	%xmm12,%xmm7,%xmm7
+	vaesenc	%xmm12,%xmm8,%xmm8
+
+	vmovdqu	128(%rcx),%xmm12
+	vaesenc	%xmm12,%xmm5,%xmm5
+	vaesenc	%xmm12,%xmm6,%xmm6
+	vaesenc	%xmm12,%xmm7,%xmm7
+	vaesenc	%xmm12,%xmm8,%xmm8
+
+	vmovdqu	144(%rcx),%xmm12
+	vaesenc	%xmm12,%xmm5,%xmm5
+	vaesenc	%xmm12,%xmm6,%xmm6
+	vaesenc	%xmm12,%xmm7,%xmm7
+	vaesenc	%xmm12,%xmm8,%xmm8
+
+	vmovdqu	160(%rcx),%xmm12
+	vaesenc	%xmm12,%xmm5,%xmm5
+	vaesenc	%xmm12,%xmm6,%xmm6
+	vaesenc	%xmm12,%xmm7,%xmm7
+	vaesenc	%xmm12,%xmm8,%xmm8
+
+	vmovdqu	176(%rcx),%xmm12
+	vaesenc	%xmm12,%xmm5,%xmm5
+	vaesenc	%xmm12,%xmm6,%xmm6
+	vaesenc	%xmm12,%xmm7,%xmm7
+	vaesenc	%xmm12,%xmm8,%xmm8
+
+	vmovdqu	192(%rcx),%xmm12
+	vaesenc	%xmm12,%xmm5,%xmm5
+	vaesenc	%xmm12,%xmm6,%xmm6
+	vaesenc	%xmm12,%xmm7,%xmm7
+	vaesenc	%xmm12,%xmm8,%xmm8
+
+	vmovdqu	208(%rcx),%xmm12
+	vaesenc	%xmm12,%xmm5,%xmm5
+	vaesenc	%xmm12,%xmm6,%xmm6
+	vaesenc	%xmm12,%xmm7,%xmm7
+	vaesenc	%xmm12,%xmm8,%xmm8
+
+	vmovdqu	224(%rcx),%xmm12
+	vaesenclast	%xmm12,%xmm5,%xmm5
+	vaesenclast	%xmm12,%xmm6,%xmm6
+	vaesenclast	%xmm12,%xmm7,%xmm7
+	vaesenclast	%xmm12,%xmm8,%xmm8
+
+
+
+	vpxor	0(%rdi),%xmm5,%xmm5
+	vpxor	16(%rdi),%xmm6,%xmm6
+	vpxor	32(%rdi),%xmm7,%xmm7
+	vpxor	48(%rdi),%xmm8,%xmm8
+
+	subq	$1,%r8
+
+	vmovdqu	%xmm5,0(%rsi)
+	vmovdqu	%xmm6,16(%rsi)
+	vmovdqu	%xmm7,32(%rsi)
+	vmovdqu	%xmm8,48(%rsi)
+
+	jne	.L256_enc_msg_x4_loop1
+
+	addq	$64,%rsi
+	addq	$64,%rdi
+
+.L256_enc_msg_x4_check_remainder:
+	cmpq	$0,%r10
+	je	.L256_enc_msg_x4_out
+
+.L256_enc_msg_x4_loop2:
+
+
+
+	vmovdqa	%xmm0,%xmm5
+	vpaddd	one(%rip),%xmm0,%xmm0
+	vpxor	(%rcx),%xmm5,%xmm5
+	vaesenc	16(%rcx),%xmm5,%xmm5
+	vaesenc	32(%rcx),%xmm5,%xmm5
+	vaesenc	48(%rcx),%xmm5,%xmm5
+	vaesenc	64(%rcx),%xmm5,%xmm5
+	vaesenc	80(%rcx),%xmm5,%xmm5
+	vaesenc	96(%rcx),%xmm5,%xmm5
+	vaesenc	112(%rcx),%xmm5,%xmm5
+	vaesenc	128(%rcx),%xmm5,%xmm5
+	vaesenc	144(%rcx),%xmm5,%xmm5
+	vaesenc	160(%rcx),%xmm5,%xmm5
+	vaesenc	176(%rcx),%xmm5,%xmm5
+	vaesenc	192(%rcx),%xmm5,%xmm5
+	vaesenc	208(%rcx),%xmm5,%xmm5
+	vaesenclast	224(%rcx),%xmm5,%xmm5
+
+
+	vpxor	(%rdi),%xmm5,%xmm5
+
+	vmovdqu	%xmm5,(%rsi)
+
+	addq	$16,%rdi
+	addq	$16,%rsi
+
+	subq	$1,%r10
+	jne	.L256_enc_msg_x4_loop2
+
+.L256_enc_msg_x4_out:
+	.byte	0xf3,0xc3
+.cfi_endproc	
+.size	aes256gcmsiv_enc_msg_x4,.-aes256gcmsiv_enc_msg_x4
+.globl	aes256gcmsiv_enc_msg_x8
+.hidden aes256gcmsiv_enc_msg_x8
+.type	aes256gcmsiv_enc_msg_x8,@function
+.align	16
+aes256gcmsiv_enc_msg_x8:
+.cfi_startproc	
+	testq	%r8,%r8
+	jnz	.L256_enc_msg_x8_start
+	.byte	0xf3,0xc3
+
+.L256_enc_msg_x8_start:
+
+	movq	%rsp,%r11
+	subq	$16,%r11
+	andq	$-64,%r11
+
+	movq	%r8,%r10
+	shrq	$4,%r8
+	shlq	$60,%r10
+	jz	.L256_enc_msg_x8_start2
+	addq	$1,%r8
+
+.L256_enc_msg_x8_start2:
+	movq	%r8,%r10
+	shlq	$61,%r10
+	shrq	$61,%r10
+
+
+	vmovdqa	(%rdx),%xmm1
+	vpor	OR_MASK(%rip),%xmm1,%xmm1
+
+
+	vpaddd	seven(%rip),%xmm1,%xmm0
+	vmovdqa	%xmm0,(%r11)
+	vpaddd	one(%rip),%xmm1,%xmm9
+	vpaddd	two(%rip),%xmm1,%xmm10
+	vpaddd	three(%rip),%xmm1,%xmm11
+	vpaddd	four(%rip),%xmm1,%xmm12
+	vpaddd	five(%rip),%xmm1,%xmm13
+	vpaddd	six(%rip),%xmm1,%xmm14
+	vmovdqa	%xmm1,%xmm0
+
+	shrq	$3,%r8
+	jz	.L256_enc_msg_x8_check_remainder
+
+	subq	$128,%rsi
+	subq	$128,%rdi
+
+.L256_enc_msg_x8_loop1:
+	addq	$128,%rsi
+	addq	$128,%rdi
+
+	vmovdqa	%xmm0,%xmm1
+	vmovdqa	%xmm9,%xmm2
+	vmovdqa	%xmm10,%xmm3
+	vmovdqa	%xmm11,%xmm4
+	vmovdqa	%xmm12,%xmm5
+	vmovdqa	%xmm13,%xmm6
+	vmovdqa	%xmm14,%xmm7
+
+	vmovdqa	(%r11),%xmm8
+
+	vpxor	(%rcx),%xmm1,%xmm1
+	vpxor	(%rcx),%xmm2,%xmm2
+	vpxor	(%rcx),%xmm3,%xmm3
+	vpxor	(%rcx),%xmm4,%xmm4
+	vpxor	(%rcx),%xmm5,%xmm5
+	vpxor	(%rcx),%xmm6,%xmm6
+	vpxor	(%rcx),%xmm7,%xmm7
+	vpxor	(%rcx),%xmm8,%xmm8
+
+	vmovdqu	16(%rcx),%xmm15
+	vaesenc	%xmm15,%xmm1,%xmm1
+	vaesenc	%xmm15,%xmm2,%xmm2
+	vaesenc	%xmm15,%xmm3,%xmm3
+	vaesenc	%xmm15,%xmm4,%xmm4
+	vaesenc	%xmm15,%xmm5,%xmm5
+	vaesenc	%xmm15,%xmm6,%xmm6
+	vaesenc	%xmm15,%xmm7,%xmm7
+	vaesenc	%xmm15,%xmm8,%xmm8
+
+	vmovdqa	(%r11),%xmm14
+	vpaddd	eight(%rip),%xmm14,%xmm14
+	vmovdqa	%xmm14,(%r11)
+	vmovdqu	32(%rcx),%xmm15
+	vaesenc	%xmm15,%xmm1,%xmm1
+	vaesenc	%xmm15,%xmm2,%xmm2
+	vaesenc	%xmm15,%xmm3,%xmm3
+	vaesenc	%xmm15,%xmm4,%xmm4
+	vaesenc	%xmm15,%xmm5,%xmm5
+	vaesenc	%xmm15,%xmm6,%xmm6
+	vaesenc	%xmm15,%xmm7,%xmm7
+	vaesenc	%xmm15,%xmm8,%xmm8
+
+	vpsubd	one(%rip),%xmm14,%xmm14
+	vmovdqu	48(%rcx),%xmm15
+	vaesenc	%xmm15,%xmm1,%xmm1
+	vaesenc	%xmm15,%xmm2,%xmm2
+	vaesenc	%xmm15,%xmm3,%xmm3
+	vaesenc	%xmm15,%xmm4,%xmm4
+	vaesenc	%xmm15,%xmm5,%xmm5
+	vaesenc	%xmm15,%xmm6,%xmm6
+	vaesenc	%xmm15,%xmm7,%xmm7
+	vaesenc	%xmm15,%xmm8,%xmm8
+
+	vpaddd	eight(%rip),%xmm0,%xmm0
+	vmovdqu	64(%rcx),%xmm15
+	vaesenc	%xmm15,%xmm1,%xmm1
+	vaesenc	%xmm15,%xmm2,%xmm2
+	vaesenc	%xmm15,%xmm3,%xmm3
+	vaesenc	%xmm15,%xmm4,%xmm4
+	vaesenc	%xmm15,%xmm5,%xmm5
+	vaesenc	%xmm15,%xmm6,%xmm6
+	vaesenc	%xmm15,%xmm7,%xmm7
+	vaesenc	%xmm15,%xmm8,%xmm8
+
+	vpaddd	eight(%rip),%xmm9,%xmm9
+	vmovdqu	80(%rcx),%xmm15
+	vaesenc	%xmm15,%xmm1,%xmm1
+	vaesenc	%xmm15,%xmm2,%xmm2
+	vaesenc	%xmm15,%xmm3,%xmm3
+	vaesenc	%xmm15,%xmm4,%xmm4
+	vaesenc	%xmm15,%xmm5,%xmm5
+	vaesenc	%xmm15,%xmm6,%xmm6
+	vaesenc	%xmm15,%xmm7,%xmm7
+	vaesenc	%xmm15,%xmm8,%xmm8
+
+	vpaddd	eight(%rip),%xmm10,%xmm10
+	vmovdqu	96(%rcx),%xmm15
+	vaesenc	%xmm15,%xmm1,%xmm1
+	vaesenc	%xmm15,%xmm2,%xmm2
+	vaesenc	%xmm15,%xmm3,%xmm3
+	vaesenc	%xmm15,%xmm4,%xmm4
+	vaesenc	%xmm15,%xmm5,%xmm5
+	vaesenc	%xmm15,%xmm6,%xmm6
+	vaesenc	%xmm15,%xmm7,%xmm7
+	vaesenc	%xmm15,%xmm8,%xmm8
+
+	vpaddd	eight(%rip),%xmm11,%xmm11
+	vmovdqu	112(%rcx),%xmm15
+	vaesenc	%xmm15,%xmm1,%xmm1
+	vaesenc	%xmm15,%xmm2,%xmm2
+	vaesenc	%xmm15,%xmm3,%xmm3
+	vaesenc	%xmm15,%xmm4,%xmm4
+	vaesenc	%xmm15,%xmm5,%xmm5
+	vaesenc	%xmm15,%xmm6,%xmm6
+	vaesenc	%xmm15,%xmm7,%xmm7
+	vaesenc	%xmm15,%xmm8,%xmm8
+
+	vpaddd	eight(%rip),%xmm12,%xmm12
+	vmovdqu	128(%rcx),%xmm15
+	vaesenc	%xmm15,%xmm1,%xmm1
+	vaesenc	%xmm15,%xmm2,%xmm2
+	vaesenc	%xmm15,%xmm3,%xmm3
+	vaesenc	%xmm15,%xmm4,%xmm4
+	vaesenc	%xmm15,%xmm5,%xmm5
+	vaesenc	%xmm15,%xmm6,%xmm6
+	vaesenc	%xmm15,%xmm7,%xmm7
+	vaesenc	%xmm15,%xmm8,%xmm8
+
+	vpaddd	eight(%rip),%xmm13,%xmm13
+	vmovdqu	144(%rcx),%xmm15
+	vaesenc	%xmm15,%xmm1,%xmm1
+	vaesenc	%xmm15,%xmm2,%xmm2
+	vaesenc	%xmm15,%xmm3,%xmm3
+	vaesenc	%xmm15,%xmm4,%xmm4
+	vaesenc	%xmm15,%xmm5,%xmm5
+	vaesenc	%xmm15,%xmm6,%xmm6
+	vaesenc	%xmm15,%xmm7,%xmm7
+	vaesenc	%xmm15,%xmm8,%xmm8
+
+	vmovdqu	160(%rcx),%xmm15
+	vaesenc	%xmm15,%xmm1,%xmm1
+	vaesenc	%xmm15,%xmm2,%xmm2
+	vaesenc	%xmm15,%xmm3,%xmm3
+	vaesenc	%xmm15,%xmm4,%xmm4
+	vaesenc	%xmm15,%xmm5,%xmm5
+	vaesenc	%xmm15,%xmm6,%xmm6
+	vaesenc	%xmm15,%xmm7,%xmm7
+	vaesenc	%xmm15,%xmm8,%xmm8
+
+	vmovdqu	176(%rcx),%xmm15
+	vaesenc	%xmm15,%xmm1,%xmm1
+	vaesenc	%xmm15,%xmm2,%xmm2
+	vaesenc	%xmm15,%xmm3,%xmm3
+	vaesenc	%xmm15,%xmm4,%xmm4
+	vaesenc	%xmm15,%xmm5,%xmm5
+	vaesenc	%xmm15,%xmm6,%xmm6
+	vaesenc	%xmm15,%xmm7,%xmm7
+	vaesenc	%xmm15,%xmm8,%xmm8
+
+	vmovdqu	192(%rcx),%xmm15
+	vaesenc	%xmm15,%xmm1,%xmm1
+	vaesenc	%xmm15,%xmm2,%xmm2
+	vaesenc	%xmm15,%xmm3,%xmm3
+	vaesenc	%xmm15,%xmm4,%xmm4
+	vaesenc	%xmm15,%xmm5,%xmm5
+	vaesenc	%xmm15,%xmm6,%xmm6
+	vaesenc	%xmm15,%xmm7,%xmm7
+	vaesenc	%xmm15,%xmm8,%xmm8
+
+	vmovdqu	208(%rcx),%xmm15
+	vaesenc	%xmm15,%xmm1,%xmm1
+	vaesenc	%xmm15,%xmm2,%xmm2
+	vaesenc	%xmm15,%xmm3,%xmm3
+	vaesenc	%xmm15,%xmm4,%xmm4
+	vaesenc	%xmm15,%xmm5,%xmm5
+	vaesenc	%xmm15,%xmm6,%xmm6
+	vaesenc	%xmm15,%xmm7,%xmm7
+	vaesenc	%xmm15,%xmm8,%xmm8
+
+	vmovdqu	224(%rcx),%xmm15
+	vaesenclast	%xmm15,%xmm1,%xmm1
+	vaesenclast	%xmm15,%xmm2,%xmm2
+	vaesenclast	%xmm15,%xmm3,%xmm3
+	vaesenclast	%xmm15,%xmm4,%xmm4
+	vaesenclast	%xmm15,%xmm5,%xmm5
+	vaesenclast	%xmm15,%xmm6,%xmm6
+	vaesenclast	%xmm15,%xmm7,%xmm7
+	vaesenclast	%xmm15,%xmm8,%xmm8
+
+
+
+	vpxor	0(%rdi),%xmm1,%xmm1
+	vpxor	16(%rdi),%xmm2,%xmm2
+	vpxor	32(%rdi),%xmm3,%xmm3
+	vpxor	48(%rdi),%xmm4,%xmm4
+	vpxor	64(%rdi),%xmm5,%xmm5
+	vpxor	80(%rdi),%xmm6,%xmm6
+	vpxor	96(%rdi),%xmm7,%xmm7
+	vpxor	112(%rdi),%xmm8,%xmm8
+
+	subq	$1,%r8
+
+	vmovdqu	%xmm1,0(%rsi)
+	vmovdqu	%xmm2,16(%rsi)
+	vmovdqu	%xmm3,32(%rsi)
+	vmovdqu	%xmm4,48(%rsi)
+	vmovdqu	%xmm5,64(%rsi)
+	vmovdqu	%xmm6,80(%rsi)
+	vmovdqu	%xmm7,96(%rsi)
+	vmovdqu	%xmm8,112(%rsi)
+
+	jne	.L256_enc_msg_x8_loop1
+
+	addq	$128,%rsi
+	addq	$128,%rdi
+
+.L256_enc_msg_x8_check_remainder:
+	cmpq	$0,%r10
+	je	.L256_enc_msg_x8_out
+
+.L256_enc_msg_x8_loop2:
+
+
+	vmovdqa	%xmm0,%xmm1
+	vpaddd	one(%rip),%xmm0,%xmm0
+
+	vpxor	(%rcx),%xmm1,%xmm1
+	vaesenc	16(%rcx),%xmm1,%xmm1
+	vaesenc	32(%rcx),%xmm1,%xmm1
+	vaesenc	48(%rcx),%xmm1,%xmm1
+	vaesenc	64(%rcx),%xmm1,%xmm1
+	vaesenc	80(%rcx),%xmm1,%xmm1
+	vaesenc	96(%rcx),%xmm1,%xmm1
+	vaesenc	112(%rcx),%xmm1,%xmm1
+	vaesenc	128(%rcx),%xmm1,%xmm1
+	vaesenc	144(%rcx),%xmm1,%xmm1
+	vaesenc	160(%rcx),%xmm1,%xmm1
+	vaesenc	176(%rcx),%xmm1,%xmm1
+	vaesenc	192(%rcx),%xmm1,%xmm1
+	vaesenc	208(%rcx),%xmm1,%xmm1
+	vaesenclast	224(%rcx),%xmm1,%xmm1
+
+
+	vpxor	(%rdi),%xmm1,%xmm1
+
+	vmovdqu	%xmm1,(%rsi)
+
+	addq	$16,%rdi
+	addq	$16,%rsi
+	subq	$1,%r10
+	jnz	.L256_enc_msg_x8_loop2
+
+.L256_enc_msg_x8_out:
+	.byte	0xf3,0xc3
+
+.cfi_endproc	
+.size	aes256gcmsiv_enc_msg_x8,.-aes256gcmsiv_enc_msg_x8
+.globl	aes256gcmsiv_dec
+.hidden aes256gcmsiv_dec
+.type	aes256gcmsiv_dec,@function
+.align	16
+aes256gcmsiv_dec:
+.cfi_startproc	
+	testq	$~15,%r9
+	jnz	.L256_dec_start
+	.byte	0xf3,0xc3
+
+.L256_dec_start:
+	vzeroupper
+	vmovdqa	(%rdx),%xmm0
+	movq	%rdx,%rax
+
+	leaq	32(%rax),%rax
+	leaq	32(%rcx),%rcx
+
+
+	vmovdqu	(%rdi,%r9,1),%xmm15
+	vpor	OR_MASK(%rip),%xmm15,%xmm15
+	andq	$~15,%r9
+
+
+	cmpq	$96,%r9
+	jb	.L256_dec_loop2
+
+
+	subq	$96,%r9
+	vmovdqa	%xmm15,%xmm7
+	vpaddd	one(%rip),%xmm7,%xmm8
+	vpaddd	two(%rip),%xmm7,%xmm9
+	vpaddd	one(%rip),%xmm9,%xmm10
+	vpaddd	two(%rip),%xmm9,%xmm11
+	vpaddd	one(%rip),%xmm11,%xmm12
+	vpaddd	two(%rip),%xmm11,%xmm15
+
+	vpxor	(%r8),%xmm7,%xmm7
+	vpxor	(%r8),%xmm8,%xmm8
+	vpxor	(%r8),%xmm9,%xmm9
+	vpxor	(%r8),%xmm10,%xmm10
+	vpxor	(%r8),%xmm11,%xmm11
+	vpxor	(%r8),%xmm12,%xmm12
+
+	vmovdqu	16(%r8),%xmm4
+	vaesenc	%xmm4,%xmm7,%xmm7
+	vaesenc	%xmm4,%xmm8,%xmm8
+	vaesenc	%xmm4,%xmm9,%xmm9
+	vaesenc	%xmm4,%xmm10,%xmm10
+	vaesenc	%xmm4,%xmm11,%xmm11
+	vaesenc	%xmm4,%xmm12,%xmm12
+
+	vmovdqu	32(%r8),%xmm4
+	vaesenc	%xmm4,%xmm7,%xmm7
+	vaesenc	%xmm4,%xmm8,%xmm8
+	vaesenc	%xmm4,%xmm9,%xmm9
+	vaesenc	%xmm4,%xmm10,%xmm10
+	vaesenc	%xmm4,%xmm11,%xmm11
+	vaesenc	%xmm4,%xmm12,%xmm12
+
+	vmovdqu	48(%r8),%xmm4
+	vaesenc	%xmm4,%xmm7,%xmm7
+	vaesenc	%xmm4,%xmm8,%xmm8
+	vaesenc	%xmm4,%xmm9,%xmm9
+	vaesenc	%xmm4,%xmm10,%xmm10
+	vaesenc	%xmm4,%xmm11,%xmm11
+	vaesenc	%xmm4,%xmm12,%xmm12
+
+	vmovdqu	64(%r8),%xmm4
+	vaesenc	%xmm4,%xmm7,%xmm7
+	vaesenc	%xmm4,%xmm8,%xmm8
+	vaesenc	%xmm4,%xmm9,%xmm9
+	vaesenc	%xmm4,%xmm10,%xmm10
+	vaesenc	%xmm4,%xmm11,%xmm11
+	vaesenc	%xmm4,%xmm12,%xmm12
+
+	vmovdqu	80(%r8),%xmm4
+	vaesenc	%xmm4,%xmm7,%xmm7
+	vaesenc	%xmm4,%xmm8,%xmm8
+	vaesenc	%xmm4,%xmm9,%xmm9
+	vaesenc	%xmm4,%xmm10,%xmm10
+	vaesenc	%xmm4,%xmm11,%xmm11
+	vaesenc	%xmm4,%xmm12,%xmm12
+
+	vmovdqu	96(%r8),%xmm4
+	vaesenc	%xmm4,%xmm7,%xmm7
+	vaesenc	%xmm4,%xmm8,%xmm8
+	vaesenc	%xmm4,%xmm9,%xmm9
+	vaesenc	%xmm4,%xmm10,%xmm10
+	vaesenc	%xmm4,%xmm11,%xmm11
+	vaesenc	%xmm4,%xmm12,%xmm12
+
+	vmovdqu	112(%r8),%xmm4
+	vaesenc	%xmm4,%xmm7,%xmm7
+	vaesenc	%xmm4,%xmm8,%xmm8
+	vaesenc	%xmm4,%xmm9,%xmm9
+	vaesenc	%xmm4,%xmm10,%xmm10
+	vaesenc	%xmm4,%xmm11,%xmm11
+	vaesenc	%xmm4,%xmm12,%xmm12
+
+	vmovdqu	128(%r8),%xmm4
+	vaesenc	%xmm4,%xmm7,%xmm7
+	vaesenc	%xmm4,%xmm8,%xmm8
+	vaesenc	%xmm4,%xmm9,%xmm9
+	vaesenc	%xmm4,%xmm10,%xmm10
+	vaesenc	%xmm4,%xmm11,%xmm11
+	vaesenc	%xmm4,%xmm12,%xmm12
+
+	vmovdqu	144(%r8),%xmm4
+	vaesenc	%xmm4,%xmm7,%xmm7
+	vaesenc	%xmm4,%xmm8,%xmm8
+	vaesenc	%xmm4,%xmm9,%xmm9
+	vaesenc	%xmm4,%xmm10,%xmm10
+	vaesenc	%xmm4,%xmm11,%xmm11
+	vaesenc	%xmm4,%xmm12,%xmm12
+
+	vmovdqu	160(%r8),%xmm4
+	vaesenc	%xmm4,%xmm7,%xmm7
+	vaesenc	%xmm4,%xmm8,%xmm8
+	vaesenc	%xmm4,%xmm9,%xmm9
+	vaesenc	%xmm4,%xmm10,%xmm10
+	vaesenc	%xmm4,%xmm11,%xmm11
+	vaesenc	%xmm4,%xmm12,%xmm12
+
+	vmovdqu	176(%r8),%xmm4
+	vaesenc	%xmm4,%xmm7,%xmm7
+	vaesenc	%xmm4,%xmm8,%xmm8
+	vaesenc	%xmm4,%xmm9,%xmm9
+	vaesenc	%xmm4,%xmm10,%xmm10
+	vaesenc	%xmm4,%xmm11,%xmm11
+	vaesenc	%xmm4,%xmm12,%xmm12
+
+	vmovdqu	192(%r8),%xmm4
+	vaesenc	%xmm4,%xmm7,%xmm7
+	vaesenc	%xmm4,%xmm8,%xmm8
+	vaesenc	%xmm4,%xmm9,%xmm9
+	vaesenc	%xmm4,%xmm10,%xmm10
+	vaesenc	%xmm4,%xmm11,%xmm11
+	vaesenc	%xmm4,%xmm12,%xmm12
+
+	vmovdqu	208(%r8),%xmm4
+	vaesenc	%xmm4,%xmm7,%xmm7
+	vaesenc	%xmm4,%xmm8,%xmm8
+	vaesenc	%xmm4,%xmm9,%xmm9
+	vaesenc	%xmm4,%xmm10,%xmm10
+	vaesenc	%xmm4,%xmm11,%xmm11
+	vaesenc	%xmm4,%xmm12,%xmm12
+
+	vmovdqu	224(%r8),%xmm4
+	vaesenclast	%xmm4,%xmm7,%xmm7
+	vaesenclast	%xmm4,%xmm8,%xmm8
+	vaesenclast	%xmm4,%xmm9,%xmm9
+	vaesenclast	%xmm4,%xmm10,%xmm10
+	vaesenclast	%xmm4,%xmm11,%xmm11
+	vaesenclast	%xmm4,%xmm12,%xmm12
+
+
+	vpxor	0(%rdi),%xmm7,%xmm7
+	vpxor	16(%rdi),%xmm8,%xmm8
+	vpxor	32(%rdi),%xmm9,%xmm9
+	vpxor	48(%rdi),%xmm10,%xmm10
+	vpxor	64(%rdi),%xmm11,%xmm11
+	vpxor	80(%rdi),%xmm12,%xmm12
+
+	vmovdqu	%xmm7,0(%rsi)
+	vmovdqu	%xmm8,16(%rsi)
+	vmovdqu	%xmm9,32(%rsi)
+	vmovdqu	%xmm10,48(%rsi)
+	vmovdqu	%xmm11,64(%rsi)
+	vmovdqu	%xmm12,80(%rsi)
+
+	addq	$96,%rdi
+	addq	$96,%rsi
+	jmp	.L256_dec_loop1
+
+
+.align	64
+.L256_dec_loop1:
+	cmpq	$96,%r9
+	jb	.L256_dec_finish_96
+	subq	$96,%r9
+
+	vmovdqa	%xmm12,%xmm6
+	vmovdqa	%xmm11,16-32(%rax)
+	vmovdqa	%xmm10,32-32(%rax)
+	vmovdqa	%xmm9,48-32(%rax)
+	vmovdqa	%xmm8,64-32(%rax)
+	vmovdqa	%xmm7,80-32(%rax)
+
+	vmovdqa	%xmm15,%xmm7
+	vpaddd	one(%rip),%xmm7,%xmm8
+	vpaddd	two(%rip),%xmm7,%xmm9
+	vpaddd	one(%rip),%xmm9,%xmm10
+	vpaddd	two(%rip),%xmm9,%xmm11
+	vpaddd	one(%rip),%xmm11,%xmm12
+	vpaddd	two(%rip),%xmm11,%xmm15
+
+	vmovdqa	(%r8),%xmm4
+	vpxor	%xmm4,%xmm7,%xmm7
+	vpxor	%xmm4,%xmm8,%xmm8
+	vpxor	%xmm4,%xmm9,%xmm9
+	vpxor	%xmm4,%xmm10,%xmm10
+	vpxor	%xmm4,%xmm11,%xmm11
+	vpxor	%xmm4,%xmm12,%xmm12
+
+	vmovdqu	0-32(%rcx),%xmm4
+	vpclmulqdq	$0x11,%xmm4,%xmm6,%xmm2
+	vpclmulqdq	$0x00,%xmm4,%xmm6,%xmm3
+	vpclmulqdq	$0x01,%xmm4,%xmm6,%xmm1
+	vpclmulqdq	$0x10,%xmm4,%xmm6,%xmm4
+	vpxor	%xmm4,%xmm1,%xmm1
+
+	vmovdqu	16(%r8),%xmm4
+	vaesenc	%xmm4,%xmm7,%xmm7
+	vaesenc	%xmm4,%xmm8,%xmm8
+	vaesenc	%xmm4,%xmm9,%xmm9
+	vaesenc	%xmm4,%xmm10,%xmm10
+	vaesenc	%xmm4,%xmm11,%xmm11
+	vaesenc	%xmm4,%xmm12,%xmm12
+
+	vmovdqu	-16(%rax),%xmm6
+	vmovdqu	-16(%rcx),%xmm13
+
+	vpclmulqdq	$0x10,%xmm13,%xmm6,%xmm4
+	vpxor	%xmm4,%xmm1,%xmm1
+	vpclmulqdq	$0x11,%xmm13,%xmm6,%xmm4
+	vpxor	%xmm4,%xmm2,%xmm2
+	vpclmulqdq	$0x00,%xmm13,%xmm6,%xmm4
+	vpxor	%xmm4,%xmm3,%xmm3
+	vpclmulqdq	$0x01,%xmm13,%xmm6,%xmm4
+	vpxor	%xmm4,%xmm1,%xmm1
+
+
+	vmovdqu	32(%r8),%xmm4
+	vaesenc	%xmm4,%xmm7,%xmm7
+	vaesenc	%xmm4,%xmm8,%xmm8
+	vaesenc	%xmm4,%xmm9,%xmm9
+	vaesenc	%xmm4,%xmm10,%xmm10
+	vaesenc	%xmm4,%xmm11,%xmm11
+	vaesenc	%xmm4,%xmm12,%xmm12
+
+	vmovdqu	0(%rax),%xmm6
+	vmovdqu	0(%rcx),%xmm13
+
+	vpclmulqdq	$0x10,%xmm13,%xmm6,%xmm4
+	vpxor	%xmm4,%xmm1,%xmm1
+	vpclmulqdq	$0x11,%xmm13,%xmm6,%xmm4
+	vpxor	%xmm4,%xmm2,%xmm2
+	vpclmulqdq	$0x00,%xmm13,%xmm6,%xmm4
+	vpxor	%xmm4,%xmm3,%xmm3
+	vpclmulqdq	$0x01,%xmm13,%xmm6,%xmm4
+	vpxor	%xmm4,%xmm1,%xmm1
+
+
+	vmovdqu	48(%r8),%xmm4
+	vaesenc	%xmm4,%xmm7,%xmm7
+	vaesenc	%xmm4,%xmm8,%xmm8
+	vaesenc	%xmm4,%xmm9,%xmm9
+	vaesenc	%xmm4,%xmm10,%xmm10
+	vaesenc	%xmm4,%xmm11,%xmm11
+	vaesenc	%xmm4,%xmm12,%xmm12
+
+	vmovdqu	16(%rax),%xmm6
+	vmovdqu	16(%rcx),%xmm13
+
+	vpclmulqdq	$0x10,%xmm13,%xmm6,%xmm4
+	vpxor	%xmm4,%xmm1,%xmm1
+	vpclmulqdq	$0x11,%xmm13,%xmm6,%xmm4
+	vpxor	%xmm4,%xmm2,%xmm2
+	vpclmulqdq	$0x00,%xmm13,%xmm6,%xmm4
+	vpxor	%xmm4,%xmm3,%xmm3
+	vpclmulqdq	$0x01,%xmm13,%xmm6,%xmm4
+	vpxor	%xmm4,%xmm1,%xmm1
+
+
+	vmovdqu	64(%r8),%xmm4
+	vaesenc	%xmm4,%xmm7,%xmm7
+	vaesenc	%xmm4,%xmm8,%xmm8
+	vaesenc	%xmm4,%xmm9,%xmm9
+	vaesenc	%xmm4,%xmm10,%xmm10
+	vaesenc	%xmm4,%xmm11,%xmm11
+	vaesenc	%xmm4,%xmm12,%xmm12
+
+	vmovdqu	32(%rax),%xmm6
+	vmovdqu	32(%rcx),%xmm13
+
+	vpclmulqdq	$0x10,%xmm13,%xmm6,%xmm4
+	vpxor	%xmm4,%xmm1,%xmm1
+	vpclmulqdq	$0x11,%xmm13,%xmm6,%xmm4
+	vpxor	%xmm4,%xmm2,%xmm2
+	vpclmulqdq	$0x00,%xmm13,%xmm6,%xmm4
+	vpxor	%xmm4,%xmm3,%xmm3
+	vpclmulqdq	$0x01,%xmm13,%xmm6,%xmm4
+	vpxor	%xmm4,%xmm1,%xmm1
+
+
+	vmovdqu	80(%r8),%xmm4
+	vaesenc	%xmm4,%xmm7,%xmm7
+	vaesenc	%xmm4,%xmm8,%xmm8
+	vaesenc	%xmm4,%xmm9,%xmm9
+	vaesenc	%xmm4,%xmm10,%xmm10
+	vaesenc	%xmm4,%xmm11,%xmm11
+	vaesenc	%xmm4,%xmm12,%xmm12
+
+	vmovdqu	96(%r8),%xmm4
+	vaesenc	%xmm4,%xmm7,%xmm7
+	vaesenc	%xmm4,%xmm8,%xmm8
+	vaesenc	%xmm4,%xmm9,%xmm9
+	vaesenc	%xmm4,%xmm10,%xmm10
+	vaesenc	%xmm4,%xmm11,%xmm11
+	vaesenc	%xmm4,%xmm12,%xmm12
+
+	vmovdqu	112(%r8),%xmm4
+	vaesenc	%xmm4,%xmm7,%xmm7
+	vaesenc	%xmm4,%xmm8,%xmm8
+	vaesenc	%xmm4,%xmm9,%xmm9
+	vaesenc	%xmm4,%xmm10,%xmm10
+	vaesenc	%xmm4,%xmm11,%xmm11
+	vaesenc	%xmm4,%xmm12,%xmm12
+
+
+	vmovdqa	80-32(%rax),%xmm6
+	vpxor	%xmm0,%xmm6,%xmm6
+	vmovdqu	80-32(%rcx),%xmm5
+
+	vpclmulqdq	$0x01,%xmm5,%xmm6,%xmm4
+	vpxor	%xmm4,%xmm1,%xmm1
+	vpclmulqdq	$0x11,%xmm5,%xmm6,%xmm4
+	vpxor	%xmm4,%xmm2,%xmm2
+	vpclmulqdq	$0x00,%xmm5,%xmm6,%xmm4
+	vpxor	%xmm4,%xmm3,%xmm3
+	vpclmulqdq	$0x10,%xmm5,%xmm6,%xmm4
+	vpxor	%xmm4,%xmm1,%xmm1
+
+	vmovdqu	128(%r8),%xmm4
+	vaesenc	%xmm4,%xmm7,%xmm7
+	vaesenc	%xmm4,%xmm8,%xmm8
+	vaesenc	%xmm4,%xmm9,%xmm9
+	vaesenc	%xmm4,%xmm10,%xmm10
+	vaesenc	%xmm4,%xmm11,%xmm11
+	vaesenc	%xmm4,%xmm12,%xmm12
+
+
+	vpsrldq	$8,%xmm1,%xmm4
+	vpxor	%xmm4,%xmm2,%xmm5
+	vpslldq	$8,%xmm1,%xmm4
+	vpxor	%xmm4,%xmm3,%xmm0
+
+	vmovdqa	poly(%rip),%xmm3
+
+	vmovdqu	144(%r8),%xmm4
+	vaesenc	%xmm4,%xmm7,%xmm7
+	vaesenc	%xmm4,%xmm8,%xmm8
+	vaesenc	%xmm4,%xmm9,%xmm9
+	vaesenc	%xmm4,%xmm10,%xmm10
+	vaesenc	%xmm4,%xmm11,%xmm11
+	vaesenc	%xmm4,%xmm12,%xmm12
+
+	vmovdqu	160(%r8),%xmm4
+	vaesenc	%xmm4,%xmm7,%xmm7
+	vaesenc	%xmm4,%xmm8,%xmm8
+	vaesenc	%xmm4,%xmm9,%xmm9
+	vaesenc	%xmm4,%xmm10,%xmm10
+	vaesenc	%xmm4,%xmm11,%xmm11
+	vaesenc	%xmm4,%xmm12,%xmm12
+
+	vmovdqu	176(%r8),%xmm4
+	vaesenc	%xmm4,%xmm7,%xmm7
+	vaesenc	%xmm4,%xmm8,%xmm8
+	vaesenc	%xmm4,%xmm9,%xmm9
+	vaesenc	%xmm4,%xmm10,%xmm10
+	vaesenc	%xmm4,%xmm11,%xmm11
+	vaesenc	%xmm4,%xmm12,%xmm12
+
+	vmovdqu	192(%r8),%xmm4
+	vaesenc	%xmm4,%xmm7,%xmm7
+	vaesenc	%xmm4,%xmm8,%xmm8
+	vaesenc	%xmm4,%xmm9,%xmm9
+	vaesenc	%xmm4,%xmm10,%xmm10
+	vaesenc	%xmm4,%xmm11,%xmm11
+	vaesenc	%xmm4,%xmm12,%xmm12
+
+	vmovdqu	208(%r8),%xmm4
+	vaesenc	%xmm4,%xmm7,%xmm7
+	vaesenc	%xmm4,%xmm8,%xmm8
+	vaesenc	%xmm4,%xmm9,%xmm9
+	vaesenc	%xmm4,%xmm10,%xmm10
+	vaesenc	%xmm4,%xmm11,%xmm11
+	vaesenc	%xmm4,%xmm12,%xmm12
+
+	vmovdqu	224(%r8),%xmm6
+	vpalignr	$8,%xmm0,%xmm0,%xmm2
+	vpclmulqdq	$0x10,%xmm3,%xmm0,%xmm0
+	vpxor	%xmm0,%xmm2,%xmm0
+
+	vpxor	0(%rdi),%xmm6,%xmm4
+	vaesenclast	%xmm4,%xmm7,%xmm7
+	vpxor	16(%rdi),%xmm6,%xmm4
+	vaesenclast	%xmm4,%xmm8,%xmm8
+	vpxor	32(%rdi),%xmm6,%xmm4
+	vaesenclast	%xmm4,%xmm9,%xmm9
+	vpxor	48(%rdi),%xmm6,%xmm4
+	vaesenclast	%xmm4,%xmm10,%xmm10
+	vpxor	64(%rdi),%xmm6,%xmm4
+	vaesenclast	%xmm4,%xmm11,%xmm11
+	vpxor	80(%rdi),%xmm6,%xmm4
+	vaesenclast	%xmm4,%xmm12,%xmm12
+
+	vpalignr	$8,%xmm0,%xmm0,%xmm2
+	vpclmulqdq	$0x10,%xmm3,%xmm0,%xmm0
+	vpxor	%xmm0,%xmm2,%xmm0
+
+	vmovdqu	%xmm7,0(%rsi)
+	vmovdqu	%xmm8,16(%rsi)
+	vmovdqu	%xmm9,32(%rsi)
+	vmovdqu	%xmm10,48(%rsi)
+	vmovdqu	%xmm11,64(%rsi)
+	vmovdqu	%xmm12,80(%rsi)
+
+	vpxor	%xmm5,%xmm0,%xmm0
+
+	leaq	96(%rdi),%rdi
+	leaq	96(%rsi),%rsi
+	jmp	.L256_dec_loop1
+
+.L256_dec_finish_96:
+	vmovdqa	%xmm12,%xmm6
+	vmovdqa	%xmm11,16-32(%rax)
+	vmovdqa	%xmm10,32-32(%rax)
+	vmovdqa	%xmm9,48-32(%rax)
+	vmovdqa	%xmm8,64-32(%rax)
+	vmovdqa	%xmm7,80-32(%rax)
+
+	vmovdqu	0-32(%rcx),%xmm4
+	vpclmulqdq	$0x10,%xmm4,%xmm6,%xmm1
+	vpclmulqdq	$0x11,%xmm4,%xmm6,%xmm2
+	vpclmulqdq	$0x00,%xmm4,%xmm6,%xmm3
+	vpclmulqdq	$0x01,%xmm4,%xmm6,%xmm4
+	vpxor	%xmm4,%xmm1,%xmm1
+
+	vmovdqu	-16(%rax),%xmm6
+	vmovdqu	-16(%rcx),%xmm13
+
+	vpclmulqdq	$0x10,%xmm13,%xmm6,%xmm4
+	vpxor	%xmm4,%xmm1,%xmm1
+	vpclmulqdq	$0x11,%xmm13,%xmm6,%xmm4
+	vpxor	%xmm4,%xmm2,%xmm2
+	vpclmulqdq	$0x00,%xmm13,%xmm6,%xmm4
+	vpxor	%xmm4,%xmm3,%xmm3
+	vpclmulqdq	$0x01,%xmm13,%xmm6,%xmm4
+	vpxor	%xmm4,%xmm1,%xmm1
+
+	vmovdqu	0(%rax),%xmm6
+	vmovdqu	0(%rcx),%xmm13
+
+	vpclmulqdq	$0x10,%xmm13,%xmm6,%xmm4
+	vpxor	%xmm4,%xmm1,%xmm1
+	vpclmulqdq	$0x11,%xmm13,%xmm6,%xmm4
+	vpxor	%xmm4,%xmm2,%xmm2
+	vpclmulqdq	$0x00,%xmm13,%xmm6,%xmm4
+	vpxor	%xmm4,%xmm3,%xmm3
+	vpclmulqdq	$0x01,%xmm13,%xmm6,%xmm4
+	vpxor	%xmm4,%xmm1,%xmm1
+
+	vmovdqu	16(%rax),%xmm6
+	vmovdqu	16(%rcx),%xmm13
+
+	vpclmulqdq	$0x10,%xmm13,%xmm6,%xmm4
+	vpxor	%xmm4,%xmm1,%xmm1
+	vpclmulqdq	$0x11,%xmm13,%xmm6,%xmm4
+	vpxor	%xmm4,%xmm2,%xmm2
+	vpclmulqdq	$0x00,%xmm13,%xmm6,%xmm4
+	vpxor	%xmm4,%xmm3,%xmm3
+	vpclmulqdq	$0x01,%xmm13,%xmm6,%xmm4
+	vpxor	%xmm4,%xmm1,%xmm1
+
+	vmovdqu	32(%rax),%xmm6
+	vmovdqu	32(%rcx),%xmm13
+
+	vpclmulqdq	$0x10,%xmm13,%xmm6,%xmm4
+	vpxor	%xmm4,%xmm1,%xmm1
+	vpclmulqdq	$0x11,%xmm13,%xmm6,%xmm4
+	vpxor	%xmm4,%xmm2,%xmm2
+	vpclmulqdq	$0x00,%xmm13,%xmm6,%xmm4
+	vpxor	%xmm4,%xmm3,%xmm3
+	vpclmulqdq	$0x01,%xmm13,%xmm6,%xmm4
+	vpxor	%xmm4,%xmm1,%xmm1
+
+
+	vmovdqu	80-32(%rax),%xmm6
+	vpxor	%xmm0,%xmm6,%xmm6
+	vmovdqu	80-32(%rcx),%xmm5
+	vpclmulqdq	$0x11,%xmm5,%xmm6,%xmm4
+	vpxor	%xmm4,%xmm2,%xmm2
+	vpclmulqdq	$0x00,%xmm5,%xmm6,%xmm4
+	vpxor	%xmm4,%xmm3,%xmm3
+	vpclmulqdq	$0x10,%xmm5,%xmm6,%xmm4
+	vpxor	%xmm4,%xmm1,%xmm1
+	vpclmulqdq	$0x01,%xmm5,%xmm6,%xmm4
+	vpxor	%xmm4,%xmm1,%xmm1
+
+	vpsrldq	$8,%xmm1,%xmm4
+	vpxor	%xmm4,%xmm2,%xmm5
+	vpslldq	$8,%xmm1,%xmm4
+	vpxor	%xmm4,%xmm3,%xmm0
+
+	vmovdqa	poly(%rip),%xmm3
+
+	vpalignr	$8,%xmm0,%xmm0,%xmm2
+	vpclmulqdq	$0x10,%xmm3,%xmm0,%xmm0
+	vpxor	%xmm0,%xmm2,%xmm0
+
+	vpalignr	$8,%xmm0,%xmm0,%xmm2
+	vpclmulqdq	$0x10,%xmm3,%xmm0,%xmm0
+	vpxor	%xmm0,%xmm2,%xmm0
+
+	vpxor	%xmm5,%xmm0,%xmm0
+
+.L256_dec_loop2:
+
+
+
+	cmpq	$16,%r9
+	jb	.L256_dec_out
+	subq	$16,%r9
+
+	vmovdqa	%xmm15,%xmm2
+	vpaddd	one(%rip),%xmm15,%xmm15
+
+	vpxor	0(%r8),%xmm2,%xmm2
+	vaesenc	16(%r8),%xmm2,%xmm2
+	vaesenc	32(%r8),%xmm2,%xmm2
+	vaesenc	48(%r8),%xmm2,%xmm2
+	vaesenc	64(%r8),%xmm2,%xmm2
+	vaesenc	80(%r8),%xmm2,%xmm2
+	vaesenc	96(%r8),%xmm2,%xmm2
+	vaesenc	112(%r8),%xmm2,%xmm2
+	vaesenc	128(%r8),%xmm2,%xmm2
+	vaesenc	144(%r8),%xmm2,%xmm2
+	vaesenc	160(%r8),%xmm2,%xmm2
+	vaesenc	176(%r8),%xmm2,%xmm2
+	vaesenc	192(%r8),%xmm2,%xmm2
+	vaesenc	208(%r8),%xmm2,%xmm2
+	vaesenclast	224(%r8),%xmm2,%xmm2
+	vpxor	(%rdi),%xmm2,%xmm2
+	vmovdqu	%xmm2,(%rsi)
+	addq	$16,%rdi
+	addq	$16,%rsi
+
+	vpxor	%xmm2,%xmm0,%xmm0
+	vmovdqa	-32(%rcx),%xmm1
+	call	GFMUL
+
+	jmp	.L256_dec_loop2
+
+.L256_dec_out:
+	vmovdqu	%xmm0,(%rdx)
+	.byte	0xf3,0xc3
+.cfi_endproc	
+.size	aes256gcmsiv_dec, .-aes256gcmsiv_dec
+.globl	aes256gcmsiv_kdf
+.hidden aes256gcmsiv_kdf
+.type	aes256gcmsiv_kdf,@function
+.align	16
+aes256gcmsiv_kdf:
+.cfi_startproc	
+
+
+
+
+	vmovdqa	(%rdx),%xmm1
+	vmovdqa	0(%rdi),%xmm4
+	vmovdqa	and_mask(%rip),%xmm11
+	vmovdqa	one(%rip),%xmm8
+	vpshufd	$0x90,%xmm4,%xmm4
+	vpand	%xmm11,%xmm4,%xmm4
+	vpaddd	%xmm8,%xmm4,%xmm6
+	vpaddd	%xmm8,%xmm6,%xmm7
+	vpaddd	%xmm8,%xmm7,%xmm11
+	vpaddd	%xmm8,%xmm11,%xmm12
+	vpaddd	%xmm8,%xmm12,%xmm13
+
+	vpxor	%xmm1,%xmm4,%xmm4
+	vpxor	%xmm1,%xmm6,%xmm6
+	vpxor	%xmm1,%xmm7,%xmm7
+	vpxor	%xmm1,%xmm11,%xmm11
+	vpxor	%xmm1,%xmm12,%xmm12
+	vpxor	%xmm1,%xmm13,%xmm13
+
+	vmovdqa	16(%rdx),%xmm1
+	vaesenc	%xmm1,%xmm4,%xmm4
+	vaesenc	%xmm1,%xmm6,%xmm6
+	vaesenc	%xmm1,%xmm7,%xmm7
+	vaesenc	%xmm1,%xmm11,%xmm11
+	vaesenc	%xmm1,%xmm12,%xmm12
+	vaesenc	%xmm1,%xmm13,%xmm13
+
+	vmovdqa	32(%rdx),%xmm2
+	vaesenc	%xmm2,%xmm4,%xmm4
+	vaesenc	%xmm2,%xmm6,%xmm6
+	vaesenc	%xmm2,%xmm7,%xmm7
+	vaesenc	%xmm2,%xmm11,%xmm11
+	vaesenc	%xmm2,%xmm12,%xmm12
+	vaesenc	%xmm2,%xmm13,%xmm13
+
+	vmovdqa	48(%rdx),%xmm1
+	vaesenc	%xmm1,%xmm4,%xmm4
+	vaesenc	%xmm1,%xmm6,%xmm6
+	vaesenc	%xmm1,%xmm7,%xmm7
+	vaesenc	%xmm1,%xmm11,%xmm11
+	vaesenc	%xmm1,%xmm12,%xmm12
+	vaesenc	%xmm1,%xmm13,%xmm13
+
+	vmovdqa	64(%rdx),%xmm2
+	vaesenc	%xmm2,%xmm4,%xmm4
+	vaesenc	%xmm2,%xmm6,%xmm6
+	vaesenc	%xmm2,%xmm7,%xmm7
+	vaesenc	%xmm2,%xmm11,%xmm11
+	vaesenc	%xmm2,%xmm12,%xmm12
+	vaesenc	%xmm2,%xmm13,%xmm13
+
+	vmovdqa	80(%rdx),%xmm1
+	vaesenc	%xmm1,%xmm4,%xmm4
+	vaesenc	%xmm1,%xmm6,%xmm6
+	vaesenc	%xmm1,%xmm7,%xmm7
+	vaesenc	%xmm1,%xmm11,%xmm11
+	vaesenc	%xmm1,%xmm12,%xmm12
+	vaesenc	%xmm1,%xmm13,%xmm13
+
+	vmovdqa	96(%rdx),%xmm2
+	vaesenc	%xmm2,%xmm4,%xmm4
+	vaesenc	%xmm2,%xmm6,%xmm6
+	vaesenc	%xmm2,%xmm7,%xmm7
+	vaesenc	%xmm2,%xmm11,%xmm11
+	vaesenc	%xmm2,%xmm12,%xmm12
+	vaesenc	%xmm2,%xmm13,%xmm13
+
+	vmovdqa	112(%rdx),%xmm1
+	vaesenc	%xmm1,%xmm4,%xmm4
+	vaesenc	%xmm1,%xmm6,%xmm6
+	vaesenc	%xmm1,%xmm7,%xmm7
+	vaesenc	%xmm1,%xmm11,%xmm11
+	vaesenc	%xmm1,%xmm12,%xmm12
+	vaesenc	%xmm1,%xmm13,%xmm13
+
+	vmovdqa	128(%rdx),%xmm2
+	vaesenc	%xmm2,%xmm4,%xmm4
+	vaesenc	%xmm2,%xmm6,%xmm6
+	vaesenc	%xmm2,%xmm7,%xmm7
+	vaesenc	%xmm2,%xmm11,%xmm11
+	vaesenc	%xmm2,%xmm12,%xmm12
+	vaesenc	%xmm2,%xmm13,%xmm13
+
+	vmovdqa	144(%rdx),%xmm1
+	vaesenc	%xmm1,%xmm4,%xmm4
+	vaesenc	%xmm1,%xmm6,%xmm6
+	vaesenc	%xmm1,%xmm7,%xmm7
+	vaesenc	%xmm1,%xmm11,%xmm11
+	vaesenc	%xmm1,%xmm12,%xmm12
+	vaesenc	%xmm1,%xmm13,%xmm13
+
+	vmovdqa	160(%rdx),%xmm2
+	vaesenc	%xmm2,%xmm4,%xmm4
+	vaesenc	%xmm2,%xmm6,%xmm6
+	vaesenc	%xmm2,%xmm7,%xmm7
+	vaesenc	%xmm2,%xmm11,%xmm11
+	vaesenc	%xmm2,%xmm12,%xmm12
+	vaesenc	%xmm2,%xmm13,%xmm13
+
+	vmovdqa	176(%rdx),%xmm1
+	vaesenc	%xmm1,%xmm4,%xmm4
+	vaesenc	%xmm1,%xmm6,%xmm6
+	vaesenc	%xmm1,%xmm7,%xmm7
+	vaesenc	%xmm1,%xmm11,%xmm11
+	vaesenc	%xmm1,%xmm12,%xmm12
+	vaesenc	%xmm1,%xmm13,%xmm13
+
+	vmovdqa	192(%rdx),%xmm2
+	vaesenc	%xmm2,%xmm4,%xmm4
+	vaesenc	%xmm2,%xmm6,%xmm6
+	vaesenc	%xmm2,%xmm7,%xmm7
+	vaesenc	%xmm2,%xmm11,%xmm11
+	vaesenc	%xmm2,%xmm12,%xmm12
+	vaesenc	%xmm2,%xmm13,%xmm13
+
+	vmovdqa	208(%rdx),%xmm1
+	vaesenc	%xmm1,%xmm4,%xmm4
+	vaesenc	%xmm1,%xmm6,%xmm6
+	vaesenc	%xmm1,%xmm7,%xmm7
+	vaesenc	%xmm1,%xmm11,%xmm11
+	vaesenc	%xmm1,%xmm12,%xmm12
+	vaesenc	%xmm1,%xmm13,%xmm13
+
+	vmovdqa	224(%rdx),%xmm2
+	vaesenclast	%xmm2,%xmm4,%xmm4
+	vaesenclast	%xmm2,%xmm6,%xmm6
+	vaesenclast	%xmm2,%xmm7,%xmm7
+	vaesenclast	%xmm2,%xmm11,%xmm11
+	vaesenclast	%xmm2,%xmm12,%xmm12
+	vaesenclast	%xmm2,%xmm13,%xmm13
+
+
+	vmovdqa	%xmm4,0(%rsi)
+	vmovdqa	%xmm6,16(%rsi)
+	vmovdqa	%xmm7,32(%rsi)
+	vmovdqa	%xmm11,48(%rsi)
+	vmovdqa	%xmm12,64(%rsi)
+	vmovdqa	%xmm13,80(%rsi)
+	.byte	0xf3,0xc3
+.cfi_endproc	
+.size	aes256gcmsiv_kdf, .-aes256gcmsiv_kdf
+#endif
+.section	.note.GNU-stack,"",@progbits
--- /dev/null
+++ b/third_party/boringssl/linux-x86_64/crypto/cipher_extra/chacha20_poly1305_x86_64.S
@@ -1,0 +1,8922 @@
+// This file is generated from a similarly-named Perl script in the BoringSSL
+// source tree. Do not edit by hand.
+
+#if defined(__has_feature)
+#if __has_feature(memory_sanitizer) && !defined(OPENSSL_NO_ASM)
+#define OPENSSL_NO_ASM
+#endif
+#endif
+
+#if defined(__x86_64__) && !defined(OPENSSL_NO_ASM)
+#if defined(BORINGSSL_PREFIX)
+#include <boringssl_prefix_symbols_asm.h>
+#endif
+.text	
+.extern	OPENSSL_ia32cap_P
+.hidden OPENSSL_ia32cap_P
+
+chacha20_poly1305_constants:
+
+.align	64
+.Lchacha20_consts:
+.byte	'e','x','p','a','n','d',' ','3','2','-','b','y','t','e',' ','k'
+.byte	'e','x','p','a','n','d',' ','3','2','-','b','y','t','e',' ','k'
+.Lrol8:
+.byte	3,0,1,2, 7,4,5,6, 11,8,9,10, 15,12,13,14
+.byte	3,0,1,2, 7,4,5,6, 11,8,9,10, 15,12,13,14
+.Lrol16:
+.byte	2,3,0,1, 6,7,4,5, 10,11,8,9, 14,15,12,13
+.byte	2,3,0,1, 6,7,4,5, 10,11,8,9, 14,15,12,13
+.Lavx2_init:
+.long	0,0,0,0
+.Lsse_inc:
+.long	1,0,0,0
+.Lavx2_inc:
+.long	2,0,0,0,2,0,0,0
+.Lclamp:
+.quad	0x0FFFFFFC0FFFFFFF, 0x0FFFFFFC0FFFFFFC
+.quad	0xFFFFFFFFFFFFFFFF, 0xFFFFFFFFFFFFFFFF
+.align	16
+.Land_masks:
+.byte	0xff,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00
+.byte	0xff,0xff,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00
+.byte	0xff,0xff,0xff,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00
+.byte	0xff,0xff,0xff,0xff,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00
+.byte	0xff,0xff,0xff,0xff,0xff,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00
+.byte	0xff,0xff,0xff,0xff,0xff,0xff,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00
+.byte	0xff,0xff,0xff,0xff,0xff,0xff,0xff,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00
+.byte	0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00
+.byte	0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0x00,0x00,0x00,0x00,0x00,0x00,0x00
+.byte	0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0x00,0x00,0x00,0x00,0x00,0x00
+.byte	0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0x00,0x00,0x00,0x00,0x00
+.byte	0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0x00,0x00,0x00,0x00
+.byte	0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0x00,0x00,0x00
+.byte	0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0x00,0x00
+.byte	0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0x00
+.byte	0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff
+
+.type	poly_hash_ad_internal,@function
+.align	64
+poly_hash_ad_internal:
+.cfi_startproc	
+.cfi_def_cfa	rsp, 8
+	xorq	%r10,%r10
+	xorq	%r11,%r11
+	xorq	%r12,%r12
+	cmpq	$13,%r8
+	jne	.Lhash_ad_loop
+.Lpoly_fast_tls_ad:
+
+	movq	(%rcx),%r10
+	movq	5(%rcx),%r11
+	shrq	$24,%r11
+	movq	$1,%r12
+	movq	0+0+0(%rbp),%rax
+	movq	%rax,%r15
+	mulq	%r10
+	movq	%rax,%r13
+	movq	%rdx,%r14
+	movq	0+0+0(%rbp),%rax
+	mulq	%r11
+	imulq	%r12,%r15
+	addq	%rax,%r14
+	adcq	%rdx,%r15
+	movq	8+0+0(%rbp),%rax
+	movq	%rax,%r9
+	mulq	%r10
+	addq	%rax,%r14
+	adcq	$0,%rdx
+	movq	%rdx,%r10
+	movq	8+0+0(%rbp),%rax
+	mulq	%r11
+	addq	%rax,%r15
+	adcq	$0,%rdx
+	imulq	%r12,%r9
+	addq	%r10,%r15
+	adcq	%rdx,%r9
+	movq	%r13,%r10
+	movq	%r14,%r11
+	movq	%r15,%r12
+	andq	$3,%r12
+	movq	%r15,%r13
+	andq	$-4,%r13
+	movq	%r9,%r14
+	shrdq	$2,%r9,%r15
+	shrq	$2,%r9
+	addq	%r13,%r15
+	adcq	%r14,%r9
+	addq	%r15,%r10
+	adcq	%r9,%r11
+	adcq	$0,%r12
+
+	.byte	0xf3,0xc3
+.Lhash_ad_loop:
+
+	cmpq	$16,%r8
+	jb	.Lhash_ad_tail
+	addq	0+0(%rcx),%r10
+	adcq	8+0(%rcx),%r11
+	adcq	$1,%r12
+	movq	0+0+0(%rbp),%rax
+	movq	%rax,%r15
+	mulq	%r10
+	movq	%rax,%r13
+	movq	%rdx,%r14
+	movq	0+0+0(%rbp),%rax
+	mulq	%r11
+	imulq	%r12,%r15
+	addq	%rax,%r14
+	adcq	%rdx,%r15
+	movq	8+0+0(%rbp),%rax
+	movq	%rax,%r9
+	mulq	%r10
+	addq	%rax,%r14
+	adcq	$0,%rdx
+	movq	%rdx,%r10
+	movq	8+0+0(%rbp),%rax
+	mulq	%r11
+	addq	%rax,%r15
+	adcq	$0,%rdx
+	imulq	%r12,%r9
+	addq	%r10,%r15
+	adcq	%rdx,%r9
+	movq	%r13,%r10
+	movq	%r14,%r11
+	movq	%r15,%r12
+	andq	$3,%r12
+	movq	%r15,%r13
+	andq	$-4,%r13
+	movq	%r9,%r14
+	shrdq	$2,%r9,%r15
+	shrq	$2,%r9
+	addq	%r13,%r15
+	adcq	%r14,%r9
+	addq	%r15,%r10
+	adcq	%r9,%r11
+	adcq	$0,%r12
+
+	leaq	16(%rcx),%rcx
+	subq	$16,%r8
+	jmp	.Lhash_ad_loop
+.Lhash_ad_tail:
+	cmpq	$0,%r8
+	je	.Lhash_ad_done
+
+	xorq	%r13,%r13
+	xorq	%r14,%r14
+	xorq	%r15,%r15
+	addq	%r8,%rcx
+.Lhash_ad_tail_loop:
+	shldq	$8,%r13,%r14
+	shlq	$8,%r13
+	movzbq	-1(%rcx),%r15
+	xorq	%r15,%r13
+	decq	%rcx
+	decq	%r8
+	jne	.Lhash_ad_tail_loop
+
+	addq	%r13,%r10
+	adcq	%r14,%r11
+	adcq	$1,%r12
+	movq	0+0+0(%rbp),%rax
+	movq	%rax,%r15
+	mulq	%r10
+	movq	%rax,%r13
+	movq	%rdx,%r14
+	movq	0+0+0(%rbp),%rax
+	mulq	%r11
+	imulq	%r12,%r15
+	addq	%rax,%r14
+	adcq	%rdx,%r15
+	movq	8+0+0(%rbp),%rax
+	movq	%rax,%r9
+	mulq	%r10
+	addq	%rax,%r14
+	adcq	$0,%rdx
+	movq	%rdx,%r10
+	movq	8+0+0(%rbp),%rax
+	mulq	%r11
+	addq	%rax,%r15
+	adcq	$0,%rdx
+	imulq	%r12,%r9
+	addq	%r10,%r15
+	adcq	%rdx,%r9
+	movq	%r13,%r10
+	movq	%r14,%r11
+	movq	%r15,%r12
+	andq	$3,%r12
+	movq	%r15,%r13
+	andq	$-4,%r13
+	movq	%r9,%r14
+	shrdq	$2,%r9,%r15
+	shrq	$2,%r9
+	addq	%r13,%r15
+	adcq	%r14,%r9
+	addq	%r15,%r10
+	adcq	%r9,%r11
+	adcq	$0,%r12
+
+
+.Lhash_ad_done:
+	.byte	0xf3,0xc3
+.cfi_endproc	
+.size	poly_hash_ad_internal, .-poly_hash_ad_internal
+
+.globl	chacha20_poly1305_open
+.hidden chacha20_poly1305_open
+.type	chacha20_poly1305_open,@function
+.align	64
+chacha20_poly1305_open:
+.cfi_startproc	
+	pushq	%rbp
+.cfi_adjust_cfa_offset	8
+.cfi_offset	%rbp,-16
+	pushq	%rbx
+.cfi_adjust_cfa_offset	8
+.cfi_offset	%rbx,-24
+	pushq	%r12
+.cfi_adjust_cfa_offset	8
+.cfi_offset	%r12,-32
+	pushq	%r13
+.cfi_adjust_cfa_offset	8
+.cfi_offset	%r13,-40
+	pushq	%r14
+.cfi_adjust_cfa_offset	8
+.cfi_offset	%r14,-48
+	pushq	%r15
+.cfi_adjust_cfa_offset	8
+.cfi_offset	%r15,-56
+
+
+	pushq	%r9
+.cfi_adjust_cfa_offset	8
+.cfi_offset	%r9,-64
+	subq	$288 + 0 + 32,%rsp
+.cfi_adjust_cfa_offset	288 + 32
+
+	leaq	32(%rsp),%rbp
+	andq	$-32,%rbp
+
+	movq	%rdx,%rbx
+	movq	%r8,0+0+32(%rbp)
+	movq	%rbx,8+0+32(%rbp)
+
+	movl	OPENSSL_ia32cap_P+8(%rip),%eax
+	andl	$288,%eax
+	xorl	$288,%eax
+	jz	chacha20_poly1305_open_avx2
+
+	cmpq	$128,%rbx
+	jbe	.Lopen_sse_128
+
+	movdqa	.Lchacha20_consts(%rip),%xmm0
+	movdqu	0(%r9),%xmm4
+	movdqu	16(%r9),%xmm8
+	movdqu	32(%r9),%xmm12
+
+	movdqa	%xmm12,%xmm7
+
+	movdqa	%xmm4,0+48(%rbp)
+	movdqa	%xmm8,0+64(%rbp)
+	movdqa	%xmm12,0+96(%rbp)
+	movq	$10,%r10
+.Lopen_sse_init_rounds:
+	paddd	%xmm4,%xmm0
+	pxor	%xmm0,%xmm12
+	pshufb	.Lrol16(%rip),%xmm12
+	paddd	%xmm12,%xmm8
+	pxor	%xmm8,%xmm4
+	movdqa	%xmm4,%xmm3
+	pslld	$12,%xmm3
+	psrld	$20,%xmm4
+	pxor	%xmm3,%xmm4
+	paddd	%xmm4,%xmm0
+	pxor	%xmm0,%xmm12
+	pshufb	.Lrol8(%rip),%xmm12
+	paddd	%xmm12,%xmm8
+	pxor	%xmm8,%xmm4
+	movdqa	%xmm4,%xmm3
+	pslld	$7,%xmm3
+	psrld	$25,%xmm4
+	pxor	%xmm3,%xmm4
+.byte	102,15,58,15,228,4
+.byte	102,69,15,58,15,192,8
+.byte	102,69,15,58,15,228,12
+	paddd	%xmm4,%xmm0
+	pxor	%xmm0,%xmm12
+	pshufb	.Lrol16(%rip),%xmm12
+	paddd	%xmm12,%xmm8
+	pxor	%xmm8,%xmm4
+	movdqa	%xmm4,%xmm3
+	pslld	$12,%xmm3
+	psrld	$20,%xmm4
+	pxor	%xmm3,%xmm4
+	paddd	%xmm4,%xmm0
+	pxor	%xmm0,%xmm12
+	pshufb	.Lrol8(%rip),%xmm12
+	paddd	%xmm12,%xmm8
+	pxor	%xmm8,%xmm4
+	movdqa	%xmm4,%xmm3
+	pslld	$7,%xmm3
+	psrld	$25,%xmm4
+	pxor	%xmm3,%xmm4
+.byte	102,15,58,15,228,12
+.byte	102,69,15,58,15,192,8
+.byte	102,69,15,58,15,228,4
+
+	decq	%r10
+	jne	.Lopen_sse_init_rounds
+
+	paddd	.Lchacha20_consts(%rip),%xmm0
+	paddd	0+48(%rbp),%xmm4
+
+	pand	.Lclamp(%rip),%xmm0
+	movdqa	%xmm0,0+0(%rbp)
+	movdqa	%xmm4,0+16(%rbp)
+
+	movq	%r8,%r8
+	call	poly_hash_ad_internal
+.Lopen_sse_main_loop:
+	cmpq	$256,%rbx
+	jb	.Lopen_sse_tail
+
+	movdqa	.Lchacha20_consts(%rip),%xmm0
+	movdqa	0+48(%rbp),%xmm4
+	movdqa	0+64(%rbp),%xmm8
+	movdqa	%xmm0,%xmm1
+	movdqa	%xmm4,%xmm5
+	movdqa	%xmm8,%xmm9
+	movdqa	%xmm0,%xmm2
+	movdqa	%xmm4,%xmm6
+	movdqa	%xmm8,%xmm10
+	movdqa	%xmm0,%xmm3
+	movdqa	%xmm4,%xmm7
+	movdqa	%xmm8,%xmm11
+	movdqa	0+96(%rbp),%xmm15
+	paddd	.Lsse_inc(%rip),%xmm15
+	movdqa	%xmm15,%xmm14
+	paddd	.Lsse_inc(%rip),%xmm14
+	movdqa	%xmm14,%xmm13
+	paddd	.Lsse_inc(%rip),%xmm13
+	movdqa	%xmm13,%xmm12
+	paddd	.Lsse_inc(%rip),%xmm12
+	movdqa	%xmm12,0+96(%rbp)
+	movdqa	%xmm13,0+112(%rbp)
+	movdqa	%xmm14,0+128(%rbp)
+	movdqa	%xmm15,0+144(%rbp)
+
+
+
+	movq	$4,%rcx
+	movq	%rsi,%r8
+.Lopen_sse_main_loop_rounds:
+	movdqa	%xmm8,0+80(%rbp)
+	movdqa	.Lrol16(%rip),%xmm8
+	paddd	%xmm7,%xmm3
+	paddd	%xmm6,%xmm2
+	paddd	%xmm5,%xmm1
+	paddd	%xmm4,%xmm0
+	pxor	%xmm3,%xmm15
+	pxor	%xmm2,%xmm14
+	pxor	%xmm1,%xmm13
+	pxor	%xmm0,%xmm12
+.byte	102,69,15,56,0,248
+.byte	102,69,15,56,0,240
+.byte	102,69,15,56,0,232
+.byte	102,69,15,56,0,224
+	movdqa	0+80(%rbp),%xmm8
+	paddd	%xmm15,%xmm11
+	paddd	%xmm14,%xmm10
+	paddd	%xmm13,%xmm9
+	paddd	%xmm12,%xmm8
+	pxor	%xmm11,%xmm7
+	addq	0+0(%r8),%r10
+	adcq	8+0(%r8),%r11
+	adcq	$1,%r12
+
+	leaq	16(%r8),%r8
+	pxor	%xmm10,%xmm6
+	pxor	%xmm9,%xmm5
+	pxor	%xmm8,%xmm4
+	movdqa	%xmm8,0+80(%rbp)
+	movdqa	%xmm7,%xmm8
+	psrld	$20,%xmm8
+	pslld	$32-20,%xmm7
+	pxor	%xmm8,%xmm7
+	movdqa	%xmm6,%xmm8
+	psrld	$20,%xmm8
+	pslld	$32-20,%xmm6
+	pxor	%xmm8,%xmm6
+	movdqa	%xmm5,%xmm8
+	psrld	$20,%xmm8
+	pslld	$32-20,%xmm5
+	pxor	%xmm8,%xmm5
+	movdqa	%xmm4,%xmm8
+	psrld	$20,%xmm8
+	pslld	$32-20,%xmm4
+	pxor	%xmm8,%xmm4
+	movq	0+0+0(%rbp),%rax
+	movq	%rax,%r15
+	mulq	%r10
+	movq	%rax,%r13
+	movq	%rdx,%r14
+	movq	0+0+0(%rbp),%rax
+	mulq	%r11
+	imulq	%r12,%r15
+	addq	%rax,%r14
+	adcq	%rdx,%r15
+	movdqa	.Lrol8(%rip),%xmm8
+	paddd	%xmm7,%xmm3
+	paddd	%xmm6,%xmm2
+	paddd	%xmm5,%xmm1
+	paddd	%xmm4,%xmm0
+	pxor	%xmm3,%xmm15
+	pxor	%xmm2,%xmm14
+	pxor	%xmm1,%xmm13
+	pxor	%xmm0,%xmm12
+.byte	102,69,15,56,0,248
+.byte	102,69,15,56,0,240
+.byte	102,69,15,56,0,232
+.byte	102,69,15,56,0,224
+	movdqa	0+80(%rbp),%xmm8
+	paddd	%xmm15,%xmm11
+	paddd	%xmm14,%xmm10
+	paddd	%xmm13,%xmm9
+	paddd	%xmm12,%xmm8
+	pxor	%xmm11,%xmm7
+	pxor	%xmm10,%xmm6
+	movq	8+0+0(%rbp),%rax
+	movq	%rax,%r9
+	mulq	%r10
+	addq	%rax,%r14
+	adcq	$0,%rdx
+	movq	%rdx,%r10
+	movq	8+0+0(%rbp),%rax
+	mulq	%r11
+	addq	%rax,%r15
+	adcq	$0,%rdx
+	pxor	%xmm9,%xmm5
+	pxor	%xmm8,%xmm4
+	movdqa	%xmm8,0+80(%rbp)
+	movdqa	%xmm7,%xmm8
+	psrld	$25,%xmm8
+	pslld	$32-25,%xmm7
+	pxor	%xmm8,%xmm7
+	movdqa	%xmm6,%xmm8
+	psrld	$25,%xmm8
+	pslld	$32-25,%xmm6
+	pxor	%xmm8,%xmm6
+	movdqa	%xmm5,%xmm8
+	psrld	$25,%xmm8
+	pslld	$32-25,%xmm5
+	pxor	%xmm8,%xmm5
+	movdqa	%xmm4,%xmm8
+	psrld	$25,%xmm8
+	pslld	$32-25,%xmm4
+	pxor	%xmm8,%xmm4
+	movdqa	0+80(%rbp),%xmm8
+	imulq	%r12,%r9
+	addq	%r10,%r15
+	adcq	%rdx,%r9
+.byte	102,15,58,15,255,4
+.byte	102,69,15,58,15,219,8
+.byte	102,69,15,58,15,255,12
+.byte	102,15,58,15,246,4
+.byte	102,69,15,58,15,210,8
+.byte	102,69,15,58,15,246,12
+.byte	102,15,58,15,237,4
+.byte	102,69,15,58,15,201,8
+.byte	102,69,15,58,15,237,12
+.byte	102,15,58,15,228,4
+.byte	102,69,15,58,15,192,8
+.byte	102,69,15,58,15,228,12
+	movdqa	%xmm8,0+80(%rbp)
+	movdqa	.Lrol16(%rip),%xmm8
+	paddd	%xmm7,%xmm3
+	paddd	%xmm6,%xmm2
+	paddd	%xmm5,%xmm1
+	paddd	%xmm4,%xmm0
+	pxor	%xmm3,%xmm15
+	pxor	%xmm2,%xmm14
+	movq	%r13,%r10
+	movq	%r14,%r11
+	movq	%r15,%r12
+	andq	$3,%r12
+	movq	%r15,%r13
+	andq	$-4,%r13
+	movq	%r9,%r14
+	shrdq	$2,%r9,%r15
+	shrq	$2,%r9
+	addq	%r13,%r15
+	adcq	%r14,%r9
+	addq	%r15,%r10
+	adcq	%r9,%r11
+	adcq	$0,%r12
+	pxor	%xmm1,%xmm13
+	pxor	%xmm0,%xmm12
+.byte	102,69,15,56,0,248
+.byte	102,69,15,56,0,240
+.byte	102,69,15,56,0,232
+.byte	102,69,15,56,0,224
+	movdqa	0+80(%rbp),%xmm8
+	paddd	%xmm15,%xmm11
+	paddd	%xmm14,%xmm10
+	paddd	%xmm13,%xmm9
+	paddd	%xmm12,%xmm8
+	pxor	%xmm11,%xmm7
+	pxor	%xmm10,%xmm6
+	pxor	%xmm9,%xmm5
+	pxor	%xmm8,%xmm4
+	movdqa	%xmm8,0+80(%rbp)
+	movdqa	%xmm7,%xmm8
+	psrld	$20,%xmm8
+	pslld	$32-20,%xmm7
+	pxor	%xmm8,%xmm7
+	movdqa	%xmm6,%xmm8
+	psrld	$20,%xmm8
+	pslld	$32-20,%xmm6
+	pxor	%xmm8,%xmm6
+	movdqa	%xmm5,%xmm8
+	psrld	$20,%xmm8
+	pslld	$32-20,%xmm5
+	pxor	%xmm8,%xmm5
+	movdqa	%xmm4,%xmm8
+	psrld	$20,%xmm8
+	pslld	$32-20,%xmm4
+	pxor	%xmm8,%xmm4
+	movdqa	.Lrol8(%rip),%xmm8
+	paddd	%xmm7,%xmm3
+	paddd	%xmm6,%xmm2
+	paddd	%xmm5,%xmm1
+	paddd	%xmm4,%xmm0
+	pxor	%xmm3,%xmm15
+	pxor	%xmm2,%xmm14
+	pxor	%xmm1,%xmm13
+	pxor	%xmm0,%xmm12
+.byte	102,69,15,56,0,248
+.byte	102,69,15,56,0,240
+.byte	102,69,15,56,0,232
+.byte	102,69,15,56,0,224
+	movdqa	0+80(%rbp),%xmm8
+	paddd	%xmm15,%xmm11
+	paddd	%xmm14,%xmm10
+	paddd	%xmm13,%xmm9
+	paddd	%xmm12,%xmm8
+	pxor	%xmm11,%xmm7
+	pxor	%xmm10,%xmm6
+	pxor	%xmm9,%xmm5
+	pxor	%xmm8,%xmm4
+	movdqa	%xmm8,0+80(%rbp)
+	movdqa	%xmm7,%xmm8
+	psrld	$25,%xmm8
+	pslld	$32-25,%xmm7
+	pxor	%xmm8,%xmm7
+	movdqa	%xmm6,%xmm8
+	psrld	$25,%xmm8
+	pslld	$32-25,%xmm6
+	pxor	%xmm8,%xmm6
+	movdqa	%xmm5,%xmm8
+	psrld	$25,%xmm8
+	pslld	$32-25,%xmm5
+	pxor	%xmm8,%xmm5
+	movdqa	%xmm4,%xmm8
+	psrld	$25,%xmm8
+	pslld	$32-25,%xmm4
+	pxor	%xmm8,%xmm4
+	movdqa	0+80(%rbp),%xmm8
+.byte	102,15,58,15,255,12
+.byte	102,69,15,58,15,219,8
+.byte	102,69,15,58,15,255,4
+.byte	102,15,58,15,246,12
+.byte	102,69,15,58,15,210,8
+.byte	102,69,15,58,15,246,4
+.byte	102,15,58,15,237,12
+.byte	102,69,15,58,15,201,8
+.byte	102,69,15,58,15,237,4
+.byte	102,15,58,15,228,12
+.byte	102,69,15,58,15,192,8
+.byte	102,69,15,58,15,228,4
+
+	decq	%rcx
+	jge	.Lopen_sse_main_loop_rounds
+	addq	0+0(%r8),%r10
+	adcq	8+0(%r8),%r11
+	adcq	$1,%r12
+	movq	0+0+0(%rbp),%rax
+	movq	%rax,%r15
+	mulq	%r10
+	movq	%rax,%r13
+	movq	%rdx,%r14
+	movq	0+0+0(%rbp),%rax
+	mulq	%r11
+	imulq	%r12,%r15
+	addq	%rax,%r14
+	adcq	%rdx,%r15
+	movq	8+0+0(%rbp),%rax
+	movq	%rax,%r9
+	mulq	%r10
+	addq	%rax,%r14
+	adcq	$0,%rdx
+	movq	%rdx,%r10
+	movq	8+0+0(%rbp),%rax
+	mulq	%r11
+	addq	%rax,%r15
+	adcq	$0,%rdx
+	imulq	%r12,%r9
+	addq	%r10,%r15
+	adcq	%rdx,%r9
+	movq	%r13,%r10
+	movq	%r14,%r11
+	movq	%r15,%r12
+	andq	$3,%r12
+	movq	%r15,%r13
+	andq	$-4,%r13
+	movq	%r9,%r14
+	shrdq	$2,%r9,%r15
+	shrq	$2,%r9
+	addq	%r13,%r15
+	adcq	%r14,%r9
+	addq	%r15,%r10
+	adcq	%r9,%r11
+	adcq	$0,%r12
+
+	leaq	16(%r8),%r8
+	cmpq	$-6,%rcx
+	jg	.Lopen_sse_main_loop_rounds
+	paddd	.Lchacha20_consts(%rip),%xmm3
+	paddd	0+48(%rbp),%xmm7
+	paddd	0+64(%rbp),%xmm11
+	paddd	0+144(%rbp),%xmm15
+	paddd	.Lchacha20_consts(%rip),%xmm2
+	paddd	0+48(%rbp),%xmm6
+	paddd	0+64(%rbp),%xmm10
+	paddd	0+128(%rbp),%xmm14
+	paddd	.Lchacha20_consts(%rip),%xmm1
+	paddd	0+48(%rbp),%xmm5
+	paddd	0+64(%rbp),%xmm9
+	paddd	0+112(%rbp),%xmm13
+	paddd	.Lchacha20_consts(%rip),%xmm0
+	paddd	0+48(%rbp),%xmm4
+	paddd	0+64(%rbp),%xmm8
+	paddd	0+96(%rbp),%xmm12
+	movdqa	%xmm12,0+80(%rbp)
+	movdqu	0 + 0(%rsi),%xmm12
+	pxor	%xmm3,%xmm12
+	movdqu	%xmm12,0 + 0(%rdi)
+	movdqu	16 + 0(%rsi),%xmm12
+	pxor	%xmm7,%xmm12
+	movdqu	%xmm12,16 + 0(%rdi)
+	movdqu	32 + 0(%rsi),%xmm12
+	pxor	%xmm11,%xmm12
+	movdqu	%xmm12,32 + 0(%rdi)
+	movdqu	48 + 0(%rsi),%xmm12
+	pxor	%xmm15,%xmm12
+	movdqu	%xmm12,48 + 0(%rdi)
+	movdqu	0 + 64(%rsi),%xmm3
+	movdqu	16 + 64(%rsi),%xmm7
+	movdqu	32 + 64(%rsi),%xmm11
+	movdqu	48 + 64(%rsi),%xmm15
+	pxor	%xmm3,%xmm2
+	pxor	%xmm7,%xmm6
+	pxor	%xmm11,%xmm10
+	pxor	%xmm14,%xmm15
+	movdqu	%xmm2,0 + 64(%rdi)
+	movdqu	%xmm6,16 + 64(%rdi)
+	movdqu	%xmm10,32 + 64(%rdi)
+	movdqu	%xmm15,48 + 64(%rdi)
+	movdqu	0 + 128(%rsi),%xmm3
+	movdqu	16 + 128(%rsi),%xmm7
+	movdqu	32 + 128(%rsi),%xmm11
+	movdqu	48 + 128(%rsi),%xmm15
+	pxor	%xmm3,%xmm1
+	pxor	%xmm7,%xmm5
+	pxor	%xmm11,%xmm9
+	pxor	%xmm13,%xmm15
+	movdqu	%xmm1,0 + 128(%rdi)
+	movdqu	%xmm5,16 + 128(%rdi)
+	movdqu	%xmm9,32 + 128(%rdi)
+	movdqu	%xmm15,48 + 128(%rdi)
+	movdqu	0 + 192(%rsi),%xmm3
+	movdqu	16 + 192(%rsi),%xmm7
+	movdqu	32 + 192(%rsi),%xmm11
+	movdqu	48 + 192(%rsi),%xmm15
+	pxor	%xmm3,%xmm0
+	pxor	%xmm7,%xmm4
+	pxor	%xmm11,%xmm8
+	pxor	0+80(%rbp),%xmm15
+	movdqu	%xmm0,0 + 192(%rdi)
+	movdqu	%xmm4,16 + 192(%rdi)
+	movdqu	%xmm8,32 + 192(%rdi)
+	movdqu	%xmm15,48 + 192(%rdi)
+
+	leaq	256(%rsi),%rsi
+	leaq	256(%rdi),%rdi
+	subq	$256,%rbx
+	jmp	.Lopen_sse_main_loop
+.Lopen_sse_tail:
+
+	testq	%rbx,%rbx
+	jz	.Lopen_sse_finalize
+	cmpq	$192,%rbx
+	ja	.Lopen_sse_tail_256
+	cmpq	$128,%rbx
+	ja	.Lopen_sse_tail_192
+	cmpq	$64,%rbx
+	ja	.Lopen_sse_tail_128
+	movdqa	.Lchacha20_consts(%rip),%xmm0
+	movdqa	0+48(%rbp),%xmm4
+	movdqa	0+64(%rbp),%xmm8
+	movdqa	0+96(%rbp),%xmm12
+	paddd	.Lsse_inc(%rip),%xmm12
+	movdqa	%xmm12,0+96(%rbp)
+
+	xorq	%r8,%r8
+	movq	%rbx,%rcx
+	cmpq	$16,%rcx
+	jb	.Lopen_sse_tail_64_rounds
+.Lopen_sse_tail_64_rounds_and_x1hash:
+	addq	0+0(%rsi,%r8,1),%r10
+	adcq	8+0(%rsi,%r8,1),%r11
+	adcq	$1,%r12
+	movq	0+0+0(%rbp),%rax
+	movq	%rax,%r15
+	mulq	%r10
+	movq	%rax,%r13
+	movq	%rdx,%r14
+	movq	0+0+0(%rbp),%rax
+	mulq	%r11
+	imulq	%r12,%r15
+	addq	%rax,%r14
+	adcq	%rdx,%r15
+	movq	8+0+0(%rbp),%rax
+	movq	%rax,%r9
+	mulq	%r10
+	addq	%rax,%r14
+	adcq	$0,%rdx
+	movq	%rdx,%r10
+	movq	8+0+0(%rbp),%rax
+	mulq	%r11
+	addq	%rax,%r15
+	adcq	$0,%rdx
+	imulq	%r12,%r9
+	addq	%r10,%r15
+	adcq	%rdx,%r9
+	movq	%r13,%r10
+	movq	%r14,%r11
+	movq	%r15,%r12
+	andq	$3,%r12
+	movq	%r15,%r13
+	andq	$-4,%r13
+	movq	%r9,%r14
+	shrdq	$2,%r9,%r15
+	shrq	$2,%r9
+	addq	%r13,%r15
+	adcq	%r14,%r9
+	addq	%r15,%r10
+	adcq	%r9,%r11
+	adcq	$0,%r12
+
+	subq	$16,%rcx
+.Lopen_sse_tail_64_rounds:
+	addq	$16,%r8
+	paddd	%xmm4,%xmm0
+	pxor	%xmm0,%xmm12
+	pshufb	.Lrol16(%rip),%xmm12
+	paddd	%xmm12,%xmm8
+	pxor	%xmm8,%xmm4
+	movdqa	%xmm4,%xmm3
+	pslld	$12,%xmm3
+	psrld	$20,%xmm4
+	pxor	%xmm3,%xmm4
+	paddd	%xmm4,%xmm0
+	pxor	%xmm0,%xmm12
+	pshufb	.Lrol8(%rip),%xmm12
+	paddd	%xmm12,%xmm8
+	pxor	%xmm8,%xmm4
+	movdqa	%xmm4,%xmm3
+	pslld	$7,%xmm3
+	psrld	$25,%xmm4
+	pxor	%xmm3,%xmm4
+.byte	102,15,58,15,228,4
+.byte	102,69,15,58,15,192,8
+.byte	102,69,15,58,15,228,12
+	paddd	%xmm4,%xmm0
+	pxor	%xmm0,%xmm12
+	pshufb	.Lrol16(%rip),%xmm12
+	paddd	%xmm12,%xmm8
+	pxor	%xmm8,%xmm4
+	movdqa	%xmm4,%xmm3
+	pslld	$12,%xmm3
+	psrld	$20,%xmm4
+	pxor	%xmm3,%xmm4
+	paddd	%xmm4,%xmm0
+	pxor	%xmm0,%xmm12
+	pshufb	.Lrol8(%rip),%xmm12
+	paddd	%xmm12,%xmm8
+	pxor	%xmm8,%xmm4
+	movdqa	%xmm4,%xmm3
+	pslld	$7,%xmm3
+	psrld	$25,%xmm4
+	pxor	%xmm3,%xmm4
+.byte	102,15,58,15,228,12
+.byte	102,69,15,58,15,192,8
+.byte	102,69,15,58,15,228,4
+
+	cmpq	$16,%rcx
+	jae	.Lopen_sse_tail_64_rounds_and_x1hash
+	cmpq	$160,%r8
+	jne	.Lopen_sse_tail_64_rounds
+	paddd	.Lchacha20_consts(%rip),%xmm0
+	paddd	0+48(%rbp),%xmm4
+	paddd	0+64(%rbp),%xmm8
+	paddd	0+96(%rbp),%xmm12
+
+	jmp	.Lopen_sse_tail_64_dec_loop
+
+.Lopen_sse_tail_128:
+	movdqa	.Lchacha20_consts(%rip),%xmm0
+	movdqa	0+48(%rbp),%xmm4
+	movdqa	0+64(%rbp),%xmm8
+	movdqa	%xmm0,%xmm1
+	movdqa	%xmm4,%xmm5
+	movdqa	%xmm8,%xmm9
+	movdqa	0+96(%rbp),%xmm13
+	paddd	.Lsse_inc(%rip),%xmm13
+	movdqa	%xmm13,%xmm12
+	paddd	.Lsse_inc(%rip),%xmm12
+	movdqa	%xmm12,0+96(%rbp)
+	movdqa	%xmm13,0+112(%rbp)
+
+	movq	%rbx,%rcx
+	andq	$-16,%rcx
+	xorq	%r8,%r8
+.Lopen_sse_tail_128_rounds_and_x1hash:
+	addq	0+0(%rsi,%r8,1),%r10
+	adcq	8+0(%rsi,%r8,1),%r11
+	adcq	$1,%r12
+	movq	0+0+0(%rbp),%rax
+	movq	%rax,%r15
+	mulq	%r10
+	movq	%rax,%r13
+	movq	%rdx,%r14
+	movq	0+0+0(%rbp),%rax
+	mulq	%r11
+	imulq	%r12,%r15
+	addq	%rax,%r14
+	adcq	%rdx,%r15
+	movq	8+0+0(%rbp),%rax
+	movq	%rax,%r9
+	mulq	%r10
+	addq	%rax,%r14
+	adcq	$0,%rdx
+	movq	%rdx,%r10
+	movq	8+0+0(%rbp),%rax
+	mulq	%r11
+	addq	%rax,%r15
+	adcq	$0,%rdx
+	imulq	%r12,%r9
+	addq	%r10,%r15
+	adcq	%rdx,%r9
+	movq	%r13,%r10
+	movq	%r14,%r11
+	movq	%r15,%r12
+	andq	$3,%r12
+	movq	%r15,%r13
+	andq	$-4,%r13
+	movq	%r9,%r14
+	shrdq	$2,%r9,%r15
+	shrq	$2,%r9
+	addq	%r13,%r15
+	adcq	%r14,%r9
+	addq	%r15,%r10
+	adcq	%r9,%r11
+	adcq	$0,%r12
+
+.Lopen_sse_tail_128_rounds:
+	addq	$16,%r8
+	paddd	%xmm4,%xmm0
+	pxor	%xmm0,%xmm12
+	pshufb	.Lrol16(%rip),%xmm12
+	paddd	%xmm12,%xmm8
+	pxor	%xmm8,%xmm4
+	movdqa	%xmm4,%xmm3
+	pslld	$12,%xmm3
+	psrld	$20,%xmm4
+	pxor	%xmm3,%xmm4
+	paddd	%xmm4,%xmm0
+	pxor	%xmm0,%xmm12
+	pshufb	.Lrol8(%rip),%xmm12
+	paddd	%xmm12,%xmm8
+	pxor	%xmm8,%xmm4
+	movdqa	%xmm4,%xmm3
+	pslld	$7,%xmm3
+	psrld	$25,%xmm4
+	pxor	%xmm3,%xmm4
+.byte	102,15,58,15,228,4
+.byte	102,69,15,58,15,192,8
+.byte	102,69,15,58,15,228,12
+	paddd	%xmm5,%xmm1
+	pxor	%xmm1,%xmm13
+	pshufb	.Lrol16(%rip),%xmm13
+	paddd	%xmm13,%xmm9
+	pxor	%xmm9,%xmm5
+	movdqa	%xmm5,%xmm3
+	pslld	$12,%xmm3
+	psrld	$20,%xmm5
+	pxor	%xmm3,%xmm5
+	paddd	%xmm5,%xmm1
+	pxor	%xmm1,%xmm13
+	pshufb	.Lrol8(%rip),%xmm13
+	paddd	%xmm13,%xmm9
+	pxor	%xmm9,%xmm5
+	movdqa	%xmm5,%xmm3
+	pslld	$7,%xmm3
+	psrld	$25,%xmm5
+	pxor	%xmm3,%xmm5
+.byte	102,15,58,15,237,4
+.byte	102,69,15,58,15,201,8
+.byte	102,69,15,58,15,237,12
+	paddd	%xmm4,%xmm0
+	pxor	%xmm0,%xmm12
+	pshufb	.Lrol16(%rip),%xmm12
+	paddd	%xmm12,%xmm8
+	pxor	%xmm8,%xmm4
+	movdqa	%xmm4,%xmm3
+	pslld	$12,%xmm3
+	psrld	$20,%xmm4
+	pxor	%xmm3,%xmm4
+	paddd	%xmm4,%xmm0
+	pxor	%xmm0,%xmm12
+	pshufb	.Lrol8(%rip),%xmm12
+	paddd	%xmm12,%xmm8
+	pxor	%xmm8,%xmm4
+	movdqa	%xmm4,%xmm3
+	pslld	$7,%xmm3
+	psrld	$25,%xmm4
+	pxor	%xmm3,%xmm4
+.byte	102,15,58,15,228,12
+.byte	102,69,15,58,15,192,8
+.byte	102,69,15,58,15,228,4
+	paddd	%xmm5,%xmm1
+	pxor	%xmm1,%xmm13
+	pshufb	.Lrol16(%rip),%xmm13
+	paddd	%xmm13,%xmm9
+	pxor	%xmm9,%xmm5
+	movdqa	%xmm5,%xmm3
+	pslld	$12,%xmm3
+	psrld	$20,%xmm5
+	pxor	%xmm3,%xmm5
+	paddd	%xmm5,%xmm1
+	pxor	%xmm1,%xmm13
+	pshufb	.Lrol8(%rip),%xmm13
+	paddd	%xmm13,%xmm9
+	pxor	%xmm9,%xmm5
+	movdqa	%xmm5,%xmm3
+	pslld	$7,%xmm3
+	psrld	$25,%xmm5
+	pxor	%xmm3,%xmm5
+.byte	102,15,58,15,237,12
+.byte	102,69,15,58,15,201,8
+.byte	102,69,15,58,15,237,4
+
+	cmpq	%rcx,%r8
+	jb	.Lopen_sse_tail_128_rounds_and_x1hash
+	cmpq	$160,%r8
+	jne	.Lopen_sse_tail_128_rounds
+	paddd	.Lchacha20_consts(%rip),%xmm1
+	paddd	0+48(%rbp),%xmm5
+	paddd	0+64(%rbp),%xmm9
+	paddd	0+112(%rbp),%xmm13
+	paddd	.Lchacha20_consts(%rip),%xmm0
+	paddd	0+48(%rbp),%xmm4
+	paddd	0+64(%rbp),%xmm8
+	paddd	0+96(%rbp),%xmm12
+	movdqu	0 + 0(%rsi),%xmm3
+	movdqu	16 + 0(%rsi),%xmm7
+	movdqu	32 + 0(%rsi),%xmm11
+	movdqu	48 + 0(%rsi),%xmm15
+	pxor	%xmm3,%xmm1
+	pxor	%xmm7,%xmm5
+	pxor	%xmm11,%xmm9
+	pxor	%xmm13,%xmm15
+	movdqu	%xmm1,0 + 0(%rdi)
+	movdqu	%xmm5,16 + 0(%rdi)
+	movdqu	%xmm9,32 + 0(%rdi)
+	movdqu	%xmm15,48 + 0(%rdi)
+
+	subq	$64,%rbx
+	leaq	64(%rsi),%rsi
+	leaq	64(%rdi),%rdi
+	jmp	.Lopen_sse_tail_64_dec_loop
+
+.Lopen_sse_tail_192:
+	movdqa	.Lchacha20_consts(%rip),%xmm0
+	movdqa	0+48(%rbp),%xmm4
+	movdqa	0+64(%rbp),%xmm8
+	movdqa	%xmm0,%xmm1
+	movdqa	%xmm4,%xmm5
+	movdqa	%xmm8,%xmm9
+	movdqa	%xmm0,%xmm2
+	movdqa	%xmm4,%xmm6
+	movdqa	%xmm8,%xmm10
+	movdqa	0+96(%rbp),%xmm14
+	paddd	.Lsse_inc(%rip),%xmm14
+	movdqa	%xmm14,%xmm13
+	paddd	.Lsse_inc(%rip),%xmm13
+	movdqa	%xmm13,%xmm12
+	paddd	.Lsse_inc(%rip),%xmm12
+	movdqa	%xmm12,0+96(%rbp)
+	movdqa	%xmm13,0+112(%rbp)
+	movdqa	%xmm14,0+128(%rbp)
+
+	movq	%rbx,%rcx
+	movq	$160,%r8
+	cmpq	$160,%rcx
+	cmovgq	%r8,%rcx
+	andq	$-16,%rcx
+	xorq	%r8,%r8
+.Lopen_sse_tail_192_rounds_and_x1hash:
+	addq	0+0(%rsi,%r8,1),%r10
+	adcq	8+0(%rsi,%r8,1),%r11
+	adcq	$1,%r12
+	movq	0+0+0(%rbp),%rax
+	movq	%rax,%r15
+	mulq	%r10
+	movq	%rax,%r13
+	movq	%rdx,%r14
+	movq	0+0+0(%rbp),%rax
+	mulq	%r11
+	imulq	%r12,%r15
+	addq	%rax,%r14
+	adcq	%rdx,%r15
+	movq	8+0+0(%rbp),%rax
+	movq	%rax,%r9
+	mulq	%r10
+	addq	%rax,%r14
+	adcq	$0,%rdx
+	movq	%rdx,%r10
+	movq	8+0+0(%rbp),%rax
+	mulq	%r11
+	addq	%rax,%r15
+	adcq	$0,%rdx
+	imulq	%r12,%r9
+	addq	%r10,%r15
+	adcq	%rdx,%r9
+	movq	%r13,%r10
+	movq	%r14,%r11
+	movq	%r15,%r12
+	andq	$3,%r12
+	movq	%r15,%r13
+	andq	$-4,%r13
+	movq	%r9,%r14
+	shrdq	$2,%r9,%r15
+	shrq	$2,%r9
+	addq	%r13,%r15
+	adcq	%r14,%r9
+	addq	%r15,%r10
+	adcq	%r9,%r11
+	adcq	$0,%r12
+
+.Lopen_sse_tail_192_rounds:
+	addq	$16,%r8
+	paddd	%xmm4,%xmm0
+	pxor	%xmm0,%xmm12
+	pshufb	.Lrol16(%rip),%xmm12
+	paddd	%xmm12,%xmm8
+	pxor	%xmm8,%xmm4
+	movdqa	%xmm4,%xmm3
+	pslld	$12,%xmm3
+	psrld	$20,%xmm4
+	pxor	%xmm3,%xmm4
+	paddd	%xmm4,%xmm0
+	pxor	%xmm0,%xmm12
+	pshufb	.Lrol8(%rip),%xmm12
+	paddd	%xmm12,%xmm8
+	pxor	%xmm8,%xmm4
+	movdqa	%xmm4,%xmm3
+	pslld	$7,%xmm3
+	psrld	$25,%xmm4
+	pxor	%xmm3,%xmm4
+.byte	102,15,58,15,228,4
+.byte	102,69,15,58,15,192,8
+.byte	102,69,15,58,15,228,12
+	paddd	%xmm5,%xmm1
+	pxor	%xmm1,%xmm13
+	pshufb	.Lrol16(%rip),%xmm13
+	paddd	%xmm13,%xmm9
+	pxor	%xmm9,%xmm5
+	movdqa	%xmm5,%xmm3
+	pslld	$12,%xmm3
+	psrld	$20,%xmm5
+	pxor	%xmm3,%xmm5
+	paddd	%xmm5,%xmm1
+	pxor	%xmm1,%xmm13
+	pshufb	.Lrol8(%rip),%xmm13
+	paddd	%xmm13,%xmm9
+	pxor	%xmm9,%xmm5
+	movdqa	%xmm5,%xmm3
+	pslld	$7,%xmm3
+	psrld	$25,%xmm5
+	pxor	%xmm3,%xmm5
+.byte	102,15,58,15,237,4
+.byte	102,69,15,58,15,201,8
+.byte	102,69,15,58,15,237,12
+	paddd	%xmm6,%xmm2
+	pxor	%xmm2,%xmm14
+	pshufb	.Lrol16(%rip),%xmm14
+	paddd	%xmm14,%xmm10
+	pxor	%xmm10,%xmm6
+	movdqa	%xmm6,%xmm3
+	pslld	$12,%xmm3
+	psrld	$20,%xmm6
+	pxor	%xmm3,%xmm6
+	paddd	%xmm6,%xmm2
+	pxor	%xmm2,%xmm14
+	pshufb	.Lrol8(%rip),%xmm14
+	paddd	%xmm14,%xmm10
+	pxor	%xmm10,%xmm6
+	movdqa	%xmm6,%xmm3
+	pslld	$7,%xmm3
+	psrld	$25,%xmm6
+	pxor	%xmm3,%xmm6
+.byte	102,15,58,15,246,4
+.byte	102,69,15,58,15,210,8
+.byte	102,69,15,58,15,246,12
+	paddd	%xmm4,%xmm0
+	pxor	%xmm0,%xmm12
+	pshufb	.Lrol16(%rip),%xmm12
+	paddd	%xmm12,%xmm8
+	pxor	%xmm8,%xmm4
+	movdqa	%xmm4,%xmm3
+	pslld	$12,%xmm3
+	psrld	$20,%xmm4
+	pxor	%xmm3,%xmm4
+	paddd	%xmm4,%xmm0
+	pxor	%xmm0,%xmm12
+	pshufb	.Lrol8(%rip),%xmm12
+	paddd	%xmm12,%xmm8
+	pxor	%xmm8,%xmm4
+	movdqa	%xmm4,%xmm3
+	pslld	$7,%xmm3
+	psrld	$25,%xmm4
+	pxor	%xmm3,%xmm4
+.byte	102,15,58,15,228,12
+.byte	102,69,15,58,15,192,8
+.byte	102,69,15,58,15,228,4
+	paddd	%xmm5,%xmm1
+	pxor	%xmm1,%xmm13
+	pshufb	.Lrol16(%rip),%xmm13
+	paddd	%xmm13,%xmm9
+	pxor	%xmm9,%xmm5
+	movdqa	%xmm5,%xmm3
+	pslld	$12,%xmm3
+	psrld	$20,%xmm5
+	pxor	%xmm3,%xmm5
+	paddd	%xmm5,%xmm1
+	pxor	%xmm1,%xmm13
+	pshufb	.Lrol8(%rip),%xmm13
+	paddd	%xmm13,%xmm9
+	pxor	%xmm9,%xmm5
+	movdqa	%xmm5,%xmm3
+	pslld	$7,%xmm3
+	psrld	$25,%xmm5
+	pxor	%xmm3,%xmm5
+.byte	102,15,58,15,237,12
+.byte	102,69,15,58,15,201,8
+.byte	102,69,15,58,15,237,4
+	paddd	%xmm6,%xmm2
+	pxor	%xmm2,%xmm14
+	pshufb	.Lrol16(%rip),%xmm14
+	paddd	%xmm14,%xmm10
+	pxor	%xmm10,%xmm6
+	movdqa	%xmm6,%xmm3
+	pslld	$12,%xmm3
+	psrld	$20,%xmm6
+	pxor	%xmm3,%xmm6
+	paddd	%xmm6,%xmm2
+	pxor	%xmm2,%xmm14
+	pshufb	.Lrol8(%rip),%xmm14
+	paddd	%xmm14,%xmm10
+	pxor	%xmm10,%xmm6
+	movdqa	%xmm6,%xmm3
+	pslld	$7,%xmm3
+	psrld	$25,%xmm6
+	pxor	%xmm3,%xmm6
+.byte	102,15,58,15,246,12
+.byte	102,69,15,58,15,210,8
+.byte	102,69,15,58,15,246,4
+
+	cmpq	%rcx,%r8
+	jb	.Lopen_sse_tail_192_rounds_and_x1hash
+	cmpq	$160,%r8
+	jne	.Lopen_sse_tail_192_rounds
+	cmpq	$176,%rbx
+	jb	.Lopen_sse_tail_192_finish
+	addq	0+160(%rsi),%r10
+	adcq	8+160(%rsi),%r11
+	adcq	$1,%r12
+	movq	0+0+0(%rbp),%rax
+	movq	%rax,%r15
+	mulq	%r10
+	movq	%rax,%r13
+	movq	%rdx,%r14
+	movq	0+0+0(%rbp),%rax
+	mulq	%r11
+	imulq	%r12,%r15
+	addq	%rax,%r14
+	adcq	%rdx,%r15
+	movq	8+0+0(%rbp),%rax
+	movq	%rax,%r9
+	mulq	%r10
+	addq	%rax,%r14
+	adcq	$0,%rdx
+	movq	%rdx,%r10
+	movq	8+0+0(%rbp),%rax
+	mulq	%r11
+	addq	%rax,%r15
+	adcq	$0,%rdx
+	imulq	%r12,%r9
+	addq	%r10,%r15
+	adcq	%rdx,%r9
+	movq	%r13,%r10
+	movq	%r14,%r11
+	movq	%r15,%r12
+	andq	$3,%r12
+	movq	%r15,%r13
+	andq	$-4,%r13
+	movq	%r9,%r14
+	shrdq	$2,%r9,%r15
+	shrq	$2,%r9
+	addq	%r13,%r15
+	adcq	%r14,%r9
+	addq	%r15,%r10
+	adcq	%r9,%r11
+	adcq	$0,%r12
+
+	cmpq	$192,%rbx
+	jb	.Lopen_sse_tail_192_finish
+	addq	0+176(%rsi),%r10
+	adcq	8+176(%rsi),%r11
+	adcq	$1,%r12
+	movq	0+0+0(%rbp),%rax
+	movq	%rax,%r15
+	mulq	%r10
+	movq	%rax,%r13
+	movq	%rdx,%r14
+	movq	0+0+0(%rbp),%rax
+	mulq	%r11
+	imulq	%r12,%r15
+	addq	%rax,%r14
+	adcq	%rdx,%r15
+	movq	8+0+0(%rbp),%rax
+	movq	%rax,%r9
+	mulq	%r10
+	addq	%rax,%r14
+	adcq	$0,%rdx
+	movq	%rdx,%r10
+	movq	8+0+0(%rbp),%rax
+	mulq	%r11
+	addq	%rax,%r15
+	adcq	$0,%rdx
+	imulq	%r12,%r9
+	addq	%r10,%r15
+	adcq	%rdx,%r9
+	movq	%r13,%r10
+	movq	%r14,%r11
+	movq	%r15,%r12
+	andq	$3,%r12
+	movq	%r15,%r13
+	andq	$-4,%r13
+	movq	%r9,%r14
+	shrdq	$2,%r9,%r15
+	shrq	$2,%r9
+	addq	%r13,%r15
+	adcq	%r14,%r9
+	addq	%r15,%r10
+	adcq	%r9,%r11
+	adcq	$0,%r12
+
+.Lopen_sse_tail_192_finish:
+	paddd	.Lchacha20_consts(%rip),%xmm2
+	paddd	0+48(%rbp),%xmm6
+	paddd	0+64(%rbp),%xmm10
+	paddd	0+128(%rbp),%xmm14
+	paddd	.Lchacha20_consts(%rip),%xmm1
+	paddd	0+48(%rbp),%xmm5
+	paddd	0+64(%rbp),%xmm9
+	paddd	0+112(%rbp),%xmm13
+	paddd	.Lchacha20_consts(%rip),%xmm0
+	paddd	0+48(%rbp),%xmm4
+	paddd	0+64(%rbp),%xmm8
+	paddd	0+96(%rbp),%xmm12
+	movdqu	0 + 0(%rsi),%xmm3
+	movdqu	16 + 0(%rsi),%xmm7
+	movdqu	32 + 0(%rsi),%xmm11
+	movdqu	48 + 0(%rsi),%xmm15
+	pxor	%xmm3,%xmm2
+	pxor	%xmm7,%xmm6
+	pxor	%xmm11,%xmm10
+	pxor	%xmm14,%xmm15
+	movdqu	%xmm2,0 + 0(%rdi)
+	movdqu	%xmm6,16 + 0(%rdi)
+	movdqu	%xmm10,32 + 0(%rdi)
+	movdqu	%xmm15,48 + 0(%rdi)
+	movdqu	0 + 64(%rsi),%xmm3
+	movdqu	16 + 64(%rsi),%xmm7
+	movdqu	32 + 64(%rsi),%xmm11
+	movdqu	48 + 64(%rsi),%xmm15
+	pxor	%xmm3,%xmm1
+	pxor	%xmm7,%xmm5
+	pxor	%xmm11,%xmm9
+	pxor	%xmm13,%xmm15
+	movdqu	%xmm1,0 + 64(%rdi)
+	movdqu	%xmm5,16 + 64(%rdi)
+	movdqu	%xmm9,32 + 64(%rdi)
+	movdqu	%xmm15,48 + 64(%rdi)
+
+	subq	$128,%rbx
+	leaq	128(%rsi),%rsi
+	leaq	128(%rdi),%rdi
+	jmp	.Lopen_sse_tail_64_dec_loop
+
+.Lopen_sse_tail_256:
+	movdqa	.Lchacha20_consts(%rip),%xmm0
+	movdqa	0+48(%rbp),%xmm4
+	movdqa	0+64(%rbp),%xmm8
+	movdqa	%xmm0,%xmm1
+	movdqa	%xmm4,%xmm5
+	movdqa	%xmm8,%xmm9
+	movdqa	%xmm0,%xmm2
+	movdqa	%xmm4,%xmm6
+	movdqa	%xmm8,%xmm10
+	movdqa	%xmm0,%xmm3
+	movdqa	%xmm4,%xmm7
+	movdqa	%xmm8,%xmm11
+	movdqa	0+96(%rbp),%xmm15
+	paddd	.Lsse_inc(%rip),%xmm15
+	movdqa	%xmm15,%xmm14
+	paddd	.Lsse_inc(%rip),%xmm14
+	movdqa	%xmm14,%xmm13
+	paddd	.Lsse_inc(%rip),%xmm13
+	movdqa	%xmm13,%xmm12
+	paddd	.Lsse_inc(%rip),%xmm12
+	movdqa	%xmm12,0+96(%rbp)
+	movdqa	%xmm13,0+112(%rbp)
+	movdqa	%xmm14,0+128(%rbp)
+	movdqa	%xmm15,0+144(%rbp)
+
+	xorq	%r8,%r8
+.Lopen_sse_tail_256_rounds_and_x1hash:
+	addq	0+0(%rsi,%r8,1),%r10
+	adcq	8+0(%rsi,%r8,1),%r11
+	adcq	$1,%r12
+	movdqa	%xmm11,0+80(%rbp)
+	paddd	%xmm4,%xmm0
+	pxor	%xmm0,%xmm12
+	pshufb	.Lrol16(%rip),%xmm12
+	paddd	%xmm12,%xmm8
+	pxor	%xmm8,%xmm4
+	movdqa	%xmm4,%xmm11
+	pslld	$12,%xmm11
+	psrld	$20,%xmm4
+	pxor	%xmm11,%xmm4
+	paddd	%xmm4,%xmm0
+	pxor	%xmm0,%xmm12
+	pshufb	.Lrol8(%rip),%xmm12
+	paddd	%xmm12,%xmm8
+	pxor	%xmm8,%xmm4
+	movdqa	%xmm4,%xmm11
+	pslld	$7,%xmm11
+	psrld	$25,%xmm4
+	pxor	%xmm11,%xmm4
+.byte	102,15,58,15,228,4
+.byte	102,69,15,58,15,192,8
+.byte	102,69,15,58,15,228,12
+	paddd	%xmm5,%xmm1
+	pxor	%xmm1,%xmm13
+	pshufb	.Lrol16(%rip),%xmm13
+	paddd	%xmm13,%xmm9
+	pxor	%xmm9,%xmm5
+	movdqa	%xmm5,%xmm11
+	pslld	$12,%xmm11
+	psrld	$20,%xmm5
+	pxor	%xmm11,%xmm5
+	paddd	%xmm5,%xmm1
+	pxor	%xmm1,%xmm13
+	pshufb	.Lrol8(%rip),%xmm13
+	paddd	%xmm13,%xmm9
+	pxor	%xmm9,%xmm5
+	movdqa	%xmm5,%xmm11
+	pslld	$7,%xmm11
+	psrld	$25,%xmm5
+	pxor	%xmm11,%xmm5
+.byte	102,15,58,15,237,4
+.byte	102,69,15,58,15,201,8
+.byte	102,69,15,58,15,237,12
+	paddd	%xmm6,%xmm2
+	pxor	%xmm2,%xmm14
+	pshufb	.Lrol16(%rip),%xmm14
+	paddd	%xmm14,%xmm10
+	pxor	%xmm10,%xmm6
+	movdqa	%xmm6,%xmm11
+	pslld	$12,%xmm11
+	psrld	$20,%xmm6
+	pxor	%xmm11,%xmm6
+	paddd	%xmm6,%xmm2
+	pxor	%xmm2,%xmm14
+	pshufb	.Lrol8(%rip),%xmm14
+	paddd	%xmm14,%xmm10
+	pxor	%xmm10,%xmm6
+	movdqa	%xmm6,%xmm11
+	pslld	$7,%xmm11
+	psrld	$25,%xmm6
+	pxor	%xmm11,%xmm6
+.byte	102,15,58,15,246,4
+.byte	102,69,15,58,15,210,8
+.byte	102,69,15,58,15,246,12
+	movdqa	0+80(%rbp),%xmm11
+	movq	0+0+0(%rbp),%rax
+	movq	%rax,%r15
+	mulq	%r10
+	movq	%rax,%r13
+	movq	%rdx,%r14
+	movq	0+0+0(%rbp),%rax
+	mulq	%r11
+	imulq	%r12,%r15
+	addq	%rax,%r14
+	adcq	%rdx,%r15
+	movdqa	%xmm9,0+80(%rbp)
+	paddd	%xmm7,%xmm3
+	pxor	%xmm3,%xmm15
+	pshufb	.Lrol16(%rip),%xmm15
+	paddd	%xmm15,%xmm11
+	pxor	%xmm11,%xmm7
+	movdqa	%xmm7,%xmm9
+	pslld	$12,%xmm9
+	psrld	$20,%xmm7
+	pxor	%xmm9,%xmm7
+	paddd	%xmm7,%xmm3
+	pxor	%xmm3,%xmm15
+	pshufb	.Lrol8(%rip),%xmm15
+	paddd	%xmm15,%xmm11
+	pxor	%xmm11,%xmm7
+	movdqa	%xmm7,%xmm9
+	pslld	$7,%xmm9
+	psrld	$25,%xmm7
+	pxor	%xmm9,%xmm7
+.byte	102,15,58,15,255,4
+.byte	102,69,15,58,15,219,8
+.byte	102,69,15,58,15,255,12
+	movdqa	0+80(%rbp),%xmm9
+	movq	8+0+0(%rbp),%rax
+	movq	%rax,%r9
+	mulq	%r10
+	addq	%rax,%r14
+	adcq	$0,%rdx
+	movq	%rdx,%r10
+	movq	8+0+0(%rbp),%rax
+	mulq	%r11
+	addq	%rax,%r15
+	adcq	$0,%rdx
+	movdqa	%xmm11,0+80(%rbp)
+	paddd	%xmm4,%xmm0
+	pxor	%xmm0,%xmm12
+	pshufb	.Lrol16(%rip),%xmm12
+	paddd	%xmm12,%xmm8
+	pxor	%xmm8,%xmm4
+	movdqa	%xmm4,%xmm11
+	pslld	$12,%xmm11
+	psrld	$20,%xmm4
+	pxor	%xmm11,%xmm4
+	paddd	%xmm4,%xmm0
+	pxor	%xmm0,%xmm12
+	pshufb	.Lrol8(%rip),%xmm12
+	paddd	%xmm12,%xmm8
+	pxor	%xmm8,%xmm4
+	movdqa	%xmm4,%xmm11
+	pslld	$7,%xmm11
+	psrld	$25,%xmm4
+	pxor	%xmm11,%xmm4
+.byte	102,15,58,15,228,12
+.byte	102,69,15,58,15,192,8
+.byte	102,69,15,58,15,228,4
+	paddd	%xmm5,%xmm1
+	pxor	%xmm1,%xmm13
+	pshufb	.Lrol16(%rip),%xmm13
+	paddd	%xmm13,%xmm9
+	pxor	%xmm9,%xmm5
+	movdqa	%xmm5,%xmm11
+	pslld	$12,%xmm11
+	psrld	$20,%xmm5
+	pxor	%xmm11,%xmm5
+	paddd	%xmm5,%xmm1
+	pxor	%xmm1,%xmm13
+	pshufb	.Lrol8(%rip),%xmm13
+	paddd	%xmm13,%xmm9
+	pxor	%xmm9,%xmm5
+	movdqa	%xmm5,%xmm11
+	pslld	$7,%xmm11
+	psrld	$25,%xmm5
+	pxor	%xmm11,%xmm5
+.byte	102,15,58,15,237,12
+.byte	102,69,15,58,15,201,8
+.byte	102,69,15,58,15,237,4
+	imulq	%r12,%r9
+	addq	%r10,%r15
+	adcq	%rdx,%r9
+	paddd	%xmm6,%xmm2
+	pxor	%xmm2,%xmm14
+	pshufb	.Lrol16(%rip),%xmm14
+	paddd	%xmm14,%xmm10
+	pxor	%xmm10,%xmm6
+	movdqa	%xmm6,%xmm11
+	pslld	$12,%xmm11
+	psrld	$20,%xmm6
+	pxor	%xmm11,%xmm6
+	paddd	%xmm6,%xmm2
+	pxor	%xmm2,%xmm14
+	pshufb	.Lrol8(%rip),%xmm14
+	paddd	%xmm14,%xmm10
+	pxor	%xmm10,%xmm6
+	movdqa	%xmm6,%xmm11
+	pslld	$7,%xmm11
+	psrld	$25,%xmm6
+	pxor	%xmm11,%xmm6
+.byte	102,15,58,15,246,12
+.byte	102,69,15,58,15,210,8
+.byte	102,69,15,58,15,246,4
+	movdqa	0+80(%rbp),%xmm11
+	movq	%r13,%r10
+	movq	%r14,%r11
+	movq	%r15,%r12
+	andq	$3,%r12
+	movq	%r15,%r13
+	andq	$-4,%r13
+	movq	%r9,%r14
+	shrdq	$2,%r9,%r15
+	shrq	$2,%r9
+	addq	%r13,%r15
+	adcq	%r14,%r9
+	addq	%r15,%r10
+	adcq	%r9,%r11
+	adcq	$0,%r12
+	movdqa	%xmm9,0+80(%rbp)
+	paddd	%xmm7,%xmm3
+	pxor	%xmm3,%xmm15
+	pshufb	.Lrol16(%rip),%xmm15
+	paddd	%xmm15,%xmm11
+	pxor	%xmm11,%xmm7
+	movdqa	%xmm7,%xmm9
+	pslld	$12,%xmm9
+	psrld	$20,%xmm7
+	pxor	%xmm9,%xmm7
+	paddd	%xmm7,%xmm3
+	pxor	%xmm3,%xmm15
+	pshufb	.Lrol8(%rip),%xmm15
+	paddd	%xmm15,%xmm11
+	pxor	%xmm11,%xmm7
+	movdqa	%xmm7,%xmm9
+	pslld	$7,%xmm9
+	psrld	$25,%xmm7
+	pxor	%xmm9,%xmm7
+.byte	102,15,58,15,255,12
+.byte	102,69,15,58,15,219,8
+.byte	102,69,15,58,15,255,4
+	movdqa	0+80(%rbp),%xmm9
+
+	addq	$16,%r8
+	cmpq	$160,%r8
+	jb	.Lopen_sse_tail_256_rounds_and_x1hash
+
+	movq	%rbx,%rcx
+	andq	$-16,%rcx
+.Lopen_sse_tail_256_hash:
+	addq	0+0(%rsi,%r8,1),%r10
+	adcq	8+0(%rsi,%r8,1),%r11
+	adcq	$1,%r12
+	movq	0+0+0(%rbp),%rax
+	movq	%rax,%r15
+	mulq	%r10
+	movq	%rax,%r13
+	movq	%rdx,%r14
+	movq	0+0+0(%rbp),%rax
+	mulq	%r11
+	imulq	%r12,%r15
+	addq	%rax,%r14
+	adcq	%rdx,%r15
+	movq	8+0+0(%rbp),%rax
+	movq	%rax,%r9
+	mulq	%r10
+	addq	%rax,%r14
+	adcq	$0,%rdx
+	movq	%rdx,%r10
+	movq	8+0+0(%rbp),%rax
+	mulq	%r11
+	addq	%rax,%r15
+	adcq	$0,%rdx
+	imulq	%r12,%r9
+	addq	%r10,%r15
+	adcq	%rdx,%r9
+	movq	%r13,%r10
+	movq	%r14,%r11
+	movq	%r15,%r12
+	andq	$3,%r12
+	movq	%r15,%r13
+	andq	$-4,%r13
+	movq	%r9,%r14
+	shrdq	$2,%r9,%r15
+	shrq	$2,%r9
+	addq	%r13,%r15
+	adcq	%r14,%r9
+	addq	%r15,%r10
+	adcq	%r9,%r11
+	adcq	$0,%r12
+
+	addq	$16,%r8
+	cmpq	%rcx,%r8
+	jb	.Lopen_sse_tail_256_hash
+	paddd	.Lchacha20_consts(%rip),%xmm3
+	paddd	0+48(%rbp),%xmm7
+	paddd	0+64(%rbp),%xmm11
+	paddd	0+144(%rbp),%xmm15
+	paddd	.Lchacha20_consts(%rip),%xmm2
+	paddd	0+48(%rbp),%xmm6
+	paddd	0+64(%rbp),%xmm10
+	paddd	0+128(%rbp),%xmm14
+	paddd	.Lchacha20_consts(%rip),%xmm1
+	paddd	0+48(%rbp),%xmm5
+	paddd	0+64(%rbp),%xmm9
+	paddd	0+112(%rbp),%xmm13
+	paddd	.Lchacha20_consts(%rip),%xmm0
+	paddd	0+48(%rbp),%xmm4
+	paddd	0+64(%rbp),%xmm8
+	paddd	0+96(%rbp),%xmm12
+	movdqa	%xmm12,0+80(%rbp)
+	movdqu	0 + 0(%rsi),%xmm12
+	pxor	%xmm3,%xmm12
+	movdqu	%xmm12,0 + 0(%rdi)
+	movdqu	16 + 0(%rsi),%xmm12
+	pxor	%xmm7,%xmm12
+	movdqu	%xmm12,16 + 0(%rdi)
+	movdqu	32 + 0(%rsi),%xmm12
+	pxor	%xmm11,%xmm12
+	movdqu	%xmm12,32 + 0(%rdi)
+	movdqu	48 + 0(%rsi),%xmm12
+	pxor	%xmm15,%xmm12
+	movdqu	%xmm12,48 + 0(%rdi)
+	movdqu	0 + 64(%rsi),%xmm3
+	movdqu	16 + 64(%rsi),%xmm7
+	movdqu	32 + 64(%rsi),%xmm11
+	movdqu	48 + 64(%rsi),%xmm15
+	pxor	%xmm3,%xmm2
+	pxor	%xmm7,%xmm6
+	pxor	%xmm11,%xmm10
+	pxor	%xmm14,%xmm15
+	movdqu	%xmm2,0 + 64(%rdi)
+	movdqu	%xmm6,16 + 64(%rdi)
+	movdqu	%xmm10,32 + 64(%rdi)
+	movdqu	%xmm15,48 + 64(%rdi)
+	movdqu	0 + 128(%rsi),%xmm3
+	movdqu	16 + 128(%rsi),%xmm7
+	movdqu	32 + 128(%rsi),%xmm11
+	movdqu	48 + 128(%rsi),%xmm15
+	pxor	%xmm3,%xmm1
+	pxor	%xmm7,%xmm5
+	pxor	%xmm11,%xmm9
+	pxor	%xmm13,%xmm15
+	movdqu	%xmm1,0 + 128(%rdi)
+	movdqu	%xmm5,16 + 128(%rdi)
+	movdqu	%xmm9,32 + 128(%rdi)
+	movdqu	%xmm15,48 + 128(%rdi)
+
+	movdqa	0+80(%rbp),%xmm12
+	subq	$192,%rbx
+	leaq	192(%rsi),%rsi
+	leaq	192(%rdi),%rdi
+
+
+.Lopen_sse_tail_64_dec_loop:
+	cmpq	$16,%rbx
+	jb	.Lopen_sse_tail_16_init
+	subq	$16,%rbx
+	movdqu	(%rsi),%xmm3
+	pxor	%xmm3,%xmm0
+	movdqu	%xmm0,(%rdi)
+	leaq	16(%rsi),%rsi
+	leaq	16(%rdi),%rdi
+	movdqa	%xmm4,%xmm0
+	movdqa	%xmm8,%xmm4
+	movdqa	%xmm12,%xmm8
+	jmp	.Lopen_sse_tail_64_dec_loop
+.Lopen_sse_tail_16_init:
+	movdqa	%xmm0,%xmm1
+
+
+.Lopen_sse_tail_16:
+	testq	%rbx,%rbx
+	jz	.Lopen_sse_finalize
+
+
+
+	pxor	%xmm3,%xmm3
+	leaq	-1(%rsi,%rbx,1),%rsi
+	movq	%rbx,%r8
+.Lopen_sse_tail_16_compose:
+	pslldq	$1,%xmm3
+	pinsrb	$0,(%rsi),%xmm3
+	subq	$1,%rsi
+	subq	$1,%r8
+	jnz	.Lopen_sse_tail_16_compose
+
+.byte	102,73,15,126,221
+	pextrq	$1,%xmm3,%r14
+
+	pxor	%xmm1,%xmm3
+
+
+.Lopen_sse_tail_16_extract:
+	pextrb	$0,%xmm3,(%rdi)
+	psrldq	$1,%xmm3
+	addq	$1,%rdi
+	subq	$1,%rbx
+	jne	.Lopen_sse_tail_16_extract
+
+	addq	%r13,%r10
+	adcq	%r14,%r11
+	adcq	$1,%r12
+	movq	0+0+0(%rbp),%rax
+	movq	%rax,%r15
+	mulq	%r10
+	movq	%rax,%r13
+	movq	%rdx,%r14
+	movq	0+0+0(%rbp),%rax
+	mulq	%r11
+	imulq	%r12,%r15
+	addq	%rax,%r14
+	adcq	%rdx,%r15
+	movq	8+0+0(%rbp),%rax
+	movq	%rax,%r9
+	mulq	%r10
+	addq	%rax,%r14
+	adcq	$0,%rdx
+	movq	%rdx,%r10
+	movq	8+0+0(%rbp),%rax
+	mulq	%r11
+	addq	%rax,%r15
+	adcq	$0,%rdx
+	imulq	%r12,%r9
+	addq	%r10,%r15
+	adcq	%rdx,%r9
+	movq	%r13,%r10
+	movq	%r14,%r11
+	movq	%r15,%r12
+	andq	$3,%r12
+	movq	%r15,%r13
+	andq	$-4,%r13
+	movq	%r9,%r14
+	shrdq	$2,%r9,%r15
+	shrq	$2,%r9
+	addq	%r13,%r15
+	adcq	%r14,%r9
+	addq	%r15,%r10
+	adcq	%r9,%r11
+	adcq	$0,%r12
+
+
+.Lopen_sse_finalize:
+	addq	0+0+32(%rbp),%r10
+	adcq	8+0+32(%rbp),%r11
+	adcq	$1,%r12
+	movq	0+0+0(%rbp),%rax
+	movq	%rax,%r15
+	mulq	%r10
+	movq	%rax,%r13
+	movq	%rdx,%r14
+	movq	0+0+0(%rbp),%rax
+	mulq	%r11
+	imulq	%r12,%r15
+	addq	%rax,%r14
+	adcq	%rdx,%r15
+	movq	8+0+0(%rbp),%rax
+	movq	%rax,%r9
+	mulq	%r10
+	addq	%rax,%r14
+	adcq	$0,%rdx
+	movq	%rdx,%r10
+	movq	8+0+0(%rbp),%rax
+	mulq	%r11
+	addq	%rax,%r15
+	adcq	$0,%rdx
+	imulq	%r12,%r9
+	addq	%r10,%r15
+	adcq	%rdx,%r9
+	movq	%r13,%r10
+	movq	%r14,%r11
+	movq	%r15,%r12
+	andq	$3,%r12
+	movq	%r15,%r13
+	andq	$-4,%r13
+	movq	%r9,%r14
+	shrdq	$2,%r9,%r15
+	shrq	$2,%r9
+	addq	%r13,%r15
+	adcq	%r14,%r9
+	addq	%r15,%r10
+	adcq	%r9,%r11
+	adcq	$0,%r12
+
+
+	movq	%r10,%r13
+	movq	%r11,%r14
+	movq	%r12,%r15
+	subq	$-5,%r10
+	sbbq	$-1,%r11
+	sbbq	$3,%r12
+	cmovcq	%r13,%r10
+	cmovcq	%r14,%r11
+	cmovcq	%r15,%r12
+
+	addq	0+0+16(%rbp),%r10
+	adcq	8+0+16(%rbp),%r11
+
+.cfi_remember_state	
+	addq	$288 + 0 + 32,%rsp
+.cfi_adjust_cfa_offset	-(288 + 32)
+
+	popq	%r9
+.cfi_adjust_cfa_offset	-8
+.cfi_restore	%r9
+	movq	%r10,(%r9)
+	movq	%r11,8(%r9)
+	popq	%r15
+.cfi_adjust_cfa_offset	-8
+.cfi_restore	%r15
+	popq	%r14
+.cfi_adjust_cfa_offset	-8
+.cfi_restore	%r14
+	popq	%r13
+.cfi_adjust_cfa_offset	-8
+.cfi_restore	%r13
+	popq	%r12
+.cfi_adjust_cfa_offset	-8
+.cfi_restore	%r12
+	popq	%rbx
+.cfi_adjust_cfa_offset	-8
+.cfi_restore	%rbx
+	popq	%rbp
+.cfi_adjust_cfa_offset	-8
+.cfi_restore	%rbp
+	.byte	0xf3,0xc3
+
+.Lopen_sse_128:
+.cfi_restore_state	
+	movdqu	.Lchacha20_consts(%rip),%xmm0
+	movdqa	%xmm0,%xmm1
+	movdqa	%xmm0,%xmm2
+	movdqu	0(%r9),%xmm4
+	movdqa	%xmm4,%xmm5
+	movdqa	%xmm4,%xmm6
+	movdqu	16(%r9),%xmm8
+	movdqa	%xmm8,%xmm9
+	movdqa	%xmm8,%xmm10
+	movdqu	32(%r9),%xmm12
+	movdqa	%xmm12,%xmm13
+	paddd	.Lsse_inc(%rip),%xmm13
+	movdqa	%xmm13,%xmm14
+	paddd	.Lsse_inc(%rip),%xmm14
+	movdqa	%xmm4,%xmm7
+	movdqa	%xmm8,%xmm11
+	movdqa	%xmm13,%xmm15
+	movq	$10,%r10
+
+.Lopen_sse_128_rounds:
+	paddd	%xmm4,%xmm0
+	pxor	%xmm0,%xmm12
+	pshufb	.Lrol16(%rip),%xmm12
+	paddd	%xmm12,%xmm8
+	pxor	%xmm8,%xmm4
+	movdqa	%xmm4,%xmm3
+	pslld	$12,%xmm3
+	psrld	$20,%xmm4
+	pxor	%xmm3,%xmm4
+	paddd	%xmm4,%xmm0
+	pxor	%xmm0,%xmm12
+	pshufb	.Lrol8(%rip),%xmm12
+	paddd	%xmm12,%xmm8
+	pxor	%xmm8,%xmm4
+	movdqa	%xmm4,%xmm3
+	pslld	$7,%xmm3
+	psrld	$25,%xmm4
+	pxor	%xmm3,%xmm4
+.byte	102,15,58,15,228,4
+.byte	102,69,15,58,15,192,8
+.byte	102,69,15,58,15,228,12
+	paddd	%xmm5,%xmm1
+	pxor	%xmm1,%xmm13
+	pshufb	.Lrol16(%rip),%xmm13
+	paddd	%xmm13,%xmm9
+	pxor	%xmm9,%xmm5
+	movdqa	%xmm5,%xmm3
+	pslld	$12,%xmm3
+	psrld	$20,%xmm5
+	pxor	%xmm3,%xmm5
+	paddd	%xmm5,%xmm1
+	pxor	%xmm1,%xmm13
+	pshufb	.Lrol8(%rip),%xmm13
+	paddd	%xmm13,%xmm9
+	pxor	%xmm9,%xmm5
+	movdqa	%xmm5,%xmm3
+	pslld	$7,%xmm3
+	psrld	$25,%xmm5
+	pxor	%xmm3,%xmm5
+.byte	102,15,58,15,237,4
+.byte	102,69,15,58,15,201,8
+.byte	102,69,15,58,15,237,12
+	paddd	%xmm6,%xmm2
+	pxor	%xmm2,%xmm14
+	pshufb	.Lrol16(%rip),%xmm14
+	paddd	%xmm14,%xmm10
+	pxor	%xmm10,%xmm6
+	movdqa	%xmm6,%xmm3
+	pslld	$12,%xmm3
+	psrld	$20,%xmm6
+	pxor	%xmm3,%xmm6
+	paddd	%xmm6,%xmm2
+	pxor	%xmm2,%xmm14
+	pshufb	.Lrol8(%rip),%xmm14
+	paddd	%xmm14,%xmm10
+	pxor	%xmm10,%xmm6
+	movdqa	%xmm6,%xmm3
+	pslld	$7,%xmm3
+	psrld	$25,%xmm6
+	pxor	%xmm3,%xmm6
+.byte	102,15,58,15,246,4
+.byte	102,69,15,58,15,210,8
+.byte	102,69,15,58,15,246,12
+	paddd	%xmm4,%xmm0
+	pxor	%xmm0,%xmm12
+	pshufb	.Lrol16(%rip),%xmm12
+	paddd	%xmm12,%xmm8
+	pxor	%xmm8,%xmm4
+	movdqa	%xmm4,%xmm3
+	pslld	$12,%xmm3
+	psrld	$20,%xmm4
+	pxor	%xmm3,%xmm4
+	paddd	%xmm4,%xmm0
+	pxor	%xmm0,%xmm12
+	pshufb	.Lrol8(%rip),%xmm12
+	paddd	%xmm12,%xmm8
+	pxor	%xmm8,%xmm4
+	movdqa	%xmm4,%xmm3
+	pslld	$7,%xmm3
+	psrld	$25,%xmm4
+	pxor	%xmm3,%xmm4
+.byte	102,15,58,15,228,12
+.byte	102,69,15,58,15,192,8
+.byte	102,69,15,58,15,228,4
+	paddd	%xmm5,%xmm1
+	pxor	%xmm1,%xmm13
+	pshufb	.Lrol16(%rip),%xmm13
+	paddd	%xmm13,%xmm9
+	pxor	%xmm9,%xmm5
+	movdqa	%xmm5,%xmm3
+	pslld	$12,%xmm3
+	psrld	$20,%xmm5
+	pxor	%xmm3,%xmm5
+	paddd	%xmm5,%xmm1
+	pxor	%xmm1,%xmm13
+	pshufb	.Lrol8(%rip),%xmm13
+	paddd	%xmm13,%xmm9
+	pxor	%xmm9,%xmm5
+	movdqa	%xmm5,%xmm3
+	pslld	$7,%xmm3
+	psrld	$25,%xmm5
+	pxor	%xmm3,%xmm5
+.byte	102,15,58,15,237,12
+.byte	102,69,15,58,15,201,8
+.byte	102,69,15,58,15,237,4
+	paddd	%xmm6,%xmm2
+	pxor	%xmm2,%xmm14
+	pshufb	.Lrol16(%rip),%xmm14
+	paddd	%xmm14,%xmm10
+	pxor	%xmm10,%xmm6
+	movdqa	%xmm6,%xmm3
+	pslld	$12,%xmm3
+	psrld	$20,%xmm6
+	pxor	%xmm3,%xmm6
+	paddd	%xmm6,%xmm2
+	pxor	%xmm2,%xmm14
+	pshufb	.Lrol8(%rip),%xmm14
+	paddd	%xmm14,%xmm10
+	pxor	%xmm10,%xmm6
+	movdqa	%xmm6,%xmm3
+	pslld	$7,%xmm3
+	psrld	$25,%xmm6
+	pxor	%xmm3,%xmm6
+.byte	102,15,58,15,246,12
+.byte	102,69,15,58,15,210,8
+.byte	102,69,15,58,15,246,4
+
+	decq	%r10
+	jnz	.Lopen_sse_128_rounds
+	paddd	.Lchacha20_consts(%rip),%xmm0
+	paddd	.Lchacha20_consts(%rip),%xmm1
+	paddd	.Lchacha20_consts(%rip),%xmm2
+	paddd	%xmm7,%xmm4
+	paddd	%xmm7,%xmm5
+	paddd	%xmm7,%xmm6
+	paddd	%xmm11,%xmm9
+	paddd	%xmm11,%xmm10
+	paddd	%xmm15,%xmm13
+	paddd	.Lsse_inc(%rip),%xmm15
+	paddd	%xmm15,%xmm14
+
+	pand	.Lclamp(%rip),%xmm0
+	movdqa	%xmm0,0+0(%rbp)
+	movdqa	%xmm4,0+16(%rbp)
+
+	movq	%r8,%r8
+	call	poly_hash_ad_internal
+.Lopen_sse_128_xor_hash:
+	cmpq	$16,%rbx
+	jb	.Lopen_sse_tail_16
+	subq	$16,%rbx
+	addq	0+0(%rsi),%r10
+	adcq	8+0(%rsi),%r11
+	adcq	$1,%r12
+
+
+	movdqu	0(%rsi),%xmm3
+	pxor	%xmm3,%xmm1
+	movdqu	%xmm1,0(%rdi)
+	leaq	16(%rsi),%rsi
+	leaq	16(%rdi),%rdi
+	movq	0+0+0(%rbp),%rax
+	movq	%rax,%r15
+	mulq	%r10
+	movq	%rax,%r13
+	movq	%rdx,%r14
+	movq	0+0+0(%rbp),%rax
+	mulq	%r11
+	imulq	%r12,%r15
+	addq	%rax,%r14
+	adcq	%rdx,%r15
+	movq	8+0+0(%rbp),%rax
+	movq	%rax,%r9
+	mulq	%r10
+	addq	%rax,%r14
+	adcq	$0,%rdx
+	movq	%rdx,%r10
+	movq	8+0+0(%rbp),%rax
+	mulq	%r11
+	addq	%rax,%r15
+	adcq	$0,%rdx
+	imulq	%r12,%r9
+	addq	%r10,%r15
+	adcq	%rdx,%r9
+	movq	%r13,%r10
+	movq	%r14,%r11
+	movq	%r15,%r12
+	andq	$3,%r12
+	movq	%r15,%r13
+	andq	$-4,%r13
+	movq	%r9,%r14
+	shrdq	$2,%r9,%r15
+	shrq	$2,%r9
+	addq	%r13,%r15
+	adcq	%r14,%r9
+	addq	%r15,%r10
+	adcq	%r9,%r11
+	adcq	$0,%r12
+
+
+	movdqa	%xmm5,%xmm1
+	movdqa	%xmm9,%xmm5
+	movdqa	%xmm13,%xmm9
+	movdqa	%xmm2,%xmm13
+	movdqa	%xmm6,%xmm2
+	movdqa	%xmm10,%xmm6
+	movdqa	%xmm14,%xmm10
+	jmp	.Lopen_sse_128_xor_hash
+.size	chacha20_poly1305_open, .-chacha20_poly1305_open
+.cfi_endproc	
+
+
+
+
+
+
+
+.globl	chacha20_poly1305_seal
+.hidden chacha20_poly1305_seal
+.type	chacha20_poly1305_seal,@function
+.align	64
+chacha20_poly1305_seal:
+.cfi_startproc	
+	pushq	%rbp
+.cfi_adjust_cfa_offset	8
+.cfi_offset	%rbp,-16
+	pushq	%rbx
+.cfi_adjust_cfa_offset	8
+.cfi_offset	%rbx,-24
+	pushq	%r12
+.cfi_adjust_cfa_offset	8
+.cfi_offset	%r12,-32
+	pushq	%r13
+.cfi_adjust_cfa_offset	8
+.cfi_offset	%r13,-40
+	pushq	%r14
+.cfi_adjust_cfa_offset	8
+.cfi_offset	%r14,-48
+	pushq	%r15
+.cfi_adjust_cfa_offset	8
+.cfi_offset	%r15,-56
+
+
+	pushq	%r9
+.cfi_adjust_cfa_offset	8
+.cfi_offset	%r9,-64
+	subq	$288 + 0 + 32,%rsp
+.cfi_adjust_cfa_offset	288 + 32
+	leaq	32(%rsp),%rbp
+	andq	$-32,%rbp
+
+	movq	56(%r9),%rbx
+	addq	%rdx,%rbx
+	movq	%r8,0+0+32(%rbp)
+	movq	%rbx,8+0+32(%rbp)
+	movq	%rdx,%rbx
+
+	movl	OPENSSL_ia32cap_P+8(%rip),%eax
+	andl	$288,%eax
+	xorl	$288,%eax
+	jz	chacha20_poly1305_seal_avx2
+
+	cmpq	$128,%rbx
+	jbe	.Lseal_sse_128
+
+	movdqa	.Lchacha20_consts(%rip),%xmm0
+	movdqu	0(%r9),%xmm4
+	movdqu	16(%r9),%xmm8
+	movdqu	32(%r9),%xmm12
+
+	movdqa	%xmm0,%xmm1
+	movdqa	%xmm0,%xmm2
+	movdqa	%xmm0,%xmm3
+	movdqa	%xmm4,%xmm5
+	movdqa	%xmm4,%xmm6
+	movdqa	%xmm4,%xmm7
+	movdqa	%xmm8,%xmm9
+	movdqa	%xmm8,%xmm10
+	movdqa	%xmm8,%xmm11
+	movdqa	%xmm12,%xmm15
+	paddd	.Lsse_inc(%rip),%xmm12
+	movdqa	%xmm12,%xmm14
+	paddd	.Lsse_inc(%rip),%xmm12
+	movdqa	%xmm12,%xmm13
+	paddd	.Lsse_inc(%rip),%xmm12
+
+	movdqa	%xmm4,0+48(%rbp)
+	movdqa	%xmm8,0+64(%rbp)
+	movdqa	%xmm12,0+96(%rbp)
+	movdqa	%xmm13,0+112(%rbp)
+	movdqa	%xmm14,0+128(%rbp)
+	movdqa	%xmm15,0+144(%rbp)
+	movq	$10,%r10
+.Lseal_sse_init_rounds:
+	movdqa	%xmm8,0+80(%rbp)
+	movdqa	.Lrol16(%rip),%xmm8
+	paddd	%xmm7,%xmm3
+	paddd	%xmm6,%xmm2
+	paddd	%xmm5,%xmm1
+	paddd	%xmm4,%xmm0
+	pxor	%xmm3,%xmm15
+	pxor	%xmm2,%xmm14
+	pxor	%xmm1,%xmm13
+	pxor	%xmm0,%xmm12
+.byte	102,69,15,56,0,248
+.byte	102,69,15,56,0,240
+.byte	102,69,15,56,0,232
+.byte	102,69,15,56,0,224
+	movdqa	0+80(%rbp),%xmm8
+	paddd	%xmm15,%xmm11
+	paddd	%xmm14,%xmm10
+	paddd	%xmm13,%xmm9
+	paddd	%xmm12,%xmm8
+	pxor	%xmm11,%xmm7
+	pxor	%xmm10,%xmm6
+	pxor	%xmm9,%xmm5
+	pxor	%xmm8,%xmm4
+	movdqa	%xmm8,0+80(%rbp)
+	movdqa	%xmm7,%xmm8
+	psrld	$20,%xmm8
+	pslld	$32-20,%xmm7
+	pxor	%xmm8,%xmm7
+	movdqa	%xmm6,%xmm8
+	psrld	$20,%xmm8
+	pslld	$32-20,%xmm6
+	pxor	%xmm8,%xmm6
+	movdqa	%xmm5,%xmm8
+	psrld	$20,%xmm8
+	pslld	$32-20,%xmm5
+	pxor	%xmm8,%xmm5
+	movdqa	%xmm4,%xmm8
+	psrld	$20,%xmm8
+	pslld	$32-20,%xmm4
+	pxor	%xmm8,%xmm4
+	movdqa	.Lrol8(%rip),%xmm8
+	paddd	%xmm7,%xmm3
+	paddd	%xmm6,%xmm2
+	paddd	%xmm5,%xmm1
+	paddd	%xmm4,%xmm0
+	pxor	%xmm3,%xmm15
+	pxor	%xmm2,%xmm14
+	pxor	%xmm1,%xmm13
+	pxor	%xmm0,%xmm12
+.byte	102,69,15,56,0,248
+.byte	102,69,15,56,0,240
+.byte	102,69,15,56,0,232
+.byte	102,69,15,56,0,224
+	movdqa	0+80(%rbp),%xmm8
+	paddd	%xmm15,%xmm11
+	paddd	%xmm14,%xmm10
+	paddd	%xmm13,%xmm9
+	paddd	%xmm12,%xmm8
+	pxor	%xmm11,%xmm7
+	pxor	%xmm10,%xmm6
+	pxor	%xmm9,%xmm5
+	pxor	%xmm8,%xmm4
+	movdqa	%xmm8,0+80(%rbp)
+	movdqa	%xmm7,%xmm8
+	psrld	$25,%xmm8
+	pslld	$32-25,%xmm7
+	pxor	%xmm8,%xmm7
+	movdqa	%xmm6,%xmm8
+	psrld	$25,%xmm8
+	pslld	$32-25,%xmm6
+	pxor	%xmm8,%xmm6
+	movdqa	%xmm5,%xmm8
+	psrld	$25,%xmm8
+	pslld	$32-25,%xmm5
+	pxor	%xmm8,%xmm5
+	movdqa	%xmm4,%xmm8
+	psrld	$25,%xmm8
+	pslld	$32-25,%xmm4
+	pxor	%xmm8,%xmm4
+	movdqa	0+80(%rbp),%xmm8
+.byte	102,15,58,15,255,4
+.byte	102,69,15,58,15,219,8
+.byte	102,69,15,58,15,255,12
+.byte	102,15,58,15,246,4
+.byte	102,69,15,58,15,210,8
+.byte	102,69,15,58,15,246,12
+.byte	102,15,58,15,237,4
+.byte	102,69,15,58,15,201,8
+.byte	102,69,15,58,15,237,12
+.byte	102,15,58,15,228,4
+.byte	102,69,15,58,15,192,8
+.byte	102,69,15,58,15,228,12
+	movdqa	%xmm8,0+80(%rbp)
+	movdqa	.Lrol16(%rip),%xmm8
+	paddd	%xmm7,%xmm3
+	paddd	%xmm6,%xmm2
+	paddd	%xmm5,%xmm1
+	paddd	%xmm4,%xmm0
+	pxor	%xmm3,%xmm15
+	pxor	%xmm2,%xmm14
+	pxor	%xmm1,%xmm13
+	pxor	%xmm0,%xmm12
+.byte	102,69,15,56,0,248
+.byte	102,69,15,56,0,240
+.byte	102,69,15,56,0,232
+.byte	102,69,15,56,0,224
+	movdqa	0+80(%rbp),%xmm8
+	paddd	%xmm15,%xmm11
+	paddd	%xmm14,%xmm10
+	paddd	%xmm13,%xmm9
+	paddd	%xmm12,%xmm8
+	pxor	%xmm11,%xmm7
+	pxor	%xmm10,%xmm6
+	pxor	%xmm9,%xmm5
+	pxor	%xmm8,%xmm4
+	movdqa	%xmm8,0+80(%rbp)
+	movdqa	%xmm7,%xmm8
+	psrld	$20,%xmm8
+	pslld	$32-20,%xmm7
+	pxor	%xmm8,%xmm7
+	movdqa	%xmm6,%xmm8
+	psrld	$20,%xmm8
+	pslld	$32-20,%xmm6
+	pxor	%xmm8,%xmm6
+	movdqa	%xmm5,%xmm8
+	psrld	$20,%xmm8
+	pslld	$32-20,%xmm5
+	pxor	%xmm8,%xmm5
+	movdqa	%xmm4,%xmm8
+	psrld	$20,%xmm8
+	pslld	$32-20,%xmm4
+	pxor	%xmm8,%xmm4
+	movdqa	.Lrol8(%rip),%xmm8
+	paddd	%xmm7,%xmm3
+	paddd	%xmm6,%xmm2
+	paddd	%xmm5,%xmm1
+	paddd	%xmm4,%xmm0
+	pxor	%xmm3,%xmm15
+	pxor	%xmm2,%xmm14
+	pxor	%xmm1,%xmm13
+	pxor	%xmm0,%xmm12
+.byte	102,69,15,56,0,248
+.byte	102,69,15,56,0,240
+.byte	102,69,15,56,0,232
+.byte	102,69,15,56,0,224
+	movdqa	0+80(%rbp),%xmm8
+	paddd	%xmm15,%xmm11
+	paddd	%xmm14,%xmm10
+	paddd	%xmm13,%xmm9
+	paddd	%xmm12,%xmm8
+	pxor	%xmm11,%xmm7
+	pxor	%xmm10,%xmm6
+	pxor	%xmm9,%xmm5
+	pxor	%xmm8,%xmm4
+	movdqa	%xmm8,0+80(%rbp)
+	movdqa	%xmm7,%xmm8
+	psrld	$25,%xmm8
+	pslld	$32-25,%xmm7
+	pxor	%xmm8,%xmm7
+	movdqa	%xmm6,%xmm8
+	psrld	$25,%xmm8
+	pslld	$32-25,%xmm6
+	pxor	%xmm8,%xmm6
+	movdqa	%xmm5,%xmm8
+	psrld	$25,%xmm8
+	pslld	$32-25,%xmm5
+	pxor	%xmm8,%xmm5
+	movdqa	%xmm4,%xmm8
+	psrld	$25,%xmm8
+	pslld	$32-25,%xmm4
+	pxor	%xmm8,%xmm4
+	movdqa	0+80(%rbp),%xmm8
+.byte	102,15,58,15,255,12
+.byte	102,69,15,58,15,219,8
+.byte	102,69,15,58,15,255,4
+.byte	102,15,58,15,246,12
+.byte	102,69,15,58,15,210,8
+.byte	102,69,15,58,15,246,4
+.byte	102,15,58,15,237,12
+.byte	102,69,15,58,15,201,8
+.byte	102,69,15,58,15,237,4
+.byte	102,15,58,15,228,12
+.byte	102,69,15,58,15,192,8
+.byte	102,69,15,58,15,228,4
+
+	decq	%r10
+	jnz	.Lseal_sse_init_rounds
+	paddd	.Lchacha20_consts(%rip),%xmm3
+	paddd	0+48(%rbp),%xmm7
+	paddd	0+64(%rbp),%xmm11
+	paddd	0+144(%rbp),%xmm15
+	paddd	.Lchacha20_consts(%rip),%xmm2
+	paddd	0+48(%rbp),%xmm6
+	paddd	0+64(%rbp),%xmm10
+	paddd	0+128(%rbp),%xmm14
+	paddd	.Lchacha20_consts(%rip),%xmm1
+	paddd	0+48(%rbp),%xmm5
+	paddd	0+64(%rbp),%xmm9
+	paddd	0+112(%rbp),%xmm13
+	paddd	.Lchacha20_consts(%rip),%xmm0
+	paddd	0+48(%rbp),%xmm4
+	paddd	0+64(%rbp),%xmm8
+	paddd	0+96(%rbp),%xmm12
+
+
+	pand	.Lclamp(%rip),%xmm3
+	movdqa	%xmm3,0+0(%rbp)
+	movdqa	%xmm7,0+16(%rbp)
+
+	movq	%r8,%r8
+	call	poly_hash_ad_internal
+	movdqu	0 + 0(%rsi),%xmm3
+	movdqu	16 + 0(%rsi),%xmm7
+	movdqu	32 + 0(%rsi),%xmm11
+	movdqu	48 + 0(%rsi),%xmm15
+	pxor	%xmm3,%xmm2
+	pxor	%xmm7,%xmm6
+	pxor	%xmm11,%xmm10
+	pxor	%xmm14,%xmm15
+	movdqu	%xmm2,0 + 0(%rdi)
+	movdqu	%xmm6,16 + 0(%rdi)
+	movdqu	%xmm10,32 + 0(%rdi)
+	movdqu	%xmm15,48 + 0(%rdi)
+	movdqu	0 + 64(%rsi),%xmm3
+	movdqu	16 + 64(%rsi),%xmm7
+	movdqu	32 + 64(%rsi),%xmm11
+	movdqu	48 + 64(%rsi),%xmm15
+	pxor	%xmm3,%xmm1
+	pxor	%xmm7,%xmm5
+	pxor	%xmm11,%xmm9
+	pxor	%xmm13,%xmm15
+	movdqu	%xmm1,0 + 64(%rdi)
+	movdqu	%xmm5,16 + 64(%rdi)
+	movdqu	%xmm9,32 + 64(%rdi)
+	movdqu	%xmm15,48 + 64(%rdi)
+
+	cmpq	$192,%rbx
+	ja	.Lseal_sse_main_init
+	movq	$128,%rcx
+	subq	$128,%rbx
+	leaq	128(%rsi),%rsi
+	jmp	.Lseal_sse_128_tail_hash
+.Lseal_sse_main_init:
+	movdqu	0 + 128(%rsi),%xmm3
+	movdqu	16 + 128(%rsi),%xmm7
+	movdqu	32 + 128(%rsi),%xmm11
+	movdqu	48 + 128(%rsi),%xmm15
+	pxor	%xmm3,%xmm0
+	pxor	%xmm7,%xmm4
+	pxor	%xmm11,%xmm8
+	pxor	%xmm12,%xmm15
+	movdqu	%xmm0,0 + 128(%rdi)
+	movdqu	%xmm4,16 + 128(%rdi)
+	movdqu	%xmm8,32 + 128(%rdi)
+	movdqu	%xmm15,48 + 128(%rdi)
+
+	movq	$192,%rcx
+	subq	$192,%rbx
+	leaq	192(%rsi),%rsi
+	movq	$2,%rcx
+	movq	$8,%r8
+	cmpq	$64,%rbx
+	jbe	.Lseal_sse_tail_64
+	cmpq	$128,%rbx
+	jbe	.Lseal_sse_tail_128
+	cmpq	$192,%rbx
+	jbe	.Lseal_sse_tail_192
+
+.Lseal_sse_main_loop:
+	movdqa	.Lchacha20_consts(%rip),%xmm0
+	movdqa	0+48(%rbp),%xmm4
+	movdqa	0+64(%rbp),%xmm8
+	movdqa	%xmm0,%xmm1
+	movdqa	%xmm4,%xmm5
+	movdqa	%xmm8,%xmm9
+	movdqa	%xmm0,%xmm2
+	movdqa	%xmm4,%xmm6
+	movdqa	%xmm8,%xmm10
+	movdqa	%xmm0,%xmm3
+	movdqa	%xmm4,%xmm7
+	movdqa	%xmm8,%xmm11
+	movdqa	0+96(%rbp),%xmm15
+	paddd	.Lsse_inc(%rip),%xmm15
+	movdqa	%xmm15,%xmm14
+	paddd	.Lsse_inc(%rip),%xmm14
+	movdqa	%xmm14,%xmm13
+	paddd	.Lsse_inc(%rip),%xmm13
+	movdqa	%xmm13,%xmm12
+	paddd	.Lsse_inc(%rip),%xmm12
+	movdqa	%xmm12,0+96(%rbp)
+	movdqa	%xmm13,0+112(%rbp)
+	movdqa	%xmm14,0+128(%rbp)
+	movdqa	%xmm15,0+144(%rbp)
+
+.align	32
+.Lseal_sse_main_rounds:
+	movdqa	%xmm8,0+80(%rbp)
+	movdqa	.Lrol16(%rip),%xmm8
+	paddd	%xmm7,%xmm3
+	paddd	%xmm6,%xmm2
+	paddd	%xmm5,%xmm1
+	paddd	%xmm4,%xmm0
+	pxor	%xmm3,%xmm15
+	pxor	%xmm2,%xmm14
+	pxor	%xmm1,%xmm13
+	pxor	%xmm0,%xmm12
+.byte	102,69,15,56,0,248
+.byte	102,69,15,56,0,240
+.byte	102,69,15,56,0,232
+.byte	102,69,15,56,0,224
+	movdqa	0+80(%rbp),%xmm8
+	paddd	%xmm15,%xmm11
+	paddd	%xmm14,%xmm10
+	paddd	%xmm13,%xmm9
+	paddd	%xmm12,%xmm8
+	pxor	%xmm11,%xmm7
+	addq	0+0(%rdi),%r10
+	adcq	8+0(%rdi),%r11
+	adcq	$1,%r12
+	pxor	%xmm10,%xmm6
+	pxor	%xmm9,%xmm5
+	pxor	%xmm8,%xmm4
+	movdqa	%xmm8,0+80(%rbp)
+	movdqa	%xmm7,%xmm8
+	psrld	$20,%xmm8
+	pslld	$32-20,%xmm7
+	pxor	%xmm8,%xmm7
+	movdqa	%xmm6,%xmm8
+	psrld	$20,%xmm8
+	pslld	$32-20,%xmm6
+	pxor	%xmm8,%xmm6
+	movdqa	%xmm5,%xmm8
+	psrld	$20,%xmm8
+	pslld	$32-20,%xmm5
+	pxor	%xmm8,%xmm5
+	movdqa	%xmm4,%xmm8
+	psrld	$20,%xmm8
+	pslld	$32-20,%xmm4
+	pxor	%xmm8,%xmm4
+	movq	0+0+0(%rbp),%rax
+	movq	%rax,%r15
+	mulq	%r10
+	movq	%rax,%r13
+	movq	%rdx,%r14
+	movq	0+0+0(%rbp),%rax
+	mulq	%r11
+	imulq	%r12,%r15
+	addq	%rax,%r14
+	adcq	%rdx,%r15
+	movdqa	.Lrol8(%rip),%xmm8
+	paddd	%xmm7,%xmm3
+	paddd	%xmm6,%xmm2
+	paddd	%xmm5,%xmm1
+	paddd	%xmm4,%xmm0
+	pxor	%xmm3,%xmm15
+	pxor	%xmm2,%xmm14
+	pxor	%xmm1,%xmm13
+	pxor	%xmm0,%xmm12
+.byte	102,69,15,56,0,248
+.byte	102,69,15,56,0,240
+.byte	102,69,15,56,0,232
+.byte	102,69,15,56,0,224
+	movdqa	0+80(%rbp),%xmm8
+	paddd	%xmm15,%xmm11
+	paddd	%xmm14,%xmm10
+	paddd	%xmm13,%xmm9
+	paddd	%xmm12,%xmm8
+	pxor	%xmm11,%xmm7
+	pxor	%xmm10,%xmm6
+	movq	8+0+0(%rbp),%rax
+	movq	%rax,%r9
+	mulq	%r10
+	addq	%rax,%r14
+	adcq	$0,%rdx
+	movq	%rdx,%r10
+	movq	8+0+0(%rbp),%rax
+	mulq	%r11
+	addq	%rax,%r15
+	adcq	$0,%rdx
+	pxor	%xmm9,%xmm5
+	pxor	%xmm8,%xmm4
+	movdqa	%xmm8,0+80(%rbp)
+	movdqa	%xmm7,%xmm8
+	psrld	$25,%xmm8
+	pslld	$32-25,%xmm7
+	pxor	%xmm8,%xmm7
+	movdqa	%xmm6,%xmm8
+	psrld	$25,%xmm8
+	pslld	$32-25,%xmm6
+	pxor	%xmm8,%xmm6
+	movdqa	%xmm5,%xmm8
+	psrld	$25,%xmm8
+	pslld	$32-25,%xmm5
+	pxor	%xmm8,%xmm5
+	movdqa	%xmm4,%xmm8
+	psrld	$25,%xmm8
+	pslld	$32-25,%xmm4
+	pxor	%xmm8,%xmm4
+	movdqa	0+80(%rbp),%xmm8
+	imulq	%r12,%r9
+	addq	%r10,%r15
+	adcq	%rdx,%r9
+.byte	102,15,58,15,255,4
+.byte	102,69,15,58,15,219,8
+.byte	102,69,15,58,15,255,12
+.byte	102,15,58,15,246,4
+.byte	102,69,15,58,15,210,8
+.byte	102,69,15,58,15,246,12
+.byte	102,15,58,15,237,4
+.byte	102,69,15,58,15,201,8
+.byte	102,69,15,58,15,237,12
+.byte	102,15,58,15,228,4
+.byte	102,69,15,58,15,192,8
+.byte	102,69,15,58,15,228,12
+	movdqa	%xmm8,0+80(%rbp)
+	movdqa	.Lrol16(%rip),%xmm8
+	paddd	%xmm7,%xmm3
+	paddd	%xmm6,%xmm2
+	paddd	%xmm5,%xmm1
+	paddd	%xmm4,%xmm0
+	pxor	%xmm3,%xmm15
+	pxor	%xmm2,%xmm14
+	movq	%r13,%r10
+	movq	%r14,%r11
+	movq	%r15,%r12
+	andq	$3,%r12
+	movq	%r15,%r13
+	andq	$-4,%r13
+	movq	%r9,%r14
+	shrdq	$2,%r9,%r15
+	shrq	$2,%r9
+	addq	%r13,%r15
+	adcq	%r14,%r9
+	addq	%r15,%r10
+	adcq	%r9,%r11
+	adcq	$0,%r12
+	pxor	%xmm1,%xmm13
+	pxor	%xmm0,%xmm12
+.byte	102,69,15,56,0,248
+.byte	102,69,15,56,0,240
+.byte	102,69,15,56,0,232
+.byte	102,69,15,56,0,224
+	movdqa	0+80(%rbp),%xmm8
+	paddd	%xmm15,%xmm11
+	paddd	%xmm14,%xmm10
+	paddd	%xmm13,%xmm9
+	paddd	%xmm12,%xmm8
+	pxor	%xmm11,%xmm7
+	pxor	%xmm10,%xmm6
+	pxor	%xmm9,%xmm5
+	pxor	%xmm8,%xmm4
+	movdqa	%xmm8,0+80(%rbp)
+	movdqa	%xmm7,%xmm8
+	psrld	$20,%xmm8
+	pslld	$32-20,%xmm7
+	pxor	%xmm8,%xmm7
+	movdqa	%xmm6,%xmm8
+	psrld	$20,%xmm8
+	pslld	$32-20,%xmm6
+	pxor	%xmm8,%xmm6
+	movdqa	%xmm5,%xmm8
+	psrld	$20,%xmm8
+	pslld	$32-20,%xmm5
+	pxor	%xmm8,%xmm5
+	movdqa	%xmm4,%xmm8
+	psrld	$20,%xmm8
+	pslld	$32-20,%xmm4
+	pxor	%xmm8,%xmm4
+	movdqa	.Lrol8(%rip),%xmm8
+	paddd	%xmm7,%xmm3
+	paddd	%xmm6,%xmm2
+	paddd	%xmm5,%xmm1
+	paddd	%xmm4,%xmm0
+	pxor	%xmm3,%xmm15
+	pxor	%xmm2,%xmm14
+	pxor	%xmm1,%xmm13
+	pxor	%xmm0,%xmm12
+.byte	102,69,15,56,0,248
+.byte	102,69,15,56,0,240
+.byte	102,69,15,56,0,232
+.byte	102,69,15,56,0,224
+	movdqa	0+80(%rbp),%xmm8
+	paddd	%xmm15,%xmm11
+	paddd	%xmm14,%xmm10
+	paddd	%xmm13,%xmm9
+	paddd	%xmm12,%xmm8
+	pxor	%xmm11,%xmm7
+	pxor	%xmm10,%xmm6
+	pxor	%xmm9,%xmm5
+	pxor	%xmm8,%xmm4
+	movdqa	%xmm8,0+80(%rbp)
+	movdqa	%xmm7,%xmm8
+	psrld	$25,%xmm8
+	pslld	$32-25,%xmm7
+	pxor	%xmm8,%xmm7
+	movdqa	%xmm6,%xmm8
+	psrld	$25,%xmm8
+	pslld	$32-25,%xmm6
+	pxor	%xmm8,%xmm6
+	movdqa	%xmm5,%xmm8
+	psrld	$25,%xmm8
+	pslld	$32-25,%xmm5
+	pxor	%xmm8,%xmm5
+	movdqa	%xmm4,%xmm8
+	psrld	$25,%xmm8
+	pslld	$32-25,%xmm4
+	pxor	%xmm8,%xmm4
+	movdqa	0+80(%rbp),%xmm8
+.byte	102,15,58,15,255,12
+.byte	102,69,15,58,15,219,8
+.byte	102,69,15,58,15,255,4
+.byte	102,15,58,15,246,12
+.byte	102,69,15,58,15,210,8
+.byte	102,69,15,58,15,246,4
+.byte	102,15,58,15,237,12
+.byte	102,69,15,58,15,201,8
+.byte	102,69,15,58,15,237,4
+.byte	102,15,58,15,228,12
+.byte	102,69,15,58,15,192,8
+.byte	102,69,15,58,15,228,4
+
+	leaq	16(%rdi),%rdi
+	decq	%r8
+	jge	.Lseal_sse_main_rounds
+	addq	0+0(%rdi),%r10
+	adcq	8+0(%rdi),%r11
+	adcq	$1,%r12
+	movq	0+0+0(%rbp),%rax
+	movq	%rax,%r15
+	mulq	%r10
+	movq	%rax,%r13
+	movq	%rdx,%r14
+	movq	0+0+0(%rbp),%rax
+	mulq	%r11
+	imulq	%r12,%r15
+	addq	%rax,%r14
+	adcq	%rdx,%r15
+	movq	8+0+0(%rbp),%rax
+	movq	%rax,%r9
+	mulq	%r10
+	addq	%rax,%r14
+	adcq	$0,%rdx
+	movq	%rdx,%r10
+	movq	8+0+0(%rbp),%rax
+	mulq	%r11
+	addq	%rax,%r15
+	adcq	$0,%rdx
+	imulq	%r12,%r9
+	addq	%r10,%r15
+	adcq	%rdx,%r9
+	movq	%r13,%r10
+	movq	%r14,%r11
+	movq	%r15,%r12
+	andq	$3,%r12
+	movq	%r15,%r13
+	andq	$-4,%r13
+	movq	%r9,%r14
+	shrdq	$2,%r9,%r15
+	shrq	$2,%r9
+	addq	%r13,%r15
+	adcq	%r14,%r9
+	addq	%r15,%r10
+	adcq	%r9,%r11
+	adcq	$0,%r12
+
+	leaq	16(%rdi),%rdi
+	decq	%rcx
+	jg	.Lseal_sse_main_rounds
+	paddd	.Lchacha20_consts(%rip),%xmm3
+	paddd	0+48(%rbp),%xmm7
+	paddd	0+64(%rbp),%xmm11
+	paddd	0+144(%rbp),%xmm15
+	paddd	.Lchacha20_consts(%rip),%xmm2
+	paddd	0+48(%rbp),%xmm6
+	paddd	0+64(%rbp),%xmm10
+	paddd	0+128(%rbp),%xmm14
+	paddd	.Lchacha20_consts(%rip),%xmm1
+	paddd	0+48(%rbp),%xmm5
+	paddd	0+64(%rbp),%xmm9
+	paddd	0+112(%rbp),%xmm13
+	paddd	.Lchacha20_consts(%rip),%xmm0
+	paddd	0+48(%rbp),%xmm4
+	paddd	0+64(%rbp),%xmm8
+	paddd	0+96(%rbp),%xmm12
+
+	movdqa	%xmm14,0+80(%rbp)
+	movdqa	%xmm14,0+80(%rbp)
+	movdqu	0 + 0(%rsi),%xmm14
+	pxor	%xmm3,%xmm14
+	movdqu	%xmm14,0 + 0(%rdi)
+	movdqu	16 + 0(%rsi),%xmm14
+	pxor	%xmm7,%xmm14
+	movdqu	%xmm14,16 + 0(%rdi)
+	movdqu	32 + 0(%rsi),%xmm14
+	pxor	%xmm11,%xmm14
+	movdqu	%xmm14,32 + 0(%rdi)
+	movdqu	48 + 0(%rsi),%xmm14
+	pxor	%xmm15,%xmm14
+	movdqu	%xmm14,48 + 0(%rdi)
+
+	movdqa	0+80(%rbp),%xmm14
+	movdqu	0 + 64(%rsi),%xmm3
+	movdqu	16 + 64(%rsi),%xmm7
+	movdqu	32 + 64(%rsi),%xmm11
+	movdqu	48 + 64(%rsi),%xmm15
+	pxor	%xmm3,%xmm2
+	pxor	%xmm7,%xmm6
+	pxor	%xmm11,%xmm10
+	pxor	%xmm14,%xmm15
+	movdqu	%xmm2,0 + 64(%rdi)
+	movdqu	%xmm6,16 + 64(%rdi)
+	movdqu	%xmm10,32 + 64(%rdi)
+	movdqu	%xmm15,48 + 64(%rdi)
+	movdqu	0 + 128(%rsi),%xmm3
+	movdqu	16 + 128(%rsi),%xmm7
+	movdqu	32 + 128(%rsi),%xmm11
+	movdqu	48 + 128(%rsi),%xmm15
+	pxor	%xmm3,%xmm1
+	pxor	%xmm7,%xmm5
+	pxor	%xmm11,%xmm9
+	pxor	%xmm13,%xmm15
+	movdqu	%xmm1,0 + 128(%rdi)
+	movdqu	%xmm5,16 + 128(%rdi)
+	movdqu	%xmm9,32 + 128(%rdi)
+	movdqu	%xmm15,48 + 128(%rdi)
+
+	cmpq	$256,%rbx
+	ja	.Lseal_sse_main_loop_xor
+
+	movq	$192,%rcx
+	subq	$192,%rbx
+	leaq	192(%rsi),%rsi
+	jmp	.Lseal_sse_128_tail_hash
+.Lseal_sse_main_loop_xor:
+	movdqu	0 + 192(%rsi),%xmm3
+	movdqu	16 + 192(%rsi),%xmm7
+	movdqu	32 + 192(%rsi),%xmm11
+	movdqu	48 + 192(%rsi),%xmm15
+	pxor	%xmm3,%xmm0
+	pxor	%xmm7,%xmm4
+	pxor	%xmm11,%xmm8
+	pxor	%xmm12,%xmm15
+	movdqu	%xmm0,0 + 192(%rdi)
+	movdqu	%xmm4,16 + 192(%rdi)
+	movdqu	%xmm8,32 + 192(%rdi)
+	movdqu	%xmm15,48 + 192(%rdi)
+
+	leaq	256(%rsi),%rsi
+	subq	$256,%rbx
+	movq	$6,%rcx
+	movq	$4,%r8
+	cmpq	$192,%rbx
+	jg	.Lseal_sse_main_loop
+	movq	%rbx,%rcx
+	testq	%rbx,%rbx
+	je	.Lseal_sse_128_tail_hash
+	movq	$6,%rcx
+	cmpq	$128,%rbx
+	ja	.Lseal_sse_tail_192
+	cmpq	$64,%rbx
+	ja	.Lseal_sse_tail_128
+
+.Lseal_sse_tail_64:
+	movdqa	.Lchacha20_consts(%rip),%xmm0
+	movdqa	0+48(%rbp),%xmm4
+	movdqa	0+64(%rbp),%xmm8
+	movdqa	0+96(%rbp),%xmm12
+	paddd	.Lsse_inc(%rip),%xmm12
+	movdqa	%xmm12,0+96(%rbp)
+
+.Lseal_sse_tail_64_rounds_and_x2hash:
+	addq	0+0(%rdi),%r10
+	adcq	8+0(%rdi),%r11
+	adcq	$1,%r12
+	movq	0+0+0(%rbp),%rax
+	movq	%rax,%r15
+	mulq	%r10
+	movq	%rax,%r13
+	movq	%rdx,%r14
+	movq	0+0+0(%rbp),%rax
+	mulq	%r11
+	imulq	%r12,%r15
+	addq	%rax,%r14
+	adcq	%rdx,%r15
+	movq	8+0+0(%rbp),%rax
+	movq	%rax,%r9
+	mulq	%r10
+	addq	%rax,%r14
+	adcq	$0,%rdx
+	movq	%rdx,%r10
+	movq	8+0+0(%rbp),%rax
+	mulq	%r11
+	addq	%rax,%r15
+	adcq	$0,%rdx
+	imulq	%r12,%r9
+	addq	%r10,%r15
+	adcq	%rdx,%r9
+	movq	%r13,%r10
+	movq	%r14,%r11
+	movq	%r15,%r12
+	andq	$3,%r12
+	movq	%r15,%r13
+	andq	$-4,%r13
+	movq	%r9,%r14
+	shrdq	$2,%r9,%r15
+	shrq	$2,%r9
+	addq	%r13,%r15
+	adcq	%r14,%r9
+	addq	%r15,%r10
+	adcq	%r9,%r11
+	adcq	$0,%r12
+
+	leaq	16(%rdi),%rdi
+.Lseal_sse_tail_64_rounds_and_x1hash:
+	paddd	%xmm4,%xmm0
+	pxor	%xmm0,%xmm12
+	pshufb	.Lrol16(%rip),%xmm12
+	paddd	%xmm12,%xmm8
+	pxor	%xmm8,%xmm4
+	movdqa	%xmm4,%xmm3
+	pslld	$12,%xmm3
+	psrld	$20,%xmm4
+	pxor	%xmm3,%xmm4
+	paddd	%xmm4,%xmm0
+	pxor	%xmm0,%xmm12
+	pshufb	.Lrol8(%rip),%xmm12
+	paddd	%xmm12,%xmm8
+	pxor	%xmm8,%xmm4
+	movdqa	%xmm4,%xmm3
+	pslld	$7,%xmm3
+	psrld	$25,%xmm4
+	pxor	%xmm3,%xmm4
+.byte	102,15,58,15,228,4
+.byte	102,69,15,58,15,192,8
+.byte	102,69,15,58,15,228,12
+	paddd	%xmm4,%xmm0
+	pxor	%xmm0,%xmm12
+	pshufb	.Lrol16(%rip),%xmm12
+	paddd	%xmm12,%xmm8
+	pxor	%xmm8,%xmm4
+	movdqa	%xmm4,%xmm3
+	pslld	$12,%xmm3
+	psrld	$20,%xmm4
+	pxor	%xmm3,%xmm4
+	paddd	%xmm4,%xmm0
+	pxor	%xmm0,%xmm12
+	pshufb	.Lrol8(%rip),%xmm12
+	paddd	%xmm12,%xmm8
+	pxor	%xmm8,%xmm4
+	movdqa	%xmm4,%xmm3
+	pslld	$7,%xmm3
+	psrld	$25,%xmm4
+	pxor	%xmm3,%xmm4
+.byte	102,15,58,15,228,12
+.byte	102,69,15,58,15,192,8
+.byte	102,69,15,58,15,228,4
+	addq	0+0(%rdi),%r10
+	adcq	8+0(%rdi),%r11
+	adcq	$1,%r12
+	movq	0+0+0(%rbp),%rax
+	movq	%rax,%r15
+	mulq	%r10
+	movq	%rax,%r13
+	movq	%rdx,%r14
+	movq	0+0+0(%rbp),%rax
+	mulq	%r11
+	imulq	%r12,%r15
+	addq	%rax,%r14
+	adcq	%rdx,%r15
+	movq	8+0+0(%rbp),%rax
+	movq	%rax,%r9
+	mulq	%r10
+	addq	%rax,%r14
+	adcq	$0,%rdx
+	movq	%rdx,%r10
+	movq	8+0+0(%rbp),%rax
+	mulq	%r11
+	addq	%rax,%r15
+	adcq	$0,%rdx
+	imulq	%r12,%r9
+	addq	%r10,%r15
+	adcq	%rdx,%r9
+	movq	%r13,%r10
+	movq	%r14,%r11
+	movq	%r15,%r12
+	andq	$3,%r12
+	movq	%r15,%r13
+	andq	$-4,%r13
+	movq	%r9,%r14
+	shrdq	$2,%r9,%r15
+	shrq	$2,%r9
+	addq	%r13,%r15
+	adcq	%r14,%r9
+	addq	%r15,%r10
+	adcq	%r9,%r11
+	adcq	$0,%r12
+
+	leaq	16(%rdi),%rdi
+	decq	%rcx
+	jg	.Lseal_sse_tail_64_rounds_and_x2hash
+	decq	%r8
+	jge	.Lseal_sse_tail_64_rounds_and_x1hash
+	paddd	.Lchacha20_consts(%rip),%xmm0
+	paddd	0+48(%rbp),%xmm4
+	paddd	0+64(%rbp),%xmm8
+	paddd	0+96(%rbp),%xmm12
+
+	jmp	.Lseal_sse_128_tail_xor
+
+.Lseal_sse_tail_128:
+	movdqa	.Lchacha20_consts(%rip),%xmm0
+	movdqa	0+48(%rbp),%xmm4
+	movdqa	0+64(%rbp),%xmm8
+	movdqa	%xmm0,%xmm1
+	movdqa	%xmm4,%xmm5
+	movdqa	%xmm8,%xmm9
+	movdqa	0+96(%rbp),%xmm13
+	paddd	.Lsse_inc(%rip),%xmm13
+	movdqa	%xmm13,%xmm12
+	paddd	.Lsse_inc(%rip),%xmm12
+	movdqa	%xmm12,0+96(%rbp)
+	movdqa	%xmm13,0+112(%rbp)
+
+.Lseal_sse_tail_128_rounds_and_x2hash:
+	addq	0+0(%rdi),%r10
+	adcq	8+0(%rdi),%r11
+	adcq	$1,%r12
+	movq	0+0+0(%rbp),%rax
+	movq	%rax,%r15
+	mulq	%r10
+	movq	%rax,%r13
+	movq	%rdx,%r14
+	movq	0+0+0(%rbp),%rax
+	mulq	%r11
+	imulq	%r12,%r15
+	addq	%rax,%r14
+	adcq	%rdx,%r15
+	movq	8+0+0(%rbp),%rax
+	movq	%rax,%r9
+	mulq	%r10
+	addq	%rax,%r14
+	adcq	$0,%rdx
+	movq	%rdx,%r10
+	movq	8+0+0(%rbp),%rax
+	mulq	%r11
+	addq	%rax,%r15
+	adcq	$0,%rdx
+	imulq	%r12,%r9
+	addq	%r10,%r15
+	adcq	%rdx,%r9
+	movq	%r13,%r10
+	movq	%r14,%r11
+	movq	%r15,%r12
+	andq	$3,%r12
+	movq	%r15,%r13
+	andq	$-4,%r13
+	movq	%r9,%r14
+	shrdq	$2,%r9,%r15
+	shrq	$2,%r9
+	addq	%r13,%r15
+	adcq	%r14,%r9
+	addq	%r15,%r10
+	adcq	%r9,%r11
+	adcq	$0,%r12
+
+	leaq	16(%rdi),%rdi
+.Lseal_sse_tail_128_rounds_and_x1hash:
+	paddd	%xmm4,%xmm0
+	pxor	%xmm0,%xmm12
+	pshufb	.Lrol16(%rip),%xmm12
+	paddd	%xmm12,%xmm8
+	pxor	%xmm8,%xmm4
+	movdqa	%xmm4,%xmm3
+	pslld	$12,%xmm3
+	psrld	$20,%xmm4
+	pxor	%xmm3,%xmm4
+	paddd	%xmm4,%xmm0
+	pxor	%xmm0,%xmm12
+	pshufb	.Lrol8(%rip),%xmm12
+	paddd	%xmm12,%xmm8
+	pxor	%xmm8,%xmm4
+	movdqa	%xmm4,%xmm3
+	pslld	$7,%xmm3
+	psrld	$25,%xmm4
+	pxor	%xmm3,%xmm4
+.byte	102,15,58,15,228,4
+.byte	102,69,15,58,15,192,8
+.byte	102,69,15,58,15,228,12
+	paddd	%xmm5,%xmm1
+	pxor	%xmm1,%xmm13
+	pshufb	.Lrol16(%rip),%xmm13
+	paddd	%xmm13,%xmm9
+	pxor	%xmm9,%xmm5
+	movdqa	%xmm5,%xmm3
+	pslld	$12,%xmm3
+	psrld	$20,%xmm5
+	pxor	%xmm3,%xmm5
+	paddd	%xmm5,%xmm1
+	pxor	%xmm1,%xmm13
+	pshufb	.Lrol8(%rip),%xmm13
+	paddd	%xmm13,%xmm9
+	pxor	%xmm9,%xmm5
+	movdqa	%xmm5,%xmm3
+	pslld	$7,%xmm3
+	psrld	$25,%xmm5
+	pxor	%xmm3,%xmm5
+.byte	102,15,58,15,237,4
+.byte	102,69,15,58,15,201,8
+.byte	102,69,15,58,15,237,12
+	addq	0+0(%rdi),%r10
+	adcq	8+0(%rdi),%r11
+	adcq	$1,%r12
+	movq	0+0+0(%rbp),%rax
+	movq	%rax,%r15
+	mulq	%r10
+	movq	%rax,%r13
+	movq	%rdx,%r14
+	movq	0+0+0(%rbp),%rax
+	mulq	%r11
+	imulq	%r12,%r15
+	addq	%rax,%r14
+	adcq	%rdx,%r15
+	movq	8+0+0(%rbp),%rax
+	movq	%rax,%r9
+	mulq	%r10
+	addq	%rax,%r14
+	adcq	$0,%rdx
+	movq	%rdx,%r10
+	movq	8+0+0(%rbp),%rax
+	mulq	%r11
+	addq	%rax,%r15
+	adcq	$0,%rdx
+	imulq	%r12,%r9
+	addq	%r10,%r15
+	adcq	%rdx,%r9
+	movq	%r13,%r10
+	movq	%r14,%r11
+	movq	%r15,%r12
+	andq	$3,%r12
+	movq	%r15,%r13
+	andq	$-4,%r13
+	movq	%r9,%r14
+	shrdq	$2,%r9,%r15
+	shrq	$2,%r9
+	addq	%r13,%r15
+	adcq	%r14,%r9
+	addq	%r15,%r10
+	adcq	%r9,%r11
+	adcq	$0,%r12
+	paddd	%xmm4,%xmm0
+	pxor	%xmm0,%xmm12
+	pshufb	.Lrol16(%rip),%xmm12
+	paddd	%xmm12,%xmm8
+	pxor	%xmm8,%xmm4
+	movdqa	%xmm4,%xmm3
+	pslld	$12,%xmm3
+	psrld	$20,%xmm4
+	pxor	%xmm3,%xmm4
+	paddd	%xmm4,%xmm0
+	pxor	%xmm0,%xmm12
+	pshufb	.Lrol8(%rip),%xmm12
+	paddd	%xmm12,%xmm8
+	pxor	%xmm8,%xmm4
+	movdqa	%xmm4,%xmm3
+	pslld	$7,%xmm3
+	psrld	$25,%xmm4
+	pxor	%xmm3,%xmm4
+.byte	102,15,58,15,228,12
+.byte	102,69,15,58,15,192,8
+.byte	102,69,15,58,15,228,4
+	paddd	%xmm5,%xmm1
+	pxor	%xmm1,%xmm13
+	pshufb	.Lrol16(%rip),%xmm13
+	paddd	%xmm13,%xmm9
+	pxor	%xmm9,%xmm5
+	movdqa	%xmm5,%xmm3
+	pslld	$12,%xmm3
+	psrld	$20,%xmm5
+	pxor	%xmm3,%xmm5
+	paddd	%xmm5,%xmm1
+	pxor	%xmm1,%xmm13
+	pshufb	.Lrol8(%rip),%xmm13
+	paddd	%xmm13,%xmm9
+	pxor	%xmm9,%xmm5
+	movdqa	%xmm5,%xmm3
+	pslld	$7,%xmm3
+	psrld	$25,%xmm5
+	pxor	%xmm3,%xmm5
+.byte	102,15,58,15,237,12
+.byte	102,69,15,58,15,201,8
+.byte	102,69,15,58,15,237,4
+
+	leaq	16(%rdi),%rdi
+	decq	%rcx
+	jg	.Lseal_sse_tail_128_rounds_and_x2hash
+	decq	%r8
+	jge	.Lseal_sse_tail_128_rounds_and_x1hash
+	paddd	.Lchacha20_consts(%rip),%xmm1
+	paddd	0+48(%rbp),%xmm5
+	paddd	0+64(%rbp),%xmm9
+	paddd	0+112(%rbp),%xmm13
+	paddd	.Lchacha20_consts(%rip),%xmm0
+	paddd	0+48(%rbp),%xmm4
+	paddd	0+64(%rbp),%xmm8
+	paddd	0+96(%rbp),%xmm12
+	movdqu	0 + 0(%rsi),%xmm3
+	movdqu	16 + 0(%rsi),%xmm7
+	movdqu	32 + 0(%rsi),%xmm11
+	movdqu	48 + 0(%rsi),%xmm15
+	pxor	%xmm3,%xmm1
+	pxor	%xmm7,%xmm5
+	pxor	%xmm11,%xmm9
+	pxor	%xmm13,%xmm15
+	movdqu	%xmm1,0 + 0(%rdi)
+	movdqu	%xmm5,16 + 0(%rdi)
+	movdqu	%xmm9,32 + 0(%rdi)
+	movdqu	%xmm15,48 + 0(%rdi)
+
+	movq	$64,%rcx
+	subq	$64,%rbx
+	leaq	64(%rsi),%rsi
+	jmp	.Lseal_sse_128_tail_hash
+
+.Lseal_sse_tail_192:
+	movdqa	.Lchacha20_consts(%rip),%xmm0
+	movdqa	0+48(%rbp),%xmm4
+	movdqa	0+64(%rbp),%xmm8
+	movdqa	%xmm0,%xmm1
+	movdqa	%xmm4,%xmm5
+	movdqa	%xmm8,%xmm9
+	movdqa	%xmm0,%xmm2
+	movdqa	%xmm4,%xmm6
+	movdqa	%xmm8,%xmm10
+	movdqa	0+96(%rbp),%xmm14
+	paddd	.Lsse_inc(%rip),%xmm14
+	movdqa	%xmm14,%xmm13
+	paddd	.Lsse_inc(%rip),%xmm13
+	movdqa	%xmm13,%xmm12
+	paddd	.Lsse_inc(%rip),%xmm12
+	movdqa	%xmm12,0+96(%rbp)
+	movdqa	%xmm13,0+112(%rbp)
+	movdqa	%xmm14,0+128(%rbp)
+
+.Lseal_sse_tail_192_rounds_and_x2hash:
+	addq	0+0(%rdi),%r10
+	adcq	8+0(%rdi),%r11
+	adcq	$1,%r12
+	movq	0+0+0(%rbp),%rax
+	movq	%rax,%r15
+	mulq	%r10
+	movq	%rax,%r13
+	movq	%rdx,%r14
+	movq	0+0+0(%rbp),%rax
+	mulq	%r11
+	imulq	%r12,%r15
+	addq	%rax,%r14
+	adcq	%rdx,%r15
+	movq	8+0+0(%rbp),%rax
+	movq	%rax,%r9
+	mulq	%r10
+	addq	%rax,%r14
+	adcq	$0,%rdx
+	movq	%rdx,%r10
+	movq	8+0+0(%rbp),%rax
+	mulq	%r11
+	addq	%rax,%r15
+	adcq	$0,%rdx
+	imulq	%r12,%r9
+	addq	%r10,%r15
+	adcq	%rdx,%r9
+	movq	%r13,%r10
+	movq	%r14,%r11
+	movq	%r15,%r12
+	andq	$3,%r12
+	movq	%r15,%r13
+	andq	$-4,%r13
+	movq	%r9,%r14
+	shrdq	$2,%r9,%r15
+	shrq	$2,%r9
+	addq	%r13,%r15
+	adcq	%r14,%r9
+	addq	%r15,%r10
+	adcq	%r9,%r11
+	adcq	$0,%r12
+
+	leaq	16(%rdi),%rdi
+.Lseal_sse_tail_192_rounds_and_x1hash:
+	paddd	%xmm4,%xmm0
+	pxor	%xmm0,%xmm12
+	pshufb	.Lrol16(%rip),%xmm12
+	paddd	%xmm12,%xmm8
+	pxor	%xmm8,%xmm4
+	movdqa	%xmm4,%xmm3
+	pslld	$12,%xmm3
+	psrld	$20,%xmm4
+	pxor	%xmm3,%xmm4
+	paddd	%xmm4,%xmm0
+	pxor	%xmm0,%xmm12
+	pshufb	.Lrol8(%rip),%xmm12
+	paddd	%xmm12,%xmm8
+	pxor	%xmm8,%xmm4
+	movdqa	%xmm4,%xmm3
+	pslld	$7,%xmm3
+	psrld	$25,%xmm4
+	pxor	%xmm3,%xmm4
+.byte	102,15,58,15,228,4
+.byte	102,69,15,58,15,192,8
+.byte	102,69,15,58,15,228,12
+	paddd	%xmm5,%xmm1
+	pxor	%xmm1,%xmm13
+	pshufb	.Lrol16(%rip),%xmm13
+	paddd	%xmm13,%xmm9
+	pxor	%xmm9,%xmm5
+	movdqa	%xmm5,%xmm3
+	pslld	$12,%xmm3
+	psrld	$20,%xmm5
+	pxor	%xmm3,%xmm5
+	paddd	%xmm5,%xmm1
+	pxor	%xmm1,%xmm13
+	pshufb	.Lrol8(%rip),%xmm13
+	paddd	%xmm13,%xmm9
+	pxor	%xmm9,%xmm5
+	movdqa	%xmm5,%xmm3
+	pslld	$7,%xmm3
+	psrld	$25,%xmm5
+	pxor	%xmm3,%xmm5
+.byte	102,15,58,15,237,4
+.byte	102,69,15,58,15,201,8
+.byte	102,69,15,58,15,237,12
+	paddd	%xmm6,%xmm2
+	pxor	%xmm2,%xmm14
+	pshufb	.Lrol16(%rip),%xmm14
+	paddd	%xmm14,%xmm10
+	pxor	%xmm10,%xmm6
+	movdqa	%xmm6,%xmm3
+	pslld	$12,%xmm3
+	psrld	$20,%xmm6
+	pxor	%xmm3,%xmm6
+	paddd	%xmm6,%xmm2
+	pxor	%xmm2,%xmm14
+	pshufb	.Lrol8(%rip),%xmm14
+	paddd	%xmm14,%xmm10
+	pxor	%xmm10,%xmm6
+	movdqa	%xmm6,%xmm3
+	pslld	$7,%xmm3
+	psrld	$25,%xmm6
+	pxor	%xmm3,%xmm6
+.byte	102,15,58,15,246,4
+.byte	102,69,15,58,15,210,8
+.byte	102,69,15,58,15,246,12
+	addq	0+0(%rdi),%r10
+	adcq	8+0(%rdi),%r11
+	adcq	$1,%r12
+	movq	0+0+0(%rbp),%rax
+	movq	%rax,%r15
+	mulq	%r10
+	movq	%rax,%r13
+	movq	%rdx,%r14
+	movq	0+0+0(%rbp),%rax
+	mulq	%r11
+	imulq	%r12,%r15
+	addq	%rax,%r14
+	adcq	%rdx,%r15
+	movq	8+0+0(%rbp),%rax
+	movq	%rax,%r9
+	mulq	%r10
+	addq	%rax,%r14
+	adcq	$0,%rdx
+	movq	%rdx,%r10
+	movq	8+0+0(%rbp),%rax
+	mulq	%r11
+	addq	%rax,%r15
+	adcq	$0,%rdx
+	imulq	%r12,%r9
+	addq	%r10,%r15
+	adcq	%rdx,%r9
+	movq	%r13,%r10
+	movq	%r14,%r11
+	movq	%r15,%r12
+	andq	$3,%r12
+	movq	%r15,%r13
+	andq	$-4,%r13
+	movq	%r9,%r14
+	shrdq	$2,%r9,%r15
+	shrq	$2,%r9
+	addq	%r13,%r15
+	adcq	%r14,%r9
+	addq	%r15,%r10
+	adcq	%r9,%r11
+	adcq	$0,%r12
+	paddd	%xmm4,%xmm0
+	pxor	%xmm0,%xmm12
+	pshufb	.Lrol16(%rip),%xmm12
+	paddd	%xmm12,%xmm8
+	pxor	%xmm8,%xmm4
+	movdqa	%xmm4,%xmm3
+	pslld	$12,%xmm3
+	psrld	$20,%xmm4
+	pxor	%xmm3,%xmm4
+	paddd	%xmm4,%xmm0
+	pxor	%xmm0,%xmm12
+	pshufb	.Lrol8(%rip),%xmm12
+	paddd	%xmm12,%xmm8
+	pxor	%xmm8,%xmm4
+	movdqa	%xmm4,%xmm3
+	pslld	$7,%xmm3
+	psrld	$25,%xmm4
+	pxor	%xmm3,%xmm4
+.byte	102,15,58,15,228,12
+.byte	102,69,15,58,15,192,8
+.byte	102,69,15,58,15,228,4
+	paddd	%xmm5,%xmm1
+	pxor	%xmm1,%xmm13
+	pshufb	.Lrol16(%rip),%xmm13
+	paddd	%xmm13,%xmm9
+	pxor	%xmm9,%xmm5
+	movdqa	%xmm5,%xmm3
+	pslld	$12,%xmm3
+	psrld	$20,%xmm5
+	pxor	%xmm3,%xmm5
+	paddd	%xmm5,%xmm1
+	pxor	%xmm1,%xmm13
+	pshufb	.Lrol8(%rip),%xmm13
+	paddd	%xmm13,%xmm9
+	pxor	%xmm9,%xmm5
+	movdqa	%xmm5,%xmm3
+	pslld	$7,%xmm3
+	psrld	$25,%xmm5
+	pxor	%xmm3,%xmm5
+.byte	102,15,58,15,237,12
+.byte	102,69,15,58,15,201,8
+.byte	102,69,15,58,15,237,4
+	paddd	%xmm6,%xmm2
+	pxor	%xmm2,%xmm14
+	pshufb	.Lrol16(%rip),%xmm14
+	paddd	%xmm14,%xmm10
+	pxor	%xmm10,%xmm6
+	movdqa	%xmm6,%xmm3
+	pslld	$12,%xmm3
+	psrld	$20,%xmm6
+	pxor	%xmm3,%xmm6
+	paddd	%xmm6,%xmm2
+	pxor	%xmm2,%xmm14
+	pshufb	.Lrol8(%rip),%xmm14
+	paddd	%xmm14,%xmm10
+	pxor	%xmm10,%xmm6
+	movdqa	%xmm6,%xmm3
+	pslld	$7,%xmm3
+	psrld	$25,%xmm6
+	pxor	%xmm3,%xmm6
+.byte	102,15,58,15,246,12
+.byte	102,69,15,58,15,210,8
+.byte	102,69,15,58,15,246,4
+
+	leaq	16(%rdi),%rdi
+	decq	%rcx
+	jg	.Lseal_sse_tail_192_rounds_and_x2hash
+	decq	%r8
+	jge	.Lseal_sse_tail_192_rounds_and_x1hash
+	paddd	.Lchacha20_consts(%rip),%xmm2
+	paddd	0+48(%rbp),%xmm6
+	paddd	0+64(%rbp),%xmm10
+	paddd	0+128(%rbp),%xmm14
+	paddd	.Lchacha20_consts(%rip),%xmm1
+	paddd	0+48(%rbp),%xmm5
+	paddd	0+64(%rbp),%xmm9
+	paddd	0+112(%rbp),%xmm13
+	paddd	.Lchacha20_consts(%rip),%xmm0
+	paddd	0+48(%rbp),%xmm4
+	paddd	0+64(%rbp),%xmm8
+	paddd	0+96(%rbp),%xmm12
+	movdqu	0 + 0(%rsi),%xmm3
+	movdqu	16 + 0(%rsi),%xmm7
+	movdqu	32 + 0(%rsi),%xmm11
+	movdqu	48 + 0(%rsi),%xmm15
+	pxor	%xmm3,%xmm2
+	pxor	%xmm7,%xmm6
+	pxor	%xmm11,%xmm10
+	pxor	%xmm14,%xmm15
+	movdqu	%xmm2,0 + 0(%rdi)
+	movdqu	%xmm6,16 + 0(%rdi)
+	movdqu	%xmm10,32 + 0(%rdi)
+	movdqu	%xmm15,48 + 0(%rdi)
+	movdqu	0 + 64(%rsi),%xmm3
+	movdqu	16 + 64(%rsi),%xmm7
+	movdqu	32 + 64(%rsi),%xmm11
+	movdqu	48 + 64(%rsi),%xmm15
+	pxor	%xmm3,%xmm1
+	pxor	%xmm7,%xmm5
+	pxor	%xmm11,%xmm9
+	pxor	%xmm13,%xmm15
+	movdqu	%xmm1,0 + 64(%rdi)
+	movdqu	%xmm5,16 + 64(%rdi)
+	movdqu	%xmm9,32 + 64(%rdi)
+	movdqu	%xmm15,48 + 64(%rdi)
+
+	movq	$128,%rcx
+	subq	$128,%rbx
+	leaq	128(%rsi),%rsi
+
+.Lseal_sse_128_tail_hash:
+	cmpq	$16,%rcx
+	jb	.Lseal_sse_128_tail_xor
+	addq	0+0(%rdi),%r10
+	adcq	8+0(%rdi),%r11
+	adcq	$1,%r12
+	movq	0+0+0(%rbp),%rax
+	movq	%rax,%r15
+	mulq	%r10
+	movq	%rax,%r13
+	movq	%rdx,%r14
+	movq	0+0+0(%rbp),%rax
+	mulq	%r11
+	imulq	%r12,%r15
+	addq	%rax,%r14
+	adcq	%rdx,%r15
+	movq	8+0+0(%rbp),%rax
+	movq	%rax,%r9
+	mulq	%r10
+	addq	%rax,%r14
+	adcq	$0,%rdx
+	movq	%rdx,%r10
+	movq	8+0+0(%rbp),%rax
+	mulq	%r11
+	addq	%rax,%r15
+	adcq	$0,%rdx
+	imulq	%r12,%r9
+	addq	%r10,%r15
+	adcq	%rdx,%r9
+	movq	%r13,%r10
+	movq	%r14,%r11
+	movq	%r15,%r12
+	andq	$3,%r12
+	movq	%r15,%r13
+	andq	$-4,%r13
+	movq	%r9,%r14
+	shrdq	$2,%r9,%r15
+	shrq	$2,%r9
+	addq	%r13,%r15
+	adcq	%r14,%r9
+	addq	%r15,%r10
+	adcq	%r9,%r11
+	adcq	$0,%r12
+
+	subq	$16,%rcx
+	leaq	16(%rdi),%rdi
+	jmp	.Lseal_sse_128_tail_hash
+
+.Lseal_sse_128_tail_xor:
+	cmpq	$16,%rbx
+	jb	.Lseal_sse_tail_16
+	subq	$16,%rbx
+
+	movdqu	0(%rsi),%xmm3
+	pxor	%xmm3,%xmm0
+	movdqu	%xmm0,0(%rdi)
+
+	addq	0(%rdi),%r10
+	adcq	8(%rdi),%r11
+	adcq	$1,%r12
+	leaq	16(%rsi),%rsi
+	leaq	16(%rdi),%rdi
+	movq	0+0+0(%rbp),%rax
+	movq	%rax,%r15
+	mulq	%r10
+	movq	%rax,%r13
+	movq	%rdx,%r14
+	movq	0+0+0(%rbp),%rax
+	mulq	%r11
+	imulq	%r12,%r15
+	addq	%rax,%r14
+	adcq	%rdx,%r15
+	movq	8+0+0(%rbp),%rax
+	movq	%rax,%r9
+	mulq	%r10
+	addq	%rax,%r14
+	adcq	$0,%rdx
+	movq	%rdx,%r10
+	movq	8+0+0(%rbp),%rax
+	mulq	%r11
+	addq	%rax,%r15
+	adcq	$0,%rdx
+	imulq	%r12,%r9
+	addq	%r10,%r15
+	adcq	%rdx,%r9
+	movq	%r13,%r10
+	movq	%r14,%r11
+	movq	%r15,%r12
+	andq	$3,%r12
+	movq	%r15,%r13
+	andq	$-4,%r13
+	movq	%r9,%r14
+	shrdq	$2,%r9,%r15
+	shrq	$2,%r9
+	addq	%r13,%r15
+	adcq	%r14,%r9
+	addq	%r15,%r10
+	adcq	%r9,%r11
+	adcq	$0,%r12
+
+
+	movdqa	%xmm4,%xmm0
+	movdqa	%xmm8,%xmm4
+	movdqa	%xmm12,%xmm8
+	movdqa	%xmm1,%xmm12
+	movdqa	%xmm5,%xmm1
+	movdqa	%xmm9,%xmm5
+	movdqa	%xmm13,%xmm9
+	jmp	.Lseal_sse_128_tail_xor
+
+.Lseal_sse_tail_16:
+	testq	%rbx,%rbx
+	jz	.Lprocess_blocks_of_extra_in
+
+	movq	%rbx,%r8
+	movq	%rbx,%rcx
+	leaq	-1(%rsi,%rbx,1),%rsi
+	pxor	%xmm15,%xmm15
+.Lseal_sse_tail_16_compose:
+	pslldq	$1,%xmm15
+	pinsrb	$0,(%rsi),%xmm15
+	leaq	-1(%rsi),%rsi
+	decq	%rcx
+	jne	.Lseal_sse_tail_16_compose
+
+
+	pxor	%xmm0,%xmm15
+
+
+	movq	%rbx,%rcx
+	movdqu	%xmm15,%xmm0
+.Lseal_sse_tail_16_extract:
+	pextrb	$0,%xmm0,(%rdi)
+	psrldq	$1,%xmm0
+	addq	$1,%rdi
+	subq	$1,%rcx
+	jnz	.Lseal_sse_tail_16_extract
+
+
+
+
+
+
+
+
+	movq	288 + 0 + 32(%rsp),%r9
+	movq	56(%r9),%r14
+	movq	48(%r9),%r13
+	testq	%r14,%r14
+	jz	.Lprocess_partial_block
+
+	movq	$16,%r15
+	subq	%rbx,%r15
+	cmpq	%r15,%r14
+
+	jge	.Lload_extra_in
+	movq	%r14,%r15
+
+.Lload_extra_in:
+
+
+	leaq	-1(%r13,%r15,1),%rsi
+
+
+	addq	%r15,%r13
+	subq	%r15,%r14
+	movq	%r13,48(%r9)
+	movq	%r14,56(%r9)
+
+
+
+	addq	%r15,%r8
+
+
+	pxor	%xmm11,%xmm11
+.Lload_extra_load_loop:
+	pslldq	$1,%xmm11
+	pinsrb	$0,(%rsi),%xmm11
+	leaq	-1(%rsi),%rsi
+	subq	$1,%r15
+	jnz	.Lload_extra_load_loop
+
+
+
+
+	movq	%rbx,%r15
+
+.Lload_extra_shift_loop:
+	pslldq	$1,%xmm11
+	subq	$1,%r15
+	jnz	.Lload_extra_shift_loop
+
+
+
+
+	leaq	.Land_masks(%rip),%r15
+	shlq	$4,%rbx
+	pand	-16(%r15,%rbx,1),%xmm15
+
+
+	por	%xmm11,%xmm15
+
+
+
+.byte	102,77,15,126,253
+	pextrq	$1,%xmm15,%r14
+	addq	%r13,%r10
+	adcq	%r14,%r11
+	adcq	$1,%r12
+	movq	0+0+0(%rbp),%rax
+	movq	%rax,%r15
+	mulq	%r10
+	movq	%rax,%r13
+	movq	%rdx,%r14
+	movq	0+0+0(%rbp),%rax
+	mulq	%r11
+	imulq	%r12,%r15
+	addq	%rax,%r14
+	adcq	%rdx,%r15
+	movq	8+0+0(%rbp),%rax
+	movq	%rax,%r9
+	mulq	%r10
+	addq	%rax,%r14
+	adcq	$0,%rdx
+	movq	%rdx,%r10
+	movq	8+0+0(%rbp),%rax
+	mulq	%r11
+	addq	%rax,%r15
+	adcq	$0,%rdx
+	imulq	%r12,%r9
+	addq	%r10,%r15
+	adcq	%rdx,%r9
+	movq	%r13,%r10
+	movq	%r14,%r11
+	movq	%r15,%r12
+	andq	$3,%r12
+	movq	%r15,%r13
+	andq	$-4,%r13
+	movq	%r9,%r14
+	shrdq	$2,%r9,%r15
+	shrq	$2,%r9
+	addq	%r13,%r15
+	adcq	%r14,%r9
+	addq	%r15,%r10
+	adcq	%r9,%r11
+	adcq	$0,%r12
+
+
+.Lprocess_blocks_of_extra_in:
+
+	movq	288+32+0 (%rsp),%r9
+	movq	48(%r9),%rsi
+	movq	56(%r9),%r8
+	movq	%r8,%rcx
+	shrq	$4,%r8
+
+.Lprocess_extra_hash_loop:
+	jz	process_extra_in_trailer
+	addq	0+0(%rsi),%r10
+	adcq	8+0(%rsi),%r11
+	adcq	$1,%r12
+	movq	0+0+0(%rbp),%rax
+	movq	%rax,%r15
+	mulq	%r10
+	movq	%rax,%r13
+	movq	%rdx,%r14
+	movq	0+0+0(%rbp),%rax
+	mulq	%r11
+	imulq	%r12,%r15
+	addq	%rax,%r14
+	adcq	%rdx,%r15
+	movq	8+0+0(%rbp),%rax
+	movq	%rax,%r9
+	mulq	%r10
+	addq	%rax,%r14
+	adcq	$0,%rdx
+	movq	%rdx,%r10
+	movq	8+0+0(%rbp),%rax
+	mulq	%r11
+	addq	%rax,%r15
+	adcq	$0,%rdx
+	imulq	%r12,%r9
+	addq	%r10,%r15
+	adcq	%rdx,%r9
+	movq	%r13,%r10
+	movq	%r14,%r11
+	movq	%r15,%r12
+	andq	$3,%r12
+	movq	%r15,%r13
+	andq	$-4,%r13
+	movq	%r9,%r14
+	shrdq	$2,%r9,%r15
+	shrq	$2,%r9
+	addq	%r13,%r15
+	adcq	%r14,%r9
+	addq	%r15,%r10
+	adcq	%r9,%r11
+	adcq	$0,%r12
+
+	leaq	16(%rsi),%rsi
+	subq	$1,%r8
+	jmp	.Lprocess_extra_hash_loop
+process_extra_in_trailer:
+	andq	$15,%rcx
+	movq	%rcx,%rbx
+	jz	.Ldo_length_block
+	leaq	-1(%rsi,%rcx,1),%rsi
+
+.Lprocess_extra_in_trailer_load:
+	pslldq	$1,%xmm15
+	pinsrb	$0,(%rsi),%xmm15
+	leaq	-1(%rsi),%rsi
+	subq	$1,%rcx
+	jnz	.Lprocess_extra_in_trailer_load
+
+.Lprocess_partial_block:
+
+	leaq	.Land_masks(%rip),%r15
+	shlq	$4,%rbx
+	pand	-16(%r15,%rbx,1),%xmm15
+.byte	102,77,15,126,253
+	pextrq	$1,%xmm15,%r14
+	addq	%r13,%r10
+	adcq	%r14,%r11
+	adcq	$1,%r12
+	movq	0+0+0(%rbp),%rax
+	movq	%rax,%r15
+	mulq	%r10
+	movq	%rax,%r13
+	movq	%rdx,%r14
+	movq	0+0+0(%rbp),%rax
+	mulq	%r11
+	imulq	%r12,%r15
+	addq	%rax,%r14
+	adcq	%rdx,%r15
+	movq	8+0+0(%rbp),%rax
+	movq	%rax,%r9
+	mulq	%r10
+	addq	%rax,%r14
+	adcq	$0,%rdx
+	movq	%rdx,%r10
+	movq	8+0+0(%rbp),%rax
+	mulq	%r11
+	addq	%rax,%r15
+	adcq	$0,%rdx
+	imulq	%r12,%r9
+	addq	%r10,%r15
+	adcq	%rdx,%r9
+	movq	%r13,%r10
+	movq	%r14,%r11
+	movq	%r15,%r12
+	andq	$3,%r12
+	movq	%r15,%r13
+	andq	$-4,%r13
+	movq	%r9,%r14
+	shrdq	$2,%r9,%r15
+	shrq	$2,%r9
+	addq	%r13,%r15
+	adcq	%r14,%r9
+	addq	%r15,%r10
+	adcq	%r9,%r11
+	adcq	$0,%r12
+
+
+.Ldo_length_block:
+	addq	0+0+32(%rbp),%r10
+	adcq	8+0+32(%rbp),%r11
+	adcq	$1,%r12
+	movq	0+0+0(%rbp),%rax
+	movq	%rax,%r15
+	mulq	%r10
+	movq	%rax,%r13
+	movq	%rdx,%r14
+	movq	0+0+0(%rbp),%rax
+	mulq	%r11
+	imulq	%r12,%r15
+	addq	%rax,%r14
+	adcq	%rdx,%r15
+	movq	8+0+0(%rbp),%rax
+	movq	%rax,%r9
+	mulq	%r10
+	addq	%rax,%r14
+	adcq	$0,%rdx
+	movq	%rdx,%r10
+	movq	8+0+0(%rbp),%rax
+	mulq	%r11
+	addq	%rax,%r15
+	adcq	$0,%rdx
+	imulq	%r12,%r9
+	addq	%r10,%r15
+	adcq	%rdx,%r9
+	movq	%r13,%r10
+	movq	%r14,%r11
+	movq	%r15,%r12
+	andq	$3,%r12
+	movq	%r15,%r13
+	andq	$-4,%r13
+	movq	%r9,%r14
+	shrdq	$2,%r9,%r15
+	shrq	$2,%r9
+	addq	%r13,%r15
+	adcq	%r14,%r9
+	addq	%r15,%r10
+	adcq	%r9,%r11
+	adcq	$0,%r12
+
+
+	movq	%r10,%r13
+	movq	%r11,%r14
+	movq	%r12,%r15
+	subq	$-5,%r10
+	sbbq	$-1,%r11
+	sbbq	$3,%r12
+	cmovcq	%r13,%r10
+	cmovcq	%r14,%r11
+	cmovcq	%r15,%r12
+
+	addq	0+0+16(%rbp),%r10
+	adcq	8+0+16(%rbp),%r11
+
+.cfi_remember_state	
+	addq	$288 + 0 + 32,%rsp
+.cfi_adjust_cfa_offset	-(288 + 32)
+
+	popq	%r9
+.cfi_adjust_cfa_offset	-8
+.cfi_restore	%r9
+	movq	%r10,(%r9)
+	movq	%r11,8(%r9)
+	popq	%r15
+.cfi_adjust_cfa_offset	-8
+.cfi_restore	%r15
+	popq	%r14
+.cfi_adjust_cfa_offset	-8
+.cfi_restore	%r14
+	popq	%r13
+.cfi_adjust_cfa_offset	-8
+.cfi_restore	%r13
+	popq	%r12
+.cfi_adjust_cfa_offset	-8
+.cfi_restore	%r12
+	popq	%rbx
+.cfi_adjust_cfa_offset	-8
+.cfi_restore	%rbx
+	popq	%rbp
+.cfi_adjust_cfa_offset	-8
+.cfi_restore	%rbp
+	.byte	0xf3,0xc3
+
+.Lseal_sse_128:
+.cfi_restore_state	
+	movdqu	.Lchacha20_consts(%rip),%xmm0
+	movdqa	%xmm0,%xmm1
+	movdqa	%xmm0,%xmm2
+	movdqu	0(%r9),%xmm4
+	movdqa	%xmm4,%xmm5
+	movdqa	%xmm4,%xmm6
+	movdqu	16(%r9),%xmm8
+	movdqa	%xmm8,%xmm9
+	movdqa	%xmm8,%xmm10
+	movdqu	32(%r9),%xmm14
+	movdqa	%xmm14,%xmm12
+	paddd	.Lsse_inc(%rip),%xmm12
+	movdqa	%xmm12,%xmm13
+	paddd	.Lsse_inc(%rip),%xmm13
+	movdqa	%xmm4,%xmm7
+	movdqa	%xmm8,%xmm11
+	movdqa	%xmm12,%xmm15
+	movq	$10,%r10
+
+.Lseal_sse_128_rounds:
+	paddd	%xmm4,%xmm0
+	pxor	%xmm0,%xmm12
+	pshufb	.Lrol16(%rip),%xmm12
+	paddd	%xmm12,%xmm8
+	pxor	%xmm8,%xmm4
+	movdqa	%xmm4,%xmm3
+	pslld	$12,%xmm3
+	psrld	$20,%xmm4
+	pxor	%xmm3,%xmm4
+	paddd	%xmm4,%xmm0
+	pxor	%xmm0,%xmm12
+	pshufb	.Lrol8(%rip),%xmm12
+	paddd	%xmm12,%xmm8
+	pxor	%xmm8,%xmm4
+	movdqa	%xmm4,%xmm3
+	pslld	$7,%xmm3
+	psrld	$25,%xmm4
+	pxor	%xmm3,%xmm4
+.byte	102,15,58,15,228,4
+.byte	102,69,15,58,15,192,8
+.byte	102,69,15,58,15,228,12
+	paddd	%xmm5,%xmm1
+	pxor	%xmm1,%xmm13
+	pshufb	.Lrol16(%rip),%xmm13
+	paddd	%xmm13,%xmm9
+	pxor	%xmm9,%xmm5
+	movdqa	%xmm5,%xmm3
+	pslld	$12,%xmm3
+	psrld	$20,%xmm5
+	pxor	%xmm3,%xmm5
+	paddd	%xmm5,%xmm1
+	pxor	%xmm1,%xmm13
+	pshufb	.Lrol8(%rip),%xmm13
+	paddd	%xmm13,%xmm9
+	pxor	%xmm9,%xmm5
+	movdqa	%xmm5,%xmm3
+	pslld	$7,%xmm3
+	psrld	$25,%xmm5
+	pxor	%xmm3,%xmm5
+.byte	102,15,58,15,237,4
+.byte	102,69,15,58,15,201,8
+.byte	102,69,15,58,15,237,12
+	paddd	%xmm6,%xmm2
+	pxor	%xmm2,%xmm14
+	pshufb	.Lrol16(%rip),%xmm14
+	paddd	%xmm14,%xmm10
+	pxor	%xmm10,%xmm6
+	movdqa	%xmm6,%xmm3
+	pslld	$12,%xmm3
+	psrld	$20,%xmm6
+	pxor	%xmm3,%xmm6
+	paddd	%xmm6,%xmm2
+	pxor	%xmm2,%xmm14
+	pshufb	.Lrol8(%rip),%xmm14
+	paddd	%xmm14,%xmm10
+	pxor	%xmm10,%xmm6
+	movdqa	%xmm6,%xmm3
+	pslld	$7,%xmm3
+	psrld	$25,%xmm6
+	pxor	%xmm3,%xmm6
+.byte	102,15,58,15,246,4
+.byte	102,69,15,58,15,210,8
+.byte	102,69,15,58,15,246,12
+	paddd	%xmm4,%xmm0
+	pxor	%xmm0,%xmm12
+	pshufb	.Lrol16(%rip),%xmm12
+	paddd	%xmm12,%xmm8
+	pxor	%xmm8,%xmm4
+	movdqa	%xmm4,%xmm3
+	pslld	$12,%xmm3
+	psrld	$20,%xmm4
+	pxor	%xmm3,%xmm4
+	paddd	%xmm4,%xmm0
+	pxor	%xmm0,%xmm12
+	pshufb	.Lrol8(%rip),%xmm12
+	paddd	%xmm12,%xmm8
+	pxor	%xmm8,%xmm4
+	movdqa	%xmm4,%xmm3
+	pslld	$7,%xmm3
+	psrld	$25,%xmm4
+	pxor	%xmm3,%xmm4
+.byte	102,15,58,15,228,12
+.byte	102,69,15,58,15,192,8
+.byte	102,69,15,58,15,228,4
+	paddd	%xmm5,%xmm1
+	pxor	%xmm1,%xmm13
+	pshufb	.Lrol16(%rip),%xmm13
+	paddd	%xmm13,%xmm9
+	pxor	%xmm9,%xmm5
+	movdqa	%xmm5,%xmm3
+	pslld	$12,%xmm3
+	psrld	$20,%xmm5
+	pxor	%xmm3,%xmm5
+	paddd	%xmm5,%xmm1
+	pxor	%xmm1,%xmm13
+	pshufb	.Lrol8(%rip),%xmm13
+	paddd	%xmm13,%xmm9
+	pxor	%xmm9,%xmm5
+	movdqa	%xmm5,%xmm3
+	pslld	$7,%xmm3
+	psrld	$25,%xmm5
+	pxor	%xmm3,%xmm5
+.byte	102,15,58,15,237,12
+.byte	102,69,15,58,15,201,8
+.byte	102,69,15,58,15,237,4
+	paddd	%xmm6,%xmm2
+	pxor	%xmm2,%xmm14
+	pshufb	.Lrol16(%rip),%xmm14
+	paddd	%xmm14,%xmm10
+	pxor	%xmm10,%xmm6
+	movdqa	%xmm6,%xmm3
+	pslld	$12,%xmm3
+	psrld	$20,%xmm6
+	pxor	%xmm3,%xmm6
+	paddd	%xmm6,%xmm2
+	pxor	%xmm2,%xmm14
+	pshufb	.Lrol8(%rip),%xmm14
+	paddd	%xmm14,%xmm10
+	pxor	%xmm10,%xmm6
+	movdqa	%xmm6,%xmm3
+	pslld	$7,%xmm3
+	psrld	$25,%xmm6
+	pxor	%xmm3,%xmm6
+.byte	102,15,58,15,246,12
+.byte	102,69,15,58,15,210,8
+.byte	102,69,15,58,15,246,4
+
+	decq	%r10
+	jnz	.Lseal_sse_128_rounds
+	paddd	.Lchacha20_consts(%rip),%xmm0
+	paddd	.Lchacha20_consts(%rip),%xmm1
+	paddd	.Lchacha20_consts(%rip),%xmm2
+	paddd	%xmm7,%xmm4
+	paddd	%xmm7,%xmm5
+	paddd	%xmm7,%xmm6
+	paddd	%xmm11,%xmm8
+	paddd	%xmm11,%xmm9
+	paddd	%xmm15,%xmm12
+	paddd	.Lsse_inc(%rip),%xmm15
+	paddd	%xmm15,%xmm13
+
+	pand	.Lclamp(%rip),%xmm2
+	movdqa	%xmm2,0+0(%rbp)
+	movdqa	%xmm6,0+16(%rbp)
+
+	movq	%r8,%r8
+	call	poly_hash_ad_internal
+	jmp	.Lseal_sse_128_tail_xor
+.size	chacha20_poly1305_seal, .-chacha20_poly1305_seal
+.cfi_endproc	
+
+
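The scalar runs of addq/adcq followed by mulq (or mulxq) that recur between the vector instructions, and that make up loops such as .Lopen_avx2_init_hash below, are each one Poly1305 block: 16 bytes of data plus a high bit are added into the accumulator h, h is multiplied by the clamped key half r held at 0(%rbp)/8(%rbp), and the bits above 2^130 are folded back in using 2^130 = 5 (mod 2^130 - 5). For reference, a plain-C sketch of that step (written for this note, not part of the patch and not the code these .S files were generated from; names and the use of unsigned __int128 are mine):

	#include <stdint.h>

	typedef unsigned __int128 u128;

	/* One Poly1305 block: h = ((h + m) * r) mod 2^130 - 5, with h kept as
	 * three 64-bit limbs whose top limb stays tiny, like %r10/%r11/%r12 in
	 * the assembly.  r[0]/r[1] are the clamped key half, which keeps the
	 * products and the folded carry below within 128 bits. */
	static void poly1305_block(uint64_t h[3], const uint64_t r[2],
	                           uint64_t m0, uint64_t m1, uint64_t hibit)
	{
		/* h += block || hibit   (the addq/adcq/adcq $1 lines) */
		u128 t = (u128)h[0] + m0;
		h[0] = (uint64_t)t;
		t = (u128)h[1] + m1 + (uint64_t)(t >> 64);
		h[1] = (uint64_t)t;
		h[2] += hibit + (uint64_t)(t >> 64);

		/* h *= r: schoolbook multiply into four limbs l0..l3
		 * (the mulq/mulxq and imulq lines) */
		u128 d0 = (u128)h[0] * r[0];
		u128 d1 = (u128)h[0] * r[1] + (u128)h[1] * r[0];
		u128 d2 = (u128)h[1] * r[1] + (u128)h[2] * r[0];
		u128 d3 = (u128)h[2] * r[1];
		d1 += (uint64_t)(d0 >> 64);
		d2 += (uint64_t)(d1 >> 64);
		d3 += (uint64_t)(d2 >> 64);
		uint64_t l0 = (uint64_t)d0, l1 = (uint64_t)d1;
		uint64_t l2 = (uint64_t)d2, l3 = (uint64_t)d3;

		/* partial reduction: keep the low 130 bits, fold the rest back
		 * in as 5*c = 4*c + c, since 2^130 = 5 mod p.  This is the
		 * andq $3 / andq $-4 / shrdq $2 / addq+adcq+adcq sequence. */
		u128 c  = ((u128)l3 << 62) | (l2 >> 2);            /* (l3:l2) >> 2 */
		u128 c5 = c + (((u128)l3 << 64) | (l2 & ~(uint64_t)3));
		h[2] = l2 & 3;
		t = (u128)l0 + (uint64_t)c5;
		h[0] = (uint64_t)t;
		t = (u128)l1 + (uint64_t)(c5 >> 64) + (uint64_t)(t >> 64);
		h[1] = (uint64_t)t;
		h[2] += (uint64_t)(t >> 64);
	}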
+.type	chacha20_poly1305_open_avx2,@function
+.align	64
+chacha20_poly1305_open_avx2:
+.cfi_startproc	
+
+
+.cfi_adjust_cfa_offset	8
+.cfi_offset	%rbp,-16
+.cfi_adjust_cfa_offset	8
+.cfi_offset	%rbx,-24
+.cfi_adjust_cfa_offset	8
+.cfi_offset	%r12,-32
+.cfi_adjust_cfa_offset	8
+.cfi_offset	%r13,-40
+.cfi_adjust_cfa_offset	8
+.cfi_offset	%r14,-48
+.cfi_adjust_cfa_offset	8
+.cfi_offset	%r15,-56
+.cfi_adjust_cfa_offset	8
+.cfi_offset	%r9,-64
+.cfi_adjust_cfa_offset	288 + 32
+
+	vzeroupper
+	vmovdqa	.Lchacha20_consts(%rip),%ymm0
+	vbroadcasti128	0(%r9),%ymm4
+	vbroadcasti128	16(%r9),%ymm8
+	vbroadcasti128	32(%r9),%ymm12
+	vpaddd	.Lavx2_init(%rip),%ymm12,%ymm12
+	cmpq	$192,%rbx
+	jbe	.Lopen_avx2_192
+	cmpq	$320,%rbx
+	jbe	.Lopen_avx2_320
+
+	vmovdqa	%ymm4,0+64(%rbp)
+	vmovdqa	%ymm8,0+96(%rbp)
+	vmovdqa	%ymm12,0+160(%rbp)
+	movq	$10,%r10
+.Lopen_avx2_init_rounds:
+	vpaddd	%ymm4,%ymm0,%ymm0
+	vpxor	%ymm0,%ymm12,%ymm12
+	vpshufb	.Lrol16(%rip),%ymm12,%ymm12
+	vpaddd	%ymm12,%ymm8,%ymm8
+	vpxor	%ymm8,%ymm4,%ymm4
+	vpsrld	$20,%ymm4,%ymm3
+	vpslld	$12,%ymm4,%ymm4
+	vpxor	%ymm3,%ymm4,%ymm4
+	vpaddd	%ymm4,%ymm0,%ymm0
+	vpxor	%ymm0,%ymm12,%ymm12
+	vpshufb	.Lrol8(%rip),%ymm12,%ymm12
+	vpaddd	%ymm12,%ymm8,%ymm8
+	vpxor	%ymm8,%ymm4,%ymm4
+	vpslld	$7,%ymm4,%ymm3
+	vpsrld	$25,%ymm4,%ymm4
+	vpxor	%ymm3,%ymm4,%ymm4
+	vpalignr	$12,%ymm12,%ymm12,%ymm12
+	vpalignr	$8,%ymm8,%ymm8,%ymm8
+	vpalignr	$4,%ymm4,%ymm4,%ymm4
+	vpaddd	%ymm4,%ymm0,%ymm0
+	vpxor	%ymm0,%ymm12,%ymm12
+	vpshufb	.Lrol16(%rip),%ymm12,%ymm12
+	vpaddd	%ymm12,%ymm8,%ymm8
+	vpxor	%ymm8,%ymm4,%ymm4
+	vpsrld	$20,%ymm4,%ymm3
+	vpslld	$12,%ymm4,%ymm4
+	vpxor	%ymm3,%ymm4,%ymm4
+	vpaddd	%ymm4,%ymm0,%ymm0
+	vpxor	%ymm0,%ymm12,%ymm12
+	vpshufb	.Lrol8(%rip),%ymm12,%ymm12
+	vpaddd	%ymm12,%ymm8,%ymm8
+	vpxor	%ymm8,%ymm4,%ymm4
+	vpslld	$7,%ymm4,%ymm3
+	vpsrld	$25,%ymm4,%ymm4
+	vpxor	%ymm3,%ymm4,%ymm4
+	vpalignr	$4,%ymm12,%ymm12,%ymm12
+	vpalignr	$8,%ymm8,%ymm8,%ymm8
+	vpalignr	$12,%ymm4,%ymm4,%ymm4
+
+	decq	%r10
+	jne	.Lopen_avx2_init_rounds
+	vpaddd	.Lchacha20_consts(%rip),%ymm0,%ymm0
+	vpaddd	0+64(%rbp),%ymm4,%ymm4
+	vpaddd	0+96(%rbp),%ymm8,%ymm8
+	vpaddd	0+160(%rbp),%ymm12,%ymm12
+
+	vperm2i128	$0x02,%ymm0,%ymm4,%ymm3
+
+	vpand	.Lclamp(%rip),%ymm3,%ymm3
+	vmovdqa	%ymm3,0+0(%rbp)
+
+	vperm2i128	$0x13,%ymm0,%ymm4,%ymm0
+	vperm2i128	$0x13,%ymm8,%ymm12,%ymm4
+
+	movq	%r8,%r8
+	call	poly_hash_ad_internal
+
+	xorq	%rcx,%rcx
+.Lopen_avx2_init_hash:
+	addq	0+0(%rsi,%rcx,1),%r10
+	adcq	8+0(%rsi,%rcx,1),%r11
+	adcq	$1,%r12
+	movq	0+0+0(%rbp),%rax
+	movq	%rax,%r15
+	mulq	%r10
+	movq	%rax,%r13
+	movq	%rdx,%r14
+	movq	0+0+0(%rbp),%rax
+	mulq	%r11
+	imulq	%r12,%r15
+	addq	%rax,%r14
+	adcq	%rdx,%r15
+	movq	8+0+0(%rbp),%rax
+	movq	%rax,%r9
+	mulq	%r10
+	addq	%rax,%r14
+	adcq	$0,%rdx
+	movq	%rdx,%r10
+	movq	8+0+0(%rbp),%rax
+	mulq	%r11
+	addq	%rax,%r15
+	adcq	$0,%rdx
+	imulq	%r12,%r9
+	addq	%r10,%r15
+	adcq	%rdx,%r9
+	movq	%r13,%r10
+	movq	%r14,%r11
+	movq	%r15,%r12
+	andq	$3,%r12
+	movq	%r15,%r13
+	andq	$-4,%r13
+	movq	%r9,%r14
+	shrdq	$2,%r9,%r15
+	shrq	$2,%r9
+	addq	%r13,%r15
+	adcq	%r14,%r9
+	addq	%r15,%r10
+	adcq	%r9,%r11
+	adcq	$0,%r12
+
+	addq	$16,%rcx
+	cmpq	$64,%rcx
+	jne	.Lopen_avx2_init_hash
+
+	vpxor	0(%rsi),%ymm0,%ymm0
+	vpxor	32(%rsi),%ymm4,%ymm4
+
+	vmovdqu	%ymm0,0(%rdi)
+	vmovdqu	%ymm4,32(%rdi)
+	leaq	64(%rsi),%rsi
+	leaq	64(%rdi),%rdi
+	subq	$64,%rbx
+.Lopen_avx2_main_loop:
+
+	cmpq	$512,%rbx
+	jb	.Lopen_avx2_main_loop_done
+	vmovdqa	.Lchacha20_consts(%rip),%ymm0
+	vmovdqa	0+64(%rbp),%ymm4
+	vmovdqa	0+96(%rbp),%ymm8
+	vmovdqa	%ymm0,%ymm1
+	vmovdqa	%ymm4,%ymm5
+	vmovdqa	%ymm8,%ymm9
+	vmovdqa	%ymm0,%ymm2
+	vmovdqa	%ymm4,%ymm6
+	vmovdqa	%ymm8,%ymm10
+	vmovdqa	%ymm0,%ymm3
+	vmovdqa	%ymm4,%ymm7
+	vmovdqa	%ymm8,%ymm11
+	vmovdqa	.Lavx2_inc(%rip),%ymm12
+	vpaddd	0+160(%rbp),%ymm12,%ymm15
+	vpaddd	%ymm15,%ymm12,%ymm14
+	vpaddd	%ymm14,%ymm12,%ymm13
+	vpaddd	%ymm13,%ymm12,%ymm12
+	vmovdqa	%ymm15,0+256(%rbp)
+	vmovdqa	%ymm14,0+224(%rbp)
+	vmovdqa	%ymm13,0+192(%rbp)
+	vmovdqa	%ymm12,0+160(%rbp)
+
+	xorq	%rcx,%rcx
+.Lopen_avx2_main_loop_rounds:
+	addq	0+0(%rsi,%rcx,1),%r10
+	adcq	8+0(%rsi,%rcx,1),%r11
+	adcq	$1,%r12
+	vmovdqa	%ymm8,0+128(%rbp)
+	vmovdqa	.Lrol16(%rip),%ymm8
+	vpaddd	%ymm7,%ymm3,%ymm3
+	vpaddd	%ymm6,%ymm2,%ymm2
+	vpaddd	%ymm5,%ymm1,%ymm1
+	vpaddd	%ymm4,%ymm0,%ymm0
+	vpxor	%ymm3,%ymm15,%ymm15
+	vpxor	%ymm2,%ymm14,%ymm14
+	vpxor	%ymm1,%ymm13,%ymm13
+	vpxor	%ymm0,%ymm12,%ymm12
+	movq	0+0+0(%rbp),%rdx
+	movq	%rdx,%r15
+	mulxq	%r10,%r13,%r14
+	mulxq	%r11,%rax,%rdx
+	imulq	%r12,%r15
+	addq	%rax,%r14
+	adcq	%rdx,%r15
+	vpshufb	%ymm8,%ymm15,%ymm15
+	vpshufb	%ymm8,%ymm14,%ymm14
+	vpshufb	%ymm8,%ymm13,%ymm13
+	vpshufb	%ymm8,%ymm12,%ymm12
+	vpaddd	%ymm15,%ymm11,%ymm11
+	vpaddd	%ymm14,%ymm10,%ymm10
+	vpaddd	%ymm13,%ymm9,%ymm9
+	vpaddd	0+128(%rbp),%ymm12,%ymm8
+	vpxor	%ymm11,%ymm7,%ymm7
+	movq	8+0+0(%rbp),%rdx
+	mulxq	%r10,%r10,%rax
+	addq	%r10,%r14
+	mulxq	%r11,%r11,%r9
+	adcq	%r11,%r15
+	adcq	$0,%r9
+	imulq	%r12,%rdx
+	vpxor	%ymm10,%ymm6,%ymm6
+	vpxor	%ymm9,%ymm5,%ymm5
+	vpxor	%ymm8,%ymm4,%ymm4
+	vmovdqa	%ymm8,0+128(%rbp)
+	vpsrld	$20,%ymm7,%ymm8
+	vpslld	$32-20,%ymm7,%ymm7
+	vpxor	%ymm8,%ymm7,%ymm7
+	vpsrld	$20,%ymm6,%ymm8
+	vpslld	$32-20,%ymm6,%ymm6
+	vpxor	%ymm8,%ymm6,%ymm6
+	vpsrld	$20,%ymm5,%ymm8
+	vpslld	$32-20,%ymm5,%ymm5
+	addq	%rax,%r15
+	adcq	%rdx,%r9
+	vpxor	%ymm8,%ymm5,%ymm5
+	vpsrld	$20,%ymm4,%ymm8
+	vpslld	$32-20,%ymm4,%ymm4
+	vpxor	%ymm8,%ymm4,%ymm4
+	vmovdqa	.Lrol8(%rip),%ymm8
+	vpaddd	%ymm7,%ymm3,%ymm3
+	vpaddd	%ymm6,%ymm2,%ymm2
+	vpaddd	%ymm5,%ymm1,%ymm1
+	vpaddd	%ymm4,%ymm0,%ymm0
+	vpxor	%ymm3,%ymm15,%ymm15
+	movq	%r13,%r10
+	movq	%r14,%r11
+	movq	%r15,%r12
+	andq	$3,%r12
+	movq	%r15,%r13
+	andq	$-4,%r13
+	movq	%r9,%r14
+	shrdq	$2,%r9,%r15
+	shrq	$2,%r9
+	addq	%r13,%r15
+	adcq	%r14,%r9
+	addq	%r15,%r10
+	adcq	%r9,%r11
+	adcq	$0,%r12
+	vpxor	%ymm2,%ymm14,%ymm14
+	vpxor	%ymm1,%ymm13,%ymm13
+	vpxor	%ymm0,%ymm12,%ymm12
+	vpshufb	%ymm8,%ymm15,%ymm15
+	vpshufb	%ymm8,%ymm14,%ymm14
+	vpshufb	%ymm8,%ymm13,%ymm13
+	vpshufb	%ymm8,%ymm12,%ymm12
+	vpaddd	%ymm15,%ymm11,%ymm11
+	vpaddd	%ymm14,%ymm10,%ymm10
+	addq	0+16(%rsi,%rcx,1),%r10
+	adcq	8+16(%rsi,%rcx,1),%r11
+	adcq	$1,%r12
+	vpaddd	%ymm13,%ymm9,%ymm9
+	vpaddd	0+128(%rbp),%ymm12,%ymm8
+	vpxor	%ymm11,%ymm7,%ymm7
+	vpxor	%ymm10,%ymm6,%ymm6
+	vpxor	%ymm9,%ymm5,%ymm5
+	vpxor	%ymm8,%ymm4,%ymm4
+	vmovdqa	%ymm8,0+128(%rbp)
+	vpsrld	$25,%ymm7,%ymm8
+	movq	0+0+0(%rbp),%rdx
+	movq	%rdx,%r15
+	mulxq	%r10,%r13,%r14
+	mulxq	%r11,%rax,%rdx
+	imulq	%r12,%r15
+	addq	%rax,%r14
+	adcq	%rdx,%r15
+	vpslld	$32-25,%ymm7,%ymm7
+	vpxor	%ymm8,%ymm7,%ymm7
+	vpsrld	$25,%ymm6,%ymm8
+	vpslld	$32-25,%ymm6,%ymm6
+	vpxor	%ymm8,%ymm6,%ymm6
+	vpsrld	$25,%ymm5,%ymm8
+	vpslld	$32-25,%ymm5,%ymm5
+	vpxor	%ymm8,%ymm5,%ymm5
+	vpsrld	$25,%ymm4,%ymm8
+	vpslld	$32-25,%ymm4,%ymm4
+	vpxor	%ymm8,%ymm4,%ymm4
+	vmovdqa	0+128(%rbp),%ymm8
+	vpalignr	$4,%ymm7,%ymm7,%ymm7
+	vpalignr	$8,%ymm11,%ymm11,%ymm11
+	vpalignr	$12,%ymm15,%ymm15,%ymm15
+	vpalignr	$4,%ymm6,%ymm6,%ymm6
+	vpalignr	$8,%ymm10,%ymm10,%ymm10
+	vpalignr	$12,%ymm14,%ymm14,%ymm14
+	movq	8+0+0(%rbp),%rdx
+	mulxq	%r10,%r10,%rax
+	addq	%r10,%r14
+	mulxq	%r11,%r11,%r9
+	adcq	%r11,%r15
+	adcq	$0,%r9
+	imulq	%r12,%rdx
+	vpalignr	$4,%ymm5,%ymm5,%ymm5
+	vpalignr	$8,%ymm9,%ymm9,%ymm9
+	vpalignr	$12,%ymm13,%ymm13,%ymm13
+	vpalignr	$4,%ymm4,%ymm4,%ymm4
+	vpalignr	$8,%ymm8,%ymm8,%ymm8
+	vpalignr	$12,%ymm12,%ymm12,%ymm12
+	vmovdqa	%ymm8,0+128(%rbp)
+	vmovdqa	.Lrol16(%rip),%ymm8
+	vpaddd	%ymm7,%ymm3,%ymm3
+	vpaddd	%ymm6,%ymm2,%ymm2
+	vpaddd	%ymm5,%ymm1,%ymm1
+	vpaddd	%ymm4,%ymm0,%ymm0
+	vpxor	%ymm3,%ymm15,%ymm15
+	vpxor	%ymm2,%ymm14,%ymm14
+	vpxor	%ymm1,%ymm13,%ymm13
+	vpxor	%ymm0,%ymm12,%ymm12
+	vpshufb	%ymm8,%ymm15,%ymm15
+	vpshufb	%ymm8,%ymm14,%ymm14
+	addq	%rax,%r15
+	adcq	%rdx,%r9
+	vpshufb	%ymm8,%ymm13,%ymm13
+	vpshufb	%ymm8,%ymm12,%ymm12
+	vpaddd	%ymm15,%ymm11,%ymm11
+	vpaddd	%ymm14,%ymm10,%ymm10
+	vpaddd	%ymm13,%ymm9,%ymm9
+	vpaddd	0+128(%rbp),%ymm12,%ymm8
+	vpxor	%ymm11,%ymm7,%ymm7
+	vpxor	%ymm10,%ymm6,%ymm6
+	vpxor	%ymm9,%ymm5,%ymm5
+	movq	%r13,%r10
+	movq	%r14,%r11
+	movq	%r15,%r12
+	andq	$3,%r12
+	movq	%r15,%r13
+	andq	$-4,%r13
+	movq	%r9,%r14
+	shrdq	$2,%r9,%r15
+	shrq	$2,%r9
+	addq	%r13,%r15
+	adcq	%r14,%r9
+	addq	%r15,%r10
+	adcq	%r9,%r11
+	adcq	$0,%r12
+	vpxor	%ymm8,%ymm4,%ymm4
+	vmovdqa	%ymm8,0+128(%rbp)
+	vpsrld	$20,%ymm7,%ymm8
+	vpslld	$32-20,%ymm7,%ymm7
+	vpxor	%ymm8,%ymm7,%ymm7
+	vpsrld	$20,%ymm6,%ymm8
+	vpslld	$32-20,%ymm6,%ymm6
+	vpxor	%ymm8,%ymm6,%ymm6
+	addq	0+32(%rsi,%rcx,1),%r10
+	adcq	8+32(%rsi,%rcx,1),%r11
+	adcq	$1,%r12
+
+	leaq	48(%rcx),%rcx
+	vpsrld	$20,%ymm5,%ymm8
+	vpslld	$32-20,%ymm5,%ymm5
+	vpxor	%ymm8,%ymm5,%ymm5
+	vpsrld	$20,%ymm4,%ymm8
+	vpslld	$32-20,%ymm4,%ymm4
+	vpxor	%ymm8,%ymm4,%ymm4
+	vmovdqa	.Lrol8(%rip),%ymm8
+	vpaddd	%ymm7,%ymm3,%ymm3
+	vpaddd	%ymm6,%ymm2,%ymm2
+	vpaddd	%ymm5,%ymm1,%ymm1
+	vpaddd	%ymm4,%ymm0,%ymm0
+	vpxor	%ymm3,%ymm15,%ymm15
+	vpxor	%ymm2,%ymm14,%ymm14
+	vpxor	%ymm1,%ymm13,%ymm13
+	vpxor	%ymm0,%ymm12,%ymm12
+	vpshufb	%ymm8,%ymm15,%ymm15
+	vpshufb	%ymm8,%ymm14,%ymm14
+	vpshufb	%ymm8,%ymm13,%ymm13
+	movq	0+0+0(%rbp),%rdx
+	movq	%rdx,%r15
+	mulxq	%r10,%r13,%r14
+	mulxq	%r11,%rax,%rdx
+	imulq	%r12,%r15
+	addq	%rax,%r14
+	adcq	%rdx,%r15
+	vpshufb	%ymm8,%ymm12,%ymm12
+	vpaddd	%ymm15,%ymm11,%ymm11
+	vpaddd	%ymm14,%ymm10,%ymm10
+	vpaddd	%ymm13,%ymm9,%ymm9
+	vpaddd	0+128(%rbp),%ymm12,%ymm8
+	vpxor	%ymm11,%ymm7,%ymm7
+	vpxor	%ymm10,%ymm6,%ymm6
+	vpxor	%ymm9,%ymm5,%ymm5
+	movq	8+0+0(%rbp),%rdx
+	mulxq	%r10,%r10,%rax
+	addq	%r10,%r14
+	mulxq	%r11,%r11,%r9
+	adcq	%r11,%r15
+	adcq	$0,%r9
+	imulq	%r12,%rdx
+	vpxor	%ymm8,%ymm4,%ymm4
+	vmovdqa	%ymm8,0+128(%rbp)
+	vpsrld	$25,%ymm7,%ymm8
+	vpslld	$32-25,%ymm7,%ymm7
+	vpxor	%ymm8,%ymm7,%ymm7
+	vpsrld	$25,%ymm6,%ymm8
+	vpslld	$32-25,%ymm6,%ymm6
+	vpxor	%ymm8,%ymm6,%ymm6
+	addq	%rax,%r15
+	adcq	%rdx,%r9
+	vpsrld	$25,%ymm5,%ymm8
+	vpslld	$32-25,%ymm5,%ymm5
+	vpxor	%ymm8,%ymm5,%ymm5
+	vpsrld	$25,%ymm4,%ymm8
+	vpslld	$32-25,%ymm4,%ymm4
+	vpxor	%ymm8,%ymm4,%ymm4
+	vmovdqa	0+128(%rbp),%ymm8
+	vpalignr	$12,%ymm7,%ymm7,%ymm7
+	vpalignr	$8,%ymm11,%ymm11,%ymm11
+	vpalignr	$4,%ymm15,%ymm15,%ymm15
+	vpalignr	$12,%ymm6,%ymm6,%ymm6
+	vpalignr	$8,%ymm10,%ymm10,%ymm10
+	vpalignr	$4,%ymm14,%ymm14,%ymm14
+	vpalignr	$12,%ymm5,%ymm5,%ymm5
+	vpalignr	$8,%ymm9,%ymm9,%ymm9
+	vpalignr	$4,%ymm13,%ymm13,%ymm13
+	vpalignr	$12,%ymm4,%ymm4,%ymm4
+	vpalignr	$8,%ymm8,%ymm8,%ymm8
+	movq	%r13,%r10
+	movq	%r14,%r11
+	movq	%r15,%r12
+	andq	$3,%r12
+	movq	%r15,%r13
+	andq	$-4,%r13
+	movq	%r9,%r14
+	shrdq	$2,%r9,%r15
+	shrq	$2,%r9
+	addq	%r13,%r15
+	adcq	%r14,%r9
+	addq	%r15,%r10
+	adcq	%r9,%r11
+	adcq	$0,%r12
+	vpalignr	$4,%ymm12,%ymm12,%ymm12
+
+	cmpq	$60*8,%rcx
+	jne	.Lopen_avx2_main_loop_rounds
+	vpaddd	.Lchacha20_consts(%rip),%ymm3,%ymm3
+	vpaddd	0+64(%rbp),%ymm7,%ymm7
+	vpaddd	0+96(%rbp),%ymm11,%ymm11
+	vpaddd	0+256(%rbp),%ymm15,%ymm15
+	vpaddd	.Lchacha20_consts(%rip),%ymm2,%ymm2
+	vpaddd	0+64(%rbp),%ymm6,%ymm6
+	vpaddd	0+96(%rbp),%ymm10,%ymm10
+	vpaddd	0+224(%rbp),%ymm14,%ymm14
+	vpaddd	.Lchacha20_consts(%rip),%ymm1,%ymm1
+	vpaddd	0+64(%rbp),%ymm5,%ymm5
+	vpaddd	0+96(%rbp),%ymm9,%ymm9
+	vpaddd	0+192(%rbp),%ymm13,%ymm13
+	vpaddd	.Lchacha20_consts(%rip),%ymm0,%ymm0
+	vpaddd	0+64(%rbp),%ymm4,%ymm4
+	vpaddd	0+96(%rbp),%ymm8,%ymm8
+	vpaddd	0+160(%rbp),%ymm12,%ymm12
+
+	vmovdqa	%ymm0,0+128(%rbp)
+	addq	0+60*8(%rsi),%r10
+	adcq	8+60*8(%rsi),%r11
+	adcq	$1,%r12
+	vperm2i128	$0x02,%ymm3,%ymm7,%ymm0
+	vperm2i128	$0x13,%ymm3,%ymm7,%ymm7
+	vperm2i128	$0x02,%ymm11,%ymm15,%ymm3
+	vperm2i128	$0x13,%ymm11,%ymm15,%ymm11
+	vpxor	0+0(%rsi),%ymm0,%ymm0
+	vpxor	32+0(%rsi),%ymm3,%ymm3
+	vpxor	64+0(%rsi),%ymm7,%ymm7
+	vpxor	96+0(%rsi),%ymm11,%ymm11
+	vmovdqu	%ymm0,0+0(%rdi)
+	vmovdqu	%ymm3,32+0(%rdi)
+	vmovdqu	%ymm7,64+0(%rdi)
+	vmovdqu	%ymm11,96+0(%rdi)
+
+	vmovdqa	0+128(%rbp),%ymm0
+	movq	0+0+0(%rbp),%rax
+	movq	%rax,%r15
+	mulq	%r10
+	movq	%rax,%r13
+	movq	%rdx,%r14
+	movq	0+0+0(%rbp),%rax
+	mulq	%r11
+	imulq	%r12,%r15
+	addq	%rax,%r14
+	adcq	%rdx,%r15
+	movq	8+0+0(%rbp),%rax
+	movq	%rax,%r9
+	mulq	%r10
+	addq	%rax,%r14
+	adcq	$0,%rdx
+	movq	%rdx,%r10
+	movq	8+0+0(%rbp),%rax
+	mulq	%r11
+	addq	%rax,%r15
+	adcq	$0,%rdx
+	imulq	%r12,%r9
+	addq	%r10,%r15
+	adcq	%rdx,%r9
+	movq	%r13,%r10
+	movq	%r14,%r11
+	movq	%r15,%r12
+	andq	$3,%r12
+	movq	%r15,%r13
+	andq	$-4,%r13
+	movq	%r9,%r14
+	shrdq	$2,%r9,%r15
+	shrq	$2,%r9
+	addq	%r13,%r15
+	adcq	%r14,%r9
+	addq	%r15,%r10
+	adcq	%r9,%r11
+	adcq	$0,%r12
+	vperm2i128	$0x02,%ymm2,%ymm6,%ymm3
+	vperm2i128	$0x13,%ymm2,%ymm6,%ymm6
+	vperm2i128	$0x02,%ymm10,%ymm14,%ymm2
+	vperm2i128	$0x13,%ymm10,%ymm14,%ymm10
+	vpxor	0+128(%rsi),%ymm3,%ymm3
+	vpxor	32+128(%rsi),%ymm2,%ymm2
+	vpxor	64+128(%rsi),%ymm6,%ymm6
+	vpxor	96+128(%rsi),%ymm10,%ymm10
+	vmovdqu	%ymm3,0+128(%rdi)
+	vmovdqu	%ymm2,32+128(%rdi)
+	vmovdqu	%ymm6,64+128(%rdi)
+	vmovdqu	%ymm10,96+128(%rdi)
+	addq	0+60*8+16(%rsi),%r10
+	adcq	8+60*8+16(%rsi),%r11
+	adcq	$1,%r12
+	vperm2i128	$0x02,%ymm1,%ymm5,%ymm3
+	vperm2i128	$0x13,%ymm1,%ymm5,%ymm5
+	vperm2i128	$0x02,%ymm9,%ymm13,%ymm1
+	vperm2i128	$0x13,%ymm9,%ymm13,%ymm9
+	vpxor	0+256(%rsi),%ymm3,%ymm3
+	vpxor	32+256(%rsi),%ymm1,%ymm1
+	vpxor	64+256(%rsi),%ymm5,%ymm5
+	vpxor	96+256(%rsi),%ymm9,%ymm9
+	vmovdqu	%ymm3,0+256(%rdi)
+	vmovdqu	%ymm1,32+256(%rdi)
+	vmovdqu	%ymm5,64+256(%rdi)
+	vmovdqu	%ymm9,96+256(%rdi)
+	movq	0+0+0(%rbp),%rax
+	movq	%rax,%r15
+	mulq	%r10
+	movq	%rax,%r13
+	movq	%rdx,%r14
+	movq	0+0+0(%rbp),%rax
+	mulq	%r11
+	imulq	%r12,%r15
+	addq	%rax,%r14
+	adcq	%rdx,%r15
+	movq	8+0+0(%rbp),%rax
+	movq	%rax,%r9
+	mulq	%r10
+	addq	%rax,%r14
+	adcq	$0,%rdx
+	movq	%rdx,%r10
+	movq	8+0+0(%rbp),%rax
+	mulq	%r11
+	addq	%rax,%r15
+	adcq	$0,%rdx
+	imulq	%r12,%r9
+	addq	%r10,%r15
+	adcq	%rdx,%r9
+	movq	%r13,%r10
+	movq	%r14,%r11
+	movq	%r15,%r12
+	andq	$3,%r12
+	movq	%r15,%r13
+	andq	$-4,%r13
+	movq	%r9,%r14
+	shrdq	$2,%r9,%r15
+	shrq	$2,%r9
+	addq	%r13,%r15
+	adcq	%r14,%r9
+	addq	%r15,%r10
+	adcq	%r9,%r11
+	adcq	$0,%r12
+	vperm2i128	$0x02,%ymm0,%ymm4,%ymm3
+	vperm2i128	$0x13,%ymm0,%ymm4,%ymm4
+	vperm2i128	$0x02,%ymm8,%ymm12,%ymm0
+	vperm2i128	$0x13,%ymm8,%ymm12,%ymm8
+	vpxor	0+384(%rsi),%ymm3,%ymm3
+	vpxor	32+384(%rsi),%ymm0,%ymm0
+	vpxor	64+384(%rsi),%ymm4,%ymm4
+	vpxor	96+384(%rsi),%ymm8,%ymm8
+	vmovdqu	%ymm3,0+384(%rdi)
+	vmovdqu	%ymm0,32+384(%rdi)
+	vmovdqu	%ymm4,64+384(%rdi)
+	vmovdqu	%ymm8,96+384(%rdi)
+
+	leaq	512(%rsi),%rsi
+	leaq	512(%rdi),%rdi
+	subq	$512,%rbx
+	jmp	.Lopen_avx2_main_loop
+.Lopen_avx2_main_loop_done:
+	testq	%rbx,%rbx
+	vzeroupper
+	je	.Lopen_sse_finalize
+
+	cmpq	$384,%rbx
+	ja	.Lopen_avx2_tail_512
+	cmpq	$256,%rbx
+	ja	.Lopen_avx2_tail_384
+	cmpq	$128,%rbx
+	ja	.Lopen_avx2_tail_256
+	vmovdqa	.Lchacha20_consts(%rip),%ymm0
+	vmovdqa	0+64(%rbp),%ymm4
+	vmovdqa	0+96(%rbp),%ymm8
+	vmovdqa	.Lavx2_inc(%rip),%ymm12
+	vpaddd	0+160(%rbp),%ymm12,%ymm12
+	vmovdqa	%ymm12,0+160(%rbp)
+
+	xorq	%r8,%r8
+	movq	%rbx,%rcx
+	andq	$-16,%rcx
+	testq	%rcx,%rcx
+	je	.Lopen_avx2_tail_128_rounds
+.Lopen_avx2_tail_128_rounds_and_x1hash:
+	addq	0+0(%rsi,%r8,1),%r10
+	adcq	8+0(%rsi,%r8,1),%r11
+	adcq	$1,%r12
+	movq	0+0+0(%rbp),%rax
+	movq	%rax,%r15
+	mulq	%r10
+	movq	%rax,%r13
+	movq	%rdx,%r14
+	movq	0+0+0(%rbp),%rax
+	mulq	%r11
+	imulq	%r12,%r15
+	addq	%rax,%r14
+	adcq	%rdx,%r15
+	movq	8+0+0(%rbp),%rax
+	movq	%rax,%r9
+	mulq	%r10
+	addq	%rax,%r14
+	adcq	$0,%rdx
+	movq	%rdx,%r10
+	movq	8+0+0(%rbp),%rax
+	mulq	%r11
+	addq	%rax,%r15
+	adcq	$0,%rdx
+	imulq	%r12,%r9
+	addq	%r10,%r15
+	adcq	%rdx,%r9
+	movq	%r13,%r10
+	movq	%r14,%r11
+	movq	%r15,%r12
+	andq	$3,%r12
+	movq	%r15,%r13
+	andq	$-4,%r13
+	movq	%r9,%r14
+	shrdq	$2,%r9,%r15
+	shrq	$2,%r9
+	addq	%r13,%r15
+	adcq	%r14,%r9
+	addq	%r15,%r10
+	adcq	%r9,%r11
+	adcq	$0,%r12
+
+.Lopen_avx2_tail_128_rounds:
+	addq	$16,%r8
+	vpaddd	%ymm4,%ymm0,%ymm0
+	vpxor	%ymm0,%ymm12,%ymm12
+	vpshufb	.Lrol16(%rip),%ymm12,%ymm12
+	vpaddd	%ymm12,%ymm8,%ymm8
+	vpxor	%ymm8,%ymm4,%ymm4
+	vpsrld	$20,%ymm4,%ymm3
+	vpslld	$12,%ymm4,%ymm4
+	vpxor	%ymm3,%ymm4,%ymm4
+	vpaddd	%ymm4,%ymm0,%ymm0
+	vpxor	%ymm0,%ymm12,%ymm12
+	vpshufb	.Lrol8(%rip),%ymm12,%ymm12
+	vpaddd	%ymm12,%ymm8,%ymm8
+	vpxor	%ymm8,%ymm4,%ymm4
+	vpslld	$7,%ymm4,%ymm3
+	vpsrld	$25,%ymm4,%ymm4
+	vpxor	%ymm3,%ymm4,%ymm4
+	vpalignr	$12,%ymm12,%ymm12,%ymm12
+	vpalignr	$8,%ymm8,%ymm8,%ymm8
+	vpalignr	$4,%ymm4,%ymm4,%ymm4
+	vpaddd	%ymm4,%ymm0,%ymm0
+	vpxor	%ymm0,%ymm12,%ymm12
+	vpshufb	.Lrol16(%rip),%ymm12,%ymm12
+	vpaddd	%ymm12,%ymm8,%ymm8
+	vpxor	%ymm8,%ymm4,%ymm4
+	vpsrld	$20,%ymm4,%ymm3
+	vpslld	$12,%ymm4,%ymm4
+	vpxor	%ymm3,%ymm4,%ymm4
+	vpaddd	%ymm4,%ymm0,%ymm0
+	vpxor	%ymm0,%ymm12,%ymm12
+	vpshufb	.Lrol8(%rip),%ymm12,%ymm12
+	vpaddd	%ymm12,%ymm8,%ymm8
+	vpxor	%ymm8,%ymm4,%ymm4
+	vpslld	$7,%ymm4,%ymm3
+	vpsrld	$25,%ymm4,%ymm4
+	vpxor	%ymm3,%ymm4,%ymm4
+	vpalignr	$4,%ymm12,%ymm12,%ymm12
+	vpalignr	$8,%ymm8,%ymm8,%ymm8
+	vpalignr	$12,%ymm4,%ymm4,%ymm4
+
+	cmpq	%rcx,%r8
+	jb	.Lopen_avx2_tail_128_rounds_and_x1hash
+	cmpq	$160,%r8
+	jne	.Lopen_avx2_tail_128_rounds
+	vpaddd	.Lchacha20_consts(%rip),%ymm0,%ymm0
+	vpaddd	0+64(%rbp),%ymm4,%ymm4
+	vpaddd	0+96(%rbp),%ymm8,%ymm8
+	vpaddd	0+160(%rbp),%ymm12,%ymm12
+	vperm2i128	$0x13,%ymm0,%ymm4,%ymm3
+	vperm2i128	$0x02,%ymm0,%ymm4,%ymm0
+	vperm2i128	$0x02,%ymm8,%ymm12,%ymm4
+	vperm2i128	$0x13,%ymm8,%ymm12,%ymm12
+	vmovdqa	%ymm3,%ymm8
+
+	jmp	.Lopen_avx2_tail_128_xor
+
+.Lopen_avx2_tail_256:
+	vmovdqa	.Lchacha20_consts(%rip),%ymm0
+	vmovdqa	0+64(%rbp),%ymm4
+	vmovdqa	0+96(%rbp),%ymm8
+	vmovdqa	%ymm0,%ymm1
+	vmovdqa	%ymm4,%ymm5
+	vmovdqa	%ymm8,%ymm9
+	vmovdqa	.Lavx2_inc(%rip),%ymm12
+	vpaddd	0+160(%rbp),%ymm12,%ymm13
+	vpaddd	%ymm13,%ymm12,%ymm12
+	vmovdqa	%ymm12,0+160(%rbp)
+	vmovdqa	%ymm13,0+192(%rbp)
+
+	movq	%rbx,0+128(%rbp)
+	movq	%rbx,%rcx
+	subq	$128,%rcx
+	shrq	$4,%rcx
+	movq	$10,%r8
+	cmpq	$10,%rcx
+	cmovgq	%r8,%rcx
+	movq	%rsi,%rbx
+	xorq	%r8,%r8
+.Lopen_avx2_tail_256_rounds_and_x1hash:
+	addq	0+0(%rbx),%r10
+	adcq	8+0(%rbx),%r11
+	adcq	$1,%r12
+	movq	0+0+0(%rbp),%rdx
+	movq	%rdx,%r15
+	mulxq	%r10,%r13,%r14
+	mulxq	%r11,%rax,%rdx
+	imulq	%r12,%r15
+	addq	%rax,%r14
+	adcq	%rdx,%r15
+	movq	8+0+0(%rbp),%rdx
+	mulxq	%r10,%r10,%rax
+	addq	%r10,%r14
+	mulxq	%r11,%r11,%r9
+	adcq	%r11,%r15
+	adcq	$0,%r9
+	imulq	%r12,%rdx
+	addq	%rax,%r15
+	adcq	%rdx,%r9
+	movq	%r13,%r10
+	movq	%r14,%r11
+	movq	%r15,%r12
+	andq	$3,%r12
+	movq	%r15,%r13
+	andq	$-4,%r13
+	movq	%r9,%r14
+	shrdq	$2,%r9,%r15
+	shrq	$2,%r9
+	addq	%r13,%r15
+	adcq	%r14,%r9
+	addq	%r15,%r10
+	adcq	%r9,%r11
+	adcq	$0,%r12
+
+	leaq	16(%rbx),%rbx
+.Lopen_avx2_tail_256_rounds:
+	vpaddd	%ymm4,%ymm0,%ymm0
+	vpxor	%ymm0,%ymm12,%ymm12
+	vpshufb	.Lrol16(%rip),%ymm12,%ymm12
+	vpaddd	%ymm12,%ymm8,%ymm8
+	vpxor	%ymm8,%ymm4,%ymm4
+	vpsrld	$20,%ymm4,%ymm3
+	vpslld	$12,%ymm4,%ymm4
+	vpxor	%ymm3,%ymm4,%ymm4
+	vpaddd	%ymm4,%ymm0,%ymm0
+	vpxor	%ymm0,%ymm12,%ymm12
+	vpshufb	.Lrol8(%rip),%ymm12,%ymm12
+	vpaddd	%ymm12,%ymm8,%ymm8
+	vpxor	%ymm8,%ymm4,%ymm4
+	vpslld	$7,%ymm4,%ymm3
+	vpsrld	$25,%ymm4,%ymm4
+	vpxor	%ymm3,%ymm4,%ymm4
+	vpalignr	$12,%ymm12,%ymm12,%ymm12
+	vpalignr	$8,%ymm8,%ymm8,%ymm8
+	vpalignr	$4,%ymm4,%ymm4,%ymm4
+	vpaddd	%ymm5,%ymm1,%ymm1
+	vpxor	%ymm1,%ymm13,%ymm13
+	vpshufb	.Lrol16(%rip),%ymm13,%ymm13
+	vpaddd	%ymm13,%ymm9,%ymm9
+	vpxor	%ymm9,%ymm5,%ymm5
+	vpsrld	$20,%ymm5,%ymm3
+	vpslld	$12,%ymm5,%ymm5
+	vpxor	%ymm3,%ymm5,%ymm5
+	vpaddd	%ymm5,%ymm1,%ymm1
+	vpxor	%ymm1,%ymm13,%ymm13
+	vpshufb	.Lrol8(%rip),%ymm13,%ymm13
+	vpaddd	%ymm13,%ymm9,%ymm9
+	vpxor	%ymm9,%ymm5,%ymm5
+	vpslld	$7,%ymm5,%ymm3
+	vpsrld	$25,%ymm5,%ymm5
+	vpxor	%ymm3,%ymm5,%ymm5
+	vpalignr	$12,%ymm13,%ymm13,%ymm13
+	vpalignr	$8,%ymm9,%ymm9,%ymm9
+	vpalignr	$4,%ymm5,%ymm5,%ymm5
+
+	incq	%r8
+	vpaddd	%ymm4,%ymm0,%ymm0
+	vpxor	%ymm0,%ymm12,%ymm12
+	vpshufb	.Lrol16(%rip),%ymm12,%ymm12
+	vpaddd	%ymm12,%ymm8,%ymm8
+	vpxor	%ymm8,%ymm4,%ymm4
+	vpsrld	$20,%ymm4,%ymm3
+	vpslld	$12,%ymm4,%ymm4
+	vpxor	%ymm3,%ymm4,%ymm4
+	vpaddd	%ymm4,%ymm0,%ymm0
+	vpxor	%ymm0,%ymm12,%ymm12
+	vpshufb	.Lrol8(%rip),%ymm12,%ymm12
+	vpaddd	%ymm12,%ymm8,%ymm8
+	vpxor	%ymm8,%ymm4,%ymm4
+	vpslld	$7,%ymm4,%ymm3
+	vpsrld	$25,%ymm4,%ymm4
+	vpxor	%ymm3,%ymm4,%ymm4
+	vpalignr	$4,%ymm12,%ymm12,%ymm12
+	vpalignr	$8,%ymm8,%ymm8,%ymm8
+	vpalignr	$12,%ymm4,%ymm4,%ymm4
+	vpaddd	%ymm5,%ymm1,%ymm1
+	vpxor	%ymm1,%ymm13,%ymm13
+	vpshufb	.Lrol16(%rip),%ymm13,%ymm13
+	vpaddd	%ymm13,%ymm9,%ymm9
+	vpxor	%ymm9,%ymm5,%ymm5
+	vpsrld	$20,%ymm5,%ymm3
+	vpslld	$12,%ymm5,%ymm5
+	vpxor	%ymm3,%ymm5,%ymm5
+	vpaddd	%ymm5,%ymm1,%ymm1
+	vpxor	%ymm1,%ymm13,%ymm13
+	vpshufb	.Lrol8(%rip),%ymm13,%ymm13
+	vpaddd	%ymm13,%ymm9,%ymm9
+	vpxor	%ymm9,%ymm5,%ymm5
+	vpslld	$7,%ymm5,%ymm3
+	vpsrld	$25,%ymm5,%ymm5
+	vpxor	%ymm3,%ymm5,%ymm5
+	vpalignr	$4,%ymm13,%ymm13,%ymm13
+	vpalignr	$8,%ymm9,%ymm9,%ymm9
+	vpalignr	$12,%ymm5,%ymm5,%ymm5
+	vpaddd	%ymm6,%ymm2,%ymm2
+	vpxor	%ymm2,%ymm14,%ymm14
+	vpshufb	.Lrol16(%rip),%ymm14,%ymm14
+	vpaddd	%ymm14,%ymm10,%ymm10
+	vpxor	%ymm10,%ymm6,%ymm6
+	vpsrld	$20,%ymm6,%ymm3
+	vpslld	$12,%ymm6,%ymm6
+	vpxor	%ymm3,%ymm6,%ymm6
+	vpaddd	%ymm6,%ymm2,%ymm2
+	vpxor	%ymm2,%ymm14,%ymm14
+	vpshufb	.Lrol8(%rip),%ymm14,%ymm14
+	vpaddd	%ymm14,%ymm10,%ymm10
+	vpxor	%ymm10,%ymm6,%ymm6
+	vpslld	$7,%ymm6,%ymm3
+	vpsrld	$25,%ymm6,%ymm6
+	vpxor	%ymm3,%ymm6,%ymm6
+	vpalignr	$4,%ymm14,%ymm14,%ymm14
+	vpalignr	$8,%ymm10,%ymm10,%ymm10
+	vpalignr	$12,%ymm6,%ymm6,%ymm6
+
+	cmpq	%rcx,%r8
+	jb	.Lopen_avx2_tail_256_rounds_and_x1hash
+	cmpq	$10,%r8
+	jne	.Lopen_avx2_tail_256_rounds
+	movq	%rbx,%r8
+	subq	%rsi,%rbx
+	movq	%rbx,%rcx
+	movq	0+128(%rbp),%rbx
+.Lopen_avx2_tail_256_hash:
+	addq	$16,%rcx
+	cmpq	%rbx,%rcx
+	jg	.Lopen_avx2_tail_256_done
+	addq	0+0(%r8),%r10
+	adcq	8+0(%r8),%r11
+	adcq	$1,%r12
+	movq	0+0+0(%rbp),%rdx
+	movq	%rdx,%r15
+	mulxq	%r10,%r13,%r14
+	mulxq	%r11,%rax,%rdx
+	imulq	%r12,%r15
+	addq	%rax,%r14
+	adcq	%rdx,%r15
+	movq	8+0+0(%rbp),%rdx
+	mulxq	%r10,%r10,%rax
+	addq	%r10,%r14
+	mulxq	%r11,%r11,%r9
+	adcq	%r11,%r15
+	adcq	$0,%r9
+	imulq	%r12,%rdx
+	addq	%rax,%r15
+	adcq	%rdx,%r9
+	movq	%r13,%r10
+	movq	%r14,%r11
+	movq	%r15,%r12
+	andq	$3,%r12
+	movq	%r15,%r13
+	andq	$-4,%r13
+	movq	%r9,%r14
+	shrdq	$2,%r9,%r15
+	shrq	$2,%r9
+	addq	%r13,%r15
+	adcq	%r14,%r9
+	addq	%r15,%r10
+	adcq	%r9,%r11
+	adcq	$0,%r12
+
+	leaq	16(%r8),%r8
+	jmp	.Lopen_avx2_tail_256_hash
+.Lopen_avx2_tail_256_done:
+	vpaddd	.Lchacha20_consts(%rip),%ymm1,%ymm1
+	vpaddd	0+64(%rbp),%ymm5,%ymm5
+	vpaddd	0+96(%rbp),%ymm9,%ymm9
+	vpaddd	0+192(%rbp),%ymm13,%ymm13
+	vpaddd	.Lchacha20_consts(%rip),%ymm0,%ymm0
+	vpaddd	0+64(%rbp),%ymm4,%ymm4
+	vpaddd	0+96(%rbp),%ymm8,%ymm8
+	vpaddd	0+160(%rbp),%ymm12,%ymm12
+	vperm2i128	$0x02,%ymm1,%ymm5,%ymm3
+	vperm2i128	$0x13,%ymm1,%ymm5,%ymm5
+	vperm2i128	$0x02,%ymm9,%ymm13,%ymm1
+	vperm2i128	$0x13,%ymm9,%ymm13,%ymm9
+	vpxor	0+0(%rsi),%ymm3,%ymm3
+	vpxor	32+0(%rsi),%ymm1,%ymm1
+	vpxor	64+0(%rsi),%ymm5,%ymm5
+	vpxor	96+0(%rsi),%ymm9,%ymm9
+	vmovdqu	%ymm3,0+0(%rdi)
+	vmovdqu	%ymm1,32+0(%rdi)
+	vmovdqu	%ymm5,64+0(%rdi)
+	vmovdqu	%ymm9,96+0(%rdi)
+	vperm2i128	$0x13,%ymm0,%ymm4,%ymm3
+	vperm2i128	$0x02,%ymm0,%ymm4,%ymm0
+	vperm2i128	$0x02,%ymm8,%ymm12,%ymm4
+	vperm2i128	$0x13,%ymm8,%ymm12,%ymm12
+	vmovdqa	%ymm3,%ymm8
+
+	leaq	128(%rsi),%rsi
+	leaq	128(%rdi),%rdi
+	subq	$128,%rbx
+	jmp	.Lopen_avx2_tail_128_xor
+
+.Lopen_avx2_tail_384:
+	vmovdqa	.Lchacha20_consts(%rip),%ymm0
+	vmovdqa	0+64(%rbp),%ymm4
+	vmovdqa	0+96(%rbp),%ymm8
+	vmovdqa	%ymm0,%ymm1
+	vmovdqa	%ymm4,%ymm5
+	vmovdqa	%ymm8,%ymm9
+	vmovdqa	%ymm0,%ymm2
+	vmovdqa	%ymm4,%ymm6
+	vmovdqa	%ymm8,%ymm10
+	vmovdqa	.Lavx2_inc(%rip),%ymm12
+	vpaddd	0+160(%rbp),%ymm12,%ymm14
+	vpaddd	%ymm14,%ymm12,%ymm13
+	vpaddd	%ymm13,%ymm12,%ymm12
+	vmovdqa	%ymm12,0+160(%rbp)
+	vmovdqa	%ymm13,0+192(%rbp)
+	vmovdqa	%ymm14,0+224(%rbp)
+
+	movq	%rbx,0+128(%rbp)
+	movq	%rbx,%rcx
+	subq	$256,%rcx
+	shrq	$4,%rcx
+	addq	$6,%rcx
+	movq	$10,%r8
+	cmpq	$10,%rcx
+	cmovgq	%r8,%rcx
+	movq	%rsi,%rbx
+	xorq	%r8,%r8
+.Lopen_avx2_tail_384_rounds_and_x2hash:
+	addq	0+0(%rbx),%r10
+	adcq	8+0(%rbx),%r11
+	adcq	$1,%r12
+	movq	0+0+0(%rbp),%rdx
+	movq	%rdx,%r15
+	mulxq	%r10,%r13,%r14
+	mulxq	%r11,%rax,%rdx
+	imulq	%r12,%r15
+	addq	%rax,%r14
+	adcq	%rdx,%r15
+	movq	8+0+0(%rbp),%rdx
+	mulxq	%r10,%r10,%rax
+	addq	%r10,%r14
+	mulxq	%r11,%r11,%r9
+	adcq	%r11,%r15
+	adcq	$0,%r9
+	imulq	%r12,%rdx
+	addq	%rax,%r15
+	adcq	%rdx,%r9
+	movq	%r13,%r10
+	movq	%r14,%r11
+	movq	%r15,%r12
+	andq	$3,%r12
+	movq	%r15,%r13
+	andq	$-4,%r13
+	movq	%r9,%r14
+	shrdq	$2,%r9,%r15
+	shrq	$2,%r9
+	addq	%r13,%r15
+	adcq	%r14,%r9
+	addq	%r15,%r10
+	adcq	%r9,%r11
+	adcq	$0,%r12
+
+	leaq	16(%rbx),%rbx
+.Lopen_avx2_tail_384_rounds_and_x1hash:
+	vpaddd	%ymm6,%ymm2,%ymm2
+	vpxor	%ymm2,%ymm14,%ymm14
+	vpshufb	.Lrol16(%rip),%ymm14,%ymm14
+	vpaddd	%ymm14,%ymm10,%ymm10
+	vpxor	%ymm10,%ymm6,%ymm6
+	vpsrld	$20,%ymm6,%ymm3
+	vpslld	$12,%ymm6,%ymm6
+	vpxor	%ymm3,%ymm6,%ymm6
+	vpaddd	%ymm6,%ymm2,%ymm2
+	vpxor	%ymm2,%ymm14,%ymm14
+	vpshufb	.Lrol8(%rip),%ymm14,%ymm14
+	vpaddd	%ymm14,%ymm10,%ymm10
+	vpxor	%ymm10,%ymm6,%ymm6
+	vpslld	$7,%ymm6,%ymm3
+	vpsrld	$25,%ymm6,%ymm6
+	vpxor	%ymm3,%ymm6,%ymm6
+	vpalignr	$12,%ymm14,%ymm14,%ymm14
+	vpalignr	$8,%ymm10,%ymm10,%ymm10
+	vpalignr	$4,%ymm6,%ymm6,%ymm6
+	vpaddd	%ymm5,%ymm1,%ymm1
+	vpxor	%ymm1,%ymm13,%ymm13
+	vpshufb	.Lrol16(%rip),%ymm13,%ymm13
+	vpaddd	%ymm13,%ymm9,%ymm9
+	vpxor	%ymm9,%ymm5,%ymm5
+	vpsrld	$20,%ymm5,%ymm3
+	vpslld	$12,%ymm5,%ymm5
+	vpxor	%ymm3,%ymm5,%ymm5
+	vpaddd	%ymm5,%ymm1,%ymm1
+	vpxor	%ymm1,%ymm13,%ymm13
+	vpshufb	.Lrol8(%rip),%ymm13,%ymm13
+	vpaddd	%ymm13,%ymm9,%ymm9
+	vpxor	%ymm9,%ymm5,%ymm5
+	vpslld	$7,%ymm5,%ymm3
+	vpsrld	$25,%ymm5,%ymm5
+	vpxor	%ymm3,%ymm5,%ymm5
+	vpalignr	$12,%ymm13,%ymm13,%ymm13
+	vpalignr	$8,%ymm9,%ymm9,%ymm9
+	vpalignr	$4,%ymm5,%ymm5,%ymm5
+	vpaddd	%ymm4,%ymm0,%ymm0
+	vpxor	%ymm0,%ymm12,%ymm12
+	vpshufb	.Lrol16(%rip),%ymm12,%ymm12
+	vpaddd	%ymm12,%ymm8,%ymm8
+	vpxor	%ymm8,%ymm4,%ymm4
+	vpsrld	$20,%ymm4,%ymm3
+	vpslld	$12,%ymm4,%ymm4
+	vpxor	%ymm3,%ymm4,%ymm4
+	vpaddd	%ymm4,%ymm0,%ymm0
+	vpxor	%ymm0,%ymm12,%ymm12
+	vpshufb	.Lrol8(%rip),%ymm12,%ymm12
+	vpaddd	%ymm12,%ymm8,%ymm8
+	vpxor	%ymm8,%ymm4,%ymm4
+	vpslld	$7,%ymm4,%ymm3
+	vpsrld	$25,%ymm4,%ymm4
+	vpxor	%ymm3,%ymm4,%ymm4
+	vpalignr	$12,%ymm12,%ymm12,%ymm12
+	vpalignr	$8,%ymm8,%ymm8,%ymm8
+	vpalignr	$4,%ymm4,%ymm4,%ymm4
+	addq	0+0(%rbx),%r10
+	adcq	8+0(%rbx),%r11
+	adcq	$1,%r12
+	movq	0+0+0(%rbp),%rax
+	movq	%rax,%r15
+	mulq	%r10
+	movq	%rax,%r13
+	movq	%rdx,%r14
+	movq	0+0+0(%rbp),%rax
+	mulq	%r11
+	imulq	%r12,%r15
+	addq	%rax,%r14
+	adcq	%rdx,%r15
+	movq	8+0+0(%rbp),%rax
+	movq	%rax,%r9
+	mulq	%r10
+	addq	%rax,%r14
+	adcq	$0,%rdx
+	movq	%rdx,%r10
+	movq	8+0+0(%rbp),%rax
+	mulq	%r11
+	addq	%rax,%r15
+	adcq	$0,%rdx
+	imulq	%r12,%r9
+	addq	%r10,%r15
+	adcq	%rdx,%r9
+	movq	%r13,%r10
+	movq	%r14,%r11
+	movq	%r15,%r12
+	andq	$3,%r12
+	movq	%r15,%r13
+	andq	$-4,%r13
+	movq	%r9,%r14
+	shrdq	$2,%r9,%r15
+	shrq	$2,%r9
+	addq	%r13,%r15
+	adcq	%r14,%r9
+	addq	%r15,%r10
+	adcq	%r9,%r11
+	adcq	$0,%r12
+
+	leaq	16(%rbx),%rbx
+	incq	%r8
+	vpaddd	%ymm6,%ymm2,%ymm2
+	vpxor	%ymm2,%ymm14,%ymm14
+	vpshufb	.Lrol16(%rip),%ymm14,%ymm14
+	vpaddd	%ymm14,%ymm10,%ymm10
+	vpxor	%ymm10,%ymm6,%ymm6
+	vpsrld	$20,%ymm6,%ymm3
+	vpslld	$12,%ymm6,%ymm6
+	vpxor	%ymm3,%ymm6,%ymm6
+	vpaddd	%ymm6,%ymm2,%ymm2
+	vpxor	%ymm2,%ymm14,%ymm14
+	vpshufb	.Lrol8(%rip),%ymm14,%ymm14
+	vpaddd	%ymm14,%ymm10,%ymm10
+	vpxor	%ymm10,%ymm6,%ymm6
+	vpslld	$7,%ymm6,%ymm3
+	vpsrld	$25,%ymm6,%ymm6
+	vpxor	%ymm3,%ymm6,%ymm6
+	vpalignr	$4,%ymm14,%ymm14,%ymm14
+	vpalignr	$8,%ymm10,%ymm10,%ymm10
+	vpalignr	$12,%ymm6,%ymm6,%ymm6
+	vpaddd	%ymm5,%ymm1,%ymm1
+	vpxor	%ymm1,%ymm13,%ymm13
+	vpshufb	.Lrol16(%rip),%ymm13,%ymm13
+	vpaddd	%ymm13,%ymm9,%ymm9
+	vpxor	%ymm9,%ymm5,%ymm5
+	vpsrld	$20,%ymm5,%ymm3
+	vpslld	$12,%ymm5,%ymm5
+	vpxor	%ymm3,%ymm5,%ymm5
+	vpaddd	%ymm5,%ymm1,%ymm1
+	vpxor	%ymm1,%ymm13,%ymm13
+	vpshufb	.Lrol8(%rip),%ymm13,%ymm13
+	vpaddd	%ymm13,%ymm9,%ymm9
+	vpxor	%ymm9,%ymm5,%ymm5
+	vpslld	$7,%ymm5,%ymm3
+	vpsrld	$25,%ymm5,%ymm5
+	vpxor	%ymm3,%ymm5,%ymm5
+	vpalignr	$4,%ymm13,%ymm13,%ymm13
+	vpalignr	$8,%ymm9,%ymm9,%ymm9
+	vpalignr	$12,%ymm5,%ymm5,%ymm5
+	vpaddd	%ymm4,%ymm0,%ymm0
+	vpxor	%ymm0,%ymm12,%ymm12
+	vpshufb	.Lrol16(%rip),%ymm12,%ymm12
+	vpaddd	%ymm12,%ymm8,%ymm8
+	vpxor	%ymm8,%ymm4,%ymm4
+	vpsrld	$20,%ymm4,%ymm3
+	vpslld	$12,%ymm4,%ymm4
+	vpxor	%ymm3,%ymm4,%ymm4
+	vpaddd	%ymm4,%ymm0,%ymm0
+	vpxor	%ymm0,%ymm12,%ymm12
+	vpshufb	.Lrol8(%rip),%ymm12,%ymm12
+	vpaddd	%ymm12,%ymm8,%ymm8
+	vpxor	%ymm8,%ymm4,%ymm4
+	vpslld	$7,%ymm4,%ymm3
+	vpsrld	$25,%ymm4,%ymm4
+	vpxor	%ymm3,%ymm4,%ymm4
+	vpalignr	$4,%ymm12,%ymm12,%ymm12
+	vpalignr	$8,%ymm8,%ymm8,%ymm8
+	vpalignr	$12,%ymm4,%ymm4,%ymm4
+
+	cmpq	%rcx,%r8
+	jb	.Lopen_avx2_tail_384_rounds_and_x2hash
+	cmpq	$10,%r8
+	jne	.Lopen_avx2_tail_384_rounds_and_x1hash
+	movq	%rbx,%r8
+	subq	%rsi,%rbx
+	movq	%rbx,%rcx
+	movq	0+128(%rbp),%rbx
+.Lopen_avx2_384_tail_hash:
+	addq	$16,%rcx
+	cmpq	%rbx,%rcx
+	jg	.Lopen_avx2_384_tail_done
+	addq	0+0(%r8),%r10
+	adcq	8+0(%r8),%r11
+	adcq	$1,%r12
+	movq	0+0+0(%rbp),%rdx
+	movq	%rdx,%r15
+	mulxq	%r10,%r13,%r14
+	mulxq	%r11,%rax,%rdx
+	imulq	%r12,%r15
+	addq	%rax,%r14
+	adcq	%rdx,%r15
+	movq	8+0+0(%rbp),%rdx
+	mulxq	%r10,%r10,%rax
+	addq	%r10,%r14
+	mulxq	%r11,%r11,%r9
+	adcq	%r11,%r15
+	adcq	$0,%r9
+	imulq	%r12,%rdx
+	addq	%rax,%r15
+	adcq	%rdx,%r9
+	movq	%r13,%r10
+	movq	%r14,%r11
+	movq	%r15,%r12
+	andq	$3,%r12
+	movq	%r15,%r13
+	andq	$-4,%r13
+	movq	%r9,%r14
+	shrdq	$2,%r9,%r15
+	shrq	$2,%r9
+	addq	%r13,%r15
+	adcq	%r14,%r9
+	addq	%r15,%r10
+	adcq	%r9,%r11
+	adcq	$0,%r12
+
+	leaq	16(%r8),%r8
+	jmp	.Lopen_avx2_384_tail_hash
+.Lopen_avx2_384_tail_done:
+	vpaddd	.Lchacha20_consts(%rip),%ymm2,%ymm2
+	vpaddd	0+64(%rbp),%ymm6,%ymm6
+	vpaddd	0+96(%rbp),%ymm10,%ymm10
+	vpaddd	0+224(%rbp),%ymm14,%ymm14
+	vpaddd	.Lchacha20_consts(%rip),%ymm1,%ymm1
+	vpaddd	0+64(%rbp),%ymm5,%ymm5
+	vpaddd	0+96(%rbp),%ymm9,%ymm9
+	vpaddd	0+192(%rbp),%ymm13,%ymm13
+	vpaddd	.Lchacha20_consts(%rip),%ymm0,%ymm0
+	vpaddd	0+64(%rbp),%ymm4,%ymm4
+	vpaddd	0+96(%rbp),%ymm8,%ymm8
+	vpaddd	0+160(%rbp),%ymm12,%ymm12
+	vperm2i128	$0x02,%ymm2,%ymm6,%ymm3
+	vperm2i128	$0x13,%ymm2,%ymm6,%ymm6
+	vperm2i128	$0x02,%ymm10,%ymm14,%ymm2
+	vperm2i128	$0x13,%ymm10,%ymm14,%ymm10
+	vpxor	0+0(%rsi),%ymm3,%ymm3
+	vpxor	32+0(%rsi),%ymm2,%ymm2
+	vpxor	64+0(%rsi),%ymm6,%ymm6
+	vpxor	96+0(%rsi),%ymm10,%ymm10
+	vmovdqu	%ymm3,0+0(%rdi)
+	vmovdqu	%ymm2,32+0(%rdi)
+	vmovdqu	%ymm6,64+0(%rdi)
+	vmovdqu	%ymm10,96+0(%rdi)
+	vperm2i128	$0x02,%ymm1,%ymm5,%ymm3
+	vperm2i128	$0x13,%ymm1,%ymm5,%ymm5
+	vperm2i128	$0x02,%ymm9,%ymm13,%ymm1
+	vperm2i128	$0x13,%ymm9,%ymm13,%ymm9
+	vpxor	0+128(%rsi),%ymm3,%ymm3
+	vpxor	32+128(%rsi),%ymm1,%ymm1
+	vpxor	64+128(%rsi),%ymm5,%ymm5
+	vpxor	96+128(%rsi),%ymm9,%ymm9
+	vmovdqu	%ymm3,0+128(%rdi)
+	vmovdqu	%ymm1,32+128(%rdi)
+	vmovdqu	%ymm5,64+128(%rdi)
+	vmovdqu	%ymm9,96+128(%rdi)
+	vperm2i128	$0x13,%ymm0,%ymm4,%ymm3
+	vperm2i128	$0x02,%ymm0,%ymm4,%ymm0
+	vperm2i128	$0x02,%ymm8,%ymm12,%ymm4
+	vperm2i128	$0x13,%ymm8,%ymm12,%ymm12
+	vmovdqa	%ymm3,%ymm8
+
+	leaq	256(%rsi),%rsi
+	leaq	256(%rdi),%rdi
+	subq	$256,%rbx
+	jmp	.Lopen_avx2_tail_128_xor
+
+.Lopen_avx2_tail_512:
+	vmovdqa	.Lchacha20_consts(%rip),%ymm0
+	vmovdqa	0+64(%rbp),%ymm4
+	vmovdqa	0+96(%rbp),%ymm8
+	vmovdqa	%ymm0,%ymm1
+	vmovdqa	%ymm4,%ymm5
+	vmovdqa	%ymm8,%ymm9
+	vmovdqa	%ymm0,%ymm2
+	vmovdqa	%ymm4,%ymm6
+	vmovdqa	%ymm8,%ymm10
+	vmovdqa	%ymm0,%ymm3
+	vmovdqa	%ymm4,%ymm7
+	vmovdqa	%ymm8,%ymm11
+	vmovdqa	.Lavx2_inc(%rip),%ymm12
+	vpaddd	0+160(%rbp),%ymm12,%ymm15
+	vpaddd	%ymm15,%ymm12,%ymm14
+	vpaddd	%ymm14,%ymm12,%ymm13
+	vpaddd	%ymm13,%ymm12,%ymm12
+	vmovdqa	%ymm15,0+256(%rbp)
+	vmovdqa	%ymm14,0+224(%rbp)
+	vmovdqa	%ymm13,0+192(%rbp)
+	vmovdqa	%ymm12,0+160(%rbp)
+
+	xorq	%rcx,%rcx
+	movq	%rsi,%r8
+.Lopen_avx2_tail_512_rounds_and_x2hash:
+	addq	0+0(%r8),%r10
+	adcq	8+0(%r8),%r11
+	adcq	$1,%r12
+	movq	0+0+0(%rbp),%rax
+	movq	%rax,%r15
+	mulq	%r10
+	movq	%rax,%r13
+	movq	%rdx,%r14
+	movq	0+0+0(%rbp),%rax
+	mulq	%r11
+	imulq	%r12,%r15
+	addq	%rax,%r14
+	adcq	%rdx,%r15
+	movq	8+0+0(%rbp),%rax
+	movq	%rax,%r9
+	mulq	%r10
+	addq	%rax,%r14
+	adcq	$0,%rdx
+	movq	%rdx,%r10
+	movq	8+0+0(%rbp),%rax
+	mulq	%r11
+	addq	%rax,%r15
+	adcq	$0,%rdx
+	imulq	%r12,%r9
+	addq	%r10,%r15
+	adcq	%rdx,%r9
+	movq	%r13,%r10
+	movq	%r14,%r11
+	movq	%r15,%r12
+	andq	$3,%r12
+	movq	%r15,%r13
+	andq	$-4,%r13
+	movq	%r9,%r14
+	shrdq	$2,%r9,%r15
+	shrq	$2,%r9
+	addq	%r13,%r15
+	adcq	%r14,%r9
+	addq	%r15,%r10
+	adcq	%r9,%r11
+	adcq	$0,%r12
+
+	leaq	16(%r8),%r8
+.Lopen_avx2_tail_512_rounds_and_x1hash:
+	vmovdqa	%ymm8,0+128(%rbp)
+	vmovdqa	.Lrol16(%rip),%ymm8
+	vpaddd	%ymm7,%ymm3,%ymm3
+	vpaddd	%ymm6,%ymm2,%ymm2
+	vpaddd	%ymm5,%ymm1,%ymm1
+	vpaddd	%ymm4,%ymm0,%ymm0
+	vpxor	%ymm3,%ymm15,%ymm15
+	vpxor	%ymm2,%ymm14,%ymm14
+	vpxor	%ymm1,%ymm13,%ymm13
+	vpxor	%ymm0,%ymm12,%ymm12
+	vpshufb	%ymm8,%ymm15,%ymm15
+	vpshufb	%ymm8,%ymm14,%ymm14
+	vpshufb	%ymm8,%ymm13,%ymm13
+	vpshufb	%ymm8,%ymm12,%ymm12
+	vpaddd	%ymm15,%ymm11,%ymm11
+	vpaddd	%ymm14,%ymm10,%ymm10
+	vpaddd	%ymm13,%ymm9,%ymm9
+	vpaddd	0+128(%rbp),%ymm12,%ymm8
+	vpxor	%ymm11,%ymm7,%ymm7
+	vpxor	%ymm10,%ymm6,%ymm6
+	vpxor	%ymm9,%ymm5,%ymm5
+	vpxor	%ymm8,%ymm4,%ymm4
+	vmovdqa	%ymm8,0+128(%rbp)
+	vpsrld	$20,%ymm7,%ymm8
+	vpslld	$32-20,%ymm7,%ymm7
+	vpxor	%ymm8,%ymm7,%ymm7
+	vpsrld	$20,%ymm6,%ymm8
+	vpslld	$32-20,%ymm6,%ymm6
+	vpxor	%ymm8,%ymm6,%ymm6
+	vpsrld	$20,%ymm5,%ymm8
+	vpslld	$32-20,%ymm5,%ymm5
+	vpxor	%ymm8,%ymm5,%ymm5
+	vpsrld	$20,%ymm4,%ymm8
+	vpslld	$32-20,%ymm4,%ymm4
+	vpxor	%ymm8,%ymm4,%ymm4
+	vmovdqa	.Lrol8(%rip),%ymm8
+	vpaddd	%ymm7,%ymm3,%ymm3
+	addq	0+0(%r8),%r10
+	adcq	8+0(%r8),%r11
+	adcq	$1,%r12
+	movq	0+0+0(%rbp),%rdx
+	movq	%rdx,%r15
+	mulxq	%r10,%r13,%r14
+	mulxq	%r11,%rax,%rdx
+	imulq	%r12,%r15
+	addq	%rax,%r14
+	adcq	%rdx,%r15
+	movq	8+0+0(%rbp),%rdx
+	mulxq	%r10,%r10,%rax
+	addq	%r10,%r14
+	mulxq	%r11,%r11,%r9
+	adcq	%r11,%r15
+	adcq	$0,%r9
+	imulq	%r12,%rdx
+	addq	%rax,%r15
+	adcq	%rdx,%r9
+	movq	%r13,%r10
+	movq	%r14,%r11
+	movq	%r15,%r12
+	andq	$3,%r12
+	movq	%r15,%r13
+	andq	$-4,%r13
+	movq	%r9,%r14
+	shrdq	$2,%r9,%r15
+	shrq	$2,%r9
+	addq	%r13,%r15
+	adcq	%r14,%r9
+	addq	%r15,%r10
+	adcq	%r9,%r11
+	adcq	$0,%r12
+	vpaddd	%ymm6,%ymm2,%ymm2
+	vpaddd	%ymm5,%ymm1,%ymm1
+	vpaddd	%ymm4,%ymm0,%ymm0
+	vpxor	%ymm3,%ymm15,%ymm15
+	vpxor	%ymm2,%ymm14,%ymm14
+	vpxor	%ymm1,%ymm13,%ymm13
+	vpxor	%ymm0,%ymm12,%ymm12
+	vpshufb	%ymm8,%ymm15,%ymm15
+	vpshufb	%ymm8,%ymm14,%ymm14
+	vpshufb	%ymm8,%ymm13,%ymm13
+	vpshufb	%ymm8,%ymm12,%ymm12
+	vpaddd	%ymm15,%ymm11,%ymm11
+	vpaddd	%ymm14,%ymm10,%ymm10
+	vpaddd	%ymm13,%ymm9,%ymm9
+	vpaddd	0+128(%rbp),%ymm12,%ymm8
+	vpxor	%ymm11,%ymm7,%ymm7
+	vpxor	%ymm10,%ymm6,%ymm6
+	vpxor	%ymm9,%ymm5,%ymm5
+	vpxor	%ymm8,%ymm4,%ymm4
+	vmovdqa	%ymm8,0+128(%rbp)
+	vpsrld	$25,%ymm7,%ymm8
+	vpslld	$32-25,%ymm7,%ymm7
+	vpxor	%ymm8,%ymm7,%ymm7
+	vpsrld	$25,%ymm6,%ymm8
+	vpslld	$32-25,%ymm6,%ymm6
+	vpxor	%ymm8,%ymm6,%ymm6
+	vpsrld	$25,%ymm5,%ymm8
+	vpslld	$32-25,%ymm5,%ymm5
+	vpxor	%ymm8,%ymm5,%ymm5
+	vpsrld	$25,%ymm4,%ymm8
+	vpslld	$32-25,%ymm4,%ymm4
+	vpxor	%ymm8,%ymm4,%ymm4
+	vmovdqa	0+128(%rbp),%ymm8
+	vpalignr	$4,%ymm7,%ymm7,%ymm7
+	vpalignr	$8,%ymm11,%ymm11,%ymm11
+	vpalignr	$12,%ymm15,%ymm15,%ymm15
+	vpalignr	$4,%ymm6,%ymm6,%ymm6
+	vpalignr	$8,%ymm10,%ymm10,%ymm10
+	vpalignr	$12,%ymm14,%ymm14,%ymm14
+	vpalignr	$4,%ymm5,%ymm5,%ymm5
+	vpalignr	$8,%ymm9,%ymm9,%ymm9
+	vpalignr	$12,%ymm13,%ymm13,%ymm13
+	vpalignr	$4,%ymm4,%ymm4,%ymm4
+	vpalignr	$8,%ymm8,%ymm8,%ymm8
+	vpalignr	$12,%ymm12,%ymm12,%ymm12
+	vmovdqa	%ymm8,0+128(%rbp)
+	vmovdqa	.Lrol16(%rip),%ymm8
+	vpaddd	%ymm7,%ymm3,%ymm3
+	addq	0+16(%r8),%r10
+	adcq	8+16(%r8),%r11
+	adcq	$1,%r12
+	movq	0+0+0(%rbp),%rdx
+	movq	%rdx,%r15
+	mulxq	%r10,%r13,%r14
+	mulxq	%r11,%rax,%rdx
+	imulq	%r12,%r15
+	addq	%rax,%r14
+	adcq	%rdx,%r15
+	movq	8+0+0(%rbp),%rdx
+	mulxq	%r10,%r10,%rax
+	addq	%r10,%r14
+	mulxq	%r11,%r11,%r9
+	adcq	%r11,%r15
+	adcq	$0,%r9
+	imulq	%r12,%rdx
+	addq	%rax,%r15
+	adcq	%rdx,%r9
+	movq	%r13,%r10
+	movq	%r14,%r11
+	movq	%r15,%r12
+	andq	$3,%r12
+	movq	%r15,%r13
+	andq	$-4,%r13
+	movq	%r9,%r14
+	shrdq	$2,%r9,%r15
+	shrq	$2,%r9
+	addq	%r13,%r15
+	adcq	%r14,%r9
+	addq	%r15,%r10
+	adcq	%r9,%r11
+	adcq	$0,%r12
+
+	leaq	32(%r8),%r8
+	vpaddd	%ymm6,%ymm2,%ymm2
+	vpaddd	%ymm5,%ymm1,%ymm1
+	vpaddd	%ymm4,%ymm0,%ymm0
+	vpxor	%ymm3,%ymm15,%ymm15
+	vpxor	%ymm2,%ymm14,%ymm14
+	vpxor	%ymm1,%ymm13,%ymm13
+	vpxor	%ymm0,%ymm12,%ymm12
+	vpshufb	%ymm8,%ymm15,%ymm15
+	vpshufb	%ymm8,%ymm14,%ymm14
+	vpshufb	%ymm8,%ymm13,%ymm13
+	vpshufb	%ymm8,%ymm12,%ymm12
+	vpaddd	%ymm15,%ymm11,%ymm11
+	vpaddd	%ymm14,%ymm10,%ymm10
+	vpaddd	%ymm13,%ymm9,%ymm9
+	vpaddd	0+128(%rbp),%ymm12,%ymm8
+	vpxor	%ymm11,%ymm7,%ymm7
+	vpxor	%ymm10,%ymm6,%ymm6
+	vpxor	%ymm9,%ymm5,%ymm5
+	vpxor	%ymm8,%ymm4,%ymm4
+	vmovdqa	%ymm8,0+128(%rbp)
+	vpsrld	$20,%ymm7,%ymm8
+	vpslld	$32-20,%ymm7,%ymm7
+	vpxor	%ymm8,%ymm7,%ymm7
+	vpsrld	$20,%ymm6,%ymm8
+	vpslld	$32-20,%ymm6,%ymm6
+	vpxor	%ymm8,%ymm6,%ymm6
+	vpsrld	$20,%ymm5,%ymm8
+	vpslld	$32-20,%ymm5,%ymm5
+	vpxor	%ymm8,%ymm5,%ymm5
+	vpsrld	$20,%ymm4,%ymm8
+	vpslld	$32-20,%ymm4,%ymm4
+	vpxor	%ymm8,%ymm4,%ymm4
+	vmovdqa	.Lrol8(%rip),%ymm8
+	vpaddd	%ymm7,%ymm3,%ymm3
+	vpaddd	%ymm6,%ymm2,%ymm2
+	vpaddd	%ymm5,%ymm1,%ymm1
+	vpaddd	%ymm4,%ymm0,%ymm0
+	vpxor	%ymm3,%ymm15,%ymm15
+	vpxor	%ymm2,%ymm14,%ymm14
+	vpxor	%ymm1,%ymm13,%ymm13
+	vpxor	%ymm0,%ymm12,%ymm12
+	vpshufb	%ymm8,%ymm15,%ymm15
+	vpshufb	%ymm8,%ymm14,%ymm14
+	vpshufb	%ymm8,%ymm13,%ymm13
+	vpshufb	%ymm8,%ymm12,%ymm12
+	vpaddd	%ymm15,%ymm11,%ymm11
+	vpaddd	%ymm14,%ymm10,%ymm10
+	vpaddd	%ymm13,%ymm9,%ymm9
+	vpaddd	0+128(%rbp),%ymm12,%ymm8
+	vpxor	%ymm11,%ymm7,%ymm7
+	vpxor	%ymm10,%ymm6,%ymm6
+	vpxor	%ymm9,%ymm5,%ymm5
+	vpxor	%ymm8,%ymm4,%ymm4
+	vmovdqa	%ymm8,0+128(%rbp)
+	vpsrld	$25,%ymm7,%ymm8
+	vpslld	$32-25,%ymm7,%ymm7
+	vpxor	%ymm8,%ymm7,%ymm7
+	vpsrld	$25,%ymm6,%ymm8
+	vpslld	$32-25,%ymm6,%ymm6
+	vpxor	%ymm8,%ymm6,%ymm6
+	vpsrld	$25,%ymm5,%ymm8
+	vpslld	$32-25,%ymm5,%ymm5
+	vpxor	%ymm8,%ymm5,%ymm5
+	vpsrld	$25,%ymm4,%ymm8
+	vpslld	$32-25,%ymm4,%ymm4
+	vpxor	%ymm8,%ymm4,%ymm4
+	vmovdqa	0+128(%rbp),%ymm8
+	vpalignr	$12,%ymm7,%ymm7,%ymm7
+	vpalignr	$8,%ymm11,%ymm11,%ymm11
+	vpalignr	$4,%ymm15,%ymm15,%ymm15
+	vpalignr	$12,%ymm6,%ymm6,%ymm6
+	vpalignr	$8,%ymm10,%ymm10,%ymm10
+	vpalignr	$4,%ymm14,%ymm14,%ymm14
+	vpalignr	$12,%ymm5,%ymm5,%ymm5
+	vpalignr	$8,%ymm9,%ymm9,%ymm9
+	vpalignr	$4,%ymm13,%ymm13,%ymm13
+	vpalignr	$12,%ymm4,%ymm4,%ymm4
+	vpalignr	$8,%ymm8,%ymm8,%ymm8
+	vpalignr	$4,%ymm12,%ymm12,%ymm12
+
+	incq	%rcx
+	cmpq	$4,%rcx
+	jl	.Lopen_avx2_tail_512_rounds_and_x2hash
+	cmpq	$10,%rcx
+	jne	.Lopen_avx2_tail_512_rounds_and_x1hash
+	movq	%rbx,%rcx
+	subq	$384,%rcx
+	andq	$-16,%rcx
+.Lopen_avx2_tail_512_hash:
+	testq	%rcx,%rcx
+	je	.Lopen_avx2_tail_512_done
+	addq	0+0(%r8),%r10
+	adcq	8+0(%r8),%r11
+	adcq	$1,%r12
+	movq	0+0+0(%rbp),%rdx
+	movq	%rdx,%r15
+	mulxq	%r10,%r13,%r14
+	mulxq	%r11,%rax,%rdx
+	imulq	%r12,%r15
+	addq	%rax,%r14
+	adcq	%rdx,%r15
+	movq	8+0+0(%rbp),%rdx
+	mulxq	%r10,%r10,%rax
+	addq	%r10,%r14
+	mulxq	%r11,%r11,%r9
+	adcq	%r11,%r15
+	adcq	$0,%r9
+	imulq	%r12,%rdx
+	addq	%rax,%r15
+	adcq	%rdx,%r9
+	movq	%r13,%r10
+	movq	%r14,%r11
+	movq	%r15,%r12
+	andq	$3,%r12
+	movq	%r15,%r13
+	andq	$-4,%r13
+	movq	%r9,%r14
+	shrdq	$2,%r9,%r15
+	shrq	$2,%r9
+	addq	%r13,%r15
+	adcq	%r14,%r9
+	addq	%r15,%r10
+	adcq	%r9,%r11
+	adcq	$0,%r12
+
+	leaq	16(%r8),%r8
+	subq	$16,%rcx
+	jmp	.Lopen_avx2_tail_512_hash
+.Lopen_avx2_tail_512_done:
+	vpaddd	.Lchacha20_consts(%rip),%ymm3,%ymm3
+	vpaddd	0+64(%rbp),%ymm7,%ymm7
+	vpaddd	0+96(%rbp),%ymm11,%ymm11
+	vpaddd	0+256(%rbp),%ymm15,%ymm15
+	vpaddd	.Lchacha20_consts(%rip),%ymm2,%ymm2
+	vpaddd	0+64(%rbp),%ymm6,%ymm6
+	vpaddd	0+96(%rbp),%ymm10,%ymm10
+	vpaddd	0+224(%rbp),%ymm14,%ymm14
+	vpaddd	.Lchacha20_consts(%rip),%ymm1,%ymm1
+	vpaddd	0+64(%rbp),%ymm5,%ymm5
+	vpaddd	0+96(%rbp),%ymm9,%ymm9
+	vpaddd	0+192(%rbp),%ymm13,%ymm13
+	vpaddd	.Lchacha20_consts(%rip),%ymm0,%ymm0
+	vpaddd	0+64(%rbp),%ymm4,%ymm4
+	vpaddd	0+96(%rbp),%ymm8,%ymm8
+	vpaddd	0+160(%rbp),%ymm12,%ymm12
+
+	vmovdqa	%ymm0,0+128(%rbp)
+	vperm2i128	$0x02,%ymm3,%ymm7,%ymm0
+	vperm2i128	$0x13,%ymm3,%ymm7,%ymm7
+	vperm2i128	$0x02,%ymm11,%ymm15,%ymm3
+	vperm2i128	$0x13,%ymm11,%ymm15,%ymm11
+	vpxor	0+0(%rsi),%ymm0,%ymm0
+	vpxor	32+0(%rsi),%ymm3,%ymm3
+	vpxor	64+0(%rsi),%ymm7,%ymm7
+	vpxor	96+0(%rsi),%ymm11,%ymm11
+	vmovdqu	%ymm0,0+0(%rdi)
+	vmovdqu	%ymm3,32+0(%rdi)
+	vmovdqu	%ymm7,64+0(%rdi)
+	vmovdqu	%ymm11,96+0(%rdi)
+
+	vmovdqa	0+128(%rbp),%ymm0
+	vperm2i128	$0x02,%ymm2,%ymm6,%ymm3
+	vperm2i128	$0x13,%ymm2,%ymm6,%ymm6
+	vperm2i128	$0x02,%ymm10,%ymm14,%ymm2
+	vperm2i128	$0x13,%ymm10,%ymm14,%ymm10
+	vpxor	0+128(%rsi),%ymm3,%ymm3
+	vpxor	32+128(%rsi),%ymm2,%ymm2
+	vpxor	64+128(%rsi),%ymm6,%ymm6
+	vpxor	96+128(%rsi),%ymm10,%ymm10
+	vmovdqu	%ymm3,0+128(%rdi)
+	vmovdqu	%ymm2,32+128(%rdi)
+	vmovdqu	%ymm6,64+128(%rdi)
+	vmovdqu	%ymm10,96+128(%rdi)
+	vperm2i128	$0x02,%ymm1,%ymm5,%ymm3
+	vperm2i128	$0x13,%ymm1,%ymm5,%ymm5
+	vperm2i128	$0x02,%ymm9,%ymm13,%ymm1
+	vperm2i128	$0x13,%ymm9,%ymm13,%ymm9
+	vpxor	0+256(%rsi),%ymm3,%ymm3
+	vpxor	32+256(%rsi),%ymm1,%ymm1
+	vpxor	64+256(%rsi),%ymm5,%ymm5
+	vpxor	96+256(%rsi),%ymm9,%ymm9
+	vmovdqu	%ymm3,0+256(%rdi)
+	vmovdqu	%ymm1,32+256(%rdi)
+	vmovdqu	%ymm5,64+256(%rdi)
+	vmovdqu	%ymm9,96+256(%rdi)
+	vperm2i128	$0x13,%ymm0,%ymm4,%ymm3
+	vperm2i128	$0x02,%ymm0,%ymm4,%ymm0
+	vperm2i128	$0x02,%ymm8,%ymm12,%ymm4
+	vperm2i128	$0x13,%ymm8,%ymm12,%ymm12
+	vmovdqa	%ymm3,%ymm8
+
+	leaq	384(%rsi),%rsi
+	leaq	384(%rdi),%rdi
+	subq	$384,%rbx
+.Lopen_avx2_tail_128_xor:
+	cmpq	$32,%rbx
+	jb	.Lopen_avx2_tail_32_xor
+	subq	$32,%rbx
+	vpxor	(%rsi),%ymm0,%ymm0
+	vmovdqu	%ymm0,(%rdi)
+	leaq	32(%rsi),%rsi
+	leaq	32(%rdi),%rdi
+	vmovdqa	%ymm4,%ymm0
+	vmovdqa	%ymm8,%ymm4
+	vmovdqa	%ymm12,%ymm8
+	jmp	.Lopen_avx2_tail_128_xor
+.Lopen_avx2_tail_32_xor:
+	cmpq	$16,%rbx
+	vmovdqa	%xmm0,%xmm1
+	jb	.Lopen_avx2_exit
+	subq	$16,%rbx
+
+	vpxor	(%rsi),%xmm0,%xmm1
+	vmovdqu	%xmm1,(%rdi)
+	leaq	16(%rsi),%rsi
+	leaq	16(%rdi),%rdi
+	vperm2i128	$0x11,%ymm0,%ymm0,%ymm0
+	vmovdqa	%xmm0,%xmm1
+.Lopen_avx2_exit:
+	vzeroupper
+	jmp	.Lopen_sse_tail_16
+
+.Lopen_avx2_192:
+	vmovdqa	%ymm0,%ymm1
+	vmovdqa	%ymm0,%ymm2
+	vmovdqa	%ymm4,%ymm5
+	vmovdqa	%ymm4,%ymm6
+	vmovdqa	%ymm8,%ymm9
+	vmovdqa	%ymm8,%ymm10
+	vpaddd	.Lavx2_inc(%rip),%ymm12,%ymm13
+	vmovdqa	%ymm12,%ymm11
+	vmovdqa	%ymm13,%ymm15
+	movq	$10,%r10
+.Lopen_avx2_192_rounds:
+	vpaddd	%ymm4,%ymm0,%ymm0
+	vpxor	%ymm0,%ymm12,%ymm12
+	vpshufb	.Lrol16(%rip),%ymm12,%ymm12
+	vpaddd	%ymm12,%ymm8,%ymm8
+	vpxor	%ymm8,%ymm4,%ymm4
+	vpsrld	$20,%ymm4,%ymm3
+	vpslld	$12,%ymm4,%ymm4
+	vpxor	%ymm3,%ymm4,%ymm4
+	vpaddd	%ymm4,%ymm0,%ymm0
+	vpxor	%ymm0,%ymm12,%ymm12
+	vpshufb	.Lrol8(%rip),%ymm12,%ymm12
+	vpaddd	%ymm12,%ymm8,%ymm8
+	vpxor	%ymm8,%ymm4,%ymm4
+	vpslld	$7,%ymm4,%ymm3
+	vpsrld	$25,%ymm4,%ymm4
+	vpxor	%ymm3,%ymm4,%ymm4
+	vpalignr	$12,%ymm12,%ymm12,%ymm12
+	vpalignr	$8,%ymm8,%ymm8,%ymm8
+	vpalignr	$4,%ymm4,%ymm4,%ymm4
+	vpaddd	%ymm5,%ymm1,%ymm1
+	vpxor	%ymm1,%ymm13,%ymm13
+	vpshufb	.Lrol16(%rip),%ymm13,%ymm13
+	vpaddd	%ymm13,%ymm9,%ymm9
+	vpxor	%ymm9,%ymm5,%ymm5
+	vpsrld	$20,%ymm5,%ymm3
+	vpslld	$12,%ymm5,%ymm5
+	vpxor	%ymm3,%ymm5,%ymm5
+	vpaddd	%ymm5,%ymm1,%ymm1
+	vpxor	%ymm1,%ymm13,%ymm13
+	vpshufb	.Lrol8(%rip),%ymm13,%ymm13
+	vpaddd	%ymm13,%ymm9,%ymm9
+	vpxor	%ymm9,%ymm5,%ymm5
+	vpslld	$7,%ymm5,%ymm3
+	vpsrld	$25,%ymm5,%ymm5
+	vpxor	%ymm3,%ymm5,%ymm5
+	vpalignr	$12,%ymm13,%ymm13,%ymm13
+	vpalignr	$8,%ymm9,%ymm9,%ymm9
+	vpalignr	$4,%ymm5,%ymm5,%ymm5
+	vpaddd	%ymm4,%ymm0,%ymm0
+	vpxor	%ymm0,%ymm12,%ymm12
+	vpshufb	.Lrol16(%rip),%ymm12,%ymm12
+	vpaddd	%ymm12,%ymm8,%ymm8
+	vpxor	%ymm8,%ymm4,%ymm4
+	vpsrld	$20,%ymm4,%ymm3
+	vpslld	$12,%ymm4,%ymm4
+	vpxor	%ymm3,%ymm4,%ymm4
+	vpaddd	%ymm4,%ymm0,%ymm0
+	vpxor	%ymm0,%ymm12,%ymm12
+	vpshufb	.Lrol8(%rip),%ymm12,%ymm12
+	vpaddd	%ymm12,%ymm8,%ymm8
+	vpxor	%ymm8,%ymm4,%ymm4
+	vpslld	$7,%ymm4,%ymm3
+	vpsrld	$25,%ymm4,%ymm4
+	vpxor	%ymm3,%ymm4,%ymm4
+	vpalignr	$4,%ymm12,%ymm12,%ymm12
+	vpalignr	$8,%ymm8,%ymm8,%ymm8
+	vpalignr	$12,%ymm4,%ymm4,%ymm4
+	vpaddd	%ymm5,%ymm1,%ymm1
+	vpxor	%ymm1,%ymm13,%ymm13
+	vpshufb	.Lrol16(%rip),%ymm13,%ymm13
+	vpaddd	%ymm13,%ymm9,%ymm9
+	vpxor	%ymm9,%ymm5,%ymm5
+	vpsrld	$20,%ymm5,%ymm3
+	vpslld	$12,%ymm5,%ymm5
+	vpxor	%ymm3,%ymm5,%ymm5
+	vpaddd	%ymm5,%ymm1,%ymm1
+	vpxor	%ymm1,%ymm13,%ymm13
+	vpshufb	.Lrol8(%rip),%ymm13,%ymm13
+	vpaddd	%ymm13,%ymm9,%ymm9
+	vpxor	%ymm9,%ymm5,%ymm5
+	vpslld	$7,%ymm5,%ymm3
+	vpsrld	$25,%ymm5,%ymm5
+	vpxor	%ymm3,%ymm5,%ymm5
+	vpalignr	$4,%ymm13,%ymm13,%ymm13
+	vpalignr	$8,%ymm9,%ymm9,%ymm9
+	vpalignr	$12,%ymm5,%ymm5,%ymm5
+
+	decq	%r10
+	jne	.Lopen_avx2_192_rounds
+	vpaddd	%ymm2,%ymm0,%ymm0
+	vpaddd	%ymm2,%ymm1,%ymm1
+	vpaddd	%ymm6,%ymm4,%ymm4
+	vpaddd	%ymm6,%ymm5,%ymm5
+	vpaddd	%ymm10,%ymm8,%ymm8
+	vpaddd	%ymm10,%ymm9,%ymm9
+	vpaddd	%ymm11,%ymm12,%ymm12
+	vpaddd	%ymm15,%ymm13,%ymm13
+	vperm2i128	$0x02,%ymm0,%ymm4,%ymm3
+
+	vpand	.Lclamp(%rip),%ymm3,%ymm3
+	vmovdqa	%ymm3,0+0(%rbp)
+
+	vperm2i128	$0x13,%ymm0,%ymm4,%ymm0
+	vperm2i128	$0x13,%ymm8,%ymm12,%ymm4
+	vperm2i128	$0x02,%ymm1,%ymm5,%ymm8
+	vperm2i128	$0x02,%ymm9,%ymm13,%ymm12
+	vperm2i128	$0x13,%ymm1,%ymm5,%ymm1
+	vperm2i128	$0x13,%ymm9,%ymm13,%ymm5
+.Lopen_avx2_short:
+	movq	%r8,%r8
+	call	poly_hash_ad_internal
+.Lopen_avx2_short_hash_and_xor_loop:
+	cmpq	$32,%rbx
+	jb	.Lopen_avx2_short_tail_32
+	subq	$32,%rbx
+	addq	0+0(%rsi),%r10
+	adcq	8+0(%rsi),%r11
+	adcq	$1,%r12
+	movq	0+0+0(%rbp),%rax
+	movq	%rax,%r15
+	mulq	%r10
+	movq	%rax,%r13
+	movq	%rdx,%r14
+	movq	0+0+0(%rbp),%rax
+	mulq	%r11
+	imulq	%r12,%r15
+	addq	%rax,%r14
+	adcq	%rdx,%r15
+	movq	8+0+0(%rbp),%rax
+	movq	%rax,%r9
+	mulq	%r10
+	addq	%rax,%r14
+	adcq	$0,%rdx
+	movq	%rdx,%r10
+	movq	8+0+0(%rbp),%rax
+	mulq	%r11
+	addq	%rax,%r15
+	adcq	$0,%rdx
+	imulq	%r12,%r9
+	addq	%r10,%r15
+	adcq	%rdx,%r9
+	movq	%r13,%r10
+	movq	%r14,%r11
+	movq	%r15,%r12
+	andq	$3,%r12
+	movq	%r15,%r13
+	andq	$-4,%r13
+	movq	%r9,%r14
+	shrdq	$2,%r9,%r15
+	shrq	$2,%r9
+	addq	%r13,%r15
+	adcq	%r14,%r9
+	addq	%r15,%r10
+	adcq	%r9,%r11
+	adcq	$0,%r12
+	addq	0+16(%rsi),%r10
+	adcq	8+16(%rsi),%r11
+	adcq	$1,%r12
+	movq	0+0+0(%rbp),%rax
+	movq	%rax,%r15
+	mulq	%r10
+	movq	%rax,%r13
+	movq	%rdx,%r14
+	movq	0+0+0(%rbp),%rax
+	mulq	%r11
+	imulq	%r12,%r15
+	addq	%rax,%r14
+	adcq	%rdx,%r15
+	movq	8+0+0(%rbp),%rax
+	movq	%rax,%r9
+	mulq	%r10
+	addq	%rax,%r14
+	adcq	$0,%rdx
+	movq	%rdx,%r10
+	movq	8+0+0(%rbp),%rax
+	mulq	%r11
+	addq	%rax,%r15
+	adcq	$0,%rdx
+	imulq	%r12,%r9
+	addq	%r10,%r15
+	adcq	%rdx,%r9
+	movq	%r13,%r10
+	movq	%r14,%r11
+	movq	%r15,%r12
+	andq	$3,%r12
+	movq	%r15,%r13
+	andq	$-4,%r13
+	movq	%r9,%r14
+	shrdq	$2,%r9,%r15
+	shrq	$2,%r9
+	addq	%r13,%r15
+	adcq	%r14,%r9
+	addq	%r15,%r10
+	adcq	%r9,%r11
+	adcq	$0,%r12
+
+
+	vpxor	(%rsi),%ymm0,%ymm0
+	vmovdqu	%ymm0,(%rdi)
+	leaq	32(%rsi),%rsi
+	leaq	32(%rdi),%rdi
+
+	vmovdqa	%ymm4,%ymm0
+	vmovdqa	%ymm8,%ymm4
+	vmovdqa	%ymm12,%ymm8
+	vmovdqa	%ymm1,%ymm12
+	vmovdqa	%ymm5,%ymm1
+	vmovdqa	%ymm9,%ymm5
+	vmovdqa	%ymm13,%ymm9
+	vmovdqa	%ymm2,%ymm13
+	vmovdqa	%ymm6,%ymm2
+	jmp	.Lopen_avx2_short_hash_and_xor_loop
+.Lopen_avx2_short_tail_32:
+	cmpq	$16,%rbx
+	vmovdqa	%xmm0,%xmm1
+	jb	.Lopen_avx2_short_tail_32_exit
+	subq	$16,%rbx
+	addq	0+0(%rsi),%r10
+	adcq	8+0(%rsi),%r11
+	adcq	$1,%r12
+	movq	0+0+0(%rbp),%rax
+	movq	%rax,%r15
+	mulq	%r10
+	movq	%rax,%r13
+	movq	%rdx,%r14
+	movq	0+0+0(%rbp),%rax
+	mulq	%r11
+	imulq	%r12,%r15
+	addq	%rax,%r14
+	adcq	%rdx,%r15
+	movq	8+0+0(%rbp),%rax
+	movq	%rax,%r9
+	mulq	%r10
+	addq	%rax,%r14
+	adcq	$0,%rdx
+	movq	%rdx,%r10
+	movq	8+0+0(%rbp),%rax
+	mulq	%r11
+	addq	%rax,%r15
+	adcq	$0,%rdx
+	imulq	%r12,%r9
+	addq	%r10,%r15
+	adcq	%rdx,%r9
+	movq	%r13,%r10
+	movq	%r14,%r11
+	movq	%r15,%r12
+	andq	$3,%r12
+	movq	%r15,%r13
+	andq	$-4,%r13
+	movq	%r9,%r14
+	shrdq	$2,%r9,%r15
+	shrq	$2,%r9
+	addq	%r13,%r15
+	adcq	%r14,%r9
+	addq	%r15,%r10
+	adcq	%r9,%r11
+	adcq	$0,%r12
+
+	vpxor	(%rsi),%xmm0,%xmm3
+	vmovdqu	%xmm3,(%rdi)
+	leaq	16(%rsi),%rsi
+	leaq	16(%rdi),%rdi
+	vextracti128	$1,%ymm0,%xmm1
+.Lopen_avx2_short_tail_32_exit:
+	vzeroupper
+	jmp	.Lopen_sse_tail_16
+
+.Lopen_avx2_320:
+	vmovdqa	%ymm0,%ymm1
+	vmovdqa	%ymm0,%ymm2
+	vmovdqa	%ymm4,%ymm5
+	vmovdqa	%ymm4,%ymm6
+	vmovdqa	%ymm8,%ymm9
+	vmovdqa	%ymm8,%ymm10
+	vpaddd	.Lavx2_inc(%rip),%ymm12,%ymm13
+	vpaddd	.Lavx2_inc(%rip),%ymm13,%ymm14
+	vmovdqa	%ymm4,%ymm7
+	vmovdqa	%ymm8,%ymm11
+	vmovdqa	%ymm12,0+160(%rbp)
+	vmovdqa	%ymm13,0+192(%rbp)
+	vmovdqa	%ymm14,0+224(%rbp)
+	movq	$10,%r10
+.Lopen_avx2_320_rounds:
+	vpaddd	%ymm4,%ymm0,%ymm0
+	vpxor	%ymm0,%ymm12,%ymm12
+	vpshufb	.Lrol16(%rip),%ymm12,%ymm12
+	vpaddd	%ymm12,%ymm8,%ymm8
+	vpxor	%ymm8,%ymm4,%ymm4
+	vpsrld	$20,%ymm4,%ymm3
+	vpslld	$12,%ymm4,%ymm4
+	vpxor	%ymm3,%ymm4,%ymm4
+	vpaddd	%ymm4,%ymm0,%ymm0
+	vpxor	%ymm0,%ymm12,%ymm12
+	vpshufb	.Lrol8(%rip),%ymm12,%ymm12
+	vpaddd	%ymm12,%ymm8,%ymm8
+	vpxor	%ymm8,%ymm4,%ymm4
+	vpslld	$7,%ymm4,%ymm3
+	vpsrld	$25,%ymm4,%ymm4
+	vpxor	%ymm3,%ymm4,%ymm4
+	vpalignr	$12,%ymm12,%ymm12,%ymm12
+	vpalignr	$8,%ymm8,%ymm8,%ymm8
+	vpalignr	$4,%ymm4,%ymm4,%ymm4
+	vpaddd	%ymm5,%ymm1,%ymm1
+	vpxor	%ymm1,%ymm13,%ymm13
+	vpshufb	.Lrol16(%rip),%ymm13,%ymm13
+	vpaddd	%ymm13,%ymm9,%ymm9
+	vpxor	%ymm9,%ymm5,%ymm5
+	vpsrld	$20,%ymm5,%ymm3
+	vpslld	$12,%ymm5,%ymm5
+	vpxor	%ymm3,%ymm5,%ymm5
+	vpaddd	%ymm5,%ymm1,%ymm1
+	vpxor	%ymm1,%ymm13,%ymm13
+	vpshufb	.Lrol8(%rip),%ymm13,%ymm13
+	vpaddd	%ymm13,%ymm9,%ymm9
+	vpxor	%ymm9,%ymm5,%ymm5
+	vpslld	$7,%ymm5,%ymm3
+	vpsrld	$25,%ymm5,%ymm5
+	vpxor	%ymm3,%ymm5,%ymm5
+	vpalignr	$12,%ymm13,%ymm13,%ymm13
+	vpalignr	$8,%ymm9,%ymm9,%ymm9
+	vpalignr	$4,%ymm5,%ymm5,%ymm5
+	vpaddd	%ymm6,%ymm2,%ymm2
+	vpxor	%ymm2,%ymm14,%ymm14
+	vpshufb	.Lrol16(%rip),%ymm14,%ymm14
+	vpaddd	%ymm14,%ymm10,%ymm10
+	vpxor	%ymm10,%ymm6,%ymm6
+	vpsrld	$20,%ymm6,%ymm3
+	vpslld	$12,%ymm6,%ymm6
+	vpxor	%ymm3,%ymm6,%ymm6
+	vpaddd	%ymm6,%ymm2,%ymm2
+	vpxor	%ymm2,%ymm14,%ymm14
+	vpshufb	.Lrol8(%rip),%ymm14,%ymm14
+	vpaddd	%ymm14,%ymm10,%ymm10
+	vpxor	%ymm10,%ymm6,%ymm6
+	vpslld	$7,%ymm6,%ymm3
+	vpsrld	$25,%ymm6,%ymm6
+	vpxor	%ymm3,%ymm6,%ymm6
+	vpalignr	$12,%ymm14,%ymm14,%ymm14
+	vpalignr	$8,%ymm10,%ymm10,%ymm10
+	vpalignr	$4,%ymm6,%ymm6,%ymm6
+	vpaddd	%ymm4,%ymm0,%ymm0
+	vpxor	%ymm0,%ymm12,%ymm12
+	vpshufb	.Lrol16(%rip),%ymm12,%ymm12
+	vpaddd	%ymm12,%ymm8,%ymm8
+	vpxor	%ymm8,%ymm4,%ymm4
+	vpsrld	$20,%ymm4,%ymm3
+	vpslld	$12,%ymm4,%ymm4
+	vpxor	%ymm3,%ymm4,%ymm4
+	vpaddd	%ymm4,%ymm0,%ymm0
+	vpxor	%ymm0,%ymm12,%ymm12
+	vpshufb	.Lrol8(%rip),%ymm12,%ymm12
+	vpaddd	%ymm12,%ymm8,%ymm8
+	vpxor	%ymm8,%ymm4,%ymm4
+	vpslld	$7,%ymm4,%ymm3
+	vpsrld	$25,%ymm4,%ymm4
+	vpxor	%ymm3,%ymm4,%ymm4
+	vpalignr	$4,%ymm12,%ymm12,%ymm12
+	vpalignr	$8,%ymm8,%ymm8,%ymm8
+	vpalignr	$12,%ymm4,%ymm4,%ymm4
+	vpaddd	%ymm5,%ymm1,%ymm1
+	vpxor	%ymm1,%ymm13,%ymm13
+	vpshufb	.Lrol16(%rip),%ymm13,%ymm13
+	vpaddd	%ymm13,%ymm9,%ymm9
+	vpxor	%ymm9,%ymm5,%ymm5
+	vpsrld	$20,%ymm5,%ymm3
+	vpslld	$12,%ymm5,%ymm5
+	vpxor	%ymm3,%ymm5,%ymm5
+	vpaddd	%ymm5,%ymm1,%ymm1
+	vpxor	%ymm1,%ymm13,%ymm13
+	vpshufb	.Lrol8(%rip),%ymm13,%ymm13
+	vpaddd	%ymm13,%ymm9,%ymm9
+	vpxor	%ymm9,%ymm5,%ymm5
+	vpslld	$7,%ymm5,%ymm3
+	vpsrld	$25,%ymm5,%ymm5
+	vpxor	%ymm3,%ymm5,%ymm5
+	vpalignr	$4,%ymm13,%ymm13,%ymm13
+	vpalignr	$8,%ymm9,%ymm9,%ymm9
+	vpalignr	$12,%ymm5,%ymm5,%ymm5
+	vpaddd	%ymm6,%ymm2,%ymm2
+	vpxor	%ymm2,%ymm14,%ymm14
+	vpshufb	.Lrol16(%rip),%ymm14,%ymm14
+	vpaddd	%ymm14,%ymm10,%ymm10
+	vpxor	%ymm10,%ymm6,%ymm6
+	vpsrld	$20,%ymm6,%ymm3
+	vpslld	$12,%ymm6,%ymm6
+	vpxor	%ymm3,%ymm6,%ymm6
+	vpaddd	%ymm6,%ymm2,%ymm2
+	vpxor	%ymm2,%ymm14,%ymm14
+	vpshufb	.Lrol8(%rip),%ymm14,%ymm14
+	vpaddd	%ymm14,%ymm10,%ymm10
+	vpxor	%ymm10,%ymm6,%ymm6
+	vpslld	$7,%ymm6,%ymm3
+	vpsrld	$25,%ymm6,%ymm6
+	vpxor	%ymm3,%ymm6,%ymm6
+	vpalignr	$4,%ymm14,%ymm14,%ymm14
+	vpalignr	$8,%ymm10,%ymm10,%ymm10
+	vpalignr	$12,%ymm6,%ymm6,%ymm6
+
+	decq	%r10
+	jne	.Lopen_avx2_320_rounds
+	vpaddd	.Lchacha20_consts(%rip),%ymm0,%ymm0
+	vpaddd	.Lchacha20_consts(%rip),%ymm1,%ymm1
+	vpaddd	.Lchacha20_consts(%rip),%ymm2,%ymm2
+	vpaddd	%ymm7,%ymm4,%ymm4
+	vpaddd	%ymm7,%ymm5,%ymm5
+	vpaddd	%ymm7,%ymm6,%ymm6
+	vpaddd	%ymm11,%ymm8,%ymm8
+	vpaddd	%ymm11,%ymm9,%ymm9
+	vpaddd	%ymm11,%ymm10,%ymm10
+	vpaddd	0+160(%rbp),%ymm12,%ymm12
+	vpaddd	0+192(%rbp),%ymm13,%ymm13
+	vpaddd	0+224(%rbp),%ymm14,%ymm14
+	vperm2i128	$0x02,%ymm0,%ymm4,%ymm3
+
+	vpand	.Lclamp(%rip),%ymm3,%ymm3
+	vmovdqa	%ymm3,0+0(%rbp)
+
+	vperm2i128	$0x13,%ymm0,%ymm4,%ymm0
+	vperm2i128	$0x13,%ymm8,%ymm12,%ymm4
+	vperm2i128	$0x02,%ymm1,%ymm5,%ymm8
+	vperm2i128	$0x02,%ymm9,%ymm13,%ymm12
+	vperm2i128	$0x13,%ymm1,%ymm5,%ymm1
+	vperm2i128	$0x13,%ymm9,%ymm13,%ymm5
+	vperm2i128	$0x02,%ymm2,%ymm6,%ymm9
+	vperm2i128	$0x02,%ymm10,%ymm14,%ymm13
+	vperm2i128	$0x13,%ymm2,%ymm6,%ymm2
+	vperm2i128	$0x13,%ymm10,%ymm14,%ymm6
+	jmp	.Lopen_avx2_short
+.size	chacha20_poly1305_open_avx2, .-chacha20_poly1305_open_avx2
+.cfi_endproc	
+
+
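chacha20_poly1305_open_avx2 above (and chacha20_poly1305_seal_avx2 below) interleave that Poly1305 block step with a vectorized ChaCha20: each vpaddd/vpxor sequence ending in a rotate by 7 is one quarter-round applied lane-wise to several blocks at once, with vpshufb against .Lrol16/.Lrol8 doing the 16- and 8-bit rotates, vpsrld+vpslld pairs doing the 12- and 7-bit rotates, and vpalignr $4/$8/$12 moving the state between its column and diagonal arrangements. For reference, the scalar quarter-round and double-round as defined in RFC 8439 (illustrative C written for this note, not part of the patch):

	#include <stdint.h>

	static inline uint32_t rotl32(uint32_t x, int n)
	{
		return (x << n) | (x >> (32 - n));
	}

	/* One ChaCha20 quarter-round: the add/xor/rotate pattern that the
	 * ymm instruction groups above apply to whole 256-bit registers. */
	static inline void quarter_round(uint32_t *a, uint32_t *b,
	                                 uint32_t *c, uint32_t *d)
	{
		*a += *b; *d ^= *a; *d = rotl32(*d, 16);
		*c += *d; *b ^= *c; *b = rotl32(*b, 12);
		*a += *b; *d ^= *a; *d = rotl32(*d, 8);
		*c += *d; *b ^= *c; *b = rotl32(*b, 7);
	}

	/* One double round over the 4x4 state: columns, then diagonals.
	 * The "*_rounds" loops above run this ten times per block. */
	static void double_round(uint32_t s[16])
	{
		quarter_round(&s[0], &s[4], &s[8],  &s[12]);
		quarter_round(&s[1], &s[5], &s[9],  &s[13]);
		quarter_round(&s[2], &s[6], &s[10], &s[14]);
		quarter_round(&s[3], &s[7], &s[11], &s[15]);
		quarter_round(&s[0], &s[5], &s[10], &s[15]);
		quarter_round(&s[1], &s[6], &s[11], &s[12]);
		quarter_round(&s[2], &s[7], &s[8],  &s[13]);
		quarter_round(&s[3], &s[4], &s[9],  &s[14]);
	}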
+.type	chacha20_poly1305_seal_avx2,@function
+.align	64
+chacha20_poly1305_seal_avx2:
+.cfi_startproc	
+
+
+.cfi_adjust_cfa_offset	8
+.cfi_offset	%rbp,-16
+.cfi_adjust_cfa_offset	8
+.cfi_offset	%rbx,-24
+.cfi_adjust_cfa_offset	8
+.cfi_offset	%r12,-32
+.cfi_adjust_cfa_offset	8
+.cfi_offset	%r13,-40
+.cfi_adjust_cfa_offset	8
+.cfi_offset	%r14,-48
+.cfi_adjust_cfa_offset	8
+.cfi_offset	%r15,-56
+.cfi_adjust_cfa_offset	8
+.cfi_offset	%r9,-64
+.cfi_adjust_cfa_offset	288 + 32
+
+	vzeroupper
+	vmovdqa	.Lchacha20_consts(%rip),%ymm0
+	vbroadcasti128	0(%r9),%ymm4
+	vbroadcasti128	16(%r9),%ymm8
+	vbroadcasti128	32(%r9),%ymm12
+	vpaddd	.Lavx2_init(%rip),%ymm12,%ymm12
+	cmpq	$192,%rbx
+	jbe	.Lseal_avx2_192
+	cmpq	$320,%rbx
+	jbe	.Lseal_avx2_320
+	vmovdqa	%ymm0,%ymm1
+	vmovdqa	%ymm0,%ymm2
+	vmovdqa	%ymm0,%ymm3
+	vmovdqa	%ymm4,%ymm5
+	vmovdqa	%ymm4,%ymm6
+	vmovdqa	%ymm4,%ymm7
+	vmovdqa	%ymm4,0+64(%rbp)
+	vmovdqa	%ymm8,%ymm9
+	vmovdqa	%ymm8,%ymm10
+	vmovdqa	%ymm8,%ymm11
+	vmovdqa	%ymm8,0+96(%rbp)
+	vmovdqa	%ymm12,%ymm15
+	vpaddd	.Lavx2_inc(%rip),%ymm15,%ymm14
+	vpaddd	.Lavx2_inc(%rip),%ymm14,%ymm13
+	vpaddd	.Lavx2_inc(%rip),%ymm13,%ymm12
+	vmovdqa	%ymm12,0+160(%rbp)
+	vmovdqa	%ymm13,0+192(%rbp)
+	vmovdqa	%ymm14,0+224(%rbp)
+	vmovdqa	%ymm15,0+256(%rbp)
+	movq	$10,%r10
+.Lseal_avx2_init_rounds:
+	vmovdqa	%ymm8,0+128(%rbp)
+	vmovdqa	.Lrol16(%rip),%ymm8
+	vpaddd	%ymm7,%ymm3,%ymm3
+	vpaddd	%ymm6,%ymm2,%ymm2
+	vpaddd	%ymm5,%ymm1,%ymm1
+	vpaddd	%ymm4,%ymm0,%ymm0
+	vpxor	%ymm3,%ymm15,%ymm15
+	vpxor	%ymm2,%ymm14,%ymm14
+	vpxor	%ymm1,%ymm13,%ymm13
+	vpxor	%ymm0,%ymm12,%ymm12
+	vpshufb	%ymm8,%ymm15,%ymm15
+	vpshufb	%ymm8,%ymm14,%ymm14
+	vpshufb	%ymm8,%ymm13,%ymm13
+	vpshufb	%ymm8,%ymm12,%ymm12
+	vpaddd	%ymm15,%ymm11,%ymm11
+	vpaddd	%ymm14,%ymm10,%ymm10
+	vpaddd	%ymm13,%ymm9,%ymm9
+	vpaddd	0+128(%rbp),%ymm12,%ymm8
+	vpxor	%ymm11,%ymm7,%ymm7
+	vpxor	%ymm10,%ymm6,%ymm6
+	vpxor	%ymm9,%ymm5,%ymm5
+	vpxor	%ymm8,%ymm4,%ymm4
+	vmovdqa	%ymm8,0+128(%rbp)
+	vpsrld	$20,%ymm7,%ymm8
+	vpslld	$32-20,%ymm7,%ymm7
+	vpxor	%ymm8,%ymm7,%ymm7
+	vpsrld	$20,%ymm6,%ymm8
+	vpslld	$32-20,%ymm6,%ymm6
+	vpxor	%ymm8,%ymm6,%ymm6
+	vpsrld	$20,%ymm5,%ymm8
+	vpslld	$32-20,%ymm5,%ymm5
+	vpxor	%ymm8,%ymm5,%ymm5
+	vpsrld	$20,%ymm4,%ymm8
+	vpslld	$32-20,%ymm4,%ymm4
+	vpxor	%ymm8,%ymm4,%ymm4
+	vmovdqa	.Lrol8(%rip),%ymm8
+	vpaddd	%ymm7,%ymm3,%ymm3
+	vpaddd	%ymm6,%ymm2,%ymm2
+	vpaddd	%ymm5,%ymm1,%ymm1
+	vpaddd	%ymm4,%ymm0,%ymm0
+	vpxor	%ymm3,%ymm15,%ymm15
+	vpxor	%ymm2,%ymm14,%ymm14
+	vpxor	%ymm1,%ymm13,%ymm13
+	vpxor	%ymm0,%ymm12,%ymm12
+	vpshufb	%ymm8,%ymm15,%ymm15
+	vpshufb	%ymm8,%ymm14,%ymm14
+	vpshufb	%ymm8,%ymm13,%ymm13
+	vpshufb	%ymm8,%ymm12,%ymm12
+	vpaddd	%ymm15,%ymm11,%ymm11
+	vpaddd	%ymm14,%ymm10,%ymm10
+	vpaddd	%ymm13,%ymm9,%ymm9
+	vpaddd	0+128(%rbp),%ymm12,%ymm8
+	vpxor	%ymm11,%ymm7,%ymm7
+	vpxor	%ymm10,%ymm6,%ymm6
+	vpxor	%ymm9,%ymm5,%ymm5
+	vpxor	%ymm8,%ymm4,%ymm4
+	vmovdqa	%ymm8,0+128(%rbp)
+	vpsrld	$25,%ymm7,%ymm8
+	vpslld	$32-25,%ymm7,%ymm7
+	vpxor	%ymm8,%ymm7,%ymm7
+	vpsrld	$25,%ymm6,%ymm8
+	vpslld	$32-25,%ymm6,%ymm6
+	vpxor	%ymm8,%ymm6,%ymm6
+	vpsrld	$25,%ymm5,%ymm8
+	vpslld	$32-25,%ymm5,%ymm5
+	vpxor	%ymm8,%ymm5,%ymm5
+	vpsrld	$25,%ymm4,%ymm8
+	vpslld	$32-25,%ymm4,%ymm4
+	vpxor	%ymm8,%ymm4,%ymm4
+	vmovdqa	0+128(%rbp),%ymm8
+	vpalignr	$4,%ymm7,%ymm7,%ymm7
+	vpalignr	$8,%ymm11,%ymm11,%ymm11
+	vpalignr	$12,%ymm15,%ymm15,%ymm15
+	vpalignr	$4,%ymm6,%ymm6,%ymm6
+	vpalignr	$8,%ymm10,%ymm10,%ymm10
+	vpalignr	$12,%ymm14,%ymm14,%ymm14
+	vpalignr	$4,%ymm5,%ymm5,%ymm5
+	vpalignr	$8,%ymm9,%ymm9,%ymm9
+	vpalignr	$12,%ymm13,%ymm13,%ymm13
+	vpalignr	$4,%ymm4,%ymm4,%ymm4
+	vpalignr	$8,%ymm8,%ymm8,%ymm8
+	vpalignr	$12,%ymm12,%ymm12,%ymm12
+	vmovdqa	%ymm8,0+128(%rbp)
+	vmovdqa	.Lrol16(%rip),%ymm8
+	vpaddd	%ymm7,%ymm3,%ymm3
+	vpaddd	%ymm6,%ymm2,%ymm2
+	vpaddd	%ymm5,%ymm1,%ymm1
+	vpaddd	%ymm4,%ymm0,%ymm0
+	vpxor	%ymm3,%ymm15,%ymm15
+	vpxor	%ymm2,%ymm14,%ymm14
+	vpxor	%ymm1,%ymm13,%ymm13
+	vpxor	%ymm0,%ymm12,%ymm12
+	vpshufb	%ymm8,%ymm15,%ymm15
+	vpshufb	%ymm8,%ymm14,%ymm14
+	vpshufb	%ymm8,%ymm13,%ymm13
+	vpshufb	%ymm8,%ymm12,%ymm12
+	vpaddd	%ymm15,%ymm11,%ymm11
+	vpaddd	%ymm14,%ymm10,%ymm10
+	vpaddd	%ymm13,%ymm9,%ymm9
+	vpaddd	0+128(%rbp),%ymm12,%ymm8
+	vpxor	%ymm11,%ymm7,%ymm7
+	vpxor	%ymm10,%ymm6,%ymm6
+	vpxor	%ymm9,%ymm5,%ymm5
+	vpxor	%ymm8,%ymm4,%ymm4
+	vmovdqa	%ymm8,0+128(%rbp)
+	vpsrld	$20,%ymm7,%ymm8
+	vpslld	$32-20,%ymm7,%ymm7
+	vpxor	%ymm8,%ymm7,%ymm7
+	vpsrld	$20,%ymm6,%ymm8
+	vpslld	$32-20,%ymm6,%ymm6
+	vpxor	%ymm8,%ymm6,%ymm6
+	vpsrld	$20,%ymm5,%ymm8
+	vpslld	$32-20,%ymm5,%ymm5
+	vpxor	%ymm8,%ymm5,%ymm5
+	vpsrld	$20,%ymm4,%ymm8
+	vpslld	$32-20,%ymm4,%ymm4
+	vpxor	%ymm8,%ymm4,%ymm4
+	vmovdqa	.Lrol8(%rip),%ymm8
+	vpaddd	%ymm7,%ymm3,%ymm3
+	vpaddd	%ymm6,%ymm2,%ymm2
+	vpaddd	%ymm5,%ymm1,%ymm1
+	vpaddd	%ymm4,%ymm0,%ymm0
+	vpxor	%ymm3,%ymm15,%ymm15
+	vpxor	%ymm2,%ymm14,%ymm14
+	vpxor	%ymm1,%ymm13,%ymm13
+	vpxor	%ymm0,%ymm12,%ymm12
+	vpshufb	%ymm8,%ymm15,%ymm15
+	vpshufb	%ymm8,%ymm14,%ymm14
+	vpshufb	%ymm8,%ymm13,%ymm13
+	vpshufb	%ymm8,%ymm12,%ymm12
+	vpaddd	%ymm15,%ymm11,%ymm11
+	vpaddd	%ymm14,%ymm10,%ymm10
+	vpaddd	%ymm13,%ymm9,%ymm9
+	vpaddd	0+128(%rbp),%ymm12,%ymm8
+	vpxor	%ymm11,%ymm7,%ymm7
+	vpxor	%ymm10,%ymm6,%ymm6
+	vpxor	%ymm9,%ymm5,%ymm5
+	vpxor	%ymm8,%ymm4,%ymm4
+	vmovdqa	%ymm8,0+128(%rbp)
+	vpsrld	$25,%ymm7,%ymm8
+	vpslld	$32-25,%ymm7,%ymm7
+	vpxor	%ymm8,%ymm7,%ymm7
+	vpsrld	$25,%ymm6,%ymm8
+	vpslld	$32-25,%ymm6,%ymm6
+	vpxor	%ymm8,%ymm6,%ymm6
+	vpsrld	$25,%ymm5,%ymm8
+	vpslld	$32-25,%ymm5,%ymm5
+	vpxor	%ymm8,%ymm5,%ymm5
+	vpsrld	$25,%ymm4,%ymm8
+	vpslld	$32-25,%ymm4,%ymm4
+	vpxor	%ymm8,%ymm4,%ymm4
+	vmovdqa	0+128(%rbp),%ymm8
+	vpalignr	$12,%ymm7,%ymm7,%ymm7
+	vpalignr	$8,%ymm11,%ymm11,%ymm11
+	vpalignr	$4,%ymm15,%ymm15,%ymm15
+	vpalignr	$12,%ymm6,%ymm6,%ymm6
+	vpalignr	$8,%ymm10,%ymm10,%ymm10
+	vpalignr	$4,%ymm14,%ymm14,%ymm14
+	vpalignr	$12,%ymm5,%ymm5,%ymm5
+	vpalignr	$8,%ymm9,%ymm9,%ymm9
+	vpalignr	$4,%ymm13,%ymm13,%ymm13
+	vpalignr	$12,%ymm4,%ymm4,%ymm4
+	vpalignr	$8,%ymm8,%ymm8,%ymm8
+	vpalignr	$4,%ymm12,%ymm12,%ymm12
+
+	decq	%r10
+	jnz	.Lseal_avx2_init_rounds
+	vpaddd	.Lchacha20_consts(%rip),%ymm3,%ymm3
+	vpaddd	0+64(%rbp),%ymm7,%ymm7
+	vpaddd	0+96(%rbp),%ymm11,%ymm11
+	vpaddd	0+256(%rbp),%ymm15,%ymm15
+	vpaddd	.Lchacha20_consts(%rip),%ymm2,%ymm2
+	vpaddd	0+64(%rbp),%ymm6,%ymm6
+	vpaddd	0+96(%rbp),%ymm10,%ymm10
+	vpaddd	0+224(%rbp),%ymm14,%ymm14
+	vpaddd	.Lchacha20_consts(%rip),%ymm1,%ymm1
+	vpaddd	0+64(%rbp),%ymm5,%ymm5
+	vpaddd	0+96(%rbp),%ymm9,%ymm9
+	vpaddd	0+192(%rbp),%ymm13,%ymm13
+	vpaddd	.Lchacha20_consts(%rip),%ymm0,%ymm0
+	vpaddd	0+64(%rbp),%ymm4,%ymm4
+	vpaddd	0+96(%rbp),%ymm8,%ymm8
+	vpaddd	0+160(%rbp),%ymm12,%ymm12
+
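+# Feed-forward complete.  The low 32 bytes of block 0 are clamped and stored
+# at 0(%rbp) as the one-time Poly1305 key, the AD is absorbed, and the rest of
+# this keystream batch encrypts the first 320 bytes of plaintext (plus up to
+# 128 more just below).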
+	vperm2i128	$0x13,%ymm11,%ymm15,%ymm11
+	vperm2i128	$0x02,%ymm3,%ymm7,%ymm15
+	vperm2i128	$0x13,%ymm3,%ymm7,%ymm3
+	vpand	.Lclamp(%rip),%ymm15,%ymm15
+	vmovdqa	%ymm15,0+0(%rbp)
+	movq	%r8,%r8
+	call	poly_hash_ad_internal
+
+	vpxor	0(%rsi),%ymm3,%ymm3
+	vpxor	32(%rsi),%ymm11,%ymm11
+	vmovdqu	%ymm3,0(%rdi)
+	vmovdqu	%ymm11,32(%rdi)
+	vperm2i128	$0x02,%ymm2,%ymm6,%ymm15
+	vperm2i128	$0x13,%ymm2,%ymm6,%ymm6
+	vperm2i128	$0x02,%ymm10,%ymm14,%ymm2
+	vperm2i128	$0x13,%ymm10,%ymm14,%ymm10
+	vpxor	0+64(%rsi),%ymm15,%ymm15
+	vpxor	32+64(%rsi),%ymm2,%ymm2
+	vpxor	64+64(%rsi),%ymm6,%ymm6
+	vpxor	96+64(%rsi),%ymm10,%ymm10
+	vmovdqu	%ymm15,0+64(%rdi)
+	vmovdqu	%ymm2,32+64(%rdi)
+	vmovdqu	%ymm6,64+64(%rdi)
+	vmovdqu	%ymm10,96+64(%rdi)
+	vperm2i128	$0x02,%ymm1,%ymm5,%ymm15
+	vperm2i128	$0x13,%ymm1,%ymm5,%ymm5
+	vperm2i128	$0x02,%ymm9,%ymm13,%ymm1
+	vperm2i128	$0x13,%ymm9,%ymm13,%ymm9
+	vpxor	0+192(%rsi),%ymm15,%ymm15
+	vpxor	32+192(%rsi),%ymm1,%ymm1
+	vpxor	64+192(%rsi),%ymm5,%ymm5
+	vpxor	96+192(%rsi),%ymm9,%ymm9
+	vmovdqu	%ymm15,0+192(%rdi)
+	vmovdqu	%ymm1,32+192(%rdi)
+	vmovdqu	%ymm5,64+192(%rdi)
+	vmovdqu	%ymm9,96+192(%rdi)
+	vperm2i128	$0x13,%ymm0,%ymm4,%ymm15
+	vperm2i128	$0x02,%ymm0,%ymm4,%ymm0
+	vperm2i128	$0x02,%ymm8,%ymm12,%ymm4
+	vperm2i128	$0x13,%ymm8,%ymm12,%ymm12
+	vmovdqa	%ymm15,%ymm8
+
+	leaq	320(%rsi),%rsi
+	subq	$320,%rbx
+	movq	$320,%rcx
+	cmpq	$128,%rbx
+	jbe	.Lseal_avx2_short_hash_remainder
+	vpxor	0(%rsi),%ymm0,%ymm0
+	vpxor	32(%rsi),%ymm4,%ymm4
+	vpxor	64(%rsi),%ymm8,%ymm8
+	vpxor	96(%rsi),%ymm12,%ymm12
+	vmovdqu	%ymm0,320(%rdi)
+	vmovdqu	%ymm4,352(%rdi)
+	vmovdqu	%ymm8,384(%rdi)
+	vmovdqu	%ymm12,416(%rdi)
+	leaq	128(%rsi),%rsi
+	subq	$128,%rbx
+	movq	$8,%rcx
+	movq	$2,%r8
+	cmpq	$128,%rbx
+	jbe	.Lseal_avx2_tail_128
+	cmpq	$256,%rbx
+	jbe	.Lseal_avx2_tail_256
+	cmpq	$384,%rbx
+	jbe	.Lseal_avx2_tail_384
+	cmpq	$512,%rbx
+	jbe	.Lseal_avx2_tail_512
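+# More than 512 bytes remain: start generating the next 512-byte batch and
+# enter the interleaved loop partway through a round, so the Poly1305 hashing
+# of the ciphertext written above overlaps with these rounds.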
+	vmovdqa	.Lchacha20_consts(%rip),%ymm0
+	vmovdqa	0+64(%rbp),%ymm4
+	vmovdqa	0+96(%rbp),%ymm8
+	vmovdqa	%ymm0,%ymm1
+	vmovdqa	%ymm4,%ymm5
+	vmovdqa	%ymm8,%ymm9
+	vmovdqa	%ymm0,%ymm2
+	vmovdqa	%ymm4,%ymm6
+	vmovdqa	%ymm8,%ymm10
+	vmovdqa	%ymm0,%ymm3
+	vmovdqa	%ymm4,%ymm7
+	vmovdqa	%ymm8,%ymm11
+	vmovdqa	.Lavx2_inc(%rip),%ymm12
+	vpaddd	0+160(%rbp),%ymm12,%ymm15
+	vpaddd	%ymm15,%ymm12,%ymm14
+	vpaddd	%ymm14,%ymm12,%ymm13
+	vpaddd	%ymm13,%ymm12,%ymm12
+	vmovdqa	%ymm15,0+256(%rbp)
+	vmovdqa	%ymm14,0+224(%rbp)
+	vmovdqa	%ymm13,0+192(%rbp)
+	vmovdqa	%ymm12,0+160(%rbp)
+	vmovdqa	%ymm8,0+128(%rbp)
+	vmovdqa	.Lrol16(%rip),%ymm8
+	vpaddd	%ymm7,%ymm3,%ymm3
+	vpaddd	%ymm6,%ymm2,%ymm2
+	vpaddd	%ymm5,%ymm1,%ymm1
+	vpaddd	%ymm4,%ymm0,%ymm0
+	vpxor	%ymm3,%ymm15,%ymm15
+	vpxor	%ymm2,%ymm14,%ymm14
+	vpxor	%ymm1,%ymm13,%ymm13
+	vpxor	%ymm0,%ymm12,%ymm12
+	vpshufb	%ymm8,%ymm15,%ymm15
+	vpshufb	%ymm8,%ymm14,%ymm14
+	vpshufb	%ymm8,%ymm13,%ymm13
+	vpshufb	%ymm8,%ymm12,%ymm12
+	vpaddd	%ymm15,%ymm11,%ymm11
+	vpaddd	%ymm14,%ymm10,%ymm10
+	vpaddd	%ymm13,%ymm9,%ymm9
+	vpaddd	0+128(%rbp),%ymm12,%ymm8
+	vpxor	%ymm11,%ymm7,%ymm7
+	vpxor	%ymm10,%ymm6,%ymm6
+	vpxor	%ymm9,%ymm5,%ymm5
+	vpxor	%ymm8,%ymm4,%ymm4
+	vmovdqa	%ymm8,0+128(%rbp)
+	vpsrld	$20,%ymm7,%ymm8
+	vpslld	$32-20,%ymm7,%ymm7
+	vpxor	%ymm8,%ymm7,%ymm7
+	vpsrld	$20,%ymm6,%ymm8
+	vpslld	$32-20,%ymm6,%ymm6
+	vpxor	%ymm8,%ymm6,%ymm6
+	vpsrld	$20,%ymm5,%ymm8
+	vpslld	$32-20,%ymm5,%ymm5
+	vpxor	%ymm8,%ymm5,%ymm5
+	vpsrld	$20,%ymm4,%ymm8
+	vpslld	$32-20,%ymm4,%ymm4
+	vpxor	%ymm8,%ymm4,%ymm4
+	vmovdqa	.Lrol8(%rip),%ymm8
+	vpaddd	%ymm7,%ymm3,%ymm3
+	vpaddd	%ymm6,%ymm2,%ymm2
+	vpaddd	%ymm5,%ymm1,%ymm1
+	vpaddd	%ymm4,%ymm0,%ymm0
+	vpxor	%ymm3,%ymm15,%ymm15
+	vpxor	%ymm2,%ymm14,%ymm14
+	vpxor	%ymm1,%ymm13,%ymm13
+	vpxor	%ymm0,%ymm12,%ymm12
+	vpshufb	%ymm8,%ymm15,%ymm15
+	vpshufb	%ymm8,%ymm14,%ymm14
+	vpshufb	%ymm8,%ymm13,%ymm13
+	vpshufb	%ymm8,%ymm12,%ymm12
+	vpaddd	%ymm15,%ymm11,%ymm11
+	vpaddd	%ymm14,%ymm10,%ymm10
+	vpaddd	%ymm13,%ymm9,%ymm9
+	vpaddd	0+128(%rbp),%ymm12,%ymm8
+	vpxor	%ymm11,%ymm7,%ymm7
+	vpxor	%ymm10,%ymm6,%ymm6
+	vpxor	%ymm9,%ymm5,%ymm5
+	vpxor	%ymm8,%ymm4,%ymm4
+	vmovdqa	%ymm8,0+128(%rbp)
+	vpsrld	$25,%ymm7,%ymm8
+	vpslld	$32-25,%ymm7,%ymm7
+	vpxor	%ymm8,%ymm7,%ymm7
+	vpsrld	$25,%ymm6,%ymm8
+	vpslld	$32-25,%ymm6,%ymm6
+	vpxor	%ymm8,%ymm6,%ymm6
+	vpsrld	$25,%ymm5,%ymm8
+	vpslld	$32-25,%ymm5,%ymm5
+	vpxor	%ymm8,%ymm5,%ymm5
+	vpsrld	$25,%ymm4,%ymm8
+	vpslld	$32-25,%ymm4,%ymm4
+	vpxor	%ymm8,%ymm4,%ymm4
+	vmovdqa	0+128(%rbp),%ymm8
+	vpalignr	$4,%ymm7,%ymm7,%ymm7
+	vpalignr	$8,%ymm11,%ymm11,%ymm11
+	vpalignr	$12,%ymm15,%ymm15,%ymm15
+	vpalignr	$4,%ymm6,%ymm6,%ymm6
+	vpalignr	$8,%ymm10,%ymm10,%ymm10
+	vpalignr	$12,%ymm14,%ymm14,%ymm14
+	vpalignr	$4,%ymm5,%ymm5,%ymm5
+	vpalignr	$8,%ymm9,%ymm9,%ymm9
+	vpalignr	$12,%ymm13,%ymm13,%ymm13
+	vpalignr	$4,%ymm4,%ymm4,%ymm4
+	vpalignr	$8,%ymm8,%ymm8,%ymm8
+	vpalignr	$12,%ymm12,%ymm12,%ymm12
+	vmovdqa	%ymm8,0+128(%rbp)
+	vmovdqa	.Lrol16(%rip),%ymm8
+	vpaddd	%ymm7,%ymm3,%ymm3
+	vpaddd	%ymm6,%ymm2,%ymm2
+	vpaddd	%ymm5,%ymm1,%ymm1
+	vpaddd	%ymm4,%ymm0,%ymm0
+	vpxor	%ymm3,%ymm15,%ymm15
+	vpxor	%ymm2,%ymm14,%ymm14
+	vpxor	%ymm1,%ymm13,%ymm13
+	vpxor	%ymm0,%ymm12,%ymm12
+	vpshufb	%ymm8,%ymm15,%ymm15
+	vpshufb	%ymm8,%ymm14,%ymm14
+	vpshufb	%ymm8,%ymm13,%ymm13
+	vpshufb	%ymm8,%ymm12,%ymm12
+	vpaddd	%ymm15,%ymm11,%ymm11
+	vpaddd	%ymm14,%ymm10,%ymm10
+	vpaddd	%ymm13,%ymm9,%ymm9
+	vpaddd	0+128(%rbp),%ymm12,%ymm8
+	vpxor	%ymm11,%ymm7,%ymm7
+	vpxor	%ymm10,%ymm6,%ymm6
+	vpxor	%ymm9,%ymm5,%ymm5
+	vpxor	%ymm8,%ymm4,%ymm4
+	vmovdqa	%ymm8,0+128(%rbp)
+	vpsrld	$20,%ymm7,%ymm8
+	vpslld	$32-20,%ymm7,%ymm7
+	vpxor	%ymm8,%ymm7,%ymm7
+	vpsrld	$20,%ymm6,%ymm8
+	vpslld	$32-20,%ymm6,%ymm6
+	vpxor	%ymm8,%ymm6,%ymm6
+	vpsrld	$20,%ymm5,%ymm8
+	vpslld	$32-20,%ymm5,%ymm5
+	vpxor	%ymm8,%ymm5,%ymm5
+	vpsrld	$20,%ymm4,%ymm8
+	vpslld	$32-20,%ymm4,%ymm4
+	vpxor	%ymm8,%ymm4,%ymm4
+	vmovdqa	.Lrol8(%rip),%ymm8
+	vpaddd	%ymm7,%ymm3,%ymm3
+	vpaddd	%ymm6,%ymm2,%ymm2
+	vpaddd	%ymm5,%ymm1,%ymm1
+	vpaddd	%ymm4,%ymm0,%ymm0
+	vpxor	%ymm3,%ymm15,%ymm15
+	vpxor	%ymm2,%ymm14,%ymm14
+	vpxor	%ymm1,%ymm13,%ymm13
+	vpxor	%ymm0,%ymm12,%ymm12
+	vpshufb	%ymm8,%ymm15,%ymm15
+	vpshufb	%ymm8,%ymm14,%ymm14
+	vpshufb	%ymm8,%ymm13,%ymm13
+	vpshufb	%ymm8,%ymm12,%ymm12
+	vpaddd	%ymm15,%ymm11,%ymm11
+	vpaddd	%ymm14,%ymm10,%ymm10
+	vpaddd	%ymm13,%ymm9,%ymm9
+	vpaddd	0+128(%rbp),%ymm12,%ymm8
+	vpxor	%ymm11,%ymm7,%ymm7
+	vpxor	%ymm10,%ymm6,%ymm6
+	vpxor	%ymm9,%ymm5,%ymm5
+	vpxor	%ymm8,%ymm4,%ymm4
+	vmovdqa	%ymm8,0+128(%rbp)
+	vpsrld	$25,%ymm7,%ymm8
+	vpslld	$32-25,%ymm7,%ymm7
+	vpxor	%ymm8,%ymm7,%ymm7
+	vpsrld	$25,%ymm6,%ymm8
+	vpslld	$32-25,%ymm6,%ymm6
+	vpxor	%ymm8,%ymm6,%ymm6
+	vpsrld	$25,%ymm5,%ymm8
+	vpslld	$32-25,%ymm5,%ymm5
+	vpxor	%ymm8,%ymm5,%ymm5
+	vpsrld	$25,%ymm4,%ymm8
+	vpslld	$32-25,%ymm4,%ymm4
+	vpxor	%ymm8,%ymm4,%ymm4
+	vmovdqa	0+128(%rbp),%ymm8
+	vpalignr	$12,%ymm7,%ymm7,%ymm7
+	vpalignr	$8,%ymm11,%ymm11,%ymm11
+	vpalignr	$4,%ymm15,%ymm15,%ymm15
+	vpalignr	$12,%ymm6,%ymm6,%ymm6
+	vpalignr	$8,%ymm10,%ymm10,%ymm10
+	vpalignr	$4,%ymm14,%ymm14,%ymm14
+	vpalignr	$12,%ymm5,%ymm5,%ymm5
+	vpalignr	$8,%ymm9,%ymm9,%ymm9
+	vpalignr	$4,%ymm13,%ymm13,%ymm13
+	vpalignr	$12,%ymm4,%ymm4,%ymm4
+	vpalignr	$8,%ymm8,%ymm8,%ymm8
+	vpalignr	$4,%ymm12,%ymm12,%ymm12
+	vmovdqa	%ymm8,0+128(%rbp)
+	vmovdqa	.Lrol16(%rip),%ymm8
+	vpaddd	%ymm7,%ymm3,%ymm3
+	vpaddd	%ymm6,%ymm2,%ymm2
+	vpaddd	%ymm5,%ymm1,%ymm1
+	vpaddd	%ymm4,%ymm0,%ymm0
+	vpxor	%ymm3,%ymm15,%ymm15
+	vpxor	%ymm2,%ymm14,%ymm14
+	vpxor	%ymm1,%ymm13,%ymm13
+	vpxor	%ymm0,%ymm12,%ymm12
+	vpshufb	%ymm8,%ymm15,%ymm15
+	vpshufb	%ymm8,%ymm14,%ymm14
+	vpshufb	%ymm8,%ymm13,%ymm13
+	vpshufb	%ymm8,%ymm12,%ymm12
+	vpaddd	%ymm15,%ymm11,%ymm11
+	vpaddd	%ymm14,%ymm10,%ymm10
+	vpaddd	%ymm13,%ymm9,%ymm9
+	vpaddd	0+128(%rbp),%ymm12,%ymm8
+	vpxor	%ymm11,%ymm7,%ymm7
+	vpxor	%ymm10,%ymm6,%ymm6
+	vpxor	%ymm9,%ymm5,%ymm5
+	vpxor	%ymm8,%ymm4,%ymm4
+	vmovdqa	%ymm8,0+128(%rbp)
+	vpsrld	$20,%ymm7,%ymm8
+	vpslld	$32-20,%ymm7,%ymm7
+	vpxor	%ymm8,%ymm7,%ymm7
+	vpsrld	$20,%ymm6,%ymm8
+	vpslld	$32-20,%ymm6,%ymm6
+	vpxor	%ymm8,%ymm6,%ymm6
+	vpsrld	$20,%ymm5,%ymm8
+	vpslld	$32-20,%ymm5,%ymm5
+	vpxor	%ymm8,%ymm5,%ymm5
+	vpsrld	$20,%ymm4,%ymm8
+	vpslld	$32-20,%ymm4,%ymm4
+	vpxor	%ymm8,%ymm4,%ymm4
+	vmovdqa	.Lrol8(%rip),%ymm8
+	vpaddd	%ymm7,%ymm3,%ymm3
+	vpaddd	%ymm6,%ymm2,%ymm2
+	vpaddd	%ymm5,%ymm1,%ymm1
+	vpaddd	%ymm4,%ymm0,%ymm0
+	vpxor	%ymm3,%ymm15,%ymm15
+
+	subq	$16,%rdi
+	movq	$9,%rcx
+	jmp	.Lseal_avx2_main_loop_rounds_entry
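+# Main loop: each pass produces 512 bytes of keystream (four 2-block states)
+# while hashing the 512 bytes of ciphertext written by the previous pass.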
+.align	32
+.Lseal_avx2_main_loop:
+	vmovdqa	.Lchacha20_consts(%rip),%ymm0
+	vmovdqa	0+64(%rbp),%ymm4
+	vmovdqa	0+96(%rbp),%ymm8
+	vmovdqa	%ymm0,%ymm1
+	vmovdqa	%ymm4,%ymm5
+	vmovdqa	%ymm8,%ymm9
+	vmovdqa	%ymm0,%ymm2
+	vmovdqa	%ymm4,%ymm6
+	vmovdqa	%ymm8,%ymm10
+	vmovdqa	%ymm0,%ymm3
+	vmovdqa	%ymm4,%ymm7
+	vmovdqa	%ymm8,%ymm11
+	vmovdqa	.Lavx2_inc(%rip),%ymm12
+	vpaddd	0+160(%rbp),%ymm12,%ymm15
+	vpaddd	%ymm15,%ymm12,%ymm14
+	vpaddd	%ymm14,%ymm12,%ymm13
+	vpaddd	%ymm13,%ymm12,%ymm12
+	vmovdqa	%ymm15,0+256(%rbp)
+	vmovdqa	%ymm14,0+224(%rbp)
+	vmovdqa	%ymm13,0+192(%rbp)
+	vmovdqa	%ymm12,0+160(%rbp)
+
+	movq	$10,%rcx
+.align	32
+.Lseal_avx2_main_loop_rounds:
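+# Poly1305 step interleaved with the vector rounds: add a 16-byte ciphertext
+# block (plus the 2^128 pad bit) into %r10:%r11:%r12, multiply by r from
+# 0(%rbp)/8(%rbp) with mulx, then fold the bits above 2^130 back in times 5
+# (the and $-4 / shrd $2 sequence) to reduce mod 2^130-5.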
+	addq	0+0(%rdi),%r10
+	adcq	8+0(%rdi),%r11
+	adcq	$1,%r12
+	vmovdqa	%ymm8,0+128(%rbp)
+	vmovdqa	.Lrol16(%rip),%ymm8
+	vpaddd	%ymm7,%ymm3,%ymm3
+	vpaddd	%ymm6,%ymm2,%ymm2
+	vpaddd	%ymm5,%ymm1,%ymm1
+	vpaddd	%ymm4,%ymm0,%ymm0
+	vpxor	%ymm3,%ymm15,%ymm15
+	vpxor	%ymm2,%ymm14,%ymm14
+	vpxor	%ymm1,%ymm13,%ymm13
+	vpxor	%ymm0,%ymm12,%ymm12
+	movq	0+0+0(%rbp),%rdx
+	movq	%rdx,%r15
+	mulxq	%r10,%r13,%r14
+	mulxq	%r11,%rax,%rdx
+	imulq	%r12,%r15
+	addq	%rax,%r14
+	adcq	%rdx,%r15
+	vpshufb	%ymm8,%ymm15,%ymm15
+	vpshufb	%ymm8,%ymm14,%ymm14
+	vpshufb	%ymm8,%ymm13,%ymm13
+	vpshufb	%ymm8,%ymm12,%ymm12
+	vpaddd	%ymm15,%ymm11,%ymm11
+	vpaddd	%ymm14,%ymm10,%ymm10
+	vpaddd	%ymm13,%ymm9,%ymm9
+	vpaddd	0+128(%rbp),%ymm12,%ymm8
+	vpxor	%ymm11,%ymm7,%ymm7
+	movq	8+0+0(%rbp),%rdx
+	mulxq	%r10,%r10,%rax
+	addq	%r10,%r14
+	mulxq	%r11,%r11,%r9
+	adcq	%r11,%r15
+	adcq	$0,%r9
+	imulq	%r12,%rdx
+	vpxor	%ymm10,%ymm6,%ymm6
+	vpxor	%ymm9,%ymm5,%ymm5
+	vpxor	%ymm8,%ymm4,%ymm4
+	vmovdqa	%ymm8,0+128(%rbp)
+	vpsrld	$20,%ymm7,%ymm8
+	vpslld	$32-20,%ymm7,%ymm7
+	vpxor	%ymm8,%ymm7,%ymm7
+	vpsrld	$20,%ymm6,%ymm8
+	vpslld	$32-20,%ymm6,%ymm6
+	vpxor	%ymm8,%ymm6,%ymm6
+	vpsrld	$20,%ymm5,%ymm8
+	vpslld	$32-20,%ymm5,%ymm5
+	addq	%rax,%r15
+	adcq	%rdx,%r9
+	vpxor	%ymm8,%ymm5,%ymm5
+	vpsrld	$20,%ymm4,%ymm8
+	vpslld	$32-20,%ymm4,%ymm4
+	vpxor	%ymm8,%ymm4,%ymm4
+	vmovdqa	.Lrol8(%rip),%ymm8
+	vpaddd	%ymm7,%ymm3,%ymm3
+	vpaddd	%ymm6,%ymm2,%ymm2
+	vpaddd	%ymm5,%ymm1,%ymm1
+	vpaddd	%ymm4,%ymm0,%ymm0
+	vpxor	%ymm3,%ymm15,%ymm15
+	movq	%r13,%r10
+	movq	%r14,%r11
+	movq	%r15,%r12
+	andq	$3,%r12
+	movq	%r15,%r13
+	andq	$-4,%r13
+	movq	%r9,%r14
+	shrdq	$2,%r9,%r15
+	shrq	$2,%r9
+	addq	%r13,%r15
+	adcq	%r14,%r9
+	addq	%r15,%r10
+	adcq	%r9,%r11
+	adcq	$0,%r12
+
+.Lseal_avx2_main_loop_rounds_entry:
+	vpxor	%ymm2,%ymm14,%ymm14
+	vpxor	%ymm1,%ymm13,%ymm13
+	vpxor	%ymm0,%ymm12,%ymm12
+	vpshufb	%ymm8,%ymm15,%ymm15
+	vpshufb	%ymm8,%ymm14,%ymm14
+	vpshufb	%ymm8,%ymm13,%ymm13
+	vpshufb	%ymm8,%ymm12,%ymm12
+	vpaddd	%ymm15,%ymm11,%ymm11
+	vpaddd	%ymm14,%ymm10,%ymm10
+	addq	0+16(%rdi),%r10
+	adcq	8+16(%rdi),%r11
+	adcq	$1,%r12
+	vpaddd	%ymm13,%ymm9,%ymm9
+	vpaddd	0+128(%rbp),%ymm12,%ymm8
+	vpxor	%ymm11,%ymm7,%ymm7
+	vpxor	%ymm10,%ymm6,%ymm6
+	vpxor	%ymm9,%ymm5,%ymm5
+	vpxor	%ymm8,%ymm4,%ymm4
+	vmovdqa	%ymm8,0+128(%rbp)
+	vpsrld	$25,%ymm7,%ymm8
+	movq	0+0+0(%rbp),%rdx
+	movq	%rdx,%r15
+	mulxq	%r10,%r13,%r14
+	mulxq	%r11,%rax,%rdx
+	imulq	%r12,%r15
+	addq	%rax,%r14
+	adcq	%rdx,%r15
+	vpslld	$32-25,%ymm7,%ymm7
+	vpxor	%ymm8,%ymm7,%ymm7
+	vpsrld	$25,%ymm6,%ymm8
+	vpslld	$32-25,%ymm6,%ymm6
+	vpxor	%ymm8,%ymm6,%ymm6
+	vpsrld	$25,%ymm5,%ymm8
+	vpslld	$32-25,%ymm5,%ymm5
+	vpxor	%ymm8,%ymm5,%ymm5
+	vpsrld	$25,%ymm4,%ymm8
+	vpslld	$32-25,%ymm4,%ymm4
+	vpxor	%ymm8,%ymm4,%ymm4
+	vmovdqa	0+128(%rbp),%ymm8
+	vpalignr	$4,%ymm7,%ymm7,%ymm7
+	vpalignr	$8,%ymm11,%ymm11,%ymm11
+	vpalignr	$12,%ymm15,%ymm15,%ymm15
+	vpalignr	$4,%ymm6,%ymm6,%ymm6
+	vpalignr	$8,%ymm10,%ymm10,%ymm10
+	vpalignr	$12,%ymm14,%ymm14,%ymm14
+	movq	8+0+0(%rbp),%rdx
+	mulxq	%r10,%r10,%rax
+	addq	%r10,%r14
+	mulxq	%r11,%r11,%r9
+	adcq	%r11,%r15
+	adcq	$0,%r9
+	imulq	%r12,%rdx
+	vpalignr	$4,%ymm5,%ymm5,%ymm5
+	vpalignr	$8,%ymm9,%ymm9,%ymm9
+	vpalignr	$12,%ymm13,%ymm13,%ymm13
+	vpalignr	$4,%ymm4,%ymm4,%ymm4
+	vpalignr	$8,%ymm8,%ymm8,%ymm8
+	vpalignr	$12,%ymm12,%ymm12,%ymm12
+	vmovdqa	%ymm8,0+128(%rbp)
+	vmovdqa	.Lrol16(%rip),%ymm8
+	vpaddd	%ymm7,%ymm3,%ymm3
+	vpaddd	%ymm6,%ymm2,%ymm2
+	vpaddd	%ymm5,%ymm1,%ymm1
+	vpaddd	%ymm4,%ymm0,%ymm0
+	vpxor	%ymm3,%ymm15,%ymm15
+	vpxor	%ymm2,%ymm14,%ymm14
+	vpxor	%ymm1,%ymm13,%ymm13
+	vpxor	%ymm0,%ymm12,%ymm12
+	vpshufb	%ymm8,%ymm15,%ymm15
+	vpshufb	%ymm8,%ymm14,%ymm14
+	addq	%rax,%r15
+	adcq	%rdx,%r9
+	vpshufb	%ymm8,%ymm13,%ymm13
+	vpshufb	%ymm8,%ymm12,%ymm12
+	vpaddd	%ymm15,%ymm11,%ymm11
+	vpaddd	%ymm14,%ymm10,%ymm10
+	vpaddd	%ymm13,%ymm9,%ymm9
+	vpaddd	0+128(%rbp),%ymm12,%ymm8
+	vpxor	%ymm11,%ymm7,%ymm7
+	vpxor	%ymm10,%ymm6,%ymm6
+	vpxor	%ymm9,%ymm5,%ymm5
+	movq	%r13,%r10
+	movq	%r14,%r11
+	movq	%r15,%r12
+	andq	$3,%r12
+	movq	%r15,%r13
+	andq	$-4,%r13
+	movq	%r9,%r14
+	shrdq	$2,%r9,%r15
+	shrq	$2,%r9
+	addq	%r13,%r15
+	adcq	%r14,%r9
+	addq	%r15,%r10
+	adcq	%r9,%r11
+	adcq	$0,%r12
+	vpxor	%ymm8,%ymm4,%ymm4
+	vmovdqa	%ymm8,0+128(%rbp)
+	vpsrld	$20,%ymm7,%ymm8
+	vpslld	$32-20,%ymm7,%ymm7
+	vpxor	%ymm8,%ymm7,%ymm7
+	vpsrld	$20,%ymm6,%ymm8
+	vpslld	$32-20,%ymm6,%ymm6
+	vpxor	%ymm8,%ymm6,%ymm6
+	addq	0+32(%rdi),%r10
+	adcq	8+32(%rdi),%r11
+	adcq	$1,%r12
+
+	leaq	48(%rdi),%rdi
+	vpsrld	$20,%ymm5,%ymm8
+	vpslld	$32-20,%ymm5,%ymm5
+	vpxor	%ymm8,%ymm5,%ymm5
+	vpsrld	$20,%ymm4,%ymm8
+	vpslld	$32-20,%ymm4,%ymm4
+	vpxor	%ymm8,%ymm4,%ymm4
+	vmovdqa	.Lrol8(%rip),%ymm8
+	vpaddd	%ymm7,%ymm3,%ymm3
+	vpaddd	%ymm6,%ymm2,%ymm2
+	vpaddd	%ymm5,%ymm1,%ymm1
+	vpaddd	%ymm4,%ymm0,%ymm0
+	vpxor	%ymm3,%ymm15,%ymm15
+	vpxor	%ymm2,%ymm14,%ymm14
+	vpxor	%ymm1,%ymm13,%ymm13
+	vpxor	%ymm0,%ymm12,%ymm12
+	vpshufb	%ymm8,%ymm15,%ymm15
+	vpshufb	%ymm8,%ymm14,%ymm14
+	vpshufb	%ymm8,%ymm13,%ymm13
+	movq	0+0+0(%rbp),%rdx
+	movq	%rdx,%r15
+	mulxq	%r10,%r13,%r14
+	mulxq	%r11,%rax,%rdx
+	imulq	%r12,%r15
+	addq	%rax,%r14
+	adcq	%rdx,%r15
+	vpshufb	%ymm8,%ymm12,%ymm12
+	vpaddd	%ymm15,%ymm11,%ymm11
+	vpaddd	%ymm14,%ymm10,%ymm10
+	vpaddd	%ymm13,%ymm9,%ymm9
+	vpaddd	0+128(%rbp),%ymm12,%ymm8
+	vpxor	%ymm11,%ymm7,%ymm7
+	vpxor	%ymm10,%ymm6,%ymm6
+	vpxor	%ymm9,%ymm5,%ymm5
+	movq	8+0+0(%rbp),%rdx
+	mulxq	%r10,%r10,%rax
+	addq	%r10,%r14
+	mulxq	%r11,%r11,%r9
+	adcq	%r11,%r15
+	adcq	$0,%r9
+	imulq	%r12,%rdx
+	vpxor	%ymm8,%ymm4,%ymm4
+	vmovdqa	%ymm8,0+128(%rbp)
+	vpsrld	$25,%ymm7,%ymm8
+	vpslld	$32-25,%ymm7,%ymm7
+	vpxor	%ymm8,%ymm7,%ymm7
+	vpsrld	$25,%ymm6,%ymm8
+	vpslld	$32-25,%ymm6,%ymm6
+	vpxor	%ymm8,%ymm6,%ymm6
+	addq	%rax,%r15
+	adcq	%rdx,%r9
+	vpsrld	$25,%ymm5,%ymm8
+	vpslld	$32-25,%ymm5,%ymm5
+	vpxor	%ymm8,%ymm5,%ymm5
+	vpsrld	$25,%ymm4,%ymm8
+	vpslld	$32-25,%ymm4,%ymm4
+	vpxor	%ymm8,%ymm4,%ymm4
+	vmovdqa	0+128(%rbp),%ymm8
+	vpalignr	$12,%ymm7,%ymm7,%ymm7
+	vpalignr	$8,%ymm11,%ymm11,%ymm11
+	vpalignr	$4,%ymm15,%ymm15,%ymm15
+	vpalignr	$12,%ymm6,%ymm6,%ymm6
+	vpalignr	$8,%ymm10,%ymm10,%ymm10
+	vpalignr	$4,%ymm14,%ymm14,%ymm14
+	vpalignr	$12,%ymm5,%ymm5,%ymm5
+	vpalignr	$8,%ymm9,%ymm9,%ymm9
+	vpalignr	$4,%ymm13,%ymm13,%ymm13
+	vpalignr	$12,%ymm4,%ymm4,%ymm4
+	vpalignr	$8,%ymm8,%ymm8,%ymm8
+	movq	%r13,%r10
+	movq	%r14,%r11
+	movq	%r15,%r12
+	andq	$3,%r12
+	movq	%r15,%r13
+	andq	$-4,%r13
+	movq	%r9,%r14
+	shrdq	$2,%r9,%r15
+	shrq	$2,%r9
+	addq	%r13,%r15
+	adcq	%r14,%r9
+	addq	%r15,%r10
+	adcq	%r9,%r11
+	adcq	$0,%r12
+	vpalignr	$4,%ymm12,%ymm12,%ymm12
+
+	decq	%rcx
+	jne	.Lseal_avx2_main_loop_rounds
+	vpaddd	.Lchacha20_consts(%rip),%ymm3,%ymm3
+	vpaddd	0+64(%rbp),%ymm7,%ymm7
+	vpaddd	0+96(%rbp),%ymm11,%ymm11
+	vpaddd	0+256(%rbp),%ymm15,%ymm15
+	vpaddd	.Lchacha20_consts(%rip),%ymm2,%ymm2
+	vpaddd	0+64(%rbp),%ymm6,%ymm6
+	vpaddd	0+96(%rbp),%ymm10,%ymm10
+	vpaddd	0+224(%rbp),%ymm14,%ymm14
+	vpaddd	.Lchacha20_consts(%rip),%ymm1,%ymm1
+	vpaddd	0+64(%rbp),%ymm5,%ymm5
+	vpaddd	0+96(%rbp),%ymm9,%ymm9
+	vpaddd	0+192(%rbp),%ymm13,%ymm13
+	vpaddd	.Lchacha20_consts(%rip),%ymm0,%ymm0
+	vpaddd	0+64(%rbp),%ymm4,%ymm4
+	vpaddd	0+96(%rbp),%ymm8,%ymm8
+	vpaddd	0+160(%rbp),%ymm12,%ymm12
+
+	vmovdqa	%ymm0,0+128(%rbp)
+	addq	0+0(%rdi),%r10
+	adcq	8+0(%rdi),%r11
+	adcq	$1,%r12
+	movq	0+0+0(%rbp),%rdx
+	movq	%rdx,%r15
+	mulxq	%r10,%r13,%r14
+	mulxq	%r11,%rax,%rdx
+	imulq	%r12,%r15
+	addq	%rax,%r14
+	adcq	%rdx,%r15
+	movq	8+0+0(%rbp),%rdx
+	mulxq	%r10,%r10,%rax
+	addq	%r10,%r14
+	mulxq	%r11,%r11,%r9
+	adcq	%r11,%r15
+	adcq	$0,%r9
+	imulq	%r12,%rdx
+	addq	%rax,%r15
+	adcq	%rdx,%r9
+	movq	%r13,%r10
+	movq	%r14,%r11
+	movq	%r15,%r12
+	andq	$3,%r12
+	movq	%r15,%r13
+	andq	$-4,%r13
+	movq	%r9,%r14
+	shrdq	$2,%r9,%r15
+	shrq	$2,%r9
+	addq	%r13,%r15
+	adcq	%r14,%r9
+	addq	%r15,%r10
+	adcq	%r9,%r11
+	adcq	$0,%r12
+	addq	0+16(%rdi),%r10
+	adcq	8+16(%rdi),%r11
+	adcq	$1,%r12
+	movq	0+0+0(%rbp),%rdx
+	movq	%rdx,%r15
+	mulxq	%r10,%r13,%r14
+	mulxq	%r11,%rax,%rdx
+	imulq	%r12,%r15
+	addq	%rax,%r14
+	adcq	%rdx,%r15
+	movq	8+0+0(%rbp),%rdx
+	mulxq	%r10,%r10,%rax
+	addq	%r10,%r14
+	mulxq	%r11,%r11,%r9
+	adcq	%r11,%r15
+	adcq	$0,%r9
+	imulq	%r12,%rdx
+	addq	%rax,%r15
+	adcq	%rdx,%r9
+	movq	%r13,%r10
+	movq	%r14,%r11
+	movq	%r15,%r12
+	andq	$3,%r12
+	movq	%r15,%r13
+	andq	$-4,%r13
+	movq	%r9,%r14
+	shrdq	$2,%r9,%r15
+	shrq	$2,%r9
+	addq	%r13,%r15
+	adcq	%r14,%r9
+	addq	%r15,%r10
+	adcq	%r9,%r11
+	adcq	$0,%r12
+
+	leaq	32(%rdi),%rdi
+	vperm2i128	$0x02,%ymm3,%ymm7,%ymm0
+	vperm2i128	$0x13,%ymm3,%ymm7,%ymm7
+	vperm2i128	$0x02,%ymm11,%ymm15,%ymm3
+	vperm2i128	$0x13,%ymm11,%ymm15,%ymm11
+	vpxor	0+0(%rsi),%ymm0,%ymm0
+	vpxor	32+0(%rsi),%ymm3,%ymm3
+	vpxor	64+0(%rsi),%ymm7,%ymm7
+	vpxor	96+0(%rsi),%ymm11,%ymm11
+	vmovdqu	%ymm0,0+0(%rdi)
+	vmovdqu	%ymm3,32+0(%rdi)
+	vmovdqu	%ymm7,64+0(%rdi)
+	vmovdqu	%ymm11,96+0(%rdi)
+
+	vmovdqa	0+128(%rbp),%ymm0
+	vperm2i128	$0x02,%ymm2,%ymm6,%ymm3
+	vperm2i128	$0x13,%ymm2,%ymm6,%ymm6
+	vperm2i128	$0x02,%ymm10,%ymm14,%ymm2
+	vperm2i128	$0x13,%ymm10,%ymm14,%ymm10
+	vpxor	0+128(%rsi),%ymm3,%ymm3
+	vpxor	32+128(%rsi),%ymm2,%ymm2
+	vpxor	64+128(%rsi),%ymm6,%ymm6
+	vpxor	96+128(%rsi),%ymm10,%ymm10
+	vmovdqu	%ymm3,0+128(%rdi)
+	vmovdqu	%ymm2,32+128(%rdi)
+	vmovdqu	%ymm6,64+128(%rdi)
+	vmovdqu	%ymm10,96+128(%rdi)
+	vperm2i128	$0x02,%ymm1,%ymm5,%ymm3
+	vperm2i128	$0x13,%ymm1,%ymm5,%ymm5
+	vperm2i128	$0x02,%ymm9,%ymm13,%ymm1
+	vperm2i128	$0x13,%ymm9,%ymm13,%ymm9
+	vpxor	0+256(%rsi),%ymm3,%ymm3
+	vpxor	32+256(%rsi),%ymm1,%ymm1
+	vpxor	64+256(%rsi),%ymm5,%ymm5
+	vpxor	96+256(%rsi),%ymm9,%ymm9
+	vmovdqu	%ymm3,0+256(%rdi)
+	vmovdqu	%ymm1,32+256(%rdi)
+	vmovdqu	%ymm5,64+256(%rdi)
+	vmovdqu	%ymm9,96+256(%rdi)
+	vperm2i128	$0x02,%ymm0,%ymm4,%ymm3
+	vperm2i128	$0x13,%ymm0,%ymm4,%ymm4
+	vperm2i128	$0x02,%ymm8,%ymm12,%ymm0
+	vperm2i128	$0x13,%ymm8,%ymm12,%ymm8
+	vpxor	0+384(%rsi),%ymm3,%ymm3
+	vpxor	32+384(%rsi),%ymm0,%ymm0
+	vpxor	64+384(%rsi),%ymm4,%ymm4
+	vpxor	96+384(%rsi),%ymm8,%ymm8
+	vmovdqu	%ymm3,0+384(%rdi)
+	vmovdqu	%ymm0,32+384(%rdi)
+	vmovdqu	%ymm4,64+384(%rdi)
+	vmovdqu	%ymm8,96+384(%rdi)
+
+	leaq	512(%rsi),%rsi
+	subq	$512,%rbx
+	cmpq	$512,%rbx
+	jg	.Lseal_avx2_main_loop
+
+	addq	0+0(%rdi),%r10
+	adcq	8+0(%rdi),%r11
+	adcq	$1,%r12
+	movq	0+0+0(%rbp),%rdx
+	movq	%rdx,%r15
+	mulxq	%r10,%r13,%r14
+	mulxq	%r11,%rax,%rdx
+	imulq	%r12,%r15
+	addq	%rax,%r14
+	adcq	%rdx,%r15
+	movq	8+0+0(%rbp),%rdx
+	mulxq	%r10,%r10,%rax
+	addq	%r10,%r14
+	mulxq	%r11,%r11,%r9
+	adcq	%r11,%r15
+	adcq	$0,%r9
+	imulq	%r12,%rdx
+	addq	%rax,%r15
+	adcq	%rdx,%r9
+	movq	%r13,%r10
+	movq	%r14,%r11
+	movq	%r15,%r12
+	andq	$3,%r12
+	movq	%r15,%r13
+	andq	$-4,%r13
+	movq	%r9,%r14
+	shrdq	$2,%r9,%r15
+	shrq	$2,%r9
+	addq	%r13,%r15
+	adcq	%r14,%r9
+	addq	%r15,%r10
+	adcq	%r9,%r11
+	adcq	$0,%r12
+	addq	0+16(%rdi),%r10
+	adcq	8+16(%rdi),%r11
+	adcq	$1,%r12
+	movq	0+0+0(%rbp),%rdx
+	movq	%rdx,%r15
+	mulxq	%r10,%r13,%r14
+	mulxq	%r11,%rax,%rdx
+	imulq	%r12,%r15
+	addq	%rax,%r14
+	adcq	%rdx,%r15
+	movq	8+0+0(%rbp),%rdx
+	mulxq	%r10,%r10,%rax
+	addq	%r10,%r14
+	mulxq	%r11,%r11,%r9
+	adcq	%r11,%r15
+	adcq	$0,%r9
+	imulq	%r12,%rdx
+	addq	%rax,%r15
+	adcq	%rdx,%r9
+	movq	%r13,%r10
+	movq	%r14,%r11
+	movq	%r15,%r12
+	andq	$3,%r12
+	movq	%r15,%r13
+	andq	$-4,%r13
+	movq	%r9,%r14
+	shrdq	$2,%r9,%r15
+	shrq	$2,%r9
+	addq	%r13,%r15
+	adcq	%r14,%r9
+	addq	%r15,%r10
+	adcq	%r9,%r11
+	adcq	$0,%r12
+
+	leaq	32(%rdi),%rdi
+	movq	$10,%rcx
+	xorq	%r8,%r8
+
+	cmpq	$384,%rbx
+	ja	.Lseal_avx2_tail_512
+	cmpq	$256,%rbx
+	ja	.Lseal_avx2_tail_384
+	cmpq	$128,%rbx
+	ja	.Lseal_avx2_tail_256
+
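+# Tail paths: at most 512 bytes remain.  Each tail generates only the blocks
+# it needs while continuing to hash previously written ciphertext, then
+# finishes through the short path below.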
+.Lseal_avx2_tail_128:
+	vmovdqa	.Lchacha20_consts(%rip),%ymm0
+	vmovdqa	0+64(%rbp),%ymm4
+	vmovdqa	0+96(%rbp),%ymm8
+	vmovdqa	.Lavx2_inc(%rip),%ymm12
+	vpaddd	0+160(%rbp),%ymm12,%ymm12
+	vmovdqa	%ymm12,0+160(%rbp)
+
+.Lseal_avx2_tail_128_rounds_and_3xhash:
+	addq	0+0(%rdi),%r10
+	adcq	8+0(%rdi),%r11
+	adcq	$1,%r12
+	movq	0+0+0(%rbp),%rdx
+	movq	%rdx,%r15
+	mulxq	%r10,%r13,%r14
+	mulxq	%r11,%rax,%rdx
+	imulq	%r12,%r15
+	addq	%rax,%r14
+	adcq	%rdx,%r15
+	movq	8+0+0(%rbp),%rdx
+	mulxq	%r10,%r10,%rax
+	addq	%r10,%r14
+	mulxq	%r11,%r11,%r9
+	adcq	%r11,%r15
+	adcq	$0,%r9
+	imulq	%r12,%rdx
+	addq	%rax,%r15
+	adcq	%rdx,%r9
+	movq	%r13,%r10
+	movq	%r14,%r11
+	movq	%r15,%r12
+	andq	$3,%r12
+	movq	%r15,%r13
+	andq	$-4,%r13
+	movq	%r9,%r14
+	shrdq	$2,%r9,%r15
+	shrq	$2,%r9
+	addq	%r13,%r15
+	adcq	%r14,%r9
+	addq	%r15,%r10
+	adcq	%r9,%r11
+	adcq	$0,%r12
+
+	leaq	16(%rdi),%rdi
+.Lseal_avx2_tail_128_rounds_and_2xhash:
+	vpaddd	%ymm4,%ymm0,%ymm0
+	vpxor	%ymm0,%ymm12,%ymm12
+	vpshufb	.Lrol16(%rip),%ymm12,%ymm12
+	vpaddd	%ymm12,%ymm8,%ymm8
+	vpxor	%ymm8,%ymm4,%ymm4
+	vpsrld	$20,%ymm4,%ymm3
+	vpslld	$12,%ymm4,%ymm4
+	vpxor	%ymm3,%ymm4,%ymm4
+	vpaddd	%ymm4,%ymm0,%ymm0
+	vpxor	%ymm0,%ymm12,%ymm12
+	vpshufb	.Lrol8(%rip),%ymm12,%ymm12
+	vpaddd	%ymm12,%ymm8,%ymm8
+	vpxor	%ymm8,%ymm4,%ymm4
+	vpslld	$7,%ymm4,%ymm3
+	vpsrld	$25,%ymm4,%ymm4
+	vpxor	%ymm3,%ymm4,%ymm4
+	vpalignr	$12,%ymm12,%ymm12,%ymm12
+	vpalignr	$8,%ymm8,%ymm8,%ymm8
+	vpalignr	$4,%ymm4,%ymm4,%ymm4
+	addq	0+0(%rdi),%r10
+	adcq	8+0(%rdi),%r11
+	adcq	$1,%r12
+	movq	0+0+0(%rbp),%rdx
+	movq	%rdx,%r15
+	mulxq	%r10,%r13,%r14
+	mulxq	%r11,%rax,%rdx
+	imulq	%r12,%r15
+	addq	%rax,%r14
+	adcq	%rdx,%r15
+	movq	8+0+0(%rbp),%rdx
+	mulxq	%r10,%r10,%rax
+	addq	%r10,%r14
+	mulxq	%r11,%r11,%r9
+	adcq	%r11,%r15
+	adcq	$0,%r9
+	imulq	%r12,%rdx
+	addq	%rax,%r15
+	adcq	%rdx,%r9
+	movq	%r13,%r10
+	movq	%r14,%r11
+	movq	%r15,%r12
+	andq	$3,%r12
+	movq	%r15,%r13
+	andq	$-4,%r13
+	movq	%r9,%r14
+	shrdq	$2,%r9,%r15
+	shrq	$2,%r9
+	addq	%r13,%r15
+	adcq	%r14,%r9
+	addq	%r15,%r10
+	adcq	%r9,%r11
+	adcq	$0,%r12
+	vpaddd	%ymm4,%ymm0,%ymm0
+	vpxor	%ymm0,%ymm12,%ymm12
+	vpshufb	.Lrol16(%rip),%ymm12,%ymm12
+	vpaddd	%ymm12,%ymm8,%ymm8
+	vpxor	%ymm8,%ymm4,%ymm4
+	vpsrld	$20,%ymm4,%ymm3
+	vpslld	$12,%ymm4,%ymm4
+	vpxor	%ymm3,%ymm4,%ymm4
+	vpaddd	%ymm4,%ymm0,%ymm0
+	vpxor	%ymm0,%ymm12,%ymm12
+	vpshufb	.Lrol8(%rip),%ymm12,%ymm12
+	vpaddd	%ymm12,%ymm8,%ymm8
+	vpxor	%ymm8,%ymm4,%ymm4
+	vpslld	$7,%ymm4,%ymm3
+	vpsrld	$25,%ymm4,%ymm4
+	vpxor	%ymm3,%ymm4,%ymm4
+	vpalignr	$4,%ymm12,%ymm12,%ymm12
+	vpalignr	$8,%ymm8,%ymm8,%ymm8
+	vpalignr	$12,%ymm4,%ymm4,%ymm4
+	addq	0+16(%rdi),%r10
+	adcq	8+16(%rdi),%r11
+	adcq	$1,%r12
+	movq	0+0+0(%rbp),%rdx
+	movq	%rdx,%r15
+	mulxq	%r10,%r13,%r14
+	mulxq	%r11,%rax,%rdx
+	imulq	%r12,%r15
+	addq	%rax,%r14
+	adcq	%rdx,%r15
+	movq	8+0+0(%rbp),%rdx
+	mulxq	%r10,%r10,%rax
+	addq	%r10,%r14
+	mulxq	%r11,%r11,%r9
+	adcq	%r11,%r15
+	adcq	$0,%r9
+	imulq	%r12,%rdx
+	addq	%rax,%r15
+	adcq	%rdx,%r9
+	movq	%r13,%r10
+	movq	%r14,%r11
+	movq	%r15,%r12
+	andq	$3,%r12
+	movq	%r15,%r13
+	andq	$-4,%r13
+	movq	%r9,%r14
+	shrdq	$2,%r9,%r15
+	shrq	$2,%r9
+	addq	%r13,%r15
+	adcq	%r14,%r9
+	addq	%r15,%r10
+	adcq	%r9,%r11
+	adcq	$0,%r12
+
+	leaq	32(%rdi),%rdi
+	decq	%rcx
+	jg	.Lseal_avx2_tail_128_rounds_and_3xhash
+	decq	%r8
+	jge	.Lseal_avx2_tail_128_rounds_and_2xhash
+	vpaddd	.Lchacha20_consts(%rip),%ymm0,%ymm0
+	vpaddd	0+64(%rbp),%ymm4,%ymm4
+	vpaddd	0+96(%rbp),%ymm8,%ymm8
+	vpaddd	0+160(%rbp),%ymm12,%ymm12
+	vperm2i128	$0x13,%ymm0,%ymm4,%ymm3
+	vperm2i128	$0x02,%ymm0,%ymm4,%ymm0
+	vperm2i128	$0x02,%ymm8,%ymm12,%ymm4
+	vperm2i128	$0x13,%ymm8,%ymm12,%ymm12
+	vmovdqa	%ymm3,%ymm8
+
+	jmp	.Lseal_avx2_short_loop
+
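+# At most 256 bytes left: two 2-block states (the %ymm0 and %ymm1 columns).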
+.Lseal_avx2_tail_256:
+	vmovdqa	.Lchacha20_consts(%rip),%ymm0
+	vmovdqa	0+64(%rbp),%ymm4
+	vmovdqa	0+96(%rbp),%ymm8
+	vmovdqa	%ymm0,%ymm1
+	vmovdqa	%ymm4,%ymm5
+	vmovdqa	%ymm8,%ymm9
+	vmovdqa	.Lavx2_inc(%rip),%ymm12
+	vpaddd	0+160(%rbp),%ymm12,%ymm13
+	vpaddd	%ymm13,%ymm12,%ymm12
+	vmovdqa	%ymm12,0+160(%rbp)
+	vmovdqa	%ymm13,0+192(%rbp)
+
+.Lseal_avx2_tail_256_rounds_and_3xhash:
+	addq	0+0(%rdi),%r10
+	adcq	8+0(%rdi),%r11
+	adcq	$1,%r12
+	movq	0+0+0(%rbp),%rax
+	movq	%rax,%r15
+	mulq	%r10
+	movq	%rax,%r13
+	movq	%rdx,%r14
+	movq	0+0+0(%rbp),%rax
+	mulq	%r11
+	imulq	%r12,%r15
+	addq	%rax,%r14
+	adcq	%rdx,%r15
+	movq	8+0+0(%rbp),%rax
+	movq	%rax,%r9
+	mulq	%r10
+	addq	%rax,%r14
+	adcq	$0,%rdx
+	movq	%rdx,%r10
+	movq	8+0+0(%rbp),%rax
+	mulq	%r11
+	addq	%rax,%r15
+	adcq	$0,%rdx
+	imulq	%r12,%r9
+	addq	%r10,%r15
+	adcq	%rdx,%r9
+	movq	%r13,%r10
+	movq	%r14,%r11
+	movq	%r15,%r12
+	andq	$3,%r12
+	movq	%r15,%r13
+	andq	$-4,%r13
+	movq	%r9,%r14
+	shrdq	$2,%r9,%r15
+	shrq	$2,%r9
+	addq	%r13,%r15
+	adcq	%r14,%r9
+	addq	%r15,%r10
+	adcq	%r9,%r11
+	adcq	$0,%r12
+
+	leaq	16(%rdi),%rdi
+.Lseal_avx2_tail_256_rounds_and_2xhash:
+	vpaddd	%ymm4,%ymm0,%ymm0
+	vpxor	%ymm0,%ymm12,%ymm12
+	vpshufb	.Lrol16(%rip),%ymm12,%ymm12
+	vpaddd	%ymm12,%ymm8,%ymm8
+	vpxor	%ymm8,%ymm4,%ymm4
+	vpsrld	$20,%ymm4,%ymm3
+	vpslld	$12,%ymm4,%ymm4
+	vpxor	%ymm3,%ymm4,%ymm4
+	vpaddd	%ymm4,%ymm0,%ymm0
+	vpxor	%ymm0,%ymm12,%ymm12
+	vpshufb	.Lrol8(%rip),%ymm12,%ymm12
+	vpaddd	%ymm12,%ymm8,%ymm8
+	vpxor	%ymm8,%ymm4,%ymm4
+	vpslld	$7,%ymm4,%ymm3
+	vpsrld	$25,%ymm4,%ymm4
+	vpxor	%ymm3,%ymm4,%ymm4
+	vpalignr	$12,%ymm12,%ymm12,%ymm12
+	vpalignr	$8,%ymm8,%ymm8,%ymm8
+	vpalignr	$4,%ymm4,%ymm4,%ymm4
+	vpaddd	%ymm5,%ymm1,%ymm1
+	vpxor	%ymm1,%ymm13,%ymm13
+	vpshufb	.Lrol16(%rip),%ymm13,%ymm13
+	vpaddd	%ymm13,%ymm9,%ymm9
+	vpxor	%ymm9,%ymm5,%ymm5
+	vpsrld	$20,%ymm5,%ymm3
+	vpslld	$12,%ymm5,%ymm5
+	vpxor	%ymm3,%ymm5,%ymm5
+	vpaddd	%ymm5,%ymm1,%ymm1
+	vpxor	%ymm1,%ymm13,%ymm13
+	vpshufb	.Lrol8(%rip),%ymm13,%ymm13
+	vpaddd	%ymm13,%ymm9,%ymm9
+	vpxor	%ymm9,%ymm5,%ymm5
+	vpslld	$7,%ymm5,%ymm3
+	vpsrld	$25,%ymm5,%ymm5
+	vpxor	%ymm3,%ymm5,%ymm5
+	vpalignr	$12,%ymm13,%ymm13,%ymm13
+	vpalignr	$8,%ymm9,%ymm9,%ymm9
+	vpalignr	$4,%ymm5,%ymm5,%ymm5
+	addq	0+0(%rdi),%r10
+	adcq	8+0(%rdi),%r11
+	adcq	$1,%r12
+	movq	0+0+0(%rbp),%rax
+	movq	%rax,%r15
+	mulq	%r10
+	movq	%rax,%r13
+	movq	%rdx,%r14
+	movq	0+0+0(%rbp),%rax
+	mulq	%r11
+	imulq	%r12,%r15
+	addq	%rax,%r14
+	adcq	%rdx,%r15
+	movq	8+0+0(%rbp),%rax
+	movq	%rax,%r9
+	mulq	%r10
+	addq	%rax,%r14
+	adcq	$0,%rdx
+	movq	%rdx,%r10
+	movq	8+0+0(%rbp),%rax
+	mulq	%r11
+	addq	%rax,%r15
+	adcq	$0,%rdx
+	imulq	%r12,%r9
+	addq	%r10,%r15
+	adcq	%rdx,%r9
+	movq	%r13,%r10
+	movq	%r14,%r11
+	movq	%r15,%r12
+	andq	$3,%r12
+	movq	%r15,%r13
+	andq	$-4,%r13
+	movq	%r9,%r14
+	shrdq	$2,%r9,%r15
+	shrq	$2,%r9
+	addq	%r13,%r15
+	adcq	%r14,%r9
+	addq	%r15,%r10
+	adcq	%r9,%r11
+	adcq	$0,%r12
+	vpaddd	%ymm4,%ymm0,%ymm0
+	vpxor	%ymm0,%ymm12,%ymm12
+	vpshufb	.Lrol16(%rip),%ymm12,%ymm12
+	vpaddd	%ymm12,%ymm8,%ymm8
+	vpxor	%ymm8,%ymm4,%ymm4
+	vpsrld	$20,%ymm4,%ymm3
+	vpslld	$12,%ymm4,%ymm4
+	vpxor	%ymm3,%ymm4,%ymm4
+	vpaddd	%ymm4,%ymm0,%ymm0
+	vpxor	%ymm0,%ymm12,%ymm12
+	vpshufb	.Lrol8(%rip),%ymm12,%ymm12
+	vpaddd	%ymm12,%ymm8,%ymm8
+	vpxor	%ymm8,%ymm4,%ymm4
+	vpslld	$7,%ymm4,%ymm3
+	vpsrld	$25,%ymm4,%ymm4
+	vpxor	%ymm3,%ymm4,%ymm4
+	vpalignr	$4,%ymm12,%ymm12,%ymm12
+	vpalignr	$8,%ymm8,%ymm8,%ymm8
+	vpalignr	$12,%ymm4,%ymm4,%ymm4
+	vpaddd	%ymm5,%ymm1,%ymm1
+	vpxor	%ymm1,%ymm13,%ymm13
+	vpshufb	.Lrol16(%rip),%ymm13,%ymm13
+	vpaddd	%ymm13,%ymm9,%ymm9
+	vpxor	%ymm9,%ymm5,%ymm5
+	vpsrld	$20,%ymm5,%ymm3
+	vpslld	$12,%ymm5,%ymm5
+	vpxor	%ymm3,%ymm5,%ymm5
+	vpaddd	%ymm5,%ymm1,%ymm1
+	vpxor	%ymm1,%ymm13,%ymm13
+	vpshufb	.Lrol8(%rip),%ymm13,%ymm13
+	vpaddd	%ymm13,%ymm9,%ymm9
+	vpxor	%ymm9,%ymm5,%ymm5
+	vpslld	$7,%ymm5,%ymm3
+	vpsrld	$25,%ymm5,%ymm5
+	vpxor	%ymm3,%ymm5,%ymm5
+	vpalignr	$4,%ymm13,%ymm13,%ymm13
+	vpalignr	$8,%ymm9,%ymm9,%ymm9
+	vpalignr	$12,%ymm5,%ymm5,%ymm5
+	addq	0+16(%rdi),%r10
+	adcq	8+16(%rdi),%r11
+	adcq	$1,%r12
+	movq	0+0+0(%rbp),%rax
+	movq	%rax,%r15
+	mulq	%r10
+	movq	%rax,%r13
+	movq	%rdx,%r14
+	movq	0+0+0(%rbp),%rax
+	mulq	%r11
+	imulq	%r12,%r15
+	addq	%rax,%r14
+	adcq	%rdx,%r15
+	movq	8+0+0(%rbp),%rax
+	movq	%rax,%r9
+	mulq	%r10
+	addq	%rax,%r14
+	adcq	$0,%rdx
+	movq	%rdx,%r10
+	movq	8+0+0(%rbp),%rax
+	mulq	%r11
+	addq	%rax,%r15
+	adcq	$0,%rdx
+	imulq	%r12,%r9
+	addq	%r10,%r15
+	adcq	%rdx,%r9
+	movq	%r13,%r10
+	movq	%r14,%r11
+	movq	%r15,%r12
+	andq	$3,%r12
+	movq	%r15,%r13
+	andq	$-4,%r13
+	movq	%r9,%r14
+	shrdq	$2,%r9,%r15
+	shrq	$2,%r9
+	addq	%r13,%r15
+	adcq	%r14,%r9
+	addq	%r15,%r10
+	adcq	%r9,%r11
+	adcq	$0,%r12
+
+	leaq	32(%rdi),%rdi
+	decq	%rcx
+	jg	.Lseal_avx2_tail_256_rounds_and_3xhash
+	decq	%r8
+	jge	.Lseal_avx2_tail_256_rounds_and_2xhash
+	vpaddd	.Lchacha20_consts(%rip),%ymm1,%ymm1
+	vpaddd	0+64(%rbp),%ymm5,%ymm5
+	vpaddd	0+96(%rbp),%ymm9,%ymm9
+	vpaddd	0+192(%rbp),%ymm13,%ymm13
+	vpaddd	.Lchacha20_consts(%rip),%ymm0,%ymm0
+	vpaddd	0+64(%rbp),%ymm4,%ymm4
+	vpaddd	0+96(%rbp),%ymm8,%ymm8
+	vpaddd	0+160(%rbp),%ymm12,%ymm12
+	vperm2i128	$0x02,%ymm1,%ymm5,%ymm3
+	vperm2i128	$0x13,%ymm1,%ymm5,%ymm5
+	vperm2i128	$0x02,%ymm9,%ymm13,%ymm1
+	vperm2i128	$0x13,%ymm9,%ymm13,%ymm9
+	vpxor	0+0(%rsi),%ymm3,%ymm3
+	vpxor	32+0(%rsi),%ymm1,%ymm1
+	vpxor	64+0(%rsi),%ymm5,%ymm5
+	vpxor	96+0(%rsi),%ymm9,%ymm9
+	vmovdqu	%ymm3,0+0(%rdi)
+	vmovdqu	%ymm1,32+0(%rdi)
+	vmovdqu	%ymm5,64+0(%rdi)
+	vmovdqu	%ymm9,96+0(%rdi)
+	vperm2i128	$0x13,%ymm0,%ymm4,%ymm3
+	vperm2i128	$0x02,%ymm0,%ymm4,%ymm0
+	vperm2i128	$0x02,%ymm8,%ymm12,%ymm4
+	vperm2i128	$0x13,%ymm8,%ymm12,%ymm12
+	vmovdqa	%ymm3,%ymm8
+
+	movq	$128,%rcx
+	leaq	128(%rsi),%rsi
+	subq	$128,%rbx
+	jmp	.Lseal_avx2_short_hash_remainder
+
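+# At most 384 bytes left: three 2-block states.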
+.Lseal_avx2_tail_384:
+	vmovdqa	.Lchacha20_consts(%rip),%ymm0
+	vmovdqa	0+64(%rbp),%ymm4
+	vmovdqa	0+96(%rbp),%ymm8
+	vmovdqa	%ymm0,%ymm1
+	vmovdqa	%ymm4,%ymm5
+	vmovdqa	%ymm8,%ymm9
+	vmovdqa	%ymm0,%ymm2
+	vmovdqa	%ymm4,%ymm6
+	vmovdqa	%ymm8,%ymm10
+	vmovdqa	.Lavx2_inc(%rip),%ymm12
+	vpaddd	0+160(%rbp),%ymm12,%ymm14
+	vpaddd	%ymm14,%ymm12,%ymm13
+	vpaddd	%ymm13,%ymm12,%ymm12
+	vmovdqa	%ymm12,0+160(%rbp)
+	vmovdqa	%ymm13,0+192(%rbp)
+	vmovdqa	%ymm14,0+224(%rbp)
+
+.Lseal_avx2_tail_384_rounds_and_3xhash:
+	addq	0+0(%rdi),%r10
+	adcq	8+0(%rdi),%r11
+	adcq	$1,%r12
+	movq	0+0+0(%rbp),%rax
+	movq	%rax,%r15
+	mulq	%r10
+	movq	%rax,%r13
+	movq	%rdx,%r14
+	movq	0+0+0(%rbp),%rax
+	mulq	%r11
+	imulq	%r12,%r15
+	addq	%rax,%r14
+	adcq	%rdx,%r15
+	movq	8+0+0(%rbp),%rax
+	movq	%rax,%r9
+	mulq	%r10
+	addq	%rax,%r14
+	adcq	$0,%rdx
+	movq	%rdx,%r10
+	movq	8+0+0(%rbp),%rax
+	mulq	%r11
+	addq	%rax,%r15
+	adcq	$0,%rdx
+	imulq	%r12,%r9
+	addq	%r10,%r15
+	adcq	%rdx,%r9
+	movq	%r13,%r10
+	movq	%r14,%r11
+	movq	%r15,%r12
+	andq	$3,%r12
+	movq	%r15,%r13
+	andq	$-4,%r13
+	movq	%r9,%r14
+	shrdq	$2,%r9,%r15
+	shrq	$2,%r9
+	addq	%r13,%r15
+	adcq	%r14,%r9
+	addq	%r15,%r10
+	adcq	%r9,%r11
+	adcq	$0,%r12
+
+	leaq	16(%rdi),%rdi
+.Lseal_avx2_tail_384_rounds_and_2xhash:
+	vpaddd	%ymm4,%ymm0,%ymm0
+	vpxor	%ymm0,%ymm12,%ymm12
+	vpshufb	.Lrol16(%rip),%ymm12,%ymm12
+	vpaddd	%ymm12,%ymm8,%ymm8
+	vpxor	%ymm8,%ymm4,%ymm4
+	vpsrld	$20,%ymm4,%ymm3
+	vpslld	$12,%ymm4,%ymm4
+	vpxor	%ymm3,%ymm4,%ymm4
+	vpaddd	%ymm4,%ymm0,%ymm0
+	vpxor	%ymm0,%ymm12,%ymm12
+	vpshufb	.Lrol8(%rip),%ymm12,%ymm12
+	vpaddd	%ymm12,%ymm8,%ymm8
+	vpxor	%ymm8,%ymm4,%ymm4
+	vpslld	$7,%ymm4,%ymm3
+	vpsrld	$25,%ymm4,%ymm4
+	vpxor	%ymm3,%ymm4,%ymm4
+	vpalignr	$12,%ymm12,%ymm12,%ymm12
+	vpalignr	$8,%ymm8,%ymm8,%ymm8
+	vpalignr	$4,%ymm4,%ymm4,%ymm4
+	vpaddd	%ymm5,%ymm1,%ymm1
+	vpxor	%ymm1,%ymm13,%ymm13
+	vpshufb	.Lrol16(%rip),%ymm13,%ymm13
+	vpaddd	%ymm13,%ymm9,%ymm9
+	vpxor	%ymm9,%ymm5,%ymm5
+	vpsrld	$20,%ymm5,%ymm3
+	vpslld	$12,%ymm5,%ymm5
+	vpxor	%ymm3,%ymm5,%ymm5
+	vpaddd	%ymm5,%ymm1,%ymm1
+	vpxor	%ymm1,%ymm13,%ymm13
+	vpshufb	.Lrol8(%rip),%ymm13,%ymm13
+	vpaddd	%ymm13,%ymm9,%ymm9
+	vpxor	%ymm9,%ymm5,%ymm5
+	vpslld	$7,%ymm5,%ymm3
+	vpsrld	$25,%ymm5,%ymm5
+	vpxor	%ymm3,%ymm5,%ymm5
+	vpalignr	$12,%ymm13,%ymm13,%ymm13
+	vpalignr	$8,%ymm9,%ymm9,%ymm9
+	vpalignr	$4,%ymm5,%ymm5,%ymm5
+	addq	0+0(%rdi),%r10
+	adcq	8+0(%rdi),%r11
+	adcq	$1,%r12
+	movq	0+0+0(%rbp),%rax
+	movq	%rax,%r15
+	mulq	%r10
+	movq	%rax,%r13
+	movq	%rdx,%r14
+	movq	0+0+0(%rbp),%rax
+	mulq	%r11
+	imulq	%r12,%r15
+	addq	%rax,%r14
+	adcq	%rdx,%r15
+	movq	8+0+0(%rbp),%rax
+	movq	%rax,%r9
+	mulq	%r10
+	addq	%rax,%r14
+	adcq	$0,%rdx
+	movq	%rdx,%r10
+	movq	8+0+0(%rbp),%rax
+	mulq	%r11
+	addq	%rax,%r15
+	adcq	$0,%rdx
+	imulq	%r12,%r9
+	addq	%r10,%r15
+	adcq	%rdx,%r9
+	movq	%r13,%r10
+	movq	%r14,%r11
+	movq	%r15,%r12
+	andq	$3,%r12
+	movq	%r15,%r13
+	andq	$-4,%r13
+	movq	%r9,%r14
+	shrdq	$2,%r9,%r15
+	shrq	$2,%r9
+	addq	%r13,%r15
+	adcq	%r14,%r9
+	addq	%r15,%r10
+	adcq	%r9,%r11
+	adcq	$0,%r12
+	vpaddd	%ymm6,%ymm2,%ymm2
+	vpxor	%ymm2,%ymm14,%ymm14
+	vpshufb	.Lrol16(%rip),%ymm14,%ymm14
+	vpaddd	%ymm14,%ymm10,%ymm10
+	vpxor	%ymm10,%ymm6,%ymm6
+	vpsrld	$20,%ymm6,%ymm3
+	vpslld	$12,%ymm6,%ymm6
+	vpxor	%ymm3,%ymm6,%ymm6
+	vpaddd	%ymm6,%ymm2,%ymm2
+	vpxor	%ymm2,%ymm14,%ymm14
+	vpshufb	.Lrol8(%rip),%ymm14,%ymm14
+	vpaddd	%ymm14,%ymm10,%ymm10
+	vpxor	%ymm10,%ymm6,%ymm6
+	vpslld	$7,%ymm6,%ymm3
+	vpsrld	$25,%ymm6,%ymm6
+	vpxor	%ymm3,%ymm6,%ymm6
+	vpalignr	$12,%ymm14,%ymm14,%ymm14
+	vpalignr	$8,%ymm10,%ymm10,%ymm10
+	vpalignr	$4,%ymm6,%ymm6,%ymm6
+	vpaddd	%ymm4,%ymm0,%ymm0
+	vpxor	%ymm0,%ymm12,%ymm12
+	vpshufb	.Lrol16(%rip),%ymm12,%ymm12
+	vpaddd	%ymm12,%ymm8,%ymm8
+	vpxor	%ymm8,%ymm4,%ymm4
+	vpsrld	$20,%ymm4,%ymm3
+	vpslld	$12,%ymm4,%ymm4
+	vpxor	%ymm3,%ymm4,%ymm4
+	vpaddd	%ymm4,%ymm0,%ymm0
+	vpxor	%ymm0,%ymm12,%ymm12
+	vpshufb	.Lrol8(%rip),%ymm12,%ymm12
+	vpaddd	%ymm12,%ymm8,%ymm8
+	vpxor	%ymm8,%ymm4,%ymm4
+	vpslld	$7,%ymm4,%ymm3
+	vpsrld	$25,%ymm4,%ymm4
+	vpxor	%ymm3,%ymm4,%ymm4
+	vpalignr	$4,%ymm12,%ymm12,%ymm12
+	vpalignr	$8,%ymm8,%ymm8,%ymm8
+	vpalignr	$12,%ymm4,%ymm4,%ymm4
+	addq	0+16(%rdi),%r10
+	adcq	8+16(%rdi),%r11
+	adcq	$1,%r12
+	movq	0+0+0(%rbp),%rax
+	movq	%rax,%r15
+	mulq	%r10
+	movq	%rax,%r13
+	movq	%rdx,%r14
+	movq	0+0+0(%rbp),%rax
+	mulq	%r11
+	imulq	%r12,%r15
+	addq	%rax,%r14
+	adcq	%rdx,%r15
+	movq	8+0+0(%rbp),%rax
+	movq	%rax,%r9
+	mulq	%r10
+	addq	%rax,%r14
+	adcq	$0,%rdx
+	movq	%rdx,%r10
+	movq	8+0+0(%rbp),%rax
+	mulq	%r11
+	addq	%rax,%r15
+	adcq	$0,%rdx
+	imulq	%r12,%r9
+	addq	%r10,%r15
+	adcq	%rdx,%r9
+	movq	%r13,%r10
+	movq	%r14,%r11
+	movq	%r15,%r12
+	andq	$3,%r12
+	movq	%r15,%r13
+	andq	$-4,%r13
+	movq	%r9,%r14
+	shrdq	$2,%r9,%r15
+	shrq	$2,%r9
+	addq	%r13,%r15
+	adcq	%r14,%r9
+	addq	%r15,%r10
+	adcq	%r9,%r11
+	adcq	$0,%r12
+	vpaddd	%ymm5,%ymm1,%ymm1
+	vpxor	%ymm1,%ymm13,%ymm13
+	vpshufb	.Lrol16(%rip),%ymm13,%ymm13
+	vpaddd	%ymm13,%ymm9,%ymm9
+	vpxor	%ymm9,%ymm5,%ymm5
+	vpsrld	$20,%ymm5,%ymm3
+	vpslld	$12,%ymm5,%ymm5
+	vpxor	%ymm3,%ymm5,%ymm5
+	vpaddd	%ymm5,%ymm1,%ymm1
+	vpxor	%ymm1,%ymm13,%ymm13
+	vpshufb	.Lrol8(%rip),%ymm13,%ymm13
+	vpaddd	%ymm13,%ymm9,%ymm9
+	vpxor	%ymm9,%ymm5,%ymm5
+	vpslld	$7,%ymm5,%ymm3
+	vpsrld	$25,%ymm5,%ymm5
+	vpxor	%ymm3,%ymm5,%ymm5
+	vpalignr	$4,%ymm13,%ymm13,%ymm13
+	vpalignr	$8,%ymm9,%ymm9,%ymm9
+	vpalignr	$12,%ymm5,%ymm5,%ymm5
+	vpaddd	%ymm6,%ymm2,%ymm2
+	vpxor	%ymm2,%ymm14,%ymm14
+	vpshufb	.Lrol16(%rip),%ymm14,%ymm14
+	vpaddd	%ymm14,%ymm10,%ymm10
+	vpxor	%ymm10,%ymm6,%ymm6
+	vpsrld	$20,%ymm6,%ymm3
+	vpslld	$12,%ymm6,%ymm6
+	vpxor	%ymm3,%ymm6,%ymm6
+	vpaddd	%ymm6,%ymm2,%ymm2
+	vpxor	%ymm2,%ymm14,%ymm14
+	vpshufb	.Lrol8(%rip),%ymm14,%ymm14
+	vpaddd	%ymm14,%ymm10,%ymm10
+	vpxor	%ymm10,%ymm6,%ymm6
+	vpslld	$7,%ymm6,%ymm3
+	vpsrld	$25,%ymm6,%ymm6
+	vpxor	%ymm3,%ymm6,%ymm6
+	vpalignr	$4,%ymm14,%ymm14,%ymm14
+	vpalignr	$8,%ymm10,%ymm10,%ymm10
+	vpalignr	$12,%ymm6,%ymm6,%ymm6
+
+	leaq	32(%rdi),%rdi
+	decq	%rcx
+	jg	.Lseal_avx2_tail_384_rounds_and_3xhash
+	decq	%r8
+	jge	.Lseal_avx2_tail_384_rounds_and_2xhash
+	vpaddd	.Lchacha20_consts(%rip),%ymm2,%ymm2
+	vpaddd	0+64(%rbp),%ymm6,%ymm6
+	vpaddd	0+96(%rbp),%ymm10,%ymm10
+	vpaddd	0+224(%rbp),%ymm14,%ymm14
+	vpaddd	.Lchacha20_consts(%rip),%ymm1,%ymm1
+	vpaddd	0+64(%rbp),%ymm5,%ymm5
+	vpaddd	0+96(%rbp),%ymm9,%ymm9
+	vpaddd	0+192(%rbp),%ymm13,%ymm13
+	vpaddd	.Lchacha20_consts(%rip),%ymm0,%ymm0
+	vpaddd	0+64(%rbp),%ymm4,%ymm4
+	vpaddd	0+96(%rbp),%ymm8,%ymm8
+	vpaddd	0+160(%rbp),%ymm12,%ymm12
+	vperm2i128	$0x02,%ymm2,%ymm6,%ymm3
+	vperm2i128	$0x13,%ymm2,%ymm6,%ymm6
+	vperm2i128	$0x02,%ymm10,%ymm14,%ymm2
+	vperm2i128	$0x13,%ymm10,%ymm14,%ymm10
+	vpxor	0+0(%rsi),%ymm3,%ymm3
+	vpxor	32+0(%rsi),%ymm2,%ymm2
+	vpxor	64+0(%rsi),%ymm6,%ymm6
+	vpxor	96+0(%rsi),%ymm10,%ymm10
+	vmovdqu	%ymm3,0+0(%rdi)
+	vmovdqu	%ymm2,32+0(%rdi)
+	vmovdqu	%ymm6,64+0(%rdi)
+	vmovdqu	%ymm10,96+0(%rdi)
+	vperm2i128	$0x02,%ymm1,%ymm5,%ymm3
+	vperm2i128	$0x13,%ymm1,%ymm5,%ymm5
+	vperm2i128	$0x02,%ymm9,%ymm13,%ymm1
+	vperm2i128	$0x13,%ymm9,%ymm13,%ymm9
+	vpxor	0+128(%rsi),%ymm3,%ymm3
+	vpxor	32+128(%rsi),%ymm1,%ymm1
+	vpxor	64+128(%rsi),%ymm5,%ymm5
+	vpxor	96+128(%rsi),%ymm9,%ymm9
+	vmovdqu	%ymm3,0+128(%rdi)
+	vmovdqu	%ymm1,32+128(%rdi)
+	vmovdqu	%ymm5,64+128(%rdi)
+	vmovdqu	%ymm9,96+128(%rdi)
+	vperm2i128	$0x13,%ymm0,%ymm4,%ymm3
+	vperm2i128	$0x02,%ymm0,%ymm4,%ymm0
+	vperm2i128	$0x02,%ymm8,%ymm12,%ymm4
+	vperm2i128	$0x13,%ymm8,%ymm12,%ymm12
+	vmovdqa	%ymm3,%ymm8
+
+	movq	$256,%rcx
+	leaq	256(%rsi),%rsi
+	subq	$256,%rbx
+	jmp	.Lseal_avx2_short_hash_remainder
+
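+# At most 512 bytes left: all four 2-block states, with the same mulx Poly1305
+# interleave as the main loop.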
+.Lseal_avx2_tail_512:
+	vmovdqa	.Lchacha20_consts(%rip),%ymm0
+	vmovdqa	0+64(%rbp),%ymm4
+	vmovdqa	0+96(%rbp),%ymm8
+	vmovdqa	%ymm0,%ymm1
+	vmovdqa	%ymm4,%ymm5
+	vmovdqa	%ymm8,%ymm9
+	vmovdqa	%ymm0,%ymm2
+	vmovdqa	%ymm4,%ymm6
+	vmovdqa	%ymm8,%ymm10
+	vmovdqa	%ymm0,%ymm3
+	vmovdqa	%ymm4,%ymm7
+	vmovdqa	%ymm8,%ymm11
+	vmovdqa	.Lavx2_inc(%rip),%ymm12
+	vpaddd	0+160(%rbp),%ymm12,%ymm15
+	vpaddd	%ymm15,%ymm12,%ymm14
+	vpaddd	%ymm14,%ymm12,%ymm13
+	vpaddd	%ymm13,%ymm12,%ymm12
+	vmovdqa	%ymm15,0+256(%rbp)
+	vmovdqa	%ymm14,0+224(%rbp)
+	vmovdqa	%ymm13,0+192(%rbp)
+	vmovdqa	%ymm12,0+160(%rbp)
+
+.Lseal_avx2_tail_512_rounds_and_3xhash:
+	addq	0+0(%rdi),%r10
+	adcq	8+0(%rdi),%r11
+	adcq	$1,%r12
+	movq	0+0+0(%rbp),%rdx
+	movq	%rdx,%r15
+	mulxq	%r10,%r13,%r14
+	mulxq	%r11,%rax,%rdx
+	imulq	%r12,%r15
+	addq	%rax,%r14
+	adcq	%rdx,%r15
+	movq	8+0+0(%rbp),%rdx
+	mulxq	%r10,%r10,%rax
+	addq	%r10,%r14
+	mulxq	%r11,%r11,%r9
+	adcq	%r11,%r15
+	adcq	$0,%r9
+	imulq	%r12,%rdx
+	addq	%rax,%r15
+	adcq	%rdx,%r9
+	movq	%r13,%r10
+	movq	%r14,%r11
+	movq	%r15,%r12
+	andq	$3,%r12
+	movq	%r15,%r13
+	andq	$-4,%r13
+	movq	%r9,%r14
+	shrdq	$2,%r9,%r15
+	shrq	$2,%r9
+	addq	%r13,%r15
+	adcq	%r14,%r9
+	addq	%r15,%r10
+	adcq	%r9,%r11
+	adcq	$0,%r12
+
+	leaq	16(%rdi),%rdi
+.Lseal_avx2_tail_512_rounds_and_2xhash:
+	vmovdqa	%ymm8,0+128(%rbp)
+	vmovdqa	.Lrol16(%rip),%ymm8
+	vpaddd	%ymm7,%ymm3,%ymm3
+	vpaddd	%ymm6,%ymm2,%ymm2
+	vpaddd	%ymm5,%ymm1,%ymm1
+	vpaddd	%ymm4,%ymm0,%ymm0
+	vpxor	%ymm3,%ymm15,%ymm15
+	vpxor	%ymm2,%ymm14,%ymm14
+	vpxor	%ymm1,%ymm13,%ymm13
+	vpxor	%ymm0,%ymm12,%ymm12
+	vpshufb	%ymm8,%ymm15,%ymm15
+	vpshufb	%ymm8,%ymm14,%ymm14
+	vpshufb	%ymm8,%ymm13,%ymm13
+	vpshufb	%ymm8,%ymm12,%ymm12
+	vpaddd	%ymm15,%ymm11,%ymm11
+	vpaddd	%ymm14,%ymm10,%ymm10
+	vpaddd	%ymm13,%ymm9,%ymm9
+	vpaddd	0+128(%rbp),%ymm12,%ymm8
+	vpxor	%ymm11,%ymm7,%ymm7
+	vpxor	%ymm10,%ymm6,%ymm6
+	addq	0+0(%rdi),%r10
+	adcq	8+0(%rdi),%r11
+	adcq	$1,%r12
+	vpxor	%ymm9,%ymm5,%ymm5
+	vpxor	%ymm8,%ymm4,%ymm4
+	vmovdqa	%ymm8,0+128(%rbp)
+	vpsrld	$20,%ymm7,%ymm8
+	vpslld	$32-20,%ymm7,%ymm7
+	vpxor	%ymm8,%ymm7,%ymm7
+	vpsrld	$20,%ymm6,%ymm8
+	vpslld	$32-20,%ymm6,%ymm6
+	vpxor	%ymm8,%ymm6,%ymm6
+	vpsrld	$20,%ymm5,%ymm8
+	vpslld	$32-20,%ymm5,%ymm5
+	vpxor	%ymm8,%ymm5,%ymm5
+	vpsrld	$20,%ymm4,%ymm8
+	vpslld	$32-20,%ymm4,%ymm4
+	vpxor	%ymm8,%ymm4,%ymm4
+	vmovdqa	.Lrol8(%rip),%ymm8
+	vpaddd	%ymm7,%ymm3,%ymm3
+	vpaddd	%ymm6,%ymm2,%ymm2
+	vpaddd	%ymm5,%ymm1,%ymm1
+	vpaddd	%ymm4,%ymm0,%ymm0
+	movq	0+0+0(%rbp),%rdx
+	movq	%rdx,%r15
+	mulxq	%r10,%r13,%r14
+	mulxq	%r11,%rax,%rdx
+	imulq	%r12,%r15
+	addq	%rax,%r14
+	adcq	%rdx,%r15
+	vpxor	%ymm3,%ymm15,%ymm15
+	vpxor	%ymm2,%ymm14,%ymm14
+	vpxor	%ymm1,%ymm13,%ymm13
+	vpxor	%ymm0,%ymm12,%ymm12
+	vpshufb	%ymm8,%ymm15,%ymm15
+	vpshufb	%ymm8,%ymm14,%ymm14
+	vpshufb	%ymm8,%ymm13,%ymm13
+	vpshufb	%ymm8,%ymm12,%ymm12
+	vpaddd	%ymm15,%ymm11,%ymm11
+	vpaddd	%ymm14,%ymm10,%ymm10
+	vpaddd	%ymm13,%ymm9,%ymm9
+	vpaddd	0+128(%rbp),%ymm12,%ymm8
+	vpxor	%ymm11,%ymm7,%ymm7
+	vpxor	%ymm10,%ymm6,%ymm6
+	vpxor	%ymm9,%ymm5,%ymm5
+	vpxor	%ymm8,%ymm4,%ymm4
+	vmovdqa	%ymm8,0+128(%rbp)
+	vpsrld	$25,%ymm7,%ymm8
+	vpslld	$32-25,%ymm7,%ymm7
+	vpxor	%ymm8,%ymm7,%ymm7
+	movq	8+0+0(%rbp),%rdx
+	mulxq	%r10,%r10,%rax
+	addq	%r10,%r14
+	mulxq	%r11,%r11,%r9
+	adcq	%r11,%r15
+	adcq	$0,%r9
+	imulq	%r12,%rdx
+	vpsrld	$25,%ymm6,%ymm8
+	vpslld	$32-25,%ymm6,%ymm6
+	vpxor	%ymm8,%ymm6,%ymm6
+	vpsrld	$25,%ymm5,%ymm8
+	vpslld	$32-25,%ymm5,%ymm5
+	vpxor	%ymm8,%ymm5,%ymm5
+	vpsrld	$25,%ymm4,%ymm8
+	vpslld	$32-25,%ymm4,%ymm4
+	vpxor	%ymm8,%ymm4,%ymm4
+	vmovdqa	0+128(%rbp),%ymm8
+	vpalignr	$4,%ymm7,%ymm7,%ymm7
+	vpalignr	$8,%ymm11,%ymm11,%ymm11
+	vpalignr	$12,%ymm15,%ymm15,%ymm15
+	vpalignr	$4,%ymm6,%ymm6,%ymm6
+	vpalignr	$8,%ymm10,%ymm10,%ymm10
+	vpalignr	$12,%ymm14,%ymm14,%ymm14
+	vpalignr	$4,%ymm5,%ymm5,%ymm5
+	vpalignr	$8,%ymm9,%ymm9,%ymm9
+	vpalignr	$12,%ymm13,%ymm13,%ymm13
+	vpalignr	$4,%ymm4,%ymm4,%ymm4
+	addq	%rax,%r15
+	adcq	%rdx,%r9
+	vpalignr	$8,%ymm8,%ymm8,%ymm8
+	vpalignr	$12,%ymm12,%ymm12,%ymm12
+	vmovdqa	%ymm8,0+128(%rbp)
+	vmovdqa	.Lrol16(%rip),%ymm8
+	vpaddd	%ymm7,%ymm3,%ymm3
+	vpaddd	%ymm6,%ymm2,%ymm2
+	vpaddd	%ymm5,%ymm1,%ymm1
+	vpaddd	%ymm4,%ymm0,%ymm0
+	vpxor	%ymm3,%ymm15,%ymm15
+	vpxor	%ymm2,%ymm14,%ymm14
+	vpxor	%ymm1,%ymm13,%ymm13
+	vpxor	%ymm0,%ymm12,%ymm12
+	vpshufb	%ymm8,%ymm15,%ymm15
+	vpshufb	%ymm8,%ymm14,%ymm14
+	vpshufb	%ymm8,%ymm13,%ymm13
+	vpshufb	%ymm8,%ymm12,%ymm12
+	vpaddd	%ymm15,%ymm11,%ymm11
+	vpaddd	%ymm14,%ymm10,%ymm10
+	vpaddd	%ymm13,%ymm9,%ymm9
+	vpaddd	0+128(%rbp),%ymm12,%ymm8
+	movq	%r13,%r10
+	movq	%r14,%r11
+	movq	%r15,%r12
+	andq	$3,%r12
+	movq	%r15,%r13
+	andq	$-4,%r13
+	movq	%r9,%r14
+	shrdq	$2,%r9,%r15
+	shrq	$2,%r9
+	addq	%r13,%r15
+	adcq	%r14,%r9
+	addq	%r15,%r10
+	adcq	%r9,%r11
+	adcq	$0,%r12
+	vpxor	%ymm11,%ymm7,%ymm7
+	vpxor	%ymm10,%ymm6,%ymm6
+	vpxor	%ymm9,%ymm5,%ymm5
+	vpxor	%ymm8,%ymm4,%ymm4
+	vmovdqa	%ymm8,0+128(%rbp)
+	vpsrld	$20,%ymm7,%ymm8
+	vpslld	$32-20,%ymm7,%ymm7
+	vpxor	%ymm8,%ymm7,%ymm7
+	vpsrld	$20,%ymm6,%ymm8
+	vpslld	$32-20,%ymm6,%ymm6
+	vpxor	%ymm8,%ymm6,%ymm6
+	vpsrld	$20,%ymm5,%ymm8
+	vpslld	$32-20,%ymm5,%ymm5
+	vpxor	%ymm8,%ymm5,%ymm5
+	vpsrld	$20,%ymm4,%ymm8
+	vpslld	$32-20,%ymm4,%ymm4
+	vpxor	%ymm8,%ymm4,%ymm4
+	vmovdqa	.Lrol8(%rip),%ymm8
+	vpaddd	%ymm7,%ymm3,%ymm3
+	vpaddd	%ymm6,%ymm2,%ymm2
+	addq	0+16(%rdi),%r10
+	adcq	8+16(%rdi),%r11
+	adcq	$1,%r12
+	vpaddd	%ymm5,%ymm1,%ymm1
+	vpaddd	%ymm4,%ymm0,%ymm0
+	vpxor	%ymm3,%ymm15,%ymm15
+	vpxor	%ymm2,%ymm14,%ymm14
+	vpxor	%ymm1,%ymm13,%ymm13
+	vpxor	%ymm0,%ymm12,%ymm12
+	vpshufb	%ymm8,%ymm15,%ymm15
+	vpshufb	%ymm8,%ymm14,%ymm14
+	vpshufb	%ymm8,%ymm13,%ymm13
+	vpshufb	%ymm8,%ymm12,%ymm12
+	vpaddd	%ymm15,%ymm11,%ymm11
+	vpaddd	%ymm14,%ymm10,%ymm10
+	vpaddd	%ymm13,%ymm9,%ymm9
+	vpaddd	0+128(%rbp),%ymm12,%ymm8
+	vpxor	%ymm11,%ymm7,%ymm7
+	vpxor	%ymm10,%ymm6,%ymm6
+	vpxor	%ymm9,%ymm5,%ymm5
+	vpxor	%ymm8,%ymm4,%ymm4
+	vmovdqa	%ymm8,0+128(%rbp)
+	vpsrld	$25,%ymm7,%ymm8
+	movq	0+0+0(%rbp),%rdx
+	movq	%rdx,%r15
+	mulxq	%r10,%r13,%r14
+	mulxq	%r11,%rax,%rdx
+	imulq	%r12,%r15
+	addq	%rax,%r14
+	adcq	%rdx,%r15
+	vpslld	$32-25,%ymm7,%ymm7
+	vpxor	%ymm8,%ymm7,%ymm7
+	vpsrld	$25,%ymm6,%ymm8
+	vpslld	$32-25,%ymm6,%ymm6
+	vpxor	%ymm8,%ymm6,%ymm6
+	vpsrld	$25,%ymm5,%ymm8
+	vpslld	$32-25,%ymm5,%ymm5
+	vpxor	%ymm8,%ymm5,%ymm5
+	vpsrld	$25,%ymm4,%ymm8
+	vpslld	$32-25,%ymm4,%ymm4
+	vpxor	%ymm8,%ymm4,%ymm4
+	vmovdqa	0+128(%rbp),%ymm8
+	vpalignr	$12,%ymm7,%ymm7,%ymm7
+	vpalignr	$8,%ymm11,%ymm11,%ymm11
+	vpalignr	$4,%ymm15,%ymm15,%ymm15
+	vpalignr	$12,%ymm6,%ymm6,%ymm6
+	vpalignr	$8,%ymm10,%ymm10,%ymm10
+	vpalignr	$4,%ymm14,%ymm14,%ymm14
+	vpalignr	$12,%ymm5,%ymm5,%ymm5
+	vpalignr	$8,%ymm9,%ymm9,%ymm9
+	movq	8+0+0(%rbp),%rdx
+	mulxq	%r10,%r10,%rax
+	addq	%r10,%r14
+	mulxq	%r11,%r11,%r9
+	adcq	%r11,%r15
+	adcq	$0,%r9
+	imulq	%r12,%rdx
+	vpalignr	$4,%ymm13,%ymm13,%ymm13
+	vpalignr	$12,%ymm4,%ymm4,%ymm4
+	vpalignr	$8,%ymm8,%ymm8,%ymm8
+	vpalignr	$4,%ymm12,%ymm12,%ymm12
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+	addq	%rax,%r15
+	adcq	%rdx,%r9
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+	movq	%r13,%r10
+	movq	%r14,%r11
+	movq	%r15,%r12
+	andq	$3,%r12
+	movq	%r15,%r13
+	andq	$-4,%r13
+	movq	%r9,%r14
+	shrdq	$2,%r9,%r15
+	shrq	$2,%r9
+	addq	%r13,%r15
+	adcq	%r14,%r9
+	addq	%r15,%r10
+	adcq	%r9,%r11
+	adcq	$0,%r12
+
+	leaq	32(%rdi),%rdi
+	decq	%rcx
+	jg	.Lseal_avx2_tail_512_rounds_and_3xhash
+	decq	%r8
+	jge	.Lseal_avx2_tail_512_rounds_and_2xhash
+	vpaddd	.Lchacha20_consts(%rip),%ymm3,%ymm3
+	vpaddd	0+64(%rbp),%ymm7,%ymm7
+	vpaddd	0+96(%rbp),%ymm11,%ymm11
+	vpaddd	0+256(%rbp),%ymm15,%ymm15
+	vpaddd	.Lchacha20_consts(%rip),%ymm2,%ymm2
+	vpaddd	0+64(%rbp),%ymm6,%ymm6
+	vpaddd	0+96(%rbp),%ymm10,%ymm10
+	vpaddd	0+224(%rbp),%ymm14,%ymm14
+	vpaddd	.Lchacha20_consts(%rip),%ymm1,%ymm1
+	vpaddd	0+64(%rbp),%ymm5,%ymm5
+	vpaddd	0+96(%rbp),%ymm9,%ymm9
+	vpaddd	0+192(%rbp),%ymm13,%ymm13
+	vpaddd	.Lchacha20_consts(%rip),%ymm0,%ymm0
+	vpaddd	0+64(%rbp),%ymm4,%ymm4
+	vpaddd	0+96(%rbp),%ymm8,%ymm8
+	vpaddd	0+160(%rbp),%ymm12,%ymm12
+
+	vmovdqa	%ymm0,0+128(%rbp)
+	vperm2i128	$0x02,%ymm3,%ymm7,%ymm0
+	vperm2i128	$0x13,%ymm3,%ymm7,%ymm7
+	vperm2i128	$0x02,%ymm11,%ymm15,%ymm3
+	vperm2i128	$0x13,%ymm11,%ymm15,%ymm11
+	vpxor	0+0(%rsi),%ymm0,%ymm0
+	vpxor	32+0(%rsi),%ymm3,%ymm3
+	vpxor	64+0(%rsi),%ymm7,%ymm7
+	vpxor	96+0(%rsi),%ymm11,%ymm11
+	vmovdqu	%ymm0,0+0(%rdi)
+	vmovdqu	%ymm3,32+0(%rdi)
+	vmovdqu	%ymm7,64+0(%rdi)
+	vmovdqu	%ymm11,96+0(%rdi)
+
+	vmovdqa	0+128(%rbp),%ymm0
+	vperm2i128	$0x02,%ymm2,%ymm6,%ymm3
+	vperm2i128	$0x13,%ymm2,%ymm6,%ymm6
+	vperm2i128	$0x02,%ymm10,%ymm14,%ymm2
+	vperm2i128	$0x13,%ymm10,%ymm14,%ymm10
+	vpxor	0+128(%rsi),%ymm3,%ymm3
+	vpxor	32+128(%rsi),%ymm2,%ymm2
+	vpxor	64+128(%rsi),%ymm6,%ymm6
+	vpxor	96+128(%rsi),%ymm10,%ymm10
+	vmovdqu	%ymm3,0+128(%rdi)
+	vmovdqu	%ymm2,32+128(%rdi)
+	vmovdqu	%ymm6,64+128(%rdi)
+	vmovdqu	%ymm10,96+128(%rdi)
+	vperm2i128	$0x02,%ymm1,%ymm5,%ymm3
+	vperm2i128	$0x13,%ymm1,%ymm5,%ymm5
+	vperm2i128	$0x02,%ymm9,%ymm13,%ymm1
+	vperm2i128	$0x13,%ymm9,%ymm13,%ymm9
+	vpxor	0+256(%rsi),%ymm3,%ymm3
+	vpxor	32+256(%rsi),%ymm1,%ymm1
+	vpxor	64+256(%rsi),%ymm5,%ymm5
+	vpxor	96+256(%rsi),%ymm9,%ymm9
+	vmovdqu	%ymm3,0+256(%rdi)
+	vmovdqu	%ymm1,32+256(%rdi)
+	vmovdqu	%ymm5,64+256(%rdi)
+	vmovdqu	%ymm9,96+256(%rdi)
+	vperm2i128	$0x13,%ymm0,%ymm4,%ymm3
+	vperm2i128	$0x02,%ymm0,%ymm4,%ymm0
+	vperm2i128	$0x02,%ymm8,%ymm12,%ymm4
+	vperm2i128	$0x13,%ymm8,%ymm12,%ymm12
+	vmovdqa	%ymm3,%ymm8
+
+	movq	$384,%rcx
+	leaq	384(%rsi),%rsi
+	subq	$384,%rbx
+	jmp	.Lseal_avx2_short_hash_remainder
+
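+# Messages of 193-320 bytes: generate three 2-block states up front with no
+# hash interleave; the Poly1305 key is clamped out of the first block below.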
+.Lseal_avx2_320:
+	vmovdqa	%ymm0,%ymm1
+	vmovdqa	%ymm0,%ymm2
+	vmovdqa	%ymm4,%ymm5
+	vmovdqa	%ymm4,%ymm6
+	vmovdqa	%ymm8,%ymm9
+	vmovdqa	%ymm8,%ymm10
+	vpaddd	.Lavx2_inc(%rip),%ymm12,%ymm13
+	vpaddd	.Lavx2_inc(%rip),%ymm13,%ymm14
+	vmovdqa	%ymm4,%ymm7
+	vmovdqa	%ymm8,%ymm11
+	vmovdqa	%ymm12,0+160(%rbp)
+	vmovdqa	%ymm13,0+192(%rbp)
+	vmovdqa	%ymm14,0+224(%rbp)
+	movq	$10,%r10
+.Lseal_avx2_320_rounds:
+	vpaddd	%ymm4,%ymm0,%ymm0
+	vpxor	%ymm0,%ymm12,%ymm12
+	vpshufb	.Lrol16(%rip),%ymm12,%ymm12
+	vpaddd	%ymm12,%ymm8,%ymm8
+	vpxor	%ymm8,%ymm4,%ymm4
+	vpsrld	$20,%ymm4,%ymm3
+	vpslld	$12,%ymm4,%ymm4
+	vpxor	%ymm3,%ymm4,%ymm4
+	vpaddd	%ymm4,%ymm0,%ymm0
+	vpxor	%ymm0,%ymm12,%ymm12
+	vpshufb	.Lrol8(%rip),%ymm12,%ymm12
+	vpaddd	%ymm12,%ymm8,%ymm8
+	vpxor	%ymm8,%ymm4,%ymm4
+	vpslld	$7,%ymm4,%ymm3
+	vpsrld	$25,%ymm4,%ymm4
+	vpxor	%ymm3,%ymm4,%ymm4
+	vpalignr	$12,%ymm12,%ymm12,%ymm12
+	vpalignr	$8,%ymm8,%ymm8,%ymm8
+	vpalignr	$4,%ymm4,%ymm4,%ymm4
+	vpaddd	%ymm5,%ymm1,%ymm1
+	vpxor	%ymm1,%ymm13,%ymm13
+	vpshufb	.Lrol16(%rip),%ymm13,%ymm13
+	vpaddd	%ymm13,%ymm9,%ymm9
+	vpxor	%ymm9,%ymm5,%ymm5
+	vpsrld	$20,%ymm5,%ymm3
+	vpslld	$12,%ymm5,%ymm5
+	vpxor	%ymm3,%ymm5,%ymm5
+	vpaddd	%ymm5,%ymm1,%ymm1
+	vpxor	%ymm1,%ymm13,%ymm13
+	vpshufb	.Lrol8(%rip),%ymm13,%ymm13
+	vpaddd	%ymm13,%ymm9,%ymm9
+	vpxor	%ymm9,%ymm5,%ymm5
+	vpslld	$7,%ymm5,%ymm3
+	vpsrld	$25,%ymm5,%ymm5
+	vpxor	%ymm3,%ymm5,%ymm5
+	vpalignr	$12,%ymm13,%ymm13,%ymm13
+	vpalignr	$8,%ymm9,%ymm9,%ymm9
+	vpalignr	$4,%ymm5,%ymm5,%ymm5
+	vpaddd	%ymm6,%ymm2,%ymm2
+	vpxor	%ymm2,%ymm14,%ymm14
+	vpshufb	.Lrol16(%rip),%ymm14,%ymm14
+	vpaddd	%ymm14,%ymm10,%ymm10
+	vpxor	%ymm10,%ymm6,%ymm6
+	vpsrld	$20,%ymm6,%ymm3
+	vpslld	$12,%ymm6,%ymm6
+	vpxor	%ymm3,%ymm6,%ymm6
+	vpaddd	%ymm6,%ymm2,%ymm2
+	vpxor	%ymm2,%ymm14,%ymm14
+	vpshufb	.Lrol8(%rip),%ymm14,%ymm14
+	vpaddd	%ymm14,%ymm10,%ymm10
+	vpxor	%ymm10,%ymm6,%ymm6
+	vpslld	$7,%ymm6,%ymm3
+	vpsrld	$25,%ymm6,%ymm6
+	vpxor	%ymm3,%ymm6,%ymm6
+	vpalignr	$12,%ymm14,%ymm14,%ymm14
+	vpalignr	$8,%ymm10,%ymm10,%ymm10
+	vpalignr	$4,%ymm6,%ymm6,%ymm6
+	vpaddd	%ymm4,%ymm0,%ymm0
+	vpxor	%ymm0,%ymm12,%ymm12
+	vpshufb	.Lrol16(%rip),%ymm12,%ymm12
+	vpaddd	%ymm12,%ymm8,%ymm8
+	vpxor	%ymm8,%ymm4,%ymm4
+	vpsrld	$20,%ymm4,%ymm3
+	vpslld	$12,%ymm4,%ymm4
+	vpxor	%ymm3,%ymm4,%ymm4
+	vpaddd	%ymm4,%ymm0,%ymm0
+	vpxor	%ymm0,%ymm12,%ymm12
+	vpshufb	.Lrol8(%rip),%ymm12,%ymm12
+	vpaddd	%ymm12,%ymm8,%ymm8
+	vpxor	%ymm8,%ymm4,%ymm4
+	vpslld	$7,%ymm4,%ymm3
+	vpsrld	$25,%ymm4,%ymm4
+	vpxor	%ymm3,%ymm4,%ymm4
+	vpalignr	$4,%ymm12,%ymm12,%ymm12
+	vpalignr	$8,%ymm8,%ymm8,%ymm8
+	vpalignr	$12,%ymm4,%ymm4,%ymm4
+	vpaddd	%ymm5,%ymm1,%ymm1
+	vpxor	%ymm1,%ymm13,%ymm13
+	vpshufb	.Lrol16(%rip),%ymm13,%ymm13
+	vpaddd	%ymm13,%ymm9,%ymm9
+	vpxor	%ymm9,%ymm5,%ymm5
+	vpsrld	$20,%ymm5,%ymm3
+	vpslld	$12,%ymm5,%ymm5
+	vpxor	%ymm3,%ymm5,%ymm5
+	vpaddd	%ymm5,%ymm1,%ymm1
+	vpxor	%ymm1,%ymm13,%ymm13
+	vpshufb	.Lrol8(%rip),%ymm13,%ymm13
+	vpaddd	%ymm13,%ymm9,%ymm9
+	vpxor	%ymm9,%ymm5,%ymm5
+	vpslld	$7,%ymm5,%ymm3
+	vpsrld	$25,%ymm5,%ymm5
+	vpxor	%ymm3,%ymm5,%ymm5
+	vpalignr	$4,%ymm13,%ymm13,%ymm13
+	vpalignr	$8,%ymm9,%ymm9,%ymm9
+	vpalignr	$12,%ymm5,%ymm5,%ymm5
+	vpaddd	%ymm6,%ymm2,%ymm2
+	vpxor	%ymm2,%ymm14,%ymm14
+	vpshufb	.Lrol16(%rip),%ymm14,%ymm14
+	vpaddd	%ymm14,%ymm10,%ymm10
+	vpxor	%ymm10,%ymm6,%ymm6
+	vpsrld	$20,%ymm6,%ymm3
+	vpslld	$12,%ymm6,%ymm6
+	vpxor	%ymm3,%ymm6,%ymm6
+	vpaddd	%ymm6,%ymm2,%ymm2
+	vpxor	%ymm2,%ymm14,%ymm14
+	vpshufb	.Lrol8(%rip),%ymm14,%ymm14
+	vpaddd	%ymm14,%ymm10,%ymm10
+	vpxor	%ymm10,%ymm6,%ymm6
+	vpslld	$7,%ymm6,%ymm3
+	vpsrld	$25,%ymm6,%ymm6
+	vpxor	%ymm3,%ymm6,%ymm6
+	vpalignr	$4,%ymm14,%ymm14,%ymm14
+	vpalignr	$8,%ymm10,%ymm10,%ymm10
+	vpalignr	$12,%ymm6,%ymm6,%ymm6
+
+	decq	%r10
+	jne	.Lseal_avx2_320_rounds
+	vpaddd	.Lchacha20_consts(%rip),%ymm0,%ymm0
+	vpaddd	.Lchacha20_consts(%rip),%ymm1,%ymm1
+	vpaddd	.Lchacha20_consts(%rip),%ymm2,%ymm2
+	vpaddd	%ymm7,%ymm4,%ymm4
+	vpaddd	%ymm7,%ymm5,%ymm5
+	vpaddd	%ymm7,%ymm6,%ymm6
+	vpaddd	%ymm11,%ymm8,%ymm8
+	vpaddd	%ymm11,%ymm9,%ymm9
+	vpaddd	%ymm11,%ymm10,%ymm10
+	vpaddd	0+160(%rbp),%ymm12,%ymm12
+	vpaddd	0+192(%rbp),%ymm13,%ymm13
+	vpaddd	0+224(%rbp),%ymm14,%ymm14
+	vperm2i128	$0x02,%ymm0,%ymm4,%ymm3
+
+	vpand	.Lclamp(%rip),%ymm3,%ymm3
+	vmovdqa	%ymm3,0+0(%rbp)
+
+	vperm2i128	$0x13,%ymm0,%ymm4,%ymm0
+	vperm2i128	$0x13,%ymm8,%ymm12,%ymm4
+	vperm2i128	$0x02,%ymm1,%ymm5,%ymm8
+	vperm2i128	$0x02,%ymm9,%ymm13,%ymm12
+	vperm2i128	$0x13,%ymm1,%ymm5,%ymm1
+	vperm2i128	$0x13,%ymm9,%ymm13,%ymm5
+	vperm2i128	$0x02,%ymm2,%ymm6,%ymm9
+	vperm2i128	$0x02,%ymm10,%ymm14,%ymm13
+	vperm2i128	$0x13,%ymm2,%ymm6,%ymm2
+	vperm2i128	$0x13,%ymm10,%ymm14,%ymm6
+	jmp	.Lseal_avx2_short
+
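+# Messages of at most 192 bytes: two 2-block states cover the Poly1305 key
+# plus the whole message.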
+.Lseal_avx2_192:
+	vmovdqa	%ymm0,%ymm1
+	vmovdqa	%ymm0,%ymm2
+	vmovdqa	%ymm4,%ymm5
+	vmovdqa	%ymm4,%ymm6
+	vmovdqa	%ymm8,%ymm9
+	vmovdqa	%ymm8,%ymm10
+	vpaddd	.Lavx2_inc(%rip),%ymm12,%ymm13
+	vmovdqa	%ymm12,%ymm11
+	vmovdqa	%ymm13,%ymm15
+	movq	$10,%r10
+.Lseal_avx2_192_rounds:
+	vpaddd	%ymm4,%ymm0,%ymm0
+	vpxor	%ymm0,%ymm12,%ymm12
+	vpshufb	.Lrol16(%rip),%ymm12,%ymm12
+	vpaddd	%ymm12,%ymm8,%ymm8
+	vpxor	%ymm8,%ymm4,%ymm4
+	vpsrld	$20,%ymm4,%ymm3
+	vpslld	$12,%ymm4,%ymm4
+	vpxor	%ymm3,%ymm4,%ymm4
+	vpaddd	%ymm4,%ymm0,%ymm0
+	vpxor	%ymm0,%ymm12,%ymm12
+	vpshufb	.Lrol8(%rip),%ymm12,%ymm12
+	vpaddd	%ymm12,%ymm8,%ymm8
+	vpxor	%ymm8,%ymm4,%ymm4
+	vpslld	$7,%ymm4,%ymm3
+	vpsrld	$25,%ymm4,%ymm4
+	vpxor	%ymm3,%ymm4,%ymm4
+	vpalignr	$12,%ymm12,%ymm12,%ymm12
+	vpalignr	$8,%ymm8,%ymm8,%ymm8
+	vpalignr	$4,%ymm4,%ymm4,%ymm4
+	vpaddd	%ymm5,%ymm1,%ymm1
+	vpxor	%ymm1,%ymm13,%ymm13
+	vpshufb	.Lrol16(%rip),%ymm13,%ymm13
+	vpaddd	%ymm13,%ymm9,%ymm9
+	vpxor	%ymm9,%ymm5,%ymm5
+	vpsrld	$20,%ymm5,%ymm3
+	vpslld	$12,%ymm5,%ymm5
+	vpxor	%ymm3,%ymm5,%ymm5
+	vpaddd	%ymm5,%ymm1,%ymm1
+	vpxor	%ymm1,%ymm13,%ymm13
+	vpshufb	.Lrol8(%rip),%ymm13,%ymm13
+	vpaddd	%ymm13,%ymm9,%ymm9
+	vpxor	%ymm9,%ymm5,%ymm5
+	vpslld	$7,%ymm5,%ymm3
+	vpsrld	$25,%ymm5,%ymm5
+	vpxor	%ymm3,%ymm5,%ymm5
+	vpalignr	$12,%ymm13,%ymm13,%ymm13
+	vpalignr	$8,%ymm9,%ymm9,%ymm9
+	vpalignr	$4,%ymm5,%ymm5,%ymm5
+	vpaddd	%ymm4,%ymm0,%ymm0
+	vpxor	%ymm0,%ymm12,%ymm12
+	vpshufb	.Lrol16(%rip),%ymm12,%ymm12
+	vpaddd	%ymm12,%ymm8,%ymm8
+	vpxor	%ymm8,%ymm4,%ymm4
+	vpsrld	$20,%ymm4,%ymm3
+	vpslld	$12,%ymm4,%ymm4
+	vpxor	%ymm3,%ymm4,%ymm4
+	vpaddd	%ymm4,%ymm0,%ymm0
+	vpxor	%ymm0,%ymm12,%ymm12
+	vpshufb	.Lrol8(%rip),%ymm12,%ymm12
+	vpaddd	%ymm12,%ymm8,%ymm8
+	vpxor	%ymm8,%ymm4,%ymm4
+	vpslld	$7,%ymm4,%ymm3
+	vpsrld	$25,%ymm4,%ymm4
+	vpxor	%ymm3,%ymm4,%ymm4
+	vpalignr	$4,%ymm12,%ymm12,%ymm12
+	vpalignr	$8,%ymm8,%ymm8,%ymm8
+	vpalignr	$12,%ymm4,%ymm4,%ymm4
+	vpaddd	%ymm5,%ymm1,%ymm1
+	vpxor	%ymm1,%ymm13,%ymm13
+	vpshufb	.Lrol16(%rip),%ymm13,%ymm13
+	vpaddd	%ymm13,%ymm9,%ymm9
+	vpxor	%ymm9,%ymm5,%ymm5
+	vpsrld	$20,%ymm5,%ymm3
+	vpslld	$12,%ymm5,%ymm5
+	vpxor	%ymm3,%ymm5,%ymm5
+	vpaddd	%ymm5,%ymm1,%ymm1
+	vpxor	%ymm1,%ymm13,%ymm13
+	vpshufb	.Lrol8(%rip),%ymm13,%ymm13
+	vpaddd	%ymm13,%ymm9,%ymm9
+	vpxor	%ymm9,%ymm5,%ymm5
+	vpslld	$7,%ymm5,%ymm3
+	vpsrld	$25,%ymm5,%ymm5
+	vpxor	%ymm3,%ymm5,%ymm5
+	vpalignr	$4,%ymm13,%ymm13,%ymm13
+	vpalignr	$8,%ymm9,%ymm9,%ymm9
+	vpalignr	$12,%ymm5,%ymm5,%ymm5
+
+	decq	%r10
+	jne	.Lseal_avx2_192_rounds
+	vpaddd	%ymm2,%ymm0,%ymm0
+	vpaddd	%ymm2,%ymm1,%ymm1
+	vpaddd	%ymm6,%ymm4,%ymm4
+	vpaddd	%ymm6,%ymm5,%ymm5
+	vpaddd	%ymm10,%ymm8,%ymm8
+	vpaddd	%ymm10,%ymm9,%ymm9
+	vpaddd	%ymm11,%ymm12,%ymm12
+	vpaddd	%ymm15,%ymm13,%ymm13
+	vperm2i128	$0x02,%ymm0,%ymm4,%ymm3
+
+	vpand	.Lclamp(%rip),%ymm3,%ymm3
+	vmovdqa	%ymm3,0+0(%rbp)
+
+	vperm2i128	$0x13,%ymm0,%ymm4,%ymm0
+	vperm2i128	$0x13,%ymm8,%ymm12,%ymm4
+	vperm2i128	$0x02,%ymm1,%ymm5,%ymm8
+	vperm2i128	$0x02,%ymm9,%ymm13,%ymm12
+	vperm2i128	$0x13,%ymm1,%ymm5,%ymm1
+	vperm2i128	$0x13,%ymm9,%ymm13,%ymm5
+.Lseal_avx2_short:
+	movq	%r8,%r8
+	call	poly_hash_ad_internal
+	xorq	%rcx,%rcx
+.Lseal_avx2_short_hash_remainder:
+	cmpq	$16,%rcx
+	jb	.Lseal_avx2_short_loop
+	addq	0+0(%rdi),%r10
+	adcq	8+0(%rdi),%r11
+	adcq	$1,%r12
+	movq	0+0+0(%rbp),%rax
+	movq	%rax,%r15
+	mulq	%r10
+	movq	%rax,%r13
+	movq	%rdx,%r14
+	movq	0+0+0(%rbp),%rax
+	mulq	%r11
+	imulq	%r12,%r15
+	addq	%rax,%r14
+	adcq	%rdx,%r15
+	movq	8+0+0(%rbp),%rax
+	movq	%rax,%r9
+	mulq	%r10
+	addq	%rax,%r14
+	adcq	$0,%rdx
+	movq	%rdx,%r10
+	movq	8+0+0(%rbp),%rax
+	mulq	%r11
+	addq	%rax,%r15
+	adcq	$0,%rdx
+	imulq	%r12,%r9
+	addq	%r10,%r15
+	adcq	%rdx,%r9
+	movq	%r13,%r10
+	movq	%r14,%r11
+	movq	%r15,%r12
+	andq	$3,%r12
+	movq	%r15,%r13
+	andq	$-4,%r13
+	movq	%r9,%r14
+	shrdq	$2,%r9,%r15
+	shrq	$2,%r9
+	addq	%r13,%r15
+	adcq	%r14,%r9
+	addq	%r15,%r10
+	adcq	%r9,%r11
+	adcq	$0,%r12
+
+	subq	$16,%rcx
+	addq	$16,%rdi
+	jmp	.Lseal_avx2_short_hash_remainder
+.Lseal_avx2_short_loop:
+	cmpq	$32,%rbx
+	jb	.Lseal_avx2_short_tail
+	subq	$32,%rbx
+
+	vpxor	(%rsi),%ymm0,%ymm0
+	vmovdqu	%ymm0,(%rdi)
+	leaq	32(%rsi),%rsi
+
+	addq	0+0(%rdi),%r10
+	adcq	8+0(%rdi),%r11
+	adcq	$1,%r12
+	movq	0+0+0(%rbp),%rax
+	movq	%rax,%r15
+	mulq	%r10
+	movq	%rax,%r13
+	movq	%rdx,%r14
+	movq	0+0+0(%rbp),%rax
+	mulq	%r11
+	imulq	%r12,%r15
+	addq	%rax,%r14
+	adcq	%rdx,%r15
+	movq	8+0+0(%rbp),%rax
+	movq	%rax,%r9
+	mulq	%r10
+	addq	%rax,%r14
+	adcq	$0,%rdx
+	movq	%rdx,%r10
+	movq	8+0+0(%rbp),%rax
+	mulq	%r11
+	addq	%rax,%r15
+	adcq	$0,%rdx
+	imulq	%r12,%r9
+	addq	%r10,%r15
+	adcq	%rdx,%r9
+	movq	%r13,%r10
+	movq	%r14,%r11
+	movq	%r15,%r12
+	andq	$3,%r12
+	movq	%r15,%r13
+	andq	$-4,%r13
+	movq	%r9,%r14
+	shrdq	$2,%r9,%r15
+	shrq	$2,%r9
+	addq	%r13,%r15
+	adcq	%r14,%r9
+	addq	%r15,%r10
+	adcq	%r9,%r11
+	adcq	$0,%r12
+	addq	0+16(%rdi),%r10
+	adcq	8+16(%rdi),%r11
+	adcq	$1,%r12
+	movq	0+0+0(%rbp),%rax
+	movq	%rax,%r15
+	mulq	%r10
+	movq	%rax,%r13
+	movq	%rdx,%r14
+	movq	0+0+0(%rbp),%rax
+	mulq	%r11
+	imulq	%r12,%r15
+	addq	%rax,%r14
+	adcq	%rdx,%r15
+	movq	8+0+0(%rbp),%rax
+	movq	%rax,%r9
+	mulq	%r10
+	addq	%rax,%r14
+	adcq	$0,%rdx
+	movq	%rdx,%r10
+	movq	8+0+0(%rbp),%rax
+	mulq	%r11
+	addq	%rax,%r15
+	adcq	$0,%rdx
+	imulq	%r12,%r9
+	addq	%r10,%r15
+	adcq	%rdx,%r9
+	movq	%r13,%r10
+	movq	%r14,%r11
+	movq	%r15,%r12
+	andq	$3,%r12
+	movq	%r15,%r13
+	andq	$-4,%r13
+	movq	%r9,%r14
+	shrdq	$2,%r9,%r15
+	shrq	$2,%r9
+	addq	%r13,%r15
+	adcq	%r14,%r9
+	addq	%r15,%r10
+	adcq	%r9,%r11
+	adcq	$0,%r12
+
+	leaq	32(%rdi),%rdi
+
+	vmovdqa	%ymm4,%ymm0
+	vmovdqa	%ymm8,%ymm4
+	vmovdqa	%ymm12,%ymm8
+	vmovdqa	%ymm1,%ymm12
+	vmovdqa	%ymm5,%ymm1
+	vmovdqa	%ymm9,%ymm5
+	vmovdqa	%ymm13,%ymm9
+	vmovdqa	%ymm2,%ymm13
+	vmovdqa	%ymm6,%ymm2
+	jmp	.Lseal_avx2_short_loop
+.Lseal_avx2_short_tail:
+	cmpq	$16,%rbx
+	jb	.Lseal_avx2_exit
+	subq	$16,%rbx
+	vpxor	(%rsi),%xmm0,%xmm3
+	vmovdqu	%xmm3,(%rdi)
+	leaq	16(%rsi),%rsi
+	addq	0+0(%rdi),%r10
+	adcq	8+0(%rdi),%r11
+	adcq	$1,%r12
+	movq	0+0+0(%rbp),%rax
+	movq	%rax,%r15
+	mulq	%r10
+	movq	%rax,%r13
+	movq	%rdx,%r14
+	movq	0+0+0(%rbp),%rax
+	mulq	%r11
+	imulq	%r12,%r15
+	addq	%rax,%r14
+	adcq	%rdx,%r15
+	movq	8+0+0(%rbp),%rax
+	movq	%rax,%r9
+	mulq	%r10
+	addq	%rax,%r14
+	adcq	$0,%rdx
+	movq	%rdx,%r10
+	movq	8+0+0(%rbp),%rax
+	mulq	%r11
+	addq	%rax,%r15
+	adcq	$0,%rdx
+	imulq	%r12,%r9
+	addq	%r10,%r15
+	adcq	%rdx,%r9
+	movq	%r13,%r10
+	movq	%r14,%r11
+	movq	%r15,%r12
+	andq	$3,%r12
+	movq	%r15,%r13
+	andq	$-4,%r13
+	movq	%r9,%r14
+	shrdq	$2,%r9,%r15
+	shrq	$2,%r9
+	addq	%r13,%r15
+	adcq	%r14,%r9
+	addq	%r15,%r10
+	adcq	%r9,%r11
+	adcq	$0,%r12
+
+	leaq	16(%rdi),%rdi
+	vextracti128	$1,%ymm0,%xmm0
+.Lseal_avx2_exit:
+	vzeroupper
+	jmp	.Lseal_sse_tail_16
+.cfi_endproc	
+.size	chacha20_poly1305_seal_avx2, .-chacha20_poly1305_seal_avx2
+#endif
+.section	.note.GNU-stack,"",@progbits
--- /dev/null
+++ b/third_party/boringssl/linux-x86_64/crypto/fipsmodule/aesni-gcm-x86_64.S
@@ -1,0 +1,852 @@
+// This file is generated from a similarly-named Perl script in the BoringSSL
+// source tree. Do not edit by hand.
+
+#if defined(__has_feature)
+#if __has_feature(memory_sanitizer) && !defined(OPENSSL_NO_ASM)
+#define OPENSSL_NO_ASM
+#endif
+#endif
+
+#if defined(__x86_64__) && !defined(OPENSSL_NO_ASM)
+#if defined(BORINGSSL_PREFIX)
+#include <boringssl_prefix_symbols_asm.h>
+#endif
+.text	
+
+.type	_aesni_ctr32_ghash_6x,@function
+.align	32
+_aesni_ctr32_ghash_6x:
+.cfi_startproc	
+	vmovdqu	32(%r11),%xmm2
+	subq	$6,%rdx
+	vpxor	%xmm4,%xmm4,%xmm4
+	vmovdqu	0-128(%rcx),%xmm15
+	vpaddb	%xmm2,%xmm1,%xmm10
+	vpaddb	%xmm2,%xmm10,%xmm11
+	vpaddb	%xmm2,%xmm11,%xmm12
+	vpaddb	%xmm2,%xmm12,%xmm13
+	vpaddb	%xmm2,%xmm13,%xmm14
+	vpxor	%xmm15,%xmm1,%xmm9
+	vmovdqu	%xmm4,16+8(%rsp)
+	jmp	.Loop6x
+
+.align	32
+.Loop6x:
+	addl	$100663296,%ebx
+	jc	.Lhandle_ctr32
+	vmovdqu	0-32(%r9),%xmm3
+	vpaddb	%xmm2,%xmm14,%xmm1
+	vpxor	%xmm15,%xmm10,%xmm10
+	vpxor	%xmm15,%xmm11,%xmm11
+
+.Lresume_ctr32:
+	vmovdqu	%xmm1,(%r8)
+	vpclmulqdq	$0x10,%xmm3,%xmm7,%xmm5
+	vpxor	%xmm15,%xmm12,%xmm12
+	vmovups	16-128(%rcx),%xmm2
+	vpclmulqdq	$0x01,%xmm3,%xmm7,%xmm6
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+	xorq	%r12,%r12
+	cmpq	%r14,%r15
+
+	vaesenc	%xmm2,%xmm9,%xmm9
+	vmovdqu	48+8(%rsp),%xmm0
+	vpxor	%xmm15,%xmm13,%xmm13
+	vpclmulqdq	$0x00,%xmm3,%xmm7,%xmm1
+	vaesenc	%xmm2,%xmm10,%xmm10
+	vpxor	%xmm15,%xmm14,%xmm14
+	setnc	%r12b
+	vpclmulqdq	$0x11,%xmm3,%xmm7,%xmm7
+	vaesenc	%xmm2,%xmm11,%xmm11
+	vmovdqu	16-32(%r9),%xmm3
+	negq	%r12
+	vaesenc	%xmm2,%xmm12,%xmm12
+	vpxor	%xmm5,%xmm6,%xmm6
+	vpclmulqdq	$0x00,%xmm3,%xmm0,%xmm5
+	vpxor	%xmm4,%xmm8,%xmm8
+	vaesenc	%xmm2,%xmm13,%xmm13
+	vpxor	%xmm5,%xmm1,%xmm4
+	andq	$0x60,%r12
+	vmovups	32-128(%rcx),%xmm15
+	vpclmulqdq	$0x10,%xmm3,%xmm0,%xmm1
+	vaesenc	%xmm2,%xmm14,%xmm14
+
+	vpclmulqdq	$0x01,%xmm3,%xmm0,%xmm2
+	leaq	(%r14,%r12,1),%r14
+	vaesenc	%xmm15,%xmm9,%xmm9
+	vpxor	16+8(%rsp),%xmm8,%xmm8
+	vpclmulqdq	$0x11,%xmm3,%xmm0,%xmm3
+	vmovdqu	64+8(%rsp),%xmm0
+	vaesenc	%xmm15,%xmm10,%xmm10
+	movbeq	88(%r14),%r13
+	vaesenc	%xmm15,%xmm11,%xmm11
+	movbeq	80(%r14),%r12
+	vaesenc	%xmm15,%xmm12,%xmm12
+	movq	%r13,32+8(%rsp)
+	vaesenc	%xmm15,%xmm13,%xmm13
+	movq	%r12,40+8(%rsp)
+	vmovdqu	48-32(%r9),%xmm5
+	vaesenc	%xmm15,%xmm14,%xmm14
+
+	vmovups	48-128(%rcx),%xmm15
+	vpxor	%xmm1,%xmm6,%xmm6
+	vpclmulqdq	$0x00,%xmm5,%xmm0,%xmm1
+	vaesenc	%xmm15,%xmm9,%xmm9
+	vpxor	%xmm2,%xmm6,%xmm6
+	vpclmulqdq	$0x10,%xmm5,%xmm0,%xmm2
+	vaesenc	%xmm15,%xmm10,%xmm10
+	vpxor	%xmm3,%xmm7,%xmm7
+	vpclmulqdq	$0x01,%xmm5,%xmm0,%xmm3
+	vaesenc	%xmm15,%xmm11,%xmm11
+	vpclmulqdq	$0x11,%xmm5,%xmm0,%xmm5
+	vmovdqu	80+8(%rsp),%xmm0
+	vaesenc	%xmm15,%xmm12,%xmm12
+	vaesenc	%xmm15,%xmm13,%xmm13
+	vpxor	%xmm1,%xmm4,%xmm4
+	vmovdqu	64-32(%r9),%xmm1
+	vaesenc	%xmm15,%xmm14,%xmm14
+
+	vmovups	64-128(%rcx),%xmm15
+	vpxor	%xmm2,%xmm6,%xmm6
+	vpclmulqdq	$0x00,%xmm1,%xmm0,%xmm2
+	vaesenc	%xmm15,%xmm9,%xmm9
+	vpxor	%xmm3,%xmm6,%xmm6
+	vpclmulqdq	$0x10,%xmm1,%xmm0,%xmm3
+	vaesenc	%xmm15,%xmm10,%xmm10
+	movbeq	72(%r14),%r13
+	vpxor	%xmm5,%xmm7,%xmm7
+	vpclmulqdq	$0x01,%xmm1,%xmm0,%xmm5
+	vaesenc	%xmm15,%xmm11,%xmm11
+	movbeq	64(%r14),%r12
+	vpclmulqdq	$0x11,%xmm1,%xmm0,%xmm1
+	vmovdqu	96+8(%rsp),%xmm0
+	vaesenc	%xmm15,%xmm12,%xmm12
+	movq	%r13,48+8(%rsp)
+	vaesenc	%xmm15,%xmm13,%xmm13
+	movq	%r12,56+8(%rsp)
+	vpxor	%xmm2,%xmm4,%xmm4
+	vmovdqu	96-32(%r9),%xmm2
+	vaesenc	%xmm15,%xmm14,%xmm14
+
+	vmovups	80-128(%rcx),%xmm15
+	vpxor	%xmm3,%xmm6,%xmm6
+	vpclmulqdq	$0x00,%xmm2,%xmm0,%xmm3
+	vaesenc	%xmm15,%xmm9,%xmm9
+	vpxor	%xmm5,%xmm6,%xmm6
+	vpclmulqdq	$0x10,%xmm2,%xmm0,%xmm5
+	vaesenc	%xmm15,%xmm10,%xmm10
+	movbeq	56(%r14),%r13
+	vpxor	%xmm1,%xmm7,%xmm7
+	vpclmulqdq	$0x01,%xmm2,%xmm0,%xmm1
+	vpxor	112+8(%rsp),%xmm8,%xmm8
+	vaesenc	%xmm15,%xmm11,%xmm11
+	movbeq	48(%r14),%r12
+	vpclmulqdq	$0x11,%xmm2,%xmm0,%xmm2
+	vaesenc	%xmm15,%xmm12,%xmm12
+	movq	%r13,64+8(%rsp)
+	vaesenc	%xmm15,%xmm13,%xmm13
+	movq	%r12,72+8(%rsp)
+	vpxor	%xmm3,%xmm4,%xmm4
+	vmovdqu	112-32(%r9),%xmm3
+	vaesenc	%xmm15,%xmm14,%xmm14
+
+	vmovups	96-128(%rcx),%xmm15
+	vpxor	%xmm5,%xmm6,%xmm6
+	vpclmulqdq	$0x10,%xmm3,%xmm8,%xmm5
+	vaesenc	%xmm15,%xmm9,%xmm9
+	vpxor	%xmm1,%xmm6,%xmm6
+	vpclmulqdq	$0x01,%xmm3,%xmm8,%xmm1
+	vaesenc	%xmm15,%xmm10,%xmm10
+	movbeq	40(%r14),%r13
+	vpxor	%xmm2,%xmm7,%xmm7
+	vpclmulqdq	$0x00,%xmm3,%xmm8,%xmm2
+	vaesenc	%xmm15,%xmm11,%xmm11
+	movbeq	32(%r14),%r12
+	vpclmulqdq	$0x11,%xmm3,%xmm8,%xmm8
+	vaesenc	%xmm15,%xmm12,%xmm12
+	movq	%r13,80+8(%rsp)
+	vaesenc	%xmm15,%xmm13,%xmm13
+	movq	%r12,88+8(%rsp)
+	vpxor	%xmm5,%xmm6,%xmm6
+	vaesenc	%xmm15,%xmm14,%xmm14
+	vpxor	%xmm1,%xmm6,%xmm6
+
+	vmovups	112-128(%rcx),%xmm15
+	vpslldq	$8,%xmm6,%xmm5
+	vpxor	%xmm2,%xmm4,%xmm4
+	vmovdqu	16(%r11),%xmm3
+
+	vaesenc	%xmm15,%xmm9,%xmm9
+	vpxor	%xmm8,%xmm7,%xmm7
+	vaesenc	%xmm15,%xmm10,%xmm10
+	vpxor	%xmm5,%xmm4,%xmm4
+	movbeq	24(%r14),%r13
+	vaesenc	%xmm15,%xmm11,%xmm11
+	movbeq	16(%r14),%r12
+	vpalignr	$8,%xmm4,%xmm4,%xmm0
+	vpclmulqdq	$0x10,%xmm3,%xmm4,%xmm4
+	movq	%r13,96+8(%rsp)
+	vaesenc	%xmm15,%xmm12,%xmm12
+	movq	%r12,104+8(%rsp)
+	vaesenc	%xmm15,%xmm13,%xmm13
+	vmovups	128-128(%rcx),%xmm1
+	vaesenc	%xmm15,%xmm14,%xmm14
+
+	vaesenc	%xmm1,%xmm9,%xmm9
+	vmovups	144-128(%rcx),%xmm15
+	vaesenc	%xmm1,%xmm10,%xmm10
+	vpsrldq	$8,%xmm6,%xmm6
+	vaesenc	%xmm1,%xmm11,%xmm11
+	vpxor	%xmm6,%xmm7,%xmm7
+	vaesenc	%xmm1,%xmm12,%xmm12
+	vpxor	%xmm0,%xmm4,%xmm4
+	movbeq	8(%r14),%r13
+	vaesenc	%xmm1,%xmm13,%xmm13
+	movbeq	0(%r14),%r12
+	vaesenc	%xmm1,%xmm14,%xmm14
+	vmovups	160-128(%rcx),%xmm1
+	cmpl	$11,%ebp
+	jb	.Lenc_tail
+
+	vaesenc	%xmm15,%xmm9,%xmm9
+	vaesenc	%xmm15,%xmm10,%xmm10
+	vaesenc	%xmm15,%xmm11,%xmm11
+	vaesenc	%xmm15,%xmm12,%xmm12
+	vaesenc	%xmm15,%xmm13,%xmm13
+	vaesenc	%xmm15,%xmm14,%xmm14
+
+	vaesenc	%xmm1,%xmm9,%xmm9
+	vaesenc	%xmm1,%xmm10,%xmm10
+	vaesenc	%xmm1,%xmm11,%xmm11
+	vaesenc	%xmm1,%xmm12,%xmm12
+	vaesenc	%xmm1,%xmm13,%xmm13
+	vmovups	176-128(%rcx),%xmm15
+	vaesenc	%xmm1,%xmm14,%xmm14
+	vmovups	192-128(%rcx),%xmm1
+	je	.Lenc_tail
+
+	vaesenc	%xmm15,%xmm9,%xmm9
+	vaesenc	%xmm15,%xmm10,%xmm10
+	vaesenc	%xmm15,%xmm11,%xmm11
+	vaesenc	%xmm15,%xmm12,%xmm12
+	vaesenc	%xmm15,%xmm13,%xmm13
+	vaesenc	%xmm15,%xmm14,%xmm14
+
+	vaesenc	%xmm1,%xmm9,%xmm9
+	vaesenc	%xmm1,%xmm10,%xmm10
+	vaesenc	%xmm1,%xmm11,%xmm11
+	vaesenc	%xmm1,%xmm12,%xmm12
+	vaesenc	%xmm1,%xmm13,%xmm13
+	vmovups	208-128(%rcx),%xmm15
+	vaesenc	%xmm1,%xmm14,%xmm14
+	vmovups	224-128(%rcx),%xmm1
+	jmp	.Lenc_tail
+
+.align	32
+.Lhandle_ctr32:
+	vmovdqu	(%r11),%xmm0
+	vpshufb	%xmm0,%xmm1,%xmm6
+	vmovdqu	48(%r11),%xmm5
+	vpaddd	64(%r11),%xmm6,%xmm10
+	vpaddd	%xmm5,%xmm6,%xmm11
+	vmovdqu	0-32(%r9),%xmm3
+	vpaddd	%xmm5,%xmm10,%xmm12
+	vpshufb	%xmm0,%xmm10,%xmm10
+	vpaddd	%xmm5,%xmm11,%xmm13
+	vpshufb	%xmm0,%xmm11,%xmm11
+	vpxor	%xmm15,%xmm10,%xmm10
+	vpaddd	%xmm5,%xmm12,%xmm14
+	vpshufb	%xmm0,%xmm12,%xmm12
+	vpxor	%xmm15,%xmm11,%xmm11
+	vpaddd	%xmm5,%xmm13,%xmm1
+	vpshufb	%xmm0,%xmm13,%xmm13
+	vpshufb	%xmm0,%xmm14,%xmm14
+	vpshufb	%xmm0,%xmm1,%xmm1
+	jmp	.Lresume_ctr32
+
+.align	32
+.Lenc_tail:
+	vaesenc	%xmm15,%xmm9,%xmm9
+	vmovdqu	%xmm7,16+8(%rsp)
+	vpalignr	$8,%xmm4,%xmm4,%xmm8
+	vaesenc	%xmm15,%xmm10,%xmm10
+	vpclmulqdq	$0x10,%xmm3,%xmm4,%xmm4
+	vpxor	0(%rdi),%xmm1,%xmm2
+	vaesenc	%xmm15,%xmm11,%xmm11
+	vpxor	16(%rdi),%xmm1,%xmm0
+	vaesenc	%xmm15,%xmm12,%xmm12
+	vpxor	32(%rdi),%xmm1,%xmm5
+	vaesenc	%xmm15,%xmm13,%xmm13
+	vpxor	48(%rdi),%xmm1,%xmm6
+	vaesenc	%xmm15,%xmm14,%xmm14
+	vpxor	64(%rdi),%xmm1,%xmm7
+	vpxor	80(%rdi),%xmm1,%xmm3
+	vmovdqu	(%r8),%xmm1
+
+	vaesenclast	%xmm2,%xmm9,%xmm9
+	vmovdqu	32(%r11),%xmm2
+	vaesenclast	%xmm0,%xmm10,%xmm10
+	vpaddb	%xmm2,%xmm1,%xmm0
+	movq	%r13,112+8(%rsp)
+	leaq	96(%rdi),%rdi
+	vaesenclast	%xmm5,%xmm11,%xmm11
+	vpaddb	%xmm2,%xmm0,%xmm5
+	movq	%r12,120+8(%rsp)
+	leaq	96(%rsi),%rsi
+	vmovdqu	0-128(%rcx),%xmm15
+	vaesenclast	%xmm6,%xmm12,%xmm12
+	vpaddb	%xmm2,%xmm5,%xmm6
+	vaesenclast	%xmm7,%xmm13,%xmm13
+	vpaddb	%xmm2,%xmm6,%xmm7
+	vaesenclast	%xmm3,%xmm14,%xmm14
+	vpaddb	%xmm2,%xmm7,%xmm3
+
+	addq	$0x60,%r10
+	subq	$0x6,%rdx
+	jc	.L6x_done
+
+	vmovups	%xmm9,-96(%rsi)
+	vpxor	%xmm15,%xmm1,%xmm9
+	vmovups	%xmm10,-80(%rsi)
+	vmovdqa	%xmm0,%xmm10
+	vmovups	%xmm11,-64(%rsi)
+	vmovdqa	%xmm5,%xmm11
+	vmovups	%xmm12,-48(%rsi)
+	vmovdqa	%xmm6,%xmm12
+	vmovups	%xmm13,-32(%rsi)
+	vmovdqa	%xmm7,%xmm13
+	vmovups	%xmm14,-16(%rsi)
+	vmovdqa	%xmm3,%xmm14
+	vmovdqu	32+8(%rsp),%xmm7
+	jmp	.Loop6x
+
+.L6x_done:
+	vpxor	16+8(%rsp),%xmm8,%xmm8
+	vpxor	%xmm4,%xmm8,%xmm8
+
+	.byte	0xf3,0xc3
+.cfi_endproc	
+.size	_aesni_ctr32_ghash_6x,.-_aesni_ctr32_ghash_6x
+.globl	aesni_gcm_decrypt
+.hidden aesni_gcm_decrypt
+.type	aesni_gcm_decrypt,@function
+.align	32
+aesni_gcm_decrypt:
+.cfi_startproc	
+	xorq	%r10,%r10
+
+
+
+	cmpq	$0x60,%rdx
+	jb	.Lgcm_dec_abort
+
+	leaq	(%rsp),%rax
+.cfi_def_cfa_register	%rax
+	pushq	%rbx
+.cfi_offset	%rbx,-16
+	pushq	%rbp
+.cfi_offset	%rbp,-24
+	pushq	%r12
+.cfi_offset	%r12,-32
+	pushq	%r13
+.cfi_offset	%r13,-40
+	pushq	%r14
+.cfi_offset	%r14,-48
+	pushq	%r15
+.cfi_offset	%r15,-56
+	vzeroupper
+
+	vmovdqu	(%r8),%xmm1
+	addq	$-128,%rsp
+	movl	12(%r8),%ebx
+	leaq	.Lbswap_mask(%rip),%r11
+	leaq	-128(%rcx),%r14
+	movq	$0xf80,%r15
+	vmovdqu	(%r9),%xmm8
+	andq	$-128,%rsp
+	vmovdqu	(%r11),%xmm0
+	leaq	128(%rcx),%rcx
+	leaq	32+32(%r9),%r9
+	movl	240-128(%rcx),%ebp
+	vpshufb	%xmm0,%xmm8,%xmm8
+
+	andq	%r15,%r14
+	andq	%rsp,%r15
+	subq	%r14,%r15
+	jc	.Ldec_no_key_aliasing
+	cmpq	$768,%r15
+	jnc	.Ldec_no_key_aliasing
+	subq	%r15,%rsp
+.Ldec_no_key_aliasing:
+
+	vmovdqu	80(%rdi),%xmm7
+	leaq	(%rdi),%r14
+	vmovdqu	64(%rdi),%xmm4
+
+
+
+
+
+
+
+	leaq	-192(%rdi,%rdx,1),%r15
+
+	vmovdqu	48(%rdi),%xmm5
+	shrq	$4,%rdx
+	xorq	%r10,%r10
+	vmovdqu	32(%rdi),%xmm6
+	vpshufb	%xmm0,%xmm7,%xmm7
+	vmovdqu	16(%rdi),%xmm2
+	vpshufb	%xmm0,%xmm4,%xmm4
+	vmovdqu	(%rdi),%xmm3
+	vpshufb	%xmm0,%xmm5,%xmm5
+	vmovdqu	%xmm4,48(%rsp)
+	vpshufb	%xmm0,%xmm6,%xmm6
+	vmovdqu	%xmm5,64(%rsp)
+	vpshufb	%xmm0,%xmm2,%xmm2
+	vmovdqu	%xmm6,80(%rsp)
+	vpshufb	%xmm0,%xmm3,%xmm3
+	vmovdqu	%xmm2,96(%rsp)
+	vmovdqu	%xmm3,112(%rsp)
+
+	call	_aesni_ctr32_ghash_6x
+
+	vmovups	%xmm9,-96(%rsi)
+	vmovups	%xmm10,-80(%rsi)
+	vmovups	%xmm11,-64(%rsi)
+	vmovups	%xmm12,-48(%rsi)
+	vmovups	%xmm13,-32(%rsi)
+	vmovups	%xmm14,-16(%rsi)
+
+	vpshufb	(%r11),%xmm8,%xmm8
+	vmovdqu	%xmm8,-64(%r9)
+
+	vzeroupper
+	movq	-48(%rax),%r15
+.cfi_restore	%r15
+	movq	-40(%rax),%r14
+.cfi_restore	%r14
+	movq	-32(%rax),%r13
+.cfi_restore	%r13
+	movq	-24(%rax),%r12
+.cfi_restore	%r12
+	movq	-16(%rax),%rbp
+.cfi_restore	%rbp
+	movq	-8(%rax),%rbx
+.cfi_restore	%rbx
+	leaq	(%rax),%rsp
+.cfi_def_cfa_register	%rsp
+.Lgcm_dec_abort:
+	movq	%r10,%rax
+	.byte	0xf3,0xc3
+.cfi_endproc	
+.size	aesni_gcm_decrypt,.-aesni_gcm_decrypt
+.type	_aesni_ctr32_6x,@function
+.align	32
+_aesni_ctr32_6x:
+.cfi_startproc	
+	vmovdqu	0-128(%rcx),%xmm4
+	vmovdqu	32(%r11),%xmm2
+	leaq	-1(%rbp),%r13
+	vmovups	16-128(%rcx),%xmm15
+	leaq	32-128(%rcx),%r12
+	vpxor	%xmm4,%xmm1,%xmm9
+	addl	$100663296,%ebx
+	jc	.Lhandle_ctr32_2
+	vpaddb	%xmm2,%xmm1,%xmm10
+	vpaddb	%xmm2,%xmm10,%xmm11
+	vpxor	%xmm4,%xmm10,%xmm10
+	vpaddb	%xmm2,%xmm11,%xmm12
+	vpxor	%xmm4,%xmm11,%xmm11
+	vpaddb	%xmm2,%xmm12,%xmm13
+	vpxor	%xmm4,%xmm12,%xmm12
+	vpaddb	%xmm2,%xmm13,%xmm14
+	vpxor	%xmm4,%xmm13,%xmm13
+	vpaddb	%xmm2,%xmm14,%xmm1
+	vpxor	%xmm4,%xmm14,%xmm14
+	jmp	.Loop_ctr32
+
+.align	16
+.Loop_ctr32:
+	vaesenc	%xmm15,%xmm9,%xmm9
+	vaesenc	%xmm15,%xmm10,%xmm10
+	vaesenc	%xmm15,%xmm11,%xmm11
+	vaesenc	%xmm15,%xmm12,%xmm12
+	vaesenc	%xmm15,%xmm13,%xmm13
+	vaesenc	%xmm15,%xmm14,%xmm14
+	vmovups	(%r12),%xmm15
+	leaq	16(%r12),%r12
+	decl	%r13d
+	jnz	.Loop_ctr32
+
+	vmovdqu	(%r12),%xmm3
+	vaesenc	%xmm15,%xmm9,%xmm9
+	vpxor	0(%rdi),%xmm3,%xmm4
+	vaesenc	%xmm15,%xmm10,%xmm10
+	vpxor	16(%rdi),%xmm3,%xmm5
+	vaesenc	%xmm15,%xmm11,%xmm11
+	vpxor	32(%rdi),%xmm3,%xmm6
+	vaesenc	%xmm15,%xmm12,%xmm12
+	vpxor	48(%rdi),%xmm3,%xmm8
+	vaesenc	%xmm15,%xmm13,%xmm13
+	vpxor	64(%rdi),%xmm3,%xmm2
+	vaesenc	%xmm15,%xmm14,%xmm14
+	vpxor	80(%rdi),%xmm3,%xmm3
+	leaq	96(%rdi),%rdi
+
+	vaesenclast	%xmm4,%xmm9,%xmm9
+	vaesenclast	%xmm5,%xmm10,%xmm10
+	vaesenclast	%xmm6,%xmm11,%xmm11
+	vaesenclast	%xmm8,%xmm12,%xmm12
+	vaesenclast	%xmm2,%xmm13,%xmm13
+	vaesenclast	%xmm3,%xmm14,%xmm14
+	vmovups	%xmm9,0(%rsi)
+	vmovups	%xmm10,16(%rsi)
+	vmovups	%xmm11,32(%rsi)
+	vmovups	%xmm12,48(%rsi)
+	vmovups	%xmm13,64(%rsi)
+	vmovups	%xmm14,80(%rsi)
+	leaq	96(%rsi),%rsi
+
+	.byte	0xf3,0xc3
+.align	32
+.Lhandle_ctr32_2:
+	vpshufb	%xmm0,%xmm1,%xmm6
+	vmovdqu	48(%r11),%xmm5
+	vpaddd	64(%r11),%xmm6,%xmm10
+	vpaddd	%xmm5,%xmm6,%xmm11
+	vpaddd	%xmm5,%xmm10,%xmm12
+	vpshufb	%xmm0,%xmm10,%xmm10
+	vpaddd	%xmm5,%xmm11,%xmm13
+	vpshufb	%xmm0,%xmm11,%xmm11
+	vpxor	%xmm4,%xmm10,%xmm10
+	vpaddd	%xmm5,%xmm12,%xmm14
+	vpshufb	%xmm0,%xmm12,%xmm12
+	vpxor	%xmm4,%xmm11,%xmm11
+	vpaddd	%xmm5,%xmm13,%xmm1
+	vpshufb	%xmm0,%xmm13,%xmm13
+	vpxor	%xmm4,%xmm12,%xmm12
+	vpshufb	%xmm0,%xmm14,%xmm14
+	vpxor	%xmm4,%xmm13,%xmm13
+	vpshufb	%xmm0,%xmm1,%xmm1
+	vpxor	%xmm4,%xmm14,%xmm14
+	jmp	.Loop_ctr32
+.cfi_endproc	
+.size	_aesni_ctr32_6x,.-_aesni_ctr32_6x
+
+.globl	aesni_gcm_encrypt
+.hidden aesni_gcm_encrypt
+.type	aesni_gcm_encrypt,@function
+.align	32
+aesni_gcm_encrypt:
+.cfi_startproc	
+#ifdef BORINGSSL_DISPATCH_TEST
+.extern	BORINGSSL_function_hit
+.hidden BORINGSSL_function_hit
+	movb	$1,BORINGSSL_function_hit+2(%rip)
+#endif
+	xorq	%r10,%r10
+
+
+
+
+	cmpq	$288,%rdx
+	jb	.Lgcm_enc_abort
+
+	leaq	(%rsp),%rax
+.cfi_def_cfa_register	%rax
+	pushq	%rbx
+.cfi_offset	%rbx,-16
+	pushq	%rbp
+.cfi_offset	%rbp,-24
+	pushq	%r12
+.cfi_offset	%r12,-32
+	pushq	%r13
+.cfi_offset	%r13,-40
+	pushq	%r14
+.cfi_offset	%r14,-48
+	pushq	%r15
+.cfi_offset	%r15,-56
+	vzeroupper
+
+	vmovdqu	(%r8),%xmm1
+	addq	$-128,%rsp
+	movl	12(%r8),%ebx
+	leaq	.Lbswap_mask(%rip),%r11
+	leaq	-128(%rcx),%r14
+	movq	$0xf80,%r15
+	leaq	128(%rcx),%rcx
+	vmovdqu	(%r11),%xmm0
+	andq	$-128,%rsp
+	movl	240-128(%rcx),%ebp
+
+	andq	%r15,%r14
+	andq	%rsp,%r15
+	subq	%r14,%r15
+	jc	.Lenc_no_key_aliasing
+	cmpq	$768,%r15
+	jnc	.Lenc_no_key_aliasing
+	subq	%r15,%rsp
+.Lenc_no_key_aliasing:
+
+	leaq	(%rsi),%r14
+
+
+
+
+
+
+
+
+	leaq	-192(%rsi,%rdx,1),%r15
+
+	shrq	$4,%rdx
+
+	call	_aesni_ctr32_6x
+	vpshufb	%xmm0,%xmm9,%xmm8
+	vpshufb	%xmm0,%xmm10,%xmm2
+	vmovdqu	%xmm8,112(%rsp)
+	vpshufb	%xmm0,%xmm11,%xmm4
+	vmovdqu	%xmm2,96(%rsp)
+	vpshufb	%xmm0,%xmm12,%xmm5
+	vmovdqu	%xmm4,80(%rsp)
+	vpshufb	%xmm0,%xmm13,%xmm6
+	vmovdqu	%xmm5,64(%rsp)
+	vpshufb	%xmm0,%xmm14,%xmm7
+	vmovdqu	%xmm6,48(%rsp)
+
+	call	_aesni_ctr32_6x
+
+	vmovdqu	(%r9),%xmm8
+	leaq	32+32(%r9),%r9
+	subq	$12,%rdx
+	movq	$192,%r10
+	vpshufb	%xmm0,%xmm8,%xmm8
+
+	call	_aesni_ctr32_ghash_6x
+	vmovdqu	32(%rsp),%xmm7
+	vmovdqu	(%r11),%xmm0
+	vmovdqu	0-32(%r9),%xmm3
+	vpunpckhqdq	%xmm7,%xmm7,%xmm1
+	vmovdqu	32-32(%r9),%xmm15
+	vmovups	%xmm9,-96(%rsi)
+	vpshufb	%xmm0,%xmm9,%xmm9
+	vpxor	%xmm7,%xmm1,%xmm1
+	vmovups	%xmm10,-80(%rsi)
+	vpshufb	%xmm0,%xmm10,%xmm10
+	vmovups	%xmm11,-64(%rsi)
+	vpshufb	%xmm0,%xmm11,%xmm11
+	vmovups	%xmm12,-48(%rsi)
+	vpshufb	%xmm0,%xmm12,%xmm12
+	vmovups	%xmm13,-32(%rsi)
+	vpshufb	%xmm0,%xmm13,%xmm13
+	vmovups	%xmm14,-16(%rsi)
+	vpshufb	%xmm0,%xmm14,%xmm14
+	vmovdqu	%xmm9,16(%rsp)
+	vmovdqu	48(%rsp),%xmm6
+	vmovdqu	16-32(%r9),%xmm0
+	vpunpckhqdq	%xmm6,%xmm6,%xmm2
+	vpclmulqdq	$0x00,%xmm3,%xmm7,%xmm5
+	vpxor	%xmm6,%xmm2,%xmm2
+	vpclmulqdq	$0x11,%xmm3,%xmm7,%xmm7
+	vpclmulqdq	$0x00,%xmm15,%xmm1,%xmm1
+
+	vmovdqu	64(%rsp),%xmm9
+	vpclmulqdq	$0x00,%xmm0,%xmm6,%xmm4
+	vmovdqu	48-32(%r9),%xmm3
+	vpxor	%xmm5,%xmm4,%xmm4
+	vpunpckhqdq	%xmm9,%xmm9,%xmm5
+	vpclmulqdq	$0x11,%xmm0,%xmm6,%xmm6
+	vpxor	%xmm9,%xmm5,%xmm5
+	vpxor	%xmm7,%xmm6,%xmm6
+	vpclmulqdq	$0x10,%xmm15,%xmm2,%xmm2
+	vmovdqu	80-32(%r9),%xmm15
+	vpxor	%xmm1,%xmm2,%xmm2
+
+	vmovdqu	80(%rsp),%xmm1
+	vpclmulqdq	$0x00,%xmm3,%xmm9,%xmm7
+	vmovdqu	64-32(%r9),%xmm0
+	vpxor	%xmm4,%xmm7,%xmm7
+	vpunpckhqdq	%xmm1,%xmm1,%xmm4
+	vpclmulqdq	$0x11,%xmm3,%xmm9,%xmm9
+	vpxor	%xmm1,%xmm4,%xmm4
+	vpxor	%xmm6,%xmm9,%xmm9
+	vpclmulqdq	$0x00,%xmm15,%xmm5,%xmm5
+	vpxor	%xmm2,%xmm5,%xmm5
+
+	vmovdqu	96(%rsp),%xmm2
+	vpclmulqdq	$0x00,%xmm0,%xmm1,%xmm6
+	vmovdqu	96-32(%r9),%xmm3
+	vpxor	%xmm7,%xmm6,%xmm6
+	vpunpckhqdq	%xmm2,%xmm2,%xmm7
+	vpclmulqdq	$0x11,%xmm0,%xmm1,%xmm1
+	vpxor	%xmm2,%xmm7,%xmm7
+	vpxor	%xmm9,%xmm1,%xmm1
+	vpclmulqdq	$0x10,%xmm15,%xmm4,%xmm4
+	vmovdqu	128-32(%r9),%xmm15
+	vpxor	%xmm5,%xmm4,%xmm4
+
+	vpxor	112(%rsp),%xmm8,%xmm8
+	vpclmulqdq	$0x00,%xmm3,%xmm2,%xmm5
+	vmovdqu	112-32(%r9),%xmm0
+	vpunpckhqdq	%xmm8,%xmm8,%xmm9
+	vpxor	%xmm6,%xmm5,%xmm5
+	vpclmulqdq	$0x11,%xmm3,%xmm2,%xmm2
+	vpxor	%xmm8,%xmm9,%xmm9
+	vpxor	%xmm1,%xmm2,%xmm2
+	vpclmulqdq	$0x00,%xmm15,%xmm7,%xmm7
+	vpxor	%xmm4,%xmm7,%xmm4
+
+	vpclmulqdq	$0x00,%xmm0,%xmm8,%xmm6
+	vmovdqu	0-32(%r9),%xmm3
+	vpunpckhqdq	%xmm14,%xmm14,%xmm1
+	vpclmulqdq	$0x11,%xmm0,%xmm8,%xmm8
+	vpxor	%xmm14,%xmm1,%xmm1
+	vpxor	%xmm5,%xmm6,%xmm5
+	vpclmulqdq	$0x10,%xmm15,%xmm9,%xmm9
+	vmovdqu	32-32(%r9),%xmm15
+	vpxor	%xmm2,%xmm8,%xmm7
+	vpxor	%xmm4,%xmm9,%xmm6
+
+	vmovdqu	16-32(%r9),%xmm0
+	vpxor	%xmm5,%xmm7,%xmm9
+	vpclmulqdq	$0x00,%xmm3,%xmm14,%xmm4
+	vpxor	%xmm9,%xmm6,%xmm6
+	vpunpckhqdq	%xmm13,%xmm13,%xmm2
+	vpclmulqdq	$0x11,%xmm3,%xmm14,%xmm14
+	vpxor	%xmm13,%xmm2,%xmm2
+	vpslldq	$8,%xmm6,%xmm9
+	vpclmulqdq	$0x00,%xmm15,%xmm1,%xmm1
+	vpxor	%xmm9,%xmm5,%xmm8
+	vpsrldq	$8,%xmm6,%xmm6
+	vpxor	%xmm6,%xmm7,%xmm7
+
+	vpclmulqdq	$0x00,%xmm0,%xmm13,%xmm5
+	vmovdqu	48-32(%r9),%xmm3
+	vpxor	%xmm4,%xmm5,%xmm5
+	vpunpckhqdq	%xmm12,%xmm12,%xmm9
+	vpclmulqdq	$0x11,%xmm0,%xmm13,%xmm13
+	vpxor	%xmm12,%xmm9,%xmm9
+	vpxor	%xmm14,%xmm13,%xmm13
+	vpalignr	$8,%xmm8,%xmm8,%xmm14
+	vpclmulqdq	$0x10,%xmm15,%xmm2,%xmm2
+	vmovdqu	80-32(%r9),%xmm15
+	vpxor	%xmm1,%xmm2,%xmm2
+
+	vpclmulqdq	$0x00,%xmm3,%xmm12,%xmm4
+	vmovdqu	64-32(%r9),%xmm0
+	vpxor	%xmm5,%xmm4,%xmm4
+	vpunpckhqdq	%xmm11,%xmm11,%xmm1
+	vpclmulqdq	$0x11,%xmm3,%xmm12,%xmm12
+	vpxor	%xmm11,%xmm1,%xmm1
+	vpxor	%xmm13,%xmm12,%xmm12
+	vxorps	16(%rsp),%xmm7,%xmm7
+	vpclmulqdq	$0x00,%xmm15,%xmm9,%xmm9
+	vpxor	%xmm2,%xmm9,%xmm9
+
+	vpclmulqdq	$0x10,16(%r11),%xmm8,%xmm8
+	vxorps	%xmm14,%xmm8,%xmm8
+
+	vpclmulqdq	$0x00,%xmm0,%xmm11,%xmm5
+	vmovdqu	96-32(%r9),%xmm3
+	vpxor	%xmm4,%xmm5,%xmm5
+	vpunpckhqdq	%xmm10,%xmm10,%xmm2
+	vpclmulqdq	$0x11,%xmm0,%xmm11,%xmm11
+	vpxor	%xmm10,%xmm2,%xmm2
+	vpalignr	$8,%xmm8,%xmm8,%xmm14
+	vpxor	%xmm12,%xmm11,%xmm11
+	vpclmulqdq	$0x10,%xmm15,%xmm1,%xmm1
+	vmovdqu	128-32(%r9),%xmm15
+	vpxor	%xmm9,%xmm1,%xmm1
+
+	vxorps	%xmm7,%xmm14,%xmm14
+	vpclmulqdq	$0x10,16(%r11),%xmm8,%xmm8
+	vxorps	%xmm14,%xmm8,%xmm8
+
+	vpclmulqdq	$0x00,%xmm3,%xmm10,%xmm4
+	vmovdqu	112-32(%r9),%xmm0
+	vpxor	%xmm5,%xmm4,%xmm4
+	vpunpckhqdq	%xmm8,%xmm8,%xmm9
+	vpclmulqdq	$0x11,%xmm3,%xmm10,%xmm10
+	vpxor	%xmm8,%xmm9,%xmm9
+	vpxor	%xmm11,%xmm10,%xmm10
+	vpclmulqdq	$0x00,%xmm15,%xmm2,%xmm2
+	vpxor	%xmm1,%xmm2,%xmm2
+
+	vpclmulqdq	$0x00,%xmm0,%xmm8,%xmm5
+	vpclmulqdq	$0x11,%xmm0,%xmm8,%xmm7
+	vpxor	%xmm4,%xmm5,%xmm5
+	vpclmulqdq	$0x10,%xmm15,%xmm9,%xmm6
+	vpxor	%xmm10,%xmm7,%xmm7
+	vpxor	%xmm2,%xmm6,%xmm6
+
+	vpxor	%xmm5,%xmm7,%xmm4
+	vpxor	%xmm4,%xmm6,%xmm6
+	vpslldq	$8,%xmm6,%xmm1
+	vmovdqu	16(%r11),%xmm3
+	vpsrldq	$8,%xmm6,%xmm6
+	vpxor	%xmm1,%xmm5,%xmm8
+	vpxor	%xmm6,%xmm7,%xmm7
+
+	vpalignr	$8,%xmm8,%xmm8,%xmm2
+	vpclmulqdq	$0x10,%xmm3,%xmm8,%xmm8
+	vpxor	%xmm2,%xmm8,%xmm8
+
+	vpalignr	$8,%xmm8,%xmm8,%xmm2
+	vpclmulqdq	$0x10,%xmm3,%xmm8,%xmm8
+	vpxor	%xmm7,%xmm2,%xmm2
+	vpxor	%xmm2,%xmm8,%xmm8
+	vpshufb	(%r11),%xmm8,%xmm8
+	vmovdqu	%xmm8,-64(%r9)
+
+	vzeroupper
+	movq	-48(%rax),%r15
+.cfi_restore	%r15
+	movq	-40(%rax),%r14
+.cfi_restore	%r14
+	movq	-32(%rax),%r13
+.cfi_restore	%r13
+	movq	-24(%rax),%r12
+.cfi_restore	%r12
+	movq	-16(%rax),%rbp
+.cfi_restore	%rbp
+	movq	-8(%rax),%rbx
+.cfi_restore	%rbx
+	leaq	(%rax),%rsp
+.cfi_def_cfa_register	%rsp
+.Lgcm_enc_abort:
+	movq	%r10,%rax
+	.byte	0xf3,0xc3
+.cfi_endproc	
+.size	aesni_gcm_encrypt,.-aesni_gcm_encrypt
+.align	64
+.Lbswap_mask:
+.byte	15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0
+.Lpoly:
+.byte	0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0xc2
+.Lone_msb:
+.byte	0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1
+.Ltwo_lsb:
+.byte	2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
+.Lone_lsb:
+.byte	1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
+.byte	65,69,83,45,78,73,32,71,67,77,32,109,111,100,117,108,101,32,102,111,114,32,120,56,54,95,54,52,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0
+.align	64
+#endif
+.section	.note.GNU-stack,"",@progbits
--- /dev/null
+++ b/third_party/boringssl/linux-x86_64/crypto/fipsmodule/aesni-x86_64.S
@@ -1,0 +1,2506 @@
+// This file is generated from a similarly-named Perl script in the BoringSSL
+// source tree. Do not edit by hand.
+
+#if defined(__has_feature)
+#if __has_feature(memory_sanitizer) && !defined(OPENSSL_NO_ASM)
+#define OPENSSL_NO_ASM
+#endif
+#endif
+
+#if defined(__x86_64__) && !defined(OPENSSL_NO_ASM)
+#if defined(BORINGSSL_PREFIX)
+#include <boringssl_prefix_symbols_asm.h>
+#endif
+.text	
+.extern	OPENSSL_ia32cap_P
+.hidden OPENSSL_ia32cap_P
+.globl	aes_hw_encrypt
+.hidden aes_hw_encrypt
+.type	aes_hw_encrypt,@function
+.align	16
+aes_hw_encrypt:
+.cfi_startproc	
+#ifdef BORINGSSL_DISPATCH_TEST
+.extern	BORINGSSL_function_hit
+.hidden BORINGSSL_function_hit
+	movb	$1,BORINGSSL_function_hit+1(%rip)
+#endif
+	movups	(%rdi),%xmm2
+	movl	240(%rdx),%eax
+	movups	(%rdx),%xmm0
+	movups	16(%rdx),%xmm1
+	leaq	32(%rdx),%rdx
+	xorps	%xmm0,%xmm2
+.Loop_enc1_1:
+.byte	102,15,56,220,209
+	decl	%eax
+	movups	(%rdx),%xmm1
+	leaq	16(%rdx),%rdx
+	jnz	.Loop_enc1_1
+.byte	102,15,56,221,209
+	pxor	%xmm0,%xmm0
+	pxor	%xmm1,%xmm1
+	movups	%xmm2,(%rsi)
+	pxor	%xmm2,%xmm2
+	.byte	0xf3,0xc3
+.cfi_endproc	
+.size	aes_hw_encrypt,.-aes_hw_encrypt
+
+.globl	aes_hw_decrypt
+.hidden aes_hw_decrypt
+.type	aes_hw_decrypt,@function
+.align	16
+aes_hw_decrypt:
+.cfi_startproc	
+	movups	(%rdi),%xmm2
+	movl	240(%rdx),%eax
+	movups	(%rdx),%xmm0
+	movups	16(%rdx),%xmm1
+	leaq	32(%rdx),%rdx
+	xorps	%xmm0,%xmm2
+.Loop_dec1_2:
+.byte	102,15,56,222,209
+	decl	%eax
+	movups	(%rdx),%xmm1
+	leaq	16(%rdx),%rdx
+	jnz	.Loop_dec1_2
+.byte	102,15,56,223,209
+	pxor	%xmm0,%xmm0
+	pxor	%xmm1,%xmm1
+	movups	%xmm2,(%rsi)
+	pxor	%xmm2,%xmm2
+	.byte	0xf3,0xc3
+.cfi_endproc	
+.size	aes_hw_decrypt, .-aes_hw_decrypt
+.type	_aesni_encrypt2,@function
+.align	16
+_aesni_encrypt2:
+.cfi_startproc	
+	movups	(%rcx),%xmm0
+	shll	$4,%eax
+	movups	16(%rcx),%xmm1
+	xorps	%xmm0,%xmm2
+	xorps	%xmm0,%xmm3
+	movups	32(%rcx),%xmm0
+	leaq	32(%rcx,%rax,1),%rcx
+	negq	%rax
+	addq	$16,%rax
+
+.Lenc_loop2:
+.byte	102,15,56,220,209
+.byte	102,15,56,220,217
+	movups	(%rcx,%rax,1),%xmm1
+	addq	$32,%rax
+.byte	102,15,56,220,208
+.byte	102,15,56,220,216
+	movups	-16(%rcx,%rax,1),%xmm0
+	jnz	.Lenc_loop2
+
+.byte	102,15,56,220,209
+.byte	102,15,56,220,217
+.byte	102,15,56,221,208
+.byte	102,15,56,221,216
+	.byte	0xf3,0xc3
+.cfi_endproc	
+.size	_aesni_encrypt2,.-_aesni_encrypt2
+.type	_aesni_decrypt2,@function
+.align	16
+_aesni_decrypt2:
+.cfi_startproc	
+	movups	(%rcx),%xmm0
+	shll	$4,%eax
+	movups	16(%rcx),%xmm1
+	xorps	%xmm0,%xmm2
+	xorps	%xmm0,%xmm3
+	movups	32(%rcx),%xmm0
+	leaq	32(%rcx,%rax,1),%rcx
+	negq	%rax
+	addq	$16,%rax
+
+.Ldec_loop2:
+.byte	102,15,56,222,209
+.byte	102,15,56,222,217
+	movups	(%rcx,%rax,1),%xmm1
+	addq	$32,%rax
+.byte	102,15,56,222,208
+.byte	102,15,56,222,216
+	movups	-16(%rcx,%rax,1),%xmm0
+	jnz	.Ldec_loop2
+
+.byte	102,15,56,222,209
+.byte	102,15,56,222,217
+.byte	102,15,56,223,208
+.byte	102,15,56,223,216
+	.byte	0xf3,0xc3
+.cfi_endproc	
+.size	_aesni_decrypt2,.-_aesni_decrypt2
+.type	_aesni_encrypt3,@function
+.align	16
+_aesni_encrypt3:
+.cfi_startproc	
+	movups	(%rcx),%xmm0
+	shll	$4,%eax
+	movups	16(%rcx),%xmm1
+	xorps	%xmm0,%xmm2
+	xorps	%xmm0,%xmm3
+	xorps	%xmm0,%xmm4
+	movups	32(%rcx),%xmm0
+	leaq	32(%rcx,%rax,1),%rcx
+	negq	%rax
+	addq	$16,%rax
+
+.Lenc_loop3:
+.byte	102,15,56,220,209
+.byte	102,15,56,220,217
+.byte	102,15,56,220,225
+	movups	(%rcx,%rax,1),%xmm1
+	addq	$32,%rax
+.byte	102,15,56,220,208
+.byte	102,15,56,220,216
+.byte	102,15,56,220,224
+	movups	-16(%rcx,%rax,1),%xmm0
+	jnz	.Lenc_loop3
+
+.byte	102,15,56,220,209
+.byte	102,15,56,220,217
+.byte	102,15,56,220,225
+.byte	102,15,56,221,208
+.byte	102,15,56,221,216
+.byte	102,15,56,221,224
+	.byte	0xf3,0xc3
+.cfi_endproc	
+.size	_aesni_encrypt3,.-_aesni_encrypt3
+.type	_aesni_decrypt3,@function
+.align	16
+_aesni_decrypt3:
+.cfi_startproc	
+	movups	(%rcx),%xmm0
+	shll	$4,%eax
+	movups	16(%rcx),%xmm1
+	xorps	%xmm0,%xmm2
+	xorps	%xmm0,%xmm3
+	xorps	%xmm0,%xmm4
+	movups	32(%rcx),%xmm0
+	leaq	32(%rcx,%rax,1),%rcx
+	negq	%rax
+	addq	$16,%rax
+
+.Ldec_loop3:
+.byte	102,15,56,222,209
+.byte	102,15,56,222,217
+.byte	102,15,56,222,225
+	movups	(%rcx,%rax,1),%xmm1
+	addq	$32,%rax
+.byte	102,15,56,222,208
+.byte	102,15,56,222,216
+.byte	102,15,56,222,224
+	movups	-16(%rcx,%rax,1),%xmm0
+	jnz	.Ldec_loop3
+
+.byte	102,15,56,222,209
+.byte	102,15,56,222,217
+.byte	102,15,56,222,225
+.byte	102,15,56,223,208
+.byte	102,15,56,223,216
+.byte	102,15,56,223,224
+	.byte	0xf3,0xc3
+.cfi_endproc	
+.size	_aesni_decrypt3,.-_aesni_decrypt3
+.type	_aesni_encrypt4,@function
+.align	16
+_aesni_encrypt4:
+.cfi_startproc	
+	movups	(%rcx),%xmm0
+	shll	$4,%eax
+	movups	16(%rcx),%xmm1
+	xorps	%xmm0,%xmm2
+	xorps	%xmm0,%xmm3
+	xorps	%xmm0,%xmm4
+	xorps	%xmm0,%xmm5
+	movups	32(%rcx),%xmm0
+	leaq	32(%rcx,%rax,1),%rcx
+	negq	%rax
+.byte	0x0f,0x1f,0x00
+	addq	$16,%rax
+
+.Lenc_loop4:
+.byte	102,15,56,220,209
+.byte	102,15,56,220,217
+.byte	102,15,56,220,225
+.byte	102,15,56,220,233
+	movups	(%rcx,%rax,1),%xmm1
+	addq	$32,%rax
+.byte	102,15,56,220,208
+.byte	102,15,56,220,216
+.byte	102,15,56,220,224
+.byte	102,15,56,220,232
+	movups	-16(%rcx,%rax,1),%xmm0
+	jnz	.Lenc_loop4
+
+.byte	102,15,56,220,209
+.byte	102,15,56,220,217
+.byte	102,15,56,220,225
+.byte	102,15,56,220,233
+.byte	102,15,56,221,208
+.byte	102,15,56,221,216
+.byte	102,15,56,221,224
+.byte	102,15,56,221,232
+	.byte	0xf3,0xc3
+.cfi_endproc	
+.size	_aesni_encrypt4,.-_aesni_encrypt4
+.type	_aesni_decrypt4,@function
+.align	16
+_aesni_decrypt4:
+.cfi_startproc	
+	movups	(%rcx),%xmm0
+	shll	$4,%eax
+	movups	16(%rcx),%xmm1
+	xorps	%xmm0,%xmm2
+	xorps	%xmm0,%xmm3
+	xorps	%xmm0,%xmm4
+	xorps	%xmm0,%xmm5
+	movups	32(%rcx),%xmm0
+	leaq	32(%rcx,%rax,1),%rcx
+	negq	%rax
+.byte	0x0f,0x1f,0x00
+	addq	$16,%rax
+
+.Ldec_loop4:
+.byte	102,15,56,222,209
+.byte	102,15,56,222,217
+.byte	102,15,56,222,225
+.byte	102,15,56,222,233
+	movups	(%rcx,%rax,1),%xmm1
+	addq	$32,%rax
+.byte	102,15,56,222,208
+.byte	102,15,56,222,216
+.byte	102,15,56,222,224
+.byte	102,15,56,222,232
+	movups	-16(%rcx,%rax,1),%xmm0
+	jnz	.Ldec_loop4
+
+.byte	102,15,56,222,209
+.byte	102,15,56,222,217
+.byte	102,15,56,222,225
+.byte	102,15,56,222,233
+.byte	102,15,56,223,208
+.byte	102,15,56,223,216
+.byte	102,15,56,223,224
+.byte	102,15,56,223,232
+	.byte	0xf3,0xc3
+.cfi_endproc	
+.size	_aesni_decrypt4,.-_aesni_decrypt4
+.type	_aesni_encrypt6,@function
+.align	16
+_aesni_encrypt6:
+.cfi_startproc	
+	movups	(%rcx),%xmm0
+	shll	$4,%eax
+	movups	16(%rcx),%xmm1
+	xorps	%xmm0,%xmm2
+	pxor	%xmm0,%xmm3
+	pxor	%xmm0,%xmm4
+.byte	102,15,56,220,209
+	leaq	32(%rcx,%rax,1),%rcx
+	negq	%rax
+.byte	102,15,56,220,217
+	pxor	%xmm0,%xmm5
+	pxor	%xmm0,%xmm6
+.byte	102,15,56,220,225
+	pxor	%xmm0,%xmm7
+	movups	(%rcx,%rax,1),%xmm0
+	addq	$16,%rax
+	jmp	.Lenc_loop6_enter
+.align	16
+.Lenc_loop6:
+.byte	102,15,56,220,209
+.byte	102,15,56,220,217
+.byte	102,15,56,220,225
+.Lenc_loop6_enter:
+.byte	102,15,56,220,233
+.byte	102,15,56,220,241
+.byte	102,15,56,220,249
+	movups	(%rcx,%rax,1),%xmm1
+	addq	$32,%rax
+.byte	102,15,56,220,208
+.byte	102,15,56,220,216
+.byte	102,15,56,220,224
+.byte	102,15,56,220,232
+.byte	102,15,56,220,240
+.byte	102,15,56,220,248
+	movups	-16(%rcx,%rax,1),%xmm0
+	jnz	.Lenc_loop6
+
+.byte	102,15,56,220,209
+.byte	102,15,56,220,217
+.byte	102,15,56,220,225
+.byte	102,15,56,220,233
+.byte	102,15,56,220,241
+.byte	102,15,56,220,249
+.byte	102,15,56,221,208
+.byte	102,15,56,221,216
+.byte	102,15,56,221,224
+.byte	102,15,56,221,232
+.byte	102,15,56,221,240
+.byte	102,15,56,221,248
+	.byte	0xf3,0xc3
+.cfi_endproc	
+.size	_aesni_encrypt6,.-_aesni_encrypt6
+.type	_aesni_decrypt6,@function
+.align	16
+_aesni_decrypt6:
+.cfi_startproc	
+	movups	(%rcx),%xmm0
+	shll	$4,%eax
+	movups	16(%rcx),%xmm1
+	xorps	%xmm0,%xmm2
+	pxor	%xmm0,%xmm3
+	pxor	%xmm0,%xmm4
+.byte	102,15,56,222,209
+	leaq	32(%rcx,%rax,1),%rcx
+	negq	%rax
+.byte	102,15,56,222,217
+	pxor	%xmm0,%xmm5
+	pxor	%xmm0,%xmm6
+.byte	102,15,56,222,225
+	pxor	%xmm0,%xmm7
+	movups	(%rcx,%rax,1),%xmm0
+	addq	$16,%rax
+	jmp	.Ldec_loop6_enter
+.align	16
+.Ldec_loop6:
+.byte	102,15,56,222,209
+.byte	102,15,56,222,217
+.byte	102,15,56,222,225
+.Ldec_loop6_enter:
+.byte	102,15,56,222,233
+.byte	102,15,56,222,241
+.byte	102,15,56,222,249
+	movups	(%rcx,%rax,1),%xmm1
+	addq	$32,%rax
+.byte	102,15,56,222,208
+.byte	102,15,56,222,216
+.byte	102,15,56,222,224
+.byte	102,15,56,222,232
+.byte	102,15,56,222,240
+.byte	102,15,56,222,248
+	movups	-16(%rcx,%rax,1),%xmm0
+	jnz	.Ldec_loop6
+
+.byte	102,15,56,222,209
+.byte	102,15,56,222,217
+.byte	102,15,56,222,225
+.byte	102,15,56,222,233
+.byte	102,15,56,222,241
+.byte	102,15,56,222,249
+.byte	102,15,56,223,208
+.byte	102,15,56,223,216
+.byte	102,15,56,223,224
+.byte	102,15,56,223,232
+.byte	102,15,56,223,240
+.byte	102,15,56,223,248
+	.byte	0xf3,0xc3
+.cfi_endproc	
+.size	_aesni_decrypt6,.-_aesni_decrypt6
+.type	_aesni_encrypt8,@function
+.align	16
+_aesni_encrypt8:
+.cfi_startproc	
+	movups	(%rcx),%xmm0
+	shll	$4,%eax
+	movups	16(%rcx),%xmm1
+	xorps	%xmm0,%xmm2
+	xorps	%xmm0,%xmm3
+	pxor	%xmm0,%xmm4
+	pxor	%xmm0,%xmm5
+	pxor	%xmm0,%xmm6
+	leaq	32(%rcx,%rax,1),%rcx
+	negq	%rax
+.byte	102,15,56,220,209
+	pxor	%xmm0,%xmm7
+	pxor	%xmm0,%xmm8
+.byte	102,15,56,220,217
+	pxor	%xmm0,%xmm9
+	movups	(%rcx,%rax,1),%xmm0
+	addq	$16,%rax
+	jmp	.Lenc_loop8_inner
+.align	16
+.Lenc_loop8:
+.byte	102,15,56,220,209
+.byte	102,15,56,220,217
+.Lenc_loop8_inner:
+.byte	102,15,56,220,225
+.byte	102,15,56,220,233
+.byte	102,15,56,220,241
+.byte	102,15,56,220,249
+.byte	102,68,15,56,220,193
+.byte	102,68,15,56,220,201
+.Lenc_loop8_enter:
+	movups	(%rcx,%rax,1),%xmm1
+	addq	$32,%rax
+.byte	102,15,56,220,208
+.byte	102,15,56,220,216
+.byte	102,15,56,220,224
+.byte	102,15,56,220,232
+.byte	102,15,56,220,240
+.byte	102,15,56,220,248
+.byte	102,68,15,56,220,192
+.byte	102,68,15,56,220,200
+	movups	-16(%rcx,%rax,1),%xmm0
+	jnz	.Lenc_loop8
+
+.byte	102,15,56,220,209
+.byte	102,15,56,220,217
+.byte	102,15,56,220,225
+.byte	102,15,56,220,233
+.byte	102,15,56,220,241
+.byte	102,15,56,220,249
+.byte	102,68,15,56,220,193
+.byte	102,68,15,56,220,201
+.byte	102,15,56,221,208
+.byte	102,15,56,221,216
+.byte	102,15,56,221,224
+.byte	102,15,56,221,232
+.byte	102,15,56,221,240
+.byte	102,15,56,221,248
+.byte	102,68,15,56,221,192
+.byte	102,68,15,56,221,200
+	.byte	0xf3,0xc3
+.cfi_endproc	
+.size	_aesni_encrypt8,.-_aesni_encrypt8
+.type	_aesni_decrypt8,@function
+.align	16
+_aesni_decrypt8:
+.cfi_startproc	
+	movups	(%rcx),%xmm0
+	shll	$4,%eax
+	movups	16(%rcx),%xmm1
+	xorps	%xmm0,%xmm2
+	xorps	%xmm0,%xmm3
+	pxor	%xmm0,%xmm4
+	pxor	%xmm0,%xmm5
+	pxor	%xmm0,%xmm6
+	leaq	32(%rcx,%rax,1),%rcx
+	negq	%rax
+.byte	102,15,56,222,209
+	pxor	%xmm0,%xmm7
+	pxor	%xmm0,%xmm8
+.byte	102,15,56,222,217
+	pxor	%xmm0,%xmm9
+	movups	(%rcx,%rax,1),%xmm0
+	addq	$16,%rax
+	jmp	.Ldec_loop8_inner
+.align	16
+.Ldec_loop8:
+.byte	102,15,56,222,209
+.byte	102,15,56,222,217
+.Ldec_loop8_inner:
+.byte	102,15,56,222,225
+.byte	102,15,56,222,233
+.byte	102,15,56,222,241
+.byte	102,15,56,222,249
+.byte	102,68,15,56,222,193
+.byte	102,68,15,56,222,201
+.Ldec_loop8_enter:
+	movups	(%rcx,%rax,1),%xmm1
+	addq	$32,%rax
+.byte	102,15,56,222,208
+.byte	102,15,56,222,216
+.byte	102,15,56,222,224
+.byte	102,15,56,222,232
+.byte	102,15,56,222,240
+.byte	102,15,56,222,248
+.byte	102,68,15,56,222,192
+.byte	102,68,15,56,222,200
+	movups	-16(%rcx,%rax,1),%xmm0
+	jnz	.Ldec_loop8
+
+.byte	102,15,56,222,209
+.byte	102,15,56,222,217
+.byte	102,15,56,222,225
+.byte	102,15,56,222,233
+.byte	102,15,56,222,241
+.byte	102,15,56,222,249
+.byte	102,68,15,56,222,193
+.byte	102,68,15,56,222,201
+.byte	102,15,56,223,208
+.byte	102,15,56,223,216
+.byte	102,15,56,223,224
+.byte	102,15,56,223,232
+.byte	102,15,56,223,240
+.byte	102,15,56,223,248
+.byte	102,68,15,56,223,192
+.byte	102,68,15,56,223,200
+	.byte	0xf3,0xc3
+.cfi_endproc	
+.size	_aesni_decrypt8,.-_aesni_decrypt8
+.globl	aes_hw_ecb_encrypt
+.hidden aes_hw_ecb_encrypt
+.type	aes_hw_ecb_encrypt,@function
+.align	16
+aes_hw_ecb_encrypt:
+.cfi_startproc	
+	andq	$-16,%rdx
+	jz	.Lecb_ret
+
+	movl	240(%rcx),%eax
+	movups	(%rcx),%xmm0
+	movq	%rcx,%r11
+	movl	%eax,%r10d
+	testl	%r8d,%r8d
+	jz	.Lecb_decrypt
+
+	cmpq	$0x80,%rdx
+	jb	.Lecb_enc_tail
+
+	movdqu	(%rdi),%xmm2
+	movdqu	16(%rdi),%xmm3
+	movdqu	32(%rdi),%xmm4
+	movdqu	48(%rdi),%xmm5
+	movdqu	64(%rdi),%xmm6
+	movdqu	80(%rdi),%xmm7
+	movdqu	96(%rdi),%xmm8
+	movdqu	112(%rdi),%xmm9
+	leaq	128(%rdi),%rdi
+	subq	$0x80,%rdx
+	jmp	.Lecb_enc_loop8_enter
+.align	16
+.Lecb_enc_loop8:
+	movups	%xmm2,(%rsi)
+	movq	%r11,%rcx
+	movdqu	(%rdi),%xmm2
+	movl	%r10d,%eax
+	movups	%xmm3,16(%rsi)
+	movdqu	16(%rdi),%xmm3
+	movups	%xmm4,32(%rsi)
+	movdqu	32(%rdi),%xmm4
+	movups	%xmm5,48(%rsi)
+	movdqu	48(%rdi),%xmm5
+	movups	%xmm6,64(%rsi)
+	movdqu	64(%rdi),%xmm6
+	movups	%xmm7,80(%rsi)
+	movdqu	80(%rdi),%xmm7
+	movups	%xmm8,96(%rsi)
+	movdqu	96(%rdi),%xmm8
+	movups	%xmm9,112(%rsi)
+	leaq	128(%rsi),%rsi
+	movdqu	112(%rdi),%xmm9
+	leaq	128(%rdi),%rdi
+.Lecb_enc_loop8_enter:
+
+	call	_aesni_encrypt8
+
+	subq	$0x80,%rdx
+	jnc	.Lecb_enc_loop8
+
+	movups	%xmm2,(%rsi)
+	movq	%r11,%rcx
+	movups	%xmm3,16(%rsi)
+	movl	%r10d,%eax
+	movups	%xmm4,32(%rsi)
+	movups	%xmm5,48(%rsi)
+	movups	%xmm6,64(%rsi)
+	movups	%xmm7,80(%rsi)
+	movups	%xmm8,96(%rsi)
+	movups	%xmm9,112(%rsi)
+	leaq	128(%rsi),%rsi
+	addq	$0x80,%rdx
+	jz	.Lecb_ret
+
+.Lecb_enc_tail:
+	movups	(%rdi),%xmm2
+	cmpq	$0x20,%rdx
+	jb	.Lecb_enc_one
+	movups	16(%rdi),%xmm3
+	je	.Lecb_enc_two
+	movups	32(%rdi),%xmm4
+	cmpq	$0x40,%rdx
+	jb	.Lecb_enc_three
+	movups	48(%rdi),%xmm5
+	je	.Lecb_enc_four
+	movups	64(%rdi),%xmm6
+	cmpq	$0x60,%rdx
+	jb	.Lecb_enc_five
+	movups	80(%rdi),%xmm7
+	je	.Lecb_enc_six
+	movdqu	96(%rdi),%xmm8
+	xorps	%xmm9,%xmm9
+	call	_aesni_encrypt8
+	movups	%xmm2,(%rsi)
+	movups	%xmm3,16(%rsi)
+	movups	%xmm4,32(%rsi)
+	movups	%xmm5,48(%rsi)
+	movups	%xmm6,64(%rsi)
+	movups	%xmm7,80(%rsi)
+	movups	%xmm8,96(%rsi)
+	jmp	.Lecb_ret
+.align	16
+.Lecb_enc_one:
+	movups	(%rcx),%xmm0
+	movups	16(%rcx),%xmm1
+	leaq	32(%rcx),%rcx
+	xorps	%xmm0,%xmm2
+.Loop_enc1_3:
+.byte	102,15,56,220,209
+	decl	%eax
+	movups	(%rcx),%xmm1
+	leaq	16(%rcx),%rcx
+	jnz	.Loop_enc1_3
+.byte	102,15,56,221,209
+	movups	%xmm2,(%rsi)
+	jmp	.Lecb_ret
+.align	16
+.Lecb_enc_two:
+	call	_aesni_encrypt2
+	movups	%xmm2,(%rsi)
+	movups	%xmm3,16(%rsi)
+	jmp	.Lecb_ret
+.align	16
+.Lecb_enc_three:
+	call	_aesni_encrypt3
+	movups	%xmm2,(%rsi)
+	movups	%xmm3,16(%rsi)
+	movups	%xmm4,32(%rsi)
+	jmp	.Lecb_ret
+.align	16
+.Lecb_enc_four:
+	call	_aesni_encrypt4
+	movups	%xmm2,(%rsi)
+	movups	%xmm3,16(%rsi)
+	movups	%xmm4,32(%rsi)
+	movups	%xmm5,48(%rsi)
+	jmp	.Lecb_ret
+.align	16
+.Lecb_enc_five:
+	xorps	%xmm7,%xmm7
+	call	_aesni_encrypt6
+	movups	%xmm2,(%rsi)
+	movups	%xmm3,16(%rsi)
+	movups	%xmm4,32(%rsi)
+	movups	%xmm5,48(%rsi)
+	movups	%xmm6,64(%rsi)
+	jmp	.Lecb_ret
+.align	16
+.Lecb_enc_six:
+	call	_aesni_encrypt6
+	movups	%xmm2,(%rsi)
+	movups	%xmm3,16(%rsi)
+	movups	%xmm4,32(%rsi)
+	movups	%xmm5,48(%rsi)
+	movups	%xmm6,64(%rsi)
+	movups	%xmm7,80(%rsi)
+	jmp	.Lecb_ret
+
+.align	16
+.Lecb_decrypt:
+	cmpq	$0x80,%rdx
+	jb	.Lecb_dec_tail
+
+	movdqu	(%rdi),%xmm2
+	movdqu	16(%rdi),%xmm3
+	movdqu	32(%rdi),%xmm4
+	movdqu	48(%rdi),%xmm5
+	movdqu	64(%rdi),%xmm6
+	movdqu	80(%rdi),%xmm7
+	movdqu	96(%rdi),%xmm8
+	movdqu	112(%rdi),%xmm9
+	leaq	128(%rdi),%rdi
+	subq	$0x80,%rdx
+	jmp	.Lecb_dec_loop8_enter
+.align	16
+.Lecb_dec_loop8:
+	movups	%xmm2,(%rsi)
+	movq	%r11,%rcx
+	movdqu	(%rdi),%xmm2
+	movl	%r10d,%eax
+	movups	%xmm3,16(%rsi)
+	movdqu	16(%rdi),%xmm3
+	movups	%xmm4,32(%rsi)
+	movdqu	32(%rdi),%xmm4
+	movups	%xmm5,48(%rsi)
+	movdqu	48(%rdi),%xmm5
+	movups	%xmm6,64(%rsi)
+	movdqu	64(%rdi),%xmm6
+	movups	%xmm7,80(%rsi)
+	movdqu	80(%rdi),%xmm7
+	movups	%xmm8,96(%rsi)
+	movdqu	96(%rdi),%xmm8
+	movups	%xmm9,112(%rsi)
+	leaq	128(%rsi),%rsi
+	movdqu	112(%rdi),%xmm9
+	leaq	128(%rdi),%rdi
+.Lecb_dec_loop8_enter:
+
+	call	_aesni_decrypt8
+
+	movups	(%r11),%xmm0
+	subq	$0x80,%rdx
+	jnc	.Lecb_dec_loop8
+
+	movups	%xmm2,(%rsi)
+	pxor	%xmm2,%xmm2
+	movq	%r11,%rcx
+	movups	%xmm3,16(%rsi)
+	pxor	%xmm3,%xmm3
+	movl	%r10d,%eax
+	movups	%xmm4,32(%rsi)
+	pxor	%xmm4,%xmm4
+	movups	%xmm5,48(%rsi)
+	pxor	%xmm5,%xmm5
+	movups	%xmm6,64(%rsi)
+	pxor	%xmm6,%xmm6
+	movups	%xmm7,80(%rsi)
+	pxor	%xmm7,%xmm7
+	movups	%xmm8,96(%rsi)
+	pxor	%xmm8,%xmm8
+	movups	%xmm9,112(%rsi)
+	pxor	%xmm9,%xmm9
+	leaq	128(%rsi),%rsi
+	addq	$0x80,%rdx
+	jz	.Lecb_ret
+
+.Lecb_dec_tail:
+	movups	(%rdi),%xmm2
+	cmpq	$0x20,%rdx
+	jb	.Lecb_dec_one
+	movups	16(%rdi),%xmm3
+	je	.Lecb_dec_two
+	movups	32(%rdi),%xmm4
+	cmpq	$0x40,%rdx
+	jb	.Lecb_dec_three
+	movups	48(%rdi),%xmm5
+	je	.Lecb_dec_four
+	movups	64(%rdi),%xmm6
+	cmpq	$0x60,%rdx
+	jb	.Lecb_dec_five
+	movups	80(%rdi),%xmm7
+	je	.Lecb_dec_six
+	movups	96(%rdi),%xmm8
+	movups	(%rcx),%xmm0
+	xorps	%xmm9,%xmm9
+	call	_aesni_decrypt8
+	movups	%xmm2,(%rsi)
+	pxor	%xmm2,%xmm2
+	movups	%xmm3,16(%rsi)
+	pxor	%xmm3,%xmm3
+	movups	%xmm4,32(%rsi)
+	pxor	%xmm4,%xmm4
+	movups	%xmm5,48(%rsi)
+	pxor	%xmm5,%xmm5
+	movups	%xmm6,64(%rsi)
+	pxor	%xmm6,%xmm6
+	movups	%xmm7,80(%rsi)
+	pxor	%xmm7,%xmm7
+	movups	%xmm8,96(%rsi)
+	pxor	%xmm8,%xmm8
+	pxor	%xmm9,%xmm9
+	jmp	.Lecb_ret
+.align	16
+.Lecb_dec_one:
+	movups	(%rcx),%xmm0
+	movups	16(%rcx),%xmm1
+	leaq	32(%rcx),%rcx
+	xorps	%xmm0,%xmm2
+.Loop_dec1_4:
+.byte	102,15,56,222,209
+	decl	%eax
+	movups	(%rcx),%xmm1
+	leaq	16(%rcx),%rcx
+	jnz	.Loop_dec1_4
+.byte	102,15,56,223,209
+	movups	%xmm2,(%rsi)
+	pxor	%xmm2,%xmm2
+	jmp	.Lecb_ret
+.align	16
+.Lecb_dec_two:
+	call	_aesni_decrypt2
+	movups	%xmm2,(%rsi)
+	pxor	%xmm2,%xmm2
+	movups	%xmm3,16(%rsi)
+	pxor	%xmm3,%xmm3
+	jmp	.Lecb_ret
+.align	16
+.Lecb_dec_three:
+	call	_aesni_decrypt3
+	movups	%xmm2,(%rsi)
+	pxor	%xmm2,%xmm2
+	movups	%xmm3,16(%rsi)
+	pxor	%xmm3,%xmm3
+	movups	%xmm4,32(%rsi)
+	pxor	%xmm4,%xmm4
+	jmp	.Lecb_ret
+.align	16
+.Lecb_dec_four:
+	call	_aesni_decrypt4
+	movups	%xmm2,(%rsi)
+	pxor	%xmm2,%xmm2
+	movups	%xmm3,16(%rsi)
+	pxor	%xmm3,%xmm3
+	movups	%xmm4,32(%rsi)
+	pxor	%xmm4,%xmm4
+	movups	%xmm5,48(%rsi)
+	pxor	%xmm5,%xmm5
+	jmp	.Lecb_ret
+.align	16
+.Lecb_dec_five:
+	xorps	%xmm7,%xmm7
+	call	_aesni_decrypt6
+	movups	%xmm2,(%rsi)
+	pxor	%xmm2,%xmm2
+	movups	%xmm3,16(%rsi)
+	pxor	%xmm3,%xmm3
+	movups	%xmm4,32(%rsi)
+	pxor	%xmm4,%xmm4
+	movups	%xmm5,48(%rsi)
+	pxor	%xmm5,%xmm5
+	movups	%xmm6,64(%rsi)
+	pxor	%xmm6,%xmm6
+	pxor	%xmm7,%xmm7
+	jmp	.Lecb_ret
+.align	16
+.Lecb_dec_six:
+	call	_aesni_decrypt6
+	movups	%xmm2,(%rsi)
+	pxor	%xmm2,%xmm2
+	movups	%xmm3,16(%rsi)
+	pxor	%xmm3,%xmm3
+	movups	%xmm4,32(%rsi)
+	pxor	%xmm4,%xmm4
+	movups	%xmm5,48(%rsi)
+	pxor	%xmm5,%xmm5
+	movups	%xmm6,64(%rsi)
+	pxor	%xmm6,%xmm6
+	movups	%xmm7,80(%rsi)
+	pxor	%xmm7,%xmm7
+
+.Lecb_ret:
+	xorps	%xmm0,%xmm0
+	pxor	%xmm1,%xmm1
+	.byte	0xf3,0xc3
+.cfi_endproc	
+.size	aes_hw_ecb_encrypt,.-aes_hw_ecb_encrypt
+.globl	aes_hw_ctr32_encrypt_blocks
+.hidden aes_hw_ctr32_encrypt_blocks
+.type	aes_hw_ctr32_encrypt_blocks,@function
+.align	16
+aes_hw_ctr32_encrypt_blocks:
+.cfi_startproc	
+#ifdef BORINGSSL_DISPATCH_TEST
+	movb	$1,BORINGSSL_function_hit(%rip)
+#endif
+	cmpq	$1,%rdx
+	jne	.Lctr32_bulk
+
+
+
+	movups	(%r8),%xmm2
+	movups	(%rdi),%xmm3
+	movl	240(%rcx),%edx
+	movups	(%rcx),%xmm0
+	movups	16(%rcx),%xmm1
+	leaq	32(%rcx),%rcx
+	xorps	%xmm0,%xmm2
+.Loop_enc1_5:
+.byte	102,15,56,220,209
+	decl	%edx
+	movups	(%rcx),%xmm1
+	leaq	16(%rcx),%rcx
+	jnz	.Loop_enc1_5
+.byte	102,15,56,221,209
+	pxor	%xmm0,%xmm0
+	pxor	%xmm1,%xmm1
+	xorps	%xmm3,%xmm2
+	pxor	%xmm3,%xmm3
+	movups	%xmm2,(%rsi)
+	xorps	%xmm2,%xmm2
+	jmp	.Lctr32_epilogue
+
+.align	16
+.Lctr32_bulk:
+	leaq	(%rsp),%r11
+.cfi_def_cfa_register	%r11
+	pushq	%rbp
+.cfi_offset	%rbp,-16
+	subq	$128,%rsp
+	andq	$-16,%rsp
+
+
+
+
+	movdqu	(%r8),%xmm2
+	movdqu	(%rcx),%xmm0
+	movl	12(%r8),%r8d
+	pxor	%xmm0,%xmm2
+	movl	12(%rcx),%ebp
+	movdqa	%xmm2,0(%rsp)
+	bswapl	%r8d
+	movdqa	%xmm2,%xmm3
+	movdqa	%xmm2,%xmm4
+	movdqa	%xmm2,%xmm5
+	movdqa	%xmm2,64(%rsp)
+	movdqa	%xmm2,80(%rsp)
+	movdqa	%xmm2,96(%rsp)
+	movq	%rdx,%r10
+	movdqa	%xmm2,112(%rsp)
+
+	leaq	1(%r8),%rax
+	leaq	2(%r8),%rdx
+	bswapl	%eax
+	bswapl	%edx
+	xorl	%ebp,%eax
+	xorl	%ebp,%edx
+.byte	102,15,58,34,216,3
+	leaq	3(%r8),%rax
+	movdqa	%xmm3,16(%rsp)
+.byte	102,15,58,34,226,3
+	bswapl	%eax
+	movq	%r10,%rdx
+	leaq	4(%r8),%r10
+	movdqa	%xmm4,32(%rsp)
+	xorl	%ebp,%eax
+	bswapl	%r10d
+.byte	102,15,58,34,232,3
+	xorl	%ebp,%r10d
+	movdqa	%xmm5,48(%rsp)
+	leaq	5(%r8),%r9
+	movl	%r10d,64+12(%rsp)
+	bswapl	%r9d
+	leaq	6(%r8),%r10
+	movl	240(%rcx),%eax
+	xorl	%ebp,%r9d
+	bswapl	%r10d
+	movl	%r9d,80+12(%rsp)
+	xorl	%ebp,%r10d
+	leaq	7(%r8),%r9
+	movl	%r10d,96+12(%rsp)
+	bswapl	%r9d
+	leaq	OPENSSL_ia32cap_P(%rip),%r10
+	movl	4(%r10),%r10d
+	xorl	%ebp,%r9d
+	andl	$71303168,%r10d
+	movl	%r9d,112+12(%rsp)
+
+	movups	16(%rcx),%xmm1
+
+	movdqa	64(%rsp),%xmm6
+	movdqa	80(%rsp),%xmm7
+
+	cmpq	$8,%rdx
+	jb	.Lctr32_tail
+
+	subq	$6,%rdx
+	cmpl	$4194304,%r10d
+	je	.Lctr32_6x
+
+	leaq	128(%rcx),%rcx
+	subq	$2,%rdx
+	jmp	.Lctr32_loop8
+
+.align	16
+.Lctr32_6x:
+	shll	$4,%eax
+	movl	$48,%r10d
+	bswapl	%ebp
+	leaq	32(%rcx,%rax,1),%rcx
+	subq	%rax,%r10
+	jmp	.Lctr32_loop6
+
+.align	16
+.Lctr32_loop6:
+	addl	$6,%r8d
+	movups	-48(%rcx,%r10,1),%xmm0
+.byte	102,15,56,220,209
+	movl	%r8d,%eax
+	xorl	%ebp,%eax
+.byte	102,15,56,220,217
+.byte	0x0f,0x38,0xf1,0x44,0x24,12
+	leal	1(%r8),%eax
+.byte	102,15,56,220,225
+	xorl	%ebp,%eax
+.byte	0x0f,0x38,0xf1,0x44,0x24,28
+.byte	102,15,56,220,233
+	leal	2(%r8),%eax
+	xorl	%ebp,%eax
+.byte	102,15,56,220,241
+.byte	0x0f,0x38,0xf1,0x44,0x24,44
+	leal	3(%r8),%eax
+.byte	102,15,56,220,249
+	movups	-32(%rcx,%r10,1),%xmm1
+	xorl	%ebp,%eax
+
+.byte	102,15,56,220,208
+.byte	0x0f,0x38,0xf1,0x44,0x24,60
+	leal	4(%r8),%eax
+.byte	102,15,56,220,216
+	xorl	%ebp,%eax
+.byte	0x0f,0x38,0xf1,0x44,0x24,76
+.byte	102,15,56,220,224
+	leal	5(%r8),%eax
+	xorl	%ebp,%eax
+.byte	102,15,56,220,232
+.byte	0x0f,0x38,0xf1,0x44,0x24,92
+	movq	%r10,%rax
+.byte	102,15,56,220,240
+.byte	102,15,56,220,248
+	movups	-16(%rcx,%r10,1),%xmm0
+
+	call	.Lenc_loop6
+
+	movdqu	(%rdi),%xmm8
+	movdqu	16(%rdi),%xmm9
+	movdqu	32(%rdi),%xmm10
+	movdqu	48(%rdi),%xmm11
+	movdqu	64(%rdi),%xmm12
+	movdqu	80(%rdi),%xmm13
+	leaq	96(%rdi),%rdi
+	movups	-64(%rcx,%r10,1),%xmm1
+	pxor	%xmm2,%xmm8
+	movaps	0(%rsp),%xmm2
+	pxor	%xmm3,%xmm9
+	movaps	16(%rsp),%xmm3
+	pxor	%xmm4,%xmm10
+	movaps	32(%rsp),%xmm4
+	pxor	%xmm5,%xmm11
+	movaps	48(%rsp),%xmm5
+	pxor	%xmm6,%xmm12
+	movaps	64(%rsp),%xmm6
+	pxor	%xmm7,%xmm13
+	movaps	80(%rsp),%xmm7
+	movdqu	%xmm8,(%rsi)
+	movdqu	%xmm9,16(%rsi)
+	movdqu	%xmm10,32(%rsi)
+	movdqu	%xmm11,48(%rsi)
+	movdqu	%xmm12,64(%rsi)
+	movdqu	%xmm13,80(%rsi)
+	leaq	96(%rsi),%rsi
+
+	subq	$6,%rdx
+	jnc	.Lctr32_loop6
+
+	addq	$6,%rdx
+	jz	.Lctr32_done
+
+	leal	-48(%r10),%eax
+	leaq	-80(%rcx,%r10,1),%rcx
+	negl	%eax
+	shrl	$4,%eax
+	jmp	.Lctr32_tail
+
+.align	32
+.Lctr32_loop8:
+	addl	$8,%r8d
+	movdqa	96(%rsp),%xmm8
+.byte	102,15,56,220,209
+	movl	%r8d,%r9d
+	movdqa	112(%rsp),%xmm9
+.byte	102,15,56,220,217
+	bswapl	%r9d
+	movups	32-128(%rcx),%xmm0
+.byte	102,15,56,220,225
+	xorl	%ebp,%r9d
+	nop
+.byte	102,15,56,220,233
+	movl	%r9d,0+12(%rsp)
+	leaq	1(%r8),%r9
+.byte	102,15,56,220,241
+.byte	102,15,56,220,249
+.byte	102,68,15,56,220,193
+.byte	102,68,15,56,220,201
+	movups	48-128(%rcx),%xmm1
+	bswapl	%r9d
+.byte	102,15,56,220,208
+.byte	102,15,56,220,216
+	xorl	%ebp,%r9d
+.byte	0x66,0x90
+.byte	102,15,56,220,224
+.byte	102,15,56,220,232
+	movl	%r9d,16+12(%rsp)
+	leaq	2(%r8),%r9
+.byte	102,15,56,220,240
+.byte	102,15,56,220,248
+.byte	102,68,15,56,220,192
+.byte	102,68,15,56,220,200
+	movups	64-128(%rcx),%xmm0
+	bswapl	%r9d
+.byte	102,15,56,220,209
+.byte	102,15,56,220,217
+	xorl	%ebp,%r9d
+.byte	0x66,0x90
+.byte	102,15,56,220,225
+.byte	102,15,56,220,233
+	movl	%r9d,32+12(%rsp)
+	leaq	3(%r8),%r9
+.byte	102,15,56,220,241
+.byte	102,15,56,220,249
+.byte	102,68,15,56,220,193
+.byte	102,68,15,56,220,201
+	movups	80-128(%rcx),%xmm1
+	bswapl	%r9d
+.byte	102,15,56,220,208
+.byte	102,15,56,220,216
+	xorl	%ebp,%r9d
+.byte	0x66,0x90
+.byte	102,15,56,220,224
+.byte	102,15,56,220,232
+	movl	%r9d,48+12(%rsp)
+	leaq	4(%r8),%r9
+.byte	102,15,56,220,240
+.byte	102,15,56,220,248
+.byte	102,68,15,56,220,192
+.byte	102,68,15,56,220,200
+	movups	96-128(%rcx),%xmm0
+	bswapl	%r9d
+.byte	102,15,56,220,209
+.byte	102,15,56,220,217
+	xorl	%ebp,%r9d
+.byte	0x66,0x90
+.byte	102,15,56,220,225
+.byte	102,15,56,220,233
+	movl	%r9d,64+12(%rsp)
+	leaq	5(%r8),%r9
+.byte	102,15,56,220,241
+.byte	102,15,56,220,249
+.byte	102,68,15,56,220,193
+.byte	102,68,15,56,220,201
+	movups	112-128(%rcx),%xmm1
+	bswapl	%r9d
+.byte	102,15,56,220,208
+.byte	102,15,56,220,216
+	xorl	%ebp,%r9d
+.byte	0x66,0x90
+.byte	102,15,56,220,224
+.byte	102,15,56,220,232
+	movl	%r9d,80+12(%rsp)
+	leaq	6(%r8),%r9
+.byte	102,15,56,220,240
+.byte	102,15,56,220,248
+.byte	102,68,15,56,220,192
+.byte	102,68,15,56,220,200
+	movups	128-128(%rcx),%xmm0
+	bswapl	%r9d
+.byte	102,15,56,220,209
+.byte	102,15,56,220,217
+	xorl	%ebp,%r9d
+.byte	0x66,0x90
+.byte	102,15,56,220,225
+.byte	102,15,56,220,233
+	movl	%r9d,96+12(%rsp)
+	leaq	7(%r8),%r9
+.byte	102,15,56,220,241
+.byte	102,15,56,220,249
+.byte	102,68,15,56,220,193
+.byte	102,68,15,56,220,201
+	movups	144-128(%rcx),%xmm1
+	bswapl	%r9d
+.byte	102,15,56,220,208
+.byte	102,15,56,220,216
+.byte	102,15,56,220,224
+	xorl	%ebp,%r9d
+	movdqu	0(%rdi),%xmm10
+.byte	102,15,56,220,232
+	movl	%r9d,112+12(%rsp)
+	cmpl	$11,%eax
+.byte	102,15,56,220,240
+.byte	102,15,56,220,248
+.byte	102,68,15,56,220,192
+.byte	102,68,15,56,220,200
+	movups	160-128(%rcx),%xmm0
+
+	jb	.Lctr32_enc_done
+
+.byte	102,15,56,220,209
+.byte	102,15,56,220,217
+.byte	102,15,56,220,225
+.byte	102,15,56,220,233
+.byte	102,15,56,220,241
+.byte	102,15,56,220,249
+.byte	102,68,15,56,220,193
+.byte	102,68,15,56,220,201
+	movups	176-128(%rcx),%xmm1
+
+.byte	102,15,56,220,208
+.byte	102,15,56,220,216
+.byte	102,15,56,220,224
+.byte	102,15,56,220,232
+.byte	102,15,56,220,240
+.byte	102,15,56,220,248
+.byte	102,68,15,56,220,192
+.byte	102,68,15,56,220,200
+	movups	192-128(%rcx),%xmm0
+	je	.Lctr32_enc_done
+
+.byte	102,15,56,220,209
+.byte	102,15,56,220,217
+.byte	102,15,56,220,225
+.byte	102,15,56,220,233
+.byte	102,15,56,220,241
+.byte	102,15,56,220,249
+.byte	102,68,15,56,220,193
+.byte	102,68,15,56,220,201
+	movups	208-128(%rcx),%xmm1
+
+.byte	102,15,56,220,208
+.byte	102,15,56,220,216
+.byte	102,15,56,220,224
+.byte	102,15,56,220,232
+.byte	102,15,56,220,240
+.byte	102,15,56,220,248
+.byte	102,68,15,56,220,192
+.byte	102,68,15,56,220,200
+	movups	224-128(%rcx),%xmm0
+	jmp	.Lctr32_enc_done
+
+.align	16
+.Lctr32_enc_done:
+	movdqu	16(%rdi),%xmm11
+	pxor	%xmm0,%xmm10
+	movdqu	32(%rdi),%xmm12
+	pxor	%xmm0,%xmm11
+	movdqu	48(%rdi),%xmm13
+	pxor	%xmm0,%xmm12
+	movdqu	64(%rdi),%xmm14
+	pxor	%xmm0,%xmm13
+	movdqu	80(%rdi),%xmm15
+	pxor	%xmm0,%xmm14
+	pxor	%xmm0,%xmm15
+.byte	102,15,56,220,209
+.byte	102,15,56,220,217
+.byte	102,15,56,220,225
+.byte	102,15,56,220,233
+.byte	102,15,56,220,241
+.byte	102,15,56,220,249
+.byte	102,68,15,56,220,193
+.byte	102,68,15,56,220,201
+	movdqu	96(%rdi),%xmm1
+	leaq	128(%rdi),%rdi
+
+.byte	102,65,15,56,221,210
+	pxor	%xmm0,%xmm1
+	movdqu	112-128(%rdi),%xmm10
+.byte	102,65,15,56,221,219
+	pxor	%xmm0,%xmm10
+	movdqa	0(%rsp),%xmm11
+.byte	102,65,15,56,221,228
+.byte	102,65,15,56,221,237
+	movdqa	16(%rsp),%xmm12
+	movdqa	32(%rsp),%xmm13
+.byte	102,65,15,56,221,246
+.byte	102,65,15,56,221,255
+	movdqa	48(%rsp),%xmm14
+	movdqa	64(%rsp),%xmm15
+.byte	102,68,15,56,221,193
+	movdqa	80(%rsp),%xmm0
+	movups	16-128(%rcx),%xmm1
+.byte	102,69,15,56,221,202
+
+	movups	%xmm2,(%rsi)
+	movdqa	%xmm11,%xmm2
+	movups	%xmm3,16(%rsi)
+	movdqa	%xmm12,%xmm3
+	movups	%xmm4,32(%rsi)
+	movdqa	%xmm13,%xmm4
+	movups	%xmm5,48(%rsi)
+	movdqa	%xmm14,%xmm5
+	movups	%xmm6,64(%rsi)
+	movdqa	%xmm15,%xmm6
+	movups	%xmm7,80(%rsi)
+	movdqa	%xmm0,%xmm7
+	movups	%xmm8,96(%rsi)
+	movups	%xmm9,112(%rsi)
+	leaq	128(%rsi),%rsi
+
+	subq	$8,%rdx
+	jnc	.Lctr32_loop8
+
+	addq	$8,%rdx
+	jz	.Lctr32_done
+	leaq	-128(%rcx),%rcx
+
+.Lctr32_tail:
+
+
+	leaq	16(%rcx),%rcx
+	cmpq	$4,%rdx
+	jb	.Lctr32_loop3
+	je	.Lctr32_loop4
+
+
+	shll	$4,%eax
+	movdqa	96(%rsp),%xmm8
+	pxor	%xmm9,%xmm9
+
+	movups	16(%rcx),%xmm0
+.byte	102,15,56,220,209
+.byte	102,15,56,220,217
+	leaq	32-16(%rcx,%rax,1),%rcx
+	negq	%rax
+.byte	102,15,56,220,225
+	addq	$16,%rax
+	movups	(%rdi),%xmm10
+.byte	102,15,56,220,233
+.byte	102,15,56,220,241
+	movups	16(%rdi),%xmm11
+	movups	32(%rdi),%xmm12
+.byte	102,15,56,220,249
+.byte	102,68,15,56,220,193
+
+	call	.Lenc_loop8_enter
+
+	movdqu	48(%rdi),%xmm13
+	pxor	%xmm10,%xmm2
+	movdqu	64(%rdi),%xmm10
+	pxor	%xmm11,%xmm3
+	movdqu	%xmm2,(%rsi)
+	pxor	%xmm12,%xmm4
+	movdqu	%xmm3,16(%rsi)
+	pxor	%xmm13,%xmm5
+	movdqu	%xmm4,32(%rsi)
+	pxor	%xmm10,%xmm6
+	movdqu	%xmm5,48(%rsi)
+	movdqu	%xmm6,64(%rsi)
+	cmpq	$6,%rdx
+	jb	.Lctr32_done
+
+	movups	80(%rdi),%xmm11
+	xorps	%xmm11,%xmm7
+	movups	%xmm7,80(%rsi)
+	je	.Lctr32_done
+
+	movups	96(%rdi),%xmm12
+	xorps	%xmm12,%xmm8
+	movups	%xmm8,96(%rsi)
+	jmp	.Lctr32_done
+
+.align	32
+.Lctr32_loop4:
+.byte	102,15,56,220,209
+	leaq	16(%rcx),%rcx
+	decl	%eax
+.byte	102,15,56,220,217
+.byte	102,15,56,220,225
+.byte	102,15,56,220,233
+	movups	(%rcx),%xmm1
+	jnz	.Lctr32_loop4
+.byte	102,15,56,221,209
+.byte	102,15,56,221,217
+	movups	(%rdi),%xmm10
+	movups	16(%rdi),%xmm11
+.byte	102,15,56,221,225
+.byte	102,15,56,221,233
+	movups	32(%rdi),%xmm12
+	movups	48(%rdi),%xmm13
+
+	xorps	%xmm10,%xmm2
+	movups	%xmm2,(%rsi)
+	xorps	%xmm11,%xmm3
+	movups	%xmm3,16(%rsi)
+	pxor	%xmm12,%xmm4
+	movdqu	%xmm4,32(%rsi)
+	pxor	%xmm13,%xmm5
+	movdqu	%xmm5,48(%rsi)
+	jmp	.Lctr32_done
+
+.align	32
+.Lctr32_loop3:
+.byte	102,15,56,220,209
+	leaq	16(%rcx),%rcx
+	decl	%eax
+.byte	102,15,56,220,217
+.byte	102,15,56,220,225
+	movups	(%rcx),%xmm1
+	jnz	.Lctr32_loop3
+.byte	102,15,56,221,209
+.byte	102,15,56,221,217
+.byte	102,15,56,221,225
+
+	movups	(%rdi),%xmm10
+	xorps	%xmm10,%xmm2
+	movups	%xmm2,(%rsi)
+	cmpq	$2,%rdx
+	jb	.Lctr32_done
+
+	movups	16(%rdi),%xmm11
+	xorps	%xmm11,%xmm3
+	movups	%xmm3,16(%rsi)
+	je	.Lctr32_done
+
+	movups	32(%rdi),%xmm12
+	xorps	%xmm12,%xmm4
+	movups	%xmm4,32(%rsi)
+
+.Lctr32_done:
+	xorps	%xmm0,%xmm0
+	xorl	%ebp,%ebp
+	pxor	%xmm1,%xmm1
+	pxor	%xmm2,%xmm2
+	pxor	%xmm3,%xmm3
+	pxor	%xmm4,%xmm4
+	pxor	%xmm5,%xmm5
+	pxor	%xmm6,%xmm6
+	pxor	%xmm7,%xmm7
+	movaps	%xmm0,0(%rsp)
+	pxor	%xmm8,%xmm8
+	movaps	%xmm0,16(%rsp)
+	pxor	%xmm9,%xmm9
+	movaps	%xmm0,32(%rsp)
+	pxor	%xmm10,%xmm10
+	movaps	%xmm0,48(%rsp)
+	pxor	%xmm11,%xmm11
+	movaps	%xmm0,64(%rsp)
+	pxor	%xmm12,%xmm12
+	movaps	%xmm0,80(%rsp)
+	pxor	%xmm13,%xmm13
+	movaps	%xmm0,96(%rsp)
+	pxor	%xmm14,%xmm14
+	movaps	%xmm0,112(%rsp)
+	pxor	%xmm15,%xmm15
+	movq	-8(%r11),%rbp
+.cfi_restore	%rbp
+	leaq	(%r11),%rsp
+.cfi_def_cfa_register	%rsp
+.Lctr32_epilogue:
+	.byte	0xf3,0xc3
+.cfi_endproc	
+.size	aes_hw_ctr32_encrypt_blocks,.-aes_hw_ctr32_encrypt_blocks
+.globl	aes_hw_cbc_encrypt
+.hidden aes_hw_cbc_encrypt
+.type	aes_hw_cbc_encrypt,@function
+.align	16
+aes_hw_cbc_encrypt:
+.cfi_startproc	
+	testq	%rdx,%rdx
+	jz	.Lcbc_ret
+
+	movl	240(%rcx),%r10d
+	movq	%rcx,%r11
+	testl	%r9d,%r9d
+	jz	.Lcbc_decrypt
+
+	movups	(%r8),%xmm2
+	movl	%r10d,%eax
+	cmpq	$16,%rdx
+	jb	.Lcbc_enc_tail
+	subq	$16,%rdx
+	jmp	.Lcbc_enc_loop
+.align	16
+.Lcbc_enc_loop:
+	movups	(%rdi),%xmm3
+	leaq	16(%rdi),%rdi
+
+	movups	(%rcx),%xmm0
+	movups	16(%rcx),%xmm1
+	xorps	%xmm0,%xmm3
+	leaq	32(%rcx),%rcx
+	xorps	%xmm3,%xmm2
+.Loop_enc1_6:
+.byte	102,15,56,220,209
+	decl	%eax
+	movups	(%rcx),%xmm1
+	leaq	16(%rcx),%rcx
+	jnz	.Loop_enc1_6
+.byte	102,15,56,221,209
+	movl	%r10d,%eax
+	movq	%r11,%rcx
+	movups	%xmm2,0(%rsi)
+	leaq	16(%rsi),%rsi
+	subq	$16,%rdx
+	jnc	.Lcbc_enc_loop
+	addq	$16,%rdx
+	jnz	.Lcbc_enc_tail
+	pxor	%xmm0,%xmm0
+	pxor	%xmm1,%xmm1
+	movups	%xmm2,(%r8)
+	pxor	%xmm2,%xmm2
+	pxor	%xmm3,%xmm3
+	jmp	.Lcbc_ret
+
+.Lcbc_enc_tail:
+	movq	%rdx,%rcx
+	xchgq	%rdi,%rsi
+.long	0x9066A4F3
+	movl	$16,%ecx
+	subq	%rdx,%rcx
+	xorl	%eax,%eax
+.long	0x9066AAF3
+	leaq	-16(%rdi),%rdi
+	movl	%r10d,%eax
+	movq	%rdi,%rsi
+	movq	%r11,%rcx
+	xorq	%rdx,%rdx
+	jmp	.Lcbc_enc_loop
+
+.align	16
+.Lcbc_decrypt:
+	cmpq	$16,%rdx
+	jne	.Lcbc_decrypt_bulk
+
+
+
+	movdqu	(%rdi),%xmm2
+	movdqu	(%r8),%xmm3
+	movdqa	%xmm2,%xmm4
+	movups	(%rcx),%xmm0
+	movups	16(%rcx),%xmm1
+	leaq	32(%rcx),%rcx
+	xorps	%xmm0,%xmm2
+.Loop_dec1_7:
+.byte	102,15,56,222,209
+	decl	%r10d
+	movups	(%rcx),%xmm1
+	leaq	16(%rcx),%rcx
+	jnz	.Loop_dec1_7
+.byte	102,15,56,223,209
+	pxor	%xmm0,%xmm0
+	pxor	%xmm1,%xmm1
+	movdqu	%xmm4,(%r8)
+	xorps	%xmm3,%xmm2
+	pxor	%xmm3,%xmm3
+	movups	%xmm2,(%rsi)
+	pxor	%xmm2,%xmm2
+	jmp	.Lcbc_ret
+.align	16
+.Lcbc_decrypt_bulk:
+	leaq	(%rsp),%r11
+.cfi_def_cfa_register	%r11
+	pushq	%rbp
+.cfi_offset	%rbp,-16
+	subq	$16,%rsp
+	andq	$-16,%rsp
+	movq	%rcx,%rbp
+	movups	(%r8),%xmm10
+	movl	%r10d,%eax
+	cmpq	$0x50,%rdx
+	jbe	.Lcbc_dec_tail
+
+	movups	(%rcx),%xmm0
+	movdqu	0(%rdi),%xmm2
+	movdqu	16(%rdi),%xmm3
+	movdqa	%xmm2,%xmm11
+	movdqu	32(%rdi),%xmm4
+	movdqa	%xmm3,%xmm12
+	movdqu	48(%rdi),%xmm5
+	movdqa	%xmm4,%xmm13
+	movdqu	64(%rdi),%xmm6
+	movdqa	%xmm5,%xmm14
+	movdqu	80(%rdi),%xmm7
+	movdqa	%xmm6,%xmm15
+	leaq	OPENSSL_ia32cap_P(%rip),%r9
+	movl	4(%r9),%r9d
+	cmpq	$0x70,%rdx
+	jbe	.Lcbc_dec_six_or_seven
+
+	andl	$71303168,%r9d
+	subq	$0x50,%rdx
+	cmpl	$4194304,%r9d
+	je	.Lcbc_dec_loop6_enter
+	subq	$0x20,%rdx
+	leaq	112(%rcx),%rcx
+	jmp	.Lcbc_dec_loop8_enter
+.align	16
+.Lcbc_dec_loop8:
+	movups	%xmm9,(%rsi)
+	leaq	16(%rsi),%rsi
+.Lcbc_dec_loop8_enter:
+	movdqu	96(%rdi),%xmm8
+	pxor	%xmm0,%xmm2
+	movdqu	112(%rdi),%xmm9
+	pxor	%xmm0,%xmm3
+	movups	16-112(%rcx),%xmm1
+	pxor	%xmm0,%xmm4
+	movq	$-1,%rbp
+	cmpq	$0x70,%rdx
+	pxor	%xmm0,%xmm5
+	pxor	%xmm0,%xmm6
+	pxor	%xmm0,%xmm7
+	pxor	%xmm0,%xmm8
+
+.byte	102,15,56,222,209
+	pxor	%xmm0,%xmm9
+	movups	32-112(%rcx),%xmm0
+.byte	102,15,56,222,217
+.byte	102,15,56,222,225
+.byte	102,15,56,222,233
+.byte	102,15,56,222,241
+.byte	102,15,56,222,249
+.byte	102,68,15,56,222,193
+	adcq	$0,%rbp
+	andq	$128,%rbp
+.byte	102,68,15,56,222,201
+	addq	%rdi,%rbp
+	movups	48-112(%rcx),%xmm1
+.byte	102,15,56,222,208
+.byte	102,15,56,222,216
+.byte	102,15,56,222,224
+.byte	102,15,56,222,232
+.byte	102,15,56,222,240
+.byte	102,15,56,222,248
+.byte	102,68,15,56,222,192
+.byte	102,68,15,56,222,200
+	movups	64-112(%rcx),%xmm0
+	nop
+.byte	102,15,56,222,209
+.byte	102,15,56,222,217
+.byte	102,15,56,222,225
+.byte	102,15,56,222,233
+.byte	102,15,56,222,241
+.byte	102,15,56,222,249
+.byte	102,68,15,56,222,193
+.byte	102,68,15,56,222,201
+	movups	80-112(%rcx),%xmm1
+	nop
+.byte	102,15,56,222,208
+.byte	102,15,56,222,216
+.byte	102,15,56,222,224
+.byte	102,15,56,222,232
+.byte	102,15,56,222,240
+.byte	102,15,56,222,248
+.byte	102,68,15,56,222,192
+.byte	102,68,15,56,222,200
+	movups	96-112(%rcx),%xmm0
+	nop
+.byte	102,15,56,222,209
+.byte	102,15,56,222,217
+.byte	102,15,56,222,225
+.byte	102,15,56,222,233
+.byte	102,15,56,222,241
+.byte	102,15,56,222,249
+.byte	102,68,15,56,222,193
+.byte	102,68,15,56,222,201
+	movups	112-112(%rcx),%xmm1
+	nop
+.byte	102,15,56,222,208
+.byte	102,15,56,222,216
+.byte	102,15,56,222,224
+.byte	102,15,56,222,232
+.byte	102,15,56,222,240
+.byte	102,15,56,222,248
+.byte	102,68,15,56,222,192
+.byte	102,68,15,56,222,200
+	movups	128-112(%rcx),%xmm0
+	nop
+.byte	102,15,56,222,209
+.byte	102,15,56,222,217
+.byte	102,15,56,222,225
+.byte	102,15,56,222,233
+.byte	102,15,56,222,241
+.byte	102,15,56,222,249
+.byte	102,68,15,56,222,193
+.byte	102,68,15,56,222,201
+	movups	144-112(%rcx),%xmm1
+	cmpl	$11,%eax
+.byte	102,15,56,222,208
+.byte	102,15,56,222,216
+.byte	102,15,56,222,224
+.byte	102,15,56,222,232
+.byte	102,15,56,222,240
+.byte	102,15,56,222,248
+.byte	102,68,15,56,222,192
+.byte	102,68,15,56,222,200
+	movups	160-112(%rcx),%xmm0
+	jb	.Lcbc_dec_done
+.byte	102,15,56,222,209
+.byte	102,15,56,222,217
+.byte	102,15,56,222,225
+.byte	102,15,56,222,233
+.byte	102,15,56,222,241
+.byte	102,15,56,222,249
+.byte	102,68,15,56,222,193
+.byte	102,68,15,56,222,201
+	movups	176-112(%rcx),%xmm1
+	nop
+.byte	102,15,56,222,208
+.byte	102,15,56,222,216
+.byte	102,15,56,222,224
+.byte	102,15,56,222,232
+.byte	102,15,56,222,240
+.byte	102,15,56,222,248
+.byte	102,68,15,56,222,192
+.byte	102,68,15,56,222,200
+	movups	192-112(%rcx),%xmm0
+	je	.Lcbc_dec_done
+.byte	102,15,56,222,209
+.byte	102,15,56,222,217
+.byte	102,15,56,222,225
+.byte	102,15,56,222,233
+.byte	102,15,56,222,241
+.byte	102,15,56,222,249
+.byte	102,68,15,56,222,193
+.byte	102,68,15,56,222,201
+	movups	208-112(%rcx),%xmm1
+	nop
+.byte	102,15,56,222,208
+.byte	102,15,56,222,216
+.byte	102,15,56,222,224
+.byte	102,15,56,222,232
+.byte	102,15,56,222,240
+.byte	102,15,56,222,248
+.byte	102,68,15,56,222,192
+.byte	102,68,15,56,222,200
+	movups	224-112(%rcx),%xmm0
+	jmp	.Lcbc_dec_done
+.align	16
+.Lcbc_dec_done:
+.byte	102,15,56,222,209
+.byte	102,15,56,222,217
+	pxor	%xmm0,%xmm10
+	pxor	%xmm0,%xmm11
+.byte	102,15,56,222,225
+.byte	102,15,56,222,233
+	pxor	%xmm0,%xmm12
+	pxor	%xmm0,%xmm13
+.byte	102,15,56,222,241
+.byte	102,15,56,222,249
+	pxor	%xmm0,%xmm14
+	pxor	%xmm0,%xmm15
+.byte	102,68,15,56,222,193
+.byte	102,68,15,56,222,201
+	movdqu	80(%rdi),%xmm1
+
+.byte	102,65,15,56,223,210
+	movdqu	96(%rdi),%xmm10
+	pxor	%xmm0,%xmm1
+.byte	102,65,15,56,223,219
+	pxor	%xmm0,%xmm10
+	movdqu	112(%rdi),%xmm0
+.byte	102,65,15,56,223,228
+	leaq	128(%rdi),%rdi
+	movdqu	0(%rbp),%xmm11
+.byte	102,65,15,56,223,237
+.byte	102,65,15,56,223,246
+	movdqu	16(%rbp),%xmm12
+	movdqu	32(%rbp),%xmm13
+.byte	102,65,15,56,223,255
+.byte	102,68,15,56,223,193
+	movdqu	48(%rbp),%xmm14
+	movdqu	64(%rbp),%xmm15
+.byte	102,69,15,56,223,202
+	movdqa	%xmm0,%xmm10
+	movdqu	80(%rbp),%xmm1
+	movups	-112(%rcx),%xmm0
+
+	movups	%xmm2,(%rsi)
+	movdqa	%xmm11,%xmm2
+	movups	%xmm3,16(%rsi)
+	movdqa	%xmm12,%xmm3
+	movups	%xmm4,32(%rsi)
+	movdqa	%xmm13,%xmm4
+	movups	%xmm5,48(%rsi)
+	movdqa	%xmm14,%xmm5
+	movups	%xmm6,64(%rsi)
+	movdqa	%xmm15,%xmm6
+	movups	%xmm7,80(%rsi)
+	movdqa	%xmm1,%xmm7
+	movups	%xmm8,96(%rsi)
+	leaq	112(%rsi),%rsi
+
+	subq	$0x80,%rdx
+	ja	.Lcbc_dec_loop8
+
+	movaps	%xmm9,%xmm2
+	leaq	-112(%rcx),%rcx
+	addq	$0x70,%rdx
+	jle	.Lcbc_dec_clear_tail_collected
+	movups	%xmm9,(%rsi)
+	leaq	16(%rsi),%rsi
+	cmpq	$0x50,%rdx
+	jbe	.Lcbc_dec_tail
+
+	movaps	%xmm11,%xmm2
+.Lcbc_dec_six_or_seven:
+	cmpq	$0x60,%rdx
+	ja	.Lcbc_dec_seven
+
+	movaps	%xmm7,%xmm8
+	call	_aesni_decrypt6
+	pxor	%xmm10,%xmm2
+	movaps	%xmm8,%xmm10
+	pxor	%xmm11,%xmm3
+	movdqu	%xmm2,(%rsi)
+	pxor	%xmm12,%xmm4
+	movdqu	%xmm3,16(%rsi)
+	pxor	%xmm3,%xmm3
+	pxor	%xmm13,%xmm5
+	movdqu	%xmm4,32(%rsi)
+	pxor	%xmm4,%xmm4
+	pxor	%xmm14,%xmm6
+	movdqu	%xmm5,48(%rsi)
+	pxor	%xmm5,%xmm5
+	pxor	%xmm15,%xmm7
+	movdqu	%xmm6,64(%rsi)
+	pxor	%xmm6,%xmm6
+	leaq	80(%rsi),%rsi
+	movdqa	%xmm7,%xmm2
+	pxor	%xmm7,%xmm7
+	jmp	.Lcbc_dec_tail_collected
+
+.align	16
+.Lcbc_dec_seven:
+	movups	96(%rdi),%xmm8
+	xorps	%xmm9,%xmm9
+	call	_aesni_decrypt8
+	movups	80(%rdi),%xmm9
+	pxor	%xmm10,%xmm2
+	movups	96(%rdi),%xmm10
+	pxor	%xmm11,%xmm3
+	movdqu	%xmm2,(%rsi)
+	pxor	%xmm12,%xmm4
+	movdqu	%xmm3,16(%rsi)
+	pxor	%xmm3,%xmm3
+	pxor	%xmm13,%xmm5
+	movdqu	%xmm4,32(%rsi)
+	pxor	%xmm4,%xmm4
+	pxor	%xmm14,%xmm6
+	movdqu	%xmm5,48(%rsi)
+	pxor	%xmm5,%xmm5
+	pxor	%xmm15,%xmm7
+	movdqu	%xmm6,64(%rsi)
+	pxor	%xmm6,%xmm6
+	pxor	%xmm9,%xmm8
+	movdqu	%xmm7,80(%rsi)
+	pxor	%xmm7,%xmm7
+	leaq	96(%rsi),%rsi
+	movdqa	%xmm8,%xmm2
+	pxor	%xmm8,%xmm8
+	pxor	%xmm9,%xmm9
+	jmp	.Lcbc_dec_tail_collected
+
+.align	16
+.Lcbc_dec_loop6:
+	movups	%xmm7,(%rsi)
+	leaq	16(%rsi),%rsi
+	movdqu	0(%rdi),%xmm2
+	movdqu	16(%rdi),%xmm3
+	movdqa	%xmm2,%xmm11
+	movdqu	32(%rdi),%xmm4
+	movdqa	%xmm3,%xmm12
+	movdqu	48(%rdi),%xmm5
+	movdqa	%xmm4,%xmm13
+	movdqu	64(%rdi),%xmm6
+	movdqa	%xmm5,%xmm14
+	movdqu	80(%rdi),%xmm7
+	movdqa	%xmm6,%xmm15
+.Lcbc_dec_loop6_enter:
+	leaq	96(%rdi),%rdi
+	movdqa	%xmm7,%xmm8
+
+	call	_aesni_decrypt6
+
+	pxor	%xmm10,%xmm2
+	movdqa	%xmm8,%xmm10
+	pxor	%xmm11,%xmm3
+	movdqu	%xmm2,(%rsi)
+	pxor	%xmm12,%xmm4
+	movdqu	%xmm3,16(%rsi)
+	pxor	%xmm13,%xmm5
+	movdqu	%xmm4,32(%rsi)
+	pxor	%xmm14,%xmm6
+	movq	%rbp,%rcx
+	movdqu	%xmm5,48(%rsi)
+	pxor	%xmm15,%xmm7
+	movl	%r10d,%eax
+	movdqu	%xmm6,64(%rsi)
+	leaq	80(%rsi),%rsi
+	subq	$0x60,%rdx
+	ja	.Lcbc_dec_loop6
+
+	movdqa	%xmm7,%xmm2
+	addq	$0x50,%rdx
+	jle	.Lcbc_dec_clear_tail_collected
+	movups	%xmm7,(%rsi)
+	leaq	16(%rsi),%rsi
+
+.Lcbc_dec_tail:
+	movups	(%rdi),%xmm2
+	subq	$0x10,%rdx
+	jbe	.Lcbc_dec_one
+
+	movups	16(%rdi),%xmm3
+	movaps	%xmm2,%xmm11
+	subq	$0x10,%rdx
+	jbe	.Lcbc_dec_two
+
+	movups	32(%rdi),%xmm4
+	movaps	%xmm3,%xmm12
+	subq	$0x10,%rdx
+	jbe	.Lcbc_dec_three
+
+	movups	48(%rdi),%xmm5
+	movaps	%xmm4,%xmm13
+	subq	$0x10,%rdx
+	jbe	.Lcbc_dec_four
+
+	movups	64(%rdi),%xmm6
+	movaps	%xmm5,%xmm14
+	movaps	%xmm6,%xmm15
+	xorps	%xmm7,%xmm7
+	call	_aesni_decrypt6
+	pxor	%xmm10,%xmm2
+	movaps	%xmm15,%xmm10
+	pxor	%xmm11,%xmm3
+	movdqu	%xmm2,(%rsi)
+	pxor	%xmm12,%xmm4
+	movdqu	%xmm3,16(%rsi)
+	pxor	%xmm3,%xmm3
+	pxor	%xmm13,%xmm5
+	movdqu	%xmm4,32(%rsi)
+	pxor	%xmm4,%xmm4
+	pxor	%xmm14,%xmm6
+	movdqu	%xmm5,48(%rsi)
+	pxor	%xmm5,%xmm5
+	leaq	64(%rsi),%rsi
+	movdqa	%xmm6,%xmm2
+	pxor	%xmm6,%xmm6
+	pxor	%xmm7,%xmm7
+	subq	$0x10,%rdx
+	jmp	.Lcbc_dec_tail_collected
+
+.align	16
+.Lcbc_dec_one:
+	movaps	%xmm2,%xmm11
+	movups	(%rcx),%xmm0
+	movups	16(%rcx),%xmm1
+	leaq	32(%rcx),%rcx
+	xorps	%xmm0,%xmm2
+.Loop_dec1_8:
+.byte	102,15,56,222,209
+	decl	%eax
+	movups	(%rcx),%xmm1
+	leaq	16(%rcx),%rcx
+	jnz	.Loop_dec1_8
+.byte	102,15,56,223,209
+	xorps	%xmm10,%xmm2
+	movaps	%xmm11,%xmm10
+	jmp	.Lcbc_dec_tail_collected
+.align	16
+.Lcbc_dec_two:
+	movaps	%xmm3,%xmm12
+	call	_aesni_decrypt2
+	pxor	%xmm10,%xmm2
+	movaps	%xmm12,%xmm10
+	pxor	%xmm11,%xmm3
+	movdqu	%xmm2,(%rsi)
+	movdqa	%xmm3,%xmm2
+	pxor	%xmm3,%xmm3
+	leaq	16(%rsi),%rsi
+	jmp	.Lcbc_dec_tail_collected
+.align	16
+.Lcbc_dec_three:
+	movaps	%xmm4,%xmm13
+	call	_aesni_decrypt3
+	pxor	%xmm10,%xmm2
+	movaps	%xmm13,%xmm10
+	pxor	%xmm11,%xmm3
+	movdqu	%xmm2,(%rsi)
+	pxor	%xmm12,%xmm4
+	movdqu	%xmm3,16(%rsi)
+	pxor	%xmm3,%xmm3
+	movdqa	%xmm4,%xmm2
+	pxor	%xmm4,%xmm4
+	leaq	32(%rsi),%rsi
+	jmp	.Lcbc_dec_tail_collected
+.align	16
+.Lcbc_dec_four:
+	movaps	%xmm5,%xmm14
+	call	_aesni_decrypt4
+	pxor	%xmm10,%xmm2
+	movaps	%xmm14,%xmm10
+	pxor	%xmm11,%xmm3
+	movdqu	%xmm2,(%rsi)
+	pxor	%xmm12,%xmm4
+	movdqu	%xmm3,16(%rsi)
+	pxor	%xmm3,%xmm3
+	pxor	%xmm13,%xmm5
+	movdqu	%xmm4,32(%rsi)
+	pxor	%xmm4,%xmm4
+	movdqa	%xmm5,%xmm2
+	pxor	%xmm5,%xmm5
+	leaq	48(%rsi),%rsi
+	jmp	.Lcbc_dec_tail_collected
+
+.align	16
+.Lcbc_dec_clear_tail_collected:
+	pxor	%xmm3,%xmm3
+	pxor	%xmm4,%xmm4
+	pxor	%xmm5,%xmm5
+	pxor	%xmm6,%xmm6
+	pxor	%xmm7,%xmm7
+	pxor	%xmm8,%xmm8
+	pxor	%xmm9,%xmm9
+.Lcbc_dec_tail_collected:
+	movups	%xmm10,(%r8)
+	andq	$15,%rdx
+	jnz	.Lcbc_dec_tail_partial
+	movups	%xmm2,(%rsi)
+	pxor	%xmm2,%xmm2
+	jmp	.Lcbc_dec_ret
+.align	16
+.Lcbc_dec_tail_partial:
+	movaps	%xmm2,(%rsp)
+	pxor	%xmm2,%xmm2
+	movq	$16,%rcx
+	movq	%rsi,%rdi
+	subq	%rdx,%rcx
+	leaq	(%rsp),%rsi
+.long	0x9066A4F3
+	movdqa	%xmm2,(%rsp)
+
+.Lcbc_dec_ret:
+	xorps	%xmm0,%xmm0
+	pxor	%xmm1,%xmm1
+	movq	-8(%r11),%rbp
+.cfi_restore	%rbp
+	leaq	(%r11),%rsp
+.cfi_def_cfa_register	%rsp
+.Lcbc_ret:
+	.byte	0xf3,0xc3
+.cfi_endproc	
+.size	aes_hw_cbc_encrypt,.-aes_hw_cbc_encrypt
+.globl	aes_hw_set_decrypt_key
+.hidden aes_hw_set_decrypt_key
+.type	aes_hw_set_decrypt_key,@function
+.align	16
+aes_hw_set_decrypt_key:
+.cfi_startproc	
+.byte	0x48,0x83,0xEC,0x08
+.cfi_adjust_cfa_offset	8
+	call	__aesni_set_encrypt_key
+	shll	$4,%esi
+	testl	%eax,%eax
+	jnz	.Ldec_key_ret
+	leaq	16(%rdx,%rsi,1),%rdi
+
+	movups	(%rdx),%xmm0
+	movups	(%rdi),%xmm1
+	movups	%xmm0,(%rdi)
+	movups	%xmm1,(%rdx)
+	leaq	16(%rdx),%rdx
+	leaq	-16(%rdi),%rdi
+
+.Ldec_key_inverse:
+	movups	(%rdx),%xmm0
+	movups	(%rdi),%xmm1
+.byte	102,15,56,219,192
+.byte	102,15,56,219,201
+	leaq	16(%rdx),%rdx
+	leaq	-16(%rdi),%rdi
+	movups	%xmm0,16(%rdi)
+	movups	%xmm1,-16(%rdx)
+	cmpq	%rdx,%rdi
+	ja	.Ldec_key_inverse
+
+	movups	(%rdx),%xmm0
+.byte	102,15,56,219,192
+	pxor	%xmm1,%xmm1
+	movups	%xmm0,(%rdi)
+	pxor	%xmm0,%xmm0
+.Ldec_key_ret:
+	addq	$8,%rsp
+.cfi_adjust_cfa_offset	-8
+	.byte	0xf3,0xc3
+.cfi_endproc	
+.LSEH_end_set_decrypt_key:
+.size	aes_hw_set_decrypt_key,.-aes_hw_set_decrypt_key
+.globl	aes_hw_set_encrypt_key
+.hidden aes_hw_set_encrypt_key
+.type	aes_hw_set_encrypt_key,@function
+.align	16
+aes_hw_set_encrypt_key:
+__aesni_set_encrypt_key:
+.cfi_startproc	
+#ifdef BORINGSSL_DISPATCH_TEST
+	movb	$1,BORINGSSL_function_hit+3(%rip)
+#endif
+.byte	0x48,0x83,0xEC,0x08
+.cfi_adjust_cfa_offset	8
+	movq	$-1,%rax
+	testq	%rdi,%rdi
+	jz	.Lenc_key_ret
+	testq	%rdx,%rdx
+	jz	.Lenc_key_ret
+
+	movups	(%rdi),%xmm0
+	xorps	%xmm4,%xmm4
+	leaq	OPENSSL_ia32cap_P(%rip),%r10
+	movl	4(%r10),%r10d
+	andl	$268437504,%r10d
+	leaq	16(%rdx),%rax
+	cmpl	$256,%esi
+	je	.L14rounds
+	cmpl	$192,%esi
+	je	.L12rounds
+	cmpl	$128,%esi
+	jne	.Lbad_keybits
+
+.L10rounds:
+	movl	$9,%esi
+	cmpl	$268435456,%r10d
+	je	.L10rounds_alt
+
+	movups	%xmm0,(%rdx)
+.byte	102,15,58,223,200,1
+	call	.Lkey_expansion_128_cold
+.byte	102,15,58,223,200,2
+	call	.Lkey_expansion_128
+.byte	102,15,58,223,200,4
+	call	.Lkey_expansion_128
+.byte	102,15,58,223,200,8
+	call	.Lkey_expansion_128
+.byte	102,15,58,223,200,16
+	call	.Lkey_expansion_128
+.byte	102,15,58,223,200,32
+	call	.Lkey_expansion_128
+.byte	102,15,58,223,200,64
+	call	.Lkey_expansion_128
+.byte	102,15,58,223,200,128
+	call	.Lkey_expansion_128
+.byte	102,15,58,223,200,27
+	call	.Lkey_expansion_128
+.byte	102,15,58,223,200,54
+	call	.Lkey_expansion_128
+	movups	%xmm0,(%rax)
+	movl	%esi,80(%rax)
+	xorl	%eax,%eax
+	jmp	.Lenc_key_ret
+
+.align	16
+.L10rounds_alt:
+	movdqa	.Lkey_rotate(%rip),%xmm5
+	movl	$8,%r10d
+	movdqa	.Lkey_rcon1(%rip),%xmm4
+	movdqa	%xmm0,%xmm2
+	movdqu	%xmm0,(%rdx)
+	jmp	.Loop_key128
+
+.align	16
+.Loop_key128:
+.byte	102,15,56,0,197
+.byte	102,15,56,221,196
+	pslld	$1,%xmm4
+	leaq	16(%rax),%rax
+
+	movdqa	%xmm2,%xmm3
+	pslldq	$4,%xmm2
+	pxor	%xmm2,%xmm3
+	pslldq	$4,%xmm2
+	pxor	%xmm2,%xmm3
+	pslldq	$4,%xmm2
+	pxor	%xmm3,%xmm2
+
+	pxor	%xmm2,%xmm0
+	movdqu	%xmm0,-16(%rax)
+	movdqa	%xmm0,%xmm2
+
+	decl	%r10d
+	jnz	.Loop_key128
+
+	movdqa	.Lkey_rcon1b(%rip),%xmm4
+
+.byte	102,15,56,0,197
+.byte	102,15,56,221,196
+	pslld	$1,%xmm4
+
+	movdqa	%xmm2,%xmm3
+	pslldq	$4,%xmm2
+	pxor	%xmm2,%xmm3
+	pslldq	$4,%xmm2
+	pxor	%xmm2,%xmm3
+	pslldq	$4,%xmm2
+	pxor	%xmm3,%xmm2
+
+	pxor	%xmm2,%xmm0
+	movdqu	%xmm0,(%rax)
+
+	movdqa	%xmm0,%xmm2
+.byte	102,15,56,0,197
+.byte	102,15,56,221,196
+
+	movdqa	%xmm2,%xmm3
+	pslldq	$4,%xmm2
+	pxor	%xmm2,%xmm3
+	pslldq	$4,%xmm2
+	pxor	%xmm2,%xmm3
+	pslldq	$4,%xmm2
+	pxor	%xmm3,%xmm2
+
+	pxor	%xmm2,%xmm0
+	movdqu	%xmm0,16(%rax)
+
+	movl	%esi,96(%rax)
+	xorl	%eax,%eax
+	jmp	.Lenc_key_ret
+
+.align	16
+.L12rounds:
+	movq	16(%rdi),%xmm2
+	movl	$11,%esi
+	cmpl	$268435456,%r10d
+	je	.L12rounds_alt
+
+	movups	%xmm0,(%rdx)
+.byte	102,15,58,223,202,1
+	call	.Lkey_expansion_192a_cold
+.byte	102,15,58,223,202,2
+	call	.Lkey_expansion_192b
+.byte	102,15,58,223,202,4
+	call	.Lkey_expansion_192a
+.byte	102,15,58,223,202,8
+	call	.Lkey_expansion_192b
+.byte	102,15,58,223,202,16
+	call	.Lkey_expansion_192a
+.byte	102,15,58,223,202,32
+	call	.Lkey_expansion_192b
+.byte	102,15,58,223,202,64
+	call	.Lkey_expansion_192a
+.byte	102,15,58,223,202,128
+	call	.Lkey_expansion_192b
+	movups	%xmm0,(%rax)
+	movl	%esi,48(%rax)
+	xorq	%rax,%rax
+	jmp	.Lenc_key_ret
+
+.align	16
+.L12rounds_alt:
+	movdqa	.Lkey_rotate192(%rip),%xmm5
+	movdqa	.Lkey_rcon1(%rip),%xmm4
+	movl	$8,%r10d
+	movdqu	%xmm0,(%rdx)
+	jmp	.Loop_key192
+
+.align	16
+.Loop_key192:
+	movq	%xmm2,0(%rax)
+	movdqa	%xmm2,%xmm1
+.byte	102,15,56,0,213
+.byte	102,15,56,221,212
+	pslld	$1,%xmm4
+	leaq	24(%rax),%rax
+
+	movdqa	%xmm0,%xmm3
+	pslldq	$4,%xmm0
+	pxor	%xmm0,%xmm3
+	pslldq	$4,%xmm0
+	pxor	%xmm0,%xmm3
+	pslldq	$4,%xmm0
+	pxor	%xmm3,%xmm0
+
+	pshufd	$0xff,%xmm0,%xmm3
+	pxor	%xmm1,%xmm3
+	pslldq	$4,%xmm1
+	pxor	%xmm1,%xmm3
+
+	pxor	%xmm2,%xmm0
+	pxor	%xmm3,%xmm2
+	movdqu	%xmm0,-16(%rax)
+
+	decl	%r10d
+	jnz	.Loop_key192
+
+	movl	%esi,32(%rax)
+	xorl	%eax,%eax
+	jmp	.Lenc_key_ret
+
+.align	16
+.L14rounds:
+	movups	16(%rdi),%xmm2
+	movl	$13,%esi
+	leaq	16(%rax),%rax
+	cmpl	$268435456,%r10d
+	je	.L14rounds_alt
+
+	movups	%xmm0,(%rdx)
+	movups	%xmm2,16(%rdx)
+.byte	102,15,58,223,202,1
+	call	.Lkey_expansion_256a_cold
+.byte	102,15,58,223,200,1
+	call	.Lkey_expansion_256b
+.byte	102,15,58,223,202,2
+	call	.Lkey_expansion_256a
+.byte	102,15,58,223,200,2
+	call	.Lkey_expansion_256b
+.byte	102,15,58,223,202,4
+	call	.Lkey_expansion_256a
+.byte	102,15,58,223,200,4
+	call	.Lkey_expansion_256b
+.byte	102,15,58,223,202,8
+	call	.Lkey_expansion_256a
+.byte	102,15,58,223,200,8
+	call	.Lkey_expansion_256b
+.byte	102,15,58,223,202,16
+	call	.Lkey_expansion_256a
+.byte	102,15,58,223,200,16
+	call	.Lkey_expansion_256b
+.byte	102,15,58,223,202,32
+	call	.Lkey_expansion_256a
+.byte	102,15,58,223,200,32
+	call	.Lkey_expansion_256b
+.byte	102,15,58,223,202,64
+	call	.Lkey_expansion_256a
+	movups	%xmm0,(%rax)
+	movl	%esi,16(%rax)
+	xorq	%rax,%rax
+	jmp	.Lenc_key_ret
+
+.align	16
+.L14rounds_alt:
+	movdqa	.Lkey_rotate(%rip),%xmm5
+	movdqa	.Lkey_rcon1(%rip),%xmm4
+	movl	$7,%r10d
+	movdqu	%xmm0,0(%rdx)
+	movdqa	%xmm2,%xmm1
+	movdqu	%xmm2,16(%rdx)
+	jmp	.Loop_key256
+
+.align	16
+.Loop_key256:
+.byte	102,15,56,0,213
+.byte	102,15,56,221,212
+
+	movdqa	%xmm0,%xmm3
+	pslldq	$4,%xmm0
+	pxor	%xmm0,%xmm3
+	pslldq	$4,%xmm0
+	pxor	%xmm0,%xmm3
+	pslldq	$4,%xmm0
+	pxor	%xmm3,%xmm0
+	pslld	$1,%xmm4
+
+	pxor	%xmm2,%xmm0
+	movdqu	%xmm0,(%rax)
+
+	decl	%r10d
+	jz	.Ldone_key256
+
+	pshufd	$0xff,%xmm0,%xmm2
+	pxor	%xmm3,%xmm3
+.byte	102,15,56,221,211
+
+	movdqa	%xmm1,%xmm3
+	pslldq	$4,%xmm1
+	pxor	%xmm1,%xmm3
+	pslldq	$4,%xmm1
+	pxor	%xmm1,%xmm3
+	pslldq	$4,%xmm1
+	pxor	%xmm3,%xmm1
+
+	pxor	%xmm1,%xmm2
+	movdqu	%xmm2,16(%rax)
+	leaq	32(%rax),%rax
+	movdqa	%xmm2,%xmm1
+
+	jmp	.Loop_key256
+
+.Ldone_key256:
+	movl	%esi,16(%rax)
+	xorl	%eax,%eax
+	jmp	.Lenc_key_ret
+
+.align	16
+.Lbad_keybits:
+	movq	$-2,%rax
+.Lenc_key_ret:
+	pxor	%xmm0,%xmm0
+	pxor	%xmm1,%xmm1
+	pxor	%xmm2,%xmm2
+	pxor	%xmm3,%xmm3
+	pxor	%xmm4,%xmm4
+	pxor	%xmm5,%xmm5
+	addq	$8,%rsp
+.cfi_adjust_cfa_offset	-8
+	.byte	0xf3,0xc3
+.cfi_endproc	
+.LSEH_end_set_encrypt_key:
+
+.align	16
+.Lkey_expansion_128:
+	movups	%xmm0,(%rax)
+	leaq	16(%rax),%rax
+.Lkey_expansion_128_cold:
+	shufps	$16,%xmm0,%xmm4
+	xorps	%xmm4,%xmm0
+	shufps	$140,%xmm0,%xmm4
+	xorps	%xmm4,%xmm0
+	shufps	$255,%xmm1,%xmm1
+	xorps	%xmm1,%xmm0
+	.byte	0xf3,0xc3
+
+.align	16
+.Lkey_expansion_192a:
+	movups	%xmm0,(%rax)
+	leaq	16(%rax),%rax
+.Lkey_expansion_192a_cold:
+	movaps	%xmm2,%xmm5
+.Lkey_expansion_192b_warm:
+	shufps	$16,%xmm0,%xmm4
+	movdqa	%xmm2,%xmm3
+	xorps	%xmm4,%xmm0
+	shufps	$140,%xmm0,%xmm4
+	pslldq	$4,%xmm3
+	xorps	%xmm4,%xmm0
+	pshufd	$85,%xmm1,%xmm1
+	pxor	%xmm3,%xmm2
+	pxor	%xmm1,%xmm0
+	pshufd	$255,%xmm0,%xmm3
+	pxor	%xmm3,%xmm2
+	.byte	0xf3,0xc3
+
+.align	16
+.Lkey_expansion_192b:
+	movaps	%xmm0,%xmm3
+	shufps	$68,%xmm0,%xmm5
+	movups	%xmm5,(%rax)
+	shufps	$78,%xmm2,%xmm3
+	movups	%xmm3,16(%rax)
+	leaq	32(%rax),%rax
+	jmp	.Lkey_expansion_192b_warm
+
+.align	16
+.Lkey_expansion_256a:
+	movups	%xmm2,(%rax)
+	leaq	16(%rax),%rax
+.Lkey_expansion_256a_cold:
+	shufps	$16,%xmm0,%xmm4
+	xorps	%xmm4,%xmm0
+	shufps	$140,%xmm0,%xmm4
+	xorps	%xmm4,%xmm0
+	shufps	$255,%xmm1,%xmm1
+	xorps	%xmm1,%xmm0
+	.byte	0xf3,0xc3
+
+.align	16
+.Lkey_expansion_256b:
+	movups	%xmm0,(%rax)
+	leaq	16(%rax),%rax
+
+	shufps	$16,%xmm2,%xmm4
+	xorps	%xmm4,%xmm2
+	shufps	$140,%xmm2,%xmm4
+	xorps	%xmm4,%xmm2
+	shufps	$170,%xmm1,%xmm1
+	xorps	%xmm1,%xmm2
+	.byte	0xf3,0xc3
+.size	aes_hw_set_encrypt_key,.-aes_hw_set_encrypt_key
+.size	__aesni_set_encrypt_key,.-__aesni_set_encrypt_key
+.align	64
+.Lbswap_mask:
+.byte	15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0
+.Lincrement32:
+.long	6,6,6,0
+.Lincrement64:
+.long	1,0,0,0
+.Lxts_magic:
+.long	0x87,0,1,0
+.Lincrement1:
+.byte	0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1
+.Lkey_rotate:
+.long	0x0c0f0e0d,0x0c0f0e0d,0x0c0f0e0d,0x0c0f0e0d
+.Lkey_rotate192:
+.long	0x04070605,0x04070605,0x04070605,0x04070605
+.Lkey_rcon1:
+.long	1,1,1,1
+.Lkey_rcon1b:
+.long	0x1b,0x1b,0x1b,0x1b
+
+.byte	65,69,83,32,102,111,114,32,73,110,116,101,108,32,65,69,83,45,78,73,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0
+.align	64
+#endif
+.section	.note.GNU-stack,"",@progbits
--- /dev/null
+++ b/third_party/boringssl/linux-x86_64/crypto/fipsmodule/ghash-ssse3-x86_64.S
@@ -1,0 +1,427 @@
+// This file is generated from a similarly-named Perl script in the BoringSSL
+// source tree. Do not edit by hand.
+
+#if defined(__has_feature)
+#if __has_feature(memory_sanitizer) && !defined(OPENSSL_NO_ASM)
+#define OPENSSL_NO_ASM
+#endif
+#endif
+
+#if defined(__x86_64__) && !defined(OPENSSL_NO_ASM)
+#if defined(BORINGSSL_PREFIX)
+#include <boringssl_prefix_symbols_asm.h>
+#endif
+.text	
+
+
+
+
+
+.type	gcm_gmult_ssse3, @function
+.globl	gcm_gmult_ssse3
+.hidden gcm_gmult_ssse3
+.align	16
+gcm_gmult_ssse3:
+.cfi_startproc	
+.Lgmult_seh_begin:
+	movdqu	(%rdi),%xmm0
+	movdqa	.Lreverse_bytes(%rip),%xmm10
+	movdqa	.Llow4_mask(%rip),%xmm2
+
+
+.byte	102,65,15,56,0,194
+
+
+	movdqa	%xmm2,%xmm1
+	pandn	%xmm0,%xmm1
+	psrld	$4,%xmm1
+	pand	%xmm2,%xmm0
+
+
+
+
+	pxor	%xmm2,%xmm2
+	pxor	%xmm3,%xmm3
+	movq	$5,%rax
+.Loop_row_1:
+	movdqa	(%rsi),%xmm4
+	leaq	16(%rsi),%rsi
+
+
+	movdqa	%xmm2,%xmm6
+.byte	102,15,58,15,243,1
+	movdqa	%xmm6,%xmm3
+	psrldq	$1,%xmm2
+
+
+
+
+	movdqa	%xmm4,%xmm5
+.byte	102,15,56,0,224
+.byte	102,15,56,0,233
+
+
+	pxor	%xmm5,%xmm2
+
+
+
+	movdqa	%xmm4,%xmm5
+	psllq	$60,%xmm5
+	movdqa	%xmm5,%xmm6
+	pslldq	$8,%xmm6
+	pxor	%xmm6,%xmm3
+
+
+	psrldq	$8,%xmm5
+	pxor	%xmm5,%xmm2
+	psrlq	$4,%xmm4
+	pxor	%xmm4,%xmm2
+
+	subq	$1,%rax
+	jnz	.Loop_row_1
+
+
+
+	pxor	%xmm3,%xmm2
+	psrlq	$1,%xmm3
+	pxor	%xmm3,%xmm2
+	psrlq	$1,%xmm3
+	pxor	%xmm3,%xmm2
+	psrlq	$5,%xmm3
+	pxor	%xmm3,%xmm2
+	pxor	%xmm3,%xmm3
+	movq	$5,%rax
+.Loop_row_2:
+	movdqa	(%rsi),%xmm4
+	leaq	16(%rsi),%rsi
+
+
+	movdqa	%xmm2,%xmm6
+.byte	102,15,58,15,243,1
+	movdqa	%xmm6,%xmm3
+	psrldq	$1,%xmm2
+
+
+
+
+	movdqa	%xmm4,%xmm5
+.byte	102,15,56,0,224
+.byte	102,15,56,0,233
+
+
+	pxor	%xmm5,%xmm2
+
+
+
+	movdqa	%xmm4,%xmm5
+	psllq	$60,%xmm5
+	movdqa	%xmm5,%xmm6
+	pslldq	$8,%xmm6
+	pxor	%xmm6,%xmm3
+
+
+	psrldq	$8,%xmm5
+	pxor	%xmm5,%xmm2
+	psrlq	$4,%xmm4
+	pxor	%xmm4,%xmm2
+
+	subq	$1,%rax
+	jnz	.Loop_row_2
+
+
+
+	pxor	%xmm3,%xmm2
+	psrlq	$1,%xmm3
+	pxor	%xmm3,%xmm2
+	psrlq	$1,%xmm3
+	pxor	%xmm3,%xmm2
+	psrlq	$5,%xmm3
+	pxor	%xmm3,%xmm2
+	pxor	%xmm3,%xmm3
+	movq	$6,%rax
+.Loop_row_3:
+	movdqa	(%rsi),%xmm4
+	leaq	16(%rsi),%rsi
+
+
+	movdqa	%xmm2,%xmm6
+.byte	102,15,58,15,243,1
+	movdqa	%xmm6,%xmm3
+	psrldq	$1,%xmm2
+
+
+
+
+	movdqa	%xmm4,%xmm5
+.byte	102,15,56,0,224
+.byte	102,15,56,0,233
+
+
+	pxor	%xmm5,%xmm2
+
+
+
+	movdqa	%xmm4,%xmm5
+	psllq	$60,%xmm5
+	movdqa	%xmm5,%xmm6
+	pslldq	$8,%xmm6
+	pxor	%xmm6,%xmm3
+
+
+	psrldq	$8,%xmm5
+	pxor	%xmm5,%xmm2
+	psrlq	$4,%xmm4
+	pxor	%xmm4,%xmm2
+
+	subq	$1,%rax
+	jnz	.Loop_row_3
+
+
+
+	pxor	%xmm3,%xmm2
+	psrlq	$1,%xmm3
+	pxor	%xmm3,%xmm2
+	psrlq	$1,%xmm3
+	pxor	%xmm3,%xmm2
+	psrlq	$5,%xmm3
+	pxor	%xmm3,%xmm2
+	pxor	%xmm3,%xmm3
+
+.byte	102,65,15,56,0,210
+	movdqu	%xmm2,(%rdi)
+
+
+	pxor	%xmm0,%xmm0
+	pxor	%xmm1,%xmm1
+	pxor	%xmm2,%xmm2
+	pxor	%xmm3,%xmm3
+	pxor	%xmm4,%xmm4
+	pxor	%xmm5,%xmm5
+	pxor	%xmm6,%xmm6
+	.byte	0xf3,0xc3
+.Lgmult_seh_end:
+.cfi_endproc	
+.size	gcm_gmult_ssse3,.-gcm_gmult_ssse3
+
+
+
+
+
+.type	gcm_ghash_ssse3, @function
+.globl	gcm_ghash_ssse3
+.hidden gcm_ghash_ssse3
+.align	16
+gcm_ghash_ssse3:
+.Lghash_seh_begin:
+.cfi_startproc	
+	movdqu	(%rdi),%xmm0
+	movdqa	.Lreverse_bytes(%rip),%xmm10
+	movdqa	.Llow4_mask(%rip),%xmm11
+
+
+	andq	$-16,%rcx
+
+
+
+.byte	102,65,15,56,0,194
+
+
+	pxor	%xmm3,%xmm3
+.Loop_ghash:
+
+	movdqu	(%rdx),%xmm1
+.byte	102,65,15,56,0,202
+	pxor	%xmm1,%xmm0
+
+
+	movdqa	%xmm11,%xmm1
+	pandn	%xmm0,%xmm1
+	psrld	$4,%xmm1
+	pand	%xmm11,%xmm0
+
+
+
+
+	pxor	%xmm2,%xmm2
+
+	movq	$5,%rax
+.Loop_row_4:
+	movdqa	(%rsi),%xmm4
+	leaq	16(%rsi),%rsi
+
+
+	movdqa	%xmm2,%xmm6
+.byte	102,15,58,15,243,1
+	movdqa	%xmm6,%xmm3
+	psrldq	$1,%xmm2
+
+
+
+
+	movdqa	%xmm4,%xmm5
+.byte	102,15,56,0,224
+.byte	102,15,56,0,233
+
+
+	pxor	%xmm5,%xmm2
+
+
+
+	movdqa	%xmm4,%xmm5
+	psllq	$60,%xmm5
+	movdqa	%xmm5,%xmm6
+	pslldq	$8,%xmm6
+	pxor	%xmm6,%xmm3
+
+
+	psrldq	$8,%xmm5
+	pxor	%xmm5,%xmm2
+	psrlq	$4,%xmm4
+	pxor	%xmm4,%xmm2
+
+	subq	$1,%rax
+	jnz	.Loop_row_4
+
+
+
+	pxor	%xmm3,%xmm2
+	psrlq	$1,%xmm3
+	pxor	%xmm3,%xmm2
+	psrlq	$1,%xmm3
+	pxor	%xmm3,%xmm2
+	psrlq	$5,%xmm3
+	pxor	%xmm3,%xmm2
+	pxor	%xmm3,%xmm3
+	movq	$5,%rax
+.Loop_row_5:
+	movdqa	(%rsi),%xmm4
+	leaq	16(%rsi),%rsi
+
+
+	movdqa	%xmm2,%xmm6
+.byte	102,15,58,15,243,1
+	movdqa	%xmm6,%xmm3
+	psrldq	$1,%xmm2
+
+
+
+
+	movdqa	%xmm4,%xmm5
+.byte	102,15,56,0,224
+.byte	102,15,56,0,233
+
+
+	pxor	%xmm5,%xmm2
+
+
+
+	movdqa	%xmm4,%xmm5
+	psllq	$60,%xmm5
+	movdqa	%xmm5,%xmm6
+	pslldq	$8,%xmm6
+	pxor	%xmm6,%xmm3
+
+
+	psrldq	$8,%xmm5
+	pxor	%xmm5,%xmm2
+	psrlq	$4,%xmm4
+	pxor	%xmm4,%xmm2
+
+	subq	$1,%rax
+	jnz	.Loop_row_5
+
+
+
+	pxor	%xmm3,%xmm2
+	psrlq	$1,%xmm3
+	pxor	%xmm3,%xmm2
+	psrlq	$1,%xmm3
+	pxor	%xmm3,%xmm2
+	psrlq	$5,%xmm3
+	pxor	%xmm3,%xmm2
+	pxor	%xmm3,%xmm3
+	movq	$6,%rax
+.Loop_row_6:
+	movdqa	(%rsi),%xmm4
+	leaq	16(%rsi),%rsi
+
+
+	movdqa	%xmm2,%xmm6
+.byte	102,15,58,15,243,1
+	movdqa	%xmm6,%xmm3
+	psrldq	$1,%xmm2
+
+
+
+
+	movdqa	%xmm4,%xmm5
+.byte	102,15,56,0,224
+.byte	102,15,56,0,233
+
+
+	pxor	%xmm5,%xmm2
+
+
+
+	movdqa	%xmm4,%xmm5
+	psllq	$60,%xmm5
+	movdqa	%xmm5,%xmm6
+	pslldq	$8,%xmm6
+	pxor	%xmm6,%xmm3
+
+
+	psrldq	$8,%xmm5
+	pxor	%xmm5,%xmm2
+	psrlq	$4,%xmm4
+	pxor	%xmm4,%xmm2
+
+	subq	$1,%rax
+	jnz	.Loop_row_6
+
+
+
+	pxor	%xmm3,%xmm2
+	psrlq	$1,%xmm3
+	pxor	%xmm3,%xmm2
+	psrlq	$1,%xmm3
+	pxor	%xmm3,%xmm2
+	psrlq	$5,%xmm3
+	pxor	%xmm3,%xmm2
+	pxor	%xmm3,%xmm3
+	movdqa	%xmm2,%xmm0
+
+
+	leaq	-256(%rsi),%rsi
+
+
+	leaq	16(%rdx),%rdx
+	subq	$16,%rcx
+	jnz	.Loop_ghash
+
+
+.byte	102,65,15,56,0,194
+	movdqu	%xmm0,(%rdi)
+
+
+	pxor	%xmm0,%xmm0
+	pxor	%xmm1,%xmm1
+	pxor	%xmm2,%xmm2
+	pxor	%xmm3,%xmm3
+	pxor	%xmm4,%xmm4
+	pxor	%xmm5,%xmm5
+	pxor	%xmm6,%xmm6
+	.byte	0xf3,0xc3
+.Lghash_seh_end:
+.cfi_endproc	
+.size	gcm_ghash_ssse3,.-gcm_ghash_ssse3
+
+.align	16
+
+
+.Lreverse_bytes:
+.byte	15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0
+
+.Llow4_mask:
+.quad	0x0f0f0f0f0f0f0f0f, 0x0f0f0f0f0f0f0f0f
+#endif
+.section	.note.GNU-stack,"",@progbits
--- /dev/null
+++ b/third_party/boringssl/linux-x86_64/crypto/fipsmodule/ghash-x86_64.S
@@ -1,0 +1,1127 @@
+// This file is generated from a similarly-named Perl script in the BoringSSL
+// source tree. Do not edit by hand.
+
+#if defined(__has_feature)
+#if __has_feature(memory_sanitizer) && !defined(OPENSSL_NO_ASM)
+#define OPENSSL_NO_ASM
+#endif
+#endif
+
+#if defined(__x86_64__) && !defined(OPENSSL_NO_ASM)
+#if defined(BORINGSSL_PREFIX)
+#include <boringssl_prefix_symbols_asm.h>
+#endif
+.text	
+.extern	OPENSSL_ia32cap_P
+.hidden OPENSSL_ia32cap_P
+.globl	gcm_init_clmul
+.hidden gcm_init_clmul
+.type	gcm_init_clmul,@function
+.align	16
+gcm_init_clmul:
+.cfi_startproc	
+.L_init_clmul:
+	movdqu	(%rsi),%xmm2
+	pshufd	$78,%xmm2,%xmm2
+
+
+	pshufd	$255,%xmm2,%xmm4
+	movdqa	%xmm2,%xmm3
+	psllq	$1,%xmm2
+	pxor	%xmm5,%xmm5
+	psrlq	$63,%xmm3
+	pcmpgtd	%xmm4,%xmm5
+	pslldq	$8,%xmm3
+	por	%xmm3,%xmm2
+
+
+	pand	.L0x1c2_polynomial(%rip),%xmm5
+	pxor	%xmm5,%xmm2
+
+
+	pshufd	$78,%xmm2,%xmm6
+	movdqa	%xmm2,%xmm0
+	pxor	%xmm2,%xmm6
+	movdqa	%xmm0,%xmm1
+	pshufd	$78,%xmm0,%xmm3
+	pxor	%xmm0,%xmm3
+.byte	102,15,58,68,194,0
+.byte	102,15,58,68,202,17
+.byte	102,15,58,68,222,0
+	pxor	%xmm0,%xmm3
+	pxor	%xmm1,%xmm3
+
+	movdqa	%xmm3,%xmm4
+	psrldq	$8,%xmm3
+	pslldq	$8,%xmm4
+	pxor	%xmm3,%xmm1
+	pxor	%xmm4,%xmm0
+
+	movdqa	%xmm0,%xmm4
+	movdqa	%xmm0,%xmm3
+	psllq	$5,%xmm0
+	pxor	%xmm0,%xmm3
+	psllq	$1,%xmm0
+	pxor	%xmm3,%xmm0
+	psllq	$57,%xmm0
+	movdqa	%xmm0,%xmm3
+	pslldq	$8,%xmm0
+	psrldq	$8,%xmm3
+	pxor	%xmm4,%xmm0
+	pxor	%xmm3,%xmm1
+
+
+	movdqa	%xmm0,%xmm4
+	psrlq	$1,%xmm0
+	pxor	%xmm4,%xmm1
+	pxor	%xmm0,%xmm4
+	psrlq	$5,%xmm0
+	pxor	%xmm4,%xmm0
+	psrlq	$1,%xmm0
+	pxor	%xmm1,%xmm0
+	pshufd	$78,%xmm2,%xmm3
+	pshufd	$78,%xmm0,%xmm4
+	pxor	%xmm2,%xmm3
+	movdqu	%xmm2,0(%rdi)
+	pxor	%xmm0,%xmm4
+	movdqu	%xmm0,16(%rdi)
+.byte	102,15,58,15,227,8
+	movdqu	%xmm4,32(%rdi)
+	movdqa	%xmm0,%xmm1
+	pshufd	$78,%xmm0,%xmm3
+	pxor	%xmm0,%xmm3
+.byte	102,15,58,68,194,0
+.byte	102,15,58,68,202,17
+.byte	102,15,58,68,222,0
+	pxor	%xmm0,%xmm3
+	pxor	%xmm1,%xmm3
+
+	movdqa	%xmm3,%xmm4
+	psrldq	$8,%xmm3
+	pslldq	$8,%xmm4
+	pxor	%xmm3,%xmm1
+	pxor	%xmm4,%xmm0
+
+	movdqa	%xmm0,%xmm4
+	movdqa	%xmm0,%xmm3
+	psllq	$5,%xmm0
+	pxor	%xmm0,%xmm3
+	psllq	$1,%xmm0
+	pxor	%xmm3,%xmm0
+	psllq	$57,%xmm0
+	movdqa	%xmm0,%xmm3
+	pslldq	$8,%xmm0
+	psrldq	$8,%xmm3
+	pxor	%xmm4,%xmm0
+	pxor	%xmm3,%xmm1
+
+
+	movdqa	%xmm0,%xmm4
+	psrlq	$1,%xmm0
+	pxor	%xmm4,%xmm1
+	pxor	%xmm0,%xmm4
+	psrlq	$5,%xmm0
+	pxor	%xmm4,%xmm0
+	psrlq	$1,%xmm0
+	pxor	%xmm1,%xmm0
+	movdqa	%xmm0,%xmm5
+	movdqa	%xmm0,%xmm1
+	pshufd	$78,%xmm0,%xmm3
+	pxor	%xmm0,%xmm3
+.byte	102,15,58,68,194,0
+.byte	102,15,58,68,202,17
+.byte	102,15,58,68,222,0
+	pxor	%xmm0,%xmm3
+	pxor	%xmm1,%xmm3
+
+	movdqa	%xmm3,%xmm4
+	psrldq	$8,%xmm3
+	pslldq	$8,%xmm4
+	pxor	%xmm3,%xmm1
+	pxor	%xmm4,%xmm0
+
+	movdqa	%xmm0,%xmm4
+	movdqa	%xmm0,%xmm3
+	psllq	$5,%xmm0
+	pxor	%xmm0,%xmm3
+	psllq	$1,%xmm0
+	pxor	%xmm3,%xmm0
+	psllq	$57,%xmm0
+	movdqa	%xmm0,%xmm3
+	pslldq	$8,%xmm0
+	psrldq	$8,%xmm3
+	pxor	%xmm4,%xmm0
+	pxor	%xmm3,%xmm1
+
+
+	movdqa	%xmm0,%xmm4
+	psrlq	$1,%xmm0
+	pxor	%xmm4,%xmm1
+	pxor	%xmm0,%xmm4
+	psrlq	$5,%xmm0
+	pxor	%xmm4,%xmm0
+	psrlq	$1,%xmm0
+	pxor	%xmm1,%xmm0
+	pshufd	$78,%xmm5,%xmm3
+	pshufd	$78,%xmm0,%xmm4
+	pxor	%xmm5,%xmm3
+	movdqu	%xmm5,48(%rdi)
+	pxor	%xmm0,%xmm4
+	movdqu	%xmm0,64(%rdi)
+.byte	102,15,58,15,227,8
+	movdqu	%xmm4,80(%rdi)
+	.byte	0xf3,0xc3
+.cfi_endproc	
+.size	gcm_init_clmul,.-gcm_init_clmul
+.globl	gcm_gmult_clmul
+.hidden gcm_gmult_clmul
+.type	gcm_gmult_clmul,@function
+.align	16
+gcm_gmult_clmul:
+.cfi_startproc	
+.L_gmult_clmul:
+	movdqu	(%rdi),%xmm0
+	movdqa	.Lbswap_mask(%rip),%xmm5
+	movdqu	(%rsi),%xmm2
+	movdqu	32(%rsi),%xmm4
+.byte	102,15,56,0,197
+	movdqa	%xmm0,%xmm1
+	pshufd	$78,%xmm0,%xmm3
+	pxor	%xmm0,%xmm3
+.byte	102,15,58,68,194,0
+.byte	102,15,58,68,202,17
+.byte	102,15,58,68,220,0
+	pxor	%xmm0,%xmm3
+	pxor	%xmm1,%xmm3
+
+	movdqa	%xmm3,%xmm4
+	psrldq	$8,%xmm3
+	pslldq	$8,%xmm4
+	pxor	%xmm3,%xmm1
+	pxor	%xmm4,%xmm0
+
+	movdqa	%xmm0,%xmm4
+	movdqa	%xmm0,%xmm3
+	psllq	$5,%xmm0
+	pxor	%xmm0,%xmm3
+	psllq	$1,%xmm0
+	pxor	%xmm3,%xmm0
+	psllq	$57,%xmm0
+	movdqa	%xmm0,%xmm3
+	pslldq	$8,%xmm0
+	psrldq	$8,%xmm3
+	pxor	%xmm4,%xmm0
+	pxor	%xmm3,%xmm1
+
+
+	movdqa	%xmm0,%xmm4
+	psrlq	$1,%xmm0
+	pxor	%xmm4,%xmm1
+	pxor	%xmm0,%xmm4
+	psrlq	$5,%xmm0
+	pxor	%xmm4,%xmm0
+	psrlq	$1,%xmm0
+	pxor	%xmm1,%xmm0
+.byte	102,15,56,0,197
+	movdqu	%xmm0,(%rdi)
+	.byte	0xf3,0xc3
+.cfi_endproc	
+.size	gcm_gmult_clmul,.-gcm_gmult_clmul
+.globl	gcm_ghash_clmul
+.hidden gcm_ghash_clmul
+.type	gcm_ghash_clmul,@function
+.align	32
+gcm_ghash_clmul:
+.cfi_startproc	
+.L_ghash_clmul:
+	movdqa	.Lbswap_mask(%rip),%xmm10
+
+	movdqu	(%rdi),%xmm0
+	movdqu	(%rsi),%xmm2
+	movdqu	32(%rsi),%xmm7
+.byte	102,65,15,56,0,194
+
+	subq	$0x10,%rcx
+	jz	.Lodd_tail
+
+	movdqu	16(%rsi),%xmm6
+	leaq	OPENSSL_ia32cap_P(%rip),%rax
+	movl	4(%rax),%eax
+	cmpq	$0x30,%rcx
+	jb	.Lskip4x
+
+	andl	$71303168,%eax
+	cmpl	$4194304,%eax
+	je	.Lskip4x
+
+	subq	$0x30,%rcx
+	movq	$0xA040608020C0E000,%rax
+	movdqu	48(%rsi),%xmm14
+	movdqu	64(%rsi),%xmm15
+
+
+
+
+	movdqu	48(%rdx),%xmm3
+	movdqu	32(%rdx),%xmm11
+.byte	102,65,15,56,0,218
+.byte	102,69,15,56,0,218
+	movdqa	%xmm3,%xmm5
+	pshufd	$78,%xmm3,%xmm4
+	pxor	%xmm3,%xmm4
+.byte	102,15,58,68,218,0
+.byte	102,15,58,68,234,17
+.byte	102,15,58,68,231,0
+
+	movdqa	%xmm11,%xmm13
+	pshufd	$78,%xmm11,%xmm12
+	pxor	%xmm11,%xmm12
+.byte	102,68,15,58,68,222,0
+.byte	102,68,15,58,68,238,17
+.byte	102,68,15,58,68,231,16
+	xorps	%xmm11,%xmm3
+	xorps	%xmm13,%xmm5
+	movups	80(%rsi),%xmm7
+	xorps	%xmm12,%xmm4
+
+	movdqu	16(%rdx),%xmm11
+	movdqu	0(%rdx),%xmm8
+.byte	102,69,15,56,0,218
+.byte	102,69,15,56,0,194
+	movdqa	%xmm11,%xmm13
+	pshufd	$78,%xmm11,%xmm12
+	pxor	%xmm8,%xmm0
+	pxor	%xmm11,%xmm12
+.byte	102,69,15,58,68,222,0
+	movdqa	%xmm0,%xmm1
+	pshufd	$78,%xmm0,%xmm8
+	pxor	%xmm0,%xmm8
+.byte	102,69,15,58,68,238,17
+.byte	102,68,15,58,68,231,0
+	xorps	%xmm11,%xmm3
+	xorps	%xmm13,%xmm5
+
+	leaq	64(%rdx),%rdx
+	subq	$0x40,%rcx
+	jc	.Ltail4x
+
+	jmp	.Lmod4_loop
+.align	32
+.Lmod4_loop:
+.byte	102,65,15,58,68,199,0
+	xorps	%xmm12,%xmm4
+	movdqu	48(%rdx),%xmm11
+.byte	102,69,15,56,0,218
+.byte	102,65,15,58,68,207,17
+	xorps	%xmm3,%xmm0
+	movdqu	32(%rdx),%xmm3
+	movdqa	%xmm11,%xmm13
+.byte	102,68,15,58,68,199,16
+	pshufd	$78,%xmm11,%xmm12
+	xorps	%xmm5,%xmm1
+	pxor	%xmm11,%xmm12
+.byte	102,65,15,56,0,218
+	movups	32(%rsi),%xmm7
+	xorps	%xmm4,%xmm8
+.byte	102,68,15,58,68,218,0
+	pshufd	$78,%xmm3,%xmm4
+
+	pxor	%xmm0,%xmm8
+	movdqa	%xmm3,%xmm5
+	pxor	%xmm1,%xmm8
+	pxor	%xmm3,%xmm4
+	movdqa	%xmm8,%xmm9
+.byte	102,68,15,58,68,234,17
+	pslldq	$8,%xmm8
+	psrldq	$8,%xmm9
+	pxor	%xmm8,%xmm0
+	movdqa	.L7_mask(%rip),%xmm8
+	pxor	%xmm9,%xmm1
+.byte	102,76,15,110,200
+
+	pand	%xmm0,%xmm8
+.byte	102,69,15,56,0,200
+	pxor	%xmm0,%xmm9
+.byte	102,68,15,58,68,231,0
+	psllq	$57,%xmm9
+	movdqa	%xmm9,%xmm8
+	pslldq	$8,%xmm9
+.byte	102,15,58,68,222,0
+	psrldq	$8,%xmm8
+	pxor	%xmm9,%xmm0
+	pxor	%xmm8,%xmm1
+	movdqu	0(%rdx),%xmm8
+
+	movdqa	%xmm0,%xmm9
+	psrlq	$1,%xmm0
+.byte	102,15,58,68,238,17
+	xorps	%xmm11,%xmm3
+	movdqu	16(%rdx),%xmm11
+.byte	102,69,15,56,0,218
+.byte	102,15,58,68,231,16
+	xorps	%xmm13,%xmm5
+	movups	80(%rsi),%xmm7
+.byte	102,69,15,56,0,194
+	pxor	%xmm9,%xmm1
+	pxor	%xmm0,%xmm9
+	psrlq	$5,%xmm0
+
+	movdqa	%xmm11,%xmm13
+	pxor	%xmm12,%xmm4
+	pshufd	$78,%xmm11,%xmm12
+	pxor	%xmm9,%xmm0
+	pxor	%xmm8,%xmm1
+	pxor	%xmm11,%xmm12
+.byte	102,69,15,58,68,222,0
+	psrlq	$1,%xmm0
+	pxor	%xmm1,%xmm0
+	movdqa	%xmm0,%xmm1
+.byte	102,69,15,58,68,238,17
+	xorps	%xmm11,%xmm3
+	pshufd	$78,%xmm0,%xmm8
+	pxor	%xmm0,%xmm8
+
+.byte	102,68,15,58,68,231,0
+	xorps	%xmm13,%xmm5
+
+	leaq	64(%rdx),%rdx
+	subq	$0x40,%rcx
+	jnc	.Lmod4_loop
+
+.Ltail4x:
+.byte	102,65,15,58,68,199,0
+.byte	102,65,15,58,68,207,17
+.byte	102,68,15,58,68,199,16
+	xorps	%xmm12,%xmm4
+	xorps	%xmm3,%xmm0
+	xorps	%xmm5,%xmm1
+	pxor	%xmm0,%xmm1
+	pxor	%xmm4,%xmm8
+
+	pxor	%xmm1,%xmm8
+	pxor	%xmm0,%xmm1
+
+	movdqa	%xmm8,%xmm9
+	psrldq	$8,%xmm8
+	pslldq	$8,%xmm9
+	pxor	%xmm8,%xmm1
+	pxor	%xmm9,%xmm0
+
+	movdqa	%xmm0,%xmm4
+	movdqa	%xmm0,%xmm3
+	psllq	$5,%xmm0
+	pxor	%xmm0,%xmm3
+	psllq	$1,%xmm0
+	pxor	%xmm3,%xmm0
+	psllq	$57,%xmm0
+	movdqa	%xmm0,%xmm3
+	pslldq	$8,%xmm0
+	psrldq	$8,%xmm3
+	pxor	%xmm4,%xmm0
+	pxor	%xmm3,%xmm1
+
+
+	movdqa	%xmm0,%xmm4
+	psrlq	$1,%xmm0
+	pxor	%xmm4,%xmm1
+	pxor	%xmm0,%xmm4
+	psrlq	$5,%xmm0
+	pxor	%xmm4,%xmm0
+	psrlq	$1,%xmm0
+	pxor	%xmm1,%xmm0
+	addq	$0x40,%rcx
+	jz	.Ldone
+	movdqu	32(%rsi),%xmm7
+	subq	$0x10,%rcx
+	jz	.Lodd_tail
+.Lskip4x:
+
+
+
+
+
+	movdqu	(%rdx),%xmm8
+	movdqu	16(%rdx),%xmm3
+.byte	102,69,15,56,0,194
+.byte	102,65,15,56,0,218
+	pxor	%xmm8,%xmm0
+
+	movdqa	%xmm3,%xmm5
+	pshufd	$78,%xmm3,%xmm4
+	pxor	%xmm3,%xmm4
+.byte	102,15,58,68,218,0
+.byte	102,15,58,68,234,17
+.byte	102,15,58,68,231,0
+
+	leaq	32(%rdx),%rdx
+	nop
+	subq	$0x20,%rcx
+	jbe	.Leven_tail
+	nop
+	jmp	.Lmod_loop
+
+.align	32
+.Lmod_loop:
+	movdqa	%xmm0,%xmm1
+	movdqa	%xmm4,%xmm8
+	pshufd	$78,%xmm0,%xmm4
+	pxor	%xmm0,%xmm4
+
+.byte	102,15,58,68,198,0
+.byte	102,15,58,68,206,17
+.byte	102,15,58,68,231,16
+
+	pxor	%xmm3,%xmm0
+	pxor	%xmm5,%xmm1
+	movdqu	(%rdx),%xmm9
+	pxor	%xmm0,%xmm8
+.byte	102,69,15,56,0,202
+	movdqu	16(%rdx),%xmm3
+
+	pxor	%xmm1,%xmm8
+	pxor	%xmm9,%xmm1
+	pxor	%xmm8,%xmm4
+.byte	102,65,15,56,0,218
+	movdqa	%xmm4,%xmm8
+	psrldq	$8,%xmm8
+	pslldq	$8,%xmm4
+	pxor	%xmm8,%xmm1
+	pxor	%xmm4,%xmm0
+
+	movdqa	%xmm3,%xmm5
+
+	movdqa	%xmm0,%xmm9
+	movdqa	%xmm0,%xmm8
+	psllq	$5,%xmm0
+	pxor	%xmm0,%xmm8
+.byte	102,15,58,68,218,0
+	psllq	$1,%xmm0
+	pxor	%xmm8,%xmm0
+	psllq	$57,%xmm0
+	movdqa	%xmm0,%xmm8
+	pslldq	$8,%xmm0
+	psrldq	$8,%xmm8
+	pxor	%xmm9,%xmm0
+	pshufd	$78,%xmm5,%xmm4
+	pxor	%xmm8,%xmm1
+	pxor	%xmm5,%xmm4
+
+	movdqa	%xmm0,%xmm9
+	psrlq	$1,%xmm0
+.byte	102,15,58,68,234,17
+	pxor	%xmm9,%xmm1
+	pxor	%xmm0,%xmm9
+	psrlq	$5,%xmm0
+	pxor	%xmm9,%xmm0
+	leaq	32(%rdx),%rdx
+	psrlq	$1,%xmm0
+.byte	102,15,58,68,231,0
+	pxor	%xmm1,%xmm0
+
+	subq	$0x20,%rcx
+	ja	.Lmod_loop
+
+.Leven_tail:
+	movdqa	%xmm0,%xmm1
+	movdqa	%xmm4,%xmm8
+	pshufd	$78,%xmm0,%xmm4
+	pxor	%xmm0,%xmm4
+
+.byte	102,15,58,68,198,0
+.byte	102,15,58,68,206,17
+.byte	102,15,58,68,231,16
+
+	pxor	%xmm3,%xmm0
+	pxor	%xmm5,%xmm1
+	pxor	%xmm0,%xmm8
+	pxor	%xmm1,%xmm8
+	pxor	%xmm8,%xmm4
+	movdqa	%xmm4,%xmm8
+	psrldq	$8,%xmm8
+	pslldq	$8,%xmm4
+	pxor	%xmm8,%xmm1
+	pxor	%xmm4,%xmm0
+
+	movdqa	%xmm0,%xmm4
+	movdqa	%xmm0,%xmm3
+	psllq	$5,%xmm0
+	pxor	%xmm0,%xmm3
+	psllq	$1,%xmm0
+	pxor	%xmm3,%xmm0
+	psllq	$57,%xmm0
+	movdqa	%xmm0,%xmm3
+	pslldq	$8,%xmm0
+	psrldq	$8,%xmm3
+	pxor	%xmm4,%xmm0
+	pxor	%xmm3,%xmm1
+
+
+	movdqa	%xmm0,%xmm4
+	psrlq	$1,%xmm0
+	pxor	%xmm4,%xmm1
+	pxor	%xmm0,%xmm4
+	psrlq	$5,%xmm0
+	pxor	%xmm4,%xmm0
+	psrlq	$1,%xmm0
+	pxor	%xmm1,%xmm0
+	testq	%rcx,%rcx
+	jnz	.Ldone
+
+.Lodd_tail:
+	movdqu	(%rdx),%xmm8
+.byte	102,69,15,56,0,194
+	pxor	%xmm8,%xmm0
+	movdqa	%xmm0,%xmm1
+	pshufd	$78,%xmm0,%xmm3
+	pxor	%xmm0,%xmm3
+.byte	102,15,58,68,194,0
+.byte	102,15,58,68,202,17
+.byte	102,15,58,68,223,0
+	pxor	%xmm0,%xmm3
+	pxor	%xmm1,%xmm3
+
+	movdqa	%xmm3,%xmm4
+	psrldq	$8,%xmm3
+	pslldq	$8,%xmm4
+	pxor	%xmm3,%xmm1
+	pxor	%xmm4,%xmm0
+
+	movdqa	%xmm0,%xmm4
+	movdqa	%xmm0,%xmm3
+	psllq	$5,%xmm0
+	pxor	%xmm0,%xmm3
+	psllq	$1,%xmm0
+	pxor	%xmm3,%xmm0
+	psllq	$57,%xmm0
+	movdqa	%xmm0,%xmm3
+	pslldq	$8,%xmm0
+	psrldq	$8,%xmm3
+	pxor	%xmm4,%xmm0
+	pxor	%xmm3,%xmm1
+
+
+	movdqa	%xmm0,%xmm4
+	psrlq	$1,%xmm0
+	pxor	%xmm4,%xmm1
+	pxor	%xmm0,%xmm4
+	psrlq	$5,%xmm0
+	pxor	%xmm4,%xmm0
+	psrlq	$1,%xmm0
+	pxor	%xmm1,%xmm0
+.Ldone:
+.byte	102,65,15,56,0,194
+	movdqu	%xmm0,(%rdi)
+	.byte	0xf3,0xc3
+.cfi_endproc	
+.size	gcm_ghash_clmul,.-gcm_ghash_clmul
+.globl	gcm_init_avx
+.hidden gcm_init_avx
+.type	gcm_init_avx,@function
+.align	32
+gcm_init_avx:
+.cfi_startproc	
+	vzeroupper
+
+	vmovdqu	(%rsi),%xmm2
+	vpshufd	$78,%xmm2,%xmm2
+
+
+	vpshufd	$255,%xmm2,%xmm4
+	vpsrlq	$63,%xmm2,%xmm3
+	vpsllq	$1,%xmm2,%xmm2
+	vpxor	%xmm5,%xmm5,%xmm5
+	vpcmpgtd	%xmm4,%xmm5,%xmm5
+	vpslldq	$8,%xmm3,%xmm3
+	vpor	%xmm3,%xmm2,%xmm2
+
+
+	vpand	.L0x1c2_polynomial(%rip),%xmm5,%xmm5
+	vpxor	%xmm5,%xmm2,%xmm2
+
+	vpunpckhqdq	%xmm2,%xmm2,%xmm6
+	vmovdqa	%xmm2,%xmm0
+	vpxor	%xmm2,%xmm6,%xmm6
+	movq	$4,%r10
+	jmp	.Linit_start_avx
+.align	32
+.Linit_loop_avx:
+	vpalignr	$8,%xmm3,%xmm4,%xmm5
+	vmovdqu	%xmm5,-16(%rdi)
+	vpunpckhqdq	%xmm0,%xmm0,%xmm3
+	vpxor	%xmm0,%xmm3,%xmm3
+	vpclmulqdq	$0x11,%xmm2,%xmm0,%xmm1
+	vpclmulqdq	$0x00,%xmm2,%xmm0,%xmm0
+	vpclmulqdq	$0x00,%xmm6,%xmm3,%xmm3
+	vpxor	%xmm0,%xmm1,%xmm4
+	vpxor	%xmm4,%xmm3,%xmm3
+
+	vpslldq	$8,%xmm3,%xmm4
+	vpsrldq	$8,%xmm3,%xmm3
+	vpxor	%xmm4,%xmm0,%xmm0
+	vpxor	%xmm3,%xmm1,%xmm1
+	vpsllq	$57,%xmm0,%xmm3
+	vpsllq	$62,%xmm0,%xmm4
+	vpxor	%xmm3,%xmm4,%xmm4
+	vpsllq	$63,%xmm0,%xmm3
+	vpxor	%xmm3,%xmm4,%xmm4
+	vpslldq	$8,%xmm4,%xmm3
+	vpsrldq	$8,%xmm4,%xmm4
+	vpxor	%xmm3,%xmm0,%xmm0
+	vpxor	%xmm4,%xmm1,%xmm1
+
+	vpsrlq	$1,%xmm0,%xmm4
+	vpxor	%xmm0,%xmm1,%xmm1
+	vpxor	%xmm4,%xmm0,%xmm0
+	vpsrlq	$5,%xmm4,%xmm4
+	vpxor	%xmm4,%xmm0,%xmm0
+	vpsrlq	$1,%xmm0,%xmm0
+	vpxor	%xmm1,%xmm0,%xmm0
+.Linit_start_avx:
+	vmovdqa	%xmm0,%xmm5
+	vpunpckhqdq	%xmm0,%xmm0,%xmm3
+	vpxor	%xmm0,%xmm3,%xmm3
+	vpclmulqdq	$0x11,%xmm2,%xmm0,%xmm1
+	vpclmulqdq	$0x00,%xmm2,%xmm0,%xmm0
+	vpclmulqdq	$0x00,%xmm6,%xmm3,%xmm3
+	vpxor	%xmm0,%xmm1,%xmm4
+	vpxor	%xmm4,%xmm3,%xmm3
+
+	vpslldq	$8,%xmm3,%xmm4
+	vpsrldq	$8,%xmm3,%xmm3
+	vpxor	%xmm4,%xmm0,%xmm0
+	vpxor	%xmm3,%xmm1,%xmm1
+	vpsllq	$57,%xmm0,%xmm3
+	vpsllq	$62,%xmm0,%xmm4
+	vpxor	%xmm3,%xmm4,%xmm4
+	vpsllq	$63,%xmm0,%xmm3
+	vpxor	%xmm3,%xmm4,%xmm4
+	vpslldq	$8,%xmm4,%xmm3
+	vpsrldq	$8,%xmm4,%xmm4
+	vpxor	%xmm3,%xmm0,%xmm0
+	vpxor	%xmm4,%xmm1,%xmm1
+
+	vpsrlq	$1,%xmm0,%xmm4
+	vpxor	%xmm0,%xmm1,%xmm1
+	vpxor	%xmm4,%xmm0,%xmm0
+	vpsrlq	$5,%xmm4,%xmm4
+	vpxor	%xmm4,%xmm0,%xmm0
+	vpsrlq	$1,%xmm0,%xmm0
+	vpxor	%xmm1,%xmm0,%xmm0
+	vpshufd	$78,%xmm5,%xmm3
+	vpshufd	$78,%xmm0,%xmm4
+	vpxor	%xmm5,%xmm3,%xmm3
+	vmovdqu	%xmm5,0(%rdi)
+	vpxor	%xmm0,%xmm4,%xmm4
+	vmovdqu	%xmm0,16(%rdi)
+	leaq	48(%rdi),%rdi
+	subq	$1,%r10
+	jnz	.Linit_loop_avx
+
+	vpalignr	$8,%xmm4,%xmm3,%xmm5
+	vmovdqu	%xmm5,-16(%rdi)
+
+	vzeroupper
+	.byte	0xf3,0xc3
+.cfi_endproc	
+.size	gcm_init_avx,.-gcm_init_avx
+.globl	gcm_gmult_avx
+.hidden gcm_gmult_avx
+.type	gcm_gmult_avx,@function
+.align	32
+gcm_gmult_avx:
+.cfi_startproc	
+	jmp	.L_gmult_clmul
+.cfi_endproc	
+.size	gcm_gmult_avx,.-gcm_gmult_avx
+.globl	gcm_ghash_avx
+.hidden gcm_ghash_avx
+.type	gcm_ghash_avx,@function
+.align	32
+gcm_ghash_avx:
+.cfi_startproc	
+	vzeroupper
+
+	vmovdqu	(%rdi),%xmm10
+	leaq	.L0x1c2_polynomial(%rip),%r10
+	leaq	64(%rsi),%rsi
+	vmovdqu	.Lbswap_mask(%rip),%xmm13
+	vpshufb	%xmm13,%xmm10,%xmm10
+	cmpq	$0x80,%rcx
+	jb	.Lshort_avx
+	subq	$0x80,%rcx
+
+	vmovdqu	112(%rdx),%xmm14
+	vmovdqu	0-64(%rsi),%xmm6
+	vpshufb	%xmm13,%xmm14,%xmm14
+	vmovdqu	32-64(%rsi),%xmm7
+
+	vpunpckhqdq	%xmm14,%xmm14,%xmm9
+	vmovdqu	96(%rdx),%xmm15
+	vpclmulqdq	$0x00,%xmm6,%xmm14,%xmm0
+	vpxor	%xmm14,%xmm9,%xmm9
+	vpshufb	%xmm13,%xmm15,%xmm15
+	vpclmulqdq	$0x11,%xmm6,%xmm14,%xmm1
+	vmovdqu	16-64(%rsi),%xmm6
+	vpunpckhqdq	%xmm15,%xmm15,%xmm8
+	vmovdqu	80(%rdx),%xmm14
+	vpclmulqdq	$0x00,%xmm7,%xmm9,%xmm2
+	vpxor	%xmm15,%xmm8,%xmm8
+
+	vpshufb	%xmm13,%xmm14,%xmm14
+	vpclmulqdq	$0x00,%xmm6,%xmm15,%xmm3
+	vpunpckhqdq	%xmm14,%xmm14,%xmm9
+	vpclmulqdq	$0x11,%xmm6,%xmm15,%xmm4
+	vmovdqu	48-64(%rsi),%xmm6
+	vpxor	%xmm14,%xmm9,%xmm9
+	vmovdqu	64(%rdx),%xmm15
+	vpclmulqdq	$0x10,%xmm7,%xmm8,%xmm5
+	vmovdqu	80-64(%rsi),%xmm7
+
+	vpshufb	%xmm13,%xmm15,%xmm15
+	vpxor	%xmm0,%xmm3,%xmm3
+	vpclmulqdq	$0x00,%xmm6,%xmm14,%xmm0
+	vpxor	%xmm1,%xmm4,%xmm4
+	vpunpckhqdq	%xmm15,%xmm15,%xmm8
+	vpclmulqdq	$0x11,%xmm6,%xmm14,%xmm1
+	vmovdqu	64-64(%rsi),%xmm6
+	vpxor	%xmm2,%xmm5,%xmm5
+	vpclmulqdq	$0x00,%xmm7,%xmm9,%xmm2
+	vpxor	%xmm15,%xmm8,%xmm8
+
+	vmovdqu	48(%rdx),%xmm14
+	vpxor	%xmm3,%xmm0,%xmm0
+	vpclmulqdq	$0x00,%xmm6,%xmm15,%xmm3
+	vpxor	%xmm4,%xmm1,%xmm1
+	vpshufb	%xmm13,%xmm14,%xmm14
+	vpclmulqdq	$0x11,%xmm6,%xmm15,%xmm4
+	vmovdqu	96-64(%rsi),%xmm6
+	vpxor	%xmm5,%xmm2,%xmm2
+	vpunpckhqdq	%xmm14,%xmm14,%xmm9
+	vpclmulqdq	$0x10,%xmm7,%xmm8,%xmm5
+	vmovdqu	128-64(%rsi),%xmm7
+	vpxor	%xmm14,%xmm9,%xmm9
+
+	vmovdqu	32(%rdx),%xmm15
+	vpxor	%xmm0,%xmm3,%xmm3
+	vpclmulqdq	$0x00,%xmm6,%xmm14,%xmm0
+	vpxor	%xmm1,%xmm4,%xmm4
+	vpshufb	%xmm13,%xmm15,%xmm15
+	vpclmulqdq	$0x11,%xmm6,%xmm14,%xmm1
+	vmovdqu	112-64(%rsi),%xmm6
+	vpxor	%xmm2,%xmm5,%xmm5
+	vpunpckhqdq	%xmm15,%xmm15,%xmm8
+	vpclmulqdq	$0x00,%xmm7,%xmm9,%xmm2
+	vpxor	%xmm15,%xmm8,%xmm8
+
+	vmovdqu	16(%rdx),%xmm14
+	vpxor	%xmm3,%xmm0,%xmm0
+	vpclmulqdq	$0x00,%xmm6,%xmm15,%xmm3
+	vpxor	%xmm4,%xmm1,%xmm1
+	vpshufb	%xmm13,%xmm14,%xmm14
+	vpclmulqdq	$0x11,%xmm6,%xmm15,%xmm4
+	vmovdqu	144-64(%rsi),%xmm6
+	vpxor	%xmm5,%xmm2,%xmm2
+	vpunpckhqdq	%xmm14,%xmm14,%xmm9
+	vpclmulqdq	$0x10,%xmm7,%xmm8,%xmm5
+	vmovdqu	176-64(%rsi),%xmm7
+	vpxor	%xmm14,%xmm9,%xmm9
+
+	vmovdqu	(%rdx),%xmm15
+	vpxor	%xmm0,%xmm3,%xmm3
+	vpclmulqdq	$0x00,%xmm6,%xmm14,%xmm0
+	vpxor	%xmm1,%xmm4,%xmm4
+	vpshufb	%xmm13,%xmm15,%xmm15
+	vpclmulqdq	$0x11,%xmm6,%xmm14,%xmm1
+	vmovdqu	160-64(%rsi),%xmm6
+	vpxor	%xmm2,%xmm5,%xmm5
+	vpclmulqdq	$0x10,%xmm7,%xmm9,%xmm2
+
+	leaq	128(%rdx),%rdx
+	cmpq	$0x80,%rcx
+	jb	.Ltail_avx
+
+	vpxor	%xmm10,%xmm15,%xmm15
+	subq	$0x80,%rcx
+	jmp	.Loop8x_avx
+
+.align	32
+.Loop8x_avx:
+	vpunpckhqdq	%xmm15,%xmm15,%xmm8
+	vmovdqu	112(%rdx),%xmm14
+	vpxor	%xmm0,%xmm3,%xmm3
+	vpxor	%xmm15,%xmm8,%xmm8
+	vpclmulqdq	$0x00,%xmm6,%xmm15,%xmm10
+	vpshufb	%xmm13,%xmm14,%xmm14
+	vpxor	%xmm1,%xmm4,%xmm4
+	vpclmulqdq	$0x11,%xmm6,%xmm15,%xmm11
+	vmovdqu	0-64(%rsi),%xmm6
+	vpunpckhqdq	%xmm14,%xmm14,%xmm9
+	vpxor	%xmm2,%xmm5,%xmm5
+	vpclmulqdq	$0x00,%xmm7,%xmm8,%xmm12
+	vmovdqu	32-64(%rsi),%xmm7
+	vpxor	%xmm14,%xmm9,%xmm9
+
+	vmovdqu	96(%rdx),%xmm15
+	vpclmulqdq	$0x00,%xmm6,%xmm14,%xmm0
+	vpxor	%xmm3,%xmm10,%xmm10
+	vpshufb	%xmm13,%xmm15,%xmm15
+	vpclmulqdq	$0x11,%xmm6,%xmm14,%xmm1
+	vxorps	%xmm4,%xmm11,%xmm11
+	vmovdqu	16-64(%rsi),%xmm6
+	vpunpckhqdq	%xmm15,%xmm15,%xmm8
+	vpclmulqdq	$0x00,%xmm7,%xmm9,%xmm2
+	vpxor	%xmm5,%xmm12,%xmm12
+	vxorps	%xmm15,%xmm8,%xmm8
+
+	vmovdqu	80(%rdx),%xmm14
+	vpxor	%xmm10,%xmm12,%xmm12
+	vpclmulqdq	$0x00,%xmm6,%xmm15,%xmm3
+	vpxor	%xmm11,%xmm12,%xmm12
+	vpslldq	$8,%xmm12,%xmm9
+	vpxor	%xmm0,%xmm3,%xmm3
+	vpclmulqdq	$0x11,%xmm6,%xmm15,%xmm4
+	vpsrldq	$8,%xmm12,%xmm12
+	vpxor	%xmm9,%xmm10,%xmm10
+	vmovdqu	48-64(%rsi),%xmm6
+	vpshufb	%xmm13,%xmm14,%xmm14
+	vxorps	%xmm12,%xmm11,%xmm11
+	vpxor	%xmm1,%xmm4,%xmm4
+	vpunpckhqdq	%xmm14,%xmm14,%xmm9
+	vpclmulqdq	$0x10,%xmm7,%xmm8,%xmm5
+	vmovdqu	80-64(%rsi),%xmm7
+	vpxor	%xmm14,%xmm9,%xmm9
+	vpxor	%xmm2,%xmm5,%xmm5
+
+	vmovdqu	64(%rdx),%xmm15
+	vpalignr	$8,%xmm10,%xmm10,%xmm12
+	vpclmulqdq	$0x00,%xmm6,%xmm14,%xmm0
+	vpshufb	%xmm13,%xmm15,%xmm15
+	vpxor	%xmm3,%xmm0,%xmm0
+	vpclmulqdq	$0x11,%xmm6,%xmm14,%xmm1
+	vmovdqu	64-64(%rsi),%xmm6
+	vpunpckhqdq	%xmm15,%xmm15,%xmm8
+	vpxor	%xmm4,%xmm1,%xmm1
+	vpclmulqdq	$0x00,%xmm7,%xmm9,%xmm2
+	vxorps	%xmm15,%xmm8,%xmm8
+	vpxor	%xmm5,%xmm2,%xmm2
+
+	vmovdqu	48(%rdx),%xmm14
+	vpclmulqdq	$0x10,(%r10),%xmm10,%xmm10
+	vpclmulqdq	$0x00,%xmm6,%xmm15,%xmm3
+	vpshufb	%xmm13,%xmm14,%xmm14
+	vpxor	%xmm0,%xmm3,%xmm3
+	vpclmulqdq	$0x11,%xmm6,%xmm15,%xmm4
+	vmovdqu	96-64(%rsi),%xmm6
+	vpunpckhqdq	%xmm14,%xmm14,%xmm9
+	vpxor	%xmm1,%xmm4,%xmm4
+	vpclmulqdq	$0x10,%xmm7,%xmm8,%xmm5
+	vmovdqu	128-64(%rsi),%xmm7
+	vpxor	%xmm14,%xmm9,%xmm9
+	vpxor	%xmm2,%xmm5,%xmm5
+
+	vmovdqu	32(%rdx),%xmm15
+	vpclmulqdq	$0x00,%xmm6,%xmm14,%xmm0
+	vpshufb	%xmm13,%xmm15,%xmm15
+	vpxor	%xmm3,%xmm0,%xmm0
+	vpclmulqdq	$0x11,%xmm6,%xmm14,%xmm1
+	vmovdqu	112-64(%rsi),%xmm6
+	vpunpckhqdq	%xmm15,%xmm15,%xmm8
+	vpxor	%xmm4,%xmm1,%xmm1
+	vpclmulqdq	$0x00,%xmm7,%xmm9,%xmm2
+	vpxor	%xmm15,%xmm8,%xmm8
+	vpxor	%xmm5,%xmm2,%xmm2
+	vxorps	%xmm12,%xmm10,%xmm10
+
+	vmovdqu	16(%rdx),%xmm14
+	vpalignr	$8,%xmm10,%xmm10,%xmm12
+	vpclmulqdq	$0x00,%xmm6,%xmm15,%xmm3
+	vpshufb	%xmm13,%xmm14,%xmm14
+	vpxor	%xmm0,%xmm3,%xmm3
+	vpclmulqdq	$0x11,%xmm6,%xmm15,%xmm4
+	vmovdqu	144-64(%rsi),%xmm6
+	vpclmulqdq	$0x10,(%r10),%xmm10,%xmm10
+	vxorps	%xmm11,%xmm12,%xmm12
+	vpunpckhqdq	%xmm14,%xmm14,%xmm9
+	vpxor	%xmm1,%xmm4,%xmm4
+	vpclmulqdq	$0x10,%xmm7,%xmm8,%xmm5
+	vmovdqu	176-64(%rsi),%xmm7
+	vpxor	%xmm14,%xmm9,%xmm9
+	vpxor	%xmm2,%xmm5,%xmm5
+
+	vmovdqu	(%rdx),%xmm15
+	vpclmulqdq	$0x00,%xmm6,%xmm14,%xmm0
+	vpshufb	%xmm13,%xmm15,%xmm15
+	vpclmulqdq	$0x11,%xmm6,%xmm14,%xmm1
+	vmovdqu	160-64(%rsi),%xmm6
+	vpxor	%xmm12,%xmm15,%xmm15
+	vpclmulqdq	$0x10,%xmm7,%xmm9,%xmm2
+	vpxor	%xmm10,%xmm15,%xmm15
+
+	leaq	128(%rdx),%rdx
+	subq	$0x80,%rcx
+	jnc	.Loop8x_avx
+
+	addq	$0x80,%rcx
+	jmp	.Ltail_no_xor_avx
+
+.align	32
+.Lshort_avx:
+	vmovdqu	-16(%rdx,%rcx,1),%xmm14
+	leaq	(%rdx,%rcx,1),%rdx
+	vmovdqu	0-64(%rsi),%xmm6
+	vmovdqu	32-64(%rsi),%xmm7
+	vpshufb	%xmm13,%xmm14,%xmm15
+
+	vmovdqa	%xmm0,%xmm3
+	vmovdqa	%xmm1,%xmm4
+	vmovdqa	%xmm2,%xmm5
+	subq	$0x10,%rcx
+	jz	.Ltail_avx
+
+	vpunpckhqdq	%xmm15,%xmm15,%xmm8
+	vpxor	%xmm0,%xmm3,%xmm3
+	vpclmulqdq	$0x00,%xmm6,%xmm15,%xmm0
+	vpxor	%xmm15,%xmm8,%xmm8
+	vmovdqu	-32(%rdx),%xmm14
+	vpxor	%xmm1,%xmm4,%xmm4
+	vpclmulqdq	$0x11,%xmm6,%xmm15,%xmm1
+	vmovdqu	16-64(%rsi),%xmm6
+	vpshufb	%xmm13,%xmm14,%xmm15
+	vpxor	%xmm2,%xmm5,%xmm5
+	vpclmulqdq	$0x00,%xmm7,%xmm8,%xmm2
+	vpsrldq	$8,%xmm7,%xmm7
+	subq	$0x10,%rcx
+	jz	.Ltail_avx
+
+	vpunpckhqdq	%xmm15,%xmm15,%xmm8
+	vpxor	%xmm0,%xmm3,%xmm3
+	vpclmulqdq	$0x00,%xmm6,%xmm15,%xmm0
+	vpxor	%xmm15,%xmm8,%xmm8
+	vmovdqu	-48(%rdx),%xmm14
+	vpxor	%xmm1,%xmm4,%xmm4
+	vpclmulqdq	$0x11,%xmm6,%xmm15,%xmm1
+	vmovdqu	48-64(%rsi),%xmm6
+	vpshufb	%xmm13,%xmm14,%xmm15
+	vpxor	%xmm2,%xmm5,%xmm5
+	vpclmulqdq	$0x00,%xmm7,%xmm8,%xmm2
+	vmovdqu	80-64(%rsi),%xmm7
+	subq	$0x10,%rcx
+	jz	.Ltail_avx
+
+	vpunpckhqdq	%xmm15,%xmm15,%xmm8
+	vpxor	%xmm0,%xmm3,%xmm3
+	vpclmulqdq	$0x00,%xmm6,%xmm15,%xmm0
+	vpxor	%xmm15,%xmm8,%xmm8
+	vmovdqu	-64(%rdx),%xmm14
+	vpxor	%xmm1,%xmm4,%xmm4
+	vpclmulqdq	$0x11,%xmm6,%xmm15,%xmm1
+	vmovdqu	64-64(%rsi),%xmm6
+	vpshufb	%xmm13,%xmm14,%xmm15
+	vpxor	%xmm2,%xmm5,%xmm5
+	vpclmulqdq	$0x00,%xmm7,%xmm8,%xmm2
+	vpsrldq	$8,%xmm7,%xmm7
+	subq	$0x10,%rcx
+	jz	.Ltail_avx
+
+	vpunpckhqdq	%xmm15,%xmm15,%xmm8
+	vpxor	%xmm0,%xmm3,%xmm3
+	vpclmulqdq	$0x00,%xmm6,%xmm15,%xmm0
+	vpxor	%xmm15,%xmm8,%xmm8
+	vmovdqu	-80(%rdx),%xmm14
+	vpxor	%xmm1,%xmm4,%xmm4
+	vpclmulqdq	$0x11,%xmm6,%xmm15,%xmm1
+	vmovdqu	96-64(%rsi),%xmm6
+	vpshufb	%xmm13,%xmm14,%xmm15
+	vpxor	%xmm2,%xmm5,%xmm5
+	vpclmulqdq	$0x00,%xmm7,%xmm8,%xmm2
+	vmovdqu	128-64(%rsi),%xmm7
+	subq	$0x10,%rcx
+	jz	.Ltail_avx
+
+	vpunpckhqdq	%xmm15,%xmm15,%xmm8
+	vpxor	%xmm0,%xmm3,%xmm3
+	vpclmulqdq	$0x00,%xmm6,%xmm15,%xmm0
+	vpxor	%xmm15,%xmm8,%xmm8
+	vmovdqu	-96(%rdx),%xmm14
+	vpxor	%xmm1,%xmm4,%xmm4
+	vpclmulqdq	$0x11,%xmm6,%xmm15,%xmm1
+	vmovdqu	112-64(%rsi),%xmm6
+	vpshufb	%xmm13,%xmm14,%xmm15
+	vpxor	%xmm2,%xmm5,%xmm5
+	vpclmulqdq	$0x00,%xmm7,%xmm8,%xmm2
+	vpsrldq	$8,%xmm7,%xmm7
+	subq	$0x10,%rcx
+	jz	.Ltail_avx
+
+	vpunpckhqdq	%xmm15,%xmm15,%xmm8
+	vpxor	%xmm0,%xmm3,%xmm3
+	vpclmulqdq	$0x00,%xmm6,%xmm15,%xmm0
+	vpxor	%xmm15,%xmm8,%xmm8
+	vmovdqu	-112(%rdx),%xmm14
+	vpxor	%xmm1,%xmm4,%xmm4
+	vpclmulqdq	$0x11,%xmm6,%xmm15,%xmm1
+	vmovdqu	144-64(%rsi),%xmm6
+	vpshufb	%xmm13,%xmm14,%xmm15
+	vpxor	%xmm2,%xmm5,%xmm5
+	vpclmulqdq	$0x00,%xmm7,%xmm8,%xmm2
+	vmovq	184-64(%rsi),%xmm7
+	subq	$0x10,%rcx
+	jmp	.Ltail_avx
+
+.align	32
+.Ltail_avx:
+	vpxor	%xmm10,%xmm15,%xmm15
+.Ltail_no_xor_avx:
+	vpunpckhqdq	%xmm15,%xmm15,%xmm8
+	vpxor	%xmm0,%xmm3,%xmm3
+	vpclmulqdq	$0x00,%xmm6,%xmm15,%xmm0
+	vpxor	%xmm15,%xmm8,%xmm8
+	vpxor	%xmm1,%xmm4,%xmm4
+	vpclmulqdq	$0x11,%xmm6,%xmm15,%xmm1
+	vpxor	%xmm2,%xmm5,%xmm5
+	vpclmulqdq	$0x00,%xmm7,%xmm8,%xmm2
+
+	vmovdqu	(%r10),%xmm12
+
+	vpxor	%xmm0,%xmm3,%xmm10
+	vpxor	%xmm1,%xmm4,%xmm11
+	vpxor	%xmm2,%xmm5,%xmm5
+
+	vpxor	%xmm10,%xmm5,%xmm5
+	vpxor	%xmm11,%xmm5,%xmm5
+	vpslldq	$8,%xmm5,%xmm9
+	vpsrldq	$8,%xmm5,%xmm5
+	vpxor	%xmm9,%xmm10,%xmm10
+	vpxor	%xmm5,%xmm11,%xmm11
+
+	vpclmulqdq	$0x10,%xmm12,%xmm10,%xmm9
+	vpalignr	$8,%xmm10,%xmm10,%xmm10
+	vpxor	%xmm9,%xmm10,%xmm10
+
+	vpclmulqdq	$0x10,%xmm12,%xmm10,%xmm9
+	vpalignr	$8,%xmm10,%xmm10,%xmm10
+	vpxor	%xmm11,%xmm10,%xmm10
+	vpxor	%xmm9,%xmm10,%xmm10
+
+	cmpq	$0,%rcx
+	jne	.Lshort_avx
+
+	vpshufb	%xmm13,%xmm10,%xmm10
+	vmovdqu	%xmm10,(%rdi)
+	vzeroupper
+	.byte	0xf3,0xc3
+.cfi_endproc	
+.size	gcm_ghash_avx,.-gcm_ghash_avx
+.align	64
+.Lbswap_mask:
+.byte	15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0
+.L0x1c2_polynomial:
+.byte	1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0xc2
+.L7_mask:
+.long	7,0,7,0
+.align	64
+
+.byte	71,72,65,83,72,32,102,111,114,32,120,56,54,95,54,52,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0
+.align	64
+#endif
+.section	.note.GNU-stack,"",@progbits
--- /dev/null
+++ b/third_party/boringssl/linux-x86_64/crypto/fipsmodule/md5-x86_64.S
@@ -1,0 +1,702 @@
+// This file is generated from a similarly-named Perl script in the BoringSSL
+// source tree. Do not edit by hand.
+
+#if defined(__has_feature)
+#if __has_feature(memory_sanitizer) && !defined(OPENSSL_NO_ASM)
+#define OPENSSL_NO_ASM
+#endif
+#endif
+
+#if defined(__x86_64__) && !defined(OPENSSL_NO_ASM)
+#if defined(BORINGSSL_PREFIX)
+#include <boringssl_prefix_symbols_asm.h>
+#endif
+.text	
+.align	16
+
+.globl	md5_block_asm_data_order
+.hidden md5_block_asm_data_order
+.type	md5_block_asm_data_order,@function
+md5_block_asm_data_order:
+.cfi_startproc	
+	pushq	%rbp
+.cfi_adjust_cfa_offset	8
+.cfi_offset	rbp,-16
+	pushq	%rbx
+.cfi_adjust_cfa_offset	8
+.cfi_offset	rbx,-24
+	pushq	%r12
+.cfi_adjust_cfa_offset	8
+.cfi_offset	r12,-32
+	pushq	%r14
+.cfi_adjust_cfa_offset	8
+.cfi_offset	r14,-40
+	pushq	%r15
+.cfi_adjust_cfa_offset	8
+.cfi_offset	r15,-48
+.Lprologue:
+
+
+
+
+	movq	%rdi,%rbp
+	shlq	$6,%rdx
+	leaq	(%rsi,%rdx,1),%rdi
+	movl	0(%rbp),%eax
+	movl	4(%rbp),%ebx
+	movl	8(%rbp),%ecx
+	movl	12(%rbp),%edx
+
+
+
+
+
+
+
+	cmpq	%rdi,%rsi
+	je	.Lend
+
+
+.Lloop:
+	movl	%eax,%r8d
+	movl	%ebx,%r9d
+	movl	%ecx,%r14d
+	movl	%edx,%r15d
+	movl	0(%rsi),%r10d
+	movl	%edx,%r11d
+	xorl	%ecx,%r11d
+	leal	-680876936(%rax,%r10,1),%eax
+	andl	%ebx,%r11d
+	xorl	%edx,%r11d
+	movl	4(%rsi),%r10d
+	addl	%r11d,%eax
+	roll	$7,%eax
+	movl	%ecx,%r11d
+	addl	%ebx,%eax
+	xorl	%ebx,%r11d
+	leal	-389564586(%rdx,%r10,1),%edx
+	andl	%eax,%r11d
+	xorl	%ecx,%r11d
+	movl	8(%rsi),%r10d
+	addl	%r11d,%edx
+	roll	$12,%edx
+	movl	%ebx,%r11d
+	addl	%eax,%edx
+	xorl	%eax,%r11d
+	leal	606105819(%rcx,%r10,1),%ecx
+	andl	%edx,%r11d
+	xorl	%ebx,%r11d
+	movl	12(%rsi),%r10d
+	addl	%r11d,%ecx
+	roll	$17,%ecx
+	movl	%eax,%r11d
+	addl	%edx,%ecx
+	xorl	%edx,%r11d
+	leal	-1044525330(%rbx,%r10,1),%ebx
+	andl	%ecx,%r11d
+	xorl	%eax,%r11d
+	movl	16(%rsi),%r10d
+	addl	%r11d,%ebx
+	roll	$22,%ebx
+	movl	%edx,%r11d
+	addl	%ecx,%ebx
+	xorl	%ecx,%r11d
+	leal	-176418897(%rax,%r10,1),%eax
+	andl	%ebx,%r11d
+	xorl	%edx,%r11d
+	movl	20(%rsi),%r10d
+	addl	%r11d,%eax
+	roll	$7,%eax
+	movl	%ecx,%r11d
+	addl	%ebx,%eax
+	xorl	%ebx,%r11d
+	leal	1200080426(%rdx,%r10,1),%edx
+	andl	%eax,%r11d
+	xorl	%ecx,%r11d
+	movl	24(%rsi),%r10d
+	addl	%r11d,%edx
+	roll	$12,%edx
+	movl	%ebx,%r11d
+	addl	%eax,%edx
+	xorl	%eax,%r11d
+	leal	-1473231341(%rcx,%r10,1),%ecx
+	andl	%edx,%r11d
+	xorl	%ebx,%r11d
+	movl	28(%rsi),%r10d
+	addl	%r11d,%ecx
+	roll	$17,%ecx
+	movl	%eax,%r11d
+	addl	%edx,%ecx
+	xorl	%edx,%r11d
+	leal	-45705983(%rbx,%r10,1),%ebx
+	andl	%ecx,%r11d
+	xorl	%eax,%r11d
+	movl	32(%rsi),%r10d
+	addl	%r11d,%ebx
+	roll	$22,%ebx
+	movl	%edx,%r11d
+	addl	%ecx,%ebx
+	xorl	%ecx,%r11d
+	leal	1770035416(%rax,%r10,1),%eax
+	andl	%ebx,%r11d
+	xorl	%edx,%r11d
+	movl	36(%rsi),%r10d
+	addl	%r11d,%eax
+	roll	$7,%eax
+	movl	%ecx,%r11d
+	addl	%ebx,%eax
+	xorl	%ebx,%r11d
+	leal	-1958414417(%rdx,%r10,1),%edx
+	andl	%eax,%r11d
+	xorl	%ecx,%r11d
+	movl	40(%rsi),%r10d
+	addl	%r11d,%edx
+	roll	$12,%edx
+	movl	%ebx,%r11d
+	addl	%eax,%edx
+	xorl	%eax,%r11d
+	leal	-42063(%rcx,%r10,1),%ecx
+	andl	%edx,%r11d
+	xorl	%ebx,%r11d
+	movl	44(%rsi),%r10d
+	addl	%r11d,%ecx
+	roll	$17,%ecx
+	movl	%eax,%r11d
+	addl	%edx,%ecx
+	xorl	%edx,%r11d
+	leal	-1990404162(%rbx,%r10,1),%ebx
+	andl	%ecx,%r11d
+	xorl	%eax,%r11d
+	movl	48(%rsi),%r10d
+	addl	%r11d,%ebx
+	roll	$22,%ebx
+	movl	%edx,%r11d
+	addl	%ecx,%ebx
+	xorl	%ecx,%r11d
+	leal	1804603682(%rax,%r10,1),%eax
+	andl	%ebx,%r11d
+	xorl	%edx,%r11d
+	movl	52(%rsi),%r10d
+	addl	%r11d,%eax
+	roll	$7,%eax
+	movl	%ecx,%r11d
+	addl	%ebx,%eax
+	xorl	%ebx,%r11d
+	leal	-40341101(%rdx,%r10,1),%edx
+	andl	%eax,%r11d
+	xorl	%ecx,%r11d
+	movl	56(%rsi),%r10d
+	addl	%r11d,%edx
+	roll	$12,%edx
+	movl	%ebx,%r11d
+	addl	%eax,%edx
+	xorl	%eax,%r11d
+	leal	-1502002290(%rcx,%r10,1),%ecx
+	andl	%edx,%r11d
+	xorl	%ebx,%r11d
+	movl	60(%rsi),%r10d
+	addl	%r11d,%ecx
+	roll	$17,%ecx
+	movl	%eax,%r11d
+	addl	%edx,%ecx
+	xorl	%edx,%r11d
+	leal	1236535329(%rbx,%r10,1),%ebx
+	andl	%ecx,%r11d
+	xorl	%eax,%r11d
+	movl	0(%rsi),%r10d
+	addl	%r11d,%ebx
+	roll	$22,%ebx
+	movl	%edx,%r11d
+	addl	%ecx,%ebx
+	movl	4(%rsi),%r10d
+	movl	%edx,%r11d
+	movl	%edx,%r12d
+	notl	%r11d
+	leal	-165796510(%rax,%r10,1),%eax
+	andl	%ebx,%r12d
+	andl	%ecx,%r11d
+	movl	24(%rsi),%r10d
+	orl	%r11d,%r12d
+	movl	%ecx,%r11d
+	addl	%r12d,%eax
+	movl	%ecx,%r12d
+	roll	$5,%eax
+	addl	%ebx,%eax
+	notl	%r11d
+	leal	-1069501632(%rdx,%r10,1),%edx
+	andl	%eax,%r12d
+	andl	%ebx,%r11d
+	movl	44(%rsi),%r10d
+	orl	%r11d,%r12d
+	movl	%ebx,%r11d
+	addl	%r12d,%edx
+	movl	%ebx,%r12d
+	roll	$9,%edx
+	addl	%eax,%edx
+	notl	%r11d
+	leal	643717713(%rcx,%r10,1),%ecx
+	andl	%edx,%r12d
+	andl	%eax,%r11d
+	movl	0(%rsi),%r10d
+	orl	%r11d,%r12d
+	movl	%eax,%r11d
+	addl	%r12d,%ecx
+	movl	%eax,%r12d
+	roll	$14,%ecx
+	addl	%edx,%ecx
+	notl	%r11d
+	leal	-373897302(%rbx,%r10,1),%ebx
+	andl	%ecx,%r12d
+	andl	%edx,%r11d
+	movl	20(%rsi),%r10d
+	orl	%r11d,%r12d
+	movl	%edx,%r11d
+	addl	%r12d,%ebx
+	movl	%edx,%r12d
+	roll	$20,%ebx
+	addl	%ecx,%ebx
+	notl	%r11d
+	leal	-701558691(%rax,%r10,1),%eax
+	andl	%ebx,%r12d
+	andl	%ecx,%r11d
+	movl	40(%rsi),%r10d
+	orl	%r11d,%r12d
+	movl	%ecx,%r11d
+	addl	%r12d,%eax
+	movl	%ecx,%r12d
+	roll	$5,%eax
+	addl	%ebx,%eax
+	notl	%r11d
+	leal	38016083(%rdx,%r10,1),%edx
+	andl	%eax,%r12d
+	andl	%ebx,%r11d
+	movl	60(%rsi),%r10d
+	orl	%r11d,%r12d
+	movl	%ebx,%r11d
+	addl	%r12d,%edx
+	movl	%ebx,%r12d
+	roll	$9,%edx
+	addl	%eax,%edx
+	notl	%r11d
+	leal	-660478335(%rcx,%r10,1),%ecx
+	andl	%edx,%r12d
+	andl	%eax,%r11d
+	movl	16(%rsi),%r10d
+	orl	%r11d,%r12d
+	movl	%eax,%r11d
+	addl	%r12d,%ecx
+	movl	%eax,%r12d
+	roll	$14,%ecx
+	addl	%edx,%ecx
+	notl	%r11d
+	leal	-405537848(%rbx,%r10,1),%ebx
+	andl	%ecx,%r12d
+	andl	%edx,%r11d
+	movl	36(%rsi),%r10d
+	orl	%r11d,%r12d
+	movl	%edx,%r11d
+	addl	%r12d,%ebx
+	movl	%edx,%r12d
+	roll	$20,%ebx
+	addl	%ecx,%ebx
+	notl	%r11d
+	leal	568446438(%rax,%r10,1),%eax
+	andl	%ebx,%r12d
+	andl	%ecx,%r11d
+	movl	56(%rsi),%r10d
+	orl	%r11d,%r12d
+	movl	%ecx,%r11d
+	addl	%r12d,%eax
+	movl	%ecx,%r12d
+	roll	$5,%eax
+	addl	%ebx,%eax
+	notl	%r11d
+	leal	-1019803690(%rdx,%r10,1),%edx
+	andl	%eax,%r12d
+	andl	%ebx,%r11d
+	movl	12(%rsi),%r10d
+	orl	%r11d,%r12d
+	movl	%ebx,%r11d
+	addl	%r12d,%edx
+	movl	%ebx,%r12d
+	roll	$9,%edx
+	addl	%eax,%edx
+	notl	%r11d
+	leal	-187363961(%rcx,%r10,1),%ecx
+	andl	%edx,%r12d
+	andl	%eax,%r11d
+	movl	32(%rsi),%r10d
+	orl	%r11d,%r12d
+	movl	%eax,%r11d
+	addl	%r12d,%ecx
+	movl	%eax,%r12d
+	roll	$14,%ecx
+	addl	%edx,%ecx
+	notl	%r11d
+	leal	1163531501(%rbx,%r10,1),%ebx
+	andl	%ecx,%r12d
+	andl	%edx,%r11d
+	movl	52(%rsi),%r10d
+	orl	%r11d,%r12d
+	movl	%edx,%r11d
+	addl	%r12d,%ebx
+	movl	%edx,%r12d
+	roll	$20,%ebx
+	addl	%ecx,%ebx
+	notl	%r11d
+	leal	-1444681467(%rax,%r10,1),%eax
+	andl	%ebx,%r12d
+	andl	%ecx,%r11d
+	movl	8(%rsi),%r10d
+	orl	%r11d,%r12d
+	movl	%ecx,%r11d
+	addl	%r12d,%eax
+	movl	%ecx,%r12d
+	roll	$5,%eax
+	addl	%ebx,%eax
+	notl	%r11d
+	leal	-51403784(%rdx,%r10,1),%edx
+	andl	%eax,%r12d
+	andl	%ebx,%r11d
+	movl	28(%rsi),%r10d
+	orl	%r11d,%r12d
+	movl	%ebx,%r11d
+	addl	%r12d,%edx
+	movl	%ebx,%r12d
+	roll	$9,%edx
+	addl	%eax,%edx
+	notl	%r11d
+	leal	1735328473(%rcx,%r10,1),%ecx
+	andl	%edx,%r12d
+	andl	%eax,%r11d
+	movl	48(%rsi),%r10d
+	orl	%r11d,%r12d
+	movl	%eax,%r11d
+	addl	%r12d,%ecx
+	movl	%eax,%r12d
+	roll	$14,%ecx
+	addl	%edx,%ecx
+	notl	%r11d
+	leal	-1926607734(%rbx,%r10,1),%ebx
+	andl	%ecx,%r12d
+	andl	%edx,%r11d
+	movl	0(%rsi),%r10d
+	orl	%r11d,%r12d
+	movl	%edx,%r11d
+	addl	%r12d,%ebx
+	movl	%edx,%r12d
+	roll	$20,%ebx
+	addl	%ecx,%ebx
+	movl	20(%rsi),%r10d
+	movl	%ecx,%r11d
+	leal	-378558(%rax,%r10,1),%eax
+	movl	32(%rsi),%r10d
+	xorl	%edx,%r11d
+	xorl	%ebx,%r11d
+	addl	%r11d,%eax
+	roll	$4,%eax
+	movl	%ebx,%r11d
+	addl	%ebx,%eax
+	leal	-2022574463(%rdx,%r10,1),%edx
+	movl	44(%rsi),%r10d
+	xorl	%ecx,%r11d
+	xorl	%eax,%r11d
+	addl	%r11d,%edx
+	roll	$11,%edx
+	movl	%eax,%r11d
+	addl	%eax,%edx
+	leal	1839030562(%rcx,%r10,1),%ecx
+	movl	56(%rsi),%r10d
+	xorl	%ebx,%r11d
+	xorl	%edx,%r11d
+	addl	%r11d,%ecx
+	roll	$16,%ecx
+	movl	%edx,%r11d
+	addl	%edx,%ecx
+	leal	-35309556(%rbx,%r10,1),%ebx
+	movl	4(%rsi),%r10d
+	xorl	%eax,%r11d
+	xorl	%ecx,%r11d
+	addl	%r11d,%ebx
+	roll	$23,%ebx
+	movl	%ecx,%r11d
+	addl	%ecx,%ebx
+	leal	-1530992060(%rax,%r10,1),%eax
+	movl	16(%rsi),%r10d
+	xorl	%edx,%r11d
+	xorl	%ebx,%r11d
+	addl	%r11d,%eax
+	roll	$4,%eax
+	movl	%ebx,%r11d
+	addl	%ebx,%eax
+	leal	1272893353(%rdx,%r10,1),%edx
+	movl	28(%rsi),%r10d
+	xorl	%ecx,%r11d
+	xorl	%eax,%r11d
+	addl	%r11d,%edx
+	roll	$11,%edx
+	movl	%eax,%r11d
+	addl	%eax,%edx
+	leal	-155497632(%rcx,%r10,1),%ecx
+	movl	40(%rsi),%r10d
+	xorl	%ebx,%r11d
+	xorl	%edx,%r11d
+	addl	%r11d,%ecx
+	roll	$16,%ecx
+	movl	%edx,%r11d
+	addl	%edx,%ecx
+	leal	-1094730640(%rbx,%r10,1),%ebx
+	movl	52(%rsi),%r10d
+	xorl	%eax,%r11d
+	xorl	%ecx,%r11d
+	addl	%r11d,%ebx
+	roll	$23,%ebx
+	movl	%ecx,%r11d
+	addl	%ecx,%ebx
+	leal	681279174(%rax,%r10,1),%eax
+	movl	0(%rsi),%r10d
+	xorl	%edx,%r11d
+	xorl	%ebx,%r11d
+	addl	%r11d,%eax
+	roll	$4,%eax
+	movl	%ebx,%r11d
+	addl	%ebx,%eax
+	leal	-358537222(%rdx,%r10,1),%edx
+	movl	12(%rsi),%r10d
+	xorl	%ecx,%r11d
+	xorl	%eax,%r11d
+	addl	%r11d,%edx
+	roll	$11,%edx
+	movl	%eax,%r11d
+	addl	%eax,%edx
+	leal	-722521979(%rcx,%r10,1),%ecx
+	movl	24(%rsi),%r10d
+	xorl	%ebx,%r11d
+	xorl	%edx,%r11d
+	addl	%r11d,%ecx
+	roll	$16,%ecx
+	movl	%edx,%r11d
+	addl	%edx,%ecx
+	leal	76029189(%rbx,%r10,1),%ebx
+	movl	36(%rsi),%r10d
+	xorl	%eax,%r11d
+	xorl	%ecx,%r11d
+	addl	%r11d,%ebx
+	roll	$23,%ebx
+	movl	%ecx,%r11d
+	addl	%ecx,%ebx
+	leal	-640364487(%rax,%r10,1),%eax
+	movl	48(%rsi),%r10d
+	xorl	%edx,%r11d
+	xorl	%ebx,%r11d
+	addl	%r11d,%eax
+	roll	$4,%eax
+	movl	%ebx,%r11d
+	addl	%ebx,%eax
+	leal	-421815835(%rdx,%r10,1),%edx
+	movl	60(%rsi),%r10d
+	xorl	%ecx,%r11d
+	xorl	%eax,%r11d
+	addl	%r11d,%edx
+	roll	$11,%edx
+	movl	%eax,%r11d
+	addl	%eax,%edx
+	leal	530742520(%rcx,%r10,1),%ecx
+	movl	8(%rsi),%r10d
+	xorl	%ebx,%r11d
+	xorl	%edx,%r11d
+	addl	%r11d,%ecx
+	roll	$16,%ecx
+	movl	%edx,%r11d
+	addl	%edx,%ecx
+	leal	-995338651(%rbx,%r10,1),%ebx
+	movl	0(%rsi),%r10d
+	xorl	%eax,%r11d
+	xorl	%ecx,%r11d
+	addl	%r11d,%ebx
+	roll	$23,%ebx
+	movl	%ecx,%r11d
+	addl	%ecx,%ebx
+	movl	0(%rsi),%r10d
+	movl	$0xffffffff,%r11d
+	xorl	%edx,%r11d
+	leal	-198630844(%rax,%r10,1),%eax
+	orl	%ebx,%r11d
+	xorl	%ecx,%r11d
+	addl	%r11d,%eax
+	movl	28(%rsi),%r10d
+	movl	$0xffffffff,%r11d
+	roll	$6,%eax
+	xorl	%ecx,%r11d
+	addl	%ebx,%eax
+	leal	1126891415(%rdx,%r10,1),%edx
+	orl	%eax,%r11d
+	xorl	%ebx,%r11d
+	addl	%r11d,%edx
+	movl	56(%rsi),%r10d
+	movl	$0xffffffff,%r11d
+	roll	$10,%edx
+	xorl	%ebx,%r11d
+	addl	%eax,%edx
+	leal	-1416354905(%rcx,%r10,1),%ecx
+	orl	%edx,%r11d
+	xorl	%eax,%r11d
+	addl	%r11d,%ecx
+	movl	20(%rsi),%r10d
+	movl	$0xffffffff,%r11d
+	roll	$15,%ecx
+	xorl	%eax,%r11d
+	addl	%edx,%ecx
+	leal	-57434055(%rbx,%r10,1),%ebx
+	orl	%ecx,%r11d
+	xorl	%edx,%r11d
+	addl	%r11d,%ebx
+	movl	48(%rsi),%r10d
+	movl	$0xffffffff,%r11d
+	roll	$21,%ebx
+	xorl	%edx,%r11d
+	addl	%ecx,%ebx
+	leal	1700485571(%rax,%r10,1),%eax
+	orl	%ebx,%r11d
+	xorl	%ecx,%r11d
+	addl	%r11d,%eax
+	movl	12(%rsi),%r10d
+	movl	$0xffffffff,%r11d
+	roll	$6,%eax
+	xorl	%ecx,%r11d
+	addl	%ebx,%eax
+	leal	-1894986606(%rdx,%r10,1),%edx
+	orl	%eax,%r11d
+	xorl	%ebx,%r11d
+	addl	%r11d,%edx
+	movl	40(%rsi),%r10d
+	movl	$0xffffffff,%r11d
+	roll	$10,%edx
+	xorl	%ebx,%r11d
+	addl	%eax,%edx
+	leal	-1051523(%rcx,%r10,1),%ecx
+	orl	%edx,%r11d
+	xorl	%eax,%r11d
+	addl	%r11d,%ecx
+	movl	4(%rsi),%r10d
+	movl	$0xffffffff,%r11d
+	roll	$15,%ecx
+	xorl	%eax,%r11d
+	addl	%edx,%ecx
+	leal	-2054922799(%rbx,%r10,1),%ebx
+	orl	%ecx,%r11d
+	xorl	%edx,%r11d
+	addl	%r11d,%ebx
+	movl	32(%rsi),%r10d
+	movl	$0xffffffff,%r11d
+	roll	$21,%ebx
+	xorl	%edx,%r11d
+	addl	%ecx,%ebx
+	leal	1873313359(%rax,%r10,1),%eax
+	orl	%ebx,%r11d
+	xorl	%ecx,%r11d
+	addl	%r11d,%eax
+	movl	60(%rsi),%r10d
+	movl	$0xffffffff,%r11d
+	roll	$6,%eax
+	xorl	%ecx,%r11d
+	addl	%ebx,%eax
+	leal	-30611744(%rdx,%r10,1),%edx
+	orl	%eax,%r11d
+	xorl	%ebx,%r11d
+	addl	%r11d,%edx
+	movl	24(%rsi),%r10d
+	movl	$0xffffffff,%r11d
+	roll	$10,%edx
+	xorl	%ebx,%r11d
+	addl	%eax,%edx
+	leal	-1560198380(%rcx,%r10,1),%ecx
+	orl	%edx,%r11d
+	xorl	%eax,%r11d
+	addl	%r11d,%ecx
+	movl	52(%rsi),%r10d
+	movl	$0xffffffff,%r11d
+	roll	$15,%ecx
+	xorl	%eax,%r11d
+	addl	%edx,%ecx
+	leal	1309151649(%rbx,%r10,1),%ebx
+	orl	%ecx,%r11d
+	xorl	%edx,%r11d
+	addl	%r11d,%ebx
+	movl	16(%rsi),%r10d
+	movl	$0xffffffff,%r11d
+	roll	$21,%ebx
+	xorl	%edx,%r11d
+	addl	%ecx,%ebx
+	leal	-145523070(%rax,%r10,1),%eax
+	orl	%ebx,%r11d
+	xorl	%ecx,%r11d
+	addl	%r11d,%eax
+	movl	44(%rsi),%r10d
+	movl	$0xffffffff,%r11d
+	roll	$6,%eax
+	xorl	%ecx,%r11d
+	addl	%ebx,%eax
+	leal	-1120210379(%rdx,%r10,1),%edx
+	orl	%eax,%r11d
+	xorl	%ebx,%r11d
+	addl	%r11d,%edx
+	movl	8(%rsi),%r10d
+	movl	$0xffffffff,%r11d
+	roll	$10,%edx
+	xorl	%ebx,%r11d
+	addl	%eax,%edx
+	leal	718787259(%rcx,%r10,1),%ecx
+	orl	%edx,%r11d
+	xorl	%eax,%r11d
+	addl	%r11d,%ecx
+	movl	36(%rsi),%r10d
+	movl	$0xffffffff,%r11d
+	roll	$15,%ecx
+	xorl	%eax,%r11d
+	addl	%edx,%ecx
+	leal	-343485551(%rbx,%r10,1),%ebx
+	orl	%ecx,%r11d
+	xorl	%edx,%r11d
+	addl	%r11d,%ebx
+	movl	0(%rsi),%r10d
+	movl	$0xffffffff,%r11d
+	roll	$21,%ebx
+	xorl	%edx,%r11d
+	addl	%ecx,%ebx
+
+	addl	%r8d,%eax
+	addl	%r9d,%ebx
+	addl	%r14d,%ecx
+	addl	%r15d,%edx
+
+
+	addq	$64,%rsi
+	cmpq	%rdi,%rsi
+	jb	.Lloop
+
+
+.Lend:
+	movl	%eax,0(%rbp)
+	movl	%ebx,4(%rbp)
+	movl	%ecx,8(%rbp)
+	movl	%edx,12(%rbp)
+
+	movq	(%rsp),%r15
+.cfi_restore	r15
+	movq	8(%rsp),%r14
+.cfi_restore	r14
+	movq	16(%rsp),%r12
+.cfi_restore	r12
+	movq	24(%rsp),%rbx
+.cfi_restore	rbx
+	movq	32(%rsp),%rbp
+.cfi_restore	rbp
+	addq	$40,%rsp
+.cfi_adjust_cfa_offset	-40
+.Lepilogue:
+	.byte	0xf3,0xc3
+.cfi_endproc	
+.size	md5_block_asm_data_order,.-md5_block_asm_data_order
+#endif
+.section	.note.GNU-stack,"",@progbits
--- /dev/null
+++ b/third_party/boringssl/linux-x86_64/crypto/fipsmodule/p256-x86_64-asm.S
@@ -1,0 +1,4543 @@
+// This file is generated from a similarly-named Perl script in the BoringSSL
+// source tree. Do not edit by hand.
+
+#if defined(__has_feature)
+#if __has_feature(memory_sanitizer) && !defined(OPENSSL_NO_ASM)
+#define OPENSSL_NO_ASM
+#endif
+#endif
+
+#if defined(__x86_64__) && !defined(OPENSSL_NO_ASM)
+#if defined(BORINGSSL_PREFIX)
+#include <boringssl_prefix_symbols_asm.h>
+#endif
+.text	
+.extern	OPENSSL_ia32cap_P
+.hidden OPENSSL_ia32cap_P
+
+
+.align	64
+.Lpoly:
+.quad	0xffffffffffffffff, 0x00000000ffffffff, 0x0000000000000000, 0xffffffff00000001
+
+.LOne:
+.long	1,1,1,1,1,1,1,1
+.LTwo:
+.long	2,2,2,2,2,2,2,2
+.LThree:
+.long	3,3,3,3,3,3,3,3
+.LONE_mont:
+.quad	0x0000000000000001, 0xffffffff00000000, 0xffffffffffffffff, 0x00000000fffffffe
+
+
+.Lord:
+.quad	0xf3b9cac2fc632551, 0xbce6faada7179e84, 0xffffffffffffffff, 0xffffffff00000000
+.LordK:
+.quad	0xccd1c8aaee00bc4f
+
+
+
+.globl	ecp_nistz256_neg
+.hidden ecp_nistz256_neg
+.type	ecp_nistz256_neg,@function
+.align	32
+ecp_nistz256_neg:
+.cfi_startproc	
+	pushq	%r12
+.cfi_adjust_cfa_offset	8
+.cfi_offset	%r12,-16
+	pushq	%r13
+.cfi_adjust_cfa_offset	8
+.cfi_offset	%r13,-24
+.Lneg_body:
+
+	xorq	%r8,%r8
+	xorq	%r9,%r9
+	xorq	%r10,%r10
+	xorq	%r11,%r11
+	xorq	%r13,%r13
+
+	subq	0(%rsi),%r8
+	sbbq	8(%rsi),%r9
+	sbbq	16(%rsi),%r10
+	movq	%r8,%rax
+	sbbq	24(%rsi),%r11
+	leaq	.Lpoly(%rip),%rsi
+	movq	%r9,%rdx
+	sbbq	$0,%r13
+
+	addq	0(%rsi),%r8
+	movq	%r10,%rcx
+	adcq	8(%rsi),%r9
+	adcq	16(%rsi),%r10
+	movq	%r11,%r12
+	adcq	24(%rsi),%r11
+	testq	%r13,%r13
+
+	cmovzq	%rax,%r8
+	cmovzq	%rdx,%r9
+	movq	%r8,0(%rdi)
+	cmovzq	%rcx,%r10
+	movq	%r9,8(%rdi)
+	cmovzq	%r12,%r11
+	movq	%r10,16(%rdi)
+	movq	%r11,24(%rdi)
+
+	movq	0(%rsp),%r13
+.cfi_restore	%r13
+	movq	8(%rsp),%r12
+.cfi_restore	%r12
+	leaq	16(%rsp),%rsp
+.cfi_adjust_cfa_offset	-16
+.Lneg_epilogue:
+	.byte	0xf3,0xc3
+.cfi_endproc	
+.size	ecp_nistz256_neg,.-ecp_nistz256_neg
+
+
+
+
+
+
+.globl	ecp_nistz256_ord_mul_mont
+.hidden ecp_nistz256_ord_mul_mont
+.type	ecp_nistz256_ord_mul_mont,@function
+.align	32
+ecp_nistz256_ord_mul_mont:
+.cfi_startproc	
+	leaq	OPENSSL_ia32cap_P(%rip),%rcx
+	movq	8(%rcx),%rcx
+	andl	$0x80100,%ecx
+	cmpl	$0x80100,%ecx
+	je	.Lecp_nistz256_ord_mul_montx
+	pushq	%rbp
+.cfi_adjust_cfa_offset	8
+.cfi_offset	%rbp,-16
+	pushq	%rbx
+.cfi_adjust_cfa_offset	8
+.cfi_offset	%rbx,-24
+	pushq	%r12
+.cfi_adjust_cfa_offset	8
+.cfi_offset	%r12,-32
+	pushq	%r13
+.cfi_adjust_cfa_offset	8
+.cfi_offset	%r13,-40
+	pushq	%r14
+.cfi_adjust_cfa_offset	8
+.cfi_offset	%r14,-48
+	pushq	%r15
+.cfi_adjust_cfa_offset	8
+.cfi_offset	%r15,-56
+.Lord_mul_body:
+
+	movq	0(%rdx),%rax
+	movq	%rdx,%rbx
+	leaq	.Lord(%rip),%r14
+	movq	.LordK(%rip),%r15
+
+
+	movq	%rax,%rcx
+	mulq	0(%rsi)
+	movq	%rax,%r8
+	movq	%rcx,%rax
+	movq	%rdx,%r9
+
+	mulq	8(%rsi)
+	addq	%rax,%r9
+	movq	%rcx,%rax
+	adcq	$0,%rdx
+	movq	%rdx,%r10
+
+	mulq	16(%rsi)
+	addq	%rax,%r10
+	movq	%rcx,%rax
+	adcq	$0,%rdx
+
+	movq	%r8,%r13
+	imulq	%r15,%r8
+
+	movq	%rdx,%r11
+	mulq	24(%rsi)
+	addq	%rax,%r11
+	movq	%r8,%rax
+	adcq	$0,%rdx
+	movq	%rdx,%r12
+
+
+	mulq	0(%r14)
+	movq	%r8,%rbp
+	addq	%rax,%r13
+	movq	%r8,%rax
+	adcq	$0,%rdx
+	movq	%rdx,%rcx
+
+	subq	%r8,%r10
+	sbbq	$0,%r8
+
+	mulq	8(%r14)
+	addq	%rcx,%r9
+	adcq	$0,%rdx
+	addq	%rax,%r9
+	movq	%rbp,%rax
+	adcq	%rdx,%r10
+	movq	%rbp,%rdx
+	adcq	$0,%r8
+
+	shlq	$32,%rax
+	shrq	$32,%rdx
+	subq	%rax,%r11
+	movq	8(%rbx),%rax
+	sbbq	%rdx,%rbp
+
+	addq	%r8,%r11
+	adcq	%rbp,%r12
+	adcq	$0,%r13
+
+
+	movq	%rax,%rcx
+	mulq	0(%rsi)
+	addq	%rax,%r9
+	movq	%rcx,%rax
+	adcq	$0,%rdx
+	movq	%rdx,%rbp
+
+	mulq	8(%rsi)
+	addq	%rbp,%r10
+	adcq	$0,%rdx
+	addq	%rax,%r10
+	movq	%rcx,%rax
+	adcq	$0,%rdx
+	movq	%rdx,%rbp
+
+	mulq	16(%rsi)
+	addq	%rbp,%r11
+	adcq	$0,%rdx
+	addq	%rax,%r11
+	movq	%rcx,%rax
+	adcq	$0,%rdx
+
+	movq	%r9,%rcx
+	imulq	%r15,%r9
+
+	movq	%rdx,%rbp
+	mulq	24(%rsi)
+	addq	%rbp,%r12
+	adcq	$0,%rdx
+	xorq	%r8,%r8
+	addq	%rax,%r12
+	movq	%r9,%rax
+	adcq	%rdx,%r13
+	adcq	$0,%r8
+
+
+	mulq	0(%r14)
+	movq	%r9,%rbp
+	addq	%rax,%rcx
+	movq	%r9,%rax
+	adcq	%rdx,%rcx
+
+	subq	%r9,%r11
+	sbbq	$0,%r9
+
+	mulq	8(%r14)
+	addq	%rcx,%r10
+	adcq	$0,%rdx
+	addq	%rax,%r10
+	movq	%rbp,%rax
+	adcq	%rdx,%r11
+	movq	%rbp,%rdx
+	adcq	$0,%r9
+
+	shlq	$32,%rax
+	shrq	$32,%rdx
+	subq	%rax,%r12
+	movq	16(%rbx),%rax
+	sbbq	%rdx,%rbp
+
+	addq	%r9,%r12
+	adcq	%rbp,%r13
+	adcq	$0,%r8
+
+
+	movq	%rax,%rcx
+	mulq	0(%rsi)
+	addq	%rax,%r10
+	movq	%rcx,%rax
+	adcq	$0,%rdx
+	movq	%rdx,%rbp
+
+	mulq	8(%rsi)
+	addq	%rbp,%r11
+	adcq	$0,%rdx
+	addq	%rax,%r11
+	movq	%rcx,%rax
+	adcq	$0,%rdx
+	movq	%rdx,%rbp
+
+	mulq	16(%rsi)
+	addq	%rbp,%r12
+	adcq	$0,%rdx
+	addq	%rax,%r12
+	movq	%rcx,%rax
+	adcq	$0,%rdx
+
+	movq	%r10,%rcx
+	imulq	%r15,%r10
+
+	movq	%rdx,%rbp
+	mulq	24(%rsi)
+	addq	%rbp,%r13
+	adcq	$0,%rdx
+	xorq	%r9,%r9
+	addq	%rax,%r13
+	movq	%r10,%rax
+	adcq	%rdx,%r8
+	adcq	$0,%r9
+
+
+	mulq	0(%r14)
+	movq	%r10,%rbp
+	addq	%rax,%rcx
+	movq	%r10,%rax
+	adcq	%rdx,%rcx
+
+	subq	%r10,%r12
+	sbbq	$0,%r10
+
+	mulq	8(%r14)
+	addq	%rcx,%r11
+	adcq	$0,%rdx
+	addq	%rax,%r11
+	movq	%rbp,%rax
+	adcq	%rdx,%r12
+	movq	%rbp,%rdx
+	adcq	$0,%r10
+
+	shlq	$32,%rax
+	shrq	$32,%rdx
+	subq	%rax,%r13
+	movq	24(%rbx),%rax
+	sbbq	%rdx,%rbp
+
+	addq	%r10,%r13
+	adcq	%rbp,%r8
+	adcq	$0,%r9
+
+
+	movq	%rax,%rcx
+	mulq	0(%rsi)
+	addq	%rax,%r11
+	movq	%rcx,%rax
+	adcq	$0,%rdx
+	movq	%rdx,%rbp
+
+	mulq	8(%rsi)
+	addq	%rbp,%r12
+	adcq	$0,%rdx
+	addq	%rax,%r12
+	movq	%rcx,%rax
+	adcq	$0,%rdx
+	movq	%rdx,%rbp
+
+	mulq	16(%rsi)
+	addq	%rbp,%r13
+	adcq	$0,%rdx
+	addq	%rax,%r13
+	movq	%rcx,%rax
+	adcq	$0,%rdx
+
+	movq	%r11,%rcx
+	imulq	%r15,%r11
+
+	movq	%rdx,%rbp
+	mulq	24(%rsi)
+	addq	%rbp,%r8
+	adcq	$0,%rdx
+	xorq	%r10,%r10
+	addq	%rax,%r8
+	movq	%r11,%rax
+	adcq	%rdx,%r9
+	adcq	$0,%r10
+
+
+	mulq	0(%r14)
+	movq	%r11,%rbp
+	addq	%rax,%rcx
+	movq	%r11,%rax
+	adcq	%rdx,%rcx
+
+	subq	%r11,%r13
+	sbbq	$0,%r11
+
+	mulq	8(%r14)
+	addq	%rcx,%r12
+	adcq	$0,%rdx
+	addq	%rax,%r12
+	movq	%rbp,%rax
+	adcq	%rdx,%r13
+	movq	%rbp,%rdx
+	adcq	$0,%r11
+
+	shlq	$32,%rax
+	shrq	$32,%rdx
+	subq	%rax,%r8
+	sbbq	%rdx,%rbp
+
+	addq	%r11,%r8
+	adcq	%rbp,%r9
+	adcq	$0,%r10
+
+
+	movq	%r12,%rsi
+	subq	0(%r14),%r12
+	movq	%r13,%r11
+	sbbq	8(%r14),%r13
+	movq	%r8,%rcx
+	sbbq	16(%r14),%r8
+	movq	%r9,%rbp
+	sbbq	24(%r14),%r9
+	sbbq	$0,%r10
+
+	cmovcq	%rsi,%r12
+	cmovcq	%r11,%r13
+	cmovcq	%rcx,%r8
+	cmovcq	%rbp,%r9
+
+	movq	%r12,0(%rdi)
+	movq	%r13,8(%rdi)
+	movq	%r8,16(%rdi)
+	movq	%r9,24(%rdi)
+
+	movq	0(%rsp),%r15
+.cfi_restore	%r15
+	movq	8(%rsp),%r14
+.cfi_restore	%r14
+	movq	16(%rsp),%r13
+.cfi_restore	%r13
+	movq	24(%rsp),%r12
+.cfi_restore	%r12
+	movq	32(%rsp),%rbx
+.cfi_restore	%rbx
+	movq	40(%rsp),%rbp
+.cfi_restore	%rbp
+	leaq	48(%rsp),%rsp
+.cfi_adjust_cfa_offset	-48
+.Lord_mul_epilogue:
+	.byte	0xf3,0xc3
+.cfi_endproc	
+.size	ecp_nistz256_ord_mul_mont,.-ecp_nistz256_ord_mul_mont
+
+
+
+
+
+
+
+.globl	ecp_nistz256_ord_sqr_mont
+.hidden ecp_nistz256_ord_sqr_mont
+.type	ecp_nistz256_ord_sqr_mont,@function
+.align	32
+ecp_nistz256_ord_sqr_mont:
+.cfi_startproc	
+	leaq	OPENSSL_ia32cap_P(%rip),%rcx
+	movq	8(%rcx),%rcx
+	andl	$0x80100,%ecx
+	cmpl	$0x80100,%ecx
+	je	.Lecp_nistz256_ord_sqr_montx
+	pushq	%rbp
+.cfi_adjust_cfa_offset	8
+.cfi_offset	%rbp,-16
+	pushq	%rbx
+.cfi_adjust_cfa_offset	8
+.cfi_offset	%rbx,-24
+	pushq	%r12
+.cfi_adjust_cfa_offset	8
+.cfi_offset	%r12,-32
+	pushq	%r13
+.cfi_adjust_cfa_offset	8
+.cfi_offset	%r13,-40
+	pushq	%r14
+.cfi_adjust_cfa_offset	8
+.cfi_offset	%r14,-48
+	pushq	%r15
+.cfi_adjust_cfa_offset	8
+.cfi_offset	%r15,-56
+.Lord_sqr_body:
+
+	movq	0(%rsi),%r8
+	movq	8(%rsi),%rax
+	movq	16(%rsi),%r14
+	movq	24(%rsi),%r15
+	leaq	.Lord(%rip),%rsi
+	movq	%rdx,%rbx
+	jmp	.Loop_ord_sqr
+
+.align	32
+.Loop_ord_sqr:
+
+	movq	%rax,%rbp
+	mulq	%r8
+	movq	%rax,%r9
+.byte	102,72,15,110,205
+	movq	%r14,%rax
+	movq	%rdx,%r10
+
+	mulq	%r8
+	addq	%rax,%r10
+	movq	%r15,%rax
+.byte	102,73,15,110,214
+	adcq	$0,%rdx
+	movq	%rdx,%r11
+
+	mulq	%r8
+	addq	%rax,%r11
+	movq	%r15,%rax
+.byte	102,73,15,110,223
+	adcq	$0,%rdx
+	movq	%rdx,%r12
+
+
+	mulq	%r14
+	movq	%rax,%r13
+	movq	%r14,%rax
+	movq	%rdx,%r14
+
+
+	mulq	%rbp
+	addq	%rax,%r11
+	movq	%r15,%rax
+	adcq	$0,%rdx
+	movq	%rdx,%r15
+
+	mulq	%rbp
+	addq	%rax,%r12
+	adcq	$0,%rdx
+
+	addq	%r15,%r12
+	adcq	%rdx,%r13
+	adcq	$0,%r14
+
+
+	xorq	%r15,%r15
+	movq	%r8,%rax
+	addq	%r9,%r9
+	adcq	%r10,%r10
+	adcq	%r11,%r11
+	adcq	%r12,%r12
+	adcq	%r13,%r13
+	adcq	%r14,%r14
+	adcq	$0,%r15
+
+
+	mulq	%rax
+	movq	%rax,%r8
+.byte	102,72,15,126,200
+	movq	%rdx,%rbp
+
+	mulq	%rax
+	addq	%rbp,%r9
+	adcq	%rax,%r10
+.byte	102,72,15,126,208
+	adcq	$0,%rdx
+	movq	%rdx,%rbp
+
+	mulq	%rax
+	addq	%rbp,%r11
+	adcq	%rax,%r12
+.byte	102,72,15,126,216
+	adcq	$0,%rdx
+	movq	%rdx,%rbp
+
+	movq	%r8,%rcx
+	imulq	32(%rsi),%r8
+
+	mulq	%rax
+	addq	%rbp,%r13
+	adcq	%rax,%r14
+	movq	0(%rsi),%rax
+	adcq	%rdx,%r15
+
+
+	mulq	%r8
+	movq	%r8,%rbp
+	addq	%rax,%rcx
+	movq	8(%rsi),%rax
+	adcq	%rdx,%rcx
+
+	subq	%r8,%r10
+	sbbq	$0,%rbp
+
+	mulq	%r8
+	addq	%rcx,%r9
+	adcq	$0,%rdx
+	addq	%rax,%r9
+	movq	%r8,%rax
+	adcq	%rdx,%r10
+	movq	%r8,%rdx
+	adcq	$0,%rbp
+
+	movq	%r9,%rcx
+	imulq	32(%rsi),%r9
+
+	shlq	$32,%rax
+	shrq	$32,%rdx
+	subq	%rax,%r11
+	movq	0(%rsi),%rax
+	sbbq	%rdx,%r8
+
+	addq	%rbp,%r11
+	adcq	$0,%r8
+
+
+	mulq	%r9
+	movq	%r9,%rbp
+	addq	%rax,%rcx
+	movq	8(%rsi),%rax
+	adcq	%rdx,%rcx
+
+	subq	%r9,%r11
+	sbbq	$0,%rbp
+
+	mulq	%r9
+	addq	%rcx,%r10
+	adcq	$0,%rdx
+	addq	%rax,%r10
+	movq	%r9,%rax
+	adcq	%rdx,%r11
+	movq	%r9,%rdx
+	adcq	$0,%rbp
+
+	movq	%r10,%rcx
+	imulq	32(%rsi),%r10
+
+	shlq	$32,%rax
+	shrq	$32,%rdx
+	subq	%rax,%r8
+	movq	0(%rsi),%rax
+	sbbq	%rdx,%r9
+
+	addq	%rbp,%r8
+	adcq	$0,%r9
+
+
+	mulq	%r10
+	movq	%r10,%rbp
+	addq	%rax,%rcx
+	movq	8(%rsi),%rax
+	adcq	%rdx,%rcx
+
+	subq	%r10,%r8
+	sbbq	$0,%rbp
+
+	mulq	%r10
+	addq	%rcx,%r11
+	adcq	$0,%rdx
+	addq	%rax,%r11
+	movq	%r10,%rax
+	adcq	%rdx,%r8
+	movq	%r10,%rdx
+	adcq	$0,%rbp
+
+	movq	%r11,%rcx
+	imulq	32(%rsi),%r11
+
+	shlq	$32,%rax
+	shrq	$32,%rdx
+	subq	%rax,%r9
+	movq	0(%rsi),%rax
+	sbbq	%rdx,%r10
+
+	addq	%rbp,%r9
+	adcq	$0,%r10
+
+
+	mulq	%r11
+	movq	%r11,%rbp
+	addq	%rax,%rcx
+	movq	8(%rsi),%rax
+	adcq	%rdx,%rcx
+
+	subq	%r11,%r9
+	sbbq	$0,%rbp
+
+	mulq	%r11
+	addq	%rcx,%r8
+	adcq	$0,%rdx
+	addq	%rax,%r8
+	movq	%r11,%rax
+	adcq	%rdx,%r9
+	movq	%r11,%rdx
+	adcq	$0,%rbp
+
+	shlq	$32,%rax
+	shrq	$32,%rdx
+	subq	%rax,%r10
+	sbbq	%rdx,%r11
+
+	addq	%rbp,%r10
+	adcq	$0,%r11
+
+
+	xorq	%rdx,%rdx
+	addq	%r12,%r8
+	adcq	%r13,%r9
+	movq	%r8,%r12
+	adcq	%r14,%r10
+	adcq	%r15,%r11
+	movq	%r9,%rax
+	adcq	$0,%rdx
+
+
+	subq	0(%rsi),%r8
+	movq	%r10,%r14
+	sbbq	8(%rsi),%r9
+	sbbq	16(%rsi),%r10
+	movq	%r11,%r15
+	sbbq	24(%rsi),%r11
+	sbbq	$0,%rdx
+
+	cmovcq	%r12,%r8
+	cmovncq	%r9,%rax
+	cmovncq	%r10,%r14
+	cmovncq	%r11,%r15
+
+	decq	%rbx
+	jnz	.Loop_ord_sqr
+
+	movq	%r8,0(%rdi)
+	movq	%rax,8(%rdi)
+	pxor	%xmm1,%xmm1
+	movq	%r14,16(%rdi)
+	pxor	%xmm2,%xmm2
+	movq	%r15,24(%rdi)
+	pxor	%xmm3,%xmm3
+
+	movq	0(%rsp),%r15
+.cfi_restore	%r15
+	movq	8(%rsp),%r14
+.cfi_restore	%r14
+	movq	16(%rsp),%r13
+.cfi_restore	%r13
+	movq	24(%rsp),%r12
+.cfi_restore	%r12
+	movq	32(%rsp),%rbx
+.cfi_restore	%rbx
+	movq	40(%rsp),%rbp
+.cfi_restore	%rbp
+	leaq	48(%rsp),%rsp
+.cfi_adjust_cfa_offset	-48
+.Lord_sqr_epilogue:
+	.byte	0xf3,0xc3
+.cfi_endproc	
+.size	ecp_nistz256_ord_sqr_mont,.-ecp_nistz256_ord_sqr_mont
+
+.type	ecp_nistz256_ord_mul_montx,@function
+.align	32
+ecp_nistz256_ord_mul_montx:
+.cfi_startproc	
+.Lecp_nistz256_ord_mul_montx:
+	pushq	%rbp
+.cfi_adjust_cfa_offset	8
+.cfi_offset	%rbp,-16
+	pushq	%rbx
+.cfi_adjust_cfa_offset	8
+.cfi_offset	%rbx,-24
+	pushq	%r12
+.cfi_adjust_cfa_offset	8
+.cfi_offset	%r12,-32
+	pushq	%r13
+.cfi_adjust_cfa_offset	8
+.cfi_offset	%r13,-40
+	pushq	%r14
+.cfi_adjust_cfa_offset	8
+.cfi_offset	%r14,-48
+	pushq	%r15
+.cfi_adjust_cfa_offset	8
+.cfi_offset	%r15,-56
+.Lord_mulx_body:
+
+	movq	%rdx,%rbx
+	movq	0(%rdx),%rdx
+	movq	0(%rsi),%r9
+	movq	8(%rsi),%r10
+	movq	16(%rsi),%r11
+	movq	24(%rsi),%r12
+	leaq	-128(%rsi),%rsi
+	leaq	.Lord-128(%rip),%r14
+	movq	.LordK(%rip),%r15
+
+
+	mulxq	%r9,%r8,%r9
+	mulxq	%r10,%rcx,%r10
+	mulxq	%r11,%rbp,%r11
+	addq	%rcx,%r9
+	mulxq	%r12,%rcx,%r12
+	movq	%r8,%rdx
+	mulxq	%r15,%rdx,%rax
+	adcq	%rbp,%r10
+	adcq	%rcx,%r11
+	adcq	$0,%r12
+
+
+	xorq	%r13,%r13
+	mulxq	0+128(%r14),%rcx,%rbp
+	adcxq	%rcx,%r8
+	adoxq	%rbp,%r9
+
+	mulxq	8+128(%r14),%rcx,%rbp
+	adcxq	%rcx,%r9
+	adoxq	%rbp,%r10
+
+	mulxq	16+128(%r14),%rcx,%rbp
+	adcxq	%rcx,%r10
+	adoxq	%rbp,%r11
+
+	mulxq	24+128(%r14),%rcx,%rbp
+	movq	8(%rbx),%rdx
+	adcxq	%rcx,%r11
+	adoxq	%rbp,%r12
+	adcxq	%r8,%r12
+	adoxq	%r8,%r13
+	adcq	$0,%r13
+
+
+	mulxq	0+128(%rsi),%rcx,%rbp
+	adcxq	%rcx,%r9
+	adoxq	%rbp,%r10
+
+	mulxq	8+128(%rsi),%rcx,%rbp
+	adcxq	%rcx,%r10
+	adoxq	%rbp,%r11
+
+	mulxq	16+128(%rsi),%rcx,%rbp
+	adcxq	%rcx,%r11
+	adoxq	%rbp,%r12
+
+	mulxq	24+128(%rsi),%rcx,%rbp
+	movq	%r9,%rdx
+	mulxq	%r15,%rdx,%rax
+	adcxq	%rcx,%r12
+	adoxq	%rbp,%r13
+
+	adcxq	%r8,%r13
+	adoxq	%r8,%r8
+	adcq	$0,%r8
+
+
+	mulxq	0+128(%r14),%rcx,%rbp
+	adcxq	%rcx,%r9
+	adoxq	%rbp,%r10
+
+	mulxq	8+128(%r14),%rcx,%rbp
+	adcxq	%rcx,%r10
+	adoxq	%rbp,%r11
+
+	mulxq	16+128(%r14),%rcx,%rbp
+	adcxq	%rcx,%r11
+	adoxq	%rbp,%r12
+
+	mulxq	24+128(%r14),%rcx,%rbp
+	movq	16(%rbx),%rdx
+	adcxq	%rcx,%r12
+	adoxq	%rbp,%r13
+	adcxq	%r9,%r13
+	adoxq	%r9,%r8
+	adcq	$0,%r8
+
+
+	mulxq	0+128(%rsi),%rcx,%rbp
+	adcxq	%rcx,%r10
+	adoxq	%rbp,%r11
+
+	mulxq	8+128(%rsi),%rcx,%rbp
+	adcxq	%rcx,%r11
+	adoxq	%rbp,%r12
+
+	mulxq	16+128(%rsi),%rcx,%rbp
+	adcxq	%rcx,%r12
+	adoxq	%rbp,%r13
+
+	mulxq	24+128(%rsi),%rcx,%rbp
+	movq	%r10,%rdx
+	mulxq	%r15,%rdx,%rax
+	adcxq	%rcx,%r13
+	adoxq	%rbp,%r8
+
+	adcxq	%r9,%r8
+	adoxq	%r9,%r9
+	adcq	$0,%r9
+
+
+	mulxq	0+128(%r14),%rcx,%rbp
+	adcxq	%rcx,%r10
+	adoxq	%rbp,%r11
+
+	mulxq	8+128(%r14),%rcx,%rbp
+	adcxq	%rcx,%r11
+	adoxq	%rbp,%r12
+
+	mulxq	16+128(%r14),%rcx,%rbp
+	adcxq	%rcx,%r12
+	adoxq	%rbp,%r13
+
+	mulxq	24+128(%r14),%rcx,%rbp
+	movq	24(%rbx),%rdx
+	adcxq	%rcx,%r13
+	adoxq	%rbp,%r8
+	adcxq	%r10,%r8
+	adoxq	%r10,%r9
+	adcq	$0,%r9
+
+
+	mulxq	0+128(%rsi),%rcx,%rbp
+	adcxq	%rcx,%r11
+	adoxq	%rbp,%r12
+
+	mulxq	8+128(%rsi),%rcx,%rbp
+	adcxq	%rcx,%r12
+	adoxq	%rbp,%r13
+
+	mulxq	16+128(%rsi),%rcx,%rbp
+	adcxq	%rcx,%r13
+	adoxq	%rbp,%r8
+
+	mulxq	24+128(%rsi),%rcx,%rbp
+	movq	%r11,%rdx
+	mulxq	%r15,%rdx,%rax
+	adcxq	%rcx,%r8
+	adoxq	%rbp,%r9
+
+	adcxq	%r10,%r9
+	adoxq	%r10,%r10
+	adcq	$0,%r10
+
+
+	mulxq	0+128(%r14),%rcx,%rbp
+	adcxq	%rcx,%r11
+	adoxq	%rbp,%r12
+
+	mulxq	8+128(%r14),%rcx,%rbp
+	adcxq	%rcx,%r12
+	adoxq	%rbp,%r13
+
+	mulxq	16+128(%r14),%rcx,%rbp
+	adcxq	%rcx,%r13
+	adoxq	%rbp,%r8
+
+	mulxq	24+128(%r14),%rcx,%rbp
+	leaq	128(%r14),%r14
+	movq	%r12,%rbx
+	adcxq	%rcx,%r8
+	adoxq	%rbp,%r9
+	movq	%r13,%rdx
+	adcxq	%r11,%r9
+	adoxq	%r11,%r10
+	adcq	$0,%r10
+
+
+
+	movq	%r8,%rcx
+	subq	0(%r14),%r12
+	sbbq	8(%r14),%r13
+	sbbq	16(%r14),%r8
+	movq	%r9,%rbp
+	sbbq	24(%r14),%r9
+	sbbq	$0,%r10
+
+	cmovcq	%rbx,%r12
+	cmovcq	%rdx,%r13
+	cmovcq	%rcx,%r8
+	cmovcq	%rbp,%r9
+
+	movq	%r12,0(%rdi)
+	movq	%r13,8(%rdi)
+	movq	%r8,16(%rdi)
+	movq	%r9,24(%rdi)
+
+	movq	0(%rsp),%r15
+.cfi_restore	%r15
+	movq	8(%rsp),%r14
+.cfi_restore	%r14
+	movq	16(%rsp),%r13
+.cfi_restore	%r13
+	movq	24(%rsp),%r12
+.cfi_restore	%r12
+	movq	32(%rsp),%rbx
+.cfi_restore	%rbx
+	movq	40(%rsp),%rbp
+.cfi_restore	%rbp
+	leaq	48(%rsp),%rsp
+.cfi_adjust_cfa_offset	-48
+.Lord_mulx_epilogue:
+	.byte	0xf3,0xc3
+.cfi_endproc	
+.size	ecp_nistz256_ord_mul_montx,.-ecp_nistz256_ord_mul_montx
+
+.type	ecp_nistz256_ord_sqr_montx,@function
+.align	32
+ecp_nistz256_ord_sqr_montx:
+.cfi_startproc	
+.Lecp_nistz256_ord_sqr_montx:
+	pushq	%rbp
+.cfi_adjust_cfa_offset	8
+.cfi_offset	%rbp,-16
+	pushq	%rbx
+.cfi_adjust_cfa_offset	8
+.cfi_offset	%rbx,-24
+	pushq	%r12
+.cfi_adjust_cfa_offset	8
+.cfi_offset	%r12,-32
+	pushq	%r13
+.cfi_adjust_cfa_offset	8
+.cfi_offset	%r13,-40
+	pushq	%r14
+.cfi_adjust_cfa_offset	8
+.cfi_offset	%r14,-48
+	pushq	%r15
+.cfi_adjust_cfa_offset	8
+.cfi_offset	%r15,-56
+.Lord_sqrx_body:
+
+	movq	%rdx,%rbx
+	movq	0(%rsi),%rdx
+	movq	8(%rsi),%r14
+	movq	16(%rsi),%r15
+	movq	24(%rsi),%r8
+	leaq	.Lord(%rip),%rsi
+	jmp	.Loop_ord_sqrx
+
+.align	32
+.Loop_ord_sqrx:
+	mulxq	%r14,%r9,%r10
+	mulxq	%r15,%rcx,%r11
+	movq	%rdx,%rax
+.byte	102,73,15,110,206
+	mulxq	%r8,%rbp,%r12
+	movq	%r14,%rdx
+	addq	%rcx,%r10
+.byte	102,73,15,110,215
+	adcq	%rbp,%r11
+	adcq	$0,%r12
+	xorq	%r13,%r13
+
+	mulxq	%r15,%rcx,%rbp
+	adcxq	%rcx,%r11
+	adoxq	%rbp,%r12
+
+	mulxq	%r8,%rcx,%rbp
+	movq	%r15,%rdx
+	adcxq	%rcx,%r12
+	adoxq	%rbp,%r13
+	adcq	$0,%r13
+
+	mulxq	%r8,%rcx,%r14
+	movq	%rax,%rdx
+.byte	102,73,15,110,216
+	xorq	%r15,%r15
+	adcxq	%r9,%r9
+	adoxq	%rcx,%r13
+	adcxq	%r10,%r10
+	adoxq	%r15,%r14
+
+
+	mulxq	%rdx,%r8,%rbp
+.byte	102,72,15,126,202
+	adcxq	%r11,%r11
+	adoxq	%rbp,%r9
+	adcxq	%r12,%r12
+	mulxq	%rdx,%rcx,%rax
+.byte	102,72,15,126,210
+	adcxq	%r13,%r13
+	adoxq	%rcx,%r10
+	adcxq	%r14,%r14
+	mulxq	%rdx,%rcx,%rbp
+.byte	0x67
+.byte	102,72,15,126,218
+	adoxq	%rax,%r11
+	adcxq	%r15,%r15
+	adoxq	%rcx,%r12
+	adoxq	%rbp,%r13
+	mulxq	%rdx,%rcx,%rax
+	adoxq	%rcx,%r14
+	adoxq	%rax,%r15
+
+
+	movq	%r8,%rdx
+	mulxq	32(%rsi),%rdx,%rcx
+
+	xorq	%rax,%rax
+	mulxq	0(%rsi),%rcx,%rbp
+	adcxq	%rcx,%r8
+	adoxq	%rbp,%r9
+	mulxq	8(%rsi),%rcx,%rbp
+	adcxq	%rcx,%r9
+	adoxq	%rbp,%r10
+	mulxq	16(%rsi),%rcx,%rbp
+	adcxq	%rcx,%r10
+	adoxq	%rbp,%r11
+	mulxq	24(%rsi),%rcx,%rbp
+	adcxq	%rcx,%r11
+	adoxq	%rbp,%r8
+	adcxq	%rax,%r8
+
+
+	movq	%r9,%rdx
+	mulxq	32(%rsi),%rdx,%rcx
+
+	mulxq	0(%rsi),%rcx,%rbp
+	adoxq	%rcx,%r9
+	adcxq	%rbp,%r10
+	mulxq	8(%rsi),%rcx,%rbp
+	adoxq	%rcx,%r10
+	adcxq	%rbp,%r11
+	mulxq	16(%rsi),%rcx,%rbp
+	adoxq	%rcx,%r11
+	adcxq	%rbp,%r8
+	mulxq	24(%rsi),%rcx,%rbp
+	adoxq	%rcx,%r8
+	adcxq	%rbp,%r9
+	adoxq	%rax,%r9
+
+
+	movq	%r10,%rdx
+	mulxq	32(%rsi),%rdx,%rcx
+
+	mulxq	0(%rsi),%rcx,%rbp
+	adcxq	%rcx,%r10
+	adoxq	%rbp,%r11
+	mulxq	8(%rsi),%rcx,%rbp
+	adcxq	%rcx,%r11
+	adoxq	%rbp,%r8
+	mulxq	16(%rsi),%rcx,%rbp
+	adcxq	%rcx,%r8
+	adoxq	%rbp,%r9
+	mulxq	24(%rsi),%rcx,%rbp
+	adcxq	%rcx,%r9
+	adoxq	%rbp,%r10
+	adcxq	%rax,%r10
+
+
+	movq	%r11,%rdx
+	mulxq	32(%rsi),%rdx,%rcx
+
+	mulxq	0(%rsi),%rcx,%rbp
+	adoxq	%rcx,%r11
+	adcxq	%rbp,%r8
+	mulxq	8(%rsi),%rcx,%rbp
+	adoxq	%rcx,%r8
+	adcxq	%rbp,%r9
+	mulxq	16(%rsi),%rcx,%rbp
+	adoxq	%rcx,%r9
+	adcxq	%rbp,%r10
+	mulxq	24(%rsi),%rcx,%rbp
+	adoxq	%rcx,%r10
+	adcxq	%rbp,%r11
+	adoxq	%rax,%r11
+
+
+	addq	%r8,%r12
+	adcq	%r13,%r9
+	movq	%r12,%rdx
+	adcq	%r14,%r10
+	adcq	%r15,%r11
+	movq	%r9,%r14
+	adcq	$0,%rax
+
+
+	subq	0(%rsi),%r12
+	movq	%r10,%r15
+	sbbq	8(%rsi),%r9
+	sbbq	16(%rsi),%r10
+	movq	%r11,%r8
+	sbbq	24(%rsi),%r11
+	sbbq	$0,%rax
+
+	cmovncq	%r12,%rdx
+	cmovncq	%r9,%r14
+	cmovncq	%r10,%r15
+	cmovncq	%r11,%r8
+
+	decq	%rbx
+	jnz	.Loop_ord_sqrx
+
+	movq	%rdx,0(%rdi)
+	movq	%r14,8(%rdi)
+	pxor	%xmm1,%xmm1
+	movq	%r15,16(%rdi)
+	pxor	%xmm2,%xmm2
+	movq	%r8,24(%rdi)
+	pxor	%xmm3,%xmm3
+
+	movq	0(%rsp),%r15
+.cfi_restore	%r15
+	movq	8(%rsp),%r14
+.cfi_restore	%r14
+	movq	16(%rsp),%r13
+.cfi_restore	%r13
+	movq	24(%rsp),%r12
+.cfi_restore	%r12
+	movq	32(%rsp),%rbx
+.cfi_restore	%rbx
+	movq	40(%rsp),%rbp
+.cfi_restore	%rbp
+	leaq	48(%rsp),%rsp
+.cfi_adjust_cfa_offset	-48
+.Lord_sqrx_epilogue:
+	.byte	0xf3,0xc3
+.cfi_endproc	
+.size	ecp_nistz256_ord_sqr_montx,.-ecp_nistz256_ord_sqr_montx
+
+
+
+
+
+
+.globl	ecp_nistz256_mul_mont
+.hidden ecp_nistz256_mul_mont
+.type	ecp_nistz256_mul_mont,@function
+.align	32
+ecp_nistz256_mul_mont:
+.cfi_startproc	
+	leaq	OPENSSL_ia32cap_P(%rip),%rcx
+	movq	8(%rcx),%rcx
+	andl	$0x80100,%ecx
+.Lmul_mont:
+	pushq	%rbp
+.cfi_adjust_cfa_offset	8
+.cfi_offset	%rbp,-16
+	pushq	%rbx
+.cfi_adjust_cfa_offset	8
+.cfi_offset	%rbx,-24
+	pushq	%r12
+.cfi_adjust_cfa_offset	8
+.cfi_offset	%r12,-32
+	pushq	%r13
+.cfi_adjust_cfa_offset	8
+.cfi_offset	%r13,-40
+	pushq	%r14
+.cfi_adjust_cfa_offset	8
+.cfi_offset	%r14,-48
+	pushq	%r15
+.cfi_adjust_cfa_offset	8
+.cfi_offset	%r15,-56
+.Lmul_body:
+	cmpl	$0x80100,%ecx
+	je	.Lmul_montx
+	movq	%rdx,%rbx
+	movq	0(%rdx),%rax
+	movq	0(%rsi),%r9
+	movq	8(%rsi),%r10
+	movq	16(%rsi),%r11
+	movq	24(%rsi),%r12
+
+	call	__ecp_nistz256_mul_montq
+	jmp	.Lmul_mont_done
+
+.align	32
+.Lmul_montx:
+	movq	%rdx,%rbx
+	movq	0(%rdx),%rdx
+	movq	0(%rsi),%r9
+	movq	8(%rsi),%r10
+	movq	16(%rsi),%r11
+	movq	24(%rsi),%r12
+	leaq	-128(%rsi),%rsi
+
+	call	__ecp_nistz256_mul_montx
+.Lmul_mont_done:
+	movq	0(%rsp),%r15
+.cfi_restore	%r15
+	movq	8(%rsp),%r14
+.cfi_restore	%r14
+	movq	16(%rsp),%r13
+.cfi_restore	%r13
+	movq	24(%rsp),%r12
+.cfi_restore	%r12
+	movq	32(%rsp),%rbx
+.cfi_restore	%rbx
+	movq	40(%rsp),%rbp
+.cfi_restore	%rbp
+	leaq	48(%rsp),%rsp
+.cfi_adjust_cfa_offset	-48
+.Lmul_epilogue:
+	.byte	0xf3,0xc3
+.cfi_endproc	
+.size	ecp_nistz256_mul_mont,.-ecp_nistz256_mul_mont
+
+.type	__ecp_nistz256_mul_montq,@function
+.align	32
+__ecp_nistz256_mul_montq:
+.cfi_startproc	
+
+
+	movq	%rax,%rbp
+	mulq	%r9
+	movq	.Lpoly+8(%rip),%r14
+	movq	%rax,%r8
+	movq	%rbp,%rax
+	movq	%rdx,%r9
+
+	mulq	%r10
+	movq	.Lpoly+24(%rip),%r15
+	addq	%rax,%r9
+	movq	%rbp,%rax
+	adcq	$0,%rdx
+	movq	%rdx,%r10
+
+	mulq	%r11
+	addq	%rax,%r10
+	movq	%rbp,%rax
+	adcq	$0,%rdx
+	movq	%rdx,%r11
+
+	mulq	%r12
+	addq	%rax,%r11
+	movq	%r8,%rax
+	adcq	$0,%rdx
+	xorq	%r13,%r13
+	movq	%rdx,%r12
+
+
+
+
+
+
+
+
+
+
+	movq	%r8,%rbp
+	shlq	$32,%r8
+	mulq	%r15
+	shrq	$32,%rbp
+	addq	%r8,%r9
+	adcq	%rbp,%r10
+	adcq	%rax,%r11
+	movq	8(%rbx),%rax
+	adcq	%rdx,%r12
+	adcq	$0,%r13
+	xorq	%r8,%r8
+
+
+
+	movq	%rax,%rbp
+	mulq	0(%rsi)
+	addq	%rax,%r9
+	movq	%rbp,%rax
+	adcq	$0,%rdx
+	movq	%rdx,%rcx
+
+	mulq	8(%rsi)
+	addq	%rcx,%r10
+	adcq	$0,%rdx
+	addq	%rax,%r10
+	movq	%rbp,%rax
+	adcq	$0,%rdx
+	movq	%rdx,%rcx
+
+	mulq	16(%rsi)
+	addq	%rcx,%r11
+	adcq	$0,%rdx
+	addq	%rax,%r11
+	movq	%rbp,%rax
+	adcq	$0,%rdx
+	movq	%rdx,%rcx
+
+	mulq	24(%rsi)
+	addq	%rcx,%r12
+	adcq	$0,%rdx
+	addq	%rax,%r12
+	movq	%r9,%rax
+	adcq	%rdx,%r13
+	adcq	$0,%r8
+
+
+
+	movq	%r9,%rbp
+	shlq	$32,%r9
+	mulq	%r15
+	shrq	$32,%rbp
+	addq	%r9,%r10
+	adcq	%rbp,%r11
+	adcq	%rax,%r12
+	movq	16(%rbx),%rax
+	adcq	%rdx,%r13
+	adcq	$0,%r8
+	xorq	%r9,%r9
+
+
+
+	movq	%rax,%rbp
+	mulq	0(%rsi)
+	addq	%rax,%r10
+	movq	%rbp,%rax
+	adcq	$0,%rdx
+	movq	%rdx,%rcx
+
+	mulq	8(%rsi)
+	addq	%rcx,%r11
+	adcq	$0,%rdx
+	addq	%rax,%r11
+	movq	%rbp,%rax
+	adcq	$0,%rdx
+	movq	%rdx,%rcx
+
+	mulq	16(%rsi)
+	addq	%rcx,%r12
+	adcq	$0,%rdx
+	addq	%rax,%r12
+	movq	%rbp,%rax
+	adcq	$0,%rdx
+	movq	%rdx,%rcx
+
+	mulq	24(%rsi)
+	addq	%rcx,%r13
+	adcq	$0,%rdx
+	addq	%rax,%r13
+	movq	%r10,%rax
+	adcq	%rdx,%r8
+	adcq	$0,%r9
+
+
+
+	movq	%r10,%rbp
+	shlq	$32,%r10
+	mulq	%r15
+	shrq	$32,%rbp
+	addq	%r10,%r11
+	adcq	%rbp,%r12
+	adcq	%rax,%r13
+	movq	24(%rbx),%rax
+	adcq	%rdx,%r8
+	adcq	$0,%r9
+	xorq	%r10,%r10
+
+
+
+	movq	%rax,%rbp
+	mulq	0(%rsi)
+	addq	%rax,%r11
+	movq	%rbp,%rax
+	adcq	$0,%rdx
+	movq	%rdx,%rcx
+
+	mulq	8(%rsi)
+	addq	%rcx,%r12
+	adcq	$0,%rdx
+	addq	%rax,%r12
+	movq	%rbp,%rax
+	adcq	$0,%rdx
+	movq	%rdx,%rcx
+
+	mulq	16(%rsi)
+	addq	%rcx,%r13
+	adcq	$0,%rdx
+	addq	%rax,%r13
+	movq	%rbp,%rax
+	adcq	$0,%rdx
+	movq	%rdx,%rcx
+
+	mulq	24(%rsi)
+	addq	%rcx,%r8
+	adcq	$0,%rdx
+	addq	%rax,%r8
+	movq	%r11,%rax
+	adcq	%rdx,%r9
+	adcq	$0,%r10
+
+
+
+	movq	%r11,%rbp
+	shlq	$32,%r11
+	mulq	%r15
+	shrq	$32,%rbp
+	addq	%r11,%r12
+	adcq	%rbp,%r13
+	movq	%r12,%rcx
+	adcq	%rax,%r8
+	adcq	%rdx,%r9
+	movq	%r13,%rbp
+	adcq	$0,%r10
+
+
+
+	subq	$-1,%r12
+	movq	%r8,%rbx
+	sbbq	%r14,%r13
+	sbbq	$0,%r8
+	movq	%r9,%rdx
+	sbbq	%r15,%r9
+	sbbq	$0,%r10
+
+	cmovcq	%rcx,%r12
+	cmovcq	%rbp,%r13
+	movq	%r12,0(%rdi)
+	cmovcq	%rbx,%r8
+	movq	%r13,8(%rdi)
+	cmovcq	%rdx,%r9
+	movq	%r8,16(%rdi)
+	movq	%r9,24(%rdi)
+
+	.byte	0xf3,0xc3
+.cfi_endproc	
+.size	__ecp_nistz256_mul_montq,.-__ecp_nistz256_mul_montq
+
+
+
+
+
+
+
+
+.globl	ecp_nistz256_sqr_mont
+.hidden ecp_nistz256_sqr_mont
+.type	ecp_nistz256_sqr_mont,@function
+.align	32
+ecp_nistz256_sqr_mont:
+.cfi_startproc	
+	leaq	OPENSSL_ia32cap_P(%rip),%rcx
+	movq	8(%rcx),%rcx
+	andl	$0x80100,%ecx
+	pushq	%rbp
+.cfi_adjust_cfa_offset	8
+.cfi_offset	%rbp,-16
+	pushq	%rbx
+.cfi_adjust_cfa_offset	8
+.cfi_offset	%rbx,-24
+	pushq	%r12
+.cfi_adjust_cfa_offset	8
+.cfi_offset	%r12,-32
+	pushq	%r13
+.cfi_adjust_cfa_offset	8
+.cfi_offset	%r13,-40
+	pushq	%r14
+.cfi_adjust_cfa_offset	8
+.cfi_offset	%r14,-48
+	pushq	%r15
+.cfi_adjust_cfa_offset	8
+.cfi_offset	%r15,-56
+.Lsqr_body:
+	cmpl	$0x80100,%ecx
+	je	.Lsqr_montx
+	movq	0(%rsi),%rax
+	movq	8(%rsi),%r14
+	movq	16(%rsi),%r15
+	movq	24(%rsi),%r8
+
+	call	__ecp_nistz256_sqr_montq
+	jmp	.Lsqr_mont_done
+
+.align	32
+.Lsqr_montx:
+	movq	0(%rsi),%rdx
+	movq	8(%rsi),%r14
+	movq	16(%rsi),%r15
+	movq	24(%rsi),%r8
+	leaq	-128(%rsi),%rsi
+
+	call	__ecp_nistz256_sqr_montx
+.Lsqr_mont_done:
+	movq	0(%rsp),%r15
+.cfi_restore	%r15
+	movq	8(%rsp),%r14
+.cfi_restore	%r14
+	movq	16(%rsp),%r13
+.cfi_restore	%r13
+	movq	24(%rsp),%r12
+.cfi_restore	%r12
+	movq	32(%rsp),%rbx
+.cfi_restore	%rbx
+	movq	40(%rsp),%rbp
+.cfi_restore	%rbp
+	leaq	48(%rsp),%rsp
+.cfi_adjust_cfa_offset	-48
+.Lsqr_epilogue:
+	.byte	0xf3,0xc3
+.cfi_endproc	
+.size	ecp_nistz256_sqr_mont,.-ecp_nistz256_sqr_mont
+
+.type	__ecp_nistz256_sqr_montq,@function
+.align	32
+__ecp_nistz256_sqr_montq:
+.cfi_startproc	
+	movq	%rax,%r13
+	mulq	%r14
+	movq	%rax,%r9
+	movq	%r15,%rax
+	movq	%rdx,%r10
+
+	mulq	%r13
+	addq	%rax,%r10
+	movq	%r8,%rax
+	adcq	$0,%rdx
+	movq	%rdx,%r11
+
+	mulq	%r13
+	addq	%rax,%r11
+	movq	%r15,%rax
+	adcq	$0,%rdx
+	movq	%rdx,%r12
+
+
+	mulq	%r14
+	addq	%rax,%r11
+	movq	%r8,%rax
+	adcq	$0,%rdx
+	movq	%rdx,%rbp
+
+	mulq	%r14
+	addq	%rax,%r12
+	movq	%r8,%rax
+	adcq	$0,%rdx
+	addq	%rbp,%r12
+	movq	%rdx,%r13
+	adcq	$0,%r13
+
+
+	mulq	%r15
+	xorq	%r15,%r15
+	addq	%rax,%r13
+	movq	0(%rsi),%rax
+	movq	%rdx,%r14
+	adcq	$0,%r14
+
+	addq	%r9,%r9
+	adcq	%r10,%r10
+	adcq	%r11,%r11
+	adcq	%r12,%r12
+	adcq	%r13,%r13
+	adcq	%r14,%r14
+	adcq	$0,%r15
+
+	mulq	%rax
+	movq	%rax,%r8
+	movq	8(%rsi),%rax
+	movq	%rdx,%rcx
+
+	mulq	%rax
+	addq	%rcx,%r9
+	adcq	%rax,%r10
+	movq	16(%rsi),%rax
+	adcq	$0,%rdx
+	movq	%rdx,%rcx
+
+	mulq	%rax
+	addq	%rcx,%r11
+	adcq	%rax,%r12
+	movq	24(%rsi),%rax
+	adcq	$0,%rdx
+	movq	%rdx,%rcx
+
+	mulq	%rax
+	addq	%rcx,%r13
+	adcq	%rax,%r14
+	movq	%r8,%rax
+	adcq	%rdx,%r15
+
+	movq	.Lpoly+8(%rip),%rsi
+	movq	.Lpoly+24(%rip),%rbp
+
+
+
+
+	movq	%r8,%rcx
+	shlq	$32,%r8
+	mulq	%rbp
+	shrq	$32,%rcx
+	addq	%r8,%r9
+	adcq	%rcx,%r10
+	adcq	%rax,%r11
+	movq	%r9,%rax
+	adcq	$0,%rdx
+
+
+
+	movq	%r9,%rcx
+	shlq	$32,%r9
+	movq	%rdx,%r8
+	mulq	%rbp
+	shrq	$32,%rcx
+	addq	%r9,%r10
+	adcq	%rcx,%r11
+	adcq	%rax,%r8
+	movq	%r10,%rax
+	adcq	$0,%rdx
+
+
+
+	movq	%r10,%rcx
+	shlq	$32,%r10
+	movq	%rdx,%r9
+	mulq	%rbp
+	shrq	$32,%rcx
+	addq	%r10,%r11
+	adcq	%rcx,%r8
+	adcq	%rax,%r9
+	movq	%r11,%rax
+	adcq	$0,%rdx
+
+
+
+	movq	%r11,%rcx
+	shlq	$32,%r11
+	movq	%rdx,%r10
+	mulq	%rbp
+	shrq	$32,%rcx
+	addq	%r11,%r8
+	adcq	%rcx,%r9
+	adcq	%rax,%r10
+	adcq	$0,%rdx
+	xorq	%r11,%r11
+
+
+
+	addq	%r8,%r12
+	adcq	%r9,%r13
+	movq	%r12,%r8
+	adcq	%r10,%r14
+	adcq	%rdx,%r15
+	movq	%r13,%r9
+	adcq	$0,%r11
+
+	subq	$-1,%r12
+	movq	%r14,%r10
+	sbbq	%rsi,%r13
+	sbbq	$0,%r14
+	movq	%r15,%rcx
+	sbbq	%rbp,%r15
+	sbbq	$0,%r11
+
+	cmovcq	%r8,%r12
+	cmovcq	%r9,%r13
+	movq	%r12,0(%rdi)
+	cmovcq	%r10,%r14
+	movq	%r13,8(%rdi)
+	cmovcq	%rcx,%r15
+	movq	%r14,16(%rdi)
+	movq	%r15,24(%rdi)
+
+	.byte	0xf3,0xc3
+.cfi_endproc	
+.size	__ecp_nistz256_sqr_montq,.-__ecp_nistz256_sqr_montq
+.type	__ecp_nistz256_mul_montx,@function
+.align	32
+__ecp_nistz256_mul_montx:
+.cfi_startproc	
+
+
+	mulxq	%r9,%r8,%r9
+	mulxq	%r10,%rcx,%r10
+	movq	$32,%r14
+	xorq	%r13,%r13
+	mulxq	%r11,%rbp,%r11
+	movq	.Lpoly+24(%rip),%r15
+	adcq	%rcx,%r9
+	mulxq	%r12,%rcx,%r12
+	movq	%r8,%rdx
+	adcq	%rbp,%r10
+	shlxq	%r14,%r8,%rbp
+	adcq	%rcx,%r11
+	shrxq	%r14,%r8,%rcx
+	adcq	$0,%r12
+
+
+
+	addq	%rbp,%r9
+	adcq	%rcx,%r10
+
+	mulxq	%r15,%rcx,%rbp
+	movq	8(%rbx),%rdx
+	adcq	%rcx,%r11
+	adcq	%rbp,%r12
+	adcq	$0,%r13
+	xorq	%r8,%r8
+
+
+
+	mulxq	0+128(%rsi),%rcx,%rbp
+	adcxq	%rcx,%r9
+	adoxq	%rbp,%r10
+
+	mulxq	8+128(%rsi),%rcx,%rbp
+	adcxq	%rcx,%r10
+	adoxq	%rbp,%r11
+
+	mulxq	16+128(%rsi),%rcx,%rbp
+	adcxq	%rcx,%r11
+	adoxq	%rbp,%r12
+
+	mulxq	24+128(%rsi),%rcx,%rbp
+	movq	%r9,%rdx
+	adcxq	%rcx,%r12
+	shlxq	%r14,%r9,%rcx
+	adoxq	%rbp,%r13
+	shrxq	%r14,%r9,%rbp
+
+	adcxq	%r8,%r13
+	adoxq	%r8,%r8
+	adcq	$0,%r8
+
+
+
+	addq	%rcx,%r10
+	adcq	%rbp,%r11
+
+	mulxq	%r15,%rcx,%rbp
+	movq	16(%rbx),%rdx
+	adcq	%rcx,%r12
+	adcq	%rbp,%r13
+	adcq	$0,%r8
+	xorq	%r9,%r9
+
+
+
+	mulxq	0+128(%rsi),%rcx,%rbp
+	adcxq	%rcx,%r10
+	adoxq	%rbp,%r11
+
+	mulxq	8+128(%rsi),%rcx,%rbp
+	adcxq	%rcx,%r11
+	adoxq	%rbp,%r12
+
+	mulxq	16+128(%rsi),%rcx,%rbp
+	adcxq	%rcx,%r12
+	adoxq	%rbp,%r13
+
+	mulxq	24+128(%rsi),%rcx,%rbp
+	movq	%r10,%rdx
+	adcxq	%rcx,%r13
+	shlxq	%r14,%r10,%rcx
+	adoxq	%rbp,%r8
+	shrxq	%r14,%r10,%rbp
+
+	adcxq	%r9,%r8
+	adoxq	%r9,%r9
+	adcq	$0,%r9
+
+
+
+	addq	%rcx,%r11
+	adcq	%rbp,%r12
+
+	mulxq	%r15,%rcx,%rbp
+	movq	24(%rbx),%rdx
+	adcq	%rcx,%r13
+	adcq	%rbp,%r8
+	adcq	$0,%r9
+	xorq	%r10,%r10
+
+
+
+	mulxq	0+128(%rsi),%rcx,%rbp
+	adcxq	%rcx,%r11
+	adoxq	%rbp,%r12
+
+	mulxq	8+128(%rsi),%rcx,%rbp
+	adcxq	%rcx,%r12
+	adoxq	%rbp,%r13
+
+	mulxq	16+128(%rsi),%rcx,%rbp
+	adcxq	%rcx,%r13
+	adoxq	%rbp,%r8
+
+	mulxq	24+128(%rsi),%rcx,%rbp
+	movq	%r11,%rdx
+	adcxq	%rcx,%r8
+	shlxq	%r14,%r11,%rcx
+	adoxq	%rbp,%r9
+	shrxq	%r14,%r11,%rbp
+
+	adcxq	%r10,%r9
+	adoxq	%r10,%r10
+	adcq	$0,%r10
+
+
+
+	addq	%rcx,%r12
+	adcq	%rbp,%r13
+
+	mulxq	%r15,%rcx,%rbp
+	movq	%r12,%rbx
+	movq	.Lpoly+8(%rip),%r14
+	adcq	%rcx,%r8
+	movq	%r13,%rdx
+	adcq	%rbp,%r9
+	adcq	$0,%r10
+
+
+
+	xorl	%eax,%eax
+	movq	%r8,%rcx
+	sbbq	$-1,%r12
+	sbbq	%r14,%r13
+	sbbq	$0,%r8
+	movq	%r9,%rbp
+	sbbq	%r15,%r9
+	sbbq	$0,%r10
+
+	cmovcq	%rbx,%r12
+	cmovcq	%rdx,%r13
+	movq	%r12,0(%rdi)
+	cmovcq	%rcx,%r8
+	movq	%r13,8(%rdi)
+	cmovcq	%rbp,%r9
+	movq	%r8,16(%rdi)
+	movq	%r9,24(%rdi)
+
+	.byte	0xf3,0xc3
+.cfi_endproc	
+.size	__ecp_nistz256_mul_montx,.-__ecp_nistz256_mul_montx
+
+.type	__ecp_nistz256_sqr_montx,@function
+.align	32
+__ecp_nistz256_sqr_montx:
+.cfi_startproc	
+	mulxq	%r14,%r9,%r10
+	mulxq	%r15,%rcx,%r11
+	xorl	%eax,%eax
+	adcq	%rcx,%r10
+	mulxq	%r8,%rbp,%r12
+	movq	%r14,%rdx
+	adcq	%rbp,%r11
+	adcq	$0,%r12
+	xorq	%r13,%r13
+
+
+	mulxq	%r15,%rcx,%rbp
+	adcxq	%rcx,%r11
+	adoxq	%rbp,%r12
+
+	mulxq	%r8,%rcx,%rbp
+	movq	%r15,%rdx
+	adcxq	%rcx,%r12
+	adoxq	%rbp,%r13
+	adcq	$0,%r13
+
+
+	mulxq	%r8,%rcx,%r14
+	movq	0+128(%rsi),%rdx
+	xorq	%r15,%r15
+	adcxq	%r9,%r9
+	adoxq	%rcx,%r13
+	adcxq	%r10,%r10
+	adoxq	%r15,%r14
+
+	mulxq	%rdx,%r8,%rbp
+	movq	8+128(%rsi),%rdx
+	adcxq	%r11,%r11
+	adoxq	%rbp,%r9
+	adcxq	%r12,%r12
+	mulxq	%rdx,%rcx,%rax
+	movq	16+128(%rsi),%rdx
+	adcxq	%r13,%r13
+	adoxq	%rcx,%r10
+	adcxq	%r14,%r14
+.byte	0x67
+	mulxq	%rdx,%rcx,%rbp
+	movq	24+128(%rsi),%rdx
+	adoxq	%rax,%r11
+	adcxq	%r15,%r15
+	adoxq	%rcx,%r12
+	movq	$32,%rsi
+	adoxq	%rbp,%r13
+.byte	0x67,0x67
+	mulxq	%rdx,%rcx,%rax
+	movq	.Lpoly+24(%rip),%rdx
+	adoxq	%rcx,%r14
+	shlxq	%rsi,%r8,%rcx
+	adoxq	%rax,%r15
+	shrxq	%rsi,%r8,%rax
+	movq	%rdx,%rbp
+
+
+	addq	%rcx,%r9
+	adcq	%rax,%r10
+
+	mulxq	%r8,%rcx,%r8
+	adcq	%rcx,%r11
+	shlxq	%rsi,%r9,%rcx
+	adcq	$0,%r8
+	shrxq	%rsi,%r9,%rax
+
+
+	addq	%rcx,%r10
+	adcq	%rax,%r11
+
+	mulxq	%r9,%rcx,%r9
+	adcq	%rcx,%r8
+	shlxq	%rsi,%r10,%rcx
+	adcq	$0,%r9
+	shrxq	%rsi,%r10,%rax
+
+
+	addq	%rcx,%r11
+	adcq	%rax,%r8
+
+	mulxq	%r10,%rcx,%r10
+	adcq	%rcx,%r9
+	shlxq	%rsi,%r11,%rcx
+	adcq	$0,%r10
+	shrxq	%rsi,%r11,%rax
+
+
+	addq	%rcx,%r8
+	adcq	%rax,%r9
+
+	mulxq	%r11,%rcx,%r11
+	adcq	%rcx,%r10
+	adcq	$0,%r11
+
+	xorq	%rdx,%rdx
+	addq	%r8,%r12
+	movq	.Lpoly+8(%rip),%rsi
+	adcq	%r9,%r13
+	movq	%r12,%r8
+	adcq	%r10,%r14
+	adcq	%r11,%r15
+	movq	%r13,%r9
+	adcq	$0,%rdx
+
+	subq	$-1,%r12
+	movq	%r14,%r10
+	sbbq	%rsi,%r13
+	sbbq	$0,%r14
+	movq	%r15,%r11
+	sbbq	%rbp,%r15
+	sbbq	$0,%rdx
+
+	cmovcq	%r8,%r12
+	cmovcq	%r9,%r13
+	movq	%r12,0(%rdi)
+	cmovcq	%r10,%r14
+	movq	%r13,8(%rdi)
+	cmovcq	%r11,%r15
+	movq	%r14,16(%rdi)
+	movq	%r15,24(%rdi)
+
+	.byte	0xf3,0xc3
+.cfi_endproc	
+.size	__ecp_nistz256_sqr_montx,.-__ecp_nistz256_sqr_montx
+
+
+.globl	ecp_nistz256_select_w5
+.hidden ecp_nistz256_select_w5
+.type	ecp_nistz256_select_w5,@function
+.align	32
+ecp_nistz256_select_w5:
+.cfi_startproc	
+	leaq	OPENSSL_ia32cap_P(%rip),%rax
+	movq	8(%rax),%rax
+	testl	$32,%eax
+	jnz	.Lavx2_select_w5
+	movdqa	.LOne(%rip),%xmm0
+	movd	%edx,%xmm1
+
+	pxor	%xmm2,%xmm2
+	pxor	%xmm3,%xmm3
+	pxor	%xmm4,%xmm4
+	pxor	%xmm5,%xmm5
+	pxor	%xmm6,%xmm6
+	pxor	%xmm7,%xmm7
+
+	movdqa	%xmm0,%xmm8
+	pshufd	$0,%xmm1,%xmm1
+
+	movq	$16,%rax
+.Lselect_loop_sse_w5:
+
+	movdqa	%xmm8,%xmm15
+	paddd	%xmm0,%xmm8
+	pcmpeqd	%xmm1,%xmm15
+
+	movdqa	0(%rsi),%xmm9
+	movdqa	16(%rsi),%xmm10
+	movdqa	32(%rsi),%xmm11
+	movdqa	48(%rsi),%xmm12
+	movdqa	64(%rsi),%xmm13
+	movdqa	80(%rsi),%xmm14
+	leaq	96(%rsi),%rsi
+
+	pand	%xmm15,%xmm9
+	pand	%xmm15,%xmm10
+	por	%xmm9,%xmm2
+	pand	%xmm15,%xmm11
+	por	%xmm10,%xmm3
+	pand	%xmm15,%xmm12
+	por	%xmm11,%xmm4
+	pand	%xmm15,%xmm13
+	por	%xmm12,%xmm5
+	pand	%xmm15,%xmm14
+	por	%xmm13,%xmm6
+	por	%xmm14,%xmm7
+
+	decq	%rax
+	jnz	.Lselect_loop_sse_w5
+
+	movdqu	%xmm2,0(%rdi)
+	movdqu	%xmm3,16(%rdi)
+	movdqu	%xmm4,32(%rdi)
+	movdqu	%xmm5,48(%rdi)
+	movdqu	%xmm6,64(%rdi)
+	movdqu	%xmm7,80(%rdi)
+	.byte	0xf3,0xc3
+.cfi_endproc	
+.LSEH_end_ecp_nistz256_select_w5:
+.size	ecp_nistz256_select_w5,.-ecp_nistz256_select_w5
+
+
+
+.globl	ecp_nistz256_select_w7
+.hidden ecp_nistz256_select_w7
+.type	ecp_nistz256_select_w7,@function
+.align	32
+ecp_nistz256_select_w7:
+.cfi_startproc	
+	leaq	OPENSSL_ia32cap_P(%rip),%rax
+	movq	8(%rax),%rax
+	testl	$32,%eax
+	jnz	.Lavx2_select_w7
+	movdqa	.LOne(%rip),%xmm8
+	movd	%edx,%xmm1
+
+	pxor	%xmm2,%xmm2
+	pxor	%xmm3,%xmm3
+	pxor	%xmm4,%xmm4
+	pxor	%xmm5,%xmm5
+
+	movdqa	%xmm8,%xmm0
+	pshufd	$0,%xmm1,%xmm1
+	movq	$64,%rax
+
+.Lselect_loop_sse_w7:
+	movdqa	%xmm8,%xmm15
+	paddd	%xmm0,%xmm8
+	movdqa	0(%rsi),%xmm9
+	movdqa	16(%rsi),%xmm10
+	pcmpeqd	%xmm1,%xmm15
+	movdqa	32(%rsi),%xmm11
+	movdqa	48(%rsi),%xmm12
+	leaq	64(%rsi),%rsi
+
+	pand	%xmm15,%xmm9
+	pand	%xmm15,%xmm10
+	por	%xmm9,%xmm2
+	pand	%xmm15,%xmm11
+	por	%xmm10,%xmm3
+	pand	%xmm15,%xmm12
+	por	%xmm11,%xmm4
+	prefetcht0	255(%rsi)
+	por	%xmm12,%xmm5
+
+	decq	%rax
+	jnz	.Lselect_loop_sse_w7
+
+	movdqu	%xmm2,0(%rdi)
+	movdqu	%xmm3,16(%rdi)
+	movdqu	%xmm4,32(%rdi)
+	movdqu	%xmm5,48(%rdi)
+	.byte	0xf3,0xc3
+.cfi_endproc	
+.LSEH_end_ecp_nistz256_select_w7:
+.size	ecp_nistz256_select_w7,.-ecp_nistz256_select_w7
+
+
+.type	ecp_nistz256_avx2_select_w5,@function
+.align	32
+ecp_nistz256_avx2_select_w5:
+.cfi_startproc	
+.Lavx2_select_w5:
+	vzeroupper
+	vmovdqa	.LTwo(%rip),%ymm0
+
+	vpxor	%ymm2,%ymm2,%ymm2
+	vpxor	%ymm3,%ymm3,%ymm3
+	vpxor	%ymm4,%ymm4,%ymm4
+
+	vmovdqa	.LOne(%rip),%ymm5
+	vmovdqa	.LTwo(%rip),%ymm10
+
+	vmovd	%edx,%xmm1
+	vpermd	%ymm1,%ymm2,%ymm1
+
+	movq	$8,%rax
+.Lselect_loop_avx2_w5:
+
+	vmovdqa	0(%rsi),%ymm6
+	vmovdqa	32(%rsi),%ymm7
+	vmovdqa	64(%rsi),%ymm8
+
+	vmovdqa	96(%rsi),%ymm11
+	vmovdqa	128(%rsi),%ymm12
+	vmovdqa	160(%rsi),%ymm13
+
+	vpcmpeqd	%ymm1,%ymm5,%ymm9
+	vpcmpeqd	%ymm1,%ymm10,%ymm14
+
+	vpaddd	%ymm0,%ymm5,%ymm5
+	vpaddd	%ymm0,%ymm10,%ymm10
+	leaq	192(%rsi),%rsi
+
+	vpand	%ymm9,%ymm6,%ymm6
+	vpand	%ymm9,%ymm7,%ymm7
+	vpand	%ymm9,%ymm8,%ymm8
+	vpand	%ymm14,%ymm11,%ymm11
+	vpand	%ymm14,%ymm12,%ymm12
+	vpand	%ymm14,%ymm13,%ymm13
+
+	vpxor	%ymm6,%ymm2,%ymm2
+	vpxor	%ymm7,%ymm3,%ymm3
+	vpxor	%ymm8,%ymm4,%ymm4
+	vpxor	%ymm11,%ymm2,%ymm2
+	vpxor	%ymm12,%ymm3,%ymm3
+	vpxor	%ymm13,%ymm4,%ymm4
+
+	decq	%rax
+	jnz	.Lselect_loop_avx2_w5
+
+	vmovdqu	%ymm2,0(%rdi)
+	vmovdqu	%ymm3,32(%rdi)
+	vmovdqu	%ymm4,64(%rdi)
+	vzeroupper
+	.byte	0xf3,0xc3
+.cfi_endproc	
+.LSEH_end_ecp_nistz256_avx2_select_w5:
+.size	ecp_nistz256_avx2_select_w5,.-ecp_nistz256_avx2_select_w5
+
+
+
+.globl	ecp_nistz256_avx2_select_w7
+.hidden ecp_nistz256_avx2_select_w7
+.type	ecp_nistz256_avx2_select_w7,@function
+.align	32
+ecp_nistz256_avx2_select_w7:
+.cfi_startproc	
+.Lavx2_select_w7:
+	vzeroupper
+	vmovdqa	.LThree(%rip),%ymm0
+
+	vpxor	%ymm2,%ymm2,%ymm2
+	vpxor	%ymm3,%ymm3,%ymm3
+
+	vmovdqa	.LOne(%rip),%ymm4
+	vmovdqa	.LTwo(%rip),%ymm8
+	vmovdqa	.LThree(%rip),%ymm12
+
+	vmovd	%edx,%xmm1
+	vpermd	%ymm1,%ymm2,%ymm1
+
+
+	movq	$21,%rax
+.Lselect_loop_avx2_w7:
+
+	vmovdqa	0(%rsi),%ymm5
+	vmovdqa	32(%rsi),%ymm6
+
+	vmovdqa	64(%rsi),%ymm9
+	vmovdqa	96(%rsi),%ymm10
+
+	vmovdqa	128(%rsi),%ymm13
+	vmovdqa	160(%rsi),%ymm14
+
+	vpcmpeqd	%ymm1,%ymm4,%ymm7
+	vpcmpeqd	%ymm1,%ymm8,%ymm11
+	vpcmpeqd	%ymm1,%ymm12,%ymm15
+
+	vpaddd	%ymm0,%ymm4,%ymm4
+	vpaddd	%ymm0,%ymm8,%ymm8
+	vpaddd	%ymm0,%ymm12,%ymm12
+	leaq	192(%rsi),%rsi
+
+	vpand	%ymm7,%ymm5,%ymm5
+	vpand	%ymm7,%ymm6,%ymm6
+	vpand	%ymm11,%ymm9,%ymm9
+	vpand	%ymm11,%ymm10,%ymm10
+	vpand	%ymm15,%ymm13,%ymm13
+	vpand	%ymm15,%ymm14,%ymm14
+
+	vpxor	%ymm5,%ymm2,%ymm2
+	vpxor	%ymm6,%ymm3,%ymm3
+	vpxor	%ymm9,%ymm2,%ymm2
+	vpxor	%ymm10,%ymm3,%ymm3
+	vpxor	%ymm13,%ymm2,%ymm2
+	vpxor	%ymm14,%ymm3,%ymm3
+
+	decq	%rax
+	jnz	.Lselect_loop_avx2_w7
+
+
+	vmovdqa	0(%rsi),%ymm5
+	vmovdqa	32(%rsi),%ymm6
+
+	vpcmpeqd	%ymm1,%ymm4,%ymm7
+
+	vpand	%ymm7,%ymm5,%ymm5
+	vpand	%ymm7,%ymm6,%ymm6
+
+	vpxor	%ymm5,%ymm2,%ymm2
+	vpxor	%ymm6,%ymm3,%ymm3
+
+	vmovdqu	%ymm2,0(%rdi)
+	vmovdqu	%ymm3,32(%rdi)
+	vzeroupper
+	.byte	0xf3,0xc3
+.cfi_endproc	
+.LSEH_end_ecp_nistz256_avx2_select_w7:
+.size	ecp_nistz256_avx2_select_w7,.-ecp_nistz256_avx2_select_w7
+.type	__ecp_nistz256_add_toq,@function
+.align	32
+__ecp_nistz256_add_toq:
+.cfi_startproc	
+	xorq	%r11,%r11
+	addq	0(%rbx),%r12
+	adcq	8(%rbx),%r13
+	movq	%r12,%rax
+	adcq	16(%rbx),%r8
+	adcq	24(%rbx),%r9
+	movq	%r13,%rbp
+	adcq	$0,%r11
+
+	subq	$-1,%r12
+	movq	%r8,%rcx
+	sbbq	%r14,%r13
+	sbbq	$0,%r8
+	movq	%r9,%r10
+	sbbq	%r15,%r9
+	sbbq	$0,%r11
+
+	cmovcq	%rax,%r12
+	cmovcq	%rbp,%r13
+	movq	%r12,0(%rdi)
+	cmovcq	%rcx,%r8
+	movq	%r13,8(%rdi)
+	cmovcq	%r10,%r9
+	movq	%r8,16(%rdi)
+	movq	%r9,24(%rdi)
+
+	.byte	0xf3,0xc3
+.cfi_endproc	
+.size	__ecp_nistz256_add_toq,.-__ecp_nistz256_add_toq
+
+.type	__ecp_nistz256_sub_fromq,@function
+.align	32
+__ecp_nistz256_sub_fromq:
+.cfi_startproc	
+	subq	0(%rbx),%r12
+	sbbq	8(%rbx),%r13
+	movq	%r12,%rax
+	sbbq	16(%rbx),%r8
+	sbbq	24(%rbx),%r9
+	movq	%r13,%rbp
+	sbbq	%r11,%r11
+
+	addq	$-1,%r12
+	movq	%r8,%rcx
+	adcq	%r14,%r13
+	adcq	$0,%r8
+	movq	%r9,%r10
+	adcq	%r15,%r9
+	testq	%r11,%r11
+
+	cmovzq	%rax,%r12
+	cmovzq	%rbp,%r13
+	movq	%r12,0(%rdi)
+	cmovzq	%rcx,%r8
+	movq	%r13,8(%rdi)
+	cmovzq	%r10,%r9
+	movq	%r8,16(%rdi)
+	movq	%r9,24(%rdi)
+
+	.byte	0xf3,0xc3
+.cfi_endproc	
+.size	__ecp_nistz256_sub_fromq,.-__ecp_nistz256_sub_fromq
+
+.type	__ecp_nistz256_subq,@function
+.align	32
+__ecp_nistz256_subq:
+.cfi_startproc	
+	subq	%r12,%rax
+	sbbq	%r13,%rbp
+	movq	%rax,%r12
+	sbbq	%r8,%rcx
+	sbbq	%r9,%r10
+	movq	%rbp,%r13
+	sbbq	%r11,%r11
+
+	addq	$-1,%rax
+	movq	%rcx,%r8
+	adcq	%r14,%rbp
+	adcq	$0,%rcx
+	movq	%r10,%r9
+	adcq	%r15,%r10
+	testq	%r11,%r11
+
+	cmovnzq	%rax,%r12
+	cmovnzq	%rbp,%r13
+	cmovnzq	%rcx,%r8
+	cmovnzq	%r10,%r9
+
+	.byte	0xf3,0xc3
+.cfi_endproc	
+.size	__ecp_nistz256_subq,.-__ecp_nistz256_subq
+
+.type	__ecp_nistz256_mul_by_2q,@function
+.align	32
+__ecp_nistz256_mul_by_2q:
+.cfi_startproc	
+	xorq	%r11,%r11
+	addq	%r12,%r12
+	adcq	%r13,%r13
+	movq	%r12,%rax
+	adcq	%r8,%r8
+	adcq	%r9,%r9
+	movq	%r13,%rbp
+	adcq	$0,%r11
+
+	subq	$-1,%r12
+	movq	%r8,%rcx
+	sbbq	%r14,%r13
+	sbbq	$0,%r8
+	movq	%r9,%r10
+	sbbq	%r15,%r9
+	sbbq	$0,%r11
+
+	cmovcq	%rax,%r12
+	cmovcq	%rbp,%r13
+	movq	%r12,0(%rdi)
+	cmovcq	%rcx,%r8
+	movq	%r13,8(%rdi)
+	cmovcq	%r10,%r9
+	movq	%r8,16(%rdi)
+	movq	%r9,24(%rdi)
+
+	.byte	0xf3,0xc3
+.cfi_endproc	
+.size	__ecp_nistz256_mul_by_2q,.-__ecp_nistz256_mul_by_2q
+.globl	ecp_nistz256_point_double
+.hidden ecp_nistz256_point_double
+.type	ecp_nistz256_point_double,@function
+.align	32
+ecp_nistz256_point_double:
+.cfi_startproc	
+	leaq	OPENSSL_ia32cap_P(%rip),%rcx
+	movq	8(%rcx),%rcx
+	andl	$0x80100,%ecx
+	cmpl	$0x80100,%ecx
+	je	.Lpoint_doublex
+	pushq	%rbp
+.cfi_adjust_cfa_offset	8
+.cfi_offset	%rbp,-16
+	pushq	%rbx
+.cfi_adjust_cfa_offset	8
+.cfi_offset	%rbx,-24
+	pushq	%r12
+.cfi_adjust_cfa_offset	8
+.cfi_offset	%r12,-32
+	pushq	%r13
+.cfi_adjust_cfa_offset	8
+.cfi_offset	%r13,-40
+	pushq	%r14
+.cfi_adjust_cfa_offset	8
+.cfi_offset	%r14,-48
+	pushq	%r15
+.cfi_adjust_cfa_offset	8
+.cfi_offset	%r15,-56
+	subq	$160+8,%rsp
+.cfi_adjust_cfa_offset	32*5+8
+.Lpoint_doubleq_body:
+
+.Lpoint_double_shortcutq:
+	movdqu	0(%rsi),%xmm0
+	movq	%rsi,%rbx
+	movdqu	16(%rsi),%xmm1
+	movq	32+0(%rsi),%r12
+	movq	32+8(%rsi),%r13
+	movq	32+16(%rsi),%r8
+	movq	32+24(%rsi),%r9
+	movq	.Lpoly+8(%rip),%r14
+	movq	.Lpoly+24(%rip),%r15
+	movdqa	%xmm0,96(%rsp)
+	movdqa	%xmm1,96+16(%rsp)
+	leaq	32(%rdi),%r10
+	leaq	64(%rdi),%r11
+.byte	102,72,15,110,199
+.byte	102,73,15,110,202
+.byte	102,73,15,110,211
+
+	leaq	0(%rsp),%rdi
+	call	__ecp_nistz256_mul_by_2q
+
+	movq	64+0(%rsi),%rax
+	movq	64+8(%rsi),%r14
+	movq	64+16(%rsi),%r15
+	movq	64+24(%rsi),%r8
+	leaq	64-0(%rsi),%rsi
+	leaq	64(%rsp),%rdi
+	call	__ecp_nistz256_sqr_montq
+
+	movq	0+0(%rsp),%rax
+	movq	8+0(%rsp),%r14
+	leaq	0+0(%rsp),%rsi
+	movq	16+0(%rsp),%r15
+	movq	24+0(%rsp),%r8
+	leaq	0(%rsp),%rdi
+	call	__ecp_nistz256_sqr_montq
+
+	movq	32(%rbx),%rax
+	movq	64+0(%rbx),%r9
+	movq	64+8(%rbx),%r10
+	movq	64+16(%rbx),%r11
+	movq	64+24(%rbx),%r12
+	leaq	64-0(%rbx),%rsi
+	leaq	32(%rbx),%rbx
+.byte	102,72,15,126,215
+	call	__ecp_nistz256_mul_montq
+	call	__ecp_nistz256_mul_by_2q
+
+	movq	96+0(%rsp),%r12
+	movq	96+8(%rsp),%r13
+	leaq	64(%rsp),%rbx
+	movq	96+16(%rsp),%r8
+	movq	96+24(%rsp),%r9
+	leaq	32(%rsp),%rdi
+	call	__ecp_nistz256_add_toq
+
+	movq	96+0(%rsp),%r12
+	movq	96+8(%rsp),%r13
+	leaq	64(%rsp),%rbx
+	movq	96+16(%rsp),%r8
+	movq	96+24(%rsp),%r9
+	leaq	64(%rsp),%rdi
+	call	__ecp_nistz256_sub_fromq
+
+	movq	0+0(%rsp),%rax
+	movq	8+0(%rsp),%r14
+	leaq	0+0(%rsp),%rsi
+	movq	16+0(%rsp),%r15
+	movq	24+0(%rsp),%r8
+.byte	102,72,15,126,207
+	call	__ecp_nistz256_sqr_montq
+	xorq	%r9,%r9
+	movq	%r12,%rax
+	addq	$-1,%r12
+	movq	%r13,%r10
+	adcq	%rsi,%r13
+	movq	%r14,%rcx
+	adcq	$0,%r14
+	movq	%r15,%r8
+	adcq	%rbp,%r15
+	adcq	$0,%r9
+	xorq	%rsi,%rsi
+	testq	$1,%rax
+
+	cmovzq	%rax,%r12
+	cmovzq	%r10,%r13
+	cmovzq	%rcx,%r14
+	cmovzq	%r8,%r15
+	cmovzq	%rsi,%r9
+
+	movq	%r13,%rax
+	shrq	$1,%r12
+	shlq	$63,%rax
+	movq	%r14,%r10
+	shrq	$1,%r13
+	orq	%rax,%r12
+	shlq	$63,%r10
+	movq	%r15,%rcx
+	shrq	$1,%r14
+	orq	%r10,%r13
+	shlq	$63,%rcx
+	movq	%r12,0(%rdi)
+	shrq	$1,%r15
+	movq	%r13,8(%rdi)
+	shlq	$63,%r9
+	orq	%rcx,%r14
+	orq	%r9,%r15
+	movq	%r14,16(%rdi)
+	movq	%r15,24(%rdi)
+	movq	64(%rsp),%rax
+	leaq	64(%rsp),%rbx
+	movq	0+32(%rsp),%r9
+	movq	8+32(%rsp),%r10
+	leaq	0+32(%rsp),%rsi
+	movq	16+32(%rsp),%r11
+	movq	24+32(%rsp),%r12
+	leaq	32(%rsp),%rdi
+	call	__ecp_nistz256_mul_montq
+
+	leaq	128(%rsp),%rdi
+	call	__ecp_nistz256_mul_by_2q
+
+	leaq	32(%rsp),%rbx
+	leaq	32(%rsp),%rdi
+	call	__ecp_nistz256_add_toq
+
+	movq	96(%rsp),%rax
+	leaq	96(%rsp),%rbx
+	movq	0+0(%rsp),%r9
+	movq	8+0(%rsp),%r10
+	leaq	0+0(%rsp),%rsi
+	movq	16+0(%rsp),%r11
+	movq	24+0(%rsp),%r12
+	leaq	0(%rsp),%rdi
+	call	__ecp_nistz256_mul_montq
+
+	leaq	128(%rsp),%rdi
+	call	__ecp_nistz256_mul_by_2q
+
+	movq	0+32(%rsp),%rax
+	movq	8+32(%rsp),%r14
+	leaq	0+32(%rsp),%rsi
+	movq	16+32(%rsp),%r15
+	movq	24+32(%rsp),%r8
+.byte	102,72,15,126,199
+	call	__ecp_nistz256_sqr_montq
+
+	leaq	128(%rsp),%rbx
+	movq	%r14,%r8
+	movq	%r15,%r9
+	movq	%rsi,%r14
+	movq	%rbp,%r15
+	call	__ecp_nistz256_sub_fromq
+
+	movq	0+0(%rsp),%rax
+	movq	0+8(%rsp),%rbp
+	movq	0+16(%rsp),%rcx
+	movq	0+24(%rsp),%r10
+	leaq	0(%rsp),%rdi
+	call	__ecp_nistz256_subq
+
+	movq	32(%rsp),%rax
+	leaq	32(%rsp),%rbx
+	movq	%r12,%r14
+	xorl	%ecx,%ecx
+	movq	%r12,0+0(%rsp)
+	movq	%r13,%r10
+	movq	%r13,0+8(%rsp)
+	cmovzq	%r8,%r11
+	movq	%r8,0+16(%rsp)
+	leaq	0-0(%rsp),%rsi
+	cmovzq	%r9,%r12
+	movq	%r9,0+24(%rsp)
+	movq	%r14,%r9
+	leaq	0(%rsp),%rdi
+	call	__ecp_nistz256_mul_montq
+
+.byte	102,72,15,126,203
+.byte	102,72,15,126,207
+	call	__ecp_nistz256_sub_fromq
+
+	leaq	160+56(%rsp),%rsi
+.cfi_def_cfa	%rsi,8
+	movq	-48(%rsi),%r15
+.cfi_restore	%r15
+	movq	-40(%rsi),%r14
+.cfi_restore	%r14
+	movq	-32(%rsi),%r13
+.cfi_restore	%r13
+	movq	-24(%rsi),%r12
+.cfi_restore	%r12
+	movq	-16(%rsi),%rbx
+.cfi_restore	%rbx
+	movq	-8(%rsi),%rbp
+.cfi_restore	%rbp
+	leaq	(%rsi),%rsp
+.cfi_def_cfa_register	%rsp
+.Lpoint_doubleq_epilogue:
+	.byte	0xf3,0xc3
+.cfi_endproc	
+.size	ecp_nistz256_point_double,.-ecp_nistz256_point_double
+.globl	ecp_nistz256_point_add
+.hidden ecp_nistz256_point_add
+.type	ecp_nistz256_point_add,@function
+.align	32
+ecp_nistz256_point_add:
+.cfi_startproc	
+	leaq	OPENSSL_ia32cap_P(%rip),%rcx
+	movq	8(%rcx),%rcx
+	andl	$0x80100,%ecx
+	cmpl	$0x80100,%ecx
+	je	.Lpoint_addx
+	pushq	%rbp
+.cfi_adjust_cfa_offset	8
+.cfi_offset	%rbp,-16
+	pushq	%rbx
+.cfi_adjust_cfa_offset	8
+.cfi_offset	%rbx,-24
+	pushq	%r12
+.cfi_adjust_cfa_offset	8
+.cfi_offset	%r12,-32
+	pushq	%r13
+.cfi_adjust_cfa_offset	8
+.cfi_offset	%r13,-40
+	pushq	%r14
+.cfi_adjust_cfa_offset	8
+.cfi_offset	%r14,-48
+	pushq	%r15
+.cfi_adjust_cfa_offset	8
+.cfi_offset	%r15,-56
+	subq	$576+8,%rsp
+.cfi_adjust_cfa_offset	32*18+8
+.Lpoint_addq_body:
+
+	movdqu	0(%rsi),%xmm0
+	movdqu	16(%rsi),%xmm1
+	movdqu	32(%rsi),%xmm2
+	movdqu	48(%rsi),%xmm3
+	movdqu	64(%rsi),%xmm4
+	movdqu	80(%rsi),%xmm5
+	movq	%rsi,%rbx
+	movq	%rdx,%rsi
+	movdqa	%xmm0,384(%rsp)
+	movdqa	%xmm1,384+16(%rsp)
+	movdqa	%xmm2,416(%rsp)
+	movdqa	%xmm3,416+16(%rsp)
+	movdqa	%xmm4,448(%rsp)
+	movdqa	%xmm5,448+16(%rsp)
+	por	%xmm4,%xmm5
+
+	movdqu	0(%rsi),%xmm0
+	pshufd	$0xb1,%xmm5,%xmm3
+	movdqu	16(%rsi),%xmm1
+	movdqu	32(%rsi),%xmm2
+	por	%xmm3,%xmm5
+	movdqu	48(%rsi),%xmm3
+	movq	64+0(%rsi),%rax
+	movq	64+8(%rsi),%r14
+	movq	64+16(%rsi),%r15
+	movq	64+24(%rsi),%r8
+	movdqa	%xmm0,480(%rsp)
+	pshufd	$0x1e,%xmm5,%xmm4
+	movdqa	%xmm1,480+16(%rsp)
+	movdqu	64(%rsi),%xmm0
+	movdqu	80(%rsi),%xmm1
+	movdqa	%xmm2,512(%rsp)
+	movdqa	%xmm3,512+16(%rsp)
+	por	%xmm4,%xmm5
+	pxor	%xmm4,%xmm4
+	por	%xmm0,%xmm1
+.byte	102,72,15,110,199
+
+	leaq	64-0(%rsi),%rsi
+	movq	%rax,544+0(%rsp)
+	movq	%r14,544+8(%rsp)
+	movq	%r15,544+16(%rsp)
+	movq	%r8,544+24(%rsp)
+	leaq	96(%rsp),%rdi
+	call	__ecp_nistz256_sqr_montq
+
+	pcmpeqd	%xmm4,%xmm5
+	pshufd	$0xb1,%xmm1,%xmm4
+	por	%xmm1,%xmm4
+	pshufd	$0,%xmm5,%xmm5
+	pshufd	$0x1e,%xmm4,%xmm3
+	por	%xmm3,%xmm4
+	pxor	%xmm3,%xmm3
+	pcmpeqd	%xmm3,%xmm4
+	pshufd	$0,%xmm4,%xmm4
+	movq	64+0(%rbx),%rax
+	movq	64+8(%rbx),%r14
+	movq	64+16(%rbx),%r15
+	movq	64+24(%rbx),%r8
+.byte	102,72,15,110,203
+
+	leaq	64-0(%rbx),%rsi
+	leaq	32(%rsp),%rdi
+	call	__ecp_nistz256_sqr_montq
+
+	movq	544(%rsp),%rax
+	leaq	544(%rsp),%rbx
+	movq	0+96(%rsp),%r9
+	movq	8+96(%rsp),%r10
+	leaq	0+96(%rsp),%rsi
+	movq	16+96(%rsp),%r11
+	movq	24+96(%rsp),%r12
+	leaq	224(%rsp),%rdi
+	call	__ecp_nistz256_mul_montq
+
+	movq	448(%rsp),%rax
+	leaq	448(%rsp),%rbx
+	movq	0+32(%rsp),%r9
+	movq	8+32(%rsp),%r10
+	leaq	0+32(%rsp),%rsi
+	movq	16+32(%rsp),%r11
+	movq	24+32(%rsp),%r12
+	leaq	256(%rsp),%rdi
+	call	__ecp_nistz256_mul_montq
+
+	movq	416(%rsp),%rax
+	leaq	416(%rsp),%rbx
+	movq	0+224(%rsp),%r9
+	movq	8+224(%rsp),%r10
+	leaq	0+224(%rsp),%rsi
+	movq	16+224(%rsp),%r11
+	movq	24+224(%rsp),%r12
+	leaq	224(%rsp),%rdi
+	call	__ecp_nistz256_mul_montq
+
+	movq	512(%rsp),%rax
+	leaq	512(%rsp),%rbx
+	movq	0+256(%rsp),%r9
+	movq	8+256(%rsp),%r10
+	leaq	0+256(%rsp),%rsi
+	movq	16+256(%rsp),%r11
+	movq	24+256(%rsp),%r12
+	leaq	256(%rsp),%rdi
+	call	__ecp_nistz256_mul_montq
+
+	leaq	224(%rsp),%rbx
+	leaq	64(%rsp),%rdi
+	call	__ecp_nistz256_sub_fromq
+
+	orq	%r13,%r12
+	movdqa	%xmm4,%xmm2
+	orq	%r8,%r12
+	orq	%r9,%r12
+	por	%xmm5,%xmm2
+.byte	102,73,15,110,220
+
+	movq	384(%rsp),%rax
+	leaq	384(%rsp),%rbx
+	movq	0+96(%rsp),%r9
+	movq	8+96(%rsp),%r10
+	leaq	0+96(%rsp),%rsi
+	movq	16+96(%rsp),%r11
+	movq	24+96(%rsp),%r12
+	leaq	160(%rsp),%rdi
+	call	__ecp_nistz256_mul_montq
+
+	movq	480(%rsp),%rax
+	leaq	480(%rsp),%rbx
+	movq	0+32(%rsp),%r9
+	movq	8+32(%rsp),%r10
+	leaq	0+32(%rsp),%rsi
+	movq	16+32(%rsp),%r11
+	movq	24+32(%rsp),%r12
+	leaq	192(%rsp),%rdi
+	call	__ecp_nistz256_mul_montq
+
+	leaq	160(%rsp),%rbx
+	leaq	0(%rsp),%rdi
+	call	__ecp_nistz256_sub_fromq
+
+	orq	%r13,%r12
+	orq	%r8,%r12
+	orq	%r9,%r12
+
+.byte	102,73,15,126,208
+.byte	102,73,15,126,217
+	orq	%r8,%r12
+.byte	0x3e
+	jnz	.Ladd_proceedq
+
+
+
+	testq	%r9,%r9
+	jz	.Ladd_doubleq
+
+
+
+
+
+
+.byte	102,72,15,126,199
+	pxor	%xmm0,%xmm0
+	movdqu	%xmm0,0(%rdi)
+	movdqu	%xmm0,16(%rdi)
+	movdqu	%xmm0,32(%rdi)
+	movdqu	%xmm0,48(%rdi)
+	movdqu	%xmm0,64(%rdi)
+	movdqu	%xmm0,80(%rdi)
+	jmp	.Ladd_doneq
+
+.align	32
+.Ladd_doubleq:
+.byte	102,72,15,126,206
+.byte	102,72,15,126,199
+	addq	$416,%rsp
+.cfi_adjust_cfa_offset	-416
+	jmp	.Lpoint_double_shortcutq
+.cfi_adjust_cfa_offset	416
+
+.align	32
+.Ladd_proceedq:
+	movq	0+64(%rsp),%rax
+	movq	8+64(%rsp),%r14
+	leaq	0+64(%rsp),%rsi
+	movq	16+64(%rsp),%r15
+	movq	24+64(%rsp),%r8
+	leaq	96(%rsp),%rdi
+	call	__ecp_nistz256_sqr_montq
+
+	movq	448(%rsp),%rax
+	leaq	448(%rsp),%rbx
+	movq	0+0(%rsp),%r9
+	movq	8+0(%rsp),%r10
+	leaq	0+0(%rsp),%rsi
+	movq	16+0(%rsp),%r11
+	movq	24+0(%rsp),%r12
+	leaq	352(%rsp),%rdi
+	call	__ecp_nistz256_mul_montq
+
+	movq	0+0(%rsp),%rax
+	movq	8+0(%rsp),%r14
+	leaq	0+0(%rsp),%rsi
+	movq	16+0(%rsp),%r15
+	movq	24+0(%rsp),%r8
+	leaq	32(%rsp),%rdi
+	call	__ecp_nistz256_sqr_montq
+
+	movq	544(%rsp),%rax
+	leaq	544(%rsp),%rbx
+	movq	0+352(%rsp),%r9
+	movq	8+352(%rsp),%r10
+	leaq	0+352(%rsp),%rsi
+	movq	16+352(%rsp),%r11
+	movq	24+352(%rsp),%r12
+	leaq	352(%rsp),%rdi
+	call	__ecp_nistz256_mul_montq
+
+	movq	0(%rsp),%rax
+	leaq	0(%rsp),%rbx
+	movq	0+32(%rsp),%r9
+	movq	8+32(%rsp),%r10
+	leaq	0+32(%rsp),%rsi
+	movq	16+32(%rsp),%r11
+	movq	24+32(%rsp),%r12
+	leaq	128(%rsp),%rdi
+	call	__ecp_nistz256_mul_montq
+
+	movq	160(%rsp),%rax
+	leaq	160(%rsp),%rbx
+	movq	0+32(%rsp),%r9
+	movq	8+32(%rsp),%r10
+	leaq	0+32(%rsp),%rsi
+	movq	16+32(%rsp),%r11
+	movq	24+32(%rsp),%r12
+	leaq	192(%rsp),%rdi
+	call	__ecp_nistz256_mul_montq
+
+
+
+
+	xorq	%r11,%r11
+	addq	%r12,%r12
+	leaq	96(%rsp),%rsi
+	adcq	%r13,%r13
+	movq	%r12,%rax
+	adcq	%r8,%r8
+	adcq	%r9,%r9
+	movq	%r13,%rbp
+	adcq	$0,%r11
+
+	subq	$-1,%r12
+	movq	%r8,%rcx
+	sbbq	%r14,%r13
+	sbbq	$0,%r8
+	movq	%r9,%r10
+	sbbq	%r15,%r9
+	sbbq	$0,%r11
+
+	cmovcq	%rax,%r12
+	movq	0(%rsi),%rax
+	cmovcq	%rbp,%r13
+	movq	8(%rsi),%rbp
+	cmovcq	%rcx,%r8
+	movq	16(%rsi),%rcx
+	cmovcq	%r10,%r9
+	movq	24(%rsi),%r10
+
+	call	__ecp_nistz256_subq
+
+	leaq	128(%rsp),%rbx
+	leaq	288(%rsp),%rdi
+	call	__ecp_nistz256_sub_fromq
+
+	movq	192+0(%rsp),%rax
+	movq	192+8(%rsp),%rbp
+	movq	192+16(%rsp),%rcx
+	movq	192+24(%rsp),%r10
+	leaq	320(%rsp),%rdi
+
+	call	__ecp_nistz256_subq
+
+	movq	%r12,0(%rdi)
+	movq	%r13,8(%rdi)
+	movq	%r8,16(%rdi)
+	movq	%r9,24(%rdi)
+	movq	128(%rsp),%rax
+	leaq	128(%rsp),%rbx
+	movq	0+224(%rsp),%r9
+	movq	8+224(%rsp),%r10
+	leaq	0+224(%rsp),%rsi
+	movq	16+224(%rsp),%r11
+	movq	24+224(%rsp),%r12
+	leaq	256(%rsp),%rdi
+	call	__ecp_nistz256_mul_montq
+
+	movq	320(%rsp),%rax
+	leaq	320(%rsp),%rbx
+	movq	0+64(%rsp),%r9
+	movq	8+64(%rsp),%r10
+	leaq	0+64(%rsp),%rsi
+	movq	16+64(%rsp),%r11
+	movq	24+64(%rsp),%r12
+	leaq	320(%rsp),%rdi
+	call	__ecp_nistz256_mul_montq
+
+	leaq	256(%rsp),%rbx
+	leaq	320(%rsp),%rdi
+	call	__ecp_nistz256_sub_fromq
+
+.byte	102,72,15,126,199
+
+	movdqa	%xmm5,%xmm0
+	movdqa	%xmm5,%xmm1
+	pandn	352(%rsp),%xmm0
+	movdqa	%xmm5,%xmm2
+	pandn	352+16(%rsp),%xmm1
+	movdqa	%xmm5,%xmm3
+	pand	544(%rsp),%xmm2
+	pand	544+16(%rsp),%xmm3
+	por	%xmm0,%xmm2
+	por	%xmm1,%xmm3
+
+	movdqa	%xmm4,%xmm0
+	movdqa	%xmm4,%xmm1
+	pandn	%xmm2,%xmm0
+	movdqa	%xmm4,%xmm2
+	pandn	%xmm3,%xmm1
+	movdqa	%xmm4,%xmm3
+	pand	448(%rsp),%xmm2
+	pand	448+16(%rsp),%xmm3
+	por	%xmm0,%xmm2
+	por	%xmm1,%xmm3
+	movdqu	%xmm2,64(%rdi)
+	movdqu	%xmm3,80(%rdi)
+
+	movdqa	%xmm5,%xmm0
+	movdqa	%xmm5,%xmm1
+	pandn	288(%rsp),%xmm0
+	movdqa	%xmm5,%xmm2
+	pandn	288+16(%rsp),%xmm1
+	movdqa	%xmm5,%xmm3
+	pand	480(%rsp),%xmm2
+	pand	480+16(%rsp),%xmm3
+	por	%xmm0,%xmm2
+	por	%xmm1,%xmm3
+
+	movdqa	%xmm4,%xmm0
+	movdqa	%xmm4,%xmm1
+	pandn	%xmm2,%xmm0
+	movdqa	%xmm4,%xmm2
+	pandn	%xmm3,%xmm1
+	movdqa	%xmm4,%xmm3
+	pand	384(%rsp),%xmm2
+	pand	384+16(%rsp),%xmm3
+	por	%xmm0,%xmm2
+	por	%xmm1,%xmm3
+	movdqu	%xmm2,0(%rdi)
+	movdqu	%xmm3,16(%rdi)
+
+	movdqa	%xmm5,%xmm0
+	movdqa	%xmm5,%xmm1
+	pandn	320(%rsp),%xmm0
+	movdqa	%xmm5,%xmm2
+	pandn	320+16(%rsp),%xmm1
+	movdqa	%xmm5,%xmm3
+	pand	512(%rsp),%xmm2
+	pand	512+16(%rsp),%xmm3
+	por	%xmm0,%xmm2
+	por	%xmm1,%xmm3
+
+	movdqa	%xmm4,%xmm0
+	movdqa	%xmm4,%xmm1
+	pandn	%xmm2,%xmm0
+	movdqa	%xmm4,%xmm2
+	pandn	%xmm3,%xmm1
+	movdqa	%xmm4,%xmm3
+	pand	416(%rsp),%xmm2
+	pand	416+16(%rsp),%xmm3
+	por	%xmm0,%xmm2
+	por	%xmm1,%xmm3
+	movdqu	%xmm2,32(%rdi)
+	movdqu	%xmm3,48(%rdi)
+
+.Ladd_doneq:
+	leaq	576+56(%rsp),%rsi
+.cfi_def_cfa	%rsi,8
+	movq	-48(%rsi),%r15
+.cfi_restore	%r15
+	movq	-40(%rsi),%r14
+.cfi_restore	%r14
+	movq	-32(%rsi),%r13
+.cfi_restore	%r13
+	movq	-24(%rsi),%r12
+.cfi_restore	%r12
+	movq	-16(%rsi),%rbx
+.cfi_restore	%rbx
+	movq	-8(%rsi),%rbp
+.cfi_restore	%rbp
+	leaq	(%rsi),%rsp
+.cfi_def_cfa_register	%rsp
+.Lpoint_addq_epilogue:
+	.byte	0xf3,0xc3
+.cfi_endproc	
+.size	ecp_nistz256_point_add,.-ecp_nistz256_point_add
+.globl	ecp_nistz256_point_add_affine
+.hidden ecp_nistz256_point_add_affine
+.type	ecp_nistz256_point_add_affine,@function
+.align	32
+ecp_nistz256_point_add_affine:
+.cfi_startproc	
+	leaq	OPENSSL_ia32cap_P(%rip),%rcx
+	movq	8(%rcx),%rcx
+	andl	$0x80100,%ecx
+	cmpl	$0x80100,%ecx
+	je	.Lpoint_add_affinex
+	pushq	%rbp
+.cfi_adjust_cfa_offset	8
+.cfi_offset	%rbp,-16
+	pushq	%rbx
+.cfi_adjust_cfa_offset	8
+.cfi_offset	%rbx,-24
+	pushq	%r12
+.cfi_adjust_cfa_offset	8
+.cfi_offset	%r12,-32
+	pushq	%r13
+.cfi_adjust_cfa_offset	8
+.cfi_offset	%r13,-40
+	pushq	%r14
+.cfi_adjust_cfa_offset	8
+.cfi_offset	%r14,-48
+	pushq	%r15
+.cfi_adjust_cfa_offset	8
+.cfi_offset	%r15,-56
+	subq	$480+8,%rsp
+.cfi_adjust_cfa_offset	32*15+8
+.Ladd_affineq_body:
+
+	movdqu	0(%rsi),%xmm0
+	movq	%rdx,%rbx
+	movdqu	16(%rsi),%xmm1
+	movdqu	32(%rsi),%xmm2
+	movdqu	48(%rsi),%xmm3
+	movdqu	64(%rsi),%xmm4
+	movdqu	80(%rsi),%xmm5
+	movq	64+0(%rsi),%rax
+	movq	64+8(%rsi),%r14
+	movq	64+16(%rsi),%r15
+	movq	64+24(%rsi),%r8
+	movdqa	%xmm0,320(%rsp)
+	movdqa	%xmm1,320+16(%rsp)
+	movdqa	%xmm2,352(%rsp)
+	movdqa	%xmm3,352+16(%rsp)
+	movdqa	%xmm4,384(%rsp)
+	movdqa	%xmm5,384+16(%rsp)
+	por	%xmm4,%xmm5
+
+	movdqu	0(%rbx),%xmm0
+	pshufd	$0xb1,%xmm5,%xmm3
+	movdqu	16(%rbx),%xmm1
+	movdqu	32(%rbx),%xmm2
+	por	%xmm3,%xmm5
+	movdqu	48(%rbx),%xmm3
+	movdqa	%xmm0,416(%rsp)
+	pshufd	$0x1e,%xmm5,%xmm4
+	movdqa	%xmm1,416+16(%rsp)
+	por	%xmm0,%xmm1
+.byte	102,72,15,110,199
+	movdqa	%xmm2,448(%rsp)
+	movdqa	%xmm3,448+16(%rsp)
+	por	%xmm2,%xmm3
+	por	%xmm4,%xmm5
+	pxor	%xmm4,%xmm4
+	por	%xmm1,%xmm3
+
+	leaq	64-0(%rsi),%rsi
+	leaq	32(%rsp),%rdi
+	call	__ecp_nistz256_sqr_montq
+
+	pcmpeqd	%xmm4,%xmm5
+	pshufd	$0xb1,%xmm3,%xmm4
+	movq	0(%rbx),%rax
+
+	movq	%r12,%r9
+	por	%xmm3,%xmm4
+	pshufd	$0,%xmm5,%xmm5
+	pshufd	$0x1e,%xmm4,%xmm3
+	movq	%r13,%r10
+	por	%xmm3,%xmm4
+	pxor	%xmm3,%xmm3
+	movq	%r14,%r11
+	pcmpeqd	%xmm3,%xmm4
+	pshufd	$0,%xmm4,%xmm4
+
+	leaq	32-0(%rsp),%rsi
+	movq	%r15,%r12
+	leaq	0(%rsp),%rdi
+	call	__ecp_nistz256_mul_montq
+
+	leaq	320(%rsp),%rbx
+	leaq	64(%rsp),%rdi
+	call	__ecp_nistz256_sub_fromq
+
+	movq	384(%rsp),%rax
+	leaq	384(%rsp),%rbx
+	movq	0+32(%rsp),%r9
+	movq	8+32(%rsp),%r10
+	leaq	0+32(%rsp),%rsi
+	movq	16+32(%rsp),%r11
+	movq	24+32(%rsp),%r12
+	leaq	32(%rsp),%rdi
+	call	__ecp_nistz256_mul_montq
+
+	movq	384(%rsp),%rax
+	leaq	384(%rsp),%rbx
+	movq	0+64(%rsp),%r9
+	movq	8+64(%rsp),%r10
+	leaq	0+64(%rsp),%rsi
+	movq	16+64(%rsp),%r11
+	movq	24+64(%rsp),%r12
+	leaq	288(%rsp),%rdi
+	call	__ecp_nistz256_mul_montq
+
+	movq	448(%rsp),%rax
+	leaq	448(%rsp),%rbx
+	movq	0+32(%rsp),%r9
+	movq	8+32(%rsp),%r10
+	leaq	0+32(%rsp),%rsi
+	movq	16+32(%rsp),%r11
+	movq	24+32(%rsp),%r12
+	leaq	32(%rsp),%rdi
+	call	__ecp_nistz256_mul_montq
+
+	leaq	352(%rsp),%rbx
+	leaq	96(%rsp),%rdi
+	call	__ecp_nistz256_sub_fromq
+
+	movq	0+64(%rsp),%rax
+	movq	8+64(%rsp),%r14
+	leaq	0+64(%rsp),%rsi
+	movq	16+64(%rsp),%r15
+	movq	24+64(%rsp),%r8
+	leaq	128(%rsp),%rdi
+	call	__ecp_nistz256_sqr_montq
+
+	movq	0+96(%rsp),%rax
+	movq	8+96(%rsp),%r14
+	leaq	0+96(%rsp),%rsi
+	movq	16+96(%rsp),%r15
+	movq	24+96(%rsp),%r8
+	leaq	192(%rsp),%rdi
+	call	__ecp_nistz256_sqr_montq
+
+	movq	128(%rsp),%rax
+	leaq	128(%rsp),%rbx
+	movq	0+64(%rsp),%r9
+	movq	8+64(%rsp),%r10
+	leaq	0+64(%rsp),%rsi
+	movq	16+64(%rsp),%r11
+	movq	24+64(%rsp),%r12
+	leaq	160(%rsp),%rdi
+	call	__ecp_nistz256_mul_montq
+
+	movq	320(%rsp),%rax
+	leaq	320(%rsp),%rbx
+	movq	0+128(%rsp),%r9
+	movq	8+128(%rsp),%r10
+	leaq	0+128(%rsp),%rsi
+	movq	16+128(%rsp),%r11
+	movq	24+128(%rsp),%r12
+	leaq	0(%rsp),%rdi
+	call	__ecp_nistz256_mul_montq
+
+
+
+
+	xorq	%r11,%r11
+	addq	%r12,%r12
+	leaq	192(%rsp),%rsi
+	adcq	%r13,%r13
+	movq	%r12,%rax
+	adcq	%r8,%r8
+	adcq	%r9,%r9
+	movq	%r13,%rbp
+	adcq	$0,%r11
+
+	subq	$-1,%r12
+	movq	%r8,%rcx
+	sbbq	%r14,%r13
+	sbbq	$0,%r8
+	movq	%r9,%r10
+	sbbq	%r15,%r9
+	sbbq	$0,%r11
+
+	cmovcq	%rax,%r12
+	movq	0(%rsi),%rax
+	cmovcq	%rbp,%r13
+	movq	8(%rsi),%rbp
+	cmovcq	%rcx,%r8
+	movq	16(%rsi),%rcx
+	cmovcq	%r10,%r9
+	movq	24(%rsi),%r10
+
+	call	__ecp_nistz256_subq
+
+	leaq	160(%rsp),%rbx
+	leaq	224(%rsp),%rdi
+	call	__ecp_nistz256_sub_fromq
+
+	movq	0+0(%rsp),%rax
+	movq	0+8(%rsp),%rbp
+	movq	0+16(%rsp),%rcx
+	movq	0+24(%rsp),%r10
+	leaq	64(%rsp),%rdi
+
+	call	__ecp_nistz256_subq
+
+	movq	%r12,0(%rdi)
+	movq	%r13,8(%rdi)
+	movq	%r8,16(%rdi)
+	movq	%r9,24(%rdi)
+	movq	352(%rsp),%rax
+	leaq	352(%rsp),%rbx
+	movq	0+160(%rsp),%r9
+	movq	8+160(%rsp),%r10
+	leaq	0+160(%rsp),%rsi
+	movq	16+160(%rsp),%r11
+	movq	24+160(%rsp),%r12
+	leaq	32(%rsp),%rdi
+	call	__ecp_nistz256_mul_montq
+
+	movq	96(%rsp),%rax
+	leaq	96(%rsp),%rbx
+	movq	0+64(%rsp),%r9
+	movq	8+64(%rsp),%r10
+	leaq	0+64(%rsp),%rsi
+	movq	16+64(%rsp),%r11
+	movq	24+64(%rsp),%r12
+	leaq	64(%rsp),%rdi
+	call	__ecp_nistz256_mul_montq
+
+	leaq	32(%rsp),%rbx
+	leaq	256(%rsp),%rdi
+	call	__ecp_nistz256_sub_fromq
+
+.byte	102,72,15,126,199
+
+	movdqa	%xmm5,%xmm0
+	movdqa	%xmm5,%xmm1
+	pandn	288(%rsp),%xmm0
+	movdqa	%xmm5,%xmm2
+	pandn	288+16(%rsp),%xmm1
+	movdqa	%xmm5,%xmm3
+	pand	.LONE_mont(%rip),%xmm2
+	pand	.LONE_mont+16(%rip),%xmm3
+	por	%xmm0,%xmm2
+	por	%xmm1,%xmm3
+
+	movdqa	%xmm4,%xmm0
+	movdqa	%xmm4,%xmm1
+	pandn	%xmm2,%xmm0
+	movdqa	%xmm4,%xmm2
+	pandn	%xmm3,%xmm1
+	movdqa	%xmm4,%xmm3
+	pand	384(%rsp),%xmm2
+	pand	384+16(%rsp),%xmm3
+	por	%xmm0,%xmm2
+	por	%xmm1,%xmm3
+	movdqu	%xmm2,64(%rdi)
+	movdqu	%xmm3,80(%rdi)
+
+	movdqa	%xmm5,%xmm0
+	movdqa	%xmm5,%xmm1
+	pandn	224(%rsp),%xmm0
+	movdqa	%xmm5,%xmm2
+	pandn	224+16(%rsp),%xmm1
+	movdqa	%xmm5,%xmm3
+	pand	416(%rsp),%xmm2
+	pand	416+16(%rsp),%xmm3
+	por	%xmm0,%xmm2
+	por	%xmm1,%xmm3
+
+	movdqa	%xmm4,%xmm0
+	movdqa	%xmm4,%xmm1
+	pandn	%xmm2,%xmm0
+	movdqa	%xmm4,%xmm2
+	pandn	%xmm3,%xmm1
+	movdqa	%xmm4,%xmm3
+	pand	320(%rsp),%xmm2
+	pand	320+16(%rsp),%xmm3
+	por	%xmm0,%xmm2
+	por	%xmm1,%xmm3
+	movdqu	%xmm2,0(%rdi)
+	movdqu	%xmm3,16(%rdi)
+
+	movdqa	%xmm5,%xmm0
+	movdqa	%xmm5,%xmm1
+	pandn	256(%rsp),%xmm0
+	movdqa	%xmm5,%xmm2
+	pandn	256+16(%rsp),%xmm1
+	movdqa	%xmm5,%xmm3
+	pand	448(%rsp),%xmm2
+	pand	448+16(%rsp),%xmm3
+	por	%xmm0,%xmm2
+	por	%xmm1,%xmm3
+
+	movdqa	%xmm4,%xmm0
+	movdqa	%xmm4,%xmm1
+	pandn	%xmm2,%xmm0
+	movdqa	%xmm4,%xmm2
+	pandn	%xmm3,%xmm1
+	movdqa	%xmm4,%xmm3
+	pand	352(%rsp),%xmm2
+	pand	352+16(%rsp),%xmm3
+	por	%xmm0,%xmm2
+	por	%xmm1,%xmm3
+	movdqu	%xmm2,32(%rdi)
+	movdqu	%xmm3,48(%rdi)
+
+	leaq	480+56(%rsp),%rsi
+.cfi_def_cfa	%rsi,8
+	movq	-48(%rsi),%r15
+.cfi_restore	%r15
+	movq	-40(%rsi),%r14
+.cfi_restore	%r14
+	movq	-32(%rsi),%r13
+.cfi_restore	%r13
+	movq	-24(%rsi),%r12
+.cfi_restore	%r12
+	movq	-16(%rsi),%rbx
+.cfi_restore	%rbx
+	movq	-8(%rsi),%rbp
+.cfi_restore	%rbp
+	leaq	(%rsi),%rsp
+.cfi_def_cfa_register	%rsp
+.Ladd_affineq_epilogue:
+	.byte	0xf3,0xc3
+.cfi_endproc	
+.size	ecp_nistz256_point_add_affine,.-ecp_nistz256_point_add_affine
+.type	__ecp_nistz256_add_tox,@function
+.align	32
+__ecp_nistz256_add_tox:
+.cfi_startproc	
+	xorq	%r11,%r11
+	adcq	0(%rbx),%r12
+	adcq	8(%rbx),%r13
+	movq	%r12,%rax
+	adcq	16(%rbx),%r8
+	adcq	24(%rbx),%r9
+	movq	%r13,%rbp
+	adcq	$0,%r11
+
+	xorq	%r10,%r10
+	sbbq	$-1,%r12
+	movq	%r8,%rcx
+	sbbq	%r14,%r13
+	sbbq	$0,%r8
+	movq	%r9,%r10
+	sbbq	%r15,%r9
+	sbbq	$0,%r11
+
+	cmovcq	%rax,%r12
+	cmovcq	%rbp,%r13
+	movq	%r12,0(%rdi)
+	cmovcq	%rcx,%r8
+	movq	%r13,8(%rdi)
+	cmovcq	%r10,%r9
+	movq	%r8,16(%rdi)
+	movq	%r9,24(%rdi)
+
+	.byte	0xf3,0xc3
+.cfi_endproc	
+.size	__ecp_nistz256_add_tox,.-__ecp_nistz256_add_tox
+
+.type	__ecp_nistz256_sub_fromx,@function
+.align	32
+__ecp_nistz256_sub_fromx:
+.cfi_startproc	
+	xorq	%r11,%r11
+	sbbq	0(%rbx),%r12
+	sbbq	8(%rbx),%r13
+	movq	%r12,%rax
+	sbbq	16(%rbx),%r8
+	sbbq	24(%rbx),%r9
+	movq	%r13,%rbp
+	sbbq	$0,%r11
+
+	xorq	%r10,%r10
+	adcq	$-1,%r12
+	movq	%r8,%rcx
+	adcq	%r14,%r13
+	adcq	$0,%r8
+	movq	%r9,%r10
+	adcq	%r15,%r9
+
+	btq	$0,%r11
+	cmovncq	%rax,%r12
+	cmovncq	%rbp,%r13
+	movq	%r12,0(%rdi)
+	cmovncq	%rcx,%r8
+	movq	%r13,8(%rdi)
+	cmovncq	%r10,%r9
+	movq	%r8,16(%rdi)
+	movq	%r9,24(%rdi)
+
+	.byte	0xf3,0xc3
+.cfi_endproc	
+.size	__ecp_nistz256_sub_fromx,.-__ecp_nistz256_sub_fromx
+
+.type	__ecp_nistz256_subx,@function
+.align	32
+__ecp_nistz256_subx:
+.cfi_startproc	
+	xorq	%r11,%r11
+	sbbq	%r12,%rax
+	sbbq	%r13,%rbp
+	movq	%rax,%r12
+	sbbq	%r8,%rcx
+	sbbq	%r9,%r10
+	movq	%rbp,%r13
+	sbbq	$0,%r11
+
+	xorq	%r9,%r9
+	adcq	$-1,%rax
+	movq	%rcx,%r8
+	adcq	%r14,%rbp
+	adcq	$0,%rcx
+	movq	%r10,%r9
+	adcq	%r15,%r10
+
+	btq	$0,%r11
+	cmovcq	%rax,%r12
+	cmovcq	%rbp,%r13
+	cmovcq	%rcx,%r8
+	cmovcq	%r10,%r9
+
+	.byte	0xf3,0xc3
+.cfi_endproc	
+.size	__ecp_nistz256_subx,.-__ecp_nistz256_subx
+
+.type	__ecp_nistz256_mul_by_2x,@function
+.align	32
+__ecp_nistz256_mul_by_2x:
+.cfi_startproc	
+	xorq	%r11,%r11
+	adcq	%r12,%r12
+	adcq	%r13,%r13
+	movq	%r12,%rax
+	adcq	%r8,%r8
+	adcq	%r9,%r9
+	movq	%r13,%rbp
+	adcq	$0,%r11
+
+	xorq	%r10,%r10
+	sbbq	$-1,%r12
+	movq	%r8,%rcx
+	sbbq	%r14,%r13
+	sbbq	$0,%r8
+	movq	%r9,%r10
+	sbbq	%r15,%r9
+	sbbq	$0,%r11
+
+	cmovcq	%rax,%r12
+	cmovcq	%rbp,%r13
+	movq	%r12,0(%rdi)
+	cmovcq	%rcx,%r8
+	movq	%r13,8(%rdi)
+	cmovcq	%r10,%r9
+	movq	%r8,16(%rdi)
+	movq	%r9,24(%rdi)
+
+	.byte	0xf3,0xc3
+.cfi_endproc	
+.size	__ecp_nistz256_mul_by_2x,.-__ecp_nistz256_mul_by_2x
+.type	ecp_nistz256_point_doublex,@function
+.align	32
+ecp_nistz256_point_doublex:
+.cfi_startproc	
+.Lpoint_doublex:
+	pushq	%rbp
+.cfi_adjust_cfa_offset	8
+.cfi_offset	%rbp,-16
+	pushq	%rbx
+.cfi_adjust_cfa_offset	8
+.cfi_offset	%rbx,-24
+	pushq	%r12
+.cfi_adjust_cfa_offset	8
+.cfi_offset	%r12,-32
+	pushq	%r13
+.cfi_adjust_cfa_offset	8
+.cfi_offset	%r13,-40
+	pushq	%r14
+.cfi_adjust_cfa_offset	8
+.cfi_offset	%r14,-48
+	pushq	%r15
+.cfi_adjust_cfa_offset	8
+.cfi_offset	%r15,-56
+	subq	$160+8,%rsp
+.cfi_adjust_cfa_offset	32*5+8
+.Lpoint_doublex_body:
+
+.Lpoint_double_shortcutx:
+	movdqu	0(%rsi),%xmm0
+	movq	%rsi,%rbx
+	movdqu	16(%rsi),%xmm1
+	movq	32+0(%rsi),%r12
+	movq	32+8(%rsi),%r13
+	movq	32+16(%rsi),%r8
+	movq	32+24(%rsi),%r9
+	movq	.Lpoly+8(%rip),%r14
+	movq	.Lpoly+24(%rip),%r15
+	movdqa	%xmm0,96(%rsp)
+	movdqa	%xmm1,96+16(%rsp)
+	leaq	32(%rdi),%r10
+	leaq	64(%rdi),%r11
+.byte	102,72,15,110,199
+.byte	102,73,15,110,202
+.byte	102,73,15,110,211
+
+	leaq	0(%rsp),%rdi
+	call	__ecp_nistz256_mul_by_2x
+
+	movq	64+0(%rsi),%rdx
+	movq	64+8(%rsi),%r14
+	movq	64+16(%rsi),%r15
+	movq	64+24(%rsi),%r8
+	leaq	64-128(%rsi),%rsi
+	leaq	64(%rsp),%rdi
+	call	__ecp_nistz256_sqr_montx
+
+	movq	0+0(%rsp),%rdx
+	movq	8+0(%rsp),%r14
+	leaq	-128+0(%rsp),%rsi
+	movq	16+0(%rsp),%r15
+	movq	24+0(%rsp),%r8
+	leaq	0(%rsp),%rdi
+	call	__ecp_nistz256_sqr_montx
+
+	movq	32(%rbx),%rdx
+	movq	64+0(%rbx),%r9
+	movq	64+8(%rbx),%r10
+	movq	64+16(%rbx),%r11
+	movq	64+24(%rbx),%r12
+	leaq	64-128(%rbx),%rsi
+	leaq	32(%rbx),%rbx
+.byte	102,72,15,126,215
+	call	__ecp_nistz256_mul_montx
+	call	__ecp_nistz256_mul_by_2x
+
+	movq	96+0(%rsp),%r12
+	movq	96+8(%rsp),%r13
+	leaq	64(%rsp),%rbx
+	movq	96+16(%rsp),%r8
+	movq	96+24(%rsp),%r9
+	leaq	32(%rsp),%rdi
+	call	__ecp_nistz256_add_tox
+
+	movq	96+0(%rsp),%r12
+	movq	96+8(%rsp),%r13
+	leaq	64(%rsp),%rbx
+	movq	96+16(%rsp),%r8
+	movq	96+24(%rsp),%r9
+	leaq	64(%rsp),%rdi
+	call	__ecp_nistz256_sub_fromx
+
+	movq	0+0(%rsp),%rdx
+	movq	8+0(%rsp),%r14
+	leaq	-128+0(%rsp),%rsi
+	movq	16+0(%rsp),%r15
+	movq	24+0(%rsp),%r8
+.byte	102,72,15,126,207
+	call	__ecp_nistz256_sqr_montx
+	xorq	%r9,%r9
+	movq	%r12,%rax
+	addq	$-1,%r12
+	movq	%r13,%r10
+	adcq	%rsi,%r13
+	movq	%r14,%rcx
+	adcq	$0,%r14
+	movq	%r15,%r8
+	adcq	%rbp,%r15
+	adcq	$0,%r9
+	xorq	%rsi,%rsi
+	testq	$1,%rax
+
+	cmovzq	%rax,%r12
+	cmovzq	%r10,%r13
+	cmovzq	%rcx,%r14
+	cmovzq	%r8,%r15
+	cmovzq	%rsi,%r9
+
+	movq	%r13,%rax
+	shrq	$1,%r12
+	shlq	$63,%rax
+	movq	%r14,%r10
+	shrq	$1,%r13
+	orq	%rax,%r12
+	shlq	$63,%r10
+	movq	%r15,%rcx
+	shrq	$1,%r14
+	orq	%r10,%r13
+	shlq	$63,%rcx
+	movq	%r12,0(%rdi)
+	shrq	$1,%r15
+	movq	%r13,8(%rdi)
+	shlq	$63,%r9
+	orq	%rcx,%r14
+	orq	%r9,%r15
+	movq	%r14,16(%rdi)
+	movq	%r15,24(%rdi)
+	movq	64(%rsp),%rdx
+	leaq	64(%rsp),%rbx
+	movq	0+32(%rsp),%r9
+	movq	8+32(%rsp),%r10
+	leaq	-128+32(%rsp),%rsi
+	movq	16+32(%rsp),%r11
+	movq	24+32(%rsp),%r12
+	leaq	32(%rsp),%rdi
+	call	__ecp_nistz256_mul_montx
+
+	leaq	128(%rsp),%rdi
+	call	__ecp_nistz256_mul_by_2x
+
+	leaq	32(%rsp),%rbx
+	leaq	32(%rsp),%rdi
+	call	__ecp_nistz256_add_tox
+
+	movq	96(%rsp),%rdx
+	leaq	96(%rsp),%rbx
+	movq	0+0(%rsp),%r9
+	movq	8+0(%rsp),%r10
+	leaq	-128+0(%rsp),%rsi
+	movq	16+0(%rsp),%r11
+	movq	24+0(%rsp),%r12
+	leaq	0(%rsp),%rdi
+	call	__ecp_nistz256_mul_montx
+
+	leaq	128(%rsp),%rdi
+	call	__ecp_nistz256_mul_by_2x
+
+	movq	0+32(%rsp),%rdx
+	movq	8+32(%rsp),%r14
+	leaq	-128+32(%rsp),%rsi
+	movq	16+32(%rsp),%r15
+	movq	24+32(%rsp),%r8
+.byte	102,72,15,126,199
+	call	__ecp_nistz256_sqr_montx
+
+	leaq	128(%rsp),%rbx
+	movq	%r14,%r8
+	movq	%r15,%r9
+	movq	%rsi,%r14
+	movq	%rbp,%r15
+	call	__ecp_nistz256_sub_fromx
+
+	movq	0+0(%rsp),%rax
+	movq	0+8(%rsp),%rbp
+	movq	0+16(%rsp),%rcx
+	movq	0+24(%rsp),%r10
+	leaq	0(%rsp),%rdi
+	call	__ecp_nistz256_subx
+
+	movq	32(%rsp),%rdx
+	leaq	32(%rsp),%rbx
+	movq	%r12,%r14
+	xorl	%ecx,%ecx
+	movq	%r12,0+0(%rsp)
+	movq	%r13,%r10
+	movq	%r13,0+8(%rsp)
+	cmovzq	%r8,%r11
+	movq	%r8,0+16(%rsp)
+	leaq	0-128(%rsp),%rsi
+	cmovzq	%r9,%r12
+	movq	%r9,0+24(%rsp)
+	movq	%r14,%r9
+	leaq	0(%rsp),%rdi
+	call	__ecp_nistz256_mul_montx
+
+.byte	102,72,15,126,203
+.byte	102,72,15,126,207
+	call	__ecp_nistz256_sub_fromx
+
+	leaq	160+56(%rsp),%rsi
+.cfi_def_cfa	%rsi,8
+	movq	-48(%rsi),%r15
+.cfi_restore	%r15
+	movq	-40(%rsi),%r14
+.cfi_restore	%r14
+	movq	-32(%rsi),%r13
+.cfi_restore	%r13
+	movq	-24(%rsi),%r12
+.cfi_restore	%r12
+	movq	-16(%rsi),%rbx
+.cfi_restore	%rbx
+	movq	-8(%rsi),%rbp
+.cfi_restore	%rbp
+	leaq	(%rsi),%rsp
+.cfi_def_cfa_register	%rsp
+.Lpoint_doublex_epilogue:
+	.byte	0xf3,0xc3
+.cfi_endproc	
+.size	ecp_nistz256_point_doublex,.-ecp_nistz256_point_doublex
+.type	ecp_nistz256_point_addx,@function
+.align	32
+ecp_nistz256_point_addx:
+.cfi_startproc	
+.Lpoint_addx:
+	pushq	%rbp
+.cfi_adjust_cfa_offset	8
+.cfi_offset	%rbp,-16
+	pushq	%rbx
+.cfi_adjust_cfa_offset	8
+.cfi_offset	%rbx,-24
+	pushq	%r12
+.cfi_adjust_cfa_offset	8
+.cfi_offset	%r12,-32
+	pushq	%r13
+.cfi_adjust_cfa_offset	8
+.cfi_offset	%r13,-40
+	pushq	%r14
+.cfi_adjust_cfa_offset	8
+.cfi_offset	%r14,-48
+	pushq	%r15
+.cfi_adjust_cfa_offset	8
+.cfi_offset	%r15,-56
+	subq	$576+8,%rsp
+.cfi_adjust_cfa_offset	32*18+8
+.Lpoint_addx_body:
+
+	movdqu	0(%rsi),%xmm0
+	movdqu	16(%rsi),%xmm1
+	movdqu	32(%rsi),%xmm2
+	movdqu	48(%rsi),%xmm3
+	movdqu	64(%rsi),%xmm4
+	movdqu	80(%rsi),%xmm5
+	movq	%rsi,%rbx
+	movq	%rdx,%rsi
+	movdqa	%xmm0,384(%rsp)
+	movdqa	%xmm1,384+16(%rsp)
+	movdqa	%xmm2,416(%rsp)
+	movdqa	%xmm3,416+16(%rsp)
+	movdqa	%xmm4,448(%rsp)
+	movdqa	%xmm5,448+16(%rsp)
+	por	%xmm4,%xmm5
+
+	movdqu	0(%rsi),%xmm0
+	pshufd	$0xb1,%xmm5,%xmm3
+	movdqu	16(%rsi),%xmm1
+	movdqu	32(%rsi),%xmm2
+	por	%xmm3,%xmm5
+	movdqu	48(%rsi),%xmm3
+	movq	64+0(%rsi),%rdx
+	movq	64+8(%rsi),%r14
+	movq	64+16(%rsi),%r15
+	movq	64+24(%rsi),%r8
+	movdqa	%xmm0,480(%rsp)
+	pshufd	$0x1e,%xmm5,%xmm4
+	movdqa	%xmm1,480+16(%rsp)
+	movdqu	64(%rsi),%xmm0
+	movdqu	80(%rsi),%xmm1
+	movdqa	%xmm2,512(%rsp)
+	movdqa	%xmm3,512+16(%rsp)
+	por	%xmm4,%xmm5
+	pxor	%xmm4,%xmm4
+	por	%xmm0,%xmm1
+.byte	102,72,15,110,199
+
+	leaq	64-128(%rsi),%rsi
+	movq	%rdx,544+0(%rsp)
+	movq	%r14,544+8(%rsp)
+	movq	%r15,544+16(%rsp)
+	movq	%r8,544+24(%rsp)
+	leaq	96(%rsp),%rdi
+	call	__ecp_nistz256_sqr_montx
+
+	pcmpeqd	%xmm4,%xmm5
+	pshufd	$0xb1,%xmm1,%xmm4
+	por	%xmm1,%xmm4
+	pshufd	$0,%xmm5,%xmm5
+	pshufd	$0x1e,%xmm4,%xmm3
+	por	%xmm3,%xmm4
+	pxor	%xmm3,%xmm3
+	pcmpeqd	%xmm3,%xmm4
+	pshufd	$0,%xmm4,%xmm4
+	movq	64+0(%rbx),%rdx
+	movq	64+8(%rbx),%r14
+	movq	64+16(%rbx),%r15
+	movq	64+24(%rbx),%r8
+.byte	102,72,15,110,203
+
+	leaq	64-128(%rbx),%rsi
+	leaq	32(%rsp),%rdi
+	call	__ecp_nistz256_sqr_montx
+
+	movq	544(%rsp),%rdx
+	leaq	544(%rsp),%rbx
+	movq	0+96(%rsp),%r9
+	movq	8+96(%rsp),%r10
+	leaq	-128+96(%rsp),%rsi
+	movq	16+96(%rsp),%r11
+	movq	24+96(%rsp),%r12
+	leaq	224(%rsp),%rdi
+	call	__ecp_nistz256_mul_montx
+
+	movq	448(%rsp),%rdx
+	leaq	448(%rsp),%rbx
+	movq	0+32(%rsp),%r9
+	movq	8+32(%rsp),%r10
+	leaq	-128+32(%rsp),%rsi
+	movq	16+32(%rsp),%r11
+	movq	24+32(%rsp),%r12
+	leaq	256(%rsp),%rdi
+	call	__ecp_nistz256_mul_montx
+
+	movq	416(%rsp),%rdx
+	leaq	416(%rsp),%rbx
+	movq	0+224(%rsp),%r9
+	movq	8+224(%rsp),%r10
+	leaq	-128+224(%rsp),%rsi
+	movq	16+224(%rsp),%r11
+	movq	24+224(%rsp),%r12
+	leaq	224(%rsp),%rdi
+	call	__ecp_nistz256_mul_montx
+
+	movq	512(%rsp),%rdx
+	leaq	512(%rsp),%rbx
+	movq	0+256(%rsp),%r9
+	movq	8+256(%rsp),%r10
+	leaq	-128+256(%rsp),%rsi
+	movq	16+256(%rsp),%r11
+	movq	24+256(%rsp),%r12
+	leaq	256(%rsp),%rdi
+	call	__ecp_nistz256_mul_montx
+
+	leaq	224(%rsp),%rbx
+	leaq	64(%rsp),%rdi
+	call	__ecp_nistz256_sub_fromx
+
+	orq	%r13,%r12
+	movdqa	%xmm4,%xmm2
+	orq	%r8,%r12
+	orq	%r9,%r12
+	por	%xmm5,%xmm2
+.byte	102,73,15,110,220
+
+	movq	384(%rsp),%rdx
+	leaq	384(%rsp),%rbx
+	movq	0+96(%rsp),%r9
+	movq	8+96(%rsp),%r10
+	leaq	-128+96(%rsp),%rsi
+	movq	16+96(%rsp),%r11
+	movq	24+96(%rsp),%r12
+	leaq	160(%rsp),%rdi
+	call	__ecp_nistz256_mul_montx
+
+	movq	480(%rsp),%rdx
+	leaq	480(%rsp),%rbx
+	movq	0+32(%rsp),%r9
+	movq	8+32(%rsp),%r10
+	leaq	-128+32(%rsp),%rsi
+	movq	16+32(%rsp),%r11
+	movq	24+32(%rsp),%r12
+	leaq	192(%rsp),%rdi
+	call	__ecp_nistz256_mul_montx
+
+	leaq	160(%rsp),%rbx
+	leaq	0(%rsp),%rdi
+	call	__ecp_nistz256_sub_fromx
+
+	orq	%r13,%r12
+	orq	%r8,%r12
+	orq	%r9,%r12
+
+.byte	102,73,15,126,208
+.byte	102,73,15,126,217
+	orq	%r8,%r12
+.byte	0x3e
+	jnz	.Ladd_proceedx
+
+
+
+	testq	%r9,%r9
+	jz	.Ladd_doublex
+
+
+
+
+
+
+.byte	102,72,15,126,199
+	pxor	%xmm0,%xmm0
+	movdqu	%xmm0,0(%rdi)
+	movdqu	%xmm0,16(%rdi)
+	movdqu	%xmm0,32(%rdi)
+	movdqu	%xmm0,48(%rdi)
+	movdqu	%xmm0,64(%rdi)
+	movdqu	%xmm0,80(%rdi)
+	jmp	.Ladd_donex
+
+.align	32
+.Ladd_doublex:
+.byte	102,72,15,126,206
+.byte	102,72,15,126,199
+	addq	$416,%rsp
+.cfi_adjust_cfa_offset	-416
+	jmp	.Lpoint_double_shortcutx
+.cfi_adjust_cfa_offset	416
+
+.align	32
+.Ladd_proceedx:
+	movq	0+64(%rsp),%rdx
+	movq	8+64(%rsp),%r14
+	leaq	-128+64(%rsp),%rsi
+	movq	16+64(%rsp),%r15
+	movq	24+64(%rsp),%r8
+	leaq	96(%rsp),%rdi
+	call	__ecp_nistz256_sqr_montx
+
+	movq	448(%rsp),%rdx
+	leaq	448(%rsp),%rbx
+	movq	0+0(%rsp),%r9
+	movq	8+0(%rsp),%r10
+	leaq	-128+0(%rsp),%rsi
+	movq	16+0(%rsp),%r11
+	movq	24+0(%rsp),%r12
+	leaq	352(%rsp),%rdi
+	call	__ecp_nistz256_mul_montx
+
+	movq	0+0(%rsp),%rdx
+	movq	8+0(%rsp),%r14
+	leaq	-128+0(%rsp),%rsi
+	movq	16+0(%rsp),%r15
+	movq	24+0(%rsp),%r8
+	leaq	32(%rsp),%rdi
+	call	__ecp_nistz256_sqr_montx
+
+	movq	544(%rsp),%rdx
+	leaq	544(%rsp),%rbx
+	movq	0+352(%rsp),%r9
+	movq	8+352(%rsp),%r10
+	leaq	-128+352(%rsp),%rsi
+	movq	16+352(%rsp),%r11
+	movq	24+352(%rsp),%r12
+	leaq	352(%rsp),%rdi
+	call	__ecp_nistz256_mul_montx
+
+	movq	0(%rsp),%rdx
+	leaq	0(%rsp),%rbx
+	movq	0+32(%rsp),%r9
+	movq	8+32(%rsp),%r10
+	leaq	-128+32(%rsp),%rsi
+	movq	16+32(%rsp),%r11
+	movq	24+32(%rsp),%r12
+	leaq	128(%rsp),%rdi
+	call	__ecp_nistz256_mul_montx
+
+	movq	160(%rsp),%rdx
+	leaq	160(%rsp),%rbx
+	movq	0+32(%rsp),%r9
+	movq	8+32(%rsp),%r10
+	leaq	-128+32(%rsp),%rsi
+	movq	16+32(%rsp),%r11
+	movq	24+32(%rsp),%r12
+	leaq	192(%rsp),%rdi
+	call	__ecp_nistz256_mul_montx
+
+
+
+
+	xorq	%r11,%r11
+	addq	%r12,%r12
+	leaq	96(%rsp),%rsi
+	adcq	%r13,%r13
+	movq	%r12,%rax
+	adcq	%r8,%r8
+	adcq	%r9,%r9
+	movq	%r13,%rbp
+	adcq	$0,%r11
+
+	subq	$-1,%r12
+	movq	%r8,%rcx
+	sbbq	%r14,%r13
+	sbbq	$0,%r8
+	movq	%r9,%r10
+	sbbq	%r15,%r9
+	sbbq	$0,%r11
+
+	cmovcq	%rax,%r12
+	movq	0(%rsi),%rax
+	cmovcq	%rbp,%r13
+	movq	8(%rsi),%rbp
+	cmovcq	%rcx,%r8
+	movq	16(%rsi),%rcx
+	cmovcq	%r10,%r9
+	movq	24(%rsi),%r10
+
+	call	__ecp_nistz256_subx
+
+	leaq	128(%rsp),%rbx
+	leaq	288(%rsp),%rdi
+	call	__ecp_nistz256_sub_fromx
+
+	movq	192+0(%rsp),%rax
+	movq	192+8(%rsp),%rbp
+	movq	192+16(%rsp),%rcx
+	movq	192+24(%rsp),%r10
+	leaq	320(%rsp),%rdi
+
+	call	__ecp_nistz256_subx
+
+	movq	%r12,0(%rdi)
+	movq	%r13,8(%rdi)
+	movq	%r8,16(%rdi)
+	movq	%r9,24(%rdi)
+	movq	128(%rsp),%rdx
+	leaq	128(%rsp),%rbx
+	movq	0+224(%rsp),%r9
+	movq	8+224(%rsp),%r10
+	leaq	-128+224(%rsp),%rsi
+	movq	16+224(%rsp),%r11
+	movq	24+224(%rsp),%r12
+	leaq	256(%rsp),%rdi
+	call	__ecp_nistz256_mul_montx
+
+	movq	320(%rsp),%rdx
+	leaq	320(%rsp),%rbx
+	movq	0+64(%rsp),%r9
+	movq	8+64(%rsp),%r10
+	leaq	-128+64(%rsp),%rsi
+	movq	16+64(%rsp),%r11
+	movq	24+64(%rsp),%r12
+	leaq	320(%rsp),%rdi
+	call	__ecp_nistz256_mul_montx
+
+	leaq	256(%rsp),%rbx
+	leaq	320(%rsp),%rdi
+	call	__ecp_nistz256_sub_fromx
+
+.byte	102,72,15,126,199
+
+	movdqa	%xmm5,%xmm0
+	movdqa	%xmm5,%xmm1
+	pandn	352(%rsp),%xmm0
+	movdqa	%xmm5,%xmm2
+	pandn	352+16(%rsp),%xmm1
+	movdqa	%xmm5,%xmm3
+	pand	544(%rsp),%xmm2
+	pand	544+16(%rsp),%xmm3
+	por	%xmm0,%xmm2
+	por	%xmm1,%xmm3
+
+	movdqa	%xmm4,%xmm0
+	movdqa	%xmm4,%xmm1
+	pandn	%xmm2,%xmm0
+	movdqa	%xmm4,%xmm2
+	pandn	%xmm3,%xmm1
+	movdqa	%xmm4,%xmm3
+	pand	448(%rsp),%xmm2
+	pand	448+16(%rsp),%xmm3
+	por	%xmm0,%xmm2
+	por	%xmm1,%xmm3
+	movdqu	%xmm2,64(%rdi)
+	movdqu	%xmm3,80(%rdi)
+
+	movdqa	%xmm5,%xmm0
+	movdqa	%xmm5,%xmm1
+	pandn	288(%rsp),%xmm0
+	movdqa	%xmm5,%xmm2
+	pandn	288+16(%rsp),%xmm1
+	movdqa	%xmm5,%xmm3
+	pand	480(%rsp),%xmm2
+	pand	480+16(%rsp),%xmm3
+	por	%xmm0,%xmm2
+	por	%xmm1,%xmm3
+
+	movdqa	%xmm4,%xmm0
+	movdqa	%xmm4,%xmm1
+	pandn	%xmm2,%xmm0
+	movdqa	%xmm4,%xmm2
+	pandn	%xmm3,%xmm1
+	movdqa	%xmm4,%xmm3
+	pand	384(%rsp),%xmm2
+	pand	384+16(%rsp),%xmm3
+	por	%xmm0,%xmm2
+	por	%xmm1,%xmm3
+	movdqu	%xmm2,0(%rdi)
+	movdqu	%xmm3,16(%rdi)
+
+	movdqa	%xmm5,%xmm0
+	movdqa	%xmm5,%xmm1
+	pandn	320(%rsp),%xmm0
+	movdqa	%xmm5,%xmm2
+	pandn	320+16(%rsp),%xmm1
+	movdqa	%xmm5,%xmm3
+	pand	512(%rsp),%xmm2
+	pand	512+16(%rsp),%xmm3
+	por	%xmm0,%xmm2
+	por	%xmm1,%xmm3
+
+	movdqa	%xmm4,%xmm0
+	movdqa	%xmm4,%xmm1
+	pandn	%xmm2,%xmm0
+	movdqa	%xmm4,%xmm2
+	pandn	%xmm3,%xmm1
+	movdqa	%xmm4,%xmm3
+	pand	416(%rsp),%xmm2
+	pand	416+16(%rsp),%xmm3
+	por	%xmm0,%xmm2
+	por	%xmm1,%xmm3
+	movdqu	%xmm2,32(%rdi)
+	movdqu	%xmm3,48(%rdi)
+
+.Ladd_donex:
+	leaq	576+56(%rsp),%rsi
+.cfi_def_cfa	%rsi,8
+	movq	-48(%rsi),%r15
+.cfi_restore	%r15
+	movq	-40(%rsi),%r14
+.cfi_restore	%r14
+	movq	-32(%rsi),%r13
+.cfi_restore	%r13
+	movq	-24(%rsi),%r12
+.cfi_restore	%r12
+	movq	-16(%rsi),%rbx
+.cfi_restore	%rbx
+	movq	-8(%rsi),%rbp
+.cfi_restore	%rbp
+	leaq	(%rsi),%rsp
+.cfi_def_cfa_register	%rsp
+.Lpoint_addx_epilogue:
+	.byte	0xf3,0xc3
+.cfi_endproc	
+.size	ecp_nistz256_point_addx,.-ecp_nistz256_point_addx
+.type	ecp_nistz256_point_add_affinex,@function
+.align	32
+ecp_nistz256_point_add_affinex:
+.cfi_startproc	
+.Lpoint_add_affinex:
+	pushq	%rbp
+.cfi_adjust_cfa_offset	8
+.cfi_offset	%rbp,-16
+	pushq	%rbx
+.cfi_adjust_cfa_offset	8
+.cfi_offset	%rbx,-24
+	pushq	%r12
+.cfi_adjust_cfa_offset	8
+.cfi_offset	%r12,-32
+	pushq	%r13
+.cfi_adjust_cfa_offset	8
+.cfi_offset	%r13,-40
+	pushq	%r14
+.cfi_adjust_cfa_offset	8
+.cfi_offset	%r14,-48
+	pushq	%r15
+.cfi_adjust_cfa_offset	8
+.cfi_offset	%r15,-56
+	subq	$480+8,%rsp
+.cfi_adjust_cfa_offset	32*15+8
+.Ladd_affinex_body:
+
+	movdqu	0(%rsi),%xmm0
+	movq	%rdx,%rbx
+	movdqu	16(%rsi),%xmm1
+	movdqu	32(%rsi),%xmm2
+	movdqu	48(%rsi),%xmm3
+	movdqu	64(%rsi),%xmm4
+	movdqu	80(%rsi),%xmm5
+	movq	64+0(%rsi),%rdx
+	movq	64+8(%rsi),%r14
+	movq	64+16(%rsi),%r15
+	movq	64+24(%rsi),%r8
+	movdqa	%xmm0,320(%rsp)
+	movdqa	%xmm1,320+16(%rsp)
+	movdqa	%xmm2,352(%rsp)
+	movdqa	%xmm3,352+16(%rsp)
+	movdqa	%xmm4,384(%rsp)
+	movdqa	%xmm5,384+16(%rsp)
+	por	%xmm4,%xmm5
+
+	movdqu	0(%rbx),%xmm0
+	pshufd	$0xb1,%xmm5,%xmm3
+	movdqu	16(%rbx),%xmm1
+	movdqu	32(%rbx),%xmm2
+	por	%xmm3,%xmm5
+	movdqu	48(%rbx),%xmm3
+	movdqa	%xmm0,416(%rsp)
+	pshufd	$0x1e,%xmm5,%xmm4
+	movdqa	%xmm1,416+16(%rsp)
+	por	%xmm0,%xmm1
+.byte	102,72,15,110,199
+	movdqa	%xmm2,448(%rsp)
+	movdqa	%xmm3,448+16(%rsp)
+	por	%xmm2,%xmm3
+	por	%xmm4,%xmm5
+	pxor	%xmm4,%xmm4
+	por	%xmm1,%xmm3
+
+	leaq	64-128(%rsi),%rsi
+	leaq	32(%rsp),%rdi
+	call	__ecp_nistz256_sqr_montx
+
+	pcmpeqd	%xmm4,%xmm5
+	pshufd	$0xb1,%xmm3,%xmm4
+	movq	0(%rbx),%rdx
+
+	movq	%r12,%r9
+	por	%xmm3,%xmm4
+	pshufd	$0,%xmm5,%xmm5
+	pshufd	$0x1e,%xmm4,%xmm3
+	movq	%r13,%r10
+	por	%xmm3,%xmm4
+	pxor	%xmm3,%xmm3
+	movq	%r14,%r11
+	pcmpeqd	%xmm3,%xmm4
+	pshufd	$0,%xmm4,%xmm4
+
+	leaq	32-128(%rsp),%rsi
+	movq	%r15,%r12
+	leaq	0(%rsp),%rdi
+	call	__ecp_nistz256_mul_montx
+
+	leaq	320(%rsp),%rbx
+	leaq	64(%rsp),%rdi
+	call	__ecp_nistz256_sub_fromx
+
+	movq	384(%rsp),%rdx
+	leaq	384(%rsp),%rbx
+	movq	0+32(%rsp),%r9
+	movq	8+32(%rsp),%r10
+	leaq	-128+32(%rsp),%rsi
+	movq	16+32(%rsp),%r11
+	movq	24+32(%rsp),%r12
+	leaq	32(%rsp),%rdi
+	call	__ecp_nistz256_mul_montx
+
+	movq	384(%rsp),%rdx
+	leaq	384(%rsp),%rbx
+	movq	0+64(%rsp),%r9
+	movq	8+64(%rsp),%r10
+	leaq	-128+64(%rsp),%rsi
+	movq	16+64(%rsp),%r11
+	movq	24+64(%rsp),%r12
+	leaq	288(%rsp),%rdi
+	call	__ecp_nistz256_mul_montx
+
+	movq	448(%rsp),%rdx
+	leaq	448(%rsp),%rbx
+	movq	0+32(%rsp),%r9
+	movq	8+32(%rsp),%r10
+	leaq	-128+32(%rsp),%rsi
+	movq	16+32(%rsp),%r11
+	movq	24+32(%rsp),%r12
+	leaq	32(%rsp),%rdi
+	call	__ecp_nistz256_mul_montx
+
+	leaq	352(%rsp),%rbx
+	leaq	96(%rsp),%rdi
+	call	__ecp_nistz256_sub_fromx
+
+	movq	0+64(%rsp),%rdx
+	movq	8+64(%rsp),%r14
+	leaq	-128+64(%rsp),%rsi
+	movq	16+64(%rsp),%r15
+	movq	24+64(%rsp),%r8
+	leaq	128(%rsp),%rdi
+	call	__ecp_nistz256_sqr_montx
+
+	movq	0+96(%rsp),%rdx
+	movq	8+96(%rsp),%r14
+	leaq	-128+96(%rsp),%rsi
+	movq	16+96(%rsp),%r15
+	movq	24+96(%rsp),%r8
+	leaq	192(%rsp),%rdi
+	call	__ecp_nistz256_sqr_montx
+
+	movq	128(%rsp),%rdx
+	leaq	128(%rsp),%rbx
+	movq	0+64(%rsp),%r9
+	movq	8+64(%rsp),%r10
+	leaq	-128+64(%rsp),%rsi
+	movq	16+64(%rsp),%r11
+	movq	24+64(%rsp),%r12
+	leaq	160(%rsp),%rdi
+	call	__ecp_nistz256_mul_montx
+
+	movq	320(%rsp),%rdx
+	leaq	320(%rsp),%rbx
+	movq	0+128(%rsp),%r9
+	movq	8+128(%rsp),%r10
+	leaq	-128+128(%rsp),%rsi
+	movq	16+128(%rsp),%r11
+	movq	24+128(%rsp),%r12
+	leaq	0(%rsp),%rdi
+	call	__ecp_nistz256_mul_montx
+
+
+
+
+	xorq	%r11,%r11
+	addq	%r12,%r12
+	leaq	192(%rsp),%rsi
+	adcq	%r13,%r13
+	movq	%r12,%rax
+	adcq	%r8,%r8
+	adcq	%r9,%r9
+	movq	%r13,%rbp
+	adcq	$0,%r11
+
+	subq	$-1,%r12
+	movq	%r8,%rcx
+	sbbq	%r14,%r13
+	sbbq	$0,%r8
+	movq	%r9,%r10
+	sbbq	%r15,%r9
+	sbbq	$0,%r11
+
+	cmovcq	%rax,%r12
+	movq	0(%rsi),%rax
+	cmovcq	%rbp,%r13
+	movq	8(%rsi),%rbp
+	cmovcq	%rcx,%r8
+	movq	16(%rsi),%rcx
+	cmovcq	%r10,%r9
+	movq	24(%rsi),%r10
+
+	call	__ecp_nistz256_subx
+
+	leaq	160(%rsp),%rbx
+	leaq	224(%rsp),%rdi
+	call	__ecp_nistz256_sub_fromx
+
+	movq	0+0(%rsp),%rax
+	movq	0+8(%rsp),%rbp
+	movq	0+16(%rsp),%rcx
+	movq	0+24(%rsp),%r10
+	leaq	64(%rsp),%rdi
+
+	call	__ecp_nistz256_subx
+
+	movq	%r12,0(%rdi)
+	movq	%r13,8(%rdi)
+	movq	%r8,16(%rdi)
+	movq	%r9,24(%rdi)
+	movq	352(%rsp),%rdx
+	leaq	352(%rsp),%rbx
+	movq	0+160(%rsp),%r9
+	movq	8+160(%rsp),%r10
+	leaq	-128+160(%rsp),%rsi
+	movq	16+160(%rsp),%r11
+	movq	24+160(%rsp),%r12
+	leaq	32(%rsp),%rdi
+	call	__ecp_nistz256_mul_montx
+
+	movq	96(%rsp),%rdx
+	leaq	96(%rsp),%rbx
+	movq	0+64(%rsp),%r9
+	movq	8+64(%rsp),%r10
+	leaq	-128+64(%rsp),%rsi
+	movq	16+64(%rsp),%r11
+	movq	24+64(%rsp),%r12
+	leaq	64(%rsp),%rdi
+	call	__ecp_nistz256_mul_montx
+
+	leaq	32(%rsp),%rbx
+	leaq	256(%rsp),%rdi
+	call	__ecp_nistz256_sub_fromx
+
+.byte	102,72,15,126,199
+
+	movdqa	%xmm5,%xmm0
+	movdqa	%xmm5,%xmm1
+	pandn	288(%rsp),%xmm0
+	movdqa	%xmm5,%xmm2
+	pandn	288+16(%rsp),%xmm1
+	movdqa	%xmm5,%xmm3
+	pand	.LONE_mont(%rip),%xmm2
+	pand	.LONE_mont+16(%rip),%xmm3
+	por	%xmm0,%xmm2
+	por	%xmm1,%xmm3
+
+	movdqa	%xmm4,%xmm0
+	movdqa	%xmm4,%xmm1
+	pandn	%xmm2,%xmm0
+	movdqa	%xmm4,%xmm2
+	pandn	%xmm3,%xmm1
+	movdqa	%xmm4,%xmm3
+	pand	384(%rsp),%xmm2
+	pand	384+16(%rsp),%xmm3
+	por	%xmm0,%xmm2
+	por	%xmm1,%xmm3
+	movdqu	%xmm2,64(%rdi)
+	movdqu	%xmm3,80(%rdi)
+
+	movdqa	%xmm5,%xmm0
+	movdqa	%xmm5,%xmm1
+	pandn	224(%rsp),%xmm0
+	movdqa	%xmm5,%xmm2
+	pandn	224+16(%rsp),%xmm1
+	movdqa	%xmm5,%xmm3
+	pand	416(%rsp),%xmm2
+	pand	416+16(%rsp),%xmm3
+	por	%xmm0,%xmm2
+	por	%xmm1,%xmm3
+
+	movdqa	%xmm4,%xmm0
+	movdqa	%xmm4,%xmm1
+	pandn	%xmm2,%xmm0
+	movdqa	%xmm4,%xmm2
+	pandn	%xmm3,%xmm1
+	movdqa	%xmm4,%xmm3
+	pand	320(%rsp),%xmm2
+	pand	320+16(%rsp),%xmm3
+	por	%xmm0,%xmm2
+	por	%xmm1,%xmm3
+	movdqu	%xmm2,0(%rdi)
+	movdqu	%xmm3,16(%rdi)
+
+	movdqa	%xmm5,%xmm0
+	movdqa	%xmm5,%xmm1
+	pandn	256(%rsp),%xmm0
+	movdqa	%xmm5,%xmm2
+	pandn	256+16(%rsp),%xmm1
+	movdqa	%xmm5,%xmm3
+	pand	448(%rsp),%xmm2
+	pand	448+16(%rsp),%xmm3
+	por	%xmm0,%xmm2
+	por	%xmm1,%xmm3
+
+	movdqa	%xmm4,%xmm0
+	movdqa	%xmm4,%xmm1
+	pandn	%xmm2,%xmm0
+	movdqa	%xmm4,%xmm2
+	pandn	%xmm3,%xmm1
+	movdqa	%xmm4,%xmm3
+	pand	352(%rsp),%xmm2
+	pand	352+16(%rsp),%xmm3
+	por	%xmm0,%xmm2
+	por	%xmm1,%xmm3
+	movdqu	%xmm2,32(%rdi)
+	movdqu	%xmm3,48(%rdi)
+
+	leaq	480+56(%rsp),%rsi
+.cfi_def_cfa	%rsi,8
+	movq	-48(%rsi),%r15
+.cfi_restore	%r15
+	movq	-40(%rsi),%r14
+.cfi_restore	%r14
+	movq	-32(%rsi),%r13
+.cfi_restore	%r13
+	movq	-24(%rsi),%r12
+.cfi_restore	%r12
+	movq	-16(%rsi),%rbx
+.cfi_restore	%rbx
+	movq	-8(%rsi),%rbp
+.cfi_restore	%rbp
+	leaq	(%rsi),%rsp
+.cfi_def_cfa_register	%rsp
+.Ladd_affinex_epilogue:
+	.byte	0xf3,0xc3
+.cfi_endproc	
+.size	ecp_nistz256_point_add_affinex,.-ecp_nistz256_point_add_affinex
+#endif
+.section	.note.GNU-stack,"",@progbits
--- /dev/null
+++ b/third_party/boringssl/linux-x86_64/crypto/fipsmodule/p256_beeu-x86_64-asm.S
@@ -1,0 +1,343 @@
+// This file is generated from a similarly-named Perl script in the BoringSSL
+// source tree. Do not edit by hand.
+
+#if defined(__has_feature)
+#if __has_feature(memory_sanitizer) && !defined(OPENSSL_NO_ASM)
+#define OPENSSL_NO_ASM
+#endif
+#endif
+
+#if defined(__x86_64__) && !defined(OPENSSL_NO_ASM)
+#if defined(BORINGSSL_PREFIX)
+#include <boringssl_prefix_symbols_asm.h>
+#endif
+.text	
+
+.type	beeu_mod_inverse_vartime,@function
+.hidden	beeu_mod_inverse_vartime
+.globl	beeu_mod_inverse_vartime
+.hidden beeu_mod_inverse_vartime
+.align	32
+beeu_mod_inverse_vartime:
+.cfi_startproc	
+	pushq	%rbp
+.cfi_adjust_cfa_offset	8
+.cfi_offset	rbp,-16
+	pushq	%r12
+.cfi_adjust_cfa_offset	8
+.cfi_offset	r12,-24
+	pushq	%r13
+.cfi_adjust_cfa_offset	8
+.cfi_offset	r13,-32
+	pushq	%r14
+.cfi_adjust_cfa_offset	8
+.cfi_offset	r14,-40
+	pushq	%r15
+.cfi_adjust_cfa_offset	8
+.cfi_offset	r15,-48
+	pushq	%rbx
+.cfi_adjust_cfa_offset	8
+.cfi_offset	rbx,-56
+	pushq	%rsi
+.cfi_adjust_cfa_offset	8
+.cfi_offset	rsi,-64
+
+	subq	$80,%rsp
+.cfi_adjust_cfa_offset	80
+	movq	%rdi,0(%rsp)
+
+
+	movq	$1,%r8
+	xorq	%r9,%r9
+	xorq	%r10,%r10
+	xorq	%r11,%r11
+	xorq	%rdi,%rdi
+
+	xorq	%r12,%r12
+	xorq	%r13,%r13
+	xorq	%r14,%r14
+	xorq	%r15,%r15
+	xorq	%rbp,%rbp
+
+
+	vmovdqu	0(%rsi),%xmm0
+	vmovdqu	16(%rsi),%xmm1
+	vmovdqu	%xmm0,48(%rsp)
+	vmovdqu	%xmm1,64(%rsp)
+
+	vmovdqu	0(%rdx),%xmm0
+	vmovdqu	16(%rdx),%xmm1
+	vmovdqu	%xmm0,16(%rsp)
+	vmovdqu	%xmm1,32(%rsp)
+
+.Lbeeu_loop:
+	xorq	%rbx,%rbx
+	orq	48(%rsp),%rbx
+	orq	56(%rsp),%rbx
+	orq	64(%rsp),%rbx
+	orq	72(%rsp),%rbx
+	jz	.Lbeeu_loop_end
+
+
+
+
+
+
+
+
+
+
+	movq	$1,%rcx
+
+
+.Lbeeu_shift_loop_XB:
+	movq	%rcx,%rbx
+	andq	48(%rsp),%rbx
+	jnz	.Lbeeu_shift_loop_end_XB
+
+
+	movq	$1,%rbx
+	andq	%r8,%rbx
+	jz	.Lshift1_0
+	addq	0(%rdx),%r8
+	adcq	8(%rdx),%r9
+	adcq	16(%rdx),%r10
+	adcq	24(%rdx),%r11
+	adcq	$0,%rdi
+
+.Lshift1_0:
+	shrdq	$1,%r9,%r8
+	shrdq	$1,%r10,%r9
+	shrdq	$1,%r11,%r10
+	shrdq	$1,%rdi,%r11
+	shrq	$1,%rdi
+
+	shlq	$1,%rcx
+
+
+
+
+
+	cmpq	$0x8000000,%rcx
+	jne	.Lbeeu_shift_loop_XB
+
+.Lbeeu_shift_loop_end_XB:
+	bsfq	%rcx,%rcx
+	testq	%rcx,%rcx
+	jz	.Lbeeu_no_shift_XB
+
+
+
+	movq	8+48(%rsp),%rax
+	movq	16+48(%rsp),%rbx
+	movq	24+48(%rsp),%rsi
+
+	shrdq	%cl,%rax,0+48(%rsp)
+	shrdq	%cl,%rbx,8+48(%rsp)
+	shrdq	%cl,%rsi,16+48(%rsp)
+
+	shrq	%cl,%rsi
+	movq	%rsi,24+48(%rsp)
+
+
+.Lbeeu_no_shift_XB:
+
+	movq	$1,%rcx
+
+
+.Lbeeu_shift_loop_YA:
+	movq	%rcx,%rbx
+	andq	16(%rsp),%rbx
+	jnz	.Lbeeu_shift_loop_end_YA
+
+
+	movq	$1,%rbx
+	andq	%r12,%rbx
+	jz	.Lshift1_1
+	addq	0(%rdx),%r12
+	adcq	8(%rdx),%r13
+	adcq	16(%rdx),%r14
+	adcq	24(%rdx),%r15
+	adcq	$0,%rbp
+
+.Lshift1_1:
+	shrdq	$1,%r13,%r12
+	shrdq	$1,%r14,%r13
+	shrdq	$1,%r15,%r14
+	shrdq	$1,%rbp,%r15
+	shrq	$1,%rbp
+
+	shlq	$1,%rcx
+
+
+
+
+
+	cmpq	$0x8000000,%rcx
+	jne	.Lbeeu_shift_loop_YA
+
+.Lbeeu_shift_loop_end_YA:
+	bsfq	%rcx,%rcx
+	testq	%rcx,%rcx
+	jz	.Lbeeu_no_shift_YA
+
+
+
+	movq	8+16(%rsp),%rax
+	movq	16+16(%rsp),%rbx
+	movq	24+16(%rsp),%rsi
+
+	shrdq	%cl,%rax,0+16(%rsp)
+	shrdq	%cl,%rbx,8+16(%rsp)
+	shrdq	%cl,%rsi,16+16(%rsp)
+
+	shrq	%cl,%rsi
+	movq	%rsi,24+16(%rsp)
+
+
+.Lbeeu_no_shift_YA:
+
+	movq	48(%rsp),%rax
+	movq	56(%rsp),%rbx
+	movq	64(%rsp),%rsi
+	movq	72(%rsp),%rcx
+	subq	16(%rsp),%rax
+	sbbq	24(%rsp),%rbx
+	sbbq	32(%rsp),%rsi
+	sbbq	40(%rsp),%rcx
+	jnc	.Lbeeu_B_bigger_than_A
+
+
+	movq	16(%rsp),%rax
+	movq	24(%rsp),%rbx
+	movq	32(%rsp),%rsi
+	movq	40(%rsp),%rcx
+	subq	48(%rsp),%rax
+	sbbq	56(%rsp),%rbx
+	sbbq	64(%rsp),%rsi
+	sbbq	72(%rsp),%rcx
+	movq	%rax,16(%rsp)
+	movq	%rbx,24(%rsp)
+	movq	%rsi,32(%rsp)
+	movq	%rcx,40(%rsp)
+
+
+	addq	%r8,%r12
+	adcq	%r9,%r13
+	adcq	%r10,%r14
+	adcq	%r11,%r15
+	adcq	%rdi,%rbp
+	jmp	.Lbeeu_loop
+
+.Lbeeu_B_bigger_than_A:
+
+	movq	%rax,48(%rsp)
+	movq	%rbx,56(%rsp)
+	movq	%rsi,64(%rsp)
+	movq	%rcx,72(%rsp)
+
+
+	addq	%r12,%r8
+	adcq	%r13,%r9
+	adcq	%r14,%r10
+	adcq	%r15,%r11
+	adcq	%rbp,%rdi
+
+	jmp	.Lbeeu_loop
+
+.Lbeeu_loop_end:
+
+
+
+
+	movq	16(%rsp),%rbx
+	subq	$1,%rbx
+	orq	24(%rsp),%rbx
+	orq	32(%rsp),%rbx
+	orq	40(%rsp),%rbx
+
+	jnz	.Lbeeu_err
+
+
+
+
+	movq	0(%rdx),%r8
+	movq	8(%rdx),%r9
+	movq	16(%rdx),%r10
+	movq	24(%rdx),%r11
+	xorq	%rdi,%rdi
+
+.Lbeeu_reduction_loop:
+	movq	%r12,16(%rsp)
+	movq	%r13,24(%rsp)
+	movq	%r14,32(%rsp)
+	movq	%r15,40(%rsp)
+	movq	%rbp,48(%rsp)
+
+
+	subq	%r8,%r12
+	sbbq	%r9,%r13
+	sbbq	%r10,%r14
+	sbbq	%r11,%r15
+	sbbq	$0,%rbp
+
+
+	cmovcq	16(%rsp),%r12
+	cmovcq	24(%rsp),%r13
+	cmovcq	32(%rsp),%r14
+	cmovcq	40(%rsp),%r15
+	jnc	.Lbeeu_reduction_loop
+
+
+	subq	%r12,%r8
+	sbbq	%r13,%r9
+	sbbq	%r14,%r10
+	sbbq	%r15,%r11
+
+.Lbeeu_save:
+
+	movq	0(%rsp),%rdi
+
+	movq	%r8,0(%rdi)
+	movq	%r9,8(%rdi)
+	movq	%r10,16(%rdi)
+	movq	%r11,24(%rdi)
+
+
+	movq	$1,%rax
+	jmp	.Lbeeu_finish
+
+.Lbeeu_err:
+
+	xorq	%rax,%rax
+
+.Lbeeu_finish:
+	addq	$80,%rsp
+.cfi_adjust_cfa_offset	-80
+	popq	%rsi
+.cfi_adjust_cfa_offset	-8
+.cfi_restore	rsi
+	popq	%rbx
+.cfi_adjust_cfa_offset	-8
+.cfi_restore	rbx
+	popq	%r15
+.cfi_adjust_cfa_offset	-8
+.cfi_restore	r15
+	popq	%r14
+.cfi_adjust_cfa_offset	-8
+.cfi_restore	r14
+	popq	%r13
+.cfi_adjust_cfa_offset	-8
+.cfi_restore	r13
+	popq	%r12
+.cfi_adjust_cfa_offset	-8
+.cfi_restore	r12
+	popq	%rbp
+.cfi_adjust_cfa_offset	-8
+.cfi_restore	rbp
+	.byte	0xf3,0xc3
+.cfi_endproc	
+
+.size	beeu_mod_inverse_vartime, .-beeu_mod_inverse_vartime
+#endif
+.section	.note.GNU-stack,"",@progbits
--- /dev/null
+++ b/third_party/boringssl/linux-x86_64/crypto/fipsmodule/rdrand-x86_64.S
@@ -1,0 +1,63 @@
+// This file is generated from a similarly-named Perl script in the BoringSSL
+// source tree. Do not edit by hand.
+
+#if defined(__has_feature)
+#if __has_feature(memory_sanitizer) && !defined(OPENSSL_NO_ASM)
+#define OPENSSL_NO_ASM
+#endif
+#endif
+
+#if defined(__x86_64__) && !defined(OPENSSL_NO_ASM)
+#if defined(BORINGSSL_PREFIX)
+#include <boringssl_prefix_symbols_asm.h>
+#endif
+.text	
+
+
+
+
+.globl	CRYPTO_rdrand
+.hidden CRYPTO_rdrand
+.type	CRYPTO_rdrand,@function
+.align	16
+CRYPTO_rdrand:
+.cfi_startproc	
+	xorq	%rax,%rax
+.byte	72,15,199,242
+
+	adcq	%rax,%rax
+	movq	%rdx,0(%rdi)
+	.byte	0xf3,0xc3
+.cfi_endproc	
+.size	CRYPTO_rdrand,.-CRYPTO_rdrand
+
+
+
+
+
+.globl	CRYPTO_rdrand_multiple8_buf
+.hidden CRYPTO_rdrand_multiple8_buf
+.type	CRYPTO_rdrand_multiple8_buf,@function
+.align	16
+CRYPTO_rdrand_multiple8_buf:
+.cfi_startproc	
+	testq	%rsi,%rsi
+	jz	.Lout
+	movq	$8,%rdx
+.Lloop:
+.byte	72,15,199,241
+	jnc	.Lerr
+	movq	%rcx,0(%rdi)
+	addq	%rdx,%rdi
+	subq	%rdx,%rsi
+	jnz	.Lloop
+.Lout:
+	movq	$1,%rax
+	.byte	0xf3,0xc3
+.Lerr:
+	xorq	%rax,%rax
+	.byte	0xf3,0xc3
+.cfi_endproc	
+.size	CRYPTO_rdrand_multiple8_buf,.-CRYPTO_rdrand_multiple8_buf
+#endif
+.section	.note.GNU-stack,"",@progbits
--- /dev/null
+++ b/third_party/boringssl/linux-x86_64/crypto/fipsmodule/rsaz-avx2.S
@@ -1,0 +1,1749 @@
+// This file is generated from a similarly-named Perl script in the BoringSSL
+// source tree. Do not edit by hand.
+
+#if defined(__has_feature)
+#if __has_feature(memory_sanitizer) && !defined(OPENSSL_NO_ASM)
+#define OPENSSL_NO_ASM
+#endif
+#endif
+
+#if defined(__x86_64__) && !defined(OPENSSL_NO_ASM)
+#if defined(BORINGSSL_PREFIX)
+#include <boringssl_prefix_symbols_asm.h>
+#endif
+.text	
+
+.globl	rsaz_1024_sqr_avx2
+.hidden rsaz_1024_sqr_avx2
+.type	rsaz_1024_sqr_avx2,@function
+.align	64
+rsaz_1024_sqr_avx2:
+.cfi_startproc	
+	leaq	(%rsp),%rax
+.cfi_def_cfa_register	%rax
+	pushq	%rbx
+.cfi_offset	%rbx,-16
+	pushq	%rbp
+.cfi_offset	%rbp,-24
+	pushq	%r12
+.cfi_offset	%r12,-32
+	pushq	%r13
+.cfi_offset	%r13,-40
+	pushq	%r14
+.cfi_offset	%r14,-48
+	pushq	%r15
+.cfi_offset	%r15,-56
+	vzeroupper
+	movq	%rax,%rbp
+.cfi_def_cfa_register	%rbp
+	movq	%rdx,%r13
+	subq	$832,%rsp
+	movq	%r13,%r15
+	subq	$-128,%rdi
+	subq	$-128,%rsi
+	subq	$-128,%r13
+
+	andq	$4095,%r15
+	addq	$320,%r15
+	shrq	$12,%r15
+	vpxor	%ymm9,%ymm9,%ymm9
+	jz	.Lsqr_1024_no_n_copy
+
+
+
+
+
+	subq	$320,%rsp
+	vmovdqu	0-128(%r13),%ymm0
+	andq	$-2048,%rsp
+	vmovdqu	32-128(%r13),%ymm1
+	vmovdqu	64-128(%r13),%ymm2
+	vmovdqu	96-128(%r13),%ymm3
+	vmovdqu	128-128(%r13),%ymm4
+	vmovdqu	160-128(%r13),%ymm5
+	vmovdqu	192-128(%r13),%ymm6
+	vmovdqu	224-128(%r13),%ymm7
+	vmovdqu	256-128(%r13),%ymm8
+	leaq	832+128(%rsp),%r13
+	vmovdqu	%ymm0,0-128(%r13)
+	vmovdqu	%ymm1,32-128(%r13)
+	vmovdqu	%ymm2,64-128(%r13)
+	vmovdqu	%ymm3,96-128(%r13)
+	vmovdqu	%ymm4,128-128(%r13)
+	vmovdqu	%ymm5,160-128(%r13)
+	vmovdqu	%ymm6,192-128(%r13)
+	vmovdqu	%ymm7,224-128(%r13)
+	vmovdqu	%ymm8,256-128(%r13)
+	vmovdqu	%ymm9,288-128(%r13)
+
+.Lsqr_1024_no_n_copy:
+	andq	$-1024,%rsp
+
+	vmovdqu	32-128(%rsi),%ymm1
+	vmovdqu	64-128(%rsi),%ymm2
+	vmovdqu	96-128(%rsi),%ymm3
+	vmovdqu	128-128(%rsi),%ymm4
+	vmovdqu	160-128(%rsi),%ymm5
+	vmovdqu	192-128(%rsi),%ymm6
+	vmovdqu	224-128(%rsi),%ymm7
+	vmovdqu	256-128(%rsi),%ymm8
+
+	leaq	192(%rsp),%rbx
+	vmovdqu	.Land_mask(%rip),%ymm15
+	jmp	.LOOP_GRANDE_SQR_1024
+
+.align	32
+.LOOP_GRANDE_SQR_1024:
+	leaq	576+128(%rsp),%r9
+	leaq	448(%rsp),%r12
+
+
+
+
+	vpaddq	%ymm1,%ymm1,%ymm1
+	vpbroadcastq	0-128(%rsi),%ymm10
+	vpaddq	%ymm2,%ymm2,%ymm2
+	vmovdqa	%ymm1,0-128(%r9)
+	vpaddq	%ymm3,%ymm3,%ymm3
+	vmovdqa	%ymm2,32-128(%r9)
+	vpaddq	%ymm4,%ymm4,%ymm4
+	vmovdqa	%ymm3,64-128(%r9)
+	vpaddq	%ymm5,%ymm5,%ymm5
+	vmovdqa	%ymm4,96-128(%r9)
+	vpaddq	%ymm6,%ymm6,%ymm6
+	vmovdqa	%ymm5,128-128(%r9)
+	vpaddq	%ymm7,%ymm7,%ymm7
+	vmovdqa	%ymm6,160-128(%r9)
+	vpaddq	%ymm8,%ymm8,%ymm8
+	vmovdqa	%ymm7,192-128(%r9)
+	vpxor	%ymm9,%ymm9,%ymm9
+	vmovdqa	%ymm8,224-128(%r9)
+
+	vpmuludq	0-128(%rsi),%ymm10,%ymm0
+	vpbroadcastq	32-128(%rsi),%ymm11
+	vmovdqu	%ymm9,288-192(%rbx)
+	vpmuludq	%ymm10,%ymm1,%ymm1
+	vmovdqu	%ymm9,320-448(%r12)
+	vpmuludq	%ymm10,%ymm2,%ymm2
+	vmovdqu	%ymm9,352-448(%r12)
+	vpmuludq	%ymm10,%ymm3,%ymm3
+	vmovdqu	%ymm9,384-448(%r12)
+	vpmuludq	%ymm10,%ymm4,%ymm4
+	vmovdqu	%ymm9,416-448(%r12)
+	vpmuludq	%ymm10,%ymm5,%ymm5
+	vmovdqu	%ymm9,448-448(%r12)
+	vpmuludq	%ymm10,%ymm6,%ymm6
+	vmovdqu	%ymm9,480-448(%r12)
+	vpmuludq	%ymm10,%ymm7,%ymm7
+	vmovdqu	%ymm9,512-448(%r12)
+	vpmuludq	%ymm10,%ymm8,%ymm8
+	vpbroadcastq	64-128(%rsi),%ymm10
+	vmovdqu	%ymm9,544-448(%r12)
+
+	movq	%rsi,%r15
+	movl	$4,%r14d
+	jmp	.Lsqr_entry_1024
+.align	32
+.LOOP_SQR_1024:
+	vpbroadcastq	32-128(%r15),%ymm11
+	vpmuludq	0-128(%rsi),%ymm10,%ymm0
+	vpaddq	0-192(%rbx),%ymm0,%ymm0
+	vpmuludq	0-128(%r9),%ymm10,%ymm1
+	vpaddq	32-192(%rbx),%ymm1,%ymm1
+	vpmuludq	32-128(%r9),%ymm10,%ymm2
+	vpaddq	64-192(%rbx),%ymm2,%ymm2
+	vpmuludq	64-128(%r9),%ymm10,%ymm3
+	vpaddq	96-192(%rbx),%ymm3,%ymm3
+	vpmuludq	96-128(%r9),%ymm10,%ymm4
+	vpaddq	128-192(%rbx),%ymm4,%ymm4
+	vpmuludq	128-128(%r9),%ymm10,%ymm5
+	vpaddq	160-192(%rbx),%ymm5,%ymm5
+	vpmuludq	160-128(%r9),%ymm10,%ymm6
+	vpaddq	192-192(%rbx),%ymm6,%ymm6
+	vpmuludq	192-128(%r9),%ymm10,%ymm7
+	vpaddq	224-192(%rbx),%ymm7,%ymm7
+	vpmuludq	224-128(%r9),%ymm10,%ymm8
+	vpbroadcastq	64-128(%r15),%ymm10
+	vpaddq	256-192(%rbx),%ymm8,%ymm8
+.Lsqr_entry_1024:
+	vmovdqu	%ymm0,0-192(%rbx)
+	vmovdqu	%ymm1,32-192(%rbx)
+
+	vpmuludq	32-128(%rsi),%ymm11,%ymm12
+	vpaddq	%ymm12,%ymm2,%ymm2
+	vpmuludq	32-128(%r9),%ymm11,%ymm14
+	vpaddq	%ymm14,%ymm3,%ymm3
+	vpmuludq	64-128(%r9),%ymm11,%ymm13
+	vpaddq	%ymm13,%ymm4,%ymm4
+	vpmuludq	96-128(%r9),%ymm11,%ymm12
+	vpaddq	%ymm12,%ymm5,%ymm5
+	vpmuludq	128-128(%r9),%ymm11,%ymm14
+	vpaddq	%ymm14,%ymm6,%ymm6
+	vpmuludq	160-128(%r9),%ymm11,%ymm13
+	vpaddq	%ymm13,%ymm7,%ymm7
+	vpmuludq	192-128(%r9),%ymm11,%ymm12
+	vpaddq	%ymm12,%ymm8,%ymm8
+	vpmuludq	224-128(%r9),%ymm11,%ymm0
+	vpbroadcastq	96-128(%r15),%ymm11
+	vpaddq	288-192(%rbx),%ymm0,%ymm0
+
+	vmovdqu	%ymm2,64-192(%rbx)
+	vmovdqu	%ymm3,96-192(%rbx)
+
+	vpmuludq	64-128(%rsi),%ymm10,%ymm13
+	vpaddq	%ymm13,%ymm4,%ymm4
+	vpmuludq	64-128(%r9),%ymm10,%ymm12
+	vpaddq	%ymm12,%ymm5,%ymm5
+	vpmuludq	96-128(%r9),%ymm10,%ymm14
+	vpaddq	%ymm14,%ymm6,%ymm6
+	vpmuludq	128-128(%r9),%ymm10,%ymm13
+	vpaddq	%ymm13,%ymm7,%ymm7
+	vpmuludq	160-128(%r9),%ymm10,%ymm12
+	vpaddq	%ymm12,%ymm8,%ymm8
+	vpmuludq	192-128(%r9),%ymm10,%ymm14
+	vpaddq	%ymm14,%ymm0,%ymm0
+	vpmuludq	224-128(%r9),%ymm10,%ymm1
+	vpbroadcastq	128-128(%r15),%ymm10
+	vpaddq	320-448(%r12),%ymm1,%ymm1
+
+	vmovdqu	%ymm4,128-192(%rbx)
+	vmovdqu	%ymm5,160-192(%rbx)
+
+	vpmuludq	96-128(%rsi),%ymm11,%ymm12
+	vpaddq	%ymm12,%ymm6,%ymm6
+	vpmuludq	96-128(%r9),%ymm11,%ymm14
+	vpaddq	%ymm14,%ymm7,%ymm7
+	vpmuludq	128-128(%r9),%ymm11,%ymm13
+	vpaddq	%ymm13,%ymm8,%ymm8
+	vpmuludq	160-128(%r9),%ymm11,%ymm12
+	vpaddq	%ymm12,%ymm0,%ymm0
+	vpmuludq	192-128(%r9),%ymm11,%ymm14
+	vpaddq	%ymm14,%ymm1,%ymm1
+	vpmuludq	224-128(%r9),%ymm11,%ymm2
+	vpbroadcastq	160-128(%r15),%ymm11
+	vpaddq	352-448(%r12),%ymm2,%ymm2
+
+	vmovdqu	%ymm6,192-192(%rbx)
+	vmovdqu	%ymm7,224-192(%rbx)
+
+	vpmuludq	128-128(%rsi),%ymm10,%ymm12
+	vpaddq	%ymm12,%ymm8,%ymm8
+	vpmuludq	128-128(%r9),%ymm10,%ymm14
+	vpaddq	%ymm14,%ymm0,%ymm0
+	vpmuludq	160-128(%r9),%ymm10,%ymm13
+	vpaddq	%ymm13,%ymm1,%ymm1
+	vpmuludq	192-128(%r9),%ymm10,%ymm12
+	vpaddq	%ymm12,%ymm2,%ymm2
+	vpmuludq	224-128(%r9),%ymm10,%ymm3
+	vpbroadcastq	192-128(%r15),%ymm10
+	vpaddq	384-448(%r12),%ymm3,%ymm3
+
+	vmovdqu	%ymm8,256-192(%rbx)
+	vmovdqu	%ymm0,288-192(%rbx)
+	leaq	8(%rbx),%rbx
+
+	vpmuludq	160-128(%rsi),%ymm11,%ymm13
+	vpaddq	%ymm13,%ymm1,%ymm1
+	vpmuludq	160-128(%r9),%ymm11,%ymm12
+	vpaddq	%ymm12,%ymm2,%ymm2
+	vpmuludq	192-128(%r9),%ymm11,%ymm14
+	vpaddq	%ymm14,%ymm3,%ymm3
+	vpmuludq	224-128(%r9),%ymm11,%ymm4
+	vpbroadcastq	224-128(%r15),%ymm11
+	vpaddq	416-448(%r12),%ymm4,%ymm4
+
+	vmovdqu	%ymm1,320-448(%r12)
+	vmovdqu	%ymm2,352-448(%r12)
+
+	vpmuludq	192-128(%rsi),%ymm10,%ymm12
+	vpaddq	%ymm12,%ymm3,%ymm3
+	vpmuludq	192-128(%r9),%ymm10,%ymm14
+	vpbroadcastq	256-128(%r15),%ymm0
+	vpaddq	%ymm14,%ymm4,%ymm4
+	vpmuludq	224-128(%r9),%ymm10,%ymm5
+	vpbroadcastq	0+8-128(%r15),%ymm10
+	vpaddq	448-448(%r12),%ymm5,%ymm5
+
+	vmovdqu	%ymm3,384-448(%r12)
+	vmovdqu	%ymm4,416-448(%r12)
+	leaq	8(%r15),%r15
+
+	vpmuludq	224-128(%rsi),%ymm11,%ymm12
+	vpaddq	%ymm12,%ymm5,%ymm5
+	vpmuludq	224-128(%r9),%ymm11,%ymm6
+	vpaddq	480-448(%r12),%ymm6,%ymm6
+
+	vpmuludq	256-128(%rsi),%ymm0,%ymm7
+	vmovdqu	%ymm5,448-448(%r12)
+	vpaddq	512-448(%r12),%ymm7,%ymm7
+	vmovdqu	%ymm6,480-448(%r12)
+	vmovdqu	%ymm7,512-448(%r12)
+	leaq	8(%r12),%r12
+
+	decl	%r14d
+	jnz	.LOOP_SQR_1024
+
+	vmovdqu	256(%rsp),%ymm8
+	vmovdqu	288(%rsp),%ymm1
+	vmovdqu	320(%rsp),%ymm2
+	leaq	192(%rsp),%rbx
+
+	vpsrlq	$29,%ymm8,%ymm14
+	vpand	%ymm15,%ymm8,%ymm8
+	vpsrlq	$29,%ymm1,%ymm11
+	vpand	%ymm15,%ymm1,%ymm1
+
+	vpermq	$0x93,%ymm14,%ymm14
+	vpxor	%ymm9,%ymm9,%ymm9
+	vpermq	$0x93,%ymm11,%ymm11
+
+	vpblendd	$3,%ymm9,%ymm14,%ymm10
+	vpblendd	$3,%ymm14,%ymm11,%ymm14
+	vpaddq	%ymm10,%ymm8,%ymm8
+	vpblendd	$3,%ymm11,%ymm9,%ymm11
+	vpaddq	%ymm14,%ymm1,%ymm1
+	vpaddq	%ymm11,%ymm2,%ymm2
+	vmovdqu	%ymm1,288-192(%rbx)
+	vmovdqu	%ymm2,320-192(%rbx)
+
+	movq	(%rsp),%rax
+	movq	8(%rsp),%r10
+	movq	16(%rsp),%r11
+	movq	24(%rsp),%r12
+	vmovdqu	32(%rsp),%ymm1
+	vmovdqu	64-192(%rbx),%ymm2
+	vmovdqu	96-192(%rbx),%ymm3
+	vmovdqu	128-192(%rbx),%ymm4
+	vmovdqu	160-192(%rbx),%ymm5
+	vmovdqu	192-192(%rbx),%ymm6
+	vmovdqu	224-192(%rbx),%ymm7
+
+	movq	%rax,%r9
+	imull	%ecx,%eax
+	andl	$0x1fffffff,%eax
+	vmovd	%eax,%xmm12
+
+	movq	%rax,%rdx
+	imulq	-128(%r13),%rax
+	vpbroadcastq	%xmm12,%ymm12
+	addq	%rax,%r9
+	movq	%rdx,%rax
+	imulq	8-128(%r13),%rax
+	shrq	$29,%r9
+	addq	%rax,%r10
+	movq	%rdx,%rax
+	imulq	16-128(%r13),%rax
+	addq	%r9,%r10
+	addq	%rax,%r11
+	imulq	24-128(%r13),%rdx
+	addq	%rdx,%r12
+
+	movq	%r10,%rax
+	imull	%ecx,%eax
+	andl	$0x1fffffff,%eax
+
+	movl	$9,%r14d
+	jmp	.LOOP_REDUCE_1024
+
+.align	32
+.LOOP_REDUCE_1024:
+	vmovd	%eax,%xmm13
+	vpbroadcastq	%xmm13,%ymm13
+
+	vpmuludq	32-128(%r13),%ymm12,%ymm10
+	movq	%rax,%rdx
+	imulq	-128(%r13),%rax
+	vpaddq	%ymm10,%ymm1,%ymm1
+	addq	%rax,%r10
+	vpmuludq	64-128(%r13),%ymm12,%ymm14
+	movq	%rdx,%rax
+	imulq	8-128(%r13),%rax
+	vpaddq	%ymm14,%ymm2,%ymm2
+	vpmuludq	96-128(%r13),%ymm12,%ymm11
+.byte	0x67
+	addq	%rax,%r11
+.byte	0x67
+	movq	%rdx,%rax
+	imulq	16-128(%r13),%rax
+	shrq	$29,%r10
+	vpaddq	%ymm11,%ymm3,%ymm3
+	vpmuludq	128-128(%r13),%ymm12,%ymm10
+	addq	%rax,%r12
+	addq	%r10,%r11
+	vpaddq	%ymm10,%ymm4,%ymm4
+	vpmuludq	160-128(%r13),%ymm12,%ymm14
+	movq	%r11,%rax
+	imull	%ecx,%eax
+	vpaddq	%ymm14,%ymm5,%ymm5
+	vpmuludq	192-128(%r13),%ymm12,%ymm11
+	andl	$0x1fffffff,%eax
+	vpaddq	%ymm11,%ymm6,%ymm6
+	vpmuludq	224-128(%r13),%ymm12,%ymm10
+	vpaddq	%ymm10,%ymm7,%ymm7
+	vpmuludq	256-128(%r13),%ymm12,%ymm14
+	vmovd	%eax,%xmm12
+
+	vpaddq	%ymm14,%ymm8,%ymm8
+
+	vpbroadcastq	%xmm12,%ymm12
+
+	vpmuludq	32-8-128(%r13),%ymm13,%ymm11
+	vmovdqu	96-8-128(%r13),%ymm14
+	movq	%rax,%rdx
+	imulq	-128(%r13),%rax
+	vpaddq	%ymm11,%ymm1,%ymm1
+	vpmuludq	64-8-128(%r13),%ymm13,%ymm10
+	vmovdqu	128-8-128(%r13),%ymm11
+	addq	%rax,%r11
+	movq	%rdx,%rax
+	imulq	8-128(%r13),%rax
+	vpaddq	%ymm10,%ymm2,%ymm2
+	addq	%r12,%rax
+	shrq	$29,%r11
+	vpmuludq	%ymm13,%ymm14,%ymm14
+	vmovdqu	160-8-128(%r13),%ymm10
+	addq	%r11,%rax
+	vpaddq	%ymm14,%ymm3,%ymm3
+	vpmuludq	%ymm13,%ymm11,%ymm11
+	vmovdqu	192-8-128(%r13),%ymm14
+.byte	0x67
+	movq	%rax,%r12
+	imull	%ecx,%eax
+	vpaddq	%ymm11,%ymm4,%ymm4
+	vpmuludq	%ymm13,%ymm10,%ymm10
+.byte	0xc4,0x41,0x7e,0x6f,0x9d,0x58,0x00,0x00,0x00
+	andl	$0x1fffffff,%eax
+	vpaddq	%ymm10,%ymm5,%ymm5
+	vpmuludq	%ymm13,%ymm14,%ymm14
+	vmovdqu	256-8-128(%r13),%ymm10
+	vpaddq	%ymm14,%ymm6,%ymm6
+	vpmuludq	%ymm13,%ymm11,%ymm11
+	vmovdqu	288-8-128(%r13),%ymm9
+	vmovd	%eax,%xmm0
+	imulq	-128(%r13),%rax
+	vpaddq	%ymm11,%ymm7,%ymm7
+	vpmuludq	%ymm13,%ymm10,%ymm10
+	vmovdqu	32-16-128(%r13),%ymm14
+	vpbroadcastq	%xmm0,%ymm0
+	vpaddq	%ymm10,%ymm8,%ymm8
+	vpmuludq	%ymm13,%ymm9,%ymm9
+	vmovdqu	64-16-128(%r13),%ymm11
+	addq	%rax,%r12
+
+	vmovdqu	32-24-128(%r13),%ymm13
+	vpmuludq	%ymm12,%ymm14,%ymm14
+	vmovdqu	96-16-128(%r13),%ymm10
+	vpaddq	%ymm14,%ymm1,%ymm1
+	vpmuludq	%ymm0,%ymm13,%ymm13
+	vpmuludq	%ymm12,%ymm11,%ymm11
+.byte	0xc4,0x41,0x7e,0x6f,0xb5,0xf0,0xff,0xff,0xff
+	vpaddq	%ymm1,%ymm13,%ymm13
+	vpaddq	%ymm11,%ymm2,%ymm2
+	vpmuludq	%ymm12,%ymm10,%ymm10
+	vmovdqu	160-16-128(%r13),%ymm11
+.byte	0x67
+	vmovq	%xmm13,%rax
+	vmovdqu	%ymm13,(%rsp)
+	vpaddq	%ymm10,%ymm3,%ymm3
+	vpmuludq	%ymm12,%ymm14,%ymm14
+	vmovdqu	192-16-128(%r13),%ymm10
+	vpaddq	%ymm14,%ymm4,%ymm4
+	vpmuludq	%ymm12,%ymm11,%ymm11
+	vmovdqu	224-16-128(%r13),%ymm14
+	vpaddq	%ymm11,%ymm5,%ymm5
+	vpmuludq	%ymm12,%ymm10,%ymm10
+	vmovdqu	256-16-128(%r13),%ymm11
+	vpaddq	%ymm10,%ymm6,%ymm6
+	vpmuludq	%ymm12,%ymm14,%ymm14
+	shrq	$29,%r12
+	vmovdqu	288-16-128(%r13),%ymm10
+	addq	%r12,%rax
+	vpaddq	%ymm14,%ymm7,%ymm7
+	vpmuludq	%ymm12,%ymm11,%ymm11
+
+	movq	%rax,%r9
+	imull	%ecx,%eax
+	vpaddq	%ymm11,%ymm8,%ymm8
+	vpmuludq	%ymm12,%ymm10,%ymm10
+	andl	$0x1fffffff,%eax
+	vmovd	%eax,%xmm12
+	vmovdqu	96-24-128(%r13),%ymm11
+.byte	0x67
+	vpaddq	%ymm10,%ymm9,%ymm9
+	vpbroadcastq	%xmm12,%ymm12
+
+	vpmuludq	64-24-128(%r13),%ymm0,%ymm14
+	vmovdqu	128-24-128(%r13),%ymm10
+	movq	%rax,%rdx
+	imulq	-128(%r13),%rax
+	movq	8(%rsp),%r10
+	vpaddq	%ymm14,%ymm2,%ymm1
+	vpmuludq	%ymm0,%ymm11,%ymm11
+	vmovdqu	160-24-128(%r13),%ymm14
+	addq	%rax,%r9
+	movq	%rdx,%rax
+	imulq	8-128(%r13),%rax
+.byte	0x67
+	shrq	$29,%r9
+	movq	16(%rsp),%r11
+	vpaddq	%ymm11,%ymm3,%ymm2
+	vpmuludq	%ymm0,%ymm10,%ymm10
+	vmovdqu	192-24-128(%r13),%ymm11
+	addq	%rax,%r10
+	movq	%rdx,%rax
+	imulq	16-128(%r13),%rax
+	vpaddq	%ymm10,%ymm4,%ymm3
+	vpmuludq	%ymm0,%ymm14,%ymm14
+	vmovdqu	224-24-128(%r13),%ymm10
+	imulq	24-128(%r13),%rdx
+	addq	%rax,%r11
+	leaq	(%r9,%r10,1),%rax
+	vpaddq	%ymm14,%ymm5,%ymm4
+	vpmuludq	%ymm0,%ymm11,%ymm11
+	vmovdqu	256-24-128(%r13),%ymm14
+	movq	%rax,%r10
+	imull	%ecx,%eax
+	vpmuludq	%ymm0,%ymm10,%ymm10
+	vpaddq	%ymm11,%ymm6,%ymm5
+	vmovdqu	288-24-128(%r13),%ymm11
+	andl	$0x1fffffff,%eax
+	vpaddq	%ymm10,%ymm7,%ymm6
+	vpmuludq	%ymm0,%ymm14,%ymm14
+	addq	24(%rsp),%rdx
+	vpaddq	%ymm14,%ymm8,%ymm7
+	vpmuludq	%ymm0,%ymm11,%ymm11
+	vpaddq	%ymm11,%ymm9,%ymm8
+	vmovq	%r12,%xmm9
+	movq	%rdx,%r12
+
+	decl	%r14d
+	jnz	.LOOP_REDUCE_1024
+	leaq	448(%rsp),%r12
+	vpaddq	%ymm9,%ymm13,%ymm0
+	vpxor	%ymm9,%ymm9,%ymm9
+
+	vpaddq	288-192(%rbx),%ymm0,%ymm0
+	vpaddq	320-448(%r12),%ymm1,%ymm1
+	vpaddq	352-448(%r12),%ymm2,%ymm2
+	vpaddq	384-448(%r12),%ymm3,%ymm3
+	vpaddq	416-448(%r12),%ymm4,%ymm4
+	vpaddq	448-448(%r12),%ymm5,%ymm5
+	vpaddq	480-448(%r12),%ymm6,%ymm6
+	vpaddq	512-448(%r12),%ymm7,%ymm7
+	vpaddq	544-448(%r12),%ymm8,%ymm8
+
+	vpsrlq	$29,%ymm0,%ymm14
+	vpand	%ymm15,%ymm0,%ymm0
+	vpsrlq	$29,%ymm1,%ymm11
+	vpand	%ymm15,%ymm1,%ymm1
+	vpsrlq	$29,%ymm2,%ymm12
+	vpermq	$0x93,%ymm14,%ymm14
+	vpand	%ymm15,%ymm2,%ymm2
+	vpsrlq	$29,%ymm3,%ymm13
+	vpermq	$0x93,%ymm11,%ymm11
+	vpand	%ymm15,%ymm3,%ymm3
+	vpermq	$0x93,%ymm12,%ymm12
+
+	vpblendd	$3,%ymm9,%ymm14,%ymm10
+	vpermq	$0x93,%ymm13,%ymm13
+	vpblendd	$3,%ymm14,%ymm11,%ymm14
+	vpaddq	%ymm10,%ymm0,%ymm0
+	vpblendd	$3,%ymm11,%ymm12,%ymm11
+	vpaddq	%ymm14,%ymm1,%ymm1
+	vpblendd	$3,%ymm12,%ymm13,%ymm12
+	vpaddq	%ymm11,%ymm2,%ymm2
+	vpblendd	$3,%ymm13,%ymm9,%ymm13
+	vpaddq	%ymm12,%ymm3,%ymm3
+	vpaddq	%ymm13,%ymm4,%ymm4
+
+	vpsrlq	$29,%ymm0,%ymm14
+	vpand	%ymm15,%ymm0,%ymm0
+	vpsrlq	$29,%ymm1,%ymm11
+	vpand	%ymm15,%ymm1,%ymm1
+	vpsrlq	$29,%ymm2,%ymm12
+	vpermq	$0x93,%ymm14,%ymm14
+	vpand	%ymm15,%ymm2,%ymm2
+	vpsrlq	$29,%ymm3,%ymm13
+	vpermq	$0x93,%ymm11,%ymm11
+	vpand	%ymm15,%ymm3,%ymm3
+	vpermq	$0x93,%ymm12,%ymm12
+
+	vpblendd	$3,%ymm9,%ymm14,%ymm10
+	vpermq	$0x93,%ymm13,%ymm13
+	vpblendd	$3,%ymm14,%ymm11,%ymm14
+	vpaddq	%ymm10,%ymm0,%ymm0
+	vpblendd	$3,%ymm11,%ymm12,%ymm11
+	vpaddq	%ymm14,%ymm1,%ymm1
+	vmovdqu	%ymm0,0-128(%rdi)
+	vpblendd	$3,%ymm12,%ymm13,%ymm12
+	vpaddq	%ymm11,%ymm2,%ymm2
+	vmovdqu	%ymm1,32-128(%rdi)
+	vpblendd	$3,%ymm13,%ymm9,%ymm13
+	vpaddq	%ymm12,%ymm3,%ymm3
+	vmovdqu	%ymm2,64-128(%rdi)
+	vpaddq	%ymm13,%ymm4,%ymm4
+	vmovdqu	%ymm3,96-128(%rdi)
+	vpsrlq	$29,%ymm4,%ymm14
+	vpand	%ymm15,%ymm4,%ymm4
+	vpsrlq	$29,%ymm5,%ymm11
+	vpand	%ymm15,%ymm5,%ymm5
+	vpsrlq	$29,%ymm6,%ymm12
+	vpermq	$0x93,%ymm14,%ymm14
+	vpand	%ymm15,%ymm6,%ymm6
+	vpsrlq	$29,%ymm7,%ymm13
+	vpermq	$0x93,%ymm11,%ymm11
+	vpand	%ymm15,%ymm7,%ymm7
+	vpsrlq	$29,%ymm8,%ymm0
+	vpermq	$0x93,%ymm12,%ymm12
+	vpand	%ymm15,%ymm8,%ymm8
+	vpermq	$0x93,%ymm13,%ymm13
+
+	vpblendd	$3,%ymm9,%ymm14,%ymm10
+	vpermq	$0x93,%ymm0,%ymm0
+	vpblendd	$3,%ymm14,%ymm11,%ymm14
+	vpaddq	%ymm10,%ymm4,%ymm4
+	vpblendd	$3,%ymm11,%ymm12,%ymm11
+	vpaddq	%ymm14,%ymm5,%ymm5
+	vpblendd	$3,%ymm12,%ymm13,%ymm12
+	vpaddq	%ymm11,%ymm6,%ymm6
+	vpblendd	$3,%ymm13,%ymm0,%ymm13
+	vpaddq	%ymm12,%ymm7,%ymm7
+	vpaddq	%ymm13,%ymm8,%ymm8
+
+	vpsrlq	$29,%ymm4,%ymm14
+	vpand	%ymm15,%ymm4,%ymm4
+	vpsrlq	$29,%ymm5,%ymm11
+	vpand	%ymm15,%ymm5,%ymm5
+	vpsrlq	$29,%ymm6,%ymm12
+	vpermq	$0x93,%ymm14,%ymm14
+	vpand	%ymm15,%ymm6,%ymm6
+	vpsrlq	$29,%ymm7,%ymm13
+	vpermq	$0x93,%ymm11,%ymm11
+	vpand	%ymm15,%ymm7,%ymm7
+	vpsrlq	$29,%ymm8,%ymm0
+	vpermq	$0x93,%ymm12,%ymm12
+	vpand	%ymm15,%ymm8,%ymm8
+	vpermq	$0x93,%ymm13,%ymm13
+
+	vpblendd	$3,%ymm9,%ymm14,%ymm10
+	vpermq	$0x93,%ymm0,%ymm0
+	vpblendd	$3,%ymm14,%ymm11,%ymm14
+	vpaddq	%ymm10,%ymm4,%ymm4
+	vpblendd	$3,%ymm11,%ymm12,%ymm11
+	vpaddq	%ymm14,%ymm5,%ymm5
+	vmovdqu	%ymm4,128-128(%rdi)
+	vpblendd	$3,%ymm12,%ymm13,%ymm12
+	vpaddq	%ymm11,%ymm6,%ymm6
+	vmovdqu	%ymm5,160-128(%rdi)
+	vpblendd	$3,%ymm13,%ymm0,%ymm13
+	vpaddq	%ymm12,%ymm7,%ymm7
+	vmovdqu	%ymm6,192-128(%rdi)
+	vpaddq	%ymm13,%ymm8,%ymm8
+	vmovdqu	%ymm7,224-128(%rdi)
+	vmovdqu	%ymm8,256-128(%rdi)
+
+	movq	%rdi,%rsi
+	decl	%r8d
+	jne	.LOOP_GRANDE_SQR_1024
+
+	vzeroall
+	movq	%rbp,%rax
+.cfi_def_cfa_register	%rax
+	movq	-48(%rax),%r15
+.cfi_restore	%r15
+	movq	-40(%rax),%r14
+.cfi_restore	%r14
+	movq	-32(%rax),%r13
+.cfi_restore	%r13
+	movq	-24(%rax),%r12
+.cfi_restore	%r12
+	movq	-16(%rax),%rbp
+.cfi_restore	%rbp
+	movq	-8(%rax),%rbx
+.cfi_restore	%rbx
+	leaq	(%rax),%rsp
+.cfi_def_cfa_register	%rsp
+.Lsqr_1024_epilogue:
+	.byte	0xf3,0xc3
+.cfi_endproc	
+.size	rsaz_1024_sqr_avx2,.-rsaz_1024_sqr_avx2
+.globl	rsaz_1024_mul_avx2
+.hidden rsaz_1024_mul_avx2
+.type	rsaz_1024_mul_avx2,@function
+.align	64
+rsaz_1024_mul_avx2:
+.cfi_startproc	
+	leaq	(%rsp),%rax
+.cfi_def_cfa_register	%rax
+	pushq	%rbx
+.cfi_offset	%rbx,-16
+	pushq	%rbp
+.cfi_offset	%rbp,-24
+	pushq	%r12
+.cfi_offset	%r12,-32
+	pushq	%r13
+.cfi_offset	%r13,-40
+	pushq	%r14
+.cfi_offset	%r14,-48
+	pushq	%r15
+.cfi_offset	%r15,-56
+	movq	%rax,%rbp
+.cfi_def_cfa_register	%rbp
+	vzeroall
+	movq	%rdx,%r13
+	subq	$64,%rsp
+
+
+
+
+
+
+.byte	0x67,0x67
+	movq	%rsi,%r15
+	andq	$4095,%r15
+	addq	$320,%r15
+	shrq	$12,%r15
+	movq	%rsi,%r15
+	cmovnzq	%r13,%rsi
+	cmovnzq	%r15,%r13
+
+	movq	%rcx,%r15
+	subq	$-128,%rsi
+	subq	$-128,%rcx
+	subq	$-128,%rdi
+
+	andq	$4095,%r15
+	addq	$320,%r15
+.byte	0x67,0x67
+	shrq	$12,%r15
+	jz	.Lmul_1024_no_n_copy
+
+
+
+
+
+	subq	$320,%rsp
+	vmovdqu	0-128(%rcx),%ymm0
+	andq	$-512,%rsp
+	vmovdqu	32-128(%rcx),%ymm1
+	vmovdqu	64-128(%rcx),%ymm2
+	vmovdqu	96-128(%rcx),%ymm3
+	vmovdqu	128-128(%rcx),%ymm4
+	vmovdqu	160-128(%rcx),%ymm5
+	vmovdqu	192-128(%rcx),%ymm6
+	vmovdqu	224-128(%rcx),%ymm7
+	vmovdqu	256-128(%rcx),%ymm8
+	leaq	64+128(%rsp),%rcx
+	vmovdqu	%ymm0,0-128(%rcx)
+	vpxor	%ymm0,%ymm0,%ymm0
+	vmovdqu	%ymm1,32-128(%rcx)
+	vpxor	%ymm1,%ymm1,%ymm1
+	vmovdqu	%ymm2,64-128(%rcx)
+	vpxor	%ymm2,%ymm2,%ymm2
+	vmovdqu	%ymm3,96-128(%rcx)
+	vpxor	%ymm3,%ymm3,%ymm3
+	vmovdqu	%ymm4,128-128(%rcx)
+	vpxor	%ymm4,%ymm4,%ymm4
+	vmovdqu	%ymm5,160-128(%rcx)
+	vpxor	%ymm5,%ymm5,%ymm5
+	vmovdqu	%ymm6,192-128(%rcx)
+	vpxor	%ymm6,%ymm6,%ymm6
+	vmovdqu	%ymm7,224-128(%rcx)
+	vpxor	%ymm7,%ymm7,%ymm7
+	vmovdqu	%ymm8,256-128(%rcx)
+	vmovdqa	%ymm0,%ymm8
+	vmovdqu	%ymm9,288-128(%rcx)
+.Lmul_1024_no_n_copy:
+	andq	$-64,%rsp
+
+	movq	(%r13),%rbx
+	vpbroadcastq	(%r13),%ymm10
+	vmovdqu	%ymm0,(%rsp)
+	xorq	%r9,%r9
+.byte	0x67
+	xorq	%r10,%r10
+	xorq	%r11,%r11
+	xorq	%r12,%r12
+
+	vmovdqu	.Land_mask(%rip),%ymm15
+	movl	$9,%r14d
+	vmovdqu	%ymm9,288-128(%rdi)
+	jmp	.Loop_mul_1024
+
+.align	32
+.Loop_mul_1024:
+	vpsrlq	$29,%ymm3,%ymm9
+	movq	%rbx,%rax
+	imulq	-128(%rsi),%rax
+	addq	%r9,%rax
+	movq	%rbx,%r10
+	imulq	8-128(%rsi),%r10
+	addq	8(%rsp),%r10
+
+	movq	%rax,%r9
+	imull	%r8d,%eax
+	andl	$0x1fffffff,%eax
+
+	movq	%rbx,%r11
+	imulq	16-128(%rsi),%r11
+	addq	16(%rsp),%r11
+
+	movq	%rbx,%r12
+	imulq	24-128(%rsi),%r12
+	addq	24(%rsp),%r12
+	vpmuludq	32-128(%rsi),%ymm10,%ymm0
+	vmovd	%eax,%xmm11
+	vpaddq	%ymm0,%ymm1,%ymm1
+	vpmuludq	64-128(%rsi),%ymm10,%ymm12
+	vpbroadcastq	%xmm11,%ymm11
+	vpaddq	%ymm12,%ymm2,%ymm2
+	vpmuludq	96-128(%rsi),%ymm10,%ymm13
+	vpand	%ymm15,%ymm3,%ymm3
+	vpaddq	%ymm13,%ymm3,%ymm3
+	vpmuludq	128-128(%rsi),%ymm10,%ymm0
+	vpaddq	%ymm0,%ymm4,%ymm4
+	vpmuludq	160-128(%rsi),%ymm10,%ymm12
+	vpaddq	%ymm12,%ymm5,%ymm5
+	vpmuludq	192-128(%rsi),%ymm10,%ymm13
+	vpaddq	%ymm13,%ymm6,%ymm6
+	vpmuludq	224-128(%rsi),%ymm10,%ymm0
+	vpermq	$0x93,%ymm9,%ymm9
+	vpaddq	%ymm0,%ymm7,%ymm7
+	vpmuludq	256-128(%rsi),%ymm10,%ymm12
+	vpbroadcastq	8(%r13),%ymm10
+	vpaddq	%ymm12,%ymm8,%ymm8
+
+	movq	%rax,%rdx
+	imulq	-128(%rcx),%rax
+	addq	%rax,%r9
+	movq	%rdx,%rax
+	imulq	8-128(%rcx),%rax
+	addq	%rax,%r10
+	movq	%rdx,%rax
+	imulq	16-128(%rcx),%rax
+	addq	%rax,%r11
+	shrq	$29,%r9
+	imulq	24-128(%rcx),%rdx
+	addq	%rdx,%r12
+	addq	%r9,%r10
+
+	vpmuludq	32-128(%rcx),%ymm11,%ymm13
+	vmovq	%xmm10,%rbx
+	vpaddq	%ymm13,%ymm1,%ymm1
+	vpmuludq	64-128(%rcx),%ymm11,%ymm0
+	vpaddq	%ymm0,%ymm2,%ymm2
+	vpmuludq	96-128(%rcx),%ymm11,%ymm12
+	vpaddq	%ymm12,%ymm3,%ymm3
+	vpmuludq	128-128(%rcx),%ymm11,%ymm13
+	vpaddq	%ymm13,%ymm4,%ymm4
+	vpmuludq	160-128(%rcx),%ymm11,%ymm0
+	vpaddq	%ymm0,%ymm5,%ymm5
+	vpmuludq	192-128(%rcx),%ymm11,%ymm12
+	vpaddq	%ymm12,%ymm6,%ymm6
+	vpmuludq	224-128(%rcx),%ymm11,%ymm13
+	vpblendd	$3,%ymm14,%ymm9,%ymm12
+	vpaddq	%ymm13,%ymm7,%ymm7
+	vpmuludq	256-128(%rcx),%ymm11,%ymm0
+	vpaddq	%ymm12,%ymm3,%ymm3
+	vpaddq	%ymm0,%ymm8,%ymm8
+
+	movq	%rbx,%rax
+	imulq	-128(%rsi),%rax
+	addq	%rax,%r10
+	vmovdqu	-8+32-128(%rsi),%ymm12
+	movq	%rbx,%rax
+	imulq	8-128(%rsi),%rax
+	addq	%rax,%r11
+	vmovdqu	-8+64-128(%rsi),%ymm13
+
+	movq	%r10,%rax
+	vpblendd	$0xfc,%ymm14,%ymm9,%ymm9
+	imull	%r8d,%eax
+	vpaddq	%ymm9,%ymm4,%ymm4
+	andl	$0x1fffffff,%eax
+
+	imulq	16-128(%rsi),%rbx
+	addq	%rbx,%r12
+	vpmuludq	%ymm10,%ymm12,%ymm12
+	vmovd	%eax,%xmm11
+	vmovdqu	-8+96-128(%rsi),%ymm0
+	vpaddq	%ymm12,%ymm1,%ymm1
+	vpmuludq	%ymm10,%ymm13,%ymm13
+	vpbroadcastq	%xmm11,%ymm11
+	vmovdqu	-8+128-128(%rsi),%ymm12
+	vpaddq	%ymm13,%ymm2,%ymm2
+	vpmuludq	%ymm10,%ymm0,%ymm0
+	vmovdqu	-8+160-128(%rsi),%ymm13
+	vpaddq	%ymm0,%ymm3,%ymm3
+	vpmuludq	%ymm10,%ymm12,%ymm12
+	vmovdqu	-8+192-128(%rsi),%ymm0
+	vpaddq	%ymm12,%ymm4,%ymm4
+	vpmuludq	%ymm10,%ymm13,%ymm13
+	vmovdqu	-8+224-128(%rsi),%ymm12
+	vpaddq	%ymm13,%ymm5,%ymm5
+	vpmuludq	%ymm10,%ymm0,%ymm0
+	vmovdqu	-8+256-128(%rsi),%ymm13
+	vpaddq	%ymm0,%ymm6,%ymm6
+	vpmuludq	%ymm10,%ymm12,%ymm12
+	vmovdqu	-8+288-128(%rsi),%ymm9
+	vpaddq	%ymm12,%ymm7,%ymm7
+	vpmuludq	%ymm10,%ymm13,%ymm13
+	vpaddq	%ymm13,%ymm8,%ymm8
+	vpmuludq	%ymm10,%ymm9,%ymm9
+	vpbroadcastq	16(%r13),%ymm10
+
+	movq	%rax,%rdx
+	imulq	-128(%rcx),%rax
+	addq	%rax,%r10
+	vmovdqu	-8+32-128(%rcx),%ymm0
+	movq	%rdx,%rax
+	imulq	8-128(%rcx),%rax
+	addq	%rax,%r11
+	vmovdqu	-8+64-128(%rcx),%ymm12
+	shrq	$29,%r10
+	imulq	16-128(%rcx),%rdx
+	addq	%rdx,%r12
+	addq	%r10,%r11
+
+	vpmuludq	%ymm11,%ymm0,%ymm0
+	vmovq	%xmm10,%rbx
+	vmovdqu	-8+96-128(%rcx),%ymm13
+	vpaddq	%ymm0,%ymm1,%ymm1
+	vpmuludq	%ymm11,%ymm12,%ymm12
+	vmovdqu	-8+128-128(%rcx),%ymm0
+	vpaddq	%ymm12,%ymm2,%ymm2
+	vpmuludq	%ymm11,%ymm13,%ymm13
+	vmovdqu	-8+160-128(%rcx),%ymm12
+	vpaddq	%ymm13,%ymm3,%ymm3
+	vpmuludq	%ymm11,%ymm0,%ymm0
+	vmovdqu	-8+192-128(%rcx),%ymm13
+	vpaddq	%ymm0,%ymm4,%ymm4
+	vpmuludq	%ymm11,%ymm12,%ymm12
+	vmovdqu	-8+224-128(%rcx),%ymm0
+	vpaddq	%ymm12,%ymm5,%ymm5
+	vpmuludq	%ymm11,%ymm13,%ymm13
+	vmovdqu	-8+256-128(%rcx),%ymm12
+	vpaddq	%ymm13,%ymm6,%ymm6
+	vpmuludq	%ymm11,%ymm0,%ymm0
+	vmovdqu	-8+288-128(%rcx),%ymm13
+	vpaddq	%ymm0,%ymm7,%ymm7
+	vpmuludq	%ymm11,%ymm12,%ymm12
+	vpaddq	%ymm12,%ymm8,%ymm8
+	vpmuludq	%ymm11,%ymm13,%ymm13
+	vpaddq	%ymm13,%ymm9,%ymm9
+
+	vmovdqu	-16+32-128(%rsi),%ymm0
+	movq	%rbx,%rax
+	imulq	-128(%rsi),%rax
+	addq	%r11,%rax
+
+	vmovdqu	-16+64-128(%rsi),%ymm12
+	movq	%rax,%r11
+	imull	%r8d,%eax
+	andl	$0x1fffffff,%eax
+
+	imulq	8-128(%rsi),%rbx
+	addq	%rbx,%r12
+	vpmuludq	%ymm10,%ymm0,%ymm0
+	vmovd	%eax,%xmm11
+	vmovdqu	-16+96-128(%rsi),%ymm13
+	vpaddq	%ymm0,%ymm1,%ymm1
+	vpmuludq	%ymm10,%ymm12,%ymm12
+	vpbroadcastq	%xmm11,%ymm11
+	vmovdqu	-16+128-128(%rsi),%ymm0
+	vpaddq	%ymm12,%ymm2,%ymm2
+	vpmuludq	%ymm10,%ymm13,%ymm13
+	vmovdqu	-16+160-128(%rsi),%ymm12
+	vpaddq	%ymm13,%ymm3,%ymm3
+	vpmuludq	%ymm10,%ymm0,%ymm0
+	vmovdqu	-16+192-128(%rsi),%ymm13
+	vpaddq	%ymm0,%ymm4,%ymm4
+	vpmuludq	%ymm10,%ymm12,%ymm12
+	vmovdqu	-16+224-128(%rsi),%ymm0
+	vpaddq	%ymm12,%ymm5,%ymm5
+	vpmuludq	%ymm10,%ymm13,%ymm13
+	vmovdqu	-16+256-128(%rsi),%ymm12
+	vpaddq	%ymm13,%ymm6,%ymm6
+	vpmuludq	%ymm10,%ymm0,%ymm0
+	vmovdqu	-16+288-128(%rsi),%ymm13
+	vpaddq	%ymm0,%ymm7,%ymm7
+	vpmuludq	%ymm10,%ymm12,%ymm12
+	vpaddq	%ymm12,%ymm8,%ymm8
+	vpmuludq	%ymm10,%ymm13,%ymm13
+	vpbroadcastq	24(%r13),%ymm10
+	vpaddq	%ymm13,%ymm9,%ymm9
+
+	vmovdqu	-16+32-128(%rcx),%ymm0
+	movq	%rax,%rdx
+	imulq	-128(%rcx),%rax
+	addq	%rax,%r11
+	vmovdqu	-16+64-128(%rcx),%ymm12
+	imulq	8-128(%rcx),%rdx
+	addq	%rdx,%r12
+	shrq	$29,%r11
+
+	vpmuludq	%ymm11,%ymm0,%ymm0
+	vmovq	%xmm10,%rbx
+	vmovdqu	-16+96-128(%rcx),%ymm13
+	vpaddq	%ymm0,%ymm1,%ymm1
+	vpmuludq	%ymm11,%ymm12,%ymm12
+	vmovdqu	-16+128-128(%rcx),%ymm0
+	vpaddq	%ymm12,%ymm2,%ymm2
+	vpmuludq	%ymm11,%ymm13,%ymm13
+	vmovdqu	-16+160-128(%rcx),%ymm12
+	vpaddq	%ymm13,%ymm3,%ymm3
+	vpmuludq	%ymm11,%ymm0,%ymm0
+	vmovdqu	-16+192-128(%rcx),%ymm13
+	vpaddq	%ymm0,%ymm4,%ymm4
+	vpmuludq	%ymm11,%ymm12,%ymm12
+	vmovdqu	-16+224-128(%rcx),%ymm0
+	vpaddq	%ymm12,%ymm5,%ymm5
+	vpmuludq	%ymm11,%ymm13,%ymm13
+	vmovdqu	-16+256-128(%rcx),%ymm12
+	vpaddq	%ymm13,%ymm6,%ymm6
+	vpmuludq	%ymm11,%ymm0,%ymm0
+	vmovdqu	-16+288-128(%rcx),%ymm13
+	vpaddq	%ymm0,%ymm7,%ymm7
+	vpmuludq	%ymm11,%ymm12,%ymm12
+	vmovdqu	-24+32-128(%rsi),%ymm0
+	vpaddq	%ymm12,%ymm8,%ymm8
+	vpmuludq	%ymm11,%ymm13,%ymm13
+	vmovdqu	-24+64-128(%rsi),%ymm12
+	vpaddq	%ymm13,%ymm9,%ymm9
+
+	addq	%r11,%r12
+	imulq	-128(%rsi),%rbx
+	addq	%rbx,%r12
+
+	movq	%r12,%rax
+	imull	%r8d,%eax
+	andl	$0x1fffffff,%eax
+
+	vpmuludq	%ymm10,%ymm0,%ymm0
+	vmovd	%eax,%xmm11
+	vmovdqu	-24+96-128(%rsi),%ymm13
+	vpaddq	%ymm0,%ymm1,%ymm1
+	vpmuludq	%ymm10,%ymm12,%ymm12
+	vpbroadcastq	%xmm11,%ymm11
+	vmovdqu	-24+128-128(%rsi),%ymm0
+	vpaddq	%ymm12,%ymm2,%ymm2
+	vpmuludq	%ymm10,%ymm13,%ymm13
+	vmovdqu	-24+160-128(%rsi),%ymm12
+	vpaddq	%ymm13,%ymm3,%ymm3
+	vpmuludq	%ymm10,%ymm0,%ymm0
+	vmovdqu	-24+192-128(%rsi),%ymm13
+	vpaddq	%ymm0,%ymm4,%ymm4
+	vpmuludq	%ymm10,%ymm12,%ymm12
+	vmovdqu	-24+224-128(%rsi),%ymm0
+	vpaddq	%ymm12,%ymm5,%ymm5
+	vpmuludq	%ymm10,%ymm13,%ymm13
+	vmovdqu	-24+256-128(%rsi),%ymm12
+	vpaddq	%ymm13,%ymm6,%ymm6
+	vpmuludq	%ymm10,%ymm0,%ymm0
+	vmovdqu	-24+288-128(%rsi),%ymm13
+	vpaddq	%ymm0,%ymm7,%ymm7
+	vpmuludq	%ymm10,%ymm12,%ymm12
+	vpaddq	%ymm12,%ymm8,%ymm8
+	vpmuludq	%ymm10,%ymm13,%ymm13
+	vpbroadcastq	32(%r13),%ymm10
+	vpaddq	%ymm13,%ymm9,%ymm9
+	addq	$32,%r13
+
+	vmovdqu	-24+32-128(%rcx),%ymm0
+	imulq	-128(%rcx),%rax
+	addq	%rax,%r12
+	shrq	$29,%r12
+
+	vmovdqu	-24+64-128(%rcx),%ymm12
+	vpmuludq	%ymm11,%ymm0,%ymm0
+	vmovq	%xmm10,%rbx
+	vmovdqu	-24+96-128(%rcx),%ymm13
+	vpaddq	%ymm0,%ymm1,%ymm0
+	vpmuludq	%ymm11,%ymm12,%ymm12
+	vmovdqu	%ymm0,(%rsp)
+	vpaddq	%ymm12,%ymm2,%ymm1
+	vmovdqu	-24+128-128(%rcx),%ymm0
+	vpmuludq	%ymm11,%ymm13,%ymm13
+	vmovdqu	-24+160-128(%rcx),%ymm12
+	vpaddq	%ymm13,%ymm3,%ymm2
+	vpmuludq	%ymm11,%ymm0,%ymm0
+	vmovdqu	-24+192-128(%rcx),%ymm13
+	vpaddq	%ymm0,%ymm4,%ymm3
+	vpmuludq	%ymm11,%ymm12,%ymm12
+	vmovdqu	-24+224-128(%rcx),%ymm0
+	vpaddq	%ymm12,%ymm5,%ymm4
+	vpmuludq	%ymm11,%ymm13,%ymm13
+	vmovdqu	-24+256-128(%rcx),%ymm12
+	vpaddq	%ymm13,%ymm6,%ymm5
+	vpmuludq	%ymm11,%ymm0,%ymm0
+	vmovdqu	-24+288-128(%rcx),%ymm13
+	movq	%r12,%r9
+	vpaddq	%ymm0,%ymm7,%ymm6
+	vpmuludq	%ymm11,%ymm12,%ymm12
+	addq	(%rsp),%r9
+	vpaddq	%ymm12,%ymm8,%ymm7
+	vpmuludq	%ymm11,%ymm13,%ymm13
+	vmovq	%r12,%xmm12
+	vpaddq	%ymm13,%ymm9,%ymm8
+
+	decl	%r14d
+	jnz	.Loop_mul_1024
+	vpaddq	(%rsp),%ymm12,%ymm0
+
+	vpsrlq	$29,%ymm0,%ymm12
+	vpand	%ymm15,%ymm0,%ymm0
+	vpsrlq	$29,%ymm1,%ymm13
+	vpand	%ymm15,%ymm1,%ymm1
+	vpsrlq	$29,%ymm2,%ymm10
+	vpermq	$0x93,%ymm12,%ymm12
+	vpand	%ymm15,%ymm2,%ymm2
+	vpsrlq	$29,%ymm3,%ymm11
+	vpermq	$0x93,%ymm13,%ymm13
+	vpand	%ymm15,%ymm3,%ymm3
+
+	vpblendd	$3,%ymm14,%ymm12,%ymm9
+	vpermq	$0x93,%ymm10,%ymm10
+	vpblendd	$3,%ymm12,%ymm13,%ymm12
+	vpermq	$0x93,%ymm11,%ymm11
+	vpaddq	%ymm9,%ymm0,%ymm0
+	vpblendd	$3,%ymm13,%ymm10,%ymm13
+	vpaddq	%ymm12,%ymm1,%ymm1
+	vpblendd	$3,%ymm10,%ymm11,%ymm10
+	vpaddq	%ymm13,%ymm2,%ymm2
+	vpblendd	$3,%ymm11,%ymm14,%ymm11
+	vpaddq	%ymm10,%ymm3,%ymm3
+	vpaddq	%ymm11,%ymm4,%ymm4
+
+	vpsrlq	$29,%ymm0,%ymm12
+	vpand	%ymm15,%ymm0,%ymm0
+	vpsrlq	$29,%ymm1,%ymm13
+	vpand	%ymm15,%ymm1,%ymm1
+	vpsrlq	$29,%ymm2,%ymm10
+	vpermq	$0x93,%ymm12,%ymm12
+	vpand	%ymm15,%ymm2,%ymm2
+	vpsrlq	$29,%ymm3,%ymm11
+	vpermq	$0x93,%ymm13,%ymm13
+	vpand	%ymm15,%ymm3,%ymm3
+	vpermq	$0x93,%ymm10,%ymm10
+
+	vpblendd	$3,%ymm14,%ymm12,%ymm9
+	vpermq	$0x93,%ymm11,%ymm11
+	vpblendd	$3,%ymm12,%ymm13,%ymm12
+	vpaddq	%ymm9,%ymm0,%ymm0
+	vpblendd	$3,%ymm13,%ymm10,%ymm13
+	vpaddq	%ymm12,%ymm1,%ymm1
+	vpblendd	$3,%ymm10,%ymm11,%ymm10
+	vpaddq	%ymm13,%ymm2,%ymm2
+	vpblendd	$3,%ymm11,%ymm14,%ymm11
+	vpaddq	%ymm10,%ymm3,%ymm3
+	vpaddq	%ymm11,%ymm4,%ymm4
+
+	vmovdqu	%ymm0,0-128(%rdi)
+	vmovdqu	%ymm1,32-128(%rdi)
+	vmovdqu	%ymm2,64-128(%rdi)
+	vmovdqu	%ymm3,96-128(%rdi)
+	vpsrlq	$29,%ymm4,%ymm12
+	vpand	%ymm15,%ymm4,%ymm4
+	vpsrlq	$29,%ymm5,%ymm13
+	vpand	%ymm15,%ymm5,%ymm5
+	vpsrlq	$29,%ymm6,%ymm10
+	vpermq	$0x93,%ymm12,%ymm12
+	vpand	%ymm15,%ymm6,%ymm6
+	vpsrlq	$29,%ymm7,%ymm11
+	vpermq	$0x93,%ymm13,%ymm13
+	vpand	%ymm15,%ymm7,%ymm7
+	vpsrlq	$29,%ymm8,%ymm0
+	vpermq	$0x93,%ymm10,%ymm10
+	vpand	%ymm15,%ymm8,%ymm8
+	vpermq	$0x93,%ymm11,%ymm11
+
+	vpblendd	$3,%ymm14,%ymm12,%ymm9
+	vpermq	$0x93,%ymm0,%ymm0
+	vpblendd	$3,%ymm12,%ymm13,%ymm12
+	vpaddq	%ymm9,%ymm4,%ymm4
+	vpblendd	$3,%ymm13,%ymm10,%ymm13
+	vpaddq	%ymm12,%ymm5,%ymm5
+	vpblendd	$3,%ymm10,%ymm11,%ymm10
+	vpaddq	%ymm13,%ymm6,%ymm6
+	vpblendd	$3,%ymm11,%ymm0,%ymm11
+	vpaddq	%ymm10,%ymm7,%ymm7
+	vpaddq	%ymm11,%ymm8,%ymm8
+
+	vpsrlq	$29,%ymm4,%ymm12
+	vpand	%ymm15,%ymm4,%ymm4
+	vpsrlq	$29,%ymm5,%ymm13
+	vpand	%ymm15,%ymm5,%ymm5
+	vpsrlq	$29,%ymm6,%ymm10
+	vpermq	$0x93,%ymm12,%ymm12
+	vpand	%ymm15,%ymm6,%ymm6
+	vpsrlq	$29,%ymm7,%ymm11
+	vpermq	$0x93,%ymm13,%ymm13
+	vpand	%ymm15,%ymm7,%ymm7
+	vpsrlq	$29,%ymm8,%ymm0
+	vpermq	$0x93,%ymm10,%ymm10
+	vpand	%ymm15,%ymm8,%ymm8
+	vpermq	$0x93,%ymm11,%ymm11
+
+	vpblendd	$3,%ymm14,%ymm12,%ymm9
+	vpermq	$0x93,%ymm0,%ymm0
+	vpblendd	$3,%ymm12,%ymm13,%ymm12
+	vpaddq	%ymm9,%ymm4,%ymm4
+	vpblendd	$3,%ymm13,%ymm10,%ymm13
+	vpaddq	%ymm12,%ymm5,%ymm5
+	vpblendd	$3,%ymm10,%ymm11,%ymm10
+	vpaddq	%ymm13,%ymm6,%ymm6
+	vpblendd	$3,%ymm11,%ymm0,%ymm11
+	vpaddq	%ymm10,%ymm7,%ymm7
+	vpaddq	%ymm11,%ymm8,%ymm8
+
+	vmovdqu	%ymm4,128-128(%rdi)
+	vmovdqu	%ymm5,160-128(%rdi)
+	vmovdqu	%ymm6,192-128(%rdi)
+	vmovdqu	%ymm7,224-128(%rdi)
+	vmovdqu	%ymm8,256-128(%rdi)
+	vzeroupper
+
+	movq	%rbp,%rax
+.cfi_def_cfa_register	%rax
+	movq	-48(%rax),%r15
+.cfi_restore	%r15
+	movq	-40(%rax),%r14
+.cfi_restore	%r14
+	movq	-32(%rax),%r13
+.cfi_restore	%r13
+	movq	-24(%rax),%r12
+.cfi_restore	%r12
+	movq	-16(%rax),%rbp
+.cfi_restore	%rbp
+	movq	-8(%rax),%rbx
+.cfi_restore	%rbx
+	leaq	(%rax),%rsp
+.cfi_def_cfa_register	%rsp
+.Lmul_1024_epilogue:
+	.byte	0xf3,0xc3
+.cfi_endproc	
+.size	rsaz_1024_mul_avx2,.-rsaz_1024_mul_avx2
+.globl	rsaz_1024_red2norm_avx2
+.hidden rsaz_1024_red2norm_avx2
+.type	rsaz_1024_red2norm_avx2,@function
+.align	32
+rsaz_1024_red2norm_avx2:
+.cfi_startproc	
+	subq	$-128,%rsi
+	xorq	%rax,%rax
+	movq	-128(%rsi),%r8
+	movq	-120(%rsi),%r9
+	movq	-112(%rsi),%r10
+	shlq	$0,%r8
+	shlq	$29,%r9
+	movq	%r10,%r11
+	shlq	$58,%r10
+	shrq	$6,%r11
+	addq	%r8,%rax
+	addq	%r9,%rax
+	addq	%r10,%rax
+	adcq	$0,%r11
+	movq	%rax,0(%rdi)
+	movq	%r11,%rax
+	movq	-104(%rsi),%r8
+	movq	-96(%rsi),%r9
+	shlq	$23,%r8
+	movq	%r9,%r10
+	shlq	$52,%r9
+	shrq	$12,%r10
+	addq	%r8,%rax
+	addq	%r9,%rax
+	adcq	$0,%r10
+	movq	%rax,8(%rdi)
+	movq	%r10,%rax
+	movq	-88(%rsi),%r11
+	movq	-80(%rsi),%r8
+	shlq	$17,%r11
+	movq	%r8,%r9
+	shlq	$46,%r8
+	shrq	$18,%r9
+	addq	%r11,%rax
+	addq	%r8,%rax
+	adcq	$0,%r9
+	movq	%rax,16(%rdi)
+	movq	%r9,%rax
+	movq	-72(%rsi),%r10
+	movq	-64(%rsi),%r11
+	shlq	$11,%r10
+	movq	%r11,%r8
+	shlq	$40,%r11
+	shrq	$24,%r8
+	addq	%r10,%rax
+	addq	%r11,%rax
+	adcq	$0,%r8
+	movq	%rax,24(%rdi)
+	movq	%r8,%rax
+	movq	-56(%rsi),%r9
+	movq	-48(%rsi),%r10
+	movq	-40(%rsi),%r11
+	shlq	$5,%r9
+	shlq	$34,%r10
+	movq	%r11,%r8
+	shlq	$63,%r11
+	shrq	$1,%r8
+	addq	%r9,%rax
+	addq	%r10,%rax
+	addq	%r11,%rax
+	adcq	$0,%r8
+	movq	%rax,32(%rdi)
+	movq	%r8,%rax
+	movq	-32(%rsi),%r9
+	movq	-24(%rsi),%r10
+	shlq	$28,%r9
+	movq	%r10,%r11
+	shlq	$57,%r10
+	shrq	$7,%r11
+	addq	%r9,%rax
+	addq	%r10,%rax
+	adcq	$0,%r11
+	movq	%rax,40(%rdi)
+	movq	%r11,%rax
+	movq	-16(%rsi),%r8
+	movq	-8(%rsi),%r9
+	shlq	$22,%r8
+	movq	%r9,%r10
+	shlq	$51,%r9
+	shrq	$13,%r10
+	addq	%r8,%rax
+	addq	%r9,%rax
+	adcq	$0,%r10
+	movq	%rax,48(%rdi)
+	movq	%r10,%rax
+	movq	0(%rsi),%r11
+	movq	8(%rsi),%r8
+	shlq	$16,%r11
+	movq	%r8,%r9
+	shlq	$45,%r8
+	shrq	$19,%r9
+	addq	%r11,%rax
+	addq	%r8,%rax
+	adcq	$0,%r9
+	movq	%rax,56(%rdi)
+	movq	%r9,%rax
+	movq	16(%rsi),%r10
+	movq	24(%rsi),%r11
+	shlq	$10,%r10
+	movq	%r11,%r8
+	shlq	$39,%r11
+	shrq	$25,%r8
+	addq	%r10,%rax
+	addq	%r11,%rax
+	adcq	$0,%r8
+	movq	%rax,64(%rdi)
+	movq	%r8,%rax
+	movq	32(%rsi),%r9
+	movq	40(%rsi),%r10
+	movq	48(%rsi),%r11
+	shlq	$4,%r9
+	shlq	$33,%r10
+	movq	%r11,%r8
+	shlq	$62,%r11
+	shrq	$2,%r8
+	addq	%r9,%rax
+	addq	%r10,%rax
+	addq	%r11,%rax
+	adcq	$0,%r8
+	movq	%rax,72(%rdi)
+	movq	%r8,%rax
+	movq	56(%rsi),%r9
+	movq	64(%rsi),%r10
+	shlq	$27,%r9
+	movq	%r10,%r11
+	shlq	$56,%r10
+	shrq	$8,%r11
+	addq	%r9,%rax
+	addq	%r10,%rax
+	adcq	$0,%r11
+	movq	%rax,80(%rdi)
+	movq	%r11,%rax
+	movq	72(%rsi),%r8
+	movq	80(%rsi),%r9
+	shlq	$21,%r8
+	movq	%r9,%r10
+	shlq	$50,%r9
+	shrq	$14,%r10
+	addq	%r8,%rax
+	addq	%r9,%rax
+	adcq	$0,%r10
+	movq	%rax,88(%rdi)
+	movq	%r10,%rax
+	movq	88(%rsi),%r11
+	movq	96(%rsi),%r8
+	shlq	$15,%r11
+	movq	%r8,%r9
+	shlq	$44,%r8
+	shrq	$20,%r9
+	addq	%r11,%rax
+	addq	%r8,%rax
+	adcq	$0,%r9
+	movq	%rax,96(%rdi)
+	movq	%r9,%rax
+	movq	104(%rsi),%r10
+	movq	112(%rsi),%r11
+	shlq	$9,%r10
+	movq	%r11,%r8
+	shlq	$38,%r11
+	shrq	$26,%r8
+	addq	%r10,%rax
+	addq	%r11,%rax
+	adcq	$0,%r8
+	movq	%rax,104(%rdi)
+	movq	%r8,%rax
+	movq	120(%rsi),%r9
+	movq	128(%rsi),%r10
+	movq	136(%rsi),%r11
+	shlq	$3,%r9
+	shlq	$32,%r10
+	movq	%r11,%r8
+	shlq	$61,%r11
+	shrq	$3,%r8
+	addq	%r9,%rax
+	addq	%r10,%rax
+	addq	%r11,%rax
+	adcq	$0,%r8
+	movq	%rax,112(%rdi)
+	movq	%r8,%rax
+	movq	144(%rsi),%r9
+	movq	152(%rsi),%r10
+	shlq	$26,%r9
+	movq	%r10,%r11
+	shlq	$55,%r10
+	shrq	$9,%r11
+	addq	%r9,%rax
+	addq	%r10,%rax
+	adcq	$0,%r11
+	movq	%rax,120(%rdi)
+	movq	%r11,%rax
+	.byte	0xf3,0xc3
+.cfi_endproc	
+.size	rsaz_1024_red2norm_avx2,.-rsaz_1024_red2norm_avx2
+
+.globl	rsaz_1024_norm2red_avx2
+.hidden rsaz_1024_norm2red_avx2
+.type	rsaz_1024_norm2red_avx2,@function
+.align	32
+rsaz_1024_norm2red_avx2:
+.cfi_startproc	
+	subq	$-128,%rdi
+	movq	(%rsi),%r8
+	movl	$0x1fffffff,%eax
+	movq	8(%rsi),%r9
+	movq	%r8,%r11
+	shrq	$0,%r11
+	andq	%rax,%r11
+	movq	%r11,-128(%rdi)
+	movq	%r8,%r10
+	shrq	$29,%r10
+	andq	%rax,%r10
+	movq	%r10,-120(%rdi)
+	shrdq	$58,%r9,%r8
+	andq	%rax,%r8
+	movq	%r8,-112(%rdi)
+	movq	16(%rsi),%r10
+	movq	%r9,%r8
+	shrq	$23,%r8
+	andq	%rax,%r8
+	movq	%r8,-104(%rdi)
+	shrdq	$52,%r10,%r9
+	andq	%rax,%r9
+	movq	%r9,-96(%rdi)
+	movq	24(%rsi),%r11
+	movq	%r10,%r9
+	shrq	$17,%r9
+	andq	%rax,%r9
+	movq	%r9,-88(%rdi)
+	shrdq	$46,%r11,%r10
+	andq	%rax,%r10
+	movq	%r10,-80(%rdi)
+	movq	32(%rsi),%r8
+	movq	%r11,%r10
+	shrq	$11,%r10
+	andq	%rax,%r10
+	movq	%r10,-72(%rdi)
+	shrdq	$40,%r8,%r11
+	andq	%rax,%r11
+	movq	%r11,-64(%rdi)
+	movq	40(%rsi),%r9
+	movq	%r8,%r11
+	shrq	$5,%r11
+	andq	%rax,%r11
+	movq	%r11,-56(%rdi)
+	movq	%r8,%r10
+	shrq	$34,%r10
+	andq	%rax,%r10
+	movq	%r10,-48(%rdi)
+	shrdq	$63,%r9,%r8
+	andq	%rax,%r8
+	movq	%r8,-40(%rdi)
+	movq	48(%rsi),%r10
+	movq	%r9,%r8
+	shrq	$28,%r8
+	andq	%rax,%r8
+	movq	%r8,-32(%rdi)
+	shrdq	$57,%r10,%r9
+	andq	%rax,%r9
+	movq	%r9,-24(%rdi)
+	movq	56(%rsi),%r11
+	movq	%r10,%r9
+	shrq	$22,%r9
+	andq	%rax,%r9
+	movq	%r9,-16(%rdi)
+	shrdq	$51,%r11,%r10
+	andq	%rax,%r10
+	movq	%r10,-8(%rdi)
+	movq	64(%rsi),%r8
+	movq	%r11,%r10
+	shrq	$16,%r10
+	andq	%rax,%r10
+	movq	%r10,0(%rdi)
+	shrdq	$45,%r8,%r11
+	andq	%rax,%r11
+	movq	%r11,8(%rdi)
+	movq	72(%rsi),%r9
+	movq	%r8,%r11
+	shrq	$10,%r11
+	andq	%rax,%r11
+	movq	%r11,16(%rdi)
+	shrdq	$39,%r9,%r8
+	andq	%rax,%r8
+	movq	%r8,24(%rdi)
+	movq	80(%rsi),%r10
+	movq	%r9,%r8
+	shrq	$4,%r8
+	andq	%rax,%r8
+	movq	%r8,32(%rdi)
+	movq	%r9,%r11
+	shrq	$33,%r11
+	andq	%rax,%r11
+	movq	%r11,40(%rdi)
+	shrdq	$62,%r10,%r9
+	andq	%rax,%r9
+	movq	%r9,48(%rdi)
+	movq	88(%rsi),%r11
+	movq	%r10,%r9
+	shrq	$27,%r9
+	andq	%rax,%r9
+	movq	%r9,56(%rdi)
+	shrdq	$56,%r11,%r10
+	andq	%rax,%r10
+	movq	%r10,64(%rdi)
+	movq	96(%rsi),%r8
+	movq	%r11,%r10
+	shrq	$21,%r10
+	andq	%rax,%r10
+	movq	%r10,72(%rdi)
+	shrdq	$50,%r8,%r11
+	andq	%rax,%r11
+	movq	%r11,80(%rdi)
+	movq	104(%rsi),%r9
+	movq	%r8,%r11
+	shrq	$15,%r11
+	andq	%rax,%r11
+	movq	%r11,88(%rdi)
+	shrdq	$44,%r9,%r8
+	andq	%rax,%r8
+	movq	%r8,96(%rdi)
+	movq	112(%rsi),%r10
+	movq	%r9,%r8
+	shrq	$9,%r8
+	andq	%rax,%r8
+	movq	%r8,104(%rdi)
+	shrdq	$38,%r10,%r9
+	andq	%rax,%r9
+	movq	%r9,112(%rdi)
+	movq	120(%rsi),%r11
+	movq	%r10,%r9
+	shrq	$3,%r9
+	andq	%rax,%r9
+	movq	%r9,120(%rdi)
+	movq	%r10,%r8
+	shrq	$32,%r8
+	andq	%rax,%r8
+	movq	%r8,128(%rdi)
+	shrdq	$61,%r11,%r10
+	andq	%rax,%r10
+	movq	%r10,136(%rdi)
+	xorq	%r8,%r8
+	movq	%r11,%r10
+	shrq	$26,%r10
+	andq	%rax,%r10
+	movq	%r10,144(%rdi)
+	shrdq	$55,%r8,%r11
+	andq	%rax,%r11
+	movq	%r11,152(%rdi)
+	movq	%r8,160(%rdi)
+	movq	%r8,168(%rdi)
+	movq	%r8,176(%rdi)
+	movq	%r8,184(%rdi)
+	.byte	0xf3,0xc3
+.cfi_endproc	
+.size	rsaz_1024_norm2red_avx2,.-rsaz_1024_norm2red_avx2
+.globl	rsaz_1024_scatter5_avx2
+.hidden rsaz_1024_scatter5_avx2
+.type	rsaz_1024_scatter5_avx2,@function
+.align	32
+rsaz_1024_scatter5_avx2:
+.cfi_startproc	
+	vzeroupper
+	vmovdqu	.Lscatter_permd(%rip),%ymm5
+	shll	$4,%edx
+	leaq	(%rdi,%rdx,1),%rdi
+	movl	$9,%eax
+	jmp	.Loop_scatter_1024
+
+.align	32
+.Loop_scatter_1024:
+	vmovdqu	(%rsi),%ymm0
+	leaq	32(%rsi),%rsi
+	vpermd	%ymm0,%ymm5,%ymm0
+	vmovdqu	%xmm0,(%rdi)
+	leaq	512(%rdi),%rdi
+	decl	%eax
+	jnz	.Loop_scatter_1024
+
+	vzeroupper
+	.byte	0xf3,0xc3
+.cfi_endproc	
+.size	rsaz_1024_scatter5_avx2,.-rsaz_1024_scatter5_avx2
+
+.globl	rsaz_1024_gather5_avx2
+.hidden rsaz_1024_gather5_avx2
+.type	rsaz_1024_gather5_avx2,@function
+.align	32
+rsaz_1024_gather5_avx2:
+.cfi_startproc	
+	vzeroupper
+	movq	%rsp,%r11
+.cfi_def_cfa_register	%r11
+	leaq	-256(%rsp),%rsp
+	andq	$-32,%rsp
+	leaq	.Linc(%rip),%r10
+	leaq	-128(%rsp),%rax
+
+	vmovd	%edx,%xmm4
+	vmovdqa	(%r10),%ymm0
+	vmovdqa	32(%r10),%ymm1
+	vmovdqa	64(%r10),%ymm5
+	vpbroadcastd	%xmm4,%ymm4
+
+	vpaddd	%ymm5,%ymm0,%ymm2
+	vpcmpeqd	%ymm4,%ymm0,%ymm0
+	vpaddd	%ymm5,%ymm1,%ymm3
+	vpcmpeqd	%ymm4,%ymm1,%ymm1
+	vmovdqa	%ymm0,0+128(%rax)
+	vpaddd	%ymm5,%ymm2,%ymm0
+	vpcmpeqd	%ymm4,%ymm2,%ymm2
+	vmovdqa	%ymm1,32+128(%rax)
+	vpaddd	%ymm5,%ymm3,%ymm1
+	vpcmpeqd	%ymm4,%ymm3,%ymm3
+	vmovdqa	%ymm2,64+128(%rax)
+	vpaddd	%ymm5,%ymm0,%ymm2
+	vpcmpeqd	%ymm4,%ymm0,%ymm0
+	vmovdqa	%ymm3,96+128(%rax)
+	vpaddd	%ymm5,%ymm1,%ymm3
+	vpcmpeqd	%ymm4,%ymm1,%ymm1
+	vmovdqa	%ymm0,128+128(%rax)
+	vpaddd	%ymm5,%ymm2,%ymm8
+	vpcmpeqd	%ymm4,%ymm2,%ymm2
+	vmovdqa	%ymm1,160+128(%rax)
+	vpaddd	%ymm5,%ymm3,%ymm9
+	vpcmpeqd	%ymm4,%ymm3,%ymm3
+	vmovdqa	%ymm2,192+128(%rax)
+	vpaddd	%ymm5,%ymm8,%ymm10
+	vpcmpeqd	%ymm4,%ymm8,%ymm8
+	vmovdqa	%ymm3,224+128(%rax)
+	vpaddd	%ymm5,%ymm9,%ymm11
+	vpcmpeqd	%ymm4,%ymm9,%ymm9
+	vpaddd	%ymm5,%ymm10,%ymm12
+	vpcmpeqd	%ymm4,%ymm10,%ymm10
+	vpaddd	%ymm5,%ymm11,%ymm13
+	vpcmpeqd	%ymm4,%ymm11,%ymm11
+	vpaddd	%ymm5,%ymm12,%ymm14
+	vpcmpeqd	%ymm4,%ymm12,%ymm12
+	vpaddd	%ymm5,%ymm13,%ymm15
+	vpcmpeqd	%ymm4,%ymm13,%ymm13
+	vpcmpeqd	%ymm4,%ymm14,%ymm14
+	vpcmpeqd	%ymm4,%ymm15,%ymm15
+
+	vmovdqa	-32(%r10),%ymm7
+	leaq	128(%rsi),%rsi
+	movl	$9,%edx
+
+.Loop_gather_1024:
+	vmovdqa	0-128(%rsi),%ymm0
+	vmovdqa	32-128(%rsi),%ymm1
+	vmovdqa	64-128(%rsi),%ymm2
+	vmovdqa	96-128(%rsi),%ymm3
+	vpand	0+128(%rax),%ymm0,%ymm0
+	vpand	32+128(%rax),%ymm1,%ymm1
+	vpand	64+128(%rax),%ymm2,%ymm2
+	vpor	%ymm0,%ymm1,%ymm4
+	vpand	96+128(%rax),%ymm3,%ymm3
+	vmovdqa	128-128(%rsi),%ymm0
+	vmovdqa	160-128(%rsi),%ymm1
+	vpor	%ymm2,%ymm3,%ymm5
+	vmovdqa	192-128(%rsi),%ymm2
+	vmovdqa	224-128(%rsi),%ymm3
+	vpand	128+128(%rax),%ymm0,%ymm0
+	vpand	160+128(%rax),%ymm1,%ymm1
+	vpand	192+128(%rax),%ymm2,%ymm2
+	vpor	%ymm0,%ymm4,%ymm4
+	vpand	224+128(%rax),%ymm3,%ymm3
+	vpand	256-128(%rsi),%ymm8,%ymm0
+	vpor	%ymm1,%ymm5,%ymm5
+	vpand	288-128(%rsi),%ymm9,%ymm1
+	vpor	%ymm2,%ymm4,%ymm4
+	vpand	320-128(%rsi),%ymm10,%ymm2
+	vpor	%ymm3,%ymm5,%ymm5
+	vpand	352-128(%rsi),%ymm11,%ymm3
+	vpor	%ymm0,%ymm4,%ymm4
+	vpand	384-128(%rsi),%ymm12,%ymm0
+	vpor	%ymm1,%ymm5,%ymm5
+	vpand	416-128(%rsi),%ymm13,%ymm1
+	vpor	%ymm2,%ymm4,%ymm4
+	vpand	448-128(%rsi),%ymm14,%ymm2
+	vpor	%ymm3,%ymm5,%ymm5
+	vpand	480-128(%rsi),%ymm15,%ymm3
+	leaq	512(%rsi),%rsi
+	vpor	%ymm0,%ymm4,%ymm4
+	vpor	%ymm1,%ymm5,%ymm5
+	vpor	%ymm2,%ymm4,%ymm4
+	vpor	%ymm3,%ymm5,%ymm5
+
+	vpor	%ymm5,%ymm4,%ymm4
+	vextracti128	$1,%ymm4,%xmm5
+	vpor	%xmm4,%xmm5,%xmm5
+	vpermd	%ymm5,%ymm7,%ymm5
+	vmovdqu	%ymm5,(%rdi)
+	leaq	32(%rdi),%rdi
+	decl	%edx
+	jnz	.Loop_gather_1024
+
+	vpxor	%ymm0,%ymm0,%ymm0
+	vmovdqu	%ymm0,(%rdi)
+	vzeroupper
+	leaq	(%r11),%rsp
+.cfi_def_cfa_register	%rsp
+	.byte	0xf3,0xc3
+.cfi_endproc	
+.LSEH_end_rsaz_1024_gather5:
+.size	rsaz_1024_gather5_avx2,.-rsaz_1024_gather5_avx2
+.align	64
+.Land_mask:
+.quad	0x1fffffff,0x1fffffff,0x1fffffff,0x1fffffff
+.Lscatter_permd:
+.long	0,2,4,6,7,7,7,7
+.Lgather_permd:
+.long	0,7,1,7,2,7,3,7
+.Linc:
+.long	0,0,0,0, 1,1,1,1
+.long	2,2,2,2, 3,3,3,3
+.long	4,4,4,4, 4,4,4,4
+.align	64
+#endif
+.section	.note.GNU-stack,"",@progbits
--- /dev/null
+++ b/third_party/boringssl/linux-x86_64/crypto/fipsmodule/sha1-x86_64.S
@@ -1,0 +1,5468 @@
+// This file is generated from a similarly-named Perl script in the BoringSSL
+// source tree. Do not edit by hand.
+
+#if defined(__has_feature)
+#if __has_feature(memory_sanitizer) && !defined(OPENSSL_NO_ASM)
+#define OPENSSL_NO_ASM
+#endif
+#endif
+
+#if defined(__x86_64__) && !defined(OPENSSL_NO_ASM)
+#if defined(BORINGSSL_PREFIX)
+#include <boringssl_prefix_symbols_asm.h>
+#endif
+.text	
+.extern	OPENSSL_ia32cap_P
+.hidden OPENSSL_ia32cap_P
+
+.globl	sha1_block_data_order
+.hidden sha1_block_data_order
+.type	sha1_block_data_order,@function
+.align	16
+sha1_block_data_order:
+.cfi_startproc	
+	leaq	OPENSSL_ia32cap_P(%rip),%r10
+	movl	0(%r10),%r9d
+	movl	4(%r10),%r8d
+	movl	8(%r10),%r10d
+	testl	$512,%r8d
+	jz	.Lialu
+	testl	$536870912,%r10d
+	jnz	_shaext_shortcut
+	andl	$296,%r10d
+	cmpl	$296,%r10d
+	je	_avx2_shortcut
+	andl	$268435456,%r8d
+	andl	$1073741824,%r9d
+	orl	%r9d,%r8d
+	cmpl	$1342177280,%r8d
+	je	_avx_shortcut
+	jmp	_ssse3_shortcut
+
+.align	16
+.Lialu:
+	movq	%rsp,%rax
+.cfi_def_cfa_register	%rax
+	pushq	%rbx
+.cfi_offset	%rbx,-16
+	pushq	%rbp
+.cfi_offset	%rbp,-24
+	pushq	%r12
+.cfi_offset	%r12,-32
+	pushq	%r13
+.cfi_offset	%r13,-40
+	pushq	%r14
+.cfi_offset	%r14,-48
+	movq	%rdi,%r8
+	subq	$72,%rsp
+	movq	%rsi,%r9
+	andq	$-64,%rsp
+	movq	%rdx,%r10
+	movq	%rax,64(%rsp)
+.cfi_escape	0x0f,0x06,0x77,0xc0,0x00,0x06,0x23,0x08
+.Lprologue:
+
+	movl	0(%r8),%esi
+	movl	4(%r8),%edi
+	movl	8(%r8),%r11d
+	movl	12(%r8),%r12d
+	movl	16(%r8),%r13d
+	jmp	.Lloop
+
+.align	16
+.Lloop:
+	movl	0(%r9),%edx
+	bswapl	%edx
+	movl	4(%r9),%ebp
+	movl	%r12d,%eax
+	movl	%edx,0(%rsp)
+	movl	%esi,%ecx
+	bswapl	%ebp
+	xorl	%r11d,%eax
+	roll	$5,%ecx
+	andl	%edi,%eax
+	leal	1518500249(%rdx,%r13,1),%r13d
+	addl	%ecx,%r13d
+	xorl	%r12d,%eax
+	roll	$30,%edi
+	addl	%eax,%r13d
+	movl	8(%r9),%r14d
+	movl	%r11d,%eax
+	movl	%ebp,4(%rsp)
+	movl	%r13d,%ecx
+	bswapl	%r14d
+	xorl	%edi,%eax
+	roll	$5,%ecx
+	andl	%esi,%eax
+	leal	1518500249(%rbp,%r12,1),%r12d
+	addl	%ecx,%r12d
+	xorl	%r11d,%eax
+	roll	$30,%esi
+	addl	%eax,%r12d
+	movl	12(%r9),%edx
+	movl	%edi,%eax
+	movl	%r14d,8(%rsp)
+	movl	%r12d,%ecx
+	bswapl	%edx
+	xorl	%esi,%eax
+	roll	$5,%ecx
+	andl	%r13d,%eax
+	leal	1518500249(%r14,%r11,1),%r11d
+	addl	%ecx,%r11d
+	xorl	%edi,%eax
+	roll	$30,%r13d
+	addl	%eax,%r11d
+	movl	16(%r9),%ebp
+	movl	%esi,%eax
+	movl	%edx,12(%rsp)
+	movl	%r11d,%ecx
+	bswapl	%ebp
+	xorl	%r13d,%eax
+	roll	$5,%ecx
+	andl	%r12d,%eax
+	leal	1518500249(%rdx,%rdi,1),%edi
+	addl	%ecx,%edi
+	xorl	%esi,%eax
+	roll	$30,%r12d
+	addl	%eax,%edi
+	movl	20(%r9),%r14d
+	movl	%r13d,%eax
+	movl	%ebp,16(%rsp)
+	movl	%edi,%ecx
+	bswapl	%r14d
+	xorl	%r12d,%eax
+	roll	$5,%ecx
+	andl	%r11d,%eax
+	leal	1518500249(%rbp,%rsi,1),%esi
+	addl	%ecx,%esi
+	xorl	%r13d,%eax
+	roll	$30,%r11d
+	addl	%eax,%esi
+	movl	24(%r9),%edx
+	movl	%r12d,%eax
+	movl	%r14d,20(%rsp)
+	movl	%esi,%ecx
+	bswapl	%edx
+	xorl	%r11d,%eax
+	roll	$5,%ecx
+	andl	%edi,%eax
+	leal	1518500249(%r14,%r13,1),%r13d
+	addl	%ecx,%r13d
+	xorl	%r12d,%eax
+	roll	$30,%edi
+	addl	%eax,%r13d
+	movl	28(%r9),%ebp
+	movl	%r11d,%eax
+	movl	%edx,24(%rsp)
+	movl	%r13d,%ecx
+	bswapl	%ebp
+	xorl	%edi,%eax
+	roll	$5,%ecx
+	andl	%esi,%eax
+	leal	1518500249(%rdx,%r12,1),%r12d
+	addl	%ecx,%r12d
+	xorl	%r11d,%eax
+	roll	$30,%esi
+	addl	%eax,%r12d
+	movl	32(%r9),%r14d
+	movl	%edi,%eax
+	movl	%ebp,28(%rsp)
+	movl	%r12d,%ecx
+	bswapl	%r14d
+	xorl	%esi,%eax
+	roll	$5,%ecx
+	andl	%r13d,%eax
+	leal	1518500249(%rbp,%r11,1),%r11d
+	addl	%ecx,%r11d
+	xorl	%edi,%eax
+	roll	$30,%r13d
+	addl	%eax,%r11d
+	movl	36(%r9),%edx
+	movl	%esi,%eax
+	movl	%r14d,32(%rsp)
+	movl	%r11d,%ecx
+	bswapl	%edx
+	xorl	%r13d,%eax
+	roll	$5,%ecx
+	andl	%r12d,%eax
+	leal	1518500249(%r14,%rdi,1),%edi
+	addl	%ecx,%edi
+	xorl	%esi,%eax
+	roll	$30,%r12d
+	addl	%eax,%edi
+	movl	40(%r9),%ebp
+	movl	%r13d,%eax
+	movl	%edx,36(%rsp)
+	movl	%edi,%ecx
+	bswapl	%ebp
+	xorl	%r12d,%eax
+	roll	$5,%ecx
+	andl	%r11d,%eax
+	leal	1518500249(%rdx,%rsi,1),%esi
+	addl	%ecx,%esi
+	xorl	%r13d,%eax
+	roll	$30,%r11d
+	addl	%eax,%esi
+	movl	44(%r9),%r14d
+	movl	%r12d,%eax
+	movl	%ebp,40(%rsp)
+	movl	%esi,%ecx
+	bswapl	%r14d
+	xorl	%r11d,%eax
+	roll	$5,%ecx
+	andl	%edi,%eax
+	leal	1518500249(%rbp,%r13,1),%r13d
+	addl	%ecx,%r13d
+	xorl	%r12d,%eax
+	roll	$30,%edi
+	addl	%eax,%r13d
+	movl	48(%r9),%edx
+	movl	%r11d,%eax
+	movl	%r14d,44(%rsp)
+	movl	%r13d,%ecx
+	bswapl	%edx
+	xorl	%edi,%eax
+	roll	$5,%ecx
+	andl	%esi,%eax
+	leal	1518500249(%r14,%r12,1),%r12d
+	addl	%ecx,%r12d
+	xorl	%r11d,%eax
+	roll	$30,%esi
+	addl	%eax,%r12d
+	movl	52(%r9),%ebp
+	movl	%edi,%eax
+	movl	%edx,48(%rsp)
+	movl	%r12d,%ecx
+	bswapl	%ebp
+	xorl	%esi,%eax
+	roll	$5,%ecx
+	andl	%r13d,%eax
+	leal	1518500249(%rdx,%r11,1),%r11d
+	addl	%ecx,%r11d
+	xorl	%edi,%eax
+	roll	$30,%r13d
+	addl	%eax,%r11d
+	movl	56(%r9),%r14d
+	movl	%esi,%eax
+	movl	%ebp,52(%rsp)
+	movl	%r11d,%ecx
+	bswapl	%r14d
+	xorl	%r13d,%eax
+	roll	$5,%ecx
+	andl	%r12d,%eax
+	leal	1518500249(%rbp,%rdi,1),%edi
+	addl	%ecx,%edi
+	xorl	%esi,%eax
+	roll	$30,%r12d
+	addl	%eax,%edi
+	movl	60(%r9),%edx
+	movl	%r13d,%eax
+	movl	%r14d,56(%rsp)
+	movl	%edi,%ecx
+	bswapl	%edx
+	xorl	%r12d,%eax
+	roll	$5,%ecx
+	andl	%r11d,%eax
+	leal	1518500249(%r14,%rsi,1),%esi
+	addl	%ecx,%esi
+	xorl	%r13d,%eax
+	roll	$30,%r11d
+	addl	%eax,%esi
+	xorl	0(%rsp),%ebp
+	movl	%r12d,%eax
+	movl	%edx,60(%rsp)
+	movl	%esi,%ecx
+	xorl	8(%rsp),%ebp
+	xorl	%r11d,%eax
+	roll	$5,%ecx
+	xorl	32(%rsp),%ebp
+	andl	%edi,%eax
+	leal	1518500249(%rdx,%r13,1),%r13d
+	roll	$30,%edi
+	xorl	%r12d,%eax
+	addl	%ecx,%r13d
+	roll	$1,%ebp
+	addl	%eax,%r13d
+	xorl	4(%rsp),%r14d
+	movl	%r11d,%eax
+	movl	%ebp,0(%rsp)
+	movl	%r13d,%ecx
+	xorl	12(%rsp),%r14d
+	xorl	%edi,%eax
+	roll	$5,%ecx
+	xorl	36(%rsp),%r14d
+	andl	%esi,%eax
+	leal	1518500249(%rbp,%r12,1),%r12d
+	roll	$30,%esi
+	xorl	%r11d,%eax
+	addl	%ecx,%r12d
+	roll	$1,%r14d
+	addl	%eax,%r12d
+	xorl	8(%rsp),%edx
+	movl	%edi,%eax
+	movl	%r14d,4(%rsp)
+	movl	%r12d,%ecx
+	xorl	16(%rsp),%edx
+	xorl	%esi,%eax
+	roll	$5,%ecx
+	xorl	40(%rsp),%edx
+	andl	%r13d,%eax
+	leal	1518500249(%r14,%r11,1),%r11d
+	roll	$30,%r13d
+	xorl	%edi,%eax
+	addl	%ecx,%r11d
+	roll	$1,%edx
+	addl	%eax,%r11d
+	xorl	12(%rsp),%ebp
+	movl	%esi,%eax
+	movl	%edx,8(%rsp)
+	movl	%r11d,%ecx
+	xorl	20(%rsp),%ebp
+	xorl	%r13d,%eax
+	roll	$5,%ecx
+	xorl	44(%rsp),%ebp
+	andl	%r12d,%eax
+	leal	1518500249(%rdx,%rdi,1),%edi
+	roll	$30,%r12d
+	xorl	%esi,%eax
+	addl	%ecx,%edi
+	roll	$1,%ebp
+	addl	%eax,%edi
+	xorl	16(%rsp),%r14d
+	movl	%r13d,%eax
+	movl	%ebp,12(%rsp)
+	movl	%edi,%ecx
+	xorl	24(%rsp),%r14d
+	xorl	%r12d,%eax
+	roll	$5,%ecx
+	xorl	48(%rsp),%r14d
+	andl	%r11d,%eax
+	leal	1518500249(%rbp,%rsi,1),%esi
+	roll	$30,%r11d
+	xorl	%r13d,%eax
+	addl	%ecx,%esi
+	roll	$1,%r14d
+	addl	%eax,%esi
+	xorl	20(%rsp),%edx
+	movl	%edi,%eax
+	movl	%r14d,16(%rsp)
+	movl	%esi,%ecx
+	xorl	28(%rsp),%edx
+	xorl	%r12d,%eax
+	roll	$5,%ecx
+	xorl	52(%rsp),%edx
+	leal	1859775393(%r14,%r13,1),%r13d
+	xorl	%r11d,%eax
+	addl	%ecx,%r13d
+	roll	$30,%edi
+	addl	%eax,%r13d
+	roll	$1,%edx
+	xorl	24(%rsp),%ebp
+	movl	%esi,%eax
+	movl	%edx,20(%rsp)
+	movl	%r13d,%ecx
+	xorl	32(%rsp),%ebp
+	xorl	%r11d,%eax
+	roll	$5,%ecx
+	xorl	56(%rsp),%ebp
+	leal	1859775393(%rdx,%r12,1),%r12d
+	xorl	%edi,%eax
+	addl	%ecx,%r12d
+	roll	$30,%esi
+	addl	%eax,%r12d
+	roll	$1,%ebp
+	xorl	28(%rsp),%r14d
+	movl	%r13d,%eax
+	movl	%ebp,24(%rsp)
+	movl	%r12d,%ecx
+	xorl	36(%rsp),%r14d
+	xorl	%edi,%eax
+	roll	$5,%ecx
+	xorl	60(%rsp),%r14d
+	leal	1859775393(%rbp,%r11,1),%r11d
+	xorl	%esi,%eax
+	addl	%ecx,%r11d
+	roll	$30,%r13d
+	addl	%eax,%r11d
+	roll	$1,%r14d
+	xorl	32(%rsp),%edx
+	movl	%r12d,%eax
+	movl	%r14d,28(%rsp)
+	movl	%r11d,%ecx
+	xorl	40(%rsp),%edx
+	xorl	%esi,%eax
+	roll	$5,%ecx
+	xorl	0(%rsp),%edx
+	leal	1859775393(%r14,%rdi,1),%edi
+	xorl	%r13d,%eax
+	addl	%ecx,%edi
+	roll	$30,%r12d
+	addl	%eax,%edi
+	roll	$1,%edx
+	xorl	36(%rsp),%ebp
+	movl	%r11d,%eax
+	movl	%edx,32(%rsp)
+	movl	%edi,%ecx
+	xorl	44(%rsp),%ebp
+	xorl	%r13d,%eax
+	roll	$5,%ecx
+	xorl	4(%rsp),%ebp
+	leal	1859775393(%rdx,%rsi,1),%esi
+	xorl	%r12d,%eax
+	addl	%ecx,%esi
+	roll	$30,%r11d
+	addl	%eax,%esi
+	roll	$1,%ebp
+	xorl	40(%rsp),%r14d
+	movl	%edi,%eax
+	movl	%ebp,36(%rsp)
+	movl	%esi,%ecx
+	xorl	48(%rsp),%r14d
+	xorl	%r12d,%eax
+	roll	$5,%ecx
+	xorl	8(%rsp),%r14d
+	leal	1859775393(%rbp,%r13,1),%r13d
+	xorl	%r11d,%eax
+	addl	%ecx,%r13d
+	roll	$30,%edi
+	addl	%eax,%r13d
+	roll	$1,%r14d
+	xorl	44(%rsp),%edx
+	movl	%esi,%eax
+	movl	%r14d,40(%rsp)
+	movl	%r13d,%ecx
+	xorl	52(%rsp),%edx
+	xorl	%r11d,%eax
+	roll	$5,%ecx
+	xorl	12(%rsp),%edx
+	leal	1859775393(%r14,%r12,1),%r12d
+	xorl	%edi,%eax
+	addl	%ecx,%r12d
+	roll	$30,%esi
+	addl	%eax,%r12d
+	roll	$1,%edx
+	xorl	48(%rsp),%ebp
+	movl	%r13d,%eax
+	movl	%edx,44(%rsp)
+	movl	%r12d,%ecx
+	xorl	56(%rsp),%ebp
+	xorl	%edi,%eax
+	roll	$5,%ecx
+	xorl	16(%rsp),%ebp
+	leal	1859775393(%rdx,%r11,1),%r11d
+	xorl	%esi,%eax
+	addl	%ecx,%r11d
+	roll	$30,%r13d
+	addl	%eax,%r11d
+	roll	$1,%ebp
+	xorl	52(%rsp),%r14d
+	movl	%r12d,%eax
+	movl	%ebp,48(%rsp)
+	movl	%r11d,%ecx
+	xorl	60(%rsp),%r14d
+	xorl	%esi,%eax
+	roll	$5,%ecx
+	xorl	20(%rsp),%r14d
+	leal	1859775393(%rbp,%rdi,1),%edi
+	xorl	%r13d,%eax
+	addl	%ecx,%edi
+	roll	$30,%r12d
+	addl	%eax,%edi
+	roll	$1,%r14d
+	xorl	56(%rsp),%edx
+	movl	%r11d,%eax
+	movl	%r14d,52(%rsp)
+	movl	%edi,%ecx
+	xorl	0(%rsp),%edx
+	xorl	%r13d,%eax
+	roll	$5,%ecx
+	xorl	24(%rsp),%edx
+	leal	1859775393(%r14,%rsi,1),%esi
+	xorl	%r12d,%eax
+	addl	%ecx,%esi
+	roll	$30,%r11d
+	addl	%eax,%esi
+	roll	$1,%edx
+	xorl	60(%rsp),%ebp
+	movl	%edi,%eax
+	movl	%edx,56(%rsp)
+	movl	%esi,%ecx
+	xorl	4(%rsp),%ebp
+	xorl	%r12d,%eax
+	roll	$5,%ecx
+	xorl	28(%rsp),%ebp
+	leal	1859775393(%rdx,%r13,1),%r13d
+	xorl	%r11d,%eax
+	addl	%ecx,%r13d
+	roll	$30,%edi
+	addl	%eax,%r13d
+	roll	$1,%ebp
+	xorl	0(%rsp),%r14d
+	movl	%esi,%eax
+	movl	%ebp,60(%rsp)
+	movl	%r13d,%ecx
+	xorl	8(%rsp),%r14d
+	xorl	%r11d,%eax
+	roll	$5,%ecx
+	xorl	32(%rsp),%r14d
+	leal	1859775393(%rbp,%r12,1),%r12d
+	xorl	%edi,%eax
+	addl	%ecx,%r12d
+	roll	$30,%esi
+	addl	%eax,%r12d
+	roll	$1,%r14d
+	xorl	4(%rsp),%edx
+	movl	%r13d,%eax
+	movl	%r14d,0(%rsp)
+	movl	%r12d,%ecx
+	xorl	12(%rsp),%edx
+	xorl	%edi,%eax
+	roll	$5,%ecx
+	xorl	36(%rsp),%edx
+	leal	1859775393(%r14,%r11,1),%r11d
+	xorl	%esi,%eax
+	addl	%ecx,%r11d
+	roll	$30,%r13d
+	addl	%eax,%r11d
+	roll	$1,%edx
+	xorl	8(%rsp),%ebp
+	movl	%r12d,%eax
+	movl	%edx,4(%rsp)
+	movl	%r11d,%ecx
+	xorl	16(%rsp),%ebp
+	xorl	%esi,%eax
+	roll	$5,%ecx
+	xorl	40(%rsp),%ebp
+	leal	1859775393(%rdx,%rdi,1),%edi
+	xorl	%r13d,%eax
+	addl	%ecx,%edi
+	roll	$30,%r12d
+	addl	%eax,%edi
+	roll	$1,%ebp
+	xorl	12(%rsp),%r14d
+	movl	%r11d,%eax
+	movl	%ebp,8(%rsp)
+	movl	%edi,%ecx
+	xorl	20(%rsp),%r14d
+	xorl	%r13d,%eax
+	roll	$5,%ecx
+	xorl	44(%rsp),%r14d
+	leal	1859775393(%rbp,%rsi,1),%esi
+	xorl	%r12d,%eax
+	addl	%ecx,%esi
+	roll	$30,%r11d
+	addl	%eax,%esi
+	roll	$1,%r14d
+	xorl	16(%rsp),%edx
+	movl	%edi,%eax
+	movl	%r14d,12(%rsp)
+	movl	%esi,%ecx
+	xorl	24(%rsp),%edx
+	xorl	%r12d,%eax
+	roll	$5,%ecx
+	xorl	48(%rsp),%edx
+	leal	1859775393(%r14,%r13,1),%r13d
+	xorl	%r11d,%eax
+	addl	%ecx,%r13d
+	roll	$30,%edi
+	addl	%eax,%r13d
+	roll	$1,%edx
+	xorl	20(%rsp),%ebp
+	movl	%esi,%eax
+	movl	%edx,16(%rsp)
+	movl	%r13d,%ecx
+	xorl	28(%rsp),%ebp
+	xorl	%r11d,%eax
+	roll	$5,%ecx
+	xorl	52(%rsp),%ebp
+	leal	1859775393(%rdx,%r12,1),%r12d
+	xorl	%edi,%eax
+	addl	%ecx,%r12d
+	roll	$30,%esi
+	addl	%eax,%r12d
+	roll	$1,%ebp
+	xorl	24(%rsp),%r14d
+	movl	%r13d,%eax
+	movl	%ebp,20(%rsp)
+	movl	%r12d,%ecx
+	xorl	32(%rsp),%r14d
+	xorl	%edi,%eax
+	roll	$5,%ecx
+	xorl	56(%rsp),%r14d
+	leal	1859775393(%rbp,%r11,1),%r11d
+	xorl	%esi,%eax
+	addl	%ecx,%r11d
+	roll	$30,%r13d
+	addl	%eax,%r11d
+	roll	$1,%r14d
+	xorl	28(%rsp),%edx
+	movl	%r12d,%eax
+	movl	%r14d,24(%rsp)
+	movl	%r11d,%ecx
+	xorl	36(%rsp),%edx
+	xorl	%esi,%eax
+	roll	$5,%ecx
+	xorl	60(%rsp),%edx
+	leal	1859775393(%r14,%rdi,1),%edi
+	xorl	%r13d,%eax
+	addl	%ecx,%edi
+	roll	$30,%r12d
+	addl	%eax,%edi
+	roll	$1,%edx
+	xorl	32(%rsp),%ebp
+	movl	%r11d,%eax
+	movl	%edx,28(%rsp)
+	movl	%edi,%ecx
+	xorl	40(%rsp),%ebp
+	xorl	%r13d,%eax
+	roll	$5,%ecx
+	xorl	0(%rsp),%ebp
+	leal	1859775393(%rdx,%rsi,1),%esi
+	xorl	%r12d,%eax
+	addl	%ecx,%esi
+	roll	$30,%r11d
+	addl	%eax,%esi
+	roll	$1,%ebp
+	xorl	36(%rsp),%r14d
+	movl	%r12d,%eax
+	movl	%ebp,32(%rsp)
+	movl	%r12d,%ebx
+	xorl	44(%rsp),%r14d
+	andl	%r11d,%eax
+	movl	%esi,%ecx
+	xorl	4(%rsp),%r14d
+	leal	-1894007588(%rbp,%r13,1),%r13d
+	xorl	%r11d,%ebx
+	roll	$5,%ecx
+	addl	%eax,%r13d
+	roll	$1,%r14d
+	andl	%edi,%ebx
+	addl	%ecx,%r13d
+	roll	$30,%edi
+	addl	%ebx,%r13d
+	xorl	40(%rsp),%edx
+	movl	%r11d,%eax
+	movl	%r14d,36(%rsp)
+	movl	%r11d,%ebx
+	xorl	48(%rsp),%edx
+	andl	%edi,%eax
+	movl	%r13d,%ecx
+	xorl	8(%rsp),%edx
+	leal	-1894007588(%r14,%r12,1),%r12d
+	xorl	%edi,%ebx
+	roll	$5,%ecx
+	addl	%eax,%r12d
+	roll	$1,%edx
+	andl	%esi,%ebx
+	addl	%ecx,%r12d
+	roll	$30,%esi
+	addl	%ebx,%r12d
+	xorl	44(%rsp),%ebp
+	movl	%edi,%eax
+	movl	%edx,40(%rsp)
+	movl	%edi,%ebx
+	xorl	52(%rsp),%ebp
+	andl	%esi,%eax
+	movl	%r12d,%ecx
+	xorl	12(%rsp),%ebp
+	leal	-1894007588(%rdx,%r11,1),%r11d
+	xorl	%esi,%ebx
+	roll	$5,%ecx
+	addl	%eax,%r11d
+	roll	$1,%ebp
+	andl	%r13d,%ebx
+	addl	%ecx,%r11d
+	roll	$30,%r13d
+	addl	%ebx,%r11d
+	xorl	48(%rsp),%r14d
+	movl	%esi,%eax
+	movl	%ebp,44(%rsp)
+	movl	%esi,%ebx
+	xorl	56(%rsp),%r14d
+	andl	%r13d,%eax
+	movl	%r11d,%ecx
+	xorl	16(%rsp),%r14d
+	leal	-1894007588(%rbp,%rdi,1),%edi
+	xorl	%r13d,%ebx
+	roll	$5,%ecx
+	addl	%eax,%edi
+	roll	$1,%r14d
+	andl	%r12d,%ebx
+	addl	%ecx,%edi
+	roll	$30,%r12d
+	addl	%ebx,%edi
+	xorl	52(%rsp),%edx
+	movl	%r13d,%eax
+	movl	%r14d,48(%rsp)
+	movl	%r13d,%ebx
+	xorl	60(%rsp),%edx
+	andl	%r12d,%eax
+	movl	%edi,%ecx
+	xorl	20(%rsp),%edx
+	leal	-1894007588(%r14,%rsi,1),%esi
+	xorl	%r12d,%ebx
+	roll	$5,%ecx
+	addl	%eax,%esi
+	roll	$1,%edx
+	andl	%r11d,%ebx
+	addl	%ecx,%esi
+	roll	$30,%r11d
+	addl	%ebx,%esi
+	xorl	56(%rsp),%ebp
+	movl	%r12d,%eax
+	movl	%edx,52(%rsp)
+	movl	%r12d,%ebx
+	xorl	0(%rsp),%ebp
+	andl	%r11d,%eax
+	movl	%esi,%ecx
+	xorl	24(%rsp),%ebp
+	leal	-1894007588(%rdx,%r13,1),%r13d
+	xorl	%r11d,%ebx
+	roll	$5,%ecx
+	addl	%eax,%r13d
+	roll	$1,%ebp
+	andl	%edi,%ebx
+	addl	%ecx,%r13d
+	roll	$30,%edi
+	addl	%ebx,%r13d
+	xorl	60(%rsp),%r14d
+	movl	%r11d,%eax
+	movl	%ebp,56(%rsp)
+	movl	%r11d,%ebx
+	xorl	4(%rsp),%r14d
+	andl	%edi,%eax
+	movl	%r13d,%ecx
+	xorl	28(%rsp),%r14d
+	leal	-1894007588(%rbp,%r12,1),%r12d
+	xorl	%edi,%ebx
+	roll	$5,%ecx
+	addl	%eax,%r12d
+	roll	$1,%r14d
+	andl	%esi,%ebx
+	addl	%ecx,%r12d
+	roll	$30,%esi
+	addl	%ebx,%r12d
+	xorl	0(%rsp),%edx
+	movl	%edi,%eax
+	movl	%r14d,60(%rsp)
+	movl	%edi,%ebx
+	xorl	8(%rsp),%edx
+	andl	%esi,%eax
+	movl	%r12d,%ecx
+	xorl	32(%rsp),%edx
+	leal	-1894007588(%r14,%r11,1),%r11d
+	xorl	%esi,%ebx
+	roll	$5,%ecx
+	addl	%eax,%r11d
+	roll	$1,%edx
+	andl	%r13d,%ebx
+	addl	%ecx,%r11d
+	roll	$30,%r13d
+	addl	%ebx,%r11d
+	xorl	4(%rsp),%ebp
+	movl	%esi,%eax
+	movl	%edx,0(%rsp)
+	movl	%esi,%ebx
+	xorl	12(%rsp),%ebp
+	andl	%r13d,%eax
+	movl	%r11d,%ecx
+	xorl	36(%rsp),%ebp
+	leal	-1894007588(%rdx,%rdi,1),%edi
+	xorl	%r13d,%ebx
+	roll	$5,%ecx
+	addl	%eax,%edi
+	roll	$1,%ebp
+	andl	%r12d,%ebx
+	addl	%ecx,%edi
+	roll	$30,%r12d
+	addl	%ebx,%edi
+	xorl	8(%rsp),%r14d
+	movl	%r13d,%eax
+	movl	%ebp,4(%rsp)
+	movl	%r13d,%ebx
+	xorl	16(%rsp),%r14d
+	andl	%r12d,%eax
+	movl	%edi,%ecx
+	xorl	40(%rsp),%r14d
+	leal	-1894007588(%rbp,%rsi,1),%esi
+	xorl	%r12d,%ebx
+	roll	$5,%ecx
+	addl	%eax,%esi
+	roll	$1,%r14d
+	andl	%r11d,%ebx
+	addl	%ecx,%esi
+	roll	$30,%r11d
+	addl	%ebx,%esi
+	xorl	12(%rsp),%edx
+	movl	%r12d,%eax
+	movl	%r14d,8(%rsp)
+	movl	%r12d,%ebx
+	xorl	20(%rsp),%edx
+	andl	%r11d,%eax
+	movl	%esi,%ecx
+	xorl	44(%rsp),%edx
+	leal	-1894007588(%r14,%r13,1),%r13d
+	xorl	%r11d,%ebx
+	roll	$5,%ecx
+	addl	%eax,%r13d
+	roll	$1,%edx
+	andl	%edi,%ebx
+	addl	%ecx,%r13d
+	roll	$30,%edi
+	addl	%ebx,%r13d
+	xorl	16(%rsp),%ebp
+	movl	%r11d,%eax
+	movl	%edx,12(%rsp)
+	movl	%r11d,%ebx
+	xorl	24(%rsp),%ebp
+	andl	%edi,%eax
+	movl	%r13d,%ecx
+	xorl	48(%rsp),%ebp
+	leal	-1894007588(%rdx,%r12,1),%r12d
+	xorl	%edi,%ebx
+	roll	$5,%ecx
+	addl	%eax,%r12d
+	roll	$1,%ebp
+	andl	%esi,%ebx
+	addl	%ecx,%r12d
+	roll	$30,%esi
+	addl	%ebx,%r12d
+	xorl	20(%rsp),%r14d
+	movl	%edi,%eax
+	movl	%ebp,16(%rsp)
+	movl	%edi,%ebx
+	xorl	28(%rsp),%r14d
+	andl	%esi,%eax
+	movl	%r12d,%ecx
+	xorl	52(%rsp),%r14d
+	leal	-1894007588(%rbp,%r11,1),%r11d
+	xorl	%esi,%ebx
+	roll	$5,%ecx
+	addl	%eax,%r11d
+	roll	$1,%r14d
+	andl	%r13d,%ebx
+	addl	%ecx,%r11d
+	roll	$30,%r13d
+	addl	%ebx,%r11d
+	xorl	24(%rsp),%edx
+	movl	%esi,%eax
+	movl	%r14d,20(%rsp)
+	movl	%esi,%ebx
+	xorl	32(%rsp),%edx
+	andl	%r13d,%eax
+	movl	%r11d,%ecx
+	xorl	56(%rsp),%edx
+	leal	-1894007588(%r14,%rdi,1),%edi
+	xorl	%r13d,%ebx
+	roll	$5,%ecx
+	addl	%eax,%edi
+	roll	$1,%edx
+	andl	%r12d,%ebx
+	addl	%ecx,%edi
+	roll	$30,%r12d
+	addl	%ebx,%edi
+	xorl	28(%rsp),%ebp
+	movl	%r13d,%eax
+	movl	%edx,24(%rsp)
+	movl	%r13d,%ebx
+	xorl	36(%rsp),%ebp
+	andl	%r12d,%eax
+	movl	%edi,%ecx
+	xorl	60(%rsp),%ebp
+	leal	-1894007588(%rdx,%rsi,1),%esi
+	xorl	%r12d,%ebx
+	roll	$5,%ecx
+	addl	%eax,%esi
+	roll	$1,%ebp
+	andl	%r11d,%ebx
+	addl	%ecx,%esi
+	roll	$30,%r11d
+	addl	%ebx,%esi
+	xorl	32(%rsp),%r14d
+	movl	%r12d,%eax
+	movl	%ebp,28(%rsp)
+	movl	%r12d,%ebx
+	xorl	40(%rsp),%r14d
+	andl	%r11d,%eax
+	movl	%esi,%ecx
+	xorl	0(%rsp),%r14d
+	leal	-1894007588(%rbp,%r13,1),%r13d
+	xorl	%r11d,%ebx
+	roll	$5,%ecx
+	addl	%eax,%r13d
+	roll	$1,%r14d
+	andl	%edi,%ebx
+	addl	%ecx,%r13d
+	roll	$30,%edi
+	addl	%ebx,%r13d
+	xorl	36(%rsp),%edx
+	movl	%r11d,%eax
+	movl	%r14d,32(%rsp)
+	movl	%r11d,%ebx
+	xorl	44(%rsp),%edx
+	andl	%edi,%eax
+	movl	%r13d,%ecx
+	xorl	4(%rsp),%edx
+	leal	-1894007588(%r14,%r12,1),%r12d
+	xorl	%edi,%ebx
+	roll	$5,%ecx
+	addl	%eax,%r12d
+	roll	$1,%edx
+	andl	%esi,%ebx
+	addl	%ecx,%r12d
+	roll	$30,%esi
+	addl	%ebx,%r12d
+	xorl	40(%rsp),%ebp
+	movl	%edi,%eax
+	movl	%edx,36(%rsp)
+	movl	%edi,%ebx
+	xorl	48(%rsp),%ebp
+	andl	%esi,%eax
+	movl	%r12d,%ecx
+	xorl	8(%rsp),%ebp
+	leal	-1894007588(%rdx,%r11,1),%r11d
+	xorl	%esi,%ebx
+	roll	$5,%ecx
+	addl	%eax,%r11d
+	roll	$1,%ebp
+	andl	%r13d,%ebx
+	addl	%ecx,%r11d
+	roll	$30,%r13d
+	addl	%ebx,%r11d
+	xorl	44(%rsp),%r14d
+	movl	%esi,%eax
+	movl	%ebp,40(%rsp)
+	movl	%esi,%ebx
+	xorl	52(%rsp),%r14d
+	andl	%r13d,%eax
+	movl	%r11d,%ecx
+	xorl	12(%rsp),%r14d
+	leal	-1894007588(%rbp,%rdi,1),%edi
+	xorl	%r13d,%ebx
+	roll	$5,%ecx
+	addl	%eax,%edi
+	roll	$1,%r14d
+	andl	%r12d,%ebx
+	addl	%ecx,%edi
+	roll	$30,%r12d
+	addl	%ebx,%edi
+	xorl	48(%rsp),%edx
+	movl	%r13d,%eax
+	movl	%r14d,44(%rsp)
+	movl	%r13d,%ebx
+	xorl	56(%rsp),%edx
+	andl	%r12d,%eax
+	movl	%edi,%ecx
+	xorl	16(%rsp),%edx
+	leal	-1894007588(%r14,%rsi,1),%esi
+	xorl	%r12d,%ebx
+	roll	$5,%ecx
+	addl	%eax,%esi
+	roll	$1,%edx
+	andl	%r11d,%ebx
+	addl	%ecx,%esi
+	roll	$30,%r11d
+	addl	%ebx,%esi
+	xorl	52(%rsp),%ebp
+	movl	%edi,%eax
+	movl	%edx,48(%rsp)
+	movl	%esi,%ecx
+	xorl	60(%rsp),%ebp
+	xorl	%r12d,%eax
+	roll	$5,%ecx
+	xorl	20(%rsp),%ebp
+	leal	-899497514(%rdx,%r13,1),%r13d
+	xorl	%r11d,%eax
+	addl	%ecx,%r13d
+	roll	$30,%edi
+	addl	%eax,%r13d
+	roll	$1,%ebp
+	xorl	56(%rsp),%r14d
+	movl	%esi,%eax
+	movl	%ebp,52(%rsp)
+	movl	%r13d,%ecx
+	xorl	0(%rsp),%r14d
+	xorl	%r11d,%eax
+	roll	$5,%ecx
+	xorl	24(%rsp),%r14d
+	leal	-899497514(%rbp,%r12,1),%r12d
+	xorl	%edi,%eax
+	addl	%ecx,%r12d
+	roll	$30,%esi
+	addl	%eax,%r12d
+	roll	$1,%r14d
+	xorl	60(%rsp),%edx
+	movl	%r13d,%eax
+	movl	%r14d,56(%rsp)
+	movl	%r12d,%ecx
+	xorl	4(%rsp),%edx
+	xorl	%edi,%eax
+	roll	$5,%ecx
+	xorl	28(%rsp),%edx
+	leal	-899497514(%r14,%r11,1),%r11d
+	xorl	%esi,%eax
+	addl	%ecx,%r11d
+	roll	$30,%r13d
+	addl	%eax,%r11d
+	roll	$1,%edx
+	xorl	0(%rsp),%ebp
+	movl	%r12d,%eax
+	movl	%edx,60(%rsp)
+	movl	%r11d,%ecx
+	xorl	8(%rsp),%ebp
+	xorl	%esi,%eax
+	roll	$5,%ecx
+	xorl	32(%rsp),%ebp
+	leal	-899497514(%rdx,%rdi,1),%edi
+	xorl	%r13d,%eax
+	addl	%ecx,%edi
+	roll	$30,%r12d
+	addl	%eax,%edi
+	roll	$1,%ebp
+	xorl	4(%rsp),%r14d
+	movl	%r11d,%eax
+	movl	%ebp,0(%rsp)
+	movl	%edi,%ecx
+	xorl	12(%rsp),%r14d
+	xorl	%r13d,%eax
+	roll	$5,%ecx
+	xorl	36(%rsp),%r14d
+	leal	-899497514(%rbp,%rsi,1),%esi
+	xorl	%r12d,%eax
+	addl	%ecx,%esi
+	roll	$30,%r11d
+	addl	%eax,%esi
+	roll	$1,%r14d
+	xorl	8(%rsp),%edx
+	movl	%edi,%eax
+	movl	%r14d,4(%rsp)
+	movl	%esi,%ecx
+	xorl	16(%rsp),%edx
+	xorl	%r12d,%eax
+	roll	$5,%ecx
+	xorl	40(%rsp),%edx
+	leal	-899497514(%r14,%r13,1),%r13d
+	xorl	%r11d,%eax
+	addl	%ecx,%r13d
+	roll	$30,%edi
+	addl	%eax,%r13d
+	roll	$1,%edx
+	xorl	12(%rsp),%ebp
+	movl	%esi,%eax
+	movl	%edx,8(%rsp)
+	movl	%r13d,%ecx
+	xorl	20(%rsp),%ebp
+	xorl	%r11d,%eax
+	roll	$5,%ecx
+	xorl	44(%rsp),%ebp
+	leal	-899497514(%rdx,%r12,1),%r12d
+	xorl	%edi,%eax
+	addl	%ecx,%r12d
+	roll	$30,%esi
+	addl	%eax,%r12d
+	roll	$1,%ebp
+	xorl	16(%rsp),%r14d
+	movl	%r13d,%eax
+	movl	%ebp,12(%rsp)
+	movl	%r12d,%ecx
+	xorl	24(%rsp),%r14d
+	xorl	%edi,%eax
+	roll	$5,%ecx
+	xorl	48(%rsp),%r14d
+	leal	-899497514(%rbp,%r11,1),%r11d
+	xorl	%esi,%eax
+	addl	%ecx,%r11d
+	roll	$30,%r13d
+	addl	%eax,%r11d
+	roll	$1,%r14d
+	xorl	20(%rsp),%edx
+	movl	%r12d,%eax
+	movl	%r14d,16(%rsp)
+	movl	%r11d,%ecx
+	xorl	28(%rsp),%edx
+	xorl	%esi,%eax
+	roll	$5,%ecx
+	xorl	52(%rsp),%edx
+	leal	-899497514(%r14,%rdi,1),%edi
+	xorl	%r13d,%eax
+	addl	%ecx,%edi
+	roll	$30,%r12d
+	addl	%eax,%edi
+	roll	$1,%edx
+	xorl	24(%rsp),%ebp
+	movl	%r11d,%eax
+	movl	%edx,20(%rsp)
+	movl	%edi,%ecx
+	xorl	32(%rsp),%ebp
+	xorl	%r13d,%eax
+	roll	$5,%ecx
+	xorl	56(%rsp),%ebp
+	leal	-899497514(%rdx,%rsi,1),%esi
+	xorl	%r12d,%eax
+	addl	%ecx,%esi
+	roll	$30,%r11d
+	addl	%eax,%esi
+	roll	$1,%ebp
+	xorl	28(%rsp),%r14d
+	movl	%edi,%eax
+	movl	%ebp,24(%rsp)
+	movl	%esi,%ecx
+	xorl	36(%rsp),%r14d
+	xorl	%r12d,%eax
+	roll	$5,%ecx
+	xorl	60(%rsp),%r14d
+	leal	-899497514(%rbp,%r13,1),%r13d
+	xorl	%r11d,%eax
+	addl	%ecx,%r13d
+	roll	$30,%edi
+	addl	%eax,%r13d
+	roll	$1,%r14d
+	xorl	32(%rsp),%edx
+	movl	%esi,%eax
+	movl	%r14d,28(%rsp)
+	movl	%r13d,%ecx
+	xorl	40(%rsp),%edx
+	xorl	%r11d,%eax
+	roll	$5,%ecx
+	xorl	0(%rsp),%edx
+	leal	-899497514(%r14,%r12,1),%r12d
+	xorl	%edi,%eax
+	addl	%ecx,%r12d
+	roll	$30,%esi
+	addl	%eax,%r12d
+	roll	$1,%edx
+	xorl	36(%rsp),%ebp
+	movl	%r13d,%eax
+
+	movl	%r12d,%ecx
+	xorl	44(%rsp),%ebp
+	xorl	%edi,%eax
+	roll	$5,%ecx
+	xorl	4(%rsp),%ebp
+	leal	-899497514(%rdx,%r11,1),%r11d
+	xorl	%esi,%eax
+	addl	%ecx,%r11d
+	roll	$30,%r13d
+	addl	%eax,%r11d
+	roll	$1,%ebp
+	xorl	40(%rsp),%r14d
+	movl	%r12d,%eax
+
+	movl	%r11d,%ecx
+	xorl	48(%rsp),%r14d
+	xorl	%esi,%eax
+	roll	$5,%ecx
+	xorl	8(%rsp),%r14d
+	leal	-899497514(%rbp,%rdi,1),%edi
+	xorl	%r13d,%eax
+	addl	%ecx,%edi
+	roll	$30,%r12d
+	addl	%eax,%edi
+	roll	$1,%r14d
+	xorl	44(%rsp),%edx
+	movl	%r11d,%eax
+
+	movl	%edi,%ecx
+	xorl	52(%rsp),%edx
+	xorl	%r13d,%eax
+	roll	$5,%ecx
+	xorl	12(%rsp),%edx
+	leal	-899497514(%r14,%rsi,1),%esi
+	xorl	%r12d,%eax
+	addl	%ecx,%esi
+	roll	$30,%r11d
+	addl	%eax,%esi
+	roll	$1,%edx
+	xorl	48(%rsp),%ebp
+	movl	%edi,%eax
+
+	movl	%esi,%ecx
+	xorl	56(%rsp),%ebp
+	xorl	%r12d,%eax
+	roll	$5,%ecx
+	xorl	16(%rsp),%ebp
+	leal	-899497514(%rdx,%r13,1),%r13d
+	xorl	%r11d,%eax
+	addl	%ecx,%r13d
+	roll	$30,%edi
+	addl	%eax,%r13d
+	roll	$1,%ebp
+	xorl	52(%rsp),%r14d
+	movl	%esi,%eax
+
+	movl	%r13d,%ecx
+	xorl	60(%rsp),%r14d
+	xorl	%r11d,%eax
+	roll	$5,%ecx
+	xorl	20(%rsp),%r14d
+	leal	-899497514(%rbp,%r12,1),%r12d
+	xorl	%edi,%eax
+	addl	%ecx,%r12d
+	roll	$30,%esi
+	addl	%eax,%r12d
+	roll	$1,%r14d
+	xorl	56(%rsp),%edx
+	movl	%r13d,%eax
+
+	movl	%r12d,%ecx
+	xorl	0(%rsp),%edx
+	xorl	%edi,%eax
+	roll	$5,%ecx
+	xorl	24(%rsp),%edx
+	leal	-899497514(%r14,%r11,1),%r11d
+	xorl	%esi,%eax
+	addl	%ecx,%r11d
+	roll	$30,%r13d
+	addl	%eax,%r11d
+	roll	$1,%edx
+	xorl	60(%rsp),%ebp
+	movl	%r12d,%eax
+
+	movl	%r11d,%ecx
+	xorl	4(%rsp),%ebp
+	xorl	%esi,%eax
+	roll	$5,%ecx
+	xorl	28(%rsp),%ebp
+	leal	-899497514(%rdx,%rdi,1),%edi
+	xorl	%r13d,%eax
+	addl	%ecx,%edi
+	roll	$30,%r12d
+	addl	%eax,%edi
+	roll	$1,%ebp
+	movl	%r11d,%eax
+	movl	%edi,%ecx
+	xorl	%r13d,%eax
+	leal	-899497514(%rbp,%rsi,1),%esi
+	roll	$5,%ecx
+	xorl	%r12d,%eax
+	addl	%ecx,%esi
+	roll	$30,%r11d
+	addl	%eax,%esi
+	addl	0(%r8),%esi
+	addl	4(%r8),%edi
+	addl	8(%r8),%r11d
+	addl	12(%r8),%r12d
+	addl	16(%r8),%r13d
+	movl	%esi,0(%r8)
+	movl	%edi,4(%r8)
+	movl	%r11d,8(%r8)
+	movl	%r12d,12(%r8)
+	movl	%r13d,16(%r8)
+
+	subq	$1,%r10
+	leaq	64(%r9),%r9
+	jnz	.Lloop
+
+	movq	64(%rsp),%rsi
+.cfi_def_cfa	%rsi,8
+	movq	-40(%rsi),%r14
+.cfi_restore	%r14
+	movq	-32(%rsi),%r13
+.cfi_restore	%r13
+	movq	-24(%rsi),%r12
+.cfi_restore	%r12
+	movq	-16(%rsi),%rbp
+.cfi_restore	%rbp
+	movq	-8(%rsi),%rbx
+.cfi_restore	%rbx
+	leaq	(%rsi),%rsp
+.cfi_def_cfa_register	%rsp
+.Lepilogue:
+	.byte	0xf3,0xc3
+.cfi_endproc	
+.size	sha1_block_data_order,.-sha1_block_data_order
+.type	sha1_block_data_order_shaext,@function
+.align	32
+sha1_block_data_order_shaext:
+_shaext_shortcut:
+.cfi_startproc	
+	movdqu	(%rdi),%xmm0
+	movd	16(%rdi),%xmm1
+	movdqa	K_XX_XX+160(%rip),%xmm3
+
+	movdqu	(%rsi),%xmm4
+	pshufd	$27,%xmm0,%xmm0
+	movdqu	16(%rsi),%xmm5
+	pshufd	$27,%xmm1,%xmm1
+	movdqu	32(%rsi),%xmm6
+.byte	102,15,56,0,227
+	movdqu	48(%rsi),%xmm7
+.byte	102,15,56,0,235
+.byte	102,15,56,0,243
+	movdqa	%xmm1,%xmm9
+.byte	102,15,56,0,251
+	jmp	.Loop_shaext
+
+.align	16
+.Loop_shaext:
+	decq	%rdx
+	leaq	64(%rsi),%r8
+	paddd	%xmm4,%xmm1
+	cmovneq	%r8,%rsi
+	movdqa	%xmm0,%xmm8
+.byte	15,56,201,229
+	movdqa	%xmm0,%xmm2
+.byte	15,58,204,193,0
+.byte	15,56,200,213
+	pxor	%xmm6,%xmm4
+.byte	15,56,201,238
+.byte	15,56,202,231
+
+	movdqa	%xmm0,%xmm1
+.byte	15,58,204,194,0
+.byte	15,56,200,206
+	pxor	%xmm7,%xmm5
+.byte	15,56,202,236
+.byte	15,56,201,247
+	movdqa	%xmm0,%xmm2
+.byte	15,58,204,193,0
+.byte	15,56,200,215
+	pxor	%xmm4,%xmm6
+.byte	15,56,201,252
+.byte	15,56,202,245
+
+	movdqa	%xmm0,%xmm1
+.byte	15,58,204,194,0
+.byte	15,56,200,204
+	pxor	%xmm5,%xmm7
+.byte	15,56,202,254
+.byte	15,56,201,229
+	movdqa	%xmm0,%xmm2
+.byte	15,58,204,193,0
+.byte	15,56,200,213
+	pxor	%xmm6,%xmm4
+.byte	15,56,201,238
+.byte	15,56,202,231
+
+	movdqa	%xmm0,%xmm1
+.byte	15,58,204,194,1
+.byte	15,56,200,206
+	pxor	%xmm7,%xmm5
+.byte	15,56,202,236
+.byte	15,56,201,247
+	movdqa	%xmm0,%xmm2
+.byte	15,58,204,193,1
+.byte	15,56,200,215
+	pxor	%xmm4,%xmm6
+.byte	15,56,201,252
+.byte	15,56,202,245
+
+	movdqa	%xmm0,%xmm1
+.byte	15,58,204,194,1
+.byte	15,56,200,204
+	pxor	%xmm5,%xmm7
+.byte	15,56,202,254
+.byte	15,56,201,229
+	movdqa	%xmm0,%xmm2
+.byte	15,58,204,193,1
+.byte	15,56,200,213
+	pxor	%xmm6,%xmm4
+.byte	15,56,201,238
+.byte	15,56,202,231
+
+	movdqa	%xmm0,%xmm1
+.byte	15,58,204,194,1
+.byte	15,56,200,206
+	pxor	%xmm7,%xmm5
+.byte	15,56,202,236
+.byte	15,56,201,247
+	movdqa	%xmm0,%xmm2
+.byte	15,58,204,193,2
+.byte	15,56,200,215
+	pxor	%xmm4,%xmm6
+.byte	15,56,201,252
+.byte	15,56,202,245
+
+	movdqa	%xmm0,%xmm1
+.byte	15,58,204,194,2
+.byte	15,56,200,204
+	pxor	%xmm5,%xmm7
+.byte	15,56,202,254
+.byte	15,56,201,229
+	movdqa	%xmm0,%xmm2
+.byte	15,58,204,193,2
+.byte	15,56,200,213
+	pxor	%xmm6,%xmm4
+.byte	15,56,201,238
+.byte	15,56,202,231
+
+	movdqa	%xmm0,%xmm1
+.byte	15,58,204,194,2
+.byte	15,56,200,206
+	pxor	%xmm7,%xmm5
+.byte	15,56,202,236
+.byte	15,56,201,247
+	movdqa	%xmm0,%xmm2
+.byte	15,58,204,193,2
+.byte	15,56,200,215
+	pxor	%xmm4,%xmm6
+.byte	15,56,201,252
+.byte	15,56,202,245
+
+	movdqa	%xmm0,%xmm1
+.byte	15,58,204,194,3
+.byte	15,56,200,204
+	pxor	%xmm5,%xmm7
+.byte	15,56,202,254
+	movdqu	(%rsi),%xmm4
+	movdqa	%xmm0,%xmm2
+.byte	15,58,204,193,3
+.byte	15,56,200,213
+	movdqu	16(%rsi),%xmm5
+.byte	102,15,56,0,227
+
+	movdqa	%xmm0,%xmm1
+.byte	15,58,204,194,3
+.byte	15,56,200,206
+	movdqu	32(%rsi),%xmm6
+.byte	102,15,56,0,235
+
+	movdqa	%xmm0,%xmm2
+.byte	15,58,204,193,3
+.byte	15,56,200,215
+	movdqu	48(%rsi),%xmm7
+.byte	102,15,56,0,243
+
+	movdqa	%xmm0,%xmm1
+.byte	15,58,204,194,3
+.byte	65,15,56,200,201
+.byte	102,15,56,0,251
+
+	paddd	%xmm8,%xmm0
+	movdqa	%xmm1,%xmm9
+
+	jnz	.Loop_shaext
+
+	pshufd	$27,%xmm0,%xmm0
+	pshufd	$27,%xmm1,%xmm1
+	movdqu	%xmm0,(%rdi)
+	movd	%xmm1,16(%rdi)
+	.byte	0xf3,0xc3
+.cfi_endproc	
+.size	sha1_block_data_order_shaext,.-sha1_block_data_order_shaext
+.type	sha1_block_data_order_ssse3,@function
+.align	16
+sha1_block_data_order_ssse3:
+_ssse3_shortcut:
+.cfi_startproc	
+	movq	%rsp,%r11
+.cfi_def_cfa_register	%r11
+	pushq	%rbx
+.cfi_offset	%rbx,-16
+	pushq	%rbp
+.cfi_offset	%rbp,-24
+	pushq	%r12
+.cfi_offset	%r12,-32
+	pushq	%r13
+.cfi_offset	%r13,-40
+	pushq	%r14
+.cfi_offset	%r14,-48
+	leaq	-64(%rsp),%rsp
+	andq	$-64,%rsp
+	movq	%rdi,%r8
+	movq	%rsi,%r9
+	movq	%rdx,%r10
+
+	shlq	$6,%r10
+	addq	%r9,%r10
+	leaq	K_XX_XX+64(%rip),%r14
+
+	movl	0(%r8),%eax
+	movl	4(%r8),%ebx
+	movl	8(%r8),%ecx
+	movl	12(%r8),%edx
+	movl	%ebx,%esi
+	movl	16(%r8),%ebp
+	movl	%ecx,%edi
+	xorl	%edx,%edi
+	andl	%edi,%esi
+
+	movdqa	64(%r14),%xmm6
+	movdqa	-64(%r14),%xmm9
+	movdqu	0(%r9),%xmm0
+	movdqu	16(%r9),%xmm1
+	movdqu	32(%r9),%xmm2
+	movdqu	48(%r9),%xmm3
+.byte	102,15,56,0,198
+.byte	102,15,56,0,206
+.byte	102,15,56,0,214
+	addq	$64,%r9
+	paddd	%xmm9,%xmm0
+.byte	102,15,56,0,222
+	paddd	%xmm9,%xmm1
+	paddd	%xmm9,%xmm2
+	movdqa	%xmm0,0(%rsp)
+	psubd	%xmm9,%xmm0
+	movdqa	%xmm1,16(%rsp)
+	psubd	%xmm9,%xmm1
+	movdqa	%xmm2,32(%rsp)
+	psubd	%xmm9,%xmm2
+	jmp	.Loop_ssse3
+.align	16
+.Loop_ssse3:
+	rorl	$2,%ebx
+	pshufd	$238,%xmm0,%xmm4
+	xorl	%edx,%esi
+	movdqa	%xmm3,%xmm8
+	paddd	%xmm3,%xmm9
+	movl	%eax,%edi
+	addl	0(%rsp),%ebp
+	punpcklqdq	%xmm1,%xmm4
+	xorl	%ecx,%ebx
+	roll	$5,%eax
+	addl	%esi,%ebp
+	psrldq	$4,%xmm8
+	andl	%ebx,%edi
+	xorl	%ecx,%ebx
+	pxor	%xmm0,%xmm4
+	addl	%eax,%ebp
+	rorl	$7,%eax
+	pxor	%xmm2,%xmm8
+	xorl	%ecx,%edi
+	movl	%ebp,%esi
+	addl	4(%rsp),%edx
+	pxor	%xmm8,%xmm4
+	xorl	%ebx,%eax
+	roll	$5,%ebp
+	movdqa	%xmm9,48(%rsp)
+	addl	%edi,%edx
+	andl	%eax,%esi
+	movdqa	%xmm4,%xmm10
+	xorl	%ebx,%eax
+	addl	%ebp,%edx
+	rorl	$7,%ebp
+	movdqa	%xmm4,%xmm8
+	xorl	%ebx,%esi
+	pslldq	$12,%xmm10
+	paddd	%xmm4,%xmm4
+	movl	%edx,%edi
+	addl	8(%rsp),%ecx
+	psrld	$31,%xmm8
+	xorl	%eax,%ebp
+	roll	$5,%edx
+	addl	%esi,%ecx
+	movdqa	%xmm10,%xmm9
+	andl	%ebp,%edi
+	xorl	%eax,%ebp
+	psrld	$30,%xmm10
+	addl	%edx,%ecx
+	rorl	$7,%edx
+	por	%xmm8,%xmm4
+	xorl	%eax,%edi
+	movl	%ecx,%esi
+	addl	12(%rsp),%ebx
+	pslld	$2,%xmm9
+	pxor	%xmm10,%xmm4
+	xorl	%ebp,%edx
+	movdqa	-64(%r14),%xmm10
+	roll	$5,%ecx
+	addl	%edi,%ebx
+	andl	%edx,%esi
+	pxor	%xmm9,%xmm4
+	xorl	%ebp,%edx
+	addl	%ecx,%ebx
+	rorl	$7,%ecx
+	pshufd	$238,%xmm1,%xmm5
+	xorl	%ebp,%esi
+	movdqa	%xmm4,%xmm9
+	paddd	%xmm4,%xmm10
+	movl	%ebx,%edi
+	addl	16(%rsp),%eax
+	punpcklqdq	%xmm2,%xmm5
+	xorl	%edx,%ecx
+	roll	$5,%ebx
+	addl	%esi,%eax
+	psrldq	$4,%xmm9
+	andl	%ecx,%edi
+	xorl	%edx,%ecx
+	pxor	%xmm1,%xmm5
+	addl	%ebx,%eax
+	rorl	$7,%ebx
+	pxor	%xmm3,%xmm9
+	xorl	%edx,%edi
+	movl	%eax,%esi
+	addl	20(%rsp),%ebp
+	pxor	%xmm9,%xmm5
+	xorl	%ecx,%ebx
+	roll	$5,%eax
+	movdqa	%xmm10,0(%rsp)
+	addl	%edi,%ebp
+	andl	%ebx,%esi
+	movdqa	%xmm5,%xmm8
+	xorl	%ecx,%ebx
+	addl	%eax,%ebp
+	rorl	$7,%eax
+	movdqa	%xmm5,%xmm9
+	xorl	%ecx,%esi
+	pslldq	$12,%xmm8
+	paddd	%xmm5,%xmm5
+	movl	%ebp,%edi
+	addl	24(%rsp),%edx
+	psrld	$31,%xmm9
+	xorl	%ebx,%eax
+	roll	$5,%ebp
+	addl	%esi,%edx
+	movdqa	%xmm8,%xmm10
+	andl	%eax,%edi
+	xorl	%ebx,%eax
+	psrld	$30,%xmm8
+	addl	%ebp,%edx
+	rorl	$7,%ebp
+	por	%xmm9,%xmm5
+	xorl	%ebx,%edi
+	movl	%edx,%esi
+	addl	28(%rsp),%ecx
+	pslld	$2,%xmm10
+	pxor	%xmm8,%xmm5
+	xorl	%eax,%ebp
+	movdqa	-32(%r14),%xmm8
+	roll	$5,%edx
+	addl	%edi,%ecx
+	andl	%ebp,%esi
+	pxor	%xmm10,%xmm5
+	xorl	%eax,%ebp
+	addl	%edx,%ecx
+	rorl	$7,%edx
+	pshufd	$238,%xmm2,%xmm6
+	xorl	%eax,%esi
+	movdqa	%xmm5,%xmm10
+	paddd	%xmm5,%xmm8
+	movl	%ecx,%edi
+	addl	32(%rsp),%ebx
+	punpcklqdq	%xmm3,%xmm6
+	xorl	%ebp,%edx
+	roll	$5,%ecx
+	addl	%esi,%ebx
+	psrldq	$4,%xmm10
+	andl	%edx,%edi
+	xorl	%ebp,%edx
+	pxor	%xmm2,%xmm6
+	addl	%ecx,%ebx
+	rorl	$7,%ecx
+	pxor	%xmm4,%xmm10
+	xorl	%ebp,%edi
+	movl	%ebx,%esi
+	addl	36(%rsp),%eax
+	pxor	%xmm10,%xmm6
+	xorl	%edx,%ecx
+	roll	$5,%ebx
+	movdqa	%xmm8,16(%rsp)
+	addl	%edi,%eax
+	andl	%ecx,%esi
+	movdqa	%xmm6,%xmm9
+	xorl	%edx,%ecx
+	addl	%ebx,%eax
+	rorl	$7,%ebx
+	movdqa	%xmm6,%xmm10
+	xorl	%edx,%esi
+	pslldq	$12,%xmm9
+	paddd	%xmm6,%xmm6
+	movl	%eax,%edi
+	addl	40(%rsp),%ebp
+	psrld	$31,%xmm10
+	xorl	%ecx,%ebx
+	roll	$5,%eax
+	addl	%esi,%ebp
+	movdqa	%xmm9,%xmm8
+	andl	%ebx,%edi
+	xorl	%ecx,%ebx
+	psrld	$30,%xmm9
+	addl	%eax,%ebp
+	rorl	$7,%eax
+	por	%xmm10,%xmm6
+	xorl	%ecx,%edi
+	movl	%ebp,%esi
+	addl	44(%rsp),%edx
+	pslld	$2,%xmm8
+	pxor	%xmm9,%xmm6
+	xorl	%ebx,%eax
+	movdqa	-32(%r14),%xmm9
+	roll	$5,%ebp
+	addl	%edi,%edx
+	andl	%eax,%esi
+	pxor	%xmm8,%xmm6
+	xorl	%ebx,%eax
+	addl	%ebp,%edx
+	rorl	$7,%ebp
+	pshufd	$238,%xmm3,%xmm7
+	xorl	%ebx,%esi
+	movdqa	%xmm6,%xmm8
+	paddd	%xmm6,%xmm9
+	movl	%edx,%edi
+	addl	48(%rsp),%ecx
+	punpcklqdq	%xmm4,%xmm7
+	xorl	%eax,%ebp
+	roll	$5,%edx
+	addl	%esi,%ecx
+	psrldq	$4,%xmm8
+	andl	%ebp,%edi
+	xorl	%eax,%ebp
+	pxor	%xmm3,%xmm7
+	addl	%edx,%ecx
+	rorl	$7,%edx
+	pxor	%xmm5,%xmm8
+	xorl	%eax,%edi
+	movl	%ecx,%esi
+	addl	52(%rsp),%ebx
+	pxor	%xmm8,%xmm7
+	xorl	%ebp,%edx
+	roll	$5,%ecx
+	movdqa	%xmm9,32(%rsp)
+	addl	%edi,%ebx
+	andl	%edx,%esi
+	movdqa	%xmm7,%xmm10
+	xorl	%ebp,%edx
+	addl	%ecx,%ebx
+	rorl	$7,%ecx
+	movdqa	%xmm7,%xmm8
+	xorl	%ebp,%esi
+	pslldq	$12,%xmm10
+	paddd	%xmm7,%xmm7
+	movl	%ebx,%edi
+	addl	56(%rsp),%eax
+	psrld	$31,%xmm8
+	xorl	%edx,%ecx
+	roll	$5,%ebx
+	addl	%esi,%eax
+	movdqa	%xmm10,%xmm9
+	andl	%ecx,%edi
+	xorl	%edx,%ecx
+	psrld	$30,%xmm10
+	addl	%ebx,%eax
+	rorl	$7,%ebx
+	por	%xmm8,%xmm7
+	xorl	%edx,%edi
+	movl	%eax,%esi
+	addl	60(%rsp),%ebp
+	pslld	$2,%xmm9
+	pxor	%xmm10,%xmm7
+	xorl	%ecx,%ebx
+	movdqa	-32(%r14),%xmm10
+	roll	$5,%eax
+	addl	%edi,%ebp
+	andl	%ebx,%esi
+	pxor	%xmm9,%xmm7
+	pshufd	$238,%xmm6,%xmm9
+	xorl	%ecx,%ebx
+	addl	%eax,%ebp
+	rorl	$7,%eax
+	pxor	%xmm4,%xmm0
+	xorl	%ecx,%esi
+	movl	%ebp,%edi
+	addl	0(%rsp),%edx
+	punpcklqdq	%xmm7,%xmm9
+	xorl	%ebx,%eax
+	roll	$5,%ebp
+	pxor	%xmm1,%xmm0
+	addl	%esi,%edx
+	andl	%eax,%edi
+	movdqa	%xmm10,%xmm8
+	xorl	%ebx,%eax
+	paddd	%xmm7,%xmm10
+	addl	%ebp,%edx
+	pxor	%xmm9,%xmm0
+	rorl	$7,%ebp
+	xorl	%ebx,%edi
+	movl	%edx,%esi
+	addl	4(%rsp),%ecx
+	movdqa	%xmm0,%xmm9
+	xorl	%eax,%ebp
+	roll	$5,%edx
+	movdqa	%xmm10,48(%rsp)
+	addl	%edi,%ecx
+	andl	%ebp,%esi
+	xorl	%eax,%ebp
+	pslld	$2,%xmm0
+	addl	%edx,%ecx
+	rorl	$7,%edx
+	psrld	$30,%xmm9
+	xorl	%eax,%esi
+	movl	%ecx,%edi
+	addl	8(%rsp),%ebx
+	por	%xmm9,%xmm0
+	xorl	%ebp,%edx
+	roll	$5,%ecx
+	pshufd	$238,%xmm7,%xmm10
+	addl	%esi,%ebx
+	andl	%edx,%edi
+	xorl	%ebp,%edx
+	addl	%ecx,%ebx
+	addl	12(%rsp),%eax
+	xorl	%ebp,%edi
+	movl	%ebx,%esi
+	roll	$5,%ebx
+	addl	%edi,%eax
+	xorl	%edx,%esi
+	rorl	$7,%ecx
+	addl	%ebx,%eax
+	pxor	%xmm5,%xmm1
+	addl	16(%rsp),%ebp
+	xorl	%ecx,%esi
+	punpcklqdq	%xmm0,%xmm10
+	movl	%eax,%edi
+	roll	$5,%eax
+	pxor	%xmm2,%xmm1
+	addl	%esi,%ebp
+	xorl	%ecx,%edi
+	movdqa	%xmm8,%xmm9
+	rorl	$7,%ebx
+	paddd	%xmm0,%xmm8
+	addl	%eax,%ebp
+	pxor	%xmm10,%xmm1
+	addl	20(%rsp),%edx
+	xorl	%ebx,%edi
+	movl	%ebp,%esi
+	roll	$5,%ebp
+	movdqa	%xmm1,%xmm10
+	addl	%edi,%edx
+	xorl	%ebx,%esi
+	movdqa	%xmm8,0(%rsp)
+	rorl	$7,%eax
+	addl	%ebp,%edx
+	addl	24(%rsp),%ecx
+	pslld	$2,%xmm1
+	xorl	%eax,%esi
+	movl	%edx,%edi
+	psrld	$30,%xmm10
+	roll	$5,%edx
+	addl	%esi,%ecx
+	xorl	%eax,%edi
+	rorl	$7,%ebp
+	por	%xmm10,%xmm1
+	addl	%edx,%ecx
+	addl	28(%rsp),%ebx
+	pshufd	$238,%xmm0,%xmm8
+	xorl	%ebp,%edi
+	movl	%ecx,%esi
+	roll	$5,%ecx
+	addl	%edi,%ebx
+	xorl	%ebp,%esi
+	rorl	$7,%edx
+	addl	%ecx,%ebx
+	pxor	%xmm6,%xmm2
+	addl	32(%rsp),%eax
+	xorl	%edx,%esi
+	punpcklqdq	%xmm1,%xmm8
+	movl	%ebx,%edi
+	roll	$5,%ebx
+	pxor	%xmm3,%xmm2
+	addl	%esi,%eax
+	xorl	%edx,%edi
+	movdqa	0(%r14),%xmm10
+	rorl	$7,%ecx
+	paddd	%xmm1,%xmm9
+	addl	%ebx,%eax
+	pxor	%xmm8,%xmm2
+	addl	36(%rsp),%ebp
+	xorl	%ecx,%edi
+	movl	%eax,%esi
+	roll	$5,%eax
+	movdqa	%xmm2,%xmm8
+	addl	%edi,%ebp
+	xorl	%ecx,%esi
+	movdqa	%xmm9,16(%rsp)
+	rorl	$7,%ebx
+	addl	%eax,%ebp
+	addl	40(%rsp),%edx
+	pslld	$2,%xmm2
+	xorl	%ebx,%esi
+	movl	%ebp,%edi
+	psrld	$30,%xmm8
+	roll	$5,%ebp
+	addl	%esi,%edx
+	xorl	%ebx,%edi
+	rorl	$7,%eax
+	por	%xmm8,%xmm2
+	addl	%ebp,%edx
+	addl	44(%rsp),%ecx
+	pshufd	$238,%xmm1,%xmm9
+	xorl	%eax,%edi
+	movl	%edx,%esi
+	roll	$5,%edx
+	addl	%edi,%ecx
+	xorl	%eax,%esi
+	rorl	$7,%ebp
+	addl	%edx,%ecx
+	pxor	%xmm7,%xmm3
+	addl	48(%rsp),%ebx
+	xorl	%ebp,%esi
+	punpcklqdq	%xmm2,%xmm9
+	movl	%ecx,%edi
+	roll	$5,%ecx
+	pxor	%xmm4,%xmm3
+	addl	%esi,%ebx
+	xorl	%ebp,%edi
+	movdqa	%xmm10,%xmm8
+	rorl	$7,%edx
+	paddd	%xmm2,%xmm10
+	addl	%ecx,%ebx
+	pxor	%xmm9,%xmm3
+	addl	52(%rsp),%eax
+	xorl	%edx,%edi
+	movl	%ebx,%esi
+	roll	$5,%ebx
+	movdqa	%xmm3,%xmm9
+	addl	%edi,%eax
+	xorl	%edx,%esi
+	movdqa	%xmm10,32(%rsp)
+	rorl	$7,%ecx
+	addl	%ebx,%eax
+	addl	56(%rsp),%ebp
+	pslld	$2,%xmm3
+	xorl	%ecx,%esi
+	movl	%eax,%edi
+	psrld	$30,%xmm9
+	roll	$5,%eax
+	addl	%esi,%ebp
+	xorl	%ecx,%edi
+	rorl	$7,%ebx
+	por	%xmm9,%xmm3
+	addl	%eax,%ebp
+	addl	60(%rsp),%edx
+	pshufd	$238,%xmm2,%xmm10
+	xorl	%ebx,%edi
+	movl	%ebp,%esi
+	roll	$5,%ebp
+	addl	%edi,%edx
+	xorl	%ebx,%esi
+	rorl	$7,%eax
+	addl	%ebp,%edx
+	pxor	%xmm0,%xmm4
+	addl	0(%rsp),%ecx
+	xorl	%eax,%esi
+	punpcklqdq	%xmm3,%xmm10
+	movl	%edx,%edi
+	roll	$5,%edx
+	pxor	%xmm5,%xmm4
+	addl	%esi,%ecx
+	xorl	%eax,%edi
+	movdqa	%xmm8,%xmm9
+	rorl	$7,%ebp
+	paddd	%xmm3,%xmm8
+	addl	%edx,%ecx
+	pxor	%xmm10,%xmm4
+	addl	4(%rsp),%ebx
+	xorl	%ebp,%edi
+	movl	%ecx,%esi
+	roll	$5,%ecx
+	movdqa	%xmm4,%xmm10
+	addl	%edi,%ebx
+	xorl	%ebp,%esi
+	movdqa	%xmm8,48(%rsp)
+	rorl	$7,%edx
+	addl	%ecx,%ebx
+	addl	8(%rsp),%eax
+	pslld	$2,%xmm4
+	xorl	%edx,%esi
+	movl	%ebx,%edi
+	psrld	$30,%xmm10
+	roll	$5,%ebx
+	addl	%esi,%eax
+	xorl	%edx,%edi
+	rorl	$7,%ecx
+	por	%xmm10,%xmm4
+	addl	%ebx,%eax
+	addl	12(%rsp),%ebp
+	pshufd	$238,%xmm3,%xmm8
+	xorl	%ecx,%edi
+	movl	%eax,%esi
+	roll	$5,%eax
+	addl	%edi,%ebp
+	xorl	%ecx,%esi
+	rorl	$7,%ebx
+	addl	%eax,%ebp
+	pxor	%xmm1,%xmm5
+	addl	16(%rsp),%edx
+	xorl	%ebx,%esi
+	punpcklqdq	%xmm4,%xmm8
+	movl	%ebp,%edi
+	roll	$5,%ebp
+	pxor	%xmm6,%xmm5
+	addl	%esi,%edx
+	xorl	%ebx,%edi
+	movdqa	%xmm9,%xmm10
+	rorl	$7,%eax
+	paddd	%xmm4,%xmm9
+	addl	%ebp,%edx
+	pxor	%xmm8,%xmm5
+	addl	20(%rsp),%ecx
+	xorl	%eax,%edi
+	movl	%edx,%esi
+	roll	$5,%edx
+	movdqa	%xmm5,%xmm8
+	addl	%edi,%ecx
+	xorl	%eax,%esi
+	movdqa	%xmm9,0(%rsp)
+	rorl	$7,%ebp
+	addl	%edx,%ecx
+	addl	24(%rsp),%ebx
+	pslld	$2,%xmm5
+	xorl	%ebp,%esi
+	movl	%ecx,%edi
+	psrld	$30,%xmm8
+	roll	$5,%ecx
+	addl	%esi,%ebx
+	xorl	%ebp,%edi
+	rorl	$7,%edx
+	por	%xmm8,%xmm5
+	addl	%ecx,%ebx
+	addl	28(%rsp),%eax
+	pshufd	$238,%xmm4,%xmm9
+	rorl	$7,%ecx
+	movl	%ebx,%esi
+	xorl	%edx,%edi
+	roll	$5,%ebx
+	addl	%edi,%eax
+	xorl	%ecx,%esi
+	xorl	%edx,%ecx
+	addl	%ebx,%eax
+	pxor	%xmm2,%xmm6
+	addl	32(%rsp),%ebp
+	andl	%ecx,%esi
+	xorl	%edx,%ecx
+	rorl	$7,%ebx
+	punpcklqdq	%xmm5,%xmm9
+	movl	%eax,%edi
+	xorl	%ecx,%esi
+	pxor	%xmm7,%xmm6
+	roll	$5,%eax
+	addl	%esi,%ebp
+	movdqa	%xmm10,%xmm8
+	xorl	%ebx,%edi
+	paddd	%xmm5,%xmm10
+	xorl	%ecx,%ebx
+	pxor	%xmm9,%xmm6
+	addl	%eax,%ebp
+	addl	36(%rsp),%edx
+	andl	%ebx,%edi
+	xorl	%ecx,%ebx
+	rorl	$7,%eax
+	movdqa	%xmm6,%xmm9
+	movl	%ebp,%esi
+	xorl	%ebx,%edi
+	movdqa	%xmm10,16(%rsp)
+	roll	$5,%ebp
+	addl	%edi,%edx
+	xorl	%eax,%esi
+	pslld	$2,%xmm6
+	xorl	%ebx,%eax
+	addl	%ebp,%edx
+	psrld	$30,%xmm9
+	addl	40(%rsp),%ecx
+	andl	%eax,%esi
+	xorl	%ebx,%eax
+	por	%xmm9,%xmm6
+	rorl	$7,%ebp
+	movl	%edx,%edi
+	xorl	%eax,%esi
+	roll	$5,%edx
+	pshufd	$238,%xmm5,%xmm10
+	addl	%esi,%ecx
+	xorl	%ebp,%edi
+	xorl	%eax,%ebp
+	addl	%edx,%ecx
+	addl	44(%rsp),%ebx
+	andl	%ebp,%edi
+	xorl	%eax,%ebp
+	rorl	$7,%edx
+	movl	%ecx,%esi
+	xorl	%ebp,%edi
+	roll	$5,%ecx
+	addl	%edi,%ebx
+	xorl	%edx,%esi
+	xorl	%ebp,%edx
+	addl	%ecx,%ebx
+	pxor	%xmm3,%xmm7
+	addl	48(%rsp),%eax
+	andl	%edx,%esi
+	xorl	%ebp,%edx
+	rorl	$7,%ecx
+	punpcklqdq	%xmm6,%xmm10
+	movl	%ebx,%edi
+	xorl	%edx,%esi
+	pxor	%xmm0,%xmm7
+	roll	$5,%ebx
+	addl	%esi,%eax
+	movdqa	32(%r14),%xmm9
+	xorl	%ecx,%edi
+	paddd	%xmm6,%xmm8
+	xorl	%edx,%ecx
+	pxor	%xmm10,%xmm7
+	addl	%ebx,%eax
+	addl	52(%rsp),%ebp
+	andl	%ecx,%edi
+	xorl	%edx,%ecx
+	rorl	$7,%ebx
+	movdqa	%xmm7,%xmm10
+	movl	%eax,%esi
+	xorl	%ecx,%edi
+	movdqa	%xmm8,32(%rsp)
+	roll	$5,%eax
+	addl	%edi,%ebp
+	xorl	%ebx,%esi
+	pslld	$2,%xmm7
+	xorl	%ecx,%ebx
+	addl	%eax,%ebp
+	psrld	$30,%xmm10
+	addl	56(%rsp),%edx
+	andl	%ebx,%esi
+	xorl	%ecx,%ebx
+	por	%xmm10,%xmm7
+	rorl	$7,%eax
+	movl	%ebp,%edi
+	xorl	%ebx,%esi
+	roll	$5,%ebp
+	pshufd	$238,%xmm6,%xmm8
+	addl	%esi,%edx
+	xorl	%eax,%edi
+	xorl	%ebx,%eax
+	addl	%ebp,%edx
+	addl	60(%rsp),%ecx
+	andl	%eax,%edi
+	xorl	%ebx,%eax
+	rorl	$7,%ebp
+	movl	%edx,%esi
+	xorl	%eax,%edi
+	roll	$5,%edx
+	addl	%edi,%ecx
+	xorl	%ebp,%esi
+	xorl	%eax,%ebp
+	addl	%edx,%ecx
+	pxor	%xmm4,%xmm0
+	addl	0(%rsp),%ebx
+	andl	%ebp,%esi
+	xorl	%eax,%ebp
+	rorl	$7,%edx
+	punpcklqdq	%xmm7,%xmm8
+	movl	%ecx,%edi
+	xorl	%ebp,%esi
+	pxor	%xmm1,%xmm0
+	roll	$5,%ecx
+	addl	%esi,%ebx
+	movdqa	%xmm9,%xmm10
+	xorl	%edx,%edi
+	paddd	%xmm7,%xmm9
+	xorl	%ebp,%edx
+	pxor	%xmm8,%xmm0
+	addl	%ecx,%ebx
+	addl	4(%rsp),%eax
+	andl	%edx,%edi
+	xorl	%ebp,%edx
+	rorl	$7,%ecx
+	movdqa	%xmm0,%xmm8
+	movl	%ebx,%esi
+	xorl	%edx,%edi
+	movdqa	%xmm9,48(%rsp)
+	roll	$5,%ebx
+	addl	%edi,%eax
+	xorl	%ecx,%esi
+	pslld	$2,%xmm0
+	xorl	%edx,%ecx
+	addl	%ebx,%eax
+	psrld	$30,%xmm8
+	addl	8(%rsp),%ebp
+	andl	%ecx,%esi
+	xorl	%edx,%ecx
+	por	%xmm8,%xmm0
+	rorl	$7,%ebx
+	movl	%eax,%edi
+	xorl	%ecx,%esi
+	roll	$5,%eax
+	pshufd	$238,%xmm7,%xmm9
+	addl	%esi,%ebp
+	xorl	%ebx,%edi
+	xorl	%ecx,%ebx
+	addl	%eax,%ebp
+	addl	12(%rsp),%edx
+	andl	%ebx,%edi
+	xorl	%ecx,%ebx
+	rorl	$7,%eax
+	movl	%ebp,%esi
+	xorl	%ebx,%edi
+	roll	$5,%ebp
+	addl	%edi,%edx
+	xorl	%eax,%esi
+	xorl	%ebx,%eax
+	addl	%ebp,%edx
+	pxor	%xmm5,%xmm1
+	addl	16(%rsp),%ecx
+	andl	%eax,%esi
+	xorl	%ebx,%eax
+	rorl	$7,%ebp
+	punpcklqdq	%xmm0,%xmm9
+	movl	%edx,%edi
+	xorl	%eax,%esi
+	pxor	%xmm2,%xmm1
+	roll	$5,%edx
+	addl	%esi,%ecx
+	movdqa	%xmm10,%xmm8
+	xorl	%ebp,%edi
+	paddd	%xmm0,%xmm10
+	xorl	%eax,%ebp
+	pxor	%xmm9,%xmm1
+	addl	%edx,%ecx
+	addl	20(%rsp),%ebx
+	andl	%ebp,%edi
+	xorl	%eax,%ebp
+	rorl	$7,%edx
+	movdqa	%xmm1,%xmm9
+	movl	%ecx,%esi
+	xorl	%ebp,%edi
+	movdqa	%xmm10,0(%rsp)
+	roll	$5,%ecx
+	addl	%edi,%ebx
+	xorl	%edx,%esi
+	pslld	$2,%xmm1
+	xorl	%ebp,%edx
+	addl	%ecx,%ebx
+	psrld	$30,%xmm9
+	addl	24(%rsp),%eax
+	andl	%edx,%esi
+	xorl	%ebp,%edx
+	por	%xmm9,%xmm1
+	rorl	$7,%ecx
+	movl	%ebx,%edi
+	xorl	%edx,%esi
+	roll	$5,%ebx
+	pshufd	$238,%xmm0,%xmm10
+	addl	%esi,%eax
+	xorl	%ecx,%edi
+	xorl	%edx,%ecx
+	addl	%ebx,%eax
+	addl	28(%rsp),%ebp
+	andl	%ecx,%edi
+	xorl	%edx,%ecx
+	rorl	$7,%ebx
+	movl	%eax,%esi
+	xorl	%ecx,%edi
+	roll	$5,%eax
+	addl	%edi,%ebp
+	xorl	%ebx,%esi
+	xorl	%ecx,%ebx
+	addl	%eax,%ebp
+	pxor	%xmm6,%xmm2
+	addl	32(%rsp),%edx
+	andl	%ebx,%esi
+	xorl	%ecx,%ebx
+	rorl	$7,%eax
+	punpcklqdq	%xmm1,%xmm10
+	movl	%ebp,%edi
+	xorl	%ebx,%esi
+	pxor	%xmm3,%xmm2
+	roll	$5,%ebp
+	addl	%esi,%edx
+	movdqa	%xmm8,%xmm9
+	xorl	%eax,%edi
+	paddd	%xmm1,%xmm8
+	xorl	%ebx,%eax
+	pxor	%xmm10,%xmm2
+	addl	%ebp,%edx
+	addl	36(%rsp),%ecx
+	andl	%eax,%edi
+	xorl	%ebx,%eax
+	rorl	$7,%ebp
+	movdqa	%xmm2,%xmm10
+	movl	%edx,%esi
+	xorl	%eax,%edi
+	movdqa	%xmm8,16(%rsp)
+	roll	$5,%edx
+	addl	%edi,%ecx
+	xorl	%ebp,%esi
+	pslld	$2,%xmm2
+	xorl	%eax,%ebp
+	addl	%edx,%ecx
+	psrld	$30,%xmm10
+	addl	40(%rsp),%ebx
+	andl	%ebp,%esi
+	xorl	%eax,%ebp
+	por	%xmm10,%xmm2
+	rorl	$7,%edx
+	movl	%ecx,%edi
+	xorl	%ebp,%esi
+	roll	$5,%ecx
+	pshufd	$238,%xmm1,%xmm8
+	addl	%esi,%ebx
+	xorl	%edx,%edi
+	xorl	%ebp,%edx
+	addl	%ecx,%ebx
+	addl	44(%rsp),%eax
+	andl	%edx,%edi
+	xorl	%ebp,%edx
+	rorl	$7,%ecx
+	movl	%ebx,%esi
+	xorl	%edx,%edi
+	roll	$5,%ebx
+	addl	%edi,%eax
+	xorl	%edx,%esi
+	addl	%ebx,%eax
+	pxor	%xmm7,%xmm3
+	addl	48(%rsp),%ebp
+	xorl	%ecx,%esi
+	punpcklqdq	%xmm2,%xmm8
+	movl	%eax,%edi
+	roll	$5,%eax
+	pxor	%xmm4,%xmm3
+	addl	%esi,%ebp
+	xorl	%ecx,%edi
+	movdqa	%xmm9,%xmm10
+	rorl	$7,%ebx
+	paddd	%xmm2,%xmm9
+	addl	%eax,%ebp
+	pxor	%xmm8,%xmm3
+	addl	52(%rsp),%edx
+	xorl	%ebx,%edi
+	movl	%ebp,%esi
+	roll	$5,%ebp
+	movdqa	%xmm3,%xmm8
+	addl	%edi,%edx
+	xorl	%ebx,%esi
+	movdqa	%xmm9,32(%rsp)
+	rorl	$7,%eax
+	addl	%ebp,%edx
+	addl	56(%rsp),%ecx
+	pslld	$2,%xmm3
+	xorl	%eax,%esi
+	movl	%edx,%edi
+	psrld	$30,%xmm8
+	roll	$5,%edx
+	addl	%esi,%ecx
+	xorl	%eax,%edi
+	rorl	$7,%ebp
+	por	%xmm8,%xmm3
+	addl	%edx,%ecx
+	addl	60(%rsp),%ebx
+	xorl	%ebp,%edi
+	movl	%ecx,%esi
+	roll	$5,%ecx
+	addl	%edi,%ebx
+	xorl	%ebp,%esi
+	rorl	$7,%edx
+	addl	%ecx,%ebx
+	addl	0(%rsp),%eax
+	xorl	%edx,%esi
+	movl	%ebx,%edi
+	roll	$5,%ebx
+	paddd	%xmm3,%xmm10
+	addl	%esi,%eax
+	xorl	%edx,%edi
+	movdqa	%xmm10,48(%rsp)
+	rorl	$7,%ecx
+	addl	%ebx,%eax
+	addl	4(%rsp),%ebp
+	xorl	%ecx,%edi
+	movl	%eax,%esi
+	roll	$5,%eax
+	addl	%edi,%ebp
+	xorl	%ecx,%esi
+	rorl	$7,%ebx
+	addl	%eax,%ebp
+	addl	8(%rsp),%edx
+	xorl	%ebx,%esi
+	movl	%ebp,%edi
+	roll	$5,%ebp
+	addl	%esi,%edx
+	xorl	%ebx,%edi
+	rorl	$7,%eax
+	addl	%ebp,%edx
+	addl	12(%rsp),%ecx
+	xorl	%eax,%edi
+	movl	%edx,%esi
+	roll	$5,%edx
+	addl	%edi,%ecx
+	xorl	%eax,%esi
+	rorl	$7,%ebp
+	addl	%edx,%ecx
+	cmpq	%r10,%r9
+	je	.Ldone_ssse3
+	movdqa	64(%r14),%xmm6
+	movdqa	-64(%r14),%xmm9
+	movdqu	0(%r9),%xmm0
+	movdqu	16(%r9),%xmm1
+	movdqu	32(%r9),%xmm2
+	movdqu	48(%r9),%xmm3
+.byte	102,15,56,0,198
+	addq	$64,%r9
+	addl	16(%rsp),%ebx
+	xorl	%ebp,%esi
+	movl	%ecx,%edi
+.byte	102,15,56,0,206
+	roll	$5,%ecx
+	addl	%esi,%ebx
+	xorl	%ebp,%edi
+	rorl	$7,%edx
+	paddd	%xmm9,%xmm0
+	addl	%ecx,%ebx
+	addl	20(%rsp),%eax
+	xorl	%edx,%edi
+	movl	%ebx,%esi
+	movdqa	%xmm0,0(%rsp)
+	roll	$5,%ebx
+	addl	%edi,%eax
+	xorl	%edx,%esi
+	rorl	$7,%ecx
+	psubd	%xmm9,%xmm0
+	addl	%ebx,%eax
+	addl	24(%rsp),%ebp
+	xorl	%ecx,%esi
+	movl	%eax,%edi
+	roll	$5,%eax
+	addl	%esi,%ebp
+	xorl	%ecx,%edi
+	rorl	$7,%ebx
+	addl	%eax,%ebp
+	addl	28(%rsp),%edx
+	xorl	%ebx,%edi
+	movl	%ebp,%esi
+	roll	$5,%ebp
+	addl	%edi,%edx
+	xorl	%ebx,%esi
+	rorl	$7,%eax
+	addl	%ebp,%edx
+	addl	32(%rsp),%ecx
+	xorl	%eax,%esi
+	movl	%edx,%edi
+.byte	102,15,56,0,214
+	roll	$5,%edx
+	addl	%esi,%ecx
+	xorl	%eax,%edi
+	rorl	$7,%ebp
+	paddd	%xmm9,%xmm1
+	addl	%edx,%ecx
+	addl	36(%rsp),%ebx
+	xorl	%ebp,%edi
+	movl	%ecx,%esi
+	movdqa	%xmm1,16(%rsp)
+	roll	$5,%ecx
+	addl	%edi,%ebx
+	xorl	%ebp,%esi
+	rorl	$7,%edx
+	psubd	%xmm9,%xmm1
+	addl	%ecx,%ebx
+	addl	40(%rsp),%eax
+	xorl	%edx,%esi
+	movl	%ebx,%edi
+	roll	$5,%ebx
+	addl	%esi,%eax
+	xorl	%edx,%edi
+	rorl	$7,%ecx
+	addl	%ebx,%eax
+	addl	44(%rsp),%ebp
+	xorl	%ecx,%edi
+	movl	%eax,%esi
+	roll	$5,%eax
+	addl	%edi,%ebp
+	xorl	%ecx,%esi
+	rorl	$7,%ebx
+	addl	%eax,%ebp
+	addl	48(%rsp),%edx
+	xorl	%ebx,%esi
+	movl	%ebp,%edi
+.byte	102,15,56,0,222
+	roll	$5,%ebp
+	addl	%esi,%edx
+	xorl	%ebx,%edi
+	rorl	$7,%eax
+	paddd	%xmm9,%xmm2
+	addl	%ebp,%edx
+	addl	52(%rsp),%ecx
+	xorl	%eax,%edi
+	movl	%edx,%esi
+	movdqa	%xmm2,32(%rsp)
+	roll	$5,%edx
+	addl	%edi,%ecx
+	xorl	%eax,%esi
+	rorl	$7,%ebp
+	psubd	%xmm9,%xmm2
+	addl	%edx,%ecx
+	addl	56(%rsp),%ebx
+	xorl	%ebp,%esi
+	movl	%ecx,%edi
+	roll	$5,%ecx
+	addl	%esi,%ebx
+	xorl	%ebp,%edi
+	rorl	$7,%edx
+	addl	%ecx,%ebx
+	addl	60(%rsp),%eax
+	xorl	%edx,%edi
+	movl	%ebx,%esi
+	roll	$5,%ebx
+	addl	%edi,%eax
+	rorl	$7,%ecx
+	addl	%ebx,%eax
+	addl	0(%r8),%eax
+	addl	4(%r8),%esi
+	addl	8(%r8),%ecx
+	addl	12(%r8),%edx
+	movl	%eax,0(%r8)
+	addl	16(%r8),%ebp
+	movl	%esi,4(%r8)
+	movl	%esi,%ebx
+	movl	%ecx,8(%r8)
+	movl	%ecx,%edi
+	movl	%edx,12(%r8)
+	xorl	%edx,%edi
+	movl	%ebp,16(%r8)
+	andl	%edi,%esi
+	jmp	.Loop_ssse3
+
+.align	16
+.Ldone_ssse3:
+	addl	16(%rsp),%ebx
+	xorl	%ebp,%esi
+	movl	%ecx,%edi
+	roll	$5,%ecx
+	addl	%esi,%ebx
+	xorl	%ebp,%edi
+	rorl	$7,%edx
+	addl	%ecx,%ebx
+	addl	20(%rsp),%eax
+	xorl	%edx,%edi
+	movl	%ebx,%esi
+	roll	$5,%ebx
+	addl	%edi,%eax
+	xorl	%edx,%esi
+	rorl	$7,%ecx
+	addl	%ebx,%eax
+	addl	24(%rsp),%ebp
+	xorl	%ecx,%esi
+	movl	%eax,%edi
+	roll	$5,%eax
+	addl	%esi,%ebp
+	xorl	%ecx,%edi
+	rorl	$7,%ebx
+	addl	%eax,%ebp
+	addl	28(%rsp),%edx
+	xorl	%ebx,%edi
+	movl	%ebp,%esi
+	roll	$5,%ebp
+	addl	%edi,%edx
+	xorl	%ebx,%esi
+	rorl	$7,%eax
+	addl	%ebp,%edx
+	addl	32(%rsp),%ecx
+	xorl	%eax,%esi
+	movl	%edx,%edi
+	roll	$5,%edx
+	addl	%esi,%ecx
+	xorl	%eax,%edi
+	rorl	$7,%ebp
+	addl	%edx,%ecx
+	addl	36(%rsp),%ebx
+	xorl	%ebp,%edi
+	movl	%ecx,%esi
+	roll	$5,%ecx
+	addl	%edi,%ebx
+	xorl	%ebp,%esi
+	rorl	$7,%edx
+	addl	%ecx,%ebx
+	addl	40(%rsp),%eax
+	xorl	%edx,%esi
+	movl	%ebx,%edi
+	roll	$5,%ebx
+	addl	%esi,%eax
+	xorl	%edx,%edi
+	rorl	$7,%ecx
+	addl	%ebx,%eax
+	addl	44(%rsp),%ebp
+	xorl	%ecx,%edi
+	movl	%eax,%esi
+	roll	$5,%eax
+	addl	%edi,%ebp
+	xorl	%ecx,%esi
+	rorl	$7,%ebx
+	addl	%eax,%ebp
+	addl	48(%rsp),%edx
+	xorl	%ebx,%esi
+	movl	%ebp,%edi
+	roll	$5,%ebp
+	addl	%esi,%edx
+	xorl	%ebx,%edi
+	rorl	$7,%eax
+	addl	%ebp,%edx
+	addl	52(%rsp),%ecx
+	xorl	%eax,%edi
+	movl	%edx,%esi
+	roll	$5,%edx
+	addl	%edi,%ecx
+	xorl	%eax,%esi
+	rorl	$7,%ebp
+	addl	%edx,%ecx
+	addl	56(%rsp),%ebx
+	xorl	%ebp,%esi
+	movl	%ecx,%edi
+	roll	$5,%ecx
+	addl	%esi,%ebx
+	xorl	%ebp,%edi
+	rorl	$7,%edx
+	addl	%ecx,%ebx
+	addl	60(%rsp),%eax
+	xorl	%edx,%edi
+	movl	%ebx,%esi
+	roll	$5,%ebx
+	addl	%edi,%eax
+	rorl	$7,%ecx
+	addl	%ebx,%eax
+	addl	0(%r8),%eax
+	addl	4(%r8),%esi
+	addl	8(%r8),%ecx
+	movl	%eax,0(%r8)
+	addl	12(%r8),%edx
+	movl	%esi,4(%r8)
+	addl	16(%r8),%ebp
+	movl	%ecx,8(%r8)
+	movl	%edx,12(%r8)
+	movl	%ebp,16(%r8)
+	movq	-40(%r11),%r14
+.cfi_restore	%r14
+	movq	-32(%r11),%r13
+.cfi_restore	%r13
+	movq	-24(%r11),%r12
+.cfi_restore	%r12
+	movq	-16(%r11),%rbp
+.cfi_restore	%rbp
+	movq	-8(%r11),%rbx
+.cfi_restore	%rbx
+	leaq	(%r11),%rsp
+.cfi_def_cfa_register	%rsp
+.Lepilogue_ssse3:
+	.byte	0xf3,0xc3
+.cfi_endproc	
+.size	sha1_block_data_order_ssse3,.-sha1_block_data_order_ssse3
+.type	sha1_block_data_order_avx,@function
+.align	16
+sha1_block_data_order_avx:
+_avx_shortcut:
+.cfi_startproc	
+	movq	%rsp,%r11
+.cfi_def_cfa_register	%r11
+	pushq	%rbx
+.cfi_offset	%rbx,-16
+	pushq	%rbp
+.cfi_offset	%rbp,-24
+	pushq	%r12
+.cfi_offset	%r12,-32
+	pushq	%r13
+.cfi_offset	%r13,-40
+	pushq	%r14
+.cfi_offset	%r14,-48
+	leaq	-64(%rsp),%rsp
+	vzeroupper
+	andq	$-64,%rsp
+	movq	%rdi,%r8
+	movq	%rsi,%r9
+	movq	%rdx,%r10
+
+	shlq	$6,%r10
+	addq	%r9,%r10
+	leaq	K_XX_XX+64(%rip),%r14
+
+	movl	0(%r8),%eax
+	movl	4(%r8),%ebx
+	movl	8(%r8),%ecx
+	movl	12(%r8),%edx
+	movl	%ebx,%esi
+	movl	16(%r8),%ebp
+	movl	%ecx,%edi
+	xorl	%edx,%edi
+	andl	%edi,%esi
+
+	vmovdqa	64(%r14),%xmm6
+	vmovdqa	-64(%r14),%xmm11
+	vmovdqu	0(%r9),%xmm0
+	vmovdqu	16(%r9),%xmm1
+	vmovdqu	32(%r9),%xmm2
+	vmovdqu	48(%r9),%xmm3
+	vpshufb	%xmm6,%xmm0,%xmm0
+	addq	$64,%r9
+	vpshufb	%xmm6,%xmm1,%xmm1
+	vpshufb	%xmm6,%xmm2,%xmm2
+	vpshufb	%xmm6,%xmm3,%xmm3
+	vpaddd	%xmm11,%xmm0,%xmm4
+	vpaddd	%xmm11,%xmm1,%xmm5
+	vpaddd	%xmm11,%xmm2,%xmm6
+	vmovdqa	%xmm4,0(%rsp)
+	vmovdqa	%xmm5,16(%rsp)
+	vmovdqa	%xmm6,32(%rsp)
+	jmp	.Loop_avx
+.align	16
+.Loop_avx:
+	shrdl	$2,%ebx,%ebx
+	xorl	%edx,%esi
+	vpalignr	$8,%xmm0,%xmm1,%xmm4
+	movl	%eax,%edi
+	addl	0(%rsp),%ebp
+	vpaddd	%xmm3,%xmm11,%xmm9
+	xorl	%ecx,%ebx
+	shldl	$5,%eax,%eax
+	vpsrldq	$4,%xmm3,%xmm8
+	addl	%esi,%ebp
+	andl	%ebx,%edi
+	vpxor	%xmm0,%xmm4,%xmm4
+	xorl	%ecx,%ebx
+	addl	%eax,%ebp
+	vpxor	%xmm2,%xmm8,%xmm8
+	shrdl	$7,%eax,%eax
+	xorl	%ecx,%edi
+	movl	%ebp,%esi
+	addl	4(%rsp),%edx
+	vpxor	%xmm8,%xmm4,%xmm4
+	xorl	%ebx,%eax
+	shldl	$5,%ebp,%ebp
+	vmovdqa	%xmm9,48(%rsp)
+	addl	%edi,%edx
+	andl	%eax,%esi
+	vpsrld	$31,%xmm4,%xmm8
+	xorl	%ebx,%eax
+	addl	%ebp,%edx
+	shrdl	$7,%ebp,%ebp
+	xorl	%ebx,%esi
+	vpslldq	$12,%xmm4,%xmm10
+	vpaddd	%xmm4,%xmm4,%xmm4
+	movl	%edx,%edi
+	addl	8(%rsp),%ecx
+	xorl	%eax,%ebp
+	shldl	$5,%edx,%edx
+	vpsrld	$30,%xmm10,%xmm9
+	vpor	%xmm8,%xmm4,%xmm4
+	addl	%esi,%ecx
+	andl	%ebp,%edi
+	xorl	%eax,%ebp
+	addl	%edx,%ecx
+	vpslld	$2,%xmm10,%xmm10
+	vpxor	%xmm9,%xmm4,%xmm4
+	shrdl	$7,%edx,%edx
+	xorl	%eax,%edi
+	movl	%ecx,%esi
+	addl	12(%rsp),%ebx
+	vpxor	%xmm10,%xmm4,%xmm4
+	xorl	%ebp,%edx
+	shldl	$5,%ecx,%ecx
+	addl	%edi,%ebx
+	andl	%edx,%esi
+	xorl	%ebp,%edx
+	addl	%ecx,%ebx
+	shrdl	$7,%ecx,%ecx
+	xorl	%ebp,%esi
+	vpalignr	$8,%xmm1,%xmm2,%xmm5
+	movl	%ebx,%edi
+	addl	16(%rsp),%eax
+	vpaddd	%xmm4,%xmm11,%xmm9
+	xorl	%edx,%ecx
+	shldl	$5,%ebx,%ebx
+	vpsrldq	$4,%xmm4,%xmm8
+	addl	%esi,%eax
+	andl	%ecx,%edi
+	vpxor	%xmm1,%xmm5,%xmm5
+	xorl	%edx,%ecx
+	addl	%ebx,%eax
+	vpxor	%xmm3,%xmm8,%xmm8
+	shrdl	$7,%ebx,%ebx
+	xorl	%edx,%edi
+	movl	%eax,%esi
+	addl	20(%rsp),%ebp
+	vpxor	%xmm8,%xmm5,%xmm5
+	xorl	%ecx,%ebx
+	shldl	$5,%eax,%eax
+	vmovdqa	%xmm9,0(%rsp)
+	addl	%edi,%ebp
+	andl	%ebx,%esi
+	vpsrld	$31,%xmm5,%xmm8
+	xorl	%ecx,%ebx
+	addl	%eax,%ebp
+	shrdl	$7,%eax,%eax
+	xorl	%ecx,%esi
+	vpslldq	$12,%xmm5,%xmm10
+	vpaddd	%xmm5,%xmm5,%xmm5
+	movl	%ebp,%edi
+	addl	24(%rsp),%edx
+	xorl	%ebx,%eax
+	shldl	$5,%ebp,%ebp
+	vpsrld	$30,%xmm10,%xmm9
+	vpor	%xmm8,%xmm5,%xmm5
+	addl	%esi,%edx
+	andl	%eax,%edi
+	xorl	%ebx,%eax
+	addl	%ebp,%edx
+	vpslld	$2,%xmm10,%xmm10
+	vpxor	%xmm9,%xmm5,%xmm5
+	shrdl	$7,%ebp,%ebp
+	xorl	%ebx,%edi
+	movl	%edx,%esi
+	addl	28(%rsp),%ecx
+	vpxor	%xmm10,%xmm5,%xmm5
+	xorl	%eax,%ebp
+	shldl	$5,%edx,%edx
+	vmovdqa	-32(%r14),%xmm11
+	addl	%edi,%ecx
+	andl	%ebp,%esi
+	xorl	%eax,%ebp
+	addl	%edx,%ecx
+	shrdl	$7,%edx,%edx
+	xorl	%eax,%esi
+	vpalignr	$8,%xmm2,%xmm3,%xmm6
+	movl	%ecx,%edi
+	addl	32(%rsp),%ebx
+	vpaddd	%xmm5,%xmm11,%xmm9
+	xorl	%ebp,%edx
+	shldl	$5,%ecx,%ecx
+	vpsrldq	$4,%xmm5,%xmm8
+	addl	%esi,%ebx
+	andl	%edx,%edi
+	vpxor	%xmm2,%xmm6,%xmm6
+	xorl	%ebp,%edx
+	addl	%ecx,%ebx
+	vpxor	%xmm4,%xmm8,%xmm8
+	shrdl	$7,%ecx,%ecx
+	xorl	%ebp,%edi
+	movl	%ebx,%esi
+	addl	36(%rsp),%eax
+	vpxor	%xmm8,%xmm6,%xmm6
+	xorl	%edx,%ecx
+	shldl	$5,%ebx,%ebx
+	vmovdqa	%xmm9,16(%rsp)
+	addl	%edi,%eax
+	andl	%ecx,%esi
+	vpsrld	$31,%xmm6,%xmm8
+	xorl	%edx,%ecx
+	addl	%ebx,%eax
+	shrdl	$7,%ebx,%ebx
+	xorl	%edx,%esi
+	vpslldq	$12,%xmm6,%xmm10
+	vpaddd	%xmm6,%xmm6,%xmm6
+	movl	%eax,%edi
+	addl	40(%rsp),%ebp
+	xorl	%ecx,%ebx
+	shldl	$5,%eax,%eax
+	vpsrld	$30,%xmm10,%xmm9
+	vpor	%xmm8,%xmm6,%xmm6
+	addl	%esi,%ebp
+	andl	%ebx,%edi
+	xorl	%ecx,%ebx
+	addl	%eax,%ebp
+	vpslld	$2,%xmm10,%xmm10
+	vpxor	%xmm9,%xmm6,%xmm6
+	shrdl	$7,%eax,%eax
+	xorl	%ecx,%edi
+	movl	%ebp,%esi
+	addl	44(%rsp),%edx
+	vpxor	%xmm10,%xmm6,%xmm6
+	xorl	%ebx,%eax
+	shldl	$5,%ebp,%ebp
+	addl	%edi,%edx
+	andl	%eax,%esi
+	xorl	%ebx,%eax
+	addl	%ebp,%edx
+	shrdl	$7,%ebp,%ebp
+	xorl	%ebx,%esi
+	vpalignr	$8,%xmm3,%xmm4,%xmm7
+	movl	%edx,%edi
+	addl	48(%rsp),%ecx
+	vpaddd	%xmm6,%xmm11,%xmm9
+	xorl	%eax,%ebp
+	shldl	$5,%edx,%edx
+	vpsrldq	$4,%xmm6,%xmm8
+	addl	%esi,%ecx
+	andl	%ebp,%edi
+	vpxor	%xmm3,%xmm7,%xmm7
+	xorl	%eax,%ebp
+	addl	%edx,%ecx
+	vpxor	%xmm5,%xmm8,%xmm8
+	shrdl	$7,%edx,%edx
+	xorl	%eax,%edi
+	movl	%ecx,%esi
+	addl	52(%rsp),%ebx
+	vpxor	%xmm8,%xmm7,%xmm7
+	xorl	%ebp,%edx
+	shldl	$5,%ecx,%ecx
+	vmovdqa	%xmm9,32(%rsp)
+	addl	%edi,%ebx
+	andl	%edx,%esi
+	vpsrld	$31,%xmm7,%xmm8
+	xorl	%ebp,%edx
+	addl	%ecx,%ebx
+	shrdl	$7,%ecx,%ecx
+	xorl	%ebp,%esi
+	vpslldq	$12,%xmm7,%xmm10
+	vpaddd	%xmm7,%xmm7,%xmm7
+	movl	%ebx,%edi
+	addl	56(%rsp),%eax
+	xorl	%edx,%ecx
+	shldl	$5,%ebx,%ebx
+	vpsrld	$30,%xmm10,%xmm9
+	vpor	%xmm8,%xmm7,%xmm7
+	addl	%esi,%eax
+	andl	%ecx,%edi
+	xorl	%edx,%ecx
+	addl	%ebx,%eax
+	vpslld	$2,%xmm10,%xmm10
+	vpxor	%xmm9,%xmm7,%xmm7
+	shrdl	$7,%ebx,%ebx
+	xorl	%edx,%edi
+	movl	%eax,%esi
+	addl	60(%rsp),%ebp
+	vpxor	%xmm10,%xmm7,%xmm7
+	xorl	%ecx,%ebx
+	shldl	$5,%eax,%eax
+	addl	%edi,%ebp
+	andl	%ebx,%esi
+	xorl	%ecx,%ebx
+	addl	%eax,%ebp
+	vpalignr	$8,%xmm6,%xmm7,%xmm8
+	vpxor	%xmm4,%xmm0,%xmm0
+	shrdl	$7,%eax,%eax
+	xorl	%ecx,%esi
+	movl	%ebp,%edi
+	addl	0(%rsp),%edx
+	vpxor	%xmm1,%xmm0,%xmm0
+	xorl	%ebx,%eax
+	shldl	$5,%ebp,%ebp
+	vpaddd	%xmm7,%xmm11,%xmm9
+	addl	%esi,%edx
+	andl	%eax,%edi
+	vpxor	%xmm8,%xmm0,%xmm0
+	xorl	%ebx,%eax
+	addl	%ebp,%edx
+	shrdl	$7,%ebp,%ebp
+	xorl	%ebx,%edi
+	vpsrld	$30,%xmm0,%xmm8
+	vmovdqa	%xmm9,48(%rsp)
+	movl	%edx,%esi
+	addl	4(%rsp),%ecx
+	xorl	%eax,%ebp
+	shldl	$5,%edx,%edx
+	vpslld	$2,%xmm0,%xmm0
+	addl	%edi,%ecx
+	andl	%ebp,%esi
+	xorl	%eax,%ebp
+	addl	%edx,%ecx
+	shrdl	$7,%edx,%edx
+	xorl	%eax,%esi
+	movl	%ecx,%edi
+	addl	8(%rsp),%ebx
+	vpor	%xmm8,%xmm0,%xmm0
+	xorl	%ebp,%edx
+	shldl	$5,%ecx,%ecx
+	addl	%esi,%ebx
+	andl	%edx,%edi
+	xorl	%ebp,%edx
+	addl	%ecx,%ebx
+	addl	12(%rsp),%eax
+	xorl	%ebp,%edi
+	movl	%ebx,%esi
+	shldl	$5,%ebx,%ebx
+	addl	%edi,%eax
+	xorl	%edx,%esi
+	shrdl	$7,%ecx,%ecx
+	addl	%ebx,%eax
+	vpalignr	$8,%xmm7,%xmm0,%xmm8
+	vpxor	%xmm5,%xmm1,%xmm1
+	addl	16(%rsp),%ebp
+	xorl	%ecx,%esi
+	movl	%eax,%edi
+	shldl	$5,%eax,%eax
+	vpxor	%xmm2,%xmm1,%xmm1
+	addl	%esi,%ebp
+	xorl	%ecx,%edi
+	vpaddd	%xmm0,%xmm11,%xmm9
+	shrdl	$7,%ebx,%ebx
+	addl	%eax,%ebp
+	vpxor	%xmm8,%xmm1,%xmm1
+	addl	20(%rsp),%edx
+	xorl	%ebx,%edi
+	movl	%ebp,%esi
+	shldl	$5,%ebp,%ebp
+	vpsrld	$30,%xmm1,%xmm8
+	vmovdqa	%xmm9,0(%rsp)
+	addl	%edi,%edx
+	xorl	%ebx,%esi
+	shrdl	$7,%eax,%eax
+	addl	%ebp,%edx
+	vpslld	$2,%xmm1,%xmm1
+	addl	24(%rsp),%ecx
+	xorl	%eax,%esi
+	movl	%edx,%edi
+	shldl	$5,%edx,%edx
+	addl	%esi,%ecx
+	xorl	%eax,%edi
+	shrdl	$7,%ebp,%ebp
+	addl	%edx,%ecx
+	vpor	%xmm8,%xmm1,%xmm1
+	addl	28(%rsp),%ebx
+	xorl	%ebp,%edi
+	movl	%ecx,%esi
+	shldl	$5,%ecx,%ecx
+	addl	%edi,%ebx
+	xorl	%ebp,%esi
+	shrdl	$7,%edx,%edx
+	addl	%ecx,%ebx
+	vpalignr	$8,%xmm0,%xmm1,%xmm8
+	vpxor	%xmm6,%xmm2,%xmm2
+	addl	32(%rsp),%eax
+	xorl	%edx,%esi
+	movl	%ebx,%edi
+	shldl	$5,%ebx,%ebx
+	vpxor	%xmm3,%xmm2,%xmm2
+	addl	%esi,%eax
+	xorl	%edx,%edi
+	vpaddd	%xmm1,%xmm11,%xmm9
+	vmovdqa	0(%r14),%xmm11
+	shrdl	$7,%ecx,%ecx
+	addl	%ebx,%eax
+	vpxor	%xmm8,%xmm2,%xmm2
+	addl	36(%rsp),%ebp
+	xorl	%ecx,%edi
+	movl	%eax,%esi
+	shldl	$5,%eax,%eax
+	vpsrld	$30,%xmm2,%xmm8
+	vmovdqa	%xmm9,16(%rsp)
+	addl	%edi,%ebp
+	xorl	%ecx,%esi
+	shrdl	$7,%ebx,%ebx
+	addl	%eax,%ebp
+	vpslld	$2,%xmm2,%xmm2
+	addl	40(%rsp),%edx
+	xorl	%ebx,%esi
+	movl	%ebp,%edi
+	shldl	$5,%ebp,%ebp
+	addl	%esi,%edx
+	xorl	%ebx,%edi
+	shrdl	$7,%eax,%eax
+	addl	%ebp,%edx
+	vpor	%xmm8,%xmm2,%xmm2
+	addl	44(%rsp),%ecx
+	xorl	%eax,%edi
+	movl	%edx,%esi
+	shldl	$5,%edx,%edx
+	addl	%edi,%ecx
+	xorl	%eax,%esi
+	shrdl	$7,%ebp,%ebp
+	addl	%edx,%ecx
+	vpalignr	$8,%xmm1,%xmm2,%xmm8
+	vpxor	%xmm7,%xmm3,%xmm3
+	addl	48(%rsp),%ebx
+	xorl	%ebp,%esi
+	movl	%ecx,%edi
+	shldl	$5,%ecx,%ecx
+	vpxor	%xmm4,%xmm3,%xmm3
+	addl	%esi,%ebx
+	xorl	%ebp,%edi
+	vpaddd	%xmm2,%xmm11,%xmm9
+	shrdl	$7,%edx,%edx
+	addl	%ecx,%ebx
+	vpxor	%xmm8,%xmm3,%xmm3
+	addl	52(%rsp),%eax
+	xorl	%edx,%edi
+	movl	%ebx,%esi
+	shldl	$5,%ebx,%ebx
+	vpsrld	$30,%xmm3,%xmm8
+	vmovdqa	%xmm9,32(%rsp)
+	addl	%edi,%eax
+	xorl	%edx,%esi
+	shrdl	$7,%ecx,%ecx
+	addl	%ebx,%eax
+	vpslld	$2,%xmm3,%xmm3
+	addl	56(%rsp),%ebp
+	xorl	%ecx,%esi
+	movl	%eax,%edi
+	shldl	$5,%eax,%eax
+	addl	%esi,%ebp
+	xorl	%ecx,%edi
+	shrdl	$7,%ebx,%ebx
+	addl	%eax,%ebp
+	vpor	%xmm8,%xmm3,%xmm3
+	addl	60(%rsp),%edx
+	xorl	%ebx,%edi
+	movl	%ebp,%esi
+	shldl	$5,%ebp,%ebp
+	addl	%edi,%edx
+	xorl	%ebx,%esi
+	shrdl	$7,%eax,%eax
+	addl	%ebp,%edx
+	vpalignr	$8,%xmm2,%xmm3,%xmm8
+	vpxor	%xmm0,%xmm4,%xmm4
+	addl	0(%rsp),%ecx
+	xorl	%eax,%esi
+	movl	%edx,%edi
+	shldl	$5,%edx,%edx
+	vpxor	%xmm5,%xmm4,%xmm4
+	addl	%esi,%ecx
+	xorl	%eax,%edi
+	vpaddd	%xmm3,%xmm11,%xmm9
+	shrdl	$7,%ebp,%ebp
+	addl	%edx,%ecx
+	vpxor	%xmm8,%xmm4,%xmm4
+	addl	4(%rsp),%ebx
+	xorl	%ebp,%edi
+	movl	%ecx,%esi
+	shldl	$5,%ecx,%ecx
+	vpsrld	$30,%xmm4,%xmm8
+	vmovdqa	%xmm9,48(%rsp)
+	addl	%edi,%ebx
+	xorl	%ebp,%esi
+	shrdl	$7,%edx,%edx
+	addl	%ecx,%ebx
+	vpslld	$2,%xmm4,%xmm4
+	addl	8(%rsp),%eax
+	xorl	%edx,%esi
+	movl	%ebx,%edi
+	shldl	$5,%ebx,%ebx
+	addl	%esi,%eax
+	xorl	%edx,%edi
+	shrdl	$7,%ecx,%ecx
+	addl	%ebx,%eax
+	vpor	%xmm8,%xmm4,%xmm4
+	addl	12(%rsp),%ebp
+	xorl	%ecx,%edi
+	movl	%eax,%esi
+	shldl	$5,%eax,%eax
+	addl	%edi,%ebp
+	xorl	%ecx,%esi
+	shrdl	$7,%ebx,%ebx
+	addl	%eax,%ebp
+	vpalignr	$8,%xmm3,%xmm4,%xmm8
+	vpxor	%xmm1,%xmm5,%xmm5
+	addl	16(%rsp),%edx
+	xorl	%ebx,%esi
+	movl	%ebp,%edi
+	shldl	$5,%ebp,%ebp
+	vpxor	%xmm6,%xmm5,%xmm5
+	addl	%esi,%edx
+	xorl	%ebx,%edi
+	vpaddd	%xmm4,%xmm11,%xmm9
+	shrdl	$7,%eax,%eax
+	addl	%ebp,%edx
+	vpxor	%xmm8,%xmm5,%xmm5
+	addl	20(%rsp),%ecx
+	xorl	%eax,%edi
+	movl	%edx,%esi
+	shldl	$5,%edx,%edx
+	vpsrld	$30,%xmm5,%xmm8
+	vmovdqa	%xmm9,0(%rsp)
+	addl	%edi,%ecx
+	xorl	%eax,%esi
+	shrdl	$7,%ebp,%ebp
+	addl	%edx,%ecx
+	vpslld	$2,%xmm5,%xmm5
+	addl	24(%rsp),%ebx
+	xorl	%ebp,%esi
+	movl	%ecx,%edi
+	shldl	$5,%ecx,%ecx
+	addl	%esi,%ebx
+	xorl	%ebp,%edi
+	shrdl	$7,%edx,%edx
+	addl	%ecx,%ebx
+	vpor	%xmm8,%xmm5,%xmm5
+	addl	28(%rsp),%eax
+	shrdl	$7,%ecx,%ecx
+	movl	%ebx,%esi
+	xorl	%edx,%edi
+	shldl	$5,%ebx,%ebx
+	addl	%edi,%eax
+	xorl	%ecx,%esi
+	xorl	%edx,%ecx
+	addl	%ebx,%eax
+	vpalignr	$8,%xmm4,%xmm5,%xmm8
+	vpxor	%xmm2,%xmm6,%xmm6
+	addl	32(%rsp),%ebp
+	andl	%ecx,%esi
+	xorl	%edx,%ecx
+	shrdl	$7,%ebx,%ebx
+	vpxor	%xmm7,%xmm6,%xmm6
+	movl	%eax,%edi
+	xorl	%ecx,%esi
+	vpaddd	%xmm5,%xmm11,%xmm9
+	shldl	$5,%eax,%eax
+	addl	%esi,%ebp
+	vpxor	%xmm8,%xmm6,%xmm6
+	xorl	%ebx,%edi
+	xorl	%ecx,%ebx
+	addl	%eax,%ebp
+	addl	36(%rsp),%edx
+	vpsrld	$30,%xmm6,%xmm8
+	vmovdqa	%xmm9,16(%rsp)
+	andl	%ebx,%edi
+	xorl	%ecx,%ebx
+	shrdl	$7,%eax,%eax
+	movl	%ebp,%esi
+	vpslld	$2,%xmm6,%xmm6
+	xorl	%ebx,%edi
+	shldl	$5,%ebp,%ebp
+	addl	%edi,%edx
+	xorl	%eax,%esi
+	xorl	%ebx,%eax
+	addl	%ebp,%edx
+	addl	40(%rsp),%ecx
+	andl	%eax,%esi
+	vpor	%xmm8,%xmm6,%xmm6
+	xorl	%ebx,%eax
+	shrdl	$7,%ebp,%ebp
+	movl	%edx,%edi
+	xorl	%eax,%esi
+	shldl	$5,%edx,%edx
+	addl	%esi,%ecx
+	xorl	%ebp,%edi
+	xorl	%eax,%ebp
+	addl	%edx,%ecx
+	addl	44(%rsp),%ebx
+	andl	%ebp,%edi
+	xorl	%eax,%ebp
+	shrdl	$7,%edx,%edx
+	movl	%ecx,%esi
+	xorl	%ebp,%edi
+	shldl	$5,%ecx,%ecx
+	addl	%edi,%ebx
+	xorl	%edx,%esi
+	xorl	%ebp,%edx
+	addl	%ecx,%ebx
+	vpalignr	$8,%xmm5,%xmm6,%xmm8
+	vpxor	%xmm3,%xmm7,%xmm7
+	addl	48(%rsp),%eax
+	andl	%edx,%esi
+	xorl	%ebp,%edx
+	shrdl	$7,%ecx,%ecx
+	vpxor	%xmm0,%xmm7,%xmm7
+	movl	%ebx,%edi
+	xorl	%edx,%esi
+	vpaddd	%xmm6,%xmm11,%xmm9
+	vmovdqa	32(%r14),%xmm11
+	shldl	$5,%ebx,%ebx
+	addl	%esi,%eax
+	vpxor	%xmm8,%xmm7,%xmm7
+	xorl	%ecx,%edi
+	xorl	%edx,%ecx
+	addl	%ebx,%eax
+	addl	52(%rsp),%ebp
+	vpsrld	$30,%xmm7,%xmm8
+	vmovdqa	%xmm9,32(%rsp)
+	andl	%ecx,%edi
+	xorl	%edx,%ecx
+	shrdl	$7,%ebx,%ebx
+	movl	%eax,%esi
+	vpslld	$2,%xmm7,%xmm7
+	xorl	%ecx,%edi
+	shldl	$5,%eax,%eax
+	addl	%edi,%ebp
+	xorl	%ebx,%esi
+	xorl	%ecx,%ebx
+	addl	%eax,%ebp
+	addl	56(%rsp),%edx
+	andl	%ebx,%esi
+	vpor	%xmm8,%xmm7,%xmm7
+	xorl	%ecx,%ebx
+	shrdl	$7,%eax,%eax
+	movl	%ebp,%edi
+	xorl	%ebx,%esi
+	shldl	$5,%ebp,%ebp
+	addl	%esi,%edx
+	xorl	%eax,%edi
+	xorl	%ebx,%eax
+	addl	%ebp,%edx
+	addl	60(%rsp),%ecx
+	andl	%eax,%edi
+	xorl	%ebx,%eax
+	shrdl	$7,%ebp,%ebp
+	movl	%edx,%esi
+	xorl	%eax,%edi
+	shldl	$5,%edx,%edx
+	addl	%edi,%ecx
+	xorl	%ebp,%esi
+	xorl	%eax,%ebp
+	addl	%edx,%ecx
+	vpalignr	$8,%xmm6,%xmm7,%xmm8
+	vpxor	%xmm4,%xmm0,%xmm0
+	addl	0(%rsp),%ebx
+	andl	%ebp,%esi
+	xorl	%eax,%ebp
+	shrdl	$7,%edx,%edx
+	vpxor	%xmm1,%xmm0,%xmm0
+	movl	%ecx,%edi
+	xorl	%ebp,%esi
+	vpaddd	%xmm7,%xmm11,%xmm9
+	shldl	$5,%ecx,%ecx
+	addl	%esi,%ebx
+	vpxor	%xmm8,%xmm0,%xmm0
+	xorl	%edx,%edi
+	xorl	%ebp,%edx
+	addl	%ecx,%ebx
+	addl	4(%rsp),%eax
+	vpsrld	$30,%xmm0,%xmm8
+	vmovdqa	%xmm9,48(%rsp)
+	andl	%edx,%edi
+	xorl	%ebp,%edx
+	shrdl	$7,%ecx,%ecx
+	movl	%ebx,%esi
+	vpslld	$2,%xmm0,%xmm0
+	xorl	%edx,%edi
+	shldl	$5,%ebx,%ebx
+	addl	%edi,%eax
+	xorl	%ecx,%esi
+	xorl	%edx,%ecx
+	addl	%ebx,%eax
+	addl	8(%rsp),%ebp
+	andl	%ecx,%esi
+	vpor	%xmm8,%xmm0,%xmm0
+	xorl	%edx,%ecx
+	shrdl	$7,%ebx,%ebx
+	movl	%eax,%edi
+	xorl	%ecx,%esi
+	shldl	$5,%eax,%eax
+	addl	%esi,%ebp
+	xorl	%ebx,%edi
+	xorl	%ecx,%ebx
+	addl	%eax,%ebp
+	addl	12(%rsp),%edx
+	andl	%ebx,%edi
+	xorl	%ecx,%ebx
+	shrdl	$7,%eax,%eax
+	movl	%ebp,%esi
+	xorl	%ebx,%edi
+	shldl	$5,%ebp,%ebp
+	addl	%edi,%edx
+	xorl	%eax,%esi
+	xorl	%ebx,%eax
+	addl	%ebp,%edx
+	vpalignr	$8,%xmm7,%xmm0,%xmm8
+	vpxor	%xmm5,%xmm1,%xmm1
+	addl	16(%rsp),%ecx
+	andl	%eax,%esi
+	xorl	%ebx,%eax
+	shrdl	$7,%ebp,%ebp
+	vpxor	%xmm2,%xmm1,%xmm1
+	movl	%edx,%edi
+	xorl	%eax,%esi
+	vpaddd	%xmm0,%xmm11,%xmm9
+	shldl	$5,%edx,%edx
+	addl	%esi,%ecx
+	vpxor	%xmm8,%xmm1,%xmm1
+	xorl	%ebp,%edi
+	xorl	%eax,%ebp
+	addl	%edx,%ecx
+	addl	20(%rsp),%ebx
+	vpsrld	$30,%xmm1,%xmm8
+	vmovdqa	%xmm9,0(%rsp)
+	andl	%ebp,%edi
+	xorl	%eax,%ebp
+	shrdl	$7,%edx,%edx
+	movl	%ecx,%esi
+	vpslld	$2,%xmm1,%xmm1
+	xorl	%ebp,%edi
+	shldl	$5,%ecx,%ecx
+	addl	%edi,%ebx
+	xorl	%edx,%esi
+	xorl	%ebp,%edx
+	addl	%ecx,%ebx
+	addl	24(%rsp),%eax
+	andl	%edx,%esi
+	vpor	%xmm8,%xmm1,%xmm1
+	xorl	%ebp,%edx
+	shrdl	$7,%ecx,%ecx
+	movl	%ebx,%edi
+	xorl	%edx,%esi
+	shldl	$5,%ebx,%ebx
+	addl	%esi,%eax
+	xorl	%ecx,%edi
+	xorl	%edx,%ecx
+	addl	%ebx,%eax
+	addl	28(%rsp),%ebp
+	andl	%ecx,%edi
+	xorl	%edx,%ecx
+	shrdl	$7,%ebx,%ebx
+	movl	%eax,%esi
+	xorl	%ecx,%edi
+	shldl	$5,%eax,%eax
+	addl	%edi,%ebp
+	xorl	%ebx,%esi
+	xorl	%ecx,%ebx
+	addl	%eax,%ebp
+	vpalignr	$8,%xmm0,%xmm1,%xmm8
+	vpxor	%xmm6,%xmm2,%xmm2
+	addl	32(%rsp),%edx
+	andl	%ebx,%esi
+	xorl	%ecx,%ebx
+	shrdl	$7,%eax,%eax
+	vpxor	%xmm3,%xmm2,%xmm2
+	movl	%ebp,%edi
+	xorl	%ebx,%esi
+	vpaddd	%xmm1,%xmm11,%xmm9
+	shldl	$5,%ebp,%ebp
+	addl	%esi,%edx
+	vpxor	%xmm8,%xmm2,%xmm2
+	xorl	%eax,%edi
+	xorl	%ebx,%eax
+	addl	%ebp,%edx
+	addl	36(%rsp),%ecx
+	vpsrld	$30,%xmm2,%xmm8
+	vmovdqa	%xmm9,16(%rsp)
+	andl	%eax,%edi
+	xorl	%ebx,%eax
+	shrdl	$7,%ebp,%ebp
+	movl	%edx,%esi
+	vpslld	$2,%xmm2,%xmm2
+	xorl	%eax,%edi
+	shldl	$5,%edx,%edx
+	addl	%edi,%ecx
+	xorl	%ebp,%esi
+	xorl	%eax,%ebp
+	addl	%edx,%ecx
+	addl	40(%rsp),%ebx
+	andl	%ebp,%esi
+	vpor	%xmm8,%xmm2,%xmm2
+	xorl	%eax,%ebp
+	shrdl	$7,%edx,%edx
+	movl	%ecx,%edi
+	xorl	%ebp,%esi
+	shldl	$5,%ecx,%ecx
+	addl	%esi,%ebx
+	xorl	%edx,%edi
+	xorl	%ebp,%edx
+	addl	%ecx,%ebx
+	addl	44(%rsp),%eax
+	andl	%edx,%edi
+	xorl	%ebp,%edx
+	shrdl	$7,%ecx,%ecx
+	movl	%ebx,%esi
+	xorl	%edx,%edi
+	shldl	$5,%ebx,%ebx
+	addl	%edi,%eax
+	xorl	%edx,%esi
+	addl	%ebx,%eax
+	vpalignr	$8,%xmm1,%xmm2,%xmm8
+	vpxor	%xmm7,%xmm3,%xmm3
+	addl	48(%rsp),%ebp
+	xorl	%ecx,%esi
+	movl	%eax,%edi
+	shldl	$5,%eax,%eax
+	vpxor	%xmm4,%xmm3,%xmm3
+	addl	%esi,%ebp
+	xorl	%ecx,%edi
+	vpaddd	%xmm2,%xmm11,%xmm9
+	shrdl	$7,%ebx,%ebx
+	addl	%eax,%ebp
+	vpxor	%xmm8,%xmm3,%xmm3
+	addl	52(%rsp),%edx
+	xorl	%ebx,%edi
+	movl	%ebp,%esi
+	shldl	$5,%ebp,%ebp
+	vpsrld	$30,%xmm3,%xmm8
+	vmovdqa	%xmm9,32(%rsp)
+	addl	%edi,%edx
+	xorl	%ebx,%esi
+	shrdl	$7,%eax,%eax
+	addl	%ebp,%edx
+	vpslld	$2,%xmm3,%xmm3
+	addl	56(%rsp),%ecx
+	xorl	%eax,%esi
+	movl	%edx,%edi
+	shldl	$5,%edx,%edx
+	addl	%esi,%ecx
+	xorl	%eax,%edi
+	shrdl	$7,%ebp,%ebp
+	addl	%edx,%ecx
+	vpor	%xmm8,%xmm3,%xmm3
+	addl	60(%rsp),%ebx
+	xorl	%ebp,%edi
+	movl	%ecx,%esi
+	shldl	$5,%ecx,%ecx
+	addl	%edi,%ebx
+	xorl	%ebp,%esi
+	shrdl	$7,%edx,%edx
+	addl	%ecx,%ebx
+	addl	0(%rsp),%eax
+	vpaddd	%xmm3,%xmm11,%xmm9
+	xorl	%edx,%esi
+	movl	%ebx,%edi
+	shldl	$5,%ebx,%ebx
+	addl	%esi,%eax
+	vmovdqa	%xmm9,48(%rsp)
+	xorl	%edx,%edi
+	shrdl	$7,%ecx,%ecx
+	addl	%ebx,%eax
+	addl	4(%rsp),%ebp
+	xorl	%ecx,%edi
+	movl	%eax,%esi
+	shldl	$5,%eax,%eax
+	addl	%edi,%ebp
+	xorl	%ecx,%esi
+	shrdl	$7,%ebx,%ebx
+	addl	%eax,%ebp
+	addl	8(%rsp),%edx
+	xorl	%ebx,%esi
+	movl	%ebp,%edi
+	shldl	$5,%ebp,%ebp
+	addl	%esi,%edx
+	xorl	%ebx,%edi
+	shrdl	$7,%eax,%eax
+	addl	%ebp,%edx
+	addl	12(%rsp),%ecx
+	xorl	%eax,%edi
+	movl	%edx,%esi
+	shldl	$5,%edx,%edx
+	addl	%edi,%ecx
+	xorl	%eax,%esi
+	shrdl	$7,%ebp,%ebp
+	addl	%edx,%ecx
+	cmpq	%r10,%r9
+	je	.Ldone_avx
+	vmovdqa	64(%r14),%xmm6
+	vmovdqa	-64(%r14),%xmm11
+	vmovdqu	0(%r9),%xmm0
+	vmovdqu	16(%r9),%xmm1
+	vmovdqu	32(%r9),%xmm2
+	vmovdqu	48(%r9),%xmm3
+	vpshufb	%xmm6,%xmm0,%xmm0
+	addq	$64,%r9
+	addl	16(%rsp),%ebx
+	xorl	%ebp,%esi
+	vpshufb	%xmm6,%xmm1,%xmm1
+	movl	%ecx,%edi
+	shldl	$5,%ecx,%ecx
+	vpaddd	%xmm11,%xmm0,%xmm4
+	addl	%esi,%ebx
+	xorl	%ebp,%edi
+	shrdl	$7,%edx,%edx
+	addl	%ecx,%ebx
+	vmovdqa	%xmm4,0(%rsp)
+	addl	20(%rsp),%eax
+	xorl	%edx,%edi
+	movl	%ebx,%esi
+	shldl	$5,%ebx,%ebx
+	addl	%edi,%eax
+	xorl	%edx,%esi
+	shrdl	$7,%ecx,%ecx
+	addl	%ebx,%eax
+	addl	24(%rsp),%ebp
+	xorl	%ecx,%esi
+	movl	%eax,%edi
+	shldl	$5,%eax,%eax
+	addl	%esi,%ebp
+	xorl	%ecx,%edi
+	shrdl	$7,%ebx,%ebx
+	addl	%eax,%ebp
+	addl	28(%rsp),%edx
+	xorl	%ebx,%edi
+	movl	%ebp,%esi
+	shldl	$5,%ebp,%ebp
+	addl	%edi,%edx
+	xorl	%ebx,%esi
+	shrdl	$7,%eax,%eax
+	addl	%ebp,%edx
+	addl	32(%rsp),%ecx
+	xorl	%eax,%esi
+	vpshufb	%xmm6,%xmm2,%xmm2
+	movl	%edx,%edi
+	shldl	$5,%edx,%edx
+	vpaddd	%xmm11,%xmm1,%xmm5
+	addl	%esi,%ecx
+	xorl	%eax,%edi
+	shrdl	$7,%ebp,%ebp
+	addl	%edx,%ecx
+	vmovdqa	%xmm5,16(%rsp)
+	addl	36(%rsp),%ebx
+	xorl	%ebp,%edi
+	movl	%ecx,%esi
+	shldl	$5,%ecx,%ecx
+	addl	%edi,%ebx
+	xorl	%ebp,%esi
+	shrdl	$7,%edx,%edx
+	addl	%ecx,%ebx
+	addl	40(%rsp),%eax
+	xorl	%edx,%esi
+	movl	%ebx,%edi
+	shldl	$5,%ebx,%ebx
+	addl	%esi,%eax
+	xorl	%edx,%edi
+	shrdl	$7,%ecx,%ecx
+	addl	%ebx,%eax
+	addl	44(%rsp),%ebp
+	xorl	%ecx,%edi
+	movl	%eax,%esi
+	shldl	$5,%eax,%eax
+	addl	%edi,%ebp
+	xorl	%ecx,%esi
+	shrdl	$7,%ebx,%ebx
+	addl	%eax,%ebp
+	addl	48(%rsp),%edx
+	xorl	%ebx,%esi
+	vpshufb	%xmm6,%xmm3,%xmm3
+	movl	%ebp,%edi
+	shldl	$5,%ebp,%ebp
+	vpaddd	%xmm11,%xmm2,%xmm6
+	addl	%esi,%edx
+	xorl	%ebx,%edi
+	shrdl	$7,%eax,%eax
+	addl	%ebp,%edx
+	vmovdqa	%xmm6,32(%rsp)
+	addl	52(%rsp),%ecx
+	xorl	%eax,%edi
+	movl	%edx,%esi
+	shldl	$5,%edx,%edx
+	addl	%edi,%ecx
+	xorl	%eax,%esi
+	shrdl	$7,%ebp,%ebp
+	addl	%edx,%ecx
+	addl	56(%rsp),%ebx
+	xorl	%ebp,%esi
+	movl	%ecx,%edi
+	shldl	$5,%ecx,%ecx
+	addl	%esi,%ebx
+	xorl	%ebp,%edi
+	shrdl	$7,%edx,%edx
+	addl	%ecx,%ebx
+	addl	60(%rsp),%eax
+	xorl	%edx,%edi
+	movl	%ebx,%esi
+	shldl	$5,%ebx,%ebx
+	addl	%edi,%eax
+	shrdl	$7,%ecx,%ecx
+	addl	%ebx,%eax
+	addl	0(%r8),%eax
+	addl	4(%r8),%esi
+	addl	8(%r8),%ecx
+	addl	12(%r8),%edx
+	movl	%eax,0(%r8)
+	addl	16(%r8),%ebp
+	movl	%esi,4(%r8)
+	movl	%esi,%ebx
+	movl	%ecx,8(%r8)
+	movl	%ecx,%edi
+	movl	%edx,12(%r8)
+	xorl	%edx,%edi
+	movl	%ebp,16(%r8)
+	andl	%edi,%esi
+	jmp	.Loop_avx
+
+.align	16
+.Ldone_avx:
+	addl	16(%rsp),%ebx
+	xorl	%ebp,%esi
+	movl	%ecx,%edi
+	shldl	$5,%ecx,%ecx
+	addl	%esi,%ebx
+	xorl	%ebp,%edi
+	shrdl	$7,%edx,%edx
+	addl	%ecx,%ebx
+	addl	20(%rsp),%eax
+	xorl	%edx,%edi
+	movl	%ebx,%esi
+	shldl	$5,%ebx,%ebx
+	addl	%edi,%eax
+	xorl	%edx,%esi
+	shrdl	$7,%ecx,%ecx
+	addl	%ebx,%eax
+	addl	24(%rsp),%ebp
+	xorl	%ecx,%esi
+	movl	%eax,%edi
+	shldl	$5,%eax,%eax
+	addl	%esi,%ebp
+	xorl	%ecx,%edi
+	shrdl	$7,%ebx,%ebx
+	addl	%eax,%ebp
+	addl	28(%rsp),%edx
+	xorl	%ebx,%edi
+	movl	%ebp,%esi
+	shldl	$5,%ebp,%ebp
+	addl	%edi,%edx
+	xorl	%ebx,%esi
+	shrdl	$7,%eax,%eax
+	addl	%ebp,%edx
+	addl	32(%rsp),%ecx
+	xorl	%eax,%esi
+	movl	%edx,%edi
+	shldl	$5,%edx,%edx
+	addl	%esi,%ecx
+	xorl	%eax,%edi
+	shrdl	$7,%ebp,%ebp
+	addl	%edx,%ecx
+	addl	36(%rsp),%ebx
+	xorl	%ebp,%edi
+	movl	%ecx,%esi
+	shldl	$5,%ecx,%ecx
+	addl	%edi,%ebx
+	xorl	%ebp,%esi
+	shrdl	$7,%edx,%edx
+	addl	%ecx,%ebx
+	addl	40(%rsp),%eax
+	xorl	%edx,%esi
+	movl	%ebx,%edi
+	shldl	$5,%ebx,%ebx
+	addl	%esi,%eax
+	xorl	%edx,%edi
+	shrdl	$7,%ecx,%ecx
+	addl	%ebx,%eax
+	addl	44(%rsp),%ebp
+	xorl	%ecx,%edi
+	movl	%eax,%esi
+	shldl	$5,%eax,%eax
+	addl	%edi,%ebp
+	xorl	%ecx,%esi
+	shrdl	$7,%ebx,%ebx
+	addl	%eax,%ebp
+	addl	48(%rsp),%edx
+	xorl	%ebx,%esi
+	movl	%ebp,%edi
+	shldl	$5,%ebp,%ebp
+	addl	%esi,%edx
+	xorl	%ebx,%edi
+	shrdl	$7,%eax,%eax
+	addl	%ebp,%edx
+	addl	52(%rsp),%ecx
+	xorl	%eax,%edi
+	movl	%edx,%esi
+	shldl	$5,%edx,%edx
+	addl	%edi,%ecx
+	xorl	%eax,%esi
+	shrdl	$7,%ebp,%ebp
+	addl	%edx,%ecx
+	addl	56(%rsp),%ebx
+	xorl	%ebp,%esi
+	movl	%ecx,%edi
+	shldl	$5,%ecx,%ecx
+	addl	%esi,%ebx
+	xorl	%ebp,%edi
+	shrdl	$7,%edx,%edx
+	addl	%ecx,%ebx
+	addl	60(%rsp),%eax
+	xorl	%edx,%edi
+	movl	%ebx,%esi
+	shldl	$5,%ebx,%ebx
+	addl	%edi,%eax
+	shrdl	$7,%ecx,%ecx
+	addl	%ebx,%eax
+	vzeroupper
+
+	addl	0(%r8),%eax
+	addl	4(%r8),%esi
+	addl	8(%r8),%ecx
+	movl	%eax,0(%r8)
+	addl	12(%r8),%edx
+	movl	%esi,4(%r8)
+	addl	16(%r8),%ebp
+	movl	%ecx,8(%r8)
+	movl	%edx,12(%r8)
+	movl	%ebp,16(%r8)
+	movq	-40(%r11),%r14
+.cfi_restore	%r14
+	movq	-32(%r11),%r13
+.cfi_restore	%r13
+	movq	-24(%r11),%r12
+.cfi_restore	%r12
+	movq	-16(%r11),%rbp
+.cfi_restore	%rbp
+	movq	-8(%r11),%rbx
+.cfi_restore	%rbx
+	leaq	(%r11),%rsp
+.cfi_def_cfa_register	%rsp
+.Lepilogue_avx:
+	.byte	0xf3,0xc3
+.cfi_endproc	
+.size	sha1_block_data_order_avx,.-sha1_block_data_order_avx
+.type	sha1_block_data_order_avx2,@function
+.align	16
+sha1_block_data_order_avx2:
+_avx2_shortcut:
+.cfi_startproc	
+	movq	%rsp,%r11
+.cfi_def_cfa_register	%r11
+	pushq	%rbx
+.cfi_offset	%rbx,-16
+	pushq	%rbp
+.cfi_offset	%rbp,-24
+	pushq	%r12
+.cfi_offset	%r12,-32
+	pushq	%r13
+.cfi_offset	%r13,-40
+	pushq	%r14
+.cfi_offset	%r14,-48
+	vzeroupper
+	movq	%rdi,%r8
+	movq	%rsi,%r9
+	movq	%rdx,%r10
+
+	leaq	-640(%rsp),%rsp
+	shlq	$6,%r10
+	leaq	64(%r9),%r13
+	andq	$-128,%rsp
+	addq	%r9,%r10
+	leaq	K_XX_XX+64(%rip),%r14
+
+	movl	0(%r8),%eax
+	cmpq	%r10,%r13
+	cmovaeq	%r9,%r13
+	movl	4(%r8),%ebp
+	movl	8(%r8),%ecx
+	movl	12(%r8),%edx
+	movl	16(%r8),%esi
+	vmovdqu	64(%r14),%ymm6
+
+	vmovdqu	(%r9),%xmm0
+	vmovdqu	16(%r9),%xmm1
+	vmovdqu	32(%r9),%xmm2
+	vmovdqu	48(%r9),%xmm3
+	leaq	64(%r9),%r9
+	vinserti128	$1,(%r13),%ymm0,%ymm0
+	vinserti128	$1,16(%r13),%ymm1,%ymm1
+	vpshufb	%ymm6,%ymm0,%ymm0
+	vinserti128	$1,32(%r13),%ymm2,%ymm2
+	vpshufb	%ymm6,%ymm1,%ymm1
+	vinserti128	$1,48(%r13),%ymm3,%ymm3
+	vpshufb	%ymm6,%ymm2,%ymm2
+	vmovdqu	-64(%r14),%ymm11
+	vpshufb	%ymm6,%ymm3,%ymm3
+
+	vpaddd	%ymm11,%ymm0,%ymm4
+	vpaddd	%ymm11,%ymm1,%ymm5
+	vmovdqu	%ymm4,0(%rsp)
+	vpaddd	%ymm11,%ymm2,%ymm6
+	vmovdqu	%ymm5,32(%rsp)
+	vpaddd	%ymm11,%ymm3,%ymm7
+	vmovdqu	%ymm6,64(%rsp)
+	vmovdqu	%ymm7,96(%rsp)
+	vpalignr	$8,%ymm0,%ymm1,%ymm4
+	vpsrldq	$4,%ymm3,%ymm8
+	vpxor	%ymm0,%ymm4,%ymm4
+	vpxor	%ymm2,%ymm8,%ymm8
+	vpxor	%ymm8,%ymm4,%ymm4
+	vpsrld	$31,%ymm4,%ymm8
+	vpslldq	$12,%ymm4,%ymm10
+	vpaddd	%ymm4,%ymm4,%ymm4
+	vpsrld	$30,%ymm10,%ymm9
+	vpor	%ymm8,%ymm4,%ymm4
+	vpslld	$2,%ymm10,%ymm10
+	vpxor	%ymm9,%ymm4,%ymm4
+	vpxor	%ymm10,%ymm4,%ymm4
+	vpaddd	%ymm11,%ymm4,%ymm9
+	vmovdqu	%ymm9,128(%rsp)
+	vpalignr	$8,%ymm1,%ymm2,%ymm5
+	vpsrldq	$4,%ymm4,%ymm8
+	vpxor	%ymm1,%ymm5,%ymm5
+	vpxor	%ymm3,%ymm8,%ymm8
+	vpxor	%ymm8,%ymm5,%ymm5
+	vpsrld	$31,%ymm5,%ymm8
+	vmovdqu	-32(%r14),%ymm11
+	vpslldq	$12,%ymm5,%ymm10
+	vpaddd	%ymm5,%ymm5,%ymm5
+	vpsrld	$30,%ymm10,%ymm9
+	vpor	%ymm8,%ymm5,%ymm5
+	vpslld	$2,%ymm10,%ymm10
+	vpxor	%ymm9,%ymm5,%ymm5
+	vpxor	%ymm10,%ymm5,%ymm5
+	vpaddd	%ymm11,%ymm5,%ymm9
+	vmovdqu	%ymm9,160(%rsp)
+	vpalignr	$8,%ymm2,%ymm3,%ymm6
+	vpsrldq	$4,%ymm5,%ymm8
+	vpxor	%ymm2,%ymm6,%ymm6
+	vpxor	%ymm4,%ymm8,%ymm8
+	vpxor	%ymm8,%ymm6,%ymm6
+	vpsrld	$31,%ymm6,%ymm8
+	vpslldq	$12,%ymm6,%ymm10
+	vpaddd	%ymm6,%ymm6,%ymm6
+	vpsrld	$30,%ymm10,%ymm9
+	vpor	%ymm8,%ymm6,%ymm6
+	vpslld	$2,%ymm10,%ymm10
+	vpxor	%ymm9,%ymm6,%ymm6
+	vpxor	%ymm10,%ymm6,%ymm6
+	vpaddd	%ymm11,%ymm6,%ymm9
+	vmovdqu	%ymm9,192(%rsp)
+	vpalignr	$8,%ymm3,%ymm4,%ymm7
+	vpsrldq	$4,%ymm6,%ymm8
+	vpxor	%ymm3,%ymm7,%ymm7
+	vpxor	%ymm5,%ymm8,%ymm8
+	vpxor	%ymm8,%ymm7,%ymm7
+	vpsrld	$31,%ymm7,%ymm8
+	vpslldq	$12,%ymm7,%ymm10
+	vpaddd	%ymm7,%ymm7,%ymm7
+	vpsrld	$30,%ymm10,%ymm9
+	vpor	%ymm8,%ymm7,%ymm7
+	vpslld	$2,%ymm10,%ymm10
+	vpxor	%ymm9,%ymm7,%ymm7
+	vpxor	%ymm10,%ymm7,%ymm7
+	vpaddd	%ymm11,%ymm7,%ymm9
+	vmovdqu	%ymm9,224(%rsp)
+	leaq	128(%rsp),%r13
+	jmp	.Loop_avx2
+.align	32
+.Loop_avx2:
+	rorxl	$2,%ebp,%ebx
+	andnl	%edx,%ebp,%edi
+	andl	%ecx,%ebp
+	xorl	%edi,%ebp
+	jmp	.Lalign32_1
+.align	32
+.Lalign32_1:
+	vpalignr	$8,%ymm6,%ymm7,%ymm8
+	vpxor	%ymm4,%ymm0,%ymm0
+	addl	-128(%r13),%esi
+	andnl	%ecx,%eax,%edi
+	vpxor	%ymm1,%ymm0,%ymm0
+	addl	%ebp,%esi
+	rorxl	$27,%eax,%r12d
+	rorxl	$2,%eax,%ebp
+	vpxor	%ymm8,%ymm0,%ymm0
+	andl	%ebx,%eax
+	addl	%r12d,%esi
+	xorl	%edi,%eax
+	vpsrld	$30,%ymm0,%ymm8
+	vpslld	$2,%ymm0,%ymm0
+	addl	-124(%r13),%edx
+	andnl	%ebx,%esi,%edi
+	addl	%eax,%edx
+	rorxl	$27,%esi,%r12d
+	rorxl	$2,%esi,%eax
+	andl	%ebp,%esi
+	vpor	%ymm8,%ymm0,%ymm0
+	addl	%r12d,%edx
+	xorl	%edi,%esi
+	addl	-120(%r13),%ecx
+	andnl	%ebp,%edx,%edi
+	vpaddd	%ymm11,%ymm0,%ymm9
+	addl	%esi,%ecx
+	rorxl	$27,%edx,%r12d
+	rorxl	$2,%edx,%esi
+	andl	%eax,%edx
+	vmovdqu	%ymm9,256(%rsp)
+	addl	%r12d,%ecx
+	xorl	%edi,%edx
+	addl	-116(%r13),%ebx
+	andnl	%eax,%ecx,%edi
+	addl	%edx,%ebx
+	rorxl	$27,%ecx,%r12d
+	rorxl	$2,%ecx,%edx
+	andl	%esi,%ecx
+	addl	%r12d,%ebx
+	xorl	%edi,%ecx
+	addl	-96(%r13),%ebp
+	andnl	%esi,%ebx,%edi
+	addl	%ecx,%ebp
+	rorxl	$27,%ebx,%r12d
+	rorxl	$2,%ebx,%ecx
+	andl	%edx,%ebx
+	addl	%r12d,%ebp
+	xorl	%edi,%ebx
+	vpalignr	$8,%ymm7,%ymm0,%ymm8
+	vpxor	%ymm5,%ymm1,%ymm1
+	addl	-92(%r13),%eax
+	andnl	%edx,%ebp,%edi
+	vpxor	%ymm2,%ymm1,%ymm1
+	addl	%ebx,%eax
+	rorxl	$27,%ebp,%r12d
+	rorxl	$2,%ebp,%ebx
+	vpxor	%ymm8,%ymm1,%ymm1
+	andl	%ecx,%ebp
+	addl	%r12d,%eax
+	xorl	%edi,%ebp
+	vpsrld	$30,%ymm1,%ymm8
+	vpslld	$2,%ymm1,%ymm1
+	addl	-88(%r13),%esi
+	andnl	%ecx,%eax,%edi
+	addl	%ebp,%esi
+	rorxl	$27,%eax,%r12d
+	rorxl	$2,%eax,%ebp
+	andl	%ebx,%eax
+	vpor	%ymm8,%ymm1,%ymm1
+	addl	%r12d,%esi
+	xorl	%edi,%eax
+	addl	-84(%r13),%edx
+	andnl	%ebx,%esi,%edi
+	vpaddd	%ymm11,%ymm1,%ymm9
+	addl	%eax,%edx
+	rorxl	$27,%esi,%r12d
+	rorxl	$2,%esi,%eax
+	andl	%ebp,%esi
+	vmovdqu	%ymm9,288(%rsp)
+	addl	%r12d,%edx
+	xorl	%edi,%esi
+	addl	-64(%r13),%ecx
+	andnl	%ebp,%edx,%edi
+	addl	%esi,%ecx
+	rorxl	$27,%edx,%r12d
+	rorxl	$2,%edx,%esi
+	andl	%eax,%edx
+	addl	%r12d,%ecx
+	xorl	%edi,%edx
+	addl	-60(%r13),%ebx
+	andnl	%eax,%ecx,%edi
+	addl	%edx,%ebx
+	rorxl	$27,%ecx,%r12d
+	rorxl	$2,%ecx,%edx
+	andl	%esi,%ecx
+	addl	%r12d,%ebx
+	xorl	%edi,%ecx
+	vpalignr	$8,%ymm0,%ymm1,%ymm8
+	vpxor	%ymm6,%ymm2,%ymm2
+	addl	-56(%r13),%ebp
+	andnl	%esi,%ebx,%edi
+	vpxor	%ymm3,%ymm2,%ymm2
+	vmovdqu	0(%r14),%ymm11
+	addl	%ecx,%ebp
+	rorxl	$27,%ebx,%r12d
+	rorxl	$2,%ebx,%ecx
+	vpxor	%ymm8,%ymm2,%ymm2
+	andl	%edx,%ebx
+	addl	%r12d,%ebp
+	xorl	%edi,%ebx
+	vpsrld	$30,%ymm2,%ymm8
+	vpslld	$2,%ymm2,%ymm2
+	addl	-52(%r13),%eax
+	andnl	%edx,%ebp,%edi
+	addl	%ebx,%eax
+	rorxl	$27,%ebp,%r12d
+	rorxl	$2,%ebp,%ebx
+	andl	%ecx,%ebp
+	vpor	%ymm8,%ymm2,%ymm2
+	addl	%r12d,%eax
+	xorl	%edi,%ebp
+	addl	-32(%r13),%esi
+	andnl	%ecx,%eax,%edi
+	vpaddd	%ymm11,%ymm2,%ymm9
+	addl	%ebp,%esi
+	rorxl	$27,%eax,%r12d
+	rorxl	$2,%eax,%ebp
+	andl	%ebx,%eax
+	vmovdqu	%ymm9,320(%rsp)
+	addl	%r12d,%esi
+	xorl	%edi,%eax
+	addl	-28(%r13),%edx
+	andnl	%ebx,%esi,%edi
+	addl	%eax,%edx
+	rorxl	$27,%esi,%r12d
+	rorxl	$2,%esi,%eax
+	andl	%ebp,%esi
+	addl	%r12d,%edx
+	xorl	%edi,%esi
+	addl	-24(%r13),%ecx
+	andnl	%ebp,%edx,%edi
+	addl	%esi,%ecx
+	rorxl	$27,%edx,%r12d
+	rorxl	$2,%edx,%esi
+	andl	%eax,%edx
+	addl	%r12d,%ecx
+	xorl	%edi,%edx
+	vpalignr	$8,%ymm1,%ymm2,%ymm8
+	vpxor	%ymm7,%ymm3,%ymm3
+	addl	-20(%r13),%ebx
+	andnl	%eax,%ecx,%edi
+	vpxor	%ymm4,%ymm3,%ymm3
+	addl	%edx,%ebx
+	rorxl	$27,%ecx,%r12d
+	rorxl	$2,%ecx,%edx
+	vpxor	%ymm8,%ymm3,%ymm3
+	andl	%esi,%ecx
+	addl	%r12d,%ebx
+	xorl	%edi,%ecx
+	vpsrld	$30,%ymm3,%ymm8
+	vpslld	$2,%ymm3,%ymm3
+	addl	0(%r13),%ebp
+	andnl	%esi,%ebx,%edi
+	addl	%ecx,%ebp
+	rorxl	$27,%ebx,%r12d
+	rorxl	$2,%ebx,%ecx
+	andl	%edx,%ebx
+	vpor	%ymm8,%ymm3,%ymm3
+	addl	%r12d,%ebp
+	xorl	%edi,%ebx
+	addl	4(%r13),%eax
+	andnl	%edx,%ebp,%edi
+	vpaddd	%ymm11,%ymm3,%ymm9
+	addl	%ebx,%eax
+	rorxl	$27,%ebp,%r12d
+	rorxl	$2,%ebp,%ebx
+	andl	%ecx,%ebp
+	vmovdqu	%ymm9,352(%rsp)
+	addl	%r12d,%eax
+	xorl	%edi,%ebp
+	addl	8(%r13),%esi
+	andnl	%ecx,%eax,%edi
+	addl	%ebp,%esi
+	rorxl	$27,%eax,%r12d
+	rorxl	$2,%eax,%ebp
+	andl	%ebx,%eax
+	addl	%r12d,%esi
+	xorl	%edi,%eax
+	addl	12(%r13),%edx
+	leal	(%rdx,%rax,1),%edx
+	rorxl	$27,%esi,%r12d
+	rorxl	$2,%esi,%eax
+	xorl	%ebp,%esi
+	addl	%r12d,%edx
+	xorl	%ebx,%esi
+	vpalignr	$8,%ymm2,%ymm3,%ymm8
+	vpxor	%ymm0,%ymm4,%ymm4
+	addl	32(%r13),%ecx
+	leal	(%rcx,%rsi,1),%ecx
+	vpxor	%ymm5,%ymm4,%ymm4
+	rorxl	$27,%edx,%r12d
+	rorxl	$2,%edx,%esi
+	xorl	%eax,%edx
+	vpxor	%ymm8,%ymm4,%ymm4
+	addl	%r12d,%ecx
+	xorl	%ebp,%edx
+	addl	36(%r13),%ebx
+	vpsrld	$30,%ymm4,%ymm8
+	vpslld	$2,%ymm4,%ymm4
+	leal	(%rbx,%rdx,1),%ebx
+	rorxl	$27,%ecx,%r12d
+	rorxl	$2,%ecx,%edx
+	xorl	%esi,%ecx
+	addl	%r12d,%ebx
+	xorl	%eax,%ecx
+	vpor	%ymm8,%ymm4,%ymm4
+	addl	40(%r13),%ebp
+	leal	(%rcx,%rbp,1),%ebp
+	rorxl	$27,%ebx,%r12d
+	rorxl	$2,%ebx,%ecx
+	vpaddd	%ymm11,%ymm4,%ymm9
+	xorl	%edx,%ebx
+	addl	%r12d,%ebp
+	xorl	%esi,%ebx
+	addl	44(%r13),%eax
+	vmovdqu	%ymm9,384(%rsp)
+	leal	(%rax,%rbx,1),%eax
+	rorxl	$27,%ebp,%r12d
+	rorxl	$2,%ebp,%ebx
+	xorl	%ecx,%ebp
+	addl	%r12d,%eax
+	xorl	%edx,%ebp
+	addl	64(%r13),%esi
+	leal	(%rsi,%rbp,1),%esi
+	rorxl	$27,%eax,%r12d
+	rorxl	$2,%eax,%ebp
+	xorl	%ebx,%eax
+	addl	%r12d,%esi
+	xorl	%ecx,%eax
+	vpalignr	$8,%ymm3,%ymm4,%ymm8
+	vpxor	%ymm1,%ymm5,%ymm5
+	addl	68(%r13),%edx
+	leal	(%rdx,%rax,1),%edx
+	vpxor	%ymm6,%ymm5,%ymm5
+	rorxl	$27,%esi,%r12d
+	rorxl	$2,%esi,%eax
+	xorl	%ebp,%esi
+	vpxor	%ymm8,%ymm5,%ymm5
+	addl	%r12d,%edx
+	xorl	%ebx,%esi
+	addl	72(%r13),%ecx
+	vpsrld	$30,%ymm5,%ymm8
+	vpslld	$2,%ymm5,%ymm5
+	leal	(%rcx,%rsi,1),%ecx
+	rorxl	$27,%edx,%r12d
+	rorxl	$2,%edx,%esi
+	xorl	%eax,%edx
+	addl	%r12d,%ecx
+	xorl	%ebp,%edx
+	vpor	%ymm8,%ymm5,%ymm5
+	addl	76(%r13),%ebx
+	leal	(%rbx,%rdx,1),%ebx
+	rorxl	$27,%ecx,%r12d
+	rorxl	$2,%ecx,%edx
+	vpaddd	%ymm11,%ymm5,%ymm9
+	xorl	%esi,%ecx
+	addl	%r12d,%ebx
+	xorl	%eax,%ecx
+	addl	96(%r13),%ebp
+	vmovdqu	%ymm9,416(%rsp)
+	leal	(%rcx,%rbp,1),%ebp
+	rorxl	$27,%ebx,%r12d
+	rorxl	$2,%ebx,%ecx
+	xorl	%edx,%ebx
+	addl	%r12d,%ebp
+	xorl	%esi,%ebx
+	addl	100(%r13),%eax
+	leal	(%rax,%rbx,1),%eax
+	rorxl	$27,%ebp,%r12d
+	rorxl	$2,%ebp,%ebx
+	xorl	%ecx,%ebp
+	addl	%r12d,%eax
+	xorl	%edx,%ebp
+	vpalignr	$8,%ymm4,%ymm5,%ymm8
+	vpxor	%ymm2,%ymm6,%ymm6
+	addl	104(%r13),%esi
+	leal	(%rsi,%rbp,1),%esi
+	vpxor	%ymm7,%ymm6,%ymm6
+	rorxl	$27,%eax,%r12d
+	rorxl	$2,%eax,%ebp
+	xorl	%ebx,%eax
+	vpxor	%ymm8,%ymm6,%ymm6
+	addl	%r12d,%esi
+	xorl	%ecx,%eax
+	addl	108(%r13),%edx
+	leaq	256(%r13),%r13
+	vpsrld	$30,%ymm6,%ymm8
+	vpslld	$2,%ymm6,%ymm6
+	leal	(%rdx,%rax,1),%edx
+	rorxl	$27,%esi,%r12d
+	rorxl	$2,%esi,%eax
+	xorl	%ebp,%esi
+	addl	%r12d,%edx
+	xorl	%ebx,%esi
+	vpor	%ymm8,%ymm6,%ymm6
+	addl	-128(%r13),%ecx
+	leal	(%rcx,%rsi,1),%ecx
+	rorxl	$27,%edx,%r12d
+	rorxl	$2,%edx,%esi
+	vpaddd	%ymm11,%ymm6,%ymm9
+	xorl	%eax,%edx
+	addl	%r12d,%ecx
+	xorl	%ebp,%edx
+	addl	-124(%r13),%ebx
+	vmovdqu	%ymm9,448(%rsp)
+	leal	(%rbx,%rdx,1),%ebx
+	rorxl	$27,%ecx,%r12d
+	rorxl	$2,%ecx,%edx
+	xorl	%esi,%ecx
+	addl	%r12d,%ebx
+	xorl	%eax,%ecx
+	addl	-120(%r13),%ebp
+	leal	(%rcx,%rbp,1),%ebp
+	rorxl	$27,%ebx,%r12d
+	rorxl	$2,%ebx,%ecx
+	xorl	%edx,%ebx
+	addl	%r12d,%ebp
+	xorl	%esi,%ebx
+	vpalignr	$8,%ymm5,%ymm6,%ymm8
+	vpxor	%ymm3,%ymm7,%ymm7
+	addl	-116(%r13),%eax
+	leal	(%rax,%rbx,1),%eax
+	vpxor	%ymm0,%ymm7,%ymm7
+	vmovdqu	32(%r14),%ymm11
+	rorxl	$27,%ebp,%r12d
+	rorxl	$2,%ebp,%ebx
+	xorl	%ecx,%ebp
+	vpxor	%ymm8,%ymm7,%ymm7
+	addl	%r12d,%eax
+	xorl	%edx,%ebp
+	addl	-96(%r13),%esi
+	vpsrld	$30,%ymm7,%ymm8
+	vpslld	$2,%ymm7,%ymm7
+	leal	(%rsi,%rbp,1),%esi
+	rorxl	$27,%eax,%r12d
+	rorxl	$2,%eax,%ebp
+	xorl	%ebx,%eax
+	addl	%r12d,%esi
+	xorl	%ecx,%eax
+	vpor	%ymm8,%ymm7,%ymm7
+	addl	-92(%r13),%edx
+	leal	(%rdx,%rax,1),%edx
+	rorxl	$27,%esi,%r12d
+	rorxl	$2,%esi,%eax
+	vpaddd	%ymm11,%ymm7,%ymm9
+	xorl	%ebp,%esi
+	addl	%r12d,%edx
+	xorl	%ebx,%esi
+	addl	-88(%r13),%ecx
+	vmovdqu	%ymm9,480(%rsp)
+	leal	(%rcx,%rsi,1),%ecx
+	rorxl	$27,%edx,%r12d
+	rorxl	$2,%edx,%esi
+	xorl	%eax,%edx
+	addl	%r12d,%ecx
+	xorl	%ebp,%edx
+	addl	-84(%r13),%ebx
+	movl	%esi,%edi
+	xorl	%eax,%edi
+	leal	(%rbx,%rdx,1),%ebx
+	rorxl	$27,%ecx,%r12d
+	rorxl	$2,%ecx,%edx
+	xorl	%esi,%ecx
+	addl	%r12d,%ebx
+	andl	%edi,%ecx
+	jmp	.Lalign32_2
+.align	32
+.Lalign32_2:
+	vpalignr	$8,%ymm6,%ymm7,%ymm8
+	vpxor	%ymm4,%ymm0,%ymm0
+	addl	-64(%r13),%ebp
+	xorl	%esi,%ecx
+	vpxor	%ymm1,%ymm0,%ymm0
+	movl	%edx,%edi
+	xorl	%esi,%edi
+	leal	(%rcx,%rbp,1),%ebp
+	vpxor	%ymm8,%ymm0,%ymm0
+	rorxl	$27,%ebx,%r12d
+	rorxl	$2,%ebx,%ecx
+	xorl	%edx,%ebx
+	vpsrld	$30,%ymm0,%ymm8
+	vpslld	$2,%ymm0,%ymm0
+	addl	%r12d,%ebp
+	andl	%edi,%ebx
+	addl	-60(%r13),%eax
+	xorl	%edx,%ebx
+	movl	%ecx,%edi
+	xorl	%edx,%edi
+	vpor	%ymm8,%ymm0,%ymm0
+	leal	(%rax,%rbx,1),%eax
+	rorxl	$27,%ebp,%r12d
+	rorxl	$2,%ebp,%ebx
+	xorl	%ecx,%ebp
+	vpaddd	%ymm11,%ymm0,%ymm9
+	addl	%r12d,%eax
+	andl	%edi,%ebp
+	addl	-56(%r13),%esi
+	xorl	%ecx,%ebp
+	vmovdqu	%ymm9,512(%rsp)
+	movl	%ebx,%edi
+	xorl	%ecx,%edi
+	leal	(%rsi,%rbp,1),%esi
+	rorxl	$27,%eax,%r12d
+	rorxl	$2,%eax,%ebp
+	xorl	%ebx,%eax
+	addl	%r12d,%esi
+	andl	%edi,%eax
+	addl	-52(%r13),%edx
+	xorl	%ebx,%eax
+	movl	%ebp,%edi
+	xorl	%ebx,%edi
+	leal	(%rdx,%rax,1),%edx
+	rorxl	$27,%esi,%r12d
+	rorxl	$2,%esi,%eax
+	xorl	%ebp,%esi
+	addl	%r12d,%edx
+	andl	%edi,%esi
+	addl	-32(%r13),%ecx
+	xorl	%ebp,%esi
+	movl	%eax,%edi
+	xorl	%ebp,%edi
+	leal	(%rcx,%rsi,1),%ecx
+	rorxl	$27,%edx,%r12d
+	rorxl	$2,%edx,%esi
+	xorl	%eax,%edx
+	addl	%r12d,%ecx
+	andl	%edi,%edx
+	vpalignr	$8,%ymm7,%ymm0,%ymm8
+	vpxor	%ymm5,%ymm1,%ymm1
+	addl	-28(%r13),%ebx
+	xorl	%eax,%edx
+	vpxor	%ymm2,%ymm1,%ymm1
+	movl	%esi,%edi
+	xorl	%eax,%edi
+	leal	(%rbx,%rdx,1),%ebx
+	vpxor	%ymm8,%ymm1,%ymm1
+	rorxl	$27,%ecx,%r12d
+	rorxl	$2,%ecx,%edx
+	xorl	%esi,%ecx
+	vpsrld	$30,%ymm1,%ymm8
+	vpslld	$2,%ymm1,%ymm1
+	addl	%r12d,%ebx
+	andl	%edi,%ecx
+	addl	-24(%r13),%ebp
+	xorl	%esi,%ecx
+	movl	%edx,%edi
+	xorl	%esi,%edi
+	vpor	%ymm8,%ymm1,%ymm1
+	leal	(%rcx,%rbp,1),%ebp
+	rorxl	$27,%ebx,%r12d
+	rorxl	$2,%ebx,%ecx
+	xorl	%edx,%ebx
+	vpaddd	%ymm11,%ymm1,%ymm9
+	addl	%r12d,%ebp
+	andl	%edi,%ebx
+	addl	-20(%r13),%eax
+	xorl	%edx,%ebx
+	vmovdqu	%ymm9,544(%rsp)
+	movl	%ecx,%edi
+	xorl	%edx,%edi
+	leal	(%rax,%rbx,1),%eax
+	rorxl	$27,%ebp,%r12d
+	rorxl	$2,%ebp,%ebx
+	xorl	%ecx,%ebp
+	addl	%r12d,%eax
+	andl	%edi,%ebp
+	addl	0(%r13),%esi
+	xorl	%ecx,%ebp
+	movl	%ebx,%edi
+	xorl	%ecx,%edi
+	leal	(%rsi,%rbp,1),%esi
+	rorxl	$27,%eax,%r12d
+	rorxl	$2,%eax,%ebp
+	xorl	%ebx,%eax
+	addl	%r12d,%esi
+	andl	%edi,%eax
+	addl	4(%r13),%edx
+	xorl	%ebx,%eax
+	movl	%ebp,%edi
+	xorl	%ebx,%edi
+	leal	(%rdx,%rax,1),%edx
+	rorxl	$27,%esi,%r12d
+	rorxl	$2,%esi,%eax
+	xorl	%ebp,%esi
+	addl	%r12d,%edx
+	andl	%edi,%esi
+	vpalignr	$8,%ymm0,%ymm1,%ymm8
+	vpxor	%ymm6,%ymm2,%ymm2
+	addl	8(%r13),%ecx
+	xorl	%ebp,%esi
+	vpxor	%ymm3,%ymm2,%ymm2
+	movl	%eax,%edi
+	xorl	%ebp,%edi
+	leal	(%rcx,%rsi,1),%ecx
+	vpxor	%ymm8,%ymm2,%ymm2
+	rorxl	$27,%edx,%r12d
+	rorxl	$2,%edx,%esi
+	xorl	%eax,%edx
+	vpsrld	$30,%ymm2,%ymm8
+	vpslld	$2,%ymm2,%ymm2
+	addl	%r12d,%ecx
+	andl	%edi,%edx
+	addl	12(%r13),%ebx
+	xorl	%eax,%edx
+	movl	%esi,%edi
+	xorl	%eax,%edi
+	vpor	%ymm8,%ymm2,%ymm2
+	leal	(%rbx,%rdx,1),%ebx
+	rorxl	$27,%ecx,%r12d
+	rorxl	$2,%ecx,%edx
+	xorl	%esi,%ecx
+	vpaddd	%ymm11,%ymm2,%ymm9
+	addl	%r12d,%ebx
+	andl	%edi,%ecx
+	addl	32(%r13),%ebp
+	xorl	%esi,%ecx
+	vmovdqu	%ymm9,576(%rsp)
+	movl	%edx,%edi
+	xorl	%esi,%edi
+	leal	(%rcx,%rbp,1),%ebp
+	rorxl	$27,%ebx,%r12d
+	rorxl	$2,%ebx,%ecx
+	xorl	%edx,%ebx
+	addl	%r12d,%ebp
+	andl	%edi,%ebx
+	addl	36(%r13),%eax
+	xorl	%edx,%ebx
+	movl	%ecx,%edi
+	xorl	%edx,%edi
+	leal	(%rax,%rbx,1),%eax
+	rorxl	$27,%ebp,%r12d
+	rorxl	$2,%ebp,%ebx
+	xorl	%ecx,%ebp
+	addl	%r12d,%eax
+	andl	%edi,%ebp
+	addl	40(%r13),%esi
+	xorl	%ecx,%ebp
+	movl	%ebx,%edi
+	xorl	%ecx,%edi
+	leal	(%rsi,%rbp,1),%esi
+	rorxl	$27,%eax,%r12d
+	rorxl	$2,%eax,%ebp
+	xorl	%ebx,%eax
+	addl	%r12d,%esi
+	andl	%edi,%eax
+	vpalignr	$8,%ymm1,%ymm2,%ymm8
+	vpxor	%ymm7,%ymm3,%ymm3
+	addl	44(%r13),%edx
+	xorl	%ebx,%eax
+	vpxor	%ymm4,%ymm3,%ymm3
+	movl	%ebp,%edi
+	xorl	%ebx,%edi
+	leal	(%rdx,%rax,1),%edx
+	vpxor	%ymm8,%ymm3,%ymm3
+	rorxl	$27,%esi,%r12d
+	rorxl	$2,%esi,%eax
+	xorl	%ebp,%esi
+	vpsrld	$30,%ymm3,%ymm8
+	vpslld	$2,%ymm3,%ymm3
+	addl	%r12d,%edx
+	andl	%edi,%esi
+	addl	64(%r13),%ecx
+	xorl	%ebp,%esi
+	movl	%eax,%edi
+	xorl	%ebp,%edi
+	vpor	%ymm8,%ymm3,%ymm3
+	leal	(%rcx,%rsi,1),%ecx
+	rorxl	$27,%edx,%r12d
+	rorxl	$2,%edx,%esi
+	xorl	%eax,%edx
+	vpaddd	%ymm11,%ymm3,%ymm9
+	addl	%r12d,%ecx
+	andl	%edi,%edx
+	addl	68(%r13),%ebx
+	xorl	%eax,%edx
+	vmovdqu	%ymm9,608(%rsp)
+	movl	%esi,%edi
+	xorl	%eax,%edi
+	leal	(%rbx,%rdx,1),%ebx
+	rorxl	$27,%ecx,%r12d
+	rorxl	$2,%ecx,%edx
+	xorl	%esi,%ecx
+	addl	%r12d,%ebx
+	andl	%edi,%ecx
+	addl	72(%r13),%ebp
+	xorl	%esi,%ecx
+	movl	%edx,%edi
+	xorl	%esi,%edi
+	leal	(%rcx,%rbp,1),%ebp
+	rorxl	$27,%ebx,%r12d
+	rorxl	$2,%ebx,%ecx
+	xorl	%edx,%ebx
+	addl	%r12d,%ebp
+	andl	%edi,%ebx
+	addl	76(%r13),%eax
+	xorl	%edx,%ebx
+	leal	(%rax,%rbx,1),%eax
+	rorxl	$27,%ebp,%r12d
+	rorxl	$2,%ebp,%ebx
+	xorl	%ecx,%ebp
+	addl	%r12d,%eax
+	xorl	%edx,%ebp
+	addl	96(%r13),%esi
+	leal	(%rsi,%rbp,1),%esi
+	rorxl	$27,%eax,%r12d
+	rorxl	$2,%eax,%ebp
+	xorl	%ebx,%eax
+	addl	%r12d,%esi
+	xorl	%ecx,%eax
+	addl	100(%r13),%edx
+	leal	(%rdx,%rax,1),%edx
+	rorxl	$27,%esi,%r12d
+	rorxl	$2,%esi,%eax
+	xorl	%ebp,%esi
+	addl	%r12d,%edx
+	xorl	%ebx,%esi
+	addl	104(%r13),%ecx
+	leal	(%rcx,%rsi,1),%ecx
+	rorxl	$27,%edx,%r12d
+	rorxl	$2,%edx,%esi
+	xorl	%eax,%edx
+	addl	%r12d,%ecx
+	xorl	%ebp,%edx
+	addl	108(%r13),%ebx
+	leaq	256(%r13),%r13
+	leal	(%rbx,%rdx,1),%ebx
+	rorxl	$27,%ecx,%r12d
+	rorxl	$2,%ecx,%edx
+	xorl	%esi,%ecx
+	addl	%r12d,%ebx
+	xorl	%eax,%ecx
+	addl	-128(%r13),%ebp
+	leal	(%rcx,%rbp,1),%ebp
+	rorxl	$27,%ebx,%r12d
+	rorxl	$2,%ebx,%ecx
+	xorl	%edx,%ebx
+	addl	%r12d,%ebp
+	xorl	%esi,%ebx
+	addl	-124(%r13),%eax
+	leal	(%rax,%rbx,1),%eax
+	rorxl	$27,%ebp,%r12d
+	rorxl	$2,%ebp,%ebx
+	xorl	%ecx,%ebp
+	addl	%r12d,%eax
+	xorl	%edx,%ebp
+	addl	-120(%r13),%esi
+	leal	(%rsi,%rbp,1),%esi
+	rorxl	$27,%eax,%r12d
+	rorxl	$2,%eax,%ebp
+	xorl	%ebx,%eax
+	addl	%r12d,%esi
+	xorl	%ecx,%eax
+	addl	-116(%r13),%edx
+	leal	(%rdx,%rax,1),%edx
+	rorxl	$27,%esi,%r12d
+	rorxl	$2,%esi,%eax
+	xorl	%ebp,%esi
+	addl	%r12d,%edx
+	xorl	%ebx,%esi
+	addl	-96(%r13),%ecx
+	leal	(%rcx,%rsi,1),%ecx
+	rorxl	$27,%edx,%r12d
+	rorxl	$2,%edx,%esi
+	xorl	%eax,%edx
+	addl	%r12d,%ecx
+	xorl	%ebp,%edx
+	addl	-92(%r13),%ebx
+	leal	(%rbx,%rdx,1),%ebx
+	rorxl	$27,%ecx,%r12d
+	rorxl	$2,%ecx,%edx
+	xorl	%esi,%ecx
+	addl	%r12d,%ebx
+	xorl	%eax,%ecx
+	addl	-88(%r13),%ebp
+	leal	(%rcx,%rbp,1),%ebp
+	rorxl	$27,%ebx,%r12d
+	rorxl	$2,%ebx,%ecx
+	xorl	%edx,%ebx
+	addl	%r12d,%ebp
+	xorl	%esi,%ebx
+	addl	-84(%r13),%eax
+	leal	(%rax,%rbx,1),%eax
+	rorxl	$27,%ebp,%r12d
+	rorxl	$2,%ebp,%ebx
+	xorl	%ecx,%ebp
+	addl	%r12d,%eax
+	xorl	%edx,%ebp
+	addl	-64(%r13),%esi
+	leal	(%rsi,%rbp,1),%esi
+	rorxl	$27,%eax,%r12d
+	rorxl	$2,%eax,%ebp
+	xorl	%ebx,%eax
+	addl	%r12d,%esi
+	xorl	%ecx,%eax
+	addl	-60(%r13),%edx
+	leal	(%rdx,%rax,1),%edx
+	rorxl	$27,%esi,%r12d
+	rorxl	$2,%esi,%eax
+	xorl	%ebp,%esi
+	addl	%r12d,%edx
+	xorl	%ebx,%esi
+	addl	-56(%r13),%ecx
+	leal	(%rcx,%rsi,1),%ecx
+	rorxl	$27,%edx,%r12d
+	rorxl	$2,%edx,%esi
+	xorl	%eax,%edx
+	addl	%r12d,%ecx
+	xorl	%ebp,%edx
+	addl	-52(%r13),%ebx
+	leal	(%rbx,%rdx,1),%ebx
+	rorxl	$27,%ecx,%r12d
+	rorxl	$2,%ecx,%edx
+	xorl	%esi,%ecx
+	addl	%r12d,%ebx
+	xorl	%eax,%ecx
+	addl	-32(%r13),%ebp
+	leal	(%rcx,%rbp,1),%ebp
+	rorxl	$27,%ebx,%r12d
+	rorxl	$2,%ebx,%ecx
+	xorl	%edx,%ebx
+	addl	%r12d,%ebp
+	xorl	%esi,%ebx
+	addl	-28(%r13),%eax
+	leal	(%rax,%rbx,1),%eax
+	rorxl	$27,%ebp,%r12d
+	rorxl	$2,%ebp,%ebx
+	xorl	%ecx,%ebp
+	addl	%r12d,%eax
+	xorl	%edx,%ebp
+	addl	-24(%r13),%esi
+	leal	(%rsi,%rbp,1),%esi
+	rorxl	$27,%eax,%r12d
+	rorxl	$2,%eax,%ebp
+	xorl	%ebx,%eax
+	addl	%r12d,%esi
+	xorl	%ecx,%eax
+	addl	-20(%r13),%edx
+	leal	(%rdx,%rax,1),%edx
+	rorxl	$27,%esi,%r12d
+	addl	%r12d,%edx
+	leaq	128(%r9),%r13
+	leaq	128(%r9),%rdi
+	cmpq	%r10,%r13
+	cmovaeq	%r9,%r13
+
+
+	addl	0(%r8),%edx
+	addl	4(%r8),%esi
+	addl	8(%r8),%ebp
+	movl	%edx,0(%r8)
+	addl	12(%r8),%ebx
+	movl	%esi,4(%r8)
+	movl	%edx,%eax
+	addl	16(%r8),%ecx
+	movl	%ebp,%r12d
+	movl	%ebp,8(%r8)
+	movl	%ebx,%edx
+
+	movl	%ebx,12(%r8)
+	movl	%esi,%ebp
+	movl	%ecx,16(%r8)
+
+	movl	%ecx,%esi
+	movl	%r12d,%ecx
+
+
+	cmpq	%r10,%r9
+	je	.Ldone_avx2
+	vmovdqu	64(%r14),%ymm6
+	cmpq	%r10,%rdi
+	ja	.Last_avx2
+
+	vmovdqu	-64(%rdi),%xmm0
+	vmovdqu	-48(%rdi),%xmm1
+	vmovdqu	-32(%rdi),%xmm2
+	vmovdqu	-16(%rdi),%xmm3
+	vinserti128	$1,0(%r13),%ymm0,%ymm0
+	vinserti128	$1,16(%r13),%ymm1,%ymm1
+	vinserti128	$1,32(%r13),%ymm2,%ymm2
+	vinserti128	$1,48(%r13),%ymm3,%ymm3
+	jmp	.Last_avx2
+
+.align	32
+.Last_avx2:
+	leaq	128+16(%rsp),%r13
+	rorxl	$2,%ebp,%ebx
+	andnl	%edx,%ebp,%edi
+	andl	%ecx,%ebp
+	xorl	%edi,%ebp
+	subq	$-128,%r9
+	addl	-128(%r13),%esi
+	andnl	%ecx,%eax,%edi
+	addl	%ebp,%esi
+	rorxl	$27,%eax,%r12d
+	rorxl	$2,%eax,%ebp
+	andl	%ebx,%eax
+	addl	%r12d,%esi
+	xorl	%edi,%eax
+	addl	-124(%r13),%edx
+	andnl	%ebx,%esi,%edi
+	addl	%eax,%edx
+	rorxl	$27,%esi,%r12d
+	rorxl	$2,%esi,%eax
+	andl	%ebp,%esi
+	addl	%r12d,%edx
+	xorl	%edi,%esi
+	addl	-120(%r13),%ecx
+	andnl	%ebp,%edx,%edi
+	addl	%esi,%ecx
+	rorxl	$27,%edx,%r12d
+	rorxl	$2,%edx,%esi
+	andl	%eax,%edx
+	addl	%r12d,%ecx
+	xorl	%edi,%edx
+	addl	-116(%r13),%ebx
+	andnl	%eax,%ecx,%edi
+	addl	%edx,%ebx
+	rorxl	$27,%ecx,%r12d
+	rorxl	$2,%ecx,%edx
+	andl	%esi,%ecx
+	addl	%r12d,%ebx
+	xorl	%edi,%ecx
+	addl	-96(%r13),%ebp
+	andnl	%esi,%ebx,%edi
+	addl	%ecx,%ebp
+	rorxl	$27,%ebx,%r12d
+	rorxl	$2,%ebx,%ecx
+	andl	%edx,%ebx
+	addl	%r12d,%ebp
+	xorl	%edi,%ebx
+	addl	-92(%r13),%eax
+	andnl	%edx,%ebp,%edi
+	addl	%ebx,%eax
+	rorxl	$27,%ebp,%r12d
+	rorxl	$2,%ebp,%ebx
+	andl	%ecx,%ebp
+	addl	%r12d,%eax
+	xorl	%edi,%ebp
+	addl	-88(%r13),%esi
+	andnl	%ecx,%eax,%edi
+	addl	%ebp,%esi
+	rorxl	$27,%eax,%r12d
+	rorxl	$2,%eax,%ebp
+	andl	%ebx,%eax
+	addl	%r12d,%esi
+	xorl	%edi,%eax
+	addl	-84(%r13),%edx
+	andnl	%ebx,%esi,%edi
+	addl	%eax,%edx
+	rorxl	$27,%esi,%r12d
+	rorxl	$2,%esi,%eax
+	andl	%ebp,%esi
+	addl	%r12d,%edx
+	xorl	%edi,%esi
+	addl	-64(%r13),%ecx
+	andnl	%ebp,%edx,%edi
+	addl	%esi,%ecx
+	rorxl	$27,%edx,%r12d
+	rorxl	$2,%edx,%esi
+	andl	%eax,%edx
+	addl	%r12d,%ecx
+	xorl	%edi,%edx
+	addl	-60(%r13),%ebx
+	andnl	%eax,%ecx,%edi
+	addl	%edx,%ebx
+	rorxl	$27,%ecx,%r12d
+	rorxl	$2,%ecx,%edx
+	andl	%esi,%ecx
+	addl	%r12d,%ebx
+	xorl	%edi,%ecx
+	addl	-56(%r13),%ebp
+	andnl	%esi,%ebx,%edi
+	addl	%ecx,%ebp
+	rorxl	$27,%ebx,%r12d
+	rorxl	$2,%ebx,%ecx
+	andl	%edx,%ebx
+	addl	%r12d,%ebp
+	xorl	%edi,%ebx
+	addl	-52(%r13),%eax
+	andnl	%edx,%ebp,%edi
+	addl	%ebx,%eax
+	rorxl	$27,%ebp,%r12d
+	rorxl	$2,%ebp,%ebx
+	andl	%ecx,%ebp
+	addl	%r12d,%eax
+	xorl	%edi,%ebp
+	addl	-32(%r13),%esi
+	andnl	%ecx,%eax,%edi
+	addl	%ebp,%esi
+	rorxl	$27,%eax,%r12d
+	rorxl	$2,%eax,%ebp
+	andl	%ebx,%eax
+	addl	%r12d,%esi
+	xorl	%edi,%eax
+	addl	-28(%r13),%edx
+	andnl	%ebx,%esi,%edi
+	addl	%eax,%edx
+	rorxl	$27,%esi,%r12d
+	rorxl	$2,%esi,%eax
+	andl	%ebp,%esi
+	addl	%r12d,%edx
+	xorl	%edi,%esi
+	addl	-24(%r13),%ecx
+	andnl	%ebp,%edx,%edi
+	addl	%esi,%ecx
+	rorxl	$27,%edx,%r12d
+	rorxl	$2,%edx,%esi
+	andl	%eax,%edx
+	addl	%r12d,%ecx
+	xorl	%edi,%edx
+	addl	-20(%r13),%ebx
+	andnl	%eax,%ecx,%edi
+	addl	%edx,%ebx
+	rorxl	$27,%ecx,%r12d
+	rorxl	$2,%ecx,%edx
+	andl	%esi,%ecx
+	addl	%r12d,%ebx
+	xorl	%edi,%ecx
+	addl	0(%r13),%ebp
+	andnl	%esi,%ebx,%edi
+	addl	%ecx,%ebp
+	rorxl	$27,%ebx,%r12d
+	rorxl	$2,%ebx,%ecx
+	andl	%edx,%ebx
+	addl	%r12d,%ebp
+	xorl	%edi,%ebx
+	addl	4(%r13),%eax
+	andnl	%edx,%ebp,%edi
+	addl	%ebx,%eax
+	rorxl	$27,%ebp,%r12d
+	rorxl	$2,%ebp,%ebx
+	andl	%ecx,%ebp
+	addl	%r12d,%eax
+	xorl	%edi,%ebp
+	addl	8(%r13),%esi
+	andnl	%ecx,%eax,%edi
+	addl	%ebp,%esi
+	rorxl	$27,%eax,%r12d
+	rorxl	$2,%eax,%ebp
+	andl	%ebx,%eax
+	addl	%r12d,%esi
+	xorl	%edi,%eax
+	addl	12(%r13),%edx
+	leal	(%rdx,%rax,1),%edx
+	rorxl	$27,%esi,%r12d
+	rorxl	$2,%esi,%eax
+	xorl	%ebp,%esi
+	addl	%r12d,%edx
+	xorl	%ebx,%esi
+	addl	32(%r13),%ecx
+	leal	(%rcx,%rsi,1),%ecx
+	rorxl	$27,%edx,%r12d
+	rorxl	$2,%edx,%esi
+	xorl	%eax,%edx
+	addl	%r12d,%ecx
+	xorl	%ebp,%edx
+	addl	36(%r13),%ebx
+	leal	(%rbx,%rdx,1),%ebx
+	rorxl	$27,%ecx,%r12d
+	rorxl	$2,%ecx,%edx
+	xorl	%esi,%ecx
+	addl	%r12d,%ebx
+	xorl	%eax,%ecx
+	addl	40(%r13),%ebp
+	leal	(%rcx,%rbp,1),%ebp
+	rorxl	$27,%ebx,%r12d
+	rorxl	$2,%ebx,%ecx
+	xorl	%edx,%ebx
+	addl	%r12d,%ebp
+	xorl	%esi,%ebx
+	addl	44(%r13),%eax
+	leal	(%rax,%rbx,1),%eax
+	rorxl	$27,%ebp,%r12d
+	rorxl	$2,%ebp,%ebx
+	xorl	%ecx,%ebp
+	addl	%r12d,%eax
+	xorl	%edx,%ebp
+	addl	64(%r13),%esi
+	leal	(%rsi,%rbp,1),%esi
+	rorxl	$27,%eax,%r12d
+	rorxl	$2,%eax,%ebp
+	xorl	%ebx,%eax
+	addl	%r12d,%esi
+	xorl	%ecx,%eax
+	vmovdqu	-64(%r14),%ymm11
+	vpshufb	%ymm6,%ymm0,%ymm0
+	addl	68(%r13),%edx
+	leal	(%rdx,%rax,1),%edx
+	rorxl	$27,%esi,%r12d
+	rorxl	$2,%esi,%eax
+	xorl	%ebp,%esi
+	addl	%r12d,%edx
+	xorl	%ebx,%esi
+	addl	72(%r13),%ecx
+	leal	(%rcx,%rsi,1),%ecx
+	rorxl	$27,%edx,%r12d
+	rorxl	$2,%edx,%esi
+	xorl	%eax,%edx
+	addl	%r12d,%ecx
+	xorl	%ebp,%edx
+	addl	76(%r13),%ebx
+	leal	(%rbx,%rdx,1),%ebx
+	rorxl	$27,%ecx,%r12d
+	rorxl	$2,%ecx,%edx
+	xorl	%esi,%ecx
+	addl	%r12d,%ebx
+	xorl	%eax,%ecx
+	addl	96(%r13),%ebp
+	leal	(%rcx,%rbp,1),%ebp
+	rorxl	$27,%ebx,%r12d
+	rorxl	$2,%ebx,%ecx
+	xorl	%edx,%ebx
+	addl	%r12d,%ebp
+	xorl	%esi,%ebx
+	addl	100(%r13),%eax
+	leal	(%rax,%rbx,1),%eax
+	rorxl	$27,%ebp,%r12d
+	rorxl	$2,%ebp,%ebx
+	xorl	%ecx,%ebp
+	addl	%r12d,%eax
+	xorl	%edx,%ebp
+	vpshufb	%ymm6,%ymm1,%ymm1
+	vpaddd	%ymm11,%ymm0,%ymm8
+	addl	104(%r13),%esi
+	leal	(%rsi,%rbp,1),%esi
+	rorxl	$27,%eax,%r12d
+	rorxl	$2,%eax,%ebp
+	xorl	%ebx,%eax
+	addl	%r12d,%esi
+	xorl	%ecx,%eax
+	addl	108(%r13),%edx
+	leaq	256(%r13),%r13
+	leal	(%rdx,%rax,1),%edx
+	rorxl	$27,%esi,%r12d
+	rorxl	$2,%esi,%eax
+	xorl	%ebp,%esi
+	addl	%r12d,%edx
+	xorl	%ebx,%esi
+	addl	-128(%r13),%ecx
+	leal	(%rcx,%rsi,1),%ecx
+	rorxl	$27,%edx,%r12d
+	rorxl	$2,%edx,%esi
+	xorl	%eax,%edx
+	addl	%r12d,%ecx
+	xorl	%ebp,%edx
+	addl	-124(%r13),%ebx
+	leal	(%rbx,%rdx,1),%ebx
+	rorxl	$27,%ecx,%r12d
+	rorxl	$2,%ecx,%edx
+	xorl	%esi,%ecx
+	addl	%r12d,%ebx
+	xorl	%eax,%ecx
+	addl	-120(%r13),%ebp
+	leal	(%rcx,%rbp,1),%ebp
+	rorxl	$27,%ebx,%r12d
+	rorxl	$2,%ebx,%ecx
+	xorl	%edx,%ebx
+	addl	%r12d,%ebp
+	xorl	%esi,%ebx
+	vmovdqu	%ymm8,0(%rsp)
+	vpshufb	%ymm6,%ymm2,%ymm2
+	vpaddd	%ymm11,%ymm1,%ymm9
+	addl	-116(%r13),%eax
+	leal	(%rax,%rbx,1),%eax
+	rorxl	$27,%ebp,%r12d
+	rorxl	$2,%ebp,%ebx
+	xorl	%ecx,%ebp
+	addl	%r12d,%eax
+	xorl	%edx,%ebp
+	addl	-96(%r13),%esi
+	leal	(%rsi,%rbp,1),%esi
+	rorxl	$27,%eax,%r12d
+	rorxl	$2,%eax,%ebp
+	xorl	%ebx,%eax
+	addl	%r12d,%esi
+	xorl	%ecx,%eax
+	addl	-92(%r13),%edx
+	leal	(%rdx,%rax,1),%edx
+	rorxl	$27,%esi,%r12d
+	rorxl	$2,%esi,%eax
+	xorl	%ebp,%esi
+	addl	%r12d,%edx
+	xorl	%ebx,%esi
+	addl	-88(%r13),%ecx
+	leal	(%rcx,%rsi,1),%ecx
+	rorxl	$27,%edx,%r12d
+	rorxl	$2,%edx,%esi
+	xorl	%eax,%edx
+	addl	%r12d,%ecx
+	xorl	%ebp,%edx
+	addl	-84(%r13),%ebx
+	movl	%esi,%edi
+	xorl	%eax,%edi
+	leal	(%rbx,%rdx,1),%ebx
+	rorxl	$27,%ecx,%r12d
+	rorxl	$2,%ecx,%edx
+	xorl	%esi,%ecx
+	addl	%r12d,%ebx
+	andl	%edi,%ecx
+	vmovdqu	%ymm9,32(%rsp)
+	vpshufb	%ymm6,%ymm3,%ymm3
+	vpaddd	%ymm11,%ymm2,%ymm6
+	addl	-64(%r13),%ebp
+	xorl	%esi,%ecx
+	movl	%edx,%edi
+	xorl	%esi,%edi
+	leal	(%rcx,%rbp,1),%ebp
+	rorxl	$27,%ebx,%r12d
+	rorxl	$2,%ebx,%ecx
+	xorl	%edx,%ebx
+	addl	%r12d,%ebp
+	andl	%edi,%ebx
+	addl	-60(%r13),%eax
+	xorl	%edx,%ebx
+	movl	%ecx,%edi
+	xorl	%edx,%edi
+	leal	(%rax,%rbx,1),%eax
+	rorxl	$27,%ebp,%r12d
+	rorxl	$2,%ebp,%ebx
+	xorl	%ecx,%ebp
+	addl	%r12d,%eax
+	andl	%edi,%ebp
+	addl	-56(%r13),%esi
+	xorl	%ecx,%ebp
+	movl	%ebx,%edi
+	xorl	%ecx,%edi
+	leal	(%rsi,%rbp,1),%esi
+	rorxl	$27,%eax,%r12d
+	rorxl	$2,%eax,%ebp
+	xorl	%ebx,%eax
+	addl	%r12d,%esi
+	andl	%edi,%eax
+	addl	-52(%r13),%edx
+	xorl	%ebx,%eax
+	movl	%ebp,%edi
+	xorl	%ebx,%edi
+	leal	(%rdx,%rax,1),%edx
+	rorxl	$27,%esi,%r12d
+	rorxl	$2,%esi,%eax
+	xorl	%ebp,%esi
+	addl	%r12d,%edx
+	andl	%edi,%esi
+	addl	-32(%r13),%ecx
+	xorl	%ebp,%esi
+	movl	%eax,%edi
+	xorl	%ebp,%edi
+	leal	(%rcx,%rsi,1),%ecx
+	rorxl	$27,%edx,%r12d
+	rorxl	$2,%edx,%esi
+	xorl	%eax,%edx
+	addl	%r12d,%ecx
+	andl	%edi,%edx
+	jmp	.Lalign32_3
+.align	32
+.Lalign32_3:
+	vmovdqu	%ymm6,64(%rsp)
+	vpaddd	%ymm11,%ymm3,%ymm7
+	addl	-28(%r13),%ebx
+	xorl	%eax,%edx
+	movl	%esi,%edi
+	xorl	%eax,%edi
+	leal	(%rbx,%rdx,1),%ebx
+	rorxl	$27,%ecx,%r12d
+	rorxl	$2,%ecx,%edx
+	xorl	%esi,%ecx
+	addl	%r12d,%ebx
+	andl	%edi,%ecx
+	addl	-24(%r13),%ebp
+	xorl	%esi,%ecx
+	movl	%edx,%edi
+	xorl	%esi,%edi
+	leal	(%rcx,%rbp,1),%ebp
+	rorxl	$27,%ebx,%r12d
+	rorxl	$2,%ebx,%ecx
+	xorl	%edx,%ebx
+	addl	%r12d,%ebp
+	andl	%edi,%ebx
+	addl	-20(%r13),%eax
+	xorl	%edx,%ebx
+	movl	%ecx,%edi
+	xorl	%edx,%edi
+	leal	(%rax,%rbx,1),%eax
+	rorxl	$27,%ebp,%r12d
+	rorxl	$2,%ebp,%ebx
+	xorl	%ecx,%ebp
+	addl	%r12d,%eax
+	andl	%edi,%ebp
+	addl	0(%r13),%esi
+	xorl	%ecx,%ebp
+	movl	%ebx,%edi
+	xorl	%ecx,%edi
+	leal	(%rsi,%rbp,1),%esi
+	rorxl	$27,%eax,%r12d
+	rorxl	$2,%eax,%ebp
+	xorl	%ebx,%eax
+	addl	%r12d,%esi
+	andl	%edi,%eax
+	addl	4(%r13),%edx
+	xorl	%ebx,%eax
+	movl	%ebp,%edi
+	xorl	%ebx,%edi
+	leal	(%rdx,%rax,1),%edx
+	rorxl	$27,%esi,%r12d
+	rorxl	$2,%esi,%eax
+	xorl	%ebp,%esi
+	addl	%r12d,%edx
+	andl	%edi,%esi
+	vmovdqu	%ymm7,96(%rsp)
+	addl	8(%r13),%ecx
+	xorl	%ebp,%esi
+	movl	%eax,%edi
+	xorl	%ebp,%edi
+	leal	(%rcx,%rsi,1),%ecx
+	rorxl	$27,%edx,%r12d
+	rorxl	$2,%edx,%esi
+	xorl	%eax,%edx
+	addl	%r12d,%ecx
+	andl	%edi,%edx
+	addl	12(%r13),%ebx
+	xorl	%eax,%edx
+	movl	%esi,%edi
+	xorl	%eax,%edi
+	leal	(%rbx,%rdx,1),%ebx
+	rorxl	$27,%ecx,%r12d
+	rorxl	$2,%ecx,%edx
+	xorl	%esi,%ecx
+	addl	%r12d,%ebx
+	andl	%edi,%ecx
+	addl	32(%r13),%ebp
+	xorl	%esi,%ecx
+	movl	%edx,%edi
+	xorl	%esi,%edi
+	leal	(%rcx,%rbp,1),%ebp
+	rorxl	$27,%ebx,%r12d
+	rorxl	$2,%ebx,%ecx
+	xorl	%edx,%ebx
+	addl	%r12d,%ebp
+	andl	%edi,%ebx
+	addl	36(%r13),%eax
+	xorl	%edx,%ebx
+	movl	%ecx,%edi
+	xorl	%edx,%edi
+	leal	(%rax,%rbx,1),%eax
+	rorxl	$27,%ebp,%r12d
+	rorxl	$2,%ebp,%ebx
+	xorl	%ecx,%ebp
+	addl	%r12d,%eax
+	andl	%edi,%ebp
+	addl	40(%r13),%esi
+	xorl	%ecx,%ebp
+	movl	%ebx,%edi
+	xorl	%ecx,%edi
+	leal	(%rsi,%rbp,1),%esi
+	rorxl	$27,%eax,%r12d
+	rorxl	$2,%eax,%ebp
+	xorl	%ebx,%eax
+	addl	%r12d,%esi
+	andl	%edi,%eax
+	vpalignr	$8,%ymm0,%ymm1,%ymm4
+	addl	44(%r13),%edx
+	xorl	%ebx,%eax
+	movl	%ebp,%edi
+	xorl	%ebx,%edi
+	vpsrldq	$4,%ymm3,%ymm8
+	leal	(%rdx,%rax,1),%edx
+	rorxl	$27,%esi,%r12d
+	rorxl	$2,%esi,%eax
+	vpxor	%ymm0,%ymm4,%ymm4
+	vpxor	%ymm2,%ymm8,%ymm8
+	xorl	%ebp,%esi
+	addl	%r12d,%edx
+	vpxor	%ymm8,%ymm4,%ymm4
+	andl	%edi,%esi
+	addl	64(%r13),%ecx
+	xorl	%ebp,%esi
+	movl	%eax,%edi
+	vpsrld	$31,%ymm4,%ymm8
+	xorl	%ebp,%edi
+	leal	(%rcx,%rsi,1),%ecx
+	rorxl	$27,%edx,%r12d
+	vpslldq	$12,%ymm4,%ymm10
+	vpaddd	%ymm4,%ymm4,%ymm4
+	rorxl	$2,%edx,%esi
+	xorl	%eax,%edx
+	vpsrld	$30,%ymm10,%ymm9
+	vpor	%ymm8,%ymm4,%ymm4
+	addl	%r12d,%ecx
+	andl	%edi,%edx
+	vpslld	$2,%ymm10,%ymm10
+	vpxor	%ymm9,%ymm4,%ymm4
+	addl	68(%r13),%ebx
+	xorl	%eax,%edx
+	vpxor	%ymm10,%ymm4,%ymm4
+	movl	%esi,%edi
+	xorl	%eax,%edi
+	leal	(%rbx,%rdx,1),%ebx
+	vpaddd	%ymm11,%ymm4,%ymm9
+	rorxl	$27,%ecx,%r12d
+	rorxl	$2,%ecx,%edx
+	xorl	%esi,%ecx
+	vmovdqu	%ymm9,128(%rsp)
+	addl	%r12d,%ebx
+	andl	%edi,%ecx
+	addl	72(%r13),%ebp
+	xorl	%esi,%ecx
+	movl	%edx,%edi
+	xorl	%esi,%edi
+	leal	(%rcx,%rbp,1),%ebp
+	rorxl	$27,%ebx,%r12d
+	rorxl	$2,%ebx,%ecx
+	xorl	%edx,%ebx
+	addl	%r12d,%ebp
+	andl	%edi,%ebx
+	addl	76(%r13),%eax
+	xorl	%edx,%ebx
+	leal	(%rax,%rbx,1),%eax
+	rorxl	$27,%ebp,%r12d
+	rorxl	$2,%ebp,%ebx
+	xorl	%ecx,%ebp
+	addl	%r12d,%eax
+	xorl	%edx,%ebp
+	vpalignr	$8,%ymm1,%ymm2,%ymm5
+	addl	96(%r13),%esi
+	leal	(%rsi,%rbp,1),%esi
+	rorxl	$27,%eax,%r12d
+	rorxl	$2,%eax,%ebp
+	vpsrldq	$4,%ymm4,%ymm8
+	xorl	%ebx,%eax
+	addl	%r12d,%esi
+	xorl	%ecx,%eax
+	vpxor	%ymm1,%ymm5,%ymm5
+	vpxor	%ymm3,%ymm8,%ymm8
+	addl	100(%r13),%edx
+	leal	(%rdx,%rax,1),%edx
+	vpxor	%ymm8,%ymm5,%ymm5
+	rorxl	$27,%esi,%r12d
+	rorxl	$2,%esi,%eax
+	xorl	%ebp,%esi
+	addl	%r12d,%edx
+	vpsrld	$31,%ymm5,%ymm8
+	vmovdqu	-32(%r14),%ymm11
+	xorl	%ebx,%esi
+	addl	104(%r13),%ecx
+	leal	(%rcx,%rsi,1),%ecx
+	vpslldq	$12,%ymm5,%ymm10
+	vpaddd	%ymm5,%ymm5,%ymm5
+	rorxl	$27,%edx,%r12d
+	rorxl	$2,%edx,%esi
+	vpsrld	$30,%ymm10,%ymm9
+	vpor	%ymm8,%ymm5,%ymm5
+	xorl	%eax,%edx
+	addl	%r12d,%ecx
+	vpslld	$2,%ymm10,%ymm10
+	vpxor	%ymm9,%ymm5,%ymm5
+	xorl	%ebp,%edx
+	addl	108(%r13),%ebx
+	leaq	256(%r13),%r13
+	vpxor	%ymm10,%ymm5,%ymm5
+	leal	(%rbx,%rdx,1),%ebx
+	rorxl	$27,%ecx,%r12d
+	rorxl	$2,%ecx,%edx
+	vpaddd	%ymm11,%ymm5,%ymm9
+	xorl	%esi,%ecx
+	addl	%r12d,%ebx
+	xorl	%eax,%ecx
+	vmovdqu	%ymm9,160(%rsp)
+	addl	-128(%r13),%ebp
+	leal	(%rcx,%rbp,1),%ebp
+	rorxl	$27,%ebx,%r12d
+	rorxl	$2,%ebx,%ecx
+	xorl	%edx,%ebx
+	addl	%r12d,%ebp
+	xorl	%esi,%ebx
+	vpalignr	$8,%ymm2,%ymm3,%ymm6
+	addl	-124(%r13),%eax
+	leal	(%rax,%rbx,1),%eax
+	rorxl	$27,%ebp,%r12d
+	rorxl	$2,%ebp,%ebx
+	vpsrldq	$4,%ymm5,%ymm8
+	xorl	%ecx,%ebp
+	addl	%r12d,%eax
+	xorl	%edx,%ebp
+	vpxor	%ymm2,%ymm6,%ymm6
+	vpxor	%ymm4,%ymm8,%ymm8
+	addl	-120(%r13),%esi
+	leal	(%rsi,%rbp,1),%esi
+	vpxor	%ymm8,%ymm6,%ymm6
+	rorxl	$27,%eax,%r12d
+	rorxl	$2,%eax,%ebp
+	xorl	%ebx,%eax
+	addl	%r12d,%esi
+	vpsrld	$31,%ymm6,%ymm8
+	xorl	%ecx,%eax
+	addl	-116(%r13),%edx
+	leal	(%rdx,%rax,1),%edx
+	vpslldq	$12,%ymm6,%ymm10
+	vpaddd	%ymm6,%ymm6,%ymm6
+	rorxl	$27,%esi,%r12d
+	rorxl	$2,%esi,%eax
+	vpsrld	$30,%ymm10,%ymm9
+	vpor	%ymm8,%ymm6,%ymm6
+	xorl	%ebp,%esi
+	addl	%r12d,%edx
+	vpslld	$2,%ymm10,%ymm10
+	vpxor	%ymm9,%ymm6,%ymm6
+	xorl	%ebx,%esi
+	addl	-96(%r13),%ecx
+	vpxor	%ymm10,%ymm6,%ymm6
+	leal	(%rcx,%rsi,1),%ecx
+	rorxl	$27,%edx,%r12d
+	rorxl	$2,%edx,%esi
+	vpaddd	%ymm11,%ymm6,%ymm9
+	xorl	%eax,%edx
+	addl	%r12d,%ecx
+	xorl	%ebp,%edx
+	vmovdqu	%ymm9,192(%rsp)
+	addl	-92(%r13),%ebx
+	leal	(%rbx,%rdx,1),%ebx
+	rorxl	$27,%ecx,%r12d
+	rorxl	$2,%ecx,%edx
+	xorl	%esi,%ecx
+	addl	%r12d,%ebx
+	xorl	%eax,%ecx
+	vpalignr	$8,%ymm3,%ymm4,%ymm7
+	addl	-88(%r13),%ebp
+	leal	(%rcx,%rbp,1),%ebp
+	rorxl	$27,%ebx,%r12d
+	rorxl	$2,%ebx,%ecx
+	vpsrldq	$4,%ymm6,%ymm8
+	xorl	%edx,%ebx
+	addl	%r12d,%ebp
+	xorl	%esi,%ebx
+	vpxor	%ymm3,%ymm7,%ymm7
+	vpxor	%ymm5,%ymm8,%ymm8
+	addl	-84(%r13),%eax
+	leal	(%rax,%rbx,1),%eax
+	vpxor	%ymm8,%ymm7,%ymm7
+	rorxl	$27,%ebp,%r12d
+	rorxl	$2,%ebp,%ebx
+	xorl	%ecx,%ebp
+	addl	%r12d,%eax
+	vpsrld	$31,%ymm7,%ymm8
+	xorl	%edx,%ebp
+	addl	-64(%r13),%esi
+	leal	(%rsi,%rbp,1),%esi
+	vpslldq	$12,%ymm7,%ymm10
+	vpaddd	%ymm7,%ymm7,%ymm7
+	rorxl	$27,%eax,%r12d
+	rorxl	$2,%eax,%ebp
+	vpsrld	$30,%ymm10,%ymm9
+	vpor	%ymm8,%ymm7,%ymm7
+	xorl	%ebx,%eax
+	addl	%r12d,%esi
+	vpslld	$2,%ymm10,%ymm10
+	vpxor	%ymm9,%ymm7,%ymm7
+	xorl	%ecx,%eax
+	addl	-60(%r13),%edx
+	vpxor	%ymm10,%ymm7,%ymm7
+	leal	(%rdx,%rax,1),%edx
+	rorxl	$27,%esi,%r12d
+	rorxl	$2,%esi,%eax
+	vpaddd	%ymm11,%ymm7,%ymm9
+	xorl	%ebp,%esi
+	addl	%r12d,%edx
+	xorl	%ebx,%esi
+	vmovdqu	%ymm9,224(%rsp)
+	addl	-56(%r13),%ecx
+	leal	(%rcx,%rsi,1),%ecx
+	rorxl	$27,%edx,%r12d
+	rorxl	$2,%edx,%esi
+	xorl	%eax,%edx
+	addl	%r12d,%ecx
+	xorl	%ebp,%edx
+	addl	-52(%r13),%ebx
+	leal	(%rbx,%rdx,1),%ebx
+	rorxl	$27,%ecx,%r12d
+	rorxl	$2,%ecx,%edx
+	xorl	%esi,%ecx
+	addl	%r12d,%ebx
+	xorl	%eax,%ecx
+	addl	-32(%r13),%ebp
+	leal	(%rcx,%rbp,1),%ebp
+	rorxl	$27,%ebx,%r12d
+	rorxl	$2,%ebx,%ecx
+	xorl	%edx,%ebx
+	addl	%r12d,%ebp
+	xorl	%esi,%ebx
+	addl	-28(%r13),%eax
+	leal	(%rax,%rbx,1),%eax
+	rorxl	$27,%ebp,%r12d
+	rorxl	$2,%ebp,%ebx
+	xorl	%ecx,%ebp
+	addl	%r12d,%eax
+	xorl	%edx,%ebp
+	addl	-24(%r13),%esi
+	leal	(%rsi,%rbp,1),%esi
+	rorxl	$27,%eax,%r12d
+	rorxl	$2,%eax,%ebp
+	xorl	%ebx,%eax
+	addl	%r12d,%esi
+	xorl	%ecx,%eax
+	addl	-20(%r13),%edx
+	leal	(%rdx,%rax,1),%edx
+	rorxl	$27,%esi,%r12d
+	addl	%r12d,%edx
+	leaq	128(%rsp),%r13
+
+
+	addl	0(%r8),%edx
+	addl	4(%r8),%esi
+	addl	8(%r8),%ebp
+	movl	%edx,0(%r8)
+	addl	12(%r8),%ebx
+	movl	%esi,4(%r8)
+	movl	%edx,%eax
+	addl	16(%r8),%ecx
+	movl	%ebp,%r12d
+	movl	%ebp,8(%r8)
+	movl	%ebx,%edx
+
+	movl	%ebx,12(%r8)
+	movl	%esi,%ebp
+	movl	%ecx,16(%r8)
+
+	movl	%ecx,%esi
+	movl	%r12d,%ecx
+
+
+	cmpq	%r10,%r9
+	jbe	.Loop_avx2
+
+.Ldone_avx2:
+	vzeroupper
+	movq	-40(%r11),%r14
+.cfi_restore	%r14
+	movq	-32(%r11),%r13
+.cfi_restore	%r13
+	movq	-24(%r11),%r12
+.cfi_restore	%r12
+	movq	-16(%r11),%rbp
+.cfi_restore	%rbp
+	movq	-8(%r11),%rbx
+.cfi_restore	%rbx
+	leaq	(%r11),%rsp
+.cfi_def_cfa_register	%rsp
+.Lepilogue_avx2:
+	.byte	0xf3,0xc3
+.cfi_endproc	
+.size	sha1_block_data_order_avx2,.-sha1_block_data_order_avx2
+.align	64
+K_XX_XX:
+.long	0x5a827999,0x5a827999,0x5a827999,0x5a827999
+.long	0x5a827999,0x5a827999,0x5a827999,0x5a827999
+.long	0x6ed9eba1,0x6ed9eba1,0x6ed9eba1,0x6ed9eba1
+.long	0x6ed9eba1,0x6ed9eba1,0x6ed9eba1,0x6ed9eba1
+.long	0x8f1bbcdc,0x8f1bbcdc,0x8f1bbcdc,0x8f1bbcdc
+.long	0x8f1bbcdc,0x8f1bbcdc,0x8f1bbcdc,0x8f1bbcdc
+.long	0xca62c1d6,0xca62c1d6,0xca62c1d6,0xca62c1d6
+.long	0xca62c1d6,0xca62c1d6,0xca62c1d6,0xca62c1d6
+.long	0x00010203,0x04050607,0x08090a0b,0x0c0d0e0f
+.long	0x00010203,0x04050607,0x08090a0b,0x0c0d0e0f
+.byte	0xf,0xe,0xd,0xc,0xb,0xa,0x9,0x8,0x7,0x6,0x5,0x4,0x3,0x2,0x1,0x0
+.byte	83,72,65,49,32,98,108,111,99,107,32,116,114,97,110,115,102,111,114,109,32,102,111,114,32,120,56,54,95,54,52,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0
+.align	64
+#endif
+.section	.note.GNU-stack,"",@progbits
--- /dev/null
+++ b/third_party/boringssl/linux-x86_64/crypto/fipsmodule/sha256-x86_64.S
@@ -1,0 +1,3973 @@
+// This file is generated from a similarly-named Perl script in the BoringSSL
+// source tree. Do not edit by hand.
+
+#if defined(__has_feature)
+#if __has_feature(memory_sanitizer) && !defined(OPENSSL_NO_ASM)
+#define OPENSSL_NO_ASM
+#endif
+#endif
+
+#if defined(__x86_64__) && !defined(OPENSSL_NO_ASM)
+#if defined(BORINGSSL_PREFIX)
+#include <boringssl_prefix_symbols_asm.h>
+#endif
+.text	
+
+.extern	OPENSSL_ia32cap_P
+.hidden OPENSSL_ia32cap_P
+.globl	sha256_block_data_order
+.hidden sha256_block_data_order
+.type	sha256_block_data_order,@function
+.align	16
+sha256_block_data_order:
+.cfi_startproc	
+	leaq	OPENSSL_ia32cap_P(%rip),%r11
+	movl	0(%r11),%r9d
+	movl	4(%r11),%r10d
+	movl	8(%r11),%r11d
+	andl	$1073741824,%r9d
+	andl	$268435968,%r10d
+	orl	%r9d,%r10d
+	cmpl	$1342177792,%r10d
+	je	.Lavx_shortcut
+	testl	$512,%r10d
+	jnz	.Lssse3_shortcut
+	movq	%rsp,%rax
+.cfi_def_cfa_register	%rax
+	pushq	%rbx
+.cfi_offset	%rbx,-16
+	pushq	%rbp
+.cfi_offset	%rbp,-24
+	pushq	%r12
+.cfi_offset	%r12,-32
+	pushq	%r13
+.cfi_offset	%r13,-40
+	pushq	%r14
+.cfi_offset	%r14,-48
+	pushq	%r15
+.cfi_offset	%r15,-56
+	shlq	$4,%rdx
+	subq	$64+32,%rsp
+	leaq	(%rsi,%rdx,4),%rdx
+	andq	$-64,%rsp
+	movq	%rdi,64+0(%rsp)
+	movq	%rsi,64+8(%rsp)
+	movq	%rdx,64+16(%rsp)
+	movq	%rax,88(%rsp)
+.cfi_escape	0x0f,0x06,0x77,0xd8,0x00,0x06,0x23,0x08
+.Lprologue:
+
+	movl	0(%rdi),%eax
+	movl	4(%rdi),%ebx
+	movl	8(%rdi),%ecx
+	movl	12(%rdi),%edx
+	movl	16(%rdi),%r8d
+	movl	20(%rdi),%r9d
+	movl	24(%rdi),%r10d
+	movl	28(%rdi),%r11d
+	jmp	.Lloop
+
+.align	16
+.Lloop:
+	movl	%ebx,%edi
+	leaq	K256(%rip),%rbp
+	xorl	%ecx,%edi
+	movl	0(%rsi),%r12d
+	movl	%r8d,%r13d
+	movl	%eax,%r14d
+	bswapl	%r12d
+	rorl	$14,%r13d
+	movl	%r9d,%r15d
+
+	xorl	%r8d,%r13d
+	rorl	$9,%r14d
+	xorl	%r10d,%r15d
+
+	movl	%r12d,0(%rsp)
+	xorl	%eax,%r14d
+	andl	%r8d,%r15d
+
+	rorl	$5,%r13d
+	addl	%r11d,%r12d
+	xorl	%r10d,%r15d
+
+	rorl	$11,%r14d
+	xorl	%r8d,%r13d
+	addl	%r15d,%r12d
+
+	movl	%eax,%r15d
+	addl	(%rbp),%r12d
+	xorl	%eax,%r14d
+
+	xorl	%ebx,%r15d
+	rorl	$6,%r13d
+	movl	%ebx,%r11d
+
+	andl	%r15d,%edi
+	rorl	$2,%r14d
+	addl	%r13d,%r12d
+
+	xorl	%edi,%r11d
+	addl	%r12d,%edx
+	addl	%r12d,%r11d
+
+	leaq	4(%rbp),%rbp
+	addl	%r14d,%r11d
+	movl	4(%rsi),%r12d
+	movl	%edx,%r13d
+	movl	%r11d,%r14d
+	bswapl	%r12d
+	rorl	$14,%r13d
+	movl	%r8d,%edi
+
+	xorl	%edx,%r13d
+	rorl	$9,%r14d
+	xorl	%r9d,%edi
+
+	movl	%r12d,4(%rsp)
+	xorl	%r11d,%r14d
+	andl	%edx,%edi
+
+	rorl	$5,%r13d
+	addl	%r10d,%r12d
+	xorl	%r9d,%edi
+
+	rorl	$11,%r14d
+	xorl	%edx,%r13d
+	addl	%edi,%r12d
+
+	movl	%r11d,%edi
+	addl	(%rbp),%r12d
+	xorl	%r11d,%r14d
+
+	xorl	%eax,%edi
+	rorl	$6,%r13d
+	movl	%eax,%r10d
+
+	andl	%edi,%r15d
+	rorl	$2,%r14d
+	addl	%r13d,%r12d
+
+	xorl	%r15d,%r10d
+	addl	%r12d,%ecx
+	addl	%r12d,%r10d
+
+	leaq	4(%rbp),%rbp
+	addl	%r14d,%r10d
+	movl	8(%rsi),%r12d
+	movl	%ecx,%r13d
+	movl	%r10d,%r14d
+	bswapl	%r12d
+	rorl	$14,%r13d
+	movl	%edx,%r15d
+
+	xorl	%ecx,%r13d
+	rorl	$9,%r14d
+	xorl	%r8d,%r15d
+
+	movl	%r12d,8(%rsp)
+	xorl	%r10d,%r14d
+	andl	%ecx,%r15d
+
+	rorl	$5,%r13d
+	addl	%r9d,%r12d
+	xorl	%r8d,%r15d
+
+	rorl	$11,%r14d
+	xorl	%ecx,%r13d
+	addl	%r15d,%r12d
+
+	movl	%r10d,%r15d
+	addl	(%rbp),%r12d
+	xorl	%r10d,%r14d
+
+	xorl	%r11d,%r15d
+	rorl	$6,%r13d
+	movl	%r11d,%r9d
+
+	andl	%r15d,%edi
+	rorl	$2,%r14d
+	addl	%r13d,%r12d
+
+	xorl	%edi,%r9d
+	addl	%r12d,%ebx
+	addl	%r12d,%r9d
+
+	leaq	4(%rbp),%rbp
+	addl	%r14d,%r9d
+	movl	12(%rsi),%r12d
+	movl	%ebx,%r13d
+	movl	%r9d,%r14d
+	bswapl	%r12d
+	rorl	$14,%r13d
+	movl	%ecx,%edi
+
+	xorl	%ebx,%r13d
+	rorl	$9,%r14d
+	xorl	%edx,%edi
+
+	movl	%r12d,12(%rsp)
+	xorl	%r9d,%r14d
+	andl	%ebx,%edi
+
+	rorl	$5,%r13d
+	addl	%r8d,%r12d
+	xorl	%edx,%edi
+
+	rorl	$11,%r14d
+	xorl	%ebx,%r13d
+	addl	%edi,%r12d
+
+	movl	%r9d,%edi
+	addl	(%rbp),%r12d
+	xorl	%r9d,%r14d
+
+	xorl	%r10d,%edi
+	rorl	$6,%r13d
+	movl	%r10d,%r8d
+
+	andl	%edi,%r15d
+	rorl	$2,%r14d
+	addl	%r13d,%r12d
+
+	xorl	%r15d,%r8d
+	addl	%r12d,%eax
+	addl	%r12d,%r8d
+
+	leaq	20(%rbp),%rbp
+	addl	%r14d,%r8d
+	movl	16(%rsi),%r12d
+	movl	%eax,%r13d
+	movl	%r8d,%r14d
+	bswapl	%r12d
+	rorl	$14,%r13d
+	movl	%ebx,%r15d
+
+	xorl	%eax,%r13d
+	rorl	$9,%r14d
+	xorl	%ecx,%r15d
+
+	movl	%r12d,16(%rsp)
+	xorl	%r8d,%r14d
+	andl	%eax,%r15d
+
+	rorl	$5,%r13d
+	addl	%edx,%r12d
+	xorl	%ecx,%r15d
+
+	rorl	$11,%r14d
+	xorl	%eax,%r13d
+	addl	%r15d,%r12d
+
+	movl	%r8d,%r15d
+	addl	(%rbp),%r12d
+	xorl	%r8d,%r14d
+
+	xorl	%r9d,%r15d
+	rorl	$6,%r13d
+	movl	%r9d,%edx
+
+	andl	%r15d,%edi
+	rorl	$2,%r14d
+	addl	%r13d,%r12d
+
+	xorl	%edi,%edx
+	addl	%r12d,%r11d
+	addl	%r12d,%edx
+
+	leaq	4(%rbp),%rbp
+	addl	%r14d,%edx
+	movl	20(%rsi),%r12d
+	movl	%r11d,%r13d
+	movl	%edx,%r14d
+	bswapl	%r12d
+	rorl	$14,%r13d
+	movl	%eax,%edi
+
+	xorl	%r11d,%r13d
+	rorl	$9,%r14d
+	xorl	%ebx,%edi
+
+	movl	%r12d,20(%rsp)
+	xorl	%edx,%r14d
+	andl	%r11d,%edi
+
+	rorl	$5,%r13d
+	addl	%ecx,%r12d
+	xorl	%ebx,%edi
+
+	rorl	$11,%r14d
+	xorl	%r11d,%r13d
+	addl	%edi,%r12d
+
+	movl	%edx,%edi
+	addl	(%rbp),%r12d
+	xorl	%edx,%r14d
+
+	xorl	%r8d,%edi
+	rorl	$6,%r13d
+	movl	%r8d,%ecx
+
+	andl	%edi,%r15d
+	rorl	$2,%r14d
+	addl	%r13d,%r12d
+
+	xorl	%r15d,%ecx
+	addl	%r12d,%r10d
+	addl	%r12d,%ecx
+
+	leaq	4(%rbp),%rbp
+	addl	%r14d,%ecx
+	movl	24(%rsi),%r12d
+	movl	%r10d,%r13d
+	movl	%ecx,%r14d
+	bswapl	%r12d
+	rorl	$14,%r13d
+	movl	%r11d,%r15d
+
+	xorl	%r10d,%r13d
+	rorl	$9,%r14d
+	xorl	%eax,%r15d
+
+	movl	%r12d,24(%rsp)
+	xorl	%ecx,%r14d
+	andl	%r10d,%r15d
+
+	rorl	$5,%r13d
+	addl	%ebx,%r12d
+	xorl	%eax,%r15d
+
+	rorl	$11,%r14d
+	xorl	%r10d,%r13d
+	addl	%r15d,%r12d
+
+	movl	%ecx,%r15d
+	addl	(%rbp),%r12d
+	xorl	%ecx,%r14d
+
+	xorl	%edx,%r15d
+	rorl	$6,%r13d
+	movl	%edx,%ebx
+
+	andl	%r15d,%edi
+	rorl	$2,%r14d
+	addl	%r13d,%r12d
+
+	xorl	%edi,%ebx
+	addl	%r12d,%r9d
+	addl	%r12d,%ebx
+
+	leaq	4(%rbp),%rbp
+	addl	%r14d,%ebx
+	movl	28(%rsi),%r12d
+	movl	%r9d,%r13d
+	movl	%ebx,%r14d
+	bswapl	%r12d
+	rorl	$14,%r13d
+	movl	%r10d,%edi
+
+	xorl	%r9d,%r13d
+	rorl	$9,%r14d
+	xorl	%r11d,%edi
+
+	movl	%r12d,28(%rsp)
+	xorl	%ebx,%r14d
+	andl	%r9d,%edi
+
+	rorl	$5,%r13d
+	addl	%eax,%r12d
+	xorl	%r11d,%edi
+
+	rorl	$11,%r14d
+	xorl	%r9d,%r13d
+	addl	%edi,%r12d
+
+	movl	%ebx,%edi
+	addl	(%rbp),%r12d
+	xorl	%ebx,%r14d
+
+	xorl	%ecx,%edi
+	rorl	$6,%r13d
+	movl	%ecx,%eax
+
+	andl	%edi,%r15d
+	rorl	$2,%r14d
+	addl	%r13d,%r12d
+
+	xorl	%r15d,%eax
+	addl	%r12d,%r8d
+	addl	%r12d,%eax
+
+	leaq	20(%rbp),%rbp
+	addl	%r14d,%eax
+	movl	32(%rsi),%r12d
+	movl	%r8d,%r13d
+	movl	%eax,%r14d
+	bswapl	%r12d
+	rorl	$14,%r13d
+	movl	%r9d,%r15d
+
+	xorl	%r8d,%r13d
+	rorl	$9,%r14d
+	xorl	%r10d,%r15d
+
+	movl	%r12d,32(%rsp)
+	xorl	%eax,%r14d
+	andl	%r8d,%r15d
+
+	rorl	$5,%r13d
+	addl	%r11d,%r12d
+	xorl	%r10d,%r15d
+
+	rorl	$11,%r14d
+	xorl	%r8d,%r13d
+	addl	%r15d,%r12d
+
+	movl	%eax,%r15d
+	addl	(%rbp),%r12d
+	xorl	%eax,%r14d
+
+	xorl	%ebx,%r15d
+	rorl	$6,%r13d
+	movl	%ebx,%r11d
+
+	andl	%r15d,%edi
+	rorl	$2,%r14d
+	addl	%r13d,%r12d
+
+	xorl	%edi,%r11d
+	addl	%r12d,%edx
+	addl	%r12d,%r11d
+
+	leaq	4(%rbp),%rbp
+	addl	%r14d,%r11d
+	movl	36(%rsi),%r12d
+	movl	%edx,%r13d
+	movl	%r11d,%r14d
+	bswapl	%r12d
+	rorl	$14,%r13d
+	movl	%r8d,%edi
+
+	xorl	%edx,%r13d
+	rorl	$9,%r14d
+	xorl	%r9d,%edi
+
+	movl	%r12d,36(%rsp)
+	xorl	%r11d,%r14d
+	andl	%edx,%edi
+
+	rorl	$5,%r13d
+	addl	%r10d,%r12d
+	xorl	%r9d,%edi
+
+	rorl	$11,%r14d
+	xorl	%edx,%r13d
+	addl	%edi,%r12d
+
+	movl	%r11d,%edi
+	addl	(%rbp),%r12d
+	xorl	%r11d,%r14d
+
+	xorl	%eax,%edi
+	rorl	$6,%r13d
+	movl	%eax,%r10d
+
+	andl	%edi,%r15d
+	rorl	$2,%r14d
+	addl	%r13d,%r12d
+
+	xorl	%r15d,%r10d
+	addl	%r12d,%ecx
+	addl	%r12d,%r10d
+
+	leaq	4(%rbp),%rbp
+	addl	%r14d,%r10d
+	movl	40(%rsi),%r12d
+	movl	%ecx,%r13d
+	movl	%r10d,%r14d
+	bswapl	%r12d
+	rorl	$14,%r13d
+	movl	%edx,%r15d
+
+	xorl	%ecx,%r13d
+	rorl	$9,%r14d
+	xorl	%r8d,%r15d
+
+	movl	%r12d,40(%rsp)
+	xorl	%r10d,%r14d
+	andl	%ecx,%r15d
+
+	rorl	$5,%r13d
+	addl	%r9d,%r12d
+	xorl	%r8d,%r15d
+
+	rorl	$11,%r14d
+	xorl	%ecx,%r13d
+	addl	%r15d,%r12d
+
+	movl	%r10d,%r15d
+	addl	(%rbp),%r12d
+	xorl	%r10d,%r14d
+
+	xorl	%r11d,%r15d
+	rorl	$6,%r13d
+	movl	%r11d,%r9d
+
+	andl	%r15d,%edi
+	rorl	$2,%r14d
+	addl	%r13d,%r12d
+
+	xorl	%edi,%r9d
+	addl	%r12d,%ebx
+	addl	%r12d,%r9d
+
+	leaq	4(%rbp),%rbp
+	addl	%r14d,%r9d
+	movl	44(%rsi),%r12d
+	movl	%ebx,%r13d
+	movl	%r9d,%r14d
+	bswapl	%r12d
+	rorl	$14,%r13d
+	movl	%ecx,%edi
+
+	xorl	%ebx,%r13d
+	rorl	$9,%r14d
+	xorl	%edx,%edi
+
+	movl	%r12d,44(%rsp)
+	xorl	%r9d,%r14d
+	andl	%ebx,%edi
+
+	rorl	$5,%r13d
+	addl	%r8d,%r12d
+	xorl	%edx,%edi
+
+	rorl	$11,%r14d
+	xorl	%ebx,%r13d
+	addl	%edi,%r12d
+
+	movl	%r9d,%edi
+	addl	(%rbp),%r12d
+	xorl	%r9d,%r14d
+
+	xorl	%r10d,%edi
+	rorl	$6,%r13d
+	movl	%r10d,%r8d
+
+	andl	%edi,%r15d
+	rorl	$2,%r14d
+	addl	%r13d,%r12d
+
+	xorl	%r15d,%r8d
+	addl	%r12d,%eax
+	addl	%r12d,%r8d
+
+	leaq	20(%rbp),%rbp
+	addl	%r14d,%r8d
+	movl	48(%rsi),%r12d
+	movl	%eax,%r13d
+	movl	%r8d,%r14d
+	bswapl	%r12d
+	rorl	$14,%r13d
+	movl	%ebx,%r15d
+
+	xorl	%eax,%r13d
+	rorl	$9,%r14d
+	xorl	%ecx,%r15d
+
+	movl	%r12d,48(%rsp)
+	xorl	%r8d,%r14d
+	andl	%eax,%r15d
+
+	rorl	$5,%r13d
+	addl	%edx,%r12d
+	xorl	%ecx,%r15d
+
+	rorl	$11,%r14d
+	xorl	%eax,%r13d
+	addl	%r15d,%r12d
+
+	movl	%r8d,%r15d
+	addl	(%rbp),%r12d
+	xorl	%r8d,%r14d
+
+	xorl	%r9d,%r15d
+	rorl	$6,%r13d
+	movl	%r9d,%edx
+
+	andl	%r15d,%edi
+	rorl	$2,%r14d
+	addl	%r13d,%r12d
+
+	xorl	%edi,%edx
+	addl	%r12d,%r11d
+	addl	%r12d,%edx
+
+	leaq	4(%rbp),%rbp
+	addl	%r14d,%edx
+	movl	52(%rsi),%r12d
+	movl	%r11d,%r13d
+	movl	%edx,%r14d
+	bswapl	%r12d
+	rorl	$14,%r13d
+	movl	%eax,%edi
+
+	xorl	%r11d,%r13d
+	rorl	$9,%r14d
+	xorl	%ebx,%edi
+
+	movl	%r12d,52(%rsp)
+	xorl	%edx,%r14d
+	andl	%r11d,%edi
+
+	rorl	$5,%r13d
+	addl	%ecx,%r12d
+	xorl	%ebx,%edi
+
+	rorl	$11,%r14d
+	xorl	%r11d,%r13d
+	addl	%edi,%r12d
+
+	movl	%edx,%edi
+	addl	(%rbp),%r12d
+	xorl	%edx,%r14d
+
+	xorl	%r8d,%edi
+	rorl	$6,%r13d
+	movl	%r8d,%ecx
+
+	andl	%edi,%r15d
+	rorl	$2,%r14d
+	addl	%r13d,%r12d
+
+	xorl	%r15d,%ecx
+	addl	%r12d,%r10d
+	addl	%r12d,%ecx
+
+	leaq	4(%rbp),%rbp
+	addl	%r14d,%ecx
+	movl	56(%rsi),%r12d
+	movl	%r10d,%r13d
+	movl	%ecx,%r14d
+	bswapl	%r12d
+	rorl	$14,%r13d
+	movl	%r11d,%r15d
+
+	xorl	%r10d,%r13d
+	rorl	$9,%r14d
+	xorl	%eax,%r15d
+
+	movl	%r12d,56(%rsp)
+	xorl	%ecx,%r14d
+	andl	%r10d,%r15d
+
+	rorl	$5,%r13d
+	addl	%ebx,%r12d
+	xorl	%eax,%r15d
+
+	rorl	$11,%r14d
+	xorl	%r10d,%r13d
+	addl	%r15d,%r12d
+
+	movl	%ecx,%r15d
+	addl	(%rbp),%r12d
+	xorl	%ecx,%r14d
+
+	xorl	%edx,%r15d
+	rorl	$6,%r13d
+	movl	%edx,%ebx
+
+	andl	%r15d,%edi
+	rorl	$2,%r14d
+	addl	%r13d,%r12d
+
+	xorl	%edi,%ebx
+	addl	%r12d,%r9d
+	addl	%r12d,%ebx
+
+	leaq	4(%rbp),%rbp
+	addl	%r14d,%ebx
+	movl	60(%rsi),%r12d
+	movl	%r9d,%r13d
+	movl	%ebx,%r14d
+	bswapl	%r12d
+	rorl	$14,%r13d
+	movl	%r10d,%edi
+
+	xorl	%r9d,%r13d
+	rorl	$9,%r14d
+	xorl	%r11d,%edi
+
+	movl	%r12d,60(%rsp)
+	xorl	%ebx,%r14d
+	andl	%r9d,%edi
+
+	rorl	$5,%r13d
+	addl	%eax,%r12d
+	xorl	%r11d,%edi
+
+	rorl	$11,%r14d
+	xorl	%r9d,%r13d
+	addl	%edi,%r12d
+
+	movl	%ebx,%edi
+	addl	(%rbp),%r12d
+	xorl	%ebx,%r14d
+
+	xorl	%ecx,%edi
+	rorl	$6,%r13d
+	movl	%ecx,%eax
+
+	andl	%edi,%r15d
+	rorl	$2,%r14d
+	addl	%r13d,%r12d
+
+	xorl	%r15d,%eax
+	addl	%r12d,%r8d
+	addl	%r12d,%eax
+
+	leaq	20(%rbp),%rbp
+	jmp	.Lrounds_16_xx
+.align	16
+.Lrounds_16_xx:
+	movl	4(%rsp),%r13d
+	movl	56(%rsp),%r15d
+
+	movl	%r13d,%r12d
+	rorl	$11,%r13d
+	addl	%r14d,%eax
+	movl	%r15d,%r14d
+	rorl	$2,%r15d
+
+	xorl	%r12d,%r13d
+	shrl	$3,%r12d
+	rorl	$7,%r13d
+	xorl	%r14d,%r15d
+	shrl	$10,%r14d
+
+	rorl	$17,%r15d
+	xorl	%r13d,%r12d
+	xorl	%r14d,%r15d
+	addl	36(%rsp),%r12d
+
+	addl	0(%rsp),%r12d
+	movl	%r8d,%r13d
+	addl	%r15d,%r12d
+	movl	%eax,%r14d
+	rorl	$14,%r13d
+	movl	%r9d,%r15d
+
+	xorl	%r8d,%r13d
+	rorl	$9,%r14d
+	xorl	%r10d,%r15d
+
+	movl	%r12d,0(%rsp)
+	xorl	%eax,%r14d
+	andl	%r8d,%r15d
+
+	rorl	$5,%r13d
+	addl	%r11d,%r12d
+	xorl	%r10d,%r15d
+
+	rorl	$11,%r14d
+	xorl	%r8d,%r13d
+	addl	%r15d,%r12d
+
+	movl	%eax,%r15d
+	addl	(%rbp),%r12d
+	xorl	%eax,%r14d
+
+	xorl	%ebx,%r15d
+	rorl	$6,%r13d
+	movl	%ebx,%r11d
+
+	andl	%r15d,%edi
+	rorl	$2,%r14d
+	addl	%r13d,%r12d
+
+	xorl	%edi,%r11d
+	addl	%r12d,%edx
+	addl	%r12d,%r11d
+
+	leaq	4(%rbp),%rbp
+	movl	8(%rsp),%r13d
+	movl	60(%rsp),%edi
+
+	movl	%r13d,%r12d
+	rorl	$11,%r13d
+	addl	%r14d,%r11d
+	movl	%edi,%r14d
+	rorl	$2,%edi
+
+	xorl	%r12d,%r13d
+	shrl	$3,%r12d
+	rorl	$7,%r13d
+	xorl	%r14d,%edi
+	shrl	$10,%r14d
+
+	rorl	$17,%edi
+	xorl	%r13d,%r12d
+	xorl	%r14d,%edi
+	addl	40(%rsp),%r12d
+
+	addl	4(%rsp),%r12d
+	movl	%edx,%r13d
+	addl	%edi,%r12d
+	movl	%r11d,%r14d
+	rorl	$14,%r13d
+	movl	%r8d,%edi
+
+	xorl	%edx,%r13d
+	rorl	$9,%r14d
+	xorl	%r9d,%edi
+
+	movl	%r12d,4(%rsp)
+	xorl	%r11d,%r14d
+	andl	%edx,%edi
+
+	rorl	$5,%r13d
+	addl	%r10d,%r12d
+	xorl	%r9d,%edi
+
+	rorl	$11,%r14d
+	xorl	%edx,%r13d
+	addl	%edi,%r12d
+
+	movl	%r11d,%edi
+	addl	(%rbp),%r12d
+	xorl	%r11d,%r14d
+
+	xorl	%eax,%edi
+	rorl	$6,%r13d
+	movl	%eax,%r10d
+
+	andl	%edi,%r15d
+	rorl	$2,%r14d
+	addl	%r13d,%r12d
+
+	xorl	%r15d,%r10d
+	addl	%r12d,%ecx
+	addl	%r12d,%r10d
+
+	leaq	4(%rbp),%rbp
+	movl	12(%rsp),%r13d
+	movl	0(%rsp),%r15d
+
+	movl	%r13d,%r12d
+	rorl	$11,%r13d
+	addl	%r14d,%r10d
+	movl	%r15d,%r14d
+	rorl	$2,%r15d
+
+	xorl	%r12d,%r13d
+	shrl	$3,%r12d
+	rorl	$7,%r13d
+	xorl	%r14d,%r15d
+	shrl	$10,%r14d
+
+	rorl	$17,%r15d
+	xorl	%r13d,%r12d
+	xorl	%r14d,%r15d
+	addl	44(%rsp),%r12d
+
+	addl	8(%rsp),%r12d
+	movl	%ecx,%r13d
+	addl	%r15d,%r12d
+	movl	%r10d,%r14d
+	rorl	$14,%r13d
+	movl	%edx,%r15d
+
+	xorl	%ecx,%r13d
+	rorl	$9,%r14d
+	xorl	%r8d,%r15d
+
+	movl	%r12d,8(%rsp)
+	xorl	%r10d,%r14d
+	andl	%ecx,%r15d
+
+	rorl	$5,%r13d
+	addl	%r9d,%r12d
+	xorl	%r8d,%r15d
+
+	rorl	$11,%r14d
+	xorl	%ecx,%r13d
+	addl	%r15d,%r12d
+
+	movl	%r10d,%r15d
+	addl	(%rbp),%r12d
+	xorl	%r10d,%r14d
+
+	xorl	%r11d,%r15d
+	rorl	$6,%r13d
+	movl	%r11d,%r9d
+
+	andl	%r15d,%edi
+	rorl	$2,%r14d
+	addl	%r13d,%r12d
+
+	xorl	%edi,%r9d
+	addl	%r12d,%ebx
+	addl	%r12d,%r9d
+
+	leaq	4(%rbp),%rbp
+	movl	16(%rsp),%r13d
+	movl	4(%rsp),%edi
+
+	movl	%r13d,%r12d
+	rorl	$11,%r13d
+	addl	%r14d,%r9d
+	movl	%edi,%r14d
+	rorl	$2,%edi
+
+	xorl	%r12d,%r13d
+	shrl	$3,%r12d
+	rorl	$7,%r13d
+	xorl	%r14d,%edi
+	shrl	$10,%r14d
+
+	rorl	$17,%edi
+	xorl	%r13d,%r12d
+	xorl	%r14d,%edi
+	addl	48(%rsp),%r12d
+
+	addl	12(%rsp),%r12d
+	movl	%ebx,%r13d
+	addl	%edi,%r12d
+	movl	%r9d,%r14d
+	rorl	$14,%r13d
+	movl	%ecx,%edi
+
+	xorl	%ebx,%r13d
+	rorl	$9,%r14d
+	xorl	%edx,%edi
+
+	movl	%r12d,12(%rsp)
+	xorl	%r9d,%r14d
+	andl	%ebx,%edi
+
+	rorl	$5,%r13d
+	addl	%r8d,%r12d
+	xorl	%edx,%edi
+
+	rorl	$11,%r14d
+	xorl	%ebx,%r13d
+	addl	%edi,%r12d
+
+	movl	%r9d,%edi
+	addl	(%rbp),%r12d
+	xorl	%r9d,%r14d
+
+	xorl	%r10d,%edi
+	rorl	$6,%r13d
+	movl	%r10d,%r8d
+
+	andl	%edi,%r15d
+	rorl	$2,%r14d
+	addl	%r13d,%r12d
+
+	xorl	%r15d,%r8d
+	addl	%r12d,%eax
+	addl	%r12d,%r8d
+
+	leaq	20(%rbp),%rbp
+	movl	20(%rsp),%r13d
+	movl	8(%rsp),%r15d
+
+	movl	%r13d,%r12d
+	rorl	$11,%r13d
+	addl	%r14d,%r8d
+	movl	%r15d,%r14d
+	rorl	$2,%r15d
+
+	xorl	%r12d,%r13d
+	shrl	$3,%r12d
+	rorl	$7,%r13d
+	xorl	%r14d,%r15d
+	shrl	$10,%r14d
+
+	rorl	$17,%r15d
+	xorl	%r13d,%r12d
+	xorl	%r14d,%r15d
+	addl	52(%rsp),%r12d
+
+	addl	16(%rsp),%r12d
+	movl	%eax,%r13d
+	addl	%r15d,%r12d
+	movl	%r8d,%r14d
+	rorl	$14,%r13d
+	movl	%ebx,%r15d
+
+	xorl	%eax,%r13d
+	rorl	$9,%r14d
+	xorl	%ecx,%r15d
+
+	movl	%r12d,16(%rsp)
+	xorl	%r8d,%r14d
+	andl	%eax,%r15d
+
+	rorl	$5,%r13d
+	addl	%edx,%r12d
+	xorl	%ecx,%r15d
+
+	rorl	$11,%r14d
+	xorl	%eax,%r13d
+	addl	%r15d,%r12d
+
+	movl	%r8d,%r15d
+	addl	(%rbp),%r12d
+	xorl	%r8d,%r14d
+
+	xorl	%r9d,%r15d
+	rorl	$6,%r13d
+	movl	%r9d,%edx
+
+	andl	%r15d,%edi
+	rorl	$2,%r14d
+	addl	%r13d,%r12d
+
+	xorl	%edi,%edx
+	addl	%r12d,%r11d
+	addl	%r12d,%edx
+
+	leaq	4(%rbp),%rbp
+	movl	24(%rsp),%r13d
+	movl	12(%rsp),%edi
+
+	movl	%r13d,%r12d
+	rorl	$11,%r13d
+	addl	%r14d,%edx
+	movl	%edi,%r14d
+	rorl	$2,%edi
+
+	xorl	%r12d,%r13d
+	shrl	$3,%r12d
+	rorl	$7,%r13d
+	xorl	%r14d,%edi
+	shrl	$10,%r14d
+
+	rorl	$17,%edi
+	xorl	%r13d,%r12d
+	xorl	%r14d,%edi
+	addl	56(%rsp),%r12d
+
+	addl	20(%rsp),%r12d
+	movl	%r11d,%r13d
+	addl	%edi,%r12d
+	movl	%edx,%r14d
+	rorl	$14,%r13d
+	movl	%eax,%edi
+
+	xorl	%r11d,%r13d
+	rorl	$9,%r14d
+	xorl	%ebx,%edi
+
+	movl	%r12d,20(%rsp)
+	xorl	%edx,%r14d
+	andl	%r11d,%edi
+
+	rorl	$5,%r13d
+	addl	%ecx,%r12d
+	xorl	%ebx,%edi
+
+	rorl	$11,%r14d
+	xorl	%r11d,%r13d
+	addl	%edi,%r12d
+
+	movl	%edx,%edi
+	addl	(%rbp),%r12d
+	xorl	%edx,%r14d
+
+	xorl	%r8d,%edi
+	rorl	$6,%r13d
+	movl	%r8d,%ecx
+
+	andl	%edi,%r15d
+	rorl	$2,%r14d
+	addl	%r13d,%r12d
+
+	xorl	%r15d,%ecx
+	addl	%r12d,%r10d
+	addl	%r12d,%ecx
+
+	leaq	4(%rbp),%rbp
+	movl	28(%rsp),%r13d
+	movl	16(%rsp),%r15d
+
+	movl	%r13d,%r12d
+	rorl	$11,%r13d
+	addl	%r14d,%ecx
+	movl	%r15d,%r14d
+	rorl	$2,%r15d
+
+	xorl	%r12d,%r13d
+	shrl	$3,%r12d
+	rorl	$7,%r13d
+	xorl	%r14d,%r15d
+	shrl	$10,%r14d
+
+	rorl	$17,%r15d
+	xorl	%r13d,%r12d
+	xorl	%r14d,%r15d
+	addl	60(%rsp),%r12d
+
+	addl	24(%rsp),%r12d
+	movl	%r10d,%r13d
+	addl	%r15d,%r12d
+	movl	%ecx,%r14d
+	rorl	$14,%r13d
+	movl	%r11d,%r15d
+
+	xorl	%r10d,%r13d
+	rorl	$9,%r14d
+	xorl	%eax,%r15d
+
+	movl	%r12d,24(%rsp)
+	xorl	%ecx,%r14d
+	andl	%r10d,%r15d
+
+	rorl	$5,%r13d
+	addl	%ebx,%r12d
+	xorl	%eax,%r15d
+
+	rorl	$11,%r14d
+	xorl	%r10d,%r13d
+	addl	%r15d,%r12d
+
+	movl	%ecx,%r15d
+	addl	(%rbp),%r12d
+	xorl	%ecx,%r14d
+
+	xorl	%edx,%r15d
+	rorl	$6,%r13d
+	movl	%edx,%ebx
+
+	andl	%r15d,%edi
+	rorl	$2,%r14d
+	addl	%r13d,%r12d
+
+	xorl	%edi,%ebx
+	addl	%r12d,%r9d
+	addl	%r12d,%ebx
+
+	leaq	4(%rbp),%rbp
+	movl	32(%rsp),%r13d
+	movl	20(%rsp),%edi
+
+	movl	%r13d,%r12d
+	rorl	$11,%r13d
+	addl	%r14d,%ebx
+	movl	%edi,%r14d
+	rorl	$2,%edi
+
+	xorl	%r12d,%r13d
+	shrl	$3,%r12d
+	rorl	$7,%r13d
+	xorl	%r14d,%edi
+	shrl	$10,%r14d
+
+	rorl	$17,%edi
+	xorl	%r13d,%r12d
+	xorl	%r14d,%edi
+	addl	0(%rsp),%r12d
+
+	addl	28(%rsp),%r12d
+	movl	%r9d,%r13d
+	addl	%edi,%r12d
+	movl	%ebx,%r14d
+	rorl	$14,%r13d
+	movl	%r10d,%edi
+
+	xorl	%r9d,%r13d
+	rorl	$9,%r14d
+	xorl	%r11d,%edi
+
+	movl	%r12d,28(%rsp)
+	xorl	%ebx,%r14d
+	andl	%r9d,%edi
+
+	rorl	$5,%r13d
+	addl	%eax,%r12d
+	xorl	%r11d,%edi
+
+	rorl	$11,%r14d
+	xorl	%r9d,%r13d
+	addl	%edi,%r12d
+
+	movl	%ebx,%edi
+	addl	(%rbp),%r12d
+	xorl	%ebx,%r14d
+
+	xorl	%ecx,%edi
+	rorl	$6,%r13d
+	movl	%ecx,%eax
+
+	andl	%edi,%r15d
+	rorl	$2,%r14d
+	addl	%r13d,%r12d
+
+	xorl	%r15d,%eax
+	addl	%r12d,%r8d
+	addl	%r12d,%eax
+
+	leaq	20(%rbp),%rbp
+	movl	36(%rsp),%r13d
+	movl	24(%rsp),%r15d
+
+	movl	%r13d,%r12d
+	rorl	$11,%r13d
+	addl	%r14d,%eax
+	movl	%r15d,%r14d
+	rorl	$2,%r15d
+
+	xorl	%r12d,%r13d
+	shrl	$3,%r12d
+	rorl	$7,%r13d
+	xorl	%r14d,%r15d
+	shrl	$10,%r14d
+
+	rorl	$17,%r15d
+	xorl	%r13d,%r12d
+	xorl	%r14d,%r15d
+	addl	4(%rsp),%r12d
+
+	addl	32(%rsp),%r12d
+	movl	%r8d,%r13d
+	addl	%r15d,%r12d
+	movl	%eax,%r14d
+	rorl	$14,%r13d
+	movl	%r9d,%r15d
+
+	xorl	%r8d,%r13d
+	rorl	$9,%r14d
+	xorl	%r10d,%r15d
+
+	movl	%r12d,32(%rsp)
+	xorl	%eax,%r14d
+	andl	%r8d,%r15d
+
+	rorl	$5,%r13d
+	addl	%r11d,%r12d
+	xorl	%r10d,%r15d
+
+	rorl	$11,%r14d
+	xorl	%r8d,%r13d
+	addl	%r15d,%r12d
+
+	movl	%eax,%r15d
+	addl	(%rbp),%r12d
+	xorl	%eax,%r14d
+
+	xorl	%ebx,%r15d
+	rorl	$6,%r13d
+	movl	%ebx,%r11d
+
+	andl	%r15d,%edi
+	rorl	$2,%r14d
+	addl	%r13d,%r12d
+
+	xorl	%edi,%r11d
+	addl	%r12d,%edx
+	addl	%r12d,%r11d
+
+	leaq	4(%rbp),%rbp
+	movl	40(%rsp),%r13d
+	movl	28(%rsp),%edi
+
+	movl	%r13d,%r12d
+	rorl	$11,%r13d
+	addl	%r14d,%r11d
+	movl	%edi,%r14d
+	rorl	$2,%edi
+
+	xorl	%r12d,%r13d
+	shrl	$3,%r12d
+	rorl	$7,%r13d
+	xorl	%r14d,%edi
+	shrl	$10,%r14d
+
+	rorl	$17,%edi
+	xorl	%r13d,%r12d
+	xorl	%r14d,%edi
+	addl	8(%rsp),%r12d
+
+	addl	36(%rsp),%r12d
+	movl	%edx,%r13d
+	addl	%edi,%r12d
+	movl	%r11d,%r14d
+	rorl	$14,%r13d
+	movl	%r8d,%edi
+
+	xorl	%edx,%r13d
+	rorl	$9,%r14d
+	xorl	%r9d,%edi
+
+	movl	%r12d,36(%rsp)
+	xorl	%r11d,%r14d
+	andl	%edx,%edi
+
+	rorl	$5,%r13d
+	addl	%r10d,%r12d
+	xorl	%r9d,%edi
+
+	rorl	$11,%r14d
+	xorl	%edx,%r13d
+	addl	%edi,%r12d
+
+	movl	%r11d,%edi
+	addl	(%rbp),%r12d
+	xorl	%r11d,%r14d
+
+	xorl	%eax,%edi
+	rorl	$6,%r13d
+	movl	%eax,%r10d
+
+	andl	%edi,%r15d
+	rorl	$2,%r14d
+	addl	%r13d,%r12d
+
+	xorl	%r15d,%r10d
+	addl	%r12d,%ecx
+	addl	%r12d,%r10d
+
+	leaq	4(%rbp),%rbp
+	movl	44(%rsp),%r13d
+	movl	32(%rsp),%r15d
+
+	movl	%r13d,%r12d
+	rorl	$11,%r13d
+	addl	%r14d,%r10d
+	movl	%r15d,%r14d
+	rorl	$2,%r15d
+
+	xorl	%r12d,%r13d
+	shrl	$3,%r12d
+	rorl	$7,%r13d
+	xorl	%r14d,%r15d
+	shrl	$10,%r14d
+
+	rorl	$17,%r15d
+	xorl	%r13d,%r12d
+	xorl	%r14d,%r15d
+	addl	12(%rsp),%r12d
+
+	addl	40(%rsp),%r12d
+	movl	%ecx,%r13d
+	addl	%r15d,%r12d
+	movl	%r10d,%r14d
+	rorl	$14,%r13d
+	movl	%edx,%r15d
+
+	xorl	%ecx,%r13d
+	rorl	$9,%r14d
+	xorl	%r8d,%r15d
+
+	movl	%r12d,40(%rsp)
+	xorl	%r10d,%r14d
+	andl	%ecx,%r15d
+
+	rorl	$5,%r13d
+	addl	%r9d,%r12d
+	xorl	%r8d,%r15d
+
+	rorl	$11,%r14d
+	xorl	%ecx,%r13d
+	addl	%r15d,%r12d
+
+	movl	%r10d,%r15d
+	addl	(%rbp),%r12d
+	xorl	%r10d,%r14d
+
+	xorl	%r11d,%r15d
+	rorl	$6,%r13d
+	movl	%r11d,%r9d
+
+	andl	%r15d,%edi
+	rorl	$2,%r14d
+	addl	%r13d,%r12d
+
+	xorl	%edi,%r9d
+	addl	%r12d,%ebx
+	addl	%r12d,%r9d
+
+	leaq	4(%rbp),%rbp
+	movl	48(%rsp),%r13d
+	movl	36(%rsp),%edi
+
+	movl	%r13d,%r12d
+	rorl	$11,%r13d
+	addl	%r14d,%r9d
+	movl	%edi,%r14d
+	rorl	$2,%edi
+
+	xorl	%r12d,%r13d
+	shrl	$3,%r12d
+	rorl	$7,%r13d
+	xorl	%r14d,%edi
+	shrl	$10,%r14d
+
+	rorl	$17,%edi
+	xorl	%r13d,%r12d
+	xorl	%r14d,%edi
+	addl	16(%rsp),%r12d
+
+	addl	44(%rsp),%r12d
+	movl	%ebx,%r13d
+	addl	%edi,%r12d
+	movl	%r9d,%r14d
+	rorl	$14,%r13d
+	movl	%ecx,%edi
+
+	xorl	%ebx,%r13d
+	rorl	$9,%r14d
+	xorl	%edx,%edi
+
+	movl	%r12d,44(%rsp)
+	xorl	%r9d,%r14d
+	andl	%ebx,%edi
+
+	rorl	$5,%r13d
+	addl	%r8d,%r12d
+	xorl	%edx,%edi
+
+	rorl	$11,%r14d
+	xorl	%ebx,%r13d
+	addl	%edi,%r12d
+
+	movl	%r9d,%edi
+	addl	(%rbp),%r12d
+	xorl	%r9d,%r14d
+
+	xorl	%r10d,%edi
+	rorl	$6,%r13d
+	movl	%r10d,%r8d
+
+	andl	%edi,%r15d
+	rorl	$2,%r14d
+	addl	%r13d,%r12d
+
+	xorl	%r15d,%r8d
+	addl	%r12d,%eax
+	addl	%r12d,%r8d
+
+	leaq	20(%rbp),%rbp
+	movl	52(%rsp),%r13d
+	movl	40(%rsp),%r15d
+
+	movl	%r13d,%r12d
+	rorl	$11,%r13d
+	addl	%r14d,%r8d
+	movl	%r15d,%r14d
+	rorl	$2,%r15d
+
+	xorl	%r12d,%r13d
+	shrl	$3,%r12d
+	rorl	$7,%r13d
+	xorl	%r14d,%r15d
+	shrl	$10,%r14d
+
+	rorl	$17,%r15d
+	xorl	%r13d,%r12d
+	xorl	%r14d,%r15d
+	addl	20(%rsp),%r12d
+
+	addl	48(%rsp),%r12d
+	movl	%eax,%r13d
+	addl	%r15d,%r12d
+	movl	%r8d,%r14d
+	rorl	$14,%r13d
+	movl	%ebx,%r15d
+
+	xorl	%eax,%r13d
+	rorl	$9,%r14d
+	xorl	%ecx,%r15d
+
+	movl	%r12d,48(%rsp)
+	xorl	%r8d,%r14d
+	andl	%eax,%r15d
+
+	rorl	$5,%r13d
+	addl	%edx,%r12d
+	xorl	%ecx,%r15d
+
+	rorl	$11,%r14d
+	xorl	%eax,%r13d
+	addl	%r15d,%r12d
+
+	movl	%r8d,%r15d
+	addl	(%rbp),%r12d
+	xorl	%r8d,%r14d
+
+	xorl	%r9d,%r15d
+	rorl	$6,%r13d
+	movl	%r9d,%edx
+
+	andl	%r15d,%edi
+	rorl	$2,%r14d
+	addl	%r13d,%r12d
+
+	xorl	%edi,%edx
+	addl	%r12d,%r11d
+	addl	%r12d,%edx
+
+	leaq	4(%rbp),%rbp
+	movl	56(%rsp),%r13d
+	movl	44(%rsp),%edi
+
+	movl	%r13d,%r12d
+	rorl	$11,%r13d
+	addl	%r14d,%edx
+	movl	%edi,%r14d
+	rorl	$2,%edi
+
+	xorl	%r12d,%r13d
+	shrl	$3,%r12d
+	rorl	$7,%r13d
+	xorl	%r14d,%edi
+	shrl	$10,%r14d
+
+	rorl	$17,%edi
+	xorl	%r13d,%r12d
+	xorl	%r14d,%edi
+	addl	24(%rsp),%r12d
+
+	addl	52(%rsp),%r12d
+	movl	%r11d,%r13d
+	addl	%edi,%r12d
+	movl	%edx,%r14d
+	rorl	$14,%r13d
+	movl	%eax,%edi
+
+	xorl	%r11d,%r13d
+	rorl	$9,%r14d
+	xorl	%ebx,%edi
+
+	movl	%r12d,52(%rsp)
+	xorl	%edx,%r14d
+	andl	%r11d,%edi
+
+	rorl	$5,%r13d
+	addl	%ecx,%r12d
+	xorl	%ebx,%edi
+
+	rorl	$11,%r14d
+	xorl	%r11d,%r13d
+	addl	%edi,%r12d
+
+	movl	%edx,%edi
+	addl	(%rbp),%r12d
+	xorl	%edx,%r14d
+
+	xorl	%r8d,%edi
+	rorl	$6,%r13d
+	movl	%r8d,%ecx
+
+	andl	%edi,%r15d
+	rorl	$2,%r14d
+	addl	%r13d,%r12d
+
+	xorl	%r15d,%ecx
+	addl	%r12d,%r10d
+	addl	%r12d,%ecx
+
+	leaq	4(%rbp),%rbp
+	movl	60(%rsp),%r13d
+	movl	48(%rsp),%r15d
+
+	movl	%r13d,%r12d
+	rorl	$11,%r13d
+	addl	%r14d,%ecx
+	movl	%r15d,%r14d
+	rorl	$2,%r15d
+
+	xorl	%r12d,%r13d
+	shrl	$3,%r12d
+	rorl	$7,%r13d
+	xorl	%r14d,%r15d
+	shrl	$10,%r14d
+
+	rorl	$17,%r15d
+	xorl	%r13d,%r12d
+	xorl	%r14d,%r15d
+	addl	28(%rsp),%r12d
+
+	addl	56(%rsp),%r12d
+	movl	%r10d,%r13d
+	addl	%r15d,%r12d
+	movl	%ecx,%r14d
+	rorl	$14,%r13d
+	movl	%r11d,%r15d
+
+	xorl	%r10d,%r13d
+	rorl	$9,%r14d
+	xorl	%eax,%r15d
+
+	movl	%r12d,56(%rsp)
+	xorl	%ecx,%r14d
+	andl	%r10d,%r15d
+
+	rorl	$5,%r13d
+	addl	%ebx,%r12d
+	xorl	%eax,%r15d
+
+	rorl	$11,%r14d
+	xorl	%r10d,%r13d
+	addl	%r15d,%r12d
+
+	movl	%ecx,%r15d
+	addl	(%rbp),%r12d
+	xorl	%ecx,%r14d
+
+	xorl	%edx,%r15d
+	rorl	$6,%r13d
+	movl	%edx,%ebx
+
+	andl	%r15d,%edi
+	rorl	$2,%r14d
+	addl	%r13d,%r12d
+
+	xorl	%edi,%ebx
+	addl	%r12d,%r9d
+	addl	%r12d,%ebx
+
+	leaq	4(%rbp),%rbp
+	movl	0(%rsp),%r13d
+	movl	52(%rsp),%edi
+
+	movl	%r13d,%r12d
+	rorl	$11,%r13d
+	addl	%r14d,%ebx
+	movl	%edi,%r14d
+	rorl	$2,%edi
+
+	xorl	%r12d,%r13d
+	shrl	$3,%r12d
+	rorl	$7,%r13d
+	xorl	%r14d,%edi
+	shrl	$10,%r14d
+
+	rorl	$17,%edi
+	xorl	%r13d,%r12d
+	xorl	%r14d,%edi
+	addl	32(%rsp),%r12d
+
+	addl	60(%rsp),%r12d
+	movl	%r9d,%r13d
+	addl	%edi,%r12d
+	movl	%ebx,%r14d
+	rorl	$14,%r13d
+	movl	%r10d,%edi
+
+	xorl	%r9d,%r13d
+	rorl	$9,%r14d
+	xorl	%r11d,%edi
+
+	movl	%r12d,60(%rsp)
+	xorl	%ebx,%r14d
+	andl	%r9d,%edi
+
+	rorl	$5,%r13d
+	addl	%eax,%r12d
+	xorl	%r11d,%edi
+
+	rorl	$11,%r14d
+	xorl	%r9d,%r13d
+	addl	%edi,%r12d
+
+	movl	%ebx,%edi
+	addl	(%rbp),%r12d
+	xorl	%ebx,%r14d
+
+	xorl	%ecx,%edi
+	rorl	$6,%r13d
+	movl	%ecx,%eax
+
+	andl	%edi,%r15d
+	rorl	$2,%r14d
+	addl	%r13d,%r12d
+
+	xorl	%r15d,%eax
+	addl	%r12d,%r8d
+	addl	%r12d,%eax
+
+	leaq	20(%rbp),%rbp
+	cmpb	$0,3(%rbp)
+	jnz	.Lrounds_16_xx
+
+	movq	64+0(%rsp),%rdi
+	addl	%r14d,%eax
+	leaq	64(%rsi),%rsi
+
+	addl	0(%rdi),%eax
+	addl	4(%rdi),%ebx
+	addl	8(%rdi),%ecx
+	addl	12(%rdi),%edx
+	addl	16(%rdi),%r8d
+	addl	20(%rdi),%r9d
+	addl	24(%rdi),%r10d
+	addl	28(%rdi),%r11d
+
+	cmpq	64+16(%rsp),%rsi
+
+	movl	%eax,0(%rdi)
+	movl	%ebx,4(%rdi)
+	movl	%ecx,8(%rdi)
+	movl	%edx,12(%rdi)
+	movl	%r8d,16(%rdi)
+	movl	%r9d,20(%rdi)
+	movl	%r10d,24(%rdi)
+	movl	%r11d,28(%rdi)
+	jb	.Lloop
+
+	movq	88(%rsp),%rsi
+.cfi_def_cfa	%rsi,8
+	movq	-48(%rsi),%r15
+.cfi_restore	%r15
+	movq	-40(%rsi),%r14
+.cfi_restore	%r14
+	movq	-32(%rsi),%r13
+.cfi_restore	%r13
+	movq	-24(%rsi),%r12
+.cfi_restore	%r12
+	movq	-16(%rsi),%rbp
+.cfi_restore	%rbp
+	movq	-8(%rsi),%rbx
+.cfi_restore	%rbx
+	leaq	(%rsi),%rsp
+.cfi_def_cfa_register	%rsp
+.Lepilogue:
+	.byte	0xf3,0xc3
+.cfi_endproc	
+.size	sha256_block_data_order,.-sha256_block_data_order
+.align	64
+.type	K256,@object
+K256:
+.long	0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5
+.long	0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5
+.long	0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5
+.long	0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5
+.long	0xd807aa98,0x12835b01,0x243185be,0x550c7dc3
+.long	0xd807aa98,0x12835b01,0x243185be,0x550c7dc3
+.long	0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174
+.long	0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174
+.long	0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc
+.long	0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc
+.long	0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da
+.long	0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da
+.long	0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7
+.long	0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7
+.long	0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967
+.long	0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967
+.long	0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13
+.long	0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13
+.long	0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85
+.long	0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85
+.long	0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3
+.long	0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3
+.long	0xd192e819,0xd6990624,0xf40e3585,0x106aa070
+.long	0xd192e819,0xd6990624,0xf40e3585,0x106aa070
+.long	0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5
+.long	0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5
+.long	0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3
+.long	0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3
+.long	0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208
+.long	0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208
+.long	0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2
+.long	0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2
+
+.long	0x00010203,0x04050607,0x08090a0b,0x0c0d0e0f
+.long	0x00010203,0x04050607,0x08090a0b,0x0c0d0e0f
+.long	0x03020100,0x0b0a0908,0xffffffff,0xffffffff
+.long	0x03020100,0x0b0a0908,0xffffffff,0xffffffff
+.long	0xffffffff,0xffffffff,0x03020100,0x0b0a0908
+.long	0xffffffff,0xffffffff,0x03020100,0x0b0a0908
+.byte	83,72,65,50,53,54,32,98,108,111,99,107,32,116,114,97,110,115,102,111,114,109,32,102,111,114,32,120,56,54,95,54,52,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0
+.type	sha256_block_data_order_ssse3,@function
+.align	64
+sha256_block_data_order_ssse3:
+.cfi_startproc	
+.Lssse3_shortcut:
+	movq	%rsp,%rax
+.cfi_def_cfa_register	%rax
+	pushq	%rbx
+.cfi_offset	%rbx,-16
+	pushq	%rbp
+.cfi_offset	%rbp,-24
+	pushq	%r12
+.cfi_offset	%r12,-32
+	pushq	%r13
+.cfi_offset	%r13,-40
+	pushq	%r14
+.cfi_offset	%r14,-48
+	pushq	%r15
+.cfi_offset	%r15,-56
+	shlq	$4,%rdx
+	subq	$96,%rsp
+	leaq	(%rsi,%rdx,4),%rdx
+	andq	$-64,%rsp
+	movq	%rdi,64+0(%rsp)
+	movq	%rsi,64+8(%rsp)
+	movq	%rdx,64+16(%rsp)
+	movq	%rax,88(%rsp)
+.cfi_escape	0x0f,0x06,0x77,0xd8,0x00,0x06,0x23,0x08
+.Lprologue_ssse3:
+
+	movl	0(%rdi),%eax
+	movl	4(%rdi),%ebx
+	movl	8(%rdi),%ecx
+	movl	12(%rdi),%edx
+	movl	16(%rdi),%r8d
+	movl	20(%rdi),%r9d
+	movl	24(%rdi),%r10d
+	movl	28(%rdi),%r11d
+
+
+	jmp	.Lloop_ssse3
+.align	16
+.Lloop_ssse3:
+	movdqa	K256+512(%rip),%xmm7
+	movdqu	0(%rsi),%xmm0
+	movdqu	16(%rsi),%xmm1
+	movdqu	32(%rsi),%xmm2
+.byte	102,15,56,0,199
+	movdqu	48(%rsi),%xmm3
+	leaq	K256(%rip),%rbp
+.byte	102,15,56,0,207
+	movdqa	0(%rbp),%xmm4
+	movdqa	32(%rbp),%xmm5
+.byte	102,15,56,0,215
+	paddd	%xmm0,%xmm4
+	movdqa	64(%rbp),%xmm6
+.byte	102,15,56,0,223
+	movdqa	96(%rbp),%xmm7
+	paddd	%xmm1,%xmm5
+	paddd	%xmm2,%xmm6
+	paddd	%xmm3,%xmm7
+	movdqa	%xmm4,0(%rsp)
+	movl	%eax,%r14d
+	movdqa	%xmm5,16(%rsp)
+	movl	%ebx,%edi
+	movdqa	%xmm6,32(%rsp)
+	xorl	%ecx,%edi
+	movdqa	%xmm7,48(%rsp)
+	movl	%r8d,%r13d
+	jmp	.Lssse3_00_47
+
+.align	16
+.Lssse3_00_47:
+	subq	$-128,%rbp
+	rorl	$14,%r13d
+	movdqa	%xmm1,%xmm4
+	movl	%r14d,%eax
+	movl	%r9d,%r12d
+	movdqa	%xmm3,%xmm7
+	rorl	$9,%r14d
+	xorl	%r8d,%r13d
+	xorl	%r10d,%r12d
+	rorl	$5,%r13d
+	xorl	%eax,%r14d
+.byte	102,15,58,15,224,4
+	andl	%r8d,%r12d
+	xorl	%r8d,%r13d
+.byte	102,15,58,15,250,4
+	addl	0(%rsp),%r11d
+	movl	%eax,%r15d
+	xorl	%r10d,%r12d
+	rorl	$11,%r14d
+	movdqa	%xmm4,%xmm5
+	xorl	%ebx,%r15d
+	addl	%r12d,%r11d
+	movdqa	%xmm4,%xmm6
+	rorl	$6,%r13d
+	andl	%r15d,%edi
+	psrld	$3,%xmm4
+	xorl	%eax,%r14d
+	addl	%r13d,%r11d
+	xorl	%ebx,%edi
+	paddd	%xmm7,%xmm0
+	rorl	$2,%r14d
+	addl	%r11d,%edx
+	psrld	$7,%xmm6
+	addl	%edi,%r11d
+	movl	%edx,%r13d
+	pshufd	$250,%xmm3,%xmm7
+	addl	%r11d,%r14d
+	rorl	$14,%r13d
+	pslld	$14,%xmm5
+	movl	%r14d,%r11d
+	movl	%r8d,%r12d
+	pxor	%xmm6,%xmm4
+	rorl	$9,%r14d
+	xorl	%edx,%r13d
+	xorl	%r9d,%r12d
+	rorl	$5,%r13d
+	psrld	$11,%xmm6
+	xorl	%r11d,%r14d
+	pxor	%xmm5,%xmm4
+	andl	%edx,%r12d
+	xorl	%edx,%r13d
+	pslld	$11,%xmm5
+	addl	4(%rsp),%r10d
+	movl	%r11d,%edi
+	pxor	%xmm6,%xmm4
+	xorl	%r9d,%r12d
+	rorl	$11,%r14d
+	movdqa	%xmm7,%xmm6
+	xorl	%eax,%edi
+	addl	%r12d,%r10d
+	pxor	%xmm5,%xmm4
+	rorl	$6,%r13d
+	andl	%edi,%r15d
+	xorl	%r11d,%r14d
+	psrld	$10,%xmm7
+	addl	%r13d,%r10d
+	xorl	%eax,%r15d
+	paddd	%xmm4,%xmm0
+	rorl	$2,%r14d
+	addl	%r10d,%ecx
+	psrlq	$17,%xmm6
+	addl	%r15d,%r10d
+	movl	%ecx,%r13d
+	addl	%r10d,%r14d
+	pxor	%xmm6,%xmm7
+	rorl	$14,%r13d
+	movl	%r14d,%r10d
+	movl	%edx,%r12d
+	rorl	$9,%r14d
+	psrlq	$2,%xmm6
+	xorl	%ecx,%r13d
+	xorl	%r8d,%r12d
+	pxor	%xmm6,%xmm7
+	rorl	$5,%r13d
+	xorl	%r10d,%r14d
+	andl	%ecx,%r12d
+	pshufd	$128,%xmm7,%xmm7
+	xorl	%ecx,%r13d
+	addl	8(%rsp),%r9d
+	movl	%r10d,%r15d
+	psrldq	$8,%xmm7
+	xorl	%r8d,%r12d
+	rorl	$11,%r14d
+	xorl	%r11d,%r15d
+	addl	%r12d,%r9d
+	rorl	$6,%r13d
+	paddd	%xmm7,%xmm0
+	andl	%r15d,%edi
+	xorl	%r10d,%r14d
+	addl	%r13d,%r9d
+	pshufd	$80,%xmm0,%xmm7
+	xorl	%r11d,%edi
+	rorl	$2,%r14d
+	addl	%r9d,%ebx
+	movdqa	%xmm7,%xmm6
+	addl	%edi,%r9d
+	movl	%ebx,%r13d
+	psrld	$10,%xmm7
+	addl	%r9d,%r14d
+	rorl	$14,%r13d
+	psrlq	$17,%xmm6
+	movl	%r14d,%r9d
+	movl	%ecx,%r12d
+	pxor	%xmm6,%xmm7
+	rorl	$9,%r14d
+	xorl	%ebx,%r13d
+	xorl	%edx,%r12d
+	rorl	$5,%r13d
+	xorl	%r9d,%r14d
+	psrlq	$2,%xmm6
+	andl	%ebx,%r12d
+	xorl	%ebx,%r13d
+	addl	12(%rsp),%r8d
+	pxor	%xmm6,%xmm7
+	movl	%r9d,%edi
+	xorl	%edx,%r12d
+	rorl	$11,%r14d
+	pshufd	$8,%xmm7,%xmm7
+	xorl	%r10d,%edi
+	addl	%r12d,%r8d
+	movdqa	0(%rbp),%xmm6
+	rorl	$6,%r13d
+	andl	%edi,%r15d
+	pslldq	$8,%xmm7
+	xorl	%r9d,%r14d
+	addl	%r13d,%r8d
+	xorl	%r10d,%r15d
+	paddd	%xmm7,%xmm0
+	rorl	$2,%r14d
+	addl	%r8d,%eax
+	addl	%r15d,%r8d
+	paddd	%xmm0,%xmm6
+	movl	%eax,%r13d
+	addl	%r8d,%r14d
+	movdqa	%xmm6,0(%rsp)
+	rorl	$14,%r13d
+	movdqa	%xmm2,%xmm4
+	movl	%r14d,%r8d
+	movl	%ebx,%r12d
+	movdqa	%xmm0,%xmm7
+	rorl	$9,%r14d
+	xorl	%eax,%r13d
+	xorl	%ecx,%r12d
+	rorl	$5,%r13d
+	xorl	%r8d,%r14d
+.byte	102,15,58,15,225,4
+	andl	%eax,%r12d
+	xorl	%eax,%r13d
+.byte	102,15,58,15,251,4
+	addl	16(%rsp),%edx
+	movl	%r8d,%r15d
+	xorl	%ecx,%r12d
+	rorl	$11,%r14d
+	movdqa	%xmm4,%xmm5
+	xorl	%r9d,%r15d
+	addl	%r12d,%edx
+	movdqa	%xmm4,%xmm6
+	rorl	$6,%r13d
+	andl	%r15d,%edi
+	psrld	$3,%xmm4
+	xorl	%r8d,%r14d
+	addl	%r13d,%edx
+	xorl	%r9d,%edi
+	paddd	%xmm7,%xmm1
+	rorl	$2,%r14d
+	addl	%edx,%r11d
+	psrld	$7,%xmm6
+	addl	%edi,%edx
+	movl	%r11d,%r13d
+	pshufd	$250,%xmm0,%xmm7
+	addl	%edx,%r14d
+	rorl	$14,%r13d
+	pslld	$14,%xmm5
+	movl	%r14d,%edx
+	movl	%eax,%r12d
+	pxor	%xmm6,%xmm4
+	rorl	$9,%r14d
+	xorl	%r11d,%r13d
+	xorl	%ebx,%r12d
+	rorl	$5,%r13d
+	psrld	$11,%xmm6
+	xorl	%edx,%r14d
+	pxor	%xmm5,%xmm4
+	andl	%r11d,%r12d
+	xorl	%r11d,%r13d
+	pslld	$11,%xmm5
+	addl	20(%rsp),%ecx
+	movl	%edx,%edi
+	pxor	%xmm6,%xmm4
+	xorl	%ebx,%r12d
+	rorl	$11,%r14d
+	movdqa	%xmm7,%xmm6
+	xorl	%r8d,%edi
+	addl	%r12d,%ecx
+	pxor	%xmm5,%xmm4
+	rorl	$6,%r13d
+	andl	%edi,%r15d
+	xorl	%edx,%r14d
+	psrld	$10,%xmm7
+	addl	%r13d,%ecx
+	xorl	%r8d,%r15d
+	paddd	%xmm4,%xmm1
+	rorl	$2,%r14d
+	addl	%ecx,%r10d
+	psrlq	$17,%xmm6
+	addl	%r15d,%ecx
+	movl	%r10d,%r13d
+	addl	%ecx,%r14d
+	pxor	%xmm6,%xmm7
+	rorl	$14,%r13d
+	movl	%r14d,%ecx
+	movl	%r11d,%r12d
+	rorl	$9,%r14d
+	psrlq	$2,%xmm6
+	xorl	%r10d,%r13d
+	xorl	%eax,%r12d
+	pxor	%xmm6,%xmm7
+	rorl	$5,%r13d
+	xorl	%ecx,%r14d
+	andl	%r10d,%r12d
+	pshufd	$128,%xmm7,%xmm7
+	xorl	%r10d,%r13d
+	addl	24(%rsp),%ebx
+	movl	%ecx,%r15d
+	psrldq	$8,%xmm7
+	xorl	%eax,%r12d
+	rorl	$11,%r14d
+	xorl	%edx,%r15d
+	addl	%r12d,%ebx
+	rorl	$6,%r13d
+	paddd	%xmm7,%xmm1
+	andl	%r15d,%edi
+	xorl	%ecx,%r14d
+	addl	%r13d,%ebx
+	pshufd	$80,%xmm1,%xmm7
+	xorl	%edx,%edi
+	rorl	$2,%r14d
+	addl	%ebx,%r9d
+	movdqa	%xmm7,%xmm6
+	addl	%edi,%ebx
+	movl	%r9d,%r13d
+	psrld	$10,%xmm7
+	addl	%ebx,%r14d
+	rorl	$14,%r13d
+	psrlq	$17,%xmm6
+	movl	%r14d,%ebx
+	movl	%r10d,%r12d
+	pxor	%xmm6,%xmm7
+	rorl	$9,%r14d
+	xorl	%r9d,%r13d
+	xorl	%r11d,%r12d
+	rorl	$5,%r13d
+	xorl	%ebx,%r14d
+	psrlq	$2,%xmm6
+	andl	%r9d,%r12d
+	xorl	%r9d,%r13d
+	addl	28(%rsp),%eax
+	pxor	%xmm6,%xmm7
+	movl	%ebx,%edi
+	xorl	%r11d,%r12d
+	rorl	$11,%r14d
+	pshufd	$8,%xmm7,%xmm7
+	xorl	%ecx,%edi
+	addl	%r12d,%eax
+	movdqa	32(%rbp),%xmm6
+	rorl	$6,%r13d
+	andl	%edi,%r15d
+	pslldq	$8,%xmm7
+	xorl	%ebx,%r14d
+	addl	%r13d,%eax
+	xorl	%ecx,%r15d
+	paddd	%xmm7,%xmm1
+	rorl	$2,%r14d
+	addl	%eax,%r8d
+	addl	%r15d,%eax
+	paddd	%xmm1,%xmm6
+	movl	%r8d,%r13d
+	addl	%eax,%r14d
+	movdqa	%xmm6,16(%rsp)
+	rorl	$14,%r13d
+	movdqa	%xmm3,%xmm4
+	movl	%r14d,%eax
+	movl	%r9d,%r12d
+	movdqa	%xmm1,%xmm7
+	rorl	$9,%r14d
+	xorl	%r8d,%r13d
+	xorl	%r10d,%r12d
+	rorl	$5,%r13d
+	xorl	%eax,%r14d
+.byte	102,15,58,15,226,4
+	andl	%r8d,%r12d
+	xorl	%r8d,%r13d
+.byte	102,15,58,15,248,4
+	addl	32(%rsp),%r11d
+	movl	%eax,%r15d
+	xorl	%r10d,%r12d
+	rorl	$11,%r14d
+	movdqa	%xmm4,%xmm5
+	xorl	%ebx,%r15d
+	addl	%r12d,%r11d
+	movdqa	%xmm4,%xmm6
+	rorl	$6,%r13d
+	andl	%r15d,%edi
+	psrld	$3,%xmm4
+	xorl	%eax,%r14d
+	addl	%r13d,%r11d
+	xorl	%ebx,%edi
+	paddd	%xmm7,%xmm2
+	rorl	$2,%r14d
+	addl	%r11d,%edx
+	psrld	$7,%xmm6
+	addl	%edi,%r11d
+	movl	%edx,%r13d
+	pshufd	$250,%xmm1,%xmm7
+	addl	%r11d,%r14d
+	rorl	$14,%r13d
+	pslld	$14,%xmm5
+	movl	%r14d,%r11d
+	movl	%r8d,%r12d
+	pxor	%xmm6,%xmm4
+	rorl	$9,%r14d
+	xorl	%edx,%r13d
+	xorl	%r9d,%r12d
+	rorl	$5,%r13d
+	psrld	$11,%xmm6
+	xorl	%r11d,%r14d
+	pxor	%xmm5,%xmm4
+	andl	%edx,%r12d
+	xorl	%edx,%r13d
+	pslld	$11,%xmm5
+	addl	36(%rsp),%r10d
+	movl	%r11d,%edi
+	pxor	%xmm6,%xmm4
+	xorl	%r9d,%r12d
+	rorl	$11,%r14d
+	movdqa	%xmm7,%xmm6
+	xorl	%eax,%edi
+	addl	%r12d,%r10d
+	pxor	%xmm5,%xmm4
+	rorl	$6,%r13d
+	andl	%edi,%r15d
+	xorl	%r11d,%r14d
+	psrld	$10,%xmm7
+	addl	%r13d,%r10d
+	xorl	%eax,%r15d
+	paddd	%xmm4,%xmm2
+	rorl	$2,%r14d
+	addl	%r10d,%ecx
+	psrlq	$17,%xmm6
+	addl	%r15d,%r10d
+	movl	%ecx,%r13d
+	addl	%r10d,%r14d
+	pxor	%xmm6,%xmm7
+	rorl	$14,%r13d
+	movl	%r14d,%r10d
+	movl	%edx,%r12d
+	rorl	$9,%r14d
+	psrlq	$2,%xmm6
+	xorl	%ecx,%r13d
+	xorl	%r8d,%r12d
+	pxor	%xmm6,%xmm7
+	rorl	$5,%r13d
+	xorl	%r10d,%r14d
+	andl	%ecx,%r12d
+	pshufd	$128,%xmm7,%xmm7
+	xorl	%ecx,%r13d
+	addl	40(%rsp),%r9d
+	movl	%r10d,%r15d
+	psrldq	$8,%xmm7
+	xorl	%r8d,%r12d
+	rorl	$11,%r14d
+	xorl	%r11d,%r15d
+	addl	%r12d,%r9d
+	rorl	$6,%r13d
+	paddd	%xmm7,%xmm2
+	andl	%r15d,%edi
+	xorl	%r10d,%r14d
+	addl	%r13d,%r9d
+	pshufd	$80,%xmm2,%xmm7
+	xorl	%r11d,%edi
+	rorl	$2,%r14d
+	addl	%r9d,%ebx
+	movdqa	%xmm7,%xmm6
+	addl	%edi,%r9d
+	movl	%ebx,%r13d
+	psrld	$10,%xmm7
+	addl	%r9d,%r14d
+	rorl	$14,%r13d
+	psrlq	$17,%xmm6
+	movl	%r14d,%r9d
+	movl	%ecx,%r12d
+	pxor	%xmm6,%xmm7
+	rorl	$9,%r14d
+	xorl	%ebx,%r13d
+	xorl	%edx,%r12d
+	rorl	$5,%r13d
+	xorl	%r9d,%r14d
+	psrlq	$2,%xmm6
+	andl	%ebx,%r12d
+	xorl	%ebx,%r13d
+	addl	44(%rsp),%r8d
+	pxor	%xmm6,%xmm7
+	movl	%r9d,%edi
+	xorl	%edx,%r12d
+	rorl	$11,%r14d
+	pshufd	$8,%xmm7,%xmm7
+	xorl	%r10d,%edi
+	addl	%r12d,%r8d
+	movdqa	64(%rbp),%xmm6
+	rorl	$6,%r13d
+	andl	%edi,%r15d
+	pslldq	$8,%xmm7
+	xorl	%r9d,%r14d
+	addl	%r13d,%r8d
+	xorl	%r10d,%r15d
+	paddd	%xmm7,%xmm2
+	rorl	$2,%r14d
+	addl	%r8d,%eax
+	addl	%r15d,%r8d
+	paddd	%xmm2,%xmm6
+	movl	%eax,%r13d
+	addl	%r8d,%r14d
+	movdqa	%xmm6,32(%rsp)
+	rorl	$14,%r13d
+	movdqa	%xmm0,%xmm4
+	movl	%r14d,%r8d
+	movl	%ebx,%r12d
+	movdqa	%xmm2,%xmm7
+	rorl	$9,%r14d
+	xorl	%eax,%r13d
+	xorl	%ecx,%r12d
+	rorl	$5,%r13d
+	xorl	%r8d,%r14d
+.byte	102,15,58,15,227,4
+	andl	%eax,%r12d
+	xorl	%eax,%r13d
+.byte	102,15,58,15,249,4
+	addl	48(%rsp),%edx
+	movl	%r8d,%r15d
+	xorl	%ecx,%r12d
+	rorl	$11,%r14d
+	movdqa	%xmm4,%xmm5
+	xorl	%r9d,%r15d
+	addl	%r12d,%edx
+	movdqa	%xmm4,%xmm6
+	rorl	$6,%r13d
+	andl	%r15d,%edi
+	psrld	$3,%xmm4
+	xorl	%r8d,%r14d
+	addl	%r13d,%edx
+	xorl	%r9d,%edi
+	paddd	%xmm7,%xmm3
+	rorl	$2,%r14d
+	addl	%edx,%r11d
+	psrld	$7,%xmm6
+	addl	%edi,%edx
+	movl	%r11d,%r13d
+	pshufd	$250,%xmm2,%xmm7
+	addl	%edx,%r14d
+	rorl	$14,%r13d
+	pslld	$14,%xmm5
+	movl	%r14d,%edx
+	movl	%eax,%r12d
+	pxor	%xmm6,%xmm4
+	rorl	$9,%r14d
+	xorl	%r11d,%r13d
+	xorl	%ebx,%r12d
+	rorl	$5,%r13d
+	psrld	$11,%xmm6
+	xorl	%edx,%r14d
+	pxor	%xmm5,%xmm4
+	andl	%r11d,%r12d
+	xorl	%r11d,%r13d
+	pslld	$11,%xmm5
+	addl	52(%rsp),%ecx
+	movl	%edx,%edi
+	pxor	%xmm6,%xmm4
+	xorl	%ebx,%r12d
+	rorl	$11,%r14d
+	movdqa	%xmm7,%xmm6
+	xorl	%r8d,%edi
+	addl	%r12d,%ecx
+	pxor	%xmm5,%xmm4
+	rorl	$6,%r13d
+	andl	%edi,%r15d
+	xorl	%edx,%r14d
+	psrld	$10,%xmm7
+	addl	%r13d,%ecx
+	xorl	%r8d,%r15d
+	paddd	%xmm4,%xmm3
+	rorl	$2,%r14d
+	addl	%ecx,%r10d
+	psrlq	$17,%xmm6
+	addl	%r15d,%ecx
+	movl	%r10d,%r13d
+	addl	%ecx,%r14d
+	pxor	%xmm6,%xmm7
+	rorl	$14,%r13d
+	movl	%r14d,%ecx
+	movl	%r11d,%r12d
+	rorl	$9,%r14d
+	psrlq	$2,%xmm6
+	xorl	%r10d,%r13d
+	xorl	%eax,%r12d
+	pxor	%xmm6,%xmm7
+	rorl	$5,%r13d
+	xorl	%ecx,%r14d
+	andl	%r10d,%r12d
+	pshufd	$128,%xmm7,%xmm7
+	xorl	%r10d,%r13d
+	addl	56(%rsp),%ebx
+	movl	%ecx,%r15d
+	psrldq	$8,%xmm7
+	xorl	%eax,%r12d
+	rorl	$11,%r14d
+	xorl	%edx,%r15d
+	addl	%r12d,%ebx
+	rorl	$6,%r13d
+	paddd	%xmm7,%xmm3
+	andl	%r15d,%edi
+	xorl	%ecx,%r14d
+	addl	%r13d,%ebx
+	pshufd	$80,%xmm3,%xmm7
+	xorl	%edx,%edi
+	rorl	$2,%r14d
+	addl	%ebx,%r9d
+	movdqa	%xmm7,%xmm6
+	addl	%edi,%ebx
+	movl	%r9d,%r13d
+	psrld	$10,%xmm7
+	addl	%ebx,%r14d
+	rorl	$14,%r13d
+	psrlq	$17,%xmm6
+	movl	%r14d,%ebx
+	movl	%r10d,%r12d
+	pxor	%xmm6,%xmm7
+	rorl	$9,%r14d
+	xorl	%r9d,%r13d
+	xorl	%r11d,%r12d
+	rorl	$5,%r13d
+	xorl	%ebx,%r14d
+	psrlq	$2,%xmm6
+	andl	%r9d,%r12d
+	xorl	%r9d,%r13d
+	addl	60(%rsp),%eax
+	pxor	%xmm6,%xmm7
+	movl	%ebx,%edi
+	xorl	%r11d,%r12d
+	rorl	$11,%r14d
+	pshufd	$8,%xmm7,%xmm7
+	xorl	%ecx,%edi
+	addl	%r12d,%eax
+	movdqa	96(%rbp),%xmm6
+	rorl	$6,%r13d
+	andl	%edi,%r15d
+	pslldq	$8,%xmm7
+	xorl	%ebx,%r14d
+	addl	%r13d,%eax
+	xorl	%ecx,%r15d
+	paddd	%xmm7,%xmm3
+	rorl	$2,%r14d
+	addl	%eax,%r8d
+	addl	%r15d,%eax
+	paddd	%xmm3,%xmm6
+	movl	%r8d,%r13d
+	addl	%eax,%r14d
+	movdqa	%xmm6,48(%rsp)
+	cmpb	$0,131(%rbp)
+	jne	.Lssse3_00_47
+	rorl	$14,%r13d
+	movl	%r14d,%eax
+	movl	%r9d,%r12d
+	rorl	$9,%r14d
+	xorl	%r8d,%r13d
+	xorl	%r10d,%r12d
+	rorl	$5,%r13d
+	xorl	%eax,%r14d
+	andl	%r8d,%r12d
+	xorl	%r8d,%r13d
+	addl	0(%rsp),%r11d
+	movl	%eax,%r15d
+	xorl	%r10d,%r12d
+	rorl	$11,%r14d
+	xorl	%ebx,%r15d
+	addl	%r12d,%r11d
+	rorl	$6,%r13d
+	andl	%r15d,%edi
+	xorl	%eax,%r14d
+	addl	%r13d,%r11d
+	xorl	%ebx,%edi
+	rorl	$2,%r14d
+	addl	%r11d,%edx
+	addl	%edi,%r11d
+	movl	%edx,%r13d
+	addl	%r11d,%r14d
+	rorl	$14,%r13d
+	movl	%r14d,%r11d
+	movl	%r8d,%r12d
+	rorl	$9,%r14d
+	xorl	%edx,%r13d
+	xorl	%r9d,%r12d
+	rorl	$5,%r13d
+	xorl	%r11d,%r14d
+	andl	%edx,%r12d
+	xorl	%edx,%r13d
+	addl	4(%rsp),%r10d
+	movl	%r11d,%edi
+	xorl	%r9d,%r12d
+	rorl	$11,%r14d
+	xorl	%eax,%edi
+	addl	%r12d,%r10d
+	rorl	$6,%r13d
+	andl	%edi,%r15d
+	xorl	%r11d,%r14d
+	addl	%r13d,%r10d
+	xorl	%eax,%r15d
+	rorl	$2,%r14d
+	addl	%r10d,%ecx
+	addl	%r15d,%r10d
+	movl	%ecx,%r13d
+	addl	%r10d,%r14d
+	rorl	$14,%r13d
+	movl	%r14d,%r10d
+	movl	%edx,%r12d
+	rorl	$9,%r14d
+	xorl	%ecx,%r13d
+	xorl	%r8d,%r12d
+	rorl	$5,%r13d
+	xorl	%r10d,%r14d
+	andl	%ecx,%r12d
+	xorl	%ecx,%r13d
+	addl	8(%rsp),%r9d
+	movl	%r10d,%r15d
+	xorl	%r8d,%r12d
+	rorl	$11,%r14d
+	xorl	%r11d,%r15d
+	addl	%r12d,%r9d
+	rorl	$6,%r13d
+	andl	%r15d,%edi
+	xorl	%r10d,%r14d
+	addl	%r13d,%r9d
+	xorl	%r11d,%edi
+	rorl	$2,%r14d
+	addl	%r9d,%ebx
+	addl	%edi,%r9d
+	movl	%ebx,%r13d
+	addl	%r9d,%r14d
+	rorl	$14,%r13d
+	movl	%r14d,%r9d
+	movl	%ecx,%r12d
+	rorl	$9,%r14d
+	xorl	%ebx,%r13d
+	xorl	%edx,%r12d
+	rorl	$5,%r13d
+	xorl	%r9d,%r14d
+	andl	%ebx,%r12d
+	xorl	%ebx,%r13d
+	addl	12(%rsp),%r8d
+	movl	%r9d,%edi
+	xorl	%edx,%r12d
+	rorl	$11,%r14d
+	xorl	%r10d,%edi
+	addl	%r12d,%r8d
+	rorl	$6,%r13d
+	andl	%edi,%r15d
+	xorl	%r9d,%r14d
+	addl	%r13d,%r8d
+	xorl	%r10d,%r15d
+	rorl	$2,%r14d
+	addl	%r8d,%eax
+	addl	%r15d,%r8d
+	movl	%eax,%r13d
+	addl	%r8d,%r14d
+	rorl	$14,%r13d
+	movl	%r14d,%r8d
+	movl	%ebx,%r12d
+	rorl	$9,%r14d
+	xorl	%eax,%r13d
+	xorl	%ecx,%r12d
+	rorl	$5,%r13d
+	xorl	%r8d,%r14d
+	andl	%eax,%r12d
+	xorl	%eax,%r13d
+	addl	16(%rsp),%edx
+	movl	%r8d,%r15d
+	xorl	%ecx,%r12d
+	rorl	$11,%r14d
+	xorl	%r9d,%r15d
+	addl	%r12d,%edx
+	rorl	$6,%r13d
+	andl	%r15d,%edi
+	xorl	%r8d,%r14d
+	addl	%r13d,%edx
+	xorl	%r9d,%edi
+	rorl	$2,%r14d
+	addl	%edx,%r11d
+	addl	%edi,%edx
+	movl	%r11d,%r13d
+	addl	%edx,%r14d
+	rorl	$14,%r13d
+	movl	%r14d,%edx
+	movl	%eax,%r12d
+	rorl	$9,%r14d
+	xorl	%r11d,%r13d
+	xorl	%ebx,%r12d
+	rorl	$5,%r13d
+	xorl	%edx,%r14d
+	andl	%r11d,%r12d
+	xorl	%r11d,%r13d
+	addl	20(%rsp),%ecx
+	movl	%edx,%edi
+	xorl	%ebx,%r12d
+	rorl	$11,%r14d
+	xorl	%r8d,%edi
+	addl	%r12d,%ecx
+	rorl	$6,%r13d
+	andl	%edi,%r15d
+	xorl	%edx,%r14d
+	addl	%r13d,%ecx
+	xorl	%r8d,%r15d
+	rorl	$2,%r14d
+	addl	%ecx,%r10d
+	addl	%r15d,%ecx
+	movl	%r10d,%r13d
+	addl	%ecx,%r14d
+	rorl	$14,%r13d
+	movl	%r14d,%ecx
+	movl	%r11d,%r12d
+	rorl	$9,%r14d
+	xorl	%r10d,%r13d
+	xorl	%eax,%r12d
+	rorl	$5,%r13d
+	xorl	%ecx,%r14d
+	andl	%r10d,%r12d
+	xorl	%r10d,%r13d
+	addl	24(%rsp),%ebx
+	movl	%ecx,%r15d
+	xorl	%eax,%r12d
+	rorl	$11,%r14d
+	xorl	%edx,%r15d
+	addl	%r12d,%ebx
+	rorl	$6,%r13d
+	andl	%r15d,%edi
+	xorl	%ecx,%r14d
+	addl	%r13d,%ebx
+	xorl	%edx,%edi
+	rorl	$2,%r14d
+	addl	%ebx,%r9d
+	addl	%edi,%ebx
+	movl	%r9d,%r13d
+	addl	%ebx,%r14d
+	rorl	$14,%r13d
+	movl	%r14d,%ebx
+	movl	%r10d,%r12d
+	rorl	$9,%r14d
+	xorl	%r9d,%r13d
+	xorl	%r11d,%r12d
+	rorl	$5,%r13d
+	xorl	%ebx,%r14d
+	andl	%r9d,%r12d
+	xorl	%r9d,%r13d
+	addl	28(%rsp),%eax
+	movl	%ebx,%edi
+	xorl	%r11d,%r12d
+	rorl	$11,%r14d
+	xorl	%ecx,%edi
+	addl	%r12d,%eax
+	rorl	$6,%r13d
+	andl	%edi,%r15d
+	xorl	%ebx,%r14d
+	addl	%r13d,%eax
+	xorl	%ecx,%r15d
+	rorl	$2,%r14d
+	addl	%eax,%r8d
+	addl	%r15d,%eax
+	movl	%r8d,%r13d
+	addl	%eax,%r14d
+	rorl	$14,%r13d
+	movl	%r14d,%eax
+	movl	%r9d,%r12d
+	rorl	$9,%r14d
+	xorl	%r8d,%r13d
+	xorl	%r10d,%r12d
+	rorl	$5,%r13d
+	xorl	%eax,%r14d
+	andl	%r8d,%r12d
+	xorl	%r8d,%r13d
+	addl	32(%rsp),%r11d
+	movl	%eax,%r15d
+	xorl	%r10d,%r12d
+	rorl	$11,%r14d
+	xorl	%ebx,%r15d
+	addl	%r12d,%r11d
+	rorl	$6,%r13d
+	andl	%r15d,%edi
+	xorl	%eax,%r14d
+	addl	%r13d,%r11d
+	xorl	%ebx,%edi
+	rorl	$2,%r14d
+	addl	%r11d,%edx
+	addl	%edi,%r11d
+	movl	%edx,%r13d
+	addl	%r11d,%r14d
+	rorl	$14,%r13d
+	movl	%r14d,%r11d
+	movl	%r8d,%r12d
+	rorl	$9,%r14d
+	xorl	%edx,%r13d
+	xorl	%r9d,%r12d
+	rorl	$5,%r13d
+	xorl	%r11d,%r14d
+	andl	%edx,%r12d
+	xorl	%edx,%r13d
+	addl	36(%rsp),%r10d
+	movl	%r11d,%edi
+	xorl	%r9d,%r12d
+	rorl	$11,%r14d
+	xorl	%eax,%edi
+	addl	%r12d,%r10d
+	rorl	$6,%r13d
+	andl	%edi,%r15d
+	xorl	%r11d,%r14d
+	addl	%r13d,%r10d
+	xorl	%eax,%r15d
+	rorl	$2,%r14d
+	addl	%r10d,%ecx
+	addl	%r15d,%r10d
+	movl	%ecx,%r13d
+	addl	%r10d,%r14d
+	rorl	$14,%r13d
+	movl	%r14d,%r10d
+	movl	%edx,%r12d
+	rorl	$9,%r14d
+	xorl	%ecx,%r13d
+	xorl	%r8d,%r12d
+	rorl	$5,%r13d
+	xorl	%r10d,%r14d
+	andl	%ecx,%r12d
+	xorl	%ecx,%r13d
+	addl	40(%rsp),%r9d
+	movl	%r10d,%r15d
+	xorl	%r8d,%r12d
+	rorl	$11,%r14d
+	xorl	%r11d,%r15d
+	addl	%r12d,%r9d
+	rorl	$6,%r13d
+	andl	%r15d,%edi
+	xorl	%r10d,%r14d
+	addl	%r13d,%r9d
+	xorl	%r11d,%edi
+	rorl	$2,%r14d
+	addl	%r9d,%ebx
+	addl	%edi,%r9d
+	movl	%ebx,%r13d
+	addl	%r9d,%r14d
+	rorl	$14,%r13d
+	movl	%r14d,%r9d
+	movl	%ecx,%r12d
+	rorl	$9,%r14d
+	xorl	%ebx,%r13d
+	xorl	%edx,%r12d
+	rorl	$5,%r13d
+	xorl	%r9d,%r14d
+	andl	%ebx,%r12d
+	xorl	%ebx,%r13d
+	addl	44(%rsp),%r8d
+	movl	%r9d,%edi
+	xorl	%edx,%r12d
+	rorl	$11,%r14d
+	xorl	%r10d,%edi
+	addl	%r12d,%r8d
+	rorl	$6,%r13d
+	andl	%edi,%r15d
+	xorl	%r9d,%r14d
+	addl	%r13d,%r8d
+	xorl	%r10d,%r15d
+	rorl	$2,%r14d
+	addl	%r8d,%eax
+	addl	%r15d,%r8d
+	movl	%eax,%r13d
+	addl	%r8d,%r14d
+	rorl	$14,%r13d
+	movl	%r14d,%r8d
+	movl	%ebx,%r12d
+	rorl	$9,%r14d
+	xorl	%eax,%r13d
+	xorl	%ecx,%r12d
+	rorl	$5,%r13d
+	xorl	%r8d,%r14d
+	andl	%eax,%r12d
+	xorl	%eax,%r13d
+	addl	48(%rsp),%edx
+	movl	%r8d,%r15d
+	xorl	%ecx,%r12d
+	rorl	$11,%r14d
+	xorl	%r9d,%r15d
+	addl	%r12d,%edx
+	rorl	$6,%r13d
+	andl	%r15d,%edi
+	xorl	%r8d,%r14d
+	addl	%r13d,%edx
+	xorl	%r9d,%edi
+	rorl	$2,%r14d
+	addl	%edx,%r11d
+	addl	%edi,%edx
+	movl	%r11d,%r13d
+	addl	%edx,%r14d
+	rorl	$14,%r13d
+	movl	%r14d,%edx
+	movl	%eax,%r12d
+	rorl	$9,%r14d
+	xorl	%r11d,%r13d
+	xorl	%ebx,%r12d
+	rorl	$5,%r13d
+	xorl	%edx,%r14d
+	andl	%r11d,%r12d
+	xorl	%r11d,%r13d
+	addl	52(%rsp),%ecx
+	movl	%edx,%edi
+	xorl	%ebx,%r12d
+	rorl	$11,%r14d
+	xorl	%r8d,%edi
+	addl	%r12d,%ecx
+	rorl	$6,%r13d
+	andl	%edi,%r15d
+	xorl	%edx,%r14d
+	addl	%r13d,%ecx
+	xorl	%r8d,%r15d
+	rorl	$2,%r14d
+	addl	%ecx,%r10d
+	addl	%r15d,%ecx
+	movl	%r10d,%r13d
+	addl	%ecx,%r14d
+	rorl	$14,%r13d
+	movl	%r14d,%ecx
+	movl	%r11d,%r12d
+	rorl	$9,%r14d
+	xorl	%r10d,%r13d
+	xorl	%eax,%r12d
+	rorl	$5,%r13d
+	xorl	%ecx,%r14d
+	andl	%r10d,%r12d
+	xorl	%r10d,%r13d
+	addl	56(%rsp),%ebx
+	movl	%ecx,%r15d
+	xorl	%eax,%r12d
+	rorl	$11,%r14d
+	xorl	%edx,%r15d
+	addl	%r12d,%ebx
+	rorl	$6,%r13d
+	andl	%r15d,%edi
+	xorl	%ecx,%r14d
+	addl	%r13d,%ebx
+	xorl	%edx,%edi
+	rorl	$2,%r14d
+	addl	%ebx,%r9d
+	addl	%edi,%ebx
+	movl	%r9d,%r13d
+	addl	%ebx,%r14d
+	rorl	$14,%r13d
+	movl	%r14d,%ebx
+	movl	%r10d,%r12d
+	rorl	$9,%r14d
+	xorl	%r9d,%r13d
+	xorl	%r11d,%r12d
+	rorl	$5,%r13d
+	xorl	%ebx,%r14d
+	andl	%r9d,%r12d
+	xorl	%r9d,%r13d
+	addl	60(%rsp),%eax
+	movl	%ebx,%edi
+	xorl	%r11d,%r12d
+	rorl	$11,%r14d
+	xorl	%ecx,%edi
+	addl	%r12d,%eax
+	rorl	$6,%r13d
+	andl	%edi,%r15d
+	xorl	%ebx,%r14d
+	addl	%r13d,%eax
+	xorl	%ecx,%r15d
+	rorl	$2,%r14d
+	addl	%eax,%r8d
+	addl	%r15d,%eax
+	movl	%r8d,%r13d
+	addl	%eax,%r14d
+	movq	64+0(%rsp),%rdi
+	movl	%r14d,%eax
+
+	addl	0(%rdi),%eax
+	leaq	64(%rsi),%rsi
+	addl	4(%rdi),%ebx
+	addl	8(%rdi),%ecx
+	addl	12(%rdi),%edx
+	addl	16(%rdi),%r8d
+	addl	20(%rdi),%r9d
+	addl	24(%rdi),%r10d
+	addl	28(%rdi),%r11d
+
+	cmpq	64+16(%rsp),%rsi
+
+	movl	%eax,0(%rdi)
+	movl	%ebx,4(%rdi)
+	movl	%ecx,8(%rdi)
+	movl	%edx,12(%rdi)
+	movl	%r8d,16(%rdi)
+	movl	%r9d,20(%rdi)
+	movl	%r10d,24(%rdi)
+	movl	%r11d,28(%rdi)
+	jb	.Lloop_ssse3
+
+	movq	88(%rsp),%rsi
+.cfi_def_cfa	%rsi,8
+	movq	-48(%rsi),%r15
+.cfi_restore	%r15
+	movq	-40(%rsi),%r14
+.cfi_restore	%r14
+	movq	-32(%rsi),%r13
+.cfi_restore	%r13
+	movq	-24(%rsi),%r12
+.cfi_restore	%r12
+	movq	-16(%rsi),%rbp
+.cfi_restore	%rbp
+	movq	-8(%rsi),%rbx
+.cfi_restore	%rbx
+	leaq	(%rsi),%rsp
+.cfi_def_cfa_register	%rsp
+.Lepilogue_ssse3:
+	.byte	0xf3,0xc3
+.cfi_endproc	
+.size	sha256_block_data_order_ssse3,.-sha256_block_data_order_ssse3
+.type	sha256_block_data_order_avx,@function
+.align	64
+sha256_block_data_order_avx:
+.cfi_startproc	
+.Lavx_shortcut:
+	movq	%rsp,%rax
+.cfi_def_cfa_register	%rax
+	pushq	%rbx
+.cfi_offset	%rbx,-16
+	pushq	%rbp
+.cfi_offset	%rbp,-24
+	pushq	%r12
+.cfi_offset	%r12,-32
+	pushq	%r13
+.cfi_offset	%r13,-40
+	pushq	%r14
+.cfi_offset	%r14,-48
+	pushq	%r15
+.cfi_offset	%r15,-56
+	shlq	$4,%rdx
+	subq	$96,%rsp
+	leaq	(%rsi,%rdx,4),%rdx
+	andq	$-64,%rsp
+	movq	%rdi,64+0(%rsp)
+	movq	%rsi,64+8(%rsp)
+	movq	%rdx,64+16(%rsp)
+	movq	%rax,88(%rsp)
+.cfi_escape	0x0f,0x06,0x77,0xd8,0x00,0x06,0x23,0x08
+.Lprologue_avx:
+
+	vzeroupper
+	movl	0(%rdi),%eax
+	movl	4(%rdi),%ebx
+	movl	8(%rdi),%ecx
+	movl	12(%rdi),%edx
+	movl	16(%rdi),%r8d
+	movl	20(%rdi),%r9d
+	movl	24(%rdi),%r10d
+	movl	28(%rdi),%r11d
+	vmovdqa	K256+512+32(%rip),%xmm8
+	vmovdqa	K256+512+64(%rip),%xmm9
+	jmp	.Lloop_avx
+.align	16
+.Lloop_avx:
+	vmovdqa	K256+512(%rip),%xmm7
+	vmovdqu	0(%rsi),%xmm0
+	vmovdqu	16(%rsi),%xmm1
+	vmovdqu	32(%rsi),%xmm2
+	vmovdqu	48(%rsi),%xmm3
+	vpshufb	%xmm7,%xmm0,%xmm0
+	leaq	K256(%rip),%rbp
+	vpshufb	%xmm7,%xmm1,%xmm1
+	vpshufb	%xmm7,%xmm2,%xmm2
+	vpaddd	0(%rbp),%xmm0,%xmm4
+	vpshufb	%xmm7,%xmm3,%xmm3
+	vpaddd	32(%rbp),%xmm1,%xmm5
+	vpaddd	64(%rbp),%xmm2,%xmm6
+	vpaddd	96(%rbp),%xmm3,%xmm7
+	vmovdqa	%xmm4,0(%rsp)
+	movl	%eax,%r14d
+	vmovdqa	%xmm5,16(%rsp)
+	movl	%ebx,%edi
+	vmovdqa	%xmm6,32(%rsp)
+	xorl	%ecx,%edi
+	vmovdqa	%xmm7,48(%rsp)
+	movl	%r8d,%r13d
+	jmp	.Lavx_00_47
+
+.align	16
+.Lavx_00_47:
+	subq	$-128,%rbp
+	vpalignr	$4,%xmm0,%xmm1,%xmm4
+	shrdl	$14,%r13d,%r13d
+	movl	%r14d,%eax
+	movl	%r9d,%r12d
+	vpalignr	$4,%xmm2,%xmm3,%xmm7
+	shrdl	$9,%r14d,%r14d
+	xorl	%r8d,%r13d
+	xorl	%r10d,%r12d
+	vpsrld	$7,%xmm4,%xmm6
+	shrdl	$5,%r13d,%r13d
+	xorl	%eax,%r14d
+	andl	%r8d,%r12d
+	vpaddd	%xmm7,%xmm0,%xmm0
+	xorl	%r8d,%r13d
+	addl	0(%rsp),%r11d
+	movl	%eax,%r15d
+	vpsrld	$3,%xmm4,%xmm7
+	xorl	%r10d,%r12d
+	shrdl	$11,%r14d,%r14d
+	xorl	%ebx,%r15d
+	vpslld	$14,%xmm4,%xmm5
+	addl	%r12d,%r11d
+	shrdl	$6,%r13d,%r13d
+	andl	%r15d,%edi
+	vpxor	%xmm6,%xmm7,%xmm4
+	xorl	%eax,%r14d
+	addl	%r13d,%r11d
+	xorl	%ebx,%edi
+	vpshufd	$250,%xmm3,%xmm7
+	shrdl	$2,%r14d,%r14d
+	addl	%r11d,%edx
+	addl	%edi,%r11d
+	vpsrld	$11,%xmm6,%xmm6
+	movl	%edx,%r13d
+	addl	%r11d,%r14d
+	shrdl	$14,%r13d,%r13d
+	vpxor	%xmm5,%xmm4,%xmm4
+	movl	%r14d,%r11d
+	movl	%r8d,%r12d
+	shrdl	$9,%r14d,%r14d
+	vpslld	$11,%xmm5,%xmm5
+	xorl	%edx,%r13d
+	xorl	%r9d,%r12d
+	shrdl	$5,%r13d,%r13d
+	vpxor	%xmm6,%xmm4,%xmm4
+	xorl	%r11d,%r14d
+	andl	%edx,%r12d
+	xorl	%edx,%r13d
+	vpsrld	$10,%xmm7,%xmm6
+	addl	4(%rsp),%r10d
+	movl	%r11d,%edi
+	xorl	%r9d,%r12d
+	vpxor	%xmm5,%xmm4,%xmm4
+	shrdl	$11,%r14d,%r14d
+	xorl	%eax,%edi
+	addl	%r12d,%r10d
+	vpsrlq	$17,%xmm7,%xmm7
+	shrdl	$6,%r13d,%r13d
+	andl	%edi,%r15d
+	xorl	%r11d,%r14d
+	vpaddd	%xmm4,%xmm0,%xmm0
+	addl	%r13d,%r10d
+	xorl	%eax,%r15d
+	shrdl	$2,%r14d,%r14d
+	vpxor	%xmm7,%xmm6,%xmm6
+	addl	%r10d,%ecx
+	addl	%r15d,%r10d
+	movl	%ecx,%r13d
+	vpsrlq	$2,%xmm7,%xmm7
+	addl	%r10d,%r14d
+	shrdl	$14,%r13d,%r13d
+	movl	%r14d,%r10d
+	vpxor	%xmm7,%xmm6,%xmm6
+	movl	%edx,%r12d
+	shrdl	$9,%r14d,%r14d
+	xorl	%ecx,%r13d
+	vpshufb	%xmm8,%xmm6,%xmm6
+	xorl	%r8d,%r12d
+	shrdl	$5,%r13d,%r13d
+	xorl	%r10d,%r14d
+	vpaddd	%xmm6,%xmm0,%xmm0
+	andl	%ecx,%r12d
+	xorl	%ecx,%r13d
+	addl	8(%rsp),%r9d
+	vpshufd	$80,%xmm0,%xmm7
+	movl	%r10d,%r15d
+	xorl	%r8d,%r12d
+	shrdl	$11,%r14d,%r14d
+	vpsrld	$10,%xmm7,%xmm6
+	xorl	%r11d,%r15d
+	addl	%r12d,%r9d
+	shrdl	$6,%r13d,%r13d
+	vpsrlq	$17,%xmm7,%xmm7
+	andl	%r15d,%edi
+	xorl	%r10d,%r14d
+	addl	%r13d,%r9d
+	vpxor	%xmm7,%xmm6,%xmm6
+	xorl	%r11d,%edi
+	shrdl	$2,%r14d,%r14d
+	addl	%r9d,%ebx
+	vpsrlq	$2,%xmm7,%xmm7
+	addl	%edi,%r9d
+	movl	%ebx,%r13d
+	addl	%r9d,%r14d
+	vpxor	%xmm7,%xmm6,%xmm6
+	shrdl	$14,%r13d,%r13d
+	movl	%r14d,%r9d
+	movl	%ecx,%r12d
+	vpshufb	%xmm9,%xmm6,%xmm6
+	shrdl	$9,%r14d,%r14d
+	xorl	%ebx,%r13d
+	xorl	%edx,%r12d
+	vpaddd	%xmm6,%xmm0,%xmm0
+	shrdl	$5,%r13d,%r13d
+	xorl	%r9d,%r14d
+	andl	%ebx,%r12d
+	vpaddd	0(%rbp),%xmm0,%xmm6
+	xorl	%ebx,%r13d
+	addl	12(%rsp),%r8d
+	movl	%r9d,%edi
+	xorl	%edx,%r12d
+	shrdl	$11,%r14d,%r14d
+	xorl	%r10d,%edi
+	addl	%r12d,%r8d
+	shrdl	$6,%r13d,%r13d
+	andl	%edi,%r15d
+	xorl	%r9d,%r14d
+	addl	%r13d,%r8d
+	xorl	%r10d,%r15d
+	shrdl	$2,%r14d,%r14d
+	addl	%r8d,%eax
+	addl	%r15d,%r8d
+	movl	%eax,%r13d
+	addl	%r8d,%r14d
+	vmovdqa	%xmm6,0(%rsp)
+	vpalignr	$4,%xmm1,%xmm2,%xmm4
+	shrdl	$14,%r13d,%r13d
+	movl	%r14d,%r8d
+	movl	%ebx,%r12d
+	vpalignr	$4,%xmm3,%xmm0,%xmm7
+	shrdl	$9,%r14d,%r14d
+	xorl	%eax,%r13d
+	xorl	%ecx,%r12d
+	vpsrld	$7,%xmm4,%xmm6
+	shrdl	$5,%r13d,%r13d
+	xorl	%r8d,%r14d
+	andl	%eax,%r12d
+	vpaddd	%xmm7,%xmm1,%xmm1
+	xorl	%eax,%r13d
+	addl	16(%rsp),%edx
+	movl	%r8d,%r15d
+	vpsrld	$3,%xmm4,%xmm7
+	xorl	%ecx,%r12d
+	shrdl	$11,%r14d,%r14d
+	xorl	%r9d,%r15d
+	vpslld	$14,%xmm4,%xmm5
+	addl	%r12d,%edx
+	shrdl	$6,%r13d,%r13d
+	andl	%r15d,%edi
+	vpxor	%xmm6,%xmm7,%xmm4
+	xorl	%r8d,%r14d
+	addl	%r13d,%edx
+	xorl	%r9d,%edi
+	vpshufd	$250,%xmm0,%xmm7
+	shrdl	$2,%r14d,%r14d
+	addl	%edx,%r11d
+	addl	%edi,%edx
+	vpsrld	$11,%xmm6,%xmm6
+	movl	%r11d,%r13d
+	addl	%edx,%r14d
+	shrdl	$14,%r13d,%r13d
+	vpxor	%xmm5,%xmm4,%xmm4
+	movl	%r14d,%edx
+	movl	%eax,%r12d
+	shrdl	$9,%r14d,%r14d
+	vpslld	$11,%xmm5,%xmm5
+	xorl	%r11d,%r13d
+	xorl	%ebx,%r12d
+	shrdl	$5,%r13d,%r13d
+	vpxor	%xmm6,%xmm4,%xmm4
+	xorl	%edx,%r14d
+	andl	%r11d,%r12d
+	xorl	%r11d,%r13d
+	vpsrld	$10,%xmm7,%xmm6
+	addl	20(%rsp),%ecx
+	movl	%edx,%edi
+	xorl	%ebx,%r12d
+	vpxor	%xmm5,%xmm4,%xmm4
+	shrdl	$11,%r14d,%r14d
+	xorl	%r8d,%edi
+	addl	%r12d,%ecx
+	vpsrlq	$17,%xmm7,%xmm7
+	shrdl	$6,%r13d,%r13d
+	andl	%edi,%r15d
+	xorl	%edx,%r14d
+	vpaddd	%xmm4,%xmm1,%xmm1
+	addl	%r13d,%ecx
+	xorl	%r8d,%r15d
+	shrdl	$2,%r14d,%r14d
+	vpxor	%xmm7,%xmm6,%xmm6
+	addl	%ecx,%r10d
+	addl	%r15d,%ecx
+	movl	%r10d,%r13d
+	vpsrlq	$2,%xmm7,%xmm7
+	addl	%ecx,%r14d
+	shrdl	$14,%r13d,%r13d
+	movl	%r14d,%ecx
+	vpxor	%xmm7,%xmm6,%xmm6
+	movl	%r11d,%r12d
+	shrdl	$9,%r14d,%r14d
+	xorl	%r10d,%r13d
+	vpshufb	%xmm8,%xmm6,%xmm6
+	xorl	%eax,%r12d
+	shrdl	$5,%r13d,%r13d
+	xorl	%ecx,%r14d
+	vpaddd	%xmm6,%xmm1,%xmm1
+	andl	%r10d,%r12d
+	xorl	%r10d,%r13d
+	addl	24(%rsp),%ebx
+	vpshufd	$80,%xmm1,%xmm7
+	movl	%ecx,%r15d
+	xorl	%eax,%r12d
+	shrdl	$11,%r14d,%r14d
+	vpsrld	$10,%xmm7,%xmm6
+	xorl	%edx,%r15d
+	addl	%r12d,%ebx
+	shrdl	$6,%r13d,%r13d
+	vpsrlq	$17,%xmm7,%xmm7
+	andl	%r15d,%edi
+	xorl	%ecx,%r14d
+	addl	%r13d,%ebx
+	vpxor	%xmm7,%xmm6,%xmm6
+	xorl	%edx,%edi
+	shrdl	$2,%r14d,%r14d
+	addl	%ebx,%r9d
+	vpsrlq	$2,%xmm7,%xmm7
+	addl	%edi,%ebx
+	movl	%r9d,%r13d
+	addl	%ebx,%r14d
+	vpxor	%xmm7,%xmm6,%xmm6
+	shrdl	$14,%r13d,%r13d
+	movl	%r14d,%ebx
+	movl	%r10d,%r12d
+	vpshufb	%xmm9,%xmm6,%xmm6
+	shrdl	$9,%r14d,%r14d
+	xorl	%r9d,%r13d
+	xorl	%r11d,%r12d
+	vpaddd	%xmm6,%xmm1,%xmm1
+	shrdl	$5,%r13d,%r13d
+	xorl	%ebx,%r14d
+	andl	%r9d,%r12d
+	vpaddd	32(%rbp),%xmm1,%xmm6
+	xorl	%r9d,%r13d
+	addl	28(%rsp),%eax
+	movl	%ebx,%edi
+	xorl	%r11d,%r12d
+	shrdl	$11,%r14d,%r14d
+	xorl	%ecx,%edi
+	addl	%r12d,%eax
+	shrdl	$6,%r13d,%r13d
+	andl	%edi,%r15d
+	xorl	%ebx,%r14d
+	addl	%r13d,%eax
+	xorl	%ecx,%r15d
+	shrdl	$2,%r14d,%r14d
+	addl	%eax,%r8d
+	addl	%r15d,%eax
+	movl	%r8d,%r13d
+	addl	%eax,%r14d
+	vmovdqa	%xmm6,16(%rsp)
+	vpalignr	$4,%xmm2,%xmm3,%xmm4
+	shrdl	$14,%r13d,%r13d
+	movl	%r14d,%eax
+	movl	%r9d,%r12d
+	vpalignr	$4,%xmm0,%xmm1,%xmm7
+	shrdl	$9,%r14d,%r14d
+	xorl	%r8d,%r13d
+	xorl	%r10d,%r12d
+	vpsrld	$7,%xmm4,%xmm6
+	shrdl	$5,%r13d,%r13d
+	xorl	%eax,%r14d
+	andl	%r8d,%r12d
+	vpaddd	%xmm7,%xmm2,%xmm2
+	xorl	%r8d,%r13d
+	addl	32(%rsp),%r11d
+	movl	%eax,%r15d
+	vpsrld	$3,%xmm4,%xmm7
+	xorl	%r10d,%r12d
+	shrdl	$11,%r14d,%r14d
+	xorl	%ebx,%r15d
+	vpslld	$14,%xmm4,%xmm5
+	addl	%r12d,%r11d
+	shrdl	$6,%r13d,%r13d
+	andl	%r15d,%edi
+	vpxor	%xmm6,%xmm7,%xmm4
+	xorl	%eax,%r14d
+	addl	%r13d,%r11d
+	xorl	%ebx,%edi
+	vpshufd	$250,%xmm1,%xmm7
+	shrdl	$2,%r14d,%r14d
+	addl	%r11d,%edx
+	addl	%edi,%r11d
+	vpsrld	$11,%xmm6,%xmm6
+	movl	%edx,%r13d
+	addl	%r11d,%r14d
+	shrdl	$14,%r13d,%r13d
+	vpxor	%xmm5,%xmm4,%xmm4
+	movl	%r14d,%r11d
+	movl	%r8d,%r12d
+	shrdl	$9,%r14d,%r14d
+	vpslld	$11,%xmm5,%xmm5
+	xorl	%edx,%r13d
+	xorl	%r9d,%r12d
+	shrdl	$5,%r13d,%r13d
+	vpxor	%xmm6,%xmm4,%xmm4
+	xorl	%r11d,%r14d
+	andl	%edx,%r12d
+	xorl	%edx,%r13d
+	vpsrld	$10,%xmm7,%xmm6
+	addl	36(%rsp),%r10d
+	movl	%r11d,%edi
+	xorl	%r9d,%r12d
+	vpxor	%xmm5,%xmm4,%xmm4
+	shrdl	$11,%r14d,%r14d
+	xorl	%eax,%edi
+	addl	%r12d,%r10d
+	vpsrlq	$17,%xmm7,%xmm7
+	shrdl	$6,%r13d,%r13d
+	andl	%edi,%r15d
+	xorl	%r11d,%r14d
+	vpaddd	%xmm4,%xmm2,%xmm2
+	addl	%r13d,%r10d
+	xorl	%eax,%r15d
+	shrdl	$2,%r14d,%r14d
+	vpxor	%xmm7,%xmm6,%xmm6
+	addl	%r10d,%ecx
+	addl	%r15d,%r10d
+	movl	%ecx,%r13d
+	vpsrlq	$2,%xmm7,%xmm7
+	addl	%r10d,%r14d
+	shrdl	$14,%r13d,%r13d
+	movl	%r14d,%r10d
+	vpxor	%xmm7,%xmm6,%xmm6
+	movl	%edx,%r12d
+	shrdl	$9,%r14d,%r14d
+	xorl	%ecx,%r13d
+	vpshufb	%xmm8,%xmm6,%xmm6
+	xorl	%r8d,%r12d
+	shrdl	$5,%r13d,%r13d
+	xorl	%r10d,%r14d
+	vpaddd	%xmm6,%xmm2,%xmm2
+	andl	%ecx,%r12d
+	xorl	%ecx,%r13d
+	addl	40(%rsp),%r9d
+	vpshufd	$80,%xmm2,%xmm7
+	movl	%r10d,%r15d
+	xorl	%r8d,%r12d
+	shrdl	$11,%r14d,%r14d
+	vpsrld	$10,%xmm7,%xmm6
+	xorl	%r11d,%r15d
+	addl	%r12d,%r9d
+	shrdl	$6,%r13d,%r13d
+	vpsrlq	$17,%xmm7,%xmm7
+	andl	%r15d,%edi
+	xorl	%r10d,%r14d
+	addl	%r13d,%r9d
+	vpxor	%xmm7,%xmm6,%xmm6
+	xorl	%r11d,%edi
+	shrdl	$2,%r14d,%r14d
+	addl	%r9d,%ebx
+	vpsrlq	$2,%xmm7,%xmm7
+	addl	%edi,%r9d
+	movl	%ebx,%r13d
+	addl	%r9d,%r14d
+	vpxor	%xmm7,%xmm6,%xmm6
+	shrdl	$14,%r13d,%r13d
+	movl	%r14d,%r9d
+	movl	%ecx,%r12d
+	vpshufb	%xmm9,%xmm6,%xmm6
+	shrdl	$9,%r14d,%r14d
+	xorl	%ebx,%r13d
+	xorl	%edx,%r12d
+	vpaddd	%xmm6,%xmm2,%xmm2
+	shrdl	$5,%r13d,%r13d
+	xorl	%r9d,%r14d
+	andl	%ebx,%r12d
+	vpaddd	64(%rbp),%xmm2,%xmm6
+	xorl	%ebx,%r13d
+	addl	44(%rsp),%r8d
+	movl	%r9d,%edi
+	xorl	%edx,%r12d
+	shrdl	$11,%r14d,%r14d
+	xorl	%r10d,%edi
+	addl	%r12d,%r8d
+	shrdl	$6,%r13d,%r13d
+	andl	%edi,%r15d
+	xorl	%r9d,%r14d
+	addl	%r13d,%r8d
+	xorl	%r10d,%r15d
+	shrdl	$2,%r14d,%r14d
+	addl	%r8d,%eax
+	addl	%r15d,%r8d
+	movl	%eax,%r13d
+	addl	%r8d,%r14d
+	vmovdqa	%xmm6,32(%rsp)
+	vpalignr	$4,%xmm3,%xmm0,%xmm4
+	shrdl	$14,%r13d,%r13d
+	movl	%r14d,%r8d
+	movl	%ebx,%r12d
+	vpalignr	$4,%xmm1,%xmm2,%xmm7
+	shrdl	$9,%r14d,%r14d
+	xorl	%eax,%r13d
+	xorl	%ecx,%r12d
+	vpsrld	$7,%xmm4,%xmm6
+	shrdl	$5,%r13d,%r13d
+	xorl	%r8d,%r14d
+	andl	%eax,%r12d
+	vpaddd	%xmm7,%xmm3,%xmm3
+	xorl	%eax,%r13d
+	addl	48(%rsp),%edx
+	movl	%r8d,%r15d
+	vpsrld	$3,%xmm4,%xmm7
+	xorl	%ecx,%r12d
+	shrdl	$11,%r14d,%r14d
+	xorl	%r9d,%r15d
+	vpslld	$14,%xmm4,%xmm5
+	addl	%r12d,%edx
+	shrdl	$6,%r13d,%r13d
+	andl	%r15d,%edi
+	vpxor	%xmm6,%xmm7,%xmm4
+	xorl	%r8d,%r14d
+	addl	%r13d,%edx
+	xorl	%r9d,%edi
+	vpshufd	$250,%xmm2,%xmm7
+	shrdl	$2,%r14d,%r14d
+	addl	%edx,%r11d
+	addl	%edi,%edx
+	vpsrld	$11,%xmm6,%xmm6
+	movl	%r11d,%r13d
+	addl	%edx,%r14d
+	shrdl	$14,%r13d,%r13d
+	vpxor	%xmm5,%xmm4,%xmm4
+	movl	%r14d,%edx
+	movl	%eax,%r12d
+	shrdl	$9,%r14d,%r14d
+	vpslld	$11,%xmm5,%xmm5
+	xorl	%r11d,%r13d
+	xorl	%ebx,%r12d
+	shrdl	$5,%r13d,%r13d
+	vpxor	%xmm6,%xmm4,%xmm4
+	xorl	%edx,%r14d
+	andl	%r11d,%r12d
+	xorl	%r11d,%r13d
+	vpsrld	$10,%xmm7,%xmm6
+	addl	52(%rsp),%ecx
+	movl	%edx,%edi
+	xorl	%ebx,%r12d
+	vpxor	%xmm5,%xmm4,%xmm4
+	shrdl	$11,%r14d,%r14d
+	xorl	%r8d,%edi
+	addl	%r12d,%ecx
+	vpsrlq	$17,%xmm7,%xmm7
+	shrdl	$6,%r13d,%r13d
+	andl	%edi,%r15d
+	xorl	%edx,%r14d
+	vpaddd	%xmm4,%xmm3,%xmm3
+	addl	%r13d,%ecx
+	xorl	%r8d,%r15d
+	shrdl	$2,%r14d,%r14d
+	vpxor	%xmm7,%xmm6,%xmm6
+	addl	%ecx,%r10d
+	addl	%r15d,%ecx
+	movl	%r10d,%r13d
+	vpsrlq	$2,%xmm7,%xmm7
+	addl	%ecx,%r14d
+	shrdl	$14,%r13d,%r13d
+	movl	%r14d,%ecx
+	vpxor	%xmm7,%xmm6,%xmm6
+	movl	%r11d,%r12d
+	shrdl	$9,%r14d,%r14d
+	xorl	%r10d,%r13d
+	vpshufb	%xmm8,%xmm6,%xmm6
+	xorl	%eax,%r12d
+	shrdl	$5,%r13d,%r13d
+	xorl	%ecx,%r14d
+	vpaddd	%xmm6,%xmm3,%xmm3
+	andl	%r10d,%r12d
+	xorl	%r10d,%r13d
+	addl	56(%rsp),%ebx
+	vpshufd	$80,%xmm3,%xmm7
+	movl	%ecx,%r15d
+	xorl	%eax,%r12d
+	shrdl	$11,%r14d,%r14d
+	vpsrld	$10,%xmm7,%xmm6
+	xorl	%edx,%r15d
+	addl	%r12d,%ebx
+	shrdl	$6,%r13d,%r13d
+	vpsrlq	$17,%xmm7,%xmm7
+	andl	%r15d,%edi
+	xorl	%ecx,%r14d
+	addl	%r13d,%ebx
+	vpxor	%xmm7,%xmm6,%xmm6
+	xorl	%edx,%edi
+	shrdl	$2,%r14d,%r14d
+	addl	%ebx,%r9d
+	vpsrlq	$2,%xmm7,%xmm7
+	addl	%edi,%ebx
+	movl	%r9d,%r13d
+	addl	%ebx,%r14d
+	vpxor	%xmm7,%xmm6,%xmm6
+	shrdl	$14,%r13d,%r13d
+	movl	%r14d,%ebx
+	movl	%r10d,%r12d
+	vpshufb	%xmm9,%xmm6,%xmm6
+	shrdl	$9,%r14d,%r14d
+	xorl	%r9d,%r13d
+	xorl	%r11d,%r12d
+	vpaddd	%xmm6,%xmm3,%xmm3
+	shrdl	$5,%r13d,%r13d
+	xorl	%ebx,%r14d
+	andl	%r9d,%r12d
+	vpaddd	96(%rbp),%xmm3,%xmm6
+	xorl	%r9d,%r13d
+	addl	60(%rsp),%eax
+	movl	%ebx,%edi
+	xorl	%r11d,%r12d
+	shrdl	$11,%r14d,%r14d
+	xorl	%ecx,%edi
+	addl	%r12d,%eax
+	shrdl	$6,%r13d,%r13d
+	andl	%edi,%r15d
+	xorl	%ebx,%r14d
+	addl	%r13d,%eax
+	xorl	%ecx,%r15d
+	shrdl	$2,%r14d,%r14d
+	addl	%eax,%r8d
+	addl	%r15d,%eax
+	movl	%r8d,%r13d
+	addl	%eax,%r14d
+	vmovdqa	%xmm6,48(%rsp)
+	cmpb	$0,131(%rbp)
+	jne	.Lavx_00_47
+	shrdl	$14,%r13d,%r13d
+	movl	%r14d,%eax
+	movl	%r9d,%r12d
+	shrdl	$9,%r14d,%r14d
+	xorl	%r8d,%r13d
+	xorl	%r10d,%r12d
+	shrdl	$5,%r13d,%r13d
+	xorl	%eax,%r14d
+	andl	%r8d,%r12d
+	xorl	%r8d,%r13d
+	addl	0(%rsp),%r11d
+	movl	%eax,%r15d
+	xorl	%r10d,%r12d
+	shrdl	$11,%r14d,%r14d
+	xorl	%ebx,%r15d
+	addl	%r12d,%r11d
+	shrdl	$6,%r13d,%r13d
+	andl	%r15d,%edi
+	xorl	%eax,%r14d
+	addl	%r13d,%r11d
+	xorl	%ebx,%edi
+	shrdl	$2,%r14d,%r14d
+	addl	%r11d,%edx
+	addl	%edi,%r11d
+	movl	%edx,%r13d
+	addl	%r11d,%r14d
+	shrdl	$14,%r13d,%r13d
+	movl	%r14d,%r11d
+	movl	%r8d,%r12d
+	shrdl	$9,%r14d,%r14d
+	xorl	%edx,%r13d
+	xorl	%r9d,%r12d
+	shrdl	$5,%r13d,%r13d
+	xorl	%r11d,%r14d
+	andl	%edx,%r12d
+	xorl	%edx,%r13d
+	addl	4(%rsp),%r10d
+	movl	%r11d,%edi
+	xorl	%r9d,%r12d
+	shrdl	$11,%r14d,%r14d
+	xorl	%eax,%edi
+	addl	%r12d,%r10d
+	shrdl	$6,%r13d,%r13d
+	andl	%edi,%r15d
+	xorl	%r11d,%r14d
+	addl	%r13d,%r10d
+	xorl	%eax,%r15d
+	shrdl	$2,%r14d,%r14d
+	addl	%r10d,%ecx
+	addl	%r15d,%r10d
+	movl	%ecx,%r13d
+	addl	%r10d,%r14d
+	shrdl	$14,%r13d,%r13d
+	movl	%r14d,%r10d
+	movl	%edx,%r12d
+	shrdl	$9,%r14d,%r14d
+	xorl	%ecx,%r13d
+	xorl	%r8d,%r12d
+	shrdl	$5,%r13d,%r13d
+	xorl	%r10d,%r14d
+	andl	%ecx,%r12d
+	xorl	%ecx,%r13d
+	addl	8(%rsp),%r9d
+	movl	%r10d,%r15d
+	xorl	%r8d,%r12d
+	shrdl	$11,%r14d,%r14d
+	xorl	%r11d,%r15d
+	addl	%r12d,%r9d
+	shrdl	$6,%r13d,%r13d
+	andl	%r15d,%edi
+	xorl	%r10d,%r14d
+	addl	%r13d,%r9d
+	xorl	%r11d,%edi
+	shrdl	$2,%r14d,%r14d
+	addl	%r9d,%ebx
+	addl	%edi,%r9d
+	movl	%ebx,%r13d
+	addl	%r9d,%r14d
+	shrdl	$14,%r13d,%r13d
+	movl	%r14d,%r9d
+	movl	%ecx,%r12d
+	shrdl	$9,%r14d,%r14d
+	xorl	%ebx,%r13d
+	xorl	%edx,%r12d
+	shrdl	$5,%r13d,%r13d
+	xorl	%r9d,%r14d
+	andl	%ebx,%r12d
+	xorl	%ebx,%r13d
+	addl	12(%rsp),%r8d
+	movl	%r9d,%edi
+	xorl	%edx,%r12d
+	shrdl	$11,%r14d,%r14d
+	xorl	%r10d,%edi
+	addl	%r12d,%r8d
+	shrdl	$6,%r13d,%r13d
+	andl	%edi,%r15d
+	xorl	%r9d,%r14d
+	addl	%r13d,%r8d
+	xorl	%r10d,%r15d
+	shrdl	$2,%r14d,%r14d
+	addl	%r8d,%eax
+	addl	%r15d,%r8d
+	movl	%eax,%r13d
+	addl	%r8d,%r14d
+	shrdl	$14,%r13d,%r13d
+	movl	%r14d,%r8d
+	movl	%ebx,%r12d
+	shrdl	$9,%r14d,%r14d
+	xorl	%eax,%r13d
+	xorl	%ecx,%r12d
+	shrdl	$5,%r13d,%r13d
+	xorl	%r8d,%r14d
+	andl	%eax,%r12d
+	xorl	%eax,%r13d
+	addl	16(%rsp),%edx
+	movl	%r8d,%r15d
+	xorl	%ecx,%r12d
+	shrdl	$11,%r14d,%r14d
+	xorl	%r9d,%r15d
+	addl	%r12d,%edx
+	shrdl	$6,%r13d,%r13d
+	andl	%r15d,%edi
+	xorl	%r8d,%r14d
+	addl	%r13d,%edx
+	xorl	%r9d,%edi
+	shrdl	$2,%r14d,%r14d
+	addl	%edx,%r11d
+	addl	%edi,%edx
+	movl	%r11d,%r13d
+	addl	%edx,%r14d
+	shrdl	$14,%r13d,%r13d
+	movl	%r14d,%edx
+	movl	%eax,%r12d
+	shrdl	$9,%r14d,%r14d
+	xorl	%r11d,%r13d
+	xorl	%ebx,%r12d
+	shrdl	$5,%r13d,%r13d
+	xorl	%edx,%r14d
+	andl	%r11d,%r12d
+	xorl	%r11d,%r13d
+	addl	20(%rsp),%ecx
+	movl	%edx,%edi
+	xorl	%ebx,%r12d
+	shrdl	$11,%r14d,%r14d
+	xorl	%r8d,%edi
+	addl	%r12d,%ecx
+	shrdl	$6,%r13d,%r13d
+	andl	%edi,%r15d
+	xorl	%edx,%r14d
+	addl	%r13d,%ecx
+	xorl	%r8d,%r15d
+	shrdl	$2,%r14d,%r14d
+	addl	%ecx,%r10d
+	addl	%r15d,%ecx
+	movl	%r10d,%r13d
+	addl	%ecx,%r14d
+	shrdl	$14,%r13d,%r13d
+	movl	%r14d,%ecx
+	movl	%r11d,%r12d
+	shrdl	$9,%r14d,%r14d
+	xorl	%r10d,%r13d
+	xorl	%eax,%r12d
+	shrdl	$5,%r13d,%r13d
+	xorl	%ecx,%r14d
+	andl	%r10d,%r12d
+	xorl	%r10d,%r13d
+	addl	24(%rsp),%ebx
+	movl	%ecx,%r15d
+	xorl	%eax,%r12d
+	shrdl	$11,%r14d,%r14d
+	xorl	%edx,%r15d
+	addl	%r12d,%ebx
+	shrdl	$6,%r13d,%r13d
+	andl	%r15d,%edi
+	xorl	%ecx,%r14d
+	addl	%r13d,%ebx
+	xorl	%edx,%edi
+	shrdl	$2,%r14d,%r14d
+	addl	%ebx,%r9d
+	addl	%edi,%ebx
+	movl	%r9d,%r13d
+	addl	%ebx,%r14d
+	shrdl	$14,%r13d,%r13d
+	movl	%r14d,%ebx
+	movl	%r10d,%r12d
+	shrdl	$9,%r14d,%r14d
+	xorl	%r9d,%r13d
+	xorl	%r11d,%r12d
+	shrdl	$5,%r13d,%r13d
+	xorl	%ebx,%r14d
+	andl	%r9d,%r12d
+	xorl	%r9d,%r13d
+	addl	28(%rsp),%eax
+	movl	%ebx,%edi
+	xorl	%r11d,%r12d
+	shrdl	$11,%r14d,%r14d
+	xorl	%ecx,%edi
+	addl	%r12d,%eax
+	shrdl	$6,%r13d,%r13d
+	andl	%edi,%r15d
+	xorl	%ebx,%r14d
+	addl	%r13d,%eax
+	xorl	%ecx,%r15d
+	shrdl	$2,%r14d,%r14d
+	addl	%eax,%r8d
+	addl	%r15d,%eax
+	movl	%r8d,%r13d
+	addl	%eax,%r14d
+	shrdl	$14,%r13d,%r13d
+	movl	%r14d,%eax
+	movl	%r9d,%r12d
+	shrdl	$9,%r14d,%r14d
+	xorl	%r8d,%r13d
+	xorl	%r10d,%r12d
+	shrdl	$5,%r13d,%r13d
+	xorl	%eax,%r14d
+	andl	%r8d,%r12d
+	xorl	%r8d,%r13d
+	addl	32(%rsp),%r11d
+	movl	%eax,%r15d
+	xorl	%r10d,%r12d
+	shrdl	$11,%r14d,%r14d
+	xorl	%ebx,%r15d
+	addl	%r12d,%r11d
+	shrdl	$6,%r13d,%r13d
+	andl	%r15d,%edi
+	xorl	%eax,%r14d
+	addl	%r13d,%r11d
+	xorl	%ebx,%edi
+	shrdl	$2,%r14d,%r14d
+	addl	%r11d,%edx
+	addl	%edi,%r11d
+	movl	%edx,%r13d
+	addl	%r11d,%r14d
+	shrdl	$14,%r13d,%r13d
+	movl	%r14d,%r11d
+	movl	%r8d,%r12d
+	shrdl	$9,%r14d,%r14d
+	xorl	%edx,%r13d
+	xorl	%r9d,%r12d
+	shrdl	$5,%r13d,%r13d
+	xorl	%r11d,%r14d
+	andl	%edx,%r12d
+	xorl	%edx,%r13d
+	addl	36(%rsp),%r10d
+	movl	%r11d,%edi
+	xorl	%r9d,%r12d
+	shrdl	$11,%r14d,%r14d
+	xorl	%eax,%edi
+	addl	%r12d,%r10d
+	shrdl	$6,%r13d,%r13d
+	andl	%edi,%r15d
+	xorl	%r11d,%r14d
+	addl	%r13d,%r10d
+	xorl	%eax,%r15d
+	shrdl	$2,%r14d,%r14d
+	addl	%r10d,%ecx
+	addl	%r15d,%r10d
+	movl	%ecx,%r13d
+	addl	%r10d,%r14d
+	shrdl	$14,%r13d,%r13d
+	movl	%r14d,%r10d
+	movl	%edx,%r12d
+	shrdl	$9,%r14d,%r14d
+	xorl	%ecx,%r13d
+	xorl	%r8d,%r12d
+	shrdl	$5,%r13d,%r13d
+	xorl	%r10d,%r14d
+	andl	%ecx,%r12d
+	xorl	%ecx,%r13d
+	addl	40(%rsp),%r9d
+	movl	%r10d,%r15d
+	xorl	%r8d,%r12d
+	shrdl	$11,%r14d,%r14d
+	xorl	%r11d,%r15d
+	addl	%r12d,%r9d
+	shrdl	$6,%r13d,%r13d
+	andl	%r15d,%edi
+	xorl	%r10d,%r14d
+	addl	%r13d,%r9d
+	xorl	%r11d,%edi
+	shrdl	$2,%r14d,%r14d
+	addl	%r9d,%ebx
+	addl	%edi,%r9d
+	movl	%ebx,%r13d
+	addl	%r9d,%r14d
+	shrdl	$14,%r13d,%r13d
+	movl	%r14d,%r9d
+	movl	%ecx,%r12d
+	shrdl	$9,%r14d,%r14d
+	xorl	%ebx,%r13d
+	xorl	%edx,%r12d
+	shrdl	$5,%r13d,%r13d
+	xorl	%r9d,%r14d
+	andl	%ebx,%r12d
+	xorl	%ebx,%r13d
+	addl	44(%rsp),%r8d
+	movl	%r9d,%edi
+	xorl	%edx,%r12d
+	shrdl	$11,%r14d,%r14d
+	xorl	%r10d,%edi
+	addl	%r12d,%r8d
+	shrdl	$6,%r13d,%r13d
+	andl	%edi,%r15d
+	xorl	%r9d,%r14d
+	addl	%r13d,%r8d
+	xorl	%r10d,%r15d
+	shrdl	$2,%r14d,%r14d
+	addl	%r8d,%eax
+	addl	%r15d,%r8d
+	movl	%eax,%r13d
+	addl	%r8d,%r14d
+	shrdl	$14,%r13d,%r13d
+	movl	%r14d,%r8d
+	movl	%ebx,%r12d
+	shrdl	$9,%r14d,%r14d
+	xorl	%eax,%r13d
+	xorl	%ecx,%r12d
+	shrdl	$5,%r13d,%r13d
+	xorl	%r8d,%r14d
+	andl	%eax,%r12d
+	xorl	%eax,%r13d
+	addl	48(%rsp),%edx
+	movl	%r8d,%r15d
+	xorl	%ecx,%r12d
+	shrdl	$11,%r14d,%r14d
+	xorl	%r9d,%r15d
+	addl	%r12d,%edx
+	shrdl	$6,%r13d,%r13d
+	andl	%r15d,%edi
+	xorl	%r8d,%r14d
+	addl	%r13d,%edx
+	xorl	%r9d,%edi
+	shrdl	$2,%r14d,%r14d
+	addl	%edx,%r11d
+	addl	%edi,%edx
+	movl	%r11d,%r13d
+	addl	%edx,%r14d
+	shrdl	$14,%r13d,%r13d
+	movl	%r14d,%edx
+	movl	%eax,%r12d
+	shrdl	$9,%r14d,%r14d
+	xorl	%r11d,%r13d
+	xorl	%ebx,%r12d
+	shrdl	$5,%r13d,%r13d
+	xorl	%edx,%r14d
+	andl	%r11d,%r12d
+	xorl	%r11d,%r13d
+	addl	52(%rsp),%ecx
+	movl	%edx,%edi
+	xorl	%ebx,%r12d
+	shrdl	$11,%r14d,%r14d
+	xorl	%r8d,%edi
+	addl	%r12d,%ecx
+	shrdl	$6,%r13d,%r13d
+	andl	%edi,%r15d
+	xorl	%edx,%r14d
+	addl	%r13d,%ecx
+	xorl	%r8d,%r15d
+	shrdl	$2,%r14d,%r14d
+	addl	%ecx,%r10d
+	addl	%r15d,%ecx
+	movl	%r10d,%r13d
+	addl	%ecx,%r14d
+	shrdl	$14,%r13d,%r13d
+	movl	%r14d,%ecx
+	movl	%r11d,%r12d
+	shrdl	$9,%r14d,%r14d
+	xorl	%r10d,%r13d
+	xorl	%eax,%r12d
+	shrdl	$5,%r13d,%r13d
+	xorl	%ecx,%r14d
+	andl	%r10d,%r12d
+	xorl	%r10d,%r13d
+	addl	56(%rsp),%ebx
+	movl	%ecx,%r15d
+	xorl	%eax,%r12d
+	shrdl	$11,%r14d,%r14d
+	xorl	%edx,%r15d
+	addl	%r12d,%ebx
+	shrdl	$6,%r13d,%r13d
+	andl	%r15d,%edi
+	xorl	%ecx,%r14d
+	addl	%r13d,%ebx
+	xorl	%edx,%edi
+	shrdl	$2,%r14d,%r14d
+	addl	%ebx,%r9d
+	addl	%edi,%ebx
+	movl	%r9d,%r13d
+	addl	%ebx,%r14d
+	shrdl	$14,%r13d,%r13d
+	movl	%r14d,%ebx
+	movl	%r10d,%r12d
+	shrdl	$9,%r14d,%r14d
+	xorl	%r9d,%r13d
+	xorl	%r11d,%r12d
+	shrdl	$5,%r13d,%r13d
+	xorl	%ebx,%r14d
+	andl	%r9d,%r12d
+	xorl	%r9d,%r13d
+	addl	60(%rsp),%eax
+	movl	%ebx,%edi
+	xorl	%r11d,%r12d
+	shrdl	$11,%r14d,%r14d
+	xorl	%ecx,%edi
+	addl	%r12d,%eax
+	shrdl	$6,%r13d,%r13d
+	andl	%edi,%r15d
+	xorl	%ebx,%r14d
+	addl	%r13d,%eax
+	xorl	%ecx,%r15d
+	shrdl	$2,%r14d,%r14d
+	addl	%eax,%r8d
+	addl	%r15d,%eax
+	movl	%r8d,%r13d
+	addl	%eax,%r14d
+	movq	64+0(%rsp),%rdi
+	movl	%r14d,%eax
+
+	addl	0(%rdi),%eax
+	leaq	64(%rsi),%rsi
+	addl	4(%rdi),%ebx
+	addl	8(%rdi),%ecx
+	addl	12(%rdi),%edx
+	addl	16(%rdi),%r8d
+	addl	20(%rdi),%r9d
+	addl	24(%rdi),%r10d
+	addl	28(%rdi),%r11d
+
+	cmpq	64+16(%rsp),%rsi
+
+	movl	%eax,0(%rdi)
+	movl	%ebx,4(%rdi)
+	movl	%ecx,8(%rdi)
+	movl	%edx,12(%rdi)
+	movl	%r8d,16(%rdi)
+	movl	%r9d,20(%rdi)
+	movl	%r10d,24(%rdi)
+	movl	%r11d,28(%rdi)
+	jb	.Lloop_avx
+
+	movq	88(%rsp),%rsi
+.cfi_def_cfa	%rsi,8
+	vzeroupper
+	movq	-48(%rsi),%r15
+.cfi_restore	%r15
+	movq	-40(%rsi),%r14
+.cfi_restore	%r14
+	movq	-32(%rsi),%r13
+.cfi_restore	%r13
+	movq	-24(%rsi),%r12
+.cfi_restore	%r12
+	movq	-16(%rsi),%rbp
+.cfi_restore	%rbp
+	movq	-8(%rsi),%rbx
+.cfi_restore	%rbx
+	leaq	(%rsi),%rsp
+.cfi_def_cfa_register	%rsp
+.Lepilogue_avx:
+	.byte	0xf3,0xc3
+.cfi_endproc	
+.size	sha256_block_data_order_avx,.-sha256_block_data_order_avx
+#endif
+.section	.note.GNU-stack,"",@progbits
--- /dev/null
+++ b/third_party/boringssl/linux-x86_64/crypto/fipsmodule/sha512-x86_64.S
@@ -1,0 +1,2992 @@
+// This file is generated from a similarly-named Perl script in the BoringSSL
+// source tree. Do not edit by hand.
+
+#if defined(__has_feature)
+#if __has_feature(memory_sanitizer) && !defined(OPENSSL_NO_ASM)
+#define OPENSSL_NO_ASM
+#endif
+#endif
+
+#if defined(__x86_64__) && !defined(OPENSSL_NO_ASM)
+#if defined(BORINGSSL_PREFIX)
+#include <boringssl_prefix_symbols_asm.h>
+#endif
+.text	
+
+.extern	OPENSSL_ia32cap_P
+.hidden OPENSSL_ia32cap_P
+.globl	sha512_block_data_order
+.hidden sha512_block_data_order
+.type	sha512_block_data_order,@function
+.align	16
+sha512_block_data_order:
+.cfi_startproc	
+	leaq	OPENSSL_ia32cap_P(%rip),%r11
+	movl	0(%r11),%r9d
+	movl	4(%r11),%r10d
+	movl	8(%r11),%r11d
+	andl	$1073741824,%r9d
+	andl	$268435968,%r10d
+	orl	%r9d,%r10d
+	cmpl	$1342177792,%r10d
+	je	.Lavx_shortcut
+	movq	%rsp,%rax
+.cfi_def_cfa_register	%rax
+	pushq	%rbx
+.cfi_offset	%rbx,-16
+	pushq	%rbp
+.cfi_offset	%rbp,-24
+	pushq	%r12
+.cfi_offset	%r12,-32
+	pushq	%r13
+.cfi_offset	%r13,-40
+	pushq	%r14
+.cfi_offset	%r14,-48
+	pushq	%r15
+.cfi_offset	%r15,-56
+	shlq	$4,%rdx
+	subq	$128+32,%rsp
+	leaq	(%rsi,%rdx,8),%rdx
+	andq	$-64,%rsp
+	movq	%rdi,128+0(%rsp)
+	movq	%rsi,128+8(%rsp)
+	movq	%rdx,128+16(%rsp)
+	movq	%rax,152(%rsp)
+.cfi_escape	0x0f,0x06,0x77,0x98,0x01,0x06,0x23,0x08
+.Lprologue:
+
+	movq	0(%rdi),%rax
+	movq	8(%rdi),%rbx
+	movq	16(%rdi),%rcx
+	movq	24(%rdi),%rdx
+	movq	32(%rdi),%r8
+	movq	40(%rdi),%r9
+	movq	48(%rdi),%r10
+	movq	56(%rdi),%r11
+	jmp	.Lloop
+
+.align	16
+.Lloop:
+	movq	%rbx,%rdi
+	leaq	K512(%rip),%rbp
+	xorq	%rcx,%rdi
+	movq	0(%rsi),%r12
+	movq	%r8,%r13
+	movq	%rax,%r14
+	bswapq	%r12
+	rorq	$23,%r13
+	movq	%r9,%r15
+
+	xorq	%r8,%r13
+	rorq	$5,%r14
+	xorq	%r10,%r15
+
+	movq	%r12,0(%rsp)
+	xorq	%rax,%r14
+	andq	%r8,%r15
+
+	rorq	$4,%r13
+	addq	%r11,%r12
+	xorq	%r10,%r15
+
+	rorq	$6,%r14
+	xorq	%r8,%r13
+	addq	%r15,%r12
+
+	movq	%rax,%r15
+	addq	(%rbp),%r12
+	xorq	%rax,%r14
+
+	xorq	%rbx,%r15
+	rorq	$14,%r13
+	movq	%rbx,%r11
+
+	andq	%r15,%rdi
+	rorq	$28,%r14
+	addq	%r13,%r12
+
+	xorq	%rdi,%r11
+	addq	%r12,%rdx
+	addq	%r12,%r11
+
+	leaq	8(%rbp),%rbp
+	addq	%r14,%r11
+	movq	8(%rsi),%r12
+	movq	%rdx,%r13
+	movq	%r11,%r14
+	bswapq	%r12
+	rorq	$23,%r13
+	movq	%r8,%rdi
+
+	xorq	%rdx,%r13
+	rorq	$5,%r14
+	xorq	%r9,%rdi
+
+	movq	%r12,8(%rsp)
+	xorq	%r11,%r14
+	andq	%rdx,%rdi
+
+	rorq	$4,%r13
+	addq	%r10,%r12
+	xorq	%r9,%rdi
+
+	rorq	$6,%r14
+	xorq	%rdx,%r13
+	addq	%rdi,%r12
+
+	movq	%r11,%rdi
+	addq	(%rbp),%r12
+	xorq	%r11,%r14
+
+	xorq	%rax,%rdi
+	rorq	$14,%r13
+	movq	%rax,%r10
+
+	andq	%rdi,%r15
+	rorq	$28,%r14
+	addq	%r13,%r12
+
+	xorq	%r15,%r10
+	addq	%r12,%rcx
+	addq	%r12,%r10
+
+	leaq	24(%rbp),%rbp
+	addq	%r14,%r10
+	movq	16(%rsi),%r12
+	movq	%rcx,%r13
+	movq	%r10,%r14
+	bswapq	%r12
+	rorq	$23,%r13
+	movq	%rdx,%r15
+
+	xorq	%rcx,%r13
+	rorq	$5,%r14
+	xorq	%r8,%r15
+
+	movq	%r12,16(%rsp)
+	xorq	%r10,%r14
+	andq	%rcx,%r15
+
+	rorq	$4,%r13
+	addq	%r9,%r12
+	xorq	%r8,%r15
+
+	rorq	$6,%r14
+	xorq	%rcx,%r13
+	addq	%r15,%r12
+
+	movq	%r10,%r15
+	addq	(%rbp),%r12
+	xorq	%r10,%r14
+
+	xorq	%r11,%r15
+	rorq	$14,%r13
+	movq	%r11,%r9
+
+	andq	%r15,%rdi
+	rorq	$28,%r14
+	addq	%r13,%r12
+
+	xorq	%rdi,%r9
+	addq	%r12,%rbx
+	addq	%r12,%r9
+
+	leaq	8(%rbp),%rbp
+	addq	%r14,%r9
+	movq	24(%rsi),%r12
+	movq	%rbx,%r13
+	movq	%r9,%r14
+	bswapq	%r12
+	rorq	$23,%r13
+	movq	%rcx,%rdi
+
+	xorq	%rbx,%r13
+	rorq	$5,%r14
+	xorq	%rdx,%rdi
+
+	movq	%r12,24(%rsp)
+	xorq	%r9,%r14
+	andq	%rbx,%rdi
+
+	rorq	$4,%r13
+	addq	%r8,%r12
+	xorq	%rdx,%rdi
+
+	rorq	$6,%r14
+	xorq	%rbx,%r13
+	addq	%rdi,%r12
+
+	movq	%r9,%rdi
+	addq	(%rbp),%r12
+	xorq	%r9,%r14
+
+	xorq	%r10,%rdi
+	rorq	$14,%r13
+	movq	%r10,%r8
+
+	andq	%rdi,%r15
+	rorq	$28,%r14
+	addq	%r13,%r12
+
+	xorq	%r15,%r8
+	addq	%r12,%rax
+	addq	%r12,%r8
+
+	leaq	24(%rbp),%rbp
+	addq	%r14,%r8
+	movq	32(%rsi),%r12
+	movq	%rax,%r13
+	movq	%r8,%r14
+	bswapq	%r12
+	rorq	$23,%r13
+	movq	%rbx,%r15
+
+	xorq	%rax,%r13
+	rorq	$5,%r14
+	xorq	%rcx,%r15
+
+	movq	%r12,32(%rsp)
+	xorq	%r8,%r14
+	andq	%rax,%r15
+
+	rorq	$4,%r13
+	addq	%rdx,%r12
+	xorq	%rcx,%r15
+
+	rorq	$6,%r14
+	xorq	%rax,%r13
+	addq	%r15,%r12
+
+	movq	%r8,%r15
+	addq	(%rbp),%r12
+	xorq	%r8,%r14
+
+	xorq	%r9,%r15
+	rorq	$14,%r13
+	movq	%r9,%rdx
+
+	andq	%r15,%rdi
+	rorq	$28,%r14
+	addq	%r13,%r12
+
+	xorq	%rdi,%rdx
+	addq	%r12,%r11
+	addq	%r12,%rdx
+
+	leaq	8(%rbp),%rbp
+	addq	%r14,%rdx
+	movq	40(%rsi),%r12
+	movq	%r11,%r13
+	movq	%rdx,%r14
+	bswapq	%r12
+	rorq	$23,%r13
+	movq	%rax,%rdi
+
+	xorq	%r11,%r13
+	rorq	$5,%r14
+	xorq	%rbx,%rdi
+
+	movq	%r12,40(%rsp)
+	xorq	%rdx,%r14
+	andq	%r11,%rdi
+
+	rorq	$4,%r13
+	addq	%rcx,%r12
+	xorq	%rbx,%rdi
+
+	rorq	$6,%r14
+	xorq	%r11,%r13
+	addq	%rdi,%r12
+
+	movq	%rdx,%rdi
+	addq	(%rbp),%r12
+	xorq	%rdx,%r14
+
+	xorq	%r8,%rdi
+	rorq	$14,%r13
+	movq	%r8,%rcx
+
+	andq	%rdi,%r15
+	rorq	$28,%r14
+	addq	%r13,%r12
+
+	xorq	%r15,%rcx
+	addq	%r12,%r10
+	addq	%r12,%rcx
+
+	leaq	24(%rbp),%rbp
+	addq	%r14,%rcx
+	movq	48(%rsi),%r12
+	movq	%r10,%r13
+	movq	%rcx,%r14
+	bswapq	%r12
+	rorq	$23,%r13
+	movq	%r11,%r15
+
+	xorq	%r10,%r13
+	rorq	$5,%r14
+	xorq	%rax,%r15
+
+	movq	%r12,48(%rsp)
+	xorq	%rcx,%r14
+	andq	%r10,%r15
+
+	rorq	$4,%r13
+	addq	%rbx,%r12
+	xorq	%rax,%r15
+
+	rorq	$6,%r14
+	xorq	%r10,%r13
+	addq	%r15,%r12
+
+	movq	%rcx,%r15
+	addq	(%rbp),%r12
+	xorq	%rcx,%r14
+
+	xorq	%rdx,%r15
+	rorq	$14,%r13
+	movq	%rdx,%rbx
+
+	andq	%r15,%rdi
+	rorq	$28,%r14
+	addq	%r13,%r12
+
+	xorq	%rdi,%rbx
+	addq	%r12,%r9
+	addq	%r12,%rbx
+
+	leaq	8(%rbp),%rbp
+	addq	%r14,%rbx
+	movq	56(%rsi),%r12
+	movq	%r9,%r13
+	movq	%rbx,%r14
+	bswapq	%r12
+	rorq	$23,%r13
+	movq	%r10,%rdi
+
+	xorq	%r9,%r13
+	rorq	$5,%r14
+	xorq	%r11,%rdi
+
+	movq	%r12,56(%rsp)
+	xorq	%rbx,%r14
+	andq	%r9,%rdi
+
+	rorq	$4,%r13
+	addq	%rax,%r12
+	xorq	%r11,%rdi
+
+	rorq	$6,%r14
+	xorq	%r9,%r13
+	addq	%rdi,%r12
+
+	movq	%rbx,%rdi
+	addq	(%rbp),%r12
+	xorq	%rbx,%r14
+
+	xorq	%rcx,%rdi
+	rorq	$14,%r13
+	movq	%rcx,%rax
+
+	andq	%rdi,%r15
+	rorq	$28,%r14
+	addq	%r13,%r12
+
+	xorq	%r15,%rax
+	addq	%r12,%r8
+	addq	%r12,%rax
+
+	leaq	24(%rbp),%rbp
+	addq	%r14,%rax
+	movq	64(%rsi),%r12
+	movq	%r8,%r13
+	movq	%rax,%r14
+	bswapq	%r12
+	rorq	$23,%r13
+	movq	%r9,%r15
+
+	xorq	%r8,%r13
+	rorq	$5,%r14
+	xorq	%r10,%r15
+
+	movq	%r12,64(%rsp)
+	xorq	%rax,%r14
+	andq	%r8,%r15
+
+	rorq	$4,%r13
+	addq	%r11,%r12
+	xorq	%r10,%r15
+
+	rorq	$6,%r14
+	xorq	%r8,%r13
+	addq	%r15,%r12
+
+	movq	%rax,%r15
+	addq	(%rbp),%r12
+	xorq	%rax,%r14
+
+	xorq	%rbx,%r15
+	rorq	$14,%r13
+	movq	%rbx,%r11
+
+	andq	%r15,%rdi
+	rorq	$28,%r14
+	addq	%r13,%r12
+
+	xorq	%rdi,%r11
+	addq	%r12,%rdx
+	addq	%r12,%r11
+
+	leaq	8(%rbp),%rbp
+	addq	%r14,%r11
+	movq	72(%rsi),%r12
+	movq	%rdx,%r13
+	movq	%r11,%r14
+	bswapq	%r12
+	rorq	$23,%r13
+	movq	%r8,%rdi
+
+	xorq	%rdx,%r13
+	rorq	$5,%r14
+	xorq	%r9,%rdi
+
+	movq	%r12,72(%rsp)
+	xorq	%r11,%r14
+	andq	%rdx,%rdi
+
+	rorq	$4,%r13
+	addq	%r10,%r12
+	xorq	%r9,%rdi
+
+	rorq	$6,%r14
+	xorq	%rdx,%r13
+	addq	%rdi,%r12
+
+	movq	%r11,%rdi
+	addq	(%rbp),%r12
+	xorq	%r11,%r14
+
+	xorq	%rax,%rdi
+	rorq	$14,%r13
+	movq	%rax,%r10
+
+	andq	%rdi,%r15
+	rorq	$28,%r14
+	addq	%r13,%r12
+
+	xorq	%r15,%r10
+	addq	%r12,%rcx
+	addq	%r12,%r10
+
+	leaq	24(%rbp),%rbp
+	addq	%r14,%r10
+	movq	80(%rsi),%r12
+	movq	%rcx,%r13
+	movq	%r10,%r14
+	bswapq	%r12
+	rorq	$23,%r13
+	movq	%rdx,%r15
+
+	xorq	%rcx,%r13
+	rorq	$5,%r14
+	xorq	%r8,%r15
+
+	movq	%r12,80(%rsp)
+	xorq	%r10,%r14
+	andq	%rcx,%r15
+
+	rorq	$4,%r13
+	addq	%r9,%r12
+	xorq	%r8,%r15
+
+	rorq	$6,%r14
+	xorq	%rcx,%r13
+	addq	%r15,%r12
+
+	movq	%r10,%r15
+	addq	(%rbp),%r12
+	xorq	%r10,%r14
+
+	xorq	%r11,%r15
+	rorq	$14,%r13
+	movq	%r11,%r9
+
+	andq	%r15,%rdi
+	rorq	$28,%r14
+	addq	%r13,%r12
+
+	xorq	%rdi,%r9
+	addq	%r12,%rbx
+	addq	%r12,%r9
+
+	leaq	8(%rbp),%rbp
+	addq	%r14,%r9
+	movq	88(%rsi),%r12
+	movq	%rbx,%r13
+	movq	%r9,%r14
+	bswapq	%r12
+	rorq	$23,%r13
+	movq	%rcx,%rdi
+
+	xorq	%rbx,%r13
+	rorq	$5,%r14
+	xorq	%rdx,%rdi
+
+	movq	%r12,88(%rsp)
+	xorq	%r9,%r14
+	andq	%rbx,%rdi
+
+	rorq	$4,%r13
+	addq	%r8,%r12
+	xorq	%rdx,%rdi
+
+	rorq	$6,%r14
+	xorq	%rbx,%r13
+	addq	%rdi,%r12
+
+	movq	%r9,%rdi
+	addq	(%rbp),%r12
+	xorq	%r9,%r14
+
+	xorq	%r10,%rdi
+	rorq	$14,%r13
+	movq	%r10,%r8
+
+	andq	%rdi,%r15
+	rorq	$28,%r14
+	addq	%r13,%r12
+
+	xorq	%r15,%r8
+	addq	%r12,%rax
+	addq	%r12,%r8
+
+	leaq	24(%rbp),%rbp
+	addq	%r14,%r8
+	movq	96(%rsi),%r12
+	movq	%rax,%r13
+	movq	%r8,%r14
+	bswapq	%r12
+	rorq	$23,%r13
+	movq	%rbx,%r15
+
+	xorq	%rax,%r13
+	rorq	$5,%r14
+	xorq	%rcx,%r15
+
+	movq	%r12,96(%rsp)
+	xorq	%r8,%r14
+	andq	%rax,%r15
+
+	rorq	$4,%r13
+	addq	%rdx,%r12
+	xorq	%rcx,%r15
+
+	rorq	$6,%r14
+	xorq	%rax,%r13
+	addq	%r15,%r12
+
+	movq	%r8,%r15
+	addq	(%rbp),%r12
+	xorq	%r8,%r14
+
+	xorq	%r9,%r15
+	rorq	$14,%r13
+	movq	%r9,%rdx
+
+	andq	%r15,%rdi
+	rorq	$28,%r14
+	addq	%r13,%r12
+
+	xorq	%rdi,%rdx
+	addq	%r12,%r11
+	addq	%r12,%rdx
+
+	leaq	8(%rbp),%rbp
+	addq	%r14,%rdx
+	movq	104(%rsi),%r12
+	movq	%r11,%r13
+	movq	%rdx,%r14
+	bswapq	%r12
+	rorq	$23,%r13
+	movq	%rax,%rdi
+
+	xorq	%r11,%r13
+	rorq	$5,%r14
+	xorq	%rbx,%rdi
+
+	movq	%r12,104(%rsp)
+	xorq	%rdx,%r14
+	andq	%r11,%rdi
+
+	rorq	$4,%r13
+	addq	%rcx,%r12
+	xorq	%rbx,%rdi
+
+	rorq	$6,%r14
+	xorq	%r11,%r13
+	addq	%rdi,%r12
+
+	movq	%rdx,%rdi
+	addq	(%rbp),%r12
+	xorq	%rdx,%r14
+
+	xorq	%r8,%rdi
+	rorq	$14,%r13
+	movq	%r8,%rcx
+
+	andq	%rdi,%r15
+	rorq	$28,%r14
+	addq	%r13,%r12
+
+	xorq	%r15,%rcx
+	addq	%r12,%r10
+	addq	%r12,%rcx
+
+	leaq	24(%rbp),%rbp
+	addq	%r14,%rcx
+	movq	112(%rsi),%r12
+	movq	%r10,%r13
+	movq	%rcx,%r14
+	bswapq	%r12
+	rorq	$23,%r13
+	movq	%r11,%r15
+
+	xorq	%r10,%r13
+	rorq	$5,%r14
+	xorq	%rax,%r15
+
+	movq	%r12,112(%rsp)
+	xorq	%rcx,%r14
+	andq	%r10,%r15
+
+	rorq	$4,%r13
+	addq	%rbx,%r12
+	xorq	%rax,%r15
+
+	rorq	$6,%r14
+	xorq	%r10,%r13
+	addq	%r15,%r12
+
+	movq	%rcx,%r15
+	addq	(%rbp),%r12
+	xorq	%rcx,%r14
+
+	xorq	%rdx,%r15
+	rorq	$14,%r13
+	movq	%rdx,%rbx
+
+	andq	%r15,%rdi
+	rorq	$28,%r14
+	addq	%r13,%r12
+
+	xorq	%rdi,%rbx
+	addq	%r12,%r9
+	addq	%r12,%rbx
+
+	leaq	8(%rbp),%rbp
+	addq	%r14,%rbx
+	movq	120(%rsi),%r12
+	movq	%r9,%r13
+	movq	%rbx,%r14
+	bswapq	%r12
+	rorq	$23,%r13
+	movq	%r10,%rdi
+
+	xorq	%r9,%r13
+	rorq	$5,%r14
+	xorq	%r11,%rdi
+
+	movq	%r12,120(%rsp)
+	xorq	%rbx,%r14
+	andq	%r9,%rdi
+
+	rorq	$4,%r13
+	addq	%rax,%r12
+	xorq	%r11,%rdi
+
+	rorq	$6,%r14
+	xorq	%r9,%r13
+	addq	%rdi,%r12
+
+	movq	%rbx,%rdi
+	addq	(%rbp),%r12
+	xorq	%rbx,%r14
+
+	xorq	%rcx,%rdi
+	rorq	$14,%r13
+	movq	%rcx,%rax
+
+	andq	%rdi,%r15
+	rorq	$28,%r14
+	addq	%r13,%r12
+
+	xorq	%r15,%rax
+	addq	%r12,%r8
+	addq	%r12,%rax
+
+	leaq	24(%rbp),%rbp
+	jmp	.Lrounds_16_xx
+.align	16
+.Lrounds_16_xx:
+	movq	8(%rsp),%r13
+	movq	112(%rsp),%r15
+
+	movq	%r13,%r12
+	rorq	$7,%r13
+	addq	%r14,%rax
+	movq	%r15,%r14
+	rorq	$42,%r15
+
+	xorq	%r12,%r13
+	shrq	$7,%r12
+	rorq	$1,%r13
+	xorq	%r14,%r15
+	shrq	$6,%r14
+
+	rorq	$19,%r15
+	xorq	%r13,%r12
+	xorq	%r14,%r15
+	addq	72(%rsp),%r12
+
+	addq	0(%rsp),%r12
+	movq	%r8,%r13
+	addq	%r15,%r12
+	movq	%rax,%r14
+	rorq	$23,%r13
+	movq	%r9,%r15
+
+	xorq	%r8,%r13
+	rorq	$5,%r14
+	xorq	%r10,%r15
+
+	movq	%r12,0(%rsp)
+	xorq	%rax,%r14
+	andq	%r8,%r15
+
+	rorq	$4,%r13
+	addq	%r11,%r12
+	xorq	%r10,%r15
+
+	rorq	$6,%r14
+	xorq	%r8,%r13
+	addq	%r15,%r12
+
+	movq	%rax,%r15
+	addq	(%rbp),%r12
+	xorq	%rax,%r14
+
+	xorq	%rbx,%r15
+	rorq	$14,%r13
+	movq	%rbx,%r11
+
+	andq	%r15,%rdi
+	rorq	$28,%r14
+	addq	%r13,%r12
+
+	xorq	%rdi,%r11
+	addq	%r12,%rdx
+	addq	%r12,%r11
+
+	leaq	8(%rbp),%rbp
+	movq	16(%rsp),%r13
+	movq	120(%rsp),%rdi
+
+	movq	%r13,%r12
+	rorq	$7,%r13
+	addq	%r14,%r11
+	movq	%rdi,%r14
+	rorq	$42,%rdi
+
+	xorq	%r12,%r13
+	shrq	$7,%r12
+	rorq	$1,%r13
+	xorq	%r14,%rdi
+	shrq	$6,%r14
+
+	rorq	$19,%rdi
+	xorq	%r13,%r12
+	xorq	%r14,%rdi
+	addq	80(%rsp),%r12
+
+	addq	8(%rsp),%r12
+	movq	%rdx,%r13
+	addq	%rdi,%r12
+	movq	%r11,%r14
+	rorq	$23,%r13
+	movq	%r8,%rdi
+
+	xorq	%rdx,%r13
+	rorq	$5,%r14
+	xorq	%r9,%rdi
+
+	movq	%r12,8(%rsp)
+	xorq	%r11,%r14
+	andq	%rdx,%rdi
+
+	rorq	$4,%r13
+	addq	%r10,%r12
+	xorq	%r9,%rdi
+
+	rorq	$6,%r14
+	xorq	%rdx,%r13
+	addq	%rdi,%r12
+
+	movq	%r11,%rdi
+	addq	(%rbp),%r12
+	xorq	%r11,%r14
+
+	xorq	%rax,%rdi
+	rorq	$14,%r13
+	movq	%rax,%r10
+
+	andq	%rdi,%r15
+	rorq	$28,%r14
+	addq	%r13,%r12
+
+	xorq	%r15,%r10
+	addq	%r12,%rcx
+	addq	%r12,%r10
+
+	leaq	24(%rbp),%rbp
+	movq	24(%rsp),%r13
+	movq	0(%rsp),%r15
+
+	movq	%r13,%r12
+	rorq	$7,%r13
+	addq	%r14,%r10
+	movq	%r15,%r14
+	rorq	$42,%r15
+
+	xorq	%r12,%r13
+	shrq	$7,%r12
+	rorq	$1,%r13
+	xorq	%r14,%r15
+	shrq	$6,%r14
+
+	rorq	$19,%r15
+	xorq	%r13,%r12
+	xorq	%r14,%r15
+	addq	88(%rsp),%r12
+
+	addq	16(%rsp),%r12
+	movq	%rcx,%r13
+	addq	%r15,%r12
+	movq	%r10,%r14
+	rorq	$23,%r13
+	movq	%rdx,%r15
+
+	xorq	%rcx,%r13
+	rorq	$5,%r14
+	xorq	%r8,%r15
+
+	movq	%r12,16(%rsp)
+	xorq	%r10,%r14
+	andq	%rcx,%r15
+
+	rorq	$4,%r13
+	addq	%r9,%r12
+	xorq	%r8,%r15
+
+	rorq	$6,%r14
+	xorq	%rcx,%r13
+	addq	%r15,%r12
+
+	movq	%r10,%r15
+	addq	(%rbp),%r12
+	xorq	%r10,%r14
+
+	xorq	%r11,%r15
+	rorq	$14,%r13
+	movq	%r11,%r9
+
+	andq	%r15,%rdi
+	rorq	$28,%r14
+	addq	%r13,%r12
+
+	xorq	%rdi,%r9
+	addq	%r12,%rbx
+	addq	%r12,%r9
+
+	leaq	8(%rbp),%rbp
+	movq	32(%rsp),%r13
+	movq	8(%rsp),%rdi
+
+	movq	%r13,%r12
+	rorq	$7,%r13
+	addq	%r14,%r9
+	movq	%rdi,%r14
+	rorq	$42,%rdi
+
+	xorq	%r12,%r13
+	shrq	$7,%r12
+	rorq	$1,%r13
+	xorq	%r14,%rdi
+	shrq	$6,%r14
+
+	rorq	$19,%rdi
+	xorq	%r13,%r12
+	xorq	%r14,%rdi
+	addq	96(%rsp),%r12
+
+	addq	24(%rsp),%r12
+	movq	%rbx,%r13
+	addq	%rdi,%r12
+	movq	%r9,%r14
+	rorq	$23,%r13
+	movq	%rcx,%rdi
+
+	xorq	%rbx,%r13
+	rorq	$5,%r14
+	xorq	%rdx,%rdi
+
+	movq	%r12,24(%rsp)
+	xorq	%r9,%r14
+	andq	%rbx,%rdi
+
+	rorq	$4,%r13
+	addq	%r8,%r12
+	xorq	%rdx,%rdi
+
+	rorq	$6,%r14
+	xorq	%rbx,%r13
+	addq	%rdi,%r12
+
+	movq	%r9,%rdi
+	addq	(%rbp),%r12
+	xorq	%r9,%r14
+
+	xorq	%r10,%rdi
+	rorq	$14,%r13
+	movq	%r10,%r8
+
+	andq	%rdi,%r15
+	rorq	$28,%r14
+	addq	%r13,%r12
+
+	xorq	%r15,%r8
+	addq	%r12,%rax
+	addq	%r12,%r8
+
+	leaq	24(%rbp),%rbp
+	movq	40(%rsp),%r13
+	movq	16(%rsp),%r15
+
+	movq	%r13,%r12
+	rorq	$7,%r13
+	addq	%r14,%r8
+	movq	%r15,%r14
+	rorq	$42,%r15
+
+	xorq	%r12,%r13
+	shrq	$7,%r12
+	rorq	$1,%r13
+	xorq	%r14,%r15
+	shrq	$6,%r14
+
+	rorq	$19,%r15
+	xorq	%r13,%r12
+	xorq	%r14,%r15
+	addq	104(%rsp),%r12
+
+	addq	32(%rsp),%r12
+	movq	%rax,%r13
+	addq	%r15,%r12
+	movq	%r8,%r14
+	rorq	$23,%r13
+	movq	%rbx,%r15
+
+	xorq	%rax,%r13
+	rorq	$5,%r14
+	xorq	%rcx,%r15
+
+	movq	%r12,32(%rsp)
+	xorq	%r8,%r14
+	andq	%rax,%r15
+
+	rorq	$4,%r13
+	addq	%rdx,%r12
+	xorq	%rcx,%r15
+
+	rorq	$6,%r14
+	xorq	%rax,%r13
+	addq	%r15,%r12
+
+	movq	%r8,%r15
+	addq	(%rbp),%r12
+	xorq	%r8,%r14
+
+	xorq	%r9,%r15
+	rorq	$14,%r13
+	movq	%r9,%rdx
+
+	andq	%r15,%rdi
+	rorq	$28,%r14
+	addq	%r13,%r12
+
+	xorq	%rdi,%rdx
+	addq	%r12,%r11
+	addq	%r12,%rdx
+
+	leaq	8(%rbp),%rbp
+	movq	48(%rsp),%r13
+	movq	24(%rsp),%rdi
+
+	movq	%r13,%r12
+	rorq	$7,%r13
+	addq	%r14,%rdx
+	movq	%rdi,%r14
+	rorq	$42,%rdi
+
+	xorq	%r12,%r13
+	shrq	$7,%r12
+	rorq	$1,%r13
+	xorq	%r14,%rdi
+	shrq	$6,%r14
+
+	rorq	$19,%rdi
+	xorq	%r13,%r12
+	xorq	%r14,%rdi
+	addq	112(%rsp),%r12
+
+	addq	40(%rsp),%r12
+	movq	%r11,%r13
+	addq	%rdi,%r12
+	movq	%rdx,%r14
+	rorq	$23,%r13
+	movq	%rax,%rdi
+
+	xorq	%r11,%r13
+	rorq	$5,%r14
+	xorq	%rbx,%rdi
+
+	movq	%r12,40(%rsp)
+	xorq	%rdx,%r14
+	andq	%r11,%rdi
+
+	rorq	$4,%r13
+	addq	%rcx,%r12
+	xorq	%rbx,%rdi
+
+	rorq	$6,%r14
+	xorq	%r11,%r13
+	addq	%rdi,%r12
+
+	movq	%rdx,%rdi
+	addq	(%rbp),%r12
+	xorq	%rdx,%r14
+
+	xorq	%r8,%rdi
+	rorq	$14,%r13
+	movq	%r8,%rcx
+
+	andq	%rdi,%r15
+	rorq	$28,%r14
+	addq	%r13,%r12
+
+	xorq	%r15,%rcx
+	addq	%r12,%r10
+	addq	%r12,%rcx
+
+	leaq	24(%rbp),%rbp
+	movq	56(%rsp),%r13
+	movq	32(%rsp),%r15
+
+	movq	%r13,%r12
+	rorq	$7,%r13
+	addq	%r14,%rcx
+	movq	%r15,%r14
+	rorq	$42,%r15
+
+	xorq	%r12,%r13
+	shrq	$7,%r12
+	rorq	$1,%r13
+	xorq	%r14,%r15
+	shrq	$6,%r14
+
+	rorq	$19,%r15
+	xorq	%r13,%r12
+	xorq	%r14,%r15
+	addq	120(%rsp),%r12
+
+	addq	48(%rsp),%r12
+	movq	%r10,%r13
+	addq	%r15,%r12
+	movq	%rcx,%r14
+	rorq	$23,%r13
+	movq	%r11,%r15
+
+	xorq	%r10,%r13
+	rorq	$5,%r14
+	xorq	%rax,%r15
+
+	movq	%r12,48(%rsp)
+	xorq	%rcx,%r14
+	andq	%r10,%r15
+
+	rorq	$4,%r13
+	addq	%rbx,%r12
+	xorq	%rax,%r15
+
+	rorq	$6,%r14
+	xorq	%r10,%r13
+	addq	%r15,%r12
+
+	movq	%rcx,%r15
+	addq	(%rbp),%r12
+	xorq	%rcx,%r14
+
+	xorq	%rdx,%r15
+	rorq	$14,%r13
+	movq	%rdx,%rbx
+
+	andq	%r15,%rdi
+	rorq	$28,%r14
+	addq	%r13,%r12
+
+	xorq	%rdi,%rbx
+	addq	%r12,%r9
+	addq	%r12,%rbx
+
+	leaq	8(%rbp),%rbp
+	movq	64(%rsp),%r13
+	movq	40(%rsp),%rdi
+
+	movq	%r13,%r12
+	rorq	$7,%r13
+	addq	%r14,%rbx
+	movq	%rdi,%r14
+	rorq	$42,%rdi
+
+	xorq	%r12,%r13
+	shrq	$7,%r12
+	rorq	$1,%r13
+	xorq	%r14,%rdi
+	shrq	$6,%r14
+
+	rorq	$19,%rdi
+	xorq	%r13,%r12
+	xorq	%r14,%rdi
+	addq	0(%rsp),%r12
+
+	addq	56(%rsp),%r12
+	movq	%r9,%r13
+	addq	%rdi,%r12
+	movq	%rbx,%r14
+	rorq	$23,%r13
+	movq	%r10,%rdi
+
+	xorq	%r9,%r13
+	rorq	$5,%r14
+	xorq	%r11,%rdi
+
+	movq	%r12,56(%rsp)
+	xorq	%rbx,%r14
+	andq	%r9,%rdi
+
+	rorq	$4,%r13
+	addq	%rax,%r12
+	xorq	%r11,%rdi
+
+	rorq	$6,%r14
+	xorq	%r9,%r13
+	addq	%rdi,%r12
+
+	movq	%rbx,%rdi
+	addq	(%rbp),%r12
+	xorq	%rbx,%r14
+
+	xorq	%rcx,%rdi
+	rorq	$14,%r13
+	movq	%rcx,%rax
+
+	andq	%rdi,%r15
+	rorq	$28,%r14
+	addq	%r13,%r12
+
+	xorq	%r15,%rax
+	addq	%r12,%r8
+	addq	%r12,%rax
+
+	leaq	24(%rbp),%rbp
+	movq	72(%rsp),%r13
+	movq	48(%rsp),%r15
+
+	movq	%r13,%r12
+	rorq	$7,%r13
+	addq	%r14,%rax
+	movq	%r15,%r14
+	rorq	$42,%r15
+
+	xorq	%r12,%r13
+	shrq	$7,%r12
+	rorq	$1,%r13
+	xorq	%r14,%r15
+	shrq	$6,%r14
+
+	rorq	$19,%r15
+	xorq	%r13,%r12
+	xorq	%r14,%r15
+	addq	8(%rsp),%r12
+
+	addq	64(%rsp),%r12
+	movq	%r8,%r13
+	addq	%r15,%r12
+	movq	%rax,%r14
+	rorq	$23,%r13
+	movq	%r9,%r15
+
+	xorq	%r8,%r13
+	rorq	$5,%r14
+	xorq	%r10,%r15
+
+	movq	%r12,64(%rsp)
+	xorq	%rax,%r14
+	andq	%r8,%r15
+
+	rorq	$4,%r13
+	addq	%r11,%r12
+	xorq	%r10,%r15
+
+	rorq	$6,%r14
+	xorq	%r8,%r13
+	addq	%r15,%r12
+
+	movq	%rax,%r15
+	addq	(%rbp),%r12
+	xorq	%rax,%r14
+
+	xorq	%rbx,%r15
+	rorq	$14,%r13
+	movq	%rbx,%r11
+
+	andq	%r15,%rdi
+	rorq	$28,%r14
+	addq	%r13,%r12
+
+	xorq	%rdi,%r11
+	addq	%r12,%rdx
+	addq	%r12,%r11
+
+	leaq	8(%rbp),%rbp
+	movq	80(%rsp),%r13
+	movq	56(%rsp),%rdi
+
+	movq	%r13,%r12
+	rorq	$7,%r13
+	addq	%r14,%r11
+	movq	%rdi,%r14
+	rorq	$42,%rdi
+
+	xorq	%r12,%r13
+	shrq	$7,%r12
+	rorq	$1,%r13
+	xorq	%r14,%rdi
+	shrq	$6,%r14
+
+	rorq	$19,%rdi
+	xorq	%r13,%r12
+	xorq	%r14,%rdi
+	addq	16(%rsp),%r12
+
+	addq	72(%rsp),%r12
+	movq	%rdx,%r13
+	addq	%rdi,%r12
+	movq	%r11,%r14
+	rorq	$23,%r13
+	movq	%r8,%rdi
+
+	xorq	%rdx,%r13
+	rorq	$5,%r14
+	xorq	%r9,%rdi
+
+	movq	%r12,72(%rsp)
+	xorq	%r11,%r14
+	andq	%rdx,%rdi
+
+	rorq	$4,%r13
+	addq	%r10,%r12
+	xorq	%r9,%rdi
+
+	rorq	$6,%r14
+	xorq	%rdx,%r13
+	addq	%rdi,%r12
+
+	movq	%r11,%rdi
+	addq	(%rbp),%r12
+	xorq	%r11,%r14
+
+	xorq	%rax,%rdi
+	rorq	$14,%r13
+	movq	%rax,%r10
+
+	andq	%rdi,%r15
+	rorq	$28,%r14
+	addq	%r13,%r12
+
+	xorq	%r15,%r10
+	addq	%r12,%rcx
+	addq	%r12,%r10
+
+	leaq	24(%rbp),%rbp
+	movq	88(%rsp),%r13
+	movq	64(%rsp),%r15
+
+	movq	%r13,%r12
+	rorq	$7,%r13
+	addq	%r14,%r10
+	movq	%r15,%r14
+	rorq	$42,%r15
+
+	xorq	%r12,%r13
+	shrq	$7,%r12
+	rorq	$1,%r13
+	xorq	%r14,%r15
+	shrq	$6,%r14
+
+	rorq	$19,%r15
+	xorq	%r13,%r12
+	xorq	%r14,%r15
+	addq	24(%rsp),%r12
+
+	addq	80(%rsp),%r12
+	movq	%rcx,%r13
+	addq	%r15,%r12
+	movq	%r10,%r14
+	rorq	$23,%r13
+	movq	%rdx,%r15
+
+	xorq	%rcx,%r13
+	rorq	$5,%r14
+	xorq	%r8,%r15
+
+	movq	%r12,80(%rsp)
+	xorq	%r10,%r14
+	andq	%rcx,%r15
+
+	rorq	$4,%r13
+	addq	%r9,%r12
+	xorq	%r8,%r15
+
+	rorq	$6,%r14
+	xorq	%rcx,%r13
+	addq	%r15,%r12
+
+	movq	%r10,%r15
+	addq	(%rbp),%r12
+	xorq	%r10,%r14
+
+	xorq	%r11,%r15
+	rorq	$14,%r13
+	movq	%r11,%r9
+
+	andq	%r15,%rdi
+	rorq	$28,%r14
+	addq	%r13,%r12
+
+	xorq	%rdi,%r9
+	addq	%r12,%rbx
+	addq	%r12,%r9
+
+	leaq	8(%rbp),%rbp
+	movq	96(%rsp),%r13
+	movq	72(%rsp),%rdi
+
+	movq	%r13,%r12
+	rorq	$7,%r13
+	addq	%r14,%r9
+	movq	%rdi,%r14
+	rorq	$42,%rdi
+
+	xorq	%r12,%r13
+	shrq	$7,%r12
+	rorq	$1,%r13
+	xorq	%r14,%rdi
+	shrq	$6,%r14
+
+	rorq	$19,%rdi
+	xorq	%r13,%r12
+	xorq	%r14,%rdi
+	addq	32(%rsp),%r12
+
+	addq	88(%rsp),%r12
+	movq	%rbx,%r13
+	addq	%rdi,%r12
+	movq	%r9,%r14
+	rorq	$23,%r13
+	movq	%rcx,%rdi
+
+	xorq	%rbx,%r13
+	rorq	$5,%r14
+	xorq	%rdx,%rdi
+
+	movq	%r12,88(%rsp)
+	xorq	%r9,%r14
+	andq	%rbx,%rdi
+
+	rorq	$4,%r13
+	addq	%r8,%r12
+	xorq	%rdx,%rdi
+
+	rorq	$6,%r14
+	xorq	%rbx,%r13
+	addq	%rdi,%r12
+
+	movq	%r9,%rdi
+	addq	(%rbp),%r12
+	xorq	%r9,%r14
+
+	xorq	%r10,%rdi
+	rorq	$14,%r13
+	movq	%r10,%r8
+
+	andq	%rdi,%r15
+	rorq	$28,%r14
+	addq	%r13,%r12
+
+	xorq	%r15,%r8
+	addq	%r12,%rax
+	addq	%r12,%r8
+
+	leaq	24(%rbp),%rbp
+	movq	104(%rsp),%r13
+	movq	80(%rsp),%r15
+
+	movq	%r13,%r12
+	rorq	$7,%r13
+	addq	%r14,%r8
+	movq	%r15,%r14
+	rorq	$42,%r15
+
+	xorq	%r12,%r13
+	shrq	$7,%r12
+	rorq	$1,%r13
+	xorq	%r14,%r15
+	shrq	$6,%r14
+
+	rorq	$19,%r15
+	xorq	%r13,%r12
+	xorq	%r14,%r15
+	addq	40(%rsp),%r12
+
+	addq	96(%rsp),%r12
+	movq	%rax,%r13
+	addq	%r15,%r12
+	movq	%r8,%r14
+	rorq	$23,%r13
+	movq	%rbx,%r15
+
+	xorq	%rax,%r13
+	rorq	$5,%r14
+	xorq	%rcx,%r15
+
+	movq	%r12,96(%rsp)
+	xorq	%r8,%r14
+	andq	%rax,%r15
+
+	rorq	$4,%r13
+	addq	%rdx,%r12
+	xorq	%rcx,%r15
+
+	rorq	$6,%r14
+	xorq	%rax,%r13
+	addq	%r15,%r12
+
+	movq	%r8,%r15
+	addq	(%rbp),%r12
+	xorq	%r8,%r14
+
+	xorq	%r9,%r15
+	rorq	$14,%r13
+	movq	%r9,%rdx
+
+	andq	%r15,%rdi
+	rorq	$28,%r14
+	addq	%r13,%r12
+
+	xorq	%rdi,%rdx
+	addq	%r12,%r11
+	addq	%r12,%rdx
+
+	leaq	8(%rbp),%rbp
+	movq	112(%rsp),%r13
+	movq	88(%rsp),%rdi
+
+	movq	%r13,%r12
+	rorq	$7,%r13
+	addq	%r14,%rdx
+	movq	%rdi,%r14
+	rorq	$42,%rdi
+
+	xorq	%r12,%r13
+	shrq	$7,%r12
+	rorq	$1,%r13
+	xorq	%r14,%rdi
+	shrq	$6,%r14
+
+	rorq	$19,%rdi
+	xorq	%r13,%r12
+	xorq	%r14,%rdi
+	addq	48(%rsp),%r12
+
+	addq	104(%rsp),%r12
+	movq	%r11,%r13
+	addq	%rdi,%r12
+	movq	%rdx,%r14
+	rorq	$23,%r13
+	movq	%rax,%rdi
+
+	xorq	%r11,%r13
+	rorq	$5,%r14
+	xorq	%rbx,%rdi
+
+	movq	%r12,104(%rsp)
+	xorq	%rdx,%r14
+	andq	%r11,%rdi
+
+	rorq	$4,%r13
+	addq	%rcx,%r12
+	xorq	%rbx,%rdi
+
+	rorq	$6,%r14
+	xorq	%r11,%r13
+	addq	%rdi,%r12
+
+	movq	%rdx,%rdi
+	addq	(%rbp),%r12
+	xorq	%rdx,%r14
+
+	xorq	%r8,%rdi
+	rorq	$14,%r13
+	movq	%r8,%rcx
+
+	andq	%rdi,%r15
+	rorq	$28,%r14
+	addq	%r13,%r12
+
+	xorq	%r15,%rcx
+	addq	%r12,%r10
+	addq	%r12,%rcx
+
+	leaq	24(%rbp),%rbp
+	movq	120(%rsp),%r13
+	movq	96(%rsp),%r15
+
+	movq	%r13,%r12
+	rorq	$7,%r13
+	addq	%r14,%rcx
+	movq	%r15,%r14
+	rorq	$42,%r15
+
+	xorq	%r12,%r13
+	shrq	$7,%r12
+	rorq	$1,%r13
+	xorq	%r14,%r15
+	shrq	$6,%r14
+
+	rorq	$19,%r15
+	xorq	%r13,%r12
+	xorq	%r14,%r15
+	addq	56(%rsp),%r12
+
+	addq	112(%rsp),%r12
+	movq	%r10,%r13
+	addq	%r15,%r12
+	movq	%rcx,%r14
+	rorq	$23,%r13
+	movq	%r11,%r15
+
+	xorq	%r10,%r13
+	rorq	$5,%r14
+	xorq	%rax,%r15
+
+	movq	%r12,112(%rsp)
+	xorq	%rcx,%r14
+	andq	%r10,%r15
+
+	rorq	$4,%r13
+	addq	%rbx,%r12
+	xorq	%rax,%r15
+
+	rorq	$6,%r14
+	xorq	%r10,%r13
+	addq	%r15,%r12
+
+	movq	%rcx,%r15
+	addq	(%rbp),%r12
+	xorq	%rcx,%r14
+
+	xorq	%rdx,%r15
+	rorq	$14,%r13
+	movq	%rdx,%rbx
+
+	andq	%r15,%rdi
+	rorq	$28,%r14
+	addq	%r13,%r12
+
+	xorq	%rdi,%rbx
+	addq	%r12,%r9
+	addq	%r12,%rbx
+
+	leaq	8(%rbp),%rbp
+	movq	0(%rsp),%r13
+	movq	104(%rsp),%rdi
+
+	movq	%r13,%r12
+	rorq	$7,%r13
+	addq	%r14,%rbx
+	movq	%rdi,%r14
+	rorq	$42,%rdi
+
+	xorq	%r12,%r13
+	shrq	$7,%r12
+	rorq	$1,%r13
+	xorq	%r14,%rdi
+	shrq	$6,%r14
+
+	rorq	$19,%rdi
+	xorq	%r13,%r12
+	xorq	%r14,%rdi
+	addq	64(%rsp),%r12
+
+	addq	120(%rsp),%r12
+	movq	%r9,%r13
+	addq	%rdi,%r12
+	movq	%rbx,%r14
+	rorq	$23,%r13
+	movq	%r10,%rdi
+
+	xorq	%r9,%r13
+	rorq	$5,%r14
+	xorq	%r11,%rdi
+
+	movq	%r12,120(%rsp)
+	xorq	%rbx,%r14
+	andq	%r9,%rdi
+
+	rorq	$4,%r13
+	addq	%rax,%r12
+	xorq	%r11,%rdi
+
+	rorq	$6,%r14
+	xorq	%r9,%r13
+	addq	%rdi,%r12
+
+	movq	%rbx,%rdi
+	addq	(%rbp),%r12
+	xorq	%rbx,%r14
+
+	xorq	%rcx,%rdi
+	rorq	$14,%r13
+	movq	%rcx,%rax
+
+	andq	%rdi,%r15
+	rorq	$28,%r14
+	addq	%r13,%r12
+
+	xorq	%r15,%rax
+	addq	%r12,%r8
+	addq	%r12,%rax
+
+	leaq	24(%rbp),%rbp
+	cmpb	$0,7(%rbp)
+	jnz	.Lrounds_16_xx
+
+	movq	128+0(%rsp),%rdi
+	addq	%r14,%rax
+	leaq	128(%rsi),%rsi
+
+	addq	0(%rdi),%rax
+	addq	8(%rdi),%rbx
+	addq	16(%rdi),%rcx
+	addq	24(%rdi),%rdx
+	addq	32(%rdi),%r8
+	addq	40(%rdi),%r9
+	addq	48(%rdi),%r10
+	addq	56(%rdi),%r11
+
+	cmpq	128+16(%rsp),%rsi
+
+	movq	%rax,0(%rdi)
+	movq	%rbx,8(%rdi)
+	movq	%rcx,16(%rdi)
+	movq	%rdx,24(%rdi)
+	movq	%r8,32(%rdi)
+	movq	%r9,40(%rdi)
+	movq	%r10,48(%rdi)
+	movq	%r11,56(%rdi)
+	jb	.Lloop
+
+	movq	152(%rsp),%rsi
+.cfi_def_cfa	%rsi,8
+	movq	-48(%rsi),%r15
+.cfi_restore	%r15
+	movq	-40(%rsi),%r14
+.cfi_restore	%r14
+	movq	-32(%rsi),%r13
+.cfi_restore	%r13
+	movq	-24(%rsi),%r12
+.cfi_restore	%r12
+	movq	-16(%rsi),%rbp
+.cfi_restore	%rbp
+	movq	-8(%rsi),%rbx
+.cfi_restore	%rbx
+	leaq	(%rsi),%rsp
+.cfi_def_cfa_register	%rsp
+.Lepilogue:
+	.byte	0xf3,0xc3
+.cfi_endproc	
+.size	sha512_block_data_order,.-sha512_block_data_order
+.align	64
+.type	K512,@object
+K512:
+.quad	0x428a2f98d728ae22,0x7137449123ef65cd
+.quad	0x428a2f98d728ae22,0x7137449123ef65cd
+.quad	0xb5c0fbcfec4d3b2f,0xe9b5dba58189dbbc
+.quad	0xb5c0fbcfec4d3b2f,0xe9b5dba58189dbbc
+.quad	0x3956c25bf348b538,0x59f111f1b605d019
+.quad	0x3956c25bf348b538,0x59f111f1b605d019
+.quad	0x923f82a4af194f9b,0xab1c5ed5da6d8118
+.quad	0x923f82a4af194f9b,0xab1c5ed5da6d8118
+.quad	0xd807aa98a3030242,0x12835b0145706fbe
+.quad	0xd807aa98a3030242,0x12835b0145706fbe
+.quad	0x243185be4ee4b28c,0x550c7dc3d5ffb4e2
+.quad	0x243185be4ee4b28c,0x550c7dc3d5ffb4e2
+.quad	0x72be5d74f27b896f,0x80deb1fe3b1696b1
+.quad	0x72be5d74f27b896f,0x80deb1fe3b1696b1
+.quad	0x9bdc06a725c71235,0xc19bf174cf692694
+.quad	0x9bdc06a725c71235,0xc19bf174cf692694
+.quad	0xe49b69c19ef14ad2,0xefbe4786384f25e3
+.quad	0xe49b69c19ef14ad2,0xefbe4786384f25e3
+.quad	0x0fc19dc68b8cd5b5,0x240ca1cc77ac9c65
+.quad	0x0fc19dc68b8cd5b5,0x240ca1cc77ac9c65
+.quad	0x2de92c6f592b0275,0x4a7484aa6ea6e483
+.quad	0x2de92c6f592b0275,0x4a7484aa6ea6e483
+.quad	0x5cb0a9dcbd41fbd4,0x76f988da831153b5
+.quad	0x5cb0a9dcbd41fbd4,0x76f988da831153b5
+.quad	0x983e5152ee66dfab,0xa831c66d2db43210
+.quad	0x983e5152ee66dfab,0xa831c66d2db43210
+.quad	0xb00327c898fb213f,0xbf597fc7beef0ee4
+.quad	0xb00327c898fb213f,0xbf597fc7beef0ee4
+.quad	0xc6e00bf33da88fc2,0xd5a79147930aa725
+.quad	0xc6e00bf33da88fc2,0xd5a79147930aa725
+.quad	0x06ca6351e003826f,0x142929670a0e6e70
+.quad	0x06ca6351e003826f,0x142929670a0e6e70
+.quad	0x27b70a8546d22ffc,0x2e1b21385c26c926
+.quad	0x27b70a8546d22ffc,0x2e1b21385c26c926
+.quad	0x4d2c6dfc5ac42aed,0x53380d139d95b3df
+.quad	0x4d2c6dfc5ac42aed,0x53380d139d95b3df
+.quad	0x650a73548baf63de,0x766a0abb3c77b2a8
+.quad	0x650a73548baf63de,0x766a0abb3c77b2a8
+.quad	0x81c2c92e47edaee6,0x92722c851482353b
+.quad	0x81c2c92e47edaee6,0x92722c851482353b
+.quad	0xa2bfe8a14cf10364,0xa81a664bbc423001
+.quad	0xa2bfe8a14cf10364,0xa81a664bbc423001
+.quad	0xc24b8b70d0f89791,0xc76c51a30654be30
+.quad	0xc24b8b70d0f89791,0xc76c51a30654be30
+.quad	0xd192e819d6ef5218,0xd69906245565a910
+.quad	0xd192e819d6ef5218,0xd69906245565a910
+.quad	0xf40e35855771202a,0x106aa07032bbd1b8
+.quad	0xf40e35855771202a,0x106aa07032bbd1b8
+.quad	0x19a4c116b8d2d0c8,0x1e376c085141ab53
+.quad	0x19a4c116b8d2d0c8,0x1e376c085141ab53
+.quad	0x2748774cdf8eeb99,0x34b0bcb5e19b48a8
+.quad	0x2748774cdf8eeb99,0x34b0bcb5e19b48a8
+.quad	0x391c0cb3c5c95a63,0x4ed8aa4ae3418acb
+.quad	0x391c0cb3c5c95a63,0x4ed8aa4ae3418acb
+.quad	0x5b9cca4f7763e373,0x682e6ff3d6b2b8a3
+.quad	0x5b9cca4f7763e373,0x682e6ff3d6b2b8a3
+.quad	0x748f82ee5defb2fc,0x78a5636f43172f60
+.quad	0x748f82ee5defb2fc,0x78a5636f43172f60
+.quad	0x84c87814a1f0ab72,0x8cc702081a6439ec
+.quad	0x84c87814a1f0ab72,0x8cc702081a6439ec
+.quad	0x90befffa23631e28,0xa4506cebde82bde9
+.quad	0x90befffa23631e28,0xa4506cebde82bde9
+.quad	0xbef9a3f7b2c67915,0xc67178f2e372532b
+.quad	0xbef9a3f7b2c67915,0xc67178f2e372532b
+.quad	0xca273eceea26619c,0xd186b8c721c0c207
+.quad	0xca273eceea26619c,0xd186b8c721c0c207
+.quad	0xeada7dd6cde0eb1e,0xf57d4f7fee6ed178
+.quad	0xeada7dd6cde0eb1e,0xf57d4f7fee6ed178
+.quad	0x06f067aa72176fba,0x0a637dc5a2c898a6
+.quad	0x06f067aa72176fba,0x0a637dc5a2c898a6
+.quad	0x113f9804bef90dae,0x1b710b35131c471b
+.quad	0x113f9804bef90dae,0x1b710b35131c471b
+.quad	0x28db77f523047d84,0x32caab7b40c72493
+.quad	0x28db77f523047d84,0x32caab7b40c72493
+.quad	0x3c9ebe0a15c9bebc,0x431d67c49c100d4c
+.quad	0x3c9ebe0a15c9bebc,0x431d67c49c100d4c
+.quad	0x4cc5d4becb3e42b6,0x597f299cfc657e2a
+.quad	0x4cc5d4becb3e42b6,0x597f299cfc657e2a
+.quad	0x5fcb6fab3ad6faec,0x6c44198c4a475817
+.quad	0x5fcb6fab3ad6faec,0x6c44198c4a475817
+
+.quad	0x0001020304050607,0x08090a0b0c0d0e0f
+.quad	0x0001020304050607,0x08090a0b0c0d0e0f
+.byte	83,72,65,53,49,50,32,98,108,111,99,107,32,116,114,97,110,115,102,111,114,109,32,102,111,114,32,120,56,54,95,54,52,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0
+.type	sha512_block_data_order_avx,@function
+.align	64
+sha512_block_data_order_avx:
+.cfi_startproc	
+.Lavx_shortcut:
+	movq	%rsp,%rax
+.cfi_def_cfa_register	%rax
+	pushq	%rbx
+.cfi_offset	%rbx,-16
+	pushq	%rbp
+.cfi_offset	%rbp,-24
+	pushq	%r12
+.cfi_offset	%r12,-32
+	pushq	%r13
+.cfi_offset	%r13,-40
+	pushq	%r14
+.cfi_offset	%r14,-48
+	pushq	%r15
+.cfi_offset	%r15,-56
+	shlq	$4,%rdx
+	subq	$160,%rsp
+	leaq	(%rsi,%rdx,8),%rdx
+	andq	$-64,%rsp
+	movq	%rdi,128+0(%rsp)
+	movq	%rsi,128+8(%rsp)
+	movq	%rdx,128+16(%rsp)
+	movq	%rax,152(%rsp)
+.cfi_escape	0x0f,0x06,0x77,0x98,0x01,0x06,0x23,0x08
+.Lprologue_avx:
+
+	vzeroupper
+	movq	0(%rdi),%rax
+	movq	8(%rdi),%rbx
+	movq	16(%rdi),%rcx
+	movq	24(%rdi),%rdx
+	movq	32(%rdi),%r8
+	movq	40(%rdi),%r9
+	movq	48(%rdi),%r10
+	movq	56(%rdi),%r11
+	jmp	.Lloop_avx
+.align	16
+.Lloop_avx:
+	vmovdqa	K512+1280(%rip),%xmm11
+	vmovdqu	0(%rsi),%xmm0
+	leaq	K512+128(%rip),%rbp
+	vmovdqu	16(%rsi),%xmm1
+	vmovdqu	32(%rsi),%xmm2
+	vpshufb	%xmm11,%xmm0,%xmm0
+	vmovdqu	48(%rsi),%xmm3
+	vpshufb	%xmm11,%xmm1,%xmm1
+	vmovdqu	64(%rsi),%xmm4
+	vpshufb	%xmm11,%xmm2,%xmm2
+	vmovdqu	80(%rsi),%xmm5
+	vpshufb	%xmm11,%xmm3,%xmm3
+	vmovdqu	96(%rsi),%xmm6
+	vpshufb	%xmm11,%xmm4,%xmm4
+	vmovdqu	112(%rsi),%xmm7
+	vpshufb	%xmm11,%xmm5,%xmm5
+	vpaddq	-128(%rbp),%xmm0,%xmm8
+	vpshufb	%xmm11,%xmm6,%xmm6
+	vpaddq	-96(%rbp),%xmm1,%xmm9
+	vpshufb	%xmm11,%xmm7,%xmm7
+	vpaddq	-64(%rbp),%xmm2,%xmm10
+	vpaddq	-32(%rbp),%xmm3,%xmm11
+	vmovdqa	%xmm8,0(%rsp)
+	vpaddq	0(%rbp),%xmm4,%xmm8
+	vmovdqa	%xmm9,16(%rsp)
+	vpaddq	32(%rbp),%xmm5,%xmm9
+	vmovdqa	%xmm10,32(%rsp)
+	vpaddq	64(%rbp),%xmm6,%xmm10
+	vmovdqa	%xmm11,48(%rsp)
+	vpaddq	96(%rbp),%xmm7,%xmm11
+	vmovdqa	%xmm8,64(%rsp)
+	movq	%rax,%r14
+	vmovdqa	%xmm9,80(%rsp)
+	movq	%rbx,%rdi
+	vmovdqa	%xmm10,96(%rsp)
+	xorq	%rcx,%rdi
+	vmovdqa	%xmm11,112(%rsp)
+	movq	%r8,%r13
+	jmp	.Lavx_00_47
+
+.align	16
+.Lavx_00_47:
+	addq	$256,%rbp
+	vpalignr	$8,%xmm0,%xmm1,%xmm8
+	shrdq	$23,%r13,%r13
+	movq	%r14,%rax
+	vpalignr	$8,%xmm4,%xmm5,%xmm11
+	movq	%r9,%r12
+	shrdq	$5,%r14,%r14
+	vpsrlq	$1,%xmm8,%xmm10
+	xorq	%r8,%r13
+	xorq	%r10,%r12
+	vpaddq	%xmm11,%xmm0,%xmm0
+	shrdq	$4,%r13,%r13
+	xorq	%rax,%r14
+	vpsrlq	$7,%xmm8,%xmm11
+	andq	%r8,%r12
+	xorq	%r8,%r13
+	vpsllq	$56,%xmm8,%xmm9
+	addq	0(%rsp),%r11
+	movq	%rax,%r15
+	vpxor	%xmm10,%xmm11,%xmm8
+	xorq	%r10,%r12
+	shrdq	$6,%r14,%r14
+	vpsrlq	$7,%xmm10,%xmm10
+	xorq	%rbx,%r15
+	addq	%r12,%r11
+	vpxor	%xmm9,%xmm8,%xmm8
+	shrdq	$14,%r13,%r13
+	andq	%r15,%rdi
+	vpsllq	$7,%xmm9,%xmm9
+	xorq	%rax,%r14
+	addq	%r13,%r11
+	vpxor	%xmm10,%xmm8,%xmm8
+	xorq	%rbx,%rdi
+	shrdq	$28,%r14,%r14
+	vpsrlq	$6,%xmm7,%xmm11
+	addq	%r11,%rdx
+	addq	%rdi,%r11
+	vpxor	%xmm9,%xmm8,%xmm8
+	movq	%rdx,%r13
+	addq	%r11,%r14
+	vpsllq	$3,%xmm7,%xmm10
+	shrdq	$23,%r13,%r13
+	movq	%r14,%r11
+	vpaddq	%xmm8,%xmm0,%xmm0
+	movq	%r8,%r12
+	shrdq	$5,%r14,%r14
+	vpsrlq	$19,%xmm7,%xmm9
+	xorq	%rdx,%r13
+	xorq	%r9,%r12
+	vpxor	%xmm10,%xmm11,%xmm11
+	shrdq	$4,%r13,%r13
+	xorq	%r11,%r14
+	vpsllq	$42,%xmm10,%xmm10
+	andq	%rdx,%r12
+	xorq	%rdx,%r13
+	vpxor	%xmm9,%xmm11,%xmm11
+	addq	8(%rsp),%r10
+	movq	%r11,%rdi
+	vpsrlq	$42,%xmm9,%xmm9
+	xorq	%r9,%r12
+	shrdq	$6,%r14,%r14
+	vpxor	%xmm10,%xmm11,%xmm11
+	xorq	%rax,%rdi
+	addq	%r12,%r10
+	vpxor	%xmm9,%xmm11,%xmm11
+	shrdq	$14,%r13,%r13
+	andq	%rdi,%r15
+	vpaddq	%xmm11,%xmm0,%xmm0
+	xorq	%r11,%r14
+	addq	%r13,%r10
+	vpaddq	-128(%rbp),%xmm0,%xmm10
+	xorq	%rax,%r15
+	shrdq	$28,%r14,%r14
+	addq	%r10,%rcx
+	addq	%r15,%r10
+	movq	%rcx,%r13
+	addq	%r10,%r14
+	vmovdqa	%xmm10,0(%rsp)
+	vpalignr	$8,%xmm1,%xmm2,%xmm8
+	shrdq	$23,%r13,%r13
+	movq	%r14,%r10
+	vpalignr	$8,%xmm5,%xmm6,%xmm11
+	movq	%rdx,%r12
+	shrdq	$5,%r14,%r14
+	vpsrlq	$1,%xmm8,%xmm10
+	xorq	%rcx,%r13
+	xorq	%r8,%r12
+	vpaddq	%xmm11,%xmm1,%xmm1
+	shrdq	$4,%r13,%r13
+	xorq	%r10,%r14
+	vpsrlq	$7,%xmm8,%xmm11
+	andq	%rcx,%r12
+	xorq	%rcx,%r13
+	vpsllq	$56,%xmm8,%xmm9
+	addq	16(%rsp),%r9
+	movq	%r10,%r15
+	vpxor	%xmm10,%xmm11,%xmm8
+	xorq	%r8,%r12
+	shrdq	$6,%r14,%r14
+	vpsrlq	$7,%xmm10,%xmm10
+	xorq	%r11,%r15
+	addq	%r12,%r9
+	vpxor	%xmm9,%xmm8,%xmm8
+	shrdq	$14,%r13,%r13
+	andq	%r15,%rdi
+	vpsllq	$7,%xmm9,%xmm9
+	xorq	%r10,%r14
+	addq	%r13,%r9
+	vpxor	%xmm10,%xmm8,%xmm8
+	xorq	%r11,%rdi
+	shrdq	$28,%r14,%r14
+	vpsrlq	$6,%xmm0,%xmm11
+	addq	%r9,%rbx
+	addq	%rdi,%r9
+	vpxor	%xmm9,%xmm8,%xmm8
+	movq	%rbx,%r13
+	addq	%r9,%r14
+	vpsllq	$3,%xmm0,%xmm10
+	shrdq	$23,%r13,%r13
+	movq	%r14,%r9
+	vpaddq	%xmm8,%xmm1,%xmm1
+	movq	%rcx,%r12
+	shrdq	$5,%r14,%r14
+	vpsrlq	$19,%xmm0,%xmm9
+	xorq	%rbx,%r13
+	xorq	%rdx,%r12
+	vpxor	%xmm10,%xmm11,%xmm11
+	shrdq	$4,%r13,%r13
+	xorq	%r9,%r14
+	vpsllq	$42,%xmm10,%xmm10
+	andq	%rbx,%r12
+	xorq	%rbx,%r13
+	vpxor	%xmm9,%xmm11,%xmm11
+	addq	24(%rsp),%r8
+	movq	%r9,%rdi
+	vpsrlq	$42,%xmm9,%xmm9
+	xorq	%rdx,%r12
+	shrdq	$6,%r14,%r14
+	vpxor	%xmm10,%xmm11,%xmm11
+	xorq	%r10,%rdi
+	addq	%r12,%r8
+	vpxor	%xmm9,%xmm11,%xmm11
+	shrdq	$14,%r13,%r13
+	andq	%rdi,%r15
+	vpaddq	%xmm11,%xmm1,%xmm1
+	xorq	%r9,%r14
+	addq	%r13,%r8
+	vpaddq	-96(%rbp),%xmm1,%xmm10
+	xorq	%r10,%r15
+	shrdq	$28,%r14,%r14
+	addq	%r8,%rax
+	addq	%r15,%r8
+	movq	%rax,%r13
+	addq	%r8,%r14
+	vmovdqa	%xmm10,16(%rsp)
+	vpalignr	$8,%xmm2,%xmm3,%xmm8
+	shrdq	$23,%r13,%r13
+	movq	%r14,%r8
+	vpalignr	$8,%xmm6,%xmm7,%xmm11
+	movq	%rbx,%r12
+	shrdq	$5,%r14,%r14
+	vpsrlq	$1,%xmm8,%xmm10
+	xorq	%rax,%r13
+	xorq	%rcx,%r12
+	vpaddq	%xmm11,%xmm2,%xmm2
+	shrdq	$4,%r13,%r13
+	xorq	%r8,%r14
+	vpsrlq	$7,%xmm8,%xmm11
+	andq	%rax,%r12
+	xorq	%rax,%r13
+	vpsllq	$56,%xmm8,%xmm9
+	addq	32(%rsp),%rdx
+	movq	%r8,%r15
+	vpxor	%xmm10,%xmm11,%xmm8
+	xorq	%rcx,%r12
+	shrdq	$6,%r14,%r14
+	vpsrlq	$7,%xmm10,%xmm10
+	xorq	%r9,%r15
+	addq	%r12,%rdx
+	vpxor	%xmm9,%xmm8,%xmm8
+	shrdq	$14,%r13,%r13
+	andq	%r15,%rdi
+	vpsllq	$7,%xmm9,%xmm9
+	xorq	%r8,%r14
+	addq	%r13,%rdx
+	vpxor	%xmm10,%xmm8,%xmm8
+	xorq	%r9,%rdi
+	shrdq	$28,%r14,%r14
+	vpsrlq	$6,%xmm1,%xmm11
+	addq	%rdx,%r11
+	addq	%rdi,%rdx
+	vpxor	%xmm9,%xmm8,%xmm8
+	movq	%r11,%r13
+	addq	%rdx,%r14
+	vpsllq	$3,%xmm1,%xmm10
+	shrdq	$23,%r13,%r13
+	movq	%r14,%rdx
+	vpaddq	%xmm8,%xmm2,%xmm2
+	movq	%rax,%r12
+	shrdq	$5,%r14,%r14
+	vpsrlq	$19,%xmm1,%xmm9
+	xorq	%r11,%r13
+	xorq	%rbx,%r12
+	vpxor	%xmm10,%xmm11,%xmm11
+	shrdq	$4,%r13,%r13
+	xorq	%rdx,%r14
+	vpsllq	$42,%xmm10,%xmm10
+	andq	%r11,%r12
+	xorq	%r11,%r13
+	vpxor	%xmm9,%xmm11,%xmm11
+	addq	40(%rsp),%rcx
+	movq	%rdx,%rdi
+	vpsrlq	$42,%xmm9,%xmm9
+	xorq	%rbx,%r12
+	shrdq	$6,%r14,%r14
+	vpxor	%xmm10,%xmm11,%xmm11
+	xorq	%r8,%rdi
+	addq	%r12,%rcx
+	vpxor	%xmm9,%xmm11,%xmm11
+	shrdq	$14,%r13,%r13
+	andq	%rdi,%r15
+	vpaddq	%xmm11,%xmm2,%xmm2
+	xorq	%rdx,%r14
+	addq	%r13,%rcx
+	vpaddq	-64(%rbp),%xmm2,%xmm10
+	xorq	%r8,%r15
+	shrdq	$28,%r14,%r14
+	addq	%rcx,%r10
+	addq	%r15,%rcx
+	movq	%r10,%r13
+	addq	%rcx,%r14
+	vmovdqa	%xmm10,32(%rsp)
+	vpalignr	$8,%xmm3,%xmm4,%xmm8
+	shrdq	$23,%r13,%r13
+	movq	%r14,%rcx
+	vpalignr	$8,%xmm7,%xmm0,%xmm11
+	movq	%r11,%r12
+	shrdq	$5,%r14,%r14
+	vpsrlq	$1,%xmm8,%xmm10
+	xorq	%r10,%r13
+	xorq	%rax,%r12
+	vpaddq	%xmm11,%xmm3,%xmm3
+	shrdq	$4,%r13,%r13
+	xorq	%rcx,%r14
+	vpsrlq	$7,%xmm8,%xmm11
+	andq	%r10,%r12
+	xorq	%r10,%r13
+	vpsllq	$56,%xmm8,%xmm9
+	addq	48(%rsp),%rbx
+	movq	%rcx,%r15
+	vpxor	%xmm10,%xmm11,%xmm8
+	xorq	%rax,%r12
+	shrdq	$6,%r14,%r14
+	vpsrlq	$7,%xmm10,%xmm10
+	xorq	%rdx,%r15
+	addq	%r12,%rbx
+	vpxor	%xmm9,%xmm8,%xmm8
+	shrdq	$14,%r13,%r13
+	andq	%r15,%rdi
+	vpsllq	$7,%xmm9,%xmm9
+	xorq	%rcx,%r14
+	addq	%r13,%rbx
+	vpxor	%xmm10,%xmm8,%xmm8
+	xorq	%rdx,%rdi
+	shrdq	$28,%r14,%r14
+	vpsrlq	$6,%xmm2,%xmm11
+	addq	%rbx,%r9
+	addq	%rdi,%rbx
+	vpxor	%xmm9,%xmm8,%xmm8
+	movq	%r9,%r13
+	addq	%rbx,%r14
+	vpsllq	$3,%xmm2,%xmm10
+	shrdq	$23,%r13,%r13
+	movq	%r14,%rbx
+	vpaddq	%xmm8,%xmm3,%xmm3
+	movq	%r10,%r12
+	shrdq	$5,%r14,%r14
+	vpsrlq	$19,%xmm2,%xmm9
+	xorq	%r9,%r13
+	xorq	%r11,%r12
+	vpxor	%xmm10,%xmm11,%xmm11
+	shrdq	$4,%r13,%r13
+	xorq	%rbx,%r14
+	vpsllq	$42,%xmm10,%xmm10
+	andq	%r9,%r12
+	xorq	%r9,%r13
+	vpxor	%xmm9,%xmm11,%xmm11
+	addq	56(%rsp),%rax
+	movq	%rbx,%rdi
+	vpsrlq	$42,%xmm9,%xmm9
+	xorq	%r11,%r12
+	shrdq	$6,%r14,%r14
+	vpxor	%xmm10,%xmm11,%xmm11
+	xorq	%rcx,%rdi
+	addq	%r12,%rax
+	vpxor	%xmm9,%xmm11,%xmm11
+	shrdq	$14,%r13,%r13
+	andq	%rdi,%r15
+	vpaddq	%xmm11,%xmm3,%xmm3
+	xorq	%rbx,%r14
+	addq	%r13,%rax
+	vpaddq	-32(%rbp),%xmm3,%xmm10
+	xorq	%rcx,%r15
+	shrdq	$28,%r14,%r14
+	addq	%rax,%r8
+	addq	%r15,%rax
+	movq	%r8,%r13
+	addq	%rax,%r14
+	vmovdqa	%xmm10,48(%rsp)
+	vpalignr	$8,%xmm4,%xmm5,%xmm8
+	shrdq	$23,%r13,%r13
+	movq	%r14,%rax
+	vpalignr	$8,%xmm0,%xmm1,%xmm11
+	movq	%r9,%r12
+	shrdq	$5,%r14,%r14
+	vpsrlq	$1,%xmm8,%xmm10
+	xorq	%r8,%r13
+	xorq	%r10,%r12
+	vpaddq	%xmm11,%xmm4,%xmm4
+	shrdq	$4,%r13,%r13
+	xorq	%rax,%r14
+	vpsrlq	$7,%xmm8,%xmm11
+	andq	%r8,%r12
+	xorq	%r8,%r13
+	vpsllq	$56,%xmm8,%xmm9
+	addq	64(%rsp),%r11
+	movq	%rax,%r15
+	vpxor	%xmm10,%xmm11,%xmm8
+	xorq	%r10,%r12
+	shrdq	$6,%r14,%r14
+	vpsrlq	$7,%xmm10,%xmm10
+	xorq	%rbx,%r15
+	addq	%r12,%r11
+	vpxor	%xmm9,%xmm8,%xmm8
+	shrdq	$14,%r13,%r13
+	andq	%r15,%rdi
+	vpsllq	$7,%xmm9,%xmm9
+	xorq	%rax,%r14
+	addq	%r13,%r11
+	vpxor	%xmm10,%xmm8,%xmm8
+	xorq	%rbx,%rdi
+	shrdq	$28,%r14,%r14
+	vpsrlq	$6,%xmm3,%xmm11
+	addq	%r11,%rdx
+	addq	%rdi,%r11
+	vpxor	%xmm9,%xmm8,%xmm8
+	movq	%rdx,%r13
+	addq	%r11,%r14
+	vpsllq	$3,%xmm3,%xmm10
+	shrdq	$23,%r13,%r13
+	movq	%r14,%r11
+	vpaddq	%xmm8,%xmm4,%xmm4
+	movq	%r8,%r12
+	shrdq	$5,%r14,%r14
+	vpsrlq	$19,%xmm3,%xmm9
+	xorq	%rdx,%r13
+	xorq	%r9,%r12
+	vpxor	%xmm10,%xmm11,%xmm11
+	shrdq	$4,%r13,%r13
+	xorq	%r11,%r14
+	vpsllq	$42,%xmm10,%xmm10
+	andq	%rdx,%r12
+	xorq	%rdx,%r13
+	vpxor	%xmm9,%xmm11,%xmm11
+	addq	72(%rsp),%r10
+	movq	%r11,%rdi
+	vpsrlq	$42,%xmm9,%xmm9
+	xorq	%r9,%r12
+	shrdq	$6,%r14,%r14
+	vpxor	%xmm10,%xmm11,%xmm11
+	xorq	%rax,%rdi
+	addq	%r12,%r10
+	vpxor	%xmm9,%xmm11,%xmm11
+	shrdq	$14,%r13,%r13
+	andq	%rdi,%r15
+	vpaddq	%xmm11,%xmm4,%xmm4
+	xorq	%r11,%r14
+	addq	%r13,%r10
+	vpaddq	0(%rbp),%xmm4,%xmm10
+	xorq	%rax,%r15
+	shrdq	$28,%r14,%r14
+	addq	%r10,%rcx
+	addq	%r15,%r10
+	movq	%rcx,%r13
+	addq	%r10,%r14
+	vmovdqa	%xmm10,64(%rsp)
+	vpalignr	$8,%xmm5,%xmm6,%xmm8
+	shrdq	$23,%r13,%r13
+	movq	%r14,%r10
+	vpalignr	$8,%xmm1,%xmm2,%xmm11
+	movq	%rdx,%r12
+	shrdq	$5,%r14,%r14
+	vpsrlq	$1,%xmm8,%xmm10
+	xorq	%rcx,%r13
+	xorq	%r8,%r12
+	vpaddq	%xmm11,%xmm5,%xmm5
+	shrdq	$4,%r13,%r13
+	xorq	%r10,%r14
+	vpsrlq	$7,%xmm8,%xmm11
+	andq	%rcx,%r12
+	xorq	%rcx,%r13
+	vpsllq	$56,%xmm8,%xmm9
+	addq	80(%rsp),%r9
+	movq	%r10,%r15
+	vpxor	%xmm10,%xmm11,%xmm8
+	xorq	%r8,%r12
+	shrdq	$6,%r14,%r14
+	vpsrlq	$7,%xmm10,%xmm10
+	xorq	%r11,%r15
+	addq	%r12,%r9
+	vpxor	%xmm9,%xmm8,%xmm8
+	shrdq	$14,%r13,%r13
+	andq	%r15,%rdi
+	vpsllq	$7,%xmm9,%xmm9
+	xorq	%r10,%r14
+	addq	%r13,%r9
+	vpxor	%xmm10,%xmm8,%xmm8
+	xorq	%r11,%rdi
+	shrdq	$28,%r14,%r14
+	vpsrlq	$6,%xmm4,%xmm11
+	addq	%r9,%rbx
+	addq	%rdi,%r9
+	vpxor	%xmm9,%xmm8,%xmm8
+	movq	%rbx,%r13
+	addq	%r9,%r14
+	vpsllq	$3,%xmm4,%xmm10
+	shrdq	$23,%r13,%r13
+	movq	%r14,%r9
+	vpaddq	%xmm8,%xmm5,%xmm5
+	movq	%rcx,%r12
+	shrdq	$5,%r14,%r14
+	vpsrlq	$19,%xmm4,%xmm9
+	xorq	%rbx,%r13
+	xorq	%rdx,%r12
+	vpxor	%xmm10,%xmm11,%xmm11
+	shrdq	$4,%r13,%r13
+	xorq	%r9,%r14
+	vpsllq	$42,%xmm10,%xmm10
+	andq	%rbx,%r12
+	xorq	%rbx,%r13
+	vpxor	%xmm9,%xmm11,%xmm11
+	addq	88(%rsp),%r8
+	movq	%r9,%rdi
+	vpsrlq	$42,%xmm9,%xmm9
+	xorq	%rdx,%r12
+	shrdq	$6,%r14,%r14
+	vpxor	%xmm10,%xmm11,%xmm11
+	xorq	%r10,%rdi
+	addq	%r12,%r8
+	vpxor	%xmm9,%xmm11,%xmm11
+	shrdq	$14,%r13,%r13
+	andq	%rdi,%r15
+	vpaddq	%xmm11,%xmm5,%xmm5
+	xorq	%r9,%r14
+	addq	%r13,%r8
+	vpaddq	32(%rbp),%xmm5,%xmm10
+	xorq	%r10,%r15
+	shrdq	$28,%r14,%r14
+	addq	%r8,%rax
+	addq	%r15,%r8
+	movq	%rax,%r13
+	addq	%r8,%r14
+	vmovdqa	%xmm10,80(%rsp)
+	vpalignr	$8,%xmm6,%xmm7,%xmm8
+	shrdq	$23,%r13,%r13
+	movq	%r14,%r8
+	vpalignr	$8,%xmm2,%xmm3,%xmm11
+	movq	%rbx,%r12
+	shrdq	$5,%r14,%r14
+	vpsrlq	$1,%xmm8,%xmm10
+	xorq	%rax,%r13
+	xorq	%rcx,%r12
+	vpaddq	%xmm11,%xmm6,%xmm6
+	shrdq	$4,%r13,%r13
+	xorq	%r8,%r14
+	vpsrlq	$7,%xmm8,%xmm11
+	andq	%rax,%r12
+	xorq	%rax,%r13
+	vpsllq	$56,%xmm8,%xmm9
+	addq	96(%rsp),%rdx
+	movq	%r8,%r15
+	vpxor	%xmm10,%xmm11,%xmm8
+	xorq	%rcx,%r12
+	shrdq	$6,%r14,%r14
+	vpsrlq	$7,%xmm10,%xmm10
+	xorq	%r9,%r15
+	addq	%r12,%rdx
+	vpxor	%xmm9,%xmm8,%xmm8
+	shrdq	$14,%r13,%r13
+	andq	%r15,%rdi
+	vpsllq	$7,%xmm9,%xmm9
+	xorq	%r8,%r14
+	addq	%r13,%rdx
+	vpxor	%xmm10,%xmm8,%xmm8
+	xorq	%r9,%rdi
+	shrdq	$28,%r14,%r14
+	vpsrlq	$6,%xmm5,%xmm11
+	addq	%rdx,%r11
+	addq	%rdi,%rdx
+	vpxor	%xmm9,%xmm8,%xmm8
+	movq	%r11,%r13
+	addq	%rdx,%r14
+	vpsllq	$3,%xmm5,%xmm10
+	shrdq	$23,%r13,%r13
+	movq	%r14,%rdx
+	vpaddq	%xmm8,%xmm6,%xmm6
+	movq	%rax,%r12
+	shrdq	$5,%r14,%r14
+	vpsrlq	$19,%xmm5,%xmm9
+	xorq	%r11,%r13
+	xorq	%rbx,%r12
+	vpxor	%xmm10,%xmm11,%xmm11
+	shrdq	$4,%r13,%r13
+	xorq	%rdx,%r14
+	vpsllq	$42,%xmm10,%xmm10
+	andq	%r11,%r12
+	xorq	%r11,%r13
+	vpxor	%xmm9,%xmm11,%xmm11
+	addq	104(%rsp),%rcx
+	movq	%rdx,%rdi
+	vpsrlq	$42,%xmm9,%xmm9
+	xorq	%rbx,%r12
+	shrdq	$6,%r14,%r14
+	vpxor	%xmm10,%xmm11,%xmm11
+	xorq	%r8,%rdi
+	addq	%r12,%rcx
+	vpxor	%xmm9,%xmm11,%xmm11
+	shrdq	$14,%r13,%r13
+	andq	%rdi,%r15
+	vpaddq	%xmm11,%xmm6,%xmm6
+	xorq	%rdx,%r14
+	addq	%r13,%rcx
+	vpaddq	64(%rbp),%xmm6,%xmm10
+	xorq	%r8,%r15
+	shrdq	$28,%r14,%r14
+	addq	%rcx,%r10
+	addq	%r15,%rcx
+	movq	%r10,%r13
+	addq	%rcx,%r14
+	vmovdqa	%xmm10,96(%rsp)
+	vpalignr	$8,%xmm7,%xmm0,%xmm8
+	shrdq	$23,%r13,%r13
+	movq	%r14,%rcx
+	vpalignr	$8,%xmm3,%xmm4,%xmm11
+	movq	%r11,%r12
+	shrdq	$5,%r14,%r14
+	vpsrlq	$1,%xmm8,%xmm10
+	xorq	%r10,%r13
+	xorq	%rax,%r12
+	vpaddq	%xmm11,%xmm7,%xmm7
+	shrdq	$4,%r13,%r13
+	xorq	%rcx,%r14
+	vpsrlq	$7,%xmm8,%xmm11
+	andq	%r10,%r12
+	xorq	%r10,%r13
+	vpsllq	$56,%xmm8,%xmm9
+	addq	112(%rsp),%rbx
+	movq	%rcx,%r15
+	vpxor	%xmm10,%xmm11,%xmm8
+	xorq	%rax,%r12
+	shrdq	$6,%r14,%r14
+	vpsrlq	$7,%xmm10,%xmm10
+	xorq	%rdx,%r15
+	addq	%r12,%rbx
+	vpxor	%xmm9,%xmm8,%xmm8
+	shrdq	$14,%r13,%r13
+	andq	%r15,%rdi
+	vpsllq	$7,%xmm9,%xmm9
+	xorq	%rcx,%r14
+	addq	%r13,%rbx
+	vpxor	%xmm10,%xmm8,%xmm8
+	xorq	%rdx,%rdi
+	shrdq	$28,%r14,%r14
+	vpsrlq	$6,%xmm6,%xmm11
+	addq	%rbx,%r9
+	addq	%rdi,%rbx
+	vpxor	%xmm9,%xmm8,%xmm8
+	movq	%r9,%r13
+	addq	%rbx,%r14
+	vpsllq	$3,%xmm6,%xmm10
+	shrdq	$23,%r13,%r13
+	movq	%r14,%rbx
+	vpaddq	%xmm8,%xmm7,%xmm7
+	movq	%r10,%r12
+	shrdq	$5,%r14,%r14
+	vpsrlq	$19,%xmm6,%xmm9
+	xorq	%r9,%r13
+	xorq	%r11,%r12
+	vpxor	%xmm10,%xmm11,%xmm11
+	shrdq	$4,%r13,%r13
+	xorq	%rbx,%r14
+	vpsllq	$42,%xmm10,%xmm10
+	andq	%r9,%r12
+	xorq	%r9,%r13
+	vpxor	%xmm9,%xmm11,%xmm11
+	addq	120(%rsp),%rax
+	movq	%rbx,%rdi
+	vpsrlq	$42,%xmm9,%xmm9
+	xorq	%r11,%r12
+	shrdq	$6,%r14,%r14
+	vpxor	%xmm10,%xmm11,%xmm11
+	xorq	%rcx,%rdi
+	addq	%r12,%rax
+	vpxor	%xmm9,%xmm11,%xmm11
+	shrdq	$14,%r13,%r13
+	andq	%rdi,%r15
+	vpaddq	%xmm11,%xmm7,%xmm7
+	xorq	%rbx,%r14
+	addq	%r13,%rax
+	vpaddq	96(%rbp),%xmm7,%xmm10
+	xorq	%rcx,%r15
+	shrdq	$28,%r14,%r14
+	addq	%rax,%r8
+	addq	%r15,%rax
+	movq	%r8,%r13
+	addq	%rax,%r14
+	vmovdqa	%xmm10,112(%rsp)
+	cmpb	$0,135(%rbp)
+	jne	.Lavx_00_47
+	shrdq	$23,%r13,%r13
+	movq	%r14,%rax
+	movq	%r9,%r12
+	shrdq	$5,%r14,%r14
+	xorq	%r8,%r13
+	xorq	%r10,%r12
+	shrdq	$4,%r13,%r13
+	xorq	%rax,%r14
+	andq	%r8,%r12
+	xorq	%r8,%r13
+	addq	0(%rsp),%r11
+	movq	%rax,%r15
+	xorq	%r10,%r12
+	shrdq	$6,%r14,%r14
+	xorq	%rbx,%r15
+	addq	%r12,%r11
+	shrdq	$14,%r13,%r13
+	andq	%r15,%rdi
+	xorq	%rax,%r14
+	addq	%r13,%r11
+	xorq	%rbx,%rdi
+	shrdq	$28,%r14,%r14
+	addq	%r11,%rdx
+	addq	%rdi,%r11
+	movq	%rdx,%r13
+	addq	%r11,%r14
+	shrdq	$23,%r13,%r13
+	movq	%r14,%r11
+	movq	%r8,%r12
+	shrdq	$5,%r14,%r14
+	xorq	%rdx,%r13
+	xorq	%r9,%r12
+	shrdq	$4,%r13,%r13
+	xorq	%r11,%r14
+	andq	%rdx,%r12
+	xorq	%rdx,%r13
+	addq	8(%rsp),%r10
+	movq	%r11,%rdi
+	xorq	%r9,%r12
+	shrdq	$6,%r14,%r14
+	xorq	%rax,%rdi
+	addq	%r12,%r10
+	shrdq	$14,%r13,%r13
+	andq	%rdi,%r15
+	xorq	%r11,%r14
+	addq	%r13,%r10
+	xorq	%rax,%r15
+	shrdq	$28,%r14,%r14
+	addq	%r10,%rcx
+	addq	%r15,%r10
+	movq	%rcx,%r13
+	addq	%r10,%r14
+	shrdq	$23,%r13,%r13
+	movq	%r14,%r10
+	movq	%rdx,%r12
+	shrdq	$5,%r14,%r14
+	xorq	%rcx,%r13
+	xorq	%r8,%r12
+	shrdq	$4,%r13,%r13
+	xorq	%r10,%r14
+	andq	%rcx,%r12
+	xorq	%rcx,%r13
+	addq	16(%rsp),%r9
+	movq	%r10,%r15
+	xorq	%r8,%r12
+	shrdq	$6,%r14,%r14
+	xorq	%r11,%r15
+	addq	%r12,%r9
+	shrdq	$14,%r13,%r13
+	andq	%r15,%rdi
+	xorq	%r10,%r14
+	addq	%r13,%r9
+	xorq	%r11,%rdi
+	shrdq	$28,%r14,%r14
+	addq	%r9,%rbx
+	addq	%rdi,%r9
+	movq	%rbx,%r13
+	addq	%r9,%r14
+	shrdq	$23,%r13,%r13
+	movq	%r14,%r9
+	movq	%rcx,%r12
+	shrdq	$5,%r14,%r14
+	xorq	%rbx,%r13
+	xorq	%rdx,%r12
+	shrdq	$4,%r13,%r13
+	xorq	%r9,%r14
+	andq	%rbx,%r12
+	xorq	%rbx,%r13
+	addq	24(%rsp),%r8
+	movq	%r9,%rdi
+	xorq	%rdx,%r12
+	shrdq	$6,%r14,%r14
+	xorq	%r10,%rdi
+	addq	%r12,%r8
+	shrdq	$14,%r13,%r13
+	andq	%rdi,%r15
+	xorq	%r9,%r14
+	addq	%r13,%r8
+	xorq	%r10,%r15
+	shrdq	$28,%r14,%r14
+	addq	%r8,%rax
+	addq	%r15,%r8
+	movq	%rax,%r13
+	addq	%r8,%r14
+	shrdq	$23,%r13,%r13
+	movq	%r14,%r8
+	movq	%rbx,%r12
+	shrdq	$5,%r14,%r14
+	xorq	%rax,%r13
+	xorq	%rcx,%r12
+	shrdq	$4,%r13,%r13
+	xorq	%r8,%r14
+	andq	%rax,%r12
+	xorq	%rax,%r13
+	addq	32(%rsp),%rdx
+	movq	%r8,%r15
+	xorq	%rcx,%r12
+	shrdq	$6,%r14,%r14
+	xorq	%r9,%r15
+	addq	%r12,%rdx
+	shrdq	$14,%r13,%r13
+	andq	%r15,%rdi
+	xorq	%r8,%r14
+	addq	%r13,%rdx
+	xorq	%r9,%rdi
+	shrdq	$28,%r14,%r14
+	addq	%rdx,%r11
+	addq	%rdi,%rdx
+	movq	%r11,%r13
+	addq	%rdx,%r14
+	shrdq	$23,%r13,%r13
+	movq	%r14,%rdx
+	movq	%rax,%r12
+	shrdq	$5,%r14,%r14
+	xorq	%r11,%r13
+	xorq	%rbx,%r12
+	shrdq	$4,%r13,%r13
+	xorq	%rdx,%r14
+	andq	%r11,%r12
+	xorq	%r11,%r13
+	addq	40(%rsp),%rcx
+	movq	%rdx,%rdi
+	xorq	%rbx,%r12
+	shrdq	$6,%r14,%r14
+	xorq	%r8,%rdi
+	addq	%r12,%rcx
+	shrdq	$14,%r13,%r13
+	andq	%rdi,%r15
+	xorq	%rdx,%r14
+	addq	%r13,%rcx
+	xorq	%r8,%r15
+	shrdq	$28,%r14,%r14
+	addq	%rcx,%r10
+	addq	%r15,%rcx
+	movq	%r10,%r13
+	addq	%rcx,%r14
+	shrdq	$23,%r13,%r13
+	movq	%r14,%rcx
+	movq	%r11,%r12
+	shrdq	$5,%r14,%r14
+	xorq	%r10,%r13
+	xorq	%rax,%r12
+	shrdq	$4,%r13,%r13
+	xorq	%rcx,%r14
+	andq	%r10,%r12
+	xorq	%r10,%r13
+	addq	48(%rsp),%rbx
+	movq	%rcx,%r15
+	xorq	%rax,%r12
+	shrdq	$6,%r14,%r14
+	xorq	%rdx,%r15
+	addq	%r12,%rbx
+	shrdq	$14,%r13,%r13
+	andq	%r15,%rdi
+	xorq	%rcx,%r14
+	addq	%r13,%rbx
+	xorq	%rdx,%rdi
+	shrdq	$28,%r14,%r14
+	addq	%rbx,%r9
+	addq	%rdi,%rbx
+	movq	%r9,%r13
+	addq	%rbx,%r14
+	shrdq	$23,%r13,%r13
+	movq	%r14,%rbx
+	movq	%r10,%r12
+	shrdq	$5,%r14,%r14
+	xorq	%r9,%r13
+	xorq	%r11,%r12
+	shrdq	$4,%r13,%r13
+	xorq	%rbx,%r14
+	andq	%r9,%r12
+	xorq	%r9,%r13
+	addq	56(%rsp),%rax
+	movq	%rbx,%rdi
+	xorq	%r11,%r12
+	shrdq	$6,%r14,%r14
+	xorq	%rcx,%rdi
+	addq	%r12,%rax
+	shrdq	$14,%r13,%r13
+	andq	%rdi,%r15
+	xorq	%rbx,%r14
+	addq	%r13,%rax
+	xorq	%rcx,%r15
+	shrdq	$28,%r14,%r14
+	addq	%rax,%r8
+	addq	%r15,%rax
+	movq	%r8,%r13
+	addq	%rax,%r14
+	shrdq	$23,%r13,%r13
+	movq	%r14,%rax
+	movq	%r9,%r12
+	shrdq	$5,%r14,%r14
+	xorq	%r8,%r13
+	xorq	%r10,%r12
+	shrdq	$4,%r13,%r13
+	xorq	%rax,%r14
+	andq	%r8,%r12
+	xorq	%r8,%r13
+	addq	64(%rsp),%r11
+	movq	%rax,%r15
+	xorq	%r10,%r12
+	shrdq	$6,%r14,%r14
+	xorq	%rbx,%r15
+	addq	%r12,%r11
+	shrdq	$14,%r13,%r13
+	andq	%r15,%rdi
+	xorq	%rax,%r14
+	addq	%r13,%r11
+	xorq	%rbx,%rdi
+	shrdq	$28,%r14,%r14
+	addq	%r11,%rdx
+	addq	%rdi,%r11
+	movq	%rdx,%r13
+	addq	%r11,%r14
+	shrdq	$23,%r13,%r13
+	movq	%r14,%r11
+	movq	%r8,%r12
+	shrdq	$5,%r14,%r14
+	xorq	%rdx,%r13
+	xorq	%r9,%r12
+	shrdq	$4,%r13,%r13
+	xorq	%r11,%r14
+	andq	%rdx,%r12
+	xorq	%rdx,%r13
+	addq	72(%rsp),%r10
+	movq	%r11,%rdi
+	xorq	%r9,%r12
+	shrdq	$6,%r14,%r14
+	xorq	%rax,%rdi
+	addq	%r12,%r10
+	shrdq	$14,%r13,%r13
+	andq	%rdi,%r15
+	xorq	%r11,%r14
+	addq	%r13,%r10
+	xorq	%rax,%r15
+	shrdq	$28,%r14,%r14
+	addq	%r10,%rcx
+	addq	%r15,%r10
+	movq	%rcx,%r13
+	addq	%r10,%r14
+	shrdq	$23,%r13,%r13
+	movq	%r14,%r10
+	movq	%rdx,%r12
+	shrdq	$5,%r14,%r14
+	xorq	%rcx,%r13
+	xorq	%r8,%r12
+	shrdq	$4,%r13,%r13
+	xorq	%r10,%r14
+	andq	%rcx,%r12
+	xorq	%rcx,%r13
+	addq	80(%rsp),%r9
+	movq	%r10,%r15
+	xorq	%r8,%r12
+	shrdq	$6,%r14,%r14
+	xorq	%r11,%r15
+	addq	%r12,%r9
+	shrdq	$14,%r13,%r13
+	andq	%r15,%rdi
+	xorq	%r10,%r14
+	addq	%r13,%r9
+	xorq	%r11,%rdi
+	shrdq	$28,%r14,%r14
+	addq	%r9,%rbx
+	addq	%rdi,%r9
+	movq	%rbx,%r13
+	addq	%r9,%r14
+	shrdq	$23,%r13,%r13
+	movq	%r14,%r9
+	movq	%rcx,%r12
+	shrdq	$5,%r14,%r14
+	xorq	%rbx,%r13
+	xorq	%rdx,%r12
+	shrdq	$4,%r13,%r13
+	xorq	%r9,%r14
+	andq	%rbx,%r12
+	xorq	%rbx,%r13
+	addq	88(%rsp),%r8
+	movq	%r9,%rdi
+	xorq	%rdx,%r12
+	shrdq	$6,%r14,%r14
+	xorq	%r10,%rdi
+	addq	%r12,%r8
+	shrdq	$14,%r13,%r13
+	andq	%rdi,%r15
+	xorq	%r9,%r14
+	addq	%r13,%r8
+	xorq	%r10,%r15
+	shrdq	$28,%r14,%r14
+	addq	%r8,%rax
+	addq	%r15,%r8
+	movq	%rax,%r13
+	addq	%r8,%r14
+	shrdq	$23,%r13,%r13
+	movq	%r14,%r8
+	movq	%rbx,%r12
+	shrdq	$5,%r14,%r14
+	xorq	%rax,%r13
+	xorq	%rcx,%r12
+	shrdq	$4,%r13,%r13
+	xorq	%r8,%r14
+	andq	%rax,%r12
+	xorq	%rax,%r13
+	addq	96(%rsp),%rdx
+	movq	%r8,%r15
+	xorq	%rcx,%r12
+	shrdq	$6,%r14,%r14
+	xorq	%r9,%r15
+	addq	%r12,%rdx
+	shrdq	$14,%r13,%r13
+	andq	%r15,%rdi
+	xorq	%r8,%r14
+	addq	%r13,%rdx
+	xorq	%r9,%rdi
+	shrdq	$28,%r14,%r14
+	addq	%rdx,%r11
+	addq	%rdi,%rdx
+	movq	%r11,%r13
+	addq	%rdx,%r14
+	shrdq	$23,%r13,%r13
+	movq	%r14,%rdx
+	movq	%rax,%r12
+	shrdq	$5,%r14,%r14
+	xorq	%r11,%r13
+	xorq	%rbx,%r12
+	shrdq	$4,%r13,%r13
+	xorq	%rdx,%r14
+	andq	%r11,%r12
+	xorq	%r11,%r13
+	addq	104(%rsp),%rcx
+	movq	%rdx,%rdi
+	xorq	%rbx,%r12
+	shrdq	$6,%r14,%r14
+	xorq	%r8,%rdi
+	addq	%r12,%rcx
+	shrdq	$14,%r13,%r13
+	andq	%rdi,%r15
+	xorq	%rdx,%r14
+	addq	%r13,%rcx
+	xorq	%r8,%r15
+	shrdq	$28,%r14,%r14
+	addq	%rcx,%r10
+	addq	%r15,%rcx
+	movq	%r10,%r13
+	addq	%rcx,%r14
+	shrdq	$23,%r13,%r13
+	movq	%r14,%rcx
+	movq	%r11,%r12
+	shrdq	$5,%r14,%r14
+	xorq	%r10,%r13
+	xorq	%rax,%r12
+	shrdq	$4,%r13,%r13
+	xorq	%rcx,%r14
+	andq	%r10,%r12
+	xorq	%r10,%r13
+	addq	112(%rsp),%rbx
+	movq	%rcx,%r15
+	xorq	%rax,%r12
+	shrdq	$6,%r14,%r14
+	xorq	%rdx,%r15
+	addq	%r12,%rbx
+	shrdq	$14,%r13,%r13
+	andq	%r15,%rdi
+	xorq	%rcx,%r14
+	addq	%r13,%rbx
+	xorq	%rdx,%rdi
+	shrdq	$28,%r14,%r14
+	addq	%rbx,%r9
+	addq	%rdi,%rbx
+	movq	%r9,%r13
+	addq	%rbx,%r14
+	shrdq	$23,%r13,%r13
+	movq	%r14,%rbx
+	movq	%r10,%r12
+	shrdq	$5,%r14,%r14
+	xorq	%r9,%r13
+	xorq	%r11,%r12
+	shrdq	$4,%r13,%r13
+	xorq	%rbx,%r14
+	andq	%r9,%r12
+	xorq	%r9,%r13
+	addq	120(%rsp),%rax
+	movq	%rbx,%rdi
+	xorq	%r11,%r12
+	shrdq	$6,%r14,%r14
+	xorq	%rcx,%rdi
+	addq	%r12,%rax
+	shrdq	$14,%r13,%r13
+	andq	%rdi,%r15
+	xorq	%rbx,%r14
+	addq	%r13,%rax
+	xorq	%rcx,%r15
+	shrdq	$28,%r14,%r14
+	addq	%rax,%r8
+	addq	%r15,%rax
+	movq	%r8,%r13
+	addq	%rax,%r14
+	movq	128+0(%rsp),%rdi
+	movq	%r14,%rax
+
+	addq	0(%rdi),%rax
+	leaq	128(%rsi),%rsi
+	addq	8(%rdi),%rbx
+	addq	16(%rdi),%rcx
+	addq	24(%rdi),%rdx
+	addq	32(%rdi),%r8
+	addq	40(%rdi),%r9
+	addq	48(%rdi),%r10
+	addq	56(%rdi),%r11
+
+	cmpq	128+16(%rsp),%rsi
+
+	movq	%rax,0(%rdi)
+	movq	%rbx,8(%rdi)
+	movq	%rcx,16(%rdi)
+	movq	%rdx,24(%rdi)
+	movq	%r8,32(%rdi)
+	movq	%r9,40(%rdi)
+	movq	%r10,48(%rdi)
+	movq	%r11,56(%rdi)
+	jb	.Lloop_avx
+
+	movq	152(%rsp),%rsi
+.cfi_def_cfa	%rsi,8
+	vzeroupper
+	movq	-48(%rsi),%r15
+.cfi_restore	%r15
+	movq	-40(%rsi),%r14
+.cfi_restore	%r14
+	movq	-32(%rsi),%r13
+.cfi_restore	%r13
+	movq	-24(%rsi),%r12
+.cfi_restore	%r12
+	movq	-16(%rsi),%rbp
+.cfi_restore	%rbp
+	movq	-8(%rsi),%rbx
+.cfi_restore	%rbx
+	leaq	(%rsi),%rsp
+.cfi_def_cfa_register	%rsp
+.Lepilogue_avx:
+	.byte	0xf3,0xc3
+.cfi_endproc	
+.size	sha512_block_data_order_avx,.-sha512_block_data_order_avx
+#endif
+.section	.note.GNU-stack,"",@progbits
--- /dev/null
+++ b/third_party/boringssl/linux-x86_64/crypto/fipsmodule/vpaes-x86_64.S
@@ -1,0 +1,1133 @@
+// This file is generated from a similarly-named Perl script in the BoringSSL
+// source tree. Do not edit by hand.
+
+#if defined(__has_feature)
+#if __has_feature(memory_sanitizer) && !defined(OPENSSL_NO_ASM)
+#define OPENSSL_NO_ASM
+#endif
+#endif
+
+#if defined(__x86_64__) && !defined(OPENSSL_NO_ASM)
+#if defined(BORINGSSL_PREFIX)
+#include <boringssl_prefix_symbols_asm.h>
+#endif
+.text	
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+.type	_vpaes_encrypt_core,@function
+.align	16
+_vpaes_encrypt_core:
+.cfi_startproc	
+	movq	%rdx,%r9
+	movq	$16,%r11
+	movl	240(%rdx),%eax
+	movdqa	%xmm9,%xmm1
+	movdqa	.Lk_ipt(%rip),%xmm2
+	pandn	%xmm0,%xmm1
+	movdqu	(%r9),%xmm5
+	psrld	$4,%xmm1
+	pand	%xmm9,%xmm0
+.byte	102,15,56,0,208
+	movdqa	.Lk_ipt+16(%rip),%xmm0
+.byte	102,15,56,0,193
+	pxor	%xmm5,%xmm2
+	addq	$16,%r9
+	pxor	%xmm2,%xmm0
+	leaq	.Lk_mc_backward(%rip),%r10
+	jmp	.Lenc_entry
+
+.align	16
+.Lenc_loop:
+
+	movdqa	%xmm13,%xmm4
+	movdqa	%xmm12,%xmm0
+.byte	102,15,56,0,226
+.byte	102,15,56,0,195
+	pxor	%xmm5,%xmm4
+	movdqa	%xmm15,%xmm5
+	pxor	%xmm4,%xmm0
+	movdqa	-64(%r11,%r10,1),%xmm1
+.byte	102,15,56,0,234
+	movdqa	(%r11,%r10,1),%xmm4
+	movdqa	%xmm14,%xmm2
+.byte	102,15,56,0,211
+	movdqa	%xmm0,%xmm3
+	pxor	%xmm5,%xmm2
+.byte	102,15,56,0,193
+	addq	$16,%r9
+	pxor	%xmm2,%xmm0
+.byte	102,15,56,0,220
+	addq	$16,%r11
+	pxor	%xmm0,%xmm3
+.byte	102,15,56,0,193
+	andq	$0x30,%r11
+	subq	$1,%rax
+	pxor	%xmm3,%xmm0
+
+.Lenc_entry:
+
+	movdqa	%xmm9,%xmm1
+	movdqa	%xmm11,%xmm5
+	pandn	%xmm0,%xmm1
+	psrld	$4,%xmm1
+	pand	%xmm9,%xmm0
+.byte	102,15,56,0,232
+	movdqa	%xmm10,%xmm3
+	pxor	%xmm1,%xmm0
+.byte	102,15,56,0,217
+	movdqa	%xmm10,%xmm4
+	pxor	%xmm5,%xmm3
+.byte	102,15,56,0,224
+	movdqa	%xmm10,%xmm2
+	pxor	%xmm5,%xmm4
+.byte	102,15,56,0,211
+	movdqa	%xmm10,%xmm3
+	pxor	%xmm0,%xmm2
+.byte	102,15,56,0,220
+	movdqu	(%r9),%xmm5
+	pxor	%xmm1,%xmm3
+	jnz	.Lenc_loop
+
+
+	movdqa	-96(%r10),%xmm4
+	movdqa	-80(%r10),%xmm0
+.byte	102,15,56,0,226
+	pxor	%xmm5,%xmm4
+.byte	102,15,56,0,195
+	movdqa	64(%r11,%r10,1),%xmm1
+	pxor	%xmm4,%xmm0
+.byte	102,15,56,0,193
+	.byte	0xf3,0xc3
+.cfi_endproc	
+.size	_vpaes_encrypt_core,.-_vpaes_encrypt_core
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+.type	_vpaes_encrypt_core_2x,@function
+.align	16
+_vpaes_encrypt_core_2x:
+.cfi_startproc	
+	movq	%rdx,%r9
+	movq	$16,%r11
+	movl	240(%rdx),%eax
+	movdqa	%xmm9,%xmm1
+	movdqa	%xmm9,%xmm7
+	movdqa	.Lk_ipt(%rip),%xmm2
+	movdqa	%xmm2,%xmm8
+	pandn	%xmm0,%xmm1
+	pandn	%xmm6,%xmm7
+	movdqu	(%r9),%xmm5
+
+	psrld	$4,%xmm1
+	psrld	$4,%xmm7
+	pand	%xmm9,%xmm0
+	pand	%xmm9,%xmm6
+.byte	102,15,56,0,208
+.byte	102,68,15,56,0,198
+	movdqa	.Lk_ipt+16(%rip),%xmm0
+	movdqa	%xmm0,%xmm6
+.byte	102,15,56,0,193
+.byte	102,15,56,0,247
+	pxor	%xmm5,%xmm2
+	pxor	%xmm5,%xmm8
+	addq	$16,%r9
+	pxor	%xmm2,%xmm0
+	pxor	%xmm8,%xmm6
+	leaq	.Lk_mc_backward(%rip),%r10
+	jmp	.Lenc2x_entry
+
+.align	16
+.Lenc2x_loop:
+
+	movdqa	.Lk_sb1(%rip),%xmm4
+	movdqa	.Lk_sb1+16(%rip),%xmm0
+	movdqa	%xmm4,%xmm12
+	movdqa	%xmm0,%xmm6
+.byte	102,15,56,0,226
+.byte	102,69,15,56,0,224
+.byte	102,15,56,0,195
+.byte	102,65,15,56,0,243
+	pxor	%xmm5,%xmm4
+	pxor	%xmm5,%xmm12
+	movdqa	.Lk_sb2(%rip),%xmm5
+	movdqa	%xmm5,%xmm13
+	pxor	%xmm4,%xmm0
+	pxor	%xmm12,%xmm6
+	movdqa	-64(%r11,%r10,1),%xmm1
+
+.byte	102,15,56,0,234
+.byte	102,69,15,56,0,232
+	movdqa	(%r11,%r10,1),%xmm4
+
+	movdqa	.Lk_sb2+16(%rip),%xmm2
+	movdqa	%xmm2,%xmm8
+.byte	102,15,56,0,211
+.byte	102,69,15,56,0,195
+	movdqa	%xmm0,%xmm3
+	movdqa	%xmm6,%xmm11
+	pxor	%xmm5,%xmm2
+	pxor	%xmm13,%xmm8
+.byte	102,15,56,0,193
+.byte	102,15,56,0,241
+	addq	$16,%r9
+	pxor	%xmm2,%xmm0
+	pxor	%xmm8,%xmm6
+.byte	102,15,56,0,220
+.byte	102,68,15,56,0,220
+	addq	$16,%r11
+	pxor	%xmm0,%xmm3
+	pxor	%xmm6,%xmm11
+.byte	102,15,56,0,193
+.byte	102,15,56,0,241
+	andq	$0x30,%r11
+	subq	$1,%rax
+	pxor	%xmm3,%xmm0
+	pxor	%xmm11,%xmm6
+
+.Lenc2x_entry:
+
+	movdqa	%xmm9,%xmm1
+	movdqa	%xmm9,%xmm7
+	movdqa	.Lk_inv+16(%rip),%xmm5
+	movdqa	%xmm5,%xmm13
+	pandn	%xmm0,%xmm1
+	pandn	%xmm6,%xmm7
+	psrld	$4,%xmm1
+	psrld	$4,%xmm7
+	pand	%xmm9,%xmm0
+	pand	%xmm9,%xmm6
+.byte	102,15,56,0,232
+.byte	102,68,15,56,0,238
+	movdqa	%xmm10,%xmm3
+	movdqa	%xmm10,%xmm11
+	pxor	%xmm1,%xmm0
+	pxor	%xmm7,%xmm6
+.byte	102,15,56,0,217
+.byte	102,68,15,56,0,223
+	movdqa	%xmm10,%xmm4
+	movdqa	%xmm10,%xmm12
+	pxor	%xmm5,%xmm3
+	pxor	%xmm13,%xmm11
+.byte	102,15,56,0,224
+.byte	102,68,15,56,0,230
+	movdqa	%xmm10,%xmm2
+	movdqa	%xmm10,%xmm8
+	pxor	%xmm5,%xmm4
+	pxor	%xmm13,%xmm12
+.byte	102,15,56,0,211
+.byte	102,69,15,56,0,195
+	movdqa	%xmm10,%xmm3
+	movdqa	%xmm10,%xmm11
+	pxor	%xmm0,%xmm2
+	pxor	%xmm6,%xmm8
+.byte	102,15,56,0,220
+.byte	102,69,15,56,0,220
+	movdqu	(%r9),%xmm5
+
+	pxor	%xmm1,%xmm3
+	pxor	%xmm7,%xmm11
+	jnz	.Lenc2x_loop
+
+
+	movdqa	-96(%r10),%xmm4
+	movdqa	-80(%r10),%xmm0
+	movdqa	%xmm4,%xmm12
+	movdqa	%xmm0,%xmm6
+.byte	102,15,56,0,226
+.byte	102,69,15,56,0,224
+	pxor	%xmm5,%xmm4
+	pxor	%xmm5,%xmm12
+.byte	102,15,56,0,195
+.byte	102,65,15,56,0,243
+	movdqa	64(%r11,%r10,1),%xmm1
+
+	pxor	%xmm4,%xmm0
+	pxor	%xmm12,%xmm6
+.byte	102,15,56,0,193
+.byte	102,15,56,0,241
+	.byte	0xf3,0xc3
+.cfi_endproc	
+.size	_vpaes_encrypt_core_2x,.-_vpaes_encrypt_core_2x
+
+
+
+
+
+
+.type	_vpaes_decrypt_core,@function
+.align	16
+_vpaes_decrypt_core:
+.cfi_startproc	
+	movq	%rdx,%r9
+	movl	240(%rdx),%eax
+	movdqa	%xmm9,%xmm1
+	movdqa	.Lk_dipt(%rip),%xmm2
+	pandn	%xmm0,%xmm1
+	movq	%rax,%r11
+	psrld	$4,%xmm1
+	movdqu	(%r9),%xmm5
+	shlq	$4,%r11
+	pand	%xmm9,%xmm0
+.byte	102,15,56,0,208
+	movdqa	.Lk_dipt+16(%rip),%xmm0
+	xorq	$0x30,%r11
+	leaq	.Lk_dsbd(%rip),%r10
+.byte	102,15,56,0,193
+	andq	$0x30,%r11
+	pxor	%xmm5,%xmm2
+	movdqa	.Lk_mc_forward+48(%rip),%xmm5
+	pxor	%xmm2,%xmm0
+	addq	$16,%r9
+	addq	%r10,%r11
+	jmp	.Ldec_entry
+
+.align	16
+.Ldec_loop:
+
+
+
+	movdqa	-32(%r10),%xmm4
+	movdqa	-16(%r10),%xmm1
+.byte	102,15,56,0,226
+.byte	102,15,56,0,203
+	pxor	%xmm4,%xmm0
+	movdqa	0(%r10),%xmm4
+	pxor	%xmm1,%xmm0
+	movdqa	16(%r10),%xmm1
+
+.byte	102,15,56,0,226
+.byte	102,15,56,0,197
+.byte	102,15,56,0,203
+	pxor	%xmm4,%xmm0
+	movdqa	32(%r10),%xmm4
+	pxor	%xmm1,%xmm0
+	movdqa	48(%r10),%xmm1
+
+.byte	102,15,56,0,226
+.byte	102,15,56,0,197
+.byte	102,15,56,0,203
+	pxor	%xmm4,%xmm0
+	movdqa	64(%r10),%xmm4
+	pxor	%xmm1,%xmm0
+	movdqa	80(%r10),%xmm1
+
+.byte	102,15,56,0,226
+.byte	102,15,56,0,197
+.byte	102,15,56,0,203
+	pxor	%xmm4,%xmm0
+	addq	$16,%r9
+.byte	102,15,58,15,237,12
+	pxor	%xmm1,%xmm0
+	subq	$1,%rax
+
+.Ldec_entry:
+
+	movdqa	%xmm9,%xmm1
+	pandn	%xmm0,%xmm1
+	movdqa	%xmm11,%xmm2
+	psrld	$4,%xmm1
+	pand	%xmm9,%xmm0
+.byte	102,15,56,0,208
+	movdqa	%xmm10,%xmm3
+	pxor	%xmm1,%xmm0
+.byte	102,15,56,0,217
+	movdqa	%xmm10,%xmm4
+	pxor	%xmm2,%xmm3
+.byte	102,15,56,0,224
+	pxor	%xmm2,%xmm4
+	movdqa	%xmm10,%xmm2
+.byte	102,15,56,0,211
+	movdqa	%xmm10,%xmm3
+	pxor	%xmm0,%xmm2
+.byte	102,15,56,0,220
+	movdqu	(%r9),%xmm0
+	pxor	%xmm1,%xmm3
+	jnz	.Ldec_loop
+
+
+	movdqa	96(%r10),%xmm4
+.byte	102,15,56,0,226
+	pxor	%xmm0,%xmm4
+	movdqa	112(%r10),%xmm0
+	movdqa	-352(%r11),%xmm2
+.byte	102,15,56,0,195
+	pxor	%xmm4,%xmm0
+.byte	102,15,56,0,194
+	.byte	0xf3,0xc3
+.cfi_endproc	
+.size	_vpaes_decrypt_core,.-_vpaes_decrypt_core
+
+
+
+
+
+
+.type	_vpaes_schedule_core,@function
+.align	16
+_vpaes_schedule_core:
+.cfi_startproc	
+
+
+
+
+
+	call	_vpaes_preheat
+	movdqa	.Lk_rcon(%rip),%xmm8
+	movdqu	(%rdi),%xmm0
+
+
+	movdqa	%xmm0,%xmm3
+	leaq	.Lk_ipt(%rip),%r11
+	call	_vpaes_schedule_transform
+	movdqa	%xmm0,%xmm7
+
+	leaq	.Lk_sr(%rip),%r10
+	testq	%rcx,%rcx
+	jnz	.Lschedule_am_decrypting
+
+
+	movdqu	%xmm0,(%rdx)
+	jmp	.Lschedule_go
+
+.Lschedule_am_decrypting:
+
+	movdqa	(%r8,%r10,1),%xmm1
+.byte	102,15,56,0,217
+	movdqu	%xmm3,(%rdx)
+	xorq	$0x30,%r8
+
+.Lschedule_go:
+	cmpl	$192,%esi
+	ja	.Lschedule_256
+	je	.Lschedule_192
+
+
+
+
+
+
+
+
+
+
+.Lschedule_128:
+	movl	$10,%esi
+
+.Loop_schedule_128:
+	call	_vpaes_schedule_round
+	decq	%rsi
+	jz	.Lschedule_mangle_last
+	call	_vpaes_schedule_mangle
+	jmp	.Loop_schedule_128
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+.align	16
+.Lschedule_192:
+	movdqu	8(%rdi),%xmm0
+	call	_vpaes_schedule_transform
+	movdqa	%xmm0,%xmm6
+	pxor	%xmm4,%xmm4
+	movhlps	%xmm4,%xmm6
+	movl	$4,%esi
+
+.Loop_schedule_192:
+	call	_vpaes_schedule_round
+.byte	102,15,58,15,198,8
+	call	_vpaes_schedule_mangle
+	call	_vpaes_schedule_192_smear
+	call	_vpaes_schedule_mangle
+	call	_vpaes_schedule_round
+	decq	%rsi
+	jz	.Lschedule_mangle_last
+	call	_vpaes_schedule_mangle
+	call	_vpaes_schedule_192_smear
+	jmp	.Loop_schedule_192
+
+
+
+
+
+
+
+
+
+
+
+.align	16
+.Lschedule_256:
+	movdqu	16(%rdi),%xmm0
+	call	_vpaes_schedule_transform
+	movl	$7,%esi
+
+.Loop_schedule_256:
+	call	_vpaes_schedule_mangle
+	movdqa	%xmm0,%xmm6
+
+
+	call	_vpaes_schedule_round
+	decq	%rsi
+	jz	.Lschedule_mangle_last
+	call	_vpaes_schedule_mangle
+
+
+	pshufd	$0xFF,%xmm0,%xmm0
+	movdqa	%xmm7,%xmm5
+	movdqa	%xmm6,%xmm7
+	call	_vpaes_schedule_low_round
+	movdqa	%xmm5,%xmm7
+
+	jmp	.Loop_schedule_256
+
+
+
+
+
+
+
+
+
+
+
+
+.align	16
+.Lschedule_mangle_last:
+
+	leaq	.Lk_deskew(%rip),%r11
+	testq	%rcx,%rcx
+	jnz	.Lschedule_mangle_last_dec
+
+
+	movdqa	(%r8,%r10,1),%xmm1
+.byte	102,15,56,0,193
+	leaq	.Lk_opt(%rip),%r11
+	addq	$32,%rdx
+
+.Lschedule_mangle_last_dec:
+	addq	$-16,%rdx
+	pxor	.Lk_s63(%rip),%xmm0
+	call	_vpaes_schedule_transform
+	movdqu	%xmm0,(%rdx)
+
+
+	pxor	%xmm0,%xmm0
+	pxor	%xmm1,%xmm1
+	pxor	%xmm2,%xmm2
+	pxor	%xmm3,%xmm3
+	pxor	%xmm4,%xmm4
+	pxor	%xmm5,%xmm5
+	pxor	%xmm6,%xmm6
+	pxor	%xmm7,%xmm7
+	.byte	0xf3,0xc3
+.cfi_endproc	
+.size	_vpaes_schedule_core,.-_vpaes_schedule_core
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+.type	_vpaes_schedule_192_smear,@function
+.align	16
+_vpaes_schedule_192_smear:
+.cfi_startproc	
+	pshufd	$0x80,%xmm6,%xmm1
+	pshufd	$0xFE,%xmm7,%xmm0
+	pxor	%xmm1,%xmm6
+	pxor	%xmm1,%xmm1
+	pxor	%xmm0,%xmm6
+	movdqa	%xmm6,%xmm0
+	movhlps	%xmm1,%xmm6
+	.byte	0xf3,0xc3
+.cfi_endproc	
+.size	_vpaes_schedule_192_smear,.-_vpaes_schedule_192_smear
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+.type	_vpaes_schedule_round,@function
+.align	16
+_vpaes_schedule_round:
+.cfi_startproc	
+
+	pxor	%xmm1,%xmm1
+.byte	102,65,15,58,15,200,15
+.byte	102,69,15,58,15,192,15
+	pxor	%xmm1,%xmm7
+
+
+	pshufd	$0xFF,%xmm0,%xmm0
+.byte	102,15,58,15,192,1
+
+
+
+
+_vpaes_schedule_low_round:
+
+	movdqa	%xmm7,%xmm1
+	pslldq	$4,%xmm7
+	pxor	%xmm1,%xmm7
+	movdqa	%xmm7,%xmm1
+	pslldq	$8,%xmm7
+	pxor	%xmm1,%xmm7
+	pxor	.Lk_s63(%rip),%xmm7
+
+
+	movdqa	%xmm9,%xmm1
+	pandn	%xmm0,%xmm1
+	psrld	$4,%xmm1
+	pand	%xmm9,%xmm0
+	movdqa	%xmm11,%xmm2
+.byte	102,15,56,0,208
+	pxor	%xmm1,%xmm0
+	movdqa	%xmm10,%xmm3
+.byte	102,15,56,0,217
+	pxor	%xmm2,%xmm3
+	movdqa	%xmm10,%xmm4
+.byte	102,15,56,0,224
+	pxor	%xmm2,%xmm4
+	movdqa	%xmm10,%xmm2
+.byte	102,15,56,0,211
+	pxor	%xmm0,%xmm2
+	movdqa	%xmm10,%xmm3
+.byte	102,15,56,0,220
+	pxor	%xmm1,%xmm3
+	movdqa	%xmm13,%xmm4
+.byte	102,15,56,0,226
+	movdqa	%xmm12,%xmm0
+.byte	102,15,56,0,195
+	pxor	%xmm4,%xmm0
+
+
+	pxor	%xmm7,%xmm0
+	movdqa	%xmm0,%xmm7
+	.byte	0xf3,0xc3
+.cfi_endproc	
+.size	_vpaes_schedule_round,.-_vpaes_schedule_round
+
+
+
+
+
+
+
+
+
+
+.type	_vpaes_schedule_transform,@function
+.align	16
+_vpaes_schedule_transform:
+.cfi_startproc	
+	movdqa	%xmm9,%xmm1
+	pandn	%xmm0,%xmm1
+	psrld	$4,%xmm1
+	pand	%xmm9,%xmm0
+	movdqa	(%r11),%xmm2
+.byte	102,15,56,0,208
+	movdqa	16(%r11),%xmm0
+.byte	102,15,56,0,193
+	pxor	%xmm2,%xmm0
+	.byte	0xf3,0xc3
+.cfi_endproc	
+.size	_vpaes_schedule_transform,.-_vpaes_schedule_transform
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+.type	_vpaes_schedule_mangle,@function
+.align	16
+_vpaes_schedule_mangle:
+.cfi_startproc	
+	movdqa	%xmm0,%xmm4
+	movdqa	.Lk_mc_forward(%rip),%xmm5
+	testq	%rcx,%rcx
+	jnz	.Lschedule_mangle_dec
+
+
+	addq	$16,%rdx
+	pxor	.Lk_s63(%rip),%xmm4
+.byte	102,15,56,0,229
+	movdqa	%xmm4,%xmm3
+.byte	102,15,56,0,229
+	pxor	%xmm4,%xmm3
+.byte	102,15,56,0,229
+	pxor	%xmm4,%xmm3
+
+	jmp	.Lschedule_mangle_both
+.align	16
+.Lschedule_mangle_dec:
+
+	leaq	.Lk_dksd(%rip),%r11
+	movdqa	%xmm9,%xmm1
+	pandn	%xmm4,%xmm1
+	psrld	$4,%xmm1
+	pand	%xmm9,%xmm4
+
+	movdqa	0(%r11),%xmm2
+.byte	102,15,56,0,212
+	movdqa	16(%r11),%xmm3
+.byte	102,15,56,0,217
+	pxor	%xmm2,%xmm3
+.byte	102,15,56,0,221
+
+	movdqa	32(%r11),%xmm2
+.byte	102,15,56,0,212
+	pxor	%xmm3,%xmm2
+	movdqa	48(%r11),%xmm3
+.byte	102,15,56,0,217
+	pxor	%xmm2,%xmm3
+.byte	102,15,56,0,221
+
+	movdqa	64(%r11),%xmm2
+.byte	102,15,56,0,212
+	pxor	%xmm3,%xmm2
+	movdqa	80(%r11),%xmm3
+.byte	102,15,56,0,217
+	pxor	%xmm2,%xmm3
+.byte	102,15,56,0,221
+
+	movdqa	96(%r11),%xmm2
+.byte	102,15,56,0,212
+	pxor	%xmm3,%xmm2
+	movdqa	112(%r11),%xmm3
+.byte	102,15,56,0,217
+	pxor	%xmm2,%xmm3
+
+	addq	$-16,%rdx
+
+.Lschedule_mangle_both:
+	movdqa	(%r8,%r10,1),%xmm1
+.byte	102,15,56,0,217
+	addq	$-16,%r8
+	andq	$0x30,%r8
+	movdqu	%xmm3,(%rdx)
+	.byte	0xf3,0xc3
+.cfi_endproc	
+.size	_vpaes_schedule_mangle,.-_vpaes_schedule_mangle
+
+
+
+
+.globl	vpaes_set_encrypt_key
+.hidden vpaes_set_encrypt_key
+.type	vpaes_set_encrypt_key,@function
+.align	16
+vpaes_set_encrypt_key:
+.cfi_startproc	
+#ifdef BORINGSSL_DISPATCH_TEST
+.extern	BORINGSSL_function_hit
+.hidden BORINGSSL_function_hit
+	movb	$1,BORINGSSL_function_hit+5(%rip)
+#endif
+
+	movl	%esi,%eax
+	shrl	$5,%eax
+	addl	$5,%eax
+	movl	%eax,240(%rdx)
+
+	movl	$0,%ecx
+	movl	$0x30,%r8d
+	call	_vpaes_schedule_core
+	xorl	%eax,%eax
+	.byte	0xf3,0xc3
+.cfi_endproc	
+.size	vpaes_set_encrypt_key,.-vpaes_set_encrypt_key
+
+.globl	vpaes_set_decrypt_key
+.hidden vpaes_set_decrypt_key
+.type	vpaes_set_decrypt_key,@function
+.align	16
+vpaes_set_decrypt_key:
+.cfi_startproc	
+	movl	%esi,%eax
+	shrl	$5,%eax
+	addl	$5,%eax
+	movl	%eax,240(%rdx)
+	shll	$4,%eax
+	leaq	16(%rdx,%rax,1),%rdx
+
+	movl	$1,%ecx
+	movl	%esi,%r8d
+	shrl	$1,%r8d
+	andl	$32,%r8d
+	xorl	$32,%r8d
+	call	_vpaes_schedule_core
+	xorl	%eax,%eax
+	.byte	0xf3,0xc3
+.cfi_endproc	
+.size	vpaes_set_decrypt_key,.-vpaes_set_decrypt_key
+
+.globl	vpaes_encrypt
+.hidden vpaes_encrypt
+.type	vpaes_encrypt,@function
+.align	16
+vpaes_encrypt:
+.cfi_startproc	
+#ifdef BORINGSSL_DISPATCH_TEST
+.extern	BORINGSSL_function_hit
+.hidden BORINGSSL_function_hit
+	movb	$1,BORINGSSL_function_hit+4(%rip)
+#endif
+	movdqu	(%rdi),%xmm0
+	call	_vpaes_preheat
+	call	_vpaes_encrypt_core
+	movdqu	%xmm0,(%rsi)
+	.byte	0xf3,0xc3
+.cfi_endproc	
+.size	vpaes_encrypt,.-vpaes_encrypt
+
+.globl	vpaes_decrypt
+.hidden vpaes_decrypt
+.type	vpaes_decrypt,@function
+.align	16
+vpaes_decrypt:
+.cfi_startproc	
+	movdqu	(%rdi),%xmm0
+	call	_vpaes_preheat
+	call	_vpaes_decrypt_core
+	movdqu	%xmm0,(%rsi)
+	.byte	0xf3,0xc3
+.cfi_endproc	
+.size	vpaes_decrypt,.-vpaes_decrypt
+.globl	vpaes_cbc_encrypt
+.hidden vpaes_cbc_encrypt
+.type	vpaes_cbc_encrypt,@function
+.align	16
+vpaes_cbc_encrypt:
+.cfi_startproc	
+	xchgq	%rcx,%rdx
+	subq	$16,%rcx
+	jc	.Lcbc_abort
+	movdqu	(%r8),%xmm6
+	subq	%rdi,%rsi
+	call	_vpaes_preheat
+	cmpl	$0,%r9d
+	je	.Lcbc_dec_loop
+	jmp	.Lcbc_enc_loop
+.align	16
+.Lcbc_enc_loop:
+	movdqu	(%rdi),%xmm0
+	pxor	%xmm6,%xmm0
+	call	_vpaes_encrypt_core
+	movdqa	%xmm0,%xmm6
+	movdqu	%xmm0,(%rsi,%rdi,1)
+	leaq	16(%rdi),%rdi
+	subq	$16,%rcx
+	jnc	.Lcbc_enc_loop
+	jmp	.Lcbc_done
+.align	16
+.Lcbc_dec_loop:
+	movdqu	(%rdi),%xmm0
+	movdqa	%xmm0,%xmm7
+	call	_vpaes_decrypt_core
+	pxor	%xmm6,%xmm0
+	movdqa	%xmm7,%xmm6
+	movdqu	%xmm0,(%rsi,%rdi,1)
+	leaq	16(%rdi),%rdi
+	subq	$16,%rcx
+	jnc	.Lcbc_dec_loop
+.Lcbc_done:
+	movdqu	%xmm6,(%r8)
+.Lcbc_abort:
+	.byte	0xf3,0xc3
+.cfi_endproc	
+.size	vpaes_cbc_encrypt,.-vpaes_cbc_encrypt
+.globl	vpaes_ctr32_encrypt_blocks
+.hidden vpaes_ctr32_encrypt_blocks
+.type	vpaes_ctr32_encrypt_blocks,@function
+.align	16
+vpaes_ctr32_encrypt_blocks:
+.cfi_startproc	
+
+	xchgq	%rcx,%rdx
+	testq	%rcx,%rcx
+	jz	.Lctr32_abort
+	movdqu	(%r8),%xmm0
+	movdqa	.Lctr_add_one(%rip),%xmm8
+	subq	%rdi,%rsi
+	call	_vpaes_preheat
+	movdqa	%xmm0,%xmm6
+	pshufb	.Lrev_ctr(%rip),%xmm6
+
+	testq	$1,%rcx
+	jz	.Lctr32_prep_loop
+
+
+
+	movdqu	(%rdi),%xmm7
+	call	_vpaes_encrypt_core
+	pxor	%xmm7,%xmm0
+	paddd	%xmm8,%xmm6
+	movdqu	%xmm0,(%rsi,%rdi,1)
+	subq	$1,%rcx
+	leaq	16(%rdi),%rdi
+	jz	.Lctr32_done
+
+.Lctr32_prep_loop:
+
+
+	movdqa	%xmm6,%xmm14
+	movdqa	%xmm6,%xmm15
+	paddd	%xmm8,%xmm15
+
+.Lctr32_loop:
+	movdqa	.Lrev_ctr(%rip),%xmm1
+	movdqa	%xmm14,%xmm0
+	movdqa	%xmm15,%xmm6
+.byte	102,15,56,0,193
+.byte	102,15,56,0,241
+	call	_vpaes_encrypt_core_2x
+	movdqu	(%rdi),%xmm1
+	movdqu	16(%rdi),%xmm2
+	movdqa	.Lctr_add_two(%rip),%xmm3
+	pxor	%xmm1,%xmm0
+	pxor	%xmm2,%xmm6
+	paddd	%xmm3,%xmm14
+	paddd	%xmm3,%xmm15
+	movdqu	%xmm0,(%rsi,%rdi,1)
+	movdqu	%xmm6,16(%rsi,%rdi,1)
+	subq	$2,%rcx
+	leaq	32(%rdi),%rdi
+	jnz	.Lctr32_loop
+
+.Lctr32_done:
+.Lctr32_abort:
+	.byte	0xf3,0xc3
+.cfi_endproc	
+.size	vpaes_ctr32_encrypt_blocks,.-vpaes_ctr32_encrypt_blocks
+
+
+
+
+
+
+.type	_vpaes_preheat,@function
+.align	16
+_vpaes_preheat:
+.cfi_startproc	
+	leaq	.Lk_s0F(%rip),%r10
+	movdqa	-32(%r10),%xmm10
+	movdqa	-16(%r10),%xmm11
+	movdqa	0(%r10),%xmm9
+	movdqa	48(%r10),%xmm13
+	movdqa	64(%r10),%xmm12
+	movdqa	80(%r10),%xmm15
+	movdqa	96(%r10),%xmm14
+	.byte	0xf3,0xc3
+.cfi_endproc	
+.size	_vpaes_preheat,.-_vpaes_preheat
+
+
+
+
+
+.type	_vpaes_consts,@object
+.align	64
+_vpaes_consts:
+.Lk_inv:
+.quad	0x0E05060F0D080180, 0x040703090A0B0C02
+.quad	0x01040A060F0B0780, 0x030D0E0C02050809
+
+.Lk_s0F:
+.quad	0x0F0F0F0F0F0F0F0F, 0x0F0F0F0F0F0F0F0F
+
+.Lk_ipt:
+.quad	0xC2B2E8985A2A7000, 0xCABAE09052227808
+.quad	0x4C01307D317C4D00, 0xCD80B1FCB0FDCC81
+
+.Lk_sb1:
+.quad	0xB19BE18FCB503E00, 0xA5DF7A6E142AF544
+.quad	0x3618D415FAE22300, 0x3BF7CCC10D2ED9EF
+.Lk_sb2:
+.quad	0xE27A93C60B712400, 0x5EB7E955BC982FCD
+.quad	0x69EB88400AE12900, 0xC2A163C8AB82234A
+.Lk_sbo:
+.quad	0xD0D26D176FBDC700, 0x15AABF7AC502A878
+.quad	0xCFE474A55FBB6A00, 0x8E1E90D1412B35FA
+
+.Lk_mc_forward:
+.quad	0x0407060500030201, 0x0C0F0E0D080B0A09
+.quad	0x080B0A0904070605, 0x000302010C0F0E0D
+.quad	0x0C0F0E0D080B0A09, 0x0407060500030201
+.quad	0x000302010C0F0E0D, 0x080B0A0904070605
+
+.Lk_mc_backward:
+.quad	0x0605040702010003, 0x0E0D0C0F0A09080B
+.quad	0x020100030E0D0C0F, 0x0A09080B06050407
+.quad	0x0E0D0C0F0A09080B, 0x0605040702010003
+.quad	0x0A09080B06050407, 0x020100030E0D0C0F
+
+.Lk_sr:
+.quad	0x0706050403020100, 0x0F0E0D0C0B0A0908
+.quad	0x030E09040F0A0500, 0x0B06010C07020D08
+.quad	0x0F060D040B020900, 0x070E050C030A0108
+.quad	0x0B0E0104070A0D00, 0x0306090C0F020508
+
+.Lk_rcon:
+.quad	0x1F8391B9AF9DEEB6, 0x702A98084D7C7D81
+
+.Lk_s63:
+.quad	0x5B5B5B5B5B5B5B5B, 0x5B5B5B5B5B5B5B5B
+
+.Lk_opt:
+.quad	0xFF9F4929D6B66000, 0xF7974121DEBE6808
+.quad	0x01EDBD5150BCEC00, 0xE10D5DB1B05C0CE0
+
+.Lk_deskew:
+.quad	0x07E4A34047A4E300, 0x1DFEB95A5DBEF91A
+.quad	0x5F36B5DC83EA6900, 0x2841C2ABF49D1E77
+
+
+
+
+
+.Lk_dksd:
+.quad	0xFEB91A5DA3E44700, 0x0740E3A45A1DBEF9
+.quad	0x41C277F4B5368300, 0x5FDC69EAAB289D1E
+.Lk_dksb:
+.quad	0x9A4FCA1F8550D500, 0x03D653861CC94C99
+.quad	0x115BEDA7B6FC4A00, 0xD993256F7E3482C8
+.Lk_dkse:
+.quad	0xD5031CCA1FC9D600, 0x53859A4C994F5086
+.quad	0xA23196054FDC7BE8, 0xCD5EF96A20B31487
+.Lk_dks9:
+.quad	0xB6116FC87ED9A700, 0x4AED933482255BFC
+.quad	0x4576516227143300, 0x8BB89FACE9DAFDCE
+
+
+
+
+
+.Lk_dipt:
+.quad	0x0F505B040B545F00, 0x154A411E114E451A
+.quad	0x86E383E660056500, 0x12771772F491F194
+
+.Lk_dsb9:
+.quad	0x851C03539A86D600, 0xCAD51F504F994CC9
+.quad	0xC03B1789ECD74900, 0x725E2C9EB2FBA565
+.Lk_dsbd:
+.quad	0x7D57CCDFE6B1A200, 0xF56E9B13882A4439
+.quad	0x3CE2FAF724C6CB00, 0x2931180D15DEEFD3
+.Lk_dsbb:
+.quad	0xD022649296B44200, 0x602646F6B0F2D404
+.quad	0xC19498A6CD596700, 0xF3FF0C3E3255AA6B
+.Lk_dsbe:
+.quad	0x46F2929626D4D000, 0x2242600464B4F6B0
+.quad	0x0C55A6CDFFAAC100, 0x9467F36B98593E32
+.Lk_dsbo:
+.quad	0x1387EA537EF94000, 0xC7AA6DB9D4943E2D
+.quad	0x12D7560F93441D00, 0xCA4B8159D8C58E9C
+
+
+.Lrev_ctr:
+.quad	0x0706050403020100, 0x0c0d0e0f0b0a0908
+
+
+.Lctr_add_one:
+.quad	0x0000000000000000, 0x0000000100000000
+.Lctr_add_two:
+.quad	0x0000000000000000, 0x0000000200000000
+
+.byte	86,101,99,116,111,114,32,80,101,114,109,117,116,97,116,105,111,110,32,65,69,83,32,102,111,114,32,120,56,54,95,54,52,47,83,83,83,69,51,44,32,77,105,107,101,32,72,97,109,98,117,114,103,32,40,83,116,97,110,102,111,114,100,32,85,110,105,118,101,114,115,105,116,121,41,0
+.align	64
+.size	_vpaes_consts,.-_vpaes_consts
+#endif
+.section	.note.GNU-stack,"",@progbits
--- /dev/null
+++ b/third_party/boringssl/linux-x86_64/crypto/fipsmodule/x86_64-mont.S
@@ -1,0 +1,1260 @@
+// This file is generated from a similarly-named Perl script in the BoringSSL
+// source tree. Do not edit by hand.
+
+#if defined(__has_feature)
+#if __has_feature(memory_sanitizer) && !defined(OPENSSL_NO_ASM)
+#define OPENSSL_NO_ASM
+#endif
+#endif
+
+#if defined(__x86_64__) && !defined(OPENSSL_NO_ASM)
+#if defined(BORINGSSL_PREFIX)
+#include <boringssl_prefix_symbols_asm.h>
+#endif
+.text	
+
+.extern	OPENSSL_ia32cap_P
+.hidden OPENSSL_ia32cap_P
+
+.globl	bn_mul_mont
+.hidden bn_mul_mont
+.type	bn_mul_mont,@function
+.align	16
+bn_mul_mont:
+.cfi_startproc	
+	movl	%r9d,%r9d
+	movq	%rsp,%rax
+.cfi_def_cfa_register	%rax
+	testl	$3,%r9d
+	jnz	.Lmul_enter
+	cmpl	$8,%r9d
+	jb	.Lmul_enter
+	leaq	OPENSSL_ia32cap_P(%rip),%r11
+	movl	8(%r11),%r11d
+	cmpq	%rsi,%rdx
+	jne	.Lmul4x_enter
+	testl	$7,%r9d
+	jz	.Lsqr8x_enter
+	jmp	.Lmul4x_enter
+
+.align	16
+.Lmul_enter:
+	pushq	%rbx
+.cfi_offset	%rbx,-16
+	pushq	%rbp
+.cfi_offset	%rbp,-24
+	pushq	%r12
+.cfi_offset	%r12,-32
+	pushq	%r13
+.cfi_offset	%r13,-40
+	pushq	%r14
+.cfi_offset	%r14,-48
+	pushq	%r15
+.cfi_offset	%r15,-56
+
+	negq	%r9
+	movq	%rsp,%r11
+	leaq	-16(%rsp,%r9,8),%r10
+	negq	%r9
+	andq	$-1024,%r10
+
+
+
+
+
+
+
+
+
+	subq	%r10,%r11
+	andq	$-4096,%r11
+	leaq	(%r10,%r11,1),%rsp
+	movq	(%rsp),%r11
+	cmpq	%r10,%rsp
+	ja	.Lmul_page_walk
+	jmp	.Lmul_page_walk_done
+
+.align	16
+.Lmul_page_walk:
+	leaq	-4096(%rsp),%rsp
+	movq	(%rsp),%r11
+	cmpq	%r10,%rsp
+	ja	.Lmul_page_walk
+.Lmul_page_walk_done:
+
+	movq	%rax,8(%rsp,%r9,8)
+.cfi_escape	0x0f,0x0a,0x77,0x08,0x79,0x00,0x38,0x1e,0x22,0x06,0x23,0x08
+.Lmul_body:
+	movq	%rdx,%r12
+	movq	(%r8),%r8
+	movq	(%r12),%rbx
+	movq	(%rsi),%rax
+
+	xorq	%r14,%r14
+	xorq	%r15,%r15
+
+	movq	%r8,%rbp
+	mulq	%rbx
+	movq	%rax,%r10
+	movq	(%rcx),%rax
+
+	imulq	%r10,%rbp
+	movq	%rdx,%r11
+
+	mulq	%rbp
+	addq	%rax,%r10
+	movq	8(%rsi),%rax
+	adcq	$0,%rdx
+	movq	%rdx,%r13
+
+	leaq	1(%r15),%r15
+	jmp	.L1st_enter
+
+.align	16
+.L1st:
+	addq	%rax,%r13
+	movq	(%rsi,%r15,8),%rax
+	adcq	$0,%rdx
+	addq	%r11,%r13
+	movq	%r10,%r11
+	adcq	$0,%rdx
+	movq	%r13,-16(%rsp,%r15,8)
+	movq	%rdx,%r13
+
+.L1st_enter:
+	mulq	%rbx
+	addq	%rax,%r11
+	movq	(%rcx,%r15,8),%rax
+	adcq	$0,%rdx
+	leaq	1(%r15),%r15
+	movq	%rdx,%r10
+
+	mulq	%rbp
+	cmpq	%r9,%r15
+	jne	.L1st
+
+	addq	%rax,%r13
+	movq	(%rsi),%rax
+	adcq	$0,%rdx
+	addq	%r11,%r13
+	adcq	$0,%rdx
+	movq	%r13,-16(%rsp,%r15,8)
+	movq	%rdx,%r13
+	movq	%r10,%r11
+
+	xorq	%rdx,%rdx
+	addq	%r11,%r13
+	adcq	$0,%rdx
+	movq	%r13,-8(%rsp,%r9,8)
+	movq	%rdx,(%rsp,%r9,8)
+
+	leaq	1(%r14),%r14
+	jmp	.Louter
+.align	16
+.Louter:
+	movq	(%r12,%r14,8),%rbx
+	xorq	%r15,%r15
+	movq	%r8,%rbp
+	movq	(%rsp),%r10
+	mulq	%rbx
+	addq	%rax,%r10
+	movq	(%rcx),%rax
+	adcq	$0,%rdx
+
+	imulq	%r10,%rbp
+	movq	%rdx,%r11
+
+	mulq	%rbp
+	addq	%rax,%r10
+	movq	8(%rsi),%rax
+	adcq	$0,%rdx
+	movq	8(%rsp),%r10
+	movq	%rdx,%r13
+
+	leaq	1(%r15),%r15
+	jmp	.Linner_enter
+
+.align	16
+.Linner:
+	addq	%rax,%r13
+	movq	(%rsi,%r15,8),%rax
+	adcq	$0,%rdx
+	addq	%r10,%r13
+	movq	(%rsp,%r15,8),%r10
+	adcq	$0,%rdx
+	movq	%r13,-16(%rsp,%r15,8)
+	movq	%rdx,%r13
+
+.Linner_enter:
+	mulq	%rbx
+	addq	%rax,%r11
+	movq	(%rcx,%r15,8),%rax
+	adcq	$0,%rdx
+	addq	%r11,%r10
+	movq	%rdx,%r11
+	adcq	$0,%r11
+	leaq	1(%r15),%r15
+
+	mulq	%rbp
+	cmpq	%r9,%r15
+	jne	.Linner
+
+	addq	%rax,%r13
+	movq	(%rsi),%rax
+	adcq	$0,%rdx
+	addq	%r10,%r13
+	movq	(%rsp,%r15,8),%r10
+	adcq	$0,%rdx
+	movq	%r13,-16(%rsp,%r15,8)
+	movq	%rdx,%r13
+
+	xorq	%rdx,%rdx
+	addq	%r11,%r13
+	adcq	$0,%rdx
+	addq	%r10,%r13
+	adcq	$0,%rdx
+	movq	%r13,-8(%rsp,%r9,8)
+	movq	%rdx,(%rsp,%r9,8)
+
+	leaq	1(%r14),%r14
+	cmpq	%r9,%r14
+	jb	.Louter
+
+	xorq	%r14,%r14
+	movq	(%rsp),%rax
+	movq	%r9,%r15
+
+.align	16
+.Lsub:	sbbq	(%rcx,%r14,8),%rax
+	movq	%rax,(%rdi,%r14,8)
+	movq	8(%rsp,%r14,8),%rax
+	leaq	1(%r14),%r14
+	decq	%r15
+	jnz	.Lsub
+
+	sbbq	$0,%rax
+	movq	$-1,%rbx
+	xorq	%rax,%rbx
+	xorq	%r14,%r14
+	movq	%r9,%r15
+
+.Lcopy:
+	movq	(%rdi,%r14,8),%rcx
+	movq	(%rsp,%r14,8),%rdx
+	andq	%rbx,%rcx
+	andq	%rax,%rdx
+	movq	%r9,(%rsp,%r14,8)
+	orq	%rcx,%rdx
+	movq	%rdx,(%rdi,%r14,8)
+	leaq	1(%r14),%r14
+	subq	$1,%r15
+	jnz	.Lcopy
+
+	movq	8(%rsp,%r9,8),%rsi
+.cfi_def_cfa	%rsi,8
+	movq	$1,%rax
+	movq	-48(%rsi),%r15
+.cfi_restore	%r15
+	movq	-40(%rsi),%r14
+.cfi_restore	%r14
+	movq	-32(%rsi),%r13
+.cfi_restore	%r13
+	movq	-24(%rsi),%r12
+.cfi_restore	%r12
+	movq	-16(%rsi),%rbp
+.cfi_restore	%rbp
+	movq	-8(%rsi),%rbx
+.cfi_restore	%rbx
+	leaq	(%rsi),%rsp
+.cfi_def_cfa_register	%rsp
+.Lmul_epilogue:
+	.byte	0xf3,0xc3
+.cfi_endproc	
+.size	bn_mul_mont,.-bn_mul_mont
+.type	bn_mul4x_mont,@function
+.align	16
+bn_mul4x_mont:
+.cfi_startproc	
+	movl	%r9d,%r9d
+	movq	%rsp,%rax
+.cfi_def_cfa_register	%rax
+.Lmul4x_enter:
+	andl	$0x80100,%r11d
+	cmpl	$0x80100,%r11d
+	je	.Lmulx4x_enter
+	pushq	%rbx
+.cfi_offset	%rbx,-16
+	pushq	%rbp
+.cfi_offset	%rbp,-24
+	pushq	%r12
+.cfi_offset	%r12,-32
+	pushq	%r13
+.cfi_offset	%r13,-40
+	pushq	%r14
+.cfi_offset	%r14,-48
+	pushq	%r15
+.cfi_offset	%r15,-56
+
+	negq	%r9
+	movq	%rsp,%r11
+	leaq	-32(%rsp,%r9,8),%r10
+	negq	%r9
+	andq	$-1024,%r10
+
+	subq	%r10,%r11
+	andq	$-4096,%r11
+	leaq	(%r10,%r11,1),%rsp
+	movq	(%rsp),%r11
+	cmpq	%r10,%rsp
+	ja	.Lmul4x_page_walk
+	jmp	.Lmul4x_page_walk_done
+
+.Lmul4x_page_walk:
+	leaq	-4096(%rsp),%rsp
+	movq	(%rsp),%r11
+	cmpq	%r10,%rsp
+	ja	.Lmul4x_page_walk
+.Lmul4x_page_walk_done:
+
+	movq	%rax,8(%rsp,%r9,8)
+.cfi_escape	0x0f,0x0a,0x77,0x08,0x79,0x00,0x38,0x1e,0x22,0x06,0x23,0x08
+.Lmul4x_body:
+	movq	%rdi,16(%rsp,%r9,8)
+	movq	%rdx,%r12
+	movq	(%r8),%r8
+	movq	(%r12),%rbx
+	movq	(%rsi),%rax
+
+	xorq	%r14,%r14
+	xorq	%r15,%r15
+
+	movq	%r8,%rbp
+	mulq	%rbx
+	movq	%rax,%r10
+	movq	(%rcx),%rax
+
+	imulq	%r10,%rbp
+	movq	%rdx,%r11
+
+	mulq	%rbp
+	addq	%rax,%r10
+	movq	8(%rsi),%rax
+	adcq	$0,%rdx
+	movq	%rdx,%rdi
+
+	mulq	%rbx
+	addq	%rax,%r11
+	movq	8(%rcx),%rax
+	adcq	$0,%rdx
+	movq	%rdx,%r10
+
+	mulq	%rbp
+	addq	%rax,%rdi
+	movq	16(%rsi),%rax
+	adcq	$0,%rdx
+	addq	%r11,%rdi
+	leaq	4(%r15),%r15
+	adcq	$0,%rdx
+	movq	%rdi,(%rsp)
+	movq	%rdx,%r13
+	jmp	.L1st4x
+.align	16
+.L1st4x:
+	mulq	%rbx
+	addq	%rax,%r10
+	movq	-16(%rcx,%r15,8),%rax
+	adcq	$0,%rdx
+	movq	%rdx,%r11
+
+	mulq	%rbp
+	addq	%rax,%r13
+	movq	-8(%rsi,%r15,8),%rax
+	adcq	$0,%rdx
+	addq	%r10,%r13
+	adcq	$0,%rdx
+	movq	%r13,-24(%rsp,%r15,8)
+	movq	%rdx,%rdi
+
+	mulq	%rbx
+	addq	%rax,%r11
+	movq	-8(%rcx,%r15,8),%rax
+	adcq	$0,%rdx
+	movq	%rdx,%r10
+
+	mulq	%rbp
+	addq	%rax,%rdi
+	movq	(%rsi,%r15,8),%rax
+	adcq	$0,%rdx
+	addq	%r11,%rdi
+	adcq	$0,%rdx
+	movq	%rdi,-16(%rsp,%r15,8)
+	movq	%rdx,%r13
+
+	mulq	%rbx
+	addq	%rax,%r10
+	movq	(%rcx,%r15,8),%rax
+	adcq	$0,%rdx
+	movq	%rdx,%r11
+
+	mulq	%rbp
+	addq	%rax,%r13
+	movq	8(%rsi,%r15,8),%rax
+	adcq	$0,%rdx
+	addq	%r10,%r13
+	adcq	$0,%rdx
+	movq	%r13,-8(%rsp,%r15,8)
+	movq	%rdx,%rdi
+
+	mulq	%rbx
+	addq	%rax,%r11
+	movq	8(%rcx,%r15,8),%rax
+	adcq	$0,%rdx
+	leaq	4(%r15),%r15
+	movq	%rdx,%r10
+
+	mulq	%rbp
+	addq	%rax,%rdi
+	movq	-16(%rsi,%r15,8),%rax
+	adcq	$0,%rdx
+	addq	%r11,%rdi
+	adcq	$0,%rdx
+	movq	%rdi,-32(%rsp,%r15,8)
+	movq	%rdx,%r13
+	cmpq	%r9,%r15
+	jb	.L1st4x
+
+	mulq	%rbx
+	addq	%rax,%r10
+	movq	-16(%rcx,%r15,8),%rax
+	adcq	$0,%rdx
+	movq	%rdx,%r11
+
+	mulq	%rbp
+	addq	%rax,%r13
+	movq	-8(%rsi,%r15,8),%rax
+	adcq	$0,%rdx
+	addq	%r10,%r13
+	adcq	$0,%rdx
+	movq	%r13,-24(%rsp,%r15,8)
+	movq	%rdx,%rdi
+
+	mulq	%rbx
+	addq	%rax,%r11
+	movq	-8(%rcx,%r15,8),%rax
+	adcq	$0,%rdx
+	movq	%rdx,%r10
+
+	mulq	%rbp
+	addq	%rax,%rdi
+	movq	(%rsi),%rax
+	adcq	$0,%rdx
+	addq	%r11,%rdi
+	adcq	$0,%rdx
+	movq	%rdi,-16(%rsp,%r15,8)
+	movq	%rdx,%r13
+
+	xorq	%rdi,%rdi
+	addq	%r10,%r13
+	adcq	$0,%rdi
+	movq	%r13,-8(%rsp,%r15,8)
+	movq	%rdi,(%rsp,%r15,8)
+
+	leaq	1(%r14),%r14
+.align	4
+.Louter4x:
+	movq	(%r12,%r14,8),%rbx
+	xorq	%r15,%r15
+	movq	(%rsp),%r10
+	movq	%r8,%rbp
+	mulq	%rbx
+	addq	%rax,%r10
+	movq	(%rcx),%rax
+	adcq	$0,%rdx
+
+	imulq	%r10,%rbp
+	movq	%rdx,%r11
+
+	mulq	%rbp
+	addq	%rax,%r10
+	movq	8(%rsi),%rax
+	adcq	$0,%rdx
+	movq	%rdx,%rdi
+
+	mulq	%rbx
+	addq	%rax,%r11
+	movq	8(%rcx),%rax
+	adcq	$0,%rdx
+	addq	8(%rsp),%r11
+	adcq	$0,%rdx
+	movq	%rdx,%r10
+
+	mulq	%rbp
+	addq	%rax,%rdi
+	movq	16(%rsi),%rax
+	adcq	$0,%rdx
+	addq	%r11,%rdi
+	leaq	4(%r15),%r15
+	adcq	$0,%rdx
+	movq	%rdi,(%rsp)
+	movq	%rdx,%r13
+	jmp	.Linner4x
+.align	16
+.Linner4x:
+	mulq	%rbx
+	addq	%rax,%r10
+	movq	-16(%rcx,%r15,8),%rax
+	adcq	$0,%rdx
+	addq	-16(%rsp,%r15,8),%r10
+	adcq	$0,%rdx
+	movq	%rdx,%r11
+
+	mulq	%rbp
+	addq	%rax,%r13
+	movq	-8(%rsi,%r15,8),%rax
+	adcq	$0,%rdx
+	addq	%r10,%r13
+	adcq	$0,%rdx
+	movq	%r13,-24(%rsp,%r15,8)
+	movq	%rdx,%rdi
+
+	mulq	%rbx
+	addq	%rax,%r11
+	movq	-8(%rcx,%r15,8),%rax
+	adcq	$0,%rdx
+	addq	-8(%rsp,%r15,8),%r11
+	adcq	$0,%rdx
+	movq	%rdx,%r10
+
+	mulq	%rbp
+	addq	%rax,%rdi
+	movq	(%rsi,%r15,8),%rax
+	adcq	$0,%rdx
+	addq	%r11,%rdi
+	adcq	$0,%rdx
+	movq	%rdi,-16(%rsp,%r15,8)
+	movq	%rdx,%r13
+
+	mulq	%rbx
+	addq	%rax,%r10
+	movq	(%rcx,%r15,8),%rax
+	adcq	$0,%rdx
+	addq	(%rsp,%r15,8),%r10
+	adcq	$0,%rdx
+	movq	%rdx,%r11
+
+	mulq	%rbp
+	addq	%rax,%r13
+	movq	8(%rsi,%r15,8),%rax
+	adcq	$0,%rdx
+	addq	%r10,%r13
+	adcq	$0,%rdx
+	movq	%r13,-8(%rsp,%r15,8)
+	movq	%rdx,%rdi
+
+	mulq	%rbx
+	addq	%rax,%r11
+	movq	8(%rcx,%r15,8),%rax
+	adcq	$0,%rdx
+	addq	8(%rsp,%r15,8),%r11
+	adcq	$0,%rdx
+	leaq	4(%r15),%r15
+	movq	%rdx,%r10
+
+	mulq	%rbp
+	addq	%rax,%rdi
+	movq	-16(%rsi,%r15,8),%rax
+	adcq	$0,%rdx
+	addq	%r11,%rdi
+	adcq	$0,%rdx
+	movq	%rdi,-32(%rsp,%r15,8)
+	movq	%rdx,%r13
+	cmpq	%r9,%r15
+	jb	.Linner4x
+
+	mulq	%rbx
+	addq	%rax,%r10
+	movq	-16(%rcx,%r15,8),%rax
+	adcq	$0,%rdx
+	addq	-16(%rsp,%r15,8),%r10
+	adcq	$0,%rdx
+	movq	%rdx,%r11
+
+	mulq	%rbp
+	addq	%rax,%r13
+	movq	-8(%rsi,%r15,8),%rax
+	adcq	$0,%rdx
+	addq	%r10,%r13
+	adcq	$0,%rdx
+	movq	%r13,-24(%rsp,%r15,8)
+	movq	%rdx,%rdi
+
+	mulq	%rbx
+	addq	%rax,%r11
+	movq	-8(%rcx,%r15,8),%rax
+	adcq	$0,%rdx
+	addq	-8(%rsp,%r15,8),%r11
+	adcq	$0,%rdx
+	leaq	1(%r14),%r14
+	movq	%rdx,%r10
+
+	mulq	%rbp
+	addq	%rax,%rdi
+	movq	(%rsi),%rax
+	adcq	$0,%rdx
+	addq	%r11,%rdi
+	adcq	$0,%rdx
+	movq	%rdi,-16(%rsp,%r15,8)
+	movq	%rdx,%r13
+
+	xorq	%rdi,%rdi
+	addq	%r10,%r13
+	adcq	$0,%rdi
+	addq	(%rsp,%r9,8),%r13
+	adcq	$0,%rdi
+	movq	%r13,-8(%rsp,%r15,8)
+	movq	%rdi,(%rsp,%r15,8)
+
+	cmpq	%r9,%r14
+	jb	.Louter4x
+	movq	16(%rsp,%r9,8),%rdi
+	leaq	-4(%r9),%r15
+	movq	0(%rsp),%rax
+	movq	8(%rsp),%rdx
+	shrq	$2,%r15
+	leaq	(%rsp),%rsi
+	xorq	%r14,%r14
+
+	subq	0(%rcx),%rax
+	movq	16(%rsi),%rbx
+	movq	24(%rsi),%rbp
+	sbbq	8(%rcx),%rdx
+
+.Lsub4x:
+	movq	%rax,0(%rdi,%r14,8)
+	movq	%rdx,8(%rdi,%r14,8)
+	sbbq	16(%rcx,%r14,8),%rbx
+	movq	32(%rsi,%r14,8),%rax
+	movq	40(%rsi,%r14,8),%rdx
+	sbbq	24(%rcx,%r14,8),%rbp
+	movq	%rbx,16(%rdi,%r14,8)
+	movq	%rbp,24(%rdi,%r14,8)
+	sbbq	32(%rcx,%r14,8),%rax
+	movq	48(%rsi,%r14,8),%rbx
+	movq	56(%rsi,%r14,8),%rbp
+	sbbq	40(%rcx,%r14,8),%rdx
+	leaq	4(%r14),%r14
+	decq	%r15
+	jnz	.Lsub4x
+
+	movq	%rax,0(%rdi,%r14,8)
+	movq	32(%rsi,%r14,8),%rax
+	sbbq	16(%rcx,%r14,8),%rbx
+	movq	%rdx,8(%rdi,%r14,8)
+	sbbq	24(%rcx,%r14,8),%rbp
+	movq	%rbx,16(%rdi,%r14,8)
+
+	sbbq	$0,%rax
+	movq	%rbp,24(%rdi,%r14,8)
+	pxor	%xmm0,%xmm0
+.byte	102,72,15,110,224
+	pcmpeqd	%xmm5,%xmm5
+	pshufd	$0,%xmm4,%xmm4
+	movq	%r9,%r15
+	pxor	%xmm4,%xmm5
+	shrq	$2,%r15
+	xorl	%eax,%eax
+
+	jmp	.Lcopy4x
+.align	16
+.Lcopy4x:
+	movdqa	(%rsp,%rax,1),%xmm1
+	movdqu	(%rdi,%rax,1),%xmm2
+	pand	%xmm4,%xmm1
+	pand	%xmm5,%xmm2
+	movdqa	16(%rsp,%rax,1),%xmm3
+	movdqa	%xmm0,(%rsp,%rax,1)
+	por	%xmm2,%xmm1
+	movdqu	16(%rdi,%rax,1),%xmm2
+	movdqu	%xmm1,(%rdi,%rax,1)
+	pand	%xmm4,%xmm3
+	pand	%xmm5,%xmm2
+	movdqa	%xmm0,16(%rsp,%rax,1)
+	por	%xmm2,%xmm3
+	movdqu	%xmm3,16(%rdi,%rax,1)
+	leaq	32(%rax),%rax
+	decq	%r15
+	jnz	.Lcopy4x
+	movq	8(%rsp,%r9,8),%rsi
+.cfi_def_cfa	%rsi, 8
+	movq	$1,%rax
+	movq	-48(%rsi),%r15
+.cfi_restore	%r15
+	movq	-40(%rsi),%r14
+.cfi_restore	%r14
+	movq	-32(%rsi),%r13
+.cfi_restore	%r13
+	movq	-24(%rsi),%r12
+.cfi_restore	%r12
+	movq	-16(%rsi),%rbp
+.cfi_restore	%rbp
+	movq	-8(%rsi),%rbx
+.cfi_restore	%rbx
+	leaq	(%rsi),%rsp
+.cfi_def_cfa_register	%rsp
+.Lmul4x_epilogue:
+	.byte	0xf3,0xc3
+.cfi_endproc	
+.size	bn_mul4x_mont,.-bn_mul4x_mont
+.extern	bn_sqrx8x_internal
+.hidden bn_sqrx8x_internal
+.extern	bn_sqr8x_internal
+.hidden bn_sqr8x_internal
+
+.type	bn_sqr8x_mont,@function
+.align	32
+bn_sqr8x_mont:
+.cfi_startproc	
+	movq	%rsp,%rax
+.cfi_def_cfa_register	%rax
+.Lsqr8x_enter:
+	pushq	%rbx
+.cfi_offset	%rbx,-16
+	pushq	%rbp
+.cfi_offset	%rbp,-24
+	pushq	%r12
+.cfi_offset	%r12,-32
+	pushq	%r13
+.cfi_offset	%r13,-40
+	pushq	%r14
+.cfi_offset	%r14,-48
+	pushq	%r15
+.cfi_offset	%r15,-56
+.Lsqr8x_prologue:
+
+	movl	%r9d,%r10d
+	shll	$3,%r9d
+	shlq	$3+2,%r10
+	negq	%r9
+
+
+
+
+
+
+	leaq	-64(%rsp,%r9,2),%r11
+	movq	%rsp,%rbp
+	movq	(%r8),%r8
+	subq	%rsi,%r11
+	andq	$4095,%r11
+	cmpq	%r11,%r10
+	jb	.Lsqr8x_sp_alt
+	subq	%r11,%rbp
+	leaq	-64(%rbp,%r9,2),%rbp
+	jmp	.Lsqr8x_sp_done
+
+.align	32
+.Lsqr8x_sp_alt:
+	leaq	4096-64(,%r9,2),%r10
+	leaq	-64(%rbp,%r9,2),%rbp
+	subq	%r10,%r11
+	movq	$0,%r10
+	cmovcq	%r10,%r11
+	subq	%r11,%rbp
+.Lsqr8x_sp_done:
+	andq	$-64,%rbp
+	movq	%rsp,%r11
+	subq	%rbp,%r11
+	andq	$-4096,%r11
+	leaq	(%r11,%rbp,1),%rsp
+	movq	(%rsp),%r10
+	cmpq	%rbp,%rsp
+	ja	.Lsqr8x_page_walk
+	jmp	.Lsqr8x_page_walk_done
+
+.align	16
+.Lsqr8x_page_walk:
+	leaq	-4096(%rsp),%rsp
+	movq	(%rsp),%r10
+	cmpq	%rbp,%rsp
+	ja	.Lsqr8x_page_walk
+.Lsqr8x_page_walk_done:
+
+	movq	%r9,%r10
+	negq	%r9
+
+	movq	%r8,32(%rsp)
+	movq	%rax,40(%rsp)
+.cfi_escape	0x0f,0x05,0x77,0x28,0x06,0x23,0x08
+.Lsqr8x_body:
+
+.byte	102,72,15,110,209
+	pxor	%xmm0,%xmm0
+.byte	102,72,15,110,207
+.byte	102,73,15,110,218
+	leaq	OPENSSL_ia32cap_P(%rip),%rax
+	movl	8(%rax),%eax
+	andl	$0x80100,%eax
+	cmpl	$0x80100,%eax
+	jne	.Lsqr8x_nox
+
+	call	bn_sqrx8x_internal
+
+
+
+
+	leaq	(%r8,%rcx,1),%rbx
+	movq	%rcx,%r9
+	movq	%rcx,%rdx
+.byte	102,72,15,126,207
+	sarq	$3+2,%rcx
+	jmp	.Lsqr8x_sub
+
+.align	32
+.Lsqr8x_nox:
+	call	bn_sqr8x_internal
+
+
+
+
+	leaq	(%rdi,%r9,1),%rbx
+	movq	%r9,%rcx
+	movq	%r9,%rdx
+.byte	102,72,15,126,207
+	sarq	$3+2,%rcx
+	jmp	.Lsqr8x_sub
+
+.align	32
+.Lsqr8x_sub:
+	movq	0(%rbx),%r12
+	movq	8(%rbx),%r13
+	movq	16(%rbx),%r14
+	movq	24(%rbx),%r15
+	leaq	32(%rbx),%rbx
+	sbbq	0(%rbp),%r12
+	sbbq	8(%rbp),%r13
+	sbbq	16(%rbp),%r14
+	sbbq	24(%rbp),%r15
+	leaq	32(%rbp),%rbp
+	movq	%r12,0(%rdi)
+	movq	%r13,8(%rdi)
+	movq	%r14,16(%rdi)
+	movq	%r15,24(%rdi)
+	leaq	32(%rdi),%rdi
+	incq	%rcx
+	jnz	.Lsqr8x_sub
+
+	sbbq	$0,%rax
+	leaq	(%rbx,%r9,1),%rbx
+	leaq	(%rdi,%r9,1),%rdi
+
+.byte	102,72,15,110,200
+	pxor	%xmm0,%xmm0
+	pshufd	$0,%xmm1,%xmm1
+	movq	40(%rsp),%rsi
+.cfi_def_cfa	%rsi,8
+	jmp	.Lsqr8x_cond_copy
+
+.align	32
+.Lsqr8x_cond_copy:
+	movdqa	0(%rbx),%xmm2
+	movdqa	16(%rbx),%xmm3
+	leaq	32(%rbx),%rbx
+	movdqu	0(%rdi),%xmm4
+	movdqu	16(%rdi),%xmm5
+	leaq	32(%rdi),%rdi
+	movdqa	%xmm0,-32(%rbx)
+	movdqa	%xmm0,-16(%rbx)
+	movdqa	%xmm0,-32(%rbx,%rdx,1)
+	movdqa	%xmm0,-16(%rbx,%rdx,1)
+	pcmpeqd	%xmm1,%xmm0
+	pand	%xmm1,%xmm2
+	pand	%xmm1,%xmm3
+	pand	%xmm0,%xmm4
+	pand	%xmm0,%xmm5
+	pxor	%xmm0,%xmm0
+	por	%xmm2,%xmm4
+	por	%xmm3,%xmm5
+	movdqu	%xmm4,-32(%rdi)
+	movdqu	%xmm5,-16(%rdi)
+	addq	$32,%r9
+	jnz	.Lsqr8x_cond_copy
+
+	movq	$1,%rax
+	movq	-48(%rsi),%r15
+.cfi_restore	%r15
+	movq	-40(%rsi),%r14
+.cfi_restore	%r14
+	movq	-32(%rsi),%r13
+.cfi_restore	%r13
+	movq	-24(%rsi),%r12
+.cfi_restore	%r12
+	movq	-16(%rsi),%rbp
+.cfi_restore	%rbp
+	movq	-8(%rsi),%rbx
+.cfi_restore	%rbx
+	leaq	(%rsi),%rsp
+.cfi_def_cfa_register	%rsp
+.Lsqr8x_epilogue:
+	.byte	0xf3,0xc3
+.cfi_endproc	
+.size	bn_sqr8x_mont,.-bn_sqr8x_mont
+.type	bn_mulx4x_mont,@function
+.align	32
+bn_mulx4x_mont:
+.cfi_startproc	
+	movq	%rsp,%rax
+.cfi_def_cfa_register	%rax
+.Lmulx4x_enter:
+	pushq	%rbx
+.cfi_offset	%rbx,-16
+	pushq	%rbp
+.cfi_offset	%rbp,-24
+	pushq	%r12
+.cfi_offset	%r12,-32
+	pushq	%r13
+.cfi_offset	%r13,-40
+	pushq	%r14
+.cfi_offset	%r14,-48
+	pushq	%r15
+.cfi_offset	%r15,-56
+.Lmulx4x_prologue:
+
+	shll	$3,%r9d
+	xorq	%r10,%r10
+	subq	%r9,%r10
+	movq	(%r8),%r8
+	leaq	-72(%rsp,%r10,1),%rbp
+	andq	$-128,%rbp
+	movq	%rsp,%r11
+	subq	%rbp,%r11
+	andq	$-4096,%r11
+	leaq	(%r11,%rbp,1),%rsp
+	movq	(%rsp),%r10
+	cmpq	%rbp,%rsp
+	ja	.Lmulx4x_page_walk
+	jmp	.Lmulx4x_page_walk_done
+
+.align	16
+.Lmulx4x_page_walk:
+	leaq	-4096(%rsp),%rsp
+	movq	(%rsp),%r10
+	cmpq	%rbp,%rsp
+	ja	.Lmulx4x_page_walk
+.Lmulx4x_page_walk_done:
+
+	leaq	(%rdx,%r9,1),%r10
+
+
+
+
+
+
+
+
+
+
+
+
+	movq	%r9,0(%rsp)
+	shrq	$5,%r9
+	movq	%r10,16(%rsp)
+	subq	$1,%r9
+	movq	%r8,24(%rsp)
+	movq	%rdi,32(%rsp)
+	movq	%rax,40(%rsp)
+.cfi_escape	0x0f,0x05,0x77,0x28,0x06,0x23,0x08
+	movq	%r9,48(%rsp)
+	jmp	.Lmulx4x_body
+
+.align	32
+.Lmulx4x_body:
+	leaq	8(%rdx),%rdi
+	movq	(%rdx),%rdx
+	leaq	64+32(%rsp),%rbx
+	movq	%rdx,%r9
+
+	mulxq	0(%rsi),%r8,%rax
+	mulxq	8(%rsi),%r11,%r14
+	addq	%rax,%r11
+	movq	%rdi,8(%rsp)
+	mulxq	16(%rsi),%r12,%r13
+	adcq	%r14,%r12
+	adcq	$0,%r13
+
+	movq	%r8,%rdi
+	imulq	24(%rsp),%r8
+	xorq	%rbp,%rbp
+
+	mulxq	24(%rsi),%rax,%r14
+	movq	%r8,%rdx
+	leaq	32(%rsi),%rsi
+	adcxq	%rax,%r13
+	adcxq	%rbp,%r14
+
+	mulxq	0(%rcx),%rax,%r10
+	adcxq	%rax,%rdi
+	adoxq	%r11,%r10
+	mulxq	8(%rcx),%rax,%r11
+	adcxq	%rax,%r10
+	adoxq	%r12,%r11
+.byte	0xc4,0x62,0xfb,0xf6,0xa1,0x10,0x00,0x00,0x00
+	movq	48(%rsp),%rdi
+	movq	%r10,-32(%rbx)
+	adcxq	%rax,%r11
+	adoxq	%r13,%r12
+	mulxq	24(%rcx),%rax,%r15
+	movq	%r9,%rdx
+	movq	%r11,-24(%rbx)
+	adcxq	%rax,%r12
+	adoxq	%rbp,%r15
+	leaq	32(%rcx),%rcx
+	movq	%r12,-16(%rbx)
+
+	jmp	.Lmulx4x_1st
+
+.align	32
+.Lmulx4x_1st:
+	adcxq	%rbp,%r15
+	mulxq	0(%rsi),%r10,%rax
+	adcxq	%r14,%r10
+	mulxq	8(%rsi),%r11,%r14
+	adcxq	%rax,%r11
+	mulxq	16(%rsi),%r12,%rax
+	adcxq	%r14,%r12
+	mulxq	24(%rsi),%r13,%r14
+.byte	0x67,0x67
+	movq	%r8,%rdx
+	adcxq	%rax,%r13
+	adcxq	%rbp,%r14
+	leaq	32(%rsi),%rsi
+	leaq	32(%rbx),%rbx
+
+	adoxq	%r15,%r10
+	mulxq	0(%rcx),%rax,%r15
+	adcxq	%rax,%r10
+	adoxq	%r15,%r11
+	mulxq	8(%rcx),%rax,%r15
+	adcxq	%rax,%r11
+	adoxq	%r15,%r12
+	mulxq	16(%rcx),%rax,%r15
+	movq	%r10,-40(%rbx)
+	adcxq	%rax,%r12
+	movq	%r11,-32(%rbx)
+	adoxq	%r15,%r13
+	mulxq	24(%rcx),%rax,%r15
+	movq	%r9,%rdx
+	movq	%r12,-24(%rbx)
+	adcxq	%rax,%r13
+	adoxq	%rbp,%r15
+	leaq	32(%rcx),%rcx
+	movq	%r13,-16(%rbx)
+
+	decq	%rdi
+	jnz	.Lmulx4x_1st
+
+	movq	0(%rsp),%rax
+	movq	8(%rsp),%rdi
+	adcq	%rbp,%r15
+	addq	%r15,%r14
+	sbbq	%r15,%r15
+	movq	%r14,-8(%rbx)
+	jmp	.Lmulx4x_outer
+
+.align	32
+.Lmulx4x_outer:
+	movq	(%rdi),%rdx
+	leaq	8(%rdi),%rdi
+	subq	%rax,%rsi
+	movq	%r15,(%rbx)
+	leaq	64+32(%rsp),%rbx
+	subq	%rax,%rcx
+
+	mulxq	0(%rsi),%r8,%r11
+	xorl	%ebp,%ebp
+	movq	%rdx,%r9
+	mulxq	8(%rsi),%r14,%r12
+	adoxq	-32(%rbx),%r8
+	adcxq	%r14,%r11
+	mulxq	16(%rsi),%r15,%r13
+	adoxq	-24(%rbx),%r11
+	adcxq	%r15,%r12
+	adoxq	-16(%rbx),%r12
+	adcxq	%rbp,%r13
+	adoxq	%rbp,%r13
+
+	movq	%rdi,8(%rsp)
+	movq	%r8,%r15
+	imulq	24(%rsp),%r8
+	xorl	%ebp,%ebp
+
+	mulxq	24(%rsi),%rax,%r14
+	movq	%r8,%rdx
+	adcxq	%rax,%r13
+	adoxq	-8(%rbx),%r13
+	adcxq	%rbp,%r14
+	leaq	32(%rsi),%rsi
+	adoxq	%rbp,%r14
+
+	mulxq	0(%rcx),%rax,%r10
+	adcxq	%rax,%r15
+	adoxq	%r11,%r10
+	mulxq	8(%rcx),%rax,%r11
+	adcxq	%rax,%r10
+	adoxq	%r12,%r11
+	mulxq	16(%rcx),%rax,%r12
+	movq	%r10,-32(%rbx)
+	adcxq	%rax,%r11
+	adoxq	%r13,%r12
+	mulxq	24(%rcx),%rax,%r15
+	movq	%r9,%rdx
+	movq	%r11,-24(%rbx)
+	leaq	32(%rcx),%rcx
+	adcxq	%rax,%r12
+	adoxq	%rbp,%r15
+	movq	48(%rsp),%rdi
+	movq	%r12,-16(%rbx)
+
+	jmp	.Lmulx4x_inner
+
+.align	32
+.Lmulx4x_inner:
+	mulxq	0(%rsi),%r10,%rax
+	adcxq	%rbp,%r15
+	adoxq	%r14,%r10
+	mulxq	8(%rsi),%r11,%r14
+	adcxq	0(%rbx),%r10
+	adoxq	%rax,%r11
+	mulxq	16(%rsi),%r12,%rax
+	adcxq	8(%rbx),%r11
+	adoxq	%r14,%r12
+	mulxq	24(%rsi),%r13,%r14
+	movq	%r8,%rdx
+	adcxq	16(%rbx),%r12
+	adoxq	%rax,%r13
+	adcxq	24(%rbx),%r13
+	adoxq	%rbp,%r14
+	leaq	32(%rsi),%rsi
+	leaq	32(%rbx),%rbx
+	adcxq	%rbp,%r14
+
+	adoxq	%r15,%r10
+	mulxq	0(%rcx),%rax,%r15
+	adcxq	%rax,%r10
+	adoxq	%r15,%r11
+	mulxq	8(%rcx),%rax,%r15
+	adcxq	%rax,%r11
+	adoxq	%r15,%r12
+	mulxq	16(%rcx),%rax,%r15
+	movq	%r10,-40(%rbx)
+	adcxq	%rax,%r12
+	adoxq	%r15,%r13
+	mulxq	24(%rcx),%rax,%r15
+	movq	%r9,%rdx
+	movq	%r11,-32(%rbx)
+	movq	%r12,-24(%rbx)
+	adcxq	%rax,%r13
+	adoxq	%rbp,%r15
+	leaq	32(%rcx),%rcx
+	movq	%r13,-16(%rbx)
+
+	decq	%rdi
+	jnz	.Lmulx4x_inner
+
+	movq	0(%rsp),%rax
+	movq	8(%rsp),%rdi
+	adcq	%rbp,%r15
+	subq	0(%rbx),%rbp
+	adcq	%r15,%r14
+	sbbq	%r15,%r15
+	movq	%r14,-8(%rbx)
+
+	cmpq	16(%rsp),%rdi
+	jne	.Lmulx4x_outer
+
+	leaq	64(%rsp),%rbx
+	subq	%rax,%rcx
+	negq	%r15
+	movq	%rax,%rdx
+	shrq	$3+2,%rax
+	movq	32(%rsp),%rdi
+	jmp	.Lmulx4x_sub
+
+.align	32
+.Lmulx4x_sub:
+	movq	0(%rbx),%r11
+	movq	8(%rbx),%r12
+	movq	16(%rbx),%r13
+	movq	24(%rbx),%r14
+	leaq	32(%rbx),%rbx
+	sbbq	0(%rcx),%r11
+	sbbq	8(%rcx),%r12
+	sbbq	16(%rcx),%r13
+	sbbq	24(%rcx),%r14
+	leaq	32(%rcx),%rcx
+	movq	%r11,0(%rdi)
+	movq	%r12,8(%rdi)
+	movq	%r13,16(%rdi)
+	movq	%r14,24(%rdi)
+	leaq	32(%rdi),%rdi
+	decq	%rax
+	jnz	.Lmulx4x_sub
+
+	sbbq	$0,%r15
+	leaq	64(%rsp),%rbx
+	subq	%rdx,%rdi
+
+.byte	102,73,15,110,207
+	pxor	%xmm0,%xmm0
+	pshufd	$0,%xmm1,%xmm1
+	movq	40(%rsp),%rsi
+.cfi_def_cfa	%rsi,8
+	jmp	.Lmulx4x_cond_copy
+
+.align	32
+.Lmulx4x_cond_copy:
+	movdqa	0(%rbx),%xmm2
+	movdqa	16(%rbx),%xmm3
+	leaq	32(%rbx),%rbx
+	movdqu	0(%rdi),%xmm4
+	movdqu	16(%rdi),%xmm5
+	leaq	32(%rdi),%rdi
+	movdqa	%xmm0,-32(%rbx)
+	movdqa	%xmm0,-16(%rbx)
+	pcmpeqd	%xmm1,%xmm0
+	pand	%xmm1,%xmm2
+	pand	%xmm1,%xmm3
+	pand	%xmm0,%xmm4
+	pand	%xmm0,%xmm5
+	pxor	%xmm0,%xmm0
+	por	%xmm2,%xmm4
+	por	%xmm3,%xmm5
+	movdqu	%xmm4,-32(%rdi)
+	movdqu	%xmm5,-16(%rdi)
+	subq	$32,%rdx
+	jnz	.Lmulx4x_cond_copy
+
+	movq	%rdx,(%rbx)
+
+	movq	$1,%rax
+	movq	-48(%rsi),%r15
+.cfi_restore	%r15
+	movq	-40(%rsi),%r14
+.cfi_restore	%r14
+	movq	-32(%rsi),%r13
+.cfi_restore	%r13
+	movq	-24(%rsi),%r12
+.cfi_restore	%r12
+	movq	-16(%rsi),%rbp
+.cfi_restore	%rbp
+	movq	-8(%rsi),%rbx
+.cfi_restore	%rbx
+	leaq	(%rsi),%rsp
+.cfi_def_cfa_register	%rsp
+.Lmulx4x_epilogue:
+	.byte	0xf3,0xc3
+.cfi_endproc	
+.size	bn_mulx4x_mont,.-bn_mulx4x_mont
+.byte	77,111,110,116,103,111,109,101,114,121,32,77,117,108,116,105,112,108,105,99,97,116,105,111,110,32,102,111,114,32,120,56,54,95,54,52,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0
+.align	16
+#endif
+.section	.note.GNU-stack,"",@progbits
--- /dev/null
+++ b/third_party/boringssl/linux-x86_64/crypto/fipsmodule/x86_64-mont5.S
@@ -1,0 +1,3790 @@
+// This file is generated from a similarly-named Perl script in the BoringSSL
+// source tree. Do not edit by hand.
+
+#if defined(__has_feature)
+#if __has_feature(memory_sanitizer) && !defined(OPENSSL_NO_ASM)
+#define OPENSSL_NO_ASM
+#endif
+#endif
+
+#if defined(__x86_64__) && !defined(OPENSSL_NO_ASM)
+#if defined(BORINGSSL_PREFIX)
+#include <boringssl_prefix_symbols_asm.h>
+#endif
+.text	
+
+.extern	OPENSSL_ia32cap_P
+.hidden OPENSSL_ia32cap_P
+
+.globl	bn_mul_mont_gather5
+.hidden bn_mul_mont_gather5
+.type	bn_mul_mont_gather5,@function
+.align	64
+bn_mul_mont_gather5:
+.cfi_startproc	
+	movl	%r9d,%r9d
+	movq	%rsp,%rax
+.cfi_def_cfa_register	%rax
+	testl	$7,%r9d
+	jnz	.Lmul_enter
+	leaq	OPENSSL_ia32cap_P(%rip),%r11
+	movl	8(%r11),%r11d
+	jmp	.Lmul4x_enter
+
+.align	16
+.Lmul_enter:
+	movd	8(%rsp),%xmm5
+	pushq	%rbx
+.cfi_offset	%rbx,-16
+	pushq	%rbp
+.cfi_offset	%rbp,-24
+	pushq	%r12
+.cfi_offset	%r12,-32
+	pushq	%r13
+.cfi_offset	%r13,-40
+	pushq	%r14
+.cfi_offset	%r14,-48
+	pushq	%r15
+.cfi_offset	%r15,-56
+
+	negq	%r9
+	movq	%rsp,%r11
+	leaq	-280(%rsp,%r9,8),%r10
+	negq	%r9
+	andq	$-1024,%r10
+
+
+
+
+
+
+
+
+
+	subq	%r10,%r11
+	andq	$-4096,%r11
+	leaq	(%r10,%r11,1),%rsp
+	movq	(%rsp),%r11
+	cmpq	%r10,%rsp
+	ja	.Lmul_page_walk
+	jmp	.Lmul_page_walk_done
+
+.Lmul_page_walk:
+	leaq	-4096(%rsp),%rsp
+	movq	(%rsp),%r11
+	cmpq	%r10,%rsp
+	ja	.Lmul_page_walk
+.Lmul_page_walk_done:
+
+	leaq	.Linc(%rip),%r10
+	movq	%rax,8(%rsp,%r9,8)
+.cfi_escape	0x0f,0x0a,0x77,0x08,0x79,0x00,0x38,0x1e,0x22,0x06,0x23,0x08
+.Lmul_body:
+
+	leaq	128(%rdx),%r12
+	movdqa	0(%r10),%xmm0
+	movdqa	16(%r10),%xmm1
+	leaq	24-112(%rsp,%r9,8),%r10
+	andq	$-16,%r10
+
+	pshufd	$0,%xmm5,%xmm5
+	movdqa	%xmm1,%xmm4
+	movdqa	%xmm1,%xmm2
+	paddd	%xmm0,%xmm1
+	pcmpeqd	%xmm5,%xmm0
+.byte	0x67
+	movdqa	%xmm4,%xmm3
+	paddd	%xmm1,%xmm2
+	pcmpeqd	%xmm5,%xmm1
+	movdqa	%xmm0,112(%r10)
+	movdqa	%xmm4,%xmm0
+
+	paddd	%xmm2,%xmm3
+	pcmpeqd	%xmm5,%xmm2
+	movdqa	%xmm1,128(%r10)
+	movdqa	%xmm4,%xmm1
+
+	paddd	%xmm3,%xmm0
+	pcmpeqd	%xmm5,%xmm3
+	movdqa	%xmm2,144(%r10)
+	movdqa	%xmm4,%xmm2
+
+	paddd	%xmm0,%xmm1
+	pcmpeqd	%xmm5,%xmm0
+	movdqa	%xmm3,160(%r10)
+	movdqa	%xmm4,%xmm3
+	paddd	%xmm1,%xmm2
+	pcmpeqd	%xmm5,%xmm1
+	movdqa	%xmm0,176(%r10)
+	movdqa	%xmm4,%xmm0
+
+	paddd	%xmm2,%xmm3
+	pcmpeqd	%xmm5,%xmm2
+	movdqa	%xmm1,192(%r10)
+	movdqa	%xmm4,%xmm1
+
+	paddd	%xmm3,%xmm0
+	pcmpeqd	%xmm5,%xmm3
+	movdqa	%xmm2,208(%r10)
+	movdqa	%xmm4,%xmm2
+
+	paddd	%xmm0,%xmm1
+	pcmpeqd	%xmm5,%xmm0
+	movdqa	%xmm3,224(%r10)
+	movdqa	%xmm4,%xmm3
+	paddd	%xmm1,%xmm2
+	pcmpeqd	%xmm5,%xmm1
+	movdqa	%xmm0,240(%r10)
+	movdqa	%xmm4,%xmm0
+
+	paddd	%xmm2,%xmm3
+	pcmpeqd	%xmm5,%xmm2
+	movdqa	%xmm1,256(%r10)
+	movdqa	%xmm4,%xmm1
+
+	paddd	%xmm3,%xmm0
+	pcmpeqd	%xmm5,%xmm3
+	movdqa	%xmm2,272(%r10)
+	movdqa	%xmm4,%xmm2
+
+	paddd	%xmm0,%xmm1
+	pcmpeqd	%xmm5,%xmm0
+	movdqa	%xmm3,288(%r10)
+	movdqa	%xmm4,%xmm3
+	paddd	%xmm1,%xmm2
+	pcmpeqd	%xmm5,%xmm1
+	movdqa	%xmm0,304(%r10)
+
+	paddd	%xmm2,%xmm3
+.byte	0x67
+	pcmpeqd	%xmm5,%xmm2
+	movdqa	%xmm1,320(%r10)
+
+	pcmpeqd	%xmm5,%xmm3
+	movdqa	%xmm2,336(%r10)
+	pand	64(%r12),%xmm0
+
+	pand	80(%r12),%xmm1
+	pand	96(%r12),%xmm2
+	movdqa	%xmm3,352(%r10)
+	pand	112(%r12),%xmm3
+	por	%xmm2,%xmm0
+	por	%xmm3,%xmm1
+	movdqa	-128(%r12),%xmm4
+	movdqa	-112(%r12),%xmm5
+	movdqa	-96(%r12),%xmm2
+	pand	112(%r10),%xmm4
+	movdqa	-80(%r12),%xmm3
+	pand	128(%r10),%xmm5
+	por	%xmm4,%xmm0
+	pand	144(%r10),%xmm2
+	por	%xmm5,%xmm1
+	pand	160(%r10),%xmm3
+	por	%xmm2,%xmm0
+	por	%xmm3,%xmm1
+	movdqa	-64(%r12),%xmm4
+	movdqa	-48(%r12),%xmm5
+	movdqa	-32(%r12),%xmm2
+	pand	176(%r10),%xmm4
+	movdqa	-16(%r12),%xmm3
+	pand	192(%r10),%xmm5
+	por	%xmm4,%xmm0
+	pand	208(%r10),%xmm2
+	por	%xmm5,%xmm1
+	pand	224(%r10),%xmm3
+	por	%xmm2,%xmm0
+	por	%xmm3,%xmm1
+	movdqa	0(%r12),%xmm4
+	movdqa	16(%r12),%xmm5
+	movdqa	32(%r12),%xmm2
+	pand	240(%r10),%xmm4
+	movdqa	48(%r12),%xmm3
+	pand	256(%r10),%xmm5
+	por	%xmm4,%xmm0
+	pand	272(%r10),%xmm2
+	por	%xmm5,%xmm1
+	pand	288(%r10),%xmm3
+	por	%xmm2,%xmm0
+	por	%xmm3,%xmm1
+	por	%xmm1,%xmm0
+	pshufd	$0x4e,%xmm0,%xmm1
+	por	%xmm1,%xmm0
+	leaq	256(%r12),%r12
+.byte	102,72,15,126,195
+
+	movq	(%r8),%r8
+	movq	(%rsi),%rax
+
+	xorq	%r14,%r14
+	xorq	%r15,%r15
+
+	movq	%r8,%rbp
+	mulq	%rbx
+	movq	%rax,%r10
+	movq	(%rcx),%rax
+
+	imulq	%r10,%rbp
+	movq	%rdx,%r11
+
+	mulq	%rbp
+	addq	%rax,%r10
+	movq	8(%rsi),%rax
+	adcq	$0,%rdx
+	movq	%rdx,%r13
+
+	leaq	1(%r15),%r15
+	jmp	.L1st_enter
+
+.align	16
+.L1st:
+	addq	%rax,%r13
+	movq	(%rsi,%r15,8),%rax
+	adcq	$0,%rdx
+	addq	%r11,%r13
+	movq	%r10,%r11
+	adcq	$0,%rdx
+	movq	%r13,-16(%rsp,%r15,8)
+	movq	%rdx,%r13
+
+.L1st_enter:
+	mulq	%rbx
+	addq	%rax,%r11
+	movq	(%rcx,%r15,8),%rax
+	adcq	$0,%rdx
+	leaq	1(%r15),%r15
+	movq	%rdx,%r10
+
+	mulq	%rbp
+	cmpq	%r9,%r15
+	jne	.L1st
+
+
+	addq	%rax,%r13
+	adcq	$0,%rdx
+	addq	%r11,%r13
+	adcq	$0,%rdx
+	movq	%r13,-16(%rsp,%r9,8)
+	movq	%rdx,%r13
+	movq	%r10,%r11
+
+	xorq	%rdx,%rdx
+	addq	%r11,%r13
+	adcq	$0,%rdx
+	movq	%r13,-8(%rsp,%r9,8)
+	movq	%rdx,(%rsp,%r9,8)
+
+	leaq	1(%r14),%r14
+	jmp	.Louter
+.align	16
+.Louter:
+	leaq	24+128(%rsp,%r9,8),%rdx
+	andq	$-16,%rdx
+	pxor	%xmm4,%xmm4
+	pxor	%xmm5,%xmm5
+	movdqa	-128(%r12),%xmm0
+	movdqa	-112(%r12),%xmm1
+	movdqa	-96(%r12),%xmm2
+	movdqa	-80(%r12),%xmm3
+	pand	-128(%rdx),%xmm0
+	pand	-112(%rdx),%xmm1
+	por	%xmm0,%xmm4
+	pand	-96(%rdx),%xmm2
+	por	%xmm1,%xmm5
+	pand	-80(%rdx),%xmm3
+	por	%xmm2,%xmm4
+	por	%xmm3,%xmm5
+	movdqa	-64(%r12),%xmm0
+	movdqa	-48(%r12),%xmm1
+	movdqa	-32(%r12),%xmm2
+	movdqa	-16(%r12),%xmm3
+	pand	-64(%rdx),%xmm0
+	pand	-48(%rdx),%xmm1
+	por	%xmm0,%xmm4
+	pand	-32(%rdx),%xmm2
+	por	%xmm1,%xmm5
+	pand	-16(%rdx),%xmm3
+	por	%xmm2,%xmm4
+	por	%xmm3,%xmm5
+	movdqa	0(%r12),%xmm0
+	movdqa	16(%r12),%xmm1
+	movdqa	32(%r12),%xmm2
+	movdqa	48(%r12),%xmm3
+	pand	0(%rdx),%xmm0
+	pand	16(%rdx),%xmm1
+	por	%xmm0,%xmm4
+	pand	32(%rdx),%xmm2
+	por	%xmm1,%xmm5
+	pand	48(%rdx),%xmm3
+	por	%xmm2,%xmm4
+	por	%xmm3,%xmm5
+	movdqa	64(%r12),%xmm0
+	movdqa	80(%r12),%xmm1
+	movdqa	96(%r12),%xmm2
+	movdqa	112(%r12),%xmm3
+	pand	64(%rdx),%xmm0
+	pand	80(%rdx),%xmm1
+	por	%xmm0,%xmm4
+	pand	96(%rdx),%xmm2
+	por	%xmm1,%xmm5
+	pand	112(%rdx),%xmm3
+	por	%xmm2,%xmm4
+	por	%xmm3,%xmm5
+	por	%xmm5,%xmm4
+	pshufd	$0x4e,%xmm4,%xmm0
+	por	%xmm4,%xmm0
+	leaq	256(%r12),%r12
+
+	movq	(%rsi),%rax
+.byte	102,72,15,126,195
+
+	xorq	%r15,%r15
+	movq	%r8,%rbp
+	movq	(%rsp),%r10
+
+	mulq	%rbx
+	addq	%rax,%r10
+	movq	(%rcx),%rax
+	adcq	$0,%rdx
+
+	imulq	%r10,%rbp
+	movq	%rdx,%r11
+
+	mulq	%rbp
+	addq	%rax,%r10
+	movq	8(%rsi),%rax
+	adcq	$0,%rdx
+	movq	8(%rsp),%r10
+	movq	%rdx,%r13
+
+	leaq	1(%r15),%r15
+	jmp	.Linner_enter
+
+.align	16
+.Linner:
+	addq	%rax,%r13
+	movq	(%rsi,%r15,8),%rax
+	adcq	$0,%rdx
+	addq	%r10,%r13
+	movq	(%rsp,%r15,8),%r10
+	adcq	$0,%rdx
+	movq	%r13,-16(%rsp,%r15,8)
+	movq	%rdx,%r13
+
+.Linner_enter:
+	mulq	%rbx
+	addq	%rax,%r11
+	movq	(%rcx,%r15,8),%rax
+	adcq	$0,%rdx
+	addq	%r11,%r10
+	movq	%rdx,%r11
+	adcq	$0,%r11
+	leaq	1(%r15),%r15
+
+	mulq	%rbp
+	cmpq	%r9,%r15
+	jne	.Linner
+
+	addq	%rax,%r13
+	adcq	$0,%rdx
+	addq	%r10,%r13
+	movq	(%rsp,%r9,8),%r10
+	adcq	$0,%rdx
+	movq	%r13,-16(%rsp,%r9,8)
+	movq	%rdx,%r13
+
+	xorq	%rdx,%rdx
+	addq	%r11,%r13
+	adcq	$0,%rdx
+	addq	%r10,%r13
+	adcq	$0,%rdx
+	movq	%r13,-8(%rsp,%r9,8)
+	movq	%rdx,(%rsp,%r9,8)
+
+	leaq	1(%r14),%r14
+	cmpq	%r9,%r14
+	jb	.Louter
+
+	xorq	%r14,%r14
+	movq	(%rsp),%rax
+	leaq	(%rsp),%rsi
+	movq	%r9,%r15
+	jmp	.Lsub
+.align	16
+.Lsub:	sbbq	(%rcx,%r14,8),%rax
+	movq	%rax,(%rdi,%r14,8)
+	movq	8(%rsi,%r14,8),%rax
+	leaq	1(%r14),%r14
+	decq	%r15
+	jnz	.Lsub
+
+	sbbq	$0,%rax
+	movq	$-1,%rbx
+	xorq	%rax,%rbx
+	xorq	%r14,%r14
+	movq	%r9,%r15
+
+.Lcopy:
+	movq	(%rdi,%r14,8),%rcx
+	movq	(%rsp,%r14,8),%rdx
+	andq	%rbx,%rcx
+	andq	%rax,%rdx
+	movq	%r14,(%rsp,%r14,8)
+	orq	%rcx,%rdx
+	movq	%rdx,(%rdi,%r14,8)
+	leaq	1(%r14),%r14
+	subq	$1,%r15
+	jnz	.Lcopy
+
+	movq	8(%rsp,%r9,8),%rsi
+.cfi_def_cfa	%rsi,8
+	movq	$1,%rax
+
+	movq	-48(%rsi),%r15
+.cfi_restore	%r15
+	movq	-40(%rsi),%r14
+.cfi_restore	%r14
+	movq	-32(%rsi),%r13
+.cfi_restore	%r13
+	movq	-24(%rsi),%r12
+.cfi_restore	%r12
+	movq	-16(%rsi),%rbp
+.cfi_restore	%rbp
+	movq	-8(%rsi),%rbx
+.cfi_restore	%rbx
+	leaq	(%rsi),%rsp
+.cfi_def_cfa_register	%rsp
+.Lmul_epilogue:
+	.byte	0xf3,0xc3
+.cfi_endproc	
+.size	bn_mul_mont_gather5,.-bn_mul_mont_gather5
+.type	bn_mul4x_mont_gather5,@function
+.align	32
+bn_mul4x_mont_gather5:
+.cfi_startproc	
+.byte	0x67
+	movq	%rsp,%rax
+.cfi_def_cfa_register	%rax
+.Lmul4x_enter:
+	andl	$0x80108,%r11d
+	cmpl	$0x80108,%r11d
+	je	.Lmulx4x_enter
+	pushq	%rbx
+.cfi_offset	%rbx,-16
+	pushq	%rbp
+.cfi_offset	%rbp,-24
+	pushq	%r12
+.cfi_offset	%r12,-32
+	pushq	%r13
+.cfi_offset	%r13,-40
+	pushq	%r14
+.cfi_offset	%r14,-48
+	pushq	%r15
+.cfi_offset	%r15,-56
+.Lmul4x_prologue:
+
+.byte	0x67
+	shll	$3,%r9d
+	leaq	(%r9,%r9,2),%r10
+	negq	%r9
+
+
+
+
+
+
+
+
+
+
+	leaq	-320(%rsp,%r9,2),%r11
+	movq	%rsp,%rbp
+	subq	%rdi,%r11
+	andq	$4095,%r11
+	cmpq	%r11,%r10
+	jb	.Lmul4xsp_alt
+	subq	%r11,%rbp
+	leaq	-320(%rbp,%r9,2),%rbp
+	jmp	.Lmul4xsp_done
+
+.align	32
+.Lmul4xsp_alt:
+	leaq	4096-320(,%r9,2),%r10
+	leaq	-320(%rbp,%r9,2),%rbp
+	subq	%r10,%r11
+	movq	$0,%r10
+	cmovcq	%r10,%r11
+	subq	%r11,%rbp
+.Lmul4xsp_done:
+	andq	$-64,%rbp
+	movq	%rsp,%r11
+	subq	%rbp,%r11
+	andq	$-4096,%r11
+	leaq	(%r11,%rbp,1),%rsp
+	movq	(%rsp),%r10
+	cmpq	%rbp,%rsp
+	ja	.Lmul4x_page_walk
+	jmp	.Lmul4x_page_walk_done
+
+.Lmul4x_page_walk:
+	leaq	-4096(%rsp),%rsp
+	movq	(%rsp),%r10
+	cmpq	%rbp,%rsp
+	ja	.Lmul4x_page_walk
+.Lmul4x_page_walk_done:
+
+	negq	%r9
+
+	movq	%rax,40(%rsp)
+.cfi_escape	0x0f,0x05,0x77,0x28,0x06,0x23,0x08
+.Lmul4x_body:
+
+	call	mul4x_internal
+
+	movq	40(%rsp),%rsi
+.cfi_def_cfa	%rsi,8
+	movq	$1,%rax
+
+	movq	-48(%rsi),%r15
+.cfi_restore	%r15
+	movq	-40(%rsi),%r14
+.cfi_restore	%r14
+	movq	-32(%rsi),%r13
+.cfi_restore	%r13
+	movq	-24(%rsi),%r12
+.cfi_restore	%r12
+	movq	-16(%rsi),%rbp
+.cfi_restore	%rbp
+	movq	-8(%rsi),%rbx
+.cfi_restore	%rbx
+	leaq	(%rsi),%rsp
+.cfi_def_cfa_register	%rsp
+.Lmul4x_epilogue:
+	.byte	0xf3,0xc3
+.cfi_endproc	
+.size	bn_mul4x_mont_gather5,.-bn_mul4x_mont_gather5
+
+.type	mul4x_internal,@function
+.align	32
+mul4x_internal:
+.cfi_startproc	
+	shlq	$5,%r9
+	movd	8(%rax),%xmm5
+	leaq	.Linc(%rip),%rax
+	leaq	128(%rdx,%r9,1),%r13
+	shrq	$5,%r9
+	movdqa	0(%rax),%xmm0
+	movdqa	16(%rax),%xmm1
+	leaq	88-112(%rsp,%r9,1),%r10
+	leaq	128(%rdx),%r12
+
+	pshufd	$0,%xmm5,%xmm5
+	movdqa	%xmm1,%xmm4
+.byte	0x67,0x67
+	movdqa	%xmm1,%xmm2
+	paddd	%xmm0,%xmm1
+	pcmpeqd	%xmm5,%xmm0
+.byte	0x67
+	movdqa	%xmm4,%xmm3
+	paddd	%xmm1,%xmm2
+	pcmpeqd	%xmm5,%xmm1
+	movdqa	%xmm0,112(%r10)
+	movdqa	%xmm4,%xmm0
+
+	paddd	%xmm2,%xmm3
+	pcmpeqd	%xmm5,%xmm2
+	movdqa	%xmm1,128(%r10)
+	movdqa	%xmm4,%xmm1
+
+	paddd	%xmm3,%xmm0
+	pcmpeqd	%xmm5,%xmm3
+	movdqa	%xmm2,144(%r10)
+	movdqa	%xmm4,%xmm2
+
+	paddd	%xmm0,%xmm1
+	pcmpeqd	%xmm5,%xmm0
+	movdqa	%xmm3,160(%r10)
+	movdqa	%xmm4,%xmm3
+	paddd	%xmm1,%xmm2
+	pcmpeqd	%xmm5,%xmm1
+	movdqa	%xmm0,176(%r10)
+	movdqa	%xmm4,%xmm0
+
+	paddd	%xmm2,%xmm3
+	pcmpeqd	%xmm5,%xmm2
+	movdqa	%xmm1,192(%r10)
+	movdqa	%xmm4,%xmm1
+
+	paddd	%xmm3,%xmm0
+	pcmpeqd	%xmm5,%xmm3
+	movdqa	%xmm2,208(%r10)
+	movdqa	%xmm4,%xmm2
+
+	paddd	%xmm0,%xmm1
+	pcmpeqd	%xmm5,%xmm0
+	movdqa	%xmm3,224(%r10)
+	movdqa	%xmm4,%xmm3
+	paddd	%xmm1,%xmm2
+	pcmpeqd	%xmm5,%xmm1
+	movdqa	%xmm0,240(%r10)
+	movdqa	%xmm4,%xmm0
+
+	paddd	%xmm2,%xmm3
+	pcmpeqd	%xmm5,%xmm2
+	movdqa	%xmm1,256(%r10)
+	movdqa	%xmm4,%xmm1
+
+	paddd	%xmm3,%xmm0
+	pcmpeqd	%xmm5,%xmm3
+	movdqa	%xmm2,272(%r10)
+	movdqa	%xmm4,%xmm2
+
+	paddd	%xmm0,%xmm1
+	pcmpeqd	%xmm5,%xmm0
+	movdqa	%xmm3,288(%r10)
+	movdqa	%xmm4,%xmm3
+	paddd	%xmm1,%xmm2
+	pcmpeqd	%xmm5,%xmm1
+	movdqa	%xmm0,304(%r10)
+
+	paddd	%xmm2,%xmm3
+.byte	0x67
+	pcmpeqd	%xmm5,%xmm2
+	movdqa	%xmm1,320(%r10)
+
+	pcmpeqd	%xmm5,%xmm3
+	movdqa	%xmm2,336(%r10)
+	pand	64(%r12),%xmm0
+
+	pand	80(%r12),%xmm1
+	pand	96(%r12),%xmm2
+	movdqa	%xmm3,352(%r10)
+	pand	112(%r12),%xmm3
+	por	%xmm2,%xmm0
+	por	%xmm3,%xmm1
+	movdqa	-128(%r12),%xmm4
+	movdqa	-112(%r12),%xmm5
+	movdqa	-96(%r12),%xmm2
+	pand	112(%r10),%xmm4
+	movdqa	-80(%r12),%xmm3
+	pand	128(%r10),%xmm5
+	por	%xmm4,%xmm0
+	pand	144(%r10),%xmm2
+	por	%xmm5,%xmm1
+	pand	160(%r10),%xmm3
+	por	%xmm2,%xmm0
+	por	%xmm3,%xmm1
+	movdqa	-64(%r12),%xmm4
+	movdqa	-48(%r12),%xmm5
+	movdqa	-32(%r12),%xmm2
+	pand	176(%r10),%xmm4
+	movdqa	-16(%r12),%xmm3
+	pand	192(%r10),%xmm5
+	por	%xmm4,%xmm0
+	pand	208(%r10),%xmm2
+	por	%xmm5,%xmm1
+	pand	224(%r10),%xmm3
+	por	%xmm2,%xmm0
+	por	%xmm3,%xmm1
+	movdqa	0(%r12),%xmm4
+	movdqa	16(%r12),%xmm5
+	movdqa	32(%r12),%xmm2
+	pand	240(%r10),%xmm4
+	movdqa	48(%r12),%xmm3
+	pand	256(%r10),%xmm5
+	por	%xmm4,%xmm0
+	pand	272(%r10),%xmm2
+	por	%xmm5,%xmm1
+	pand	288(%r10),%xmm3
+	por	%xmm2,%xmm0
+	por	%xmm3,%xmm1
+	por	%xmm1,%xmm0
+	pshufd	$0x4e,%xmm0,%xmm1
+	por	%xmm1,%xmm0
+	leaq	256(%r12),%r12
+.byte	102,72,15,126,195
+
+	movq	%r13,16+8(%rsp)
+	movq	%rdi,56+8(%rsp)
+
+	movq	(%r8),%r8
+	movq	(%rsi),%rax
+	leaq	(%rsi,%r9,1),%rsi
+	negq	%r9
+
+	movq	%r8,%rbp
+	mulq	%rbx
+	movq	%rax,%r10
+	movq	(%rcx),%rax
+
+	imulq	%r10,%rbp
+	leaq	64+8(%rsp),%r14
+	movq	%rdx,%r11
+
+	mulq	%rbp
+	addq	%rax,%r10
+	movq	8(%rsi,%r9,1),%rax
+	adcq	$0,%rdx
+	movq	%rdx,%rdi
+
+	mulq	%rbx
+	addq	%rax,%r11
+	movq	8(%rcx),%rax
+	adcq	$0,%rdx
+	movq	%rdx,%r10
+
+	mulq	%rbp
+	addq	%rax,%rdi
+	movq	16(%rsi,%r9,1),%rax
+	adcq	$0,%rdx
+	addq	%r11,%rdi
+	leaq	32(%r9),%r15
+	leaq	32(%rcx),%rcx
+	adcq	$0,%rdx
+	movq	%rdi,(%r14)
+	movq	%rdx,%r13
+	jmp	.L1st4x
+
+.align	32
+.L1st4x:
+	mulq	%rbx
+	addq	%rax,%r10
+	movq	-16(%rcx),%rax
+	leaq	32(%r14),%r14
+	adcq	$0,%rdx
+	movq	%rdx,%r11
+
+	mulq	%rbp
+	addq	%rax,%r13
+	movq	-8(%rsi,%r15,1),%rax
+	adcq	$0,%rdx
+	addq	%r10,%r13
+	adcq	$0,%rdx
+	movq	%r13,-24(%r14)
+	movq	%rdx,%rdi
+
+	mulq	%rbx
+	addq	%rax,%r11
+	movq	-8(%rcx),%rax
+	adcq	$0,%rdx
+	movq	%rdx,%r10
+
+	mulq	%rbp
+	addq	%rax,%rdi
+	movq	(%rsi,%r15,1),%rax
+	adcq	$0,%rdx
+	addq	%r11,%rdi
+	adcq	$0,%rdx
+	movq	%rdi,-16(%r14)
+	movq	%rdx,%r13
+
+	mulq	%rbx
+	addq	%rax,%r10
+	movq	0(%rcx),%rax
+	adcq	$0,%rdx
+	movq	%rdx,%r11
+
+	mulq	%rbp
+	addq	%rax,%r13
+	movq	8(%rsi,%r15,1),%rax
+	adcq	$0,%rdx
+	addq	%r10,%r13
+	adcq	$0,%rdx
+	movq	%r13,-8(%r14)
+	movq	%rdx,%rdi
+
+	mulq	%rbx
+	addq	%rax,%r11
+	movq	8(%rcx),%rax
+	adcq	$0,%rdx
+	movq	%rdx,%r10
+
+	mulq	%rbp
+	addq	%rax,%rdi
+	movq	16(%rsi,%r15,1),%rax
+	adcq	$0,%rdx
+	addq	%r11,%rdi
+	leaq	32(%rcx),%rcx
+	adcq	$0,%rdx
+	movq	%rdi,(%r14)
+	movq	%rdx,%r13
+
+	addq	$32,%r15
+	jnz	.L1st4x
+
+	mulq	%rbx
+	addq	%rax,%r10
+	movq	-16(%rcx),%rax
+	leaq	32(%r14),%r14
+	adcq	$0,%rdx
+	movq	%rdx,%r11
+
+	mulq	%rbp
+	addq	%rax,%r13
+	movq	-8(%rsi),%rax
+	adcq	$0,%rdx
+	addq	%r10,%r13
+	adcq	$0,%rdx
+	movq	%r13,-24(%r14)
+	movq	%rdx,%rdi
+
+	mulq	%rbx
+	addq	%rax,%r11
+	movq	-8(%rcx),%rax
+	adcq	$0,%rdx
+	movq	%rdx,%r10
+
+	mulq	%rbp
+	addq	%rax,%rdi
+	movq	(%rsi,%r9,1),%rax
+	adcq	$0,%rdx
+	addq	%r11,%rdi
+	adcq	$0,%rdx
+	movq	%rdi,-16(%r14)
+	movq	%rdx,%r13
+
+	leaq	(%rcx,%r9,1),%rcx
+
+	xorq	%rdi,%rdi
+	addq	%r10,%r13
+	adcq	$0,%rdi
+	movq	%r13,-8(%r14)
+
+	jmp	.Louter4x
+
+.align	32
+.Louter4x:
+	leaq	16+128(%r14),%rdx
+	pxor	%xmm4,%xmm4
+	pxor	%xmm5,%xmm5
+	movdqa	-128(%r12),%xmm0
+	movdqa	-112(%r12),%xmm1
+	movdqa	-96(%r12),%xmm2
+	movdqa	-80(%r12),%xmm3
+	pand	-128(%rdx),%xmm0
+	pand	-112(%rdx),%xmm1
+	por	%xmm0,%xmm4
+	pand	-96(%rdx),%xmm2
+	por	%xmm1,%xmm5
+	pand	-80(%rdx),%xmm3
+	por	%xmm2,%xmm4
+	por	%xmm3,%xmm5
+	movdqa	-64(%r12),%xmm0
+	movdqa	-48(%r12),%xmm1
+	movdqa	-32(%r12),%xmm2
+	movdqa	-16(%r12),%xmm3
+	pand	-64(%rdx),%xmm0
+	pand	-48(%rdx),%xmm1
+	por	%xmm0,%xmm4
+	pand	-32(%rdx),%xmm2
+	por	%xmm1,%xmm5
+	pand	-16(%rdx),%xmm3
+	por	%xmm2,%xmm4
+	por	%xmm3,%xmm5
+	movdqa	0(%r12),%xmm0
+	movdqa	16(%r12),%xmm1
+	movdqa	32(%r12),%xmm2
+	movdqa	48(%r12),%xmm3
+	pand	0(%rdx),%xmm0
+	pand	16(%rdx),%xmm1
+	por	%xmm0,%xmm4
+	pand	32(%rdx),%xmm2
+	por	%xmm1,%xmm5
+	pand	48(%rdx),%xmm3
+	por	%xmm2,%xmm4
+	por	%xmm3,%xmm5
+	movdqa	64(%r12),%xmm0
+	movdqa	80(%r12),%xmm1
+	movdqa	96(%r12),%xmm2
+	movdqa	112(%r12),%xmm3
+	pand	64(%rdx),%xmm0
+	pand	80(%rdx),%xmm1
+	por	%xmm0,%xmm4
+	pand	96(%rdx),%xmm2
+	por	%xmm1,%xmm5
+	pand	112(%rdx),%xmm3
+	por	%xmm2,%xmm4
+	por	%xmm3,%xmm5
+	por	%xmm5,%xmm4
+	pshufd	$0x4e,%xmm4,%xmm0
+	por	%xmm4,%xmm0
+	leaq	256(%r12),%r12
+.byte	102,72,15,126,195
+
+	movq	(%r14,%r9,1),%r10
+	movq	%r8,%rbp
+	mulq	%rbx
+	addq	%rax,%r10
+	movq	(%rcx),%rax
+	adcq	$0,%rdx
+
+	imulq	%r10,%rbp
+	movq	%rdx,%r11
+	movq	%rdi,(%r14)
+
+	leaq	(%r14,%r9,1),%r14
+
+	mulq	%rbp
+	addq	%rax,%r10
+	movq	8(%rsi,%r9,1),%rax
+	adcq	$0,%rdx
+	movq	%rdx,%rdi
+
+	mulq	%rbx
+	addq	%rax,%r11
+	movq	8(%rcx),%rax
+	adcq	$0,%rdx
+	addq	8(%r14),%r11
+	adcq	$0,%rdx
+	movq	%rdx,%r10
+
+	mulq	%rbp
+	addq	%rax,%rdi
+	movq	16(%rsi,%r9,1),%rax
+	adcq	$0,%rdx
+	addq	%r11,%rdi
+	leaq	32(%r9),%r15
+	leaq	32(%rcx),%rcx
+	adcq	$0,%rdx
+	movq	%rdx,%r13
+	jmp	.Linner4x
+
+.align	32
+.Linner4x:
+	mulq	%rbx
+	addq	%rax,%r10
+	movq	-16(%rcx),%rax
+	adcq	$0,%rdx
+	addq	16(%r14),%r10
+	leaq	32(%r14),%r14
+	adcq	$0,%rdx
+	movq	%rdx,%r11
+
+	mulq	%rbp
+	addq	%rax,%r13
+	movq	-8(%rsi,%r15,1),%rax
+	adcq	$0,%rdx
+	addq	%r10,%r13
+	adcq	$0,%rdx
+	movq	%rdi,-32(%r14)
+	movq	%rdx,%rdi
+
+	mulq	%rbx
+	addq	%rax,%r11
+	movq	-8(%rcx),%rax
+	adcq	$0,%rdx
+	addq	-8(%r14),%r11
+	adcq	$0,%rdx
+	movq	%rdx,%r10
+
+	mulq	%rbp
+	addq	%rax,%rdi
+	movq	(%rsi,%r15,1),%rax
+	adcq	$0,%rdx
+	addq	%r11,%rdi
+	adcq	$0,%rdx
+	movq	%r13,-24(%r14)
+	movq	%rdx,%r13
+
+	mulq	%rbx
+	addq	%rax,%r10
+	movq	0(%rcx),%rax
+	adcq	$0,%rdx
+	addq	(%r14),%r10
+	adcq	$0,%rdx
+	movq	%rdx,%r11
+
+	mulq	%rbp
+	addq	%rax,%r13
+	movq	8(%rsi,%r15,1),%rax
+	adcq	$0,%rdx
+	addq	%r10,%r13
+	adcq	$0,%rdx
+	movq	%rdi,-16(%r14)
+	movq	%rdx,%rdi
+
+	mulq	%rbx
+	addq	%rax,%r11
+	movq	8(%rcx),%rax
+	adcq	$0,%rdx
+	addq	8(%r14),%r11
+	adcq	$0,%rdx
+	movq	%rdx,%r10
+
+	mulq	%rbp
+	addq	%rax,%rdi
+	movq	16(%rsi,%r15,1),%rax
+	adcq	$0,%rdx
+	addq	%r11,%rdi
+	leaq	32(%rcx),%rcx
+	adcq	$0,%rdx
+	movq	%r13,-8(%r14)
+	movq	%rdx,%r13
+
+	addq	$32,%r15
+	jnz	.Linner4x
+
+	mulq	%rbx
+	addq	%rax,%r10
+	movq	-16(%rcx),%rax
+	adcq	$0,%rdx
+	addq	16(%r14),%r10
+	leaq	32(%r14),%r14
+	adcq	$0,%rdx
+	movq	%rdx,%r11
+
+	mulq	%rbp
+	addq	%rax,%r13
+	movq	-8(%rsi),%rax
+	adcq	$0,%rdx
+	addq	%r10,%r13
+	adcq	$0,%rdx
+	movq	%rdi,-32(%r14)
+	movq	%rdx,%rdi
+
+	mulq	%rbx
+	addq	%rax,%r11
+	movq	%rbp,%rax
+	movq	-8(%rcx),%rbp
+	adcq	$0,%rdx
+	addq	-8(%r14),%r11
+	adcq	$0,%rdx
+	movq	%rdx,%r10
+
+	mulq	%rbp
+	addq	%rax,%rdi
+	movq	(%rsi,%r9,1),%rax
+	adcq	$0,%rdx
+	addq	%r11,%rdi
+	adcq	$0,%rdx
+	movq	%r13,-24(%r14)
+	movq	%rdx,%r13
+
+	movq	%rdi,-16(%r14)
+	leaq	(%rcx,%r9,1),%rcx
+
+	xorq	%rdi,%rdi
+	addq	%r10,%r13
+	adcq	$0,%rdi
+	addq	(%r14),%r13
+	adcq	$0,%rdi
+	movq	%r13,-8(%r14)
+
+	cmpq	16+8(%rsp),%r12
+	jb	.Louter4x
+	xorq	%rax,%rax
+	subq	%r13,%rbp
+	adcq	%r15,%r15
+	orq	%r15,%rdi
+	subq	%rdi,%rax
+	leaq	(%r14,%r9,1),%rbx
+	movq	(%rcx),%r12
+	leaq	(%rcx),%rbp
+	movq	%r9,%rcx
+	sarq	$3+2,%rcx
+	movq	56+8(%rsp),%rdi
+	decq	%r12
+	xorq	%r10,%r10
+	movq	8(%rbp),%r13
+	movq	16(%rbp),%r14
+	movq	24(%rbp),%r15
+	jmp	.Lsqr4x_sub_entry
+.cfi_endproc	
+.size	mul4x_internal,.-mul4x_internal
+.globl	bn_power5
+.hidden bn_power5
+.type	bn_power5,@function
+.align	32
+bn_power5:
+.cfi_startproc	
+	movq	%rsp,%rax
+.cfi_def_cfa_register	%rax
+	leaq	OPENSSL_ia32cap_P(%rip),%r11
+	movl	8(%r11),%r11d
+	andl	$0x80108,%r11d
+	cmpl	$0x80108,%r11d
+	je	.Lpowerx5_enter
+	pushq	%rbx
+.cfi_offset	%rbx,-16
+	pushq	%rbp
+.cfi_offset	%rbp,-24
+	pushq	%r12
+.cfi_offset	%r12,-32
+	pushq	%r13
+.cfi_offset	%r13,-40
+	pushq	%r14
+.cfi_offset	%r14,-48
+	pushq	%r15
+.cfi_offset	%r15,-56
+.Lpower5_prologue:
+
+	shll	$3,%r9d
+	leal	(%r9,%r9,2),%r10d
+	negq	%r9
+	movq	(%r8),%r8
+
+
+
+
+
+
+
+
+	leaq	-320(%rsp,%r9,2),%r11
+	movq	%rsp,%rbp
+	subq	%rdi,%r11
+	andq	$4095,%r11
+	cmpq	%r11,%r10
+	jb	.Lpwr_sp_alt
+	subq	%r11,%rbp
+	leaq	-320(%rbp,%r9,2),%rbp
+	jmp	.Lpwr_sp_done
+
+.align	32
+.Lpwr_sp_alt:
+	leaq	4096-320(,%r9,2),%r10
+	leaq	-320(%rbp,%r9,2),%rbp
+	subq	%r10,%r11
+	movq	$0,%r10
+	cmovcq	%r10,%r11
+	subq	%r11,%rbp
+.Lpwr_sp_done:
+	andq	$-64,%rbp
+	movq	%rsp,%r11
+	subq	%rbp,%r11
+	andq	$-4096,%r11
+	leaq	(%r11,%rbp,1),%rsp
+	movq	(%rsp),%r10
+	cmpq	%rbp,%rsp
+	ja	.Lpwr_page_walk
+	jmp	.Lpwr_page_walk_done
+
+.Lpwr_page_walk:
+	leaq	-4096(%rsp),%rsp
+	movq	(%rsp),%r10
+	cmpq	%rbp,%rsp
+	ja	.Lpwr_page_walk
+.Lpwr_page_walk_done:
+
+	movq	%r9,%r10
+	negq	%r9
+
+
+
+
+
+
+
+
+
+
+	movq	%r8,32(%rsp)
+	movq	%rax,40(%rsp)
+.cfi_escape	0x0f,0x05,0x77,0x28,0x06,0x23,0x08
+.Lpower5_body:
+.byte	102,72,15,110,207
+.byte	102,72,15,110,209
+.byte	102,73,15,110,218
+.byte	102,72,15,110,226
+
+	call	__bn_sqr8x_internal
+	call	__bn_post4x_internal
+	call	__bn_sqr8x_internal
+	call	__bn_post4x_internal
+	call	__bn_sqr8x_internal
+	call	__bn_post4x_internal
+	call	__bn_sqr8x_internal
+	call	__bn_post4x_internal
+	call	__bn_sqr8x_internal
+	call	__bn_post4x_internal
+
+.byte	102,72,15,126,209
+.byte	102,72,15,126,226
+	movq	%rsi,%rdi
+	movq	40(%rsp),%rax
+	leaq	32(%rsp),%r8
+
+	call	mul4x_internal
+
+	movq	40(%rsp),%rsi
+.cfi_def_cfa	%rsi,8
+	movq	$1,%rax
+	movq	-48(%rsi),%r15
+.cfi_restore	%r15
+	movq	-40(%rsi),%r14
+.cfi_restore	%r14
+	movq	-32(%rsi),%r13
+.cfi_restore	%r13
+	movq	-24(%rsi),%r12
+.cfi_restore	%r12
+	movq	-16(%rsi),%rbp
+.cfi_restore	%rbp
+	movq	-8(%rsi),%rbx
+.cfi_restore	%rbx
+	leaq	(%rsi),%rsp
+.cfi_def_cfa_register	%rsp
+.Lpower5_epilogue:
+	.byte	0xf3,0xc3
+.cfi_endproc	
+.size	bn_power5,.-bn_power5
+
+.globl	bn_sqr8x_internal
+.hidden bn_sqr8x_internal
+.hidden	bn_sqr8x_internal
+.type	bn_sqr8x_internal,@function
+.align	32
+bn_sqr8x_internal:
+__bn_sqr8x_internal:
+.cfi_startproc	
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+	leaq	32(%r10),%rbp
+	leaq	(%rsi,%r9,1),%rsi
+
+	movq	%r9,%rcx
+
+
+	movq	-32(%rsi,%rbp,1),%r14
+	leaq	48+8(%rsp,%r9,2),%rdi
+	movq	-24(%rsi,%rbp,1),%rax
+	leaq	-32(%rdi,%rbp,1),%rdi
+	movq	-16(%rsi,%rbp,1),%rbx
+	movq	%rax,%r15
+
+	mulq	%r14
+	movq	%rax,%r10
+	movq	%rbx,%rax
+	movq	%rdx,%r11
+	movq	%r10,-24(%rdi,%rbp,1)
+
+	mulq	%r14
+	addq	%rax,%r11
+	movq	%rbx,%rax
+	adcq	$0,%rdx
+	movq	%r11,-16(%rdi,%rbp,1)
+	movq	%rdx,%r10
+
+
+	movq	-8(%rsi,%rbp,1),%rbx
+	mulq	%r15
+	movq	%rax,%r12
+	movq	%rbx,%rax
+	movq	%rdx,%r13
+
+	leaq	(%rbp),%rcx
+	mulq	%r14
+	addq	%rax,%r10
+	movq	%rbx,%rax
+	movq	%rdx,%r11
+	adcq	$0,%r11
+	addq	%r12,%r10
+	adcq	$0,%r11
+	movq	%r10,-8(%rdi,%rcx,1)
+	jmp	.Lsqr4x_1st
+
+.align	32
+.Lsqr4x_1st:
+	movq	(%rsi,%rcx,1),%rbx
+	mulq	%r15
+	addq	%rax,%r13
+	movq	%rbx,%rax
+	movq	%rdx,%r12
+	adcq	$0,%r12
+
+	mulq	%r14
+	addq	%rax,%r11
+	movq	%rbx,%rax
+	movq	8(%rsi,%rcx,1),%rbx
+	movq	%rdx,%r10
+	adcq	$0,%r10
+	addq	%r13,%r11
+	adcq	$0,%r10
+
+
+	mulq	%r15
+	addq	%rax,%r12
+	movq	%rbx,%rax
+	movq	%r11,(%rdi,%rcx,1)
+	movq	%rdx,%r13
+	adcq	$0,%r13
+
+	mulq	%r14
+	addq	%rax,%r10
+	movq	%rbx,%rax
+	movq	16(%rsi,%rcx,1),%rbx
+	movq	%rdx,%r11
+	adcq	$0,%r11
+	addq	%r12,%r10
+	adcq	$0,%r11
+
+	mulq	%r15
+	addq	%rax,%r13
+	movq	%rbx,%rax
+	movq	%r10,8(%rdi,%rcx,1)
+	movq	%rdx,%r12
+	adcq	$0,%r12
+
+	mulq	%r14
+	addq	%rax,%r11
+	movq	%rbx,%rax
+	movq	24(%rsi,%rcx,1),%rbx
+	movq	%rdx,%r10
+	adcq	$0,%r10
+	addq	%r13,%r11
+	adcq	$0,%r10
+
+
+	mulq	%r15
+	addq	%rax,%r12
+	movq	%rbx,%rax
+	movq	%r11,16(%rdi,%rcx,1)
+	movq	%rdx,%r13
+	adcq	$0,%r13
+	leaq	32(%rcx),%rcx
+
+	mulq	%r14
+	addq	%rax,%r10
+	movq	%rbx,%rax
+	movq	%rdx,%r11
+	adcq	$0,%r11
+	addq	%r12,%r10
+	adcq	$0,%r11
+	movq	%r10,-8(%rdi,%rcx,1)
+
+	cmpq	$0,%rcx
+	jne	.Lsqr4x_1st
+
+	mulq	%r15
+	addq	%rax,%r13
+	leaq	16(%rbp),%rbp
+	adcq	$0,%rdx
+	addq	%r11,%r13
+	adcq	$0,%rdx
+
+	movq	%r13,(%rdi)
+	movq	%rdx,%r12
+	movq	%rdx,8(%rdi)
+	jmp	.Lsqr4x_outer
+
+.align	32
+.Lsqr4x_outer:
+	movq	-32(%rsi,%rbp,1),%r14
+	leaq	48+8(%rsp,%r9,2),%rdi
+	movq	-24(%rsi,%rbp,1),%rax
+	leaq	-32(%rdi,%rbp,1),%rdi
+	movq	-16(%rsi,%rbp,1),%rbx
+	movq	%rax,%r15
+
+	mulq	%r14
+	movq	-24(%rdi,%rbp,1),%r10
+	addq	%rax,%r10
+	movq	%rbx,%rax
+	adcq	$0,%rdx
+	movq	%r10,-24(%rdi,%rbp,1)
+	movq	%rdx,%r11
+
+	mulq	%r14
+	addq	%rax,%r11
+	movq	%rbx,%rax
+	adcq	$0,%rdx
+	addq	-16(%rdi,%rbp,1),%r11
+	movq	%rdx,%r10
+	adcq	$0,%r10
+	movq	%r11,-16(%rdi,%rbp,1)
+
+	xorq	%r12,%r12
+
+	movq	-8(%rsi,%rbp,1),%rbx
+	mulq	%r15
+	addq	%rax,%r12
+	movq	%rbx,%rax
+	adcq	$0,%rdx
+	addq	-8(%rdi,%rbp,1),%r12
+	movq	%rdx,%r13
+	adcq	$0,%r13
+
+	mulq	%r14
+	addq	%rax,%r10
+	movq	%rbx,%rax
+	adcq	$0,%rdx
+	addq	%r12,%r10
+	movq	%rdx,%r11
+	adcq	$0,%r11
+	movq	%r10,-8(%rdi,%rbp,1)
+
+	leaq	(%rbp),%rcx
+	jmp	.Lsqr4x_inner
+
+.align	32
+.Lsqr4x_inner:
+	movq	(%rsi,%rcx,1),%rbx
+	mulq	%r15
+	addq	%rax,%r13
+	movq	%rbx,%rax
+	movq	%rdx,%r12
+	adcq	$0,%r12
+	addq	(%rdi,%rcx,1),%r13
+	adcq	$0,%r12
+
+.byte	0x67
+	mulq	%r14
+	addq	%rax,%r11
+	movq	%rbx,%rax
+	movq	8(%rsi,%rcx,1),%rbx
+	movq	%rdx,%r10
+	adcq	$0,%r10
+	addq	%r13,%r11
+	adcq	$0,%r10
+
+	mulq	%r15
+	addq	%rax,%r12
+	movq	%r11,(%rdi,%rcx,1)
+	movq	%rbx,%rax
+	movq	%rdx,%r13
+	adcq	$0,%r13
+	addq	8(%rdi,%rcx,1),%r12
+	leaq	16(%rcx),%rcx
+	adcq	$0,%r13
+
+	mulq	%r14
+	addq	%rax,%r10
+	movq	%rbx,%rax
+	adcq	$0,%rdx
+	addq	%r12,%r10
+	movq	%rdx,%r11
+	adcq	$0,%r11
+	movq	%r10,-8(%rdi,%rcx,1)
+
+	cmpq	$0,%rcx
+	jne	.Lsqr4x_inner
+
+.byte	0x67
+	mulq	%r15
+	addq	%rax,%r13
+	adcq	$0,%rdx
+	addq	%r11,%r13
+	adcq	$0,%rdx
+
+	movq	%r13,(%rdi)
+	movq	%rdx,%r12
+	movq	%rdx,8(%rdi)
+
+	addq	$16,%rbp
+	jnz	.Lsqr4x_outer
+
+
+	movq	-32(%rsi),%r14
+	leaq	48+8(%rsp,%r9,2),%rdi
+	movq	-24(%rsi),%rax
+	leaq	-32(%rdi,%rbp,1),%rdi
+	movq	-16(%rsi),%rbx
+	movq	%rax,%r15
+
+	mulq	%r14
+	addq	%rax,%r10
+	movq	%rbx,%rax
+	movq	%rdx,%r11
+	adcq	$0,%r11
+
+	mulq	%r14
+	addq	%rax,%r11
+	movq	%rbx,%rax
+	movq	%r10,-24(%rdi)
+	movq	%rdx,%r10
+	adcq	$0,%r10
+	addq	%r13,%r11
+	movq	-8(%rsi),%rbx
+	adcq	$0,%r10
+
+	mulq	%r15
+	addq	%rax,%r12
+	movq	%rbx,%rax
+	movq	%r11,-16(%rdi)
+	movq	%rdx,%r13
+	adcq	$0,%r13
+
+	mulq	%r14
+	addq	%rax,%r10
+	movq	%rbx,%rax
+	movq	%rdx,%r11
+	adcq	$0,%r11
+	addq	%r12,%r10
+	adcq	$0,%r11
+	movq	%r10,-8(%rdi)
+
+	mulq	%r15
+	addq	%rax,%r13
+	movq	-16(%rsi),%rax
+	adcq	$0,%rdx
+	addq	%r11,%r13
+	adcq	$0,%rdx
+
+	movq	%r13,(%rdi)
+	movq	%rdx,%r12
+	movq	%rdx,8(%rdi)
+
+	mulq	%rbx
+	addq	$16,%rbp
+	xorq	%r14,%r14
+	subq	%r9,%rbp
+	xorq	%r15,%r15
+
+	addq	%r12,%rax
+	adcq	$0,%rdx
+	movq	%rax,8(%rdi)
+	movq	%rdx,16(%rdi)
+	movq	%r15,24(%rdi)
+
+	movq	-16(%rsi,%rbp,1),%rax
+	leaq	48+8(%rsp),%rdi
+	xorq	%r10,%r10
+	movq	8(%rdi),%r11
+
+	leaq	(%r14,%r10,2),%r12
+	shrq	$63,%r10
+	leaq	(%rcx,%r11,2),%r13
+	shrq	$63,%r11
+	orq	%r10,%r13
+	movq	16(%rdi),%r10
+	movq	%r11,%r14
+	mulq	%rax
+	negq	%r15
+	movq	24(%rdi),%r11
+	adcq	%rax,%r12
+	movq	-8(%rsi,%rbp,1),%rax
+	movq	%r12,(%rdi)
+	adcq	%rdx,%r13
+
+	leaq	(%r14,%r10,2),%rbx
+	movq	%r13,8(%rdi)
+	sbbq	%r15,%r15
+	shrq	$63,%r10
+	leaq	(%rcx,%r11,2),%r8
+	shrq	$63,%r11
+	orq	%r10,%r8
+	movq	32(%rdi),%r10
+	movq	%r11,%r14
+	mulq	%rax
+	negq	%r15
+	movq	40(%rdi),%r11
+	adcq	%rax,%rbx
+	movq	0(%rsi,%rbp,1),%rax
+	movq	%rbx,16(%rdi)
+	adcq	%rdx,%r8
+	leaq	16(%rbp),%rbp
+	movq	%r8,24(%rdi)
+	sbbq	%r15,%r15
+	leaq	64(%rdi),%rdi
+	jmp	.Lsqr4x_shift_n_add
+
+.align	32
+.Lsqr4x_shift_n_add:
+	leaq	(%r14,%r10,2),%r12
+	shrq	$63,%r10
+	leaq	(%rcx,%r11,2),%r13
+	shrq	$63,%r11
+	orq	%r10,%r13
+	movq	-16(%rdi),%r10
+	movq	%r11,%r14
+	mulq	%rax
+	negq	%r15
+	movq	-8(%rdi),%r11
+	adcq	%rax,%r12
+	movq	-8(%rsi,%rbp,1),%rax
+	movq	%r12,-32(%rdi)
+	adcq	%rdx,%r13
+
+	leaq	(%r14,%r10,2),%rbx
+	movq	%r13,-24(%rdi)
+	sbbq	%r15,%r15
+	shrq	$63,%r10
+	leaq	(%rcx,%r11,2),%r8
+	shrq	$63,%r11
+	orq	%r10,%r8
+	movq	0(%rdi),%r10
+	movq	%r11,%r14
+	mulq	%rax
+	negq	%r15
+	movq	8(%rdi),%r11
+	adcq	%rax,%rbx
+	movq	0(%rsi,%rbp,1),%rax
+	movq	%rbx,-16(%rdi)
+	adcq	%rdx,%r8
+
+	leaq	(%r14,%r10,2),%r12
+	movq	%r8,-8(%rdi)
+	sbbq	%r15,%r15
+	shrq	$63,%r10
+	leaq	(%rcx,%r11,2),%r13
+	shrq	$63,%r11
+	orq	%r10,%r13
+	movq	16(%rdi),%r10
+	movq	%r11,%r14
+	mulq	%rax
+	negq	%r15
+	movq	24(%rdi),%r11
+	adcq	%rax,%r12
+	movq	8(%rsi,%rbp,1),%rax
+	movq	%r12,0(%rdi)
+	adcq	%rdx,%r13
+
+	leaq	(%r14,%r10,2),%rbx
+	movq	%r13,8(%rdi)
+	sbbq	%r15,%r15
+	shrq	$63,%r10
+	leaq	(%rcx,%r11,2),%r8
+	shrq	$63,%r11
+	orq	%r10,%r8
+	movq	32(%rdi),%r10
+	movq	%r11,%r14
+	mulq	%rax
+	negq	%r15
+	movq	40(%rdi),%r11
+	adcq	%rax,%rbx
+	movq	16(%rsi,%rbp,1),%rax
+	movq	%rbx,16(%rdi)
+	adcq	%rdx,%r8
+	movq	%r8,24(%rdi)
+	sbbq	%r15,%r15
+	leaq	64(%rdi),%rdi
+	addq	$32,%rbp
+	jnz	.Lsqr4x_shift_n_add
+
+	leaq	(%r14,%r10,2),%r12
+.byte	0x67
+	shrq	$63,%r10
+	leaq	(%rcx,%r11,2),%r13
+	shrq	$63,%r11
+	orq	%r10,%r13
+	movq	-16(%rdi),%r10
+	movq	%r11,%r14
+	mulq	%rax
+	negq	%r15
+	movq	-8(%rdi),%r11
+	adcq	%rax,%r12
+	movq	-8(%rsi),%rax
+	movq	%r12,-32(%rdi)
+	adcq	%rdx,%r13
+
+	leaq	(%r14,%r10,2),%rbx
+	movq	%r13,-24(%rdi)
+	sbbq	%r15,%r15
+	shrq	$63,%r10
+	leaq	(%rcx,%r11,2),%r8
+	shrq	$63,%r11
+	orq	%r10,%r8
+	mulq	%rax
+	negq	%r15
+	adcq	%rax,%rbx
+	adcq	%rdx,%r8
+	movq	%rbx,-16(%rdi)
+	movq	%r8,-8(%rdi)
+.byte	102,72,15,126,213
+__bn_sqr8x_reduction:
+	xorq	%rax,%rax
+	leaq	(%r9,%rbp,1),%rcx
+	leaq	48+8(%rsp,%r9,2),%rdx
+	movq	%rcx,0+8(%rsp)
+	leaq	48+8(%rsp,%r9,1),%rdi
+	movq	%rdx,8+8(%rsp)
+	negq	%r9
+	jmp	.L8x_reduction_loop
+
+.align	32
+.L8x_reduction_loop:
+	leaq	(%rdi,%r9,1),%rdi
+.byte	0x66
+	movq	0(%rdi),%rbx
+	movq	8(%rdi),%r9
+	movq	16(%rdi),%r10
+	movq	24(%rdi),%r11
+	movq	32(%rdi),%r12
+	movq	40(%rdi),%r13
+	movq	48(%rdi),%r14
+	movq	56(%rdi),%r15
+	movq	%rax,(%rdx)
+	leaq	64(%rdi),%rdi
+
+.byte	0x67
+	movq	%rbx,%r8
+	imulq	32+8(%rsp),%rbx
+	movq	0(%rbp),%rax
+	movl	$8,%ecx
+	jmp	.L8x_reduce
+
+.align	32
+.L8x_reduce:
+	mulq	%rbx
+	movq	8(%rbp),%rax
+	negq	%r8
+	movq	%rdx,%r8
+	adcq	$0,%r8
+
+	mulq	%rbx
+	addq	%rax,%r9
+	movq	16(%rbp),%rax
+	adcq	$0,%rdx
+	addq	%r9,%r8
+	movq	%rbx,48-8+8(%rsp,%rcx,8)
+	movq	%rdx,%r9
+	adcq	$0,%r9
+
+	mulq	%rbx
+	addq	%rax,%r10
+	movq	24(%rbp),%rax
+	adcq	$0,%rdx
+	addq	%r10,%r9
+	movq	32+8(%rsp),%rsi
+	movq	%rdx,%r10
+	adcq	$0,%r10
+
+	mulq	%rbx
+	addq	%rax,%r11
+	movq	32(%rbp),%rax
+	adcq	$0,%rdx
+	imulq	%r8,%rsi
+	addq	%r11,%r10
+	movq	%rdx,%r11
+	adcq	$0,%r11
+
+	mulq	%rbx
+	addq	%rax,%r12
+	movq	40(%rbp),%rax
+	adcq	$0,%rdx
+	addq	%r12,%r11
+	movq	%rdx,%r12
+	adcq	$0,%r12
+
+	mulq	%rbx
+	addq	%rax,%r13
+	movq	48(%rbp),%rax
+	adcq	$0,%rdx
+	addq	%r13,%r12
+	movq	%rdx,%r13
+	adcq	$0,%r13
+
+	mulq	%rbx
+	addq	%rax,%r14
+	movq	56(%rbp),%rax
+	adcq	$0,%rdx
+	addq	%r14,%r13
+	movq	%rdx,%r14
+	adcq	$0,%r14
+
+	mulq	%rbx
+	movq	%rsi,%rbx
+	addq	%rax,%r15
+	movq	0(%rbp),%rax
+	adcq	$0,%rdx
+	addq	%r15,%r14
+	movq	%rdx,%r15
+	adcq	$0,%r15
+
+	decl	%ecx
+	jnz	.L8x_reduce
+
+	leaq	64(%rbp),%rbp
+	xorq	%rax,%rax
+	movq	8+8(%rsp),%rdx
+	cmpq	0+8(%rsp),%rbp
+	jae	.L8x_no_tail
+
+.byte	0x66
+	addq	0(%rdi),%r8
+	adcq	8(%rdi),%r9
+	adcq	16(%rdi),%r10
+	adcq	24(%rdi),%r11
+	adcq	32(%rdi),%r12
+	adcq	40(%rdi),%r13
+	adcq	48(%rdi),%r14
+	adcq	56(%rdi),%r15
+	sbbq	%rsi,%rsi
+
+	movq	48+56+8(%rsp),%rbx
+	movl	$8,%ecx
+	movq	0(%rbp),%rax
+	jmp	.L8x_tail
+
+.align	32
+.L8x_tail:
+	mulq	%rbx
+	addq	%rax,%r8
+	movq	8(%rbp),%rax
+	movq	%r8,(%rdi)
+	movq	%rdx,%r8
+	adcq	$0,%r8
+
+	mulq	%rbx
+	addq	%rax,%r9
+	movq	16(%rbp),%rax
+	adcq	$0,%rdx
+	addq	%r9,%r8
+	leaq	8(%rdi),%rdi
+	movq	%rdx,%r9
+	adcq	$0,%r9
+
+	mulq	%rbx
+	addq	%rax,%r10
+	movq	24(%rbp),%rax
+	adcq	$0,%rdx
+	addq	%r10,%r9
+	movq	%rdx,%r10
+	adcq	$0,%r10
+
+	mulq	%rbx
+	addq	%rax,%r11
+	movq	32(%rbp),%rax
+	adcq	$0,%rdx
+	addq	%r11,%r10
+	movq	%rdx,%r11
+	adcq	$0,%r11
+
+	mulq	%rbx
+	addq	%rax,%r12
+	movq	40(%rbp),%rax
+	adcq	$0,%rdx
+	addq	%r12,%r11
+	movq	%rdx,%r12
+	adcq	$0,%r12
+
+	mulq	%rbx
+	addq	%rax,%r13
+	movq	48(%rbp),%rax
+	adcq	$0,%rdx
+	addq	%r13,%r12
+	movq	%rdx,%r13
+	adcq	$0,%r13
+
+	mulq	%rbx
+	addq	%rax,%r14
+	movq	56(%rbp),%rax
+	adcq	$0,%rdx
+	addq	%r14,%r13
+	movq	%rdx,%r14
+	adcq	$0,%r14
+
+	mulq	%rbx
+	movq	48-16+8(%rsp,%rcx,8),%rbx
+	addq	%rax,%r15
+	adcq	$0,%rdx
+	addq	%r15,%r14
+	movq	0(%rbp),%rax
+	movq	%rdx,%r15
+	adcq	$0,%r15
+
+	decl	%ecx
+	jnz	.L8x_tail
+
+	leaq	64(%rbp),%rbp
+	movq	8+8(%rsp),%rdx
+	cmpq	0+8(%rsp),%rbp
+	jae	.L8x_tail_done
+
+	movq	48+56+8(%rsp),%rbx
+	negq	%rsi
+	movq	0(%rbp),%rax
+	adcq	0(%rdi),%r8
+	adcq	8(%rdi),%r9
+	adcq	16(%rdi),%r10
+	adcq	24(%rdi),%r11
+	adcq	32(%rdi),%r12
+	adcq	40(%rdi),%r13
+	adcq	48(%rdi),%r14
+	adcq	56(%rdi),%r15
+	sbbq	%rsi,%rsi
+
+	movl	$8,%ecx
+	jmp	.L8x_tail
+
+.align	32
+.L8x_tail_done:
+	xorq	%rax,%rax
+	addq	(%rdx),%r8
+	adcq	$0,%r9
+	adcq	$0,%r10
+	adcq	$0,%r11
+	adcq	$0,%r12
+	adcq	$0,%r13
+	adcq	$0,%r14
+	adcq	$0,%r15
+	adcq	$0,%rax
+
+	negq	%rsi
+.L8x_no_tail:
+	adcq	0(%rdi),%r8
+	adcq	8(%rdi),%r9
+	adcq	16(%rdi),%r10
+	adcq	24(%rdi),%r11
+	adcq	32(%rdi),%r12
+	adcq	40(%rdi),%r13
+	adcq	48(%rdi),%r14
+	adcq	56(%rdi),%r15
+	adcq	$0,%rax
+	movq	-8(%rbp),%rcx
+	xorq	%rsi,%rsi
+
+.byte	102,72,15,126,213
+
+	movq	%r8,0(%rdi)
+	movq	%r9,8(%rdi)
+.byte	102,73,15,126,217
+	movq	%r10,16(%rdi)
+	movq	%r11,24(%rdi)
+	movq	%r12,32(%rdi)
+	movq	%r13,40(%rdi)
+	movq	%r14,48(%rdi)
+	movq	%r15,56(%rdi)
+	leaq	64(%rdi),%rdi
+
+	cmpq	%rdx,%rdi
+	jb	.L8x_reduction_loop
+	.byte	0xf3,0xc3
+.cfi_endproc	
+.size	bn_sqr8x_internal,.-bn_sqr8x_internal
+.type	__bn_post4x_internal,@function
+.align	32
+__bn_post4x_internal:
+.cfi_startproc	
+	movq	0(%rbp),%r12
+	leaq	(%rdi,%r9,1),%rbx
+	movq	%r9,%rcx
+.byte	102,72,15,126,207
+	negq	%rax
+.byte	102,72,15,126,206
+	sarq	$3+2,%rcx
+	decq	%r12
+	xorq	%r10,%r10
+	movq	8(%rbp),%r13
+	movq	16(%rbp),%r14
+	movq	24(%rbp),%r15
+	jmp	.Lsqr4x_sub_entry
+
+.align	16
+.Lsqr4x_sub:
+	movq	0(%rbp),%r12
+	movq	8(%rbp),%r13
+	movq	16(%rbp),%r14
+	movq	24(%rbp),%r15
+.Lsqr4x_sub_entry:
+	leaq	32(%rbp),%rbp
+	notq	%r12
+	notq	%r13
+	notq	%r14
+	notq	%r15
+	andq	%rax,%r12
+	andq	%rax,%r13
+	andq	%rax,%r14
+	andq	%rax,%r15
+
+	negq	%r10
+	adcq	0(%rbx),%r12
+	adcq	8(%rbx),%r13
+	adcq	16(%rbx),%r14
+	adcq	24(%rbx),%r15
+	movq	%r12,0(%rdi)
+	leaq	32(%rbx),%rbx
+	movq	%r13,8(%rdi)
+	sbbq	%r10,%r10
+	movq	%r14,16(%rdi)
+	movq	%r15,24(%rdi)
+	leaq	32(%rdi),%rdi
+
+	incq	%rcx
+	jnz	.Lsqr4x_sub
+
+	movq	%r9,%r10
+	negq	%r9
+	.byte	0xf3,0xc3
+.cfi_endproc	
+.size	__bn_post4x_internal,.-__bn_post4x_internal
+.globl	bn_from_montgomery
+.hidden bn_from_montgomery
+.type	bn_from_montgomery,@function
+.align	32
+bn_from_montgomery:
+.cfi_startproc	
+	testl	$7,%r9d
+	jz	bn_from_mont8x
+	xorl	%eax,%eax
+	.byte	0xf3,0xc3
+.cfi_endproc	
+.size	bn_from_montgomery,.-bn_from_montgomery
+
+.type	bn_from_mont8x,@function
+.align	32
+bn_from_mont8x:
+.cfi_startproc	
+.byte	0x67
+	movq	%rsp,%rax
+.cfi_def_cfa_register	%rax
+	pushq	%rbx
+.cfi_offset	%rbx,-16
+	pushq	%rbp
+.cfi_offset	%rbp,-24
+	pushq	%r12
+.cfi_offset	%r12,-32
+	pushq	%r13
+.cfi_offset	%r13,-40
+	pushq	%r14
+.cfi_offset	%r14,-48
+	pushq	%r15
+.cfi_offset	%r15,-56
+.Lfrom_prologue:
+
+	shll	$3,%r9d
+	leaq	(%r9,%r9,2),%r10
+	negq	%r9
+	movq	(%r8),%r8
+
+
+
+
+
+
+
+
+	leaq	-320(%rsp,%r9,2),%r11
+	movq	%rsp,%rbp
+	subq	%rdi,%r11
+	andq	$4095,%r11
+	cmpq	%r11,%r10
+	jb	.Lfrom_sp_alt
+	subq	%r11,%rbp
+	leaq	-320(%rbp,%r9,2),%rbp
+	jmp	.Lfrom_sp_done
+
+.align	32
+.Lfrom_sp_alt:
+	leaq	4096-320(,%r9,2),%r10
+	leaq	-320(%rbp,%r9,2),%rbp
+	subq	%r10,%r11
+	movq	$0,%r10
+	cmovcq	%r10,%r11
+	subq	%r11,%rbp
+.Lfrom_sp_done:
+	andq	$-64,%rbp
+	movq	%rsp,%r11
+	subq	%rbp,%r11
+	andq	$-4096,%r11
+	leaq	(%r11,%rbp,1),%rsp
+	movq	(%rsp),%r10
+	cmpq	%rbp,%rsp
+	ja	.Lfrom_page_walk
+	jmp	.Lfrom_page_walk_done
+
+.Lfrom_page_walk:
+	leaq	-4096(%rsp),%rsp
+	movq	(%rsp),%r10
+	cmpq	%rbp,%rsp
+	ja	.Lfrom_page_walk
+.Lfrom_page_walk_done:
+
+	movq	%r9,%r10
+	negq	%r9
+
+
+
+
+
+
+
+
+
+
+	movq	%r8,32(%rsp)
+	movq	%rax,40(%rsp)
+.cfi_escape	0x0f,0x05,0x77,0x28,0x06,0x23,0x08
+.Lfrom_body:
+	movq	%r9,%r11
+	leaq	48(%rsp),%rax
+	pxor	%xmm0,%xmm0
+	jmp	.Lmul_by_1
+
+.align	32
+.Lmul_by_1:
+	movdqu	(%rsi),%xmm1
+	movdqu	16(%rsi),%xmm2
+	movdqu	32(%rsi),%xmm3
+	movdqa	%xmm0,(%rax,%r9,1)
+	movdqu	48(%rsi),%xmm4
+	movdqa	%xmm0,16(%rax,%r9,1)
+.byte	0x48,0x8d,0xb6,0x40,0x00,0x00,0x00
+	movdqa	%xmm1,(%rax)
+	movdqa	%xmm0,32(%rax,%r9,1)
+	movdqa	%xmm2,16(%rax)
+	movdqa	%xmm0,48(%rax,%r9,1)
+	movdqa	%xmm3,32(%rax)
+	movdqa	%xmm4,48(%rax)
+	leaq	64(%rax),%rax
+	subq	$64,%r11
+	jnz	.Lmul_by_1
+
+.byte	102,72,15,110,207
+.byte	102,72,15,110,209
+.byte	0x67
+	movq	%rcx,%rbp
+.byte	102,73,15,110,218
+	leaq	OPENSSL_ia32cap_P(%rip),%r11
+	movl	8(%r11),%r11d
+	andl	$0x80108,%r11d
+	cmpl	$0x80108,%r11d
+	jne	.Lfrom_mont_nox
+
+	leaq	(%rax,%r9,1),%rdi
+	call	__bn_sqrx8x_reduction
+	call	__bn_postx4x_internal
+
+	pxor	%xmm0,%xmm0
+	leaq	48(%rsp),%rax
+	jmp	.Lfrom_mont_zero
+
+.align	32
+.Lfrom_mont_nox:
+	call	__bn_sqr8x_reduction
+	call	__bn_post4x_internal
+
+	pxor	%xmm0,%xmm0
+	leaq	48(%rsp),%rax
+	jmp	.Lfrom_mont_zero
+
+.align	32
+.Lfrom_mont_zero:
+	movq	40(%rsp),%rsi
+.cfi_def_cfa	%rsi,8
+	movdqa	%xmm0,0(%rax)
+	movdqa	%xmm0,16(%rax)
+	movdqa	%xmm0,32(%rax)
+	movdqa	%xmm0,48(%rax)
+	leaq	64(%rax),%rax
+	subq	$32,%r9
+	jnz	.Lfrom_mont_zero
+
+	movq	$1,%rax
+	movq	-48(%rsi),%r15
+.cfi_restore	%r15
+	movq	-40(%rsi),%r14
+.cfi_restore	%r14
+	movq	-32(%rsi),%r13
+.cfi_restore	%r13
+	movq	-24(%rsi),%r12
+.cfi_restore	%r12
+	movq	-16(%rsi),%rbp
+.cfi_restore	%rbp
+	movq	-8(%rsi),%rbx
+.cfi_restore	%rbx
+	leaq	(%rsi),%rsp
+.cfi_def_cfa_register	%rsp
+.Lfrom_epilogue:
+	.byte	0xf3,0xc3
+.cfi_endproc	
+.size	bn_from_mont8x,.-bn_from_mont8x
+.type	bn_mulx4x_mont_gather5,@function
+.align	32
+bn_mulx4x_mont_gather5:
+.cfi_startproc	
+	movq	%rsp,%rax
+.cfi_def_cfa_register	%rax
+.Lmulx4x_enter:
+	pushq	%rbx
+.cfi_offset	%rbx,-16
+	pushq	%rbp
+.cfi_offset	%rbp,-24
+	pushq	%r12
+.cfi_offset	%r12,-32
+	pushq	%r13
+.cfi_offset	%r13,-40
+	pushq	%r14
+.cfi_offset	%r14,-48
+	pushq	%r15
+.cfi_offset	%r15,-56
+.Lmulx4x_prologue:
+
+	shll	$3,%r9d
+	leaq	(%r9,%r9,2),%r10
+	negq	%r9
+	movq	(%r8),%r8
+
+
+
+
+
+
+
+
+
+
+	leaq	-320(%rsp,%r9,2),%r11
+	movq	%rsp,%rbp
+	subq	%rdi,%r11
+	andq	$4095,%r11
+	cmpq	%r11,%r10
+	jb	.Lmulx4xsp_alt
+	subq	%r11,%rbp
+	leaq	-320(%rbp,%r9,2),%rbp
+	jmp	.Lmulx4xsp_done
+
+.Lmulx4xsp_alt:
+	leaq	4096-320(,%r9,2),%r10
+	leaq	-320(%rbp,%r9,2),%rbp
+	subq	%r10,%r11
+	movq	$0,%r10
+	cmovcq	%r10,%r11
+	subq	%r11,%rbp
+.Lmulx4xsp_done:
+	andq	$-64,%rbp
+	movq	%rsp,%r11
+	subq	%rbp,%r11
+	andq	$-4096,%r11
+	leaq	(%r11,%rbp,1),%rsp
+	movq	(%rsp),%r10
+	cmpq	%rbp,%rsp
+	ja	.Lmulx4x_page_walk
+	jmp	.Lmulx4x_page_walk_done
+
+.Lmulx4x_page_walk:
+	leaq	-4096(%rsp),%rsp
+	movq	(%rsp),%r10
+	cmpq	%rbp,%rsp
+	ja	.Lmulx4x_page_walk
+.Lmulx4x_page_walk_done:
+
+
+
+
+
+
+
+
+
+
+
+
+
+	movq	%r8,32(%rsp)
+	movq	%rax,40(%rsp)
+.cfi_escape	0x0f,0x05,0x77,0x28,0x06,0x23,0x08
+.Lmulx4x_body:
+	call	mulx4x_internal
+
+	movq	40(%rsp),%rsi
+.cfi_def_cfa	%rsi,8
+	movq	$1,%rax
+
+	movq	-48(%rsi),%r15
+.cfi_restore	%r15
+	movq	-40(%rsi),%r14
+.cfi_restore	%r14
+	movq	-32(%rsi),%r13
+.cfi_restore	%r13
+	movq	-24(%rsi),%r12
+.cfi_restore	%r12
+	movq	-16(%rsi),%rbp
+.cfi_restore	%rbp
+	movq	-8(%rsi),%rbx
+.cfi_restore	%rbx
+	leaq	(%rsi),%rsp
+.cfi_def_cfa_register	%rsp
+.Lmulx4x_epilogue:
+	.byte	0xf3,0xc3
+.cfi_endproc	
+.size	bn_mulx4x_mont_gather5,.-bn_mulx4x_mont_gather5
+
+.type	mulx4x_internal,@function
+.align	32
+mulx4x_internal:
+.cfi_startproc	
+	movq	%r9,8(%rsp)
+	movq	%r9,%r10
+	negq	%r9
+	shlq	$5,%r9
+	negq	%r10
+	leaq	128(%rdx,%r9,1),%r13
+	shrq	$5+5,%r9
+	movd	8(%rax),%xmm5
+	subq	$1,%r9
+	leaq	.Linc(%rip),%rax
+	movq	%r13,16+8(%rsp)
+	movq	%r9,24+8(%rsp)
+	movq	%rdi,56+8(%rsp)
+	movdqa	0(%rax),%xmm0
+	movdqa	16(%rax),%xmm1
+	leaq	88-112(%rsp,%r10,1),%r10
+	leaq	128(%rdx),%rdi
+
+	pshufd	$0,%xmm5,%xmm5
+	movdqa	%xmm1,%xmm4
+.byte	0x67
+	movdqa	%xmm1,%xmm2
+.byte	0x67
+	paddd	%xmm0,%xmm1
+	pcmpeqd	%xmm5,%xmm0
+	movdqa	%xmm4,%xmm3
+	paddd	%xmm1,%xmm2
+	pcmpeqd	%xmm5,%xmm1
+	movdqa	%xmm0,112(%r10)
+	movdqa	%xmm4,%xmm0
+
+	paddd	%xmm2,%xmm3
+	pcmpeqd	%xmm5,%xmm2
+	movdqa	%xmm1,128(%r10)
+	movdqa	%xmm4,%xmm1
+
+	paddd	%xmm3,%xmm0
+	pcmpeqd	%xmm5,%xmm3
+	movdqa	%xmm2,144(%r10)
+	movdqa	%xmm4,%xmm2
+
+	paddd	%xmm0,%xmm1
+	pcmpeqd	%xmm5,%xmm0
+	movdqa	%xmm3,160(%r10)
+	movdqa	%xmm4,%xmm3
+	paddd	%xmm1,%xmm2
+	pcmpeqd	%xmm5,%xmm1
+	movdqa	%xmm0,176(%r10)
+	movdqa	%xmm4,%xmm0
+
+	paddd	%xmm2,%xmm3
+	pcmpeqd	%xmm5,%xmm2
+	movdqa	%xmm1,192(%r10)
+	movdqa	%xmm4,%xmm1
+
+	paddd	%xmm3,%xmm0
+	pcmpeqd	%xmm5,%xmm3
+	movdqa	%xmm2,208(%r10)
+	movdqa	%xmm4,%xmm2
+
+	paddd	%xmm0,%xmm1
+	pcmpeqd	%xmm5,%xmm0
+	movdqa	%xmm3,224(%r10)
+	movdqa	%xmm4,%xmm3
+	paddd	%xmm1,%xmm2
+	pcmpeqd	%xmm5,%xmm1
+	movdqa	%xmm0,240(%r10)
+	movdqa	%xmm4,%xmm0
+
+	paddd	%xmm2,%xmm3
+	pcmpeqd	%xmm5,%xmm2
+	movdqa	%xmm1,256(%r10)
+	movdqa	%xmm4,%xmm1
+
+	paddd	%xmm3,%xmm0
+	pcmpeqd	%xmm5,%xmm3
+	movdqa	%xmm2,272(%r10)
+	movdqa	%xmm4,%xmm2
+
+	paddd	%xmm0,%xmm1
+	pcmpeqd	%xmm5,%xmm0
+	movdqa	%xmm3,288(%r10)
+	movdqa	%xmm4,%xmm3
+.byte	0x67
+	paddd	%xmm1,%xmm2
+	pcmpeqd	%xmm5,%xmm1
+	movdqa	%xmm0,304(%r10)
+
+	paddd	%xmm2,%xmm3
+	pcmpeqd	%xmm5,%xmm2
+	movdqa	%xmm1,320(%r10)
+
+	pcmpeqd	%xmm5,%xmm3
+	movdqa	%xmm2,336(%r10)
+
+	pand	64(%rdi),%xmm0
+	pand	80(%rdi),%xmm1
+	pand	96(%rdi),%xmm2
+	movdqa	%xmm3,352(%r10)
+	pand	112(%rdi),%xmm3
+	por	%xmm2,%xmm0
+	por	%xmm3,%xmm1
+	movdqa	-128(%rdi),%xmm4
+	movdqa	-112(%rdi),%xmm5
+	movdqa	-96(%rdi),%xmm2
+	pand	112(%r10),%xmm4
+	movdqa	-80(%rdi),%xmm3
+	pand	128(%r10),%xmm5
+	por	%xmm4,%xmm0
+	pand	144(%r10),%xmm2
+	por	%xmm5,%xmm1
+	pand	160(%r10),%xmm3
+	por	%xmm2,%xmm0
+	por	%xmm3,%xmm1
+	movdqa	-64(%rdi),%xmm4
+	movdqa	-48(%rdi),%xmm5
+	movdqa	-32(%rdi),%xmm2
+	pand	176(%r10),%xmm4
+	movdqa	-16(%rdi),%xmm3
+	pand	192(%r10),%xmm5
+	por	%xmm4,%xmm0
+	pand	208(%r10),%xmm2
+	por	%xmm5,%xmm1
+	pand	224(%r10),%xmm3
+	por	%xmm2,%xmm0
+	por	%xmm3,%xmm1
+	movdqa	0(%rdi),%xmm4
+	movdqa	16(%rdi),%xmm5
+	movdqa	32(%rdi),%xmm2
+	pand	240(%r10),%xmm4
+	movdqa	48(%rdi),%xmm3
+	pand	256(%r10),%xmm5
+	por	%xmm4,%xmm0
+	pand	272(%r10),%xmm2
+	por	%xmm5,%xmm1
+	pand	288(%r10),%xmm3
+	por	%xmm2,%xmm0
+	por	%xmm3,%xmm1
+	pxor	%xmm1,%xmm0
+	pshufd	$0x4e,%xmm0,%xmm1
+	por	%xmm1,%xmm0
+	leaq	256(%rdi),%rdi
+.byte	102,72,15,126,194
+	leaq	64+32+8(%rsp),%rbx
+
+	movq	%rdx,%r9
+	mulxq	0(%rsi),%r8,%rax
+	mulxq	8(%rsi),%r11,%r12
+	addq	%rax,%r11
+	mulxq	16(%rsi),%rax,%r13
+	adcq	%rax,%r12
+	adcq	$0,%r13
+	mulxq	24(%rsi),%rax,%r14
+
+	movq	%r8,%r15
+	imulq	32+8(%rsp),%r8
+	xorq	%rbp,%rbp
+	movq	%r8,%rdx
+
+	movq	%rdi,8+8(%rsp)
+
+	leaq	32(%rsi),%rsi
+	adcxq	%rax,%r13
+	adcxq	%rbp,%r14
+
+	mulxq	0(%rcx),%rax,%r10
+	adcxq	%rax,%r15
+	adoxq	%r11,%r10
+	mulxq	8(%rcx),%rax,%r11
+	adcxq	%rax,%r10
+	adoxq	%r12,%r11
+	mulxq	16(%rcx),%rax,%r12
+	movq	24+8(%rsp),%rdi
+	movq	%r10,-32(%rbx)
+	adcxq	%rax,%r11
+	adoxq	%r13,%r12
+	mulxq	24(%rcx),%rax,%r15
+	movq	%r9,%rdx
+	movq	%r11,-24(%rbx)
+	adcxq	%rax,%r12
+	adoxq	%rbp,%r15
+	leaq	32(%rcx),%rcx
+	movq	%r12,-16(%rbx)
+	jmp	.Lmulx4x_1st
+
+.align	32
+.Lmulx4x_1st:
+	adcxq	%rbp,%r15
+	mulxq	0(%rsi),%r10,%rax
+	adcxq	%r14,%r10
+	mulxq	8(%rsi),%r11,%r14
+	adcxq	%rax,%r11
+	mulxq	16(%rsi),%r12,%rax
+	adcxq	%r14,%r12
+	mulxq	24(%rsi),%r13,%r14
+.byte	0x67,0x67
+	movq	%r8,%rdx
+	adcxq	%rax,%r13
+	adcxq	%rbp,%r14
+	leaq	32(%rsi),%rsi
+	leaq	32(%rbx),%rbx
+
+	adoxq	%r15,%r10
+	mulxq	0(%rcx),%rax,%r15
+	adcxq	%rax,%r10
+	adoxq	%r15,%r11
+	mulxq	8(%rcx),%rax,%r15
+	adcxq	%rax,%r11
+	adoxq	%r15,%r12
+	mulxq	16(%rcx),%rax,%r15
+	movq	%r10,-40(%rbx)
+	adcxq	%rax,%r12
+	movq	%r11,-32(%rbx)
+	adoxq	%r15,%r13
+	mulxq	24(%rcx),%rax,%r15
+	movq	%r9,%rdx
+	movq	%r12,-24(%rbx)
+	adcxq	%rax,%r13
+	adoxq	%rbp,%r15
+	leaq	32(%rcx),%rcx
+	movq	%r13,-16(%rbx)
+
+	decq	%rdi
+	jnz	.Lmulx4x_1st
+
+	movq	8(%rsp),%rax
+	adcq	%rbp,%r15
+	leaq	(%rsi,%rax,1),%rsi
+	addq	%r15,%r14
+	movq	8+8(%rsp),%rdi
+	adcq	%rbp,%rbp
+	movq	%r14,-8(%rbx)
+	jmp	.Lmulx4x_outer
+
+.align	32
+.Lmulx4x_outer:
+	leaq	16-256(%rbx),%r10
+	pxor	%xmm4,%xmm4
+.byte	0x67,0x67
+	pxor	%xmm5,%xmm5
+	movdqa	-128(%rdi),%xmm0
+	movdqa	-112(%rdi),%xmm1
+	movdqa	-96(%rdi),%xmm2
+	pand	256(%r10),%xmm0
+	movdqa	-80(%rdi),%xmm3
+	pand	272(%r10),%xmm1
+	por	%xmm0,%xmm4
+	pand	288(%r10),%xmm2
+	por	%xmm1,%xmm5
+	pand	304(%r10),%xmm3
+	por	%xmm2,%xmm4
+	por	%xmm3,%xmm5
+	movdqa	-64(%rdi),%xmm0
+	movdqa	-48(%rdi),%xmm1
+	movdqa	-32(%rdi),%xmm2
+	pand	320(%r10),%xmm0
+	movdqa	-16(%rdi),%xmm3
+	pand	336(%r10),%xmm1
+	por	%xmm0,%xmm4
+	pand	352(%r10),%xmm2
+	por	%xmm1,%xmm5
+	pand	368(%r10),%xmm3
+	por	%xmm2,%xmm4
+	por	%xmm3,%xmm5
+	movdqa	0(%rdi),%xmm0
+	movdqa	16(%rdi),%xmm1
+	movdqa	32(%rdi),%xmm2
+	pand	384(%r10),%xmm0
+	movdqa	48(%rdi),%xmm3
+	pand	400(%r10),%xmm1
+	por	%xmm0,%xmm4
+	pand	416(%r10),%xmm2
+	por	%xmm1,%xmm5
+	pand	432(%r10),%xmm3
+	por	%xmm2,%xmm4
+	por	%xmm3,%xmm5
+	movdqa	64(%rdi),%xmm0
+	movdqa	80(%rdi),%xmm1
+	movdqa	96(%rdi),%xmm2
+	pand	448(%r10),%xmm0
+	movdqa	112(%rdi),%xmm3
+	pand	464(%r10),%xmm1
+	por	%xmm0,%xmm4
+	pand	480(%r10),%xmm2
+	por	%xmm1,%xmm5
+	pand	496(%r10),%xmm3
+	por	%xmm2,%xmm4
+	por	%xmm3,%xmm5
+	por	%xmm5,%xmm4
+	pshufd	$0x4e,%xmm4,%xmm0
+	por	%xmm4,%xmm0
+	leaq	256(%rdi),%rdi
+.byte	102,72,15,126,194
+
+	movq	%rbp,(%rbx)
+	leaq	32(%rbx,%rax,1),%rbx
+	mulxq	0(%rsi),%r8,%r11
+	xorq	%rbp,%rbp
+	movq	%rdx,%r9
+	mulxq	8(%rsi),%r14,%r12
+	adoxq	-32(%rbx),%r8
+	adcxq	%r14,%r11
+	mulxq	16(%rsi),%r15,%r13
+	adoxq	-24(%rbx),%r11
+	adcxq	%r15,%r12
+	mulxq	24(%rsi),%rdx,%r14
+	adoxq	-16(%rbx),%r12
+	adcxq	%rdx,%r13
+	leaq	(%rcx,%rax,1),%rcx
+	leaq	32(%rsi),%rsi
+	adoxq	-8(%rbx),%r13
+	adcxq	%rbp,%r14
+	adoxq	%rbp,%r14
+
+	movq	%r8,%r15
+	imulq	32+8(%rsp),%r8
+
+	movq	%r8,%rdx
+	xorq	%rbp,%rbp
+	movq	%rdi,8+8(%rsp)
+
+	mulxq	0(%rcx),%rax,%r10
+	adcxq	%rax,%r15
+	adoxq	%r11,%r10
+	mulxq	8(%rcx),%rax,%r11
+	adcxq	%rax,%r10
+	adoxq	%r12,%r11
+	mulxq	16(%rcx),%rax,%r12
+	adcxq	%rax,%r11
+	adoxq	%r13,%r12
+	mulxq	24(%rcx),%rax,%r15
+	movq	%r9,%rdx
+	movq	24+8(%rsp),%rdi
+	movq	%r10,-32(%rbx)
+	adcxq	%rax,%r12
+	movq	%r11,-24(%rbx)
+	adoxq	%rbp,%r15
+	movq	%r12,-16(%rbx)
+	leaq	32(%rcx),%rcx
+	jmp	.Lmulx4x_inner
+
+.align	32
+.Lmulx4x_inner:
+	mulxq	0(%rsi),%r10,%rax
+	adcxq	%rbp,%r15
+	adoxq	%r14,%r10
+	mulxq	8(%rsi),%r11,%r14
+	adcxq	0(%rbx),%r10
+	adoxq	%rax,%r11
+	mulxq	16(%rsi),%r12,%rax
+	adcxq	8(%rbx),%r11
+	adoxq	%r14,%r12
+	mulxq	24(%rsi),%r13,%r14
+	movq	%r8,%rdx
+	adcxq	16(%rbx),%r12
+	adoxq	%rax,%r13
+	adcxq	24(%rbx),%r13
+	adoxq	%rbp,%r14
+	leaq	32(%rsi),%rsi
+	leaq	32(%rbx),%rbx
+	adcxq	%rbp,%r14
+
+	adoxq	%r15,%r10
+	mulxq	0(%rcx),%rax,%r15
+	adcxq	%rax,%r10
+	adoxq	%r15,%r11
+	mulxq	8(%rcx),%rax,%r15
+	adcxq	%rax,%r11
+	adoxq	%r15,%r12
+	mulxq	16(%rcx),%rax,%r15
+	movq	%r10,-40(%rbx)
+	adcxq	%rax,%r12
+	adoxq	%r15,%r13
+	movq	%r11,-32(%rbx)
+	mulxq	24(%rcx),%rax,%r15
+	movq	%r9,%rdx
+	leaq	32(%rcx),%rcx
+	movq	%r12,-24(%rbx)
+	adcxq	%rax,%r13
+	adoxq	%rbp,%r15
+	movq	%r13,-16(%rbx)
+
+	decq	%rdi
+	jnz	.Lmulx4x_inner
+
+	movq	0+8(%rsp),%rax
+	adcq	%rbp,%r15
+	subq	0(%rbx),%rdi
+	movq	8+8(%rsp),%rdi
+	movq	16+8(%rsp),%r10
+	adcq	%r15,%r14
+	leaq	(%rsi,%rax,1),%rsi
+	adcq	%rbp,%rbp
+	movq	%r14,-8(%rbx)
+
+	cmpq	%r10,%rdi
+	jb	.Lmulx4x_outer
+
+	movq	-8(%rcx),%r10
+	movq	%rbp,%r8
+	movq	(%rcx,%rax,1),%r12
+	leaq	(%rcx,%rax,1),%rbp
+	movq	%rax,%rcx
+	leaq	(%rbx,%rax,1),%rdi
+	xorl	%eax,%eax
+	xorq	%r15,%r15
+	subq	%r14,%r10
+	adcq	%r15,%r15
+	orq	%r15,%r8
+	sarq	$3+2,%rcx
+	subq	%r8,%rax
+	movq	56+8(%rsp),%rdx
+	decq	%r12
+	movq	8(%rbp),%r13
+	xorq	%r8,%r8
+	movq	16(%rbp),%r14
+	movq	24(%rbp),%r15
+	jmp	.Lsqrx4x_sub_entry
+.cfi_endproc	
+.size	mulx4x_internal,.-mulx4x_internal
+.type	bn_powerx5,@function
+.align	32
+bn_powerx5:
+.cfi_startproc	
+	movq	%rsp,%rax
+.cfi_def_cfa_register	%rax
+.Lpowerx5_enter:
+	pushq	%rbx
+.cfi_offset	%rbx,-16
+	pushq	%rbp
+.cfi_offset	%rbp,-24
+	pushq	%r12
+.cfi_offset	%r12,-32
+	pushq	%r13
+.cfi_offset	%r13,-40
+	pushq	%r14
+.cfi_offset	%r14,-48
+	pushq	%r15
+.cfi_offset	%r15,-56
+.Lpowerx5_prologue:
+
+	shll	$3,%r9d
+	leaq	(%r9,%r9,2),%r10
+	negq	%r9
+	movq	(%r8),%r8
+
+
+
+
+
+
+
+
+	leaq	-320(%rsp,%r9,2),%r11
+	movq	%rsp,%rbp
+	subq	%rdi,%r11
+	andq	$4095,%r11
+	cmpq	%r11,%r10
+	jb	.Lpwrx_sp_alt
+	subq	%r11,%rbp
+	leaq	-320(%rbp,%r9,2),%rbp
+	jmp	.Lpwrx_sp_done
+
+.align	32
+.Lpwrx_sp_alt:
+	leaq	4096-320(,%r9,2),%r10
+	leaq	-320(%rbp,%r9,2),%rbp
+	subq	%r10,%r11
+	movq	$0,%r10
+	cmovcq	%r10,%r11
+	subq	%r11,%rbp
+.Lpwrx_sp_done:
+	andq	$-64,%rbp
+	movq	%rsp,%r11
+	subq	%rbp,%r11
+	andq	$-4096,%r11
+	leaq	(%r11,%rbp,1),%rsp
+	movq	(%rsp),%r10
+	cmpq	%rbp,%rsp
+	ja	.Lpwrx_page_walk
+	jmp	.Lpwrx_page_walk_done
+
+.Lpwrx_page_walk:
+	leaq	-4096(%rsp),%rsp
+	movq	(%rsp),%r10
+	cmpq	%rbp,%rsp
+	ja	.Lpwrx_page_walk
+.Lpwrx_page_walk_done:
+
+	movq	%r9,%r10
+	negq	%r9
+
+
+
+
+
+
+
+
+
+
+
+
+	pxor	%xmm0,%xmm0
+.byte	102,72,15,110,207
+.byte	102,72,15,110,209
+.byte	102,73,15,110,218
+.byte	102,72,15,110,226
+	movq	%r8,32(%rsp)
+	movq	%rax,40(%rsp)
+.cfi_escape	0x0f,0x05,0x77,0x28,0x06,0x23,0x08
+.Lpowerx5_body:
+
+	call	__bn_sqrx8x_internal
+	call	__bn_postx4x_internal
+	call	__bn_sqrx8x_internal
+	call	__bn_postx4x_internal
+	call	__bn_sqrx8x_internal
+	call	__bn_postx4x_internal
+	call	__bn_sqrx8x_internal
+	call	__bn_postx4x_internal
+	call	__bn_sqrx8x_internal
+	call	__bn_postx4x_internal
+
+	movq	%r10,%r9
+	movq	%rsi,%rdi
+.byte	102,72,15,126,209
+.byte	102,72,15,126,226
+	movq	40(%rsp),%rax
+
+	call	mulx4x_internal
+
+	movq	40(%rsp),%rsi
+.cfi_def_cfa	%rsi,8
+	movq	$1,%rax
+
+	movq	-48(%rsi),%r15
+.cfi_restore	%r15
+	movq	-40(%rsi),%r14
+.cfi_restore	%r14
+	movq	-32(%rsi),%r13
+.cfi_restore	%r13
+	movq	-24(%rsi),%r12
+.cfi_restore	%r12
+	movq	-16(%rsi),%rbp
+.cfi_restore	%rbp
+	movq	-8(%rsi),%rbx
+.cfi_restore	%rbx
+	leaq	(%rsi),%rsp
+.cfi_def_cfa_register	%rsp
+.Lpowerx5_epilogue:
+	.byte	0xf3,0xc3
+.cfi_endproc	
+.size	bn_powerx5,.-bn_powerx5
+
+.globl	bn_sqrx8x_internal
+.hidden bn_sqrx8x_internal
+.hidden	bn_sqrx8x_internal
+.type	bn_sqrx8x_internal,@function
+.align	32
+bn_sqrx8x_internal:
+__bn_sqrx8x_internal:
+.cfi_startproc	
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+	leaq	48+8(%rsp),%rdi
+	leaq	(%rsi,%r9,1),%rbp
+	movq	%r9,0+8(%rsp)
+	movq	%rbp,8+8(%rsp)
+	jmp	.Lsqr8x_zero_start
+
+.align	32
+.byte	0x66,0x66,0x66,0x2e,0x0f,0x1f,0x84,0x00,0x00,0x00,0x00,0x00
+.Lsqrx8x_zero:
+.byte	0x3e
+	movdqa	%xmm0,0(%rdi)
+	movdqa	%xmm0,16(%rdi)
+	movdqa	%xmm0,32(%rdi)
+	movdqa	%xmm0,48(%rdi)
+.Lsqr8x_zero_start:
+	movdqa	%xmm0,64(%rdi)
+	movdqa	%xmm0,80(%rdi)
+	movdqa	%xmm0,96(%rdi)
+	movdqa	%xmm0,112(%rdi)
+	leaq	128(%rdi),%rdi
+	subq	$64,%r9
+	jnz	.Lsqrx8x_zero
+
+	movq	0(%rsi),%rdx
+
+	xorq	%r10,%r10
+	xorq	%r11,%r11
+	xorq	%r12,%r12
+	xorq	%r13,%r13
+	xorq	%r14,%r14
+	xorq	%r15,%r15
+	leaq	48+8(%rsp),%rdi
+	xorq	%rbp,%rbp
+	jmp	.Lsqrx8x_outer_loop
+
+.align	32
+.Lsqrx8x_outer_loop:
+	mulxq	8(%rsi),%r8,%rax
+	adcxq	%r9,%r8
+	adoxq	%rax,%r10
+	mulxq	16(%rsi),%r9,%rax
+	adcxq	%r10,%r9
+	adoxq	%rax,%r11
+.byte	0xc4,0xe2,0xab,0xf6,0x86,0x18,0x00,0x00,0x00
+	adcxq	%r11,%r10
+	adoxq	%rax,%r12
+.byte	0xc4,0xe2,0xa3,0xf6,0x86,0x20,0x00,0x00,0x00
+	adcxq	%r12,%r11
+	adoxq	%rax,%r13
+	mulxq	40(%rsi),%r12,%rax
+	adcxq	%r13,%r12
+	adoxq	%rax,%r14
+	mulxq	48(%rsi),%r13,%rax
+	adcxq	%r14,%r13
+	adoxq	%r15,%rax
+	mulxq	56(%rsi),%r14,%r15
+	movq	8(%rsi),%rdx
+	adcxq	%rax,%r14
+	adoxq	%rbp,%r15
+	adcq	64(%rdi),%r15
+	movq	%r8,8(%rdi)
+	movq	%r9,16(%rdi)
+	sbbq	%rcx,%rcx
+	xorq	%rbp,%rbp
+
+
+	mulxq	16(%rsi),%r8,%rbx
+	mulxq	24(%rsi),%r9,%rax
+	adcxq	%r10,%r8
+	adoxq	%rbx,%r9
+	mulxq	32(%rsi),%r10,%rbx
+	adcxq	%r11,%r9
+	adoxq	%rax,%r10
+.byte	0xc4,0xe2,0xa3,0xf6,0x86,0x28,0x00,0x00,0x00
+	adcxq	%r12,%r10
+	adoxq	%rbx,%r11
+.byte	0xc4,0xe2,0x9b,0xf6,0x9e,0x30,0x00,0x00,0x00
+	adcxq	%r13,%r11
+	adoxq	%r14,%r12
+.byte	0xc4,0x62,0x93,0xf6,0xb6,0x38,0x00,0x00,0x00
+	movq	16(%rsi),%rdx
+	adcxq	%rax,%r12
+	adoxq	%rbx,%r13
+	adcxq	%r15,%r13
+	adoxq	%rbp,%r14
+	adcxq	%rbp,%r14
+
+	movq	%r8,24(%rdi)
+	movq	%r9,32(%rdi)
+
+	mulxq	24(%rsi),%r8,%rbx
+	mulxq	32(%rsi),%r9,%rax
+	adcxq	%r10,%r8
+	adoxq	%rbx,%r9
+	mulxq	40(%rsi),%r10,%rbx
+	adcxq	%r11,%r9
+	adoxq	%rax,%r10
+.byte	0xc4,0xe2,0xa3,0xf6,0x86,0x30,0x00,0x00,0x00
+	adcxq	%r12,%r10
+	adoxq	%r13,%r11
+.byte	0xc4,0x62,0x9b,0xf6,0xae,0x38,0x00,0x00,0x00
+.byte	0x3e
+	movq	24(%rsi),%rdx
+	adcxq	%rbx,%r11
+	adoxq	%rax,%r12
+	adcxq	%r14,%r12
+	movq	%r8,40(%rdi)
+	movq	%r9,48(%rdi)
+	mulxq	32(%rsi),%r8,%rax
+	adoxq	%rbp,%r13
+	adcxq	%rbp,%r13
+
+	mulxq	40(%rsi),%r9,%rbx
+	adcxq	%r10,%r8
+	adoxq	%rax,%r9
+	mulxq	48(%rsi),%r10,%rax
+	adcxq	%r11,%r9
+	adoxq	%r12,%r10
+	mulxq	56(%rsi),%r11,%r12
+	movq	32(%rsi),%rdx
+	movq	40(%rsi),%r14
+	adcxq	%rbx,%r10
+	adoxq	%rax,%r11
+	movq	48(%rsi),%r15
+	adcxq	%r13,%r11
+	adoxq	%rbp,%r12
+	adcxq	%rbp,%r12
+
+	movq	%r8,56(%rdi)
+	movq	%r9,64(%rdi)
+
+	mulxq	%r14,%r9,%rax
+	movq	56(%rsi),%r8
+	adcxq	%r10,%r9
+	mulxq	%r15,%r10,%rbx
+	adoxq	%rax,%r10
+	adcxq	%r11,%r10
+	mulxq	%r8,%r11,%rax
+	movq	%r14,%rdx
+	adoxq	%rbx,%r11
+	adcxq	%r12,%r11
+
+	adcxq	%rbp,%rax
+
+	mulxq	%r15,%r14,%rbx
+	mulxq	%r8,%r12,%r13
+	movq	%r15,%rdx
+	leaq	64(%rsi),%rsi
+	adcxq	%r14,%r11
+	adoxq	%rbx,%r12
+	adcxq	%rax,%r12
+	adoxq	%rbp,%r13
+
+.byte	0x67,0x67
+	mulxq	%r8,%r8,%r14
+	adcxq	%r8,%r13
+	adcxq	%rbp,%r14
+
+	cmpq	8+8(%rsp),%rsi
+	je	.Lsqrx8x_outer_break
+
+	negq	%rcx
+	movq	$-8,%rcx
+	movq	%rbp,%r15
+	movq	64(%rdi),%r8
+	adcxq	72(%rdi),%r9
+	adcxq	80(%rdi),%r10
+	adcxq	88(%rdi),%r11
+	adcq	96(%rdi),%r12
+	adcq	104(%rdi),%r13
+	adcq	112(%rdi),%r14
+	adcq	120(%rdi),%r15
+	leaq	(%rsi),%rbp
+	leaq	128(%rdi),%rdi
+	sbbq	%rax,%rax
+
+	movq	-64(%rsi),%rdx
+	movq	%rax,16+8(%rsp)
+	movq	%rdi,24+8(%rsp)
+
+
+	xorl	%eax,%eax
+	jmp	.Lsqrx8x_loop
+
+.align	32
+.Lsqrx8x_loop:
+	movq	%r8,%rbx
+	mulxq	0(%rbp),%rax,%r8
+	adcxq	%rax,%rbx
+	adoxq	%r9,%r8
+
+	mulxq	8(%rbp),%rax,%r9
+	adcxq	%rax,%r8
+	adoxq	%r10,%r9
+
+	mulxq	16(%rbp),%rax,%r10
+	adcxq	%rax,%r9
+	adoxq	%r11,%r10
+
+	mulxq	24(%rbp),%rax,%r11
+	adcxq	%rax,%r10
+	adoxq	%r12,%r11
+
+.byte	0xc4,0x62,0xfb,0xf6,0xa5,0x20,0x00,0x00,0x00
+	adcxq	%rax,%r11
+	adoxq	%r13,%r12
+
+	mulxq	40(%rbp),%rax,%r13
+	adcxq	%rax,%r12
+	adoxq	%r14,%r13
+
+	mulxq	48(%rbp),%rax,%r14
+	movq	%rbx,(%rdi,%rcx,8)
+	movl	$0,%ebx
+	adcxq	%rax,%r13
+	adoxq	%r15,%r14
+
+.byte	0xc4,0x62,0xfb,0xf6,0xbd,0x38,0x00,0x00,0x00
+	movq	8(%rsi,%rcx,8),%rdx
+	adcxq	%rax,%r14
+	adoxq	%rbx,%r15
+	adcxq	%rbx,%r15
+
+.byte	0x67
+	incq	%rcx
+	jnz	.Lsqrx8x_loop
+
+	leaq	64(%rbp),%rbp
+	movq	$-8,%rcx
+	cmpq	8+8(%rsp),%rbp
+	je	.Lsqrx8x_break
+
+	subq	16+8(%rsp),%rbx
+.byte	0x66
+	movq	-64(%rsi),%rdx
+	adcxq	0(%rdi),%r8
+	adcxq	8(%rdi),%r9
+	adcq	16(%rdi),%r10
+	adcq	24(%rdi),%r11
+	adcq	32(%rdi),%r12
+	adcq	40(%rdi),%r13
+	adcq	48(%rdi),%r14
+	adcq	56(%rdi),%r15
+	leaq	64(%rdi),%rdi
+.byte	0x67
+	sbbq	%rax,%rax
+	xorl	%ebx,%ebx
+	movq	%rax,16+8(%rsp)
+	jmp	.Lsqrx8x_loop
+
+.align	32
+.Lsqrx8x_break:
+	xorq	%rbp,%rbp
+	subq	16+8(%rsp),%rbx
+	adcxq	%rbp,%r8
+	movq	24+8(%rsp),%rcx
+	adcxq	%rbp,%r9
+	movq	0(%rsi),%rdx
+	adcq	$0,%r10
+	movq	%r8,0(%rdi)
+	adcq	$0,%r11
+	adcq	$0,%r12
+	adcq	$0,%r13
+	adcq	$0,%r14
+	adcq	$0,%r15
+	cmpq	%rcx,%rdi
+	je	.Lsqrx8x_outer_loop
+
+	movq	%r9,8(%rdi)
+	movq	8(%rcx),%r9
+	movq	%r10,16(%rdi)
+	movq	16(%rcx),%r10
+	movq	%r11,24(%rdi)
+	movq	24(%rcx),%r11
+	movq	%r12,32(%rdi)
+	movq	32(%rcx),%r12
+	movq	%r13,40(%rdi)
+	movq	40(%rcx),%r13
+	movq	%r14,48(%rdi)
+	movq	48(%rcx),%r14
+	movq	%r15,56(%rdi)
+	movq	56(%rcx),%r15
+	movq	%rcx,%rdi
+	jmp	.Lsqrx8x_outer_loop
+
+.align	32
+.Lsqrx8x_outer_break:
+	movq	%r9,72(%rdi)
+.byte	102,72,15,126,217
+	movq	%r10,80(%rdi)
+	movq	%r11,88(%rdi)
+	movq	%r12,96(%rdi)
+	movq	%r13,104(%rdi)
+	movq	%r14,112(%rdi)
+	leaq	48+8(%rsp),%rdi
+	movq	(%rsi,%rcx,1),%rdx
+
+	movq	8(%rdi),%r11
+	xorq	%r10,%r10
+	movq	0+8(%rsp),%r9
+	adoxq	%r11,%r11
+	movq	16(%rdi),%r12
+	movq	24(%rdi),%r13
+
+
+.align	32
+.Lsqrx4x_shift_n_add:
+	mulxq	%rdx,%rax,%rbx
+	adoxq	%r12,%r12
+	adcxq	%r10,%rax
+.byte	0x48,0x8b,0x94,0x0e,0x08,0x00,0x00,0x00
+.byte	0x4c,0x8b,0x97,0x20,0x00,0x00,0x00
+	adoxq	%r13,%r13
+	adcxq	%r11,%rbx
+	movq	40(%rdi),%r11
+	movq	%rax,0(%rdi)
+	movq	%rbx,8(%rdi)
+
+	mulxq	%rdx,%rax,%rbx
+	adoxq	%r10,%r10
+	adcxq	%r12,%rax
+	movq	16(%rsi,%rcx,1),%rdx
+	movq	48(%rdi),%r12
+	adoxq	%r11,%r11
+	adcxq	%r13,%rbx
+	movq	56(%rdi),%r13
+	movq	%rax,16(%rdi)
+	movq	%rbx,24(%rdi)
+
+	mulxq	%rdx,%rax,%rbx
+	adoxq	%r12,%r12
+	adcxq	%r10,%rax
+	movq	24(%rsi,%rcx,1),%rdx
+	leaq	32(%rcx),%rcx
+	movq	64(%rdi),%r10
+	adoxq	%r13,%r13
+	adcxq	%r11,%rbx
+	movq	72(%rdi),%r11
+	movq	%rax,32(%rdi)
+	movq	%rbx,40(%rdi)
+
+	mulxq	%rdx,%rax,%rbx
+	adoxq	%r10,%r10
+	adcxq	%r12,%rax
+	jrcxz	.Lsqrx4x_shift_n_add_break
+.byte	0x48,0x8b,0x94,0x0e,0x00,0x00,0x00,0x00
+	adoxq	%r11,%r11
+	adcxq	%r13,%rbx
+	movq	80(%rdi),%r12
+	movq	88(%rdi),%r13
+	movq	%rax,48(%rdi)
+	movq	%rbx,56(%rdi)
+	leaq	64(%rdi),%rdi
+	nop
+	jmp	.Lsqrx4x_shift_n_add
+
+.align	32
+.Lsqrx4x_shift_n_add_break:
+	adcxq	%r13,%rbx
+	movq	%rax,48(%rdi)
+	movq	%rbx,56(%rdi)
+	leaq	64(%rdi),%rdi
+.byte	102,72,15,126,213
+__bn_sqrx8x_reduction:
+	xorl	%eax,%eax
+	movq	32+8(%rsp),%rbx
+	movq	48+8(%rsp),%rdx
+	leaq	-64(%rbp,%r9,1),%rcx
+
+	movq	%rcx,0+8(%rsp)
+	movq	%rdi,8+8(%rsp)
+
+	leaq	48+8(%rsp),%rdi
+	jmp	.Lsqrx8x_reduction_loop
+
+.align	32
+.Lsqrx8x_reduction_loop:
+	movq	8(%rdi),%r9
+	movq	16(%rdi),%r10
+	movq	24(%rdi),%r11
+	movq	32(%rdi),%r12
+	movq	%rdx,%r8
+	imulq	%rbx,%rdx
+	movq	40(%rdi),%r13
+	movq	48(%rdi),%r14
+	movq	56(%rdi),%r15
+	movq	%rax,24+8(%rsp)
+
+	leaq	64(%rdi),%rdi
+	xorq	%rsi,%rsi
+	movq	$-8,%rcx
+	jmp	.Lsqrx8x_reduce
+
+.align	32
+.Lsqrx8x_reduce:
+	movq	%r8,%rbx
+	mulxq	0(%rbp),%rax,%r8
+	adcxq	%rbx,%rax
+	adoxq	%r9,%r8
+
+	mulxq	8(%rbp),%rbx,%r9
+	adcxq	%rbx,%r8
+	adoxq	%r10,%r9
+
+	mulxq	16(%rbp),%rbx,%r10
+	adcxq	%rbx,%r9
+	adoxq	%r11,%r10
+
+	mulxq	24(%rbp),%rbx,%r11
+	adcxq	%rbx,%r10
+	adoxq	%r12,%r11
+
+.byte	0xc4,0x62,0xe3,0xf6,0xa5,0x20,0x00,0x00,0x00
+	movq	%rdx,%rax
+	movq	%r8,%rdx
+	adcxq	%rbx,%r11
+	adoxq	%r13,%r12
+
+	mulxq	32+8(%rsp),%rbx,%rdx
+	movq	%rax,%rdx
+	movq	%rax,64+48+8(%rsp,%rcx,8)
+
+	mulxq	40(%rbp),%rax,%r13
+	adcxq	%rax,%r12
+	adoxq	%r14,%r13
+
+	mulxq	48(%rbp),%rax,%r14
+	adcxq	%rax,%r13
+	adoxq	%r15,%r14
+
+	mulxq	56(%rbp),%rax,%r15
+	movq	%rbx,%rdx
+	adcxq	%rax,%r14
+	adoxq	%rsi,%r15
+	adcxq	%rsi,%r15
+
+.byte	0x67,0x67,0x67
+	incq	%rcx
+	jnz	.Lsqrx8x_reduce
+
+	movq	%rsi,%rax
+	cmpq	0+8(%rsp),%rbp
+	jae	.Lsqrx8x_no_tail
+
+	movq	48+8(%rsp),%rdx
+	addq	0(%rdi),%r8
+	leaq	64(%rbp),%rbp
+	movq	$-8,%rcx
+	adcxq	8(%rdi),%r9
+	adcxq	16(%rdi),%r10
+	adcq	24(%rdi),%r11
+	adcq	32(%rdi),%r12
+	adcq	40(%rdi),%r13
+	adcq	48(%rdi),%r14
+	adcq	56(%rdi),%r15
+	leaq	64(%rdi),%rdi
+	sbbq	%rax,%rax
+
+	xorq	%rsi,%rsi
+	movq	%rax,16+8(%rsp)
+	jmp	.Lsqrx8x_tail
+
+.align	32
+.Lsqrx8x_tail:
+	movq	%r8,%rbx
+	mulxq	0(%rbp),%rax,%r8
+	adcxq	%rax,%rbx
+	adoxq	%r9,%r8
+
+	mulxq	8(%rbp),%rax,%r9
+	adcxq	%rax,%r8
+	adoxq	%r10,%r9
+
+	mulxq	16(%rbp),%rax,%r10
+	adcxq	%rax,%r9
+	adoxq	%r11,%r10
+
+	mulxq	24(%rbp),%rax,%r11
+	adcxq	%rax,%r10
+	adoxq	%r12,%r11
+
+.byte	0xc4,0x62,0xfb,0xf6,0xa5,0x20,0x00,0x00,0x00
+	adcxq	%rax,%r11
+	adoxq	%r13,%r12
+
+	mulxq	40(%rbp),%rax,%r13
+	adcxq	%rax,%r12
+	adoxq	%r14,%r13
+
+	mulxq	48(%rbp),%rax,%r14
+	adcxq	%rax,%r13
+	adoxq	%r15,%r14
+
+	mulxq	56(%rbp),%rax,%r15
+	movq	72+48+8(%rsp,%rcx,8),%rdx
+	adcxq	%rax,%r14
+	adoxq	%rsi,%r15
+	movq	%rbx,(%rdi,%rcx,8)
+	movq	%r8,%rbx
+	adcxq	%rsi,%r15
+
+	incq	%rcx
+	jnz	.Lsqrx8x_tail
+
+	cmpq	0+8(%rsp),%rbp
+	jae	.Lsqrx8x_tail_done
+
+	subq	16+8(%rsp),%rsi
+	movq	48+8(%rsp),%rdx
+	leaq	64(%rbp),%rbp
+	adcq	0(%rdi),%r8
+	adcq	8(%rdi),%r9
+	adcq	16(%rdi),%r10
+	adcq	24(%rdi),%r11
+	adcq	32(%rdi),%r12
+	adcq	40(%rdi),%r13
+	adcq	48(%rdi),%r14
+	adcq	56(%rdi),%r15
+	leaq	64(%rdi),%rdi
+	sbbq	%rax,%rax
+	subq	$8,%rcx
+
+	xorq	%rsi,%rsi
+	movq	%rax,16+8(%rsp)
+	jmp	.Lsqrx8x_tail
+
+.align	32
+.Lsqrx8x_tail_done:
+	xorq	%rax,%rax
+	addq	24+8(%rsp),%r8
+	adcq	$0,%r9
+	adcq	$0,%r10
+	adcq	$0,%r11
+	adcq	$0,%r12
+	adcq	$0,%r13
+	adcq	$0,%r14
+	adcq	$0,%r15
+	adcq	$0,%rax
+
+	subq	16+8(%rsp),%rsi
+.Lsqrx8x_no_tail:
+	adcq	0(%rdi),%r8
+.byte	102,72,15,126,217
+	adcq	8(%rdi),%r9
+	movq	56(%rbp),%rsi
+.byte	102,72,15,126,213
+	adcq	16(%rdi),%r10
+	adcq	24(%rdi),%r11
+	adcq	32(%rdi),%r12
+	adcq	40(%rdi),%r13
+	adcq	48(%rdi),%r14
+	adcq	56(%rdi),%r15
+	adcq	$0,%rax
+
+	movq	32+8(%rsp),%rbx
+	movq	64(%rdi,%rcx,1),%rdx
+
+	movq	%r8,0(%rdi)
+	leaq	64(%rdi),%r8
+	movq	%r9,8(%rdi)
+	movq	%r10,16(%rdi)
+	movq	%r11,24(%rdi)
+	movq	%r12,32(%rdi)
+	movq	%r13,40(%rdi)
+	movq	%r14,48(%rdi)
+	movq	%r15,56(%rdi)
+
+	leaq	64(%rdi,%rcx,1),%rdi
+	cmpq	8+8(%rsp),%r8
+	jb	.Lsqrx8x_reduction_loop
+	.byte	0xf3,0xc3
+.cfi_endproc	
+.size	bn_sqrx8x_internal,.-bn_sqrx8x_internal
+.align	32
+.type	__bn_postx4x_internal,@function
+__bn_postx4x_internal:
+.cfi_startproc	
+	movq	0(%rbp),%r12
+	movq	%rcx,%r10
+	movq	%rcx,%r9
+	negq	%rax
+	sarq	$3+2,%rcx
+
+.byte	102,72,15,126,202
+.byte	102,72,15,126,206
+	decq	%r12
+	movq	8(%rbp),%r13
+	xorq	%r8,%r8
+	movq	16(%rbp),%r14
+	movq	24(%rbp),%r15
+	jmp	.Lsqrx4x_sub_entry
+
+.align	16
+.Lsqrx4x_sub:
+	movq	0(%rbp),%r12
+	movq	8(%rbp),%r13
+	movq	16(%rbp),%r14
+	movq	24(%rbp),%r15
+.Lsqrx4x_sub_entry:
+	andnq	%rax,%r12,%r12
+	leaq	32(%rbp),%rbp
+	andnq	%rax,%r13,%r13
+	andnq	%rax,%r14,%r14
+	andnq	%rax,%r15,%r15
+
+	negq	%r8
+	adcq	0(%rdi),%r12
+	adcq	8(%rdi),%r13
+	adcq	16(%rdi),%r14
+	adcq	24(%rdi),%r15
+	movq	%r12,0(%rdx)
+	leaq	32(%rdi),%rdi
+	movq	%r13,8(%rdx)
+	sbbq	%r8,%r8
+	movq	%r14,16(%rdx)
+	movq	%r15,24(%rdx)
+	leaq	32(%rdx),%rdx
+
+	incq	%rcx
+	jnz	.Lsqrx4x_sub
+
+	negq	%r9
+
+	.byte	0xf3,0xc3
+.cfi_endproc	
+.size	__bn_postx4x_internal,.-__bn_postx4x_internal
+.globl	bn_scatter5
+.hidden bn_scatter5
+.type	bn_scatter5,@function
+.align	16
+bn_scatter5:
+.cfi_startproc	
+	cmpl	$0,%esi
+	jz	.Lscatter_epilogue
+	leaq	(%rdx,%rcx,8),%rdx
+.Lscatter:
+	movq	(%rdi),%rax
+	leaq	8(%rdi),%rdi
+	movq	%rax,(%rdx)
+	leaq	256(%rdx),%rdx
+	subl	$1,%esi
+	jnz	.Lscatter
+.Lscatter_epilogue:
+	.byte	0xf3,0xc3
+.cfi_endproc	
+.size	bn_scatter5,.-bn_scatter5
+
+.globl	bn_gather5
+.hidden bn_gather5
+.type	bn_gather5,@function
+.align	32
+bn_gather5:
+.cfi_startproc	
+.LSEH_begin_bn_gather5:
+
+.byte	0x4c,0x8d,0x14,0x24
+.cfi_def_cfa_register	%r10
+.byte	0x48,0x81,0xec,0x08,0x01,0x00,0x00
+	leaq	.Linc(%rip),%rax
+	andq	$-16,%rsp
+
+	movd	%ecx,%xmm5
+	movdqa	0(%rax),%xmm0
+	movdqa	16(%rax),%xmm1
+	leaq	128(%rdx),%r11
+	leaq	128(%rsp),%rax
+
+	pshufd	$0,%xmm5,%xmm5
+	movdqa	%xmm1,%xmm4
+	movdqa	%xmm1,%xmm2
+	paddd	%xmm0,%xmm1
+	pcmpeqd	%xmm5,%xmm0
+	movdqa	%xmm4,%xmm3
+
+	paddd	%xmm1,%xmm2
+	pcmpeqd	%xmm5,%xmm1
+	movdqa	%xmm0,-128(%rax)
+	movdqa	%xmm4,%xmm0
+
+	paddd	%xmm2,%xmm3
+	pcmpeqd	%xmm5,%xmm2
+	movdqa	%xmm1,-112(%rax)
+	movdqa	%xmm4,%xmm1
+
+	paddd	%xmm3,%xmm0
+	pcmpeqd	%xmm5,%xmm3
+	movdqa	%xmm2,-96(%rax)
+	movdqa	%xmm4,%xmm2
+	paddd	%xmm0,%xmm1
+	pcmpeqd	%xmm5,%xmm0
+	movdqa	%xmm3,-80(%rax)
+	movdqa	%xmm4,%xmm3
+
+	paddd	%xmm1,%xmm2
+	pcmpeqd	%xmm5,%xmm1
+	movdqa	%xmm0,-64(%rax)
+	movdqa	%xmm4,%xmm0
+
+	paddd	%xmm2,%xmm3
+	pcmpeqd	%xmm5,%xmm2
+	movdqa	%xmm1,-48(%rax)
+	movdqa	%xmm4,%xmm1
+
+	paddd	%xmm3,%xmm0
+	pcmpeqd	%xmm5,%xmm3
+	movdqa	%xmm2,-32(%rax)
+	movdqa	%xmm4,%xmm2
+	paddd	%xmm0,%xmm1
+	pcmpeqd	%xmm5,%xmm0
+	movdqa	%xmm3,-16(%rax)
+	movdqa	%xmm4,%xmm3
+
+	paddd	%xmm1,%xmm2
+	pcmpeqd	%xmm5,%xmm1
+	movdqa	%xmm0,0(%rax)
+	movdqa	%xmm4,%xmm0
+
+	paddd	%xmm2,%xmm3
+	pcmpeqd	%xmm5,%xmm2
+	movdqa	%xmm1,16(%rax)
+	movdqa	%xmm4,%xmm1
+
+	paddd	%xmm3,%xmm0
+	pcmpeqd	%xmm5,%xmm3
+	movdqa	%xmm2,32(%rax)
+	movdqa	%xmm4,%xmm2
+	paddd	%xmm0,%xmm1
+	pcmpeqd	%xmm5,%xmm0
+	movdqa	%xmm3,48(%rax)
+	movdqa	%xmm4,%xmm3
+
+	paddd	%xmm1,%xmm2
+	pcmpeqd	%xmm5,%xmm1
+	movdqa	%xmm0,64(%rax)
+	movdqa	%xmm4,%xmm0
+
+	paddd	%xmm2,%xmm3
+	pcmpeqd	%xmm5,%xmm2
+	movdqa	%xmm1,80(%rax)
+	movdqa	%xmm4,%xmm1
+
+	paddd	%xmm3,%xmm0
+	pcmpeqd	%xmm5,%xmm3
+	movdqa	%xmm2,96(%rax)
+	movdqa	%xmm4,%xmm2
+	movdqa	%xmm3,112(%rax)
+	jmp	.Lgather
+
+.align	32
+.Lgather:
+	pxor	%xmm4,%xmm4
+	pxor	%xmm5,%xmm5
+	movdqa	-128(%r11),%xmm0
+	movdqa	-112(%r11),%xmm1
+	movdqa	-96(%r11),%xmm2
+	pand	-128(%rax),%xmm0
+	movdqa	-80(%r11),%xmm3
+	pand	-112(%rax),%xmm1
+	por	%xmm0,%xmm4
+	pand	-96(%rax),%xmm2
+	por	%xmm1,%xmm5
+	pand	-80(%rax),%xmm3
+	por	%xmm2,%xmm4
+	por	%xmm3,%xmm5
+	movdqa	-64(%r11),%xmm0
+	movdqa	-48(%r11),%xmm1
+	movdqa	-32(%r11),%xmm2
+	pand	-64(%rax),%xmm0
+	movdqa	-16(%r11),%xmm3
+	pand	-48(%rax),%xmm1
+	por	%xmm0,%xmm4
+	pand	-32(%rax),%xmm2
+	por	%xmm1,%xmm5
+	pand	-16(%rax),%xmm3
+	por	%xmm2,%xmm4
+	por	%xmm3,%xmm5
+	movdqa	0(%r11),%xmm0
+	movdqa	16(%r11),%xmm1
+	movdqa	32(%r11),%xmm2
+	pand	0(%rax),%xmm0
+	movdqa	48(%r11),%xmm3
+	pand	16(%rax),%xmm1
+	por	%xmm0,%xmm4
+	pand	32(%rax),%xmm2
+	por	%xmm1,%xmm5
+	pand	48(%rax),%xmm3
+	por	%xmm2,%xmm4
+	por	%xmm3,%xmm5
+	movdqa	64(%r11),%xmm0
+	movdqa	80(%r11),%xmm1
+	movdqa	96(%r11),%xmm2
+	pand	64(%rax),%xmm0
+	movdqa	112(%r11),%xmm3
+	pand	80(%rax),%xmm1
+	por	%xmm0,%xmm4
+	pand	96(%rax),%xmm2
+	por	%xmm1,%xmm5
+	pand	112(%rax),%xmm3
+	por	%xmm2,%xmm4
+	por	%xmm3,%xmm5
+	por	%xmm5,%xmm4
+	leaq	256(%r11),%r11
+	pshufd	$0x4e,%xmm4,%xmm0
+	por	%xmm4,%xmm0
+	movq	%xmm0,(%rdi)
+	leaq	8(%rdi),%rdi
+	subl	$1,%esi
+	jnz	.Lgather
+
+	leaq	(%r10),%rsp
+.cfi_def_cfa_register	%rsp
+	.byte	0xf3,0xc3
+.LSEH_end_bn_gather5:
+.cfi_endproc	
+.size	bn_gather5,.-bn_gather5
+.align	64
+.Linc:
+.long	0,0, 1,1
+.long	2,2, 2,2
+.byte	77,111,110,116,103,111,109,101,114,121,32,77,117,108,116,105,112,108,105,99,97,116,105,111,110,32,119,105,116,104,32,115,99,97,116,116,101,114,47,103,97,116,104,101,114,32,102,111,114,32,120,56,54,95,54,52,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0
+#endif
+.section	.note.GNU-stack,"",@progbits
--- /dev/null
+++ b/third_party/boringssl/linux-x86_64/crypto/test/trampoline-x86_64.S
@@ -1,0 +1,518 @@
+// This file is generated from a similarly-named Perl script in the BoringSSL
+// source tree. Do not edit by hand.
+
+#if defined(__has_feature)
+#if __has_feature(memory_sanitizer) && !defined(OPENSSL_NO_ASM)
+#define OPENSSL_NO_ASM
+#endif
+#endif
+
+#if defined(__x86_64__) && !defined(OPENSSL_NO_ASM)
+#if defined(BORINGSSL_PREFIX)
+#include <boringssl_prefix_symbols_asm.h>
+#endif
+.text	
+
+
+
+
+
+
+
+
+.type	abi_test_trampoline, @function
+.globl	abi_test_trampoline
+.hidden abi_test_trampoline
+.align	16
+abi_test_trampoline:
+.Labi_test_trampoline_seh_begin:
+.cfi_startproc	
+
+
+
+
+
+
+
+
+
+	subq	$120,%rsp
+.cfi_adjust_cfa_offset	120
+.Labi_test_trampoline_seh_prolog_alloc:
+	movq	%r8,48(%rsp)
+	movq	%rbx,64(%rsp)
+.cfi_offset	rbx, -64
+.Labi_test_trampoline_seh_prolog_rbx:
+	movq	%rbp,72(%rsp)
+.cfi_offset	rbp, -56
+.Labi_test_trampoline_seh_prolog_rbp:
+	movq	%r12,80(%rsp)
+.cfi_offset	r12, -48
+.Labi_test_trampoline_seh_prolog_r12:
+	movq	%r13,88(%rsp)
+.cfi_offset	r13, -40
+.Labi_test_trampoline_seh_prolog_r13:
+	movq	%r14,96(%rsp)
+.cfi_offset	r14, -32
+.Labi_test_trampoline_seh_prolog_r14:
+	movq	%r15,104(%rsp)
+.cfi_offset	r15, -24
+.Labi_test_trampoline_seh_prolog_r15:
+.Labi_test_trampoline_seh_prolog_end:
+	movq	0(%rsi),%rbx
+	movq	8(%rsi),%rbp
+	movq	16(%rsi),%r12
+	movq	24(%rsi),%r13
+	movq	32(%rsi),%r14
+	movq	40(%rsi),%r15
+
+	movq	%rdi,32(%rsp)
+	movq	%rsi,40(%rsp)
+
+
+
+
+	movq	%rdx,%r10
+	movq	%rcx,%r11
+	decq	%r11
+	js	.Largs_done
+	movq	(%r10),%rdi
+	addq	$8,%r10
+	decq	%r11
+	js	.Largs_done
+	movq	(%r10),%rsi
+	addq	$8,%r10
+	decq	%r11
+	js	.Largs_done
+	movq	(%r10),%rdx
+	addq	$8,%r10
+	decq	%r11
+	js	.Largs_done
+	movq	(%r10),%rcx
+	addq	$8,%r10
+	decq	%r11
+	js	.Largs_done
+	movq	(%r10),%r8
+	addq	$8,%r10
+	decq	%r11
+	js	.Largs_done
+	movq	(%r10),%r9
+	addq	$8,%r10
+	leaq	0(%rsp),%rax
+.Largs_loop:
+	decq	%r11
+	js	.Largs_done
+
+
+
+
+
+
+	movq	%r11,56(%rsp)
+	movq	(%r10),%r11
+	movq	%r11,(%rax)
+	movq	56(%rsp),%r11
+
+	addq	$8,%r10
+	addq	$8,%rax
+	jmp	.Largs_loop
+
+.Largs_done:
+	movq	32(%rsp),%rax
+	movq	48(%rsp),%r10
+	testq	%r10,%r10
+	jz	.Lno_unwind
+
+
+	pushfq
+	orq	$0x100,0(%rsp)
+	popfq
+
+
+
+	nop
+.globl	abi_test_unwind_start
+.hidden abi_test_unwind_start
+abi_test_unwind_start:
+
+	call	*%rax
+.globl	abi_test_unwind_return
+.hidden abi_test_unwind_return
+abi_test_unwind_return:
+
+
+
+
+	pushfq
+	andq	$-0x101,0(%rsp)
+	popfq
+.globl	abi_test_unwind_stop
+.hidden abi_test_unwind_stop
+abi_test_unwind_stop:
+
+	jmp	.Lcall_done
+
+.Lno_unwind:
+	call	*%rax
+
+.Lcall_done:
+
+	movq	40(%rsp),%rsi
+	movq	%rbx,0(%rsi)
+	movq	%rbp,8(%rsi)
+	movq	%r12,16(%rsi)
+	movq	%r13,24(%rsi)
+	movq	%r14,32(%rsi)
+	movq	%r15,40(%rsi)
+	movq	64(%rsp),%rbx
+.cfi_restore	rbx
+	movq	72(%rsp),%rbp
+.cfi_restore	rbp
+	movq	80(%rsp),%r12
+.cfi_restore	r12
+	movq	88(%rsp),%r13
+.cfi_restore	r13
+	movq	96(%rsp),%r14
+.cfi_restore	r14
+	movq	104(%rsp),%r15
+.cfi_restore	r15
+	addq	$120,%rsp
+.cfi_adjust_cfa_offset	-120
+
+
+	.byte	0xf3,0xc3
+.cfi_endproc	
+.Labi_test_trampoline_seh_end:
+.size	abi_test_trampoline,.-abi_test_trampoline
+.type	abi_test_clobber_rax, @function
+.globl	abi_test_clobber_rax
+.hidden abi_test_clobber_rax
+.align	16
+abi_test_clobber_rax:
+	xorq	%rax,%rax
+	.byte	0xf3,0xc3
+.size	abi_test_clobber_rax,.-abi_test_clobber_rax
+.type	abi_test_clobber_rbx, @function
+.globl	abi_test_clobber_rbx
+.hidden abi_test_clobber_rbx
+.align	16
+abi_test_clobber_rbx:
+	xorq	%rbx,%rbx
+	.byte	0xf3,0xc3
+.size	abi_test_clobber_rbx,.-abi_test_clobber_rbx
+.type	abi_test_clobber_rcx, @function
+.globl	abi_test_clobber_rcx
+.hidden abi_test_clobber_rcx
+.align	16
+abi_test_clobber_rcx:
+	xorq	%rcx,%rcx
+	.byte	0xf3,0xc3
+.size	abi_test_clobber_rcx,.-abi_test_clobber_rcx
+.type	abi_test_clobber_rdx, @function
+.globl	abi_test_clobber_rdx
+.hidden abi_test_clobber_rdx
+.align	16
+abi_test_clobber_rdx:
+	xorq	%rdx,%rdx
+	.byte	0xf3,0xc3
+.size	abi_test_clobber_rdx,.-abi_test_clobber_rdx
+.type	abi_test_clobber_rdi, @function
+.globl	abi_test_clobber_rdi
+.hidden abi_test_clobber_rdi
+.align	16
+abi_test_clobber_rdi:
+	xorq	%rdi,%rdi
+	.byte	0xf3,0xc3
+.size	abi_test_clobber_rdi,.-abi_test_clobber_rdi
+.type	abi_test_clobber_rsi, @function
+.globl	abi_test_clobber_rsi
+.hidden abi_test_clobber_rsi
+.align	16
+abi_test_clobber_rsi:
+	xorq	%rsi,%rsi
+	.byte	0xf3,0xc3
+.size	abi_test_clobber_rsi,.-abi_test_clobber_rsi
+.type	abi_test_clobber_rbp, @function
+.globl	abi_test_clobber_rbp
+.hidden abi_test_clobber_rbp
+.align	16
+abi_test_clobber_rbp:
+	xorq	%rbp,%rbp
+	.byte	0xf3,0xc3
+.size	abi_test_clobber_rbp,.-abi_test_clobber_rbp
+.type	abi_test_clobber_r8, @function
+.globl	abi_test_clobber_r8
+.hidden abi_test_clobber_r8
+.align	16
+abi_test_clobber_r8:
+	xorq	%r8,%r8
+	.byte	0xf3,0xc3
+.size	abi_test_clobber_r8,.-abi_test_clobber_r8
+.type	abi_test_clobber_r9, @function
+.globl	abi_test_clobber_r9
+.hidden abi_test_clobber_r9
+.align	16
+abi_test_clobber_r9:
+	xorq	%r9,%r9
+	.byte	0xf3,0xc3
+.size	abi_test_clobber_r9,.-abi_test_clobber_r9
+.type	abi_test_clobber_r10, @function
+.globl	abi_test_clobber_r10
+.hidden abi_test_clobber_r10
+.align	16
+abi_test_clobber_r10:
+	xorq	%r10,%r10
+	.byte	0xf3,0xc3
+.size	abi_test_clobber_r10,.-abi_test_clobber_r10
+.type	abi_test_clobber_r11, @function
+.globl	abi_test_clobber_r11
+.hidden abi_test_clobber_r11
+.align	16
+abi_test_clobber_r11:
+	xorq	%r11,%r11
+	.byte	0xf3,0xc3
+.size	abi_test_clobber_r11,.-abi_test_clobber_r11
+.type	abi_test_clobber_r12, @function
+.globl	abi_test_clobber_r12
+.hidden abi_test_clobber_r12
+.align	16
+abi_test_clobber_r12:
+	xorq	%r12,%r12
+	.byte	0xf3,0xc3
+.size	abi_test_clobber_r12,.-abi_test_clobber_r12
+.type	abi_test_clobber_r13, @function
+.globl	abi_test_clobber_r13
+.hidden abi_test_clobber_r13
+.align	16
+abi_test_clobber_r13:
+	xorq	%r13,%r13
+	.byte	0xf3,0xc3
+.size	abi_test_clobber_r13,.-abi_test_clobber_r13
+.type	abi_test_clobber_r14, @function
+.globl	abi_test_clobber_r14
+.hidden abi_test_clobber_r14
+.align	16
+abi_test_clobber_r14:
+	xorq	%r14,%r14
+	.byte	0xf3,0xc3
+.size	abi_test_clobber_r14,.-abi_test_clobber_r14
+.type	abi_test_clobber_r15, @function
+.globl	abi_test_clobber_r15
+.hidden abi_test_clobber_r15
+.align	16
+abi_test_clobber_r15:
+	xorq	%r15,%r15
+	.byte	0xf3,0xc3
+.size	abi_test_clobber_r15,.-abi_test_clobber_r15
+.type	abi_test_clobber_xmm0, @function
+.globl	abi_test_clobber_xmm0
+.hidden abi_test_clobber_xmm0
+.align	16
+abi_test_clobber_xmm0:
+	pxor	%xmm0,%xmm0
+	.byte	0xf3,0xc3
+.size	abi_test_clobber_xmm0,.-abi_test_clobber_xmm0
+.type	abi_test_clobber_xmm1, @function
+.globl	abi_test_clobber_xmm1
+.hidden abi_test_clobber_xmm1
+.align	16
+abi_test_clobber_xmm1:
+	pxor	%xmm1,%xmm1
+	.byte	0xf3,0xc3
+.size	abi_test_clobber_xmm1,.-abi_test_clobber_xmm1
+.type	abi_test_clobber_xmm2, @function
+.globl	abi_test_clobber_xmm2
+.hidden abi_test_clobber_xmm2
+.align	16
+abi_test_clobber_xmm2:
+	pxor	%xmm2,%xmm2
+	.byte	0xf3,0xc3
+.size	abi_test_clobber_xmm2,.-abi_test_clobber_xmm2
+.type	abi_test_clobber_xmm3, @function
+.globl	abi_test_clobber_xmm3
+.hidden abi_test_clobber_xmm3
+.align	16
+abi_test_clobber_xmm3:
+	pxor	%xmm3,%xmm3
+	.byte	0xf3,0xc3
+.size	abi_test_clobber_xmm3,.-abi_test_clobber_xmm3
+.type	abi_test_clobber_xmm4, @function
+.globl	abi_test_clobber_xmm4
+.hidden abi_test_clobber_xmm4
+.align	16
+abi_test_clobber_xmm4:
+	pxor	%xmm4,%xmm4
+	.byte	0xf3,0xc3
+.size	abi_test_clobber_xmm4,.-abi_test_clobber_xmm4
+.type	abi_test_clobber_xmm5, @function
+.globl	abi_test_clobber_xmm5
+.hidden abi_test_clobber_xmm5
+.align	16
+abi_test_clobber_xmm5:
+	pxor	%xmm5,%xmm5
+	.byte	0xf3,0xc3
+.size	abi_test_clobber_xmm5,.-abi_test_clobber_xmm5
+.type	abi_test_clobber_xmm6, @function
+.globl	abi_test_clobber_xmm6
+.hidden abi_test_clobber_xmm6
+.align	16
+abi_test_clobber_xmm6:
+	pxor	%xmm6,%xmm6
+	.byte	0xf3,0xc3
+.size	abi_test_clobber_xmm6,.-abi_test_clobber_xmm6
+.type	abi_test_clobber_xmm7, @function
+.globl	abi_test_clobber_xmm7
+.hidden abi_test_clobber_xmm7
+.align	16
+abi_test_clobber_xmm7:
+	pxor	%xmm7,%xmm7
+	.byte	0xf3,0xc3
+.size	abi_test_clobber_xmm7,.-abi_test_clobber_xmm7
+.type	abi_test_clobber_xmm8, @function
+.globl	abi_test_clobber_xmm8
+.hidden abi_test_clobber_xmm8
+.align	16
+abi_test_clobber_xmm8:
+	pxor	%xmm8,%xmm8
+	.byte	0xf3,0xc3
+.size	abi_test_clobber_xmm8,.-abi_test_clobber_xmm8
+.type	abi_test_clobber_xmm9, @function
+.globl	abi_test_clobber_xmm9
+.hidden abi_test_clobber_xmm9
+.align	16
+abi_test_clobber_xmm9:
+	pxor	%xmm9,%xmm9
+	.byte	0xf3,0xc3
+.size	abi_test_clobber_xmm9,.-abi_test_clobber_xmm9
+.type	abi_test_clobber_xmm10, @function
+.globl	abi_test_clobber_xmm10
+.hidden abi_test_clobber_xmm10
+.align	16
+abi_test_clobber_xmm10:
+	pxor	%xmm10,%xmm10
+	.byte	0xf3,0xc3
+.size	abi_test_clobber_xmm10,.-abi_test_clobber_xmm10
+.type	abi_test_clobber_xmm11, @function
+.globl	abi_test_clobber_xmm11
+.hidden abi_test_clobber_xmm11
+.align	16
+abi_test_clobber_xmm11:
+	pxor	%xmm11,%xmm11
+	.byte	0xf3,0xc3
+.size	abi_test_clobber_xmm11,.-abi_test_clobber_xmm11
+.type	abi_test_clobber_xmm12, @function
+.globl	abi_test_clobber_xmm12
+.hidden abi_test_clobber_xmm12
+.align	16
+abi_test_clobber_xmm12:
+	pxor	%xmm12,%xmm12
+	.byte	0xf3,0xc3
+.size	abi_test_clobber_xmm12,.-abi_test_clobber_xmm12
+.type	abi_test_clobber_xmm13, @function
+.globl	abi_test_clobber_xmm13
+.hidden abi_test_clobber_xmm13
+.align	16
+abi_test_clobber_xmm13:
+	pxor	%xmm13,%xmm13
+	.byte	0xf3,0xc3
+.size	abi_test_clobber_xmm13,.-abi_test_clobber_xmm13
+.type	abi_test_clobber_xmm14, @function
+.globl	abi_test_clobber_xmm14
+.hidden abi_test_clobber_xmm14
+.align	16
+abi_test_clobber_xmm14:
+	pxor	%xmm14,%xmm14
+	.byte	0xf3,0xc3
+.size	abi_test_clobber_xmm14,.-abi_test_clobber_xmm14
+.type	abi_test_clobber_xmm15, @function
+.globl	abi_test_clobber_xmm15
+.hidden abi_test_clobber_xmm15
+.align	16
+abi_test_clobber_xmm15:
+	pxor	%xmm15,%xmm15
+	.byte	0xf3,0xc3
+.size	abi_test_clobber_xmm15,.-abi_test_clobber_xmm15
+
+
+
+.type	abi_test_bad_unwind_wrong_register, @function
+.globl	abi_test_bad_unwind_wrong_register
+.hidden abi_test_bad_unwind_wrong_register
+.align	16
+abi_test_bad_unwind_wrong_register:
+.cfi_startproc	
+.Labi_test_bad_unwind_wrong_register_seh_begin:
+	pushq	%r12
+.cfi_adjust_cfa_offset	8
+.cfi_offset	%r13,-16
+.Labi_test_bad_unwind_wrong_register_seh_push_r13:
+
+
+
+	nop
+	popq	%r12
+.cfi_adjust_cfa_offset	-8
+.cfi_restore	%r12
+	.byte	0xf3,0xc3
+.Labi_test_bad_unwind_wrong_register_seh_end:
+.cfi_endproc	
+.size	abi_test_bad_unwind_wrong_register,.-abi_test_bad_unwind_wrong_register
+
+
+
+
+.type	abi_test_bad_unwind_temporary, @function
+.globl	abi_test_bad_unwind_temporary
+.hidden abi_test_bad_unwind_temporary
+.align	16
+abi_test_bad_unwind_temporary:
+.cfi_startproc	
+.Labi_test_bad_unwind_temporary_seh_begin:
+	pushq	%r12
+.cfi_adjust_cfa_offset	8
+.cfi_offset	%r12,-16
+.Labi_test_bad_unwind_temporary_seh_push_r12:
+
+	movq	%r12,%rax
+	incq	%rax
+	movq	%rax,(%rsp)
+
+
+
+	movq	%r12,(%rsp)
+
+
+	popq	%r12
+.cfi_adjust_cfa_offset	-8
+.cfi_restore	%r12
+	.byte	0xf3,0xc3
+.Labi_test_bad_unwind_temporary_seh_end:
+.cfi_endproc	
+.size	abi_test_bad_unwind_temporary,.-abi_test_bad_unwind_temporary
+
+
+
+
+.type	abi_test_set_direction_flag, @function
+.globl	abi_test_get_and_clear_direction_flag
+.hidden abi_test_get_and_clear_direction_flag
+abi_test_get_and_clear_direction_flag:
+	pushfq
+	popq	%rax
+	andq	$0x400,%rax
+	shrq	$10,%rax
+	cld
+	.byte	0xf3,0xc3
+.size	abi_test_get_and_clear_direction_flag,.-abi_test_get_and_clear_direction_flag
+
+
+
+.type	abi_test_set_direction_flag, @function
+.globl	abi_test_set_direction_flag
+.hidden abi_test_set_direction_flag
+abi_test_set_direction_flag:
+	std
+	.byte	0xf3,0xc3
+.size	abi_test_set_direction_flag,.-abi_test_set_direction_flag
+#endif
+.section	.note.GNU-stack,"",@progbits