From e2c377effd9a4eca0291c217e9c3256b033511dc Mon Sep 17 00:00:00 2001 From: Jeffrey Walton Date: Thu, 17 Aug 2017 12:33:43 -0400 Subject: [PATCH] Split source files to support Base Implementation + SIMD implementation (GH #461) Split source files to support Base Implementation + SIMD implementation --- .appveyor.yml | 2 +- Filelist.txt | 8 + GNUmakefile | 121 +- TestScripts/change-version.sh | 0 TestScripts/cryptest.sh | 283 ++- aria-simd.cpp | 143 ++ aria.cpp | 444 +--- ariatab.cpp | 168 ++ authenc.cpp | 2 +- bench1.cpp | 8 +- blake2-simd.cpp | 2182 ++++++++++++++++++++ blake2.cpp | 3617 +-------------------------------- config.h | 223 +- cpu.cpp | 648 ++---- cpu.h | 171 +- crc-simd.cpp | 156 ++ crc.cpp | 75 +- cryptdll.vcxproj | 9 + cryptdll.vcxproj.filters | 9 + cryptest.nmake | 9 +- cryptest.sh | 283 ++- cryptest.vcxproj | 6 + cryptlib.vcxproj | 14 + cryptlib.vcxproj.filters | 24 + dlltest.vcxproj | 6 + dlltest.vcxproj.filters | 14 + gcm-simd.cpp | 610 ++++++ gcm.cpp | 522 +---- neon.cpp | 108 + rijndael-simd.cpp | 705 +++++++ rijndael.cpp | 476 +---- rijndael.h | 8 +- sha-simd.cpp | 961 +++++++++ sha.cpp | 933 +-------- shacal2-simd.cpp | 111 + shacal2.cpp | 84 +- validat1.cpp | 13 +- whrlpool.cpp | 2 +- 38 files changed, 6521 insertions(+), 6637 deletions(-) mode change 100644 => 100755 TestScripts/change-version.sh create mode 100644 aria-simd.cpp create mode 100644 ariatab.cpp create mode 100644 blake2-simd.cpp create mode 100644 crc-simd.cpp create mode 100644 dlltest.vcxproj.filters create mode 100644 gcm-simd.cpp create mode 100644 neon.cpp create mode 100644 rijndael-simd.cpp create mode 100644 sha-simd.cpp create mode 100644 shacal2-simd.cpp diff --git a/.appveyor.yml b/.appveyor.yml index 0a43a0492..2ef03f510 100644 --- a/.appveyor.yml +++ b/.appveyor.yml @@ -3,7 +3,7 @@ # I have to admit its a bit complex and I don't fully understand it. version: 1.0.{build} -clone_depth: 1 +clone_depth: 3 skip_tags: true configuration: diff --git a/Filelist.txt b/Filelist.txt index f43b8a17f..c9a1a8c51 100644 --- a/Filelist.txt +++ b/Filelist.txt @@ -10,7 +10,9 @@ algparam.cpp algparam.h arc4.cpp arc4.h +ariatab.cpp aria.cpp +aria-simd.cpp aria.h argnames.h asn.cpp @@ -29,6 +31,7 @@ bench1.cpp bench2.cpp bfinit.cpp blake2.cpp +blake2-simd.cpp blake2.h blowfish.cpp blowfish.h @@ -53,6 +56,7 @@ config.h cpu.cpp cpu.h crc.cpp +crc-simd.cpp crc.h cryptdll.vcxproj cryptdll.vcxproj.filters @@ -177,6 +181,7 @@ mqv.cpp mqv.h nbtheory.cpp nbtheory.h +neon.cpp network.cpp network.h nr.h @@ -225,6 +230,7 @@ regtest2.cpp regtest3.cpp resource.h rijndael.cpp +rijndael-simd.cpp rijndael.h ripemd.cpp ripemd.h @@ -248,10 +254,12 @@ serpent.cpp serpent.h serpentp.h sha.cpp +sha-simd.cpp sha.h sha3.cpp sha3.h shacal2.cpp +shacal2-simd.cpp shacal2.h shark.cpp shark.h diff --git a/GNUmakefile b/GNUmakefile index ddc99dc55..702f49d6e 100755 --- a/GNUmakefile +++ b/GNUmakefile @@ -2,6 +2,12 @@ ##### System Attributes and Programs ##### ########################################################### +# If needed +TMPDIR ?= /tmp +# Used for ARMv7 and NEON. 
+FP_ABI ?= hard + +# Command ard arguments AR ?= ar ARFLAGS ?= -cr # ar needs the dash on OpenBSD RANLIB ?= ranlib @@ -19,8 +25,10 @@ UNAME := $(shell uname) IS_X86 := $(shell uname -m | $(EGREP) -v "x86_64" | $(EGREP) -i -c "i.86|x86|i86") IS_X64 := $(shell uname -m | $(EGREP) -i -c "(_64|d64)") IS_PPC := $(shell uname -m | $(EGREP) -i -c "ppc|power") -IS_ARM32 := $(shell uname -m | $(EGREP) -i -c "arm") +IS_ARM32 := $(shell uname -m | $(EGREP) -v "arm64" | $(EGREP) -i -c "arm") IS_ARM64 := $(shell uname -m | $(EGREP) -i -c "aarch64") +IS_ARMV8 ?= $(shell uname -m | $(EGREP) -i -c 'aarch32|aarch64') +IS_NEON ?= $(shell uname -m | $(EGREP) -i -c 'armv7|armv8|aarch32|aarch64') IS_SPARC := $(shell uname -m | $(EGREP) -i -c "sparc") IS_SPARC64 := $(shell uname -m | $(EGREP) -i -c "sparc64") @@ -191,6 +199,21 @@ endif # -DCRYPTOPP_DISABLE_SSSE3 endif # -DCRYPTOPP_DISABLE_ASM endif # CXXFLAGS +SSSE3_FLAG = $(shell echo | $(CXX) -x c++ $(CXXFLAGS) -mssse3 -dM -E - 2>/dev/null | grep -i -c -q __SSSE3__ && echo "-mssse3") +ARIA_FLAG = $(SSSE3_FLAG) +ifeq ($(findstring -DCRYPTOPP_DISABLE_SSE4,$(CXXFLAGS)),) +SSE42_FLAG = $(shell echo | $(CXX) -x c++ $(CXXFLAGS) -msse4.2 -dM -E - 2>/dev/null | grep -i -c -q __SSE4_2__ && echo "-msse4.2") +ifeq ($(findstring -DCRYPTOPP_DISABLE_AESNI,$(CXXFLAGS)),) +GCM_FLAG = $(shell echo | $(CXX) -x c++ $(CXXFLAGS) -mssse3 -mpclmul -dM -E - 2>/dev/null | grep -i -c -q __PCLMUL__ && echo "-mssse3 -mpclmul") +AES_FLAG = $(shell echo | $(CXX) -x c++ $(CXXFLAGS) -msse4.1 -maes -dM -E - 2>/dev/null | grep -i -c -q __AES__ && echo "-msse4.1 -maes") +ifeq ($(findstring -DCRYPTOPP_DISABLE_SHA,$(CXXFLAGS)),) +SHA_FLAG = $(shell echo | $(CXX) -x c++ $(CXXFLAGS) -msse4.2 -msha -dM -E - 2>/dev/null | grep -i -c -q __SHA__ && echo "-msse4.2 -msha") +BLAKE2_FLAG = $(SSE42_FLAG) +CRC_FLAG = $(SSE42_FLAG) +endif +endif +endif + # BEGIN_NATIVE_ARCH # Guard use of -march=native (or -m{32|64} on some platforms) # Don't add anything if -march=XXX or -mtune=XXX is specified @@ -280,6 +303,26 @@ CXXFLAGS += -pipe endif endif +ifeq ($(IS_NEON),1) + NEON_FLAG = $(shell echo | $(CXX) -x c++ $(CXXFLAGS) -march=armv7-a -mfloat-abi=$(FP_ABI) -mfpu=neon -dM -E - 2>/dev/null | grep -i -c -q __ARM_NEON && echo "-march=armv7-a -mfloat-abi=$(FP_ABI) -mfpu=neon") + GCM_FLAG = $(NEON_FLAG) + ARIA_FLAG = $(NEON_FLAG) + BLAKE2_FLAG = $(NEON_FLAG) +endif + +ifeq ($(IS_ARMV8),1) + ARMV8A_NEON_FLAG = $(shell echo | $(CXX) -x c++ $(CXXFLAGS) -march=armv8-a -dM -E - 2>/dev/null | grep -i -c -q __ARM_NEON && echo "-march=armv8-a") + ARMV8A_CRC_FLAG = $(shell echo | $(CXX) -x c++ $(CXXFLAGS) -march=armv8-a+crc -dM -E - 2>/dev/null | grep -i -c -q __ARM_FEATURE_CRC32 && echo "-march=armv8-a+crc") + ARMV8A_CRYPTO_FLAG = $(shell echo | $(CXX) -x c++ $(CXXFLAGS) -march=armv8-a+crypto -dM -E - 2>/dev/null | grep -i -c -q __ARM_FEATURE_CRYPTO && echo "-march=armv8-a+crypto") + CRC_FLAG = $(ARMV8A_CRC_FLAG) + AES_FLAG = $(ARMV8A_CRYPTO_FLAG) + GCM_FLAG = $(ARMV8A_CRYPTO_FLAG) + SHA_FLAG = $(ARMV8A_CRYPTO_FLAG) + ARIA_FLAG = $(ARMV8A_NEON_FLAG) + BLAKE2_FLAG = $(ARMV8A_NEON_FLAG) + NEON_FLAG = $(ARMV8A_NEON_FLAG) +endif + endif # IS_X86 ########################################################### @@ -287,7 +330,7 @@ endif # IS_X86 ########################################################### # For SunOS, create a Mapfile that allows our object files -# to cantain additional bits (like SSE4 and AES on old Xeon) +# to contain additional bits (like SSE4 and AES on old Xeon) # 
http://www.oracle.com/technetwork/server-storage/solaris/hwcap-modification-139536.html ifeq ($(IS_SUN)$(SUN_COMPILER),11) ifneq ($(IS_X86)$(IS_X32)$(IS_X64),000) @@ -527,12 +570,13 @@ endif endif # Nasm # List test.cpp first to tame C++ static initialization problems. -TESTSRCS := adhoc.cpp test.cpp bench1.cpp bench2.cpp validat0.cpp validat1.cpp validat2.cpp validat3.cpp datatest.cpp regtest1.cpp regtest2.cpp regtest3.cpp fipsalgt.cpp dlltest.cpp +TESTSRCS := adhoc.cpp test.cpp bench1.cpp bench2.cpp validat0.cpp validat1.cpp validat2.cpp validat3.cpp datatest.cpp regtest1.cpp regtest2.cpp regtest3.cpp dlltest.cpp fipsalgt.cpp TESTOBJS := $(TESTSRCS:.cpp=.o) LIBOBJS := $(filter-out $(TESTOBJS),$(OBJS)) # List cryptlib.cpp first, then cpu.cpp, then integer.cpp to tame C++ static initialization problems. -DLLSRCS := cryptlib.cpp cpu.cpp integer.cpp 3way.cpp adler32.cpp algebra.cpp algparam.cpp arc4.cpp aria.cpp asn.cpp authenc.cpp base32.cpp base64.cpp basecode.cpp bfinit.cpp blake2.cpp blowfish.cpp blumshub.cpp camellia.cpp cast.cpp casts.cpp cbcmac.cpp ccm.cpp chacha.cpp channels.cpp cmac.cpp crc.cpp default.cpp des.cpp dessp.cpp dh.cpp dh2.cpp dll.cpp dsa.cpp eax.cpp ec2n.cpp eccrypto.cpp ecp.cpp elgamal.cpp emsa2.cpp eprecomp.cpp esign.cpp files.cpp filters.cpp fips140.cpp fipstest.cpp gcm.cpp gf256.cpp gf2_32.cpp gf2n.cpp gfpcrypt.cpp gost.cpp gzip.cpp hex.cpp hmac.cpp hrtimer.cpp ida.cpp idea.cpp iterhash.cpp kalyna.cpp kalynatab.cpp keccak.cpp luc.cpp mars.cpp marss.cpp md2.cpp md4.cpp md5.cpp misc.cpp modes.cpp mqueue.cpp mqv.cpp nbtheory.cpp network.cpp oaep.cpp ospstore.cpp osrng.cpp panama.cpp pkcspad.cpp poly1305.cpp polynomi.cpp pssr.cpp pubkey.cpp queue.cpp rabin.cpp randpool.cpp rc2.cpp rc5.cpp rc6.cpp rdrand.cpp rdtables.cpp rijndael.cpp ripemd.cpp rng.cpp rsa.cpp rw.cpp safer.cpp salsa.cpp seal.cpp seed.cpp serpent.cpp sha.cpp sha3.cpp shacal2.cpp shark.cpp sharkbox.cpp skipjack.cpp socketft.cpp sosemanuk.cpp square.cpp squaretb.cpp strciphr.cpp tea.cpp tftables.cpp threefish.cpp tiger.cpp tigertab.cpp trdlocal.cpp ttmac.cpp twofish.cpp vmac.cpp wait.cpp wake.cpp whrlpool.cpp xtr.cpp xtrcrypt.cpp zdeflate.cpp zinflate.cpp zlib.cpp +DLLSRCS := cryptlib.cpp cpu.cpp integer.cpp 3way.cpp adler32.cpp algebra.cpp algparam.cpp arc4.cpp aria-simd.cpp aria.cpp ariatab.cpp asn.cpp authenc.cpp base32.cpp base64.cpp basecode.cpp bfinit.cpp blake2-simd.cpp blake2.cpp blowfish.cpp blumshub.cpp camellia.cpp cast.cpp casts.cpp cbcmac.cpp ccm.cpp chacha.cpp channels.cpp cmac.cpp crc-simd.cpp crc.cpp default.cpp des.cpp dessp.cpp dh.cpp dh2.cpp dll.cpp dsa.cpp eax.cpp ec2n.cpp eccrypto.cpp ecp.cpp elgamal.cpp emsa2.cpp eprecomp.cpp esign.cpp files.cpp filters.cpp fips140.cpp fipstest.cpp gcm-simd.cpp gcm.cpp gf256.cpp gf2_32.cpp gf2n.cpp gfpcrypt.cpp gost.cpp gzip.cpp hex.cpp hmac.cpp hrtimer.cpp ida.cpp idea.cpp iterhash.cpp kalyna.cpp kalynatab.cpp keccak.cpp luc.cpp mars.cpp marss.cpp md2.cpp md4.cpp md5.cpp misc.cpp modes.cpp mqueue.cpp mqv.cpp nbtheory.cpp neon.cpp network.cpp oaep.cpp ospstore.cpp osrng.cpp panama.cpp pkcspad.cpp poly1305.cpp polynomi.cpp pssr.cpp pubkey.cpp queue.cpp rabin.cpp randpool.cpp rc2.cpp rc5.cpp rc6.cpp rdrand.cpp rdtables.cpp rijndael.cpp ripemd.cpp rng.cpp rsa.cpp rw.cpp safer.cpp salsa.cpp seal.cpp seed.cpp serpent.cpp sha-simd.cpp sha.cpp sha3.cpp shacal2-simd.cpp shacal2.cpp shark.cpp sharkbox.cpp skipjack.cpp socketft.cpp sosemanuk.cpp square.cpp squaretb.cpp strciphr.cpp tea.cpp tftables.cpp threefish.cpp tiger.cpp tigertab.cpp trdlocal.cpp ttmac.cpp 
twofish.cpp vmac.cpp wait.cpp wake.cpp whrlpool.cpp xtr.cpp xtrcrypt.cpp zdeflate.cpp zinflate.cpp zlib.cpp + DLLOBJS := $(DLLSRCS:.cpp=.export.o) # Import lib testing @@ -555,8 +599,8 @@ static: libcryptopp.a shared dynamic: libcryptopp.so$(SOLIB_VERSION_SUFFIX) endif -.PHONY: deps -deps GNUmakefile.deps: +.PHONY: dep deps depend +dep deps depend GNUmakefile.deps: $(CXX) $(strip $(CXXFLAGS)) -MM *.cpp > GNUmakefile.deps # CXXFLAGS are tuned earlier. @@ -752,13 +796,8 @@ ifeq ($(wildcard Filelist.txt),Filelist.txt) DIST_FILES := $(shell cat Filelist.txt) endif -.PHONY: appveyor -appveyor: - sed 's|Toolset>v100|Toolset>$$(DefaultPlatformToolset)|g' cryptlib.vcxproj > TestScripts/cryptlib.vcxproj - sed 's|Toolset>v100|Toolset>$$(DefaultPlatformToolset)|g' cryptest.vcxproj > TestScripts/cryptest.vcxproj - .PHONY: trim -trim: appveyor +trim: ifneq ($(IS_DARWIN),0) sed -i '' -e's/[[:space:]]*$$//' *.sh .*.yml *.h *.cpp *.asm *.s *.sln *.vcxproj *.filters GNUmakefile GNUmakefile-cross sed -i '' -e's/[[:space:]]*$$//' TestData/*.dat TestVectors/*.txt TestScripts/*.* @@ -823,58 +862,90 @@ endif # Dependencies # Run rdrand-nasm.sh to create the object files ifeq ($(USE_NASM),1) rdrand.o: rdrand.h rdrand.cpp rdrand.s - $(CXX) $(strip $(CXXFLAGS)) -DNASM_RDRAND_ASM_AVAILABLE=1 -DNASM_RDSEED_ASM_AVAILABLE=1 -c rdrand.cpp + $(CXX) $(strip $(CXXFLAGS) -DNASM_RDRAND_ASM_AVAILABLE=1 -DNASM_RDSEED_ASM_AVAILABLE=1 -c rdrand.cpp) rdrand-%.o: ./rdrand-nasm.sh endif +# SSE4.2 or NEON available +aria-simd.o : aria-simd.cpp + $(CXX) $(strip $(CXXFLAGS) $(ARIA_FLAG) -c) $< + +# SSE4.2 or NEON available +neon.o : neon.cpp + $(CXX) $(strip $(CXXFLAGS) $(NEON_FLAG) -c) $< + +# SSE4.2 or ARMv8a available +blake2-simd.o : blake2-simd.cpp + $(CXX) $(strip $(CXXFLAGS) $(BLAKE2_FLAG) -c) $< + +# SSE4.2 or ARMv8a available +crc-simd.o : crc-simd.cpp + $(CXX) $(strip $(CXXFLAGS) $(CRC_FLAG) -c) $< + +# PCLMUL or ARMv7a/ARMv8a available +gcm-simd.o : gcm-simd.cpp + $(CXX) $(strip $(CXXFLAGS) $(GCM_FLAG) -c) $< + +# AESNI or ARMv7a/ARMv8a available +rijndael-simd.o : rijndael-simd.cpp + $(CXX) $(strip $(CXXFLAGS) $(AES_FLAG) -c) $< + +# SSE4.2/SHA-NI or ARMv8a available +sha-simd.o : sha-simd.cpp + $(CXX) $(strip $(CXXFLAGS) $(SHA_FLAG) -c) $< + +# SSE4.2/SHA-NI or ARMv8a available +shacal2-simd.o : shacal2-simd.cpp + $(CXX) $(strip $(CXXFLAGS) $(SHA_FLAG) -c) $< + # Don't build Threefish with UBsan on Travis CI. Timeouts cause the build to fail. # Also see https://stackoverflow.com/q/12983137/608639. ifeq ($(findstring true,$(CI)),true) threefish.o : threefish.cpp - $(CXX) $(strip $(subst -fsanitize=undefined,,$(CXXFLAGS))) -c $< + $(CXX) $(strip $(subst -fsanitize=undefined,,$(CXXFLAGS)) -c) $< endif # Don't build Rijndael with UBsan. Too much noise due to unaligned data accesses. ifneq ($(findstring -fsanitize=undefined,$(CXXFLAGS)),) rijndael.o : rijndael.cpp - $(CXX) $(strip $(subst -fsanitize=undefined,,$(CXXFLAGS))) -c $< + $(CXX) $(strip $(subst -fsanitize=undefined,,$(CXXFLAGS)) -c) $< endif # Don't build VMAC and friends with Asan. Too many false positives. 
ifneq ($(findstring -fsanitize=address,$(CXXFLAGS)),) vmac.o : vmac.cpp - $(CXX) $(strip $(subst -fsanitize=address,,$(CXXFLAGS))) -c $< + $(CXX) $(strip $(subst -fsanitize=address,,$(CXXFLAGS)) -c) $< endif # Only use CRYPTOPP_DATA_DIR if its not set in CXXFLAGS ifeq ($(findstring -DCRYPTOPP_DATA_DIR, $(strip $(CXXFLAGS))),) ifneq ($(strip $(CRYPTOPP_DATA_DIR)),) validat%.o : validat%.cpp - $(CXX) $(strip $(CXXFLAGS)) -DCRYPTOPP_DATA_DIR=\"$(CRYPTOPP_DATA_DIR)\" -c $< + $(CXX) $(strip $(CXXFLAGS) -DCRYPTOPP_DATA_DIR=\"$(CRYPTOPP_DATA_DIR)\" -c) $< bench%.o : bench%.cpp - $(CXX) $(strip $(CXXFLAGS)) -DCRYPTOPP_DATA_DIR=\"$(CRYPTOPP_DATA_DIR)\" -c $< + $(CXX) $(strip $(CXXFLAGS) -DCRYPTOPP_DATA_DIR=\"$(CRYPTOPP_DATA_DIR)\" -c) $< datatest.o : datatest.cpp - $(CXX) $(strip $(CXXFLAGS)) -DCRYPTOPP_DATA_DIR=\"$(CRYPTOPP_DATA_DIR)\" -c $< + $(CXX) $(strip $(CXXFLAGS) -DCRYPTOPP_DATA_DIR=\"$(CRYPTOPP_DATA_DIR)\" -c) $< test.o : test.cpp - $(CXX) $(strip $(CXXFLAGS)) -DCRYPTOPP_DATA_DIR=\"$(CRYPTOPP_DATA_DIR)\" -c $< + $(CXX) $(strip $(CXXFLAGS) -DCRYPTOPP_DATA_DIR=\"$(CRYPTOPP_DATA_DIR)\" -c) $< endif endif %.dllonly.o : %.cpp - $(CXX) $(strip $(CXXFLAGS)) -DCRYPTOPP_DLL_ONLY -c $< -o $@ + $(CXX) $(strip $(CXXFLAGS) -DCRYPTOPP_DLL_ONLY -c) $< -o $@ %.import.o : %.cpp - $(CXX) $(strip $(CXXFLAGS)) -DCRYPTOPP_IMPORTS -c $< -o $@ + $(CXX) $(strip $(CXXFLAGS) -DCRYPTOPP_IMPORTS -c) $< -o $@ %.export.o : %.cpp - $(CXX) $(strip $(CXXFLAGS)) -DCRYPTOPP_EXPORTS -c $< -o $@ + $(CXX) $(strip $(CXXFLAGS) -DCRYPTOPP_EXPORTS -c) $< -o $@ %.bc : %.cpp - $(CXX) $(strip $(CXXFLAGS)) -c $< + $(CXX) $(strip $(CXXFLAGS) -c) $< %.o : %.cpp - $(CXX) $(strip $(CXXFLAGS)) -c $< + $(CXX) $(strip $(CXXFLAGS) -c) $< .PHONY: so_warning so_warning: diff --git a/TestScripts/change-version.sh b/TestScripts/change-version.sh old mode 100644 new mode 100755 diff --git a/TestScripts/cryptest.sh b/TestScripts/cryptest.sh index 28f79be5e..f441c963e 100755 --- a/TestScripts/cryptest.sh +++ b/TestScripts/cryptest.sh @@ -1171,37 +1171,75 @@ if [[ ("$HAVE_DISASS" -ne "0" && ("$IS_X86" -ne "0" || "$IS_X64" -ne "0")) ]]; t echo OBJFILE=sha.o; rm -f "$OBJFILE" 2>/dev/null - CXX="$CXX" CXXFLAGS="$RELEASE_CXXFLAGS -DDISABLE_NATIVE_ARCH=1 -msse -msse2" "$MAKE" "${MAKEARGS[@]}" $OBJFILE 2>&1 | tee -a "$TEST_RESULTS" + CXX="$CXX" CXXFLAGS="$RELEASE_CXXFLAGS -DDISABLE_NATIVE_ARCH=1" "$MAKE" "${MAKEARGS[@]}" $OBJFILE 2>&1 | tee -a "$TEST_RESULTS" DISASS_TEXT=$("$DISASS" "${DISASSARGS[@]}" "$OBJFILE" 2>/dev/null) X86_SSE2=$(echo -n "$X86_CPU_FLAGS" | "$GREP" -i -c sse2) - X86_SHA256_HASH_BLOCKS=$(echo -n "$DISASS_TEXT" | "$EGREP" -c 'X86_SHA256_HashBlocks') + X86_SHA256_HASH_BLOCKS=$(echo -n "$DISASS_TEXT" | "$EGREP" -c 'SHA256_HashMultipleBlocks_SSE2') if [[ ("$X86_SHA256_HASH_BLOCKS" -ne "0") ]]; then COUNT=$(echo -n "$DISASS_TEXT" | "$EGREP" -i -c '(rol.*0x|ror.*0x)') - if [[ ("$COUNT" -le "600") ]]; then + if [[ ("$COUNT" -le "250") ]]; then FAILED=1 - echo "ERROR: failed to generate rotate immediate instruction (X86_SHA256_HashBlocks)" | tee -a "$TEST_RESULTS" + echo "ERROR: failed to generate rotate immediate instruction (SHA256_HashMultipleBlocks_SSE2)" | tee -a "$TEST_RESULTS" fi else COUNT=$(echo -n "$DISASS_TEXT" | "$EGREP" -i -c '(rol.*0x|ror.*0x)') - if [[ ("$COUNT" -le "1000") ]]; then + if [[ ("$COUNT" -le "500") ]]; then FAILED=1 echo "ERROR: failed to generate rotate immediate instruction" | tee -a "$TEST_RESULTS" fi fi if [[ ("$X86_SSE2" -ne "0" && "$X86_SHA256_HASH_BLOCKS" -eq "0") ]]; then - echo "ERROR: failed to use 
X86_SHA256_HashBlocks" | tee -a "$TEST_RESULTS" + echo "ERROR: failed to use SHA256_HashMultipleBlocks_SSE2" | tee -a "$TEST_RESULTS" fi if [[ ("$FAILED" -eq "0" && "$X86_SHA256_HASH_BLOCKS" -ne "0") ]]; then - echo "Verified rotate immediate machine instructions (X86_SHA256_HashBlocks)" | tee -a "$TEST_RESULTS" + echo "Verified rotate immediate machine instructions (SHA256_HashMultipleBlocks_SSE2)" | tee -a "$TEST_RESULTS" elif [[ ("$FAILED" -eq "0") ]]; then echo "Verified rotate immediate machine instructions" | tee -a "$TEST_RESULTS" fi fi + ############################################ + # Test CRC-32C code generation + + "$CXX" -DCRYPTOPP_ADHOC_MAIN -msse4.2 adhoc.cpp -o "$TMPDIR/adhoc.exe" > /dev/null 2>&1 + if [[ "$?" -eq "0" ]]; then + X86_CRC32=1 + fi + + if [[ ("$X86_CRC32" -ne "0") ]]; then + echo + echo "************************************" | tee -a "$TEST_RESULTS" + echo "Testing: X86 CRC32 code generation" | tee -a "$TEST_RESULTS" + echo + + OBJFILE=crc-simd.o; rm -f "$OBJFILE" 2>/dev/null + CXX="$CXX" CXXFLAGS="$RELEASE_CXXFLAGS -DDISABLE_NATIVE_ARCH=1" "$MAKE" "${MAKEARGS[@]}" $OBJFILE 2>&1 | tee -a "$TEST_RESULTS" + + COUNT=0 + FAILED=0 + DISASS_TEXT=$("$DISASS" "${DISASSARGS[@]}" "$OBJFILE" 2>/dev/null) + + COUNT=$(echo -n "$DISASS_TEXT" | "$GREP" -i -c crc32b) + if [[ ("$COUNT" -eq "0") ]]; then + FAILED=1 + echo "ERROR: failed to generate crc32b instruction" | tee -a "$TEST_RESULTS" + fi + + COUNT=$(echo -n "$DISASS_TEXT" | "$GREP" -i -c crc32l) + if [[ ("$COUNT" -eq "0") ]]; then + FAILED=1 + echo "ERROR: failed to generate crc32l instruction" | tee -a "$TEST_RESULTS" + fi + + if [[ ("$FAILED" -eq "0") ]]; then + echo "Verified crc32b and crc32l machine instructions" | tee -a "$TEST_RESULTS" + fi + fi + ############################################ # Test AES-NI code generation @@ -1216,8 +1254,8 @@ if [[ ("$HAVE_DISASS" -ne "0" && ("$IS_X86" -ne "0" || "$IS_X64" -ne "0")) ]]; t echo "Testing: X86 AES-NI code generation" | tee -a "$TEST_RESULTS" echo - OBJFILE=rijndael.o; rm -f "$OBJFILE" 2>/dev/null - CXX="$CXX" CXXFLAGS="$RELEASE_CXXFLAGS -DDISABLE_NATIVE_ARCH=1 -msse -msse2" "$MAKE" "${MAKEARGS[@]}" $OBJFILE 2>&1 | tee -a "$TEST_RESULTS" + OBJFILE=rijndael-simd.o; rm -f "$OBJFILE" 2>/dev/null + CXX="$CXX" CXXFLAGS="$RELEASE_CXXFLAGS -DDISABLE_NATIVE_ARCH=1" "$MAKE" "${MAKEARGS[@]}" $OBJFILE 2>&1 | tee -a "$TEST_RESULTS" COUNT=0 FAILED=0 @@ -1278,8 +1316,8 @@ if [[ ("$HAVE_DISASS" -ne "0" && ("$IS_X86" -ne "0" || "$IS_X64" -ne "0")) ]]; t echo "Testing: X86 carryless multiply code generation" | tee -a "$TEST_RESULTS" echo - OBJFILE=gcm.o; rm -f "$OBJFILE" 2>/dev/null - CXX="$CXX" CXXFLAGS="$RELEASE_CXXFLAGS -DDISABLE_NATIVE_ARCH=1 -msse -msse2" "$MAKE" "${MAKEARGS[@]}" $OBJFILE 2>&1 | tee -a "$TEST_RESULTS" + OBJFILE=gcm-simd.o; rm -f "$OBJFILE" 2>/dev/null + CXX="$CXX" CXXFLAGS="$RELEASE_CXXFLAGS -DDISABLE_NATIVE_ARCH=1" "$MAKE" "${MAKEARGS[@]}" $OBJFILE 2>&1 | tee -a "$TEST_RESULTS" COUNT=0 FAILED=0 @@ -1321,7 +1359,7 @@ if [[ ("$HAVE_DISASS" -ne "0" && ("$IS_X86" -ne "0" || "$IS_X64" -ne "0")) ]]; t echo OBJFILE=rdrand.o; rm -f "$OBJFILE" 2>/dev/null - CXX="$CXX" CXXFLAGS="$RELEASE_CXXFLAGS -DDISABLE_NATIVE_ARCH=1 -msse -msse2" "$MAKE" "${MAKEARGS[@]}" $OBJFILE 2>&1 | tee -a "$TEST_RESULTS" + CXX="$CXX" CXXFLAGS="$RELEASE_CXXFLAGS -DDISABLE_NATIVE_ARCH=1" "$MAKE" "${MAKEARGS[@]}" $OBJFILE 2>&1 | tee -a "$TEST_RESULTS" COUNT=0 FAILED=0 @@ -1348,44 +1386,6 @@ if [[ ("$HAVE_DISASS" -ne "0" && ("$IS_X86" -ne "0" || "$IS_X64" -ne "0")) ]]; t fi fi - 
############################################ - # X86 CRC32 code generation - - "$CXX" -DCRYPTOPP_ADHOC_MAIN -msse4.2 adhoc.cpp -o "$TMPDIR/adhoc.exe" > /dev/null 2>&1 - if [[ "$?" -eq "0" ]]; then - X86_CRC32=1 - fi - - if [[ ("$X86_CRC32" -ne "0") ]]; then - echo - echo "************************************" | tee -a "$TEST_RESULTS" - echo "Testing: X86 CRC32 code generation" | tee -a "$TEST_RESULTS" - echo - - OBJFILE=crc.o; rm -f "$OBJFILE" 2>/dev/null - CXX="$CXX" CXXFLAGS="$RELEASE_CXXFLAGS -DDISABLE_NATIVE_ARCH=1 -msse -msse2" "$MAKE" "${MAKEARGS[@]}" $OBJFILE 2>&1 | tee -a "$TEST_RESULTS" - - COUNT=0 - FAILED=0 - DISASS_TEXT=$("$DISASS" "${DISASSARGS[@]}" "$OBJFILE" 2>/dev/null) - - COUNT=$(echo -n "$DISASS_TEXT" | "$GREP" -i -c crc32l) - if [[ ("$COUNT" -eq "0") ]]; then - FAILED=1 - echo "ERROR: failed to generate crc32l instruction" | tee -a "$TEST_RESULTS" - fi - - COUNT=$(echo -n "$DISASS_TEXT" | "$GREP" -i -c crc32b) - if [[ ("$COUNT" -eq "0") ]]; then - FAILED=1 - echo "ERROR: failed to generate crc32b instruction" | tee -a "$TEST_RESULTS" - fi - - if [[ ("$FAILED" -eq "0") ]]; then - echo "Verified crc32l and crc32b machine instructions" | tee -a "$TEST_RESULTS" - fi - fi - ############################################ # X86 SHA code generation @@ -1400,8 +1400,8 @@ if [[ ("$HAVE_DISASS" -ne "0" && ("$IS_X86" -ne "0" || "$IS_X64" -ne "0")) ]]; t echo "Testing: X86 SHA code generation" | tee -a "$TEST_RESULTS" echo - OBJFILE=sha.o; rm -f "$OBJFILE" 2>/dev/null - CXX="$CXX" CXXFLAGS="$RELEASE_CXXFLAGS -DDISABLE_NATIVE_ARCH=1 -msse -msse2" "$MAKE" "${MAKEARGS[@]}" $OBJFILE 2>&1 | tee -a "$TEST_RESULTS" + OBJFILE=sha-simd.o; rm -f "$OBJFILE" 2>/dev/null + CXX="$CXX" CXXFLAGS="$RELEASE_CXXFLAGS -DDISABLE_NATIVE_ARCH=1" "$MAKE" "${MAKEARGS[@]}" $OBJFILE 2>&1 | tee -a "$TEST_RESULTS" COUNT=0 FAILED=0 @@ -1469,7 +1469,7 @@ if [[ ("$HAVE_DISASS" -ne "0" && ("$IS_ARM32" -ne "0" || "$IS_ARM64" -ne "0")) ] echo "Testing: ARM NEON code generation" | tee -a "$TEST_RESULTS" echo - OBJFILE=aria.o; rm -f "$OBJFILE" 2>/dev/null + OBJFILE=aria-simd.o; rm -f "$OBJFILE" 2>/dev/null CXX="$CXX" CXXFLAGS="$RELEASE_CXXFLAGS -DDISABLE_NATIVE_ARCH=1" "$MAKE" "${MAKEARGS[@]}" $OBJFILE 2>&1 | tee -a "$TEST_RESULTS" COUNT=0 @@ -1516,17 +1516,71 @@ if [[ ("$HAVE_DISASS" -ne "0" && ("$IS_ARM32" -ne "0" || "$IS_ARM64" -ne "0")) ] fi fi + ############################################ + # ARM CRC32 code generation + + "$CXX" -DCRYPTOPP_ADHOC_MAIN -march=armv8-a+crc adhoc.cpp -o "$TMPDIR/adhoc.exe" > /dev/null 2>&1 + if [[ "$?" 
-eq "0" ]]; then + ARM_CRC32=1 + fi + + if [[ ("$HAVE_ARMV8A" -ne "0" && "$ARM_CRC32" -ne "0") ]]; then + echo + echo "************************************" | tee -a "$TEST_RESULTS" + echo "Testing: ARM CRC32 code generation" | tee -a "$TEST_RESULTS" + echo + + OBJFILE=crc-simd.o; rm -f "$OBJFILE" 2>/dev/null + CXX="$CXX" CXXFLAGS="$RELEASE_CXXFLAGS -DDISABLE_NATIVE_ARCH=1" "$MAKE" "${MAKEARGS[@]}" $OBJFILE 2>&1 | tee -a "$TEST_RESULTS" + + COUNT=0 + FAILED=0 + DISASS_TEXT=$("$DISASS" "${DISASSARGS[@]}" "$OBJFILE" 2>/dev/null) + + COUNT=$(echo -n "$DISASS_TEXT" | "$GREP" -i -c crc32cb) + if [[ ("$COUNT" -eq "0") ]]; then + FAILED=1 + echo "ERROR: failed to generate crc32cb instruction" | tee -a "$TEST_RESULTS" + fi + + COUNT=$(echo -n "$DISASS_TEXT" | "$GREP" -i -c crc32cw) + if [[ ("$COUNT" -eq "0") ]]; then + FAILED=1 + echo "ERROR: failed to generate crc32cw instruction" | tee -a "$TEST_RESULTS" + fi + + COUNT=$(echo -n "$DISASS_TEXT" | "$GREP" -i -c crc32b) + if [[ ("$COUNT" -eq "0") ]]; then + FAILED=1 + echo "ERROR: failed to generate crc32b instruction" | tee -a "$TEST_RESULTS" + fi + + COUNT=$(echo -n "$DISASS_TEXT" | "$GREP" -i -c crc32w) + if [[ ("$COUNT" -eq "0") ]]; then + FAILED=1 + echo "ERROR: failed to generate crc32w instruction" | tee -a "$TEST_RESULTS" + fi + + if [[ ("$FAILED" -eq "0") ]]; then + echo "Verified crc32cb, crc32cw, crc32b and crc32w machine instructions" | tee -a "$TEST_RESULTS" + fi + fi + ############################################ # ARM carryless multiply code generation - ARM_PMULL=$(echo -n "$ARM_CPU_FLAGS" | "$GREP" -i -c pmull) - if [[ ("$ARM_PMULL" -ne "0" || "$HAVE_ARM_CRYPTO" -ne "0") ]]; then + "$CXX" -DCRYPTOPP_ADHOC_MAIN -march=armv8-a+crypto adhoc.cpp -o "$TMPDIR/adhoc.exe" > /dev/null 2>&1 + if [[ "$?" -eq "0" ]]; then + ARM_PMULL=1 + fi + + if [[ ("$HAVE_ARMV8A" -ne "0" && "$ARM_PMULL" -ne "0") ]]; then echo echo "************************************" | tee -a "$TEST_RESULTS" echo "Testing: ARM carryless multiply code generation" | tee -a "$TEST_RESULTS" echo - OBJFILE=gcm.o; rm -f "$OBJFILE" 2>/dev/null + OBJFILE=gcm-simd.o; rm -f "$OBJFILE" 2>/dev/null CXX="$CXX" CXXFLAGS="$RELEASE_CXXFLAGS -DDISABLE_NATIVE_ARCH=1" "$MAKE" "${MAKEARGS[@]}" $OBJFILE 2>&1 | tee -a "$TEST_RESULTS" COUNT=0 @@ -1549,50 +1603,139 @@ if [[ ("$HAVE_DISASS" -ne "0" && ("$IS_ARM32" -ne "0" || "$IS_ARM64" -ne "0")) ] echo "Verified pmull and pmull2 machine instructions" | tee -a "$TEST_RESULTS" fi fi + ############################################ + # ARM SHA code generation + + "$CXX" -DCRYPTOPP_ADHOC_MAIN -march=armv8-a+crypto adhoc.cpp -o "$TMPDIR/adhoc.exe" > /dev/null 2>&1 + if [[ "$?" 
-eq "0" ]]; then + ARM_AES=1 + fi + + if [[ ("$HAVE_ARMV8A" -ne "0" && "$ARM_AES" -ne "0") ]]; then + echo + echo "************************************" | tee -a "$TEST_RESULTS" + echo "Testing: ARM AES generation" | tee -a "$TEST_RESULTS" + echo + + OBJFILE=rijndael-simd.o; rm -f "$OBJFILE" 2>/dev/null + CXX="$CXX" CXXFLAGS="$RELEASE_CXXFLAGS -DDISABLE_NATIVE_ARCH=1" "$MAKE" "${MAKEARGS[@]}" $OBJFILE 2>&1 | tee -a "$TEST_RESULTS" + + COUNT=0 + FAILED=0 + DISASS_TEXT=$("$DISASS" "${DISASSARGS[@]}" "$OBJFILE" 2>/dev/null) + + COUNT=$(echo -n "$DISASS_TEXT" | "$GREP" -i -c aese) + if [[ ("$COUNT" -eq "0") ]]; then + FAILED=1 + echo "ERROR: failed to generate aese instruction" | tee -a "$TEST_RESULTS" + fi + + COUNT=$(echo -n "$DISASS_TEXT" | "$GREP" -i -c aesmc) + if [[ ("$COUNT" -eq "0") ]]; then + FAILED=1 + echo "ERROR: failed to generate aesmc instruction" | tee -a "$TEST_RESULTS" + fi + + COUNT=$(echo -n "$DISASS_TEXT" | "$GREP" -i -c aesd) + if [[ ("$COUNT" -eq "0") ]]; then + FAILED=1 + echo "ERROR: failed to generate aesd instruction" | tee -a "$TEST_RESULTS" + fi + + COUNT=$(echo -n "$DISASS_TEXT" | "$GREP" -i -c aesimc) + if [[ ("$COUNT" -eq "0") ]]; then + FAILED=1 + echo "ERROR: failed to generate aesimc instruction" | tee -a "$TEST_RESULTS" + fi + + if [[ ("$FAILED" -eq "0") ]]; then + echo "Verified aese, aesd, aesmc, aesimc machine instructions" | tee -a "$TEST_RESULTS" + fi + fi ############################################ - # ARM CRC32 code generation + # ARM SHA code generation - ARM_CRC32=$(echo -n "$ARM_CPU_FLAGS" | "$GREP" -i -c crc32) - if [[ ("$ARM_CRC32" -ne "0") ]]; then + "$CXX" -DCRYPTOPP_ADHOC_MAIN -march=armv8-a+crypto adhoc.cpp -o "$TMPDIR/adhoc.exe" > /dev/null 2>&1 + if [[ "$?" -eq "0" ]]; then + ARM_SHA=1 + fi + + if [[ ("$HAVE_ARMV8A" -ne "0" && "$ARM_SHA" -ne "0") ]]; then echo echo "************************************" | tee -a "$TEST_RESULTS" - echo "Testing: ARM CRC32 code generation" | tee -a "$TEST_RESULTS" + echo "Testing: ARM SHA generation" | tee -a "$TEST_RESULTS" echo - OBJFILE=crc.o; rm -f "$OBJFILE" 2>/dev/null + OBJFILE=sha-simd.o; rm -f "$OBJFILE" 2>/dev/null CXX="$CXX" CXXFLAGS="$RELEASE_CXXFLAGS -DDISABLE_NATIVE_ARCH=1" "$MAKE" "${MAKEARGS[@]}" $OBJFILE 2>&1 | tee -a "$TEST_RESULTS" COUNT=0 FAILED=0 DISASS_TEXT=$("$DISASS" "${DISASSARGS[@]}" "$OBJFILE" 2>/dev/null) - COUNT=$(echo -n "$DISASS_TEXT" | "$GREP" -i -c crc32cb) + COUNT=$(echo -n "$DISASS_TEXT" | "$GREP" -i -c sha1c) if [[ ("$COUNT" -eq "0") ]]; then FAILED=1 - echo "ERROR: failed to generate crc32cb instruction" | tee -a "$TEST_RESULTS" + echo "ERROR: failed to generate sha1c instruction" | tee -a "$TEST_RESULTS" fi - COUNT=$(echo -n "$DISASS_TEXT" | "$GREP" -i -c crc32cw) + COUNT=$(echo -n "$DISASS_TEXT" | "$GREP" -i -c sha1m) if [[ ("$COUNT" -eq "0") ]]; then FAILED=1 - echo "ERROR: failed to generate crc32cw instruction" | tee -a "$TEST_RESULTS" + echo "ERROR: failed to generate sha1m instruction" | tee -a "$TEST_RESULTS" fi - COUNT=$(echo -n "$DISASS_TEXT" | "$GREP" -i -c crc32b) + COUNT=$(echo -n "$DISASS_TEXT" | "$GREP" -i -c sha1p) if [[ ("$COUNT" -eq "0") ]]; then FAILED=1 - echo "ERROR: failed to generate crc32b instruction" | tee -a "$TEST_RESULTS" + echo "ERROR: failed to generate sha1p instruction" | tee -a "$TEST_RESULTS" fi - COUNT=$(echo -n "$DISASS_TEXT" | "$GREP" -i -c crc32w) + COUNT=$(echo -n "$DISASS_TEXT" | "$GREP" -i -c sha1h) if [[ ("$COUNT" -eq "0") ]]; then FAILED=1 - echo "ERROR: failed to generate crc32w instruction" | tee -a "$TEST_RESULTS" + echo 
"ERROR: failed to generate sha1h instruction" | tee -a "$TEST_RESULTS" + fi + + COUNT=$(echo -n "$DISASS_TEXT" | "$GREP" -i -c sha1su0) + if [[ ("$COUNT" -eq "0") ]]; then + FAILED=1 + echo "ERROR: failed to generate sha1su0 instruction" | tee -a "$TEST_RESULTS" + fi + + COUNT=$(echo -n "$DISASS_TEXT" | "$GREP" -i -c sha1su1) + if [[ ("$COUNT" -eq "0") ]]; then + FAILED=1 + echo "ERROR: failed to generate sha1su1 instruction" | tee -a "$TEST_RESULTS" + fi + + COUNT=$(echo -n "$DISASS_TEXT" | "$GREP" -v sha256h2 | "$GREP" -i -c sha256h) + if [[ ("$COUNT" -eq "0") ]]; then + FAILED=1 + echo "ERROR: failed to generate sha256h instruction" | tee -a "$TEST_RESULTS" + fi + + COUNT=$(echo -n "$DISASS_TEXT" | "$GREP" -i -c sha256h2) + if [[ ("$COUNT" -eq "0") ]]; then + FAILED=1 + echo "ERROR: failed to generate sha256h2 instruction" | tee -a "$TEST_RESULTS" + fi + + COUNT=$(echo -n "$DISASS_TEXT" | "$GREP" -i -c sha256su0) + if [[ ("$COUNT" -eq "0") ]]; then + FAILED=1 + echo "ERROR: failed to generate sha256su0 instruction" | tee -a "$TEST_RESULTS" + fi + + COUNT=$(echo -n "$DISASS_TEXT" | "$GREP" -i -c sha256su1) + if [[ ("$COUNT" -eq "0") ]]; then + FAILED=1 + echo "ERROR: failed to generate sha256su1 instruction" | tee -a "$TEST_RESULTS" fi if [[ ("$FAILED" -eq "0") ]]; then - echo "Verified crc32cb, crc32cw, crc32b and crc32w machine instructions" | tee -a "$TEST_RESULTS" + echo "Verified sha1c, sha1m, sha1p, sha1su0, sha1su1, sha256h, sha256h2, sha256su0, sha256su1 machine instructions" | tee -a "$TEST_RESULTS" fi fi fi diff --git a/aria-simd.cpp b/aria-simd.cpp new file mode 100644 index 000000000..51ccecfd8 --- /dev/null +++ b/aria-simd.cpp @@ -0,0 +1,143 @@ +// aria-simd.cpp - written and placed in the public domain by +// Jeffrey Walton, Uri Blumenthal and Marcel Raad. +// +// This source file uses intrinsics to gain access to ARMv7a and +// ARMv8a NEON instructions. A separate source file is needed +// because additional CXXFLAGS are required to enable the +// appropriate instructions sets in some build configurations. 
+ +#include "pch.h" +#include "config.h" +#include "misc.h" + +#if (CRYPTOPP_ARM_NEON_AVAILABLE) +# include "arm_neon.h" +#endif + +#if (CRYPTOPP_SSSE3_AVAILABLE) +# include "tmmintrin.h" +#endif + +NAMESPACE_BEGIN(CryptoPP) +NAMESPACE_BEGIN(ARIATab) + +extern const word32 S1[256]; +extern const word32 S2[256]; +extern const word32 X1[256]; +extern const word32 X2[256]; +extern const word32 KRK[3][4]; + +NAMESPACE_END +NAMESPACE_END + +NAMESPACE_BEGIN(CryptoPP) + +using CryptoPP::ARIATab::S1; +using CryptoPP::ARIATab::S2; +using CryptoPP::ARIATab::X1; +using CryptoPP::ARIATab::X2; +using CryptoPP::ARIATab::KRK; + +#if (CRYPTOPP_ARM_NEON_AVAILABLE) + +template +inline void ARIA_GSRK_NEON(const uint32x4_t X, const uint32x4_t Y, byte RK[16]) +{ + static const unsigned int Q1 = (4-(N/32)) % 4; + static const unsigned int Q2 = (3-(N/32)) % 4; + static const unsigned int R = N % 32; + + vst1q_u32(reinterpret_cast(RK), + veorq_u32(X, veorq_u32( + vshrq_n_u32(vextq_u32(Y, Y, Q1), R), + vshlq_n_u32(vextq_u32(Y, Y, Q2), 32-R)))); +} + +void ARIA_UncheckedSetKey_Schedule_NEON(byte* rk, word32* ws, unsigned int keylen) +{ + const uint32x4_t w0 = vld1q_u32((const uint32_t*)(ws+ 0)); + const uint32x4_t w1 = vld1q_u32((const uint32_t*)(ws+ 8)); + const uint32x4_t w2 = vld1q_u32((const uint32_t*)(ws+12)); + const uint32x4_t w3 = vld1q_u32((const uint32_t*)(ws+16)); + + ARIA_GSRK_NEON<19>(w0, w1, rk + 0); + ARIA_GSRK_NEON<19>(w1, w2, rk + 16); + ARIA_GSRK_NEON<19>(w2, w3, rk + 32); + ARIA_GSRK_NEON<19>(w3, w0, rk + 48); + ARIA_GSRK_NEON<31>(w0, w1, rk + 64); + ARIA_GSRK_NEON<31>(w1, w2, rk + 80); + ARIA_GSRK_NEON<31>(w2, w3, rk + 96); + ARIA_GSRK_NEON<31>(w3, w0, rk + 112); + ARIA_GSRK_NEON<67>(w0, w1, rk + 128); + ARIA_GSRK_NEON<67>(w1, w2, rk + 144); + ARIA_GSRK_NEON<67>(w2, w3, rk + 160); + ARIA_GSRK_NEON<67>(w3, w0, rk + 176); + ARIA_GSRK_NEON<97>(w0, w1, rk + 192); + + if (keylen > 16) + { + ARIA_GSRK_NEON<97>(w1, w2, rk + 208); + ARIA_GSRK_NEON<97>(w2, w3, rk + 224); + + if (keylen > 24) + { + ARIA_GSRK_NEON< 97>(w3, w0, rk + 240); + ARIA_GSRK_NEON<109>(w0, w1, rk + 256); + } + } +} + +void ARIA_ProcessAndXorBlock_Xor_NEON(const byte* xorBlock, byte* outBlock) +{ + vst1q_u32(reinterpret_cast(outBlock), veorq_u32( + vld1q_u32(reinterpret_cast(outBlock)), + vld1q_u32(reinterpret_cast(xorBlock)))); +} + +#endif // CRYPTOPP_ARM_NEON_AVAILABLE + +#if (CRYPTOPP_SSSE3_AVAILABLE) + +inline byte ARIA_BRF(const word32 x, const int y) { + return GETBYTE(x, y); +} + +void ARIA_ProcessAndXorBlock_Xor_SSSE3(const byte* xorBlock, byte* outBlock, const byte *rk, word32 *t) +{ + const __m128i MASK = _mm_set_epi8(12,13,14,15, 8,9,10,11, 4,5,6,7, 0,1,2,3); + + outBlock[ 0] = (byte)(X1[ARIA_BRF(t[0],3)] ); + outBlock[ 1] = (byte)(X2[ARIA_BRF(t[0],2)]>>8); + outBlock[ 2] = (byte)(S1[ARIA_BRF(t[0],1)] ); + outBlock[ 3] = (byte)(S2[ARIA_BRF(t[0],0)] ); + outBlock[ 4] = (byte)(X1[ARIA_BRF(t[1],3)] ); + outBlock[ 5] = (byte)(X2[ARIA_BRF(t[1],2)]>>8); + outBlock[ 6] = (byte)(S1[ARIA_BRF(t[1],1)] ); + outBlock[ 7] = (byte)(S2[ARIA_BRF(t[1],0)] ); + outBlock[ 8] = (byte)(X1[ARIA_BRF(t[2],3)] ); + outBlock[ 9] = (byte)(X2[ARIA_BRF(t[2],2)]>>8); + outBlock[10] = (byte)(S1[ARIA_BRF(t[2],1)] ); + outBlock[11] = (byte)(S2[ARIA_BRF(t[2],0)] ); + outBlock[12] = (byte)(X1[ARIA_BRF(t[3],3)] ); + outBlock[13] = (byte)(X2[ARIA_BRF(t[3],2)]>>8); + outBlock[14] = (byte)(S1[ARIA_BRF(t[3],1)] ); + outBlock[15] = (byte)(S2[ARIA_BRF(t[3],0)] ); + + // 'outBlock' may be unaligned. 
+ _mm_storeu_si128(reinterpret_cast<__m128i*>(outBlock), + _mm_xor_si128(_mm_loadu_si128((const __m128i*)(outBlock)), + _mm_shuffle_epi8(_mm_load_si128((const __m128i*)(rk)), MASK))); + + // 'outBlock' and 'xorBlock' may be unaligned. + if (xorBlock != NULLPTR) + { + _mm_storeu_si128((__m128i*)(outBlock), + _mm_xor_si128( + _mm_loadu_si128((const __m128i*)(outBlock)), + _mm_loadu_si128((const __m128i*)(xorBlock)))); + } +} + +#endif // CRYPTOPP_SSSE3_AVAILABLE + +NAMESPACE_END diff --git a/aria.cpp b/aria.cpp index bfa9ab07a..75ae98f14 100644 --- a/aria.cpp +++ b/aria.cpp @@ -7,175 +7,34 @@ #include "misc.h" #include "cpu.h" -#if CRYPTOPP_BOOL_SSE2_INTRINSICS_AVAILABLE +#if CRYPTOPP_SSE2_AVAILABLE # define CRYPTOPP_ENABLE_ARIA_SSE2_INTRINSICS 1 #endif -#if CRYPTOPP_BOOL_SSSE3_INTRINSICS_AVAILABLE +#if CRYPTOPP_SSSE3_AVAILABLE # define CRYPTOPP_ENABLE_ARIA_SSSE3_INTRINSICS 1 #endif -#if CRYPTOPP_BOOL_NEON_INTRINSICS_AVAILABLE -# define CRYPTOPP_ENABLE_ARIA_NEON_INTRINSICS 1 -#endif +NAMESPACE_BEGIN(CryptoPP) +NAMESPACE_BEGIN(ARIATab) + +extern const word32 S1[256]; +extern const word32 S2[256]; +extern const word32 X1[256]; +extern const word32 X2[256]; +extern const word32 KRK[3][4]; -ANONYMOUS_NAMESPACE_BEGIN - -CRYPTOPP_ALIGN_DATA(16) -const CryptoPP::word32 S1[256]={ - 0x00636363,0x007c7c7c,0x00777777,0x007b7b7b,0x00f2f2f2,0x006b6b6b,0x006f6f6f,0x00c5c5c5, - 0x00303030,0x00010101,0x00676767,0x002b2b2b,0x00fefefe,0x00d7d7d7,0x00ababab,0x00767676, - 0x00cacaca,0x00828282,0x00c9c9c9,0x007d7d7d,0x00fafafa,0x00595959,0x00474747,0x00f0f0f0, - 0x00adadad,0x00d4d4d4,0x00a2a2a2,0x00afafaf,0x009c9c9c,0x00a4a4a4,0x00727272,0x00c0c0c0, - 0x00b7b7b7,0x00fdfdfd,0x00939393,0x00262626,0x00363636,0x003f3f3f,0x00f7f7f7,0x00cccccc, - 0x00343434,0x00a5a5a5,0x00e5e5e5,0x00f1f1f1,0x00717171,0x00d8d8d8,0x00313131,0x00151515, - 0x00040404,0x00c7c7c7,0x00232323,0x00c3c3c3,0x00181818,0x00969696,0x00050505,0x009a9a9a, - 0x00070707,0x00121212,0x00808080,0x00e2e2e2,0x00ebebeb,0x00272727,0x00b2b2b2,0x00757575, - 0x00090909,0x00838383,0x002c2c2c,0x001a1a1a,0x001b1b1b,0x006e6e6e,0x005a5a5a,0x00a0a0a0, - 0x00525252,0x003b3b3b,0x00d6d6d6,0x00b3b3b3,0x00292929,0x00e3e3e3,0x002f2f2f,0x00848484, - 0x00535353,0x00d1d1d1,0x00000000,0x00ededed,0x00202020,0x00fcfcfc,0x00b1b1b1,0x005b5b5b, - 0x006a6a6a,0x00cbcbcb,0x00bebebe,0x00393939,0x004a4a4a,0x004c4c4c,0x00585858,0x00cfcfcf, - 0x00d0d0d0,0x00efefef,0x00aaaaaa,0x00fbfbfb,0x00434343,0x004d4d4d,0x00333333,0x00858585, - 0x00454545,0x00f9f9f9,0x00020202,0x007f7f7f,0x00505050,0x003c3c3c,0x009f9f9f,0x00a8a8a8, - 0x00515151,0x00a3a3a3,0x00404040,0x008f8f8f,0x00929292,0x009d9d9d,0x00383838,0x00f5f5f5, - 0x00bcbcbc,0x00b6b6b6,0x00dadada,0x00212121,0x00101010,0x00ffffff,0x00f3f3f3,0x00d2d2d2, - 0x00cdcdcd,0x000c0c0c,0x00131313,0x00ececec,0x005f5f5f,0x00979797,0x00444444,0x00171717, - 0x00c4c4c4,0x00a7a7a7,0x007e7e7e,0x003d3d3d,0x00646464,0x005d5d5d,0x00191919,0x00737373, - 0x00606060,0x00818181,0x004f4f4f,0x00dcdcdc,0x00222222,0x002a2a2a,0x00909090,0x00888888, - 0x00464646,0x00eeeeee,0x00b8b8b8,0x00141414,0x00dedede,0x005e5e5e,0x000b0b0b,0x00dbdbdb, - 0x00e0e0e0,0x00323232,0x003a3a3a,0x000a0a0a,0x00494949,0x00060606,0x00242424,0x005c5c5c, - 0x00c2c2c2,0x00d3d3d3,0x00acacac,0x00626262,0x00919191,0x00959595,0x00e4e4e4,0x00797979, - 0x00e7e7e7,0x00c8c8c8,0x00373737,0x006d6d6d,0x008d8d8d,0x00d5d5d5,0x004e4e4e,0x00a9a9a9, - 0x006c6c6c,0x00565656,0x00f4f4f4,0x00eaeaea,0x00656565,0x007a7a7a,0x00aeaeae,0x00080808, - 
0x00bababa,0x00787878,0x00252525,0x002e2e2e,0x001c1c1c,0x00a6a6a6,0x00b4b4b4,0x00c6c6c6, - 0x00e8e8e8,0x00dddddd,0x00747474,0x001f1f1f,0x004b4b4b,0x00bdbdbd,0x008b8b8b,0x008a8a8a, - 0x00707070,0x003e3e3e,0x00b5b5b5,0x00666666,0x00484848,0x00030303,0x00f6f6f6,0x000e0e0e, - 0x00616161,0x00353535,0x00575757,0x00b9b9b9,0x00868686,0x00c1c1c1,0x001d1d1d,0x009e9e9e, - 0x00e1e1e1,0x00f8f8f8,0x00989898,0x00111111,0x00696969,0x00d9d9d9,0x008e8e8e,0x00949494, - 0x009b9b9b,0x001e1e1e,0x00878787,0x00e9e9e9,0x00cecece,0x00555555,0x00282828,0x00dfdfdf, - 0x008c8c8c,0x00a1a1a1,0x00898989,0x000d0d0d,0x00bfbfbf,0x00e6e6e6,0x00424242,0x00686868, - 0x00414141,0x00999999,0x002d2d2d,0x000f0f0f,0x00b0b0b0,0x00545454,0x00bbbbbb,0x00161616 -}; - -CRYPTOPP_ALIGN_DATA(16) -const CryptoPP::word32 S2[256]={ - 0xe200e2e2,0x4e004e4e,0x54005454,0xfc00fcfc,0x94009494,0xc200c2c2,0x4a004a4a,0xcc00cccc, - 0x62006262,0x0d000d0d,0x6a006a6a,0x46004646,0x3c003c3c,0x4d004d4d,0x8b008b8b,0xd100d1d1, - 0x5e005e5e,0xfa00fafa,0x64006464,0xcb00cbcb,0xb400b4b4,0x97009797,0xbe00bebe,0x2b002b2b, - 0xbc00bcbc,0x77007777,0x2e002e2e,0x03000303,0xd300d3d3,0x19001919,0x59005959,0xc100c1c1, - 0x1d001d1d,0x06000606,0x41004141,0x6b006b6b,0x55005555,0xf000f0f0,0x99009999,0x69006969, - 0xea00eaea,0x9c009c9c,0x18001818,0xae00aeae,0x63006363,0xdf00dfdf,0xe700e7e7,0xbb00bbbb, - 0x00000000,0x73007373,0x66006666,0xfb00fbfb,0x96009696,0x4c004c4c,0x85008585,0xe400e4e4, - 0x3a003a3a,0x09000909,0x45004545,0xaa00aaaa,0x0f000f0f,0xee00eeee,0x10001010,0xeb00ebeb, - 0x2d002d2d,0x7f007f7f,0xf400f4f4,0x29002929,0xac00acac,0xcf00cfcf,0xad00adad,0x91009191, - 0x8d008d8d,0x78007878,0xc800c8c8,0x95009595,0xf900f9f9,0x2f002f2f,0xce00cece,0xcd00cdcd, - 0x08000808,0x7a007a7a,0x88008888,0x38003838,0x5c005c5c,0x83008383,0x2a002a2a,0x28002828, - 0x47004747,0xdb00dbdb,0xb800b8b8,0xc700c7c7,0x93009393,0xa400a4a4,0x12001212,0x53005353, - 0xff00ffff,0x87008787,0x0e000e0e,0x31003131,0x36003636,0x21002121,0x58005858,0x48004848, - 0x01000101,0x8e008e8e,0x37003737,0x74007474,0x32003232,0xca00caca,0xe900e9e9,0xb100b1b1, - 0xb700b7b7,0xab00abab,0x0c000c0c,0xd700d7d7,0xc400c4c4,0x56005656,0x42004242,0x26002626, - 0x07000707,0x98009898,0x60006060,0xd900d9d9,0xb600b6b6,0xb900b9b9,0x11001111,0x40004040, - 0xec00ecec,0x20002020,0x8c008c8c,0xbd00bdbd,0xa000a0a0,0xc900c9c9,0x84008484,0x04000404, - 0x49004949,0x23002323,0xf100f1f1,0x4f004f4f,0x50005050,0x1f001f1f,0x13001313,0xdc00dcdc, - 0xd800d8d8,0xc000c0c0,0x9e009e9e,0x57005757,0xe300e3e3,0xc300c3c3,0x7b007b7b,0x65006565, - 0x3b003b3b,0x02000202,0x8f008f8f,0x3e003e3e,0xe800e8e8,0x25002525,0x92009292,0xe500e5e5, - 0x15001515,0xdd00dddd,0xfd00fdfd,0x17001717,0xa900a9a9,0xbf00bfbf,0xd400d4d4,0x9a009a9a, - 0x7e007e7e,0xc500c5c5,0x39003939,0x67006767,0xfe00fefe,0x76007676,0x9d009d9d,0x43004343, - 0xa700a7a7,0xe100e1e1,0xd000d0d0,0xf500f5f5,0x68006868,0xf200f2f2,0x1b001b1b,0x34003434, - 0x70007070,0x05000505,0xa300a3a3,0x8a008a8a,0xd500d5d5,0x79007979,0x86008686,0xa800a8a8, - 0x30003030,0xc600c6c6,0x51005151,0x4b004b4b,0x1e001e1e,0xa600a6a6,0x27002727,0xf600f6f6, - 0x35003535,0xd200d2d2,0x6e006e6e,0x24002424,0x16001616,0x82008282,0x5f005f5f,0xda00dada, - 0xe600e6e6,0x75007575,0xa200a2a2,0xef00efef,0x2c002c2c,0xb200b2b2,0x1c001c1c,0x9f009f9f, - 0x5d005d5d,0x6f006f6f,0x80008080,0x0a000a0a,0x72007272,0x44004444,0x9b009b9b,0x6c006c6c, - 0x90009090,0x0b000b0b,0x5b005b5b,0x33003333,0x7d007d7d,0x5a005a5a,0x52005252,0xf300f3f3, - 0x61006161,0xa100a1a1,0xf700f7f7,0xb000b0b0,0xd600d6d6,0x3f003f3f,0x7c007c7c,0x6d006d6d, - 
0xed00eded,0x14001414,0xe000e0e0,0xa500a5a5,0x3d003d3d,0x22002222,0xb300b3b3,0xf800f8f8, - 0x89008989,0xde00dede,0x71007171,0x1a001a1a,0xaf00afaf,0xba00baba,0xb500b5b5,0x81008181 -}; - -CRYPTOPP_ALIGN_DATA(16) -const CryptoPP::word32 X1[256]={ - 0x52520052,0x09090009,0x6a6a006a,0xd5d500d5,0x30300030,0x36360036,0xa5a500a5,0x38380038, - 0xbfbf00bf,0x40400040,0xa3a300a3,0x9e9e009e,0x81810081,0xf3f300f3,0xd7d700d7,0xfbfb00fb, - 0x7c7c007c,0xe3e300e3,0x39390039,0x82820082,0x9b9b009b,0x2f2f002f,0xffff00ff,0x87870087, - 0x34340034,0x8e8e008e,0x43430043,0x44440044,0xc4c400c4,0xdede00de,0xe9e900e9,0xcbcb00cb, - 0x54540054,0x7b7b007b,0x94940094,0x32320032,0xa6a600a6,0xc2c200c2,0x23230023,0x3d3d003d, - 0xeeee00ee,0x4c4c004c,0x95950095,0x0b0b000b,0x42420042,0xfafa00fa,0xc3c300c3,0x4e4e004e, - 0x08080008,0x2e2e002e,0xa1a100a1,0x66660066,0x28280028,0xd9d900d9,0x24240024,0xb2b200b2, - 0x76760076,0x5b5b005b,0xa2a200a2,0x49490049,0x6d6d006d,0x8b8b008b,0xd1d100d1,0x25250025, - 0x72720072,0xf8f800f8,0xf6f600f6,0x64640064,0x86860086,0x68680068,0x98980098,0x16160016, - 0xd4d400d4,0xa4a400a4,0x5c5c005c,0xcccc00cc,0x5d5d005d,0x65650065,0xb6b600b6,0x92920092, - 0x6c6c006c,0x70700070,0x48480048,0x50500050,0xfdfd00fd,0xeded00ed,0xb9b900b9,0xdada00da, - 0x5e5e005e,0x15150015,0x46460046,0x57570057,0xa7a700a7,0x8d8d008d,0x9d9d009d,0x84840084, - 0x90900090,0xd8d800d8,0xabab00ab,0x00000000,0x8c8c008c,0xbcbc00bc,0xd3d300d3,0x0a0a000a, - 0xf7f700f7,0xe4e400e4,0x58580058,0x05050005,0xb8b800b8,0xb3b300b3,0x45450045,0x06060006, - 0xd0d000d0,0x2c2c002c,0x1e1e001e,0x8f8f008f,0xcaca00ca,0x3f3f003f,0x0f0f000f,0x02020002, - 0xc1c100c1,0xafaf00af,0xbdbd00bd,0x03030003,0x01010001,0x13130013,0x8a8a008a,0x6b6b006b, - 0x3a3a003a,0x91910091,0x11110011,0x41410041,0x4f4f004f,0x67670067,0xdcdc00dc,0xeaea00ea, - 0x97970097,0xf2f200f2,0xcfcf00cf,0xcece00ce,0xf0f000f0,0xb4b400b4,0xe6e600e6,0x73730073, - 0x96960096,0xacac00ac,0x74740074,0x22220022,0xe7e700e7,0xadad00ad,0x35350035,0x85850085, - 0xe2e200e2,0xf9f900f9,0x37370037,0xe8e800e8,0x1c1c001c,0x75750075,0xdfdf00df,0x6e6e006e, - 0x47470047,0xf1f100f1,0x1a1a001a,0x71710071,0x1d1d001d,0x29290029,0xc5c500c5,0x89890089, - 0x6f6f006f,0xb7b700b7,0x62620062,0x0e0e000e,0xaaaa00aa,0x18180018,0xbebe00be,0x1b1b001b, - 0xfcfc00fc,0x56560056,0x3e3e003e,0x4b4b004b,0xc6c600c6,0xd2d200d2,0x79790079,0x20200020, - 0x9a9a009a,0xdbdb00db,0xc0c000c0,0xfefe00fe,0x78780078,0xcdcd00cd,0x5a5a005a,0xf4f400f4, - 0x1f1f001f,0xdddd00dd,0xa8a800a8,0x33330033,0x88880088,0x07070007,0xc7c700c7,0x31310031, - 0xb1b100b1,0x12120012,0x10100010,0x59590059,0x27270027,0x80800080,0xecec00ec,0x5f5f005f, - 0x60600060,0x51510051,0x7f7f007f,0xa9a900a9,0x19190019,0xb5b500b5,0x4a4a004a,0x0d0d000d, - 0x2d2d002d,0xe5e500e5,0x7a7a007a,0x9f9f009f,0x93930093,0xc9c900c9,0x9c9c009c,0xefef00ef, - 0xa0a000a0,0xe0e000e0,0x3b3b003b,0x4d4d004d,0xaeae00ae,0x2a2a002a,0xf5f500f5,0xb0b000b0, - 0xc8c800c8,0xebeb00eb,0xbbbb00bb,0x3c3c003c,0x83830083,0x53530053,0x99990099,0x61610061, - 0x17170017,0x2b2b002b,0x04040004,0x7e7e007e,0xbaba00ba,0x77770077,0xd6d600d6,0x26260026, - 0xe1e100e1,0x69690069,0x14140014,0x63630063,0x55550055,0x21210021,0x0c0c000c,0x7d7d007d -}; - -CRYPTOPP_ALIGN_DATA(16) -const CryptoPP::word32 X2[256]={ - 0x30303000,0x68686800,0x99999900,0x1b1b1b00,0x87878700,0xb9b9b900,0x21212100,0x78787800, - 0x50505000,0x39393900,0xdbdbdb00,0xe1e1e100,0x72727200,0x09090900,0x62626200,0x3c3c3c00, - 0x3e3e3e00,0x7e7e7e00,0x5e5e5e00,0x8e8e8e00,0xf1f1f100,0xa0a0a000,0xcccccc00,0xa3a3a300, - 
0x2a2a2a00,0x1d1d1d00,0xfbfbfb00,0xb6b6b600,0xd6d6d600,0x20202000,0xc4c4c400,0x8d8d8d00, - 0x81818100,0x65656500,0xf5f5f500,0x89898900,0xcbcbcb00,0x9d9d9d00,0x77777700,0xc6c6c600, - 0x57575700,0x43434300,0x56565600,0x17171700,0xd4d4d400,0x40404000,0x1a1a1a00,0x4d4d4d00, - 0xc0c0c000,0x63636300,0x6c6c6c00,0xe3e3e300,0xb7b7b700,0xc8c8c800,0x64646400,0x6a6a6a00, - 0x53535300,0xaaaaaa00,0x38383800,0x98989800,0x0c0c0c00,0xf4f4f400,0x9b9b9b00,0xededed00, - 0x7f7f7f00,0x22222200,0x76767600,0xafafaf00,0xdddddd00,0x3a3a3a00,0x0b0b0b00,0x58585800, - 0x67676700,0x88888800,0x06060600,0xc3c3c300,0x35353500,0x0d0d0d00,0x01010100,0x8b8b8b00, - 0x8c8c8c00,0xc2c2c200,0xe6e6e600,0x5f5f5f00,0x02020200,0x24242400,0x75757500,0x93939300, - 0x66666600,0x1e1e1e00,0xe5e5e500,0xe2e2e200,0x54545400,0xd8d8d800,0x10101000,0xcecece00, - 0x7a7a7a00,0xe8e8e800,0x08080800,0x2c2c2c00,0x12121200,0x97979700,0x32323200,0xababab00, - 0xb4b4b400,0x27272700,0x0a0a0a00,0x23232300,0xdfdfdf00,0xefefef00,0xcacaca00,0xd9d9d900, - 0xb8b8b800,0xfafafa00,0xdcdcdc00,0x31313100,0x6b6b6b00,0xd1d1d100,0xadadad00,0x19191900, - 0x49494900,0xbdbdbd00,0x51515100,0x96969600,0xeeeeee00,0xe4e4e400,0xa8a8a800,0x41414100, - 0xdadada00,0xffffff00,0xcdcdcd00,0x55555500,0x86868600,0x36363600,0xbebebe00,0x61616100, - 0x52525200,0xf8f8f800,0xbbbbbb00,0x0e0e0e00,0x82828200,0x48484800,0x69696900,0x9a9a9a00, - 0xe0e0e000,0x47474700,0x9e9e9e00,0x5c5c5c00,0x04040400,0x4b4b4b00,0x34343400,0x15151500, - 0x79797900,0x26262600,0xa7a7a700,0xdedede00,0x29292900,0xaeaeae00,0x92929200,0xd7d7d700, - 0x84848400,0xe9e9e900,0xd2d2d200,0xbababa00,0x5d5d5d00,0xf3f3f300,0xc5c5c500,0xb0b0b000, - 0xbfbfbf00,0xa4a4a400,0x3b3b3b00,0x71717100,0x44444400,0x46464600,0x2b2b2b00,0xfcfcfc00, - 0xebebeb00,0x6f6f6f00,0xd5d5d500,0xf6f6f600,0x14141400,0xfefefe00,0x7c7c7c00,0x70707000, - 0x5a5a5a00,0x7d7d7d00,0xfdfdfd00,0x2f2f2f00,0x18181800,0x83838300,0x16161600,0xa5a5a500, - 0x91919100,0x1f1f1f00,0x05050500,0x95959500,0x74747400,0xa9a9a900,0xc1c1c100,0x5b5b5b00, - 0x4a4a4a00,0x85858500,0x6d6d6d00,0x13131300,0x07070700,0x4f4f4f00,0x4e4e4e00,0x45454500, - 0xb2b2b200,0x0f0f0f00,0xc9c9c900,0x1c1c1c00,0xa6a6a600,0xbcbcbc00,0xececec00,0x73737300, - 0x90909000,0x7b7b7b00,0xcfcfcf00,0x59595900,0x8f8f8f00,0xa1a1a100,0xf9f9f900,0x2d2d2d00, - 0xf2f2f200,0xb1b1b100,0x00000000,0x94949400,0x37373700,0x9f9f9f00,0xd0d0d000,0x2e2e2e00, - 0x9c9c9c00,0x6e6e6e00,0x28282800,0x3f3f3f00,0x80808000,0xf0f0f000,0x3d3d3d00,0xd3d3d300, - 0x25252500,0x8a8a8a00,0xb5b5b500,0xe7e7e700,0x42424200,0xb3b3b300,0xc7c7c700,0xeaeaea00, - 0xf7f7f700,0x4c4c4c00,0x11111100,0x33333300,0x03030300,0xa2a2a200,0xacacac00,0x60606000 -}; - -CRYPTOPP_ALIGN_DATA(16) -const CryptoPP::word32 KRK[3][4] = { - {0x517cc1b7, 0x27220a94, 0xfe13abe8, 0xfa9a6ee0}, - {0x6db14acc, 0x9e21c820, 0xff28b1d5, 0xef5de2b0}, - {0xdb92371d, 0x2126e970, 0x03249775, 0x04e8c90e} -}; - -ANONYMOUS_NAMESPACE_END +NAMESPACE_END +NAMESPACE_END NAMESPACE_BEGIN(CryptoPP) +using CryptoPP::ARIATab::S1; +using CryptoPP::ARIATab::S2; +using CryptoPP::ARIATab::X1; +using CryptoPP::ARIATab::X2; +using CryptoPP::ARIATab::KRK; + typedef BlockGetAndPut BigEndianBlock; typedef BlockGetAndPut NativeEndianBlock; @@ -222,6 +81,15 @@ inline byte ARIA_BRF(const word32 x, const int y) { #define ARIA_FO {SBL1_M(t[0],t[1],t[2],t[3]) ARIA_MM(t[0],t[1],t[2],t[3]) ARIA_P(t[0],t[1],t[2],t[3]) ARIA_MM(t[0],t[1],t[2],t[3])} #define ARIA_FE {SBL2_M(t[0],t[1],t[2],t[3]) ARIA_MM(t[0],t[1],t[2],t[3]) ARIA_P(t[2],t[3],t[0],t[1]) ARIA_MM(t[0],t[1],t[2],t[3])} +#if (CRYPTOPP_ARM_NEON_AVAILABLE) 
+extern void ARIA_UncheckedSetKey_Schedule_NEON(byte* rk, word32* ws, unsigned int keylen); +extern void ARIA_ProcessAndXorBlock_Xor_NEON(const byte* xorBlock, byte* outblock); +#endif + +#if (CRYPTOPP_SSSE3_AVAILABLE) +extern void ARIA_ProcessAndXorBlock_Xor_SSSE3(const byte* xorBlock, byte* outBlock, const byte *rk, word32 *t); +#endif + // n-bit right shift of Y XORed to X template inline void ARIA_GSRK(const word32 X[4], const word32 Y[4], byte RK[16]) @@ -235,21 +103,6 @@ inline void ARIA_GSRK(const word32 X[4], const word32 Y[4], byte RK[16]) reinterpret_cast(RK)[3] = (X[3]) ^ ((Y[(Q+3)%4])>>R) ^ ((Y[(Q+2)%4])<<(32-R)); } -#if CRYPTOPP_ENABLE_ARIA_NEON_INTRINSICS -template -inline void ARIA_GSRK_NEON(const uint32x4_t X, const uint32x4_t Y, byte RK[16]) -{ - static const unsigned int Q1 = (4-(N/32)) % 4; - static const unsigned int Q2 = (3-(N/32)) % 4; - static const unsigned int R = N % 32; - - vst1q_u32(reinterpret_cast(RK), - veorq_u32(X, veorq_u32( - vshrq_n_u32(vextq_u32(Y, Y, Q1), R), - vshlq_n_u32(vextq_u32(Y, Y, Q2), 32-R)))); -} -#endif - void ARIA::Base::UncheckedSetKey(const byte *key, unsigned int keylen, const NameValuePairs ¶ms) { CRYPTOPP_UNUSED(params); @@ -280,144 +133,51 @@ void ARIA::Base::UncheckedSetKey(const byte *key, unsigned int keylen, const Nam // w0 has room for 32 bytes. w1-w3 each has room for 16 bytes. t and u are 16 byte temp areas. word32 *w0 = m_w.data(), *w1 = m_w.data()+8, *w2 = m_w.data()+12, *w3 = m_w.data()+16, *t = m_w.data()+20; -#if CRYPTOPP_ENABLE_ARIA_SSSE3_INTRINSICS - const __m128i MASK = _mm_set_epi8(12,13,14,15, 8,9,10,11, 4,5,6,7, 0,1,2,3); - if (HasSSSE3()) - { - // 'mk' may be unaligned. - const __m128i w = _mm_shuffle_epi8(_mm_loadu_si128((const __m128i*)(mk)), MASK); - _mm_store_si128((__m128i*)w0, w); - _mm_store_si128((__m128i*)t, _mm_xor_si128(w, _mm_load_si128((const __m128i*)(KRK[q])))); + BigEndianBlock::Get(mk)(w0[0])(w0[1])(w0[2])(w0[3]); + t[0]=w0[0]^KRK[q][0]; t[1]=w0[1]^KRK[q][1]; + t[2]=w0[2]^KRK[q][2]; t[3]=w0[3]^KRK[q][3]; - ARIA_FO; + ARIA_FO; - if (keylen == 32) - { - // 'mk' may be unaligned. - _mm_store_si128(reinterpret_cast<__m128i*>(w1), - _mm_shuffle_epi8(_mm_loadu_si128((const __m128i*)(mk+16)), MASK)); - } - else if (keylen == 24) - { - BigEndianBlock::Get(mk+16)(w1[0])(w1[1]); - w1[2] = w1[3] = 0; - } - else - { - w1[0]=w1[1]=w1[2]=w1[3]=0; - } - } - else -#endif // CRYPTOPP_ENABLE_ARIA_SSSE3_INTRINSICS + if (keylen == 32) { - BigEndianBlock::Get(mk)(w0[0])(w0[1])(w0[2])(w0[3]); - t[0]=w0[0]^KRK[q][0]; t[1]=w0[1]^KRK[q][1]; - t[2]=w0[2]^KRK[q][2]; t[3]=w0[3]^KRK[q][3]; - - ARIA_FO; - - if (keylen == 32) - { - BigEndianBlock::Get(mk+16)(w1[0])(w1[1])(w1[2])(w1[3]); - } - else if (keylen == 24) - { - BigEndianBlock::Get(mk+16)(w1[0])(w1[1]); - w1[2] = w1[3] = 0; - } - else - { - w1[0]=w1[1]=w1[2]=w1[3]=0; - } + BigEndianBlock::Get(mk+16)(w1[0])(w1[1])(w1[2])(w1[3]); } - -#if CRYPTOPP_ENABLE_ARIA_SSE2_INTRINSICS - if (HasSSE2()) + else if (keylen == 24) { - const __m128i x = _mm_xor_si128( - _mm_load_si128((const __m128i*)(w1)), - _mm_load_si128((const __m128i*)(t))); - _mm_store_si128((__m128i*)(w1), x); - - q = (q==2) ? 0 : (q+1); - _mm_store_si128((__m128i*)(t), _mm_xor_si128(x, - _mm_load_si128((const __m128i*)(KRK[q])))); - - ARIA_FE; - - const __m128i y = _mm_xor_si128( - _mm_load_si128((const __m128i*)(w0)), - _mm_load_si128((const __m128i*)(t))); - _mm_store_si128((__m128i*)(w2), y); - - q = (q==2) ? 
0 : (q+1); - _mm_store_si128((__m128i*)(t), _mm_xor_si128(y, - _mm_load_si128((const __m128i*)(KRK[q])))); - - ARIA_FO; - - _mm_store_si128((__m128i*)(w3), _mm_xor_si128( - _mm_load_si128((const __m128i*)(w1)), - _mm_load_si128((const __m128i*)(t)))); + BigEndianBlock::Get(mk+16)(w1[0])(w1[1]); + w1[2] = w1[3] = 0; } else -#endif // CRYPTOPP_ENABLE_ARIA_SSE2_INTRINSICS { - w1[0]^=t[0]; w1[1]^=t[1]; w1[2]^=t[2]; w1[3]^=t[3]; - ::memcpy(t, w1, 16); + w1[0]=w1[1]=w1[2]=w1[3]=0; + } - q = (q==2) ? 0 : (q+1); - t[0]^=KRK[q][0]; t[1]^=KRK[q][1]; t[2]^=KRK[q][2]; t[3]^=KRK[q][3]; + w1[0]^=t[0]; w1[1]^=t[1]; w1[2]^=t[2]; w1[3]^=t[3]; + ::memcpy(t, w1, 16); - ARIA_FE; + q = (q==2) ? 0 : (q+1); + t[0]^=KRK[q][0]; t[1]^=KRK[q][1]; t[2]^=KRK[q][2]; t[3]^=KRK[q][3]; - t[0]^=w0[0]; t[1]^=w0[1]; t[2]^=w0[2]; t[3]^=w0[3]; - ::memcpy(w2, t, 16); + ARIA_FE; - q = (q==2) ? 0 : (q+1); - t[0]^=KRK[q][0]; t[1]^=KRK[q][1]; t[2]^=KRK[q][2]; t[3]^=KRK[q][3]; + t[0]^=w0[0]; t[1]^=w0[1]; t[2]^=w0[2]; t[3]^=w0[3]; + ::memcpy(w2, t, 16); - ARIA_FO; + q = (q==2) ? 0 : (q+1); + t[0]^=KRK[q][0]; t[1]^=KRK[q][1]; t[2]^=KRK[q][2]; t[3]^=KRK[q][3]; - w3[0]=t[0]^w1[0]; w3[1]=t[1]^w1[1]; w3[2]=t[2]^w1[2]; w3[3]=t[3]^w1[3]; - } + ARIA_FO; + + w3[0]=t[0]^w1[0]; w3[1]=t[1]^w1[1]; w3[2]=t[2]^w1[2]; w3[3]=t[3]^w1[3]; -#if CRYPTOPP_ENABLE_ARIA_NEON_INTRINSICS +#if CRYPTOPP_ARM_NEON_AVAILABLE if (HasNEON()) { - const uint32x4_t w0 = vld1q_u32((const uint32_t*)(m_w.data()+0)); - const uint32x4_t w1 = vld1q_u32((const uint32_t*)(m_w.data()+8)); - const uint32x4_t w2 = vld1q_u32((const uint32_t*)(m_w.data()+12)); - const uint32x4_t w3 = vld1q_u32((const uint32_t*)(m_w.data()+16)); - - ARIA_GSRK_NEON<19>(w0, w1, rk + 0); - ARIA_GSRK_NEON<19>(w1, w2, rk + 16); - ARIA_GSRK_NEON<19>(w2, w3, rk + 32); - ARIA_GSRK_NEON<19>(w3, w0, rk + 48); - ARIA_GSRK_NEON<31>(w0, w1, rk + 64); - ARIA_GSRK_NEON<31>(w1, w2, rk + 80); - ARIA_GSRK_NEON<31>(w2, w3, rk + 96); - ARIA_GSRK_NEON<31>(w3, w0, rk + 112); - ARIA_GSRK_NEON<67>(w0, w1, rk + 128); - ARIA_GSRK_NEON<67>(w1, w2, rk + 144); - ARIA_GSRK_NEON<67>(w2, w3, rk + 160); - ARIA_GSRK_NEON<67>(w3, w0, rk + 176); - ARIA_GSRK_NEON<97>(w0, w1, rk + 192); - - if (keylen > 16) - { - ARIA_GSRK_NEON<97>(w1, w2, rk + 208); - ARIA_GSRK_NEON<97>(w2, w3, rk + 224); - - if (keylen > 24) - { - ARIA_GSRK_NEON< 97>(w3, w0, rk + 240); - ARIA_GSRK_NEON<109>(w0, w1, rk + 256); - } - } + ARIA_UncheckedSetKey_Schedule_NEON(rk, m_w, keylen); } else -#endif // CRYPTOPP_ENABLE_ARIA_NEON_INTRINSICS +#endif // CRYPTOPP_ARM_NEON_AVAILABLE { ARIA_GSRK<19>(w0, w1, rk + 0); ARIA_GSRK<19>(w1, w2, rk + 16); @@ -453,53 +213,24 @@ void ARIA::Base::UncheckedSetKey(const byte *key, unsigned int keylen, const Nam rk = m_rk.data(); r = R; q = Q; -#if CRYPTOPP_ENABLE_ARIA_SSE2_INTRINSICS && !defined(__SUNPRO_CC) - if (HasSSE2()) - { - a=reinterpret_cast(rk); s=m_w.data()+24; z=a+r*4; - _mm_store_si128((__m128i*)t, _mm_load_si128((const __m128i*)a)); - _mm_store_si128((__m128i*)a, _mm_load_si128((const __m128i*)z)); - _mm_store_si128((__m128i*)z, _mm_load_si128((const __m128i*)t)); - - a+=4; z-=4; - for (; a(rk); s=m_w.data()+24; z=a+r*4; + ::memcpy(t, a, 16); ::memcpy(a, z, 16); ::memcpy(z, t, 16); + a+=4; z-=4; + for (; a(rk); s=m_w.data()+24; z=a+r*4; - ::memcpy(t, a, 16); ::memcpy(a, z, 16); ::memcpy(z, t, 16); - - a+=4; z-=4; - for (; a>8); - outBlock[ 2] = (byte)(S1[ARIA_BRF(t[0],1)] ); - outBlock[ 3] = (byte)(S2[ARIA_BRF(t[0],0)] ); - outBlock[ 4] = (byte)(X1[ARIA_BRF(t[1],3)] ); - outBlock[ 5] = (byte)(X2[ARIA_BRF(t[1],2)]>>8); - 
outBlock[ 6] = (byte)(S1[ARIA_BRF(t[1],1)] ); - outBlock[ 7] = (byte)(S2[ARIA_BRF(t[1],0)] ); - outBlock[ 8] = (byte)(X1[ARIA_BRF(t[2],3)] ); - outBlock[ 9] = (byte)(X2[ARIA_BRF(t[2],2)]>>8); - outBlock[10] = (byte)(S1[ARIA_BRF(t[2],1)] ); - outBlock[11] = (byte)(S2[ARIA_BRF(t[2],0)] ); - outBlock[12] = (byte)(X1[ARIA_BRF(t[3],3)] ); - outBlock[13] = (byte)(X2[ARIA_BRF(t[3],2)]>>8); - outBlock[14] = (byte)(S1[ARIA_BRF(t[3],1)] ); - outBlock[15] = (byte)(S2[ARIA_BRF(t[3],0)] ); - - // 'outBlock' may be unaligned. - _mm_storeu_si128(reinterpret_cast<__m128i*>(outBlock), - _mm_xor_si128(_mm_loadu_si128((const __m128i*)(outBlock)), - _mm_shuffle_epi8(_mm_load_si128((const __m128i*)(rk)), MASK))); - - // 'outBlock' and 'xorBlock' may be unaligned. - if (xorBlock != NULLPTR) - { - _mm_storeu_si128((__m128i*)(outBlock), - _mm_xor_si128( - _mm_loadu_si128((const __m128i*)(outBlock)), - _mm_loadu_si128((const __m128i*)(xorBlock)))); - } + ARIA_ProcessAndXorBlock_Xor_SSSE3(xorBlock, outBlock, rk, t); return; } else @@ -617,22 +318,17 @@ void ARIA::Base::ProcessAndXorBlock(const byte *inBlock, const byte *xorBlock, b BigEndianBlock::Put(rk, t)(t[0])(t[1])(t[2])(t[3]); #endif -#if CRYPTOPP_ENABLE_ARIA_NEON_INTRINSICS +#if CRYPTOPP_ARM_NEON_AVAILABLE if (HasNEON()) { if (xorBlock != NULLPTR) - { - vst1q_u32(reinterpret_cast(outBlock), - veorq_u32( - vld1q_u32((const uint32_t*)outBlock), - vld1q_u32((const uint32_t*)xorBlock))); - } + ARIA_ProcessAndXorBlock_Xor_NEON(xorBlock, outBlock); } else -#endif // CRYPTOPP_ENABLE_ARIA_NEON_INTRINSICS +#endif // CRYPTOPP_ARM_NEON_AVAILABLE { if (xorBlock != NULLPTR) - for (unsigned int n=0; n<16; ++n) + for (unsigned int n=0; nSetKeyWithoutResync(userKey, keylength, params); m_state = State_KeySet; size_t length; diff --git a/bench1.cpp b/bench1.cpp index bc252ed8a..4b6d4437a 100644 --- a/bench1.cpp +++ b/bench1.cpp @@ -506,11 +506,11 @@ void Benchmark2(double t, double hertz) std::cout << "\n"; { -#if CRYPTOPP_BOOL_AESNI_INTRINSICS_AVAILABLE +#if CRYPTOPP_AESNI_AVAILABLE if (HasCLMUL()) BenchMarkByName2("AES/GCM", 0, "GMAC(AES)"); else -#elif CRYPTOPP_BOOL_ARM_PMULL_AVAILABLE +#elif CRYPTOPP_ARM_PMULL_AVAILABLE if (HasPMULL()) BenchMarkByName2("AES/GCM", 0, "GMAC(AES)"); else @@ -595,11 +595,11 @@ void Benchmark2(double t, double hertz) std::cout << "\n"; { -#if CRYPTOPP_BOOL_AESNI_INTRINSICS_AVAILABLE +#if CRYPTOPP_AESNI_AVAILABLE if (HasCLMUL()) BenchMarkByName2("AES/GCM", 0, "AES/GCM"); else -#elif CRYPTOPP_BOOL_ARM_PMULL_AVAILABLE +#elif CRYPTOPP_ARM_PMULL_AVAILABLE if (HasPMULL()) BenchMarkByName2("AES/GCM", 0, "AES/GCM"); else diff --git a/blake2-simd.cpp b/blake2-simd.cpp new file mode 100644 index 000000000..1aff1ff84 --- /dev/null +++ b/blake2-simd.cpp @@ -0,0 +1,2182 @@ +// blake2-simd.cpp - written and placed in the public domain by +// Jeffrey Walton, Uri Blumenthal and Marcel Raad. +// +// This source file uses intrinsics to gain access to ARMv7a/ARMv8a +// NEON and SSE4.2 instructions. A separate source file is needed +// because additional CXXFLAGS are required to enable the appropriate +// instructions sets in some build configurations. 
+ +#include "pch.h" +#include "config.h" +#include "misc.h" +#include "blake2.h" + +#if (CRYPTOPP_SSE42_AVAILABLE) +# include "emmintrin.h" +# include "nmmintrin.h" +#endif + +#if (CRYPTOPP_ARM_NEON_AVAILABLE) +# include "arm_neon.h" +#endif + +#ifdef CRYPTOPP_GNU_STYLE_INLINE_ASSEMBLY +# include +# include +#endif + +#ifndef EXCEPTION_EXECUTE_HANDLER +# define EXCEPTION_EXECUTE_HANDLER 1 +#endif + +// Clang __m128i casts +#define M128_CAST(x) ((__m128i *)(void *)(x)) +#define CONST_M128_CAST(x) ((const __m128i *)(const void *)(x)) + +NAMESPACE_BEGIN(CryptoPP) + +// Sun Studio 12.3 and earlier lack SSE2's _mm_set_epi64x. Win32 lacks _mm_set_epi64x, Win64 supplies it except for VS2008. +// Also see http://stackoverflow.com/a/38547909/608639 +#if CRYPTOPP_SSE2_AVAILABLE && ((__SUNPRO_CC >= 0x5100 && __SUNPRO_CC < 0x5130) || (defined(_MSC_VER) && _MSC_VER < 1600) || (defined(_M_IX86) && _MSC_VER >= 1600)) +inline __m128i MM_SET_EPI64X(const word64 a, const word64 b) +{ + const word64 t[2] = {b,a}; __m128i r; + ::memcpy(&r, &t, sizeof(t)); + return r; +} +#else +# define MM_SET_EPI64X(a, b) _mm_set_epi64x(a, b) +#endif + +ANONYMOUS_NAMESPACE_BEGIN + +CRYPTOPP_ALIGN_DATA(16) +const word32 BLAKE2S_IV[8] = { + 0x6A09E667UL, 0xBB67AE85UL, 0x3C6EF372UL, 0xA54FF53AUL, + 0x510E527FUL, 0x9B05688CUL, 0x1F83D9ABUL, 0x5BE0CD19UL +}; + +CRYPTOPP_ALIGN_DATA(16) +const word64 BLAKE2B_IV[8] = { + W64LIT(0x6a09e667f3bcc908), W64LIT(0xbb67ae8584caa73b), + W64LIT(0x3c6ef372fe94f82b), W64LIT(0xa54ff53a5f1d36f1), + W64LIT(0x510e527fade682d1), W64LIT(0x9b05688c2b3e6c1f), + W64LIT(0x1f83d9abfb41bd6b), W64LIT(0x5be0cd19137e2179) +}; + +ANONYMOUS_NAMESPACE_END + +#if CRYPTOPP_SSE42_AVAILABLE +void BLAKE2_Compress32_SSE4(const byte* input, BLAKE2_State& state) +{ + __m128i row1, row2, row3, row4; + __m128i buf1, buf2, buf3, buf4; + + __m128i t0, t1, t2; + __m128i ff0, ff1; + + const __m128i r8 = _mm_set_epi8(12, 15, 14, 13, 8, 11, 10, 9, 4, 7, 6, 5, 0, 3, 2, 1); + const __m128i r16 = _mm_set_epi8(13, 12, 15, 14, 9, 8, 11, 10, 5, 4, 7, 6, 1, 0, 3, 2); + + const __m128i m0 = _mm_loadu_si128(CONST_M128_CAST(input + 00)); + const __m128i m1 = _mm_loadu_si128(CONST_M128_CAST(input + 16)); + const __m128i m2 = _mm_loadu_si128(CONST_M128_CAST(input + 32)); + const __m128i m3 = _mm_loadu_si128(CONST_M128_CAST(input + 48)); + + row1 = ff0 = _mm_loadu_si128(CONST_M128_CAST(&state.h[0])); + row2 = ff1 = _mm_loadu_si128(CONST_M128_CAST(&state.h[4])); + row3 = _mm_setr_epi32(BLAKE2S_IV[0], BLAKE2S_IV[1], BLAKE2S_IV[2], BLAKE2S_IV[3]); + row4 = _mm_xor_si128(_mm_setr_epi32(BLAKE2S_IV[4], BLAKE2S_IV[5], BLAKE2S_IV[6], BLAKE2S_IV[7]), _mm_loadu_si128(CONST_M128_CAST(&state.t[0]))); + buf1 = _mm_castps_si128((_mm_shuffle_ps(_mm_castsi128_ps((m0)), _mm_castsi128_ps((m1)), _MM_SHUFFLE(2,0,2,0)))); + + row1 = _mm_add_epi32(_mm_add_epi32(row1, buf1), row2); + row4 = _mm_xor_si128(row4, row1); + row4 = _mm_shuffle_epi8(row4,r16); + row3 = _mm_add_epi32(row3, row4); + row2 = _mm_xor_si128(row2, row3); + row2 = _mm_xor_si128(_mm_srli_epi32(row2, 12),_mm_slli_epi32(row2, 20)); + + buf2 = _mm_castps_si128((_mm_shuffle_ps(_mm_castsi128_ps((m0)), _mm_castsi128_ps((m1)), _MM_SHUFFLE(3,1,3,1)))); + + row1 = _mm_add_epi32(_mm_add_epi32(row1, buf2), row2); + row4 = _mm_xor_si128(row4, row1); + row4 = _mm_shuffle_epi8(row4,r8); + row3 = _mm_add_epi32(row3, row4); + row2 = _mm_xor_si128(row2, row3); + row2 = _mm_xor_si128(_mm_srli_epi32(row2, 7),_mm_slli_epi32(row2, 25)); + + row4 = _mm_shuffle_epi32(row4, _MM_SHUFFLE(2,1,0,3)); + row3 = 
_mm_shuffle_epi32(row3, _MM_SHUFFLE(1,0,3,2)); + row2 = _mm_shuffle_epi32(row2, _MM_SHUFFLE(0,3,2,1)); + + buf3 = _mm_castps_si128((_mm_shuffle_ps(_mm_castsi128_ps((m2)), _mm_castsi128_ps((m3)), _MM_SHUFFLE(2,0,2,0)))); + + row1 = _mm_add_epi32(_mm_add_epi32(row1, buf3), row2); + row4 = _mm_xor_si128(row4, row1); + row4 = _mm_shuffle_epi8(row4,r16); + row3 = _mm_add_epi32(row3, row4); + row2 = _mm_xor_si128(row2, row3); + row2 = _mm_xor_si128(_mm_srli_epi32(row2, 12),_mm_slli_epi32(row2, 20)); + + buf4 = _mm_castps_si128((_mm_shuffle_ps(_mm_castsi128_ps((m2)), _mm_castsi128_ps((m3)), _MM_SHUFFLE(3,1,3,1)))); + + row1 = _mm_add_epi32(_mm_add_epi32(row1, buf4), row2); + row4 = _mm_xor_si128(row4, row1); + row4 = _mm_shuffle_epi8(row4,r8); + row3 = _mm_add_epi32(row3, row4); + row2 = _mm_xor_si128(row2, row3); + row2 = _mm_xor_si128(_mm_srli_epi32(row2, 7),_mm_slli_epi32(row2, 25)); + + row4 = _mm_shuffle_epi32(row4, _MM_SHUFFLE(0,3,2,1)); + row3 = _mm_shuffle_epi32(row3, _MM_SHUFFLE(1,0,3,2)); + row2 = _mm_shuffle_epi32(row2, _MM_SHUFFLE(2,1,0,3)); + + t0 = _mm_blend_epi16(m1, m2, 0x0C); + t1 = _mm_slli_si128(m3, 4); + t2 = _mm_blend_epi16(t0, t1, 0xF0); + buf1 = _mm_shuffle_epi32(t2, _MM_SHUFFLE(2,1,0,3)); + + row1 = _mm_add_epi32(_mm_add_epi32(row1, buf1), row2); + row4 = _mm_xor_si128(row4, row1); + row4 = _mm_shuffle_epi8(row4,r16); + row3 = _mm_add_epi32(row3, row4); + row2 = _mm_xor_si128(row2, row3); + row2 = _mm_xor_si128(_mm_srli_epi32(row2, 12),_mm_slli_epi32(row2, 20)); + + t0 = _mm_shuffle_epi32(m2,_MM_SHUFFLE(0,0,2,0)); + t1 = _mm_blend_epi16(m1,m3,0xC0); + t2 = _mm_blend_epi16(t0, t1, 0xF0); + buf2 = _mm_shuffle_epi32(t2, _MM_SHUFFLE(2,3,0,1)); + + row1 = _mm_add_epi32(_mm_add_epi32(row1, buf2), row2); + row4 = _mm_xor_si128(row4, row1); + row4 = _mm_shuffle_epi8(row4,r8); + row3 = _mm_add_epi32(row3, row4); + row2 = _mm_xor_si128(row2, row3); + row2 = _mm_xor_si128(_mm_srli_epi32(row2, 7),_mm_slli_epi32(row2, 25)); + + row4 = _mm_shuffle_epi32(row4, _MM_SHUFFLE(2,1,0,3)); + row3 = _mm_shuffle_epi32(row3, _MM_SHUFFLE(1,0,3,2)); + row2 = _mm_shuffle_epi32(row2, _MM_SHUFFLE(0,3,2,1)); + + t0 = _mm_slli_si128(m1, 4); + t1 = _mm_blend_epi16(m2, t0, 0x30); + t2 = _mm_blend_epi16(m0, t1, 0xF0); + buf3 = _mm_shuffle_epi32(t2, _MM_SHUFFLE(2,3,0,1)); + + row1 = _mm_add_epi32(_mm_add_epi32(row1, buf3), row2); + row4 = _mm_xor_si128(row4, row1); + row4 = _mm_shuffle_epi8(row4,r16); + row3 = _mm_add_epi32(row3, row4); + row2 = _mm_xor_si128(row2, row3); + row2 = _mm_xor_si128(_mm_srli_epi32(row2, 12),_mm_slli_epi32(row2, 20)); + + t0 = _mm_unpackhi_epi32(m0,m1); + t1 = _mm_slli_si128(m3, 4); + t2 = _mm_blend_epi16(t0, t1, 0x0C); + buf4 = _mm_shuffle_epi32(t2, _MM_SHUFFLE(2,3,0,1)); + + row1 = _mm_add_epi32(_mm_add_epi32(row1, buf4), row2); + row4 = _mm_xor_si128(row4, row1); + row4 = _mm_shuffle_epi8(row4,r8); + row3 = _mm_add_epi32(row3, row4); + row2 = _mm_xor_si128(row2, row3); + row2 = _mm_xor_si128(_mm_srli_epi32(row2, 7),_mm_slli_epi32(row2, 25)); + + row4 = _mm_shuffle_epi32(row4, _MM_SHUFFLE(0,3,2,1)); + row3 = _mm_shuffle_epi32(row3, _MM_SHUFFLE(1,0,3,2)); + row2 = _mm_shuffle_epi32(row2, _MM_SHUFFLE(2,1,0,3)); + + t0 = _mm_unpackhi_epi32(m2,m3); + t1 = _mm_blend_epi16(m3,m1,0x0C); + t2 = _mm_blend_epi16(t0, t1, 0x0F); + buf1 = _mm_shuffle_epi32(t2, _MM_SHUFFLE(3,1,0,2)); + + row1 = _mm_add_epi32(_mm_add_epi32(row1, buf1), row2); + row4 = _mm_xor_si128(row4, row1); + row4 = _mm_shuffle_epi8(row4,r16); + row3 = _mm_add_epi32(row3, row4); + row2 = _mm_xor_si128(row2, row3); + row2 = 
_mm_xor_si128(_mm_srli_epi32(row2, 12),_mm_slli_epi32(row2, 20)); + + t0 = _mm_unpacklo_epi32(m2,m0); + t1 = _mm_blend_epi16(t0, m0, 0xF0); + t2 = _mm_slli_si128(m3, 8); + buf2 = _mm_blend_epi16(t1, t2, 0xC0); + + row1 = _mm_add_epi32(_mm_add_epi32(row1, buf2), row2); + row4 = _mm_xor_si128(row4, row1); + row4 = _mm_shuffle_epi8(row4,r8); + row3 = _mm_add_epi32(row3, row4); + row2 = _mm_xor_si128(row2, row3); + row2 = _mm_xor_si128(_mm_srli_epi32(row2, 7),_mm_slli_epi32(row2, 25)); + + row4 = _mm_shuffle_epi32(row4, _MM_SHUFFLE(2,1,0,3)); + row3 = _mm_shuffle_epi32(row3, _MM_SHUFFLE(1,0,3,2)); + row2 = _mm_shuffle_epi32(row2, _MM_SHUFFLE(0,3,2,1)); + + t0 = _mm_blend_epi16(m0, m2, 0x3C); + t1 = _mm_srli_si128(m1, 12); + t2 = _mm_blend_epi16(t0,t1,0x03); + buf3 = _mm_shuffle_epi32(t2, _MM_SHUFFLE(1,0,3,2)); + + row1 = _mm_add_epi32(_mm_add_epi32(row1, buf3), row2); + row4 = _mm_xor_si128(row4, row1); + row4 = _mm_shuffle_epi8(row4,r16); + row3 = _mm_add_epi32(row3, row4); + row2 = _mm_xor_si128(row2, row3); + row2 = _mm_xor_si128(_mm_srli_epi32(row2, 12),_mm_slli_epi32(row2, 20)); + + t0 = _mm_slli_si128(m3, 4); + t1 = _mm_blend_epi16(m0, m1, 0x33); + t2 = _mm_blend_epi16(t1, t0, 0xC0); + buf4 = _mm_shuffle_epi32(t2, _MM_SHUFFLE(0,1,2,3)); + + row1 = _mm_add_epi32(_mm_add_epi32(row1, buf4), row2); + row4 = _mm_xor_si128(row4, row1); + row4 = _mm_shuffle_epi8(row4,r8); + row3 = _mm_add_epi32(row3, row4); + row2 = _mm_xor_si128(row2, row3); + row2 = _mm_xor_si128(_mm_srli_epi32(row2, 7),_mm_slli_epi32(row2, 25)); + + row4 = _mm_shuffle_epi32(row4, _MM_SHUFFLE(0,3,2,1)); + row3 = _mm_shuffle_epi32(row3, _MM_SHUFFLE(1,0,3,2)); + row2 = _mm_shuffle_epi32(row2, _MM_SHUFFLE(2,1,0,3)); + + t0 = _mm_unpackhi_epi32(m0,m1); + t1 = _mm_unpackhi_epi32(t0, m2); + t2 = _mm_blend_epi16(t1, m3, 0x0C); + buf1 = _mm_shuffle_epi32(t2, _MM_SHUFFLE(3,1,0,2)); + + row1 = _mm_add_epi32(_mm_add_epi32(row1, buf1), row2); + row4 = _mm_xor_si128(row4, row1); + row4 = _mm_shuffle_epi8(row4,r16); + row3 = _mm_add_epi32(row3, row4); + row2 = _mm_xor_si128(row2, row3); + row2 = _mm_xor_si128(_mm_srli_epi32(row2, 12),_mm_slli_epi32(row2, 20)); + + t0 = _mm_slli_si128(m2, 8); + t1 = _mm_blend_epi16(m3,m0,0x0C); + t2 = _mm_blend_epi16(t1, t0, 0xC0); + buf2 = _mm_shuffle_epi32(t2, _MM_SHUFFLE(2,0,1,3)); + + row1 = _mm_add_epi32(_mm_add_epi32(row1, buf2), row2); + row4 = _mm_xor_si128(row4, row1); + row4 = _mm_shuffle_epi8(row4,r8); + row3 = _mm_add_epi32(row3, row4); + row2 = _mm_xor_si128(row2, row3); + row2 = _mm_xor_si128(_mm_srli_epi32(row2, 7),_mm_slli_epi32(row2, 25)); + + row4 = _mm_shuffle_epi32(row4, _MM_SHUFFLE(2,1,0,3)); + row3 = _mm_shuffle_epi32(row3, _MM_SHUFFLE(1,0,3,2)); + row2 = _mm_shuffle_epi32(row2, _MM_SHUFFLE(0,3,2,1)); + + t0 = _mm_blend_epi16(m0,m1,0x0F); + t1 = _mm_blend_epi16(t0, m3, 0xC0); + buf3 = _mm_shuffle_epi32(t1, _MM_SHUFFLE(3,0,1,2)); + + row1 = _mm_add_epi32(_mm_add_epi32(row1, buf3), row2); + row4 = _mm_xor_si128(row4, row1); + row4 = _mm_shuffle_epi8(row4,r16); + row3 = _mm_add_epi32(row3, row4); + row2 = _mm_xor_si128(row2, row3); + row2 = _mm_xor_si128(_mm_srli_epi32(row2, 12),_mm_slli_epi32(row2, 20)); + + t0 = _mm_unpacklo_epi32(m0,m2); + t1 = _mm_unpackhi_epi32(m1,m2); + buf4 = _mm_unpacklo_epi64(t1,t0); + + row1 = _mm_add_epi32(_mm_add_epi32(row1, buf4), row2); + row4 = _mm_xor_si128(row4, row1); + row4 = _mm_shuffle_epi8(row4,r8); + row3 = _mm_add_epi32(row3, row4); + row2 = _mm_xor_si128(row2, row3); + row2 = _mm_xor_si128(_mm_srli_epi32(row2, 7),_mm_slli_epi32(row2, 25)); + + 
row4 = _mm_shuffle_epi32(row4, _MM_SHUFFLE(0,3,2,1)); + row3 = _mm_shuffle_epi32(row3, _MM_SHUFFLE(1,0,3,2)); + row2 = _mm_shuffle_epi32(row2, _MM_SHUFFLE(2,1,0,3)); + + t0 = _mm_unpacklo_epi64(m1,m2); + t1 = _mm_unpackhi_epi64(m0,m2); + t2 = _mm_blend_epi16(t0,t1,0x33); + buf1 = _mm_shuffle_epi32(t2, _MM_SHUFFLE(2,0,1,3)); + + row1 = _mm_add_epi32(_mm_add_epi32(row1, buf1), row2); + row4 = _mm_xor_si128(row4, row1); + row4 = _mm_shuffle_epi8(row4,r16); + row3 = _mm_add_epi32(row3, row4); + row2 = _mm_xor_si128(row2, row3); + row2 = _mm_xor_si128(_mm_srli_epi32(row2, 12),_mm_slli_epi32(row2, 20)); + + t0 = _mm_unpackhi_epi64(m1,m3); + t1 = _mm_unpacklo_epi64(m0,m1); + buf2 = _mm_blend_epi16(t0,t1,0x33); + + row1 = _mm_add_epi32(_mm_add_epi32(row1, buf2), row2); + row4 = _mm_xor_si128(row4, row1); + row4 = _mm_shuffle_epi8(row4,r8); + row3 = _mm_add_epi32(row3, row4); + row2 = _mm_xor_si128(row2, row3); + row2 = _mm_xor_si128(_mm_srli_epi32(row2, 7),_mm_slli_epi32(row2, 25)); + + row4 = _mm_shuffle_epi32(row4, _MM_SHUFFLE(2,1,0,3)); + row3 = _mm_shuffle_epi32(row3, _MM_SHUFFLE(1,0,3,2)); + row2 = _mm_shuffle_epi32(row2, _MM_SHUFFLE(0,3,2,1)); + + t0 = _mm_unpackhi_epi64(m3,m1); + t1 = _mm_unpackhi_epi64(m2,m0); + buf3 = _mm_blend_epi16(t1,t0,0x33); + + row1 = _mm_add_epi32(_mm_add_epi32(row1, buf3), row2); + row4 = _mm_xor_si128(row4, row1); + row4 = _mm_shuffle_epi8(row4,r16); + row3 = _mm_add_epi32(row3, row4); + row2 = _mm_xor_si128(row2, row3); + row2 = _mm_xor_si128(_mm_srli_epi32(row2, 12),_mm_slli_epi32(row2, 20)); + + t0 = _mm_blend_epi16(m0,m2,0x03); + t1 = _mm_slli_si128(t0, 8); + t2 = _mm_blend_epi16(t1,m3,0x0F); + buf4 = _mm_shuffle_epi32(t2, _MM_SHUFFLE(1,2,0,3)); + + row1 = _mm_add_epi32(_mm_add_epi32(row1, buf4), row2); + row4 = _mm_xor_si128(row4, row1); + row4 = _mm_shuffle_epi8(row4,r8); + row3 = _mm_add_epi32(row3, row4); + row2 = _mm_xor_si128(row2, row3); + row2 = _mm_xor_si128(_mm_srli_epi32(row2, 7),_mm_slli_epi32(row2, 25)); + + row4 = _mm_shuffle_epi32(row4, _MM_SHUFFLE(0,3,2,1)); + row3 = _mm_shuffle_epi32(row3, _MM_SHUFFLE(1,0,3,2)); + row2 = _mm_shuffle_epi32(row2, _MM_SHUFFLE(2,1,0,3)); + + t0 = _mm_unpackhi_epi32(m0,m1); + t1 = _mm_unpacklo_epi32(m0,m2); + buf1 = _mm_unpacklo_epi64(t0,t1); + + row1 = _mm_add_epi32(_mm_add_epi32(row1, buf1), row2); + row4 = _mm_xor_si128(row4, row1); + row4 = _mm_shuffle_epi8(row4,r16); + row3 = _mm_add_epi32(row3, row4); + row2 = _mm_xor_si128(row2, row3); + row2 = _mm_xor_si128(_mm_srli_epi32(row2, 12),_mm_slli_epi32(row2, 20)); + + t0 = _mm_srli_si128(m2, 4); + t1 = _mm_blend_epi16(m0,m3,0x03); + buf2 = _mm_blend_epi16(t1,t0,0x3C); + + row1 = _mm_add_epi32(_mm_add_epi32(row1, buf2), row2); + row4 = _mm_xor_si128(row4, row1); + row4 = _mm_shuffle_epi8(row4,r8); + row3 = _mm_add_epi32(row3, row4); + row2 = _mm_xor_si128(row2, row3); + row2 = _mm_xor_si128(_mm_srli_epi32(row2, 7),_mm_slli_epi32(row2, 25)); + + row4 = _mm_shuffle_epi32(row4, _MM_SHUFFLE(2,1,0,3)); + row3 = _mm_shuffle_epi32(row3, _MM_SHUFFLE(1,0,3,2)); + row2 = _mm_shuffle_epi32(row2, _MM_SHUFFLE(0,3,2,1)); + + t0 = _mm_blend_epi16(m1,m0,0x0C); + t1 = _mm_srli_si128(m3, 4); + t2 = _mm_blend_epi16(t0,t1,0x30); + buf3 = _mm_shuffle_epi32(t2, _MM_SHUFFLE(1,2,3,0)); + + row1 = _mm_add_epi32(_mm_add_epi32(row1, buf3), row2); + row4 = _mm_xor_si128(row4, row1); + row4 = _mm_shuffle_epi8(row4,r16); + row3 = _mm_add_epi32(row3, row4); + row2 = _mm_xor_si128(row2, row3); + row2 = _mm_xor_si128(_mm_srli_epi32(row2, 12),_mm_slli_epi32(row2, 20)); + + t0 = 
_mm_unpacklo_epi64(m1,m2); + t1= _mm_shuffle_epi32(m3, _MM_SHUFFLE(0,2,0,1)); + buf4 = _mm_blend_epi16(t0,t1,0x33); + + row1 = _mm_add_epi32(_mm_add_epi32(row1, buf4), row2); + row4 = _mm_xor_si128(row4, row1); + row4 = _mm_shuffle_epi8(row4,r8); + row3 = _mm_add_epi32(row3, row4); + row2 = _mm_xor_si128(row2, row3); + row2 = _mm_xor_si128(_mm_srli_epi32(row2, 7),_mm_slli_epi32(row2, 25)); + + row4 = _mm_shuffle_epi32(row4, _MM_SHUFFLE(0,3,2,1)); + row3 = _mm_shuffle_epi32(row3, _MM_SHUFFLE(1,0,3,2)); + row2 = _mm_shuffle_epi32(row2, _MM_SHUFFLE(2,1,0,3)); + + t0 = _mm_slli_si128(m1, 12); + t1 = _mm_blend_epi16(m0,m3,0x33); + buf1 = _mm_blend_epi16(t1,t0,0xC0); + + row1 = _mm_add_epi32(_mm_add_epi32(row1, buf1), row2); + row4 = _mm_xor_si128(row4, row1); + row4 = _mm_shuffle_epi8(row4,r16); + row3 = _mm_add_epi32(row3, row4); + row2 = _mm_xor_si128(row2, row3); + row2 = _mm_xor_si128(_mm_srli_epi32(row2, 12),_mm_slli_epi32(row2, 20)); + + t0 = _mm_blend_epi16(m3,m2,0x30); + t1 = _mm_srli_si128(m1, 4); + t2 = _mm_blend_epi16(t0,t1,0x03); + buf2 = _mm_shuffle_epi32(t2, _MM_SHUFFLE(2,1,3,0)); + + row1 = _mm_add_epi32(_mm_add_epi32(row1, buf2), row2); + row4 = _mm_xor_si128(row4, row1); + row4 = _mm_shuffle_epi8(row4,r8); + row3 = _mm_add_epi32(row3, row4); + row2 = _mm_xor_si128(row2, row3); + row2 = _mm_xor_si128(_mm_srli_epi32(row2, 7),_mm_slli_epi32(row2, 25)); + + row4 = _mm_shuffle_epi32(row4, _MM_SHUFFLE(2,1,0,3)); + row3 = _mm_shuffle_epi32(row3, _MM_SHUFFLE(1,0,3,2)); + row2 = _mm_shuffle_epi32(row2, _MM_SHUFFLE(0,3,2,1)); + + t0 = _mm_unpacklo_epi64(m0,m2); + t1 = _mm_srli_si128(m1, 4); + buf3 = _mm_shuffle_epi32(_mm_blend_epi16(t0,t1,0x0C), _MM_SHUFFLE(2,3,1,0)); + + row1 = _mm_add_epi32(_mm_add_epi32(row1, buf3), row2); + row4 = _mm_xor_si128(row4, row1); + row4 = _mm_shuffle_epi8(row4,r16); + row3 = _mm_add_epi32(row3, row4); + row2 = _mm_xor_si128(row2, row3); + row2 = _mm_xor_si128(_mm_srli_epi32(row2, 12),_mm_slli_epi32(row2, 20)); + + t0 = _mm_unpackhi_epi32(m1,m2); + t1 = _mm_unpackhi_epi64(m0,t0); + buf4 = _mm_shuffle_epi32(t1, _MM_SHUFFLE(3,0,1,2)); + + row1 = _mm_add_epi32(_mm_add_epi32(row1, buf4), row2); + row4 = _mm_xor_si128(row4, row1); + row4 = _mm_shuffle_epi8(row4,r8); + row3 = _mm_add_epi32(row3, row4); + row2 = _mm_xor_si128(row2, row3); + row2 = _mm_xor_si128(_mm_srli_epi32(row2, 7),_mm_slli_epi32(row2, 25)); + + row4 = _mm_shuffle_epi32(row4, _MM_SHUFFLE(0,3,2,1)); + row3 = _mm_shuffle_epi32(row3, _MM_SHUFFLE(1,0,3,2)); + row2 = _mm_shuffle_epi32(row2, _MM_SHUFFLE(2,1,0,3)); + + t0 = _mm_unpackhi_epi32(m0,m1); + t1 = _mm_blend_epi16(t0,m3,0x0F); + buf1 = _mm_shuffle_epi32(t1,_MM_SHUFFLE(2,0,3,1)); + + row1 = _mm_add_epi32(_mm_add_epi32(row1, buf1), row2); + row4 = _mm_xor_si128(row4, row1); + row4 = _mm_shuffle_epi8(row4,r16); + row3 = _mm_add_epi32(row3, row4); + row2 = _mm_xor_si128(row2, row3); + row2 = _mm_xor_si128(_mm_srli_epi32(row2, 12),_mm_slli_epi32(row2, 20)); + + t0 = _mm_blend_epi16(m2,m3,0x30); + t1 = _mm_srli_si128(m0,4); + t2 = _mm_blend_epi16(t0,t1,0x03); + buf2 = _mm_shuffle_epi32(t2, _MM_SHUFFLE(1,0,2,3)); + + row1 = _mm_add_epi32(_mm_add_epi32(row1, buf2), row2); + row4 = _mm_xor_si128(row4, row1); + row4 = _mm_shuffle_epi8(row4,r8); + row3 = _mm_add_epi32(row3, row4); + row2 = _mm_xor_si128(row2, row3); + row2 = _mm_xor_si128(_mm_srli_epi32(row2, 7),_mm_slli_epi32(row2, 25)); + + row4 = _mm_shuffle_epi32(row4, _MM_SHUFFLE(2,1,0,3)); + row3 = _mm_shuffle_epi32(row3, _MM_SHUFFLE(1,0,3,2)); + row2 = _mm_shuffle_epi32(row2, 
_MM_SHUFFLE(0,3,2,1)); + + t0 = _mm_unpackhi_epi64(m0,m3); + t1 = _mm_unpacklo_epi64(m1,m2); + t2 = _mm_blend_epi16(t0,t1,0x3C); + buf3 = _mm_shuffle_epi32(t2,_MM_SHUFFLE(0,2,3,1)); + + row1 = _mm_add_epi32(_mm_add_epi32(row1, buf3), row2); + row4 = _mm_xor_si128(row4, row1); + row4 = _mm_shuffle_epi8(row4,r16); + row3 = _mm_add_epi32(row3, row4); + row2 = _mm_xor_si128(row2, row3); + row2 = _mm_xor_si128(_mm_srli_epi32(row2, 12),_mm_slli_epi32(row2, 20)); + + t0 = _mm_unpacklo_epi32(m0,m1); + t1 = _mm_unpackhi_epi32(m1,m2); + buf4 = _mm_unpacklo_epi64(t0,t1); + + row1 = _mm_add_epi32(_mm_add_epi32(row1, buf4), row2); + row4 = _mm_xor_si128(row4, row1); + row4 = _mm_shuffle_epi8(row4,r8); + row3 = _mm_add_epi32(row3, row4); + row2 = _mm_xor_si128(row2, row3); + row2 = _mm_xor_si128(_mm_srli_epi32(row2, 7),_mm_slli_epi32(row2, 25)); + + row4 = _mm_shuffle_epi32(row4, _MM_SHUFFLE(0,3,2,1)); + row3 = _mm_shuffle_epi32(row3, _MM_SHUFFLE(1,0,3,2)); + row2 = _mm_shuffle_epi32(row2, _MM_SHUFFLE(2,1,0,3)); + + t0 = _mm_unpackhi_epi32(m1,m3); + t1 = _mm_unpacklo_epi64(t0,m0); + t2 = _mm_blend_epi16(t1,m2,0xC0); + buf1 = _mm_shufflehi_epi16(t2,_MM_SHUFFLE(1,0,3,2)); + + row1 = _mm_add_epi32(_mm_add_epi32(row1, buf1), row2); + row4 = _mm_xor_si128(row4, row1); + row4 = _mm_shuffle_epi8(row4,r16); + row3 = _mm_add_epi32(row3, row4); + row2 = _mm_xor_si128(row2, row3); + row2 = _mm_xor_si128(_mm_srli_epi32(row2, 12),_mm_slli_epi32(row2, 20)); + + t0 = _mm_unpackhi_epi32(m0,m3); + t1 = _mm_blend_epi16(m2,t0,0xF0); + buf2 = _mm_shuffle_epi32(t1,_MM_SHUFFLE(0,2,1,3)); + + row1 = _mm_add_epi32(_mm_add_epi32(row1, buf2), row2); + row4 = _mm_xor_si128(row4, row1); + row4 = _mm_shuffle_epi8(row4,r8); + row3 = _mm_add_epi32(row3, row4); + row2 = _mm_xor_si128(row2, row3); + row2 = _mm_xor_si128(_mm_srli_epi32(row2, 7),_mm_slli_epi32(row2, 25)); + + row4 = _mm_shuffle_epi32(row4, _MM_SHUFFLE(2,1,0,3)); + row3 = _mm_shuffle_epi32(row3, _MM_SHUFFLE(1,0,3,2)); + row2 = _mm_shuffle_epi32(row2, _MM_SHUFFLE(0,3,2,1)); + + t0 = _mm_blend_epi16(m2,m0,0x0C); + t1 = _mm_slli_si128(t0,4); + buf3 = _mm_blend_epi16(t1,m3,0x0F); + + row1 = _mm_add_epi32(_mm_add_epi32(row1, buf3), row2); + row4 = _mm_xor_si128(row4, row1); + row4 = _mm_shuffle_epi8(row4,r16); + row3 = _mm_add_epi32(row3, row4); + row2 = _mm_xor_si128(row2, row3); + row2 = _mm_xor_si128(_mm_srli_epi32(row2, 12),_mm_slli_epi32(row2, 20)); + + t0 = _mm_blend_epi16(m1,m0,0x30); + buf4 = _mm_shuffle_epi32(t0,_MM_SHUFFLE(1,0,3,2)); + + row1 = _mm_add_epi32(_mm_add_epi32(row1, buf4), row2); + row4 = _mm_xor_si128(row4, row1); + row4 = _mm_shuffle_epi8(row4,r8); + row3 = _mm_add_epi32(row3, row4); + row2 = _mm_xor_si128(row2, row3); + row2 = _mm_xor_si128(_mm_srli_epi32(row2, 7),_mm_slli_epi32(row2, 25)); + + row4 = _mm_shuffle_epi32(row4, _MM_SHUFFLE(0,3,2,1)); + row3 = _mm_shuffle_epi32(row3, _MM_SHUFFLE(1,0,3,2)); + row2 = _mm_shuffle_epi32(row2, _MM_SHUFFLE(2,1,0,3)); + + t0 = _mm_blend_epi16(m0,m2,0x03); + t1 = _mm_blend_epi16(m1,m2,0x30); + t2 = _mm_blend_epi16(t1,t0,0x0F); + buf1 = _mm_shuffle_epi32(t2,_MM_SHUFFLE(1,3,0,2)); + + row1 = _mm_add_epi32(_mm_add_epi32(row1, buf1), row2); + row4 = _mm_xor_si128(row4, row1); + row4 = _mm_shuffle_epi8(row4,r16); + row3 = _mm_add_epi32(row3, row4); + row2 = _mm_xor_si128(row2, row3); + row2 = _mm_xor_si128(_mm_srli_epi32(row2, 12),_mm_slli_epi32(row2, 20)); + + t0 = _mm_slli_si128(m0,4); + t1 = _mm_blend_epi16(m1,t0,0xC0); + buf2 = _mm_shuffle_epi32(t1,_MM_SHUFFLE(1,2,0,3)); + + row1 = _mm_add_epi32(_mm_add_epi32(row1, 
buf2), row2); + row4 = _mm_xor_si128(row4, row1); + row4 = _mm_shuffle_epi8(row4,r8); + row3 = _mm_add_epi32(row3, row4); + row2 = _mm_xor_si128(row2, row3); + row2 = _mm_xor_si128(_mm_srli_epi32(row2, 7),_mm_slli_epi32(row2, 25)); + + row4 = _mm_shuffle_epi32(row4, _MM_SHUFFLE(2,1,0,3)); + row3 = _mm_shuffle_epi32(row3, _MM_SHUFFLE(1,0,3,2)); + row2 = _mm_shuffle_epi32(row2, _MM_SHUFFLE(0,3,2,1)); + + t0 = _mm_unpackhi_epi32(m0,m3); + t1 = _mm_unpacklo_epi32(m2,m3); + t2 = _mm_unpackhi_epi64(t0,t1); + buf3 = _mm_shuffle_epi32(t2,_MM_SHUFFLE(3,0,2,1)); + + row1 = _mm_add_epi32(_mm_add_epi32(row1, buf3), row2); + row4 = _mm_xor_si128(row4, row1); + row4 = _mm_shuffle_epi8(row4,r16); + row3 = _mm_add_epi32(row3, row4); + row2 = _mm_xor_si128(row2, row3); + row2 = _mm_xor_si128(_mm_srli_epi32(row2, 12),_mm_slli_epi32(row2, 20)); + + t0 = _mm_blend_epi16(m3,m2,0xC0); + t1 = _mm_unpacklo_epi32(m0,m3); + t2 = _mm_blend_epi16(t0,t1,0x0F); + buf4 = _mm_shuffle_epi32(t2,_MM_SHUFFLE(0,1,2,3)); + + row1 = _mm_add_epi32(_mm_add_epi32(row1, buf4), row2); + row4 = _mm_xor_si128(row4, row1); + row4 = _mm_shuffle_epi8(row4,r8); + row3 = _mm_add_epi32(row3, row4); + row2 = _mm_xor_si128(row2, row3); + row2 = _mm_xor_si128(_mm_srli_epi32(row2, 7),_mm_slli_epi32(row2, 25)); + + row4 = _mm_shuffle_epi32(row4, _MM_SHUFFLE(0,3,2,1)); + row3 = _mm_shuffle_epi32(row3, _MM_SHUFFLE(1,0,3,2)); + row2 = _mm_shuffle_epi32(row2, _MM_SHUFFLE(2,1,0,3)); + + _mm_storeu_si128(M128_CAST(&state.h[0]), _mm_xor_si128(ff0, _mm_xor_si128(row1, row3))); + _mm_storeu_si128(M128_CAST(&state.h[4]), _mm_xor_si128(ff1, _mm_xor_si128(row2, row4))); +} + +void BLAKE2_Compress64_SSE4(const byte* input, BLAKE2_State& state) +{ + __m128i row1l, row1h; + __m128i row2l, row2h; + __m128i row3l, row3h; + __m128i row4l, row4h; + __m128i b0, b1, t0, t1; + + const __m128i r16 = _mm_setr_epi8(2, 3, 4, 5, 6, 7, 0, 1, 10, 11, 12, 13, 14, 15, 8, 9); + const __m128i r24 = _mm_setr_epi8(3, 4, 5, 6, 7, 0, 1, 2, 11, 12, 13, 14, 15, 8, 9, 10); + + const __m128i m0 = _mm_loadu_si128(CONST_M128_CAST(input + 00)); + const __m128i m1 = _mm_loadu_si128(CONST_M128_CAST(input + 16)); + const __m128i m2 = _mm_loadu_si128(CONST_M128_CAST(input + 32)); + const __m128i m3 = _mm_loadu_si128(CONST_M128_CAST(input + 48)); + const __m128i m4 = _mm_loadu_si128(CONST_M128_CAST(input + 64)); + const __m128i m5 = _mm_loadu_si128(CONST_M128_CAST(input + 80)); + const __m128i m6 = _mm_loadu_si128(CONST_M128_CAST(input + 96)); + const __m128i m7 = _mm_loadu_si128(CONST_M128_CAST(input + 112)); + + row1l = _mm_loadu_si128(CONST_M128_CAST(&state.h[0])); + row1h = _mm_loadu_si128(CONST_M128_CAST(&state.h[2])); + row2l = _mm_loadu_si128(CONST_M128_CAST(&state.h[4])); + row2h = _mm_loadu_si128(CONST_M128_CAST(&state.h[6])); + row3l = _mm_loadu_si128(CONST_M128_CAST(&BLAKE2B_IV[0])); + row3h = _mm_loadu_si128(CONST_M128_CAST(&BLAKE2B_IV[2])); + row4l = _mm_xor_si128(_mm_loadu_si128(CONST_M128_CAST(&BLAKE2B_IV[4])), _mm_loadu_si128(CONST_M128_CAST(&state.t[0]))); + row4h = _mm_xor_si128(_mm_loadu_si128(CONST_M128_CAST(&BLAKE2B_IV[6])), _mm_loadu_si128(CONST_M128_CAST(&state.f[0]))); + + b0 = _mm_unpacklo_epi64(m0, m1); + b1 = _mm_unpacklo_epi64(m2, m3); + row1l = _mm_add_epi64(_mm_add_epi64(row1l, b0), row2l); + row1h = _mm_add_epi64(_mm_add_epi64(row1h, b1), row2h); + row4l = _mm_xor_si128(row4l, row1l); + row4h = _mm_xor_si128(row4h, row1h); + row4l = _mm_shuffle_epi32(row4l, _MM_SHUFFLE(2,3,0,1)); + row4h = _mm_shuffle_epi32(row4h, _MM_SHUFFLE(2,3,0,1)); + row3l = 
_mm_add_epi64(row3l, row4l); + row3h = _mm_add_epi64(row3h, row4h); + row2l = _mm_xor_si128(row2l, row3l); + row2h = _mm_xor_si128(row2h, row3h); + row2l = _mm_shuffle_epi8(row2l, r24); + row2h = _mm_shuffle_epi8(row2h, r24); + + b0 = _mm_unpackhi_epi64(m0, m1); + b1 = _mm_unpackhi_epi64(m2, m3); + + row1l = _mm_add_epi64(_mm_add_epi64(row1l, b0), row2l); + row1h = _mm_add_epi64(_mm_add_epi64(row1h, b1), row2h); + row4l = _mm_xor_si128(row4l, row1l); + row4h = _mm_xor_si128(row4h, row1h); + row4l = _mm_shuffle_epi8(row4l, r16); + row4h = _mm_shuffle_epi8(row4h, r16); + row3l = _mm_add_epi64(row3l, row4l); + row3h = _mm_add_epi64(row3h, row4h); + row2l = _mm_xor_si128(row2l, row3l); + row2h = _mm_xor_si128(row2h, row3h); + row2l = _mm_xor_si128(_mm_srli_epi64(row2l, 63), _mm_add_epi64(row2l, row2l)); + row2h = _mm_xor_si128(_mm_srli_epi64(row2h, 63), _mm_add_epi64(row2h, row2h)); + + t0 = _mm_alignr_epi8(row2h, row2l, 8); + t1 = _mm_alignr_epi8(row2l, row2h, 8); + row2l = t0, row2h = t1, t0 = row3l, row3l = row3h, row3h = t0; + t0 = _mm_alignr_epi8(row4h, row4l, 8); + t1 = _mm_alignr_epi8(row4l, row4h, 8); + row4l = t1, row4h = t0; + + b0 = _mm_unpacklo_epi64(m4, m5); + b1 = _mm_unpacklo_epi64(m6, m7); + + row1l = _mm_add_epi64(_mm_add_epi64(row1l, b0), row2l); + row1h = _mm_add_epi64(_mm_add_epi64(row1h, b1), row2h); + row4l = _mm_xor_si128(row4l, row1l); + row4h = _mm_xor_si128(row4h, row1h); + row4l = _mm_shuffle_epi32(row4l, _MM_SHUFFLE(2,3,0,1)); + row4h = _mm_shuffle_epi32(row4h, _MM_SHUFFLE(2,3,0,1)); + row3l = _mm_add_epi64(row3l, row4l); + row3h = _mm_add_epi64(row3h, row4h); + row2l = _mm_xor_si128(row2l, row3l); + row2h = _mm_xor_si128(row2h, row3h); + row2l = _mm_shuffle_epi8(row2l, r24); + row2h = _mm_shuffle_epi8(row2h, r24); + + b0 = _mm_unpackhi_epi64(m4, m5); + b1 = _mm_unpackhi_epi64(m6, m7); + + row1l = _mm_add_epi64(_mm_add_epi64(row1l, b0), row2l); + row1h = _mm_add_epi64(_mm_add_epi64(row1h, b1), row2h); + row4l = _mm_xor_si128(row4l, row1l); + row4h = _mm_xor_si128(row4h, row1h); + row4l = _mm_shuffle_epi8(row4l, r16); + row4h = _mm_shuffle_epi8(row4h, r16); + row3l = _mm_add_epi64(row3l, row4l); + row3h = _mm_add_epi64(row3h, row4h); + row2l = _mm_xor_si128(row2l, row3l); + row2h = _mm_xor_si128(row2h, row3h); + row2l = _mm_xor_si128(_mm_srli_epi64(row2l, 63), _mm_add_epi64(row2l, row2l)); + row2h = _mm_xor_si128(_mm_srli_epi64(row2h, 63), _mm_add_epi64(row2h, row2h)); + + t0 = _mm_alignr_epi8(row2l, row2h, 8); + t1 = _mm_alignr_epi8(row2h, row2l, 8); + row2l = t0, row2h = t1, t0 = row3l, row3l = row3h, row3h = t0; + t0 = _mm_alignr_epi8(row4l, row4h, 8); + t1 = _mm_alignr_epi8(row4h, row4l, 8); + row4l = t1, row4h = t0; + + b0 = _mm_unpacklo_epi64(m7, m2); + b1 = _mm_unpackhi_epi64(m4, m6); + + row1l = _mm_add_epi64(_mm_add_epi64(row1l, b0), row2l); + row1h = _mm_add_epi64(_mm_add_epi64(row1h, b1), row2h); + row4l = _mm_xor_si128(row4l, row1l); + row4h = _mm_xor_si128(row4h, row1h); + row4l = _mm_shuffle_epi32(row4l, _MM_SHUFFLE(2,3,0,1)); + row4h = _mm_shuffle_epi32(row4h, _MM_SHUFFLE(2,3,0,1)); + row3l = _mm_add_epi64(row3l, row4l); + row3h = _mm_add_epi64(row3h, row4h); + row2l = _mm_xor_si128(row2l, row3l); + row2h = _mm_xor_si128(row2h, row3h); + row2l = _mm_shuffle_epi8(row2l, r24); + row2h = _mm_shuffle_epi8(row2h, r24); + + b0 = _mm_unpacklo_epi64(m5, m4); + b1 = _mm_alignr_epi8(m3, m7, 8); + + row1l = _mm_add_epi64(_mm_add_epi64(row1l, b0), row2l); + row1h = _mm_add_epi64(_mm_add_epi64(row1h, b1), row2h); + row4l = _mm_xor_si128(row4l, row1l); + row4h = 
_mm_xor_si128(row4h, row1h); + row4l = _mm_shuffle_epi8(row4l, r16); + row4h = _mm_shuffle_epi8(row4h, r16); + row3l = _mm_add_epi64(row3l, row4l); + row3h = _mm_add_epi64(row3h, row4h); + row2l = _mm_xor_si128(row2l, row3l); + row2h = _mm_xor_si128(row2h, row3h); + row2l = _mm_xor_si128(_mm_srli_epi64(row2l, 63), _mm_add_epi64(row2l, row2l)); + row2h = _mm_xor_si128(_mm_srli_epi64(row2h, 63), _mm_add_epi64(row2h, row2h)); + + t0 = _mm_alignr_epi8(row2h, row2l, 8); + t1 = _mm_alignr_epi8(row2l, row2h, 8); + row2l = t0, row2h = t1, t0 = row3l, row3l = row3h, row3h = t0; + t0 = _mm_alignr_epi8(row4h, row4l, 8); + t1 = _mm_alignr_epi8(row4l, row4h, 8); + row4l = t1, row4h = t0; + + b0 = _mm_shuffle_epi32(m0, _MM_SHUFFLE(1,0,3,2)); + b1 = _mm_unpackhi_epi64(m5, m2); + + row1l = _mm_add_epi64(_mm_add_epi64(row1l, b0), row2l); + row1h = _mm_add_epi64(_mm_add_epi64(row1h, b1), row2h); + row4l = _mm_xor_si128(row4l, row1l); + row4h = _mm_xor_si128(row4h, row1h); + row4l = _mm_shuffle_epi32(row4l, _MM_SHUFFLE(2,3,0,1)); + row4h = _mm_shuffle_epi32(row4h, _MM_SHUFFLE(2,3,0,1)); + row3l = _mm_add_epi64(row3l, row4l); + row3h = _mm_add_epi64(row3h, row4h); + row2l = _mm_xor_si128(row2l, row3l); + row2h = _mm_xor_si128(row2h, row3h); + row2l = _mm_shuffle_epi8(row2l, r24); + row2h = _mm_shuffle_epi8(row2h, r24); + + b0 = _mm_unpacklo_epi64(m6, m1); + b1 = _mm_unpackhi_epi64(m3, m1); + + row1l = _mm_add_epi64(_mm_add_epi64(row1l, b0), row2l); + row1h = _mm_add_epi64(_mm_add_epi64(row1h, b1), row2h); + row4l = _mm_xor_si128(row4l, row1l); + row4h = _mm_xor_si128(row4h, row1h); + row4l = _mm_shuffle_epi8(row4l, r16); + row4h = _mm_shuffle_epi8(row4h, r16); + row3l = _mm_add_epi64(row3l, row4l); + row3h = _mm_add_epi64(row3h, row4h); + row2l = _mm_xor_si128(row2l, row3l); + row2h = _mm_xor_si128(row2h, row3h); + row2l = _mm_xor_si128(_mm_srli_epi64(row2l, 63), _mm_add_epi64(row2l, row2l)); + row2h = _mm_xor_si128(_mm_srli_epi64(row2h, 63), _mm_add_epi64(row2h, row2h)); + + t0 = _mm_alignr_epi8(row2l, row2h, 8); + t1 = _mm_alignr_epi8(row2h, row2l, 8); + row2l = t0, row2h = t1, t0 = row3l, row3l = row3h, row3h = t0; + t0 = _mm_alignr_epi8(row4l, row4h, 8); + t1 = _mm_alignr_epi8(row4h, row4l, 8); + row4l = t1, row4h = t0; + + b0 = _mm_alignr_epi8(m6, m5, 8); + b1 = _mm_unpackhi_epi64(m2, m7); + + row1l = _mm_add_epi64(_mm_add_epi64(row1l, b0), row2l); + row1h = _mm_add_epi64(_mm_add_epi64(row1h, b1), row2h); + row4l = _mm_xor_si128(row4l, row1l); + row4h = _mm_xor_si128(row4h, row1h); + row4l = _mm_shuffle_epi32(row4l, _MM_SHUFFLE(2,3,0,1)); + row4h = _mm_shuffle_epi32(row4h, _MM_SHUFFLE(2,3,0,1)); + row3l = _mm_add_epi64(row3l, row4l); + row3h = _mm_add_epi64(row3h, row4h); + row2l = _mm_xor_si128(row2l, row3l); + row2h = _mm_xor_si128(row2h, row3h); + row2l = _mm_shuffle_epi8(row2l, r24); + row2h = _mm_shuffle_epi8(row2h, r24); + + b0 = _mm_unpacklo_epi64(m4, m0); + b1 = _mm_blend_epi16(m1, m6, 0xF0); + + row1l = _mm_add_epi64(_mm_add_epi64(row1l, b0), row2l); + row1h = _mm_add_epi64(_mm_add_epi64(row1h, b1), row2h); + row4l = _mm_xor_si128(row4l, row1l); + row4h = _mm_xor_si128(row4h, row1h); + row4l = _mm_shuffle_epi8(row4l, r16); + row4h = _mm_shuffle_epi8(row4h, r16); + row3l = _mm_add_epi64(row3l, row4l); + row3h = _mm_add_epi64(row3h, row4h); + row2l = _mm_xor_si128(row2l, row3l); + row2h = _mm_xor_si128(row2h, row3h); + row2l = _mm_xor_si128(_mm_srli_epi64(row2l, 63), _mm_add_epi64(row2l, row2l)); + row2h = _mm_xor_si128(_mm_srli_epi64(row2h, 63), _mm_add_epi64(row2h, row2h)); + + t0 = 
_mm_alignr_epi8(row2h, row2l, 8); + t1 = _mm_alignr_epi8(row2l, row2h, 8); + row2l = t0, row2h = t1, t0 = row3l, row3l = row3h, row3h = t0; + t0 = _mm_alignr_epi8(row4h, row4l, 8); + t1 = _mm_alignr_epi8(row4l, row4h, 8); + row4l = t1, row4h = t0; + + b0 = _mm_blend_epi16(m5, m1, 0xF0); + b1 = _mm_unpackhi_epi64(m3, m4); + + row1l = _mm_add_epi64(_mm_add_epi64(row1l, b0), row2l); + row1h = _mm_add_epi64(_mm_add_epi64(row1h, b1), row2h); + row4l = _mm_xor_si128(row4l, row1l); + row4h = _mm_xor_si128(row4h, row1h); + row4l = _mm_shuffle_epi32(row4l, _MM_SHUFFLE(2,3,0,1)); + row4h = _mm_shuffle_epi32(row4h, _MM_SHUFFLE(2,3,0,1)); + row3l = _mm_add_epi64(row3l, row4l); + row3h = _mm_add_epi64(row3h, row4h); + row2l = _mm_xor_si128(row2l, row3l); + row2h = _mm_xor_si128(row2h, row3h); + row2l = _mm_shuffle_epi8(row2l, r24); + row2h = _mm_shuffle_epi8(row2h, r24); + + b0 = _mm_unpacklo_epi64(m7, m3); + b1 = _mm_alignr_epi8(m2, m0, 8); + + row1l = _mm_add_epi64(_mm_add_epi64(row1l, b0), row2l); + row1h = _mm_add_epi64(_mm_add_epi64(row1h, b1), row2h); + row4l = _mm_xor_si128(row4l, row1l); + row4h = _mm_xor_si128(row4h, row1h); + row4l = _mm_shuffle_epi8(row4l, r16); + row4h = _mm_shuffle_epi8(row4h, r16); + row3l = _mm_add_epi64(row3l, row4l); + row3h = _mm_add_epi64(row3h, row4h); + row2l = _mm_xor_si128(row2l, row3l); + row2h = _mm_xor_si128(row2h, row3h); + row2l = _mm_xor_si128(_mm_srli_epi64(row2l, 63), _mm_add_epi64(row2l, row2l)); + row2h = _mm_xor_si128(_mm_srli_epi64(row2h, 63), _mm_add_epi64(row2h, row2h)); + + t0 = _mm_alignr_epi8(row2l, row2h, 8); + t1 = _mm_alignr_epi8(row2h, row2l, 8); + row2l = t0, row2h = t1, t0 = row3l, row3l = row3h, row3h = t0; + t0 = _mm_alignr_epi8(row4l, row4h, 8); + t1 = _mm_alignr_epi8(row4h, row4l, 8); + row4l = t1, row4h = t0; + + b0 = _mm_unpackhi_epi64(m3, m1); + b1 = _mm_unpackhi_epi64(m6, m5); + + row1l = _mm_add_epi64(_mm_add_epi64(row1l, b0), row2l); + row1h = _mm_add_epi64(_mm_add_epi64(row1h, b1), row2h); + row4l = _mm_xor_si128(row4l, row1l); + row4h = _mm_xor_si128(row4h, row1h); + row4l = _mm_shuffle_epi32(row4l, _MM_SHUFFLE(2,3,0,1)); + row4h = _mm_shuffle_epi32(row4h, _MM_SHUFFLE(2,3,0,1)); + row3l = _mm_add_epi64(row3l, row4l); + row3h = _mm_add_epi64(row3h, row4h); + row2l = _mm_xor_si128(row2l, row3l); + row2h = _mm_xor_si128(row2h, row3h); + row2l = _mm_shuffle_epi8(row2l, r24); + row2h = _mm_shuffle_epi8(row2h, r24); + + b0 = _mm_unpackhi_epi64(m4, m0); + b1 = _mm_unpacklo_epi64(m6, m7); + + row1l = _mm_add_epi64(_mm_add_epi64(row1l, b0), row2l); + row1h = _mm_add_epi64(_mm_add_epi64(row1h, b1), row2h); + row4l = _mm_xor_si128(row4l, row1l); + row4h = _mm_xor_si128(row4h, row1h); + row4l = _mm_shuffle_epi8(row4l, r16); + row4h = _mm_shuffle_epi8(row4h, r16); + row3l = _mm_add_epi64(row3l, row4l); + row3h = _mm_add_epi64(row3h, row4h); + row2l = _mm_xor_si128(row2l, row3l); + row2h = _mm_xor_si128(row2h, row3h); + row2l = _mm_xor_si128(_mm_srli_epi64(row2l, 63), _mm_add_epi64(row2l, row2l)); + row2h = _mm_xor_si128(_mm_srli_epi64(row2h, 63), _mm_add_epi64(row2h, row2h)); + + t0 = _mm_alignr_epi8(row2h, row2l, 8); + t1 = _mm_alignr_epi8(row2l, row2h, 8); + row2l = t0, row2h = t1, t0 = row3l, row3l = row3h, row3h = t0; + t0 = _mm_alignr_epi8(row4h, row4l, 8); + t1 = _mm_alignr_epi8(row4l, row4h, 8); + row4l = t1, row4h = t0; + + b0 = _mm_blend_epi16(m1, m2, 0xF0); + b1 = _mm_blend_epi16(m2, m7, 0xF0); + + row1l = _mm_add_epi64(_mm_add_epi64(row1l, b0), row2l); + row1h = _mm_add_epi64(_mm_add_epi64(row1h, b1), row2h); + row4l = 
_mm_xor_si128(row4l, row1l); + row4h = _mm_xor_si128(row4h, row1h); + row4l = _mm_shuffle_epi32(row4l, _MM_SHUFFLE(2,3,0,1)); + row4h = _mm_shuffle_epi32(row4h, _MM_SHUFFLE(2,3,0,1)); + row3l = _mm_add_epi64(row3l, row4l); + row3h = _mm_add_epi64(row3h, row4h); + row2l = _mm_xor_si128(row2l, row3l); + row2h = _mm_xor_si128(row2h, row3h); + row2l = _mm_shuffle_epi8(row2l, r24); + row2h = _mm_shuffle_epi8(row2h, r24); + + b0 = _mm_unpacklo_epi64(m3, m5); + b1 = _mm_unpacklo_epi64(m0, m4); + + row1l = _mm_add_epi64(_mm_add_epi64(row1l, b0), row2l); + row1h = _mm_add_epi64(_mm_add_epi64(row1h, b1), row2h); + row4l = _mm_xor_si128(row4l, row1l); + row4h = _mm_xor_si128(row4h, row1h); + row4l = _mm_shuffle_epi8(row4l, r16); + row4h = _mm_shuffle_epi8(row4h, r16); + row3l = _mm_add_epi64(row3l, row4l); + row3h = _mm_add_epi64(row3h, row4h); + row2l = _mm_xor_si128(row2l, row3l); + row2h = _mm_xor_si128(row2h, row3h); + row2l = _mm_xor_si128(_mm_srli_epi64(row2l, 63), _mm_add_epi64(row2l, row2l)); + row2h = _mm_xor_si128(_mm_srli_epi64(row2h, 63), _mm_add_epi64(row2h, row2h)); + + t0 = _mm_alignr_epi8(row2l, row2h, 8); + t1 = _mm_alignr_epi8(row2h, row2l, 8); + row2l = t0, row2h = t1, t0 = row3l, row3l = row3h, row3h = t0; + t0 = _mm_alignr_epi8(row4l, row4h, 8); + t1 = _mm_alignr_epi8(row4h, row4l, 8); + row4l = t1, row4h = t0; + + b0 = _mm_unpackhi_epi64(m4, m2); + b1 = _mm_unpacklo_epi64(m1, m5); + + row1l = _mm_add_epi64(_mm_add_epi64(row1l, b0), row2l); + row1h = _mm_add_epi64(_mm_add_epi64(row1h, b1), row2h); + row4l = _mm_xor_si128(row4l, row1l); + row4h = _mm_xor_si128(row4h, row1h); + row4l = _mm_shuffle_epi32(row4l, _MM_SHUFFLE(2,3,0,1)); + row4h = _mm_shuffle_epi32(row4h, _MM_SHUFFLE(2,3,0,1)); + row3l = _mm_add_epi64(row3l, row4l); + row3h = _mm_add_epi64(row3h, row4h); + row2l = _mm_xor_si128(row2l, row3l); + row2h = _mm_xor_si128(row2h, row3h); + row2l = _mm_shuffle_epi8(row2l, r24); + row2h = _mm_shuffle_epi8(row2h, r24); + + b0 = _mm_blend_epi16(m0, m3, 0xF0); + b1 = _mm_blend_epi16(m2, m7, 0xF0); + + row1l = _mm_add_epi64(_mm_add_epi64(row1l, b0), row2l); + row1h = _mm_add_epi64(_mm_add_epi64(row1h, b1), row2h); + row4l = _mm_xor_si128(row4l, row1l); + row4h = _mm_xor_si128(row4h, row1h); + row4l = _mm_shuffle_epi8(row4l, r16); + row4h = _mm_shuffle_epi8(row4h, r16); + row3l = _mm_add_epi64(row3l, row4l); + row3h = _mm_add_epi64(row3h, row4h); + row2l = _mm_xor_si128(row2l, row3l); + row2h = _mm_xor_si128(row2h, row3h); + row2l = _mm_xor_si128(_mm_srli_epi64(row2l, 63), _mm_add_epi64(row2l, row2l)); + row2h = _mm_xor_si128(_mm_srli_epi64(row2h, 63), _mm_add_epi64(row2h, row2h)); + + t0 = _mm_alignr_epi8(row2h, row2l, 8); + t1 = _mm_alignr_epi8(row2l, row2h, 8); + row2l = t0, row2h = t1, t0 = row3l, row3l = row3h, row3h = t0; + t0 = _mm_alignr_epi8(row4h, row4l, 8); + t1 = _mm_alignr_epi8(row4l, row4h, 8); + row4l = t1, row4h = t0; + + b0 = _mm_blend_epi16(m7, m5, 0xF0); + b1 = _mm_blend_epi16(m3, m1, 0xF0); + + row1l = _mm_add_epi64(_mm_add_epi64(row1l, b0), row2l); + row1h = _mm_add_epi64(_mm_add_epi64(row1h, b1), row2h); + row4l = _mm_xor_si128(row4l, row1l); + row4h = _mm_xor_si128(row4h, row1h); + row4l = _mm_shuffle_epi32(row4l, _MM_SHUFFLE(2,3,0,1)); + row4h = _mm_shuffle_epi32(row4h, _MM_SHUFFLE(2,3,0,1)); + row3l = _mm_add_epi64(row3l, row4l); + row3h = _mm_add_epi64(row3h, row4h); + row2l = _mm_xor_si128(row2l, row3l); + row2h = _mm_xor_si128(row2h, row3h); + row2l = _mm_shuffle_epi8(row2l, r24); + row2h = _mm_shuffle_epi8(row2h, r24); + + b0 = _mm_alignr_epi8(m6, m0, 8); 
+ b1 = _mm_blend_epi16(m4, m6, 0xF0); + + row1l = _mm_add_epi64(_mm_add_epi64(row1l, b0), row2l); + row1h = _mm_add_epi64(_mm_add_epi64(row1h, b1), row2h); + row4l = _mm_xor_si128(row4l, row1l); + row4h = _mm_xor_si128(row4h, row1h); + row4l = _mm_shuffle_epi8(row4l, r16); + row4h = _mm_shuffle_epi8(row4h, r16); + row3l = _mm_add_epi64(row3l, row4l); + row3h = _mm_add_epi64(row3h, row4h); + row2l = _mm_xor_si128(row2l, row3l); + row2h = _mm_xor_si128(row2h, row3h); + row2l = _mm_xor_si128(_mm_srli_epi64(row2l, 63), _mm_add_epi64(row2l, row2l)); + row2h = _mm_xor_si128(_mm_srli_epi64(row2h, 63), _mm_add_epi64(row2h, row2h)); + + t0 = _mm_alignr_epi8(row2l, row2h, 8); + t1 = _mm_alignr_epi8(row2h, row2l, 8); + row2l = t0, row2h = t1, t0 = row3l, row3l = row3h, row3h = t0; + t0 = _mm_alignr_epi8(row4l, row4h, 8); + t1 = _mm_alignr_epi8(row4h, row4l, 8); + row4l = t1, row4h = t0; + + b0 = _mm_unpacklo_epi64(m1, m3); + b1 = _mm_unpacklo_epi64(m0, m4); + + row1l = _mm_add_epi64(_mm_add_epi64(row1l, b0), row2l); + row1h = _mm_add_epi64(_mm_add_epi64(row1h, b1), row2h); + row4l = _mm_xor_si128(row4l, row1l); + row4h = _mm_xor_si128(row4h, row1h); + row4l = _mm_shuffle_epi32(row4l, _MM_SHUFFLE(2,3,0,1)); + row4h = _mm_shuffle_epi32(row4h, _MM_SHUFFLE(2,3,0,1)); + row3l = _mm_add_epi64(row3l, row4l); + row3h = _mm_add_epi64(row3h, row4h); + row2l = _mm_xor_si128(row2l, row3l); + row2h = _mm_xor_si128(row2h, row3h); + row2l = _mm_shuffle_epi8(row2l, r24); + row2h = _mm_shuffle_epi8(row2h, r24); + + b0 = _mm_unpacklo_epi64(m6, m5); + b1 = _mm_unpackhi_epi64(m5, m1); + + row1l = _mm_add_epi64(_mm_add_epi64(row1l, b0), row2l); + row1h = _mm_add_epi64(_mm_add_epi64(row1h, b1), row2h); + row4l = _mm_xor_si128(row4l, row1l); + row4h = _mm_xor_si128(row4h, row1h); + row4l = _mm_shuffle_epi8(row4l, r16); + row4h = _mm_shuffle_epi8(row4h, r16); + row3l = _mm_add_epi64(row3l, row4l); + row3h = _mm_add_epi64(row3h, row4h); + row2l = _mm_xor_si128(row2l, row3l); + row2h = _mm_xor_si128(row2h, row3h); + row2l = _mm_xor_si128(_mm_srli_epi64(row2l, 63), _mm_add_epi64(row2l, row2l)); + row2h = _mm_xor_si128(_mm_srli_epi64(row2h, 63), _mm_add_epi64(row2h, row2h)); + + t0 = _mm_alignr_epi8(row2h, row2l, 8); + t1 = _mm_alignr_epi8(row2l, row2h, 8); + row2l = t0, row2h = t1, t0 = row3l, row3l = row3h, row3h = t0; + t0 = _mm_alignr_epi8(row4h, row4l, 8); + t1 = _mm_alignr_epi8(row4l, row4h, 8); + row4l = t1, row4h = t0; + + b0 = _mm_blend_epi16(m2, m3, 0xF0); + b1 = _mm_unpackhi_epi64(m7, m0); + + row1l = _mm_add_epi64(_mm_add_epi64(row1l, b0), row2l); + row1h = _mm_add_epi64(_mm_add_epi64(row1h, b1), row2h); + row4l = _mm_xor_si128(row4l, row1l); + row4h = _mm_xor_si128(row4h, row1h); + row4l = _mm_shuffle_epi32(row4l, _MM_SHUFFLE(2,3,0,1)); + row4h = _mm_shuffle_epi32(row4h, _MM_SHUFFLE(2,3,0,1)); + row3l = _mm_add_epi64(row3l, row4l); + row3h = _mm_add_epi64(row3h, row4h); + row2l = _mm_xor_si128(row2l, row3l); + row2h = _mm_xor_si128(row2h, row3h); + row2l = _mm_shuffle_epi8(row2l, r24); + row2h = _mm_shuffle_epi8(row2h, r24); + + b0 = _mm_unpackhi_epi64(m6, m2); + b1 = _mm_blend_epi16(m7, m4, 0xF0); + + row1l = _mm_add_epi64(_mm_add_epi64(row1l, b0), row2l); + row1h = _mm_add_epi64(_mm_add_epi64(row1h, b1), row2h); + row4l = _mm_xor_si128(row4l, row1l); + row4h = _mm_xor_si128(row4h, row1h); + row4l = _mm_shuffle_epi8(row4l, r16); + row4h = _mm_shuffle_epi8(row4h, r16); + row3l = _mm_add_epi64(row3l, row4l); + row3h = _mm_add_epi64(row3h, row4h); + row2l = _mm_xor_si128(row2l, row3l); + row2h = _mm_xor_si128(row2h, 
row3h); + row2l = _mm_xor_si128(_mm_srli_epi64(row2l, 63), _mm_add_epi64(row2l, row2l)); + row2h = _mm_xor_si128(_mm_srli_epi64(row2h, 63), _mm_add_epi64(row2h, row2h)); + + t0 = _mm_alignr_epi8(row2l, row2h, 8); + t1 = _mm_alignr_epi8(row2h, row2l, 8); + row2l = t0, row2h = t1, t0 = row3l, row3l = row3h, row3h = t0; + t0 = _mm_alignr_epi8(row4l, row4h, 8); + t1 = _mm_alignr_epi8(row4h, row4l, 8); + row4l = t1, row4h = t0; + + b0 = _mm_blend_epi16(m6, m0, 0xF0); + b1 = _mm_unpacklo_epi64(m7, m2); + + row1l = _mm_add_epi64(_mm_add_epi64(row1l, b0), row2l); + row1h = _mm_add_epi64(_mm_add_epi64(row1h, b1), row2h); + row4l = _mm_xor_si128(row4l, row1l); + row4h = _mm_xor_si128(row4h, row1h); + row4l = _mm_shuffle_epi32(row4l, _MM_SHUFFLE(2,3,0,1)); + row4h = _mm_shuffle_epi32(row4h, _MM_SHUFFLE(2,3,0,1)); + row3l = _mm_add_epi64(row3l, row4l); + row3h = _mm_add_epi64(row3h, row4h); + row2l = _mm_xor_si128(row2l, row3l); + row2h = _mm_xor_si128(row2h, row3h); + row2l = _mm_shuffle_epi8(row2l, r24); + row2h = _mm_shuffle_epi8(row2h, r24); + + b0 = _mm_unpackhi_epi64(m2, m7); + b1 = _mm_alignr_epi8(m5, m6, 8); + + row1l = _mm_add_epi64(_mm_add_epi64(row1l, b0), row2l); + row1h = _mm_add_epi64(_mm_add_epi64(row1h, b1), row2h); + row4l = _mm_xor_si128(row4l, row1l); + row4h = _mm_xor_si128(row4h, row1h); + row4l = _mm_shuffle_epi8(row4l, r16); + row4h = _mm_shuffle_epi8(row4h, r16); + row3l = _mm_add_epi64(row3l, row4l); + row3h = _mm_add_epi64(row3h, row4h); + row2l = _mm_xor_si128(row2l, row3l); + row2h = _mm_xor_si128(row2h, row3h); + row2l = _mm_xor_si128(_mm_srli_epi64(row2l, 63), _mm_add_epi64(row2l, row2l)); + row2h = _mm_xor_si128(_mm_srli_epi64(row2h, 63), _mm_add_epi64(row2h, row2h)); + + t0 = _mm_alignr_epi8(row2h, row2l, 8); + t1 = _mm_alignr_epi8(row2l, row2h, 8); + row2l = t0, row2h = t1, t0 = row3l, row3l = row3h, row3h = t0; + t0 = _mm_alignr_epi8(row4h, row4l, 8); + t1 = _mm_alignr_epi8(row4l, row4h, 8); + row4l = t1, row4h = t0; + + b0 = _mm_unpacklo_epi64(m0, m3); + b1 = _mm_shuffle_epi32(m4, _MM_SHUFFLE(1,0,3,2)); + + row1l = _mm_add_epi64(_mm_add_epi64(row1l, b0), row2l); + row1h = _mm_add_epi64(_mm_add_epi64(row1h, b1), row2h); + row4l = _mm_xor_si128(row4l, row1l); + row4h = _mm_xor_si128(row4h, row1h); + row4l = _mm_shuffle_epi32(row4l, _MM_SHUFFLE(2,3,0,1)); + row4h = _mm_shuffle_epi32(row4h, _MM_SHUFFLE(2,3,0,1)); + row3l = _mm_add_epi64(row3l, row4l); + row3h = _mm_add_epi64(row3h, row4h); + row2l = _mm_xor_si128(row2l, row3l); + row2h = _mm_xor_si128(row2h, row3h); + row2l = _mm_shuffle_epi8(row2l, r24); + row2h = _mm_shuffle_epi8(row2h, r24); + + b0 = _mm_unpackhi_epi64(m3, m1); + b1 = _mm_blend_epi16(m1, m5, 0xF0); + + row1l = _mm_add_epi64(_mm_add_epi64(row1l, b0), row2l); + row1h = _mm_add_epi64(_mm_add_epi64(row1h, b1), row2h); + row4l = _mm_xor_si128(row4l, row1l); + row4h = _mm_xor_si128(row4h, row1h); + row4l = _mm_shuffle_epi8(row4l, r16); + row4h = _mm_shuffle_epi8(row4h, r16); + row3l = _mm_add_epi64(row3l, row4l); + row3h = _mm_add_epi64(row3h, row4h); + row2l = _mm_xor_si128(row2l, row3l); + row2h = _mm_xor_si128(row2h, row3h); + row2l = _mm_xor_si128(_mm_srli_epi64(row2l, 63), _mm_add_epi64(row2l, row2l)); + row2h = _mm_xor_si128(_mm_srli_epi64(row2h, 63), _mm_add_epi64(row2h, row2h)); + + t0 = _mm_alignr_epi8(row2l, row2h, 8); + t1 = _mm_alignr_epi8(row2h, row2l, 8); + row2l = t0, row2h = t1, t0 = row3l, row3l = row3h, row3h = t0; + t0 = _mm_alignr_epi8(row4l, row4h, 8); + t1 = _mm_alignr_epi8(row4h, row4l, 8); + row4l = t1, row4h = t0; + + b0 = 
_mm_unpackhi_epi64(m6, m3); + b1 = _mm_blend_epi16(m6, m1, 0xF0); + + row1l = _mm_add_epi64(_mm_add_epi64(row1l, b0), row2l); + row1h = _mm_add_epi64(_mm_add_epi64(row1h, b1), row2h); + row4l = _mm_xor_si128(row4l, row1l); + row4h = _mm_xor_si128(row4h, row1h); + row4l = _mm_shuffle_epi32(row4l, _MM_SHUFFLE(2,3,0,1)); + row4h = _mm_shuffle_epi32(row4h, _MM_SHUFFLE(2,3,0,1)); + row3l = _mm_add_epi64(row3l, row4l); + row3h = _mm_add_epi64(row3h, row4h); + row2l = _mm_xor_si128(row2l, row3l); + row2h = _mm_xor_si128(row2h, row3h); + row2l = _mm_shuffle_epi8(row2l, r24); + row2h = _mm_shuffle_epi8(row2h, r24); + + b0 = _mm_alignr_epi8(m7, m5, 8); + b1 = _mm_unpackhi_epi64(m0, m4); + + row1l = _mm_add_epi64(_mm_add_epi64(row1l, b0), row2l); + row1h = _mm_add_epi64(_mm_add_epi64(row1h, b1), row2h); + row4l = _mm_xor_si128(row4l, row1l); + row4h = _mm_xor_si128(row4h, row1h); + row4l = _mm_shuffle_epi8(row4l, r16); + row4h = _mm_shuffle_epi8(row4h, r16); + row3l = _mm_add_epi64(row3l, row4l); + row3h = _mm_add_epi64(row3h, row4h); + row2l = _mm_xor_si128(row2l, row3l); + row2h = _mm_xor_si128(row2h, row3h); + row2l = _mm_xor_si128(_mm_srli_epi64(row2l, 63), _mm_add_epi64(row2l, row2l)); + row2h = _mm_xor_si128(_mm_srli_epi64(row2h, 63), _mm_add_epi64(row2h, row2h)); + + t0 = _mm_alignr_epi8(row2h, row2l, 8); + t1 = _mm_alignr_epi8(row2l, row2h, 8); + row2l = t0, row2h = t1, t0 = row3l, row3l = row3h, row3h = t0; + t0 = _mm_alignr_epi8(row4h, row4l, 8); + t1 = _mm_alignr_epi8(row4l, row4h, 8); + row4l = t1, row4h = t0; + + b0 = _mm_unpackhi_epi64(m2, m7); + b1 = _mm_unpacklo_epi64(m4, m1); + + row1l = _mm_add_epi64(_mm_add_epi64(row1l, b0), row2l); + row1h = _mm_add_epi64(_mm_add_epi64(row1h, b1), row2h); + row4l = _mm_xor_si128(row4l, row1l); + row4h = _mm_xor_si128(row4h, row1h); + row4l = _mm_shuffle_epi32(row4l, _MM_SHUFFLE(2,3,0,1)); + row4h = _mm_shuffle_epi32(row4h, _MM_SHUFFLE(2,3,0,1)); + row3l = _mm_add_epi64(row3l, row4l); + row3h = _mm_add_epi64(row3h, row4h); + row2l = _mm_xor_si128(row2l, row3l); + row2h = _mm_xor_si128(row2h, row3h); + row2l = _mm_shuffle_epi8(row2l, r24); + row2h = _mm_shuffle_epi8(row2h, r24); + + b0 = _mm_unpacklo_epi64(m0, m2); + b1 = _mm_unpacklo_epi64(m3, m5); + + row1l = _mm_add_epi64(_mm_add_epi64(row1l, b0), row2l); + row1h = _mm_add_epi64(_mm_add_epi64(row1h, b1), row2h); + row4l = _mm_xor_si128(row4l, row1l); + row4h = _mm_xor_si128(row4h, row1h); + row4l = _mm_shuffle_epi8(row4l, r16); + row4h = _mm_shuffle_epi8(row4h, r16); + row3l = _mm_add_epi64(row3l, row4l); + row3h = _mm_add_epi64(row3h, row4h); + row2l = _mm_xor_si128(row2l, row3l); + row2h = _mm_xor_si128(row2h, row3h); + row2l = _mm_xor_si128(_mm_srli_epi64(row2l, 63), _mm_add_epi64(row2l, row2l)); + row2h = _mm_xor_si128(_mm_srli_epi64(row2h, 63), _mm_add_epi64(row2h, row2h)); + + t0 = _mm_alignr_epi8(row2l, row2h, 8); + t1 = _mm_alignr_epi8(row2h, row2l, 8); + row2l = t0, row2h = t1, t0 = row3l, row3l = row3h, row3h = t0; + t0 = _mm_alignr_epi8(row4l, row4h, 8); + t1 = _mm_alignr_epi8(row4h, row4l, 8); + row4l = t1, row4h = t0; + + b0 = _mm_unpacklo_epi64(m3, m7); + b1 = _mm_alignr_epi8(m0, m5, 8); + + row1l = _mm_add_epi64(_mm_add_epi64(row1l, b0), row2l); + row1h = _mm_add_epi64(_mm_add_epi64(row1h, b1), row2h); + row4l = _mm_xor_si128(row4l, row1l); + row4h = _mm_xor_si128(row4h, row1h); + row4l = _mm_shuffle_epi32(row4l, _MM_SHUFFLE(2,3,0,1)); + row4h = _mm_shuffle_epi32(row4h, _MM_SHUFFLE(2,3,0,1)); + row3l = _mm_add_epi64(row3l, row4l); + row3h = _mm_add_epi64(row3h, row4h); + row2l = 
_mm_xor_si128(row2l, row3l); + row2h = _mm_xor_si128(row2h, row3h); + row2l = _mm_shuffle_epi8(row2l, r24); + row2h = _mm_shuffle_epi8(row2h, r24); + + b0 = _mm_unpackhi_epi64(m7, m4); + b1 = _mm_alignr_epi8(m4, m1, 8); + + row1l = _mm_add_epi64(_mm_add_epi64(row1l, b0), row2l); + row1h = _mm_add_epi64(_mm_add_epi64(row1h, b1), row2h); + row4l = _mm_xor_si128(row4l, row1l); + row4h = _mm_xor_si128(row4h, row1h); + row4l = _mm_shuffle_epi8(row4l, r16); + row4h = _mm_shuffle_epi8(row4h, r16); + row3l = _mm_add_epi64(row3l, row4l); + row3h = _mm_add_epi64(row3h, row4h); + row2l = _mm_xor_si128(row2l, row3l); + row2h = _mm_xor_si128(row2h, row3h); + row2l = _mm_xor_si128(_mm_srli_epi64(row2l, 63), _mm_add_epi64(row2l, row2l)); + row2h = _mm_xor_si128(_mm_srli_epi64(row2h, 63), _mm_add_epi64(row2h, row2h)); + + t0 = _mm_alignr_epi8(row2h, row2l, 8); + t1 = _mm_alignr_epi8(row2l, row2h, 8); + row2l = t0, row2h = t1, t0 = row3l, row3l = row3h, row3h = t0; + t0 = _mm_alignr_epi8(row4h, row4l, 8); + t1 = _mm_alignr_epi8(row4l, row4h, 8); + row4l = t1, row4h = t0; + + b0 = m6; + b1 = _mm_alignr_epi8(m5, m0, 8); + + row1l = _mm_add_epi64(_mm_add_epi64(row1l, b0), row2l); + row1h = _mm_add_epi64(_mm_add_epi64(row1h, b1), row2h); + row4l = _mm_xor_si128(row4l, row1l); + row4h = _mm_xor_si128(row4h, row1h); + row4l = _mm_shuffle_epi32(row4l, _MM_SHUFFLE(2,3,0,1)); + row4h = _mm_shuffle_epi32(row4h, _MM_SHUFFLE(2,3,0,1)); + row3l = _mm_add_epi64(row3l, row4l); + row3h = _mm_add_epi64(row3h, row4h); + row2l = _mm_xor_si128(row2l, row3l); + row2h = _mm_xor_si128(row2h, row3h); + row2l = _mm_shuffle_epi8(row2l, r24); + row2h = _mm_shuffle_epi8(row2h, r24); + + b0 = _mm_blend_epi16(m1, m3, 0xF0); + b1 = m2; + + row1l = _mm_add_epi64(_mm_add_epi64(row1l, b0), row2l); + row1h = _mm_add_epi64(_mm_add_epi64(row1h, b1), row2h); + row4l = _mm_xor_si128(row4l, row1l); + row4h = _mm_xor_si128(row4h, row1h); + row4l = _mm_shuffle_epi8(row4l, r16); + row4h = _mm_shuffle_epi8(row4h, r16); + row3l = _mm_add_epi64(row3l, row4l); + row3h = _mm_add_epi64(row3h, row4h); + row2l = _mm_xor_si128(row2l, row3l); + row2h = _mm_xor_si128(row2h, row3h); + row2l = _mm_xor_si128(_mm_srli_epi64(row2l, 63), _mm_add_epi64(row2l, row2l)); + row2h = _mm_xor_si128(_mm_srli_epi64(row2h, 63), _mm_add_epi64(row2h, row2h)); + + t0 = _mm_alignr_epi8(row2l, row2h, 8); + t1 = _mm_alignr_epi8(row2h, row2l, 8); + row2l = t0, row2h = t1, t0 = row3l, row3l = row3h, row3h = t0; + t0 = _mm_alignr_epi8(row4l, row4h, 8); + t1 = _mm_alignr_epi8(row4h, row4l, 8); + row4l = t1, row4h = t0; + + b0 = _mm_unpacklo_epi64(m5, m4); + b1 = _mm_unpackhi_epi64(m3, m0); + + row1l = _mm_add_epi64(_mm_add_epi64(row1l, b0), row2l); + row1h = _mm_add_epi64(_mm_add_epi64(row1h, b1), row2h); + row4l = _mm_xor_si128(row4l, row1l); + row4h = _mm_xor_si128(row4h, row1h); + row4l = _mm_shuffle_epi32(row4l, _MM_SHUFFLE(2,3,0,1)); + row4h = _mm_shuffle_epi32(row4h, _MM_SHUFFLE(2,3,0,1)); + row3l = _mm_add_epi64(row3l, row4l); + row3h = _mm_add_epi64(row3h, row4h); + row2l = _mm_xor_si128(row2l, row3l); + row2h = _mm_xor_si128(row2h, row3h); + row2l = _mm_shuffle_epi8(row2l, r24); + row2h = _mm_shuffle_epi8(row2h, r24); + + b0 = _mm_unpacklo_epi64(m1, m2); + b1 = _mm_blend_epi16(m3, m2, 0xF0); + + row1l = _mm_add_epi64(_mm_add_epi64(row1l, b0), row2l); + row1h = _mm_add_epi64(_mm_add_epi64(row1h, b1), row2h); + row4l = _mm_xor_si128(row4l, row1l); + row4h = _mm_xor_si128(row4h, row1h); + row4l = _mm_shuffle_epi8(row4l, r16); + row4h = _mm_shuffle_epi8(row4h, r16); + row3l = 
_mm_add_epi64(row3l, row4l); + row3h = _mm_add_epi64(row3h, row4h); + row2l = _mm_xor_si128(row2l, row3l); + row2h = _mm_xor_si128(row2h, row3h); + row2l = _mm_xor_si128(_mm_srli_epi64(row2l, 63), _mm_add_epi64(row2l, row2l)); + row2h = _mm_xor_si128(_mm_srli_epi64(row2h, 63), _mm_add_epi64(row2h, row2h)); + + t0 = _mm_alignr_epi8(row2h, row2l, 8); + t1 = _mm_alignr_epi8(row2l, row2h, 8); + row2l = t0, row2h = t1, t0 = row3l, row3l = row3h, row3h = t0; + t0 = _mm_alignr_epi8(row4h, row4l, 8); + t1 = _mm_alignr_epi8(row4l, row4h, 8); + row4l = t1, row4h = t0; + + b0 = _mm_unpackhi_epi64(m7, m4); + b1 = _mm_unpackhi_epi64(m1, m6); + + row1l = _mm_add_epi64(_mm_add_epi64(row1l, b0), row2l); + row1h = _mm_add_epi64(_mm_add_epi64(row1h, b1), row2h); + row4l = _mm_xor_si128(row4l, row1l); + row4h = _mm_xor_si128(row4h, row1h); + row4l = _mm_shuffle_epi32(row4l, _MM_SHUFFLE(2,3,0,1)); + row4h = _mm_shuffle_epi32(row4h, _MM_SHUFFLE(2,3,0,1)); + row3l = _mm_add_epi64(row3l, row4l); + row3h = _mm_add_epi64(row3h, row4h); + row2l = _mm_xor_si128(row2l, row3l); + row2h = _mm_xor_si128(row2h, row3h); + row2l = _mm_shuffle_epi8(row2l, r24); + row2h = _mm_shuffle_epi8(row2h, r24); + + b0 = _mm_alignr_epi8(m7, m5, 8); + b1 = _mm_unpacklo_epi64(m6, m0); + + row1l = _mm_add_epi64(_mm_add_epi64(row1l, b0), row2l); + row1h = _mm_add_epi64(_mm_add_epi64(row1h, b1), row2h); + row4l = _mm_xor_si128(row4l, row1l); + row4h = _mm_xor_si128(row4h, row1h); + row4l = _mm_shuffle_epi8(row4l, r16); + row4h = _mm_shuffle_epi8(row4h, r16); + row3l = _mm_add_epi64(row3l, row4l); + row3h = _mm_add_epi64(row3h, row4h); + row2l = _mm_xor_si128(row2l, row3l); + row2h = _mm_xor_si128(row2h, row3h); + row2l = _mm_xor_si128(_mm_srli_epi64(row2l, 63), _mm_add_epi64(row2l, row2l)); + row2h = _mm_xor_si128(_mm_srli_epi64(row2h, 63), _mm_add_epi64(row2h, row2h)); + + t0 = _mm_alignr_epi8(row2l, row2h, 8); + t1 = _mm_alignr_epi8(row2h, row2l, 8); + row2l = t0, row2h = t1, t0 = row3l, row3l = row3h, row3h = t0; + t0 = _mm_alignr_epi8(row4l, row4h, 8); + t1 = _mm_alignr_epi8(row4h, row4l, 8); + row4l = t1, row4h = t0; + + b0 = _mm_unpacklo_epi64(m0, m1); + b1 = _mm_unpacklo_epi64(m2, m3); + + row1l = _mm_add_epi64(_mm_add_epi64(row1l, b0), row2l); + row1h = _mm_add_epi64(_mm_add_epi64(row1h, b1), row2h); + row4l = _mm_xor_si128(row4l, row1l); + row4h = _mm_xor_si128(row4h, row1h); + row4l = _mm_shuffle_epi32(row4l, _MM_SHUFFLE(2,3,0,1)); + row4h = _mm_shuffle_epi32(row4h, _MM_SHUFFLE(2,3,0,1)); + row3l = _mm_add_epi64(row3l, row4l); + row3h = _mm_add_epi64(row3h, row4h); + row2l = _mm_xor_si128(row2l, row3l); + row2h = _mm_xor_si128(row2h, row3h); + row2l = _mm_shuffle_epi8(row2l, r24); + row2h = _mm_shuffle_epi8(row2h, r24); + + b0 = _mm_unpackhi_epi64(m0, m1); + b1 = _mm_unpackhi_epi64(m2, m3); + + row1l = _mm_add_epi64(_mm_add_epi64(row1l, b0), row2l); + row1h = _mm_add_epi64(_mm_add_epi64(row1h, b1), row2h); + row4l = _mm_xor_si128(row4l, row1l); + row4h = _mm_xor_si128(row4h, row1h); + row4l = _mm_shuffle_epi8(row4l, r16); + row4h = _mm_shuffle_epi8(row4h, r16); + row3l = _mm_add_epi64(row3l, row4l); + row3h = _mm_add_epi64(row3h, row4h); + row2l = _mm_xor_si128(row2l, row3l); + row2h = _mm_xor_si128(row2h, row3h); + row2l = _mm_xor_si128(_mm_srli_epi64(row2l, 63), _mm_add_epi64(row2l, row2l)); + row2h = _mm_xor_si128(_mm_srli_epi64(row2h, 63), _mm_add_epi64(row2h, row2h)); + + t0 = _mm_alignr_epi8(row2h, row2l, 8); + t1 = _mm_alignr_epi8(row2l, row2h, 8); + row2l = t0, row2h = t1, t0 = row3l, row3l = row3h, row3h = t0; + t0 = 
_mm_alignr_epi8(row4h, row4l, 8); + t1 = _mm_alignr_epi8(row4l, row4h, 8); + row4l = t1, row4h = t0; + + b0 = _mm_unpacklo_epi64(m4, m5); + b1 = _mm_unpacklo_epi64(m6, m7); + + row1l = _mm_add_epi64(_mm_add_epi64(row1l, b0), row2l); + row1h = _mm_add_epi64(_mm_add_epi64(row1h, b1), row2h); + row4l = _mm_xor_si128(row4l, row1l); + row4h = _mm_xor_si128(row4h, row1h); + row4l = _mm_shuffle_epi32(row4l, _MM_SHUFFLE(2,3,0,1)); + row4h = _mm_shuffle_epi32(row4h, _MM_SHUFFLE(2,3,0,1)); + row3l = _mm_add_epi64(row3l, row4l); + row3h = _mm_add_epi64(row3h, row4h); + row2l = _mm_xor_si128(row2l, row3l); + row2h = _mm_xor_si128(row2h, row3h); + row2l = _mm_shuffle_epi8(row2l, r24); + row2h = _mm_shuffle_epi8(row2h, r24); + + b0 = _mm_unpackhi_epi64(m4, m5); + b1 = _mm_unpackhi_epi64(m6, m7); + + row1l = _mm_add_epi64(_mm_add_epi64(row1l, b0), row2l); + row1h = _mm_add_epi64(_mm_add_epi64(row1h, b1), row2h); + row4l = _mm_xor_si128(row4l, row1l); + row4h = _mm_xor_si128(row4h, row1h); + row4l = _mm_shuffle_epi8(row4l, r16); + row4h = _mm_shuffle_epi8(row4h, r16); + row3l = _mm_add_epi64(row3l, row4l); + row3h = _mm_add_epi64(row3h, row4h); + row2l = _mm_xor_si128(row2l, row3l); + row2h = _mm_xor_si128(row2h, row3h); + row2l = _mm_xor_si128(_mm_srli_epi64(row2l, 63), _mm_add_epi64(row2l, row2l)); + row2h = _mm_xor_si128(_mm_srli_epi64(row2h, 63), _mm_add_epi64(row2h, row2h)); + + t0 = _mm_alignr_epi8(row2l, row2h, 8); + t1 = _mm_alignr_epi8(row2h, row2l, 8); + row2l = t0, row2h = t1, t0 = row3l, row3l = row3h, row3h = t0; + t0 = _mm_alignr_epi8(row4l, row4h, 8); + t1 = _mm_alignr_epi8(row4h, row4l, 8); + row4l = t1, row4h = t0; + + b0 = _mm_unpacklo_epi64(m7, m2); + b1 = _mm_unpackhi_epi64(m4, m6); + + row1l = _mm_add_epi64(_mm_add_epi64(row1l, b0), row2l); + row1h = _mm_add_epi64(_mm_add_epi64(row1h, b1), row2h); + row4l = _mm_xor_si128(row4l, row1l); + row4h = _mm_xor_si128(row4h, row1h); + row4l = _mm_shuffle_epi32(row4l, _MM_SHUFFLE(2,3,0,1)); + row4h = _mm_shuffle_epi32(row4h, _MM_SHUFFLE(2,3,0,1)); + row3l = _mm_add_epi64(row3l, row4l); + row3h = _mm_add_epi64(row3h, row4h); + row2l = _mm_xor_si128(row2l, row3l); + row2h = _mm_xor_si128(row2h, row3h); + row2l = _mm_shuffle_epi8(row2l, r24); + row2h = _mm_shuffle_epi8(row2h, r24); + + b0 = _mm_unpacklo_epi64(m5, m4); + b1 = _mm_alignr_epi8(m3, m7, 8); + + row1l = _mm_add_epi64(_mm_add_epi64(row1l, b0), row2l); + row1h = _mm_add_epi64(_mm_add_epi64(row1h, b1), row2h); + row4l = _mm_xor_si128(row4l, row1l); + row4h = _mm_xor_si128(row4h, row1h); + row4l = _mm_shuffle_epi8(row4l, r16); + row4h = _mm_shuffle_epi8(row4h, r16); + row3l = _mm_add_epi64(row3l, row4l); + row3h = _mm_add_epi64(row3h, row4h); + row2l = _mm_xor_si128(row2l, row3l); + row2h = _mm_xor_si128(row2h, row3h); + row2l = _mm_xor_si128(_mm_srli_epi64(row2l, 63), _mm_add_epi64(row2l, row2l)); + row2h = _mm_xor_si128(_mm_srli_epi64(row2h, 63), _mm_add_epi64(row2h, row2h)); + + t0 = _mm_alignr_epi8(row2h, row2l, 8); + t1 = _mm_alignr_epi8(row2l, row2h, 8); + row2l = t0, row2h = t1, t0 = row3l, row3l = row3h, row3h = t0; + t0 = _mm_alignr_epi8(row4h, row4l, 8); + t1 = _mm_alignr_epi8(row4l, row4h, 8); + row4l = t1, row4h = t0; + + b0 = _mm_shuffle_epi32(m0, _MM_SHUFFLE(1,0,3,2)); + b1 = _mm_unpackhi_epi64(m5, m2); + + row1l = _mm_add_epi64(_mm_add_epi64(row1l, b0), row2l); + row1h = _mm_add_epi64(_mm_add_epi64(row1h, b1), row2h); + row4l = _mm_xor_si128(row4l, row1l); + row4h = _mm_xor_si128(row4h, row1h); + row4l = _mm_shuffle_epi32(row4l, _MM_SHUFFLE(2,3,0,1)); + row4h = 
_mm_shuffle_epi32(row4h, _MM_SHUFFLE(2,3,0,1)); + row3l = _mm_add_epi64(row3l, row4l); + row3h = _mm_add_epi64(row3h, row4h); + row2l = _mm_xor_si128(row2l, row3l); + row2h = _mm_xor_si128(row2h, row3h); + row2l = _mm_shuffle_epi8(row2l, r24); + row2h = _mm_shuffle_epi8(row2h, r24); + + b0 = _mm_unpacklo_epi64(m6, m1); + b1 = _mm_unpackhi_epi64(m3, m1); + + row1l = _mm_add_epi64(_mm_add_epi64(row1l, b0), row2l); + row1h = _mm_add_epi64(_mm_add_epi64(row1h, b1), row2h); + row4l = _mm_xor_si128(row4l, row1l); + row4h = _mm_xor_si128(row4h, row1h); + row4l = _mm_shuffle_epi8(row4l, r16); + row4h = _mm_shuffle_epi8(row4h, r16); + row3l = _mm_add_epi64(row3l, row4l); + row3h = _mm_add_epi64(row3h, row4h); + row2l = _mm_xor_si128(row2l, row3l); + row2h = _mm_xor_si128(row2h, row3h); + row2l = _mm_xor_si128(_mm_srli_epi64(row2l, 63), _mm_add_epi64(row2l, row2l)); + row2h = _mm_xor_si128(_mm_srli_epi64(row2h, 63), _mm_add_epi64(row2h, row2h)); + + t0 = _mm_alignr_epi8(row2l, row2h, 8); + t1 = _mm_alignr_epi8(row2h, row2l, 8); + row2l = t0, row2h = t1, t0 = row3l, row3l = row3h, row3h = t0; + t0 = _mm_alignr_epi8(row4l, row4h, 8); + t1 = _mm_alignr_epi8(row4h, row4l, 8); + row4l = t1, row4h = t0; + + row1l = _mm_xor_si128(row3l, row1l); + row1h = _mm_xor_si128(row3h, row1h); + _mm_storeu_si128(M128_CAST(&state.h[0]), _mm_xor_si128(_mm_loadu_si128(CONST_M128_CAST(&state.h[0])), row1l)); + _mm_storeu_si128(M128_CAST(&state.h[2]), _mm_xor_si128(_mm_loadu_si128(CONST_M128_CAST(&state.h[2])), row1h)); + + row2l = _mm_xor_si128(row4l, row2l); + row2h = _mm_xor_si128(row4h, row2h); + _mm_storeu_si128(M128_CAST(&state.h[4]), _mm_xor_si128(_mm_loadu_si128(CONST_M128_CAST(&state.h[4])), row2l)); + _mm_storeu_si128(M128_CAST(&state.h[6]), _mm_xor_si128(_mm_loadu_si128(CONST_M128_CAST(&state.h[6])), row2h)); +} +#endif // CRYPTOPP_SSE42_AVAILABLE + +// Disable NEON for Cortex-A53 and A57. 
Also see http://github.com/weidai11/cryptopp/issues/367 +#if CRYPTOPP_BOOL_ARM32 && CRYPTOPP_ARM_NEON_AVAILABLE +void BLAKE2_Compress32_NEON(const byte* input, BLAKE2_State& state) +{ + #define BLAKE2S_LOAD_MSG_0_1(buf) \ + do { uint32x2_t t0, t1; \ + t0 = vzip_u32(vget_low_u32(m0), vget_high_u32(m0)).val[0]; \ + t1 = vzip_u32(vget_low_u32(m1), vget_high_u32(m1)).val[0]; \ + buf = vcombine_u32(t0, t1); } while(0) + + #define BLAKE2S_LOAD_MSG_0_2(buf) \ + do { uint32x2_t t0, t1; \ + t0 = vzip_u32(vget_low_u32(m0), vget_high_u32(m0)).val[1]; \ + t1 = vzip_u32(vget_low_u32(m1), vget_high_u32(m1)).val[1]; \ + buf = vcombine_u32(t0, t1); } while(0) + + #define BLAKE2S_LOAD_MSG_0_3(buf) \ + do { uint32x2_t t0, t1; \ + t0 = vzip_u32(vget_low_u32(m2), vget_high_u32(m2)).val[0]; \ + t1 = vzip_u32(vget_low_u32(m3), vget_high_u32(m3)).val[0]; \ + buf = vcombine_u32(t0, t1); } while(0) + + #define BLAKE2S_LOAD_MSG_0_4(buf) \ + do { uint32x2_t t0, t1; \ + t0 = vzip_u32(vget_low_u32(m2), vget_high_u32(m2)).val[1]; \ + t1 = vzip_u32(vget_low_u32(m3), vget_high_u32(m3)).val[1]; \ + buf = vcombine_u32(t0, t1); } while(0) + + #define BLAKE2S_LOAD_MSG_1_1(buf) \ + do { uint32x2_t t0, t1; \ + t0 = vzip_u32(vget_high_u32(m3), vget_low_u32(m1)).val[0]; \ + t1 = vzip_u32(vget_low_u32(m2), vget_low_u32(m3)).val[1]; \ + buf = vcombine_u32(t0, t1); } while(0) + + #define BLAKE2S_LOAD_MSG_1_2(buf) \ + do { uint32x2_t t0, t1; \ + t0 = vzip_u32(vget_high_u32(m2), vget_low_u32(m2)).val[0]; \ + t1 = vext_u32(vget_high_u32(m3), vget_high_u32(m1), 1); \ + buf = vcombine_u32(t0, t1); } while(0) + + #define BLAKE2S_LOAD_MSG_1_3(buf) \ + do { uint32x2_t t0, t1; \ + t0 = vext_u32(vget_low_u32(m0), vget_low_u32(m0), 1); \ + t1 = vzip_u32(vget_high_u32(m2), vget_low_u32(m1)).val[1]; \ + buf = vcombine_u32(t0, t1); } while(0) + + #define BLAKE2S_LOAD_MSG_1_4(buf) \ + do { uint32x2_t t0, t1; \ + t0 = vzip_u32(vget_low_u32(m3), vget_high_u32(m0)).val[0]; \ + t1 = vzip_u32(vget_high_u32(m1), vget_high_u32(m0)).val[1]; \ + buf = vcombine_u32(t0, t1); } while(0) + + #define BLAKE2S_LOAD_MSG_2_1(buf) \ + do { uint32x2_t t0, t1; \ + t0 = vext_u32(vget_high_u32(m2), vget_low_u32(m3), 1); \ + t1 = vzip_u32(vget_low_u32(m1), vget_high_u32(m3)).val[1]; \ + buf = vcombine_u32(t0, t1); } while(0) + + #define BLAKE2S_LOAD_MSG_2_2(buf) \ + do { uint32x2_t t0, t1; \ + t0 = vzip_u32(vget_low_u32(m2), vget_low_u32(m0)).val[0]; \ + t1 = vbsl_u32(vcreate_u32(0xFFFFFFFF), vget_high_u32(m0), vget_low_u32(m3)); \ + buf = vcombine_u32(t0, t1); } while(0) + + #define BLAKE2S_LOAD_MSG_2_3(buf) \ + do { uint32x2_t t0, t1; \ + t0 = vbsl_u32(vcreate_u32(0xFFFFFFFF), vget_high_u32(m2), vget_high_u32(m0)); \ + t1 = vzip_u32(vget_high_u32(m1), vget_low_u32(m2)).val[1]; \ + buf = vcombine_u32(t0, t1); } while(0) + + #define BLAKE2S_LOAD_MSG_2_4(buf) \ + do { uint32x2_t t0, t1; \ + t0 = vzip_u32(vget_high_u32(m3), vget_high_u32(m1)).val[0]; \ + t1 = vext_u32(vget_low_u32(m0), vget_low_u32(m1), 1); \ + buf = vcombine_u32(t0, t1); } while(0) + + #define BLAKE2S_LOAD_MSG_3_1(buf) \ + do { uint32x2_t t0, t1; \ + t0 = vzip_u32(vget_high_u32(m1), vget_high_u32(m0)).val[1]; \ + t1 = vzip_u32(vget_low_u32(m3), vget_high_u32(m2)).val[1]; \ + buf = vcombine_u32(t0, t1); } while(0) + + #define BLAKE2S_LOAD_MSG_3_2(buf) \ + do { uint32x2_t t0, t1; \ + t0 = vzip_u32(vget_low_u32(m2), vget_low_u32(m0)).val[1]; \ + t1 = vzip_u32(vget_low_u32(m3), vget_high_u32(m3)).val[0]; \ + buf = vcombine_u32(t0, t1); } while(0) + + #define BLAKE2S_LOAD_MSG_3_3(buf) \ + do { uint32x2_t t0, 
t1; \ + t0 = vbsl_u32(vcreate_u32(0xFFFFFFFF), vget_high_u32(m0), vget_low_u32(m1)); \ + t1 = vbsl_u32(vcreate_u32(0xFFFFFFFF), vget_low_u32(m1), vget_high_u32(m3)); \ + buf = vcombine_u32(t0, t1); } while(0) + + #define BLAKE2S_LOAD_MSG_3_4(buf) \ + do { uint32x2_t t0, t1; \ + t0 = vzip_u32(vget_high_u32(m1), vget_high_u32(m2)).val[0]; \ + t1 = vzip_u32(vget_low_u32(m0), vget_low_u32(m2)).val[0]; \ + buf = vcombine_u32(t0, t1); } while(0) + + #define BLAKE2S_LOAD_MSG_4_1(buf) \ + do { uint32x2_t t0, t1; \ + t0 = vzip_u32(vget_low_u32(m2), vget_low_u32(m1)).val[1]; \ + t1 = vzip_u32((vget_high_u32(m0)), vget_high_u32(m2)).val[0]; \ + buf = vcombine_u32(t0, t1); } while(0) + + #define BLAKE2S_LOAD_MSG_4_2(buf) \ + do { uint32x2_t t0, t1; \ + t0 = vbsl_u32(vcreate_u32(0xFFFFFFFF), vget_low_u32(m0), vget_high_u32(m1)); \ + t1 = vbsl_u32(vcreate_u32(0xFFFFFFFF), vget_low_u32(m1), vget_high_u32(m3)); \ + buf = vcombine_u32(t0, t1); } while(0) + + #define BLAKE2S_LOAD_MSG_4_3(buf) \ + do { uint32x2_t t0, t1; \ + t0 = vbsl_u32(vcreate_u32(0xFFFFFFFF), vget_high_u32(m3), vget_high_u32(m2)); \ + t1 = vbsl_u32(vcreate_u32(0xFFFFFFFF), vget_high_u32(m1), vget_high_u32(m0)); \ + buf = vcombine_u32(t0, t1); } while(0) + + #define BLAKE2S_LOAD_MSG_4_4(buf) \ + do { uint32x2_t t0, t1; \ + t0 = vext_u32(vget_low_u32(m0), vget_low_u32(m3), 1); \ + t1 = vbsl_u32(vcreate_u32(0xFFFFFFFF), vget_low_u32(m2), vget_low_u32(m3)); \ + buf = vcombine_u32(t0, t1); } while(0) + + #define BLAKE2S_LOAD_MSG_5_1(buf) \ + do { uint32x2_t t0, t1; \ + t0 = vzip_u32((vget_high_u32(m0)), vget_high_u32(m1)).val[0]; \ + t1 = vzip_u32(vget_low_u32(m0), vget_low_u32(m2)).val[0]; \ + buf = vcombine_u32(t0, t1); } while(0) + + #define BLAKE2S_LOAD_MSG_5_2(buf) \ + do { uint32x2_t t0, t1; \ + t0 = vzip_u32(vget_low_u32(m3), vget_high_u32(m2)).val[0]; \ + t1 = vzip_u32(vget_high_u32(m2), vget_high_u32(m0)).val[1]; \ + buf = vcombine_u32(t0, t1); } while(0) + + #define BLAKE2S_LOAD_MSG_5_3(buf) \ + do { uint32x2_t t0, t1; \ + t0 = vbsl_u32(vcreate_u32(0xFFFFFFFF), vget_low_u32(m1), vget_high_u32(m1)); \ + t1 = vzip_u32(vget_high_u32(m3), vget_low_u32(m0)).val[1]; \ + buf = vcombine_u32(t0, t1); } while(0) + + #define BLAKE2S_LOAD_MSG_5_4(buf) \ + do { uint32x2_t t0, t1; \ + t0 = vzip_u32(vget_low_u32(m3), vget_low_u32(m1)).val[1]; \ + t1 = vbsl_u32(vcreate_u32(0xFFFFFFFF), vget_high_u32(m3), vget_low_u32(m2)); \ + buf = vcombine_u32(t0, t1); } while(0) + + #define BLAKE2S_LOAD_MSG_6_1(buf) \ + do { uint32x2_t t0, t1; \ + t0 = vbsl_u32(vcreate_u32(0xFFFFFFFF), vget_low_u32(m3), vget_low_u32(m0)); \ + t1 = vzip_u32(vget_high_u32(m3), vget_low_u32(m1)).val[0]; \ + buf = vcombine_u32(t0, t1); } while(0) + + #define BLAKE2S_LOAD_MSG_6_2(buf) \ + do { uint32x2_t t0, t1; \ + t0 = vzip_u32(vget_low_u32(m1), vget_high_u32(m3)).val[1]; \ + t1 = vext_u32(vget_low_u32(m3), vget_high_u32(m2), 1); \ + buf = vcombine_u32(t0, t1); } while(0) + + #define BLAKE2S_LOAD_MSG_6_3(buf) \ + do { uint32x2_t t0, t1; \ + t0 = vzip_u32(vget_low_u32(m0), vget_high_u32(m1)).val[0]; \ + t1 = vext_u32(vget_low_u32(m2), vget_low_u32(m2), 1); \ + buf = vcombine_u32(t0, t1); } while(0) + + #define BLAKE2S_LOAD_MSG_6_4(buf) \ + do { uint32x2_t t0, t1; \ + t0 = vzip_u32(vget_high_u32(m1), vget_high_u32(m0)).val[1]; \ + t1 = vbsl_u32(vcreate_u32(0xFFFFFFFF), vget_high_u32(m0), vget_high_u32(m2)); \ + buf = vcombine_u32(t0, t1); } while(0) + + #define BLAKE2S_LOAD_MSG_7_1(buf) \ + do { uint32x2_t t0, t1; \ + t0 = vzip_u32(vget_low_u32(m3), vget_high_u32(m1)).val[1]; \ + t1 
= vbsl_u32(vcreate_u32(0xFFFFFFFF), vget_low_u32(m3), vget_high_u32(m0)); \ + buf = vcombine_u32(t0, t1); } while(0) + + #define BLAKE2S_LOAD_MSG_7_2(buf) \ + do { uint32x2_t t0, t1; \ + t0 = vext_u32(vget_high_u32(m2), vget_high_u32(m3), 1); \ + t1 = vzip_u32(vget_low_u32(m0), vget_low_u32(m2)).val[1]; \ + buf = vcombine_u32(t0, t1); } while(0) + + #define BLAKE2S_LOAD_MSG_7_3(buf) \ + do { uint32x2_t t0, t1; \ + t0 = vzip_u32(vget_low_u32(m1), vget_high_u32(m3)).val[1]; \ + t1 = vzip_u32(vget_low_u32(m2), vget_high_u32(m0)).val[0]; \ + buf = vcombine_u32(t0, t1); } while(0) + + #define BLAKE2S_LOAD_MSG_7_4(buf) \ + do { uint32x2_t t0, t1; \ + t0 = vzip_u32(vget_low_u32(m0), vget_low_u32(m1)).val[0]; \ + t1 = vzip_u32(vget_high_u32(m1), vget_high_u32(m2)).val[0]; \ + buf = vcombine_u32(t0, t1); } while(0) + + #define BLAKE2S_LOAD_MSG_8_1(buf) \ + do { uint32x2_t t0, t1; \ + t0 = vzip_u32(vget_high_u32(m1), vget_high_u32(m3)).val[0]; \ + t1 = vext_u32(vget_high_u32(m2), vget_low_u32(m0), 1); \ + buf = vcombine_u32(t0, t1); } while(0) + + #define BLAKE2S_LOAD_MSG_8_2(buf) \ + do { uint32x2_t t0, t1; \ + t0 = vzip_u32(vget_high_u32(m3), vget_low_u32(m2)).val[1]; \ + t1 = vext_u32(vget_high_u32(m0), vget_low_u32(m2), 1); \ + buf = vcombine_u32(t0, t1); } while(0) + + #define BLAKE2S_LOAD_MSG_8_3(buf) \ + do { uint32x2_t t0, t1; \ + t0 = vbsl_u32(vcreate_u32(0xFFFFFFFF), vget_low_u32(m3), vget_low_u32(m3)); \ + t1 = vext_u32(vget_low_u32(m0), vget_high_u32(m2), 1); \ + buf = vcombine_u32(t0, t1); } while(0) + + #define BLAKE2S_LOAD_MSG_8_4(buf) \ + do { uint32x2_t t0, t1; \ + t0 = vbsl_u32(vcreate_u32(0xFFFFFFFF), vget_high_u32(m0), vget_high_u32(m1)); \ + t1 = vbsl_u32(vcreate_u32(0xFFFFFFFF), vget_low_u32(m1), vget_low_u32(m1)); \ + buf = vcombine_u32(t0, t1); } while(0) + + #define BLAKE2S_LOAD_MSG_9_1(buf) \ + do { uint32x2_t t0, t1; \ + t0 = vzip_u32(vget_high_u32(m2), vget_low_u32(m2)).val[0]; \ + t1 = vzip_u32(vget_high_u32(m1), vget_low_u32(m0)).val[1]; \ + buf = vcombine_u32(t0, t1); } while(0) + + #define BLAKE2S_LOAD_MSG_9_2(buf) \ + do { uint32x2_t t0, t1; \ + t0 = vzip_u32((vget_high_u32(m0)), vget_low_u32(m1)).val[0]; \ + t1 = vbsl_u32(vcreate_u32(0xFFFFFFFF), vget_high_u32(m1), vget_low_u32(m1)); \ + buf = vcombine_u32(t0, t1); } while(0) + + #define BLAKE2S_LOAD_MSG_9_3(buf) \ + do { uint32x2_t t0, t1; \ + t0 = vzip_u32(vget_high_u32(m3), vget_low_u32(m2)).val[1]; \ + t1 = vzip_u32((vget_high_u32(m0)), vget_low_u32(m3)).val[1]; \ + buf = vcombine_u32(t0, t1); } while(0) + + #define BLAKE2S_LOAD_MSG_9_4(buf) \ + do { uint32x2_t t0, t1; \ + t0 = vext_u32(vget_high_u32(m2), vget_high_u32(m3), 1); \ + t1 = vzip_u32(vget_low_u32(m3), vget_low_u32(m0)).val[0]; \ + buf = vcombine_u32(t0, t1); } while(0) + + #define vrorq_n_u32_16(x) vreinterpretq_u32_u16(vrev32q_u16(vreinterpretq_u16_u32(x))) + + #define vrorq_n_u32_8(x) vsriq_n_u32(vshlq_n_u32((x), 24), (x), 8) + + #define vrorq_n_u32(x, c) vsriq_n_u32(vshlq_n_u32((x), 32-(c)), (x), (c)) + + #define BLAKE2S_G1(row1,row2,row3,row4,buf) \ + do { \ + row1 = vaddq_u32(vaddq_u32(row1, buf), row2); row4 = veorq_u32(row4, row1); \ + row4 = vrorq_n_u32_16(row4); row3 = vaddq_u32(row3, row4); \ + row2 = veorq_u32(row2, row3); row2 = vrorq_n_u32(row2, 12); \ + } while(0) + + #define BLAKE2S_G2(row1,row2,row3,row4,buf) \ + do { \ + row1 = vaddq_u32(vaddq_u32(row1, buf), row2); row4 = veorq_u32(row4, row1); \ + row4 = vrorq_n_u32_8(row4); row3 = vaddq_u32(row3, row4); \ + row2 = veorq_u32(row2, row3); row2 = vrorq_n_u32(row2, 7); \ + } while(0) 
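+    // Note on the rotation helpers above: NEON has no vector rotate, so
+    // vrorq_n_u32_16 rotates each 32-bit lane by 16 by reversing its two
+    // 16-bit halves (vrev32q_u16), and the general vrorq_n_u32 pairs a left
+    // shift by 32-c with vsriq_n_u32, which shifts right by c and inserts
+    // the result into the left-shifted value. BLAKE2S_G1 and BLAKE2S_G2 are
+    // the two halves of the BLAKE2s G mixing function, using rotations of
+    // 16/12 and 8/7 respectively.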
+ + #define BLAKE2S_DIAGONALIZE(row1,row2,row3,row4) \ + do { \ + row4 = vextq_u32(row4, row4, 3); row3 = vextq_u32(row3, row3, 2); row2 = vextq_u32(row2, row2, 1); \ + } while(0) + + #define BLAKE2S_UNDIAGONALIZE(row1,row2,row3,row4) \ + do { \ + row4 = vextq_u32(row4, row4, 1); \ + row3 = vextq_u32(row3, row3, 2); \ + row2 = vextq_u32(row2, row2, 3); \ + } while(0) + + #define BLAKE2S_ROUND(r) \ + do { \ + uint32x4_t buf1, buf2, buf3, buf4; \ + BLAKE2S_LOAD_MSG_ ##r ##_1(buf1); \ + BLAKE2S_G1(row1,row2,row3,row4,buf1); \ + BLAKE2S_LOAD_MSG_ ##r ##_2(buf2); \ + BLAKE2S_G2(row1,row2,row3,row4,buf2); \ + BLAKE2S_DIAGONALIZE(row1,row2,row3,row4); \ + BLAKE2S_LOAD_MSG_ ##r ##_3(buf3); \ + BLAKE2S_G1(row1,row2,row3,row4,buf3); \ + BLAKE2S_LOAD_MSG_ ##r ##_4(buf4); \ + BLAKE2S_G2(row1,row2,row3,row4,buf4); \ + BLAKE2S_UNDIAGONALIZE(row1,row2,row3,row4); \ + } while(0) + + CRYPTOPP_ASSERT(IsAlignedOn(&state.h[0],GetAlignmentOf())); + CRYPTOPP_ASSERT(IsAlignedOn(&state.t[0],GetAlignmentOf())); + CRYPTOPP_ASSERT(IsAlignedOn(&state.f[0],GetAlignmentOf())); + + const uint32x4_t m0 = vreinterpretq_u32_u8(vld1q_u8((input + 00))); + const uint32x4_t m1 = vreinterpretq_u32_u8(vld1q_u8((input + 16))); + const uint32x4_t m2 = vreinterpretq_u32_u8(vld1q_u8((input + 32))); + const uint32x4_t m3 = vreinterpretq_u32_u8(vld1q_u8((input + 48))); + + uint32x4_t row1, row2, row3, row4; + + const uint32x4_t f0 = row1 = vld1q_u32(&state.h[0]); + const uint32x4_t f1 = row2 = vld1q_u32(&state.h[4]); + row3 = vld1q_u32(&BLAKE2S_IV[0]); + row4 = veorq_u32(vld1q_u32(&BLAKE2S_IV[4]), vld1q_u32(&state.t[0])); + + BLAKE2S_ROUND(0); + BLAKE2S_ROUND(1); + BLAKE2S_ROUND(2); + BLAKE2S_ROUND(3); + BLAKE2S_ROUND(4); + BLAKE2S_ROUND(5); + BLAKE2S_ROUND(6); + BLAKE2S_ROUND(7); + BLAKE2S_ROUND(8); + BLAKE2S_ROUND(9); + + vst1q_u32(&state.h[0], veorq_u32(f0, veorq_u32(row1, row3))); + vst1q_u32(&state.h[4], veorq_u32(f1, veorq_u32(row2, row4))); +} + +void BLAKE2_Compress64_NEON(const byte* input, BLAKE2_State& state) +{ + #define BLAKE2B_LOAD_MSG_0_1(b0, b1) \ + do { b0 = vcombine_u64(vget_low_u64(m0), vget_low_u64(m1)); b1 = vcombine_u64(vget_low_u64(m2), vget_low_u64(m3)); } while(0) + + #define BLAKE2B_LOAD_MSG_0_2(b0, b1) \ + do { b0 = vcombine_u64(vget_high_u64(m0), vget_high_u64(m1)); b1 = vcombine_u64(vget_high_u64(m2), vget_high_u64(m3)); } while(0) + + #define BLAKE2B_LOAD_MSG_0_3(b0, b1) \ + do { b0 = vcombine_u64(vget_low_u64(m4), vget_low_u64(m5)); b1 = vcombine_u64(vget_low_u64(m6), vget_low_u64(m7)); } while(0) + + #define BLAKE2B_LOAD_MSG_0_4(b0, b1) \ + do { b0 = vcombine_u64(vget_high_u64(m4), vget_high_u64(m5)); b1 = vcombine_u64(vget_high_u64(m6), vget_high_u64(m7)); } while(0) + + #define BLAKE2B_LOAD_MSG_1_1(b0, b1) \ + do { b0 = vcombine_u64(vget_low_u64(m7), vget_low_u64(m2)); b1 = vcombine_u64(vget_high_u64(m4), vget_high_u64(m6)); } while(0) + + #define BLAKE2B_LOAD_MSG_1_2(b0, b1) \ + do { b0 = vcombine_u64(vget_low_u64(m5), vget_low_u64(m4)); b1 = vextq_u64(m7, m3, 1); } while(0) + + #define BLAKE2B_LOAD_MSG_1_3(b0, b1) \ + do { b0 = vextq_u64(m0, m0, 1); b1 = vcombine_u64(vget_high_u64(m5), vget_high_u64(m2)); } while(0) + + #define BLAKE2B_LOAD_MSG_1_4(b0, b1) \ + do { b0 = vcombine_u64(vget_low_u64(m6), vget_low_u64(m1)); b1 = vcombine_u64(vget_high_u64(m3), vget_high_u64(m1)); } while(0) + + #define BLAKE2B_LOAD_MSG_2_1(b0, b1) \ + do { b0 = vextq_u64(m5, m6, 1); b1 = vcombine_u64(vget_high_u64(m2), vget_high_u64(m7)); } while(0) + + #define BLAKE2B_LOAD_MSG_2_2(b0, b1) \ + do { b0 = 
vcombine_u64(vget_low_u64(m4), vget_low_u64(m0)); b1 = vcombine_u64(vget_low_u64(m1), vget_high_u64(m6)); } while(0) + + #define BLAKE2B_LOAD_MSG_2_3(b0, b1) \ + do { b0 = vcombine_u64(vget_low_u64(m5), vget_high_u64(m1)); b1 = vcombine_u64(vget_high_u64(m3), vget_high_u64(m4)); } while(0) + + #define BLAKE2B_LOAD_MSG_2_4(b0, b1) \ + do { b0 = vcombine_u64(vget_low_u64(m7), vget_low_u64(m3)); b1 = vextq_u64(m0, m2, 1); } while(0) + + #define BLAKE2B_LOAD_MSG_3_1(b0, b1) \ + do { b0 = vcombine_u64(vget_high_u64(m3), vget_high_u64(m1)); b1 = vcombine_u64(vget_high_u64(m6), vget_high_u64(m5)); } while(0) + + #define BLAKE2B_LOAD_MSG_3_2(b0, b1) \ + do { b0 = vcombine_u64(vget_high_u64(m4), vget_high_u64(m0)); b1 = vcombine_u64(vget_low_u64(m6), vget_low_u64(m7)); } while(0) + + #define BLAKE2B_LOAD_MSG_3_3(b0, b1) \ + do { b0 = vcombine_u64(vget_low_u64(m1), vget_high_u64(m2)); b1 = vcombine_u64(vget_low_u64(m2), vget_high_u64(m7)); } while(0) + + #define BLAKE2B_LOAD_MSG_3_4(b0, b1) \ + do { b0 = vcombine_u64(vget_low_u64(m3), vget_low_u64(m5)); b1 = vcombine_u64(vget_low_u64(m0), vget_low_u64(m4)); } while(0) + + #define BLAKE2B_LOAD_MSG_4_1(b0, b1) \ + do { b0 = vcombine_u64(vget_high_u64(m4), vget_high_u64(m2)); b1 = vcombine_u64(vget_low_u64(m1), vget_low_u64(m5)); } while(0) + + #define BLAKE2B_LOAD_MSG_4_2(b0, b1) \ + do { b0 = vcombine_u64(vget_low_u64(m0), vget_high_u64(m3)); b1 = vcombine_u64(vget_low_u64(m2), vget_high_u64(m7)); } while(0) + + #define BLAKE2B_LOAD_MSG_4_3(b0, b1) \ + do { b0 = vcombine_u64(vget_low_u64(m7), vget_high_u64(m5)); b1 = vcombine_u64(vget_low_u64(m3), vget_high_u64(m1)); } while(0) + + #define BLAKE2B_LOAD_MSG_4_4(b0, b1) \ + do { b0 = vextq_u64(m0, m6, 1); b1 = vcombine_u64(vget_low_u64(m4), vget_high_u64(m6)); } while(0) + + #define BLAKE2B_LOAD_MSG_5_1(b0, b1) \ + do { b0 = vcombine_u64(vget_low_u64(m1), vget_low_u64(m3)); b1 = vcombine_u64(vget_low_u64(m0), vget_low_u64(m4)); } while(0) + + #define BLAKE2B_LOAD_MSG_5_2(b0, b1) \ + do { b0 = vcombine_u64(vget_low_u64(m6), vget_low_u64(m5)); b1 = vcombine_u64(vget_high_u64(m5), vget_high_u64(m1)); } while(0) + + #define BLAKE2B_LOAD_MSG_5_3(b0, b1) \ + do { b0 = vcombine_u64(vget_low_u64(m2), vget_high_u64(m3)); b1 = vcombine_u64(vget_high_u64(m7), vget_high_u64(m0)); } while(0) + + #define BLAKE2B_LOAD_MSG_5_4(b0, b1) \ + do { b0 = vcombine_u64(vget_high_u64(m6), vget_high_u64(m2)); b1 = vcombine_u64(vget_low_u64(m7), vget_high_u64(m4)); } while(0) + + #define BLAKE2B_LOAD_MSG_6_1(b0, b1) \ + do { b0 = vcombine_u64(vget_low_u64(m6), vget_high_u64(m0)); b1 = vcombine_u64(vget_low_u64(m7), vget_low_u64(m2)); } while(0) + + #define BLAKE2B_LOAD_MSG_6_2(b0, b1) \ + do { b0 = vcombine_u64(vget_high_u64(m2), vget_high_u64(m7)); b1 = vextq_u64(m6, m5, 1); } while(0) + + #define BLAKE2B_LOAD_MSG_6_3(b0, b1) \ + do { b0 = vcombine_u64(vget_low_u64(m0), vget_low_u64(m3)); b1 = vextq_u64(m4, m4, 1); } while(0) + + #define BLAKE2B_LOAD_MSG_6_4(b0, b1) \ + do { b0 = vcombine_u64(vget_high_u64(m3), vget_high_u64(m1)); b1 = vcombine_u64(vget_low_u64(m1), vget_high_u64(m5)); } while(0) + + #define BLAKE2B_LOAD_MSG_7_1(b0, b1) \ + do { b0 = vcombine_u64(vget_high_u64(m6), vget_high_u64(m3)); b1 = vcombine_u64(vget_low_u64(m6), vget_high_u64(m1)); } while(0) + + #define BLAKE2B_LOAD_MSG_7_2(b0, b1) \ + do { b0 = vextq_u64(m5, m7, 1); b1 = vcombine_u64(vget_high_u64(m0), vget_high_u64(m4)); } while(0) + + #define BLAKE2B_LOAD_MSG_7_3(b0, b1) \ + do { b0 = vcombine_u64(vget_high_u64(m2), vget_high_u64(m7)); b1 = 
vcombine_u64(vget_low_u64(m4), vget_low_u64(m1)); } while(0) + + #define BLAKE2B_LOAD_MSG_7_4(b0, b1) \ + do { b0 = vcombine_u64(vget_low_u64(m0), vget_low_u64(m2)); b1 = vcombine_u64(vget_low_u64(m3), vget_low_u64(m5)); } while(0) + + #define BLAKE2B_LOAD_MSG_8_1(b0, b1) \ + do { b0 = vcombine_u64(vget_low_u64(m3), vget_low_u64(m7)); b1 = vextq_u64(m5, m0, 1); } while(0) + + #define BLAKE2B_LOAD_MSG_8_2(b0, b1) \ + do { b0 = vcombine_u64(vget_high_u64(m7), vget_high_u64(m4)); b1 = vextq_u64(m1, m4, 1); } while(0) + + #define BLAKE2B_LOAD_MSG_8_3(b0, b1) \ + do { b0 = m6; b1 = vextq_u64(m0, m5, 1); } while(0) + + #define BLAKE2B_LOAD_MSG_8_4(b0, b1) \ + do { b0 = vcombine_u64(vget_low_u64(m1), vget_high_u64(m3)); b1 = m2; } while(0) + + #define BLAKE2B_LOAD_MSG_9_1(b0, b1) \ + do { b0 = vcombine_u64(vget_low_u64(m5), vget_low_u64(m4)); b1 = vcombine_u64(vget_high_u64(m3), vget_high_u64(m0)); } while(0) + + #define BLAKE2B_LOAD_MSG_9_2(b0, b1) \ + do { b0 = vcombine_u64(vget_low_u64(m1), vget_low_u64(m2)); b1 = vcombine_u64(vget_low_u64(m3), vget_high_u64(m2)); } while(0) + + #define BLAKE2B_LOAD_MSG_9_3(b0, b1) \ + do { b0 = vcombine_u64(vget_high_u64(m7), vget_high_u64(m4)); b1 = vcombine_u64(vget_high_u64(m1), vget_high_u64(m6)); } while(0) + + #define BLAKE2B_LOAD_MSG_9_4(b0, b1) \ + do { b0 = vextq_u64(m5, m7, 1); b1 = vcombine_u64(vget_low_u64(m6), vget_low_u64(m0)); } while(0) + + #define BLAKE2B_LOAD_MSG_10_1(b0, b1) \ + do { b0 = vcombine_u64(vget_low_u64(m0), vget_low_u64(m1)); b1 = vcombine_u64(vget_low_u64(m2), vget_low_u64(m3)); } while(0) + + #define BLAKE2B_LOAD_MSG_10_2(b0, b1) \ + do { b0 = vcombine_u64(vget_high_u64(m0), vget_high_u64(m1)); b1 = vcombine_u64(vget_high_u64(m2), vget_high_u64(m3)); } while(0) + + #define BLAKE2B_LOAD_MSG_10_3(b0, b1) \ + do { b0 = vcombine_u64(vget_low_u64(m4), vget_low_u64(m5)); b1 = vcombine_u64(vget_low_u64(m6), vget_low_u64(m7)); } while(0) + + #define BLAKE2B_LOAD_MSG_10_4(b0, b1) \ + do { b0 = vcombine_u64(vget_high_u64(m4), vget_high_u64(m5)); b1 = vcombine_u64(vget_high_u64(m6), vget_high_u64(m7)); } while(0) + + #define BLAKE2B_LOAD_MSG_11_1(b0, b1) \ + do { b0 = vcombine_u64(vget_low_u64(m7), vget_low_u64(m2)); b1 = vcombine_u64(vget_high_u64(m4), vget_high_u64(m6)); } while(0) + + #define BLAKE2B_LOAD_MSG_11_2(b0, b1) \ + do { b0 = vcombine_u64(vget_low_u64(m5), vget_low_u64(m4)); b1 = vextq_u64(m7, m3, 1); } while(0) + + #define BLAKE2B_LOAD_MSG_11_3(b0, b1) \ + do { b0 = vextq_u64(m0, m0, 1); b1 = vcombine_u64(vget_high_u64(m5), vget_high_u64(m2)); } while(0) + + #define BLAKE2B_LOAD_MSG_11_4(b0, b1) \ + do { b0 = vcombine_u64(vget_low_u64(m6), vget_low_u64(m1)); b1 = vcombine_u64(vget_high_u64(m3), vget_high_u64(m1)); } while(0) + + #define vrorq_n_u64_32(x) vreinterpretq_u64_u32(vrev64q_u32(vreinterpretq_u32_u64((x)))) + + #define vrorq_n_u64_24(x) vcombine_u64(\ + vreinterpret_u64_u8(vext_u8(vreinterpret_u8_u64(vget_low_u64(x)), vreinterpret_u8_u64(vget_low_u64(x)), 3)), \ + vreinterpret_u64_u8(vext_u8(vreinterpret_u8_u64(vget_high_u64(x)), vreinterpret_u8_u64(vget_high_u64(x)), 3))) + + #define vrorq_n_u64_16(x) vcombine_u64(\ + vreinterpret_u64_u8(vext_u8(vreinterpret_u8_u64(vget_low_u64(x)), vreinterpret_u8_u64(vget_low_u64(x)), 2)), \ + vreinterpret_u64_u8(vext_u8(vreinterpret_u8_u64(vget_high_u64(x)), vreinterpret_u8_u64(vget_high_u64(x)), 2))) + + #define vrorq_n_u64_63(x) veorq_u64(vaddq_u64(x, x), vshrq_n_u64(x, 63)) + + #define BLAKE2B_G1(row1l,row2l,row3l,row4l,row1h,row2h,row3h,row4h,b0,b1) \ + do { \ + row1l = 
vaddq_u64(vaddq_u64(row1l, b0), row2l); \ + row1h = vaddq_u64(vaddq_u64(row1h, b1), row2h); \ + row4l = veorq_u64(row4l, row1l); row4h = veorq_u64(row4h, row1h); \ + row4l = vrorq_n_u64_32(row4l); row4h = vrorq_n_u64_32(row4h); \ + row3l = vaddq_u64(row3l, row4l); row3h = vaddq_u64(row3h, row4h); \ + row2l = veorq_u64(row2l, row3l); row2h = veorq_u64(row2h, row3h); \ + row2l = vrorq_n_u64_24(row2l); row2h = vrorq_n_u64_24(row2h); \ + } while(0) + + #define BLAKE2B_G2(row1l,row2l,row3l,row4l,row1h,row2h,row3h,row4h,b0,b1) \ + do { \ + row1l = vaddq_u64(vaddq_u64(row1l, b0), row2l); \ + row1h = vaddq_u64(vaddq_u64(row1h, b1), row2h); \ + row4l = veorq_u64(row4l, row1l); row4h = veorq_u64(row4h, row1h); \ + row4l = vrorq_n_u64_16(row4l); row4h = vrorq_n_u64_16(row4h); \ + row3l = vaddq_u64(row3l, row4l); row3h = vaddq_u64(row3h, row4h); \ + row2l = veorq_u64(row2l, row3l); row2h = veorq_u64(row2h, row3h); \ + row2l = vrorq_n_u64_63(row2l); row2h = vrorq_n_u64_63(row2h); \ + } while(0) + + #define BLAKE2B_DIAGONALIZE(row1l,row2l,row3l,row4l,row1h,row2h,row3h,row4h) \ + do { \ + uint64x2_t t0 = vextq_u64(row2l, row2h, 1); \ + uint64x2_t t1 = vextq_u64(row2h, row2l, 1); \ + row2l = t0; row2h = t1; t0 = row3l; row3l = row3h; row3h = t0; \ + t0 = vextq_u64(row4h, row4l, 1); t1 = vextq_u64(row4l, row4h, 1); \ + row4l = t0; row4h = t1; \ + } while(0) + + #define BLAKE2B_UNDIAGONALIZE(row1l,row2l,row3l,row4l,row1h,row2h,row3h,row4h) \ + do { \ + uint64x2_t t0 = vextq_u64(row2h, row2l, 1); \ + uint64x2_t t1 = vextq_u64(row2l, row2h, 1); \ + row2l = t0; row2h = t1; t0 = row3l; row3l = row3h; row3h = t0; \ + t0 = vextq_u64(row4l, row4h, 1); t1 = vextq_u64(row4h, row4l, 1); \ + row4l = t0; row4h = t1; \ + } while(0) + + #define BLAKE2B_ROUND(r) \ + do { \ + uint64x2_t b0, b1; \ + BLAKE2B_LOAD_MSG_ ##r ##_1(b0, b1); \ + BLAKE2B_G1(row1l,row2l,row3l,row4l,row1h,row2h,row3h,row4h,b0,b1); \ + BLAKE2B_LOAD_MSG_ ##r ##_2(b0, b1); \ + BLAKE2B_G2(row1l,row2l,row3l,row4l,row1h,row2h,row3h,row4h,b0,b1); \ + BLAKE2B_DIAGONALIZE(row1l,row2l,row3l,row4l,row1h,row2h,row3h,row4h); \ + BLAKE2B_LOAD_MSG_ ##r ##_3(b0, b1); \ + BLAKE2B_G1(row1l,row2l,row3l,row4l,row1h,row2h,row3h,row4h,b0,b1); \ + BLAKE2B_LOAD_MSG_ ##r ##_4(b0, b1); \ + BLAKE2B_G2(row1l,row2l,row3l,row4l,row1h,row2h,row3h,row4h,b0,b1); \ + BLAKE2B_UNDIAGONALIZE(row1l,row2l,row3l,row4l,row1h,row2h,row3h,row4h); \ + } while(0) + + CRYPTOPP_ASSERT(IsAlignedOn(&state.h[0],GetAlignmentOf())); + CRYPTOPP_ASSERT(IsAlignedOn(&state.t[0],GetAlignmentOf())); + CRYPTOPP_ASSERT(IsAlignedOn(&state.f[0],GetAlignmentOf())); + + const uint64x2_t m0 = vreinterpretq_u64_u8(vld1q_u8(input + 00)); + const uint64x2_t m1 = vreinterpretq_u64_u8(vld1q_u8(input + 16)); + const uint64x2_t m2 = vreinterpretq_u64_u8(vld1q_u8(input + 32)); + const uint64x2_t m3 = vreinterpretq_u64_u8(vld1q_u8(input + 48)); + const uint64x2_t m4 = vreinterpretq_u64_u8(vld1q_u8(input + 64)); + const uint64x2_t m5 = vreinterpretq_u64_u8(vld1q_u8(input + 80)); + const uint64x2_t m6 = vreinterpretq_u64_u8(vld1q_u8(input + 96)); + const uint64x2_t m7 = vreinterpretq_u64_u8(vld1q_u8(input + 112)); + + uint64x2_t row1l, row1h, row2l, row2h; + uint64x2_t row3l, row3h, row4l, row4h; + + const uint64x2_t h0 = row1l = vld1q_u64(&state.h[0]); + const uint64x2_t h1 = row1h = vld1q_u64(&state.h[2]); + const uint64x2_t h2 = row2l = vld1q_u64(&state.h[4]); + const uint64x2_t h3 = row2h = vld1q_u64(&state.h[6]); + + row3l = vld1q_u64(&BLAKE2B_IV[0]); + row3h = vld1q_u64(&BLAKE2B_IV[2]); + row4l = 
veorq_u64(vld1q_u64(&BLAKE2B_IV[4]), vld1q_u64(&state.t[0])); + row4h = veorq_u64(vld1q_u64(&BLAKE2B_IV[6]), vld1q_u64(&state.f[0])); + + BLAKE2B_ROUND(0); + BLAKE2B_ROUND(1); + BLAKE2B_ROUND(2); + BLAKE2B_ROUND(3); + BLAKE2B_ROUND(4); + BLAKE2B_ROUND(5); + BLAKE2B_ROUND(6); + BLAKE2B_ROUND(7); + BLAKE2B_ROUND(8); + BLAKE2B_ROUND(9); + BLAKE2B_ROUND(10); + BLAKE2B_ROUND(11); + + vst1q_u64(&state.h[0], veorq_u64(h0, veorq_u64(row1l, row3l))); + vst1q_u64(&state.h[2], veorq_u64(h1, veorq_u64(row1h, row3h))); + vst1q_u64(&state.h[4], veorq_u64(h2, veorq_u64(row2l, row4l))); + vst1q_u64(&state.h[6], veorq_u64(h3, veorq_u64(row2h, row4h))); +} +#endif // CRYPTOPP_BOOL_ARM32 && CRYPTOPP_ARM_NEON_AVAILABLE + +NAMESPACE_END \ No newline at end of file diff --git a/blake2.cpp b/blake2.cpp index 6313fec1c..3e1712eb1 100644 --- a/blake2.cpp +++ b/blake2.cpp @@ -13,111 +13,47 @@ NAMESPACE_BEGIN(CryptoPP) // Uncomment for benchmarking C++ against SSE2 or NEON -// #undef CRYPTOPP_BOOL_SSE4_INTRINSICS_AVAILABLE -// #undef CRYPTOPP_BOOL_NEON_INTRINSICS_AVAILABLE +// #undef CRYPTOPP_SSE42_AVAILABLE +// #undef CRYPTOPP_ARM_NEON_AVAILABLE // Apple Clang 6.0/Clang 3.5 does not have SSSE3 intrinsics // http://llvm.org/bugs/show_bug.cgi?id=20213 #if (defined(CRYPTOPP_APPLE_CLANG_VERSION) && (CRYPTOPP_APPLE_CLANG_VERSION <= 60000)) || (defined(CRYPTOPP_LLVM_CLANG_VERSION) && (CRYPTOPP_LLVM_CLANG_VERSION <= 30500)) -# undef CRYPTOPP_BOOL_SSE4_INTRINSICS_AVAILABLE +# undef CRYPTOPP_SSE42_AVAILABLE #endif -// Sun Studio 12.3 and earlier lack SSE2's _mm_set_epi64x. Win32 lacks _mm_set_epi64x, Win64 supplies it except for VS2008. -// Also see http://stackoverflow.com/a/38547909/608639 -#if CRYPTOPP_BOOL_SSE2_INTRINSICS_AVAILABLE && ((__SUNPRO_CC >= 0x5100 && __SUNPRO_CC < 0x5130) || (defined(_MSC_VER) && _MSC_VER < 1600) || (defined(_M_IX86) && _MSC_VER >= 1600)) -inline __m128i MM_SET_EPI64X(const word64 a, const word64 b) -{ - const word64 t[2] = {b,a}; __m128i r; - memcpy(&r, &t, sizeof(t)); - return r; -} -#else -# define MM_SET_EPI64X(a, b) _mm_set_epi64x(a, b) -#endif - -// Clang casts -#define M128I_CAST(x) ((__m128i *)(void *)(x)) -#define CONST_M128I_CAST(x) ((const __m128i *)(const void *)(x)) - -// C/C++ implementation -static void BLAKE2_CXX_Compress32(const byte* input, BLAKE2_State& state); -static void BLAKE2_CXX_Compress64(const byte* input, BLAKE2_State& state); +void BLAKE2_Compress32_CXX(const byte* input, BLAKE2_State& state); +void BLAKE2_Compress64_CXX(const byte* input, BLAKE2_State& state); -// Also see http://github.com/weidai11/cryptopp/issues/247 for SunCC 5.12 -#if CRYPTOPP_BOOL_SSE2_INTRINSICS_AVAILABLE -static void BLAKE2_SSE2_Compress32(const byte* input, BLAKE2_State& state); -# if (__SUNPRO_CC != 0x5120) -static void BLAKE2_SSE2_Compress64(const byte* input, BLAKE2_State& state); -# endif -#endif - -#if CRYPTOPP_BOOL_SSE4_INTRINSICS_AVAILABLE -static void BLAKE2_SSE4_Compress32(const byte* input, BLAKE2_State& state); -static void BLAKE2_SSE4_Compress64(const byte* input, BLAKE2_State& state); +#if CRYPTOPP_SSE42_AVAILABLE +extern void BLAKE2_Compress32_SSE4(const byte* input, BLAKE2_State& state); +extern void BLAKE2_Compress64_SSE4(const byte* input, BLAKE2_State& state); #endif // Disable NEON for Cortex-A53 and A57. 
Also see http://github.com/weidai11/cryptopp/issues/367 -#if CRYPTOPP_BOOL_ARM32 && CRYPTOPP_BOOL_NEON_INTRINSICS_AVAILABLE -static void BLAKE2_NEON_Compress32(const byte* input, BLAKE2_State& state); -static void BLAKE2_NEON_Compress64(const byte* input, BLAKE2_State& state); +#if CRYPTOPP_BOOL_ARM32 && CRYPTOPP_ARM_NEON_AVAILABLE +extern void BLAKE2_Compress32_NEON(const byte* input, BLAKE2_State& state); +extern void BLAKE2_Compress64_NEON(const byte* input, BLAKE2_State& state); #endif -#ifndef CRYPTOPP_DOXYGEN_PROCESSING - -// IV and Sigma are a better fit as part of BLAKE2_Base, but that places -// the constants out of reach for the NEON, SSE2 and SSE4 implementations. -template -struct CRYPTOPP_NO_VTABLE BLAKE2_IV {}; - -//! \brief BLAKE2s initialization vector specialization -template<> -struct CRYPTOPP_NO_VTABLE BLAKE2_IV -{ - CRYPTOPP_CONSTANT(IVSIZE = 8) - // Always align for NEON and SSE - CRYPTOPP_ALIGN_DATA(16) static const word32 iv[8]; -}; +ANONYMOUS_NAMESPACE_BEGIN CRYPTOPP_ALIGN_DATA(16) -const word32 BLAKE2_IV::iv[8] = { +const word32 BLAKE2S_IV[8] = { 0x6A09E667UL, 0xBB67AE85UL, 0x3C6EF372UL, 0xA54FF53AUL, 0x510E527FUL, 0x9B05688CUL, 0x1F83D9ABUL, 0x5BE0CD19UL }; -#define BLAKE2S_IV(n) BLAKE2_IV::iv[n] - -template<> -struct CRYPTOPP_NO_VTABLE BLAKE2_IV -{ - CRYPTOPP_CONSTANT(IVSIZE = 8) - // Always align for NEON and SSE - CRYPTOPP_ALIGN_DATA(16) static const word64 iv[8]; -}; - CRYPTOPP_ALIGN_DATA(16) -const word64 BLAKE2_IV::iv[8] = { +const word64 BLAKE2B_IV[8] = { W64LIT(0x6a09e667f3bcc908), W64LIT(0xbb67ae8584caa73b), W64LIT(0x3c6ef372fe94f82b), W64LIT(0xa54ff53a5f1d36f1), W64LIT(0x510e527fade682d1), W64LIT(0x9b05688c2b3e6c1f), W64LIT(0x1f83d9abfb41bd6b), W64LIT(0x5be0cd19137e2179) }; -#define BLAKE2B_IV(n) BLAKE2_IV::iv[n] - -// IV and Sigma are a better fit as part of BLAKE2_Base, but that places -// the constants out of reach for the NEON, SSE2 and SSE4 implementations. -template -struct CRYPTOPP_NO_VTABLE BLAKE2_Sigma {}; - -template<> -struct CRYPTOPP_NO_VTABLE BLAKE2_Sigma -{ - // Always align for NEON and SSE - CRYPTOPP_ALIGN_DATA(16) static const byte sigma[10][16]; -}; - CRYPTOPP_ALIGN_DATA(16) -const byte BLAKE2_Sigma::sigma[10][16] = { +const byte BLAKE2S_SIGMA[10][16] = { { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 }, { 14, 10, 4, 8, 9, 15, 13, 6, 1, 12, 0, 2, 11, 7, 5, 3 }, { 11, 8, 12, 0, 5, 2, 15, 13, 10, 14, 3, 6, 7, 1, 9, 4 }, @@ -130,16 +66,8 @@ const byte BLAKE2_Sigma::sigma[10][16] = { { 10, 2, 8, 4, 7, 6, 1, 5, 15, 11, 9, 14, 3, 12, 13 , 0 }, }; -//! 
\brief BLAKE2b sigma table specialization -template<> -struct CRYPTOPP_NO_VTABLE BLAKE2_Sigma -{ - // Always align for NEON and SSE - CRYPTOPP_ALIGN_DATA(16) static const byte sigma[12][16]; -}; - CRYPTOPP_ALIGN_DATA(16) -const byte BLAKE2_Sigma::sigma[12][16] = { +const byte BLAKE2B_SIGMA[12][16] = { { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 }, { 14, 10, 4, 8, 9, 15, 13, 6, 1, 12, 0, 2, 11, 7, 5, 3 }, { 11, 8, 12, 0, 5, 2, 15, 13, 10, 14, 3, 6, 7, 1, 9, 4 }, @@ -159,47 +87,35 @@ typedef void (*pfnCompress64)(const byte*, BLAKE2_State&); pfnCompress64 InitializeCompress64Fn() { -#if CRYPTOPP_BOOL_SSE4_INTRINSICS_AVAILABLE - if (HasSSE4()) - return &BLAKE2_SSE4_Compress64; +#if CRYPTOPP_SSE42_AVAILABLE + if (HasSSE42()) + return &BLAKE2_Compress64_SSE4; else #endif -#if CRYPTOPP_BOOL_SSE2_INTRINSICS_AVAILABLE -# if (__SUNPRO_CC != 0x5120) - if (HasSSE2()) - return &BLAKE2_SSE2_Compress64; - else -# endif -#endif -#if CRYPTOPP_BOOL_ARM32 && CRYPTOPP_BOOL_NEON_INTRINSICS_AVAILABLE +#if CRYPTOPP_BOOL_ARM32 && CRYPTOPP_ARM_NEON_AVAILABLE if (HasNEON()) - return &BLAKE2_NEON_Compress64; + return &BLAKE2_Compress64_NEON; else #endif - return &BLAKE2_CXX_Compress64; + return &BLAKE2_Compress64_CXX; } pfnCompress32 InitializeCompress32Fn() { -#if CRYPTOPP_BOOL_SSE4_INTRINSICS_AVAILABLE - if (HasSSE4()) - return &BLAKE2_SSE4_Compress32; - else -#endif -#if CRYPTOPP_BOOL_SSE2_INTRINSICS_AVAILABLE - if (HasSSE2()) - return &BLAKE2_SSE2_Compress32; +#if CRYPTOPP_SSE42_AVAILABLE + if (HasSSE42()) + return &BLAKE2_Compress32_SSE4; else #endif -#if CRYPTOPP_BOOL_ARM32 && CRYPTOPP_BOOL_NEON_INTRINSICS_AVAILABLE +#if CRYPTOPP_BOOL_ARM32 && CRYPTOPP_ARM_NEON_AVAILABLE if (HasNEON()) - return &BLAKE2_NEON_Compress32; + return &BLAKE2_Compress32_NEON; else #endif - return &BLAKE2_CXX_Compress32; + return &BLAKE2_Compress32_CXX; } -#endif // CRYPTOPP_DOXYGEN_PROCESSING +ANONYMOUS_NAMESPACE_END BLAKE2_ParameterBlock::BLAKE2_ParameterBlock(size_t digestLen, size_t keyLen, const byte* saltStr, size_t saltLen, @@ -402,9 +318,9 @@ void BLAKE2_Base::Restart(const BLAKE2_ParameterBlock& bloc state.t[1] = counter[1]; } + const W* IV = T_64bit ? reinterpret_cast(BLAKE2B_IV) : reinterpret_cast(BLAKE2S_IV); PutBlock put(m_block.data(), &state.h[0]); - put(BLAKE2_IV::iv[0])(BLAKE2_IV::iv[1])(BLAKE2_IV::iv[2])(BLAKE2_IV::iv[3]); - put(BLAKE2_IV::iv[4])(BLAKE2_IV::iv[5])(BLAKE2_IV::iv[6])(BLAKE2_IV::iv[7]); + put(IV[0])(IV[1])(IV[2])(IV[3])(IV[4])(IV[5])(IV[6])(IV[7]); // When BLAKE2 is keyed, the input stream is simply {key||message}. Key it // during Restart to avoid FirstPut and friends. Key size == 0 means no key. 
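The runtime selection performed by InitializeCompress32Fn/InitializeCompress64Fn above is transparent to callers: whichever of the SSE4, NEON, or C++ compression routines is chosen, the public HashTransformation interface is unchanged. A minimal usage sketch under that assumption, using the library's existing BLAKE2s, HexEncoder and FileSink classes (none of which are touched by this patch):

#include "blake2.h"
#include "hex.h"
#include "files.h"

#include <string>
#include <iostream>

int main()
{
    using namespace CryptoPP;

    // The SSE4/NEON/C++ compression routine is selected at runtime by
    // InitializeCompress32Fn(); callers only see the normal hash interface.
    BLAKE2s hash;

    const std::string msg = "abc";
    hash.Update(reinterpret_cast<const byte*>(msg.data()), msg.size());

    byte digest[BLAKE2s::DIGESTSIZE];
    hash.Final(digest);

    // Print the digest as hex.
    HexEncoder encoder(new FileSink(std::cout));
    encoder.Put(digest, sizeof(digest));
    encoder.MessageEnd();
    std::cout << std::endl;

    return 0;
}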
@@ -495,18 +411,18 @@ void BLAKE2_Base::Compress(const byte *input) s_pfn(input, *m_state.data()); } -void BLAKE2_CXX_Compress64(const byte* input, BLAKE2_State& state) +void BLAKE2_Compress64_CXX(const byte* input, BLAKE2_State& state) { #undef BLAKE2_G #undef BLAKE2_ROUND #define BLAKE2_G(r,i,a,b,c,d) \ do { \ - a = a + b + m[BLAKE2_Sigma::sigma[r][2*i+0]]; \ + a = a + b + m[BLAKE2B_SIGMA[r][2*i+0]]; \ d = rotrVariable(d ^ a, 32); \ c = c + d; \ b = rotrVariable(b ^ c, 24); \ - a = a + b + m[BLAKE2_Sigma::sigma[r][2*i+1]]; \ + a = a + b + m[BLAKE2B_SIGMA[r][2*i+1]]; \ d = rotrVariable(d ^ a, 16); \ c = c + d; \ b = rotrVariable(b ^ c, 63); \ @@ -532,14 +448,14 @@ void BLAKE2_CXX_Compress64(const byte* input, BLAKE2_State& state) GetBlock get2(&state.h[0]); get2(v[0])(v[1])(v[2])(v[3])(v[4])(v[5])(v[6])(v[7]); - v[ 8] = BLAKE2B_IV(0); - v[ 9] = BLAKE2B_IV(1); - v[10] = BLAKE2B_IV(2); - v[11] = BLAKE2B_IV(3); - v[12] = state.t[0] ^ BLAKE2B_IV(4); - v[13] = state.t[1] ^ BLAKE2B_IV(5); - v[14] = state.f[0] ^ BLAKE2B_IV(6); - v[15] = state.f[1] ^ BLAKE2B_IV(7); + v[ 8] = BLAKE2B_IV[0]; + v[ 9] = BLAKE2B_IV[1]; + v[10] = BLAKE2B_IV[2]; + v[11] = BLAKE2B_IV[3]; + v[12] = state.t[0] ^ BLAKE2B_IV[4]; + v[13] = state.t[1] ^ BLAKE2B_IV[5]; + v[14] = state.f[0] ^ BLAKE2B_IV[6]; + v[15] = state.f[1] ^ BLAKE2B_IV[7]; BLAKE2_ROUND(0); BLAKE2_ROUND(1); @@ -558,18 +474,18 @@ void BLAKE2_CXX_Compress64(const byte* input, BLAKE2_State& state) state.h[i] = state.h[i] ^ ConditionalByteReverse(LittleEndian::ToEnum(), v[i] ^ v[i + 8]); } -void BLAKE2_CXX_Compress32(const byte* input, BLAKE2_State& state) +void BLAKE2_Compress32_CXX(const byte* input, BLAKE2_State& state) { #undef BLAKE2_G #undef BLAKE2_ROUND #define BLAKE2_G(r,i,a,b,c,d) \ do { \ - a = a + b + m[BLAKE2_Sigma::sigma[r][2*i+0]]; \ + a = a + b + m[BLAKE2S_SIGMA[r][2*i+0]]; \ d = rotrVariable(d ^ a, 16); \ c = c + d; \ b = rotrVariable(b ^ c, 12); \ - a = a + b + m[BLAKE2_Sigma::sigma[r][2*i+1]]; \ + a = a + b + m[BLAKE2S_SIGMA[r][2*i+1]]; \ d = rotrVariable(d ^ a, 8); \ c = c + d; \ b = rotrVariable(b ^ c, 7); \ @@ -595,14 +511,14 @@ void BLAKE2_CXX_Compress32(const byte* input, BLAKE2_State& state GetBlock get2(&state.h[0]); get2(v[0])(v[1])(v[2])(v[3])(v[4])(v[5])(v[6])(v[7]); - v[ 8] = BLAKE2S_IV(0); - v[ 9] = BLAKE2S_IV(1); - v[10] = BLAKE2S_IV(2); - v[11] = BLAKE2S_IV(3); - v[12] = state.t[0] ^ BLAKE2S_IV(4); - v[13] = state.t[1] ^ BLAKE2S_IV(5); - v[14] = state.f[0] ^ BLAKE2S_IV(6); - v[15] = state.f[1] ^ BLAKE2S_IV(7); + v[ 8] = BLAKE2S_IV[0]; + v[ 9] = BLAKE2S_IV[1]; + v[10] = BLAKE2S_IV[2]; + v[11] = BLAKE2S_IV[3]; + v[12] = state.t[0] ^ BLAKE2S_IV[4]; + v[13] = state.t[1] ^ BLAKE2S_IV[5]; + v[14] = state.f[0] ^ BLAKE2S_IV[6]; + v[15] = state.f[1] ^ BLAKE2S_IV[7]; BLAKE2_ROUND(0); BLAKE2_ROUND(1); @@ -619,3433 +535,6 @@ void BLAKE2_CXX_Compress32(const byte* input, BLAKE2_State& state state.h[i] = state.h[i] ^ ConditionalByteReverse(LittleEndian::ToEnum(), v[i] ^ v[i + 8]); } -#if CRYPTOPP_BOOL_SSE2_INTRINSICS_AVAILABLE -static void BLAKE2_SSE2_Compress32(const byte* input, BLAKE2_State& state) -{ - word32 m0, m1, m2, m3, m4, m5, m6, m7, m8, m9, m10, m11, m12, m13, m14, m15; - GetBlock get(input); - get(m0)(m1)(m2)(m3)(m4)(m5)(m6)(m7)(m8)(m9)(m10)(m11)(m12)(m13)(m14)(m15); - - __m128i row1,row2,row3,row4; - __m128i buf1,buf2,buf3,buf4; - __m128i ff0,ff1; - - row1 = ff0 = _mm_loadu_si128(CONST_M128I_CAST(&state.h[0])); - row2 = ff1 = _mm_loadu_si128(CONST_M128I_CAST(&state.h[4])); - row3 = 
_mm_setr_epi32(BLAKE2S_IV(0),BLAKE2S_IV(1),BLAKE2S_IV(2),BLAKE2S_IV(3)); - row4 = _mm_xor_si128(_mm_setr_epi32(BLAKE2S_IV(4),BLAKE2S_IV(5),BLAKE2S_IV(6),BLAKE2S_IV(7)),_mm_loadu_si128(CONST_M128I_CAST(&state.t[0]))); - buf1 = _mm_set_epi32(m6,m4,m2,m0); - row1 = _mm_add_epi32(_mm_add_epi32(row1,buf1),row2); - row4 = _mm_xor_si128(row4,row1); - row4 = _mm_xor_si128(_mm_srli_epi32(row4,16),_mm_slli_epi32(row4,16)); - row3 = _mm_add_epi32(row3,row4); - row2 = _mm_xor_si128(row2,row3); - row2 = _mm_xor_si128(_mm_srli_epi32(row2,12),_mm_slli_epi32(row2,20)); - - buf2 = _mm_set_epi32(m7,m5,m3,m1); - row1 = _mm_add_epi32(_mm_add_epi32(row1,buf2),row2); - row4 = _mm_xor_si128(row4,row1); - row4 = _mm_xor_si128(_mm_srli_epi32(row4,8),_mm_slli_epi32(row4,24)); - row3 = _mm_add_epi32(row3,row4); - row2 = _mm_xor_si128(row2,row3); - row2 = _mm_xor_si128(_mm_srli_epi32(row2,7),_mm_slli_epi32(row2,25)); - - row4 = _mm_shuffle_epi32(row4,_MM_SHUFFLE(2,1,0,3)); - row3 = _mm_shuffle_epi32(row3,_MM_SHUFFLE(1,0,3,2)); - row2 = _mm_shuffle_epi32(row2,_MM_SHUFFLE(0,3,2,1)); - - buf3 = _mm_set_epi32(m14,m12,m10,m8); - row1 = _mm_add_epi32(_mm_add_epi32(row1,buf3),row2); - row4 = _mm_xor_si128(row4,row1); - row4 = _mm_xor_si128(_mm_srli_epi32(row4,16),_mm_slli_epi32(row4,16)); - row3 = _mm_add_epi32(row3,row4); - row2 = _mm_xor_si128(row2,row3); - row2 = _mm_xor_si128(_mm_srli_epi32(row2,12),_mm_slli_epi32(row2,20)); - - buf4 = _mm_set_epi32(m15,m13,m11,m9); - row1 = _mm_add_epi32(_mm_add_epi32(row1,buf4),row2); - row4 = _mm_xor_si128(row4,row1); - row4 = _mm_xor_si128(_mm_srli_epi32(row4,8),_mm_slli_epi32(row4,24)); - row3 = _mm_add_epi32(row3,row4); - row2 = _mm_xor_si128(row2,row3); - row2 = _mm_xor_si128(_mm_srli_epi32(row2,7),_mm_slli_epi32(row2,25)); - - row4 = _mm_shuffle_epi32(row4,_MM_SHUFFLE(0,3,2,1)); - row3 = _mm_shuffle_epi32(row3,_MM_SHUFFLE(1,0,3,2)); - row2 = _mm_shuffle_epi32(row2,_MM_SHUFFLE(2,1,0,3)); - - buf1 = _mm_set_epi32(m13,m9,m4,m14); - row1 = _mm_add_epi32(_mm_add_epi32(row1,buf1),row2); - row4 = _mm_xor_si128(row4,row1); - row4 = _mm_xor_si128(_mm_srli_epi32(row4,16),_mm_slli_epi32(row4,16)); - row3 = _mm_add_epi32(row3,row4); - row2 = _mm_xor_si128(row2,row3); - row2 = _mm_xor_si128(_mm_srli_epi32(row2,12),_mm_slli_epi32(row2,20)); - - buf2 = _mm_set_epi32(m6,m15,m8,m10); - row1 = _mm_add_epi32(_mm_add_epi32(row1,buf2),row2); - row4 = _mm_xor_si128(row4,row1); - row4 = _mm_xor_si128(_mm_srli_epi32(row4,8),_mm_slli_epi32(row4,24)); - row3 = _mm_add_epi32(row3,row4); - row2 = _mm_xor_si128(row2,row3); - row2 = _mm_xor_si128(_mm_srli_epi32(row2,7),_mm_slli_epi32(row2,25)); - - row4 = _mm_shuffle_epi32(row4,_MM_SHUFFLE(2,1,0,3)); - row3 = _mm_shuffle_epi32(row3,_MM_SHUFFLE(1,0,3,2)); - row2 = _mm_shuffle_epi32(row2,_MM_SHUFFLE(0,3,2,1)); - - buf3 = _mm_set_epi32(m5,m11,m0,m1); - row1 = _mm_add_epi32(_mm_add_epi32(row1,buf3),row2); - row4 = _mm_xor_si128(row4,row1); - row4 = _mm_xor_si128(_mm_srli_epi32(row4,16),_mm_slli_epi32(row4,16)); - row3 = _mm_add_epi32(row3,row4); - row2 = _mm_xor_si128(row2,row3); - row2 = _mm_xor_si128(_mm_srli_epi32(row2,12),_mm_slli_epi32(row2,20)); - - buf4 = _mm_set_epi32(m3,m7,m2,m12); - row1 = _mm_add_epi32(_mm_add_epi32(row1,buf4),row2); - row4 = _mm_xor_si128(row4,row1); - row4 = _mm_xor_si128(_mm_srli_epi32(row4,8),_mm_slli_epi32(row4,24)); - row3 = _mm_add_epi32(row3,row4); - row2 = _mm_xor_si128(row2,row3); - row2 = _mm_xor_si128(_mm_srli_epi32(row2,7),_mm_slli_epi32(row2,25)); - - row4 = _mm_shuffle_epi32(row4,_MM_SHUFFLE(0,3,2,1)); - row3 = 
_mm_shuffle_epi32(row3,_MM_SHUFFLE(1,0,3,2)); - row2 = _mm_shuffle_epi32(row2,_MM_SHUFFLE(2,1,0,3)); - - buf1 = _mm_set_epi32(m15,m5,m12,m11); - row1 = _mm_add_epi32(_mm_add_epi32(row1,buf1),row2); - row4 = _mm_xor_si128(row4,row1); - row4 = _mm_xor_si128(_mm_srli_epi32(row4,16),_mm_slli_epi32(row4,16)); - row3 = _mm_add_epi32(row3,row4); - row2 = _mm_xor_si128(row2,row3); - row2 = _mm_xor_si128(_mm_srli_epi32(row2,12),_mm_slli_epi32(row2,20)); - - buf2 = _mm_set_epi32(m13,m2,m0,m8); - row1 = _mm_add_epi32(_mm_add_epi32(row1,buf2),row2); - row4 = _mm_xor_si128(row4,row1); - row4 = _mm_xor_si128(_mm_srli_epi32(row4,8),_mm_slli_epi32(row4,24)); - row3 = _mm_add_epi32(row3,row4); - row2 = _mm_xor_si128(row2,row3); - row2 = _mm_xor_si128(_mm_srli_epi32(row2,7),_mm_slli_epi32(row2,25)); - - row4 = _mm_shuffle_epi32(row4,_MM_SHUFFLE(2,1,0,3)); - row3 = _mm_shuffle_epi32(row3,_MM_SHUFFLE(1,0,3,2)); - row2 = _mm_shuffle_epi32(row2,_MM_SHUFFLE(0,3,2,1)); - - buf3 = _mm_set_epi32(m9,m7,m3,m10); - row1 = _mm_add_epi32(_mm_add_epi32(row1,buf3),row2); - row4 = _mm_xor_si128(row4,row1); - row4 = _mm_xor_si128(_mm_srli_epi32(row4,16),_mm_slli_epi32(row4,16)); - row3 = _mm_add_epi32(row3,row4); - row2 = _mm_xor_si128(row2,row3); - row2 = _mm_xor_si128(_mm_srli_epi32(row2,12),_mm_slli_epi32(row2,20)); - - buf4 = _mm_set_epi32(m4,m1,m6,m14); - row1 = _mm_add_epi32(_mm_add_epi32(row1,buf4),row2); - row4 = _mm_xor_si128(row4,row1); - row4 = _mm_xor_si128(_mm_srli_epi32(row4,8),_mm_slli_epi32(row4,24)); - row3 = _mm_add_epi32(row3,row4); - row2 = _mm_xor_si128(row2,row3); - row2 = _mm_xor_si128(_mm_srli_epi32(row2,7),_mm_slli_epi32(row2,25)); - - row4 = _mm_shuffle_epi32(row4,_MM_SHUFFLE(0,3,2,1)); - row3 = _mm_shuffle_epi32(row3,_MM_SHUFFLE(1,0,3,2)); - row2 = _mm_shuffle_epi32(row2,_MM_SHUFFLE(2,1,0,3)); - - buf1 = _mm_set_epi32(m11,m13,m3,m7); - row1 = _mm_add_epi32(_mm_add_epi32(row1,buf1),row2); - row4 = _mm_xor_si128(row4,row1); - row4 = _mm_xor_si128(_mm_srli_epi32(row4,16),_mm_slli_epi32(row4,16)); - row3 = _mm_add_epi32(row3,row4); - row2 = _mm_xor_si128(row2,row3); - row2 = _mm_xor_si128(_mm_srli_epi32(row2,12),_mm_slli_epi32(row2,20)); - - buf2 = _mm_set_epi32(m14,m12,m1,m9); - row1 = _mm_add_epi32(_mm_add_epi32(row1,buf2),row2); - row4 = _mm_xor_si128(row4,row1); - row4 = _mm_xor_si128(_mm_srli_epi32(row4,8),_mm_slli_epi32(row4,24)); - row3 = _mm_add_epi32(row3,row4); - row2 = _mm_xor_si128(row2,row3); - row2 = _mm_xor_si128(_mm_srli_epi32(row2,7),_mm_slli_epi32(row2,25)); - - row4 = _mm_shuffle_epi32(row4,_MM_SHUFFLE(2,1,0,3)); - row3 = _mm_shuffle_epi32(row3,_MM_SHUFFLE(1,0,3,2)); - row2 = _mm_shuffle_epi32(row2,_MM_SHUFFLE(0,3,2,1)); - - buf3 = _mm_set_epi32(m15,m4,m5,m2); - row1 = _mm_add_epi32(_mm_add_epi32(row1,buf3),row2); - row4 = _mm_xor_si128(row4,row1); - row4 = _mm_xor_si128(_mm_srli_epi32(row4,16),_mm_slli_epi32(row4,16)); - row3 = _mm_add_epi32(row3,row4); - row2 = _mm_xor_si128(row2,row3); - row2 = _mm_xor_si128(_mm_srli_epi32(row2,12),_mm_slli_epi32(row2,20)); - - buf4 = _mm_set_epi32(m8,m0,m10,m6); - row1 = _mm_add_epi32(_mm_add_epi32(row1,buf4),row2); - row4 = _mm_xor_si128(row4,row1); - row4 = _mm_xor_si128(_mm_srli_epi32(row4,8),_mm_slli_epi32(row4,24)); - row3 = _mm_add_epi32(row3,row4); - row2 = _mm_xor_si128(row2,row3); - row2 = _mm_xor_si128(_mm_srli_epi32(row2,7),_mm_slli_epi32(row2,25)); - - row4 = _mm_shuffle_epi32(row4,_MM_SHUFFLE(0,3,2,1)); - row3 = _mm_shuffle_epi32(row3,_MM_SHUFFLE(1,0,3,2)); - row2 = _mm_shuffle_epi32(row2,_MM_SHUFFLE(2,1,0,3)); - - buf1 = 
_mm_set_epi32(m10,m2,m5,m9); - row1 = _mm_add_epi32(_mm_add_epi32(row1,buf1),row2); - row4 = _mm_xor_si128(row4,row1); - row4 = _mm_xor_si128(_mm_srli_epi32(row4,16),_mm_slli_epi32(row4,16)); - row3 = _mm_add_epi32(row3,row4); - row2 = _mm_xor_si128(row2,row3); - row2 = _mm_xor_si128(_mm_srli_epi32(row2,12),_mm_slli_epi32(row2,20)); - - buf2 = _mm_set_epi32(m15,m4,m7,m0); - row1 = _mm_add_epi32(_mm_add_epi32(row1,buf2),row2); - row4 = _mm_xor_si128(row4,row1); - row4 = _mm_xor_si128(_mm_srli_epi32(row4,8),_mm_slli_epi32(row4,24)); - row3 = _mm_add_epi32(row3,row4); - row2 = _mm_xor_si128(row2,row3); - row2 = _mm_xor_si128(_mm_srli_epi32(row2,7),_mm_slli_epi32(row2,25)); - - row4 = _mm_shuffle_epi32(row4,_MM_SHUFFLE(2,1,0,3)); - row3 = _mm_shuffle_epi32(row3,_MM_SHUFFLE(1,0,3,2)); - row2 = _mm_shuffle_epi32(row2,_MM_SHUFFLE(0,3,2,1)); - - buf3 = _mm_set_epi32(m3,m6,m11,m14); - row1 = _mm_add_epi32(_mm_add_epi32(row1,buf3),row2); - row4 = _mm_xor_si128(row4,row1); - row4 = _mm_xor_si128(_mm_srli_epi32(row4,16),_mm_slli_epi32(row4,16)); - row3 = _mm_add_epi32(row3,row4); - row2 = _mm_xor_si128(row2,row3); - row2 = _mm_xor_si128(_mm_srli_epi32(row2,12),_mm_slli_epi32(row2,20)); - - buf4 = _mm_set_epi32(m13,m8,m12,m1); - row1 = _mm_add_epi32(_mm_add_epi32(row1,buf4),row2); - row4 = _mm_xor_si128(row4,row1); - row4 = _mm_xor_si128(_mm_srli_epi32(row4,8),_mm_slli_epi32(row4,24)); - row3 = _mm_add_epi32(row3,row4); - row2 = _mm_xor_si128(row2,row3); - row2 = _mm_xor_si128(_mm_srli_epi32(row2,7),_mm_slli_epi32(row2,25)); - - row4 = _mm_shuffle_epi32(row4,_MM_SHUFFLE(0,3,2,1)); - row3 = _mm_shuffle_epi32(row3,_MM_SHUFFLE(1,0,3,2)); - row2 = _mm_shuffle_epi32(row2,_MM_SHUFFLE(2,1,0,3)); - - buf1 = _mm_set_epi32(m8,m0,m6,m2); - row1 = _mm_add_epi32(_mm_add_epi32(row1,buf1),row2); - row4 = _mm_xor_si128(row4,row1); - row4 = _mm_xor_si128(_mm_srli_epi32(row4,16),_mm_slli_epi32(row4,16)); - row3 = _mm_add_epi32(row3,row4); - row2 = _mm_xor_si128(row2,row3); - row2 = _mm_xor_si128(_mm_srli_epi32(row2,12),_mm_slli_epi32(row2,20)); - - buf2 = _mm_set_epi32(m3,m11,m10,m12); - row1 = _mm_add_epi32(_mm_add_epi32(row1,buf2),row2); - row4 = _mm_xor_si128(row4,row1); - row4 = _mm_xor_si128(_mm_srli_epi32(row4,8),_mm_slli_epi32(row4,24)); - row3 = _mm_add_epi32(row3,row4); - row2 = _mm_xor_si128(row2,row3); - row2 = _mm_xor_si128(_mm_srli_epi32(row2,7),_mm_slli_epi32(row2,25)); - - row4 = _mm_shuffle_epi32(row4,_MM_SHUFFLE(2,1,0,3)); - row3 = _mm_shuffle_epi32(row3,_MM_SHUFFLE(1,0,3,2)); - row2 = _mm_shuffle_epi32(row2,_MM_SHUFFLE(0,3,2,1)); - - buf3 = _mm_set_epi32(m1,m15,m7,m4); - row1 = _mm_add_epi32(_mm_add_epi32(row1,buf3),row2); - row4 = _mm_xor_si128(row4,row1); - row4 = _mm_xor_si128(_mm_srli_epi32(row4,16),_mm_slli_epi32(row4,16)); - row3 = _mm_add_epi32(row3,row4); - row2 = _mm_xor_si128(row2,row3); - row2 = _mm_xor_si128(_mm_srli_epi32(row2,12),_mm_slli_epi32(row2,20)); - - buf4 = _mm_set_epi32(m9,m14,m5,m13); - row1 = _mm_add_epi32(_mm_add_epi32(row1,buf4),row2); - row4 = _mm_xor_si128(row4,row1); - row4 = _mm_xor_si128(_mm_srli_epi32(row4,8),_mm_slli_epi32(row4,24)); - row3 = _mm_add_epi32(row3,row4); - row2 = _mm_xor_si128(row2,row3); - row2 = _mm_xor_si128(_mm_srli_epi32(row2,7),_mm_slli_epi32(row2,25)); - - row4 = _mm_shuffle_epi32(row4,_MM_SHUFFLE(0,3,2,1)); - row3 = _mm_shuffle_epi32(row3,_MM_SHUFFLE(1,0,3,2)); - row2 = _mm_shuffle_epi32(row2,_MM_SHUFFLE(2,1,0,3)); - - buf1 = _mm_set_epi32(m4,m14,m1,m12); - row1 = _mm_add_epi32(_mm_add_epi32(row1,buf1),row2); - row4 = _mm_xor_si128(row4,row1); - 
row4 = _mm_xor_si128(_mm_srli_epi32(row4,16),_mm_slli_epi32(row4,16)); - row3 = _mm_add_epi32(row3,row4); - row2 = _mm_xor_si128(row2,row3); - row2 = _mm_xor_si128(_mm_srli_epi32(row2,12),_mm_slli_epi32(row2,20)); - - buf2 = _mm_set_epi32(m10,m13,m15,m5); - row1 = _mm_add_epi32(_mm_add_epi32(row1,buf2),row2); - row4 = _mm_xor_si128(row4,row1); - row4 = _mm_xor_si128(_mm_srli_epi32(row4,8),_mm_slli_epi32(row4,24)); - row3 = _mm_add_epi32(row3,row4); - row2 = _mm_xor_si128(row2,row3); - row2 = _mm_xor_si128(_mm_srli_epi32(row2,7),_mm_slli_epi32(row2,25)); - - row4 = _mm_shuffle_epi32(row4,_MM_SHUFFLE(2,1,0,3)); - row3 = _mm_shuffle_epi32(row3,_MM_SHUFFLE(1,0,3,2)); - row2 = _mm_shuffle_epi32(row2,_MM_SHUFFLE(0,3,2,1)); - - buf3 = _mm_set_epi32(m8,m9,m6,m0); - row1 = _mm_add_epi32(_mm_add_epi32(row1,buf3),row2); - row4 = _mm_xor_si128(row4,row1); - row4 = _mm_xor_si128(_mm_srli_epi32(row4,16),_mm_slli_epi32(row4,16)); - row3 = _mm_add_epi32(row3,row4); - row2 = _mm_xor_si128(row2,row3); - row2 = _mm_xor_si128(_mm_srli_epi32(row2,12),_mm_slli_epi32(row2,20)); - - buf4 = _mm_set_epi32(m11,m2,m3,m7); - row1 = _mm_add_epi32(_mm_add_epi32(row1,buf4),row2); - row4 = _mm_xor_si128(row4,row1); - row4 = _mm_xor_si128(_mm_srli_epi32(row4,8),_mm_slli_epi32(row4,24)); - row3 = _mm_add_epi32(row3,row4); - row2 = _mm_xor_si128(row2,row3); - row2 = _mm_xor_si128(_mm_srli_epi32(row2,7),_mm_slli_epi32(row2,25)); - - row4 = _mm_shuffle_epi32(row4,_MM_SHUFFLE(0,3,2,1)); - row3 = _mm_shuffle_epi32(row3,_MM_SHUFFLE(1,0,3,2)); - row2 = _mm_shuffle_epi32(row2,_MM_SHUFFLE(2,1,0,3)); - - buf1 = _mm_set_epi32(m3,m12,m7,m13); - row1 = _mm_add_epi32(_mm_add_epi32(row1,buf1),row2); - row4 = _mm_xor_si128(row4,row1); - row4 = _mm_xor_si128(_mm_srli_epi32(row4,16),_mm_slli_epi32(row4,16)); - row3 = _mm_add_epi32(row3,row4); - row2 = _mm_xor_si128(row2,row3); - row2 = _mm_xor_si128(_mm_srli_epi32(row2,12),_mm_slli_epi32(row2,20)); - - buf2 = _mm_set_epi32(m9,m1,m14,m11); - row1 = _mm_add_epi32(_mm_add_epi32(row1,buf2),row2); - row4 = _mm_xor_si128(row4,row1); - row4 = _mm_xor_si128(_mm_srli_epi32(row4,8),_mm_slli_epi32(row4,24)); - row3 = _mm_add_epi32(row3,row4); - row2 = _mm_xor_si128(row2,row3); - row2 = _mm_xor_si128(_mm_srli_epi32(row2,7),_mm_slli_epi32(row2,25)); - - row4 = _mm_shuffle_epi32(row4,_MM_SHUFFLE(2,1,0,3)); - row3 = _mm_shuffle_epi32(row3,_MM_SHUFFLE(1,0,3,2)); - row2 = _mm_shuffle_epi32(row2,_MM_SHUFFLE(0,3,2,1)); - - buf3 = _mm_set_epi32(m2,m8,m15,m5); - row1 = _mm_add_epi32(_mm_add_epi32(row1,buf3),row2); - row4 = _mm_xor_si128(row4,row1); - row4 = _mm_xor_si128(_mm_srli_epi32(row4,16),_mm_slli_epi32(row4,16)); - row3 = _mm_add_epi32(row3,row4); - row2 = _mm_xor_si128(row2,row3); - row2 = _mm_xor_si128(_mm_srli_epi32(row2,12),_mm_slli_epi32(row2,20)); - - buf4 = _mm_set_epi32(m10,m6,m4,m0); - row1 = _mm_add_epi32(_mm_add_epi32(row1,buf4),row2); - row4 = _mm_xor_si128(row4,row1); - row4 = _mm_xor_si128(_mm_srli_epi32(row4,8),_mm_slli_epi32(row4,24)); - row3 = _mm_add_epi32(row3,row4); - row2 = _mm_xor_si128(row2,row3); - row2 = _mm_xor_si128(_mm_srli_epi32(row2,7),_mm_slli_epi32(row2,25)); - - row4 = _mm_shuffle_epi32(row4,_MM_SHUFFLE(0,3,2,1)); - row3 = _mm_shuffle_epi32(row3,_MM_SHUFFLE(1,0,3,2)); - row2 = _mm_shuffle_epi32(row2,_MM_SHUFFLE(2,1,0,3)); - - buf1 = _mm_set_epi32(m0,m11,m14,m6); - row1 = _mm_add_epi32(_mm_add_epi32(row1,buf1),row2); - row4 = _mm_xor_si128(row4,row1); - row4 = _mm_xor_si128(_mm_srli_epi32(row4,16),_mm_slli_epi32(row4,16)); - row3 = _mm_add_epi32(row3,row4); - row2 = 
_mm_xor_si128(row2,row3); - row2 = _mm_xor_si128(_mm_srli_epi32(row2,12),_mm_slli_epi32(row2,20)); - - buf2 = _mm_set_epi32(m8,m3,m9,m15); - row1 = _mm_add_epi32(_mm_add_epi32(row1,buf2),row2); - row4 = _mm_xor_si128(row4,row1); - row4 = _mm_xor_si128(_mm_srli_epi32(row4,8),_mm_slli_epi32(row4,24)); - row3 = _mm_add_epi32(row3,row4); - row2 = _mm_xor_si128(row2,row3); - row2 = _mm_xor_si128(_mm_srli_epi32(row2,7),_mm_slli_epi32(row2,25)); - - row4 = _mm_shuffle_epi32(row4,_MM_SHUFFLE(2,1,0,3)); - row3 = _mm_shuffle_epi32(row3,_MM_SHUFFLE(1,0,3,2)); - row2 = _mm_shuffle_epi32(row2,_MM_SHUFFLE(0,3,2,1)); - - buf3 = _mm_set_epi32(m10,m1,m13,m12); - row1 = _mm_add_epi32(_mm_add_epi32(row1,buf3),row2); - row4 = _mm_xor_si128(row4,row1); - row4 = _mm_xor_si128(_mm_srli_epi32(row4,16),_mm_slli_epi32(row4,16)); - row3 = _mm_add_epi32(row3,row4); - row2 = _mm_xor_si128(row2,row3); - row2 = _mm_xor_si128(_mm_srli_epi32(row2,12),_mm_slli_epi32(row2,20)); - - buf4 = _mm_set_epi32(m5,m4,m7,m2); - row1 = _mm_add_epi32(_mm_add_epi32(row1,buf4),row2); - row4 = _mm_xor_si128(row4,row1); - row4 = _mm_xor_si128(_mm_srli_epi32(row4,8),_mm_slli_epi32(row4,24)); - row3 = _mm_add_epi32(row3,row4); - row2 = _mm_xor_si128(row2,row3); - row2 = _mm_xor_si128(_mm_srli_epi32(row2,7),_mm_slli_epi32(row2,25)); - - row4 = _mm_shuffle_epi32(row4,_MM_SHUFFLE(0,3,2,1)); - row3 = _mm_shuffle_epi32(row3,_MM_SHUFFLE(1,0,3,2)); - row2 = _mm_shuffle_epi32(row2,_MM_SHUFFLE(2,1,0,3)); - - buf1 = _mm_set_epi32(m1,m7,m8,m10); - row1 = _mm_add_epi32(_mm_add_epi32(row1,buf1),row2); - row4 = _mm_xor_si128(row4,row1); - row4 = _mm_xor_si128(_mm_srli_epi32(row4,16),_mm_slli_epi32(row4,16)); - row3 = _mm_add_epi32(row3,row4); - row2 = _mm_xor_si128(row2,row3); - row2 = _mm_xor_si128(_mm_srli_epi32(row2,12),_mm_slli_epi32(row2,20)); - - buf2 = _mm_set_epi32(m5,m6,m4,m2); - row1 = _mm_add_epi32(_mm_add_epi32(row1,buf2),row2); - row4 = _mm_xor_si128(row4,row1); - row4 = _mm_xor_si128(_mm_srli_epi32(row4,8),_mm_slli_epi32(row4,24)); - row3 = _mm_add_epi32(row3,row4); - row2 = _mm_xor_si128(row2,row3); - row2 = _mm_xor_si128(_mm_srli_epi32(row2,7),_mm_slli_epi32(row2,25)); - - row4 = _mm_shuffle_epi32(row4,_MM_SHUFFLE(2,1,0,3)); - row3 = _mm_shuffle_epi32(row3,_MM_SHUFFLE(1,0,3,2)); - row2 = _mm_shuffle_epi32(row2,_MM_SHUFFLE(0,3,2,1)); - - buf3 = _mm_set_epi32(m13,m3,m9,m15); - row1 = _mm_add_epi32(_mm_add_epi32(row1,buf3),row2); - row4 = _mm_xor_si128(row4,row1); - row4 = _mm_xor_si128(_mm_srli_epi32(row4,16),_mm_slli_epi32(row4,16)); - row3 = _mm_add_epi32(row3,row4); - row2 = _mm_xor_si128(row2,row3); - row2 = _mm_xor_si128(_mm_srli_epi32(row2,12),_mm_slli_epi32(row2,20)); - - buf4 = _mm_set_epi32(m0,m12,m14,m11); - row1 = _mm_add_epi32(_mm_add_epi32(row1,buf4),row2); - row4 = _mm_xor_si128(row4,row1); - row4 = _mm_xor_si128(_mm_srli_epi32(row4,8),_mm_slli_epi32(row4,24)); - row3 = _mm_add_epi32(row3,row4); - row2 = _mm_xor_si128(row2,row3); - row2 = _mm_xor_si128(_mm_srli_epi32(row2,7),_mm_slli_epi32(row2,25)); - - row4 = _mm_shuffle_epi32(row4,_MM_SHUFFLE(0,3,2,1)); - row3 = _mm_shuffle_epi32(row3,_MM_SHUFFLE(1,0,3,2)); - row2 = _mm_shuffle_epi32(row2,_MM_SHUFFLE(2,1,0,3)); - - _mm_storeu_si128(M128I_CAST(&state.h[0]),_mm_xor_si128(ff0,_mm_xor_si128(row1,row3))); - _mm_storeu_si128(M128I_CAST(&state.h[4]),_mm_xor_si128(ff1,_mm_xor_si128(row2,row4))); -} - -# if (__SUNPRO_CC != 0x5120) -static void BLAKE2_SSE2_Compress64(const byte* input, BLAKE2_State& state) -{ - word64 m0, m1, m2, m3, m4, m5, m6, m7, m8, m9, m10, m11, m12, m13, m14, 
m15; - GetBlock get(input); - get(m0)(m1)(m2)(m3)(m4)(m5)(m6)(m7)(m8)(m9)(m10)(m11)(m12)(m13)(m14)(m15); - - __m128i row1l, row1h, row2l, row2h; - __m128i row3l, row3h, row4l, row4h; - __m128i b0, b1, t0, t1; - - row1l = _mm_loadu_si128(CONST_M128I_CAST(&state.h[0])); - row1h = _mm_loadu_si128(CONST_M128I_CAST(&state.h[2])); - row2l = _mm_loadu_si128(CONST_M128I_CAST(&state.h[4])); - row2h = _mm_loadu_si128(CONST_M128I_CAST(&state.h[6])); - row3l = _mm_loadu_si128(CONST_M128I_CAST(&BLAKE2B_IV(0))); - row3h = _mm_loadu_si128(CONST_M128I_CAST(&BLAKE2B_IV(2))); - row4l = _mm_xor_si128(_mm_loadu_si128(CONST_M128I_CAST(&BLAKE2B_IV(4))), _mm_loadu_si128(CONST_M128I_CAST(&state.t[0]))); - row4h = _mm_xor_si128(_mm_loadu_si128(CONST_M128I_CAST(&BLAKE2B_IV(6))), _mm_loadu_si128(CONST_M128I_CAST(&state.f[0]))); - - b0 = MM_SET_EPI64X(m2, m0); - b1 = MM_SET_EPI64X(m6, m4); - row1l = _mm_add_epi64(_mm_add_epi64(row1l, b0), row2l); - row1h = _mm_add_epi64(_mm_add_epi64(row1h, b1), row2h); - row4l = _mm_xor_si128(row4l, row1l); - row4h = _mm_xor_si128(row4h, row1h); - row4l = _mm_xor_si128(_mm_srli_epi64(row4l,32),_mm_slli_epi64(row4l,32)); - row4h = _mm_xor_si128(_mm_srli_epi64(row4h,32),_mm_slli_epi64(row4h,32)); - row3l = _mm_add_epi64(row3l, row4l); - row3h = _mm_add_epi64(row3h, row4h); - row2l = _mm_xor_si128(row2l, row3l); - row2h = _mm_xor_si128(row2h, row3h); - row2l = _mm_xor_si128(_mm_srli_epi64(row2l,24),_mm_slli_epi64(row2l, 40)); - row2h = _mm_xor_si128(_mm_srli_epi64(row2h,24),_mm_slli_epi64(row2h, 40)); - - b0 = MM_SET_EPI64X(m3, m1); - b1 = MM_SET_EPI64X(m7, m5); - row1l = _mm_add_epi64(_mm_add_epi64(row1l, b0), row2l); - row1h = _mm_add_epi64(_mm_add_epi64(row1h, b1), row2h); - row4l = _mm_xor_si128(row4l, row1l); - row4h = _mm_xor_si128(row4h, row1h); - row4l = _mm_xor_si128(_mm_srli_epi64(row4l,16),_mm_slli_epi64(row4l,48)); - row4h = _mm_xor_si128(_mm_srli_epi64(row4h,16),_mm_slli_epi64(row4h,48)); - row3l = _mm_add_epi64(row3l, row4l); - row3h = _mm_add_epi64(row3h, row4h); - row2l = _mm_xor_si128(row2l, row3l); - row2h = _mm_xor_si128(row2h, row3h); - row2l = _mm_xor_si128(_mm_srli_epi64(row2l,63),_mm_slli_epi64(row2l,1)); - row2h = _mm_xor_si128(_mm_srli_epi64(row2h,63),_mm_slli_epi64(row2h,1)); - - t0 = row4l, t1 = row2l, row4l = row3l, row3l = row3h, row3h = row4l; - row4l = _mm_unpackhi_epi64(row4h, _mm_unpacklo_epi64(t0, t0)); - row4h = _mm_unpackhi_epi64(t0, _mm_unpacklo_epi64(row4h, row4h)); - row2l = _mm_unpackhi_epi64(row2l, _mm_unpacklo_epi64(row2h, row2h)); - row2h = _mm_unpackhi_epi64(row2h, _mm_unpacklo_epi64(t1, t1)); - - b0 = MM_SET_EPI64X(m10, m8); - b1 = MM_SET_EPI64X(m14, m12); - row1l = _mm_add_epi64(_mm_add_epi64(row1l, b0), row2l); - row1h = _mm_add_epi64(_mm_add_epi64(row1h, b1), row2h); - row4l = _mm_xor_si128(row4l, row1l); - row4h = _mm_xor_si128(row4h, row1h); - row4l = _mm_xor_si128(_mm_srli_epi64(row4l,32),_mm_slli_epi64(row4l,32)); - row4h = _mm_xor_si128(_mm_srli_epi64(row4h,32),_mm_slli_epi64(row4h,32)); - row3l = _mm_add_epi64(row3l, row4l); - row3h = _mm_add_epi64(row3h, row4h); - row2l = _mm_xor_si128(row2l, row3l); - row2h = _mm_xor_si128(row2h, row3h); - row2l = _mm_xor_si128(_mm_srli_epi64(row2l,24),_mm_slli_epi64(row2l,40)); - row2h = _mm_xor_si128(_mm_srli_epi64(row2h,24),_mm_slli_epi64(row2h,40)); - - b0 = MM_SET_EPI64X(m11, m9); - b1 = MM_SET_EPI64X(m15, m13); - row1l = _mm_add_epi64(_mm_add_epi64(row1l, b0), row2l); - row1h = _mm_add_epi64(_mm_add_epi64(row1h, b1), row2h); - row4l = _mm_xor_si128(row4l, row1l); - row4h = 
_mm_xor_si128(row4h, row1h); - row4l = _mm_xor_si128(_mm_srli_epi64(row4l,16),_mm_slli_epi64(row4l,48)); - row4h = _mm_xor_si128(_mm_srli_epi64(row4h,16),_mm_slli_epi64(row4h,48)); - row3l = _mm_add_epi64(row3l, row4l); - row3h = _mm_add_epi64(row3h, row4h); - row2l = _mm_xor_si128(row2l, row3l); - row2h = _mm_xor_si128(row2h, row3h); - row2l = _mm_xor_si128(_mm_srli_epi64(row2l,63),_mm_slli_epi64(row2l,1)); - row2h = _mm_xor_si128(_mm_srli_epi64(row2h,63),_mm_slli_epi64(row2h,1)); - - t0 = row3l, row3l = row3h, row3h = t0, t0 = row2l, t1 = row4l; - row2l = _mm_unpackhi_epi64(row2h, _mm_unpacklo_epi64(row2l, row2l)); - row2h = _mm_unpackhi_epi64(t0, _mm_unpacklo_epi64(row2h, row2h)); - row4l = _mm_unpackhi_epi64(row4l, _mm_unpacklo_epi64(row4h, row4h)); - row4h = _mm_unpackhi_epi64(row4h, _mm_unpacklo_epi64(t1, t1)); - - b0 = MM_SET_EPI64X(m4, m14); - b1 = MM_SET_EPI64X(m13, m9); - row1l = _mm_add_epi64(_mm_add_epi64(row1l, b0), row2l); - row1h = _mm_add_epi64(_mm_add_epi64(row1h, b1), row2h); - row4l = _mm_xor_si128(row4l, row1l); - row4h = _mm_xor_si128(row4h, row1h); - row4l = _mm_xor_si128(_mm_srli_epi64(row4l,32),_mm_slli_epi64(row4l,32)); - row4h = _mm_xor_si128(_mm_srli_epi64(row4h,32),_mm_slli_epi64(row4h,32)); - row3l = _mm_add_epi64(row3l, row4l); - row3h = _mm_add_epi64(row3h, row4h); - row2l = _mm_xor_si128(row2l, row3l); - row2h = _mm_xor_si128(row2h, row3h); - row2l = _mm_xor_si128(_mm_srli_epi64(row2l,24),_mm_slli_epi64(row2l,40)); - row2h = _mm_xor_si128(_mm_srli_epi64(row2h,24),_mm_slli_epi64(row2h,40)); - - b0 = MM_SET_EPI64X(m8, m10); - b1 = MM_SET_EPI64X(m6, m15); - row1l = _mm_add_epi64(_mm_add_epi64(row1l, b0), row2l); - row1h = _mm_add_epi64(_mm_add_epi64(row1h, b1), row2h); - row4l = _mm_xor_si128(row4l, row1l); - row4h = _mm_xor_si128(row4h, row1h); - row4l = _mm_xor_si128(_mm_srli_epi64(row4l,16),_mm_slli_epi64(row4l,48)); - row4h = _mm_xor_si128(_mm_srli_epi64(row4h,16),_mm_slli_epi64(row4h,48)); - row3l = _mm_add_epi64(row3l, row4l); - row3h = _mm_add_epi64(row3h, row4h); - row2l = _mm_xor_si128(row2l, row3l); - row2h = _mm_xor_si128(row2h, row3h); - row2l = _mm_xor_si128(_mm_srli_epi64(row2l,63),_mm_slli_epi64(row2l,1)); - row2h = _mm_xor_si128(_mm_srli_epi64(row2h,63),_mm_slli_epi64(row2h,1)); - - t0 = row4l, t1 = row2l, row4l = row3l, row3l = row3h, row3h = row4l; - row4l = _mm_unpackhi_epi64(row4h, _mm_unpacklo_epi64(t0, t0)); - row4h = _mm_unpackhi_epi64(t0, _mm_unpacklo_epi64(row4h, row4h)); - row2l = _mm_unpackhi_epi64(row2l, _mm_unpacklo_epi64(row2h, row2h)); - row2h = _mm_unpackhi_epi64(row2h, _mm_unpacklo_epi64(t1, t1)); - b0 = MM_SET_EPI64X(m0, m1); - b1 = MM_SET_EPI64X(m5, m11); - row1l = _mm_add_epi64(_mm_add_epi64(row1l, b0), row2l); - row1h = _mm_add_epi64(_mm_add_epi64(row1h, b1), row2h); - row4l = _mm_xor_si128(row4l, row1l); - row4h = _mm_xor_si128(row4h, row1h); - row4l = _mm_xor_si128(_mm_srli_epi64(row4l,32),_mm_slli_epi64(row4l,32)); - row4h = _mm_xor_si128(_mm_srli_epi64(row4h,32),_mm_slli_epi64(row4h,32)); - row3l = _mm_add_epi64(row3l, row4l); - row3h = _mm_add_epi64(row3h, row4h); - row2l = _mm_xor_si128(row2l, row3l); - row2h = _mm_xor_si128(row2h, row3h); - row2l = _mm_xor_si128(_mm_srli_epi64(row2l,24),_mm_slli_epi64(row2l,40)); - row2h = _mm_xor_si128(_mm_srli_epi64(row2h,24),_mm_slli_epi64(row2h,40)); - - b0 = MM_SET_EPI64X(m2, m12); - b1 = MM_SET_EPI64X(m3, m7); - row1l = _mm_add_epi64(_mm_add_epi64(row1l, b0), row2l); - row1h = _mm_add_epi64(_mm_add_epi64(row1h, b1), row2h); - row4l = _mm_xor_si128(row4l, row1l); - row4h = 
_mm_xor_si128(row4h, row1h); - row4l = _mm_xor_si128(_mm_srli_epi64(row4l,16),_mm_slli_epi64(row4l,48)); - row4h = _mm_xor_si128(_mm_srli_epi64(row4h,16),_mm_slli_epi64(row4h,48)); - row3l = _mm_add_epi64(row3l, row4l); - row3h = _mm_add_epi64(row3h, row4h); - row2l = _mm_xor_si128(row2l, row3l); - row2h = _mm_xor_si128(row2h, row3h); - row2l = _mm_xor_si128(_mm_srli_epi64(row2l,63),_mm_slli_epi64(row2l,1)); - row2h = _mm_xor_si128(_mm_srli_epi64(row2h,63),_mm_slli_epi64(row2h,1)); - - t0 = row3l, row3l = row3h, row3h = t0, t0 = row2l, t1 = row4l; - row2l = _mm_unpackhi_epi64(row2h, _mm_unpacklo_epi64(row2l, row2l)); - row2h = _mm_unpackhi_epi64(t0, _mm_unpacklo_epi64(row2h, row2h)); - row4l = _mm_unpackhi_epi64(row4l, _mm_unpacklo_epi64(row4h, row4h)); - row4h = _mm_unpackhi_epi64(row4h, _mm_unpacklo_epi64(t1, t1)); - - b0 = MM_SET_EPI64X(m12, m11); - b1 = MM_SET_EPI64X(m15, m5); - row1l = _mm_add_epi64(_mm_add_epi64(row1l, b0), row2l); - row1h = _mm_add_epi64(_mm_add_epi64(row1h, b1), row2h); - row4l = _mm_xor_si128(row4l, row1l); - row4h = _mm_xor_si128(row4h, row1h); - row4l = _mm_xor_si128(_mm_srli_epi64(row4l,32),_mm_slli_epi64(row4l,32)); - row4h = _mm_xor_si128(_mm_srli_epi64(row4h,32),_mm_slli_epi64(row4h,32)); - row3l = _mm_add_epi64(row3l, row4l); - row3h = _mm_add_epi64(row3h, row4h); - row2l = _mm_xor_si128(row2l, row3l); - row2h = _mm_xor_si128(row2h, row3h); - row2l = _mm_xor_si128(_mm_srli_epi64(row2l,24),_mm_slli_epi64(row2l,40)); - row2h = _mm_xor_si128(_mm_srli_epi64(row2h,24),_mm_slli_epi64(row2h,40)); - - b0 = MM_SET_EPI64X(m0, m8); - b1 = MM_SET_EPI64X(m13, m2); - row1l = _mm_add_epi64(_mm_add_epi64(row1l, b0), row2l); - row1h = _mm_add_epi64(_mm_add_epi64(row1h, b1), row2h); - row4l = _mm_xor_si128(row4l, row1l); - row4h = _mm_xor_si128(row4h, row1h); - row4l = _mm_xor_si128(_mm_srli_epi64(row4l,16),_mm_slli_epi64(row4l,48)); - row4h = _mm_xor_si128(_mm_srli_epi64(row4h,16),_mm_slli_epi64(row4h,48)); - row3l = _mm_add_epi64(row3l, row4l); - row3h = _mm_add_epi64(row3h, row4h); - row2l = _mm_xor_si128(row2l, row3l); - row2h = _mm_xor_si128(row2h, row3h); - row2l = _mm_xor_si128(_mm_srli_epi64(row2l,63),_mm_slli_epi64(row2l,1)); - row2h = _mm_xor_si128(_mm_srli_epi64(row2h,63),_mm_slli_epi64(row2h,1)); - - t0 = row4l, t1 = row2l, row4l = row3l, row3l = row3h, row3h = row4l; - row4l = _mm_unpackhi_epi64(row4h, _mm_unpacklo_epi64(t0, t0)); - row4h = _mm_unpackhi_epi64(t0, _mm_unpacklo_epi64(row4h, row4h)); - row2l = _mm_unpackhi_epi64(row2l, _mm_unpacklo_epi64(row2h, row2h)); - row2h = _mm_unpackhi_epi64(row2h, _mm_unpacklo_epi64(t1, t1)); - b0 = MM_SET_EPI64X(m3, m10); - b1 = MM_SET_EPI64X(m9, m7); - row1l = _mm_add_epi64(_mm_add_epi64(row1l, b0), row2l); - row1h = _mm_add_epi64(_mm_add_epi64(row1h, b1), row2h); - row4l = _mm_xor_si128(row4l, row1l); - row4h = _mm_xor_si128(row4h, row1h); - row4l = _mm_xor_si128(_mm_srli_epi64(row4l,32),_mm_slli_epi64(row4l,32)); - row4h = _mm_xor_si128(_mm_srli_epi64(row4h,32),_mm_slli_epi64(row4h,32)); - row3l = _mm_add_epi64(row3l, row4l); - row3h = _mm_add_epi64(row3h, row4h); - row2l = _mm_xor_si128(row2l, row3l); - row2h = _mm_xor_si128(row2h, row3h); - row2l = _mm_xor_si128(_mm_srli_epi64(row2l,24),_mm_slli_epi64(row2l,40)); - row2h = _mm_xor_si128(_mm_srli_epi64(row2h,24),_mm_slli_epi64(row2h,40)); - - b0 = MM_SET_EPI64X(m6, m14); - b1 = MM_SET_EPI64X(m4, m1); - row1l = _mm_add_epi64(_mm_add_epi64(row1l, b0), row2l); - row1h = _mm_add_epi64(_mm_add_epi64(row1h, b1), row2h); - row4l = _mm_xor_si128(row4l, row1l); - row4h = 
_mm_xor_si128(row4h, row1h); - row4l = _mm_xor_si128(_mm_srli_epi64(row4l,16),_mm_slli_epi64(row4l,48)); - row4h = _mm_xor_si128(_mm_srli_epi64(row4h,16),_mm_slli_epi64(row4h,48)); - row3l = _mm_add_epi64(row3l, row4l); - row3h = _mm_add_epi64(row3h, row4h); - row2l = _mm_xor_si128(row2l, row3l); - row2h = _mm_xor_si128(row2h, row3h); - row2l = _mm_xor_si128(_mm_srli_epi64(row2l,63),_mm_slli_epi64(row2l,1)); - row2h = _mm_xor_si128(_mm_srli_epi64(row2h,63),_mm_slli_epi64(row2h,1)); - - t0 = row3l, row3l = row3h, row3h = t0, t0 = row2l, t1 = row4l; - row2l = _mm_unpackhi_epi64(row2h, _mm_unpacklo_epi64(row2l, row2l)); - row2h = _mm_unpackhi_epi64(t0, _mm_unpacklo_epi64(row2h, row2h)); - row4l = _mm_unpackhi_epi64(row4l, _mm_unpacklo_epi64(row4h, row4h)); - row4h = _mm_unpackhi_epi64(row4h, _mm_unpacklo_epi64(t1, t1)); - - b0 = MM_SET_EPI64X(m3, m7); - b1 = MM_SET_EPI64X(m11, m13); - row1l = _mm_add_epi64(_mm_add_epi64(row1l, b0), row2l); - row1h = _mm_add_epi64(_mm_add_epi64(row1h, b1), row2h); - row4l = _mm_xor_si128(row4l, row1l); - row4h = _mm_xor_si128(row4h, row1h); - row4l = _mm_xor_si128(_mm_srli_epi64(row4l,32),_mm_slli_epi64(row4l,32)); - row4h = _mm_xor_si128(_mm_srli_epi64(row4h,32),_mm_slli_epi64(row4h,32)); - row3l = _mm_add_epi64(row3l, row4l); - row3h = _mm_add_epi64(row3h, row4h); - row2l = _mm_xor_si128(row2l, row3l); - row2h = _mm_xor_si128(row2h, row3h); - row2l = _mm_xor_si128(_mm_srli_epi64(row2l,24),_mm_slli_epi64(row2l,40)); - row2h = _mm_xor_si128(_mm_srli_epi64(row2h,24),_mm_slli_epi64(row2h,40)); - - b0 = MM_SET_EPI64X(m1, m9); - b1 = MM_SET_EPI64X(m14, m12); - row1l = _mm_add_epi64(_mm_add_epi64(row1l, b0), row2l); - row1h = _mm_add_epi64(_mm_add_epi64(row1h, b1), row2h); - row4l = _mm_xor_si128(row4l, row1l); - row4h = _mm_xor_si128(row4h, row1h); - row4l = _mm_xor_si128(_mm_srli_epi64(row4l,16),_mm_slli_epi64(row4l,48)); - row4h = _mm_xor_si128(_mm_srli_epi64(row4h,16),_mm_slli_epi64(row4h,48)); - row3l = _mm_add_epi64(row3l, row4l); - row3h = _mm_add_epi64(row3h, row4h); - row2l = _mm_xor_si128(row2l, row3l); - row2h = _mm_xor_si128(row2h, row3h); - row2l = _mm_xor_si128(_mm_srli_epi64(row2l,63),_mm_slli_epi64(row2l,1)); - row2h = _mm_xor_si128(_mm_srli_epi64(row2h,63),_mm_slli_epi64(row2h,1)); - - t0 = row4l, t1 = row2l, row4l = row3l, row3l = row3h, row3h = row4l; - row4l = _mm_unpackhi_epi64(row4h, _mm_unpacklo_epi64(t0, t0)); - row4h = _mm_unpackhi_epi64(t0, _mm_unpacklo_epi64(row4h, row4h)); - row2l = _mm_unpackhi_epi64(row2l, _mm_unpacklo_epi64(row2h, row2h)); - row2h = _mm_unpackhi_epi64(row2h, _mm_unpacklo_epi64(t1, t1)); - b0 = MM_SET_EPI64X(m5, m2); - b1 = MM_SET_EPI64X(m15, m4); - row1l = _mm_add_epi64(_mm_add_epi64(row1l, b0), row2l); - row1h = _mm_add_epi64(_mm_add_epi64(row1h, b1), row2h); - row4l = _mm_xor_si128(row4l, row1l); - row4h = _mm_xor_si128(row4h, row1h); - row4l = _mm_xor_si128(_mm_srli_epi64(row4l,32),_mm_slli_epi64(row4l,32)); - row4h = _mm_xor_si128(_mm_srli_epi64(row4h,32),_mm_slli_epi64(row4h,32)); - row3l = _mm_add_epi64(row3l, row4l); - row3h = _mm_add_epi64(row3h, row4h); - row2l = _mm_xor_si128(row2l, row3l); - row2h = _mm_xor_si128(row2h, row3h); - row2l = _mm_xor_si128(_mm_srli_epi64(row2l,24),_mm_slli_epi64(row2l,40)); - row2h = _mm_xor_si128(_mm_srli_epi64(row2h,24),_mm_slli_epi64(row2h,40)); - - b0 = MM_SET_EPI64X(m10, m6); - b1 = MM_SET_EPI64X(m8, m0); - row1l = _mm_add_epi64(_mm_add_epi64(row1l, b0), row2l); - row1h = _mm_add_epi64(_mm_add_epi64(row1h, b1), row2h); - row4l = _mm_xor_si128(row4l, row1l); - row4h = 
_mm_xor_si128(row4h, row1h); - row4l = _mm_xor_si128(_mm_srli_epi64(row4l,16),_mm_slli_epi64(row4l,48)); - row4h = _mm_xor_si128(_mm_srli_epi64(row4h,16),_mm_slli_epi64(row4h,48)); - row3l = _mm_add_epi64(row3l, row4l); - row3h = _mm_add_epi64(row3h, row4h); - row2l = _mm_xor_si128(row2l, row3l); - row2h = _mm_xor_si128(row2h, row3h); - row2l = _mm_xor_si128(_mm_srli_epi64(row2l,63),_mm_slli_epi64(row2l,1)); - row2h = _mm_xor_si128(_mm_srli_epi64(row2h,63),_mm_slli_epi64(row2h,1)); - - t0 = row3l, row3l = row3h, row3h = t0, t0 = row2l, t1 = row4l; - row2l = _mm_unpackhi_epi64(row2h, _mm_unpacklo_epi64(row2l, row2l)); - row2h = _mm_unpackhi_epi64(t0, _mm_unpacklo_epi64(row2h, row2h)); - row4l = _mm_unpackhi_epi64(row4l, _mm_unpacklo_epi64(row4h, row4h)); - row4h = _mm_unpackhi_epi64(row4h, _mm_unpacklo_epi64(t1, t1)); - - b0 = MM_SET_EPI64X(m5, m9); - b1 = MM_SET_EPI64X(m10, m2); - row1l = _mm_add_epi64(_mm_add_epi64(row1l, b0), row2l); - row1h = _mm_add_epi64(_mm_add_epi64(row1h, b1), row2h); - row4l = _mm_xor_si128(row4l, row1l); - row4h = _mm_xor_si128(row4h, row1h); - row4l = _mm_xor_si128(_mm_srli_epi64(row4l,32),_mm_slli_epi64(row4l,32)); - row4h = _mm_xor_si128(_mm_srli_epi64(row4h,32),_mm_slli_epi64(row4h,32)); - row3l = _mm_add_epi64(row3l, row4l); - row3h = _mm_add_epi64(row3h, row4h); - row2l = _mm_xor_si128(row2l, row3l); - row2h = _mm_xor_si128(row2h, row3h); - row2l = _mm_xor_si128(_mm_srli_epi64(row2l,24),_mm_slli_epi64(row2l,40)); - row2h = _mm_xor_si128(_mm_srli_epi64(row2h,24),_mm_slli_epi64(row2h,40)); - - b0 = MM_SET_EPI64X(m7, m0); - b1 = MM_SET_EPI64X(m15, m4); - row1l = _mm_add_epi64(_mm_add_epi64(row1l, b0), row2l); - row1h = _mm_add_epi64(_mm_add_epi64(row1h, b1), row2h); - row4l = _mm_xor_si128(row4l, row1l); - row4h = _mm_xor_si128(row4h, row1h); - row4l = _mm_xor_si128(_mm_srli_epi64(row4l,16),_mm_slli_epi64(row4l,48)); - row4h = _mm_xor_si128(_mm_srli_epi64(row4h,16),_mm_slli_epi64(row4h,48)); - row3l = _mm_add_epi64(row3l, row4l); - row3h = _mm_add_epi64(row3h, row4h); - row2l = _mm_xor_si128(row2l, row3l); - row2h = _mm_xor_si128(row2h, row3h); - row2l = _mm_xor_si128(_mm_srli_epi64(row2l,63),_mm_slli_epi64(row2l,1)); - row2h = _mm_xor_si128(_mm_srli_epi64(row2h,63),_mm_slli_epi64(row2h,1)); - - t0 = row4l, t1 = row2l, row4l = row3l, row3l = row3h, row3h = row4l; - row4l = _mm_unpackhi_epi64(row4h, _mm_unpacklo_epi64(t0, t0)); - row4h = _mm_unpackhi_epi64(t0, _mm_unpacklo_epi64(row4h, row4h)); - row2l = _mm_unpackhi_epi64(row2l, _mm_unpacklo_epi64(row2h, row2h)); - row2h = _mm_unpackhi_epi64(row2h, _mm_unpacklo_epi64(t1, t1)); - - b0 = MM_SET_EPI64X(m11, m14); - b1 = MM_SET_EPI64X(m3, m6); - row1l = _mm_add_epi64(_mm_add_epi64(row1l, b0), row2l); - row1h = _mm_add_epi64(_mm_add_epi64(row1h, b1), row2h); - row4l = _mm_xor_si128(row4l, row1l); - row4h = _mm_xor_si128(row4h, row1h); - row4l = _mm_xor_si128(_mm_srli_epi64(row4l,32),_mm_slli_epi64(row4l,32)); - row4h = _mm_xor_si128(_mm_srli_epi64(row4h,32),_mm_slli_epi64(row4h,32)); - row3l = _mm_add_epi64(row3l, row4l); - row3h = _mm_add_epi64(row3h, row4h); - row2l = _mm_xor_si128(row2l, row3l); - row2h = _mm_xor_si128(row2h, row3h); - row2l = _mm_xor_si128(_mm_srli_epi64(row2l,24),_mm_slli_epi64(row2l,40)); - row2h = _mm_xor_si128(_mm_srli_epi64(row2h,24),_mm_slli_epi64(row2h,40)); - - - b0 = MM_SET_EPI64X(m12, m1); - b1 = MM_SET_EPI64X(m13, m8); - row1l = _mm_add_epi64(_mm_add_epi64(row1l, b0), row2l); - row1h = _mm_add_epi64(_mm_add_epi64(row1h, b1), row2h); - row4l = _mm_xor_si128(row4l, row1l); - row4h = 
_mm_xor_si128(row4h, row1h); - row4l = _mm_xor_si128(_mm_srli_epi64(row4l,16),_mm_slli_epi64(row4l,48)); - row4h = _mm_xor_si128(_mm_srli_epi64(row4h,16),_mm_slli_epi64(row4h,48)); - row3l = _mm_add_epi64(row3l, row4l); - row3h = _mm_add_epi64(row3h, row4h); - row2l = _mm_xor_si128(row2l, row3l); - row2h = _mm_xor_si128(row2h, row3h); - row2l = _mm_xor_si128(_mm_srli_epi64(row2l,63),_mm_slli_epi64(row2l,1)); - row2h = _mm_xor_si128(_mm_srli_epi64(row2h,63),_mm_slli_epi64(row2h,1)); - - t0 = row3l, row3l = row3h, row3h = t0, t0 = row2l, t1 = row4l; - row2l = _mm_unpackhi_epi64(row2h, _mm_unpacklo_epi64(row2l, row2l)); - row2h = _mm_unpackhi_epi64(t0, _mm_unpacklo_epi64(row2h, row2h)); - row4l = _mm_unpackhi_epi64(row4l, _mm_unpacklo_epi64(row4h, row4h)); - row4h = _mm_unpackhi_epi64(row4h, _mm_unpacklo_epi64(t1, t1)); - - b0 = MM_SET_EPI64X(m6, m2); - b1 = MM_SET_EPI64X(m8, m0); - row1l = _mm_add_epi64(_mm_add_epi64(row1l, b0), row2l); - row1h = _mm_add_epi64(_mm_add_epi64(row1h, b1), row2h); - row4l = _mm_xor_si128(row4l, row1l); - row4h = _mm_xor_si128(row4h, row1h); - row4l = _mm_xor_si128(_mm_srli_epi64(row4l,32),_mm_slli_epi64(row4l,32)); - row4h = _mm_xor_si128(_mm_srli_epi64(row4h,32),_mm_slli_epi64(row4h,32)); - row3l = _mm_add_epi64(row3l, row4l); - row3h = _mm_add_epi64(row3h, row4h); - row2l = _mm_xor_si128(row2l, row3l); - row2h = _mm_xor_si128(row2h, row3h); - row2l = _mm_xor_si128(_mm_srli_epi64(row2l,24),_mm_slli_epi64(row2l,40)); - row2h = _mm_xor_si128(_mm_srli_epi64(row2h,24),_mm_slli_epi64(row2h,40)); - - b0 = MM_SET_EPI64X(m10, m12); - b1 = MM_SET_EPI64X(m3, m11); - row1l = _mm_add_epi64(_mm_add_epi64(row1l, b0), row2l); - row1h = _mm_add_epi64(_mm_add_epi64(row1h, b1), row2h); - row4l = _mm_xor_si128(row4l, row1l); - row4h = _mm_xor_si128(row4h, row1h); - row4l = _mm_xor_si128(_mm_srli_epi64(row4l,16),_mm_slli_epi64(row4l,48)); - row4h = _mm_xor_si128(_mm_srli_epi64(row4h,16),_mm_slli_epi64(row4h,48)); - row3l = _mm_add_epi64(row3l, row4l); - row3h = _mm_add_epi64(row3h, row4h); - row2l = _mm_xor_si128(row2l, row3l); - row2h = _mm_xor_si128(row2h, row3h); - row2l = _mm_xor_si128(_mm_srli_epi64(row2l,63),_mm_slli_epi64(row2l,1)); - row2h = _mm_xor_si128(_mm_srli_epi64(row2h,63),_mm_slli_epi64(row2h,1)); - - t0 = row4l, t1 = row2l, row4l = row3l, row3l = row3h, row3h = row4l; - row4l = _mm_unpackhi_epi64(row4h, _mm_unpacklo_epi64(t0, t0)); - row4h = _mm_unpackhi_epi64(t0, _mm_unpacklo_epi64(row4h, row4h)); - row2l = _mm_unpackhi_epi64(row2l, _mm_unpacklo_epi64(row2h, row2h)); - row2h = _mm_unpackhi_epi64(row2h, _mm_unpacklo_epi64(t1, t1)); - - b0 = MM_SET_EPI64X(m7, m4); - b1 = MM_SET_EPI64X(m1, m15); - row1l = _mm_add_epi64(_mm_add_epi64(row1l, b0), row2l); - row1h = _mm_add_epi64(_mm_add_epi64(row1h, b1), row2h); - row4l = _mm_xor_si128(row4l, row1l); - row4h = _mm_xor_si128(row4h, row1h); - row4l = _mm_xor_si128(_mm_srli_epi64(row4l,32),_mm_slli_epi64(row4l,32)); - row4h = _mm_xor_si128(_mm_srli_epi64(row4h,32),_mm_slli_epi64(row4h,32)); - row3l = _mm_add_epi64(row3l, row4l); - row3h = _mm_add_epi64(row3h, row4h); - row2l = _mm_xor_si128(row2l, row3l); - row2h = _mm_xor_si128(row2h, row3h); - row2l = _mm_xor_si128(_mm_srli_epi64(row2l,24),_mm_slli_epi64(row2l,40)); - row2h = _mm_xor_si128(_mm_srli_epi64(row2h,24),_mm_slli_epi64(row2h,40)); - - b0 = MM_SET_EPI64X(m5, m13); - b1 = MM_SET_EPI64X(m9, m14); - row1l = _mm_add_epi64(_mm_add_epi64(row1l, b0), row2l); - row1h = _mm_add_epi64(_mm_add_epi64(row1h, b1), row2h); - row4l = _mm_xor_si128(row4l, row1l); - row4h = 
_mm_xor_si128(row4h, row1h); - row4l = _mm_xor_si128(_mm_srli_epi64(row4l,16),_mm_slli_epi64(row4l,48)); - row4h = _mm_xor_si128(_mm_srli_epi64(row4h,16),_mm_slli_epi64(row4h,48)); - row3l = _mm_add_epi64(row3l, row4l); - row3h = _mm_add_epi64(row3h, row4h); - row2l = _mm_xor_si128(row2l, row3l); - row2h = _mm_xor_si128(row2h, row3h); - row2l = _mm_xor_si128(_mm_srli_epi64(row2l,63),_mm_slli_epi64(row2l,1)); - row2h = _mm_xor_si128(_mm_srli_epi64(row2h,63),_mm_slli_epi64(row2h,1)); - - t0 = row3l, row3l = row3h, row3h = t0, t0 = row2l, t1 = row4l; - row2l = _mm_unpackhi_epi64(row2h, _mm_unpacklo_epi64(row2l, row2l)); - row2h = _mm_unpackhi_epi64(t0, _mm_unpacklo_epi64(row2h, row2h)); - row4l = _mm_unpackhi_epi64(row4l, _mm_unpacklo_epi64(row4h, row4h)); - row4h = _mm_unpackhi_epi64(row4h, _mm_unpacklo_epi64(t1, t1)); - - b0 = MM_SET_EPI64X(m1, m12); - b1 = MM_SET_EPI64X(m4, m14); - row1l = _mm_add_epi64(_mm_add_epi64(row1l, b0), row2l); - row1h = _mm_add_epi64(_mm_add_epi64(row1h, b1), row2h); - row4l = _mm_xor_si128(row4l, row1l); - row4h = _mm_xor_si128(row4h, row1h); - row4l = _mm_xor_si128(_mm_srli_epi64(row4l,32),_mm_slli_epi64(row4l,32)); - row4h = _mm_xor_si128(_mm_srli_epi64(row4h,32),_mm_slli_epi64(row4h,32)); - row3l = _mm_add_epi64(row3l, row4l); - row3h = _mm_add_epi64(row3h, row4h); - row2l = _mm_xor_si128(row2l, row3l); - row2h = _mm_xor_si128(row2h, row3h); - row2l = _mm_xor_si128(_mm_srli_epi64(row2l,24),_mm_slli_epi64(row2l,40)); - row2h = _mm_xor_si128(_mm_srli_epi64(row2h,24),_mm_slli_epi64(row2h,40)); - - b0 = MM_SET_EPI64X(m15, m5); - b1 = MM_SET_EPI64X(m10, m13); - row1l = _mm_add_epi64(_mm_add_epi64(row1l, b0), row2l); - row1h = _mm_add_epi64(_mm_add_epi64(row1h, b1), row2h); - row4l = _mm_xor_si128(row4l, row1l); - row4h = _mm_xor_si128(row4h, row1h); - row4l = _mm_xor_si128(_mm_srli_epi64(row4l,16),_mm_slli_epi64(row4l,48)); - row4h = _mm_xor_si128(_mm_srli_epi64(row4h,16),_mm_slli_epi64(row4h,48)); - row3l = _mm_add_epi64(row3l, row4l); - row3h = _mm_add_epi64(row3h, row4h); - row2l = _mm_xor_si128(row2l, row3l); - row2h = _mm_xor_si128(row2h, row3h); - row2l = _mm_xor_si128(_mm_srli_epi64(row2l,63),_mm_slli_epi64(row2l,1)); - row2h = _mm_xor_si128(_mm_srli_epi64(row2h,63),_mm_slli_epi64(row2h,1)); - - t0 = row4l, t1 = row2l, row4l = row3l, row3l = row3h, row3h = row4l; - row4l = _mm_unpackhi_epi64(row4h, _mm_unpacklo_epi64(t0, t0)); - row4h = _mm_unpackhi_epi64(t0, _mm_unpacklo_epi64(row4h, row4h)); - row2l = _mm_unpackhi_epi64(row2l, _mm_unpacklo_epi64(row2h, row2h)); - row2h = _mm_unpackhi_epi64(row2h, _mm_unpacklo_epi64(t1, t1)); - - b0 = MM_SET_EPI64X(m6, m0); - b1 = MM_SET_EPI64X(m8, m9); - row1l = _mm_add_epi64(_mm_add_epi64(row1l, b0), row2l); - row1h = _mm_add_epi64(_mm_add_epi64(row1h, b1), row2h); - row4l = _mm_xor_si128(row4l, row1l); - row4h = _mm_xor_si128(row4h, row1h); - row4l = _mm_xor_si128(_mm_srli_epi64(row4l,32),_mm_slli_epi64(row4l,32)); - row4h = _mm_xor_si128(_mm_srli_epi64(row4h,32),_mm_slli_epi64(row4h,32)); - row3l = _mm_add_epi64(row3l, row4l); - row3h = _mm_add_epi64(row3h, row4h); - row2l = _mm_xor_si128(row2l, row3l); - row2h = _mm_xor_si128(row2h, row3h); - row2l = _mm_xor_si128(_mm_srli_epi64(row2l,24),_mm_slli_epi64(row2l,40)); - row2h = _mm_xor_si128(_mm_srli_epi64(row2h,24),_mm_slli_epi64(row2h,40)); - - b0 = MM_SET_EPI64X(m3, m7); - b1 = MM_SET_EPI64X(m11, m2); - row1l = _mm_add_epi64(_mm_add_epi64(row1l, b0), row2l); - row1h = _mm_add_epi64(_mm_add_epi64(row1h, b1), row2h); - row4l = _mm_xor_si128(row4l, row1l); - row4h = 
_mm_xor_si128(row4h, row1h); - row4l = _mm_xor_si128(_mm_srli_epi64(row4l,16),_mm_slli_epi64(row4l,48)); - row4h = _mm_xor_si128(_mm_srli_epi64(row4h,16),_mm_slli_epi64(row4h,48)); - row3l = _mm_add_epi64(row3l, row4l); - row3h = _mm_add_epi64(row3h, row4h); - row2l = _mm_xor_si128(row2l, row3l); - row2h = _mm_xor_si128(row2h, row3h); - row2l = _mm_xor_si128(_mm_srli_epi64(row2l,63),_mm_slli_epi64(row2l,1)); - row2h = _mm_xor_si128(_mm_srli_epi64(row2h,63),_mm_slli_epi64(row2h,1)); - - t0 = row3l, row3l = row3h, row3h = t0, t0 = row2l, t1 = row4l; - row2l = _mm_unpackhi_epi64(row2h, _mm_unpacklo_epi64(row2l, row2l)); - row2h = _mm_unpackhi_epi64(t0, _mm_unpacklo_epi64(row2h, row2h)); - row4l = _mm_unpackhi_epi64(row4l, _mm_unpacklo_epi64(row4h, row4h)); - row4h = _mm_unpackhi_epi64(row4h, _mm_unpacklo_epi64(t1, t1)); - - b0 = MM_SET_EPI64X(m7, m13); - b1 = MM_SET_EPI64X(m3, m12); - row1l = _mm_add_epi64(_mm_add_epi64(row1l, b0), row2l); - row1h = _mm_add_epi64(_mm_add_epi64(row1h, b1), row2h); - row4l = _mm_xor_si128(row4l, row1l); - row4h = _mm_xor_si128(row4h, row1h); - row4l = _mm_xor_si128(_mm_srli_epi64(row4l,32),_mm_slli_epi64(row4l,32)); - row4h = _mm_xor_si128(_mm_srli_epi64(row4h,32),_mm_slli_epi64(row4h,32)); - row3l = _mm_add_epi64(row3l, row4l); - row3h = _mm_add_epi64(row3h, row4h); - row2l = _mm_xor_si128(row2l, row3l); - row2h = _mm_xor_si128(row2h, row3h); - row2l = _mm_xor_si128(_mm_srli_epi64(row2l,24),_mm_slli_epi64(row2l,40)); - row2h = _mm_xor_si128(_mm_srli_epi64(row2h,24),_mm_slli_epi64(row2h,40)); - - b0 = MM_SET_EPI64X(m14, m11); - b1 = MM_SET_EPI64X(m9, m1); - row1l = _mm_add_epi64(_mm_add_epi64(row1l, b0), row2l); - row1h = _mm_add_epi64(_mm_add_epi64(row1h, b1), row2h); - row4l = _mm_xor_si128(row4l, row1l); - row4h = _mm_xor_si128(row4h, row1h); - row4l = _mm_xor_si128(_mm_srli_epi64(row4l,16),_mm_slli_epi64(row4l,48)); - row4h = _mm_xor_si128(_mm_srli_epi64(row4h,16),_mm_slli_epi64(row4h,48)); - row3l = _mm_add_epi64(row3l, row4l); - row3h = _mm_add_epi64(row3h, row4h); - row2l = _mm_xor_si128(row2l, row3l); - row2h = _mm_xor_si128(row2h, row3h); - row2l = _mm_xor_si128(_mm_srli_epi64(row2l,63),_mm_slli_epi64(row2l,1)); - row2h = _mm_xor_si128(_mm_srli_epi64(row2h,63),_mm_slli_epi64(row2h,1)); - - t0 = row4l, t1 = row2l, row4l = row3l, row3l = row3h, row3h = row4l; - row4l = _mm_unpackhi_epi64(row4h, _mm_unpacklo_epi64(t0, t0)); - row4h = _mm_unpackhi_epi64(t0, _mm_unpacklo_epi64(row4h, row4h)); - row2l = _mm_unpackhi_epi64(row2l, _mm_unpacklo_epi64(row2h, row2h)); - row2h = _mm_unpackhi_epi64(row2h, _mm_unpacklo_epi64(t1, t1)); - - b0 = MM_SET_EPI64X(m15, m5); - b1 = MM_SET_EPI64X(m2, m8); - row1l = _mm_add_epi64(_mm_add_epi64(row1l, b0), row2l); - row1h = _mm_add_epi64(_mm_add_epi64(row1h, b1), row2h); - row4l = _mm_xor_si128(row4l, row1l); - row4h = _mm_xor_si128(row4h, row1h); - row4l = _mm_xor_si128(_mm_srli_epi64(row4l,32),_mm_slli_epi64(row4l,32)); - row4h = _mm_xor_si128(_mm_srli_epi64(row4h,32),_mm_slli_epi64(row4h,32)); - row3l = _mm_add_epi64(row3l, row4l); - row3h = _mm_add_epi64(row3h, row4h); - row2l = _mm_xor_si128(row2l, row3l); - row2h = _mm_xor_si128(row2h, row3h); - row2l = _mm_xor_si128(_mm_srli_epi64(row2l,24),_mm_slli_epi64(row2l,40)); - row2h = _mm_xor_si128(_mm_srli_epi64(row2h,24),_mm_slli_epi64(row2h,40)); - - b0 = MM_SET_EPI64X(m4, m0); - b1 = MM_SET_EPI64X(m10, m6); - row1l = _mm_add_epi64(_mm_add_epi64(row1l, b0), row2l); - row1h = _mm_add_epi64(_mm_add_epi64(row1h, b1), row2h); - row4l = _mm_xor_si128(row4l, row1l); - row4h = 
_mm_xor_si128(row4h, row1h); - row4l = _mm_xor_si128(_mm_srli_epi64(row4l,16),_mm_slli_epi64(row4l,48)); - row4h = _mm_xor_si128(_mm_srli_epi64(row4h,16),_mm_slli_epi64(row4h,48)); - row3l = _mm_add_epi64(row3l, row4l); - row3h = _mm_add_epi64(row3h, row4h); - row2l = _mm_xor_si128(row2l, row3l); - row2h = _mm_xor_si128(row2h, row3h); - row2l = _mm_xor_si128(_mm_srli_epi64(row2l,63),_mm_slli_epi64(row2l,1)); - row2h = _mm_xor_si128(_mm_srli_epi64(row2h,63),_mm_slli_epi64(row2h,1)); - - t0 = row3l, row3l = row3h, row3h = t0, t0 = row2l, t1 = row4l; - row2l = _mm_unpackhi_epi64(row2h, _mm_unpacklo_epi64(row2l, row2l)); - row2h = _mm_unpackhi_epi64(t0, _mm_unpacklo_epi64(row2h, row2h)); - row4l = _mm_unpackhi_epi64(row4l, _mm_unpacklo_epi64(row4h, row4h)); - row4h = _mm_unpackhi_epi64(row4h, _mm_unpacklo_epi64(t1, t1)); - - b0 = MM_SET_EPI64X(m14, m6); - b1 = MM_SET_EPI64X(m0, m11); - row1l = _mm_add_epi64(_mm_add_epi64(row1l, b0), row2l); - row1h = _mm_add_epi64(_mm_add_epi64(row1h, b1), row2h); - row4l = _mm_xor_si128(row4l, row1l); - row4h = _mm_xor_si128(row4h, row1h); - row4l = _mm_xor_si128(_mm_srli_epi64(row4l,32),_mm_slli_epi64(row4l,32)); - row4h = _mm_xor_si128(_mm_srli_epi64(row4h,32),_mm_slli_epi64(row4h,32)); - row3l = _mm_add_epi64(row3l, row4l); - row3h = _mm_add_epi64(row3h, row4h); - row2l = _mm_xor_si128(row2l, row3l); - row2h = _mm_xor_si128(row2h, row3h); - row2l = _mm_xor_si128(_mm_srli_epi64(row2l,24),_mm_slli_epi64(row2l,40)); - row2h = _mm_xor_si128(_mm_srli_epi64(row2h,24),_mm_slli_epi64(row2h,40)); - - b0 = MM_SET_EPI64X(m9, m15); - b1 = MM_SET_EPI64X(m8, m3); - row1l = _mm_add_epi64(_mm_add_epi64(row1l, b0), row2l); - row1h = _mm_add_epi64(_mm_add_epi64(row1h, b1), row2h); - row4l = _mm_xor_si128(row4l, row1l); - row4h = _mm_xor_si128(row4h, row1h); - row4l = _mm_xor_si128(_mm_srli_epi64(row4l,16),_mm_slli_epi64(row4l,48)); - row4h = _mm_xor_si128(_mm_srli_epi64(row4h,16),_mm_slli_epi64(row4h,48)); - row3l = _mm_add_epi64(row3l, row4l); - row3h = _mm_add_epi64(row3h, row4h); - row2l = _mm_xor_si128(row2l, row3l); - row2h = _mm_xor_si128(row2h, row3h); - row2l = _mm_xor_si128(_mm_srli_epi64(row2l,63),_mm_slli_epi64(row2l,1)); - row2h = _mm_xor_si128(_mm_srli_epi64(row2h,63),_mm_slli_epi64(row2h,1)); - - t0 = row4l, t1 = row2l, row4l = row3l, row3l = row3h, row3h = row4l; - row4l = _mm_unpackhi_epi64(row4h, _mm_unpacklo_epi64(t0, t0)); - row4h = _mm_unpackhi_epi64(t0, _mm_unpacklo_epi64(row4h, row4h)); - row2l = _mm_unpackhi_epi64(row2l, _mm_unpacklo_epi64(row2h, row2h)); - row2h = _mm_unpackhi_epi64(row2h, _mm_unpacklo_epi64(t1, t1)); - - b0 = MM_SET_EPI64X(m13, m12); - b1 = MM_SET_EPI64X(m10, m1); - row1l = _mm_add_epi64(_mm_add_epi64(row1l, b0), row2l); - row1h = _mm_add_epi64(_mm_add_epi64(row1h, b1), row2h); - row4l = _mm_xor_si128(row4l, row1l); - row4h = _mm_xor_si128(row4h, row1h); - row4l = _mm_xor_si128(_mm_srli_epi64(row4l,32),_mm_slli_epi64(row4l,32)); - row4h = _mm_xor_si128(_mm_srli_epi64(row4h,32),_mm_slli_epi64(row4h,32)); - row3l = _mm_add_epi64(row3l, row4l); - row3h = _mm_add_epi64(row3h, row4h); - row2l = _mm_xor_si128(row2l, row3l); - row2h = _mm_xor_si128(row2h, row3h); - row2l = _mm_xor_si128(_mm_srli_epi64(row2l,24),_mm_slli_epi64(row2l,40)); - row2h = _mm_xor_si128(_mm_srli_epi64(row2h,24),_mm_slli_epi64(row2h,40)); - - b0 = MM_SET_EPI64X(m7, m2); - b1 = MM_SET_EPI64X(m5, m4); - row1l = _mm_add_epi64(_mm_add_epi64(row1l, b0), row2l); - row1h = _mm_add_epi64(_mm_add_epi64(row1h, b1), row2h); - row4l = _mm_xor_si128(row4l, row1l); - row4h = 
_mm_xor_si128(row4h, row1h); - row4l = _mm_xor_si128(_mm_srli_epi64(row4l,16),_mm_slli_epi64(row4l,48)); - row4h = _mm_xor_si128(_mm_srli_epi64(row4h,16),_mm_slli_epi64(row4h,48)); - row3l = _mm_add_epi64(row3l, row4l); - row3h = _mm_add_epi64(row3h, row4h); - row2l = _mm_xor_si128(row2l, row3l); - row2h = _mm_xor_si128(row2h, row3h); - row2l = _mm_xor_si128(_mm_srli_epi64(row2l,63),_mm_slli_epi64(row2l,1)); - row2h = _mm_xor_si128(_mm_srli_epi64(row2h,63),_mm_slli_epi64(row2h,1)); - - t0 = row3l, row3l = row3h, row3h = t0, t0 = row2l, t1 = row4l; - row2l = _mm_unpackhi_epi64(row2h, _mm_unpacklo_epi64(row2l, row2l)); - row2h = _mm_unpackhi_epi64(t0, _mm_unpacklo_epi64(row2h, row2h)); - row4l = _mm_unpackhi_epi64(row4l, _mm_unpacklo_epi64(row4h, row4h)); - row4h = _mm_unpackhi_epi64(row4h, _mm_unpacklo_epi64(t1, t1)); - - b0 = MM_SET_EPI64X(m8, m10); - b1 = MM_SET_EPI64X(m1, m7); - row1l = _mm_add_epi64(_mm_add_epi64(row1l, b0), row2l); - row1h = _mm_add_epi64(_mm_add_epi64(row1h, b1), row2h); - row4l = _mm_xor_si128(row4l, row1l); - row4h = _mm_xor_si128(row4h, row1h); - row4l = _mm_xor_si128(_mm_srli_epi64(row4l,32),_mm_slli_epi64(row4l,32)); - row4h = _mm_xor_si128(_mm_srli_epi64(row4h,32),_mm_slli_epi64(row4h,32)); - row3l = _mm_add_epi64(row3l, row4l); - row3h = _mm_add_epi64(row3h, row4h); - row2l = _mm_xor_si128(row2l, row3l); - row2h = _mm_xor_si128(row2h, row3h); - row2l = _mm_xor_si128(_mm_srli_epi64(row2l,24),_mm_slli_epi64(row2l,40)); - row2h = _mm_xor_si128(_mm_srli_epi64(row2h,24),_mm_slli_epi64(row2h,40)); - - b0 = MM_SET_EPI64X(m4, m2); - b1 = MM_SET_EPI64X(m5, m6); - row1l = _mm_add_epi64(_mm_add_epi64(row1l, b0), row2l); - row1h = _mm_add_epi64(_mm_add_epi64(row1h, b1), row2h); - row4l = _mm_xor_si128(row4l, row1l); - row4h = _mm_xor_si128(row4h, row1h); - row4l = _mm_xor_si128(_mm_srli_epi64(row4l,16),_mm_slli_epi64(row4l,48)); - row4h = _mm_xor_si128(_mm_srli_epi64(row4h,16),_mm_slli_epi64(row4h,48)); - row3l = _mm_add_epi64(row3l, row4l); - row3h = _mm_add_epi64(row3h, row4h); - row2l = _mm_xor_si128(row2l, row3l); - row2h = _mm_xor_si128(row2h, row3h); - row2l = _mm_xor_si128(_mm_srli_epi64(row2l,63),_mm_slli_epi64(row2l,1)); - row2h = _mm_xor_si128(_mm_srli_epi64(row2h,63),_mm_slli_epi64(row2h,1)); - - t0 = row4l, t1 = row2l, row4l = row3l, row3l = row3h, row3h = row4l; - row4l = _mm_unpackhi_epi64(row4h, _mm_unpacklo_epi64(t0, t0)); - row4h = _mm_unpackhi_epi64(t0, _mm_unpacklo_epi64(row4h, row4h)); - row2l = _mm_unpackhi_epi64(row2l, _mm_unpacklo_epi64(row2h, row2h)); - row2h = _mm_unpackhi_epi64(row2h, _mm_unpacklo_epi64(t1, t1)); - - b0 = MM_SET_EPI64X(m9, m15); - b1 = MM_SET_EPI64X(m13, m3); - row1l = _mm_add_epi64(_mm_add_epi64(row1l, b0), row2l); - row1h = _mm_add_epi64(_mm_add_epi64(row1h, b1), row2h); - row4l = _mm_xor_si128(row4l, row1l); - row4h = _mm_xor_si128(row4h, row1h); - row4l = _mm_xor_si128(_mm_srli_epi64(row4l,32),_mm_slli_epi64(row4l,32)); - row4h = _mm_xor_si128(_mm_srli_epi64(row4h,32),_mm_slli_epi64(row4h,32)); - row3l = _mm_add_epi64(row3l, row4l); - row3h = _mm_add_epi64(row3h, row4h); - row2l = _mm_xor_si128(row2l, row3l); - row2h = _mm_xor_si128(row2h, row3h); - row2l = _mm_xor_si128(_mm_srli_epi64(row2l,24),_mm_slli_epi64(row2l,40)); - row2h = _mm_xor_si128(_mm_srli_epi64(row2h,24),_mm_slli_epi64(row2h,40)); - - b0 = MM_SET_EPI64X(m14, m11); - b1 = MM_SET_EPI64X(m0, m12); - row1l = _mm_add_epi64(_mm_add_epi64(row1l, b0), row2l); - row1h = _mm_add_epi64(_mm_add_epi64(row1h, b1), row2h); - row4l = _mm_xor_si128(row4l, row1l); - row4h = 
_mm_xor_si128(row4h, row1h); - row4l = _mm_xor_si128(_mm_srli_epi64(row4l,16),_mm_slli_epi64(row4l,48)); - row4h = _mm_xor_si128(_mm_srli_epi64(row4h,16),_mm_slli_epi64(row4h,48)); - row3l = _mm_add_epi64(row3l, row4l); - row3h = _mm_add_epi64(row3h, row4h); - row2l = _mm_xor_si128(row2l, row3l); - row2h = _mm_xor_si128(row2h, row3h); - row2l = _mm_xor_si128(_mm_srli_epi64(row2l,63),_mm_slli_epi64(row2l,1)); - row2h = _mm_xor_si128(_mm_srli_epi64(row2h,63),_mm_slli_epi64(row2h,1)); - - t0 = row3l, row3l = row3h, row3h = t0, t0 = row2l, t1 = row4l; - row2l = _mm_unpackhi_epi64(row2h, _mm_unpacklo_epi64(row2l, row2l)); - row2h = _mm_unpackhi_epi64(t0, _mm_unpacklo_epi64(row2h, row2h)); - row4l = _mm_unpackhi_epi64(row4l, _mm_unpacklo_epi64(row4h, row4h)); - row4h = _mm_unpackhi_epi64(row4h, _mm_unpacklo_epi64(t1, t1)); - - b0 = MM_SET_EPI64X(m2, m0); - b1 = MM_SET_EPI64X(m6, m4); - row1l = _mm_add_epi64(_mm_add_epi64(row1l, b0), row2l); - row1h = _mm_add_epi64(_mm_add_epi64(row1h, b1), row2h); - row4l = _mm_xor_si128(row4l, row1l); - row4h = _mm_xor_si128(row4h, row1h); - row4l = _mm_xor_si128(_mm_srli_epi64(row4l,32),_mm_slli_epi64(row4l,32)); - row4h = _mm_xor_si128(_mm_srli_epi64(row4h,32),_mm_slli_epi64(row4h,32)); - row3l = _mm_add_epi64(row3l, row4l); - row3h = _mm_add_epi64(row3h, row4h); - row2l = _mm_xor_si128(row2l, row3l); - row2h = _mm_xor_si128(row2h, row3h); - row2l = _mm_xor_si128(_mm_srli_epi64(row2l,24),_mm_slli_epi64(row2l,40)); - row2h = _mm_xor_si128(_mm_srli_epi64(row2h,24),_mm_slli_epi64(row2h,40)); - - b0 = MM_SET_EPI64X(m3, m1); - b1 = MM_SET_EPI64X(m7, m5); - row1l = _mm_add_epi64(_mm_add_epi64(row1l, b0), row2l); - row1h = _mm_add_epi64(_mm_add_epi64(row1h, b1), row2h); - row4l = _mm_xor_si128(row4l, row1l); - row4h = _mm_xor_si128(row4h, row1h); - row4l = _mm_xor_si128(_mm_srli_epi64(row4l,16),_mm_slli_epi64(row4l,48)); - row4h = _mm_xor_si128(_mm_srli_epi64(row4h,16),_mm_slli_epi64(row4h,48)); - row3l = _mm_add_epi64(row3l, row4l); - row3h = _mm_add_epi64(row3h, row4h); - row2l = _mm_xor_si128(row2l, row3l); - row2h = _mm_xor_si128(row2h, row3h); - row2l = _mm_xor_si128(_mm_srli_epi64(row2l,63),_mm_slli_epi64(row2l,1)); - row2h = _mm_xor_si128(_mm_srli_epi64(row2h,63),_mm_slli_epi64(row2h,1)); - - t0 = row4l, t1 = row2l, row4l = row3l, row3l = row3h, row3h = row4l; - row4l = _mm_unpackhi_epi64(row4h, _mm_unpacklo_epi64(t0, t0)); - row4h = _mm_unpackhi_epi64(t0, _mm_unpacklo_epi64(row4h, row4h)); - row2l = _mm_unpackhi_epi64(row2l, _mm_unpacklo_epi64(row2h, row2h)); - row2h = _mm_unpackhi_epi64(row2h, _mm_unpacklo_epi64(t1, t1)); - - b0 = MM_SET_EPI64X(m10, m8); - b1 = MM_SET_EPI64X(m14, m12); - row1l = _mm_add_epi64(_mm_add_epi64(row1l, b0), row2l); - row1h = _mm_add_epi64(_mm_add_epi64(row1h, b1), row2h); - row4l = _mm_xor_si128(row4l, row1l); - row4h = _mm_xor_si128(row4h, row1h); - row4l = _mm_xor_si128(_mm_srli_epi64(row4l,32),_mm_slli_epi64(row4l,32)); - row4h = _mm_xor_si128(_mm_srli_epi64(row4h,32),_mm_slli_epi64(row4h,32)); - row3l = _mm_add_epi64(row3l, row4l); - row3h = _mm_add_epi64(row3h, row4h); - row2l = _mm_xor_si128(row2l, row3l); - row2h = _mm_xor_si128(row2h, row3h); - row2l = _mm_xor_si128(_mm_srli_epi64(row2l,24),_mm_slli_epi64(row2l,40)); - row2h = _mm_xor_si128(_mm_srli_epi64(row2h,24),_mm_slli_epi64(row2h,40)); - - b0 = MM_SET_EPI64X(m11, m9); - b1 = MM_SET_EPI64X(m15, m13); - row1l = _mm_add_epi64(_mm_add_epi64(row1l, b0), row2l); - row1h = _mm_add_epi64(_mm_add_epi64(row1h, b1), row2h); - row4l = _mm_xor_si128(row4l, row1l); - row4h = 
_mm_xor_si128(row4h, row1h); - row4l = _mm_xor_si128(_mm_srli_epi64(row4l,16),_mm_slli_epi64(row4l,48)); - row4h = _mm_xor_si128(_mm_srli_epi64(row4h,16),_mm_slli_epi64(row4h,48)); - row3l = _mm_add_epi64(row3l, row4l); - row3h = _mm_add_epi64(row3h, row4h); - row2l = _mm_xor_si128(row2l, row3l); - row2h = _mm_xor_si128(row2h, row3h); - row2l = _mm_xor_si128(_mm_srli_epi64(row2l,63),_mm_slli_epi64(row2l,1)); - row2h = _mm_xor_si128(_mm_srli_epi64(row2h,63),_mm_slli_epi64(row2h,1)); - - t0 = row3l, row3l = row3h, row3h = t0, t0 = row2l, t1 = row4l; - row2l = _mm_unpackhi_epi64(row2h, _mm_unpacklo_epi64(row2l, row2l)); - row2h = _mm_unpackhi_epi64(t0, _mm_unpacklo_epi64(row2h, row2h)); - row4l = _mm_unpackhi_epi64(row4l, _mm_unpacklo_epi64(row4h, row4h)); - row4h = _mm_unpackhi_epi64(row4h, _mm_unpacklo_epi64(t1, t1)); - - b0 = MM_SET_EPI64X(m4, m14); - b1 = MM_SET_EPI64X(m13, m9); - row1l = _mm_add_epi64(_mm_add_epi64(row1l, b0), row2l); - row1h = _mm_add_epi64(_mm_add_epi64(row1h, b1), row2h); - row4l = _mm_xor_si128(row4l, row1l); - row4h = _mm_xor_si128(row4h, row1h); - row4l = _mm_xor_si128(_mm_srli_epi64(row4l,32),_mm_slli_epi64(row4l,32)); - row4h = _mm_xor_si128(_mm_srli_epi64(row4h,32),_mm_slli_epi64(row4h,32)); - row3l = _mm_add_epi64(row3l, row4l); - row3h = _mm_add_epi64(row3h, row4h); - row2l = _mm_xor_si128(row2l, row3l); - row2h = _mm_xor_si128(row2h, row3h); - row2l = _mm_xor_si128(_mm_srli_epi64(row2l,24),_mm_slli_epi64(row2l,40)); - row2h = _mm_xor_si128(_mm_srli_epi64(row2h,24),_mm_slli_epi64(row2h,40)); - - b0 = MM_SET_EPI64X(m8, m10); - b1 = MM_SET_EPI64X(m6, m15); - row1l = _mm_add_epi64(_mm_add_epi64(row1l, b0), row2l); - row1h = _mm_add_epi64(_mm_add_epi64(row1h, b1), row2h); - row4l = _mm_xor_si128(row4l, row1l); - row4h = _mm_xor_si128(row4h, row1h); - row4l = _mm_xor_si128(_mm_srli_epi64(row4l,16),_mm_slli_epi64(row4l,48)); - row4h = _mm_xor_si128(_mm_srli_epi64(row4h,16),_mm_slli_epi64(row4h,48)); - row3l = _mm_add_epi64(row3l, row4l); - row3h = _mm_add_epi64(row3h, row4h); - row2l = _mm_xor_si128(row2l, row3l); - row2h = _mm_xor_si128(row2h, row3h); - row2l = _mm_xor_si128(_mm_srli_epi64(row2l,63),_mm_slli_epi64(row2l,1)); - row2h = _mm_xor_si128(_mm_srli_epi64(row2h,63),_mm_slli_epi64(row2h,1)); - - t0 = row4l, t1 = row2l, row4l = row3l, row3l = row3h, row3h = row4l; - row4l = _mm_unpackhi_epi64(row4h, _mm_unpacklo_epi64(t0, t0)); - row4h = _mm_unpackhi_epi64(t0, _mm_unpacklo_epi64(row4h, row4h)); - row2l = _mm_unpackhi_epi64(row2l, _mm_unpacklo_epi64(row2h, row2h)); - row2h = _mm_unpackhi_epi64(row2h, _mm_unpacklo_epi64(t1, t1)); - - b0 = MM_SET_EPI64X(m0, m1); - b1 = MM_SET_EPI64X(m5, m11); - row1l = _mm_add_epi64(_mm_add_epi64(row1l, b0), row2l); - row1h = _mm_add_epi64(_mm_add_epi64(row1h, b1), row2h); - row4l = _mm_xor_si128(row4l, row1l); - row4h = _mm_xor_si128(row4h, row1h); - row4l = _mm_xor_si128(_mm_srli_epi64(row4l,32),_mm_slli_epi64(row4l,32)); - row4h = _mm_xor_si128(_mm_srli_epi64(row4h,32),_mm_slli_epi64(row4h,32)); - row3l = _mm_add_epi64(row3l, row4l); - row3h = _mm_add_epi64(row3h, row4h); - row2l = _mm_xor_si128(row2l, row3l); - row2h = _mm_xor_si128(row2h, row3h); - row2l = _mm_xor_si128(_mm_srli_epi64(row2l,24),_mm_slli_epi64(row2l,40)); - row2h = _mm_xor_si128(_mm_srli_epi64(row2h,24),_mm_slli_epi64(row2h,40)); - - b0 = MM_SET_EPI64X(m2, m12); - b1 = MM_SET_EPI64X(m3, m7); - row1l = _mm_add_epi64(_mm_add_epi64(row1l, b0), row2l); - row1h = _mm_add_epi64(_mm_add_epi64(row1h, b1), row2h); - row4l = _mm_xor_si128(row4l, row1l); - row4h = 
_mm_xor_si128(row4h, row1h); - row4l = _mm_xor_si128(_mm_srli_epi64(row4l,16),_mm_slli_epi64(row4l,48)); - row4h = _mm_xor_si128(_mm_srli_epi64(row4h,16),_mm_slli_epi64(row4h,48)); - row3l = _mm_add_epi64(row3l, row4l); - row3h = _mm_add_epi64(row3h, row4h); - row2l = _mm_xor_si128(row2l, row3l); - row2h = _mm_xor_si128(row2h, row3h); - row2l = _mm_xor_si128(_mm_srli_epi64(row2l,63),_mm_slli_epi64(row2l,1)); - row2h = _mm_xor_si128(_mm_srli_epi64(row2h,63),_mm_slli_epi64(row2h,1)); - - t0 = row3l, row3l = row3h, row3h = t0, t0 = row2l, t1 = row4l; - row2l = _mm_unpackhi_epi64(row2h, _mm_unpacklo_epi64(row2l, row2l)); - row2h = _mm_unpackhi_epi64(t0, _mm_unpacklo_epi64(row2h, row2h)); - row4l = _mm_unpackhi_epi64(row4l, _mm_unpacklo_epi64(row4h, row4h)); - row4h = _mm_unpackhi_epi64(row4h, _mm_unpacklo_epi64(t1, t1)); - - row1l = _mm_xor_si128(row3l, row1l); - row1h = _mm_xor_si128(row3h, row1h); - _mm_storeu_si128(M128I_CAST(&state.h[0]), _mm_xor_si128(_mm_loadu_si128(CONST_M128I_CAST(&state.h[0])), row1l)); - _mm_storeu_si128(M128I_CAST(&state.h[2]), _mm_xor_si128(_mm_loadu_si128(CONST_M128I_CAST(&state.h[2])), row1h)); - - row2l = _mm_xor_si128(row4l, row2l); - row2h = _mm_xor_si128(row4h, row2h); - _mm_storeu_si128(M128I_CAST(&state.h[4]), _mm_xor_si128(_mm_loadu_si128(CONST_M128I_CAST(&state.h[4])), row2l)); - _mm_storeu_si128(M128I_CAST(&state.h[6]), _mm_xor_si128(_mm_loadu_si128(CONST_M128I_CAST(&state.h[6])), row2h)); -} -# endif // (__SUNPRO_CC != 0x5120) -#endif // CRYPTOPP_BOOL_SSE2_INTRINSICS_AVAILABLE - -#if CRYPTOPP_BOOL_SSE4_INTRINSICS_AVAILABLE -static void BLAKE2_SSE4_Compress32(const byte* input, BLAKE2_State& state) -{ - __m128i row1, row2, row3, row4; - __m128i buf1, buf2, buf3, buf4; - - __m128i t0, t1, t2; - __m128i ff0, ff1; - - const __m128i r8 = _mm_set_epi8(12, 15, 14, 13, 8, 11, 10, 9, 4, 7, 6, 5, 0, 3, 2, 1); - const __m128i r16 = _mm_set_epi8(13, 12, 15, 14, 9, 8, 11, 10, 5, 4, 7, 6, 1, 0, 3, 2); - - const __m128i m0 = _mm_loadu_si128(CONST_M128I_CAST(input + 00)); - const __m128i m1 = _mm_loadu_si128(CONST_M128I_CAST(input + 16)); - const __m128i m2 = _mm_loadu_si128(CONST_M128I_CAST(input + 32)); - const __m128i m3 = _mm_loadu_si128(CONST_M128I_CAST(input + 48)); - - row1 = ff0 = _mm_loadu_si128(CONST_M128I_CAST(&state.h[0])); - row2 = ff1 = _mm_loadu_si128(CONST_M128I_CAST(&state.h[4])); - row3 = _mm_setr_epi32(BLAKE2S_IV(0), BLAKE2S_IV(1), BLAKE2S_IV(2), BLAKE2S_IV(3)); - row4 = _mm_xor_si128(_mm_setr_epi32(BLAKE2S_IV(4), BLAKE2S_IV(5), BLAKE2S_IV(6), BLAKE2S_IV(7)), _mm_loadu_si128(CONST_M128I_CAST(&state.t[0]))); - buf1 = _mm_castps_si128((_mm_shuffle_ps(_mm_castsi128_ps((m0)), _mm_castsi128_ps((m1)), _MM_SHUFFLE(2,0,2,0)))); - - row1 = _mm_add_epi32(_mm_add_epi32(row1, buf1), row2); - row4 = _mm_xor_si128(row4, row1); - row4 = _mm_shuffle_epi8(row4,r16); - row3 = _mm_add_epi32(row3, row4); - row2 = _mm_xor_si128(row2, row3); - row2 = _mm_xor_si128(_mm_srli_epi32(row2, 12),_mm_slli_epi32(row2, 20)); - - buf2 = _mm_castps_si128((_mm_shuffle_ps(_mm_castsi128_ps((m0)), _mm_castsi128_ps((m1)), _MM_SHUFFLE(3,1,3,1)))); - - row1 = _mm_add_epi32(_mm_add_epi32(row1, buf2), row2); - row4 = _mm_xor_si128(row4, row1); - row4 = _mm_shuffle_epi8(row4,r8); - row3 = _mm_add_epi32(row3, row4); - row2 = _mm_xor_si128(row2, row3); - row2 = _mm_xor_si128(_mm_srli_epi32(row2, 7),_mm_slli_epi32(row2, 25)); - - row4 = _mm_shuffle_epi32(row4, _MM_SHUFFLE(2,1,0,3)); - row3 = _mm_shuffle_epi32(row3, _MM_SHUFFLE(1,0,3,2)); - row2 = _mm_shuffle_epi32(row2, _MM_SHUFFLE(0,3,2,1)); - - 
buf3 = _mm_castps_si128((_mm_shuffle_ps(_mm_castsi128_ps((m2)), _mm_castsi128_ps((m3)), _MM_SHUFFLE(2,0,2,0)))); - - row1 = _mm_add_epi32(_mm_add_epi32(row1, buf3), row2); - row4 = _mm_xor_si128(row4, row1); - row4 = _mm_shuffle_epi8(row4,r16); - row3 = _mm_add_epi32(row3, row4); - row2 = _mm_xor_si128(row2, row3); - row2 = _mm_xor_si128(_mm_srli_epi32(row2, 12),_mm_slli_epi32(row2, 20)); - - buf4 = _mm_castps_si128((_mm_shuffle_ps(_mm_castsi128_ps((m2)), _mm_castsi128_ps((m3)), _MM_SHUFFLE(3,1,3,1)))); - - row1 = _mm_add_epi32(_mm_add_epi32(row1, buf4), row2); - row4 = _mm_xor_si128(row4, row1); - row4 = _mm_shuffle_epi8(row4,r8); - row3 = _mm_add_epi32(row3, row4); - row2 = _mm_xor_si128(row2, row3); - row2 = _mm_xor_si128(_mm_srli_epi32(row2, 7),_mm_slli_epi32(row2, 25)); - - row4 = _mm_shuffle_epi32(row4, _MM_SHUFFLE(0,3,2,1)); - row3 = _mm_shuffle_epi32(row3, _MM_SHUFFLE(1,0,3,2)); - row2 = _mm_shuffle_epi32(row2, _MM_SHUFFLE(2,1,0,3)); - - t0 = _mm_blend_epi16(m1, m2, 0x0C); - t1 = _mm_slli_si128(m3, 4); - t2 = _mm_blend_epi16(t0, t1, 0xF0); - buf1 = _mm_shuffle_epi32(t2, _MM_SHUFFLE(2,1,0,3)); - - row1 = _mm_add_epi32(_mm_add_epi32(row1, buf1), row2); - row4 = _mm_xor_si128(row4, row1); - row4 = _mm_shuffle_epi8(row4,r16); - row3 = _mm_add_epi32(row3, row4); - row2 = _mm_xor_si128(row2, row3); - row2 = _mm_xor_si128(_mm_srli_epi32(row2, 12),_mm_slli_epi32(row2, 20)); - - t0 = _mm_shuffle_epi32(m2,_MM_SHUFFLE(0,0,2,0)); - t1 = _mm_blend_epi16(m1,m3,0xC0); - t2 = _mm_blend_epi16(t0, t1, 0xF0); - buf2 = _mm_shuffle_epi32(t2, _MM_SHUFFLE(2,3,0,1)); - - row1 = _mm_add_epi32(_mm_add_epi32(row1, buf2), row2); - row4 = _mm_xor_si128(row4, row1); - row4 = _mm_shuffle_epi8(row4,r8); - row3 = _mm_add_epi32(row3, row4); - row2 = _mm_xor_si128(row2, row3); - row2 = _mm_xor_si128(_mm_srli_epi32(row2, 7),_mm_slli_epi32(row2, 25)); - - row4 = _mm_shuffle_epi32(row4, _MM_SHUFFLE(2,1,0,3)); - row3 = _mm_shuffle_epi32(row3, _MM_SHUFFLE(1,0,3,2)); - row2 = _mm_shuffle_epi32(row2, _MM_SHUFFLE(0,3,2,1)); - - t0 = _mm_slli_si128(m1, 4); - t1 = _mm_blend_epi16(m2, t0, 0x30); - t2 = _mm_blend_epi16(m0, t1, 0xF0); - buf3 = _mm_shuffle_epi32(t2, _MM_SHUFFLE(2,3,0,1)); - - row1 = _mm_add_epi32(_mm_add_epi32(row1, buf3), row2); - row4 = _mm_xor_si128(row4, row1); - row4 = _mm_shuffle_epi8(row4,r16); - row3 = _mm_add_epi32(row3, row4); - row2 = _mm_xor_si128(row2, row3); - row2 = _mm_xor_si128(_mm_srli_epi32(row2, 12),_mm_slli_epi32(row2, 20)); - - t0 = _mm_unpackhi_epi32(m0,m1); - t1 = _mm_slli_si128(m3, 4); - t2 = _mm_blend_epi16(t0, t1, 0x0C); - buf4 = _mm_shuffle_epi32(t2, _MM_SHUFFLE(2,3,0,1)); - - row1 = _mm_add_epi32(_mm_add_epi32(row1, buf4), row2); - row4 = _mm_xor_si128(row4, row1); - row4 = _mm_shuffle_epi8(row4,r8); - row3 = _mm_add_epi32(row3, row4); - row2 = _mm_xor_si128(row2, row3); - row2 = _mm_xor_si128(_mm_srli_epi32(row2, 7),_mm_slli_epi32(row2, 25)); - - row4 = _mm_shuffle_epi32(row4, _MM_SHUFFLE(0,3,2,1)); - row3 = _mm_shuffle_epi32(row3, _MM_SHUFFLE(1,0,3,2)); - row2 = _mm_shuffle_epi32(row2, _MM_SHUFFLE(2,1,0,3)); - - t0 = _mm_unpackhi_epi32(m2,m3); - t1 = _mm_blend_epi16(m3,m1,0x0C); - t2 = _mm_blend_epi16(t0, t1, 0x0F); - buf1 = _mm_shuffle_epi32(t2, _MM_SHUFFLE(3,1,0,2)); - - row1 = _mm_add_epi32(_mm_add_epi32(row1, buf1), row2); - row4 = _mm_xor_si128(row4, row1); - row4 = _mm_shuffle_epi8(row4,r16); - row3 = _mm_add_epi32(row3, row4); - row2 = _mm_xor_si128(row2, row3); - row2 = _mm_xor_si128(_mm_srli_epi32(row2, 12),_mm_slli_epi32(row2, 20)); - - t0 = _mm_unpacklo_epi32(m2,m0); - t1 
= _mm_blend_epi16(t0, m0, 0xF0); - t2 = _mm_slli_si128(m3, 8); - buf2 = _mm_blend_epi16(t1, t2, 0xC0); - - row1 = _mm_add_epi32(_mm_add_epi32(row1, buf2), row2); - row4 = _mm_xor_si128(row4, row1); - row4 = _mm_shuffle_epi8(row4,r8); - row3 = _mm_add_epi32(row3, row4); - row2 = _mm_xor_si128(row2, row3); - row2 = _mm_xor_si128(_mm_srli_epi32(row2, 7),_mm_slli_epi32(row2, 25)); - - row4 = _mm_shuffle_epi32(row4, _MM_SHUFFLE(2,1,0,3)); - row3 = _mm_shuffle_epi32(row3, _MM_SHUFFLE(1,0,3,2)); - row2 = _mm_shuffle_epi32(row2, _MM_SHUFFLE(0,3,2,1)); - - t0 = _mm_blend_epi16(m0, m2, 0x3C); - t1 = _mm_srli_si128(m1, 12); - t2 = _mm_blend_epi16(t0,t1,0x03); - buf3 = _mm_shuffle_epi32(t2, _MM_SHUFFLE(1,0,3,2)); - - row1 = _mm_add_epi32(_mm_add_epi32(row1, buf3), row2); - row4 = _mm_xor_si128(row4, row1); - row4 = _mm_shuffle_epi8(row4,r16); - row3 = _mm_add_epi32(row3, row4); - row2 = _mm_xor_si128(row2, row3); - row2 = _mm_xor_si128(_mm_srli_epi32(row2, 12),_mm_slli_epi32(row2, 20)); - - t0 = _mm_slli_si128(m3, 4); - t1 = _mm_blend_epi16(m0, m1, 0x33); - t2 = _mm_blend_epi16(t1, t0, 0xC0); - buf4 = _mm_shuffle_epi32(t2, _MM_SHUFFLE(0,1,2,3)); - - row1 = _mm_add_epi32(_mm_add_epi32(row1, buf4), row2); - row4 = _mm_xor_si128(row4, row1); - row4 = _mm_shuffle_epi8(row4,r8); - row3 = _mm_add_epi32(row3, row4); - row2 = _mm_xor_si128(row2, row3); - row2 = _mm_xor_si128(_mm_srli_epi32(row2, 7),_mm_slli_epi32(row2, 25)); - - row4 = _mm_shuffle_epi32(row4, _MM_SHUFFLE(0,3,2,1)); - row3 = _mm_shuffle_epi32(row3, _MM_SHUFFLE(1,0,3,2)); - row2 = _mm_shuffle_epi32(row2, _MM_SHUFFLE(2,1,0,3)); - - t0 = _mm_unpackhi_epi32(m0,m1); - t1 = _mm_unpackhi_epi32(t0, m2); - t2 = _mm_blend_epi16(t1, m3, 0x0C); - buf1 = _mm_shuffle_epi32(t2, _MM_SHUFFLE(3,1,0,2)); - - row1 = _mm_add_epi32(_mm_add_epi32(row1, buf1), row2); - row4 = _mm_xor_si128(row4, row1); - row4 = _mm_shuffle_epi8(row4,r16); - row3 = _mm_add_epi32(row3, row4); - row2 = _mm_xor_si128(row2, row3); - row2 = _mm_xor_si128(_mm_srli_epi32(row2, 12),_mm_slli_epi32(row2, 20)); - - t0 = _mm_slli_si128(m2, 8); - t1 = _mm_blend_epi16(m3,m0,0x0C); - t2 = _mm_blend_epi16(t1, t0, 0xC0); - buf2 = _mm_shuffle_epi32(t2, _MM_SHUFFLE(2,0,1,3)); - - row1 = _mm_add_epi32(_mm_add_epi32(row1, buf2), row2); - row4 = _mm_xor_si128(row4, row1); - row4 = _mm_shuffle_epi8(row4,r8); - row3 = _mm_add_epi32(row3, row4); - row2 = _mm_xor_si128(row2, row3); - row2 = _mm_xor_si128(_mm_srli_epi32(row2, 7),_mm_slli_epi32(row2, 25)); - - row4 = _mm_shuffle_epi32(row4, _MM_SHUFFLE(2,1,0,3)); - row3 = _mm_shuffle_epi32(row3, _MM_SHUFFLE(1,0,3,2)); - row2 = _mm_shuffle_epi32(row2, _MM_SHUFFLE(0,3,2,1)); - - t0 = _mm_blend_epi16(m0,m1,0x0F); - t1 = _mm_blend_epi16(t0, m3, 0xC0); - buf3 = _mm_shuffle_epi32(t1, _MM_SHUFFLE(3,0,1,2)); - - row1 = _mm_add_epi32(_mm_add_epi32(row1, buf3), row2); - row4 = _mm_xor_si128(row4, row1); - row4 = _mm_shuffle_epi8(row4,r16); - row3 = _mm_add_epi32(row3, row4); - row2 = _mm_xor_si128(row2, row3); - row2 = _mm_xor_si128(_mm_srli_epi32(row2, 12),_mm_slli_epi32(row2, 20)); - - t0 = _mm_unpacklo_epi32(m0,m2); - t1 = _mm_unpackhi_epi32(m1,m2); - buf4 = _mm_unpacklo_epi64(t1,t0); - - row1 = _mm_add_epi32(_mm_add_epi32(row1, buf4), row2); - row4 = _mm_xor_si128(row4, row1); - row4 = _mm_shuffle_epi8(row4,r8); - row3 = _mm_add_epi32(row3, row4); - row2 = _mm_xor_si128(row2, row3); - row2 = _mm_xor_si128(_mm_srli_epi32(row2, 7),_mm_slli_epi32(row2, 25)); - - row4 = _mm_shuffle_epi32(row4, _MM_SHUFFLE(0,3,2,1)); - row3 = _mm_shuffle_epi32(row3, _MM_SHUFFLE(1,0,3,2)); 
- row2 = _mm_shuffle_epi32(row2, _MM_SHUFFLE(2,1,0,3)); - - t0 = _mm_unpacklo_epi64(m1,m2); - t1 = _mm_unpackhi_epi64(m0,m2); - t2 = _mm_blend_epi16(t0,t1,0x33); - buf1 = _mm_shuffle_epi32(t2, _MM_SHUFFLE(2,0,1,3)); - - row1 = _mm_add_epi32(_mm_add_epi32(row1, buf1), row2); - row4 = _mm_xor_si128(row4, row1); - row4 = _mm_shuffle_epi8(row4,r16); - row3 = _mm_add_epi32(row3, row4); - row2 = _mm_xor_si128(row2, row3); - row2 = _mm_xor_si128(_mm_srli_epi32(row2, 12),_mm_slli_epi32(row2, 20)); - - t0 = _mm_unpackhi_epi64(m1,m3); - t1 = _mm_unpacklo_epi64(m0,m1); - buf2 = _mm_blend_epi16(t0,t1,0x33); - - row1 = _mm_add_epi32(_mm_add_epi32(row1, buf2), row2); - row4 = _mm_xor_si128(row4, row1); - row4 = _mm_shuffle_epi8(row4,r8); - row3 = _mm_add_epi32(row3, row4); - row2 = _mm_xor_si128(row2, row3); - row2 = _mm_xor_si128(_mm_srli_epi32(row2, 7),_mm_slli_epi32(row2, 25)); - - row4 = _mm_shuffle_epi32(row4, _MM_SHUFFLE(2,1,0,3)); - row3 = _mm_shuffle_epi32(row3, _MM_SHUFFLE(1,0,3,2)); - row2 = _mm_shuffle_epi32(row2, _MM_SHUFFLE(0,3,2,1)); - - t0 = _mm_unpackhi_epi64(m3,m1); - t1 = _mm_unpackhi_epi64(m2,m0); - buf3 = _mm_blend_epi16(t1,t0,0x33); - - row1 = _mm_add_epi32(_mm_add_epi32(row1, buf3), row2); - row4 = _mm_xor_si128(row4, row1); - row4 = _mm_shuffle_epi8(row4,r16); - row3 = _mm_add_epi32(row3, row4); - row2 = _mm_xor_si128(row2, row3); - row2 = _mm_xor_si128(_mm_srli_epi32(row2, 12),_mm_slli_epi32(row2, 20)); - - t0 = _mm_blend_epi16(m0,m2,0x03); - t1 = _mm_slli_si128(t0, 8); - t2 = _mm_blend_epi16(t1,m3,0x0F); - buf4 = _mm_shuffle_epi32(t2, _MM_SHUFFLE(1,2,0,3)); - - row1 = _mm_add_epi32(_mm_add_epi32(row1, buf4), row2); - row4 = _mm_xor_si128(row4, row1); - row4 = _mm_shuffle_epi8(row4,r8); - row3 = _mm_add_epi32(row3, row4); - row2 = _mm_xor_si128(row2, row3); - row2 = _mm_xor_si128(_mm_srli_epi32(row2, 7),_mm_slli_epi32(row2, 25)); - - row4 = _mm_shuffle_epi32(row4, _MM_SHUFFLE(0,3,2,1)); - row3 = _mm_shuffle_epi32(row3, _MM_SHUFFLE(1,0,3,2)); - row2 = _mm_shuffle_epi32(row2, _MM_SHUFFLE(2,1,0,3)); - - t0 = _mm_unpackhi_epi32(m0,m1); - t1 = _mm_unpacklo_epi32(m0,m2); - buf1 = _mm_unpacklo_epi64(t0,t1); - - row1 = _mm_add_epi32(_mm_add_epi32(row1, buf1), row2); - row4 = _mm_xor_si128(row4, row1); - row4 = _mm_shuffle_epi8(row4,r16); - row3 = _mm_add_epi32(row3, row4); - row2 = _mm_xor_si128(row2, row3); - row2 = _mm_xor_si128(_mm_srli_epi32(row2, 12),_mm_slli_epi32(row2, 20)); - - t0 = _mm_srli_si128(m2, 4); - t1 = _mm_blend_epi16(m0,m3,0x03); - buf2 = _mm_blend_epi16(t1,t0,0x3C); - - row1 = _mm_add_epi32(_mm_add_epi32(row1, buf2), row2); - row4 = _mm_xor_si128(row4, row1); - row4 = _mm_shuffle_epi8(row4,r8); - row3 = _mm_add_epi32(row3, row4); - row2 = _mm_xor_si128(row2, row3); - row2 = _mm_xor_si128(_mm_srli_epi32(row2, 7),_mm_slli_epi32(row2, 25)); - - row4 = _mm_shuffle_epi32(row4, _MM_SHUFFLE(2,1,0,3)); - row3 = _mm_shuffle_epi32(row3, _MM_SHUFFLE(1,0,3,2)); - row2 = _mm_shuffle_epi32(row2, _MM_SHUFFLE(0,3,2,1)); - - t0 = _mm_blend_epi16(m1,m0,0x0C); - t1 = _mm_srli_si128(m3, 4); - t2 = _mm_blend_epi16(t0,t1,0x30); - buf3 = _mm_shuffle_epi32(t2, _MM_SHUFFLE(1,2,3,0)); - - row1 = _mm_add_epi32(_mm_add_epi32(row1, buf3), row2); - row4 = _mm_xor_si128(row4, row1); - row4 = _mm_shuffle_epi8(row4,r16); - row3 = _mm_add_epi32(row3, row4); - row2 = _mm_xor_si128(row2, row3); - row2 = _mm_xor_si128(_mm_srli_epi32(row2, 12),_mm_slli_epi32(row2, 20)); - - t0 = _mm_unpacklo_epi64(m1,m2); - t1= _mm_shuffle_epi32(m3, _MM_SHUFFLE(0,2,0,1)); - buf4 = _mm_blend_epi16(t0,t1,0x33); - - row1 = 
_mm_add_epi32(_mm_add_epi32(row1, buf4), row2); - row4 = _mm_xor_si128(row4, row1); - row4 = _mm_shuffle_epi8(row4,r8); - row3 = _mm_add_epi32(row3, row4); - row2 = _mm_xor_si128(row2, row3); - row2 = _mm_xor_si128(_mm_srli_epi32(row2, 7),_mm_slli_epi32(row2, 25)); - - row4 = _mm_shuffle_epi32(row4, _MM_SHUFFLE(0,3,2,1)); - row3 = _mm_shuffle_epi32(row3, _MM_SHUFFLE(1,0,3,2)); - row2 = _mm_shuffle_epi32(row2, _MM_SHUFFLE(2,1,0,3)); - - t0 = _mm_slli_si128(m1, 12); - t1 = _mm_blend_epi16(m0,m3,0x33); - buf1 = _mm_blend_epi16(t1,t0,0xC0); - - row1 = _mm_add_epi32(_mm_add_epi32(row1, buf1), row2); - row4 = _mm_xor_si128(row4, row1); - row4 = _mm_shuffle_epi8(row4,r16); - row3 = _mm_add_epi32(row3, row4); - row2 = _mm_xor_si128(row2, row3); - row2 = _mm_xor_si128(_mm_srli_epi32(row2, 12),_mm_slli_epi32(row2, 20)); - - t0 = _mm_blend_epi16(m3,m2,0x30); - t1 = _mm_srli_si128(m1, 4); - t2 = _mm_blend_epi16(t0,t1,0x03); - buf2 = _mm_shuffle_epi32(t2, _MM_SHUFFLE(2,1,3,0)); - - row1 = _mm_add_epi32(_mm_add_epi32(row1, buf2), row2); - row4 = _mm_xor_si128(row4, row1); - row4 = _mm_shuffle_epi8(row4,r8); - row3 = _mm_add_epi32(row3, row4); - row2 = _mm_xor_si128(row2, row3); - row2 = _mm_xor_si128(_mm_srli_epi32(row2, 7),_mm_slli_epi32(row2, 25)); - - row4 = _mm_shuffle_epi32(row4, _MM_SHUFFLE(2,1,0,3)); - row3 = _mm_shuffle_epi32(row3, _MM_SHUFFLE(1,0,3,2)); - row2 = _mm_shuffle_epi32(row2, _MM_SHUFFLE(0,3,2,1)); - - t0 = _mm_unpacklo_epi64(m0,m2); - t1 = _mm_srli_si128(m1, 4); - buf3 = _mm_shuffle_epi32(_mm_blend_epi16(t0,t1,0x0C), _MM_SHUFFLE(2,3,1,0)); - - row1 = _mm_add_epi32(_mm_add_epi32(row1, buf3), row2); - row4 = _mm_xor_si128(row4, row1); - row4 = _mm_shuffle_epi8(row4,r16); - row3 = _mm_add_epi32(row3, row4); - row2 = _mm_xor_si128(row2, row3); - row2 = _mm_xor_si128(_mm_srli_epi32(row2, 12),_mm_slli_epi32(row2, 20)); - - t0 = _mm_unpackhi_epi32(m1,m2); - t1 = _mm_unpackhi_epi64(m0,t0); - buf4 = _mm_shuffle_epi32(t1, _MM_SHUFFLE(3,0,1,2)); - - row1 = _mm_add_epi32(_mm_add_epi32(row1, buf4), row2); - row4 = _mm_xor_si128(row4, row1); - row4 = _mm_shuffle_epi8(row4,r8); - row3 = _mm_add_epi32(row3, row4); - row2 = _mm_xor_si128(row2, row3); - row2 = _mm_xor_si128(_mm_srli_epi32(row2, 7),_mm_slli_epi32(row2, 25)); - - row4 = _mm_shuffle_epi32(row4, _MM_SHUFFLE(0,3,2,1)); - row3 = _mm_shuffle_epi32(row3, _MM_SHUFFLE(1,0,3,2)); - row2 = _mm_shuffle_epi32(row2, _MM_SHUFFLE(2,1,0,3)); - - t0 = _mm_unpackhi_epi32(m0,m1); - t1 = _mm_blend_epi16(t0,m3,0x0F); - buf1 = _mm_shuffle_epi32(t1,_MM_SHUFFLE(2,0,3,1)); - - row1 = _mm_add_epi32(_mm_add_epi32(row1, buf1), row2); - row4 = _mm_xor_si128(row4, row1); - row4 = _mm_shuffle_epi8(row4,r16); - row3 = _mm_add_epi32(row3, row4); - row2 = _mm_xor_si128(row2, row3); - row2 = _mm_xor_si128(_mm_srli_epi32(row2, 12),_mm_slli_epi32(row2, 20)); - - t0 = _mm_blend_epi16(m2,m3,0x30); - t1 = _mm_srli_si128(m0,4); - t2 = _mm_blend_epi16(t0,t1,0x03); - buf2 = _mm_shuffle_epi32(t2, _MM_SHUFFLE(1,0,2,3)); - - row1 = _mm_add_epi32(_mm_add_epi32(row1, buf2), row2); - row4 = _mm_xor_si128(row4, row1); - row4 = _mm_shuffle_epi8(row4,r8); - row3 = _mm_add_epi32(row3, row4); - row2 = _mm_xor_si128(row2, row3); - row2 = _mm_xor_si128(_mm_srli_epi32(row2, 7),_mm_slli_epi32(row2, 25)); - - row4 = _mm_shuffle_epi32(row4, _MM_SHUFFLE(2,1,0,3)); - row3 = _mm_shuffle_epi32(row3, _MM_SHUFFLE(1,0,3,2)); - row2 = _mm_shuffle_epi32(row2, _MM_SHUFFLE(0,3,2,1)); - - t0 = _mm_unpackhi_epi64(m0,m3); - t1 = _mm_unpacklo_epi64(m1,m2); - t2 = _mm_blend_epi16(t0,t1,0x3C); - buf3 = 
_mm_shuffle_epi32(t2,_MM_SHUFFLE(0,2,3,1)); - - row1 = _mm_add_epi32(_mm_add_epi32(row1, buf3), row2); - row4 = _mm_xor_si128(row4, row1); - row4 = _mm_shuffle_epi8(row4,r16); - row3 = _mm_add_epi32(row3, row4); - row2 = _mm_xor_si128(row2, row3); - row2 = _mm_xor_si128(_mm_srli_epi32(row2, 12),_mm_slli_epi32(row2, 20)); - - t0 = _mm_unpacklo_epi32(m0,m1); - t1 = _mm_unpackhi_epi32(m1,m2); - buf4 = _mm_unpacklo_epi64(t0,t1); - - row1 = _mm_add_epi32(_mm_add_epi32(row1, buf4), row2); - row4 = _mm_xor_si128(row4, row1); - row4 = _mm_shuffle_epi8(row4,r8); - row3 = _mm_add_epi32(row3, row4); - row2 = _mm_xor_si128(row2, row3); - row2 = _mm_xor_si128(_mm_srli_epi32(row2, 7),_mm_slli_epi32(row2, 25)); - - row4 = _mm_shuffle_epi32(row4, _MM_SHUFFLE(0,3,2,1)); - row3 = _mm_shuffle_epi32(row3, _MM_SHUFFLE(1,0,3,2)); - row2 = _mm_shuffle_epi32(row2, _MM_SHUFFLE(2,1,0,3)); - - t0 = _mm_unpackhi_epi32(m1,m3); - t1 = _mm_unpacklo_epi64(t0,m0); - t2 = _mm_blend_epi16(t1,m2,0xC0); - buf1 = _mm_shufflehi_epi16(t2,_MM_SHUFFLE(1,0,3,2)); - - row1 = _mm_add_epi32(_mm_add_epi32(row1, buf1), row2); - row4 = _mm_xor_si128(row4, row1); - row4 = _mm_shuffle_epi8(row4,r16); - row3 = _mm_add_epi32(row3, row4); - row2 = _mm_xor_si128(row2, row3); - row2 = _mm_xor_si128(_mm_srli_epi32(row2, 12),_mm_slli_epi32(row2, 20)); - - t0 = _mm_unpackhi_epi32(m0,m3); - t1 = _mm_blend_epi16(m2,t0,0xF0); - buf2 = _mm_shuffle_epi32(t1,_MM_SHUFFLE(0,2,1,3)); - - row1 = _mm_add_epi32(_mm_add_epi32(row1, buf2), row2); - row4 = _mm_xor_si128(row4, row1); - row4 = _mm_shuffle_epi8(row4,r8); - row3 = _mm_add_epi32(row3, row4); - row2 = _mm_xor_si128(row2, row3); - row2 = _mm_xor_si128(_mm_srli_epi32(row2, 7),_mm_slli_epi32(row2, 25)); - - row4 = _mm_shuffle_epi32(row4, _MM_SHUFFLE(2,1,0,3)); - row3 = _mm_shuffle_epi32(row3, _MM_SHUFFLE(1,0,3,2)); - row2 = _mm_shuffle_epi32(row2, _MM_SHUFFLE(0,3,2,1)); - - t0 = _mm_blend_epi16(m2,m0,0x0C); - t1 = _mm_slli_si128(t0,4); - buf3 = _mm_blend_epi16(t1,m3,0x0F); - - row1 = _mm_add_epi32(_mm_add_epi32(row1, buf3), row2); - row4 = _mm_xor_si128(row4, row1); - row4 = _mm_shuffle_epi8(row4,r16); - row3 = _mm_add_epi32(row3, row4); - row2 = _mm_xor_si128(row2, row3); - row2 = _mm_xor_si128(_mm_srli_epi32(row2, 12),_mm_slli_epi32(row2, 20)); - - t0 = _mm_blend_epi16(m1,m0,0x30); - buf4 = _mm_shuffle_epi32(t0,_MM_SHUFFLE(1,0,3,2)); - - row1 = _mm_add_epi32(_mm_add_epi32(row1, buf4), row2); - row4 = _mm_xor_si128(row4, row1); - row4 = _mm_shuffle_epi8(row4,r8); - row3 = _mm_add_epi32(row3, row4); - row2 = _mm_xor_si128(row2, row3); - row2 = _mm_xor_si128(_mm_srli_epi32(row2, 7),_mm_slli_epi32(row2, 25)); - - row4 = _mm_shuffle_epi32(row4, _MM_SHUFFLE(0,3,2,1)); - row3 = _mm_shuffle_epi32(row3, _MM_SHUFFLE(1,0,3,2)); - row2 = _mm_shuffle_epi32(row2, _MM_SHUFFLE(2,1,0,3)); - - t0 = _mm_blend_epi16(m0,m2,0x03); - t1 = _mm_blend_epi16(m1,m2,0x30); - t2 = _mm_blend_epi16(t1,t0,0x0F); - buf1 = _mm_shuffle_epi32(t2,_MM_SHUFFLE(1,3,0,2)); - - row1 = _mm_add_epi32(_mm_add_epi32(row1, buf1), row2); - row4 = _mm_xor_si128(row4, row1); - row4 = _mm_shuffle_epi8(row4,r16); - row3 = _mm_add_epi32(row3, row4); - row2 = _mm_xor_si128(row2, row3); - row2 = _mm_xor_si128(_mm_srli_epi32(row2, 12),_mm_slli_epi32(row2, 20)); - - t0 = _mm_slli_si128(m0,4); - t1 = _mm_blend_epi16(m1,t0,0xC0); - buf2 = _mm_shuffle_epi32(t1,_MM_SHUFFLE(1,2,0,3)); - - row1 = _mm_add_epi32(_mm_add_epi32(row1, buf2), row2); - row4 = _mm_xor_si128(row4, row1); - row4 = _mm_shuffle_epi8(row4,r8); - row3 = _mm_add_epi32(row3, row4); - row2 = 
_mm_xor_si128(row2, row3); - row2 = _mm_xor_si128(_mm_srli_epi32(row2, 7),_mm_slli_epi32(row2, 25)); - - row4 = _mm_shuffle_epi32(row4, _MM_SHUFFLE(2,1,0,3)); - row3 = _mm_shuffle_epi32(row3, _MM_SHUFFLE(1,0,3,2)); - row2 = _mm_shuffle_epi32(row2, _MM_SHUFFLE(0,3,2,1)); - - t0 = _mm_unpackhi_epi32(m0,m3); - t1 = _mm_unpacklo_epi32(m2,m3); - t2 = _mm_unpackhi_epi64(t0,t1); - buf3 = _mm_shuffle_epi32(t2,_MM_SHUFFLE(3,0,2,1)); - - row1 = _mm_add_epi32(_mm_add_epi32(row1, buf3), row2); - row4 = _mm_xor_si128(row4, row1); - row4 = _mm_shuffle_epi8(row4,r16); - row3 = _mm_add_epi32(row3, row4); - row2 = _mm_xor_si128(row2, row3); - row2 = _mm_xor_si128(_mm_srli_epi32(row2, 12),_mm_slli_epi32(row2, 20)); - - t0 = _mm_blend_epi16(m3,m2,0xC0); - t1 = _mm_unpacklo_epi32(m0,m3); - t2 = _mm_blend_epi16(t0,t1,0x0F); - buf4 = _mm_shuffle_epi32(t2,_MM_SHUFFLE(0,1,2,3)); - - row1 = _mm_add_epi32(_mm_add_epi32(row1, buf4), row2); - row4 = _mm_xor_si128(row4, row1); - row4 = _mm_shuffle_epi8(row4,r8); - row3 = _mm_add_epi32(row3, row4); - row2 = _mm_xor_si128(row2, row3); - row2 = _mm_xor_si128(_mm_srli_epi32(row2, 7),_mm_slli_epi32(row2, 25)); - - row4 = _mm_shuffle_epi32(row4, _MM_SHUFFLE(0,3,2,1)); - row3 = _mm_shuffle_epi32(row3, _MM_SHUFFLE(1,0,3,2)); - row2 = _mm_shuffle_epi32(row2, _MM_SHUFFLE(2,1,0,3)); - - _mm_storeu_si128(M128I_CAST(&state.h[0]), _mm_xor_si128(ff0, _mm_xor_si128(row1, row3))); - _mm_storeu_si128(M128I_CAST(&state.h[4]), _mm_xor_si128(ff1, _mm_xor_si128(row2, row4))); -} - -static void BLAKE2_SSE4_Compress64(const byte* input, BLAKE2_State& state) -{ - __m128i row1l, row1h; - __m128i row2l, row2h; - __m128i row3l, row3h; - __m128i row4l, row4h; - __m128i b0, b1, t0, t1; - - const __m128i r16 = _mm_setr_epi8(2, 3, 4, 5, 6, 7, 0, 1, 10, 11, 12, 13, 14, 15, 8, 9); - const __m128i r24 = _mm_setr_epi8(3, 4, 5, 6, 7, 0, 1, 2, 11, 12, 13, 14, 15, 8, 9, 10); - - const __m128i m0 = _mm_loadu_si128(CONST_M128I_CAST(input + 00)); - const __m128i m1 = _mm_loadu_si128(CONST_M128I_CAST(input + 16)); - const __m128i m2 = _mm_loadu_si128(CONST_M128I_CAST(input + 32)); - const __m128i m3 = _mm_loadu_si128(CONST_M128I_CAST(input + 48)); - const __m128i m4 = _mm_loadu_si128(CONST_M128I_CAST(input + 64)); - const __m128i m5 = _mm_loadu_si128(CONST_M128I_CAST(input + 80)); - const __m128i m6 = _mm_loadu_si128(CONST_M128I_CAST(input + 96)); - const __m128i m7 = _mm_loadu_si128(CONST_M128I_CAST(input + 112)); - - row1l = _mm_loadu_si128(CONST_M128I_CAST(&state.h[0])); - row1h = _mm_loadu_si128(CONST_M128I_CAST(&state.h[2])); - row2l = _mm_loadu_si128(CONST_M128I_CAST(&state.h[4])); - row2h = _mm_loadu_si128(CONST_M128I_CAST(&state.h[6])); - row3l = _mm_loadu_si128(CONST_M128I_CAST(&BLAKE2B_IV(0))); - row3h = _mm_loadu_si128(CONST_M128I_CAST(&BLAKE2B_IV(2))); - row4l = _mm_xor_si128(_mm_loadu_si128(CONST_M128I_CAST(&BLAKE2B_IV(4))), _mm_loadu_si128(CONST_M128I_CAST(&state.t[0]))); - row4h = _mm_xor_si128(_mm_loadu_si128(CONST_M128I_CAST(&BLAKE2B_IV(6))), _mm_loadu_si128(CONST_M128I_CAST(&state.f[0]))); - - b0 = _mm_unpacklo_epi64(m0, m1); - b1 = _mm_unpacklo_epi64(m2, m3); - row1l = _mm_add_epi64(_mm_add_epi64(row1l, b0), row2l); - row1h = _mm_add_epi64(_mm_add_epi64(row1h, b1), row2h); - row4l = _mm_xor_si128(row4l, row1l); - row4h = _mm_xor_si128(row4h, row1h); - row4l = _mm_shuffle_epi32(row4l, _MM_SHUFFLE(2,3,0,1)); - row4h = _mm_shuffle_epi32(row4h, _MM_SHUFFLE(2,3,0,1)); - row3l = _mm_add_epi64(row3l, row4l); - row3h = _mm_add_epi64(row3h, row4h); - row2l = _mm_xor_si128(row2l, row3l); - row2h = 
_mm_xor_si128(row2h, row3h); - row2l = _mm_shuffle_epi8(row2l, r24); - row2h = _mm_shuffle_epi8(row2h, r24); - - b0 = _mm_unpackhi_epi64(m0, m1); - b1 = _mm_unpackhi_epi64(m2, m3); - - row1l = _mm_add_epi64(_mm_add_epi64(row1l, b0), row2l); - row1h = _mm_add_epi64(_mm_add_epi64(row1h, b1), row2h); - row4l = _mm_xor_si128(row4l, row1l); - row4h = _mm_xor_si128(row4h, row1h); - row4l = _mm_shuffle_epi8(row4l, r16); - row4h = _mm_shuffle_epi8(row4h, r16); - row3l = _mm_add_epi64(row3l, row4l); - row3h = _mm_add_epi64(row3h, row4h); - row2l = _mm_xor_si128(row2l, row3l); - row2h = _mm_xor_si128(row2h, row3h); - row2l = _mm_xor_si128(_mm_srli_epi64(row2l, 63), _mm_add_epi64(row2l, row2l)); - row2h = _mm_xor_si128(_mm_srli_epi64(row2h, 63), _mm_add_epi64(row2h, row2h)); - - t0 = _mm_alignr_epi8(row2h, row2l, 8); - t1 = _mm_alignr_epi8(row2l, row2h, 8); - row2l = t0, row2h = t1, t0 = row3l, row3l = row3h, row3h = t0; - t0 = _mm_alignr_epi8(row4h, row4l, 8); - t1 = _mm_alignr_epi8(row4l, row4h, 8); - row4l = t1, row4h = t0; - - b0 = _mm_unpacklo_epi64(m4, m5); - b1 = _mm_unpacklo_epi64(m6, m7); - - row1l = _mm_add_epi64(_mm_add_epi64(row1l, b0), row2l); - row1h = _mm_add_epi64(_mm_add_epi64(row1h, b1), row2h); - row4l = _mm_xor_si128(row4l, row1l); - row4h = _mm_xor_si128(row4h, row1h); - row4l = _mm_shuffle_epi32(row4l, _MM_SHUFFLE(2,3,0,1)); - row4h = _mm_shuffle_epi32(row4h, _MM_SHUFFLE(2,3,0,1)); - row3l = _mm_add_epi64(row3l, row4l); - row3h = _mm_add_epi64(row3h, row4h); - row2l = _mm_xor_si128(row2l, row3l); - row2h = _mm_xor_si128(row2h, row3h); - row2l = _mm_shuffle_epi8(row2l, r24); - row2h = _mm_shuffle_epi8(row2h, r24); - - b0 = _mm_unpackhi_epi64(m4, m5); - b1 = _mm_unpackhi_epi64(m6, m7); - - row1l = _mm_add_epi64(_mm_add_epi64(row1l, b0), row2l); - row1h = _mm_add_epi64(_mm_add_epi64(row1h, b1), row2h); - row4l = _mm_xor_si128(row4l, row1l); - row4h = _mm_xor_si128(row4h, row1h); - row4l = _mm_shuffle_epi8(row4l, r16); - row4h = _mm_shuffle_epi8(row4h, r16); - row3l = _mm_add_epi64(row3l, row4l); - row3h = _mm_add_epi64(row3h, row4h); - row2l = _mm_xor_si128(row2l, row3l); - row2h = _mm_xor_si128(row2h, row3h); - row2l = _mm_xor_si128(_mm_srli_epi64(row2l, 63), _mm_add_epi64(row2l, row2l)); - row2h = _mm_xor_si128(_mm_srli_epi64(row2h, 63), _mm_add_epi64(row2h, row2h)); - - t0 = _mm_alignr_epi8(row2l, row2h, 8); - t1 = _mm_alignr_epi8(row2h, row2l, 8); - row2l = t0, row2h = t1, t0 = row3l, row3l = row3h, row3h = t0; - t0 = _mm_alignr_epi8(row4l, row4h, 8); - t1 = _mm_alignr_epi8(row4h, row4l, 8); - row4l = t1, row4h = t0; - - b0 = _mm_unpacklo_epi64(m7, m2); - b1 = _mm_unpackhi_epi64(m4, m6); - - row1l = _mm_add_epi64(_mm_add_epi64(row1l, b0), row2l); - row1h = _mm_add_epi64(_mm_add_epi64(row1h, b1), row2h); - row4l = _mm_xor_si128(row4l, row1l); - row4h = _mm_xor_si128(row4h, row1h); - row4l = _mm_shuffle_epi32(row4l, _MM_SHUFFLE(2,3,0,1)); - row4h = _mm_shuffle_epi32(row4h, _MM_SHUFFLE(2,3,0,1)); - row3l = _mm_add_epi64(row3l, row4l); - row3h = _mm_add_epi64(row3h, row4h); - row2l = _mm_xor_si128(row2l, row3l); - row2h = _mm_xor_si128(row2h, row3h); - row2l = _mm_shuffle_epi8(row2l, r24); - row2h = _mm_shuffle_epi8(row2h, r24); - - b0 = _mm_unpacklo_epi64(m5, m4); - b1 = _mm_alignr_epi8(m3, m7, 8); - - row1l = _mm_add_epi64(_mm_add_epi64(row1l, b0), row2l); - row1h = _mm_add_epi64(_mm_add_epi64(row1h, b1), row2h); - row4l = _mm_xor_si128(row4l, row1l); - row4h = _mm_xor_si128(row4h, row1h); - row4l = _mm_shuffle_epi8(row4l, r16); - row4h = _mm_shuffle_epi8(row4h, r16); - row3l = 
_mm_add_epi64(row3l, row4l); - row3h = _mm_add_epi64(row3h, row4h); - row2l = _mm_xor_si128(row2l, row3l); - row2h = _mm_xor_si128(row2h, row3h); - row2l = _mm_xor_si128(_mm_srli_epi64(row2l, 63), _mm_add_epi64(row2l, row2l)); - row2h = _mm_xor_si128(_mm_srli_epi64(row2h, 63), _mm_add_epi64(row2h, row2h)); - - t0 = _mm_alignr_epi8(row2h, row2l, 8); - t1 = _mm_alignr_epi8(row2l, row2h, 8); - row2l = t0, row2h = t1, t0 = row3l, row3l = row3h, row3h = t0; - t0 = _mm_alignr_epi8(row4h, row4l, 8); - t1 = _mm_alignr_epi8(row4l, row4h, 8); - row4l = t1, row4h = t0; - - b0 = _mm_shuffle_epi32(m0, _MM_SHUFFLE(1,0,3,2)); - b1 = _mm_unpackhi_epi64(m5, m2); - - row1l = _mm_add_epi64(_mm_add_epi64(row1l, b0), row2l); - row1h = _mm_add_epi64(_mm_add_epi64(row1h, b1), row2h); - row4l = _mm_xor_si128(row4l, row1l); - row4h = _mm_xor_si128(row4h, row1h); - row4l = _mm_shuffle_epi32(row4l, _MM_SHUFFLE(2,3,0,1)); - row4h = _mm_shuffle_epi32(row4h, _MM_SHUFFLE(2,3,0,1)); - row3l = _mm_add_epi64(row3l, row4l); - row3h = _mm_add_epi64(row3h, row4h); - row2l = _mm_xor_si128(row2l, row3l); - row2h = _mm_xor_si128(row2h, row3h); - row2l = _mm_shuffle_epi8(row2l, r24); - row2h = _mm_shuffle_epi8(row2h, r24); - - b0 = _mm_unpacklo_epi64(m6, m1); - b1 = _mm_unpackhi_epi64(m3, m1); - - row1l = _mm_add_epi64(_mm_add_epi64(row1l, b0), row2l); - row1h = _mm_add_epi64(_mm_add_epi64(row1h, b1), row2h); - row4l = _mm_xor_si128(row4l, row1l); - row4h = _mm_xor_si128(row4h, row1h); - row4l = _mm_shuffle_epi8(row4l, r16); - row4h = _mm_shuffle_epi8(row4h, r16); - row3l = _mm_add_epi64(row3l, row4l); - row3h = _mm_add_epi64(row3h, row4h); - row2l = _mm_xor_si128(row2l, row3l); - row2h = _mm_xor_si128(row2h, row3h); - row2l = _mm_xor_si128(_mm_srli_epi64(row2l, 63), _mm_add_epi64(row2l, row2l)); - row2h = _mm_xor_si128(_mm_srli_epi64(row2h, 63), _mm_add_epi64(row2h, row2h)); - - t0 = _mm_alignr_epi8(row2l, row2h, 8); - t1 = _mm_alignr_epi8(row2h, row2l, 8); - row2l = t0, row2h = t1, t0 = row3l, row3l = row3h, row3h = t0; - t0 = _mm_alignr_epi8(row4l, row4h, 8); - t1 = _mm_alignr_epi8(row4h, row4l, 8); - row4l = t1, row4h = t0; - - b0 = _mm_alignr_epi8(m6, m5, 8); - b1 = _mm_unpackhi_epi64(m2, m7); - - row1l = _mm_add_epi64(_mm_add_epi64(row1l, b0), row2l); - row1h = _mm_add_epi64(_mm_add_epi64(row1h, b1), row2h); - row4l = _mm_xor_si128(row4l, row1l); - row4h = _mm_xor_si128(row4h, row1h); - row4l = _mm_shuffle_epi32(row4l, _MM_SHUFFLE(2,3,0,1)); - row4h = _mm_shuffle_epi32(row4h, _MM_SHUFFLE(2,3,0,1)); - row3l = _mm_add_epi64(row3l, row4l); - row3h = _mm_add_epi64(row3h, row4h); - row2l = _mm_xor_si128(row2l, row3l); - row2h = _mm_xor_si128(row2h, row3h); - row2l = _mm_shuffle_epi8(row2l, r24); - row2h = _mm_shuffle_epi8(row2h, r24); - - b0 = _mm_unpacklo_epi64(m4, m0); - b1 = _mm_blend_epi16(m1, m6, 0xF0); - - row1l = _mm_add_epi64(_mm_add_epi64(row1l, b0), row2l); - row1h = _mm_add_epi64(_mm_add_epi64(row1h, b1), row2h); - row4l = _mm_xor_si128(row4l, row1l); - row4h = _mm_xor_si128(row4h, row1h); - row4l = _mm_shuffle_epi8(row4l, r16); - row4h = _mm_shuffle_epi8(row4h, r16); - row3l = _mm_add_epi64(row3l, row4l); - row3h = _mm_add_epi64(row3h, row4h); - row2l = _mm_xor_si128(row2l, row3l); - row2h = _mm_xor_si128(row2h, row3h); - row2l = _mm_xor_si128(_mm_srli_epi64(row2l, 63), _mm_add_epi64(row2l, row2l)); - row2h = _mm_xor_si128(_mm_srli_epi64(row2h, 63), _mm_add_epi64(row2h, row2h)); - - t0 = _mm_alignr_epi8(row2h, row2l, 8); - t1 = _mm_alignr_epi8(row2l, row2h, 8); - row2l = t0, row2h = t1, t0 = row3l, row3l = row3h, 
row3h = t0; - t0 = _mm_alignr_epi8(row4h, row4l, 8); - t1 = _mm_alignr_epi8(row4l, row4h, 8); - row4l = t1, row4h = t0; - - b0 = _mm_blend_epi16(m5, m1, 0xF0); - b1 = _mm_unpackhi_epi64(m3, m4); - - row1l = _mm_add_epi64(_mm_add_epi64(row1l, b0), row2l); - row1h = _mm_add_epi64(_mm_add_epi64(row1h, b1), row2h); - row4l = _mm_xor_si128(row4l, row1l); - row4h = _mm_xor_si128(row4h, row1h); - row4l = _mm_shuffle_epi32(row4l, _MM_SHUFFLE(2,3,0,1)); - row4h = _mm_shuffle_epi32(row4h, _MM_SHUFFLE(2,3,0,1)); - row3l = _mm_add_epi64(row3l, row4l); - row3h = _mm_add_epi64(row3h, row4h); - row2l = _mm_xor_si128(row2l, row3l); - row2h = _mm_xor_si128(row2h, row3h); - row2l = _mm_shuffle_epi8(row2l, r24); - row2h = _mm_shuffle_epi8(row2h, r24); - - b0 = _mm_unpacklo_epi64(m7, m3); - b1 = _mm_alignr_epi8(m2, m0, 8); - - row1l = _mm_add_epi64(_mm_add_epi64(row1l, b0), row2l); - row1h = _mm_add_epi64(_mm_add_epi64(row1h, b1), row2h); - row4l = _mm_xor_si128(row4l, row1l); - row4h = _mm_xor_si128(row4h, row1h); - row4l = _mm_shuffle_epi8(row4l, r16); - row4h = _mm_shuffle_epi8(row4h, r16); - row3l = _mm_add_epi64(row3l, row4l); - row3h = _mm_add_epi64(row3h, row4h); - row2l = _mm_xor_si128(row2l, row3l); - row2h = _mm_xor_si128(row2h, row3h); - row2l = _mm_xor_si128(_mm_srli_epi64(row2l, 63), _mm_add_epi64(row2l, row2l)); - row2h = _mm_xor_si128(_mm_srli_epi64(row2h, 63), _mm_add_epi64(row2h, row2h)); - - t0 = _mm_alignr_epi8(row2l, row2h, 8); - t1 = _mm_alignr_epi8(row2h, row2l, 8); - row2l = t0, row2h = t1, t0 = row3l, row3l = row3h, row3h = t0; - t0 = _mm_alignr_epi8(row4l, row4h, 8); - t1 = _mm_alignr_epi8(row4h, row4l, 8); - row4l = t1, row4h = t0; - - b0 = _mm_unpackhi_epi64(m3, m1); - b1 = _mm_unpackhi_epi64(m6, m5); - - row1l = _mm_add_epi64(_mm_add_epi64(row1l, b0), row2l); - row1h = _mm_add_epi64(_mm_add_epi64(row1h, b1), row2h); - row4l = _mm_xor_si128(row4l, row1l); - row4h = _mm_xor_si128(row4h, row1h); - row4l = _mm_shuffle_epi32(row4l, _MM_SHUFFLE(2,3,0,1)); - row4h = _mm_shuffle_epi32(row4h, _MM_SHUFFLE(2,3,0,1)); - row3l = _mm_add_epi64(row3l, row4l); - row3h = _mm_add_epi64(row3h, row4h); - row2l = _mm_xor_si128(row2l, row3l); - row2h = _mm_xor_si128(row2h, row3h); - row2l = _mm_shuffle_epi8(row2l, r24); - row2h = _mm_shuffle_epi8(row2h, r24); - - b0 = _mm_unpackhi_epi64(m4, m0); - b1 = _mm_unpacklo_epi64(m6, m7); - - row1l = _mm_add_epi64(_mm_add_epi64(row1l, b0), row2l); - row1h = _mm_add_epi64(_mm_add_epi64(row1h, b1), row2h); - row4l = _mm_xor_si128(row4l, row1l); - row4h = _mm_xor_si128(row4h, row1h); - row4l = _mm_shuffle_epi8(row4l, r16); - row4h = _mm_shuffle_epi8(row4h, r16); - row3l = _mm_add_epi64(row3l, row4l); - row3h = _mm_add_epi64(row3h, row4h); - row2l = _mm_xor_si128(row2l, row3l); - row2h = _mm_xor_si128(row2h, row3h); - row2l = _mm_xor_si128(_mm_srli_epi64(row2l, 63), _mm_add_epi64(row2l, row2l)); - row2h = _mm_xor_si128(_mm_srli_epi64(row2h, 63), _mm_add_epi64(row2h, row2h)); - - t0 = _mm_alignr_epi8(row2h, row2l, 8); - t1 = _mm_alignr_epi8(row2l, row2h, 8); - row2l = t0, row2h = t1, t0 = row3l, row3l = row3h, row3h = t0; - t0 = _mm_alignr_epi8(row4h, row4l, 8); - t1 = _mm_alignr_epi8(row4l, row4h, 8); - row4l = t1, row4h = t0; - - b0 = _mm_blend_epi16(m1, m2, 0xF0); - b1 = _mm_blend_epi16(m2, m7, 0xF0); - - row1l = _mm_add_epi64(_mm_add_epi64(row1l, b0), row2l); - row1h = _mm_add_epi64(_mm_add_epi64(row1h, b1), row2h); - row4l = _mm_xor_si128(row4l, row1l); - row4h = _mm_xor_si128(row4h, row1h); - row4l = _mm_shuffle_epi32(row4l, _MM_SHUFFLE(2,3,0,1)); - row4h = 
_mm_shuffle_epi32(row4h, _MM_SHUFFLE(2,3,0,1)); - row3l = _mm_add_epi64(row3l, row4l); - row3h = _mm_add_epi64(row3h, row4h); - row2l = _mm_xor_si128(row2l, row3l); - row2h = _mm_xor_si128(row2h, row3h); - row2l = _mm_shuffle_epi8(row2l, r24); - row2h = _mm_shuffle_epi8(row2h, r24); - - b0 = _mm_unpacklo_epi64(m3, m5); - b1 = _mm_unpacklo_epi64(m0, m4); - - row1l = _mm_add_epi64(_mm_add_epi64(row1l, b0), row2l); - row1h = _mm_add_epi64(_mm_add_epi64(row1h, b1), row2h); - row4l = _mm_xor_si128(row4l, row1l); - row4h = _mm_xor_si128(row4h, row1h); - row4l = _mm_shuffle_epi8(row4l, r16); - row4h = _mm_shuffle_epi8(row4h, r16); - row3l = _mm_add_epi64(row3l, row4l); - row3h = _mm_add_epi64(row3h, row4h); - row2l = _mm_xor_si128(row2l, row3l); - row2h = _mm_xor_si128(row2h, row3h); - row2l = _mm_xor_si128(_mm_srli_epi64(row2l, 63), _mm_add_epi64(row2l, row2l)); - row2h = _mm_xor_si128(_mm_srli_epi64(row2h, 63), _mm_add_epi64(row2h, row2h)); - - t0 = _mm_alignr_epi8(row2l, row2h, 8); - t1 = _mm_alignr_epi8(row2h, row2l, 8); - row2l = t0, row2h = t1, t0 = row3l, row3l = row3h, row3h = t0; - t0 = _mm_alignr_epi8(row4l, row4h, 8); - t1 = _mm_alignr_epi8(row4h, row4l, 8); - row4l = t1, row4h = t0; - - b0 = _mm_unpackhi_epi64(m4, m2); - b1 = _mm_unpacklo_epi64(m1, m5); - - row1l = _mm_add_epi64(_mm_add_epi64(row1l, b0), row2l); - row1h = _mm_add_epi64(_mm_add_epi64(row1h, b1), row2h); - row4l = _mm_xor_si128(row4l, row1l); - row4h = _mm_xor_si128(row4h, row1h); - row4l = _mm_shuffle_epi32(row4l, _MM_SHUFFLE(2,3,0,1)); - row4h = _mm_shuffle_epi32(row4h, _MM_SHUFFLE(2,3,0,1)); - row3l = _mm_add_epi64(row3l, row4l); - row3h = _mm_add_epi64(row3h, row4h); - row2l = _mm_xor_si128(row2l, row3l); - row2h = _mm_xor_si128(row2h, row3h); - row2l = _mm_shuffle_epi8(row2l, r24); - row2h = _mm_shuffle_epi8(row2h, r24); - - b0 = _mm_blend_epi16(m0, m3, 0xF0); - b1 = _mm_blend_epi16(m2, m7, 0xF0); - - row1l = _mm_add_epi64(_mm_add_epi64(row1l, b0), row2l); - row1h = _mm_add_epi64(_mm_add_epi64(row1h, b1), row2h); - row4l = _mm_xor_si128(row4l, row1l); - row4h = _mm_xor_si128(row4h, row1h); - row4l = _mm_shuffle_epi8(row4l, r16); - row4h = _mm_shuffle_epi8(row4h, r16); - row3l = _mm_add_epi64(row3l, row4l); - row3h = _mm_add_epi64(row3h, row4h); - row2l = _mm_xor_si128(row2l, row3l); - row2h = _mm_xor_si128(row2h, row3h); - row2l = _mm_xor_si128(_mm_srli_epi64(row2l, 63), _mm_add_epi64(row2l, row2l)); - row2h = _mm_xor_si128(_mm_srli_epi64(row2h, 63), _mm_add_epi64(row2h, row2h)); - - t0 = _mm_alignr_epi8(row2h, row2l, 8); - t1 = _mm_alignr_epi8(row2l, row2h, 8); - row2l = t0, row2h = t1, t0 = row3l, row3l = row3h, row3h = t0; - t0 = _mm_alignr_epi8(row4h, row4l, 8); - t1 = _mm_alignr_epi8(row4l, row4h, 8); - row4l = t1, row4h = t0; - - b0 = _mm_blend_epi16(m7, m5, 0xF0); - b1 = _mm_blend_epi16(m3, m1, 0xF0); - - row1l = _mm_add_epi64(_mm_add_epi64(row1l, b0), row2l); - row1h = _mm_add_epi64(_mm_add_epi64(row1h, b1), row2h); - row4l = _mm_xor_si128(row4l, row1l); - row4h = _mm_xor_si128(row4h, row1h); - row4l = _mm_shuffle_epi32(row4l, _MM_SHUFFLE(2,3,0,1)); - row4h = _mm_shuffle_epi32(row4h, _MM_SHUFFLE(2,3,0,1)); - row3l = _mm_add_epi64(row3l, row4l); - row3h = _mm_add_epi64(row3h, row4h); - row2l = _mm_xor_si128(row2l, row3l); - row2h = _mm_xor_si128(row2h, row3h); - row2l = _mm_shuffle_epi8(row2l, r24); - row2h = _mm_shuffle_epi8(row2h, r24); - - b0 = _mm_alignr_epi8(m6, m0, 8); - b1 = _mm_blend_epi16(m4, m6, 0xF0); - - row1l = _mm_add_epi64(_mm_add_epi64(row1l, b0), row2l); - row1h = 
_mm_add_epi64(_mm_add_epi64(row1h, b1), row2h); - row4l = _mm_xor_si128(row4l, row1l); - row4h = _mm_xor_si128(row4h, row1h); - row4l = _mm_shuffle_epi8(row4l, r16); - row4h = _mm_shuffle_epi8(row4h, r16); - row3l = _mm_add_epi64(row3l, row4l); - row3h = _mm_add_epi64(row3h, row4h); - row2l = _mm_xor_si128(row2l, row3l); - row2h = _mm_xor_si128(row2h, row3h); - row2l = _mm_xor_si128(_mm_srli_epi64(row2l, 63), _mm_add_epi64(row2l, row2l)); - row2h = _mm_xor_si128(_mm_srli_epi64(row2h, 63), _mm_add_epi64(row2h, row2h)); - - t0 = _mm_alignr_epi8(row2l, row2h, 8); - t1 = _mm_alignr_epi8(row2h, row2l, 8); - row2l = t0, row2h = t1, t0 = row3l, row3l = row3h, row3h = t0; - t0 = _mm_alignr_epi8(row4l, row4h, 8); - t1 = _mm_alignr_epi8(row4h, row4l, 8); - row4l = t1, row4h = t0; - - b0 = _mm_unpacklo_epi64(m1, m3); - b1 = _mm_unpacklo_epi64(m0, m4); - - row1l = _mm_add_epi64(_mm_add_epi64(row1l, b0), row2l); - row1h = _mm_add_epi64(_mm_add_epi64(row1h, b1), row2h); - row4l = _mm_xor_si128(row4l, row1l); - row4h = _mm_xor_si128(row4h, row1h); - row4l = _mm_shuffle_epi32(row4l, _MM_SHUFFLE(2,3,0,1)); - row4h = _mm_shuffle_epi32(row4h, _MM_SHUFFLE(2,3,0,1)); - row3l = _mm_add_epi64(row3l, row4l); - row3h = _mm_add_epi64(row3h, row4h); - row2l = _mm_xor_si128(row2l, row3l); - row2h = _mm_xor_si128(row2h, row3h); - row2l = _mm_shuffle_epi8(row2l, r24); - row2h = _mm_shuffle_epi8(row2h, r24); - - b0 = _mm_unpacklo_epi64(m6, m5); - b1 = _mm_unpackhi_epi64(m5, m1); - - row1l = _mm_add_epi64(_mm_add_epi64(row1l, b0), row2l); - row1h = _mm_add_epi64(_mm_add_epi64(row1h, b1), row2h); - row4l = _mm_xor_si128(row4l, row1l); - row4h = _mm_xor_si128(row4h, row1h); - row4l = _mm_shuffle_epi8(row4l, r16); - row4h = _mm_shuffle_epi8(row4h, r16); - row3l = _mm_add_epi64(row3l, row4l); - row3h = _mm_add_epi64(row3h, row4h); - row2l = _mm_xor_si128(row2l, row3l); - row2h = _mm_xor_si128(row2h, row3h); - row2l = _mm_xor_si128(_mm_srli_epi64(row2l, 63), _mm_add_epi64(row2l, row2l)); - row2h = _mm_xor_si128(_mm_srli_epi64(row2h, 63), _mm_add_epi64(row2h, row2h)); - - t0 = _mm_alignr_epi8(row2h, row2l, 8); - t1 = _mm_alignr_epi8(row2l, row2h, 8); - row2l = t0, row2h = t1, t0 = row3l, row3l = row3h, row3h = t0; - t0 = _mm_alignr_epi8(row4h, row4l, 8); - t1 = _mm_alignr_epi8(row4l, row4h, 8); - row4l = t1, row4h = t0; - - b0 = _mm_blend_epi16(m2, m3, 0xF0); - b1 = _mm_unpackhi_epi64(m7, m0); - - row1l = _mm_add_epi64(_mm_add_epi64(row1l, b0), row2l); - row1h = _mm_add_epi64(_mm_add_epi64(row1h, b1), row2h); - row4l = _mm_xor_si128(row4l, row1l); - row4h = _mm_xor_si128(row4h, row1h); - row4l = _mm_shuffle_epi32(row4l, _MM_SHUFFLE(2,3,0,1)); - row4h = _mm_shuffle_epi32(row4h, _MM_SHUFFLE(2,3,0,1)); - row3l = _mm_add_epi64(row3l, row4l); - row3h = _mm_add_epi64(row3h, row4h); - row2l = _mm_xor_si128(row2l, row3l); - row2h = _mm_xor_si128(row2h, row3h); - row2l = _mm_shuffle_epi8(row2l, r24); - row2h = _mm_shuffle_epi8(row2h, r24); - - b0 = _mm_unpackhi_epi64(m6, m2); - b1 = _mm_blend_epi16(m7, m4, 0xF0); - - row1l = _mm_add_epi64(_mm_add_epi64(row1l, b0), row2l); - row1h = _mm_add_epi64(_mm_add_epi64(row1h, b1), row2h); - row4l = _mm_xor_si128(row4l, row1l); - row4h = _mm_xor_si128(row4h, row1h); - row4l = _mm_shuffle_epi8(row4l, r16); - row4h = _mm_shuffle_epi8(row4h, r16); - row3l = _mm_add_epi64(row3l, row4l); - row3h = _mm_add_epi64(row3h, row4h); - row2l = _mm_xor_si128(row2l, row3l); - row2h = _mm_xor_si128(row2h, row3h); - row2l = _mm_xor_si128(_mm_srli_epi64(row2l, 63), _mm_add_epi64(row2l, row2l)); - row2h = 
_mm_xor_si128(_mm_srli_epi64(row2h, 63), _mm_add_epi64(row2h, row2h)); - - t0 = _mm_alignr_epi8(row2l, row2h, 8); - t1 = _mm_alignr_epi8(row2h, row2l, 8); - row2l = t0, row2h = t1, t0 = row3l, row3l = row3h, row3h = t0; - t0 = _mm_alignr_epi8(row4l, row4h, 8); - t1 = _mm_alignr_epi8(row4h, row4l, 8); - row4l = t1, row4h = t0; - - b0 = _mm_blend_epi16(m6, m0, 0xF0); - b1 = _mm_unpacklo_epi64(m7, m2); - - row1l = _mm_add_epi64(_mm_add_epi64(row1l, b0), row2l); - row1h = _mm_add_epi64(_mm_add_epi64(row1h, b1), row2h); - row4l = _mm_xor_si128(row4l, row1l); - row4h = _mm_xor_si128(row4h, row1h); - row4l = _mm_shuffle_epi32(row4l, _MM_SHUFFLE(2,3,0,1)); - row4h = _mm_shuffle_epi32(row4h, _MM_SHUFFLE(2,3,0,1)); - row3l = _mm_add_epi64(row3l, row4l); - row3h = _mm_add_epi64(row3h, row4h); - row2l = _mm_xor_si128(row2l, row3l); - row2h = _mm_xor_si128(row2h, row3h); - row2l = _mm_shuffle_epi8(row2l, r24); - row2h = _mm_shuffle_epi8(row2h, r24); - - b0 = _mm_unpackhi_epi64(m2, m7); - b1 = _mm_alignr_epi8(m5, m6, 8); - - row1l = _mm_add_epi64(_mm_add_epi64(row1l, b0), row2l); - row1h = _mm_add_epi64(_mm_add_epi64(row1h, b1), row2h); - row4l = _mm_xor_si128(row4l, row1l); - row4h = _mm_xor_si128(row4h, row1h); - row4l = _mm_shuffle_epi8(row4l, r16); - row4h = _mm_shuffle_epi8(row4h, r16); - row3l = _mm_add_epi64(row3l, row4l); - row3h = _mm_add_epi64(row3h, row4h); - row2l = _mm_xor_si128(row2l, row3l); - row2h = _mm_xor_si128(row2h, row3h); - row2l = _mm_xor_si128(_mm_srli_epi64(row2l, 63), _mm_add_epi64(row2l, row2l)); - row2h = _mm_xor_si128(_mm_srli_epi64(row2h, 63), _mm_add_epi64(row2h, row2h)); - - t0 = _mm_alignr_epi8(row2h, row2l, 8); - t1 = _mm_alignr_epi8(row2l, row2h, 8); - row2l = t0, row2h = t1, t0 = row3l, row3l = row3h, row3h = t0; - t0 = _mm_alignr_epi8(row4h, row4l, 8); - t1 = _mm_alignr_epi8(row4l, row4h, 8); - row4l = t1, row4h = t0; - - b0 = _mm_unpacklo_epi64(m0, m3); - b1 = _mm_shuffle_epi32(m4, _MM_SHUFFLE(1,0,3,2)); - - row1l = _mm_add_epi64(_mm_add_epi64(row1l, b0), row2l); - row1h = _mm_add_epi64(_mm_add_epi64(row1h, b1), row2h); - row4l = _mm_xor_si128(row4l, row1l); - row4h = _mm_xor_si128(row4h, row1h); - row4l = _mm_shuffle_epi32(row4l, _MM_SHUFFLE(2,3,0,1)); - row4h = _mm_shuffle_epi32(row4h, _MM_SHUFFLE(2,3,0,1)); - row3l = _mm_add_epi64(row3l, row4l); - row3h = _mm_add_epi64(row3h, row4h); - row2l = _mm_xor_si128(row2l, row3l); - row2h = _mm_xor_si128(row2h, row3h); - row2l = _mm_shuffle_epi8(row2l, r24); - row2h = _mm_shuffle_epi8(row2h, r24); - - b0 = _mm_unpackhi_epi64(m3, m1); - b1 = _mm_blend_epi16(m1, m5, 0xF0); - - row1l = _mm_add_epi64(_mm_add_epi64(row1l, b0), row2l); - row1h = _mm_add_epi64(_mm_add_epi64(row1h, b1), row2h); - row4l = _mm_xor_si128(row4l, row1l); - row4h = _mm_xor_si128(row4h, row1h); - row4l = _mm_shuffle_epi8(row4l, r16); - row4h = _mm_shuffle_epi8(row4h, r16); - row3l = _mm_add_epi64(row3l, row4l); - row3h = _mm_add_epi64(row3h, row4h); - row2l = _mm_xor_si128(row2l, row3l); - row2h = _mm_xor_si128(row2h, row3h); - row2l = _mm_xor_si128(_mm_srli_epi64(row2l, 63), _mm_add_epi64(row2l, row2l)); - row2h = _mm_xor_si128(_mm_srli_epi64(row2h, 63), _mm_add_epi64(row2h, row2h)); - - t0 = _mm_alignr_epi8(row2l, row2h, 8); - t1 = _mm_alignr_epi8(row2h, row2l, 8); - row2l = t0, row2h = t1, t0 = row3l, row3l = row3h, row3h = t0; - t0 = _mm_alignr_epi8(row4l, row4h, 8); - t1 = _mm_alignr_epi8(row4h, row4l, 8); - row4l = t1, row4h = t0; - - b0 = _mm_unpackhi_epi64(m6, m3); - b1 = _mm_blend_epi16(m6, m1, 0xF0); - - row1l = 
_mm_add_epi64(_mm_add_epi64(row1l, b0), row2l); - row1h = _mm_add_epi64(_mm_add_epi64(row1h, b1), row2h); - row4l = _mm_xor_si128(row4l, row1l); - row4h = _mm_xor_si128(row4h, row1h); - row4l = _mm_shuffle_epi32(row4l, _MM_SHUFFLE(2,3,0,1)); - row4h = _mm_shuffle_epi32(row4h, _MM_SHUFFLE(2,3,0,1)); - row3l = _mm_add_epi64(row3l, row4l); - row3h = _mm_add_epi64(row3h, row4h); - row2l = _mm_xor_si128(row2l, row3l); - row2h = _mm_xor_si128(row2h, row3h); - row2l = _mm_shuffle_epi8(row2l, r24); - row2h = _mm_shuffle_epi8(row2h, r24); - - b0 = _mm_alignr_epi8(m7, m5, 8); - b1 = _mm_unpackhi_epi64(m0, m4); - - row1l = _mm_add_epi64(_mm_add_epi64(row1l, b0), row2l); - row1h = _mm_add_epi64(_mm_add_epi64(row1h, b1), row2h); - row4l = _mm_xor_si128(row4l, row1l); - row4h = _mm_xor_si128(row4h, row1h); - row4l = _mm_shuffle_epi8(row4l, r16); - row4h = _mm_shuffle_epi8(row4h, r16); - row3l = _mm_add_epi64(row3l, row4l); - row3h = _mm_add_epi64(row3h, row4h); - row2l = _mm_xor_si128(row2l, row3l); - row2h = _mm_xor_si128(row2h, row3h); - row2l = _mm_xor_si128(_mm_srli_epi64(row2l, 63), _mm_add_epi64(row2l, row2l)); - row2h = _mm_xor_si128(_mm_srli_epi64(row2h, 63), _mm_add_epi64(row2h, row2h)); - - t0 = _mm_alignr_epi8(row2h, row2l, 8); - t1 = _mm_alignr_epi8(row2l, row2h, 8); - row2l = t0, row2h = t1, t0 = row3l, row3l = row3h, row3h = t0; - t0 = _mm_alignr_epi8(row4h, row4l, 8); - t1 = _mm_alignr_epi8(row4l, row4h, 8); - row4l = t1, row4h = t0; - - b0 = _mm_unpackhi_epi64(m2, m7); - b1 = _mm_unpacklo_epi64(m4, m1); - - row1l = _mm_add_epi64(_mm_add_epi64(row1l, b0), row2l); - row1h = _mm_add_epi64(_mm_add_epi64(row1h, b1), row2h); - row4l = _mm_xor_si128(row4l, row1l); - row4h = _mm_xor_si128(row4h, row1h); - row4l = _mm_shuffle_epi32(row4l, _MM_SHUFFLE(2,3,0,1)); - row4h = _mm_shuffle_epi32(row4h, _MM_SHUFFLE(2,3,0,1)); - row3l = _mm_add_epi64(row3l, row4l); - row3h = _mm_add_epi64(row3h, row4h); - row2l = _mm_xor_si128(row2l, row3l); - row2h = _mm_xor_si128(row2h, row3h); - row2l = _mm_shuffle_epi8(row2l, r24); - row2h = _mm_shuffle_epi8(row2h, r24); - - b0 = _mm_unpacklo_epi64(m0, m2); - b1 = _mm_unpacklo_epi64(m3, m5); - - row1l = _mm_add_epi64(_mm_add_epi64(row1l, b0), row2l); - row1h = _mm_add_epi64(_mm_add_epi64(row1h, b1), row2h); - row4l = _mm_xor_si128(row4l, row1l); - row4h = _mm_xor_si128(row4h, row1h); - row4l = _mm_shuffle_epi8(row4l, r16); - row4h = _mm_shuffle_epi8(row4h, r16); - row3l = _mm_add_epi64(row3l, row4l); - row3h = _mm_add_epi64(row3h, row4h); - row2l = _mm_xor_si128(row2l, row3l); - row2h = _mm_xor_si128(row2h, row3h); - row2l = _mm_xor_si128(_mm_srli_epi64(row2l, 63), _mm_add_epi64(row2l, row2l)); - row2h = _mm_xor_si128(_mm_srli_epi64(row2h, 63), _mm_add_epi64(row2h, row2h)); - - t0 = _mm_alignr_epi8(row2l, row2h, 8); - t1 = _mm_alignr_epi8(row2h, row2l, 8); - row2l = t0, row2h = t1, t0 = row3l, row3l = row3h, row3h = t0; - t0 = _mm_alignr_epi8(row4l, row4h, 8); - t1 = _mm_alignr_epi8(row4h, row4l, 8); - row4l = t1, row4h = t0; - - b0 = _mm_unpacklo_epi64(m3, m7); - b1 = _mm_alignr_epi8(m0, m5, 8); - - row1l = _mm_add_epi64(_mm_add_epi64(row1l, b0), row2l); - row1h = _mm_add_epi64(_mm_add_epi64(row1h, b1), row2h); - row4l = _mm_xor_si128(row4l, row1l); - row4h = _mm_xor_si128(row4h, row1h); - row4l = _mm_shuffle_epi32(row4l, _MM_SHUFFLE(2,3,0,1)); - row4h = _mm_shuffle_epi32(row4h, _MM_SHUFFLE(2,3,0,1)); - row3l = _mm_add_epi64(row3l, row4l); - row3h = _mm_add_epi64(row3h, row4h); - row2l = _mm_xor_si128(row2l, row3l); - row2h = _mm_xor_si128(row2h, row3h); - row2l = 
_mm_shuffle_epi8(row2l, r24); - row2h = _mm_shuffle_epi8(row2h, r24); - - b0 = _mm_unpackhi_epi64(m7, m4); - b1 = _mm_alignr_epi8(m4, m1, 8); - - row1l = _mm_add_epi64(_mm_add_epi64(row1l, b0), row2l); - row1h = _mm_add_epi64(_mm_add_epi64(row1h, b1), row2h); - row4l = _mm_xor_si128(row4l, row1l); - row4h = _mm_xor_si128(row4h, row1h); - row4l = _mm_shuffle_epi8(row4l, r16); - row4h = _mm_shuffle_epi8(row4h, r16); - row3l = _mm_add_epi64(row3l, row4l); - row3h = _mm_add_epi64(row3h, row4h); - row2l = _mm_xor_si128(row2l, row3l); - row2h = _mm_xor_si128(row2h, row3h); - row2l = _mm_xor_si128(_mm_srli_epi64(row2l, 63), _mm_add_epi64(row2l, row2l)); - row2h = _mm_xor_si128(_mm_srli_epi64(row2h, 63), _mm_add_epi64(row2h, row2h)); - - t0 = _mm_alignr_epi8(row2h, row2l, 8); - t1 = _mm_alignr_epi8(row2l, row2h, 8); - row2l = t0, row2h = t1, t0 = row3l, row3l = row3h, row3h = t0; - t0 = _mm_alignr_epi8(row4h, row4l, 8); - t1 = _mm_alignr_epi8(row4l, row4h, 8); - row4l = t1, row4h = t0; - - b0 = m6; - b1 = _mm_alignr_epi8(m5, m0, 8); - - row1l = _mm_add_epi64(_mm_add_epi64(row1l, b0), row2l); - row1h = _mm_add_epi64(_mm_add_epi64(row1h, b1), row2h); - row4l = _mm_xor_si128(row4l, row1l); - row4h = _mm_xor_si128(row4h, row1h); - row4l = _mm_shuffle_epi32(row4l, _MM_SHUFFLE(2,3,0,1)); - row4h = _mm_shuffle_epi32(row4h, _MM_SHUFFLE(2,3,0,1)); - row3l = _mm_add_epi64(row3l, row4l); - row3h = _mm_add_epi64(row3h, row4h); - row2l = _mm_xor_si128(row2l, row3l); - row2h = _mm_xor_si128(row2h, row3h); - row2l = _mm_shuffle_epi8(row2l, r24); - row2h = _mm_shuffle_epi8(row2h, r24); - - b0 = _mm_blend_epi16(m1, m3, 0xF0); - b1 = m2; - - row1l = _mm_add_epi64(_mm_add_epi64(row1l, b0), row2l); - row1h = _mm_add_epi64(_mm_add_epi64(row1h, b1), row2h); - row4l = _mm_xor_si128(row4l, row1l); - row4h = _mm_xor_si128(row4h, row1h); - row4l = _mm_shuffle_epi8(row4l, r16); - row4h = _mm_shuffle_epi8(row4h, r16); - row3l = _mm_add_epi64(row3l, row4l); - row3h = _mm_add_epi64(row3h, row4h); - row2l = _mm_xor_si128(row2l, row3l); - row2h = _mm_xor_si128(row2h, row3h); - row2l = _mm_xor_si128(_mm_srli_epi64(row2l, 63), _mm_add_epi64(row2l, row2l)); - row2h = _mm_xor_si128(_mm_srli_epi64(row2h, 63), _mm_add_epi64(row2h, row2h)); - - t0 = _mm_alignr_epi8(row2l, row2h, 8); - t1 = _mm_alignr_epi8(row2h, row2l, 8); - row2l = t0, row2h = t1, t0 = row3l, row3l = row3h, row3h = t0; - t0 = _mm_alignr_epi8(row4l, row4h, 8); - t1 = _mm_alignr_epi8(row4h, row4l, 8); - row4l = t1, row4h = t0; - - b0 = _mm_unpacklo_epi64(m5, m4); - b1 = _mm_unpackhi_epi64(m3, m0); - - row1l = _mm_add_epi64(_mm_add_epi64(row1l, b0), row2l); - row1h = _mm_add_epi64(_mm_add_epi64(row1h, b1), row2h); - row4l = _mm_xor_si128(row4l, row1l); - row4h = _mm_xor_si128(row4h, row1h); - row4l = _mm_shuffle_epi32(row4l, _MM_SHUFFLE(2,3,0,1)); - row4h = _mm_shuffle_epi32(row4h, _MM_SHUFFLE(2,3,0,1)); - row3l = _mm_add_epi64(row3l, row4l); - row3h = _mm_add_epi64(row3h, row4h); - row2l = _mm_xor_si128(row2l, row3l); - row2h = _mm_xor_si128(row2h, row3h); - row2l = _mm_shuffle_epi8(row2l, r24); - row2h = _mm_shuffle_epi8(row2h, r24); - - b0 = _mm_unpacklo_epi64(m1, m2); - b1 = _mm_blend_epi16(m3, m2, 0xF0); - - row1l = _mm_add_epi64(_mm_add_epi64(row1l, b0), row2l); - row1h = _mm_add_epi64(_mm_add_epi64(row1h, b1), row2h); - row4l = _mm_xor_si128(row4l, row1l); - row4h = _mm_xor_si128(row4h, row1h); - row4l = _mm_shuffle_epi8(row4l, r16); - row4h = _mm_shuffle_epi8(row4h, r16); - row3l = _mm_add_epi64(row3l, row4l); - row3h = _mm_add_epi64(row3h, row4h); - row2l = 
_mm_xor_si128(row2l, row3l); - row2h = _mm_xor_si128(row2h, row3h); - row2l = _mm_xor_si128(_mm_srli_epi64(row2l, 63), _mm_add_epi64(row2l, row2l)); - row2h = _mm_xor_si128(_mm_srli_epi64(row2h, 63), _mm_add_epi64(row2h, row2h)); - - t0 = _mm_alignr_epi8(row2h, row2l, 8); - t1 = _mm_alignr_epi8(row2l, row2h, 8); - row2l = t0, row2h = t1, t0 = row3l, row3l = row3h, row3h = t0; - t0 = _mm_alignr_epi8(row4h, row4l, 8); - t1 = _mm_alignr_epi8(row4l, row4h, 8); - row4l = t1, row4h = t0; - - b0 = _mm_unpackhi_epi64(m7, m4); - b1 = _mm_unpackhi_epi64(m1, m6); - - row1l = _mm_add_epi64(_mm_add_epi64(row1l, b0), row2l); - row1h = _mm_add_epi64(_mm_add_epi64(row1h, b1), row2h); - row4l = _mm_xor_si128(row4l, row1l); - row4h = _mm_xor_si128(row4h, row1h); - row4l = _mm_shuffle_epi32(row4l, _MM_SHUFFLE(2,3,0,1)); - row4h = _mm_shuffle_epi32(row4h, _MM_SHUFFLE(2,3,0,1)); - row3l = _mm_add_epi64(row3l, row4l); - row3h = _mm_add_epi64(row3h, row4h); - row2l = _mm_xor_si128(row2l, row3l); - row2h = _mm_xor_si128(row2h, row3h); - row2l = _mm_shuffle_epi8(row2l, r24); - row2h = _mm_shuffle_epi8(row2h, r24); - - b0 = _mm_alignr_epi8(m7, m5, 8); - b1 = _mm_unpacklo_epi64(m6, m0); - - row1l = _mm_add_epi64(_mm_add_epi64(row1l, b0), row2l); - row1h = _mm_add_epi64(_mm_add_epi64(row1h, b1), row2h); - row4l = _mm_xor_si128(row4l, row1l); - row4h = _mm_xor_si128(row4h, row1h); - row4l = _mm_shuffle_epi8(row4l, r16); - row4h = _mm_shuffle_epi8(row4h, r16); - row3l = _mm_add_epi64(row3l, row4l); - row3h = _mm_add_epi64(row3h, row4h); - row2l = _mm_xor_si128(row2l, row3l); - row2h = _mm_xor_si128(row2h, row3h); - row2l = _mm_xor_si128(_mm_srli_epi64(row2l, 63), _mm_add_epi64(row2l, row2l)); - row2h = _mm_xor_si128(_mm_srli_epi64(row2h, 63), _mm_add_epi64(row2h, row2h)); - - t0 = _mm_alignr_epi8(row2l, row2h, 8); - t1 = _mm_alignr_epi8(row2h, row2l, 8); - row2l = t0, row2h = t1, t0 = row3l, row3l = row3h, row3h = t0; - t0 = _mm_alignr_epi8(row4l, row4h, 8); - t1 = _mm_alignr_epi8(row4h, row4l, 8); - row4l = t1, row4h = t0; - - b0 = _mm_unpacklo_epi64(m0, m1); - b1 = _mm_unpacklo_epi64(m2, m3); - - row1l = _mm_add_epi64(_mm_add_epi64(row1l, b0), row2l); - row1h = _mm_add_epi64(_mm_add_epi64(row1h, b1), row2h); - row4l = _mm_xor_si128(row4l, row1l); - row4h = _mm_xor_si128(row4h, row1h); - row4l = _mm_shuffle_epi32(row4l, _MM_SHUFFLE(2,3,0,1)); - row4h = _mm_shuffle_epi32(row4h, _MM_SHUFFLE(2,3,0,1)); - row3l = _mm_add_epi64(row3l, row4l); - row3h = _mm_add_epi64(row3h, row4h); - row2l = _mm_xor_si128(row2l, row3l); - row2h = _mm_xor_si128(row2h, row3h); - row2l = _mm_shuffle_epi8(row2l, r24); - row2h = _mm_shuffle_epi8(row2h, r24); - - b0 = _mm_unpackhi_epi64(m0, m1); - b1 = _mm_unpackhi_epi64(m2, m3); - - row1l = _mm_add_epi64(_mm_add_epi64(row1l, b0), row2l); - row1h = _mm_add_epi64(_mm_add_epi64(row1h, b1), row2h); - row4l = _mm_xor_si128(row4l, row1l); - row4h = _mm_xor_si128(row4h, row1h); - row4l = _mm_shuffle_epi8(row4l, r16); - row4h = _mm_shuffle_epi8(row4h, r16); - row3l = _mm_add_epi64(row3l, row4l); - row3h = _mm_add_epi64(row3h, row4h); - row2l = _mm_xor_si128(row2l, row3l); - row2h = _mm_xor_si128(row2h, row3h); - row2l = _mm_xor_si128(_mm_srli_epi64(row2l, 63), _mm_add_epi64(row2l, row2l)); - row2h = _mm_xor_si128(_mm_srli_epi64(row2h, 63), _mm_add_epi64(row2h, row2h)); - - t0 = _mm_alignr_epi8(row2h, row2l, 8); - t1 = _mm_alignr_epi8(row2l, row2h, 8); - row2l = t0, row2h = t1, t0 = row3l, row3l = row3h, row3h = t0; - t0 = _mm_alignr_epi8(row4h, row4l, 8); - t1 = _mm_alignr_epi8(row4l, row4h, 8); - row4l 
= t1, row4h = t0; - - b0 = _mm_unpacklo_epi64(m4, m5); - b1 = _mm_unpacklo_epi64(m6, m7); - - row1l = _mm_add_epi64(_mm_add_epi64(row1l, b0), row2l); - row1h = _mm_add_epi64(_mm_add_epi64(row1h, b1), row2h); - row4l = _mm_xor_si128(row4l, row1l); - row4h = _mm_xor_si128(row4h, row1h); - row4l = _mm_shuffle_epi32(row4l, _MM_SHUFFLE(2,3,0,1)); - row4h = _mm_shuffle_epi32(row4h, _MM_SHUFFLE(2,3,0,1)); - row3l = _mm_add_epi64(row3l, row4l); - row3h = _mm_add_epi64(row3h, row4h); - row2l = _mm_xor_si128(row2l, row3l); - row2h = _mm_xor_si128(row2h, row3h); - row2l = _mm_shuffle_epi8(row2l, r24); - row2h = _mm_shuffle_epi8(row2h, r24); - - b0 = _mm_unpackhi_epi64(m4, m5); - b1 = _mm_unpackhi_epi64(m6, m7); - - row1l = _mm_add_epi64(_mm_add_epi64(row1l, b0), row2l); - row1h = _mm_add_epi64(_mm_add_epi64(row1h, b1), row2h); - row4l = _mm_xor_si128(row4l, row1l); - row4h = _mm_xor_si128(row4h, row1h); - row4l = _mm_shuffle_epi8(row4l, r16); - row4h = _mm_shuffle_epi8(row4h, r16); - row3l = _mm_add_epi64(row3l, row4l); - row3h = _mm_add_epi64(row3h, row4h); - row2l = _mm_xor_si128(row2l, row3l); - row2h = _mm_xor_si128(row2h, row3h); - row2l = _mm_xor_si128(_mm_srli_epi64(row2l, 63), _mm_add_epi64(row2l, row2l)); - row2h = _mm_xor_si128(_mm_srli_epi64(row2h, 63), _mm_add_epi64(row2h, row2h)); - - t0 = _mm_alignr_epi8(row2l, row2h, 8); - t1 = _mm_alignr_epi8(row2h, row2l, 8); - row2l = t0, row2h = t1, t0 = row3l, row3l = row3h, row3h = t0; - t0 = _mm_alignr_epi8(row4l, row4h, 8); - t1 = _mm_alignr_epi8(row4h, row4l, 8); - row4l = t1, row4h = t0; - - b0 = _mm_unpacklo_epi64(m7, m2); - b1 = _mm_unpackhi_epi64(m4, m6); - - row1l = _mm_add_epi64(_mm_add_epi64(row1l, b0), row2l); - row1h = _mm_add_epi64(_mm_add_epi64(row1h, b1), row2h); - row4l = _mm_xor_si128(row4l, row1l); - row4h = _mm_xor_si128(row4h, row1h); - row4l = _mm_shuffle_epi32(row4l, _MM_SHUFFLE(2,3,0,1)); - row4h = _mm_shuffle_epi32(row4h, _MM_SHUFFLE(2,3,0,1)); - row3l = _mm_add_epi64(row3l, row4l); - row3h = _mm_add_epi64(row3h, row4h); - row2l = _mm_xor_si128(row2l, row3l); - row2h = _mm_xor_si128(row2h, row3h); - row2l = _mm_shuffle_epi8(row2l, r24); - row2h = _mm_shuffle_epi8(row2h, r24); - - b0 = _mm_unpacklo_epi64(m5, m4); - b1 = _mm_alignr_epi8(m3, m7, 8); - - row1l = _mm_add_epi64(_mm_add_epi64(row1l, b0), row2l); - row1h = _mm_add_epi64(_mm_add_epi64(row1h, b1), row2h); - row4l = _mm_xor_si128(row4l, row1l); - row4h = _mm_xor_si128(row4h, row1h); - row4l = _mm_shuffle_epi8(row4l, r16); - row4h = _mm_shuffle_epi8(row4h, r16); - row3l = _mm_add_epi64(row3l, row4l); - row3h = _mm_add_epi64(row3h, row4h); - row2l = _mm_xor_si128(row2l, row3l); - row2h = _mm_xor_si128(row2h, row3h); - row2l = _mm_xor_si128(_mm_srli_epi64(row2l, 63), _mm_add_epi64(row2l, row2l)); - row2h = _mm_xor_si128(_mm_srli_epi64(row2h, 63), _mm_add_epi64(row2h, row2h)); - - t0 = _mm_alignr_epi8(row2h, row2l, 8); - t1 = _mm_alignr_epi8(row2l, row2h, 8); - row2l = t0, row2h = t1, t0 = row3l, row3l = row3h, row3h = t0; - t0 = _mm_alignr_epi8(row4h, row4l, 8); - t1 = _mm_alignr_epi8(row4l, row4h, 8); - row4l = t1, row4h = t0; - - b0 = _mm_shuffle_epi32(m0, _MM_SHUFFLE(1,0,3,2)); - b1 = _mm_unpackhi_epi64(m5, m2); - - row1l = _mm_add_epi64(_mm_add_epi64(row1l, b0), row2l); - row1h = _mm_add_epi64(_mm_add_epi64(row1h, b1), row2h); - row4l = _mm_xor_si128(row4l, row1l); - row4h = _mm_xor_si128(row4h, row1h); - row4l = _mm_shuffle_epi32(row4l, _MM_SHUFFLE(2,3,0,1)); - row4h = _mm_shuffle_epi32(row4h, _MM_SHUFFLE(2,3,0,1)); - row3l = _mm_add_epi64(row3l, row4l); - row3h = 
_mm_add_epi64(row3h, row4h); - row2l = _mm_xor_si128(row2l, row3l); - row2h = _mm_xor_si128(row2h, row3h); - row2l = _mm_shuffle_epi8(row2l, r24); - row2h = _mm_shuffle_epi8(row2h, r24); - - b0 = _mm_unpacklo_epi64(m6, m1); - b1 = _mm_unpackhi_epi64(m3, m1); - - row1l = _mm_add_epi64(_mm_add_epi64(row1l, b0), row2l); - row1h = _mm_add_epi64(_mm_add_epi64(row1h, b1), row2h); - row4l = _mm_xor_si128(row4l, row1l); - row4h = _mm_xor_si128(row4h, row1h); - row4l = _mm_shuffle_epi8(row4l, r16); - row4h = _mm_shuffle_epi8(row4h, r16); - row3l = _mm_add_epi64(row3l, row4l); - row3h = _mm_add_epi64(row3h, row4h); - row2l = _mm_xor_si128(row2l, row3l); - row2h = _mm_xor_si128(row2h, row3h); - row2l = _mm_xor_si128(_mm_srli_epi64(row2l, 63), _mm_add_epi64(row2l, row2l)); - row2h = _mm_xor_si128(_mm_srli_epi64(row2h, 63), _mm_add_epi64(row2h, row2h)); - - t0 = _mm_alignr_epi8(row2l, row2h, 8); - t1 = _mm_alignr_epi8(row2h, row2l, 8); - row2l = t0, row2h = t1, t0 = row3l, row3l = row3h, row3h = t0; - t0 = _mm_alignr_epi8(row4l, row4h, 8); - t1 = _mm_alignr_epi8(row4h, row4l, 8); - row4l = t1, row4h = t0; - - row1l = _mm_xor_si128(row3l, row1l); - row1h = _mm_xor_si128(row3h, row1h); - _mm_storeu_si128(M128I_CAST(&state.h[0]), _mm_xor_si128(_mm_loadu_si128(CONST_M128I_CAST(&state.h[0])), row1l)); - _mm_storeu_si128(M128I_CAST(&state.h[2]), _mm_xor_si128(_mm_loadu_si128(CONST_M128I_CAST(&state.h[2])), row1h)); - - row2l = _mm_xor_si128(row4l, row2l); - row2h = _mm_xor_si128(row4h, row2h); - _mm_storeu_si128(M128I_CAST(&state.h[4]), _mm_xor_si128(_mm_loadu_si128(CONST_M128I_CAST(&state.h[4])), row2l)); - _mm_storeu_si128(M128I_CAST(&state.h[6]), _mm_xor_si128(_mm_loadu_si128(CONST_M128I_CAST(&state.h[6])), row2h)); -} -#endif // CRYPTOPP_BOOL_SSE4_INTRINSICS_AVAILABLE - -// Disable NEON for Cortex-A53 and A57. 
Also see http://github.com/weidai11/cryptopp/issues/367 -#if CRYPTOPP_BOOL_ARM32 && CRYPTOPP_BOOL_NEON_INTRINSICS_AVAILABLE -static void BLAKE2_NEON_Compress32(const byte* input, BLAKE2_State& state) -{ - #define BLAKE2S_LOAD_MSG_0_1(buf) \ - do { uint32x2_t t0, t1; \ - t0 = vzip_u32(vget_low_u32(m0), vget_high_u32(m0)).val[0]; \ - t1 = vzip_u32(vget_low_u32(m1), vget_high_u32(m1)).val[0]; \ - buf = vcombine_u32(t0, t1); } while(0) - - #define BLAKE2S_LOAD_MSG_0_2(buf) \ - do { uint32x2_t t0, t1; \ - t0 = vzip_u32(vget_low_u32(m0), vget_high_u32(m0)).val[1]; \ - t1 = vzip_u32(vget_low_u32(m1), vget_high_u32(m1)).val[1]; \ - buf = vcombine_u32(t0, t1); } while(0) - - #define BLAKE2S_LOAD_MSG_0_3(buf) \ - do { uint32x2_t t0, t1; \ - t0 = vzip_u32(vget_low_u32(m2), vget_high_u32(m2)).val[0]; \ - t1 = vzip_u32(vget_low_u32(m3), vget_high_u32(m3)).val[0]; \ - buf = vcombine_u32(t0, t1); } while(0) - - #define BLAKE2S_LOAD_MSG_0_4(buf) \ - do { uint32x2_t t0, t1; \ - t0 = vzip_u32(vget_low_u32(m2), vget_high_u32(m2)).val[1]; \ - t1 = vzip_u32(vget_low_u32(m3), vget_high_u32(m3)).val[1]; \ - buf = vcombine_u32(t0, t1); } while(0) - - #define BLAKE2S_LOAD_MSG_1_1(buf) \ - do { uint32x2_t t0, t1; \ - t0 = vzip_u32(vget_high_u32(m3), vget_low_u32(m1)).val[0]; \ - t1 = vzip_u32(vget_low_u32(m2), vget_low_u32(m3)).val[1]; \ - buf = vcombine_u32(t0, t1); } while(0) - - #define BLAKE2S_LOAD_MSG_1_2(buf) \ - do { uint32x2_t t0, t1; \ - t0 = vzip_u32(vget_high_u32(m2), vget_low_u32(m2)).val[0]; \ - t1 = vext_u32(vget_high_u32(m3), vget_high_u32(m1), 1); \ - buf = vcombine_u32(t0, t1); } while(0) - - #define BLAKE2S_LOAD_MSG_1_3(buf) \ - do { uint32x2_t t0, t1; \ - t0 = vext_u32(vget_low_u32(m0), vget_low_u32(m0), 1); \ - t1 = vzip_u32(vget_high_u32(m2), vget_low_u32(m1)).val[1]; \ - buf = vcombine_u32(t0, t1); } while(0) - - #define BLAKE2S_LOAD_MSG_1_4(buf) \ - do { uint32x2_t t0, t1; \ - t0 = vzip_u32(vget_low_u32(m3), vget_high_u32(m0)).val[0]; \ - t1 = vzip_u32(vget_high_u32(m1), vget_high_u32(m0)).val[1]; \ - buf = vcombine_u32(t0, t1); } while(0) - - #define BLAKE2S_LOAD_MSG_2_1(buf) \ - do { uint32x2_t t0, t1; \ - t0 = vext_u32(vget_high_u32(m2), vget_low_u32(m3), 1); \ - t1 = vzip_u32(vget_low_u32(m1), vget_high_u32(m3)).val[1]; \ - buf = vcombine_u32(t0, t1); } while(0) - - #define BLAKE2S_LOAD_MSG_2_2(buf) \ - do { uint32x2_t t0, t1; \ - t0 = vzip_u32(vget_low_u32(m2), vget_low_u32(m0)).val[0]; \ - t1 = vbsl_u32(vcreate_u32(0xFFFFFFFF), vget_high_u32(m0), vget_low_u32(m3)); \ - buf = vcombine_u32(t0, t1); } while(0) - - #define BLAKE2S_LOAD_MSG_2_3(buf) \ - do { uint32x2_t t0, t1; \ - t0 = vbsl_u32(vcreate_u32(0xFFFFFFFF), vget_high_u32(m2), vget_high_u32(m0)); \ - t1 = vzip_u32(vget_high_u32(m1), vget_low_u32(m2)).val[1]; \ - buf = vcombine_u32(t0, t1); } while(0) - - #define BLAKE2S_LOAD_MSG_2_4(buf) \ - do { uint32x2_t t0, t1; \ - t0 = vzip_u32(vget_high_u32(m3), vget_high_u32(m1)).val[0]; \ - t1 = vext_u32(vget_low_u32(m0), vget_low_u32(m1), 1); \ - buf = vcombine_u32(t0, t1); } while(0) - - #define BLAKE2S_LOAD_MSG_3_1(buf) \ - do { uint32x2_t t0, t1; \ - t0 = vzip_u32(vget_high_u32(m1), vget_high_u32(m0)).val[1]; \ - t1 = vzip_u32(vget_low_u32(m3), vget_high_u32(m2)).val[1]; \ - buf = vcombine_u32(t0, t1); } while(0) - - #define BLAKE2S_LOAD_MSG_3_2(buf) \ - do { uint32x2_t t0, t1; \ - t0 = vzip_u32(vget_low_u32(m2), vget_low_u32(m0)).val[1]; \ - t1 = vzip_u32(vget_low_u32(m3), vget_high_u32(m3)).val[0]; \ - buf = vcombine_u32(t0, t1); } while(0) - - #define BLAKE2S_LOAD_MSG_3_3(buf) \ - 
do { uint32x2_t t0, t1; \ - t0 = vbsl_u32(vcreate_u32(0xFFFFFFFF), vget_high_u32(m0), vget_low_u32(m1)); \ - t1 = vbsl_u32(vcreate_u32(0xFFFFFFFF), vget_low_u32(m1), vget_high_u32(m3)); \ - buf = vcombine_u32(t0, t1); } while(0) - - #define BLAKE2S_LOAD_MSG_3_4(buf) \ - do { uint32x2_t t0, t1; \ - t0 = vzip_u32(vget_high_u32(m1), vget_high_u32(m2)).val[0]; \ - t1 = vzip_u32(vget_low_u32(m0), vget_low_u32(m2)).val[0]; \ - buf = vcombine_u32(t0, t1); } while(0) - - #define BLAKE2S_LOAD_MSG_4_1(buf) \ - do { uint32x2_t t0, t1; \ - t0 = vzip_u32(vget_low_u32(m2), vget_low_u32(m1)).val[1]; \ - t1 = vzip_u32((vget_high_u32(m0)), vget_high_u32(m2)).val[0]; \ - buf = vcombine_u32(t0, t1); } while(0) - - #define BLAKE2S_LOAD_MSG_4_2(buf) \ - do { uint32x2_t t0, t1; \ - t0 = vbsl_u32(vcreate_u32(0xFFFFFFFF), vget_low_u32(m0), vget_high_u32(m1)); \ - t1 = vbsl_u32(vcreate_u32(0xFFFFFFFF), vget_low_u32(m1), vget_high_u32(m3)); \ - buf = vcombine_u32(t0, t1); } while(0) - - #define BLAKE2S_LOAD_MSG_4_3(buf) \ - do { uint32x2_t t0, t1; \ - t0 = vbsl_u32(vcreate_u32(0xFFFFFFFF), vget_high_u32(m3), vget_high_u32(m2)); \ - t1 = vbsl_u32(vcreate_u32(0xFFFFFFFF), vget_high_u32(m1), vget_high_u32(m0)); \ - buf = vcombine_u32(t0, t1); } while(0) - - #define BLAKE2S_LOAD_MSG_4_4(buf) \ - do { uint32x2_t t0, t1; \ - t0 = vext_u32(vget_low_u32(m0), vget_low_u32(m3), 1); \ - t1 = vbsl_u32(vcreate_u32(0xFFFFFFFF), vget_low_u32(m2), vget_low_u32(m3)); \ - buf = vcombine_u32(t0, t1); } while(0) - - #define BLAKE2S_LOAD_MSG_5_1(buf) \ - do { uint32x2_t t0, t1; \ - t0 = vzip_u32((vget_high_u32(m0)), vget_high_u32(m1)).val[0]; \ - t1 = vzip_u32(vget_low_u32(m0), vget_low_u32(m2)).val[0]; \ - buf = vcombine_u32(t0, t1); } while(0) - - #define BLAKE2S_LOAD_MSG_5_2(buf) \ - do { uint32x2_t t0, t1; \ - t0 = vzip_u32(vget_low_u32(m3), vget_high_u32(m2)).val[0]; \ - t1 = vzip_u32(vget_high_u32(m2), vget_high_u32(m0)).val[1]; \ - buf = vcombine_u32(t0, t1); } while(0) - - #define BLAKE2S_LOAD_MSG_5_3(buf) \ - do { uint32x2_t t0, t1; \ - t0 = vbsl_u32(vcreate_u32(0xFFFFFFFF), vget_low_u32(m1), vget_high_u32(m1)); \ - t1 = vzip_u32(vget_high_u32(m3), vget_low_u32(m0)).val[1]; \ - buf = vcombine_u32(t0, t1); } while(0) - - #define BLAKE2S_LOAD_MSG_5_4(buf) \ - do { uint32x2_t t0, t1; \ - t0 = vzip_u32(vget_low_u32(m3), vget_low_u32(m1)).val[1]; \ - t1 = vbsl_u32(vcreate_u32(0xFFFFFFFF), vget_high_u32(m3), vget_low_u32(m2)); \ - buf = vcombine_u32(t0, t1); } while(0) - - #define BLAKE2S_LOAD_MSG_6_1(buf) \ - do { uint32x2_t t0, t1; \ - t0 = vbsl_u32(vcreate_u32(0xFFFFFFFF), vget_low_u32(m3), vget_low_u32(m0)); \ - t1 = vzip_u32(vget_high_u32(m3), vget_low_u32(m1)).val[0]; \ - buf = vcombine_u32(t0, t1); } while(0) - - #define BLAKE2S_LOAD_MSG_6_2(buf) \ - do { uint32x2_t t0, t1; \ - t0 = vzip_u32(vget_low_u32(m1), vget_high_u32(m3)).val[1]; \ - t1 = vext_u32(vget_low_u32(m3), vget_high_u32(m2), 1); \ - buf = vcombine_u32(t0, t1); } while(0) - - #define BLAKE2S_LOAD_MSG_6_3(buf) \ - do { uint32x2_t t0, t1; \ - t0 = vzip_u32(vget_low_u32(m0), vget_high_u32(m1)).val[0]; \ - t1 = vext_u32(vget_low_u32(m2), vget_low_u32(m2), 1); \ - buf = vcombine_u32(t0, t1); } while(0) - - #define BLAKE2S_LOAD_MSG_6_4(buf) \ - do { uint32x2_t t0, t1; \ - t0 = vzip_u32(vget_high_u32(m1), vget_high_u32(m0)).val[1]; \ - t1 = vbsl_u32(vcreate_u32(0xFFFFFFFF), vget_high_u32(m0), vget_high_u32(m2)); \ - buf = vcombine_u32(t0, t1); } while(0) - - #define BLAKE2S_LOAD_MSG_7_1(buf) \ - do { uint32x2_t t0, t1; \ - t0 = vzip_u32(vget_low_u32(m3), 
vget_high_u32(m1)).val[1]; \ - t1 = vbsl_u32(vcreate_u32(0xFFFFFFFF), vget_low_u32(m3), vget_high_u32(m0)); \ - buf = vcombine_u32(t0, t1); } while(0) - - #define BLAKE2S_LOAD_MSG_7_2(buf) \ - do { uint32x2_t t0, t1; \ - t0 = vext_u32(vget_high_u32(m2), vget_high_u32(m3), 1); \ - t1 = vzip_u32(vget_low_u32(m0), vget_low_u32(m2)).val[1]; \ - buf = vcombine_u32(t0, t1); } while(0) - - #define BLAKE2S_LOAD_MSG_7_3(buf) \ - do { uint32x2_t t0, t1; \ - t0 = vzip_u32(vget_low_u32(m1), vget_high_u32(m3)).val[1]; \ - t1 = vzip_u32(vget_low_u32(m2), vget_high_u32(m0)).val[0]; \ - buf = vcombine_u32(t0, t1); } while(0) - - #define BLAKE2S_LOAD_MSG_7_4(buf) \ - do { uint32x2_t t0, t1; \ - t0 = vzip_u32(vget_low_u32(m0), vget_low_u32(m1)).val[0]; \ - t1 = vzip_u32(vget_high_u32(m1), vget_high_u32(m2)).val[0]; \ - buf = vcombine_u32(t0, t1); } while(0) - - #define BLAKE2S_LOAD_MSG_8_1(buf) \ - do { uint32x2_t t0, t1; \ - t0 = vzip_u32(vget_high_u32(m1), vget_high_u32(m3)).val[0]; \ - t1 = vext_u32(vget_high_u32(m2), vget_low_u32(m0), 1); \ - buf = vcombine_u32(t0, t1); } while(0) - - #define BLAKE2S_LOAD_MSG_8_2(buf) \ - do { uint32x2_t t0, t1; \ - t0 = vzip_u32(vget_high_u32(m3), vget_low_u32(m2)).val[1]; \ - t1 = vext_u32(vget_high_u32(m0), vget_low_u32(m2), 1); \ - buf = vcombine_u32(t0, t1); } while(0) - - #define BLAKE2S_LOAD_MSG_8_3(buf) \ - do { uint32x2_t t0, t1; \ - t0 = vbsl_u32(vcreate_u32(0xFFFFFFFF), vget_low_u32(m3), vget_low_u32(m3)); \ - t1 = vext_u32(vget_low_u32(m0), vget_high_u32(m2), 1); \ - buf = vcombine_u32(t0, t1); } while(0) - - #define BLAKE2S_LOAD_MSG_8_4(buf) \ - do { uint32x2_t t0, t1; \ - t0 = vbsl_u32(vcreate_u32(0xFFFFFFFF), vget_high_u32(m0), vget_high_u32(m1)); \ - t1 = vbsl_u32(vcreate_u32(0xFFFFFFFF), vget_low_u32(m1), vget_low_u32(m1)); \ - buf = vcombine_u32(t0, t1); } while(0) - - #define BLAKE2S_LOAD_MSG_9_1(buf) \ - do { uint32x2_t t0, t1; \ - t0 = vzip_u32(vget_high_u32(m2), vget_low_u32(m2)).val[0]; \ - t1 = vzip_u32(vget_high_u32(m1), vget_low_u32(m0)).val[1]; \ - buf = vcombine_u32(t0, t1); } while(0) - - #define BLAKE2S_LOAD_MSG_9_2(buf) \ - do { uint32x2_t t0, t1; \ - t0 = vzip_u32((vget_high_u32(m0)), vget_low_u32(m1)).val[0]; \ - t1 = vbsl_u32(vcreate_u32(0xFFFFFFFF), vget_high_u32(m1), vget_low_u32(m1)); \ - buf = vcombine_u32(t0, t1); } while(0) - - #define BLAKE2S_LOAD_MSG_9_3(buf) \ - do { uint32x2_t t0, t1; \ - t0 = vzip_u32(vget_high_u32(m3), vget_low_u32(m2)).val[1]; \ - t1 = vzip_u32((vget_high_u32(m0)), vget_low_u32(m3)).val[1]; \ - buf = vcombine_u32(t0, t1); } while(0) - - #define BLAKE2S_LOAD_MSG_9_4(buf) \ - do { uint32x2_t t0, t1; \ - t0 = vext_u32(vget_high_u32(m2), vget_high_u32(m3), 1); \ - t1 = vzip_u32(vget_low_u32(m3), vget_low_u32(m0)).val[0]; \ - buf = vcombine_u32(t0, t1); } while(0) - - #define vrorq_n_u32_16(x) vreinterpretq_u32_u16(vrev32q_u16(vreinterpretq_u16_u32(x))) - - #define vrorq_n_u32_8(x) vsriq_n_u32(vshlq_n_u32((x), 24), (x), 8) - - #define vrorq_n_u32(x, c) vsriq_n_u32(vshlq_n_u32((x), 32-(c)), (x), (c)) - - #define BLAKE2S_G1(row1,row2,row3,row4,buf) \ - do { \ - row1 = vaddq_u32(vaddq_u32(row1, buf), row2); row4 = veorq_u32(row4, row1); \ - row4 = vrorq_n_u32_16(row4); row3 = vaddq_u32(row3, row4); \ - row2 = veorq_u32(row2, row3); row2 = vrorq_n_u32(row2, 12); \ - } while(0) - - #define BLAKE2S_G2(row1,row2,row3,row4,buf) \ - do { \ - row1 = vaddq_u32(vaddq_u32(row1, buf), row2); row4 = veorq_u32(row4, row1); \ - row4 = vrorq_n_u32_8(row4); row3 = vaddq_u32(row3, row4); \ - row2 = veorq_u32(row2, row3); row2 = 
vrorq_n_u32(row2, 7); \ - } while(0) - - #define BLAKE2S_DIAGONALIZE(row1,row2,row3,row4) \ - do { \ - row4 = vextq_u32(row4, row4, 3); row3 = vextq_u32(row3, row3, 2); row2 = vextq_u32(row2, row2, 1); \ - } while(0) - - #define BLAKE2S_UNDIAGONALIZE(row1,row2,row3,row4) \ - do { \ - row4 = vextq_u32(row4, row4, 1); \ - row3 = vextq_u32(row3, row3, 2); \ - row2 = vextq_u32(row2, row2, 3); \ - } while(0) - - #define BLAKE2S_ROUND(r) \ - do { \ - uint32x4_t buf1, buf2, buf3, buf4; \ - BLAKE2S_LOAD_MSG_ ##r ##_1(buf1); \ - BLAKE2S_G1(row1,row2,row3,row4,buf1); \ - BLAKE2S_LOAD_MSG_ ##r ##_2(buf2); \ - BLAKE2S_G2(row1,row2,row3,row4,buf2); \ - BLAKE2S_DIAGONALIZE(row1,row2,row3,row4); \ - BLAKE2S_LOAD_MSG_ ##r ##_3(buf3); \ - BLAKE2S_G1(row1,row2,row3,row4,buf3); \ - BLAKE2S_LOAD_MSG_ ##r ##_4(buf4); \ - BLAKE2S_G2(row1,row2,row3,row4,buf4); \ - BLAKE2S_UNDIAGONALIZE(row1,row2,row3,row4); \ - } while(0) - - CRYPTOPP_ASSERT(IsAlignedOn(&state.h[0],GetAlignmentOf())); - CRYPTOPP_ASSERT(IsAlignedOn(&state.t[0],GetAlignmentOf())); - CRYPTOPP_ASSERT(IsAlignedOn(&state.f[0],GetAlignmentOf())); - - const uint32x4_t m0 = vreinterpretq_u32_u8(vld1q_u8((input + 00))); - const uint32x4_t m1 = vreinterpretq_u32_u8(vld1q_u8((input + 16))); - const uint32x4_t m2 = vreinterpretq_u32_u8(vld1q_u8((input + 32))); - const uint32x4_t m3 = vreinterpretq_u32_u8(vld1q_u8((input + 48))); - - uint32x4_t row1, row2, row3, row4; - - const uint32x4_t f0 = row1 = vld1q_u32(&state.h[0]); - const uint32x4_t f1 = row2 = vld1q_u32(&state.h[4]); - row3 = vld1q_u32(&BLAKE2S_IV(0)); - row4 = veorq_u32(vld1q_u32(&BLAKE2S_IV(4)), vld1q_u32(&state.t[0])); - - BLAKE2S_ROUND(0); - BLAKE2S_ROUND(1); - BLAKE2S_ROUND(2); - BLAKE2S_ROUND(3); - BLAKE2S_ROUND(4); - BLAKE2S_ROUND(5); - BLAKE2S_ROUND(6); - BLAKE2S_ROUND(7); - BLAKE2S_ROUND(8); - BLAKE2S_ROUND(9); - - vst1q_u32(&state.h[0], veorq_u32(f0, veorq_u32(row1, row3))); - vst1q_u32(&state.h[4], veorq_u32(f1, veorq_u32(row2, row4))); -} - -static void BLAKE2_NEON_Compress64(const byte* input, BLAKE2_State& state) -{ - #define BLAKE2B_LOAD_MSG_0_1(b0, b1) \ - do { b0 = vcombine_u64(vget_low_u64(m0), vget_low_u64(m1)); b1 = vcombine_u64(vget_low_u64(m2), vget_low_u64(m3)); } while(0) - - #define BLAKE2B_LOAD_MSG_0_2(b0, b1) \ - do { b0 = vcombine_u64(vget_high_u64(m0), vget_high_u64(m1)); b1 = vcombine_u64(vget_high_u64(m2), vget_high_u64(m3)); } while(0) - - #define BLAKE2B_LOAD_MSG_0_3(b0, b1) \ - do { b0 = vcombine_u64(vget_low_u64(m4), vget_low_u64(m5)); b1 = vcombine_u64(vget_low_u64(m6), vget_low_u64(m7)); } while(0) - - #define BLAKE2B_LOAD_MSG_0_4(b0, b1) \ - do { b0 = vcombine_u64(vget_high_u64(m4), vget_high_u64(m5)); b1 = vcombine_u64(vget_high_u64(m6), vget_high_u64(m7)); } while(0) - - #define BLAKE2B_LOAD_MSG_1_1(b0, b1) \ - do { b0 = vcombine_u64(vget_low_u64(m7), vget_low_u64(m2)); b1 = vcombine_u64(vget_high_u64(m4), vget_high_u64(m6)); } while(0) - - #define BLAKE2B_LOAD_MSG_1_2(b0, b1) \ - do { b0 = vcombine_u64(vget_low_u64(m5), vget_low_u64(m4)); b1 = vextq_u64(m7, m3, 1); } while(0) - - #define BLAKE2B_LOAD_MSG_1_3(b0, b1) \ - do { b0 = vextq_u64(m0, m0, 1); b1 = vcombine_u64(vget_high_u64(m5), vget_high_u64(m2)); } while(0) - - #define BLAKE2B_LOAD_MSG_1_4(b0, b1) \ - do { b0 = vcombine_u64(vget_low_u64(m6), vget_low_u64(m1)); b1 = vcombine_u64(vget_high_u64(m3), vget_high_u64(m1)); } while(0) - - #define BLAKE2B_LOAD_MSG_2_1(b0, b1) \ - do { b0 = vextq_u64(m5, m6, 1); b1 = vcombine_u64(vget_high_u64(m2), vget_high_u64(m7)); } while(0) - - #define 
BLAKE2B_LOAD_MSG_2_2(b0, b1) \ - do { b0 = vcombine_u64(vget_low_u64(m4), vget_low_u64(m0)); b1 = vcombine_u64(vget_low_u64(m1), vget_high_u64(m6)); } while(0) - - #define BLAKE2B_LOAD_MSG_2_3(b0, b1) \ - do { b0 = vcombine_u64(vget_low_u64(m5), vget_high_u64(m1)); b1 = vcombine_u64(vget_high_u64(m3), vget_high_u64(m4)); } while(0) - - #define BLAKE2B_LOAD_MSG_2_4(b0, b1) \ - do { b0 = vcombine_u64(vget_low_u64(m7), vget_low_u64(m3)); b1 = vextq_u64(m0, m2, 1); } while(0) - - #define BLAKE2B_LOAD_MSG_3_1(b0, b1) \ - do { b0 = vcombine_u64(vget_high_u64(m3), vget_high_u64(m1)); b1 = vcombine_u64(vget_high_u64(m6), vget_high_u64(m5)); } while(0) - - #define BLAKE2B_LOAD_MSG_3_2(b0, b1) \ - do { b0 = vcombine_u64(vget_high_u64(m4), vget_high_u64(m0)); b1 = vcombine_u64(vget_low_u64(m6), vget_low_u64(m7)); } while(0) - - #define BLAKE2B_LOAD_MSG_3_3(b0, b1) \ - do { b0 = vcombine_u64(vget_low_u64(m1), vget_high_u64(m2)); b1 = vcombine_u64(vget_low_u64(m2), vget_high_u64(m7)); } while(0) - - #define BLAKE2B_LOAD_MSG_3_4(b0, b1) \ - do { b0 = vcombine_u64(vget_low_u64(m3), vget_low_u64(m5)); b1 = vcombine_u64(vget_low_u64(m0), vget_low_u64(m4)); } while(0) - - #define BLAKE2B_LOAD_MSG_4_1(b0, b1) \ - do { b0 = vcombine_u64(vget_high_u64(m4), vget_high_u64(m2)); b1 = vcombine_u64(vget_low_u64(m1), vget_low_u64(m5)); } while(0) - - #define BLAKE2B_LOAD_MSG_4_2(b0, b1) \ - do { b0 = vcombine_u64(vget_low_u64(m0), vget_high_u64(m3)); b1 = vcombine_u64(vget_low_u64(m2), vget_high_u64(m7)); } while(0) - - #define BLAKE2B_LOAD_MSG_4_3(b0, b1) \ - do { b0 = vcombine_u64(vget_low_u64(m7), vget_high_u64(m5)); b1 = vcombine_u64(vget_low_u64(m3), vget_high_u64(m1)); } while(0) - - #define BLAKE2B_LOAD_MSG_4_4(b0, b1) \ - do { b0 = vextq_u64(m0, m6, 1); b1 = vcombine_u64(vget_low_u64(m4), vget_high_u64(m6)); } while(0) - - #define BLAKE2B_LOAD_MSG_5_1(b0, b1) \ - do { b0 = vcombine_u64(vget_low_u64(m1), vget_low_u64(m3)); b1 = vcombine_u64(vget_low_u64(m0), vget_low_u64(m4)); } while(0) - - #define BLAKE2B_LOAD_MSG_5_2(b0, b1) \ - do { b0 = vcombine_u64(vget_low_u64(m6), vget_low_u64(m5)); b1 = vcombine_u64(vget_high_u64(m5), vget_high_u64(m1)); } while(0) - - #define BLAKE2B_LOAD_MSG_5_3(b0, b1) \ - do { b0 = vcombine_u64(vget_low_u64(m2), vget_high_u64(m3)); b1 = vcombine_u64(vget_high_u64(m7), vget_high_u64(m0)); } while(0) - - #define BLAKE2B_LOAD_MSG_5_4(b0, b1) \ - do { b0 = vcombine_u64(vget_high_u64(m6), vget_high_u64(m2)); b1 = vcombine_u64(vget_low_u64(m7), vget_high_u64(m4)); } while(0) - - #define BLAKE2B_LOAD_MSG_6_1(b0, b1) \ - do { b0 = vcombine_u64(vget_low_u64(m6), vget_high_u64(m0)); b1 = vcombine_u64(vget_low_u64(m7), vget_low_u64(m2)); } while(0) - - #define BLAKE2B_LOAD_MSG_6_2(b0, b1) \ - do { b0 = vcombine_u64(vget_high_u64(m2), vget_high_u64(m7)); b1 = vextq_u64(m6, m5, 1); } while(0) - - #define BLAKE2B_LOAD_MSG_6_3(b0, b1) \ - do { b0 = vcombine_u64(vget_low_u64(m0), vget_low_u64(m3)); b1 = vextq_u64(m4, m4, 1); } while(0) - - #define BLAKE2B_LOAD_MSG_6_4(b0, b1) \ - do { b0 = vcombine_u64(vget_high_u64(m3), vget_high_u64(m1)); b1 = vcombine_u64(vget_low_u64(m1), vget_high_u64(m5)); } while(0) - - #define BLAKE2B_LOAD_MSG_7_1(b0, b1) \ - do { b0 = vcombine_u64(vget_high_u64(m6), vget_high_u64(m3)); b1 = vcombine_u64(vget_low_u64(m6), vget_high_u64(m1)); } while(0) - - #define BLAKE2B_LOAD_MSG_7_2(b0, b1) \ - do { b0 = vextq_u64(m5, m7, 1); b1 = vcombine_u64(vget_high_u64(m0), vget_high_u64(m4)); } while(0) - - #define BLAKE2B_LOAD_MSG_7_3(b0, b1) \ - do { b0 = 
vcombine_u64(vget_high_u64(m2), vget_high_u64(m7)); b1 = vcombine_u64(vget_low_u64(m4), vget_low_u64(m1)); } while(0) - - #define BLAKE2B_LOAD_MSG_7_4(b0, b1) \ - do { b0 = vcombine_u64(vget_low_u64(m0), vget_low_u64(m2)); b1 = vcombine_u64(vget_low_u64(m3), vget_low_u64(m5)); } while(0) - - #define BLAKE2B_LOAD_MSG_8_1(b0, b1) \ - do { b0 = vcombine_u64(vget_low_u64(m3), vget_low_u64(m7)); b1 = vextq_u64(m5, m0, 1); } while(0) - - #define BLAKE2B_LOAD_MSG_8_2(b0, b1) \ - do { b0 = vcombine_u64(vget_high_u64(m7), vget_high_u64(m4)); b1 = vextq_u64(m1, m4, 1); } while(0) - - #define BLAKE2B_LOAD_MSG_8_3(b0, b1) \ - do { b0 = m6; b1 = vextq_u64(m0, m5, 1); } while(0) - - #define BLAKE2B_LOAD_MSG_8_4(b0, b1) \ - do { b0 = vcombine_u64(vget_low_u64(m1), vget_high_u64(m3)); b1 = m2; } while(0) - - #define BLAKE2B_LOAD_MSG_9_1(b0, b1) \ - do { b0 = vcombine_u64(vget_low_u64(m5), vget_low_u64(m4)); b1 = vcombine_u64(vget_high_u64(m3), vget_high_u64(m0)); } while(0) - - #define BLAKE2B_LOAD_MSG_9_2(b0, b1) \ - do { b0 = vcombine_u64(vget_low_u64(m1), vget_low_u64(m2)); b1 = vcombine_u64(vget_low_u64(m3), vget_high_u64(m2)); } while(0) - - #define BLAKE2B_LOAD_MSG_9_3(b0, b1) \ - do { b0 = vcombine_u64(vget_high_u64(m7), vget_high_u64(m4)); b1 = vcombine_u64(vget_high_u64(m1), vget_high_u64(m6)); } while(0) - - #define BLAKE2B_LOAD_MSG_9_4(b0, b1) \ - do { b0 = vextq_u64(m5, m7, 1); b1 = vcombine_u64(vget_low_u64(m6), vget_low_u64(m0)); } while(0) - - #define BLAKE2B_LOAD_MSG_10_1(b0, b1) \ - do { b0 = vcombine_u64(vget_low_u64(m0), vget_low_u64(m1)); b1 = vcombine_u64(vget_low_u64(m2), vget_low_u64(m3)); } while(0) - - #define BLAKE2B_LOAD_MSG_10_2(b0, b1) \ - do { b0 = vcombine_u64(vget_high_u64(m0), vget_high_u64(m1)); b1 = vcombine_u64(vget_high_u64(m2), vget_high_u64(m3)); } while(0) - - #define BLAKE2B_LOAD_MSG_10_3(b0, b1) \ - do { b0 = vcombine_u64(vget_low_u64(m4), vget_low_u64(m5)); b1 = vcombine_u64(vget_low_u64(m6), vget_low_u64(m7)); } while(0) - - #define BLAKE2B_LOAD_MSG_10_4(b0, b1) \ - do { b0 = vcombine_u64(vget_high_u64(m4), vget_high_u64(m5)); b1 = vcombine_u64(vget_high_u64(m6), vget_high_u64(m7)); } while(0) - - #define BLAKE2B_LOAD_MSG_11_1(b0, b1) \ - do { b0 = vcombine_u64(vget_low_u64(m7), vget_low_u64(m2)); b1 = vcombine_u64(vget_high_u64(m4), vget_high_u64(m6)); } while(0) - - #define BLAKE2B_LOAD_MSG_11_2(b0, b1) \ - do { b0 = vcombine_u64(vget_low_u64(m5), vget_low_u64(m4)); b1 = vextq_u64(m7, m3, 1); } while(0) - - #define BLAKE2B_LOAD_MSG_11_3(b0, b1) \ - do { b0 = vextq_u64(m0, m0, 1); b1 = vcombine_u64(vget_high_u64(m5), vget_high_u64(m2)); } while(0) - - #define BLAKE2B_LOAD_MSG_11_4(b0, b1) \ - do { b0 = vcombine_u64(vget_low_u64(m6), vget_low_u64(m1)); b1 = vcombine_u64(vget_high_u64(m3), vget_high_u64(m1)); } while(0) - - #define vrorq_n_u64_32(x) vreinterpretq_u64_u32(vrev64q_u32(vreinterpretq_u32_u64((x)))) - - #define vrorq_n_u64_24(x) vcombine_u64(\ - vreinterpret_u64_u8(vext_u8(vreinterpret_u8_u64(vget_low_u64(x)), vreinterpret_u8_u64(vget_low_u64(x)), 3)), \ - vreinterpret_u64_u8(vext_u8(vreinterpret_u8_u64(vget_high_u64(x)), vreinterpret_u8_u64(vget_high_u64(x)), 3))) - - #define vrorq_n_u64_16(x) vcombine_u64(\ - vreinterpret_u64_u8(vext_u8(vreinterpret_u8_u64(vget_low_u64(x)), vreinterpret_u8_u64(vget_low_u64(x)), 2)), \ - vreinterpret_u64_u8(vext_u8(vreinterpret_u8_u64(vget_high_u64(x)), vreinterpret_u8_u64(vget_high_u64(x)), 2))) - - #define vrorq_n_u64_63(x) veorq_u64(vaddq_u64(x, x), vshrq_n_u64(x, 63)) - - #define 
BLAKE2B_G1(row1l,row2l,row3l,row4l,row1h,row2h,row3h,row4h,b0,b1) \ - do { \ - row1l = vaddq_u64(vaddq_u64(row1l, b0), row2l); \ - row1h = vaddq_u64(vaddq_u64(row1h, b1), row2h); \ - row4l = veorq_u64(row4l, row1l); row4h = veorq_u64(row4h, row1h); \ - row4l = vrorq_n_u64_32(row4l); row4h = vrorq_n_u64_32(row4h); \ - row3l = vaddq_u64(row3l, row4l); row3h = vaddq_u64(row3h, row4h); \ - row2l = veorq_u64(row2l, row3l); row2h = veorq_u64(row2h, row3h); \ - row2l = vrorq_n_u64_24(row2l); row2h = vrorq_n_u64_24(row2h); \ - } while(0) - - #define BLAKE2B_G2(row1l,row2l,row3l,row4l,row1h,row2h,row3h,row4h,b0,b1) \ - do { \ - row1l = vaddq_u64(vaddq_u64(row1l, b0), row2l); \ - row1h = vaddq_u64(vaddq_u64(row1h, b1), row2h); \ - row4l = veorq_u64(row4l, row1l); row4h = veorq_u64(row4h, row1h); \ - row4l = vrorq_n_u64_16(row4l); row4h = vrorq_n_u64_16(row4h); \ - row3l = vaddq_u64(row3l, row4l); row3h = vaddq_u64(row3h, row4h); \ - row2l = veorq_u64(row2l, row3l); row2h = veorq_u64(row2h, row3h); \ - row2l = vrorq_n_u64_63(row2l); row2h = vrorq_n_u64_63(row2h); \ - } while(0) - - #define BLAKE2B_DIAGONALIZE(row1l,row2l,row3l,row4l,row1h,row2h,row3h,row4h) \ - do { \ - uint64x2_t t0 = vextq_u64(row2l, row2h, 1); \ - uint64x2_t t1 = vextq_u64(row2h, row2l, 1); \ - row2l = t0; row2h = t1; t0 = row3l; row3l = row3h; row3h = t0; \ - t0 = vextq_u64(row4h, row4l, 1); t1 = vextq_u64(row4l, row4h, 1); \ - row4l = t0; row4h = t1; \ - } while(0) - - #define BLAKE2B_UNDIAGONALIZE(row1l,row2l,row3l,row4l,row1h,row2h,row3h,row4h) \ - do { \ - uint64x2_t t0 = vextq_u64(row2h, row2l, 1); \ - uint64x2_t t1 = vextq_u64(row2l, row2h, 1); \ - row2l = t0; row2h = t1; t0 = row3l; row3l = row3h; row3h = t0; \ - t0 = vextq_u64(row4l, row4h, 1); t1 = vextq_u64(row4h, row4l, 1); \ - row4l = t0; row4h = t1; \ - } while(0) - - #define BLAKE2B_ROUND(r) \ - do { \ - uint64x2_t b0, b1; \ - BLAKE2B_LOAD_MSG_ ##r ##_1(b0, b1); \ - BLAKE2B_G1(row1l,row2l,row3l,row4l,row1h,row2h,row3h,row4h,b0,b1); \ - BLAKE2B_LOAD_MSG_ ##r ##_2(b0, b1); \ - BLAKE2B_G2(row1l,row2l,row3l,row4l,row1h,row2h,row3h,row4h,b0,b1); \ - BLAKE2B_DIAGONALIZE(row1l,row2l,row3l,row4l,row1h,row2h,row3h,row4h); \ - BLAKE2B_LOAD_MSG_ ##r ##_3(b0, b1); \ - BLAKE2B_G1(row1l,row2l,row3l,row4l,row1h,row2h,row3h,row4h,b0,b1); \ - BLAKE2B_LOAD_MSG_ ##r ##_4(b0, b1); \ - BLAKE2B_G2(row1l,row2l,row3l,row4l,row1h,row2h,row3h,row4h,b0,b1); \ - BLAKE2B_UNDIAGONALIZE(row1l,row2l,row3l,row4l,row1h,row2h,row3h,row4h); \ - } while(0) - - CRYPTOPP_ASSERT(IsAlignedOn(&state.h[0],GetAlignmentOf())); - CRYPTOPP_ASSERT(IsAlignedOn(&state.t[0],GetAlignmentOf())); - CRYPTOPP_ASSERT(IsAlignedOn(&state.f[0],GetAlignmentOf())); - - const uint64x2_t m0 = vreinterpretq_u64_u8(vld1q_u8(input + 00)); - const uint64x2_t m1 = vreinterpretq_u64_u8(vld1q_u8(input + 16)); - const uint64x2_t m2 = vreinterpretq_u64_u8(vld1q_u8(input + 32)); - const uint64x2_t m3 = vreinterpretq_u64_u8(vld1q_u8(input + 48)); - const uint64x2_t m4 = vreinterpretq_u64_u8(vld1q_u8(input + 64)); - const uint64x2_t m5 = vreinterpretq_u64_u8(vld1q_u8(input + 80)); - const uint64x2_t m6 = vreinterpretq_u64_u8(vld1q_u8(input + 96)); - const uint64x2_t m7 = vreinterpretq_u64_u8(vld1q_u8(input + 112)); - - uint64x2_t row1l, row1h, row2l, row2h; - uint64x2_t row3l, row3h, row4l, row4h; - - const uint64x2_t h0 = row1l = vld1q_u64(&state.h[0]); - const uint64x2_t h1 = row1h = vld1q_u64(&state.h[2]); - const uint64x2_t h2 = row2l = vld1q_u64(&state.h[4]); - const uint64x2_t h3 = row2h = vld1q_u64(&state.h[6]); - - row3l = 
vld1q_u64(&BLAKE2B_IV(0)); - row3h = vld1q_u64(&BLAKE2B_IV(2)); - row4l = veorq_u64(vld1q_u64(&BLAKE2B_IV(4)), vld1q_u64(&state.t[0])); - row4h = veorq_u64(vld1q_u64(&BLAKE2B_IV(6)), vld1q_u64(&state.f[0])); - - BLAKE2B_ROUND(0); - BLAKE2B_ROUND(1); - BLAKE2B_ROUND(2); - BLAKE2B_ROUND(3); - BLAKE2B_ROUND(4); - BLAKE2B_ROUND(5); - BLAKE2B_ROUND(6); - BLAKE2B_ROUND(7); - BLAKE2B_ROUND(8); - BLAKE2B_ROUND(9); - BLAKE2B_ROUND(10); - BLAKE2B_ROUND(11); - - vst1q_u64(&state.h[0], veorq_u64(h0, veorq_u64(row1l, row3l))); - vst1q_u64(&state.h[2], veorq_u64(h1, veorq_u64(row1h, row3h))); - vst1q_u64(&state.h[4], veorq_u64(h2, veorq_u64(row2l, row4l))); - vst1q_u64(&state.h[6], veorq_u64(h3, veorq_u64(row2h, row4h))); -} -#endif // CRYPTOPP_BOOL_ARM32 && CRYPTOPP_BOOL_NEON_INTRINSICS_AVAILABLE - template class BLAKE2_Base; template class BLAKE2_Base; diff --git a/config.h b/config.h index 0b48e8a61..e063108f9 100644 --- a/config.h +++ b/config.h @@ -80,8 +80,15 @@ // #endif // File system code to write to GZIP archive. +// http://www.gzip.org/format.txt #if !defined(GZIP_OS_CODE) -# define GZIP_OS_CODE 0 +# if defined(__macintosh__) +# define GZIP_OS_CODE 7 +# elif defined(__unix__) || defined(__linux__) +# define GZIP_OS_CODE 3 +# else +# define GZIP_OS_CODE 0 +# endif #endif // Try this if your CPU has 256K internal cache or a slow multiply instruction @@ -386,7 +393,62 @@ NAMESPACE_END #define CRYPTOPP_UNCAUGHT_EXCEPTION_AVAILABLE #endif -// Apple's Clang prior to 5.0 cannot handle SSE2 (and Apple does not use LLVM Clang numbering...) +// ***************** Platform and CPU features ******************** + +// Linux provides X32, which is 32-bit integers, longs and pointers on x86_64 using the full x86_64 register set. +// Detect via __ILP32__ (http://wiki.debian.org/X32Port). However, __ILP32__ shows up in more places than +// the System V ABI specs calls out, like on some Solaris installations and just about any 32-bit system with Clang. +#if (defined(__ILP32__) || defined(_ILP32)) && defined(__x86_64__) + #define CRYPTOPP_BOOL_X32 1 +#else + #define CRYPTOPP_BOOL_X32 0 +#endif + +// see http://predef.sourceforge.net/prearch.html +#if (defined(_M_IX86) || defined(__i386__) || defined(__i386) || defined(_X86_) || defined(__I86__) || defined(__INTEL__)) && !CRYPTOPP_BOOL_X32 + #define CRYPTOPP_BOOL_X86 1 +#else + #define CRYPTOPP_BOOL_X86 0 +#endif + +#if (defined(_M_X64) || defined(__x86_64__)) && !CRYPTOPP_BOOL_X32 + #define CRYPTOPP_BOOL_X64 1 +#else + #define CRYPTOPP_BOOL_X64 0 +#endif + +// Undo the ASM and Intrinsic related defines due to X32. +#if CRYPTOPP_BOOL_X32 +# undef CRYPTOPP_BOOL_X64 +# undef CRYPTOPP_X64_ASM_AVAILABLE +# undef CRYPTOPP_X64_MASM_AVAILABLE +#endif + +#if defined(__arm__) || defined(__aarch32__) || defined(_M_ARM) + #define CRYPTOPP_BOOL_ARM32 1 +#else + #define CRYPTOPP_BOOL_ARM32 0 +#endif + +// Microsoft plans to support ARM-64, but its not clear how to detect it. 
+// TODO: Add MSC_VER and ARM-64 platform define when available +#if defined(__arm64__) || defined(__aarch64__) || defined(_M_ARM64) + #define CRYPTOPP_BOOL_ARM64 1 +#else + #define CRYPTOPP_BOOL_ARM64 0 +#endif + +#if defined(_MSC_VER) || defined(__BORLANDC__) +# define CRYPTOPP_MS_STYLE_INLINE_ASSEMBLY 1 +#else +# define CRYPTOPP_GNU_STYLE_INLINE_ASSEMBLY 1 +#endif + +// ***************** IA32 CPU features ******************** + +#if (CRYPTOPP_BOOL_X86 || CRYPTOPP_BOOL_X32 || CRYPTOPP_BOOL_X64) + +// Apple Clang prior to 5.0 cannot handle SSE2 #if defined(CRYPTOPP_APPLE_CLANG_VERSION) && (CRYPTOPP_APPLE_CLANG_VERSION < 50000) # define CRYPTOPP_DISABLE_ASM #endif @@ -403,63 +465,80 @@ NAMESPACE_END #if !defined(CRYPTOPP_DISABLE_SSE2) && (defined(_MSC_VER) || CRYPTOPP_GCC_VERSION >= 30300 || defined(__SSE2__)) #define CRYPTOPP_BOOL_SSE2_ASM_AVAILABLE 1 - #else - #define CRYPTOPP_BOOL_SSE2_ASM_AVAILABLE 0 #endif - #if !defined(CRYPTOPP_DISABLE_SSSE3) && (_MSC_VER >= 1500 || (defined(__SSE3__) && defined(__SSSE3__))) + #if !defined(CRYPTOPP_DISABLE_SSSE3) && (_MSC_VER >= 1500 || defined(__SSSE3__)) #define CRYPTOPP_BOOL_SSSE3_ASM_AVAILABLE 1 - #else - #define CRYPTOPP_BOOL_SSSE3_ASM_AVAILABLE 0 #endif #endif #if !defined(CRYPTOPP_DISABLE_ASM) && defined(_MSC_VER) && defined(_M_X64) - #define CRYPTOPP_X64_MASM_AVAILABLE + #define CRYPTOPP_X64_MASM_AVAILABLE 1 #endif #if !defined(CRYPTOPP_DISABLE_ASM) && defined(__GNUC__) && defined(__x86_64__) - #define CRYPTOPP_X64_ASM_AVAILABLE + #define CRYPTOPP_X64_ASM_AVAILABLE 1 #endif -#if !defined(CRYPTOPP_DISABLE_ASM) && (defined(_MSC_VER) || defined(__SSE2__)) && !defined(_M_ARM) - #define CRYPTOPP_BOOL_SSE2_INTRINSICS_AVAILABLE 1 -#else - #define CRYPTOPP_BOOL_SSE2_INTRINSICS_AVAILABLE 0 +#if !defined(CRYPTOPP_DISABLE_ASM) && (defined(_MSC_VER) || defined(__SSE2__)) + #define CRYPTOPP_SSE2_AVAILABLE 1 #endif -#if !defined(CRYPTOPP_DISABLE_ASM) && !defined(CRYPTOPP_DISABLE_SSSE3) && !defined(_M_ARM) && (_MSC_VER >= 1500 || (defined(__SSSE3__) && defined(__SSSE3__))) - #define CRYPTOPP_BOOL_SSSE3_INTRINSICS_AVAILABLE 1 -#else - #define CRYPTOPP_BOOL_SSSE3_INTRINSICS_AVAILABLE 0 +#if !defined(CRYPTOPP_DISABLE_ASM) && !defined(CRYPTOPP_DISABLE_SSSE3) +# if defined(__SSSE3__) || (_MSC_VER >= 1500) || (CRYPTOPP_GCC_VERSION >= 40300) + #define CRYPTOPP_SSSE3_AVAILABLE 1 +# endif #endif // Intrinsics availible in GCC 4.3 (http://gcc.gnu.org/gcc-4.3/changes.html) and // MSVC 2008 (http://msdn.microsoft.com/en-us/library/bb892950%28v=vs.90%29.aspx) // SunCC could generate SSE4 at 12.1, but the intrinsics are missing until 12.4. -#if !defined(CRYPTOPP_DISABLE_ASM) && !defined(CRYPTOPP_DISABLE_SSE4) && !defined(_M_ARM) && ((_MSC_VER >= 1500) || (defined(__SSE4_1__) && defined(__SSE4_2__))) - #define CRYPTOPP_BOOL_SSE4_INTRINSICS_AVAILABLE 1 -#else - #define CRYPTOPP_BOOL_SSE4_INTRINSICS_AVAILABLE 0 +#if !defined(CRYPTOPP_DISABLE_SSE4) && defined(CRYPTOPP_SSSE3_AVAILABLE) && \ + (defined(__SSE4_1__) || (CRYPTOPP_MSC_VERSION >= 1500) || \ + (CRYPTOPP_GCC_VERSION >= 40300) || (__INTEL_COMPILER >= 1000) || \ + (CRYPTOPP_LLVM_CLANG_VERSION >= 20300) || (CRYPTOPP_APPLE_CLANG_VERSION >= 40000)) + #define CRYPTOPP_SSE41_AVAILABLE 1 #endif -// Don't disgorge AES-NI from CLMUL. 
There will be two to four subtle breaks -#if !defined(CRYPTOPP_DISABLE_ASM) && !defined(CRYPTOPP_DISABLE_AESNI) && !defined(_M_ARM) && (_MSC_FULL_VER >= 150030729 || __INTEL_COMPILER >= 1110 || (defined(__AES__) && defined(__PCLMUL__))) - #define CRYPTOPP_BOOL_AESNI_INTRINSICS_AVAILABLE 1 -#else - #define CRYPTOPP_BOOL_AESNI_INTRINSICS_AVAILABLE 0 +#if !defined(CRYPTOPP_DISABLE_SSE4) && defined(CRYPTOPP_SSSE3_AVAILABLE) && \ + (defined(__SSE4_2__) || (CRYPTOPP_MSC_VERSION >= 1500) || \ + (CRYPTOPP_GCC_VERSION >= 40300) || (__INTEL_COMPILER >= 1000) || \ + (CRYPTOPP_LLVM_CLANG_VERSION >= 20300) || (CRYPTOPP_APPLE_CLANG_VERSION >= 40000)) + #define CRYPTOPP_SSE42_AVAILABLE 1 #endif -#if !defined(CRYPTOPP_DISABLE_ASM) && !defined(CRYPTOPP_DISABLE_SHA) && !defined(_M_ARM) && ((_MSC_VER >= 1900) || defined(__SHA__)) - #define CRYPTOPP_BOOL_SSE_SHA_INTRINSICS_AVAILABLE 1 -#else - #define CRYPTOPP_BOOL_SSE_SHA_INTRINSICS_AVAILABLE 0 +#if !defined(CRYPTOPP_DISABLE_ASM) && !defined(CRYPTOPP_DISABLE_CLMUL) && \ + (defined(__PCLMUL__) || (_MSC_FULL_VER >= 150030729) || \ + (CRYPTOPP_GCC_VERSION >= 40300) || (__INTEL_COMPILER >= 1110) || \ + (CRYPTOPP_LLVM_CLANG_VERSION >= 30200) || (CRYPTOPP_APPLE_CLANG_VERSION >= 40300)) + #define CRYPTOPP_CLMUL_AVAILABLE 1 +#endif + +#if !defined(CRYPTOPP_DISABLE_SSE4) && defined(CRYPTOPP_SSSE3_AVAILABLE) && \ + (defined(__AES__) || (_MSC_FULL_VER >= 150030729) || \ + (CRYPTOPP_GCC_VERSION >= 40300) || (__INTEL_COMPILER >= 1110) || \ + (CRYPTOPP_LLVM_CLANG_VERSION >= 30200) || (CRYPTOPP_APPLE_CLANG_VERSION >= 40300)) + #define CRYPTOPP_AESNI_AVAILABLE 1 #endif +#if !defined(CRYPTOPP_DISABLE_ASM) && !defined(CRYPTOPP_DISABLE_SHA) && \ + (defined(__SHA__) || (CRYPTOPP_MSC_VERSION >= 1900) || \ + (CRYPTOPP_GCC_VERSION >= 40900) || (__INTEL_COMPILER >= 1300) || \ + (CRYPTOPP_LLVM_CLANG_VERSION >= 30400) || (CRYPTOPP_APPLE_CLANG_VERSION >= 50100)) + #define CRYPTOPP_SHANI_AVAILABLE 1 +#endif + +#endif // X86, X32, X64 + +// ***************** ARM CPU features ******************** + +#if (CRYPTOPP_BOOL_ARM32 || CRYPTOPP_BOOL_ARM64) + // Requires ARMv7 and ACLE 1.0. Testing shows ARMv7 is really ARMv7a under most toolchains. -#if !defined(CRYPTOPP_BOOL_NEON_INTRINSICS_AVAILABLE) && !defined(CRYPTOPP_DISABLE_ASM) -# if defined(__ARM_NEON__) || defined(__ARM_NEON) || defined(__aarch32__) || defined(__aarch64__) || defined(_M_ARM) -# define CRYPTOPP_BOOL_NEON_INTRINSICS_AVAILABLE 1 +#if !defined(CRYPTOPP_ARM_NEON_AVAILABLE) && !defined(CRYPTOPP_DISABLE_ASM) +# if defined(__ARM_NEON__) || defined(__ARM_FEATURE_NEON) || (CRYPTOPP_MSC_VERSION >= 1700) || \ + (CRYPTOPP_GCC_VERSION >= 40800) || (CRYPTOPP_LLVM_CLANG_VERSION >= 30500) +# define CRYPTOPP_ARM_NEON_AVAILABLE 1 # endif #endif @@ -467,21 +546,21 @@ NAMESPACE_END // LLVM Clang requires 3.5. Apple Clang is unknown at the moment. // Microsoft plans to support ARM-64, but its not clear how to detect it. // TODO: Add MSC_VER and ARM-64 platform define when available -#if !defined(CRYPTOPP_BOOL_ARM_CRC32_INTRINSICS_AVAILABLE) && !defined(CRYPTOPP_DISABLE_ASM) -# if defined(__ARM_FEATURE_CRC32) -# define CRYPTOPP_BOOL_ARM_CRC32_INTRINSICS_AVAILABLE 1 +#if !defined(CRYPTOPP_ARM_CRC32_AVAILABLE) && !defined(CRYPTOPP_DISABLE_ASM) && !defined(__apple_build_version__) +# if defined(__ARM_FEATURE_CRC32) || (CRYPTOPP_MSC_VERSION >= 1910) || \ + defined(__aarch32__) || defined(__aarch64__) +# define CRYPTOPP_ARM_CRC32_AVAILABLE 1 # endif #endif -// Requires ARMv8, ACLE 2.0 and Aarch64. GCC requires 4.8 and above. 
-// LLVM Clang requires 3.5. Apple Clang does not support it at the moment. +// Requires ARMv8 and ACLE 2.0. GCC requires 4.8 and above. +// LLVM Clang requires 3.5. Apple Clang is unknown at the moment. // Microsoft plans to support ARM-64, but its not clear how to detect it. // TODO: Add MSC_VER and ARM-64 platform define when available -#if !defined(CRYPTOPP_BOOL_ARM_PMULL_AVAILABLE) && !defined(CRYPTOPP_DISABLE_ASM) -# if defined(__ARM_FEATURE_CRYPTO) && !defined(__apple_build_version__) -# if defined(__arm64__) || defined(__aarch64__) -# define CRYPTOPP_BOOL_ARM_PMULL_AVAILABLE 1 -# endif +#if !defined(CRYPTOPP_ARM_PMULL_AVAILABLE) && !defined(CRYPTOPP_DISABLE_ASM) && !defined(__apple_build_version__) +# if defined(__ARM_FEATURE_CRYPTO) || (CRYPTOPP_MSC_VERSION >= 1910) || \ + defined(__aarch32__) || defined(__aarch64__) +# define CRYPTOPP_ARM_PMULL_AVAILABLE 1 # endif #endif @@ -489,13 +568,20 @@ NAMESPACE_END // LLVM Clang requires 3.5. Apple Clang is unknown at the moment. // Microsoft plans to support ARM-64, but its not clear how to detect it. // TODO: Add MSC_VER and ARM-64 platform define when available -#if !defined(CRYPTOPP_BOOL_ARM_CRYPTO_INTRINSICS_AVAILABLE) && !defined(CRYPTOPP_DISABLE_ASM) -# if defined(__ARM_FEATURE_CRYPTO) || (CRYPTOPP_MSC_VERSION >= 1910) -# define CRYPTOPP_BOOL_ARM_CRYPTO_INTRINSICS_AVAILABLE 1 +#if !defined(CRYPTOPP_ARM_CRYPTO_AVAILABLE) && !defined(CRYPTOPP_DISABLE_ASM) +# if defined(__ARM_FEATURE_CRYPTO) || (CRYPTOPP_MSC_VERSION >= 1910) || \ + defined(__aarch32__) || defined(__aarch64__) +# define CRYPTOPP_ARM_AES_AVAILABLE 1 +# define CRYPTOPP_ARM_SHA_AVAILABLE 1 +# define CRYPTOPP_ARM_CRYPTO_AVAILABLE 1 # endif #endif -#if CRYPTOPP_BOOL_SSE2_INTRINSICS_AVAILABLE || CRYPTOPP_BOOL_SSE2_ASM_AVAILABLE || CRYPTOPP_BOOL_NEON_INTRINSICS_AVAILABLE || defined(CRYPTOPP_X64_MASM_AVAILABLE) +#endif // ARM32, ARM64 + +// ***************** Miscellaneous ******************** + +#if CRYPTOPP_SSE2_AVAILABLE || CRYPTOPP_BOOL_SSE2_ASM_AVAILABLE || CRYPTOPP_ARM_NEON_AVAILABLE || defined(CRYPTOPP_X64_MASM_AVAILABLE) #define CRYPTOPP_BOOL_ALIGN16 1 #else #define CRYPTOPP_BOOL_ALIGN16 0 @@ -536,49 +622,6 @@ NAMESPACE_END # define CRYPTOPP_CONSTANT(x) enum {x}; #endif -// Linux provides X32, which is 32-bit integers, longs and pointers on x86_64 using the full x86_64 register set. -// Detect via __ILP32__ (http://wiki.debian.org/X32Port). However, __ILP32__ shows up in more places than -// the System V ABI specs calls out, like on some Solaris installations and just about any 32-bit system with Clang. -#if (defined(__ILP32__) || defined(_ILP32)) && defined(__x86_64__) - #define CRYPTOPP_BOOL_X32 1 -#else - #define CRYPTOPP_BOOL_X32 0 -#endif - -// see http://predef.sourceforge.net/prearch.html -#if (defined(_M_IX86) || defined(__i386__) || defined(__i386) || defined(_X86_) || defined(__I86__) || defined(__INTEL__)) && !CRYPTOPP_BOOL_X32 - #define CRYPTOPP_BOOL_X86 1 -#else - #define CRYPTOPP_BOOL_X86 0 -#endif - -#if (defined(_M_X64) || defined(__x86_64__)) && !CRYPTOPP_BOOL_X32 - #define CRYPTOPP_BOOL_X64 1 -#else - #define CRYPTOPP_BOOL_X64 0 -#endif - -// Undo the ASM and Intrinsic related defines due to X32. -#if CRYPTOPP_BOOL_X32 -# undef CRYPTOPP_BOOL_X64 -# undef CRYPTOPP_X64_ASM_AVAILABLE -# undef CRYPTOPP_X64_MASM_AVAILABLE -#endif - -#if defined(__arm__) || defined(__aarch32__) || defined(_M_ARM) - #define CRYPTOPP_BOOL_ARM32 1 -#else - #define CRYPTOPP_BOOL_ARM32 0 -#endif - -// Microsoft plans to support ARM-64, but its not clear how to detect it. 
-// TODO: Add MSC_VER and ARM-64 platform define when available -#if defined(__arm64__) || defined(__aarch64__) || defined(_M_ARM64) - #define CRYPTOPP_BOOL_ARM64 1 -#else - #define CRYPTOPP_BOOL_ARM64 0 -#endif - // ***************** Initialization and Constructor priorities ******************** // CRYPTOPP_INIT_PRIORITY attempts to manage initialization of C++ static objects. diff --git a/cpu.cpp b/cpu.cpp index dcd94dd46..609bc303f 100644 --- a/cpu.cpp +++ b/cpu.cpp @@ -13,13 +13,9 @@ #include "misc.h" #include "stdcpp.h" -#if CRYPTOPP_BOOL_NEON_INTRINSICS_AVAILABLE -# include -#endif - -#ifndef CRYPTOPP_MS_STYLE_INLINE_ASSEMBLY -#include -#include +#ifdef CRYPTOPP_GNU_STYLE_INLINE_ASSEMBLY +# include +# include #endif NAMESPACE_BEGIN(CryptoPP) @@ -30,13 +26,26 @@ extern "C" { }; #endif // Not CRYPTOPP_MS_STYLE_INLINE_ASSEMBLY +// ***************** IA-32 CPUs ******************** + #ifdef CRYPTOPP_CPUID_AVAILABLE -#if _MSC_VER >= 1400 && CRYPTOPP_BOOL_X64 +#if _MSC_VER >= 1500 -bool CpuId(word32 input, word32 output[4]) +inline bool CpuId(word32 func, word32 subfunc, word32 output[4]) { - __cpuid((int *)output, input); + __cpuidex((int *)output, func, subfunc); + return true; +} + +#elif _MSC_VER >= 1400 && CRYPTOPP_BOOL_X64 + +inline bool CpuId(word32 func, word32 subfunc, word32 output[4]) +{ + if (subfunc != 0) + return false; + + __cpuid((int *)output, func); return true; } @@ -59,15 +68,15 @@ extern "C" } #endif -bool CpuId(word32 input, word32 output[4]) +inline bool CpuId(word32 func, word32 subfunc, word32 output[4]) { #if defined(CRYPTOPP_MS_STYLE_INLINE_ASSEMBLY) __try { __asm { - mov eax, input - mov ecx, 0 + mov eax, func + mov ecx, subfunc cpuid mov edi, output mov [edi], eax @@ -116,7 +125,7 @@ bool CpuId(word32 input, word32 output[4]) "push %%ebx; cpuid; mov %%ebx, %%edi; pop %%ebx" # endif : "=a" (output[0]), "=D" (output[1]), "=c" (output[2]), "=d" (output[3]) - : "a" (input), "c" (0) + : "a" (func), "c" (subfunc) : "cc" ); } @@ -132,7 +141,7 @@ bool CpuId(word32 input, word32 output[4]) #endif -static bool TrySSE2() +static bool CPU_ProbeSSE2() { #if CRYPTOPP_BOOL_X64 return true; @@ -141,7 +150,7 @@ static bool TrySSE2() { #if CRYPTOPP_BOOL_SSE2_ASM_AVAILABLE AS2(por xmm0, xmm0) // executing SSE2 instruction -#elif CRYPTOPP_BOOL_SSE2_INTRINSICS_AVAILABLE +#elif CRYPTOPP_SSE2_AVAILABLE __m128i x = _mm_setzero_si128(); return _mm_cvtsi128_si32(x) == 0; #endif @@ -173,7 +182,7 @@ static bool TrySSE2() { #if CRYPTOPP_BOOL_SSE2_ASM_AVAILABLE __asm __volatile ("por %xmm0, %xmm0"); -#elif CRYPTOPP_BOOL_SSE2_INTRINSICS_AVAILABLE +#elif CRYPTOPP_SSE2_AVAILABLE __m128i x = _mm_setzero_si128(); result = _mm_cvtsi128_si32(x) == 0; #endif @@ -189,8 +198,9 @@ static bool TrySSE2() } bool CRYPTOPP_SECTION_INIT g_x86DetectionDone = false; -bool CRYPTOPP_SECTION_INIT g_hasMMX = false, CRYPTOPP_SECTION_INIT g_hasISSE = false, CRYPTOPP_SECTION_INIT g_hasSSE2 = false, CRYPTOPP_SECTION_INIT g_hasSSSE3 = false; -bool CRYPTOPP_SECTION_INIT g_hasSSE4 = false, CRYPTOPP_SECTION_INIT g_hasAESNI = false, CRYPTOPP_SECTION_INIT g_hasCLMUL = false, CRYPTOPP_SECTION_INIT g_hasSHA = false; +bool CRYPTOPP_SECTION_INIT CRYPTOPP_SECTION_INIT g_hasSSE2 = false, CRYPTOPP_SECTION_INIT g_hasSSSE3 = false; +bool CRYPTOPP_SECTION_INIT g_hasSSE41 = false, CRYPTOPP_SECTION_INIT g_hasSSE42 = false; +bool CRYPTOPP_SECTION_INIT g_hasAESNI = false, CRYPTOPP_SECTION_INIT g_hasCLMUL = false, CRYPTOPP_SECTION_INIT g_hasSHA = false; bool CRYPTOPP_SECTION_INIT g_hasRDRAND = false, CRYPTOPP_SECTION_INIT g_hasRDSEED = 
false, CRYPTOPP_SECTION_INIT g_isP4 = false; bool CRYPTOPP_SECTION_INIT g_hasPadlockRNG = false, CRYPTOPP_SECTION_INIT g_hasPadlockACE = false, CRYPTOPP_SECTION_INIT g_hasPadlockACE2 = false; bool CRYPTOPP_SECTION_INIT g_hasPadlockPHE = false, CRYPTOPP_SECTION_INIT g_hasPadlockPMM = false; @@ -224,31 +234,19 @@ void DetectX86Features() { // Coverity finding CID 171239... word32 cpuid0[4]={0}, cpuid1[4]={0}, cpuid2[4]={0}; - if (!CpuId(0, cpuid0)) + if (!CpuId(0, 0, cpuid0)) return; - if (!CpuId(1, cpuid1)) + if (!CpuId(1, 0, cpuid1)) return; - g_hasMMX = (cpuid1[3] & (1 << 23)) != 0; if ((cpuid1[3] & (1 << 26)) != 0) - g_hasSSE2 = TrySSE2(); + g_hasSSE2 = CPU_ProbeSSE2(); g_hasSSSE3 = g_hasSSE2 && (cpuid1[2] & (1<<9)); - g_hasSSE4 = g_hasSSE2 && ((cpuid1[2] & (1<<19)) && (cpuid1[2] & (1<<20))); + g_hasSSE41 = g_hasSSE2 && (cpuid1[2] & (1<<19)); + g_hasSSE42 = g_hasSSE2 && (cpuid1[2] & (1<<20)); g_hasAESNI = g_hasSSE2 && (cpuid1[2] & (1<<25)); g_hasCLMUL = g_hasSSE2 && (cpuid1[2] & (1<<1)); - if ((cpuid1[3] & (1 << 25)) != 0) - g_hasISSE = true; - else - { - CpuId(0x080000000, cpuid2); - if (cpuid2[0] >= 0x080000001) - { - CpuId(0x080000001, cpuid2); - g_hasISSE = (cpuid2[3] & (1 << 22)) != 0; - } - } - if (IsIntel(cpuid0)) { enum { RDRAND_FLAG = (1 << 30) }; @@ -259,9 +257,9 @@ void DetectX86Features() g_cacheLineSize = 8 * GETBYTE(cpuid1[1], 1); g_hasRDRAND = !!(cpuid1[2] /*ECX*/ & RDRAND_FLAG); - if (cpuid0[0] /*EAX*/ >= 7) + if (cpuid1[0] /*EAX*/ >= 7) { - if (CpuId(7, cpuid2)) + if (CpuId(7, 0, cpuid2)) { g_hasRDSEED = !!(cpuid2[1] /*EBX*/ & RDSEED_FLAG); g_hasSHA = !!(cpuid2[1] /*EBX*/ & SHA_FLAG); @@ -274,13 +272,13 @@ void DetectX86Features() enum { RDSEED_FLAG = (1 << 18) }; enum { SHA_FLAG = (1 << 29) }; - CpuId(0x80000005, cpuid2); + CpuId(0x80000005, 0, cpuid2); g_cacheLineSize = GETBYTE(cpuid2[2], 0); g_hasRDRAND = !!(cpuid1[2] /*ECX*/ & RDRAND_FLAG); - if (cpuid0[0] /*EAX*/ >= 7) + if (cpuid1[0] /*EAX*/ >= 7) { - if (CpuId(7, cpuid2)) + if (CpuId(7, 0, cpuid2)) { g_hasRDSEED = !!(cpuid2[1] /*EBX*/ & RDSEED_FLAG); g_hasSHA = !!(cpuid2[1] /*EBX*/ & SHA_FLAG); @@ -295,16 +293,16 @@ void DetectX86Features() enum { PHE_FLAGS = (0x3 << 10) }; enum { PMM_FLAGS = (0x3 << 12) }; - CpuId(0xC0000000, cpuid0); - if (cpuid0[0] >= 0xC0000001) + CpuId(0xC0000000, 0, cpuid2); + if (cpuid2[0] >= 0xC0000001) { // Extended features available - CpuId(0xC0000001, cpuid0); - g_hasPadlockRNG = !!(cpuid0[3] /*EDX*/ & RNG_FLAGS); - g_hasPadlockACE = !!(cpuid0[3] /*EDX*/ & ACE_FLAGS); - g_hasPadlockACE2 = !!(cpuid0[3] /*EDX*/ & ACE2_FLAGS); - g_hasPadlockPHE = !!(cpuid0[3] /*EDX*/ & PHE_FLAGS); - g_hasPadlockPMM = !!(cpuid0[3] /*EDX*/ & PMM_FLAGS); + CpuId(0xC0000001, 0, cpuid2); + g_hasPadlockRNG = !!(cpuid2[3] /*EDX*/ & RNG_FLAGS); + g_hasPadlockACE = !!(cpuid2[3] /*EDX*/ & ACE_FLAGS); + g_hasPadlockACE2 = !!(cpuid2[3] /*EDX*/ & ACE2_FLAGS); + g_hasPadlockPHE = !!(cpuid2[3] /*EDX*/ & PHE_FLAGS); + g_hasPadlockPMM = !!(cpuid2[3] /*EDX*/ & PMM_FLAGS); } } @@ -314,436 +312,196 @@ void DetectX86Features() g_x86DetectionDone = true; } +// ***************** ARM-32, Aarch32 and Aarch64 CPUs ******************** + #elif (CRYPTOPP_BOOL_ARM32 || CRYPTOPP_BOOL_ARM64) -// The ARM equivalent of CPUID probing is reading a MSR. The code requires Exception Level 1 (EL1) and above, but user space runs at EL0. -// Attempting to run the code results in a SIGILL and termination. 
-// -// #if defined(__arm64__) || defined(__aarch64__) -// word64 caps = 0; // Read ID_AA64ISAR0_EL1 -// __asm __volatile("mrs %0, " "id_aa64isar0_el1" : "=r" (caps)); -// #elif defined(__arm__) || defined(__aarch32__) -// word32 caps = 0; // Read ID_ISAR5_EL1 -// __asm __volatile("mrs %0, " "id_isar5_el1" : "=r" (caps)); -// #endif -// -// The following does not work well either. Its appears to be missing constants, and it does not detect Aarch32 execution environments on Aarch64 -// http://community.arm.com/groups/android-community/blog/2014/10/10/runtime-detection-of-cpu-features-on-an-armv8-a-cpu -// +#if defined(__linux__) +# include +# ifndef HWCAP_ASIMD +# define HWCAP_ASIMD (1 << 1) +# endif +# ifndef HWCAP_ARM_NEON +# define HWCAP_ARM_NEON 4096 +# endif +# ifndef HWCAP_CRC32 +# define HWCAP_CRC32 (1 << 7) +# endif +# ifndef HWCAP2_CRC32 +# define HWCAP2_CRC32 (1 << 4) +# endif +# ifndef HWCAP_PMULL +# define HWCAP_PMULL (1 << 4) +# endif +# ifndef HWCAP2_PMULL +# define HWCAP2_PMULL (1 << 1) +# endif +# ifndef HWCAP_AES +# define HWCAP_AES (1 << 3) +# endif +# ifndef HWCAP2_AES +# define HWCAP2_AES (1 << 0) +# endif +# ifndef HWCAP_SHA1 +# define HWCAP_SHA1 (1 << 5) +# endif +# ifndef HWCAP_SHA2 +# define HWCAP_SHA2 (1 << 6) +# endif +# ifndef HWCAP2_SHA1 +# define HWCAP2_SHA1 (1 << 2) +# endif +# ifndef HWCAP2_SHA2 +# define HWCAP2_SHA2 (1 << 3) +# endif +#endif + +#if defined(__APPLE__) && defined(__aarch64__) +# include +#endif + bool CRYPTOPP_SECTION_INIT g_ArmDetectionDone = false; bool CRYPTOPP_SECTION_INIT g_hasNEON = false, CRYPTOPP_SECTION_INIT g_hasPMULL = false, CRYPTOPP_SECTION_INIT g_hasCRC32 = false; bool CRYPTOPP_SECTION_INIT g_hasAES = false, CRYPTOPP_SECTION_INIT g_hasSHA1 = false, CRYPTOPP_SECTION_INIT g_hasSHA2 = false; word32 CRYPTOPP_SECTION_INIT g_cacheLineSize = CRYPTOPP_L1_CACHE_LINE_SIZE; -#ifndef CRYPTOPP_MS_STYLE_INLINE_ASSEMBLY -extern "C" -{ - static jmp_buf s_jmpNoNEON; - static void SigIllHandlerNEON(int) - { - longjmp(s_jmpNoNEON, 1); - } - - static jmp_buf s_jmpNoPMULL; - static void SigIllHandlerPMULL(int) - { - longjmp(s_jmpNoPMULL, 1); - } - - static jmp_buf s_jmpNoCRC32; - static void SigIllHandlerCRC32(int) - { - longjmp(s_jmpNoCRC32, 1); - } - - static jmp_buf s_jmpNoAES; - static void SigIllHandlerAES(int) - { - longjmp(s_jmpNoAES, 1); - } - - static jmp_buf s_jmpNoSHA1; - static void SigIllHandlerSHA1(int) - { - longjmp(s_jmpNoSHA1, 1); - } - - static jmp_buf s_jmpNoSHA2; - static void SigIllHandlerSHA2(int) - { - longjmp(s_jmpNoSHA2, 1); - } -}; -#endif // Not CRYPTOPP_MS_STYLE_INLINE_ASSEMBLY - -static bool TryNEON() +// ARM does not have an unprivliged equivalent to CPUID on IA-32. We have to jump through some +// hoops to detect features on a wide array of platforms. Our strategy is two part. First, +// attempt to *Query* the OS for a feature, like using getauxval on Linux. If that fails, +// then *Probe* the cpu executing an instruction and an observe a SIGILL if unsupported. +// The probes are in source files where compilation options like -march=armv8-a+crc make +// intrinsics available. They are expensive when compared to a standard OS feature query. +// Always perform the feature quesry first. For Linux see +// http://sourceware.org/ml/libc-help/2017-08/msg00012.html +// Avoid probes on Apple platforms because Apple's signal handling for SIGILLs appears broken. +// We are trying to figure out a way to feature test without probes. 
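// The "query" half of the strategy described above can stand alone. A minimal sketch
// for a glibc-based Linux/AArch64 system, assuming <sys/auxv.h> provides getauxval and
// AT_HWCAP; the HWCAP_* fallback values mirror the ones defined earlier in this file.

#include <sys/auxv.h>
#include <cstdio>

#ifndef HWCAP_ASIMD
# define HWCAP_ASIMD (1 << 1)
#endif
#ifndef HWCAP_AES
# define HWCAP_AES (1 << 3)
#endif
#ifndef HWCAP_CRC32
# define HWCAP_CRC32 (1 << 7)
#endif

int main()
{
    // The kernel reports the capability bits directly, so no instruction probing
    // (and no SIGILL handling) is needed on this path.
    const unsigned long caps = getauxval(AT_HWCAP);
    std::printf("ASIMD/NEON: %d\n", (caps & HWCAP_ASIMD) != 0);
    std::printf("AES       : %d\n", (caps & HWCAP_AES) != 0);
    std::printf("CRC32     : %d\n", (caps & HWCAP_CRC32) != 0);
    return 0;
}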
Also see +// http://stackoverflow.com/a/11197770/608639 and +// http://gist.github.com/erkanyildiz/390a480f27e86f8cd6ba + +extern bool CPU_ProbeNEON(); +extern bool CPU_ProbeCRC32(); +extern bool CPU_ProbeAES(); +extern bool CPU_ProbeSHA1(); +extern bool CPU_ProbeSHA2(); +extern bool CPU_ProbePMULL(); + +inline bool CPU_QueryNEON() { -#if (CRYPTOPP_BOOL_NEON_INTRINSICS_AVAILABLE) -# if defined(CRYPTOPP_MS_STYLE_INLINE_ASSEMBLY) - volatile bool result = true; - __try - { - uint32_t v1[4] = {1,1,1,1}; - uint32x4_t x1 = vld1q_u32(v1); - uint64_t v2[2] = {1,1}; - uint64x2_t x2 = vld1q_u64(v2); - - uint32x4_t x3 = vdupq_n_u32(2); - x3 = vsetq_lane_u32(vgetq_lane_u32(x1,0),x3,0); - x3 = vsetq_lane_u32(vgetq_lane_u32(x1,3),x3,3); - uint64x2_t x4 = vdupq_n_u64(2); - x4 = vsetq_lane_u64(vgetq_lane_u64(x2,0),x4,0); - x4 = vsetq_lane_u64(vgetq_lane_u64(x2,1),x4,1); - - result = !!(vgetq_lane_u32(x3,0) | vgetq_lane_u64(x4,1)); - } - __except (EXCEPTION_EXECUTE_HANDLER) - { - return false; - } - return result; -# else - // longjmp and clobber warnings. Volatile is required. - // http://github.com/weidai11/cryptopp/issues/24 and http://stackoverflow.com/q/7721854 - volatile bool result = true; - - volatile SigHandler oldHandler = signal(SIGILL, SigIllHandlerNEON); - if (oldHandler == SIG_ERR) - return false; - - volatile sigset_t oldMask; - if (sigprocmask(0, NULLPTR, (sigset_t*)&oldMask)) - return false; - - if (setjmp(s_jmpNoNEON)) - result = false; - else - { - uint32_t v1[4] = {1,1,1,1}; - uint32x4_t x1 = vld1q_u32(v1); - uint64_t v2[2] = {1,1}; - uint64x2_t x2 = vld1q_u64(v2); - - uint32x4_t x3 = {0,0,0,0}; - x3 = vsetq_lane_u32(vgetq_lane_u32(x1,0),x3,0); - x3 = vsetq_lane_u32(vgetq_lane_u32(x1,3),x3,3); - uint64x2_t x4 = {0,0}; - x4 = vsetq_lane_u64(vgetq_lane_u64(x2,0),x4,0); - x4 = vsetq_lane_u64(vgetq_lane_u64(x2,1),x4,1); - - // Hack... GCC optimizes away the code and returns true - result = !!(vgetq_lane_u32(x3,0) | vgetq_lane_u64(x4,1)); - } - - sigprocmask(SIG_SETMASK, (sigset_t*)&oldMask, NULLPTR); - signal(SIGILL, oldHandler); - return result; -# endif -#else +#if defined(__ANDROID__) && (defined(__aarch32__) || defined(__aarch64__)) + if (android_getCpuFeatures() & ANDROID_CPU_ARM64_FEATURE_ASIMD) + return true; +#elif defined(__ANDROID__) && defined(__arm__) + if (android_getCpuFeatures() & ANDROID_CPU_ARM_FEATURE_NEON) + return true; +#elif defined(__linux__) && defined(__aarch64__) + if (getauxval(AT_HWCAP) & HWCAP_ASIMD) + return true; +#elif defined(__linux__) && defined(__aarch32__) + if (getauxval(AT_HWCAP2) & HWCAP2_ASIMD) + return true; +#elif defined(__linux__) && defined(__arm__) + if (getauxval(AT_HWCAP) & HWCAP_ARM_NEON) + return true; +#endif return false; -#endif // CRYPTOPP_BOOL_NEON_INTRINSICS_AVAILABLE } -static bool TryPMULL() +inline bool CPU_QueryCRC32() { -#if (CRYPTOPP_BOOL_ARM_PMULL_AVAILABLE) -# if defined(CRYPTOPP_MS_STYLE_INLINE_ASSEMBLY) - volatile bool result = true; - __try - { - const poly64_t a1={0x9090909090909090}, b1={0xb0b0b0b0b0b0b0b0}; - const poly8x16_t a2={0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0xa0,0xa0,0xa0,0xa0,0xa0,0xa0,0xa0,0xa0}, - b2={0xc0,0xc0,0xc0,0xc0,0xc0,0xc0,0xc0,0xc0,0xe0,0xe0,0xe0,0xe0,0xe0,0xe0,0xe0,0xe0}; - - const poly128_t r1 = vmull_p64(a1, b1); - const poly128_t r2 = vmull_high_p64((poly64x2_t)(a2), (poly64x2_t)(b2)); - - // Linaro is missing vreinterpretq_u64_p128. Also see http://github.com/weidai11/cryptopp/issues/233. 
- const uint64x2_t& t1 = (uint64x2_t)(r1); // {bignum,bignum} - const uint64x2_t& t2 = (uint64x2_t)(r2); // {bignum,bignum} - - result = !!(vgetq_lane_u64(t1,0) == 0x5300530053005300 && vgetq_lane_u64(t1,1) == 0x5300530053005300 && - vgetq_lane_u64(t2,0) == 0x6c006c006c006c00 && vgetq_lane_u64(t2,1) == 0x6c006c006c006c00); - } - __except (EXCEPTION_EXECUTE_HANDLER) - { - return false; - } - return result; -# else - // longjmp and clobber warnings. Volatile is required. - // http://github.com/weidai11/cryptopp/issues/24 and http://stackoverflow.com/q/7721854 - volatile bool result = true; - - volatile SigHandler oldHandler = signal(SIGILL, SigIllHandlerPMULL); - if (oldHandler == SIG_ERR) - return false; - - volatile sigset_t oldMask; - if (sigprocmask(0, NULLPTR, (sigset_t*)&oldMask)) - return false; - - if (setjmp(s_jmpNoPMULL)) - result = false; - else - { - const poly64_t a1={0x9090909090909090}, b1={0xb0b0b0b0b0b0b0b0}; - const poly8x16_t a2={0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0xa0,0xa0,0xa0,0xa0,0xa0,0xa0,0xa0,0xa0}, - b2={0xc0,0xc0,0xc0,0xc0,0xc0,0xc0,0xc0,0xc0,0xe0,0xe0,0xe0,0xe0,0xe0,0xe0,0xe0,0xe0}; - - const poly128_t r1 = vmull_p64(a1, b1); - const poly128_t r2 = vmull_high_p64((poly64x2_t)(a2), (poly64x2_t)(b2)); - - // Linaro is missing vreinterpretq_u64_p128. Also see http://github.com/weidai11/cryptopp/issues/233. - const uint64x2_t& t1 = (uint64x2_t)(r1); // {bignum,bignum} - const uint64x2_t& t2 = (uint64x2_t)(r2); // {bignum,bignum} - - result = !!(vgetq_lane_u64(t1,0) == 0x5300530053005300 && vgetq_lane_u64(t1,1) == 0x5300530053005300 && - vgetq_lane_u64(t2,0) == 0x6c006c006c006c00 && vgetq_lane_u64(t2,1) == 0x6c006c006c006c00); - } - - sigprocmask(SIG_SETMASK, (sigset_t*)&oldMask, NULLPTR); - signal(SIGILL, oldHandler); - return result; -# endif -#else +#if defined(__ANDROID__) && (defined(__aarch64__) || defined(__aarch32__)) + if (android_getCpuFeatures() & ANDROID_CPU_ARM64_FEATURE_CRC32) + return true; +#elif defined(__linux__) && defined(__aarch64__) + if (getauxval(AT_HWCAP) & HWCAP_CRC32) + return true; +#elif defined(__linux__) && defined(__aarch32__) + if (getauxval(AT_HWCAP2) & HWCAP2_CRC32) + return true; +#endif return false; -#endif // CRYPTOPP_BOOL_ARM_CRYPTO_INTRINSICS_AVAILABLE } -static bool TryCRC32() +inline bool CPU_QueryPMULL() { -#if (CRYPTOPP_BOOL_ARM_CRC32_INTRINSICS_AVAILABLE) -# if defined(CRYPTOPP_MS_STYLE_INLINE_ASSEMBLY) - volatile bool result = true; - __try - { - word32 w=0, x=1; word16 y=2; byte z=3; - w = __crc32cw(w,x); - w = __crc32ch(w,y); - w = __crc32cb(w,z); - - result = !!w; - } - __except (EXCEPTION_EXECUTE_HANDLER) - { - return false; - } - return result; -# else - // longjmp and clobber warnings. Volatile is required. - // http://github.com/weidai11/cryptopp/issues/24 and http://stackoverflow.com/q/7721854 - volatile bool result = true; - - volatile SigHandler oldHandler = signal(SIGILL, SigIllHandlerCRC32); - if (oldHandler == SIG_ERR) - return false; - - volatile sigset_t oldMask; - if (sigprocmask(0, NULLPTR, (sigset_t*)&oldMask)) - return false; - - if (setjmp(s_jmpNoCRC32)) - result = false; - else - { - word32 w=0, x=1; word16 y=2; byte z=3; - w = __crc32cw(w,x); - w = __crc32ch(w,y); - w = __crc32cb(w,z); - - // Hack... 
GCC optimizes away the code and returns true - result = !!w; - } - - sigprocmask(SIG_SETMASK, (sigset_t*)&oldMask, NULLPTR); - signal(SIGILL, oldHandler); - return result; -# endif -#else +#if defined(__ANDROID__) && (defined(__aarch64__) || defined(__aarch32__)) + if (android_getCpuFeatures() & ANDROID_CPU_ARM64_FEATURE_PMULL) + return true; +#elif defined(__linux__) && defined(__aarch64__) + if (getauxval(AT_HWCAP) & HWCAP_PMULL) + return true; +#elif defined(__linux__) && defined(__aarch32__) + if (getauxval(AT_HWCAP2) & HWCAP2_PMULL) + return true; +#endif return false; -#endif // CRYPTOPP_BOOL_ARM_CRC32_INTRINSICS_AVAILABLE } -static bool TryAES() +inline bool CPU_QueryAES() { -#if (CRYPTOPP_BOOL_ARM_CRYPTO_INTRINSICS_AVAILABLE) -# if defined(CRYPTOPP_MS_STYLE_INLINE_ASSEMBLY) - volatile bool result = true; - __try - { - // AES encrypt and decrypt - uint8x16_t data = vdupq_n_u8(0), key = vdupq_n_u8(0); - uint8x16_t r1 = vaeseq_u8(data, key); - uint8x16_t r2 = vaesdq_u8(data, key); - - result = !!(vgetq_lane_u8(r1,0) | vgetq_lane_u8(r2,7)); +#if defined(__ANDROID__) && (defined(__aarch64__) || defined(__aarch32__)) + if (android_getCpuFeatures() & ANDROID_CPU_ARM64_FEATURE_AES) + return true; +#elif defined(__linux__) && defined(__aarch64__) + if (getauxval(AT_HWCAP) & HWCAP_AES) + return true; +#elif defined(__linux__) && defined(__aarch32__) + if (getauxval(AT_HWCAP2) & HWCAP2_AES) + return true; +#elif defined(__APPLE__) + struct utsname systemInfo; + systemInfo.machine[0] = '\0'; + uname(&systemInfo); + + std::string machine(systemInfo.machine); + if (machine.substr(0, 7) == "iPhone6" || machine.substr(0, 7) == "iPhone7" || + machine.substr(0, 7) == "iPhone8" || machine.substr(0, 7) == "iPhone9" || + machine.substr(0, 5) == "iPad4" || machine.substr(0, 5) == "iPad5" || + machine.substr(0, 5) == "iPad6" || machine.substr(0, 5) == "iPad7") + { + return true; } - __except (EXCEPTION_EXECUTE_HANDLER) - { - return false; - } - return result; -# else - // longjmp and clobber warnings. Volatile is required. - // http://github.com/weidai11/cryptopp/issues/24 and http://stackoverflow.com/q/7721854 - volatile bool result = true; - - volatile SigHandler oldHandler = signal(SIGILL, SigIllHandlerAES); - if (oldHandler == SIG_ERR) - return false; - - volatile sigset_t oldMask; - if (sigprocmask(0, NULLPTR, (sigset_t*)&oldMask)) - return false; - - if (setjmp(s_jmpNoAES)) - result = false; - else - { - uint8x16_t data = vdupq_n_u8(0), key = vdupq_n_u8(0); - uint8x16_t r1 = vaeseq_u8(data, key); - uint8x16_t r2 = vaesdq_u8(data, key); - - // Hack... 
GCC optimizes away the code and returns true - result = !!(vgetq_lane_u8(r1,0) | vgetq_lane_u8(r2,7)); - } - - sigprocmask(SIG_SETMASK, (sigset_t*)&oldMask, NULLPTR); - signal(SIGILL, oldHandler); - return result; -# endif -#else +#endif return false; -#endif // CRYPTOPP_BOOL_ARM_CRYPTO_INTRINSICS_AVAILABLE } -static bool TrySHA1() +inline bool CPU_QuerySHA1() { -#if (CRYPTOPP_BOOL_ARM_CRYPTO_INTRINSICS_AVAILABLE) -# if defined(CRYPTOPP_MS_STYLE_INLINE_ASSEMBLY) - volatile bool result = true; - __try - { - uint32x4_t data1 = {1,2,3,4}, data2 = {5,6,7,8}, data3 = {9,10,11,12}; - - uint32x4_t r1 = vsha1cq_u32 (data1, 0, data2); - uint32x4_t r2 = vsha1mq_u32 (data1, 0, data2); - uint32x4_t r3 = vsha1pq_u32 (data1, 0, data2); - uint32x4_t r4 = vsha1su0q_u32 (data1, data2, data3); - uint32x4_t r5 = vsha1su1q_u32 (data1, data2); - - result = !!(vgetq_lane_u32(r1,0) | vgetq_lane_u32(r2,1) | vgetq_lane_u32(r3,2) | vgetq_lane_u32(r4,3) | vgetq_lane_u32(r5,0)); - } - __except (EXCEPTION_EXECUTE_HANDLER) - { - return false; - } - return result; -# else - // longjmp and clobber warnings. Volatile is required. - // http://github.com/weidai11/cryptopp/issues/24 and http://stackoverflow.com/q/7721854 - volatile bool result = true; - - volatile SigHandler oldHandler = signal(SIGILL, SigIllHandlerSHA1); - if (oldHandler == SIG_ERR) - return false; - - volatile sigset_t oldMask; - if (sigprocmask(0, NULLPTR, (sigset_t*)&oldMask)) - return false; - - if (setjmp(s_jmpNoSHA1)) - result = false; - else - { - uint32x4_t data1 = {1,2,3,4}, data2 = {5,6,7,8}, data3 = {9,10,11,12}; - - uint32x4_t r1 = vsha1cq_u32 (data1, 0, data2); - uint32x4_t r2 = vsha1mq_u32 (data1, 0, data2); - uint32x4_t r3 = vsha1pq_u32 (data1, 0, data2); - uint32x4_t r4 = vsha1su0q_u32 (data1, data2, data3); - uint32x4_t r5 = vsha1su1q_u32 (data1, data2); - - // Hack... GCC optimizes away the code and returns true - result = !!(vgetq_lane_u32(r1,0) | vgetq_lane_u32(r2,1) | vgetq_lane_u32(r3,2) | vgetq_lane_u32(r4,3) | vgetq_lane_u32(r5,0)); - } - - sigprocmask(SIG_SETMASK, (sigset_t*)&oldMask, NULLPTR); - signal(SIGILL, oldHandler); - return result; -# endif -#else +#if defined(__ANDROID__) && (defined(__aarch64__) || defined(__aarch32__)) + if (android_getCpuFeatures() & ANDROID_CPU_ARM64_FEATURE_SHA1) + return true; +#elif defined(__linux__) && defined(__aarch64__) + if (getauxval(AT_HWCAP) & HWCAP_SHA1) + return true; +#elif defined(__linux__) && defined(__aarch32__) + if (getauxval(AT_HWCAP2) & HWCAP2_SHA1) + return true; +#endif return false; -#endif // CRYPTOPP_BOOL_ARM_CRYPTO_INTRINSICS_AVAILABLE } -static bool TrySHA2() +inline bool CPU_QuerySHA2() { -#if (CRYPTOPP_BOOL_ARM_CRYPTO_INTRINSICS_AVAILABLE) -# if defined(CRYPTOPP_MS_STYLE_INLINE_ASSEMBLY) - volatile bool result = true; - __try - { - uint32x4_t data1 = {1,2,3,4}, data2 = {5,6,7,8}, data3 = {9,10,11,12}; - - uint32x4_t r1 = vsha256hq_u32 (data1, data2, data3); - uint32x4_t r2 = vsha256h2q_u32 (data1, data2, data3); - uint32x4_t r3 = vsha256su0q_u32 (data1, data2); - uint32x4_t r4 = vsha256su1q_u32 (data1, data2, data3); - - result = !!(vgetq_lane_u32(r1,0) | vgetq_lane_u32(r2,1) | vgetq_lane_u32(r3,2) | vgetq_lane_u32(r4,3)); - } - __except (EXCEPTION_EXECUTE_HANDLER) - { - return false; - } - return result; -# else - // longjmp and clobber warnings. Volatile is required. 
- // http://github.com/weidai11/cryptopp/issues/24 and http://stackoverflow.com/q/7721854 - volatile bool result = true; - - volatile SigHandler oldHandler = signal(SIGILL, SigIllHandlerSHA2); - if (oldHandler == SIG_ERR) - return false; - - volatile sigset_t oldMask; - if (sigprocmask(0, NULLPTR, (sigset_t*)&oldMask)) - return false; - - if (setjmp(s_jmpNoSHA2)) - result = false; - else - { - uint32x4_t data1 = {1,2,3,4}, data2 = {5,6,7,8}, data3 = {9,10,11,12}; - - uint32x4_t r1 = vsha256hq_u32 (data1, data2, data3); - uint32x4_t r2 = vsha256h2q_u32 (data1, data2, data3); - uint32x4_t r3 = vsha256su0q_u32 (data1, data2); - uint32x4_t r4 = vsha256su1q_u32 (data1, data2, data3); - - // Hack... GCC optimizes away the code and returns true - result = !!(vgetq_lane_u32(r1,0) | vgetq_lane_u32(r2,1) | vgetq_lane_u32(r3,2) | vgetq_lane_u32(r4,3)); - } - - sigprocmask(SIG_SETMASK, (sigset_t*)&oldMask, NULLPTR); - signal(SIGILL, oldHandler); - return result; -# endif -#else +#if defined(__ANDROID__) && (defined(__aarch64__) || defined(__aarch32__)) + if (android_getCpuFeatures() & ANDROID_CPU_ARM64_FEATURE_SHA2) + return true; +#elif defined(__linux__) && defined(__aarch64__) + if (getauxval(AT_HWCAP) & HWCAP_SHA2) + return true; +#elif defined(__linux__) && defined(__aarch32__) + if (getauxval(AT_HWCAP2) & HWCAP2_SHA2) + return true; +#endif return false; -#endif // CRYPTOPP_BOOL_ARM_CRYPTO_INTRINSICS_AVAILABLE } void DetectArmFeatures() { - g_hasNEON = TryNEON(); - g_hasPMULL = TryPMULL(); - g_hasCRC32 = TryCRC32(); - g_hasAES = TryAES(); - g_hasSHA1 = TrySHA1(); - g_hasSHA2 = TrySHA2(); + g_hasNEON = CPU_QueryNEON() || CPU_ProbeNEON(); + g_hasCRC32 = CPU_QueryCRC32() || CPU_ProbeCRC32(); + g_hasPMULL = CPU_QueryPMULL() || CPU_ProbePMULL(); + g_hasAES = CPU_QueryAES() || CPU_ProbeAES(); + g_hasSHA1 = CPU_QuerySHA1() || CPU_ProbeSHA1(); + g_hasSHA2 = CPU_QuerySHA2() || CPU_ProbeSHA2(); g_ArmDetectionDone = true; } diff --git a/cpu.h b/cpu.h index d1ccf6f10..59cecf021 100644 --- a/cpu.h +++ b/cpu.h @@ -17,70 +17,6 @@ # pragma GCC diagnostic ignored "-Wsign-conversion" #endif -// ARM32 and ARM64 Headers -#if (CRYPTOPP_BOOL_ARM32 || CRYPTOPP_BOOL_ARM64) -# if defined(__GNUC__) -# include -# endif -# if defined(__ARM_NEON) || defined(__aarch32__) || defined(__aarch64__) || defined(_MSC_VER) -# include -# endif -# if defined(__GNUC__) && !defined(__apple_build_version__) -# if defined(__ARM_ACLE) || defined(__ARM_FEATURE_CRC32) || defined(__ARM_FEATURE_CRYPTO) -# include -# endif -# endif -#endif // ARM32 and ARM64 Headers - -// Used when supplying ASM due to missing intrinsics -#if defined(__clang__) -# define GCC_INLINE inline -# define GCC_INLINE_ATTRIB __attribute__((__gnu_inline__, __always_inline__)) -#elif (CRYPTOPP_GCC_VERSION >= 30300) || defined(__INTEL_COMPILER) -# define GCC_INLINE __inline -# define GCC_INLINE_ATTRIB __attribute__((__gnu_inline__, __always_inline__, __artificial__)) -#else -# define GCC_INLINE inline -# define GCC_INLINE_ATTRIB -# endif - -// X86/X64/X32 Headers -#if CRYPTOPP_BOOL_X86 || CRYPTOPP_BOOL_X32 || CRYPTOPP_BOOL_X64 - -// GCC X86 super-include -#if (CRYPTOPP_GCC_VERSION >= 40800) -# include -#endif -#if (CRYPTOPP_MSC_VERSION >= 1400) -# include -#endif - -// Baseline include -#if CRYPTOPP_BOOL_SSE2_INTRINSICS_AVAILABLE -# include // __m64, __m128i, _mm_set_epi64x -#endif -#if CRYPTOPP_BOOL_SSSE3_INTRINSICS_AVAILABLE -# include // _mm_shuffle_pi8, _mm_shuffle_epi8 -#endif // tmmintrin.h -#if CRYPTOPP_BOOL_SSE4_INTRINSICS_AVAILABLE -# include // _mm_blend_epi16 -# 
include // _mm_crc32_u{8|16|32} -#endif // smmintrin.h -#if CRYPTOPP_BOOL_AESNI_INTRINSICS_AVAILABLE -# include // aesenc, aesdec, etc -#endif // wmmintrin.h -#if CRYPTOPP_BOOL_SSE_SHA_INTRINSICS_AVAILABLE -# include // RDRAND, RDSEED, AVX, SHA -#endif // immintrin.h -#endif // X86/X64/X32 Headers - -// Applies to both X86/X32/X64 and ARM32/ARM64. And we've got MIPS devices on the way. -#if defined(_MSC_VER) || defined(__BORLANDC__) -# define CRYPTOPP_MS_STYLE_INLINE_ASSEMBLY -#else -# define CRYPTOPP_GNU_STYLE_INLINE_ASSEMBLY -#endif - // Applies to both X86/X32/X64 and ARM32/ARM64 #if defined(CRYPTOPP_LLVM_CLANG_VERSION) || defined(CRYPTOPP_APPLE_CLANG_VERSION) || defined(CRYPTOPP_CLANG_INTEGRATED_ASSEMBLER) #define NEW_LINE "\n" @@ -121,11 +57,10 @@ NAMESPACE_BEGIN(CryptoPP) #ifndef CRYPTOPP_DOXYGEN_PROCESSING // These should not be used directly extern CRYPTOPP_DLL bool g_x86DetectionDone; -extern CRYPTOPP_DLL bool g_hasMMX; -extern CRYPTOPP_DLL bool g_hasISSE; extern CRYPTOPP_DLL bool g_hasSSE2; extern CRYPTOPP_DLL bool g_hasSSSE3; -extern CRYPTOPP_DLL bool g_hasSSE4; +extern CRYPTOPP_DLL bool g_hasSSE41; +extern CRYPTOPP_DLL bool g_hasSSE42; extern CRYPTOPP_DLL bool g_hasAESNI; extern CRYPTOPP_DLL bool g_hasCLMUL; extern CRYPTOPP_DLL bool g_hasSHA; @@ -140,39 +75,9 @@ extern CRYPTOPP_DLL bool g_hasPadlockPMM; extern CRYPTOPP_DLL word32 g_cacheLineSize; CRYPTOPP_DLL void CRYPTOPP_API DetectX86Features(); -CRYPTOPP_DLL bool CRYPTOPP_API CpuId(word32 input, word32 output[4]); +CRYPTOPP_DLL bool CRYPTOPP_API CpuId(word32 func, word32 subfunc, word32 output[4]); #endif // CRYPTOPP_DOXYGEN_PROCESSING -//! \brief Determines MMX availability -//! \returns true if MMX is determined to be available, false otherwise -//! \details MMX, SSE and SSE2 are core processor features for x86_64, and -//! the function always returns true for the platform. -inline bool HasMMX() -{ -#if CRYPTOPP_BOOL_X64 - return true; -#else - if (!g_x86DetectionDone) - DetectX86Features(); - return g_hasMMX; -#endif -} - -//! \brief Determines SSE availability -//! \returns true if SSE is determined to be available, false otherwise -//! \details MMX, SSE and SSE2 are core processor features for x86_64, and -//! the function always returns true for the platform. -inline bool HasISSE() -{ -#if CRYPTOPP_BOOL_X64 - return true; -#else - if (!g_x86DetectionDone) - DetectX86Features(); - return g_hasISSE; -#endif -} - //! \brief Determines SSE2 availability //! \returns true if SSE2 is determined to be available, false otherwise //! \details MMX, SSE and SSE2 are core processor features for x86_64, and @@ -199,14 +104,24 @@ inline bool HasSSSE3() return g_hasSSSE3; } -//! \brief Determines SSE4 availability -//! \returns true if SSE4.1 and SSE4.2 are determined to be available, false otherwise -//! \details HasSSE4() is a runtime check performed using CPUID which requires both SSE4.1 and SSE4.2 -inline bool HasSSE4() +//! \brief Determines SSE4.1 availability +//! \returns true if SSE4.1 is determined to be available, false otherwise +//! \details HasSSE41() is a runtime check performed using CPUID +inline bool HasSSE41() +{ + if (!g_x86DetectionDone) + DetectX86Features(); + return g_hasSSE41; +} + +//! \brief Determines SSE4.2 availability +//! \returns true if SSE4.2 is determined to be available, false otherwise +//! \details HasSSE42() is a runtime check performed using CPUID +inline bool HasSSE42() { if (!g_x86DetectionDone) DetectX86Features(); - return g_hasSSE4; + return g_hasSSE42; } //! 
\brief Determines AES-NI availability @@ -341,20 +256,25 @@ void CRYPTOPP_API DetectArmFeatures(); //! \brief Determine if an ARM processor has Advanced SIMD available //! \returns true if the hardware is capable of Advanced SIMD at runtime, false otherwise. -//! \details Advanced SIMD instructions are available under Aarch64 (ARM-64) and Aarch32 (ARM-32). +//! \details Advanced SIMD instructions are available under most ARMv7, Aarch32 and Aarch64. //! \details Runtime support requires compile time support. When compiling with GCC, you may //! need to compile with -mfpu=neon (32-bit) or -march=armv8-a //! (64-bit). Also see ARM's __ARM_NEON preprocessor macro. inline bool HasNEON() { + // ASIMD is a core feature on Aarch32 and Aarch64 like SSE2 is a core feature on x86_64 +#if defined(__aarch32__) || defined(__aarch64__) + return true; +#else if (!g_ArmDetectionDone) DetectArmFeatures(); return g_hasNEON; +#endif } -//! \brief Determine if an ARM processor provides Polynomial Multiplication (long) +//! \brief Determine if an ARM processor provides Polynomial Multiplication //! \returns true if the hardware is capable of polynomial multiplications at runtime, false otherwise. -//! \details The multiplication instructions are available under Aarch64 (ARM-64) and Aarch32 (ARM-32). +//! \details The multiplication instructions are available under Aarch32 and Aarch64. //! \details Runtime support requires compile time support. When compiling with GCC, you may //! need to compile with -march=armv8-a+crypto; while Apple requires //! -arch arm64. Also see ARM's __ARM_FEATURE_CRYPTO preprocessor macro. @@ -367,62 +287,74 @@ inline bool HasPMULL() //! \brief Determine if an ARM processor has CRC32 available //! \returns true if the hardware is capable of CRC32 at runtime, false otherwise. -//! \details CRC32 instructions provide access to the processor's CRC32 and CRC32-C instructions. -//! They are provided by ARM C Language Extensions 2.0 (ACLE 2.0) and available under Aarch64 -//! (ARM-64) and Aarch32 (ARM-32) running on Aarch64 (i.e., an AArch32 execution environment). +//! \details CRC32 instructions provide access to the processor's CRC-32 and CRC-32C instructions. +//! They are provided by ARM C Language Extensions 2.0 (ACLE 2.0) and available under Aarch32 and Aarch64. //! \details Runtime support requires compile time support. When compiling with GCC, you may //! need to compile with -march=armv8-a+crc; while Apple requires //! -arch arm64. Also see ARM's __ARM_FEATURE_CRC32 preprocessor macro. inline bool HasCRC32() { +#if defined(__aarch32__) || defined(__aarch64__) if (!g_ArmDetectionDone) DetectArmFeatures(); return g_hasCRC32; +#else + return false; +#endif } //! \brief Determine if an ARM processor has AES available //! \returns true if the hardware is capable of AES at runtime, false otherwise. -//! \details AES is part of the Crypto extensions from ARM C Language Extensions 2.0 (ACLE 2.0) -//! and available under Aarch64 (ARM-64) and Aarch32 (ARM-32) running on Aarch64 (i.e., an -//! AArch32 execution environment). +//! \details AES is part of the optional Crypto extensions on Aarch32 and Aarch64. They are +//! accessed using ARM C Language Extensions 2.0 (ACLE 2.0). //! \details Runtime support requires compile time support. When compiling with GCC, you may //! need to compile with -march=armv8-a+crypto; while Apple requires //! -arch arm64. Also see ARM's __ARM_FEATURE_CRYPTO preprocessor macro. 
inline bool HasAES() { +#if defined(__aarch32__) || defined(__aarch64__) if (!g_ArmDetectionDone) DetectArmFeatures(); return g_hasAES; +#else + return false; +#endif } //! \brief Determine if an ARM processor has SHA1 available //! \returns true if the hardware is capable of SHA1 at runtime, false otherwise. -//! \details SHA1 is part of the Crypto extensions from ARM C Language Extensions 2.0 (ACLE 2.0) -//! and available under Aarch64 (ARM-64) and Aarch32 (ARM-32) running on Aarch64 (i.e., an -//! AArch32 execution environment). +//! \details SHA1 is part of the optional Crypto extensions on Aarch32 and Aarch64. They are +//! accessed using ARM C Language Extensions 2.0 (ACLE 2.0). //! \details Runtime support requires compile time support. When compiling with GCC, you may //! need to compile with -march=armv8-a+crypto; while Apple requires //! -arch arm64. Also see ARM's __ARM_FEATURE_CRYPTO preprocessor macro. inline bool HasSHA1() { +#if defined(__aarch32__) || defined(__aarch64__) if (!g_ArmDetectionDone) DetectArmFeatures(); return g_hasSHA1; +#else + return false; +#endif } //! \brief Determine if an ARM processor has SHA2 available //! \returns true if the hardware is capable of SHA2 at runtime, false otherwise. -//! \details SHA2 is part of the Crypto extensions from ARM C Language Extensions 2.0 (ACLE 2.0) -//! and available under Aarch64 (ARM-64) and Aarch32 (ARM-32) running on Aarch64 (i.e., an -//! AArch32 execution environment). +//! \details SHA2 is part of the optional Crypto extensions on Aarch32 and Aarch64. They are +//! accessed using ARM C Language Extensions 2.0 (ACLE 2.0). //! \details Runtime support requires compile time support. When compiling with GCC, you may //! need to compile with -march=armv8-a+crypto; while Apple requires //! -arch arm64. Also see ARM's __ARM_FEATURE_CRYPTO preprocessor macro. inline bool HasSHA2() { +#if defined(__aarch32__) || defined(__aarch64__) if (!g_ArmDetectionDone) DetectArmFeatures(); return g_hasSHA2; +#else + return false; +#endif } //! \brief Provides the cache line size at runtime @@ -457,7 +389,6 @@ inline int GetCacheLineSize() #define ASC(x, y) x label##y*newline* #define AS_HEX(y) 0##y##h #elif defined(_MSC_VER) || defined(__BORLANDC__) - #define CRYPTOPP_MS_STYLE_INLINE_ASSEMBLY #define AS1(x) __asm {x} #define AS2(x, y) __asm {x, y} #define AS3(x, y, z) __asm {x, y, z} @@ -468,8 +399,6 @@ inline int GetCacheLineSize() #define CRYPTOPP_NAKED __declspec(naked) #define AS_HEX(y) 0x##y #else - #define CRYPTOPP_GNU_STYLE_INLINE_ASSEMBLY - // define these in two steps to allow arguments to be expanded #define GNU_AS1(x) #x ";" NEW_LINE #define GNU_AS2(x, y) #x ", " #y ";" NEW_LINE diff --git a/crc-simd.cpp b/crc-simd.cpp new file mode 100644 index 000000000..a7e32a4a5 --- /dev/null +++ b/crc-simd.cpp @@ -0,0 +1,156 @@ +// crc-simd.cpp - written and placed in the public domain by +// Jeffrey Walton, Uri Blumenthal and Marcel Raad. +// +// This source file uses intrinsics to gain access to SSE4.2 and +// ARMv8a CRC-32 and CRC-32C instructions. A separate source file +// is needed because additional CXXFLAGS are required to enable +// the appropriate instructions sets in some build configurations. + +#include "pch.h" +#include "config.h" +#include "misc.h" + +// Clang and GCC hoops... +#if !(defined(__ARM_FEATURE_CRC32) || defined(_MSC_VER)) +# undef CRYPTOPP_ARM_CRC32_AVAILABLE +#endif + +#if (CRYPTOPP_SSE42_AVAILABLE) +# include "nmmintrin.h" +#endif + +// Don't include when using Apple Clang. 
Early Apple compilers +// fail to compile with included. Later Apple compilers compile +// intrinsics without included. +#if (CRYPTOPP_ARM_CRC32_AVAILABLE) && !defined(CRYPTOPP_APPLE_CLANG_VERSION) +# include "arm_acle.h" +#endif + +#ifdef CRYPTOPP_GNU_STYLE_INLINE_ASSEMBLY +# include +# include +#endif + +#ifndef EXCEPTION_EXECUTE_HANDLER +# define EXCEPTION_EXECUTE_HANDLER 1 +#endif + +NAMESPACE_BEGIN(CryptoPP) + +#ifdef CRYPTOPP_GNU_STYLE_INLINE_ASSEMBLY +extern "C" { + typedef void (*SigHandler)(int); + + static jmp_buf s_jmpSIGILL; + static void SigIllHandler(int) + { + longjmp(s_jmpSIGILL, 1); + } +}; +#endif // Not CRYPTOPP_MS_STYLE_INLINE_ASSEMBLY + +#if (CRYPTOPP_BOOL_ARM32 || CRYPTOPP_BOOL_ARM64) + +bool CPU_ProbeCRC32() +{ +#if (CRYPTOPP_ARM_CRC32_AVAILABLE) +# if defined(CRYPTOPP_MS_STYLE_INLINE_ASSEMBLY) + volatile bool result = true; + __try + { + word32 w=0, x=1; word16 y=2; byte z=3; + w = __crc32w(w,x); + w = __crc32h(w,y); + w = __crc32b(w,z); + w = __crc32cw(w,x); + w = __crc32ch(w,y); + w = __crc32cb(w,z); + + result = !!w; + } + __except (EXCEPTION_EXECUTE_HANDLER) + { + return false; + } + return result; +#else + + // longjmp and clobber warnings. Volatile is required. + // http://github.com/weidai11/cryptopp/issues/24 and http://stackoverflow.com/q/7721854 + volatile bool result = true; + + volatile SigHandler oldHandler = signal(SIGILL, SigIllHandler); + if (oldHandler == SIG_ERR) + return false; + + volatile sigset_t oldMask; + if (sigprocmask(0, NULLPTR, (sigset_t*)&oldMask)) + return false; + + if (setjmp(s_jmpSIGILL)) + result = false; + else + { + word32 w=0, x=1; word16 y=2; byte z=3; + w = __crc32w(w,x); + w = __crc32h(w,y); + w = __crc32b(w,z); + w = __crc32cw(w,x); + w = __crc32ch(w,y); + w = __crc32cb(w,z); + + // Hack... 
GCC optimizes away the code and returns true + result = !!w; + } + + sigprocmask(SIG_SETMASK, (sigset_t*)&oldMask, NULLPTR); + signal(SIGILL, oldHandler); + return result; +# endif +#else + return false; +#endif // CRYPTOPP_ARM_CRC32_AVAILABLE +} +#endif // ARM32 or ARM64 + +#if (CRYPTOPP_ARM_CRC32_AVAILABLE) +void CRC32_Update_ARMV8(const byte *s, size_t n, word32& c) +{ + for(; !IsAligned(s) && n > 0; s++, n--) + c = __crc32b(c, *s); + + for(; n > 4; s+=4, n-=4) + c = __crc32w(c, *(const word32 *)(void*)s); + + for(; n > 0; s++, n--) + c = __crc32b(c, *s); +} + +void CRC32C_Update_ARMV8(const byte *s, size_t n, word32& c) +{ + for(; !IsAligned(s) && n > 0; s++, n--) + c = __crc32cb(c, *s); + + for(; n > 4; s+=4, n-=4) + c = __crc32cw(c, *(const word32 *)(void*)s); + + for(; n > 0; s++, n--) + c = __crc32cb(c, *s); +} +#endif + +#if (CRYPTOPP_SSE42_AVAILABLE) +void CRC32C_Update_SSE42(const byte *s, size_t n, word32& c) +{ + for(; !IsAligned(s) && n > 0; s++, n--) + c = _mm_crc32_u8(c, *s); + + for(; n > 4; s+=4, n-=4) + c = _mm_crc32_u32(c, *(const word32 *)(void*)s); + + for(; n > 0; s++, n--) + c = _mm_crc32_u8(c, *s); +} +#endif + +NAMESPACE_END \ No newline at end of file diff --git a/crc.cpp b/crc.cpp index ccc3fe62d..21153d3d6 100644 --- a/crc.cpp +++ b/crc.cpp @@ -1,44 +1,23 @@ // crc.cpp - originally written and placed in the public domain by Wei Dai #include "pch.h" +#include "config.h" #include "crc.h" #include "misc.h" #include "cpu.h" NAMESPACE_BEGIN(CryptoPP) -// Visual Studio needs VS2008 (1500) -// http://msdn.microsoft.com/en-us/library/bb531394%28v=vs.90%29.aspx -#if defined(_MSC_VER) && (_MSC_VER < 1500) -# undef CRYPTOPP_BOOL_SSE4_INTRINSICS_AVAILABLE +// crc-simd.cpp +#if (CRYPTOPP_ARM_CRC32_AVAILABLE) +extern void CRC32_Update_ARMV8(const byte *s, size_t n, word32& c); +extern void CRC32C_Update_ARMV8(const byte *s, size_t n, word32& c); #endif -#if CRYPTOPP_BOOL_X86 || CRYPTOPP_BOOL_X32 || CRYPTOPP_BOOL_X64 -#if (CRYPTOPP_GCC_VERSION >= 40300 || __INTEL_COMPILER >= 1000 || __SUNPRO_CC >= 0x5110 || CRYPTOPP_LLVM_CLANG_VERSION >= 20300 || CRYPTOPP_APPLE_CLANG_VERSION >= 40000) && !defined(__SSE4_2__) && !defined(_MSC_VER) -GCC_INLINE unsigned int GCC_INLINE_ATTRIB -MM_CRC32_U8(unsigned int crc, unsigned char val) -{ - asm ("crc32 %1, %0" : "+r"(crc) : "r"(val)); - return crc; -} -GCC_INLINE unsigned int GCC_INLINE_ATTRIB -MM_CRC32_U16(unsigned int crc, unsigned short val) -{ - asm ("crc32 %1, %0" : "+r"(crc) : "r"(val)); - return crc; -} -GCC_INLINE unsigned int GCC_INLINE_ATTRIB -MM_CRC32_U32(unsigned int crc, unsigned int val) -{ - asm ("crc32 %1, %0" : "+r"(crc) : "r"(val)); - return crc; -} -#else - #define MM_CRC32_U8(a,b) _mm_crc32_u8(a,b) - #define MM_CRC32_U16(a,b) _mm_crc32_u16(a,b) - #define MM_CRC32_U32(a,b) _mm_crc32_u32(a,b) +// crc-simd.cpp +#if (CRYPTOPP_SSE42_AVAILABLE) +extern void CRC32C_Update_SSE42(const byte *s, size_t n, word32& c); #endif -#endif // X86/X32/X64 /* Table of CRC-32's of all single byte values (made by makecrc.c) */ const word32 CRC32::m_tab[] = { @@ -158,18 +137,10 @@ CRC32::CRC32() void CRC32::Update(const byte *s, size_t n) { -#if (CRYPTOPP_BOOL_ARM_CRC32_INTRINSICS_AVAILABLE) +#if (CRYPTOPP_ARM_CRC32_AVAILABLE) if (HasCRC32()) { - for(; !IsAligned(s) && n > 0; s++, n--) - m_crc = __crc32b(m_crc, *s); - - for(; n > 4; s+=4, n-=4) - m_crc = __crc32w(m_crc, *(const word32 *)(void*)s); - - for(; n > 0; s++, n--) - m_crc = __crc32b(m_crc, *s); - + CRC32_Update_ARMV8(s, n, m_crc); return; } #endif @@ -326,32 +297,16 @@ 
CRC32C::CRC32C() void CRC32C::Update(const byte *s, size_t n) { -#if CRYPTOPP_BOOL_SSE4_INTRINSICS_AVAILABLE - if (HasSSE4()) +#if (CRYPTOPP_SSE42_AVAILABLE) + if (HasSSE42()) { - for(; !IsAligned(s) && n > 0; s++, n--) - m_crc = MM_CRC32_U8(m_crc, *s); - - for(; n > 4; s+=4, n-=4) - m_crc = MM_CRC32_U32(m_crc, *(const word32 *)(void*)s); - - for(; n > 0; s++, n--) - m_crc = MM_CRC32_U8(m_crc, *s); - + CRC32C_Update_SSE42(s, n, m_crc); return; } -#elif (CRYPTOPP_BOOL_ARM_CRC32_INTRINSICS_AVAILABLE) +#elif (CRYPTOPP_ARM_CRC32_AVAILABLE) if (HasCRC32()) { - for(; !IsAligned(s) && n > 0; s++, n--) - m_crc = __crc32cb(m_crc, *s); - - for(; n > 4; s+=4, n-=4) - m_crc = __crc32cw(m_crc, *(const word32 *)(void*)s); - - for(; n > 0; s++, n--) - m_crc = __crc32cb(m_crc, *s); - + CRC32C_Update_ARMV8(s, n, m_crc); return; } #endif diff --git a/cryptdll.vcxproj b/cryptdll.vcxproj index f25899440..f2c137809 100644 --- a/cryptdll.vcxproj +++ b/cryptdll.vcxproj @@ -1,5 +1,8 @@ + + + Debug @@ -18,6 +21,9 @@ x64 + + + {94a428a1-9ba8-4db2-b76e-bd2e3c08f257} cryptdll @@ -203,6 +209,7 @@ + @@ -222,10 +229,12 @@ + + diff --git a/cryptdll.vcxproj.filters b/cryptdll.vcxproj.filters index b10394017..22ff828b7 100644 --- a/cryptdll.vcxproj.filters +++ b/cryptdll.vcxproj.filters @@ -89,6 +89,9 @@ Source Files + + Source Files + Source Files @@ -152,6 +155,9 @@ Source Files + + Source Files + Source Files @@ -164,6 +170,9 @@ Source Files + + Source Files + Source Files diff --git a/cryptest.nmake b/cryptest.nmake index 8590e7160..4516fefff 100644 --- a/cryptest.nmake +++ b/cryptest.nmake @@ -45,12 +45,11 @@ ########################################################################################### -# If you use 'make sources' from Linux makefile, then add 'winpipes.cpp'. Platform specific -# classes, like 'rdrand.cpp', should not be included. Add them under the X86 and X64 rules. +# If you use 'make sources' from Linux makefile, then add 'winpipes.cpp' to the list below. 
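# One way to do that by hand (a sketch only; it reuses the same append idiom
# the ARM section further below uses for neon.cpp):
#
#   LIB_SRCS = $(LIB_SRCS) winpipes.cpp
#   LIB_OBJS = $(LIB_OBJS) winpipes.obj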
-LIB_SRCS = cryptlib.cpp cpu.cpp integer.cpp 3way.cpp adler32.cpp algebra.cpp algparam.cpp arc4.cpp aria.cpp asn.cpp authenc.cpp base32.cpp base64.cpp basecode.cpp bfinit.cpp blake2.cpp blowfish.cpp blumshub.cpp camellia.cpp cast.cpp casts.cpp cbcmac.cpp ccm.cpp chacha.cpp channels.cpp cmac.cpp crc.cpp default.cpp des.cpp dessp.cpp dh.cpp dh2.cpp dll.cpp dsa.cpp eax.cpp ec2n.cpp eccrypto.cpp ecp.cpp elgamal.cpp emsa2.cpp eprecomp.cpp esign.cpp files.cpp filters.cpp fips140.cpp fipstest.cpp gcm.cpp gf256.cpp gf2_32.cpp gf2n.cpp gfpcrypt.cpp gost.cpp gzip.cpp hex.cpp hmac.cpp hrtimer.cpp ida.cpp idea.cpp iterhash.cpp kalyna.cpp kalynatab.cpp keccak.cpp luc.cpp mars.cpp marss.cpp md2.cpp md4.cpp md5.cpp misc.cpp modes.cpp mqueue.cpp mqv.cpp nbtheory.cpp network.cpp oaep.cpp osrng.cpp panama.cpp pkcspad.cpp poly1305.cpp polynomi.cpp pssr.cpp pubkey.cpp queue.cpp rabin.cpp randpool.cpp rc2.cpp rc5.cpp rc6.cpp rdrand.cpp rdtables.cpp rijndael.cpp ripemd.cpp rng.cpp rsa.cpp rw.cpp safer.cpp salsa.cpp seal.cpp seed.cpp serpent.cpp sha.cpp sha3.cpp shacal2.cpp shark.cpp sharkbox.cpp skipjack.cpp socketft.cpp sosemanuk.cpp square.cpp squaretb.cpp strciphr.cpp tea.cpp tftables.cpp threefish.cpp tiger.cpp tigertab.cpp trdlocal.cpp ttmac.cpp twofish.cpp vmac.cpp wait.cpp wake.cpp whrlpool.cpp winpipes.cpp xtr.cpp xtrcrypt.cpp zdeflate.cpp zinflate.cpp zlib.cpp +LIB_SRCS = cryptlib.cpp cpu.cpp integer.cpp 3way.cpp adler32.cpp algebra.cpp algparam.cpp arc4.cpp aria.cpp asn.cpp authenc.cpp base32.cpp base64.cpp basecode.cpp bfinit.cpp blake2.cpp blowfish.cpp blumshub.cpp camellia.cpp cast.cpp casts.cpp cbcmac.cpp ccm.cpp chacha.cpp channels.cpp cmac.cpp crc.cpp default.cpp des.cpp dessp.cpp dh.cpp dh2.cpp dll.cpp dsa.cpp eax.cpp ec2n.cpp eccrypto.cpp ecp.cpp elgamal.cpp emsa2.cpp eprecomp.cpp esign.cpp files.cpp filters.cpp fips140.cpp fipstest.cpp gcm.cpp gf256.cpp gf2_32.cpp gf2n.cpp gfpcrypt.cpp gost.cpp gzip.cpp hex.cpp hmac.cpp hrtimer.cpp ida.cpp idea.cpp iterhash.cpp kalyna.cpp kalynatab.cpp keccak.cpp luc.cpp mars.cpp marss.cpp md2.cpp md4.cpp md5.cpp misc.cpp modes.cpp mqueue.cpp mqv.cpp nbtheory.cpp network.cpp oaep.cpp osrng.cpp panama.cpp pkcspad.cpp poly1305.cpp polynomi.cpp pssr.cpp pubkey.cpp queue.cpp rabin.cpp randpool.cpp rc2.cpp rc5.cpp rc6.cpp rdrand.cpp rdtables.cpp rijndael.cpp ripemd.cpp rng.cpp rsa.cpp rw.cpp safer.cpp salsa.cpp seal.cpp seed.cpp serpent.cpp sha.cpp sha3.cpp shacal2-simd.cpp shacal2.cpp shark.cpp sharkbox.cpp skipjack.cpp socketft.cpp sosemanuk.cpp square.cpp squaretb.cpp strciphr.cpp tea.cpp tftables.cpp threefish.cpp tiger.cpp tigertab.cpp trdlocal.cpp ttmac.cpp twofish.cpp vmac.cpp wait.cpp wake.cpp whrlpool.cpp winpipes.cpp xtr.cpp xtrcrypt.cpp zdeflate.cpp zinflate.cpp zlib.cpp -LIB_OBJS = cryptlib.obj cpu.obj integer.obj 3way.obj adler32.obj algebra.obj algparam.obj arc4.obj aria.obj asn.obj authenc.obj base32.obj base64.obj basecode.obj bfinit.obj blake2.obj blowfish.obj blumshub.obj camellia.obj cast.obj casts.obj cbcmac.obj ccm.obj chacha.obj channels.obj cmac.obj crc.obj default.obj des.obj dessp.obj dh.obj dh2.obj dll.obj dsa.obj eax.obj ec2n.obj eccrypto.obj ecp.obj elgamal.obj emsa2.obj eprecomp.obj esign.obj files.obj filters.obj fips140.obj fipstest.obj gcm.obj gf256.obj gf2_32.obj gf2n.obj gfpcrypt.obj gost.obj gzip.obj hex.obj hmac.obj hrtimer.obj ida.obj idea.obj iterhash.obj kalyna.obj kalynatab.obj keccak.obj luc.obj mars.obj marss.obj md2.obj md4.obj md5.obj misc.obj modes.obj mqueue.obj mqv.obj nbtheory.obj network.obj oaep.obj osrng.obj 
panama.obj pkcspad.obj poly1305.obj polynomi.obj pssr.obj pubkey.obj queue.obj rabin.obj randpool.obj rc2.obj rc5.obj rc6.obj rdrand.obj rdtables.obj rijndael.obj ripemd.obj rng.obj rsa.obj rw.obj safer.obj salsa.obj seal.obj seed.obj serpent.obj sha.obj sha3.obj shacal2.obj shark.obj sharkbox.obj skipjack.obj socketft.obj sosemanuk.obj square.obj squaretb.obj strciphr.obj tea.obj tftables.obj threefish.obj tiger.obj tigertab.obj trdlocal.obj ttmac.obj twofish.obj vmac.obj wait.obj wake.obj whrlpool.obj winpipes.obj xtr.obj xtrcrypt.obj zdeflate.obj zinflate.obj zlib.obj +LIB_OBJS = cryptlib.obj cpu.obj integer.obj 3way.obj adler32.obj algebra.obj algparam.obj arc4.obj aria.obj asn.obj authenc.obj base32.obj base64.obj basecode.obj bfinit.obj blake2.obj blowfish.obj blumshub.obj camellia.obj cast.obj casts.obj cbcmac.obj ccm.obj chacha.obj channels.obj cmac.obj crc.obj default.obj des.obj dessp.obj dh.obj dh2.obj dll.obj dsa.obj eax.obj ec2n.obj eccrypto.obj ecp.obj elgamal.obj emsa2.obj eprecomp.obj esign.obj files.obj filters.obj fips140.obj fipstest.obj gcm.obj gf256.obj gf2_32.obj gf2n.obj gfpcrypt.obj gost.obj gzip.obj hex.obj hmac.obj hrtimer.obj ida.obj idea.obj iterhash.obj kalyna.obj kalynatab.obj keccak.obj luc.obj mars.obj marss.obj md2.obj md4.obj md5.obj misc.obj modes.obj mqueue.obj mqv.obj nbtheory.obj network.obj oaep.obj osrng.obj panama.obj pkcspad.obj poly1305.obj polynomi.obj pssr.obj pubkey.obj queue.obj rabin.obj randpool.obj rc2.obj rc5.obj rc6.obj rdrand.obj rdtables.obj rijndael.obj ripemd.obj rng.obj rsa.obj rw.obj safer.obj salsa.obj seal.obj seed.obj serpent.obj sha.obj sha3.obj shacal2-simd.obj shacal2.obj shark.obj sharkbox.obj skipjack.obj socketft.obj sosemanuk.obj square.obj squaretb.obj strciphr.obj tea.obj tftables.obj threefish.obj tiger.obj tigertab.obj trdlocal.obj ttmac.obj twofish.obj vmac.obj wait.obj wake.obj whrlpool.obj winpipes.obj xtr.obj xtrcrypt.obj zdeflate.obj zinflate.obj zlib.obj TEST_SRCS = bench1.cpp bench2.cpp test.cpp validat0.cpp validat1.cpp validat2.cpp validat3.cpp datatest.cpp regtest1.cpp regtest2.cpp regtest3.cpp fipsalgt.cpp dlltest.cpp fipstest.cpp @@ -128,6 +127,8 @@ LDLIBS = $(LDLIBS) ws2_32.lib kernel32.lib !IF "$(PLATFORM)" == "ARM" || "$(PLATFORM)" == "arm" || "$(PLATFORM)" == "ARM64" || "$(PLATFORM)" == "arm64" # CXXFLAGS = $(CXXFLAGS) /D_ARM_WINAPI_PARTITION_DESKTOP_SDK_AVAILABLE=1 /DWINAPI_FAMILY=WINAPI_FAMILY_DESKTOP_APP CXXFLAGS = $(CXXFLAGS) /DWINAPI_FAMILY=WINAPI_FAMILY_PHONE_APP +LIB_SRCS = $(LIB_SRCS) neon.cpp +LIB_OBJS = $(LIB_OBJS) neon.obj # CXXFLAGS = $(CXXFLAGS) /DWINAPI_FAMILY=WINAPI_FAMILY_APP # LDLIBS = $(LDLIBS) ws2_32.lib !ENDIF diff --git a/cryptest.sh b/cryptest.sh index 28f79be5e..f441c963e 100755 --- a/cryptest.sh +++ b/cryptest.sh @@ -1171,37 +1171,75 @@ if [[ ("$HAVE_DISASS" -ne "0" && ("$IS_X86" -ne "0" || "$IS_X64" -ne "0")) ]]; t echo OBJFILE=sha.o; rm -f "$OBJFILE" 2>/dev/null - CXX="$CXX" CXXFLAGS="$RELEASE_CXXFLAGS -DDISABLE_NATIVE_ARCH=1 -msse -msse2" "$MAKE" "${MAKEARGS[@]}" $OBJFILE 2>&1 | tee -a "$TEST_RESULTS" + CXX="$CXX" CXXFLAGS="$RELEASE_CXXFLAGS -DDISABLE_NATIVE_ARCH=1" "$MAKE" "${MAKEARGS[@]}" $OBJFILE 2>&1 | tee -a "$TEST_RESULTS" DISASS_TEXT=$("$DISASS" "${DISASSARGS[@]}" "$OBJFILE" 2>/dev/null) X86_SSE2=$(echo -n "$X86_CPU_FLAGS" | "$GREP" -i -c sse2) - X86_SHA256_HASH_BLOCKS=$(echo -n "$DISASS_TEXT" | "$EGREP" -c 'X86_SHA256_HashBlocks') + X86_SHA256_HASH_BLOCKS=$(echo -n "$DISASS_TEXT" | "$EGREP" -c 'SHA256_HashMultipleBlocks_SSE2') if [[ ("$X86_SHA256_HASH_BLOCKS" -ne "0") 
]]; then COUNT=$(echo -n "$DISASS_TEXT" | "$EGREP" -i -c '(rol.*0x|ror.*0x)') - if [[ ("$COUNT" -le "600") ]]; then + if [[ ("$COUNT" -le "250") ]]; then FAILED=1 - echo "ERROR: failed to generate rotate immediate instruction (X86_SHA256_HashBlocks)" | tee -a "$TEST_RESULTS" + echo "ERROR: failed to generate rotate immediate instruction (SHA256_HashMultipleBlocks_SSE2)" | tee -a "$TEST_RESULTS" fi else COUNT=$(echo -n "$DISASS_TEXT" | "$EGREP" -i -c '(rol.*0x|ror.*0x)') - if [[ ("$COUNT" -le "1000") ]]; then + if [[ ("$COUNT" -le "500") ]]; then FAILED=1 echo "ERROR: failed to generate rotate immediate instruction" | tee -a "$TEST_RESULTS" fi fi if [[ ("$X86_SSE2" -ne "0" && "$X86_SHA256_HASH_BLOCKS" -eq "0") ]]; then - echo "ERROR: failed to use X86_SHA256_HashBlocks" | tee -a "$TEST_RESULTS" + echo "ERROR: failed to use SHA256_HashMultipleBlocks_SSE2" | tee -a "$TEST_RESULTS" fi if [[ ("$FAILED" -eq "0" && "$X86_SHA256_HASH_BLOCKS" -ne "0") ]]; then - echo "Verified rotate immediate machine instructions (X86_SHA256_HashBlocks)" | tee -a "$TEST_RESULTS" + echo "Verified rotate immediate machine instructions (SHA256_HashMultipleBlocks_SSE2)" | tee -a "$TEST_RESULTS" elif [[ ("$FAILED" -eq "0") ]]; then echo "Verified rotate immediate machine instructions" | tee -a "$TEST_RESULTS" fi fi + ############################################ + # Test CRC-32C code generation + + "$CXX" -DCRYPTOPP_ADHOC_MAIN -msse4.2 adhoc.cpp -o "$TMPDIR/adhoc.exe" > /dev/null 2>&1 + if [[ "$?" -eq "0" ]]; then + X86_CRC32=1 + fi + + if [[ ("$X86_CRC32" -ne "0") ]]; then + echo + echo "************************************" | tee -a "$TEST_RESULTS" + echo "Testing: X86 CRC32 code generation" | tee -a "$TEST_RESULTS" + echo + + OBJFILE=crc-simd.o; rm -f "$OBJFILE" 2>/dev/null + CXX="$CXX" CXXFLAGS="$RELEASE_CXXFLAGS -DDISABLE_NATIVE_ARCH=1" "$MAKE" "${MAKEARGS[@]}" $OBJFILE 2>&1 | tee -a "$TEST_RESULTS" + + COUNT=0 + FAILED=0 + DISASS_TEXT=$("$DISASS" "${DISASSARGS[@]}" "$OBJFILE" 2>/dev/null) + + COUNT=$(echo -n "$DISASS_TEXT" | "$GREP" -i -c crc32b) + if [[ ("$COUNT" -eq "0") ]]; then + FAILED=1 + echo "ERROR: failed to generate crc32b instruction" | tee -a "$TEST_RESULTS" + fi + + COUNT=$(echo -n "$DISASS_TEXT" | "$GREP" -i -c crc32l) + if [[ ("$COUNT" -eq "0") ]]; then + FAILED=1 + echo "ERROR: failed to generate crc32l instruction" | tee -a "$TEST_RESULTS" + fi + + if [[ ("$FAILED" -eq "0") ]]; then + echo "Verified crc32b and crc32l machine instructions" | tee -a "$TEST_RESULTS" + fi + fi + ############################################ # Test AES-NI code generation @@ -1216,8 +1254,8 @@ if [[ ("$HAVE_DISASS" -ne "0" && ("$IS_X86" -ne "0" || "$IS_X64" -ne "0")) ]]; t echo "Testing: X86 AES-NI code generation" | tee -a "$TEST_RESULTS" echo - OBJFILE=rijndael.o; rm -f "$OBJFILE" 2>/dev/null - CXX="$CXX" CXXFLAGS="$RELEASE_CXXFLAGS -DDISABLE_NATIVE_ARCH=1 -msse -msse2" "$MAKE" "${MAKEARGS[@]}" $OBJFILE 2>&1 | tee -a "$TEST_RESULTS" + OBJFILE=rijndael-simd.o; rm -f "$OBJFILE" 2>/dev/null + CXX="$CXX" CXXFLAGS="$RELEASE_CXXFLAGS -DDISABLE_NATIVE_ARCH=1" "$MAKE" "${MAKEARGS[@]}" $OBJFILE 2>&1 | tee -a "$TEST_RESULTS" COUNT=0 FAILED=0 @@ -1278,8 +1316,8 @@ if [[ ("$HAVE_DISASS" -ne "0" && ("$IS_X86" -ne "0" || "$IS_X64" -ne "0")) ]]; t echo "Testing: X86 carryless multiply code generation" | tee -a "$TEST_RESULTS" echo - OBJFILE=gcm.o; rm -f "$OBJFILE" 2>/dev/null - CXX="$CXX" CXXFLAGS="$RELEASE_CXXFLAGS -DDISABLE_NATIVE_ARCH=1 -msse -msse2" "$MAKE" "${MAKEARGS[@]}" $OBJFILE 2>&1 | tee -a "$TEST_RESULTS" + 
OBJFILE=gcm-simd.o; rm -f "$OBJFILE" 2>/dev/null + CXX="$CXX" CXXFLAGS="$RELEASE_CXXFLAGS -DDISABLE_NATIVE_ARCH=1" "$MAKE" "${MAKEARGS[@]}" $OBJFILE 2>&1 | tee -a "$TEST_RESULTS" COUNT=0 FAILED=0 @@ -1321,7 +1359,7 @@ if [[ ("$HAVE_DISASS" -ne "0" && ("$IS_X86" -ne "0" || "$IS_X64" -ne "0")) ]]; t echo OBJFILE=rdrand.o; rm -f "$OBJFILE" 2>/dev/null - CXX="$CXX" CXXFLAGS="$RELEASE_CXXFLAGS -DDISABLE_NATIVE_ARCH=1 -msse -msse2" "$MAKE" "${MAKEARGS[@]}" $OBJFILE 2>&1 | tee -a "$TEST_RESULTS" + CXX="$CXX" CXXFLAGS="$RELEASE_CXXFLAGS -DDISABLE_NATIVE_ARCH=1" "$MAKE" "${MAKEARGS[@]}" $OBJFILE 2>&1 | tee -a "$TEST_RESULTS" COUNT=0 FAILED=0 @@ -1348,44 +1386,6 @@ if [[ ("$HAVE_DISASS" -ne "0" && ("$IS_X86" -ne "0" || "$IS_X64" -ne "0")) ]]; t fi fi - ############################################ - # X86 CRC32 code generation - - "$CXX" -DCRYPTOPP_ADHOC_MAIN -msse4.2 adhoc.cpp -o "$TMPDIR/adhoc.exe" > /dev/null 2>&1 - if [[ "$?" -eq "0" ]]; then - X86_CRC32=1 - fi - - if [[ ("$X86_CRC32" -ne "0") ]]; then - echo - echo "************************************" | tee -a "$TEST_RESULTS" - echo "Testing: X86 CRC32 code generation" | tee -a "$TEST_RESULTS" - echo - - OBJFILE=crc.o; rm -f "$OBJFILE" 2>/dev/null - CXX="$CXX" CXXFLAGS="$RELEASE_CXXFLAGS -DDISABLE_NATIVE_ARCH=1 -msse -msse2" "$MAKE" "${MAKEARGS[@]}" $OBJFILE 2>&1 | tee -a "$TEST_RESULTS" - - COUNT=0 - FAILED=0 - DISASS_TEXT=$("$DISASS" "${DISASSARGS[@]}" "$OBJFILE" 2>/dev/null) - - COUNT=$(echo -n "$DISASS_TEXT" | "$GREP" -i -c crc32l) - if [[ ("$COUNT" -eq "0") ]]; then - FAILED=1 - echo "ERROR: failed to generate crc32l instruction" | tee -a "$TEST_RESULTS" - fi - - COUNT=$(echo -n "$DISASS_TEXT" | "$GREP" -i -c crc32b) - if [[ ("$COUNT" -eq "0") ]]; then - FAILED=1 - echo "ERROR: failed to generate crc32b instruction" | tee -a "$TEST_RESULTS" - fi - - if [[ ("$FAILED" -eq "0") ]]; then - echo "Verified crc32l and crc32b machine instructions" | tee -a "$TEST_RESULTS" - fi - fi - ############################################ # X86 SHA code generation @@ -1400,8 +1400,8 @@ if [[ ("$HAVE_DISASS" -ne "0" && ("$IS_X86" -ne "0" || "$IS_X64" -ne "0")) ]]; t echo "Testing: X86 SHA code generation" | tee -a "$TEST_RESULTS" echo - OBJFILE=sha.o; rm -f "$OBJFILE" 2>/dev/null - CXX="$CXX" CXXFLAGS="$RELEASE_CXXFLAGS -DDISABLE_NATIVE_ARCH=1 -msse -msse2" "$MAKE" "${MAKEARGS[@]}" $OBJFILE 2>&1 | tee -a "$TEST_RESULTS" + OBJFILE=sha-simd.o; rm -f "$OBJFILE" 2>/dev/null + CXX="$CXX" CXXFLAGS="$RELEASE_CXXFLAGS -DDISABLE_NATIVE_ARCH=1" "$MAKE" "${MAKEARGS[@]}" $OBJFILE 2>&1 | tee -a "$TEST_RESULTS" COUNT=0 FAILED=0 @@ -1469,7 +1469,7 @@ if [[ ("$HAVE_DISASS" -ne "0" && ("$IS_ARM32" -ne "0" || "$IS_ARM64" -ne "0")) ] echo "Testing: ARM NEON code generation" | tee -a "$TEST_RESULTS" echo - OBJFILE=aria.o; rm -f "$OBJFILE" 2>/dev/null + OBJFILE=aria-simd.o; rm -f "$OBJFILE" 2>/dev/null CXX="$CXX" CXXFLAGS="$RELEASE_CXXFLAGS -DDISABLE_NATIVE_ARCH=1" "$MAKE" "${MAKEARGS[@]}" $OBJFILE 2>&1 | tee -a "$TEST_RESULTS" COUNT=0 @@ -1516,17 +1516,71 @@ if [[ ("$HAVE_DISASS" -ne "0" && ("$IS_ARM32" -ne "0" || "$IS_ARM64" -ne "0")) ] fi fi + ############################################ + # ARM CRC32 code generation + + "$CXX" -DCRYPTOPP_ADHOC_MAIN -march=armv8-a+crc adhoc.cpp -o "$TMPDIR/adhoc.exe" > /dev/null 2>&1 + if [[ "$?" 
-eq "0" ]]; then + ARM_CRC32=1 + fi + + if [[ ("$HAVE_ARMV8A" -ne "0" && "$ARM_CRC32" -ne "0") ]]; then + echo + echo "************************************" | tee -a "$TEST_RESULTS" + echo "Testing: ARM CRC32 code generation" | tee -a "$TEST_RESULTS" + echo + + OBJFILE=crc-simd.o; rm -f "$OBJFILE" 2>/dev/null + CXX="$CXX" CXXFLAGS="$RELEASE_CXXFLAGS -DDISABLE_NATIVE_ARCH=1" "$MAKE" "${MAKEARGS[@]}" $OBJFILE 2>&1 | tee -a "$TEST_RESULTS" + + COUNT=0 + FAILED=0 + DISASS_TEXT=$("$DISASS" "${DISASSARGS[@]}" "$OBJFILE" 2>/dev/null) + + COUNT=$(echo -n "$DISASS_TEXT" | "$GREP" -i -c crc32cb) + if [[ ("$COUNT" -eq "0") ]]; then + FAILED=1 + echo "ERROR: failed to generate crc32cb instruction" | tee -a "$TEST_RESULTS" + fi + + COUNT=$(echo -n "$DISASS_TEXT" | "$GREP" -i -c crc32cw) + if [[ ("$COUNT" -eq "0") ]]; then + FAILED=1 + echo "ERROR: failed to generate crc32cw instruction" | tee -a "$TEST_RESULTS" + fi + + COUNT=$(echo -n "$DISASS_TEXT" | "$GREP" -i -c crc32b) + if [[ ("$COUNT" -eq "0") ]]; then + FAILED=1 + echo "ERROR: failed to generate crc32b instruction" | tee -a "$TEST_RESULTS" + fi + + COUNT=$(echo -n "$DISASS_TEXT" | "$GREP" -i -c crc32w) + if [[ ("$COUNT" -eq "0") ]]; then + FAILED=1 + echo "ERROR: failed to generate crc32w instruction" | tee -a "$TEST_RESULTS" + fi + + if [[ ("$FAILED" -eq "0") ]]; then + echo "Verified crc32cb, crc32cw, crc32b and crc32w machine instructions" | tee -a "$TEST_RESULTS" + fi + fi + ############################################ # ARM carryless multiply code generation - ARM_PMULL=$(echo -n "$ARM_CPU_FLAGS" | "$GREP" -i -c pmull) - if [[ ("$ARM_PMULL" -ne "0" || "$HAVE_ARM_CRYPTO" -ne "0") ]]; then + "$CXX" -DCRYPTOPP_ADHOC_MAIN -march=armv8-a+crypto adhoc.cpp -o "$TMPDIR/adhoc.exe" > /dev/null 2>&1 + if [[ "$?" -eq "0" ]]; then + ARM_PMULL=1 + fi + + if [[ ("$HAVE_ARMV8A" -ne "0" && "$ARM_PMULL" -ne "0") ]]; then echo echo "************************************" | tee -a "$TEST_RESULTS" echo "Testing: ARM carryless multiply code generation" | tee -a "$TEST_RESULTS" echo - OBJFILE=gcm.o; rm -f "$OBJFILE" 2>/dev/null + OBJFILE=gcm-simd.o; rm -f "$OBJFILE" 2>/dev/null CXX="$CXX" CXXFLAGS="$RELEASE_CXXFLAGS -DDISABLE_NATIVE_ARCH=1" "$MAKE" "${MAKEARGS[@]}" $OBJFILE 2>&1 | tee -a "$TEST_RESULTS" COUNT=0 @@ -1549,50 +1603,139 @@ if [[ ("$HAVE_DISASS" -ne "0" && ("$IS_ARM32" -ne "0" || "$IS_ARM64" -ne "0")) ] echo "Verified pmull and pmull2 machine instructions" | tee -a "$TEST_RESULTS" fi fi + ############################################ + # ARM SHA code generation + + "$CXX" -DCRYPTOPP_ADHOC_MAIN -march=armv8-a+crypto adhoc.cpp -o "$TMPDIR/adhoc.exe" > /dev/null 2>&1 + if [[ "$?" 
-eq "0" ]]; then + ARM_AES=1 + fi + + if [[ ("$HAVE_ARMV8A" -ne "0" && "$ARM_AES" -ne "0") ]]; then + echo + echo "************************************" | tee -a "$TEST_RESULTS" + echo "Testing: ARM AES generation" | tee -a "$TEST_RESULTS" + echo + + OBJFILE=rijndael-simd.o; rm -f "$OBJFILE" 2>/dev/null + CXX="$CXX" CXXFLAGS="$RELEASE_CXXFLAGS -DDISABLE_NATIVE_ARCH=1" "$MAKE" "${MAKEARGS[@]}" $OBJFILE 2>&1 | tee -a "$TEST_RESULTS" + + COUNT=0 + FAILED=0 + DISASS_TEXT=$("$DISASS" "${DISASSARGS[@]}" "$OBJFILE" 2>/dev/null) + + COUNT=$(echo -n "$DISASS_TEXT" | "$GREP" -i -c aese) + if [[ ("$COUNT" -eq "0") ]]; then + FAILED=1 + echo "ERROR: failed to generate aese instruction" | tee -a "$TEST_RESULTS" + fi + + COUNT=$(echo -n "$DISASS_TEXT" | "$GREP" -i -c aesmc) + if [[ ("$COUNT" -eq "0") ]]; then + FAILED=1 + echo "ERROR: failed to generate aesmc instruction" | tee -a "$TEST_RESULTS" + fi + + COUNT=$(echo -n "$DISASS_TEXT" | "$GREP" -i -c aesd) + if [[ ("$COUNT" -eq "0") ]]; then + FAILED=1 + echo "ERROR: failed to generate aesd instruction" | tee -a "$TEST_RESULTS" + fi + + COUNT=$(echo -n "$DISASS_TEXT" | "$GREP" -i -c aesimc) + if [[ ("$COUNT" -eq "0") ]]; then + FAILED=1 + echo "ERROR: failed to generate aesimc instruction" | tee -a "$TEST_RESULTS" + fi + + if [[ ("$FAILED" -eq "0") ]]; then + echo "Verified aese, aesd, aesmc, aesimc machine instructions" | tee -a "$TEST_RESULTS" + fi + fi ############################################ - # ARM CRC32 code generation + # ARM SHA code generation - ARM_CRC32=$(echo -n "$ARM_CPU_FLAGS" | "$GREP" -i -c crc32) - if [[ ("$ARM_CRC32" -ne "0") ]]; then + "$CXX" -DCRYPTOPP_ADHOC_MAIN -march=armv8-a+crypto adhoc.cpp -o "$TMPDIR/adhoc.exe" > /dev/null 2>&1 + if [[ "$?" -eq "0" ]]; then + ARM_SHA=1 + fi + + if [[ ("$HAVE_ARMV8A" -ne "0" && "$ARM_SHA" -ne "0") ]]; then echo echo "************************************" | tee -a "$TEST_RESULTS" - echo "Testing: ARM CRC32 code generation" | tee -a "$TEST_RESULTS" + echo "Testing: ARM SHA generation" | tee -a "$TEST_RESULTS" echo - OBJFILE=crc.o; rm -f "$OBJFILE" 2>/dev/null + OBJFILE=sha-simd.o; rm -f "$OBJFILE" 2>/dev/null CXX="$CXX" CXXFLAGS="$RELEASE_CXXFLAGS -DDISABLE_NATIVE_ARCH=1" "$MAKE" "${MAKEARGS[@]}" $OBJFILE 2>&1 | tee -a "$TEST_RESULTS" COUNT=0 FAILED=0 DISASS_TEXT=$("$DISASS" "${DISASSARGS[@]}" "$OBJFILE" 2>/dev/null) - COUNT=$(echo -n "$DISASS_TEXT" | "$GREP" -i -c crc32cb) + COUNT=$(echo -n "$DISASS_TEXT" | "$GREP" -i -c sha1c) if [[ ("$COUNT" -eq "0") ]]; then FAILED=1 - echo "ERROR: failed to generate crc32cb instruction" | tee -a "$TEST_RESULTS" + echo "ERROR: failed to generate sha1c instruction" | tee -a "$TEST_RESULTS" fi - COUNT=$(echo -n "$DISASS_TEXT" | "$GREP" -i -c crc32cw) + COUNT=$(echo -n "$DISASS_TEXT" | "$GREP" -i -c sha1m) if [[ ("$COUNT" -eq "0") ]]; then FAILED=1 - echo "ERROR: failed to generate crc32cw instruction" | tee -a "$TEST_RESULTS" + echo "ERROR: failed to generate sha1m instruction" | tee -a "$TEST_RESULTS" fi - COUNT=$(echo -n "$DISASS_TEXT" | "$GREP" -i -c crc32b) + COUNT=$(echo -n "$DISASS_TEXT" | "$GREP" -i -c sha1p) if [[ ("$COUNT" -eq "0") ]]; then FAILED=1 - echo "ERROR: failed to generate crc32b instruction" | tee -a "$TEST_RESULTS" + echo "ERROR: failed to generate sha1p instruction" | tee -a "$TEST_RESULTS" fi - COUNT=$(echo -n "$DISASS_TEXT" | "$GREP" -i -c crc32w) + COUNT=$(echo -n "$DISASS_TEXT" | "$GREP" -i -c sha1h) if [[ ("$COUNT" -eq "0") ]]; then FAILED=1 - echo "ERROR: failed to generate crc32w instruction" | tee -a "$TEST_RESULTS" + echo 
"ERROR: failed to generate sha1h instruction" | tee -a "$TEST_RESULTS" + fi + + COUNT=$(echo -n "$DISASS_TEXT" | "$GREP" -i -c sha1su0) + if [[ ("$COUNT" -eq "0") ]]; then + FAILED=1 + echo "ERROR: failed to generate sha1su0 instruction" | tee -a "$TEST_RESULTS" + fi + + COUNT=$(echo -n "$DISASS_TEXT" | "$GREP" -i -c sha1su1) + if [[ ("$COUNT" -eq "0") ]]; then + FAILED=1 + echo "ERROR: failed to generate sha1su1 instruction" | tee -a "$TEST_RESULTS" + fi + + COUNT=$(echo -n "$DISASS_TEXT" | "$GREP" -v sha256h2 | "$GREP" -i -c sha256h) + if [[ ("$COUNT" -eq "0") ]]; then + FAILED=1 + echo "ERROR: failed to generate sha256h instruction" | tee -a "$TEST_RESULTS" + fi + + COUNT=$(echo -n "$DISASS_TEXT" | "$GREP" -i -c sha256h2) + if [[ ("$COUNT" -eq "0") ]]; then + FAILED=1 + echo "ERROR: failed to generate sha256h2 instruction" | tee -a "$TEST_RESULTS" + fi + + COUNT=$(echo -n "$DISASS_TEXT" | "$GREP" -i -c sha256su0) + if [[ ("$COUNT" -eq "0") ]]; then + FAILED=1 + echo "ERROR: failed to generate sha256su0 instruction" | tee -a "$TEST_RESULTS" + fi + + COUNT=$(echo -n "$DISASS_TEXT" | "$GREP" -i -c sha256su1) + if [[ ("$COUNT" -eq "0") ]]; then + FAILED=1 + echo "ERROR: failed to generate sha256su1 instruction" | tee -a "$TEST_RESULTS" fi if [[ ("$FAILED" -eq "0") ]]; then - echo "Verified crc32cb, crc32cw, crc32b and crc32w machine instructions" | tee -a "$TEST_RESULTS" + echo "Verified sha1c, sha1m, sha1p, sha1su0, sha1su1, sha256h, sha256h2, sha256su0, sha256su1 machine instructions" | tee -a "$TEST_RESULTS" fi fi fi diff --git a/cryptest.vcxproj b/cryptest.vcxproj index 5c121432d..45fc00729 100644 --- a/cryptest.vcxproj +++ b/cryptest.vcxproj @@ -1,5 +1,8 @@ + + + Debug @@ -34,6 +37,9 @@ x64 + + + {09cdac08-e6ae-48a9-8de7-0fbc779eebde} cryptest diff --git a/cryptlib.vcxproj b/cryptlib.vcxproj index acea306ba..aa4ff2d3a 100644 --- a/cryptlib.vcxproj +++ b/cryptlib.vcxproj @@ -1,5 +1,8 @@ + + + Debug @@ -34,6 +37,9 @@ x64 + + + {c39f4b46-6e89-4074-902e-ca57073044d2} cryptlib @@ -166,6 +172,8 @@ + + @@ -173,6 +181,7 @@ + @@ -184,6 +193,7 @@ + @@ -210,6 +220,7 @@ + @@ -259,6 +270,7 @@ + @@ -269,8 +281,10 @@ + + diff --git a/cryptlib.vcxproj.filters b/cryptlib.vcxproj.filters index c826e4910..a20ea0111 100644 --- a/cryptlib.vcxproj.filters +++ b/cryptlib.vcxproj.filters @@ -32,6 +32,12 @@ Source Files + + Source Files + + + Source Files + Source Files @@ -53,6 +59,9 @@ Source Files + + Source Files + Source Files @@ -89,6 +98,9 @@ Source Files + + Source Files + Source Files @@ -152,6 +164,9 @@ Source Files + + Source Files + Source Files @@ -290,6 +305,9 @@ Source Files + + Source Files + Source Files @@ -320,12 +338,18 @@ Source Files + + Source Files + Source Files Source Files + + Source Files + Source Files diff --git a/dlltest.vcxproj b/dlltest.vcxproj index ed0c8be77..b76d184af 100644 --- a/dlltest.vcxproj +++ b/dlltest.vcxproj @@ -1,5 +1,8 @@ + + + Debug @@ -18,6 +21,9 @@ x64 + + + {1974a53a-9863-41c9-886d-b2b8c2fc3c8b} dlltest diff --git a/dlltest.vcxproj.filters b/dlltest.vcxproj.filters new file mode 100644 index 000000000..047dde276 --- /dev/null +++ b/dlltest.vcxproj.filters @@ -0,0 +1,14 @@ + + + + + {d7fe0401-fa2d-40cd-80b9-b91f937996a3} + .cpp + + + + + Source Files + + + \ No newline at end of file diff --git a/gcm-simd.cpp b/gcm-simd.cpp new file mode 100644 index 000000000..d80fd5b6c --- /dev/null +++ b/gcm-simd.cpp @@ -0,0 +1,610 @@ +// gcm-simd.cpp - written and placed in the public domain by +// Jeffrey Walton, Uri Blumenthal and Marcel Raad. 
+// +// This source file uses intrinsics to gain access to SSE4.2 and +// ARMv8a CRC-32 and CRC-32C instructions. A separate source file +// is needed because additional CXXFLAGS are required to enable +// the appropriate instructions sets in some build configurations. + +#include "pch.h" +#include "config.h" +#include "misc.h" + +// Clang 3.3 integrated assembler crash on Linux. Other versions produce incorrect results. +// Clang has never handled Intel ASM very well. I wish LLVM would fix it. +#if defined(__clang__) +# undef CRYPTOPP_X86_ASM_AVAILABLE +# undef CRYPTOPP_X32_ASM_AVAILABLE +# undef CRYPTOPP_X64_ASM_AVAILABLE +# undef CRYPTOPP_BOOL_SSE2_ASM_AVAILABLE +#endif + +// Clang and GCC hoops... +#if !(defined(__ARM_FEATURE_CRYPTO) || defined(_MSC_VER)) +# undef CRYPTOPP_ARM_PMULL_AVAILABLE +#endif + +#if (CRYPTOPP_CLMUL_AVAILABLE) +# include "tmmintrin.h" +# include "wmmintrin.h" +#endif + +#if (CRYPTOPP_ARM_NEON_AVAILABLE) +# include "arm_neon.h" +#endif + +// Don't include when using Apple Clang. Early Apple compilers +// fail to compile with included. Later Apple compilers compile +// intrinsics without included. +#if (CRYPTOPP_ARM_PMULL_AVAILABLE) && !defined(CRYPTOPP_APPLE_CLANG_VERSION) +# include "arm_acle.h" +#endif + +#ifdef CRYPTOPP_GNU_STYLE_INLINE_ASSEMBLY +# include +# include +#endif + +#ifndef EXCEPTION_EXECUTE_HANDLER +# define EXCEPTION_EXECUTE_HANDLER 1 +#endif + +// Clang __m128i casts +#define M128_CAST(x) ((__m128i *)(void *)(x)) +#define CONST_M128_CAST(x) ((const __m128i *)(const void *)(x)) + +ANONYMOUS_NAMESPACE_BEGIN + +// GCC 4.8 is missing PMULL gear +#if (CRYPTOPP_ARM_PMULL_AVAILABLE) +# if (CRYPTOPP_GCC_VERSION >= 40800) && (CRYPTOPP_GCC_VERSION < 49000) +inline poly128_t VMULL_P64(poly64_t a, poly64_t b) +{ + return __builtin_aarch64_crypto_pmulldi_ppp (a, b); +} + +inline poly128_t VMULL_HIGH_P64(poly64x2_t a, poly64x2_t b) +{ + return __builtin_aarch64_crypto_pmullv2di_ppp (a, b); +} +# else +inline poly128_t VMULL_P64(poly64_t a, poly64_t b) +{ + return vmull_p64(a, b); +} + +inline poly128_t VMULL_HIGH_P64(poly64x2_t a, poly64x2_t b) +{ + return vmull_high_p64(a, b); +} +# endif +#endif + +#if (CRYPTOPP_BOOL_ARM32 || CRYPTOPP_BOOL_ARM64) && CRYPTOPP_ARM_PMULL_AVAILABLE +#if defined(__GNUC__) +// Schneiders, Hovsmith and O'Rourke used this trick. +// It results in much better code generation in production code +// by avoiding D-register spills when using vgetq_lane_u64. The +// problem does not surface under minimal test cases. 
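// For comparison, a sketch of the plain intrinsic form that the inline
// assembly below stands in for (essentially what the _MSC_VER branch further
// down uses); the asm form avoids the D-register spills that vgetq_lane_u64
// can trigger in production builds:
//
//   inline uint64x2_t PMULL_00_intrinsics(const uint64x2_t a, const uint64x2_t b)
//   {
//       return (uint64x2_t)vmull_p64((poly64_t)vgetq_lane_u64(a, 0),
//                                    (poly64_t)vgetq_lane_u64(b, 0));
//   }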
+inline uint64x2_t PMULL_00(const uint64x2_t a, const uint64x2_t b) +{ + uint64x2_t r; + __asm __volatile("pmull %0.1q, %1.1d, %2.1d \n\t" + :"=w" (r) : "w" (a), "w" (b) ); + return r; +} + +inline uint64x2_t PMULL_01(const uint64x2_t a, const uint64x2_t b) +{ + uint64x2_t r; + __asm __volatile("pmull %0.1q, %1.1d, %2.1d \n\t" + :"=w" (r) : "w" (a), "w" (vget_high_u64(b)) ); + return r; +} + +inline uint64x2_t PMULL_10(const uint64x2_t a, const uint64x2_t b) +{ + uint64x2_t r; + __asm __volatile("pmull %0.1q, %1.1d, %2.1d \n\t" + :"=w" (r) : "w" (vget_high_u64(a)), "w" (b) ); + return r; +} + +inline uint64x2_t PMULL_11(const uint64x2_t a, const uint64x2_t b) +{ + uint64x2_t r; + __asm __volatile("pmull2 %0.1q, %1.2d, %2.2d \n\t" + :"=w" (r) : "w" (a), "w" (b) ); + return r; +} + +inline uint64x2_t VEXT_U8(uint64x2_t a, uint64x2_t b, unsigned int c) +{ + uint64x2_t r; + __asm __volatile("ext %0.16b, %1.16b, %2.16b, %3 \n\t" + :"=w" (r) : "w" (a), "w" (b), "I" (c) ); + return r; +} + +// https://github.com/weidai11/cryptopp/issues/366 +template +inline uint64x2_t VEXT_U8(uint64x2_t a, uint64x2_t b) +{ + uint64x2_t r; + __asm __volatile("ext %0.16b, %1.16b, %2.16b, %3 \n\t" + :"=w" (r) : "w" (a), "w" (b), "I" (C) ); + return r; +} +#endif // GCC and compatibles + +#if defined(_MSC_VER) +inline uint64x2_t PMULL_00(const uint64x2_t a, const uint64x2_t b) +{ + return (uint64x2_t)(vmull_p64(vgetq_lane_u64(vreinterpretq_u64_u8(a),0), + vgetq_lane_u64(vreinterpretq_u64_u8(b),0))); +} + +inline uint64x2_t PMULL_01(const uint64x2_t a, const uint64x2_t b) +{ + return (uint64x2_t)(vmull_p64(vgetq_lane_u64(vreinterpretq_u64_u8(a),0), + vgetq_lane_u64(vreinterpretq_u64_u8(b),1))); +} + +inline uint64x2_t PMULL_10(const uint64x2_t a, const uint64x2_t b) +{ + return (uint64x2_t)(vmull_p64(vgetq_lane_u64(vreinterpretq_u64_u8(a),1), + vgetq_lane_u64(vreinterpretq_u64_u8(b),0))); +} + +inline uint64x2_t PMULL_11(const uint64x2_t a, const uint64x2_t b) +{ + return (uint64x2_t)(vmull_p64(vgetq_lane_u64(vreinterpretq_u64_u8(a),1), + vgetq_lane_u64(vreinterpretq_u64_u8(b),1))); +} + +inline uint64x2_t VEXT_U8(uint64x2_t a, uint64x2_t b, unsigned int c) +{ + return (uint64x2_t)vextq_u8(vreinterpretq_u8_u64(a), vreinterpretq_u8_u64(b), c); +} + +// https://github.com/weidai11/cryptopp/issues/366 +template +inline uint64x2_t VEXT_U8(uint64x2_t a, uint64x2_t b) +{ + return (uint64x2_t)vextq_u8(vreinterpretq_u8_u64(a), vreinterpretq_u8_u64(b), C); +} +#endif // Microsoft and compatibles +#endif // CRYPTOPP_ARM_PMULL_AVAILABLE + +ANONYMOUS_NAMESPACE_END + +NAMESPACE_BEGIN(CryptoPP) + +#ifdef CRYPTOPP_GNU_STYLE_INLINE_ASSEMBLY +extern "C" { + typedef void (*SigHandler)(int); + + static jmp_buf s_jmpSIGILL; + static void SigIllHandler(int) + { + longjmp(s_jmpSIGILL, 1); + } +}; +#endif // Not CRYPTOPP_MS_STYLE_INLINE_ASSEMBLY + +#if (CRYPTOPP_BOOL_ARM32 || CRYPTOPP_BOOL_ARM64) +bool CPU_ProbePMULL() +{ +#if (CRYPTOPP_ARM_PMULL_AVAILABLE) +# if defined(CRYPTOPP_MS_STYLE_INLINE_ASSEMBLY) + volatile bool result = true; + __try + { + const poly64_t a1={0x9090909090909090}, b1={0xb0b0b0b0b0b0b0b0}; + const poly8x16_t a2={0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0xa0,0xa0,0xa0,0xa0,0xa0,0xa0,0xa0,0xa0}, + b2={0xc0,0xc0,0xc0,0xc0,0xc0,0xc0,0xc0,0xc0,0xe0,0xe0,0xe0,0xe0,0xe0,0xe0,0xe0,0xe0}; + + const poly128_t r1 = vmull_p64(a1, b1); + const poly128_t r2 = vmull_high_p64((poly64x2_t)(a2), (poly64x2_t)(b2)); + + // Linaro is missing vreinterpretq_u64_p128. Also see http://github.com/weidai11/cryptopp/issues/233. 
+ const uint64x2_t& t1 = (uint64x2_t)(r1); // {bignum,bignum} + const uint64x2_t& t2 = (uint64x2_t)(r2); // {bignum,bignum} + + result = !!(vgetq_lane_u64(t1,0) == 0x5300530053005300 && vgetq_lane_u64(t1,1) == 0x5300530053005300 && + vgetq_lane_u64(t2,0) == 0x6c006c006c006c00 && vgetq_lane_u64(t2,1) == 0x6c006c006c006c00); + } + __except (EXCEPTION_EXECUTE_HANDLER) + { + return false; + } + return result; +# else + + // longjmp and clobber warnings. Volatile is required. + // http://github.com/weidai11/cryptopp/issues/24 and http://stackoverflow.com/q/7721854 + volatile bool result = true; + + volatile SigHandler oldHandler = signal(SIGILL, SigIllHandler); + if (oldHandler == SIG_ERR) + return false; + + volatile sigset_t oldMask; + if (sigprocmask(0, NULLPTR, (sigset_t*)&oldMask)) + return false; + + if (setjmp(s_jmpSIGILL)) + result = false; + else + { + const poly64_t a1={0x9090909090909090}, b1={0xb0b0b0b0b0b0b0b0}; + const poly8x16_t a2={0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0xa0,0xa0,0xa0,0xa0,0xa0,0xa0,0xa0,0xa0}, + b2={0xc0,0xc0,0xc0,0xc0,0xc0,0xc0,0xc0,0xc0,0xe0,0xe0,0xe0,0xe0,0xe0,0xe0,0xe0,0xe0}; + + const poly128_t r1 = VMULL_P64(a1, b1); + const poly128_t r2 = VMULL_HIGH_P64((poly64x2_t)(a2), (poly64x2_t)(b2)); + + // Linaro is missing vreinterpretq_u64_p128. Also see http://github.com/weidai11/cryptopp/issues/233. + const uint64x2_t& t1 = (uint64x2_t)(r1); // {bignum,bignum} + const uint64x2_t& t2 = (uint64x2_t)(r2); // {bignum,bignum} + + result = !!(vgetq_lane_u64(t1,0) == 0x5300530053005300 && vgetq_lane_u64(t1,1) == 0x5300530053005300 && + vgetq_lane_u64(t2,0) == 0x6c006c006c006c00 && vgetq_lane_u64(t2,1) == 0x6c006c006c006c00); + } + + sigprocmask(SIG_SETMASK, (sigset_t*)&oldMask, NULLPTR); + signal(SIGILL, oldHandler); + return result; +# endif +#else + return false; +#endif // CRYPTOPP_ARM_SHA_AVAILABLE +} +#endif // ARM32 or ARM64 + +#if CRYPTOPP_ARM_NEON_AVAILABLE +void GCM_Xor16_NEON(byte *a, const byte *b, const byte *c) +{ + CRYPTOPP_ASSERT(IsAlignedOn(a,GetAlignmentOf())); + CRYPTOPP_ASSERT(IsAlignedOn(b,GetAlignmentOf())); + CRYPTOPP_ASSERT(IsAlignedOn(c,GetAlignmentOf())); + *(uint64x2_t*)a = veorq_u64(*(uint64x2_t*)b, *(uint64x2_t*)c); +} +#endif + +#if CRYPTOPP_ARM_PMULL_AVAILABLE + +ANONYMOUS_NAMESPACE_BEGIN + +CRYPTOPP_ALIGN_DATA(16) +const word64 s_clmulConstants64[] = { + W64LIT(0xe100000000000000), W64LIT(0xc200000000000000), // Used for ARM and x86; polynomial coefficients + W64LIT(0x08090a0b0c0d0e0f), W64LIT(0x0001020304050607), // Unused for ARM; used for x86 _mm_shuffle_epi8 + W64LIT(0x0001020304050607), W64LIT(0x08090a0b0c0d0e0f) // Unused for ARM; used for x86 _mm_shuffle_epi8 +}; + +const uint64x2_t *s_clmulConstants = (const uint64x2_t *)s_clmulConstants64; +const unsigned int s_clmulTableSizeInBlocks = 8; + +ANONYMOUS_NAMESPACE_END + +uint64x2_t GCM_Reduce_PMULL(uint64x2_t c0, uint64x2_t c1, uint64x2_t c2, const uint64x2_t &r) +{ + c1 = veorq_u64(c1, VEXT_U8<8>(vdupq_n_u64(0), c0)); + c1 = veorq_u64(c1, PMULL_01(c0, r)); + c0 = VEXT_U8<8>(c0, vdupq_n_u64(0)); + c0 = vshlq_n_u64(veorq_u64(c0, c1), 1); + c0 = PMULL_00(c0, r); + c2 = veorq_u64(c2, c0); + c2 = veorq_u64(c2, VEXT_U8<8>(c1, vdupq_n_u64(0))); + c1 = vshrq_n_u64(vcombine_u64(vget_low_u64(c1), vget_low_u64(c2)), 63); + c2 = vshlq_n_u64(c2, 1); + + return veorq_u64(c2, c1); +} + +uint64x2_t GCM_Multiply_PMULL(const uint64x2_t &x, const uint64x2_t &h, const uint64x2_t &r) +{ + const uint64x2_t c0 = PMULL_00(x, h); + const uint64x2_t c1 = veorq_u64(PMULL_10(x, h), PMULL_01(x, h)); + const 
uint64x2_t c2 = PMULL_11(x, h); + + return GCM_Reduce_PMULL(c0, c1, c2, r); +} + +void GCM_SetKeyWithoutResync_PMULL(const byte *hashKey, byte *mulTable, unsigned int tableSize) +{ + const uint64x2_t r = s_clmulConstants[0]; + const uint64x2_t t = vreinterpretq_u64_u8(vrev64q_u8(vld1q_u8(hashKey))); + const uint64x2_t h0 = vextq_u64(t, t, 1); + + uint64x2_t h = h0; + unsigned int i; + for (i=0; i(mtable); + uint64x2_t x = vreinterpretq_u64_u8(vld1q_u8(hbuffer)); + const uint64x2_t r = s_clmulConstants[0]; + + const size_t BLOCKSIZE = 16; + while (len >= BLOCKSIZE) + { + size_t s = UnsignedMin(len/BLOCKSIZE, s_clmulTableSizeInBlocks), i=0; + uint64x2_t d1, d2 = vreinterpretq_u64_u8(vrev64q_u8(vld1q_u8(data+(s-1)*BLOCKSIZE))); + uint64x2_t c0 = vdupq_n_u64(0); + uint64x2_t c1 = vdupq_n_u64(0); + uint64x2_t c2 = vdupq_n_u64(0); + + while (true) + { + const uint64x2_t h0 = vld1q_u64((const uint64_t*)(table+i)); + const uint64x2_t h1 = vld1q_u64((const uint64_t*)(table+i+1)); + const uint64x2_t h2 = veorq_u64(h0, h1); + + if (++i == s) + { + const uint64x2_t t1 = vreinterpretq_u64_u8(vrev64q_u8(vld1q_u8(data))); + d1 = veorq_u64(vextq_u64(t1, t1, 1), x); + c0 = veorq_u64(c0, PMULL_00(d1, h0)); + c2 = veorq_u64(c2, PMULL_10(d1, h1)); + d1 = veorq_u64(d1, (uint64x2_t)vcombine_u32(vget_high_u32(vreinterpretq_u32_u64(d1)), + vget_low_u32(vreinterpretq_u32_u64(d1)))); + c1 = veorq_u64(c1, PMULL_00(d1, h2)); + + break; + } + + d1 = vreinterpretq_u64_u8(vrev64q_u8(vld1q_u8(data+(s-i)*16-8))); + c0 = veorq_u64(c0, PMULL_10(d2, h0)); + c2 = veorq_u64(c2, PMULL_10(d1, h1)); + d2 = veorq_u64(d2, d1); + c1 = veorq_u64(c1, PMULL_10(d2, h2)); + + if (++i == s) + { + const uint64x2_t t2 = vreinterpretq_u64_u8(vrev64q_u8(vld1q_u8(data))); + d1 = veorq_u64(vextq_u64(t2, t2, 1), x); + c0 = veorq_u64(c0, PMULL_01(d1, h0)); + c2 = veorq_u64(c2, PMULL_11(d1, h1)); + d1 = veorq_u64(d1, (uint64x2_t)vcombine_u32(vget_high_u32(vreinterpretq_u32_u64(d1)), + vget_low_u32(vreinterpretq_u32_u64(d1)))); + c1 = veorq_u64(c1, PMULL_01(d1, h2)); + + break; + } + + const uint64x2_t t3 = vreinterpretq_u64_u8(vrev64q_u8(vld1q_u8(data+(s-i)*16-8))); + d2 = vextq_u64(t3, t3, 1); + c0 = veorq_u64(c0, PMULL_01(d1, h0)); + c2 = veorq_u64(c2, PMULL_01(d2, h1)); + d1 = veorq_u64(d1, d2); + c1 = veorq_u64(c1, PMULL_01(d1, h2)); + } + data += s*16; + len -= s*16; + + c1 = veorq_u64(veorq_u64(c1, c0), c2); + x = GCM_Reduce_PMULL(c0, c1, c2, r); + } + + vst1q_u64(reinterpret_cast(hbuffer), x); + return len; +} + +void GCM_ReverseHashBufferIfNeeded_PMULL(byte *hashBuffer) +{ + if (GetNativeByteOrder() != BIG_ENDIAN_ORDER) + { + const uint8x16_t x = vrev64q_u8(vld1q_u8(hashBuffer)); + vst1q_u8(hashBuffer, vextq_u8(x, x, 8)); + } +} +#endif + +#if CRYPTOPP_CLMUL_AVAILABLE + +ANONYMOUS_NAMESPACE_BEGIN + +CRYPTOPP_ALIGN_DATA(16) +const word64 s_clmulConstants64[] = { + W64LIT(0xe100000000000000), W64LIT(0xc200000000000000), + W64LIT(0x08090a0b0c0d0e0f), W64LIT(0x0001020304050607), + W64LIT(0x0001020304050607), W64LIT(0x08090a0b0c0d0e0f)}; + +const __m128i *s_clmulConstants = CONST_M128_CAST(s_clmulConstants64); +const unsigned int s_cltableSizeInBlocks = 8; + +ANONYMOUS_NAMESPACE_END + +#if 0 +// preserved for testing +void gcm_gf_mult(const unsigned char *a, const unsigned char *b, unsigned char *c) +{ + word64 Z0=0, Z1=0, V0, V1; + + typedef BlockGetAndPut Block; + Block::Get(a)(V0)(V1); + + for (int i=0; i<16; i++) + { + for (int j=0x80; j!=0; j>>=1) + { + int x = b[i] & j; + Z0 ^= x ? V0 : 0; + Z1 ^= x ? 
V1 : 0; + x = (int)V1 & 1; + V1 = (V1>>1) | (V0<<63); + V0 = (V0>>1) ^ (x ? W64LIT(0xe1) << 56 : 0); + } + } + Block::Put(NULLPTR, c)(Z0)(Z1); +} + +__m128i _mm_clmulepi64_si128(const __m128i &a, const __m128i &b, int i) +{ + word64 A[1] = {ByteReverse(((word64*)&a)[i&1])}; + word64 B[1] = {ByteReverse(((word64*)&b)[i>>4])}; + + PolynomialMod2 pa((byte *)A, 8); + PolynomialMod2 pb((byte *)B, 8); + PolynomialMod2 c = pa*pb; + + __m128i output; + for (int i=0; i<16; i++) + ((byte *)&output)[i] = c.GetByte(i); + return output; +} +#endif // Testing + +__m128i GCM_Reduce_CLMUL(__m128i c0, __m128i c1, __m128i c2, const __m128i &r) +{ + /* + The polynomial to be reduced is c0 * x^128 + c1 * x^64 + c2. c0t below refers to the most + significant half of c0 as a polynomial, which, due to GCM's bit reflection, are in the + rightmost bit positions, and the lowest byte addresses. + + c1 ^= c0t * 0xc200000000000000 + c2t ^= c0t + t = shift (c1t ^ c0b) left 1 bit + c2 ^= t * 0xe100000000000000 + c2t ^= c1b + shift c2 left 1 bit and xor in lowest bit of c1t + */ + c1 = _mm_xor_si128(c1, _mm_slli_si128(c0, 8)); + c1 = _mm_xor_si128(c1, _mm_clmulepi64_si128(c0, r, 0x10)); + c0 = _mm_srli_si128(c0, 8); + c0 = _mm_xor_si128(c0, c1); + c0 = _mm_slli_epi64(c0, 1); + c0 = _mm_clmulepi64_si128(c0, r, 0); + c2 = _mm_xor_si128(c2, c0); + c2 = _mm_xor_si128(c2, _mm_srli_si128(c1, 8)); + c1 = _mm_unpacklo_epi64(c1, c2); + c1 = _mm_srli_epi64(c1, 63); + c2 = _mm_slli_epi64(c2, 1); + return _mm_xor_si128(c2, c1); +} + +__m128i GCM_Multiply_CLMUL(const __m128i &x, const __m128i &h, const __m128i &r) +{ + const __m128i c0 = _mm_clmulepi64_si128(x,h,0); + const __m128i c1 = _mm_xor_si128(_mm_clmulepi64_si128(x,h,1), _mm_clmulepi64_si128(x,h,0x10)); + const __m128i c2 = _mm_clmulepi64_si128(x,h,0x11); + + return GCM_Reduce_CLMUL(c0, c1, c2, r); +} + +void GCM_SetKeyWithoutResync_CLMUL(const byte *hashKey, byte *mulTable, unsigned int tableSize) +{ + const __m128i r = s_clmulConstants[0]; + const __m128i h0 = _mm_shuffle_epi8(_mm_load_si128(CONST_M128_CAST(hashKey)), s_clmulConstants[1]); + + __m128i h = h0; + unsigned int i; + for (i=0; i= 16) + { + size_t s = UnsignedMin(len/16, s_cltableSizeInBlocks), i=0; + __m128i d1, d2 = _mm_shuffle_epi8(_mm_loadu_si128(CONST_M128_CAST(data+(s-1)*16)), mask2); + __m128i c0 = _mm_setzero_si128(); + __m128i c1 = _mm_setzero_si128(); + __m128i c2 = _mm_setzero_si128(); + + while (true) + { + const __m128i h0 = _mm_load_si128(table+i); + const __m128i h1 = _mm_load_si128(table+i+1); + const __m128i h2 = _mm_xor_si128(h0, h1); + + if (++i == s) + { + d1 = _mm_shuffle_epi8(_mm_loadu_si128(CONST_M128_CAST(data)), mask1); + d1 = _mm_xor_si128(d1, x); + c0 = _mm_xor_si128(c0, _mm_clmulepi64_si128(d1, h0, 0)); + c2 = _mm_xor_si128(c2, _mm_clmulepi64_si128(d1, h1, 1)); + d1 = _mm_xor_si128(d1, _mm_shuffle_epi32(d1, _MM_SHUFFLE(1, 0, 3, 2))); + c1 = _mm_xor_si128(c1, _mm_clmulepi64_si128(d1, h2, 0)); + break; + } + + d1 = _mm_shuffle_epi8(_mm_loadu_si128(CONST_M128_CAST(data+(s-i)*16-8)), mask2); + c0 = _mm_xor_si128(c0, _mm_clmulepi64_si128(d2, h0, 1)); + c2 = _mm_xor_si128(c2, _mm_clmulepi64_si128(d1, h1, 1)); + d2 = _mm_xor_si128(d2, d1); + c1 = _mm_xor_si128(c1, _mm_clmulepi64_si128(d2, h2, 1)); + + if (++i == s) + { + d1 = _mm_shuffle_epi8(_mm_loadu_si128(CONST_M128_CAST(data)), mask1); + d1 = _mm_xor_si128(d1, x); + c0 = _mm_xor_si128(c0, _mm_clmulepi64_si128(d1, h0, 0x10)); + c2 = _mm_xor_si128(c2, _mm_clmulepi64_si128(d1, h1, 0x11)); + d1 = _mm_xor_si128(d1, _mm_shuffle_epi32(d1, 
_MM_SHUFFLE(1, 0, 3, 2))); + c1 = _mm_xor_si128(c1, _mm_clmulepi64_si128(d1, h2, 0x10)); + break; + } + + d2 = _mm_shuffle_epi8(_mm_loadu_si128(CONST_M128_CAST(data+(s-i)*16-8)), mask1); + c0 = _mm_xor_si128(c0, _mm_clmulepi64_si128(d1, h0, 0x10)); + c2 = _mm_xor_si128(c2, _mm_clmulepi64_si128(d2, h1, 0x10)); + d1 = _mm_xor_si128(d1, d2); + c1 = _mm_xor_si128(c1, _mm_clmulepi64_si128(d1, h2, 0x10)); + } + data += s*16; + len -= s*16; + + c1 = _mm_xor_si128(_mm_xor_si128(c1, c0), c2); + x = GCM_Reduce_CLMUL(c0, c1, c2, r); + } + + _mm_store_si128(M128_CAST(hbuffer), x); + return len; +} + +void GCM_ReverseHashBufferIfNeeded_CLMUL(byte *hashBuffer) +{ + // SSSE3 instruction, but only used with CLMUL + __m128i &x = *M128_CAST(hashBuffer); + x = _mm_shuffle_epi8(x, s_clmulConstants[1]); +} +#endif + +NAMESPACE_END \ No newline at end of file diff --git a/gcm.cpp b/gcm.cpp index cafd46c6e..033e8da91 100644 --- a/gcm.cpp +++ b/gcm.cpp @@ -9,10 +9,6 @@ #include "pch.h" #include "config.h" -#if CRYPTOPP_MSC_VERSION -# pragma warning(disable: 4189) -#endif - #ifndef CRYPTOPP_IMPORTS #ifndef CRYPTOPP_GENERATE_X64_MASM @@ -31,15 +27,15 @@ // # undef CRYPTOPP_BOOL_AESNI_INTRINSICS_AVAILABLE // #endif -// Clang casts -#define M128I_CAST(x) ((__m128i *)(void *)(x)) -#define CONST_M128I_CAST(x) ((const __m128i *)(const void *)(x)) - #include "gcm.h" #include "cpu.h" NAMESPACE_BEGIN(CryptoPP) +#if (CRYPTOPP_SSE2_AVAILABLE) +# include "emmintrin.h" +#endif + #if (CRYPTOPP_BOOL_X86 || CRYPTOPP_BOOL_X32 || CRYPTOPP_BOOL_X64) // Different assemblers accept different mnemonics: 'movd eax, xmm0' vs // 'movd rax, xmm0' vs 'mov eax, xmm0' vs 'mov rax, xmm0' @@ -55,101 +51,13 @@ NAMESPACE_BEGIN(CryptoPP) #endif #endif // CRYPTOPP_BOOL_X86 || CRYPTOPP_BOOL_X32 || CRYPTOPP_BOOL_X64 -#if (CRYPTOPP_BOOL_ARM32 || CRYPTOPP_BOOL_ARM64) && CRYPTOPP_BOOL_ARM_PMULL_AVAILABLE -#if defined(__GNUC__) -// Schneiders, Hovsmith and O'Rourke used this trick. -// It results in much better code generation in production code -// by avoiding D-register spills when using vgetq_lane_u64. The -// problem does not surface under minimal test cases. 
-inline uint64x2_t PMULL_00(const uint64x2_t a, const uint64x2_t b) -{ - uint64x2_t r; - __asm __volatile("pmull %0.1q, %1.1d, %2.1d \n\t" - :"=w" (r) : "w" (a), "w" (b) ); - return r; -} - -inline uint64x2_t PMULL_01(const uint64x2_t a, const uint64x2_t b) -{ - uint64x2_t r; - __asm __volatile("pmull %0.1q, %1.1d, %2.1d \n\t" - :"=w" (r) : "w" (a), "w" (vget_high_u64(b)) ); - return r; -} - -inline uint64x2_t PMULL_10(const uint64x2_t a, const uint64x2_t b) -{ - uint64x2_t r; - __asm __volatile("pmull %0.1q, %1.1d, %2.1d \n\t" - :"=w" (r) : "w" (vget_high_u64(a)), "w" (b) ); - return r; -} - -inline uint64x2_t PMULL_11(const uint64x2_t a, const uint64x2_t b) -{ - uint64x2_t r; - __asm __volatile("pmull2 %0.1q, %1.2d, %2.2d \n\t" - :"=w" (r) : "w" (a), "w" (b) ); - return r; -} - -inline uint64x2_t VEXT_U8(uint64x2_t a, uint64x2_t b, unsigned int c) -{ - uint64x2_t r; - __asm __volatile("ext %0.16b, %1.16b, %2.16b, %3 \n\t" - :"=w" (r) : "w" (a), "w" (b), "I" (c) ); - return r; -} - -// https://github.com/weidai11/cryptopp/issues/366 -template -inline uint64x2_t VEXT_U8(uint64x2_t a, uint64x2_t b) -{ - uint64x2_t r; - __asm __volatile("ext %0.16b, %1.16b, %2.16b, %3 \n\t" - :"=w" (r) : "w" (a), "w" (b), "I" (C) ); - return r; -} -#endif // GCC and compatibles - -#if defined(_MSC_VER) -inline uint64x2_t PMULL_00(const uint64x2_t a, const uint64x2_t b) -{ - return (uint64x2_t)(vmull_p64(vgetq_lane_u64(vreinterpretq_u64_u8(a),0), - vgetq_lane_u64(vreinterpretq_u64_u8(b),0))); -} - -inline uint64x2_t PMULL_01(const uint64x2_t a, const uint64x2_t b) -{ - return (uint64x2_t)(vmull_p64(vgetq_lane_u64(vreinterpretq_u64_u8(a),0), - vgetq_lane_u64(vreinterpretq_u64_u8(b),1))); -} - -inline uint64x2_t PMULL_10(const uint64x2_t a, const uint64x2_t b) -{ - return (uint64x2_t)(vmull_p64(vgetq_lane_u64(vreinterpretq_u64_u8(a),1), - vgetq_lane_u64(vreinterpretq_u64_u8(b),0))); -} +// Clang __m128i casts +#define M128_CAST(x) ((__m128i *)(void *)(x)) +#define CONST_M128_CAST(x) ((const __m128i *)(const void *)(x)) -inline uint64x2_t PMULL_11(const uint64x2_t a, const uint64x2_t b) -{ - return (uint64x2_t)(vmull_p64(vgetq_lane_u64(vreinterpretq_u64_u8(a),1), - vgetq_lane_u64(vreinterpretq_u64_u8(b),1))); -} - -inline uint64x2_t VEXT_U8(uint64x2_t a, uint64x2_t b, unsigned int c) -{ - return (uint64x2_t)vextq_u8(vreinterpretq_u8_u64(a), vreinterpretq_u8_u64(b), c); -} - -// https://github.com/weidai11/cryptopp/issues/366 -template -inline uint64x2_t VEXT_U8(uint64x2_t a, uint64x2_t b) -{ - return (uint64x2_t)vextq_u8(vreinterpretq_u8_u64(a), vreinterpretq_u8_u64(b), C); -} -#endif // Microsoft and compatibles -#endif // CRYPTOPP_BOOL_ARM_PMULL_AVAILABLE +#if CRYPTOPP_ARM_NEON_AVAILABLE +extern void GCM_Xor16_NEON(byte *a, const byte *b, const byte *c); +#endif word16 GCM_Base::s_reductionTable[256]; volatile bool GCM_Base::s_reductionTableInitialized = false; @@ -159,197 +67,82 @@ void GCM_Base::GCTR::IncrementCounterBy256() IncrementCounterByOne(m_counterArray+BlockSize()-4, 3); } -#if 0 -// preserved for testing -void gcm_gf_mult(const unsigned char *a, const unsigned char *b, unsigned char *c) -{ - word64 Z0=0, Z1=0, V0, V1; - - typedef BlockGetAndPut Block; - Block::Get(a)(V0)(V1); - - for (int i=0; i<16; i++) - { - for (int j=0x80; j!=0; j>>=1) - { - int x = b[i] & j; - Z0 ^= x ? V0 : 0; - Z1 ^= x ? V1 : 0; - x = (int)V1 & 1; - V1 = (V1>>1) | (V0<<63); - V0 = (V0>>1) ^ (x ? 
W64LIT(0xe1) << 56 : 0); - } - } - Block::Put(NULLPTR, c)(Z0)(Z1); -} - -__m128i _mm_clmulepi64_si128(const __m128i &a, const __m128i &b, int i) +inline static void Xor16(byte *a, const byte *b, const byte *c) { - word64 A[1] = {ByteReverse(((word64*)&a)[i&1])}; - word64 B[1] = {ByteReverse(((word64*)&b)[i>>4])}; - - PolynomialMod2 pa((byte *)A, 8); - PolynomialMod2 pb((byte *)B, 8); - PolynomialMod2 c = pa*pb; - - __m128i output; - for (int i=0; i<16; i++) - ((byte *)&output)[i] = c.GetByte(i); - return output; + CRYPTOPP_ASSERT(IsAlignedOn(a,GetAlignmentOf())); + CRYPTOPP_ASSERT(IsAlignedOn(b,GetAlignmentOf())); + CRYPTOPP_ASSERT(IsAlignedOn(c,GetAlignmentOf())); + ((word64 *)(void *)a)[0] = ((word64 *)(void *)b)[0] ^ ((word64 *)(void *)c)[0]; + ((word64 *)(void *)a)[1] = ((word64 *)(void *)b)[1] ^ ((word64 *)(void *)c)[1]; } -#endif -#if CRYPTOPP_BOOL_SSE2_INTRINSICS_AVAILABLE || CRYPTOPP_BOOL_SSE2_ASM_AVAILABLE -inline static void SSE2_Xor16(byte *a, const byte *b, const byte *c) +#if CRYPTOPP_SSE2_AVAILABLE || CRYPTOPP_BOOL_SSE2_ASM_AVAILABLE +inline static void GCM_Xor16_SSE2(byte *a, const byte *b, const byte *c) { // SunCC 5.14 crash (bewildering since asserts are not in effect in release builds) // Also see http://github.com/weidai11/cryptopp/issues/226 and http://github.com/weidai11/cryptopp/issues/284 # if __SUNPRO_CC - *M128I_CAST(a) = _mm_xor_si128(*M128I_CAST(b), *M128I_CAST(c)); -# elif CRYPTOPP_BOOL_SSE2_INTRINSICS_AVAILABLE + *M128_CAST(a) = _mm_xor_si128(*M128_CAST(b), *M128_CAST(c)); +# elif CRYPTOPP_SSE2_AVAILABLE CRYPTOPP_ASSERT(IsAlignedOn(a,GetAlignmentOf<__m128i>())); CRYPTOPP_ASSERT(IsAlignedOn(b,GetAlignmentOf<__m128i>())); CRYPTOPP_ASSERT(IsAlignedOn(c,GetAlignmentOf<__m128i>())); - *M128I_CAST(a) = _mm_xor_si128(*M128I_CAST(b), *M128I_CAST(c)); + *M128_CAST(a) = _mm_xor_si128(*M128_CAST(b), *M128_CAST(c)); # else asm ("movdqa %1, %%xmm0; pxor %2, %%xmm0; movdqa %%xmm0, %0;" : "=m" (a[0]) : "m"(b[0]), "m"(c[0])); # endif } #endif -#if CRYPTOPP_BOOL_NEON_INTRINSICS_AVAILABLE -inline static void NEON_Xor16(byte *a, const byte *b, const byte *c) -{ - CRYPTOPP_ASSERT(IsAlignedOn(a,GetAlignmentOf())); - CRYPTOPP_ASSERT(IsAlignedOn(b,GetAlignmentOf())); - CRYPTOPP_ASSERT(IsAlignedOn(c,GetAlignmentOf())); - *(uint64x2_t*)a = veorq_u64(*(uint64x2_t*)b, *(uint64x2_t*)c); -} -#endif - -inline static void Xor16(byte *a, const byte *b, const byte *c) -{ - CRYPTOPP_ASSERT(IsAlignedOn(a,GetAlignmentOf())); - CRYPTOPP_ASSERT(IsAlignedOn(b,GetAlignmentOf())); - CRYPTOPP_ASSERT(IsAlignedOn(c,GetAlignmentOf())); - ((word64 *)(void *)a)[0] = ((word64 *)(void *)b)[0] ^ ((word64 *)(void *)c)[0]; - ((word64 *)(void *)a)[1] = ((word64 *)(void *)b)[1] ^ ((word64 *)(void *)c)[1]; -} - -#if CRYPTOPP_BOOL_AESNI_INTRINSICS_AVAILABLE -CRYPTOPP_ALIGN_DATA(16) -static const word64 s_clmulConstants64[] = { - W64LIT(0xe100000000000000), W64LIT(0xc200000000000000), - W64LIT(0x08090a0b0c0d0e0f), W64LIT(0x0001020304050607), - W64LIT(0x0001020304050607), W64LIT(0x08090a0b0c0d0e0f)}; - -static const __m128i *s_clmulConstants = CONST_M128I_CAST(s_clmulConstants64); -static const unsigned int s_clmulTableSizeInBlocks = 8; - -inline __m128i CLMUL_Reduce(__m128i c0, __m128i c1, __m128i c2, const __m128i &r) -{ - /* - The polynomial to be reduced is c0 * x^128 + c1 * x^64 + c2. c0t below refers to the most - significant half of c0 as a polynomial, which, due to GCM's bit reflection, are in the - rightmost bit positions, and the lowest byte addresses. 
- - c1 ^= c0t * 0xc200000000000000 - c2t ^= c0t - t = shift (c1t ^ c0b) left 1 bit - c2 ^= t * 0xe100000000000000 - c2t ^= c1b - shift c2 left 1 bit and xor in lowest bit of c1t - */ -#if 0 // MSVC 2010 workaround: see http://connect.microsoft.com/VisualStudio/feedback/details/575301 - c2 = _mm_xor_si128(c2, _mm_move_epi64(c0)); -#else - c1 = _mm_xor_si128(c1, _mm_slli_si128(c0, 8)); -#endif - c1 = _mm_xor_si128(c1, _mm_clmulepi64_si128(c0, r, 0x10)); - c0 = _mm_srli_si128(c0, 8); - c0 = _mm_xor_si128(c0, c1); - c0 = _mm_slli_epi64(c0, 1); - c0 = _mm_clmulepi64_si128(c0, r, 0); - c2 = _mm_xor_si128(c2, c0); - c2 = _mm_xor_si128(c2, _mm_srli_si128(c1, 8)); - c1 = _mm_unpacklo_epi64(c1, c2); - c1 = _mm_srli_epi64(c1, 63); - c2 = _mm_slli_epi64(c2, 1); - return _mm_xor_si128(c2, c1); -} - -inline __m128i CLMUL_GF_Mul(const __m128i &x, const __m128i &h, const __m128i &r) -{ - const __m128i c0 = _mm_clmulepi64_si128(x,h,0); - const __m128i c1 = _mm_xor_si128(_mm_clmulepi64_si128(x,h,1), _mm_clmulepi64_si128(x,h,0x10)); - const __m128i c2 = _mm_clmulepi64_si128(x,h,0x11); +#if CRYPTOPP_CLMUL_AVAILABLE +extern void GCM_SetKeyWithoutResync_CLMUL(const byte *hashKey, byte *mulTable, unsigned int tableSize); +extern size_t GCM_AuthenticateBlocks_CLMUL(const byte *data, size_t len, const byte *mtable, byte *hbuffer); +const unsigned int s_cltableSizeInBlocks = 8; +extern void GCM_ReverseHashBufferIfNeeded_CLMUL(byte *hashBuffer); +#endif // CRYPTOPP_CLMUL_AVAILABLE - return CLMUL_Reduce(c0, c1, c2, r); -} +#if CRYPTOPP_ARM_PMULL_AVAILABLE +extern void GCM_ReverseHashBufferIfNeeded_PMULL(byte *hashBuffer); #endif -#if CRYPTOPP_BOOL_ARM_PMULL_AVAILABLE - -CRYPTOPP_ALIGN_DATA(16) -static const word64 s_clmulConstants64[] = { - W64LIT(0xe100000000000000), W64LIT(0xc200000000000000), // Used for ARM and x86; polynomial coefficients - W64LIT(0x08090a0b0c0d0e0f), W64LIT(0x0001020304050607), // Unused for ARM; used for x86 _mm_shuffle_epi8 - W64LIT(0x0001020304050607), W64LIT(0x08090a0b0c0d0e0f) // Unused for ARM; used for x86 _mm_shuffle_epi8 -}; - -static const uint64x2_t *s_clmulConstants = (const uint64x2_t *)s_clmulConstants64; -static const unsigned int s_clmulTableSizeInBlocks = 8; - -inline uint64x2_t PMULL_Reduce(uint64x2_t c0, uint64x2_t c1, uint64x2_t c2, const uint64x2_t &r) -{ - // See comments fo CLMUL_Reduce - c1 = veorq_u64(c1, VEXT_U8<8>(vdupq_n_u64(0), c0)); - c1 = veorq_u64(c1, PMULL_01(c0, r)); - c0 = VEXT_U8<8>(c0, vdupq_n_u64(0)); - c0 = vshlq_n_u64(veorq_u64(c0, c1), 1); - c0 = PMULL_00(c0, r); - c2 = veorq_u64(c2, c0); - c2 = veorq_u64(c2, VEXT_U8<8>(c1, vdupq_n_u64(0))); - c1 = vshrq_n_u64(vcombine_u64(vget_low_u64(c1), vget_low_u64(c2)), 63); - c2 = vshlq_n_u64(c2, 1); - - return veorq_u64(c2, c1); -} - -inline uint64x2_t PMULL_GF_Mul(const uint64x2_t &x, const uint64x2_t &h, const uint64x2_t &r) -{ - const uint64x2_t c0 = PMULL_00(x, h); - const uint64x2_t c1 = veorq_u64(PMULL_10(x, h), PMULL_01(x, h)); - const uint64x2_t c2 = PMULL_11(x, h); - - return PMULL_Reduce(c0, c1, c2, r); -} -#endif +#if CRYPTOPP_ARM_PMULL_AVAILABLE +extern void GCM_SetKeyWithoutResync_PMULL(const byte *hashKey, byte *mulTable, unsigned int tableSize); +extern size_t GCM_AuthenticateBlocks_PMULL(const byte *data, size_t len, const byte *mtable, byte *hbuffer); +const unsigned int s_cltableSizeInBlocks = 8; +#endif // CRYPTOPP_ARM_PMULL_AVAILABLE void GCM_Base::SetKeyWithoutResync(const byte *userKey, size_t keylength, const NameValuePairs ¶ms) { BlockCipher &blockCipher = AccessBlockCipher(); 
blockCipher.SetKey(userKey, keylength, params); + // GCM is only defined for 16-byte block ciphers at the moment. + // However, variable blocksize support means we have to defer + // blocksize checks to runtime after the key is set. Also see + // https://github.com/weidai11/cryptopp/issues/408. + const unsigned int blockSize = blockCipher.BlockSize(); + CRYPTOPP_ASSERT(blockSize == REQUIRED_BLOCKSIZE); if (blockCipher.BlockSize() != REQUIRED_BLOCKSIZE) throw InvalidArgument(AlgorithmName() + ": block size of underlying block cipher is not 16"); int tableSize, i, j, k; -#if CRYPTOPP_BOOL_AESNI_INTRINSICS_AVAILABLE +#if CRYPTOPP_CLMUL_AVAILABLE if (HasCLMUL()) { // Avoid "parameter not used" error and suppress Coverity finding (void)params.GetIntValue(Name::TableSize(), tableSize); - tableSize = s_clmulTableSizeInBlocks * REQUIRED_BLOCKSIZE; + tableSize = s_cltableSizeInBlocks * blockSize; + CRYPTOPP_ASSERT(tableSize > static_cast(blockSize)); } else -#elif CRYPTOPP_BOOL_ARM_PMULL_AVAILABLE +#elif CRYPTOPP_ARM_PMULL_AVAILABLE if (HasPMULL()) { // Avoid "parameter not used" error and suppress Coverity finding (void)params.GetIntValue(Name::TableSize(), tableSize); - tableSize = s_clmulTableSizeInBlocks * REQUIRED_BLOCKSIZE; + tableSize = s_cltableSizeInBlocks * blockSize; + CRYPTOPP_ASSERT(tableSize > static_cast(blockSize)); } else #endif @@ -359,61 +152,28 @@ void GCM_Base::SetKeyWithoutResync(const byte *userKey, size_t keylength, const else tableSize = (GetTablesOption() == GCM_64K_Tables) ? 64*1024 : 2*1024; -#if defined(_MSC_VER) && (_MSC_VER < 1400) + //#if defined(_MSC_VER) && (_MSC_VER < 1400) // VC 2003 workaround: compiler generates bad code for 64K tables - tableSize = 2*1024; -#endif + //tableSize = 2*1024; + //#endif } - m_buffer.resize(3*REQUIRED_BLOCKSIZE + tableSize); + m_buffer.resize(3*blockSize + tableSize); byte *mulTable = MulTable(); byte *hashKey = HashKey(); memset(hashKey, 0, REQUIRED_BLOCKSIZE); blockCipher.ProcessBlock(hashKey); -#if CRYPTOPP_BOOL_AESNI_INTRINSICS_AVAILABLE +#if CRYPTOPP_CLMUL_AVAILABLE if (HasCLMUL()) { - const __m128i r = s_clmulConstants[0]; - __m128i h0 = _mm_shuffle_epi8(_mm_load_si128(M128I_CAST(hashKey)), s_clmulConstants[1]); - __m128i h = h0; - - for (i=0; i= 16) - { - size_t s = UnsignedMin(len/16, s_clmulTableSizeInBlocks), i=0; - __m128i d1, d2 = _mm_shuffle_epi8(_mm_loadu_si128(CONST_M128I_CAST(data+(s-1)*16)), mask2); - __m128i c0 = _mm_setzero_si128(); - __m128i c1 = _mm_setzero_si128(); - __m128i c2 = _mm_setzero_si128(); - - while (true) - { - __m128i h0 = _mm_load_si128(mulTable+i); - __m128i h1 = _mm_load_si128(mulTable+i+1); - __m128i h2 = _mm_xor_si128(h0, h1); - - if (++i == s) - { - d1 = _mm_shuffle_epi8(_mm_loadu_si128(CONST_M128I_CAST(data)), mask1); - d1 = _mm_xor_si128(d1, x); - c0 = _mm_xor_si128(c0, _mm_clmulepi64_si128(d1, h0, 0)); - c2 = _mm_xor_si128(c2, _mm_clmulepi64_si128(d1, h1, 1)); - d1 = _mm_xor_si128(d1, _mm_shuffle_epi32(d1, _MM_SHUFFLE(1, 0, 3, 2))); - c1 = _mm_xor_si128(c1, _mm_clmulepi64_si128(d1, h2, 0)); - break; - } - - d1 = _mm_shuffle_epi8(_mm_loadu_si128(CONST_M128I_CAST(data+(s-i)*16-8)), mask2); - c0 = _mm_xor_si128(c0, _mm_clmulepi64_si128(d2, h0, 1)); - c2 = _mm_xor_si128(c2, _mm_clmulepi64_si128(d1, h1, 1)); - d2 = _mm_xor_si128(d2, d1); - c1 = _mm_xor_si128(c1, _mm_clmulepi64_si128(d2, h2, 1)); - - if (++i == s) - { - d1 = _mm_shuffle_epi8(_mm_loadu_si128(CONST_M128I_CAST(data)), mask1); - d1 = _mm_xor_si128(d1, x); - c0 = _mm_xor_si128(c0, _mm_clmulepi64_si128(d1, h0, 0x10)); - c2 = 
_mm_xor_si128(c2, _mm_clmulepi64_si128(d1, h1, 0x11)); - d1 = _mm_xor_si128(d1, _mm_shuffle_epi32(d1, _MM_SHUFFLE(1, 0, 3, 2))); - c1 = _mm_xor_si128(c1, _mm_clmulepi64_si128(d1, h2, 0x10)); - break; - } - - d2 = _mm_shuffle_epi8(_mm_loadu_si128(CONST_M128I_CAST(data+(s-i)*16-8)), mask1); - c0 = _mm_xor_si128(c0, _mm_clmulepi64_si128(d1, h0, 0x10)); - c2 = _mm_xor_si128(c2, _mm_clmulepi64_si128(d2, h1, 0x10)); - d1 = _mm_xor_si128(d1, d2); - c1 = _mm_xor_si128(c1, _mm_clmulepi64_si128(d1, h2, 0x10)); - } - data += s*16; - len -= s*16; - - c1 = _mm_xor_si128(_mm_xor_si128(c1, c0), c2); - x = CLMUL_Reduce(c0, c1, c2, r); - } - - _mm_store_si128(M128I_CAST(HashBuffer()), x); - return len; + return GCM_AuthenticateBlocks_CLMUL(data, len, MulTable(), HashBuffer()); } -#elif CRYPTOPP_BOOL_ARM_PMULL_AVAILABLE +#elif CRYPTOPP_ARM_PMULL_AVAILABLE if (HasPMULL()) { - const uint64x2_t *mulTable = (const uint64x2_t *)MulTable(); - uint64x2_t x = vreinterpretq_u64_u8(vld1q_u8(HashBuffer())); - const uint64x2_t r = s_clmulConstants[0]; - - while (len >= 16) - { - size_t s = UnsignedMin(len/16, s_clmulTableSizeInBlocks), i=0; - uint64x2_t d1, d2 = vreinterpretq_u64_u8(vrev64q_u8(vld1q_u8(data+(s-1)*16))); - uint64x2_t c0 = vdupq_n_u64(0); - uint64x2_t c1 = vdupq_n_u64(0); - uint64x2_t c2 = vdupq_n_u64(0); - - while (true) - { - const uint64x2_t h0 = vld1q_u64((const uint64_t*)(mulTable+i)); - const uint64x2_t h1 = vld1q_u64((const uint64_t*)(mulTable+i+1)); - const uint64x2_t h2 = veorq_u64(h0, h1); - - if (++i == s) - { - const uint64x2_t t1 = vreinterpretq_u64_u8(vrev64q_u8(vld1q_u8(data))); - d1 = veorq_u64(vextq_u64(t1, t1, 1), x); - c0 = veorq_u64(c0, PMULL_00(d1, h0)); - c2 = veorq_u64(c2, PMULL_10(d1, h1)); - d1 = veorq_u64(d1, (uint64x2_t)vcombine_u32(vget_high_u32(vreinterpretq_u32_u64(d1)), - vget_low_u32(vreinterpretq_u32_u64(d1)))); - c1 = veorq_u64(c1, PMULL_00(d1, h2)); - - break; - } - - d1 = vreinterpretq_u64_u8(vrev64q_u8(vld1q_u8(data+(s-i)*16-8))); - c0 = veorq_u64(c0, PMULL_10(d2, h0)); - c2 = veorq_u64(c2, PMULL_10(d1, h1)); - d2 = veorq_u64(d2, d1); - c1 = veorq_u64(c1, PMULL_10(d2, h2)); - - if (++i == s) - { - const uint64x2_t t2 = vreinterpretq_u64_u8(vrev64q_u8(vld1q_u8(data))); - d1 = veorq_u64(vextq_u64(t2, t2, 1), x); - c0 = veorq_u64(c0, PMULL_01(d1, h0)); - c2 = veorq_u64(c2, PMULL_11(d1, h1)); - d1 = veorq_u64(d1, (uint64x2_t)vcombine_u32(vget_high_u32(vreinterpretq_u32_u64(d1)), - vget_low_u32(vreinterpretq_u32_u64(d1)))); - c1 = veorq_u64(c1, PMULL_01(d1, h2)); - - break; - } - - const uint64x2_t t3 = vreinterpretq_u64_u8(vrev64q_u8(vld1q_u8(data+(s-i)*16-8))); - d2 = vextq_u64(t3, t3, 1); - c0 = veorq_u64(c0, PMULL_01(d1, h0)); - c2 = veorq_u64(c2, PMULL_01(d2, h1)); - d1 = veorq_u64(d1, d2); - c1 = veorq_u64(c1, PMULL_01(d1, h2)); - } - data += s*16; - len -= s*16; - - c1 = veorq_u64(veorq_u64(c1, c0), c2); - x = PMULL_Reduce(c0, c1, c2, r); - } - - vst1q_u64((uint64_t *)HashBuffer(), x); - return len; -} + return GCM_AuthenticateBlocks_PMULL(data, len, MulTable(), HashBuffer()); + } #endif typedef BlockGetAndPut Block; @@ -755,7 +385,7 @@ size_t GCM_Base::AuthenticateBlocks(const byte *data, size_t len) switch (2*(m_buffer.size()>=64*1024) #if CRYPTOPP_BOOL_SSE2_ASM_AVAILABLE || defined(CRYPTOPP_X64_MASM_AVAILABLE) + HasSSE2() -//#elif CRYPTOPP_BOOL_NEON_INTRINSICS_AVAILABLE +//#elif CRYPTOPP_ARM_NEON_AVAILABLE // + HasNEON() #endif ) diff --git a/neon.cpp b/neon.cpp new file mode 100644 index 000000000..70ce61015 --- /dev/null +++ b/neon.cpp @@ -0,0 +1,108 @@ +// 
crc-simd.cpp - written and placed in the public domain by +// Jeffrey Walton, Uri Blumenthal and Marcel Raad. +// +// This source file uses intrinsics to gain access to ARMv7a and +// ARMv8a NEON instructions. A separate source file is needed +// because additional CXXFLAGS are required to enable the +// appropriate instructions sets in some build configurations. + +#include "pch.h" +#include "config.h" + +#if (CRYPTOPP_ARM_NEON_AVAILABLE) +# include "arm_neon.h" +#endif + +#ifdef CRYPTOPP_GNU_STYLE_INLINE_ASSEMBLY +# include +# include +#endif + +#ifndef EXCEPTION_EXECUTE_HANDLER +# define EXCEPTION_EXECUTE_HANDLER 1 +#endif + +NAMESPACE_BEGIN(CryptoPP) + +#ifdef CRYPTOPP_GNU_STYLE_INLINE_ASSEMBLY +extern "C" { + typedef void (*SigHandler)(int); + + static jmp_buf s_jmpSIGILL; + static void SigIllHandler(int) + { + longjmp(s_jmpSIGILL, 1); + } +}; +#endif // Not CRYPTOPP_MS_STYLE_INLINE_ASSEMBLY + +bool CPU_ProbeNEON() +{ +#if (CRYPTOPP_ARM_NEON_AVAILABLE) +# if defined(CRYPTOPP_MS_STYLE_INLINE_ASSEMBLY) + volatile bool result = true; + __try + { + uint32_t v1[4] = {1,1,1,1}; + uint32x4_t x1 = vld1q_u32(v1); + uint64_t v2[2] = {1,1}; + uint64x2_t x2 = vld1q_u64(v2); + + uint32x4_t x3 = vdupq_n_u32(2); + x3 = vsetq_lane_u32(vgetq_lane_u32(x1,0),x3,0); + x3 = vsetq_lane_u32(vgetq_lane_u32(x1,3),x3,3); + uint64x2_t x4 = vdupq_n_u64(2); + x4 = vsetq_lane_u64(vgetq_lane_u64(x2,0),x4,0); + x4 = vsetq_lane_u64(vgetq_lane_u64(x2,1),x4,1); + + result = !!(vgetq_lane_u32(x3,0) | vgetq_lane_u64(x4,1)); + } + __except (EXCEPTION_EXECUTE_HANDLER) + { + return false; + } + return result; +# else + + // longjmp and clobber warnings. Volatile is required. + // http://github.com/weidai11/cryptopp/issues/24 and http://stackoverflow.com/q/7721854 + volatile bool result = true; + + volatile SigHandler oldHandler = signal(SIGILL, SigIllHandler); + if (oldHandler == SIG_ERR) + return false; + + volatile sigset_t oldMask; + if (sigprocmask(0, NULLPTR, (sigset_t*)&oldMask)) + return false; + + if (setjmp(s_jmpSIGILL)) + result = false; + else + { + uint32_t v1[4] = {1,1,1,1}; + uint32x4_t x1 = vld1q_u32(v1); + uint64_t v2[2] = {1,1}; + uint64x2_t x2 = vld1q_u64(v2); + + uint32x4_t x3 = {0,0,0,0}; + x3 = vsetq_lane_u32(vgetq_lane_u32(x1,0),x3,0); + x3 = vsetq_lane_u32(vgetq_lane_u32(x1,3),x3,3); + uint64x2_t x4 = {0,0}; + x4 = vsetq_lane_u64(vgetq_lane_u64(x2,0),x4,0); + x4 = vsetq_lane_u64(vgetq_lane_u64(x2,1),x4,1); + + // Hack... GCC optimizes away the code and returns true + result = !!(vgetq_lane_u32(x3,0) | vgetq_lane_u64(x4,1)); + } + + sigprocmask(SIG_SETMASK, (sigset_t*)&oldMask, NULLPTR); + signal(SIGILL, oldHandler); + return result; +# endif +#else + return false; +#endif // CRYPTOPP_ARM_NEON_AVAILABLE +} + +NAMESPACE_END diff --git a/rijndael-simd.cpp b/rijndael-simd.cpp new file mode 100644 index 000000000..94d3fa61d --- /dev/null +++ b/rijndael-simd.cpp @@ -0,0 +1,705 @@ +// rijndael-simd.cpp - written and placed in the public domain by +// Jeffrey Walton, Uri Blumenthal and Marcel Raad. +// +// This source file uses intrinsics to gain access to AES-NI and +// ARMv8a AES instructions. A separate source file is needed +// because additional CXXFLAGS are required to enable the +// appropriate instructions sets in some build configurations. +// +// ARMv8a AES code based on CriticalBlue code from Johannes Schneiders, +// Skip Hovsmith and Barry O'Rourke for the mbedTLS project. Stepping +// mbedTLS under a debugger was helped for us to determine problems +// with our subkey generation and scheduling. 
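The comment blocks above spell out the pattern this patch applies across the library: the intrinsics live in a separate *-simd.cpp translation unit that the makefile builds with extra ISA flags, and the base source file reaches them through plain extern declarations guarded by a runtime feature probe. A minimal sketch of that arrangement, for illustration only and using hypothetical names (Foo_Transform, Foo_Transform_SIMD, HasSIMD) rather than anything from the patch, collapsed into one unit so it compiles standalone:

#include <cstddef>
typedef unsigned char byte;

// --- would live in foo-simd.cpp, built with e.g. -msse4.2 or -march=armv8-a+crypto ---
void Foo_Transform_SIMD(byte *data, size_t len)
{
    for (size_t i = 0; i < len; ++i)    // stand-in for the intrinsic code
        data[i] ^= 0xFF;
}

// --- would live in foo.cpp, built with the project's ordinary flags ---
extern void Foo_Transform_SIMD(byte *data, size_t len);
static bool HasSIMD() { return false; } // stand-in for the CPUID/SIGILL probe in cpu.cpp/neon.cpp

void Foo_Transform(byte *data, size_t len)
{
    if (HasSIMD())
        return Foo_Transform_SIMD(data, len);   // accelerated path, taken only when the CPU agrees
    for (size_t i = 0; i < len; ++i)            // portable fallback
        data[i] ^= 0xFF;
}

The split matters because a compiler may use the extra instruction set anywhere in a translation unit built with those flags; keeping the guarded calls in a separately compiled file is what keeps the runtime probe meaningful on older CPUs.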
+ +#include "pch.h" +#include "config.h" +#include "misc.h" + +// Clang and GCC hoops... +#if !(defined(__ARM_FEATURE_CRYPTO) || defined(_MSC_VER)) +# undef CRYPTOPP_ARM_AES_AVAILABLE +#endif + +#if (CRYPTOPP_AESNI_AVAILABLE) +// Hack... We are supposed to use . GCC 4.8, LLVM Clang 3.5 +// and Apple Clang 6.0 conflates SSE4.1 and SSE4.2. If we use +// then compile fails with "SSE4.2 instruction set not enabled". Also see +// https://gcc.gnu.org/ml/gcc-help/2017-08/msg00015.html. +# include "smmintrin.h" +# include "wmmintrin.h" +#endif + +#if (CRYPTOPP_ARM_AES_AVAILABLE) +# include "arm_neon.h" +#endif + +// Don't include when using Apple Clang. Early Apple compilers +// fail to compile with included. Later Apple compilers compile +// intrinsics without included. +#if (CRYPTOPP_ARM_AES_AVAILABLE) && !defined(CRYPTOPP_APPLE_CLANG_VERSION) +# include "arm_acle.h" +#endif + +#ifdef CRYPTOPP_GNU_STYLE_INLINE_ASSEMBLY +# include +# include +#endif + +#ifndef EXCEPTION_EXECUTE_HANDLER +# define EXCEPTION_EXECUTE_HANDLER 1 +#endif + +// Hack for SunCC, http://github.com/weidai11/cryptopp/issues/224 +#if (__SUNPRO_CC >= 0x5130) +# define MAYBE_CONST +#else +# define MAYBE_CONST const +#endif + +// Clang __m128i casts +#define M128_CAST(x) ((__m128i *)(void *)(x)) +#define CONST_M128_CAST(x) ((const __m128i *)(const void *)(x)) + +NAMESPACE_BEGIN(CryptoPP) + +#ifdef CRYPTOPP_GNU_STYLE_INLINE_ASSEMBLY +extern "C" { + typedef void (*SigHandler)(int); + + static jmp_buf s_jmpSIGILL; + static void SigIllHandler(int) + { + longjmp(s_jmpSIGILL, 1); + } +}; +#endif // Not CRYPTOPP_MS_STYLE_INLINE_ASSEMBLY + +#if (CRYPTOPP_BOOL_ARM32 || CRYPTOPP_BOOL_ARM64) +bool CPU_ProbeAES() +{ +#if (CRYPTOPP_ARM_AES_AVAILABLE) +# if defined(CRYPTOPP_MS_STYLE_INLINE_ASSEMBLY) + volatile bool result = true; + __try + { + // AES encrypt and decrypt + uint8x16_t data = vdupq_n_u8(0), key = vdupq_n_u8(0); + uint8x16_t r1 = vaeseq_u8(data, key); + uint8x16_t r2 = vaesdq_u8(data, key); + r1 = vaesmcq_u8(r1); + r2 = vaesimcq_u8(r2); + + result = !!(vgetq_lane_u8(r1,0) | vgetq_lane_u8(r2,7)); + } + __except (EXCEPTION_EXECUTE_HANDLER) + { + return false; + } + return result; +# else + + // longjmp and clobber warnings. Volatile is required. + // http://github.com/weidai11/cryptopp/issues/24 and http://stackoverflow.com/q/7721854 + volatile bool result = true; + + volatile SigHandler oldHandler = signal(SIGILL, SigIllHandler); + if (oldHandler == SIG_ERR) + return false; + + volatile sigset_t oldMask; + if (sigprocmask(0, NULLPTR, (sigset_t*)&oldMask)) + return false; + + if (setjmp(s_jmpSIGILL)) + result = false; + else + { + uint8x16_t data = vdupq_n_u8(0), key = vdupq_n_u8(0); + uint8x16_t r1 = vaeseq_u8(data, key); + uint8x16_t r2 = vaesdq_u8(data, key); + r1 = vaesmcq_u8(r1); + r2 = vaesimcq_u8(r2); + + // Hack... GCC optimizes away the code and returns true + result = !!(vgetq_lane_u8(r1,0) | vgetq_lane_u8(r2,7)); + } + + sigprocmask(SIG_SETMASK, (sigset_t*)&oldMask, NULLPTR); + signal(SIGILL, oldHandler); + return result; +# endif +#else + return false; +#endif // CRYPTOPP_ARM_AES_AVAILABLE +} +#endif // ARM32 or ARM64 + +#if (CRYPTOPP_ARM_AES_AVAILABLE) +inline void ARMV8_Enc_Block(uint8x16_t &block, const word32 *subkeys, unsigned int rounds) +{ + const byte *keys = reinterpret_cast(subkeys); + + // Unroll the loop, profit 0.3 to 0.5 cpb. 
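+    // Note on the pairing below: vaeseq_u8 performs AddRoundKey, SubBytes and
+    // ShiftRows in a single instruction, and vaesmcq_u8 performs MixColumns,
+    // so each vaeseq_u8/vaesmcq_u8 pair is one full AES round keyed with the
+    // next 16 bytes of the schedule. The last round, which omits MixColumns,
+    // is applied separately after the loop that follows the unrolled rounds.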
+ block = vaeseq_u8(block, vld1q_u8(keys+0)); + block = vaesmcq_u8(block); + block = vaeseq_u8(block, vld1q_u8(keys+16)); + block = vaesmcq_u8(block); + block = vaeseq_u8(block, vld1q_u8(keys+32)); + block = vaesmcq_u8(block); + block = vaeseq_u8(block, vld1q_u8(keys+48)); + block = vaesmcq_u8(block); + block = vaeseq_u8(block, vld1q_u8(keys+64)); + block = vaesmcq_u8(block); + block = vaeseq_u8(block, vld1q_u8(keys+80)); + block = vaesmcq_u8(block); + block = vaeseq_u8(block, vld1q_u8(keys+96)); + block = vaesmcq_u8(block); + block = vaeseq_u8(block, vld1q_u8(keys+112)); + block = vaesmcq_u8(block); + block = vaeseq_u8(block, vld1q_u8(keys+128)); + block = vaesmcq_u8(block); + + unsigned int i=9; + for ( ; i(subkeys); + + unsigned int i=0; + for ( ; i(subkeys); + + // Unroll the loop, profit 0.3 to 0.5 cpb. + block = vaesdq_u8(block, vld1q_u8(keys+0)); + block = vaesimcq_u8(block); + block = vaesdq_u8(block, vld1q_u8(keys+16)); + block = vaesimcq_u8(block); + block = vaesdq_u8(block, vld1q_u8(keys+32)); + block = vaesimcq_u8(block); + block = vaesdq_u8(block, vld1q_u8(keys+48)); + block = vaesimcq_u8(block); + block = vaesdq_u8(block, vld1q_u8(keys+64)); + block = vaesimcq_u8(block); + block = vaesdq_u8(block, vld1q_u8(keys+80)); + block = vaesimcq_u8(block); + block = vaesdq_u8(block, vld1q_u8(keys+96)); + block = vaesimcq_u8(block); + block = vaesdq_u8(block, vld1q_u8(keys+112)); + block = vaesimcq_u8(block); + block = vaesdq_u8(block, vld1q_u8(keys+128)); + block = vaesimcq_u8(block); + + unsigned int i=9; + for ( ; i(subkeys); + + unsigned int i=0; + for ( ; i +size_t Rijndael_AdvancedProcessBlocks_ARMV8(F1 func1, F4 func4, const word32 *subkeys, unsigned int rounds, + const byte *inBlocks, const byte *xorBlocks, byte *outBlocks, size_t length, word32 flags) +{ + size_t blockSize = 16; + size_t inIncrement = (flags & (BlockTransformation::BT_InBlockIsCounter|BlockTransformation::BT_DontIncrementInOutPointers)) ? 0 : blockSize; + size_t xorIncrement = xorBlocks ? blockSize : 0; + size_t outIncrement = (flags & BlockTransformation::BT_DontIncrementInOutPointers) ? 
0 : blockSize; + + if (flags & BlockTransformation::BT_ReverseDirection) + { + inBlocks += length - blockSize; + xorBlocks += length - blockSize; + outBlocks += length - blockSize; + inIncrement = 0-inIncrement; + xorIncrement = 0-xorIncrement; + outIncrement = 0-outIncrement; + } + + if (flags & BlockTransformation::BT_AllowParallel) + { + while (length >= 4*blockSize) + { + uint8x16_t block0, block1, block2, block3, temp; + block0 = vld1q_u8(inBlocks); + + if (flags & BlockTransformation::BT_InBlockIsCounter) + { + uint32x4_t be = vld1q_u32(s_one); + block1 = vaddq_u8(block0, vreinterpretq_u8_u32(be)); + block2 = vaddq_u8(block1, vreinterpretq_u8_u32(be)); + block3 = vaddq_u8(block2, vreinterpretq_u8_u32(be)); + temp = vaddq_u8(block3, vreinterpretq_u8_u32(be)); + vst1q_u8(const_cast(inBlocks), temp); + } + else + { + inBlocks += inIncrement; + block1 = vld1q_u8(inBlocks); + inBlocks += inIncrement; + block2 = vld1q_u8(inBlocks); + inBlocks += inIncrement; + block3 = vld1q_u8(inBlocks); + inBlocks += inIncrement; + } + + if (flags & BlockTransformation::BT_XorInput) + { + block0 = veorq_u8(block0, vld1q_u8(xorBlocks)); + xorBlocks += xorIncrement; + block1 = veorq_u8(block1, vld1q_u8(xorBlocks)); + xorBlocks += xorIncrement; + block2 = veorq_u8(block2, vld1q_u8(xorBlocks)); + xorBlocks += xorIncrement; + block3 = veorq_u8(block3, vld1q_u8(xorBlocks)); + xorBlocks += xorIncrement; + } + + func4(block0, block1, block2, block3, subkeys, rounds); + + if (xorBlocks && !(flags & BlockTransformation::BT_XorInput)) + { + block0 = veorq_u8(block0, vld1q_u8(xorBlocks)); + xorBlocks += xorIncrement; + block1 = veorq_u8(block1, vld1q_u8(xorBlocks)); + xorBlocks += xorIncrement; + block2 = veorq_u8(block2, vld1q_u8(xorBlocks)); + xorBlocks += xorIncrement; + block3 = veorq_u8(block3, vld1q_u8(xorBlocks)); + xorBlocks += xorIncrement; + } + + vst1q_u8(outBlocks, block0); + outBlocks += outIncrement; + vst1q_u8(outBlocks, block1); + outBlocks += outIncrement; + vst1q_u8(outBlocks, block2); + outBlocks += outIncrement; + vst1q_u8(outBlocks, block3); + outBlocks += outIncrement; + + length -= 4*blockSize; + } + } + + while (length >= blockSize) + { + uint8x16_t block = vld1q_u8(inBlocks); + + if (flags & BlockTransformation::BT_XorInput) + block = veorq_u8(block, vld1q_u8(xorBlocks)); + + if (flags & BlockTransformation::BT_InBlockIsCounter) + const_cast(inBlocks)[15]++; + + func1(block, subkeys, rounds); + + if (xorBlocks && !(flags & BlockTransformation::BT_XorInput)) + block = veorq_u8(block, vld1q_u8(xorBlocks)); + + vst1q_u8(outBlocks, block); + + inBlocks += inIncrement; + outBlocks += outIncrement; + xorBlocks += xorIncrement; + length -= blockSize; + } + + return length; +} + +size_t Rijndael_Enc_AdvancedProcessBlocks_ARMV8(const word32 *subkeys, size_t rounds, + const byte *inBlocks, const byte *xorBlocks, byte *outBlocks, size_t length, word32 flags) +{ + return Rijndael_AdvancedProcessBlocks_ARMV8(ARMV8_Enc_Block, ARMV8_Enc_4_Blocks, + subkeys, rounds, inBlocks, xorBlocks, outBlocks, length, flags); +} + +size_t Rijndael_Dec_AdvancedProcessBlocks_ARMV8(const word32 *subkeys, size_t rounds, + const byte *inBlocks, const byte *xorBlocks, byte *outBlocks, size_t length, word32 flags) +{ + return Rijndael_AdvancedProcessBlocks_ARMV8(ARMV8_Dec_Block, ARMV8_Dec_4_Blocks, + subkeys, rounds, inBlocks, xorBlocks, outBlocks, length, flags); +} + +#endif // CRYPTOPP_ARM_AES_AVAILABLE + +#if (CRYPTOPP_AESNI_AVAILABLE) +inline void AESNI_Enc_Block(__m128i &block, MAYBE_CONST __m128i *subkeys, unsigned int 
rounds) +{ + block = _mm_xor_si128(block, subkeys[0]); + for (unsigned int i=1; i +inline size_t Rijndael_AdvancedProcessBlocks_AESNI(F1 func1, F4 func4, + MAYBE_CONST word32 *subKeys, size_t rounds, const byte *inBlocks, + const byte *xorBlocks, byte *outBlocks, size_t length, word32 flags) +{ + size_t blockSize = 16; + size_t inIncrement = (flags & (BlockTransformation::BT_InBlockIsCounter|BlockTransformation::BT_DontIncrementInOutPointers)) ? 0 : blockSize; + size_t xorIncrement = xorBlocks ? blockSize : 0; + size_t outIncrement = (flags & BlockTransformation::BT_DontIncrementInOutPointers) ? 0 : blockSize; + MAYBE_CONST __m128i *subkeys = reinterpret_cast(subKeys); + + if (flags & BlockTransformation::BT_ReverseDirection) + { + inBlocks += length - blockSize; + xorBlocks += length - blockSize; + outBlocks += length - blockSize; + inIncrement = 0-inIncrement; + xorIncrement = 0-xorIncrement; + outIncrement = 0-outIncrement; + } + + if (flags & BlockTransformation::BT_AllowParallel) + { + while (length >= 4*blockSize) + { + __m128i block0 = _mm_loadu_si128(CONST_M128_CAST(inBlocks)), block1, block2, block3; + if (flags & BlockTransformation::BT_InBlockIsCounter) + { + const __m128i be1 = *CONST_M128_CAST(s_one); + block1 = _mm_add_epi32(block0, be1); + block2 = _mm_add_epi32(block1, be1); + block3 = _mm_add_epi32(block2, be1); + _mm_storeu_si128(M128_CAST(inBlocks), _mm_add_epi32(block3, be1)); + } + else + { + inBlocks += inIncrement; + block1 = _mm_loadu_si128(CONST_M128_CAST(inBlocks)); + inBlocks += inIncrement; + block2 = _mm_loadu_si128(CONST_M128_CAST(inBlocks)); + inBlocks += inIncrement; + block3 = _mm_loadu_si128(CONST_M128_CAST(inBlocks)); + inBlocks += inIncrement; + } + + if (flags & BlockTransformation::BT_XorInput) + { + // Coverity finding, appears to be false positive. Assert the condition. 
+ CRYPTOPP_ASSERT(xorBlocks); + block0 = _mm_xor_si128(block0, _mm_loadu_si128(CONST_M128_CAST(xorBlocks))); + xorBlocks += xorIncrement; + block1 = _mm_xor_si128(block1, _mm_loadu_si128(CONST_M128_CAST(xorBlocks))); + xorBlocks += xorIncrement; + block2 = _mm_xor_si128(block2, _mm_loadu_si128(CONST_M128_CAST(xorBlocks))); + xorBlocks += xorIncrement; + block3 = _mm_xor_si128(block3, _mm_loadu_si128(CONST_M128_CAST(xorBlocks))); + xorBlocks += xorIncrement; + } + + func4(block0, block1, block2, block3, subkeys, static_cast(rounds)); + + if (xorBlocks && !(flags & BlockTransformation::BT_XorInput)) + { + block0 = _mm_xor_si128(block0, _mm_loadu_si128(CONST_M128_CAST(xorBlocks))); + xorBlocks += xorIncrement; + block1 = _mm_xor_si128(block1, _mm_loadu_si128(CONST_M128_CAST(xorBlocks))); + xorBlocks += xorIncrement; + block2 = _mm_xor_si128(block2, _mm_loadu_si128(CONST_M128_CAST(xorBlocks))); + xorBlocks += xorIncrement; + block3 = _mm_xor_si128(block3, _mm_loadu_si128(CONST_M128_CAST(xorBlocks))); + xorBlocks += xorIncrement; + } + + _mm_storeu_si128(M128_CAST(outBlocks), block0); + outBlocks += outIncrement; + _mm_storeu_si128(M128_CAST(outBlocks), block1); + outBlocks += outIncrement; + _mm_storeu_si128(M128_CAST(outBlocks), block2); + outBlocks += outIncrement; + _mm_storeu_si128(M128_CAST(outBlocks), block3); + outBlocks += outIncrement; + + length -= 4*blockSize; + } + } + + while (length >= blockSize) + { + __m128i block = _mm_loadu_si128(CONST_M128_CAST(inBlocks)); + + if (flags & BlockTransformation::BT_XorInput) + block = _mm_xor_si128(block, _mm_loadu_si128(CONST_M128_CAST(xorBlocks))); + + if (flags & BlockTransformation::BT_InBlockIsCounter) + const_cast(inBlocks)[15]++; + + func1(block, subkeys, static_cast(rounds)); + + if (xorBlocks && !(flags & BlockTransformation::BT_XorInput)) + block = _mm_xor_si128(block, _mm_loadu_si128(CONST_M128_CAST(xorBlocks))); + + _mm_storeu_si128(M128_CAST(outBlocks), block); + + inBlocks += inIncrement; + outBlocks += outIncrement; + xorBlocks += xorIncrement; + length -= blockSize; + } + + return length; +} + +size_t Rijndael_Enc_AdvancedProcessBlocks_AESNI(MAYBE_CONST word32 *subkeys, size_t rounds, + const byte *inBlocks, const byte *xorBlocks, byte *outBlocks, size_t length, word32 flags) +{ + return Rijndael_AdvancedProcessBlocks_AESNI(AESNI_Enc_Block, AESNI_Enc_4_Blocks, + subkeys, rounds, inBlocks, xorBlocks, outBlocks, length, flags); +} + +size_t Rijndael_Dec_AdvancedProcessBlocks_AESNI(MAYBE_CONST word32 *subkeys, size_t rounds, + const byte *inBlocks, const byte *xorBlocks, byte *outBlocks, size_t length, word32 flags) +{ + return Rijndael_AdvancedProcessBlocks_AESNI(AESNI_Dec_Block, AESNI_Dec_4_Blocks, + subkeys, rounds, inBlocks, xorBlocks, outBlocks, length, flags); +} + +void Rijndael_UncheckedSetKey_SSE4_AESNI(const byte *userKey, size_t keyLen, word32 *rk) +{ + const unsigned rounds = static_cast(keyLen/4 + 6); + static const word32 rcLE[] = { + 0x01, 0x02, 0x04, 0x08, + 0x10, 0x20, 0x40, 0x80, + 0x1B, 0x36, /* for 128-bit blocks, Rijndael never uses more than 10 rcon values */ + }; + + const word32 *ro = rcLE, *rc = rcLE; + CRYPTOPP_UNUSED(ro); + + __m128i temp = _mm_loadu_si128(M128_CAST(userKey+keyLen-16)); + std::memcpy(rk, userKey, keyLen); + + // keySize: m_key allocates 4*(rounds+1 word32's. 
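+    // In other words, one 4-word round key per round plus the initial
+    // AddRoundKey key: 44, 52 or 60 word32 values for AES-128, -192 and -256.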
+ const size_t keySize = 4*(rounds+1); + const word32* end = rk + keySize; + while (true) + { + CRYPTOPP_ASSERT(rc < ro + COUNTOF(rcLE)); + rk[keyLen/4] = rk[0] ^ _mm_extract_epi32(_mm_aeskeygenassist_si128(temp, 0), 3) ^ *(rc++); + rk[keyLen/4+1] = rk[1] ^ rk[keyLen/4]; + rk[keyLen/4+2] = rk[2] ^ rk[keyLen/4+1]; + rk[keyLen/4+3] = rk[3] ^ rk[keyLen/4+2]; + + if (rk + keyLen/4 + 4 == end) + break; + + if (keyLen == 24) + { + rk[10] = rk[ 4] ^ rk[ 9]; + rk[11] = rk[ 5] ^ rk[10]; + + CRYPTOPP_ASSERT(keySize >= 12); + temp = _mm_insert_epi32(temp, rk[11], 3); + } + else if (keyLen == 32) + { + CRYPTOPP_ASSERT(keySize >= 12); + temp = _mm_insert_epi32(temp, rk[11], 3); + rk[12] = rk[ 4] ^ _mm_extract_epi32(_mm_aeskeygenassist_si128(temp, 0), 2); + rk[13] = rk[ 5] ^ rk[12]; + rk[14] = rk[ 6] ^ rk[13]; + rk[15] = rk[ 7] ^ rk[14]; + + CRYPTOPP_ASSERT(keySize >= 16); + temp = _mm_insert_epi32(temp, rk[15], 3); + } + else + { + CRYPTOPP_ASSERT(keySize >= 8); + temp = _mm_insert_epi32(temp, rk[7], 3); + } + + rk += keyLen/4; + } +} + +void Rijndael_UncheckedSetKeyRev_SSE4_AESNI(word32 *key, unsigned int rounds) +{ + unsigned int i, j; + __m128i temp; + +#if defined(__SUNPRO_CC) && (__SUNPRO_CC <= 0x5120) + // __m128i is an unsigned long long[2], and support for swapping it was not added until C++11. + // SunCC 12.1 - 12.3 fail to consume the swap; while SunCC 12.4 consumes it without -std=c++11. + vec_swap(*(__m128i *)(key), *(__m128i *)(key+4*rounds)); +#else + std::swap(*M128_CAST(key), *M128_CAST(key+4*rounds)); +#endif + for (i = 4, j = 4*rounds-4; i < j; i += 4, j -= 4) + { + temp = _mm_aesimc_si128(*M128_CAST(key+i)); + *M128_CAST(key+i) = _mm_aesimc_si128(*M128_CAST(key+j)); + *M128_CAST(key+j) = temp; + } + + *M128_CAST(key+i) = _mm_aesimc_si128(*M128_CAST(key+i)); +} +#endif // CRYPTOPP_AESNI_AVAILABLE + +NAMESPACE_END diff --git a/rijndael.cpp b/rijndael.cpp index 85d63cf1c..71c6f9f1a 100644 --- a/rijndael.cpp +++ b/rijndael.cpp @@ -5,7 +5,7 @@ // use "cl /EP /P /DCRYPTOPP_GENERATE_X64_MASM rijndael.cpp" to generate MASM code /* -August 2017: Added support for ARMv8 AES instructions via compiler intrinsics. +July 2017: Added support for ARM AES instructions via compiler intrinsics. 
*/ /* @@ -85,13 +85,6 @@ NAMESPACE_BEGIN(CryptoPP) # define CRYPTOPP_ALLOW_RIJNDAEL_UNALIGNED_DATA_ACCESS 1 #endif -#if CRYPTOPP_BOOL_ARM_CRYPTO_INTRINSICS_AVAILABLE -static void Rijndael_Enc_ProcessAndXorBlock_ARMV8(const byte *inBlock, const byte *xorBlock, byte *outBlock, - const word32 *subKeys, unsigned int rounds); -static void Rijndael_Dec_ProcessAndXorBlock_ARMV8(const byte *inBlock, const byte *xorBlock, byte *outBlock, - const word32 *subKeys, unsigned int rounds); -#endif - // Hack for SunCC, http://github.com/weidai11/cryptopp/issues/224 #if (__SUNPRO_CC >= 0x5130) # define MAYBE_CONST @@ -229,123 +222,68 @@ void Rijndael::Base::FillDecTable() s_TdFilled = true; } -void Rijndael::Base::UncheckedSetKey(const byte *userKey, unsigned int keylen, const NameValuePairs &) +#if (CRYPTOPP_AESNI_AVAILABLE) +extern void Rijndael_UncheckedSetKey_SSE4_AESNI(const byte *userKey, size_t keyLen, word32* rk); +extern void Rijndael_UncheckedSetKeyRev_SSE4_AESNI(word32 *key, unsigned int rounds); + +extern size_t Rijndael_Enc_AdvancedProcessBlocks_AESNI(const word32 *subkeys, size_t rounds, + const byte *inBlocks, const byte *xorBlocks, byte *outBlocks, size_t length, word32 flags); +extern size_t Rijndael_Dec_AdvancedProcessBlocks_AESNI(const word32 *subkeys, size_t rounds, + const byte *inBlocks, const byte *xorBlocks, byte *outBlocks, size_t length, word32 flags); +#endif + +#if (CRYPTOPP_ARM_AES_AVAILABLE) +extern size_t Rijndael_Enc_AdvancedProcessBlocks_ARMV8(const word32 *subkeys, size_t rounds, + const byte *inBlocks, const byte *xorBlocks, byte *outBlocks, size_t length, word32 flags); +extern size_t Rijndael_Dec_AdvancedProcessBlocks_ARMV8(const word32 *subkeys, size_t rounds, + const byte *inBlocks, const byte *xorBlocks, byte *outBlocks, size_t length, word32 flags); +#endif + +void Rijndael::Base::UncheckedSetKey(const byte *userKey, unsigned int keyLen, const NameValuePairs &) { - AssertValidKeyLength(keylen); + AssertValidKeyLength(keyLen); - m_rounds = keylen/4 + 6; + m_rounds = keyLen/4 + 6; m_key.New(4*(m_rounds+1)); word32 *rk = m_key; -#if (CRYPTOPP_BOOL_AESNI_INTRINSICS_AVAILABLE && CRYPTOPP_BOOL_SSE4_INTRINSICS_AVAILABLE && (!defined(_MSC_VER) || _MSC_VER >= 1600 || CRYPTOPP_BOOL_X86 || CRYPTOPP_BOOL_X32)) +#if (CRYPTOPP_AESNI_AVAILABLE && CRYPTOPP_SSE41_AVAILABLE && (!defined(_MSC_VER) || _MSC_VER >= 1600 || CRYPTOPP_BOOL_X86 || CRYPTOPP_BOOL_X32)) // MSVC 2008 SP1 generates bad code for _mm_extract_epi32() when compiling for X64 - if (HasAESNI() && HasSSE4()) + if (HasAESNI() && HasSSE41()) { - static const word32 rcLE[] = { - 0x01, 0x02, 0x04, 0x08, - 0x10, 0x20, 0x40, 0x80, - 0x1B, 0x36, /* for 128-bit blocks, Rijndael never uses more than 10 rcon values */ - }; - - // Coverity finding, appears to be false positive. Assert the condition. - const word32 *ro = rcLE, *rc = rcLE; - CRYPTOPP_UNUSED(ro); - - __m128i temp = _mm_loadu_si128(M128I_CAST(userKey+keylen-16)); - memcpy(rk, userKey, keylen); - - while (true) - { - // Coverity finding, appears to be false positive. Assert the condition. - CRYPTOPP_ASSERT(rc < ro + COUNTOF(rcLE)); - rk[keylen/4] = rk[0] ^ _mm_extract_epi32(_mm_aeskeygenassist_si128(temp, 0), 3) ^ *(rc++); - rk[keylen/4+1] = rk[1] ^ rk[keylen/4]; - rk[keylen/4+2] = rk[2] ^ rk[keylen/4+1]; - rk[keylen/4+3] = rk[3] ^ rk[keylen/4+2]; - - if (rk + keylen/4 + 4 == m_key.end()) - break; - - if (keylen == 24) - { - rk[10] = rk[ 4] ^ rk[ 9]; - rk[11] = rk[ 5] ^ rk[10]; - // Coverity finding, appears to be false positive. Assert the condition. 
- CRYPTOPP_ASSERT(m_key.size() >= 12); - temp = _mm_insert_epi32(temp, rk[11], 3); - } - else if (keylen == 32) - { - // Coverity finding, appears to be false positive. Assert the condition. - CRYPTOPP_ASSERT(m_key.size() >= 12); - temp = _mm_insert_epi32(temp, rk[11], 3); - rk[12] = rk[ 4] ^ _mm_extract_epi32(_mm_aeskeygenassist_si128(temp, 0), 2); - rk[13] = rk[ 5] ^ rk[12]; - rk[14] = rk[ 6] ^ rk[13]; - rk[15] = rk[ 7] ^ rk[14]; - // Coverity finding, appears to be false positive. Assert the condition. - CRYPTOPP_ASSERT(m_key.size() >= 16); - temp = _mm_insert_epi32(temp, rk[15], 3); - } - else - { - // Coverity finding, appears to be false positive. Assert the condition. - CRYPTOPP_ASSERT(m_key.size() >= 8); - temp = _mm_insert_epi32(temp, rk[7], 3); - } - - rk += keylen/4; - } - + // TODO: Add non-SSE4.1 variant for low-end Atoms. The low-end + // Atoms have SSE2-SSSE3 and AES-NI, but not SSE4.1 or SSE4.2. + Rijndael_UncheckedSetKey_SSE4_AESNI(userKey, keyLen, rk); if (!IsForwardTransformation()) - { - rk = m_key; - unsigned int i, j; - -#if defined(__SUNPRO_CC) && (__SUNPRO_CC <= 0x5120) - // __m128i is an unsigned long long[2], and support for swapping it was not added until C++11. - // SunCC 12.1 - 12.3 fail to consume the swap; while SunCC 12.4 consumes it without -std=c++11. - vec_swap(*(__m128i *)(rk), *(__m128i *)(rk+4*m_rounds)); -#else - std::swap(*M128I_CAST(rk), *M128I_CAST(rk+4*m_rounds)); -#endif - for (i = 4, j = 4*m_rounds-4; i < j; i += 4, j -= 4) - { - temp = _mm_aesimc_si128(*M128I_CAST(rk+i)); - *M128I_CAST(rk+i) = _mm_aesimc_si128(*M128I_CAST(rk+j)); - *M128I_CAST(rk+j) = temp; - } - - *M128I_CAST(rk+i) = _mm_aesimc_si128(*M128I_CAST(rk+i)); - } + Rijndael_UncheckedSetKeyRev_SSE4_AESNI(m_key, m_rounds); return; } #endif - GetUserKey(BIG_ENDIAN_ORDER, rk, keylen/4, userKey, keylen); + GetUserKey(BIG_ENDIAN_ORDER, rk, keyLen/4, userKey, keyLen); const word32 *rc = rcon; word32 temp; while (true) { - temp = rk[keylen/4-1]; + temp = rk[keyLen/4-1]; word32 x = (word32(Se[GETBYTE(temp, 2)]) << 24) ^ (word32(Se[GETBYTE(temp, 1)]) << 16) ^ (word32(Se[GETBYTE(temp, 0)]) << 8) ^ Se[GETBYTE(temp, 3)]; - rk[keylen/4] = rk[0] ^ x ^ *(rc++); - rk[keylen/4+1] = rk[1] ^ rk[keylen/4]; - rk[keylen/4+2] = rk[2] ^ rk[keylen/4+1]; - rk[keylen/4+3] = rk[3] ^ rk[keylen/4+2]; + rk[keyLen/4] = rk[0] ^ x ^ *(rc++); + rk[keyLen/4+1] = rk[1] ^ rk[keyLen/4]; + rk[keyLen/4+2] = rk[2] ^ rk[keyLen/4+1]; + rk[keyLen/4+3] = rk[3] ^ rk[keyLen/4+2]; - if (rk + keylen/4 + 4 == m_key.end()) + if (rk + keyLen/4 + 4 == m_key.end()) break; - if (keylen == 24) + if (keyLen == 24) { rk[10] = rk[ 4] ^ rk[ 9]; rk[11] = rk[ 5] ^ rk[10]; } - else if (keylen == 32) + else if (keyLen == 32) { temp = rk[11]; rk[12] = rk[ 4] ^ (word32(Se[GETBYTE(temp, 3)]) << 24) ^ (word32(Se[GETBYTE(temp, 2)]) << 16) ^ (word32(Se[GETBYTE(temp, 1)]) << 8) ^ Se[GETBYTE(temp, 0)]; @@ -353,7 +291,7 @@ void Rijndael::Base::UncheckedSetKey(const byte *userKey, unsigned int keylen, c rk[14] = rk[ 6] ^ rk[13]; rk[15] = rk[ 7] ^ rk[14]; } - rk += keylen/4; + rk += keyLen/4; } rk = m_key; @@ -394,11 +332,11 @@ void Rijndael::Base::UncheckedSetKey(const byte *userKey, unsigned int keylen, c temp = ConditionalByteReverse(BIG_ENDIAN_ORDER, rk[3]); rk[3] = ConditionalByteReverse(BIG_ENDIAN_ORDER, rk[4*m_rounds+3]); rk[4*m_rounds+3] = temp; } -#if CRYPTOPP_BOOL_AESNI_INTRINSICS_AVAILABLE +#if CRYPTOPP_AESNI_AVAILABLE if (HasAESNI()) ConditionalByteReverse(BIG_ENDIAN_ORDER, rk+4, rk+4, (m_rounds-1)*16); #endif -#if 
CRYPTOPP_BOOL_ARM_CRYPTO_INTRINSICS_AVAILABLE +#if CRYPTOPP_ARM_AES_AVAILABLE if (HasAES()) ConditionalByteReverse(BIG_ENDIAN_ORDER, rk+4, rk+4, (m_rounds-1)*16); #endif @@ -406,20 +344,22 @@ void Rijndael::Base::UncheckedSetKey(const byte *userKey, unsigned int keylen, c void Rijndael::Enc::ProcessAndXorBlock(const byte *inBlock, const byte *xorBlock, byte *outBlock) const { -#if CRYPTOPP_BOOL_SSE2_ASM_AVAILABLE || defined(CRYPTOPP_X64_MASM_AVAILABLE) || CRYPTOPP_BOOL_AESNI_INTRINSICS_AVAILABLE -#if (CRYPTOPP_BOOL_SSE2_ASM_AVAILABLE || defined(CRYPTOPP_X64_MASM_AVAILABLE)) && !defined(CRYPTOPP_DISABLE_RIJNDAEL_ASM) +#if CRYPTOPP_BOOL_SSE2_ASM_AVAILABLE || defined(CRYPTOPP_X64_MASM_AVAILABLE) || CRYPTOPP_AESNI_AVAILABLE +# if (CRYPTOPP_BOOL_SSE2_ASM_AVAILABLE || defined(CRYPTOPP_X64_MASM_AVAILABLE)) && !defined(CRYPTOPP_DISABLE_RIJNDAEL_ASM) if (HasSSE2()) -#else +# else if (HasAESNI()) -#endif +# endif { - return (void)Rijndael::Enc::AdvancedProcessBlocks(inBlock, xorBlock, outBlock, 16, 0); + (void)Rijndael::Enc::AdvancedProcessBlocks(inBlock, xorBlock, outBlock, 16, 0); + return; } #endif -#if CRYPTOPP_BOOL_ARM_CRYPTO_INTRINSICS_AVAILABLE + +#if (CRYPTOPP_ARM_AES_AVAILABLE) if (HasAES()) { - Rijndael_Enc_ProcessAndXorBlock_ARMV8(inBlock, xorBlock, outBlock, m_key.begin(), m_rounds); + (void)Rijndael::Enc::AdvancedProcessBlocks(inBlock, xorBlock, outBlock, 16, 0); return; } #endif @@ -494,17 +434,18 @@ void Rijndael::Enc::ProcessAndXorBlock(const byte *inBlock, const byte *xorBlock void Rijndael::Dec::ProcessAndXorBlock(const byte *inBlock, const byte *xorBlock, byte *outBlock) const { -#if CRYPTOPP_BOOL_AESNI_INTRINSICS_AVAILABLE +#if CRYPTOPP_AESNI_AVAILABLE if (HasAESNI()) { - Rijndael::Dec::AdvancedProcessBlocks(inBlock, xorBlock, outBlock, 16, 0); + (void)Rijndael::Dec::AdvancedProcessBlocks(inBlock, xorBlock, outBlock, 16, 0); return; } #endif -#if CRYPTOPP_BOOL_ARM_CRYPTO_INTRINSICS_AVAILABLE + +#if (CRYPTOPP_ARM_AES_AVAILABLE) if (HasAES()) { - Rijndael_Dec_ProcessAndXorBlock_ARMV8(inBlock, xorBlock, outBlock, m_key.begin(), m_rounds); + (void)Rijndael::Dec::AdvancedProcessBlocks(inBlock, xorBlock, outBlock, 16, 0); return; } #endif @@ -1115,191 +1056,6 @@ static inline bool AliasedWithTable(const byte *begin, const byte *end) return (s0 < t1 || s1 <= t1) || (s0 >= t0 || s1 > t0); } -#if CRYPTOPP_BOOL_AESNI_INTRINSICS_AVAILABLE - -inline void AESNI_Enc_Block(__m128i &block, MAYBE_CONST __m128i *subkeys, unsigned int rounds) -{ - block = _mm_xor_si128(block, subkeys[0]); - for (unsigned int i=1; i -inline size_t AESNI_AdvancedProcessBlocks(F1 func1, F4 func4, MAYBE_CONST __m128i *subkeys, unsigned int rounds, const byte *inBlocks, const byte *xorBlocks, byte *outBlocks, size_t length, word32 flags) -{ - size_t blockSize = 16; - size_t inIncrement = (flags & (BlockTransformation::BT_InBlockIsCounter|BlockTransformation::BT_DontIncrementInOutPointers)) ? 0 : blockSize; - size_t xorIncrement = xorBlocks ? blockSize : 0; - size_t outIncrement = (flags & BlockTransformation::BT_DontIncrementInOutPointers) ? 
0 : blockSize; - - if (flags & BlockTransformation::BT_ReverseDirection) - { - CRYPTOPP_ASSERT(length % blockSize == 0); - inBlocks += length - blockSize; - xorBlocks += length - blockSize; - outBlocks += length - blockSize; - inIncrement = 0-inIncrement; - xorIncrement = 0-xorIncrement; - outIncrement = 0-outIncrement; - } - - if (flags & BlockTransformation::BT_AllowParallel) - { - while (length >= 4*blockSize) - { - __m128i block0 = _mm_loadu_si128(CONST_M128I_CAST(inBlocks)), block1, block2, block3; - if (flags & BlockTransformation::BT_InBlockIsCounter) - { - const __m128i be1 = *CONST_M128I_CAST(s_one); - block1 = _mm_add_epi32(block0, be1); - block2 = _mm_add_epi32(block1, be1); - block3 = _mm_add_epi32(block2, be1); - _mm_storeu_si128(M128I_CAST(inBlocks), _mm_add_epi32(block3, be1)); - } - else - { - inBlocks += inIncrement; - block1 = _mm_loadu_si128(CONST_M128I_CAST(inBlocks)); - inBlocks += inIncrement; - block2 = _mm_loadu_si128(CONST_M128I_CAST(inBlocks)); - inBlocks += inIncrement; - block3 = _mm_loadu_si128(CONST_M128I_CAST(inBlocks)); - inBlocks += inIncrement; - } - - if (flags & BlockTransformation::BT_XorInput) - { - // Coverity finding, appears to be false positive. Assert the condition. - CRYPTOPP_ASSERT(xorBlocks); - block0 = _mm_xor_si128(block0, _mm_loadu_si128(CONST_M128I_CAST(xorBlocks))); - xorBlocks += xorIncrement; - block1 = _mm_xor_si128(block1, _mm_loadu_si128(CONST_M128I_CAST(xorBlocks))); - xorBlocks += xorIncrement; - block2 = _mm_xor_si128(block2, _mm_loadu_si128(CONST_M128I_CAST(xorBlocks))); - xorBlocks += xorIncrement; - block3 = _mm_xor_si128(block3, _mm_loadu_si128(CONST_M128I_CAST(xorBlocks))); - xorBlocks += xorIncrement; - } - - func4(block0, block1, block2, block3, subkeys, rounds); - - if (xorBlocks && !(flags & BlockTransformation::BT_XorInput)) - { - block0 = _mm_xor_si128(block0, _mm_loadu_si128(CONST_M128I_CAST(xorBlocks))); - xorBlocks += xorIncrement; - block1 = _mm_xor_si128(block1, _mm_loadu_si128(CONST_M128I_CAST(xorBlocks))); - xorBlocks += xorIncrement; - block2 = _mm_xor_si128(block2, _mm_loadu_si128(CONST_M128I_CAST(xorBlocks))); - xorBlocks += xorIncrement; - block3 = _mm_xor_si128(block3, _mm_loadu_si128(CONST_M128I_CAST(xorBlocks))); - xorBlocks += xorIncrement; - } - - _mm_storeu_si128(M128I_CAST(outBlocks), block0); - outBlocks += outIncrement; - _mm_storeu_si128(M128I_CAST(outBlocks), block1); - outBlocks += outIncrement; - _mm_storeu_si128(M128I_CAST(outBlocks), block2); - outBlocks += outIncrement; - _mm_storeu_si128(M128I_CAST(outBlocks), block3); - outBlocks += outIncrement; - - length -= 4*blockSize; - } - } - - while (length >= blockSize) - { - __m128i block = _mm_loadu_si128(CONST_M128I_CAST(inBlocks)); - - if (flags & BlockTransformation::BT_XorInput) - block = _mm_xor_si128(block, _mm_loadu_si128(CONST_M128I_CAST(xorBlocks))); - - if (flags & BlockTransformation::BT_InBlockIsCounter) - const_cast(inBlocks)[15]++; - - func1(block, subkeys, rounds); - - if (xorBlocks && !(flags & BlockTransformation::BT_XorInput)) - block = _mm_xor_si128(block, _mm_loadu_si128(CONST_M128I_CAST(xorBlocks))); - - _mm_storeu_si128(M128I_CAST(outBlocks), block); - - inBlocks += inIncrement; - outBlocks += outIncrement; - xorBlocks += xorIncrement; - length -= blockSize; - } - - return length; -} -#endif - -#if CRYPTOPP_BOOL_X64 || CRYPTOPP_BOOL_X32 || CRYPTOPP_BOOL_X86 struct Locals { word32 subkeys[4*12], workspace[8]; @@ -1314,13 +1070,24 @@ const size_t s_aliasBlockSize = 256; const size_t s_sizeToAllocate = s_aliasPageSize + 
s_aliasBlockSize + sizeof(Locals); Rijndael::Enc::Enc() : m_aliasBlock(s_sizeToAllocate) { } + +#endif // CRYPTOPP_BOOL_X64 || CRYPTOPP_BOOL_X32 || CRYPTOPP_BOOL_X86 + +#if CRYPTOPP_BOOL_ARM32 || CRYPTOPP_BOOL_ARM64 +// Do nothing +Rijndael::Enc::Enc() { } #endif +#if CRYPTOPP_ENABLE_ADVANCED_PROCESS_BLOCKS size_t Rijndael::Enc::AdvancedProcessBlocks(const byte *inBlocks, const byte *xorBlocks, byte *outBlocks, size_t length, word32 flags) const { -#if CRYPTOPP_BOOL_AESNI_INTRINSICS_AVAILABLE +#if CRYPTOPP_AESNI_AVAILABLE if (HasAESNI()) - return AESNI_AdvancedProcessBlocks(AESNI_Enc_Block, AESNI_Enc_4_Blocks, (MAYBE_CONST __m128i *)(const void *)m_key.begin(), m_rounds, inBlocks, xorBlocks, outBlocks, length, flags); + return Rijndael_Enc_AdvancedProcessBlocks_AESNI(m_key, m_rounds, inBlocks, xorBlocks, outBlocks, length, flags); +#endif +#if CRYPTOPP_ARM_AES_AVAILABLE + if (HasAES()) + return Rijndael_Enc_AdvancedProcessBlocks_ARMV8(m_key, m_rounds, inBlocks, xorBlocks, outBlocks, length, flags); #endif #if (CRYPTOPP_BOOL_SSE2_ASM_AVAILABLE || defined(CRYPTOPP_X64_MASM_AVAILABLE)) && !defined(CRYPTOPP_DISABLE_RIJNDAEL_ASM) @@ -1375,116 +1142,21 @@ size_t Rijndael::Enc::AdvancedProcessBlocks(const byte *inBlocks, const byte *xo return BlockTransformation::AdvancedProcessBlocks(inBlocks, xorBlocks, outBlocks, length, flags); } -#endif - -#if CRYPTOPP_BOOL_X64 || CRYPTOPP_BOOL_X32 || CRYPTOPP_BOOL_X86 size_t Rijndael::Dec::AdvancedProcessBlocks(const byte *inBlocks, const byte *xorBlocks, byte *outBlocks, size_t length, word32 flags) const { -#if CRYPTOPP_BOOL_AESNI_INTRINSICS_AVAILABLE +#if CRYPTOPP_AESNI_AVAILABLE if (HasAESNI()) - return AESNI_AdvancedProcessBlocks(AESNI_Dec_Block, AESNI_Dec_4_Blocks, (MAYBE_CONST __m128i *)(const void *)m_key.begin(), m_rounds, inBlocks, xorBlocks, outBlocks, length, flags); + return Rijndael_Dec_AdvancedProcessBlocks_AESNI(m_key, m_rounds, inBlocks, xorBlocks, outBlocks, length, flags); #endif - return BlockTransformation::AdvancedProcessBlocks(inBlocks, xorBlocks, outBlocks, length, flags); -} -#endif // CRYPTOPP_BOOL_X64 || CRYPTOPP_BOOL_X32 || CRYPTOPP_BOOL_X86 - -#if CRYPTOPP_BOOL_ARM_CRYPTO_INTRINSICS_AVAILABLE - -void Rijndael_Enc_ProcessAndXorBlock_ARMV8(const byte *inBlock, const byte *xorBlock, byte *outBlock, - const word32 *subKeys, unsigned int rounds) -{ - uint8x16_t data = vld1q_u8(inBlock); - const byte *keys = reinterpret_cast(subKeys); - - // Unroll the loop, profit 0.3 to 0.5 cpb. - data = vaeseq_u8(data, vld1q_u8(keys+0)); - data = vaesmcq_u8(data); - data = vaeseq_u8(data, vld1q_u8(keys+16)); - data = vaesmcq_u8(data); - data = vaeseq_u8(data, vld1q_u8(keys+32)); - data = vaesmcq_u8(data); - data = vaeseq_u8(data, vld1q_u8(keys+48)); - data = vaesmcq_u8(data); - data = vaeseq_u8(data, vld1q_u8(keys+64)); - data = vaesmcq_u8(data); - data = vaeseq_u8(data, vld1q_u8(keys+80)); - data = vaesmcq_u8(data); - data = vaeseq_u8(data, vld1q_u8(keys+96)); - data = vaesmcq_u8(data); - data = vaeseq_u8(data, vld1q_u8(keys+112)); - data = vaesmcq_u8(data); - data = vaeseq_u8(data, vld1q_u8(keys+128)); - data = vaesmcq_u8(data); - - unsigned int i=9; - for ( ; i(subKeys); - - // Unroll the loop, profit 0.3 to 0.5 cpb. 
- data = vaesdq_u8(data, vld1q_u8(keys+0)); - data = vaesimcq_u8(data); - data = vaesdq_u8(data, vld1q_u8(keys+16)); - data = vaesimcq_u8(data); - data = vaesdq_u8(data, vld1q_u8(keys+32)); - data = vaesimcq_u8(data); - data = vaesdq_u8(data, vld1q_u8(keys+48)); - data = vaesimcq_u8(data); - data = vaesdq_u8(data, vld1q_u8(keys+64)); - data = vaesimcq_u8(data); - data = vaesdq_u8(data, vld1q_u8(keys+80)); - data = vaesimcq_u8(data); - data = vaesdq_u8(data, vld1q_u8(keys+96)); - data = vaesimcq_u8(data); - data = vaesdq_u8(data, vld1q_u8(keys+112)); - data = vaesimcq_u8(data); - data = vaesdq_u8(data, vld1q_u8(keys+128)); - data = vaesimcq_u8(data); - - unsigned int i=9; - for ( ; i when using Apple Clang. Early Apple compilers +// fail to compile with included. Later Apple compilers compile +// intrinsics without included. +#if (CRYPTOPP_ARM_SHA_AVAILABLE) && !defined(CRYPTOPP_APPLE_CLANG_VERSION) +# include "arm_acle.h" +#endif + +#ifdef CRYPTOPP_GNU_STYLE_INLINE_ASSEMBLY +# include +# include +#endif + +#ifndef EXCEPTION_EXECUTE_HANDLER +# define EXCEPTION_EXECUTE_HANDLER 1 +#endif + +// Clang __m128i casts +#define M128_CAST(x) ((__m128i *)(void *)(x)) +#define CONST_M128_CAST(x) ((const __m128i *)(const void *)(x)) + +NAMESPACE_BEGIN(CryptoPP) + +#ifdef CRYPTOPP_GNU_STYLE_INLINE_ASSEMBLY +extern "C" { + typedef void (*SigHandler)(int); + + static jmp_buf s_jmpSIGILL; + static void SigIllHandler(int) + { + longjmp(s_jmpSIGILL, 1); + } +}; +#endif // Not CRYPTOPP_MS_STYLE_INLINE_ASSEMBLY + +#if (CRYPTOPP_BOOL_ARM32 || CRYPTOPP_BOOL_ARM64) +bool CPU_ProbeSHA1() +{ +#if (CRYPTOPP_ARM_SHA_AVAILABLE) +# if defined(CRYPTOPP_MS_STYLE_INLINE_ASSEMBLY) + volatile bool result = true; + __try + { + uint32x4_t data1 = {1,2,3,4}, data2 = {5,6,7,8}, data3 = {9,10,11,12}; + + uint32x4_t r1 = vsha1cq_u32 (data1, 0, data2); + uint32x4_t r2 = vsha1mq_u32 (data1, 0, data2); + uint32x4_t r3 = vsha1pq_u32 (data1, 0, data2); + uint32x4_t r4 = vsha1su0q_u32 (data1, data2, data3); + uint32x4_t r5 = vsha1su1q_u32 (data1, data2); + + result = !!(vgetq_lane_u32(r1,0) | vgetq_lane_u32(r2,1) | vgetq_lane_u32(r3,2) | vgetq_lane_u32(r4,3) | vgetq_lane_u32(r5,0)); + } + __except (EXCEPTION_EXECUTE_HANDLER) + { + return false; + } + return result; +# else + + // longjmp and clobber warnings. Volatile is required. 
+ // http://github.com/weidai11/cryptopp/issues/24 and http://stackoverflow.com/q/7721854 + volatile bool result = true; + + volatile SigHandler oldHandler = signal(SIGILL, SigIllHandler); + if (oldHandler == SIG_ERR) + return false; + + volatile sigset_t oldMask; + if (sigprocmask(0, NULLPTR, (sigset_t*)&oldMask)) + return false; + + if (setjmp(s_jmpSIGILL)) + result = false; + else + { + uint32x4_t data1 = {1,2,3,4}, data2 = {5,6,7,8}, data3 = {9,10,11,12}; + + uint32x4_t r1 = vsha1cq_u32 (data1, 0, data2); + uint32x4_t r2 = vsha1mq_u32 (data1, 0, data2); + uint32x4_t r3 = vsha1pq_u32 (data1, 0, data2); + uint32x4_t r4 = vsha1su0q_u32 (data1, data2, data3); + uint32x4_t r5 = vsha1su1q_u32 (data1, data2); + + result = !!(vgetq_lane_u32(r1,0) | vgetq_lane_u32(r2,1) | vgetq_lane_u32(r3,2) | vgetq_lane_u32(r4,3) | vgetq_lane_u32(r5,0)); + } + + sigprocmask(SIG_SETMASK, (sigset_t*)&oldMask, NULLPTR); + signal(SIGILL, oldHandler); + return result; +# endif +#else + return false; +#endif // CRYPTOPP_ARM_SHA_AVAILABLE +} + +bool CPU_ProbeSHA2() +{ +#if (CRYPTOPP_ARM_SHA_AVAILABLE) +# if defined(CRYPTOPP_MS_STYLE_INLINE_ASSEMBLY) + volatile bool result = true; + __try + { + uint32x4_t data1 = {1,2,3,4}, data2 = {5,6,7,8}, data3 = {9,10,11,12}; + + uint32x4_t r1 = vsha256hq_u32 (data1, data2, data3); + uint32x4_t r2 = vsha256h2q_u32 (data1, data2, data3); + uint32x4_t r3 = vsha256su0q_u32 (data1, data2); + uint32x4_t r4 = vsha256su1q_u32 (data1, data2, data3); + + result = !!(vgetq_lane_u32(r1,0) | vgetq_lane_u32(r2,1) | vgetq_lane_u32(r3,2) | vgetq_lane_u32(r4,3)); + } + __except (EXCEPTION_EXECUTE_HANDLER) + { + return false; + } + return result; +#else + + // longjmp and clobber warnings. Volatile is required. + // http://github.com/weidai11/cryptopp/issues/24 and http://stackoverflow.com/q/7721854 + volatile bool result = true; + + volatile SigHandler oldHandler = signal(SIGILL, SigIllHandler); + if (oldHandler == SIG_ERR) + return false; + + volatile sigset_t oldMask; + if (sigprocmask(0, NULLPTR, (sigset_t*)&oldMask)) + return false; + + if (setjmp(s_jmpSIGILL)) + result = false; + else + { + uint32x4_t data1 = {1,2,3,4}, data2 = {5,6,7,8}, data3 = {9,10,11,12}; + + uint32x4_t r1 = vsha256hq_u32 (data1, data2, data3); + uint32x4_t r2 = vsha256h2q_u32 (data1, data2, data3); + uint32x4_t r3 = vsha256su0q_u32 (data1, data2); + uint32x4_t r4 = vsha256su1q_u32 (data1, data2, data3); + + result = !!(vgetq_lane_u32(r1,0) | vgetq_lane_u32(r2,1) | vgetq_lane_u32(r3,2) | vgetq_lane_u32(r4,3)); + } + + sigprocmask(SIG_SETMASK, (sigset_t*)&oldMask, NULLPTR); + signal(SIGILL, oldHandler); + return result; +# endif +#else + return false; +#endif // CRYPTOPP_ARM_SHA_AVAILABLE +} +#endif // ARM32 or ARM64 + +extern const word32 SHA256_K[64]; + +/////////////////////////////////// +// start of Walton/Gulley's code // +/////////////////////////////////// + +#if CRYPTOPP_SHANI_AVAILABLE +// Based on http://software.intel.com/en-us/articles/intel-sha-extensions and code by Sean Gulley. 
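+// A note on the intrinsics used below: _mm_sha1rnds4_epu32 executes four SHA-1
+// rounds at a time (its immediate selects which of the four round functions and
+// constants applies), _mm_sha1nexte_epu32 computes the next E input (a 30-bit
+// left rotation of the previous A) and adds it into the scheduled message words,
+// and _mm_sha1msg1_epu32/_mm_sha1msg2_epu32 split the W[16..79] message
+// expansion between them, so the whole schedule stays in XMM registers.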
+void SHA1_HashMultipleBlocks_SHANI(word32 *state, const word32 *data, size_t length, ByteOrder order) +{ + CRYPTOPP_ASSERT(state); + CRYPTOPP_ASSERT(data); + CRYPTOPP_ASSERT(length >= SHA1::BLOCKSIZE); + + __m128i ABCD, ABCD_SAVE, E0, E0_SAVE, E1; + __m128i MASK, MSG0, MSG1, MSG2, MSG3; + + // Load initial values + ABCD = _mm_loadu_si128(CONST_M128_CAST(state)); + E0 = _mm_set_epi32(state[4], 0, 0, 0); + ABCD = _mm_shuffle_epi32(ABCD, 0x1B); + + // IA-32 SHA is little endian, SHA::Transform is big endian, + // and SHA::HashMultipleBlocks can be either. ByteOrder + // allows us to avoid extra endian reversals. It saves 1.0 cpb. + MASK = order == BIG_ENDIAN_ORDER ? // Data arrangement + _mm_set_epi8(0,1,2,3, 4,5,6,7, 8,9,10,11, 12,13,14,15) : + _mm_set_epi8(3,2,1,0, 7,6,5,4, 11,10,9,8, 15,14,13,12) ; + + while (length >= SHA1::BLOCKSIZE) + { + // Save current hash + ABCD_SAVE = ABCD; + E0_SAVE = E0; + + // Rounds 0-3 + MSG0 = _mm_loadu_si128(CONST_M128_CAST(data+0)); + MSG0 = _mm_shuffle_epi8(MSG0, MASK); + E0 = _mm_add_epi32(E0, MSG0); + E1 = ABCD; + ABCD = _mm_sha1rnds4_epu32(ABCD, E0, 0); + + // Rounds 4-7 + MSG1 = _mm_loadu_si128(CONST_M128_CAST(data+4)); + MSG1 = _mm_shuffle_epi8(MSG1, MASK); + E1 = _mm_sha1nexte_epu32(E1, MSG1); + E0 = ABCD; + ABCD = _mm_sha1rnds4_epu32(ABCD, E1, 0); + MSG0 = _mm_sha1msg1_epu32(MSG0, MSG1); + + // Rounds 8-11 + MSG2 = _mm_loadu_si128(CONST_M128_CAST(data+8)); + MSG2 = _mm_shuffle_epi8(MSG2, MASK); + E0 = _mm_sha1nexte_epu32(E0, MSG2); + E1 = ABCD; + ABCD = _mm_sha1rnds4_epu32(ABCD, E0, 0); + MSG1 = _mm_sha1msg1_epu32(MSG1, MSG2); + MSG0 = _mm_xor_si128(MSG0, MSG2); + + // Rounds 12-15 + MSG3 = _mm_loadu_si128(CONST_M128_CAST(data+12)); + MSG3 = _mm_shuffle_epi8(MSG3, MASK); + E1 = _mm_sha1nexte_epu32(E1, MSG3); + E0 = ABCD; + MSG0 = _mm_sha1msg2_epu32(MSG0, MSG3); + ABCD = _mm_sha1rnds4_epu32(ABCD, E1, 0); + MSG2 = _mm_sha1msg1_epu32(MSG2, MSG3); + MSG1 = _mm_xor_si128(MSG1, MSG3); + + // Rounds 16-19 + E0 = _mm_sha1nexte_epu32(E0, MSG0); + E1 = ABCD; + MSG1 = _mm_sha1msg2_epu32(MSG1, MSG0); + ABCD = _mm_sha1rnds4_epu32(ABCD, E0, 0); + MSG3 = _mm_sha1msg1_epu32(MSG3, MSG0); + MSG2 = _mm_xor_si128(MSG2, MSG0); + + // Rounds 20-23 + E1 = _mm_sha1nexte_epu32(E1, MSG1); + E0 = ABCD; + MSG2 = _mm_sha1msg2_epu32(MSG2, MSG1); + ABCD = _mm_sha1rnds4_epu32(ABCD, E1, 1); + MSG0 = _mm_sha1msg1_epu32(MSG0, MSG1); + MSG3 = _mm_xor_si128(MSG3, MSG1); + + // Rounds 24-27 + E0 = _mm_sha1nexte_epu32(E0, MSG2); + E1 = ABCD; + MSG3 = _mm_sha1msg2_epu32(MSG3, MSG2); + ABCD = _mm_sha1rnds4_epu32(ABCD, E0, 1); + MSG1 = _mm_sha1msg1_epu32(MSG1, MSG2); + MSG0 = _mm_xor_si128(MSG0, MSG2); + + // Rounds 28-31 + E1 = _mm_sha1nexte_epu32(E1, MSG3); + E0 = ABCD; + MSG0 = _mm_sha1msg2_epu32(MSG0, MSG3); + ABCD = _mm_sha1rnds4_epu32(ABCD, E1, 1); + MSG2 = _mm_sha1msg1_epu32(MSG2, MSG3); + MSG1 = _mm_xor_si128(MSG1, MSG3); + + // Rounds 32-35 + E0 = _mm_sha1nexte_epu32(E0, MSG0); + E1 = ABCD; + MSG1 = _mm_sha1msg2_epu32(MSG1, MSG0); + ABCD = _mm_sha1rnds4_epu32(ABCD, E0, 1); + MSG3 = _mm_sha1msg1_epu32(MSG3, MSG0); + MSG2 = _mm_xor_si128(MSG2, MSG0); + + // Rounds 36-39 + E1 = _mm_sha1nexte_epu32(E1, MSG1); + E0 = ABCD; + MSG2 = _mm_sha1msg2_epu32(MSG2, MSG1); + ABCD = _mm_sha1rnds4_epu32(ABCD, E1, 1); + MSG0 = _mm_sha1msg1_epu32(MSG0, MSG1); + MSG3 = _mm_xor_si128(MSG3, MSG1); + + // Rounds 40-43 + E0 = _mm_sha1nexte_epu32(E0, MSG2); + E1 = ABCD; + MSG3 = _mm_sha1msg2_epu32(MSG3, MSG2); + ABCD = _mm_sha1rnds4_epu32(ABCD, E0, 2); + MSG1 = _mm_sha1msg1_epu32(MSG1, MSG2); + MSG0 = 
_mm_xor_si128(MSG0, MSG2); + + // Rounds 44-47 + E1 = _mm_sha1nexte_epu32(E1, MSG3); + E0 = ABCD; + MSG0 = _mm_sha1msg2_epu32(MSG0, MSG3); + ABCD = _mm_sha1rnds4_epu32(ABCD, E1, 2); + MSG2 = _mm_sha1msg1_epu32(MSG2, MSG3); + MSG1 = _mm_xor_si128(MSG1, MSG3); + + // Rounds 48-51 + E0 = _mm_sha1nexte_epu32(E0, MSG0); + E1 = ABCD; + MSG1 = _mm_sha1msg2_epu32(MSG1, MSG0); + ABCD = _mm_sha1rnds4_epu32(ABCD, E0, 2); + MSG3 = _mm_sha1msg1_epu32(MSG3, MSG0); + MSG2 = _mm_xor_si128(MSG2, MSG0); + + // Rounds 52-55 + E1 = _mm_sha1nexte_epu32(E1, MSG1); + E0 = ABCD; + MSG2 = _mm_sha1msg2_epu32(MSG2, MSG1); + ABCD = _mm_sha1rnds4_epu32(ABCD, E1, 2); + MSG0 = _mm_sha1msg1_epu32(MSG0, MSG1); + MSG3 = _mm_xor_si128(MSG3, MSG1); + + // Rounds 56-59 + E0 = _mm_sha1nexte_epu32(E0, MSG2); + E1 = ABCD; + MSG3 = _mm_sha1msg2_epu32(MSG3, MSG2); + ABCD = _mm_sha1rnds4_epu32(ABCD, E0, 2); + MSG1 = _mm_sha1msg1_epu32(MSG1, MSG2); + MSG0 = _mm_xor_si128(MSG0, MSG2); + + // Rounds 60-63 + E1 = _mm_sha1nexte_epu32(E1, MSG3); + E0 = ABCD; + MSG0 = _mm_sha1msg2_epu32(MSG0, MSG3); + ABCD = _mm_sha1rnds4_epu32(ABCD, E1, 3); + MSG2 = _mm_sha1msg1_epu32(MSG2, MSG3); + MSG1 = _mm_xor_si128(MSG1, MSG3); + + // Rounds 64-67 + E0 = _mm_sha1nexte_epu32(E0, MSG0); + E1 = ABCD; + MSG1 = _mm_sha1msg2_epu32(MSG1, MSG0); + ABCD = _mm_sha1rnds4_epu32(ABCD, E0, 3); + MSG3 = _mm_sha1msg1_epu32(MSG3, MSG0); + MSG2 = _mm_xor_si128(MSG2, MSG0); + + // Rounds 68-71 + E1 = _mm_sha1nexte_epu32(E1, MSG1); + E0 = ABCD; + MSG2 = _mm_sha1msg2_epu32(MSG2, MSG1); + ABCD = _mm_sha1rnds4_epu32(ABCD, E1, 3); + MSG3 = _mm_xor_si128(MSG3, MSG1); + + // Rounds 72-75 + E0 = _mm_sha1nexte_epu32(E0, MSG2); + E1 = ABCD; + MSG3 = _mm_sha1msg2_epu32(MSG3, MSG2); + ABCD = _mm_sha1rnds4_epu32(ABCD, E0, 3); + + // Rounds 76-79 + E1 = _mm_sha1nexte_epu32(E1, MSG3); + E0 = ABCD; + ABCD = _mm_sha1rnds4_epu32(ABCD, E1, 3); + + // Add values back to state + E0 = _mm_sha1nexte_epu32(E0, E0_SAVE); + ABCD = _mm_add_epi32(ABCD, ABCD_SAVE); + + data += SHA1::BLOCKSIZE/sizeof(word32); + length -= SHA1::BLOCKSIZE; + } + + // Save state + ABCD = _mm_shuffle_epi32(ABCD, 0x1B); + _mm_storeu_si128(M128_CAST(state), ABCD); + state[4] = _mm_extract_epi32(E0, 3); +} + +// Based on http://software.intel.com/en-us/articles/intel-sha-extensions and code by Sean Gulley. +void SHA256_HashMultipleBlocks_SHANI(word32 *state, const word32 *data, size_t length, ByteOrder order) +{ + CRYPTOPP_ASSERT(state); + CRYPTOPP_ASSERT(data); + CRYPTOPP_ASSERT(length >= SHA256::BLOCKSIZE); + + __m128i STATE0, STATE1; + __m128i MSG, TMP, MASK; + __m128i TMSG0, TMSG1, TMSG2, TMSG3; + __m128i ABEF_SAVE, CDGH_SAVE; + + // Load initial values + TMP = _mm_loadu_si128(M128_CAST(&state[0])); + STATE1 = _mm_loadu_si128(M128_CAST(&state[4])); + + // IA-32 SHA is little endian, SHA::Transform is big endian, + // and SHA::HashMultipleBlocks can be either. ByteOrder + // allows us to avoid extra endian reversals. It saves 1.0 cpb. + MASK = order == BIG_ENDIAN_ORDER ? 
// Data arrangement + _mm_set_epi8(12,13,14,15, 8,9,10,11, 4,5,6,7, 0,1,2,3) : + _mm_set_epi8(15,14,13,12, 11,10,9,8, 7,6,5,4, 3,2,1,0) ; + + TMP = _mm_shuffle_epi32(TMP, 0xB1); // CDAB + STATE1 = _mm_shuffle_epi32(STATE1, 0x1B); // EFGH + STATE0 = _mm_alignr_epi8(TMP, STATE1, 8); // ABEF + STATE1 = _mm_blend_epi16(STATE1, TMP, 0xF0); // CDGH + + while (length >= SHA256::BLOCKSIZE) + { + // Save current hash + ABEF_SAVE = STATE0; + CDGH_SAVE = STATE1; + + // Rounds 0-3 + MSG = _mm_loadu_si128(CONST_M128_CAST(data+0)); + TMSG0 = _mm_shuffle_epi8(MSG, MASK); + MSG = _mm_add_epi32(TMSG0, _mm_set_epi64x(W64LIT(0xE9B5DBA5B5C0FBCF), W64LIT(0x71374491428A2F98))); + STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG); + MSG = _mm_shuffle_epi32(MSG, 0x0E); + STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG); + + // Rounds 4-7 + TMSG1 = _mm_loadu_si128(CONST_M128_CAST(data+4)); + TMSG1 = _mm_shuffle_epi8(TMSG1, MASK); + MSG = _mm_add_epi32(TMSG1, _mm_set_epi64x(W64LIT(0xAB1C5ED5923F82A4), W64LIT(0x59F111F13956C25B))); + STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG); + MSG = _mm_shuffle_epi32(MSG, 0x0E); + STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG); + TMSG0 = _mm_sha256msg1_epu32(TMSG0, TMSG1); + + // Rounds 8-11 + TMSG2 = _mm_loadu_si128(CONST_M128_CAST(data+8)); + TMSG2 = _mm_shuffle_epi8(TMSG2, MASK); + MSG = _mm_add_epi32(TMSG2, _mm_set_epi64x(W64LIT(0x550C7DC3243185BE), W64LIT(0x12835B01D807AA98))); + STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG); + MSG = _mm_shuffle_epi32(MSG, 0x0E); + STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG); + TMSG1 = _mm_sha256msg1_epu32(TMSG1, TMSG2); + + // Rounds 12-15 + TMSG3 = _mm_loadu_si128(CONST_M128_CAST(data+12)); + TMSG3 = _mm_shuffle_epi8(TMSG3, MASK); + MSG = _mm_add_epi32(TMSG3, _mm_set_epi64x(W64LIT(0xC19BF1749BDC06A7), W64LIT(0x80DEB1FE72BE5D74))); + STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG); + TMP = _mm_alignr_epi8(TMSG3, TMSG2, 4); + TMSG0 = _mm_add_epi32(TMSG0, TMP); + TMSG0 = _mm_sha256msg2_epu32(TMSG0, TMSG3); + MSG = _mm_shuffle_epi32(MSG, 0x0E); + STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG); + TMSG2 = _mm_sha256msg1_epu32(TMSG2, TMSG3); + + // Rounds 16-19 + MSG = _mm_add_epi32(TMSG0, _mm_set_epi64x(W64LIT(0x240CA1CC0FC19DC6), W64LIT(0xEFBE4786E49B69C1))); + STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG); + TMP = _mm_alignr_epi8(TMSG0, TMSG3, 4); + TMSG1 = _mm_add_epi32(TMSG1, TMP); + TMSG1 = _mm_sha256msg2_epu32(TMSG1, TMSG0); + MSG = _mm_shuffle_epi32(MSG, 0x0E); + STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG); + TMSG3 = _mm_sha256msg1_epu32(TMSG3, TMSG0); + + // Rounds 20-23 + MSG = _mm_add_epi32(TMSG1, _mm_set_epi64x(W64LIT(0x76F988DA5CB0A9DC), W64LIT(0x4A7484AA2DE92C6F))); + STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG); + TMP = _mm_alignr_epi8(TMSG1, TMSG0, 4); + TMSG2 = _mm_add_epi32(TMSG2, TMP); + TMSG2 = _mm_sha256msg2_epu32(TMSG2, TMSG1); + MSG = _mm_shuffle_epi32(MSG, 0x0E); + STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG); + TMSG0 = _mm_sha256msg1_epu32(TMSG0, TMSG1); + + // Rounds 24-27 + MSG = _mm_add_epi32(TMSG2, _mm_set_epi64x(W64LIT(0xBF597FC7B00327C8), W64LIT(0xA831C66D983E5152))); + STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG); + TMP = _mm_alignr_epi8(TMSG2, TMSG1, 4); + TMSG3 = _mm_add_epi32(TMSG3, TMP); + TMSG3 = _mm_sha256msg2_epu32(TMSG3, TMSG2); + MSG = _mm_shuffle_epi32(MSG, 0x0E); + STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG); + TMSG1 = _mm_sha256msg1_epu32(TMSG1, TMSG2); + + // Rounds 28-31 + MSG = _mm_add_epi32(TMSG3, 
_mm_set_epi64x(W64LIT(0x1429296706CA6351), W64LIT(0xD5A79147C6E00BF3))); + STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG); + TMP = _mm_alignr_epi8(TMSG3, TMSG2, 4); + TMSG0 = _mm_add_epi32(TMSG0, TMP); + TMSG0 = _mm_sha256msg2_epu32(TMSG0, TMSG3); + MSG = _mm_shuffle_epi32(MSG, 0x0E); + STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG); + TMSG2 = _mm_sha256msg1_epu32(TMSG2, TMSG3); + + // Rounds 32-35 + MSG = _mm_add_epi32(TMSG0, _mm_set_epi64x(W64LIT(0x53380D134D2C6DFC), W64LIT(0x2E1B213827B70A85))); + STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG); + TMP = _mm_alignr_epi8(TMSG0, TMSG3, 4); + TMSG1 = _mm_add_epi32(TMSG1, TMP); + TMSG1 = _mm_sha256msg2_epu32(TMSG1, TMSG0); + MSG = _mm_shuffle_epi32(MSG, 0x0E); + STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG); + TMSG3 = _mm_sha256msg1_epu32(TMSG3, TMSG0); + + // Rounds 36-39 + MSG = _mm_add_epi32(TMSG1, _mm_set_epi64x(W64LIT(0x92722C8581C2C92E), W64LIT(0x766A0ABB650A7354))); + STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG); + TMP = _mm_alignr_epi8(TMSG1, TMSG0, 4); + TMSG2 = _mm_add_epi32(TMSG2, TMP); + TMSG2 = _mm_sha256msg2_epu32(TMSG2, TMSG1); + MSG = _mm_shuffle_epi32(MSG, 0x0E); + STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG); + TMSG0 = _mm_sha256msg1_epu32(TMSG0, TMSG1); + + // Rounds 40-43 + MSG = _mm_add_epi32(TMSG2, _mm_set_epi64x(W64LIT(0xC76C51A3C24B8B70), W64LIT(0xA81A664BA2BFE8A1))); + STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG); + TMP = _mm_alignr_epi8(TMSG2, TMSG1, 4); + TMSG3 = _mm_add_epi32(TMSG3, TMP); + TMSG3 = _mm_sha256msg2_epu32(TMSG3, TMSG2); + MSG = _mm_shuffle_epi32(MSG, 0x0E); + STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG); + TMSG1 = _mm_sha256msg1_epu32(TMSG1, TMSG2); + + // Rounds 44-47 + MSG = _mm_add_epi32(TMSG3, _mm_set_epi64x(W64LIT(0x106AA070F40E3585), W64LIT(0xD6990624D192E819))); + STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG); + TMP = _mm_alignr_epi8(TMSG3, TMSG2, 4); + TMSG0 = _mm_add_epi32(TMSG0, TMP); + TMSG0 = _mm_sha256msg2_epu32(TMSG0, TMSG3); + MSG = _mm_shuffle_epi32(MSG, 0x0E); + STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG); + TMSG2 = _mm_sha256msg1_epu32(TMSG2, TMSG3); + + // Rounds 48-51 + MSG = _mm_add_epi32(TMSG0, _mm_set_epi64x(W64LIT(0x34B0BCB52748774C), W64LIT(0x1E376C0819A4C116))); + STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG); + TMP = _mm_alignr_epi8(TMSG0, TMSG3, 4); + TMSG1 = _mm_add_epi32(TMSG1, TMP); + TMSG1 = _mm_sha256msg2_epu32(TMSG1, TMSG0); + MSG = _mm_shuffle_epi32(MSG, 0x0E); + STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG); + TMSG3 = _mm_sha256msg1_epu32(TMSG3, TMSG0); + + // Rounds 52-55 + MSG = _mm_add_epi32(TMSG1, _mm_set_epi64x(W64LIT(0x682E6FF35B9CCA4F), W64LIT(0x4ED8AA4A391C0CB3))); + STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG); + TMP = _mm_alignr_epi8(TMSG1, TMSG0, 4); + TMSG2 = _mm_add_epi32(TMSG2, TMP); + TMSG2 = _mm_sha256msg2_epu32(TMSG2, TMSG1); + MSG = _mm_shuffle_epi32(MSG, 0x0E); + STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG); + + // Rounds 56-59 + MSG = _mm_add_epi32(TMSG2, _mm_set_epi64x(W64LIT(0x8CC7020884C87814), W64LIT(0x78A5636F748F82EE))); + STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG); + TMP = _mm_alignr_epi8(TMSG2, TMSG1, 4); + TMSG3 = _mm_add_epi32(TMSG3, TMP); + TMSG3 = _mm_sha256msg2_epu32(TMSG3, TMSG2); + MSG = _mm_shuffle_epi32(MSG, 0x0E); + STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG); + + // Rounds 60-63 + MSG = _mm_add_epi32(TMSG3, _mm_set_epi64x(W64LIT(0xC67178F2BEF9A3F7), W64LIT(0xA4506CEB90BEFFFA))); + STATE1 = _mm_sha256rnds2_epu32(STATE1, 
STATE0, MSG); + MSG = _mm_shuffle_epi32(MSG, 0x0E); + STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG); + + // Add values back to state + STATE0 = _mm_add_epi32(STATE0, ABEF_SAVE); + STATE1 = _mm_add_epi32(STATE1, CDGH_SAVE); + + data += SHA256::BLOCKSIZE/sizeof(word32); + length -= SHA256::BLOCKSIZE; + } + + TMP = _mm_shuffle_epi32(STATE0, 0x1B); // FEBA + STATE1 = _mm_shuffle_epi32(STATE1, 0xB1); // DCHG + STATE0 = _mm_blend_epi16(TMP, STATE1, 0xF0); // DCBA + STATE1 = _mm_alignr_epi8(STATE1, TMP, 8); // ABEF + + // Save state + _mm_storeu_si128(M128_CAST(&state[0]), STATE0); + _mm_storeu_si128(M128_CAST(&state[4]), STATE1); +} +#endif // CRYPTOPP_SHANI_AVAILABLE + +///////////////////////////////// +// end of Walton/Gulley's code // +///////////////////////////////// + +///////////////////////////////////////////////////////// +// start of Walton/Schneiders/O'Rourke/Hovsmith's code // +///////////////////////////////////////////////////////// + +#if CRYPTOPP_ARM_SHA_AVAILABLE +void SHA1_HashMultipleBlocks_ARMV8(word32 *state, const word32 *data, size_t length, ByteOrder order) +{ + CRYPTOPP_ASSERT(state); + CRYPTOPP_ASSERT(data); + CRYPTOPP_ASSERT(length >= SHA1::BLOCKSIZE); + + uint32x4_t C0, C1, C2, C3; + uint32x4_t ABCD, ABCD_SAVED; + uint32x4_t MSG0, MSG1, MSG2, MSG3; + uint32x4_t TMP0, TMP1; + uint32_t E0, E0_SAVED, E1; + + // Load initial values + C0 = vdupq_n_u32(0x5A827999); + C1 = vdupq_n_u32(0x6ED9EBA1); + C2 = vdupq_n_u32(0x8F1BBCDC); + C3 = vdupq_n_u32(0xCA62C1D6); + + ABCD = vld1q_u32(&state[0]); + E0 = state[4]; + + while (length >= SHA1::BLOCKSIZE) + { + // Save current hash + ABCD_SAVED = ABCD; + E0_SAVED = E0; + + MSG0 = vld1q_u32(data + 0); + MSG1 = vld1q_u32(data + 4); + MSG2 = vld1q_u32(data + 8); + MSG3 = vld1q_u32(data + 12); + + if (order == BIG_ENDIAN_ORDER) // Data arrangement + { + MSG0 = vreinterpretq_u32_u8(vrev32q_u8(vreinterpretq_u8_u32(MSG0))); + MSG1 = vreinterpretq_u32_u8(vrev32q_u8(vreinterpretq_u8_u32(MSG1))); + MSG2 = vreinterpretq_u32_u8(vrev32q_u8(vreinterpretq_u8_u32(MSG2))); + MSG3 = vreinterpretq_u32_u8(vrev32q_u8(vreinterpretq_u8_u32(MSG3))); + } + + TMP0 = vaddq_u32(MSG0, C0); + TMP1 = vaddq_u32(MSG1, C0); + + // Rounds 0-3 + E1 = vsha1h_u32(vgetq_lane_u32(ABCD, 0)); + ABCD = vsha1cq_u32(ABCD, E0, TMP0); + TMP0 = vaddq_u32(MSG2, C0); + MSG0 = vsha1su0q_u32(MSG0, MSG1, MSG2); + + // Rounds 4-7 + E0 = vsha1h_u32(vgetq_lane_u32(ABCD, 0)); + ABCD = vsha1cq_u32(ABCD, E1, TMP1); + TMP1 = vaddq_u32(MSG3, C0); + MSG0 = vsha1su1q_u32(MSG0, MSG3); + MSG1 = vsha1su0q_u32(MSG1, MSG2, MSG3); + + // Rounds 8-11 + E1 = vsha1h_u32(vgetq_lane_u32(ABCD, 0)); + ABCD = vsha1cq_u32(ABCD, E0, TMP0); + TMP0 = vaddq_u32(MSG0, C0); + MSG1 = vsha1su1q_u32(MSG1, MSG0); + MSG2 = vsha1su0q_u32(MSG2, MSG3, MSG0); + + // Rounds 12-15 + E0 = vsha1h_u32(vgetq_lane_u32(ABCD, 0)); + ABCD = vsha1cq_u32(ABCD, E1, TMP1); + TMP1 = vaddq_u32(MSG1, C1); + MSG2 = vsha1su1q_u32(MSG2, MSG1); + MSG3 = vsha1su0q_u32(MSG3, MSG0, MSG1); + + // Rounds 16-19 + E1 = vsha1h_u32(vgetq_lane_u32(ABCD, 0)); + ABCD = vsha1cq_u32(ABCD, E0, TMP0); + TMP0 = vaddq_u32(MSG2, C1); + MSG3 = vsha1su1q_u32(MSG3, MSG2); + MSG0 = vsha1su0q_u32(MSG0, MSG1, MSG2); + + // Rounds 20-23 + E0 = vsha1h_u32(vgetq_lane_u32(ABCD, 0)); + ABCD = vsha1pq_u32(ABCD, E1, TMP1); + TMP1 = vaddq_u32(MSG3, C1); + MSG0 = vsha1su1q_u32(MSG0, MSG3); + MSG1 = vsha1su0q_u32(MSG1, MSG2, MSG3); + + // Rounds 24-27 + E1 = vsha1h_u32(vgetq_lane_u32(ABCD, 0)); + ABCD = vsha1pq_u32(ABCD, E0, TMP0); + TMP0 = vaddq_u32(MSG0, C1); + MSG1 = 
vsha1su1q_u32(MSG1, MSG0); + MSG2 = vsha1su0q_u32(MSG2, MSG3, MSG0); + + // Rounds 28-31 + E0 = vsha1h_u32(vgetq_lane_u32(ABCD, 0)); + ABCD = vsha1pq_u32(ABCD, E1, TMP1); + TMP1 = vaddq_u32(MSG1, C1); + MSG2 = vsha1su1q_u32(MSG2, MSG1); + MSG3 = vsha1su0q_u32(MSG3, MSG0, MSG1); + + // Rounds 32-35 + E1 = vsha1h_u32(vgetq_lane_u32(ABCD, 0)); + ABCD = vsha1pq_u32(ABCD, E0, TMP0); + TMP0 = vaddq_u32(MSG2, C2); + MSG3 = vsha1su1q_u32(MSG3, MSG2); + MSG0 = vsha1su0q_u32(MSG0, MSG1, MSG2); + + // Rounds 36-39 + E0 = vsha1h_u32(vgetq_lane_u32(ABCD, 0)); + ABCD = vsha1pq_u32(ABCD, E1, TMP1); + TMP1 = vaddq_u32(MSG3, C2); + MSG0 = vsha1su1q_u32(MSG0, MSG3); + MSG1 = vsha1su0q_u32(MSG1, MSG2, MSG3); + + // Rounds 40-43 + E1 = vsha1h_u32(vgetq_lane_u32(ABCD, 0)); + ABCD = vsha1mq_u32(ABCD, E0, TMP0); + TMP0 = vaddq_u32(MSG0, C2); + MSG1 = vsha1su1q_u32(MSG1, MSG0); + MSG2 = vsha1su0q_u32(MSG2, MSG3, MSG0); + + // Rounds 44-47 + E0 = vsha1h_u32(vgetq_lane_u32(ABCD, 0)); + ABCD = vsha1mq_u32(ABCD, E1, TMP1); + TMP1 = vaddq_u32(MSG1, C2); + MSG2 = vsha1su1q_u32(MSG2, MSG1); + MSG3 = vsha1su0q_u32(MSG3, MSG0, MSG1); + + // Rounds 48-51 + E1 = vsha1h_u32(vgetq_lane_u32(ABCD, 0)); + ABCD = vsha1mq_u32(ABCD, E0, TMP0); + TMP0 = vaddq_u32(MSG2, C2); + MSG3 = vsha1su1q_u32(MSG3, MSG2); + MSG0 = vsha1su0q_u32(MSG0, MSG1, MSG2); + + // Rounds 52-55 + E0 = vsha1h_u32(vgetq_lane_u32(ABCD, 0)); + ABCD = vsha1mq_u32(ABCD, E1, TMP1); + TMP1 = vaddq_u32(MSG3, C3); + MSG0 = vsha1su1q_u32(MSG0, MSG3); + MSG1 = vsha1su0q_u32(MSG1, MSG2, MSG3); + + // Rounds 56-59 + E1 = vsha1h_u32(vgetq_lane_u32(ABCD, 0)); + ABCD = vsha1mq_u32(ABCD, E0, TMP0); + TMP0 = vaddq_u32(MSG0, C3); + MSG1 = vsha1su1q_u32(MSG1, MSG0); + MSG2 = vsha1su0q_u32(MSG2, MSG3, MSG0); + + // Rounds 60-63 + E0 = vsha1h_u32(vgetq_lane_u32(ABCD, 0)); + ABCD = vsha1pq_u32(ABCD, E1, TMP1); + TMP1 = vaddq_u32(MSG1, C3); + MSG2 = vsha1su1q_u32(MSG2, MSG1); + MSG3 = vsha1su0q_u32(MSG3, MSG0, MSG1); + + // Rounds 64-67 + E1 = vsha1h_u32(vgetq_lane_u32(ABCD, 0)); + ABCD = vsha1pq_u32(ABCD, E0, TMP0); + TMP0 = vaddq_u32(MSG2, C3); + MSG3 = vsha1su1q_u32(MSG3, MSG2); + MSG0 = vsha1su0q_u32(MSG0, MSG1, MSG2); + + // Rounds 68-71 + E0 = vsha1h_u32(vgetq_lane_u32(ABCD, 0)); + ABCD = vsha1pq_u32(ABCD, E1, TMP1); + TMP1 = vaddq_u32(MSG3, C3); + MSG0 = vsha1su1q_u32(MSG0, MSG3); + + // Rounds 72-75 + E1 = vsha1h_u32(vgetq_lane_u32(ABCD, 0)); + ABCD = vsha1pq_u32(ABCD, E0, TMP0); + + // Rounds 76-79 + E0 = vsha1h_u32(vgetq_lane_u32(ABCD, 0)); + ABCD = vsha1pq_u32(ABCD, E1, TMP1); + + E0 += E0_SAVED; + ABCD = vaddq_u32(ABCD_SAVED, ABCD); + + data += SHA1::BLOCKSIZE/sizeof(word32); + length -= SHA1::BLOCKSIZE; + } + + // Save state + vst1q_u32(&state[0], ABCD); + state[4] = E0; +} + +void SHA256_HashMultipleBlocks_ARMV8(word32 *state, const word32 *data, size_t length, ByteOrder order) +{ + CRYPTOPP_ASSERT(state); + CRYPTOPP_ASSERT(data); + CRYPTOPP_ASSERT(length >= SHA256::BLOCKSIZE); + + uint32x4_t STATE0, STATE1, ABEF_SAVE, CDGH_SAVE; + uint32x4_t MSG0, MSG1, MSG2, MSG3; + uint32x4_t TMP0, TMP1, TMP2; + + // Load initial values + STATE0 = vld1q_u32(&state[0]); + STATE1 = vld1q_u32(&state[4]); + + while (length >= SHA256::BLOCKSIZE) + { + // Save current hash + ABEF_SAVE = STATE0; + CDGH_SAVE = STATE1; + + // Load message + MSG0 = vld1q_u32(data + 0); + MSG1 = vld1q_u32(data + 4); + MSG2 = vld1q_u32(data + 8); + MSG3 = vld1q_u32(data + 12); + + if (order == BIG_ENDIAN_ORDER) // Data arrangement + { + MSG0 = vreinterpretq_u32_u8(vrev32q_u8(vreinterpretq_u8_u32(MSG0))); + 
MSG1 = vreinterpretq_u32_u8(vrev32q_u8(vreinterpretq_u8_u32(MSG1))); + MSG2 = vreinterpretq_u32_u8(vrev32q_u8(vreinterpretq_u8_u32(MSG2))); + MSG3 = vreinterpretq_u32_u8(vrev32q_u8(vreinterpretq_u8_u32(MSG3))); + } + + TMP0 = vaddq_u32(MSG0, vld1q_u32(&SHA256_K[0x00])); + + // Rounds 0-3 + MSG0 = vsha256su0q_u32(MSG0, MSG1); + TMP2 = STATE0; + TMP1 = vaddq_u32(MSG1, vld1q_u32(&SHA256_K[0x04])); + STATE0 = vsha256hq_u32(STATE0, STATE1, TMP0); + STATE1 = vsha256h2q_u32(STATE1, TMP2, TMP0); + MSG0 = vsha256su1q_u32(MSG0, MSG2, MSG3);; + + // Rounds 4-7 + MSG1 = vsha256su0q_u32(MSG1, MSG2); + TMP2 = STATE0; + TMP0 = vaddq_u32(MSG2, vld1q_u32(&SHA256_K[0x08])); + STATE0 = vsha256hq_u32(STATE0, STATE1, TMP1); + STATE1 = vsha256h2q_u32(STATE1, TMP2, TMP1); + MSG1 = vsha256su1q_u32(MSG1, MSG3, MSG0);; + + // Rounds 8-11 + MSG2 = vsha256su0q_u32(MSG2, MSG3); + TMP2 = STATE0; + TMP1 = vaddq_u32(MSG3, vld1q_u32(&SHA256_K[0x0c])); + STATE0 = vsha256hq_u32(STATE0, STATE1, TMP0); + STATE1 = vsha256h2q_u32(STATE1, TMP2, TMP0); + MSG2 = vsha256su1q_u32(MSG2, MSG0, MSG1);; + + // Rounds 12-15 + MSG3 = vsha256su0q_u32(MSG3, MSG0); + TMP2 = STATE0; + TMP0 = vaddq_u32(MSG0, vld1q_u32(&SHA256_K[0x10])); + STATE0 = vsha256hq_u32(STATE0, STATE1, TMP1); + STATE1 = vsha256h2q_u32(STATE1, TMP2, TMP1); + MSG3 = vsha256su1q_u32(MSG3, MSG1, MSG2);; + + // Rounds 16-19 + MSG0 = vsha256su0q_u32(MSG0, MSG1); + TMP2 = STATE0; + TMP1 = vaddq_u32(MSG1, vld1q_u32(&SHA256_K[0x14])); + STATE0 = vsha256hq_u32(STATE0, STATE1, TMP0); + STATE1 = vsha256h2q_u32(STATE1, TMP2, TMP0); + MSG0 = vsha256su1q_u32(MSG0, MSG2, MSG3);; + + // Rounds 20-23 + MSG1 = vsha256su0q_u32(MSG1, MSG2); + TMP2 = STATE0; + TMP0 = vaddq_u32(MSG2, vld1q_u32(&SHA256_K[0x18])); + STATE0 = vsha256hq_u32(STATE0, STATE1, TMP1); + STATE1 = vsha256h2q_u32(STATE1, TMP2, TMP1); + MSG1 = vsha256su1q_u32(MSG1, MSG3, MSG0);; + + // Rounds 24-27 + MSG2 = vsha256su0q_u32(MSG2, MSG3); + TMP2 = STATE0; + TMP1 = vaddq_u32(MSG3, vld1q_u32(&SHA256_K[0x1c])); + STATE0 = vsha256hq_u32(STATE0, STATE1, TMP0); + STATE1 = vsha256h2q_u32(STATE1, TMP2, TMP0); + MSG2 = vsha256su1q_u32(MSG2, MSG0, MSG1);; + + // Rounds 28-31 + MSG3 = vsha256su0q_u32(MSG3, MSG0); + TMP2 = STATE0; + TMP0 = vaddq_u32(MSG0, vld1q_u32(&SHA256_K[0x20])); + STATE0 = vsha256hq_u32(STATE0, STATE1, TMP1); + STATE1 = vsha256h2q_u32(STATE1, TMP2, TMP1); + MSG3 = vsha256su1q_u32(MSG3, MSG1, MSG2);; + + // Rounds 32-35 + MSG0 = vsha256su0q_u32(MSG0, MSG1); + TMP2 = STATE0; + TMP1 = vaddq_u32(MSG1, vld1q_u32(&SHA256_K[0x24])); + STATE0 = vsha256hq_u32(STATE0, STATE1, TMP0); + STATE1 = vsha256h2q_u32(STATE1, TMP2, TMP0); + MSG0 = vsha256su1q_u32(MSG0, MSG2, MSG3);; + + // Rounds 36-39 + MSG1 = vsha256su0q_u32(MSG1, MSG2); + TMP2 = STATE0; + TMP0 = vaddq_u32(MSG2, vld1q_u32(&SHA256_K[0x28])); + STATE0 = vsha256hq_u32(STATE0, STATE1, TMP1); + STATE1 = vsha256h2q_u32(STATE1, TMP2, TMP1); + MSG1 = vsha256su1q_u32(MSG1, MSG3, MSG0);; + + // Rounds 40-43 + MSG2 = vsha256su0q_u32(MSG2, MSG3); + TMP2 = STATE0; + TMP1 = vaddq_u32(MSG3, vld1q_u32(&SHA256_K[0x2c])); + STATE0 = vsha256hq_u32(STATE0, STATE1, TMP0); + STATE1 = vsha256h2q_u32(STATE1, TMP2, TMP0); + MSG2 = vsha256su1q_u32(MSG2, MSG0, MSG1);; + + // Rounds 44-47 + MSG3 = vsha256su0q_u32(MSG3, MSG0); + TMP2 = STATE0; + TMP0 = vaddq_u32(MSG0, vld1q_u32(&SHA256_K[0x30])); + STATE0 = vsha256hq_u32(STATE0, STATE1, TMP1); + STATE1 = vsha256h2q_u32(STATE1, TMP2, TMP1); + MSG3 = vsha256su1q_u32(MSG3, MSG1, MSG2);; + + // Rounds 48-51 + TMP2 = STATE0; + TMP1 = 
vaddq_u32(MSG1, vld1q_u32(&SHA256_K[0x34])); + STATE0 = vsha256hq_u32(STATE0, STATE1, TMP0); + STATE1 = vsha256h2q_u32(STATE1, TMP2, TMP0);; + + // Rounds 52-55 + TMP2 = STATE0; + TMP0 = vaddq_u32(MSG2, vld1q_u32(&SHA256_K[0x38])); + STATE0 = vsha256hq_u32(STATE0, STATE1, TMP1); + STATE1 = vsha256h2q_u32(STATE1, TMP2, TMP1);; + + // Rounds 56-59 + TMP2 = STATE0; + TMP1 = vaddq_u32(MSG3, vld1q_u32(&SHA256_K[0x3c])); + STATE0 = vsha256hq_u32(STATE0, STATE1, TMP0); + STATE1 = vsha256h2q_u32(STATE1, TMP2, TMP0);; + + // Rounds 60-63 + TMP2 = STATE0; + STATE0 = vsha256hq_u32(STATE0, STATE1, TMP1); + STATE1 = vsha256h2q_u32(STATE1, TMP2, TMP1);; + + // Add back to state + STATE0 = vaddq_u32(STATE0, ABEF_SAVE); + STATE1 = vaddq_u32(STATE1, CDGH_SAVE); + + data += SHA256::BLOCKSIZE/sizeof(word32); + length -= SHA256::BLOCKSIZE; + } + + // Save state + vst1q_u32(&state[0], STATE0); + vst1q_u32(&state[4], STATE1); +} +#endif + +/////////////////////////////////////////////////////// +// end of Walton/Schneiders/O'Rourke/Hovsmith's code // +/////////////////////////////////////////////////////// + +NAMESPACE_END \ No newline at end of file diff --git a/sha.cpp b/sha.cpp index cd0cdd5de..8f3fcb119 100644 --- a/sha.cpp +++ b/sha.cpp @@ -1,6 +1,6 @@ // sha.cpp - modified by Wei Dai from Steve Reid's public domain sha1.c -// Steve Reid implemented SHA-1. Wei Dai implemented SHA-2. Jeffrey Walton +// Steve Reid implemented SHA-1. Wei Dai implemented SHA-2. Jeffrey Walton // implemented Intel SHA extensions based on Intel articles and code by // Sean Gulley. Jeffrey Walton implemented ARM SHA based on ARM code and // code from Johannes Schneiders, Skip Hovsmith and Barry O'Rourke. @@ -48,15 +48,21 @@ # undef CRYPTOPP_BOOL_SSE2_ASM_AVAILABLE #endif -// Clang __m128i casts -#define M128_CAST(x) ((__m128i *)(void *)(x)) -#define CONST_M128_CAST(x) ((const __m128i *)(const void *)(x)) - // C++ makes const internal linkage #define EXPORT_TABLE extern NAMESPACE_BEGIN(CryptoPP) +#if CRYPTOPP_SHANI_AVAILABLE +extern void SHA1_HashMultipleBlocks_SHANI(word32 *state, const word32 *data, size_t length, ByteOrder order); +extern void SHA256_HashMultipleBlocks_SHANI(word32 *state, const word32 *data, size_t length, ByteOrder order); +#endif + +#if CRYPTOPP_ARM_SHA_AVAILABLE +extern void SHA1_HashMultipleBlocks_ARMV8(word32 *state, const word32 *data, size_t length, ByteOrder order); +extern void SHA256_HashMultipleBlocks_ARMV8(word32 *state, const word32 *data, size_t length, ByteOrder order); +#endif + //////////////////////////////// // start of Steve Reid's code // //////////////////////////////// @@ -78,7 +84,7 @@ ANONYMOUS_NAMESPACE_BEGIN #define R3(v,w,x,y,z,i) z+=f3(w,x,y)+blk1(i)+0x8F1BBCDC+rotlFixed(v,5);w=rotlFixed(w,30); #define R4(v,w,x,y,z,i) z+=f4(w,x,y)+blk1(i)+0xCA62C1D6+rotlFixed(v,5);w=rotlFixed(w,30); -void SHA1_CXX_HashBlock(word32 *state, const word32 *data) +void SHA1_HashBlock_CXX(word32 *state, const word32 *data) { CRYPTOPP_ASSERT(state); CRYPTOPP_ASSERT(data); @@ -125,430 +131,13 @@ ANONYMOUS_NAMESPACE_END // end of Steve Reid's code // ////////////////////////////// -/////////////////////////////////// -// start of Walton/Gulley's code // -/////////////////////////////////// - -#if CRYPTOPP_BOOL_SSE_SHA_INTRINSICS_AVAILABLE - -ANONYMOUS_NAMESPACE_BEGIN - -// Based on http://software.intel.com/en-us/articles/intel-sha-extensions and code by Sean Gulley. 
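With the intrinsics moved into sha-simd.cpp, sha.cpp reaches them only through the extern declarations above plus run-time checks such as HasSHA() and HasSHA1(), so nothing changes for callers of the hash classes. A minimal caller-side sketch using the ordinary pipeline classes; the "abc" message and its digest are the standard FIPS 180 test vector, not values taken from this patch:

```cpp
// Hashing with SHA256 after the split: the SHA-NI or ARMv8 routine in
// sha-simd.cpp is picked automatically at run time when the CPU supports it,
// otherwise the portable SHA256_HashBlock_CXX path runs. Same API either way.
#include "sha.h"
#include "filters.h"   // StringSource, HashFilter, StringSink
#include "hex.h"       // HexEncoder
#include <iostream>
#include <string>

int main()
{
    using namespace CryptoPP;

    std::string digest;
    SHA256 sha;

    StringSource ss("abc", true,
        new HashFilter(sha,
            new HexEncoder(
                new StringSink(digest))));

    // Prints BA7816BF8F01CFEA414140DE5DAE2223B00361A396177A9CB410FF61F20015AD
    std::cout << digest << std::endl;
    return 0;
}
```

Incremental hashing through Update/Final behaves the same way, since HashMultipleBlocks performs the identical dispatch.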
-void SHA1_SHANI_HashMultipleBlocks(word32 *state, const word32 *data, size_t length, ByteOrder order) -{ - CRYPTOPP_ASSERT(state); - CRYPTOPP_ASSERT(data); - CRYPTOPP_ASSERT(length >= SHA1::BLOCKSIZE); - - __m128i ABCD, ABCD_SAVE, E0, E0_SAVE, E1; - __m128i MASK, MSG0, MSG1, MSG2, MSG3; - - // Load initial values - ABCD = _mm_loadu_si128(CONST_M128_CAST(state)); - E0 = _mm_set_epi32(state[4], 0, 0, 0); - ABCD = _mm_shuffle_epi32(ABCD, 0x1B); - - // IA-32 SHA is little endian, SHA::Transform is big endian, - // and SHA::HashMultipleBlocks can be either. ByteOrder - // allows us to avoid extra endian reversals. It saves 1.0 cpb. - MASK = order == BIG_ENDIAN_ORDER ? // Data arrangement - _mm_set_epi8(0,1,2,3, 4,5,6,7, 8,9,10,11, 12,13,14,15) : - _mm_set_epi8(3,2,1,0, 7,6,5,4, 11,10,9,8, 15,14,13,12) ; - - while (length >= SHA1::BLOCKSIZE) - { - // Save current hash - ABCD_SAVE = ABCD; - E0_SAVE = E0; - - // Rounds 0-3 - MSG0 = _mm_loadu_si128(CONST_M128_CAST(data+0)); - MSG0 = _mm_shuffle_epi8(MSG0, MASK); - E0 = _mm_add_epi32(E0, MSG0); - E1 = ABCD; - ABCD = _mm_sha1rnds4_epu32(ABCD, E0, 0); - - // Rounds 4-7 - MSG1 = _mm_loadu_si128(CONST_M128_CAST(data+4)); - MSG1 = _mm_shuffle_epi8(MSG1, MASK); - E1 = _mm_sha1nexte_epu32(E1, MSG1); - E0 = ABCD; - ABCD = _mm_sha1rnds4_epu32(ABCD, E1, 0); - MSG0 = _mm_sha1msg1_epu32(MSG0, MSG1); - - // Rounds 8-11 - MSG2 = _mm_loadu_si128(CONST_M128_CAST(data+8)); - MSG2 = _mm_shuffle_epi8(MSG2, MASK); - E0 = _mm_sha1nexte_epu32(E0, MSG2); - E1 = ABCD; - ABCD = _mm_sha1rnds4_epu32(ABCD, E0, 0); - MSG1 = _mm_sha1msg1_epu32(MSG1, MSG2); - MSG0 = _mm_xor_si128(MSG0, MSG2); - - // Rounds 12-15 - MSG3 = _mm_loadu_si128(CONST_M128_CAST(data+12)); - MSG3 = _mm_shuffle_epi8(MSG3, MASK); - E1 = _mm_sha1nexte_epu32(E1, MSG3); - E0 = ABCD; - MSG0 = _mm_sha1msg2_epu32(MSG0, MSG3); - ABCD = _mm_sha1rnds4_epu32(ABCD, E1, 0); - MSG2 = _mm_sha1msg1_epu32(MSG2, MSG3); - MSG1 = _mm_xor_si128(MSG1, MSG3); - - // Rounds 16-19 - E0 = _mm_sha1nexte_epu32(E0, MSG0); - E1 = ABCD; - MSG1 = _mm_sha1msg2_epu32(MSG1, MSG0); - ABCD = _mm_sha1rnds4_epu32(ABCD, E0, 0); - MSG3 = _mm_sha1msg1_epu32(MSG3, MSG0); - MSG2 = _mm_xor_si128(MSG2, MSG0); - - // Rounds 20-23 - E1 = _mm_sha1nexte_epu32(E1, MSG1); - E0 = ABCD; - MSG2 = _mm_sha1msg2_epu32(MSG2, MSG1); - ABCD = _mm_sha1rnds4_epu32(ABCD, E1, 1); - MSG0 = _mm_sha1msg1_epu32(MSG0, MSG1); - MSG3 = _mm_xor_si128(MSG3, MSG1); - - // Rounds 24-27 - E0 = _mm_sha1nexte_epu32(E0, MSG2); - E1 = ABCD; - MSG3 = _mm_sha1msg2_epu32(MSG3, MSG2); - ABCD = _mm_sha1rnds4_epu32(ABCD, E0, 1); - MSG1 = _mm_sha1msg1_epu32(MSG1, MSG2); - MSG0 = _mm_xor_si128(MSG0, MSG2); - - // Rounds 28-31 - E1 = _mm_sha1nexte_epu32(E1, MSG3); - E0 = ABCD; - MSG0 = _mm_sha1msg2_epu32(MSG0, MSG3); - ABCD = _mm_sha1rnds4_epu32(ABCD, E1, 1); - MSG2 = _mm_sha1msg1_epu32(MSG2, MSG3); - MSG1 = _mm_xor_si128(MSG1, MSG3); - - // Rounds 32-35 - E0 = _mm_sha1nexte_epu32(E0, MSG0); - E1 = ABCD; - MSG1 = _mm_sha1msg2_epu32(MSG1, MSG0); - ABCD = _mm_sha1rnds4_epu32(ABCD, E0, 1); - MSG3 = _mm_sha1msg1_epu32(MSG3, MSG0); - MSG2 = _mm_xor_si128(MSG2, MSG0); - - // Rounds 36-39 - E1 = _mm_sha1nexte_epu32(E1, MSG1); - E0 = ABCD; - MSG2 = _mm_sha1msg2_epu32(MSG2, MSG1); - ABCD = _mm_sha1rnds4_epu32(ABCD, E1, 1); - MSG0 = _mm_sha1msg1_epu32(MSG0, MSG1); - MSG3 = _mm_xor_si128(MSG3, MSG1); - - // Rounds 40-43 - E0 = _mm_sha1nexte_epu32(E0, MSG2); - E1 = ABCD; - MSG3 = _mm_sha1msg2_epu32(MSG3, MSG2); - ABCD = _mm_sha1rnds4_epu32(ABCD, E0, 2); - MSG1 = _mm_sha1msg1_epu32(MSG1, MSG2); - MSG0 = 
_mm_xor_si128(MSG0, MSG2); - - // Rounds 44-47 - E1 = _mm_sha1nexte_epu32(E1, MSG3); - E0 = ABCD; - MSG0 = _mm_sha1msg2_epu32(MSG0, MSG3); - ABCD = _mm_sha1rnds4_epu32(ABCD, E1, 2); - MSG2 = _mm_sha1msg1_epu32(MSG2, MSG3); - MSG1 = _mm_xor_si128(MSG1, MSG3); - - // Rounds 48-51 - E0 = _mm_sha1nexte_epu32(E0, MSG0); - E1 = ABCD; - MSG1 = _mm_sha1msg2_epu32(MSG1, MSG0); - ABCD = _mm_sha1rnds4_epu32(ABCD, E0, 2); - MSG3 = _mm_sha1msg1_epu32(MSG3, MSG0); - MSG2 = _mm_xor_si128(MSG2, MSG0); - - // Rounds 52-55 - E1 = _mm_sha1nexte_epu32(E1, MSG1); - E0 = ABCD; - MSG2 = _mm_sha1msg2_epu32(MSG2, MSG1); - ABCD = _mm_sha1rnds4_epu32(ABCD, E1, 2); - MSG0 = _mm_sha1msg1_epu32(MSG0, MSG1); - MSG3 = _mm_xor_si128(MSG3, MSG1); - - // Rounds 56-59 - E0 = _mm_sha1nexte_epu32(E0, MSG2); - E1 = ABCD; - MSG3 = _mm_sha1msg2_epu32(MSG3, MSG2); - ABCD = _mm_sha1rnds4_epu32(ABCD, E0, 2); - MSG1 = _mm_sha1msg1_epu32(MSG1, MSG2); - MSG0 = _mm_xor_si128(MSG0, MSG2); - - // Rounds 60-63 - E1 = _mm_sha1nexte_epu32(E1, MSG3); - E0 = ABCD; - MSG0 = _mm_sha1msg2_epu32(MSG0, MSG3); - ABCD = _mm_sha1rnds4_epu32(ABCD, E1, 3); - MSG2 = _mm_sha1msg1_epu32(MSG2, MSG3); - MSG1 = _mm_xor_si128(MSG1, MSG3); - - // Rounds 64-67 - E0 = _mm_sha1nexte_epu32(E0, MSG0); - E1 = ABCD; - MSG1 = _mm_sha1msg2_epu32(MSG1, MSG0); - ABCD = _mm_sha1rnds4_epu32(ABCD, E0, 3); - MSG3 = _mm_sha1msg1_epu32(MSG3, MSG0); - MSG2 = _mm_xor_si128(MSG2, MSG0); - - // Rounds 68-71 - E1 = _mm_sha1nexte_epu32(E1, MSG1); - E0 = ABCD; - MSG2 = _mm_sha1msg2_epu32(MSG2, MSG1); - ABCD = _mm_sha1rnds4_epu32(ABCD, E1, 3); - MSG3 = _mm_xor_si128(MSG3, MSG1); - - // Rounds 72-75 - E0 = _mm_sha1nexte_epu32(E0, MSG2); - E1 = ABCD; - MSG3 = _mm_sha1msg2_epu32(MSG3, MSG2); - ABCD = _mm_sha1rnds4_epu32(ABCD, E0, 3); - - // Rounds 76-79 - E1 = _mm_sha1nexte_epu32(E1, MSG3); - E0 = ABCD; - ABCD = _mm_sha1rnds4_epu32(ABCD, E1, 3); - - // Add values back to state - E0 = _mm_sha1nexte_epu32(E0, E0_SAVE); - ABCD = _mm_add_epi32(ABCD, ABCD_SAVE); - - data += SHA1::BLOCKSIZE/sizeof(word32); - length -= SHA1::BLOCKSIZE; - } - - // Save state - ABCD = _mm_shuffle_epi32(ABCD, 0x1B); - _mm_storeu_si128(M128_CAST(state), ABCD); - state[4] = _mm_extract_epi32(E0, 3); -} - -ANONYMOUS_NAMESPACE_END - -#endif // CRYPTOPP_BOOL_SSE_SHA_INTRINSICS_AVAILABLE - -///////////////////////////////// -// end of Walton/Gulley's code // -///////////////////////////////// - -////////////////////////////////////////////////////////////// -// start of Walton/Schneiders/O'Rourke/Skip Hovsmith's code // -////////////////////////////////////////////////////////////// - -#if CRYPTOPP_BOOL_ARM_CRYPTO_INTRINSICS_AVAILABLE - -ANONYMOUS_NAMESPACE_BEGIN - -void SHA1_ARM_SHA_HashMultipleBlocks(word32 *state, const word32 *data, size_t length, ByteOrder order) -{ - CRYPTOPP_ASSERT(state); - CRYPTOPP_ASSERT(data); - CRYPTOPP_ASSERT(length >= SHA1::BLOCKSIZE); - - uint32x4_t C0, C1, C2, C3; - uint32x4_t ABCD, ABCD_SAVED; - uint32x4_t MSG0, MSG1, MSG2, MSG3; - uint32x4_t TMP0, TMP1; - uint32_t E0, E0_SAVED, E1; - - // Load initial values - C0 = vdupq_n_u32(0x5A827999); - C1 = vdupq_n_u32(0x6ED9EBA1); - C2 = vdupq_n_u32(0x8F1BBCDC); - C3 = vdupq_n_u32(0xCA62C1D6); - - ABCD = vld1q_u32(&state[0]); - E0 = state[4]; - - while (length >= SHA1::BLOCKSIZE) - { - // Save current hash - ABCD_SAVED = ABCD; - E0_SAVED = E0; - - MSG0 = vld1q_u32(data + 0); - MSG1 = vld1q_u32(data + 4); - MSG2 = vld1q_u32(data + 8); - MSG3 = vld1q_u32(data + 12); - - if (order == BIG_ENDIAN_ORDER) // Data arrangement - { - MSG0 = 
vreinterpretq_u32_u8(vrev32q_u8(vreinterpretq_u8_u32(MSG0))); - MSG1 = vreinterpretq_u32_u8(vrev32q_u8(vreinterpretq_u8_u32(MSG1))); - MSG2 = vreinterpretq_u32_u8(vrev32q_u8(vreinterpretq_u8_u32(MSG2))); - MSG3 = vreinterpretq_u32_u8(vrev32q_u8(vreinterpretq_u8_u32(MSG3))); - } - - TMP0 = vaddq_u32(MSG0, C0); - TMP1 = vaddq_u32(MSG1, C0); - - // Rounds 0-3 - E1 = vsha1h_u32(vgetq_lane_u32(ABCD, 0)); - ABCD = vsha1cq_u32(ABCD, E0, TMP0); - TMP0 = vaddq_u32(MSG2, C0); - MSG0 = vsha1su0q_u32(MSG0, MSG1, MSG2); - - // Rounds 4-7 - E0 = vsha1h_u32(vgetq_lane_u32(ABCD, 0)); - ABCD = vsha1cq_u32(ABCD, E1, TMP1); - TMP1 = vaddq_u32(MSG3, C0); - MSG0 = vsha1su1q_u32(MSG0, MSG3); - MSG1 = vsha1su0q_u32(MSG1, MSG2, MSG3); - - // Rounds 8-11 - E1 = vsha1h_u32(vgetq_lane_u32(ABCD, 0)); - ABCD = vsha1cq_u32(ABCD, E0, TMP0); - TMP0 = vaddq_u32(MSG0, C0); - MSG1 = vsha1su1q_u32(MSG1, MSG0); - MSG2 = vsha1su0q_u32(MSG2, MSG3, MSG0); - - // Rounds 12-15 - E0 = vsha1h_u32(vgetq_lane_u32(ABCD, 0)); - ABCD = vsha1cq_u32(ABCD, E1, TMP1); - TMP1 = vaddq_u32(MSG1, C1); - MSG2 = vsha1su1q_u32(MSG2, MSG1); - MSG3 = vsha1su0q_u32(MSG3, MSG0, MSG1); - - // Rounds 16-19 - E1 = vsha1h_u32(vgetq_lane_u32(ABCD, 0)); - ABCD = vsha1cq_u32(ABCD, E0, TMP0); - TMP0 = vaddq_u32(MSG2, C1); - MSG3 = vsha1su1q_u32(MSG3, MSG2); - MSG0 = vsha1su0q_u32(MSG0, MSG1, MSG2); - - // Rounds 20-23 - E0 = vsha1h_u32(vgetq_lane_u32(ABCD, 0)); - ABCD = vsha1pq_u32(ABCD, E1, TMP1); - TMP1 = vaddq_u32(MSG3, C1); - MSG0 = vsha1su1q_u32(MSG0, MSG3); - MSG1 = vsha1su0q_u32(MSG1, MSG2, MSG3); - - // Rounds 24-27 - E1 = vsha1h_u32(vgetq_lane_u32(ABCD, 0)); - ABCD = vsha1pq_u32(ABCD, E0, TMP0); - TMP0 = vaddq_u32(MSG0, C1); - MSG1 = vsha1su1q_u32(MSG1, MSG0); - MSG2 = vsha1su0q_u32(MSG2, MSG3, MSG0); - - // Rounds 28-31 - E0 = vsha1h_u32(vgetq_lane_u32(ABCD, 0)); - ABCD = vsha1pq_u32(ABCD, E1, TMP1); - TMP1 = vaddq_u32(MSG1, C1); - MSG2 = vsha1su1q_u32(MSG2, MSG1); - MSG3 = vsha1su0q_u32(MSG3, MSG0, MSG1); - - // Rounds 32-35 - E1 = vsha1h_u32(vgetq_lane_u32(ABCD, 0)); - ABCD = vsha1pq_u32(ABCD, E0, TMP0); - TMP0 = vaddq_u32(MSG2, C2); - MSG3 = vsha1su1q_u32(MSG3, MSG2); - MSG0 = vsha1su0q_u32(MSG0, MSG1, MSG2); - - // Rounds 36-39 - E0 = vsha1h_u32(vgetq_lane_u32(ABCD, 0)); - ABCD = vsha1pq_u32(ABCD, E1, TMP1); - TMP1 = vaddq_u32(MSG3, C2); - MSG0 = vsha1su1q_u32(MSG0, MSG3); - MSG1 = vsha1su0q_u32(MSG1, MSG2, MSG3); - - // Rounds 40-43 - E1 = vsha1h_u32(vgetq_lane_u32(ABCD, 0)); - ABCD = vsha1mq_u32(ABCD, E0, TMP0); - TMP0 = vaddq_u32(MSG0, C2); - MSG1 = vsha1su1q_u32(MSG1, MSG0); - MSG2 = vsha1su0q_u32(MSG2, MSG3, MSG0); - - // Rounds 44-47 - E0 = vsha1h_u32(vgetq_lane_u32(ABCD, 0)); - ABCD = vsha1mq_u32(ABCD, E1, TMP1); - TMP1 = vaddq_u32(MSG1, C2); - MSG2 = vsha1su1q_u32(MSG2, MSG1); - MSG3 = vsha1su0q_u32(MSG3, MSG0, MSG1); - - // Rounds 48-51 - E1 = vsha1h_u32(vgetq_lane_u32(ABCD, 0)); - ABCD = vsha1mq_u32(ABCD, E0, TMP0); - TMP0 = vaddq_u32(MSG2, C2); - MSG3 = vsha1su1q_u32(MSG3, MSG2); - MSG0 = vsha1su0q_u32(MSG0, MSG1, MSG2); - - // Rounds 52-55 - E0 = vsha1h_u32(vgetq_lane_u32(ABCD, 0)); - ABCD = vsha1mq_u32(ABCD, E1, TMP1); - TMP1 = vaddq_u32(MSG3, C3); - MSG0 = vsha1su1q_u32(MSG0, MSG3); - MSG1 = vsha1su0q_u32(MSG1, MSG2, MSG3); - - // Rounds 56-59 - E1 = vsha1h_u32(vgetq_lane_u32(ABCD, 0)); - ABCD = vsha1mq_u32(ABCD, E0, TMP0); - TMP0 = vaddq_u32(MSG0, C3); - MSG1 = vsha1su1q_u32(MSG1, MSG0); - MSG2 = vsha1su0q_u32(MSG2, MSG3, MSG0); - - // Rounds 60-63 - E0 = vsha1h_u32(vgetq_lane_u32(ABCD, 0)); - ABCD = vsha1pq_u32(ABCD, E1, TMP1); - 
TMP1 = vaddq_u32(MSG1, C3); - MSG2 = vsha1su1q_u32(MSG2, MSG1); - MSG3 = vsha1su0q_u32(MSG3, MSG0, MSG1); - - // Rounds 64-67 - E1 = vsha1h_u32(vgetq_lane_u32(ABCD, 0)); - ABCD = vsha1pq_u32(ABCD, E0, TMP0); - TMP0 = vaddq_u32(MSG2, C3); - MSG3 = vsha1su1q_u32(MSG3, MSG2); - MSG0 = vsha1su0q_u32(MSG0, MSG1, MSG2); - - // Rounds 68-71 - E0 = vsha1h_u32(vgetq_lane_u32(ABCD, 0)); - ABCD = vsha1pq_u32(ABCD, E1, TMP1); - TMP1 = vaddq_u32(MSG3, C3); - MSG0 = vsha1su1q_u32(MSG0, MSG3); - - // Rounds 72-75 - E1 = vsha1h_u32(vgetq_lane_u32(ABCD, 0)); - ABCD = vsha1pq_u32(ABCD, E0, TMP0); - - // Rounds 76-79 - E0 = vsha1h_u32(vgetq_lane_u32(ABCD, 0)); - ABCD = vsha1pq_u32(ABCD, E1, TMP1); - - E0 += E0_SAVED; - ABCD = vaddq_u32(ABCD_SAVED, ABCD); - - data += SHA1::BLOCKSIZE/sizeof(word32); - length -= SHA1::BLOCKSIZE; - } - - // Save state - vst1q_u32(&state[0], ABCD); - state[4] = E0; -} - -ANONYMOUS_NAMESPACE_END - -#endif // CRYPTOPP_BOOL_ARM_CRYPTO_INTRINSICS_AVAILABLE - -/////////////////////////////////////////////////////// -// end of Walton/Schneiders/O'Rourke/Hovsmith's code // -/////////////////////////////////////////////////////// - void SHA1::InitState(HashWordType *state) { - state[0] = 0x67452301L; - state[1] = 0xEFCDAB89L; - state[2] = 0x98BADCFEL; - state[3] = 0x10325476L; - state[4] = 0xC3D2E1F0L; + state[0] = 0x67452301; + state[1] = 0xEFCDAB89; + state[2] = 0x98BADCFE; + state[3] = 0x10325476; + state[4] = 0xC3D2E1F0; } void SHA1::Transform(word32 *state, const word32 *data) @@ -556,22 +145,22 @@ void SHA1::Transform(word32 *state, const word32 *data) CRYPTOPP_ASSERT(state); CRYPTOPP_ASSERT(data); -#if CRYPTOPP_BOOL_SSE_SHA_INTRINSICS_AVAILABLE +#if CRYPTOPP_SHANI_AVAILABLE if (HasSHA()) { - SHA1_SHANI_HashMultipleBlocks(state, data, SHA1::BLOCKSIZE, LITTLE_ENDIAN_ORDER); + SHA1_HashMultipleBlocks_SHANI(state, data, SHA1::BLOCKSIZE, LITTLE_ENDIAN_ORDER); return; } #endif -#if CRYPTOPP_BOOL_ARM_CRYPTO_INTRINSICS_AVAILABLE +#if CRYPTOPP_ARM_SHA_AVAILABLE if (HasSHA1()) { - SHA1_ARM_SHA_HashMultipleBlocks(state, data, SHA1::BLOCKSIZE, LITTLE_ENDIAN_ORDER); + SHA1_HashMultipleBlocks_ARMV8(state, data, SHA1::BLOCKSIZE, LITTLE_ENDIAN_ORDER); return; } #endif - SHA1_CXX_HashBlock(state, data); + SHA1_HashBlock_CXX(state, data); } size_t SHA1::HashMultipleBlocks(const word32 *input, size_t length) @@ -579,17 +168,17 @@ size_t SHA1::HashMultipleBlocks(const word32 *input, size_t length) CRYPTOPP_ASSERT(input); CRYPTOPP_ASSERT(length >= SHA1::BLOCKSIZE); -#if CRYPTOPP_BOOL_SSE_SHA_INTRINSICS_AVAILABLE +#if CRYPTOPP_SHANI_AVAILABLE if (HasSHA()) { - SHA1_SHANI_HashMultipleBlocks(m_state, input, length, BIG_ENDIAN_ORDER); + SHA1_HashMultipleBlocks_SHANI(m_state, input, length, BIG_ENDIAN_ORDER); return length & (SHA1::BLOCKSIZE - 1); } #endif -#if CRYPTOPP_BOOL_ARM_CRYPTO_INTRINSICS_AVAILABLE +#if CRYPTOPP_ARM_SHA_AVAILABLE if (HasSHA1()) { - SHA1_ARM_SHA_HashMultipleBlocks(m_state, input, length, BIG_ENDIAN_ORDER); + SHA1_HashMultipleBlocks_ARMV8(m_state, input, length, BIG_ENDIAN_ORDER); return length & (SHA1::BLOCKSIZE - 1); } #endif @@ -600,12 +189,12 @@ size_t SHA1::HashMultipleBlocks(const word32 *input, size_t length) { if (noReverse) { - SHA1_CXX_HashBlock(m_state, input); + SHA1_HashBlock_CXX(m_state, input); } else { ByteReverse(dataBuf, input, SHA1::BLOCKSIZE); - SHA1_CXX_HashBlock(m_state, dataBuf); + SHA1_HashBlock_CXX(m_state, dataBuf); } input += SHA1::BLOCKSIZE/sizeof(word32); @@ -663,7 +252,7 @@ ANONYMOUS_NAMESPACE_BEGIN #define s0(x) (rotrFixed(x,7)^rotrFixed(x,18)^(x>>3)) 
#define s1(x) (rotrFixed(x,17)^rotrFixed(x,19)^(x>>10)) -void SHA256_CXX_HashBlock(word32 *state, const word32 *data) +void SHA256_HashBlock_CXX(word32 *state, const word32 *data) { word32 W[16], T[8]; /* Copy context->state[] to working vars */ @@ -712,7 +301,7 @@ void SHA256::InitState(HashWordType *state) ANONYMOUS_NAMESPACE_BEGIN -void CRYPTOPP_FASTCALL SHA256_SSE_HashMultipleBlocks(word32 *state, const word32 *data, size_t len) +void CRYPTOPP_FASTCALL SHA256_HashMultipleBlocks_SSE2(word32 *state, const word32 *data, size_t len) { #define LOCALS_SIZE 8*4 + 16*4 + 4*WORD_SZ #define H(i) [BASE+ASM_MOD(1024+7-(i),8)*4] @@ -834,7 +423,7 @@ void CRYPTOPP_FASTCALL SHA256_SSE_HashMultipleBlocks(word32 *state, const word32 INTEL_NOPREFIX #elif defined(CRYPTOPP_GENERATE_X64_MASM) ALIGN 8 - SHA256_SSE_HashMultipleBlocks PROC FRAME + SHA256_HashMultipleBlocks_SSE2 PROC FRAME rex_push_reg rsi push_reg rdi push_reg rbx @@ -1013,7 +602,7 @@ INTEL_NOPREFIX pop rdi pop rsi ret - SHA256_SSE_HashMultipleBlocks ENDP + SHA256_HashMultipleBlocks_SSE2 ENDP #endif #ifdef __GNUC__ @@ -1039,435 +628,31 @@ ANONYMOUS_NAMESPACE_END #ifdef CRYPTOPP_X64_MASM_AVAILABLE EXPORT_TABLE "C" { -void CRYPTOPP_FASTCALL SHA256_SSE_HashMultipleBlocks(word32 *state, const word32 *data, size_t len); +void CRYPTOPP_FASTCALL SHA256_HashMultipleBlocks_SSE2(word32 *state, const word32 *data, size_t len); } #endif -/////////////////////////////////// -// start of Walton/Gulley's code // -/////////////////////////////////// - -#if CRYPTOPP_BOOL_SSE_SHA_INTRINSICS_AVAILABLE - -ANONYMOUS_NAMESPACE_BEGIN - -// Based on http://software.intel.com/en-us/articles/intel-sha-extensions and code by Sean Gulley. -void SHA256_SHANI_HashMultipleBlocks(word32 *state, const word32 *data, size_t length, ByteOrder order) -{ - CRYPTOPP_ASSERT(state); - CRYPTOPP_ASSERT(data); - CRYPTOPP_ASSERT(length >= SHA256::BLOCKSIZE); - - __m128i STATE0, STATE1; - __m128i MSG, TMP, MASK; - __m128i TMSG0, TMSG1, TMSG2, TMSG3; - __m128i ABEF_SAVE, CDGH_SAVE; - - // Load initial values - TMP = _mm_loadu_si128(M128_CAST(&state[0])); - STATE1 = _mm_loadu_si128(M128_CAST(&state[4])); - - // IA-32 SHA is little endian, SHA::Transform is big endian, - // and SHA::HashMultipleBlocks can be either. ByteOrder - // allows us to avoid extra endian reversals. It saves 1.0 cpb. - MASK = order == BIG_ENDIAN_ORDER ? 
// Data arrangement - _mm_set_epi8(12,13,14,15, 8,9,10,11, 4,5,6,7, 0,1,2,3) : - _mm_set_epi8(15,14,13,12, 11,10,9,8, 7,6,5,4, 3,2,1,0) ; - - TMP = _mm_shuffle_epi32(TMP, 0xB1); // CDAB - STATE1 = _mm_shuffle_epi32(STATE1, 0x1B); // EFGH - STATE0 = _mm_alignr_epi8(TMP, STATE1, 8); // ABEF - STATE1 = _mm_blend_epi16(STATE1, TMP, 0xF0); // CDGH - - while (length >= SHA256::BLOCKSIZE) - { - // Save current hash - ABEF_SAVE = STATE0; - CDGH_SAVE = STATE1; - - // Rounds 0-3 - MSG = _mm_loadu_si128(CONST_M128_CAST(data+0)); - TMSG0 = _mm_shuffle_epi8(MSG, MASK); - MSG = _mm_add_epi32(TMSG0, _mm_set_epi64x(W64LIT(0xE9B5DBA5B5C0FBCF), W64LIT(0x71374491428A2F98))); - STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG); - MSG = _mm_shuffle_epi32(MSG, 0x0E); - STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG); - - // Rounds 4-7 - TMSG1 = _mm_loadu_si128(CONST_M128_CAST(data+4)); - TMSG1 = _mm_shuffle_epi8(TMSG1, MASK); - MSG = _mm_add_epi32(TMSG1, _mm_set_epi64x(W64LIT(0xAB1C5ED5923F82A4), W64LIT(0x59F111F13956C25B))); - STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG); - MSG = _mm_shuffle_epi32(MSG, 0x0E); - STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG); - TMSG0 = _mm_sha256msg1_epu32(TMSG0, TMSG1); - - // Rounds 8-11 - TMSG2 = _mm_loadu_si128(CONST_M128_CAST(data+8)); - TMSG2 = _mm_shuffle_epi8(TMSG2, MASK); - MSG = _mm_add_epi32(TMSG2, _mm_set_epi64x(W64LIT(0x550C7DC3243185BE), W64LIT(0x12835B01D807AA98))); - STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG); - MSG = _mm_shuffle_epi32(MSG, 0x0E); - STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG); - TMSG1 = _mm_sha256msg1_epu32(TMSG1, TMSG2); - - // Rounds 12-15 - TMSG3 = _mm_loadu_si128(CONST_M128_CAST(data+12)); - TMSG3 = _mm_shuffle_epi8(TMSG3, MASK); - MSG = _mm_add_epi32(TMSG3, _mm_set_epi64x(W64LIT(0xC19BF1749BDC06A7), W64LIT(0x80DEB1FE72BE5D74))); - STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG); - TMP = _mm_alignr_epi8(TMSG3, TMSG2, 4); - TMSG0 = _mm_add_epi32(TMSG0, TMP); - TMSG0 = _mm_sha256msg2_epu32(TMSG0, TMSG3); - MSG = _mm_shuffle_epi32(MSG, 0x0E); - STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG); - TMSG2 = _mm_sha256msg1_epu32(TMSG2, TMSG3); - - // Rounds 16-19 - MSG = _mm_add_epi32(TMSG0, _mm_set_epi64x(W64LIT(0x240CA1CC0FC19DC6), W64LIT(0xEFBE4786E49B69C1))); - STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG); - TMP = _mm_alignr_epi8(TMSG0, TMSG3, 4); - TMSG1 = _mm_add_epi32(TMSG1, TMP); - TMSG1 = _mm_sha256msg2_epu32(TMSG1, TMSG0); - MSG = _mm_shuffle_epi32(MSG, 0x0E); - STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG); - TMSG3 = _mm_sha256msg1_epu32(TMSG3, TMSG0); - - // Rounds 20-23 - MSG = _mm_add_epi32(TMSG1, _mm_set_epi64x(W64LIT(0x76F988DA5CB0A9DC), W64LIT(0x4A7484AA2DE92C6F))); - STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG); - TMP = _mm_alignr_epi8(TMSG1, TMSG0, 4); - TMSG2 = _mm_add_epi32(TMSG2, TMP); - TMSG2 = _mm_sha256msg2_epu32(TMSG2, TMSG1); - MSG = _mm_shuffle_epi32(MSG, 0x0E); - STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG); - TMSG0 = _mm_sha256msg1_epu32(TMSG0, TMSG1); - - // Rounds 24-27 - MSG = _mm_add_epi32(TMSG2, _mm_set_epi64x(W64LIT(0xBF597FC7B00327C8), W64LIT(0xA831C66D983E5152))); - STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG); - TMP = _mm_alignr_epi8(TMSG2, TMSG1, 4); - TMSG3 = _mm_add_epi32(TMSG3, TMP); - TMSG3 = _mm_sha256msg2_epu32(TMSG3, TMSG2); - MSG = _mm_shuffle_epi32(MSG, 0x0E); - STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG); - TMSG1 = _mm_sha256msg1_epu32(TMSG1, TMSG2); - - // Rounds 28-31 - MSG = _mm_add_epi32(TMSG3, 
_mm_set_epi64x(W64LIT(0x1429296706CA6351), W64LIT(0xD5A79147C6E00BF3))); - STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG); - TMP = _mm_alignr_epi8(TMSG3, TMSG2, 4); - TMSG0 = _mm_add_epi32(TMSG0, TMP); - TMSG0 = _mm_sha256msg2_epu32(TMSG0, TMSG3); - MSG = _mm_shuffle_epi32(MSG, 0x0E); - STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG); - TMSG2 = _mm_sha256msg1_epu32(TMSG2, TMSG3); - - // Rounds 32-35 - MSG = _mm_add_epi32(TMSG0, _mm_set_epi64x(W64LIT(0x53380D134D2C6DFC), W64LIT(0x2E1B213827B70A85))); - STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG); - TMP = _mm_alignr_epi8(TMSG0, TMSG3, 4); - TMSG1 = _mm_add_epi32(TMSG1, TMP); - TMSG1 = _mm_sha256msg2_epu32(TMSG1, TMSG0); - MSG = _mm_shuffle_epi32(MSG, 0x0E); - STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG); - TMSG3 = _mm_sha256msg1_epu32(TMSG3, TMSG0); - - // Rounds 36-39 - MSG = _mm_add_epi32(TMSG1, _mm_set_epi64x(W64LIT(0x92722C8581C2C92E), W64LIT(0x766A0ABB650A7354))); - STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG); - TMP = _mm_alignr_epi8(TMSG1, TMSG0, 4); - TMSG2 = _mm_add_epi32(TMSG2, TMP); - TMSG2 = _mm_sha256msg2_epu32(TMSG2, TMSG1); - MSG = _mm_shuffle_epi32(MSG, 0x0E); - STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG); - TMSG0 = _mm_sha256msg1_epu32(TMSG0, TMSG1); - - // Rounds 40-43 - MSG = _mm_add_epi32(TMSG2, _mm_set_epi64x(W64LIT(0xC76C51A3C24B8B70), W64LIT(0xA81A664BA2BFE8A1))); - STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG); - TMP = _mm_alignr_epi8(TMSG2, TMSG1, 4); - TMSG3 = _mm_add_epi32(TMSG3, TMP); - TMSG3 = _mm_sha256msg2_epu32(TMSG3, TMSG2); - MSG = _mm_shuffle_epi32(MSG, 0x0E); - STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG); - TMSG1 = _mm_sha256msg1_epu32(TMSG1, TMSG2); - - // Rounds 44-47 - MSG = _mm_add_epi32(TMSG3, _mm_set_epi64x(W64LIT(0x106AA070F40E3585), W64LIT(0xD6990624D192E819))); - STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG); - TMP = _mm_alignr_epi8(TMSG3, TMSG2, 4); - TMSG0 = _mm_add_epi32(TMSG0, TMP); - TMSG0 = _mm_sha256msg2_epu32(TMSG0, TMSG3); - MSG = _mm_shuffle_epi32(MSG, 0x0E); - STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG); - TMSG2 = _mm_sha256msg1_epu32(TMSG2, TMSG3); - - // Rounds 48-51 - MSG = _mm_add_epi32(TMSG0, _mm_set_epi64x(W64LIT(0x34B0BCB52748774C), W64LIT(0x1E376C0819A4C116))); - STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG); - TMP = _mm_alignr_epi8(TMSG0, TMSG3, 4); - TMSG1 = _mm_add_epi32(TMSG1, TMP); - TMSG1 = _mm_sha256msg2_epu32(TMSG1, TMSG0); - MSG = _mm_shuffle_epi32(MSG, 0x0E); - STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG); - TMSG3 = _mm_sha256msg1_epu32(TMSG3, TMSG0); - - // Rounds 52-55 - MSG = _mm_add_epi32(TMSG1, _mm_set_epi64x(W64LIT(0x682E6FF35B9CCA4F), W64LIT(0x4ED8AA4A391C0CB3))); - STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG); - TMP = _mm_alignr_epi8(TMSG1, TMSG0, 4); - TMSG2 = _mm_add_epi32(TMSG2, TMP); - TMSG2 = _mm_sha256msg2_epu32(TMSG2, TMSG1); - MSG = _mm_shuffle_epi32(MSG, 0x0E); - STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG); - - // Rounds 56-59 - MSG = _mm_add_epi32(TMSG2, _mm_set_epi64x(W64LIT(0x8CC7020884C87814), W64LIT(0x78A5636F748F82EE))); - STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG); - TMP = _mm_alignr_epi8(TMSG2, TMSG1, 4); - TMSG3 = _mm_add_epi32(TMSG3, TMP); - TMSG3 = _mm_sha256msg2_epu32(TMSG3, TMSG2); - MSG = _mm_shuffle_epi32(MSG, 0x0E); - STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG); - - // Rounds 60-63 - MSG = _mm_add_epi32(TMSG3, _mm_set_epi64x(W64LIT(0xC67178F2BEF9A3F7), W64LIT(0xA4506CEB90BEFFFA))); - STATE1 = _mm_sha256rnds2_epu32(STATE1, 
STATE0, MSG); - MSG = _mm_shuffle_epi32(MSG, 0x0E); - STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG); - - // Add values back to state - STATE0 = _mm_add_epi32(STATE0, ABEF_SAVE); - STATE1 = _mm_add_epi32(STATE1, CDGH_SAVE); - - data += SHA256::BLOCKSIZE/sizeof(word32); - length -= SHA256::BLOCKSIZE; - } - - TMP = _mm_shuffle_epi32(STATE0, 0x1B); // FEBA - STATE1 = _mm_shuffle_epi32(STATE1, 0xB1); // DCHG - STATE0 = _mm_blend_epi16(TMP, STATE1, 0xF0); // DCBA - STATE1 = _mm_alignr_epi8(STATE1, TMP, 8); // ABEF - - // Save state - _mm_storeu_si128(M128_CAST(&state[0]), STATE0); - _mm_storeu_si128(M128_CAST(&state[4]), STATE1); -} - -ANONYMOUS_NAMESPACE_END - -#endif // CRYPTOPP_BOOL_SSE_SHA_INTRINSICS_AVAILABLE - -///////////////////////////////// -// end of Walton/Gulley's code // -///////////////////////////////// - -///////////////////////////////////////////////////////// -// start of Walton/Schneiders/O'Rourke/Hovsmith's code // -///////////////////////////////////////////////////////// - -#if CRYPTOPP_BOOL_ARM_CRYPTO_INTRINSICS_AVAILABLE - -ANONYMOUS_NAMESPACE_BEGIN - -void SHA256_ARM_SHA_HashMultipleBlocks(word32 *state, const word32 *data, size_t length, ByteOrder order) -{ - CRYPTOPP_ASSERT(state); - CRYPTOPP_ASSERT(data); - CRYPTOPP_ASSERT(length >= SHA256::BLOCKSIZE); - - uint32x4_t STATE0, STATE1, ABEF_SAVE, CDGH_SAVE; - uint32x4_t MSG0, MSG1, MSG2, MSG3; - uint32x4_t TMP0, TMP1, TMP2; - - // Load initial values - STATE0 = vld1q_u32(&state[0]); - STATE1 = vld1q_u32(&state[4]); - - while (length >= SHA256::BLOCKSIZE) - { - // Save current hash - ABEF_SAVE = STATE0; - CDGH_SAVE = STATE1; - - // Load message - MSG0 = vld1q_u32(data + 0); - MSG1 = vld1q_u32(data + 4); - MSG2 = vld1q_u32(data + 8); - MSG3 = vld1q_u32(data + 12); - - if (order == BIG_ENDIAN_ORDER) // Data arrangement - { - MSG0 = vreinterpretq_u32_u8(vrev32q_u8(vreinterpretq_u8_u32(MSG0))); - MSG1 = vreinterpretq_u32_u8(vrev32q_u8(vreinterpretq_u8_u32(MSG1))); - MSG2 = vreinterpretq_u32_u8(vrev32q_u8(vreinterpretq_u8_u32(MSG2))); - MSG3 = vreinterpretq_u32_u8(vrev32q_u8(vreinterpretq_u8_u32(MSG3))); - } - - TMP0 = vaddq_u32(MSG0, vld1q_u32(&SHA256_K[0x00])); - - // Rounds 0-3 - MSG0 = vsha256su0q_u32(MSG0, MSG1); - TMP2 = STATE0; - TMP1 = vaddq_u32(MSG1, vld1q_u32(&SHA256_K[0x04])); - STATE0 = vsha256hq_u32(STATE0, STATE1, TMP0); - STATE1 = vsha256h2q_u32(STATE1, TMP2, TMP0); - MSG0 = vsha256su1q_u32(MSG0, MSG2, MSG3);; - - // Rounds 4-7 - MSG1 = vsha256su0q_u32(MSG1, MSG2); - TMP2 = STATE0; - TMP0 = vaddq_u32(MSG2, vld1q_u32(&SHA256_K[0x08])); - STATE0 = vsha256hq_u32(STATE0, STATE1, TMP1); - STATE1 = vsha256h2q_u32(STATE1, TMP2, TMP1); - MSG1 = vsha256su1q_u32(MSG1, MSG3, MSG0);; - - // Rounds 8-11 - MSG2 = vsha256su0q_u32(MSG2, MSG3); - TMP2 = STATE0; - TMP1 = vaddq_u32(MSG3, vld1q_u32(&SHA256_K[0x0c])); - STATE0 = vsha256hq_u32(STATE0, STATE1, TMP0); - STATE1 = vsha256h2q_u32(STATE1, TMP2, TMP0); - MSG2 = vsha256su1q_u32(MSG2, MSG0, MSG1);; - - // Rounds 12-15 - MSG3 = vsha256su0q_u32(MSG3, MSG0); - TMP2 = STATE0; - TMP0 = vaddq_u32(MSG0, vld1q_u32(&SHA256_K[0x10])); - STATE0 = vsha256hq_u32(STATE0, STATE1, TMP1); - STATE1 = vsha256h2q_u32(STATE1, TMP2, TMP1); - MSG3 = vsha256su1q_u32(MSG3, MSG1, MSG2);; - - // Rounds 16-19 - MSG0 = vsha256su0q_u32(MSG0, MSG1); - TMP2 = STATE0; - TMP1 = vaddq_u32(MSG1, vld1q_u32(&SHA256_K[0x14])); - STATE0 = vsha256hq_u32(STATE0, STATE1, TMP0); - STATE1 = vsha256h2q_u32(STATE1, TMP2, TMP0); - MSG0 = vsha256su1q_u32(MSG0, MSG2, MSG3);; - - // Rounds 20-23 - MSG1 = 
vsha256su0q_u32(MSG1, MSG2); - TMP2 = STATE0; - TMP0 = vaddq_u32(MSG2, vld1q_u32(&SHA256_K[0x18])); - STATE0 = vsha256hq_u32(STATE0, STATE1, TMP1); - STATE1 = vsha256h2q_u32(STATE1, TMP2, TMP1); - MSG1 = vsha256su1q_u32(MSG1, MSG3, MSG0);; - - // Rounds 24-27 - MSG2 = vsha256su0q_u32(MSG2, MSG3); - TMP2 = STATE0; - TMP1 = vaddq_u32(MSG3, vld1q_u32(&SHA256_K[0x1c])); - STATE0 = vsha256hq_u32(STATE0, STATE1, TMP0); - STATE1 = vsha256h2q_u32(STATE1, TMP2, TMP0); - MSG2 = vsha256su1q_u32(MSG2, MSG0, MSG1);; - - // Rounds 28-31 - MSG3 = vsha256su0q_u32(MSG3, MSG0); - TMP2 = STATE0; - TMP0 = vaddq_u32(MSG0, vld1q_u32(&SHA256_K[0x20])); - STATE0 = vsha256hq_u32(STATE0, STATE1, TMP1); - STATE1 = vsha256h2q_u32(STATE1, TMP2, TMP1); - MSG3 = vsha256su1q_u32(MSG3, MSG1, MSG2);; - - // Rounds 32-35 - MSG0 = vsha256su0q_u32(MSG0, MSG1); - TMP2 = STATE0; - TMP1 = vaddq_u32(MSG1, vld1q_u32(&SHA256_K[0x24])); - STATE0 = vsha256hq_u32(STATE0, STATE1, TMP0); - STATE1 = vsha256h2q_u32(STATE1, TMP2, TMP0); - MSG0 = vsha256su1q_u32(MSG0, MSG2, MSG3);; - - // Rounds 36-39 - MSG1 = vsha256su0q_u32(MSG1, MSG2); - TMP2 = STATE0; - TMP0 = vaddq_u32(MSG2, vld1q_u32(&SHA256_K[0x28])); - STATE0 = vsha256hq_u32(STATE0, STATE1, TMP1); - STATE1 = vsha256h2q_u32(STATE1, TMP2, TMP1); - MSG1 = vsha256su1q_u32(MSG1, MSG3, MSG0);; - - // Rounds 40-43 - MSG2 = vsha256su0q_u32(MSG2, MSG3); - TMP2 = STATE0; - TMP1 = vaddq_u32(MSG3, vld1q_u32(&SHA256_K[0x2c])); - STATE0 = vsha256hq_u32(STATE0, STATE1, TMP0); - STATE1 = vsha256h2q_u32(STATE1, TMP2, TMP0); - MSG2 = vsha256su1q_u32(MSG2, MSG0, MSG1);; - - // Rounds 44-47 - MSG3 = vsha256su0q_u32(MSG3, MSG0); - TMP2 = STATE0; - TMP0 = vaddq_u32(MSG0, vld1q_u32(&SHA256_K[0x30])); - STATE0 = vsha256hq_u32(STATE0, STATE1, TMP1); - STATE1 = vsha256h2q_u32(STATE1, TMP2, TMP1); - MSG3 = vsha256su1q_u32(MSG3, MSG1, MSG2);; - - // Rounds 48-51 - TMP2 = STATE0; - TMP1 = vaddq_u32(MSG1, vld1q_u32(&SHA256_K[0x34])); - STATE0 = vsha256hq_u32(STATE0, STATE1, TMP0); - STATE1 = vsha256h2q_u32(STATE1, TMP2, TMP0);; - - // Rounds 52-55 - TMP2 = STATE0; - TMP0 = vaddq_u32(MSG2, vld1q_u32(&SHA256_K[0x38])); - STATE0 = vsha256hq_u32(STATE0, STATE1, TMP1); - STATE1 = vsha256h2q_u32(STATE1, TMP2, TMP1);; - - // Rounds 56-59 - TMP2 = STATE0; - TMP1 = vaddq_u32(MSG3, vld1q_u32(&SHA256_K[0x3c])); - STATE0 = vsha256hq_u32(STATE0, STATE1, TMP0); - STATE1 = vsha256h2q_u32(STATE1, TMP2, TMP0);; - - // Rounds 60-63 - TMP2 = STATE0; - STATE0 = vsha256hq_u32(STATE0, STATE1, TMP1); - STATE1 = vsha256h2q_u32(STATE1, TMP2, TMP1);; - - // Add back to state - STATE0 = vaddq_u32(STATE0, ABEF_SAVE); - STATE1 = vaddq_u32(STATE1, CDGH_SAVE); - - data += SHA256::BLOCKSIZE/sizeof(word32); - length -= SHA256::BLOCKSIZE; - } - - // Save state - vst1q_u32(&state[0], STATE0); - vst1q_u32(&state[4], STATE1); -} - -ANONYMOUS_NAMESPACE_END - -#endif // CRYPTOPP_BOOL_ARM_CRYPTO_INTRINSICS_AVAILABLE - -/////////////////////////////////////////////////////// -// end of Walton/Schneiders/O'Rourke/Hovsmith's code // -/////////////////////////////////////////////////////// - void SHA256::Transform(word32 *state, const word32 *data) { CRYPTOPP_ASSERT(state); CRYPTOPP_ASSERT(data); -#if CRYPTOPP_BOOL_SSE_SHA_INTRINSICS_AVAILABLE +#if CRYPTOPP_SHANI_AVAILABLE if (HasSHA()) { - SHA256_SHANI_HashMultipleBlocks(state, data, SHA256::BLOCKSIZE, LITTLE_ENDIAN_ORDER); + SHA256_HashMultipleBlocks_SHANI(state, data, SHA256::BLOCKSIZE, LITTLE_ENDIAN_ORDER); return; } #endif -#if CRYPTOPP_BOOL_ARM_CRYPTO_INTRINSICS_AVAILABLE +#if 
CRYPTOPP_ARM_SHA_AVAILABLE if (HasSHA2()) { - SHA256_ARM_SHA_HashMultipleBlocks(state, data, SHA256::BLOCKSIZE, LITTLE_ENDIAN_ORDER); + SHA256_HashMultipleBlocks_ARMV8(state, data, SHA256::BLOCKSIZE, LITTLE_ENDIAN_ORDER); return; } #endif - SHA256_CXX_HashBlock(state, data); + SHA256_HashBlock_CXX(state, data); } size_t SHA256::HashMultipleBlocks(const word32 *input, size_t length) @@ -1475,10 +660,10 @@ size_t SHA256::HashMultipleBlocks(const word32 *input, size_t length) CRYPTOPP_ASSERT(input); CRYPTOPP_ASSERT(length >= SHA256::BLOCKSIZE); -#if CRYPTOPP_BOOL_SSE_SHA_INTRINSICS_AVAILABLE +#if CRYPTOPP_SHANI_AVAILABLE if (HasSHA()) { - SHA256_SHANI_HashMultipleBlocks(m_state, input, length, BIG_ENDIAN_ORDER); + SHA256_HashMultipleBlocks_SHANI(m_state, input, length, BIG_ENDIAN_ORDER); return length & (SHA256::BLOCKSIZE - 1); } #endif @@ -1486,14 +671,14 @@ size_t SHA256::HashMultipleBlocks(const word32 *input, size_t length) if (HasSSE2()) { const size_t res = length & (SHA256::BLOCKSIZE - 1); - SHA256_SSE_HashMultipleBlocks(m_state, input, length-res); + SHA256_HashMultipleBlocks_SSE2(m_state, input, length-res); return res; } #endif -#if CRYPTOPP_BOOL_ARM_CRYPTO_INTRINSICS_AVAILABLE +#if CRYPTOPP_ARM_SHA_AVAILABLE if (HasSHA2()) { - SHA256_ARM_SHA_HashMultipleBlocks(m_state, input, length, BIG_ENDIAN_ORDER); + SHA256_HashMultipleBlocks_ARMV8(m_state, input, length, BIG_ENDIAN_ORDER); return length & (SHA256::BLOCKSIZE - 1); } #endif @@ -1504,12 +689,12 @@ size_t SHA256::HashMultipleBlocks(const word32 *input, size_t length) { if (noReverse) { - SHA256_CXX_HashBlock(m_state, input); + SHA256_HashBlock_CXX(m_state, input); } else { ByteReverse(dataBuf, input, SHA256::BLOCKSIZE); - SHA256_CXX_HashBlock(m_state, dataBuf); + SHA256_HashBlock_CXX(m_state, dataBuf); } input += SHA256::BLOCKSIZE/sizeof(word32); @@ -1521,13 +706,13 @@ size_t SHA256::HashMultipleBlocks(const word32 *input, size_t length) size_t SHA224::HashMultipleBlocks(const word32 *input, size_t length) { - CRYPTOPP_ASSERT(input); + CRYPTOPP_ASSERT(input); CRYPTOPP_ASSERT(length >= SHA256::BLOCKSIZE); -#if CRYPTOPP_BOOL_SSE_SHA_INTRINSICS_AVAILABLE +#if CRYPTOPP_SHANI_AVAILABLE if (HasSHA()) { - SHA256_SHANI_HashMultipleBlocks(m_state, input, length, BIG_ENDIAN_ORDER); + SHA256_HashMultipleBlocks_SHANI(m_state, input, length, BIG_ENDIAN_ORDER); return length & (SHA256::BLOCKSIZE - 1); } #endif @@ -1535,14 +720,14 @@ size_t SHA224::HashMultipleBlocks(const word32 *input, size_t length) if (HasSSE2()) { const size_t res = length & (SHA256::BLOCKSIZE - 1); - SHA256_SSE_HashMultipleBlocks(m_state, input, length-res); + SHA256_HashMultipleBlocks_SSE2(m_state, input, length-res); return res; } #endif -#if CRYPTOPP_BOOL_ARM_CRYPTO_INTRINSICS_AVAILABLE +#if CRYPTOPP_ARM_SHA_AVAILABLE if (HasSHA2()) { - SHA256_ARM_SHA_HashMultipleBlocks(m_state, input, length, BIG_ENDIAN_ORDER); + SHA256_HashMultipleBlocks_ARMV8(m_state, input, length, BIG_ENDIAN_ORDER); return length & (SHA256::BLOCKSIZE - 1); } #endif @@ -1553,12 +738,12 @@ size_t SHA224::HashMultipleBlocks(const word32 *input, size_t length) { if (noReverse) { - SHA256_CXX_HashBlock(m_state, input); + SHA256_HashBlock_CXX(m_state, input); } else { ByteReverse(dataBuf, input, SHA256::BLOCKSIZE); - SHA256_CXX_HashBlock(m_state, dataBuf); + SHA256_HashBlock_CXX(m_state, dataBuf); } input += SHA256::BLOCKSIZE/sizeof(word32); @@ -1591,7 +776,7 @@ void SHA512::InitState(HashWordType *state) } CRYPTOPP_ALIGN_DATA(16) -static const word64 SHA512_K[80] CRYPTOPP_SECTION_ALIGN16 = { +const 
word64 SHA512_K[80] CRYPTOPP_SECTION_ALIGN16 = { W64LIT(0x428a2f98d728ae22), W64LIT(0x7137449123ef65cd), W64LIT(0xb5c0fbcfec4d3b2f), W64LIT(0xe9b5dba58189dbbc), W64LIT(0x3956c25bf348b538), W64LIT(0x59f111f1b605d019), @@ -1638,7 +823,7 @@ static const word64 SHA512_K[80] CRYPTOPP_SECTION_ALIGN16 = { ANONYMOUS_NAMESPACE_BEGIN -CRYPTOPP_NAKED void CRYPTOPP_FASTCALL SHA512_SSE2_Transform(word64 *state, const word64 *data) +CRYPTOPP_NAKED void CRYPTOPP_FASTCALL SHA512_HashBlock_SSE2(word64 *state, const word64 *data) { #ifdef __GNUC__ __asm__ __volatile__ @@ -1844,9 +1029,9 @@ ANONYMOUS_NAMESPACE_BEGIN #define s1(x) (rotrFixed(x,19)^rotrFixed(x,61)^(x>>6)) #define R(i) h(i)+=S1(e(i))+Ch(e(i),f(i),g(i))+SHA512_K[i+j]+\ - (j?blk2(i):blk0(i));d(i)+=h(i);h(i)+=S0(a(i))+Maj(a(i),b(i),c(i)) + (j?blk2(i):blk0(i));d(i)+=h(i);h(i)+=S0(a(i))+Maj(a(i),b(i),c(i)) -void SHA512_CXX_HashBlock(word64 *state, const word64 *data) +void SHA512_HashBlock_CXX(word64 *state, const word64 *data) { CRYPTOPP_ASSERT(state); CRYPTOPP_ASSERT(data); @@ -1884,12 +1069,12 @@ void SHA512::Transform(word64 *state, const word64 *data) #if CRYPTOPP_BOOL_SSE2_ASM_AVAILABLE && (CRYPTOPP_BOOL_X86 || CRYPTOPP_BOOL_X32) if (HasSSE2()) { - SHA512_SSE2_Transform(state, data); + SHA512_HashBlock_SSE2(state, data); return; } #endif - SHA512_CXX_HashBlock(state, data); + SHA512_HashBlock_CXX(state, data); } NAMESPACE_END diff --git a/shacal2-simd.cpp b/shacal2-simd.cpp new file mode 100644 index 000000000..9597cc90c --- /dev/null +++ b/shacal2-simd.cpp @@ -0,0 +1,111 @@ +// shacla2-simd.cpp - written and placed in the public domain by +// Jeffrey Walton and Jack Lloyd +// +// Jack Lloyd and the Botan team allowed Crypto++ to use parts of +// Botan's implementation under the same license as Crypto++ +// is released. The code for SHACAL2_Enc_ProcessAndXorBlock_SHANI +// below is Botan's x86_encrypt_blocks with minor tweaks. Many thanks +// to the Botan team. Also see http://github.com/randombit/botan/. +// +// This source file uses intrinsics to gain access to SHA-NI and +// ARMv8a SHA instructions. A separate source file is needed because +// additional CXXFLAGS are required to enable the appropriate instructions +// sets in some build configurations. + +#include "pch.h" +#include "config.h" +#include "sha.h" +#include "misc.h" + +// Clang and GCC hoops... +#if !(defined(__ARM_FEATURE_CRYPTO) || defined(_MSC_VER)) +# undef CRYPTOPP_ARM_SHA_AVAILABLE +#endif + +#if (CRYPTOPP_SHANI_AVAILABLE) +# include "nmmintrin.h" +# include "immintrin.h" +#endif + +#if (CRYPTOPP_ARM_SHA_AVAILABLE) +# include "arm_neon.h" +#endif + +// Don't include when using Apple Clang. Early Apple compilers +// fail to compile with included. Later Apple compilers compile +// intrinsics without included. 
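As in sha.cpp, the preprocessor guards above only decide what gets compiled into this translation unit; whether the SHA-NI routine actually runs is decided in shacal2.cpp by the HasSHA() check further down. From the caller's side SHACAL-2 remains an ordinary 32-byte block cipher. A small illustrative sketch with the standard mode templates, using an all-zero key and block rather than any values from the patch:

```cpp
// SHACAL-2 through the normal block-cipher interface. When HasSHA() reports
// Intel SHA extensions, ProcessAndXorBlock ends up in the _SHANI routine
// defined in this file; otherwise the portable rounds in shacal2.cpp run.
#include "shacal2.h"
#include "modes.h"      // ECB_Mode
#include <iostream>
#include <iomanip>

int main()
{
    using namespace CryptoPP;

    byte key[32] = {0};                   // SHACAL-2 accepts 16..64 byte keys
    byte pt[SHACAL2::BLOCKSIZE] = {0};    // 32-byte block
    byte ct[SHACAL2::BLOCKSIZE];

    ECB_Mode<SHACAL2>::Encryption enc;
    enc.SetKey(key, sizeof(key));
    enc.ProcessData(ct, pt, sizeof(pt));  // exactly one block

    for (size_t i = 0; i < sizeof(ct); ++i)
        std::cout << std::hex << std::setw(2) << std::setfill('0') << (unsigned)ct[i];
    std::cout << std::endl;
    return 0;
}
```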
+#if (CRYPTOPP_ARM_SHA_AVAILABLE) && !defined(CRYPTOPP_APPLE_CLANG_VERSION) +# include "arm_acle.h" +#endif + +// Clang __m128i casts +#define M128_CAST(x) ((__m128i *)(void *)(x)) +#define CONST_M128_CAST(x) ((const __m128i *)(const void *)(x)) + +NAMESPACE_BEGIN(CryptoPP) + +#if CRYPTOPP_SHANI_AVAILABLE +void SHACAL2_Enc_ProcessAndXorBlock_SHANI(const word32* subKeys, const byte *inBlock, const byte *xorBlock, byte *outBlock) +{ + CRYPTOPP_ASSERT(subKeys); + CRYPTOPP_ASSERT(inBlock); + CRYPTOPP_ASSERT(outBlock); + + const __m128i MASK1 = _mm_set_epi8(8,9,10,11, 12,13,14,15, 0,1,2,3, 4,5,6,7); + const __m128i MASK2 = _mm_set_epi8(0,1,2,3, 4,5,6,7, 8,9,10,11, 12,13,14,15); + + __m128i B0 = _mm_shuffle_epi8(_mm_loadu_si128(CONST_M128_CAST(inBlock + 0)), MASK1); + __m128i B1 = _mm_shuffle_epi8(_mm_loadu_si128(CONST_M128_CAST(inBlock + 16)), MASK2); + + __m128i TMP = _mm_alignr_epi8(B0, B1, 8); + B1 = _mm_blend_epi16(B1, B0, 0xF0); + B0 = TMP; + +#if 0 + // SSE2 + SSSE3, but 0.2 cpb slower on a Celeraon J3455 + const __m128i MASK1 = _mm_set_epi8(8,9,10,11, 12,13,14,15, 0,1,2,3, 4,5,6,7); + const __m128i MASK2 = _mm_set_epi8(0,1,2,3, 4,5,6,7, 8,9,10,11, 12,13,14,15); + + __m128i B0 = _mm_loadu_si128(CONST_M128_CAST(inBlock + 0)); + __m128i B1 = _mm_loadu_si128(CONST_M128_CAST(inBlock + 16)); + + __m128i TMP = _mm_shuffle_epi8(_mm_unpacklo_epi64(B0, B1), MASK2); + B1 = _mm_shuffle_epi8(_mm_unpackhi_epi64(B0, B1), MASK2); + B0 = TMP; +#endif + + const byte* keys = reinterpret_cast(subKeys); + for (size_t i = 0; i != 8; ++i) + { + const __m128i RK0 = _mm_load_si128(CONST_M128_CAST(keys + 32*i)); + const __m128i RK2 = _mm_load_si128(CONST_M128_CAST(keys + 32*i+16)); + const __m128i RK1 = _mm_srli_si128(RK0, 8); + const __m128i RK3 = _mm_srli_si128(RK2, 8); + + B1 = _mm_sha256rnds2_epu32(B1, B0, RK0); + B0 = _mm_sha256rnds2_epu32(B0, B1, RK1); + B1 = _mm_sha256rnds2_epu32(B1, B0, RK2); + B0 = _mm_sha256rnds2_epu32(B0, B1, RK3); + } + + TMP = _mm_shuffle_epi8(_mm_unpackhi_epi64(B0, B1), MASK1); + B1 = _mm_shuffle_epi8(_mm_unpacklo_epi64(B0, B1), MASK1); + B0 = TMP; + + if (xorBlock) + { + _mm_storeu_si128(M128_CAST(outBlock + 0), + _mm_xor_si128(B0, _mm_loadu_si128(CONST_M128_CAST(xorBlock + 0)))); + + _mm_storeu_si128(M128_CAST(outBlock + 16), + _mm_xor_si128(B1, _mm_loadu_si128(CONST_M128_CAST(xorBlock + 16)))); + } + else + { + _mm_storeu_si128(M128_CAST(outBlock + 0), B0); + _mm_storeu_si128(M128_CAST(outBlock + 16), B1); + } +} +#endif + +NAMESPACE_END diff --git a/shacal2.cpp b/shacal2.cpp index 9ab3b3731..afb5f4d88 100644 --- a/shacal2.cpp +++ b/shacal2.cpp @@ -14,87 +14,12 @@ #include "pch.h" #include "config.h" #include "shacal2.h" +#include "cpu.h" #include "misc.h" #include "cpu.h" -#if CRYPTOPP_BOOL_SSE_SHA_INTRINSICS_AVAILABLE -#include -#endif - -// Clang __m128i casts -#define M128_CAST(x) ((__m128i *)(void *)(x)) -#define CONST_M128_CAST(x) ((const __m128i *)(const void *)(x)) - NAMESPACE_BEGIN(CryptoPP) -ANONYMOUS_NAMESPACE_BEGIN - -#if CRYPTOPP_BOOL_SSE_SHA_INTRINSICS_AVAILABLE -void SHACAL2_Enc_ProcessAndXorBlock_SHANI(const word32* subKeys, const byte *inBlock, const byte *xorBlock, byte *outBlock) -{ - CRYPTOPP_ASSERT(subKeys); - CRYPTOPP_ASSERT(inBlock); - CRYPTOPP_ASSERT(outBlock); - - const __m128i MASK1 = _mm_set_epi8(8,9,10,11, 12,13,14,15, 0,1,2,3, 4,5,6,7); - const __m128i MASK2 = _mm_set_epi8(0,1,2,3, 4,5,6,7, 8,9,10,11, 12,13,14,15); - - __m128i B0 = _mm_shuffle_epi8(_mm_loadu_si128(CONST_M128_CAST(inBlock + 0)), MASK1); - __m128i B1 = 
-
-    __m128i TMP = _mm_alignr_epi8(B0, B1, 8);
-    B1 = _mm_blend_epi16(B1, B0, 0xF0);
-    B0 = TMP;
-
-#if 0
-    // SSE2 + SSSE3, but 0.2 cpb slower on a Celeraon J3455
-    const __m128i MASK1 = _mm_set_epi8(8,9,10,11, 12,13,14,15, 0,1,2,3, 4,5,6,7);
-    const __m128i MASK2 = _mm_set_epi8(0,1,2,3, 4,5,6,7, 8,9,10,11, 12,13,14,15);
-
-    __m128i B0 = _mm_loadu_si128(CONST_M128_CAST(inBlock + 0));
-    __m128i B1 = _mm_loadu_si128(CONST_M128_CAST(inBlock + 16));
-
-    __m128i TMP = _mm_shuffle_epi8(_mm_unpacklo_epi64(B0, B1), MASK2);
-    B1 = _mm_shuffle_epi8(_mm_unpackhi_epi64(B0, B1), MASK2);
-    B0 = TMP;
-#endif
-
-    const byte* keys = reinterpret_cast<const byte*>(subKeys);
-    for (size_t i = 0; i != 8; ++i)
-    {
-        const __m128i RK0 = _mm_load_si128(CONST_M128_CAST(keys + 32*i));
-        const __m128i RK2 = _mm_load_si128(CONST_M128_CAST(keys + 32*i+16));
-        const __m128i RK1 = _mm_srli_si128(RK0, 8);
-        const __m128i RK3 = _mm_srli_si128(RK2, 8);
-
-        B1 = _mm_sha256rnds2_epu32(B1, B0, RK0);
-        B0 = _mm_sha256rnds2_epu32(B0, B1, RK1);
-        B1 = _mm_sha256rnds2_epu32(B1, B0, RK2);
-        B0 = _mm_sha256rnds2_epu32(B0, B1, RK3);
-    }
-
-    TMP = _mm_shuffle_epi8(_mm_unpackhi_epi64(B0, B1), MASK1);
-    B1 = _mm_shuffle_epi8(_mm_unpacklo_epi64(B0, B1), MASK1);
-    B0 = TMP;
-
-    if (xorBlock)
-    {
-        _mm_storeu_si128(M128_CAST(outBlock + 0),
-            _mm_xor_si128(B0, _mm_loadu_si128(CONST_M128_CAST(xorBlock + 0))));
-
-        _mm_storeu_si128(M128_CAST(outBlock + 16),
-            _mm_xor_si128(B1, _mm_loadu_si128(CONST_M128_CAST(xorBlock + 16))));
-    }
-    else
-    {
-        _mm_storeu_si128(M128_CAST(outBlock + 0), B0);
-        _mm_storeu_si128(M128_CAST(outBlock + 16), B1);
-    }
-}
-#endif
-
-ANONYMOUS_NAMESPACE_END
-
 // SHACAL-2 function and round definitions
 
 #define S0(x) (rotrFixed(x,2)^rotrFixed(x,13)^rotrFixed(x,22))
 
@@ -115,6 +40,11 @@ ANONYMOUS_NAMESPACE_END
 #define P(a,b,c,d,e,f,g,h,k) \
     h-=S0(a)+Maj(a,b,c);d-=h;h-=S1(e)+Ch(e,f,g)+*--k;
 
+#if CRYPTOPP_SHANI_AVAILABLE
+extern void SHACAL2_Enc_ProcessAndXorBlock_SHANI(const word32* subKeys,
+            const byte *inBlock, const byte *xorBlock, byte *outBlock);
+#endif
+
 void SHACAL2::Base::UncheckedSetKey(const byte *userKey, unsigned int keylen, const NameValuePairs &)
 {
     AssertValidKeyLength(keylen);
@@ -138,7 +68,7 @@ typedef BlockGetAndPut<word32, BigEndian> Block;
 
 void SHACAL2::Enc::ProcessAndXorBlock(const byte *inBlock, const byte *xorBlock, byte *outBlock) const
 {
-#if CRYPTOPP_BOOL_SSE_SHA_INTRINSICS_AVAILABLE
+#if CRYPTOPP_SHANI_AVAILABLE
     if (HasSHA())
     {
        SHACAL2_Enc_ProcessAndXorBlock_SHANI(m_key, inBlock, xorBlock, outBlock);
diff --git a/validat1.cpp b/validat1.cpp
index 58ddcae91..d49ff812c 100644
--- a/validat1.cpp
+++ b/validat1.cpp
@@ -344,7 +344,8 @@ bool TestSettings()
 #ifdef CRYPTOPP_CPUID_AVAILABLE
     bool hasSSE2 = HasSSE2();
     bool hasSSSE3 = HasSSSE3();
-    bool hasSSE4 = HasSSE4();
+    bool hasSSE41 = HasSSE41();
+    bool hasSSE42 = HasSSE42();
     bool isP4 = IsP4();
     int cacheLineSize = GetCacheLineSize();
 
@@ -356,21 +357,21 @@ bool TestSettings()
     else
         std::cout << "passed: ";
 
-    std::cout << "hasSSE2 == " << hasSSE2 << ", hasSSSE3 == " << hasSSSE3 << ", hasSSE4 == " << hasSSE4;
-    std::cout << ", hasAESNI == " << HasAESNI() << ", hasCLMUL == " << HasCLMUL();
-    std::cout << ", hasRDRAND == " << HasRDRAND() << ", hasRDSEED == " << HasRDSEED();
+    std::cout << "hasSSE2 == " << hasSSE2 << ", hasSSSE3 == " << hasSSSE3 << ", hasSSE4.1 == " << hasSSE41 << ", hasSSE4.2 == " << hasSSE42;
+    std::cout << ", hasAESNI == " << HasAESNI() << ", hasCLMUL == " << HasCLMUL() << ", hasRDRAND == " << HasRDRAND() << ", hasRDSEED == " << HasRDSEED();
     std::cout << ", hasSHA == " << HasSHA() << ", isP4 == " << isP4 << ", cacheLineSize == " << cacheLineSize << std::endl;
 
 #elif (CRYPTOPP_BOOL_ARM32 || CRYPTOPP_BOOL_ARM64)
     bool hasNEON = HasNEON();
-    bool hasPMULL = HasPMULL();
     bool hasCRC32 = HasCRC32();
+    bool hasPMULL = HasPMULL();
     bool hasAES = HasAES();
     bool hasSHA1 = HasSHA1();
     bool hasSHA2 = HasSHA2();
 
     std::cout << "passed: ";
-    std::cout << "hasNEON == " << hasNEON << ", hasPMULL == " << hasPMULL << ", hasCRC32 == " << hasCRC32 << ", hasAES == " << hasAES << ", hasSHA1 == " << hasSHA1 << ", hasSHA2 == " << hasSHA2 << std::endl;
+    std::cout << "hasNEON == " << hasNEON << ", hasCRC32 == " << hasCRC32 << ", hasPMULL == " << hasPMULL;
+    std::cout << ", hasAES == " << hasAES << ", hasSHA1 == " << hasSHA1 << ", hasSHA2 == " << hasSHA2 << std::endl;
 #endif
 
     if (!pass)
diff --git a/whrlpool.cpp b/whrlpool.cpp
index 1278ad8a8..a947b813c 100644
--- a/whrlpool.cpp
+++ b/whrlpool.cpp
@@ -409,7 +409,7 @@ static const word64 Whirlpool_C[4*256+R] = {
 void Whirlpool::Transform(word64 *digest, const word64 *block)
 {
 #if CRYPTOPP_BOOL_SSE2_ASM_AVAILABLE
-    if (HasISSE())
+    if (HasSSE2())
     {
         // MMX version has the same structure as C version below
     #ifdef __GNUC__
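
Editor's note on the split pattern: each base file (shacal2.cpp, sha.cpp, and so on) stays portable and dispatches at runtime via the cpu.h probes (HasSHA(), HasSSE2()), while the matching *-simd.cpp file is the only translation unit that needs extra instruction-set flags at compile time. A minimal GNUmakefile-style sketch of such a per-object rule follows; it is not part of this patch, the target and recipe are illustrative, and SHA_FLAG is assumed to hold whatever option the compiler probe selected (for example "-msse4.2 -msha" or "-march=armv8-a+crypto").

# Sketch only: compile the split SIMD source with its own flag so that
# just this object file requires SHA instruction support from the compiler.
shacal2-simd.o : shacal2-simd.cpp
	$(CXX) $(strip $(CXXFLAGS) $(SHA_FLAG) -c) $<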