diff --git a/.github/workflows/benchmark.yml b/.github/workflows/benchmark.yml index 33f389a..e1f1dff 100644 --- a/.github/workflows/benchmark.yml +++ b/.github/workflows/benchmark.yml @@ -4,7 +4,7 @@ on: workflow_run: workflows: [tests] types: [completed] - + jobs: run-benchmark: runs-on: ubuntu-latest @@ -34,11 +34,26 @@ jobs: - name: Stash Benchmark Results run: git add micro-benchmark.md + - name: Commit Changes + run: git commit -m "..." + + - name: Remove Other Changes + run: git checkout . + + - name: Clean Artifacts + run: git clean -f -d + + - name: Checkout Benchmark Branch + run: git checkout benchmarks + + - name: Checkout Benchmark Markdown Results + run: git checkout main micro-benchmark.md + - name: Save Changes run: git commit -m "run::$(date)" - name: Push Changes To Benchmark Branch - run: git push + run: git push --force run-benchmark-failed: runs-on: ubuntu-latest diff --git a/.github/workflows/tests.yml b/.github/workflows/tests.yml index c1a0f21..a39d1f6 100644 --- a/.github/workflows/tests.yml +++ b/.github/workflows/tests.yml @@ -6,33 +6,40 @@ on: pull_request: branches: [ main ] +concurrency: + group: ${{ github.workflow }} @ ${{ github.head_ref || github.run_id }} + cancel-in-progress: true + jobs: cmake-build: - name: ${{ matrix.platform.name }} + name: ${{ matrix.platform.name }} ${{ matrix.aes.name }} runs-on: ${{ matrix.platform.os }} strategy: fail-fast: false matrix: aes: - - { name: Pure C++ Implementation, flag: portable } - - { name: Intel x86_64 AES NI, flag: aesni } - - { name: ARMv8 aarch64 NEON, flag: neon } + - { name: C++ Implementation, flag: portable} + - { name: Intel x86_64 AES NI, flag: aesni} + # - { name: ARMv8 aarch64 NEON, flag: neon} platform: - # - { name: Linux Clang, os: ubuntu-latest, outpath: './', flags: -DCMAKE_C_COMPILER=clang -DCMAKE_CXX_COMPILER=clang++ } - - { name: Windows VS2019, os: windows-2019, outpath: 'Release\', fext: .exe, } - - { name: Windows VS2022, os: windows-2022, outpath: 'Release\', 
fext: .exe, } - - { name: MacOS XCode, os: macos-latest, outpath: './' } - # - { name: Linux GCC, os: ubuntu-latest, outpath: './' } + # - { name: Linux Clang, os: ubuntu-latest, outpath: './', flags: -DCMAKE_C_COMPILER=clang -DCMAKE_CXX_COMPILER=clang++} + - { name: Windows VS2019, os: windows-2019, outpath: 'Release\', fext: .exe, } + - { name: Windows VS2022, os: windows-2022, outpath: 'Release\', fext: .exe, } + # - { name: Clang VS2022, os: windows-2022, outpath: 'Release\', fext: .exe, flags: -DCMAKE_C_COMPILER=clang -DCMAKE_CXX_COMPILER=clang++} + - { name: MacOS XCode, os: macos-latest, outpath: './' } + # - { name: Linux GCC, os: ubuntu-latest, outpath: './' } steps: - uses: actions/checkout@v3 - - name: System Info - run: cmake cmake -S . -B . -D AES_IMPL=${{matrix.aes.flag}} + - run: cmake --version + + - name: Configure + run: cmake -S . -B . -D AES_IMPL=${{matrix.aes.flag}} - name: Build - run: cmake --build tests --config Release + run: cmake --build . --config Release - name: Run Tests run: "${{matrix.platform.outpath}}tests${{matrix.platform.fext}}" diff --git a/AES.hpp b/AES.hpp index 13cd5c0..b1b7ddb 100644 --- a/AES.hpp +++ b/AES.hpp @@ -3,11 +3,47 @@ #include -#if defined(__x86_64) || defined(__x86_64__) || defined(__amd64) || defined(__amd64__) +#if defined(USE_ARM_NEON_AES) + #ifndef HARDWARE_ACCELERATION_ARM_NEON_AES + #define HARDWARE_ACCELERATION_ARM_NEON_AES + #endif + #undef HARDWARE_ACCELERATION_INTEL_AESNI + #undef PORTABLE_CPP_CODE + +#elif defined(USE_INTEL_AESNI) + #ifndef HARDWARE_ACCELERATION_INTEL_AESNI + #define HARDWARE_ACCELERATION_INTEL_AESNI + #endif + #undef HARDWARE_ACCELERATION_ARM_NEON_AES + #undef PORTABLE_CPP_CODE + +#elif defined(USE_CXX_AES) + #ifndef PORTABLE_CPP_CODE + #define PORTABLE_CPP_CODE + #endif + #undef HARDWARE_ACCELERATION_INTEL_AESNI + #undef HARDWARE_ACCELERATION_ARM_NEON_AES + +#else + #warning AES Implementation Not Specified + #warning Auto Detecting AES Implementation +#endif + +#if (defined(_WIN32) 
|| defined(_WIN64) || defined(__x86_64) || defined(__x86_64__) || defined(__amd64) || defined(__amd64__)) && \ + !defined(USE_CXX_AES) && !defined(USE_ARM_NEON_AES) + #ifdef _MSC_VER + #include + #endif + + #include #include #include - #define HARDWARE_ACCELERATION_INTEL_AESNI -#elif defined(__arm64__) || defined(__aarch64__) || defined(_M_ARM) + + #ifndef HARDWARE_ACCELERATION_INTEL_AESNI + #define HARDWARE_ACCELERATION_INTEL_AESNI + #endif +#elif (defined(__arm64__) || defined(__aarch64__) || defined(_M_ARM)) && !defined(USE_CXX_AES) && \ + !defined(USE_INTEL_AESNI) #if defined(__GNUC__) #include #endif @@ -22,20 +58,14 @@ #include #endif #endif - #define HARDWARE_ACCELERATION_ARM_NEON_AES -#else - #define PORTABLE_CPP_CODE -#endif -#if defined(_USE_ARM_NEON_AES) - #undef HARDWARE_ACCELERATION_INTEL_AESNI - #undef PORTABLE_CPP_CODE -#elif defined(_USE_INTEL_AESNI) - #undef HARDWARE_ACCELERATION_ARM_NEON_AES - #undef PORTABLE_CPP_CODE + #ifndef HARDWARE_ACCELERATION_ARM_NEON_AES + #define HARDWARE_ACCELERATION_ARM_NEON_AES + #endif #else - #undef HARDWARE_ACCELERATION_INTEL_AESNI - #undef HARDWARE_ACCELERATION_ARM_NEON_AES + #ifndef PORTABLE_CPP_CODE + #define PORTABLE_CPP_CODE + #endif #endif #include @@ -54,178 +84,178 @@ namespace Cipher { unsigned char round_keys[round_keys_size]; #ifdef HARDWARE_ACCELERATION_INTEL_AESNI - inline __m128i AES_128_ASSIST(__m128i temp1, __m128i temp2) { - __m128i temp3; - temp2 = _mm_shuffle_epi32(temp2, 0xff); - temp3 = _mm_slli_si128(temp1, 0x4); - temp1 = _mm_xor_si128(temp1, temp3); - temp3 = _mm_slli_si128(temp3, 0x4); - temp1 = _mm_xor_si128(temp1, temp3); - temp3 = _mm_slli_si128(temp3, 0x4); - temp1 = _mm_xor_si128(temp1, temp3); - temp1 = _mm_xor_si128(temp1, temp2); - return temp1; + inline __m128i AES_128_ASSIST(__m128i tmp1, __m128i tmp2) { + __m128i tmp3; + tmp2 = _mm_shuffle_epi32(tmp2, 0xff); + tmp3 = _mm_slli_si128(tmp1, 0x4); + tmp1 = _mm_xor_si128(tmp1, tmp3); + tmp3 = _mm_slli_si128(tmp3, 0x4); + tmp1 = 
_mm_xor_si128(tmp1, tmp3); + tmp3 = _mm_slli_si128(tmp3, 0x4); + tmp1 = _mm_xor_si128(tmp1, tmp3); + tmp1 = _mm_xor_si128(tmp1, tmp2); + return tmp1; } - void AES_128_Key_Expansion(const unsigned char *userkey, unsigned char *key) { - __m128i temp1, temp2; - __m128i *Key_Schedule = (__m128i *) key; - - temp1 = _mm_loadu_si128((__m128i *) userkey); - Key_Schedule[0] = temp1; - temp2 = _mm_aeskeygenassist_si128(temp1, 0x1); - temp1 = AES_128_ASSIST(temp1, temp2); - Key_Schedule[1] = temp1; - temp2 = _mm_aeskeygenassist_si128(temp1, 0x2); - temp1 = AES_128_ASSIST(temp1, temp2); - Key_Schedule[2] = temp1; - temp2 = _mm_aeskeygenassist_si128(temp1, 0x4); - temp1 = AES_128_ASSIST(temp1, temp2); - Key_Schedule[3] = temp1; - temp2 = _mm_aeskeygenassist_si128(temp1, 0x8); - temp1 = AES_128_ASSIST(temp1, temp2); - Key_Schedule[4] = temp1; - temp2 = _mm_aeskeygenassist_si128(temp1, 0x10); - temp1 = AES_128_ASSIST(temp1, temp2); - Key_Schedule[5] = temp1; - temp2 = _mm_aeskeygenassist_si128(temp1, 0x20); - temp1 = AES_128_ASSIST(temp1, temp2); - Key_Schedule[6] = temp1; - temp2 = _mm_aeskeygenassist_si128(temp1, 0x40); - temp1 = AES_128_ASSIST(temp1, temp2); - Key_Schedule[7] = temp1; - temp2 = _mm_aeskeygenassist_si128(temp1, 0x80); - temp1 = AES_128_ASSIST(temp1, temp2); - Key_Schedule[8] = temp1; - temp2 = _mm_aeskeygenassist_si128(temp1, 0x1b); - temp1 = AES_128_ASSIST(temp1, temp2); - Key_Schedule[9] = temp1; - temp2 = _mm_aeskeygenassist_si128(temp1, 0x36); - temp1 = AES_128_ASSIST(temp1, temp2); - Key_Schedule[10] = temp1; + void AES_128_Key_Expansion(const unsigned char *user_key, unsigned char *key) { + __m128i tmp1, tmp2; + __m128i *key_sched = (__m128i *) key; + + tmp1 = _mm_loadu_si128((__m128i *) user_key); + key_sched[0] = tmp1; + tmp2 = _mm_aeskeygenassist_si128(tmp1, 0x1); + tmp1 = AES_128_ASSIST(tmp1, tmp2); + key_sched[1] = tmp1; + tmp2 = _mm_aeskeygenassist_si128(tmp1, 0x2); + tmp1 = AES_128_ASSIST(tmp1, tmp2); + key_sched[2] = tmp1; + tmp2 = 
_mm_aeskeygenassist_si128(tmp1, 0x4); + tmp1 = AES_128_ASSIST(tmp1, tmp2); + key_sched[3] = tmp1; + tmp2 = _mm_aeskeygenassist_si128(tmp1, 0x8); + tmp1 = AES_128_ASSIST(tmp1, tmp2); + key_sched[4] = tmp1; + tmp2 = _mm_aeskeygenassist_si128(tmp1, 0x10); + tmp1 = AES_128_ASSIST(tmp1, tmp2); + key_sched[5] = tmp1; + tmp2 = _mm_aeskeygenassist_si128(tmp1, 0x20); + tmp1 = AES_128_ASSIST(tmp1, tmp2); + key_sched[6] = tmp1; + tmp2 = _mm_aeskeygenassist_si128(tmp1, 0x40); + tmp1 = AES_128_ASSIST(tmp1, tmp2); + key_sched[7] = tmp1; + tmp2 = _mm_aeskeygenassist_si128(tmp1, 0x80); + tmp1 = AES_128_ASSIST(tmp1, tmp2); + key_sched[8] = tmp1; + tmp2 = _mm_aeskeygenassist_si128(tmp1, 0x1b); + tmp1 = AES_128_ASSIST(tmp1, tmp2); + key_sched[9] = tmp1; + tmp2 = _mm_aeskeygenassist_si128(tmp1, 0x36); + tmp1 = AES_128_ASSIST(tmp1, tmp2); + key_sched[10] = tmp1; } - inline void KEY_192_ASSIST(__m128i *temp1, __m128i *temp2, __m128i *temp3) { - __m128i temp4; - *temp2 = _mm_shuffle_epi32(*temp2, 0x55); - temp4 = _mm_slli_si128(*temp1, 0x4); - *temp1 = _mm_xor_si128(*temp1, temp4); - temp4 = _mm_slli_si128(temp4, 0x4); - *temp1 = _mm_xor_si128(*temp1, temp4); - temp4 = _mm_slli_si128(temp4, 0x4); - *temp1 = _mm_xor_si128(*temp1, temp4); - *temp1 = _mm_xor_si128(*temp1, *temp2); - *temp2 = _mm_shuffle_epi32(*temp1, 0xff); - temp4 = _mm_slli_si128(*temp3, 0x4); - *temp3 = _mm_xor_si128(*temp3, temp4); - *temp3 = _mm_xor_si128(*temp3, *temp2); + inline void KEY_192_ASSIST(__m128i *tmp1, __m128i *tmp2, __m128i *tmp3) { + __m128i tmp4; + *tmp2 = _mm_shuffle_epi32(*tmp2, 0x55); + tmp4 = _mm_slli_si128(*tmp1, 0x4); + *tmp1 = _mm_xor_si128(*tmp1, tmp4); + tmp4 = _mm_slli_si128(tmp4, 0x4); + *tmp1 = _mm_xor_si128(*tmp1, tmp4); + tmp4 = _mm_slli_si128(tmp4, 0x4); + *tmp1 = _mm_xor_si128(*tmp1, tmp4); + *tmp1 = _mm_xor_si128(*tmp1, *tmp2); + *tmp2 = _mm_shuffle_epi32(*tmp1, 0xff); + tmp4 = _mm_slli_si128(*tmp3, 0x4); + *tmp3 = _mm_xor_si128(*tmp3, tmp4); + *tmp3 = _mm_xor_si128(*tmp3, *tmp2); } - 
void AES_192_Key_Expansion(const unsigned char *userkey, unsigned char *key) { - __m128i temp1, temp2, temp3; - __m128i *Key_Schedule = (__m128i *) key; - temp1 = _mm_loadu_si128((__m128i *) userkey); - temp3 = _mm_loadu_si128((__m128i *) (userkey + 16)); - Key_Schedule[0] = temp1; - Key_Schedule[1] = temp3; - temp2 = _mm_aeskeygenassist_si128(temp3, 0x1); - KEY_192_ASSIST(&temp1, &temp2, &temp3); - Key_Schedule[1] = (__m128i) _mm_shuffle_pd((__m128d) Key_Schedule[1], (__m128d) temp1, 0); - Key_Schedule[2] = (__m128i) _mm_shuffle_pd((__m128d) temp1, (__m128d) temp3, 1); - temp2 = _mm_aeskeygenassist_si128(temp3, 0x2); - KEY_192_ASSIST(&temp1, &temp2, &temp3); - Key_Schedule[3] = temp1; - Key_Schedule[4] = temp3; - temp2 = _mm_aeskeygenassist_si128(temp3, 0x4); - KEY_192_ASSIST(&temp1, &temp2, &temp3); - Key_Schedule[4] = (__m128i) _mm_shuffle_pd((__m128d) Key_Schedule[4], (__m128d) temp1, 0); - Key_Schedule[5] = (__m128i) _mm_shuffle_pd((__m128d) temp1, (__m128d) temp3, 1); - temp2 = _mm_aeskeygenassist_si128(temp3, 0x8); - KEY_192_ASSIST(&temp1, &temp2, &temp3); - Key_Schedule[6] = temp1; - Key_Schedule[7] = temp3; - temp2 = _mm_aeskeygenassist_si128(temp3, 0x10); - KEY_192_ASSIST(&temp1, &temp2, &temp3); - Key_Schedule[7] = (__m128i) _mm_shuffle_pd((__m128d) Key_Schedule[7], (__m128d) temp1, 0); - Key_Schedule[8] = (__m128i) _mm_shuffle_pd((__m128d) temp1, (__m128d) temp3, 1); - temp2 = _mm_aeskeygenassist_si128(temp3, 0x20); - KEY_192_ASSIST(&temp1, &temp2, &temp3); - Key_Schedule[9] = temp1; - Key_Schedule[10] = temp3; - temp2 = _mm_aeskeygenassist_si128(temp3, 0x40); - KEY_192_ASSIST(&temp1, &temp2, &temp3); - Key_Schedule[10] = (__m128i) _mm_shuffle_pd((__m128d) Key_Schedule[10], (__m128d) temp1, 0); - Key_Schedule[11] = (__m128i) _mm_shuffle_pd((__m128d) temp1, (__m128d) temp3, 1); - temp2 = _mm_aeskeygenassist_si128(temp3, 0x80); - KEY_192_ASSIST(&temp1, &temp2, &temp3); - Key_Schedule[12] = temp1; + void AES_192_Key_Expansion(const unsigned char *user_key, 
unsigned char *key) { + __m128i tmp1, tmp2, tmp3; + __m128i *key_sched = (__m128i *) key; + tmp1 = _mm_loadu_si128((__m128i *) user_key); + tmp3 = _mm_loadu_si128((__m128i *) (user_key + 16)); + key_sched[0] = tmp1; + key_sched[1] = tmp3; + tmp2 = _mm_aeskeygenassist_si128(tmp3, 0x1); + KEY_192_ASSIST(&tmp1, &tmp2, &tmp3); + key_sched[1] = _mm_castpd_si128(_mm_shuffle_pd(_mm_castsi128_pd(key_sched[1]), _mm_castsi128_pd(tmp1), 0)); + key_sched[2] = _mm_castpd_si128(_mm_shuffle_pd(_mm_castsi128_pd(tmp1), _mm_castsi128_pd(tmp3), 1)); + tmp2 = _mm_aeskeygenassist_si128(tmp3, 0x2); + KEY_192_ASSIST(&tmp1, &tmp2, &tmp3); + key_sched[3] = tmp1; + key_sched[4] = tmp3; + tmp2 = _mm_aeskeygenassist_si128(tmp3, 0x4); + KEY_192_ASSIST(&tmp1, &tmp2, &tmp3); + key_sched[4] = _mm_castpd_si128(_mm_shuffle_pd(_mm_castsi128_pd(key_sched[4]), _mm_castsi128_pd(tmp1), 0)); + key_sched[5] = _mm_castpd_si128(_mm_shuffle_pd(_mm_castsi128_pd(tmp1), _mm_castsi128_pd(tmp3), 1)); + tmp2 = _mm_aeskeygenassist_si128(tmp3, 0x8); + KEY_192_ASSIST(&tmp1, &tmp2, &tmp3); + key_sched[6] = tmp1; + key_sched[7] = tmp3; + tmp2 = _mm_aeskeygenassist_si128(tmp3, 0x10); + KEY_192_ASSIST(&tmp1, &tmp2, &tmp3); + key_sched[7] = _mm_castpd_si128(_mm_shuffle_pd(_mm_castsi128_pd(key_sched[7]), _mm_castsi128_pd(tmp1), 0)); + key_sched[8] = _mm_castpd_si128(_mm_shuffle_pd(_mm_castsi128_pd(tmp1), _mm_castsi128_pd(tmp3), 1)); + tmp2 = _mm_aeskeygenassist_si128(tmp3, 0x20); + KEY_192_ASSIST(&tmp1, &tmp2, &tmp3); + key_sched[9] = tmp1; + key_sched[10] = tmp3; + tmp2 = _mm_aeskeygenassist_si128(tmp3, 0x40); + KEY_192_ASSIST(&tmp1, &tmp2, &tmp3); + key_sched[10] = _mm_castpd_si128(_mm_shuffle_pd(_mm_castsi128_pd(key_sched[10]), _mm_castsi128_pd(tmp1), 0)); + key_sched[11] = _mm_castpd_si128(_mm_shuffle_pd(_mm_castsi128_pd(tmp1), _mm_castsi128_pd(tmp3), 1)); + tmp2 = _mm_aeskeygenassist_si128(tmp3, 0x80); + KEY_192_ASSIST(&tmp1, &tmp2, &tmp3); + key_sched[12] = tmp1; } - inline void KEY_256_ASSIST_1(__m128i *temp1, 
__m128i *temp2) { - __m128i temp4; - *temp2 = _mm_shuffle_epi32(*temp2, 0xff); - temp4 = _mm_slli_si128(*temp1, 0x4); - *temp1 = _mm_xor_si128(*temp1, temp4); - temp4 = _mm_slli_si128(temp4, 0x4); - *temp1 = _mm_xor_si128(*temp1, temp4); - temp4 = _mm_slli_si128(temp4, 0x4); - *temp1 = _mm_xor_si128(*temp1, temp4); - *temp1 = _mm_xor_si128(*temp1, *temp2); + inline void KEY_256_ASSIST_1(__m128i *tmp1, __m128i *tmp2) { + __m128i tmp4; + *tmp2 = _mm_shuffle_epi32(*tmp2, 0xff); + tmp4 = _mm_slli_si128(*tmp1, 0x4); + *tmp1 = _mm_xor_si128(*tmp1, tmp4); + tmp4 = _mm_slli_si128(tmp4, 0x4); + *tmp1 = _mm_xor_si128(*tmp1, tmp4); + tmp4 = _mm_slli_si128(tmp4, 0x4); + *tmp1 = _mm_xor_si128(*tmp1, tmp4); + *tmp1 = _mm_xor_si128(*tmp1, *tmp2); } - inline void KEY_256_ASSIST_2(__m128i *temp1, __m128i *temp3) { - __m128i temp2, temp4; - temp4 = _mm_aeskeygenassist_si128(*temp1, 0x0); - temp2 = _mm_shuffle_epi32(temp4, 0xaa); - temp4 = _mm_slli_si128(*temp3, 0x4); - *temp3 = _mm_xor_si128(*temp3, temp4); - temp4 = _mm_slli_si128(temp4, 0x4); - *temp3 = _mm_xor_si128(*temp3, temp4); - temp4 = _mm_slli_si128(temp4, 0x4); - *temp3 = _mm_xor_si128(*temp3, temp4); - *temp3 = _mm_xor_si128(*temp3, temp2); + inline void KEY_256_ASSIST_2(__m128i *tmp1, __m128i *tmp3) { + __m128i tmp2, tmp4; + tmp4 = _mm_aeskeygenassist_si128(*tmp1, 0x0); + tmp2 = _mm_shuffle_epi32(tmp4, 0xaa); + tmp4 = _mm_slli_si128(*tmp3, 0x4); + *tmp3 = _mm_xor_si128(*tmp3, tmp4); + tmp4 = _mm_slli_si128(tmp4, 0x4); + *tmp3 = _mm_xor_si128(*tmp3, tmp4); + tmp4 = _mm_slli_si128(tmp4, 0x4); + *tmp3 = _mm_xor_si128(*tmp3, tmp4); + *tmp3 = _mm_xor_si128(*tmp3, tmp2); } - void AES_256_Key_Expansion(const unsigned char *userkey, unsigned char *key) { - __m128i temp1, temp2, temp3; - __m128i *Key_Schedule = (__m128i *) key; - temp1 = _mm_loadu_si128((__m128i *) userkey); - temp3 = _mm_loadu_si128((__m128i *) (userkey + 16)); - Key_Schedule[0] = temp1; - Key_Schedule[1] = temp3; - temp2 = _mm_aeskeygenassist_si128(temp3, 
0x01); - KEY_256_ASSIST_1(&temp1, &temp2); - Key_Schedule[2] = temp1; - KEY_256_ASSIST_2(&temp1, &temp3); - Key_Schedule[3] = temp3; - temp2 = _mm_aeskeygenassist_si128(temp3, 0x02); - KEY_256_ASSIST_1(&temp1, &temp2); - Key_Schedule[4] = temp1; - KEY_256_ASSIST_2(&temp1, &temp3); - Key_Schedule[5] = temp3; - temp2 = _mm_aeskeygenassist_si128(temp3, 0x04); - KEY_256_ASSIST_1(&temp1, &temp2); - Key_Schedule[6] = temp1; - KEY_256_ASSIST_2(&temp1, &temp3); - Key_Schedule[7] = temp3; - temp2 = _mm_aeskeygenassist_si128(temp3, 0x08); - KEY_256_ASSIST_1(&temp1, &temp2); - Key_Schedule[8] = temp1; - KEY_256_ASSIST_2(&temp1, &temp3); - Key_Schedule[9] = temp3; - temp2 = _mm_aeskeygenassist_si128(temp3, 0x10); - KEY_256_ASSIST_1(&temp1, &temp2); - Key_Schedule[10] = temp1; - KEY_256_ASSIST_2(&temp1, &temp3); - Key_Schedule[11] = temp3; - temp2 = _mm_aeskeygenassist_si128(temp3, 0x20); - KEY_256_ASSIST_1(&temp1, &temp2); - Key_Schedule[12] = temp1; - KEY_256_ASSIST_2(&temp1, &temp3); - Key_Schedule[13] = temp3; - temp2 = _mm_aeskeygenassist_si128(temp3, 0x40); - KEY_256_ASSIST_1(&temp1, &temp2); - Key_Schedule[14] = temp1; + void AES_256_Key_Expansion(const unsigned char *user_key, unsigned char *key) { + __m128i tmp1, tmp2, tmp3; + __m128i *key_sched = (__m128i *) key; + tmp1 = _mm_loadu_si128((__m128i *) user_key); + tmp3 = _mm_loadu_si128((__m128i *) (user_key + 16)); + key_sched[0] = tmp1; + key_sched[1] = tmp3; + tmp2 = _mm_aeskeygenassist_si128(tmp3, 0x01); + KEY_256_ASSIST_1(&tmp1, &tmp2); + key_sched[2] = tmp1; + KEY_256_ASSIST_2(&tmp1, &tmp3); + key_sched[3] = tmp3; + tmp2 = _mm_aeskeygenassist_si128(tmp3, 0x02); + KEY_256_ASSIST_1(&tmp1, &tmp2); + key_sched[4] = tmp1; + KEY_256_ASSIST_2(&tmp1, &tmp3); + key_sched[5] = tmp3; + tmp2 = _mm_aeskeygenassist_si128(tmp3, 0x04); + KEY_256_ASSIST_1(&tmp1, &tmp2); + key_sched[6] = tmp1; + KEY_256_ASSIST_2(&tmp1, &tmp3); + key_sched[7] = tmp3; + tmp2 = _mm_aeskeygenassist_si128(tmp3, 0x08); + KEY_256_ASSIST_1(&tmp1, &tmp2); + 
key_sched[8] = tmp1; + KEY_256_ASSIST_2(&tmp1, &tmp3); + key_sched[9] = tmp3; + tmp2 = _mm_aeskeygenassist_si128(tmp3, 0x10); + KEY_256_ASSIST_1(&tmp1, &tmp2); + key_sched[10] = tmp1; + KEY_256_ASSIST_2(&tmp1, &tmp3); + key_sched[11] = tmp3; + tmp2 = _mm_aeskeygenassist_si128(tmp3, 0x20); + KEY_256_ASSIST_1(&tmp1, &tmp2); + key_sched[12] = tmp1; + KEY_256_ASSIST_2(&tmp1, &tmp3); + key_sched[13] = tmp3; + tmp2 = _mm_aeskeygenassist_si128(tmp3, 0x40); + KEY_256_ASSIST_1(&tmp1, &tmp2); + key_sched[14] = tmp1; } // #elif defined(HARDWARE_ACCELERATION_ARM_NEON_AES) // space for arm neon variables in case needed in the future. diff --git a/CMakeLists.txt b/CMakeLists.txt index 94c4be7..3b6eab6 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -1,19 +1,25 @@ -cmake_minimum_required(VERSION 3.16) +cmake_minimum_required(VERSION 3.20) project(tests VERSION 1.0.0) set(CMAKE_CXX_STANDARD 17) set(CMAKE_CXX_STANDARD_REQUIRED True) -set(AES_IMPL "portable" CACHE STRING "Choose an AES Implementation") -set_property(CACHE AES_IMPL PROPERTY STRINGS portable aesni neon) - -if(${AES_IMPL} STREQUAL "aesni") - add_compile_options(-D_USE_INTEL_AESNI -maes) -elseif(${AES_IMPL} STREQUAL "neon") - add_compile_options(-D_USE_ARM_NEON_AES -march=armv8-a+crypto) -elseif(NOT ${AES_IMPL} STREQUAL "portable") - message(FATAL_ERROR "Invalid AES implementation option.") -endif() +set(AES_IMPL "aesni" CACHE STRING "Choose an AES Implementation") +set_property(CACHE AES_IMPL PROPERTY STRINGS auto portable aesni neon) add_executable(tests tests.cpp) + +if("${AES_IMPL}" STREQUAL "aesni") + target_compile_definitions(tests PUBLIC USE_INTEL_AESNI) + if(MSVC) + target_compile_options(tests PRIVATE /arch:SSE2) + else() + target_compile_options(tests PRIVATE -maes) + endif() +elseif("${AES_IMPL}" STREQUAL "neon") + target_compile_definitions(tests PUBLIC USE_ARM_NEON_AES) + target_compile_options(tests PRIVATE -march=armv8-a+crypto) +elseif("${AES_IMPL}" STREQUAL "portable") + 
target_compile_definitions(tests PUBLIC USE_CXX_AES) +endif() \ No newline at end of file diff --git a/README.md b/README.md index 15a9186..6f8dd5a 100644 --- a/README.md +++ b/README.md @@ -8,54 +8,41 @@ This repository contains a **single header file C++ library** that provides AES ----------- -## **Requirements** +## Requirement -- Requires C++17 so you need to compile it with the compilation flag `-std=c++17`. +Requires C++17 so you need to compile it with the compilation flag `-std=c++17`. -## **Performance Compilation D-Flags:** +## Enable Portable C++ AES Implementation -+ **Portable**: +Define `USE_CXX_AES` before including the main header file (`AES.hpp`) to compile the portable C++ implementation; if no `USE_*` macro is defined, the header auto-detects an implementation from the target architecture. Make sure to compile with the optimization flag `-O3`. - By simply including the main header file (`AES.hpp`), the code will be compiled using portable C++. Make sure to compile with the optimization flag `-O3`. - - _Please note that the portable code is slower than the two alternatives mentioned below_. - -+ **AES-NI:** +_Please note that the portable code is slower than the two alternatives mentioned below_. - To achieve a significant speed-up performance, add the following flag when compiling for `x86-64` architecture. - - _e.g. mid-range PCs & Laptops_. +**Additional Compiler Flag: `-D USE_CXX_AES`** - ```-D_USE_INTEL_AESNI -maes -O3``` +**CMake: `AES_IMPL=portable`** -+ **ARM neon:** +## Enable AES-NI Hardware Acceleration - To gain a speed-up performance, add the following flag when compiling for `aarch64 armv8` architecture. +To achieve a significant speed-up, add the following flags when compiling for **`x86-64`** architecture. - _e.g. modern android devices_. +_e.g. mid-range PCs & Laptops_. 
- ```-D_USE_ARM_NEON_AES -march=armv8-a+crypto -O3``` +**Additional Compiler Flag: `-D USE_INTEL_AESNI -maes`** -## **Sample program:** +**CMake: `AES_IMPL=aesni`** -- **compile with [pure c/c++ code]** - - ``` - g++ -o sample.exe sample.cpp -O3 - ``` +## Enable ARM neon Hardware Acceleration -- **comple with [AES-NI]** +To gain a speed-up performance, add the following flag when compiling for **`aarch64`**, **`armv8`** architecture. + + _e.g. modern android devices_. - ``` - g++ -o sample.exe sample.cpp -D_USE_INTEL_AESNI -maes -O3 - ``` +**Additional Compiler Flag: `-D USE_ARM_NEON_AES -march=armv8-a+crypto`** -- **comple with [Arm-NEON-AES]** - - ``` - g++ -o sample.exe sample.cpp -D_USE_ARM_NEON_AES -march=armv8-a+crypto -O3 - ``` +**CMake: `AES_IMPL=neon`** +# Sample Program ```c++ /* sample.cpp */ @@ -85,4 +72,79 @@ int main() ```shell 0x69, 0xc4, 0xe0, 0xd8, 0x6a, 0x7b, 0x04, 0x30, 0xd8, 0xcd, 0xb7, 0x80, 0x70, 0xb4, 0xc5, 0x5a, -``` \ No newline at end of file +``` + +# Compiling with CMake + +To build with cmake while choosing what AES implementation to use, you can add the following cmake code below into your **CMakeLists.txt** file. + +**cmake:** + +```cmake +cmake_minimum_required(VERSION 3.16) + +project(YourProjectName VERSION 1.0.0) + +# ... + +# add this block before `add_executable` +set(CMAKE_CXX_STANDARD 17) +set(CMAKE_CXX_STANDARD_REQUIRED True) + +set(AES_IMPL "aesni" CACHE STRING "Choose an AES Implementation") +set_property(CACHE AES_IMPL PROPERTY STRINGS auto portable aesni neon) +# add this block before `add_executable` + +# ... + +add_executable(main Source.cpp) + +# ... 
+ +# add this block after `add_executable(...)` +if("${AES_IMPL}" STREQUAL "aesni") + target_compile_definitions(main PUBLIC USE_INTEL_AESNI) + if(MSVC) + target_compile_options(main PRIVATE /arch:SSE2) + else() + target_compile_options(main PRIVATE -maes) + endif() +elseif("${AES_IMPL}" STREQUAL "neon") + target_compile_definitions(main PUBLIC USE_ARM_NEON_AES) + target_compile_options(main PRIVATE -march=armv8-a+crypto) +elseif("${AES_IMPL}" STREQUAL "portable") + target_compile_definitions(main PUBLIC USE_CXX_AES) +endif() +# add this block after `add_executable(...)` +``` + +Then run **cmake-gui** and choose which AES implementation you want to enable in the drop-down list of `AES_IMPL`. + +Or use the following terminal commands (bash/cmd). + +```bash +cmake -S . -B . -D AES_IMPL=<impl> +cmake --build . --config Release +``` + +The value of `<impl>` can be `aesni`, `neon` or `portable`. + +# Compiling in the Command Line + +1. **compile with [pure c/c++ code]** + + ``` + g++ -o sample.exe sample.cpp -D USE_CXX_AES -O3 + ``` + +2. **compile with [AES-NI]** + + ``` + g++ -o sample.exe sample.cpp -D USE_INTEL_AESNI -maes -O3 + ``` + +3. 
**compile with [Arm-NEON-AES]** + + ``` + g++ -o sample.exe sample.cpp -D USE_ARM_NEON_AES -march=armv8-a+crypto -O3 + ``` \ No newline at end of file diff --git a/makefile b/makefile index 05af8f3..cb3eab3 100644 --- a/makefile +++ b/makefile @@ -28,13 +28,13 @@ VERSION:= ifeq ($(VERSION), portable) COMPILATION_MSG="compiling portable version" -DFLAGS= +DFLAGS:=-D USE_CXX_AES else ifeq ($(VERSION), aesni) COMPILATION_MSG="compiling AES-NI version" -DFLAGS=-D_USE_INTEL_AESNI -maes +DFLAGS:=-D USE_INTEL_AESNI -maes else ifeq ($(VERSION), neon) COMPILATION_MSG="compiling AES aarch64 neon version" -DFLAGS=-D_USE_ARM_NEON_AES -march=armv8-a+crypto +DFLAGS:=-D USE_ARM_NEON_AES -march=armv8-a+crypto endif ########################## type ########################## @@ -74,7 +74,7 @@ style: microbenchmark: $(CXX) $(CXX_STANDARD) $(LINKER) microbench.cpp -o microbench1.out -O3 - $(CXX) $(CXX_STANDARD) $(LINKER) microbench.cpp -o microbench2.out -O3 -D_USE_INTEL_AESNI -maes + $(CXX) $(CXX_STANDARD) $(LINKER) microbench.cpp -o microbench2.out -O3 -D USE_INTEL_AESNI -maes @echo "Running micro-benchmarks" @echo "" @echo "# **micro-benchmark**" > micro-benchmark.md diff --git a/microbench.cpp b/microbench.cpp index 45d3c85..fb7f0a7 100644 --- a/microbench.cpp +++ b/microbench.cpp @@ -1,63 +1,60 @@ +#include #include #include -#include +#include #include #include -#include #include "AES.hpp" int main() { - constexpr size_t MB = 16 * 1024 * 1024; - constexpr size_t KEY_BIT_SIZE = 256; - static_assert(MB % 16 == 0, "Divisible by AES block size"); - - std::mt19937_64 engine(std::chrono::steady_clock::now().time_since_epoch().count()); - std::uniform_int_distribution rng( - std::numeric_limits::min(), - std::numeric_limits::max() - ); - - unsigned char *data = new unsigned char [MB]; - unsigned char *save = new unsigned char [MB]; - - for (size_t i = 0; i < MB; ++i) { - data[i] = rng(engine); - save[i] = data[i]; - } - - // benchmark start - - unsigned char key[32] = { - 0x00, 0x01, 
0x02, 0x03, 0x04, 0x05, 0x06, 0x07, - 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, 0x08, 0x09, - 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, - 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, - }; - - Cipher::Aes aes_cipher(key); - - auto enc_start = std::chrono::high_resolution_clock::now(); - for (size_t i = 0; i < MB; i += 16) { - aes_cipher.encrypt_block(&data[i]); - } - auto enc_end = std::chrono::high_resolution_clock::now(); - auto enc_dur = std::chrono::duration_cast(enc_end - enc_start); - - std::cout << "| Encryption | " << KEY_BIT_SIZE << " | " << enc_dur.count() << "ms | " << (MB / 1024) / 1024 << "|\n"; - - auto dec_start = std::chrono::high_resolution_clock::now(); - for (size_t i = 0; i < MB; i += 16) { - aes_cipher.decrypt_block(&data[i]); - } - auto dec_end = std::chrono::high_resolution_clock::now(); - auto dec_dur = std::chrono::duration_cast(dec_end - dec_start); - - std::cout << "| Decryption | " << KEY_BIT_SIZE << " | " << dec_dur.count() << "ms | " << (MB / 1024) / 1024 << "|\n"; - - int result = std::memcmp(data, save, MB); - // std::cout << "\n\nresult = " << result << "\n\n"; - delete [] data; - delete [] save; - return result; + constexpr size_t MB = 16 * 1024 * 1024; + constexpr size_t KEY_BIT_SIZE = 256; + static_assert(MB % 16 == 0, "Divisible by AES block size"); + + std::mt19937_64 engine(std::chrono::steady_clock::now().time_since_epoch().count()); + std::uniform_int_distribution rng( + std::numeric_limits::min(), std::numeric_limits::max() + ); + + unsigned char *data = new unsigned char[MB]; + unsigned char *save = new unsigned char[MB]; + + for (size_t i = 0; i < MB; ++i) { + data[i] = rng(engine); + save[i] = data[i]; + } + + // benchmark start + + unsigned char key[32] = { + 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, 0x08, 0x09, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + }; + + Cipher::Aes aes_cipher(key); + + auto enc_start = 
std::chrono::high_resolution_clock::now(); + for (size_t i = 0; i < MB; i += 16) { + aes_cipher.encrypt_block(&data[i]); + } + auto enc_end = std::chrono::high_resolution_clock::now(); + auto enc_dur = std::chrono::duration_cast(enc_end - enc_start); + + std::cout << "| Encryption | " << KEY_BIT_SIZE << " | " << enc_dur.count() << "ms | " << (MB / 1024) / 1024 << "|\n"; + + auto dec_start = std::chrono::high_resolution_clock::now(); + for (size_t i = 0; i < MB; i += 16) { + aes_cipher.decrypt_block(&data[i]); + } + auto dec_end = std::chrono::high_resolution_clock::now(); + auto dec_dur = std::chrono::duration_cast(dec_end - dec_start); + + std::cout << "| Decryption | " << KEY_BIT_SIZE << " | " << dec_dur.count() << "ms | " << (MB / 1024) / 1024 << "|\n"; + + int result = std::memcmp(data, save, MB); + // std::cout << "\n\nresult = " << result << "\n\n"; + delete[] data; + delete[] save; + return result; } \ No newline at end of file diff --git a/tests b/tests new file mode 100755 index 0000000..959820a Binary files /dev/null and b/tests differ