diff --git a/.github/workflows/benchmark.yml b/.github/workflows/benchmark.yml
index 33f389a..e1f1dff 100644
--- a/.github/workflows/benchmark.yml
+++ b/.github/workflows/benchmark.yml
@@ -4,7 +4,7 @@ on:
   workflow_run:
     workflows: [tests]
     types: [completed]
-
+  
 jobs:
   run-benchmark:
     runs-on: ubuntu-latest
@@ -34,11 +34,26 @@ jobs:
       - name: Stash Benchmark Results
         run: git add micro-benchmark.md
 
+      - name: Commit Changes
+        run: git commit -m "..."
+
+      - name: Remove Other Changes
+        run: git checkout .
+
+      - name: Clean Artifacts
+        run: git clean -f -d
+
+      - name: Checkout Benchmark Branch
+        run: git checkout benchmarks
+
+      - name: Checkout Benchmark Markdown Results
+        run: git checkout main micro-benchmark.md
+
       - name: Save Changes
         run: git commit -m "run::$(date)"
 
       - name: Push Changes To Benchmark Branch
-        run: git push
+        run: git push --force
 
   run-benchmark-failed:
     runs-on: ubuntu-latest
diff --git a/.github/workflows/tests.yml b/.github/workflows/tests.yml
index c1a0f21..a39d1f6 100644
--- a/.github/workflows/tests.yml
+++ b/.github/workflows/tests.yml
@@ -6,33 +6,40 @@ on:
   pull_request:
     branches: [ main ]
 
+concurrency:
+  group: ${{ github.workflow }} @ ${{ github.head_ref || github.run_id }}
+  cancel-in-progress: true
+
 jobs:
   cmake-build:
-    name: ${{ matrix.platform.name }}
+    name: ${{ matrix.platform.name }} ${{ matrix.aes.name }}
     runs-on: ${{ matrix.platform.os }}
 
     strategy:
       fail-fast: false
       matrix:
         aes:
-        - { name: Pure C++ Implementation, flag: portable }
-        - { name: Intel x86_64 AES NI,     flag: aesni    }
-        - { name: ARMv8 aarch64 NEON,      flag: neon     }
+        - { name: C++ Implementation,  flag: portable}
+        - { name: Intel x86_64 AES NI, flag: aesni}
+        # - { name: ARMv8 aarch64 NEON,  flag: neon}
         platform:
-        # - { name: Linux Clang,    os: ubuntu-latest,  outpath: './', flags: -DCMAKE_C_COMPILER=clang -DCMAKE_CXX_COMPILER=clang++ }
-        - { name: Windows VS2019, os: windows-2019, outpath: 'Release\', fext: .exe, }
-        - { name: Windows VS2022, os: windows-2022, outpath: 'Release\', fext: .exe, }
-        - { name: MacOS   XCode,    os: macos-latest, outpath: './' }
-        # - { name: Linux GCC,      os: ubuntu-latest,  outpath: './' }
+        # - { name: Linux Clang,    os: ubuntu-latest, outpath: './', flags: -DCMAKE_C_COMPILER=clang -DCMAKE_CXX_COMPILER=clang++}
+        - { name: Windows VS2019, os: windows-2019,  outpath: 'Release\', fext: .exe, }
+        - { name: Windows VS2022, os: windows-2022,  outpath: 'Release\', fext: .exe, }
+        # - { name: Clang VS2022,   os: windows-2022,  outpath: 'Release\', fext: .exe, flags: -DCMAKE_C_COMPILER=clang -DCMAKE_CXX_COMPILER=clang++}
+        - { name: MacOS   XCode,  os: macos-latest,  outpath: './' }
+        # - { name: Linux GCC,      os: ubuntu-latest, outpath: './' }
 
     steps:
     - uses: actions/checkout@v3
 
-    - name: System Info
-      run: cmake cmake -S . -B . -D AES_IMPL=${{matrix.aes.flag}}
+    - run: cmake --version
+
+    - name: Configure
+      run: cmake -S . -B . -D AES_IMPL=${{matrix.aes.flag}}
 
     - name: Build
-      run: cmake --build tests --config Release
+      run: cmake --build . --config Release
 
     - name: Run Tests
       run: "${{matrix.platform.outpath}}tests${{matrix.platform.fext}}"
diff --git a/AES.hpp b/AES.hpp
index 13cd5c0..b1b7ddb 100644
--- a/AES.hpp
+++ b/AES.hpp
@@ -3,11 +3,47 @@
 
 #include <iostream>
 
-#if defined(__x86_64) || defined(__x86_64__) || defined(__amd64) || defined(__amd64__)
+#if defined(USE_ARM_NEON_AES)
+  #ifndef HARDWARE_ACCELERATION_ARM_NEON_AES
+    #define HARDWARE_ACCELERATION_ARM_NEON_AES
+  #endif
+  #undef HARDWARE_ACCELERATION_INTEL_AESNI
+  #undef PORTABLE_CPP_CODE
+
+#elif defined(USE_INTEL_AESNI)
+  #ifndef HARDWARE_ACCELERATION_INTEL_AESNI
+    #define HARDWARE_ACCELERATION_INTEL_AESNI
+  #endif
+  #undef HARDWARE_ACCELERATION_ARM_NEON_AES
+  #undef PORTABLE_CPP_CODE
+
+#elif defined(USE_CXX_AES)
+  #ifndef PORTABLE_CPP_CODE
+    #define PORTABLE_CPP_CODE
+  #endif
+  #undef HARDWARE_ACCELERATION_INTEL_AESNI
+  #undef HARDWARE_ACCELERATION_ARM_NEON_AES
+
+#else
+  #warning AES Implementation Not Specified
+  #warning Auto Detecting AES Implementation
+#endif
+
+#if (defined(_WIN32) || defined(_WIN64) || defined(__x86_64) || defined(__x86_64__) || defined(__amd64) || defined(__amd64__)) && \
+  !defined(USE_CXX_AES) && !defined(USE_ARM_NEON_AES)
+  #ifdef _MSC_VER
+    #include <intrin.h>
+  #endif
+
+  #include <emmintrin.h>
   #include <immintrin.h>
   #include <xmmintrin.h>
-  #define HARDWARE_ACCELERATION_INTEL_AESNI
-#elif defined(__arm64__) || defined(__aarch64__) || defined(_M_ARM)
+
+  #ifndef HARDWARE_ACCELERATION_INTEL_AESNI
+    #define HARDWARE_ACCELERATION_INTEL_AESNI
+  #endif
+#elif (defined(__arm64__) || defined(__aarch64__) || defined(_M_ARM)) && !defined(USE_CXX_AES) &&                      \
+  !defined(USE_INTEL_AESNI)
   #if defined(__GNUC__)
     #include <stdint.h>
   #endif
@@ -22,20 +58,14 @@
       #include <arm_acle.h>
     #endif
   #endif
-  #define HARDWARE_ACCELERATION_ARM_NEON_AES
-#else
-  #define PORTABLE_CPP_CODE
-#endif
 
-#if defined(_USE_ARM_NEON_AES)
-  #undef HARDWARE_ACCELERATION_INTEL_AESNI
-  #undef PORTABLE_CPP_CODE
-#elif defined(_USE_INTEL_AESNI)
-  #undef HARDWARE_ACCELERATION_ARM_NEON_AES
-  #undef PORTABLE_CPP_CODE
+  #ifndef HARDWARE_ACCELERATION_ARM_NEON_AES
+    #define HARDWARE_ACCELERATION_ARM_NEON_AES
+  #endif
 #else
-  #undef HARDWARE_ACCELERATION_INTEL_AESNI
-  #undef HARDWARE_ACCELERATION_ARM_NEON_AES
+  #ifndef PORTABLE_CPP_CODE
+    #define PORTABLE_CPP_CODE
+  #endif
 #endif
 
 #include <cstring>
@@ -54,178 +84,178 @@ namespace Cipher {
     unsigned char round_keys[round_keys_size];
 
 #ifdef HARDWARE_ACCELERATION_INTEL_AESNI
-    inline __m128i AES_128_ASSIST(__m128i temp1, __m128i temp2) {
-      __m128i temp3;
-      temp2 = _mm_shuffle_epi32(temp2, 0xff);
-      temp3 = _mm_slli_si128(temp1, 0x4);
-      temp1 = _mm_xor_si128(temp1, temp3);
-      temp3 = _mm_slli_si128(temp3, 0x4);
-      temp1 = _mm_xor_si128(temp1, temp3);
-      temp3 = _mm_slli_si128(temp3, 0x4);
-      temp1 = _mm_xor_si128(temp1, temp3);
-      temp1 = _mm_xor_si128(temp1, temp2);
-      return temp1;
+    inline __m128i AES_128_ASSIST(__m128i tmp1, __m128i tmp2) {
+      __m128i tmp3;
+      tmp2 = _mm_shuffle_epi32(tmp2, 0xff);
+      tmp3 = _mm_slli_si128(tmp1, 0x4);
+      tmp1 = _mm_xor_si128(tmp1, tmp3);
+      tmp3 = _mm_slli_si128(tmp3, 0x4);
+      tmp1 = _mm_xor_si128(tmp1, tmp3);
+      tmp3 = _mm_slli_si128(tmp3, 0x4);
+      tmp1 = _mm_xor_si128(tmp1, tmp3);
+      tmp1 = _mm_xor_si128(tmp1, tmp2);
+      return tmp1;
     }
 
-    void AES_128_Key_Expansion(const unsigned char *userkey, unsigned char *key) {
-      __m128i temp1, temp2;
-      __m128i *Key_Schedule = (__m128i *) key;
-
-      temp1 = _mm_loadu_si128((__m128i *) userkey);
-      Key_Schedule[0] = temp1;
-      temp2 = _mm_aeskeygenassist_si128(temp1, 0x1);
-      temp1 = AES_128_ASSIST(temp1, temp2);
-      Key_Schedule[1] = temp1;
-      temp2 = _mm_aeskeygenassist_si128(temp1, 0x2);
-      temp1 = AES_128_ASSIST(temp1, temp2);
-      Key_Schedule[2] = temp1;
-      temp2 = _mm_aeskeygenassist_si128(temp1, 0x4);
-      temp1 = AES_128_ASSIST(temp1, temp2);
-      Key_Schedule[3] = temp1;
-      temp2 = _mm_aeskeygenassist_si128(temp1, 0x8);
-      temp1 = AES_128_ASSIST(temp1, temp2);
-      Key_Schedule[4] = temp1;
-      temp2 = _mm_aeskeygenassist_si128(temp1, 0x10);
-      temp1 = AES_128_ASSIST(temp1, temp2);
-      Key_Schedule[5] = temp1;
-      temp2 = _mm_aeskeygenassist_si128(temp1, 0x20);
-      temp1 = AES_128_ASSIST(temp1, temp2);
-      Key_Schedule[6] = temp1;
-      temp2 = _mm_aeskeygenassist_si128(temp1, 0x40);
-      temp1 = AES_128_ASSIST(temp1, temp2);
-      Key_Schedule[7] = temp1;
-      temp2 = _mm_aeskeygenassist_si128(temp1, 0x80);
-      temp1 = AES_128_ASSIST(temp1, temp2);
-      Key_Schedule[8] = temp1;
-      temp2 = _mm_aeskeygenassist_si128(temp1, 0x1b);
-      temp1 = AES_128_ASSIST(temp1, temp2);
-      Key_Schedule[9] = temp1;
-      temp2 = _mm_aeskeygenassist_si128(temp1, 0x36);
-      temp1 = AES_128_ASSIST(temp1, temp2);
-      Key_Schedule[10] = temp1;
+    void AES_128_Key_Expansion(const unsigned char *user_key, unsigned char *key) {
+      __m128i tmp1, tmp2;
+      __m128i *key_sched = (__m128i *) key;
+
+      tmp1 = _mm_loadu_si128((__m128i *) user_key);
+      key_sched[0] = tmp1;
+      tmp2 = _mm_aeskeygenassist_si128(tmp1, 0x1);
+      tmp1 = AES_128_ASSIST(tmp1, tmp2);
+      key_sched[1] = tmp1;
+      tmp2 = _mm_aeskeygenassist_si128(tmp1, 0x2);
+      tmp1 = AES_128_ASSIST(tmp1, tmp2);
+      key_sched[2] = tmp1;
+      tmp2 = _mm_aeskeygenassist_si128(tmp1, 0x4);
+      tmp1 = AES_128_ASSIST(tmp1, tmp2);
+      key_sched[3] = tmp1;
+      tmp2 = _mm_aeskeygenassist_si128(tmp1, 0x8);
+      tmp1 = AES_128_ASSIST(tmp1, tmp2);
+      key_sched[4] = tmp1;
+      tmp2 = _mm_aeskeygenassist_si128(tmp1, 0x10);
+      tmp1 = AES_128_ASSIST(tmp1, tmp2);
+      key_sched[5] = tmp1;
+      tmp2 = _mm_aeskeygenassist_si128(tmp1, 0x20);
+      tmp1 = AES_128_ASSIST(tmp1, tmp2);
+      key_sched[6] = tmp1;
+      tmp2 = _mm_aeskeygenassist_si128(tmp1, 0x40);
+      tmp1 = AES_128_ASSIST(tmp1, tmp2);
+      key_sched[7] = tmp1;
+      tmp2 = _mm_aeskeygenassist_si128(tmp1, 0x80);
+      tmp1 = AES_128_ASSIST(tmp1, tmp2);
+      key_sched[8] = tmp1;
+      tmp2 = _mm_aeskeygenassist_si128(tmp1, 0x1b);
+      tmp1 = AES_128_ASSIST(tmp1, tmp2);
+      key_sched[9] = tmp1;
+      tmp2 = _mm_aeskeygenassist_si128(tmp1, 0x36);
+      tmp1 = AES_128_ASSIST(tmp1, tmp2);
+      key_sched[10] = tmp1;
     }
 
-    inline void KEY_192_ASSIST(__m128i *temp1, __m128i *temp2, __m128i *temp3) {
-      __m128i temp4;
-      *temp2 = _mm_shuffle_epi32(*temp2, 0x55);
-      temp4 = _mm_slli_si128(*temp1, 0x4);
-      *temp1 = _mm_xor_si128(*temp1, temp4);
-      temp4 = _mm_slli_si128(temp4, 0x4);
-      *temp1 = _mm_xor_si128(*temp1, temp4);
-      temp4 = _mm_slli_si128(temp4, 0x4);
-      *temp1 = _mm_xor_si128(*temp1, temp4);
-      *temp1 = _mm_xor_si128(*temp1, *temp2);
-      *temp2 = _mm_shuffle_epi32(*temp1, 0xff);
-      temp4 = _mm_slli_si128(*temp3, 0x4);
-      *temp3 = _mm_xor_si128(*temp3, temp4);
-      *temp3 = _mm_xor_si128(*temp3, *temp2);
+    inline void KEY_192_ASSIST(__m128i *tmp1, __m128i *tmp2, __m128i *tmp3) {
+      __m128i tmp4;
+      *tmp2 = _mm_shuffle_epi32(*tmp2, 0x55);
+      tmp4 = _mm_slli_si128(*tmp1, 0x4);
+      *tmp1 = _mm_xor_si128(*tmp1, tmp4);
+      tmp4 = _mm_slli_si128(tmp4, 0x4);
+      *tmp1 = _mm_xor_si128(*tmp1, tmp4);
+      tmp4 = _mm_slli_si128(tmp4, 0x4);
+      *tmp1 = _mm_xor_si128(*tmp1, tmp4);
+      *tmp1 = _mm_xor_si128(*tmp1, *tmp2);
+      *tmp2 = _mm_shuffle_epi32(*tmp1, 0xff);
+      tmp4 = _mm_slli_si128(*tmp3, 0x4);
+      *tmp3 = _mm_xor_si128(*tmp3, tmp4);
+      *tmp3 = _mm_xor_si128(*tmp3, *tmp2);
     }
 
-    void AES_192_Key_Expansion(const unsigned char *userkey, unsigned char *key) {
-      __m128i temp1, temp2, temp3;
-      __m128i *Key_Schedule = (__m128i *) key;
-      temp1 = _mm_loadu_si128((__m128i *) userkey);
-      temp3 = _mm_loadu_si128((__m128i *) (userkey + 16));
-      Key_Schedule[0] = temp1;
-      Key_Schedule[1] = temp3;
-      temp2 = _mm_aeskeygenassist_si128(temp3, 0x1);
-      KEY_192_ASSIST(&temp1, &temp2, &temp3);
-      Key_Schedule[1] = (__m128i) _mm_shuffle_pd((__m128d) Key_Schedule[1], (__m128d) temp1, 0);
-      Key_Schedule[2] = (__m128i) _mm_shuffle_pd((__m128d) temp1, (__m128d) temp3, 1);
-      temp2 = _mm_aeskeygenassist_si128(temp3, 0x2);
-      KEY_192_ASSIST(&temp1, &temp2, &temp3);
-      Key_Schedule[3] = temp1;
-      Key_Schedule[4] = temp3;
-      temp2 = _mm_aeskeygenassist_si128(temp3, 0x4);
-      KEY_192_ASSIST(&temp1, &temp2, &temp3);
-      Key_Schedule[4] = (__m128i) _mm_shuffle_pd((__m128d) Key_Schedule[4], (__m128d) temp1, 0);
-      Key_Schedule[5] = (__m128i) _mm_shuffle_pd((__m128d) temp1, (__m128d) temp3, 1);
-      temp2 = _mm_aeskeygenassist_si128(temp3, 0x8);
-      KEY_192_ASSIST(&temp1, &temp2, &temp3);
-      Key_Schedule[6] = temp1;
-      Key_Schedule[7] = temp3;
-      temp2 = _mm_aeskeygenassist_si128(temp3, 0x10);
-      KEY_192_ASSIST(&temp1, &temp2, &temp3);
-      Key_Schedule[7] = (__m128i) _mm_shuffle_pd((__m128d) Key_Schedule[7], (__m128d) temp1, 0);
-      Key_Schedule[8] = (__m128i) _mm_shuffle_pd((__m128d) temp1, (__m128d) temp3, 1);
-      temp2 = _mm_aeskeygenassist_si128(temp3, 0x20);
-      KEY_192_ASSIST(&temp1, &temp2, &temp3);
-      Key_Schedule[9] = temp1;
-      Key_Schedule[10] = temp3;
-      temp2 = _mm_aeskeygenassist_si128(temp3, 0x40);
-      KEY_192_ASSIST(&temp1, &temp2, &temp3);
-      Key_Schedule[10] = (__m128i) _mm_shuffle_pd((__m128d) Key_Schedule[10], (__m128d) temp1, 0);
-      Key_Schedule[11] = (__m128i) _mm_shuffle_pd((__m128d) temp1, (__m128d) temp3, 1);
-      temp2 = _mm_aeskeygenassist_si128(temp3, 0x80);
-      KEY_192_ASSIST(&temp1, &temp2, &temp3);
-      Key_Schedule[12] = temp1;
+    void AES_192_Key_Expansion(const unsigned char *user_key, unsigned char *key) {
+      __m128i tmp1, tmp2, tmp3;
+      __m128i *key_sched = (__m128i *) key;
+      tmp1 = _mm_loadu_si128((__m128i *) user_key);
+      tmp3 = _mm_loadu_si128((__m128i *) (user_key + 16));
+      key_sched[0] = tmp1;
+      key_sched[1] = tmp3;
+      tmp2 = _mm_aeskeygenassist_si128(tmp3, 0x1);
+      KEY_192_ASSIST(&tmp1, &tmp2, &tmp3);
+      key_sched[1] = _mm_castpd_si128(_mm_shuffle_pd(_mm_castsi128_pd(key_sched[1]), _mm_castsi128_pd(tmp1), 0));
+      key_sched[2] = _mm_castpd_si128(_mm_shuffle_pd(_mm_castsi128_pd(tmp1), _mm_castsi128_pd(tmp3), 1));
+      tmp2 = _mm_aeskeygenassist_si128(tmp3, 0x2);
+      KEY_192_ASSIST(&tmp1, &tmp2, &tmp3);
+      key_sched[3] = tmp1;
+      key_sched[4] = tmp3;
+      tmp2 = _mm_aeskeygenassist_si128(tmp3, 0x4);
+      KEY_192_ASSIST(&tmp1, &tmp2, &tmp3);
+      key_sched[4] = _mm_castpd_si128(_mm_shuffle_pd(_mm_castsi128_pd(key_sched[4]), _mm_castsi128_pd(tmp1), 0));
+      key_sched[5] = _mm_castpd_si128(_mm_shuffle_pd(_mm_castsi128_pd(tmp1), _mm_castsi128_pd(tmp3), 1));
+      tmp2 = _mm_aeskeygenassist_si128(tmp3, 0x8);
+      KEY_192_ASSIST(&tmp1, &tmp2, &tmp3);
+      key_sched[6] = tmp1;
+      key_sched[7] = tmp3;
+      tmp2 = _mm_aeskeygenassist_si128(tmp3, 0x10);
+      KEY_192_ASSIST(&tmp1, &tmp2, &tmp3);
+      key_sched[7] = _mm_castpd_si128(_mm_shuffle_pd(_mm_castsi128_pd(key_sched[7]), _mm_castsi128_pd(tmp1), 0));
+      key_sched[8] = _mm_castpd_si128(_mm_shuffle_pd(_mm_castsi128_pd(tmp1), _mm_castsi128_pd(tmp3), 1));
+      tmp2 = _mm_aeskeygenassist_si128(tmp3, 0x20);
+      KEY_192_ASSIST(&tmp1, &tmp2, &tmp3);
+      key_sched[9] = tmp1;
+      key_sched[10] = tmp3;
+      tmp2 = _mm_aeskeygenassist_si128(tmp3, 0x40);
+      KEY_192_ASSIST(&tmp1, &tmp2, &tmp3);
+      key_sched[10] = _mm_castpd_si128(_mm_shuffle_pd(_mm_castsi128_pd(key_sched[10]), _mm_castsi128_pd(tmp1), 0));
+      key_sched[11] = _mm_castpd_si128(_mm_shuffle_pd(_mm_castsi128_pd(tmp1), _mm_castsi128_pd(tmp3), 1));
+      tmp2 = _mm_aeskeygenassist_si128(tmp3, 0x80);
+      KEY_192_ASSIST(&tmp1, &tmp2, &tmp3);
+      key_sched[12] = tmp1;
     }
 
-    inline void KEY_256_ASSIST_1(__m128i *temp1, __m128i *temp2) {
-      __m128i temp4;
-      *temp2 = _mm_shuffle_epi32(*temp2, 0xff);
-      temp4 = _mm_slli_si128(*temp1, 0x4);
-      *temp1 = _mm_xor_si128(*temp1, temp4);
-      temp4 = _mm_slli_si128(temp4, 0x4);
-      *temp1 = _mm_xor_si128(*temp1, temp4);
-      temp4 = _mm_slli_si128(temp4, 0x4);
-      *temp1 = _mm_xor_si128(*temp1, temp4);
-      *temp1 = _mm_xor_si128(*temp1, *temp2);
+    inline void KEY_256_ASSIST_1(__m128i *tmp1, __m128i *tmp2) {
+      __m128i tmp4;
+      *tmp2 = _mm_shuffle_epi32(*tmp2, 0xff);
+      tmp4 = _mm_slli_si128(*tmp1, 0x4);
+      *tmp1 = _mm_xor_si128(*tmp1, tmp4);
+      tmp4 = _mm_slli_si128(tmp4, 0x4);
+      *tmp1 = _mm_xor_si128(*tmp1, tmp4);
+      tmp4 = _mm_slli_si128(tmp4, 0x4);
+      *tmp1 = _mm_xor_si128(*tmp1, tmp4);
+      *tmp1 = _mm_xor_si128(*tmp1, *tmp2);
     }
 
-    inline void KEY_256_ASSIST_2(__m128i *temp1, __m128i *temp3) {
-      __m128i temp2, temp4;
-      temp4 = _mm_aeskeygenassist_si128(*temp1, 0x0);
-      temp2 = _mm_shuffle_epi32(temp4, 0xaa);
-      temp4 = _mm_slli_si128(*temp3, 0x4);
-      *temp3 = _mm_xor_si128(*temp3, temp4);
-      temp4 = _mm_slli_si128(temp4, 0x4);
-      *temp3 = _mm_xor_si128(*temp3, temp4);
-      temp4 = _mm_slli_si128(temp4, 0x4);
-      *temp3 = _mm_xor_si128(*temp3, temp4);
-      *temp3 = _mm_xor_si128(*temp3, temp2);
+    inline void KEY_256_ASSIST_2(__m128i *tmp1, __m128i *tmp3) {
+      __m128i tmp2, tmp4;
+      tmp4 = _mm_aeskeygenassist_si128(*tmp1, 0x0);
+      tmp2 = _mm_shuffle_epi32(tmp4, 0xaa);
+      tmp4 = _mm_slli_si128(*tmp3, 0x4);
+      *tmp3 = _mm_xor_si128(*tmp3, tmp4);
+      tmp4 = _mm_slli_si128(tmp4, 0x4);
+      *tmp3 = _mm_xor_si128(*tmp3, tmp4);
+      tmp4 = _mm_slli_si128(tmp4, 0x4);
+      *tmp3 = _mm_xor_si128(*tmp3, tmp4);
+      *tmp3 = _mm_xor_si128(*tmp3, tmp2);
     }
 
-    void AES_256_Key_Expansion(const unsigned char *userkey, unsigned char *key) {
-      __m128i temp1, temp2, temp3;
-      __m128i *Key_Schedule = (__m128i *) key;
-      temp1 = _mm_loadu_si128((__m128i *) userkey);
-      temp3 = _mm_loadu_si128((__m128i *) (userkey + 16));
-      Key_Schedule[0] = temp1;
-      Key_Schedule[1] = temp3;
-      temp2 = _mm_aeskeygenassist_si128(temp3, 0x01);
-      KEY_256_ASSIST_1(&temp1, &temp2);
-      Key_Schedule[2] = temp1;
-      KEY_256_ASSIST_2(&temp1, &temp3);
-      Key_Schedule[3] = temp3;
-      temp2 = _mm_aeskeygenassist_si128(temp3, 0x02);
-      KEY_256_ASSIST_1(&temp1, &temp2);
-      Key_Schedule[4] = temp1;
-      KEY_256_ASSIST_2(&temp1, &temp3);
-      Key_Schedule[5] = temp3;
-      temp2 = _mm_aeskeygenassist_si128(temp3, 0x04);
-      KEY_256_ASSIST_1(&temp1, &temp2);
-      Key_Schedule[6] = temp1;
-      KEY_256_ASSIST_2(&temp1, &temp3);
-      Key_Schedule[7] = temp3;
-      temp2 = _mm_aeskeygenassist_si128(temp3, 0x08);
-      KEY_256_ASSIST_1(&temp1, &temp2);
-      Key_Schedule[8] = temp1;
-      KEY_256_ASSIST_2(&temp1, &temp3);
-      Key_Schedule[9] = temp3;
-      temp2 = _mm_aeskeygenassist_si128(temp3, 0x10);
-      KEY_256_ASSIST_1(&temp1, &temp2);
-      Key_Schedule[10] = temp1;
-      KEY_256_ASSIST_2(&temp1, &temp3);
-      Key_Schedule[11] = temp3;
-      temp2 = _mm_aeskeygenassist_si128(temp3, 0x20);
-      KEY_256_ASSIST_1(&temp1, &temp2);
-      Key_Schedule[12] = temp1;
-      KEY_256_ASSIST_2(&temp1, &temp3);
-      Key_Schedule[13] = temp3;
-      temp2 = _mm_aeskeygenassist_si128(temp3, 0x40);
-      KEY_256_ASSIST_1(&temp1, &temp2);
-      Key_Schedule[14] = temp1;
+    void AES_256_Key_Expansion(const unsigned char *user_key, unsigned char *key) {
+      __m128i tmp1, tmp2, tmp3;
+      __m128i *key_sched = (__m128i *) key;
+      tmp1 = _mm_loadu_si128((__m128i *) user_key);
+      tmp3 = _mm_loadu_si128((__m128i *) (user_key + 16));
+      key_sched[0] = tmp1;
+      key_sched[1] = tmp3;
+      tmp2 = _mm_aeskeygenassist_si128(tmp3, 0x01);
+      KEY_256_ASSIST_1(&tmp1, &tmp2);
+      key_sched[2] = tmp1;
+      KEY_256_ASSIST_2(&tmp1, &tmp3);
+      key_sched[3] = tmp3;
+      tmp2 = _mm_aeskeygenassist_si128(tmp3, 0x02);
+      KEY_256_ASSIST_1(&tmp1, &tmp2);
+      key_sched[4] = tmp1;
+      KEY_256_ASSIST_2(&tmp1, &tmp3);
+      key_sched[5] = tmp3;
+      tmp2 = _mm_aeskeygenassist_si128(tmp3, 0x04);
+      KEY_256_ASSIST_1(&tmp1, &tmp2);
+      key_sched[6] = tmp1;
+      KEY_256_ASSIST_2(&tmp1, &tmp3);
+      key_sched[7] = tmp3;
+      tmp2 = _mm_aeskeygenassist_si128(tmp3, 0x08);
+      KEY_256_ASSIST_1(&tmp1, &tmp2);
+      key_sched[8] = tmp1;
+      KEY_256_ASSIST_2(&tmp1, &tmp3);
+      key_sched[9] = tmp3;
+      tmp2 = _mm_aeskeygenassist_si128(tmp3, 0x10);
+      KEY_256_ASSIST_1(&tmp1, &tmp2);
+      key_sched[10] = tmp1;
+      KEY_256_ASSIST_2(&tmp1, &tmp3);
+      key_sched[11] = tmp3;
+      tmp2 = _mm_aeskeygenassist_si128(tmp3, 0x20);
+      KEY_256_ASSIST_1(&tmp1, &tmp2);
+      key_sched[12] = tmp1;
+      KEY_256_ASSIST_2(&tmp1, &tmp3);
+      key_sched[13] = tmp3;
+      tmp2 = _mm_aeskeygenassist_si128(tmp3, 0x40);
+      KEY_256_ASSIST_1(&tmp1, &tmp2);
+      key_sched[14] = tmp1;
     }
 // #elif defined(HARDWARE_ACCELERATION_ARM_NEON_AES)
 // space for arm neon variables in case needed in the future.
diff --git a/CMakeLists.txt b/CMakeLists.txt
index 94c4be7..3b6eab6 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -1,19 +1,25 @@
-cmake_minimum_required(VERSION 3.16)
+cmake_minimum_required(VERSION 3.20)
 
 project(tests VERSION 1.0.0)
 
 set(CMAKE_CXX_STANDARD 17)
 set(CMAKE_CXX_STANDARD_REQUIRED True)
 
-set(AES_IMPL "portable" CACHE STRING "Choose an AES Implementation")
-set_property(CACHE AES_IMPL PROPERTY STRINGS portable aesni neon)
-
-if(${AES_IMPL} STREQUAL "aesni")
-    add_compile_options(-D_USE_INTEL_AESNI -maes)
-elseif(${AES_IMPL} STREQUAL "neon")
-    add_compile_options(-D_USE_ARM_NEON_AES -march=armv8-a+crypto)
-elseif(NOT ${AES_IMPL} STREQUAL "portable")
-    message(FATAL_ERROR "Invalid AES implementation option.")
-endif()
+set(AES_IMPL "aesni" CACHE STRING "Choose an AES Implementation")
+set_property(CACHE AES_IMPL PROPERTY STRINGS auto portable aesni neon)
 
 add_executable(tests tests.cpp)
+
+if("${AES_IMPL}" STREQUAL "aesni")
+    target_compile_definitions(tests PUBLIC USE_INTEL_AESNI)
+    if(MSVC)
+        target_compile_options(tests PRIVATE /arch:SSE2)
+    else()
+        target_compile_options(tests PRIVATE -maes)
+    endif()
+elseif("${AES_IMPL}" STREQUAL "neon")
+    target_compile_definitions(tests PUBLIC USE_ARM_NEON_AES)
+    target_compile_options(tests PRIVATE -march=armv8-a+crypto)
+elseif("${AES_IMPL}" STREQUAL "portable")
+    target_compile_definitions(tests PUBLIC USE_CXX_AES)
+endif()
\ No newline at end of file
diff --git a/README.md b/README.md
index 15a9186..6f8dd5a 100644
--- a/README.md
+++ b/README.md
@@ -8,54 +8,41 @@ This repository contains a **single header file C++ library** that provides AES
 
 -----------
 
-## **Requirements**
+## Requirement
 
-- Requires C++17 so you need to compile it with the compilation flag `-std=c++17`.
+Requires C++17 so you need to compile it with the compilation flag `-std=c++17`.
 
-## **Performance Compilation D-Flags:**
+## Enable Portable C++ AES Implementation
 
-+ **Portable**:
+By defining `USE_CXX_AES` when including the main header file (`AES.hpp`), the code will be compiled using portable C++; without any `USE_*` flag the header auto-detects an implementation. Make sure to compile with the optimization flag `-O3`.
 
-  By simply including the main header file (`AES.hpp`), the code will be compiled using portable C++. Make sure to compile with the optimization flag `-O3`.
-  
-  _Please note that the portable code is slower than the two alternatives mentioned below_.
-
-+ **AES-NI:**
+_Please note that the portable code is slower than the two alternatives mentioned below_.
 
-  To achieve a significant speed-up performance, add the following flag when compiling for `x86-64` architecture.
-  
-  _e.g. mid-range PCs & Laptops_.
+**Additional Compiler Flag: `-D USE_CXX_AES`**
 
-  ```-D_USE_INTEL_AESNI -maes -O3```
+**CMake: `AES_IMPL=portable`**
 
-+ **ARM neon:**
+## Enable AES-NI Hardware Acceleration
 
-  To gain a speed-up performance, add the following flag when compiling for `aarch64 armv8` architecture.
+To achieve a significant speed-up, add the following flags when compiling for the **`x86-64`** architecture.
   
-  _e.g. modern android devices_.
+_e.g. mid-range PCs & Laptops_.
 
-  ```-D_USE_ARM_NEON_AES -march=armv8-a+crypto -O3```
+**Additional Compiler Flag: `-D USE_INTEL_AESNI -maes`**
 
-## **Sample program:**
+**CMake: `AES_IMPL=aesni`**
 
-- **compile with [pure c/c++ code]**
-
-  ```
-  g++ -o sample.exe sample.cpp -O3
-  ```
+## Enable ARM neon Hardware Acceleration
 
-- **comple with [AES-NI]**
+To gain a speed-up, add the following flags when compiling for the **`aarch64`** / **`armv8`** architecture.
+  
+ _e.g. modern android devices_.
 
-  ```
-  g++ -o sample.exe sample.cpp -D_USE_INTEL_AESNI -maes -O3
-  ```
+**Additional Compiler Flag: `-D USE_ARM_NEON_AES -march=armv8-a+crypto`**
 
-- **comple with [Arm-NEON-AES]**
-
-  ```
-  g++ -o sample.exe sample.cpp -D_USE_ARM_NEON_AES -march=armv8-a+crypto -O3
-  ```
+**CMake: `AES_IMPL=neon`**
 
+# Sample Program
 
 ```c++
 /*    sample.cpp    */
@@ -85,4 +72,79 @@ int main()
 ```shell
 0x69, 0xc4, 0xe0, 0xd8, 0x6a, 0x7b, 0x04, 0x30,
 0xd8, 0xcd, 0xb7, 0x80, 0x70, 0xb4, 0xc5, 0x5a,
-```
\ No newline at end of file
+```
+
+# Compiling with CMake
+
+To build with CMake while choosing which AES implementation to use, add the following CMake code to your **CMakeLists.txt** file.
+
+**cmake:**
+
+```cmake
+cmake_minimum_required(VERSION 3.16)
+
+project(YourProjectName VERSION 1.0.0)
+
+# ...
+
+# add this block before `add_executable`
+set(CMAKE_CXX_STANDARD 17)
+set(CMAKE_CXX_STANDARD_REQUIRED True)
+
+set(AES_IMPL "aesni" CACHE STRING "Choose an AES Implementation")
+set_property(CACHE AES_IMPL PROPERTY STRINGS auto portable aesni neon)
+# add this block before `add_executable`
+
+# ...
+
+add_executable(main Source.cpp)
+
+# ...
+
+# add this block after `add_executable(...)`
+if("${AES_IMPL}" STREQUAL "aesni")
+    target_compile_definitions(main PUBLIC USE_INTEL_AESNI)
+    if(MSVC)
+        target_compile_options(main PRIVATE /arch:SSE2)
+    else()
+        target_compile_options(main PRIVATE -maes)
+    endif()
+elseif("${AES_IMPL}" STREQUAL "neon")
+    target_compile_definitions(main PUBLIC USE_ARM_NEON_AES)
+    target_compile_options(main PRIVATE -march=armv8-a+crypto)
+elseif("${AES_IMPL}" STREQUAL "portable")
+    target_compile_definitions(main PUBLIC USE_CXX_AES)
+endif()
+# add this block after `add_executable(...)`
+```
+
+Then run **cmake-gui** and choose which AES implementation you want to enable in the `AES_IMPL` selection box.
+
+Or use the following terminal commands (bash/cmd).
+
+```bash
+cmake -S . -B . -D AES_IMPL=<CHOSEN_AES>
+cmake --build . --config Release
+```
+
+The value of `<CHOSEN_AES>` can be `aesni`, `neon`, or `portable`.
+
+# Compiling in the Command Line
+
+1. **compile with [pure c/c++ code]**
+
+  ```
+  g++ -o sample.exe sample.cpp -D USE_CXX_AES -O3
+  ```
+
+2. **compile with [AES-NI]**
+
+  ```
+  g++ -o sample.exe sample.cpp -D USE_INTEL_AESNI -maes -O3
+  ```
+
+3. **compile with [Arm-NEON-AES]**
+
+  ```
+  g++ -o sample.exe sample.cpp -D USE_ARM_NEON_AES -march=armv8-a+crypto -O3
+  ```
\ No newline at end of file
diff --git a/makefile b/makefile
index 05af8f3..cb3eab3 100644
--- a/makefile
+++ b/makefile
@@ -28,13 +28,13 @@ VERSION:=
 
 ifeq ($(VERSION), portable)
 COMPILATION_MSG="compiling portable version"
-DFLAGS=
+DFLAGS:=-D USE_CXX_AES
 else ifeq ($(VERSION), aesni)
 COMPILATION_MSG="compiling AES-NI version"
-DFLAGS=-D_USE_INTEL_AESNI -maes
+DFLAGS:=-D USE_INTEL_AESNI -maes
 else ifeq ($(VERSION), neon)
 COMPILATION_MSG="compiling AES aarch64 neon version"
-DFLAGS=-D_USE_ARM_NEON_AES -march=armv8-a+crypto
+DFLAGS:=-D USE_ARM_NEON_AES -march=armv8-a+crypto
 endif
 
 ########################## type ##########################
@@ -74,7 +74,7 @@ style:
 
 microbenchmark:
 	$(CXX) $(CXX_STANDARD) $(LINKER) microbench.cpp -o microbench1.out -O3
-	$(CXX) $(CXX_STANDARD) $(LINKER) microbench.cpp -o microbench2.out -O3 -D_USE_INTEL_AESNI -maes
+	$(CXX) $(CXX_STANDARD) $(LINKER) microbench.cpp -o microbench2.out -O3 -D USE_INTEL_AESNI -maes
 	@echo "Running micro-benchmarks"
 	@echo ""
 	@echo "# **micro-benchmark**" > micro-benchmark.md
diff --git a/microbench.cpp b/microbench.cpp
index 45d3c85..fb7f0a7 100644
--- a/microbench.cpp
+++ b/microbench.cpp
@@ -1,63 +1,60 @@
+#include <chrono>
 #include <cstring>
 #include <iostream>
-#include <chrono>
+#include <limits.h>
 #include <limits>
 #include <random>
-#include <limits.h>
 
 #include "AES.hpp"
 
 int main() {
-    constexpr size_t MB = 16 * 1024 * 1024;
-    constexpr size_t KEY_BIT_SIZE = 256;
-    static_assert(MB % 16 == 0, "Divisible by AES block size");
-
-    std::mt19937_64 engine(std::chrono::steady_clock::now().time_since_epoch().count());
-    std::uniform_int_distribution<unsigned char> rng(
-        std::numeric_limits<unsigned char>::min(), 
-        std::numeric_limits<unsigned char>::max()
-    );
-
-    unsigned char *data = new unsigned char [MB];
-    unsigned char *save = new unsigned char [MB];
-
-    for (size_t i = 0; i < MB; ++i) {
-        data[i] = rng(engine);
-        save[i] = data[i];
-    }
-
-    // benchmark start
-
-    unsigned char key[32] = {
-        0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07,
-        0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, 0x08, 0x09,
-        0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
-        0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
-    };
-
-    Cipher::Aes<KEY_BIT_SIZE> aes_cipher(key);
-
-    auto enc_start = std::chrono::high_resolution_clock::now();
-    for (size_t i = 0; i < MB; i += 16) {
-        aes_cipher.encrypt_block(&data[i]);
-    }
-    auto enc_end = std::chrono::high_resolution_clock::now();
-    auto enc_dur = std::chrono::duration_cast<std::chrono::milliseconds>(enc_end - enc_start);
-
-    std::cout << "| Encryption | " << KEY_BIT_SIZE << " | " << enc_dur.count() << "ms | " << (MB / 1024) / 1024 << "|\n";
-    
-    auto dec_start = std::chrono::high_resolution_clock::now();
-    for (size_t i = 0; i < MB; i += 16) {
-        aes_cipher.decrypt_block(&data[i]);
-    }
-    auto dec_end = std::chrono::high_resolution_clock::now();
-    auto dec_dur = std::chrono::duration_cast<std::chrono::milliseconds>(dec_end - dec_start);
-
-    std::cout << "| Decryption | " << KEY_BIT_SIZE << " | " << dec_dur.count() << "ms | " << (MB / 1024) / 1024 << "|\n";
-
-    int result = std::memcmp(data, save, MB);
-    // std::cout << "\n\nresult = " << result << "\n\n";
-    delete [] data;
-    delete [] save;
-    return result;
+  constexpr size_t MB = 16 * 1024 * 1024;
+  constexpr size_t KEY_BIT_SIZE = 256;
+  static_assert(MB % 16 == 0, "Divisible by AES block size");
+
+  std::mt19937_64 engine(std::chrono::steady_clock::now().time_since_epoch().count());
+  std::uniform_int_distribution<unsigned char> rng(
+    std::numeric_limits<unsigned char>::min(), std::numeric_limits<unsigned char>::max()
+  );
+
+  unsigned char *data = new unsigned char[MB];
+  unsigned char *save = new unsigned char[MB];
+
+  for (size_t i = 0; i < MB; ++i) {
+    data[i] = rng(engine);
+    save[i] = data[i];
+  }
+
+  // benchmark start
+
+  unsigned char key[32] = {
+    0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, 0x08, 0x09,
+    0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
+  };
+
+  Cipher::Aes<KEY_BIT_SIZE> aes_cipher(key);
+
+  auto enc_start = std::chrono::high_resolution_clock::now();
+  for (size_t i = 0; i < MB; i += 16) {
+    aes_cipher.encrypt_block(&data[i]);
+  }
+  auto enc_end = std::chrono::high_resolution_clock::now();
+  auto enc_dur = std::chrono::duration_cast<std::chrono::milliseconds>(enc_end - enc_start);
+
+  std::cout << "| Encryption | " << KEY_BIT_SIZE << " | " << enc_dur.count() << "ms | " << (MB / 1024) / 1024 << "|\n";
+
+  auto dec_start = std::chrono::high_resolution_clock::now();
+  for (size_t i = 0; i < MB; i += 16) {
+    aes_cipher.decrypt_block(&data[i]);
+  }
+  auto dec_end = std::chrono::high_resolution_clock::now();
+  auto dec_dur = std::chrono::duration_cast<std::chrono::milliseconds>(dec_end - dec_start);
+
+  std::cout << "| Decryption | " << KEY_BIT_SIZE << " | " << dec_dur.count() << "ms | " << (MB / 1024) / 1024 << "|\n";
+
+  int result = std::memcmp(data, save, MB);
+  // std::cout << "\n\nresult = " << result << "\n\n";
+  delete[] data;
+  delete[] save;
+  return result;
 }
\ No newline at end of file
diff --git a/tests b/tests
new file mode 100755
index 0000000..959820a
Binary files /dev/null and b/tests differ