Skip to content

Commit

Permalink
NVIDIA: Adding cuPQC as a backend for ML-KEM. (#2044)
Browse files Browse the repository at this point in the history
* Adding cuPQC as a backend for ML-KEM.

Signed-off-by: Steven Reeves <[email protected]>

* Fixing transposition error that left out OQS_USE_CUPQC in CMake system.

Signed-off-by: Steven Reeves <[email protected]>

* Add CMake dependent options for cupqc. Fixed formatting in kem_ml_kem_####.c and kem/family/kem_scheme.c

Signed-off-by: Steven Reeves <[email protected]>

* Move cupqc_ml-kem source files to correctly named dir

Signed-off-by: Pravek Sharma <[email protected]>

* Stop piggybacking on pqcrystals-kyber-standard and move cupqc_ml-kem metadata to separate upstream repo

Signed-off-by: Pravek Sharma <[email protected]>

* Update licensing information

Signed-off-by: Pravek Sharma <[email protected]>

* Update PLATFORMS.md

Signed-off-by: Pravek Sharma <[email protected]>

* Fix kem_family cmakelists template

Signed-off-by: Pravek Sharma <[email protected]>

* Run copy_from_upstream.py and pull updated upstream

Signed-off-by: Pravek Sharma <[email protected]>

* Add cupqc build test to basic.yml

Signed-off-by: Pravek Sharma <[email protected]>

* Move cupqc build test from basic.yml to linux.yml

Signed-off-by: Pravek Sharma <[email protected]>

* Fix error in linux.yml

Signed-off-by: Pravek Sharma <[email protected]>

* fixup! Fix error in linux.yml

Signed-off-by: Pravek Sharma <[email protected]>

* Redo cupqc build check

Signed-off-by: Pravek Sharma <[email protected]>

* Supply default CUDA arch to cupqc-buildcheck configuration stage

Signed-off-by: Pravek Sharma <[email protected]>

* Specify CUDAXX in cupqc-buildcheck

Signed-off-by: Pravek Sharma <[email protected]>

* Make cuPQC_DIR explicit in cupqc-buildcheck

Signed-off-by: Pravek Sharma <[email protected]>

---------

Signed-off-by: Steven Reeves <[email protected]>
Signed-off-by: Pravek Sharma <[email protected]>
Co-authored-by: Pravek Sharma <[email protected]>
  • Loading branch information
stevenireeves and praveksharma authored Jan 27, 2025
1 parent 99affa6 commit 6a16ac6
Show file tree
Hide file tree
Showing 22 changed files with 805 additions and 61 deletions.
18 changes: 18 additions & 0 deletions .CMake/alg_support.cmake
Original file line number Diff line number Diff line change
Expand Up @@ -338,18 +338,36 @@ if(OQS_DIST_X86_64_BUILD OR (OQS_USE_AVX2_INSTRUCTIONS AND OQS_USE_BMI2_INSTRUCT
endif()
endif()

# ML-KEM-512, cuPQC (CUDA) backend: only offered on Linux/Darwin when
# OQS_USE_CUPQC is set. Defaults to ON while OQS_ENABLE_KEM_ml_kem_512 is ON
# and is forced OFF otherwise.
if(CMAKE_SYSTEM_NAME MATCHES "Linux|Darwin" AND OQS_USE_CUPQC)
    cmake_dependent_option(
        OQS_ENABLE_KEM_ml_kem_512_cuda
        ""
        ON
        "OQS_ENABLE_KEM_ml_kem_512"
        OFF
    )
endif()

# ML-KEM-768, AVX2 implementation: only offered on Linux/Darwin, and only for
# x86_64 distribution builds or when AVX2, BMI2 and POPCNT are all enabled.
# Defaults to ON while OQS_ENABLE_KEM_ml_kem_768 is ON; forced OFF otherwise.
if(CMAKE_SYSTEM_NAME MATCHES "Linux|Darwin"
   AND (OQS_DIST_X86_64_BUILD
        OR (OQS_USE_AVX2_INSTRUCTIONS AND OQS_USE_BMI2_INSTRUCTIONS AND OQS_USE_POPCNT_INSTRUCTIONS)))
    cmake_dependent_option(
        OQS_ENABLE_KEM_ml_kem_768_avx2
        ""
        ON
        "OQS_ENABLE_KEM_ml_kem_768"
        OFF
    )
endif()

# ML-KEM-768, cuPQC (CUDA) backend: only offered on Linux/Darwin when
# OQS_USE_CUPQC is set. Defaults to ON while OQS_ENABLE_KEM_ml_kem_768 is ON
# and is forced OFF otherwise.
if(CMAKE_SYSTEM_NAME MATCHES "Linux|Darwin" AND OQS_USE_CUPQC)
    cmake_dependent_option(
        OQS_ENABLE_KEM_ml_kem_768_cuda
        ""
        ON
        "OQS_ENABLE_KEM_ml_kem_768"
        OFF
    )
endif()

# ML-KEM-1024, AVX2 implementation: only offered on Linux/Darwin, and only for
# x86_64 distribution builds or when AVX2, BMI2 and POPCNT are all enabled.
# Defaults to ON while OQS_ENABLE_KEM_ml_kem_1024 is ON; forced OFF otherwise.
if(CMAKE_SYSTEM_NAME MATCHES "Linux|Darwin"
   AND (OQS_DIST_X86_64_BUILD
        OR (OQS_USE_AVX2_INSTRUCTIONS AND OQS_USE_BMI2_INSTRUCTIONS AND OQS_USE_POPCNT_INSTRUCTIONS)))
    cmake_dependent_option(
        OQS_ENABLE_KEM_ml_kem_1024_avx2
        ""
        ON
        "OQS_ENABLE_KEM_ml_kem_1024"
        OFF
    )
endif()

# ML-KEM-1024, cuPQC (CUDA) backend: only offered on Linux/Darwin when
# OQS_USE_CUPQC is set. Defaults to ON while OQS_ENABLE_KEM_ml_kem_1024 is ON
# and is forced OFF otherwise.
if(CMAKE_SYSTEM_NAME MATCHES "Linux|Darwin" AND OQS_USE_CUPQC)
    cmake_dependent_option(
        OQS_ENABLE_KEM_ml_kem_1024_cuda
        ""
        ON
        "OQS_ENABLE_KEM_ml_kem_1024"
        OFF
    )
endif()


if(CMAKE_SYSTEM_NAME MATCHES "Darwin|Linux")
if(OQS_DIST_X86_64_BUILD OR (OQS_USE_AVX2_INSTRUCTIONS AND OQS_USE_POPCNT_INSTRUCTIONS))
Expand Down
13 changes: 13 additions & 0 deletions .github/workflows/linux.yml
Original file line number Diff line number Diff line change
Expand Up @@ -188,6 +188,19 @@ jobs:
--numprocesses=auto \
--ignore=tests/test_code_conventions.py ${{ matrix.PYTEST_ARGS }}"
# Build-only job: configures and compiles liboqs with the cuPQC backend
# enabled. It has no test step; it only checks that the code builds.
cupqc-buildcheck:
name: Check that code builds with OQS_USE_CUPQC=ON
runs-on: ubuntu-latest
container: openquantumsafe/ci-ubuntu-latest:latest
steps:
- name: Checkout code
uses: actions/checkout@692973e3d937129bcbf40652eb9f2f61becf3332 # pin@v4
# cuPQC_DIR and CUDACXX are passed as environment variables pointing at the
# cuPQC CMake package directory and at nvcc; the CUDA architecture is given
# explicitly (80) on the command line. The trailing `cmake -LA -N ..` only
# lists the resulting cache variables in the job log.
- name: Configure
run: mkdir build && cd build && cuPQC_DIR=/cupqc/cupqc/cupqc-pkg-0.2.0/cmake/ CUDACXX=/usr/local/cuda-12.6/bin/nvcc cmake -GNinja -DOQS_USE_CUPQC=ON -DCMAKE_CUDA_ARCHITECTURES=80 .. && cmake -LA -N ..
# Runs ninja inside the build directory created by the Configure step.
- name: Build code
run: ninja
working-directory: build

linux_cross_compile:
runs-on: ubuntu-latest
container: openquantumsafe/ci-ubuntu-latest:latest
Expand Down
11 changes: 11 additions & 0 deletions CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,7 @@ option(OQS_LIBJADE_BUILD "Enable formally verified implementation of supported a
# User-facing boolean build switches; all default to OFF.
# OQS_PERMIT_UNSUPPORTED_ARCHITECTURE: lets configuration proceed on a processor
# that would otherwise hit the "Unknown or unsupported processor" fatal error.
option(OQS_PERMIT_UNSUPPORTED_ARCHITECTURE "Permit compilation on an unsupported architecture." OFF)
option(OQS_STRICT_WARNINGS "Enable all compiler warnings." OFF)
option(OQS_EMBEDDED_BUILD "Compile liboqs for an Embedded environment without a full standard library." OFF)
# OQS_USE_CUPQC: enables the CUDA language and requires the cuPQC package later
# in this file.
option(OQS_USE_CUPQC "Utilize cuPQC as the backend for supported PQC algorithms." OFF)

# Libfuzzer isn't supported on gcc
if('${CMAKE_C_COMPILER_ID}' STREQUAL 'Clang')
Expand Down Expand Up @@ -140,6 +141,16 @@ else()
message(FATAL_ERROR "Unknown or unsupported processor: " ${CMAKE_SYSTEM_PROCESSOR} ". Override by setting OQS_PERMIT_UNSUPPORTED_ARCHITECTURE=ON")
endif()

# cuPQC backend setup: enable the CUDA language and locate the cuPQC package.
# The option name is passed to if() directly (not as ${...}) so that if()
# evaluates the variable itself, matching the other OQS_USE_CUPQC checks.
if(OQS_USE_CUPQC)
    # CMake's CUDA language support used here requires CMake 3.18.
    cmake_minimum_required(VERSION 3.18)
    # Choose the default architectures BEFORE enabling CUDA. With the 3.18
    # policies requested above, enable_language(CUDA) initializes
    # CMAKE_CUDA_ARCHITECTURES itself when it is still undefined, so a default
    # applied afterwards would never take effect. A value supplied by the user
    # (-DCMAKE_CUDA_ARCHITECTURES=... or the CUDAARCHS environment variable)
    # is left untouched.
    if(NOT DEFINED CMAKE_CUDA_ARCHITECTURES AND NOT DEFINED ENV{CUDAARCHS})
        set(CMAKE_CUDA_ARCHITECTURES 80 90)
    endif()
    enable_language(CUDA)
    # REQUIRED: configuration stops with an error if cuPQC >= 0.2.0 is not found.
    find_package(cuPQC 0.2.0 REQUIRED)
endif()

# libjade implementations are restricted to x86_64 Linux/Darwin: abort the
# configuration when OQS_LIBJADE_BUILD is "ON" on any other platform.
if(
    NOT (
        (CMAKE_SYSTEM_NAME MATCHES "Linux|Darwin")
        AND (ARCH_X86_64 STREQUAL "ON")
    )
    AND (OQS_LIBJADE_BUILD STREQUAL "ON")
)
    message(FATAL_ERROR "Building liboqs with libjade implementations from libjade is only supported on Linux and Darwin on x86_64.")
endif()
Expand Down
8 changes: 8 additions & 0 deletions CONFIGURE.md
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,7 @@ The following options can be passed to CMake before the build file generation pr
- [OQS_DIST_BUILD](#OQS_DIST_BUILD)
- [OQS_USE_CPUFEATURE_INSTRUCTIONS](#OQS_USE_CPUFEATURE_INSTRUCTIONS)
- [OQS_USE_OPENSSL](#OQS_USE_OPENSSL)
- [OQS_USE_CUPQC](#OQS_USE_CUPQC)
- [OQS_OPT_TARGET](#OQS_OPT_TARGET)
- [OQS_SPEED_USE_ARM_PMU](#OQS_SPEED_USE_ARM_PMU)
- [USE_SANITIZER](#USE_SANITIZER)
Expand Down Expand Up @@ -124,6 +125,13 @@ Dynamically load OpenSSL through `dlopen`. When using liboqs from other cryptogr

Only has an effect if the system supports `dlopen` and ELF binary format, such as Linux or BSD family.

### OQS_USE_CUPQC

Can be `ON` or `OFF`. When `ON`, use NVIDIA's cuPQC library where possible (currently only for ML-KEM). When this option is enabled, liboqs may not run correctly on machines that lack a supported GPU. To download cuPQC, follow the instructions at https://developer.nvidia.com/cupqc-download/. Detailed descriptions of the API, the requirements, and an installation guide are available in the cuPQC documentation (https://docs.nvidia.com/cuda/cupqc/index.html). While the code shipped with liboqs that is required to use cuPQC is licensed under Apache 2.0, the cuPQC SDK comes with its own license agreement (https://docs.nvidia.com/cuda/cupqc/license.html).

**Default**: `OFF`


## Stateful Hash Based Signatures

XMSS and LMS are the two supported Hash-Based Signatures schemes.
Expand Down
1 change: 1 addition & 0 deletions PLATFORMS.md
Original file line number Diff line number Diff line change
Expand Up @@ -63,3 +63,4 @@ In this policy, the words "must" and "must not" specify absolute requirements th
- ppc64le for Ubuntu (Focal)
- s390x for Ubuntu (Focal)
- loongarch64 for Debian Linux (trixie)
- NVIDIA GPU architectures 70, 75, 80, 86, 89, and 90 with an x86_64 CPU for Linux
7 changes: 7 additions & 0 deletions docs/algorithms/kem/ml_kem.md
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,10 @@
- **Primary Source**<a name="primary-source"></a>:
- **Source**: https://github.com/pq-crystals/kyber/commit/10b478fc3cc4ff6215eb0b6a11bd758bf0929cbd with copy_from_upstream patches
- **Implementation license (SPDX-Identifier)**: CC0-1.0 or Apache-2.0
- **Optimized Implementation sources**: https://github.com/pq-crystals/kyber/commit/10b478fc3cc4ff6215eb0b6a11bd758bf0929cbd with copy_from_upstream patches
- **cupqc-cuda**:<a name="cupqc-cuda"></a>
- **Source**: https://github.com/praveksharma/cupqc-mlkem/commit/b026f4e5475cd9c20c2082c7d9bad80e5b0ba89e
- **Implementation license (SPDX-Identifier)**: Apache-2.0


## Parameter set summary
Expand All @@ -25,6 +29,7 @@
|:---------------------------------:|:-------------------------|:----------------------------|:--------------------------------|:------------------------|:-----------------------------------|:-----------------------------------------------|:----------------------|
| [Primary Source](#primary-source) | ref | All | All | None | True | True | False |
| [Primary Source](#primary-source) | avx2 | x86\_64 | Linux,Darwin | AVX2,BMI2,POPCNT | True | True | False |
| [cupqc-cuda](#cupqc-cuda) | cuda | CUDA | Linux,Darwin | None | False | False | False |

Are implementations chosen based on runtime CPU feature detection? **Yes**.

Expand All @@ -36,6 +41,7 @@ Are implementations chosen based on runtime CPU feature detection? **Yes**.
|:---------------------------------:|:-------------------------|:----------------------------|:--------------------------------|:------------------------|:-----------------------------------|:-----------------------------------------------|:---------------------|
| [Primary Source](#primary-source) | ref | All | All | None | True | True | False |
| [Primary Source](#primary-source) | avx2 | x86\_64 | Linux,Darwin | AVX2,BMI2,POPCNT | True | True | False |
| [cupqc-cuda](#cupqc-cuda) | cuda | CUDA | Linux,Darwin | None | False | False | False |

Are implementations chosen based on runtime CPU feature detection? **Yes**.

Expand All @@ -45,6 +51,7 @@ Are implementations chosen based on runtime CPU feature detection? **Yes**.
|:---------------------------------:|:-------------------------|:----------------------------|:--------------------------------|:------------------------|:-----------------------------------|:-----------------------------------------------|:---------------------|
| [Primary Source](#primary-source) | ref | All | All | None | True | True | False |
| [Primary Source](#primary-source) | avx2 | x86\_64 | Linux,Darwin | AVX2,BMI2,POPCNT | True | True | False |
| [cupqc-cuda](#cupqc-cuda) | cuda | CUDA | Linux,Darwin | None | False | False | False |

Are implementations chosen based on runtime CPU feature detection? **Yes**.

Expand Down
34 changes: 34 additions & 0 deletions docs/algorithms/kem/ml_kem.yml
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,10 @@ primary-upstream:
source: https://github.com/pq-crystals/kyber/commit/10b478fc3cc4ff6215eb0b6a11bd758bf0929cbd
with copy_from_upstream patches
spdx-license-identifier: CC0-1.0 or Apache-2.0
optimized-upstreams:
cupqc-cuda:
source: https://github.com/praveksharma/cupqc-mlkem/commit/b026f4e5475cd9c20c2082c7d9bad80e5b0ba89e
spdx-license-identifier: Apache-2.0
parameter-sets:
- name: ML-KEM-512
claimed-nist-level: 1
Expand Down Expand Up @@ -54,6 +58,16 @@ parameter-sets:
no-secret-dependent-branching-claimed: true
no-secret-dependent-branching-checked-by-valgrind: true
large-stack-usage: false
- upstream: cupqc-cuda
upstream-id: cuda
supported-platforms:
- architecture: CUDA
operating_systems:
- Linux
- Darwin
no-secret-dependent-branching-claimed: false
no-secret-dependent-branching-checked-by-valgrind: false
large-stack-usage: false
- name: ML-KEM-768
claimed-nist-level: 3
claimed-security: IND-CCA2
Expand Down Expand Up @@ -87,6 +101,16 @@ parameter-sets:
no-secret-dependent-branching-claimed: true
no-secret-dependent-branching-checked-by-valgrind: true
large-stack-usage: false
- upstream: cupqc-cuda
upstream-id: cuda
supported-platforms:
- architecture: CUDA
operating_systems:
- Linux
- Darwin
no-secret-dependent-branching-claimed: false
no-secret-dependent-branching-checked-by-valgrind: false
large-stack-usage: false
- name: ML-KEM-1024
claimed-nist-level: 5
claimed-security: IND-CCA2
Expand Down Expand Up @@ -120,3 +144,13 @@ parameter-sets:
no-secret-dependent-branching-claimed: true
no-secret-dependent-branching-checked-by-valgrind: true
large-stack-usage: false
- upstream: cupqc-cuda
upstream-id: cuda
supported-platforms:
- architecture: CUDA
operating_systems:
- Linux
- Darwin
no-secret-dependent-branching-claimed: false
no-secret-dependent-branching-checked-by-valgrind: false
large-stack-usage: false
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,18 @@ if(OQS_DIST_X86_64_BUILD OR ({% for flag in platform['required_flags'] -%} OQS_U
{%- endif %}
endif()
{% if platform['operating_systems'] %}endif()
{% endif -%}
{%- endfor -%}
{# For every supported platform whose architecture is 'CUDA', emit a cmake_dependent_option for this implementation (plus one for the alias scheme, if any), wrapped in if(OQS_USE_CUPQC) and, when the platform lists operating systems, in a CMAKE_SYSTEM_NAME check. #}
{%- for platform in impl['supported_platforms'] if platform['architecture'] == 'CUDA' %}
{% if platform['operating_systems'] %}if(CMAKE_SYSTEM_NAME MATCHES "{{ platform['operating_systems']|join('|') }}")
{% endif -%}
if(OQS_USE_CUPQC)
cmake_dependent_option(OQS_ENABLE_KEM_{{ family['name'] }}_{{ scheme['scheme'] }}_{{ impl['name'] }} "" ON "OQS_ENABLE_KEM_{{ family['name'] }}_{{ scheme['scheme'] }}" OFF)
{%- if 'alias_scheme' in scheme %}
cmake_dependent_option(OQS_ENABLE_KEM_{{ family['name'] }}_{{ scheme['alias_scheme'] }}_{{ impl['name'] }} "" ON "OQS_ENABLE_KEM_{{ family['name'] }}_{{ scheme['alias_scheme'] }}" OFF)
{%- endif %}
endif()
{% if platform['operating_systems'] %}endif()
{% endif -%}
{%- endfor -%}
{%- for platform in impl['supported_platforms'] if platform['architecture'] == 'ARM64_V8' %}
Expand Down
34 changes: 18 additions & 16 deletions scripts/copy_from_upstream/copy_from_upstream.py
Original file line number Diff line number Diff line change
Expand Up @@ -495,14 +495,15 @@ def handle_implementation(impl, family, scheme, dst_basedir):
else:
# determine list of files to copy:
if 'sources' in i:
srcs = i['sources'].split(" ")
for s in srcs:
# Copy recursively only in case of directories not with plain files to avoid copying over symbolic links
if os.path.isfile(os.path.join(origfolder, s)):
subprocess.run(['cp', os.path.join(origfolder, s), os.path.join(srcfolder, os.path.basename(s))])
else:
subprocess.run(
['cp', '-r', os.path.join(origfolder, s), os.path.join(srcfolder, os.path.basename(s))])
if i['sources']:
srcs = i['sources'].split(" ")
for s in srcs:
# Copy recursively only in case of directories not with plain files to avoid copying over symbolic links
if os.path.isfile(os.path.join(origfolder, s)):
subprocess.run(['cp', os.path.join(origfolder, s), os.path.join(srcfolder, os.path.basename(s))])
else:
subprocess.run(
['cp', '-r', os.path.join(origfolder, s), os.path.join(srcfolder, os.path.basename(s))])
else:
subprocess.run(['cp', '-pr', os.path.join(origfolder, '.'), srcfolder])
# raise Exception("Malformed YML file: No sources listed to copy. Check upstream YML file." )
Expand Down Expand Up @@ -598,14 +599,15 @@ def process_families(instructions, basedir, with_kat, with_generator, with_libja
# when provided to the compiler; OQS uses the term ARM_NEON
if req['architecture'] == 'arm_8':
req['architecture'] = 'ARM64_V8'
if req['architecture'] == 'ARM64_V8' and 'asimd' in req['required_flags']:
req['required_flags'].remove('asimd')
req['required_flags'].append('arm_neon')
if req['architecture'] == 'ARM64_V8' and 'sha3' in req['required_flags']:
req['required_flags'].remove('sha3')
req['required_flags'].append('arm_sha3')
impl['required_flags'] = req['required_flags']
family['all_required_flags'].update(req['required_flags'])
if 'required_flags' in req:
if req['architecture'] == 'ARM64_V8' and 'asimd' in req['required_flags']:
req['required_flags'].remove('asimd')
req['required_flags'].append('arm_neon')
if req['architecture'] == 'ARM64_V8' and 'sha3' in req['required_flags']:
req['required_flags'].remove('sha3')
req['required_flags'].append('arm_sha3')
impl['required_flags'] = req['required_flags']
family['all_required_flags'].update(req['required_flags'])
except KeyError as ke:
if (impl['name'] != family['default_implementation']):
print("No required flags found for %s (KeyError %s on impl %s)" % (
Expand Down
12 changes: 12 additions & 0 deletions scripts/copy_from_upstream/copy_from_upstream.yml
Original file line number Diff line number Diff line change
Expand Up @@ -38,6 +38,14 @@ upstreams:
kem_meta_path: '{pretty_name_full}_META.yml'
kem_scheme_path: '.'
patches: [pqcrystals-ml_kem.patch]
-
name: cupqc
git_url: https://github.com/praveksharma/cupqc-mlkem.git
git_branch: main
git_commit: b026f4e5475cd9c20c2082c7d9bad80e5b0ba89e
kem_meta_path: '{pretty_name_full}_META.yml'
kem_scheme_path: '.'
patches: []
-
name: pqcrystals-dilithium
git_url: https://github.com/pq-crystals/dilithium.git
Expand Down Expand Up @@ -166,6 +174,10 @@ kems:
-
name: ml_kem
default_implementation: ref
arch_specific_implementations:
cuda: cuda
arch_specific_upstream_locations:
cuda: cupqc
upstream_location: pqcrystals-kyber-standard
schemes:
-
Expand Down
9 changes: 9 additions & 0 deletions scripts/copy_from_upstream/src/kem/family/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -33,11 +33,19 @@ if(OQS_ENABLE_KEM_{{ family }}_{{ scheme['scheme_c'] }}{%- if 'alias_scheme' in
target_compile_options({{ family }}_{{ scheme['scheme'] }}_{{ impl['name'] }} PUBLIC {{ impl['compile_opts'] }})
{%- endif -%}

{%- elif impl['name'] == 'cuda' %}

if(OQS_ENABLE_KEM_{{ family }}_{{ scheme['scheme_c'] }}_{{ impl['name'] }}{%- if 'alias_scheme' in scheme %} OR OQS_ENABLE_KEM_{{ family }}_{{ scheme['alias_scheme'] }}_{{ impl['name'] }}{%- endif %})
add_library({{ family }}_{{ scheme['scheme'] }}_{{ impl['name'] }} OBJECT {{ impl['upstream']['name'] }}_{{ scheme['pqclean_scheme'] }}_{{ impl['name'] }}/cupqc_ml-kem.cu)
target_link_libraries({{ family }}_{{ scheme['scheme'] }}_{{ impl['name'] }} cupqc)
set_property(TARGET {{ family }}_{{ scheme['scheme'] }}_{{ impl['name'] }} PROPERTY CUDA_ARCHITECTURES OFF)
target_compile_options({{ family }}_{{ scheme['scheme'] }}_{{ impl['name'] }} PRIVATE {{ impl['compile_opts'] }})
{%- else %}

if(OQS_ENABLE_KEM_{{ family }}_{{ scheme['scheme_c'] }}_{{ impl['name'] }}{%- if 'alias_scheme' in scheme %} OR OQS_ENABLE_KEM_{{ family }}_{{ scheme['alias_scheme'] }}_{{ impl['name'] }}{%- endif %})
add_library({{ family }}_{{ scheme['scheme'] }}_{{ impl['name'] }} OBJECT {% for source_file in impl['sources']|sort -%}{{ impl['upstream']['name'] }}_{{ scheme['pqclean_scheme'] }}_{{ impl['name'] }}/{{ source_file }}{%- if not loop.last %} {% endif -%}{%- endfor -%})
{%- endif %}
{%- if impl['name'] != 'cuda' %}
target_include_directories({{ family }}_{{ scheme['scheme'] }}_{{ impl['name'] }} PRIVATE ${CMAKE_CURRENT_LIST_DIR}/{{ impl['upstream']['name'] }}_{{ scheme['pqclean_scheme'] }}_{{ impl['name'] }})
target_include_directories({{ family }}_{{ scheme['scheme'] }}_{{ impl['name'] }} PRIVATE ${PROJECT_SOURCE_DIR}/src/common/pqclean_shims)
{%- if impl['name'] != scheme['default_implementation'] and impl['required_flags'] -%}
Expand All @@ -60,6 +68,7 @@ if(OQS_ENABLE_KEM_{{ family }}_{{ scheme['scheme_c'] }}_{{ impl['name'] }}{%- if
target_compile_definitions({{ family }}_{{ scheme['scheme'] }}_{{ impl['name'] }} PRIVATE old_gas_syntax)
endif()
{%- endif %}
{%- endif %}{# cupqc #}
set(_{{ family|upper }}_OBJS ${_{{ family|upper }}_OBJS} $<TARGET_OBJECTS:{{ family }}_{{ scheme['scheme'] }}_{{ impl['name'] }}>)
endif()
{%- endfor -%}
Expand Down
Loading

0 comments on commit 6a16ac6

Please sign in to comment.