Add a BNB_USE_STD_THREADS build switch: quantize_cpu uses std::thread when the
option is ON (forced ON for WIN32) and POSIX threads otherwise.

Notes for whoever applies this patch
  * It was rebuilt from a copy in which every line break and every
    angle-bracket token had been stripped. The indentation of context lines
    and the two leading #include targets in csrc/cpu_ops.cpp (<BinSearch.h>,
    <common.h>) are reconstructions: check them against the tree, or apply
    with `git apply --ignore-whitespace`.
  * The `index` lines are carried over unchanged, so the post-image hashes of
    CMakeLists.txt and csrc/cpu_ops.cpp no longer describe the result.
  * Changes relative to the patch as received:
      - CMakeLists.txt: configure_file() resolves a relative output path
        against the build tree, so that directory is now added to the include
        path; otherwise out-of-source builds cannot find config.h.
      - cpu_ops.cpp: the result of pthread_create is checked. A slot whose
        thread could not be started is quantized on the calling thread and is
        skipped by the join loop (previously an uninitialised pthread_t would
        have been joined). The unused `err` variable is gone.
      - cpu_ops.cpp: the pthread path passes `&arg` to the worker, which
        presumes `arg` refers to an element of `args` (declared outside the
        visible hunks) - confirm before merging.

diff --git a/.gitignore b/.gitignore
index 22f5a6cd6..47e5285bd 100644
--- a/.gitignore
+++ b/.gitignore
@@ -22,6 +22,7 @@ CMakeFiles/
 bitsandbytes.dir/
 Debug/
 Release/
+csrc/config.h
 
 # IDE local files
 .vs/
diff --git a/CMakeLists.txt b/CMakeLists.txt
index 62ff4e535..97f799930 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -25,15 +25,22 @@ list(APPEND SRC_FILES ${CPP_FILES})
 
 set(COMPUTE_BACKEND "cpu" CACHE STRING "The compute backend to use (cpu, cuda, mps)")
 set_property(CACHE COMPUTE_BACKEND PROPERTY STRINGS cpu cuda mps)
+
+set(BNB_USE_STD_THREADS OFF CACHE BOOL "Use std::thread for parallelism")
+
 option(PTXAS_VERBOSE "Pass through -v flag to PTX Assembler" OFF)
 
 if(APPLE)
     set(CMAKE_OSX_DEPLOYMENT_TARGET 13.1)
 endif()
 
+if(WIN32)
+    set(BNB_USE_STD_THREADS ON)  # no pthread on Windows
+endif()
+
 set(BNB_OUTPUT_NAME "bitsandbytes")
 
-message(STATUS "Configuring ${PROJECT_NAME} (Backend: ${COMPUTE_BACKEND})")
+message(STATUS "Configuring ${PROJECT_NAME} (Backend: ${COMPUTE_BACKEND}, std::thread: ${BNB_USE_STD_THREADS})")
 
 if(${COMPUTE_BACKEND} STREQUAL "cuda")
     if(APPLE)
@@ -188,7 +195,9 @@ set_source_files_properties(${CPP_FILES} PROPERTIES LANGUAGE CXX)
 add_library(bitsandbytes SHARED ${SRC_FILES})
 target_compile_features(bitsandbytes PUBLIC cxx_std_14)
 target_include_directories(bitsandbytes PUBLIC csrc include)
-
+# A relative configure_file() output lands under the build tree, so that directory must be on the include path too.
+configure_file(csrc/config.h.in "csrc/config.h")
+target_include_directories(bitsandbytes PRIVATE "${CMAKE_CURRENT_BINARY_DIR}/csrc")
 
 if(BUILD_CUDA)
     target_include_directories(bitsandbytes PUBLIC ${CMAKE_CUDA_TOOLKIT_INCLUDE_DIRECTORIES})
diff --git a/csrc/config.h.in b/csrc/config.h.in
new file mode 100644
index 000000000..950537154
--- /dev/null
+++ b/csrc/config.h.in
@@ -0,0 +1 @@
+#cmakedefine BNB_USE_STD_THREADS
diff --git a/csrc/cpu_ops.cpp b/csrc/cpu_ops.cpp
index e67135360..323013719 100644
--- a/csrc/cpu_ops.cpp
+++ b/csrc/cpu_ops.cpp
@@ -1,6 +1,19 @@
 #include <BinSearch.h>
 #include <common.h>
+#include "config.h"
+
+#ifdef BNB_USE_STD_THREADS
 #include <thread>
+#else
+#include <pthread.h>
+
+// Adapter with the `void* (*)(void*)` start-routine signature that `pthread_create` requires.
+// `arg` must point to a `quantize_block_args` that stays alive until the thread has been joined.
+static void* quantize_block_w(void* arg) {
+    quantize_block(*static_cast<quantize_block_args*>(arg));
+    return nullptr;
+}
+#endif
 
 using namespace BinSearch;
 
@@ -31,7 +44,13 @@ void quantize_cpu(float *code, float *A, float *absmax, unsigned char *out, long
     for(long long offset = 0; offset < num_blocks; offset+=thread_wave_size)
     {
       long long valid_chunks = num_blocks - offset >= thread_wave_size ? thread_wave_size : num_blocks - offset;
+#ifdef BNB_USE_STD_THREADS
       std::vector<std::thread> threads(valid_chunks);
+#else
+      std::vector<pthread_t> threads(valid_chunks);
+      // started[i] is set only when pthread_create succeeded for slot i, so the join loop never touches an unstarted handle.
+      std::vector<char> started(valid_chunks, 0);
+#endif
       std::vector<quantize_block_args> args(valid_chunks);
 
       int chunks_processed = 0;
@@ -51,13 +70,29 @@ void quantize_cpu(float *code, float *A, float *absmax, unsigned char *out, long
         arg.threadidx = block_idx / blocksize;
         arg.blocksize = blocksize;
 
+#ifdef BNB_USE_STD_THREADS
         threads[chunks_processed] = std::thread([arg] { quantize_block(arg); });
+#else
+        // The worker receives a pointer to `arg`, which therefore has to stay alive until the join below.
+        if (pthread_create(&threads[chunks_processed], nullptr, quantize_block_w, &arg) == 0)
+          started[chunks_processed] = 1;
+        else
+          quantize_block(arg); // no worker could be started: quantize this block on the calling thread
+#endif
         chunks_processed += 1;
         if(chunks_processed == valid_chunks){ break; }
       }
 
       for (int i = 0; i < valid_chunks; i++)
+      {
+#ifdef BNB_USE_STD_THREADS
           threads[i].join();
+#else
+        if (started[i])
+          pthread_join(threads[i], nullptr);
+#endif
+      }
+
     }
 
 }