diff --git a/.gitignore b/.gitignore
index 22f5a6cd6..47e5285bd 100644
--- a/.gitignore
+++ b/.gitignore
@@ -22,6 +22,7 @@ CMakeFiles/
 bitsandbytes.dir/
 Debug/
 Release/
+csrc/config.h
 
 # IDE local files
 .vs/
diff --git a/CMakeLists.txt b/CMakeLists.txt
index 62ff4e535..97f799930 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -25,15 +25,22 @@ list(APPEND SRC_FILES ${CPP_FILES})
 
 set(COMPUTE_BACKEND "cpu" CACHE STRING "The compute backend to use (cpu, cuda, mps)")
 set_property(CACHE COMPUTE_BACKEND PROPERTY STRINGS cpu cuda mps)
+
+set(BNB_USE_STD_THREADS OFF CACHE BOOL "Use std::thread for parallelism")
+
 option(PTXAS_VERBOSE "Pass through -v flag to PTX Assembler" OFF)
 
 if(APPLE)
   set(CMAKE_OSX_DEPLOYMENT_TARGET 13.1)
 endif()
 
+if(WIN32)
+    set(BNB_USE_STD_THREADS ON)  # the alternative code path needs <pthread.h>, which cannot be assumed on Windows
+endif()
+
 set(BNB_OUTPUT_NAME "bitsandbytes")
 
-message(STATUS "Configuring ${PROJECT_NAME} (Backend: ${COMPUTE_BACKEND})")
+message(STATUS "Configuring ${PROJECT_NAME} (Backend: ${COMPUTE_BACKEND}, std::thread: ${BNB_USE_STD_THREADS})")
 
 if(${COMPUTE_BACKEND} STREQUAL "cuda")
     if(APPLE)
@@ -188,7 +195,11 @@ set_source_files_properties(${CPP_FILES} PROPERTIES LANGUAGE CXX)
 add_library(bitsandbytes SHARED ${SRC_FILES})
 target_compile_features(bitsandbytes PUBLIC cxx_std_14)
 target_include_directories(bitsandbytes PUBLIC csrc include)
-
+# Generate config.h in the build tree (configure_file resolves a relative output
+# against CMAKE_CURRENT_BINARY_DIR) and put that directory on the include path,
+# so `#include "config.h"` also resolves in out-of-source builds.
+configure_file(csrc/config.h.in "${CMAKE_CURRENT_BINARY_DIR}/csrc/config.h")
+target_include_directories(bitsandbytes PRIVATE "${CMAKE_CURRENT_BINARY_DIR}/csrc")
 
 if(BUILD_CUDA)
     target_include_directories(bitsandbytes PUBLIC ${CMAKE_CUDA_TOOLKIT_INCLUDE_DIRECTORIES})
diff --git a/csrc/config.h.in b/csrc/config.h.in
new file mode 100644
index 000000000..950537154
--- /dev/null
+++ b/csrc/config.h.in
@@ -0,0 +1 @@
+#cmakedefine BNB_USE_STD_THREADS
diff --git a/csrc/cpu_ops.cpp b/csrc/cpu_ops.cpp
index e67135360..323013719 100644
--- a/csrc/cpu_ops.cpp
+++ b/csrc/cpu_ops.cpp
@@ -1,6 +1,18 @@
 #include <BinSearch.h>
 #include <common.h>
+#include "config.h"
+
+#ifdef BNB_USE_STD_THREADS
 #include <thread>
+#else
+#include <pthread.h>
+
+// Adapter giving quantize_block the `void* (*)(void*)` start-routine signature that pthread_create expects
+static void* quantize_block_w(void* opaque) {
+    quantize_block(*static_cast<quantize_block_args*>(opaque));
+    return nullptr;
+}
+#endif
 
 using namespace BinSearch;
 
@@ -31,7 +43,11 @@ void quantize_cpu(float *code, float *A, float *absmax, unsigned char *out, long
     for(long long offset = 0; offset < num_blocks; offset+=thread_wave_size)
     {
       long long valid_chunks = num_blocks - offset >= thread_wave_size ? thread_wave_size : num_blocks - offset;
+#ifdef BNB_USE_STD_THREADS
       std::vector<std::thread> threads(valid_chunks);
+#else
+      std::vector<pthread_t> threads(valid_chunks);
+#endif
       std::vector<quantize_block_args> args(valid_chunks);
 
       int chunks_processed = 0;
@@ -51,13 +67,35 @@ void quantize_cpu(float *code, float *A, float *absmax, unsigned char *out, long
           arg.threadidx = block_idx / blocksize;
           arg.blocksize = blocksize;
 
+#ifdef BNB_USE_STD_THREADS
           threads[chunks_processed] = std::thread([arg] { quantize_block(arg); });
+#else
+          // pthread_create leaves the handle undefined on failure. In that case run the
+          // block on the calling thread and store our own id in the slot, so the join
+          // loop below can recognise the slot and skip it.
+          if (pthread_create(&threads[chunks_processed], NULL, quantize_block_w, &arg) != 0)
+          {
+              quantize_block(arg);
+              threads[chunks_processed] = pthread_self();
+          }
+#endif
           chunks_processed += 1;
           if(chunks_processed == valid_chunks){ break; }
       }
 
       for (int i = 0; i < valid_chunks; i++)
+      {
+#ifdef BNB_USE_STD_THREADS
           threads[i].join();
+#else
+          // Slots holding the calling thread's id were processed inline; nothing to join.
+          if (!pthread_equal(threads[i], pthread_self()))
+          {
+              pthread_join(threads[i], NULL);
+          }
+#endif
+      }
+
     }
 
 }