From 8448b61ec3c1f7ffdebd4f2f5b5dd0b21ffad785 Mon Sep 17 00:00:00 2001 From: stickz Date: Fri, 5 Jul 2024 22:48:51 -0400 Subject: [PATCH] libtorrent: Implement libpopcnt wrapper (#11) This is even faster than countBit1Fast() for desktops and implements the asm popcnt instruction for SSE 4.2. --- libtorrent/rak/algorithm.h | 12 +-- libtorrent/rak/libpopcnt.h | 156 +++++++++++++++++++++++++++++++++++++ 2 files changed, 158 insertions(+), 10 deletions(-) create mode 100644 libtorrent/rak/libpopcnt.h diff --git a/libtorrent/rak/algorithm.h b/libtorrent/rak/algorithm.h index 644d9974..1f056901 100644 --- a/libtorrent/rak/algorithm.h +++ b/libtorrent/rak/algorithm.h @@ -40,6 +40,7 @@ #include #include #include +#include namespace rak { @@ -156,18 +157,9 @@ make_base(_InputIter __first, _InputIter __last, _Ftor __ftor) { return __base; } -inline int countBit1Fast(unsigned int n) { - n = (n & 0x55555555u) + ((n >> 1) & 0x55555555u); - n = (n & 0x33333333u) + ((n >> 2) & 0x33333333u); - n = (n & 0x0f0f0f0fu) + ((n >> 4) & 0x0f0f0f0fu); - n = (n & 0x00ff00ffu) + ((n >> 8) & 0x00ff00ffu); - n = (n & 0x0000ffffu) + ((n >>16) & 0x0000ffffu); - return n; -} - template inline int popcount_wrapper(T t) { - return countBit1Fast(t); + return popcnt64(t); } } diff --git a/libtorrent/rak/libpopcnt.h b/libtorrent/rak/libpopcnt.h new file mode 100644 index 00000000..a5298515 --- /dev/null +++ b/libtorrent/rak/libpopcnt.h @@ -0,0 +1,156 @@ +/* + * libpopcnt.h - C/C++ library for counting the number of 1 bits (bit + * population count) in an array as quickly as possible using + * specialized CPU instructions i.e. POPCNT, AVX2, AVX512, NEON. + * + * Copyright (c) 2016 - 2024, Kim Walisch + * Copyright (c) 2016 - 2018, Wojciech Muła + * + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. 
Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR + * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
/* libpopcnt.h: compiler feature detection and 64-bit population count. */

#ifndef LIBPOPCNT_H
#define LIBPOPCNT_H

#include <stdint.h>

/*
 * Fallback definitions so that the feature tests below simply evaluate
 * to 0 on compilers that do not provide these query operators.
 */
#ifndef __has_builtin
  #define __has_builtin(x) 0
#endif

#ifndef __has_attribute
  #define __has_attribute(x) 0
#endif

#ifndef __has_include
  #define __has_include(x) 0
#endif

/* True if compiling with GCC (or a compiler reporting as GCC) >= x.y */
#ifdef __GNUC__
  #define LIBPOPCNT_GNUC_PREREQ(x, y) \
      (__GNUC__ > (x) || (__GNUC__ == (x) && __GNUC_MINOR__ >= (y)))
#else
  #define LIBPOPCNT_GNUC_PREREQ(x, y) 0
#endif

/* True if compiling with Clang >= x.y */
#ifdef __clang__
  #define LIBPOPCNT_CLANG_PREREQ(x, y) \
      (__clang_major__ > (x) || (__clang_major__ == (x) && __clang_minor__ >= (y)))
#else
  #define LIBPOPCNT_CLANG_PREREQ(x, y) 0
#endif

/*
 * Old MSVC (_MSC_VER < 1900) only knows `__inline` when compiling C.
 * The defined(_MSC_VER) guard is required: an undefined identifier
 * evaluates to 0 inside #if, so without it every non-MSVC C compiler
 * would take this branch and redefine the `inline` keyword.
 */
#if defined(_MSC_VER) && \
    (_MSC_VER < 1900) && \
    !defined(__cplusplus)
  #define inline __inline
#endif

#if (defined(__i386__) || \
     defined(__x86_64__) || \
     defined(_M_IX86) || \
     defined(_M_X64))
  #define LIBPOPCNT_X86_OR_X64
#endif

#if LIBPOPCNT_GNUC_PREREQ(4, 2) || \
    __has_builtin(__builtin_popcount)
  #define LIBPOPCNT_HAVE_BUILTIN_POPCOUNT
#endif

/* The compiler understands GNU-style inline assembly. */
#if LIBPOPCNT_GNUC_PREREQ(4, 2) || \
    LIBPOPCNT_CLANG_PREREQ(3, 0)
  #define LIBPOPCNT_HAVE_ASM_POPCNT
#endif

#if defined(LIBPOPCNT_X86_OR_X64) && \
   (defined(LIBPOPCNT_HAVE_ASM_POPCNT) || \
    defined(_MSC_VER))
  #define LIBPOPCNT_HAVE_POPCNT
#endif

/* GCC compiler */
#if defined(LIBPOPCNT_X86_OR_X64) && \
    LIBPOPCNT_GNUC_PREREQ(5, 0)
  #define LIBPOPCNT_HAVE_AVX2
#endif

/* GCC compiler */
#if defined(LIBPOPCNT_X86_OR_X64) && \
    LIBPOPCNT_GNUC_PREREQ(11, 0)
  #define LIBPOPCNT_HAVE_AVX512
#endif

/* Clang (Unix-like OSes) */
#if defined(LIBPOPCNT_X86_OR_X64) && !defined(_MSC_VER)
  #if LIBPOPCNT_CLANG_PREREQ(3, 8) && \
      __has_attribute(target) && \
      (!defined(__apple_build_version__) || __apple_build_version__ >= 8000000)
    #define LIBPOPCNT_HAVE_AVX2
  #endif
  #if LIBPOPCNT_CLANG_PREREQ(9, 0) && \
      __has_attribute(target) && \
      (!defined(__apple_build_version__) || __apple_build_version__ >= 8000000)
    #define LIBPOPCNT_HAVE_AVX512
  #endif
#endif

/*
 * Portable population count of a 64-bit word.
 *
 * This uses fewer arithmetic operations than any other known
 * implementation on machines with fast multiplication.
 * It uses 12 arithmetic operations, one of which is a multiply.
 * http://en.wikipedia.org/wiki/Hamming_weight#Efficient_implementation
 *
 * Returns the number of 1 bits in x (0..64).
 */
static inline uint64_t popcnt64_bitwise(uint64_t x)
{
  uint64_t m1 = 0x5555555555555555ull;
  uint64_t m2 = 0x3333333333333333ull;
  uint64_t m4 = 0x0F0F0F0F0F0F0F0Full;
  uint64_t h01 = 0x0101010101010101ull;

  x -= (x >> 1) & m1;              /* per-2-bit counts */
  x = (x & m2) + ((x >> 2) & m2);  /* per-4-bit counts */
  x = (x + (x >> 4)) & m4;         /* per-byte counts */

  /* The multiply sums all byte counts into the top byte. */
  return (x * h01) >> 56;
}

/*
 * popcnt64(x): number of 1 bits in x.
 *
 * The POPCNT instruction is emitted only when the build itself targets
 * a CPU that has it (the compiler then defines __POPCNT__, e.g. with
 * -mpopcnt or a -march that includes it). This header performs no
 * run-time CPUID check, so emitting the instruction unconditionally
 * would raise an illegal-instruction fault on x86-64 CPUs without
 * POPCNT support.
 */
#if defined(LIBPOPCNT_HAVE_ASM_POPCNT) && \
    defined(__x86_64__) && \
    defined(__POPCNT__)

static inline uint64_t popcnt64(uint64_t x)
{
  /* "0" ties the input operand to the same register as the output. */
  __asm__ ("popcnt %1, %0" : "=r" (x) : "0" (x));
  return x;
}

/* POPCNT not enabled for this build,
 * use pure integer algorithm */
#else

static inline uint64_t popcnt64(uint64_t x)
{
  return popcnt64_bitwise(x);
}

#endif

#endif /* LIBPOPCNT_H */