Skip to content

Commit

Permalink
Merge pull request #73 from Stefan-Olt/master
Browse files Browse the repository at this point in the history
aarch64 build fix and asm optimization (only autotools)
  • Loading branch information
Stefan-Olt authored Jul 19, 2024
2 parents faaf729 + 124ca6b commit c455fbe
Show file tree
Hide file tree
Showing 15 changed files with 12,585 additions and 41 deletions.
4 changes: 4 additions & 0 deletions Makefile.am
Original file line number Diff line number Diff line change
Expand Up @@ -66,6 +66,10 @@ libmvtools_la_LDFLAGS = -no-undefined -avoid-version $(PLUGINLDFLAGS)

libmvtools_la_LIBADD = $(FFTW3F_LIBS)

# Add the ARM assembly source to the library when the MVTOOLS_ARM automake
# conditional is true.
# NOTE(review): configure.ac sets ARM="true" for both arm* and aarch* host
# CPUs, and this conditional is presumably derived from that flag, while the
# file name suggests AArch64-only code -- confirm that 32-bit ARM builds are
# not expected to assemble it.
if MVTOOLS_ARM
libmvtools_la_SOURCES += src/asm/aarch64-pixel-a.S
endif

if MVTOOLS_X86
libmvtools_la_SOURCES += src/asm/const-a.asm \
src/asm/cpu-a.asm \
Expand Down
4 changes: 3 additions & 1 deletion configure.ac
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@ LT_INIT([disable-static win32-dll])

AC_PROG_CC
AC_PROG_CXX
AM_PROG_AS

AC_CANONICAL_HOST

Expand Down Expand Up @@ -54,7 +55,7 @@ AS_CASE(
[i?86], [BITS="32" NASMFLAGS="$NASMFLAGS -DARCH_X86_64=0" X86="true"],
[x86_64|amd64], [BITS="64" NASMFLAGS="$NASMFLAGS -DARCH_X86_64=1 -DPIC" X86="true"],
[powerpc*], [PPC="true"],
[arm*], [ARM="true"],
[arm*|aarch*], [ARM="true"],
[AC_MSG_ERROR([Unknown host CPU: $host_cpu.])]
)

Expand All @@ -63,6 +64,7 @@ AS_CASE(
[darwin*],
[
NASMFLAGS="$NASMFLAGS -f macho$BITS -DPREFIX"
CPPFLAGS="$CPPFLAGS -DPREFIX"
],
[*linux*|gnu*|dragonfly*|*bsd*],
[
Expand Down
8 changes: 7 additions & 1 deletion src/CPU.c
Original file line number Diff line number Diff line change
Expand Up @@ -191,7 +191,13 @@ uint32_t cpu_detect(void) {
return cpu;
}

#else // not MVTOOLS_X86
#elif defined(MVTOOLS_ARM)

/*
 * ARM variant of cpu_detect(): no runtime probing is performed. NEON is
 * simply assumed to be present, so the returned mask has every bit set.
 */
uint32_t cpu_detect(void)
{
    const uint32_t all_flags = (uint32_t)~(uint32_t)0;
    return all_flags;
}

#else // not MVTOOLS_X86 or MVTOOLS_ARM

uint32_t cpu_detect(void) {
return 0;
Expand Down
3 changes: 3 additions & 0 deletions src/CPU.h
Original file line number Diff line number Diff line change
Expand Up @@ -57,6 +57,9 @@ enum {
#ifdef MVTOOLS_X86
MVOPT_SSE2 = 1,
MVOPT_AVX2 = 2,
#elif MVTOOLS_ARM
MVOPT_NEON = 1,
MVOPT_SSE2 = 1, // SSE2 is converted to Neon
#endif // MVTOOLS_X86
};

Expand Down
8 changes: 6 additions & 2 deletions src/DCTFFTW.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -54,9 +54,13 @@ static void Float2Pixels_C(const DCTFFTW *dct, uint8_t *dstp8, int dst_pitch, fl
}


#if defined(MVTOOLS_X86)
#if defined(MVTOOLS_X86) || defined(MVTOOLS_ARM)

#if defined(MVTOOLS_ARM)
#include "sse2neon.h"
#else
#include <emmintrin.h>
#endif

template <typename PixelType>
static void Float2Pixels_SSE2(const DCTFFTW *dct, uint8_t *dstp8, int dst_pitch, float *realdata) {
Expand Down Expand Up @@ -153,7 +157,7 @@ void dctInit(DCTFFTW *dct, int sizex, int sizey, int bitsPerSample, int opt) {
dct->Float2Pixels = Float2Pixels_C<uint16_t>;

if (opt) {
#if defined(MVTOOLS_X86)
#if defined(MVTOOLS_X86) || defined(MVTOOLS_ARM)
if (bitsPerSample == 8)
dct->Float2Pixels = Float2Pixels_SSE2<uint8_t>;
else
Expand Down
10 changes: 7 additions & 3 deletions src/Luma.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -25,9 +25,13 @@ unsigned int luma_c(const uint8_t *pSrc8, intptr_t nSrcPitch) {
}


#if defined(MVTOOLS_X86)
#if defined(MVTOOLS_X86) || defined(MVTOOLS_ARM)

#if defined(MVTOOLS_ARM)
#include "sse2neon.h"
#else
#include <emmintrin.h>
#endif


#define zeroes _mm_setzero_si128()
Expand Down Expand Up @@ -69,7 +73,7 @@ unsigned int luma_sse2(const uint8_t *pSrc, intptr_t nSrcPitch) {
// opt can fit in four bits, if the width and height need more than eight bits each.
// Layout of the packed key: width from bit 24 up, height from bit 16,
// bits from bit 8, opt in the low bits. The whole expansion is parenthesized
// so that KEY(...) behaves as a single operand inside larger expressions
// (e.g. KEY(...) & mask or KEY(...) == other).
#define KEY(width, height, bits, opt) ((unsigned)(width) << 24 | (height) << 16 | (bits) << 8 | (opt))

#if defined(MVTOOLS_X86)
#if defined(MVTOOLS_X86) || defined(MVTOOLS_ARM)
// Table-entry helper: expands to a `{ key, function },` initializer that pairs
// the key KEY(width, height, 8, SSE2) with the luma_sse2<width, height>
// instantiation. The trailing comma is part of the expansion, so entries are
// listed without an extra separator.
#define LUMA_SSE2(width, height) \
{ KEY(width, height, 8, SSE2), luma_sse2<width, height> },
#else
Expand Down Expand Up @@ -110,7 +114,7 @@ static const std::unordered_map<uint32_t, LUMAFunction> luma_functions = {
LUMAFunction selectLumaFunction(unsigned width, unsigned height, unsigned bits, int opt) {
LUMAFunction luma = luma_functions.at(KEY(width, height, bits, Scalar));

#if defined(MVTOOLS_X86)
#if defined(MVTOOLS_X86) || defined(MVTOOLS_ARM)
if (opt) {
try {
luma = luma_functions.at(KEY(width, height, bits, SSE2));
Expand Down
9 changes: 5 additions & 4 deletions src/MVDegrains.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -365,7 +365,7 @@ static void VS_CC mvdegrainFree(void *instanceData, VSCore *core, const VSAPI *v
// opt can fit in four bits, if the width and height need more than eight bits each.
// Layout of the packed key: width from bit 24 up, height from bit 16,
// bits from bit 8, opt in the low bits. The whole expansion is parenthesized
// so that KEY(...) behaves as a single operand inside larger expressions
// (e.g. KEY(...) & mask or KEY(...) == other).
#define KEY(width, height, bits, opt) ((unsigned)(width) << 24 | (height) << 16 | (bits) << 8 | (opt))

#if defined(MVTOOLS_X86)
#if defined(MVTOOLS_X86) || defined(MVTOOLS_ARM)
// Table-entry helper: expands to a `{ key, function },` initializer that pairs
// the key KEY(width, height, 8, MVOPT_SSE2) with the
// Degrain_sse2<radius, width, height> instantiation. radius selects the
// template instantiation only; it is not encoded in the key. The trailing
// comma is part of the expansion.
#define DEGRAIN_SSE2(radius, width, height) \
{ KEY(width, height, 8, MVOPT_SSE2), Degrain_sse2<radius, width, height> },

Expand Down Expand Up @@ -458,17 +458,18 @@ static const std::unordered_map<uint32_t, DenoiseFunction> degrain_functions_sse
static DenoiseFunction selectDegrainFunction(unsigned radius, unsigned width, unsigned height, unsigned bits, int opt) {
DenoiseFunction degrain = degrain_functions[radius - 1].at(KEY(width, height, bits, MVOPT_SCALAR));

#if defined(MVTOOLS_X86)
#if defined(MVTOOLS_X86) || defined(MVTOOLS_ARM)
if (opt) {
try {
degrain = degrain_functions_sse2[radius - 1].at(KEY(width, height, bits, MVOPT_SSE2));
} catch (std::out_of_range &) { }

#if defined(MVTOOLS_X86)
if (g_cpuinfo & X264_CPU_AVX2) {
DenoiseFunction tmp = selectDegrainFunctionAVX2(radius, width, height, bits);
if (tmp)
degrain = tmp;
}
#endif
}
#endif

Expand Down Expand Up @@ -496,7 +497,7 @@ static void selectFunctions(MVDegrainData *d) {

d->ToPixels = ToPixels_uint16_t_uint8_t;

#if defined(MVTOOLS_X86)
#if defined(MVTOOLS_X86) || defined(MVTOOLS_ARM)
if (d->opt) {
d->LimitChanges = LimitChanges_sse2;
}
Expand Down
10 changes: 7 additions & 3 deletions src/MVDegrains.h
Original file line number Diff line number Diff line change
Expand Up @@ -53,10 +53,16 @@ static void Degrain_C(uint8_t * __restrict pDst8, int nDstPitch, const uint8_t *
}


#if defined(MVTOOLS_X86)
#if defined(MVTOOLS_X86) || defined(MVTOOLS_ARM)

#if defined(MVTOOLS_ARM)
#include "sse2neon.h"
#else
#include <emmintrin.h>

DenoiseFunction selectDegrainFunctionAVX2(unsigned radius, unsigned width, unsigned height, unsigned bits);
#endif

// XXX Moves the pointers passed in pRefs. This is okay because they are not
// used after this function is done with them.
template <int radius, int blockWidth, int blockHeight>
Expand Down Expand Up @@ -133,8 +139,6 @@ static void Degrain_sse2(uint8_t *pDst, int nDstPitch, const uint8_t *pSrc, int
}
}

DenoiseFunction selectDegrainFunctionAVX2(unsigned radius, unsigned width, unsigned height, unsigned bits);

static void LimitChanges_sse2(uint8_t *pDst, intptr_t nDstPitch, const uint8_t *pSrc, intptr_t nSrcPitch, intptr_t nWidth, intptr_t nHeight, intptr_t nLimit) {
__m128i bytes_limit = _mm_set1_epi8(nLimit);

Expand Down
45 changes: 28 additions & 17 deletions src/MVFrame.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -32,11 +32,7 @@
#define min(a, b) (((a) < (b)) ? (a) : (b))
#endif

#if defined(MVTOOLS_X86)

#include <emmintrin.h>

#define zeroes _mm_setzero_si128()
#if defined(MVTOOLS_X86) || defined(MVTOOLS_ARM)

/* TODO: port these
extern "C" void VerticalBicubic_iSSE(uint8_t *pDst, const uint8_t *pSrc, intptr_t nDstPitch,
Expand All @@ -49,6 +45,12 @@
extern "C" void RB2FilteredHorizontalInplaceLine_SSE(uint8_t *pSrc, intptr_t nWidthMMX);
*/

#if defined(MVTOOLS_ARM)
#include "sse2neon.h"
#else

#include <emmintrin.h>

void Average2_avx2(uint8_t *pDst, const uint8_t *pSrc1, const uint8_t *pSrc2, intptr_t nPitch, intptr_t nWidth, intptr_t nHeight);
void VerticalBilinear_avx2(uint8_t *pDst, const uint8_t *pSrc, intptr_t nPitch,
intptr_t nWidth, intptr_t nHeight, intptr_t bitsPerSample);
Expand All @@ -61,6 +63,12 @@ void VerticalWiener_avx2(uint8_t *pDst, const uint8_t *pSrc, intptr_t nPitch,
void HorizontalWiener_avx2(uint8_t *pDst, const uint8_t *pSrc, intptr_t nPitch,
intptr_t nWidth, intptr_t nHeight, intptr_t bitsPerSample) ;

#endif



#define zeroes _mm_setzero_si128()

static void Average2_sse2(uint8_t *pDst, const uint8_t *pSrc1, const uint8_t *pSrc2, intptr_t nPitch, intptr_t nWidth, intptr_t nHeight) {
for (int y = 0; y < nHeight; y++) {
for (int x = 0; x < nWidth; x += 16) {
Expand Down Expand Up @@ -707,7 +715,7 @@ static void RB2BilinearFilteredVertical(uint8_t *pDst8, const uint8_t *pSrc8, in
int xstart = 0;

if (sizeof(PixelType) == 1 && opt && nWidthMMX >= 8) {
#if defined(MVTOOLS_X86)
#if defined(MVTOOLS_X86) || defined(MVTOOLS_ARM)
RB2BilinearFilteredVerticalLine_sse2((uint8_t *)pDst, (const uint8_t *)pSrc, nSrcPitch, nWidthMMX);
xstart = nWidthMMX;
#endif
Expand Down Expand Up @@ -745,7 +753,7 @@ static void RB2BilinearFilteredHorizontalInplace(uint8_t *pSrc8, int nSrcPitch,
int xstart = 1;

if (sizeof(PixelType) == 1 && opt) {
#if defined(MVTOOLS_X86)
#if defined(MVTOOLS_X86) || defined(MVTOOLS_ARM)
RB2BilinearFilteredHorizontalInplaceLine_sse2((uint8_t *)pSrc, nWidthMMX); /* very first is skipped */
xstart = nWidthMMX;
#endif
Expand Down Expand Up @@ -797,7 +805,7 @@ static void RB2QuadraticVertical(uint8_t *pDst8, const uint8_t *pSrc8, int nDstP
int xstart = 0;

if (sizeof(PixelType) == 1 && opt && nWidthMMX >= 8) {
#if defined(MVTOOLS_X86)
#if defined(MVTOOLS_X86) || defined(MVTOOLS_ARM)
RB2QuadraticVerticalLine_sse2((uint8_t *)pDst, (const uint8_t *)pSrc, nSrcPitch, nWidthMMX);
xstart = nWidthMMX;
#endif
Expand Down Expand Up @@ -848,7 +856,7 @@ static void RB2QuadraticHorizontalInplace(uint8_t *pSrc8, int nSrcPitch, int nWi
int xstart = 1;

if (sizeof(PixelType) == 1 && opt) {
#if defined(MVTOOLS_X86)
#if defined(MVTOOLS_X86) || defined(MVTOOLS_ARM)
RB2QuadraticHorizontalInplaceLine_sse2((uint8_t *)pSrc, nWidthMMX);
xstart = nWidthMMX;
#endif
Expand Down Expand Up @@ -913,7 +921,7 @@ static void RB2CubicVertical(uint8_t *pDst8, const uint8_t *pSrc8, int nDstPitch
int xstart = 0;

if (sizeof(PixelType) == 1 && opt && nWidthMMX >= 8) {
#if defined(MVTOOLS_X86)
#if defined(MVTOOLS_X86) || defined(MVTOOLS_ARM)
RB2CubicVerticalLine_sse2((uint8_t *)pDst, (const uint8_t *)pSrc, nSrcPitch, nWidthMMX);
xstart = nWidthMMX;
#endif
Expand Down Expand Up @@ -964,7 +972,7 @@ static void RB2CubicHorizontalInplace(uint8_t *pSrc8, int nSrcPitch, int nWidth,
int xstart = 1;

if (sizeof(PixelType) == 1 && opt) {
#if defined(MVTOOLS_X86)
#if defined(MVTOOLS_X86) || defined(MVTOOLS_ARM)
RB2CubicHorizontalInplaceLine_sse2((uint8_t *)pSrc, nWidthMMX);
xstart = nWidthMMX;
#endif
Expand Down Expand Up @@ -1395,16 +1403,17 @@ void mvpRefine(MVPlane *mvp, int sharp) {
refine[2] = DiagonalBilinear<uint8_t>;

if (mvp->opt) {
#if defined(MVTOOLS_X86)
#if defined(MVTOOLS_X86) || defined(MVTOOLS_ARM)
refine[0] = HorizontalBilinear_sse2;
refine[1] = VerticalBilinear_sse2;
refine[2] = DiagonalBilinear_sse2;

#if defined(MVTOOLS_X86)
if (g_cpuinfo & X264_CPU_AVX2) {
refine[0] = HorizontalBilinear_avx2;
refine[1] = VerticalBilinear_avx2;
refine[2] = DiagonalBilinear_avx2;
}
#endif
#endif
}
} else {
Expand Down Expand Up @@ -1434,14 +1443,15 @@ void mvpRefine(MVPlane *mvp, int sharp) {
refine[1] = VerticalWiener<uint8_t>;

if (mvp->opt) {
#if defined(MVTOOLS_X86)
#if defined(MVTOOLS_X86) || defined(MVTOOLS_ARM)
refine[0] = refine[2] = HorizontalWiener_sse2;
refine[1] = VerticalWiener_sse2;

#if defined(MVTOOLS_X86)
if (g_cpuinfo & X264_CPU_AVX2) {
refine[0] = refine[2] = HorizontalWiener_avx2;
refine[1] = VerticalWiener_avx2;
}
#endif
#endif
}
} else {
Expand Down Expand Up @@ -1485,11 +1495,12 @@ void mvpRefine(MVPlane *mvp, int sharp) {
avg = Average2<uint8_t>;

if (mvp->opt) {
#if defined(MVTOOLS_X86)
#if defined(MVTOOLS_X86) || defined(MVTOOLS_ARM)
avg = Average2_sse2;

#if defined(MVTOOLS_X86)
if (g_cpuinfo & X264_CPU_AVX2)
avg = Average2_avx2;
#endif
#endif
}
} else {
Expand Down
13 changes: 9 additions & 4 deletions src/Overlap.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -158,9 +158,13 @@ void overlaps_c(uint8_t *pDst8, intptr_t nDstPitch, const uint8_t *pSrc8, intptr
}


#if defined(MVTOOLS_X86)
#if defined(MVTOOLS_X86) || defined(MVTOOLS_ARM)

#if defined(MVTOOLS_ARM)
#include "sse2neon.h"
#else
#include <emmintrin.h>
#endif


#define zeroes _mm_setzero_si128()
Expand Down Expand Up @@ -236,7 +240,7 @@ struct OverlapsWrapper<4, blockHeight> {
// opt can fit in four bits, if the width and height need more than eight bits each.
// Layout of the packed key: width from bit 24 up, height from bit 16,
// bits from bit 8, opt in the low bits. The whole expansion is parenthesized
// so that KEY(...) behaves as a single operand inside larger expressions
// (e.g. KEY(...) & mask or KEY(...) == other).
#define KEY(width, height, bits, opt) ((unsigned)(width) << 24 | (height) << 16 | (bits) << 8 | (opt))

#if defined(MVTOOLS_X86)
#if defined(MVTOOLS_X86) || defined(MVTOOLS_ARM)
// Table-entry helper: expands to a `{ key, function },` initializer that pairs
// the key KEY(width, height, 8, MVOPT_SSE2) with
// OverlapsWrapper<width, height>::overlaps_sse2. The trailing comma is part of
// the expansion, so entries are listed without an extra separator.
#define OVERS_SSE2(width, height) \
{ KEY(width, height, 8, MVOPT_SSE2), OverlapsWrapper<width, height>::overlaps_sse2 },
#else
Expand Down Expand Up @@ -305,17 +309,18 @@ static const std::unordered_map<uint32_t, OverlapsFunction> overlaps_functions =
OverlapsFunction selectOverlapsFunction(unsigned width, unsigned height, unsigned bits, int opt) {
OverlapsFunction overs = overlaps_functions.at(KEY(width, height, bits, MVOPT_SCALAR));

#if defined(MVTOOLS_X86)
#if defined(MVTOOLS_X86) || defined(MVTOOLS_ARM)
if (opt) {
try {
overs = overlaps_functions.at(KEY(width, height, bits, MVOPT_SSE2));
} catch (std::out_of_range &) { }

#ifdef MVTOOLS_X86
if (g_cpuinfo & X264_CPU_AVX2) {
OverlapsFunction tmp = selectOverlapsFunctionAVX2(width, height, bits);
if (tmp)
overs = tmp;
}
#endif
}
#endif

Expand Down
Loading

0 comments on commit c455fbe

Please sign in to comment.