diff --git a/NEWS b/NEWS
index fecc0bdb..92e7fd09 100644
--- a/NEWS
+++ b/NEWS
@@ -11,6 +11,9 @@
    to build individual modules using --disable-components logic.
 - out123:
 -- added --libversion
+- libmpg123:
+-- Avoid indirect branches into the assembly routines by using C wrappers,
+   relieving us of the need to care for bti / endbr instructions.
 
 1.32.3
 ------
diff --git a/src/libmpg123/decode.h b/src/libmpg123/decode.h
index 9850f108..c40d4d6f 100644
--- a/src/libmpg123/decode.h
+++ b/src/libmpg123/decode.h
@@ -203,16 +203,6 @@ void INT123_dct64_i386 (real *,real *,real *);
 void INT123_dct64_altivec(real *,real *,real *);
 void INT123_dct64_i486(int*, int* , real*); /* Yeah, of no use outside of synth_i486.c .*/
 
-/* This is used by the layer 3 decoder, one generic function and 3DNow variants. */
-void INT123_dct36 (real *,real *,real *,const real *,real *);
-void INT123_dct36_3dnow (real *,real *,real *,const real *,real *);
-void INT123_dct36_3dnowext(real *,real *,real *,const real *,real *);
-void INT123_dct36_x86_64 (real *,real *,real *,const real *,real *);
-void INT123_dct36_sse (real *,real *,real *,const real *,real *);
-void INT123_dct36_avx (real *,real *,real *,const real *,real *);
-void INT123_dct36_neon (real *,real *,real *,const real *,real *);
-void INT123_dct36_neon64 (real *,real *,real *,const real *,real *);
-
 /* Tools for NtoM resampling synth, defined in ntom.c . */
 int INT123_synth_ntom_set_step(mpg123_handle *fr); /* prepare ntom decoding */
 unsigned long INT123_ntom_val(mpg123_handle *fr, int64_t frame); /* compute INT123_ntom_val for frame offset */
@@ -232,6 +222,16 @@ int64_t INT123_ntom_frameoff(mpg123_handle *fr, int64_t soff);
 /* Initialization of any static data that majy be needed at runtime.
    Make sure you call these once before it is too late. */
 #ifndef NO_LAYER3
+
+#ifdef OPT_THE_DCT36
+// Set fr->cpu_opts.the_dct36 according to fr->cpu_opts.type.
+// The pointers themselves are to static functions in layer3.c.
+void INT123_dct36_choose(mpg123_handle *fr);
+// Return 1 if t is sse, dreidnowext_vintage or dreidnow_vintage and the
+// current dct36 choice is the routine belonging to that type, 0 otherwise.
+int INT123_dct36_match(mpg123_handle *fr, enum optdec t);
+#endif
+
 #ifdef RUNTIME_TABLES
 void INT123_init_layer3(void);
 #endif
diff --git a/src/libmpg123/frame.h b/src/libmpg123/frame.h
index 07dfec53..a2e72cde 100644
--- a/src/libmpg123/frame.h
+++ b/src/libmpg123/frame.h
@@ -167,11 +167,9 @@ struct mpg123_handle_struct
 	{
 #ifdef OPT_MULTI
 
-#ifndef NO_LAYER3
-#if (defined OPT_3DNOW_VINTAGE || defined OPT_3DNOWEXT_VINTAGE || defined OPT_SSE || defined OPT_X86_64 || defined OPT_AVX || defined OPT_NEON || defined OPT_NEON64)
+#ifdef OPT_THE_DCT36
 		void (*the_dct36)(real *,real *,real *,const real *,real *);
 #endif
-#endif
 
 #endif
 		enum optdec type;
diff --git a/src/libmpg123/layer3.c b/src/libmpg123/layer3.c
index 0ddaf5a5..c49dbba8 100644
--- a/src/libmpg123/layer3.c
+++ b/src/libmpg123/layer3.c
@@ -24,6 +24,14 @@
 #include "../common/debug.h"
 
 
+/* Predeclare the assembly routines, only called from wrappers here. */
+void INT123_dct36_3dnow (real *,real *,real *,const real *,real *);
+void INT123_dct36_3dnowext(real *,real *,real *,const real *,real *);
+void INT123_dct36_x86_64 (real *,real *,real *,const real *,real *);
+void INT123_dct36_sse (real *,real *,real *,const real *,real *);
+void INT123_dct36_avx (real *,real *,real *,const real *,real *);
+void INT123_dct36_neon (real *,real *,real *,const real *,real *);
+void INT123_dct36_neon64 (real *,real *,real *,const real *,real *);
 
 /* define CUT_SFB21 if you want to cut-off the frequency above 16kHz */
 #if 0
@@ -1256,10 +1264,9 @@ static void III_antialias(real xr[SBLIMIT][SSLIMIT],struct gr_info_s *gr_info)
 	    Mathematics of Computation, Volume 32, Number 141, January 1978,
 	    Pages 175-199
 */
 
-/* Calculation of the inverse MDCT
-   used to be static without 3dnow - does that really matter? */
-void INT123_dct36(real *inbuf,real *o1,real *o2,const real *wintab,real *tsbuf)
+/* Calculation of the inverse MDCT */
+static void INT123_dct36(real *inbuf,real *o1,real *o2,const real *wintab,real *tsbuf)
 {
 	real tmp[18];
 
@@ -1449,6 +1456,111 @@ void INT123_dct36(real *inbuf,real *o1,real *o2,const real *wintab,real *tsbuf)
 	}
 }
 
+// Wrap the assembly routine calls into C functions that serve as jump target to satisfy
+// indirect branch protection if the toolchain enables that. Otherwise, we'd need to anticipate
+// that in the assembly (and ensure assemblers support endbr64 and friends).
+// Loss of efficiency: at most one extra direct call or jump per dct36 invocation.
+
+// In the case of one static optimization choice, we do not have that problem.
+
+#ifdef OPT_THE_DCT36
+
+// Define static asmfunc_wrap() that just forwards its arguments to asmfunc().
+#define DCT36_WRAP(asmfunc) \
+static void asmfunc ## _wrap(real *inbuf,real *o1,real *o2,const real *wintab,real *tsbuf) \
+{ \
+	asmfunc(inbuf, o1, o2, wintab, tsbuf); \
+}
+
+#ifdef OPT_SSE
+DCT36_WRAP(INT123_dct36_sse)
+#endif
+#ifdef OPT_3DNOWEXT_VINTAGE
+DCT36_WRAP(INT123_dct36_3dnowext)
+#endif
+#ifdef OPT_3DNOW_VINTAGE
+DCT36_WRAP(INT123_dct36_3dnow)
+#endif
+#ifdef OPT_X86_64
+DCT36_WRAP(INT123_dct36_x86_64)
+#endif
+#ifdef OPT_AVX
+DCT36_WRAP(INT123_dct36_avx)
+#endif
+#ifdef OPT_NEON
+DCT36_WRAP(INT123_dct36_neon)
+#endif
+#ifdef OPT_NEON64
+DCT36_WRAP(INT123_dct36_neon64)
+#endif
+
+// Tell if the current dct36 choice is the wrapped assembly routine for decoder type t.
+// Only sse, dreidnowext_vintage and dreidnow_vintage are checked, anything else yields 0.
+int INT123_dct36_match(mpg123_handle *fr, enum optdec t)
+{
+#ifdef OPT_SSE
+	if(t == sse && fr->cpu_opts.the_dct36 == INT123_dct36_sse_wrap)
+		return 1;
+#endif
+#ifdef OPT_3DNOWEXT_VINTAGE
+	if(t == dreidnowext_vintage && fr->cpu_opts.the_dct36 == INT123_dct36_3dnowext_wrap)
+		return 1;
+#endif
+#ifdef OPT_3DNOW_VINTAGE
+	if(t == dreidnow_vintage && fr->cpu_opts.the_dct36 == INT123_dct36_3dnow_wrap)
+		return 1;
+#endif
+	return 0;
+}
+
+// Set fr->cpu_opts.the_dct36 for the decoder type in fr->cpu_opts.type.
+// Every assembly routine is entered through its C wrapper, never directly;
+// decoder types without a dct36 of their own get the generic C INT123_dct36().
+void INT123_dct36_choose(mpg123_handle *fr)
+{
+	switch(fr->cpu_opts.type)
+	{
+#ifdef OPT_SSE
+	case sse:
+		fr->cpu_opts.the_dct36 = INT123_dct36_sse_wrap;
+		break;
+#endif
+#ifdef OPT_3DNOWEXT_VINTAGE
+	case dreidnowext_vintage:
+		fr->cpu_opts.the_dct36 = INT123_dct36_3dnowext_wrap;
+		break;
+#endif
+#ifdef OPT_3DNOW_VINTAGE
+	case dreidnow_vintage:
+		fr->cpu_opts.the_dct36 = INT123_dct36_3dnow_wrap;
+		break;
+#endif
+#ifdef OPT_AVX
+	case avx:
+		fr->cpu_opts.the_dct36 = INT123_dct36_avx_wrap;
+		break;
+#endif
+#ifdef OPT_X86_64
+	case x86_64:
+		fr->cpu_opts.the_dct36 = INT123_dct36_x86_64_wrap;
+		break;
+#endif
+#ifdef OPT_NEON
+	case neon:
+		fr->cpu_opts.the_dct36 = INT123_dct36_neon_wrap;
+		break;
+#endif
+#ifdef OPT_NEON64
+	case neon64:
+		fr->cpu_opts.the_dct36 = INT123_dct36_neon64_wrap;
+		break;
+#endif
+	default:
+		fr->cpu_opts.the_dct36 = INT123_dct36;
+	}
+}
+
+#endif
 
 /* new DCT12 */
 static void dct12(real *in,real *rawout1,real *rawout2,register const real *wi,register real *ts)
diff --git a/src/libmpg123/optimize.c b/src/libmpg123/optimize.c
index 1d8232b0..00e43c16 100644
--- a/src/libmpg123/optimize.c
+++ b/src/libmpg123/optimize.c
@@ -160,8 +160,8 @@ static enum optdec sse_or_vintage(mpg123_handle *fr)
 	enum optdec type;
 	type = sse_vintage;
 # ifdef OPT_SSE
-# ifdef OPT_MULTI
-	if(fr->cpu_opts.the_dct36 == INT123_dct36_sse)
+# ifdef OPT_THE_DCT36
+	if(INT123_dct36_match(fr, sse))
 # endif
 	type = sse;
 # endif
@@ -192,7 +192,7 @@ static int find_dectype(mpg123_handle *fr)
 		type = dreidnowext;
 # ifdef OPT_3DNOWEXT_VINTAGE
-# ifdef OPT_MULTI
-		if(fr->cpu_opts.the_dct36 == INT123_dct36_3dnowext)
+# ifdef OPT_THE_DCT36
+		if(INT123_dct36_match(fr, dreidnowext_vintage))
 # endif
 		type = dreidnowext_vintage;
 # endif
@@ -210,7 +210,7 @@ static int find_dectype(mpg123_handle *fr)
 		type = dreidnow;
 # ifdef OPT_3DNOW_VINTAGE
-# ifdef OPT_MULTI
-		if(fr->cpu_opts.the_dct36 == INT123_dct36_3dnow)
+# ifdef OPT_THE_DCT36
+		if(INT123_dct36_match(fr, dreidnow_vintage))
 # endif
 		type = dreidnow_vintage;
 # endif
@@ -503,13 +503,6 @@ int INT123_frame_cpu_opt(mpg123_handle *fr, const char* cpu)
 #endif
 
 	fr->cpu_opts.type = nodec;
-#ifdef OPT_MULTI
-#ifndef NO_LAYER3
-#if (defined OPT_3DNOW_VINTAGE || defined OPT_3DNOWEXT_VINTAGE || defined OPT_SSE || defined OPT_X86_64 || defined OPT_AVX || defined OPT_NEON || defined OPT_NEON64)
-	fr->cpu_opts.the_dct36 = INT123_dct36;
-#endif
-#endif
-#endif
 	/* covers any i386+ cpu; they actually differ only in the INT123_synth_1to1 function, mostly... */
 #ifdef OPT_X86
 	if(cpu_i586(fr->cpu_flags))
@@ -523,11 +516,6 @@ int INT123_frame_cpu_opt(mpg123_handle *fr, const char* cpu)
 		{
 			chosen = dn_sse;
 			fr->cpu_opts.type = sse;
-#ifdef OPT_MULTI
-# ifndef NO_LAYER3
-			/* if(cpu_fast_sse(fr->cpu_flags)) */ fr->cpu_opts.the_dct36 = INT123_dct36_sse;
-# endif
-#endif
 # ifndef NO_16BIT
 			fr->synths.plain[r_1to1][f_16] = INT123_synth_1to1_sse;
 # ifdef ACCURATE_ROUNDING
@@ -590,11 +578,6 @@ int INT123_frame_cpu_opt(mpg123_handle *fr, const char* cpu)
 		{
 			chosen = dn_dreidnowext_vintage;
 			fr->cpu_opts.type = dreidnowext_vintage;
-#ifdef OPT_MULTI
-# ifndef NO_LAYER3
-			fr->cpu_opts.the_dct36 = INT123_dct36_3dnowext;
-# endif
-#endif
 # ifndef NO_16BIT
 			fr->synths.plain[r_1to1][f_16] = INT123_synth_1to1_3dnowext;
 # endif
@@ -619,11 +602,6 @@ int INT123_frame_cpu_opt(mpg123_handle *fr, const char* cpu)
 		{
 			chosen = dn_dreidnow_vintage;
 			fr->cpu_opts.type = dreidnow_vintage;
-#ifdef OPT_MULTI
-# ifndef NO_LAYER3
-			fr->cpu_opts.the_dct36 = INT123_dct36_3dnow;
-# endif
-#endif
 # ifndef NO_16BIT
 			fr->synths.plain[r_1to1][f_16] = INT123_synth_1to1_3dnow;
 # endif
@@ -723,11 +701,6 @@ int INT123_frame_cpu_opt(mpg123_handle *fr, const char* cpu)
 	{
 		chosen = "x86-64 (AVX)";
 		fr->cpu_opts.type = avx;
-#ifdef OPT_MULTI
-# ifndef NO_LAYER3
-		fr->cpu_opts.the_dct36 = INT123_dct36_avx;
-# endif
-#endif
 # ifndef NO_16BIT
 		fr->synths.plain[r_1to1][f_16] = INT123_synth_1to1_avx;
 		fr->synths.stereo[r_1to1][f_16] = INT123_synth_1to1_stereo_avx;
@@ -749,11 +722,6 @@ int INT123_frame_cpu_opt(mpg123_handle *fr, const char* cpu)
 	{
 		chosen = "x86-64 (SSE)";
 		fr->cpu_opts.type = x86_64;
-#ifdef OPT_MULTI
-# ifndef NO_LAYER3
-		fr->cpu_opts.the_dct36 = INT123_dct36_x86_64;
-# endif
-#endif
 # ifndef NO_16BIT
 		fr->synths.plain[r_1to1][f_16] = INT123_synth_1to1_x86_64;
 		fr->synths.stereo[r_1to1][f_16] = INT123_synth_1to1_stereo_x86_64;
@@ -796,11 +764,6 @@ int INT123_frame_cpu_opt(mpg123_handle *fr, const char* cpu)
 	{
 		chosen = dn_neon;
 		fr->cpu_opts.type = neon;
-#ifdef OPT_MULTI
-# ifndef NO_LAYER3
-		fr->cpu_opts.the_dct36 = INT123_dct36_neon;
-# endif
-#endif
 # ifndef NO_16BIT
 		fr->synths.plain[r_1to1][f_16] = INT123_synth_1to1_neon;
 		fr->synths.stereo[r_1to1][f_16] = INT123_synth_1to1_stereo_neon;
@@ -834,11 +797,6 @@ int INT123_frame_cpu_opt(mpg123_handle *fr, const char* cpu)
 	{
 		chosen = dn_neon64;
 		fr->cpu_opts.type = neon64;
-#ifdef OPT_MULTI
-# ifndef NO_LAYER3
-		fr->cpu_opts.the_dct36 = INT123_dct36_neon64;
-# endif
-#endif
 # ifndef NO_16BIT
 		fr->synths.plain[r_1to1][f_16] = INT123_synth_1to1_neon64;
 		fr->synths.stereo[r_1to1][f_16] = INT123_synth_1to1_stereo_neon64;
@@ -897,6 +855,10 @@ int INT123_frame_cpu_opt(mpg123_handle *fr, const char* cpu)
 # endif
 # endif
 
+#ifdef OPT_THE_DCT36
+	INT123_dct36_choose(fr);
+#endif
+
 #ifdef OPT_DITHER
 	if(done && dithered)
 	{
diff --git a/src/libmpg123/optimize.h b/src/libmpg123/optimize.h
index 25df421c..6c0c19c4 100644
--- a/src/libmpg123/optimize.h
+++ b/src/libmpg123/optimize.h
@@ -364,19 +364,16 @@ extern const int INT123_costab_mmxsse[];
 #endif
 #endif
 
-/*
-	Now come two blocks of standard definitions for multi-decoder mode and single-decoder mode.
-	Most stuff is so automatic that it's indeed generated by some inline shell script.
-	Remember to use these scripts when possible, instead of direct repetitive hacking.
-*/
-
 #ifdef OPT_MULTI
 
 # define defopt nodec
 
+# ifndef NO_LAYER3
 # if (defined OPT_3DNOW_VINTAGE || defined OPT_3DNOWEXT_VINTAGE || defined OPT_SSE || defined OPT_X86_64 || defined OPT_AVX || defined OPT_NEON || defined OPT_NEON64)
+# define OPT_THE_DCT36
 # define opt_dct36(fr) ((fr)->cpu_opts.the_dct36)
 # endif
+# endif
 
 #endif /* OPT_MULTI else */
 