Skip to content

Commit

Permalink
libmpg123: call dct36 assembly routines via C wrappers for indirect b…
Browse files Browse the repository at this point in the history
…ranching

That way, indirect jumps only land in C code, which the compiler can equip with the special
landing instructions (endbr / bti) that protect against manipulated branch targets. Testing
the performance impact:

sse trunk-clean

real	0m12.303s
user	0m11.906s
sys	0m0.402s

sse trunk

real	0m12.642s
user	0m12.338s
sys	0m0.306s

sse_vintage trunk-clean

real	0m13.281s
user	0m12.946s
sys	0m0.336s

sse_vintage trunk

real	0m13.333s
user	0m13.039s
sys	0m0.296s

Compare fixed sse build (--with-cpu=sse)

sse trunk-clean

real	0m12.466s
user	0m12.180s
sys	0m0.288s

sse trunk

real	0m12.399s
user	0m12.068s
sys	0m0.333s

dynamic build on amd64

x86-64 trunk-clean

real	0m10.628s
user	0m10.298s
sys	0m0.336s

x86-64 trunk

real	0m10.595s
user	0m10.323s
sys	0m0.274s

avx trunk-clean

real	0m10.213s
user	0m9.949s
sys	0m0.265s

avx trunk

real	0m10.313s
user	0m9.967s
sys	0m0.348s


There is some impact. It matters somewhat more for x86, where there are no CPUs that actually
benefit from this protection. You can avoid the overhead by compiling only one fixed optimization for your CPU.




git-svn-id: svn://scm.orgis.org/mpg123/trunk@5389 35dc7657-300d-0410-a2e5-dc2837fedb53
  • Loading branch information
thor committed Dec 29, 2023
1 parent a08fe96 commit 6425108
Show file tree
Hide file tree
Showing 6 changed files with 130 additions and 69 deletions.
3 changes: 3 additions & 0 deletions NEWS
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,9 @@
to build individual modules using --disable-components logic.
- out123:
-- added --libversion
- libmpg123:
-- Avoid indirect branches into the assembly routines by using C wrappers,
relieving us of the need to care for bti / endbr instructions.

1.32.3
------
Expand Down
17 changes: 7 additions & 10 deletions src/libmpg123/decode.h
Original file line number Diff line number Diff line change
Expand Up @@ -203,16 +203,6 @@ void INT123_dct64_i386 (real *,real *,real *);
void INT123_dct64_altivec(real *,real *,real *);
void INT123_dct64_i486(int*, int* , real*); /* Yeah, of no use outside of synth_i486.c .*/

/* This is used by the layer 3 decoder, one generic function and 3DNow variants. */
void INT123_dct36 (real *,real *,real *,const real *,real *);
void INT123_dct36_3dnow (real *,real *,real *,const real *,real *);
void INT123_dct36_3dnowext(real *,real *,real *,const real *,real *);
void INT123_dct36_x86_64 (real *,real *,real *,const real *,real *);
void INT123_dct36_sse (real *,real *,real *,const real *,real *);
void INT123_dct36_avx (real *,real *,real *,const real *,real *);
void INT123_dct36_neon (real *,real *,real *,const real *,real *);
void INT123_dct36_neon64 (real *,real *,real *,const real *,real *);

/* Tools for NtoM resampling synth, defined in ntom.c . */
int INT123_synth_ntom_set_step(mpg123_handle *fr); /* prepare ntom decoding */
unsigned long INT123_ntom_val(mpg123_handle *fr, int64_t frame); /* compute INT123_ntom_val for frame offset */
Expand All @@ -232,6 +222,13 @@ int64_t INT123_ntom_frameoff(mpg123_handle *fr, int64_t soff);
/* Initialization of any static data that may be needed at runtime.
Make sure you call these once before it is too late. */
#ifndef NO_LAYER3

#ifdef OPT_THE_DCT36
// Set the current dct36 function choice. The pointers themselves are to static functions.
// INT123_dct36_choose() stores the dct36 routine that belongs to fr->cpu_opts.type
// in fr->cpu_opts.the_dct36, falling back to the generic C implementation.
void INT123_dct36_choose(mpg123_handle *fr);
// INT123_dct36_match() returns 1 if t is sse, dreidnowext_vintage or dreidnow_vintage
// and fr->cpu_opts.the_dct36 currently points to the routine for that decoder; 0 otherwise.
int INT123_dct36_match(mpg123_handle *fr, enum optdec t);
#endif

#ifdef RUNTIME_TABLES
void INT123_init_layer3(void);
#endif
Expand Down
4 changes: 1 addition & 3 deletions src/libmpg123/frame.h
Original file line number Diff line number Diff line change
Expand Up @@ -167,11 +167,9 @@ struct mpg123_handle_struct
{
#ifdef OPT_MULTI

#ifndef NO_LAYER3
#if (defined OPT_3DNOW_VINTAGE || defined OPT_3DNOWEXT_VINTAGE || defined OPT_SSE || defined OPT_X86_64 || defined OPT_AVX || defined OPT_NEON || defined OPT_NEON64)
#ifdef OPT_THE_DCT36
void (*the_dct36)(real *,real *,real *,const real *,real *);
#endif
#endif

#endif
enum optdec type;
Expand Down
112 changes: 108 additions & 4 deletions src/libmpg123/layer3.c
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,14 @@
#include "../common/debug.h"


/* Predeclare the assembly routines, only called from wrappers here. */
void INT123_dct36_3dnow (real *,real *,real *,const real *,real *);
void INT123_dct36_3dnowext(real *,real *,real *,const real *,real *);
void INT123_dct36_x86_64 (real *,real *,real *,const real *,real *);
void INT123_dct36_sse (real *,real *,real *,const real *,real *);
void INT123_dct36_avx (real *,real *,real *,const real *,real *);
void INT123_dct36_neon (real *,real *,real *,const real *,real *);
void INT123_dct36_neon64 (real *,real *,real *,const real *,real *);

/* define CUT_SFB21 if you want to cut-off the frequency above 16kHz */
#if 0
Expand Down Expand Up @@ -1256,10 +1264,7 @@ static void III_antialias(real xr[SBLIMIT][SSLIMIT],struct gr_info_s *gr_info)
Mathematics of Computation, Volume 32, Number 141, January 1978,
Pages 175-199
*/

/* Calculation of the inverse MDCT
used to be static without 3dnow - does that really matter? */
void INT123_dct36(real *inbuf,real *o1,real *o2,const real *wintab,real *tsbuf)
static void INT123_dct36(real *inbuf,real *o1,real *o2,const real *wintab,real *tsbuf)
{
real tmp[18];

Expand Down Expand Up @@ -1449,6 +1454,105 @@ void INT123_dct36(real *inbuf,real *o1,real *o2,const real *wintab,real *tsbuf)
}
}

// Wrap the assembly routine calls into C functions that serve as jump targets to satisfy
// indirect branch protection if the toolchain enables that. Otherwise, we'd need to anticipate
// that in the assembly (and ensure assemblers support endbr64 and friends).
// Loss of efficiency: one additional call per dct36 invocation through the function pointer.

// In the case of one static optimization choice, we do not have that problem.

#ifdef OPT_THE_DCT36

/*
	Define a static C function named <asmfunc>_wrap with the dct36 signature
	that simply forwards all five arguments to the assembly routine asmfunc.
	The wrapper is what gets stored in function pointers, so that indirect
	branches land in compiler-generated code instead of hand-written assembly.

	The call is a plain statement, not "return asmfunc(...)": the wrapper
	returns void, and ISO C does not allow a return statement with an
	expression in a function whose return type is void.
*/
#define DCT36_WRAP(asmfunc) \
static void asmfunc ## _wrap(real *inbuf,real *o1,real *o2,const real *wintab,real *tsbuf) \
{ \
	asmfunc(inbuf, o1, o2, wintab, tsbuf); \
}

/* Instantiate one static wrapper (<name>_wrap) for each assembly dct36
   variant whose OPT_* option is enabled in this build. */
#ifdef OPT_SSE
DCT36_WRAP(INT123_dct36_sse)
#endif
#ifdef OPT_3DNOWEXT_VINTAGE
DCT36_WRAP(INT123_dct36_3dnowext)
#endif
#ifdef OPT_3DNOW_VINTAGE
DCT36_WRAP(INT123_dct36_3dnow)
#endif
#ifdef OPT_X86_64
DCT36_WRAP(INT123_dct36_x86_64)
#endif
#ifdef OPT_AVX
DCT36_WRAP(INT123_dct36_avx)
#endif
#ifdef OPT_NEON
DCT36_WRAP(INT123_dct36_neon)
#endif
#ifdef OPT_NEON64
DCT36_WRAP(INT123_dct36_neon64)
#endif

/*
	Tell whether the dct36 routine currently stored in the handle is the one
	belonging to decoder type t.
	Only sse, dreidnowext_vintage and dreidnow_vintage are checked (each only
	if enabled in the build); any other type yields 0.
	Returns 1 on a match, 0 otherwise.
*/
int INT123_dct36_match(mpg123_handle *fr, enum optdec t)
{
	switch(t)
	{
#ifdef OPT_SSE
		case sse:
			return fr->cpu_opts.the_dct36 == INT123_dct36_sse_wrap;
#endif
#ifdef OPT_3DNOWEXT_VINTAGE
		case dreidnowext_vintage:
			return fr->cpu_opts.the_dct36 == INT123_dct36_3dnowext_wrap;
#endif
#ifdef OPT_3DNOW_VINTAGE
		case dreidnow_vintage:
			return fr->cpu_opts.the_dct36 == INT123_dct36_3dnow_wrap;
#endif
		default:
			return 0;
	}
}

/*
	Select the dct36 implementation for the decoder type in fr->cpu_opts.type
	and store it in fr->cpu_opts.the_dct36.

	Every optimized choice is the C wrapper (<name>_wrap) around the assembly
	routine, never the assembly routine itself, so that the indirect call
	through the_dct36 always lands in C code.
	Decoder types without a dedicated routine in this build get the generic
	C implementation INT123_dct36.
*/
void INT123_dct36_choose(mpg123_handle *fr)
{
	switch(fr->cpu_opts.type)
	{
#ifdef OPT_SSE
		case sse:
			fr->cpu_opts.the_dct36 = INT123_dct36_sse_wrap;
		break;
#endif
#ifdef OPT_3DNOWEXT_VINTAGE
		case dreidnowext_vintage:
			fr->cpu_opts.the_dct36 = INT123_dct36_3dnowext_wrap;
		break;
#endif
#ifdef OPT_3DNOW_VINTAGE
		case dreidnow_vintage:
			fr->cpu_opts.the_dct36 = INT123_dct36_3dnow_wrap;
		break;
#endif
#ifdef OPT_AVX
		case avx:
			fr->cpu_opts.the_dct36 = INT123_dct36_avx_wrap;
		break;
#endif
#ifdef OPT_X86_64
		case x86_64:
			fr->cpu_opts.the_dct36 = INT123_dct36_x86_64_wrap;
		break;
#endif
#ifdef OPT_NEON
		case neon:
			fr->cpu_opts.the_dct36 = INT123_dct36_neon_wrap;
		break;
#endif
#ifdef OPT_NEON64
		/* Must be neon64, not neon: the decoder type set for this build option is neon64. */
		case neon64:
			fr->cpu_opts.the_dct36 = INT123_dct36_neon64_wrap;
		break;
#endif
		default:
			fr->cpu_opts.the_dct36 = INT123_dct36;
	}
}

#endif

/* new DCT12 */
static void dct12(real *in,real *rawout1,real *rawout2,register const real *wi,register real *ts)
Expand Down
54 changes: 8 additions & 46 deletions src/libmpg123/optimize.c
Original file line number Diff line number Diff line change
Expand Up @@ -160,8 +160,8 @@ static enum optdec sse_or_vintage(mpg123_handle *fr)
enum optdec type;
type = sse_vintage;
# ifdef OPT_SSE
# ifdef OPT_MULTI
if(fr->cpu_opts.the_dct36 == INT123_dct36_sse)
# ifdef OPT_THE_DCT36
if(INT123_dct36_match(fr, sse))
# endif
type = sse;
# endif
Expand Down Expand Up @@ -192,7 +192,7 @@ static int find_dectype(mpg123_handle *fr)
type = dreidnowext;
# ifdef OPT_3DNOWEXT_VINTAGE
# ifdef OPT_MULTI
if(fr->cpu_opts.the_dct36 == INT123_dct36_3dnowext)
if(INT123_dct36_match(fr, dreidnowext_vintage))
# endif
type = dreidnowext_vintage;
# endif
Expand All @@ -210,7 +210,7 @@ static int find_dectype(mpg123_handle *fr)
type = dreidnow;
# ifdef OPT_3DNOW_VINTAGE
# ifdef OPT_MULTI
if(fr->cpu_opts.the_dct36 == INT123_dct36_3dnow)
if(INT123_dct36_match(fr, dreidnow_vintage))
# endif
type = dreidnow_vintage;
# endif
Expand Down Expand Up @@ -503,13 +503,6 @@ int INT123_frame_cpu_opt(mpg123_handle *fr, const char* cpu)
#endif

fr->cpu_opts.type = nodec;
#ifdef OPT_MULTI
#ifndef NO_LAYER3
#if (defined OPT_3DNOW_VINTAGE || defined OPT_3DNOWEXT_VINTAGE || defined OPT_SSE || defined OPT_X86_64 || defined OPT_AVX || defined OPT_NEON || defined OPT_NEON64)
fr->cpu_opts.the_dct36 = INT123_dct36;
#endif
#endif
#endif
/* covers any i386+ cpu; they actually differ only in the INT123_synth_1to1 function, mostly... */
#ifdef OPT_X86
if(cpu_i586(fr->cpu_flags))
Expand All @@ -523,11 +516,6 @@ int INT123_frame_cpu_opt(mpg123_handle *fr, const char* cpu)
{
chosen = dn_sse;
fr->cpu_opts.type = sse;
#ifdef OPT_MULTI
# ifndef NO_LAYER3
/* if(cpu_fast_sse(fr->cpu_flags)) */ fr->cpu_opts.the_dct36 = INT123_dct36_sse;
# endif
#endif
# ifndef NO_16BIT
fr->synths.plain[r_1to1][f_16] = INT123_synth_1to1_sse;
# ifdef ACCURATE_ROUNDING
Expand Down Expand Up @@ -590,11 +578,6 @@ int INT123_frame_cpu_opt(mpg123_handle *fr, const char* cpu)
{
chosen = dn_dreidnowext_vintage;
fr->cpu_opts.type = dreidnowext_vintage;
#ifdef OPT_MULTI
# ifndef NO_LAYER3
fr->cpu_opts.the_dct36 = INT123_dct36_3dnowext;
# endif
#endif
# ifndef NO_16BIT
fr->synths.plain[r_1to1][f_16] = INT123_synth_1to1_3dnowext;
# endif
Expand All @@ -619,11 +602,6 @@ int INT123_frame_cpu_opt(mpg123_handle *fr, const char* cpu)
{
chosen = dn_dreidnow_vintage;
fr->cpu_opts.type = dreidnow_vintage;
#ifdef OPT_MULTI
# ifndef NO_LAYER3
fr->cpu_opts.the_dct36 = INT123_dct36_3dnow;
# endif
#endif
# ifndef NO_16BIT
fr->synths.plain[r_1to1][f_16] = INT123_synth_1to1_3dnow;
# endif
Expand Down Expand Up @@ -723,11 +701,6 @@ int INT123_frame_cpu_opt(mpg123_handle *fr, const char* cpu)
{
chosen = "x86-64 (AVX)";
fr->cpu_opts.type = avx;
#ifdef OPT_MULTI
# ifndef NO_LAYER3
fr->cpu_opts.the_dct36 = INT123_dct36_avx;
# endif
#endif
# ifndef NO_16BIT
fr->synths.plain[r_1to1][f_16] = INT123_synth_1to1_avx;
fr->synths.stereo[r_1to1][f_16] = INT123_synth_1to1_stereo_avx;
Expand All @@ -749,11 +722,6 @@ int INT123_frame_cpu_opt(mpg123_handle *fr, const char* cpu)
{
chosen = "x86-64 (SSE)";
fr->cpu_opts.type = x86_64;
#ifdef OPT_MULTI
# ifndef NO_LAYER3
fr->cpu_opts.the_dct36 = INT123_dct36_x86_64;
# endif
#endif
# ifndef NO_16BIT
fr->synths.plain[r_1to1][f_16] = INT123_synth_1to1_x86_64;
fr->synths.stereo[r_1to1][f_16] = INT123_synth_1to1_stereo_x86_64;
Expand Down Expand Up @@ -796,11 +764,6 @@ int INT123_frame_cpu_opt(mpg123_handle *fr, const char* cpu)
{
chosen = dn_neon;
fr->cpu_opts.type = neon;
#ifdef OPT_MULTI
# ifndef NO_LAYER3
fr->cpu_opts.the_dct36 = INT123_dct36_neon;
# endif
#endif
# ifndef NO_16BIT
fr->synths.plain[r_1to1][f_16] = INT123_synth_1to1_neon;
fr->synths.stereo[r_1to1][f_16] = INT123_synth_1to1_stereo_neon;
Expand Down Expand Up @@ -834,11 +797,6 @@ int INT123_frame_cpu_opt(mpg123_handle *fr, const char* cpu)
{
chosen = dn_neon64;
fr->cpu_opts.type = neon64;
#ifdef OPT_MULTI
# ifndef NO_LAYER3
fr->cpu_opts.the_dct36 = INT123_dct36_neon64;
# endif
#endif
# ifndef NO_16BIT
fr->synths.plain[r_1to1][f_16] = INT123_synth_1to1_neon64;
fr->synths.stereo[r_1to1][f_16] = INT123_synth_1to1_stereo_neon64;
Expand Down Expand Up @@ -897,6 +855,10 @@ int INT123_frame_cpu_opt(mpg123_handle *fr, const char* cpu)
# endif
# endif

#ifdef OPT_THE_DCT36
INT123_dct36_choose(fr);
#endif

#ifdef OPT_DITHER
if(done && dithered)
{
Expand Down
9 changes: 3 additions & 6 deletions src/libmpg123/optimize.h
Original file line number Diff line number Diff line change
Expand Up @@ -364,19 +364,16 @@ extern const int INT123_costab_mmxsse[];
#endif
#endif

/*
Now come two blocks of standard definitions for multi-decoder mode and single-decoder mode.
Most stuff is so automatic that it's indeed generated by some inline shell script.
Remember to use these scripts when possible, instead of direct repetitive hacking.
*/

#ifdef OPT_MULTI

# define defopt nodec

# ifndef NO_LAYER3
# if (defined OPT_3DNOW_VINTAGE || defined OPT_3DNOWEXT_VINTAGE || defined OPT_SSE || defined OPT_X86_64 || defined OPT_AVX || defined OPT_NEON || defined OPT_NEON64)
# define OPT_THE_DCT36
# define opt_dct36(fr) ((fr)->cpu_opts.the_dct36)
# endif
# endif

#endif /* OPT_MULTI else */

Expand Down

0 comments on commit 6425108

Please sign in to comment.