Skip to content

Commit

Permalink
Merge branch 'master' of jsoftware.com:jsource
Browse files Browse the repository at this point in the history
  • Loading branch information
moon-chilled committed Sep 13, 2022
2 parents f92242f + 2d26122 commit 7a3945d
Show file tree
Hide file tree
Showing 22 changed files with 783 additions and 266 deletions.
6 changes: 3 additions & 3 deletions jsrc/ar.c
Original file line number Diff line number Diff line change
Expand Up @@ -228,7 +228,7 @@ REDUCCPFX(tymesinsO, D, I, TYMESO)
acc1=prim(acc1,acc5); acc2=prim(acc2,acc6); acc3=prim(acc3,acc7); acc0=prim(acc0,acc4); \
acc2=prim(acc2,acc3); acc0=prim(acc0,acc1); acc0=prim(acc0,acc2); /* combine accumulators vertically */ \
acc0=prim(acc0,_mm256_permute4x64_pd(acc0,0b11111110)); acc0=prim(acc0,_mm256_permute_pd(acc0,0xf)); /* combine accumulators horizontally 01+=23, 0+=1 */ \
*(I*)z=_mm256_extract_epi64(_mm256_castpd_si256(acc0),0x0); ++z; /* store the single result from 0 */ \
*(I*)z=_mm256_extract_epi64(_mm256_castpd_si256(acc0),0x0); /* AVX2 *z=_mm256_cvtsd_f64(acc0); */ ++z; /* store the single result */ \
)

// f/ on rank>1, going down columns to save bandwidth
Expand Down Expand Up @@ -429,8 +429,8 @@ DF1(jtcompsum){
c0=_mm256_add_pd(c0,_mm256_permute_pd(c0,0xf)); acc1=_mm256_permute_pd(acc0,0xf); // combine c0+c1, acc1<-1
TWOSUM(acc0,acc1,acc0,c1); c0=_mm256_add_pd(c0,c1); // combine 0123, combine all low parts
acc0=_mm256_add_pd(acc0,c0); // add low parts back into high in case there is overlap
*(I*)zv=_mm256_extract_epi64(_mm256_castpd_si256(acc0),0x0); ++zv; // store the single result
// _mm_storel_pd(zv++,_mm256_castpd256_pd128(acc0));
*(I*)zv=_mm256_extract_epi64(_mm256_castpd_si256(acc0),0x0); /* AVX2 *zv=_mm256_cvtsd_f64(acc0);*/ ++zv; // store the single result
// obsolete _mm_storel_pd(zv++,_mm256_castpd256_pd128(acc0));
}
}else{
// rank>1, going down columns to save bandwidth and add accuracy
Expand Down
7 changes: 4 additions & 3 deletions jsrc/cv.c
Original file line number Diff line number Diff line change
Expand Up @@ -74,9 +74,10 @@ F2(jtfit){F2PREFIP;A f;C c;I k,l,m,r;V*sv;
case CNOT: case CXCO: case CSPARSE: case CEBAR:
R fitct(a,w,cno);
case CQQ: ;
RE(wval=i0(w)); ASSERT(wval==0,EVDOMAIN); // only f"r!.0 is supported
ASSERT(sv->valencefns[1]==jtsumattymes1,EVDOMAIN) // Must be +/@:*"1!:0
R CDERIV(CFIT,0,jtsumattymes1,VIRS2, m,l,r); // supports IRS
RE(wval=i0(w)); ASSERT(BETWEENC(wval,0,1),EVDOMAIN); // only f"r!.[01] is supported
ASSERT(sv->valencefns[1]==jtsumattymes1,EVDOMAIN) // Must be +/@:*"1!:[01]
RZ(f=CDERIV(CFIT,0,jtsumattymes1,VIRS2, m,l,r)); // supports IRS
FAV(f)->localuse.lu1.fittype=wval; R f;
case CSLASH: ;
RE(wval=i0(w)); ASSERT(wval==0,EVDOMAIN); // only f/!.0 is supported
ASSERT(FAV(sv->fgh[0])->id==CPLUS,EVDOMAIN) // Must be +/!:0
Expand Down
2 changes: 1 addition & 1 deletion jsrc/d.c
Original file line number Diff line number Diff line change
Expand Up @@ -171,7 +171,7 @@ void jtdebdisp(J jt,DC d){A*x,y;I e,t;
switch(t){
case DCPARSE: dhead(3,d); seeparse(d); if(NETX==jt->etxn)--jt->etxn; eputc(CLF); break;
case DCCALL: dhead(0,d); seecall(d); eputc(CLF); break;
case DCSCRIPT: dhead(0,d); efmt("[-"FMTI"] ", d->dcn-1);
case DCSCRIPT: dhead(0,d); efmt("[-"FMTI"] ", d->dcn); // keep the line number as 1-origin since that's what editors do
if(0<=d->dcm){READLOCK(JT(jt,startlock)) y=AAV(JT(jt,slist))[d->dcm]; ep(AN(y),CAV(y)); READUNLOCK(JT(jt,startlock))}
eputc(CLF); break;
}}
Expand Down
11 changes: 9 additions & 2 deletions jsrc/j.c
Original file line number Diff line number Diff line change
Expand Up @@ -14,11 +14,18 @@ struct Bd2 {I hdr[AKXR(0)/SZI]; D v[2];};
#define CREBLOCKATOMV2(name,t,v1,v2) struct Bd2 __attribute__((aligned(CACHELINESIZE))) B##name={{AKXR(0),(t)&TRAVERSIBLE,0,(t),ACPERMANENT,1,0},{v1,v2}};
CREBLOCKATOMV2(a0j1,CMPX,0.0,1.0) // 0j1
#if SY_64
#define CBAIVAL(t,v) {7*SZI,(t)&TRAVERSIBLE,0,(t),ACPERMANENT,1,0,(v)}
#define CBAIVALM(t,v,m) {7*SZI,(t)&TRAVERSIBLE,m,(t),ACPERMANENT,1,0,(v)}
#else
#define CBAIVAL(t,v) {8*SZI,(t)&TRAVERSIBLE,0,(t),ACPERMANENT,1,0,0,(v)}
#define CBAIVALM(t,v,m) {8*SZI,(t)&TRAVERSIBLE,m,(t),ACPERMANENT,1,0,0,(v)}
#endif
#define CBAIVAL(t,v) CBAIVALM(t,v,0)
#define CREBLOCKATOMI(name,t,v) I __attribute__((aligned(CACHELINESIZE))) B##name[9-SY_64]=CBAIVAL(t,v);
#define CREBLOCKATOMGMP(name,t,v,m) I __attribute__((aligned(CACHELINESIZE))) B##name[9-SY_64]=CBAIVALM(t,v,m);
CREBLOCKATOMGMP(X0,LIT,0,0) // X block representing GMP 0 - AN=1, AM=0, val=immaterial
CREBLOCKATOMGMP(X1,LIT,1,1) // X block representing GMP 1 - AN=1, AM=1, val=1
struct Bxnum0 {I hdr[AKXR(0)/SZI]; X v[1];};
#define CREBLOCKATOMXNUM(name,v) struct Bxnum0 __attribute__((aligned(CACHELINESIZE))) B##name={{AKXR(0),XNUM&TRAVERSIBLE,0,XNUM,ACPERMANENT,1,0},{(X)B##v}};
CREBLOCKATOMXNUM(xnum1,X1) // XNUM block for 1
#define CREBLOCKVEC0(name,t) I __attribute__((aligned(CACHELINESIZE))) B##name[8]={8*SZI,(t)&TRAVERSIBLE,0,(t),ACPERMANENT,0,1,0}; // no padding at end - no atoms should be referenced
CREBLOCKVEC0(aqq,LIT) // ''
CREBLOCKVEC0(mtv,B01) // i.0 boolean
Expand Down
16 changes: 11 additions & 5 deletions jsrc/j.h
Original file line number Diff line number Diff line change
Expand Up @@ -1237,7 +1237,7 @@ if(likely(!((I)jtinplace&JTWILLBEOPENED)))z=EPILOGNORET(z); RETF(z); \
#define GAE0(v,t,n,r,erraction) {HISTOCALL if(unlikely(!(v=jtga0(jt,(I)(t),(I)(r),(I)(n)))))erraction; AN(v)=(n);} // used when shape=0 and rank is never 1 or will always be filled in by user even if rank 1
#endif
#define GAE(v,t,n,r,s,erraction) {GAE0(v,t,n,r,erraction) MCISH(AS(v),(I*)(s),(r))} // error action
#define GA00(v,t,n,r) {GAE0(v,t,n,r,R 0)} // used when rank will always be filled in by user. Default error action is to exit
#define GA00(v,t,n,r) {GAE0(v,t,n,r,R 0)} // used when shape will always be filled in by user. Default error action is to exit
#define GA(v,t,n,r,s) {GA00(v,t,n,r) MCISH(AS(v),(I*)(s),(r))} // s points to shape
#define GA0(v,t,n,r) {GA00(v,t,n,r) *((r)==1?AS(v):jt->shapesink)=(n);} // used when shape=0 but rank may be 1 and must fill in with AN if so - never for sparse blocks
#define GA10(v,t,n) {GA00(v,t,n,1) AS(v)[0]=(n);} // used when rank is known to be 1
Expand Down Expand Up @@ -1906,12 +1906,18 @@ if(likely(type _i<3)){z=(I)&oneone; z=type _i>1?(I)_zzt:z; _zzt=type _i<1?(I*)z:
#define LGSZS 1 // lg (bytes in an S)

#if (C_AVX2&&SY_64) || EMU_AVX2
// create double-precision product of inputs
// create double-precision product of inputs. outhi must not be an input; outlo can
#define TWOPROD(in0,in1,outhi,outlo) outhi=_mm256_mul_pd(in0,in1); outlo=_mm256_fmsub_pd(in0,in1,outhi);
// create double-precision sum of inputs, where it is not known which is larger NOTE in0 and outhi might be identical. Needs t and signbit.
// create double-precision sum of inputs, where it is not known which is larger NOTE in0 and outhi might be identical. outlo must not be an input. Needs sgnbit.
#define TWOSUM(in0,in1,outhi,outlo) {__m256d t=_mm256_andnot_pd(sgnbit,in0); outlo=_mm256_andnot_pd(sgnbit,in1); t=_mm256_sub_pd(t,outlo); \
outlo=_mm256_blendv_pd(in0,in1,t); t=_mm256_blendv_pd(in1,in0,t); \
outhi=_mm256_add_pd(in0,in1); outlo=_mm256_sub_pd(outlo,outhi); outlo=_mm256_add_pd(outlo,t);} // 1 if in1 larger; select outlo=max t=min
outlo=_mm256_blendv_pd(in0,in1,t); t=_mm256_blendv_pd(in1,in0,t); /* outlo=val with larger abs t=val with smaller abs */ \
outhi=_mm256_add_pd(in0,in1); /* single-prec sum */ \
outlo=_mm256_sub_pd(outlo,outhi); /* big-(big+small): implied val of -small after rounding */ \
outlo=_mm256_add_pd(outlo,t);} // amt by which actual value exceeds implied: this is the lost low precision
// Same, but we know which argument is bigger. outhi cannot be an input; outlo can be the same as inbig
#define TWOSUMBS(inbig,insmall,outhi,outlo) {outhi=_mm256_add_pd(inbig,insmall); /* single-prec sum */ \
outlo=_mm256_sub_pd(inbig,outhi); /* big-(big+small): implied val of -small after rounding */ \
outlo=_mm256_add_pd(outlo,insmall);} // amt by which actual value exceeds implied: this is the lost low precision
#define DPADD(hi0,lo0,hi1,lo1,outhi,outlo) outhi=_mm256_add_pd(hi0,hi1); outlo=_mm256_add_pd(lo0,lo1);
#else
#define TWOSPLIT(a,x,y) y=(a)*134217730.0; x=y-(a); x=y-x; y=(a)-x; // must avoid compiler tuning
Expand Down
5 changes: 4 additions & 1 deletion jsrc/ja.h
Original file line number Diff line number Diff line change
Expand Up @@ -428,7 +428,10 @@
#define fplus(x,y) jtfplus(jt,(x),(y))
#define fpoly(x,y) jtfpoly(jt,(x),(y))
#define fpolyc(x) jtfpolyc(jt,(x))
#define fr(x) {if(likely((x)!=0)){I Zs = AC(x); if(likely(!ACISPERM(Zs))){if(likely(--Zs<=0))mf(x);else AC(x)=Zs;}}} // use fr for known nonrecursives, and for locales
#define gmpmfree(x) {I allocsize = AN(x)+AKXR(0); jt->bytes-=allocsize; jt->malloctotal-=allocsize; jt->mfreegenallo-=allocsize; free(x);}
#define frcommon(x,f) {if(likely((x)!=0)){I Zs = AC(x); if(likely(!ACISPERM(Zs))){if(likely(--Zs<=0)){f(x);}else AC(x)=Zs;}}} // use fr for known nonrecursives, and for locales
#define fr(x) frcommon(x,mf)
#define frgmp(x) frcommon(x,gmpmfree) // to free GMP blocks
#define fram(x0,x1,x2,x3,x4) jtfram(jt,(x0),(x1),(x2),(x3),(x4))
#define from(x,y) jtfrom(jt,(x),(y))
#define frombs(x,y) jtfrombs(jt,(x),(y))
Expand Down
1 change: 1 addition & 0 deletions jsrc/je.h
Original file line number Diff line number Diff line change
Expand Up @@ -448,6 +448,7 @@ extern F2(jtdomainerr2);
extern F2(jtdot);
extern F2(jtdrop);
extern F2(jtebar);
extern F2(jtekupdate);
extern F2(jteps);
extern F2(jtetoiso8601);
extern F2(jtiso8601toe);
Expand Down
1 change: 1 addition & 0 deletions jsrc/jtype.h
Original file line number Diff line number Diff line change
Expand Up @@ -1006,6 +1006,7 @@ typedef struct {
A cachedref; // for namerefs ('name'~), the cached value, or 0 if not cached
AF fork2hfn; // for dyad fork that is NOT a comparison combination or jtintersect, the function to call to process h (might be in h@][)
I forcetask; // for t., the flags extracted from n. Bits 0-7=thread pool; bit 8=worker thread only
I fittype; // for u!.t where t is a code, its value is stored here in the CFIT block
} lu1; // this is the high-use stuff in the second cacheline
};
} localuse; // always 16 bytes, 4 I4s
Expand Down
3 changes: 2 additions & 1 deletion jsrc/m.c
Original file line number Diff line number Diff line change
Expand Up @@ -976,7 +976,8 @@ void jtfamftrav(J jt,AD* RESTRICT wd,I t){I n=AN(wd);
}else if(t&SYMB){wd=jtfreesymtab(jt,wd,AR(wd)); // SYMB is used as a flag; we test here AFTER NAME and ADV which are lower bits
} else if(t&(RAT|XNUM|XD)) {A* RESTRICT v=AAV(wd);
// single-level indirect forms. handle each block
DQ(t&RAT?2*n:n, if(*v)fr(*v); ++v;);
DQ(t&RAT?2*n:n, if(*v)if(AT(*v)&LIT){frgmp(*v);}else fr(*v); ++v;);
// obsolete DQ(t&RAT?2*n:n, if(*v)fr(*v); ++v;);
}else if(ISSPARSE(t)){P* RESTRICT v=PAV(wd);
fana(SPA(v,a)); fana(SPA(v,e)); fana(SPA(v,i)); fana(SPA(v,x));
// for sparse, decrement the usecount
Expand Down
Loading

0 comments on commit 7a3945d

Please sign in to comment.