diff --git a/source/DlgProperty.cpp b/source/DlgProperty.cpp index 810ca04..b51dbc6 100644 --- a/source/DlgProperty.cpp +++ b/source/DlgProperty.cpp @@ -18,9 +18,10 @@ CDlgProperty::CDlgProperty(CWnd* pParent /*=NULL*/) , bUseCUDAFFT(FALSE) , m_EnFastSeek(TRUE) , m_bEnableAVX2(FALSE) + , m_bEnableAVX512(FALSE) //220417 , m_EnCUDAStream(TRUE) //190620 { - Init(1, false, false, 0, CUDA_BLOCKSIZE, CUDA_WARPSIZE, + Init(1, false, false, false, 0, CUDA_BLOCKSIZE, CUDA_WARPSIZE, 0, ATISTREAM_MAXWORK, ATISTREAM_UNITWORK); } @@ -28,7 +29,7 @@ CDlgProperty::~CDlgProperty() { } -void CDlgProperty::Init(int icpu, bool bsimd, bool bavx2, +void CDlgProperty::Init(int icpu, bool bsimd, bool bavx2, bool bavx512, //220417 int iCudaCount, int iCudaBlock, int iCudaWarp, int iATIcount, int iATImaxwork, int iATIunitwork, int iProcessorType) { //CGazoApp* pApp = (CGazoApp*) AfxGetApp(); @@ -38,6 +39,8 @@ void CDlgProperty::Init(int icpu, bool bsimd, bool bavx2, if (bSIMD) bEnableSIMD = TRUE;//160918 enabled again bAVX2 = bavx2; if (bAVX2) m_bEnableAVX2 = TRUE; + bAVX512 = bavx512; //220417 + if (bAVX512) m_bEnableAVX512 = TRUE; //220417 maxCUDA = iCudaCount; maxCUDAThreadsPerBlock = iCudaBlock; iCUDAwarpsize = iCudaWarp; @@ -84,6 +87,7 @@ void CDlgProperty::DoDataExchange(CDataExchange* pDX) DDX_Control(pDX, IDC_PROP_ATISTREAMNWORK, m_ATIstreamNwork); DDX_Check(pDX, IDC_PROP_ENFASTSEEK, m_EnFastSeek); DDX_Check(pDX, IDC_PROP_AVX2, m_bEnableAVX2); + DDX_Check(pDX, IDC_PROP_AVX512, m_bEnableAVX512); DDX_Check(pDX, IDC_PROP_CUDASTREAM, m_EnCUDAStream); } @@ -94,6 +98,7 @@ ON_BN_CLICKED(IDC_PROP_CUDAGPU, &CDlgProperty::OnBnClickedCudagpu) ON_BN_CLICKED(IDC_PROP_ATISTREAM, &CDlgProperty::OnBnClickedPropAtistream) ON_BN_CLICKED(IDC_PROP_SIMD, &CDlgProperty::OnBnClickedPropSimd) ON_BN_CLICKED(IDC_PROP_INFO, &CDlgProperty::OnBnClickedPropInfo) +ON_BN_CLICKED(IDC_PROP_AVX2, &CDlgProperty::OnBnClickedPropAvx2) END_MESSAGE_MAP() @@ -110,6 +115,7 @@ BOOL CDlgProperty::OnInitDialog() 
r_ProcessorType = m_ProcessorType; rEnableSIMD = bEnableSIMD; rEnableAVX2 = m_bEnableAVX2; + rEnableAVX512 = m_bEnableAVX512;//220417 rCUDAnblock = iCUDAnblock; r_EnReport = m_EnReport; r_EnFastSeek = m_EnFastSeek; @@ -218,6 +224,7 @@ void CDlgProperty::EnableCtrl() { // GetDlgItem(IDC_PROP_SIMD)->EnableWindow(FALSE); GetDlgItem(IDC_PROP_AVX2)->EnableWindow(FALSE); + GetDlgItem(IDC_PROP_AVX512)->EnableWindow(FALSE);//220417 GetDlgItem(IDC_PROP_NCPU)->EnableWindow(FALSE); GetDlgItem(IDC_PROP_NGPU)->EnableWindow(FALSE); GetDlgItem(IDC_PROP_CUDANBLOCK)->EnableWindow(FALSE); @@ -233,6 +240,7 @@ void CDlgProperty::EnableCtrl() { if (bSIMD) { GetDlgItem(IDC_PROP_SIMD)->EnableWindow(TRUE); if (bAVX2 && bEnableSIMD) GetDlgItem(IDC_PROP_AVX2)->EnableWindow(TRUE); + if (bAVX512 && m_bEnableAVX2 && bEnableSIMD) GetDlgItem(IDC_PROP_AVX512)->EnableWindow(TRUE);//220417 } GetDlgItem(IDC_PROP_NCPU)->EnableWindow(TRUE); break;} @@ -317,6 +325,7 @@ void CDlgProperty::OnCancel() iCPU = rCPU; bEnableSIMD = rEnableSIMD; m_bEnableAVX2 = rEnableAVX2; + m_bEnableAVX512 = rEnableAVX512;//220417 iCUDA = rCUDA; iCUDAnblock = rCUDAnblock; bUseCUDAFFT = r_UseCUDAFFT; @@ -344,3 +353,10 @@ void CDlgProperty::OnBnClickedPropInfo() CGazoApp* pApp = (CGazoApp*) AfxGetApp(); pApp->OnViewError(); } + + +void CDlgProperty::OnBnClickedPropAvx2() +{ + UpdateData(); + EnableCtrl(); +} diff --git a/source/DlgProperty.h b/source/DlgProperty.h index 413c4cb..f7969bf 100644 --- a/source/DlgProperty.h +++ b/source/DlgProperty.h @@ -25,13 +25,13 @@ class CDlgProperty : public CDialog DECLARE_MESSAGE_MAP() public: virtual BOOL OnInitDialog(); - void Init(int icpu, bool bsimd, bool bavx2, + void Init(int icpu, bool bsimd, bool bavx2, bool bavx512, //220417 int iCudaCount, int iCudaBlock, int iCudaWarp, int iATIcount, int iATImaxwork, int iATIunitwork, int iProcessorType = -1); CComboBox m_CPU; CComboBox m_Memory; int iCPU, iMemory; - bool bSIMD, bAVX2; + bool bSIMD, bAVX2, bAVX512;//220417 int iCUDA, 
iCUDAnblock, iCUDAwarpsize; int iATIstream, iATIstreamNwork, iATIstreamUnitwork; protected: @@ -41,7 +41,7 @@ class CDlgProperty : public CDialog int rCUDA, maxCUDA, rCUDAnblock, maxCUDAThreadsPerBlock; int r_ProcessorType; int rATIstream, maxATIstream, rATIstreamNwork, maxATIstreamNwork; - BOOL rEnableSIMD, rEnableAVX2; + BOOL rEnableSIMD, rEnableAVX2, rEnableAVX512; //220417 BOOL r_EnReport; BOOL r_UseCUDAFFT; BOOL r_EnFastSeek; @@ -66,7 +66,9 @@ class CDlgProperty : public CDialog CComboBox m_ATIstreamNwork; BOOL m_EnFastSeek; BOOL m_bEnableAVX2; + BOOL m_bEnableAVX512; // 220417 afx_msg void OnBnClickedPropSimd(); afx_msg void OnBnClickedPropInfo(); BOOL m_EnCUDAStream; + afx_msg void OnBnClickedPropAvx2(); }; diff --git a/source/DlgRenumFiles.cpp b/source/DlgRenumFiles.cpp index 0448017..f758406 100644 --- a/source/DlgRenumFiles.cpp +++ b/source/DlgRenumFiles.cpp @@ -230,13 +230,17 @@ void CDlgRenumFiles::OnOK() e3p.UnitLength(); CXyz e1p = e2 * e3p; CXyz e2p = e3p * e1p; - if (e3p.Length2() < 1E-6) { + //220118 if (e3p.Length2() < 1E-6) { + if (e1p.Length2() < 1E-6) { e2p = e3p * e1; e1p = e2p * e3p; } b[0] = e1p.X(e1); b[3] = e1p.X(e2); b[6] = e1p.X(e3); b[1] = e2p.X(e1); b[4] = e2p.X(e2); b[7] = e2p.X(e3); b[2] = e3p.X(e1); b[5] = e3p.X(e2); b[8] = e3p.X(e3); + //CString line, msg = ""; + //line.Format("%f %f %f\r\n%f %f %f\r\n%f %f %f\r\n", e1p.x, e1p.y, e1p.z, e2p.x, e2p.y, e2p.z, e3p.x, e3p.y, e3p.z); msg += line; + //line.Format("%f %f %f\r\n%f %f %f\r\n%f %f %f\r\n", b[0], b[1], b[2], b[3], b[4], b[5], b[6], b[7], b[8]); msg += line; TReal x1 = b[0] * (-ixcent) + b[3] * (-iycent) + b[6] * (-izcent); TReal y1 = b[1] * (-ixcent) + b[4] * (-iycent) + b[7] * (-izcent); @@ -244,7 +248,6 @@ void CDlgRenumFiles::OnOK() ixmin = (x1 < ixmin) ? (int)x1 : ixmin; ixmax = (x1 > ixmax) ? (int)x1 : ixmax; iymin = (y1 < iymin) ? (int)y1 : iymin; iymax = (y1 > iymax) ? (int)y1 : iymax; izmin = (z1 < izmin) ? (int)z1 : izmin; izmax = (z1 > izmax) ? 
(int)z1 : izmax; - //CString line, msg = ""; //line.Format("%d %d %d ==> %f %f %f\r\n", -ixcent, -iycent, -izcent, x1, y1, z1); msg += line; x1 = b[0] * (-ixcent + ixref-1) + b[3] * (-iycent) + b[6] * (-izcent); y1 = b[1] * (-ixcent + ixref-1) + b[4] * (-iycent) + b[7] * (-izcent); @@ -294,9 +297,6 @@ void CDlgRenumFiles::OnOK() ixmin = (x1 < ixmin) ? (int)x1 : ixmin; ixmax = (x1 > ixmax) ? (int)x1 : ixmax; iymin = (y1 < iymin) ? (int)y1 : iymin; iymax = (y1 > iymax) ? (int)y1 : iymax; izmin = (z1 < izmin) ? (int)z1 : izmin; izmax = (z1 > izmax) ? (int)z1 : izmax; - //CString line; - //line.Format("%d-%d %d-%d %d-&d", ixmin, ixmax, iymin, iymax, izmin, izmax); - //AfxMessageBox(line); //output images const unsigned int ixsize = ixmax - ixmin + 1; const unsigned int iysize = iymax - iymin + 1; @@ -321,6 +321,9 @@ void CDlgRenumFiles::OnOK() iz1min = (iz1min < 0) ? 0 : iz1min; nCache = (iz1max-iz1min+1 > nCache) ? iz1max-iz1min+1 : nCache; } + //line.Format("%d-%d %d-%d %d-%d %d", ixmin, ixmax, iymin, iymax, izmin, izmax, nCache); msg += line; + //AfxMessageBox(msg); + //return;//220118 //alloc memory MEMORYSTATUSEX memory; memory.dwLength = sizeof(memory); diff --git a/source/cudaReconst.cu b/source/cudaReconst.cu index f7fe879..15781d9 100644 --- a/source/cudaReconst.cu +++ b/source/cudaReconst.cu @@ -519,7 +519,11 @@ extern "C" int GetCudaNumberOfCores(int iDevice, int* piCores, int* piProcessors cudaDeviceProp deviceProp; if (cudaSuccess != cudaGetDeviceProperties(&deviceProp, iDevice)) { return CUDA_ERROR_DEVICE_GETPROPERTY; } if (deviceProp.major == 9999 && deviceProp.minor == 9999) { return CUDA_ERROR_VIRTUAL_DEVICE_DETECTED; }//virtual device - if (piCores) *piCores = _ConvertSMVer2Cores(deviceProp.major, deviceProp.minor); + //220422 if (piCores) *piCores = _ConvertSMVer2Cores(deviceProp.major, deviceProp.minor); + if (piCores) {//220422 + if ((deviceProp.major == 8) && (deviceProp.minor == 6)) *piCores = 128;//GA102-7 has 64 FP32 + 64 FP/INT32 = 128 cores 
per SM (NVIDIA Ampere GA102 GPU Architecture, 2021) + else *piCores = _ConvertSMVer2Cores(deviceProp.major, deviceProp.minor); + } if (piProcessors) *piProcessors = deviceProp.multiProcessorCount; return 0; } diff --git a/source/gazo.aps b/source/gazo.aps index 84f9697..fd1bf8f 100644 Binary files a/source/gazo.aps and b/source/gazo.aps differ diff --git a/source/gazo.cpp b/source/gazo.cpp index 49e21c3..8dbe06c 100644 --- a/source/gazo.cpp +++ b/source/gazo.cpp @@ -105,6 +105,7 @@ CGazoApp::CGazoApp() int iLogicalProcessorCount = 1; bool bSIMD = false; bool bAVX2 = false;//181222 + bool bAVX512 = false;//220417 __cpuid(CPUInfo, 0); int idmax = CPUInfo[0]; if (idmax >= 1) { @@ -123,8 +124,21 @@ CGazoApp::CGazoApp() if (idmax >= 7) { #ifdef _MSC_VER #if _MSC_VER >=1910 //VS2017 or later - __cpuidex(CPUInfo, 7, 0); - if (CPUInfo[1] & (1 << 5)) bAVX2 = true; + //__cpuidex(CPUInfo, 7, 0); + //if (CPUInfo[1] & (1 << 5)) bAVX2 = true; + //220417==> + if ((CPUInfo[2] & 0x08000000) && //CPUID.ECX.bit27 XSAVE enabled by OS => xgetbv available + (CPUInfo[2] & 0x10000000)) {//CPUID.ECX.bit28 AVX + const __int64 xcr0 = _xgetbv(0); + if ((xcr0 & 0x06) == 0x06) {//OS supports AVX + __cpuidex(CPUInfo, 7, 0); + if (CPUInfo[1] & 0x20) bAVX2 = true; //CPUID(7,0).EBX.bit5 + if ((xcr0 & 0xe0) == 0xe0) {//OS supports AVX-512 + if ((CPUInfo[1] & 0x00030000) == 0x00030000) bAVX512 = true;//512F & 512DQ + } + } + } + //==>220417 #endif #endif } @@ -155,6 +169,8 @@ CGazoApp::CGazoApp() else msg += " MMX+SSE+SSE2: not detected\r\n"; if (bAVX2) msg += " AVX2: detected\r\n"; else msg += " AVX2: not detected\r\n"; + if (bAVX512) msg += " AVX-512F+DQ: detected\r\n"; + else msg += " AVX-512F+DQ: not detected\r\n"; //memory MEMORYSTATUSEX memory; memory.dwLength = sizeof(memory); @@ -271,7 +287,7 @@ CGazoApp::CGazoApp() } } // - dlgProperty.Init(iAvailableCPU, bSIMD, bAVX2, + dlgProperty.Init(iAvailableCPU, bSIMD, bAVX2, bAVX512, iCUDAcount, iCUDAblock, iCUDAwarp, iATIcount, iATImaxwork, 
iATIunitwork, iProcessorType); //prevPixelWidth = -1; @@ -738,6 +754,8 @@ TErr CGazoApp::CalcAvgImage(CString path, CString* files, int nfiles) { void CGazoApp::OnTomoLsqfit() { + //201126 + const int nframes = 20; CString sPathName[] = { "", "" }; POSITION pos = GetFirstDocTemplatePosition(); while (pos) { @@ -752,9 +770,11 @@ void CGazoApp::OnTomoLsqfit() } } } - const int nframes = 20; + int nPathName[] = { 0,0 }; TCHAR path_buffer[_MAX_PATH]; TCHAR drive[_MAX_DRIVE]; TCHAR dir[_MAX_DIR]; TCHAR fnm[_MAX_FNAME]; TCHAR ext[_MAX_EXT]; + CFile file; + CMainFrame* pf = (CMainFrame*)AfxGetMainWnd(); for (int j = 0; j <= 1; j++) { _stprintf_s(path_buffer, _MAX_PATH, sPathName[j]); _tsplitpath_s(path_buffer, drive, _MAX_DRIVE, dir, _MAX_DIR, fnm, _MAX_FNAME, ext, _MAX_EXT); @@ -769,16 +789,24 @@ void CGazoApp::OnTomoLsqfit() sfnm.Format(fmt, i + idx0); _stprintf_s(fnm, _MAX_FNAME, sfnm); _tmakepath_s(path_buffer, _MAX_PATH, drive, dir, fnm, ext); - sPathName[j] += path_buffer; - sPathName[j] += "\r\n"; + CString msg = "Accessing "; + if (pf) pf->m_wndStatusBar.SetPaneText(0, msg + path_buffer); + if (file.Open(path_buffer, CFile::modeRead)) { + file.Close(); + sPathName[j] += path_buffer; + sPathName[j] += "\r\n"; + nPathName[j]++; + } + else break; } } - AfxMessageBox(sPathName[0] + "\r\n-----\r\n" + sPathName[1]); + if (pf) pf->m_wndStatusBar.SetPaneText(0, ""); + //AfxMessageBox(sPathName[0] + "\r\n-----\r\n" + sPathName[1]); CDlgLsqfit dlg; dlg.m_RefList = sPathName[0]; dlg.m_QryList = sPathName[1]; - dlg.nRefFiles = nframes; - dlg.nQryFiles = nframes; + dlg.nRefFiles = nPathName[0]; + dlg.nQryFiles = nPathName[1]; dlg.UpdateNfiles(); dlg.DoModal(); } diff --git a/source/gazo.rc b/source/gazo.rc index f07aa15..9640a9e 100644 --- a/source/gazo.rc +++ b/source/gazo.rc @@ -315,7 +315,7 @@ BEGIN ICON IDR_MAINFRAME,IDC_STATIC,11,17,20,20 DEFPUSHBUTTON "OK",IDOK,164,7,36,14,WS_GROUP LTEXT "CGazoApp::CGazoApp() sProgVersion",IDC_ABOUT_VER,42,11,115,16,SS_NOPREFIX | 
WS_TABSTOP - LTEXT "Copyright (C) 2008-2021 Ryuta Mizutani.\r\nAll rights reserved.",IDC_STATIC,46,31,154,17 + LTEXT "Copyright (C) 2008-2022 Ryuta Mizutani.\r\nAll rights reserved.",IDC_STATIC,46,31,154,17 LTEXT "https://mizutanilab.github.io/",IDC_STATIC,54,49,139,15 LTEXT "File->Open one of *.his, *.img, or *.h5 files.\r\nGo 'Tomography->Reconstruction' menu.\r\n Click 'Show image' to calcurate tomograms.\r\n Examine tomograms at top and bottom of the sample.\r\n Refine parameters as you like.\r\n Then issue or queue execution.",IDC_STATIC,17,67,179,60 LTEXT "Again File->Open one of rec*.tif files.\r\nGo 'Tomography->Histogram/conversion' menu.\r\n Click 'Select Image'\r\n to select files to convert.\r\n Move green and red lines in the histogram\r\n to define output LAC levels.\r\n Issue or queue execution.",IDC_STATIC,17,119,183,61 @@ -424,36 +424,37 @@ BEGIN EDITTEXT IDC_MSG_TEXT,7,7,173,129,ES_MULTILINE | ES_READONLY | ES_WANTRETURN | WS_VSCROLL | WS_HSCROLL END -IDD_PROPERTY DIALOGEX 0, 0, 163, 240 +IDD_PROPERTY DIALOGEX 0, 0, 163, 253 STYLE DS_SETFONT | DS_MODALFRAME | WS_POPUP | WS_CAPTION | WS_SYSMENU CAPTION "Computing configuration" FONT 9, "MS Sans Serif", 0, 0, 0x0 BEGIN - DEFPUSHBUTTON "OK",IDOK,95,219,28,14 - DEFPUSHBUTTON "Cancel",IDCANCEL,128,219,28,14 + DEFPUSHBUTTON "OK",IDOK,95,232,28,14 + DEFPUSHBUTTON "Cancel",IDCANCEL,128,232,28,14 COMBOBOX IDC_PROP_NCPU,81,47,43,70,CBS_DROPDOWNLIST | WS_VSCROLL | WS_TABSTOP LTEXT "CPU core",IDC_STATIC,21,50,48,8 COMBOBOX IDC_PROP_MEMORY,62,7,43,70,CBS_DROPDOWNLIST | WS_VSCROLL | WS_TABSTOP LTEXT "Memory usage",IDC_STATIC,12,9,48,8 CONTROL "Use SIMD instructions",IDC_PROP_SIMD,"Button",BS_AUTOCHECKBOX | WS_TABSTOP,21,63,85,10 CONTROL "On board x86/x64 CPU",IDC_PROP_INTELCPU,"Button",BS_AUTORADIOBUTTON | WS_GROUP,12,34,124,10 - CONTROL "NVIDIA CUDA GPU",IDC_PROP_CUDAGPU,"Button",BS_AUTORADIOBUTTON,12,76,137,10 - CONTROL "AMD Stream GPU",IDC_PROP_ATISTREAM,"Button",BS_AUTORADIOBUTTON,12,147,132,10 - GROUPBOX 
"Processor",IDC_STATIC,7,23,149,173 - COMBOBOX IDC_PROP_NGPU,81,89,43,70,CBS_DROPDOWNLIST | WS_VSCROLL | WS_TABSTOP - LTEXT "GPU device",IDC_STATIC,21,92,48,8 - COMBOBOX IDC_PROP_CUDANBLOCK,81,106,43,70,CBS_DROPDOWNLIST | WS_VSCROLL | WS_TABSTOP - LTEXT "Threads/block",IDC_STATIC,21,109,48,8 - CONTROL "Enable progress bar",IDC_PROP_ENREPORT,"Button",BS_AUTOCHECKBOX | WS_TABSTOP,9,220,78,10 - CONTROL "Use CUDA FFT routine",IDC_PROP_CUDAFFT,"Button",BS_AUTOCHECKBOX | WS_TABSTOP,21,121,85,10 - COMBOBOX IDC_PROP_NATISTREAM,81,160,43,70,CBS_DROPDOWNLIST | WS_VSCROLL | WS_TABSTOP - LTEXT "GPU device",IDC_STATIC,21,163,48,8 - COMBOBOX IDC_PROP_ATISTREAMNWORK,81,177,43,70,CBS_DROPDOWNLIST | WS_VSCROLL | WS_TABSTOP - LTEXT "Work group size",IDC_STATIC,21,180,57,8 - CONTROL "Enable fast file access",IDC_PROP_ENFASTSEEK,"Button",BS_AUTOCHECKBOX | WS_TABSTOP,9,204,84,10 - CONTROL "AVX/AVX2",IDC_PROP_AVX2,"Button",BS_AUTOCHECKBOX | WS_TABSTOP,106,63,47,10 - PUSHBUTTON "Info/err",IDC_PROP_INFO,127,200,29,14 - CONTROL "Enable stream",IDC_PROP_CUDASTREAM,"Button",BS_AUTOCHECKBOX | WS_TABSTOP,21,133,66,10 + CONTROL "NVIDIA CUDA GPU",IDC_PROP_CUDAGPU,"Button",BS_AUTORADIOBUTTON,12,90,137,10 + CONTROL "AMD Stream GPU",IDC_PROP_ATISTREAM,"Button",BS_AUTORADIOBUTTON,12,161,132,10 + GROUPBOX "Processor",IDC_STATIC,7,23,149,185 + COMBOBOX IDC_PROP_NGPU,81,103,43,70,CBS_DROPDOWNLIST | WS_VSCROLL | WS_TABSTOP + LTEXT "GPU device",IDC_STATIC,21,106,48,8 + COMBOBOX IDC_PROP_CUDANBLOCK,81,120,43,70,CBS_DROPDOWNLIST | WS_VSCROLL | WS_TABSTOP + LTEXT "Threads/block",IDC_STATIC,21,123,48,8 + CONTROL "Enable progress bar",IDC_PROP_ENREPORT,"Button",BS_AUTOCHECKBOX | WS_TABSTOP,9,234,78,10 + CONTROL "Use CUDA FFT routine",IDC_PROP_CUDAFFT,"Button",BS_AUTOCHECKBOX | WS_TABSTOP,21,135,85,10 + COMBOBOX IDC_PROP_NATISTREAM,81,174,43,70,CBS_DROPDOWNLIST | WS_VSCROLL | WS_TABSTOP + LTEXT "GPU device",IDC_STATIC,21,177,48,8 + COMBOBOX IDC_PROP_ATISTREAMNWORK,81,191,43,70,CBS_DROPDOWNLIST | WS_VSCROLL | 
WS_TABSTOP + LTEXT "Work group size",IDC_STATIC,21,194,57,8 + CONTROL "Enable fast file access",IDC_PROP_ENFASTSEEK,"Button",BS_AUTOCHECKBOX | WS_TABSTOP,9,218,84,10 + CONTROL "AVX/AVX2",IDC_PROP_AVX2,"Button",BS_AUTOCHECKBOX | WS_TABSTOP,21,76,47,10 + PUSHBUTTON "Info/err",IDC_PROP_INFO,127,214,29,14 + CONTROL "Enable stream",IDC_PROP_CUDASTREAM,"Button",BS_AUTOCHECKBOX | WS_TABSTOP,21,147,66,10 + CONTROL "AVX-512F+DQ",IDC_PROP_AVX512,"Button",BS_AUTOCHECKBOX | WS_TABSTOP,75,76,66,10 END IDD_RECON_OPT DIALOGEX 0, 0, 231, 211 @@ -885,7 +886,7 @@ BEGIN LEFTMARGIN, 7 RIGHTMARGIN, 156 TOPMARGIN, 7 - BOTTOMMARGIN, 233 + BOTTOMMARGIN, 246 END IDD_RECON_OPT, DIALOG diff --git a/source/gazoDoc.cpp b/source/gazoDoc.cpp index cee631e..2c9cf58 100644 --- a/source/gazoDoc.cpp +++ b/source/gazoDoc.cpp @@ -26,6 +26,8 @@ #include "DlgOverlay.h" #include "DlgResolnPlot.h" #include "DlgFrameList.h" +//220417 +#include //CUDA declaration #include "cudaReconst.h" @@ -259,7 +261,8 @@ void CGazoDoc::DeleteAll() for (int i=0; iDelete(); iReconst = irct; return 21021; } - if (irct) delete [] irct; + //220417 if (irct) delete [] irct; + _aligned_free(irct);//220417 maxReconst = ixdim2; } nReconst = ixdimp; @@ -3751,14 +3756,18 @@ TErr CGazoDoc::DeconvBackProj(RECONST_QUEUE* rq, double center, int iMultiplex, try { ppiReconst = new int*[nCPU - 1]; for (int i = 0; i < nCPU - 1; i++) { - ppiReconst[i] = new int[maxReconst]; + //220417 ppiReconst[i] = new int[maxReconst]; + ppiReconst[i] = (int*)_aligned_malloc(sizeof(int) * maxReconst, 64);//220417 memset(ppiReconst[i], 0, sizeof(int) * maxReconst); } } catch (CException* e) { e->Delete(); if (ppiReconst) { - for (int i = 0; i < nCPU - 1; i++) { if (ppiReconst[i]) delete[] ppiReconst[i]; } + for (int i = 0; i < nCPU - 1; i++) { + //220417 if (ppiReconst[i]) delete[] ppiReconst[i]; + if (ppiReconst[i]) _aligned_free(ppiReconst[i]); //220417 + } delete[] ppiReconst; } return 21023; @@ -3817,7 +3826,8 @@ TErr 
CGazoDoc::DeconvBackProj(RECONST_QUEUE* rq, double center, int iMultiplex, if (ppiReconst[i-1]) { for (int j = 0; j < maxReconst; j++) { iReconst[j] += (ppiReconst[i-1])[j]; } if (pApp->dlgProperty.m_ProcessorType == CDLGPROPERTY_PROCTYPE_CUDA) CUDA_FREE_HOST(ppiReconst[i-1]); - else delete[] ppiReconst[i - 1]; + //220417 else delete[] ppiReconst[i - 1]; + else _aligned_free(ppiReconst[i - 1]);//220417 } } } diff --git a/source/general.cpp b/source/general.cpp index cd45d5a..2b65c10 100644 --- a/source/general.cpp +++ b/source/general.cpp @@ -2064,7 +2064,10 @@ unsigned __stdcall DeconvBackProjThread(void* pArg) { param[6] = iparam6; int* ipgp = (int*)(iparam6); param[7] = 0; - if (pApp->dlgProperty.m_bEnableAVX2) param[7] |= 0x0001; + if (pApp->dlgProperty.m_bEnableAVX2) { + param[7] |= 0x0001; + if (pApp->dlgProperty.m_bEnableAVX512) param[7] |= 0x0002;//220417 + } param[8] = iy0; param[9] = iy1; BOOL bUseSIMD = pApp->dlgProperty.bEnableSIMD; diff --git a/source/projx64.asm b/source/projx64.asm index 35902e6..16de9b0 100644 --- a/source/projx64.asm +++ b/source/projx64.asm @@ -1,6 +1,9 @@ ;ml64.exe -DATA segment align(32) +DATA segment align(64) +FZMM0_15 real4 0.0, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0 +FZMM16_16 real4 16.0, 16.0, 16.0, 16.0, 16.0, 16.0, 16.0, 16.0, 16.0, 16.0, 16.0, 16.0, 16.0, 16.0, 16.0, 16.0 +FZMM0_0 real4 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0 F76543210 real4 0.0, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0 F88888888 real4 8.0, 8.0, 8.0, 8.0, 8.0, 8.0, 8.0, 8.0 F00000000 real4 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0 @@ -61,6 +64,9 @@ projx64 PROC ; ldmxcsr pmxcsr ;jump to AVX routine + mov rax, [rsi + 56] ; AVX flag + and rax, 000000002h + jnz USEAVX512 mov rax, [rsi + 56] ; AVX flag and rax, 000000001h jnz USEAVX @@ -214,6 +220,65 @@ ALOOPYEND2: cmp rdx, r13 jnae ALOOPY ; iy < iy1 + jmp RTN + +USEAVX512: +;220417 +;load valiables + mov rax, [rsi] ; &fcos + vbroadcastss 
zmm0, real4 ptr [rax] + mov r8, [rsi + 8] ; &fsin + mov r11, [rsi + 16] ; &foffset + + mov rcx, [rsi + 24] ; ixdimpg + vcvtsi2ss xmm6, xmm6, rcx ; xmm6<==ixdimpg + vbroadcastss zmm6, xmm6 ; zmm6<==ixdimpg in all 16 lanes + vcvtsi2ss xmm1, xmm1, r10 ; xmm1<==ixdimp + vbroadcastss zmm1, xmm1 ; zmm1<==ixdimp in all 16 lanes + + mov rax, r12; iy = iy0 + mov rcx, r10; ix = ixdimp + imul rcx + shl rax, 2 ; ixy = ixdimp * iy0 * 4 + add rax, [rsi + 40] ; ixy += ifp + mov rdi, rax +; mov rdi, [rsi + 40] ; ifp + mov rsi, [rsi + 48] ; igp + + mov rdx, r12 ; iy<==iy0 +A5LOOPY: + mov rbx, 0 ; ix<==0 + vmovaps zmm2, FZMM0_15 ; reset ix + vcvtsi2ss xmm3, xmm3, rdx ; xmm3<==iy + vbroadcastss zmm3, xmm3 ; zmm3<==iy in all 16 lanes + vbroadcastss zmm5, real4 ptr [r8] + vmulps zmm5, zmm5, zmm3 ; iy * fsin for each float + vbroadcastss zmm7, real4 ptr [r11] ; zmm7<==foffset + vaddps zmm5, zmm5, zmm7 ; zmm5<==iy * fsin + foffset +A5LOOPX: + vmulps zmm4, zmm0, zmm2 ; (ix+n) * fcos + vaddps zmm4, zmm4, zmm5 ; (ix+n) * fcos + foffset + vcmpltps k1, zmm4, zmm6 ; k1[i:i]=1 if (zmm4 < ixdimpg) + vcmpgeps k2, zmm4, FZMM0_0 ; k2[i:i]=1 if (zmm4 >= 0) + kandw k1, k1, k2 + vpxord zmm3, zmm3, zmm3 ; clear zmm3 + vcvttps2dq zmm4, zmm4 ; zmm4 float*16 to integer32*16 + vpgatherdd zmm3{k1}, [rsi + zmm4 * 4] ; load [rsi+zmm4*4] if k1[i:i]=1 + vcmpltps k1, zmm2, zmm1 ; k1[i:i]=1 if (zmm2 < ixdimp) + vmovdqa32 zmm4{k1}, [rdi + rbx * 4] + vpaddd zmm4, zmm3, zmm4 + vmovdqa32 [rdi + rbx * 4]{k1}, zmm4 + + vaddps zmm2, zmm2, FZMM16_16 ; zmm2 + 16.0 + add rbx, 16 + cmp rbx, r10 + jnae A5LOOPX ; ix < ixdimp +A5LOOPYEND2: + add rdi, r9 ; +ixdimp*4 + inc rdx ; iy++ + cmp rdx, r13 + jnae A5LOOPY ; iy < iy1 + RTN: ; ldmxcsr smxcsr movdqu xmm7, regXMM7 diff --git a/source/projx64.lst b/source/projx64.lst index fd288b2..bdb7eab 100644 --- a/source/projx64.lst +++ b/source/projx64.lst @@ -1,11 +1,11 @@ -Microsoft (R) Macro Assembler (x64) Version 14.16.27025.1 12/04/21 16:48:04 +Microsoft (R) Macro 
Assembler (x64) Version 14.16.27025.1 04/22/22 10:13:45 projx64.asm Page 1 - 1 ;ml64.exe - 00000000 DATA segment align(32) - 00000000 00000000 F76543210 real4 0.0, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0 + 00000000 DATA segment align(64) + 00000000 00000000 FZMM0_15 real4 0.0, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0 3F800000 40000000 40400000 @@ -13,7 +13,55 @@ projx64.asm Page 1 - 1 40A00000 40C00000 40E00000 - 00000020 41000000 F88888888 real4 8.0, 8.0, 8.0, 8.0, 8.0, 8.0, 8.0, 8.0 + 41000000 + 41100000 + 41200000 + 41300000 + 41400000 + 41500000 + 41600000 + 41700000 + 00000040 41800000 FZMM16_16 real4 16.0, 16.0, 16.0, 16.0, 16.0, 16.0, 16.0, 16.0, 16.0, 16.0, 16.0, 16.0, 16.0, 16.0, 16.0, 16.0 + 41800000 + 41800000 + 41800000 + 41800000 + 41800000 + 41800000 + 41800000 + 41800000 + 41800000 + 41800000 + 41800000 + 41800000 + 41800000 + 41800000 + 41800000 + 00000080 00000000 FZMM0_0 real4 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0 + 00000000 + 00000000 + 00000000 + 00000000 + 00000000 + 00000000 + 00000000 + 00000000 + 00000000 + 00000000 + 00000000 + 00000000 + 00000000 + 00000000 + 00000000 + 000000C0 00000000 F76543210 real4 0.0, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0 + 3F800000 + 40000000 + 40400000 + 40800000 + 40A00000 + 40C00000 + 40E00000 + 000000E0 41000000 F88888888 real4 8.0, 8.0, 8.0, 8.0, 8.0, 8.0, 8.0, 8.0 41000000 41000000 41000000 @@ -21,7 +69,7 @@ projx64.asm Page 1 - 1 41000000 41000000 41000000 - 00000040 00000000 F00000000 real4 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0 + 00000100 00000000 F00000000 real4 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0 00000000 00000000 00000000 @@ -29,11 +77,11 @@ projx64.asm Page 1 - 1 00000000 00000000 00000000 - 00000060 00000000 F3210 real4 0.0, 1.0, 2.0, 3.0 + 00000120 00000000 F3210 real4 0.0, 1.0, 2.0, 3.0 3F800000 40000000 40400000 - 00000070 DATA ends + 00000130 DATA ends 00000000 .code @@ -92,188 +140,273 @@ projx64.asm Page 1 - 1 ;jump to AVX 
routine 00000030 48/ 8B 46 38 mov rax, [rsi + 56] ; AVX flag - 00000034 48/ 83 E0 01 and rax, 000000001h - 00000038 0F 85 000000F6 jnz USEAVX + 00000034 48/ 83 E0 02 and rax, 000000002h + 00000038 0F 85 000001D4 jnz USEAVX512 + 0000003E 48/ 8B 46 38 mov rax, [rsi + 56] ; AVX flag + 00000042 48/ 83 E0 01 and rax, 000000001h + 00000046 0F 85 000000F6 jnz USEAVX ;load valiables - 0000003E 49/ 8B C5 mov rax, r13; iy = iy1 - 00000041 48/ FF C8 dec rax - 00000044 49/ 8B CA mov rcx, r10; ix = ixdimp - 00000047 48/ F7 E9 imul rcx - 0000004A 48/ C1 E0 02 shl rax, 2 ; ixy = ixdimp * (iy1 - 1) * 4 - 0000004E 48/ 03 46 28 add rax, [rsi + 40] ; ixy += ifp - 00000052 48/ 8B F8 mov rdi, rax - - 00000055 48/ 8B 06 mov rax, [rsi] ; &fcos - 00000058 F3/ 0F 10 00 movss xmm0, real4 ptr [rax] - 0000005C 0F C6 C0 00 shufps xmm0, xmm0, 0 + 0000004C 49/ 8B C5 mov rax, r13; iy = iy1 + 0000004F 48/ FF C8 dec rax + 00000052 49/ 8B CA mov rcx, r10; ix = ixdimp + 00000055 48/ F7 E9 imul rcx + 00000058 48/ C1 E0 02 shl rax, 2 ; ixy = ixdimp * (iy1 - 1) * 4 + 0000005C 48/ 03 46 28 add rax, [rsi + 40] ; ixy += ifp + 00000060 48/ 8B F8 mov rdi, rax + + 00000063 48/ 8B 06 mov rax, [rsi] ; &fcos + 00000066 F3/ 0F 10 00 movss xmm0, real4 ptr [rax] + 0000006A 0F C6 C0 00 shufps xmm0, xmm0, 0 - 00000060 48/ 8B 46 08 mov rax, [rsi + 8] ; &fsin - 00000064 F3/ 0F 10 08 movss xmm1, real4 ptr [rax] - 00000068 0F C6 C9 00 shufps xmm1, xmm1, 0 + 0000006E 48/ 8B 46 08 mov rax, [rsi + 8] ; &fsin + 00000072 F3/ 0F 10 08 movss xmm1, real4 ptr [rax] + 00000076 0F C6 C9 00 shufps xmm1, xmm1, 0 - 0000006C 48/ 8B 46 10 mov rax, [rsi + 16] ; &foffset - 00000070 F3/ 0F 10 38 movss xmm7, real4 ptr [rax] - 00000074 0F C6 FF 00 shufps xmm7, xmm7, 0 + 0000007A 48/ 8B 46 10 mov rax, [rsi + 16] ; &foffset + 0000007E F3/ 0F 10 38 movss xmm7, real4 ptr [rax] + 00000082 0F C6 FF 00 shufps xmm7, xmm7, 0 - 00000078 0F 28 35 movaps xmm6, F3210 - 00000060 R + 00000086 0F 28 35 movaps xmm6, F3210 + 00000120 R ;start process - 
0000007F 48/ 8B 4E 18 mov rcx, [rsi + 24] ; ixdimpg - 00000083 48/ 8B 76 30 mov rsi, [rsi + 48] ; igp + 0000008D 48/ 8B 4E 18 mov rcx, [rsi + 24] ; ixdimpg + 00000091 48/ 8B 76 30 mov rsi, [rsi + 48] ; igp - 00000087 49/ 8B D5 mov rdx, r13 ; iy<==iy1 - 0000008A 48/ FF CA dec rdx - 0000008D 48/ C7 C0 mov rax, 0 + 00000095 49/ 8B D5 mov rdx, r13 ; iy<==iy1 + 00000098 48/ FF CA dec rdx + 0000009B 48/ C7 C0 mov rax, 0 00000000 - 00000094 LOOPY: - 00000094 49/ 8B DA mov rbx, r10 ; ix<==ixdimp - 00000097 48/ FF CB dec rbx - 0000009A F3/ 48/ 0F 2A DA cvtsi2ss xmm3, rdx ; xmm3<==iy - 0000009F 0F C6 DB 00 shufps xmm3, xmm3, 0 ; xmm3<==iy, iy, iy, iy - 000000A3 0F 28 E9 movaps xmm5, xmm1 ; xmm5<==fsin, fsin, fsin, fsin - 000000A6 0F 59 EB mulps xmm5, xmm3 ; iy * fsin for each float - 000000A9 0F 58 EF addps xmm5, xmm7 ; + foffset for each float - 000000AC LOOPX: - 000000AC F3/ 48/ 0F 2A D3 cvtsi2ss xmm2, rbx ; xmm2<==ix - 000000B1 0F C6 D2 00 shufps xmm2, xmm2, 0 ; xmm2<==ix, ix, ix, ix - 000000B5 0F 5C D6 subps xmm2, xmm6 ; xmm2<==ix-3, ix-2, ix-2, ix - 000000B8 0F 28 E0 movaps xmm4, xmm0 ; xmm4<==fcos, fcos, fcos, fcos - 000000BB 0F 59 E2 mulps xmm4, xmm2 ; (ix-n) * fcos - 000000BE 0F 58 E5 addps xmm4, xmm5 ; (ix-n) * fcos + foffset - 000000C1 F3/ 0F 5B E4 cvttps2dq xmm4, xmm4 ; xmm4 float*4 to integer32*4 - 000000C5 66| 0F 7E E0 movd eax, xmm4 ; lower 4 bytes to eax + 000000A2 LOOPY: + 000000A2 49/ 8B DA mov rbx, r10 ; ix<==ixdimp + 000000A5 48/ FF CB dec rbx + 000000A8 F3/ 48/ 0F 2A DA cvtsi2ss xmm3, rdx ; xmm3<==iy + 000000AD 0F C6 DB 00 shufps xmm3, xmm3, 0 ; xmm3<==iy, iy, iy, iy + 000000B1 0F 28 E9 movaps xmm5, xmm1 ; xmm5<==fsin, fsin, fsin, fsin + 000000B4 0F 59 EB mulps xmm5, xmm3 ; iy * fsin for each float + 000000B7 0F 58 EF addps xmm5, xmm7 ; + foffset for each float + 000000BA LOOPX: + 000000BA F3/ 48/ 0F 2A D3 cvtsi2ss xmm2, rbx ; xmm2<==ix + 000000BF 0F C6 D2 00 shufps xmm2, xmm2, 0 ; xmm2<==ix, ix, ix, ix + 000000C3 0F 5C D6 subps xmm2, xmm6 ; xmm2<==ix-3, 
ix-2, ix-2, ix + 000000C6 0F 28 E0 movaps xmm4, xmm0 ; xmm4<==fcos, fcos, fcos, fcos + 000000C9 0F 59 E2 mulps xmm4, xmm2 ; (ix-n) * fcos + 000000CC 0F 58 E5 addps xmm4, xmm5 ; (ix-n) * fcos + foffset + 000000CF F3/ 0F 5B E4 cvttps2dq xmm4, xmm4 ; xmm4 float*4 to integer32*4 + 000000D3 66| 0F 7E E0 movd eax, xmm4 ; lower 4 bytes to eax ; pextrd eax, xmm4, 0 ; SSE4.1 - 000000C9 3B C1 cmp eax, ecx ; ix<=>ixdimpg - 000000CB 73 06 jae LOOPXSKIP1 ; ix0 >= ixdimp * DBPT_GINTP or ix0 < 0 - 000000CD 8B 04 86 mov eax, [rsi + rax * 4] ; eax<==igp[ix * DBPT_GINTP] - 000000D0 01 04 9F add [rdi + rbx * 4], eax ; ifp[ix] += eax - 000000D3 LOOPXSKIP1: - 000000D3 48/ FF CB dec rbx ; ix-- - 000000D6 7C 48 jl LOOPYEND ; ix < 0 - 000000D8 66| 0F 73 DC psrldq xmm4, 4 ; shift right by 4 bytes (integer32) + 000000D7 3B C1 cmp eax, ecx ; ix<=>ixdimpg + 000000D9 73 06 jae LOOPXSKIP1 ; ix0 >= ixdimp * DBPT_GINTP or ix0 < 0 + 000000DB 8B 04 86 mov eax, [rsi + rax * 4] ; eax<==igp[ix * DBPT_GINTP] + 000000DE 01 04 9F add [rdi + rbx * 4], eax ; ifp[ix] += eax + 000000E1 LOOPXSKIP1: + 000000E1 48/ FF CB dec rbx ; ix-- + 000000E4 7C 48 jl LOOPYEND ; ix < 0 + 000000E6 66| 0F 73 DC psrldq xmm4, 4 ; shift right by 4 bytes (integer32) 04 - 000000DD 66| 0F 7E E0 movd eax, xmm4 + 000000EB 66| 0F 7E E0 movd eax, xmm4 ; pextrd eax, xmm4, 1 ; SSE4.1 - 000000E1 3B C1 cmp eax, ecx; ixdimpg - 000000E3 73 06 jae LOOPXSKIP2 ; ix0 >= ixdimp * DBPT_GINTP or ix0 < 0 - 000000E5 8B 04 86 mov eax, [rsi + rax * 4] - 000000E8 01 04 9F add [rdi + rbx * 4], eax - 000000EB LOOPXSKIP2: - 000000EB 48/ FF CB dec rbx ; ix-- - 000000EE 7C 30 jl LOOPYEND ; ix < 0 - 000000F0 66| 0F 73 DC psrldq xmm4, 4 + 000000EF 3B C1 cmp eax, ecx; ixdimpg + 000000F1 73 06 jae LOOPXSKIP2 ; ix0 >= ixdimp * DBPT_GINTP or ix0 < 0 + 000000F3 8B 04 86 mov eax, [rsi + rax * 4] + 000000F6 01 04 9F add [rdi + rbx * 4], eax + 000000F9 LOOPXSKIP2: + 000000F9 48/ FF CB dec rbx ; ix-- + 000000FC 7C 30 jl LOOPYEND ; ix < 0 + 000000FE 66| 0F 73 DC psrldq 
xmm4, 4 04 - 000000F5 66| 0F 7E E0 movd eax, xmm4 + 00000103 66| 0F 7E E0 movd eax, xmm4 ; pextrd eax, xmm4, 2 ; SSE4.1 - 000000F9 3B C1 cmp eax, ecx; ixdimpg - 000000FB 73 06 jae LOOPXSKIP3 ; ix0 >= ixdimp * DBPT_GINTP or ix0 < 0 - 000000FD 8B 04 86 mov eax, [rsi + rax * 4] - 00000100 01 04 9F add [rdi + rbx * 4], eax - 00000103 LOOPXSKIP3: - 00000103 48/ FF CB dec rbx ; ix-- - 00000106 7C 18 jl LOOPYEND ; ix < 0 - 00000108 66| 0F 73 DC psrldq xmm4, 4 + 00000107 3B C1 cmp eax, ecx; ixdimpg + 00000109 73 06 jae LOOPXSKIP3 ; ix0 >= ixdimp * DBPT_GINTP or ix0 < 0 + 0000010B 8B 04 86 mov eax, [rsi + rax * 4] + 0000010E 01 04 9F add [rdi + rbx * 4], eax + 00000111 LOOPXSKIP3: + 00000111 48/ FF CB dec rbx ; ix-- + 00000114 7C 18 jl LOOPYEND ; ix < 0 + 00000116 66| 0F 73 DC psrldq xmm4, 4 04 - 0000010D 66| 0F 7E E0 movd eax, xmm4 + 0000011B 66| 0F 7E E0 movd eax, xmm4 ; pextrd eax, xmm4, 3 ; SSE4.1 - 00000111 3B C1 cmp eax, ecx; ixdimpg - 00000113 73 06 jae LOOPXEND ; ix0 >= ixdimp * DBPT_GINTP or ix0 < 0 - 00000115 8B 04 86 mov eax, [rsi + rax * 4] - 00000118 01 04 9F add [rdi + rbx * 4], eax - 0000011B LOOPXEND: - 0000011B 48/ FF CB dec rbx ; ix-- - 0000011E 7D 8C jge LOOPX ; ix >= 0 - 00000120 LOOPYEND: - 00000120 49/ 2B F9 sub rdi, r9 ; ixdimp*4 - 00000123 48/ FF CA dec rdx ; iy-- - 00000126 49/ 3B D4 cmp rdx, r12 - 00000129 0F 8D FFFFFF65 jge LOOPY ; iy >= iy0 - - 0000012F E9 000000CB jmp RTN - - 00000134 USEAVX: + 0000011F 3B C1 cmp eax, ecx; ixdimpg + 00000121 73 06 jae LOOPXEND ; ix0 >= ixdimp * DBPT_GINTP or ix0 < 0 + 00000123 8B 04 86 mov eax, [rsi + rax * 4] + 00000126 01 04 9F add [rdi + rbx * 4], eax + 00000129 LOOPXEND: + 00000129 48/ FF CB dec rbx ; ix-- + 0000012C 7D 8C jge LOOPX ; ix >= 0 + 0000012E LOOPYEND: + 0000012E 49/ 2B F9 sub rdi, r9 ; ixdimp*4 + 00000131 48/ FF CA dec rdx ; iy-- + 00000134 49/ 3B D4 cmp rdx, r12 + 00000137 0F 8D FFFFFF65 jge LOOPY ; iy >= iy0 + + 0000013D E9 000001BC jmp RTN + + 00000142 USEAVX: ;load valiables - 00000134 48/ 8B 
06 mov rax, [rsi] ; &fcos - 00000137 C4 E2 7D/ 18 00 vbroadcastss ymm0, real4 ptr [rax] - 0000013C 4C/ 8B 46 08 mov r8, [rsi + 8] ; &fsin - 00000140 4C/ 8B 5E 10 mov r11, [rsi + 16] ; &foffset - - 00000144 48/ 8B 4E 18 mov rcx, [rsi + 24] ; ixdimpg - 00000148 C4 E1 CA/ 2A F1 vcvtsi2ss xmm6, xmm6, rcx ; xmm6<==ixdimpg - 0000014D C4 E2 7D/ 18 F6 vbroadcastss ymm6, xmm6 ; ymm6<==ixdimpg, ixdimpg, ixdimpg, ixdimpg - 00000152 C4 C1 F2/ 2A CA vcvtsi2ss xmm1, xmm1, r10 ; xmm1<==ixdimp - 00000157 C4 E2 7D/ 18 C9 vbroadcastss ymm1, xmm1 ; ymm1<==ixdimp, ixdimp, ixdimp, ixdimp + 00000142 48/ 8B 06 mov rax, [rsi] ; &fcos + 00000145 C4 E2 7D/ 18 00 vbroadcastss ymm0, real4 ptr [rax] + 0000014A 4C/ 8B 46 08 mov r8, [rsi + 8] ; &fsin + 0000014E 4C/ 8B 5E 10 mov r11, [rsi + 16] ; &foffset + + 00000152 48/ 8B 4E 18 mov rcx, [rsi + 24] ; ixdimpg + 00000156 C4 E1 CA/ 2A F1 vcvtsi2ss xmm6, xmm6, rcx ; xmm6<==ixdimpg + 0000015B C4 E2 7D/ 18 F6 vbroadcastss ymm6, xmm6 ; ymm6<==ixdimpg, ixdimpg, ixdimpg, ixdimpg + 00000160 C4 C1 F2/ 2A CA vcvtsi2ss xmm1, xmm1, r10 ; xmm1<==ixdimp + 00000165 C4 E2 7D/ 18 C9 vbroadcastss ymm1, xmm1 ; ymm1<==ixdimp, ixdimp, ixdimp, ixdimp - 0000015C 49/ 8B C4 mov rax, r12; iy = iy0 - 0000015F 49/ 8B CA mov rcx, r10; ix = ixdimp - 00000162 48/ F7 E9 imul rcx - 00000165 48/ C1 E0 02 shl rax, 2 ; ixy = ixdimp * iy0 * 4 - 00000169 48/ 03 46 28 add rax, [rsi + 40] ; ixy += ifp - 0000016D 48/ 8B F8 mov rdi, rax + 0000016A 49/ 8B C4 mov rax, r12; iy = iy0 + 0000016D 49/ 8B CA mov rcx, r10; ix = ixdimp + 00000170 48/ F7 E9 imul rcx + 00000173 48/ C1 E0 02 shl rax, 2 ; ixy = ixdimp * iy0 * 4 + 00000177 48/ 03 46 28 add rax, [rsi + 40] ; ixy += ifp + 0000017B 48/ 8B F8 mov rdi, rax ; mov rdi, [rsi + 40] ; ifp - 00000170 48/ 8B 76 30 mov rsi, [rsi + 48] ; igp + 0000017E 48/ 8B 76 30 mov rsi, [rsi + 48] ; igp - 00000174 49/ 8B D4 mov rdx, r12 ; iy<==iy0 - 00000177 ALOOPY: - 00000177 48/ C7 C3 mov rbx, 0 ; ix<==0 + 00000182 49/ 8B D4 mov rdx, r12 ; iy<==iy0 + 00000185 
ALOOPY: + 00000185 48/ C7 C3 mov rbx, 0 ; ix<==0 00000000 - 0000017E C5 FC/ 28 15 vmovaps ymm2, F76543210 ; reset ix - 00000000 R - 00000186 C4 E1 E2/ 2A DA vcvtsi2ss xmm3, xmm3, rdx ; xmm3<==iy - 0000018B C4 E2 7D/ 18 DB vbroadcastss ymm3, xmm3 ; xmm3<==iy, iy, iy, iy - 00000190 C4 C2 7D/ 18 28 vbroadcastss ymm5, real4 ptr [r8] - 00000195 C5 D4/ 59 EB vmulps ymm5, ymm5, ymm3 ; iy * fsin for each float - 00000199 C4 C2 7D/ 18 3B vbroadcastss ymm7, real4 ptr [r11] ; ymm7<==foffset - 0000019E C5 D4/ 58 EF vaddps ymm5, ymm5, ymm7 ; ymm5<==iy * fsin + foffset - 000001A2 ALOOPX: - 000001A2 C5 FC/ 59 E2 vmulps ymm4, ymm0, ymm2 ; (ix+n) * fcos - 000001A6 C5 DC/ 58 E5 vaddps ymm4, ymm4, ymm5 ; (ix+n) * fcos + foffset - 000001AA C5 DC/ C2 FE 01 vcmpltps ymm7, ymm4, ymm6 ; ymm7[i:i]=1 if (ymm4 < ixdimpg) - 000001AF C5 DC/ C2 1D vcmpgeps ymm3, ymm4, F00000000 ; ymm3[i:i]=1 if (ymm4 >= 0) - 00000040 R 0D - 000001B8 C5 C5/ DB FB vpand ymm7, ymm7, ymm3 - 000001BC C5 E5/ EF DB vpxor ymm3, ymm3, ymm3 ; clear ymm3 - 000001C0 C5 FE/ 5B E4 vcvttps2dq ymm4, ymm4 ; ymm4 float*8 to integer32*8 - 000001C4 C4 E2 45/ 90 1C vpgatherdd ymm3, [rsi + ymm4 * 4], ymm7 ; load [rsi+ymm4*4] if ymm7=1 + 0000018C C5 FC/ 28 15 vmovaps ymm2, F76543210 ; reset ix + 000000C0 R + 00000194 C4 E1 E2/ 2A DA vcvtsi2ss xmm3, xmm3, rdx ; xmm3<==iy + 00000199 C4 E2 7D/ 18 DB vbroadcastss ymm3, xmm3 ; xmm3<==iy, iy, iy, iy + 0000019E C4 C2 7D/ 18 28 vbroadcastss ymm5, real4 ptr [r8] + 000001A3 C5 D4/ 59 EB vmulps ymm5, ymm5, ymm3 ; iy * fsin for each float + 000001A7 C4 C2 7D/ 18 3B vbroadcastss ymm7, real4 ptr [r11] ; ymm7<==foffset + 000001AC C5 D4/ 58 EF vaddps ymm5, ymm5, ymm7 ; ymm5<==iy * fsin + foffset + 000001B0 ALOOPX: + 000001B0 C5 FC/ 59 E2 vmulps ymm4, ymm0, ymm2 ; (ix+n) * fcos + 000001B4 C5 DC/ 58 E5 vaddps ymm4, ymm4, ymm5 ; (ix+n) * fcos + foffset + 000001B8 C5 DC/ C2 FE 01 vcmpltps ymm7, ymm4, ymm6 ; ymm7[i:i]=1 if (ymm4 < ixdimpg) + 000001BD C5 DC/ C2 1D vcmpgeps ymm3, ymm4, F00000000 ; 
ymm3[i:i]=1 if (ymm4 >= 0) + 00000100 R 0D + 000001C6 C5 C5/ DB FB vpand ymm7, ymm7, ymm3 + 000001CA C5 E5/ EF DB vpxor ymm3, ymm3, ymm3 ; clear ymm3 + 000001CE C5 FE/ 5B E4 vcvttps2dq ymm4, ymm4 ; ymm4 float*8 to integer32*8 + 000001D2 C4 E2 45/ 90 1C vpgatherdd ymm3, [rsi + ymm4 * 4], ymm7 ; load [rsi+ymm4*4] if ymm7=1 A6 - 000001CA C5 EC/ C2 F9 01 vcmpltps ymm7, ymm2, ymm1 ; ymm7[i:i]=1 if (ymm2 < ixdimp) - 000001CF C4 E2 45/ 8C 24 vpmaskmovd ymm4, ymm7, [rdi + rbx * 4] + 000001D8 C5 EC/ C2 F9 01 vcmpltps ymm7, ymm2, ymm1 ; ymm7[i:i]=1 if (ymm2 < ixdimp) + 000001DD C4 E2 45/ 8C 24 vpmaskmovd ymm4, ymm7, [rdi + rbx * 4] 9F - 000001D5 C5 E5/ FE E4 vpaddd ymm4, ymm3, ymm4 - 000001D9 C4 E2 45/ 8E 24 vpmaskmovd [rdi + rbx * 4], ymm7, ymm4 + 000001E3 C5 E5/ FE E4 vpaddd ymm4, ymm3, ymm4 + 000001E7 C4 E2 45/ 8E 24 vpmaskmovd [rdi + rbx * 4], ymm7, ymm4 9F - 000001DF C5 EC/ 58 15 vaddps ymm2, ymm2, F88888888 ; ymm2 + 8.0 - 00000020 R - 000001E7 48/ 83 C3 08 add rbx, 8 - 000001EB 49/ 3B DA cmp rbx, r10 - 000001EE 72 B2 jnae ALOOPX ; ix < ixdimp - 000001F0 ALOOPYEND2: - 000001F0 49/ 03 F9 add rdi, r9 ; +ixdimp*4 - 000001F3 48/ FF C2 inc rdx ; iy++ - 000001F6 49/ 3B D5 cmp rdx, r13 - 000001F9 0F 82 FFFFFF78 jnae ALOOPY ; iy < iy1 - - 000001FF RTN: + 000001ED C5 EC/ 58 15 vaddps ymm2, ymm2, F88888888 ; ymm2 + 8.0 + 000000E0 R + 000001F5 48/ 83 C3 08 add rbx, 8 + 000001F9 49/ 3B DA cmp rbx, r10 + 000001FC 72 B2 jnae ALOOPX ; ix < ixdimp + 000001FE ALOOPYEND2: + 000001FE 49/ 03 F9 add rdi, r9 ; +ixdimp*4 + 00000201 48/ FF C2 inc rdx ; iy++ + 00000204 49/ 3B D5 cmp rdx, r13 + 00000207 0F 82 FFFFFF78 jnae ALOOPY ; iy < iy1 + + 0000020D E9 000000EC jmp RTN + + 00000212 USEAVX512: + ;220417 + ;load valiables + 00000212 48/ 8B 06 mov rax, [rsi] ; &fcos + 00000215 62 F2 7D 48/ 18 vbroadcastss zmm0, real4 ptr [rax] + 00 + 0000021B 4C/ 8B 46 08 mov r8, [rsi + 8] ; &fsin + 0000021F 4C/ 8B 5E 10 mov r11, [rsi + 16] ; &foffset + + 00000223 48/ 8B 4E 18 mov rcx, [rsi + 24] ; ixdimpg + 
00000227 C4 E1 CA/ 2A F1 vcvtsi2ss xmm6, xmm6, rcx ; xmm6<==ixdimpg + 0000022C 62 F2 7D 48/ 18 vbroadcastss zmm6, xmm6 ; zmm6<==ixdimpg, ixdimpg, ixdimpg, ixdimpg + F6 + 00000232 C4 C1 F2/ 2A CA vcvtsi2ss xmm1, xmm1, r10 ; xmm1<==ixdimp + 00000237 62 F2 7D 48/ 18 vbroadcastss zmm1, xmm1 ; zmm1<==ixdimp, ixdimp, ixdimp, ixdimp + C9 + + 0000023D 49/ 8B C4 mov rax, r12; iy = iy0 + 00000240 49/ 8B CA mov rcx, r10; ix = ixdimp + 00000243 48/ F7 E9 imul rcx + 00000246 48/ C1 E0 02 shl rax, 2 ; ixy = ixdimp * iy0 * 4 + 0000024A 48/ 03 46 28 add rax, [rsi + 40] ; ixy += ifp + 0000024E 48/ 8B F8 mov rdi, rax + ; mov rdi, [rsi + 40] ; ifp + 00000251 48/ 8B 76 30 mov rsi, [rsi + 48] ; igp + + 00000255 49/ 8B D4 mov rdx, r12 ; iy<==iy0 + 00000258 A5LOOPY: + 00000258 48/ C7 C3 mov rbx, 0 ; ix<==0 + 00000000 + 0000025F 62 F1 7C 48/ 28 vmovaps zmm2, FZMM0_15 ; reset ix + 15 00000000 R + 00000269 C4 E1 E2/ 2A DA vcvtsi2ss xmm3, xmm3, rdx ; xmm3<==iy + 0000026E 62 F2 7D 48/ 18 vbroadcastss zmm3, xmm3 ; xmm3<==iy, iy, iy, iy + DB + 00000274 62 D2 7D 48/ 18 vbroadcastss zmm5, real4 ptr [r8] + 28 + 0000027A 62 F1 54 48/ 59 vmulps zmm5, zmm5, zmm3 ; iy * fsin for each float + EB + 00000280 62 D2 7D 48/ 18 vbroadcastss zmm7, real4 ptr [r11] ; zmm7<==foffset + 3B + 00000286 62 F1 54 48/ 58 vaddps zmm5, zmm5, zmm7 ; zmm5<==iy * fsin + foffset + EF + 0000028C A5LOOPX: + 0000028C 62 F1 7C 48/ 59 vmulps zmm4, zmm0, zmm2 ; (ix+n) * fcos + E2 + 00000292 62 F1 5C 48/ 58 vaddps zmm4, zmm4, zmm5 ; (ix+n) * fcos + foffset + E5 + 00000298 62 F1 5C 48/ C2 vcmpltps k1, zmm4, zmm6 ; k1[i:i]=1 if (zmm4 < ixdimpg) + CE 01 + 0000029F 62 F1 5C 48/ C2 vcmpgeps k2, zmm4, FZMM0_0 ; k2[i:i]=1 if (zmm4 >= 0) + 15 00000080 R + 0D + 000002AA C5 F4/ 41 CA kandw k1, k1, k2 + 000002AE 62 F1 65 48/ EF vpxord zmm3, zmm3, zmm3 ; clear zmm3 + DB + 000002B4 62 F1 7E 48/ 5B vcvttps2dq zmm4, zmm4 ; zmm4 float*8 to integer32*8 + E4 + 000002BA 62 F2 7D 49/ 90 vpgatherdd zmm3{k1}, [rsi + zmm4 * 4] ; load [rsi+zmm4*4] if 
k1[i:i]=1 + 1C A6 + 000002C1 62 F1 6C 48/ C2 vcmpltps k1, zmm2, zmm1 ; k1[i:i]=1 if (zmm2 < ixdimp) + C9 01 + 000002C8 62 F1 7D 49/ 6F vmovdqa32 zmm4{k1}, [rdi + rbx * 4] + 24 9F + 000002CF 62 F1 65 48/ FE vpaddd zmm4, zmm3, zmm4 + E4 + 000002D5 62 F1 7D 49/ 7F vmovdqa32 [rdi + rbx * 4]{k1}, zmm4 + 24 9F + + 000002DC 62 F1 6C 48/ 58 vaddps zmm2, zmm2, FZMM16_16 ; zmm2 + 16.0 + 15 00000040 R + 000002E6 48/ 83 C3 10 add rbx, 16 + 000002EA 49/ 3B DA cmp rbx, r10 + 000002ED 72 9D jnae A5LOOPX ; ix < ixdimp + 000002EF A5LOOPYEND2: + 000002EF 49/ 03 F9 add rdi, r9 ; +ixdimp*4 + 000002F2 48/ FF C2 inc rdx ; iy++ + 000002F5 49/ 3B D5 cmp rdx, r13 + 000002F8 0F 82 FFFFFF5A jnae A5LOOPY ; iy < iy1 + + 000002FE RTN: ; ldmxcsr smxcsr - 000001FF F3/ 0F 6F 7D movdqu xmm7, regXMM7 + 000002FE F3/ 0F 6F 7D movdqu xmm7, regXMM7 E0 - 00000204 F3/ 0F 6F 75 movdqu xmm6, regXMM6 + 00000303 F3/ 0F 6F 75 movdqu xmm6, regXMM6 F0 - 00000209 41/ 5D pop r13 - 0000020B 41/ 5C pop r12 - 0000020D 5F pop rdi - 0000020E 5E pop rsi - 0000020F 5D pop rbp - 00000210 5B pop rbx + 00000308 41/ 5D pop r13 + 0000030A 41/ 5C pop r12 + 0000030C 5F pop rdi + 0000030D 5E pop rsi + 0000030E 5D pop rbp + 0000030F 5B pop rbx ret - 00000213 projx64 ENDP + 00000312 projx64 ENDP end - Microsoft (R) Macro Assembler (x64) Version 14.16.27025.1 12/04/21 16:48:04 + Microsoft (R) Macro Assembler (x64) Version 14.16.27025.1 04/22/22 10:13:45 projx64.asm Symbols 2 - 1 @@ -283,38 +416,45 @@ Segments: N a m e Length Align Class -DATA . . . . . . . . . . . . . . 00000070 32 +DATA . . . . . . . . . . . . . . 00000130 64 Procedures, parameters, and locals: N a m e Type Value Attr -projx64 . . . . . . . . . . . . P 00000000 _TEXT Length= 00000213 Public +projx64 . . . . . . . . . . . . P 00000000 _TEXT Length= 00000312 Public regXMM6 . . . . . . . . . . . XmmWord rbp - 00000010 regXMM7 . . . . . . . . . . . XmmWord rbp - 00000020 - LOOPY . . . . . . . . . . . . L 00000094 _TEXT - LOOPX . . . . . . . . . . . . 
L 000000AC _TEXT - LOOPXSKIP1 . . . . . . . . . . L 000000D3 _TEXT - LOOPXSKIP2 . . . . . . . . . . L 000000EB _TEXT - LOOPXSKIP3 . . . . . . . . . . L 00000103 _TEXT - LOOPXEND . . . . . . . . . . . L 0000011B _TEXT - LOOPYEND . . . . . . . . . . . L 00000120 _TEXT - USEAVX . . . . . . . . . . . . L 00000134 _TEXT - ALOOPY . . . . . . . . . . . . L 00000177 _TEXT - ALOOPX . . . . . . . . . . . . L 000001A2 _TEXT - ALOOPYEND2 . . . . . . . . . . L 000001F0 _TEXT - RTN . . . . . . . . . . . . . L 000001FF _TEXT + LOOPY . . . . . . . . . . . . L 000000A2 _TEXT + LOOPX . . . . . . . . . . . . L 000000BA _TEXT + LOOPXSKIP1 . . . . . . . . . . L 000000E1 _TEXT + LOOPXSKIP2 . . . . . . . . . . L 000000F9 _TEXT + LOOPXSKIP3 . . . . . . . . . . L 00000111 _TEXT + LOOPXEND . . . . . . . . . . . L 00000129 _TEXT + LOOPYEND . . . . . . . . . . . L 0000012E _TEXT + USEAVX . . . . . . . . . . . . L 00000142 _TEXT + ALOOPY . . . . . . . . . . . . L 00000185 _TEXT + ALOOPX . . . . . . . . . . . . L 000001B0 _TEXT + ALOOPYEND2 . . . . . . . . . . L 000001FE _TEXT + USEAVX512 . . . . . . . . . . L 00000212 _TEXT + A5LOOPY . . . . . . . . . . . L 00000258 _TEXT + A5LOOPX . . . . . . . . . . . L 0000028C _TEXT + A5LOOPYEND2 . . . . . . . . . L 000002EF _TEXT + RTN . . . . . . . . . . . . . L 000002FE _TEXT Symbols: N a m e Type Value Attr -F00000000 . . . . . . . . . . . DWord 00000040 DATA -F3210 . . . . . . . . . . . . . DWord 00000060 DATA -F76543210 . . . . . . . . . . . DWord 00000000 DATA -F88888888 . . . . . . . . . . . DWord 00000020 DATA +F00000000 . . . . . . . . . . . DWord 00000100 DATA +F3210 . . . . . . . . . . . . . DWord 00000120 DATA +F76543210 . . . . . . . . . . . DWord 000000C0 DATA +F88888888 . . . . . . . . . . . DWord 000000E0 DATA +FZMM0_0 . . . . . . . . . . . . DWord 00000080 DATA +FZMM0_15 . . . . . . . . . . . . DWord 00000000 DATA +FZMM16_16 . . . . . . . . . . . DWord 00000040 DATA WIN_X64 . . . . . . . . . . . . 
Text 0 Warnings diff --git a/source/resource.h b/source/resource.h index 8ec2589..fbf0822 100644 --- a/source/resource.h +++ b/source/resource.h @@ -155,6 +155,8 @@ #define IDC_PROP_CUDASTREAM 1056 #define IDC_REFR_XENERGY 1057 #define IDC_REFR_S2DDIST 1058 +#define IDC_PROP_AVX3 1058 +#define IDC_PROP_AVX512 1058 #define IDC_REFR_PXSIZE 1059 #define IDC_REFR_STATUS 1060 #define IDC_REFR_PXSIZE2 1061 @@ -262,6 +264,7 @@ #define IDC_POLYGONLIST 1125 #define IDC_POLYGON_CURRENT 1126 #define IDC_POLYGON_DELETE 1127 +#define IDC_LSQFIT_FLIP 1128 #define IDC_DIALBOX_COMPORT 1414 #define IDC_DIALBOX_CONNECT 1415 #define IDC_DIALBOX_DISCONNECT 1416 @@ -365,6 +368,10 @@ #define ID_TOOLBAR_QUEUE 32844 #define ID_TOOLBAR_HISTG 32845 #define ID_ANALYSIS_SUBTRACT 32846 +#define ID_POPUPQUEUE_INSERTSTOP 32847 +#define ID_POPUPQUEUE_APPENDSTOP 32848 +#define ID_POPUPQUEUE_APPENDPAUSE 32849 +#define ID_POPUPQUEUE_APPENDSLEEP 32850 #define ID_POPUPQUEUE_DEL 32860 // Next default values for new objects @@ -373,8 +380,8 @@ #ifndef APSTUDIO_READONLY_SYMBOLS #define _APS_3D_CONTROLS 1 #define _APS_NEXT_RESOURCE_VALUE 153 -#define _APS_NEXT_COMMAND_VALUE 32847 -#define _APS_NEXT_CONTROL_VALUE 1128 +#define _APS_NEXT_COMMAND_VALUE 32851 +#define _APS_NEXT_CONTROL_VALUE 1129 #define _APS_NEXT_SYMED_VALUE 102 #endif #endif