diff --git a/dlpack/LICENSE b/dlpack/LICENSE new file mode 100644 index 0000000000..20a9c8a7b4 --- /dev/null +++ b/dlpack/LICENSE @@ -0,0 +1,201 @@ + Apache License + Version 2.0, January 2004 + http://www.apache.org/licenses/ + + TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION + + 1. Definitions. + + "License" shall mean the terms and conditions for use, reproduction, + and distribution as defined by Sections 1 through 9 of this document. + + "Licensor" shall mean the copyright owner or entity authorized by + the copyright owner that is granting the License. + + "Legal Entity" shall mean the union of the acting entity and all + other entities that control, are controlled by, or are under common + control with that entity. For the purposes of this definition, + "control" means (i) the power, direct or indirect, to cause the + direction or management of such entity, whether by contract or + otherwise, or (ii) ownership of fifty percent (50%) or more of the + outstanding shares, or (iii) beneficial ownership of such entity. + + "You" (or "Your") shall mean an individual or Legal Entity + exercising permissions granted by this License. + + "Source" form shall mean the preferred form for making modifications, + including but not limited to software source code, documentation + source, and configuration files. + + "Object" form shall mean any form resulting from mechanical + transformation or translation of a Source form, including but + not limited to compiled object code, generated documentation, + and conversions to other media types. + + "Work" shall mean the work of authorship, whether in Source or + Object form, made available under the License, as indicated by a + copyright notice that is included in or attached to the work + (an example is provided in the Appendix below). + + "Derivative Works" shall mean any work, whether in Source or Object + form, that is based on (or derived from) the Work and for which the + editorial revisions, annotations, elaborations, or other modifications + represent, as a whole, an original work of authorship. For the purposes + of this License, Derivative Works shall not include works that remain + separable from, or merely link (or bind by name) to the interfaces of, + the Work and Derivative Works thereof. + + "Contribution" shall mean any work of authorship, including + the original version of the Work and any modifications or additions + to that Work or Derivative Works thereof, that is intentionally + submitted to Licensor for inclusion in the Work by the copyright owner + or by an individual or Legal Entity authorized to submit on behalf of + the copyright owner. For the purposes of this definition, "submitted" + means any form of electronic, verbal, or written communication sent + to the Licensor or its representatives, including but not limited to + communication on electronic mailing lists, source code control systems, + and issue tracking systems that are managed by, or on behalf of, the + Licensor for the purpose of discussing and improving the Work, but + excluding communication that is conspicuously marked or otherwise + designated in writing by the copyright owner as "Not a Contribution." + + "Contributor" shall mean Licensor and any individual or Legal Entity + on behalf of whom a Contribution has been received by Licensor and + subsequently incorporated within the Work. + + 2. Grant of Copyright License. 
Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + copyright license to reproduce, prepare Derivative Works of, + publicly display, publicly perform, sublicense, and distribute the + Work and such Derivative Works in Source or Object form. + + 3. Grant of Patent License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + (except as stated in this section) patent license to make, have made, + use, offer to sell, sell, import, and otherwise transfer the Work, + where such license applies only to those patent claims licensable + by such Contributor that are necessarily infringed by their + Contribution(s) alone or by combination of their Contribution(s) + with the Work to which such Contribution(s) was submitted. If You + institute patent litigation against any entity (including a + cross-claim or counterclaim in a lawsuit) alleging that the Work + or a Contribution incorporated within the Work constitutes direct + or contributory patent infringement, then any patent licenses + granted to You under this License for that Work shall terminate + as of the date such litigation is filed. + + 4. Redistribution. You may reproduce and distribute copies of the + Work or Derivative Works thereof in any medium, with or without + modifications, and in Source or Object form, provided that You + meet the following conditions: + + (a) You must give any other recipients of the Work or + Derivative Works a copy of this License; and + + (b) You must cause any modified files to carry prominent notices + stating that You changed the files; and + + (c) You must retain, in the Source form of any Derivative Works + that You distribute, all copyright, patent, trademark, and + attribution notices from the Source form of the Work, + excluding those notices that do not pertain to any part of + the Derivative Works; and + + (d) If the Work includes a "NOTICE" text file as part of its + distribution, then any Derivative Works that You distribute must + include a readable copy of the attribution notices contained + within such NOTICE file, excluding those notices that do not + pertain to any part of the Derivative Works, in at least one + of the following places: within a NOTICE text file distributed + as part of the Derivative Works; within the Source form or + documentation, if provided along with the Derivative Works; or, + within a display generated by the Derivative Works, if and + wherever such third-party notices normally appear. The contents + of the NOTICE file are for informational purposes only and + do not modify the License. You may add Your own attribution + notices within Derivative Works that You distribute, alongside + or as an addendum to the NOTICE text from the Work, provided + that such additional attribution notices cannot be construed + as modifying the License. + + You may add Your own copyright statement to Your modifications and + may provide additional or different license terms and conditions + for use, reproduction, or distribution of Your modifications, or + for any such Derivative Works as a whole, provided Your use, + reproduction, and distribution of the Work otherwise complies with + the conditions stated in this License. + + 5. Submission of Contributions. 
Unless You explicitly state otherwise, + any Contribution intentionally submitted for inclusion in the Work + by You to the Licensor shall be under the terms and conditions of + this License, without any additional terms or conditions. + Notwithstanding the above, nothing herein shall supersede or modify + the terms of any separate license agreement you may have executed + with Licensor regarding such Contributions. + + 6. Trademarks. This License does not grant permission to use the trade + names, trademarks, service marks, or product names of the Licensor, + except as required for reasonable and customary use in describing the + origin of the Work and reproducing the content of the NOTICE file. + + 7. Disclaimer of Warranty. Unless required by applicable law or + agreed to in writing, Licensor provides the Work (and each + Contributor provides its Contributions) on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or + implied, including, without limitation, any warranties or conditions + of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A + PARTICULAR PURPOSE. You are solely responsible for determining the + appropriateness of using or redistributing the Work and assume any + risks associated with Your exercise of permissions under this License. + + 8. Limitation of Liability. In no event and under no legal theory, + whether in tort (including negligence), contract, or otherwise, + unless required by applicable law (such as deliberate and grossly + negligent acts) or agreed to in writing, shall any Contributor be + liable to You for damages, including any direct, indirect, special, + incidental, or consequential damages of any character arising as a + result of this License or out of the use or inability to use the + Work (including but not limited to damages for loss of goodwill, + work stoppage, computer failure or malfunction, or any and all + other commercial damages or losses), even if such Contributor + has been advised of the possibility of such damages. + + 9. Accepting Warranty or Additional Liability. While redistributing + the Work or Derivative Works thereof, You may choose to offer, + and charge a fee for, acceptance of support, warranty, indemnity, + or other liability obligations and/or rights consistent with this + License. However, in accepting such obligations, You may act only + on Your own behalf and on Your sole responsibility, not on behalf + of any other Contributor, and only if You agree to indemnify, + defend, and hold each Contributor harmless for any liability + incurred by, or claims asserted against, such Contributor by reason + of your accepting any such warranty or additional liability. + + END OF TERMS AND CONDITIONS + + APPENDIX: How to apply the Apache License to your work. + + To apply the Apache License to your work, attach the following + boilerplate notice, with the fields enclosed by brackets "{}" + replaced with your own identifying information. (Don't include + the brackets!) The text should be enclosed in the appropriate + comment syntax for the file format. We also recommend that a + file or class name and description of purpose be included on the + same "printed page" as the copyright notice for easier + identification within third-party archives. + + Copyright 2017 by Contributors + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. 
+ You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. diff --git a/dlpack/dlpack.h b/dlpack/dlpack.h new file mode 100644 index 0000000000..ef6960b23a --- /dev/null +++ b/dlpack/dlpack.h @@ -0,0 +1,233 @@ +/*! + * Copyright (c) 2017 by Contributors + * \file dlpack.h + * \brief The common header of DLPack. + */ +#ifndef DLPACK_DLPACK_H_ +#define DLPACK_DLPACK_H_ + +/** + * \brief Compatibility with C++ + */ +#ifdef __cplusplus +#define DLPACK_EXTERN_C extern "C" +#else +#define DLPACK_EXTERN_C +#endif + +/*! \brief The current version of dlpack */ +#define DLPACK_VERSION 70 + +/*! \brief The current ABI version of dlpack */ +#define DLPACK_ABI_VERSION 1 + +/*! \brief DLPACK_DLL prefix for windows */ +#ifdef _WIN32 +#ifdef DLPACK_EXPORTS +#define DLPACK_DLL __declspec(dllexport) +#else +#define DLPACK_DLL __declspec(dllimport) +#endif +#else +#define DLPACK_DLL +#endif + +#include +#include + +#ifdef __cplusplus +extern "C" { +#endif +/*! + * \brief The device type in DLDevice. + */ +#ifdef __cplusplus +typedef enum : int32_t { +#else +typedef enum { +#endif + /*! \brief CPU device */ + kDLCPU = 1, + /*! \brief CUDA GPU device */ + kDLCUDA = 2, + /*! + * \brief Pinned CUDA CPU memory by cudaMallocHost + */ + kDLCUDAHost = 3, + /*! \brief OpenCL devices. */ + kDLOpenCL = 4, + /*! \brief Vulkan buffer for next generation graphics. */ + kDLVulkan = 7, + /*! \brief Metal for Apple GPU. */ + kDLMetal = 8, + /*! \brief Verilog simulator buffer */ + kDLVPI = 9, + /*! \brief ROCm GPUs for AMD GPUs */ + kDLROCM = 10, + /*! + * \brief Pinned ROCm CPU memory allocated by hipMallocHost + */ + kDLROCMHost = 11, + /*! + * \brief Reserved extension device type, + * used for quickly test extension device + * The semantics can differ depending on the implementation. + */ + kDLExtDev = 12, + /*! + * \brief CUDA managed/unified memory allocated by cudaMallocManaged + */ + kDLCUDAManaged = 13, + /*! + * \brief Unified shared memory allocated on a oneAPI non-partititioned + * device. Call to oneAPI runtime is required to determine the device + * type, the USM allocation type and the sycl context it is bound to. + * + */ + kDLOneAPI = 14, + /*! \brief GPU support for next generation WebGPU standard. */ + kDLWebGPU = 15, + /*! \brief Qualcomm Hexagon DSP */ + kDLHexagon = 16, +} DLDeviceType; + +/*! + * \brief A Device for Tensor and operator. + */ +// NB: This is the only difference from +// https://github.com/dmlc/dlpack/blob/v0.7/include/dlpack/dlpack.h Required to +// allow forward declaration of DLDevice. +typedef struct DLDevice_ { + /*! \brief The device type used in the device. */ + DLDeviceType device_type; + /*! + * \brief The device index. + * For vanilla CPU memory, pinned memory, or managed memory, this is set to 0. + */ + int32_t device_id; +} DLDevice; + +/*! + * \brief The type code options DLDataType. + */ +typedef enum { + /*! \brief signed integer */ + kDLInt = 0U, + /*! \brief unsigned integer */ + kDLUInt = 1U, + /*! \brief IEEE floating point */ + kDLFloat = 2U, + /*! + * \brief Opaque handle type, reserved for testing purposes. + * Frameworks need to agree on the handle data type for the exchange to be + * well-defined. 
+ */ + kDLOpaqueHandle = 3U, + /*! \brief bfloat16 */ + kDLBfloat = 4U, + /*! + * \brief complex number + * (C/C++/Python layout: compact struct per complex number) + */ + kDLComplex = 5U, +} DLDataTypeCode; + +/*! + * \brief The data type the tensor can hold. The data type is assumed to follow + * the native endian-ness. An explicit error message should be raised when + * attempting to export an array with non-native endianness + * + * Examples + * - float: type_code = 2, bits = 32, lanes=1 + * - float4(vectorized 4 float): type_code = 2, bits = 32, lanes=4 + * - int8: type_code = 0, bits = 8, lanes=1 + * - std::complex: type_code = 5, bits = 64, lanes = 1 + */ +typedef struct { + /*! + * \brief Type code of base types. + * We keep it uint8_t instead of DLDataTypeCode for minimal memory + * footprint, but the value should be one of DLDataTypeCode enum values. + * */ + uint8_t code; + /*! + * \brief Number of bits, common choices are 8, 16, 32. + */ + uint8_t bits; + /*! \brief Number of lanes in the type, used for vector types. */ + uint16_t lanes; +} DLDataType; + +/*! + * \brief Plain C Tensor object, does not manage memory. + */ +typedef struct { + /*! + * \brief The data pointer points to the allocated data. This will be CUDA + * device pointer or cl_mem handle in OpenCL. It may be opaque on some device + * types. This pointer is always aligned to 256 bytes as in CUDA. The + * `byte_offset` field should be used to point to the beginning of the data. + * + * Note that as of Nov 2021, multiply libraries (CuPy, PyTorch, TensorFlow, + * TVM, perhaps others) do not adhere to this 256 byte aligment requirement + * on CPU/CUDA/ROCm, and always use `byte_offset=0`. This must be fixed + * (after which this note will be updated); at the moment it is recommended + * to not rely on the data pointer being correctly aligned. + * + * For given DLTensor, the size of memory required to store the contents of + * data is calculated as follows: + * + * \code{.c} + * static inline size_t GetDataSize(const DLTensor* t) { + * size_t size = 1; + * for (tvm_index_t i = 0; i < t->ndim; ++i) { + * size *= t->shape[i]; + * } + * size *= (t->dtype.bits * t->dtype.lanes + 7) / 8; + * return size; + * } + * \endcode + */ + void* data; + /*! \brief The device of the tensor */ + DLDevice device; + /*! \brief Number of dimensions */ + int32_t ndim; + /*! \brief The data type of the pointer*/ + DLDataType dtype; + /*! \brief The shape of the tensor */ + int64_t* shape; + /*! + * \brief strides of the tensor (in number of elements, not bytes) + * can be NULL, indicating tensor is compact and row-majored. + */ + int64_t* strides; + /*! \brief The offset in bytes to the beginning pointer to data */ + uint64_t byte_offset; +} DLTensor; + +/*! + * \brief C Tensor object, manage memory of DLTensor. This data structure is + * intended to facilitate the borrowing of DLTensor by another framework. It is + * not meant to transfer the tensor. When the borrowing framework doesn't need + * the tensor, it should call the deleter to notify the host that the resource + * is no longer needed. + */ +typedef struct DLManagedTensor { + /*! \brief DLTensor which is being memory managed */ + DLTensor dl_tensor; + /*! \brief the context of the original host framework of DLManagedTensor in + * which DLManagedTensor is used in the framework. It can also be NULL. + */ + void* manager_ctx; + /*! \brief Destructor signature void (*)(void*) - this should be called + * to destruct manager_ctx which holds the DLManagedTensor. 
It can be NULL + * if there is no way for the caller to provide a reasonable destructor. + * The destructors deletes the argument self as well. + */ + void (*deleter)(struct DLManagedTensor* self); +} DLManagedTensor; +#ifdef __cplusplus +} // DLPACK_EXTERN_C +#endif +#endif // DLPACK_DLPACK_H_ diff --git a/kokkostbx/kokkos_dlpack.h b/kokkostbx/kokkos_dlpack.h new file mode 100644 index 0000000000..dedddf6264 --- /dev/null +++ b/kokkostbx/kokkos_dlpack.h @@ -0,0 +1,92 @@ +#ifndef KOKKOS_DLPACK_H +#define KOKKOS_DLPACK_H +#include +#include + +namespace kokkostbx { + +template +DLDataTypeCode getDLPackTypeCode() { + using ValueType = typename Kokkos::View::value_type; + if (std::is_same::value) { + return kDLFloat; + } else if (std::is_same::value) { + return kDLFloat; + } else if (std::is_same::value) { + return kDLInt; + } else if (std::is_same::value) { + return kDLUInt; + // } else if (std::is_same::value) { + // return kDLBool; + } else { + // Unsupported data type + throw std::runtime_error("Unsupported data type for DLPack conversion"); + } +} + +template +DLDataType getDLPackDataType() { + DLDataType dtype; + dtype.code = getDLPackTypeCode(); + dtype.bits = sizeof(typename Kokkos::View::value_type) * 8; + dtype.lanes = 1; + return dtype; +} + +template +DLDevice getDLPackDevice() { + const int device_id = std::max(0, Kokkos::device_id()); // convert host id from -1 to 0 + + if (std::is_same::value) { + return {kDLCPU, device_id}; + } +#ifdef KOKKOS_ENABLE_CUDA + else if (std::is_same::value) { + return {kDLCUDA, device_id}; + } else if (std::is_same::value) { + return {kDLCUDAManaged, device_id}; + } else if (std::is_same::value) { + return {kDLCUDAHost, device_id}; + } +#endif +#ifdef KOKKOS_ENABLE_HIP + else if (std::is_same::value) { + return {kDLROCM, device_id}; + } else if (std::is_same::value) { + return {kDLROCMHost, device_id}; + } +#endif + else { + // Extend to other device types as needed + throw std::runtime_error("Unsupported Kokkos device type for DLPack conversion."); + } +} + +template +DLManagedTensor* view_to_dlpack(Kokkos::View& view) { + // Get the Kokkos view size and dimensions + constexpr size_t numDims = Kokkos::View::rank; + int64_t* shape = new int64_t[numDims]; + for (size_t i = 0; i < numDims; i++) { + shape[i] = view.extent(i); + } + + // Create a DLPack tensor + DLManagedTensor* dlpackTensor = new DLManagedTensor; + dlpackTensor->dl_tensor.data = view.data(); + dlpackTensor->dl_tensor.device = getDLPackDevice(); + dlpackTensor->dl_tensor.ndim = numDims; + dlpackTensor->dl_tensor.dtype = getDLPackDataType(); + dlpackTensor->dl_tensor.shape = shape; + dlpackTensor->dl_tensor.strides = nullptr; + dlpackTensor->dl_tensor.byte_offset = 0; + dlpackTensor->manager_ctx = nullptr; + dlpackTensor->deleter = [](DLManagedTensor* tensor) { + delete[] tensor->dl_tensor.shape; + }; + return dlpackTensor; +} + +} + +#endif // KOKKOS_DLPACK_H diff --git a/libtbx/citations.params b/libtbx/citations.params index 3b10bf44aa..0a4a3b2d0a 100644 --- a/libtbx/citations.params +++ b/libtbx/citations.params @@ -484,12 +484,12 @@ citation { article_id = mmseqs2b authors = Mirdita M, Steinegger M, Söding J title = MMseqs2 desktop and local web server app for fast interactive sequence searches - journal = Bioinformatics + journal = Bioinformatics volume = 35 - pages = 2856-2858 + pages = 2856-2858 year = 2019 doi_id = 10.1093/bioinformatics/bty1057 - pmid = 30615063 + pmid = 30615063 } citation { article_id = mmseqs2 diff --git a/simtbx/diffBragg/attr_list.py 
b/simtbx/diffBragg/attr_list.py index f961528cd0..fdf7d65459 100644 --- a/simtbx/diffBragg/attr_list.py +++ b/simtbx/diffBragg/attr_list.py @@ -29,6 +29,7 @@ 'fluence', 'flux', 'has_anisotropic_mosaic_spread', + 'host_transfer', 'interpolate', 'isotropic_ncells', 'lambda_coefficients', diff --git a/simtbx/diffBragg/hopper_ensemble_utils.py b/simtbx/diffBragg/hopper_ensemble_utils.py index f15f608d10..c3e0d712aa 100644 --- a/simtbx/diffBragg/hopper_ensemble_utils.py +++ b/simtbx/diffBragg/hopper_ensemble_utils.py @@ -5,6 +5,7 @@ import socket import logging import os +import torch import numpy as np from scipy.optimize import basinhopping @@ -86,7 +87,7 @@ def __call__(self, x, *args, **kwargs): if modelers.SIM.D.record_timings: modelers.SIM.D.show_timings() - return f + return f.item() def target_func(x, modelers): @@ -105,7 +106,7 @@ def target_func(x, modelers): f = 0 # target functional g = np.zeros(modelers.num_total_modelers * num_shot_params) - g_fhkl = np.zeros(num_fhkl_params) + g_fhkl = torch.zeros(num_fhkl_params) zscore_sigs = [] fcell_params = x[-num_fhkl_params:] for ii, i_shot in enumerate(modelers): @@ -126,13 +127,13 @@ def target_func(x, modelers): # data contributions to target function V = model_pix + shot_modeler.all_sigma_rdout**2 resid_square = resid**2 - shot_fLogLike = (.5*(np.log(2*np.pi*V) + resid_square / V)) + shot_fLogLike = (.5*(torch.log(2*np.pi*V) + resid_square / V)) if shot_modeler.params.roi.allow_overlapping_spots: shot_fLogLike /= shot_modeler.all_freq shot_fLogLike = shot_fLogLike[shot_modeler.all_trusted].sum() # negative log Likelihood target f += shot_fLogLike - zscore_sig = np.std((resid / np.sqrt(V))[shot_modeler.all_trusted]) + zscore_sig = torch.std((resid / torch.sqrt(V))[shot_modeler.all_trusted]).item() zscore_sigs.append(zscore_sig) # get this shots contribution to the gradient @@ -145,14 +146,16 @@ def target_func(x, modelers): for name in shot_modeler.non_fhkl_params: p = shot_modeler.P[name] Jac_p = Jac[p.xpos] - shot_g[p.xpos] += (Jac_p[shot_modeler.all_trusted] * common_grad_term).sum() + shot_g[p.xpos] += (Jac_p[shot_modeler.all_trusted] * common_grad_term).sum().item() np.add.at(g, shot_x_slice, shot_g) spot_scale_p = shot_modeler.P["G_xtal0"] G = spot_scale_p.get_val(x[spot_scale_p.xpos]) g_fhkl += modelers.SIM.D.add_Fhkl_gradients( - shot_modeler.pan_fast_slow, resid, V, shot_modeler.all_trusted, - shot_modeler.all_freq, modelers.SIM.num_Fhkl_channels, G) + shot_modeler.pan_fast_slow, resid.cpu().numpy(), V.cpu().numpy(), shot_modeler.all_trusted.cpu().numpy(), + shot_modeler.all_freq.cpu().numpy(), modelers.SIM.num_Fhkl_channels, G) + if not modelers.SIM.D.host_transfer: + g_fhkl += torch.from_dlpack(modelers.SIM.D.get_Fhkl_scale_deriv()) # add up target and gradients across all ranks f = COMM.bcast(COMM.reduce(f)) @@ -495,7 +498,7 @@ def save_up(self, x, ref_iter=None): if i_shot % 100==0: MAIN_LOGGER.info("Getting Fhkl errors for shot %d/%d ... 
" % (i_shot+1, self.num_modelers)) Fhkl_scale_hessian += self.SIM.D.add_Fhkl_gradients( - mod.pan_fast_slow, resid, V, mod.all_trusted, mod.all_freq, + mod.pan_fast_slow, resid.cpu().numpy(), V.cpu().numpy(), mod.all_trusted.cpu().numpy(), mod.all_freq.cpu().numpy(), self.SIM.num_Fhkl_channels, G, track=False, errors=True) # ------------ diff --git a/simtbx/diffBragg/hopper_utils.py b/simtbx/diffBragg/hopper_utils.py index 5acd3ced80..9b6a74eb37 100644 --- a/simtbx/diffBragg/hopper_utils.py +++ b/simtbx/diffBragg/hopper_utils.py @@ -2,6 +2,7 @@ import time import os import json +import torch from dials.algorithms.shoebox import MaskCode from copy import deepcopy from dials.model.data import Shoebox @@ -24,7 +25,7 @@ from simtbx.diffBragg import utils from simtbx.diffBragg.refiners.parameters import RangedParameter, Parameters, PositiveParameter from simtbx.diffBragg.attr_list import NB_BEAM_ATTRS, NB_CRYST_ATTRS, DIFFBRAGG_ATTRS -from simtbx.diffBragg import psf +from simtbx.diffBragg import psf, kokkos_device try: from line_profiler import LineProfiler @@ -683,25 +684,26 @@ def data_to_one_dim(self, img_data, is_trusted, background): x1, x2, y1, y2 = self.rois[i_roi] freq = pixel_counter[pid, y1:y2, x1:x2].ravel() all_freq += list(freq) - self.all_freq = np.array(all_freq, np.int32) # if no overlapping pixels, this should be an array of 1's + self.all_freq = torch.tensor(all_freq, dtype=torch.int32, device=kokkos_device()) # if no overlapping pixels, this should be an array of 1's if not self.params.roi.allow_overlapping_spots: - if not np.all(self.all_freq==1): - print(set(self.all_freq)) + if not torch.all(self.all_freq==1): + print(set(self.all_freq.cpu().numpy())) raise ValueError("There are overlapping regions of interest, despite the command to not allow overlaps") self.all_q_perpix = np.array(all_q_perpix) pan_fast_slow = np.ascontiguousarray((np.vstack([all_pid, all_fast, all_slow]).T).ravel()) self.pan_fast_slow = flex.size_t(pan_fast_slow) - self.all_background = np.array(all_background) + self.all_background = torch.tensor(all_background, device=kokkos_device()) self.roi_id = np.array(roi_id) - self.all_data = np.array(all_data) + self.all_data = torch.tensor(all_data, device=kokkos_device()) if np.allclose(all_sigma_rdout, self.nominal_sigma_rdout): self.all_sigma_rdout = self.nominal_sigma_rdout else: self.all_sigma_rdout = np.array(all_sigma_rdout) - self.all_sigmas = np.array(all_sigmas) + self.all_sigmas = torch.tensor(all_sigmas, device=kokkos_device()) # note rare chance for sigmas to be nan if the args of sqrt is below 0 - self.all_trusted = np.logical_and(np.array(all_trusted), ~np.isnan(all_sigmas)) + all_trusted = torch.tensor(all_trusted, device=kokkos_device()) + self.all_trusted = torch.logical_and(all_trusted, ~torch.isnan(self.all_sigmas)) if self.params.roi.skip_roi_with_negative_bg: # Dont include pixels whose background model is below 0 @@ -728,18 +730,18 @@ def dump_gathered_to_refl(self, output_name, do_xyobs_sanity_check=False): roi_sel = self.roi_id==i_roi x1, x2, y1, y2 = self.rois[i_roi] roi_shape = y2-y1, x2-x1 - roi_img = self.all_data[roi_sel].reshape(roi_shape).astype(np.float32) #NOTE this has already been converted to photon units - roi_bg = self.all_background[roi_sel].reshape(roi_shape).astype(np.float32) + roi_img = self.all_data[roi_sel].reshape(roi_shape).float() #NOTE this has already been converted to photon units + roi_bg = self.all_background[roi_sel].reshape(roi_shape).float() sb = Shoebox((x1, x2, y1, y2, 0, 1)) sb.allocate() - sb.data 
= flex.float(np.ascontiguousarray(roi_img[None])) - sb.background = flex.float(np.ascontiguousarray(roi_bg[None])) + sb.data = flex.float(roi_img[None].cpu().contiguous().numpy()) + sb.background = flex.float(roi_bg[None].cpu().contiguous().numpy()) - dials_mask = np.zeros(roi_img.shape).astype(np.int32) + dials_mask = torch.zeros(roi_img.shape, device=kokkos_device()).int() mask = self.all_trusted[roi_sel].reshape(roi_shape) dials_mask[mask] = dials_mask[mask] + MaskCode.Valid - sb.mask = flex.int(np.ascontiguousarray(dials_mask[None])) + sb.mask = flex.int(dials_mask[None].cpu().contiguous().numpy()) # quick sanity test if do_xyobs_sanity_check: @@ -1248,7 +1250,7 @@ def save_up(self, x, SIM, rank=0, i_shot=0, SIM.D.force_cpu = True MAIN_LOGGER.info("Getting Fhkl errors (forcing CPUkernel usage)... might take some time") Fhkl_scale_errors = SIM.D.add_Fhkl_gradients( - self.pan_fast_slow, resid, V, self.all_trusted, self.all_freq, + self.pan_fast_slow, resid.cpu().numpy(), V.cpu().numpy(), self.all_trusted.cpu().numpy(), self.all_freq.cpu().numpy(), SIM.num_Fhkl_channels, G, track=True, errors=True) SIM.D.force_gpu = force_cpu # ------------ @@ -1336,21 +1338,22 @@ def save_up(self, x, SIM, rank=0, i_shot=0, fit = model_subimg[i_roi] trust = trusted_subimg[i_roi] if sigma_rdout_subimg is not None: - sig = np.sqrt(fit + sigma_rdout_subimg[i_roi] ** 2) + sig = torch.sqrt(fit + sigma_rdout_subimg[i_roi] ** 2) else: - sig = np.sqrt(fit + Modeler.nominal_sigma_rdout ** 2) + sig = torch.sqrt(fit + Modeler.nominal_sigma_rdout ** 2) Z = (dat - fit) / sig sigmaZ = np.nan - if np.any(trust): - sigmaZ = Z[trust].std() + if torch.any(trust): + sigmaZ = Z[trust].std().item() sigmaZs.append(sigmaZ) if bragg_subimg[0] is not None: - if np.any(bragg_subimg[i_roi] > 0): + if torch.any(bragg_subimg[i_roi] > 0): ref_idx = Modeler.refls_idx[i_roi] ref = Modeler.refls[ref_idx] I = bragg_subimg[i_roi] - Y, X = np.indices(bragg_subimg[i_roi].shape) + ny, nx = bragg_subimg[i_roi].shape + Y, X = torch.meshgrid(torch.arange(ny, device=kokkos_device()), torch.arange(nx, device=kokkos_device()), indexing='ij') x1, x2, y1, y2 = Modeler.rois[i_roi] com_x, com_y, _ = ref["xyzobs.px.value"] com_x = int(com_x - x1 - 0.5) @@ -1361,11 +1364,11 @@ def save_up(self, x, SIM, rank=0, i_shot=0, continue except IndexError: continue - X += x1 - Y += y1 - Isum = I.sum() - xcom = (X * I).sum() / Isum - ycom = (Y * I).sum() / Isum + X = X + x1 + Y = Y + y1 + Isum = I.sum().item() + xcom = (X * I).sum().item() / Isum + ycom = (Y * I).sum().item() / Isum com = xcom + .5, ycom + .5, 0 new_xycalcs[ref_idx] = com if not Modeler.params.fix.perRoiScale: @@ -1570,7 +1573,7 @@ def model(x, Mod, SIM, compute_grad=True, dont_rescale_gradient=False, update_s J = None if compute_grad: # This should be all params save the Fhkl params - J = np.zeros((nparam-SIM.Num_ASU*SIM.num_Fhkl_channels, npix)) # gradients + J = torch.zeros((nparam-SIM.Num_ASU*SIM.num_Fhkl_channels, npix), device=kokkos_device()) # gradients model_pix = None #TODO check roiScales mode and if its broken, git rid of it! 
@@ -1581,7 +1584,7 @@ def model(x, Mod, SIM, compute_grad=True, dont_rescale_gradient=False, update_s if not Mod.params.fix.perRoiScale: perRoiParams = [Mod.P["scale_roi%d" % roi_id] for roi_id in Mod.roi_id_unique] perRoiScaleFactors = [p.get_val(x[p.xpos]) for p in perRoiParams] - roiScalesPerPix = np.zeros(npix) + roiScalesPerPix = torch.zeros(npix, device=kokkos_device()) for i_roi, roi_id in enumerate(Mod.roi_id_unique): slc = Mod.roi_id_slices[roi_id][0] roiScalesPerPix[slc] = perRoiScaleFactors[i_roi] @@ -1615,8 +1618,7 @@ def model(x, Mod, SIM, compute_grad=True, dont_rescale_gradient=False, update_s SIM.D.add_diffBragg_spots(pfs) - pix_noRoiScale = SIM.D.raw_pixels_roi[:npix] - pix_noRoiScale = pix_noRoiScale.as_numpy_array() + pix_noRoiScale = torch.from_dlpack(SIM.D.get_floatimage())[:npix] pix = pix_noRoiScale * roiScalesPerPix @@ -1634,16 +1636,17 @@ def model(x, Mod, SIM, compute_grad=True, dont_rescale_gradient=False, update_s J[G.xpos] += scale_grad if RotXYZ_params[0].refine: + rot_grads = scale * torch.from_dlpack(SIM.D.get_d_Umat_images()) for i_rot in range(3): - rot_grad = scale * SIM.D.get_derivative_pixels(ROTXYZ_IDS[i_rot]).as_numpy_array()[:npix] + rot_grad = rot_grads[i_rot*npix:(i_rot+1)*npix] rot_p = RotXYZ_params[i_rot] rot_grad = rot_p.get_deriv(x[rot_p.xpos], rot_grad) J[rot_p.xpos] += rot_grad if Nabc_params[0].refine: - Nabc_grads = SIM.D.get_ncells_derivative_pixels() + Nabc_grads = scale * torch.from_dlpack(SIM.D.get_d_Ncells_images())[:3*npix] for i_n in range(3): - N_grad = scale*(Nabc_grads[i_n][:npix].as_numpy_array()) + N_grad = Nabc_grads[i_n*npix:(i_n+1)*npix] p = Nabc_params[i_n] N_grad = p.get_deriv(x[p.xpos], N_grad) J[p.xpos] += N_grad @@ -1651,53 +1654,51 @@ def model(x, Mod, SIM, compute_grad=True, dont_rescale_gradient=False, update_s break if Ndef_params[0].refine: - Ndef_grads = SIM.D.get_ncells_def_derivative_pixels() + Ndef_grads = scale * torch.from_dlpack(SIM.D.get_d_Ncells_images())[3*npix:] for i_n in range(3): - N_grad = scale * (Ndef_grads[i_n][:npix].as_numpy_array()) + N_grad = Ndef_grads[i_n*npix:(i_n+1)*npix] p = Ndef_params[i_n] N_grad = p.get_deriv(x[p.xpos], N_grad) J[p.xpos] += N_grad if SIM.D.use_diffuse: for t in ['gamma','sigma']: - diffuse_grads = getattr(SIM.D, "get_diffuse_%s_derivative_pixels" % t)() + diffuse_grads = scale * torch.from_dlpack( getattr(SIM.D, "get_d_diffuse_%s_images" % t)() ) if diffuse_params_lookup[t][0].refine: for i_diff in range(3): - diff_grad = scale*(diffuse_grads[i_diff][:npix].as_numpy_array()) + diff_grad = diffuse_grads[i_diff*npix:(i_diff+1)*npix] p = diffuse_params_lookup[t][i_diff] diff_grad = p.get_deriv(x[p.xpos], diff_grad) J[p.xpos] += diff_grad if eta_params[0].refine: - if SIM.D.has_anisotropic_mosaic_spread: - eta_derivs = SIM.D.get_aniso_eta_deriv_pixels() - else: - eta_derivs = [SIM.D.get_derivative_pixels(ETA_ID)] + eta_derivs = scale * torch.from_dlpack(SIM.D.get_d_eta_images()) num_eta = 3 if SIM.D.has_anisotropic_mosaic_spread else 1 for i_eta in range(num_eta): p = eta_params[i_eta] - eta_grad = scale * (eta_derivs[i_eta][:npix].as_numpy_array()) + eta_grad = eta_derivs[i_eta*npix:(i_eta+1)*npix] eta_grad = p.get_deriv(x[p.xpos], eta_grad) J[p.xpos] += eta_grad if ucell_params[0].refine: + ucell_grads = scale * torch.from_dlpack(SIM.D.get_d_Bmat_images()) for i_ucell in range(nucell): p = ucell_params[i_ucell] - deriv = scale*SIM.D.get_derivative_pixels(UCELL_ID_OFFSET+i_ucell).as_numpy_array()[:npix] + deriv = ucell_grads[i_ucell*npix: (i_ucell+1)*npix] deriv = 
p.get_deriv(x[p.xpos], deriv) J[p.xpos] += deriv if DetZ.refine: - d = SIM.D.get_derivative_pixels(DETZ_ID).as_numpy_array()[:npix] + d = torch.from_dlpack(SIM.D.get_d_panel_orig_images())[npix:2*npix] d = DetZ.get_deriv(x[DetZ.xpos], d) J[DetZ.xpos] += d if Mod.P["lambda_offset"].refine: - lambda_derivs = SIM.D.get_lambda_derivative_pixels() + lambda_derivs = torch.from_dlpack(SIM.D.get_d_lambda_images()) lambda_param_names = "lambda_offset", "lambda_scale" - for d,name in zip(lambda_derivs, lambda_param_names): + for i_lmbd,name in enumerate(lambda_param_names): p = Mod.P[name] - d = d.as_numpy_array()[:npix] + d = lambda_derivs[i_lmbd*npix:(i_lmbd+1)*npix] d = p.get_deriv(x[p.xpos], d) J[p.xpos] += d @@ -1907,14 +1908,14 @@ def target_func(x, udpate_terms, mod, SIM, compute_grad=True, return_all_zscores V = model_pix + sigma_rdout**2 # TODO:what if V is allowed to be negative? The logarithm/sqrt will explore below resid_square = resid**2 - fLogLike = (.5*(np.log(2*np.pi*V) + resid_square / V)) + fLogLike = (.5*(torch.log(2*torch.pi*V) + resid_square / V)) if params.roi.allow_overlapping_spots: fLogLike /= mod.all_freq - fLogLike = fLogLike[trusted].sum() # negative log Likelihood target + fLogLike = fLogLike[trusted].sum().item() # negative log Likelihood target # width of z-score should decrease as refinement proceeds - zscore_per = resid/np.sqrt(V) - zscore_sigma = np.std(zscore_per[trusted]) + zscore_per = resid/torch.sqrt(V) + zscore_sigma = torch.std(zscore_per[trusted]).item() restraint_terms = {} if params.use_restraints: @@ -2041,8 +2042,10 @@ def target_func(x, udpate_terms, mod, SIM, compute_grad=True, return_all_zscores if SIM.refining_Fhkl: spot_scale_p = mod.P["G_xtal0"] G = spot_scale_p.get_val(x[spot_scale_p.xpos]) - fhkl_grad = SIM.D.add_Fhkl_gradients(pfs, resid, V, trusted, - mod.all_freq, SIM.num_Fhkl_channels, G) + fhkl_grad = SIM.D.add_Fhkl_gradients(pfs, resid.cpu().numpy(), V.cpu().numpy(), trusted.cpu().numpy(), + mod.all_freq.cpu().numpy(), SIM.num_Fhkl_channels, G) + if not SIM.D.host_transfer: + fhkl_grad = torch.from_dlpack(SIM.D.get_Fhkl_scale_deriv()).cpu().numpy() if params.betas.Fhkl is not None: for i_chan in range(SIM.num_Fhkl_channels): @@ -2057,7 +2060,6 @@ def target_func(x, udpate_terms, mod, SIM, compute_grad=True, return_all_zscores gnorm = np.linalg.norm(g) - debug_s = "F=%10.7g sigZ=%10.7g (Fracs of F: %s), |g|=%10.7g" \ % (f, zscore_sigma, restraint_debug_s, gnorm) @@ -2080,6 +2082,11 @@ def refine(exp, ref, params, spec=None, gpu_device=None, return_modeler=False, b SIM = get_simulator_for_data_modelers(Modeler) Modeler.set_parameters_for_experiment(best=best) SIM.D.device_Id = gpu_device + old_transfer = None + if os.environ.get("DIFFBRAGG_USE_KOKKOS") is not None: + if SIM.D.host_transfer == True: + old_transfer = True + SIM.D.host_transfer = False nparam = len(Modeler.P) if SIM.refining_Fhkl: @@ -2107,6 +2114,9 @@ def refine(exp, ref, params, spec=None, gpu_device=None, return_modeler=False, b if free_mem: Modeler.clean_up(SIM) + if old_transfer is not None: + SIM.D.host_transfer = old_transfer + if return_modeler: return new_exp, new_refl, Modeler, SIM, x @@ -2189,12 +2199,12 @@ def get_new_xycalcs(Modeler, new_exp, old_refl_tag="dials"): for i_roi in range(len(bragg_subimg)): ref_idx = Modeler.refls_idx[i_roi] - #assert ref_idx==i_roi - if np.any(bragg_subimg[i_roi] > 0): + if torch.any(bragg_subimg[i_roi] > 0): I = bragg_subimg[i_roi] - assert np.all(I>=0) - Y, X = np.indices(bragg_subimg[i_roi].shape) + assert torch.all(I>=0) + ny, nx = 
bragg_subimg[i_roi].shape + X, Y = torch.meshgrid(torch.arange(nx, device=kokkos_device()), torch.arange(ny, device=kokkos_device()), indexing='xy') x1, _, y1, _ = Modeler.rois[i_roi] com_x, com_y, _ = new_refls[ref_idx]["xyzobs.px.value"] @@ -2207,11 +2217,11 @@ def get_new_xycalcs(Modeler, new_exp, old_refl_tag="dials"): except IndexError: continue - X += x1 - Y += y1 - Isum = I.sum() - xcom = (X * I).sum() / Isum + .5 - ycom = (Y * I).sum() / Isum + .5 + X = X + x1 + Y = Y + y1 + Isum = I.sum().item() + xcom = (X * I).sum().item() / Isum + .5 + ycom = (Y * I).sum().item() / Isum + .5 com = xcom, ycom, 0 pid = Modeler.pids[i_roi] diff --git a/simtbx/diffBragg/src/diffBragg.cpp b/simtbx/diffBragg/src/diffBragg.cpp index a42cc5b960..f4b29476d4 100644 --- a/simtbx/diffBragg/src/diffBragg.cpp +++ b/simtbx/diffBragg/src/diffBragg.cpp @@ -7,6 +7,7 @@ #include #include #include +#include "diffBragg.h" namespace np=boost::python::numpy; @@ -363,6 +364,7 @@ diffBragg::diffBragg(const dxtbx::model::Detector& detector, const dxtbx::model: O_reference <<0,0,0; + host_transfer = true; update_oversample_during_refinement = true; oversample_omega = true; only_save_omega_kahn = false; @@ -1507,6 +1509,133 @@ boost::python::tuple diffBragg::get_ncells_derivative_pixels(){ return derivative_pixels; } +void dlpack_destructor(PyObject* capsule) { + if (!PyCapsule_IsValid(capsule, "dltensor")) { + return; + } + + // If the capsule has not been used, we need to delete it + DLManagedTensor* dlpackTensor = static_cast(PyCapsule_GetPointer(capsule, "dltensor")); + dlpackTensor->deleter(dlpackTensor); + delete dlpackTensor; +} + + +// Fun with pointer-to-member-functions +PyObject* diffBragg::PyCapsule_Wrapper( DLManagedTensor* (diffBraggKOKKOS::*func)(), image_type &vec) { +#ifdef DIFFBRAGG_HAVE_KOKKOS + if (use_gpu || getenv("DIFFBRAGG_USE_KOKKOS")!=NULL){ + if (diffBragg_runner == nullptr) { + return nullptr; + } + return PyCapsule_New((*diffBragg_runner.*func)(), "dltensor", dlpack_destructor); + } +#endif + return PyCapsule_New(array_to_dlpack(vec.data(), vec.size()), "dltensor", dlpack_destructor); +} + +PyObject* diffBragg::get_floatimage() { +#ifdef DIFFBRAGG_HAVE_KOKKOS + if (use_gpu || getenv("DIFFBRAGG_USE_KOKKOS")!=NULL) { + if (diffBragg_runner == nullptr) { + return nullptr; + } + return PyCapsule_New(diffBragg_runner->get_floatimage(), "dltensor", dlpack_destructor); + } +#endif + return PyCapsule_New(array_to_dlpack(raw_pixels_roi.begin(), Npix_to_model), "dltensor", dlpack_destructor); +} + +PyObject* diffBragg::get_wavelenimage() { + return PyCapsule_Wrapper(&diffBraggKOKKOS::get_wavelenimage, first_deriv_imgs.wavelength); +} + +PyObject* diffBragg::get_d_diffuse_gamma_images() { + return PyCapsule_Wrapper(&diffBraggKOKKOS::get_d_diffuse_gamma_images, first_deriv_imgs.diffuse_gamma); +} + +PyObject* diffBragg::get_d_diffuse_sigma_images() { + return PyCapsule_Wrapper(&diffBraggKOKKOS::get_d_diffuse_sigma_images, first_deriv_imgs.diffuse_sigma); +} + +PyObject* diffBragg::get_d_Umat_images() { + return PyCapsule_Wrapper(&diffBraggKOKKOS::get_d_Umat_images, first_deriv_imgs.Umat); +} + +PyObject* diffBragg::get_d2_Umat_images() { + return PyCapsule_Wrapper(&diffBraggKOKKOS::get_d2_Umat_images, second_deriv_imgs.Umat); +} + +PyObject* diffBragg::get_d_Bmat_images() { + return PyCapsule_Wrapper(&diffBraggKOKKOS::get_d_Bmat_images, first_deriv_imgs.Bmat); +} + +PyObject* diffBragg::get_d2_Bmat_images() { + return PyCapsule_Wrapper(&diffBraggKOKKOS::get_d2_Bmat_images, second_deriv_imgs.Bmat); +} + 
+PyObject* diffBragg::get_d_Ncells_images() { + return PyCapsule_Wrapper(&diffBraggKOKKOS::get_d_Ncells_images, first_deriv_imgs.Ncells); +} + +PyObject* diffBragg::get_d2_Ncells_images() { + return PyCapsule_Wrapper(&diffBraggKOKKOS::get_d2_Ncells_images, second_deriv_imgs.Ncells); +} + +PyObject* diffBragg::get_d_fcell_images() { + return PyCapsule_Wrapper(&diffBraggKOKKOS::get_d_fcell_images, first_deriv_imgs.fcell); +} + +PyObject* diffBragg::get_d2_fcell_images() { + return PyCapsule_Wrapper(&diffBraggKOKKOS::get_d2_fcell_images, second_deriv_imgs.fcell); +} + +PyObject* diffBragg::get_d_eta_images() { + return PyCapsule_Wrapper(&diffBraggKOKKOS::get_d_eta_images, first_deriv_imgs.eta); +} + +PyObject* diffBragg::get_d2_eta_images() { + return PyCapsule_Wrapper(&diffBraggKOKKOS::get_d2_eta_images, second_deriv_imgs.eta); +} + +PyObject* diffBragg::get_d_lambda_images() { + return PyCapsule_Wrapper(&diffBraggKOKKOS::get_d_lambda_images, first_deriv_imgs.lambda); +} + +PyObject* diffBragg::get_d2_lambda_images() { + return PyCapsule_Wrapper(&diffBraggKOKKOS::get_d2_lambda_images, second_deriv_imgs.lambda); +} + +PyObject* diffBragg::get_d_panel_rot_images() { + return PyCapsule_Wrapper(&diffBraggKOKKOS::get_d_panel_rot_images, first_deriv_imgs.panel_rot); +} + +PyObject* diffBragg::get_d2_panel_rot_images() { + return PyCapsule_Wrapper(&diffBraggKOKKOS::get_d2_panel_rot_images, second_deriv_imgs.panel_rot); +} + +PyObject* diffBragg::get_d_panel_orig_images() { + return PyCapsule_Wrapper(&diffBraggKOKKOS::get_d_panel_orig_images, first_deriv_imgs.panel_orig); +} + +PyObject* diffBragg::get_d2_panel_orig_images() { + return PyCapsule_Wrapper(&diffBraggKOKKOS::get_d2_panel_orig_images, second_deriv_imgs.panel_orig); +} + +PyObject* diffBragg::get_d_fp_fdp_images() { + return PyCapsule_Wrapper(&diffBraggKOKKOS::get_d_fp_fdp_images, first_deriv_imgs.fp_fdp); +} + +PyObject* diffBragg::get_Fhkl_scale_deriv() { + return PyCapsule_Wrapper(&diffBraggKOKKOS::get_Fhkl_scale_deriv, first_deriv_imgs.Fhkl_scale_deriv); +} + +PyObject* diffBragg::get_Fhkl_hessian() { + // Fhkl_scale_deriv is overloaded, depending on Fhkl_errors_mode + return PyCapsule_Wrapper(&diffBraggKOKKOS::get_Fhkl_scale_deriv, first_deriv_imgs.Fhkl_hessian); +} + + boost::python::tuple diffBragg::get_diffuse_gamma_derivative_pixels(){ SCITBX_ASSERT(db_flags.refine_diffuse); int Npix_total = first_deriv_imgs.diffuse_gamma.size() / 3; @@ -1847,7 +1976,6 @@ void diffBragg::add_diffBragg_spots(const af::shared& panels_fasts_slows Npix_to_model = panels_fasts_slows.size()/3; SCITBX_ASSERT(Npix_to_model <= Npix_total); - double * floatimage_roi = raw_pixels_roi.begin(); diffBragg_rot_mats(); /* make sure we are normalizing with the right number of sub-steps */ @@ -1939,6 +2067,7 @@ void diffBragg::add_diffBragg_spots(const af::shared& panels_fasts_slows db_flags.refine_fp_fdp = fp_fdp_managers[0]->refine_me; db_flags.use_lambda_coefficients = use_lambda_coefficients; db_flags.oversample_omega = oversample_omega; + db_flags.host_transfer = host_transfer; db_flags.printout_fpixel = printout_fpixel; db_flags.printout_spixel = printout_spixel; db_flags.verbose = verbose; @@ -2167,81 +2296,84 @@ void diffBragg::add_diffBragg_spots(const af::shared& panels_fasts_slows gettimeofday(&t1,0 ); - for (int i_pix=0; i_pix< Npix_to_model; i_pix++){ - floatimage_roi[i_pix] = image[i_pix]; + if (db_flags.host_transfer) { + double * floatimage_roi = raw_pixels_roi.begin(); + for (int i_pix=0; i_pix< Npix_to_model; i_pix++){ + floatimage_roi[i_pix] = 
image[i_pix]; - for (int i_rot=0; i_rot<3; i_rot++){ - if (rot_managers[i_rot]->refine_me){ - int idx = i_rot*Npix_to_model + i_pix; - rot_managers[i_rot]->increment_image(i_pix, first_deriv_imgs.Umat[idx], second_deriv_imgs.Umat[idx], compute_curvatures); + for (int i_rot=0; i_rot<3; i_rot++){ + if (rot_managers[i_rot]->refine_me){ + int idx = i_rot*Npix_to_model + i_pix; + rot_managers[i_rot]->increment_image(i_pix, first_deriv_imgs.Umat[idx], second_deriv_imgs.Umat[idx], compute_curvatures); + } } - } - for (int i_uc=0; i_uc<6; i_uc++){ - if (ucell_managers[i_uc]->refine_me){ - int idx = i_uc*Npix_to_model + i_pix; - ucell_managers[i_uc]->increment_image(i_pix, first_deriv_imgs.Bmat[idx], second_deriv_imgs.Bmat[idx], compute_curvatures); + for (int i_uc=0; i_uc<6; i_uc++){ + if (ucell_managers[i_uc]->refine_me){ + int idx = i_uc*Npix_to_model + i_pix; + ucell_managers[i_uc]->increment_image(i_pix, first_deriv_imgs.Bmat[idx], second_deriv_imgs.Bmat[idx], compute_curvatures); + } } - } - if (Ncells_managers[0]->refine_me){ - Ncells_managers[0]->increment_image(i_pix, first_deriv_imgs.Ncells[i_pix], second_deriv_imgs.Ncells[i_pix], compute_curvatures); - if (! isotropic_ncells){ - int idx= Npix_to_model+i_pix; - Ncells_managers[1]->increment_image(i_pix, first_deriv_imgs.Ncells[idx], second_deriv_imgs.Ncells[idx], compute_curvatures); - idx = 2*Npix_to_model + i_pix; - Ncells_managers[2]->increment_image(i_pix, first_deriv_imgs.Ncells[idx], second_deriv_imgs.Ncells[idx], compute_curvatures); + if (Ncells_managers[0]->refine_me){ + Ncells_managers[0]->increment_image(i_pix, first_deriv_imgs.Ncells[i_pix], second_deriv_imgs.Ncells[i_pix], compute_curvatures); + if (! isotropic_ncells){ + int idx= Npix_to_model+i_pix; + Ncells_managers[1]->increment_image(i_pix, first_deriv_imgs.Ncells[idx], second_deriv_imgs.Ncells[idx], compute_curvatures); + idx = 2*Npix_to_model + i_pix; + Ncells_managers[2]->increment_image(i_pix, first_deriv_imgs.Ncells[idx], second_deriv_imgs.Ncells[idx], compute_curvatures); + } } - } - if (refine_Ncells_def){ - for (int i_nc =3; i_nc < 6; i_nc++){ - int idx= i_nc*Npix_to_model+i_pix; - Ncells_managers[i_nc]->increment_image(i_pix, first_deriv_imgs.Ncells[idx], second_deriv_imgs.Ncells[idx], compute_curvatures); + if (refine_Ncells_def){ + for (int i_nc =3; i_nc < 6; i_nc++){ + int idx= i_nc*Npix_to_model+i_pix; + Ncells_managers[i_nc]->increment_image(i_pix, first_deriv_imgs.Ncells[idx], second_deriv_imgs.Ncells[idx], compute_curvatures); + } } - } - if (fcell_managers[0]->refine_me){ - int idx= i_pix; - fcell_managers[0]->increment_image(i_pix, first_deriv_imgs.fcell[idx], second_deriv_imgs.fcell[idx], compute_curvatures); - } + if (fcell_managers[0]->refine_me){ + int idx= i_pix; + fcell_managers[0]->increment_image(i_pix, first_deriv_imgs.fcell[idx], second_deriv_imgs.fcell[idx], compute_curvatures); + } - if (eta_managers[0]->refine_me){ - eta_managers[0]->increment_image(i_pix, first_deriv_imgs.eta[i_pix], second_deriv_imgs.eta[i_pix], compute_curvatures); - if (modeling_anisotropic_mosaic_spread){ - if (verbose && i_pix==0)printf("copying aniso eta derivatives\n"); - for(int i_eta=1; i_eta < 3; i_eta++){ - int idx = i_eta*Npix_to_model+i_pix; - eta_managers[i_eta]->increment_image(i_pix, first_deriv_imgs.eta[idx], second_deriv_imgs.eta[idx], compute_curvatures); + if (eta_managers[0]->refine_me){ + eta_managers[0]->increment_image(i_pix, first_deriv_imgs.eta[i_pix], second_deriv_imgs.eta[i_pix], compute_curvatures); + if (modeling_anisotropic_mosaic_spread){ + 
if (verbose && i_pix==0)printf("copying aniso eta derivatives\n"); + for(int i_eta=1; i_eta < 3; i_eta++){ + int idx = i_eta*Npix_to_model+i_pix; + eta_managers[i_eta]->increment_image(i_pix, first_deriv_imgs.eta[idx], second_deriv_imgs.eta[idx], compute_curvatures); + } } } - } - for(int i_lam=0; i_lam < 2; i_lam++){ - if (lambda_managers[i_lam]->refine_me){ - int idx= Npix_to_model*i_lam + i_pix; - lambda_managers[i_lam]->increment_image(i_pix, first_deriv_imgs.lambda[idx], second_deriv_imgs.lambda[idx], compute_curvatures); + for(int i_lam=0; i_lam < 2; i_lam++){ + if (lambda_managers[i_lam]->refine_me){ + int idx= Npix_to_model*i_lam + i_pix; + lambda_managers[i_lam]->increment_image(i_pix, first_deriv_imgs.lambda[idx], second_deriv_imgs.lambda[idx], compute_curvatures); + } } - } - for(int i_pan=0; i_pan <3; i_pan++){ - int i_rot = pan_rot_ids[i_pan]; - if (panels[i_rot]->refine_me){ - int idx = Npix_to_model*i_pan + i_pix; - panels[i_rot]->increment_image(i_pix, first_deriv_imgs.panel_rot[idx], second_deriv_imgs.panel_rot[idx], compute_curvatures); - } + for(int i_pan=0; i_pan <3; i_pan++){ + int i_rot = pan_rot_ids[i_pan]; + if (panels[i_rot]->refine_me){ + int idx = Npix_to_model*i_pan + i_pix; + panels[i_rot]->increment_image(i_pix, first_deriv_imgs.panel_rot[idx], second_deriv_imgs.panel_rot[idx], compute_curvatures); + } - int i_orig = pan_orig_ids[i_pan]; - if(panels[i_orig]->refine_me){ - int idx= Npix_to_model*i_pan + i_pix; - panels[i_orig]->increment_image(i_pix, first_deriv_imgs.panel_orig[idx], second_deriv_imgs.panel_orig[idx], compute_curvatures); + int i_orig = pan_orig_ids[i_pan]; + if(panels[i_orig]->refine_me){ + int idx= Npix_to_model*i_pan + i_pix; + panels[i_orig]->increment_image(i_pix, first_deriv_imgs.panel_orig[idx], second_deriv_imgs.panel_orig[idx], compute_curvatures); + } } - } - if (fp_fdp_managers[0]->refine_me) - fp_fdp_managers[0]->increment_image(i_pix, first_deriv_imgs.fp_fdp[i_pix], 0, compute_curvatures); - if (fp_fdp_managers[1]->refine_me) - fp_fdp_managers[1]->increment_image(i_pix, first_deriv_imgs.fp_fdp[i_pix+Npix_to_model], 0, compute_curvatures); + if (fp_fdp_managers[0]->refine_me) + fp_fdp_managers[0]->increment_image(i_pix, first_deriv_imgs.fp_fdp[i_pix], 0, compute_curvatures); + if (fp_fdp_managers[1]->refine_me) + fp_fdp_managers[1]->increment_image(i_pix, first_deriv_imgs.fp_fdp[i_pix+Npix_to_model], 0, compute_curvatures); - } // END of flex array update + } // END of flex array update + } delete[] db_steps.subS_pos; delete[] db_steps.subF_pos; @@ -2257,7 +2389,6 @@ void diffBragg::add_diffBragg_spots(const af::shared& panels_fasts_slows TIMERS.timings += 1; // only increment timings at the end of the add_diffBragg_spots call } - if(verbose) printf("done with pixel loop\n"); } // END of add_diffBragg_spots diff --git a/simtbx/diffBragg/src/diffBragg.h b/simtbx/diffBragg/src/diffBragg.h index c26d0d019e..501f83b58b 100644 --- a/simtbx/diffBragg/src/diffBragg.h +++ b/simtbx/diffBragg/src/diffBragg.h @@ -170,7 +170,6 @@ class diffBragg: public nanoBragg{ inline void kokkos_free() { diffBragg_runner.reset(); } // allocate when needed to avoid problems with kokkos initialization when cuda/kokkos isn't used std::shared_ptr diffBragg_runner{}; - // diffBraggKOKKOS diffBragg_runner; #endif inline void gpu_free(){ @@ -238,6 +237,32 @@ class diffBragg: public nanoBragg{ af::flex_double get_raw_pixels_roi(); boost::python::tuple get_fp_fdp_derivative_pixels(); boost::python::tuple get_ncells_derivative_pixels(); + + PyObject* 
PyCapsule_Wrapper(DLManagedTensor* (diffBraggKOKKOS::*func)(), image_type &vec); + PyObject* get_floatimage(); + PyObject* get_wavelenimage(); + PyObject* get_d_diffuse_gamma_images(); + PyObject* get_d_diffuse_sigma_images(); + PyObject* get_d_Umat_images(); + PyObject* get_d2_Umat_images(); + PyObject* get_d_Bmat_images(); + PyObject* get_d2_Bmat_images(); + PyObject* get_d_Ncells_images(); + PyObject* get_d2_Ncells_images(); + PyObject* get_d_fcell_images(); + PyObject* get_d2_fcell_images(); + PyObject* get_d_eta_images(); + PyObject* get_d2_eta_images(); + PyObject* get_d_lambda_images(); + PyObject* get_d2_lambda_images(); + PyObject* get_d_panel_rot_images(); + PyObject* get_d2_panel_rot_images(); + PyObject* get_d_panel_orig_images(); + PyObject* get_d2_panel_orig_images(); + PyObject* get_d_fp_fdp_images(); + PyObject* get_Fhkl_scale_deriv(); + PyObject* get_Fhkl_hessian(); + boost::python::tuple get_diffuse_gamma_derivative_pixels(); boost::python::tuple get_diffuse_sigma_derivative_pixels(); boost::python::tuple get_ncells_def_derivative_pixels(); @@ -287,6 +312,7 @@ class diffBragg: public nanoBragg{ bool update_oversample_during_refinement; bool oversample_omega; bool only_save_omega_kahn; + bool host_transfer; // miller array void quick_Fcell_update(boost::python::tuple const& value); diff --git a/simtbx/diffBragg/src/diffBraggKOKKOS.cpp b/simtbx/diffBragg/src/diffBraggKOKKOS.cpp index 273088f765..4656835311 100644 --- a/simtbx/diffBragg/src/diffBraggKOKKOS.cpp +++ b/simtbx/diffBragg/src/diffBraggKOKKOS.cpp @@ -581,56 +581,58 @@ void diffBraggKOKKOS::diffBragg_sum_over_steps_kokkos( gettimeofday(&t1, 0); // COPY BACK FROM DEVICE Kokkos::Tools::pushRegion("COPY BACK FROM DEVICE"); - kokkostbx::transfer_kokkos2vector(floatimage, m_floatimage); + if (db_flags.host_transfer) { + kokkostbx::transfer_kokkos2vector(floatimage, m_floatimage); - if (db_flags.wavelength_img) { - kokkostbx::transfer_kokkos2vector(d_image.wavelength, m_wavelenimage); - } - if (db_flags.refine_fcell) { - kokkostbx::transfer_kokkos2vector(d_image.fcell, m_d_fcell_images); - kokkostbx::transfer_kokkos2vector(d2_image.fcell, m_d2_fcell_images); - } - if (db_flags.Fhkl_gradient_mode){ - if (db_flags.Fhkl_errors_mode){ - kokkostbx::transfer_kokkos2vector(d_image.Fhkl_hessian, m_Fhkl_scale_deriv); + if (db_flags.wavelength_img) { + kokkostbx::transfer_kokkos2vector(d_image.wavelength, m_wavelenimage); } - else{ - kokkostbx::transfer_kokkos2vector(d_image.Fhkl_scale_deriv, m_Fhkl_scale_deriv); + if (db_flags.refine_fcell) { + kokkostbx::transfer_kokkos2vector(d_image.fcell, m_d_fcell_images); + kokkostbx::transfer_kokkos2vector(d2_image.fcell, m_d2_fcell_images); + } + if (db_flags.Fhkl_gradient_mode){ + if (db_flags.Fhkl_errors_mode){ + kokkostbx::transfer_kokkos2vector(d_image.Fhkl_hessian, m_Fhkl_scale_deriv); + } + else{ + kokkostbx::transfer_kokkos2vector(d_image.Fhkl_scale_deriv, m_Fhkl_scale_deriv); + } + } + if (std::count(db_flags.refine_Umat.begin(), db_flags.refine_Umat.end(), true) > 0) { + kokkostbx::transfer_kokkos2vector(d_image.Umat, m_d_Umat_images); + kokkostbx::transfer_kokkos2vector(d2_image.Umat, m_d2_Umat_images); + } + if (std::count(db_flags.refine_panel_rot.begin(), db_flags.refine_panel_rot.end(), true) > 0) { + kokkostbx::transfer_kokkos2vector(d_image.panel_rot, m_d_panel_rot_images); + } + if (std::count(db_flags.refine_panel_origin.begin(), db_flags.refine_panel_origin.end(), true) > + 0) { + kokkostbx::transfer_kokkos2vector(d_image.panel_orig, m_d_panel_orig_images); + } + if 
(db_flags.refine_eta) { + kokkostbx::transfer_kokkos2vector(d_image.eta, m_d_eta_images); + kokkostbx::transfer_kokkos2vector(d2_image.eta, m_d2_eta_images); + } + if (std::count(db_flags.refine_Ncells.begin(), db_flags.refine_Ncells.end(), true) > 0 || + db_flags.refine_Ncells_def) { + kokkostbx::transfer_kokkos2vector(d_image.Ncells, m_d_Ncells_images); + kokkostbx::transfer_kokkos2vector(d2_image.Ncells, m_d2_Ncells_images); + } + if (db_flags.refine_diffuse) { + kokkostbx::transfer_kokkos2vector(d_image.diffuse_gamma, m_d_diffuse_gamma_images); + kokkostbx::transfer_kokkos2vector(d_image.diffuse_sigma, m_d_diffuse_sigma_images); + } + if (std::count(db_flags.refine_Bmat.begin(), db_flags.refine_Bmat.end(), true) > 0) { + kokkostbx::transfer_kokkos2vector(d_image.Bmat, m_d_Bmat_images); + kokkostbx::transfer_kokkos2vector(d2_image.Bmat, m_d2_Bmat_images); + } + if (std::count(db_flags.refine_lambda.begin(), db_flags.refine_lambda.end(), true) > 0) { + kokkostbx::transfer_kokkos2vector(d_image.lambda, m_d_lambda_images); + } + if (db_flags.refine_fp_fdp) { + kokkostbx::transfer_kokkos2vector(d_image.fp_fdp, m_d_fp_fdp_images); } - } - if (std::count(db_flags.refine_Umat.begin(), db_flags.refine_Umat.end(), true) > 0) { - kokkostbx::transfer_kokkos2vector(d_image.Umat, m_d_Umat_images); - kokkostbx::transfer_kokkos2vector(d2_image.Umat, m_d2_Umat_images); - } - if (std::count(db_flags.refine_panel_rot.begin(), db_flags.refine_panel_rot.end(), true) > 0) { - kokkostbx::transfer_kokkos2vector(d_image.panel_rot, m_d_panel_rot_images); - } - if (std::count(db_flags.refine_panel_origin.begin(), db_flags.refine_panel_origin.end(), true) > - 0) { - kokkostbx::transfer_kokkos2vector(d_image.panel_orig, m_d_panel_orig_images); - } - if (db_flags.refine_eta) { - kokkostbx::transfer_kokkos2vector(d_image.eta, m_d_eta_images); - kokkostbx::transfer_kokkos2vector(d2_image.eta, m_d2_eta_images); - } - if (std::count(db_flags.refine_Ncells.begin(), db_flags.refine_Ncells.end(), true) > 0 || - db_flags.refine_Ncells_def) { - kokkostbx::transfer_kokkos2vector(d_image.Ncells, m_d_Ncells_images); - kokkostbx::transfer_kokkos2vector(d2_image.Ncells, m_d2_Ncells_images); - } - if (db_flags.refine_diffuse) { - kokkostbx::transfer_kokkos2vector(d_image.diffuse_gamma, m_d_diffuse_gamma_images); - kokkostbx::transfer_kokkos2vector(d_image.diffuse_sigma, m_d_diffuse_sigma_images); - } - if (std::count(db_flags.refine_Bmat.begin(), db_flags.refine_Bmat.end(), true) > 0) { - kokkostbx::transfer_kokkos2vector(d_image.Bmat, m_d_Bmat_images); - kokkostbx::transfer_kokkos2vector(d2_image.Bmat, m_d2_Bmat_images); - } - if (std::count(db_flags.refine_lambda.begin(), db_flags.refine_lambda.end(), true) > 0) { - kokkostbx::transfer_kokkos2vector(d_image.lambda, m_d_lambda_images); - } - if (db_flags.refine_fp_fdp) { - kokkostbx::transfer_kokkos2vector(d_image.fp_fdp, m_d_fp_fdp_images); } Kokkos::Tools::popRegion(); @@ -639,3 +641,91 @@ void diffBraggKOKKOS::diffBragg_sum_over_steps_kokkos( Kokkos::Tools::popRegion(); } + +DLManagedTensor* diffBraggKOKKOS::get_floatimage() { + return kokkostbx::view_to_dlpack(m_floatimage); +} + +DLManagedTensor* diffBraggKOKKOS::get_wavelenimage() { + return kokkostbx::view_to_dlpack(m_wavelenimage); +} + +DLManagedTensor* diffBraggKOKKOS::get_d_diffuse_gamma_images() { + return kokkostbx::view_to_dlpack(m_d_diffuse_gamma_images); +} + +DLManagedTensor* diffBraggKOKKOS::get_d_diffuse_sigma_images() { + return kokkostbx::view_to_dlpack(m_d_diffuse_sigma_images); +} + +DLManagedTensor* 
diffBraggKOKKOS::get_d_Umat_images() { + return kokkostbx::view_to_dlpack(m_d_Umat_images); +} + +DLManagedTensor* diffBraggKOKKOS::get_d2_Umat_images() { + return kokkostbx::view_to_dlpack(m_d2_Umat_images); +} + +DLManagedTensor* diffBraggKOKKOS::get_d_Bmat_images() { + return kokkostbx::view_to_dlpack(m_d_Bmat_images); +} + +DLManagedTensor* diffBraggKOKKOS::get_d2_Bmat_images() { + return kokkostbx::view_to_dlpack(m_d2_Bmat_images); +} + +DLManagedTensor* diffBraggKOKKOS::get_d_Ncells_images() { + return kokkostbx::view_to_dlpack(m_d_Ncells_images); +} + +DLManagedTensor* diffBraggKOKKOS::get_d2_Ncells_images() { + return kokkostbx::view_to_dlpack(m_d2_Ncells_images); +} + +DLManagedTensor* diffBraggKOKKOS::get_d_fcell_images() { + return kokkostbx::view_to_dlpack(m_d_fcell_images); +} + +DLManagedTensor* diffBraggKOKKOS::get_d2_fcell_images() { + return kokkostbx::view_to_dlpack(m_d2_fcell_images); +} + +DLManagedTensor* diffBraggKOKKOS::get_d_eta_images() { + return kokkostbx::view_to_dlpack(m_d_eta_images); +} + +DLManagedTensor* diffBraggKOKKOS::get_d2_eta_images() { + return kokkostbx::view_to_dlpack(m_d2_eta_images); +} + +DLManagedTensor* diffBraggKOKKOS::get_d_lambda_images() { + return kokkostbx::view_to_dlpack(m_d_lambda_images); +} + +DLManagedTensor* diffBraggKOKKOS::get_d2_lambda_images() { + return kokkostbx::view_to_dlpack(m_d2_lambda_images); +} + +DLManagedTensor* diffBraggKOKKOS::get_d_panel_rot_images() { + return kokkostbx::view_to_dlpack(m_d_panel_rot_images); +} + +DLManagedTensor* diffBraggKOKKOS::get_d2_panel_rot_images() { + return kokkostbx::view_to_dlpack(m_d2_panel_rot_images); +} + +DLManagedTensor* diffBraggKOKKOS::get_d_panel_orig_images() { + return kokkostbx::view_to_dlpack(m_d_panel_orig_images); +} + +DLManagedTensor* diffBraggKOKKOS::get_d2_panel_orig_images() { + return kokkostbx::view_to_dlpack(m_d2_panel_orig_images); +} + +DLManagedTensor* diffBraggKOKKOS::get_d_fp_fdp_images() { + return kokkostbx::view_to_dlpack(m_d_fp_fdp_images); +} + +DLManagedTensor* diffBraggKOKKOS::get_Fhkl_scale_deriv() { + return kokkostbx::view_to_dlpack(m_Fhkl_scale_deriv); +} diff --git a/simtbx/diffBragg/src/diffBraggKOKKOS.h b/simtbx/diffBragg/src/diffBraggKOKKOS.h index 3aa07ae27b..78ed67a505 100644 --- a/simtbx/diffBragg/src/diffBraggKOKKOS.h +++ b/simtbx/diffBragg/src/diffBraggKOKKOS.h @@ -7,6 +7,7 @@ #include "kokkostbx/kokkos_types.h" #include "kokkostbx/kokkos_utils.h" +#include "kokkostbx/kokkos_dlpack.h" #include "simtbx/diffBragg/src/util.h" #include "simtbx/diffBragg/src/util_kokkos.h" #include "simtbx/diffBragg/src/diffBragg_refine_flag.h" @@ -147,6 +148,29 @@ class diffBraggKOKKOS { cuda_flags& db_cu_flags, // diffBragg_kokkosPointers& kp, timer_variables& TIMERS); + + DLManagedTensor* get_floatimage(); + DLManagedTensor* get_wavelenimage(); + DLManagedTensor* get_d_diffuse_gamma_images(); + DLManagedTensor* get_d_diffuse_sigma_images(); + DLManagedTensor* get_d_Umat_images(); + DLManagedTensor* get_d2_Umat_images(); + DLManagedTensor* get_d_Bmat_images(); + DLManagedTensor* get_d2_Bmat_images(); + DLManagedTensor* get_d_Ncells_images(); + DLManagedTensor* get_d2_Ncells_images(); + DLManagedTensor* get_d_fcell_images(); + DLManagedTensor* get_d2_fcell_images(); + DLManagedTensor* get_d_eta_images(); + DLManagedTensor* get_d2_eta_images(); + DLManagedTensor* get_d_lambda_images(); + DLManagedTensor* get_d2_lambda_images(); + DLManagedTensor* get_d_panel_rot_images(); + DLManagedTensor* get_d2_panel_rot_images(); + DLManagedTensor* 
get_d_panel_orig_images(); + DLManagedTensor* get_d2_panel_orig_images(); + DLManagedTensor* get_d_fp_fdp_images(); + DLManagedTensor* get_Fhkl_scale_deriv(); }; #endif diff --git a/simtbx/diffBragg/src/diffBragg_ext.cpp b/simtbx/diffBragg/src/diffBragg_ext.cpp index 47d500ca77..feafb15c49 100644 --- a/simtbx/diffBragg/src/diffBragg_ext.cpp +++ b/simtbx/diffBragg/src/diffBragg_ext.cpp @@ -4,6 +4,7 @@ #include #include #include +#include using namespace boost::python; namespace simtbx{ @@ -503,6 +504,19 @@ namespace boost_python { namespace { return boost::python::make_tuple(diffBragg.pythony_indices,diffBragg.pythony_amplitudes); } +std::string kokkos_device() { + std::string backend = "cpu:0"; +#ifdef DIFFBRAGG_HAVE_KOKKOS + if (Kokkos::is_finalized() || !Kokkos::is_initialized()) { + return backend; + } +#if defined(KOKKOS_ENABLE_CUDA) || defined(KOKKOS_ENABLE_HIP) + backend = "cuda:" + std::to_string( Kokkos::device_id() ); +#endif +#endif + return backend; +} + #ifdef DIFFBRAGG_HAVE_KOKKOS void finalize_kokkos(){ Kokkos::finalize(); @@ -512,6 +526,147 @@ namespace boost_python { namespace { Kokkos::initialize(Kokkos::InitializationSettings() .set_device_id(dev)); } + + void PrintDLTensorParameters( PyObject* capsule ) { + auto tensor = static_cast<DLTensor*>(PyCapsule_GetPointer(capsule, "dltensor")); + if (tensor == nullptr) { + std::cerr << "The input DLTensor is null." << std::endl; + return; + } + + // Print the number of dimensions. + std::cout << "Number of dimensions (ndim): " << tensor->ndim << std::endl; + + // Print the shape of the tensor. + std::cout << "Shape: ["; + for (int i = 0; i < tensor->ndim; ++i) { + std::cout << tensor->shape[i]; + if (i < tensor->ndim - 1) { + std::cout << ", "; + } + } + std::cout << "]" << std::endl; + + // Print the data type of the tensor. + std::cout << "Data type (dtype): "; + switch (tensor->dtype.code) { + case kDLInt: std::cout << "Int"; break; + case kDLUInt: std::cout << "UInt"; break; + case kDLFloat: std::cout << "Float"; break; + case kDLBfloat: std::cout << "BFloat"; break; + default: std::cout << "Unknown"; break; + } + std::cout << " (" << int(tensor->dtype.bits) << " bits, " << tensor->dtype.lanes << " lanes)" << std::endl; + + // Print the device context (device type and device id). + std::cout << "Device context: "; + switch (tensor->device.device_type) { + case kDLCPU: std::cout << "CPU"; break; + case kDLCUDA: std::cout << "CUDA"; break; + case kDLCUDAHost: std::cout << "CUDA Host"; break; + case kDLOpenCL: std::cout << "OpenCL"; break; + case kDLVulkan: std::cout << "Vulkan"; break; + case kDLMetal: std::cout << "Metal"; break; + case kDLVPI: std::cout << "VPI"; break; + case kDLROCM: std::cout << "ROCM"; break; + default: std::cout << "Unknown"; break; + } + std::cout << ", Device ID: " << tensor->device.device_id << std::endl; + + // Print the strides of the tensor, if available. + if (tensor->strides != nullptr) { + std::cout << "Strides: ["; + for (int i = 0; i < tensor->ndim; ++i) { + std::cout << tensor->strides[i]; + if (i < tensor->ndim - 1) { + std::cout << ", "; + } + } + std::cout << "]" << std::endl; + } else { + std::cout << "Strides: [Contiguous in memory]" << std::endl; + } + + // Print the byte offset of the tensor data. + std::cout << "Byte Offset: " << tensor->byte_offset << std::endl; +}
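// Editor's sketch (illustrative only, not part of this patch): PrintDLTensorParameters
// above merely inspects a capsule. A consumer that takes ownership is expected to rename
// the capsule to "used_dltensor" and call the managed tensor's deleter itself, which is
// why dlpack_destructor below skips capsules no longer named "dltensor". The helper name
// is hypothetical; only standard Python C-API calls and the DLPack structs are used.
void consume_dlpack_capsule_sketch(PyObject* capsule) {
    if (!PyCapsule_IsValid(capsule, "dltensor")) return;          // already consumed elsewhere
    auto* managed = static_cast<DLManagedTensor*>(PyCapsule_GetPointer(capsule, "dltensor"));
    PyCapsule_SetName(capsule, "used_dltensor");                  // mark the capsule as consumed
    // ... read managed->dl_tensor.data / shape / dtype here ...
    if (managed->deleter) managed->deleter(managed);              // release borrowed resources
}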
+ std::cout << "Byte Offset: " << tensor->byte_offset << std::endl; +} + +void dlpack_destructor(PyObject* capsule) { + if (!PyCapsule_IsValid(capsule, "dltensor")) { + std::cout << "Muh0 " << PyCapsule_GetPointer(capsule, "used_dltensor") << std::endl; + return; + } + + // If the capsule has not been used, we need to delete it + std::cout << "Muh1" << std::endl; + DLManagedTensor* dlpackTensor = static_cast(PyCapsule_GetPointer(capsule, "dltensor")); + std::cout << "Muh2" << std::endl; + dlpackTensor->deleter(dlpackTensor); + std::cout << "Muh3" << std::endl; + delete dlpackTensor; + std::cout << "Muh4" << std::endl; +} + +struct DLPackAPI { + + double container[50]; + + PyObject* dlpack() { + // Get the Kokkos view size and dimensions + size_t numDims = 1; + int64_t* shape = new int64_t[numDims]; + for (size_t i = 0; i < numDims; i++) { + shape[i] = 50; + } + + // Create a DLPack tensor + DLManagedTensor* dlpackTensor = new DLManagedTensor; + dlpackTensor->dl_tensor.data = static_cast(&container); + dlpackTensor->dl_tensor.device.device_type = DLDeviceType::kDLCPU; + dlpackTensor->dl_tensor.device.device_id = 0; + dlpackTensor->dl_tensor.ndim = numDims; + dlpackTensor->dl_tensor.dtype = getDLPackDataType(); + dlpackTensor->dl_tensor.shape = shape; + dlpackTensor->dl_tensor.strides = nullptr; + dlpackTensor->dl_tensor.byte_offset = 0; + dlpackTensor->manager_ctx = nullptr; + dlpackTensor->deleter = [](DLManagedTensor* tensor) { + std::cout << "Blob" << std::endl; + delete[] tensor->dl_tensor.shape; + }; + + // Create a PyCapsule with the DLPack tensor + PyObject* capsule = PyCapsule_New(dlpackTensor, "dltensor", dlpack_destructor); + + return capsule; + } + + DLDataType getDLPackDataType() { + DLDataType dtype; + dtype.code = kDLFloat; + dtype.bits = sizeof(double) * 8; + dtype.lanes = 1; + return dtype; + } + + void print_hello() { + std::cout << "Hello Python!" << std::endl; + } + + void print_container() { + std::cout << "C = [ "; + for (int i=0; i<50; i++) { + std::cout << container[i] << " "; + } + std::cout << "]" << std::endl; + } + + boost::python::tuple dlpack_device() { + return boost::python::make_tuple(static_cast(DLDeviceType::kDLCPU), 0); + } + + }; + #endif void diffBragg_init_module() { @@ -529,8 +684,23 @@ namespace boost_python { namespace { def("initialize_kokkos", initialize_kokkos, "the sole argument `dev` (an int from 0 to Ngpu-1) is passed to Kokkos::initialize()"); + + def("kokkos_device", kokkos_device, "returns kokkos device for use in PyTorch"); + + def("print_dlpack",PrintDLTensorParameters,"Print information about a dlpack"); + + // def("get_d_Ncells_images", &get_dlpack, "Return DLPackTensor for d_Ncells_images; pot. 
on GPU") + + class_("KokkosViewToDLPack", no_init) + .def(init("DLPack init")) + .def("__dlpack__", &DLPackAPI::dlpack, "Part of DLPack API") + .def("__dlpack_device__", &DLPackAPI::dlpack_device, "Part of DLPack API") + .def("hello", &DLPackAPI::print_hello, "Dummy test function") + .def("print", &DLPackAPI::print_container, "Print container") + ; #endif + class_ > ("diffBragg", no_init) /* constructor that takes a dxtbx detector and beam model */ @@ -579,7 +749,6 @@ namespace boost_python { namespace { .def("get_ncells_values", &simtbx::nanoBragg::diffBragg::get_ncells_values, "get Ncells values as a 3-tuple (Na, Nb, Nc)") - .def("add_diffBragg_spots_full", &simtbx::nanoBragg::diffBragg::add_diffBragg_spots_full, "forward model and gradients at every pixel") .def("get_ncells_derivative_pixels", &simtbx::nanoBragg::diffBragg::get_ncells_derivative_pixels, "get derivatives of intensity w.r.t (Na, Nb, Nc)") @@ -707,6 +876,11 @@ namespace boost_python { namespace { make_setter(&simtbx::nanoBragg::diffBragg::oversample_omega,dcp()), "whether to use an average solid angle correction per pixel, or one at the sub pixel level") + .add_property("host_transfer", + make_getter(&simtbx::nanoBragg::diffBragg::host_transfer,rbv()), + make_setter(&simtbx::nanoBragg::diffBragg::host_transfer,dcp()), + "whether to transfer results from device to host") + .add_property("force_cpu", make_getter(&simtbx::nanoBragg::diffBragg::force_cpu,rbv()), make_setter(&simtbx::nanoBragg::diffBragg::force_cpu,dcp()), @@ -938,6 +1112,52 @@ namespace boost_python { namespace { make_function(&set_beams,dcp()), "list of dxtbx::Beam objects corresponding to each zero-divergence and monochromatic x-ray point source in the numerical simulation ") +#ifdef DIFFBRAGG_HAVE_KOKKOS + .def("get_floatimage", &simtbx::nanoBragg::diffBragg::get_floatimage, "get DLPackTensor for floatimage; pot. on GPU") + + .def("get_wavelenimage", &simtbx::nanoBragg::diffBragg::get_wavelenimage, "get DLPackTensor for wavelenimage; pot. on GPU") + + .def("get_d_diffuse_gamma_images", &simtbx::nanoBragg::diffBragg::get_d_diffuse_gamma_images, "get DLPackTensor for d_diffuse_gamma_images; pot. on GPU") + + .def("get_d_diffuse_sigma_images", &simtbx::nanoBragg::diffBragg::get_d_diffuse_sigma_images, "get DLPackTensor for d_diffuse_sigma_images; pot. on GPU") + + .def("get_d_Umat_images", &simtbx::nanoBragg::diffBragg::get_d_Umat_images, "get DLPackTensor for d_Umat_images; pot. on GPU") + + .def("get_d2_Umat_images", &simtbx::nanoBragg::diffBragg::get_d2_Umat_images, "get DLPackTensor for d2_Umat_images; pot. on GPU") + + .def("get_d_Bmat_images", &simtbx::nanoBragg::diffBragg::get_d_Bmat_images, "get DLPackTensor for d_Bmat_images; pot. on GPU") + + .def("get_d2_Bmat_images", &simtbx::nanoBragg::diffBragg::get_d2_Bmat_images, "get DLPackTensor for d2_Bmat_images; pot. on GPU") + + .def("get_d_Ncells_images", &simtbx::nanoBragg::diffBragg::get_d_Ncells_images, "get DLPackTensor for d_Ncells_images; pot. on GPU") + + .def("get_d2_Ncells_images", &simtbx::nanoBragg::diffBragg::get_d2_Ncells_images, "get DLPackTensor for d2_Ncells_images; pot. on GPU") + + .def("get_d_fcell_images", &simtbx::nanoBragg::diffBragg::get_d_fcell_images, "get DLPackTensor for d_fcell_images; pot. on GPU") + + .def("get_d2_fcell_images", &simtbx::nanoBragg::diffBragg::get_d2_fcell_images, "get DLPackTensor for d2_fcell_images; pot. on GPU") + + .def("get_d_eta_images", &simtbx::nanoBragg::diffBragg::get_d_eta_images, "get DLPackTensor for d_eta_images; pot. 
on GPU") + + .def("get_d2_eta_images", &simtbx::nanoBragg::diffBragg::get_d2_eta_images, "get DLPackTensor for d2_eta_images; pot. on GPU") + + .def("get_d_lambda_images", &simtbx::nanoBragg::diffBragg::get_d_lambda_images, "get DLPackTensor for d_lambda_images; pot. on GPU") + + .def("get_d2_lambda_images", &simtbx::nanoBragg::diffBragg::get_d2_lambda_images, "get DLPackTensor for d2_lambda_images; pot. on GPU") + + .def("get_d_panel_rot_images", &simtbx::nanoBragg::diffBragg::get_d_panel_rot_images, "get DLPackTensor for d_panel_rot_images; pot. on GPU") + + .def("get_d2_panel_rot_images", &simtbx::nanoBragg::diffBragg::get_d2_panel_rot_images, "get DLPackTensor for d2_panel_rot_images; pot. on GPU") + + .def("get_d_panel_orig_images", &simtbx::nanoBragg::diffBragg::get_d_panel_orig_images, "get DLPackTensor for d_panel_orig_images; pot. on GPU") + + .def("get_d2_panel_orig_images", &simtbx::nanoBragg::diffBragg::get_d2_panel_orig_images, "get DLPackTensor for d2_panel_orig_images; pot. on GPU") + + .def("get_d_fp_fdp_images", &simtbx::nanoBragg::diffBragg::get_d_fp_fdp_images, "get DLPackTensor for d_fp_fdp_images; pot. on GPU") + + .def("get_Fhkl_scale_deriv", &simtbx::nanoBragg::diffBragg::get_Fhkl_scale_deriv, "get DLPackTensor for Fhkl_scale_deriv; pot. on GPU") +#endif + ; // end of diffBragg extention } // end of diffBragg_init_module diff --git a/simtbx/diffBragg/src/util.h b/simtbx/diffBragg/src/util.h index ec197c1dc9..5de97ce4fd 100644 --- a/simtbx/diffBragg/src/util.h +++ b/simtbx/diffBragg/src/util.h @@ -10,6 +10,8 @@ #include #include +#include "dlpack/dlpack.h" + #ifndef CUDAREAL #define CUDAREAL double #endif @@ -29,6 +31,48 @@ inline void easy_time(double& timer, struct timeval& t, bool recording){ timer += time; } +template +DLDataTypeCode getDLPackTypeCode() { + if (std::is_same::value) { + return kDLFloat; + } else if (std::is_same::value) { + return kDLFloat; + } else if (std::is_same::value) { + return kDLInt; + } else if (std::is_same::value) { + return kDLUInt; + // } else if (std::is_same::value) { + // return kDLBool; + } else { + // Unsupported data type + throw std::runtime_error("Unsupported data type for DLPack conversion"); + } +} + +template +DLManagedTensor* array_to_dlpack(DataType* pointer, int64_t length) { + + int64_t* shape = new int64_t[1]; + shape[0] = length; + + // Create a DLPack tensor + DLManagedTensor* dlpackTensor = new DLManagedTensor; + dlpackTensor->dl_tensor.data = static_cast(pointer); + dlpackTensor->dl_tensor.device = {kDLCPU, 0}; + dlpackTensor->dl_tensor.dtype.code = getDLPackTypeCode(); + dlpackTensor->dl_tensor.dtype.bits = sizeof(DataType) * 8; + dlpackTensor->dl_tensor.dtype.lanes = 1; + dlpackTensor->dl_tensor.ndim = 1; + dlpackTensor->dl_tensor.shape = shape; + dlpackTensor->dl_tensor.strides = nullptr; + dlpackTensor->dl_tensor.byte_offset = 0; + dlpackTensor->manager_ctx = nullptr; + dlpackTensor->deleter = [](DLManagedTensor* tensor) { + delete[] tensor->dl_tensor.shape; + }; + return dlpackTensor; +} + struct timer_variables{ double add_spots_pre=0; // times the initializations for add spots kernel double add_spots_post=0; // times the copies that occur after add spots kernel @@ -124,6 +168,7 @@ struct flags{ bool isotropic_ncells = false; // one mosaic domain parameter bool complex_miller = false; // is the miller array complex (such thet Fhkl_linear and Fhkl2_linear are both defined) bool no_Nabc_scale = false; // no Nabc prefactor + bool host_transfer = true; // transfer data after add_diffbragg_spots bool 
+ struct timer_variables{ double add_spots_pre=0; // times the initializations for add spots kernel double add_spots_post=0; // times the copies that occur after add spots kernel @@ -124,6 +168,7 @@ struct flags{ bool isotropic_ncells = false; // one mosaic domain parameter bool complex_miller = false; // is the miller array complex (such that Fhkl_linear and Fhkl2_linear are both defined) bool no_Nabc_scale = false; // no Nabc prefactor + bool host_transfer = true; // transfer data after add_diffbragg_spots bool refine_diffuse = false; // flag for computing diffuse gradients std::vector<bool> refine_Bmat; // Bmatrix std::vector<bool> refine_Ncells; // mosaic domain size