// --> MC-GPU v1.3 was originally released in Google Code in 2012.
// This page contains the original 2012 code, with only a bug in the function "report_voxels_dose" corrected.
// An upgraded version of MC-GPU is being developed.
// ** CHANGE LIST ** See below: \subsection sec_changes_v13 Version 1.3 (list of changes in code version 1.3)
// -- Code upgraded from CUDA 4 to CUDA 5, after cutil_inline.h has been eliminated from the SDK:
// CUDA 5.0: Using "getLastCudaError" instead of "cutilCheckMsg"
// CUDA 5.0: Using "checkCudaErrors" instead of "cutilSafeCall" or "cutilCheckError"
// CUDA 5.0: Using "gpuGetMaxGflopsDeviceId instead of "cutGetMaxGflopsDeviceId"
// CUDA 5.0: Replaced all the CUDA timer functions (cutResetTimer, cutStartTimer, cutStopTimer, cutGetTimerValue) with standard C clock() calls.
////////////////////////////////////////////////////////////////////////////////////////
//
// ****************************
// *** MC-GPU , version 1.3 ***
// ****************************
//
/**
* \mainpage MC-GPU v1.3
*
* \code
*
* Andreu Badal, PhD (Andreu.Badal-Soler{at}fda.hhs.gov)
*
* Division of Imaging and Applied Mathematics
* Office of Science and Engineering Laboratories
* Center for Devices and Radiological Health
* U.S. Food and Drug Administration
*
* Code release date: 2012/12/12
*
*
*
* \endcode
*
*
*
* \b MC-GPU [1-4] is a Monte Carlo simulation code that can generate synthetic radiographic
* images and computed tomography (CT) scans of realistic models of the human anatomy using the
* computational power of commodity Graphics Processing Unit (GPU) cards.
* The code implements a massively multi-threaded Monte Carlo simulation algorithm
* for the transport of x rays in a voxelized geometry. The x ray interaction models and material
* properties have been adapted from \b PENELOPE \b 2006 [5].
*
*
* \b MC-GPU was developed using the \b CUDA programming model from \b NVIDIA [6] to achieve
* maximum performance on NVIDIA GPUs. The code can also be compiled with a standard C compiler
* to be executed in a regular CPU.
* In a typical medical imaging simulation, the use of GPU computing with MC-GPU has been shown
* to provide a speed-up of between 20 and 40 times compared to the execution on a single CPU core.
*
* The MC-GPU code has been described in different scientific publications [1-4].
* The main reference of this work, which the users should cite, is the following [1]:
* \code
* Andreu Badal and Aldo Badano, "Accelerating Monte Carlo simulations of
* photon transport in a voxelized geometry using a massively parallel
* Graphics Processing Unit", Medical Physics 36, pp. 4878–4880 (2009)
* \endcode
* The main developer of MC-GPU is \b Andreu \b Badal, working at the U.S. \b Food \b and
* \b Drug \b Administration (Center for Devices and Radiological Health, Office of Science
* and Engineering Laboratories, Division of Imaging and Applied Mathematics).
* The source code of MC-GPU is free and open software in the public domain, as explained
* in the Disclaimer section below.
* The source code of MC-GPU and its auxiliary files are distributed from the website: http://code.google.com/.
*
*
* This documentation has been automatically generated by \b Doxygen parsing the comments in
* the MC-GPU source code.
* This code is still in development, please report to the author any issue/bug
* that you may encounter. Feel free to suggest improvements to the code too!
*
*
*
* \section sec_changes List of modifications in different versions of the code
*
* \subsection sec_changes_v13 Version 1.3 (release date: 2012/12/12)
*
* - Code upgraded to CUDA 5.0 (not compatible with previous versions of CUDA!).
* - Removed limit on the amount of projection images that can be simulated per CT scan (source and
* detector parameters now stored in global memory and transferring to shared memory at run time
* to avoid using the limited constant memory).
* - New material dose tally implemented to estimate the dose deposited in each material independently
* of the voxel dose tally (the voxel dose tally measures the dose in each material adding the energy
* deposited in each voxel of that material within the defined voxelized region-of-interest).
* - Interaction loop re-organized to maximize performance (virtual interactions simulated before real ones).
* - Improvements and small corrections in the source sampling and tally routines.
* - Allow input of material and voxel geometry files compressed with gzip (zlib library now required for compilation).
*
*
*
* \subsection sec_changes_v12 Version 1.2 (release date: 2011/10/25)
*
* - Implemented a voxel dose tally.
* - Polyenergetic source model.
* - MPI support for simulating individual projections.
* - Simulation by time limit.
* - Improved flexibility of the CT trajectories, helical scans.
*
*
*
* \section sec_disc Disclaimer
*
* This software and documentation (the "Software") were developed at the Food and
* Drug Administration (FDA) by employees of the Federal Government in the course
* of their official duties. Pursuant to Title 17, Section 105 of the United States
* Code, this work is not subject to copyright protection and is in the public
* domain. Permission is hereby granted, free of charge, to any person obtaining a
* copy of the Software, to deal in the Software without restriction, including
* without limitation the rights to use, copy, modify, merge, publish, distribute,
* sublicense, or sell copies of the Software or derivatives, and to permit persons
* to whom the Software is furnished to do so. FDA assumes no responsibility
* whatsoever for use by other parties of the Software, its source code,
* documentation or compiled executables, and makes no guarantees, expressed or
* implied, about its quality, reliability, or any other characteristic. Further,
* use of this code in no way implies endorsement by the FDA or confers any
* advantage in regulatory decisions. Although this software can be redistributed
* and/or modified freely, we ask that any derivative works bear some notice that
* they are derived from it, and any modified versions bear some notice that they
* have been modified.
*
*
*
* \section sec_Intro Code features
*
* In this section we provide a brief description of the features of the MC-GPU code. A
* more complete description of the code can be found in our published articles.
* Important information regarding the operation of the code is provided as comments in the
* input files of the sample simulations provided with the MC-GPU package.
* Detailed information on each function of the code can be found in the complete Doxygen
* documentation of the source code.
*
* The basic operation of the code consists in adapting the simulation input file
* to describe the location and characteristics of the x ray source, define the CT trajectory
* (if any), list the materials to be used in the simulation, define the geometry of
* the x ray detector and, finally, specify the voxelized object file to be
* used as the simulation material universe.
* In the first line of the input file, the user can set the total number of x rays to be
* simulated (> 1e5 histories) or the total simulation time (maximum 1e5 seconds).
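*
* For illustration, a minimal sketch of how the code distinguishes the two modes (this mirrors
* the check made later in main(); the variable name "first_value_read_from_input" is only illustrative):
* \code
*    // Values below ~1e5 are treated as a time limit in seconds; larger values as x-ray histories.
*    unsigned long long int total_histories = first_value_read_from_input;   // hypothetical name for the parsed value
*    int simulating_by_time = (total_histories < 95000ULL) ? 1 : 0;          // same threshold used in main()
* \endcode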
*
*
* The coordinate system of the simulated world is determined by the input voxelized geometry.
* The origin of coordinates is assumed to be located at the lower-back corner of the voxelized
* volume, and the coordinate axes run along the edges of the voxelized volume.
* This means that the lower-back corner of the first voxel is on the origin and the
* following voxels are located along the positive X, Y and Z axes (first octant).
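*
* As a hedged illustration (the variable names below are not taken from this file), the voxel
* that contains a point (x,y,z), given in cm, can be located under this convention as:
* \code
*    // Illustrative sketch: voxel indices and linear index with X incrementing first,
*    // consistent with the output ordering described in the "Code output" section.
*    int ix = (int)(x * inv_voxel_size_x);
*    int iy = (int)(y * inv_voxel_size_y);
*    int iz = (int)(z * inv_voxel_size_z);
*    long long voxel = ix + (long long)num_voxels_x * (iy + (long long)num_voxels_y * iz);
* \endcode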
*
*
* To simulate the atomic interactions, MC-GPU uses a database of material properties based on the
* database from PENELOPE. A PENELOPE 2006 material file can be converted into an MC-GPU material
* file using the auxiliary utility "MC-GPU_create_material_data.f" provided with the MC-GPU
* package. Pre-defined material files for a set of materials typically used in medical imaging
* simulations are already provided in the folder "MC-GPU_material_files".
*
*
* The code includes two tally options: an \b image \b tally that creates projection x-ray images,
* and a radiation \b dose \b tally that estimates the dose deposited inside the patient model.
* MC-GPU does not currently simulate the transport of electrons and therefore the dose
* deposition tally (rigorously, a KERMA tally) will not be accurate at high energies or near
* material interfaces and small voxels.
* In the image tally the images are formed by counting the energy that enters a user-defined 2D
* grid of pixels, which is a simple approximation to a noise-free flat-panel detector with
* 100% detection efficiency. The pixel values have units of eV/cm^2.
* Four different images are reported at the end of the simulation, corresponding
* to the signal produced by x rays that did not interact between the source and the detector
* (non-scattered), x rays that suffered a single Compton (inelastic) interaction, a single
* Rayleigh (elastic) interaction, and multi-scattered x rays.
* The dose tally counts the energy deposited by each x ray track inside each voxel of the
* geometry, within a user-defined volumetric region-of-interest (ROI). The average dose deposited
* inside each voxel and in each material (and the associated statistical uncertainties) are reported
* at the end of the simulation.
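*
* A minimal sketch of how such an energy-integrating image tally can be accumulated (the
* names below are illustrative, not the actual tally routine of this code):
* \code
*    // Add the energy (eV) of an x ray crossing the detector plane to the pixel it hits:
*    int pixel = pixel_x + pixel_y * num_pixels_x;               // 2D pixel grid stored with X first
*    image[pixel] += (unsigned long long int)(energy_eV + 0.5);  // integer accumulation in the image array
*    // After the simulation, the counts can be normalized (e.g., per history and per pixel area)
*    // to report the pixel values in eV/cm^2.
* \endcode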
*
*
* MC-GPU can simulate a single projection image or a full CT scan.
* The CT scan is simulated by generating many projection images around the static
* voxelized geometry. Currently, the code is limited to a simple
* CT trajectory rotating around the Z axis. The user can specify the angular shift and longitudinal
* translation (pitch) of the source between each projection and also the distance between the
* source and the axis of rotation (the axis is assumed to be parallel to the Z axis).
* At present, the code does not simulate some relevant components of a CT scanner, such as the
* anti-scatter grid, a bow-tie filter or a curved detector (flat-panel detector only).
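*
* A minimal sketch of the projection placement and the angular region-of-interest check
* (this mirrors the projection loop in main(); the z_shift variable is illustrative):
* \code
*    // For projection number p (0 <= p < num_projections):
*    double current_angle = initial_angle + p * D_angle;              // rotation around the Z axis
*    double z_shift = p * vertical_translation_per_projection;        // helical pitch (longitudinal translation)
*    if (current_angle < angularROI_0 || current_angle > angularROI_1)
*      ;   // projection outside the angular region of interest: skipped
* \endcode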
*
*
* The x ray source is defined as a point source emitting x rays with an energy randomly sampled
* from the user-provided energy spectrum. The polyenergetic spectrum is efficiently sampled
* using the Walker aliasing algorithm. The emitted cone beam is computationally
* collimated to produce a rectangular field on the detector plane, within the azimuthal and
* polar angles specified by the user.
* The detector plane is automatically located at the specified distance right in front of the
* source focal spot, with the collimated cone beam pointing towards the geometric center of the detector.
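*
* A minimal sketch of Walker alias sampling, assuming a pre-computed alias table with a cutoff
* probability and an alias bin for every spectrum bin (the array and function names are illustrative):
* \code
*    // Two random numbers select an energy bin in O(1), regardless of the number of bins:
*    int bin = (int)(rand01() * num_bins_espc);     // pick a bin uniformly at random
*    if (rand01() > espc_cutoff[bin])               // with probability (1 - cutoff), take the alias bin
*      bin = espc_alias[bin];
*    float energy = espc[bin];                      // emitted energy (optionally interpolated inside the bin)
* \endcode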
*
*
* In order to optimize the particle tracking algorithm (ray-tracing) and minimize
* the accesses to the slow GPU main memory, the photon trajectories across the voxels
* are computed using the Woodcock tracking algorithm.
* With this technique the photons perceive the geometry as a uniform medium
* composed of the material of the most attenuating voxel.
* In this way, the voxel boundaries do not have to be explicitly calculated and
* multiple voxels can be crossed in a single step.
* To keep the simulation unbiased, some of the interactions are considered
* "virtual" (i.e., do not change the photon energy or direction of movement),
* depending on the x ray energy and the actual material at the interaction site.
* In typical medical imaging simulations where the most attenuating material is cortical bone,
* the Woodcock tracking algorithm gives a speed-up of almost one order of magnitude compared
* to computing voxel boundaries all the time. However, if the geometry includes a high
* density voxel, such as a metallic implant, the performance of the code can be severely
* reduced because a large fraction of the sampled interactions will be virtual.
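*
* A minimal sketch of the Woodcock (delta-scattering) loop, with illustrative names
* (mfp_Woodcock is the mean free path of the most attenuating material at the current energy):
* \code
*    do
*    {
*      float s = -mfp_Woodcock(E) * logf(rand01());   // free flight sampled in the uniform majorant medium
*      x += s*u;  y += s*v;  z += s*w;                // advance the photon, ignoring voxel boundaries
*      // Accept a real interaction with probability (local cross section / majorant cross section),
*      // which equals mfp_Woodcock / mfp_local; otherwise the interaction is virtual and tracking continues:
*    } while ( rand01() > mfp_Woodcock(E) / mfp_local(E, material_at(x,y,z)) );
* \endcode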
*
*
* The random number generator used in PENELOPE [5], RANECU, is also used in the GPU
* program. To ensure that the simulated tracks are not correlated, each thread initializes
* the generator to a unique position in the random sequence, far enough from the
* other threads, using the algorithm implemented in the seedsMLCG code [7].
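*
* For reference, a sketch of the RANECU recurrence (L'Ecuyer's combined multiplicative
* congruential generator, as used in PENELOPE); each thread carries the two integer seeds:
* \code
*    inline float ranecu_sketch(int2 *seed)    // illustrative sketch, not the kernel function of this file
*    {
*      int i = seed->x / 53668;
*      seed->x = 40014 * (seed->x - i * 53668) - i * 12211;
*      if (seed->x < 0) seed->x += 2147483563;
*      i = seed->y / 52774;
*      seed->y = 40692 * (seed->y - i * 52774) - i * 3791;
*      if (seed->y < 0) seed->y += 2147483399;
*      int z = seed->x - seed->y;
*      if (z < 1) z += 2147483562;
*      return z * 4.65661305739e-10f;   // scale the combined integer to (0,1)
*    }
* \endcode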
*
*
* In a typical simulation, several thousand threads are launched simultaneously in
* the GPU, each one of them simulating a batch of several x ray tracks.
* If the code is compiled with MPI support (see below), multiple GPUs can be used in parallel.
* The code will perform a short speed test to estimate the relative speed of each GPU used
* in the simulation and then distribute the particles among the available GPUs accordingly.
* If the user specifies a time limit for the simulation, all the GPUs will simulate in parallel
* for the allowed time. Since the code is already optimized to scale well across
* thousands of GPU threads, it scales almost linearly with the number of GPUs in most
* situations, with only a few seconds of overhead in the initialization of the multiple GPUs
* and in the reduction of the final results.
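*
* A minimal sketch of the history split after the speed test (this mirrors the logic in main();
* the name remaining_histories is illustrative):
* \code
*    // node_speed  = histories simulated by this GPU in the speed test / elapsed seconds
*    // total_speed = sum of node_speed over all GPUs (obtained with MPI_Allreduce)
*    unsigned long long int my_histories =
*         (unsigned long long int)(0.5 + (double)remaining_histories * (double)(node_speed/total_speed));
* \endcode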
*
*
*
*
* \section sec_output Code output
*
* At the end of the simulation the code reports the tallied 3D dose distribution and the
* final simulated images in RAW binary form, as 32-bit float values. The image data is provided
* as a collection of five consecutive images corresponding to: total image (scatter+primaries),
* primary particles, Compton, Rayleigh and multi-scatter.
* The dose data is reported as two RAW files with the mean dose and twice the standard deviation
* of the dose in each voxel of the geometry respectively, within the input ROI.
* The average dose deposited in each material of the geometry is also reported to the standard output.
* Organ doses can be obtained by post-processing the output dose file, knowing which voxel
* corresponds to each organ.
* The pixel and voxel dose data values are stored with the X coordinate incrementing first, the Y
* coordinate incrementing second, and the Z coordinate incrementing last.
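*
* As a hedged illustration (not a utility distributed with the code), the offset of pixel (i,j)
* inside the 5-image RAW file, and of voxel (i,j,k) inside the reported dose volume, follow
* directly from this ordering:
* \code
*    // image_number: 0=total, 1=primary, 2=Compton, 3=Rayleigh, 4=multi-scatter (one 32-bit float per value)
*    long pixel_offset = (long)image_number * num_pixels_x * num_pixels_y + j * num_pixels_x + i;
*    long voxel_offset = (k * (long)num_voxels_y + j) * num_voxels_x + i;   // within the reported ROI
* \endcode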
*
* The program also reports the simulated images and the dose at the Z plane at the level of the x ray
* source as ASCII text files. The ASCII output can be readily visualized with the GNUPLOT scripts
* distributed with MC-GPU. The header section at the beginning of these text files provides the
* information required to easily read the RAW binary files with IMAGEJ, OCTAVE or other programs.
*
*
*
* \section sec_compilation Code compilation and execution
*
* MC-GPU has been developed and tested only on the Linux operating system.
* A Makefile script is provided to compile the MC-GPU code in Linux.
* The CUDA libraries and the GNU GCC compiler must be previously installed.
* The Makefile may have to be edited to modify the library path.
* The code requires the zlib library ("zlib.h") to be able to open gzipped input files.
*
*
* MC-GPU uses CUDA to access NVIDIA GPUs but all the actual computations are coded
* in standard C and the CUDA-specific commands are enclosed within preprocessor
* "if" statements. Defining the pre-processor variable "USING_CUDA" (i.e.,
* compiling with "-DUSING_CUDA") the particle transport routines are compiled to simulate
* many x ray histories in parallel in an NVIDIA GPU using CUDA. Otherwise, the code is
* sequentially executed in the CPU.
* The same coding approach has been used to allow the use of multiple GPUs.
* When the pre-processor variable "USING_MPI" is defined (i.e., compiling with
* "-DUSING_MPI"), Message Passing Interface (MPI) library calls are used to share information
* between multiple CPU threads in different computers.
* Each MPI thread gets a unique id in the CPU and addresses a unique GPU.
* At the end of the simulation the images and doses tallied by the different GPUs are
* reduced to form a single output file equivalent to a sequential simulation of the same
* number of particles.
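*
* A minimal illustration of this dual compilation pattern (arguments elided):
* \code
*    #ifdef USING_CUDA
*      track_particles<<<blocks,threads>>>(histories_per_thread, ...);   // parallel GPU kernel launch
*    #else
*      for (int batch = 0; batch < total_batches; batch++)
*        track_particles(batch, histories_per_thread, ...);              // same routine, called sequentially in the CPU
*    #endif
* \endcode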
*
* The code can be easily compiled executing the command "make" or running the provided
* "./make.sh" script.
* Optionally, the code can be compiled directly from the command line with a command like this
* (example using CUDA and MPI, openMPI library in this case):
* \code
* nvcc -DUSING_CUDA -DUSING_MPI MC-GPU_v1.3.cu -o MC-GPU_v1.3.x -O3
* -use_fast_math -L/usr/lib/ -I. -I/usr/local/cuda/include
* -I/usr/local/cuda/samples/common/inc -I/usr/local/cuda/samples/shared/inc/
* -I/usr/include/openmpi -lmpi -lz --ptxas-options=-v
* -gencode=arch=compute_20,code=sm_20 -gencode=arch=compute_30,code=sm_30
* \endcode
*
* The same source code can also be compiled for a regular CPU using:
* \code
* gcc -x c -O3 MC-GPU_v1.3.cu -o MC-GPU_v1.3_CPU.x -I./ -lm -lz
* \endcode
*
* To run a simulation (and keep the information reported to the standard
* output in an external file) the compiled code can be executed as:
* \code
* ./MC-GPU_v1.3.x MC-GPU_v1.3.in | tee MC-GPU_v1.3.out
* \endcode
*
* All simulations can be executed in the same way using the code compiled for the CPU
* or the GPU (however, the number of histories should be reduced for the CPU to finish
* the simulation in a reasonable time).
* To run the simulation in parallel with MPI in multiple GPUs (or CPU cores) in the
* current computer the user can execute:
* \code
* mpirun -n 4 ./MC-GPU_v1.3.x MC-GPU_v1.3.in
* \endcode
*
* To use GPUs in different computers, the user must make sure all computers can access the simulation
* files and that the libraries are correctly set up in all nodes.
* To execute a simulation (with verbose MPI information being reported):
* \code
* mpirun --tag-output -v -x LD_LIBRARY_PATH -hostfile myhostfile.txt -n 8
* /fullPath/MC-GPU_v1.3.x /fullPath/MC-GPU_v1.3.in | tee MC-GPU_v1.3.out
* \endcode
*
* The text file given to the "-hostfile" option ('myhostfile.txt' above) lists the IP addresses and number of computing slots (GPUs) of the
* computers collaborating in the simulation. This file is not necessary when using multiple
* GPUs in a single workstation. When using multiple computers, the simulation files should
* be located in a shared drive to make sure every node can access the input data.
* The different workstations must have different host names in order to be differentiated by
* the MPI threads. The multiple threads communicate with each other to make sure they don't
* use the same GPU in the same workstation.
*
*
*
* \section sec_issues Known issues
*
* In extremely long simulations, it is theoretically possible to cause an overflow of the counters
* estimating the mean and standard deviation of the material or voxel doses. If this happens, the
* results will be incorrect, and even negative or NaN values can be reported.
*
*
*
*
* \section sec_ref References
*
* -# A. Badal and A. Badano, Accelerating Monte Carlo simulations of photon transport in a voxelized geometry using a massively parallel Graphics Processing Unit, Med. Phys. 36, p. 4878-4880 (2009)
* -# A. Badal and A. Badano, Monte Carlo Simulation of X-Ray Imaging Using a Graphics Processing Unit, IEEE NSC-MIC, Conference Record , HP3–1, p. 4081-4084 (2009)
* -# A. Badal, I. Kyprianou, D. Sharma and A. Badano, Fast cardiac CT simulation using a Graphics Processing Unit-accelerated Monte Carlo code, Proc. SPIE Medical Imaging Conference 7622, p. 762231 (2010)
* -# A. Badal and A. Badano, Fast Simulation of Radiographic Images Using a Monte Carlo X-Ray Transport Algorithm Implemented in CUDA, Chapter 50 of GPU Computing Gems (Emerald Edition), p. 813-830, editor Wen-mei W. Hwu, publisher Morgan Kaufmann (Elsevier), Burlington MA, 2010
* -# F. Salvat, J. M. Fernandez-Varea and J. Sempau, PENELOPE – A code system for Monte Carlo simulation of electron and photon transport, NEA-OECD, Issy-les-Moulineaux, available at www.nea.fr/html/dbprog/peneloperef.html (2006)
* -# NVIDIA Corporation, NVIDIA CUDA(TM) Programming Guide, Technical Report available at www.nvidia.com/cuda (2011)
* -# A. Badal and J. Sempau, A package of Linux scripts for the parallelization of Monte Carlo simulations, Comput. Phys. Commun. 175 (6), p. 440-450 (2006)
*
*
*
* @file MC-GPU_v1.3.cu
* @author Andreu Badal (Andreu.Badal-Soler{at}fda.hhs.gov)
* @date 2012/12/12
* -- MC-GPU v.1.3: 2012/12/12
* -- MC-GPU v.1.2: 2011/10/25
* -- MC-GPU v.1.1: 2010/06/25
* -- MC-GPU v.1.0: 2009/03/17
*/
////////////////////////////////////////////////////////////////////////////////////////
// *** Include header file with the structures and functions declarations
#include <MC-GPU_v1.3.h>
// *** Include the computing kernel:
#include <MC-GPU_kernel_v1.3.cu>
////////////////////////////////////////////////////////////////////////////////
//! Main program of MC-GPU: initialize the simulation environment, launch the GPU
//! kernels that perform the x ray transport and report the final results.
//! This function reads the description of the simulation from an external file
//! given in the command line. This input file defines the number of particles to
//! simulate, the characteristics of the x-ray source and the detector, the number
//! and spacing of the projections (if simulating a CT), the location of the
//! material files containing the interaction mean free paths, and the location
//! of the voxelized geometry file.
//!
//! @author Andreu Badal
//!
////////////////////////////////////////////////////////////////////////////////
int main(int argc, char **argv)
{
// -- Start time counter:
time_t current_time = time(NULL); // Get current time (in seconds)
clock_t clock_start, clock_end, clock_start_beginning; // (requires standard header <time.h>)
clock_start = clock(); // Get current clock counter
clock_start_beginning = clock_start;
#ifdef USING_MPI
// -- Using MPI to access multiple GPUs to simulate the x-ray projection image:
int myID = -88, numprocs = -99, return_reduce = -1;
MPI_Init(&argc, &argv); // Init MPI and get the current thread ID
MPI_Comm_rank(MPI_COMM_WORLD, &myID);
MPI_Comm_size(MPI_COMM_WORLD, &numprocs);
char MPI_processor_name[81];
int resultlen = -1;
MPI_Get_processor_name(MPI_processor_name, &resultlen);
char* char_time = ctime(&current_time); char_time[19] = '\0'; // The time is located between the characters 11 and 19.
printf(" >> MPI run (myId=%d, numprocs=%d) on processor \"%s\" (time: %s) <<\n", myID, numprocs, MPI_processor_name, &char_time[11]);
fflush(stdout); // Clear the screen output buffer
MPI_Barrier(MPI_COMM_WORLD); // Synchronize MPI threads
MASTER_THREAD printf(" -- Time spent initializing the MPI world (MPI_Barrier): %.3f s\n", ((double)(clock()-clock_start))/CLOCKS_PER_SEC);
#else
int myID = 0, numprocs = 1; // Only one CPU thread used when MPI is not activated (multiple projections will be simulated sequentially).
#endif
MASTER_THREAD
{
printf("\n\n *****************************************************************************\n");
printf( " *** MC-GPU, version 1.3 (http://code.google.com/p/mcgpu/) ***\n");
printf( " *** ***\n");
printf( " *** A. Badal and A. Badano, \"Accelerating Monte Carlo simulations of *** \n");
printf( " *** photon transport in a voxelized geometry using a massively parallel *** \n");
printf( " *** Graphics Processing Unit\", Medical Physics 36, pp. 4878–4880 (2009) ***\n");
printf( " *** ***\n");
printf( " *** Andreu Badal ([email protected]) ***\n");
printf( " *****************************************************************************\n\n");
printf("****** Code execution started on: %s\n\n", ctime(¤t_time));
fflush(stdout);
}
#ifdef USING_CUDA
// The "MASTER_THREAD" macro prints the messages just once when using MPI threads (it has no effect if MPI is not used): MASTER_THREAD == "if(0==myID)"
MASTER_THREAD printf ("\n *** CUDA SIMULATION IN THE GPU ***\n");
#else
MASTER_THREAD printf ("\n *** SIMULATION IN THE CPU ***\n");
#endif
MASTER_THREAD printf("\n -- INITIALIZATION phase:\n");
MASTER_THREAD fflush(stdout); // Clear the screen output buffer for the master thread
///////////////////////////////////////////////////////////////////////////////////////////////////
// *** Declare the arrays and structures that will contain the simulation data:
struct voxel_struct voxel_data; // Define the geometric constants of the voxel file
struct detector_struct detector_data[MAX_NUM_PROJECTIONS]; // Define an x ray detector (for each projection)
struct source_struct source_data[MAX_NUM_PROJECTIONS]; // Define the particles source (for each projection)
struct source_energy_struct source_energy_data; // Define the source energy spectrum
struct linear_interp mfp_table_data; // Constant data for the linear interpolation
struct compton_struct compton_table; // Structure containing Compton sampling data (to be copied to CONSTANT memory)
struct rayleigh_struct rayleigh_table; // Structure containing Rayleigh sampling data (to be copied to CONSTANT memory)
float2 *voxel_mat_dens = NULL; // Pointer where the voxels array will be allocated
unsigned int voxel_mat_dens_bytes = 0; // Size (in bytes) of the voxels array (using unsigned int to allocate up to 4.2GBytes)
float density_max[MAX_MATERIALS];
float density_nominal[MAX_MATERIALS];
unsigned long long int *image = NULL; // Pointer where the image array will be allocated
int image_bytes = -1; // Size of the image array
int mfp_table_bytes = -1, mfp_Woodcock_table_bytes = -1; // Size of the table arrays
float2 *mfp_Woodcock_table = NULL; // Linear interpolation data for the Woodcock mean free path [cm]
float3 *mfp_table_a = NULL, *mfp_table_b = NULL; // Linear interpolation data for 3 different interactions:
// (1) inverse total mean free path (divided by density, cm^2/g)
// (2) inverse Compton mean free path (divided by density, cm^2/g)
// (3) inverse Rayleigh mean free path (divided by density, cm^2/g)
short int dose_ROI_x_min, dose_ROI_x_max, dose_ROI_y_min, dose_ROI_y_max, dose_ROI_z_min, dose_ROI_z_max; // Coordinates of the dose region of interest (ROI)
ulonglong2 *voxels_Edep = NULL; // Pointer where the voxel energy deposition array will be allocated
int voxels_Edep_bytes = 0; // Size of the voxel Edep array
ulonglong2 materials_dose[MAX_MATERIALS]; // Array for tally_materials_dose. !!tally_materials_dose!!
int kk;
for(kk=0;kk<MAX_MATERIALS;kk++)
{
materials_dose[kk].x = 0; // Initializing data !!tally_materials_dose!!
materials_dose[kk].y = 0;
density_nominal[kk] =-1.0f;
}
clock_t clock_kernel; // Using only CPU timers after CUDA 5.0
double time_elapsed_MC_loop = 0.0, time_total_MC_simulation = 0.0, time_total_MC_init_report = 0.0;
unsigned long long int total_histories;
int histories_per_thread, seed_input, num_threads_per_block, gpu_id, num_projections;
int flag_material_dose = -2;
double D_angle=-1.0, angularROI_0=0.0, angularROI_1=360.0, initial_angle=0.0, SRotAxisD=-1.0, vertical_translation_per_projection=0.0;
char file_name_voxels[250], file_name_materials[MAX_MATERIALS][250], file_name_output[250], file_dose_output[250], file_name_espc[250];
// *** Read the input file given in the command line and return the significant data:
read_input(argc, argv, myID, &total_histories, &seed_input, &gpu_id, &num_threads_per_block, &histories_per_thread, detector_data, &image, &image_bytes, source_data, &source_energy_data, file_name_voxels, file_name_materials, file_name_output, file_name_espc, &num_projections, &D_angle, &angularROI_0, &angularROI_1, &initial_angle, &voxels_Edep, &voxels_Edep_bytes, file_dose_output, &dose_ROI_x_min, &dose_ROI_x_max, &dose_ROI_y_min, &dose_ROI_y_max, &dose_ROI_z_min, &dose_ROI_z_max, &SRotAxisD, &vertical_translation_per_projection, &flag_material_dose);
// *** Read the energy spectrum and initialize its sampling with the Walker aliasing method:
MASTER_THREAD printf(" -- Reading the energy spectrum and initializing the Walker aliasing sampling algorithm.\n");
float mean_energy_spectrum = 0.0f;
init_energy_spectrum(file_name_espc, &source_energy_data, &mean_energy_spectrum);
// *** Output some of the data read to make sure everything was correctly read:
MASTER_THREAD
{
if (total_histories < (unsigned long long int)(100000))
printf(" simulation time = %lld s\n", total_histories);
else
printf(" x-ray tracks to simulate = %lld\n", total_histories);
printf(" initial random seed = %d\n", seed_input);
printf(" azimuthal (phi), polar apertures = %.6f , %.6f degrees\n", ((double)source_data[0].D_phi)*RAD2DEG, 2.0*(90.0 - acos(((double)source_data[0].cos_theta_low))*RAD2DEG) );
printf(" focal spot position = (%f, %f, %f)\n", source_data[0].position.x, source_data[0].position.y, source_data[0].position.z);
printf(" source direction = (%f, %f, %f)\n", source_data[0].direction.x, source_data[0].direction.y, source_data[0].direction.z);
printf(" initial angle from X = %lf\n", initial_angle*RAD2DEG);
printf(" source-detector distance = %f cm\n", detector_data[0].sdd);
printf(" detector center = (%f, %f, %f)\n", (source_data[0].position.x + source_data[0].direction.x * detector_data[0].sdd), // Center of the detector straight ahead of the focal spot.
(source_data[0].position.y + source_data[0].direction.y * detector_data[0].sdd),
(source_data[0].position.z + source_data[0].direction.z * detector_data[0].sdd));
printf(" detector low corner (at +Y) = (%f, %f, %f)\n", detector_data[0].corner_min_rotated_to_Y.x, detector_data[0].corner_min_rotated_to_Y.y, detector_data[0].corner_min_rotated_to_Y.z);
printf(" number of pixels image = %dx%d = %d\n", detector_data[0].num_pixels.x, detector_data[0].num_pixels.y, detector_data[0].total_num_pixels);
printf(" pixel size = %.3fx%.3f cm\n", 1.0f/detector_data[0].inv_pixel_size_X, 1.0f/detector_data[0].inv_pixel_size_Z);
printf(" number of projections = %d\n", num_projections);
if (num_projections!=1)
{
printf(" source-rotation axis-distance = %lf cm\n", SRotAxisD);
printf(" angle between projections = %lf\n", D_angle*RAD2DEG);
printf(" angular region of interest = [%lf,%lf] degrees\n", angularROI_0*RAD2DEG, angularROI_1*RAD2DEG);
printf(" vertical translation per projection = %lf cm\n", vertical_translation_per_projection);
}
printf(" Input voxel file = %s\n", file_name_voxels);
printf(" Output image file = %s\n", file_name_output);
if (dose_ROI_x_max>-1)
{
printf(" Output dose file = %s\n", file_dose_output);
printf(" Input region of interest dose = X[%d,%d], Y[%d,%d], Z[%d,%d]\n", dose_ROI_x_min+1, dose_ROI_x_max+1, dose_ROI_y_min+1, dose_ROI_y_max+1, dose_ROI_z_min+1, dose_ROI_z_max+1); // Show ROI with index=1 for the first voxel instead of 0.
}
printf("\n Energy spectrum file = %s\n", file_name_espc);
printf( " number of energy bins read = %d\n", source_energy_data.num_bins_espc);
printf( " minimum, maximum energies = %.3f, %.3f keV\n", 0.001f*source_energy_data.espc[0], 0.001f*source_energy_data.espc[source_energy_data.num_bins_espc]);
printf( " mean energy spectrum = %.3f keV\n\n", 0.001f*mean_energy_spectrum);
fflush(stdout);
}
// *** Set the detectors and sources for the CT trajectory (if needed, ie, for more than one projection):
if (num_projections != 1)
{
set_CT_trajectory(myID, num_projections, D_angle, angularROI_0, angularROI_1, SRotAxisD, source_data, detector_data, vertical_translation_per_projection);
}
fflush(stdout);
// *** Read the voxel data and allocate the density map matrix. Return the maximum density:
load_voxels(myID, file_name_voxels, density_max, &voxel_data, &voxel_mat_dens, &voxel_mat_dens_bytes, &dose_ROI_x_max, &dose_ROI_y_max, &dose_ROI_z_max);
MASTER_THREAD printf(" Total CPU memory allocated for voxels vector and data structures = %f Mbytes\n", (voxel_mat_dens_bytes+image_bytes+sizeof(struct voxel_struct)+sizeof(struct source_struct)+sizeof(struct detector_struct)+sizeof(struct linear_interp)+2*mfp_table_bytes+sizeof(struct rayleigh_struct)+sizeof(struct compton_struct))/(1024.f*1024.f));
MASTER_THREAD fflush(stdout);
// *** Read the material mean free paths and set the interaction table in a "linear_interp" structure:
load_material(myID, file_name_materials, density_max, density_nominal, &mfp_table_data, &mfp_Woodcock_table, &mfp_Woodcock_table_bytes, &mfp_table_a, &mfp_table_b, &mfp_table_bytes, &rayleigh_table, &compton_table);
// -- Check that the input material tables and the x-ray source are consistent:
if ( (source_energy_data.espc[0] < mfp_table_data.e0) || (source_energy_data.espc[source_energy_data.num_bins_espc] > (mfp_table_data.e0 + (mfp_table_data.num_values-1)/mfp_table_data.ide)) )
{
MASTER_THREAD
{
printf("\n\n\n !!ERROR!! The input x-ray source energy spectrum minimum (%.3f eV) and maximum (%.3f eV) energy values\n", source_energy_data.espc[0], source_energy_data.espc[source_energy_data.num_bins_espc]);
printf( " are outside the tabulated energy interval for the material properties tables (from %.3f to %.3f eV)!!\n", mfp_table_data.e0, (mfp_table_data.e0+(mfp_table_data.num_values-1)/mfp_table_data.ide));
printf( " Please, modify the input energy spectra to fit the tabulated limits or create new tables.\n\n");
}
#ifdef USING_MPI
MPI_Finalize();
#endif
exit(-1);
}
// -- Pre-compute the total mass of each material present in the voxel phantom (to be used in "report_materials_dose"):
double voxel_volume = 1.0 / ( ((double)voxel_data.inv_voxel_size.x) * ((double)voxel_data.inv_voxel_size.y) * ((double)voxel_data.inv_voxel_size.z) );
double mass_materials[MAX_MATERIALS];
for(kk=0; kk<MAX_MATERIALS; kk++)
mass_materials[kk] = 0.0;
for(kk=0; kk<(voxel_data.num_voxels.x*voxel_data.num_voxels.y*voxel_data.num_voxels.z); kk++) // For each voxel in the geometry
mass_materials[((int)voxel_mat_dens[kk].x)-1] += ((double)voxel_mat_dens[kk].y)*voxel_volume; // Add material mass = density*volume
// *** Initialize the GPU using the NVIDIA CUDA libraries, if USING_CUDA parameter defined at compile time:
#ifdef USING_CUDA
// -- Declare the pointers to the device global memory, when using the GPU:
float2 *voxel_mat_dens_device = NULL,
*mfp_Woodcock_table_device = NULL;
float3 *mfp_table_a_device = NULL,
*mfp_table_b_device = NULL;
unsigned long long int *image_device = NULL;
struct rayleigh_struct *rayleigh_table_device = NULL;
struct compton_struct *compton_table_device = NULL;
ulonglong2 *voxels_Edep_device = NULL;
struct detector_struct *detector_data_device = NULL;
struct source_struct *source_data_device = NULL;
ulonglong2 *materials_dose_device = NULL; // !!tally_materials_dose!!
// -- Set the CUDA-enabled GPU that will be used in the simulation, and allocate and copy the simulation data to the GPU global and constant memories.
init_CUDA_device(&gpu_id, myID, numprocs, &voxel_data, source_data, &source_energy_data, detector_data, &mfp_table_data, /*Variables GPU constant memory*/
voxel_mat_dens, &voxel_mat_dens_device, voxel_mat_dens_bytes, /*Variables GPU global memory*/
image, &image_device, image_bytes,
mfp_Woodcock_table, &mfp_Woodcock_table_device, mfp_Woodcock_table_bytes,
mfp_table_a, mfp_table_b, &mfp_table_a_device, &mfp_table_b_device, mfp_table_bytes,
&rayleigh_table, &rayleigh_table_device,
&compton_table, &compton_table_device, &detector_data_device, &source_data_device,
voxels_Edep, &voxels_Edep_device, voxels_Edep_bytes, &dose_ROI_x_min, &dose_ROI_x_max, &dose_ROI_y_min, &dose_ROI_y_max, &dose_ROI_z_min, &dose_ROI_z_max,
materials_dose, &materials_dose_device, flag_material_dose, num_projections);
// -- Constant data already moved to the GPU: clean up unnecessary RAM memory
free(mfp_Woodcock_table);
free(mfp_table_a);
free(mfp_table_b);
if (0!=myID) // Keep the geometry data for the MPI root because the voxel densities are still needed to compute the final doses
free(voxel_mat_dens);
#endif
MASTER_THREAD
{
current_time=time(NULL);
printf("\n -- INITIALIZATION finished: elapsed time = %.3f s. \n\n", ((double)(clock()-clock_start))/CLOCKS_PER_SEC);
}
#ifdef USING_MPI
fflush(stdout);
MPI_Barrier(MPI_COMM_WORLD); // Synchronize MPI threads before starting the MC phase.
#endif
///////////////////////////////////////////////////////////////////////////////////////////////////
MASTER_THREAD
{
current_time=time(NULL);
printf("\n\n -- MONTE CARLO LOOP phase. Time: %s\n\n", ctime(¤t_time));
fflush(stdout);
}
// -- A number of histories smaller than 24 hours in seconds (3600*24=86400) means that the user wants to simulate for the input number of seconds in each GPU, not a fixed number of histories:
unsigned long long int total_histories_INPUT = total_histories; // Save the original input values to be re-used for multiple projections
int seed_input_INPUT = seed_input, doing_speed_test = -1;
int simulating_by_time = 0; // 0==false
if (total_histories < (unsigned long long int)(95000))
simulating_by_time = 1; // 1=true
int num_blocks_speed_test = 0;
unsigned long long int histories_speed_test = (unsigned long long int)0, total_histories_speed_test = (unsigned long long int)0;
float node_speed = -1.0f, total_speed = 1.0f;
double current_angle;
int num_p; // == current projection number
// *** CT simulation: find the current projection angle and start Monte Carlo simulation:
for (num_p=0; num_p<num_projections; num_p++)
{
// -- Check if this projection is inside the input angular region of interest (the angle can be negative, or larger than 360 in helical scans):
current_angle = initial_angle + num_p * D_angle;
if ((current_angle < angularROI_0) || (current_angle > angularROI_1))
{
MASTER_THREAD printf(" << Skipping projection #%d of %d >> Angle %f degrees: outside angular region of interest.\n", num_p+1, num_projections, current_angle*RAD2DEG);
continue; // Cycle loop: do not simulate this projection!
}
if (num_projections!=1)
MASTER_THREAD printf("\n\n\n << Simulating Projection %d of %d >> Angle: %lf degrees.\n\n\n", num_p+1, num_projections, current_angle*RAD2DEG);
clock_start = clock(); // Start the CPU clock
#ifdef USING_CUDA
// *** Simulate in the GPUs the input amount of time or amount of particles:
// -- Estimate GPU speed to use a total simulation time or multiple GPUs:
if ( simulating_by_time==0 && // Simulating a fixed number of particles, not a fixed time (so performing the speed test only once)
node_speed>0.0f && // Speed test already performed for a previous projection in this simulation (node_speed and total_speed variables set)
numprocs>1) // Using multiple GPUs (ie, multiple MPI threads)
{
// -- Simulating successive projections after the first one with a fixed number of particles, with multiple MPI threads: re-use the speed test results from the first projection image:
total_histories = (unsigned long long int)(0.5 + ((double)total_histories_INPUT) * (((double)node_speed)/total_speed));
doing_speed_test = 0; // No speed test for this projection.
}
else if ( simulating_by_time==1 || numprocs>1)
{
// -- Simulating with a time limit OR multiple MPI threads for the first time (num_p==0): run a speed test to calculate the speed of the current GPU and distribute the number of particles to the multiple GPUs or estimate the total number of particles required to run the input amount of time:
// Note that this ELSE IF block will be skipped if we are using a single MPI thread and a fixed number of particles.
doing_speed_test = 1; // Remember that we are performing the speed test to make sure we add the test histories to the total before the tally reports.
if (node_speed<0.0f) // Speed test not performed before (first projection being simulated): set num_blocks_speed_test and histories_speed_test.
{
num_blocks_speed_test = guestimate_GPU_performance(gpu_id); // Guesstimating a good number of blocks to estimate the speed of different generations of GPUs. Slower GPUs will simulate fewer particles and hopefully the fastest GPUs will not have to wait much.
// !!DeBuG!! Error in code version 1.2 has been corrected here. Old code: histories_speed_test = (unsigned long long int)(num_blocks_speed_test*num_threads_per_block)*(unsigned long long int)(histories_per_thread);
}
histories_speed_test = (unsigned long long int)(num_blocks_speed_test*num_threads_per_block)*(unsigned long long int)(histories_per_thread);
// Re-load the input total number of histories and the random seed:
total_histories = total_histories_INPUT;
seed_input = seed_input_INPUT;
dim3 blocks_speed_test(num_blocks_speed_test, 1);
dim3 threads_speed_test(num_threads_per_block, 1);
// -- Init the current random number generator seed to avoid overlapping sequences with other MPI threads:
if (simulating_by_time == 1)
// Simulating by time: set an arbitrary huge number of particles to skip.
update_seed_PRNG( (myID + num_p*numprocs), (unsigned long long int)(123456789012), &seed_input); // Set the random number seed far from any other MPI thread (myID) and away from the seeds used in the previous projections (num_p*numprocs).
else
// Simulating by histories
update_seed_PRNG( (myID + num_p*numprocs), total_histories, &seed_input); // Using different random seeds for each projection
#ifdef USING_MPI
printf(" ==> CUDA (MPI process #%d in \"%s\"): estimate GPU speed executing %d blocks of %d threads, %d histories per thread: %lld histories in total (random seed: %d).\n", myID, MPI_processor_name, num_blocks_speed_test, num_threads_per_block, histories_per_thread, histories_speed_test, seed_input);
#else
printf(" ==> CUDA: Estimating the GPU speed executing %d blocks of %d threads, %d histories per thread: %lld histories in total.\n", num_blocks_speed_test, num_threads_per_block, histories_per_thread, histories_speed_test);
#endif
fflush(stdout);
clock_kernel = clock();
// -- Launch Monte Carlo simulation kernel for the speed test:
track_particles<<<blocks_speed_test,threads_speed_test>>>(histories_per_thread, num_p, seed_input, image_device, voxels_Edep_device, voxel_mat_dens_device, mfp_Woodcock_table_device, mfp_table_a_device, mfp_table_b_device, rayleigh_table_device, compton_table_device, detector_data_device, source_data_device, materials_dose_device);
#ifdef USING_MPI
// Find out the total number of histories simulated in the speed test by all the GPUs. Note that this MPI call will be executed in parallel with the GPU kernel because it is located before the cudaThreadSynchronize command!
return_reduce = MPI_Allreduce(&histories_speed_test, &total_histories_speed_test, 1, MPI_UNSIGNED_LONG, MPI_SUM, MPI_COMM_WORLD);
if (MPI_SUCCESS != return_reduce)
printf("\n\n !!ERROR!! Error reducing (MPI_Allreduce) the total number of histories in the speed test test??? return_reduce = %d for thread %d\n\n\n", return_reduce, myID);
else
#else
total_histories_speed_test = histories_speed_test;
#endif
cudaThreadSynchronize(); // Force the runtime to wait until GPU kernel has completed
getLastCudaError("\n\n !!Kernel execution failed while simulating particle tracks!! "); // Check if the CUDA function returned any error
float speed_test_time = float(clock()-clock_kernel)/CLOCKS_PER_SEC;
node_speed = (float) (((double)histories_speed_test)/speed_test_time);
#ifdef USING_MPI
printf(" (MPI process #%d): Estimated GPU speed = %lld hist / %.4f s = %.3f hist/s\n", myID, histories_speed_test, speed_test_time, node_speed);
#else
printf(" Estimated GPU speed = %lld hist / %.3f s = %.3f hist/s\n", histories_speed_test, speed_test_time, node_speed);
#endif
// -- Init random number generator seed to avoid repeating the random numbers used in the speed test:
update_seed_PRNG(1, histories_speed_test, &seed_input);
if (simulating_by_time==1)
{
// -- Set number of histories for each GPU when simulating by time:
if (total_histories > speed_test_time)
total_histories = (total_histories - speed_test_time)*node_speed; // Calculate the total number of remaining histories by "GPU speed" * "remaining time"
else
total_histories = 1; // Enough particles simulated already, simulate just one more history (block) and report (kernel call would fail if total_histories < or == 0).
}
else
{
#ifdef USING_MPI
// -- Simulating a fixed number of histories divided among all GPUs (execution time variable):
// Compute the fraction of the total speed that accounts for the current MPI thread:
return_reduce = MPI_Allreduce(&node_speed, &total_speed, 1, MPI_FLOAT, MPI_SUM, MPI_COMM_WORLD); // Sum all the times and send result to all processes
if (MPI_SUCCESS != return_reduce)
printf("\n\n !!ERROR!! Error reducing (MPI_Allreduce) the speed test results??? return_reduce = %d for thread %d\n\n\n", return_reduce, myID);
else
MASTER_THREAD
{
printf(" -- Total speed for all GPUs (MPI_Allreduce) = %.3f hist/s; total histories simulated in the speed test (MPI_Allreduce) = %lld.\n", total_speed, total_histories_speed_test);
printf(" The master thread will simulate %.2f%% of the x rays in the simulation.\n", 100.0f*node_speed/total_speed);
}
#else
total_speed = node_speed;
#endif
// - Divide the remaining histories among the MPI threads (GPUs) according to their fraction of the total speed (rounding up).
if (total_histories_speed_test < total_histories)
total_histories = (unsigned long long int)(0.5 + ((double)(total_histories-total_histories_speed_test)) * ((double)(node_speed/total_speed)));
else
total_histories = numprocs; // Enough particles simulated already, simulate just one more history (block) and report (kernel call would fail if total_histories < or == 0).
}
} // [Done with case of simulating projections by time or first projection by number of particles]
// else ==> if using only 1 GPU and a fixed number of histories the whole speed test is skipped. The random seed will be different for each projection because it is updated after calling the kernel below.
// fflush(stdout);
// MPI_Barrier(MPI_COMM_WORLD); // Synchronize MPI threads here if we want to have a better organized output text at the expense of losing some performance
// *** Perform the MC simulation itself (the speed test would be skipped for a single CPU thread using a fixed number of histories):
// -- Compute the number of CUDA blocks to simulate, rounding up and making sure it is below the limit of 65535 blocks.
// The total number of particles simulated will be increased to the nearest multiple of "histories_per_thread".
double total_threads = ceil(((double)total_histories)/((double)histories_per_thread)); // Divide the histories among GPU threads, rounding up and avoiding overflow // New in MC-GPU v1.4 (Mina's bug)
int total_threads_blocks = (int)(((double)total_threads)/((double)num_threads_per_block) + 0.9990); // Divide the GPU threads among CUDA blocks, rounding up
if (total_threads_blocks>65535)
{
#ifdef USING_MPI
printf(" WARNING (MPI process #%d): %d hist per thread would produce %d CUDA blocks (>65535 maximum).", myID, histories_per_thread, total_threads_blocks);
#else
printf("\n WARNING: %d hist per thread would produce %d CUDA blocks, more than the maximum value of 65535.", histories_per_thread, total_threads_blocks);
#endif
total_threads_blocks = 65000; // Increase the histories per thread to have exactly 65000 blocks.
histories_per_thread = (int) ( ((double)total_histories)/((double)(total_threads_blocks*num_threads_per_block)) + 0.9990 );
printf(" Increasing to %d hist to run exactly %d blocks in the GPU.\n", histories_per_thread, total_threads_blocks);
}
else if (total_threads_blocks<1)
{
total_threads_blocks = 1; // Make sure we have at least 1 block to run
}
total_histories = ((unsigned long long int)(total_threads_blocks*num_threads_per_block))*histories_per_thread; // Total histories will be equal or higher than the input value due to the rounding up in the division of the histories
float total_histories_current_kernel_float = (float)total_histories; // Keep a float approx of the num histories for the timing below
#ifdef USING_MPI
MASTER_THREAD printf("\n\n");
printf(" ==> CUDA (MPI process #%d in \"%s\"): Executing %d blocks of %d threads, with %d histories in each thread: %lld histories in total (random seed: %d).\n", myID, MPI_processor_name, total_threads_blocks, num_threads_per_block, histories_per_thread, total_histories, seed_input);
#else
printf("\n ==> CUDA: Executing %d blocks of %d threads, with %d histories in each thread: %lld histories in total (random seed: %d).\n", total_threads_blocks, num_threads_per_block, histories_per_thread, total_histories, seed_input);
#endif
fflush(stdout);
// -- Setup the execution parameters (Max number threads per block: 512, Max sizes each dimension of grid: 65535x65535x1)
dim3 blocks(total_threads_blocks, 1);
dim3 threads(num_threads_per_block, 1);
clock_kernel = clock();
// *** Execute the x-ray transport kernel in the GPU ***
track_particles<<<blocks,threads>>>(histories_per_thread, num_p, seed_input, image_device, voxels_Edep_device, voxel_mat_dens_device, mfp_Woodcock_table_device, mfp_table_a_device, mfp_table_b_device, rayleigh_table_device, compton_table_device, detector_data_device, source_data_device, materials_dose_device);
if (1==doing_speed_test)
total_histories += histories_speed_test; // Speed test was done: compute the total number of histories including the particles simulated in the speed test
// -- Move the pseudo-random number generator seed ahead to skip all the random numbers generated in the current projection by this and the other
// "numprocs" MPI threads. Each projection will use independent seeds! (this code runs in parallel with the asynchronous GPU kernel):
update_seed_PRNG(numprocs, total_histories, &seed_input); // Do not repeat seed for each projection. Note that this function only updates 1 seed, the other is not computed.
#ifdef USING_MPI
if (numprocs>1) // Using more than 1 MPI thread:
{
// -- Compute the total number of histories simulated by all MPI threads, including the speed test (histories_speed_test==0 if the speed test was skipped).
// These MPI messages are sent concurrently with the GPU kernel computation for maximum efficiency.
unsigned long long int current_GPU_histories = total_histories;
return_reduce = MPI_Reduce(&current_GPU_histories, &total_histories, 1, MPI_UNSIGNED_LONG, MPI_SUM, 0, MPI_COMM_WORLD); // Sum all the simulated particles and send to thread 0
MASTER_THREAD
{
if (MPI_SUCCESS != return_reduce)
printf("\n\n !!ERROR!! Error getting the total number of particles simulated in all the GPUs (MPI_Reduce). return_reduce = %d.\n\n\n", return_reduce);
if (1==simulating_by_time || 1==doing_speed_test)
{
printf("\n -- Total number of histories being simulated in all the GPUs for the current projection (including speed test)= %.3lld.\n\n", total_histories);
fflush(stdout);
}
}
}
#endif
cudaThreadSynchronize(); // Force the runtime to wait until the GPU kernel is completed
getLastCudaError("\n\n !!Kernel execution failed while simulating particle tracks!! "); // Check if kernel execution generated any error
float real_GPU_speed = total_histories_current_kernel_float/(float(clock()-clock_kernel)/CLOCKS_PER_SEC); // GPU speed for all the image simulation, not just the speed test.
// #ifdef USING_MPI
// printf(" ==> CUDA (MPI process #%d in \"%s\"): GPU kernel execution time: %.4f s (%.3f hist/s)\n", myID, MPI_processor_name, time_kernel, total_histories_current_kernel_float/time_kernel);
// #else
// printf(" ==> CUDA: Kernel execution time: %.4f s\n", time_kernel);
// #endif
// -- Copy the simulated image from the GPU memory to the CPU:
checkCudaErrors(cudaMemcpy(image, image_device, image_bytes, cudaMemcpyDeviceToHost) ); // Copy final results to host
///////////////////////////////////////////////////////////////////////////////////////////////////
#else
// *** Executing the kernel in the CPU:
// If using more than one MPI thread, the number of particles is equally divided among the threads.
// !!DeBuG!! --> NOT USING SPEED TEST IN THE CPU!! Not possible to limit the execution by time in the CPU.
int total_threads = (int)(((double)total_histories)/((double)histories_per_thread*numprocs) + 0.9990); // Divide the histories among MPI threads, rounding up
unsigned long long int total_histories_per_thread = ((unsigned long long int)(total_threads))*histories_per_thread;
total_histories = total_histories_per_thread*numprocs; // Total histories will be equal or higher than the input value due to the rounding up in the division of the histories
if (numprocs>1)
{
#ifdef USING_MPI
update_seed_PRNG(myID, total_histories, &seed_input); // Compute the initial random seed for each MPI threads, avoiding overlapping of the random sequences
printf(" Executing %d history batches in the CPU, with %d histories in each batch (thread %d of %d at \'%s\'): %lld histories (random seed=%d).\n", total_threads, histories_per_thread, myID+1, numprocs, MPI_processor_name, total_histories_per_thread, seed_input);
MASTER_THREAD printf(" Simulating %lld histories in total for the %d MPI threads.\n\n", total_histories, numprocs);
#endif
}
else
{
printf(" Executing %d history batches in the CPU, with %d histories in each batch: %lld histories in total.\n\n", total_threads, histories_per_thread, total_histories);
}
fflush(stdout);
// -- Copy local structures to global struct variables accessible from "track_particles" (__constant__ variables in the GPU):
source_energy_data_CONST = source_energy_data;
voxel_data_CONST = voxel_data;
mfp_table_data_CONST = mfp_table_data;
dose_ROI_x_min_CONST = dose_ROI_x_min;
dose_ROI_x_max_CONST = dose_ROI_x_max;
dose_ROI_y_min_CONST = dose_ROI_y_min;
dose_ROI_y_max_CONST = dose_ROI_y_max;
dose_ROI_z_min_CONST = dose_ROI_z_min;
dose_ROI_z_max_CONST = dose_ROI_z_max;
int CPU_batch;
for(CPU_batch=0; CPU_batch<total_threads; CPU_batch++)
{
// -- Simulate a particle track initializing the PRNG with the particle number 'n':
track_particles(CPU_batch, histories_per_thread, num_p, seed_input, image, voxels_Edep, voxel_mat_dens, mfp_Woodcock_table, mfp_table_a, mfp_table_b, &rayleigh_table, &compton_table, detector_data, source_data, materials_dose);
}
#endif
// Get current time and calculate execution time in the MC loop:
time_elapsed_MC_loop = ((double)(clock()-clock_start))/CLOCKS_PER_SEC;
time_total_MC_simulation += time_elapsed_MC_loop; // Count total time (in seconds).
// printf("\n -- MONTE CARLO LOOP finished: time tallied in MAIN program: %.3f s\n\n", time_elapsed_MC_loop);
///////////////////////////////////////////////////////////////////////////////////////////////////
// *** Move the images simulated in the GPU (or multiple CPU cores) to the host memory space:
#ifdef USING_MPI
if (numprocs>1) // Using more than 1 MPI thread
{
// -- Add the images simulated in all the MPI threads:
MASTER_THREAD printf("\n >> Synchronize the MPI threads and accumulate the simulated images (MPI_Reduce).\n\n");
// Allocate the memory for the final image in the master thread:
unsigned long long int *image_MPI = NULL;
MASTER_THREAD image_MPI = (unsigned long long int*) malloc(image_bytes);
MASTER_THREAD if (image_MPI==NULL)
{
printf("\n\n !!malloc ERROR!! Problem allocating the total MPI image. Out of memory??\n\n");
exit(-4);
}
// !!DeBuG!! To know how much time the threads lose waiting for other threads in the MPI_Reduce, I have to use an explicit barrier here. It may be more efficient to let the threads advance to the MPI_Reduce directly.
clock_start = clock();
MPI_Barrier(MPI_COMM_WORLD); // Synchronize MPI threads
current_time=time(NULL);
char_time = ctime(&current_time); char_time[19] = '\0'; // The time is located between the characters 11 and 19.
#ifdef USING_CUDA