Source code for toor.Optimizer.gpu_multiple_kernel

import numpy as np
from scipy.ndimage import median_filter
from pycuda.compiler import SourceModule
import math
import time
import os
from array import array
import gc
from .GaussianFileGenarator import GaussianParameters


[docs] class GPUSharedMemoryMultipleKernel: def __init__(self, EM_obj=None, optimize_reads_and_calcs=False): self.optimize_reads_and_calcs = optimize_reads_and_calcs self.cuda_drv = EM_obj.cuda_drv self.number_of_iterations = EM_obj.number_of_iterations self.number_of_subsets = EM_obj.number_of_subsets self.mod_forward_projection_shared_mem = None self.mod_backward_projection_shared_mem = None self.directory = EM_obj.directory self.saved_image_by_iteration = EM_obj.saved_image_by_iteration if self.saved_image_by_iteration: self.iterations_path = os.path.join(self.directory,"iterations") if not os.path.isdir(self.iterations_path): os.makedirs(self.iterations_path) self.a = EM_obj.a self.a_normal = EM_obj.a_normal self.a_cf = EM_obj.a_cf self.b = EM_obj.b self.b_normal = EM_obj.b_normal self.b_cf = EM_obj.b_cf self.c = EM_obj.c self.c_normal = EM_obj.c_normal self.c_cf = EM_obj.c_cf self.d = EM_obj.d self.d_normal = EM_obj.d_normal self.d_cf = EM_obj.d_cf self.A = EM_obj.im_index_x self.B = EM_obj.im_index_y self.C = EM_obj.im_index_z self.sum_vor = EM_obj.sum_pixel self.number_of_pixels_x = EM_obj.number_of_pixels_x self.number_of_pixels_y = EM_obj.number_of_pixels_y self.number_of_pixels_z = EM_obj.number_of_pixels_z self.adjust_coef = EM_obj.adjust_coef self.im = EM_obj.im self.fov_matrix_cut = EM_obj.fov_matrix_cut self.half_crystal_pitch_xy = EM_obj.half_crystal_pitch_xy self.half_crystal_pitch_z = EM_obj.half_crystal_pitch_z self.distance_between_array_pixel = EM_obj.distance_between_array_pixel self.distance_to_center_plane = EM_obj.distance_to_center_plane self.distance_to_center_plane_normal = EM_obj.distance_to_center_plane_normal self.time_factor = EM_obj.time_correction self.normalization_matrix = EM_obj.normalization_matrix self.algorithm = EM_obj.algorithm self.algorithm_options = EM_obj.algorithm_options self.normalizationFlag = EM_obj.normalization_calculation_flag self.number_of_events = np.int32(len(self.a)) self.weight = np.int32(self.A.shape[0]) self.height = np.int32(self.A.shape[1]) self.depth = np.int32(self.A.shape[2]) # Optimizer corrections self.projector_type = EM_obj.projector_type self.doi_correction = EM_obj.doi_correction self.decay_correction = EM_obj.decay_correction self.random_correction = EM_obj.random_correction self.scatter_correction = EM_obj.scatter_correction self.scatter_angle_correction = EM_obj.scatter_angle_correction self.x_min_f = EM_obj.x_min_f self.x_max_f = EM_obj.x_max_f self.y_min_f = EM_obj.y_min_f self.y_max_f = EM_obj.y_max_f self.z_min_f = EM_obj.z_min_f self.z_max_f = EM_obj.z_max_f self.voxelSize = EM_obj.voxelSize # self.projector_type_args = [True] # if self.projector_type == "Solid-Angle": # self.small_angle_approximation = self.projector_type_args[0] self.file_to_open = { 1: {"Memory Optimized": False, "Projector_type": "Solid-Angle small approximation", "DOI": None, "Decay": False, "Random": False, "Scatter": False, "Scatter Angle": False, "File Folder": "Solid Angle", "Filename Forward": "fw_mk_sasaa.c", "Filename Back": "bk_mk_sasaa.c" }, 2: {"Memory Optimized": False, "Projector_type": "Box Counts", "DOI": None, "Decay": False, "Random": False, "Scatter": False, "Scatter Angle": False, "File Folder": "Box Counts", "Filename Forward": "fw_mk_cb.c", "Filename Back": "bk_mk_cb.c" }, 3: {"Memory Optimized": False, "Projector_type": "Orthogonal Projector", "DOI": None, "Decay": False, "Random": False, "Scatter": False, "Scatter Angle": False, "File Folder": "Orthogonal Projector", "Filename Forward": "fw_mk_gp.c", "Filename Back": "bk_mk_gp.c" }, 4: {"Memory Optimized": False, "Projector_type": "Constant gaussian", "DOI": None, "Decay": False, "Random": False, "Scatter": False, "Scatter Angle": False, "File Folder": "Gaussian", "Filename Forward": "fw_mk_gaussian_fast.c", "Filename Back": "bk_mk_gaussian_fast.c" }, 5: {"Memory Optimized": False, "Projector_type": "Variable gaussian", "DOI": None, "Decay": False, "Random": False, "Scatter": False, "Scatter Angle": False, "File Folder": "Gaussian", "Filename Forward": "fw_mk_variable_gaussian_fast.c", "Filename Back": "bk_mk_variable_gaussian_fast.c" }, } def _load_machine_C_code(self): for i in range(1,len(self.file_to_open)+1): if self.file_to_open[i]["Memory Optimized"] == self.optimize_reads_and_calcs: if self.file_to_open[i]["Projector_type"] == self.projector_type: if self.file_to_open[i]["DOI"] == self.doi_correction: if self.file_to_open[i]["Decay"] == self.decay_correction: if self.file_to_open[i]["Random"] == self.random_correction: self.fw_source_model_file = os.path.join(os.path.dirname(os.path.abspath(__file__)), "PET", self.file_to_open[i]["File Folder"], self.file_to_open[i]["Filename Forward"]) self.bw_source_model_file = os.path.join(os.path.dirname(os.path.abspath(__file__)), "PET", self.file_to_open[i]["File Folder"], self.file_to_open[i]["Filename Back"]) print("Forward: {}".format(self.fw_source_model_file)) print("Backward: {}".format(self.bw_source_model_file)) self.fw_source_model = open(self.fw_source_model_file) self.bw_source_model = open(self.bw_source_model_file) self.mod_forward_projection_shared_mem = SourceModule("""{}""".format((self.fw_source_model.read()))) self.mod_backward_projection_shared_mem = SourceModule("""{}""".format((self.bw_source_model.read())))
[docs] def multiplekernel(self): """ """ print('Optimizer STARTED - Multiple reads') # cuda.init() cuda = self.cuda_drv # device = cuda.Device(0) # enter your gpu id here # ctx = device.make_context() # start_x = np.int32(A[0, 0, 0]) start_x = np.int32(self.A[0, 0, 0]) start_y = np.int32(self.B[0, 0, 0]) start_z = np.int32(self.C[0, 0, 0]) print("Start_point: {},{},{}".format(start_x, start_y, start_z)) print('Image size: {},{}, {}'.format(self.weight, self.height, self.depth)) half_distance_between_array_pixel = np.float32(self.distance_between_array_pixel / 2) normalization_matrix = self.normalization_matrix.reshape( self.normalization_matrix.shape[0] * self.normalization_matrix.shape[1] * self.normalization_matrix.shape[ 2]) # SOURCE MODELS (DEVICE CODE) self._load_machine_C_code() number_of_datasets = np.int32(1) # Number of datasets (and concurrent operations) used. number_of_datasets_back = np.int32(1) # Number of datasets (and concurrent operations) used. # Start concurrency Test # Event as reference point ref = cuda.Event() ref.record() # Create the streams and events needed to calculation stream, event = [], [] marker_names = ['kernel_begin', 'kernel_end'] # Create List to allocate chunks of data A_cut_gpu, B_cut_gpu, C_cut_gpu = [None] * number_of_datasets, [None] * number_of_datasets, [ None] * number_of_datasets A_cut, B_cut, C_cut = [None] * number_of_datasets, [None] * number_of_datasets, [None] * number_of_datasets adjust_coef_cut = [None for _ in range(number_of_datasets_back)] adjust_coef_gpu = [None] * number_of_datasets adjust_coef_pinned = [None] * number_of_datasets_back fov_cut_matrix_cutted_gpu = [None] * number_of_datasets_back fov_cut_matrix_cut = [None] * number_of_datasets # Streams and Events creation for dataset in range(number_of_datasets): stream.append(cuda.Stream()) event.append(dict([(marker_names[t], cuda.Event()) for t in range(len(marker_names))])) # Forward Projection Memory Allocation # Variables that need an unique alocation im_shappened = np.ascontiguousarray(self.im.reshape(self.im.shape[0] * self.im.shape[1] * self.im.shape[2]), dtype=np.float32) fov_cut_matrix_shappened = np.ascontiguousarray( self.fov_matrix_cut.reshape( self.fov_matrix_cut.shape[0] * self.fov_matrix_cut.shape[1] * self.fov_matrix_cut.shape[2]), dtype=np.byte) im_gpu = cuda.mem_alloc(im_shappened.size * im_shappened.dtype.itemsize) fov_cut_matrix_gpu = cuda.mem_alloc(fov_cut_matrix_shappened.size * fov_cut_matrix_shappened.dtype.itemsize) # texref = mod_forward_projection_shared_mem.get_texref('tex') # texref.set_address(fov_cut_matrix_gpu, fov_cut_matrix_shappened.size * fov_cut_matrix_shappened.dtype.itemsize) # # texref.set_format(cuda.array_format.UNSIGNED_INT8, 1) cuda.memcpy_htod_async(im_gpu, im_shappened) cuda.memcpy_htod_async(fov_cut_matrix_gpu, fov_cut_matrix_shappened) # Forward Memory allocation if self.projector_type == "Variable gaussian": gaussianFeatures = GaussianParameters(voxelSize=self.voxelSize) gaussianFeatures.setShiftVariantParameters(self.distance_to_center_plane_normal, self.distance_to_center_plane) forward_projection_arrays_all_data = [self.a, self.b, self.c, self.d, self.a_normal, self.b_normal, self.c_normal, self.d_normal, self.a_cf, self.b_cf, self.c_cf, self.d_cf, self.sum_vor, self.distance_to_center_plane, self.distance_to_center_plane_normal, self.time_factor, gaussianFeatures.sigma_y_square, gaussianFeatures.sigma_z_square, gaussianFeatures.gaussian_y_fix_term, gaussianFeatures.gaussian_z_fix_term, gaussianFeatures.acceptableZDistance, gaussianFeatures.acceptableYDistance, gaussianFeatures.invert2timesigma_y_square, gaussianFeatures.invert2timesigma_z_square] else: forward_projection_arrays_all_data = [self.a, self.b, self.c, self.d, self.a_normal, self.b_normal, self.c_normal, self.d_normal, self.a_cf, self.b_cf, self.c_cf, self.d_cf, self.sum_vor, self.distance_to_center_plane, self.distance_to_center_plane_normal, self.time_factor] forward_projection_arrays = [[None] * number_of_datasets for _ in range(len(forward_projection_arrays_all_data))] forward_projection_gpu_arrays = [[None] * number_of_datasets for _ in range(len(forward_projection_arrays_all_data))] forward_projection_pinned_arrays = [[None] * number_of_datasets for _ in range(len(forward_projection_arrays_all_data))] for ar in range(len(forward_projection_arrays)): array_original = forward_projection_arrays_all_data[ar] array = forward_projection_arrays[ar] array_gpu = forward_projection_gpu_arrays[ar] array_pinned = forward_projection_pinned_arrays[ar] for dataset in range(number_of_datasets): begin_dataset = np.int32(dataset * self.number_of_events / number_of_datasets) end_dataset = np.int32((dataset + 1) * self.number_of_events / number_of_datasets) array[dataset] = array_original[begin_dataset:end_dataset] array_gpu[dataset] = cuda.mem_alloc(array[dataset].size * array[dataset].dtype.itemsize) array_pinned[dataset] = cuda.register_host_memory(array[dataset]) assert np.all(array_pinned[dataset] == array[dataset]) cuda.memcpy_htod_async(array_gpu[dataset], array_pinned[dataset], stream[dataset]) forward_projection_arrays_all_data[ar] = array_original forward_projection_arrays[ar] = array forward_projection_gpu_arrays[ar] = array_gpu forward_projection_pinned_arrays[ar] = array_pinned free, total = cuda.mem_get_info() print('%.1f %% of device memory is free.' % ((free / float(total)) * 100)) # Back projection Memory allocation if self.projector_type == "Variable gaussian": gaussianFeatures = GaussianParameters(voxelSize=self.voxelSize) gaussianFeatures.setShiftVariantParameters(self.distance_to_center_plane_normal,self.distance_to_center_plane) backward_projection_arrays_full_arrays = [self.a, self.b, self.c, self.d, self.a_normal, self.b_normal, self.c_normal, self.d_normal, self.a_cf, self.b_cf, self.c_cf, self.d_cf, self.sum_vor, self.distance_to_center_plane, self.distance_to_center_plane_normal, self.time_factor, gaussianFeatures.sigma_y_square, gaussianFeatures.sigma_z_square, gaussianFeatures.gaussian_y_fix_term, gaussianFeatures.gaussian_z_fix_term, gaussianFeatures.acceptableZDistance, gaussianFeatures.acceptableYDistance, gaussianFeatures.invert2timesigma_y_square, gaussianFeatures.invert2timesigma_z_square] else: backward_projection_arrays_full_arrays = [self.a, self.b, self.c, self.d, self.a_normal, self.b_normal, self.c_normal, self.d_normal, self.a_cf, self.b_cf, self.c_cf, self.d_cf, self.sum_vor, self.distance_to_center_plane, self.distance_to_center_plane_normal, self.time_factor] backward_projection_array_gpu_arrays = [[None] * number_of_datasets for _ in range(len(backward_projection_arrays_full_arrays))] backward_projection_pinned_arrays = [None] * len(backward_projection_arrays_full_arrays) for st in range(len(backward_projection_arrays_full_arrays)): backward_projection_array_gpu_arrays[st] = \ cuda.mem_alloc(backward_projection_arrays_full_arrays[st].size * backward_projection_arrays_full_arrays[st].dtype.itemsize) cuda.memcpy_htod_async(backward_projection_array_gpu_arrays[st], backward_projection_arrays_full_arrays[st]) # backward_projection_pinned_arrays[st] = cuda.register_host_memory(backward_projection_pinned_arrays[st]) # assert np.all(array_pinned[dataset] == array[dataset]) # cuda.memcpy_htod_async(array_gpu[dataset], array_pinned[dataset], stream[dataset]) adjust_coef = np.ascontiguousarray(self.adjust_coef.reshape( self.adjust_coef.shape[0] * self.adjust_coef.shape[1] * self.adjust_coef.shape[2]), dtype=np.float32) A = np.ascontiguousarray(self.A.reshape( self.A.shape[0] * self.A.shape[1] * self.A.shape[2]), dtype=np.short) B = np.ascontiguousarray(self.B.reshape( self.B.shape[0] * self.B.shape[1] * self.B.shape[2]), dtype=np.short) C = np.ascontiguousarray(self.C.reshape( self.C.shape[0] * self.C.shape[1] * self.C.shape[2]), dtype=np.short) # ---- Divide into datasets variables backprojection for dataset in range(number_of_datasets_back): voxels_division = adjust_coef.shape[0] // number_of_datasets_back adjust_coef_cut[dataset] = np.ascontiguousarray( adjust_coef[dataset * voxels_division:(dataset + 1) * voxels_division], dtype=np.float32) fov_cut_matrix_cut[dataset] = np.ascontiguousarray( fov_cut_matrix_shappened[dataset * voxels_division:(dataset + 1) * voxels_division], dtype=np.byte) A_cut[dataset] = np.ascontiguousarray( A[dataset * voxels_division:(dataset + 1) * voxels_division], dtype=np.short) B_cut[dataset] = np.ascontiguousarray( B[dataset * voxels_division:(dataset + 1) * voxels_division], dtype=np.short) C_cut[dataset] = np.ascontiguousarray( C[dataset * voxels_division:(dataset + 1) * voxels_division], dtype=np.short) # Backprojection adjust_coef_gpu[dataset] = cuda.mem_alloc( adjust_coef_cut[dataset].size * adjust_coef_cut[dataset].dtype.itemsize) adjust_coef_pinned[dataset] = cuda.register_host_memory(adjust_coef_cut[dataset]) assert np.all(adjust_coef_pinned[dataset] == adjust_coef_cut[dataset]) cuda.memcpy_htod_async(adjust_coef_gpu[dataset], adjust_coef_pinned[dataset], stream[dataset]) fov_cut_matrix_cutted_gpu[dataset] = cuda.mem_alloc( fov_cut_matrix_cut[dataset].size * fov_cut_matrix_cut[dataset].dtype.itemsize) A_cut_gpu[dataset] = cuda.mem_alloc( A_cut[dataset].size * A_cut[dataset].dtype.itemsize) B_cut_gpu[dataset] = cuda.mem_alloc( B_cut[dataset].size * B_cut[dataset].dtype.itemsize) C_cut_gpu[dataset] = cuda.mem_alloc( C_cut[dataset].size * C_cut[dataset].dtype.itemsize) cuda.memcpy_htod_async(adjust_coef_gpu[dataset], adjust_coef_cut[dataset]) cuda.memcpy_htod_async(fov_cut_matrix_cutted_gpu[dataset], fov_cut_matrix_cut[dataset]) cuda.memcpy_htod_async(A_cut_gpu[dataset], A_cut[dataset]) cuda.memcpy_htod_async(B_cut_gpu[dataset], B_cut[dataset]) cuda.memcpy_htod_async(C_cut_gpu[dataset], C_cut[dataset]) free, total = cuda.mem_get_info() print('%.1f %% of device memory is free.' % ((free / float(total)) * 100)) print('Number events for reconstruction: {}'.format(self.number_of_events)) # -------------OSEM--------- it = self.number_of_iterations subsets = self.number_of_subsets im = np.ascontiguousarray(self.im.reshape(self.im.shape[0] * self.im.shape[1] * self.im.shape[2]), dtype=np.float32) for i in range(it): print('Iteration number: {}\n----------------'.format(i + 1)) print('%.1f %% of device memory is free.' % ((free / float(total)) * 100)) begin_event = np.int32(0) end_event = np.int32(self.number_of_events / subsets) for sb in range(subsets): print('Subset number: {}'.format(sb)) number_of_events_subset = np.int32(end_event - begin_event) tic = time.time() # Cycle forward Projection for dataset in range(number_of_datasets): if dataset == number_of_datasets: begin_dataset = np.int32(dataset * number_of_events_subset / number_of_datasets) end_dataset = number_of_events_subset else: begin_dataset = np.int32(dataset * number_of_events_subset / number_of_datasets) end_dataset = np.int32((dataset + 1) * number_of_events_subset / number_of_datasets) threadsperblock = (256, 1, 1) blockspergrid_x = int(math.ceil((end_dataset - begin_dataset) / threadsperblock[0])) blockspergrid_y = int(math.ceil(1 / threadsperblock[1])) blockspergrid_z = int(math.ceil(1 / threadsperblock[2])) blockspergrid = (blockspergrid_x, blockspergrid_y, blockspergrid_z) event[dataset]['kernel_begin'].record(stream[dataset]) func_forward = self.mod_forward_projection_shared_mem.get_function( "forward_projection_cdrf") # if gaussian: if self.projector_type == "Constant gaussian": gaussianFeatures = GaussianParameters(voxelSize=self.voxelSize) gaussianFeatures.setShiftInvariantParameters() func_forward(gaussianFeatures.sigma_y_square, gaussianFeatures.sigma_z_square, gaussianFeatures.gaussian_y_fix_term, gaussianFeatures.gaussian_z_fix_term, gaussianFeatures.acceptableZDistance, gaussianFeatures.acceptableYDistance, gaussianFeatures.invert2timesigma_y_square, gaussianFeatures.invert2timesigma_z_square, self.weight, self.height, self.depth, start_x, start_y, start_z, forward_projection_gpu_arrays[13][dataset], forward_projection_gpu_arrays[14][dataset], half_distance_between_array_pixel, self.number_of_events, begin_dataset, end_dataset, forward_projection_gpu_arrays[0][dataset], forward_projection_gpu_arrays[4][dataset], forward_projection_gpu_arrays[8][dataset], forward_projection_gpu_arrays[1][dataset], forward_projection_gpu_arrays[5][dataset], forward_projection_gpu_arrays[9][dataset], forward_projection_gpu_arrays[2][dataset], forward_projection_gpu_arrays[6][dataset], forward_projection_gpu_arrays[10][dataset], forward_projection_gpu_arrays[3][dataset], forward_projection_gpu_arrays[7][dataset], forward_projection_gpu_arrays[11][dataset], forward_projection_gpu_arrays[12][dataset], fov_cut_matrix_gpu, im_gpu, block=threadsperblock, grid=blockspergrid, stream=stream[dataset]) elif self.projector_type == "Variable gaussian": func_forward(forward_projection_gpu_arrays[16][dataset], forward_projection_gpu_arrays[17][dataset], forward_projection_gpu_arrays[18][dataset], forward_projection_gpu_arrays[19][dataset], forward_projection_gpu_arrays[20][dataset], forward_projection_gpu_arrays[21][dataset], forward_projection_gpu_arrays[22][dataset], forward_projection_gpu_arrays[23][dataset], self.weight, self.height, self.depth, start_x, start_y, start_z, forward_projection_gpu_arrays[13][dataset], forward_projection_gpu_arrays[14][dataset], half_distance_between_array_pixel, self.number_of_events, begin_dataset, end_dataset, forward_projection_gpu_arrays[0][dataset], forward_projection_gpu_arrays[4][dataset], forward_projection_gpu_arrays[8][dataset], forward_projection_gpu_arrays[1][dataset], forward_projection_gpu_arrays[5][dataset], forward_projection_gpu_arrays[9][dataset], forward_projection_gpu_arrays[2][dataset], forward_projection_gpu_arrays[6][dataset], forward_projection_gpu_arrays[10][dataset], forward_projection_gpu_arrays[3][dataset], forward_projection_gpu_arrays[7][dataset], forward_projection_gpu_arrays[11][dataset], forward_projection_gpu_arrays[12][dataset], fov_cut_matrix_gpu, im_gpu, block=threadsperblock, grid=blockspergrid, stream=stream[dataset]) else: func_forward(self.weight, self.height, self.depth, start_x, start_y, start_z, forward_projection_gpu_arrays[13][dataset], forward_projection_gpu_arrays[14][dataset], half_distance_between_array_pixel, self.number_of_events, begin_dataset, end_dataset, forward_projection_gpu_arrays[0][dataset], forward_projection_gpu_arrays[4][dataset], forward_projection_gpu_arrays[8][dataset], forward_projection_gpu_arrays[1][dataset], forward_projection_gpu_arrays[5][dataset], forward_projection_gpu_arrays[9][dataset], forward_projection_gpu_arrays[2][dataset], forward_projection_gpu_arrays[6][dataset], forward_projection_gpu_arrays[10][dataset], forward_projection_gpu_arrays[3][dataset], forward_projection_gpu_arrays[7][dataset], forward_projection_gpu_arrays[11][dataset], forward_projection_gpu_arrays[12][dataset], fov_cut_matrix_gpu, im_gpu, block=threadsperblock, grid=blockspergrid, stream=stream[dataset]) # Sincronization of streams for dataset in range(number_of_datasets): # Commenting out this line should break concurrency. event[dataset]['kernel_end'].record(stream[dataset]) # Transfering data from Optimizer for dataset in range(number_of_datasets): begin_dataset = np.int32(dataset * self.number_of_events / number_of_datasets) end_dataset = np.int32((dataset + 1) * self.number_of_events / number_of_datasets) # cuda.memcpy_dtoh_async(sum_vor[begin_dataset:end_dataset], sum_vor_gpu[dataset]) # cuda.memcpy_dtoh_async(forward_projection_pinned_arrays[12][dataset], # forward_projection_gpu_arrays[12][dataset], stream[dataset]) # forward_projection_arrays_all_data[12][begin_dataset:end_dataset] = \ # forward_projection_pinned_arrays[12][dataset] cuda.memcpy_dtoh_async(forward_projection_arrays[12][dataset], forward_projection_gpu_arrays[12][dataset], stream[dataset]) forward_projection_arrays_all_data[12][begin_dataset:end_dataset] = \ forward_projection_arrays[12][dataset] # cuda.cudaStream.Synchronize(stream[dataset]) toc = time.time() cuda.Context.synchronize() # if self.normalizationFlag: # forward_projection_arrays_all_data[12] = np.ones_like(forward_projection_arrays_all_data[12]) print('Time part Forward Projection {} : {}'.format(1, toc - tic)) # number_of_datasets = np.int32(2) # # teste = np.copy(forward_projection_arrays_all_data[12]) # # sum_vor[sum_vor<1]=0 # sum_vor = np.ascontiguousarray(teste, dtype=np.float32) # sum_vor=np.ascontiguousarray(np.ones((self.a.shape)), dtype=np.float32) print('SUM VOR: {}'.format(np.sum(forward_projection_arrays_all_data[12]))) print('LEN VOR: {}'.format(len(forward_projection_arrays_all_data[12][forward_projection_arrays_all_data[12]==0]))) # cuda.memcpy_htod_async(sum_vor_t_gpu, sum_vor) cuda.memcpy_htod_async(backward_projection_array_gpu_arrays[12], forward_projection_arrays_all_data[12]) # ------------BACKPROJECTION----------- for dataset in range(number_of_datasets_back): dataset = np.int32(dataset) begin_dataset = np.int32(0) end_dataset = np.int32(number_of_events_subset) event[dataset]['kernel_begin'].record(stream[dataset]) weight_cutted, height_cutted, depth_cutted = np.int32( adjust_coef_cut[dataset].shape[0]), np.int32( 1), np.int32(1) number_of_voxels_thread = 128 threadsperblock = (np.int(number_of_voxels_thread), 1, 1) blockspergrid_x = int(math.ceil(adjust_coef_cut[dataset].shape[0] / threadsperblock[0])) blockspergrid_y = int(math.ceil(1 / threadsperblock[1])) blockspergrid_z = int(math.ceil(1 / threadsperblock[2])) blockspergrid = (blockspergrid_x, blockspergrid_y, blockspergrid_z) func_backward = self.mod_backward_projection_shared_mem.get_function( "backprojection_cdrf") if self.projector_type == "Constant gaussian": func_backward(gaussianFeatures.sigma_y_square, gaussianFeatures.sigma_z_square, gaussianFeatures.gaussian_y_fix_term, gaussianFeatures.gaussian_z_fix_term, gaussianFeatures.acceptableZDistance, gaussianFeatures.acceptableYDistance, gaussianFeatures.invert2timesigma_y_square, gaussianFeatures.invert2timesigma_z_square, dataset, weight_cutted, height_cutted, depth_cutted, backward_projection_array_gpu_arrays[13], backward_projection_array_gpu_arrays[14], half_distance_between_array_pixel, self.number_of_events, begin_dataset, end_dataset, backward_projection_array_gpu_arrays[0], backward_projection_array_gpu_arrays[4], backward_projection_array_gpu_arrays[8], backward_projection_array_gpu_arrays[1], backward_projection_array_gpu_arrays[5], backward_projection_array_gpu_arrays[9], backward_projection_array_gpu_arrays[2], backward_projection_array_gpu_arrays[6], backward_projection_array_gpu_arrays[10], backward_projection_array_gpu_arrays[3], backward_projection_array_gpu_arrays[7], backward_projection_array_gpu_arrays[11], A_cut_gpu[dataset], B_cut_gpu[dataset], C_cut_gpu[dataset], adjust_coef_gpu[dataset], backward_projection_array_gpu_arrays[12], fov_cut_matrix_cutted_gpu[dataset], backward_projection_array_gpu_arrays[15], block=threadsperblock, grid=blockspergrid, shared=np.int(4 * number_of_voxels_thread), stream=stream[dataset], ) elif self.projector_type == "Variable gaussian": func_backward(backward_projection_array_gpu_arrays[16], backward_projection_array_gpu_arrays[17], backward_projection_array_gpu_arrays[18], backward_projection_array_gpu_arrays[19], backward_projection_array_gpu_arrays[20], backward_projection_array_gpu_arrays[21], backward_projection_array_gpu_arrays[22], backward_projection_array_gpu_arrays[23], dataset, weight_cutted, height_cutted, depth_cutted, backward_projection_array_gpu_arrays[13], backward_projection_array_gpu_arrays[14], half_distance_between_array_pixel, self.number_of_events, begin_dataset, end_dataset, backward_projection_array_gpu_arrays[0], backward_projection_array_gpu_arrays[4], backward_projection_array_gpu_arrays[8], backward_projection_array_gpu_arrays[1], backward_projection_array_gpu_arrays[5], backward_projection_array_gpu_arrays[9], backward_projection_array_gpu_arrays[2], backward_projection_array_gpu_arrays[6], backward_projection_array_gpu_arrays[10], backward_projection_array_gpu_arrays[3], backward_projection_array_gpu_arrays[7], backward_projection_array_gpu_arrays[11], A_cut_gpu[dataset], B_cut_gpu[dataset], C_cut_gpu[dataset], adjust_coef_gpu[dataset], backward_projection_array_gpu_arrays[12], fov_cut_matrix_cutted_gpu[dataset], backward_projection_array_gpu_arrays[15], block=threadsperblock, grid=blockspergrid, shared=np.int(4 * number_of_voxels_thread), stream=stream[dataset], ) else: func_backward(dataset, weight_cutted, height_cutted, depth_cutted, backward_projection_array_gpu_arrays[13], backward_projection_array_gpu_arrays[14], half_distance_between_array_pixel, self.number_of_events, begin_dataset, end_dataset, backward_projection_array_gpu_arrays[0], backward_projection_array_gpu_arrays[4], backward_projection_array_gpu_arrays[8], backward_projection_array_gpu_arrays[1], backward_projection_array_gpu_arrays[5], backward_projection_array_gpu_arrays[9], backward_projection_array_gpu_arrays[2], backward_projection_array_gpu_arrays[6], backward_projection_array_gpu_arrays[10], backward_projection_array_gpu_arrays[3], backward_projection_array_gpu_arrays[7], backward_projection_array_gpu_arrays[11], A_cut_gpu[dataset], B_cut_gpu[dataset], C_cut_gpu[dataset], adjust_coef_gpu[dataset], backward_projection_array_gpu_arrays[12], fov_cut_matrix_cutted_gpu[dataset], backward_projection_array_gpu_arrays[15], block=threadsperblock, grid=blockspergrid, shared=np.int(4 * number_of_voxels_thread), stream=stream[dataset], ) # for dataset in range(number_of_datasets_back): # Commenting out this line should break concurrency. event[dataset]['kernel_end'].record(stream[dataset]) for dataset in range(number_of_datasets_back): cuda.memcpy_dtoh_async(adjust_coef_cut[dataset], adjust_coef_gpu[dataset]) adjust_coef[dataset * voxels_division:(dataset + 1) * voxels_division] = adjust_coef_cut[ dataset] cuda.Context.synchronize() print('Time part Backward Projection {} : {}'.format(1, time.time() - toc)) print('SUM IMAGE: {}'.format(np.sum(adjust_coef))) penalized_term = self._load_penalized_term(im) # normalization im[normalization_matrix != 0] = im[normalization_matrix != 0] * adjust_coef[ normalization_matrix != 0] / (normalization_matrix[normalization_matrix != 0]) # im[normalization_matrix == 0] = 0 if self.algorithm == "LM-MRP": im[penalized_term != 0] = im[penalized_term != 0] / penalized_term[penalized_term != 0] # im = self._apply_penalized_term(im, penalized_term) print('SUM IMAGE: {}'.format(np.sum(im))) im = np.ascontiguousarray(im, dtype=np.float32) cuda.memcpy_htod_async(im_gpu, im) # # Clearing variables forward_projection_arrays_all_data[12] = np.ascontiguousarray( np.zeros(self.a.shape, dtype=np.float32)) adjust_coef = np.ascontiguousarray( np.zeros((self.number_of_pixels_x * self.number_of_pixels_y * self.number_of_pixels_z), dtype=np.float32)) for dataset in range(number_of_datasets): begin_dataset = np.int32(dataset * self.number_of_events / number_of_datasets) end_dataset = np.int32((dataset + 1) * self.number_of_events / number_of_datasets) forward_projection_arrays[12][dataset] = forward_projection_arrays_all_data[12][ begin_dataset:end_dataset] forward_projection_gpu_arrays[12][dataset] = cuda.mem_alloc( forward_projection_arrays[12][dataset].size * forward_projection_arrays[12][ dataset].dtype.itemsize) # forward_projection_pinned_arrays[12][dataset] = cuda.register_host_memory( # forward_projection_arrays[12][dataset]) # assert np.all( # forward_projection_pinned_arrays[12][dataset] == forward_projection_arrays[12][dataset]) # cuda.memcpy_htod_async(forward_projection_gpu_arrays[12][dataset], # forward_projection_pinned_arrays[12][dataset], stream[dataset]) cuda.memcpy_htod_async(forward_projection_gpu_arrays[12][dataset], forward_projection_arrays[12][dataset], stream[dataset]) for dataset in range(number_of_datasets_back): adjust_coef_cut[dataset] = adjust_coef[ dataset * voxels_division:(dataset + 1) * voxels_division] # adjust_coef_pinned[dataset] = cuda.register_host_memory(adjust_coef_cut[dataset]) # assert np.all(adjust_coef_pinned[dataset] == adjust_coef_cut[dataset]) # cuda.memcpy_htod_async(adjust_coef_gpu[dataset], adjust_coef_pinned[dataset], stream[dataset]) cuda.memcpy_htod_async(adjust_coef_gpu[dataset], adjust_coef_cut[dataset], stream[dataset]) if self.saved_image_by_iteration: if i % 2==0: im_to_save = im.reshape(self.weight, self.height, self.depth) self._save_image_by_it(im_to_save, i, sb) # del adjust_coef_pinned gc.collect() # adjust_coef_pinned = [None] * number_of_datasets_back # if self.signals_interface is not None: # self.signals_interface.trigger_update_label_reconstruction_status.emit( # "{}: Iteration {}".format(self.current_info_step, i + 1)) # self.signals_interface.trigger_progress_reconstruction_partial.emit( # int(np.round(100 * (i + 1) * (sb + subsets) / (it * subsets), 0))) im = im.reshape(self.weight, self.height, self.depth) self.im = im
[docs] def multikernel_optimized_memory_reads(self): print('Optimizer STARTED - Multiple reads') # cuda.init() cuda = self.cuda_drv # device = cuda.Device(0) # enter your gpu id here # ctx = device.make_context() # start_x = np.int32(A[0, 0, 0]) start_x = np.int32(self.A[0, 0, 0]) start_y = np.int32(self.B[0, 0, 0]) start_z = np.int32(self.C[0, 0, 0]) print("Start_point: {},{},{}".format(start_x, start_y, start_z)) print('Image size: {},{}, {}'.format(self.weight, self.height, self.depth)) half_distance_between_array_pixel = np.float32(self.distance_between_array_pixel / 2) normalization_matrix = self.normalization_matrix.reshape( self.normalization_matrix.shape[0] * self.normalization_matrix.shape[1] * self.normalization_matrix.shape[ 2]) # SOURCE MODELS (DEVICE CODE) self._load_machine_C_code() number_of_datasets = np.int32(1) # Number of datasets (and concurrent operations) used. number_of_datasets_back = np.int32(1) # Number of datasets (and concurrent operations) used. # Start concurrency Test # Event as reference point ref = cuda.Event() ref.record() # Create the streams and events needed to calculation stream, event = [], [] marker_names = ['kernel_begin', 'kernel_end'] # Create List to allocate chunks of data A_cut_gpu, B_cut_gpu, C_cut_gpu = [None] * number_of_datasets, [None] * number_of_datasets, [ None] * number_of_datasets A_cut, B_cut, C_cut = [None] * number_of_datasets, [None] * number_of_datasets, [None] * number_of_datasets adjust_coef_cut = [None for _ in range(number_of_datasets_back)] adjust_coef_gpu = [None] * number_of_datasets adjust_coef_pinned = [None] * number_of_datasets_back fov_cut_matrix_cutted_gpu = [None] * number_of_datasets_back fov_cut_matrix_cut = [None] * number_of_datasets # Streams and Events creation for dataset in range(number_of_datasets): stream.append(cuda.Stream()) event.append(dict([(marker_names[t], cuda.Event()) for t in range(len(marker_names))])) # Forward Projection Memory Allocation # Variables that need an unique alocation im_shappened = np.ascontiguousarray(self.im.reshape(self.im.shape[0] * self.im.shape[1] * self.im.shape[2]), dtype=np.float32) fov_cut_matrix_shappened = np.ascontiguousarray( self.fov_matrix_cut.reshape( self.fov_matrix_cut.shape[0] * self.fov_matrix_cut.shape[1] * self.fov_matrix_cut.shape[2]), dtype=np.byte) im_gpu = cuda.mem_alloc(im_shappened.size * im_shappened.dtype.itemsize) fov_cut_matrix_gpu = cuda.mem_alloc(fov_cut_matrix_shappened.size * fov_cut_matrix_shappened.dtype.itemsize) # texref = mod_forward_projection_shared_mem.get_texref('tex') # texref.set_address(fov_cut_matrix_gpu, fov_cut_matrix_shappened.size * fov_cut_matrix_shappened.dtype.itemsize) # # texref.set_format(cuda.array_format.UNSIGNED_INT8, 1) cuda.memcpy_htod_async(im_gpu, im_shappened) cuda.memcpy_htod_async(fov_cut_matrix_gpu, fov_cut_matrix_shappened) # Forward Memory allocation forward_projection_arrays_all_data = [self.a, self.b, self.c, self.d, self.a_normal, self.b_normal, self.c_normal, self.d_normal, self.a_cf, self.b_cf, self.c_cf, self.d_cf, self.sum_vor, self.distance_to_center_plane, self.distance_to_center_plane_normal, self.time_factor, self.x_min_f, self.x_max_f, self.y_min_f, self.y_max_f, self.z_min_f, self.z_max_f] forward_projection_arrays = [[None] * number_of_datasets for _ in range(len(forward_projection_arrays_all_data))] forward_projection_gpu_arrays = [[None] * number_of_datasets for _ in range(len(forward_projection_arrays_all_data))] forward_projection_pinned_arrays = [[None] * number_of_datasets for _ in range(len(forward_projection_arrays_all_data))] for ar in range(len(forward_projection_arrays)): array_original = forward_projection_arrays_all_data[ar] array = forward_projection_arrays[ar] array_gpu = forward_projection_gpu_arrays[ar] # array_pinned = forward_projection_pinned_arrays[ar] for dataset in range(number_of_datasets): begin_dataset = np.int32(dataset * self.number_of_events / number_of_datasets) end_dataset = np.int32((dataset + 1) * self.number_of_events / number_of_datasets) array[dataset] = array_original[begin_dataset:end_dataset] array_gpu[dataset] = cuda.mem_alloc(array[dataset].size * array[dataset].dtype.itemsize) # array_pinned[dataset] = cuda.register_host_memory(array[dataset]) # assert np.all(array_pinned[dataset] == array[dataset]) cuda.memcpy_htod_async(array_gpu[dataset], array[dataset], stream[dataset]) forward_projection_arrays_all_data[ar] = array_original forward_projection_arrays[ar] = array forward_projection_gpu_arrays[ar] = array_gpu # forward_projection_pinned_arrays[ar] = array_pinned free, total = cuda.mem_get_info() print('%.1f %% of device memory is free.' % ((free / float(total)) * 100)) # Back projection Memory allocation backward_projection_arrays_full_arrays = [self.a, self.b, self.c, self.d, self.a_normal, self.b_normal, self.c_normal, self.d_normal, self.a_cf, self.b_cf, self.c_cf, self.d_cf, self.sum_vor, self.distance_to_center_plane, self.distance_to_center_plane_normal, self.time_factor, self.x_min_f, self.x_max_f, self.y_min_f, self.y_max_f, self.z_min_f, self.z_max_f] backward_projection_array_gpu_arrays = [[None] * number_of_datasets for _ in range(len(backward_projection_arrays_full_arrays))] backward_projection_pinned_arrays = [None] * len(backward_projection_arrays_full_arrays) for st in range(len(backward_projection_arrays_full_arrays)): backward_projection_array_gpu_arrays[st] = \ cuda.mem_alloc(backward_projection_arrays_full_arrays[st].size * backward_projection_arrays_full_arrays[st].dtype.itemsize) cuda.memcpy_htod_async(backward_projection_array_gpu_arrays[st], backward_projection_arrays_full_arrays[st]) # backward_projection_pinned_arrays[st] = cuda.register_host_memory(backward_projection_pinned_arrays[st]) # assert np.all(array_pinned[dataset] == array[dataset]) # cuda.memcpy_htod_async(array_gpu[dataset], array_pinned[dataset], stream[dataset]) adjust_coef = np.ascontiguousarray(self.adjust_coef.reshape( self.adjust_coef.shape[0] * self.adjust_coef.shape[1] * self.adjust_coef.shape[2]), dtype=np.float32) A = np.ascontiguousarray(self.A.reshape( self.A.shape[0] * self.A.shape[1] * self.A.shape[2]), dtype=np.short) B = np.ascontiguousarray(self.B.reshape( self.B.shape[0] * self.B.shape[1] * self.B.shape[2]), dtype=np.short) C = np.ascontiguousarray(self.C.reshape( self.C.shape[0] * self.C.shape[1] * self.C.shape[2]), dtype=np.short) # ---- Divide into datasets variables backprojection for dataset in range(number_of_datasets_back): voxels_division = adjust_coef.shape[0] // number_of_datasets_back adjust_coef_cut[dataset] = np.ascontiguousarray( adjust_coef[dataset * voxels_division:(dataset + 1) * voxels_division], dtype=np.float32) fov_cut_matrix_cut[dataset] = np.ascontiguousarray( fov_cut_matrix_shappened[dataset * voxels_division:(dataset + 1) * voxels_division], dtype=np.byte) A_cut[dataset] = np.ascontiguousarray( A[dataset * voxels_division:(dataset + 1) * voxels_division], dtype=np.short) B_cut[dataset] = np.ascontiguousarray( B[dataset * voxels_division:(dataset + 1) * voxels_division], dtype=np.short) C_cut[dataset] = np.ascontiguousarray( C[dataset * voxels_division:(dataset + 1) * voxels_division], dtype=np.short) # Backprojection adjust_coef_gpu[dataset] = cuda.mem_alloc( adjust_coef_cut[dataset].size * adjust_coef_cut[dataset].dtype.itemsize) adjust_coef_pinned[dataset] = cuda.register_host_memory(adjust_coef_cut[dataset]) assert np.all(adjust_coef_pinned[dataset] == adjust_coef_cut[dataset]) cuda.memcpy_htod_async(adjust_coef_gpu[dataset], adjust_coef_pinned[dataset], stream[dataset]) fov_cut_matrix_cutted_gpu[dataset] = cuda.mem_alloc( fov_cut_matrix_cut[dataset].size * fov_cut_matrix_cut[dataset].dtype.itemsize) A_cut_gpu[dataset] = cuda.mem_alloc( A_cut[dataset].size * A_cut[dataset].dtype.itemsize) B_cut_gpu[dataset] = cuda.mem_alloc( B_cut[dataset].size * B_cut[dataset].dtype.itemsize) C_cut_gpu[dataset] = cuda.mem_alloc( C_cut[dataset].size * C_cut[dataset].dtype.itemsize) cuda.memcpy_htod_async(adjust_coef_gpu[dataset], adjust_coef_cut[dataset]) cuda.memcpy_htod_async(fov_cut_matrix_cutted_gpu[dataset], fov_cut_matrix_cut[dataset]) cuda.memcpy_htod_async(A_cut_gpu[dataset], A_cut[dataset]) cuda.memcpy_htod_async(B_cut_gpu[dataset], B_cut[dataset]) cuda.memcpy_htod_async(C_cut_gpu[dataset], C_cut[dataset]) free, total = cuda.mem_get_info() print('%.1f %% of device memory is free.' % ((free / float(total)) * 100)) print('Number events for reconstruction: {}'.format(self.number_of_events)) # -------------OSEM--------- it = self.number_of_iterations subsets = self.number_of_subsets im = np.ascontiguousarray(self.im.reshape(self.im.shape[0] * self.im.shape[1] * self.im.shape[2]), dtype=np.float32) for i in range(it): print('Iteration number: {}\n----------------'.format(i + 1)) print('%.1f %% of device memory is free.' % ((free / float(total)) * 100)) begin_event = np.int32(0) end_event = np.int32(self.number_of_events / subsets) for sb in range(subsets): print('Subset number: {}'.format(sb)) number_of_events_subset = np.int32(end_event - begin_event) tic = time.time() # Cycle forward Projection for dataset in range(number_of_datasets): if dataset == number_of_datasets: begin_dataset = np.int32(dataset * number_of_events_subset / number_of_datasets) end_dataset = number_of_events_subset else: begin_dataset = np.int32(dataset * number_of_events_subset / number_of_datasets) end_dataset = np.int32((dataset + 1) * number_of_events_subset / number_of_datasets) threadsperblock = (256, 1, 1) blockspergrid_x = int(math.ceil((end_dataset - begin_dataset) / threadsperblock[0])) blockspergrid_y = int(math.ceil(1 / threadsperblock[1])) blockspergrid_z = int(math.ceil(1 / threadsperblock[2])) blockspergrid = (blockspergrid_x, blockspergrid_y, blockspergrid_z) event[dataset]['kernel_begin'].record(stream[dataset]) func_forward = self.mod_forward_projection_shared_mem.get_function( "forward_projection_cdrf") func_forward(self.weight, self.height, self.depth, start_x, start_y, start_z, forward_projection_gpu_arrays[13][dataset], forward_projection_gpu_arrays[14][dataset], half_distance_between_array_pixel, self.number_of_events, begin_dataset, end_dataset, forward_projection_gpu_arrays[0][dataset], forward_projection_gpu_arrays[4][dataset], forward_projection_gpu_arrays[8][dataset], forward_projection_gpu_arrays[1][dataset], forward_projection_gpu_arrays[5][dataset], forward_projection_gpu_arrays[9][dataset], forward_projection_gpu_arrays[2][dataset], forward_projection_gpu_arrays[6][dataset], forward_projection_gpu_arrays[10][dataset], forward_projection_gpu_arrays[3][dataset], forward_projection_gpu_arrays[7][dataset], forward_projection_gpu_arrays[11][dataset], forward_projection_gpu_arrays[12][dataset], fov_cut_matrix_gpu, im_gpu, forward_projection_gpu_arrays[16][dataset], forward_projection_gpu_arrays[17][dataset], forward_projection_gpu_arrays[18][dataset], forward_projection_gpu_arrays[19][dataset], forward_projection_gpu_arrays[20][dataset], forward_projection_gpu_arrays[21][dataset], block=threadsperblock, grid=blockspergrid, stream=stream[dataset]) # Sincronization of streams for dataset in range(number_of_datasets): # Commenting out this line should break concurrency. event[dataset]['kernel_end'].record(stream[dataset]) # Transfering data from Optimizer for dataset in range(number_of_datasets): begin_dataset = np.int32(dataset * self.number_of_events / number_of_datasets) end_dataset = np.int32((dataset + 1) * self.number_of_events / number_of_datasets) # cuda.memcpy_dtoh_async(sum_vor[begin_dataset:end_dataset], sum_vor_gpu[dataset]) # cuda.memcpy_dtoh_async(forward_projection_pinned_arrays[12][dataset], # forward_projection_gpu_arrays[12][dataset], stream[dataset]) # forward_projection_arrays_all_data[12][begin_dataset:end_dataset] = \ # forward_projection_pinned_arrays[12][dataset] cuda.memcpy_dtoh_async(forward_projection_arrays[12][dataset], forward_projection_gpu_arrays[12][dataset], stream[dataset]) forward_projection_arrays_all_data[12][begin_dataset:end_dataset] = \ forward_projection_arrays[12][dataset] # cuda.cudaStream.Synchronize(stream[dataset]) toc = time.time() cuda.Context.synchronize() # if self.normalizationFlag: # forward_projection_arrays_all_data[12] = np.ones_like(forward_projection_arrays_all_data[12]) print('Time part Forward Projection {} : {}'.format(1, toc - tic)) # number_of_datasets = np.int32(2) # # teste = np.copy(forward_projection_arrays_all_data[12]) # # sum_vor[sum_vor<1]=0 # sum_vor = np.ascontiguousarray(teste, dtype=np.float32) # sum_vor=np.ascontiguousarray(np.ones((self.a.shape)), dtype=np.float32) print('SUM VOR: {}'.format(np.sum(forward_projection_arrays_all_data[12]))) print('LEN VOR: {}'.format( len(forward_projection_arrays_all_data[12][forward_projection_arrays_all_data[12] == 0]))) # cuda.memcpy_htod_async(sum_vor_t_gpu, sum_vor) cuda.memcpy_htod_async(backward_projection_array_gpu_arrays[12], forward_projection_arrays_all_data[12]) # ------------BACKPROJECTION----------- for dataset in range(number_of_datasets_back): dataset = np.int32(dataset) begin_dataset = np.int32(0) end_dataset = np.int32(number_of_events_subset) event[dataset]['kernel_begin'].record(stream[dataset]) weight_cutted, height_cutted, depth_cutted = np.int32( adjust_coef_cut[dataset].shape[0]), np.int32( 1), np.int32(1) number_of_voxels_thread = 128 threadsperblock = (np.int(number_of_voxels_thread), 1, 1) blockspergrid_x = int(math.ceil(adjust_coef_cut[dataset].shape[0] / threadsperblock[0])) blockspergrid_y = int(math.ceil(1 / threadsperblock[1])) blockspergrid_z = int(math.ceil(1 / threadsperblock[2])) blockspergrid = (blockspergrid_x, blockspergrid_y, blockspergrid_z) func_backward = self.mod_backward_projection_shared_mem.get_function( "backprojection_cdrf") func_backward(dataset, self.weight, self.height, self.depth, start_x, start_y, start_z, backward_projection_array_gpu_arrays[13], backward_projection_array_gpu_arrays[14], half_distance_between_array_pixel, self.number_of_events, begin_dataset, end_dataset, backward_projection_array_gpu_arrays[0], backward_projection_array_gpu_arrays[4], backward_projection_array_gpu_arrays[8], backward_projection_array_gpu_arrays[1], backward_projection_array_gpu_arrays[5], backward_projection_array_gpu_arrays[9], backward_projection_array_gpu_arrays[2], backward_projection_array_gpu_arrays[6], backward_projection_array_gpu_arrays[10], backward_projection_array_gpu_arrays[3], backward_projection_array_gpu_arrays[7], backward_projection_array_gpu_arrays[11], A_cut_gpu[dataset], B_cut_gpu[dataset], C_cut_gpu[dataset], adjust_coef_gpu[dataset], backward_projection_array_gpu_arrays[12], fov_cut_matrix_cutted_gpu[dataset], backward_projection_array_gpu_arrays[16], backward_projection_array_gpu_arrays[17], backward_projection_array_gpu_arrays[18], backward_projection_array_gpu_arrays[19], backward_projection_array_gpu_arrays[20], backward_projection_array_gpu_arrays[21], block=threadsperblock, grid=blockspergrid, shared=np.int(4 * number_of_voxels_thread), stream=stream[dataset], ) # for dataset in range(number_of_datasets_back): # Commenting out this line should break concurrency. event[dataset]['kernel_end'].record(stream[dataset]) for dataset in range(number_of_datasets_back): cuda.memcpy_dtoh_async(adjust_coef_cut[dataset], adjust_coef_gpu[dataset]) adjust_coef[dataset * voxels_division:(dataset + 1) * voxels_division] = adjust_coef_cut[ dataset] cuda.Context.synchronize() print('Time part Backward Projection {} : {}'.format(1, time.time() - toc)) print('SUM IMAGE: {}'.format(np.sum(adjust_coef))) penalized_term = self._load_penalized_term(im) # normalization im[normalization_matrix != 0] = im[normalization_matrix != 0] * adjust_coef[ normalization_matrix != 0] / (normalization_matrix[normalization_matrix != 0]) # im[normalization_matrix == 0] = 0 if self.algorithm == "LM-MRP": im[penalized_term != 0] = im[penalized_term != 0] / penalized_term[penalized_term != 0] # im = self._apply_penalized_term(im, penalized_term) print('SUM IMAGE: {}'.format(np.sum(im))) im = np.ascontiguousarray(im, dtype=np.float32) cuda.memcpy_htod_async(im_gpu, im) # # Clearing variables forward_projection_arrays_all_data[12] = np.ascontiguousarray( np.zeros(self.a.shape, dtype=np.float32)) adjust_coef = np.ascontiguousarray( np.zeros((self.number_of_pixels_x * self.number_of_pixels_y * self.number_of_pixels_z), dtype=np.float32)) for dataset in range(number_of_datasets): begin_dataset = np.int32(dataset * self.number_of_events / number_of_datasets) end_dataset = np.int32((dataset + 1) * self.number_of_events / number_of_datasets) forward_projection_arrays[12][dataset] = forward_projection_arrays_all_data[12][ begin_dataset:end_dataset] forward_projection_gpu_arrays[12][dataset] = cuda.mem_alloc( forward_projection_arrays[12][dataset].size * forward_projection_arrays[12][ dataset].dtype.itemsize) # forward_projection_pinned_arrays[12][dataset] = cuda.register_host_memory( # forward_projection_arrays[12][dataset]) # assert np.all( # forward_projection_pinned_arrays[12][dataset] == forward_projection_arrays[12][dataset]) # cuda.memcpy_htod_async(forward_projection_gpu_arrays[12][dataset], # forward_projection_pinned_arrays[12][dataset], stream[dataset]) cuda.memcpy_htod_async(forward_projection_gpu_arrays[12][dataset], forward_projection_arrays[12][dataset], stream[dataset]) for dataset in range(number_of_datasets_back): adjust_coef_cut[dataset] = adjust_coef[ dataset * voxels_division:(dataset + 1) * voxels_division] # adjust_coef_pinned[dataset] = cuda.register_host_memory(adjust_coef_cut[dataset]) # assert np.all(adjust_coef_pinned[dataset] == adjust_coef_cut[dataset]) # cuda.memcpy_htod_async(adjust_coef_gpu[dataset], adjust_coef_pinned[dataset], stream[dataset]) cuda.memcpy_htod_async(adjust_coef_gpu[dataset], adjust_coef_cut[dataset], stream[dataset]) if self.saved_image_by_iteration: if i % 2 == 0: im_to_save = im.reshape(self.weight, self.height, self.depth) self._save_image_by_it(im_to_save, i, sb) # del adjust_coef_pinned gc.collect() # adjust_coef_pinned = [None] * number_of_datasets_back # if self.signals_interface is not None: # self.signals_interface.trigger_update_label_reconstruction_status.emit( # "{}: Iteration {}".format(self.current_info_step, i + 1)) # self.signals_interface.trigger_progress_reconstruction_partial.emit( # int(np.round(100 * (i + 1) * (sb + subsets) / (it * subsets), 0))) im = im.reshape(self.weight, self.height, self.depth) self.im = im
[docs] def multikernel_optimized_memory_reads_review(self): print('Optimizer STARTED') # cuda.init() cuda = self.cuda_drv start_x = np.int32(self.A[0, 0, 0]) start_y = np.int32(self.B[0, 0, 0]) start_z = np.int32(self.C[0, 0, 0]) print("Start_point: {},{},{}".format(start_x, start_y, start_z)) print('Image size: {},{}, {}'.format(self.weight, self.height, self.depth)) half_distance_between_array_pixel = np.float32(self.distance_between_array_pixel / 2) normalization_matrix = self.normalization_matrix.reshape( self.normalization_matrix.shape[0] * self.normalization_matrix.shape[1] * self.normalization_matrix.shape[ 2]) # SOURCE MODELS (DEVICE CODE) self._load_machine_C_code() # texref = self.mod_backward_projection_shared_mem.get_texref("tex") # texref.set_flags(cuda.TRSF_NORMALIZED_COORDINATES) # texref.set_filter_mode(cuda.filter_mode.LINEAR) # Host Code B, C, im, vector_matrix, number_of_datasets = np.int32(1) # Number of datasets (and concurrent operations) used. number_of_datasets_back = np.int32(1) # Number of datasets (and concurrent operations) used. # Start concurrency Test # Event as reference point ref = cuda.Event() ref.record() # Create the streams and events needed to calculation stream, event = [], [] marker_names = ['kernel_begin', 'kernel_end'] # Create List to allocate chunks of data adjust_coef_cut = [None] * number_of_datasets adjust_coef_gpu = [None] * number_of_datasets adjust_coef_pinned = [None] * number_of_datasets_back A_cut_gpu, B_cut_gpu, C_cut_gpu = [None for _ in range(number_of_datasets_back)], \ [None for _ in range(number_of_datasets_back)],\ [None for _ in range(number_of_datasets_back)] A_cut, B_cut, C_cut = [None for _ in range(number_of_datasets_back)],\ [None for _ in range(number_of_datasets_back)], \ [None for _ in range(number_of_datasets_back)] adjust_coef_cut = [None for _ in range(number_of_datasets_back)] adjust_coef_gpu = [None for _ in range(number_of_datasets_back)] adjust_coef_pinned = [None for _ in range(number_of_datasets_back)] fov_cut_matrix_cutted_gpu = [None for _ in range(number_of_datasets_back)] fov_cut_matrix_cut = [None for _ in range(number_of_datasets_back)] # Streams and Events creation for dataset in range(number_of_datasets_back): stream.append(cuda.Stream()) event.append(dict([(marker_names[l], cuda.Event()) for l in range(len(marker_names))])) im_shappened = np.ascontiguousarray(self.im.reshape(self.im.shape[0] * self.im.shape[1] * self.im.shape[2]), dtype=np.float32) fov_cut_matrix_shappened = np.ascontiguousarray( self.fov_matrix_cut.reshape( self.fov_matrix_cut.shape[0] * self.fov_matrix_cut.shape[1] * self.fov_matrix_cut.shape[2]), dtype=np.byte) # Forward Memory allocation forward_projection_arrays_all_data = [self.a, self.b, self.c, self.d, self.a_normal, self.b_normal, self.c_normal, self.d_normal, self.a_cf, self.b_cf, self.c_cf, self.d_cf, self.sum_vor, self.distance_to_center_plane, self.distance_to_center_plane_normal, self.time_factor, self.x_min_f, self.x_max_f, self.y_min_f, self.y_max_f, self.z_min_f, self.z_max_f] forward_projection_arrays = [[None] * number_of_datasets for _ in range(len(forward_projection_arrays_all_data))] forward_projection_gpu_arrays = [[None] * number_of_datasets for _ in range(len(forward_projection_arrays_all_data))] forward_projection_pinned_arrays = [[None] * number_of_datasets for _ in range(len(forward_projection_arrays_all_data))] for ar in range(len(forward_projection_arrays)): array_original = forward_projection_arrays_all_data[ar] array = forward_projection_arrays[ar] array_gpu = forward_projection_gpu_arrays[ar] array_pinned = forward_projection_pinned_arrays[ar] for dataset in range(number_of_datasets): begin_dataset = np.int32(dataset * self.number_of_events / number_of_datasets) end_dataset = np.int32((dataset + 1) * self.number_of_events / number_of_datasets) array[dataset] = array_original[begin_dataset:end_dataset] array_gpu[dataset] = cuda.mem_alloc(array[dataset].size * array[dataset].dtype.itemsize) array_pinned[dataset] = cuda.register_host_memory(array[dataset]) assert np.all(array_pinned[dataset] == array[dataset]) cuda.memcpy_htod_async(array_gpu[dataset], array_pinned[dataset], stream[dataset]) forward_projection_arrays_all_data[ar] = array_original forward_projection_arrays[ar] = array forward_projection_gpu_arrays[ar] = array_gpu forward_projection_pinned_arrays[ar] = array_pinned im_gpu = cuda.mem_alloc(im_shappened.size * im_shappened.dtype.itemsize) fov_cut_matrix_gpu = cuda.mem_alloc(fov_cut_matrix_shappened.size * fov_cut_matrix_shappened.dtype.itemsize) cuda.memcpy_htod_async(im_gpu, im_shappened) cuda.memcpy_htod_async(fov_cut_matrix_gpu, fov_cut_matrix_shappened) adjust_coef = np.ascontiguousarray(self.adjust_coef.reshape( self.adjust_coef.shape[0] * self.adjust_coef.shape[1] * self.adjust_coef.shape[2]), dtype=np.float32) d2_normal = np.ascontiguousarray(self.distance_to_center_plane * np.sqrt( self.a_normal**2 + self.b_normal**2 + self.c_normal**2), dtype=np.float32) d2= np.ascontiguousarray(self.distance_to_center_plane_normal * np.sqrt( self.a ** 2 + self.b ** 2 + self.c ** 2), dtype=np.float32) d2_cf = np.ascontiguousarray(half_distance_between_array_pixel * np.sqrt( self.a_cf ** 2 + self.b_cf ** 2 + self.c_cf ** 2), dtype=np.float32) voxels_division = adjust_coef.shape[0] // number_of_datasets_back backward_projection_arrays_full_arrays = [self.a, self.b, self.c, self.d, self.a_normal, self.b_normal, self.c_normal, self.d_normal, self.a_cf, self.b_cf, self.c_cf, self.d_cf, self.sum_vor, self.distance_to_center_plane, self.distance_to_center_plane_normal, self.time_factor, self.x_min_f, self.x_max_f, self.y_min_f, self.y_max_f, self.z_min_f, self.z_max_f, d2_normal, d2, d2_cf] backward_projection_array_gpu_arrays = [[None] * number_of_datasets for _ in range(len(backward_projection_arrays_full_arrays))] backward_projection_pinned_arrays = [None] * len(backward_projection_arrays_full_arrays) for st in range(len(backward_projection_arrays_full_arrays)): backward_projection_array_gpu_arrays[st] = \ cuda.mem_alloc(backward_projection_arrays_full_arrays[st].size * backward_projection_arrays_full_arrays[st].dtype.itemsize) cuda.memcpy_htod_async(backward_projection_array_gpu_arrays[st], backward_projection_arrays_full_arrays[st]) # backward_projection_pinned_arrays[st] = cuda.register_host_memory(backward_projection_pinned_arrays[st]) # assert np.all(array_pinned[dataset] == array[dataset]) # cuda.memcpy_htod_async(array_gpu[dataset], array_pinned[dataset], stream[dataset]) # texref.set_address(backward_projection_array_gpu_arrays[0], backward_projection_arrays_full_arrays[0].nbytes) # texref.set_format(cuda.array_format.FLOAT, 1) # texref.set_flags(cuda.TRSF_NORMALIZED_COORDINATES) adjust_coef = np.ascontiguousarray(self.adjust_coef.reshape( self.adjust_coef.shape[0] * self.adjust_coef.shape[1] * self.adjust_coef.shape[2]), dtype=np.float32) A = np.ascontiguousarray(self.A.reshape( self.A.shape[0] * self.A.shape[1] * self.A.shape[2]), dtype=np.short) B = np.ascontiguousarray(self.B.reshape( self.B.shape[0] * self.B.shape[1] * self.B.shape[2]), dtype=np.short) C = np.ascontiguousarray(self.C.reshape( self.C.shape[0] * self.C.shape[1] * self.C.shape[2]), dtype=np.short) adjust_coef_gpu= cuda.mem_alloc( adjust_coef.size * adjust_coef.dtype.itemsize) # ---- Divide into datasets variables backprojection events_mapping_per_dataset = [None for _ in range(number_of_datasets_back)] events_mapping_per_dataset_gpu = [None for _ in range(number_of_datasets_back)] events_mapping_per_dataset_pinned = [None for _ in range(number_of_datasets_back)] for dataset in range(number_of_datasets_back): index_events = self.map_events_gpu(min=dataset * 8, max=(dataset + 1) * 8) events_mapping_per_dataset[dataset] = np.ascontiguousarray(index_events, dtype=np.int32) events_mapping_per_dataset_gpu[dataset] = cuda.mem_alloc( events_mapping_per_dataset[dataset].size * events_mapping_per_dataset[dataset].dtype.itemsize) events_mapping_per_dataset_pinned[dataset] = cuda.register_host_memory(events_mapping_per_dataset[dataset]) assert np.all(events_mapping_per_dataset_pinned[dataset] == events_mapping_per_dataset[dataset]) cuda.memcpy_htod_async(events_mapping_per_dataset_gpu[dataset], events_mapping_per_dataset_pinned[dataset], stream[dataset]) # voxels_division = adjust_coef.shape[0] // number_of_datasets_back # adjust_coef_cut[dataset] = np.ascontiguousarray( # adjust_coef[dataset * voxels_division:(dataset + 1) * voxels_division], # dtype=np.float32) # # fov_cut_matrix_cut[dataset] = np.ascontiguousarray( # fov_cut_matrix_shappened[dataset * voxels_division:(dataset + 1) * voxels_division], # dtype=np.byte) # # A_cut[dataset] = np.ascontiguousarray( # A[dataset * voxels_division:(dataset + 1) * voxels_division], # dtype=np.short) # # B_cut[dataset] = np.ascontiguousarray( # B[dataset * voxels_division:(dataset + 1) * voxels_division], # dtype=np.short) # # C_cut[dataset] = np.ascontiguousarray( # C[dataset * voxels_division:(dataset + 1) * voxels_division], # dtype=np.short) # # Backprojection # adjust_coef_gpu[dataset] = cuda.mem_alloc( # adjust_coef_cut[dataset].size * adjust_coef_cut[dataset].dtype.itemsize) # # adjust_coef_pinned[dataset] = cuda.register_host_memory(adjust_coef_cut[dataset]) # assert np.all(adjust_coef_pinned[dataset] == adjust_coef_cut[dataset]) # cuda.memcpy_htod_async(adjust_coef_gpu[dataset], adjust_coef_pinned[dataset], stream[dataset]) # # fov_cut_matrix_cutted_gpu[dataset] = cuda.mem_alloc( # fov_cut_matrix_cut[dataset].size * fov_cut_matrix_cut[dataset].dtype.itemsize) # # A_cut_gpu[dataset] = cuda.mem_alloc( # A_cut[dataset].size * A_cut[dataset].dtype.itemsize) # # B_cut_gpu[dataset] = cuda.mem_alloc( # B_cut[dataset].size * B_cut[dataset].dtype.itemsize) # # C_cut_gpu[dataset] = cuda.mem_alloc( # C_cut[dataset].size * C_cut[dataset].dtype.itemsize) # # cuda.memcpy_htod_async(adjust_coef_gpu[dataset], adjust_coef_cut[dataset]) # cuda.memcpy_htod_async(fov_cut_matrix_cutted_gpu[dataset], fov_cut_matrix_cut[dataset]) # cuda.memcpy_htod_async(A_cut_gpu[dataset], A_cut[dataset]) # cuda.memcpy_htod_async(B_cut_gpu[dataset], B_cut[dataset]) # cuda.memcpy_htod_async(C_cut_gpu[dataset], C_cut[dataset]) free, total = cuda.mem_get_info() # cuda.memcpy_htod_async(im_gpu, realrow) # cuda.matrix_to_texref(realrow, texref, order="C") # texref.set_array(realrow) print('%.1f %% of device memory is free.' % ((free / float(total)) * 100)) # -------------OSEM--------- it = self.number_of_iterations subsets = self.number_of_subsets print('Number events for reconstruction: {}'.format(self.number_of_events)) im = np.ascontiguousarray(self.im.reshape(self.im.shape[0] * self.im.shape[1] * self.im.shape[2]), dtype=np.float32) for i in range(it): print('Iteration number: {}\n----------------'.format(i + 1)) begin_event = np.int32(0) end_event = np.int32(self.number_of_events / subsets) for sb in range(subsets): print('Subset number: {}'.format(sb)) number_of_events_subset = np.int32(end_event - begin_event) tic = time.time() # Cycle forward Projection for dataset in range(number_of_datasets): begin_dataset = np.int32(dataset * number_of_events_subset / number_of_datasets) end_dataset = np.int32((dataset + 1) * number_of_events_subset / number_of_datasets) threadsperblock = (256, 1, 1) blockspergrid_x = int(math.ceil(((end_dataset - begin_dataset)) / threadsperblock[0])) blockspergrid_y = int(math.ceil(1 / threadsperblock[1])) blockspergrid_z = int(math.ceil(1 / threadsperblock[2])) blockspergrid = (blockspergrid_x, blockspergrid_y, blockspergrid_z) event[dataset]['kernel_begin'].record(stream[dataset]) func_forward = self.mod_forward_projection_shared_mem.get_function("forward_projection_cdrf") func_forward(self.weight, self.height, self.depth, start_x, start_y, start_z, forward_projection_gpu_arrays[13][dataset], forward_projection_gpu_arrays[14][dataset], half_distance_between_array_pixel, self.number_of_events, begin_dataset, end_dataset, forward_projection_gpu_arrays[0][dataset], forward_projection_gpu_arrays[4][dataset], forward_projection_gpu_arrays[8][dataset], forward_projection_gpu_arrays[1][dataset], forward_projection_gpu_arrays[5][dataset], forward_projection_gpu_arrays[9][dataset], forward_projection_gpu_arrays[2][dataset], forward_projection_gpu_arrays[6][dataset], forward_projection_gpu_arrays[10][dataset], forward_projection_gpu_arrays[3][dataset], forward_projection_gpu_arrays[7][dataset], forward_projection_gpu_arrays[11][dataset], forward_projection_gpu_arrays[12][dataset], fov_cut_matrix_gpu, im_gpu, forward_projection_gpu_arrays[16][dataset], forward_projection_gpu_arrays[17][dataset], forward_projection_gpu_arrays[18][dataset], forward_projection_gpu_arrays[19][dataset], forward_projection_gpu_arrays[20][dataset], forward_projection_gpu_arrays[21][dataset], block=threadsperblock, grid=blockspergrid, stream=stream[dataset] ) # Sincronization of streams for dataset in range(number_of_datasets): # Commenting out this line should break concurrency. event[dataset]['kernel_end'].record(stream[dataset]) # Transfering data from Optimizer for dataset in range(number_of_datasets): begin_dataset = np.int32(dataset * self.number_of_events / number_of_datasets) end_dataset = np.int32((dataset + 1) * self.number_of_events / number_of_datasets) cuda.memcpy_dtoh_async(forward_projection_pinned_arrays[12][dataset], forward_projection_gpu_arrays[12][dataset], stream[dataset]) forward_projection_arrays_all_data[12][begin_dataset:end_dataset] = \ forward_projection_pinned_arrays[12][dataset] # # cuda.Context.synchronize() toc = time.time() print('Time part Forward Projection {} : {}'.format(1, toc - tic)) # # number_of_datasets = np.int32(2) teste = np.copy(forward_projection_arrays[12][dataset]) # # # sum_vor[sum_vor<1]=0 # # sum_vor = np.ascontiguousarray(teste, dtype=np.float32) # # # sum_vor=np.ascontiguousarray(np.ones((self.a.shape)), dtype=np.float32) print('SUM VOR: {}'.format(np.sum(teste))) # # cuda.memcpy_htod_async(sum_vor_t_gpu, sum_vor) # # # cuda.memcpy_htod_async(sum_vor_t_gpu, sum_vor) t_back_init = time.time() # ------------BACKPROJECTION----------- cuda.memcpy_htod_async(backward_projection_array_gpu_arrays[12], forward_projection_arrays_all_data[12]) # ------------BACKPROJECTION----------- for dataset in range(number_of_datasets_back): dataset = np.int32(dataset) begin_dataset = np.int32(number_of_events_subset*(dataset)/number_of_datasets_back) end_dataset = np.int32(number_of_events_subset*(dataset+1)/number_of_datasets_back) event[dataset]['kernel_begin'].record(stream[dataset]) weight_cutted, height_cutted, depth_cutted = np.int32( adjust_coef.shape[0]), np.int32( 1), np.int32(1) number_of_voxels_thread = 128 threadsperblock = (np.int(number_of_voxels_thread), 1, 1) # blockspergrid_x = int(math.ceil(adjust_coef_cut[dataset].shape[0] / threadsperblock[0])) blockspergrid_x = int(math.ceil(adjust_coef.shape[0] / threadsperblock[0])) z_min = np.short(np.min(self.z_min_f[begin_dataset:end_dataset])) z_max = np.short(np.max(self.z_max_f[begin_dataset:end_dataset])) blockspergrid_x = int(math.ceil(8*88*88 / threadsperblock[0])) blockspergrid_y = int(math.ceil(1 / threadsperblock[1])) blockspergrid_z = int(math.ceil(1 / threadsperblock[2])) blockspergrid = (blockspergrid_x, blockspergrid_y, blockspergrid_z) # print(z_min) # print(z_max) # print("---------------") # print(np.int32(len(events_mapping_per_dataset[dataset]))) func_backward = self.mod_backward_projection_shared_mem.get_function( "backprojection_cdrf") func_backward(dataset, self.weight, self.height, self.depth, backward_projection_array_gpu_arrays[13], backward_projection_array_gpu_arrays[14], half_distance_between_array_pixel, self.number_of_events, begin_dataset, np.int32(len(events_mapping_per_dataset[dataset])), backward_projection_array_gpu_arrays[0], backward_projection_array_gpu_arrays[4], backward_projection_array_gpu_arrays[8], backward_projection_array_gpu_arrays[1], backward_projection_array_gpu_arrays[5], backward_projection_array_gpu_arrays[9], backward_projection_array_gpu_arrays[2], backward_projection_array_gpu_arrays[6], backward_projection_array_gpu_arrays[10], backward_projection_array_gpu_arrays[3], backward_projection_array_gpu_arrays[7], backward_projection_array_gpu_arrays[11], adjust_coef_gpu, backward_projection_array_gpu_arrays[12], backward_projection_array_gpu_arrays[15], z_min, z_max, backward_projection_array_gpu_arrays[22], backward_projection_array_gpu_arrays[23], backward_projection_array_gpu_arrays[24], events_mapping_per_dataset_gpu[dataset], block=threadsperblock, grid=blockspergrid, shared=np.int(4 * number_of_voxels_thread*1), stream=stream[dataset] ) for dataset in range(number_of_datasets_back): # Commenting out this line should break concurrency. event[dataset]['kernel_end'].record(stream[dataset]) # for dataset in range(number_of_datasets_back): # cuda.memcpy_dtoh_async(adjust_coef_cut[dataset], adjust_coef_gpu[dataset]) # adjust_coef[dataset * voxels_division:(dataset + 1) * voxels_division] = adjust_coef_cut[ # dataset] cuda.memcpy_dtoh_async(adjust_coef, adjust_coef_gpu) cuda.Context.synchronize() print('Time part Backward Projection {} : {}'.format(1, time.time() - t_back_init)) print('SUM IMAGE: {}'.format(np.sum(adjust_coef))) penalized_term = self._load_penalized_term(im) # normalization im[normalization_matrix != 0] = im[normalization_matrix != 0] * adjust_coef[ normalization_matrix != 0] / (normalization_matrix[normalization_matrix != 0]) # im[normalization_matrix == 0] = 0 im = self._apply_penalized_term(im, penalized_term) print('SUM IMAGE: {}'.format(np.sum(im))) im = np.ascontiguousarray(im, dtype=np.float32) cuda.memcpy_htod_async(im_gpu, im) # # Clearing variables forward_projection_arrays_all_data[12] = np.ascontiguousarray( np.zeros(self.a.shape, dtype=np.float32)) adjust_coef = np.ascontiguousarray( np.zeros((self.number_of_pixels_x * self.number_of_pixels_y * self.number_of_pixels_z), dtype=np.float32)) for dataset in range(number_of_datasets): begin_dataset = np.int32(dataset * self.number_of_events / number_of_datasets) end_dataset = np.int32((dataset + 1) * self.number_of_events / number_of_datasets) forward_projection_arrays[12][dataset] = forward_projection_arrays_all_data[12][ begin_dataset:end_dataset] forward_projection_gpu_arrays[12][dataset] = cuda.mem_alloc( forward_projection_arrays[12][dataset].size * forward_projection_arrays[12][ dataset].dtype.itemsize) forward_projection_pinned_arrays[12][dataset] = cuda.register_host_memory( forward_projection_arrays[12][dataset]) assert np.all( forward_projection_pinned_arrays[12][dataset] == forward_projection_arrays[12][dataset]) cuda.memcpy_htod_async(forward_projection_gpu_arrays[12][dataset], forward_projection_pinned_arrays[12][dataset], stream[dataset]) # del my_array # del my_object # gc.collect() # for dataset in range(number_of_datasets_back): # adjust_coef_cut[dataset] = adjust_coef[ # dataset * voxels_division:(dataset + 1) * voxels_division] # adjust_coef_pinned[dataset] = cuda.register_host_memory(adjust_coef_cut[dataset]) # assert np.all(adjust_coef_pinned[dataset] == adjust_coef_cut[dataset]) # cuda.memcpy_htod_async(adjust_coef_gpu[dataset], adjust_coef_pinned[dataset], stream[dataset]) # if self.saved_image_by_iteration: # im_to_save = im.reshape(weight, height, depth) # self._save_image_by_it(im_to_save, i, sb) # # if self.signals_interface is not None: # self.signals_interface.trigger_update_label_reconstruction_status.emit( # "{}: Iteration {}".format(self.current_info_step, i + 1)) # self.signals_interface.trigger_progress_reconstruction_partial.emit( # int(np.round(100 * (i + 1) * (sb + subsets) / (it * subsets), 0))) im = im.reshape(self.weight, self.height, self.depth) self.im = im
def _load_penalized_term(self, im): penalized_term = None im_c = np.copy(im) if self.algorithm == "LM-MRP": beta = self.algorithm_options[0] kernel_filter_size = self.algorithm_options[1] im_to_filter = im_c.reshape(self.weight, self.height, self.depth) im_med = median_filter(im_to_filter, kernel_filter_size) penalized_term = np.copy(im_to_filter) penalized_term[im_med != 0] = 1 + beta * (im_to_filter[im_med != 0] - im_med[im_med != 0]) / \ im_med[ im_med != 0] penalized_term = np.ascontiguousarray(penalized_term.reshape(self.weight * self.height * self.depth), dtype=np.float32) if self.algorithm == "MAP": beta = 0.5 im_map = np.load( "C:\\Users\\pedro.encarnacao\\OneDrive - Universidade de Aveiro\\PhD\\Reconstrução\\NAF+FDG\\Easypet Scan 05 Aug 2019 - 14h 36m 33s\\static_image\\Easypet Scan 05 Aug 2019 - 14h 36m 33s mlem.npy") return penalized_term def _apply_penalized_term(self, im, penalized_term): if self.algorithm == "LM-MRP": im[penalized_term != 0] = im[penalized_term != 0] / penalized_term[penalized_term != 0] return im
[docs] def map_events_gpu(self, min=None, max=None): image_cut_limits = np.array([min,max]) cond = (self.z_min_f > image_cut_limits[1]) & (self.z_max_f > image_cut_limits[1]) cond_2 = (self.z_min_f < image_cut_limits[0]) & (self.z_max_f < image_cut_limits[0]) return np.where(~(cond | cond_2))[0]
def _save_image_by_it(self, im, it=None, sb=None): file_name = os.path.join(self.iterations_path, "EasyPETScan_it{}_sb{}".format(it, sb)) print(file_name) volume = im.astype(np.float32) length = 1 for i in volume.shape: length *= i # length = volume.shape[0] * volume.shape[2] * volume.shape[1] if len(volume.shape) > 1: data = np.reshape(volume, [1, length], order='F') else: data = volume shapeIm = volume.shape output_file = open(file_name, 'wb') arr = array('f', data[0]) arr.tofile(output_file) output_file.close()