import numpy as np
from pycuda.compiler import SourceModule
from scipy.ndimage import median_filter
import math
import time
import os
class GPUSharedMemorySingleKernel:
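    """List-mode MLEM/OSEM projector pair for PET running on the GPU.

    Forward projection and backprojection are each implemented as a single
    CUDA kernel that stages per-event (forward) or per-voxel (backward) data
    in on-chip shared memory. The kernel sources are read from
    PET/fw_single_kernel.c and PET/bw_single_kernel.c and compiled at run
    time with pycuda.compiler.SourceModule.
    """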
    def __init__(self, EM_obj=None):
        self.mod_forward_projection_shared_mem = None
        self.mod_backward_projection_shared_mem = None
        self.EM_obj = EM_obj
        self.a = EM_obj.a
        self.a_normal = EM_obj.a_normal
        self.a_cf = EM_obj.a_cf
        self.b = EM_obj.b
        self.b_normal = EM_obj.b_normal
        self.b_cf = EM_obj.b_cf
        self.c = EM_obj.c
        self.c_normal = EM_obj.c_normal
        self.c_cf = EM_obj.c_cf
        self.d = EM_obj.d
        self.d_normal = EM_obj.d_normal
        self.d_cf = EM_obj.d_cf
        self.adjust_coef = EM_obj.adjust_coef
        self.im = EM_obj.im
        self.half_crystal_pitch_xy = EM_obj.half_crystal_pitch_xy
        # Read the CUDA C sources as strings: SourceModule() compiles a
        # string, not an open file handle.
        self.fw_source_model_file = os.path.join(os.path.dirname(os.path.abspath(__file__)), "PET", "fw_single_kernel.c")
        with open(self.fw_source_model_file) as fw_file:
            self.fw_source_model = fw_file.read()
        self.bw_source_model_file = os.path.join(os.path.dirname(os.path.abspath(__file__)), "PET", "bw_single_kernel.c")
        with open(self.bw_source_model_file) as bw_file:
            self.bw_source_model = bw_file.read()
        # Reconstruction options referenced in _vor_design_gpu_shared_memory()
        # (number_of_pixels_x/y/z, distance_between_array_pixel,
        # number_of_iterations, number_of_subsets, algorithm,
        # algorithm_options, attenuation_matrix, saved_image_by_iteration,
        # signals_interface, current_info_step) are assumed to be set
        # elsewhere on this object.
def _load_machine_C_code(self):
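        """Compile the CUDA C sources read in __init__ into device modules."""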
self.mod_forward_projection_shared_mem = SourceModule(self.fw_source_model)
self.mod_backward_projection_shared_mem = SourceModule(self.bw_source_model)
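        # A minimal usage sketch (assuming the .c files define kernels named
        # "forward_projection" and "backprojection", matching the reference
        # code kept below):
        #
        #     self._load_machine_C_code()
        #     fw = self.mod_forward_projection_shared_mem.get_function("forward_projection")
        #     bw = self.mod_backward_projection_shared_mem.get_function("backprojection")
        #
        # get_function() returns a pycuda.driver.Function that is launched
        # with block=, grid=, shared= and stream= keyword arguments, as done
        # in _vor_design_gpu_shared_memory().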
        # Reference copy of the forward-projection kernel (the maintained
        # source lives in PET/fw_single_kernel.c):
        #
        # __global__ void forward_projection(
        #     const int n, const int m, const int p,
        #     const float crystal_pitch_XY, const float crystal_pitch_Z,
        #     const float distance_between_array_pixel, int number_of_events,
        #     const int begin_event_gpu_limitation, const int end_event_gpu_limitation,
        #     float *a, float *a_normal, float *a_cf,
        #     float *b, float *b_normal, float *b_cf,
        #     float *c, float *c_normal, float *c_cf,
        #     float *d, float *d_normal, float *d_cf,
        #     const int *A, const int *B, const int *C,
        #     float *sum_vor, const char *sensitivity_matrix,
        #     const float *im_old, const float *probability)
        # {
        #     const int shared_memory_size = 256;
        #     __shared__ float a_shared[shared_memory_size];
        #     __shared__ float b_shared[shared_memory_size];
        #     __shared__ float c_shared[shared_memory_size];
        #     __shared__ float d_shared[shared_memory_size];
        #     __shared__ float a_normal_shared[shared_memory_size];
        #     __shared__ float b_normal_shared[shared_memory_size];
        #     __shared__ float c_normal_shared[shared_memory_size];
        #     __shared__ float d_normal_shared[shared_memory_size];
        #     __shared__ float a_cf_shared[shared_memory_size];
        #     __shared__ float b_cf_shared[shared_memory_size];
        #     __shared__ float c_cf_shared[shared_memory_size];
        #     __shared__ float d_cf_shared[shared_memory_size];
        #     __shared__ float sum_vor_shared[shared_memory_size];
        #
        #     int threadId = blockIdx.x * blockDim.x + threadIdx.x;
        #     float d2, d2_normal, d2_cf;
        #     float value, value_normal, value_cf;
        #     int index;
        #     const int number_events_max = end_event_gpu_limitation - begin_event_gpu_limitation;
        #     const float error_pixel = 0.0000f;
        #
        #     if (threadIdx.x > shared_memory_size)
        #         return;
        #     if (threadId >= number_events_max)
        #         return;
        #
        #     __syncthreads();
        #     int e = threadId;
        #     int e_m = threadIdx.x;
        #     a_shared[e_m] = a[e];
        #     b_shared[e_m] = b[e];
        #     c_shared[e_m] = c[e];
        #     d_shared[e_m] = d[e];
        #     a_normal_shared[e_m] = a_normal[e];
        #     b_normal_shared[e_m] = b_normal[e];
        #     c_normal_shared[e_m] = c_normal[e];
        #     d_normal_shared[e_m] = d_normal[e];
        #     a_cf_shared[e_m] = a_cf[e];
        #     b_cf_shared[e_m] = b_cf[e];
        #     c_cf_shared[e_m] = c_cf[e];
        #     d_cf_shared[e_m] = d_cf[e];
        #     sum_vor_shared[e_m] = sum_vor[e];
        #
        #     for (int k = 0; k < p; k++) {
        #         for (int j = 0; j < m; j++) {
        #             for (int l = 0; l < n; l++) {
        #                 index = l + j * n + k * m * n;
        #                 if (sensitivity_matrix[index] != 0) {
        #                     value = a_shared[e_m] * A[index] + b_shared[e_m] * B[index]
        #                             + c_shared[e_m] * C[index] - d_shared[e_m];
        #                     d2 = error_pixel + crystal_pitch_Z * sqrt(a_shared[e_m] * a_shared[e_m]
        #                             + b_shared[e_m] * b_shared[e_m] + c_shared[e_m] * c_shared[e_m]);
        #                     if (value < d2 && value >= -d2) {
        #                         value_normal = a_normal_shared[e_m] * A[index] + b_normal_shared[e_m] * B[index]
        #                                 + c_normal_shared[e_m] * C[index] - d_normal_shared[e_m];
        #                         d2_normal = error_pixel + crystal_pitch_XY * sqrt(a_normal_shared[e_m] * a_normal_shared[e_m]
        #                                 + b_normal_shared[e_m] * b_normal_shared[e_m] + c_normal_shared[e_m] * c_normal_shared[e_m]);
        #                         if (value_normal < d2_normal && value_normal >= -d2_normal) {
        #                             value_cf = a_cf_shared[e_m] * A[index] + b_cf_shared[e_m] * B[index]
        #                                     + c_cf_shared[e_m] * C[index] - d_cf_shared[e_m];
        #                             d2_cf = (distance_between_array_pixel / 2) * sqrt(a_cf_shared[e_m] * a_cf_shared[e_m]
        #                                     + b_cf_shared[e_m] * b_cf_shared[e_m] + c_cf_shared[e_m] * c_cf_shared[e_m]);
        #                             if (value_cf > -d2_cf && value_cf <= d2_cf) {
        #                                 sum_vor_shared[e_m] += im_old[index]
        #                                     * (1 - sqrt(value * value + value_normal * value_normal + value_cf * value_cf)
        #                                          / sqrt(d2 * d2 + d2_normal * d2_normal + d2_cf * d2_cf));
        #                             }
        #                         }
        #                     }
        #                 }
        #             }
        #         }
        #     }
        #
        #     __syncthreads();
        #     sum_vor[e] = sum_vor_shared[e_m];
        # }
        # Reference copy of the backprojection kernel (the maintained source
        # lives in PET/bw_single_kernel.c). adjust_coef_shared uses
        # dynamically sized shared memory, so the launch must pass shared=
        # (bytes per block):
        #
        # __global__ void backprojection(
        #     int dataset_number, int n, int m, int p,
        #     const float crystal_pitch_XY, const float crystal_pitch_Z,
        #     const float distance_between_array_pixel, int number_of_events,
        #     const int begin_event_gpu_limitation, const int end_event_gpu_limitation,
        #     float *a, float *a_normal, float *a_cf,
        #     float *b, float *b_normal, float *b_cf,
        #     float *c, float *c_normal, float *c_cf,
        #     float *d, float *d_normal, float *d_cf,
        #     const int *A, const int *B, const int *C,
        #     float *adjust_coef, float *sum_vor, char *sensitivity_matrix,
        #     const float *probability, float *time_factor)
        # {
        #     extern __shared__ float adjust_coef_shared[];
        #     int blockId = blockIdx.x + blockIdx.y * gridDim.x + gridDim.x * gridDim.y * blockIdx.z;
        #     int idt = blockId * blockDim.x + threadIdx.x;
        #
        #     float d2, d2_normal, d2_cf;
        #     float normal_value, value, value_cf;
        #     float error_pixel = 0.000f;
        #     int i_s = threadIdx.x;
        #
        #     if (idt >= n * m * p)
        #         return;
        #     if (i_s > n)
        #         return;
        #
        #     __syncthreads();
        #     adjust_coef_shared[i_s] = adjust_coef[idt];
        #
        #     for (int e = begin_event_gpu_limitation; e < end_event_gpu_limitation; e++) {
        #         if (sensitivity_matrix[idt] != 0) {
        #             value = a[e] * A[idt] + b[e] * B[idt] + c[e] * C[idt] - d[e];
        #             d2 = error_pixel + crystal_pitch_Z * sqrt(a[e] * a[e] + b[e] * b[e] + c[e] * c[e]);
        #             if (value < d2 && value >= -d2) {
        #                 normal_value = a_normal[e] * A[idt] + b_normal[e] * B[idt]
        #                         + c_normal[e] * C[idt] - d_normal[e];
        #                 d2_normal = error_pixel + crystal_pitch_XY * sqrt(a_normal[e] * a_normal[e]
        #                         + b_normal[e] * b_normal[e] + c_normal[e] * c_normal[e]);
        #                 if (normal_value < d2_normal && normal_value >= -d2_normal) {
        #                     value_cf = a_cf[e] * A[idt] + b_cf[e] * B[idt] + c_cf[e] * C[idt] - d_cf[e];
        #                     d2_cf = distance_between_array_pixel / 2 * sqrt(a_cf[e] * a_cf[e]
        #                             + b_cf[e] * b_cf[e] + c_cf[e] * c_cf[e]);
        #                     if (value_cf > -d2_cf && value_cf <= d2_cf) {
        #                         adjust_coef_shared[i_s] += (1 - sqrt(value * value + normal_value * normal_value
        #                                 + value_cf * value_cf) / sqrt(d2 * d2 + d2_normal * d2_normal + d2_cf * d2_cf))
        #                                 / (sum_vor[e] * time_factor[e]);
        #                     }
        #                 }
        #             }
        #         }
        #     }
        #
        #     adjust_coef[idt] = adjust_coef_shared[i_s];
        #     __syncthreads();
        # }
def _vor_design_gpu_shared_memory(self):
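        """Run the GPU OSEM loop: forward projection, backprojection and the
        image update, iterating over subsets and iterations."""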
        print('Optimizer STARTED')
        cuda = self.EM_obj.cuda_drv  # driver module supplied by the EM object
        # Local aliases for data held on this object / the EM object. The
        # attribute names below that are not copied in __init__ (A, B, C,
        # sensivity_matrix, sum_vor, time_factor, half_crystal_pitch_z) are
        # assumptions based on how they are used in this method.
        a, b, c, d = self.a, self.b, self.c, self.d
        a_normal, b_normal, c_normal, d_normal = self.a_normal, self.b_normal, self.c_normal, self.d_normal
        a_cf, b_cf, c_cf, d_cf = self.a_cf, self.b_cf, self.c_cf, self.d_cf
        adjust_coef, im = self.adjust_coef, self.im
        half_crystal_pitch_xy = self.half_crystal_pitch_xy
        half_crystal_pitch_z = self.EM_obj.half_crystal_pitch_z
        A, B, C = self.EM_obj.A, self.EM_obj.B, self.EM_obj.C
        sensivity_matrix = self.EM_obj.sensivity_matrix
        sum_vor = self.EM_obj.sum_vor
        time_factor = self.EM_obj.time_factor
        number_of_events = np.int32(len(a))
        width = np.int32(A.shape[0])
        height = np.int32(A.shape[1])
        depth = np.int32(A.shape[2])
        print('Image size: {}, {}, {}'.format(width, height, depth))
        print('BACKPROJECTION FUNCTION')
        probability = np.ascontiguousarray(
            np.ones((self.number_of_pixels_x, self.number_of_pixels_y, self.number_of_pixels_z), dtype=np.float32))
        distance_between_array_pixel = self.distance_between_array_pixel
# SOURCE MODELS (DEVICE CODE)
self._load_machine_C_code()
        # Host code
        number_of_datasets = np.int32(1)  # Number of datasets (and concurrent streams) used.
        # Concurrency test: record a reference event as the timing origin
        ref = cuda.Event()
        ref.record()
        # Create the streams and events needed for the calculation
        stream, event = [], []
        marker_names = ['kernel_begin', 'kernel_end']
        # Lists holding the per-dataset chunks of data (host and device)
        A_cut_gpu, B_cut_gpu, C_cut_gpu = ([None] * number_of_datasets for _ in range(3))
        A_cut, B_cut, C_cut = ([None] * number_of_datasets for _ in range(3))
        a_cut, b_cut, c_cut, d_cut = ([None] * number_of_datasets for _ in range(4))
        a_normal_cut, b_normal_cut, c_normal_cut, d_normal_cut = ([None] * number_of_datasets for _ in range(4))
        a_cf_cut, b_cf_cut, c_cf_cut, d_cf_cut = ([None] * number_of_datasets for _ in range(4))
        a_cut_gpu, b_cut_gpu, c_cut_gpu, d_cut_gpu = ([None] * number_of_datasets for _ in range(4))
        a_cut_normal_gpu, b_cut_normal_gpu, c_cut_normal_gpu, d_cut_normal_gpu = ([None] * number_of_datasets for _ in range(4))
        a_cf_cut_gpu, b_cf_cut_gpu, c_cf_cut_gpu, d_cf_cut_gpu = ([None] * number_of_datasets for _ in range(4))
        sum_vor_gpu = [None] * number_of_datasets
        sum_vor_cut = [None] * number_of_datasets
        probability_cut = [None] * number_of_datasets
        probability_gpu = [None] * number_of_datasets
        adjust_coef_cut = [None] * number_of_datasets
        adjust_coef_gpu = [None] * number_of_datasets
        sensivity_cut = [None] * number_of_datasets
        sensivity_gpu = [None] * number_of_datasets
# Streams and Events creation
        for dataset in range(number_of_datasets):
            stream.append(cuda.Stream())
            event.append({name: cuda.Event() for name in marker_names})
        # Memory Allocation
        # Variables that need a single allocation
A_gpu = cuda.mem_alloc(A.size * A.dtype.itemsize)
B_gpu = cuda.mem_alloc(B.size * B.dtype.itemsize)
C_gpu = cuda.mem_alloc(C.size * C.dtype.itemsize)
im_gpu = cuda.mem_alloc(im.size * im.dtype.itemsize)
sensivity_matrix_gpu = cuda.mem_alloc(sensivity_matrix.size * sensivity_matrix.dtype.itemsize)
cuda.memcpy_htod_async(A_gpu, A)
cuda.memcpy_htod_async(B_gpu, B)
cuda.memcpy_htod_async(C_gpu, C)
cuda.memcpy_htod_async(im_gpu, im)
cuda.memcpy_htod_async(sensivity_matrix_gpu, sensivity_matrix)
a_gpu = cuda.mem_alloc(a.size * a.dtype.itemsize)
b_gpu = cuda.mem_alloc(b.size * b.dtype.itemsize)
c_gpu = cuda.mem_alloc(c.size * c.dtype.itemsize)
d_gpu = cuda.mem_alloc(d.size * d.dtype.itemsize)
a_normal_gpu = cuda.mem_alloc(a_normal.size * a_normal.dtype.itemsize)
b_normal_gpu = cuda.mem_alloc(b_normal.size * b_normal.dtype.itemsize)
c_normal_gpu = cuda.mem_alloc(c_normal.size * c_normal.dtype.itemsize)
d_normal_gpu = cuda.mem_alloc(d_normal.size * d_normal.dtype.itemsize)
a_cf_gpu = cuda.mem_alloc(a_cf.size * a_cf.dtype.itemsize)
b_cf_gpu = cuda.mem_alloc(b_cf.size * b_cf.dtype.itemsize)
c_cf_gpu = cuda.mem_alloc(c_cf.size * c_cf.dtype.itemsize)
d_cf_gpu = cuda.mem_alloc(d_cf.size * d_cf.dtype.itemsize)
sum_vor_t_gpu = cuda.mem_alloc(sum_vor.size * sum_vor.dtype.itemsize)
probability_t_gpu = cuda.mem_alloc(probability.size * probability.dtype.itemsize)
time_factor_gpu = cuda.mem_alloc(time_factor.size * time_factor.dtype.itemsize)
        # Transfer per-event data to the device
cuda.memcpy_htod_async(a_gpu, a)
cuda.memcpy_htod_async(b_gpu, b)
cuda.memcpy_htod_async(c_gpu, c)
cuda.memcpy_htod_async(d_gpu, d)
cuda.memcpy_htod_async(a_normal_gpu, a_normal)
cuda.memcpy_htod_async(b_normal_gpu, b_normal)
cuda.memcpy_htod_async(c_normal_gpu, c_normal)
cuda.memcpy_htod_async(d_normal_gpu, d_normal)
cuda.memcpy_htod_async(a_cf_gpu, a_cf)
cuda.memcpy_htod_async(b_cf_gpu, b_cf)
cuda.memcpy_htod_async(c_cf_gpu, c_cf)
cuda.memcpy_htod_async(d_cf_gpu, d_cf)
cuda.memcpy_htod_async(time_factor_gpu, time_factor)
im_cut_dim = [im.shape[0], im.shape[1], int(im.shape[2] / number_of_datasets)] # Dividing image in small chunks
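        # Only the z axis is actually split across datasets; x and y are kept
        # whole (see the slicing in the loop below).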
        for dataset in range(number_of_datasets):
            begin_dataset = np.int32(dataset * number_of_events / number_of_datasets)
            if dataset == number_of_datasets - 1:
                # Last chunk takes every remaining event and z slice.
                end_dataset = number_of_events
                z_end = im.shape[2]
            else:
                end_dataset = np.int32((dataset + 1) * number_of_events / number_of_datasets)
                z_end = int(np.floor(im_cut_dim[2] * (dataset + 1)))
            z_begin = int(np.floor(im_cut_dim[2] * dataset))
            adjust_coef_cut[dataset] = np.ascontiguousarray(adjust_coef[:, :, z_begin:z_end], dtype=np.float32)
            sensivity_cut[dataset] = np.ascontiguousarray(sensivity_matrix[:, :, z_begin:z_end], dtype=np.byte)
            A_cut[dataset] = np.ascontiguousarray(A[:, :, z_begin:z_end], dtype=np.int32)
            B_cut[dataset] = np.ascontiguousarray(B[:, :, z_begin:z_end], dtype=np.int32)
            C_cut[dataset] = np.ascontiguousarray(C[:, :, z_begin:z_end], dtype=np.int32)
            # Cut the dataset: the forward projection splits by number of
            # events, while the backprojection splits by image chunks.
a_cut[dataset] = a[begin_dataset:end_dataset]
b_cut[dataset] = b[begin_dataset:end_dataset]
c_cut[dataset] = c[begin_dataset:end_dataset]
d_cut[dataset] = d[begin_dataset:end_dataset]
a_normal_cut[dataset] = a_normal[begin_dataset:end_dataset]
b_normal_cut[dataset] = b_normal[begin_dataset:end_dataset]
c_normal_cut[dataset] = c_normal[begin_dataset:end_dataset]
d_normal_cut[dataset] = d_normal[begin_dataset:end_dataset]
a_cf_cut[dataset] = a_cf[begin_dataset:end_dataset]
b_cf_cut[dataset] = b_cf[begin_dataset:end_dataset]
c_cf_cut[dataset] = c_cf[begin_dataset:end_dataset]
d_cf_cut[dataset] = d_cf[begin_dataset:end_dataset]
sum_vor_cut[dataset] = sum_vor[begin_dataset:end_dataset]
probability_cut[dataset] = probability[begin_dataset:end_dataset]
# Forward
a_cut_gpu[dataset] = cuda.mem_alloc(a_cut[dataset].size * a_cut[dataset].dtype.itemsize)
b_cut_gpu[dataset] = cuda.mem_alloc(b_cut[dataset].size * b_cut[dataset].dtype.itemsize)
c_cut_gpu[dataset] = cuda.mem_alloc(c_cut[dataset].size * c_cut[dataset].dtype.itemsize)
d_cut_gpu[dataset] = cuda.mem_alloc(d_cut[dataset].size * d_cut[dataset].dtype.itemsize)
a_cut_normal_gpu[dataset] = cuda.mem_alloc(
a_normal_cut[dataset].size * a_normal_cut[dataset].dtype.itemsize)
b_cut_normal_gpu[dataset] = cuda.mem_alloc(
b_normal_cut[dataset].size * b_normal_cut[dataset].dtype.itemsize)
c_cut_normal_gpu[dataset] = cuda.mem_alloc(
c_normal_cut[dataset].size * c_normal_cut[dataset].dtype.itemsize)
d_cut_normal_gpu[dataset] = cuda.mem_alloc(
d_normal_cut[dataset].size * d_normal_cut[dataset].dtype.itemsize)
a_cf_cut_gpu[dataset] = cuda.mem_alloc(a_cf_cut[dataset].size * a_cf_cut[dataset].dtype.itemsize)
b_cf_cut_gpu[dataset] = cuda.mem_alloc(b_cf_cut[dataset].size * b_cf_cut[dataset].dtype.itemsize)
c_cf_cut_gpu[dataset] = cuda.mem_alloc(c_cf_cut[dataset].size * c_cf_cut[dataset].dtype.itemsize)
d_cf_cut_gpu[dataset] = cuda.mem_alloc(d_cf_cut[dataset].size * d_cf_cut[dataset].dtype.itemsize)
sum_vor_gpu[dataset] = cuda.mem_alloc(sum_vor_cut[dataset].size * sum_vor_cut[dataset].dtype.itemsize)
probability_gpu[dataset] = cuda.mem_alloc(
probability_cut[dataset].size * probability_cut[dataset].dtype.itemsize)
cuda.memcpy_htod_async(sum_vor_gpu[dataset], sum_vor_cut[dataset])
cuda.memcpy_htod_async(probability_gpu[dataset], probability_cut[dataset])
cuda.memcpy_htod_async(a_cut_gpu[dataset], a_cut[dataset])
cuda.memcpy_htod_async(b_cut_gpu[dataset], b_cut[dataset])
cuda.memcpy_htod_async(c_cut_gpu[dataset], c_cut[dataset])
cuda.memcpy_htod_async(d_cut_gpu[dataset], d_cut[dataset])
cuda.memcpy_htod_async(a_cut_normal_gpu[dataset], a_normal_cut[dataset])
cuda.memcpy_htod_async(b_cut_normal_gpu[dataset], b_normal_cut[dataset])
cuda.memcpy_htod_async(c_cut_normal_gpu[dataset], c_normal_cut[dataset])
cuda.memcpy_htod_async(d_cut_normal_gpu[dataset], d_normal_cut[dataset])
cuda.memcpy_htod_async(a_cf_cut_gpu[dataset], a_cf_cut[dataset])
cuda.memcpy_htod_async(b_cf_cut_gpu[dataset], b_cf_cut[dataset])
cuda.memcpy_htod_async(c_cf_cut_gpu[dataset], c_cf_cut[dataset])
cuda.memcpy_htod_async(d_cf_cut_gpu[dataset], d_cf_cut[dataset])
# Backprojection
adjust_coef_gpu[dataset] = cuda.mem_alloc(
adjust_coef_cut[dataset].size * adjust_coef_cut[dataset].dtype.itemsize)
sensivity_gpu[dataset] = cuda.mem_alloc(
sensivity_cut[dataset].size * sensivity_cut[dataset].dtype.itemsize)
A_cut_gpu[dataset] = cuda.mem_alloc(
A_cut[dataset].size * A_cut[dataset].dtype.itemsize)
B_cut_gpu[dataset] = cuda.mem_alloc(
B_cut[dataset].size * B_cut[dataset].dtype.itemsize)
C_cut_gpu[dataset] = cuda.mem_alloc(
C_cut[dataset].size * C_cut[dataset].dtype.itemsize)
cuda.memcpy_htod_async(adjust_coef_gpu[dataset], adjust_coef_cut[dataset])
cuda.memcpy_htod_async(sensivity_gpu[dataset], sensivity_cut[dataset])
cuda.memcpy_htod_async(A_cut_gpu[dataset], A_cut[dataset])
cuda.memcpy_htod_async(B_cut_gpu[dataset], B_cut[dataset])
cuda.memcpy_htod_async(C_cut_gpu[dataset], C_cut[dataset])
free, total = cuda.mem_get_info()
print('%.1f %% of device memory is free.' % ((free / float(total)) * 100))
# -------------OSEM---------
it = self.number_of_iterations
subsets = self.number_of_subsets
print('Number events for reconstruction: {}'.format(number_of_events))
# total_pixels_per_event = weight * height * depth
# number_of_pixel_calculated_per_second = 3E9
# time_to_prevent_watchdog = 1.8 # seconds
# number_events_cutted_by_watchdog = np.int32(
# (number_of_pixel_calculated_per_second * time_to_prevent_watchdog) / total_pixels_per_event)
# propability calculation
# begin_event = np.int32(0)
# end_event = np.int32(number_of_events / subsets)
# number_of_events_subset = np.int32(end_event - begin_event)
# for dataset in range(number_of_datasets):
#
# if dataset == number_of_datasets:
# begin_dataset = np.int32(dataset * number_of_events_subset / number_of_datasets)
# end_dataset = number_of_events_subset
# else:
# begin_dataset = np.int32(dataset * number_of_events_subset / number_of_datasets)
# end_dataset = np.int32((dataset + 1) * number_of_events_subset / number_of_datasets)
#
# threadsperblock = (512, 1, 1)
# blockspergrid_x = int(math.ceil(((end_dataset - begin_dataset)) / threadsperblock[0]))
# blockspergrid_y = int(math.ceil(1 / threadsperblock[1]))
# blockspergrid_z = int(math.ceil(1 / threadsperblock[2]))
# blockspergrid = (blockspergrid_x, blockspergrid_y, blockspergrid_z)
# event[dataset]['kernel_begin'].record(stream[dataset])
#
# func_prob = mod_probability_shared_mem.get_function("probability")
# func_prob(weight, height, depth, half_crystal_pitch_xy, half_crystal_pitch_z,
# number_of_events, begin_dataset, end_dataset, a_cut_gpu[dataset], a_cut_normal_gpu[dataset],
# b_cut_gpu[dataset], b_cut_normal_gpu[dataset], c_cut_gpu[dataset], c_cut_normal_gpu[dataset],
# d_cut_gpu[dataset],
# d_cut_normal_gpu[dataset], A_gpu, B_gpu, C_gpu, probability_gpu[dataset],
# im_gpu,
# block=threadsperblock,
# grid=blockspergrid,
# stream=stream[dataset])
#
# # Sincronization of streams
# for dataset in range(number_of_datasets): # Commenting out this line should break concurrency.
# event[dataset]['kernel_end'].record(stream[dataset])
#
# # Transfering data from Optimizer
# for dataset in range(number_of_datasets):
# begin_dataset = np.int32(dataset * number_of_events / number_of_datasets)
# end_dataset = np.int32((dataset + 1) * number_of_events / number_of_datasets)
# cuda.memcpy_dtoh_async(probability[begin_dataset:end_dataset], probability_gpu[dataset])
# cuda.memcpy_htod_async(probability_t_gpu, probability)
for i in range(it):
print('Iteration number: {}\n----------------'.format(i + 1))
begin_event = np.int32(0)
end_event = np.int32(number_of_events / subsets)
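            # NOTE: begin_event/end_event are not advanced between subsets,
            # so every subset reprocesses the first
            # number_of_events / subsets events; any subset rotation is
            # assumed to happen upstream.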
for sb in range(subsets):
print('Subset number: {}'.format(sb))
number_of_events_subset = np.int32(end_event - begin_event)
tic = time.time()
                # Forward-projection cycle
                for dataset in range(number_of_datasets):
                    begin_dataset = np.int32(dataset * number_of_events_subset / number_of_datasets)
                    if dataset == number_of_datasets - 1:
                        end_dataset = number_of_events_subset
                    else:
                        end_dataset = np.int32((dataset + 1) * number_of_events_subset / number_of_datasets)
threadsperblock = (256, 1, 1)
blockspergrid_x = int(math.ceil(((end_dataset - begin_dataset)) / threadsperblock[0]))
blockspergrid_y = int(math.ceil(1 / threadsperblock[1]))
blockspergrid_z = int(math.ceil(1 / threadsperblock[2]))
blockspergrid = (blockspergrid_x, blockspergrid_y, blockspergrid_z)
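                    # Launch configuration: one thread per event in this chunk.
                    # As a sketch, 1,000,000 events at 256 threads per block
                    # give ceil(1e6 / 256) = 3907 blocks along x.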
                    event[dataset]['kernel_begin'].record(stream[dataset])
                    func_forward = self.mod_forward_projection_shared_mem.get_function("forward_projection")
                    func_forward(width, height, depth, half_crystal_pitch_xy, half_crystal_pitch_z,
                                 distance_between_array_pixel,
                                 number_of_events, begin_dataset, end_dataset,
                                 a_cut_gpu[dataset], a_cut_normal_gpu[dataset], a_cf_cut_gpu[dataset],
                                 b_cut_gpu[dataset], b_cut_normal_gpu[dataset], b_cf_cut_gpu[dataset],
                                 c_cut_gpu[dataset], c_cut_normal_gpu[dataset], c_cf_cut_gpu[dataset],
                                 d_cut_gpu[dataset], d_cut_normal_gpu[dataset], d_cf_cut_gpu[dataset],
                                 A_gpu, B_gpu, C_gpu,
                                 sum_vor_gpu[dataset], sensivity_matrix_gpu, im_gpu, probability_gpu[dataset],
                                 block=threadsperblock,
                                 grid=blockspergrid,
                                 stream=stream[dataset])
                # Synchronization of streams
                for dataset in range(number_of_datasets):  # Commenting out this line should break concurrency.
                    event[dataset]['kernel_end'].record(stream[dataset])
                # Transferring data from the device
for dataset in range(number_of_datasets):
begin_dataset = np.int32(dataset * number_of_events / number_of_datasets)
end_dataset = np.int32((dataset + 1) * number_of_events / number_of_datasets)
cuda.memcpy_dtoh_async(sum_vor[begin_dataset:end_dataset], sum_vor_gpu[dataset])
toc = time.time()
print('Time part Forward Projection {} : {}'.format(1, toc - tic))
# number_of_datasets = np.int32(2)
teste = np.copy(sum_vor)
# sum_vor=np.ascontiguousarray(np.ones((self.a.shape)), dtype=np.float32)
print('SUM VOR: {}'.format(np.sum(teste)))
cuda.memcpy_htod_async(sum_vor_t_gpu, sum_vor)
# ------------BACKPROJECTION-----------
                for dataset in range(number_of_datasets):
                    dataset = np.int32(dataset)
                    begin_dataset = np.int32(dataset * number_of_events_subset / number_of_datasets)
                    if dataset == number_of_datasets - 1:
                        end_dataset = number_of_events_subset
                    else:
                        end_dataset = np.int32((dataset + 1) * number_of_events_subset / number_of_datasets)
# begin_dataset = np.int32(0)
# end_dataset = np.int32(number_of_events)
event[dataset]['kernel_begin'].record(stream[dataset])
                    width_cutted = np.int32(adjust_coef_cut[dataset].shape[0])
                    height_cutted = np.int32(adjust_coef_cut[dataset].shape[1])
                    depth_cutted = np.int32(adjust_coef_cut[dataset].shape[2])
                    threadsperblock = (int(A.shape[0]), 1, 1)
                    blockspergrid_x = int(math.ceil(adjust_coef_cut[dataset].shape[0] / threadsperblock[0]))
                    blockspergrid_y = int(math.ceil(adjust_coef_cut[dataset].shape[1] / threadsperblock[1]))
                    blockspergrid_z = int(math.ceil(adjust_coef_cut[dataset].shape[2] / threadsperblock[2]))
                    blockspergrid = (blockspergrid_x, blockspergrid_y, blockspergrid_z)
                    shared_memory = threadsperblock[0] * threadsperblock[1] * threadsperblock[2] * 4
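                    # Dynamic shared memory: 4 bytes (one float32) per thread,
                    # matching the "extern __shared__ float adjust_coef_shared[]"
                    # declaration in the backprojection kernel.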
                    func_backward = self.mod_backward_projection_shared_mem.get_function("backprojection")
                    func_backward(dataset, width_cutted, height_cutted, depth_cutted, half_crystal_pitch_xy,
                                  half_crystal_pitch_z, distance_between_array_pixel,
                                  number_of_events, begin_dataset, end_dataset, a_gpu, a_normal_gpu, a_cf_gpu,
                                  b_gpu, b_normal_gpu, b_cf_gpu, c_gpu, c_normal_gpu, c_cf_gpu, d_gpu,
                                  d_normal_gpu, d_cf_gpu, A_cut_gpu[dataset], B_cut_gpu[dataset], C_cut_gpu[dataset],
                                  adjust_coef_gpu[dataset],
                                  sum_vor_t_gpu, sensivity_gpu[dataset], probability_t_gpu, time_factor_gpu,
                                  block=threadsperblock,
                                  grid=blockspergrid,
                                  shared=shared_memory,
                                  stream=stream[dataset])
for dataset in range(number_of_datasets): # Commenting out this line should break concurrency.
event[dataset]['kernel_end'].record(stream[dataset])
                for dataset in range(number_of_datasets):
                    cuda.memcpy_dtoh_async(adjust_coef_cut[dataset], adjust_coef_gpu[dataset])
                    z_begin = int(np.floor(im_cut_dim[2] * dataset))
                    z_end = adjust_coef.shape[2] if dataset == number_of_datasets - 1 \
                        else int(np.floor(im_cut_dim[2] * (dataset + 1)))
                    adjust_coef[:, :, z_begin:z_end] = adjust_coef_cut[dataset]
print('Time part Backward Projection {} : {}'.format(1, time.time() - toc))
# Image Normalization
# if i ==0:
# norm_im=np.copy(adjust_coef)
# norm_im=norm_im/np.max(norm_im)
# norm_im[norm_im == 0] = np.min(norm_im[np.nonzero(norm_im)])
# normalization_matrix = gaussian_filter(normalization_matrix, 0.5)
# im_med = np.load("C:\\Users\\pedro.encarnacao\\OneDrive - Universidade de Aveiro\\PhD\\Reconstrução\\NAF+FDG\\Easypet Scan 05 Aug 2019 - 14h 36m 33s\\static_image\\Easypet Scan 05 Aug 2019 - 14h 36m 33s mlem.npy")
# self.algorithm = "LM-MRP"
if self.algorithm == "LM-MRP":
beta = self.algorithm_options[0]
kernel_filter_size = self.algorithm_options[1]
im_med = median_filter(im, kernel_filter_size)
penalized_term = np.copy(im)
penalized_term[im_med != 0] = 1 + beta * (im[im_med != 0] - im_med[im_med != 0]) / im_med[
im_med != 0]
penalized_term = np.ascontiguousarray(penalized_term, dtype=np.float32)
if self.algorithm == "MAP":
beta = 0.5
im_map = np.load(
"C:\\Users\\pedro.encarnacao\\OneDrive - Universidade de Aveiro\\PhD\\Reconstrução\\NAF+FDG\\Easypet Scan 05 Aug 2019 - 14h 36m 33s\\static_image\\Easypet Scan 05 Aug 2019 - 14h 36m 33s mlem.npy")
im[normalization_matrix != 0] = im[normalization_matrix != 0] * adjust_coef[
normalization_matrix != 0] / ((normalization_matrix[normalization_matrix != 0]) /
self.attenuation_matrix[normalization_matrix != 0])
im[normalization_matrix == 0] = 0
if self.algorithm == "LM-MRP":
im[penalized_term != 0] = im[penalized_term != 0] / penalized_term[penalized_term != 0]
# im = fourier_gaussian(im, sigma=0.2)
# im = gaussian_filter(im, 0.4)
print('SUM IMAGE: {}'.format(np.sum(im)))
im = np.ascontiguousarray(im, dtype=np.float32)
# im = im * adjust_coef / sensivity_matrix[np.nonzero(sensivity_matrix)]
cuda.memcpy_htod_async(im_gpu, im)
# Clearing variables
sum_vor = np.ascontiguousarray(
np.zeros(self.a.shape, dtype=np.float32))
adjust_coef = np.ascontiguousarray(
np.zeros((self.number_of_pixels_x, self.number_of_pixels_y, self.number_of_pixels_z),
dtype=np.float32))
                for dataset in range(number_of_datasets):
                    begin_dataset = np.int32(dataset * number_of_events / number_of_datasets)
                    if dataset == number_of_datasets - 1:
                        end_dataset = number_of_events
                        z_end = adjust_coef.shape[2]
                    else:
                        end_dataset = np.int32((dataset + 1) * number_of_events / number_of_datasets)
                        z_end = int(np.floor(im_cut_dim[2] * (dataset + 1)))
                    z_begin = int(np.floor(im_cut_dim[2] * dataset))
                    adjust_coef_cut[dataset] = np.ascontiguousarray(adjust_coef[:, :, z_begin:z_end],
                                                                    dtype=np.float32)
                    sum_vor_cut[dataset] = sum_vor[begin_dataset:end_dataset]
                    cuda.memcpy_htod_async(sum_vor_gpu[dataset], sum_vor_cut[dataset])
                    cuda.memcpy_htod_async(adjust_coef_gpu[dataset], adjust_coef_cut[dataset])
if self.saved_image_by_iteration:
self._save_image_by_it(im, i, sb)
if self.signals_interface is not None:
self.signals_interface.trigger_update_label_reconstruction_status.emit(
"{}: Iteration {}".format(self.current_info_step, i + 1))
                    self.signals_interface.trigger_progress_reconstruction_partial.emit(
                        int(np.round(100 * (i * subsets + sb + 1) / (it * subsets), 0)))
return im * subsets