Source code for toor.Optimizer.gpu_shared_memory

import numpy as np
from pycuda.compiler import SourceModule
import math
import time
import os
from scipy.ndimage import median_filter  # used by the LM-MRP penalty in the OSEM loop


class GPUSharedMemorySingleKernel:
    """GPU projector pair (forward and backprojection) using CUDA shared memory via PyCUDA."""

    def __init__(self, EM_obj=None):
        self.EM_obj = EM_obj
        self.mod_forward_projection_shared_mem = None
        self.mod_backward_projection_shared_mem = None
        self.a = EM_obj.a
        self.a_normal = EM_obj.a_normal
        self.a_cf = EM_obj.a_cf
        self.b = EM_obj.b
        self.b_normal = EM_obj.b_normal
        self.b_cf = EM_obj.b_cf
        self.c = EM_obj.c
        self.c_normal = EM_obj.c_normal
        self.c_cf = EM_obj.c_cf
        self.d = EM_obj.d
        self.d_normal = EM_obj.d_normal
        self.d_cf = EM_obj.d_cf
        self.adjust_coef = EM_obj.adjust_coef
        self.im = EM_obj.im
        self.half_crystal_pitch_xy = EM_obj.half_crystal_pitch_xy
        self.fw_source_model_file = os.path.join(
            os.path.dirname(os.path.abspath(__file__)), "PET", "fw_single_kernel.c")
        with open(self.fw_source_model_file) as f:
            self.fw_source_model = f.read()
        self.bw_source_model_file = os.path.join(
            os.path.dirname(os.path.abspath(__file__)), "PET", "bw_single_kernel.c")
        # Read the backward kernel from its own file (the original code reopened
        # fw_source_model_file here by mistake).
        with open(self.bw_source_model_file) as f:
            self.bw_source_model = f.read()

    def _load_machine_C_code(self):
        # SourceModule expects the kernel source as a string, hence the .read()
        # calls above rather than bare file objects.
        self.mod_forward_projection_shared_mem = SourceModule(self.fw_source_model)
        self.mod_backward_projection_shared_mem = SourceModule(self.bw_source_model)
        # Former inline version of the forward kernel, kept commented out for
        # reference (now maintained in PET/fw_single_kernel.c):
        # self.mod_forward_projection_shared_mem = SourceModule("""
        # __global__ void forward_projection
        # (const int n, const int m, const int p, const float crystal_pitch_XY,
        #  const float crystal_pitch_Z, const float distance_between_array_pixel,
        #  int number_of_events, const int begin_event_gpu_limitation,
        #  const int end_event_gpu_limitation, float *a, float *a_normal, float *a_cf,
        #  float *b, float *b_normal, float *b_cf, float *c, float *c_normal, float *c_cf,
        #  float *d, float *d_normal, float *d_cf, const int *A, const int *B, const int *C,
        #  float *sum_vor, const char *sensitivity_matrix, const float *im_old,
        #  const float *probability)
        # {
        #     const int shared_memory_size = 256;
        #     __shared__ float a_shared[shared_memory_size];
        #     __shared__ float b_shared[shared_memory_size];
        #     __shared__ float c_shared[shared_memory_size];
        #     __shared__ float d_shared[shared_memory_size];
        #     __shared__ float a_normal_shared[shared_memory_size];
        #     __shared__ float b_normal_shared[shared_memory_size];
        #     __shared__ float c_normal_shared[shared_memory_size];
        #     __shared__ float d_normal_shared[shared_memory_size];
        #     __shared__ float a_cf_shared[shared_memory_size];
        #     __shared__ float b_cf_shared[shared_memory_size];
        #     __shared__ float c_cf_shared[shared_memory_size];
        #     __shared__ float d_cf_shared[shared_memory_size];
        #     __shared__ float sum_vor_shared[shared_memory_size];
        #
        #     int threadId = blockIdx.x * blockDim.x + threadIdx.x;
        #     int e;
        #     float d2;
        #     float d2_normal;
        #     float d2_cf;
        #     float value;
        #     float value_normal;
        #     float value_cf;
        #     float sum_vor_temp;
        #     int index;
        #     const float total_length = sqrt(4*crystal_pitch_Z*crystal_pitch_Z
        #                                     + 4*crystal_pitch_XY*crystal_pitch_XY
        #                                     + distance_between_array_pixel*distance_between_array_pixel/4);
        #     const int number_events_max = end_event_gpu_limitation - begin_event_gpu_limitation;
        #     const float error_pixel = 0.0000f;
        #
        #     if (threadIdx.x > shared_memory_size)
        #         return;
        #     if (threadId >= number_events_max)
        #         return;
        #
        #     __syncthreads();
        #     e = threadId;
        #     int e_m = threadIdx.x;
        #     a_shared[e_m] = a[e];
        #     b_shared[e_m] = b[e];
        #     c_shared[e_m] = c[e];
        #     d_shared[e_m] = d[e];
        #     a_normal_shared[e_m] = a_normal[e];
        #     b_normal_shared[e_m] = b_normal[e];
        #     c_normal_shared[e_m] = c_normal[e];
        #     d_normal_shared[e_m] = d_normal[e];
        #     a_cf_shared[e_m] = a_cf[e];
        #     b_cf_shared[e_m] = b_cf[e];
        #     c_cf_shared[e_m] = c_cf[e];
        #     d_cf_shared[e_m] = d_cf[e];
        #     sum_vor_shared[e_m] = sum_vor[e];
        #
        #     for (int k = 0; k < p; k++) {
        #         for (int j = 0; j < m; j++) {
        #             for (int l = 0; l < n; l++) {
        #                 /* index = k + j*p + l*p*m; */
        #                 index = l + j*n + k*m*n;
        #                 if (sensitivity_matrix[index] != 0) {
        #                     value = a_shared[e_m]*A[index] + b_shared[e_m]*B[index]
        #                             + c_shared[e_m]*C[index] - d_shared[e_m];
        #                     d2 = error_pixel + crystal_pitch_Z
        #                          * sqrt(a_shared[e_m]*a_shared[e_m] + b_shared[e_m]*b_shared[e_m]
        #                                 + c_shared[e_m]*c_shared[e_m]);
        #                     if (value < d2 && value >= -d2) {
        #                         value_normal = a_normal_shared[e_m]*A[index] + b_normal_shared[e_m]*B[index]
        #                                        + c_normal_shared[e_m]*C[index] - d_normal_shared[e_m];
        #                         d2_normal = error_pixel + crystal_pitch_XY
        #                                     * sqrt(a_normal_shared[e_m]*a_normal_shared[e_m]
        #                                            + b_normal_shared[e_m]*b_normal_shared[e_m]
        #                                            + c_normal_shared[e_m]*c_normal_shared[e_m]);
        #                         if (value_normal < d2_normal && value_normal >= -d2_normal) {
        #                             value_cf = a_cf_shared[e_m]*A[index] + b_cf_shared[e_m]*B[index]
        #                                        + c_cf_shared[e_m]*C[index] - d_cf_shared[e_m];
        #                             d2_cf = (distance_between_array_pixel/2)
        #                                     * sqrt(a_cf_shared[e_m]*a_cf_shared[e_m]
        #                                            + b_cf_shared[e_m]*b_cf_shared[e_m]
        #                                            + c_cf_shared[e_m]*c_cf_shared[e_m]);
        #                             if ((value_cf > -d2_cf) && value_cf <= d2_cf) {
        #                                 sum_vor_shared[e_m] += im_old[index]
        #                                     * (1 - sqrt(value*value + value_normal*value_normal + value_cf*value_cf)
        #                                            / sqrt(d2*d2 + d2_normal*d2_normal + d2_cf*d2_cf));
        #                             }
        #                             /* Earlier weighting experiments (unused): distance-based
        #                                weights against total_length, sensitivity-weighted sums,
        #                                and the plain sum_vor_shared[e_m] += im_old[index]; */
        #                         }
        #                     }
        #                 }
        #             }
        #         }
        #     }
        #     __syncthreads();
        #     sum_vor[e] = sum_vor_shared[e_m];
        # }""")
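        # Geometry of the test above: for each event, a voxel with coordinates
        # (A, B, C) is accepted into the tube of response when, for each of the
        # three planes with coefficients (a, b, c, d) and half-width h,
        #     |a*A + b*B + c*C - d| <= h * sqrt(a*a + b*b + c*c),
        # i.e. the point-to-plane distance is below the corresponding half
        # crystal pitch. An accepted voxel is weighted by 1 - r/R, where r is
        # its distance to the tube axis and R = sqrt(d2^2 + d2_normal^2 + d2_cf^2).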
        # Former inline version of the backprojection kernel, likewise kept for
        # reference (now maintained in PET/bw_single_kernel.c):
        # self.mod_backward_projection_shared_mem = SourceModule("""
        # __global__ void backprojection
        # (int dataset_number, int n, int m, int p, const float crystal_pitch_XY,
        #  const float crystal_pitch_Z, const float distance_between_array_pixel,
        #  int number_of_events, const int begin_event_gpu_limitation,
        #  const int end_event_gpu_limitation, float *a, float *a_normal, float *a_cf,
        #  float *b, float *b_normal, float *b_cf, float *c, float *c_normal, float *c_cf,
        #  float *d, float *d_normal, float *d_cf, const int *A, const int *B, const int *C,
        #  float *adjust_coef, float *sum_vor, char *sensitivity_matrix,
        #  const float *probability, float *time_factor)
        # {
        #     /* A statically sized variant
        #        (const int shared_memory_size = 143;
        #         __shared__ float adjust_coef_shared[shared_memory_size];)
        #        was replaced by dynamically sized shared memory: */
        #     extern __shared__ float adjust_coef_shared[];
        #     int blockId = blockIdx.x + blockIdx.y*gridDim.x + gridDim.x*gridDim.y*blockIdx.z;
        #     int idt = blockId * blockDim.x + threadIdx.x;
        #
        #     float d2;
        #     float d2_normal;
        #     float d2_cf;
        #     float normal_value;
        #     float value;
        #     float value_cf;
        #     float error_pixel = 0.000f;
        #     float declive;
        #     const float total_length = sqrt(4*crystal_pitch_Z*crystal_pitch_Z
        #                                     + 4*crystal_pitch_XY*crystal_pitch_XY
        #                                     + distance_between_array_pixel*distance_between_array_pixel/4);
        #     int i_s = threadIdx.x;
        #
        #     if (idt >= n*m*p)
        #         return;
        #     if (i_s > n)
        #         return;
        #
        #     __syncthreads();
        #     adjust_coef_shared[i_s] = adjust_coef[idt];
        #
        #     for (int e = begin_event_gpu_limitation; e < end_event_gpu_limitation; e++) {
        #         if (sensitivity_matrix[idt] != 0) {
        #             value = a[e]*A[idt] + b[e]*B[idt] + c[e]*C[idt] - d[e];
        #             d2 = error_pixel + crystal_pitch_Z * sqrt(a[e]*a[e] + b[e]*b[e] + c[e]*c[e]);
        #             if (value < d2 && value >= -d2) {
        #                 normal_value = a_normal[e]*A[idt] + b_normal[e]*B[idt]
        #                                + c_normal[e]*C[idt] - d_normal[e];
        #                 d2_normal = error_pixel + crystal_pitch_XY
        #                             * sqrt(a_normal[e]*a_normal[e] + b_normal[e]*b_normal[e]
        #                                    + c_normal[e]*c_normal[e]);
        #                 if (normal_value < d2_normal && normal_value >= -d2_normal) {
        #                     value_cf = a_cf[e]*A[idt] + b_cf[e]*B[idt] + c_cf[e]*C[idt] - d_cf[e];
        #                     d2_cf = distance_between_array_pixel/2
        #                             * sqrt(a_cf[e]*a_cf[e] + b_cf[e]*b_cf[e] + c_cf[e]*c_cf[e]);
        #                     if ((value_cf > -d2_cf) && value_cf <= d2_cf) {
        #                         adjust_coef_shared[i_s] +=
        #                             (1 - sqrt(value*value + normal_value*normal_value + value_cf*value_cf)
        #                                  / sqrt(d2*d2 + d2_normal*d2_normal + d2_cf*d2_cf))
        #                             / (sum_vor[e] * time_factor[e]);
        #                     }
        #                 }
        #             }
        #         }
        #     }
        #     adjust_coef[idt] = adjust_coef_shared[i_s];
        #     __syncthreads();
        # }
        # """)
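        # Shared-memory note: the backprojection kernel declares
        #     extern __shared__ float adjust_coef_shared[];
        # so its size is not fixed at compile time; it is supplied at launch
        # through the 'shared=' argument (in bytes). One float per thread means
        # shared = 4 * blockDim.x, which is what the launch code below passes.
        # Also note that threads failing the bounds checks return before the
        # __syncthreads() barrier; that is only safe when a whole block exits
        # together.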
    def _vor_design_gpu_shared_memory(self):
        print('Optimizer STARTED')
        # cuda.init()
        cuda = self.EM_obj.cuda_drv
        # device = cuda.Device(0)  # enter your gpu id here
        # ctx = device.make_context()

        # The body below uses several bare names that __init__ does not set.
        # ASSUMPTION: they are exposed by the EM_obj driving the reconstruction;
        # adjust the attribute names to match the real object.
        a, b, c, d = self.a, self.b, self.c, self.d
        a_normal, b_normal = self.a_normal, self.b_normal
        c_normal, d_normal = self.c_normal, self.d_normal
        a_cf, b_cf, c_cf, d_cf = self.a_cf, self.b_cf, self.c_cf, self.d_cf
        im = self.im
        adjust_coef = self.adjust_coef
        half_crystal_pitch_xy = self.half_crystal_pitch_xy
        half_crystal_pitch_z = self.EM_obj.half_crystal_pitch_z
        A, B, C = self.EM_obj.A, self.EM_obj.B, self.EM_obj.C
        sum_vor = self.EM_obj.sum_vor
        sensivity_matrix = self.EM_obj.sensivity_matrix
        time_factor = self.EM_obj.time_factor
        normalization_matrix = self.EM_obj.normalization_matrix

        number_of_events = np.int32(len(a))
        weight = np.int32(A.shape[0])
        height = np.int32(A.shape[1])
        depth = np.int32(A.shape[2])
        print('Image size: {}, {}, {}'.format(weight, height, depth))
        print('BACKPROJECTION FUNCTION')
        # probability = self.probability
        probability = np.ascontiguousarray(
            np.ones((self.number_of_pixels_x, self.number_of_pixels_y, self.number_of_pixels_z),
                    dtype=np.float32))
        distance_between_array_pixel = self.distance_between_array_pixel

        # SOURCE MODELS (DEVICE CODE)
        self._load_machine_C_code()
        # Kernel signature reminder:
        # float crystal_pitch, int number_of_events, float *a, float *b, float *c,
        # float *d, int *A, int *B, int *C, float *im, float *vector_matrix
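        # PyCUDA passes scalar kernel arguments without inspecting the C
        # signature, so every scalar below is wrapped in a fixed-width NumPy
        # type (np.int32 / np.float32) and every array is kept C-contiguous
        # with a dtype matching its kernel parameter (int*, float*, char*).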
        # Host Code
        number_of_datasets = np.int32(1)  # Number of datasets (and concurrent operations) used.

        # Start concurrency test: record an event as reference point.
        ref = cuda.Event()
        ref.record()
        # Create the streams and events needed for the calculation.
        stream, event = [], []
        marker_names = ['kernel_begin', 'kernel_end']
        # Create the lists that hold the chunks of data.
        A_cut_gpu = [None] * number_of_datasets
        B_cut_gpu = [None] * number_of_datasets
        C_cut_gpu = [None] * number_of_datasets
        A_cut = [None] * number_of_datasets
        B_cut = [None] * number_of_datasets
        C_cut = [None] * number_of_datasets
        a_gpu = [None] * number_of_datasets
        b_gpu = [None] * number_of_datasets
        c_gpu = [None] * number_of_datasets
        d_gpu = [None] * number_of_datasets
        a_normal_gpu = [None] * number_of_datasets
        b_normal_gpu = [None] * number_of_datasets
        c_normal_gpu = [None] * number_of_datasets
        d_normal_gpu = [None] * number_of_datasets
        a_cut = [None] * number_of_datasets
        b_cut = [None] * number_of_datasets
        c_cut = [None] * number_of_datasets
        d_cut = [None] * number_of_datasets
        a_normal_cut = [None] * number_of_datasets
        b_normal_cut = [None] * number_of_datasets
        c_normal_cut = [None] * number_of_datasets
        d_normal_cut = [None] * number_of_datasets
        a_cf_cut = [None] * number_of_datasets
        b_cf_cut = [None] * number_of_datasets
        c_cf_cut = [None] * number_of_datasets
        d_cf_cut = [None] * number_of_datasets
        a_cut_gpu = [None] * number_of_datasets
        b_cut_gpu = [None] * number_of_datasets
        c_cut_gpu = [None] * number_of_datasets
        d_cut_gpu = [None] * number_of_datasets
        a_cut_normal_gpu = [None] * number_of_datasets
        b_cut_normal_gpu = [None] * number_of_datasets
        c_cut_normal_gpu = [None] * number_of_datasets
        d_cut_normal_gpu = [None] * number_of_datasets
        a_cf_cut_gpu = [None] * number_of_datasets
        b_cf_cut_gpu = [None] * number_of_datasets
        c_cf_cut_gpu = [None] * number_of_datasets
        d_cf_cut_gpu = [None] * number_of_datasets
        time_factor_cut_gpu = [None] * number_of_datasets
        sum_vor_gpu = [None] * number_of_datasets
        sum_vor_cut = [None] * number_of_datasets
        probability_cut = [None] * number_of_datasets
        probability_gpu = [None] * number_of_datasets
        adjust_coef_cut = [None] * number_of_datasets
        adjust_coef_gpu = [None] * number_of_datasets
        sensivity_matrix_cut = [None] * number_of_datasets
        sensivity_cut = [None] * number_of_datasets
        sensivity_gpu = [None] * number_of_datasets

        # Stream and event creation.
        for dataset in range(number_of_datasets):
            stream.append(cuda.Stream())
            event.append({name: cuda.Event() for name in marker_names})

        # Memory allocation: variables that need a single (uncut) allocation.
        A_gpu = cuda.mem_alloc(A.size * A.dtype.itemsize)
        B_gpu = cuda.mem_alloc(B.size * B.dtype.itemsize)
        C_gpu = cuda.mem_alloc(C.size * C.dtype.itemsize)
        im_gpu = cuda.mem_alloc(im.size * im.dtype.itemsize)
        sensivity_matrix_gpu = cuda.mem_alloc(sensivity_matrix.size * sensivity_matrix.dtype.itemsize)
        cuda.memcpy_htod_async(A_gpu, A)
        cuda.memcpy_htod_async(B_gpu, B)
        cuda.memcpy_htod_async(C_gpu, C)
        cuda.memcpy_htod_async(im_gpu, im)
        cuda.memcpy_htod_async(sensivity_matrix_gpu, sensivity_matrix)
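        # Note: memcpy_htod_async only overlaps with kernel execution when the
        # host arrays are page-locked (e.g. allocated with
        # cuda.pagelocked_empty); with plain pageable NumPy arrays the copies
        # complete synchronously.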
        a_gpu = cuda.mem_alloc(a.size * a.dtype.itemsize)
        b_gpu = cuda.mem_alloc(b.size * b.dtype.itemsize)
        c_gpu = cuda.mem_alloc(c.size * c.dtype.itemsize)
        d_gpu = cuda.mem_alloc(d.size * d.dtype.itemsize)
        a_normal_gpu = cuda.mem_alloc(a_normal.size * a_normal.dtype.itemsize)
        b_normal_gpu = cuda.mem_alloc(b_normal.size * b_normal.dtype.itemsize)
        c_normal_gpu = cuda.mem_alloc(c_normal.size * c_normal.dtype.itemsize)
        d_normal_gpu = cuda.mem_alloc(d_normal.size * d_normal.dtype.itemsize)
        a_cf_gpu = cuda.mem_alloc(a_cf.size * a_cf.dtype.itemsize)
        b_cf_gpu = cuda.mem_alloc(b_cf.size * b_cf.dtype.itemsize)
        c_cf_gpu = cuda.mem_alloc(c_cf.size * c_cf.dtype.itemsize)
        d_cf_gpu = cuda.mem_alloc(d_cf.size * d_cf.dtype.itemsize)
        sum_vor_t_gpu = cuda.mem_alloc(sum_vor.size * sum_vor.dtype.itemsize)
        probability_t_gpu = cuda.mem_alloc(probability.size * probability.dtype.itemsize)
        time_factor_gpu = cuda.mem_alloc(time_factor.size * time_factor.dtype.itemsize)

        # Transfer memory to the GPU.
        cuda.memcpy_htod_async(a_gpu, a)
        cuda.memcpy_htod_async(b_gpu, b)
        cuda.memcpy_htod_async(c_gpu, c)
        cuda.memcpy_htod_async(d_gpu, d)
        cuda.memcpy_htod_async(a_normal_gpu, a_normal)
        cuda.memcpy_htod_async(b_normal_gpu, b_normal)
        cuda.memcpy_htod_async(c_normal_gpu, c_normal)
        cuda.memcpy_htod_async(d_normal_gpu, d_normal)
        cuda.memcpy_htod_async(a_cf_gpu, a_cf)
        cuda.memcpy_htod_async(b_cf_gpu, b_cf)
        cuda.memcpy_htod_async(c_cf_gpu, c_cf)
        cuda.memcpy_htod_async(d_cf_gpu, d_cf)
        cuda.memcpy_htod_async(time_factor_gpu, time_factor)

        im_cut_dim = [im.shape[0], im.shape[1], int(im.shape[2] / number_of_datasets)]
        # Divide the image into smaller chunks.
        for dataset in range(number_of_datasets):
            if dataset == number_of_datasets:
                # NOTE: this branch is unreachable (dataset < number_of_datasets
                # inside range()); it was probably meant to give the last chunk
                # the remainder, i.e. dataset == number_of_datasets - 1.
                begin_dataset = np.int32(dataset * number_of_events / number_of_datasets)
                end_dataset = number_of_events
                adjust_coef_cut[dataset] = np.ascontiguousarray(
                    adjust_coef[int(np.floor(im_cut_dim[0] * dataset)):adjust_coef.shape[0],
                                int(np.floor(im_cut_dim[1] * dataset)):adjust_coef.shape[1],
                                int(np.floor(im_cut_dim[2] * dataset)):adjust_coef.shape[2]],
                    dtype=np.float32)
                sensivity_cut[dataset] = np.ascontiguousarray(
                    sensivity_matrix[int(np.floor(im_cut_dim[0] * dataset)):sensivity_matrix.shape[0],
                                     int(np.floor(im_cut_dim[1] * dataset)):sensivity_matrix.shape[1],
                                     int(np.floor(im_cut_dim[2] * dataset)):sensivity_matrix.shape[2]],
                    dtype=np.float32)
            else:
                begin_dataset = np.int32(dataset * number_of_events / number_of_datasets)
                end_dataset = np.int32((dataset + 1) * number_of_events / number_of_datasets)
                adjust_coef_cut[dataset] = np.ascontiguousarray(
                    adjust_coef[:, :, int(np.floor(im_cut_dim[2] * dataset)):
                                      int(np.floor(im_cut_dim[2] * (dataset + 1)))],
                    dtype=np.float32)
                sensivity_cut[dataset] = np.ascontiguousarray(
                    sensivity_matrix[:, :, int(np.floor(im_cut_dim[2] * dataset)):
                                           int(np.floor(im_cut_dim[2] * (dataset + 1)))],
                    dtype=np.byte)
                A_cut[dataset] = np.ascontiguousarray(
                    A[:, :, int(np.floor(im_cut_dim[2] * dataset)):
                            int(np.floor(im_cut_dim[2] * (dataset + 1)))], dtype=np.int32)
                B_cut[dataset] = np.ascontiguousarray(
                    B[:, :, int(np.floor(im_cut_dim[2] * dataset)):
                            int(np.floor(im_cut_dim[2] * (dataset + 1)))], dtype=np.int32)
                C_cut[dataset] = np.ascontiguousarray(
                    C[:, :, int(np.floor(im_cut_dim[2] * dataset)):
                            int(np.floor(im_cut_dim[2] * (dataset + 1)))], dtype=np.int32)

            # Cutting the dataset: the forward projection is split by events,
            # the backprojection by slabs of the image.
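            # Contiguity note: slicing a C-ordered array along its last axis
            # produces a non-contiguous view, which is why the image chunks
            # above are rebuilt with np.ascontiguousarray before upload, while
            # the event arrays below are 1-D and can be sliced directly.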
            a_cut[dataset] = a[begin_dataset:end_dataset]
            b_cut[dataset] = b[begin_dataset:end_dataset]
            c_cut[dataset] = c[begin_dataset:end_dataset]
            d_cut[dataset] = d[begin_dataset:end_dataset]
            a_normal_cut[dataset] = a_normal[begin_dataset:end_dataset]
            b_normal_cut[dataset] = b_normal[begin_dataset:end_dataset]
            c_normal_cut[dataset] = c_normal[begin_dataset:end_dataset]
            d_normal_cut[dataset] = d_normal[begin_dataset:end_dataset]
            a_cf_cut[dataset] = a_cf[begin_dataset:end_dataset]
            b_cf_cut[dataset] = b_cf[begin_dataset:end_dataset]
            c_cf_cut[dataset] = c_cf[begin_dataset:end_dataset]
            d_cf_cut[dataset] = d_cf[begin_dataset:end_dataset]
            sum_vor_cut[dataset] = sum_vor[begin_dataset:end_dataset]
            probability_cut[dataset] = probability[begin_dataset:end_dataset]

            # Forward-projection buffers.
            a_cut_gpu[dataset] = cuda.mem_alloc(a_cut[dataset].size * a_cut[dataset].dtype.itemsize)
            b_cut_gpu[dataset] = cuda.mem_alloc(b_cut[dataset].size * b_cut[dataset].dtype.itemsize)
            c_cut_gpu[dataset] = cuda.mem_alloc(c_cut[dataset].size * c_cut[dataset].dtype.itemsize)
            d_cut_gpu[dataset] = cuda.mem_alloc(d_cut[dataset].size * d_cut[dataset].dtype.itemsize)
            a_cut_normal_gpu[dataset] = cuda.mem_alloc(
                a_normal_cut[dataset].size * a_normal_cut[dataset].dtype.itemsize)
            b_cut_normal_gpu[dataset] = cuda.mem_alloc(
                b_normal_cut[dataset].size * b_normal_cut[dataset].dtype.itemsize)
            c_cut_normal_gpu[dataset] = cuda.mem_alloc(
                c_normal_cut[dataset].size * c_normal_cut[dataset].dtype.itemsize)
            d_cut_normal_gpu[dataset] = cuda.mem_alloc(
                d_normal_cut[dataset].size * d_normal_cut[dataset].dtype.itemsize)
            a_cf_cut_gpu[dataset] = cuda.mem_alloc(a_cf_cut[dataset].size * a_cf_cut[dataset].dtype.itemsize)
            b_cf_cut_gpu[dataset] = cuda.mem_alloc(b_cf_cut[dataset].size * b_cf_cut[dataset].dtype.itemsize)
            c_cf_cut_gpu[dataset] = cuda.mem_alloc(c_cf_cut[dataset].size * c_cf_cut[dataset].dtype.itemsize)
            d_cf_cut_gpu[dataset] = cuda.mem_alloc(d_cf_cut[dataset].size * d_cf_cut[dataset].dtype.itemsize)
            sum_vor_gpu[dataset] = cuda.mem_alloc(
                sum_vor_cut[dataset].size * sum_vor_cut[dataset].dtype.itemsize)
            probability_gpu[dataset] = cuda.mem_alloc(
                probability_cut[dataset].size * probability_cut[dataset].dtype.itemsize)

            cuda.memcpy_htod_async(sum_vor_gpu[dataset], sum_vor_cut[dataset])
            cuda.memcpy_htod_async(probability_gpu[dataset], probability_cut[dataset])
            cuda.memcpy_htod_async(a_cut_gpu[dataset], a_cut[dataset])
            cuda.memcpy_htod_async(b_cut_gpu[dataset], b_cut[dataset])
            cuda.memcpy_htod_async(c_cut_gpu[dataset], c_cut[dataset])
            cuda.memcpy_htod_async(d_cut_gpu[dataset], d_cut[dataset])
            cuda.memcpy_htod_async(a_cut_normal_gpu[dataset], a_normal_cut[dataset])
            cuda.memcpy_htod_async(b_cut_normal_gpu[dataset], b_normal_cut[dataset])
            cuda.memcpy_htod_async(c_cut_normal_gpu[dataset], c_normal_cut[dataset])
            cuda.memcpy_htod_async(d_cut_normal_gpu[dataset], d_normal_cut[dataset])
            cuda.memcpy_htod_async(a_cf_cut_gpu[dataset], a_cf_cut[dataset])
            cuda.memcpy_htod_async(b_cf_cut_gpu[dataset], b_cf_cut[dataset])
            cuda.memcpy_htod_async(c_cf_cut_gpu[dataset], c_cf_cut[dataset])
            cuda.memcpy_htod_async(d_cf_cut_gpu[dataset], d_cf_cut[dataset])

            # Backprojection buffers.
            adjust_coef_gpu[dataset] = cuda.mem_alloc(
                adjust_coef_cut[dataset].size * adjust_coef_cut[dataset].dtype.itemsize)
            sensivity_gpu[dataset] = cuda.mem_alloc(
                sensivity_cut[dataset].size * sensivity_cut[dataset].dtype.itemsize)
            A_cut_gpu[dataset] = cuda.mem_alloc(A_cut[dataset].size * A_cut[dataset].dtype.itemsize)
            B_cut_gpu[dataset] = cuda.mem_alloc(B_cut[dataset].size * B_cut[dataset].dtype.itemsize)
            C_cut_gpu[dataset] = cuda.mem_alloc(C_cut[dataset].size * C_cut[dataset].dtype.itemsize)
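            # Each chunk gets its own device buffers and its own stream so
            # that, with number_of_datasets > 1, uploads and kernels from
            # different chunks can overlap on devices that support concurrent
            # copy and compute.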
            cuda.memcpy_htod_async(adjust_coef_gpu[dataset], adjust_coef_cut[dataset])
            cuda.memcpy_htod_async(sensivity_gpu[dataset], sensivity_cut[dataset])
            cuda.memcpy_htod_async(A_cut_gpu[dataset], A_cut[dataset])
            cuda.memcpy_htod_async(B_cut_gpu[dataset], B_cut[dataset])
            cuda.memcpy_htod_async(C_cut_gpu[dataset], C_cut[dataset])

        free, total = cuda.mem_get_info()
        print('%.1f %% of device memory is free.' % ((free / float(total)) * 100))

        # ------------- OSEM -------------
        it = self.number_of_iterations
        subsets = self.number_of_subsets
        print('Number of events for reconstruction: {}'.format(number_of_events))
        # Watchdog sizing, kept for reference:
        # total_pixels_per_event = weight * height * depth
        # number_of_pixel_calculated_per_second = 3E9
        # time_to_prevent_watchdog = 1.8  # seconds
        # number_events_cutted_by_watchdog = np.int32(
        #     (number_of_pixel_calculated_per_second * time_to_prevent_watchdog) / total_pixels_per_event)

        # Probability calculation, kept for reference:
        # begin_event = np.int32(0)
        # end_event = np.int32(number_of_events / subsets)
        # number_of_events_subset = np.int32(end_event - begin_event)
        # for dataset in range(number_of_datasets):
        #     if dataset == number_of_datasets:
        #         begin_dataset = np.int32(dataset * number_of_events_subset / number_of_datasets)
        #         end_dataset = number_of_events_subset
        #     else:
        #         begin_dataset = np.int32(dataset * number_of_events_subset / number_of_datasets)
        #         end_dataset = np.int32((dataset + 1) * number_of_events_subset / number_of_datasets)
        #
        #     threadsperblock = (512, 1, 1)
        #     blockspergrid_x = int(math.ceil((end_dataset - begin_dataset) / threadsperblock[0]))
        #     blockspergrid_y = int(math.ceil(1 / threadsperblock[1]))
        #     blockspergrid_z = int(math.ceil(1 / threadsperblock[2]))
        #     blockspergrid = (blockspergrid_x, blockspergrid_y, blockspergrid_z)
        #     event[dataset]['kernel_begin'].record(stream[dataset])
        #     func_prob = mod_probability_shared_mem.get_function("probability")
        #     func_prob(weight, height, depth, half_crystal_pitch_xy, half_crystal_pitch_z,
        #               number_of_events, begin_dataset, end_dataset,
        #               a_cut_gpu[dataset], a_cut_normal_gpu[dataset],
        #               b_cut_gpu[dataset], b_cut_normal_gpu[dataset],
        #               c_cut_gpu[dataset], c_cut_normal_gpu[dataset],
        #               d_cut_gpu[dataset], d_cut_normal_gpu[dataset],
        #               A_gpu, B_gpu, C_gpu, probability_gpu[dataset], im_gpu,
        #               block=threadsperblock, grid=blockspergrid, stream=stream[dataset])
        #
        # # Synchronisation of streams.
        # for dataset in range(number_of_datasets):
        #     # Commenting out this line should break concurrency.
        #     event[dataset]['kernel_end'].record(stream[dataset])
        #
        # # Transferring data from the GPU.
        # for dataset in range(number_of_datasets):
        #     begin_dataset = np.int32(dataset * number_of_events / number_of_datasets)
        #     end_dataset = np.int32((dataset + 1) * number_of_events / number_of_datasets)
        #     cuda.memcpy_dtoh_async(probability[begin_dataset:end_dataset], probability_gpu[dataset])
        # cuda.memcpy_htod_async(probability_t_gpu, probability)
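        # OSEM loop structure: each of 'it' iterations sweeps all 'subsets'
        # subsets; a subset holds number_of_events / subsets events (e.g. 1e6
        # events and 4 subsets give 250000 events per subset), and each subset
        # is further split across 'number_of_datasets' streams. As written,
        # begin_event / end_event are not advanced between subsets, so every
        # subset re-uses the first number_of_events / subsets events.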
        for i in range(it):
            print('Iteration number: {}\n----------------'.format(i + 1))
            begin_event = np.int32(0)
            end_event = np.int32(number_of_events / subsets)
            for sb in range(subsets):
                print('Subset number: {}'.format(sb))
                number_of_events_subset = np.int32(end_event - begin_event)
                tic = time.time()

                # Forward-projection cycle.
                for dataset in range(number_of_datasets):
                    if dataset == number_of_datasets:  # NOTE: unreachable, see above
                        begin_dataset = np.int32(dataset * number_of_events_subset / number_of_datasets)
                        end_dataset = number_of_events_subset
                    else:
                        begin_dataset = np.int32(dataset * number_of_events_subset / number_of_datasets)
                        end_dataset = np.int32((dataset + 1) * number_of_events_subset / number_of_datasets)

                    threadsperblock = (256, 1, 1)
                    blockspergrid_x = int(math.ceil((end_dataset - begin_dataset) / threadsperblock[0]))
                    blockspergrid_y = int(math.ceil(1 / threadsperblock[1]))
                    blockspergrid_z = int(math.ceil(1 / threadsperblock[2]))
                    blockspergrid = (blockspergrid_x, blockspergrid_y, blockspergrid_z)
                    event[dataset]['kernel_begin'].record(stream[dataset])
                    func_forward = self.mod_forward_projection_shared_mem.get_function("forward_projection")
                    func_forward(weight, height, depth, half_crystal_pitch_xy, half_crystal_pitch_z,
                                 distance_between_array_pixel, number_of_events,
                                 begin_dataset, end_dataset,
                                 a_cut_gpu[dataset], a_cut_normal_gpu[dataset], a_cf_cut_gpu[dataset],
                                 b_cut_gpu[dataset], b_cut_normal_gpu[dataset], b_cf_cut_gpu[dataset],
                                 c_cut_gpu[dataset], c_cut_normal_gpu[dataset], c_cf_cut_gpu[dataset],
                                 d_cut_gpu[dataset], d_cut_normal_gpu[dataset], d_cf_cut_gpu[dataset],
                                 A_gpu, B_gpu, C_gpu, sum_vor_gpu[dataset], sensivity_matrix_gpu,
                                 im_gpu, probability_gpu[dataset],
                                 block=threadsperblock, grid=blockspergrid, stream=stream[dataset])
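                # After these launches each sum_vor[e] holds the forward
                # projection of the current image along event e's tube of
                # response; the backprojection kernel divides by
                # sum_vor[e] * time_factor[e], so zero entries there would
                # surface as divisions by zero on the device.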
                # Synchronisation of streams.
                for dataset in range(number_of_datasets):
                    # Commenting out this line should break concurrency.
                    event[dataset]['kernel_end'].record(stream[dataset])

                # Transferring data from the GPU.
                for dataset in range(number_of_datasets):
                    begin_dataset = np.int32(dataset * number_of_events / number_of_datasets)
                    end_dataset = np.int32((dataset + 1) * number_of_events / number_of_datasets)
                    cuda.memcpy_dtoh_async(sum_vor[begin_dataset:end_dataset], sum_vor_gpu[dataset])

                toc = time.time()
                print('Time part Forward Projection {} : {}'.format(1, toc - tic))
                # number_of_datasets = np.int32(2)
                teste = np.copy(sum_vor)
                # sum_vor = np.ascontiguousarray(np.ones(self.a.shape), dtype=np.float32)
                print('SUM VOR: {}'.format(np.sum(teste)))
                cuda.memcpy_htod_async(sum_vor_t_gpu, sum_vor)

                # ------------ BACKPROJECTION ------------
                for dataset in range(number_of_datasets):
                    dataset = np.int32(dataset)
                    if dataset == number_of_datasets:  # NOTE: unreachable, see above
                        begin_dataset = np.int32(dataset * number_of_events_subset / number_of_datasets)
                        end_dataset = number_of_events_subset
                    else:
                        begin_dataset = np.int32(dataset * number_of_events_subset / number_of_datasets)
                        end_dataset = np.int32((dataset + 1) * number_of_events_subset / number_of_datasets)
                    # begin_dataset = np.int32(0)
                    # end_dataset = np.int32(number_of_events)
                    event[dataset]['kernel_begin'].record(stream[dataset])
                    weight_cutted = np.int32(adjust_coef_cut[dataset].shape[0])
                    height_cutted = np.int32(adjust_coef_cut[dataset].shape[1])
                    depth_cutted = np.int32(adjust_coef_cut[dataset].shape[2])
                    # np.int was removed from NumPy; the built-in int is used instead.
                    threadsperblock = (int(A.shape[0]), 1, 1)
                    blockspergrid_x = int(math.ceil(adjust_coef_cut[dataset].shape[0] / threadsperblock[0]))
                    blockspergrid_y = int(math.ceil(adjust_coef_cut[dataset].shape[1] / threadsperblock[1]))
                    blockspergrid_z = int(math.ceil(adjust_coef_cut[dataset].shape[2] / threadsperblock[2]))
                    blockspergrid = (blockspergrid_x, blockspergrid_y, blockspergrid_z)
                    shared_memory = threadsperblock[0] * threadsperblock[1] * threadsperblock[2] * 4
                    func_backward = self.mod_backward_projection_shared_mem.get_function("backprojection")
                    # func_backward.prepare(dataset, weight_cutted, height_cutted, depth_cutted,
                    #                       half_crystal_pitch_xy, half_crystal_pitch_z,
                    #                       distance_between_array_pixel, number_of_events,
                    #                       begin_dataset, end_dataset, a_gpu, a_normal_gpu, a_cf_gpu,
                    #                       b_gpu, b_normal_gpu, b_cf_gpu, c_gpu, c_normal_gpu, c_cf_gpu,
                    #                       d_gpu, d_normal_gpu, d_cf_gpu, A_cut_gpu[dataset],
                    #                       B_cut_gpu[dataset], C_cut_gpu[dataset], adjust_coef_gpu[dataset],
                    #                       sum_vor_t_gpu, sensivity_gpu[dataset], probability_t_gpu,
                    #                       time_factor_gpu, shared=512)
                    # func.prepared_call(blockspergrid, threadsperblock, <same argument list>)
                    func_backward(dataset, weight_cutted, height_cutted, depth_cutted,
                                  half_crystal_pitch_xy, half_crystal_pitch_z,
                                  distance_between_array_pixel, number_of_events,
                                  begin_dataset, end_dataset,
                                  a_gpu, a_normal_gpu, a_cf_gpu,
                                  b_gpu, b_normal_gpu, b_cf_gpu,
                                  c_gpu, c_normal_gpu, c_cf_gpu,
                                  d_gpu, d_normal_gpu, d_cf_gpu,
                                  A_cut_gpu[dataset], B_cut_gpu[dataset], C_cut_gpu[dataset],
                                  adjust_coef_gpu[dataset], sum_vor_t_gpu, sensivity_gpu[dataset],
                                  probability_t_gpu, time_factor_gpu,
                                  block=threadsperblock, grid=blockspergrid,
                                  shared=int(4 * A.shape[0]),  # == shared_memory bytes computed above
                                  stream=stream[dataset])
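                # Once the backprojected ratios are copied back to the host,
                # the image update below implements the (OS)EM step
                #     im <- im * backprojection(1 / forward) / sensitivity,
                # with adjust_coef holding the backprojected ratios and
                # normalization_matrix / attenuation_matrix playing the role
                # of the sensitivity image.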
                for dataset in range(number_of_datasets):
                    # Commenting out this line should break concurrency.
                    event[dataset]['kernel_end'].record(stream[dataset])

                for dataset in range(number_of_datasets):
                    cuda.memcpy_dtoh_async(adjust_coef_cut[dataset], adjust_coef_gpu[dataset])
                    adjust_coef[:, :, int(np.floor(im_cut_dim[2] * dataset)):
                                      int(np.floor(im_cut_dim[2] * (dataset + 1)))] = \
                        adjust_coef_cut[dataset]

                print('Time part Backward Projection {} : {}'.format(1, time.time() - toc))

                # Image normalization.
                # if i == 0:
                #     norm_im = np.copy(adjust_coef)
                #     norm_im = norm_im / np.max(norm_im)
                #     norm_im[norm_im == 0] = np.min(norm_im[np.nonzero(norm_im)])
                # normalization_matrix = gaussian_filter(normalization_matrix, 0.5)
                # im_med = np.load("C:\\Users\\pedro.encarnacao\\OneDrive - Universidade de Aveiro\\PhD\\Reconstrução\\NAF+FDG\\Easypet Scan 05 Aug 2019 - 14h 36m 33s\\static_image\\Easypet Scan 05 Aug 2019 - 14h 36m 33s mlem.npy")
                # self.algorithm = "LM-MRP"
                if self.algorithm == "LM-MRP":
                    beta = self.algorithm_options[0]
                    kernel_filter_size = self.algorithm_options[1]
                    im_med = median_filter(im, kernel_filter_size)
                    penalized_term = np.copy(im)
                    penalized_term[im_med != 0] = 1 + beta * (im[im_med != 0] - im_med[im_med != 0]) \
                        / im_med[im_med != 0]
                    penalized_term = np.ascontiguousarray(penalized_term, dtype=np.float32)

                if self.algorithm == "MAP":
                    beta = 0.5
                    # Loaded prior image (not used further as written).
                    im_map = np.load(
                        "C:\\Users\\pedro.encarnacao\\OneDrive - Universidade de Aveiro\\PhD\\Reconstrução\\NAF+FDG\\Easypet Scan 05 Aug 2019 - 14h 36m 33s\\static_image\\Easypet Scan 05 Aug 2019 - 14h 36m 33s mlem.npy")

                im[normalization_matrix != 0] = im[normalization_matrix != 0] \
                    * adjust_coef[normalization_matrix != 0] \
                    / ((normalization_matrix[normalization_matrix != 0])
                       / self.attenuation_matrix[normalization_matrix != 0])
                im[normalization_matrix == 0] = 0
                if self.algorithm == "LM-MRP":
                    im[penalized_term != 0] = im[penalized_term != 0] / penalized_term[penalized_term != 0]

                # im = fourier_gaussian(im, sigma=0.2)
                # im = gaussian_filter(im, 0.4)
                print('SUM IMAGE: {}'.format(np.sum(im)))
                im = np.ascontiguousarray(im, dtype=np.float32)
                # im = im * adjust_coef / sensivity_matrix[np.nonzero(sensivity_matrix)]
                cuda.memcpy_htod_async(im_gpu, im)

                # Clearing variables.
                sum_vor = np.ascontiguousarray(np.zeros(self.a.shape, dtype=np.float32))
                adjust_coef = np.ascontiguousarray(
                    np.zeros((self.number_of_pixels_x, self.number_of_pixels_y, self.number_of_pixels_z),
                             dtype=np.float32))
                for dataset in range(number_of_datasets):
                    if dataset == number_of_datasets:  # NOTE: unreachable, see above
                        begin_dataset = np.int32(dataset * number_of_events / number_of_datasets)
                        end_dataset = number_of_events
                    else:
                        begin_dataset = np.int32(dataset * number_of_events / number_of_datasets)
                        end_dataset = np.int32((dataset + 1) * number_of_events / number_of_datasets)
                    # adjust_coef_cut[dataset] = np.ascontiguousarray(
                    #     adjust_coef[:, :, int(np.floor(im_cut_dim[2] * dataset)):
                    #                       int(np.floor(im_cut_dim[2] * (dataset + 1)))],
                    #     dtype=np.float32)
                    adjust_coef_cut[dataset] = adjust_coef
                    sum_vor_cut[dataset] = sum_vor[begin_dataset:end_dataset]
                    cuda.memcpy_htod_async(sum_vor_gpu[dataset], sum_vor_cut[dataset])
                    cuda.memcpy_htod_async(adjust_coef_gpu[dataset], adjust_coef_cut[dataset])

                if self.saved_image_by_iteration:
                    self._save_image_by_it(im, i, sb)
                if self.signals_interface is not None:
                    self.signals_interface.trigger_update_label_reconstruction_status.emit(
                        "{}: Iteration {}".format(self.current_info_step, i + 1))
                    self.signals_interface.trigger_progress_reconstruction_partial.emit(
                        int(np.round(100 * (i + 1) * (sb + subsets) / (it * subsets), 0)))
        return im * subsets
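
# ---------------------------------------------------------------------------
# Minimal, self-contained sketch of the PyCUDA shared-memory pattern the
# kernels above rely on: stage per-thread data in shared memory, synchronise,
# and write back, with the dynamic shared size passed at launch via 'shared='.
# Illustrative only (not part of the reconstruction class); it assumes a
# working CUDA device and runs only when the module is executed directly.
# ---------------------------------------------------------------------------
if __name__ == "__main__":
    import pycuda.autoinit  # noqa: F401 -- creates a context on the default GPU
    import pycuda.driver as drv

    demo_mod = SourceModule("""
    __global__ void scale_with_shared(float *data, const float factor, const int n)
    {
        extern __shared__ float cache[];          /* sized at launch time */
        int idx = blockIdx.x * blockDim.x + threadIdx.x;
        if (idx < n)
            cache[threadIdx.x] = data[idx];       /* stage in shared memory */
        __syncthreads();                          /* every thread reaches the barrier */
        if (idx < n)
            data[idx] = cache[threadIdx.x] * factor;
    }
    """)

    n = 1024
    host = np.arange(n, dtype=np.float32)
    dev = drv.mem_alloc(host.nbytes)
    drv.memcpy_htod(dev, host)

    threads = 256
    blocks = int(math.ceil(n / threads))
    func = demo_mod.get_function("scale_with_shared")
    # 'shared=' is the dynamic shared-memory size in bytes -- one float per
    # thread, mirroring shared=int(4 * A.shape[0]) in the backprojection launch.
    func(dev, np.float32(2.0), np.int32(n),
         block=(threads, 1, 1), grid=(blocks, 1, 1), shared=4 * threads)

    result = np.empty_like(host)
    drv.memcpy_dtoh(result, dev)
    assert np.allclose(result, host * 2.0)
    print('shared-memory demo OK')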