Source code for toor.Optimizer.GPUManager

import numpy as np
import os
from array import array
import time
import math
import pycuda.driver as cuda
# import pycuda.autoinit
from pycuda.compiler import SourceModule

# from ..Segmentation.findisosurface import FindISOSurface
from .selectable_events import ROIEvents
from .CPU import IterativeAlgorithmCPU
from .gpu_shared_memory import GPUSharedMemorySingleKernel
from .gpu_multiple_kernel import GPUSharedMemoryMultipleKernel



[docs]
class EM:
    def __init__(self, easypetdata=None, planes_equation=None, algorithm="LM-MLEM", algorithm_options=None,
                 number_of_iterations=10, number_of_subsets=1, projector_type="Box Counts",
                 normalization_matrix=None, attenuation_correction=False, attenuation_map=None,
                 decay_correction=False, time_correction=None, doi_correction=None, doi_mapping=None,
                 normalization_calculation_flag=False, random_correction=False, scatter_correction=False,
                 scatter_angle_correction=False, cut_fov=True,
                 cuda_drv=None, GPU=True, directory=None,
                 shared_memory=True, saved_image_by_iteration=True, pixeltoangle=False,
                 entry_im=None, multiple_kernel=False, signals_interface=None,
                 current_info_step=""):

        if planes_equation is None:
            return

        if directory is None:
            return

        self.easypetdata = easypetdata

        self.algorithm = algorithm
        self.algorithm_options = algorithm_options
        self.cuda_drv = cuda_drv
        self.normalization_calculation_flag = normalization_calculation_flag
        self.signals_interface = signals_interface
        self.current_info_step = current_info_step
        self.directory = directory  # study directory

        self.number_of_events = planes_equation.number_of_events
        # Algorithm
        self.number_of_iterations = number_of_iterations
        self.number_of_subsets = number_of_subsets
        self.saved_image_by_iteration = saved_image_by_iteration

        # Optimizer
        self.shared_memory = shared_memory

        # Geometry

        # self.crystal_central_planes = planes_equation.crystal_planes
        self.number_of_pixels_x = planes_equation.number_of_pixels_x
        self.number_of_pixels_y = planes_equation.number_of_pixels_y
        self.number_of_pixels_z = planes_equation.number_of_pixels_z
        self.distance_to_center_plane = planes_equation.distance_to_central_plane
        self.distance_to_center_plane_normal = planes_equation.distance_to_central_plane_normal
        self.im_index_x = planes_equation.im_index_x
        self.im_index_y = planes_equation.im_index_y
        self.im_index_z = planes_equation.im_index_z
        #
        # self.p1_list = planes_equation.p1_list
        # self.p2_list = planes_equation.p2_list
        # self.p3_list = planes_equation.p3_list
        # self.p4_list = planes_equation.p4_list

        self.a = planes_equation.a
        self.b = planes_equation.b
        self.c = planes_equation.c
        self.d = planes_equation.d

        self.a_normal = planes_equation.a_normal
        self.b_normal = planes_equation.b_normal
        self.c_normal = planes_equation.c_normal
        self.d_normal = planes_equation.d_normal

        self.a_cf = planes_equation.a_cf
        self.b_cf = planes_equation.b_cf
        self.c_cf = planes_equation.c_cf
        self.d_cf = planes_equation.d_cf

        self.half_crystal_pitch_z = planes_equation.half_crystal_pitch_z
        self.distance_between_array_pixel = planes_equation.distance_between_array_pixel
        self.half_crystal_pitch_xy = planes_equation.half_crystal_pitch_xy

        self.x_min_f = planes_equation.x_min_f
        self.x_max_f = planes_equation.x_max_f
        self.y_min_f = planes_equation.y_min_f
        self.y_max_f = planes_equation.y_max_f
        self.z_min_f = planes_equation.z_min_f
        self.z_max_f = planes_equation.z_max_f

        self.voxelSize =[planes_equation.pixelSizeXY, planes_equation.pixelSizeXY, planes_equation.pixelSizeXYZ]


        # Corrections
        self.isosurface = None
        self.surface = None
        self.active_pixel_x = None
        self.active_pixel_y = None
        self.active_pixel_z = None

        self.attenuation_matrix = None
        self.adjust_coef = None
        self.sum_pixel = None

        self.doi_mapping = doi_mapping
        self.time_correction = time_correction
        self.time_correction = np.ascontiguousarray(time_correction, dtype=np.float32)

        self.projector_type = projector_type
        self.doi_correction = doi_correction
        self.decay_correction = decay_correction
        self.random_correction = random_correction
        self.scatter_correction = scatter_correction
        self.scatter_angle_correction = scatter_angle_correction

        self.roi_map = None

        if normalization_matrix is None:

            self.normalization_matrix = np.ascontiguousarray(
                np.ones((self.number_of_pixels_x, self.number_of_pixels_y, self.number_of_pixels_z),
                        dtype=np.float32))

        else:
            normalize_sens_size = ((np.array(normalization_matrix.shape) - np.array(self.im_index_x.shape)) / 2).astype(
                np.int16)
            if normalize_sens_size[0] >= 0:
                self.normalization_matrix = normalization_matrix[
                                            normalize_sens_size[0]:(
                                                        normalization_matrix.shape[0] - normalize_sens_size[0]),
                                            normalize_sens_size[1]:(
                                                        normalization_matrix.shape[1] - normalize_sens_size[1]),
                                            normalize_sens_size[2]:(
                                                        normalization_matrix.shape[2] - normalize_sens_size[2])]
            else:
                mask = np.zeros((self.im_index_x.shape))

                normalize_sens_size_abs = np.abs(normalize_sens_size)
                rest = normalize_sens_size_abs % 2
                mask[normalize_sens_size_abs[0]:mask.shape[0] - normalize_sens_size_abs[0]- rest[0],
                normalize_sens_size_abs[1]:mask.shape[1] - normalize_sens_size_abs[1]-rest[1],
                normalize_sens_size_abs[2]:mask.shape[2] - normalize_sens_size_abs[2]-rest[2]] = normalization_matrix

                self.normalization_matrix = mask

            if self.normalization_matrix.shape != self.im_index_x.shape:
                self.normalization_matrix = self.normalization_matrix[1:self.im_index_x.shape[0] + 1,
                                            1: self.im_index_x.shape[1] + 1, 0:self.im_index_x.shape[2]]
            # self.normalization_matrix = np.ascontiguousarray(self.normalization_matrix, dtype=np.float32)
            self.normalization_matrix = np.ascontiguousarray(normalization_matrix, dtype=np.float32)

        if cut_fov:
            self.apply_circular_mask_fov()

        self.load_initial_arrays()

        if pixeltoangle:
            self.load_roi_map()
            if self.roi_map is not None:
                self.get_active_roi_voxels()
                self.remove_events_outside_ROI()
                self.im = self._fullFoVPreviousProbability()
                self.im = np.ascontiguousarray(self.im, dtype=np.float32)
                # self.normalization_matrix *= im
                # self.normalization_matrix /= np.sum(self.normalization_matrix)
                # self.normalization_matrix[self.normalization_matrix < 0.00001] = 0
                # self.im = np.ascontiguousarray(self.im, dtype=np.float32)
            # self.im = np.ascontiguousarray(
            #     np.ones((self.number_of_pixels_x, self.number_of_pixels_y, self.number_of_pixels_z),
            #             dtype=np.float32))
            # self.im = self.roi_map / np.max(self.roi_map)
            #     self.im = self.surface / np.max(self.surface)
            #     self.im = np.ascontiguousarray(self.im, dtype=np.float32)
            self.fov_matrix_cut = self.surface
            self.fov_matrix_cut = np.ascontiguousarray(self.fov_matrix_cut, dtype=np.byte)

        # if self.normalization_calculation_flag:
        #     self.im = np.ascontiguousarray(
        #         np.ones((self.number_of_pixels_x, self.number_of_pixels_y, self.number_of_pixels_z),
        #                 dtype=np.float32))
        #     # self.im = self.roi_map / np.max(self.roi_map)
        #     self.im = self.surface / np.max(self.surface)

        if GPU:

            tic = time.time()
            if not multiple_kernel:
                self.im = self._vor_design_gpu_shared_memory(self.a, self.a_normal, self.a_cf, self.im_index_x, self.b,
                                                             self.b_normal, self.b_cf, self.im_index_y, self.c,
                                                             self.c_normal, self.c_cf, self.im_index_z,
                                                             self.d, self.d_normal, self.d_cf, self.adjust_coef,
                                                             self.im, self.half_crystal_pitch_xy,
                                                             self.half_crystal_pitch_z, self.sum_pixel,
                                                             self.fov_matrix_cut,
                                                             self.normalization_matrix, self.time_correction)
            #     self.im = self._vor_design_gpu_shared_memory_multiple_reads(self.a, self.a_normal,self.a_cf, self.im_index_x, self.b, self.b_normal, self.b_cf, self.im_index_y, self.c, self.c_normal,self.c_cf,self.im_index_z,
            #                        self.d, self.d_normal, self.d_cf, self.adjust_coef, self.im, self.half_crystal_pitch_xy,
            #                                                  self.half_crystal_pitch_z, self.sum_pixel, self.sensivity_matrix,
            #                                                  self.normalization_matrix, self.time_correction)
            elif multiple_kernel:
                gpu_alg = GPUSharedMemoryMultipleKernel(self, optimize_reads_and_calcs=False)
                # gpu_alg.multikernel_optimized_memory_reads()
                gpu_alg.multiplekernel()
                self.im = gpu_alg.im
                # self.cdrf = True
                # self.im = self._vor_design_gpu_shared_memory_multiple_reads(self.a, self.a_normal, self.a_cf,
                #                                                             self.im_index_x, self.b,
                #                                                             self.b_normal, self.b_cf,
                #                                                             self.im_index_y, self.c,
                #                                                             self.c_normal, self.c_cf,
                #                                                             self.im_index_z,
                #                                                             self.d, self.d_normal, self.d_cf,
                #                                                             self.adjust_coef,
                #                                                             self.im, self.half_crystal_pitch_xy,
                #                                                             self.half_crystal_pitch_z,
                #                                                             self.sum_pixel,
                #                                                             self.fov_matrix_cut,
                #                                                             self.normalization_matrix,
                #                                                             self.time_correction)
                # # if use_half_precision:
                #     self.im = self._vor_design_gpu_shared_memory_multiple_reads_streamed(self.a, self.a_normal,
                #                                                                          self.a_cf, self.im_index_x,
                #                                                                          self.b,
                #                                                                          self.b_normal, self.b_cf,
                #                                                                          self.im_index_y, self.c,
                #                                                                          self.c_normal, self.c_cf,
                #                                                                          self.im_index_z,
                #                                                                          self.d, self.d_normal,
                #                                                                          self.d_cf, self.adjust_coef,
                #                                                                          self.im,
                #                                                                          self.half_crystal_pitch_xy,
                #                                                                          self.half_crystal_pitch_z,
                #                                                                          self.sum_pixel,
                #                                                                          self.fov_matrix_cut,
                #                                                                          self.normalization_matrix,
                #                                                                          self.time_correction)
                # else:

            toc = time.time()
            print('Optimizer CALCULATION OSEM: {}'.format(toc - tic))
            # np.save('im.npy', im)

            # self.vor_design_cpu()
            # self.im =self.im_cpu-self.im
        else:
            cpu_alg = IterativeAlgorithmCPU(self)
            self.im = cpu_alg.im
        if self.normalization_calculation_flag:
            self._save_image_by_it(self.im, normalization=True)


[docs]
    def load_initial_arrays(self):
        self.attenuation_matrix = np.ascontiguousarray(
            np.ones((self.number_of_pixels_x, self.number_of_pixels_y, self.number_of_pixels_z),
                    dtype=np.float32))

        self.adjust_coef = np.ascontiguousarray(
            np.zeros((self.number_of_pixels_x, self.number_of_pixels_y, self.number_of_pixels_z), dtype=np.float32))

        self.sum_pixel = np.ascontiguousarray(
            np.zeros(self.a.shape, dtype=np.float32))

        if self.normalization_calculation_flag:
            self.im = np.ones((self.number_of_pixels_x, self.number_of_pixels_y, self.number_of_pixels_z)) * \
                      len(self.a) / (self.number_of_pixels_x * self.number_of_pixels_y * self.number_of_pixels_z)
            self.im = np.ascontiguousarray(self.im, dtype=np.float32)
            # self.im = np.ascontiguousarray(
            #     np.ones((self.number_of_pixels_x, self.number_of_pixels_y, self.number_of_pixels_z),
            #             dtype=np.float32))
            # self.im = self.roi_map / np.max(self.roi_map)
        else:
            if self.roi_map is not None:
                self.im = np.ones((self.number_of_pixels_x, self.number_of_pixels_y, self.number_of_pixels_z)) * \
                          len(self.a) / (self.number_of_pixels_x * self.number_of_pixels_y * self.number_of_pixels_z)
                self.im = np.ascontiguousarray(self.im, dtype=np.float32)
            else:
                self.im = np.ones((self.number_of_pixels_x, self.number_of_pixels_y, self.number_of_pixels_z)) * \
                          len(self.a) / (self.number_of_pixels_x * self.number_of_pixels_y * self.number_of_pixels_z)
                self.im = np.ascontiguousarray(self.im, dtype=np.float32)



[docs]
    def get_active_roi_voxels(self):
        self.isosurface = FindISOSurface(volume=self.roi_map, directory=self.directory)
        # self.isosurface.threshold3DISOSurface()
        self.isosurface.get_active_pixels()
        self.isosurface.save_calculated_surface()
        active_pixels = self.isosurface.active_pixels
        self.surface = self.isosurface.surface_volume
        self.active_pixel_x = active_pixels[0]
        self.active_pixel_y = active_pixels[1]
        self.active_pixel_z = active_pixels[2]
        self.active_pixel_x = np.ascontiguousarray(self.active_pixel_x, dtype=np.int32)
        self.active_pixel_y = np.ascontiguousarray(self.active_pixel_y, dtype=np.int32)
        self.active_pixel_z = np.ascontiguousarray(self.active_pixel_z, dtype=np.int32)
        self.sum_pixel = np.ascontiguousarray(
            np.zeros(self.a.shape, dtype=np.float32))


        # file_name = os.path.join(self.directory, "map_total")
        # sizefile = self.number_of_pixels_x * self.number_of_pixels_y * self.number_of_pixels_z
        # output_file = open(file_name, 'rb')
        # a = array('f')
        # a.fromfile(output_file, sizefile)
        # output_file.close()
        # self.roi_map_total = np.array(a)
        # self.roi_map_total = self.roi_map.reshape(
        #     (self.number_of_pixels_x, self.number_of_pixels_y, self.number_of_pixels_z),
        #     order='F')

        # self.im = np.ascontiguousarray(
        #     np.ones((self.number_of_pixels_x, self.number_of_pixels_y, self.number_of_pixels_z),
        #             dtype=np.float32)) *self.surface

    def _fullFoVPreviousProbability(self):
        try:
            file_name = os.path.join(self.directory, "whole_body", "ID_26 Jan 2022 - 14h 57m 00s_None_ IMAGE (154, 154, 373).T")
            # file_name = os.path.join(self.directory, "map_heart")
            sizefile = self.number_of_pixels_x * self.number_of_pixels_y * self.number_of_pixels_z
            # sizefile = 101 * 101 * 153
            sizeMatrix = [154,154,373]
            sizefile = sizeMatrix[0]*sizeMatrix[1]*sizeMatrix[2]
            output_file = open(file_name, 'rb')
            a = array('f')
            a.fromfile(output_file, sizefile)
            output_file.close()
            map_probability= np.array(a)
            # self.roi_map = self.roi_map.reshape(
            #     (self.number_of_pixels_x, self.number_of_pixels_y, self.number_of_pixels_z),
            #     order='F')
            # self.roi_map = self.roi_map.reshape(
            #     (101, 101, 153),
            #     order='F')
            map_probability = map_probability.reshape(
                (sizeMatrix),
                order='F')
            import scipy.ndimage
            map_probability = scipy.ndimage.zoom(map_probability, np.array([self.number_of_pixels_x, self.number_of_pixels_y, self.number_of_pixels_z])/map_probability.shape, order=0)

            map_probability = map_probability / np.sum(map_probability)
            map_probability = 1 - map_probability

        except FileNotFoundError:
            map_probability = None

        return map_probability


[docs]
    def load_roi_map(self):
        try:
            file_name = os.path.join(self.directory, "map_2")
            # file_name = os.path.join(self.directory, "map_heart")
            sizefile = self.number_of_pixels_x * self.number_of_pixels_y * self.number_of_pixels_z
            # sizefile = 101 * 101 * 153
            sizeMatrix = [154, 154, 373]
            sizefile = sizeMatrix[0] * sizeMatrix[1] * sizeMatrix[2]
            output_file = open(file_name, 'rb')
            a = array('f')
            a.fromfile(output_file, sizefile)
            output_file.close()
            self.roi_map = np.array(a)
            # self.roi_map = self.roi_map.reshape(
            #     (self.number_of_pixels_x, self.number_of_pixels_y, self.number_of_pixels_z),
            #     order='F')
            # self.roi_map = self.roi_map.reshape(
            #     (101, 101, 153),
            #     order='F')
            self.roi_map = self.roi_map.reshape(
                (sizeMatrix),
                order='F')
            import scipy.ndimage
            pixel_size_roi=np.array([0.2,0.2,0.2])
            pixel_size_image=np.array([0.8,0.8,0.8])
            pixel_relation = 1/((pixel_size_image/pixel_size_roi)*(np.array([self.number_of_pixels_x, self.number_of_pixels_y, self.number_of_pixels_z])/self.roi_map.shape))
            self.roi_map = scipy.ndimage.zoom(self.roi_map, pixel_relation*np.array([self.number_of_pixels_x, self.number_of_pixels_y, self.number_of_pixels_z])/self.roi_map.shape, order=0)
            mask = np.zeros(self.im_index_x.shape)
            sens_size = np.abs((np.array(self.roi_map.shape) - np.array(self.im_index_x.shape)) / 2).astype(
                    np.int16)
            mask[sens_size[0]:(mask.shape[0] - sens_size[0]),
            sens_size[1]:(mask.shape[1] - sens_size[1]),
            sens_size[2]:(mask.shape[2] - sens_size[2])]+=self.roi_map

            self.roi_map = mask
            # sens_size = ((np.array(self.roi_map.shape) - np.array(self.im_index_x.shape)) / 2).astype(
            #     np.int16)
            # if sens_size[0] >= 0:
            #     self.roi_map = self.roi_map[
            #                                 sens_size[0]:(
            #                                         self.roi_map.shape[0] - sens_size[0]),
            #                                 sens_size[1]:(
            #                                         self.roi_map.shape[1] - sens_size[1]),
            #                                 sens_size[2]:(
            #                                         self.roi_map.shape[2] - sens_size[2])]

            # self.roi_map = self.roi_map / self.roi_map
        except FileNotFoundError:
            self.roi_map = None



[docs]
    def remove_events_outside_ROI(self):
        roievents = ROIEvents(self)
        valid_vor = roievents.pixel2Position()
        self.a = self.a[valid_vor == 1]
        self.b = self.b[valid_vor == 1]
        self.c = self.c[valid_vor == 1]
        self.d = self.d[valid_vor == 1]

        self.a_normal = self.a_normal[valid_vor == 1]
        self.b_normal = self.b_normal[valid_vor == 1]
        self.c_normal = self.c_normal[valid_vor == 1]
        self.d_normal = self.d_normal[valid_vor == 1]

        self.a_cf = self.a_cf[valid_vor == 1]
        self.b_cf = self.b_cf[valid_vor == 1]
        self.c_cf = self.c_cf[valid_vor == 1]
        self.d_cf = self.d_cf[valid_vor == 1]
        self.time_correction = self.time_correction[valid_vor == 1]
        self.sum_pixel = self.sum_pixel[valid_vor == 1]
        self.distance_to_center_plane_normal = self.distance_to_center_plane_normal[valid_vor == 1]
        self.distance_to_center_plane = self.distance_to_center_plane[valid_vor == 1]
        # self.distance_between_array_pixel = self.distance_between_array_pixel[valid_vor == 1]
        listmode = self.easypetdata
        np.save("_listmode.npy", listmode)
        listmode = listmode[valid_vor == 1]
        np.save("_listmode_cut.npy", listmode)

        self.a = np.ascontiguousarray(self.a, dtype=np.float32)
        self.b = np.ascontiguousarray(self.b, dtype=np.float32)
        self.c = np.ascontiguousarray(self.c, dtype=np.float32)
        self.d = np.ascontiguousarray(self.d, dtype=np.float32)

        self.a_normal = np.ascontiguousarray(self.a_normal, dtype=np.float32)
        self.b_normal = np.ascontiguousarray(self.b_normal, dtype=np.float32)
        self.c_normal = np.ascontiguousarray(self.c_normal, dtype=np.float32)
        self.d_normal = np.ascontiguousarray(self.d_normal, dtype=np.float32)

        self.a_cf = np.ascontiguousarray(self.a_cf, dtype=np.float32)
        self.b_cf = np.ascontiguousarray(self.b_cf, dtype=np.float32)
        self.c_cf = np.ascontiguousarray(self.c_cf, dtype=np.float32)
        self.d_cf = np.ascontiguousarray(self.d_cf, dtype=np.float32)

        self.time_correction = np.ascontiguousarray(self.time_correction, dtype=np.float32)
        self.sum_pixel = np.ascontiguousarray(self.sum_pixel, dtype=np.float32)
        self.distance_to_center_plane_normal = np.ascontiguousarray(self.distance_to_center_plane_normal,
                                                                    dtype=np.float32)
        self.distance_to_center_plane = np.ascontiguousarray(self.distance_to_center_plane, dtype=np.float32)



[docs]
    def apply_circular_mask_fov(self):
        self.fov_matrix_cut = np.ascontiguousarray(
            np.ones((self.number_of_pixels_x, self.number_of_pixels_y, self.number_of_pixels_z),
                    dtype=np.float32))
        xx = (np.tile(np.arange(0, self.im_index_x.shape[0]), (self.im_index_x.shape[0], 1)) - (
                self.im_index_x.shape[0] - 1) / 2) ** 2
        yy = (np.tile(np.arange(0, self.im_index_x.shape[1]), (self.im_index_x.shape[1], 1)) - (
                self.im_index_x.shape[1] - 1) / 2) ** 2
        xx = xx.T
        # correct xx and yy shape in case of missmatch
        if xx.shape[0] != yy.shape[0]:
            xx, yy = np.meshgrid(np.arange(0, self.im_index_x.shape[0])-((self.im_index_x.shape[0] - 1) / 2) ** 2,
                                 np.arange(0, self.im_index_x.shape[1])-((self.im_index_x.shape[1] - 1) / 2) ** 2)

            xx = xx.T
            yy=yy.T

        # circle_cut = xx + yy - (self.im_index_x.shape[1] * np.sin(np.radians(easypetdata.half_real_range))*0.5) ** 2
        circle_cut = xx + yy - (self.im_index_x.shape[1] * 0.5) ** 2
        # circle_cut = xx + yy - (self.im_index_x.shape[1] * np.sin(np.radians(120/2))*.50) ** 2

        circle_cut[circle_cut > 0] = 0
        circle_cut[circle_cut < 0] = 1
        circle_cut = np.tile(circle_cut[:, :, None], (1, 1, self.im_index_x.shape[2]))
        self.fov_matrix_cut = self.fov_matrix_cut * circle_cut
        self.fov_matrix_cut = np.ascontiguousarray(self.fov_matrix_cut, dtype=np.byte)

        # calculate non-zero pixels


    # def _vor_design_gpu_shared_memory_multiple_reads(self, a, a_normal, a_cf, A, b, b_normal, b_cf, B, c, c_normal,
    #                                                  c_cf, C,
    #                                                  d, d_normal, d_cf, adjust_coef, im, half_crystal_pitch_xy,
    #                                                  half_crystal_pitch_z,
    #                                                  sum_vor, fov_cut_matrix, normalization_matrix, time_factor):
    #     print('Optimizer STARTED - Multiple reads')
    #     # cuda.init()
    #     cuda = self.cuda_drv
    #     # device = cuda.Device(0)  # enter your gpu id here
    #     # ctx = device.make_context()
    #     number_of_events = np.int32(len(a))
    #     weight = np.int32(A.shape[0])
    #     height = np.int32(A.shape[1])
    #     depth = np.int32(A.shape[2])
    #     # start_x = np.int32(A[0, 0, 0])
    #     start_x = np.int32(A[0, 0, 0])
    #     start_y = np.int32(B[0, 0, 0])
    #     start_z = np.int32(C[0, 0, 0])
    #     print("Start_point: {},{},{}".format(start_x, start_y, start_z))
    #     print('Image size: {},{}, {}'.format(weight, height, depth))
    #
    #     half_distance_between_array_pixel = np.float32(self.distance_between_array_pixel / 2)
    #     normalization_matrix = normalization_matrix.reshape(
    #         normalization_matrix.shape[0] * normalization_matrix.shape[1] * normalization_matrix.shape[2])
    #
    #     # SOURCE MODELS (DEVICE CODE)
    #
    #     mod_forward_projection_shared_mem = SourceModule("""
    #     #include <stdint.h>
    #
    #     texture<char, 1> tex;
    #
    #
    #     __global__ void forward_projection
    #     (const int n, const int m, const int p, const int start_x, const int start_y, const int start_z,
    #     const float crystal_pitch_XY,const float crystal_pitch_Z,const float distance_between_array_pixel,
    #     int number_of_events, const int begin_event_gpu_limitation,  const int end_event_gpu_limitation,
    #     float *a, float *a_normal, float *a_cf, float *b,float *b_normal, float *b_cf,float *c,float *c_normal,
    #     float *c_cf, float *d, float *d_normal, float *d_cf, float *sum_vor, const char *fov_cut_matrix, const float *im_old)
    #                            {
    #                               const int shared_memory_size = 256;
    #                                __shared__ float a_shared[shared_memory_size];
    #                                __shared__ float b_shared[shared_memory_size];
    #                                __shared__ float c_shared[shared_memory_size];
    #                                __shared__ float d_shared[shared_memory_size];
    #                                __shared__ float a_normal_shared[shared_memory_size];
    #                                __shared__ float b_normal_shared[shared_memory_size];
    #                                __shared__ float c_normal_shared[shared_memory_size];
    #                                __shared__ float d_normal_shared[shared_memory_size];
    #                                __shared__ float a_cf_shared[shared_memory_size];
    #                                __shared__ float b_cf_shared[shared_memory_size];
    #                                __shared__ float c_cf_shared[shared_memory_size];
    #                                __shared__ float d_cf_shared[shared_memory_size];
    #                                __shared__ float sum_vor_shared[shared_memory_size];
    #                                __shared__ char fov_cut_matrix_shared[shared_memory_size];
    #                                /*
    #                               const int blockId = blockIdx.x+ blockIdx.y * gridDim.x+ gridDim.x * gridDim.y * blockIdx.z;
    #                               const int threadId= blockId * (blockDim.x * blockDim.y * blockDim.z)+ (threadIdx.z * (blockDim.x * blockDim.y))+ (threadIdx.y * blockDim.x)+ threadIdx.x;
    #                               */
    #                               int threadId=blockIdx.x *blockDim.x + threadIdx.x;
    #                               int e;
    #
    #                               float d2;
    #                               float d2_normal;
    #                               float d2_cf;
    #                               float value;
    #                               float value_normal;
    #                               float value_cf;
    #                               float sum_vor_temp;
    #                               int index;
    #                               int x_t;
    #                               int y_t;
    #                               int z_t;
    #                               float max_distance_projector;
    #                               const int number_events_max = end_event_gpu_limitation-begin_event_gpu_limitation;
    #                               const float error_pixel = 0.0000f;
    #                                if (threadIdx.x>shared_memory_size)
    #                                {
    #                                return;
    #                                }
    #                                if(threadId>number_events_max)
    #                                {
    #                                return;
    #                                }
    #
    #                               __syncthreads();
    #                               e = threadId;
    #                               int e_m = threadIdx.x;
    #                               a_shared[e_m] = a[e];
    #                               b_shared[e_m] = b[e];
    #                               c_shared[e_m] = c[e];
    #                               d_shared[e_m] = d[e];
    #                               a_normal_shared[e_m] = a_normal[e];
    #                               b_normal_shared[e_m] = b_normal[e];
    #                               c_normal_shared[e_m] = c_normal[e];
    #                               d_normal_shared[e_m] = d_normal[e];
    #                               a_cf_shared[e_m] = a_cf[e];
    #                               b_cf_shared[e_m] = b_cf[e];
    #                               c_cf_shared[e_m] = c_cf[e];
    #                               d_cf_shared[e_m] = d_cf[e];
    #                               sum_vor_shared[e_m] = sum_vor[e];
    #
    #                               d2_normal = crystal_pitch_XY * sqrt(a_normal_shared[e_m]*a_normal_shared[e_m]+b_normal_shared[e_m]*b_normal_shared[e_m]+c_normal_shared[e_m]*c_normal_shared[e_m]);
    #                               d2 = crystal_pitch_Z * sqrt(a_shared[e_m]*a_shared[e_m] + b_shared[e_m]*b_shared[e_m] + c_shared[e_m]*c_shared[e_m]);
    #                               d2_cf = distance_between_array_pixel*sqrt(a_cf_shared[e_m]*a_cf_shared[e_m]+b_cf_shared[e_m]*b_cf_shared[e_m]+c_cf_shared[e_m]*c_cf_shared[e_m]);
    #                               max_distance_projector=sqrt(d2*d2+d2_normal*d2_normal+d2_cf*d2_cf);
    #
    #                               for(int l=0; l<n; l++)
    #                               {
    #                                   x_t = l+start_x;
    #                                   for(int j=0; j<m; j++)
    #                                   {
    #                                      y_t = j+start_y;
    #                                      for(int k=0; k<p; k++)
    #                                           {
    #                                           /*
    #                                           index = l+j*n+k*m*n;
    #                                           fov_cut_matrix_shared[k]=fov_cut_matrix[k+j*p+l*p*m];
    #                                           index = l+j*n+k*m*n;
    #                                           */
    #                                             index = k+j*p+l*p*m;
    #
    #
    #                                           if (fov_cut_matrix[index]!=0)
    #                                           {
    #
    #                                               z_t = k+start_z;
    #                                               value_normal = a_normal_shared[e_m]*x_t+b_normal_shared[e_m]*y_t+c_normal_shared[e_m] * z_t -d_normal_shared[e_m];
    #
    #                                                if (value_normal < d2_normal && value_normal >= -d2_normal)
    #                                                {
    #                                                 value = a_shared[e_m]*x_t+b_shared[e_m]*y_t+c_shared[e_m]*z_t-d_shared[e_m];
    #
    #                                                 if (value < d2 && value >=-d2 )
    #                                                  {
    #                                                       value_cf = a_cf_shared[e_m]*x_t+b_cf_shared[e_m]*y_t+c_cf_shared[e_m]*z_t-d_cf_shared[e_m];
    #
    #
    #                                                   if (value_cf >= -d2_cf && value_cf<d2_cf)
    #                                                       {
    #
    #                                                        sum_vor_temp +=im_old[index]*(1-sqrt(value*value+value_normal*value_normal+value_cf*value_cf)/max_distance_projector);
    #                                                        /* im_old[index]*(1-sqrt(value*value+value_normal*value_normal+value_cf*value_cf)/max_distance_projector)
    #
    #                                                         */
    #                                                         }
    #
    #
    #                                               }
    #
    #
    #
    #
    #                                           }
    #                                           }
    #
    #                                        }
    #                                    }
    #                                }
    #
    #                                /*
    #                                 sum_vor[e]= sum_vor_temp;
    #                                sum_vor[e]= sum_vor_shared[e_m];
    #                                __syncthreads();
    #                                sum_vor[e]= sum_vor_shared[e_m];
    #                                */
    #                                sum_vor[e]= sum_vor_temp;
    #
    #
    #
    #            }""")
    #     mod_normalization_shared_mem = SourceModule("""
    #                          #include <stdint.h>
    #                          texture<uint8_t, 1> tex;
    #
    #                   __global__ void normalization
    #                    ( int dataset_number, int n, int m, int p, const float crystal_pitch_XY, const float crystal_pitch_Z,
    #                    const float distance_between_array_pixel, int number_of_events, const int begin_event_gpu_limitation,
    #                    const int end_event_gpu_limitation, const float *a, const float *a_normal, const float *a_cf, const float *b,
    #                    const float *b_normal, const float *b_cf, const float *c, const float *c_normal, const float *c_cf, const float *d, const float *d_normal,
    #                    const float *d_cf, const short *A, const short *B, const short *C, float *adjust_coef, float *sum_vor,
    #                    char *fov_cut_matrix, float *time_factor)
    #                    {
    #                              extern __shared__ float adjust_coef_shared[];
    #                              int idt=blockIdx.x *blockDim.x + threadIdx.x;
    #
    #
    #                              float d2;
    #                              float d2_normal;
    #                              float d2_cf;
    #                              float normal_value;
    #                              float value;
    #                              float value_cf;
    #                              short a_temp;
    #                              short b_temp;
    #                              short c_temp;
    #                              char fov_cut_temp;
    #                              int i_s=threadIdx.x;
    #
    #                              if (idt>n*m*p)
    #                              {
    #                                   return;
    #                               }
    #
    #                              __syncthreads();
    #                              adjust_coef_shared[i_s] = adjust_coef[idt];
    #                              a_temp = A[idt];
    #                              b_temp = B[idt];
    #                              c_temp = C[idt];
    #                              fov_cut_temp = fov_cut_matrix[idt];
    #
    #
    #
    #                              for(int e=begin_event_gpu_limitation; e<end_event_gpu_limitation; e++)
    #                              {
    #                                  if (fov_cut_temp!=0)
    #                                  {
    #                                  normal_value= a_normal[e]*a_temp+b_normal[e]*b_temp+c_normal[e] * c_temp-d_normal[e];
    #                                  d2_normal = crystal_pitch_XY * sqrt(a_normal[e]*a_normal[e]+b_normal[e]*b_normal[e]+c_normal[e]*c_normal[e]);
    #
    #                                  if (normal_value< d2_normal && normal_value >= -d2_normal)
    #                                  {
    #                                  value = a[e]*a_temp+b[e]*b_temp +c[e]*c_temp- d[e];
    #                                  d2 = crystal_pitch_Z * sqrt(a[e]*a[e] + b[e]*b[e] + c[e]*c[e]);
    #
    #
    #                                      if (value < d2 && value >=-d2)
    #                                      {
    #                                                value_cf = a_cf[e]*a_temp+b_cf[e]*b_temp +c_cf[e] * c_temp-d_cf[e];
    #                                                d2_cf = distance_between_array_pixel*sqrt(a_cf[e]*a_cf[e]+b_cf[e]*b_cf[e]+c_cf[e]*c_cf[e]);
    #
    #
    #                                                  if (value_cf >= -d2_cf && value_cf<d2_cf)
    #                                                  {
    #
    #
    #                                                         adjust_coef_shared[i_s] += (1-(sqrt(value*value+normal_value*normal_value+value_cf*value_cf))/sqrt(d2*d2+d2_normal*d2_normal+d2_cf*d2_cf));
    #
    #
    #                                                 }
    #
    #                                                      /*
    #
    #                                                      adjust_coef_shared[i_s] += (1-(sqrt(value*value+normal_value*normal_value+value_cf*value_cf))/sqrt(d2*d2+d2_normal*d2_normal+d2_cf*d2_cf));
    #                                                      adjust_coef_shared[i_s] += 1/sum_vor[e];
    #                                                      normal_value= a_normal[e]*a_temp+b_normal[e]*b_temp+c_normal[e] * c_temp-d_normal[e];
    #                                       d2_normal = error_pixel+crystal_pitch_XY * sqrt(a_normal[e]*a_normal[e]+b_normal[e]*b_normal[e]+c_normal[e]*c_normal[e]);
    #                                                     adjust_coef_shared[i_s] += /(sum_vor[e]*time_factor[e]);
    #                                                    adjust_coef_shared[i_s] += (1-(sqrt(value*value+normal_value*normal_value+value_cf*value_cf))/sqrt(d2*d2+d2_normal*d2_normal+d2_cf*d2_cf))/(sum_vor[0]*time_factor[e]);
    #                                                       */
    #
    #
    #
    #
    #
    #                                           }
    #                                       }
    #
    #                              }
    #
    #                              }
    #
    #                              adjust_coef[idt] = adjust_coef_shared[i_s];
    #                              __syncthreads();
    #
    #
    #
    #                          }
    #                          """)
    #
    #     mod_backward_projection_shared_mem = SourceModule("""
    #                   #include <stdint.h>
    #                   texture<uint8_t, 1> tex;
    #
    #            __global__ void backprojection
    #             ( int dataset_number, int n, int m, int p, const float crystal_pitch_XY, const float crystal_pitch_Z,
    #             const float distance_between_array_pixel, int number_of_events, const int begin_event_gpu_limitation,
    #             const int end_event_gpu_limitation, const float *a, const float *a_normal, const float *a_cf, const float *b,
    #             const float *b_normal, const float *b_cf, const float *c, const float *c_normal, const float *c_cf, const float *d, const float *d_normal,
    #             const float *d_cf, const short *A, const short *B, const short *C, float *adjust_coef, float *sum_vor,
    #             char *fov_cut_matrix, float *time_factor)
    #             {
    #                       extern __shared__ float adjust_coef_shared[];
    #                       int idt=blockIdx.x *blockDim.x + threadIdx.x;
    #
    #
    #                       float d2;
    #                       float d2_normal;
    #                       float d2_cf;
    #                       float normal_value;
    #                       float value;
    #                       float value_cf;
    #                       short a_temp;
    #                       short b_temp;
    #                       short c_temp;
    #                       char fov_cut_temp;
    #                       int i_s=threadIdx.x;
    #
    #                       if (idt>n*m*p)
    #                       {
    #                            return;
    #                        }
    #
    #                       __syncthreads();
    #                       adjust_coef_shared[i_s] = adjust_coef[idt];
    #                       a_temp = A[idt];
    #                       b_temp = B[idt];
    #                       c_temp = C[idt];
    #                       fov_cut_temp = fov_cut_matrix[idt];
    #
    #
    #
    #                       for(int e=begin_event_gpu_limitation; e<end_event_gpu_limitation; e++)
    #                       {
    #                           if (fov_cut_temp!=0)
    #                           {
    #                           normal_value= a_normal[e]*a_temp+b_normal[e]*b_temp+c_normal[e] * c_temp-d_normal[e];
    #                           d2_normal = crystal_pitch_XY * sqrt(a_normal[e]*a_normal[e]+b_normal[e]*b_normal[e]+c_normal[e]*c_normal[e]);
    #
    #                           if (normal_value< d2_normal && normal_value >= -d2_normal)
    #                           {
    #                           value = a[e]*a_temp+b[e]*b_temp +c[e]*c_temp- d[e];
    #                           d2 = crystal_pitch_Z * sqrt(a[e]*a[e] + b[e]*b[e] + c[e]*c[e]);
    #
    #
    #                               if (value < d2 && value >=-d2)
    #                               {
    #                                         value_cf = a_cf[e]*a_temp+b_cf[e]*b_temp +c_cf[e] * c_temp-d_cf[e];
    #                                         d2_cf = distance_between_array_pixel*sqrt(a_cf[e]*a_cf[e]+b_cf[e]*b_cf[e]+c_cf[e]*c_cf[e]);
    #
    #
    #                                           if (value_cf >= -d2_cf && value_cf<d2_cf)
    #                                           {
    #
    #
    #
    #                                                 adjust_coef_shared[i_s] += (1-(sqrt(value*value+normal_value*normal_value+value_cf*value_cf))/sqrt(d2*d2+d2_normal*d2_normal+d2_cf*d2_cf))/(sum_vor[e]*time_factor[e]);
    #
    #                                          }
    #
    #                                               /*
    #
    #                                               adjust_coef_shared[i_s] += (1-(sqrt(value*value+normal_value*normal_value+value_cf*value_cf))/sqrt(d2*d2+d2_normal*d2_normal+d2_cf*d2_cf))/(sum_vor[e]*time_factor[e]);
    #                                               adjust_coef_shared[i_s] += 1/sum_vor[e];
    #                                               normal_value= a_normal[e]*a_temp+b_normal[e]*b_temp+c_normal[e] * c_temp-d_normal[e];
    #                                d2_normal = error_pixel+crystal_pitch_XY * sqrt(a_normal[e]*a_normal[e]+b_normal[e]*b_normal[e]+c_normal[e]*c_normal[e]);
    #                                              adjust_coef_shared[i_s] += /(sum_vor[e]*time_factor[e]);
    #                                             adjust_coef_shared[i_s] += (1-(sqrt(value*value+normal_value*normal_value+value_cf*value_cf))/sqrt(d2*d2+d2_normal*d2_normal+d2_cf*d2_cf))/(sum_vor[0]*time_factor[e]);
    #                                                */
    #
    #
    #
    #
    #
    #                                    }
    #                                }
    #
    #                       }
    #
    #                       }
    #
    #                       adjust_coef[idt] = adjust_coef_shared[i_s];
    #                       __syncthreads();
    #
    #
    #
    #                   }
    #                   """)
    #
    #     mod_forward_projection_shared_mem_cdrf = SourceModule("""
    #            #include <stdint.h>
    #
    #            texture<char, 1> tex;
    #
    #
    #            __global__ void forward_projection_cdrf
    #            (const int n, const int m, const int p, const int start_x, const int start_y, const int start_z,
    #            float *crystal_pitch_XY, float *crystal_pitch_Z,const float distance_between_array_pixel,
    #            int number_of_events, const int begin_event_gpu_limitation,  const int end_event_gpu_limitation,
    #            float *a, float *a_normal, float *a_cf, float *b,float *b_normal, float *b_cf,float *c,float *c_normal,
    #            float *c_cf, float *d, float *d_normal, float *d_cf, float *sum_vor, const char *fov_cut_matrix, const float *im_old)
    #                                   {
    #                                      const int shared_memory_size = 256;
    #                                       __shared__ float a_shared[shared_memory_size];
    #                                       __shared__ float b_shared[shared_memory_size];
    #                                       __shared__ float c_shared[shared_memory_size];
    #                                       __shared__ float d_shared[shared_memory_size];
    #                                       __shared__ float a_normal_shared[shared_memory_size];
    #                                       __shared__ float b_normal_shared[shared_memory_size];
    #                                       __shared__ float c_normal_shared[shared_memory_size];
    #                                       __shared__ float d_normal_shared[shared_memory_size];
    #                                       __shared__ float a_cf_shared[shared_memory_size];
    #                                       __shared__ float b_cf_shared[shared_memory_size];
    #                                       __shared__ float c_cf_shared[shared_memory_size];
    #                                       __shared__ float d_cf_shared[shared_memory_size];
    #                                       __shared__ float sum_vor_shared[shared_memory_size];
    #                                       __shared__ char fov_cut_matrix_shared[shared_memory_size];
    #                                       __shared__ float crystal_pitch_Z_shared[shared_memory_size];
    #                                       __shared__ float crystal_pitch_XY_shared[shared_memory_size];
    #                                       /*
    #                                      const int blockId = blockIdx.x+ blockIdx.y * gridDim.x+ gridDim.x * gridDim.y * blockIdx.z;
    #                                      const int threadId= blockId * (blockDim.x * blockDim.y * blockDim.z)+ (threadIdx.z * (blockDim.x * blockDim.y))+ (threadIdx.y * blockDim.x)+ threadIdx.x;
    #                                      */
    #                                      int threadId=blockIdx.x *blockDim.x + threadIdx.x;
    #                                      int e;
    #
    #                                      float d2;
    #                                      float d2_normal;
    #                                      float d2_cf;
    #                                      float value;
    #                                      float value_normal;
    #                                      float value_cf;
    #                                      float sum_vor_temp;
    #                                      int index;
    #                                      int x_t;
    #                                      int y_t;
    #                                      int z_t;
    #                                           float width;
    #                                       float height;
    #                                     float distance;
    #                                     float distance_other;
    #                                     float solid_angle;
    #
    #
    #                                      float max_distance_projector;
    #
    #                                      const int number_events_max = end_event_gpu_limitation-begin_event_gpu_limitation;
    #                                      const float error_pixel = 0.0000f;
    #                                       if (threadIdx.x>shared_memory_size)
    #                                       {
    #                                       return;
    #                                       }
    #                                       if(threadId>number_events_max)
    #                                       {
    #                                       return;
    #                                       }
    #
    #                                      __syncthreads();
    #                                      e = threadId;
    #                                      int e_m = threadIdx.x;
    #                                      a_shared[e_m] = a[e];
    #                                      b_shared[e_m] = b[e];
    #                                      c_shared[e_m] = c[e];
    #                                      d_shared[e_m] = d[e];
    #                                      a_normal_shared[e_m] = a_normal[e];
    #                                      b_normal_shared[e_m] = b_normal[e];
    #                                      c_normal_shared[e_m] = c_normal[e];
    #                                      d_normal_shared[e_m] = d_normal[e];
    #                                      a_cf_shared[e_m] = a_cf[e];
    #                                      b_cf_shared[e_m] = b_cf[e];
    #                                      c_cf_shared[e_m] = c_cf[e];
    #                                      d_cf_shared[e_m] = d_cf[e];
    #                                      sum_vor_shared[e_m] = sum_vor[e];
    #                                      crystal_pitch_Z_shared[e_m] =  crystal_pitch_Z[e];
    #                                      crystal_pitch_XY_shared[e_m] =  crystal_pitch_XY[e];
    #
    #                                      d2_normal = crystal_pitch_XY_shared[e_m] * sqrt(a_normal_shared[e_m]*a_normal_shared[e_m]+b_normal_shared[e_m]*b_normal_shared[e_m]+c_normal_shared[e_m]*c_normal_shared[e_m]);
    #                                      d2 = crystal_pitch_Z_shared[e_m]* sqrt(a_shared[e_m]*a_shared[e_m] + b_shared[e_m]*b_shared[e_m] + c_shared[e_m]*c_shared[e_m]);
    #                                      d2_cf = distance_between_array_pixel*sqrt(a_cf_shared[e_m]*a_cf_shared[e_m]+b_cf_shared[e_m]*b_cf_shared[e_m]+c_cf_shared[e_m]*c_cf_shared[e_m]);
    #                                      max_distance_projector=sqrt(d2*d2+d2_normal*d2_normal+d2_cf*d2_cf);
    #
    #                                      for(int l=0; l<n; l++)
    #                                      {
    #                                          x_t = l+start_x;
    #                                          for(int j=0; j<m; j++)
    #                                          {
    #                                             y_t = j+start_y;
    #                                             for(int k=0; k<p; k++)
    #                                                  {
    #                                                  /*
    #                                                  index = l+j*n+k*m*n;
    #                                                  fov_cut_matrix_shared[k]=fov_cut_matrix[k+j*p+l*p*m];
    #                                                  index = l+j*n+k*m*n;
    #                                                  */
    #                                                    index = k+j*p+l*p*m;
    #
    #
    #                                                  if (fov_cut_matrix[index]!=0)
    #                                                  {
    #
    #                                                      z_t = k+start_z;
    #                                                      value_normal = a_normal_shared[e_m]*x_t+b_normal_shared[e_m]*y_t+c_normal_shared[e_m] * z_t -d_normal_shared[e_m];
    #
    #                                                       if (value_normal < d2_normal && value_normal >= -d2_normal)
    #                                                       {
    #                                                        value = a_shared[e_m]*x_t+b_shared[e_m]*y_t+c_shared[e_m]*z_t-d_shared[e_m];
    #
    #                                                        if (value < d2 && value >=-d2 )
    #                                                         {
    #                                                              value_cf = a_cf_shared[e_m]*x_t+b_cf_shared[e_m]*y_t+c_cf_shared[e_m]*z_t-d_cf_shared[e_m];
    #
    #
    #                                                          if (value_cf >= -d2_cf && value_cf<d2_cf)
    #                                                              {
    #                                                              if (sqrt(value*value+value_normal*value_normal+value_cf*value_cf)<=max_distance_projector)
    #                                                              {
    #                                                                  width = 2*(crystal_pitch_Z_shared[e_m]  - abs(value));
    #                                                                 height = 2*(crystal_pitch_XY_shared[e_m] - abs(value_normal));
    #
    #                                                                  distance =d2_cf+abs(value_cf);
    #                                                                 distance_other = abs(d2_cf+value_cf);
    #                                                                 solid_angle = 4*(width*width*height*height/(distance*distance*(4*distance*distance+width*width+height*height)));
    #                                                               sum_vor_shared[e_m]+= im_old[index]*solid_angle;
    #
    #                                                               }
    #                                                               /*
    #                                                               sum_vor_shared[e_m]+= im_old[index]*(1-sqrt(value*value+value_normal*value_normal+value_cf*value_cf)/max_distance_projector);
    #                                                               4*arctan(
    #                                                               4*np.arctan(width*height/(2*distance_to_anh*np.sqrt(4*distance_between_array_pixel-value_cf)**2+width**2+height**2)))
    #                                                               *(1-sqrt(value*value+value_normal*value_normal+value_cf*value_cf)/max_distance_projector)
    #                                                               */
    #                                                                }
    #
    #
    #                                                      }
    #
    #
    #
    #
    #                                                  }
    #                                                  }
    #
    #                                               }
    #                                           }
    #                                       }
    #
    #                                       /*
    #                                        sum_vor[e]= sum_vor_temp;
    #                                       sum_vor[e]= sum_vor_shared[e_m];
    #
    #
    #                                       sum_vor[e]= sum_vor_temp;
    #                                       */
    #                                       __syncthreads();
    #                                       sum_vor[e]= sum_vor_shared[e_m];
    #
    #
    #
    #                   }""")
    #
    #     mod_normalization_shared_mem_cdrf = SourceModule("""
    #                                  #include <stdint.h>
    #                                  texture<uint8_t, 1> tex;
    #
    #                           __global__ void normalization_cdrf
    #                            ( int dataset_number, int n, int m, int p, float *crystal_pitch_XY,  float *crystal_pitch_Z,
    #                            const float distance_between_array_pixel, int number_of_events, const int begin_event_gpu_limitation,
    #                            const int end_event_gpu_limitation, const float *a, const float *a_normal, const float *a_cf, const float *b,
    #                            const float *b_normal, const float *b_cf, const float *c, const float *c_normal, const float *c_cf, const float *d, const float *d_normal,
    #                            const float *d_cf, const short *A, const short *B, const short *C, float *adjust_coef, float *sum_vor,
    #                            char *fov_cut_matrix, float *time_factor)
    #                            {
    #                                      extern __shared__ float adjust_coef_shared[];
    #                                      int idt=blockIdx.x *blockDim.x + threadIdx.x;
    #
    #
    #                                      float d2;
    #                                      float d2_normal;
    #                                      float d2_cf;
    #                                      float normal_value;
    #                                      float value;
    #                                      float value_cf;
    #                                      short a_temp;
    #                                      short b_temp;
    #                                      short c_temp;
    #                                      char fov_cut_temp;
    #                                      int i_s=threadIdx.x;
    #                                      float width;
    #                                     float height;
    #                                     float distance;
    #                                     float distance_other;
    #                                      float solid_angle;
    #
    #                                      if (idt>=n*m*p)
    #                                      {
    #                                           return;
    #                                       }
    #
    #                                      __syncthreads();
    #                                      adjust_coef_shared[i_s] = adjust_coef[idt];
    #                                      a_temp = A[idt];
    #                                      b_temp = B[idt];
    #                                      c_temp = C[idt];
    #                                      fov_cut_temp = fov_cut_matrix[idt];
    #
    #
    #
    #                                      for(int e=begin_event_gpu_limitation; e<end_event_gpu_limitation; e++)
    #                                      {
    #                                          if (fov_cut_temp!=0)
    #                                          {
    #                                          normal_value= a_normal[e]*a_temp+b_normal[e]*b_temp+c_normal[e] * c_temp-d_normal[e];
    #                                          d2_normal =  crystal_pitch_XY[e]* sqrt(a_normal[e]*a_normal[e]+b_normal[e]*b_normal[e]+c_normal[e]*c_normal[e]);
    #
    #                                          if (normal_value< d2_normal && normal_value >= -d2_normal)
    #                                          {
    #                                          value = a[e]*a_temp+b[e]*b_temp +c[e]*c_temp- d[e];
    #                                          d2 = crystal_pitch_Z[e] * sqrt(a[e]*a[e] + b[e]*b[e] + c[e]*c[e]);
    #
    #
    #                                              if (value < d2 && value >=-d2)
    #                                              {
    #                                                        value_cf = a_cf[e]*a_temp+b_cf[e]*b_temp +c_cf[e] * c_temp-d_cf[e];
    #                                                        d2_cf = distance_between_array_pixel*sqrt(a_cf[e]*a_cf[e]+b_cf[e]*b_cf[e]+c_cf[e]*c_cf[e]);
    #
    #
    #                                                          if (value_cf >= -d2_cf && value_cf<d2_cf)
    #                                                          {
    #                                                             if (sqrt(value*value+normal_value*normal_value+value_cf*value_cf)<=sqrt(d2*d2+d2_normal*d2_normal+d2_cf*d2_cf))
    #                                                             {
    #                                                              width = 2*(crystal_pitch_Z[e]- abs(value));
    #                                                             height = 2*(crystal_pitch_XY[e] - abs(normal_value));
    #                                                             distance =d2_cf+abs(value_cf);
    #
    #                                                             solid_angle = 4*(width*width*height*height/(distance*distance*(4*distance*distance+width*width+height*height)));
    #                                                              adjust_coef_shared[i_s] += solid_angle/(sum_vor[e]);
    #                                                             /*
    #                                                             distance_other =d2_cf-abs(value_cf);
    #                                                             solid_angle = 4*(width*height/(2*distance*sqrt(4*distance*distance+width*width+height*height)));
    #                                                                adjust_coef_shared[i_s] += time_factor[e]*(1-(sqrt(value*value+normal_value*normal_value+value_cf*value_cf))/sqrt(d2*d2+d2_normal*d2_normal+d2_cf*d2_cf))/(sum_vor[e]);
    #                                                               */
    #                                                                }
    #
    #                                                         }
    #
    #                                                              /*
    #                                                              (1-(sqrt(value*value+normal_value*normal_value+value_cf*value_cf))/sqrt(d2*d2+d2_normal*d2_normal+d2_cf*d2_cf))/
    #                                                              normal_value= a_normal[e]*a_temp+b_normal[e]*b_temp+c_normal[e] * c_temp-d_normal[e];
    #                                               d2_normal = error_pixel+crystal_pitch_XY * sqrt(a_normal[e]*a_normal[e]+b_normal[e]*b_normal[e]+c_normal[e]*c_normal[e]);
    #                                                             adjust_coef_shared[i_s] += /(sum_vor[e]*time_factor[e]);
    #                                                            adjust_coef_shared[i_s] += (1-(sqrt(value*value+normal_value*normal_value+value_cf*value_cf))/sqrt(d2*d2+d2_normal*d2_normal+d2_cf*d2_cf))/(sum_vor[0]*time_factor[e]);
    #                                                               */
    #
    #
    #
    #
    #
    #                                                   }
    #                                               }
    #
    #                                      }
    #
    #                                      }
    #
    #                                      adjust_coef[idt] = adjust_coef_shared[i_s];
    #                                      __syncthreads();
    #
    #
    #
    #                                  }
    #                                  """)
    #
    #     mod_backward_projection_shared_mem_cdrf = SourceModule("""
    #                                 #include <stdint.h>
    #                                 texture<uint8_t, 1> tex;
    #
    #                          __global__ void backprojection_cdrf
    #                           ( int dataset_number, int n, int m, int p, const float *crystal_pitch_XY,  const float *crystal_pitch_Z,
    #                           const float distance_between_array_pixel, int number_of_events, const int begin_event_gpu_limitation,
    #                           const int end_event_gpu_limitation, const float *a, const float *a_normal, const float *a_cf, const float *b,
    #                           const float *b_normal, const float *b_cf, const float *c, const float *c_normal, const float *c_cf, const float *d, const float *d_normal,
    #                           const float *d_cf, const short *A, const short *B, const short *C, float *adjust_coef, float *sum_vor,
    #                           char *fov_cut_matrix, float *time_factor)
    #                           {
    #                                     extern __shared__ float adjust_coef_shared[];
    #                                     int idt=blockIdx.x *blockDim.x + threadIdx.x;
    #
    #
    #                                     float d2;
    #                                     float d2_normal;
    #                                     float d2_cf;
    #                                     float normal_value;
    #                                     float value;
    #                                     float value_cf;
    #                                     short a_temp;
    #                                     short b_temp;
    #                                     short c_temp;
    #                                     char fov_cut_temp;
    #                                     int i_s=threadIdx.x;
    #                                     float width;
    #                                     float height;
    #                                     float distance;
    #                                     float distance_other;
    #                                     float solid_angle;
    #
    #                                     if (idt>=n*m*p)
    #                                     {
    #                                          return;
    #                                      }
    #
    #                                     __syncthreads();
    #                                     adjust_coef_shared[i_s] = adjust_coef[idt];
    #                                     a_temp = A[idt];
    #                                     b_temp = B[idt];
    #                                     c_temp = C[idt];
    #
    #                                     fov_cut_temp = fov_cut_matrix[idt];
    #
    #
    #
    #                                     for(int e=begin_event_gpu_limitation; e<end_event_gpu_limitation; e++)
    #                                     {
    #                                         if (fov_cut_temp!=0)
    #                                         {
    #                                         normal_value= a_normal[e]*a_temp+b_normal[e]*b_temp+c_normal[e] * c_temp-d_normal[e];
    #                                         d2_normal =  crystal_pitch_XY[e]* sqrt(a_normal[e]*a_normal[e]+b_normal[e]*b_normal[e]+c_normal[e]*c_normal[e]);
    #
    #                                         if (normal_value< d2_normal && normal_value >= -d2_normal)
    #                                         {
    #                                         value = a[e]*a_temp+b[e]*b_temp +c[e]*c_temp- d[e];
    #                                         d2 = crystal_pitch_Z[e] * sqrt(a[e]*a[e] + b[e]*b[e] + c[e]*c[e]);
    #
    #
    #                                             if (value < d2 && value >=-d2)
    #                                             {
    #                                                       value_cf = a_cf[e]*a_temp+b_cf[e]*b_temp +c_cf[e] * c_temp-d_cf[e];
    #                                                       d2_cf = distance_between_array_pixel*sqrt(a_cf[e]*a_cf[e]+b_cf[e]*b_cf[e]+c_cf[e]*c_cf[e]);
    #
    #
    #                                                         if (value_cf >= -d2_cf && value_cf<d2_cf)
    #                                                         {
    #                                                            if (sqrt(value*value+normal_value*normal_value+value_cf*value_cf)<=sqrt(d2*d2+d2_normal*d2_normal+d2_cf*d2_cf))
    #                                                            {
    #                                                            width = 2*(crystal_pitch_Z[e]- abs(value));
    #                                                            height = 2*(crystal_pitch_XY[e] - abs(normal_value));
    #                                                            distance =d2_cf+abs(value_cf);
    #                                                            distance_other =d2_cf-abs(value_cf);
    #                                                            solid_angle = 4*(width*width*height*height/(distance*distance*(4*distance*distance+width*width+height*height)));
    #                                                            if (sum_vor[e]!=0)
    #
    #                                                             {
    #                                                             adjust_coef_shared[i_s] += solid_angle/(sum_vor[e]);
    #                                                             }
    #                                                              /*
    #                                                              (4 * asin(sin(tan(width/distance))*sin(tan(height/distance))))*(4 * asin(sin(tan(width/distance))*sin(tan(height/distance))))/sum_vor[e];
    #                                                          adjust_coef_shared[i_s] += time_factor[e]*(1-(sqrt(value*value+normal_value*normal_value+value_cf*value_cf))/sqrt(d2*d2+d2_normal*d2_normal+d2_cf*d2_cf))/(sum_vor[e]);
    #
    #                                                              */
    #                                                               }
    #
    #                                                        }
    #
    #                                                             /*
    #                                                             (1-(sqrt(value*value+normal_value*normal_value+value_cf*value_cf))/sqrt(d2*d2+d2_normal*d2_normal+d2_cf*d2_cf))/
    #                                                             normal_value= a_normal[e]*a_temp+b_normal[e]*b_temp+c_normal[e] * c_temp-d_normal[e];
    #                                              d2_normal = error_pixel+crystal_pitch_XY * sqrt(a_normal[e]*a_normal[e]+b_normal[e]*b_normal[e]+c_normal[e]*c_normal[e]);
    #                                                            adjust_coef_shared[i_s] += /(sum_vor[e]*time_factor[e]);
    #                                                           adjust_coef_shared[i_s] += (1-(sqrt(value*value+normal_value*normal_value+value_cf*value_cf))/sqrt(d2*d2+d2_normal*d2_normal+d2_cf*d2_cf))/(sum_vor[0]*time_factor[e]);
    #                                                              */
    #
    #
    #
    #
    #
    #                                                  }
    #                                              }
    #
    #                                     }
    #
    #                                     }
    #
    #                                     adjust_coef[idt] = adjust_coef_shared[i_s];
    #                                     __syncthreads();
    #
    #
    #
    #                                 }
    #                                 """)
    #
    #     # float crystal_pitch, int number_of_events, float *a, float *b, float *c, float *d, int *A, int *B, int *C, float *im, float *vector_matrix
    #     # Host Code   B, C, im, vector_matrix,
    #     number_of_datasets = np.int32(1)  # Number of datasets (and concurrent operations) used.
    #     number_of_datasets_back = np.int32(1)  # Number of datasets (and concurrent operations) used.
    #     # Start concurrency Test
    #     # Event as reference point
    #     ref = cuda.Event()
    #     ref.record()
    #
    #     # Create the streams and events needed to calculation
    #     stream, event = [], []
    #     marker_names = ['kernel_begin', 'kernel_end']
    #     # Create List to allocate chunks of data
    #     A_cut_gpu, B_cut_gpu, C_cut_gpu = [None] * number_of_datasets, [None] * number_of_datasets, [
    #         None] * number_of_datasets
    #     A_cut, B_cut, C_cut = [None] * number_of_datasets, [None] * number_of_datasets, [None] * number_of_datasets
    #
    #     a_gpu, b_gpu, c_gpu, d_gpu = [None] * number_of_datasets, [None] * number_of_datasets, [
    #         None] * number_of_datasets, [None] * number_of_datasets
    #     a_normal_gpu, b_normal_gpu, c_normal_gpu, d_normal_gpu = [None] * number_of_datasets, [
    #         None] * number_of_datasets, [None] * number_of_datasets, [None] * number_of_datasets
    #
    #     a_cut, b_cut, c_cut, d_cut = [None] * number_of_datasets, [None] * number_of_datasets, [
    #         None] * number_of_datasets, [None] * number_of_datasets
    #     a_normal_cut, b_normal_cut, c_normal_cut, d_normal_cut = [None] * number_of_datasets, [
    #         None] * number_of_datasets, [None] * number_of_datasets, [None] * number_of_datasets
    #
    #     a_cf_cut, b_cf_cut, c_cf_cut, d_cf_cut = [None] * number_of_datasets, [
    #         None] * number_of_datasets, [None] * number_of_datasets, [None] * number_of_datasets
    #
    #     a_cut_gpu, b_cut_gpu, c_cut_gpu, d_cut_gpu = [None] * number_of_datasets, [None] * number_of_datasets, [
    #         None] * number_of_datasets, [None] * number_of_datasets
    #
    #     a_cut_normal_gpu, b_cut_normal_gpu, c_cut_normal_gpu, d_cut_normal_gpu = [None] * number_of_datasets, [
    #         None] * number_of_datasets, [None] * number_of_datasets, [None] * number_of_datasets
    #
    #     a_cf_cut_gpu, b_cf_cut_gpu, c_cf_cut_gpu, d_cf_cut_gpu = [None] * number_of_datasets, [
    #         None] * number_of_datasets, [None] * number_of_datasets, [None] * number_of_datasets
    #
    #     time_factor_cut_gpu = [None] * number_of_datasets
    #     sum_vor_gpu = [None] * number_of_datasets
    #     sum_vor_cut = [None] * number_of_datasets
    #     probability_cut = [None] * number_of_datasets
    #     probability_gpu = [None] * number_of_datasets
    #     adjust_coef_cut = [None] * number_of_datasets
    #     adjust_coef_gpu = [None] * number_of_datasets
    #     adjust_coef_pinned = [None] * number_of_datasets_back
    #     fov_cut_matrix_cutted_gpu = [None] * number_of_datasets_back
    #     fov_cut_matrix_cut = [None] * number_of_datasets
    #     # fov_cut_matrix_gpu = [None] * number_of_datasets
    #     sum_vor_pinned = [None] * number_of_datasets
    #
    #     distance_to_center_plane_cut = [None] * number_of_datasets
    #     distance_to_center_plane_gpu_cut = [None] * number_of_datasets
    #     distance_to_center_plane_normal_cut = [None] * number_of_datasets
    #     distance_to_center_plane_normal_gpu_cut = [None] * number_of_datasets
    #
    #     # Streams and Events creation
    #     for dataset in range(number_of_datasets):
    #         stream.append(cuda.Stream())
    #         event.append(dict([(marker_names[l], cuda.Event()) for l in range(len(marker_names))]))
    #
    #     # Foward Projection Memory Allocation
    #     # Variables that need an unique alocation
    #     # A_shappened = np.ascontiguousarray(A.reshape(A.shape[0]*A.shape[1]*A.shape[2]), dtype=np.int32)
    #     # B_shappened = np.ascontiguousarray(B.reshape(B.shape[0]*B.shape[1]*B.shape[2]), dtype=np.int32)
    #     # C_shappened = np.ascontiguousarray(C.reshape(C.shape[0]*C.shape[1]*C.shape[2]), dtype=np.int32)
    #     im_shappened = np.ascontiguousarray(im.reshape(im.shape[0] * im.shape[1] * im.shape[2]), dtype=np.float32)
    #     fov_cut_matrix_shappened = np.ascontiguousarray(
    #         fov_cut_matrix.reshape(fov_cut_matrix.shape[0] * fov_cut_matrix.shape[1] * fov_cut_matrix.shape[2]),
    #         dtype=np.byte)
    #
    #     # forward_projection_arrays_page_locked_memory_allocations = [A_gpu, B_gpu, C_gpu, im_gpu]
    #
    #     # A_gpu = cuda.mem_alloc(A_shappened.size * A_shappened.dtype.itemsize)
    #     # B_gpu = cuda.mem_alloc(B_shappened.size * B_shappened.dtype.itemsize)
    #     # C_gpu = cuda.mem_alloc(C_shappened.size * C_shappened.dtype.itemsize)
    #     im_gpu = cuda.mem_alloc(im_shappened.size * im_shappened.dtype.itemsize)
    #     fov_cut_matrix_gpu = cuda.mem_alloc(fov_cut_matrix_shappened.size * fov_cut_matrix_shappened.dtype.itemsize)
    #     # texref = mod_forward_projection_shared_mem.get_texref('tex')
    #     # texref.set_address(fov_cut_matrix_gpu, fov_cut_matrix_shappened.size * fov_cut_matrix_shappened.dtype.itemsize)
    #     # # texref.set_format(cuda.array_format.UNSIGNED_INT8, 1)
    #
    #     # cuda.memcpy_htod_async(A_gpu, A_shappened)
    #     # cuda.memcpy_htod_async(B_gpu, B_shappened)
    #     # cuda.memcpy_htod_async(C_gpu, C_shappened)
    #     cuda.memcpy_htod_async(im_gpu, im_shappened)
    #     cuda.memcpy_htod_async(fov_cut_matrix_gpu, fov_cut_matrix_shappened)
    #
    #     a_gpu = cuda.mem_alloc(a.size * a.dtype.itemsize)
    #     b_gpu = cuda.mem_alloc(b.size * b.dtype.itemsize)
    #     c_gpu = cuda.mem_alloc(c.size * c.dtype.itemsize)
    #     d_gpu = cuda.mem_alloc(d.size * d.dtype.itemsize)
    #     a_normal_gpu = cuda.mem_alloc(a_normal.size * a_normal.dtype.itemsize)
    #     b_normal_gpu = cuda.mem_alloc(b_normal.size * b_normal.dtype.itemsize)
    #     c_normal_gpu = cuda.mem_alloc(c_normal.size * c_normal.dtype.itemsize)
    #     d_normal_gpu = cuda.mem_alloc(d_normal.size * d_normal.dtype.itemsize)
    #
    #     a_cf_gpu = cuda.mem_alloc(a_cf.size * a_cf.dtype.itemsize)
    #     b_cf_gpu = cuda.mem_alloc(b_cf.size * b_cf.dtype.itemsize)
    #     c_cf_gpu = cuda.mem_alloc(c_cf.size * c_cf.dtype.itemsize)
    #     d_cf_gpu = cuda.mem_alloc(d_cf.size * d_cf.dtype.itemsize)
    #     sum_vor_t_gpu = cuda.mem_alloc(sum_vor.size * sum_vor.dtype.itemsize)
    #     distance_to_center_plane_gpu = cuda.mem_alloc(
    #         self.distance_to_center_plane.size * self.distance_to_center_plane.dtype.itemsize)
    #     distance_to_center_plane_normal_gpu = cuda.mem_alloc(
    #         self.distance_to_center_plane_normal.size * self.distance_to_center_plane_normal.dtype.itemsize)
    #
    #     time_factor_gpu = cuda.mem_alloc(time_factor.size * time_factor.dtype.itemsize)
    #     # Transfer memory to Optimizer
    #     cuda.memcpy_htod_async(a_gpu, a)
    #     cuda.memcpy_htod_async(b_gpu, b)
    #     cuda.memcpy_htod_async(c_gpu, c)
    #     cuda.memcpy_htod_async(d_gpu, d)
    #     cuda.memcpy_htod_async(a_normal_gpu, a_normal)
    #     cuda.memcpy_htod_async(b_normal_gpu, b_normal)
    #     cuda.memcpy_htod_async(c_normal_gpu, c_normal)
    #     cuda.memcpy_htod_async(d_normal_gpu, d_normal)
    #
    #     cuda.memcpy_htod_async(a_cf_gpu, a_cf)
    #     cuda.memcpy_htod_async(b_cf_gpu, b_cf)
    #     cuda.memcpy_htod_async(c_cf_gpu, c_cf)
    #     cuda.memcpy_htod_async(d_cf_gpu, d_cf)
    #     cuda.memcpy_htod_async(time_factor_gpu, time_factor)
    #     cuda.memcpy_htod_async(distance_to_center_plane_gpu, self.distance_to_center_plane)
    #     cuda.memcpy_htod_async(distance_to_center_plane_normal_gpu, self.distance_to_center_plane_normal)
    #
    #     for dataset in range(number_of_datasets):
    #         # if dataset == number_of_datasets:
    #         #     begin_dataset = np.int32(dataset * number_of_events / number_of_datasets)
    #         #     end_dataset = number_of_events
    #         #     adjust_coef_cut[dataset] = np.ascontiguousarray(
    #         #         adjust_coef[int(np.floor(im_cut_dim[0] * dataset)):adjust_coef.shape[0],
    #         #         int(np.floor(im_cut_dim[1] * dataset)):adjust_coef.shape[1],
    #         #         int(np.floor(im_cut_dim[2] * dataset)):adjust_coef.shape[2]],
    #         #         dtype=np.float32)
    #         #     fov_cut_matrix_cut[dataset] = np.ascontiguousarray(
    #         #         fov_cut_matrix[int(np.floor(im_cut_dim[0] * dataset)):fov_cut_matrix.shape[0],
    #         #         int(np.floor(im_cut_dim[1] * dataset)):fov_cut_matrix.shape[1],
    #         #         int(np.floor(im_cut_dim[2] * dataset)):fov_cut_matrix.shape[2]],
    #         #         dtype=np.float32)
    #         # else:
    #         begin_dataset = np.int32(dataset * number_of_events / number_of_datasets)
    #         end_dataset = np.int32((dataset + 1) * number_of_events / number_of_datasets)
    #
    #         # Cutting dataset
    #         # For forward projection the data is cutted by number of events. For backprojection is cutted per image pieces of image
    #         a_cut[dataset] = a[begin_dataset:end_dataset]
    #         b_cut[dataset] = b[begin_dataset:end_dataset]
    #         c_cut[dataset] = c[begin_dataset:end_dataset]
    #         d_cut[dataset] = d[begin_dataset:end_dataset]
    #         a_normal_cut[dataset] = a_normal[begin_dataset:end_dataset]
    #         b_normal_cut[dataset] = b_normal[begin_dataset:end_dataset]
    #         c_normal_cut[dataset] = c_normal[begin_dataset:end_dataset]
    #         d_normal_cut[dataset] = d_normal[begin_dataset:end_dataset]
    #
    #         a_cf_cut[dataset] = a_cf[begin_dataset:end_dataset]
    #         b_cf_cut[dataset] = b_cf[begin_dataset:end_dataset]
    #         c_cf_cut[dataset] = c_cf[begin_dataset:end_dataset]
    #         d_cf_cut[dataset] = d_cf[begin_dataset:end_dataset]
    #         sum_vor_cut[dataset] = sum_vor[begin_dataset:end_dataset]
    #         distance_to_center_plane_cut[dataset] = self.distance_to_center_plane[begin_dataset:end_dataset]
    #         distance_to_center_plane_normal_cut[dataset] = self.distance_to_center_plane_normal[
    #                                                        begin_dataset:end_dataset]
    #
    #         # Forward
    #         a_cut_gpu[dataset] = cuda.mem_alloc(a_cut[dataset].size * a_cut[dataset].dtype.itemsize)
    #         b_cut_gpu[dataset] = cuda.mem_alloc(b_cut[dataset].size * b_cut[dataset].dtype.itemsize)
    #         c_cut_gpu[dataset] = cuda.mem_alloc(c_cut[dataset].size * c_cut[dataset].dtype.itemsize)
    #         d_cut_gpu[dataset] = cuda.mem_alloc(d_cut[dataset].size * d_cut[dataset].dtype.itemsize)
    #         a_cut_normal_gpu[dataset] = cuda.mem_alloc(
    #             a_normal_cut[dataset].size * a_normal_cut[dataset].dtype.itemsize)
    #         b_cut_normal_gpu[dataset] = cuda.mem_alloc(
    #             b_normal_cut[dataset].size * b_normal_cut[dataset].dtype.itemsize)
    #         c_cut_normal_gpu[dataset] = cuda.mem_alloc(
    #             c_normal_cut[dataset].size * c_normal_cut[dataset].dtype.itemsize)
    #         d_cut_normal_gpu[dataset] = cuda.mem_alloc(
    #             d_normal_cut[dataset].size * d_normal_cut[dataset].dtype.itemsize)
    #
    #         a_cf_cut_gpu[dataset] = cuda.mem_alloc(a_cf_cut[dataset].size * a_cf_cut[dataset].dtype.itemsize)
    #         b_cf_cut_gpu[dataset] = cuda.mem_alloc(b_cf_cut[dataset].size * b_cf_cut[dataset].dtype.itemsize)
    #         c_cf_cut_gpu[dataset] = cuda.mem_alloc(c_cf_cut[dataset].size * c_cf_cut[dataset].dtype.itemsize)
    #         d_cf_cut_gpu[dataset] = cuda.mem_alloc(d_cf_cut[dataset].size * d_cf_cut[dataset].dtype.itemsize)
    #
    #         distance_to_center_plane_gpu_cut[dataset] = cuda.mem_alloc(
    #             distance_to_center_plane_cut[dataset].size * distance_to_center_plane_cut[dataset].dtype.itemsize)
    #         distance_to_center_plane_normal_gpu_cut[dataset] = cuda.mem_alloc(
    #             distance_to_center_plane_normal_cut[dataset].size * distance_to_center_plane_normal_cut[
    #                 dataset].dtype.itemsize)
    #
    #         sum_vor_gpu[dataset] = cuda.mem_alloc(sum_vor_cut[dataset].size * sum_vor_cut[dataset].dtype.itemsize)
    #
    #         sum_vor_pinned[dataset] = cuda.register_host_memory(sum_vor_cut[dataset])
    #         assert np.all(sum_vor_pinned[dataset] == sum_vor_cut[dataset])
    #         cuda.memcpy_htod_async(sum_vor_gpu[dataset], sum_vor_pinned[dataset], stream[dataset])
    #         # sum_vor_gpu[dataset] = np.intp(x.base.get_device_pointer())
    #         # cuda.memcpy_htod_async(probability_gpu[dataset], probability_cut[dataset])
    #         cuda.memcpy_htod_async(a_cut_gpu[dataset], a_cut[dataset])
    #         cuda.memcpy_htod_async(b_cut_gpu[dataset], b_cut[dataset])
    #         cuda.memcpy_htod_async(c_cut_gpu[dataset], c_cut[dataset])
    #         cuda.memcpy_htod_async(d_cut_gpu[dataset], d_cut[dataset])
    #         cuda.memcpy_htod_async(a_cut_normal_gpu[dataset], a_normal_cut[dataset])
    #         cuda.memcpy_htod_async(b_cut_normal_gpu[dataset], b_normal_cut[dataset])
    #         cuda.memcpy_htod_async(c_cut_normal_gpu[dataset], c_normal_cut[dataset])
    #         cuda.memcpy_htod_async(d_cut_normal_gpu[dataset], d_normal_cut[dataset])
    #
    #         cuda.memcpy_htod_async(a_cf_cut_gpu[dataset], a_cf_cut[dataset])
    #         cuda.memcpy_htod_async(b_cf_cut_gpu[dataset], b_cf_cut[dataset])
    #         cuda.memcpy_htod_async(c_cf_cut_gpu[dataset], c_cf_cut[dataset])
    #         cuda.memcpy_htod_async(d_cf_cut_gpu[dataset], d_cf_cut[dataset])
    #         cuda.memcpy_htod_async(distance_to_center_plane_gpu_cut[dataset], distance_to_center_plane_cut[dataset])
    #         cuda.memcpy_htod_async(distance_to_center_plane_normal_gpu_cut[dataset],
    #                                distance_to_center_plane_normal_cut[dataset])
    #
    #     adjust_coef = np.ascontiguousarray(adjust_coef.reshape(
    #         adjust_coef.shape[0] * adjust_coef.shape[1] * adjust_coef.shape[2]),
    #         dtype=np.float32)
    #     # fov_cut_matrix = np.ascontiguousarray(fov_cut_matrix.reshape(
    #     #     fov_cut_matrix.shape[0] * fov_cut_matrix.shape[1] * fov_cut_matrix.shape[2]),
    #     #     dtype=np.float32)
    #     A = np.ascontiguousarray(A.reshape(
    #         A.shape[0] * A.shape[1] * A.shape[2]),
    #         dtype=np.short)
    #     B = np.ascontiguousarray(B.reshape(
    #         B.shape[0] * B.shape[1] * B.shape[2]),
    #         dtype=np.short)
    #     C = np.ascontiguousarray(C.reshape(
    #         C.shape[0] * C.shape[1] * C.shape[2]),
    #         dtype=np.short)
    #
    #     # ---- Divide into datasets variables backprojection
    #     for dataset in range(number_of_datasets_back):
    #         voxels_division = adjust_coef.shape[0] // number_of_datasets_back
    #         adjust_coef_cut[dataset] = np.ascontiguousarray(
    #             adjust_coef[dataset * voxels_division:(dataset + 1) * voxels_division],
    #             dtype=np.float32)
    #
    #         fov_cut_matrix_cut[dataset] = np.ascontiguousarray(
    #             fov_cut_matrix_shappened[dataset * voxels_division:(dataset + 1) * voxels_division],
    #             dtype=np.byte)
    #
    #         A_cut[dataset] = np.ascontiguousarray(
    #             A[dataset * voxels_division:(dataset + 1) * voxels_division],
    #             dtype=np.short)
    #
    #         B_cut[dataset] = np.ascontiguousarray(
    #             B[dataset * voxels_division:(dataset + 1) * voxels_division],
    #             dtype=np.short)
    #
    #         C_cut[dataset] = np.ascontiguousarray(
    #             C[dataset * voxels_division:(dataset + 1) * voxels_division],
    #             dtype=np.short)
    #         # Backprojection
    #         adjust_coef_gpu[dataset] = cuda.mem_alloc(
    #             adjust_coef_cut[dataset].size * adjust_coef_cut[dataset].dtype.itemsize)
    #
    #         adjust_coef_pinned[dataset] = cuda.register_host_memory(adjust_coef_cut[dataset])
    #         assert np.all(adjust_coef_pinned[dataset] == adjust_coef_cut[dataset])
    #         cuda.memcpy_htod_async(adjust_coef_gpu[dataset], adjust_coef_pinned[dataset], stream[dataset])
    #
    #         fov_cut_matrix_cutted_gpu[dataset] = cuda.mem_alloc(
    #             fov_cut_matrix_cut[dataset].size * fov_cut_matrix_cut[dataset].dtype.itemsize)
    #
    #         A_cut_gpu[dataset] = cuda.mem_alloc(
    #             A_cut[dataset].size * A_cut[dataset].dtype.itemsize)
    #
    #         B_cut_gpu[dataset] = cuda.mem_alloc(
    #             B_cut[dataset].size * B_cut[dataset].dtype.itemsize)
    #
    #         C_cut_gpu[dataset] = cuda.mem_alloc(
    #             C_cut[dataset].size * C_cut[dataset].dtype.itemsize)
    #
    #         cuda.memcpy_htod_async(adjust_coef_gpu[dataset], adjust_coef_cut[dataset])
    #         cuda.memcpy_htod_async(fov_cut_matrix_cutted_gpu[dataset], fov_cut_matrix_cut[dataset])
    #         cuda.memcpy_htod_async(A_cut_gpu[dataset], A_cut[dataset])
    #         cuda.memcpy_htod_async(B_cut_gpu[dataset], B_cut[dataset])
    #         cuda.memcpy_htod_async(C_cut_gpu[dataset], C_cut[dataset])
    #
    #     free, total = cuda.mem_get_info()
    #
    #     print('%.1f %% of device memory is free.' % ((free / float(total)) * 100))
    #
    #     # -------------OSEM---------
    #     it = self.number_of_iterations
    #     subsets = self.number_of_subsets
    #     print('Number events for reconstruction: {}'.format(number_of_events))
    #
    #     im = np.ascontiguousarray(im.reshape(im.shape[0] * im.shape[1] * im.shape[2]), dtype=np.float32)
    #     for i in range(it):
    #         print('Iteration number: {}\n----------------'.format(i + 1))
    #         begin_event = np.int32(0)
    #         end_event = np.int32(number_of_events / subsets)
    #         for sb in range(subsets):
    #             print('Subset number: {}'.format(sb))
    #             number_of_events_subset = np.int32(end_event - begin_event)
    #             tic = time.time()
    #             # Cycle forward Projection
    #             for dataset in range(number_of_datasets):
    #
    #                 if dataset == number_of_datasets:
    #                     begin_dataset = np.int32(dataset * number_of_events_subset / number_of_datasets)
    #                     end_dataset = number_of_events_subset
    #                 else:
    #                     begin_dataset = np.int32(dataset * number_of_events_subset / number_of_datasets)
    #                     end_dataset = np.int32((dataset + 1) * number_of_events_subset / number_of_datasets)
    #
    #                 threadsperblock = (256, 1, 1)
    #                 blockspergrid_x = int(math.ceil(((end_dataset - begin_dataset)) / threadsperblock[0]))
    #                 blockspergrid_y = int(math.ceil(1 / threadsperblock[1]))
    #                 blockspergrid_z = int(math.ceil(1 / threadsperblock[2]))
    #                 blockspergrid = (blockspergrid_x, blockspergrid_y, blockspergrid_z)
    #                 event[dataset]['kernel_begin'].record(stream[dataset])
    #                 # depth = np.int32(5)
    #                 # weight = np.int32(A.shape[2]/2)
    #                 # height = np.int32(A.shape[2]/2)
    #                 if self.cdrf:
    #                     func_forward = mod_forward_projection_shared_mem_cdrf.get_function("forward_projection_cdrf")
    #                     func_forward(weight, height, depth, start_x, start_y, start_z,
    #                                  distance_to_center_plane_normal_gpu_cut[dataset],
    #                                  distance_to_center_plane_gpu_cut[dataset],
    #                                  half_distance_between_array_pixel,
    #                                  number_of_events, begin_dataset, end_dataset, a_cut_gpu[dataset],
    #                                  a_cut_normal_gpu[dataset], a_cf_cut_gpu[dataset],
    #                                  b_cut_gpu[dataset], b_cut_normal_gpu[dataset], b_cf_cut_gpu[dataset],
    #                                  c_cut_gpu[dataset], c_cut_normal_gpu[dataset],
    #                                  c_cf_cut_gpu[dataset],
    #                                  d_cut_gpu[dataset],
    #                                  d_cut_normal_gpu[dataset], d_cf_cut_gpu[dataset],
    #                                  sum_vor_gpu[dataset], fov_cut_matrix_gpu, im_gpu,
    #                                  block=threadsperblock,
    #                                  grid=blockspergrid,
    #                                  stream=stream[dataset])
    #                 else:
    #                     func_forward = mod_forward_projection_shared_mem.get_function("forward_projection")
    #                     func_forward(weight, height, depth, start_x, start_y, start_z, half_crystal_pitch_xy,
    #                                  half_crystal_pitch_z,
    #                                  half_distance_between_array_pixel,
    #                                  number_of_events, begin_dataset, end_dataset, a_cut_gpu[dataset],
    #                                  a_cut_normal_gpu[dataset], a_cf_cut_gpu[dataset],
    #                                  b_cut_gpu[dataset], b_cut_normal_gpu[dataset], b_cf_cut_gpu[dataset],
    #                                  c_cut_gpu[dataset], c_cut_normal_gpu[dataset],
    #                                  c_cf_cut_gpu[dataset],
    #                                  d_cut_gpu[dataset],
    #                                  d_cut_normal_gpu[dataset], d_cf_cut_gpu[dataset],
    #                                  sum_vor_gpu[dataset], fov_cut_matrix_gpu, im_gpu,
    #                                  block=threadsperblock,
    #                                  grid=blockspergrid,
    #                                  stream=stream[dataset])
    #
    #             # Sincronization of streams
    #             for dataset in range(number_of_datasets):  # Commenting out this line should break concurrency.
    #                 event[dataset]['kernel_end'].record(stream[dataset])
    #
    #             # Transfering data from Optimizer
    #             for dataset in range(number_of_datasets):
    #                 begin_dataset = np.int32(dataset * number_of_events / number_of_datasets)
    #                 end_dataset = np.int32((dataset + 1) * number_of_events / number_of_datasets)
    #                 # cuda.memcpy_dtoh_async(, sum_vor_gpu[dataset])
    #                 cuda.memcpy_dtoh_async(sum_vor_pinned[dataset], sum_vor_gpu[dataset], stream[dataset])
    #                 sum_vor[begin_dataset:end_dataset] = sum_vor_pinned[dataset]
    #                 # cuda.cudaStream.Synchronize(stream[dataset])
    #
    #                 toc = time.time()
    #
    #             cuda.Context.synchronize()
    #
    #             print('Time part Forward Projection {} : {}'.format(1, toc - tic))
    #             # number_of_datasets = np.int32(2)
    #             # teste = np.copy(sum_vor)
    #             # # sum_vor[sum_vor<1]=0
    #             # sum_vor = np.ascontiguousarray(teste, dtype=np.float32)
    #             # sum_vor=np.ascontiguousarray(np.ones((self.a.shape)), dtype=np.float32)
    #             print('SUM VOR: {}'.format(np.sum(sum_vor)))
    #             # cuda.memcpy_htod_async(sum_vor_t_gpu, sum_vor)
    #
    #             cuda.memcpy_htod_async(sum_vor_t_gpu, sum_vor)
    #
    #             # ------------BACKPROJECTION-----------
    #
    #             for dataset in range(number_of_datasets_back):
    #                 dataset = np.int32(dataset)
    #                 begin_dataset = np.int32(0)
    #                 end_dataset = np.int32(number_of_events_subset)
    #
    #                 # begin_dataset = np.int32(0)
    #                 # end_dataset = np.int32(number_of_events)
    #
    #                 event[dataset]['kernel_begin'].record(stream[dataset])
    #                 # weight_cutted, height_cutted, depth_cutted = np.int32(adjust_coef_cut[dataset].shape[0]), np.int32(
    #                 #     adjust_coef_cut[dataset].shape[1]), np.int32(adjust_coef_cut[dataset].shape[2])
    #                 weight_cutted, height_cutted, depth_cutted = np.int32(adjust_coef_cut[dataset].shape[0]), np.int32(
    #                     1), np.int32(1)
    #
    #                 number_of_voxels_thread = 128
    #                 threadsperblock = (np.int(number_of_voxels_thread), 1, 1)
    #                 blockspergrid_x = int(math.ceil(adjust_coef_cut[dataset].shape[0] / threadsperblock[0]))
    #                 blockspergrid_y = int(math.ceil(1 / threadsperblock[1]))
    #                 blockspergrid_z = int(math.ceil(1 / threadsperblock[2]))
    #                 # blockspergrid_y = int(math.ceil(adjust_coef_cut[dataset].shape[1] / threadsperblock[1]))
    #                 # blockspergrid_z = int(math.ceil(adjust_coef_cut[dataset].shape[2] / threadsperblock[2]))
    #                 blockspergrid = (blockspergrid_x, blockspergrid_y, blockspergrid_z)
    #                 shared_memory = threadsperblock[0] * threadsperblock[1] * threadsperblock[2] * 4
    #                 if self.cdrf:
    #                     if self.normalization_calculation_flag:
    #                         func_backward = mod_normalization_shared_mem_cdrf.get_function("normalization_cdrf")
    #
    #
    #                     else:
    #                         func_backward = mod_backward_projection_shared_mem_cdrf.get_function("backprojection_cdrf")
    #                     func_backward(dataset, weight_cutted, height_cutted, depth_cutted,
    #                                   distance_to_center_plane_normal_gpu,
    #                                   distance_to_center_plane_gpu, half_distance_between_array_pixel,
    #                                   number_of_events, begin_dataset, end_dataset, a_gpu, a_normal_gpu, a_cf_gpu,
    #                                   b_gpu, b_normal_gpu, b_cf_gpu, c_gpu, c_normal_gpu, c_cf_gpu, d_gpu,
    #                                   d_normal_gpu, d_cf_gpu, A_cut_gpu[dataset], B_cut_gpu[dataset],
    #                                   C_cut_gpu[dataset],
    #                                   adjust_coef_gpu[dataset],
    #                                   sum_vor_t_gpu, fov_cut_matrix_cutted_gpu[dataset], time_factor_gpu,
    #                                   block=threadsperblock,
    #                                   grid=blockspergrid,
    #                                   shared=np.int(4 * number_of_voxels_thread),
    #                                   stream=stream[dataset],
    #                                   )
    #                 else:
    #                     if self.normalization_calculation_flag:
    #                         func_backward = mod_normalization_shared_mem.get_function("normalization")
    #
    #
    #                     else:
    #                         func_backward = mod_backward_projection_shared_mem.get_function("backprojection")
    #
    #                     func_backward(dataset, weight_cutted, height_cutted, depth_cutted, half_crystal_pitch_xy,
    #                                   half_crystal_pitch_z, half_distance_between_array_pixel,
    #                                   number_of_events, begin_dataset, end_dataset, a_gpu, a_normal_gpu, a_cf_gpu,
    #                                   b_gpu, b_normal_gpu, b_cf_gpu, c_gpu, c_normal_gpu, c_cf_gpu, d_gpu,
    #                                   d_normal_gpu, d_cf_gpu, A_cut_gpu[dataset], B_cut_gpu[dataset],
    #                                   C_cut_gpu[dataset],
    #                                   adjust_coef_gpu[dataset],
    #                                   sum_vor_t_gpu, fov_cut_matrix_cutted_gpu[dataset], time_factor_gpu,
    #                                   block=threadsperblock,
    #                                   grid=blockspergrid,
    #                                   shared=np.int(4 * number_of_voxels_thread),
    #                                   stream=stream[dataset],
    #                                   )
    #
    #             for dataset in range(number_of_datasets_back):  # Commenting out this line should break concurrency.
    #                 event[dataset]['kernel_end'].record(stream[dataset])
    #
    #             for dataset in range(number_of_datasets_back):
    #                 cuda.memcpy_dtoh_async(adjust_coef_cut[dataset], adjust_coef_gpu[dataset])
    #                 adjust_coef[dataset * voxels_division:(dataset + 1) * voxels_division] = adjust_coef_cut[dataset]
    #
    #             cuda.Context.synchronize()
    #             print('Time part Backward Projection {} : {}'.format(1, time.time() - toc))
    #
    #             # Image Normalization
    #             # if i ==0:
    #             #     norm_im=np.copy(adjust_coef)
    #             #     norm_im=norm_im/np.max(norm_im)
    #             #     norm_im[norm_im == 0] = np.min(norm_im[np.nonzero(norm_im)])
    #             # normalization_matrix = gaussian_filter(normalization_matrix, 0.5)
    #
    #             # im_med = np.load("C:\\Users\\pedro.encarnacao\\OneDrive - Universidade de Aveiro\\PhD\\Reconstrução\\NAF+FDG\\Easypet Scan 05 Aug 2019 - 14h 36m 33s\\static_image\\Easypet Scan 05 Aug 2019 - 14h 36m 33s mlem.npy")
    #             # self.algorithm = "LM-MRP"
    #             if self.algorithm == "LM-MRP":
    #                 beta = self.algorithm_options[0]
    #                 kernel_filter_size = self.algorithm_options[1]
    #                 im_to_filter = im.reshape(weight, height, depth)
    #                 im_med = median_filter(im_to_filter, kernel_filter_size)
    #                 penalized_term = np.copy(im_to_filter)
    #                 penalized_term[im_med != 0] = 1 + beta * (im_to_filter[im_med != 0] - im_med[im_med != 0]) / im_med[
    #                     im_med != 0]
    #                 penalized_term = np.ascontiguousarray(penalized_term.reshape(weight * height * depth),
    #                                                       dtype=np.float32)
    #                 # penalized_term = np.ascontiguousarray(penalized_term, dtype=np.float32)
    #
    #             if self.algorithm == "MAP":
    #                 beta = 0.5
    #                 im_map = np.load(
    #                     "C:\\Users\\pedro.encarnacao\\OneDrive - Universidade de Aveiro\\PhD\\Reconstrução\\NAF+FDG\\Easypet Scan 05 Aug 2019 - 14h 36m 33s\\static_image\\Easypet Scan 05 Aug 2019 - 14h 36m 33s mlem.npy")
    #
    #             im[normalization_matrix != 0] = im[normalization_matrix != 0] * adjust_coef[
    #                 normalization_matrix != 0] / (normalization_matrix[normalization_matrix != 0])
    #             im[normalization_matrix == 0] = 0
    #             if self.algorithm == "LM-MRP":
    #                 im[penalized_term != 0] = im[penalized_term != 0] / penalized_term[penalized_term != 0]
    #             # im = fourier_gaussian(im, sigma=0.2)
    #             # im = gaussian_filter(im, 0.4)
    #             print('SUM IMAGE: {}'.format(np.sum(im)))
    #             im = np.ascontiguousarray(im, dtype=np.float32)
    #             # im = im * adjust_coef / sensivity_matrix[np.nonzero(sensivity_matrix)]
    #             cuda.memcpy_htod_async(im_gpu, im)
    #
    #             # Clearing variables
    #             sum_vor = np.ascontiguousarray(
    #                 np.zeros(self.a.shape, dtype=np.float32))
    #
    #             adjust_coef = np.ascontiguousarray(
    #                 np.zeros((self.number_of_pixels_x * self.number_of_pixels_y * self.number_of_pixels_z),
    #                          dtype=np.float32))
    #
    #             for dataset in range(number_of_datasets):
    #                 # if dataset == number_of_datasets:
    #                 #     begin_dataset = np.int32(dataset * number_of_events / number_of_datasets)
    #                 #     end_dataset = number_of_events
    #                 #
    #                 # else:
    #                 begin_dataset = np.int32(dataset * number_of_events / number_of_datasets)
    #                 end_dataset = np.int32((dataset + 1) * number_of_events / number_of_datasets)
    #                 # adjust_coef_cut[dataset] = np.ascontiguousarray(adjust_coef[:, :,
    #                 #                                                 int(np.floor(im_cut_dim[2] * dataset)):int(
    #                 #                                                     np.floor(im_cut_dim[2] * (dataset + 1)))],
    #                 #                                                 dtype=np.float32)
    #
    #                 sum_vor_cut[dataset] = sum_vor[begin_dataset:end_dataset]
    #                 # cuda.memcpy_htod_async(sum_vor_gpu[dataset], sum_vor_cut[dataset])
    #                 sum_vor_pinned[dataset] = cuda.register_host_memory(sum_vor_cut[dataset])
    #                 assert np.all(sum_vor_pinned[dataset] == sum_vor_cut[dataset])
    #                 cuda.memcpy_htod_async(sum_vor_gpu[dataset], sum_vor_pinned[dataset], stream[dataset])
    #
    #             for dataset in range(number_of_datasets_back):
    #                 adjust_coef_cut[dataset] = adjust_coef[dataset * voxels_division:(dataset + 1) * voxels_division]
    #                 adjust_coef_pinned[dataset] = cuda.register_host_memory(adjust_coef_cut[dataset])
    #                 assert np.all(adjust_coef_pinned[dataset] == adjust_coef_cut[dataset])
    #                 cuda.memcpy_htod_async(adjust_coef_gpu[dataset], adjust_coef_pinned[dataset], stream[dataset])
    #
    #             if self.saved_image_by_iteration:
    #                 im = im.reshape(weight, height, depth)
    #                 self._save_image_by_it(im, i, sb)
    #
    #             if self.signals_interface is not None:
    #                 self.signals_interface.trigger_update_label_reconstruction_status.emit(
    #                     "{}: Iteration {}".format(self.current_info_step, i + 1))
    #                 self.signals_interface.trigger_progress_reconstruction_partial.emit(
    #                     int(np.round(100 * (i + 1) * (sb + subsets) / (it * subsets), 0)))
    #
    #     im = im.reshape(weight, height, depth)
    #     return im * subsets

    def _vor_design_gpu_shared_memory_multiple_reads_DOI(self, a, a_normal, a_cf, A, b, b_normal, b_cf, B, c, c_normal,
                                                         c_cf, C,
                                                         d, d_normal, d_cf, adjust_coef, im, half_crystal_pitch_xy,
                                                         half_crystal_pitch_z,
                                                         sum_vor, fov_cut_matrix, normalization_matrix, time_factor):
        print('Optimizer STARTED - Multiple reads')
        # cuda.init()
        cuda = self.cuda_drv
        # device = cuda.Device(0)  # enter your gpu id here
        # ctx = device.make_context()
        number_of_events = np.int32(len(a))
        weight = np.int32(A.shape[0])
        height = np.int32(A.shape[1])
        depth = np.int32(A.shape[2])
        # start_x = np.int32(A[0, 0, 0])
        start_x = np.int32(A[0, 0, 0])
        start_y = np.int32(B[0, 0, 0])
        start_z = np.int32(C[0, 0, 0])
        print("Start_point: {},{},{}".format(start_x, start_y, start_z))
        print('Image size: {},{}, {}'.format(weight, height, depth))

        half_distance_between_array_pixel = np.float32(self.distance_between_array_pixel / 2)
        normalization_matrix = normalization_matrix.reshape(
            normalization_matrix.shape[0] * normalization_matrix.shape[1] * normalization_matrix.shape[2])

        # SOURCE MODELS (DEVICE CODE)

        mod_forward_projection_shared_mem = SourceModule("""
        #include <stdint.h>

        texture<char, 1> tex;


        __global__ void forward_projection
        (const int n, const int m, const int p, const int start_x, const int start_y, const int start_z,
        const float crystal_pitch_XY,const float crystal_pitch_Z,const float distance_between_array_pixel,
        int number_of_events, const int begin_event_gpu_limitation,  const int end_event_gpu_limitation, 
        float *a, float *a_normal, float *a_cf, float *b,float *b_normal, float *b_cf,float *c,float *c_normal, 
        float *c_cf, float *d, float *d_normal, float *d_cf, float *sum_vor, const char *fov_cut_matrix, const float *im_old)
                               {
                                  const int shared_memory_size = 256;
                                   __shared__ float a_shared[shared_memory_size];
                                   __shared__ float b_shared[shared_memory_size];
                                   __shared__ float c_shared[shared_memory_size];
                                   __shared__ float d_shared[shared_memory_size];
                                   __shared__ float a_normal_shared[shared_memory_size];
                                   __shared__ float b_normal_shared[shared_memory_size];
                                   __shared__ float c_normal_shared[shared_memory_size];
                                   __shared__ float d_normal_shared[shared_memory_size];
                                   __shared__ float a_cf_shared[shared_memory_size];
                                   __shared__ float b_cf_shared[shared_memory_size];
                                   __shared__ float c_cf_shared[shared_memory_size];
                                   __shared__ float d_cf_shared[shared_memory_size];
                                   __shared__ float sum_vor_shared[shared_memory_size];
                                   __shared__ char fov_cut_matrix_shared[shared_memory_size];
                                   /*
                                  const int blockId = blockIdx.x+ blockIdx.y * gridDim.x+ gridDim.x * gridDim.y * blockIdx.z;
                                  const int threadId= blockId * (blockDim.x * blockDim.y * blockDim.z)+ (threadIdx.z * (blockDim.x * blockDim.y))+ (threadIdx.y * blockDim.x)+ threadIdx.x;
                                  */
                                  int threadId=blockIdx.x *blockDim.x + threadIdx.x;
                                  int e;

                                  float d2;
                                  float d2_normal;
                                  float d2_cf;
                                  float value;
                                  float value_normal;
                                  float value_cf;
                                  float sum_vor_temp;
                                  int index;                                  
                                  int x_t;
                                  int y_t;
                                  int z_t;
                                  float max_distance_projector;     
                                  const int number_events_max = end_event_gpu_limitation-begin_event_gpu_limitation;
                                  const float error_pixel = 0.0000f;
                                   if (threadIdx.x>shared_memory_size)
                                   {
                                   return;
                                   }
                                   if(threadId>number_events_max)
                                   {
                                   return;
                                   }

                                  __syncthreads();
                                  e = threadId;
                                  int e_m = threadIdx.x;
                                  a_shared[e_m] = a[e];
                                  b_shared[e_m] = b[e];
                                  c_shared[e_m] = c[e];
                                  d_shared[e_m] = d[e];
                                  a_normal_shared[e_m] = a_normal[e];
                                  b_normal_shared[e_m] = b_normal[e];
                                  c_normal_shared[e_m] = c_normal[e];
                                  d_normal_shared[e_m] = d_normal[e];
                                  a_cf_shared[e_m] = a_cf[e];
                                  b_cf_shared[e_m] = b_cf[e];
                                  c_cf_shared[e_m] = c_cf[e];
                                  d_cf_shared[e_m] = d_cf[e];
                                  sum_vor_shared[e_m] = sum_vor[e];   

                                  d2_normal = crystal_pitch_XY * sqrt(a_normal_shared[e_m]*a_normal_shared[e_m]+b_normal_shared[e_m]*b_normal_shared[e_m]+c_normal_shared[e_m]*c_normal_shared[e_m]); 
                                  d2 = crystal_pitch_Z * sqrt(a_shared[e_m]*a_shared[e_m] + b_shared[e_m]*b_shared[e_m] + c_shared[e_m]*c_shared[e_m]);
                                  d2_cf = distance_between_array_pixel*sqrt(a_cf_shared[e_m]*a_cf_shared[e_m]+b_cf_shared[e_m]*b_cf_shared[e_m]+c_cf_shared[e_m]*c_cf_shared[e_m]);
                                  max_distance_projector=sqrt(d2*d2+d2_normal*d2_normal+d2_cf*d2_cf);                             

                                  for(int l=0; l<n; l++)
                                  {  
                                      x_t = l+start_x;
                                      for(int j=0; j<m; j++)
                                      {  
                                         y_t = j+start_y;
                                         for(int k=0; k<p; k++)
                                              {
                                              /*
                                              index = l+j*n+k*m*n;                                              
                                              fov_cut_matrix_shared[k]=fov_cut_matrix[k+j*p+l*p*m];
                                              index = l+j*n+k*m*n; 
                                              */
                                                index = k+j*p+l*p*m;                                                


                                              if (fov_cut_matrix[index]!=0)
                                              {                                                

                                                  z_t = k+start_z;
                                                  value_normal = a_normal_shared[e_m]*x_t+b_normal_shared[e_m]*y_t+c_normal_shared[e_m] * z_t -d_normal_shared[e_m];                                                 

                                                   if (value_normal < d2_normal && value_normal >= -d2_normal)
                                                   {
                                                    value = a_shared[e_m]*x_t+b_shared[e_m]*y_t+c_shared[e_m]*z_t-d_shared[e_m];

                                                    if (value < d2 && value >=-d2 )
                                                     {
                                                          value_cf = a_cf_shared[e_m]*x_t+b_cf_shared[e_m]*y_t+c_cf_shared[e_m]*z_t-d_cf_shared[e_m];


                                                      if (value_cf >= -d2_cf && value_cf<d2_cf)
                                                          {

                                                           sum_vor_temp +=im_old[index]*(1-sqrt(value*value+value_normal*value_normal+value_cf*value_cf)/max_distance_projector);
                                                           /* im_old[index]*(1-sqrt(value*value+value_normal*value_normal+value_cf*value_cf)/max_distance_projector)

                                                            */
                                                            }


                                                  }




                                              }
                                              }

                                           }
                                       }
                                   }

                                   /*
                                    sum_vor[e]= sum_vor_temp;
                                   sum_vor[e]= sum_vor_shared[e_m];
                                   __syncthreads();
                                   sum_vor[e]= sum_vor_shared[e_m];
                                   */
                                   sum_vor[e]= sum_vor_temp;



               }""")
        mod_normalization_shared_mem = SourceModule("""
                             #include <stdint.h>
                             texture<uint8_t, 1> tex;

                      __global__ void normalization
                       ( int dataset_number, int n, int m, int p, const float crystal_pitch_XY, const float crystal_pitch_Z, 
                       const float distance_between_array_pixel, int number_of_events, const int begin_event_gpu_limitation, 
                       const int end_event_gpu_limitation, const float *a, const float *a_normal, const float *a_cf, const float *b,
                       const float *b_normal, const float *b_cf, const float *c, const float *c_normal, const float *c_cf, const float *d, const float *d_normal,
                       const float *d_cf, const short *A, const short *B, const short *C, float *adjust_coef, float *sum_vor, 
                       char *fov_cut_matrix, float *time_factor)
                       {
                                 extern __shared__ float adjust_coef_shared[];      
                                 int idt=blockIdx.x *blockDim.x + threadIdx.x;                            


                                 float d2;
                                 float d2_normal;
                                 float d2_cf;
                                 float normal_value;
                                 float value;
                                 float value_cf;                                              
                                 short a_temp;
                                 short b_temp;
                                 short c_temp;
                                 char fov_cut_temp;   
                                 int i_s=threadIdx.x;

                                 if (idt>n*m*p)
                                 {
                                      return;
                                  }

                                 __syncthreads();
                                 adjust_coef_shared[i_s] = adjust_coef[idt];
                                 a_temp = A[idt];
                                 b_temp = B[idt];
                                 c_temp = C[idt];
                                 fov_cut_temp = fov_cut_matrix[idt];



                                 for(int e=begin_event_gpu_limitation; e<end_event_gpu_limitation; e++)
                                 {
                                     if (fov_cut_temp!=0)
                                     {   
                                     normal_value= a_normal[e]*a_temp+b_normal[e]*b_temp+c_normal[e] * c_temp-d_normal[e];
                                     d2_normal = crystal_pitch_XY * sqrt(a_normal[e]*a_normal[e]+b_normal[e]*b_normal[e]+c_normal[e]*c_normal[e]);

                                     if (normal_value< d2_normal && normal_value >= -d2_normal)
                                     {
                                     value = a[e]*a_temp+b[e]*b_temp +c[e]*c_temp- d[e];
                                     d2 = crystal_pitch_Z * sqrt(a[e]*a[e] + b[e]*b[e] + c[e]*c[e]);


                                         if (value < d2 && value >=-d2)
                                         {
                                                   value_cf = a_cf[e]*a_temp+b_cf[e]*b_temp +c_cf[e] * c_temp-d_cf[e];
                                                   d2_cf = distance_between_array_pixel*sqrt(a_cf[e]*a_cf[e]+b_cf[e]*b_cf[e]+c_cf[e]*c_cf[e]);


                                                     if (value_cf >= -d2_cf && value_cf<d2_cf)
                                                     {    


                                                            adjust_coef_shared[i_s] += (1-(sqrt(value*value+normal_value*normal_value+value_cf*value_cf))/sqrt(d2*d2+d2_normal*d2_normal+d2_cf*d2_cf));


                                                    }

                                                         /*

                                                         adjust_coef_shared[i_s] += (1-(sqrt(value*value+normal_value*normal_value+value_cf*value_cf))/sqrt(d2*d2+d2_normal*d2_normal+d2_cf*d2_cf));
                                                         adjust_coef_shared[i_s] += 1/sum_vor[e];  
                                                         normal_value= a_normal[e]*a_temp+b_normal[e]*b_temp+c_normal[e] * c_temp-d_normal[e];
                                          d2_normal = error_pixel+crystal_pitch_XY * sqrt(a_normal[e]*a_normal[e]+b_normal[e]*b_normal[e]+c_normal[e]*c_normal[e]);
                                                        adjust_coef_shared[i_s] += /(sum_vor[e]*time_factor[e]);  
                                                       adjust_coef_shared[i_s] += (1-(sqrt(value*value+normal_value*normal_value+value_cf*value_cf))/sqrt(d2*d2+d2_normal*d2_normal+d2_cf*d2_cf))/(sum_vor[0]*time_factor[e]);  
                                                          */





                                              }
                                          }

                                 }

                                 }

                                 adjust_coef[idt] = adjust_coef_shared[i_s];
                                 __syncthreads();



                             }
                             """)
        mod_backward_projection_shared_mem = SourceModule("""
                      #include <stdint.h>
                      texture<uint8_t, 1> tex;

               __global__ void backprojection
                ( int dataset_number, int n, int m, int p, const float crystal_pitch_XY, const float crystal_pitch_Z, 
                const float distance_between_array_pixel, int number_of_events, const int begin_event_gpu_limitation, 
                const int end_event_gpu_limitation, const float *a, const float *a_normal, const float *a_cf, const float *b,
                const float *b_normal, const float *b_cf, const float *c, const float *c_normal, const float *c_cf, const float *d, const float *d_normal,
                const float *d_cf, const short *A, const short *B, const short *C, float *adjust_coef, float *sum_vor, 
                char *fov_cut_matrix, float *time_factor)
                {
                          extern __shared__ float adjust_coef_shared[];      
                          int idt=blockIdx.x *blockDim.x + threadIdx.x;                            


                          float d2;
                          float d2_normal;
                          float d2_cf;
                          float normal_value;
                          float value;
                          float value_cf;                                              
                          short a_temp;
                          short b_temp;
                          short c_temp;
                          char fov_cut_temp;   
                          int i_s=threadIdx.x;

                          if (idt>n*m*p)
                          {
                               return;
                           }

                          __syncthreads();
                          adjust_coef_shared[i_s] = adjust_coef[idt];
                          a_temp = A[idt];
                          b_temp = B[idt];
                          c_temp = C[idt];
                          fov_cut_temp = fov_cut_matrix[idt];



                          for(int e=begin_event_gpu_limitation; e<end_event_gpu_limitation; e++)
                          {
                              if (fov_cut_temp!=0)
                              {   
                              normal_value= a_normal[e]*a_temp+b_normal[e]*b_temp+c_normal[e] * c_temp-d_normal[e];
                              d2_normal = crystal_pitch_XY * sqrt(a_normal[e]*a_normal[e]+b_normal[e]*b_normal[e]+c_normal[e]*c_normal[e]);

                              if (normal_value< d2_normal && normal_value >= -d2_normal)
                              {
                              value = a[e]*a_temp+b[e]*b_temp +c[e]*c_temp- d[e];
                              d2 = crystal_pitch_Z * sqrt(a[e]*a[e] + b[e]*b[e] + c[e]*c[e]);


                                  if (value < d2 && value >=-d2)
                                  {
                                            value_cf = a_cf[e]*a_temp+b_cf[e]*b_temp +c_cf[e] * c_temp-d_cf[e];
                                            d2_cf = distance_between_array_pixel*sqrt(a_cf[e]*a_cf[e]+b_cf[e]*b_cf[e]+c_cf[e]*c_cf[e]);


                                              if (value_cf >= -d2_cf && value_cf<d2_cf)
                                              {    



                                                    adjust_coef_shared[i_s] += (1-(sqrt(value*value+normal_value*normal_value+value_cf*value_cf))/sqrt(d2*d2+d2_normal*d2_normal+d2_cf*d2_cf))/(sum_vor[e]*time_factor[e]);

                                             }

                                                  /*

                                                  adjust_coef_shared[i_s] += (1-(sqrt(value*value+normal_value*normal_value+value_cf*value_cf))/sqrt(d2*d2+d2_normal*d2_normal+d2_cf*d2_cf))/(sum_vor[e]*time_factor[e]);
                                                  adjust_coef_shared[i_s] += 1/sum_vor[e];  
                                                  normal_value= a_normal[e]*a_temp+b_normal[e]*b_temp+c_normal[e] * c_temp-d_normal[e];
                                   d2_normal = error_pixel+crystal_pitch_XY * sqrt(a_normal[e]*a_normal[e]+b_normal[e]*b_normal[e]+c_normal[e]*c_normal[e]);
                                                 adjust_coef_shared[i_s] += /(sum_vor[e]*time_factor[e]);  
                                                adjust_coef_shared[i_s] += (1-(sqrt(value*value+normal_value*normal_value+value_cf*value_cf))/sqrt(d2*d2+d2_normal*d2_normal+d2_cf*d2_cf))/(sum_vor[0]*time_factor[e]);  
                                                   */





                                       }
                                   }

                          }

                          }

                          adjust_coef[idt] = adjust_coef_shared[i_s];
                          __syncthreads();



                      }
                      """)
        mod_forward_projection_shared_mem_cdrf = SourceModule("""
               #include <stdint.h>

               texture<char, 1> tex;


               __global__ void forward_projection_cdrf
               (const int n, const int m, const int p, const int start_x, const int start_y, const int start_z,
               float *crystal_pitch_XY, float *crystal_pitch_Z,const float distance_between_array_pixel,
               int number_of_events, const int begin_event_gpu_limitation,  const int end_event_gpu_limitation, 
               float *a, float *a_normal, float *a_cf, float *b,float *b_normal, float *b_cf,float *c,float *c_normal, 
               float *c_cf, float *d, float *d_normal, float *d_cf, float *sum_vor, const char *fov_cut_matrix, 
               const float *im_old, float* m_values, float* b_values, float* m_values_at, float* b_values_at,
                                     float* max_D, float* inflex_points_x, float* linear_attenuation_A,
                                     float* linear_attenuation_B)
                                      {
                                         const int shared_memory_size = 256;
                                          __shared__ float a_shared[shared_memory_size];
                                          __shared__ float b_shared[shared_memory_size];
                                          __shared__ float c_shared[shared_memory_size];
                                          __shared__ float d_shared[shared_memory_size];
                                          __shared__ float a_normal_shared[shared_memory_size];
                                          __shared__ float b_normal_shared[shared_memory_size];
                                          __shared__ float c_normal_shared[shared_memory_size];
                                          __shared__ float d_normal_shared[shared_memory_size];
                                          __shared__ float a_cf_shared[shared_memory_size];
                                          __shared__ float b_cf_shared[shared_memory_size];
                                          __shared__ float c_cf_shared[shared_memory_size];
                                          __shared__ float d_cf_shared[shared_memory_size];
                                          __shared__ float sum_vor_shared[shared_memory_size];
                                          __shared__ char fov_cut_matrix_shared[shared_memory_size];
                                          __shared__ float crystal_pitch_Z_shared[shared_memory_size];
                                          __shared__ float crystal_pitch_XY_shared[shared_memory_size];
                                          __shared__ float m_values_init_shared[shared_memory_size];
                                          __shared__ float m_values_end_shared[shared_memory_size];
                                          __shared__ float b_values_init_shared[shared_memory_size];
                                          __shared__ float b_values_end_shared[shared_memory_size];
                                          __shared__ float m_values_at_shared[shared_memory_size];
                                          __shared__ float b_values_at_shared[shared_memory_size];
                                          __shared__ float max_D_shared[shared_memory_size];
                                          __shared__ float inflex_points_x_init_shared[shared_memory_size];
                                          __shared__ float inflex_points_x_end_shared[shared_memory_size];
                                          __shared__ float linear_attenuation_A_shared[shared_memory_size];
                                          __shared__ float linear_attenuation_B_shared[shared_memory_size];
                                          
                                          /*
                                         const int blockId = blockIdx.x+ blockIdx.y * gridDim.x+ gridDim.x * gridDim.y * blockIdx.z;
                                         const int threadId= blockId * (blockDim.x * blockDim.y * blockDim.z)+ (threadIdx.z * (blockDim.x * blockDim.y))+ (threadIdx.y * blockDim.x)+ threadIdx.x;
                                         */
                                         int threadId=blockIdx.x *blockDim.x + threadIdx.x;
                                         int e;

                                         float d2;
                                         float d2_normal;
                                         float d2_cf;
                                         float value;
                                         float value_normal;
                                         float value_cf;
                                         float sum_vor_temp;
                                         int index;                                  
                                         int x_t;
                                         int y_t;
                                         int z_t;
                                         float width;
                                         float height;
                                         float distance;
                                         float distance_other; 
                                         float distance_crystal;
                                         float distance_at;
                                         float idrf;
                                         


                                         float max_distance_projector;

                                         const int number_events_max = end_event_gpu_limitation-begin_event_gpu_limitation;
                                         const float error_pixel = 0.0000f;
                                          if (threadIdx.x>shared_memory_size)
                                          {
                                          return;
                                          }
                                          if(threadId>number_events_max)
                                          {
                                          return;
                                          }

                                         __syncthreads();
                                         e = threadId;
                                         int e_m = threadIdx.x;
                                         a_shared[e_m] = a[e];
                                         b_shared[e_m] = b[e];
                                         c_shared[e_m] = c[e];
                                         d_shared[e_m] = d[e];
                                         a_normal_shared[e_m] = a_normal[e];
                                         b_normal_shared[e_m] = b_normal[e];
                                         c_normal_shared[e_m] = c_normal[e];
                                         d_normal_shared[e_m] = d_normal[e];
                                         a_cf_shared[e_m] = a_cf[e];
                                         b_cf_shared[e_m] = b_cf[e];
                                         c_cf_shared[e_m] = c_cf[e];
                                         d_cf_shared[e_m] = d_cf[e];
                                         sum_vor_shared[e_m] = sum_vor[e];
                                         crystal_pitch_Z_shared[e_m] =  crystal_pitch_Z[e];  
                                         crystal_pitch_XY_shared[e_m] =  crystal_pitch_XY[e];  
                                         m_values_init_shared[e_m] =  m_values[2*e];  
                                         m_values_end_shared[e_m] =  m_values[2*e+1];  
                                         b_values_init_shared[e_m] =  b_values[2*e];  
                                         b_values_end_shared[e_m] =  b_values[2*e+1];  
                                         m_values_at_shared[e_m] =  m_values_at[e];  
                                         b_values_at_shared[e_m] =  b_values_at[e];  
                                         max_D_shared[e_m] = max_D[e];  
                                         inflex_points_x_init_shared[e_m] =  inflex_points_x[2*e];  
                                         inflex_points_x_end_shared[e_m] =  inflex_points_x[2*e+1];  
                                         linear_attenuation_A_shared[e_m] =  linear_attenuation_A[e];  
                                         linear_attenuation_B_shared[e_m] =  linear_attenuation_B[e];  

                                         d2_normal = crystal_pitch_XY_shared[e_m] * sqrt(a_normal_shared[e_m]*a_normal_shared[e_m]+b_normal_shared[e_m]*b_normal_shared[e_m]+c_normal_shared[e_m]*c_normal_shared[e_m]); 
                                         d2 = crystal_pitch_Z_shared[e_m]* sqrt(a_shared[e_m]*a_shared[e_m] + b_shared[e_m]*b_shared[e_m] + c_shared[e_m]*c_shared[e_m]);
                                         d2_cf = distance_between_array_pixel*sqrt(a_cf_shared[e_m]*a_cf_shared[e_m]+b_cf_shared[e_m]*b_cf_shared[e_m]+c_cf_shared[e_m]*c_cf_shared[e_m]);
                                         max_distance_projector=sqrt(d2*d2+d2_normal*d2_normal+d2_cf*d2_cf);                             

                                         for(int l=0; l<n; l++)
                                         {  
                                             x_t = l+start_x;
                                             for(int j=0; j<m; j++)
                                             {  
                                                y_t = j+start_y;
                                                for(int k=0; k<p; k++)
                                                     {
                                                     /*
                                                     index = l+j*n+k*m*n;                                              
                                                     fov_cut_matrix_shared[k]=fov_cut_matrix[k+j*p+l*p*m];
                                                     index = l+j*n+k*m*n; 
                                                     */
                                                       index = k+j*p+l*p*m;                                                


                                                     if (fov_cut_matrix[index]!=0)
                                                     {                                                

                                                         z_t = k+start_z;
                                                         value_normal = a_normal_shared[e_m]*x_t+b_normal_shared[e_m]*y_t+c_normal_shared[e_m] * z_t -d_normal_shared[e_m];                                                 

                                                          if (value_normal < d2_normal && value_normal >= -d2_normal)
                                                          {
                                                           value = a_shared[e_m]*x_t+b_shared[e_m]*y_t+c_shared[e_m]*z_t-d_shared[e_m];

                                                           if (value < d2 && value >=-d2 )
                                                            {
                                                                 value_cf = a_cf_shared[e_m]*x_t+b_cf_shared[e_m]*y_t+c_cf_shared[e_m]*z_t-d_cf_shared[e_m];


                                                             if (value_cf >= -d2_cf && value_cf<d2_cf)
                                                                 {
                                                                 if (sqrt(value*value+value_normal*value_normal+value_cf*value_cf)<=max_distance_projector)
                                                                 {      
                                                                     width = 2*(crystal_pitch_Z_shared[e_m]  - abs(value));
                                                                    height = 2*(crystal_pitch_XY_shared[e_m] - abs(value_normal));

                                                                     distance =d2_cf+abs(value_cf);
                                                                    distance_other = abs(d2_cf+value_cf);
                                                                        
                                                                    distance_at = m_values_at_shared[e_m]*value+b_values_at_shared[e_m];
                                                                 
                                                                      
                                                                if(value<=inflex_points_x_init_shared[e_m])
                                                                {
                                                                    distance_crystal = m_values_init_shared[e_m]*value+b_values_init_shared[e_m];
                                                                    distance_at = 0;
                                                                }
                                                                 
                                                             
                                                                else if(value>=inflex_points_x_end_shared[e_m])
                                                                {
                                                                    distance_crystal = m_values_end_shared[e_m]*value+b_values_end_shared[e_m];
                                                                }
                                                                
                                                                else 
                                                                {
                                                                    distance_crystal = max_D_shared[e_m];
                                                                    
                                                                    
                                                                }
                                                                
                                                              
                                                              
                                                               idrf=(1-exp(-linear_attenuation_A_shared[e_m]*distance_crystal))*exp(-linear_attenuation_A_shared[e_m]*distance_at);
                                                               if (idrf<0)
                                                               {
                                                              
                                                               idrf = 0;
                                                               }
                                                                
                                                              
                                                              
                                                                
                                                                  sum_vor_shared[e_m]+= idrf*im_old[index];
                                                                  
                                                                  

                                                                  }
                                                                  /*
                                                                  pow(4*atan(width*height/(2*distance*sqrt(4*distance*distance+width*width+height*height))),2)
                                                                  sum_vor_shared[e_m]+= im_old[index]*(1-sqrt(value*value+value_normal*value_normal+value_cf*value_cf)/max_distance_projector);
                                                                  4*arctan(
                                                                  4*np.arctan(width*height/(2*distance_to_anh*np.sqrt(4*distance_between_array_pixel-value_cf)**2+width**2+height**2)))
                                                                  *(1-sqrt(value*value+value_normal*value_normal+value_cf*value_cf)/max_distance_projector)
                                                                  */
                                                                   }


                                                         }




                                                     }
                                                     }

                                                  }
                                              }
                                          }

                                          /*
                                           sum_vor[e]= sum_vor_temp;
                                          sum_vor[e]= sum_vor_shared[e_m];


                                          sum_vor[e]= sum_vor_temp;
                                          */
                                          __syncthreads();
                                          sum_vor[e]= sum_vor_shared[e_m];



                      }""")

        mod_normalization_shared_mem_cdrf = SourceModule("""
        #include <stdint.h>
        
        texture<uint8_t, 1> tex;
        __device__ float* three_plane_intersection(float plane1A, float plane1B,
            float plane1C, float plane1D, float plane2A, float plane2B, float plane2C,
            float plane2D, float plane3A, float plane3B, float plane3C, float plane3D);
        __device__ float intersection_determinant(float matrix[3][3]);
        __device__ float point_distance_to_plane(float *point, float A, float B, float C, float D);
        
        __global__ void normalization_cdrf
        (int dataset_number, int n, int m, int p, float* crystal_pitch_XY, float* crystal_pitch_Z,
            const float distance_between_array_pixel, int number_of_events, const int begin_event_gpu_limitation,
            const int end_event_gpu_limitation, const float* a, const float* a_normal, const float* a_cf, const float* b,
            const float* b_normal, const float* b_cf, const float* c, const float* c_normal, const float* c_cf, const float* d, const float* d_normal,
            const float* d_cf, const short* A, const short* B, const short* C, float* adjust_coef, float* sum_vor,
            char* fov_cut_matrix, float* time_factor, float* plane_centerA1_A,
            float* plane_centerA1_B, float* plane_centerA1_C, float* plane_centerA1_D,
            float* plane_centerB1_A, float* plane_centerB1_B, float* plane_centerB1_C,
            float* plane_centerB1_D, float* plane_centerC1_A, float* plane_centerC1_B,
            float* plane_centerC1_C, float* plane_centerC1_D, float* intersection_points, float* m_values, float* m_values_at,
            float* b_values, float* b_values_at, float* max_D, float* inflex_points_x, float* linear_attenuation_A,
            float* linear_attenuation_B)
        {
            extern __shared__ float adjust_coef_shared[];
            int idt = blockIdx.x * blockDim.x + threadIdx.x;
        
        
            float d2;
            float d2_normal;
            float d2_cf;
            float normal_value;
            float value;
            float value_cf;
            short a_temp;
            short b_temp;
            short c_temp;
            char fov_cut_temp;
            int i_s = threadIdx.x;
            float width;
            float height;
            float distance;           
            float solid_angle;
            float face_1_distance_to_center;
            float face_2_distance_to_center;
            float face_3_distance_to_center;
            float* p1;          
            float* p2;          
            float* p3;          
            float* p4;          
            float* p5;          
            float* p6;            
            float dist_p1;
            float distance_crystal;    
            float distance_at;
            float idrf;      
           
            if (idt >= n * m * p)
            {
                return;
            }
        
            __syncthreads();
            adjust_coef_shared[i_s] = adjust_coef[idt];
            a_temp = A[idt];
            b_temp = B[idt];
            c_temp = C[idt];
            fov_cut_temp = fov_cut_matrix[idt];
        
            for (int e = begin_event_gpu_limitation; e < end_event_gpu_limitation; e++)
            {
                if (fov_cut_temp != 0)
                {
                    normal_value = a_normal[e] * a_temp + b_normal[e] * b_temp + c_normal[e] * c_temp - d_normal[e];
                    d2_normal = crystal_pitch_XY[e] * sqrt(a_normal[e] * a_normal[e] + b_normal[e] * b_normal[e] + c_normal[e] * c_normal[e]);
        
                    if (normal_value < d2_normal && normal_value >= -d2_normal)
                    {
                        value = a[e] * a_temp + b[e] * b_temp + c[e] * c_temp - d[e];
                        d2 = crystal_pitch_Z[e] * sqrt(a[e] * a[e] + b[e] * b[e] + c[e] * c[e]);
        
        
                        if (value < d2 && value >= -d2)
                        {
                            value_cf = a_cf[e] * a_temp + b_cf[e] * b_temp + c_cf[e] * c_temp - d_cf[e];
                            d2_cf = distance_between_array_pixel * sqrt(a_cf[e] * a_cf[e] + b_cf[e] * b_cf[e] + c_cf[e] * c_cf[e]);
        
        
                            if (value_cf >= -d2_cf && value_cf < d2_cf)
                            {
        
                                width = 2 * (crystal_pitch_Z[e] - abs(value));
                                height = 2 * (crystal_pitch_XY[e] - abs(normal_value));
                                distance = d2_cf + abs(value_cf);
                                                              
                               /*
                                face_1_distance_to_center = 1.0f * sqrt(plane_centerA1_A[e] * plane_centerA1_A[e] + plane_centerA1_B[e] * plane_centerA1_B[e] + plane_centerA1_C[e] * plane_centerA1_C[e]);
                                face_2_distance_to_center = 1.0f * sqrt(plane_centerB1_A[e] * plane_centerB1_A[e] + plane_centerB1_B[e] * plane_centerB1_B[e] + plane_centerB1_C[e] * plane_centerB1_C[e]);
                                face_3_distance_to_center = 15.0f * sqrt(plane_centerC1_A[e] * plane_centerC1_A[e] + plane_centerC1_B[e] * plane_centerC1_B[e] + plane_centerC1_C[e] * plane_centerC1_C[e]);
                                
                                p1 = three_plane_intersection(a[e],
                                    b[e], c[e], d[e],
                                    a_normal[e], b_normal[e], c_normal[e],
                                    d_normal[e], plane_centerA1_A[e], plane_centerA1_B[e],
                                    plane_centerA1_C[e], plane_centerA1_D[e]+face_1_distance_to_center);
                                    
                                p2 = three_plane_intersection(a[e],
                                    b[e], c[e], d[e],
                                    a_normal[e], b_normal[e], c_normal[e],
                                    d_normal[e], plane_centerA1_A[e], plane_centerA1_B[e],
                                    plane_centerA1_C[e], plane_centerA1_D[e]-face_1_distance_to_center);                                    
                                               
                                p3 = three_plane_intersection(a[e],
                                    b[e], c[e], d[e],
                                    a_normal[e], b_normal[e], c_normal[e],
                                    d_normal[e], plane_centerB1_A[e], plane_centerB1_B[e],
                                    plane_centerB1_C[e], plane_centerB1_D[e]+face_2_distance_to_center);
                                    
                                p4 = three_plane_intersection(a[e],
                                    b[e], c[e], d[e],
                                    a_normal[e], b_normal[e], c_normal[e],
                                    d_normal[e], plane_centerB1_A[e], plane_centerB1_B[e],
                                    plane_centerB1_C[e], plane_centerB1_D[e]-face_2_distance_to_center);
                                    
                                p5 = three_plane_intersection(a[e],
                                    b[e], c[e], d[e],
                                    a_normal[e], b_normal[e], c_normal[e],
                                    d_normal[e], plane_centerC1_A[e], plane_centerC1_B[e],
                                    plane_centerC1_C[e], plane_centerC1_D[e]+face_3_distance_to_center);
                                    
                                p6 = three_plane_intersection(a[e],
                                    b[e], c[e], d[e],
                                    a_normal[e], b_normal[e], c_normal[e],
                                    d_normal[e], plane_centerC1_A[e], plane_centerC1_B[e],
                                    plane_centerC1_C[e], plane_centerC1_D[e]-face_3_distance_to_center);            
                                    
                                intersection_points[e*6] = point_distance_to_plane(p1, a_cf[e], b_cf[e],c_cf[e], d_cf[e]); 
                                dist_p2 = point_distance_to_plane(p2, a_cf[e], b_cf[e],c_cf[e], d_cf[e]);
                                
                              
                                                     
                                 printf(" Distance p1: %f", intersection_points[e*6]);
                               
                                  dist_p2 = point_distance_to_plane(p2, a_cf[e], b_cf[e],c_cf[e], d_cf[e]);                     
                                dist_p3 = point_distance_to_plane(p3, a_cf[e], b_cf[e],c_cf[e], d_cf[e]);                     
                                dist_p4 = point_distance_to_plane(p4, a_cf[e], b_cf[e],c_cf[e], d_cf[e]);                     
                                dist_p5 = point_distance_to_plane(p5, a_cf[e], b_cf[e],c_cf[e], d_cf[e]);                     
                                dist_p6 = point_distance_to_plane(p6, a_cf[e], b_cf[e],c_cf[e], d_cf[e]);
                                printf(" Distance p1: %f", dist_p1);
                                printf(" x: %f, y: %f, z: %f ",p1[0], p1[1], p1[2]);
                                printf("----------------");
                                printf(" x_2: %f, y_2: %f, z_2: %f ",p2[0], p2[1], p2[2]);
                              */
                                distance_at = m_values_at[e]*value+b_values_at[e];
                                                                   
                                if(value<=inflex_points_x[2*e])
                                {
                                    distance_crystal = m_values[2*e]*value+b_values[2*e];
                                    distance_at = 0;
                                }
                                 else if(value>=inflex_points_x[2*e+1])
                                {
                                    distance_crystal = m_values[2*e+1]*value+b_values[2*e+1];
                                }
                               
                                else 
                                {
                                    distance_crystal = max_D[e];
                                    
                                    
                                }
                                distance_crystal = max_D[e];
                                idrf = (1-exp(-linear_attenuation_A[e]*distance_crystal))*exp(-linear_attenuation_A[e]*distance_at);
                             
                                
                                
                                solid_angle = 4 * width * height / (2 * distance * sqrt(4 * distance * distance + width * width + height * height));
        
        
                                adjust_coef_shared[i_s] +=  idrf/(sum_vor[e]);
                                 
                                /*
                             else if(value<=inflex_points_x[2*e+1])
                                {
                                    distance_crystal = m_values[2*e+1]*value+b_values[2*e+1];
                                }
                                
                                   printf(" x_2: %f,  distance: %f, att: %f ",idrf,distance_crystal, linear_attenuation_A[e] );
                                 if (sqrt(value*value+normal_value*normal_value+value_cf*value_cf)<=sqrt(d2*d2+d2_normal*d2_normal+d2_cf*d2_cf))
                                {
                                   adjust_coef_shared[i_s] += time_factor[e]*(1-(sqrt(value*value+normal_value*normal_value+value_cf*value_cf))/sqrt(d2*d2+d2_normal*d2_normal+d2_cf*d2_cf))/(sum_vor[e]);
                                    }
                                  */
        
        
                            }
        
                            /*
                            (1-(sqrt(value*value+normal_value*normal_value+value_cf*value_cf))/sqrt(d2*d2+d2_normal*d2_normal+d2_cf*d2_cf))/
                            normal_value= a_normal[e]*a_temp+b_normal[e]*b_temp+c_normal[e] * c_temp-d_normal[e];
             d2_normal = error_pixel+crystal_pitch_XY * sqrt(a_normal[e]*a_normal[e]+b_normal[e]*b_normal[e]+c_normal[e]*c_normal[e]);
                           adjust_coef_shared[i_s] += /(sum_vor[e]*time_factor[e]);
                          adjust_coef_shared[i_s] += (1-(sqrt(value*value+normal_value*normal_value+value_cf*value_cf))/sqrt(d2*d2+d2_normal*d2_normal+d2_cf*d2_cf))/(sum_vor[0]*time_factor[e]);
                             */
        
                        }
                    }
        
                }
        
            }
        
            adjust_coef[idt] = adjust_coef_shared[i_s];
            __syncthreads();
        
        }
        __device__ float point_distance_to_plane(float* point, float A, float B, float C, float D)
        {
             float distance;
             distance = abs(A*point[0]+B*point[1]+C*point[2]-D)/sqrt(A*A+B*B+C*C);
             return distance;
        }
        
        __device__  float* three_plane_intersection(float plane1A, float plane1B,
            float plane1C, float plane1D, float plane2A, float plane2B, float plane2C,
            float plane2D, float plane3A, float plane3B, float plane3C, float plane3D)
        {
            float m_det[3][3];
            float m_x[3][3];
            float m_y[3][3];
            float m_z[3][3];
            float det;
            float det_x;
            float det_y;
            float det_z;
            float result[3];
        
            m_det[0][0] = plane1A;
            m_det[0][1] = plane1B;
            m_det[0][2] = plane1C;
            m_det[1][0] = plane2A;
            m_det[1][1] = plane2B;
            m_det[1][2] = plane2C;
            m_det[2][0] = plane3A;
            m_det[2][1] = plane3B;
            m_det[2][2] = plane3C;
        
            m_x[0][0] = plane1D;
            m_x[0][1] = plane1B;
            m_x[0][2] = plane1C;
            m_x[1][0] = plane2D;
            m_x[1][1] = plane2B;
            m_x[1][2] = plane2C;
            m_x[2][0] = plane3D;
            m_x[2][1] = plane3B;
            m_x[2][2] = plane3C;
        
            m_y[0][0] = plane1A;
            m_y[0][1] = plane1D;
            m_y[0][2] = plane1C;
            m_y[1][0] = plane2A;
            m_y[1][1] = plane2D;
            m_y[1][2] = plane2C;
            m_y[2][0] = plane3A;
            m_y[2][1] = plane3D;
            m_y[2][2] = plane3C;
        
            m_z[0][0] = plane1A;
            m_z[0][1] = plane1B;
            m_z[0][2] = plane1D;
            m_z[1][0] = plane2A;
            m_z[1][1] = plane2B;
            m_z[1][2] = plane2D;
            m_z[2][0] = plane3A;
            m_z[2][1] = plane3B;
            m_z[2][2] = plane3D;
        
            det = intersection_determinant(m_det);            
            det_x = intersection_determinant(m_x);
            det_y = intersection_determinant(m_y);
            det_z = intersection_determinant(m_z);
           
           
            if (det != 0.0f)
            {
                result[0] = det_x / det;
                result[1] = det_y / det;
                result[2] = det_z / det;
            }
           
            return result;
        }
        
        __device__ float intersection_determinant(float matrix[3][3])
        {
            float a;
            float b;
            float c;
            float d;
            float e;
            float f;
            float g;
            float h;
            float i;
            float det;
        
            a = matrix[0][0];
            b = matrix[0][1];
            c = matrix[0][2];
            d = matrix[1][0];
            e = matrix[1][1];
            f = matrix[1][2];
            g = matrix[2][0];
            h = matrix[2][1];
            i = matrix[2][2];
        
            det = (a * e * i + b * f * g + c * d * h) - (a * f * h + b * d * i + c * e * g);
            return det;
        }
                                     """)

        mod_backward_projection_shared_mem_cdrf = SourceModule("""
            #include <stdint.h>
            texture<uint8_t, 1> tex;
            
            __global__ void backprojection_cdrf
            (int dataset_number, int n, int m, int p, const float* crystal_pitch_XY, const float* crystal_pitch_Z,
                const float distance_between_array_pixel, int number_of_events, const int begin_event_gpu_limitation,
                const int end_event_gpu_limitation, const float* a, const float* a_normal, const float* a_cf, const float* b,
                const float* b_normal, const float* b_cf, const float* c, const float* c_normal, const float* c_cf, const float* d, const float* d_normal,
                const float* d_cf, const short* A, const short* B, const short* C, float* adjust_coef, float* sum_vor,
                char* fov_cut_matrix, float* time_factor, float* m_values, float* m_values_at,
                float* b_values, float* b_values_at, float* max_D, float* inflex_points_x, float* linear_attenuation_A,
                float* linear_attenuation_B)
                       {
                       extern __shared__ float adjust_coef_shared[];
                       int idt = blockIdx.x * blockDim.x + threadIdx.x;


                       float d2;
                       float d2_normal;
                       float d2_cf;
                       float normal_value;
                       float value;
                       float value_cf;
                       short a_temp;
                       short b_temp;
                       short c_temp;
                       char fov_cut_temp;
                       int i_s = threadIdx.x;
                       float width;
                       float height;
                       float distance;
                       float distance_other;
                       float solid_angle;
                       float distance_crystal;
                       float distance_at;
                       float idrf;

                       if (idt >= n * m * p)
                       {
                           return;
                       }

                       __syncthreads();
                       adjust_coef_shared[i_s] = adjust_coef[idt];
                       a_temp = A[idt];
                       b_temp = B[idt];
                       c_temp = C[idt];
                       fov_cut_temp = fov_cut_matrix[idt];



                       for (int e = begin_event_gpu_limitation; e < end_event_gpu_limitation; e++)
                       {
                           if (fov_cut_temp != 0)
                           {
                               normal_value = a_normal[e] * a_temp + b_normal[e] * b_temp + c_normal[e] * c_temp - d_normal[e];
                               d2_normal = crystal_pitch_XY[e] * sqrt(a_normal[e] * a_normal[e] + b_normal[e] * b_normal[e] + c_normal[e] * c_normal[e]);

                               if (normal_value < d2_normal && normal_value >= -d2_normal)
                               {
                                   value = a[e] * a_temp + b[e] * b_temp + c[e] * c_temp - d[e];
                                   d2 = crystal_pitch_Z[e] * sqrt(a[e] * a[e] + b[e] * b[e] + c[e] * c[e]);


                                   if (value < d2 && value >= -d2)
                                   {
                                       value_cf = a_cf[e] * a_temp + b_cf[e] * b_temp + c_cf[e] * c_temp - d_cf[e];
                                       d2_cf = distance_between_array_pixel * sqrt(a_cf[e] * a_cf[e] + b_cf[e] * b_cf[e] + c_cf[e] * c_cf[e]);


                                       if (value_cf >= -d2_cf && value_cf < d2_cf)
                                       {
                                           if (sqrt(value * value + normal_value * normal_value + value_cf * value_cf) <= sqrt(d2 * d2 + d2_normal * d2_normal + d2_cf * d2_cf))
                                           {
                                               width = 2 * (crystal_pitch_Z[e] - abs(value));
                                               height = 2 * (crystal_pitch_XY[e] - abs(normal_value));
                                               distance = d2_cf + abs(value_cf);
                                               distance_other = d2_cf - abs(value_cf);
                                               distance_at = m_values_at[e] * value + b_values_at[e];

                                               if (value <= inflex_points_x[2 * e])
                                               {
                                                   distance_crystal = m_values[2 * e] * value + b_values[2 * e];
                                                   distance_at = 0;
                                               }
                                               else if (value > inflex_points_x[2 * e + 1])
                                               {
                                                   distance_crystal = m_values[2 * e + 1] * value + b_values[2 * e + 1];
                                               }

                                               else
                                               {
                                                   distance_crystal = max_D[e];


                                               }
                                              
                                               idrf = (1 - exp(-linear_attenuation_A[e] * distance_crystal)) * exp(-linear_attenuation_A[e] * distance_at);

                                               if (idrf < 0)
                                               {
                                                   idrf = 0;
                                               }
                                               solid_angle = 4 * width * height / (2 * distance * sqrt(4 * distance * distance + width * width + height * height));

                                                if (sum_vor[e]!=0)
                                                {
                                               adjust_coef_shared[i_s] += idrf / sum_vor[e];
                                                 }


                                               /*
                                               (4 * asin(sin(tan(width/distance))*sin(tan(height/distance))))*(4 * asin(sin(tan(width/distance))*sin(tan(height/distance))))/sum_vor[e];
                                           adjust_coef_shared[i_s] += time_factor[e]*(1-(sqrt(value*value+normal_value*normal_value+value_cf*value_cf))/sqrt(d2*d2+d2_normal*d2_normal+d2_cf*d2_cf))/(sum_vor[e]);

                                               */
                                           }

                                       }

                                       /*
                                       (1-(sqrt(value*value+normal_value*normal_value+value_cf*value_cf))/sqrt(d2*d2+d2_normal*d2_normal+d2_cf*d2_cf))/
                                       normal_value= a_normal[e]*a_temp+b_normal[e]*b_temp+c_normal[e] * c_temp-d_normal[e];
                        d2_normal = error_pixel+crystal_pitch_XY * sqrt(a_normal[e]*a_normal[e]+b_normal[e]*b_normal[e]+c_normal[e]*c_normal[e]);
                                      adjust_coef_shared[i_s] += /(sum_vor[e]*time_factor[e]);
                                     adjust_coef_shared[i_s] += (1-(sqrt(value*value+normal_value*normal_value+value_cf*value_cf))/sqrt(d2*d2+d2_normal*d2_normal+d2_cf*d2_cf))/(sum_vor[0]*time_factor[e]);
                                        */





                                   }
                               }

                           }

                       }

                       adjust_coef[idt] = adjust_coef_shared[i_s];
                       __syncthreads();
}
                             """)

        # float crystal_pitch, int number_of_events, float *a, float *b, float *c, float *d, int *A, int *B, int *C, float *im, float *vector_matrix
        # Host Code   B, C, im, vector_matrix,
        number_of_datasets = np.int32(1)  # Number of datasets (and concurrent operations) used.
        number_of_datasets_back = np.int32(1)  # Number of datasets (and concurrent operations) used.
        # Start concurrency Test
        # Event as reference point
        ref = cuda.Event()
        ref.record()

        # Create the streams and events needed to calculation
        stream, event = [], []
        marker_names = ['kernel_begin', 'kernel_end']
        # Create List to allocate chunks of data
        A_cut_gpu, B_cut_gpu, C_cut_gpu = [None] * number_of_datasets, [None] * number_of_datasets, [
            None] * number_of_datasets
        A_cut, B_cut, C_cut = [None] * number_of_datasets, [None] * number_of_datasets, [None] * number_of_datasets

        a_gpu, b_gpu, c_gpu, d_gpu = [None] * number_of_datasets, [None] * number_of_datasets, [
            None] * number_of_datasets, [None] * number_of_datasets
        a_normal_gpu, b_normal_gpu, c_normal_gpu, d_normal_gpu = [None] * number_of_datasets, [
            None] * number_of_datasets, [None] * number_of_datasets, [None] * number_of_datasets

        a_cut, b_cut, c_cut, d_cut = [None] * number_of_datasets, [None] * number_of_datasets, [
            None] * number_of_datasets, [None] * number_of_datasets

        a_normal_cut, b_normal_cut, c_normal_cut, d_normal_cut = [None] * number_of_datasets, [
            None] * number_of_datasets, [None] * number_of_datasets, [None] * number_of_datasets

        a_cf_cut, b_cf_cut, c_cf_cut, d_cf_cut = [None] * number_of_datasets, [
            None] * number_of_datasets, [None] * number_of_datasets, [None] * number_of_datasets

        a_cut_gpu, b_cut_gpu, c_cut_gpu, d_cut_gpu = [None] * number_of_datasets, [None] * number_of_datasets, [
            None] * number_of_datasets, [None] * number_of_datasets

        a_cut_normal_gpu, b_cut_normal_gpu, c_cut_normal_gpu, d_cut_normal_gpu = [None] * number_of_datasets, [
            None] * number_of_datasets, [None] * number_of_datasets, [None] * number_of_datasets

        a_cf_cut_gpu, b_cf_cut_gpu, c_cf_cut_gpu, d_cf_cut_gpu = [None] * number_of_datasets, [
            None] * number_of_datasets, [None] * number_of_datasets, [None] * number_of_datasets

        time_factor_cut_gpu = [None] * number_of_datasets
        sum_vor_gpu = [None] * number_of_datasets
        sum_vor_cut = [None] * number_of_datasets

        adjust_coef_cut = [None] * number_of_datasets
        adjust_coef_gpu = [None] * number_of_datasets
        adjust_coef_pinned = [None] * number_of_datasets_back
        fov_cut_matrix_cutted_gpu = [None] * number_of_datasets_back
        fov_cut_matrix_cut = [None] * number_of_datasets
        # fov_cut_matrix_gpu = [None] * number_of_datasets
        sum_vor_pinned = [None] * number_of_datasets

        distance_to_center_plane_cut = [None] * number_of_datasets
        distance_to_center_plane_gpu_cut = [None] * number_of_datasets
        distance_to_center_plane_normal_cut = [None] * number_of_datasets
        distance_to_center_plane_normal_gpu_cut = [None] * number_of_datasets
        plane_centerA1_cut = [None] * len(self.crystal_central_planes.plane_centerA1)
        plane_centerA1_gpu_cut = [None] * len(self.crystal_central_planes.plane_centerA1)
        plane_centerB1_cut = [None] * len(self.crystal_central_planes.plane_centerA1)
        plane_centerB1_gpu_cut = [None] * len(self.crystal_central_planes.plane_centerA1)
        plane_centerC1_cut = [None] * len(self.crystal_central_planes.plane_centerA1)
        plane_centerC1_gpu_cut = [None] * len(self.crystal_central_planes.plane_centerA1)

        plane_centerA1_gpu = [None] * len(self.crystal_central_planes.plane_centerA1)
        plane_centerB1_gpu = [None] * len(self.crystal_central_planes.plane_centerA1)
        plane_centerC1_gpu = [None] * len(self.crystal_central_planes.plane_centerA1)

        m_values_listMode_cut = [None] * number_of_datasets
        m_values_at_listMode_cut = [None] * number_of_datasets
        b_values_listMode_cut = [None] * number_of_datasets
        b_values_at_listMode_cut = [None] * number_of_datasets
        max_D_listMode_cut = [None] * number_of_datasets
        inflex_points_x_listMode_cut = [None] * number_of_datasets
        linear_attenuation_crystal_A_listMode_cut = [None] * number_of_datasets
        linear_attenuation_crystal_B_listMode_cut = [None] * number_of_datasets
        m_values_gpu_cut = [None] * number_of_datasets
        b_values_gpu_cut = [None] * number_of_datasets
        m_values_at_gpu_cut = [None] * number_of_datasets
        b_values_at_gpu_cut = [None] * number_of_datasets
        max_D_gpu_cut = [None] * number_of_datasets
        inflex_points_x_gpu_cut = [None] * number_of_datasets
        linear_attenuation_A_gpu_cut = [None] * number_of_datasets
        linear_attenuation_B_gpu_cut = [None] * number_of_datasets

        intersection_points = np.ascontiguousarray(np.zeros((number_of_events, 6)), dtype=np.float32)
        intersection_points_gpu = cuda.mem_alloc(intersection_points.size
                                                 * intersection_points.dtype.itemsize)
        cuda.memcpy_htod_async(intersection_points_gpu, intersection_points)

        for i in range(len(self.crystal_central_planes.plane_centerA1)):
            plane_centerA1_cut[i] = [None] * number_of_datasets
            plane_centerA1_gpu_cut[i] = [None] * number_of_datasets
            plane_centerB1_cut[i] = [None] * number_of_datasets
            plane_centerB1_gpu_cut[i] = [None] * number_of_datasets
            plane_centerC1_cut[i] = [None] * number_of_datasets
            plane_centerC1_gpu_cut[i] = [None] * number_of_datasets

            plane_centerA1_gpu[i] = cuda.mem_alloc(self.crystal_central_planes.plane_centerA1[i].size
                                                   * self.crystal_central_planes.plane_centerA1[i].dtype.itemsize)
            plane_centerB1_gpu[i] = cuda.mem_alloc(self.crystal_central_planes.plane_centerB1[i].size
                                                   * self.crystal_central_planes.plane_centerB1[i].dtype.itemsize)
            plane_centerC1_gpu[i] = cuda.mem_alloc(self.crystal_central_planes.plane_centerC1[i].size
                                                   * self.crystal_central_planes.plane_centerC1[i].dtype.itemsize)
            cuda.memcpy_htod_async(plane_centerA1_gpu[i], self.crystal_central_planes.plane_centerA1[i])
            cuda.memcpy_htod_async(plane_centerB1_gpu[i], self.crystal_central_planes.plane_centerB1[i])
            cuda.memcpy_htod_async(plane_centerC1_gpu[i], self.crystal_central_planes.plane_centerC1[i])

        # Streams and Events creation
        for dataset in range(number_of_datasets):
            stream.append(cuda.Stream())
            event.append(dict([(marker_names[l], cuda.Event()) for l in range(len(marker_names))]))

        # Foward Projection Memory Allocation
        # Variables that need an unique alocation
        # A_shappened = np.ascontiguousarray(A.reshape(A.shape[0]*A.shape[1]*A.shape[2]), dtype=np.int32)
        # B_shappened = np.ascontiguousarray(B.reshape(B.shape[0]*B.shape[1]*B.shape[2]), dtype=np.int32)
        # C_shappened = np.ascontiguousarray(C.reshape(C.shape[0]*C.shape[1]*C.shape[2]), dtype=np.int32)
        im_shappened = np.ascontiguousarray(im.reshape(im.shape[0] * im.shape[1] * im.shape[2]), dtype=np.float32)
        fov_cut_matrix_shappened = np.ascontiguousarray(
            fov_cut_matrix.reshape(fov_cut_matrix.shape[0] * fov_cut_matrix.shape[1] * fov_cut_matrix.shape[2]),
            dtype=np.byte)

        # forward_projection_arrays_page_locked_memory_allocations = [A_gpu, B_gpu, C_gpu, im_gpu]

        # A_gpu = cuda.mem_alloc(A_shappened.size * A_shappened.dtype.itemsize)
        # B_gpu = cuda.mem_alloc(B_shappened.size * B_shappened.dtype.itemsize)
        # C_gpu = cuda.mem_alloc(C_shappened.size * C_shappened.dtype.itemsize)
        im_gpu = cuda.mem_alloc(im_shappened.size * im_shappened.dtype.itemsize)
        fov_cut_matrix_gpu = cuda.mem_alloc(fov_cut_matrix_shappened.size * fov_cut_matrix_shappened.dtype.itemsize)
        # texref = mod_forward_projection_shared_mem.get_texref('tex')
        # texref.set_address(fov_cut_matrix_gpu, fov_cut_matrix_shappened.size * fov_cut_matrix_shappened.dtype.itemsize)
        # # texref.set_format(cuda.array_format.UNSIGNED_INT8, 1)

        # cuda.memcpy_htod_async(A_gpu, A_shappened)
        # cuda.memcpy_htod_async(B_gpu, B_shappened)
        # cuda.memcpy_htod_async(C_gpu, C_shappened)
        cuda.memcpy_htod_async(im_gpu, im_shappened)
        cuda.memcpy_htod_async(fov_cut_matrix_gpu, fov_cut_matrix_shappened)

        a_gpu = cuda.mem_alloc(a.size * a.dtype.itemsize)
        b_gpu = cuda.mem_alloc(b.size * b.dtype.itemsize)
        c_gpu = cuda.mem_alloc(c.size * c.dtype.itemsize)
        d_gpu = cuda.mem_alloc(d.size * d.dtype.itemsize)
        a_normal_gpu = cuda.mem_alloc(a_normal.size * a_normal.dtype.itemsize)
        b_normal_gpu = cuda.mem_alloc(b_normal.size * b_normal.dtype.itemsize)
        c_normal_gpu = cuda.mem_alloc(c_normal.size * c_normal.dtype.itemsize)
        d_normal_gpu = cuda.mem_alloc(d_normal.size * d_normal.dtype.itemsize)

        a_cf_gpu = cuda.mem_alloc(a_cf.size * a_cf.dtype.itemsize)
        b_cf_gpu = cuda.mem_alloc(b_cf.size * b_cf.dtype.itemsize)
        c_cf_gpu = cuda.mem_alloc(c_cf.size * c_cf.dtype.itemsize)
        d_cf_gpu = cuda.mem_alloc(d_cf.size * d_cf.dtype.itemsize)
        sum_vor_t_gpu = cuda.mem_alloc(sum_vor.size * sum_vor.dtype.itemsize)
        distance_to_center_plane_gpu = cuda.mem_alloc(
            self.distance_to_center_plane.size * self.distance_to_center_plane.dtype.itemsize)
        distance_to_center_plane_normal_gpu = cuda.mem_alloc(
            self.distance_to_center_plane_normal.size * self.distance_to_center_plane_normal.dtype.itemsize)

        time_factor_gpu = cuda.mem_alloc(time_factor.size * time_factor.dtype.itemsize)
        # Transfer memory to Optimizer
        cuda.memcpy_htod_async(a_gpu, a)
        cuda.memcpy_htod_async(b_gpu, b)
        cuda.memcpy_htod_async(c_gpu, c)
        cuda.memcpy_htod_async(d_gpu, d)
        cuda.memcpy_htod_async(a_normal_gpu, a_normal)
        cuda.memcpy_htod_async(b_normal_gpu, b_normal)
        cuda.memcpy_htod_async(c_normal_gpu, c_normal)
        cuda.memcpy_htod_async(d_normal_gpu, d_normal)

        cuda.memcpy_htod_async(a_cf_gpu, a_cf)
        cuda.memcpy_htod_async(b_cf_gpu, b_cf)
        cuda.memcpy_htod_async(c_cf_gpu, c_cf)
        cuda.memcpy_htod_async(d_cf_gpu, d_cf)
        cuda.memcpy_htod_async(time_factor_gpu, time_factor)
        cuda.memcpy_htod_async(distance_to_center_plane_gpu, self.distance_to_center_plane)
        cuda.memcpy_htod_async(distance_to_center_plane_normal_gpu, self.distance_to_center_plane_normal)

        m_values_gpu = cuda.mem_alloc(
            self.doi_mapping.m_values_listMode.size * self.doi_mapping.m_values_listMode.dtype.itemsize)
        b_values_gpu = cuda.mem_alloc(
            self.doi_mapping.b_values_listMode.size * self.doi_mapping.b_values_listMode.dtype.itemsize)

        m_values_at_gpu = cuda.mem_alloc(
            self.doi_mapping.m_values_at_listMode.size * self.doi_mapping.m_values_at_listMode.dtype.itemsize)
        b_values_at_gpu = cuda.mem_alloc(
            self.doi_mapping.b_values_at_listMode.size * self.doi_mapping.b_values_at_listMode.dtype.itemsize)

        max_D_gpu = cuda.mem_alloc(
            self.doi_mapping.max_D_listMode.size * self.doi_mapping.max_D_listMode.dtype.itemsize)

        inflex_points_x_gpu = cuda.mem_alloc(
            self.doi_mapping.inflex_points_x_listMode.size * self.doi_mapping.inflex_points_x_listMode.dtype.itemsize)
        linear_attenuation_A_gpu = cuda.mem_alloc(
            self.doi_mapping.linear_attenuation_crystal_A_listMode.size * self.doi_mapping.linear_attenuation_crystal_A_listMode.dtype.itemsize)
        linear_attenuation_B_gpu = cuda.mem_alloc(
            self.doi_mapping.linear_attenuation_crystal_B_listMode.size * self.doi_mapping.linear_attenuation_crystal_B_listMode.dtype.itemsize)

        cuda.memcpy_htod_async(m_values_gpu, self.doi_mapping.m_values_listMode)
        cuda.memcpy_htod_async(b_values_gpu, self.doi_mapping.b_values_listMode)
        cuda.memcpy_htod_async(m_values_at_gpu, self.doi_mapping.m_values_at_listMode)
        cuda.memcpy_htod_async(b_values_at_gpu, self.doi_mapping.b_values_at_listMode)
        cuda.memcpy_htod_async(max_D_gpu, self.doi_mapping.max_D_listMode)
        cuda.memcpy_htod_async(inflex_points_x_gpu, self.doi_mapping.inflex_points_x_listMode)
        cuda.memcpy_htod_async(linear_attenuation_A_gpu, self.doi_mapping.linear_attenuation_crystal_A_listMode)
        cuda.memcpy_htod_async(linear_attenuation_B_gpu, self.doi_mapping.linear_attenuation_crystal_B_listMode)

        for dataset in range(number_of_datasets):
            # if dataset == number_of_datasets:
            #     begin_dataset = np.int32(dataset * number_of_events / number_of_datasets)
            #     end_dataset = number_of_events
            #     adjust_coef_cut[dataset] = np.ascontiguousarray(
            #         adjust_coef[int(np.floor(im_cut_dim[0] * dataset)):adjust_coef.shape[0],
            #         int(np.floor(im_cut_dim[1] * dataset)):adjust_coef.shape[1],
            #         int(np.floor(im_cut_dim[2] * dataset)):adjust_coef.shape[2]],
            #         dtype=np.float32)
            #     fov_cut_matrix_cut[dataset] = np.ascontiguousarray(
            #         fov_cut_matrix[int(np.floor(im_cut_dim[0] * dataset)):fov_cut_matrix.shape[0],
            #         int(np.floor(im_cut_dim[1] * dataset)):fov_cut_matrix.shape[1],
            #         int(np.floor(im_cut_dim[2] * dataset)):fov_cut_matrix.shape[2]],
            #         dtype=np.float32)
            # else:
            begin_dataset = np.int32(dataset * number_of_events / number_of_datasets)
            end_dataset = np.int32((dataset + 1) * number_of_events / number_of_datasets)

            # Cutting dataset
            # For forward projection the data is cutted by number of events. For backprojection is cutted per image pieces of image
            a_cut[dataset] = a[begin_dataset:end_dataset]
            b_cut[dataset] = b[begin_dataset:end_dataset]
            c_cut[dataset] = c[begin_dataset:end_dataset]
            d_cut[dataset] = d[begin_dataset:end_dataset]
            a_normal_cut[dataset] = a_normal[begin_dataset:end_dataset]
            b_normal_cut[dataset] = b_normal[begin_dataset:end_dataset]
            c_normal_cut[dataset] = c_normal[begin_dataset:end_dataset]
            d_normal_cut[dataset] = d_normal[begin_dataset:end_dataset]

            a_cf_cut[dataset] = a_cf[begin_dataset:end_dataset]
            b_cf_cut[dataset] = b_cf[begin_dataset:end_dataset]
            c_cf_cut[dataset] = c_cf[begin_dataset:end_dataset]
            d_cf_cut[dataset] = d_cf[begin_dataset:end_dataset]
            sum_vor_cut[dataset] = sum_vor[begin_dataset:end_dataset]
            distance_to_center_plane_cut[dataset] = self.distance_to_center_plane[begin_dataset:end_dataset]
            distance_to_center_plane_normal_cut[dataset] = self.distance_to_center_plane_normal[
                                                           begin_dataset:end_dataset]

            # Forward
            a_cut_gpu[dataset] = cuda.mem_alloc(a_cut[dataset].size * a_cut[dataset].dtype.itemsize)
            b_cut_gpu[dataset] = cuda.mem_alloc(b_cut[dataset].size * b_cut[dataset].dtype.itemsize)
            c_cut_gpu[dataset] = cuda.mem_alloc(c_cut[dataset].size * c_cut[dataset].dtype.itemsize)
            d_cut_gpu[dataset] = cuda.mem_alloc(d_cut[dataset].size * d_cut[dataset].dtype.itemsize)
            a_cut_normal_gpu[dataset] = cuda.mem_alloc(
                a_normal_cut[dataset].size * a_normal_cut[dataset].dtype.itemsize)
            b_cut_normal_gpu[dataset] = cuda.mem_alloc(
                b_normal_cut[dataset].size * b_normal_cut[dataset].dtype.itemsize)
            c_cut_normal_gpu[dataset] = cuda.mem_alloc(
                c_normal_cut[dataset].size * c_normal_cut[dataset].dtype.itemsize)
            d_cut_normal_gpu[dataset] = cuda.mem_alloc(
                d_normal_cut[dataset].size * d_normal_cut[dataset].dtype.itemsize)

            a_cf_cut_gpu[dataset] = cuda.mem_alloc(a_cf_cut[dataset].size * a_cf_cut[dataset].dtype.itemsize)
            b_cf_cut_gpu[dataset] = cuda.mem_alloc(b_cf_cut[dataset].size * b_cf_cut[dataset].dtype.itemsize)
            c_cf_cut_gpu[dataset] = cuda.mem_alloc(c_cf_cut[dataset].size * c_cf_cut[dataset].dtype.itemsize)
            d_cf_cut_gpu[dataset] = cuda.mem_alloc(d_cf_cut[dataset].size * d_cf_cut[dataset].dtype.itemsize)

            distance_to_center_plane_gpu_cut[dataset] = cuda.mem_alloc(
                distance_to_center_plane_cut[dataset].size * distance_to_center_plane_cut[dataset].dtype.itemsize)
            distance_to_center_plane_normal_gpu_cut[dataset] = cuda.mem_alloc(
                distance_to_center_plane_normal_cut[dataset].size * distance_to_center_plane_normal_cut[
                    dataset].dtype.itemsize)

            sum_vor_gpu[dataset] = cuda.mem_alloc(sum_vor_cut[dataset].size * sum_vor_cut[dataset].dtype.itemsize)

            sum_vor_pinned[dataset] = cuda.register_host_memory(sum_vor_cut[dataset])
            assert np.all(sum_vor_pinned[dataset] == sum_vor_cut[dataset])

            cuda.memcpy_htod_async(sum_vor_gpu[dataset], sum_vor_pinned[dataset], stream[dataset])
            # sum_vor_gpu[dataset] = np.intp(x.base.get_device_pointer())
            # cuda.memcpy_htod_async(probability_gpu[dataset], probability_cut[dataset])
            cuda.memcpy_htod_async(a_cut_gpu[dataset], a_cut[dataset])
            cuda.memcpy_htod_async(b_cut_gpu[dataset], b_cut[dataset])
            cuda.memcpy_htod_async(c_cut_gpu[dataset], c_cut[dataset])
            cuda.memcpy_htod_async(d_cut_gpu[dataset], d_cut[dataset])
            cuda.memcpy_htod_async(a_cut_normal_gpu[dataset], a_normal_cut[dataset])
            cuda.memcpy_htod_async(b_cut_normal_gpu[dataset], b_normal_cut[dataset])
            cuda.memcpy_htod_async(c_cut_normal_gpu[dataset], c_normal_cut[dataset])
            cuda.memcpy_htod_async(d_cut_normal_gpu[dataset], d_normal_cut[dataset])

            cuda.memcpy_htod_async(a_cf_cut_gpu[dataset], a_cf_cut[dataset])
            cuda.memcpy_htod_async(b_cf_cut_gpu[dataset], b_cf_cut[dataset])
            cuda.memcpy_htod_async(c_cf_cut_gpu[dataset], c_cf_cut[dataset])
            cuda.memcpy_htod_async(d_cf_cut_gpu[dataset], d_cf_cut[dataset])
            cuda.memcpy_htod_async(distance_to_center_plane_gpu_cut[dataset], distance_to_center_plane_cut[dataset])
            cuda.memcpy_htod_async(distance_to_center_plane_normal_gpu_cut[dataset],
                                   distance_to_center_plane_normal_cut[dataset])

            m_values_listMode_cut[dataset] = np.copy(self.doi_mapping.m_values_listMode[begin_dataset:end_dataset])
            m_values_at_listMode_cut[dataset] = np.copy(
                self.doi_mapping.m_values_at_listMode[begin_dataset:end_dataset])
            b_values_listMode_cut[dataset] = np.copy(self.doi_mapping.b_values_listMode[begin_dataset:end_dataset])
            b_values_at_listMode_cut[dataset] = np.copy(
                self.doi_mapping.b_values_at_listMode[begin_dataset:end_dataset])
            max_D_listMode_cut[dataset] = np.copy(self.doi_mapping.max_D_listMode[begin_dataset:end_dataset])
            inflex_points_x_listMode_cut[dataset] = np.copy(
                self.doi_mapping.inflex_points_x_listMode[begin_dataset:end_dataset])
            linear_attenuation_crystal_A_listMode_cut[dataset] = np.copy(
                self.doi_mapping.linear_attenuation_crystal_A_listMode[begin_dataset:end_dataset])
            linear_attenuation_crystal_B_listMode_cut[dataset] = np.copy(
                self.doi_mapping.linear_attenuation_crystal_B_listMode[begin_dataset:end_dataset])

            m_values_gpu_cut[dataset] = cuda.mem_alloc(
                m_values_listMode_cut[dataset].size * m_values_listMode_cut[dataset].dtype.itemsize)

            b_values_gpu_cut[dataset] = cuda.mem_alloc(
                b_values_listMode_cut[dataset].size * b_values_listMode_cut[dataset].dtype.itemsize)

            m_values_at_gpu_cut[dataset] = cuda.mem_alloc(
                m_values_at_listMode_cut[dataset].size * m_values_at_listMode_cut[dataset].dtype.itemsize)

            b_values_at_gpu_cut[dataset] = cuda.mem_alloc(
                b_values_at_listMode_cut[dataset].size * b_values_at_listMode_cut[dataset].dtype.itemsize)

            max_D_gpu_cut[dataset] = cuda.mem_alloc(
                max_D_listMode_cut[dataset].size * max_D_listMode_cut[dataset].dtype.itemsize)

            inflex_points_x_gpu_cut[dataset] = cuda.mem_alloc(
                inflex_points_x_listMode_cut[dataset].size * inflex_points_x_listMode_cut[dataset].dtype.itemsize)

            linear_attenuation_A_gpu_cut[dataset] = cuda.mem_alloc(
                linear_attenuation_crystal_A_listMode_cut[dataset].size * linear_attenuation_crystal_A_listMode_cut[
                    dataset].dtype.itemsize)
            linear_attenuation_B_gpu_cut[dataset] = cuda.mem_alloc(
                linear_attenuation_crystal_B_listMode_cut[dataset].size * linear_attenuation_crystal_B_listMode_cut[
                    dataset].dtype.itemsize)

            cuda.memcpy_htod_async(m_values_gpu_cut[dataset], m_values_listMode_cut[dataset])
            cuda.memcpy_htod_async(b_values_gpu_cut[dataset], b_values_listMode_cut[dataset])
            cuda.memcpy_htod_async(m_values_at_gpu_cut[dataset], m_values_at_listMode_cut[dataset])
            cuda.memcpy_htod_async(b_values_at_gpu_cut[dataset], b_values_at_listMode_cut[dataset])
            cuda.memcpy_htod_async(max_D_gpu_cut[dataset], max_D_listMode_cut[dataset])
            cuda.memcpy_htod_async(inflex_points_x_gpu_cut[dataset], inflex_points_x_listMode_cut[dataset])
            cuda.memcpy_htod_async(linear_attenuation_A_gpu_cut[dataset],
                                   linear_attenuation_crystal_A_listMode_cut[dataset])
            cuda.memcpy_htod_async(linear_attenuation_B_gpu_cut[dataset],
                                   linear_attenuation_crystal_B_listMode_cut[dataset])

        adjust_coef = np.ascontiguousarray(adjust_coef.reshape(
            adjust_coef.shape[0] * adjust_coef.shape[1] * adjust_coef.shape[2]),
            dtype=np.float32)
        # fov_cut_matrix = np.ascontiguousarray(fov_cut_matrix.reshape(
        #     fov_cut_matrix.shape[0] * fov_cut_matrix.shape[1] * fov_cut_matrix.shape[2]),
        #     dtype=np.float32)
        A = np.ascontiguousarray(A.reshape(
            A.shape[0] * A.shape[1] * A.shape[2]),
            dtype=np.short)
        B = np.ascontiguousarray(B.reshape(
            B.shape[0] * B.shape[1] * B.shape[2]),
            dtype=np.short)
        C = np.ascontiguousarray(C.reshape(
            C.shape[0] * C.shape[1] * C.shape[2]),
            dtype=np.short)

        # ---- Divide into datasets variables backprojection
        for dataset in range(number_of_datasets_back):
            voxels_division = adjust_coef.shape[0] // number_of_datasets_back
            adjust_coef_cut[dataset] = np.ascontiguousarray(
                adjust_coef[dataset * voxels_division:(dataset + 1) * voxels_division],
                dtype=np.float32)

            fov_cut_matrix_cut[dataset] = np.ascontiguousarray(
                fov_cut_matrix_shappened[dataset * voxels_division:(dataset + 1) * voxels_division],
                dtype=np.byte)

            A_cut[dataset] = np.ascontiguousarray(
                A[dataset * voxels_division:(dataset + 1) * voxels_division],
                dtype=np.short)

            B_cut[dataset] = np.ascontiguousarray(
                B[dataset * voxels_division:(dataset + 1) * voxels_division],
                dtype=np.short)

            C_cut[dataset] = np.ascontiguousarray(
                C[dataset * voxels_division:(dataset + 1) * voxels_division],
                dtype=np.short)
            # Backprojection
            adjust_coef_gpu[dataset] = cuda.mem_alloc(
                adjust_coef_cut[dataset].size * adjust_coef_cut[dataset].dtype.itemsize)

            adjust_coef_pinned[dataset] = cuda.register_host_memory(adjust_coef_cut[dataset])
            assert np.all(adjust_coef_pinned[dataset] == adjust_coef_cut[dataset])
            cuda.memcpy_htod_async(adjust_coef_gpu[dataset], adjust_coef_pinned[dataset], stream[dataset])

            fov_cut_matrix_cutted_gpu[dataset] = cuda.mem_alloc(
                fov_cut_matrix_cut[dataset].size * fov_cut_matrix_cut[dataset].dtype.itemsize)

            A_cut_gpu[dataset] = cuda.mem_alloc(
                A_cut[dataset].size * A_cut[dataset].dtype.itemsize)

            B_cut_gpu[dataset] = cuda.mem_alloc(
                B_cut[dataset].size * B_cut[dataset].dtype.itemsize)

            C_cut_gpu[dataset] = cuda.mem_alloc(
                C_cut[dataset].size * C_cut[dataset].dtype.itemsize)

            cuda.memcpy_htod_async(adjust_coef_gpu[dataset], adjust_coef_cut[dataset])
            cuda.memcpy_htod_async(fov_cut_matrix_cutted_gpu[dataset], fov_cut_matrix_cut[dataset])
            cuda.memcpy_htod_async(A_cut_gpu[dataset], A_cut[dataset])
            cuda.memcpy_htod_async(B_cut_gpu[dataset], B_cut[dataset])
            cuda.memcpy_htod_async(C_cut_gpu[dataset], C_cut[dataset])

        free, total = cuda.mem_get_info()

        print('%.1f %% of device memory is free.' % ((free / float(total)) * 100))

        # -------------OSEM---------
        it = self.number_of_iterations
        subsets = self.number_of_subsets
        print('Number events for reconstruction: {}'.format(number_of_events))

        im = np.ascontiguousarray(im.reshape(im.shape[0] * im.shape[1] * im.shape[2]), dtype=np.float32)
        for i in range(it):
            print('Iteration number: {}\n----------------'.format(i + 1))
            begin_event = np.int32(0)
            end_event = np.int32(number_of_events / subsets)
            for sb in range(subsets):
                print('Subset number: {}'.format(sb))
                number_of_events_subset = np.int32(end_event - begin_event)
                tic = time.time()
                # Cycle forward Projection
                for dataset in range(number_of_datasets):

                    if dataset == number_of_datasets:
                        begin_dataset = np.int32(dataset * number_of_events_subset / number_of_datasets)
                        end_dataset = number_of_events_subset
                    else:
                        begin_dataset = np.int32(dataset * number_of_events_subset / number_of_datasets)
                        end_dataset = np.int32((dataset + 1) * number_of_events_subset / number_of_datasets)

                    threadsperblock = (256, 1, 1)
                    blockspergrid_x = int(math.ceil(((end_dataset - begin_dataset)) / threadsperblock[0]))
                    blockspergrid_y = int(math.ceil(1 / threadsperblock[1]))
                    blockspergrid_z = int(math.ceil(1 / threadsperblock[2]))
                    blockspergrid = (blockspergrid_x, blockspergrid_y, blockspergrid_z)
                    event[dataset]['kernel_begin'].record(stream[dataset])
                    # depth = np.int32(5)
                    # weight = np.int32(A.shape[2]/2)
                    # height = np.int32(A.shape[2]/2)
                    if self.cdrf:
                        func_forward = mod_forward_projection_shared_mem_cdrf.get_function("forward_projection_cdrf")
                        func_forward(weight, height, depth, start_x, start_y, start_z,
                                     distance_to_center_plane_normal_gpu_cut[dataset],
                                     distance_to_center_plane_gpu_cut[dataset],
                                     half_distance_between_array_pixel,
                                     number_of_events, begin_dataset, end_dataset, a_cut_gpu[dataset],
                                     a_cut_normal_gpu[dataset], a_cf_cut_gpu[dataset],
                                     b_cut_gpu[dataset], b_cut_normal_gpu[dataset], b_cf_cut_gpu[dataset],
                                     c_cut_gpu[dataset], c_cut_normal_gpu[dataset],
                                     c_cf_cut_gpu[dataset],
                                     d_cut_gpu[dataset],
                                     d_cut_normal_gpu[dataset], d_cf_cut_gpu[dataset],
                                     sum_vor_gpu[dataset], fov_cut_matrix_gpu, im_gpu, m_values_gpu_cut[dataset],
                                     b_values_gpu_cut[dataset], m_values_at_gpu_cut[dataset],
                                     b_values_at_gpu_cut[dataset],
                                     max_D_gpu_cut[dataset], inflex_points_x_gpu_cut[dataset],
                                     linear_attenuation_A_gpu_cut[dataset],
                                     linear_attenuation_B_gpu_cut[dataset],
                                     block=threadsperblock,
                                     grid=blockspergrid,
                                     stream=stream[dataset])
                    else:
                        func_forward = mod_forward_projection_shared_mem.get_function("forward_projection")
                        func_forward(weight, height, depth, start_x, start_y, start_z, half_crystal_pitch_xy,
                                     half_crystal_pitch_z,
                                     half_distance_between_array_pixel,
                                     number_of_events, begin_dataset, end_dataset, a_cut_gpu[dataset],
                                     a_cut_normal_gpu[dataset], a_cf_cut_gpu[dataset],
                                     b_cut_gpu[dataset], b_cut_normal_gpu[dataset], b_cf_cut_gpu[dataset],
                                     c_cut_gpu[dataset], c_cut_normal_gpu[dataset],
                                     c_cf_cut_gpu[dataset],
                                     d_cut_gpu[dataset],
                                     d_cut_normal_gpu[dataset], d_cf_cut_gpu[dataset],
                                     sum_vor_gpu[dataset], fov_cut_matrix_gpu, im_gpu,
                                     block=threadsperblock,
                                     grid=blockspergrid,
                                     stream=stream[dataset])

                # Sincronization of streams
                for dataset in range(number_of_datasets):  # Commenting out this line should break concurrency.
                    event[dataset]['kernel_end'].record(stream[dataset])

                # Transfering data from Optimizer
                for dataset in range(number_of_datasets):
                    begin_dataset = np.int32(dataset * number_of_events / number_of_datasets)
                    end_dataset = np.int32((dataset + 1) * number_of_events / number_of_datasets)
                    # cuda.memcpy_dtoh_async(sum_vor[begin_dataset:end_dataset], sum_vor_gpu[dataset])
                    cuda.memcpy_dtoh_async(sum_vor_pinned[dataset], sum_vor_gpu[dataset], stream[dataset])
                    # cuda.cudaStream.Synchronize(stream[dataset])

                    toc = time.time()

                cuda.Context.synchronize()

                print('Time part Forward Projection {} : {}'.format(1, toc - tic))
                # number_of_datasets = np.int32(2)
                teste = np.copy(sum_vor)
                # sum_vor[sum_vor<1]=0
                sum_vor = np.ascontiguousarray(teste, dtype=np.float32)
                # sum_vor=np.ascontiguousarray(np.ones((self.a.shape)), dtype=np.float32)
                print('SUM VOR: {}'.format(np.sum(teste)))
                print('SUM VOR: {}'.format(teste))
                # cuda.memcpy_htod_async(sum_vor_t_gpu, sum_vor)

                cuda.memcpy_htod_async(sum_vor_t_gpu, sum_vor)

                # ------------BACKPROJECTION-----------

                for dataset in range(number_of_datasets_back):
                    dataset = np.int32(dataset)
                    begin_dataset = np.int32(0)
                    end_dataset = np.int32(number_of_events_subset)

                    # begin_dataset = np.int32(0)
                    # end_dataset = np.int32(number_of_events)

                    event[dataset]['kernel_begin'].record(stream[dataset])
                    # weight_cutted, height_cutted, depth_cutted = np.int32(adjust_coef_cut[dataset].shape[0]), np.int32(
                    #     adjust_coef_cut[dataset].shape[1]), np.int32(adjust_coef_cut[dataset].shape[2])
                    weight_cutted, height_cutted, depth_cutted = np.int32(adjust_coef_cut[dataset].shape[0]), np.int32(
                        1), np.int32(1)

                    number_of_voxels_thread = 64
                    threadsperblock = (np.int(number_of_voxels_thread), 1, 1)
                    blockspergrid_x = int(math.ceil(adjust_coef_cut[dataset].shape[0] / threadsperblock[0]))
                    blockspergrid_y = int(math.ceil(1 / threadsperblock[1]))
                    blockspergrid_z = int(math.ceil(1 / threadsperblock[2]))
                    # blockspergrid_y = int(math.ceil(adjust_coef_cut[dataset].shape[1] / threadsperblock[1]))
                    # blockspergrid_z = int(math.ceil(adjust_coef_cut[dataset].shape[2] / threadsperblock[2]))
                    blockspergrid = (blockspergrid_x, blockspergrid_y, blockspergrid_z)
                    shared_memory = threadsperblock[0] * threadsperblock[1] * threadsperblock[2] * 4
                    if self.cdrf:
                        if self.normalization_calculation_flag:
                            func_backward = mod_normalization_shared_mem_cdrf.get_function("normalization_cdrf")
                            func_backward(dataset, weight_cutted, height_cutted, depth_cutted,
                                          distance_to_center_plane_normal_gpu,
                                          distance_to_center_plane_gpu, half_distance_between_array_pixel,
                                          number_of_events, begin_dataset, end_dataset, a_gpu, a_normal_gpu, a_cf_gpu,
                                          b_gpu, b_normal_gpu, b_cf_gpu, c_gpu, c_normal_gpu, c_cf_gpu, d_gpu,
                                          d_normal_gpu, d_cf_gpu, A_cut_gpu[dataset], B_cut_gpu[dataset],
                                          C_cut_gpu[dataset],
                                          adjust_coef_gpu[dataset],
                                          sum_vor_t_gpu, fov_cut_matrix_cutted_gpu[dataset], time_factor_gpu,
                                          plane_centerA1_gpu[0], plane_centerA1_gpu[1], plane_centerA1_gpu[2],
                                          plane_centerA1_gpu[3],
                                          plane_centerB1_gpu[0], plane_centerB1_gpu[1], plane_centerB1_gpu[2],
                                          plane_centerB1_gpu[3],
                                          plane_centerC1_gpu[0], plane_centerC1_gpu[1], plane_centerC1_gpu[2],
                                          plane_centerC1_gpu[3], intersection_points_gpu, m_values_gpu, m_values_at_gpu,
                                          b_values_gpu, b_values_at_gpu,
                                          max_D_gpu, inflex_points_x_gpu, linear_attenuation_A_gpu,
                                          linear_attenuation_B_gpu,
                                          block=threadsperblock,
                                          grid=blockspergrid,
                                          shared=np.int(4 * number_of_voxels_thread),
                                          stream=stream[dataset],
                                          )


                        else:
                            func_backward = mod_backward_projection_shared_mem_cdrf.get_function("backprojection_cdrf")
                            func_backward(dataset, weight_cutted, height_cutted, depth_cutted,
                                          distance_to_center_plane_normal_gpu,
                                          distance_to_center_plane_gpu, half_distance_between_array_pixel,
                                          number_of_events, begin_dataset, end_dataset, a_gpu, a_normal_gpu, a_cf_gpu,
                                          b_gpu, b_normal_gpu, b_cf_gpu, c_gpu, c_normal_gpu, c_cf_gpu, d_gpu,
                                          d_normal_gpu, d_cf_gpu, A_cut_gpu[dataset], B_cut_gpu[dataset],
                                          C_cut_gpu[dataset],
                                          adjust_coef_gpu[dataset],
                                          sum_vor_t_gpu, fov_cut_matrix_cutted_gpu[dataset], time_factor_gpu,
                                          m_values_gpu, m_values_at_gpu,
                                          b_values_gpu, b_values_at_gpu,
                                          max_D_gpu, inflex_points_x_gpu, linear_attenuation_A_gpu,
                                          linear_attenuation_B_gpu,
                                          block=threadsperblock,
                                          grid=blockspergrid,
                                          shared=np.int(4 * number_of_voxels_thread),
                                          stream=stream[dataset],
                                          )

                    else:
                        if self.normalization_calculation_flag:
                            func_backward = mod_normalization_shared_mem.get_function("normalization")


                        else:
                            func_backward = mod_backward_projection_shared_mem.get_function("backprojection")

                        func_backward(dataset, weight_cutted, height_cutted, depth_cutted, half_crystal_pitch_xy,
                                      half_crystal_pitch_z, half_distance_between_array_pixel,
                                      number_of_events, begin_dataset, end_dataset, a_gpu, a_normal_gpu, a_cf_gpu,
                                      b_gpu, b_normal_gpu, b_cf_gpu, c_gpu, c_normal_gpu, c_cf_gpu, d_gpu,
                                      d_normal_gpu, d_cf_gpu, A_cut_gpu[dataset], B_cut_gpu[dataset],
                                      C_cut_gpu[dataset],
                                      adjust_coef_gpu[dataset],
                                      sum_vor_t_gpu, fov_cut_matrix_cutted_gpu[dataset], time_factor_gpu,
                                      block=threadsperblock,
                                      grid=blockspergrid,
                                      shared=np.int(4 * number_of_voxels_thread),
                                      stream=stream[dataset],
                                      )

                for dataset in range(number_of_datasets_back):  # Commenting out this line should break concurrency.
                    event[dataset]['kernel_end'].record(stream[dataset])

                for dataset in range(number_of_datasets_back):
                    cuda.memcpy_dtoh_async(adjust_coef_cut[dataset], adjust_coef_gpu[dataset])
                    adjust_coef[dataset * voxels_division:(dataset + 1) * voxels_division] = adjust_coef_cut[dataset]

                cuda.Context.synchronize()
                print('Time part Backward Projection {} : {}'.format(1, time.time() - toc))

                # Image Normalization
                # if i ==0:
                #     norm_im=np.copy(adjust_coef)
                #     norm_im=norm_im/np.max(norm_im)
                #     norm_im[norm_im == 0] = np.min(norm_im[np.nonzero(norm_im)])
                # normalization_matrix = gaussian_filter(normalization_matrix, 0.5)

                # im_med = np.load("C:\\Users\\pedro.encarnacao\\OneDrive - Universidade de Aveiro\\PhD\\Reconstrução\\NAF+FDG\\Easypet Scan 05 Aug 2019 - 14h 36m 33s\\static_image\\Easypet Scan 05 Aug 2019 - 14h 36m 33s mlem.npy")
                # self.algorithm = "LM-MRP"
                if self.algorithm == "LM-MRP":
                    beta = self.algorithm_options[0]
                    kernel_filter_size = self.algorithm_options[1]
                    im_to_filter = im.reshape(weight, height, depth)
                    im_med = median_filter(im_to_filter, kernel_filter_size)
                    penalized_term = np.copy(im_to_filter)
                    penalized_term[im_med != 0] = 1 + beta * (im_to_filter[im_med != 0] - im_med[im_med != 0]) / im_med[
                        im_med != 0]
                    penalized_term = np.ascontiguousarray(penalized_term.reshape(weight * height * depth),
                                                          dtype=np.float32)
                    # penalized_term = np.ascontiguousarray(penalized_term, dtype=np.float32)

                if self.algorithm == "MAP":
                    beta = 0.5
                    im_map = np.load(
                        "C:\\Users\\pedro.encarnacao\\OneDrive - Universidade de Aveiro\\PhD\\Reconstrução\\NAF+FDG\\Easypet Scan 05 Aug 2019 - 14h 36m 33s\\static_image\\Easypet Scan 05 Aug 2019 - 14h 36m 33s mlem.npy")

                im[normalization_matrix != 0] = im[normalization_matrix != 0] * adjust_coef[
                    normalization_matrix != 0] / (normalization_matrix[normalization_matrix != 0])
                im[normalization_matrix == 0] = 0
                if self.algorithm == "LM-MRP":
                    im[penalized_term != 0] = im[penalized_term != 0] / penalized_term[penalized_term != 0]
                # im = fourier_gaussian(im, sigma=0.2)
                # im = gaussian_filter(im, 0.4)
                print('SUM IMAGE: {}'.format(np.sum(im)))
                im = np.ascontiguousarray(im, dtype=np.float32)
                # im = im * adjust_coef / sensivity_matrix[np.nonzero(sensivity_matrix)]
                cuda.memcpy_htod_async(im_gpu, im)

                # Clearing variables
                sum_vor = np.ascontiguousarray(
                    np.zeros(self.a.shape, dtype=np.float32))

                adjust_coef = np.ascontiguousarray(
                    np.zeros((self.number_of_pixels_x * self.number_of_pixels_y * self.number_of_pixels_z),
                             dtype=np.float32))

                for dataset in range(number_of_datasets):
                    # if dataset == number_of_datasets:
                    #     begin_dataset = np.int32(dataset * number_of_events / number_of_datasets)
                    #     end_dataset = number_of_events
                    #
                    # else:
                    begin_dataset = np.int32(dataset * number_of_events / number_of_datasets)
                    end_dataset = np.int32((dataset + 1) * number_of_events / number_of_datasets)
                    # adjust_coef_cut[dataset] = np.ascontiguousarray(adjust_coef[:, :,
                    #                                                 int(np.floor(im_cut_dim[2] * dataset)):int(
                    #                                                     np.floor(im_cut_dim[2] * (dataset + 1)))],
                    #                                                 dtype=np.float32)

                    sum_vor_cut[dataset] = sum_vor[begin_dataset:end_dataset]
                    # cuda.memcpy_htod_async(sum_vor_gpu[dataset], sum_vor_cut[dataset])
                    sum_vor_pinned[dataset] = cuda.register_host_memory(sum_vor_cut[dataset])
                    assert np.all(sum_vor_pinned[dataset] == sum_vor_cut[dataset])
                    cuda.memcpy_htod_async(sum_vor_gpu[dataset], sum_vor_pinned[dataset], stream[dataset])

                for dataset in range(number_of_datasets_back):
                    adjust_coef_cut[dataset] = adjust_coef[dataset * voxels_division:(dataset + 1) * voxels_division]
                    adjust_coef_pinned[dataset] = cuda.register_host_memory(adjust_coef_cut[dataset])
                    assert np.all(adjust_coef_pinned[dataset] == adjust_coef_cut[dataset])
                    cuda.memcpy_htod_async(adjust_coef_gpu[dataset], adjust_coef_pinned[dataset], stream[dataset])

                if self.saved_image_by_iteration:
                    im = im.reshape(weight, height, depth)
                    self._save_image_by_it(im, i, sb)

                if self.signals_interface is not None:
                    self.signals_interface.trigger_update_label_reconstruction_status.emit(
                        "{}: Iteration {}".format(self.current_info_step, i + 1))
                    self.signals_interface.trigger_progress_reconstruction_partial.emit(
                        int(np.round(100 * (i + 1) * (sb + subsets) / (it * subsets), 0)))

        im = im.reshape(weight, height, depth)
        return im * subsets

    def _save_image_by_it(self, im, normalization=False, it=None, sb=None):
        directory = os.path.dirname(os.path.abspath(__file__))
        if normalization:
            file_name = os.path.join(self.directory, "Normalization".format(it, sb))
        else:
            file_name = os.path.join(self.directory, "EasyPETScan_it{}_sb{}".format(it, sb))
        print(file_name)
        volume = im.astype(np.float32)
        length = 1
        for i in volume.shape:
            length *= i
        # length = volume.shape[0] * volume.shape[2] * volume.shape[1]
        if len(volume.shape) > 1:
            data = np.reshape(volume, [1, length], order='F')
        else:
            data = volume
        shapeIm = volume.shape

        output_file = open(file_name, 'wb')
        arr = array('f', data[0])

        arr.tofile(output_file)
        output_file.close()

        # file_name = directory+"/Acquisitions/SENS_0_5"
        # file_name = directory+"/Acquisitions/sens_0_25"