Removed legacy raytracer folder, moved rt doc into main doc - sphere - GPU-based 3D discrete element method algorithm with optional fluid coupling
git clone git://src.adamsgaard.dk/sphere
---
commit d30540ae62abcfd984b3eda396cf534e72aaad66
parent 5cd210a9136c5c2dfee90711cedf294fc6f0484e
Author: Anders Damsgaard <adc@geo.au.dk>
Date: Tue, 18 Dec 2012 13:58:29 +0100
Removed legacy raytracer folder, moved rt doc into main doc
Diffstat:
R raytracer/doc/IEEEtran.cls -> doc/… | 0
R raytracer/doc/np-performance.pdf -… | 0
R raytracer/doc/np-performance.txt -… | 0
R raytracer/doc/np500.png -> doc/ray… | 0
R raytracer/doc/np50000.png -> doc/r… | 0
R raytracer/doc/px-performance.pdf -… | 0
R raytracer/doc/px-performance.txt -… | 0
A doc/raytracer/raytracer.pdf | 0
A doc/raytracer/raytracer.tex | 264 +++++++++++++++++++++++++++++++
D raytracer/Makefile | 101 -------------------------------
D raytracer/README | 43 ------------------------------
D raytracer/colorbar.h | 22 ----------------------
D raytracer/doc/.DS_Store | 0
D raytracer/doc/.raytracer.tex.swp | 0
D raytracer/doc/bloopers/black.png | 0
D raytracer/doc/bloopers/colorful.png | 0
D raytracer/doc/bloopers/cpu_wrong_c… | 0
D raytracer/doc/bloopers/twisted.png | 0
D raytracer/doc/bloopers/wrong_order… | 0
D raytracer/doc/raytracer.pdf | 0
D raytracer/doc/raytracer.tex | 317 -------------------------------
D raytracer/header.h | 26 --------------------------
D raytracer/main.cpp | 264 -------------------------------
D raytracer/o-ppm.cpp | 30 ------------------------------
D raytracer/o-ppm.h | 7 -------
D raytracer/render_all_outputs_CPU.sh | 27 ---------------------------
D raytracer/render_all_outputs_GPU.sh | 29 -----------------------------
D raytracer/rt-kernel-cpu.cpp | 273 -------------------------------
D raytracer/rt-kernel-cpu.h | 14 --------------
D raytracer/rt-kernel.cu | 454 -------------------------------
D raytracer/rt-kernel.h | 35 -------------------------------
D raytracer/rt_GPU_init_pres.sh | 36 -------------------------------
D raytracer/rt_GPU_pres.sh | 36 -------------------------------
D raytracer/rt_GPU_tall_pres.sh | 36 -------------------------------
34 files changed, 264 insertions(+), 1750 deletions(-)
---
diff --git a/raytracer/doc/IEEEtran.cls b/doc/raytracer/IEEEtran.cls
diff --git a/raytracer/doc/np-performance.pdf b/doc/raytracer/np-performance.pdf
Binary files differ.
diff --git a/raytracer/doc/np-performance.txt b/doc/raytracer/np-performance.txt
diff --git a/raytracer/doc/np500.png b/doc/raytracer/np500.png
Binary files differ.
diff --git a/raytracer/doc/np50000.png b/doc/raytracer/np50000.png
Binary files differ.
diff --git a/raytracer/doc/px-performance.pdf b/doc/raytracer/px-performance.pdf
Binary files differ.
diff --git a/raytracer/doc/px-performance.txt b/doc/raytracer/px-performance.txt
diff --git a/doc/raytracer/raytracer.pdf b/doc/raytracer/raytracer.pdf
Binary files differ.
diff --git a/doc/raytracer/raytracer.tex b/doc/raytracer/raytracer.tex
@@ -0,0 +1,264 @@
+\documentclass[journal]{IEEEtran}
+
+
+\ifCLASSINFOpdf
+ \usepackage[pdftex]{graphicx}
+\else
+ \usepackage[dvips]{graphicx}
+\fi
+
+\usepackage{color}
+
+\usepackage{url}
+%\usepackage{showkeys}
+\usepackage{siunitx}
+
+% correct bad hyphenation here
+\hyphenation{op-tical net-works semi-conduc-tor}
+
+\definecolor{DarkGreen}{rgb}{0,0.45,0.08}
+\usepackage{listings}
+\lstset{language=C++,breaklines=true,breakatwhitespace=true,basicstyle=\small,prebreak = \raisebox{0ex}[0ex][0ex]{\ensuremath{\hookleftarrow}}, commentstyle=\color{DarkGreen}, tabsize=4}
+
+
+\begin{document}
+\title{CUDA raytracing algorithm for visualizing\\ discrete element model output}
+
+\author{Anders Damsgaard Christensen,~\IEEEmembership{20062213}
+\thanks{Contact: {anders.damsgaard@geo.au.dk}}%
+\thanks{Webpage: {http://users-cs.au.dk/adc}}%
+\thanks{Manuscript, last revision: \today.}}
+
+% The paper headers
+\markboth{Data Parallel Computing 2011}{Christensen: CUDA raytracing algorithm for visualizing DEM output}
+
+% make the title area
+\maketitle
+
+
+\begin{abstract}
+%\boldmath
+A raytracing algorithm is constructed using the CUDA API for visualizing output from a CUDA discrete element model, which outputs spatial information on dynamic particle systems.
+The raytracing algorithm is optimized with constant memory and compilation flags, and its performance is measured as a function of the number of particles and the number of pixels. The execution time is compared to equivalent CPU code, and the speedup under a variety of conditions is found to have a mean value of 55.6.
+\end{abstract}
+\begin{IEEEkeywords}
+CUDA, discrete element method, raytracing
+\end{IEEEkeywords}
+
+
+\IEEEpeerreviewmaketitle
+
+
+\section{Introduction}
+\IEEEPARstart{V}{isualizing} systems containing many spheres using traditional object-order graphics rendering can often result in very high computational requirements, as the usual automated approach is to construct a meshed surface with a specified resolution for each sphere. The memory requirements are thus quite high, as each surface consists of many vertices. Raytracing \cite{Whitted:1980} is a viable alternative, where spherical entities are stored as data structures with a center coordinate and a radius. The rendering is performed on the basis of these values, which results in a perfectly smooth sphere surface. To accelerate the rendering, the algorithm is constructed using the CUDA API \cite{Nvidia-1:2010}, where the problem is divided into $n\times m$ threads, corresponding to the desired output image resolution. Each thread iterates through all particles and applies a simple shading model to determine the final RGB values of its pixel.
+
+Previous studies of GPU or CUDA implementations of ray tracing algorithms reported major speedups compared to corresponding CPU applications (e.g. \cite{Horn:2007,Shih:2009,Popov:2007,Luebke:2008}). None of this software was, however, found to be open source and GPL licensed, so a simple raytracer was constructed, customized to render particle data stored in a specific format.
+
+\subsection{Discrete Element Method}
+The input particle data to the raytracer is the output of a custom CUDA-based Discrete Element Method (DEM) application currently in development. The DEM model is used to numerically simulate the response of a drained, soft, granular sediment bed to normal stresses and shearing velocities similar to those in subglacial environments under ice streams \cite{Evans:2006}. In contrast to laboratory experiments on granular material, the discrete element method \cite{Cundall:1979} allows close monitoring of the progressive deformation, where all involved physical parameters of the particles and spatial boundaries are readily available for continuous inspection.
+
+The discrete element method (DEM) is a subtype of molecular dynamics (MD). It discretizes time into sufficiently small timesteps, and treats the granular material as discrete grains interacting through contact forces. Between time steps, the particles are allowed to overlap slightly, and the magnitude of the overlap and the kinematic states of the particles are used to compute the normal and shear components of the contact force. The particles are treated as spherical entities, which simplifies the contact search. The spatial simulation domain is divided using a homogeneous, uniform, cubic grid, which greatly reduces the number of possible contacts that are checked during each timestep. The grid-particle list is sorted using Thrust\footnote{\url{http://code.google.com/p/thrust/}}, and updated each timestep. The new particle positions and kinematic values are found by inserting the resulting force and torque into Newton's second law, and using a Taylor-based second-order integration scheme to calculate new linear and rotational accelerations, velocities and positions.
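+
+A minimal sketch of this update step for a single particle (array names are illustrative, not those of the model source; vector operators as provided by the CUDA SDK's \texttt{cutil\_math.h}):
+\begin{lstlisting}
+// Newton's second law: a = F/m, from the summed
+// contact forces acting on particle i
+float3 acc = force[i] * (1.0f / mass[i]);
+// second-order Taylor expansion for the position
+pos[i] += vel[i]*dt + 0.5f*acc*dt*dt;
+vel[i] += acc*dt;
+\end{lstlisting}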
+
+\subsection{Application usage}
+The CUDA DEM application is a command line executable, and writes updated particle information to custom binary files at a specified interval. The raytracing algorithm is likewise constructed to run from the command line, be non-interactive, and write output images in the PPM image format. This format is chosen to allow rendering to take place on cluster nodes with CUDA compatible devices.
+
+Both the CUDA DEM and raytracing applications are open-source\footnote{\url{http://users-cs.au.dk/adc/files/sphere.tar.gz}}, although still under heavy development.
+
+This document consists of a short introduction to the basic mathematics behind the ray tracing algorithm, an explanation of the implementation using the CUDA API \cite{Nvidia-1:2010}, and a presentation of the results. The CUDA device source code and C++ host source code for the ray tracing algorithm can be found in the appendix, along with instructions for compilation and execution of the application.
+
+
+\section{Ray tracing algorithm}
+The goal of the ray tracing algorithm is to compute the shading of each pixel in the image \cite{Shirley:2009}. This is performed by creating a viewing ray from the eye into the scene, finding the closest intersection with a scene object, and computing the resulting color. The general structure of the program is demonstrated in the following pseudo-code:
+\begin{lstlisting}[basicstyle=\footnotesize]
+for each pixel do
+ compute viewing ray origin and direction
+ iterate through objects and find the closest hit
+ set pixel color to value computed from hit point, light, n
+\end{lstlisting}
+The implemented code does not utilize recursive rays, since the modeled material grains are matte in appearance.
+
+\subsection{Ray generation}
+The rays are in vector form defined as:
+\begin{equation}
+ \mathbf{p}(t) = \mathbf{e} + t(\mathbf{s} - \mathbf{e})
+ \label{eq:raygen}
+\end{equation}
+The perspective can be either \emph{orthographic}, where all viewing rays have the same direction but different starting points, or a \emph{perspective projection}, where the starting point is the same but the directions differ slightly \cite{Shirley:2009}. For the purposes of this application, a perspective projection was chosen, as it results in the most natural looking image. The ray data structures were kept flexible enough to allow an easy implementation of an orthographic perspective, should this be desired at a later point.
+
+The ray origin $\mathbf{e}$ is the position of the eye, and is constant. The direction is unique for each ray, and is computed using:
+\begin{equation}
+ \mathbf{s} - \mathbf{e} = -d \mathbf{w} + u \mathbf{u} + v \mathbf{v}
+ \label{eq:raydirection}
+\end{equation}
+where $\{\mathbf{u},\mathbf{v},\mathbf{w}\}$ are the orthonormal basis vectors of the camera coordinate system, and $d$ is the focal length \cite{Shirley:2009}. The camera coordinates of pixel $(i,j)$ in the image plane, $u$ and $v$, are calculated by:
+\[ u = l + (r-l)(i+0.5)/n \]
+\[ v = b + (t-b)(j+0.5)/m \]
+where $l$, $r$, $t$ and $b$ are the positions of the image borders (left, right, top and bottom) in camera space. The values $n$ and $m$ are the number of pixels in each dimension.
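+
+In code, the viewing-ray direction of a pixel follows directly from the expressions above (a sketch with illustrative names; vector operators as in the CUDA SDK's \texttt{cutil\_math.h}):
+\begin{lstlisting}
+// camera-space coordinates of the pixel center (i,j)
+float u = l + (r - l)*(i + 0.5f)/n;
+float v = b + (t - b)*(j + 0.5f)/m;
+// ray direction: s - e = -d*w + u*u_ + v*v_
+float3 dir = -d*w_ + u*u_ + v*v_;
+\end{lstlisting}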
+
+
+\subsection{Ray-sphere intersection}
+Given a sphere with center $\mathbf{c}$ and radius $R$, an equation can be constructed, where $\mathbf{p}$ denotes all points on the sphere surface:
+\begin{equation}
+ (\mathbf{p} - \mathbf{c}) \cdot (\mathbf{p} - \mathbf{c}) - R^2 = 0
+ \label{eq:sphere}
+\end{equation}
+By substituting the points $\mathbf{p}$ with the ray equation (\ref{eq:raygen}), and rearranging the terms, a quadratic equation emerges:
+\begin{equation}
+ (\mathbf{d}\cdot\mathbf{d})t^2 + 2\mathbf{d}\cdot(\mathbf{e}-\mathbf{c})t
+ + (\mathbf{e}-\mathbf{c})\cdot(\mathbf{e}-\mathbf{c}) - R^2 = 0
+ \label{eq:quad}
+\end{equation}
+The number of ray steps $t$ is the only unknown, so the number of intersections is found by calculating the discriminant:
+\begin{equation}
+    \Delta = (2(\mathbf{d}\cdot(\mathbf{e}-\mathbf{c})))^2 - 4(\mathbf{d}\cdot\mathbf{d}) ( (\mathbf{e}-\mathbf{c})\cdot(\mathbf{e}-\mathbf{c}) - R^2 )
+    \label{eq:Delta}
+\end{equation}
+A negative value denotes no intersection between the sphere and the ray, a value of zero means that the ray touches the sphere at a single point (ignored in this implementation), and a positive value denotes that there are two intersections: one where the ray enters the sphere, and one where it exits. In the code, a conditional branch checks whether the discriminant is positive. If this is the case, the distance to the intersection in ray ``steps'' is calculated using:
+\begin{equation}
+ t = \frac{-\mathbf{d}\cdot(\mathbf{e}-\mathbf{c}) \pm \sqrt{\eta} } { (\mathbf{d}\cdot\mathbf{d})}
+ \label{eq:intersect}
+\end{equation}
+where
+\[ \eta = (\mathbf{d}\cdot(\mathbf{e}-\mathbf{c}))^2 - (\mathbf{d}\cdot\mathbf{d})( (\mathbf{e}-\mathbf{c})\cdot(\mathbf{e}-\mathbf{c}) - R^2) \]
+Only the smaller intersection distance ($t_\text{minus}$) is calculated, since this marks the point where the ray enters the sphere. If this value is smaller than previous intersection distances, the intersection point $\mathbf{p}$ and the surface normal $\mathbf{n}$ at that point are calculated:
+\begin{equation}
+ \mathbf{p} = \mathbf{e} + t_\text{minus} \mathbf{d}
+ \label{eq:intersection}
+\end{equation}
+\begin{equation}
+ \mathbf{n} = 2(\mathbf{p}-\mathbf{c})
+ \label{eq:normal}
+\end{equation}
+The intersection distance in vector steps ($t_\text{minus}$) is saved in order to allow comparison of the distance with later intersections.
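+
+A direct transcription of eqs. (\ref{eq:Delta})--(\ref{eq:intersect}) (a sketch; \texttt{dot()} and \texttt{sqrtf()} as in the CUDA SDK's \texttt{cutil\_math.h} and the CUDA math library, names illustrative):
+\begin{lstlisting}
+float3 ec  = e - c;                // e - c
+float  dd  = dot(d, d);
+float  dec = dot(d, ec);
+// eta has the same sign as Delta (Delta = 4*eta)
+float  eta = dec*dec - dd*(dot(ec, ec) - R*R);
+if (eta > 0.0f) {                  // ray enters and exits
+    float t_minus = (-dec - sqrtf(eta)) / dd;
+    // keep the hit if closer than previous intersections
+}
+\end{lstlisting}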
+
+\subsection{Pixel shading}
+The pixel is shaded using \emph{Lambertian} shading \cite{Shirley:2009}, where the pixel intensity is proportional to the cosine of the angle between the light vector ($\mathbf{l}$) and the surface normal. An ambient shading component is added to simulate global illumination, and to prevent the spheres from being completely black:
+\begin{equation}
+ L = k_a I_a + k_d I_d \max(0,(\mathbf{n}\cdot\mathbf{l}))
+ \label{eq:shading}
+\end{equation}
+where the $a$ and $d$ subscripts denote the ambient and diffusive (Lambertian) components of the coefficients ($k$) and light intensities ($I$). The pixel color $L$ is calculated once per color channel.
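+
+Per color channel, eq. (\ref{eq:shading}) reduces to a single line (a sketch; \texttt{normalize()} and \texttt{dot()} from \texttt{cutil\_math.h}, \texttt{l\_} the unit light vector):
+\begin{lstlisting}
+float3 n_ = normalize(p - c); // unit surface normal
+float  L  = k_a*I_a + k_d*I_d*fmaxf(0.0f, dot(n_, l_));
+\end{lstlisting}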
+
+
+\subsection{Computational implementation}
+The above routines were first implemented in CUDA for device execution, and afterwards ported to an equivalent CPU C++ implementation, used for comparing performance. The CPU raytracing algorithm was parallelized for shared-memory execution using OpenMP \cite{Chapman:2007}. The execution method can be chosen when launching the raytracer from the command line; see the appendix for details. In the CPU implementation, all data was stored in linear arrays of exactly the required size, avoiding memory overhead.
+
+
+\section{CUDA implementation}
+When constructing the algorithm for execution on the GPGPU device, the data-parallel nature of the problem (SIMD: single instruction, multiple data) is used to decompose the rendering task into one thread per pixel. Each thread iterates through all particles, and ends by writing the resulting color to the image memory.
+
+The application starts by reading the discrete element method data from a custom binary file. The particle data, consisting of position vectors in three-dimensional Euclidean space ($\mathbf{R}^3$) and particle radii, is stored together in a \texttt{float4} array, with the particle radius in the \texttt{w} component. This has large advantages over storing the data in separate \texttt{float3} and \texttt{float} arrays; using \texttt{float4} (instead of \texttt{float3}) data allows coalesced memory access \cite{Nvidia-2:2010} to the arrays of data in device memory, resulting in efficient memory requests and transfers \cite{Nyland:2007}, and the data access pattern is coherent and convenient. Other three-component vectors were also stored as \texttt{float4} for the same reasons, even though this sometimes caused a slight memory redundancy. The image data is saved in a three-channel linear \texttt{unsigned char} array. Global memory accesses are coalesced whenever possible, and divergent branches in the kernel code were avoided as much as possible \cite{Nvidia-2:2010}.
+
+\begin{figure}[!t]
+\centering
+\includegraphics[width=0.47\textwidth]{np500.png}
+\caption{Sample output of GPU raytracer rendering of \num{512} particles.}
+\label{fig:np500.png}
+\end{figure}
+
+
+\begin{figure}[!t]
+\centering
+\includegraphics[width=0.47\textwidth]{np50000.png}
+\caption{Sample output of GPU raytracer rendering of \num{50653} particles.}
+\label{fig:np50000.png}
+\end{figure}
+
+The algorithm starts by allocating memory on the device for the particle data, the ray parameters, and the image RGB values. Afterwards, all particle data is transferred from host to device memory.
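+
+The allocation and transfer step follows the usual CUDA host pattern (a sketch with illustrative names):
+\begin{lstlisting}
+float4* p_dev;          // particle positions and radii
+unsigned char* img_dev; // RGB image, 3 bytes per pixel
+cudaMalloc((void**)&p_dev, np*sizeof(float4));
+cudaMalloc((void**)&img_dev, 3*width*height);
+cudaMemcpy(p_dev, p, np*sizeof(float4),
+           cudaMemcpyHostToDevice);
+\end{lstlisting}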
+
+All pixel values are initialized to $[R,G,B] = [0,0,0]$, which serves as the image background color. Afterwards, a kernel is executed with a thread for each pixel, testing for intersections between the pixel's viewing ray and all particles, and returning the closest particle. This information is used when computing the shading of the pixel.
+
+After all pixel values have been computed, the image data is transferred back to host memory and written to disk. The application ends by freeing dynamically allocated memory on both the device and the host.
+
+\subsection{Thread and block layout}
+The thread/block layout passed during kernel launches is arranged in the following manner:
+\begin{lstlisting}
+ dim3 threads(16, 16);
+ dim3 blocks((width+15)/16, (height+15)/16);
+\end{lstlisting}
+The image pixel position of the thread can be determined from the thread and block indices and dimensions. The layout corresponds to a thread tile size of 256 and a dynamic number of blocks, chosen to cover the image dimensions with only minor redundancy \cite{Sanders:2010}. Since this method will initialize extra threads in most situations, all kernels (with return type \texttt{void}) start by checking whether the thread/block index actually falls inside the image dimensions:
+\begin{lstlisting}
+    int i = threadIdx.x + blockIdx.x * blockDim.x;
+    int j = threadIdx.y + blockIdx.y * blockDim.y;
+    // linear index into the image arrays
+    unsigned int mempos = i + j * blockDim.x * gridDim.x;
+    if (mempos >= pixels)
+        return;
+\end{lstlisting}
+The linear memory position (\texttt{mempos}) is used as the index when reading or writing to the linear arrays residing in global device memory.
+
+\subsection{Image output}
+After completing all pixel shading computations on the device, the image data is transferred back to host memory and, together with a header, written to a PPM\footnote{\url{http://paulbourke.net/dataformats/ppm/}} image file. This file is converted to the PNG format using ImageMagick.
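+
+The PPM (P6) header consists of three short ASCII lines, followed by the raw RGB bytes, which keeps the writer trivial (sketch):
+\begin{lstlisting}
+FILE* fout = fopen("out.ppm", "wb");
+fprintf(fout, "P6\n%u %u\n255\n", width, height);
+fwrite(img, sizeof(unsigned char), 3*width*height, fout);
+fclose(fout);
+\end{lstlisting}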
+
+\subsection{Performance}
+Since this simple raytracing algorithm generates a single non-recursive ray for each pixel, which in turn checks all spheres for intersection, the application is expected to scale in the form of $O(n\times m \times N)$, where $n$ and $m$ are the output image dimensions in pixels, and $N$ is the number of particles.
+
+The data transfer between the host and device is kept at a bare minimum, as the intercommunication is considered a bottleneck in relation to the potential device performance \cite{Nvidia-2:2010}. Thread synchronization points are only inserted where necessary, and the code is optimized by the compilers for the target architecture (see appendix).
+
+The host execution time was profiled using a \texttt{clock()} based CPU timer from \texttt{time.h}, which was normalized using the constant \texttt{CLOCKS\_PER\_SEC}.
+
+The device execution time was profiled using two \texttt{cudaEvent\_t} timers: one measuring the time spent in the entire device code section, including device memory allocation, data transfer to and from the host, execution of the kernels, and memory deallocation; the other measuring only the time spent in the kernels. The threads were synchronized before stopping the timers. A simple CPU timer using \texttt{clock()} will \emph{not} work, since control is returned to the host thread before the device code has completed all tasks.
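+
+The kernel timer follows the standard CUDA event pattern (a sketch; \texttt{rayKernel} is an illustrative name):
+\begin{lstlisting}
+cudaEvent_t t_start, t_stop;
+cudaEventCreate(&t_start);
+cudaEventCreate(&t_stop);
+cudaEventRecord(t_start, 0);
+rayKernel<<<blocks, threads>>>(/* ... */);
+cudaEventRecord(t_stop, 0);
+cudaEventSynchronize(t_stop); // wait for device completion
+float ms;
+cudaEventElapsedTime(&ms, t_start, t_stop);
+\end{lstlisting}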
+
+Figures \ref{fig:np-performance.pdf} and \ref{fig:px-performance.pdf} show the profiling results, where the number of particles and the image dimensions were varied. With the exception of executions with small image dimensions, the kernel execution time results agree with the $O(n \times m \times N)$ scaling prediction.
+
+\begin{figure}[!t]
+\centering
+\includegraphics[width=0.47\textwidth]{np-performance.pdf}
+\caption{Performance scaling with varying particle numbers at image dimensions 800 by 800 pixels.}
+\label{fig:np-performance.pdf}
+\end{figure}
+
+\begin{figure}[!t]
+\centering
+\includegraphics[width=0.47\textwidth]{px-performance.pdf}
+\caption{Performance scaling with varying image dimensions ($n \times m$) with 5832 particles.}
+\label{fig:px-performance.pdf}
+\end{figure}
+
+The device memory allocation and data transfer were also profiled, and turn out to be only weakly dependent on the particle numbers (fig. \ref{fig:np-performance.pdf}), but more strongly correlated with the image dimensions (fig. \ref{fig:px-performance.pdf}). As with the kernel execution times, the execution time converges toward a constant overhead value at small image dimensions.
+
+The CPU time spent in the host kernels proves to be linear in the particle numbers, and linear in the image dimensions. This is due to the absence of device-code initialization overhead and the reduced memory transfers.
+
+The ratio between the CPU computation time and the sum of the device kernel execution time, the host--device memory transfer, and the additional memory allocation was calculated, and had a mean value of \num{55.6} and a variance of \num{739} across the 11 comparative measurements presented in the figures. It should be noted that the smallest speedups were recorded when using very small image dimensions, which are probably unrealistic in real use.
+
+As the number of particles is not known at compile time, it is not possible to store particle positions and radii in constant memory. Shared memory was also deliberately avoided, since the memory per thread block (64 KB) would not be sufficient when rendering simulations containing more than \num{16000} particles (\num{16000} \texttt{float4} values). The constant memory was, however, utilized for storing the camera related parameters: the orthonormal basis vectors, the observer position, the image dimensions, the focal length, and the light vector.
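+
+A sketch of the constant memory declarations (illustrative names, not those of the model source):
+\begin{lstlisting}
+__constant__ float4 const_u, const_v, const_w; // bases
+__constant__ float4 const_eye; // observer position
+__constant__ float  const_d;   // focal length
+// host side, before kernel launch:
+cudaMemcpyToSymbol(const_eye, &eye, sizeof(float4));
+\end{lstlisting}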
+
+Previous GPU implementations often rely on k-D trees, constructed as a spatial sorting method for static scene objects \cite{Horn:2007,Popov:2007}. A k-D tree implementation would drastically reduce the global memory access induced by each thread, and is therefore the next logical step with regards to optimizing the ray tracing algorithm presented here.
+
+\section{Conclusion}
+This document presented the implementation of a basic ray tracing algorithm, utilizing the highly data-parallel nature of the problem when porting the work load to CUDA.
+Performance tests showed the expected linear correlation between image dimensions, particle numbers and execution time. Comparisons with an equivalent CPU algorithm showed large speedups, typically up to two orders of magnitude. This speedup did not come at the cost of less correct results.
+
+The final product will come into good use during further development and completion of the CUDA DEM particle model, and is ideal since it can be used for offline rendering on dedicated, heterogeneous GPU-CPU computing nodes. The included device code will be the preferred method of execution whenever the host system allows it.
+
+\bibliographystyle{IEEEtran}
+\bibliography{IEEEabrv,/Users/adc/Documents/University/BIB}
+
+
+
+\appendices
+
+\section{Test environment}
+The raytracing algorithm was developed, tested and profiled on a mid 2010 Mac Pro with a \num{2.8} GHz Quad-Core Intel Xeon CPU and an NVIDIA Quadro 4000 for Mac, dedicated to CUDA applications. The CUDA driver was version 4.0.50, and the CUDA compilation tools were release 4.0, V0.2.1221. The GCC tools were version 4.2.1. Each CPU core runs two hardware threads, for a total of eight threads.
+
+The CUDA source code was compiled with \texttt{nvcc} and linked with the \texttt{g++}-compiled C++ source code using \texttt{g++}. For all benchmark tests, the code was compiled with the following commands:
+\begin{lstlisting}
+g++ -c -Wall -O3 -arch x86_64 -fopenmp ...
+nvcc -c -use_fast_math -gencode arch=compute_20,code=sm_20 -m64 ...
+g++ -arch x86_64 -lcuda -lcudart -fopenmp *.o -o rt
+\end{lstlisting}
+When profiling device code performance, the application was executed two times, and the time of the second run was noted. This was performed to avoid latency caused by device driver initialization.
+
+The host system was measured to have a memory bandwidth of 4642.1 MB/s when transferring data from the host to the device, and 3805.6 MB/s when transferring data from the device to the host.
+
+\ifCLASSOPTIONcaptionsoff
+ \newpage
+\fi
+
+
+
+
+% that's all folks
+\end{document}
+
+
diff --git a/raytracer/Makefile b/raytracer/Makefile
@@ -1,101 +0,0 @@
-NVCC=nvcc
-CC=g++
-LD=g++
-
-CCFLAGS=-c -Wall -O3 -fopenmp
-LDFLAGS=-fopenmp
-
-# Verbose compile?
-#NVCCFLAGS+=--verbose
-
-# Profile code?
-#NVCCFLAGS+=-pg
-
-# Debugable code?
-#CCFLAGS+=-g
-#NVCCFLAGS+=-g -G
-
-CCFILES=main.cpp o-ppm.cpp rt-kernel-cpu.cpp
-CUFILES=rt-kernel.cu
-CCOBJECTS=$(CCFILES:.cpp=.o)
-CUOBJECTS=$(CUFILES:.cu=.o)
-OBJECTS=$(CCOBJECTS) $(CUOBJECTS)
-
-EXECUTABLE=rt
-
-# NVCC flags
-NVCCFLAGS+=-use_fast_math -gencode arch=compute_20,code=sm_20
-
-# Generate device code
-#NVCCFLAGS+=--export-dir=$(EXECUTABLE).devcode
-
-INCLUDES = -I. -I../libs -I/usr/local/cuda/include
-LIBS = -L/usr/local/cuda/lib64 -L/usr/local/cuda/lib -lcudart -lcuda
-#GL_LIBS = -lGLEW
-
-# detect OS
-OSUPPER = $(shell uname -s 2>/dev/null | tr [:lower:] [:upper:])
-# 'linux' is output for Linux system, 'darwin' for OS X
-DARWIN = $(strip $(findstring DARWIN, $(OSUPPER)))
-ifneq ($(DARWIN),)
- #INCLUDES += -I/opt/local/include
- #LIBS += -L/opt/local/lib
- #GL_LIBS += -framework glut -framework OpenGL
- CUDA_SDK=/Developer/GPU\ Computing/C
-else
- INCLUDES += -I/usr/include -I/usr/local/include
- LIBS += -L/usr/lib -L/usr/local/lib
- #GL_LIBS += -L/usr/X11/lib -lglut -lGL -lGLU
- CUDA_SDK=$(HOME)/NVIDIA_GPU_Computing_SDK/C
-endif
-
-# detect OS
-ARCHUPPER = $(shell uname -m 2>/dev/null | tr [:lower:] [:upper:])
-# 'linux' is output for Linux system, 'darwin' for OS X
-ARCH := $(strip $(findstring X86_64, $(ARCHUPPER)))
-ifneq ($(ARCH),)
- LIB_ARCH = x86_64
- NVCCFLAGS += -m64
- ifneq ($(DARWIN),)
- CXX_ARCH_FLAGS += -arch x86_64
- else
- CXX_ARCH_FLAGS += -m64
- endif
-else
- LIB_ARCH = i386
- NVCCFLAGS += -m32
- ifneq ($(DARWIN),)
- CXX_ARCH_FLAGS += -arch i386
- else
- CXX_ARCH_FLAGS += -m32
- endif
-endif
-
-
-INCLUDES += -I$(CUDA_SDK)/common/inc
-#LIBS += -L$(CUDA_SDK)/lib #-lcutil_$(LIB_ARCH)
-LIBS += -L$(CUDA_SDK)/lib -lcutil_$(LIB_ARCH)
-
-$(EXECUTABLE): $(OBJECTS)
- $(LD) $(CXX_ARCH_FLAGS) $(LDFLAGS) $(LIBS) $(OBJECTS) -o $(EXECUTABLE)
-
-main.o: main.cpp
- $(CC) $(CCFLAGS) $(INCLUDES) -c $< -o $@
-
-o-ppm.o: o-ppm.cpp
- $(CC) $(CCFLAGS) $(INCLUDES) -c $< -o $@
-
-rt-kernel-cpu.o: rt-kernel-cpu.cpp
- $(CC) $(CCFLAGS) $(INCLUDES) -c $< -o $@
-
-rt-kernel.o: rt-kernel.cu
- $(NVCC) $(NVCCFLAGS) $(INCLUDES) -c $< -o $@
-
-clean:
- rm -f doc/*.{log,aux,bbl,blg,out,gz}
- rm -f *.o *.ppm *.png
- rm -f $(EXECUTABLE)
-
-edit:
- vim -p Makefile $(CCFILES) $(CUFILES) *.h
-
diff --git a/raytracer/README b/raytracer/README
@@ -1,43 +0,0 @@
-------------------------------------------------------------------
- This is the README file for the SPHERE ray tracing module
- Last revision: May. 18., 2012
-------------------------------------------------------------------
-
-BACKGROUND INFORMATION
-======================
-
-The ray tracing module was initially developed for the Sphere DEM
-software: http://github.com/anders-dc/sphere
-
-This is a port for visualizing simulation output from the
-SPHERE discrete element method software.
-
-It is developed by Anders Damsgaard Christensen, adc@geo.au.dk, and
-is licenced under the GNU General Public Licence (GPL).
-
-Documentation can be found in the file doc/raytracer.pdf
-
-
-REQUIREMENTS
-============
-
-For compiling the raytracer, the following software is required:
- - GNU compiler collection
- - CUDA developer drivers
- - CUDA toolkit
-When executing the compiled code, a CUDA compatible GPU is
-necessary, along with appropriate developer device drivers.
-
-
-USAGE INSTRUCTIONS
-==================
-
-The source code is built using GNU make with the command:
- ./make
-
-A sample run can subsequently be started with:
- ./make run
-
-Temporary files can be removed with:
- ./make clean
-
diff --git a/raytracer/colorbar.h b/raytracer/colorbar.h
@@ -1,22 +0,0 @@
-#ifndef COLORBAR_H_
-#define COLORBAR_H_
-
-// Functions that determine red-, green- and blue color components
-// in a blue-white-red colormap. Ratio should be between 0.0-1.0.
-
-__inline__ __host__ __device__ float red(float ratio)
-{
- return fmin(1.0f, 0.209f*ratio*ratio*ratio - 2.49f*ratio*ratio + 3.0f*ratio + 0.0109f);
-};
-
-__inline__ __host__ __device__ float green(float ratio)
-{
- return fmin(1.0f, -2.44f*ratio*ratio + 2.15f*ratio + 0.369f);
-};
-
-__inline__ __host__ __device__ float blue(float ratio)
-{
- return fmin(1.0f, -2.21f*ratio*ratio + 1.61f*ratio + 0.573f);
-};
-
-#endif
diff --git a/raytracer/doc/.DS_Store b/raytracer/doc/.DS_Store
Binary files differ.
diff --git a/raytracer/doc/.raytracer.tex.swp b/raytracer/doc/.raytracer.tex.swp
Binary files differ.
diff --git a/raytracer/doc/bloopers/black.png b/raytracer/doc/bloopers/black.png
Binary files differ.
diff --git a/raytracer/doc/bloopers/colorful.png b/raytracer/doc/bloopers/colorful.png
Binary files differ.
diff --git a/raytracer/doc/bloopers/cpu_wrong_colorful.png b/raytracer/doc/bloopers/cpu_wrong_colorful.png
Binary files differ.
diff --git a/raytracer/doc/bloopers/twisted.png b/raytracer/doc/bloopers/twisted.png
Binary files differ.
diff --git a/raytracer/doc/bloopers/wrong_order.png b/raytracer/doc/bloopers/wrong_order.png
Binary files differ.
diff --git a/raytracer/doc/raytracer.pdf b/raytracer/doc/raytracer.pdf
Binary files differ.
diff --git a/raytracer/doc/raytracer.tex b/raytracer/doc/raytracer.tex
@@ -1,317 +0,0 @@
-\documentclass[journal]{IEEEtran}
-
-
-\ifCLASSINFOpdf
- \usepackage[pdftex]{graphicx}
-\else
- \usepackage[dvips]{graphicx}
-\fi
-
-\usepackage{color}
-
-\usepackage{url}
-%\usepackage{showkeys}
-\usepackage{siunitx}
-
-% correct bad hyphenation here
-\hyphenation{op-tical net-works semi-conduc-tor}
-
-\definecolor{DarkGreen}{rgb}{0,0.45,0.08}
-\usepackage{listings}
-\lstset{language=C++,breaklines=true,breakatwhitespace=true,basicstyle=\small,prebreak = \raisebox{0ex}[0ex][0ex]{\ensuremath{\hookleftarrow}}, commentstyle=\color{DarkGreen}, tabsize=4}
-
-
-\begin{document}
-\title{CUDA raytracing algorithm for visualizing\\ discrete element model output}
-
-\author{Anders Damsgaard Christensen,~\IEEEmembership{20062213}
-\thanks{Contact: {anders.damsgaard@geo.au.dk}}%
-\thanks{Webpage: {http://users-cs.au.dk/adc}}%
-\thanks{Manuscript, last revision: \today.}}
-
-% The paper headers
-\markboth{Data Parallel Computing 2011}{Christensen: CUDA raytracing algorithm for visualizing DEM output}
-
-% make the title area
-\maketitle
-
-
-\begin{abstract}
-%\boldmath
-A raytracing algorithm is constructed using the CUDA API for visualizing output from a CUDA discrete element model, which outputs spatial information in dynamic particle systems.
-The raytracing algorithm is optimized with constant memory and compilation flags, and performance is measured as a function of the number of particles and the number of pixels. The execution time is compared to equivalent CPU code, and the speedup under a variety of conditions is found to have a mean value of 55.6 times.
-\end{abstract}
-\begin{IEEEkeywords}
-CUDA, discrete element method, raytracing
-\end{IEEEkeywords}
-
-
-\IEEEpeerreviewmaketitle
-
-
-\section{Introduction}
-\IEEEPARstart{V}{isualizing} systems containing many spheres using traditional object-order graphics rendering can often result in very high computational requirements, as the usual automated approach is to construct a meshed surface with a specified resolution for each sphere. The memory requirements are thus quite high, as each surface will consist of many vertices. Raytracing \cite{Whitted:1980} is a viable alternative, where spheric entities are saved as data structures with a centre coordinate and a radius. The rendering is performed on the base of these values, which results in a perfectly smooth surfaced sphere. To accelerate the rendering, the algorithm is constructed utilizing the CUDA API \cite{Nvidia-1:2010}, where the problem is divided into $n\times m$ threads, corresponding to the desired output image resolution. Each thread iterates through all particles and applies a simple shading model to determine the final RGB values of the pixel.
-
-Previous studies of GPU or CUDA implementations of ray tracing algorithms reported major speedups, compared to corresponding CPU applications (e.g. \cite{Horn:2007,Shih:2009,Popov:2007,Luebke:2008}). None of the software was however found to be open-source and GPL licensed, so a simple raytracer was constructed, customized to render particles, where the data was stored in a specific data format.
-
-\subsection{Discrete Element Method}
-The input particle data to the raytracer is the output of a custom CUDA-based Discrete Element Method (DEM) application currently in development. The DEM model is used to numerically simulate the response of a drained, soft, granular sediment bed upon normal stresses and shearing velocities similar to subglacial environments under ice streams \cite{Evans:2006}. In contrast to laboratory experiments on granular material, the discrete element method \cite{Cundall:1979} approach allows close monitoring of the progressive deformation, where all involved physical parameters of the particles and spatial boundaries are readily available for continuous inspection.
-
-The discrete element method (DEM) is a subtype of molecular dynamics (MD), and discretizes time into sufficiently small timesteps, and treats the granular material as discrete grains, interacting through contact forces. Between time steps, the particles are allowed to overlap slightly, and the magnitude of the overlap and the kinematic states of the particles is used to compute normal- and shear components of the contact force. The particles are treated as spherical entities, which simplifies the contact search. The spatial simulation domain is divided using a homogeneous, uniform, cubic grid, which greatly reduces the amount of possible contacts that are checked during each timestep. The grid-particle list is sorted using Thrust\footnote{\url{http://code.google.com/p/thrust/}}, and updated each timestep. The new particle positions and kinematic values are updated by inserting the resulting force and torque into Newton's second law, and using a Taylor-based second order integration scheme to calculate new linear and rotational accelerations, velocities and positions.
-
-\subsection{Application usage}
-The CUDA DEM application is a command line executable, and writes updated particle information to custom binary files with a specific interval. This raytracing algorithm is constructed to also run from the command line, be non-interactive, and write output images in the PPM image format. This format is chosen to allow rendering to take place on cluster nodes with CUDA compatible devices.
-
-Both the CUDA DEM and raytracing applications are open-source\footnote{\url{http://users-cs.au.dk/adc/files/sphere.tar.gz}}, although still under heavy development.
-
-This document consists of a short introduction to the basic mathematics behind the ray tracing algorithm, an explaination of the implementation using the CUDA API \cite{Nvidia-1:2010} and a presentation of the results. The CUDA device source code and C++ host source code for the ray tracing algorithm can be found in the appendix, along with instructions for compilation and execution of the application.
-
-
-\section{Ray tracing algorithm}
-The goal of the ray tracing algorithm is to compute the shading of each pixel in the image \cite{Shirley:2009}. This is performed by creating a viewing ray from the eye into the scene, finding the closest intersection with a scene object, and computing the resulting color. The general structure of the program is demonstrated in the following pseudo-code:
-\begin{lstlisting}[basicstyle=\footnotesize]
-for each pixel do
- compute viewing ray origin and direction
- iterate through objects and find the closest hit
- set pixel color to value computed from hit point, light, n
-\end{lstlisting}
-The implemented code does not utilize recursive rays, since the modeled material grains are matte in appearance.
-
-\subsection{Ray generation}
-The rays are in vector form defined as:
-\begin{equation}
- \mathbf{p}(t) = \mathbf{e} + t(\mathbf{s} - \mathbf{e})
- \label{eq:raygen}
-\end{equation}
-The perspective can be either \emph{orthograpic}, where all viewing rays have the same direction, but different starting points, or use \emph{perspective projection}, where the starting point is the same, but the direction is slightly different \cite{Shirley:2009}. For the purposes of this application, a perspective projection was chosen, as it results in the most natural looking image. The ray data structures were held flexible enough to allow an easy implementation of orthographic perspective, if this is desired at a later point.
-
-The ray origin $\mathbf{e}$ is the position of the eye, and is constant. The direction is unique for each ray, and is computed using:
-\begin{equation}
- \mathbf{s} - \mathbf{e} = -d \mathbf{w} + u \mathbf{u} + v \mathbf{v}
- \label{eq:raydirection}
-\end{equation}
-where $\{\mathbf{u},\mathbf{v},\mathbf{w}\}$ are the orthonormal bases of the camera coordinate system, and $d$ is the focal length \cite{Shirley:2009}. The camera coordinates of pixel ($i,j$) in the image plane, $u$ and $v$, are calculated by:
-\[ u = l + (r-l)(i+0.5)/n \]
-\[ v = b + (t-b)(j+0.5)/m \]
-where $l$, $r$, $t$ and $b$ are the positions of the image borders (left, right, top and bottom) in camera space. The values $n$ and $m$ are the number of pixels in each dimension.
-
-
-\subsection{Ray-sphere intersection}
-Given a sphere with a center $\mathbf{c}$, and radius $R$, a equation can be constrained, where $\mathbf{p}$ are all points placed on the sphere surface:
-\begin{equation}
- (\mathbf{p} - \mathbf{c}) \cdot (\mathbf{p} - \mathbf{c}) - R^2 = 0
- \label{eq:sphere}
-\end{equation}
-By substituting the points $\mathbf{p}$ with ray equation \ref{eq:raygen}, and rearranging the terms, a quadratic equation emerges:
-\begin{equation}
- (\mathbf{d}\cdot\mathbf{d})t^2 + 2\mathbf{d}\cdot(\mathbf{e}-\mathbf{c})t
- + (\mathbf{e}-\mathbf{c})\cdot(\mathbf{e}-\mathbf{c}) - R^2 = 0
- \label{eq:quad}
-\end{equation}
-The number of ray steps $t$ is the only unknown, so the number of intersections is found by calculating the determinant:
-\begin{equation}
- \Delta = (2(\mathbf{d}\cdot(\mathbf{e}-\mathbf{c})))^2 - 4(\mathbf{d}\cdot\mathbf{d}) ( (\mathbf{e}-\mathbf{c})\cdot(\mathbf{e}-\mathbf{c}) - R^2
- \label{eq:Delta}
-\end{equation}
-A negative value denotes no intersection between the sphere and the ray, a value of zero means that the ray touches the sphere at a single point (ignored in this implementation), and a positive value denotes that there are two intersections, one when the ray enters the sphere, and one when it exits. In the code, a conditional branch checks wether the determinant is positive. If this is the case, the distance to the intersection in ray ``steps'' is calculated using:
-\begin{equation}
- t = \frac{-\mathbf{d}\cdot(\mathbf{e}-\mathbf{c}) \pm \sqrt{\eta} } { (\mathbf{d}\cdot\mathbf{d})}
- \label{eq:intersect}
-\end{equation}
-where
-\[ \eta = (\mathbf{d}\cdot(\mathbf{e}-\mathbf{c}))^2 - (\mathbf{d}\cdot\mathbf{d})( (\mathbf{e}-\mathbf{c})\cdot(\mathbf{e}-\mathbf{c}) - R^2) \]
-Only the smallest intersection ($t_\text{minus}$) is calculated, since this marks the point where the sphere enters the particle. If this value is smaller than previous intersection distances, the intersection point $\mathbf{p}$ and surface normal $\mathbf{n}$ at the intersection point is calculated:
-\begin{equation}
- \mathbf{p} = \mathbf{e} + t_\text{minus} \mathbf{d}
- \label{eq:intersection}
-\end{equation}
-\begin{equation}
- \mathbf{n} = 2(\mathbf{p}-\mathbf{c})
- \label{eq:normal}
-\end{equation}
-The intersection distance in vector steps ($t_\text{minus}$) is saved in order to allow comparison of the distance with later intersections.
-
-\subsection{Pixel shading}
-The pixel is shaded using \emph{Lambertian} shading \cite{Shirley:2009}, where the pixel color is proportional to the angle between the light vector ($\mathbf{l}$) and the surface normal. An ambient shading component is added to simulate global illumination, and prevent that the spheres are completely black:
-\begin{equation}
- L = k_a I_a + k_d I_d \max(0,(\mathbf{n}\cdot\mathbf{l}))
- \label{eq:shading}
-\end{equation}
-where the $a$ and $d$ subscripts denote the ambient and diffusive (Lambertian) components of the ambient/diffusive coefficients ($k$) and light intensities ($I$). The pixel color $L$ is calculated once per color channel.
-
-
-\subsection{Computational implementation}
-The above routines were first implemented in CUDA for device execution, and afterwards ported to a CPU C++ equivalent, used for comparing performance. The CPU raytracing algorithm was optimized to shared-memory parallelism using OpenMP \cite{Chapman:2007}. The execution method can be chosen when launching the raytracer from the command line, see the appendix for details. In the CPU implementation, all data was stored in linear arrays of the right size, ensuring 100\% memory efficiency.
-
-
-\section{CUDA implementation}
-When constructing the algorithm for execution on the GPGPU device, the data-parallel nature of the problem (SIMD: single instruction, multiple data) is used to deconstruct the rendering task into a single thread per pixel. Each thread iterates through all particles, and ends up writing the resulting color to the image memory.
-
-The application starts by reading the discrete element method data from a custom binary file. The particle data, consisting of position vectors in three-dimensional Euclidean space ($\mathbf{R}^3$) and particle radii, is stored together in a \texttt{float4} array, with the particle radius in the \texttt{w} position. This has large advantages to storing the data in separate \texttt{float3} and \texttt{float} arrays; Using \texttt{float4} (instead of \texttt{float3}) data allows coalesced memory access \cite{Nvidia-2:2010} to the arrays of data in device memory, resulting in efficient memory requests and transfers \cite{Nyland:2007}, and the data access pattern is coherent and convenient. Other three-component vectors were also stored as \texttt{float4} for the same reasons, even though this sometimes caused a slight memory redundancy. The image data is saved in a three-channel linear \texttt{unsigned char} array. Global memory access are coalesced whenever possible.
-Divergent branches in the kernel code were avoided as much as possible \cite{Nvidia-2:2010}.
-
-\begin{figure}[!t]
-\centering
-\includegraphics[width=0.47\textwidth]{np500.png}
-\caption{Sample output of GPU raytracer rendering of \num{512} particles.}
-\label{fig:np500.png}
-\end{figure}
-
-
-\begin{figure}[!t]
-\centering
-\includegraphics[width=0.47\textwidth]{np50000.png}
-\caption{Sample output of GPU raytracer rendering of \num{50653} particles.}
-\label{fig:np50000.png}
-\end{figure}
-
-The algorithm starts by allocating memory on the device for the particle data, the ray parameters, and the image RGB values. Afterwards, all particle data is transfered from the host- to the device memory.
-
-All pixel values are initialized to $[R,G,B] = [0,0,0]$, which serves as the image background color. Afterwards, a kernel is executed with a thread for each pixel, testing for intersections between the pixel's viewing ray and all particles, and returning the closest particle. This information is used when computing the shading of the pixel.
-
-After all pixel values have been computed, the image data is transfered back to the host memory, and written to the disk. The application ends by liberating dynamically allocated memory on both the device and the host.
-
-\subsection{Thread and block layout}
-The thread/block layout passed during kernel launches is arranged in the following manner:
-\begin{lstlisting}
- dim3 threads(16, 16);
- dim3 blocks((width+15)/16, (height+15)/16);
-\end{lstlisting}
-The image pixel position of the thread can be determined from the thread- and block index and dimensions. The layout corresponds to a thread tile size of 256, and a dynamic number of blocks, ensured to fit the image dimensions with only small eventual redundancy \cite{Sanders:2010}. Since this method will initialize extra threads in most situations, all kernels (with return type \texttt{void}) start by checking wether the thread-/block index actually falls inside of the image dimensions:
-\begin{lstlisting}
- int i = threadIdx.x + blockIdx.x * blockDim.x;
- int j = threadIdx.y + blockIdx.y * blockDim.y;
- unsigned int mempos = x + y * blockDim.x * gridDim.x;
- if (mempos > pixels)
- return;
-\end{lstlisting}
-The linear memory position (\texttt{mempos}) is used as the index when reading or writing to the linear arrays residing in global device memory.
-
-\subsection{Image output}
-After completing all pixel shading computations on the device, the image data is transfered back to the host memory, and together with a header written to a PPM\footnote{\url{http://paulbourke.net/dataformats/ppm/}} image file. This file is converted to the PNG format using ImageMagick.
-
-\subsection{Performance}
-Since this simple raytracing algorithm generates a single non-recursive ray for each pixel, which in turn checks all spheres for intersection, the application is expected to scale in the form of $O(n\times m \times N)$, where $n$ and $m$ are the output image dimensions in pixels, and $N$ is the number of particles.
-
-The data transfer between the host and device is kept at a bare minimum, as the intercommunication is considered a bottleneck in relation to the potential device performance \cite{Nvidia-2:2010}. Thread synchronization points are only inserted were necessary, and the code is optimized by the compilers to the target architecture (see appendix).
-
-The host execution time was profiled using a \texttt{clock()} based CPU timer from \texttt{time.h}, which was normalized using the constant \texttt{CLOCKS\_PER\_SEC}.
-
-The device execution time was profiled using two \texttt{cudaEvent\_t} timers, one measuring the time spent in the entire device code section, including device memory allocation, data transfer to- and from the host, execution of the kernels, and memory deallocation. The other timer only measured time spent in the kernels. The threads were synchronized before stopping the timers. A simple CPU timer using \texttt{clock()} will \emph{not} work, since control is returned to the host thread before the device code has completed all tasks.
-
-Figures \ref{fig:np-performance.pdf} and \ref{fig:px-performance.pdf} show the profiling results, where the number of particles and the image dimensions were varied. With exception of executions with small image dimensions, the kernel execution time results agree with the $O(n \times m \times N)$ scaling predicion.
-
-\begin{figure}[!t]
-\centering
-\includegraphics[width=0.47\textwidth]{np-performance.pdf}
-\caption{Performance scaling with varying particle numbers at image dimensions 800 by 800 pixels.}
-\label{fig:np-performance.pdf}
-\end{figure}
-
-\begin{figure}[!t]
-\centering
-\includegraphics[width=0.47\textwidth]{px-performance.pdf}
-\caption{Performance scaling with varying image dimensions ($n \times m$) with 5832 particles.}
-\label{fig:px-performance.pdf}
-\end{figure}
-
-The device memory allocation and data transfer was also profiled, and turns out to be only weakly dependant on the particle numbers (fig. \ref{fig:np-performance.pdf}), but more strongly correlated to image dimensions (fig. \ref{fig:px-performance.pdf}). As with kernel execution times, the execution time converges against an overhead value at small image dimensions.
-
-The CPU time spent in the host kernels proves to be linear with the particle numbers, and linear with the image dimensions. This is due to the non-existant overhead caused by initialization of the device code, and reduced memory transfer.
-
-The ratio between CPU computational times and the sum of the device kernel execution time and the host---device memory transfer and additional memory allocation was calculated, and had a mean value of \num{55.6} and a variance of \num{739} out of the 11 comparative measurements presented in the figures. It should be noted, that the smallest speedups were recorded when using very small image dimensions, probably unrealistic in real use.
-
-As the number of particles are not known by compilation, it is not possible to store particle positions and -radii in constant memory. Shared memory was also on purpose avoided, since the memory per thread block (64 kb) would not be sufficient in rendering of simulations containing containing more than \num{16000} particles (\num{16000} \texttt{float4} values). The constant memory was however utilized for storing the camera related parameters; the orthonormal base vectors, the observer position, the image dimensions, the focal length, and the light vector.
-
-Previous GPU implementations often rely on k-D trees, constructed as an sorting method for static scene objects \cite{Horn:2007,Popov:2007}. A k-D tree implementation would drastically reduce the global memory access induced by each thread, so it is therefore the next logical step with regards to optimizing the ray tracing algorithm presented here.
-
-\section{Conclusion}
-This document presented the implementation of a basic ray tracing algorithm, utilizing the highly data-parallel nature of the problem when porting the work load to CUDA.
-Performance tests showed the expected, linear correlation between image dimensions, particle numbers and execution time. Comparisons with an equivalent CPU algorithm showed large speedups, typically up to two orders of magnitude. This speedup did not come at a cost of less correct results.
-
-The final product will come into good use during further development and completion of the CUDA DEM particle model, and is ideal since it can be used for offline rendering on dedicated, heterogeneous GPU-CPU computing nodes. The included device code will be the prefered method of execution, whenever the host system allows it.
-
-\bibliographystyle{IEEEtran}
-\bibliography{IEEEabrv,/Users/adc/Documents/University/BIB}
-
-
-
-\appendices
-
-\section{Test environment}
-The raytracing algorithm was developed, tested and profiled on a mid 2010 Mac Pro with a \num{2.8} Ghz Quad-Core Intel Xeon CPU and a NVIDIA Quadro 4000 for Mac, dedicated to CUDA applications. The CUDA driver was version 4.0.50, the CUDA compilation tools release 4.0, V0.2.1221. The GCC tools were version 4.2.1. Each CPU core is multithreaded by two threads for a total of 8 threads.
-
-The CUDA source code was compiled with \texttt{nvcc}, and linked to \texttt{g++} compiled C++ source code with \texttt{g++}. For all benchmark tests, the code was compiled with the following commands:
-\begin{lstlisting}
-g++ -c -Wall -O3 -arch x86_64 -fopenmp ...
-nvcc -c -use_fast_math -gencode arch=compute_20,code=sm_20 -m64 ...
-g++ -arch x86_64 -lcuda -lcudart -fopenmp *.o -o rt
-\end{lstlisting}
-When profiling device code performance, the application was executed two times, and the time of the second run was noted. This was performed to avoid latency caused by device driver initialization.
-
-The host system was measured to have a memory bandwidth of 4642.1 MB/s when transfering data from the host to the device, and 3805.6 MB/s when transfering data from the device to the host.
-
-\ifCLASSOPTIONcaptionsoff
- \newpage
-\fi
-
-
-\section{Source code}
-The entire source code, as well as input data files, can be found in the following archive
-\url{http://users-cs.au.dk/adc/files/sphere-rt.tar.gz}
-The source code is built and run with the commands:
-\begin{lstlisting}
- make
- make run
-\end{lstlisting}
-With the \texttt{make run} command, the Makefile uses ImageMagick to convert the PPM file to PNG format, and the OS X command \texttt{open} to display the image. Other input data files are included with other particle number magnitudes. The syntax for the raytracer application is the following:
-\begin{lstlisting}
- ./rt <CPU | GPU> <sphere-binary.bin> <width> <height> <output-image.ppm>
-\end{lstlisting}
-This appendix contains the following source code files:
-\lstlistoflistings
-
-\subsection{CUDA raytracing source code}
-%\subsubsection{rt\_kernel.h}
-\lstinputlisting[caption={rt-kernel.h},label={rt-kernel.h},numbers=left,breaklines=true,xleftmargin=15pt,basicstyle=\scriptsize]{../rt-kernel.h}
-
-%\subsubsection{rt\_kernel.cu}
-\lstinputlisting[caption={rt-kernel.cu},label={rt-kernel.cu},numbers=left,breaklines=true,xleftmargin=15pt,basicstyle=\scriptsize]{../rt-kernel.cu}
-
-\subsection{CPU raytracing source code}
-%\subsubsection{rt\_kernel\_cpu.h}
-\lstinputlisting[caption={rt-kernel-cpu.h},label={rt-kernel-cpu.h},numbers=left,breaklines=true,xleftmargin=15pt,basicstyle=\scriptsize]{../rt-kernel-cpu.h}
-
-%\subsubsection{rt\_kernel\_cpu.cpp}
-\lstinputlisting[caption={rt-kernel-cpu.cpp},label={rt-kernel-cpu.cpp},numbers=left,breaklines=true,xleftmargin=15pt,basicstyle=\scriptsize]{../rt-kernel-cpu.cpp}
-
-
-
-\clearpage
-
-
-
-
-
-
-
-
-
-% You can push biographies down or up by placing
-% a \vfill before or after them. The appropriate
-% use of \vfill depends on what kind of text is
-% on the last page and whether or not the columns
-% are being equalized.
-
-%\vfill
-
-% Can be used to pull up biographies so that the bottom of the last one
-% is flush with the other column.
-%\enlargethispage{-5in}
-
-
-
-% that's all folks
-\end{document}
-
-
diff --git a/raytracer/header.h b/raytracer/header.h
@@ -1,26 +0,0 @@
-// header.h -- Structure templates and function prototypes
-
-// Standard technique for avoiding multiple inclusions of header file
-#ifndef HEADER_H_
-#define HEADER_H_
-
-// Type declaration
-typedef unsigned int Inttype;
-
-//// Structure declarations ////
-
-struct f3 {
- float x;
- float y;
- float z;
-};
-
-// Image structure
-struct rgb {
- unsigned char r;
- unsigned char g;
- unsigned char b;
- unsigned char alpha;
-};
-
-#endif
diff --git a/raytracer/main.cpp b/raytracer/main.cpp
@@ -1,264 +0,0 @@
-#include <iostream>
-#include <cstdio>
-#include <cstdlib>
-#include <cstring>
-#include <cmath>
-#include <vector_functions.h>
-#include "header.h"
-#include "rt-kernel.h"
-#include "rt-kernel-cpu.h"
-#include "colorbar.h"
-#include "o-ppm.h"
-
-int main(const int argc, const char* argv[])
-{
- using std::cout;
-
- if (argc < 6 || argc > 8 ){
- cout << "Usage: " << argv[0] << " <GPU or CPU> <particle-data.txt> <width> <height> <output-image.ppm>\n"
- << "or\n"
- << "Usage: " << argv[0] << " GPU <particle-data.txt | colorbar> <width> <height> <output-image.ppm> [pressure,es_dot,es,vel max_val]";
- return 1;
- }
-
- cout << "\n----------------------------\n"
- << "This is the SPHERE raytracer\n"
- << "----------------------------\n";
-
- // Allocate memory for image
- unsigned int width = atoi(argv[3]);
- unsigned int height = atoi(argv[4]);
- if (width < 1 || height < 1) {
- cout << "Image dimensions invalid.\n";
- return 1;
- }
- rgb* img;
- img = new rgb [height*width];
-
- // Render colorbar image
- if (strcmp(argv[2],"colorbar") == 0) {
-
- for (unsigned int y=0; y<height; ++y) {
- for (unsigned int x=0; x<width; ++x) {
-
- // Colormap value is relative to width position
- float ratio = (float) (x+1)/width;
-
- // Write pixel value to image array
- img[x + y*width].r = red(ratio) * 250.f;
- img[x + y*width].g = green(ratio) * 250.f;
- img[x + y*width].b = blue(ratio) * 250.f;
-
- }
- }
- } else { // Render sphere binary
-
- cout << "Reading binary: " << argv[2] << "\n";
-
- FILE* fin;
- if ((fin = fopen(argv[2], "rb")) == NULL) {
- cout << " Error encountered while trying to open binary '"
- << argv[2] << "'\n";
- return 1;
- }
-
- // Read the number of dimensions and particles
- unsigned int nd, np;
- (void)fread(&nd, sizeof(nd), 1, fin);
- if (nd != 3) {
- cout << " Input binary contains " << nd
- << "D data, this raytracer is made for 3D data\n";
- return 1;
- }
-
- (void)fread(&np, sizeof(np), 1, fin);
- cout << " Number of particles: " << np << "\n";
-
- // Allocate particle structure array
- cout << " Allocating host memory\n";
-
- // Single precision arrays used for computations
- float4* p = new float4[np];
- float* fixvel = new float[np];
- float* xsum = new float[np];
- float* es_dot = new float[np];
- float* ev_dot = new float[np];
- float* es = new float[np];
- float* ev = new float[np];
- float* pres = new float[np];
- float* vel = new float[np];
-
- // Read temporal information
- double dt, file_dt, current, total;
- unsigned int step_count;
- (void)fread(&dt, sizeof(dt), 1, fin);
- (void)fread(&current, sizeof(current), 1, fin);
- (void)fread(&total, sizeof(total), 1, fin);
- (void)fread(&file_dt, sizeof(file_dt), 1, fin);
- (void)fread(&step_count, sizeof(step_count), 1, fin);
-
- double d; // Double precision temporary value holder
-
- // Canonical coordinate system origo
- f3 origo;
- (void)fread(&d, sizeof(d), 1, fin);
- origo.x = (float)d;
- (void)fread(&d, sizeof(d), 1, fin);
- origo.y = (float)d;
- (void)fread(&d, sizeof(d), 1, fin);
- origo.z = (float)d;
-
- // Canonical coordinate system dimensions
- f3 L;
- (void)fread(&d, sizeof(d), 1, fin);
- L.x = (float)d;
- (void)fread(&d, sizeof(d), 1, fin);
- L.y = (float)d;
- (void)fread(&d, sizeof(d), 1, fin);
- L.z = (float)d;
-
-
- // Skip over irrelevant data
- double blankd;
- //f3 blankd3;
- unsigned int blankui;
- for (int j=0; j<3; j++) // Skip over grid.num data
- (void)fread(&blankui, sizeof(blankui), 1, fin);
-
- // Velocity vector, later converted to length
- float3 v;
-
- // Load data into particle array
- for (unsigned int i=0; i<np; i++) {
- (void)fread(&d, sizeof(d), 1, fin);
- p[i].x = (float)d; // Typecast to single precision
- (void)fread(&d, sizeof(d), 1, fin);
- v.x = (float)d;
- for (int j=0; j<4; j++)
- (void)fread(&blankd, sizeof(blankd), 1, fin);
-
- (void)fread(&d, sizeof(d), 1, fin);
- p[i].y = (float)d;
- (void)fread(&d, sizeof(d), 1, fin);
- v.y = (float)d;
- for (int j=0; j<4; j++)
- (void)fread(&blankd, sizeof(blankd), 1, fin);
-
- (void)fread(&d, sizeof(d), 1, fin);
- p[i].z = (float)d;
- (void)fread(&d, sizeof(d), 1, fin);
- v.z = (float)d;
- for (int j=0; j<4; j++)
- (void)fread(&blankd, sizeof(blankd), 1, fin);
-
- // Save velocity vector length
- vel[i] = sqrt(v.x*v.x + v.y*v.y + v.z*v.z);
- }
- for (unsigned int i=0; i<np; i++) {
- (void)fread(&d, sizeof(d), 1, fin); // fixvel
- fixvel[i] = (float)d;
- (void)fread(&d, sizeof(d), 1, fin); // xsum
- xsum[i] = (float)d;
- (void)fread(&d, sizeof(d), 1, fin); // radius
- p[i].w = (float)d;
- for (int j=0; j<10; j++)
- (void)fread(&blankd, sizeof(blankd), 1, fin);
- (void)fread(&d, sizeof(d), 1, fin);
- es_dot[i] = (float)d;
- (void)fread(&d, sizeof(d), 1, fin);
- ev_dot[i] = (float)d;
- (void)fread(&d, sizeof(d), 1, fin);
- es[i] = (float)d;
- (void)fread(&d, sizeof(d), 1, fin);
- ev[i] = (float)d;
- (void)fread(&d, sizeof(d), 1, fin);
- pres[i] = (float)d;
- }
-
- fclose(fin); // Binary read complete
-
- cout << " Spatial dimensions: "
- << L.x << "*" << L.y << "*" << L.z << " m\n";
-
- // Eye position and focus point
- //f3 eye = {0.5f*L.x, -4.2f*L.y, 0.5f*L.z};
- //f3 eye = {2.5f*L.x, -4.2f*L.y, 2.0f*L.z};
- //f3 eye = {0.5f*L.x, -5.0f*L.y, 0.5f*L.z};
- f3 eye = {2.5f*L.x, -5.0f*L.y, 1.5f*L.z};
- f3 lookat = {0.5f*L.x, 0.5f*L.y, 0.5f*L.z};
- if (L.z > (L.x + L.y)/1.5f) { // Render only bottom of world (for init setups)
- eye.x = 4.1f*L.x;
- eye.y = -15.1f*L.y;
- eye.z = 1.1f*L.z;
- /*lookat.x = 0.45f*L.x;
- lookat.y = 0.5f*L.y;
- lookat.z = 0.30f*L.z;*/
- }
-
- // Screen width in world coordinates
- //float imgw = 1.4f*L.x;
- //float imgw = pow(L.x*L.y*L.z, 0.32f);
- //float imgw = sqrt(L.x*L.y)*1.2f; // Adjust last float to capture entire height
- float imgw = L.x*1.35f;
-
- // Determine visualization mode
- int visualize = 0; // 0: ordinary view
- float max_val = 0;
- if (argc == 8) {
- if(strcmp(argv[6],"pressure") == 0)
- visualize = 1; // 1: pressure view
- if(strcmp(argv[6],"es_dot") == 0)
- visualize = 2; // 2: es_dot view
- if(strcmp(argv[6],"es") == 0)
- visualize = 3; // 3: es view
- if(strcmp(argv[6],"vel") == 0)
- visualize = 4; // 4: velocity view
- if(strcmp(argv[6],"xsum") == 0)
- visualize = 5; // 5: xsum view
-
- // Read max. value specified in command args.
- max_val = atof(argv[7]);
-
- }
-
-
- if (strcmp(argv[1],"GPU") == 0) {
-
- // Call cuda wrapper
- if (rt(p, np, img, width, height, origo, L, eye, lookat, imgw, visualize, max_val, fixvel, xsum, pres, es_dot, es, vel) != 0) {
- cout << "\nError in rt()\n";
- return 1;
- }
- } else if (strcmp(argv[1],"CPU") == 0) {
-
- // Call CPU wrapper
- if (rt_cpu(p, np, img, width, height, origo, L, eye, lookat, imgw) != 0) {
- cout << "\nError in rt_cpu()\n";
- return 1;
- }
- } else {
- cout << "Please specify CPU or GPU for execution\n";
- return 1;
- }
-
- // Free dynamically allocated memory
- delete [] p;
- delete [] fixvel;
- delete [] xsum;
- delete [] pres;
- delete [] es;
- delete [] ev;
- delete [] es_dot;
- delete [] ev_dot;
- delete [] vel;
-
- }
-
- // Write final image to PPM file
- image_to_ppm(img, argv[5], width, height);
-
- delete [] img;
-
- cout << "Terminating successfully\n\n";
- return 0;
-}
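The reading loop in main() repeats the same read-a-double-then-downcast idiom for every field. A condensed sketch of that idiom with a hypothetical helper, read_float_from_double(), which also adds the short-read check that the (void)fread() casts discard:

    #include <cstdio>
    #include <cstdlib>

    // Hypothetical helper: read one double from the binary and downcast
    // to the single precision used for rendering, aborting on short reads.
    static float read_float_from_double(FILE* fin)
    {
        double d;
        if (fread(&d, sizeof(d), 1, fin) != 1) {
            std::perror("fread");
            std::exit(EXIT_FAILURE);
        }
        return (float)d;
    }

    // Usage, mirroring the position/velocity block above:
    //   p[i].x = read_float_from_double(fin);
    //   v.x = read_float_from_double(fin);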
(DIR) diff --git a/raytracer/o-ppm.cpp b/raytracer/o-ppm.cpp
t@@ -1,30 +0,0 @@
-// o-ppm.cpp
-// Write images in the PPM format
-
-#include <iostream>
-#include <cstdio>
-#include "header.h"
-
-
-// Write image structure to PPM file
-int image_to_ppm(rgb* rgb_img, const char* filename, const unsigned int width, const unsigned int height)
-{
- FILE *fout = fopen(filename, "wb");
- if (fout == NULL) {
- std::cout << "File error in img_write() while trying to writing " << filename;
- return 1;
- }
-
- fprintf(fout, "P6 %d %d 255\n", width, height);
-
- // Write pixel array to ppm image file
- for (unsigned int i=0; i<height*width; ++i) {
- putc(rgb_img[i].r, fout);
- putc(rgb_img[i].g, fout);
- putc(rgb_img[i].b, fout);
- }
-
- fclose(fout);
- return 0;
-}
-
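For reference, P6 is the binary PPM variant: an ASCII header (magic number, width, height, maximum channel value) followed by raw RGB triplets in row-major order, which is exactly what the loop above emits. A minimal usage sketch, assuming header.h and o-ppm.h from this commit:

    #include "header.h" // rgb struct
    #include "o-ppm.h"  // image_to_ppm() prototype

    int main()
    {
        // Write a 256x1 grayscale ramp; a quick sanity check that a
        // viewer interprets the P6 header correctly.
        rgb img[256];
        for (unsigned int x = 0; x < 256; ++x)
            img[x].r = img[x].g = img[x].b = (unsigned char)x;
        return image_to_ppm(img, "ramp.ppm", 256, 1);
    }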
(DIR) diff --git a/raytracer/o-ppm.h b/raytracer/o-ppm.h
t@@ -1,7 +0,0 @@
-#ifndef O_PPM_H_
-#define O_PPM_H_
-
-int image_to_ppm(rgb* rgb_img, const char* filename,
- const unsigned int width, const unsigned int height);
-
-#endif
(DIR) diff --git a/raytracer/render_all_outputs_CPU.sh b/raytracer/render_all_outputs_CPU.sh
t@@ -1,27 +0,0 @@
-#!/bin/bash
-
-FILES=`ls ../output/*.bin`
-
-XRES=800
-YRES=800
-
-WORKHORSE=CPU
-
-echo "# Rendering PPM images and converting to JPG"
-for F in ../output/*.bin
-do
- (BASE=`basename $F`;
- ./rt $WORKHORSE $F $XRES $YRES ../img_out/$BASE.ppm;
- convert ../img_out/$BASE.ppm ../img_out/$BASE.jpg;)
- #rm ../img_out/$BASE.ppm &)
-done
-
-#echo "# Converting PPM files to JPG using ImageMagick in parallel"
-#for F in ../img_out/*.ppm
-#do
-# (BASE=`basename $F`; convert $F $F.jpg &)
-#done
-
-#echo "# Removed temporary files"
-#rm ../img_out/*.ppm
-
(DIR) diff --git a/raytracer/render_all_outputs_GPU.sh b/raytracer/render_all_outputs_GPU.sh
t@@ -1,29 +0,0 @@
-#!/bin/bash
-
-FILES=`ls ../output/*.bin`
-
-XRES=800
-YRES=800
-
-WORKHORSE=GPU
-
-echo "# Rendering PPM images"
-for F in ../output/*.bin
-do
- BASE=`basename $F`
- ./rt $WORKHORSE $F $XRES $YRES ../img_out/$BASE.ppm > /dev/null
- if [ $? -ne 0 ] ; then
- echo "Error rendering $F, trying again..."
- ./rt $WORKHORSE $F $XRES $YRES ../img_out/$BASE.ppm > /dev/null
- fi
-done
-
-echo "# Converting PPM files to JPEG using ImageMagick in parallel"
-for F in ../img_out/*.ppm
-do
- (BASE=`basename $F`; convert $F $F.jpg &)
-done
-
-#echo "# Removed temporary files"
-#rm ../img_out/*.ppm
-
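The conversion loop above backgrounds one convert process per PPM file with no upper bound, which can exhaust memory on directories with many outputs. A bounded alternative sketch (the -P job count is arbitrary, and xargs -P is a GNU/BSD extension):

    find ../img_out -name '*.ppm' | xargs -n 1 -P 4 -I {} convert {} {}.jpg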
(DIR) diff --git a/raytracer/rt-kernel-cpu.cpp b/raytracer/rt-kernel-cpu.cpp
t@@ -1,273 +0,0 @@
-#include <iostream>
-#include <cstdio>
-#include <cmath>
-#include <time.h>
-#include <cuda.h>
-#include <cutil_math.h>
-#include <string.h>
-#include <stdlib.h>
-#include "header.h"
-#include "rt-kernel-cpu.h"
-
-// Constants
-float3 constc_u;
-float3 constc_v;
-float3 constc_w;
-float3 constc_eye;
-float4 constc_imgplane;
-float constc_d;
-float3 constc_light;
-
-__inline__ float3 f4_to_f3(float4 in)
-{
- return make_float3(in.x, in.y, in.z);
-}
-
-__inline__ float4 f3_to_f4(float3 in)
-{
- return make_float4(in.x, in.y, in.z, 0.0f);
-}
-
-__inline__ float lengthf3(float3 in)
-{
- return sqrt(in.x*in.x + in.y*in.y + in.z*in.z);
-}
-
-// Kernel for initializing image data
-void imageInit_cpu(unsigned char* _img, unsigned int pixels)
-{
- for (unsigned int mempos=0; mempos<pixels; ++mempos) {
- _img[mempos*4] = 255; // Red channel
- _img[mempos*4 + 1] = 255; // Green channel
- _img[mempos*4 + 2] = 255; // Blue channel
- }
-}
-
-// Calculate ray origins and directions
-void rayInitPerspective_cpu(float3* _ray_origo,
- float3* _ray_direction,
- float3 eye,
- unsigned int width,
- unsigned int height)
-{
- int i,j;
- unsigned int mempos;
- float p_u, p_v;
- #pragma omp parallel for private(mempos,j,p_u,p_v)
- for (i=0; i<(int)width; ++i) {
- for (j=0; j<(int)height; ++j) {
-
- mempos = i + j*width;
-
- // Calculate pixel coordinates in image plane
- p_u = constc_imgplane.x + (constc_imgplane.y - constc_imgplane.x)
- * (i + 0.5f) / width;
- p_v = constc_imgplane.z + (constc_imgplane.w - constc_imgplane.z)
- * (j + 0.5f) / height;
-
- // Write ray origo and direction to global memory
- _ray_origo[mempos] = constc_eye;
- _ray_direction[mempos] = -constc_d*constc_w + p_u*constc_u + p_v*constc_v;
- }
- }
-}
-
-// Check whether the pixel's viewing ray intersects with the spheres,
-// and shade the pixel correspondingly
-void rayIntersectSpheres_cpu(float3* _ray_origo,
- float3* _ray_direction,
- float4* _p,
- float* _nuance,
- unsigned char* _img,
- unsigned int pixels,
- unsigned int np)
-{
- long int mempos;
- float3 e, d, n, p, c;
- float tdist, R, Delta, t_minus, dotprod, I_d, k_d, k_a, I_a, nuance;
- Inttype i, ifinal;
- #pragma omp parallel for private(e,d,n,p,c,tdist,R,Delta,t_minus,dotprod,I_d,k_d,k_a,I_a,nuance,i,ifinal)
- for (mempos=0; mempos<pixels; ++mempos) {
-
- // Read ray data from global memory
- e = _ray_origo[mempos];
- d = _ray_direction[mempos];
-
- // Distance, in ray steps, between object and eye initialized with a large value
- tdist = 1e10f;
-
- // Iterate through all particles
- for (i=0; i<np; ++i) {
-
- // Read sphere coordinate and radius
- c = f4_to_f3(_p[i]);
- R = _p[i].w;
-
- // Calculate the discriminant: d = B^2 - 4AC
- Delta = (2.0f*dot(d,(e-c)))*(2.0f*dot(d,(e-c))) // B^2
- - 4.0f*dot(d,d) // -4*A
- * (dot((e-c),(e-c)) - R*R); // C
-
- // If the discriminant is positive, there are two solutions
- // One where the line enters the sphere, and one where it exits
- if (Delta > 0.0f) {
-
- // Calculate roots, Shirley 2009 p. 77
- t_minus = ((dot(-d,(e-c)) - sqrt( dot(d,(e-c))*dot(d,(e-c)) - dot(d,d)
- * (dot((e-c),(e-c)) - R*R) ) ) / dot(d,d));
-
- // Check whether intersection is closer than previous values
- if (fabs(t_minus) < tdist) {
- p = e + t_minus*d;
- tdist = fabs(t_minus);
- n = normalize(2.0f * (p - c)); // Surface normal
- ifinal = i;
- }
-
- } // End of solution branch
-
- } // End of particle loop
-
- // Write pixel color
- if (tdist < 1e10f) {
-
- // Lambertian shading parameters
- //float dotprod = fabs(dot(n, constc_light));
- dotprod = fmax(0.0f,dot(n, constc_light));
- I_d = 70.0f; // Light intensity
- k_d = 5.0f; // Diffuse coefficient
-
- // Ambient shading
- k_a = 30.0f;
- I_a = 5.0f;
-
- // Read color nuance of grain
- nuance = _nuance[ifinal];
-
- // Write shading model values to pixel color channels
- _img[mempos*4] = (unsigned char) ((k_d * I_d * dotprod
- + k_a * I_a)*0.48f*nuance);
- _img[mempos*4 + 1] = (unsigned char) ((k_d * I_d * dotprod
- + k_a * I_a)*0.41f*nuance);
- _img[mempos*4 + 2] = (unsigned char) ((k_d * I_d * dotprod
- + k_a * I_a)*0.27f*nuance);
- }
- }
-}
-
-
-void cameraInit_cpu(float3 eye, float3 lookat, float imgw, float hw_ratio)
-{
- // Image dimensions in world space (l, r, b, t)
- float4 imgplane = make_float4(-0.5f*imgw, 0.5f*imgw, -0.5f*imgw*hw_ratio, 0.5f*imgw*hw_ratio);
-
- // The view vector
- float3 view = eye - lookat;
-
- // Construct the camera view orthonormal base
- //float3 up = make_float3(0.0f, 1.0f, 0.0f); // Pointing upward along +y
- float3 up = make_float3(0.0f, 0.0f, 1.0f); // Pointing upward along +z
- float3 w = -view/length(view); // w: Pointing backwards
- float3 u = cross(up, w) / length(cross(up, w));
- float3 v = cross(w, u);
-
- // Focal length: 80% of the view vector length
- float d = lengthf3(view)*0.8f;
-
- // Light direction (points towards light source)
- float3 light = normalize(-1.0f*eye*make_float3(1.0f, 0.2f, 0.6f));
-
- std::cout << " Transfering camera values to constant memory\n";
-
- constc_u = u;
- constc_v = v;
- constc_w = w;
- constc_eye = eye;
- constc_imgplane = imgplane;
- constc_d = d;
- constc_light = light;
-
- std::cout << "Rendering image...";
-}
-
-
-// Wrapper for the rt algorithm
-int rt_cpu(float4* p, unsigned int np,
- rgb* img, unsigned int width, unsigned int height,
- f3 origo, f3 L, f3 eye, f3 lookat, float imgw) {
-
- using std::cout;
-
- cout << "Initializing CPU raytracer:\n";
-
- // Initialize CPU timestamp recorders
- float t1_go, t2_go, t1_stop, t2_stop;
-
- // Start timer 1
- t1_go = clock();
-
- // Generate random nuance values for all grains
- static float* _nuance; // Values between 0.5 and 1.0
- _nuance = new float[np];
- srand(0); // Initialize seed with the same value at every run
- for (unsigned int i=0; i<np; ++i)
- _nuance[i] = (float)rand()/RAND_MAX * 0.5f + 0.5f;
-
- // Allocate memory
- cout << " Allocating memory\n";
- static unsigned char *_img; // RGBw values in image
- static float3* _ray_origo; // Ray origo (x,y,z)
- static float3* _ray_direction; // Ray direction (x,y,z)
- _img = new unsigned char[width*height*4];
- _ray_origo = new float3[width*height];
- _ray_direction = new float3[width*height];
-
- // Arrange thread/block structure
- unsigned int pixels = width*height;
- float hw_ratio = (float)height/(float)width;
-
- // Start timer 2
- t2_go = clock();
-
- // Initialize image to background color
- imageInit_cpu(_img, pixels);
-
- // Initialize camera
- cameraInit_cpu(make_float3(eye.x, eye.y, eye.z),
- make_float3(lookat.x, lookat.y, lookat.z),
- imgw, hw_ratio);
-
- // Construct rays for perspective projection
- rayInitPerspective_cpu(
- _ray_origo, _ray_direction,
- make_float3(eye.x, eye.y, eye.z),
- width, height);
-
- // Find closest intersection between rays and spheres
- rayIntersectSpheres_cpu(
- _ray_origo, _ray_direction,
- p, _nuance, _img, pixels, np);
-
- // Stop timer 2
- t2_stop = clock();
-
- memcpy(img, _img, sizeof(unsigned char)*pixels*4);
-
- // Free dynamically allocated host memory
- delete [] _nuance;
- delete [] _img;
- delete [] _ray_origo;
- delete [] _ray_direction;
-
- // Stop timer 1
- t1_stop = clock();
-
- // Report time spent
- cout << " done.\n"
- << " Time spent on entire CPU raytracing routine: "
- << (t1_stop-t1_go)/CLOCKS_PER_SEC*1000.0 << " ms\n";
- cout << " - Functions: " << (t2_stop-t2_go)/CLOCKS_PER_SEC*1000.0 << " ms\n";
-
- // Return successfully
- return 0;
-}
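The intersection test above follows the quadratic formulation of Shirley (2009): a ray p(t) = e + t*d hits a sphere with center c and radius R where |e + t*d - c|^2 = R^2, giving the discriminant (2 d.(e-c))^2 - 4 (d.d)((e-c).(e-c) - R^2) and, when it is positive, the nearer root t_minus. A standalone scalar sketch of the same test, without the CUDA vector types:

    #include <cmath>

    struct vec3 { float x, y, z; };

    static float dot3(vec3 a, vec3 b) { return a.x*b.x + a.y*b.y + a.z*b.z; }
    static vec3 sub3(vec3 a, vec3 b) { vec3 r = {a.x-b.x, a.y-b.y, a.z-b.z}; return r; }

    // Returns true and the nearer root t if the ray e + t*d intersects
    // the sphere (c, R); algebraically equivalent to the t_minus
    // computation in rayIntersectSpheres_cpu().
    static bool ray_sphere(vec3 e, vec3 d, vec3 c, float R, float* t)
    {
        vec3 ec = sub3(e, c);
        float A = dot3(d, d);
        float B = 2.0f*dot3(d, ec);
        float C = dot3(ec, ec) - R*R;
        float Delta = B*B - 4.0f*A*C; // discriminant
        if (Delta <= 0.0f)
            return false;
        *t = (-B - std::sqrt(Delta)) / (2.0f*A);
        return true;
    }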
(DIR) diff --git a/raytracer/rt-kernel-cpu.h b/raytracer/rt-kernel-cpu.h
t@@ -1,14 +0,0 @@
-#ifndef RT_KERNEL_CPU_H_
-#define RT_KERNEL_CPU_H_
-
-#include <vector_functions.h>
-
-// Host prototype functions
-
-void cameraInit(float3 eye, float3 lookat, float imgw, float hw_ratio);
-
-int rt_cpu(float4* p, const unsigned int np,
- rgb* img, const unsigned int width, const unsigned int height,
- f3 origo, f3 L, f3 eye, f3 lookat, float imgw);
-
-#endif
(DIR) diff --git a/raytracer/rt-kernel.cu b/raytracer/rt-kernel.cu
t@@ -1,454 +0,0 @@
-#include <iostream>
-#include <cutil_math.h>
-#include "header.h"
-#include "rt-kernel.h"
-#include "colorbar.h"
-
-unsigned int iDivUp (unsigned int a, unsigned int b) {
- return (a % b != 0) ? (a / b + 1) : (a / b);
-}
-
-__inline__ __host__ __device__ float3 f4_to_f3(float4 in)
-{
- return make_float3(in.x, in.y, in.z);
-}
-
-__inline__ __host__ __device__ float4 f3_to_f4(float3 in)
-{
- return make_float4(in.x, in.y, in.z, 0.0f);
-}
-
-// Kernel for initializing image data
-__global__ void imageInit(unsigned char* _img, unsigned int pixels)
-{
- // Compute pixel position from threadIdx/blockIdx
- unsigned int mempos = threadIdx.x + blockIdx.x * blockDim.x;
- if (mempos >= pixels)
- return;
-
- _img[mempos*4] = 255; // Red channel
- _img[mempos*4 + 1] = 255; // Green channel
- _img[mempos*4 + 2] = 255; // Blue channel
-}
-
-// Calculate ray origins and directions
-__global__ void rayInitPerspective(float4* _ray_origo,
- float4* _ray_direction,
- float4 eye,
- unsigned int width,
- unsigned int height)
-{
- // Compute pixel position from threadIdx/blockIdx
- unsigned int mempos = threadIdx.x + blockIdx.x * blockDim.x;
- if (mempos >= width*height)
- return;
-
- // Calculate 2D position from linear index
- unsigned int i = mempos % width;
- unsigned int j = mempos / width;
-
- // Calculate pixel coordinates in image plane
- float p_u = const_imgplane.x + (const_imgplane.y - const_imgplane.x)
- * (i + 0.5f) / width;
- float p_v = const_imgplane.z + (const_imgplane.w - const_imgplane.z)
- * (j + 0.5f) / height;
-
- // Write ray origo and direction to global memory
- _ray_origo[mempos] = const_eye;
- _ray_direction[mempos] = -const_d*const_w + p_u*const_u + p_v*const_v;
-}
-
-// Check whether the pixel's viewing ray intersects with the spheres,
-// and shade the pixel correspondingly
-__global__ void rayIntersectSpheres(float4* _ray_origo,
- float4* _ray_direction,
- float4* _p,
- unsigned char* _img)
-{
- // Compute pixel position from threadIdx/blockIdx
- unsigned int mempos = threadIdx.x + blockIdx.x * blockDim.x;
- if (mempos >= const_pixels)
- return;
-
- // Read ray data from global memory
- float3 e = f4_to_f3(_ray_origo[mempos]);
- float3 d = f4_to_f3(_ray_direction[mempos]);
- //float step = length(d);
-
- // Distance, in ray steps, between object and eye initialized with a large value
- float tdist = 1e10f;
-
- // Surface normal at closest sphere intersection
- float3 n;
-
- // Intersection point coordinates
- float3 p;
-
- // Iterate through all particles
- for (Inttype i=0; i<const_np; ++i) {
-
- // Read sphere coordinate and radius
- float3 c = f4_to_f3(_p[i]);
- float R = _p[i].w;
-
- // Calculate the discriminant: d = B^2 - 4AC
- float Delta = (2.0f*dot(d,(e-c)))*(2.0f*dot(d,(e-c))) // B^2
- - 4.0f*dot(d,d) // -4*A
- * (dot((e-c),(e-c)) - R*R); // C
-
- // If the discriminant is positive, there are two solutions
- // One where the line enters the sphere, and one where it exits
- if (Delta > 0.0f) {
-
- // Calculate roots, Shirley 2009 p. 77
- float t_minus = ((dot(-d,(e-c)) - sqrt( dot(d,(e-c))*dot(d,(e-c)) - dot(d,d)
- * (dot((e-c),(e-c)) - R*R) ) ) / dot(d,d));
-
- // Check whether intersection is closer than previous values
- if (fabs(t_minus) < tdist) {
- p = e + t_minus*d;
- tdist = fabs(t_minus);
- n = normalize(2.0f * (p - c)); // Surface normal
- }
-
- } // End of solution branch
-
- } // End of particle loop
-
- // Write pixel color
- if (tdist < 1e10f) {
-
- // Lambertian shading parameters
- float dotprod = fmax(0.0f,dot(n, f4_to_f3(const_light)));
- float I_d = 40.0f; // Light intensity
- float k_d = 5.0f; // Diffuse coefficient
-
- // Ambient shading
- float k_a = 10.0f;
- float I_a = 5.0f; // 5.0 for black background
-
- // Write shading model values to pixel color channels
- _img[mempos*4] = (unsigned char) ((k_d * I_d * dotprod
- + k_a * I_a)*0.48f);
- _img[mempos*4 + 1] = (unsigned char) ((k_d * I_d * dotprod
- + k_a * I_a)*0.41f);
- _img[mempos*4 + 2] = (unsigned char) ((k_d * I_d * dotprod
- + k_a * I_a)*0.27f);
-
- }
-}
-
-// Check whether the pixel's viewing ray intersects with the spheres,
-// and shade the pixel correspondingly using a colormap
-__global__ void rayIntersectSpheresColormap(float4* _ray_origo,
- float4* _ray_direction,
- float4* _p,
- float* _fixvel,
- float* _linarr,
- float max_val,
- unsigned char* _img)
-{
- // Compute pixel position from threadIdx/blockIdx
- unsigned int mempos = threadIdx.x + blockIdx.x * blockDim.x;
- if (mempos >= const_pixels)
- return;
-
- // Read ray data from global memory
- float3 e = f4_to_f3(_ray_origo[mempos]);
- float3 d = f4_to_f3(_ray_direction[mempos]);
-
- // Distance, in ray steps, between object and eye initialized with a large value
- float tdist = 1e10f;
-
- // Surface normal at closest sphere intersection
- float3 n;
-
- // Intersection point coordinates
- float3 p;
-
- unsigned int hitidx;
-
- // Iterate through all particles
- for (Inttype i=0; i<const_np; ++i) {
-
- // Read sphere coordinate and radius
- float3 c = f4_to_f3(_p[i]);
- float R = _p[i].w;
-
- // Calculate the discriminant: d = B^2 - 4AC
- float Delta = (2.0f*dot(d,(e-c)))*(2.0f*dot(d,(e-c))) // B^2
- - 4.0f*dot(d,d) // -4*A
- * (dot((e-c),(e-c)) - R*R); // C
-
- // If the discriminant is positive, there are two solutions
- // One where the line enters the sphere, and one where it exits
- if (Delta > 0.0f) {
-
- // Calculate roots, Shirley 2009 p. 77
- float t_minus = ((dot(-d,(e-c)) - sqrt( dot(d,(e-c))*dot(d,(e-c)) - dot(d,d)
- * (dot((e-c),(e-c)) - R*R) ) ) / dot(d,d));
-
- // Check whether intersection is closer than previous values
- if (fabs(t_minus) < tdist) {
- p = e + t_minus*d;
- tdist = fabs(t_minus);
- n = normalize(2.0f * (p - c)); // Surface normal
- hitidx = i;
- }
-
- } // End of solution branch
-
- } // End of particle loop
-
- // Write pixel color
- if (tdist < 1e10f) {
-
- // Fetch particle data used for color
- float ratio = _linarr[hitidx] / max_val;
- float fixvel = _fixvel[hitidx];
-
- // Clamp the ratio to the interval [0.01;0.99]
- if (ratio < 0.01f)
- ratio = 0.01f;
- if (ratio > 0.99f)
- ratio = 0.99f;
-
- // Lambertian shading parameters
- float dotprod = fmax(0.0f,dot(n, f4_to_f3(const_light)));
- float I_d = 40.0f; // Light intensity
- float k_d = 5.0f; // Diffuse coefficient
-
- // Ambient shading
- float k_a = 10.0f;
- float I_a = 5.0f;
-
- float redv = red(ratio);
- float greenv = green(ratio);
- float bluev = blue(ratio);
-
- // Make particle dark grey if the horizontal velocity is fixed
- if (fixvel > 0.f) {
- redv = 0.5;
- greenv = 0.5;
- bluev = 0.5;
- }
-
- // Write shading model values to pixel color channels
- _img[mempos*4] = (unsigned char) ((k_d * I_d * dotprod
- + k_a * I_a)*redv);
- _img[mempos*4 + 1] = (unsigned char) ((k_d * I_d * dotprod
- + k_a * I_a)*greenv);
- _img[mempos*4 + 2] = (unsigned char) ((k_d * I_d * dotprod
- + k_a * I_a)*bluev);
- }
-}
-
-extern "C"
-__host__ void cameraInit(float4 eye, float4 lookat, float imgw, float hw_ratio,
- unsigned int pixels, Inttype np)
-{
- // Image dimensions in world space (l, r, b, t)
- float4 imgplane = make_float4(-0.5f*imgw, 0.5f*imgw, -0.5f*imgw*hw_ratio, 0.5f*imgw*hw_ratio);
-
- // The view vector
- float4 view = eye - lookat;
-
- // Construct the camera view orthonormal base
- //float4 up = make_float4(0.0f, 1.0f, 0.0f, 0.0f); // Pointing upward along +y
- float4 up = make_float4(0.0f, 0.0f, 1.0f, 0.0f); // Pointing upward along +z
- float4 w = -view/length(view); // w: Pointing backwards
- float4 u = make_float4(cross(make_float3(up.x, up.y, up.z),
- make_float3(w.x, w.y, w.z)), 0.0f)
- / length(cross(make_float3(up.x, up.y, up.z), make_float3(w.x, w.y, w.z)));
- float4 v = make_float4(cross(make_float3(w.x, w.y, w.z), make_float3(u.x, u.y, u.z)), 0.0f);
-
- // Focal length: 80% of the view vector length
- float d = length(view)*0.8f;
-
- // Light direction (points towards light source)
- float4 light = normalize(-1.0f*eye*make_float4(1.0f, 0.2f, 0.6f, 0.0f));
-
- std::cout << " Transfering camera values to constant memory\n";
-
- cudaMemcpyToSymbol("const_u", &u, sizeof(u));
- cudaMemcpyToSymbol("const_v", &v, sizeof(v));
- cudaMemcpyToSymbol("const_w", &w, sizeof(w));
- cudaMemcpyToSymbol("const_eye", &eye, sizeof(eye));
- cudaMemcpyToSymbol("const_imgplane", &imgplane, sizeof(imgplane));
- cudaMemcpyToSymbol("const_d", &d, sizeof(d));
- cudaMemcpyToSymbol("const_light", &light, sizeof(light));
- cudaMemcpyToSymbol("const_pixels", &pixels, sizeof(pixels));
- cudaMemcpyToSymbol("const_np", &np, sizeof(np));
-}
-
-// Check for CUDA errors
-extern "C"
-__host__ void checkForCudaErrors(const char* checkpoint_description)
-{
- cudaError_t err = cudaGetLastError();
- if (err != cudaSuccess) {
- std::cout << "\nCuda error detected, checkpoint: " << checkpoint_description
- << "\nError string: " << cudaGetErrorString(err) << "\n";
- exit(EXIT_FAILURE);
- }
-}
-
-
-// Wrapper for the rt kernel
-extern "C"
-__host__ int rt(float4* p, Inttype np,
- rgb* img, unsigned int width, unsigned int height,
- f3 origo, f3 L, f3 eye, f3 lookat, float imgw,
- int visualize, float max_val,
- float* fixvel,
- float* xsum,
- float* pres,
- float* es_dot,
- float* es,
- float* vel)
-{
- using std::cout;
-
- cout << "Initializing CUDA:\n";
-
- // Initialize GPU timestamp recorders
- float t1, t2;
- cudaEvent_t t1_go, t2_go, t1_stop, t2_stop;
- cudaEventCreate(&t1_go);
- cudaEventCreate(&t2_go);
- cudaEventCreate(&t2_stop);
- cudaEventCreate(&t1_stop);
-
- // Start timer 1
- cudaEventRecord(t1_go, 0);
-
- // Allocate memory
- cout << " Allocating device memory\n";
- static float4 *_p; // Particle positions (x,y,z) and radius (w)
- static float *_fixvel; // Indicates whether a particle has a fixed horizontal velocity
- static float *_linarr; // Array for linear values to color the particles after
- static unsigned char *_img; // RGBw values in image
- static float4 *_ray_origo; // Ray origo (x,y,z)
- static float4 *_ray_direction; // Ray direction (x,y,z)
- cudaMalloc((void**)&_p, np*sizeof(float4));
- cudaMalloc((void**)&_fixvel, np*sizeof(float));
- cudaMalloc((void**)&_linarr, np*sizeof(float)); // 0 size if visualize = 0;
- cudaMalloc((void**)&_img, width*height*4*sizeof(unsigned char));
- cudaMalloc((void**)&_ray_origo, width*height*sizeof(float4));
- cudaMalloc((void**)&_ray_direction, width*height*sizeof(float4));
-
- // Transfer particle data
- cout << " Transfering particle data: host -> device\n";
- cudaMemcpy(_p, p, np*sizeof(float4), cudaMemcpyHostToDevice);
- cudaMemcpy(_fixvel, fixvel, np*sizeof(float), cudaMemcpyHostToDevice);
- if (visualize == 1)
- cudaMemcpy(_linarr, pres, np*sizeof(float), cudaMemcpyHostToDevice);
- if (visualize == 2)
- cudaMemcpy(_linarr, es_dot, np*sizeof(float), cudaMemcpyHostToDevice);
- if (visualize == 3)
- cudaMemcpy(_linarr, es, np*sizeof(float), cudaMemcpyHostToDevice);
- if (visualize == 4)
- cudaMemcpy(_linarr, vel, np*sizeof(float), cudaMemcpyHostToDevice);
- if (visualize == 5)
- cudaMemcpy(_linarr, xsum, np*sizeof(float), cudaMemcpyHostToDevice);
-
- // Check for errors after memory allocation
- checkForCudaErrors("CUDA error after memory allocation");
-
- // Arrange thread/block structure
- unsigned int pixels = width*height;
- float hw_ratio = (float)height/(float)width;
- const unsigned int threadsPerBlock = 256;
- const unsigned int blocksPerGrid = iDivUp(pixels, threadsPerBlock);
-
- // Start timer 2
- cudaEventRecord(t2_go, 0);
-
- // Initialize image to background color
- imageInit<<< blocksPerGrid, threadsPerBlock >>>(_img, pixels);
-
- // Initialize camera
- cameraInit(make_float4(eye.x, eye.y, eye.z, 0.0f),
- make_float4(lookat.x, lookat.y, lookat.z, 0.0f),
- imgw, hw_ratio, pixels, np);
- checkForCudaErrors("CUDA error after cameraInit");
-
- // Construct rays for perspective projection
- rayInitPerspective<<< blocksPerGrid, threadsPerBlock >>>(
- _ray_origo, _ray_direction,
- make_float4(eye.x, eye.y, eye.z, 0.0f),
- width, height);
-
- cudaThreadSynchronize();
-
- // Find closest intersection between rays and spheres
- if (visualize == 1) { // Visualize pressure
- cout << " Pressure color map range: [0, " << max_val << "] Pa\n";
- rayIntersectSpheresColormap<<< blocksPerGrid, threadsPerBlock >>>(
- _ray_origo, _ray_direction,
- _p, _fixvel, _linarr, max_val, _img);
- } else if (visualize == 2) { // es_dot visualization
- cout << " Shear heat production rate color map range: [0, " << max_val << "] W\n";
- rayIntersectSpheresColormap<<< blocksPerGrid, threadsPerBlock >>>(
- _ray_origo, _ray_direction,
- _p, _fixvel, _linarr, max_val, _img);
- } else if (visualize == 3) { // es visualization
- cout << " Total shear heat color map range: [0, " << max_val << "] J\n";
- rayIntersectSpheresColormap<<< blocksPerGrid, threadsPerBlock >>>(
- _ray_origo, _ray_direction,
- _p, _fixvel, _linarr, max_val, _img);
- } else if (visualize == 4) { // velocity visualization
- cout << " Velocity color map range: [0, " << max_val << "] m/s\n";
- rayIntersectSpheresColormap<<< blocksPerGrid, threadsPerBlock >>>(
- _ray_origo, _ray_direction,
- _p, _fixvel, _linarr, max_val, _img);
- } else if (visualize == 5) { // xsum visualization
- cout << " XSum color map range: [0, " << max_val << "] m\n";
- rayIntersectSpheresColormap<<< blocksPerGrid, threadsPerBlock >>>(
- _ray_origo, _ray_direction,
- _p, _fixvel, _linarr, max_val, _img);
- } else { // Normal visualization
- rayIntersectSpheres<<< blocksPerGrid, threadsPerBlock >>>(
- _ray_origo, _ray_direction,
- _p, _img);
- }
-
- // Make sure all threads are done before continuing CPU control sequence
- cudaThreadSynchronize();
-
- // Check for errors
- checkForCudaErrors("CUDA error after kernel execution");
-
- // Stop timer 2
- cudaEventRecord(t2_stop, 0);
- cudaEventSynchronize(t2_stop);
-
- // Transfer image data from device to host
- cout << " Transfering image data: device -> host\n";
- cudaMemcpy(img, _img, width*height*4*sizeof(unsigned char), cudaMemcpyDeviceToHost);
-
- // Free dynamically allocated device memory
- cudaFree(_p);
- cudaFree(_fixvel);
- cudaFree(_linarr);
- cudaFree(_img);
- cudaFree(_ray_origo);
- cudaFree(_ray_direction);
-
- // Stop timer 1
- cudaEventRecord(t1_stop, 0);
- cudaEventSynchronize(t1_stop);
-
- // Calculate time spent in t1 and t2
- cudaEventElapsedTime(&t1, t1_go, t1_stop);
- cudaEventElapsedTime(&t2, t2_go, t2_stop);
-
- // Report time spent
- cout << " Time spent on entire GPU routine: "
- << t1 << " ms\n";
- cout << " - Kernels: " << t2 << " ms\n"
- << " - Memory alloc. and transfer: " << t1-t2 << " ms\n";
-
- // Return successfully
- return 0;
-}
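A portability note on cameraInit() above: passing the symbol name as a string to cudaMemcpyToSymbol() was deprecated in CUDA 4.1 and removed in CUDA 5.0, where the call takes the symbol itself; cudaThreadSynchronize() was likewise superseded by cudaDeviceSynchronize(). A sketch of the modern form, assuming the __constant__ declarations from rt-kernel.h:

    // CUDA 5.0+: pass the __constant__ symbol, not its name as a string.
    cudaMemcpyToSymbol(const_u, &u, sizeof(u));
    cudaMemcpyToSymbol(const_eye, &eye, sizeof(eye));
    // ...likewise for const_v, const_w, const_imgplane, const_d,
    // const_light, const_pixels and const_np.
    cudaDeviceSynchronize(); // replaces cudaThreadSynchronize()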
(DIR) diff --git a/raytracer/rt-kernel.h b/raytracer/rt-kernel.h
t@@ -1,35 +0,0 @@
-#ifndef RT_KERNEL_H_
-#define RT_KERNEL_H_
-
-#include <vector_functions.h>
-#include "../src/datatypes.h"
-#include "header.h"
-
-// Constants
-__constant__ float4 const_u;
-__constant__ float4 const_v;
-__constant__ float4 const_w;
-__constant__ float4 const_eye;
-__constant__ float4 const_imgplane;
-__constant__ float const_d;
-__constant__ float4 const_light;
-__constant__ unsigned int const_pixels;
-__constant__ Inttype const_np;
-
-
-// Host prototype functions
-
-extern "C"
-void cameraInit(float4 eye, float4 lookat, float imgw, float hw_ratio,
- unsigned int pixels, Inttype np);
-
-extern "C"
-void checkForCudaErrors(const char* checkpoint_description);
-
-extern "C"
-int rt(float4* p, Inttype np,
- rgb* img, unsigned int width, unsigned int height,
- f3 origo, f3 L, f3 eye, f3 lookat, float imgw,
- int visualize, float max_val,
- float* fixvel, float* xsum, float* pres, float* es_dot, float* es, float* vel);
-#endif
(DIR) diff --git a/raytracer/rt_GPU_init_pres.sh b/raytracer/rt_GPU_init_pres.sh
t@@ -1,36 +0,0 @@
-#!/bin/bash
-
-# Pass max. pressure as command line argument
-
-XRES=400
-YRES=1600
-
-WORKHORSE=GPU
-
-echo "# Rendering PPM images"
-for F in ../output/*init*.bin
-do
- BASE=`basename $F`
- OUTFILE=$BASE.ppm.jpg
- if [ -e ../img_out/$OUTFILE ]
- then
- echo "$OUTFILE exists." > /dev/null
- else
- ./rt $WORKHORSE $F $XRES $YRES ../img_out/$BASE.ppm pressure $@ > /dev/null
- if [ $? -ne 0 ] ; then
- echo "Error rendering $F, trying again..."
- ./rt $WORKHORSE $F $XRES $YRES ../img_out/$BASE.ppm pressure $@ > /dev/null
- fi
- fi
-done
-
-echo "# Converting PPM files to JPEG using ImageMagick in parallel"
-for F in ../img_out/*.ppm
-do
- (BASE=`basename $F`; convert $F $F.jpg > /dev/null &)
-done
-
-sleep 5
-echo "# Removing temporary PPM files"
-rm ../img_out/*.ppm
-
(DIR) diff --git a/raytracer/rt_GPU_pres.sh b/raytracer/rt_GPU_pres.sh
t@@ -1,36 +0,0 @@
-#!/bin/bash
-
-# Pass max. pressure as command line argument
-
-XRES=800
-YRES=800
-
-WORKHORSE=GPU
-
-echo "# Rendering PPM images"
-for F in ../output/*.bin
-do
- BASE=`basename $F`
- OUTFILE=$BASE.ppm.jpg
- if [ -e ../img_out/$OUTFILE ]
- then
- echo "$OUTFILE exists." > /dev/null
- else
- ./rt $WORKHORSE $F $XRES $YRES ../img_out/$BASE.ppm pressure $@ > /dev/null
- if [ $? -ne 0 ] ; then
- echo "Error rendering $F, trying again..."
- ./rt $WORKHORSE $F $XRES $YRES ../img_out/$BASE.ppm pressure $@ > /dev/null
- fi
- fi
-done
-
-echo "# Converting PPM files to JPEG using ImageMagick in parallel"
-for F in ../img_out/*.ppm
-do
- (BASE=`basename $F`; convert $F $F.jpg > /dev/null &)
-done
-
-sleep 5
-echo "# Removing temporary PPM files"
-rm ../img_out/*.ppm
-
(DIR) diff --git a/raytracer/rt_GPU_tall_pres.sh b/raytracer/rt_GPU_tall_pres.sh
t@@ -1,36 +0,0 @@
-#!/bin/bash
-
-# Pass max. pressure as command line argument
-
-XRES=800
-YRES=1200
-
-WORKHORSE=GPU
-
-echo "# Rendering PPM images"
-for F in ../output/*.bin
-do
- BASE=`basename $F`
- OUTFILE=$BASE.ppm.jpg
- if [ -e ../img_out/$OUTFILE ]
- then
- echo "$OUTFILE exists." > /dev/null
- else
- ./rt $WORKHORSE $F $XRES $YRES ../img_out/$BASE.ppm pressure $@ > /dev/null
- if [ $? -ne 0 ] ; then
- echo "Error rendering $F, trying again..."
- ./rt $WORKHORSE $F $XRES $YRES ../img_out/$BASE.ppm pressure $@ > /dev/null
- fi
- fi
-done
-
-echo "# Converting PPM files to JPEG using ImageMagick in parallel"
-for F in ../img_out/*.ppm
-do
- (BASE=`basename $F`; convert $F $F.jpg > /dev/null &)
-done
-
-sleep 5
-echo "# Removing temporary PPM files"
-rm ../img_out/*.ppm
-
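The three rt_GPU_*_pres.sh scripts above differ only in output resolution and input glob; a parameterized sketch that could replace them (argument handling is illustrative):

    #!/bin/bash
    # Usage: ./rt_GPU_pres_generic.sh <xres> <yres> <glob> <max_pressure>
    XRES=$1; YRES=$2; GLOB=$3; shift 3

    for F in ../output/$GLOB
    do
        BASE=`basename $F`
        ./rt GPU $F $XRES $YRES ../img_out/$BASE.ppm pressure "$@" > /dev/null
    done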