\documentclass[a4paper,twoside,11pt]{article}
%\documentclass[a4paper,twoside,openright,english,11pt]{article}

\usepackage{graphicx}
%\usepackage{lcy}
\usepackage{color}
\usepackage{epsfig}

\usepackage{amsmath}
\usepackage{algpseudocode}

\usepackage[cp1251]{inputenc} % delme
\usepackage[russian]{babel}   % delme
\newcommand{\TODO}[1]{\textcolor {red}{TODO: #1}} % del me !!!!

% text settings
% --------------
% \setlength{\voffset}{-3.5cm}
% \setlength{\hoffset}{-3cm}
% \setlength{\topmargin}{1cm}
% \setlength{\oddsidemargin}{3.3cm}
% \setlength{\evensidemargin}{2.6cm}
% \setlength{\textwidth}{16cm}
% \setlength{\textheight}{24cm}
% \setlength{\parindent}{0cm}
% \setlength{\parskip}{0.2cm}
% \setlength{\headheight}{2cm}
% \setlength{\headsep}{1.3cm}
% \setlength{\footskip}{1.3cm}


\setlength{\voffset}{-3.5cm} 
\setlength{\hoffset}{-3cm}
\setlength{\topmargin}{1cm}
\setlength{\oddsidemargin}{3.3cm}
\setlength{\evensidemargin}{2.6cm} 
\setlength{\textwidth}{15.5cm}
\setlength{\textheight}{24cm} 
% \setlength{\parindent}{1cm}   % otstup v abzace
% \setlength{\parskip}{0.2cm}		% rasstojanie mezdu abzacami
\setlength{\headheight}{2cm}
\setlength{\headsep}{1.3cm}
\setlength{\footskip}{1.3cm}


\renewcommand{\a}{{\bf a}}
\renewcommand{\r}{{\bf r}}
\newcommand{\m}{{\bf m}}
\newcommand{\h}{{\bf h}}
\newcommand{\B}{{\bf B}}

%\renewcommand{\r}{\vr}
%\newcommand{\m}{\vm}
%\newcommand{\h}{{\bf h}}
\newcommand{\vzeta}{{\boldsymbol \zeta}}
\newcommand{\veta}{{\boldsymbol \eta}}
\newcommand{\vxi}{{\boldsymbol \xi}}
\newcommand{\vnu}{{\boldsymbol \nu}}
\newcommand{\vmu}{{\boldsymbol \mu}}
\newcommand{\vphi}{{\boldsymbol \varphi}}


% \newcommand{\vzero}{{\bf 0}}
% \renewcommand{\r}{\vr}
% \newcommand{\m}{\vm}
% \newcommand{\h}{{\bf h}}
% \newcommand{\zz}{{\mbox{\boldmath$\zeta$}}}
% \newcommand{\veta}{{\mbox{\boldmath$\eta$}}}
% \newcommand{\vnu}{{\mbox{\boldmath$\nu$}}}
% \renewcommand{\v}{{\bf v}}
% \newcommand{\p}{{\bf p}}
% \newcommand{\ff}[2]{{\displaystyle\frac{#1}{#2}}}
% \newcommand{\dd}[2]{ \displaystyle\frac{\partial #1}{\partial #2} }
% \newcommand{\tuned}{\phantom{{\left.\begin{array}{c} A \end{array}\right|_A}_{A_{A_A}}}}
% \newcommand{\tunedd}{{{\left.\begin{array}{c} A \\ A \end{array}\right|_{A_{A_{A_A}}}}}}
% %\newcommand{\tunedd}{[13pt]}
% \newcommand{\tuneddd}{\phantom{{\left.\begin{array}{c} A \\ A \\ A \end{array}\right|_A}_{A_{A_A}}}}
% \newcommand{\tuneu}{\phantom{{\left.\begin{array}{c} A \end{array}\right|^A}^A}}
% \newcommand{\tuneuu}{\phantom{{\left.\begin{array}{c} A \\ A \end{array}\right|^A}^A}}
% \newcommand{\tuneuuu}{\phantom{{\left.\begin{array}{c} A \\ A \\ A \end{array}\right|^A}^A}}
% \newcommand{\e}[1]{\widetilde{#1}}
% \newcommand{\n}[1]{\widehat{#1}}


\begin{document}

\title{\sc Vectorized and parallelized \\ Kalman filter based library for track fitting}

\author{ I.~Kisel$^{1,2}$, I.~Kulakov$^{1,3}$, H. Pabst$^{4}$, M. Zyzak$^{1,3}$ }

\date{
	$^1${\small Goethe University Frankfurt am Main, Germany}\\
	$^2${\small GSI Helmholtzzentrum fuer Schwerionenforschung GmbH, Germany}\\
	$^3${\small National Taras Shevchenko University of Kyiv, Ukraine}\\
	$^4${\small Intel GmbH, Germany}
\\
\textcolor {blue}{\today} 
}

% \date{\currenttime}

\maketitle

\begin{abstract}
  \TODO{	change $r^-+$ notations. }
	\TODO{array}
	\TODO{порядок ссылок}
	\TODO{нумерация для всех формул}
 % Modern high energy physics experiments have to process terabytes of input data produced in particle collisions. The core of the data reconstruction in high energy physics is the Kalman filter. Therefore, the speed of Kalman filter based algorithms is of crucial importance in on-line data processing. Therefore, developing fast reconstruction algorithms, which use maximum available power of processors, is important, in particular for initial selection of events interesting for the new physics.
  
 % In the paper described here, several approaches of Kalman filter based track fitting algorithm of the CBM experiment has been implemented and tested in order to find optimal algorithm with respect to speed and precision. The scalability of it has been tested.
   
% 
% 	One of feature, which can speed up algorithms, is a SIMD instruction set, which allows to packing several data items in one register and operate on all of them in one go, thus achieving more operations per clock cycle.
% 
% 	The Kalman filter based track fitting algorithm of the CBM experiment has been already vectorized using in-line operator overloading in 2005 year. Now, in 2009 year, a more powerful processors with more complicated and useful features exists. Flexible and useful interface for such features, which correspondы to use SIMD instruction set on different CPU and GPU processors architectures, was realized by the name of vector classes library.
% 
% 	In the paper described here, the Kalman filter based track fitting algorithm of the CBM experiment has been realized with use of vector classes. Thus the algorithm became even more flexible and modern.
\end{abstract}

\tableofcontents
% \pagebreak

\section{Introduction}

Современные эксперименты в физике высоких энергий (ФВЭ) работают в условиях высоких плотностей треков (порядка $100$ на cm$^2$). Кроме того, особый интерес представляют очень редкие сигналы (down to $10^{-7}$ на событие), для их поиска приходится обрабатывать терабайты данных. В связи с этим реконструкция событий в ФВЭ требует как высокой точности, так и высокой скорости. Основная задача при реконструкции события --- реконструкция параметров треков частиц. Именно от точности, с которой определяются параметры треков, и зависит точность нахождения искомых сигналов. Метод фильтра Калмана (КФ) --- это рекурсивный метод анализа линейных дискретных динамических систем, описываемых вектором параметров, который называется вектором состояния. Он позволяет найти оптимальную оценку параметров треков частиц и достигнуть максимально возможной точности. 

Во многих случаях КФ является ядром реконструкции события в ФВЭ, что обусловленно его свойствами: 
\begin{itemize}
 \item оптимальность оценки параметров;
 \item локальность по отношению к измерениям: измерения добавляются одно за другим последовательно и независимо;
 \item удобство учета нелинейности траектории частицы в общем случае неоднородного магнитного поля;
 \item удобство и корректность учета материала детектора, через который проходит частица;
 \item работа с матрицами маленького размера;
 \item количество вычислительных операций пропорционально числу измерений.
\end{itemize}
Оптимизация скорости работы КФ алгоритмов критична на современных вычислительных серверах. Принимая во внимание архитектуру современных процессоров и тенденции их развития, одним из необходимых условий для достижения высокой скорости является использование векторных инструкций процессора и распараллеливание вычислений между ядрами процессора.

Большинство современных процессоров оснащены SIMD (Single Instruction, Multiple Data) unit с векторными инструкциями, который позволяет выполнять базовые операции с несколькими величинами одновременно. Современные процессоры могут одновременно обрабатывать 4 переменных в одинарной точности. Уже существуют процессоры, SIMD unit которых способен обрабатывать 8 переменных в единичной точности, анонсирован выход процессоров для которых это число увеличится до 16. Кроме того, современные вычислительные серверы многоядерные, типичное количество ядер составляет 8-16, существуют серверы с 80 ядрами. Таким образом, векторизация и распараллеливание вычислений на современных серверах делают возможным использование всего потенциала процессоров и ускорение работы программ на 1-2 порядка. 

Для эффективного использования векторных инструкций необходимо работать в одинарной точности. Такой подход позволяет ускорить вычисления в 4 раза по сравнению со стандратной скалярной реализацией КФ алгоритма в двойной точности. 
Для повышения стабильности Conventional KF в одинарной точности применяется аккуратная начальная оценка параметров трека и аккуратное обращение с ковариционной матрицей на стадии учета измерения~\cite{SIMDKFCPC}. Также существуют методы, позволяющие алгоритмически повысить точность вычислений: square root KF и UD-filtering~\cite{KFbook,Kaminski,Thornton}, использование которых ведет к повышению стабильности КФ.

На базе различных методов реализации КФ была создана библиотека, которая включает в себя:
\begin{itemize}
 \item алгоритмы track fit и track smooother;
 \item методы conventional KF, Potter and Carlson square root KF, UD-filtering;
 \item два подхода для экстраполяции параметров треков, базирующихся на стандартном методе 4-th order Runge-Kutta~\cite{NumRecC} и аналитической формуле~\cite{AnalyticExtra}, которая представляет собой разложение в ряд Тейлора решения уравнений движения частицы в магнитном поле;
 \item возможность переключения c одинарной точности вычислений на двойную;
 \item оптимизацию с точки зрения использования памяти, в частности, использование апроксимированного магнитного поля и упрощенной геометрии детектора, что делает возможным работу в пределах кеша CPU;
\end{itemize}
а также:
\begin{itemize}
 \item векторизацию с помощью inline operator overloading, библиотеки Vector classes (Vc)~\cite{Vc} и Intel Array Building Blocks (ArBB)~\cite{ArBB};
 \item распараллеливание с помощью библиотек Intel Threading Building Blocks (ITBB)~\cite{ITBB}, Intel ArBB и Open Multi-Processing (OpenMP)~\cite{OpenMP}.
\end{itemize}

Алгоритм track fit предназначен для нахождения оценки параметров в точках первого и последнего измерения трека. Алгоритм track smooother позволяет получить оценку параметров в точке любого измерения; используется, в частности, для alingment-a и для отбора некорректно ассоциированных с треком измерений.

% основывается на предыдущей реализации алгоритма фита треков для СВМ эксперимента~\cite{SIMDKFCPC}, которая уже 
% Многообразие имплементированных методов позволяет пользователю выбирать оптимальный для конкретной задачи.

Библиотека реализована и протестирована на примере CBM~(Compressed Baryonic Matter) эксперимента~\cite{TSR}, в котором задача разработки быстрых и эффективных алгоритмов для реконструкции событий стоит наиболее остро. CBM эксперимент разрабатывается как эксперимент по столкновению тяжелых ионов на FAIR (Darmstadt, Germany). Основная цель эксперимента --- иследование highly compressed nuclear matter и её фазовой диаграммы. Для достижения поставленых целей планируется изучение очень редких сигналов в условиях большой множественности треков (около 700 заряженых частиц на столкновение), большой частоты столкновений (до $10^7$ за секунду) и неоднородного магнитного поля. Отсутствие простых критериев отбора информации требует полной реконструкции каждого события в режиме реального времени.

Tests of track parameters quality and time tests have been performed on many-core Intel \TODO{and AMD} servers, which have up to 80 cores.
\TODO{Переход к след. главе.}

\section{Parallel implementation of the algorithms}
The KF library is both vectorized and parallelized (see Fig.~\ref{fig:parallelIll}). The library uses header files, Vector classes and Intel ArBB for vectorization. OpenMP, ITBB and Intel ArBB are used for parallelization between cores and threads.

\begin{figure}[htbp]
\centering\includegraphics[angle=0, width=10cm]{ParallelizationIllustr}
\caption{Vectorization and parallelization in the library}
\label{fig:parallelIll}
\end{figure}

% \subsection{Header files}
Header files overload SIMD instructions implementing all operands and inlining basic arithmetic and logic functions, that makes a code compact and easily readable. For example, a simple code for calculation of a polynomial function of the first order, which is written using SSE instructions~is:
\begin{verbatim}
__m128 y = _mm_add_ps(_mm_mul_ps(a,x),b);
\end{verbatim}
The same code using the header files is:
\begin{verbatim}
fvec y = a*x + b;
\end{verbatim}
with
\begin{verbatim}
friend fvec operator+(const fvec &a, const fvec &b) {
  return _mm_add_ps(a,b); }
friend fvec operator*(const fvec &a, const fvec &b) {
  return _mm_mul_ps(a,b); } 
\end{verbatim}
in the header file.

The implementation is simple and therefore flexible with respect to different CPU architectures. It allows keeping the main program code unchangeable, only the header file correspondent to the architecture should be included. Also the header file with scalar implementation exists and allows easy debugging and comparison.

% \subsection{Vector classes}
The vector classes library is a thin wrapper around the SSE, AVX and LRBni intrinsic. Similar to the header files, it provides all basic arithmetic operations and functions and affords the scalar implementation. The correspondent code for the polynomial function with Vc is also very simple and intuitive:
\begin{verbatim}
float_v y = a*x + b;
\end{verbatim}

The Vc library automatically determines the platform during a compilation and choose corresponding instructions to be used. In addition to the operations with full vectors (vertical operations), it implements operations with elements of one SIMD vector (horizontal operations), allowing, for example, add or sort elements within one vector. In order to create an API for conditional execution all functions/operators of the Vector classes are able to take a mask argument optionally. For instance, to replace all negative elements of the vector ``y'' from the previous example with ``0'', the following code should be added:
\begin{verbatim}
float_m mask = y<0;
y(mask) = 0;
\end{verbatim}

In order to help a programmer make fewer mistakes Vc enforces additional type-safety. Random accesses to arrays elements are implemented with a gather and scatter functionality, which also allows to easily use vectors with arrays of structures.

% \subsection{Intel ArBB}
The Intel Array Building Blocks software~\cite{ArBB} is another way to free the code developer from dependencies on low-level parallelism mechanisms or hardware architectures. It consists of a standard C++ library and powerful runtime-dynamic compiler. The whole available SIMD functionality is provided by this software. The code example with ArBB is as easily understandable, as with the previous tools:
\begin{verbatim}
dense<f32> y = a*x + b;
dense<boolean> mask = y<0;
y = select( mask, y, 0 );
\end{verbatim}
In addition to vectorization, ArBB can automatically parallelize computations between cores of CPU. No additional modifications of the code is needed for that.

% \subsection{OpenMP}
The OpenMP~\cite{OpenMP} is an API, which supports multi-platform shared-memory parallel pro\-gram\-ming. It defines a simple interface for using parallelization between cores in a user application. The user prompt OpenMP, which section of code should be run in parallel, marking the section with a preprocessor directive. That forms threads before the section is executed and the computations are divided between the threads. The threads are allocated to processors by the runtime environment, which takes into account different factors like processors usage and machine load.
%, which can be run aftewards on different platforms

% \subsection{Intel TBB}
Intel Threading Building Blocks~\cite{ITBB} is a C++ template library, which offers a rich functiona\-li\-ty allowing parallel execution of a big number of different types of problems. In order to abstract access to the multiple processors the groups of operations is treated as tasks. An ITBB program creates, synchronizes and destroys graphs of dependent tasks.
Tasks are then executed respecting graph dependencies.

\section{Kalman filter based track fit}

\subsection{Conventional Kalman filter method}
Linear discrete dynamic systems are fully characterized by a vector of parameters $\r^t$, which is called state vector.
Kalman filter method~\cite{SIMDKFCPC,Kalman,KFbook,DataAnalysis}, which is linear recursive estimator, is intended for finding the optimum estimation of the state vector $\r^t$ according to known measurements.
The estimation $\r$ can be determined within a certain error $\vxi = \r - \r^t$. In order to estimate this error a covariance matrix $C \equiv <\vxi \cdot \vxi^T>$ is introduced and is estimated by the Kalman filter.
The idea of the Kalman filter method is to start with a certain initial approximation $\r=\r_0$ and refine the vector $\r$ consecutively taking into account the measurements. The optimum estimation and its covariance matrix are obtained after addition of the last measurement.

The state vector can not be always measured directly. In this case vectors $\m^t_k,\ k=1\ldots n$, which depend on $\r^t$, are measured directly. Here $n$ is a number of measurements, $k$ --- index of a measurement. The linear estimator assumes that $\m^t_k$ depends linearly on $\r^t$:
\begin{eqnarray}\label{eq:mkHk}
 \m^t_k &=& H_k \r^t,
\end{eqnarray}
where the coefficient matrix $H_k$ is called the model of measurement.

Each vector $\m^t_k$ is measured with an error $\veta_k$. The obtained measurement $\m_k$ is:
\begin{eqnarray}
   \m_k &=& \m^t_k + \veta_k.
\end{eqnarray}
%
The errors are assumed to be uncorrelated and unbiased: 
\begin{eqnarray}
  <\veta_k\cdot\veta_j> &=& \hat{0}, ~~~k=1\ldots n, ~~j=1\ldots n \\
  <\veta_k> &=& {\bf0}.
\end{eqnarray}
The covariance matrix of each measurement $V_k \equiv <\veta_k\cdot\veta_k^T>$ has to be known.

Generally, the state vector $\r^t$ of a discrete dynamic system changes from one measurement point to another. For example, the measurements can be separated in space. Therefore let us denote the state vector $\r^t$ within conditions of the $k$-th measurement as $\r^t_k \equiv \r^t(k)$.

The evolution of the linear system from the point of measurement $\m_k$ to the point of the next measurement $\m_{k+1}$ is described by an linear equation:
\begin{eqnarray} \label{eq:rF}
 \r^t_{k+1} &=& F_{k}\r^t_{k} + \vnu_{k+1},
\end{eqnarray}
here $F_{k}$ is a known propagation matrix \TODO{or prediction matrix?}, $\vnu_{k+1}$ is a random process noise. The noise can be caused by multiple scattering in the detector material, for example.
The process noise is assumed to be uncorrelated and unbiased. The process noise covariance matrices $Q_k~\equiv~<~\vnu_k~\cdot~\vnu_k^T~>$ have to be known in order to take it into account during the state vector estimation.

\begin{figure}[htbp]
\begin{center}

\setlength{\unitlength}{0.2cm}

\begin{picture}(50,62)(0,0)

\put(10,53){\framebox(30,8){}}
\put(10,58){\makebox(30,2){Initial approximation} }
\put(10,55){\makebox(30,2){$\r^+_0$, $C^+_0$ }}
\thicklines
\put(25,53){\vector(0,-1){6}}

\put(10,36){\framebox(30,11){}}
\put(10,44){\makebox(30,2){Prediction} }
\put(10,41){\makebox(30,3){
\ \ $\r^-_k = F_{k-1} \r^+_{k-1}$\hfill
}}
\put(10,37){\makebox(30,3){
\ \ $C_{k}^-  =  F_{k-1}  C_{k-1}^+  F^T_{k-1} +  Q_k$ \hfill
}}

\put(25,36){\vector(0,-1){4}}


\put(10,16){\framebox(30,16){}}
\put(10,29){\makebox(30,2){Filtering} }
\put(10,26){\makebox(30,3){
\ \ $K_k = C_k^- H_k^T (V_k + H_k C_k^- H_k^T)^{-1}$\hfill
}}
\put(10,22){\makebox(30,3){
\ \ $\r_{k}^+ = \r_{k}^- + K_k ( \m_k-H_k \r_{k}^- ) $ \hfill
}}
\put(10,18){\makebox(30,3){
\ \ $C_k^+ = (I - K_k H_k) \cdot C_k^-$ \hfill
}}

\put(25,11){\line(-1,0){20}}
\put(5,11){\vector(0,1){26}}
\put(5,33){\line(0,1){17}}
\put(5,50){\line(1,0){20}}
\put(10,12){\makebox(10,2){
$ \r_k$, $C_k$
}}

\put(25,16){\vector(0,-1){9}}

\put(10,0 ){\framebox(30,7){}}
\put(10,4){\makebox(30,2){Fitted parameters} }
\put(10,2){\makebox(30,2){$\r^+_n$, $C^+_n$ }}

\put(45,36){\framebox(15,8){}}
\put(45,41){\makebox(15,2){Noise} }
\put(45,38){\makebox(15,2){ $Q_k$ }}

\put(45,40){\vector(-1,0){5}}

\put(45,20){\framebox(15,8){}}
\put(45,25){\makebox(15,2){Measurement} }
\put(45,22){\makebox(15,2){ $\m_k$, $H_k$, $V_k$ }}

\put(45,24){\vector(-1,0){5}}

\end{picture}

\end{center}
\caption{\label{KFscheme}Block diagram representation of the conventional Kalman filter.}
\end{figure}

Then \TODO{(or without ``then''?)} the conventional Kalman filter algorithm (is illustrated at a
block diagram in Figure~\ref{KFscheme}) can be formulated as follows :
\begin{enumerate}
	\item Initialization step. The initial vector $\r_0$ and its covariance matrix $C_0$ are set. A value of $\r_0$ can be chosen arbitrary, for example, using some approximate state vector value. If the approximate value is not known $\r_0$ can be set to zero. The diagonal matrix with infinities along the diagonal is taken as the initial covariance matrix $C_0$, since the final estimation of the state vector must not depend on the initial value.
  \newcounter{enumTemp}
  \setcounter{enumTemp}{\theenumi}
\end{enumerate}

Then prediction and filtration steps, which are described below, are repeated for each measurement. 
%
\begin{enumerate}
  \setcounter{enumi}{\theenumTemp}
	\item Prediction step. Estimation of the state vector and its covariance matrix are propagated to a point of the measurement $\m_k$. The noise of propagation $\vnu_k$ is also taken into account at this step. The propagated state vector $\r^{-}_{k}$ and the covariance matrix $C^{-}_{k}$ are calculated according to formulas:
\begin{subequations}
\begin{eqnarray}
	\r^-_k &=& F_{k-1} \r^+_{k-1}, \label{eq:ConvTimeUpdate1} \\
	C_{k}^-  &=&  F_{k-1}  C_{k-1}^+  F^T_{k-1} +  Q_k, \label{eq:ConvTimeUpdate2}
\end{eqnarray}
\end{subequations}
here  $\r^{+}_{k-1}$ and $C^{+}_{k-1}$ are the state vector and its covariance matrix after $(k-1)$ filtration step. At the first iteration $\r^{+}_0 = \r_0$, $C^{+}_0 = C_0$ are taken.

	\item \TODO{Filtration} step. The optimum estimation of the vector $\r^t_k$ and its covariance matrix  according to the first $k$ measurements are obtained at this step by taking into account a measurement $\m_k$. 

A correction term, which is also called residual and is basically a distance between obtained measurement and its predicted value, is calculated:
%
\begin{subequations}
\begin{align}
\vzeta_k &= \m_k-H_k \r_{k}^-. \label{eq:ConvMeasurementUpdate1}
%
\intertext{The inverse covariance matrix of the residual is calculated according to the formula:}
%
W_k &= (V_k + H_k C_k^- H_k^T)^{-1}.
%
\intertext{It is called also weighting matrix and weights the $k$-th measurement term in the $\chi^2$-deviation, which is also  calculated recursively. The total $\chi^2$-deviation of the estimation $\r^{+}_k$ from the first $k$ measurements are obtained according to the formula: }
%
\chi^2_k &= \chi^2_{k-1} + \vzeta_k^TW_k\vzeta_k.
%
%objective function, which is minimized by the Kalman filter method: $L_k = \sum_{i=1}^k{ \vzeta_i^TW_i\vzeta_i }$. This makes the final state vector estimation more strongly dependent on precise measurements with small errors than on less precise measurements.
%\intertext{It is called also weighting matrix and weights the $k$-th measurement term in the objective function, which is minimized by the Kalman filter method: $L_k = \sum_{i=1}^k{ \vzeta_i^TW_i\vzeta_i }$. This makes the final state vector estimation more strongly dependent on precise measurements with small errors than on less precise measurements.
% }\intertext{
%One can see that the objective function $L_k$ is the total $\chi^2$-deviation of the estimation $\r^{+}_k$ from the first $k$ measurements. It is calculated as follows:}
%
%\chi^2_k &= \chi^2_{k-1} + \vzeta_k^TW_k\vzeta_k.
%
\intertext{The residual $\vzeta$ together with $\chi^2$ are important test statistics, which have known distributions and are used often in order to estimate correctness of the procedure.}
\intertext{After the residual and the weighting matrix are found, the state vector is corrected on the residual $\vzeta_k$, which is taken with a certain gain $K_k$:}
%
\r_{k}^+ &= \r_{k}^- + K_k\vzeta_k. 		\\
%
\intertext{Note that the state vector keeps unchanged if either the correction term $\vzeta_k$ or the gain matrix $K_k$ is zero.}
%
\intertext{The gain matrix $K_k$ is proportional to the current covariance matrix $C_k^-$ of the state vector and to the weighting matrix $W_k$: }
K_k &= C_k^- H_k^T W_k. \\
\intertext{Thereby, measurements with the biggest weights lead to the biggest corrections of the state vector. A state vector estimation, which is already precise and has small covariance matrix, will be changed insignificantly.}
%
\intertext{Finally, the updated covariance matrix $C_k^+$ of improved state vector $\r_{k}^+$ is calculated as follows:}
%
C_k^+ &= (I - K_k H_k) \cdot C_k^-. \label{eq:ConvMeasurementUpdate6}
\end{align}
\end{subequations}

\end{enumerate}

The vector $\r^{+}_n$ obtained after the filtration of the last measurement is the desired optimal estimation with the covariance matrix $C^{+}_n$.

If an expected value of a measurement $<\m_k> = \m^t_k$ does not depend linearly on $\r^t$ the dependence $\m^t_k(\r^t) = {\bf h}_k(\r^t)$ is linearized by the Taylor expansion:
% \begin{subequations}
\begin{eqnarray}
\m^t_k(\r^t_k) &=& {\bf h}_k(\r^t_k) \approx \nonumber \\
      &\approx&  {\bf h}_k(\r^{lin_m}_k) + H_k(\r^t_k-\r^{lin_m}_k), \\
H_{k_{ij}} &=& \frac{{\partial}{{\bf h}}_{k_{i}}({\bf x})}{{\partial}x_j}\Bigg|_{{\bf x}=\r^{lin_m}_{k}},
\end{eqnarray}
% \end{subequations}
%
where $H_{k_{ij}}$ denotes element $(i,j)$ of the matrix $H_k$, ${{\bf h}}_{k_{i}}$ denotes element $i$ of the vector ${{\bf h}}_{k}$, $x_j$ --- an element $j$ of the vector $\bf x$. The vector $\r^{lin_m}_k$ is a base point of the expansion, it is chosen to be close enough to a possible value of $\r^t_k$. For example, $\r^{lin_m}_k$ can be set to $\r^-_k$.

If the propagation formula $<\r^t_{k+1}> = {\bf f}_k( \r^t_{k} )$ is nonlinear, it is linearized in the same way:
% \begin{subequations}
\begin{eqnarray}
<\r^t_{k+1}> &=& {\bf f}_k( \r^t_{k} ) \approx  \nonumber \\
  &\approx&  {\bf f}_k(\r^{lin_p}_k) + F_{k}(\r^t_{k}-\r^{lin_p}_{k}),  \\
F_{k_{ij}} &=& \frac{{\partial}{{\bf f}}_{k_i}({\bf x})}{{\partial}x_j}\Bigg|_{{\bf x}=\r^{lin_p}_{k}}. \label{eq:transport_extend}
\end{eqnarray}
% \end{subequations}
Here the base point of the expansion $\r^{lin_p}_k$ can be chosen  equal to $\r^+_k$, for example.
\TODO{dzeta is changed}.

In case of the Kalman filter track fit, the state vector $\r_k$ is a vector of track parameters, the propagation matrix $F_k$ 
describes extrapolation of the track in the magnetic field from one detector to another, and the matrix of noise $Q_k$ describes the effect of multiple scattering in the material.

\subsection{Kalman filter based track fit for the CBM experiment}

One of the most important applications of the Kalman filter algorithm in HEP is track fitting. A particle passing through a detector leaves hits there, that are measurements of the particle trajectory. A state vector of track parameters, which define the particle trajectory is estimated according to hits using the Kalman filter algorithm.
The Kalman filter based track fit in the KF library is implemented on the example of the CBM experiment.
The CBM experiment is a fix target experiment with a forward detector geometry. The detector setup of the experiment includes two main tracking detectors: the Micro-Vertex Detector~(MVD) and the Silicon Tracking System (STS), 10 detector planes in total (see Figure~\ref{fig:geometry}). The detectors are placed inside a magnetic field, which is generated by a dipole magnet in order to make possible determination of particle momenta.
%  The KF library is implemented with the detector geometry with the same position of the detector sensors on each station along $z$ direction. 
The Cellular Automaton (CA) track finding algorithm~\cite{NIM-TIME05} is used to group hits, which are created by a particle, into a track.
A typical central Au+Au collision is shown in MVD and STS detectors in Figure~\ref{fig:event} and illustrates the complexity of the task.

\begin{figure}[htbp]
\centering\includegraphics[angle=0, width=7cm]{geometry}
\caption{The setup of tracking detectors in the CBM experiment: 2 stations of the Micro-Vertex Detector (MVD) and 8 stations of the Silicon Tracking System (STS). A target is shown as a red dot.}
\label{fig:geometry}
\end{figure}

\begin{figure}[htb]
  \begin{minipage}[b]{0.48\textwidth}
  \centering\includegraphics[angle=0,width=\textwidth]{simEvent}
 \end{minipage}
 \hfill
 \begin{minipage}[b]{0.48\textwidth}
  \centering\includegraphics[angle=0,width=\textwidth,height=6.2cm]{recoEvent}
 \end{minipage}
\caption{A central Au+Au event at 25$A$GeV collision energy in the CBM experiment, in average about 700 tracks of charged particles. A simulated event is shown on the left and reconstructed with the CA and KF based track finder --- on the right. \TODO{good resolution}}
\label{fig:event}
\end{figure}

In forward experiments most of track trajectories are almost straight. The straight line is naturally defined by a point on this line and slopes at this point. In the magnetic field the slopes are not constant. They change from point to point according to the equations of motion:
\begin{subequations}
\begin{align}
  dt_x/dz &= c~t_r~q/p~(~~t_y ( B_z + t_x B_x ) - (1 + t_x^2) B_y ), \label{eq:EqMotion1} \\	
  dt_y/dz &= c~t_r~q/p~(- t_x ( B_z + t_y B_y ) + (1 + t_y^2) B_x ), \\
      t_r &= \sqrt{ t_x^2 + t_y^2 + 1 }, \label{eq:EqMotion3}
\end{align}
\end{subequations}
here $x$ and $y$ are track coordinates at a given $z$ position\footnote{The $z$-coordinate points downstream the spectrometer axis, and $x$ and $y$ are transverse coordinates (see Figure~\ref{fig:geometry}).}; $t_x = dx/dz$ and $t_y = dy/dz$ are track slopes in $XZ$ and $YZ$ planes respectively; $q/p$ is an inverse particle momentum, signed according to a particle charge; $B_x$, $B_y$, $B_z$ are field components at the given $z$; $c$ is a speed of light.
Therefore the full and convinient set of the track parameters is:
\begin{equation}
\r = \{ ~x, ~y, ~t_x, ~t_y, ~q/p ~\}\, .
\end{equation}

Initial estimation $\r_0$, the model of measurement $H$ and the propagation matrix $F$ have to be defined for estimation of the state vector $\r$ of the track parameters by the Kalman filter algorithm.

% According to the Kalman filter algorithm, which is described above, the track fit starts from initialization of state vector $r$.
Для начальной оценки параметров трека применяется метод наименьших квадратов в приближении однокомпонентного магнитного поля и отсутвия многократного рассеяния на материале детекторных станций. 
Такое приближение позволяет оценить параметры с достаточной точностью для быстрой сходимости алгоритма.
% Такое приближение позволяет получить достаточно высокую точность (разрешение по импульсу 5\%).
% Такое приближение позволяет за пренебрежимо малое время получить оценки параметров с точностью сравнимую с оптимальной.
% Такое приближение позволяет получить достаточно высокую точность, которая позволяет пропагировать трек в нужном направлении.

Each hit in the STS detector consists of two fired strips, that gives two one-dimensional measurements. An MVD hit is a two-dimensional pixel with independent $x$ and $y$ coordinates, thereby it is also decomposed into two one-dimensional measurements for consistency with STS measurements and to speed up the computations.
A measurement depends linearly on the position of a particle at the detector plane, therefore the model of measurement is:
\begin{equation}
H_k = \{ ~cos(\alpha_k), ~sin(\alpha_k), ~0, ~0, ~0 ~\}\,,
\end{equation}
where $\alpha_k$ is an angle between the strip and $Y$-axis, $\alpha_k = 0^o$ and $15^o$ for STS and $0^o$, $90^o$ for MVD.

% The prediction matrix $F_k$ describes extrapolation of the track in the magnetic field from one detector to another and can be calculated numericaly. Two methods of the calculations are implemented: analytic formula and 4th order Runge-Kutta.

% Equations of motions is rewrited via track parameters:


The propagation matrix $F_k$ discribes the motion of a particle in the nongomogenious magnetic field $\B$. The trajectory of a particle is defined by differential equations~(\ref{eq:EqMotion1}-\ref{eq:EqMotion3}), which can be refomulated via track parameters\TODO{show the exact equations?}.%: $d\r/dz = {\bf f}(\r,\B(x,y,z))$.
% \begin{align}
%           | x |          tx
%           | y |          ty
% d\r/dz = | tx| = ft * ( ty * ( B(3)+tx*b(1) ) - (1+tx**2)*B(2) )	
%           | ty|   ft * (-tx * ( B(3)+ty+b(2) ) - (1+ty**2)*B(1) ) 
% ft = c_light*qp*sqrt ( 1 + tx**2 + ty**2 ) .
% \end{align}
%
% This equations can be solved exactly, since $\B$ depends 
Полученные таким образом уравнения не могут быть решены точно, так как $\B$ сложным образом зависит от координат. Поэтому необходимо применять приближенные методы решения нелинейных диференциальных уравнений. Имплементированно два метода нахождения параметров трека в условиях $(k+1)$-ого измерения: стандартный 4th order Runge-Kutta~\cite{NumRecC} и аналитическая формула, основанная на разложении в ряд Тейлора~\cite{AnalyticExtra}.
Стандартный 4th order Runge-Kutta метод имеет точность порядка $O(qcB\Delta z/p)^5$.
%, что позволяет пропагировать статус вектор до следующей станции за один шаг ($qcB\Delta z/p \approx 0.01$).
Благодаря малости расстояния между станциями ($qcB\Delta z/p \approx 0.01$) параметры трека на станции следующего измерения могут быть найдены за один шаг.
Скорость и точность работы аналитической формулы определяются числом членов разложения в ряд Тейлора. Выбором этого числа достигается компромис между точностью вычислений и временем.
		
Thereby both methods provides the dependence between parameters on $(k+1)$-th and $k$-th stations: $<\r^t_{k+1}> = {\bf f}_k( \r^t_{k} )$. This depencence is linearized according to the formula~(\ref{eq:transport_extend}), that gives the required propagation matrix $F_k$.
As the point of a linearization a vector 
$$\r^{lin}_k = \{ ~x^+_k, ~y^+_k, ~t^+_{x~k}, ~t^+_{y~k}, ~q/p_0 ~\}\,$$
is taken. Here $x^+_k$, $y^+_k$, $t^+_{x~k}$, $t^+_{y~k}$ are track parameters after a previous filtration step; $q/p_0$ is a charge over momentum, which was used for initialization of the state vector parameters.

Additionally approximation of the magnetic field is required for more effective use of the cache memory of CPU and speed up of the implementation.
The magnetic field of the CBM experiment is relatively smooth and, therefore, can be locally approximated
by polynomials~\cite{SIMDKFCPC}. It was found to be sufficient to use a polynomial of 5-th order to approximate the field cross-sections in the planes of each station. Since only a field along the given track is needed, a field behavior between stations is approximated by a parabola with coefficients calculated from three closest hits of the current track. It was shown that such field simplification does not effect performance of the track fitting procedure, meanwhile allowing to get speed up factor about 50.

The matrix of noise $Q_k$ describes multiple scattering in the material. Планируются, что в CBM эксперименте трекинговые детекторы будут состоять из тонких модулей (порядка 300~мкм). В этом случае, рассеяние на материале детектора происходит на малые углы. В тоже время, материала станции достаточно для того, чтобы происходило порядка 20 рассеяний, что позволяет описывать распределение угла суммарного отклонения частицы функцией Гауса согласно центральной предельной теореме.
Ширина распределения угла отклонения оценивается в соответствии с формулой~\cite{Dahl}:
%
\begin{eqnarray}
 \sigma(\theta) &=& \frac{13.6 MeV}{\beta p} q \sqrt{S/X_0} \left[1 + 0.038 \ln(S/X_0)\right],
\end{eqnarray}
%
где $p$ --- импульс частицы, $q$ --- её заряд в единицах элементарного заряда, $S$ --- толщина материала, который проходит частица, $X_0$ --- радиационная толщина материала.
Исходя из приведенных предположений можно получить~\cite{NoiseMatrix}\TODO{нужно ли ссылаться на эту статью, если там описан лишь алгоритм получения, и $Q_k$ полученно для другого наобора параметров?}, что $Q_k$ имеет вид:
%
\begin{eqnarray}
 Q_k = 
 \sigma_k^2(\theta) \left(
  \begin{array}{lllll}
    0 & 0 & 0                 & 0 & 0\\
    0 & 0 & 0                 & 0 & 0\\
    0 & 0 & (t_x^2+1) t_r^2 & t_xt_y t_r^2    & 0\\
    0 & 0 & t_xt_y t_r^2    & (t_y^2+1) t_r^2 & 0\\
    0 & 0 & 0 & 0 & 0
  \end{array}
 \right).
\end{eqnarray}

Потери энергии частицы в материале детектора не учитываются, поскольку они пренебрежимо малы по сравнению с другими эффектами.

Обычно реализация Калман фильтра основана на вычислениях в двойной точности.
Использование одинарной точности вычислений позволяет эффективно использовать SIMD unit of CPU и кэш память, ввиду этого в KF library базовой считается реализация алгоритмов в одинарной точности.
В этом случае ошибки численного округления возрастают на 8 порядков и часть треков фитируется некорректно. Для повышения стабильности процедуры фита были реализованы следущие улучшения алгоритма~\cite{SIMDKFCPC}: описанная выше accurate initial approximation of track parameters, стабилизация вычисления ковариационной матрицы на шаге фильтрации, специальная процедура для фильтрации первого измерения.
На шаге фильтрации возникают численные расходимости в случае, когда погрешность экстраполированных параметров значительно превышает погрешность измерения. It has been found that
the direct comparison of the errors before the filtration step and neglecting the errors which are more then 4 times larger then the mean measurement error keeps the algorithm stable. Аналогичная проблема возникает при добавлении первого измерения, где ошибки параметров бесконечны и представляются в виде больших чисел. Для этого специфического случая формулы могут быть упрощены аналитически, что позволяет избежать вычислений с большими числами. Заметим, что сделанные изменения положительным образом влияют и на вычисления в двойной точности.


\subsection{Conventional KF track fit performance}

The basic quality characteristics of all fitting algorithms are residuals and pulls.
Residuals $\rho$ of the track parameters, for instance, of the x-coordinate, are determined as:
\begin{equation}
\rho_x = x_{reco} - x_{mc},	
\end{equation}
where $x_{reco}$ --- reconstructed and $x_{mc}$ --- true Monte-Carlo values of the x-coordinate.
The normalized residual (pull) distributions of the fitted track parameters is a measure of the reliability of the fit. Pulls are determined according to the formula:
\begin{equation}
P(x) = \frac{\rho_x}{\sqrt{C_{xx}}},	
\end{equation}
where $C_{xx}$ --- the corresponding diagonal element of the covariance matrix, obtained in the
track fit. In the ideal case the normalized error distributions of coordinates and slopes of the
track should be unbiased and Gaussian distributed with width of 1.0.

\begin{figure}[htbp]
\centering\includegraphics[angle=0,width=\textwidth]{pullsAndResCONV}
\caption{Residuals and pulls of the estimated track parameters of the CBM experiment~\cite{TSR} obtained with the conventional Kalman filter approach using the analityc formula for extrapolation.}
\label{fig:trackFitQuality} 
% \end{rotate}
\end{figure}

For tests 20000 long primary tracks with momentum higher than 1~GeV, reconstructed with the CA track finder, have been used. The tracks could have on average 0.5\% incorrectly attached hits.
Tests of the track fit quality and time have been performed. 

Residual and pulls are calculated at the production vertex of particles. 
The Gaussian fits to the obtained residual and pulls distributions are given in Figure~\ref{fig:trackFitQuality}. Here the analytic formula for extrapolation has been used. The distributions obtained using the 4-th order Runge-Kutta propagation are basically the same.
The pulls are not biased and have widths close to 1.0 indicating correctness of the fitting procedure. The diviation from 1.0 are caused by several assumptions made in the fitting procedure mainly in the part of the detector material treatment. The $q/p$ pull is the widest because the momentum is the most sensitive to all approximations. The reason for this is that the measurement of the particle momentum is indirect and at least three hits are needed in order to determine the momentum. % The small tails in distributions are caused by the various non-Gaussian contributions to the fit. 

\begin{figure}[htb]
  \begin{minipage}[b]{0.48\textwidth}
  \centering\includegraphics[angle=0,width=\textwidth]{make_tbbtime_AN_CONV}
 \end{minipage}
 \hfill
 \begin{minipage}[b]{0.48\textwidth}
  \centering\includegraphics[angle=0,width=\textwidth]{make_tbbtime_CONV_ITBB_RKvsAN}
 \end{minipage}
 \hfill
 \begin{minipage}[b]{0.48\textwidth}
  \centering\includegraphics[angle=0,width=\textwidth]{make_tbbtime_CONV_ITBB_DPvsSP}
 \end{minipage}
\caption{Scalabilities of the conventional KF track fitter. Upper left plot shows comparison of ITBB and ArBB libraries for parallelization (the analytic formula for extrapolation is used). Upper right --- comparison of the 4-th order Runge-Kutta propagation (RK) and the analytic formula (AN) for propagation (the ITBB for parallelization is used). Bottom left --- comparison of the single (SP) and double (DP) precision implementations (the ITBB for parallelization and the analytic formula for propagation is used). }
\label{fig:scalCONV}
\end{figure}

Three servers with Intel Xeon E7-4860, X5550 and X5680 processors (see Table~\ref{tab:servers}) have been used for scalability tests. All processors have the hyper-threading technology, therefore each physical core has two logical cores. That gives 80 logical cores in total for the fastest server.

\begin{table}[h]
	\begin{center}
		\begin{tabular}{|l|c|c|c|c|}
			\hline
			Processor & Clock, GHz & L3 Cache, MB & Number of processors & Total number of cores  \\
\hline
			X5550   & 2.66 & ~8  & 2 & 16  \\
			X5680   & 3.33 & 12 & 2 & 24  \\
			E7-4860 & 2.27 & 24 & 4 & 80  \\
\hline
		\end{tabular}
	\end{center}
	\caption{Characteristics of the servers used for scalability tests.}
	\label{tab:servers}
\end{table}

The track fit has been parallelized using ITBB by execution of one thread per one logical core. Fit of 1000 tracks packed in SIMD vectors has been executed per each thread. The program has been also rewritten in order to use Intel ArBB.
Figures~\ref{fig:scalCONV} show strong linear scalabilities for all many-core systems, for both parallelization libraries and for both propagation methods.
Intel ArBB effectively and automatically parallelizes and vectorizes the track fit lossing only 20\% of the maximum performance. The track fit using analytic formula for propagation is shown to be 40\% faster in comparison with the track fit using 4-th order Runge-Kutta method, while track parameters quality is the same. The single precision implementation is twice faster than double implementation due to packing 4 entries into a SIMD vectors instead of 2.

\subsection{Square root Kalman filter method}

The square root approach operates with a square root $S$ of the covariance matrix instead of the covariance matrix $C=SS^T$ itself~\cite{KFbook,Kaminski}. The logarithm of a condition number of the $S$ matrix is smaller by a factor of two, therefore the precision of computations can be effectively doubled.

The prediction step equations are formulated via square root of the covariance matrix. Since the state vector is exactly same as for the conventional approach the state vector prediction equation is same as well:
%
\begin{align}
\r_k^- &= F_k \r_{k-1}^+.
%
\intertext{The corresponding covariance matrix update is based on the following relation between a square root of the covariance matrix after $(k-1)$-th filtration step $S^+_{k-1}$ and an updated square root $S^-_k$ of the covariance matrix:}
%
\left[ \begin{array}{c}
      (S^-_k)^T  \\
      \hat{0}   \\
     \end{array} \right] &= 
T^p_k \cdot \left[ \begin{array}{cc}
      (S^+_{k-1})^T F^T_{k-1}  \\
      (Q_{k-1}^{1/2})^T    \\
     \end{array} \right].
 \label{eq:SRTimeUpdate1}
\end{align}
Here $Q^{1/2}$ --- a square root of the process noise covariance matrix, the matrix $T^p$ is some unknown orthogonal matrix. This relation can be checked to be equalent to conventional matrix propagation equation~(\ref{eq:ConvTimeUpdate1}), in order to do this one should multiply each side by it's transpose.

There are many pairs of matrices $S^-_k$ and $T^p_k$, which sutisfy the equation~(\ref{eq:SRTimeUpdate1}) with given $S^+_{k-1}$, $Q^{1/2}$ and $F_{k-1}$. The matrix $S^-_k$ is found in the low triangular form for the simplicity of computations. In this case all elements of matrices $S^-_k$ and $T^p_k$ are uniquely determined. The $S^-_k$ is calculated via the Modified Gram-Schmidt orthogonalization procedure given in Appendix~\ref{app:MGS} (\TODO{move algorithm from appendix here?}). Since the matrix $T^p_k$ is not used further, it is not calculated explicitly.

The filtration step is reformulated as well. There are two approaches for the filtration step~\cite{KFbook,Kaminski,Thornton}: based on the Potter algorithm and the Carlson triangular square root algorithm.

The Potter algorithm works with scalar measurements. If the covariance matrix $V_k$ of measurement is diagonal, then elements of the vector measurement $\m_k$ are independent and can be simply considered as scalar measurements $m_{ki}, i=1...l$.
Otherwise the measurement $\m_k$ should be preprocessed. The matrix $V_k$ is symmetric and always can be diagonalized: $V_k = ODO^T$. In order to obtain orthogonal and diagonal matrices $O$ and $D$ the diagonalization procedure by Jacobi Transformations~\cite{NumRecC} can be used, for example. Then the obtained matrix $O$ is used as orthogonal transformation matrix for the measurement, which leads to replacement of $\m_k$, $V_k$ and $H_k$ by $\m'_k = O^T\m_k$ , $V'_k = O^TV_kO = D$ and $H'_k = O^TH_k$. After that the covariance matrix of the measurement becomes diagonal, scalar measurements $m'_{ki}$ --- independent, and the Potter algorithm is applied to each of them iteratively.

In order to formulate the algorithm, let us use $H_{ki}$ notation for the $i$-th row of the model of measurement $H_k$ and initialize $S^+_{k,0} = S^-_{k}$, $\r^+_{k,0} = \r^-_{k}$, $\chi_{k,0} = \chi_{k-1}$. Then intermediate variables $\vphi_k$, $a_k$ and $\gamma_k$ are calculated:
%
\begin{align}
  \vphi_k  &= S^+_{k,i-1}H'^T_{ki}, \\
  a_k        &= \frac{1}{\vphi_k\vphi^T_k+V'_{ki}},  \\
  \gamma_k   &= \frac{1}{1 + \sqrt{a_kV'_{ki}}}.  \label{eq:SRPotterMeasurementUpdate3} \\
\intertext{The updated square-root matrix is obtainted:}
  S^+_{ki} &= S^+_{k,i-1}(I - a_k\gamma_k\vphi_k\vphi_k^T), \label{eq:SRPotterMeasurementUpdate4}
\intertext{the gain matrix and the residual are calculated:}
  K_{ki}   &= a_k S^+_{ki} \vphi_k, \\
  \zeta_{ki} &= m'_{ki} - H_{ki} \r_{k,i-1}^+, \\
\intertext{and eventually the updated state vector and the $\chi^2$-deviation are:}
  \r^+_{ki} &= \r^+_{k,i-1} + K_{ki} \zeta_{ki}, \\ %\label{eq:SRPotterMeasurementUpdate1}\\
  \chi^2_{ki} &= \chi^2_{k,i-1} + \zeta_{ki}^Ta_k\zeta_{ki}.
\end{align}
%
The final state vector $\r^+_{k} = \r^+_{k,l}$ updated by the $k$-th vector measurement and its covariance square root  $S^+_k = S^+_{k,l}$ are obtainted after processing the last scalar measurement.

Such standard Potter algorithm requires a full $n\times n$ square-root matrix with $n^2$ elements. This means additional $(n-1)n/2$ elements to store in comparison with a covariance matrix, which is symmetric and requires only $(n+1)n/2$ elements to store. Additional memory consumption can be avoided by replacing equations~(\ref{eq:SRPotterMeasurementUpdate3}, \ref{eq:SRPotterMeasurementUpdate4}) with equations: 
%
\begin{eqnarray}
  \tilde{S_k}\tilde{S_k}^T &=& (I - a_k\vphi_k\vphi_k^T), \label{eq:SRModifiedPotterMeasurementUpdate1}  \\
  S^+_{ki} &=& S^+_{k,i-1}\tilde{S},
\end{eqnarray}
%
here the triangular square-root matrix $\tilde{S_k}$ can be found from the equation~(\ref{eq:SRModifiedPotterMeasurementUpdate1}) using a Choletsky decomposition algorithm~\cite{NumRecC} given in Appendix~\ref{app:Choletsky}. In this case the covariance square root matrix is always low triangular with $(n+1)n/2$ nontrivial elements.

This method we call modified Potter algorithm. It is especially usefull when the model of measurement $H$ has zero-columns, which is true for the CBM track fit. Then computations for $\tilde{S_k}\tilde{S_k}^T$-decomposition are negligible, since in this case almost all elements of $\vphi\vphi^T$ are zero.

The Carlson algorithm for filtration is implemented using an orthogonalization procedure similar to the one used for the prediction step. First updated covariance matrix square root $S^{+}_{k}$ and intermediate matrices $R_k$ and $G_k$ are found using a relation:
%
\begin{align}
\left[ \begin{array}{cc}
      R_k^T & G_k  \\
      \hat{0} & (S^{+}_{k})^T    
     \end{array} \right]
     &=
T_k^f \cdot \left[ \begin{array}{cc}
      V^{T/2}_k      & \hat{0} \\
      (S^-_k)^T H_k  &  (S_k^-)^T  
     \end{array} \right], \\
%
\intertext{here $T_k^f$ is an orthogonal matrix. The matrices $S^{+}_{k}$ and $R_k$ are found in the form of lower triangular matrices. Then the residual is calculated:}
%
     \vzeta_k &= \m_k-H_k \r_{k}^-. \label{eq:SRMeasurementUpdate2}\\
%
\intertext{An intermidiate vector $\vmu_k$ is found:}
%
     \vmu_k   &= R_k^{-1}\vzeta_k. \\
%
\intertext{and eventually the updated state vector and the $\chi^2$-deviation are:}
%
     \r^+_{k}       &= \r^-_{k} + G_k^T \vmu_k, \\
     \chi^2_k      &= \chi^2_{k-1} + \vmu^T_k\vmu_k.
\end{align}

The final matrix $S^+_k$ is lower triangular, so only $(n+1)n/2$ elements has to be stored.

\subsection{Square root KF track fit performance}

Two variations of the square root approach have been implemented for the CBM track fit: with filtration via the Modified Potter algorithm and via the Carlson algorithm.

Tests of the time and the track fit quality have been performed.
The residuals and pulls distributions are similar to those obtained with the conventional KF approach.

\begin{figure}[htb]
 \begin{minipage}[b]{0.50\textwidth}
  \centering\includegraphics[angle=0,width=\textwidth]{make_tbbtime_AN_SR1}
 \end{minipage}
 \begin{minipage}[b]{0.50\textwidth}
  \centering\includegraphics[angle=0,width=\textwidth]{make_tbbtime_AN_SR1}
 \end{minipage}
\caption{Scalabilities of the modified Potter (left) and Carlson (\TODO{right}) square root KF track fitter.}
\label{fig:scalSR}
\end{figure}

Figure~\ref{fig:scalSR} shows strong linear scalabilities of the modified Potter and Carlson square root KF approaches for all tested many-core systems and for both parallelization methods \TODO{OpenMP??}.


\subsection{U-D filtering}

U-D filtering~\cite{KFbook,Thornton,Bierman} is another way to algorithmically increase the numerical precision the of Kalman filter computations. It operates with two matrices $U$ and $D$ instead of the covariance matrix $C = UDU^T$ itself. The matrix $U$ is an upper triangular matrix with diagonal elements equal to one, the matrix $D$ is a diagonal.

The prediction equation for the state vector is the same as for the conventional approach:
%
\begin{align}
\r_k &= F_k \r_{k-1}, \\
%
\intertext{while the prediction of $U$ and $D$ matrices uses orthogonalization and is based on the following relation:}
%
U^-_k W_k &= ~~~~\left[ \begin{array}{cc} F_k U^+_{k-1} & I \\  \end{array} \right], \label{eq:UDTimeUpdate1} \\
    D^-_k &= W_k \left[ \begin{array}{cc} 
              D^+_{k-1} & \hat{0} \\ 
              \hat{0}   & Q_k
              \end{array} \right] W^T_k, \label{eq:UDTimeUpdate2}
\end{align}
%
here $U^+_{k-1}$ and $D^+_{k-1}$ are a $UDU^T$-factorization of the covariance matrix after $(k-1)$-th filtration step. The intermediate matrix $W_k$ and disired predicted matrices $D^-_k$ and $U^-_k$ are unknown and uniquely determined by the equations~(\ref{eq:UDTimeUpdate1}, \ref{eq:UDTimeUpdate2}). They can be found using the Gram-Schmidt orthogonalization procedure discribed in Appendix~\ref{app:UDGS}.

The filtration step is propoused to be done via decomposition of the vector measurement into scalars (see the Potter square root algorithm). Then decomposed measurements $m_{ki}, i = 1...l$ are processed iteratively: we initialize $D^+_{k,0} = D^-_{k}$, $U^+_{k,0} = U^-_{k}$, $r^+_{k,0} = r^-_{k}$, $\chi_{k,0} = \chi_{k-1}$, and for each measurement we calculate an intermediate variable:
%
\begin{align}
   a_k       &= (H_{ki} U^+_{k,i-1} D^+_{k,i-1} U^{+T}_{k,i-1} H_{ki}^T + V_{ki})^{-1}, \label{eq:UDMeasurementUpdateS1} \\
%
\intertext{find $\tilde{U}$ and $\tilde{D}$ intermediate matrises using $UDU^T$ decomposition: }
%
 	\tilde{U} \tilde{D} \tilde{U}^T &= U^+_{k,i-1}(D_{k,i-1} - a_k(D^+_{k,i-1} U^{+T}_{k,i-1} H_k^T)(D^+_{k,i-1} U^{+T}_{k,i-1} H_k^T)^T)U^{+T}_{k-1}, \\
%
\intertext{find $U$ and $D$ updated by the measurement $m_{ki}$: }
%
  U^+_{ki}   &= U^+_{k,i-1}\tilde{U}, \\
  D^+_{ki}   &= \tilde{D}, \\
%
\intertext{find the gain matrix and the residual: }
%
  K_{ki}   &= a_k H_{ki} U^+_{k,i-1} D^+_{k,i-1} U^{+T}_{k,i-1}, \\
  \zeta_{ki} &= {m}_{ki}-H_{ki} \r_{ki}^+,  \\
%
\intertext{and eventually the updated state vector and the $\chi^2$-deviation: }
%
  \r^+_{ki} &= \r^+_{k,i-1} + K_{ki} \zeta_{ki},  \\
  \chi^2_{ki} &= \chi^2_{k,i-1} + \zeta_{ki}^TS_{ki}\zeta_{ki}, \label{eq:UDMeasurementUpdateS8}
\end{align}
%
here $H_{ki}$ notation is used for the $i$-th row of the model of measurement $H_k$. The final state vector $\r^+_{k} = \r^+_{k,l}$ updated by a $k$-th vector measurement and its covariance U-D factorization $U^+_k = U^+_{k,l}, D^+_k = D^+_{k,l}$ are obtainted after processing the last scalar measurement.

Also it is possible to avoid the diagonalization, which is a highly time consuming procedure. In this case exactly same procedure as the one discribed before can be used, where the scalar $a_k$ is replaced with a weighting matrix $S_k$, and scalar $m_{ki}$ is replaced with the vector measurement $\m_k$. The equations~(\ref{eq:UDMeasurementUpdateS1}-\ref{eq:UDMeasurementUpdateS8}) transform into:
%
\begin{eqnarray}
  S_k        &=& (H_k U^-_{k} D^-_{k} U^{-T}_{k} H_k^T +V_{k})^{-1},  \\
	\tilde{U}_{k} \tilde{D}_{k} \tilde{U}_{k}^T &=& U^-_{k-1}(D^-_{k} - (D^-_{k} U^{-T}_{k} H_k^T)S_k(D^-_{k} U^{-T}_{k} H_k^T)^T)U^{-T}_{k}, \\
  U^+_{k}   &=& U^-_{k}\tilde{U}_{k}, \\
  D^+_{k}   &=& \tilde{D}_{k}, \\
  K_{k}    &=& S_k H_k U^-_{k} D^-_{k} U^{-T}_{k}, \label{eq:UDMeasurementUpdateV} \\
  \vzeta_k &=& \m_k-H_k \r_{k}^-,  \\
  \r^+_{k} &=& \r^-_{k} + K_{k} \vzeta_k,  \\
  \chi^2_k &=& \chi^2_{k-1} + \vzeta_k^TS_k\vzeta_k 
\end{eqnarray}


\subsection{U-D filtering KF track fit performance}

\begin{figure}[htb]
%  \begin{minipage}[b]{0.50\textwidth}
  \centering\includegraphics[angle=0,width=0.50\textwidth]{make_tbbtime_AN_UD}
%  \end{minipage}
\caption{Scalabilities of the U-D filtering KF track fitter.}
\label{fig:scalUD}
\end{figure}

The U-D filtering approach have been implemented for the CBM track fit. Since the measurement $m_k$ is a scalar only one implementation is possible.

Tests of the time and track fit quality have been performed.
The residuals and pulls distributions are similar to the one obtained with the conventional KF approach.

Figure~\ref{fig:scalUD} shows strong linear scalabilities of the U-D filtering KF approach for all tested many-core systems and for both libraries. \TODO{OMP?}

\section{Kalman filter based smoother}

 The smoother is intended for finding the optimum estimation $\r_i$  of an unknown state vector $\r^t_i$ at the point of any measurement $\m_i$, according to either all known measurements or all known measurements excluding $\m_i$ itself. Smoothing of the track state vector is required for detector alignment purpouses. It is also can be used for fake hits (using the Deterministic annealing filter, for example) and ghost tracks rejection.

The standard smoothing approach propagates the optimal estimation from last measurement point to the interesting point $\m_i$.
This procedure can result with negative-semidefinite covariance matrix due to round-off errors~\cite{DataAnalysis}. Therefore the alternative method based on two Kalman filters going forward and backward has been implemented. This method obtains two optimal estimations of a state vector $\r^t_i$ according to the first $i$ and the last $(n-i)$ measurements and merges them into one. If the measurement $\m_i$ itself should be excluded, then the forward estimation based on $(i-1)$ first measurements only is taken. The merging is done using Kalman filter equations. One estimation is interpreted as 5-dimensional measurement and added to the another estimation according to the filtration step formulas, using model of measurement $H$ equal to the unit matrix. 

The smoothing algorithm has been implemented using all discribed KF approaches: conventional Kalman filter, Potter, modified Potter and Carlson square root approaches, U-D-filtering with and without diagonalization.
The diagonalization procedure by Jacobi Transformations~\cite{NumRecC} is used for square root and U-D-filtering implementations.
Conventional implementation of the merger needs matrix inversion (see equation~\ref{eq:ConvMeasurementUpdate1}). The algorithm based on the modified Cholesky decomposition (see Appendix~\ref{app:MatrixInversion}) is used.

\subsection{Kalman filter based smoother performance}

All approaches have been implemented and tested with respect to distributions of track parameter resolutions and pulls  on each detector station. For illustration, results obtained with smoother using conventional approach and excluding hit on the given station are presented in Table~\ref{tab:smoother}

The pulls have widths close to 1. The resolution of track parameters are basicaly same for all stations. The resolution at inner stations is slightly better due to interpolation.

\begin{table}[h]
  \begin{center}
    \begin{tabular}{|c|c|c|c|c|c|c|c|}
          \hline
      \multicolumn{2}{|c|}{  } & \multicolumn{3}{|c|}{ Resolution } & \multicolumn{3}{|c|}{ Pull }  \\ \cline{3-8}
      \multicolumn{2}{|c|}{  } & $~~x, \mu$m~~ & $t_x, 10^{-3}$  & $~~p, \%~~$ & $~~~x~~~$ & $~~~t_x~~~$  & $~~q/p~~$  \\
      \hline
MVD & 1 &  23.82 &  0.63 &  0.90  &   1.04 &  1.10 &  1.26  \\    \cline{2-8}
    & 2 &  19.46 &  0.39 &  0.89  &   1.03 &  1.02 &  1.27  \\    \cline{2-8}
STS & 1 &  19.56 &  0.19 &  0.88  &   1.13 &  1.42 &  1.27  \\    \cline{2-8}
    & 2 &  15.29 &  0.25 &  0.88  &   1.09 &  1.16 &  1.27  \\    \cline{2-8}
    & 3 &  17.72 &  0.26 &  0.88  &   1.10 &  1.16 &  1.26  \\    \cline{2-8}
    & 4 &  21.43 &  0.22 &  0.88  &   1.12 &  1.16 &  1.26  \\    \cline{2-8}
    & 5 &  23.79 &  0.24 &  0.88  &   1.12 &  1.16 &  1.26  \\    \cline{2-8}
    & 6 &  29.55 &  0.23 &  0.89  &   1.14 &  1.17 &  1.24  \\    \cline{2-8}
    & 7 &  22.85 &  0.21 &  0.87  &   1.13 &  1.37 &  1.22  \\    \cline{2-8}
    & 8 &  29.93 &  0.50 &  0.90  &   1.09 &  1.14 &  1.20  \\
       \hline
    \end{tabular}
  \end{center}
  \caption{Smoother based on conventional KF approach using the analytic formula. Sigma of gaussian fit to resolution and pull distributions.}
  \label{tab:smoother}
\end{table}
\TODO{Scalability?}

\section{Discussion}
\TODO{}

% The KF approaches have been compared with respect to the track momentum resolution, q/p normalised resolution (pull), correctness of covariance matrix and a fitting time (see. Table~\ref{tab:comparison}).
% Precision of track momentum is important for event reconstruction and also it is sensitive to stability of fit, since measurement of it is indirect, at least three 2D-measurements of a detector are needed in order to determine a momentum. Column ``P residual'' at the table shows that momentum residual is 1.25\% for single precision conventional method and about 1.10\% for all other methods. It was shown that the 0.15\% difference is significant, for example, it leads to factor of 1.5 \TODO{(???)} in signal to background ratio during $\Lambda^0$ reconstruction. Pull, which is width of resolution distribution normalized by error, can show quality of fit, since it should be equal 1 for perfectly working fit. Pull for inverse momentum is presented at column ``q/p pull''. Because input measurements are not gaussian distributed, fit algorithm has some approximations and presicion of computations is finite pull is bigger than 1. The table shows that increasing presicion by factor of 2 improves pull by 10\%, the same can be done by using alternative approaches. Values which has no meaning can appear during fitting due to round-off errors. The cases when covariance matrix has negative diagonal elements were monitored and are shown in ``Bad CovMatrix'' column. The table shows factor of 100 improvment for square root approaches in comparison with the conventional one. Computational time increases unsignificantly for all of the approaches.
% 
% Based on this comparison the conclusion that the ``Square root I + Runge-Kutta'' approach is best one has been made. It shows same quality as double presicion conventional approach, but is two times faster.
% 
% \begin{table}[ht]
% % 	\begin{center}
%  		\begin{tabular}{|l|c|c|c|c|}
%  		\hline
%  				                             & P residual, \% & q/p pull & Bad CovMatrix, 1/\TODO{ev} & Time, $\mu$s/track \\
%  		\hline
%  		 	  Conventional single          & 1.25  &  1.40  &  ~988.0 & 2.1 \\
%  			  Conventional double          & 1.09  &  1.32  & ~~~~4.9 & 4.4 \\
%  			  U-D                          & 1.10  &  1.33  &  1014.4 & 2.5 \\
%  			  Square root I                & 1.08  &  1.31  & ~~~~9.2 & 2.4 \\
%  			  Square root II               & 1.09  &  1.32  & ~~~~7.8 & 3.1 \\
%  			  Square root I + Runge-Kutta  & 1.07  &  1.31  & ~~~~5.5 & 2.5 \\
%     \hline
% 		\end{tabular}
% %	\end{center}
% \caption{ \TODO{}Comparison of different track fit based on Kalman Filter approaches. }
%   \label{tab:comparison}
% \end{table}

\section{Summary}

Fast Kalman filter based library for track fitting has been created. The library implements two main fitting tasks: track fit, track smooth; four KF-approaches: conventional, two square root and U-D filtering; two methods for track propagation: the 4-th order Runge-Kutta and the analytic formula. The library is parallelized both on data (SIMD) and task (multithreading) levels. The methods for parallelization are optional and include operator overloading, Vector classes, Intel Array Building Blocks for SIMD and Intel Treading Building Blocks, Intel Array Building Blocks, OpenMP for multithreading.
All tasks, approaches and methods are tested and shows good track fit quality: momentum resolution about 1\%, pulls width about 1.0. The scalabilities on three different modern many-core systems are shown to be linear.

\section{Acknowledgements}
\TODO{This work was supported by the Hessian LOEWE initiative through the Helmholtz International Center for FAIR (HIC for FAIR), HGS-HIRe, GSI F\&E, BMBF Verbundforschung and EU-FP7 HadronPhysics2. Das Projekt wird vom Hessischen Ministerium fuer Wissenschaft und Kunst gefoerdert.}


\appendix
\section{\appendixname. Modified Gram-Schmidt algorithm}
\label{app:MGS}
\TODO{flops?}.
We have the orthoronalization problem:
For given $n \times n$ matrix $A$ find an upper triangular $n \times n$ matrix $W$, which sutisfy:
\begin{eqnarray*}
\left[ \begin{array}{c}
      W  \\
      0   \\
     \end{array} \right] &=& 
T \cdot A,
%  \label{eq:MGSFormulation}
\end{eqnarray*}
where $T$ has to be $(n+r) \times (n+r)$ orthogonal matrix.

The next Modified Gram-Schmidt (MGS) algorithm can be applied~\cite{GramSchmidt}:

\TODO{choose a representation}

\begin{algorithmic}
\State $A^{(0)} \gets A$
\For{$k\gets 0, n-1$}
  \State $\sigma_k \gets \sqrt{A_k^{(k)T} A_k^{(k)}}$
  \For{$j\gets 0, n-1$}
    \If {$j \leq k-1$}
        \State $W_{kj} \gets 0$
    \ElsIf {$j = k$}
        \State $W_{kj} \gets \sigma_k$
    \ElsIf {$j \geq k+1$}
        \State $W_{kj} \gets A_k^{(k)T}A_j^{(k)}$
\State $A_j^{(k+1)} \gets A_j^{(k)} - \frac{W_{kj}}{\sigma_k}A_k^{(k)}$
    \EndIf
  \EndFor
\EndFor
% \If {$i\geq maxval$}
%     \State $i\gets 0$
% \ElsIf {$i+k\leq maxval$}
%         \State $i\gets i+k$
% \EndIf
\end{algorithmic}

\textit{$Set A^{(0)} = A$.}

\textit{$For k = 0~to~n-1$:}

\begin{align}
\sigma_k &= \sqrt{A_k^{(k)T} A_k^{(k)}} \\
W_{kj} &= \left\{ \begin{array}{ll}		
		0, & j = 1,~k-1 \\ 
		\sigma_k, & j = k \\ 
		A_k^{(k)T}A_j^{(k)}, & j = k+1,~n-1
	\end{array} \right. \\
A_j^{(k+1)} &= A_j^{(k)} - \frac{W_{kj}}{\sigma_k}A_k^{(k)}, ~~~~ j = k+1,~n-1
%
\intertext{At stage $k$, the first $(k-1)$ columns of $A^{(k)}$ are zero below the diagonal and at the last iteration we have:}
A^{(n)} &= \left[ \begin{array}{c}
      W  \\
      0   \\
     \end{array} \right]
\end{align}

The MGS algorithm is a numerically improved adaptation of the classical Gram-Schmidt orthogonalization procedure.
When computations are made exactly the result, is equivalent to the classical Gram-Schmidt result. However, when roundoff errors occur, Bjorck~\cite{GramSchmidt} has shown that, the MGS procedure is much more accurate.

\section{\appendixname. Choletsky algorithm}
\label{app:Choletsky}
The following algorithm computes an $S$ matrix such that $C = SS^T$ for an $n \times n$ matrix C:

\textit{For each $i = 0,~n-1$ repeat for $j = 0,~n-1$:}
\begin{eqnarray*}
S_{ji} = \left\{ \begin{array}{ll}		
		0, & j < i \\ 
    \sqrt{C_{ii}-\sum_{k=0}^{i-1}S^2_{ik}}, & j = i \\
		\frac{1}{S_{ii}} \left( C_{ji} - \sum_{k=0}^{i-1}S_{jk}S_{ik} \right), & j > i
	\end{array} \right. 
\end{eqnarray*}

\section{\appendixname. Gram-Schmidt algorithm}
\label{app:UDGS}
We have the orthoronalization problem:
For given $n \times 2n$ matrix $A$ and $2n \times 2n$ matrix $B$ find an upper triangular $n \times n$ matrix with ones along the diagonal $U$ and diagonal matrix $D$, which sutisfy:
\begin{eqnarray*}
UW &=& A, \\
D &=& WBW,
%  \label{eq:MGSFormulation}
\end{eqnarray*}
where $W$ --- is an arbitrary $n \times 2n$ matrix.

The following algorithm is applied:

\textit{Define $a_i$ as $i$-th row of $A$, and $w_i$ as $i$-th row of $W$.}

\textit{Put $w_{n-1} = a_{n-1}$.}

\textit{Then repeat for $i = n-2,~0$:}
\begin{eqnarray*}
U_{ij} &=& \left\{ \begin{array}{ll}
		0, & j < i \\
    1, & j = i \\
		\frac{w_iBw_j^T}{w_jBw_j^T}, & j > i
	\end{array} \right. \\
w_i &=& a_i - \sum_{j=i+1}^{n-1} U_{ij}w_j
\end{eqnarray*}

\section{\appendixname. Matrix inversion algorithm}
\label{app:MatrixInversion}

Since covariance matrix is symmetric, it is possible to apply Cholesky decomposition~\cite{NumRecC}. The method has been chosen because of the numerical stability: the error of the decomposition is stable, does not depend on the pivoting. This is of the particular importance in case of the single precision calculation with SIMD units.

If the matrix $A = \left\lbrace a_{ij} \right\rbrace$ is symmetric and positive-defined, it could be decomposed with the Cholecky method:
\begin{eqnarray}
A = U^TU. \label{eq:Cholesky_init}
\end{eqnarray}
$U$ is a lower or upper triangular matrix. We have chosen the upper case. During the fit the diagonal elements of the covariance matrix can become negative. In order to apply the algorithm for the not positive-defined matrix $A$, it is necessary to introduce additional diagonal matrix $D = \left\lbrace d_{ii} \right\rbrace$:
\begin{eqnarray}
A = U^TDU. \label{eq:Cholesky}
\end{eqnarray}
We have defined the diagonal elements of the matrix $D$ to be equal $\pm1$. With this definition the division by the element $d_{ii}$ is replaced by multiplication, that increase the stability of the method. The formulas for the elements of the matricies $U$ and $D$ are derived directly from the definition of the matrices $U$ and $D$:
\begin{eqnarray} \label {eq:Cholesky_eq}
 u_{ii}^{'2} &=& a_{ii} - \sum_{k=0}^{i} u_{ki}^2 d_{kk},  \\
 d_{ii} &=& sgn( u_{ii}^{'2} ),  \\
 u_{ii} &=& \sqrt{d_{ii} u_{ii}^{'2}},  \\
 u_{ij} &=& \frac{d_{ii}}{u_{ii}}\left( a_{ij} - \sum_{k=0}^{i} u_{ki}u_{kj} d_{kk} \right), j= (i+1) \ldots 4.
\end{eqnarray}

The next step is to invert obtained matrices. As follows from~(\ref{eq:Cholesky}) and from the definition of the matrix $D$:
\begin{eqnarray} \label{eq:invMatrix}
 A^{-1} = U^{-1}D(U^{-1})^T
\end{eqnarray}


\begin{thebibliography}{99}
% 
\bibitem{TSR}
CBM Collaboration, Compressed Baryonic Matter Experiment. 
Technical Status Report. GSI, Darmstadt, 2005; 2006 Update\\
({\tt http://www.gsi.de/documents/DOC-2006-Feb-108-1.pdf}).
% 
\bibitem{NIM-TIME05}
I.~Kisel, Event reconstruction in the CBM experiment. Nucl.\ Instr.\ and Meth.\ A566 (2006) 85-88. 
% 
\bibitem{SIMDKFCPC}
S. Gorbunov, U. Kebschull, I. Kisel, V. Lindenstruth and W.F.J. M{\"u}ller,   Fast SIMDized Kalman filter based track fit,\\
Comp. Phys. Comm. 178 (2008) 374-383.
% 
% \bibitem{IA-32}
% IA-32 Intel Architecture Optimization Reference Manual. Intel, June 2005.
% 
 \bibitem{Vc}
 Vector Classes, ({\tt http://gitorious.org/vc}).
 
 \bibitem{ArBB}
Intel Array Building Blocks, http://software.intel.com/en-us/articles/intel-array-building-blocks.

\bibitem{ITBB}
ITBB Reference Manual, {\tt http://threadingbuildingblocks.org/} .
 
\bibitem{OpenMP}
OpenMP, http://openmp.org.
% 
\bibitem{Kalman}
R.E.\ Kalman, A new approach to linear filtering and prediction problems. Trans.\ ASME, Series D, J.\ Basic Eng., 82 (1960) 35-45.

\bibitem{KFbook}
D.~Simon,   Optimal State Estimation: Kalman, H-infinity, and Nonlinear Approaches, \\
John Wiley \& Sons, 2006

\bibitem{Kaminski}
P.G.~Kaminski, Discrete Square Root Filtering: A Survey of Current Techniques. IEEE Vol. AC-16, No. 6, December 1971.
% 
\bibitem{DataAnalysis}
R.~Fr\"{u}hwirth et al., Data Analysis Techniques for High-Energy Physics. Second Edition, Cambridge Univ.\ Press (2000).

% ------
% \bibitem{DAF}
% R.~Fr\"{u}hwirth and A. Strandlie, Track fitting with ambiguities and noise: a study of elastic tracking and nonlinear filters. Comp. Phys. Comm. 120 (1999) 197-214.

\bibitem{NumRecC}
William H. Press et. al., Numerical Recipes in C, Second Edition, Cambridge Univ. Press (1992), (http://www.nrbook.com/a/bookcpdf.php).

%\bibitem{NumRecC}
%Numerical Recipes in C % P 464-469

\bibitem{Dahl}
G.R.~Lynch and O.I.~Dahl, Approximations to multiple Coulomb scattering. Nucl.\ Instr.\ and Meth.\ B58 (1991) 6-10.

\bibitem{AnalyticExtra}
S.\ Gorbunov and I.\ Kisel, Analytic formula for track extrapolation in non-homogeneous magnetic field.  Nucl.\ Instr.\ and Meth.\ A559 (2006) 148-152. 

\bibitem{NoiseMatrix}
E.J.\ Wolin and L.L.\ Ho, Covariance matrices for track fitting with the Kalman filter. Nucl. Instr. and Meth. A329 (1993) 493-500.

\bibitem{Thornton}
C.L.\ Thornton, Triangular Covariance Factorizations for Kalman Filtering, PHD. thesis, University of California at Los Angeles, School of Engineering, 1976.

\bibitem{Bierman}
G.J.\ Bierman, Measurement Updating Using the U-D Factorization. Proc. IEEE Conf. on Decision and Control, Houston, Texas 337-346 (1975)

\bibitem{GramSchmidt}
A.\ Bjorck, Solving linear least squares problems by Gram-Schmidt orthogonalization, BIT, vol. 7, pp. 1-21, 1967.

\end{thebibliography}

\end{document}