%% RT09_Instructions.tex
%% 4/2009
%% By Bo Yu (yu@bnl.gov)
%% based on:
%% bare_jrnl.tex
%% V1.3
%% 2007/01/11
%% by Michael Shell
%% see http://www.michaelshell.org/
%% for current contact information.
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\documentclass[journal]{IEEEtran}
\pagestyle{empty}
\usepackage{graphicx}
\begin{document}
\title{Standalone First Level Event Selection Package\\ for the CBM Experiment}
%
% author names and IEEE memberships
% note positions of commas and nonbreaking spaces ( ~ ) LaTeX will not break
% a structure at a ~ so this keeps an author's name from being broken across
% two lines.
% use \thanks{} to gain access to the first footnote area
% a separate \thanks must be used for each paragraph as LaTeX2e's \thanks
% was not built to handle multiple paragraphs

\author{I.~Kisel,
        I.~Kulakov
        and~M.~Zyzak% <-this % stops a space
\thanks{Manuscript received June 9, 2012. This work was supported by the Hessian LOEWE initiative through the Helmholtz International Center for FAIR (HIC for FAIR), HGS-HIRe, GSI F\&E, BMBF Verbundforschung and EU-FP7 HadronPhysics2. Das Projekt wird vom Hessischen Ministerium fuer Wissenschaft und Kunst gefoerdert.}% <-this % stops a space
% \thanks{Full names of authors are preferred in the author field, but are not required. Put a space between authors' initials. Do not use all uppercase for authors' surnames.}%
\thanks{Ivan Kisel is with Uni-Frankfurt, FIAS and GSI (telephone: +49-69-798-44102, e-mail: I.Kisel@compeng.uni-frankfurt.de).}%
\thanks{Igor Kulakov is with Uni-Frankfurt and GSI (telephone: +49-6159-71-1623, e-mail: I.Kulakov@gsi.de).}%
\thanks{Maksym Zyzak is with Uni-Frankfurt and GSI (telephone: +49-6159-71-1743, e-mail: M.Zyzak@gsi.de).}%
\thanks{Uni-Frankfurt --- Goethe University, Frankfurt am Main, 60325 Germany}%
\thanks{FIAS --- Frankfurt Institute for Advanced Studies, Frankfurt am Main, 60438 Germany}%
\thanks{GSI  --- Helmholtz Center for Heavy Ion Research, Darmstadt, 64291 Germany}%

% \thanks{S. B. Author, Jr., was with Rice University, Houston, TX 77005 USA. He is now with the Department of Physics, Colorado State University, Ft. Collins, CO 80523 USA (telephone: 970-491-6206, e-mail: author@lamar. colostate.edu).}%
% \thanks{T. C. Author is with the Electrical Engineering Department, University of Colorado, Boulder, CO 80309 USA, on leave from the National Research Institute for Metals, Tsukuba, Japan (e-mail: author@nrim.go.jp).}%

}

\maketitle
\thispagestyle{empty}

\begin{abstract}
The main focus of the CBM experiment (FAIR, Darmstadt, Germany) is the measurement of very rare probes, which requires interaction rates of up to 10~MHz. It requires the full on-line event reconstruction at the first level trigger and will operate with  huge data rates of up to 1~TB/s.

The standalone First Level Event Selection (FLES) package has been created for the CBM experiment. It contains all reconstruction stages: track finding, track fitting, short lived particle finding and selection. Reconstruction of about 50~particle decay channels is implemented. 
The algorithms are intrinsically local and the implementation is both vectorized (SIMD) and parallelized (between CPU cores).

The Cellular Automaton and Kalman filter algorithms are used for track reconstruction, which allows achieving a high track reconstruction efficiency (up to 97\%) and a high quality of the track parameters (1.1\% momentum resolution). The KF particle finder shows high reconstruction efficiency with optimal signal to background ratio.
The FLES package shows strong scalability on many-core systems and speed of 1700~events per second on the 80-core computer.
\end{abstract}

\section{Introduction}
The CBM (Compressed Baryonic Matter) experiment~\cite{TSR} is an experiment being prepared to operate at the future Facility for Anti-Proton and Ion Research (FAIR, Darmstadt, Germany). Its main focus is the measurement of very rare probes, which requires interaction rates of up to 10~MHz.
Together with the high multiplicity of charged tracks (up to 1000, see \figurename~\ref{fig:SimEvent}) produced in heavy-ion collisions, this leads to huge data rates of up to 1~TB/s. Most trigger signatures are complex (short lived particles, e.g.\ open charm decays) and require information from several detector sub-systems.

\begin{figure}[htb]
\centering
\includegraphics[width=2.5in]{simEvent} %width=2.5in,
\caption{A central gold-gold event at 25~$A$GeV collision energy in the CBM experiment with, on average, about 1000 tracks of charged particles.}
\label{fig:SimEvent}
\end{figure}

The first level event selection package of the CBM experiment is intended to reconstruct the full topology of the event including tracks of charged particles and short lived particles.
The FLES package consists of several modules (the block scheme is shown in \figurename~\ref{fig:FLES}): track finder, track fitter, particle finder and selection. As an input the FLES package receives a simplified geometry of the tracking detectors and the hits, which are created by charged particles crossing the detectors.
Tracks of the charged particles are reconstructed by the Cellular Automaton (CA) track finder~\cite{CA} according to the registered hits.
The Kalman filter (KF) based track fit~\cite{SIMDKF} is used for precise estimation of track parameters. The short lived particles, which decay before the tracking detectors, can be reconstructed via their decay products only.
The KF particle finder, which is based on the KFParticle package~\cite{KFParticle}, is used in order to find and reconstruct parameters of short lived particles by combining already found tracks of the charged particles. The KF particle finder also selects particle candidates from a large number of random combinations.
In addition, a module for quality assurance is implemented, which allows controlling the quality of the reconstruction at all stages. It produces an output in the portable ASCII format. The output files are interpreted later as efficiencies and histograms using the ROOT framework~\cite{ROOT}.
The package is platform and operating system independent.

\begin{figure}[htb]
\centering
\includegraphics[width=3.2in]{FLES}
\caption{The block scheme of the FLES package.}
\label{fig:FLES}
\end{figure}

First Level Event Selection (FLES) in the CBM experiment will be performed on-line on a dedicated processor farm. This requires the development of fast and precise reconstruction algorithms suitable for on-line data processing. The algorithms have to be intrinsically local and parallel and thus require a fundamental redesign of traditional approaches to event data processing in order to use the full potential of modern many-core CPU/GPU architectures. 
The development of the fast reconstruction algorithms, which use maximum power of available processors, is necessary for the FLES selection of events with rare signals of interesting physics.
Massive hardware parallelization has to be reflected in mathematical and computational optimization of the algorithms.

One of the efficient features supported by almost all modern processors is the SIMD (Single Instruction - Multiple Data, vector operations) instruction set. It allows packing several data values into a vector register and working with them simultaneously, so that more calculations per clock cycle can be performed. Therefore the reconstruction routines have been revised in order to use SIMD.

The reconstruction algorithms have been parallelized using the Intel Threading Building Blocks package (ITBB)~\cite{ITBB} to provide a scalable behavior with respect to the number of CPU cores and hardware threads.

\section{Parallelism in the FLES package}

The SIMD instruction set provides the possibility to perform one operation on several values simultaneously (\figurename~\ref{fig:SIMD}). The majority of modern computers have 128-bit SIMD registers, allowing 4 single precision floats to be packed into one register. The newest CPUs already have 256-bit registers (8 floats), and CPUs with registers 16 floats wide are under development. This makes utilization of SIMD registers by the FLES package important.
All four modules of the FLES package are vectorized using headers, which allows keeping the algorithms and the low-level code separated.

\begin{figure}[htb]
\centering
\includegraphics[width=2.5in]{SIMD} %width=2.5in,
\caption{Scalar calculations versus SIMD.}
\label{fig:SIMD}
\end{figure}

% \subsection{Header files}
The header files wrap the SIMD instructions by overloading all operators and inlining basic arithmetic and logic functions, which makes the code compact and easily readable. For example, a simple code for the calculation of a first-order polynomial function, written using SSE instructions, is:
\begin{verbatim}
__m128 y = _mm_add_ps(_mm_mul_ps(a,x),b);
\end{verbatim}
The same code using the header files is:
\begin{verbatim}
fvec y = a*x + b;
\end{verbatim}
with
\begin{verbatim}
friend fvec operator+( const fvec &a, 
                       const fvec &b ) {
  return _mm_add_ps(a,b); }
friend fvec operator*( const fvec &a, 
                       const fvec &b ) {
  return _mm_mul_ps(a,b); } 
\end{verbatim}
in the header file.

The implementation is simple and therefore flexible with respect to different CPU architectures. It allows keeping the main program code unchanged: only the header file corresponding to the architecture has to be included. A header file with a scalar implementation also exists and allows easy debugging and comparison.

Modern computer development is mainly focused on increasing the number of cores rather than the frequency of the processors, forcing users to exploit the MIMD (Multiple Instruction - Multiple Data) architecture. In order to be fast, programs have to be parallelized in an efficient and scalable way.

Intel Threading Building Blocks package (ITBB)~\cite{ITBB} is used for parallelization of the FLES package.
ITBB is a C++ template library, which offers a rich functiona\-li\-ty allowing parallel execution of a large number of different types of problems. In order to abstract access to the multiple processors, groups of operations are treated as tasks. An ITBB program creates, synchronizes and destroys graphs of dependent tasks.
Tasks are then executed respecting graph dependencies.

As will be shown later, the FLES package shows strong scalability using ITBB.


\section{Cellular Automaton track finder}
Every track finder must handle a very specific and complicated combinatorial optimization process, grouping together one- or two-dimensional measurements into five-dimensional tracks. The CA method profits from building up cells, i.e.\ short track segments with a higher dimensionality than the measurements, before starting the combinatorial search~\cite{CA}. The method is intrinsically local, working with data only in a neighborhood. It steadily consolidates the track information identified in the course of the algorithm evolution. In addition, the CA method based algorithms can be parallelized in order to be implemented on modern many-core CPU/GPU computer systems.

\begin{figure}[htb]
\centering
\includegraphics[height=3.5in]{CA} %width=2.5in,
\caption{Illustration of the Cellular Automaton track finding algorithm. Tracking stations are shown by dashed lines. Hits corresponding to two different particles are shown as blue and green circles; a noise hit is shown as a white circle. Track segments are shown by lines; the color of a line corresponds to its position on a track.}
\label{fig:CA}
\end{figure}

In the CA method (\figurename~\ref{fig:CA}) first (1) short track segments, so-called cells, are created. After that the method does not work with hits any more but with the created track segments. It puts neighbor relations between the segments according to the track model here and then (2) one can estimate for every segment its possible position on a track. After this process a set of tree connections of possible track candidates appears. Then one starts with segments with the largest position counters (3) and goes along the continuous connection tree to easily collect the track segments into track candidates. In the last step (4) one sorts the track candidates according to their length and $\chi^2$-values and then selects the best tracks.

Additional modifications are done in order to make the algorithm more effective in specific conditions of the CBM experiment.
Track segments are created from three hits from neighboring stations of the detector, which is the minimum number of measurements needed to reconstruct the momentum. Hits in the track segments can be separated by one inefficient station. This allows compensating for the detector inefficiency, which is expected to be about 3\%. In the end broken tracks are merged by the Kalman filter based merger procedure.
The track finder procedure is divided into several iterations over all CA stages to make fast reconstruction possible in spite of the high track density: at the first iteration only fast primary tracks are reconstructed, at the second --- slow primary tracks, at the third --- all other tracks. All hits used at each iteration are removed from further consideration. This reduces the combinatorics and increases the speed of the algorithm by one to two orders of magnitude.

\section{Kalman Filter track fitter}
The estimation (or fit) of the track parameters and their errors is done by the Kalman filter (KF) method~\cite{SIMDKF}. 
The Kalman filter is widely used for track fitting because of its features: optimal estimation of the parameters, locality with respect to the measurements, convenience of track propagation through a nonhomogeneous magnetic field and material, operation with matrices of a small size, and a number of computations proportional to the number of measurements.

The Kalman filter (1) starts with an initial approximation (\figurename~\ref{fig:KF}); (2) adds hits one after the other; (3) refines the state vector and gets the optimal values of track parameters after the last hit. The initial parameters are chosen arbitrarily; the covariance matrix (matrix of errors) is chosen with large positive numbers on the diagonal and zero off-diagonal elements. Such a choice of the covariance matrix minimizes the influence of the initial approximation on the final result.

\begin{figure}[htb]
\centering
\includegraphics[width=3.45in]{KF} %width=2.5in,
\caption{The Kalman Filter based track fitting algorithm illustration.
% Estimation of track parameters on each station are shown in red.
 }
\label{fig:KF}
\end{figure}

The Kalman filter is intensively used in a combinatorial part of the CA track finder, therefore its stability in single precision and its fast and optimal implementation on modern CPU and GPU computer systems are crucial.

Starting out from the idea of using the SIMD units of modern processors, the track fitting algorithm was examined with the aim of increasing the speed of the track finding as a part of the event reconstruction. After detailed optimizations of the memory utilization and numerical analysis, the KF algorithm has been vectorized~\cite{SIMDKF}. To optimize the memory usage, a magnetic field approximation is used for particle propagation instead of the magnetic field map, which consumes a large fraction of the memory (about 70~MB) and therefore does not fit into the cache memory of the CPU. The magnetic field is approximated on each detector station plane with a polynomial function of fifth order. During the fit of a track the field behavior between the stations is approximated along the track with a parabola taking field values at the three closest measurements. To stabilize the fit, the initial approximation is done by the least squares estimator under the assumption of a one-component magnetic field. The first measurement is added in a special way, which also increases the stability of the method: the equations can be simplified analytically in this case due to the special form of the initial covariance matrix. The propagation in the nonhomogeneous magnetic field is done by an analytic formula, which is based on the Taylor expansion~\cite{AnaliticPropagation}. The formula allows getting the same track fit quality as the standard fourth order Runge-Kutta method, while it is about 40\% faster. Operator overloading was used to keep the maximum possible flexibility with respect to the use of different CPU families for the data reconstruction. With the mentioned changes the SIMD optimized algorithm has been accelerated to 1~$\mu$s per particle track. This is an improvement by a factor of 10000 compared to the original scalar version.

\section{Track reconstruction performance}
The main quality characteristics of track reconstruction are efficiency and residuals and pulls of the track parameters.

\begin{figure}[htb]
\centering
\includegraphics[width=3.45in]{EffMBias} %width=2.5in,
\caption{Efficiency of the track reconstruction for minimum bias gold-gold collisions at 25 AGeV.}
\label{fig:EffMBias}
\end{figure}

The efficiency of the track reconstruction for 1000 minimum bias gold-gold UrQMD collisions at 25 AGeV is presented in \figurename~\ref{fig:EffMBias}. The track reconstruction efficiency for different sets of tracks and the ratios of double found and wrong tracks are shown in Table~\ref{tab:efficiency}. The tests have been performed on a server with Intel Xeon E7-4860 CPUs.

%Due to multiple scattering in the material of the detector the reconstruction efficiency for low momentum (less then 1~GeV/c) particles is 77.7\%, while for particles with a high momentum the efficiency is 95.1\%. The total efficiency for all tracks is 88.5\% with a high fraction of ``soft'' secondary tracks. The ratio of double found tracks is 0.2\% and of wrong tracks is 0.7\%.

% The efficiency for primary particles with a high momentum (higher then 1~GeV/c) is 97

The majority of signal tracks (decay products of D-mesons, charmonium, light vector mesons) are high momentum particles (momentum higher than 1~GeV/c), which originate from the region of the primary vertex; therefore the efficiency for such tracks is defined by the fast primary set and is equal to 97.1\%. The secondary high momentum particles can be created in decays of $K_s^0$ and $\Lambda$ particles and cascade decays of $\Xi$ and $\Omega$. Such tracks are created far from the primary vertex and it is impossible to use the information of the target position for their reconstruction; as a consequence the efficiency is lower --- 81.2\%. Secondary low momentum tracks (momentum lower than 1~GeV/c) can be created in $\Xi$ and $\Omega$ hyperon decays. In order to decrease the background level for the light vector mesons and charmonium it is important to reconstruct slow secondary tracks, because a fraction of the background is created by the electrons from $\pi^0 \rightarrow \gamma e^{+}e^{-}$ and $\gamma \rightarrow e^{+}e^{-}$.
Multiple scattering in material and a high curvature of trajectories lead to even bigger losses during the reconstruction of low momentum particles: the efficiencies are 51.1\% for slow secondary and 90.4\% for slow primary tracks.

The total efficiency for all tracks is 88.5\% with a high fraction of ``soft'' secondary tracks. The ratio of double found tracks is 0.2\% and of wrong tracks is 0.7\%.

\begin{table}[ht]
\renewcommand{\arraystretch}{1.3}
\caption{Track reconstruction efficiency for different sets of tracks and ratios of double found and wrong tracks}
\label{tab:efficiency}
\centering
\begin{tabular}{l|c}
\hline
\bfseries Set of tracks & \bfseries Efficiency, \% \\
\hline\hline
Fast primary     & 97.1 \\
Fast secondary   & 81.2 \\
Slow primary     & 90.4 \\
Slow secondary   & 51.1 \\
All tracks       & 88.5 \\
Double found     & ~0.2 \\
Wrong tracks     & ~0.7 \\
\hline
Reconstructed tracks/event     & 120 \\
Time/event/core & 8.2 ms \\
\hline
\end{tabular}
\end{table}

The CBM experiment is an experiment with a forward geometry and the natural choice of tracks parameters for this case is:
$x$ and $y$ track coordinates at the reference $z$-plane (the $z$-coordinate points downstream the spectrometer axis), $t_x = \tan \theta_x$ is the track slope in the $XZ$-plane, $t_y = \tan \theta_y$ is the track slope in the $YZ$-plane, $q/p$ is the inverse particle momentum, signed according to charge.

Residuals of the track parameters are determined as a difference between a reconstructed parameter and a true Monte-Carlo value. The normalized residual (pull) distributions of the fitted track parameters are a measure of the reliability of the fit. Pulls are determined as residuals normalized on the corresponding estimated error obtained in the track fit. In the ideal case the normalized error distributions of coordinates and slopes of the track should be unbiased and Gaussian distributed with width of 1.0.

\begin{figure}[htb]
\centering
\includegraphics[width=3.4in]{PullsMBias} %width=2.5in,
\caption{Residual and pull distributions of the track parameters together with their Gaussian fits.}
\label{fig:PullsMBias}
\end{figure}

Residuals and pulls for all parameters are calculated at the first hit of the track. The distributions for the $x$, $t_x$ and $q/p$ parameters together with their Gaussian fits are shown in \figurename~\ref{fig:PullsMBias} (for $y$ and $t_y$ the results are similar). The residuals of all parameters are fitted with a Gaussian to estimate the width of the distribution.
All distributions are not biased, pulls have widths close to 1.0 indicating correctness of the fitting procedure. The deviation from 1.0 is caused by several assumptions made in the fitting procedure mainly in the part of the detector material treatment. The $q/p$ pull is the widest because the momentum is the most sensitive to all approximations. 

The high efficiency and track fit quality are crucial for the reconstruction of short lived particles, which are of particular interest for the CBM experiment. The efficiency of the short lived particle reconstruction depends quadratically on the track reconstruction efficiency for two-particle decays. The situation is even worse for decays with three and more daughters. The level of the combinatorial background for the short lived particles strongly depends on the track fit quality: high resolution and correct estimation of the errors help to distinguish signal and background particle candidates and to suppress the background. The wrong tracks usually have bad parameters and easily combine with other tracks into particle candidates, therefore a low level of such tracks is also important to keep the combinatorial background low. As a result, the high track reconstruction efficiency and the low level of the combinatorial background positively affect the data reduction by FLES.

\section{Reconstruction of the short lived particles with KF particle finder}

The new physics is now hidden mainly in the properties of very short lived particles, which are not registered in the detectors, but can be reconstructed from their daughter products only (see \figurename~\ref{fig:D0Decay}). This stage of the event reconstruction is important for the success of the whole experiment.

The KFParticle~\cite{KFParticle} software package for the reconstruction of short lived particles has been developed so that all parameters of these particles are provided in the most suitable form for the final physics analysis. The vector of the parameters contains three coordinates ($x$, $y$ and $z$), three components of momentum ($p_x$, $p_y$ and $p_z$) and energy ($E$). These parameters are conventionally used in physics for the description of a moving particle, which makes the package geometry and experiment independent. KFParticle is based on the Kalman filter method and thereby provides an estimation of the covariance matrix together with the parameters.
The package also provides easy access to such important physical quantities of a particle as mass, decay length, lifetime, momentum, transverse momentum, rapidity, etc.\ together with their errors. The errors are calculated mathematically correctly from the covariance matrix of the parameter vector. The KFParticle package treats all particles (long lived and short lived) in the same way. Thereby the reconstruction of decay chains is a very easy task with KFParticle.

\begin{figure}[htb]
\centering
\includegraphics[height=1.5in]{D0Decay} %width=2.5in,
\caption{The illustration of the $D^{0}$ decay into $K^{-}$ and $\pi^{+}$. Dashed red line is $D^{0}$ trajectory. Green and blue lines show primary and secondary particles trajectories respectively. Red stars are hits in tracking detectors. Decay points are shown in gray.}
\label{fig:D0Decay}
\end{figure}

The fit of the particle is done iteratively. The KFParticle package starts with an initial approximation of the vertex, adds particles one after another, refines the state vector and gets the optimal values after the last particle. If the initial approximation is not known precisely enough, the procedure can be repeated a few times, with the values from the obtained state vector taken as the initial approximation for the next repetition.

Since the speed of the algorithms is crucial for the FLES package KFParticle has been fully vectorized and moved to the single precision calculations. To optimize the memory usage by the package the magnetic field approximation is used for particle propagation instead of the magnetic field map. The magnetic field is approximated by the polynomial function of the second order along the trajectory of the particle and stored for each particle. Also to speed up the calculations the analytic formula for particle propagation~\cite{AnaliticPropagation} is used.

Based on the KFParticle package the KF particle finder has been developed. At the first stage all tracks of the charged particles, which are found by the CA track finder, are divided into two groups: primary and secondary. Strange, multi-strange and open-charm particles are constructed from the secondary tracks taking into account corresponding mass assumptions for the tracks. Combining obtained strange and open-charm particles with primary tracks strange, multi-strange and open-charm resonances are constructed. Primary tracks with corresponding mass hypothesis are combined into strange resonances (which decay into charged long-lived particles), light vector mesons and charmonium. The search of about 50 decay modes of the short lived particles in total is implemented. Taking into account the topology of the decay and $\chi^{2}$ criteria particle candidates are selected and stored. 

The search for the particles is done in one go, minimizing access to the memory. Together with code optimization and vectorization this allows achieving a high speed in spite of the huge combinatorics. The speed of the KF particle finder per core on the server with Intel Xeon E7-4860 CPUs is 1.4~ms per minbias gold-gold event at 25~AGeV and 10.5~ms per central gold-gold event at 25~AGeV.

\begin{figure}[htb]
\centering
\includegraphics[height=3.0in]{KsLambda} %width=2.5in,
\caption{Mass distribution of $K^{0}_{s}$ and $\Lambda$ candidates for 1000 minbias gold-gold UrQMD events at 25~AGeV. }
\label{fig:KsLambda}
\end{figure}

The KF particle finder package provides high efficiency and signal to background (S/B) ratio for the reconstructed decays. For instance, for 1000 minbias gold-gold UrQMD events at 25~AGeV the reconstruction efficiency (normalized to $4\pi$) of the $K^{0}_{s}$ meson is 11.3\% with an S/B ratio of 1.15, and for the $\Lambda$ hyperon they are 9.2\% and 2.14 respectively. The mass distributions of $K^{0}_{s}$ and $\Lambda$ particles are shown in \figurename~\ref{fig:KsLambda}.

\section{ Scalability }

Four servers with Intel Xeon E7-4860, L5640 and X5550 processors and AMD 6164EH processors have been used for the scalability tests (Table~\ref{tab:servers}). The AMD server has 4 processors with 12 physical cores each. All Intel processors have the hyper-threading technology, therefore each physical core has two logical cores. That gives 80 logical cores in total for the fastest server, which has 4 processors with 10 physical cores each.

\begin{table}[ht]
\renewcommand{\arraystretch}{1.3}
\caption{Characteristics of the servers used for scalability tests}
\label{tab:servers}
\centering
\begin{tabular}{l|c|c|c|c}
\hline
\bfseries Processor & \bfseries Clock, & \bfseries L3 Cache, & \bfseries Number & \bfseries Total number\\
\bfseries  & \bfseries GHz & \bfseries MB & \bfseries of CPUs & \bfseries of cores\\
\hline\hline
			Intel E7-4860 & 2.27 & 24 & 4 & 80  \\
      AMD 6164EH    & 1.7  & 12 & 4 & 48  \\
			Intel L5640   & 2.27 & 12 & 2 & 24  \\
			Intel X5550   & 2.66 & ~8 & 2 & 16  \\
\hline
\end{tabular}
\end{table}

The FLES package has been parallelized using Intel Threading Building Blocks by executing one thread per logical core. Each thread has reconstructed 1000 events. In order to minimize interference from the operating system each thread is pinned to a certain core using the pthread (POSIX threads) functionality provided by the operating system's threading library. This is a necessary requirement on computers with several processors: different CPUs have different cache memory, so a disturbed thread can migrate to another CPU and would then need to access the other CPU's memory, which is much slower. All systems being tested are NUMA (non-uniform memory access) systems, and on such systems the disturbed thread would access the RAM (random access memory, the main memory of the computer) which is close to the previously used CPU, which makes the program even slower.
\figurename~\ref{fig:Scalability} shows strong scalability for all many-core systems achieving reconstruction speed of 1700 events per second on one node.

\begin{figure}[htb]\centering
\includegraphics[width=3.2in]{Scalability} %width=2.5in,
\caption{Scalability of the FLES package on many-core systems}
\label{fig:Scalability}
\end{figure}

\section*{Conclusion}
The standalone FLES package has been created for the CBM experiment. It contains all reconstruction stages: track finding, track fitting, short lived particle finding and selection. Reconstruction of about 50 particle decay channels is implemented. The implementation of the algorithms is optimized with respect to memory usage and time. The package depends only on the compiler and is therefore portable. It is both vectorized (using SIMD instructions) and parallelized (between CPU cores).

The Cellular Automaton and Kalman filter algorithms are used, which allows achieving a high track reconstruction efficiency (up to 97\%) and a high quality of the track parameters (1.1\% momentum resolution). The KF particle finder shows high reconstruction efficiency with optimal signal to background ratio. For instance, a reconstruction efficiency ($4\pi$) of 10\% is achieved for $K^{0}_{s}$ and $\Lambda$ particles with a signal to background ratio of about 1 and 2 respectively.
The FLES package shows strong scalability on many-core systems and speed of 1700 events per second on the 80-core computer.

\section*{Acknowledgment}
We wish to acknowledge the help provided by Dr.~Iouri Vassiliev in developing the KF particle finder.

\begin{thebibliography}{2}
%TODO format
% 
\bibitem{TSR}
CBM Collaboration, Compressed Baryonic Matter Experiment. 
Technical Status Report. GSI, Darmstadt, 2005; 2006 Update. Available:
http://www.gsi.de/documents/DOC-2006-Feb-108-1.pdf.
% 
\bibitem{CA}
I.~Kisel, Event reconstruction in the CBM experiment. Nucl.\ Instr.\ and Meth.\ A566 (2006) 85-88. 
% 
\bibitem{SIMDKF}
S. Gorbunov, U. Kebschull, I. Kisel, V. Lindenstruth and W.F.J. M{\"u}ller,   Fast SIMDized Kalman filter based track fit,
Comp. Phys. Comm. 178 (2008) 374-383.
%
\bibitem{KFParticle}
S.~Gorbunov and I.~Kisel, Reconstruction of decayed particles based on the Kalman filter. \relax CBM-SOFT-note-2007-003, 7 May 2007
%
\bibitem{ROOT}
An object oriented framework for large scale data analysis. Available:
http://root.cern.ch
%
\bibitem{ITBB}
ITBB Reference Manual, Available:
http://threadingbuildingblocks.org .
%
\bibitem{AnaliticPropagation}
S.~Gorbunov and I.~Kisel, Analytic formula for track extrapolation in non-homogeneous magnetic field. \relax Nucl.\ Instr.\ and Meth.\ A559 (2006) 148--152.
%
% \bibitem{IEEEhowto:kopka}
% H.~Kopka and P.~W. Daly, \emph{A Guide to \LaTeX}, 3rd~ed.\hskip 1em plus
%   0.5em minus 0.4em\relax Harlow, England: Addison-Wesley, 1999.
% 
% \bibitem{IEEEPDFRequirement401}
% IEEE Content Engineering, \emph{IEEE PDF Specification Version 4.10}. Available: http://www.ieee.org/documents/31296\_IEEE\_PDF\_Spec.zip.

\end{thebibliography}


% that's all folks
\end{document}