Compare commits
2 Commits
f6dd2a2d6b
...
64f3d38dc9
Author | SHA1 | Date |
---|---|---|
Claudio Maggioni | 64f3d38dc9 | |
Claudio Maggioni | b99792f558 |
Binary file not shown.
|
@ -4,9 +4,8 @@
|
|||
\usepackage{float}
|
||||
\usepackage{subcaption}
|
||||
\usepackage{graphicx}
|
||||
\usepackage{fancyvrb}
|
||||
\usepackage{tikz}
|
||||
|
||||
\usepackage{multirow}
|
||||
|
||||
\begin{document}
|
||||
|
||||
|
@ -40,42 +39,34 @@ L1 cache & 32 kB per core
|
|||
|
||||
All values are reported using base 2 IEC byte units. The cluster has 2 sockets
|
||||
and a total of 20 cores (10 per socket). The cache topology diagram reported by
|
||||
\texttt{likwid-topology -g} is the following:
|
||||
\texttt{likwid-topology -g} is shown in Figure~\ref{fig:topo}.
|
||||
|
||||
\pagebreak[4]
|
||||
% https://tex.stackexchange.com/a/171818
|
||||
\begin{Verbatim}[fontsize=\tiny]
|
||||
Socket 0:
|
||||
+---------------------------------------------------------------------------------------------------------------+
|
||||
| +--------+ +--------+ +--------+ +--------+ +--------+ +--------+ +--------+ +--------+ +--------+ +--------+ |
|
||||
| | 0 | | 1 | | 2 | | 3 | | 4 | | 5 | | 6 | | 7 | | 8 | | 9 | |
|
||||
| +--------+ +--------+ +--------+ +--------+ +--------+ +--------+ +--------+ +--------+ +--------+ +--------+ |
|
||||
| +--------+ +--------+ +--------+ +--------+ +--------+ +--------+ +--------+ +--------+ +--------+ +--------+ |
|
||||
| | 32 kB | | 32 kB | | 32 kB | | 32 kB | | 32 kB | | 32 kB | | 32 kB | | 32 kB | | 32 kB | | 32 kB | |
|
||||
| +--------+ +--------+ +--------+ +--------+ +--------+ +--------+ +--------+ +--------+ +--------+ +--------+ |
|
||||
| +--------+ +--------+ +--------+ +--------+ +--------+ +--------+ +--------+ +--------+ +--------+ +--------+ |
|
||||
| | 256 kB | | 256 kB | | 256 kB | | 256 kB | | 256 kB | | 256 kB | | 256 kB | | 256 kB | | 256 kB | | 256 kB | |
|
||||
| +--------+ +--------+ +--------+ +--------+ +--------+ +--------+ +--------+ +--------+ +--------+ +--------+ |
|
||||
| +-----------------------------------------------------------------------------------------------------------+ |
|
||||
| | 25 MB | |
|
||||
| +-----------------------------------------------------------------------------------------------------------+ |
|
||||
+---------------------------------------------------------------------------------------------------------------+
|
||||
Socket 1:
|
||||
+---------------------------------------------------------------------------------------------------------------+
|
||||
| +--------+ +--------+ +--------+ +--------+ +--------+ +--------+ +--------+ +--------+ +--------+ +--------+ |
|
||||
| | 10 | | 11 | | 12 | | 13 | | 14 | | 15 | | 16 | | 17 | | 18 | | 19 | |
|
||||
| +--------+ +--------+ +--------+ +--------+ +--------+ +--------+ +--------+ +--------+ +--------+ +--------+ |
|
||||
| +--------+ +--------+ +--------+ +--------+ +--------+ +--------+ +--------+ +--------+ +--------+ +--------+ |
|
||||
| | 32 kB | | 32 kB | | 32 kB | | 32 kB | | 32 kB | | 32 kB | | 32 kB | | 32 kB | | 32 kB | | 32 kB | |
|
||||
| +--------+ +--------+ +--------+ +--------+ +--------+ +--------+ +--------+ +--------+ +--------+ +--------+ |
|
||||
| +--------+ +--------+ +--------+ +--------+ +--------+ +--------+ +--------+ +--------+ +--------+ +--------+ |
|
||||
| | 256 kB | | 256 kB | | 256 kB | | 256 kB | | 256 kB | | 256 kB | | 256 kB | | 256 kB | | 256 kB | | 256 kB | |
|
||||
| +--------+ +--------+ +--------+ +--------+ +--------+ +--------+ +--------+ +--------+ +--------+ +--------+ |
|
||||
| +-----------------------------------------------------------------------------------------------------------+ |
|
||||
| | 25 MB | |
|
||||
| +-----------------------------------------------------------------------------------------------------------+ |
|
||||
+---------------------------------------------------------------------------------------------------------------+
|
||||
\end{Verbatim}
|
||||
\begin{figure}[t]
|
||||
\begin{center}
|
||||
Socket 0:\vspace{0.3cm}
|
||||
|
||||
\begin{tabular}{|l|l|l|l|l|l|l|l|l|l|}
|
||||
\hline 0 & 1 & 2 & 3 & 4 & 5 & 6 & 7 & 8 & 9 \\\hline
|
||||
32 kB & 32 kB & 32 kB & 32 kB & 32 kB & 32 kB & 32 kB & 32 kB & 32
|
||||
kB & 32 kB \\\hline
|
||||
256 kB & 256 kB & 256 kB & 256 kB & 256 kB & 256 kB & 256 kB & 256
|
||||
kB & 256 kB & 256 kB \\\hline
|
||||
\multicolumn{10}{|c|}{25 MB} \\\hline
|
||||
\end{tabular}\vspace{0.8cm}\\
|
||||
Socket 1:\vspace{0.3cm}
|
||||
\begin{tabular}{|l|l|l|l|l|l|l|l|l|l|}
|
||||
\hline 10 & 11 & 12 & 13 & 14 & 15 & 16 & 17 & 18 & 19 \\\hline
|
||||
32 kB & 32 kB & 32 kB & 32 kB & 32 kB & 32 kB & 32 kB & 32 kB & 32
|
||||
kB & 32 kB \\\hline
|
||||
256 kB & 256 kB & 256 kB & 256 kB & 256 kB & 256 kB & 256 kB & 256
|
||||
kB & 256 kB & 256 kB \\\hline
|
||||
\multicolumn{10}{|c|}{25 MB} \\\hline
|
||||
\end{tabular}
|
||||
\end{center}
|
||||
\caption{Cache topology diagram as reported by \texttt{likwid-topology -g}.
|
||||
Byte sizes all in IEC units.}
|
||||
\label{fig:topo}
|
||||
\end{figure}
|
||||
|
||||
\subsection{Memory Access Pattern of \texttt{membench.c}}
|
||||
|
||||
|
@ -221,7 +212,12 @@ implementing the pseudocode, my implementation:
|
|||
\end{figure}
|
||||
|
||||
The results of the matrix multiplication benchmark for the naive, blocked, and
|
||||
BLAS implementations are shown in Figure \ref{fig:bench}.
|
||||
BLAS implementations are shown in Figure~\ref{fig:bench}. The blocked
|
||||
implementation achieves approximately 50\% more FLOPS than the naive
|
||||
implementation thanks to the optimisations in spatial and temporal cache locality
|
||||
described. However, the blocked implementation achieves less than a tenth of
|
||||
the FLOPS of the Intel MKL BLAS-based one due to the microarchitecture
|
||||
optimisations the latter is able to exploit.
|
||||
|
||||
\begin{figure}[t]
|
||||
\includegraphics[width=\textwidth]{timing.pdf}
|
||||
|
|
Reference in New Issue