1/* ----------------------------------------------------------------------------
2
3* GTSAM Copyright 2010, Georgia Tech Research Corporation,
4* Atlanta, Georgia 30332-0415
5* All Rights Reserved
6* Authors: Frank Dellaert, et al. (see THANKS for the full author list)
7
8* See LICENSE for the license information
9* -------------------------------------------------------------------------- */
10
11/**
12* @file TimeTBB.cpp
13* @brief Measure task scheduling overhead in TBB
14* @author Richard Roberts
15* @date November 6, 2013
16*/
17
18#include <gtsam/base/Matrix.h>
19#include <gtsam/global_includes.h>
20
21#include <algorithm>
22#include <iostream>
23#include <map>
24#include <random>
25
26using namespace std;
27using namespace gtsam;
28
29#ifdef GTSAM_USE_TBB
30
31#include <tbb/blocked_range.h> // tbb::blocked_range
32#include <tbb/tick_count.h> // tbb::tick_count
33#include <tbb/parallel_for.h> // tbb::parallel_for
34#include <tbb/cache_aligned_allocator.h> // tbb::cache_aligned_allocator
35#include <tbb/task_arena.h> // tbb::task_arena
36#include <tbb/task_group.h> // tbb::task_group
37
38static const DenseIndex numberOfProblems = 1000000;
39static const DenseIndex problemSize = 4;
40
41typedef Eigen::Matrix<double, problemSize, problemSize> FixedMatrix;
42
43/* ************************************************************************* */
44struct ResultWithThreads
45{
46 typedef map<int, double>::value_type value_type;
47 map<int, double> grainSizesWithoutAllocation;
48 map<int, double> grainSizesWithAllocation;
49};
50
51typedef map<int, ResultWithThreads> Results;
52
53/* ************************************************************************* */
54struct WorkerWithoutAllocation
55{
56 vector<double>& results;
57
58 WorkerWithoutAllocation(vector<double>& results)
59 : results(results), gen(rd()), dis(-1.0, 1.0) {}
60
61 // Copy constructor for TBB
62 WorkerWithoutAllocation(const WorkerWithoutAllocation& other)
63 : results(other.results), gen(rd()), dis(-1.0, 1.0) {}
64
65 void operator()(const tbb::blocked_range<size_t>& r) const
66 {
67 for(size_t i = r.begin(); i != r.end(); ++i)
68 {
69 FixedMatrix m1;
70 FixedMatrix m2;
71 std::generate(m1.data(), m1.data() + m1.size(),
72 [&]() { return dis(gen); });
73 std::generate(m2.data(), m2.data() + m2.size(),
74 [&]() { return dis(gen); });
75 FixedMatrix prod = m1 * m2;
76 results[i] = prod.norm();
77 }
78 }
79 std::random_device rd;
80 mutable std::minstd_rand gen;
81 mutable std::uniform_real_distribution<double> dis;
82};
83
84/* ************************************************************************* */
85map<int, double> testWithoutMemoryAllocation(int num_threads)
86{
87 // A function to do some matrix operations without allocating any memory
88
89 // Create task_arena and task_group
90 tbb::task_arena arena(num_threads);
91 tbb::task_group tg;
92
93 // Now call it
94 vector<double> results(numberOfProblems);
95
96 const vector<size_t> grainSizes = {1, 10, 100, 1000};
97 map<int, double> timingResults;
98 for(size_t grainSize: grainSizes)
99 {
100 tbb::tick_count t0 = tbb::tick_count::now();
101
102 // Run parallel code (as a task group) inside of task arena
103 arena.execute([&] {
104 tg.run_and_wait([&] {
105 tbb::parallel_for(
106 tbb::blocked_range<size_t>(0, numberOfProblems, grainSize),
107 WorkerWithoutAllocation(results));
108 });
109 });
110
111 tbb::tick_count t1 = tbb::tick_count::now();
112 cout << "Without memory allocation, grain size = " << grainSize << ", time = " << (t1 - t0).seconds() << endl;
113 timingResults[(int)grainSize] = (t1 - t0).seconds();
114 }
115
116 return timingResults;
117}
118
119/* ************************************************************************* */
120struct WorkerWithAllocation
121{
122 vector<double>& results;
123
124 WorkerWithAllocation(vector<double>& results)
125 : results(results), gen(rd()), dis(-1.0, 1.0) {}
126
127 // Copy constructor for TBB
128 WorkerWithAllocation(const WorkerWithAllocation& other)
129 : results(other.results), gen(rd()), dis(-1.0, 1.0) {}
130
131 void operator()(const tbb::blocked_range<size_t>& r) const
132 {
133 tbb::cache_aligned_allocator<double> allocator;
134 for(size_t i = r.begin(); i != r.end(); ++i)
135 {
136 double *m1data = allocator.allocate(problemSize * problemSize);
137 Eigen::Map<Matrix> m1(m1data, problemSize, problemSize);
138 double *m2data = allocator.allocate(problemSize * problemSize);
139 Eigen::Map<Matrix> m2(m2data, problemSize, problemSize);
140 double *proddata = allocator.allocate(problemSize * problemSize);
141 Eigen::Map<Matrix> prod(proddata, problemSize, problemSize);
142
143 std::generate(m1.data(), m1.data() + m1.size(),
144 [&]() { return dis(gen); });
145 std::generate(m2.data(), m2.data() + m2.size(),
146 [&]() { return dis(gen); });
147 prod = m1 * m2;
148 results[i] = prod.norm();
149
150 allocator.deallocate(m1data, problemSize * problemSize);
151 allocator.deallocate(m2data, problemSize * problemSize);
152 allocator.deallocate(proddata, problemSize * problemSize);
153 }
154 }
155
156 std::random_device rd;
157 mutable std::minstd_rand gen;
158 mutable std::uniform_real_distribution<double> dis;
159};
160
161/* ************************************************************************* */
162map<int, double> testWithMemoryAllocation(int num_threads)
163{
164 // A function to do some matrix operations with allocating memory
165
166 // Create task_arena and task_group
167 tbb::task_arena arena(num_threads);
168 tbb::task_group tg;
169
170 // Now call it
171 vector<double> results(numberOfProblems);
172
173 const vector<size_t> grainSizes = {1, 10, 100, 1000};
174 map<int, double> timingResults;
175 for(size_t grainSize: grainSizes)
176 {
177 tbb::tick_count t0 = tbb::tick_count::now();
178
179 // Run parallel code (as a task group) inside of task arena
180 arena.execute([&] {
181 tg.run_and_wait([&] {
182 tbb::parallel_for(
183 tbb::blocked_range<size_t>(0, numberOfProblems, grainSize),
184 WorkerWithAllocation(results));
185 });
186 });
187
188 tbb::tick_count t1 = tbb::tick_count::now();
189 cout << "With memory allocation, grain size = " << grainSize << ", time = " << (t1 - t0).seconds() << endl;
190 timingResults[(int)grainSize] = (t1 - t0).seconds();
191 }
192
193 return timingResults;
194}
195
196/* ************************************************************************* */
197int main(int argc, char* argv[])
198{
199 cout << "numberOfProblems = " << numberOfProblems << endl;
200 cout << "problemSize = " << problemSize << endl;
201
202 const vector<int> numThreads = {1, 4, 8};
203 Results results;
204
205 for(size_t n: numThreads)
206 {
207 cout << "With " << n << " threads:" << endl;
208 results[(int)n].grainSizesWithoutAllocation = testWithoutMemoryAllocation((int)n);
209 results[(int)n].grainSizesWithAllocation = testWithMemoryAllocation((int)n);
210 cout << endl;
211 }
212
213 cout << "Summary of results:" << endl;
214 for(const Results::value_type& threads_result: results)
215 {
216 const int threads = threads_result.first;
217 const ResultWithThreads& result = threads_result.second;
218 if(threads != 1)
219 {
220 for(const ResultWithThreads::value_type& grainsize_time: result.grainSizesWithoutAllocation)
221 {
222 const int grainsize = grainsize_time.first;
223 const double speedup = results[1].grainSizesWithoutAllocation[grainsize] / grainsize_time.second;
224 cout << threads << " threads, without allocation, grain size = " << grainsize << ", speedup = " << speedup << endl;
225 }
226 for(const ResultWithThreads::value_type& grainsize_time: result.grainSizesWithAllocation)
227 {
228 const int grainsize = grainsize_time.first;
229 const double speedup = results[1].grainSizesWithAllocation[grainsize] / grainsize_time.second;
230 cout << threads << " threads, with allocation, grain size = " << grainsize << ", speedup = " << speedup << endl;
231 }
232 }
233 }
234
235 return 0;
236}
237
238#else
239
240/* ************************************************************************* */
241int main(int argc, char* argv [])
242{
243 cout << "GTSAM is compiled without TBB, please compile with TBB to use this program." << endl;
244 return 0;
245}
246
247#endif
248