1 | /*************************************************************************** |
---|
2 | * blitz/array/eval.cc Evaluate expression and assign to an array. |
---|
3 | * |
---|
4 | * $Id$ |
---|
5 | * |
---|
6 | * Copyright (C) 1997-2011 Todd Veldhuizen <tveldhui@acm.org> |
---|
7 | * |
---|
8 | * This file is a part of Blitz. |
---|
9 | * |
---|
10 | * Blitz is free software: you can redistribute it and/or modify |
---|
11 | * it under the terms of the GNU Lesser General Public License |
---|
12 | * as published by the Free Software Foundation, either version 3 |
---|
13 | * of the License, or (at your option) any later version. |
---|
14 | * |
---|
15 | * Blitz is distributed in the hope that it will be useful, |
---|
16 | * but WITHOUT ANY WARRANTY; without even the implied warranty of |
---|
17 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the |
---|
18 | * GNU Lesser General Public License for more details. |
---|
19 | * |
---|
20 | * You should have received a copy of the GNU Lesser General Public |
---|
21 | * License along with Blitz. If not, see <http://www.gnu.org/licenses/>. |
---|
22 | * |
---|
23 | * Suggestions: blitz-devel@lists.sourceforge.net |
---|
24 | * Bugs: blitz-support@lists.sourceforge.net |
---|
25 | * |
---|
26 | * For more information, please see the Blitz++ Home Page: |
---|
27 | * https://sourceforge.net/projects/blitz/ |
---|
28 | * |
---|
29 | ****************************************************************************/ |
---|
30 | #ifndef BZ_GLOBEVAL_CC |
---|
31 | #define BZ_GLOBEVAL_CC |
---|
32 | |
---|
33 | #include <blitz/ranks.h> |
---|
34 | #include <blitz/tvevaluate.h> |
---|
35 | #include <blitz/blitz.h> |
---|
36 | |
---|
37 | BZ_NAMESPACE(blitz) |
---|
38 | |
---|
39 | |
---|
40 | // Fast traversals require <set> from the ISO/ANSI C++ standard library |
---|
41 | #ifdef BZ_HAVE_STD |
---|
42 | #ifdef BZ_ARRAY_SPACE_FILLING_TRAVERSAL |
---|
43 | |
---|
44 | |
---|
45 | /** _bz_tryFastTraversal is a helper class. Fast traversals are only |
---|
46 | attempted if the expression looks like a stencil -- it's at least |
---|
47 | three-dimensional, has at least six array operands, and there are |
---|
48 | no index placeholders in the expression. These are all things |
---|
49 | which can be checked at compile time, so the if()/else() syntax |
---|
50 | has been replaced with this class template. |
---|
51 | */ |
---|
52 | template<bool canTryFastTraversal> |
---|
53 | struct _bz_tryFastTraversal { |
---|
54 | template<typename T_numtype, int N_rank, typename T_expr, typename T_update> |
---|
55 | static bool tryFast(Array<T_numtype,N_rank>& array, |
---|
56 | T_expr expr, T_update) |
---|
57 | { |
---|
58 | return false; |
---|
59 | } |
---|
60 | }; |
---|
61 | |
---|
62 | template<> |
---|
63 | struct _bz_tryFastTraversal<true> { |
---|
64 | template<typename T_numtype, int N_rank, typename T_expr, typename T_update> |
---|
65 | static bool tryFast(Array<T_numtype,N_rank>& array, |
---|
66 | T_expr expr, T_update) |
---|
67 | { |
---|
68 | // See if there's an appropriate space filling curve available. |
---|
69 | // Currently fast traversals use an N-1 dimensional curve. The |
---|
70 | // Nth dimension column corresponding to each point on the curve |
---|
71 | // is traversed in the normal fashion. |
---|
72 | TraversalOrderCollection<N_rank-1> traversals; |
---|
73 | TinyVector<int, N_rank - 1> traversalGridSize; |
---|
74 | |
---|
75 | for (int i=0; i < N_rank - 1; ++i) |
---|
76 | traversalGridSize[i] = array.length(array.ordering(i+1)); |
---|
77 | |
---|
78 | #ifdef BZ_DEBUG_TRAVERSE |
---|
79 | cout << "traversalGridSize = " << traversalGridSize << endl; |
---|
80 | cout.flush(); |
---|
81 | #endif |
---|
82 | |
---|
83 | const TraversalOrder<N_rank-1>* order = |
---|
84 | traversals.find(traversalGridSize); |
---|
85 | |
---|
86 | if (order) |
---|
87 | { |
---|
88 | #ifdef BZ_DEBUG_TRAVERSE |
---|
89 | cerr << "Array<" << BZ_DEBUG_TEMPLATE_AS_STRING_LITERAL(T_numtype) |
---|
90 | << ", " << N_rank << ">: Using stack traversal" << endl; |
---|
91 | #endif |
---|
92 | // A curve was available -- use fast traversal. |
---|
93 | array.evaluateWithFastTraversal(*order, expr, T_update()); |
---|
94 | return true; |
---|
95 | } |
---|
96 | |
---|
97 | return false; |
---|
98 | } |
---|
99 | }; |
---|
100 | #endif // BZ_ARRAY_SPACE_FILLING_TRAVERSAL |
---|
101 | #endif // BZ_HAVE_STD |
---|
102 | |
---|
103 | |
---|
/**
 * Rank-dispatched evaluation routines.  The template parameter N selects
 * the implementation: the primary template is declared for the general
 * case and rank 1 gets its own explicit specialization.  Only the
 * declarations appear here; the member templates are defined separately.
 */
template<int N>
struct _bz_evaluator {
    /** Evaluate @p expr into @p dest using stack traversal. */
    template<typename T_dest, typename T_expr, typename T_update>
    static void evaluateWithStackTraversal(T_dest& dest, T_expr expr, T_update);

    /** Evaluate @p expr into @p dest using index traversal. */
    template<typename T_dest, typename T_expr, typename T_update>
    static void evaluateWithIndexTraversal(T_dest& dest, T_expr expr, T_update);
};

/** Rank-1 specialization; same interface as the primary template. */
template<>
struct _bz_evaluator<1> {
    /** Evaluate @p expr into @p dest using stack traversal. */
    template<typename T_dest, typename T_expr, typename T_update>
    static void evaluateWithStackTraversal(T_dest& dest, T_expr expr, T_update);

    /** Evaluate @p expr into @p dest using index traversal. */
    template<typename T_dest, typename T_expr, typename T_update>
    static void evaluateWithIndexTraversal(T_dest& dest, T_expr expr, T_update);
};
118 | |
---|
119 | /** |
---|
120 | Assign an expression to a container. For performance reasons, this |
---|
121 | function forwards to functions implementing one of several traversal |
---|
122 | mechanisms: |
---|
123 | |
---|
124 | - Index traversal scans through the destination array in storage order. |
---|
125 | The expression is evaluated using a TinyVector<int,N> operand. This |
---|
126 | version is used only when there are index placeholders in the expression |
---|
127 | (see <blitz/indexexpr.h>) |
---|
128 | - Stack traversal also scans through the destination array in storage |
---|
129 | order. However, push/pop stack iterators are used. |
---|
130 | - Fast traversal follows a Hilbert (or other) space-filling curve to |
---|
131 | improve cache reuse for stencilling operations. Currently, the |
---|
132 | space filling curves must be generated by calling |
---|
133 | generateFastTraversalOrder(TinyVector<int,N_dimensions>). |
---|
134 | - 2D tiled traversal follows a tiled traversal, to improve cache reuse |
---|
135 | for 2D stencils. Space filling curves have too much overhead to use |
---|
136 | in two-dimensions. |
---|
137 | */ |
---|
138 | template<typename T_dest, typename T_expr, typename T_update> |
---|
139 | _bz_forceinline void |
---|
140 | _bz_evaluate(T_dest& dest, T_expr expr, T_update) |
---|
141 | { |
---|
142 | typedef typename T_dest::T_numtype T_numtype; |
---|
143 | const int N_rank = T_dest::rank_; |
---|
144 | |
---|
145 | // Check that all arrays have the same shape |
---|
146 | #ifdef BZ_DEBUG |
---|
147 | if (!expr.shapeCheck(dest.shape())) |
---|
148 | { |
---|
149 | if (assertFailMode == false) |
---|
150 | { |
---|
151 | cerr << "[Blitz++] Shape check failed: Module " << __FILE__ |
---|
152 | << " line " << __LINE__ << endl |
---|
153 | << " Expression: "; |
---|
154 | prettyPrintFormat format(true); // Use terse formatting |
---|
155 | BZ_STD_SCOPE(string) str; |
---|
156 | expr.prettyPrint(str, format); |
---|
157 | cerr << str << endl ; |
---|
158 | } |
---|
159 | |
---|
160 | #if 0 |
---|
161 | // Shape dumping is broken by change to using string for prettyPrint |
---|
162 | << " Shapes: " << shape() << " = "; |
---|
163 | prettyPrintFormat format2; |
---|
164 | format2.setDumpArrayShapesMode(); |
---|
165 | expr.prettyPrint(cerr, format2); |
---|
166 | cerr << endl; |
---|
167 | #endif |
---|
168 | BZ_PRE_FAIL; |
---|
169 | } |
---|
170 | #endif |
---|
171 | |
---|
172 | BZPRECHECK(expr.shapeCheck(dest.shape()), |
---|
173 | "Shape check failed." << endl << "Expression:"); |
---|
174 | |
---|
175 | BZPRECHECK((T_expr::rank_ == T_dest::rank_) || |
---|
176 | (T_expr::numArrayOperands == 0), |
---|
177 | "Assigned rank " << T_expr::rank_ << " expression to rank " |
---|
178 | << T_dest::rank_ << " array."); |
---|
179 | |
---|
180 | /* |
---|
181 | * Check that the arrays are not empty (e.g. length 0 arrays) |
---|
182 | * This fixes a bug found by Peter Bienstman, 6/16/99, where |
---|
183 | * Array<double,2> A(0,0),B(0,0); B=A(tensor::j,tensor::i); |
---|
184 | * went into an infinite loop. |
---|
185 | */ |
---|
186 | |
---|
187 | const sizeType n = dest.numElements(); |
---|
188 | if (n == 0) { |
---|
189 | #ifdef BZ_DEBUG_TRAVERSE |
---|
190 | BZ_DEBUG_MESSAGE("Evaluating empty array, nothing to do"); |
---|
191 | #endif |
---|
192 | return; |
---|
193 | } |
---|
194 | // \todo this does not alvays compile, so eliminate for now. |
---|
195 | // if (n == 1) { |
---|
196 | // // shortcut here since it's easy |
---|
197 | // T_update::update(*dest.dataFirst(), expr(expr.lbound())); |
---|
198 | // return; |
---|
199 | // } |
---|
200 | |
---|
201 | #ifdef BZ_DEBUG_TRAVERSE |
---|
202 | BZ_DEBUG_MESSAGE( "T_expr::numIndexPlaceholders = " << T_expr::numIndexPlaceholders); |
---|
203 | #endif |
---|
204 | |
---|
205 | // Tau profiling code. Provide Tau with a pretty-printed version of |
---|
206 | // the expression. |
---|
207 | // NEEDS_WORK-- use a static initializer somehow. |
---|
208 | |
---|
209 | #ifdef BZ_TAU_PROFILING |
---|
210 | static BZ_STD_SCOPE(string) exprDescription; |
---|
211 | if (!exprDescription.length()) // faked static initializer |
---|
212 | { |
---|
213 | exprDescription = "A"; |
---|
214 | prettyPrintFormat format(true); // Terse mode on |
---|
215 | format.nextArrayOperandSymbol(); |
---|
216 | T_update::prettyPrint(exprDescription); |
---|
217 | expr.prettyPrint(exprDescription, format); |
---|
218 | } |
---|
219 | TAU_PROFILE(" ", exprDescription, TAU_BLITZ); |
---|
220 | #endif |
---|
221 | |
---|
222 | // Determine which evaluation mechanism to use |
---|
223 | if (T_expr::numIndexPlaceholders > 0) |
---|
224 | { |
---|
225 | // The expression involves index placeholders, so have to |
---|
226 | // use index traversal rather than stack traversal. |
---|
227 | |
---|
228 | _bz_evaluator<T_dest::rank_>::evaluateWithIndexTraversal(dest, expr, T_update()); |
---|
229 | return; |
---|
230 | } |
---|
231 | else { |
---|
232 | |
---|
233 | // If this expression looks like an array stencil, then attempt to |
---|
234 | // use a fast traversal order. |
---|
235 | // Fast traversals require <set> from the ISO/ANSI C++ standard |
---|
236 | // library. |
---|
237 | |
---|
238 | #ifdef BZ_HAVE_STD |
---|
239 | #ifdef BZ_ARRAY_SPACE_FILLING_TRAVERSAL |
---|
240 | |
---|
241 | enum { isStencil = (N_rank >= 3) && (T_expr::numArrayOperands > 6) |
---|
242 | && (T_expr::numIndexPlaceholders == 0) }; |
---|
243 | |
---|
244 | if (_bz_tryFastTraversal<isStencil>::tryFast(dest, expr, T_update())) |
---|
245 | return; |
---|
246 | |
---|
247 | #endif |
---|
248 | #endif |
---|
249 | |
---|
250 | #ifdef BZ_ARRAY_2D_STENCIL_TILING |
---|
251 | // Does this look like a 2-dimensional stencil on a largeish |
---|
252 | // array? |
---|
253 | |
---|
254 | if ((N_rank == 2) && (T_expr::numArrayOperands >= 5)) |
---|
255 | { |
---|
256 | // Use a heuristic to determine whether a tiled traversal |
---|
257 | // is desirable. First, estimate how much L1 cache is needed |
---|
258 | // to achieve a high hit rate using the stack traversal. |
---|
259 | // Try to err on the side of using tiled traversal even when |
---|
260 | // it isn't strictly needed. |
---|
261 | |
---|
262 | // Assumptions: |
---|
263 | // Stencil width 3 |
---|
264 | // 3 arrays involved in stencil |
---|
265 | // Uniform data type in arrays (all T_numtype) |
---|
266 | |
---|
267 | int cacheNeeded = 3 * 3 * sizeof(T_numtype) * dest.length(dest.ordering(0)); |
---|
268 | if (cacheNeeded > BZ_L1_CACHE_ESTIMATED_SIZE) { |
---|
269 | _bz_evaluateWithTiled2DTraversal(dest, expr, T_update()); |
---|
270 | return; |
---|
271 | } |
---|
272 | } |
---|
273 | |
---|
274 | #endif |
---|
275 | |
---|
276 | // If fast traversal isn't available or appropriate, then just |
---|
277 | // do a stack traversal. |
---|
278 | //#pragma forceinline recursive |
---|
279 | _bz_evaluator<T_dest::rank_>::evaluateWithStackTraversal(dest, expr, T_update()); |
---|
280 | return; |
---|
281 | } |
---|
282 | } |
---|
283 | |
---|
284 | /** This class performs the vectorized update through the update() |
---|
285 | method. It is a class because it is specialized to do nothing for |
---|
286 | instances where the simd vector width is 1. This avoids tricky |
---|
287 | infinite template recursions on multicomponent containers. */ |
---|
288 | template<typename T_numtype, typename T_expr, typename T_update, int N> |
---|
289 | struct chunked_updater { |
---|
290 | |
---|
291 | static _bz_forceinline void |
---|
292 | aligned_update(T_numtype* data, T_expr expr, diffType i) { |
---|
293 | |
---|
294 | const bool unroll = N < BZ_TV_EVALUATE_UNROLL_LENGTH; |
---|
295 | _tv_evaluator<unroll, N>::evaluate_aligned |
---|
296 | (data+i, expr.template fastRead_tv<N>(i), T_update()); |
---|
297 | }; |
---|
298 | |
---|
299 | static _bz_forceinline void |
---|
300 | unaligned_update(T_numtype* data, T_expr expr, diffType i) { |
---|
301 | const bool unroll = N < BZ_TV_EVALUATE_UNROLL_LENGTH; |
---|
302 | _tv_evaluator<unroll, N>::evaluate_unaligned |
---|
303 | (data+i, expr.template fastRead_tv<N>(i), T_update()); |
---|
304 | }; |
---|
305 | |
---|
306 | }; |
---|
307 | |
---|
308 | /** specialization ensures we don't try to instantiate chunked_updates |
---|
309 | for types with a vecWidth of 1, as this leads to infinite template |
---|
310 | instantiation recursion. */ |
---|
311 | template<typename T_numtype, typename T_expr, typename T_update> |
---|
312 | struct chunked_updater<T_numtype, T_expr, T_update, 1> { |
---|
313 | static _bz_forceinline void |
---|
314 | aligned_update(T_numtype* data, T_expr expr, diffType i) { |
---|
315 | BZPRECONDITION(0); }; |
---|
316 | static _bz_forceinline void |
---|
317 | unaligned_update(T_numtype* data, T_expr expr, diffType i) { |
---|
318 | BZPRECONDITION(0); }; |
---|
319 | }; |
---|
320 | |
---|
321 | |
---|
322 | /** A metaprogram that uses the chunked_updater to assign an |
---|
323 | unknown-length expression to a pointer by unrolling in a binary |
---|
324 | fashion. This way, we can get "almost-compile-time" unrolling of a |
---|
325 | length only known at runtime. I+1 is the number of significant |
---|
326 | bits in the longest length to consider. In this way, assigning a |
---|
327 | vector of length 7 is I=2 and will take 3 operations. The |
---|
328 | metaprogram counts down, that way it will start with large updates |
---|
329 | which will be aligned for aligned expressions. */ |
---|
330 | template<int I> |
---|
331 | class _bz_meta_binaryAssign { |
---|
332 | public: |
---|
333 | template<typename T_data, typename T_expr, typename T_update> |
---|
334 | static _bz_forceinline void assign(T_data* data, T_expr expr, |
---|
335 | diffType ubound, diffType pos, |
---|
336 | T_update) { |
---|
337 | if(ubound&(1<<I)) { |
---|
338 | chunked_updater<T_data, T_expr, T_update, 1<<I >:: |
---|
339 | unaligned_update(data, expr, pos); |
---|
340 | pos += (1<<I); |
---|
341 | } |
---|
342 | _bz_meta_binaryAssign<I-1>::assign(data, expr, ubound, pos, T_update()); |
---|
343 | } |
---|
344 | |
---|
345 | }; |
---|
346 | |
---|
347 | /** Partial specialization for bit 0 uses the scalar update. */ |
---|
348 | template<> |
---|
349 | class _bz_meta_binaryAssign<0> { |
---|
350 | public: |
---|
351 | template<typename T_data, typename T_expr, typename T_update> |
---|
352 | static _bz_forceinline void assign(T_data* data, T_expr expr, |
---|
353 | diffType ubound, diffType pos, |
---|
354 | T_update) { |
---|
355 | if(ubound&1) { |
---|
356 | T_update::update(data[pos], expr.fastRead(pos)); |
---|
357 | ++pos; |
---|
358 | } |
---|
359 | // this ends the metaprogram. |
---|
360 | } |
---|
361 | }; |
---|
362 | |
---|
363 | /** Unit-stride evaluator, which takes pre-computed destination and |
---|
364 | bounds and just does the unit-stride evaluation for a specified |
---|
365 | length. This is essentially a 1D operation used by both the rank-1 |
---|
366 | and rank-N traversals, so it's common to both. This can use |
---|
367 | vectorized update, so if both dest and expr are unit stride, we |
---|
368 | redirect here. This function then deals with unaligned or |
---|
369 | misaligned situations. There is no explicit unrolling option here, |
---|
370 | since it's already vectorized using the chunk_updater. \todo Would |
---|
371 | it be useful to retain the unrolled loop for scalar |
---|
372 | architectures? */ |
---|
373 | template<typename T_dest, typename T_expr, typename T_update> |
---|
374 | _bz_forceinline void |
---|
375 | _bz_evaluateWithUnitStride(T_dest& dest, typename T_dest::T_iterator& iter, |
---|
376 | T_expr expr, diffType ubound, T_update) |
---|
377 | { |
---|
378 | typedef typename T_dest::T_numtype T_numtype; |
---|
379 | T_numtype* restrict data = const_cast<T_numtype*>(iter.data()); |
---|
380 | diffType i=0; |
---|
381 | |
---|
382 | #ifdef BZ_DEBUG_TRAVERSE |
---|
383 | BZ_DEBUG_MESSAGE("\tunit stride expression with length: "<< ubound << "."); |
---|
384 | #endif |
---|
385 | |
---|
386 | // If the minWidth is set to 0, there are elements in the expression |
---|
387 | // which can NOT use the vectorized expression (i.e, stencils). In |
---|
388 | // that case, we fall through to the scalar loop |
---|
389 | const bool unvectorizable = (T_expr::minWidth==0); |
---|
390 | |
---|
391 | if(!unvectorizable && (ubound < 1<<BZ_MAX_BITS_FOR_BINARY_UNROLL)) { |
---|
392 | // for short expressions, it's more important to lose |
---|
393 | // overhead. Single-element ones have already been dealt with, but |
---|
394 | // for lengths that are have fewer significant bits than |
---|
395 | // max_bits_for_unroll we do a binary-style unroll here. (We don't |
---|
396 | // worry about simd widths either, because we essentially just |
---|
397 | // present the compiler with a vectorizable view. It will do |
---|
398 | // sensible things even if the expressions are not vectorizable.) |
---|
399 | #ifdef BZ_DEBUG_TRAVERSE |
---|
400 | BZ_DEBUG_MESSAGE("\tshort expression, using binary meta-unroll assignment."); |
---|
401 | #endif |
---|
402 | |
---|
403 | _bz_meta_binaryAssign<BZ_MAX_BITS_FOR_BINARY_UNROLL-1>:: |
---|
404 | assign(data, expr, ubound, 0, T_update()); |
---|
405 | return; |
---|
406 | } |
---|
407 | |
---|
408 | // calculate uneven elements at the beginning of dest |
---|
409 | const diffType uneven_start=simdTypes<T_numtype>::offsetToAlignment(data); |
---|
410 | |
---|
411 | // we can only guarantee alignment if all operands have the same |
---|
412 | // width and are not mutually misaligned |
---|
413 | const bool can_align = |
---|
414 | (T_expr::minWidth == T_expr::maxWidth) && |
---|
415 | (T_expr::minWidth == int(simdTypes<T_numtype>::vecWidth)) && |
---|
416 | expr.isVectorAligned(uneven_start); |
---|
417 | |
---|
418 | // When we come out here, we KNOW that expressions shorter than |
---|
419 | // 1<<BZ_MAX_BITS_FOR_BINARY_UNROLL have been taken care of. At that |
---|
420 | // point, it is efficient to effectively unroll the loop using a |
---|
421 | // vector width larger than the simd width. |
---|
422 | const int loop_width= BZ_VECTORIZED_LOOP_WIDTH; |
---|
423 | |
---|
424 | #ifdef BZ_DEBUG_TRAVERSE |
---|
425 | if(T_expr::minWidth!=T_expr::maxWidth) { |
---|
426 | BZ_DEBUG_MESSAGE("\texpression has mixed width: " << T_expr::minWidth << "-" <<T_expr::maxWidth); |
---|
427 | } else { |
---|
428 | BZ_DEBUG_MESSAGE("\texpression SIMD width: " << T_expr::minWidth); |
---|
429 | } |
---|
430 | BZ_DEBUG_MESSAGE("\tdestination SIMD width: " << simdTypes<T_numtype>::vecWidth); |
---|
431 | if(loop_width>1) { |
---|
432 | if(!expr.isVectorAligned(uneven_start)) { |
---|
433 | BZ_DEBUG_MESSAGE("\toperands have different alignments"); |
---|
434 | } |
---|
435 | if(!can_align) { |
---|
436 | BZ_DEBUG_MESSAGE("\tcannot guarantee alignment - using unaligned vectorization") |
---|
437 | } else { |
---|
438 | BZ_DEBUG_MESSAGE("\texpression can be aligned"); |
---|
439 | } |
---|
440 | if(loop_width<=ubound) { |
---|
441 | BZ_DEBUG_MESSAGE("\tusing vectorization width " << loop_width); |
---|
442 | } else { |
---|
443 | BZ_DEBUG_MESSAGE("\texpression not long enough to be vectorized"); |
---|
444 | } |
---|
445 | } else { |
---|
446 | BZ_DEBUG_MESSAGE("\texpression cannot be vectorized"); |
---|
447 | } |
---|
448 | #endif |
---|
449 | |
---|
450 | |
---|
451 | if(!unvectorizable && (loop_width>1)) { |
---|
452 | // If the expression can be aligned, we do so. |
---|
453 | if(can_align) { |
---|
454 | #ifdef BZ_DEBUG_TRAVERSE |
---|
455 | if(i<uneven_start) { |
---|
456 | BZ_DEBUG_MESSAGE("\tscalar loop for " << uneven_start << " unaligned starting elements"); |
---|
457 | } |
---|
458 | #endif |
---|
459 | #ifdef BZ_USE_ALIGNMENT_PRAGMAS |
---|
460 | #pragma ivdep |
---|
461 | #endif |
---|
462 | for (; i < uneven_start; ++i) |
---|
463 | //#pragma forceinline recursive |
---|
464 | T_update::update(data[i], expr.fastRead(i)); |
---|
465 | |
---|
466 | // and then the vectorized part |
---|
467 | #ifdef BZ_DEBUG_TRAVERSE |
---|
468 | if(i<=ubound-loop_width) { |
---|
469 | BZ_DEBUG_MESSAGE("\taligned vectorized loop with width " << loop_width << " starting at " << i); |
---|
470 | } |
---|
471 | #endif |
---|
472 | for (; i <= ubound-loop_width; i+=loop_width) |
---|
473 | //#pragma forceinline recursive |
---|
474 | chunked_updater<T_numtype, T_expr, T_update, loop_width>:: |
---|
475 | aligned_update(data, expr, i); |
---|
476 | } |
---|
477 | else { |
---|
478 | // if we can not line up the expressions, alignment doesn't |
---|
479 | // matter and we just start using unaligned vectorized |
---|
480 | // instructions from element 0 |
---|
481 | #ifdef BZ_DEBUG_TRAVERSE |
---|
482 | if(i<=ubound-loop_width) { |
---|
483 | BZ_DEBUG_MESSAGE("\tunaligned vectorized loop with width " << loop_width << " starting at " << i); |
---|
484 | } |
---|
485 | #endif |
---|
486 | for (; i <= ubound-loop_width; i+=loop_width) |
---|
487 | //#pragma forceinline recursive |
---|
488 | chunked_updater<T_numtype, T_expr, T_update, loop_width>:: |
---|
489 | unaligned_update(data, expr, i); |
---|
490 | } |
---|
491 | } |
---|
492 | |
---|
493 | // now complete the loop with the tailing scalar elements not done |
---|
494 | // in the chunked loop. |
---|
495 | #ifdef BZ_DEBUG_TRAVERSE |
---|
496 | if(i<ubound) { |
---|
497 | BZ_DEBUG_MESSAGE("\tscalar loop for " << ubound-i << " trailing elements starting at " << i); |
---|
498 | } |
---|
499 | #endif |
---|
500 | #ifdef BZ_USE_ALIGNMENT_PRAGMAS |
---|
501 | #pragma ivdep |
---|
502 | #endif |
---|
503 | for (; i < ubound; ++i) |
---|
504 | //#pragma forceinline recursive |
---|
505 | T_update::update(data[i], expr.fastRead(i)); |
---|
506 | |
---|
507 | #ifdef BZ_DEBUG_TRAVERSE |
---|
508 | BZ_DEBUG_MESSAGE("\tunit stride evaluation done") |
---|
509 | #endif |
---|
510 | } |
---|
511 | |
---|
512 | |
---|
513 | /** Common-stride evaluator. Used for common but non-unit |
---|
514 | strides. Note that the stride can be negative, so we need to use a |
---|
515 | signed type. */ |
---|
516 | template<typename T_dest, typename T_expr, typename T_update> |
---|
517 | _bz_forceinline void |
---|
518 | _bz_evaluateWithCommonStride(T_dest& dest, typename T_dest::T_iterator& iter, |
---|
519 | T_expr expr, diffType ubound, |
---|
520 | diffType commonStride, |
---|
521 | T_update) |
---|
522 | { |
---|
523 | #ifdef BZ_DEBUG_TRAVERSE |
---|
524 | BZ_DEBUG_MESSAGE("\tcommon stride = " << commonStride); |
---|
525 | #endif |
---|
526 | |
---|
527 | typedef typename T_dest::T_numtype T_numtype; |
---|
528 | T_numtype* restrict data = const_cast<T_numtype*>(iter.data()); |
---|
529 | |
---|
530 | #ifndef BZ_ARRAY_STACK_TRAVERSAL_UNROLL |
---|
531 | # ifdef BZ_USE_ALIGNMENT_PRAGMAS |
---|
532 | # pragma ivdep |
---|
533 | # endif |
---|
534 | for (diffType i=0; i != ubound; i += commonStride) |
---|
535 | T_update::update(data[i], expr.fastRead(i)); |
---|
536 | #else |
---|
537 | diffType n1 = (dest.length(firstRank) & 3) * commonStride; |
---|
538 | |
---|
539 | diffType i = 0; |
---|
540 | for (; i != n1; i += commonStride) |
---|
541 | T_update::update(data[i], expr.fastRead(i)); |
---|
542 | |
---|
543 | diffType strideInc = 4 * commonStride; |
---|
544 | for (; i != ubound; i += strideInc) |
---|
545 | { |
---|
546 | T_update::update(data[i], expr.fastRead(i)); |
---|
547 | diffType i2 = i + commonStride; |
---|
548 | T_update::update(data[i2], expr.fastRead(i2)); |
---|
549 | diffType i3 = i + 2 * commonStride; |
---|
550 | T_update::update(data[i3], expr.fastRead(i3)); |
---|
551 | diffType i4 = i + 3 * commonStride; |
---|
552 | T_update::update(data[i4], expr.fastRead(i4)); |
---|
553 | } |
---|
554 | #endif // BZ_ARRAY_STACK_TRAVERSAL_UNROLL |
---|
555 | return; |
---|
556 | } |
---|
557 | |
---|
558 | |
---|
559 | /* 1-d stack traversal evaluation. Forwards to evaluateWithUnitStride |
---|
560 | or evaluateWithCommonStride, if applicable, otherwise does the slow |
---|
561 | different-stride update. */ |
---|
562 | template<typename T_dest, typename T_expr, typename T_update> |
---|
563 | _bz_forceinline void |
---|
564 | _bz_evaluator<1>:: |
---|
565 | evaluateWithStackTraversal(T_dest& dest, T_expr expr, T_update) |
---|
566 | { |
---|
567 | #ifdef BZ_DEBUG_TRAVERSE |
---|
568 | BZ_DEBUG_MESSAGE("_bz_evaluator<1>: Using stack traversal"); |
---|
569 | #endif |
---|
570 | |
---|
571 | typename T_dest::T_iterator iter(dest); |
---|
572 | |
---|
573 | // if we only have one element, strides don't matter. In that case, |
---|
574 | // we just evaluate that right now so we don't have to deal with it. |
---|
575 | if(dest.length(firstRank)==1) { |
---|
576 | #ifdef BZ_DEBUG_TRAVERSE |
---|
577 | BZ_DEBUG_MESSAGE("\tshortcutting evaluation of single-element expression"); |
---|
578 | #endif |
---|
579 | T_update::update(*const_cast<typename T_dest::T_numtype*>(iter.data()), *expr); |
---|
580 | return; |
---|
581 | } |
---|
582 | |
---|
583 | iter.loadStride(firstRank); |
---|
584 | expr.loadStride(firstRank); |
---|
585 | |
---|
586 | const bool useUnitStride = iter.isUnitStride() |
---|
587 | && expr.isUnitStride(); |
---|
588 | |
---|
589 | if(useUnitStride) { |
---|
590 | const diffType ubound = dest.length(firstRank); |
---|
591 | _bz_evaluateWithUnitStride(dest, iter, expr, ubound, T_update()); |
---|
592 | return; |
---|
593 | } |
---|
594 | |
---|
595 | #ifdef BZ_ARRAY_EXPR_USE_COMMON_STRIDE |
---|
596 | diffType commonStride = expr.suggestStride(firstRank); |
---|
597 | if (iter.suggestStride(firstRank) > commonStride) |
---|
598 | commonStride = iter.suggestStride(firstRank); |
---|
599 | bool useCommonStride = iter.isStride(firstRank,commonStride) |
---|
600 | && expr.isStride(firstRank,commonStride); |
---|
601 | #else |
---|
602 | diffType commonStride = 1; |
---|
603 | bool useCommonStride = false; |
---|
604 | #endif |
---|
605 | |
---|
606 | if (useCommonStride) { |
---|
607 | const diffType ubound = dest.length(firstRank) * commonStride; |
---|
608 | _bz_evaluateWithCommonStride(dest, iter, expr, ubound, commonStride, T_update()); |
---|
609 | return; |
---|
610 | } |
---|
611 | |
---|
612 | #ifdef BZ_DEBUG_TRAVERSE |
---|
613 | BZ_DEBUG_MESSAGE("\tnot common stride"); |
---|
614 | #endif |
---|
615 | |
---|
616 | // not common stride |
---|
617 | typedef typename T_dest::T_numtype T_numtype; |
---|
618 | const T_numtype * last = iter.data() + dest.length(firstRank) |
---|
619 | * dest.stride(firstRank); |
---|
620 | |
---|
621 | while (iter.data() != last) |
---|
622 | { |
---|
623 | T_update::update(*const_cast<T_numtype*>(iter.data()), *expr); |
---|
624 | iter.advance(); |
---|
625 | expr.advance(); |
---|
626 | } |
---|
627 | } |
---|
628 | |
---|
629 | |
---|
630 | /** |
---|
631 | Perform a stack traversal of a rank >1 expression. A stack |
---|
632 | traversal replaces the usual nested loops: |
---|
633 | |
---|
634 | for (int i=A.lbound(firstDim); i <= A.ubound(firstDim); ++i) |
---|
635 | for (int j=A.lbound(secondDim); j <= A.ubound(secondDim); ++j) |
---|
636 | for (int k=A.lbound(thirdDim); k <= A.ubound(thirdDim); ++k) |
---|
637 | A(i,j,k) = 0; |
---|
638 | |
---|
639 | with a stack data structure. The stack allows this single routine |
---|
640 | to replace any number of nested loops. |
---|
641 | |
---|
642 | For each dimension (loop), these quantities are needed: |
---|
643 | - a pointer to the first element encountered in the loop |
---|
644 | - the stride associated with the dimension/loop |
---|
645 | - a pointer to the last element encountered in the loop |
---|
646 | |
---|
647 | The basic idea is that entering each loop is a "push" onto the |
---|
648 | stack, and exiting each loop is a "pop". In practice, this |
---|
649 | routine accesses the stack in a random-access way, |
---|
650 | which confuses the picture a bit. But conceptually, that's |
---|
651 | what is going on. |
---|
652 | |
---|
653 | ordering(0) gives the dimension associated with the smallest |
---|
654 | stride (usually; the exceptions have to do with subarrays and |
---|
655 | are uninteresting). We call this dimension maxRank; it will |
---|
656 | become the innermost "loop". |
---|
657 | |
---|
658 | Ordering the loops from ordering(N_rank-1) down to |
---|
659 | ordering(0) ensures that the largest stride is associated |
---|
660 | with the outermost loop, and the smallest stride with the |
---|
661 | innermost. This is critical for good performance on |
---|
662 | cached machines. |
---|
663 | */ |
---|
664 | |
---|
665 | template<int N> |
---|
666 | template<typename T_dest, typename T_expr, typename T_update> |
---|
667 | _bz_forceinline void |
---|
668 | _bz_evaluator<N>:: |
---|
669 | evaluateWithStackTraversal(T_dest& dest, T_expr expr, T_update) |
---|
670 | { |
---|
671 | #ifdef BZ_DEBUG_TRAVERSE |
---|
672 | BZ_DEBUG_MESSAGE("_bz_evaluator<" << N << ">: Using stack traversal"); |
---|
673 | #endif |
---|
674 | |
---|
675 | typedef typename T_dest::T_numtype T_numtype; |
---|
676 | const int N_rank = T_dest::rank(); |
---|
677 | |
---|
678 | const int maxRank = dest.ordering(0); |
---|
679 | // const int secondLastRank = ordering(1); |
---|
680 | |
---|
681 | // Create an iterator for the array receiving the result |
---|
682 | typename T_dest::T_iterator iter(dest); |
---|
683 | |
---|
684 | // Set the initial stack configuration by pushing the pointer |
---|
685 | // to the first element of the array onto the stack N times. |
---|
686 | |
---|
687 | int i; |
---|
688 | for (i=1; i < N_rank; ++i) |
---|
689 | { |
---|
690 | iter.push(i); |
---|
691 | expr.push(i); |
---|
692 | } |
---|
693 | |
---|
694 | // Load the strides associated with the innermost loop. |
---|
695 | iter.loadStride(maxRank); |
---|
696 | expr.loadStride(maxRank); |
---|
697 | |
---|
698 | /* |
---|
699 | * Is the stride in the innermost loop equal to 1? If so, |
---|
700 | * we might take advantage of this and generate more |
---|
701 | * efficient code. |
---|
702 | */ |
---|
703 | const bool useUnitStride = iter.isUnitStride() |
---|
704 | && expr.isUnitStride(); |
---|
705 | |
---|
706 | /* |
---|
707 | * Do all array operands share a common stride in the innermost |
---|
708 | * loop? If so, we can generate more efficient code (but only |
---|
709 | * if this optimization has been enabled). |
---|
710 | */ |
---|
711 | #ifdef BZ_ARRAY_EXPR_USE_COMMON_STRIDE |
---|
712 | diffType commonStride = expr.suggestStride(maxRank); |
---|
713 | if (iter.suggestStride(maxRank) > commonStride) |
---|
714 | commonStride = iter.suggestStride(maxRank); |
---|
715 | bool useCommonStride = iter.isStride(maxRank,commonStride) |
---|
716 | && expr.isStride(maxRank,commonStride); |
---|
717 | |
---|
718 | #ifdef BZ_DEBUG_TRAVERSE |
---|
719 | BZ_DEBUG_MESSAGE("BZ_ARRAY_EXPR_USE_COMMON_STRIDE" << endl |
---|
720 | << "commonStride = " << commonStride << " useCommonStride = " |
---|
721 | << useCommonStride); |
---|
722 | #endif |
---|
723 | |
---|
724 | #else |
---|
725 | const diffType commonStride = 1; |
---|
726 | const bool useCommonStride = false; |
---|
727 | #endif |
---|
728 | |
---|
729 | /* |
---|
730 | * The "last" array contains a pointer to the last element |
---|
731 | * encountered in each "loop". |
---|
732 | */ |
---|
733 | const T_numtype* last[T_dest::rank_]; |
---|
734 | |
---|
735 | // Set up the initial state of the "last" array |
---|
736 | for (i=1; i < N_rank; ++i) |
---|
737 | last[i] = iter.data() + dest.length(dest.ordering(i)) * dest.stride(dest.ordering(i)); |
---|
738 | |
---|
739 | diffType lastLength = dest.length(maxRank); |
---|
740 | int firstNoncollapsedLoop = 1; |
---|
741 | |
---|
742 | #ifdef BZ_COLLAPSE_LOOPS |
---|
743 | |
---|
744 | /* |
---|
745 | * This bit of code handles collapsing loops. When possible, |
---|
746 | * the N nested loops are converted into a single loop (basically, |
---|
747 | * the N-dimensional array is treated as a long vector). |
---|
748 | * This is important for cases where the length of the innermost |
---|
749 | * loop is very small, for example a 100x100x3 array. |
---|
750 | * If this code can't collapse all the loops into a single loop, |
---|
751 | * it will collapse as many loops as possible starting from the |
---|
752 | * innermost and working out. |
---|
753 | */ |
---|
754 | |
---|
755 | // Collapse loops when possible |
---|
756 | for (i=1; i < N_rank; ++i) |
---|
757 | { |
---|
758 | // Figure out which pair of loops we are considering combining. |
---|
759 | int outerLoopRank = iter.ordering(i); |
---|
760 | int innerLoopRank = iter.ordering(i-1); |
---|
761 | |
---|
762 | /* |
---|
763 | * The canCollapse() routines look at the strides and extents |
---|
764 | * of the loops, and determine if they can be combined into |
---|
765 | * one loop. |
---|
766 | */ |
---|
767 | |
---|
768 | if (iter.canCollapse(outerLoopRank,innerLoopRank) |
---|
769 | && expr.canCollapse(outerLoopRank,innerLoopRank)) |
---|
770 | { |
---|
771 | #ifdef BZ_DEBUG_TRAVERSE |
---|
772 | cout << "Collapsing " << outerLoopRank << " and " |
---|
773 | << innerLoopRank << endl; |
---|
774 | #endif |
---|
775 | lastLength *= dest.length(outerLoopRank); |
---|
776 | firstNoncollapsedLoop = i+1; |
---|
777 | } |
---|
778 | else |
---|
779 | break; |
---|
780 | } |
---|
781 | |
---|
782 | #endif // BZ_COLLAPSE_LOOPS |
---|
783 | |
---|
784 | /* |
---|
785 | * Now we actually perform the loops. This while loop contains |
---|
786 | * two parts: first, the innermost loop is performed. Then we |
---|
787 | * exit the loop, and pop our way down the stack until we find |
---|
788 | * a loop that isn't completed. We then restart the inner loops |
---|
789 | * and push them onto the stack. |
---|
790 | */ |
---|
791 | |
---|
792 | while (true) { |
---|
793 | |
---|
794 | /* |
---|
795 | * This bit of code handles the innermost loop. It just uses |
---|
796 | * the separate evaluation functions depending on the |
---|
797 | * stride. */ |
---|
798 | |
---|
799 | diffType ubound = lastLength * commonStride; |
---|
800 | |
---|
801 | if (useUnitStride || useCommonStride) { |
---|
802 | if(useUnitStride) |
---|
803 | _bz_evaluateWithUnitStride(dest, iter, expr, ubound, T_update()); |
---|
804 | else |
---|
805 | _bz_evaluateWithCommonStride(dest, iter, expr, ubound, commonStride, |
---|
806 | T_update()); |
---|
807 | |
---|
808 | /* |
---|
809 | * Tidy up for the fact that we haven't actually been |
---|
810 | * incrementing the iterators in the innermost loop, by |
---|
811 | * faking it afterward. |
---|
812 | */ |
---|
813 | iter.advance(lastLength * commonStride); |
---|
814 | expr.advance(lastLength * commonStride); |
---|
815 | } |
---|
816 | else { |
---|
817 | /* |
---|
818 | * We don't have a unit stride or common stride in the innermost |
---|
819 | * loop. This is going to hurt performance. Luckily 95% of |
---|
820 | * the time, we hit the cases above. |
---|
821 | */ |
---|
822 | T_numtype * restrict end = const_cast<T_numtype*>(iter.data()) |
---|
823 | + lastLength * dest.stride(maxRank); |
---|
824 | |
---|
825 | while (iter.data() != end) |
---|
826 | { |
---|
827 | T_update::update(*const_cast<T_numtype*>(iter.data()), *expr); |
---|
828 | iter.advance(); |
---|
829 | expr.advance(); |
---|
830 | } |
---|
831 | } |
---|
832 | |
---|
833 | |
---|
834 | /* |
---|
835 | * We just finished the innermost loop. Now we pop our way down |
---|
836 | * the stack, until we hit a loop that hasn't completed yet. |
---|
837 | */ |
---|
838 | int j = firstNoncollapsedLoop; |
---|
839 | for (; j < N_rank; ++j) |
---|
840 | { |
---|
841 | // Get the next loop |
---|
842 | int r = dest.ordering(j); |
---|
843 | |
---|
844 | // Pop-- this restores the data pointers to the first element |
---|
845 | // encountered in the loop. |
---|
846 | iter.pop(j); |
---|
847 | expr.pop(j); |
---|
848 | |
---|
849 | // Load the stride associated with this loop, and increment |
---|
850 | // once. |
---|
851 | iter.loadStride(r); |
---|
852 | expr.loadStride(r); |
---|
853 | iter.advance(); |
---|
854 | expr.advance(); |
---|
855 | |
---|
856 | // If we aren't at the end of this loop, then stop popping. |
---|
857 | if (iter.data() != last[j]) |
---|
858 | break; |
---|
859 | } |
---|
860 | |
---|
861 | // Are we completely done? |
---|
862 | if (j == N_rank) |
---|
863 | break; |
---|
864 | |
---|
865 | // No, so push all the inner loops back onto the stack. |
---|
866 | for (; j >= firstNoncollapsedLoop; --j) |
---|
867 | { |
---|
868 | int r2 = dest.ordering(j-1); |
---|
869 | iter.push(j); |
---|
870 | expr.push(j); |
---|
871 | last[j-1] = iter.data() + dest.length(r2) * dest.stride(r2); |
---|
872 | } |
---|
873 | |
---|
874 | // Load the stride for the innermost loop again. |
---|
875 | iter.loadStride(maxRank); |
---|
876 | expr.loadStride(maxRank); |
---|
877 | } |
---|
878 | } |
---|
879 | |
---|
880 | |
---|
881 | template<typename T_dest, typename T_expr, typename T_update> |
---|
882 | _bz_forceinline void |
---|
883 | _bz_evaluator<1>:: |
---|
884 | evaluateWithIndexTraversal(T_dest& dest, T_expr expr, T_update) |
---|
885 | { |
---|
886 | typedef typename T_dest::T_numtype T_numtype; |
---|
887 | |
---|
888 | TinyVector<int,T_dest::rank_> index; |
---|
889 | |
---|
890 | if (dest.stride(firstRank) == 1) { |
---|
891 | T_numtype * restrict iter = dest.data(); |
---|
892 | int last = dest.ubound(firstRank); |
---|
893 | |
---|
894 | for (index[0] = dest.lbound(firstRank); index[0] <= last; |
---|
895 | ++index[0]) { |
---|
896 | T_update::update(*iter++, expr(index)); |
---|
897 | } |
---|
898 | } |
---|
899 | else { |
---|
900 | typename T_dest::T_iterator iter(dest); |
---|
901 | iter.loadStride(0); |
---|
902 | int last = iter.ubound(firstRank); |
---|
903 | |
---|
904 | for (index[0] = iter.lbound(firstRank); index[0] <= last; |
---|
905 | ++index[0]) { |
---|
906 | T_update::update(*const_cast<T_numtype*>(iter.data()), |
---|
907 | expr(index)); |
---|
908 | iter.advance(); |
---|
909 | } |
---|
910 | } |
---|
911 | } |
---|
912 | |
---|
913 | template<int N> |
---|
914 | template<typename T_dest, typename T_expr, typename T_update> |
---|
915 | _bz_forceinline void |
---|
916 | _bz_evaluator<N>:: |
---|
917 | evaluateWithIndexTraversal(T_dest& dest, T_expr expr, T_update) |
---|
918 | { |
---|
919 | typedef typename T_dest::T_numtype T_numtype; |
---|
920 | const int N_rank = T_dest::rank(); |
---|
921 | |
---|
922 | // Do a stack-type traversal for the destination array and use |
---|
923 | // index traversal for the source expression |
---|
924 | |
---|
925 | const int maxRank = dest.ordering(0); |
---|
926 | |
---|
927 | #ifdef BZ_DEBUG_TRAVERSE |
---|
928 | const int secondLastRank = dest.ordering(1); |
---|
929 | cout << "Index traversal: N_rank = " << N_rank << endl; |
---|
930 | cout << "maxRank = " << maxRank << " secondLastRank = " << secondLastRank |
---|
931 | << endl; |
---|
932 | cout.flush(); |
---|
933 | #endif |
---|
934 | |
---|
935 | typename T_dest::T_iterator iter(dest); |
---|
936 | for (int i=1; i < N_rank; ++i) |
---|
937 | iter.push(iter.ordering(i)); |
---|
938 | |
---|
939 | iter.loadStride(maxRank); |
---|
940 | |
---|
941 | TinyVector<int,T_dest::rank_> index, last; |
---|
942 | |
---|
943 | index = dest.base(); |
---|
944 | |
---|
945 | for (int i=0; i < N_rank; ++i) |
---|
946 | last(i) = dest.base(i) + dest.length(i); |
---|
947 | |
---|
948 | // int lastLength = length(maxRank); |
---|
949 | |
---|
950 | while (true) { |
---|
951 | |
---|
952 | for (index[maxRank] = dest.base(maxRank); |
---|
953 | index[maxRank] < last[maxRank]; |
---|
954 | ++index[maxRank]) |
---|
955 | { |
---|
956 | #ifdef BZ_DEBUG_TRAVERSE |
---|
957 | #if 0 |
---|
958 | cout << "(" << index[0] << "," << index[1] << ") " << endl; |
---|
959 | cout.flush(); |
---|
960 | #endif |
---|
961 | #endif |
---|
962 | |
---|
963 | T_update::update(*const_cast<T_numtype*>(iter.data()), expr(index)); |
---|
964 | iter.advance(); |
---|
965 | } |
---|
966 | |
---|
967 | int j = 1; |
---|
968 | for (; j < N_rank; ++j) |
---|
969 | { |
---|
970 | iter.pop(dest.ordering(j)); |
---|
971 | iter.loadStride(dest.ordering(j)); |
---|
972 | iter.advance(); |
---|
973 | |
---|
974 | index[dest.ordering(j-1)] = dest.base(dest.ordering(j-1)); |
---|
975 | ++index[dest.ordering(j)]; |
---|
976 | if (index[dest.ordering(j)] != last[dest.ordering(j)]) |
---|
977 | break; |
---|
978 | } |
---|
979 | |
---|
980 | if (j == N_rank) |
---|
981 | break; |
---|
982 | |
---|
983 | for (; j > 0; --j) |
---|
984 | { |
---|
985 | iter.push(dest.ordering(j)); |
---|
986 | } |
---|
987 | iter.loadStride(maxRank); |
---|
988 | } |
---|
989 | } |
---|
990 | |
---|
991 | // Fast traversals require <set> from the ISO/ANSI C++ standard library |
---|
992 | |
---|
993 | #ifdef BZ_HAVE_STD |
---|
994 | #ifdef BZ_ARRAY_SPACE_FILLING_TRAVERSAL |
---|
995 | |
---|
996 | template<typename T_dest, typename T_expr, typename T_update> |
---|
997 | _bz_forceinline void |
---|
998 | _bz_evaluateWithFastTraversal(T_dest& dest, |
---|
999 | const TraversalOrder<T_dest::rank() - 1>& order, |
---|
1000 | T_expr expr, T_update) |
---|
1001 | { |
---|
1002 | typedef typename T_dest::T_numtype T_numtype; |
---|
1003 | const int N_rank = T_dest::rank(); |
---|
1004 | |
---|
1005 | const int maxRank = dest.ordering(0); |
---|
1006 | |
---|
1007 | #ifdef BZ_DEBUG_TRAVERSE |
---|
1008 | const int secondLastRank = dest.ordering(1); |
---|
1009 | cerr << "maxRank = " << maxRank << " secondLastRank = " << secondLastRank |
---|
1010 | << endl; |
---|
1011 | #endif |
---|
1012 | |
---|
1013 | T_dest::T_iterator iter(dest); |
---|
1014 | iter.push(0); |
---|
1015 | expr.push(0); |
---|
1016 | |
---|
1017 | bool useUnitStride = iter.isUnitStride(maxRank) |
---|
1018 | && expr.isUnitStride(maxRank); |
---|
1019 | |
---|
1020 | #ifdef BZ_ARRAY_EXPR_USE_COMMON_STRIDE |
---|
1021 | diffType commonStride = expr.suggestStride(maxRank); |
---|
1022 | if (iter.suggestStride(maxRank) > commonStride) |
---|
1023 | commonStride = iter.suggestStride(maxRank); |
---|
1024 | bool useCommonStride = iter.isStride(maxRank,commonStride) |
---|
1025 | && expr.isStride(maxRank,commonStride); |
---|
1026 | #else |
---|
1027 | diffType commonStride = 1; |
---|
1028 | bool useCommonStride = false; |
---|
1029 | #endif |
---|
1030 | |
---|
1031 | int lastLength = dest.length(maxRank); |
---|
1032 | |
---|
1033 | for (int i=0; i < order.length(); ++i) |
---|
1034 | { |
---|
1035 | iter.pop(0); |
---|
1036 | expr.pop(0); |
---|
1037 | |
---|
1038 | #ifdef BZ_DEBUG_TRAVERSE |
---|
1039 | cerr << "Traversing: " << order[i] << endl; |
---|
1040 | #endif |
---|
1041 | // Position the iterator at the start of the next column |
---|
1042 | for (int j=1; j < N_rank; ++j) |
---|
1043 | { |
---|
1044 | iter.loadStride(ordering(j)); |
---|
1045 | expr.loadStride(ordering(j)); |
---|
1046 | |
---|
1047 | int offset = order[i][j-1]; |
---|
1048 | iter.advance(offset); |
---|
1049 | expr.advance(offset); |
---|
1050 | } |
---|
1051 | |
---|
1052 | iter.loadStride(maxRank); |
---|
1053 | expr.loadStride(maxRank); |
---|
1054 | |
---|
1055 | // Evaluate the expression along the column |
---|
1056 | |
---|
1057 | if ((useUnitStride) || (useCommonStride)) |
---|
1058 | { |
---|
1059 | #ifdef BZ_USE_FAST_READ_ARRAY_EXPR |
---|
1060 | diffType ubound = lastLength * commonStride; |
---|
1061 | T_numtype* restrict data = const_cast<T_numtype*>(iter.data()); |
---|
1062 | |
---|
1063 | if (commonStride == 1) |
---|
1064 | { |
---|
1065 | #ifndef BZ_ARRAY_FAST_TRAVERSAL_UNROLL |
---|
1066 | for (diffType i=0; i < ubound; ++i) |
---|
1067 | T_update::update(*data++, expr.fastRead(i)); |
---|
1068 | #else |
---|
1069 | diffType n1 = ubound & 3; |
---|
1070 | diffType i=0; |
---|
1071 | for (; i < n1; ++i) |
---|
1072 | T_update::update(*data++, expr.fastRead(i)); |
---|
1073 | |
---|
1074 | for (; i < ubound; i += 4) |
---|
1075 | { |
---|
1076 | T_update::update(*data++, expr.fastRead(i)); |
---|
1077 | T_update::update(*data++, expr.fastRead(i+1)); |
---|
1078 | T_update::update(*data++, expr.fastRead(i+2)); |
---|
1079 | T_update::update(*data++, expr.fastRead(i+3)); |
---|
1080 | } |
---|
1081 | #endif // BZ_ARRAY_FAST_TRAVERSAL_UNROLL |
---|
1082 | } |
---|
1083 | #ifdef BZ_ARRAY_EXPR_USE_COMMON_STRIDE |
---|
1084 | else { |
---|
1085 | for (diffType i=0; i < ubound; i += commonStride) |
---|
1086 | T_update::update(data[i], expr.fastRead(i)); |
---|
1087 | } |
---|
1088 | #endif // BZ_ARRAY_EXPR_USE_COMMON_STRIDE |
---|
1089 | |
---|
1090 | iter.advance(lastLength * commonStride); |
---|
1091 | expr.advance(lastLength * commonStride); |
---|
1092 | #else // ! BZ_USE_FAST_READ_ARRAY_EXPR |
---|
1093 | T_numtype* restrict last = const_cast<T_numtype*>(iter.data()) |
---|
1094 | + lastLength * commonStride; |
---|
1095 | |
---|
1096 | while (iter.data() != last) |
---|
1097 | { |
---|
1098 | T_update::update(*const_cast<T_numtype*>(iter.data()), *expr); |
---|
1099 | iter.advance(commonStride); |
---|
1100 | expr.advance(commonStride); |
---|
1101 | } |
---|
1102 | #endif // BZ_USE_FAST_READ_ARRAY_EXPR |
---|
1103 | |
---|
1104 | } |
---|
1105 | else { |
---|
1106 | // No common stride |
---|
1107 | |
---|
1108 | T_numtype* restrict last = const_cast<T_numtype*>(iter.data()) |
---|
1109 | + lastLength * stride(maxRank); |
---|
1110 | |
---|
1111 | while (iter.data() != last) |
---|
1112 | { |
---|
1113 | T_update::update(*const_cast<T_numtype*>(iter.data()), *expr); |
---|
1114 | iter.advance(); |
---|
1115 | expr.advance(); |
---|
1116 | } |
---|
1117 | } |
---|
1118 | } |
---|
1119 | } |
---|
1120 | |
---|
1121 | #endif // BZ_ARRAY_SPACE_FILLING_TRAVERSAL |
---|
1122 | #endif // BZ_HAVE_STD |
---|
1123 | |
---|
1124 | #ifdef BZ_ARRAY_2D_NEW_STENCIL_TILING |
---|
1125 | |
---|
1126 | #ifdef BZ_ARRAY_2D_STENCIL_TILING |
---|
1127 | |
---|
1128 | // what is diff between new and old? |
---|
1129 | template<typename T_dest, typename T_expr, typename T_update> |
---|
1130 | _bz_forceinline void |
---|
1131 | _bz_evaluateWithTiled2DTraversal(T_dest& dest, T_expr expr, T_update) |
---|
1132 | { |
---|
1133 | typedef typename T_dest::T_numtype T_numtype; |
---|
1134 | const int N_rank = T_dest::rank(); |
---|
1135 | |
---|
1136 | typename T_dest::T_iterator iter(dest); |
---|
1137 | |
---|
1138 | const int minorRank = iter.ordering(0); |
---|
1139 | const int majorRank = iter.ordering(1); |
---|
1140 | |
---|
1141 | iter.push(0); |
---|
1142 | expr.push(0); |
---|
1143 | |
---|
1144 | #ifdef BZ_2D_STENCIL_DEBUG |
---|
1145 | int count = 0; |
---|
1146 | #endif |
---|
1147 | |
---|
1148 | bool useUnitStride = iter.isUnitStride(minorRank) |
---|
1149 | && expr.isUnitStride(minorRank); |
---|
1150 | |
---|
1151 | #ifdef BZ_ARRAY_EXPR_USE_COMMON_STRIDE |
---|
1152 | diffType commonStride = expr.suggestStride(minorRank); |
---|
1153 | if (iter.suggestStride(minorRank) > commonStride) |
---|
1154 | commonStride = iter.suggestStride(minorRank); |
---|
1155 | bool useCommonStride = iter.isStride(minorRank,commonStride) |
---|
1156 | && expr.isStride(minorRank,commonStride); |
---|
1157 | #else |
---|
1158 | diffType commonStride = 1; |
---|
1159 | bool useCommonStride = false; |
---|
1160 | #endif |
---|
1161 | |
---|
1162 | // Determine if a common major stride exists |
---|
1163 | diffType commonMajorStride = expr.suggestStride(majorRank); |
---|
1164 | if (iter.suggestStride(majorRank) > commonMajorStride) |
---|
1165 | commonMajorStride = iter.suggestStride(majorRank); |
---|
1166 | bool haveCommonMajorStride = iter.isStride(majorRank,commonMajorStride) |
---|
1167 | && expr.isStride(majorRank,commonMajorStride); |
---|
1168 | |
---|
1169 | |
---|
1170 | int maxi = dest.length(majorRank); |
---|
1171 | int maxj = dest.length(minorRank); |
---|
1172 | |
---|
1173 | const int tileHeight = 16, tileWidth = 3; |
---|
1174 | |
---|
1175 | int bi, bj; |
---|
1176 | for (bi=0; bi < maxi; bi += tileHeight) |
---|
1177 | { |
---|
1178 | int ni = bi + tileHeight; |
---|
1179 | if (ni > maxi) |
---|
1180 | ni = maxi; |
---|
1181 | |
---|
1182 | // Move back to the beginning of the array |
---|
1183 | iter.pop(0); |
---|
1184 | expr.pop(0); |
---|
1185 | |
---|
1186 | // Move to the start of this tile row |
---|
1187 | iter.loadStride(majorRank); |
---|
1188 | iter.advance(bi); |
---|
1189 | expr.loadStride(majorRank); |
---|
1190 | expr.advance(bi); |
---|
1191 | |
---|
1192 | // Save this position |
---|
1193 | iter.push(1); |
---|
1194 | expr.push(1); |
---|
1195 | |
---|
1196 | for (bj=0; bj < maxj; bj += tileWidth) |
---|
1197 | { |
---|
1198 | // Move to the beginning of the tile row |
---|
1199 | iter.pop(1); |
---|
1200 | expr.pop(1); |
---|
1201 | |
---|
1202 | // Move to the top of the current tile (bi,bj) |
---|
1203 | iter.loadStride(minorRank); |
---|
1204 | iter.advance(bj); |
---|
1205 | expr.loadStride(minorRank); |
---|
1206 | expr.advance(bj); |
---|
1207 | |
---|
1208 | if (bj + tileWidth <= maxj) |
---|
1209 | { |
---|
1210 | // Strip mining |
---|
1211 | |
---|
1212 | if ((useUnitStride) && (haveCommonMajorStride)) |
---|
1213 | { |
---|
1214 | diffType offset = 0; |
---|
1215 | T_numtype* restrict data = const_cast<T_numtype*> |
---|
1216 | (iter.data()); |
---|
1217 | |
---|
1218 | for (int i=bi; i < ni; ++i) |
---|
1219 | { |
---|
1220 | _bz_typename T_expr::T_numtype tmp1, tmp2, tmp3; |
---|
1221 | |
---|
1222 | // Common subexpression elimination -- compilers |
---|
1223 | // won't necessarily do this on their own. |
---|
1224 | diffType t1 = offset+1; |
---|
1225 | diffType t2 = offset+2; |
---|
1226 | |
---|
1227 | tmp1 = expr.fastRead(offset); |
---|
1228 | tmp2 = expr.fastRead(t1); |
---|
1229 | tmp3 = expr.fastRead(t2); |
---|
1230 | |
---|
1231 | T_update::update(data[0], tmp1); |
---|
1232 | T_update::update(data[1], tmp2); |
---|
1233 | T_update::update(data[2], tmp3); |
---|
1234 | |
---|
1235 | offset += commonMajorStride; |
---|
1236 | data += commonMajorStride; |
---|
1237 | |
---|
1238 | #ifdef BZ_2D_STENCIL_DEBUG |
---|
1239 | count += 3; |
---|
1240 | #endif |
---|
1241 | } |
---|
1242 | } |
---|
1243 | else { |
---|
1244 | |
---|
1245 | for (int i=bi; i < ni; ++i) |
---|
1246 | { |
---|
1247 | iter.loadStride(minorRank); |
---|
1248 | expr.loadStride(minorRank); |
---|
1249 | |
---|
1250 | // Loop through current row elements |
---|
1251 | T_update::update(*const_cast<T_numtype*>(iter.data()), |
---|
1252 | *expr); |
---|
1253 | iter.advance(); |
---|
1254 | expr.advance(); |
---|
1255 | |
---|
1256 | T_update::update(*const_cast<T_numtype*>(iter.data()), |
---|
1257 | *expr); |
---|
1258 | iter.advance(); |
---|
1259 | expr.advance(); |
---|
1260 | |
---|
1261 | T_update::update(*const_cast<T_numtype*>(iter.data()), |
---|
1262 | *expr); |
---|
1263 | iter.advance(-2); |
---|
1264 | expr.advance(-2); |
---|
1265 | |
---|
1266 | iter.loadStride(majorRank); |
---|
1267 | expr.loadStride(majorRank); |
---|
1268 | iter.advance(); |
---|
1269 | expr.advance(); |
---|
1270 | |
---|
1271 | #ifdef BZ_2D_STENCIL_DEBUG |
---|
1272 | count += 3; |
---|
1273 | #endif |
---|
1274 | |
---|
1275 | } |
---|
1276 | } |
---|
1277 | } |
---|
1278 | else { |
---|
1279 | |
---|
1280 | // This code handles partial tiles at the bottom of the |
---|
1281 | // array. |
---|
1282 | |
---|
1283 | for (int j=bj; j < maxj; ++j) |
---|
1284 | { |
---|
1285 | iter.loadStride(majorRank); |
---|
1286 | expr.loadStride(majorRank); |
---|
1287 | |
---|
1288 | for (int i=bi; i < ni; ++i) |
---|
1289 | { |
---|
1290 | T_update::update(*const_cast<T_numtype*>(iter.data()), |
---|
1291 | *expr); |
---|
1292 | iter.advance(); |
---|
1293 | expr.advance(); |
---|
1294 | #ifdef BZ_2D_STENCIL_DEBUG |
---|
1295 | ++count; |
---|
1296 | #endif |
---|
1297 | |
---|
1298 | } |
---|
1299 | |
---|
1300 | // Move back to the top of this column |
---|
1301 | iter.advance(bi-ni); |
---|
1302 | expr.advance(bi-ni); |
---|
1303 | |
---|
1304 | // Move over to the next column |
---|
1305 | iter.loadStride(minorRank); |
---|
1306 | expr.loadStride(minorRank); |
---|
1307 | |
---|
1308 | iter.advance(); |
---|
1309 | expr.advance(); |
---|
1310 | } |
---|
1311 | } |
---|
1312 | } |
---|
1313 | } |
---|
1314 | |
---|
1315 | #ifdef BZ_2D_STENCIL_DEBUG |
---|
1316 | cout << "BZ_2D_STENCIL_DEBUG: count = " << count << endl; |
---|
1317 | #endif |
---|
1318 | } |
---|
1319 | |
---|
1320 | #endif // BZ_ARRAY_2D_STENCIL_TILING |
---|
1321 | #endif // BZ_ARRAY_2D_NEW_STENCIL_TILING |
---|
1322 | |
---|
1323 | |
---|
1324 | |
---|
1325 | #ifndef BZ_ARRAY_2D_NEW_STENCIL_TILING |
---|
1326 | |
---|
1327 | #ifdef BZ_ARRAY_2D_STENCIL_TILING |
---|
1328 | |
---|
1329 | // what is diff between new and old? |
---|
1330 | template<typename T_dest, typename T_expr, typename T_update> |
---|
1331 | _bz_forceinline void |
---|
1332 | _bz_evaluateWithTiled2DTraversal(T_dest& dest, T_expr expr, T_update) |
---|
1333 | { |
---|
1334 | typedef typename T_dest::T_numtype T_numtype; |
---|
1335 | |
---|
1336 | typename T_dest::T_iterator iter(dest); |
---|
1337 | |
---|
1338 | const int minorRank = iter.ordering(0); |
---|
1339 | const int majorRank = iter.ordering(1); |
---|
1340 | |
---|
1341 | const int blockSize = 16; |
---|
1342 | |
---|
1343 | iter.push(0); |
---|
1344 | expr.push(0); |
---|
1345 | |
---|
1346 | bool useUnitStride = iter.isUnitStride(minorRank) |
---|
1347 | && expr.isUnitStride(minorRank); |
---|
1348 | |
---|
1349 | #ifdef BZ_ARRAY_EXPR_USE_COMMON_STRIDE |
---|
1350 | diffType commonStride = expr.suggestStride(minorRank); |
---|
1351 | if (iter.suggestStride(minorRank) > commonStride) |
---|
1352 | commonStride = iter.suggestStride(minorRank); |
---|
1353 | bool useCommonStride = iter.isStride(minorRank,commonStride) |
---|
1354 | && expr.isStride(minorRank,commonStride); |
---|
1355 | #else |
---|
1356 | diffType commonStride = 1; |
---|
1357 | bool useCommonStride = false; |
---|
1358 | #endif |
---|
1359 | |
---|
1360 | int maxi = dest.length(majorRank); |
---|
1361 | int maxj = dest.length(minorRank); |
---|
1362 | |
---|
1363 | int bi, bj; |
---|
1364 | for (bi=0; bi < maxi; bi += blockSize) |
---|
1365 | { |
---|
1366 | int ni = bi + blockSize; |
---|
1367 | if (ni > maxi) |
---|
1368 | ni = maxi; |
---|
1369 | |
---|
1370 | for (bj=0; bj < maxj; bj += blockSize) |
---|
1371 | { |
---|
1372 | int nj = bj + blockSize; |
---|
1373 | if (nj > maxj) |
---|
1374 | nj = maxj; |
---|
1375 | |
---|
1376 | // Move to the beginning of the array |
---|
1377 | iter.pop(0); |
---|
1378 | expr.pop(0); |
---|
1379 | |
---|
1380 | // Move to the beginning of the tile (bi,bj) |
---|
1381 | iter.loadStride(majorRank); |
---|
1382 | iter.advance(bi); |
---|
1383 | iter.loadStride(minorRank); |
---|
1384 | iter.advance(bj); |
---|
1385 | |
---|
1386 | expr.loadStride(majorRank); |
---|
1387 | expr.advance(bi); |
---|
1388 | expr.loadStride(minorRank); |
---|
1389 | expr.advance(bj); |
---|
1390 | |
---|
1391 | // Loop through tile rows |
---|
1392 | for (int i=bi; i < ni; ++i) |
---|
1393 | { |
---|
1394 | // Save the beginning of this tile row |
---|
1395 | iter.push(1); |
---|
1396 | expr.push(1); |
---|
1397 | |
---|
1398 | // Load the minor stride |
---|
1399 | iter.loadStride(minorRank); |
---|
1400 | expr.loadStride(minorRank); |
---|
1401 | |
---|
1402 | if (useUnitStride) |
---|
1403 | { |
---|
1404 | T_numtype* restrict data = const_cast<T_numtype*> |
---|
1405 | (iter.data()); |
---|
1406 | |
---|
1407 | int ubound = (nj-bj); |
---|
1408 | for (int j=0; j < ubound; ++j) |
---|
1409 | T_update::update(*data++, expr.fastRead(j)); |
---|
1410 | } |
---|
1411 | #ifdef BZ_ARRAY_EXPR_USE_COMMON_STRIDE |
---|
1412 | else if (useCommonStride) |
---|
1413 | { |
---|
1414 | const diffType ubound = (nj-bj) * commonStride; |
---|
1415 | T_numtype* restrict data = const_cast<T_numtype*> |
---|
1416 | (iter.data()); |
---|
1417 | |
---|
1418 | for (diffType j=0; j < ubound; j += commonStride) |
---|
1419 | T_update::update(data[j], expr.fastRead(j)); |
---|
1420 | } |
---|
1421 | #endif |
---|
1422 | else { |
---|
1423 | for (int j=bj; j < nj; ++j) |
---|
1424 | { |
---|
1425 | // Loop through current row elements |
---|
1426 | T_update::update(*const_cast<T_numtype*>(iter.data()), |
---|
1427 | *expr); |
---|
1428 | iter.advance(); |
---|
1429 | expr.advance(); |
---|
1430 | } |
---|
1431 | } |
---|
1432 | |
---|
1433 | // Move back to the beginning of the tile row, then |
---|
1434 | // move to the next row |
---|
1435 | iter.pop(1); |
---|
1436 | iter.loadStride(majorRank); |
---|
1437 | iter.advance(1); |
---|
1438 | |
---|
1439 | expr.pop(1); |
---|
1440 | expr.loadStride(majorRank); |
---|
1441 | expr.advance(1); |
---|
1442 | } |
---|
1443 | } |
---|
1444 | } |
---|
1445 | } |
---|
1446 | #endif // BZ_ARRAY_2D_STENCIL_TILING |
---|
1447 | #endif // BZ_ARRAY_2D_NEW_STENCIL_TILING |
---|
1448 | |
---|
1449 | BZ_NAMESPACE_END |
---|
1450 | |
---|
1451 | #endif // BZ_GLOBEVAL_CC |
---|
1452 | |
---|