source: XIOS/dev/dev_olga/src/extern/blitz/include/blitz/globeval.cc @ 1022

1/***************************************************************************
2 * blitz/globeval.cc  Evaluate expression and assign to an array.
3 *
4 * $Id$
5 *
6 * Copyright (C) 1997-2011 Todd Veldhuizen <tveldhui@acm.org>
7 *
8 * This file is a part of Blitz.
9 *
10 * Blitz is free software: you can redistribute it and/or modify
11 * it under the terms of the GNU Lesser General Public License
12 * as published by the Free Software Foundation, either version 3
13 * of the License, or (at your option) any later version.
14 *
15 * Blitz is distributed in the hope that it will be useful,
16 * but WITHOUT ANY WARRANTY; without even the implied warranty of
17 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
18 * GNU Lesser General Public License for more details.
19 *
20 * You should have received a copy of the GNU Lesser General Public
21 * License along with Blitz.  If not, see <http://www.gnu.org/licenses/>.
22 *
23 * Suggestions:          blitz-devel@lists.sourceforge.net
24 * Bugs:                 blitz-support@lists.sourceforge.net   
25 *
26 * For more information, please see the Blitz++ Home Page:
27 *    https://sourceforge.net/projects/blitz/
28 *
29 ****************************************************************************/
30#ifndef BZ_GLOBEVAL_CC
31#define BZ_GLOBEVAL_CC
32
33#include <blitz/ranks.h>
34#include <blitz/tvevaluate.h>
35#include <blitz/blitz.h>
36
37BZ_NAMESPACE(blitz)
38
39
40// Fast traversals require <set> from the ISO/ANSI C++ standard library
41#ifdef BZ_HAVE_STD
42#ifdef BZ_ARRAY_SPACE_FILLING_TRAVERSAL
43
44
45/** _bz_tryFastTraversal is a helper class.  Fast traversals are only
46    attempted if the expression looks like a stencil -- it's at least
47    three-dimensional, has more than six array operands, and there are
48    no index placeholders in the expression.  These are all things
49    which can be checked at compile time, so the if()/else() syntax
50    has been replaced with this class template. 
51*/
52template<bool canTryFastTraversal>
53struct _bz_tryFastTraversal {
54    template<typename T_numtype, int N_rank, typename T_expr, typename T_update>
55    static bool tryFast(Array<T_numtype,N_rank>& array, 
56        T_expr expr, T_update)
57    {
58        return false;
59    }
60};
61
62template<>
63struct _bz_tryFastTraversal<true> {
64    template<typename T_numtype, int N_rank, typename T_expr, typename T_update>
65    static bool tryFast(Array<T_numtype,N_rank>& array, 
66        T_expr expr, T_update)
67    {
68        // See if there's an appropriate space filling curve available.
69        // Currently fast traversals use an N-1 dimensional curve.  The
70        // Nth dimension column corresponding to each point on the curve
71        // is traversed in the normal fashion.
72        TraversalOrderCollection<N_rank-1> traversals;
73        TinyVector<int, N_rank - 1> traversalGridSize;
74
75        for (int i=0; i < N_rank - 1; ++i)
76            traversalGridSize[i] = array.length(array.ordering(i+1));
77
78#ifdef BZ_DEBUG_TRAVERSE
79cout << "traversalGridSize = " << traversalGridSize << endl;
80cout.flush();
81#endif
82
83        const TraversalOrder<N_rank-1>* order =
84            traversals.find(traversalGridSize);
85
86        if (order)
87        {
88#ifdef BZ_DEBUG_TRAVERSE
89    cerr << "Array<" << BZ_DEBUG_TEMPLATE_AS_STRING_LITERAL(T_numtype)
90         << ", " << N_rank << ">: Using stack traversal" << endl;
91#endif
92            // A curve was available -- use fast traversal.
93            array.evaluateWithFastTraversal(*order, expr, T_update());
94            return true;
95        }
96
97        return false;
98    }
99};
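/* Illustrative sketch (only meaningful when BZ_ARRAY_SPACE_FILLING_TRAVERSAL
   is enabled; the grid size N below is hypothetical): before a rank-3
   stencil expression can take this path, a traversal order for the two
   outer dimensions must have been generated, e.g.

     generateFastTraversalOrder(TinyVector<int,2>(N, N));

   Otherwise traversals.find() returns a null pointer and tryFast() simply
   returns false, so evaluation falls back to a stack traversal. */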
100#endif // BZ_ARRAY_SPACE_FILLING_TRAVERSAL
101#endif // BZ_HAVE_STD
102
103
104/** Helper class that implements the evaluation routines for different
105    ranks. */
106template<int N> struct _bz_evaluator {
107  template<typename T_dest, typename T_expr, typename T_update>
108  static void evaluateWithStackTraversal(T_dest&, T_expr, T_update);
109  template<typename T_dest, typename T_expr, typename T_update>
110  static void evaluateWithIndexTraversal(T_dest&, T_expr, T_update);
111};
112template<> struct _bz_evaluator<1> {
113  template<typename T_dest, typename T_expr, typename T_update>
114  static void evaluateWithStackTraversal(T_dest&, T_expr, T_update);
115  template<typename T_dest, typename T_expr, typename T_update>
116  static void evaluateWithIndexTraversal(T_dest&, T_expr, T_update);
117};
118
119/**
120  Assign an expression to a container.  For performance reasons, this
121  function forwards to functions implementing one of several traversal
122  mechanisms:
123 
124  - Index traversal scans through the destination array in storage order.
125    The expression is evaluated using a TinyVector<int,N> operand.  This
126    version is used only when there are index placeholders in the expression
127    (see <blitz/indexexpr.h>)
128  - Stack traversal also scans through the destination array in storage
129    order.  However, push/pop stack iterators are used.
130  - Fast traversal follows a Hilbert (or other) space-filling curve to
131    improve cache reuse for stencilling operations.  Currently, the
132    space filling curves must be generated by calling
133    generateFastTraversalOrder(TinyVector<int,N_dimensions>).
134  - 2D tiled traversal scans the destination in small tiles, to improve
135    cache reuse for 2D stencils.  Space filling curves have too much
136    overhead to use in two dimensions.
137 */
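/* For illustration (a sketch, not part of this header; the array and
   placeholder names are hypothetical): with

     Array<double,2> A(16,16), B(16,16);
     firstIndex i;  secondIndex j;

     A = B + 2.0;        // no index placeholders -> stack traversal
     A = i * 16.0 + j;   // index placeholders    -> index traversal

   both assignments funnel into _bz_evaluate(), which dispatches on
   T_expr::numIndexPlaceholders and the stencil heuristics below. */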
138template<typename T_dest, typename T_expr, typename T_update>
139_bz_forceinline void
140_bz_evaluate(T_dest& dest, T_expr expr, T_update)
141{
142  typedef typename T_dest::T_numtype T_numtype;
143  const int N_rank = T_dest::rank_;
144
145    // Check that all arrays have the same shape
146#ifdef BZ_DEBUG
147    if (!expr.shapeCheck(dest.shape()))
148    {
149      if (assertFailMode == false)
150      {
151        cerr << "[Blitz++] Shape check failed: Module " << __FILE__
152             << " line " << __LINE__ << endl
153             << "          Expression: ";
154        prettyPrintFormat format(true);   // Use terse formatting
155        BZ_STD_SCOPE(string) str;
156        expr.prettyPrint(str, format);
157        cerr << str << endl ;
158      }
159
160#if 0
161// Shape dumping is broken by change to using string for prettyPrint
162             << "          Shapes: " << shape() << " = ";
163        prettyPrintFormat format2;
164        format2.setDumpArrayShapesMode();
165        expr.prettyPrint(cerr, format2);
166        cerr << endl;
167#endif
168        BZ_PRE_FAIL;
169    }
170#endif
171
172    BZPRECHECK(expr.shapeCheck(dest.shape()),
173        "Shape check failed." << endl << "Expression:");
174
175    BZPRECHECK((T_expr::rank_ == T_dest::rank_) || 
176               (T_expr::numArrayOperands == 0), 
177               "Assigned rank " << T_expr::rank_ << " expression to rank " 
178               << T_dest::rank_ << " array.");
179
180    /*
181     * Check that the arrays are not empty (e.g. length 0 arrays)
182     * This fixes a bug found by Peter Bienstman, 6/16/99, where
183     * Array<double,2> A(0,0),B(0,0); B=A(tensor::j,tensor::i);
184     * went into an infinite loop.
185     */
186
187    const sizeType n = dest.numElements();
188    if (n == 0) {
189#ifdef BZ_DEBUG_TRAVERSE
190      BZ_DEBUG_MESSAGE("Evaluating empty array, nothing to do");
191#endif
192      return;
193    }
194    // \todo this does not always compile, so eliminate for now.
195    // if (n == 1) {
196    //   // shortcut here since it's easy
197    //   T_update::update(*dest.dataFirst(), expr(expr.lbound()));
198    //   return;
199    // }
200
201#ifdef BZ_DEBUG_TRAVERSE
202    BZ_DEBUG_MESSAGE( "T_expr::numIndexPlaceholders = " << T_expr::numIndexPlaceholders);
203#endif
204
205    // Tau profiling code.  Provide Tau with a pretty-printed version of
206    // the expression.
207    // NEEDS_WORK-- use a static initializer somehow.
208
209#ifdef BZ_TAU_PROFILING
210    static BZ_STD_SCOPE(string) exprDescription;
211    if (!exprDescription.length())   // faked static initializer
212    {
213        exprDescription = "A";
214        prettyPrintFormat format(true);   // Terse mode on
215        format.nextArrayOperandSymbol();
216        T_update::prettyPrint(exprDescription);
217        expr.prettyPrint(exprDescription, format);
218    }
219    TAU_PROFILE(" ", exprDescription, TAU_BLITZ);
220#endif
221
222    // Determine which evaluation mechanism to use
223    if (T_expr::numIndexPlaceholders > 0)
224    {
225        // The expression involves index placeholders, so have to
226        // use index traversal rather than stack traversal.
227
228      _bz_evaluator<T_dest::rank_>::evaluateWithIndexTraversal(dest, expr, T_update());
229      return;
230    }
231    else {
232
233        // If this expression looks like an array stencil, then attempt to
234        // use a fast traversal order.
235        // Fast traversals require <set> from the ISO/ANSI C++ standard
236        // library.
237
238#ifdef BZ_HAVE_STD
239#ifdef BZ_ARRAY_SPACE_FILLING_TRAVERSAL
240
241        enum { isStencil = (N_rank >= 3) && (T_expr::numArrayOperands > 6)
242            && (T_expr::numIndexPlaceholders == 0) };
243
244        if (_bz_tryFastTraversal<isStencil>::tryFast(dest, expr, T_update()))
245            return;
246
247#endif
248#endif
249
250#ifdef BZ_ARRAY_2D_STENCIL_TILING
251        // Does this look like a 2-dimensional stencil on a largeish
252        // array?
253
254        if ((N_rank == 2) && (T_expr::numArrayOperands >= 5))
255        {
256            // Use a heuristic to determine whether a tiled traversal
257            // is desirable.  First, estimate how much L1 cache is needed
258            // to achieve a high hit rate using the stack traversal.
259            // Try to err on the side of using tiled traversal even when
260            // it isn't strictly needed.
261
262            // Assumptions:
263            //    Stencil width 3
264            //    3 arrays involved in stencil
265            //    Uniform data type in arrays (all T_numtype)
266           
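            // Worked example of this heuristic (numbers are illustrative):
            // for a 1000x1000 Array<double,2>, cacheNeeded is
            // 3 * 3 * 8 * 1000 = 72000 bytes, which exceeds typical L1
            // estimates of a few tens of kilobytes, so the tiled traversal
            // below would be chosen.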
267            int cacheNeeded = 3 * 3 * sizeof(T_numtype) * dest.length(dest.ordering(0));
268            if (cacheNeeded > BZ_L1_CACHE_ESTIMATED_SIZE) {
269              _bz_evaluateWithTiled2DTraversal(dest, expr, T_update());
270                return;
271            }
272        }
273
274#endif
275
276        // If fast traversal isn't available or appropriate, then just
277        // do a stack traversal.
278//#pragma forceinline recursive
279        _bz_evaluator<T_dest::rank_>::evaluateWithStackTraversal(dest, expr, T_update());
280        return;
281    }
282}
283
284/** This class performs the vectorized update through the update()
285    method. It is a class because it is specialized to do nothing for
286    instances where the simd vector width is 1. This avoids tricky
287    infinite template recursions on multicomponent containers. */
288template<typename T_numtype, typename T_expr, typename T_update, int N>
289struct chunked_updater {
290
291  static _bz_forceinline void
292  aligned_update(T_numtype* data, T_expr expr, diffType i) {
293
294    const bool unroll = N < BZ_TV_EVALUATE_UNROLL_LENGTH;
295    _tv_evaluator<unroll, N>::evaluate_aligned
296      (data+i, expr.template fastRead_tv<N>(i), T_update());
297  };
298
299  static _bz_forceinline void
300  unaligned_update(T_numtype* data, T_expr expr, diffType i) {
301    const bool unroll = N < BZ_TV_EVALUATE_UNROLL_LENGTH;
302    _tv_evaluator<unroll, N>::evaluate_unaligned
303      (data+i, expr.template fastRead_tv<N>(i), T_update());
304  };
305
306};
307
308/** This specialization ensures we don't try to instantiate chunked_updater
309    for types with a vecWidth of 1, as this leads to infinite template
310    instantiation recursion. */
311template<typename T_numtype, typename T_expr, typename T_update>
312struct chunked_updater<T_numtype, T_expr, T_update, 1> {
313  static _bz_forceinline void 
314  aligned_update(T_numtype* data, T_expr expr, diffType i) {
315    BZPRECONDITION(0); };
316  static _bz_forceinline void
317  unaligned_update(T_numtype* data, T_expr expr, diffType i) {
318    BZPRECONDITION(0); };
319};
320
321
322/** A metaprogram that uses the chunked_updater to assign an
323    unknown-length expression to a pointer by unrolling in a binary
324    fashion. This way, we can get "almost-compile-time" unrolling of a
325    length only known at runtime. I+1 is the number of significant
326    bits in the longest length to consider. In this way, assigning a
327    vector of length 7 corresponds to I=2 and takes 3 operations. The
328    metaprogram counts down so that it starts with the large updates,
329    which will be aligned for aligned expressions. */
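/* Worked example (illustrative): for ubound == 7 (binary 111), the bits
   above bit 2 are clear, so the recursion performs a chunked update of
   width 4 at position 0, one of width 2 at position 4, and a final scalar
   update of element 6 -- three operations in total, as described above. */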
330template<int I> 
331class _bz_meta_binaryAssign {
332public:
333    template<typename T_data, typename T_expr, typename T_update>
334    static _bz_forceinline void assign(T_data* data, T_expr expr,
335                                       diffType ubound, diffType pos, 
336                                       T_update) {
337      if(ubound&(1<<I)) {
338        chunked_updater<T_data, T_expr, T_update, 1<<I >::
339          unaligned_update(data, expr, pos); 
340        pos += (1<<I);
341      }
342      _bz_meta_binaryAssign<I-1>::assign(data, expr, ubound, pos, T_update());
343      }
344       
345};
346
347/** Partial specialization for bit 0 uses the scalar update. */
348template<> 
349class _bz_meta_binaryAssign<0> {
350public:
351    template<typename T_data, typename T_expr, typename T_update>
352    static _bz_forceinline void assign(T_data* data, T_expr expr,
353                                       diffType ubound, diffType pos, 
354                                       T_update) {
355      if(ubound&1) {
356        T_update::update(data[pos], expr.fastRead(pos));
357        ++pos;
358      }
359      // this ends the metaprogram.
360    }
361};
362
363/** Unit-stride evaluator, which takes pre-computed destination and
364    bounds and just does the unit-stride evaluation for a specified
365    length. This is essentially a 1D operation used by both the rank-1
366    and rank-N traversals, so it's common to both. This can use
367    vectorized update, so if both dest and expr are unit stride, we
368    redirect here. This function then deals with unaligned or
369    misaligned situations. There is no explicit unrolling option here,
370    since it's already vectorized using the chunked_updater. \todo Would
371    it be useful to retain the unrolled loop for scalar
372    architectures?  */
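/* Illustrative walk-through (numbers are hypothetical, and assume the
   expression is vectorizable, can be aligned, and is long enough to skip
   the short-expression unroll): with loop_width == 8, a destination that
   starts 2 elements before an alignment boundary and ubound == 37, the
   aligned branch below performs 2 scalar prologue updates, 4 chunked
   updates of 8 elements each (positions 2, 10, 18 and 26), and 3 scalar
   updates for the trailing elements 34..36. */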
373template<typename T_dest, typename T_expr, typename T_update>
374_bz_forceinline void
375_bz_evaluateWithUnitStride(T_dest& dest, typename T_dest::T_iterator& iter,
376                           T_expr expr, diffType ubound, T_update)
377{
378  typedef typename T_dest::T_numtype T_numtype;
379  T_numtype* restrict data = const_cast<T_numtype*>(iter.data());
380  diffType i=0;
381
382#ifdef BZ_DEBUG_TRAVERSE
383  BZ_DEBUG_MESSAGE("\tunit stride expression with length: "<< ubound << ".");
384#endif
385
386  // If the minWidth is set to 0, there are elements in the expression
387  // which can NOT use the vectorized expression (i.e., stencils). In
388  // that case, we fall through to the scalar loop
389  const bool unvectorizable = (T_expr::minWidth==0);
390
391  if(!unvectorizable && (ubound < 1<<BZ_MAX_BITS_FOR_BINARY_UNROLL)) {
392    // for short expressions, it's more important to minimize
393    // overhead. Single-element ones have already been dealt with, but
394    // for lengths that have fewer significant bits than
395    // BZ_MAX_BITS_FOR_BINARY_UNROLL we do a binary-style unroll here. (We don't
396    // worry about simd widths either, because we essentially just
397    // present the compiler with a vectorizable view. It will do
398    // sensible things even if the expressions are not vectorizable.)
399#ifdef BZ_DEBUG_TRAVERSE
400    BZ_DEBUG_MESSAGE("\tshort expression, using binary meta-unroll assignment.");
401#endif
402
403    _bz_meta_binaryAssign<BZ_MAX_BITS_FOR_BINARY_UNROLL-1>::
404      assign(data, expr, ubound, 0, T_update());
405    return;
406  }
407
408  // calculate uneven elements at the beginning of dest
409  const diffType uneven_start=simdTypes<T_numtype>::offsetToAlignment(data);
410
411  // we can only guarantee alignment if all operands have the same
412  // width and are not mutually misaligned
413  const bool can_align = 
414    (T_expr::minWidth == T_expr::maxWidth) &&
415    (T_expr::minWidth == int(simdTypes<T_numtype>::vecWidth)) &&
416    expr.isVectorAligned(uneven_start);
417
418  // When we come out here, we KNOW that expressions shorter than
419  // 1<<BZ_MAX_BITS_FOR_BINARY_UNROLL have been taken care of. At that
420  // point, it is efficient to effectively unroll the loop using a
421  // vector width larger than the simd width.
422  const int loop_width= BZ_VECTORIZED_LOOP_WIDTH;
423
424#ifdef BZ_DEBUG_TRAVERSE
425  if(T_expr::minWidth!=T_expr::maxWidth) {
426    BZ_DEBUG_MESSAGE("\texpression has mixed width: " << T_expr::minWidth << "-" <<T_expr::maxWidth);
427  } else {
428    BZ_DEBUG_MESSAGE("\texpression SIMD width: " << T_expr::minWidth);
429  }
430  BZ_DEBUG_MESSAGE("\tdestination SIMD width: " << simdTypes<T_numtype>::vecWidth);
431  if(loop_width>1) {
432  if(!expr.isVectorAligned(uneven_start)) {
433    BZ_DEBUG_MESSAGE("\toperands have different alignments");
434  }
435  if(!can_align) {
436    BZ_DEBUG_MESSAGE("\tcannot guarantee alignment - using unaligned vectorization");
437  } else {
438    BZ_DEBUG_MESSAGE("\texpression can be aligned");
439  }
440  if(loop_width<=ubound) {
441    BZ_DEBUG_MESSAGE("\tusing vectorization width " << loop_width);
442  } else {
443    BZ_DEBUG_MESSAGE("\texpression not long enough to be vectorized");
444  }
445  } else {
446    BZ_DEBUG_MESSAGE("\texpression cannot be vectorized");
447  }
448#endif
449
450
451  if(!unvectorizable && (loop_width>1)) {
452    // If the expression can be aligned, we do so.
453    if(can_align) {
454#ifdef BZ_DEBUG_TRAVERSE
455      if(i<uneven_start) {
456        BZ_DEBUG_MESSAGE("\tscalar loop for " << uneven_start << " unaligned starting elements");
457      }
458#endif
459#ifdef BZ_USE_ALIGNMENT_PRAGMAS
460#pragma ivdep
461#endif
462      for (; i < uneven_start; ++i)
463//#pragma forceinline recursive
464        T_update::update(data[i], expr.fastRead(i));
465     
466      // and then the vectorized part
467#ifdef BZ_DEBUG_TRAVERSE
468      if(i<=ubound-loop_width) {
469        BZ_DEBUG_MESSAGE("\taligned vectorized loop with width " << loop_width << " starting at " << i);
470      }
471#endif
472      for (; i <= ubound-loop_width; i+=loop_width)
473//#pragma forceinline recursive
474        chunked_updater<T_numtype, T_expr, T_update, loop_width>::
475          aligned_update(data, expr, i);
476    }
477    else {
478      // if we cannot line up the expressions, alignment doesn't
479      // matter and we just start using unaligned vectorized
480      // instructions from element 0
481#ifdef BZ_DEBUG_TRAVERSE
482      if(i<=ubound-loop_width) {
483        BZ_DEBUG_MESSAGE("\tunaligned vectorized loop with width " << loop_width << " starting at " << i);
484      }
485#endif
486      for (; i <= ubound-loop_width; i+=loop_width)
487//#pragma forceinline recursive
488        chunked_updater<T_numtype, T_expr, T_update, loop_width>::
489          unaligned_update(data, expr, i);
490    }
491  }
492
493  // now complete the loop with the trailing scalar elements not done
494  // in the chunked loop.
495#ifdef BZ_DEBUG_TRAVERSE
496  if(i<ubound) {
497    BZ_DEBUG_MESSAGE("\tscalar loop for " << ubound-i << " trailing elements starting at " << i);
498  }
499#endif
500#ifdef BZ_USE_ALIGNMENT_PRAGMAS
501#pragma ivdep
502#endif
503  for (; i < ubound; ++i)
504//#pragma forceinline recursive
505    T_update::update(data[i], expr.fastRead(i));
506
507#ifdef BZ_DEBUG_TRAVERSE
508  BZ_DEBUG_MESSAGE("\tunit stride evaluation done")
509#endif
510}
511
512
513/** Common-stride evaluator. Used for common but non-unit
514    strides. Note that the stride can be negative, so we need to use a
515    signed type. */
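/* A hypothetical case that reaches this path (when
   BZ_ARRAY_EXPR_USE_COMMON_STRIDE is enabled): with Array<double,1>
   A(20), B(20), the assignment

     A(Range(0,18,2)) = B(Range(0,18,2));

   gives both operands a memory stride of 2, so the expression is
   evaluated here with commonStride == 2 rather than falling back to the
   generic different-stride loop. */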
516template<typename T_dest, typename T_expr, typename T_update>
517_bz_forceinline void
518_bz_evaluateWithCommonStride(T_dest& dest, typename T_dest::T_iterator& iter,
519                             T_expr expr, diffType ubound, 
520                             diffType commonStride, 
521                             T_update)
522{
523#ifdef BZ_DEBUG_TRAVERSE
524     BZ_DEBUG_MESSAGE("\tcommon stride = " << commonStride);
525#endif
526
527  typedef typename T_dest::T_numtype T_numtype;
528  T_numtype* restrict data = const_cast<T_numtype*>(iter.data());
529
530#ifndef BZ_ARRAY_STACK_TRAVERSAL_UNROLL
531# ifdef BZ_USE_ALIGNMENT_PRAGMAS
532# pragma ivdep
533# endif
534  for (diffType i=0; i != ubound; i += commonStride)
535    T_update::update(data[i], expr.fastRead(i));
536#else
537  diffType n1 = (dest.length(firstRank) & 3) * commonStride;
538         
539  diffType i = 0;
540  for (; i != n1; i += commonStride)
541    T_update::update(data[i], expr.fastRead(i));
542         
543  diffType strideInc = 4 * commonStride;
544  for (; i != ubound; i += strideInc)
545    {
546      T_update::update(data[i], expr.fastRead(i));
547      diffType i2 = i + commonStride;
548      T_update::update(data[i2], expr.fastRead(i2));
549      diffType i3 = i + 2 * commonStride;
550      T_update::update(data[i3], expr.fastRead(i3));
551      diffType i4 = i + 3 * commonStride;
552      T_update::update(data[i4], expr.fastRead(i4));
553    }
554#endif  // BZ_ARRAY_STACK_TRAVERSAL_UNROLL
555  return;
556}
557
558
559/* 1-d stack traversal evaluation. Forwards to evaluateWithUnitStride
560   or evaluateWithCommonStride, if applicable, otherwise does the slow
561   different-stride update. */
562template<typename T_dest, typename T_expr, typename T_update>
563_bz_forceinline void
564_bz_evaluator<1>::
565evaluateWithStackTraversal(T_dest& dest, T_expr expr, T_update)
566{
567#ifdef BZ_DEBUG_TRAVERSE
568  BZ_DEBUG_MESSAGE("_bz_evaluator<1>: Using stack traversal");
569#endif
570
571  typename T_dest::T_iterator iter(dest);
572
573  // if we only have one element, strides don't matter. In that case,
574  // we just evaluate that right now so we don't have to deal with it.
575  if(dest.length(firstRank)==1) {
576#ifdef BZ_DEBUG_TRAVERSE
577  BZ_DEBUG_MESSAGE("\tshortcutting evaluation of single-element expression");
578#endif
579    T_update::update(*const_cast<typename T_dest::T_numtype*>(iter.data()), *expr);
580    return;
581  }
582
583  iter.loadStride(firstRank);
584  expr.loadStride(firstRank);
585
586  const bool useUnitStride = iter.isUnitStride()
587    && expr.isUnitStride();
588
589  if(useUnitStride) {
590    const diffType ubound = dest.length(firstRank);
591    _bz_evaluateWithUnitStride(dest, iter, expr, ubound, T_update());
592    return;
593  }
594
595#ifdef BZ_ARRAY_EXPR_USE_COMMON_STRIDE
596  diffType commonStride = expr.suggestStride(firstRank);
597  if (iter.suggestStride(firstRank) > commonStride)
598    commonStride = iter.suggestStride(firstRank);
599  bool useCommonStride = iter.isStride(firstRank,commonStride)
600    && expr.isStride(firstRank,commonStride);
601#else
602  diffType commonStride = 1;
603  bool useCommonStride = false;
604#endif
605
606  if (useCommonStride) {
607    const diffType ubound = dest.length(firstRank) * commonStride;
608    _bz_evaluateWithCommonStride(dest, iter, expr, ubound, commonStride, T_update());
609    return;
610  }
611
612#ifdef BZ_DEBUG_TRAVERSE
613  BZ_DEBUG_MESSAGE("\tnot common stride");
614#endif
615
616  // not common stride
617  typedef typename T_dest::T_numtype T_numtype;
618  const T_numtype * last = iter.data() + dest.length(firstRank) 
619    * dest.stride(firstRank);
620     
621  while (iter.data() != last)
622    {
623      T_update::update(*const_cast<T_numtype*>(iter.data()), *expr);
624      iter.advance();
625      expr.advance();
626    }
627}
628
629
630/**
631   Perform a stack traversal of a rank >1 expression. A stack
632   traversal replaces the usual nested loops:
633     
634   for (int i=A.lbound(firstDim); i <= A.ubound(firstDim); ++i)
635     for (int j=A.lbound(secondDim); j <= A.ubound(secondDim); ++j)
636       for (int k=A.lbound(thirdDim); k <= A.ubound(thirdDim); ++k)
637         A(i,j,k) = 0;
638     
639   with a stack data structure.  The stack allows this single routine
640   to replace any number of nested loops.
641     
642   For each dimension (loop), these quantities are needed:
643   - a pointer to the first element encountered in the loop
644   - the stride associated with the dimension/loop
645   - a pointer to the last element encountered in the loop
646   
647   The basic idea is that entering each loop is a "push" onto the
648   stack, and exiting each loop is a "pop".  In practice, this
649   routine accesses the stack in a random-access way,
650   which confuses the picture a bit.  But conceptually, that's
651   what is going on.
652
653   ordering(0) gives the dimension associated with the smallest
654   stride (usually; the exceptions have to do with subarrays and
655   are uninteresting).  We call this dimension maxRank; it will
656   become the innermost "loop".
657     
658   Ordering the loops from ordering(N_rank-1) down to
659   ordering(0) ensures that the largest stride is associated
660   with the outermost loop, and the smallest stride with the
661   innermost.  This is critical for good performance on
662   cached machines.
663*/
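/* Concrete illustration (for a C-style, row-major rank-3 array): the
   third dimension has the smallest stride, so ordering(0) == 2, maxRank
   is thirdDim, and the k loop from the example above becomes the
   innermost loop, while firstDim, with the largest stride, becomes the
   outermost. */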
664
665template<int N>
666template<typename T_dest, typename T_expr, typename T_update>
667_bz_forceinline void
668_bz_evaluator<N>::
669evaluateWithStackTraversal(T_dest& dest, T_expr expr, T_update)
670 {
671#ifdef BZ_DEBUG_TRAVERSE
672   BZ_DEBUG_MESSAGE("_bz_evaluator<" << N << ">: Using stack traversal");
673#endif
674
675   typedef typename T_dest::T_numtype T_numtype;
676   const int N_rank = T_dest::rank();
677
678     const int maxRank = dest.ordering(0);
679     // const int secondLastRank = ordering(1);
680
681     // Create an iterator for the array receiving the result
682     typename T_dest::T_iterator iter(dest);
683
684     // Set the initial stack configuration by pushing the pointer
685     // to the first element of the array onto the stack N times.
686
687     int i;
688     for (i=1; i < N_rank; ++i)
689     {
690         iter.push(i);
691         expr.push(i);
692     }
693
694     // Load the strides associated with the innermost loop.
695     iter.loadStride(maxRank);
696     expr.loadStride(maxRank);
697
698     /*
699      * Is the stride in the innermost loop equal to 1?  If so,
700      * we might take advantage of this and generate more
701      * efficient code.
702      */
703     const bool useUnitStride = iter.isUnitStride()
704                          && expr.isUnitStride();
705
706    /*
707     * Do all array operands share a common stride in the innermost
708     * loop?  If so, we can generate more efficient code (but only
709     * if this optimization has been enabled).
710     */
711#ifdef BZ_ARRAY_EXPR_USE_COMMON_STRIDE
712    diffType commonStride = expr.suggestStride(maxRank);
713    if (iter.suggestStride(maxRank) > commonStride)
714        commonStride = iter.suggestStride(maxRank);
715    bool useCommonStride = iter.isStride(maxRank,commonStride)
716        && expr.isStride(maxRank,commonStride);
717
718#ifdef BZ_DEBUG_TRAVERSE
719    BZ_DEBUG_MESSAGE("BZ_ARRAY_EXPR_USE_COMMON_STRIDE" << endl
720        << "commonStride = " << commonStride << " useCommonStride = "
721        << useCommonStride);
722#endif
723
724#else
725    const diffType commonStride = 1;
726    const bool useCommonStride = false;
727#endif
728
729    /*
730     * The "last" array contains a pointer to the last element
731     * encountered in each "loop".
732     */
733    const T_numtype* last[T_dest::rank_];
734
735    // Set up the initial state of the "last" array
736    for (i=1; i < N_rank; ++i)
737        last[i] = iter.data() + dest.length(dest.ordering(i)) * dest.stride(dest.ordering(i));
738
739    diffType lastLength = dest.length(maxRank);
740    int firstNoncollapsedLoop = 1;
741
742#ifdef BZ_COLLAPSE_LOOPS
743
744    /*
745     * This bit of code handles collapsing loops.  When possible,
746     * the N nested loops are converted into a single loop (basically,
747     * the N-dimensional array is treated as a long vector).
748     * This is important for cases where the length of the innermost
749     * loop is very small, for example a 100x100x3 array.
750     * If this code can't collapse all the loops into a single loop,
751     * it will collapse as many loops as possible starting from the
752     * innermost and working out.
753     */
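    // Worked example: for a contiguous C-ordered 100x100x3 array (the
    // case mentioned above) every pair of adjacent loops can be combined,
    // so lastLength grows from 3 to 300 to 30000 and the whole array is
    // swept by a single inner loop.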
754
755    // Collapse loops when possible
756    for (i=1; i < N_rank; ++i)
757    {
758        // Figure out which pair of loops we are considering combining.
759        int outerLoopRank = iter.ordering(i);
760        int innerLoopRank = iter.ordering(i-1);
761
762        /*
763         * The canCollapse() routines look at the strides and extents
764         * of the loops, and determine if they can be combined into
765         * one loop.
766         */
767
768        if (iter.canCollapse(outerLoopRank,innerLoopRank) 
769          && expr.canCollapse(outerLoopRank,innerLoopRank))
770        {
771#ifdef BZ_DEBUG_TRAVERSE
772            cout << "Collapsing " << outerLoopRank << " and " 
773                 << innerLoopRank << endl;
774#endif
775            lastLength *= dest.length(outerLoopRank);
776            firstNoncollapsedLoop = i+1;
777        }
778        else 
779            break;
780    }
781
782#endif // BZ_COLLAPSE_LOOPS
783
784    /*
785     * Now we actually perform the loops.  This while loop contains
786     * two parts: first, the innermost loop is performed.  Then we
787     * exit the loop, and pop our way down the stack until we find
788     * a loop that isn't completed.  We then restart the inner loops
789     * and push them onto the stack.
790     */
791
792    while (true) {
793
794        /*
795         * This bit of code handles the innermost loop.  It just uses
796         * the separate evaluation functions depending on the
797         * stride. */
798
799      diffType ubound = lastLength * commonStride;
800     
801      if (useUnitStride || useCommonStride) {
802        if(useUnitStride)
803          _bz_evaluateWithUnitStride(dest, iter, expr, ubound, T_update());
804        else 
805          _bz_evaluateWithCommonStride(dest, iter, expr, ubound, commonStride,
806                                       T_update());
807
808        /*
809         * Tidy up for the fact that we haven't actually been
810         * incrementing the iterators in the innermost loop, by
811         * faking it afterward.
812         */
813        iter.advance(lastLength * commonStride);
814        expr.advance(lastLength * commonStride);
815      }
816      else {
817            /*
818             * We don't have a unit stride or common stride in the innermost
819             * loop.  This is going to hurt performance.  Luckily 95% of
820             * the time, we hit the cases above.
821             */
822            T_numtype * restrict end = const_cast<T_numtype*>(iter.data())
823                + lastLength * dest.stride(maxRank);
824
825            while (iter.data() != end)
826            {
827              T_update::update(*const_cast<T_numtype*>(iter.data()), *expr);
828                iter.advance();
829                expr.advance();
830            }
831        }
832
833
834        /*
835         * We just finished the innermost loop.  Now we pop our way down
836         * the stack, until we hit a loop that hasn't completed yet.
837         */ 
838        int j = firstNoncollapsedLoop;
839        for (; j < N_rank; ++j)
840        {
841            // Get the next loop
842            int r = dest.ordering(j);
843
844            // Pop-- this restores the data pointers to the first element
845            // encountered in the loop.
846            iter.pop(j);
847            expr.pop(j);
848
849            // Load the stride associated with this loop, and increment
850            // once.
851            iter.loadStride(r);
852            expr.loadStride(r);
853            iter.advance();
854            expr.advance();
855
856            // If we aren't at the end of this loop, then stop popping.
857            if (iter.data() != last[j])
858                break;
859        }
860
861        // Are we completely done?
862        if (j == N_rank)
863            break;
864
865        // No, so push all the inner loops back onto the stack.
866        for (; j >= firstNoncollapsedLoop; --j)
867        {
868            int r2 = dest.ordering(j-1);
869            iter.push(j);
870            expr.push(j);
871            last[j-1] = iter.data() + dest.length(r2) * dest.stride(r2);
872        }
873
874        // Load the stride for the innermost loop again.
875        iter.loadStride(maxRank);
876        expr.loadStride(maxRank);
877    }
878}
879
880
881template<typename T_dest, typename T_expr, typename T_update>
882_bz_forceinline void
883_bz_evaluator<1>::
884evaluateWithIndexTraversal(T_dest& dest, T_expr expr, T_update)
885{
886  typedef typename T_dest::T_numtype T_numtype;
887
888  TinyVector<int,T_dest::rank_> index;
889
890  if (dest.stride(firstRank) == 1) {
891    T_numtype * restrict iter = dest.data();
892    int last = dest.ubound(firstRank);
893   
894    for (index[0] = dest.lbound(firstRank); index[0] <= last;
895         ++index[0]) {
896      T_update::update(*iter++, expr(index));
897    }
898  }
899  else {
900    typename T_dest::T_iterator iter(dest);
901    iter.loadStride(0);
902    int last = iter.ubound(firstRank);
903   
904    for (index[0] = iter.lbound(firstRank); index[0] <= last;
905         ++index[0]) {
906      T_update::update(*const_cast<T_numtype*>(iter.data()), 
907                       expr(index));
908      iter.advance();
909    }
910  }
911}
912
913  template<int N>
914template<typename T_dest, typename T_expr, typename T_update>
915_bz_forceinline void
916_bz_evaluator<N>::
917evaluateWithIndexTraversal(T_dest& dest, T_expr expr, T_update)
918{
919  typedef typename T_dest::T_numtype T_numtype;
920  const int N_rank = T_dest::rank();
921
922    // Do a stack-type traversal for the destination array and use
923    // index traversal for the source expression
924   
925    const int maxRank = dest.ordering(0);
926
927#ifdef BZ_DEBUG_TRAVERSE
928    const int secondLastRank = dest.ordering(1);
929    cout << "Index traversal: N_rank = " << N_rank << endl;
930    cout << "maxRank = " << maxRank << " secondLastRank = " << secondLastRank
931         << endl;
932    cout.flush();
933#endif
934
935    typename T_dest::T_iterator iter(dest);
936    for (int i=1; i < N_rank; ++i)
937        iter.push(iter.ordering(i));
938
939    iter.loadStride(maxRank);
940
941    TinyVector<int,T_dest::rank_> index, last;
942
943    index = dest.base();
944
945    for (int i=0; i < N_rank; ++i)
946      last(i) = dest.base(i) + dest.length(i);
947
948    // int lastLength = length(maxRank);
949
950    while (true) {
951
952        for (index[maxRank] = dest.base(maxRank); 
953             index[maxRank] < last[maxRank]; 
954             ++index[maxRank])
955        {
956#ifdef BZ_DEBUG_TRAVERSE
957#if 0
958    cout << "(" << index[0] << "," << index[1] << ") " << endl;
959    cout.flush();
960#endif
961#endif
962
963            T_update::update(*const_cast<T_numtype*>(iter.data()), expr(index));
964            iter.advance();
965        }
966
967        int j = 1;
968        for (; j < N_rank; ++j)
969        {
970            iter.pop(dest.ordering(j));
971            iter.loadStride(dest.ordering(j));
972            iter.advance();
973
974            index[dest.ordering(j-1)] = dest.base(dest.ordering(j-1));
975            ++index[dest.ordering(j)];
976            if (index[dest.ordering(j)] != last[dest.ordering(j)])
977                break;
978        }
979
980        if (j == N_rank)
981            break;
982
983        for (; j > 0; --j)
984        {
985            iter.push(dest.ordering(j));
986        }
987        iter.loadStride(maxRank);
988    }
989}
990
991// Fast traversals require <set> from the ISO/ANSI C++ standard library
992
993#ifdef BZ_HAVE_STD
994#ifdef BZ_ARRAY_SPACE_FILLING_TRAVERSAL
995
996template<typename T_dest, typename T_expr, typename T_update>
997_bz_forceinline void
998_bz_evaluateWithFastTraversal(T_dest& dest,     
999                              const TraversalOrder<T_dest::rank_ - 1>& order, 
1000                              T_expr expr, T_update)
1001{
1002  typedef typename T_dest::T_numtype T_numtype;
1003  const int N_rank = T_dest::rank();
1004
1005    const int maxRank = dest.ordering(0);
1006
1007#ifdef BZ_DEBUG_TRAVERSE
1008    const int secondLastRank = dest.ordering(1);
1009    cerr << "maxRank = " << maxRank << " secondLastRank = " << secondLastRank
1010         << endl;
1011#endif
1012
1013    typename T_dest::T_iterator iter(dest);
1014    iter.push(0);
1015    expr.push(0);
1016
1017    bool useUnitStride = iter.isUnitStride(maxRank) 
1018                          && expr.isUnitStride(maxRank);
1019
1020#ifdef BZ_ARRAY_EXPR_USE_COMMON_STRIDE
1021    diffType commonStride = expr.suggestStride(maxRank);
1022    if (iter.suggestStride(maxRank) > commonStride)
1023        commonStride = iter.suggestStride(maxRank);
1024    bool useCommonStride = iter.isStride(maxRank,commonStride)
1025        && expr.isStride(maxRank,commonStride);
1026#else
1027    diffType commonStride = 1;
1028    bool useCommonStride = false;
1029#endif
1030
1031    int lastLength = dest.length(maxRank);
1032
1033    for (int i=0; i < order.length(); ++i)
1034    {
1035        iter.pop(0);
1036        expr.pop(0);
1037
1038#ifdef BZ_DEBUG_TRAVERSE
1039    cerr << "Traversing: " << order[i] << endl;
1040#endif
1041        // Position the iterator at the start of the next column       
1042        for (int j=1; j < N_rank; ++j)
1043        {
1044            iter.loadStride(dest.ordering(j));
1045            expr.loadStride(dest.ordering(j));
1046
1047            int offset = order[i][j-1];
1048            iter.advance(offset);
1049            expr.advance(offset);
1050        }
1051
1052        iter.loadStride(maxRank);
1053        expr.loadStride(maxRank);
1054
1055        // Evaluate the expression along the column
1056
1057        if ((useUnitStride) || (useCommonStride))
1058        {
1059#ifdef BZ_USE_FAST_READ_ARRAY_EXPR
1060            diffType ubound = lastLength * commonStride;
1061            T_numtype* restrict data = const_cast<T_numtype*>(iter.data());
1062
1063            if (commonStride == 1)
1064            {           
1065 #ifndef BZ_ARRAY_FAST_TRAVERSAL_UNROLL
1066                for (diffType i=0; i < ubound; ++i)
1067                    T_update::update(*data++, expr.fastRead(i));
1068 #else
1069                diffType n1 = ubound & 3;
1070                diffType i=0;
1071                for (; i < n1; ++i)
1072                    T_update::update(*data++, expr.fastRead(i));
1073
1074                for (; i < ubound; i += 4)
1075                {
1076                    T_update::update(*data++, expr.fastRead(i));
1077                    T_update::update(*data++, expr.fastRead(i+1));
1078                    T_update::update(*data++, expr.fastRead(i+2));
1079                    T_update::update(*data++, expr.fastRead(i+3));
1080                }
1081 #endif  // BZ_ARRAY_FAST_TRAVERSAL_UNROLL
1082            }
1083 #ifdef BZ_ARRAY_EXPR_USE_COMMON_STRIDE
1084            else {
1085                for (diffType i=0; i < ubound; i += commonStride)
1086                    T_update::update(data[i], expr.fastRead(i));
1087            }
1088 #endif // BZ_ARRAY_EXPR_USE_COMMON_STRIDE
1089
1090            iter.advance(lastLength * commonStride);
1091            expr.advance(lastLength * commonStride);
1092#else   // ! BZ_USE_FAST_READ_ARRAY_EXPR
1093            T_numtype* restrict last = const_cast<T_numtype*>(iter.data()) 
1094                + lastLength * commonStride;
1095
1096            while (iter.data() != last)
1097            {
1098                T_update::update(*const_cast<T_numtype*>(iter.data()), *expr);
1099                iter.advance(commonStride);
1100                expr.advance(commonStride);
1101            }
1102#endif  // BZ_USE_FAST_READ_ARRAY_EXPR
1103
1104        }
1105        else {
1106            // No common stride
1107
1108            T_numtype* restrict last = const_cast<T_numtype*>(iter.data()) 
1109                + lastLength * dest.stride(maxRank);
1110
1111            while (iter.data() != last)
1112            {
1113                T_update::update(*const_cast<T_numtype*>(iter.data()), *expr);
1114                iter.advance();
1115                expr.advance();
1116            }
1117        }
1118    }
1119}
1120
1121#endif // BZ_ARRAY_SPACE_FILLING_TRAVERSAL
1122#endif // BZ_HAVE_STD
1123
1124#ifdef BZ_ARRAY_2D_NEW_STENCIL_TILING
1125
1126#ifdef BZ_ARRAY_2D_STENCIL_TILING
1127
1128// What is the difference between the new and the old 2D stencil tiling?
1129template<typename T_dest, typename T_expr, typename T_update>
1130_bz_forceinline void
1131_bz_evaluateWithTiled2DTraversal(T_dest& dest, T_expr expr, T_update)
1132{
1133  typedef typename T_dest::T_numtype T_numtype;
1134  const int N_rank = T_dest::rank();
1135
1136  typename T_dest::T_iterator iter(dest);
1137
1138    const int minorRank = iter.ordering(0);
1139    const int majorRank = iter.ordering(1);
1140
1141    iter.push(0);
1142    expr.push(0);
1143
1144#ifdef BZ_2D_STENCIL_DEBUG
1145    int count = 0;
1146#endif
1147
1148    bool useUnitStride = iter.isUnitStride(minorRank)
1149                          && expr.isUnitStride(minorRank);
1150
1151#ifdef BZ_ARRAY_EXPR_USE_COMMON_STRIDE
1152    diffType commonStride = expr.suggestStride(minorRank);
1153    if (iter.suggestStride(minorRank) > commonStride)
1154        commonStride = iter.suggestStride(minorRank);
1155    bool useCommonStride = iter.isStride(minorRank,commonStride)
1156        && expr.isStride(minorRank,commonStride);
1157#else
1158    diffType commonStride = 1;
1159    bool useCommonStride = false;
1160#endif
1161
1162    // Determine if a common major stride exists
1163    diffType commonMajorStride = expr.suggestStride(majorRank);
1164    if (iter.suggestStride(majorRank) > commonMajorStride)
1165        commonMajorStride = iter.suggestStride(majorRank);
1166    bool haveCommonMajorStride = iter.isStride(majorRank,commonMajorStride)
1167        && expr.isStride(majorRank,commonMajorStride);
1168
1169
1170    int maxi = dest.length(majorRank);
1171    int maxj = dest.length(minorRank);
1172
1173    const int tileHeight = 16, tileWidth = 3;
1174
1175    int bi, bj;
1176    for (bi=0; bi < maxi; bi += tileHeight)
1177    {
1178        int ni = bi + tileHeight;
1179        if (ni > maxi)
1180            ni = maxi;
1181
1182        // Move back to the beginning of the array
1183        iter.pop(0);
1184        expr.pop(0);
1185
1186        // Move to the start of this tile row
1187        iter.loadStride(majorRank);
1188        iter.advance(bi);
1189        expr.loadStride(majorRank);
1190        expr.advance(bi);
1191
1192        // Save this position
1193        iter.push(1);
1194        expr.push(1);
1195
1196        for (bj=0; bj < maxj; bj += tileWidth)
1197        {
1198            // Move to the beginning of the tile row
1199            iter.pop(1);
1200            expr.pop(1);
1201
1202            // Move to the top of the current tile (bi,bj)
1203            iter.loadStride(minorRank);
1204            iter.advance(bj);
1205            expr.loadStride(minorRank);
1206            expr.advance(bj);
1207
1208            if (bj + tileWidth <= maxj)
1209            {
1210                // Strip mining
1211
1212                if ((useUnitStride) && (haveCommonMajorStride))
1213                {
1214                    diffType offset = 0;
1215                    T_numtype* restrict data = const_cast<T_numtype*>
1216                        (iter.data());
1217
1218                    for (int i=bi; i < ni; ++i)
1219                    {
1220                        _bz_typename T_expr::T_numtype tmp1, tmp2, tmp3;
1221
1222                        // Common subexpression elimination -- compilers
1223                        // won't necessarily do this on their own.
1224                        diffType t1 = offset+1;
1225                        diffType t2 = offset+2;
1226
1227                        tmp1 = expr.fastRead(offset);
1228                        tmp2 = expr.fastRead(t1);
1229                        tmp3 = expr.fastRead(t2);
1230
1231                        T_update::update(data[0], tmp1);
1232                        T_update::update(data[1], tmp2);
1233                        T_update::update(data[2], tmp3);
1234
1235                        offset += commonMajorStride;
1236                        data += commonMajorStride;
1237
1238#ifdef BZ_2D_STENCIL_DEBUG
1239    count += 3;
1240#endif
1241                    }
1242                }
1243                else {
1244
1245                    for (int i=bi; i < ni; ++i)
1246                    {
1247                        iter.loadStride(minorRank);
1248                        expr.loadStride(minorRank);
1249
1250                        // Loop through current row elements
1251                        T_update::update(*const_cast<T_numtype*>(iter.data()),
1252                            *expr);
1253                        iter.advance();
1254                        expr.advance();
1255
1256                        T_update::update(*const_cast<T_numtype*>(iter.data()),
1257                            *expr);
1258                        iter.advance();
1259                        expr.advance();
1260
1261                        T_update::update(*const_cast<T_numtype*>(iter.data()),
1262                            *expr);
1263                        iter.advance(-2);
1264                        expr.advance(-2);
1265
1266                        iter.loadStride(majorRank);
1267                        expr.loadStride(majorRank);
1268                        iter.advance();
1269                        expr.advance();
1270
1271#ifdef BZ_2D_STENCIL_DEBUG
1272    count += 3;
1273#endif
1274
1275                    }
1276                }
1277            }
1278            else {
1279
1280                // This code handles partial tiles at the bottom of the
1281                // array.
1282
1283                for (int j=bj; j < maxj; ++j)
1284                {
1285                    iter.loadStride(majorRank);
1286                    expr.loadStride(majorRank);
1287
1288                    for (int i=bi; i < ni; ++i)
1289                    {
1290                        T_update::update(*const_cast<T_numtype*>(iter.data()),
1291                            *expr);
1292                        iter.advance();
1293                        expr.advance();
1294#ifdef BZ_2D_STENCIL_DEBUG
1295    ++count;
1296#endif
1297
1298                    }
1299
1300                    // Move back to the top of this column
1301                    iter.advance(bi-ni);
1302                    expr.advance(bi-ni);
1303
1304                    // Move over to the next column
1305                    iter.loadStride(minorRank);
1306                    expr.loadStride(minorRank);
1307
1308                    iter.advance();
1309                    expr.advance();
1310                }
1311            }
1312        }
1313    }
1314
1315#ifdef BZ_2D_STENCIL_DEBUG
1316    cout << "BZ_2D_STENCIL_DEBUG: count = " << count << endl;
1317#endif
1318}
1319
1320#endif // BZ_ARRAY_2D_STENCIL_TILING
1321#endif // BZ_ARRAY_2D_NEW_STENCIL_TILING
1322
1323
1324
1325#ifndef BZ_ARRAY_2D_NEW_STENCIL_TILING
1326
1327#ifdef BZ_ARRAY_2D_STENCIL_TILING
1328
1329// What is the difference between the new and the old 2D stencil tiling?
1330template<typename T_dest, typename T_expr, typename T_update>
1331_bz_forceinline void
1332_bz_evaluateWithTiled2DTraversal(T_dest& dest, T_expr expr, T_update)
1333{
1334  typedef typename T_dest::T_numtype T_numtype;
1335
1336    typename T_dest::T_iterator iter(dest);
1337
1338    const int minorRank = iter.ordering(0);
1339    const int majorRank = iter.ordering(1);
1340
1341    const int blockSize = 16;
1342   
1343    iter.push(0);
1344    expr.push(0);
1345
1346    bool useUnitStride = iter.isUnitStride(minorRank)
1347                          && expr.isUnitStride(minorRank);
1348
1349#ifdef BZ_ARRAY_EXPR_USE_COMMON_STRIDE
1350    diffType commonStride = expr.suggestStride(minorRank);
1351    if (iter.suggestStride(minorRank) > commonStride)
1352        commonStride = iter.suggestStride(minorRank);
1353    bool useCommonStride = iter.isStride(minorRank,commonStride)
1354        && expr.isStride(minorRank,commonStride);
1355#else
1356    diffType commonStride = 1;
1357    bool useCommonStride = false;
1358#endif
1359
1360    int maxi = dest.length(majorRank);
1361    int maxj = dest.length(minorRank);
1362
1363    int bi, bj;
1364    for (bi=0; bi < maxi; bi += blockSize)
1365    {
1366        int ni = bi + blockSize;
1367        if (ni > maxi)
1368            ni = maxi;
1369
1370        for (bj=0; bj < maxj; bj += blockSize)
1371        {
1372            int nj = bj + blockSize;
1373            if (nj > maxj)
1374                nj = maxj;
1375
1376            // Move to the beginning of the array
1377            iter.pop(0);
1378            expr.pop(0);
1379
1380            // Move to the beginning of the tile (bi,bj)
1381            iter.loadStride(majorRank);
1382            iter.advance(bi);
1383            iter.loadStride(minorRank);
1384            iter.advance(bj);
1385
1386            expr.loadStride(majorRank);
1387            expr.advance(bi);
1388            expr.loadStride(minorRank);
1389            expr.advance(bj);
1390
1391            // Loop through tile rows
1392            for (int i=bi; i < ni; ++i)
1393            {
1394                // Save the beginning of this tile row
1395                iter.push(1);
1396                expr.push(1);
1397
1398                // Load the minor stride
1399                iter.loadStride(minorRank);
1400                expr.loadStride(minorRank);
1401
1402                if (useUnitStride)
1403                {
1404                    T_numtype* restrict data = const_cast<T_numtype*>
1405                        (iter.data());
1406
1407                    int ubound = (nj-bj);
1408                    for (int j=0; j < ubound; ++j)
1409                        T_update::update(*data++, expr.fastRead(j));
1410                }
1411#ifdef BZ_ARRAY_EXPR_USE_COMMON_STRIDE
1412                else if (useCommonStride)
1413                {
1414                    const diffType ubound = (nj-bj) * commonStride;
1415                    T_numtype* restrict data = const_cast<T_numtype*>
1416                        (iter.data());
1417
1418                    for (diffType j=0; j < ubound; j += commonStride)
1419                        T_update::update(data[j], expr.fastRead(j));
1420                }
1421#endif
1422                else {
1423                    for (int j=bj; j < nj; ++j)
1424                    {
1425                        // Loop through current row elements
1426                        T_update::update(*const_cast<T_numtype*>(iter.data()), 
1427                                         *expr);
1428                        iter.advance();
1429                        expr.advance();
1430                    }
1431                }
1432
1433                // Move back to the beginning of the tile row, then
1434                // move to the next row
1435                iter.pop(1);
1436                iter.loadStride(majorRank);
1437                iter.advance(1);
1438
1439                expr.pop(1);
1440                expr.loadStride(majorRank);
1441                expr.advance(1);
1442            }
1443        }
1444    }
1445}
1446#endif // BZ_ARRAY_2D_STENCIL_TILING
1447#endif // BZ_ARRAY_2D_NEW_STENCIL_TILING
1448
1449BZ_NAMESPACE_END
1450
1451#endif // BZ_GLOBEVAL_CC
1452