source: XIOS/dev/dev_olga/src/extern/blitz/include/blitz/globeval.cc @ 1022

1/***************************************************************************
2 * blitz/globeval.cc  Evaluate expression and assign to an array.
3 *
4 * $Id$
5 *
6 * Copyright (C) 1997-2011 Todd Veldhuizen <tveldhui@acm.org>
7 *
8 * This file is a part of Blitz.
9 *
10 * Blitz is free software: you can redistribute it and/or modify
11 * it under the terms of the GNU Lesser General Public License
12 * as published by the Free Software Foundation, either version 3
13 * of the License, or (at your option) any later version.
14 *
15 * Blitz is distributed in the hope that it will be useful,
16 * but WITHOUT ANY WARRANTY; without even the implied warranty of
17 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
18 * GNU Lesser General Public License for more details.
19 *
20 * You should have received a copy of the GNU Lesser General Public
21 * License along with Blitz.  If not, see <http://www.gnu.org/licenses/>.
22 *
23 * Suggestions:          blitz-devel@lists.sourceforge.net
24 * Bugs:                 blitz-support@lists.sourceforge.net   
25 *
26 * For more information, please see the Blitz++ Home Page:
27 *    https://sourceforge.net/projects/blitz/
28 *
29 ****************************************************************************/
30#ifndef BZ_GLOBEVAL_CC
31#define BZ_GLOBEVAL_CC
32
33#include <blitz/ranks.h>
34#include <blitz/tvevaluate.h>
35#include <blitz/blitz.h>
36
37BZ_NAMESPACE(blitz)
38
39
40// Fast traversals require <set> from the ISO/ANSI C++ standard library
41#ifdef BZ_HAVE_STD
42#ifdef BZ_ARRAY_SPACE_FILLING_TRAVERSAL
43
44
45/** _bz_tryFastTraversal is a helper class.  Fast traversals are only
46    attempted if the expression looks like a stencil -- it's at least
47    three-dimensional, has more than six array operands, and there are
48    no index placeholders in the expression.  These are all things
49    which can be checked at compile time, so the if()/else() syntax
50    has been replaced with this class template. 
51*/
52template<bool canTryFastTraversal>
53struct _bz_tryFastTraversal {
54    template<typename T_numtype, int N_rank, typename T_expr, typename T_update>
55    static bool tryFast(Array<T_numtype,N_rank>& array, 
56        T_expr expr, T_update)
57    {
58        return false;
59    }
60};
61
62template<>
63struct _bz_tryFastTraversal<true> {
64    template<typename T_numtype, int N_rank, typename T_expr, typename T_update>
65    static bool tryFast(Array<T_numtype,N_rank>& array, 
66        T_expr expr, T_update)
67    {
68        // See if there's an appropriate space filling curve available.
69        // Currently fast traversals use an N-1 dimensional curve.  The
70        // Nth dimension column corresponding to each point on the curve
71        // is traversed in the normal fashion.
72        TraversalOrderCollection<N_rank-1> traversals;
73        TinyVector<int, N_rank - 1> traversalGridSize;
74
75        for (int i=0; i < N_rank - 1; ++i)
76            traversalGridSize[i] = array.length(array.ordering(i+1));
77
78#ifdef BZ_DEBUG_TRAVERSE
79cout << "traversalGridSize = " << traversalGridSize << endl;
80cout.flush();
81#endif
82
83        const TraversalOrder<N_rank-1>* order =
84            traversals.find(traversalGridSize);
85
86        if (order)
87        {
88#ifdef BZ_DEBUG_TRAVERSE
89    cerr << "Array<" << BZ_DEBUG_TEMPLATE_AS_STRING_LITERAL(T_numtype)
90         << ", " << N_rank << ">: Using stack traversal" << endl;
91#endif
92            // A curve was available -- use fast traversal.
93            array.evaluateWithFastTraversal(*order, expr, T_update());
94            return true;
95        }
96
97        return false;
98    }
99};
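/* Illustrative sketch (only meaningful when BZ_ARRAY_SPACE_FILLING_TRAVERSAL
   is enabled; the grid size N below is hypothetical): before a rank-3
   stencil expression can take this path, a traversal order for the two
   outer dimensions must have been generated, e.g.

     generateFastTraversalOrder(TinyVector<int,2>(N, N));

   Otherwise traversals.find() returns a null pointer and tryFast() simply
   returns false, so evaluation falls back to a stack traversal. */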
100#endif // BZ_ARRAY_SPACE_FILLING_TRAVERSAL
101#endif // BZ_HAVE_STD
102
103
104/** Helper class that implements the evaluation routines for different
105    ranks. */
106template<int N> struct _bz_evaluator {
107  template<typename T_dest, typename T_expr, typename T_update>
108  static void evaluateWithStackTraversal(T_dest&, T_expr, T_update);
109  template<typename T_dest, typename T_expr, typename T_update>
110  static void evaluateWithIndexTraversal(T_dest&, T_expr, T_update);
111};
112template<> struct _bz_evaluator<1> {
113  template<typename T_dest, typename T_expr, typename T_update>
114  static void evaluateWithStackTraversal(T_dest&, T_expr, T_update);
115  template<typename T_dest, typename T_expr, typename T_update>
116  static void evaluateWithIndexTraversal(T_dest&, T_expr, T_update);
117};
118
119/**
120  Assign an expression to a container.  For performance reasons, this
121  function forwards to functions implementing one of several traversal
122  mechanisms:
123 
124  - Index traversal scans through the destination array in storage order.
125    The expression is evaluated using a TinyVector<int,N> operand.  This
126    version is used only when there are index placeholders in the expression
127    (see <blitz/indexexpr.h>)
128  - Stack traversal also scans through the destination array in storage
129    order.  However, push/pop stack iterators are used.
130  - Fast traversal follows a Hilbert (or other) space-filling curve to
131    improve cache reuse for stencilling operations.  Currently, the
132    space filling curves must be generated by calling
133    generateFastTraversalOrder(TinyVector<int,N_dimensions>).
134  - 2D tiled traversal scans the destination in small tiles, to improve
135    cache reuse for 2D stencils.  Space filling curves have too much
136    overhead to use in two dimensions.
137 */
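/* For illustration (a sketch, not part of this header; the array and
   placeholder names are hypothetical): with

     Array<double,2> A(16,16), B(16,16);
     firstIndex i;  secondIndex j;

     A = B + 2.0;        // no index placeholders -> stack traversal
     A = i * 16.0 + j;   // index placeholders    -> index traversal

   both assignments funnel into _bz_evaluate(), which dispatches on
   T_expr::numIndexPlaceholders and the stencil heuristics below. */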
138template<typename T_dest, typename T_expr, typename T_update>
139_bz_forceinline void
140_bz_evaluate(T_dest& dest, T_expr expr, T_update)
141{
142  typedef typename T_dest::T_numtype T_numtype;
143  const int N_rank = T_dest::rank_;
144
145    // Check that all arrays have the same shape
146#ifdef BZ_DEBUG
147    if (!expr.shapeCheck(dest.shape()))
148    {
149      if (assertFailMode == false)
150      {
151        cerr << "[Blitz++] Shape check failed: Module " << __FILE__
152             << " line " << __LINE__ << endl
153             << "          Expression: ";
154        prettyPrintFormat format(true);   // Use terse formatting
155        BZ_STD_SCOPE(string) str;
156        expr.prettyPrint(str, format);
157        cerr << str << endl ;
158      }
159
160#if 0
161// Shape dumping is broken by change to using string for prettyPrint
162             << "          Shapes: " << shape() << " = ";
163        prettyPrintFormat format2;
164        format2.setDumpArrayShapesMode();
165        expr.prettyPrint(cerr, format2);
166        cerr << endl;
167#endif
168        BZ_PRE_FAIL;
169    }
170#endif
171
172    BZPRECHECK(expr.shapeCheck(dest.shape()),
173        "Shape check failed." << endl << "Expression:");
174
175    BZPRECHECK((T_expr::rank_ == T_dest::rank_) || 
176               (T_expr::numArrayOperands == 0), 
177               "Assigned rank " << T_expr::rank_ << " expression to rank " 
178               << T_dest::rank_ << " array.");
179
180    /*
181     * Check that the arrays are not empty (e.g. length 0 arrays)
182     * This fixes a bug found by Peter Bienstman, 6/16/99, where
183     * Array<double,2> A(0,0),B(0,0); B=A(tensor::j,tensor::i);
184     * went into an infinite loop.
185     */
186
187    const sizeType n = dest.numElements();
188    if (n == 0) {
189#ifdef BZ_DEBUG_TRAVERSE
190      BZ_DEBUG_MESSAGE("Evaluating empty array, nothing to do");
191#endif
192      return;
193    }
194    // \todo this does not always compile, so eliminate for now.
195    // if (n == 1) {
196    //   // shortcut here since it's easy
197    //   T_update::update(*dest.dataFirst(), expr(expr.lbound()));
198    //   return;
199    // }
200
201#ifdef BZ_DEBUG_TRAVERSE
202    BZ_DEBUG_MESSAGE( "T_expr::numIndexPlaceholders = " << T_expr::numIndexPlaceholders);
203#endif
204
205    // Tau profiling code.  Provide Tau with a pretty-printed version of
206    // the expression.
207    // NEEDS_WORK-- use a static initializer somehow.
208
209#ifdef BZ_TAU_PROFILING
210    static BZ_STD_SCOPE(string) exprDescription;
211    if (!exprDescription.length())   // faked static initializer
212    {
213        exprDescription = "A";
214        prettyPrintFormat format(true);   // Terse mode on
215        format.nextArrayOperandSymbol();
216        T_update::prettyPrint(exprDescription);
217        expr.prettyPrint(exprDescription, format);
218    }
219    TAU_PROFILE(" ", exprDescription, TAU_BLITZ);
220#endif
221
222    // Determine which evaluation mechanism to use
223    if (T_expr::numIndexPlaceholders > 0)
224    {
225        // The expression involves index placeholders, so have to
226        // use index traversal rather than stack traversal.
227
228      _bz_evaluator<T_dest::rank_>::evaluateWithIndexTraversal(dest, expr, T_update());
229      return;
230    }
231    else {
232
233        // If this expression looks like an array stencil, then attempt to
234        // use a fast traversal order.
235        // Fast traversals require <set> from the ISO/ANSI C++ standard
236        // library.
237
238#ifdef BZ_HAVE_STD
239#ifdef BZ_ARRAY_SPACE_FILLING_TRAVERSAL
240
241        enum { isStencil = (N_rank >= 3) && (T_expr::numArrayOperands > 6)
242            && (T_expr::numIndexPlaceholders == 0) };
243
244        if (_bz_tryFastTraversal<isStencil>::tryFast(dest, expr, T_update()))
245            return;
246
247#endif
248#endif
249
250#ifdef BZ_ARRAY_2D_STENCIL_TILING
251        // Does this look like a 2-dimensional stencil on a largeish
252        // array?
253
254        if ((N_rank == 2) && (T_expr::numArrayOperands >= 5))
255        {
256            // Use a heuristic to determine whether a tiled traversal
257            // is desirable.  First, estimate how much L1 cache is needed
258            // to achieve a high hit rate using the stack traversal.
259            // Try to err on the side of using tiled traversal even when
260            // it isn't strictly needed.
261
262            // Assumptions:
263            //    Stencil width 3
264            //    3 arrays involved in stencil
265            //    Uniform data type in arrays (all T_numtype)
266           
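            // Worked example of this heuristic (numbers are illustrative):
            // for a 1000x1000 Array<double,2>, cacheNeeded is
            // 3 * 3 * 8 * 1000 = 72000 bytes, which exceeds typical L1
            // estimates of a few tens of kilobytes, so the tiled traversal
            // below would be chosen.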
267            int cacheNeeded = 3 * 3 * sizeof(T_numtype) * dest.length(dest.ordering(0));
268            if (cacheNeeded > BZ_L1_CACHE_ESTIMATED_SIZE) {
269              _bz_evaluateWithTiled2DTraversal(dest, expr, T_update());
270                return;
271            }
272        }
273
274#endif
275
276        // If fast traversal isn't available or appropriate, then just
277        // do a stack traversal.
278//#pragma forceinline recursive
279        _bz_evaluator<T_dest::rank_>::evaluateWithStackTraversal(dest, expr, T_update());
280        return;
281    }
282}
283
284/** This class performs the vectorized update through the update()
285    method. It is a class because it is specialized to do nothing for
286    instances where the simd vector width is 1. This avoids tricky
287    infinite template recursions on multicomponent containers. */
288template<typename T_numtype, typename T_expr, typename T_update, int N>
289struct chunked_updater {
290
291  static _bz_forceinline void
292  aligned_update(T_numtype* data, T_expr expr, diffType i) {
293
294    const bool unroll = N < BZ_TV_EVALUATE_UNROLL_LENGTH;
295    _tv_evaluator<unroll, N>::evaluate_aligned
296      (data+i, expr.template fastRead_tv<N>(i), T_update());
297  };
298
299  static _bz_forceinline void
300  unaligned_update(T_numtype* data, T_expr expr, diffType i) {
301    const bool unroll = N < BZ_TV_EVALUATE_UNROLL_LENGTH;
302    _tv_evaluator<unroll, N>::evaluate_unaligned
303      (data+i, expr.template fastRead_tv<N>(i), T_update());
304  };
305
306};
307
308/** This specialization ensures we don't try to instantiate chunked_updater
309    for types with a vecWidth of 1, as this leads to infinite template
310    instantiation recursion. */
311template<typename T_numtype, typename T_expr, typename T_update>
312struct chunked_updater<T_numtype, T_expr, T_update, 1> {
313  static _bz_forceinline void 
314  aligned_update(T_numtype* data, T_expr expr, diffType i) {
315    BZPRECONDITION(0); };
316  static _bz_forceinline void
317  unaligned_update(T_numtype* data, T_expr expr, diffType i) {
318    BZPRECONDITION(0); };
319};
320
321
322/** A metaprogram that uses the chunked_updater to assign an
323    unknown-length expression to a pointer by unrolling in a binary
324    fashion. This way, we can get "almost-compile-time" unrolling of a
325    length only known at runtime. I+1 is the number of significant
326    bits in the longest length to consider. In this way, assigning a
327    vector of length 7 corresponds to I=2 and takes 3 operations. The
328    metaprogram counts down so that it starts with the large updates,
329    which will be aligned for aligned expressions. */
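/* Worked example (illustrative): for ubound == 7 (binary 111), the bits
   above bit 2 are clear, so the recursion performs a chunked update of
   width 4 at position 0, one of width 2 at position 4, and a final scalar
   update of element 6 -- three operations in total, as described above. */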
330template<int I> 
331class _bz_meta_binaryAssign {
332public:
333    template<typename T_data, typename T_expr, typename T_update>
334    static _bz_forceinline void assign(T_data* data, T_expr expr,
335                                       diffType ubound, diffType pos, 
336                                       T_update) {
337      if(ubound&(1<<I)) {
338        chunked_updater<T_data, T_expr, T_update, 1<<I >::
339          unaligned_update(data, expr, pos); 
340        pos += (1<<I);
341      }
342      _bz_meta_binaryAssign<I-1>::assign(data, expr, ubound, pos, T_update());
343      }
344       
345};
346
347/** Partial specialization for bit 0 uses the scalar update. */
348template<> 
349class _bz_meta_binaryAssign<0> {
350public:
351    template<typename T_data, typename T_expr, typename T_update>
352    static _bz_forceinline void assign(T_data* data, T_expr expr,
353                                       diffType ubound, diffType pos, 
354                                       T_update) {
355      if(ubound&1) {
356        T_update::update(data[pos], expr.fastRead(pos));
357        ++pos;
358      }
359      // this ends the metaprogram.
360    }
361};
362
363/** Unit-stride evaluator, which takes pre-computed destination and
364    bounds and just does the unit-stride evaluation for a specified
365    length. This is essentially a 1D operation used by both the rank-1
366    and rank-N traversals, so it's common to both. This can use
367    vectorized update, so if both dest and expr are unit stride, we
368    redirect here. This function then deals with unaligned or
369    misaligned situations. There is no explicit unrolling option here,
370    since it's already vectorized using the chunked_updater. \todo Would
371    it be useful to retain the unrolled loop for scalar
372    architectures?  */
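/* Illustrative walk-through (numbers are hypothetical, and assume the
   expression is vectorizable, can be aligned, and is long enough to skip
   the short-expression unroll): with loop_width == 8, a destination that
   starts 2 elements before an alignment boundary and ubound == 37, the
   aligned branch below performs 2 scalar prologue updates, 4 chunked
   updates of 8 elements each (positions 2, 10, 18 and 26), and 3 scalar
   updates for the trailing elements 34..36. */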
373template<typename T_dest, typename T_expr, typename T_update>
374_bz_forceinline void
375_bz_evaluateWithUnitStride(T_dest& dest, typename T_dest::T_iterator& iter,
376                           T_expr expr, diffType ubound, T_update)
377{
378  typedef typename T_dest::T_numtype T_numtype;
379  T_numtype* restrict data = const_cast<T_numtype*>(iter.data());
380  diffType i=0;
381
382#ifdef BZ_DEBUG_TRAVERSE
383  BZ_DEBUG_MESSAGE("\tunit stride expression with length: "<< ubound << ".");
384#endif
385
386  // If the minWidth is set to 0, there are elements in the expression
387  // which can NOT use the vectorized expression (i.e., stencils). In
388  // that case, we fall through to the scalar loop
389  const bool unvectorizable = (T_expr::minWidth==0);
390
391  if(!unvectorizable && (ubound < 1<<BZ_MAX_BITS_FOR_BINARY_UNROLL)) {
392    // for short expressions, it's more important to minimize
393    // overhead. Single-element ones have already been dealt with, but
394    // for lengths that have fewer significant bits than
395    // BZ_MAX_BITS_FOR_BINARY_UNROLL we do a binary-style unroll here. (We don't
396    // worry about simd widths either, because we essentially just
397    // present the compiler with a vectorizable view. It will do
398    // sensible things even if the expressions are not vectorizable.)
399#ifdef BZ_DEBUG_TRAVERSE
400    BZ_DEBUG_MESSAGE("\tshort expression, using binary meta-unroll assignment.");
401#endif
402
403    _bz_meta_binaryAssign<BZ_MAX_BITS_FOR_BINARY_UNROLL-1>::
404      assign(data, expr, ubound, 0, T_update());
405    return;
406  }
407
408  // calculate uneven elements at the beginning of dest
409  const diffType uneven_start=simdTypes<T_numtype>::offsetToAlignment(data);
410
411  // we can only guarantee alignment if all operands have the same
412  // width and are not mutually misaligned
413  const bool can_align = 
414    (T_expr::minWidth == T_expr::maxWidth) &&
415    (T_expr::minWidth == int(simdTypes<T_numtype>::vecWidth)) &&
416    expr.isVectorAligned(uneven_start);
417
418  // When we come out here, we KNOW that expressions shorter than
419  // 1<<BZ_MAX_BITS_FOR_BINARY_UNROLL have been taken care of. At that
420  // point, it is efficient to effectively unroll the loop using a
421  // vector width larger than the simd width.
422  const int loop_width= BZ_VECTORIZED_LOOP_WIDTH;
423
424#ifdef BZ_DEBUG_TRAVERSE
425  if(T_expr::minWidth!=T_expr::maxWidth) {
426    BZ_DEBUG_MESSAGE("\texpression has mixed width: " << T_expr::minWidth << "-" <<T_expr::maxWidth);
427  } else {
428    BZ_DEBUG_MESSAGE("\texpression SIMD width: " << T_expr::minWidth);
429  }
430  BZ_DEBUG_MESSAGE("\tdestination SIMD width: " << simdTypes<T_numtype>::vecWidth);
431  if(loop_width>1) {
432  if(!expr.isVectorAligned(uneven_start)) {
433    BZ_DEBUG_MESSAGE("\toperands have different alignments");
434  }
435  if(!can_align) {
436    BZ_DEBUG_MESSAGE("\tcannot guarantee alignment - using unaligned vectorization");
437  } else {
438    BZ_DEBUG_MESSAGE("\texpression can be aligned");
439  }
440  if(loop_width<=ubound) {
441    BZ_DEBUG_MESSAGE("\tusing vectorization width " << loop_width);
442  } else {
443    BZ_DEBUG_MESSAGE("\texpression not long enough to be vectorized");
444  }
445  } else {
446    BZ_DEBUG_MESSAGE("\texpression cannot be vectorized");
447  }
448#endif
449
450
451  if(!unvectorizable && (loop_width>1)) {
452    // If the expression can be aligned, we do so.
453    if(can_align) {
454#ifdef BZ_DEBUG_TRAVERSE
455      if(i<uneven_start) {
456        BZ_DEBUG_MESSAGE("\tscalar loop for " << uneven_start << " unaligned starting elements");
457      }
458#endif
459#ifdef BZ_USE_ALIGNMENT_PRAGMAS
460#pragma ivdep
461#endif
462      for (; i < uneven_start; ++i)
463//#pragma forceinline recursive
464        T_update::update(data[i], expr.fastRead(i));
465     
466      // and then the vectorized part
467#ifdef BZ_DEBUG_TRAVERSE
468      if(i<=ubound-loop_width) {
469        BZ_DEBUG_MESSAGE("\taligned vectorized loop with width " << loop_width << " starting at " << i);
470      }
471#endif
472      for (; i <= ubound-loop_width; i+=loop_width)
473//#pragma forceinline recursive
474        chunked_updater<T_numtype, T_expr, T_update, loop_width>::
475          aligned_update(data, expr, i);
476    }
477    else {
478      // if we cannot line up the expressions, alignment doesn't
479      // matter and we just start using unaligned vectorized
480      // instructions from element 0
481#ifdef BZ_DEBUG_TRAVERSE
482      if(i<=ubound-loop_width) {
483        BZ_DEBUG_MESSAGE("\tunaligned vectorized loop with width " << loop_width << " starting at " << i);
484      }
485#endif
486      for (; i <= ubound-loop_width; i+=loop_width)
487//#pragma forceinline recursive
488        chunked_updater<T_numtype, T_expr, T_update, loop_width>::
489          unaligned_update(data, expr, i);
490    }
491  }
492
493  // now complete the loop with the trailing scalar elements not done
494  // in the chunked loop.
495#ifdef BZ_DEBUG_TRAVERSE
496  if(i<ubound) {
497    BZ_DEBUG_MESSAGE("\tscalar loop for " << ubound-i << " trailing elements starting at " << i);
498  }
499#endif
500#ifdef BZ_USE_ALIGNMENT_PRAGMAS
501#pragma ivdep
502#endif
503  for (; i < ubound; ++i)
504//#pragma forceinline recursive
505    T_update::update(data[i], expr.fastRead(i));
506
507#ifdef BZ_DEBUG_TRAVERSE
508  BZ_DEBUG_MESSAGE("\tunit stride evaluation done")
509#endif
510}
511
512
513/** Common-stride evaluator. Used for common but non-unit
514    strides. Note that the stride can be negative, so we need to use a
515    signed type. */
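/* A hypothetical case that reaches this path (when
   BZ_ARRAY_EXPR_USE_COMMON_STRIDE is enabled): with Array<double,1>
   A(20), B(20), the assignment

     A(Range(0,18,2)) = B(Range(0,18,2));

   gives both operands a memory stride of 2, so the expression is
   evaluated here with commonStride == 2 rather than falling back to the
   generic different-stride loop. */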
516template<typename T_dest, typename T_expr, typename T_update>
517_bz_forceinline void
518_bz_evaluateWithCommonStride(T_dest& dest, typename T_dest::T_iterator& iter,
519                             T_expr expr, diffType ubound, 
520                             diffType commonStride, 
521                             T_update)
522{
523#ifdef BZ_DEBUG_TRAVERSE
524     BZ_DEBUG_MESSAGE("\tcommon stride = " << commonStride);
525#endif
526
527  typedef typename T_dest::T_numtype T_numtype;
528  T_numtype* restrict data = const_cast<T_numtype*>(iter.data());
529
530#ifndef BZ_ARRAY_STACK_TRAVERSAL_UNROLL
531# ifdef BZ_USE_ALIGNMENT_PRAGMAS
532# pragma ivdep
533# endif
534  for (diffType i=0; i != ubound; i += commonStride)
535    T_update::update(data[i], expr.fastRead(i));
536#else
537  diffType n1 = (dest.length(firstRank) & 3) * commonStride;
538         
539  diffType i = 0;
540  for (; i != n1; i += commonStride)
541    T_update::update(data[i], expr.fastRead(i));
542         
543  diffType strideInc = 4 * commonStride;
544  for (; i != ubound; i += strideInc)
545    {
546      T_update::update(data[i], expr.fastRead(i));
547      diffType i2 = i + commonStride;
548      T_update::update(data[i2], expr.fastRead(i2));
549      diffType i3 = i + 2 * commonStride;
550      T_update::update(data[i3], expr.fastRead(i3));
551      diffType i4 = i + 3 * commonStride;
552      T_update::update(data[i4], expr.fastRead(i4));
553    }
554#endif  // BZ_ARRAY_STACK_TRAVERSAL_UNROLL
555  return;
556}
557
558
559/* 1-d stack traversal evaluation. Forwards to evaluateWithUnitStride
560   or evaluateWithCommonStride, if applicable, otherwise does the slow
561   different-stride update. */
562template<typename T_dest, typename T_expr, typename T_update>
563_bz_forceinline void
564_bz_evaluator<1>::
565evaluateWithStackTraversal(T_dest& dest, T_expr expr, T_update)
566{
567#ifdef BZ_DEBUG_TRAVERSE
568  BZ_DEBUG_MESSAGE("_bz_evaluator<1>: Using stack traversal");
569#endif
570
571  typename T_dest::T_iterator iter(dest);
572
573  // if we only have one element, strides don't matter. In that case,
574  // we just evaluate that right now so we don't have to deal with it.
575  if(dest.length(firstRank)==1) {
576#ifdef BZ_DEBUG_TRAVERSE
577  BZ_DEBUG_MESSAGE("\tshortcutting evaluation of single-element expression");
578#endif
579    T_update::update(*const_cast<typename T_dest::T_numtype*>(iter.data()), *expr);
580    return;
581  }
582
583  iter.loadStride(firstRank);
584  expr.loadStride(firstRank);
585
586  const bool useUnitStride = iter.isUnitStride()
587    && expr.isUnitStride();
588
589  if(useUnitStride) {
590    const diffType ubound = dest.length(firstRank);
591    _bz_evaluateWithUnitStride(dest, iter, expr, ubound, T_update());
592    return;
593  }
594
595#ifdef BZ_ARRAY_EXPR_USE_COMMON_STRIDE
596  diffType commonStride = expr.suggestStride(firstRank);
597  if (iter.suggestStride(firstRank) > commonStride)
598    commonStride = iter.suggestStride(firstRank);
599  bool useCommonStride = iter.isStride(firstRank,commonStride)
600    && expr.isStride(firstRank,commonStride);
601#else
602  diffType commonStride = 1;
603  bool useCommonStride = false;
604#endif
605
606  if (useCommonStride) {
607    const diffType ubound = dest.length(firstRank) * commonStride;
608    _bz_evaluateWithCommonStride(dest, iter, expr, ubound, commonStride, T_update());
609    return;
610  }
611
612#ifdef BZ_DEBUG_TRAVERSE
613  BZ_DEBUG_MESSAGE("\tnot common stride");
614#endif
615
616  // not common stride
617  typedef typename T_dest::T_numtype T_numtype;
618  const T_numtype * last = iter.data() + dest.length(firstRank) 
619    * dest.stride(firstRank);
620     
621  while (iter.data() != last)
622    {
623      T_update::update(*const_cast<T_numtype*>(iter.data()), *expr);
624      iter.advance();
625      expr.advance();
626    }
627}
628
629
630/**
631   Perform a stack traversal of a rank >1 expression. A stack
632   traversal replaces the usual nested loops:
633     
634   for (int i=A.lbound(firstDim); i <= A.ubound(firstDim); ++i)
635     for (int j=A.lbound(secondDim); j <= A.ubound(secondDim); ++j)
636       for (int k=A.lbound(thirdDim); k <= A.ubound(thirdDim); ++k)
637         A(i,j,k) = 0;
638     
639   with a stack data structure.  The stack allows this single routine
640   to replace any number of nested loops.
641     
642   For each dimension (loop), these quantities are needed:
643   - a pointer to the first element encountered in the loop
644   - the stride associated with the dimension/loop
645   - a pointer to the last element encountered in the loop
646   
647   The basic idea is that entering each loop is a "push" onto the
648   stack, and exiting each loop is a "pop".  In practice, this
649   routine accesses the stack in a random-access way,
650   which confuses the picture a bit.  But conceptually, that's
651   what is going on.
652
653   ordering(0) gives the dimension associated with the smallest
654   stride (usually; the exceptions have to do with subarrays and
655   are uninteresting).  We call this dimension maxRank; it will
656   become the innermost "loop".
657     
658   Ordering the loops from ordering(N_rank-1) down to
659   ordering(0) ensures that the largest stride is associated
660   with the outermost loop, and the smallest stride with the
661   innermost.  This is critical for good performance on
662   cached machines.
663*/
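/* Concrete illustration (for a C-style, row-major rank-3 array): the
   third dimension has the smallest stride, so ordering(0) == 2, maxRank
   is thirdDim, and the k loop from the example above becomes the
   innermost loop, while firstDim, with the largest stride, becomes the
   outermost. */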
664
665template<int N>
666template<typename T_dest, typename T_expr, typename T_update>
667_bz_forceinline void
668_bz_evaluator<N>::
669evaluateWithStackTraversal(T_dest& dest, T_expr expr, T_update)
670 {
671#ifdef BZ_DEBUG_TRAVERSE
672   BZ_DEBUG_MESSAGE("_bz_evaluator<" << N << ">: Using stack traversal");
673#endif
674
675   typedef typename T_dest::T_numtype T_numtype;
676   const int N_rank = T_dest::rank();
677
678     const int maxRank = dest.ordering(0);
679     // const int secondLastRank = ordering(1);
680
681     // Create an iterator for the array receiving the result
682     typename T_dest::T_iterator iter(dest);
683
684     // Set the initial stack configuration by pushing the pointer
685     // to the first element of the array onto the stack N times.
686
687     int i;
688     for (i=1; i < N_rank; ++i)
689     {
690         iter.push(i);
691         expr.push(i);
692     }
693
694     // Load the strides associated with the innermost loop.
695     iter.loadStride(maxRank);
696     expr.loadStride(maxRank);
697
698     /*
699      * Is the stride in the innermost loop equal to 1?  If so,
700      * we might take advantage of this and generate more
701      * efficient code.
702      */
703     const bool useUnitStride = iter.isUnitStride()
704                          && expr.isUnitStride();
705
706    /*
707     * Do all array operands share a common stride in the innermost
708     * loop?  If so, we can generate more efficient code (but only
709     * if this optimization has been enabled).
710     */
711#ifdef BZ_ARRAY_EXPR_USE_COMMON_STRIDE
712    diffType commonStride = expr.suggestStride(maxRank);
713    if (iter.suggestStride(maxRank) > commonStride)
714        commonStride = iter.suggestStride(maxRank);
715    bool useCommonStride = iter.isStride(maxRank,commonStride)
716        && expr.isStride(maxRank,commonStride);
717
718#ifdef BZ_DEBUG_TRAVERSE
719    BZ_DEBUG_MESSAGE("BZ_ARRAY_EXPR_USE_COMMON_STRIDE" << endl
720        << "commonStride = " << commonStride << " useCommonStride = "
721        << useCommonStride);
722#endif
723
724#else
725    const diffType commonStride = 1;
726    const bool useCommonStride = false;
727#endif
728
729    /*
730     * The "last" array contains a pointer to the last element
731     * encountered in each "loop".
732     */
733    const T_numtype* last[T_dest::rank_];
734
735    // Set up the initial state of the "last" array
736    for (i=1; i < N_rank; ++i)
737        last[i] = iter.data() + dest.length(dest.ordering(i)) * dest.stride(dest.ordering(i));
738
739    diffType lastLength = dest.length(maxRank);
740    int firstNoncollapsedLoop = 1;
741
742#ifdef BZ_COLLAPSE_LOOPS
743
744    /*
745     * This bit of code handles collapsing loops.  When possible,
746     * the N nested loops are converted into a single loop (basically,
747     * the N-dimensional array is treated as a long vector).
748     * This is important for cases where the length of the innermost
749     * loop is very small, for example a 100x100x3 array.
750     * If this code can't collapse all the loops into a single loop,
751     * it will collapse as many loops as possible starting from the
752     * innermost and working out.
753     */
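    // Worked example: for a contiguous C-ordered 100x100x3 array (the
    // case mentioned above) every pair of adjacent loops can be combined,
    // so lastLength grows from 3 to 300 to 30000 and the whole array is
    // swept by a single inner loop.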
754
755    // Collapse loops when possible
756    for (i=1; i < N_rank; ++i)
757    {
758        // Figure out which pair of loops we are considering combining.
759        int outerLoopRank = iter.ordering(i);
760        int innerLoopRank = iter.ordering(i-1);
761
762        /*
763         * The canCollapse() routines look at the strides and extents
764         * of the loops, and determine if they can be combined into
765         * one loop.
766         */
767
768        if (iter.canCollapse(outerLoopRank,innerLoopRank) 
769          && expr.canCollapse(outerLoopRank,innerLoopRank))
770        {
771#ifdef BZ_DEBUG_TRAVERSE
772            cout << "Collapsing " << outerLoopRank << " and " 
773                 << innerLoopRank << endl;
774#endif
775            lastLength *= dest.length(outerLoopRank);
776            firstNoncollapsedLoop = i+1;
777        }
778        else 
779            break;
780    }
781
782#endif // BZ_COLLAPSE_LOOPS
783
784    /*
785     * Now we actually perform the loops.  This while loop contains
786     * two parts: first, the innermost loop is performed.  Then we
787     * exit the loop, and pop our way down the stack until we find
788     * a loop that isn't completed.  We then restart the inner loops
789     * and push them onto the stack.
790     */
791
792    while (true) {
793
794        /*
795         * This bit of code handles the innermost loop.  It just uses
796         * the separate evaluation functions depending on the
797         * stride. */
798
799      diffType ubound = lastLength * commonStride;
800     
801      if (useUnitStride || useCommonStride) {
802        if(useUnitStride)
803          _bz_evaluateWithUnitStride(dest, iter, expr, ubound, T_update());
804        else 
805          _bz_evaluateWithCommonStride(dest, iter, expr, ubound, commonStride,
806                                       T_update());
807
808        /*
809         * Tidy up for the fact that we haven't actually been
810         * incrementing the iterators in the innermost loop, by
811         * faking it afterward.
812         */
813        iter.advance(lastLength * commonStride);
814        expr.advance(lastLength * commonStride);
815      }
816      else {
817            /*
818             * We don't have a unit stride or common stride in the innermost
819             * loop.  This is going to hurt performance.  Luckily 95% of
820             * the time, we hit the cases above.
821             */
822            T_numtype * restrict end = const_cast<T_numtype*>(iter.data())
823                + lastLength * dest.stride(maxRank);
824
825            while (iter.data() != end)
826            {
827              T_update::update(*const_cast<T_numtype*>(iter.data()), *expr);
828                iter.advance();
829                expr.advance();
830            }
831        }
832
833
834        /*
835         * We just finished the innermost loop.  Now we pop our way down
836         * the stack, until we hit a loop that hasn't completed yet.
837         */ 
838        int j = firstNoncollapsedLoop;
839        for (; j < N_rank; ++j)
840        {
841            // Get the next loop
842            int r = dest.ordering(j);
843
844            // Pop-- this restores the data pointers to the first element
845            // encountered in the loop.
846            iter.pop(j);
847            expr.pop(j);
848
849            // Load the stride associated with this loop, and increment
850            // once.
851            iter.loadStride(r);
852            expr.loadStride(r);
853            iter.advance();
854            expr.advance();
855
856            // If we aren't at the end of this loop, then stop popping.
857            if (iter.data() != last[j])
858                break;
859        }
860
861        // Are we completely done?
862        if (j == N_rank)
863            break;
864
865        // No, so push all the inner loops back onto the stack.
866        for (; j >= firstNoncollapsedLoop; --j)
867        {
868            int r2 = dest.ordering(j-1);
869            iter.push(j);
870            expr.push(j);
871            last[j-1] = iter.data() + dest.length(r2) * dest.stride(r2);
872        }
873
874        // Load the stride for the innermost loop again.
875        iter.loadStride(maxRank);
876        expr.loadStride(maxRank);
877    }
878}
879
880
881template<typename T_dest, typename T_expr, typename T_update>
882_bz_forceinline void
883_bz_evaluator<1>::
884evaluateWithIndexTraversal(T_dest& dest, T_expr expr, T_update)
885{
886  typedef typename T_dest::T_numtype T_numtype;
887
888  TinyVector<int,T_dest::rank_> index;
889
890  if (dest.stride(firstRank) == 1) {
891    T_numtype * restrict iter = dest.data();
892    int last = dest.ubound(firstRank);
893   
894    for (index[0] = dest.lbound(firstRank); index[0] <= last;
895         ++index[0]) {
896      T_update::update(*iter++, expr(index));
897    }
898  }
899  else {
900    typename T_dest::T_iterator iter(dest);
901    iter.loadStride(0);
902    int last = iter.ubound(firstRank);
903   
904    for (index[0] = iter.lbound(firstRank); index[0] <= last;
905         ++index[0]) {
906      T_update::update(*const_cast<T_numtype*>(iter.data()), 
907                       expr(index));
908      iter.advance();
909    }
910  }
911}
912
913  template<int N>
914template<typename T_dest, typename T_expr, typename T_update>
915_bz_forceinline void
916_bz_evaluator<N>::
917evaluateWithIndexTraversal(T_dest& dest, T_expr expr, T_update)
918{
919  typedef typename T_dest::T_numtype T_numtype;
920  const int N_rank = T_dest::rank();
921
922    // Do a stack-type traversal for the destination array and use
923    // index traversal for the source expression
924   
925    const int maxRank = dest.ordering(0);
926
927#ifdef BZ_DEBUG_TRAVERSE
928    const int secondLastRank = dest.ordering(1);
929    cout << "Index traversal: N_rank = " << N_rank << endl;
930    cout << "maxRank = " << maxRank << " secondLastRank = " << secondLastRank
931         << endl;
932    cout.flush();
933#endif
934
935    typename T_dest::T_iterator iter(dest);
936    for (int i=1; i < N_rank; ++i)
937        iter.push(iter.ordering(i));
938
939    iter.loadStride(maxRank);
940
941    TinyVector<int,T_dest::rank_> index, last;
942
943    index = dest.base();
944
945    for (int i=0; i < N_rank; ++i)
946      last(i) = dest.base(i) + dest.length(i);
947
948    // int lastLength = length(maxRank);
949
950    while (true) {
951
952        for (index[maxRank] = dest.base(maxRank); 
953             index[maxRank] < last[maxRank]; 
954             ++index[maxRank])
955        {
956#ifdef BZ_DEBUG_TRAVERSE
957#if 0
958    cout << "(" << index[0] << "," << index[1] << ") " << endl;
959    cout.flush();
960#endif
961#endif
962
963            T_update::update(*const_cast<T_numtype*>(iter.data()), expr(index));
964            iter.advance();
965        }
966
967        int j = 1;
968        for (; j < N_rank; ++j)
969        {
970            iter.pop(dest.ordering(j));
971            iter.loadStride(dest.ordering(j));
972            iter.advance();
973
974            index[dest.ordering(j-1)] = dest.base(dest.ordering(j-1));
975            ++index[dest.ordering(j)];
976            if (index[dest.ordering(j)] != last[dest.ordering(j)])
977                break;
978        }
979
980        if (j == N_rank)
981            break;
982
983        for (; j > 0; --j)
984        {
985            iter.push(dest.ordering(j));
986        }
987        iter.loadStride(maxRank);
988    }
989}
990
991// Fast traversals require <set> from the ISO/ANSI C++ standard library
992
993#ifdef BZ_HAVE_STD
994#ifdef BZ_ARRAY_SPACE_FILLING_TRAVERSAL
995
996template<typename T_dest, typename T_expr, typename T_update>
997_bz_forceinline void
998_bz_evaluateWithFastTraversal(T_dest& dest,     
999                              const TraversalOrder<T_dest::rank_ - 1>& order, 
1000                              T_expr expr, T_update)
1001{
1002  typedef typename T_dest::T_numtype T_numtype;
1003  const int N_rank = T_dest::rank();
1004
1005    const int maxRank = dest.ordering(0);
1006
1007#ifdef BZ_DEBUG_TRAVERSE
1008    const int secondLastRank = dest.ordering(1);
1009    cerr << "maxRank = " << maxRank << " secondLastRank = " << secondLastRank
1010         << endl;
1011#endif
1012
1013    typename T_dest::T_iterator iter(dest);
1014    iter.push(0);
1015    expr.push(0);
1016
1017    bool useUnitStride = iter.isUnitStride(maxRank) 
1018                          && expr.isUnitStride(maxRank);
1019
1020#ifdef BZ_ARRAY_EXPR_USE_COMMON_STRIDE
1021    diffType commonStride = expr.suggestStride(maxRank);
1022    if (iter.suggestStride(maxRank) > commonStride)
1023        commonStride = iter.suggestStride(maxRank);
1024    bool useCommonStride = iter.isStride(maxRank,commonStride)
1025        && expr.isStride(maxRank,commonStride);
1026#else
1027    diffType commonStride = 1;
1028    bool useCommonStride = false;
1029#endif
1030
1031    int lastLength = dest.length(maxRank);
1032
1033    for (int i=0; i < order.length(); ++i)
1034    {
1035        iter.pop(0);
1036        expr.pop(0);
1037
1038#ifdef BZ_DEBUG_TRAVERSE
1039    cerr << "Traversing: " << order[i] << endl;
1040#endif
1041        // Position the iterator at the start of the next column       
1042        for (int j=1; j < N_rank; ++j)
1043        {
1044            iter.loadStride(dest.ordering(j));
1045            expr.loadStride(dest.ordering(j));
1046
1047            int offset = order[i][j-1];
1048            iter.advance(offset);
1049            expr.advance(offset);
1050        }
1051
1052        iter.loadStride(maxRank);
1053        expr.loadStride(maxRank);
1054
1055        // Evaluate the expression along the column
1056
1057        if ((useUnitStride) || (useCommonStride))
1058        {
1059#ifdef BZ_USE_FAST_READ_ARRAY_EXPR
1060            diffType ubound = lastLength * commonStride;
1061            T_numtype* restrict data = const_cast<T_numtype*>(iter.data());
1062
1063            if (commonStride == 1)
1064            {           
1065 #ifndef BZ_ARRAY_FAST_TRAVERSAL_UNROLL
1066                for (diffType i=0; i < ubound; ++i)
1067                    T_update::update(*data++, expr.fastRead(i));
1068 #else
1069                diffType n1 = ubound & 3;
1070                diffType i=0;
1071                for (; i < n1; ++i)
1072                    T_update::update(*data++, expr.fastRead(i));
1073
1074                for (; i < ubound; i += 4)
1075                {
1076                    T_update::update(*data++, expr.fastRead(i));
1077                    T_update::update(*data++, expr.fastRead(i+1));
1078                    T_update::update(*data++, expr.fastRead(i+2));
1079                    T_update::update(*data++, expr.fastRead(i+3));
1080                }
1081 #endif  // BZ_ARRAY_FAST_TRAVERSAL_UNROLL
1082            }
1083 #ifdef BZ_ARRAY_EXPR_USE_COMMON_STRIDE
1084            else {
1085                for (diffType i=0; i < ubound; i += commonStride)
1086                    T_update::update(data[i], expr.fastRead(i));
1087            }
1088 #endif // BZ_ARRAY_EXPR_USE_COMMON_STRIDE
1089
1090            iter.advance(lastLength * commonStride);
1091            expr.advance(lastLength * commonStride);
1092#else   // ! BZ_USE_FAST_READ_ARRAY_EXPR
1093            T_numtype* restrict last = const_cast<T_numtype*>(iter.data()) 
1094                + lastLength * commonStride;
1095
1096            while (iter.data() != last)
1097            {
1098                T_update::update(*const_cast<T_numtype*>(iter.data()), *expr);
1099                iter.advance(commonStride);
1100                expr.advance(commonStride);
1101            }
1102#endif  // BZ_USE_FAST_READ_ARRAY_EXPR
1103
1104        }
1105        else {
1106            // No common stride
1107
1108            T_numtype* restrict last = const_cast<T_numtype*>(iter.data()) 
1109                + lastLength * dest.stride(maxRank);
1110
1111            while (iter.data() != last)
1112            {
1113                T_update::update(*const_cast<T_numtype*>(iter.data()), *expr);
1114                iter.advance();
1115                expr.advance();
1116            }
1117        }
1118    }
1119}
1120
1121#endif // BZ_ARRAY_SPACE_FILLING_TRAVERSAL
1122#endif // BZ_HAVE_STD
1123
1124#ifdef BZ_ARRAY_2D_NEW_STENCIL_TILING
1125
1126#ifdef BZ_ARRAY_2D_STENCIL_TILING
1127
1128// What is the difference between the new and the old 2D stencil tiling?
1129template<typename T_dest, typename T_expr, typename T_update>
1130_bz_forceinline void
1131_bz_evaluateWithTiled2DTraversal(T_dest& dest, T_expr expr, T_update)
1132{
1133  typedef typename T_dest::T_numtype T_numtype;
1134  const int N_rank = T_dest::rank();
1135
1136  typename T_dest::T_iterator iter(dest);
1137
1138    const int minorRank = iter.ordering(0);
1139    const int majorRank = iter.ordering(1);
1140
1141    iter.push(0);
1142    expr.push(0);
1143
1144#ifdef BZ_2D_STENCIL_DEBUG
1145    int count = 0;
1146#endif
1147
1148    bool useUnitStride = iter.isUnitStride(minorRank)
1149                          && expr.isUnitStride(minorRank);
1150
1151#ifdef BZ_ARRAY_EXPR_USE_COMMON_STRIDE
1152    diffType commonStride = expr.suggestStride(minorRank);
1153    if (iter.suggestStride(minorRank) > commonStride)
1154        commonStride = iter.suggestStride(minorRank);
1155    bool useCommonStride = iter.isStride(minorRank,commonStride)
1156        && expr.isStride(minorRank,commonStride);
1157#else
1158    diffType commonStride = 1;
1159    bool useCommonStride = false;
1160#endif
1161
1162    // Determine if a common major stride exists
1163    diffType commonMajorStride = expr.suggestStride(majorRank);
1164    if (iter.suggestStride(majorRank) > commonMajorStride)
1165        commonMajorStride = iter.suggestStride(majorRank);
1166    bool haveCommonMajorStride = iter.isStride(majorRank,commonMajorStride)
1167        && expr.isStride(majorRank,commonMajorStride);
1168
1169
1170    int maxi = dest.length(majorRank);
1171    int maxj = dest.length(minorRank);
1172
1173    const int tileHeight = 16, tileWidth = 3;
1174
1175    int bi, bj;
1176    for (bi=0; bi < maxi; bi += tileHeight)
1177    {
1178        int ni = bi + tileHeight;
1179        if (ni > maxi)
1180            ni = maxi;
1181
1182        // Move back to the beginning of the array
1183        iter.pop(0);
1184        expr.pop(0);
1185
1186        // Move to the start of this tile row
1187        iter.loadStride(majorRank);
1188        iter.advance(bi);
1189        expr.loadStride(majorRank);
1190        expr.advance(bi);
1191
1192        // Save this position
1193        iter.push(1);
1194        expr.push(1);
1195
1196        for (bj=0; bj < maxj; bj += tileWidth)
1197        {
1198            // Move to the beginning of the tile row
1199            iter.pop(1);
1200            expr.pop(1);
1201
1202            // Move to the top of the current tile (bi,bj)
1203            iter.loadStride(minorRank);
1204            iter.advance(bj);
1205            expr.loadStride(minorRank);
1206            expr.advance(bj);
1207
1208            if (bj + tileWidth <= maxj)
1209            {
1210                // Strip mining
1211
1212                if ((useUnitStride) && (haveCommonMajorStride))
1213                {
1214                    diffType offset = 0;
1215                    T_numtype* restrict data = const_cast<T_numtype*>
1216                        (iter.data());
1217
1218                    for (int i=bi; i < ni; ++i)
1219                    {
1220                        _bz_typename T_expr::T_numtype tmp1, tmp2, tmp3;
1221
1222                        // Common subexpression elimination -- compilers
1223                        // won't necessarily do this on their own.
1224                        diffType t1 = offset+1;
1225                        diffType t2 = offset+2;
1226
1227                        tmp1 = expr.fastRead(offset);
1228                        tmp2 = expr.fastRead(t1);
1229                        tmp3 = expr.fastRead(t2);
1230
1231                        T_update::update(data[0], tmp1);
1232                        T_update::update(data[1], tmp2);
1233                        T_update::update(data[2], tmp3);
1234
1235                        offset += commonMajorStride;
1236                        data += commonMajorStride;
1237
1238#ifdef BZ_2D_STENCIL_DEBUG
1239    count += 3;
1240#endif
1241                    }
1242                }
1243                else {
1244
1245                    for (int i=bi; i < ni; ++i)
1246                    {
1247                        iter.loadStride(minorRank);
1248                        expr.loadStride(minorRank);
1249
1250                        // Loop through current row elements
1251                        T_update::update(*const_cast<T_numtype*>(iter.data()),
1252                            *expr);
1253                        iter.advance();
1254                        expr.advance();
1255
1256                        T_update::update(*const_cast<T_numtype*>(iter.data()),
1257                            *expr);
1258                        iter.advance();
1259                        expr.advance();
1260
1261                        T_update::update(*const_cast<T_numtype*>(iter.data()),
1262                            *expr);
1263                        iter.advance(-2);
1264                        expr.advance(-2);
1265
1266                        iter.loadStride(majorRank);
1267                        expr.loadStride(majorRank);
1268                        iter.advance();
1269                        expr.advance();
1270
1271#ifdef BZ_2D_STENCIL_DEBUG
1272    count += 3;
1273#endif
1274
1275                    }
1276                }
1277            }
1278            else {
1279
1280                // This code handles partial tiles at the bottom of the
1281                // array.
1282
1283                for (int j=bj; j < maxj; ++j)
1284                {
1285                    iter.loadStride(majorRank);
1286                    expr.loadStride(majorRank);
1287
1288                    for (int i=bi; i < ni; ++i)
1289                    {
1290                        T_update::update(*const_cast<T_numtype*>(iter.data()),
1291                            *expr);
1292                        iter.advance();
1293                        expr.advance();
1294#ifdef BZ_2D_STENCIL_DEBUG
1295    ++count;
1296#endif
1297
1298                    }
1299
1300                    // Move back to the top of this column
1301                    iter.advance(bi-ni);
1302                    expr.advance(bi-ni);
1303
1304                    // Move over to the next column
1305                    iter.loadStride(minorRank);
1306                    expr.loadStride(minorRank);
1307
1308                    iter.advance();
1309                    expr.advance();
1310                }
1311            }
1312        }
1313    }
1314
1315#ifdef BZ_2D_STENCIL_DEBUG
1316    cout << "BZ_2D_STENCIL_DEBUG: count = " << count << endl;
1317#endif
1318}
1319
1320#endif // BZ_ARRAY_2D_STENCIL_TILING
1321#endif // BZ_ARRAY_2D_NEW_STENCIL_TILING
1322
1323
1324
1325#ifndef BZ_ARRAY_2D_NEW_STENCIL_TILING
1326
1327#ifdef BZ_ARRAY_2D_STENCIL_TILING
1328
1329// What is the difference between the new and the old 2D stencil tiling?
1330template<typename T_dest, typename T_expr, typename T_update>
1331_bz_forceinline void
1332_bz_evaluateWithTiled2DTraversal(T_dest& dest, T_expr expr, T_update)
1333{
1334  typedef typename T_dest::T_numtype T_numtype;
1335
1336    typename T_dest::T_iterator iter(dest);
1337
1338    const int minorRank = iter.ordering(0);
1339    const int majorRank = iter.ordering(1);
1340
1341    const int blockSize = 16;
1342   
1343    iter.push(0);
1344    expr.push(0);
1345
1346    bool useUnitStride = iter.isUnitStride(minorRank)
1347                          && expr.isUnitStride(minorRank);
1348
1349#ifdef BZ_ARRAY_EXPR_USE_COMMON_STRIDE
1350    diffType commonStride = expr.suggestStride(minorRank);
1351    if (iter.suggestStride(minorRank) > commonStride)
1352        commonStride = iter.suggestStride(minorRank);
1353    bool useCommonStride = iter.isStride(minorRank,commonStride)
1354        && expr.isStride(minorRank,commonStride);
1355#else
1356    diffType commonStride = 1;
1357    bool useCommonStride = false;
1358#endif
1359
1360    int maxi = dest.length(majorRank);
1361    int maxj = dest.length(minorRank);
1362
1363    int bi, bj;
1364    for (bi=0; bi < maxi; bi += blockSize)
1365    {
1366        int ni = bi + blockSize;
1367        if (ni > maxi)
1368            ni = maxi;
1369
1370        for (bj=0; bj < maxj; bj += blockSize)
1371        {
1372            int nj = bj + blockSize;
1373            if (nj > maxj)
1374                nj = maxj;
1375
1376            // Move to the beginning of the array
1377            iter.pop(0);
1378            expr.pop(0);
1379
1380            // Move to the beginning of the tile (bi,bj)
1381            iter.loadStride(majorRank);
1382            iter.advance(bi);
1383            iter.loadStride(minorRank);
1384            iter.advance(bj);
1385
1386            expr.loadStride(majorRank);
1387            expr.advance(bi);
1388            expr.loadStride(minorRank);
1389            expr.advance(bj);
1390
1391            // Loop through tile rows
1392            for (int i=bi; i < ni; ++i)
1393            {
1394                // Save the beginning of this tile row
1395                iter.push(1);
1396                expr.push(1);
1397
1398                // Load the minor stride
1399                iter.loadStride(minorRank);
1400                expr.loadStride(minorRank);
1401
1402                if (useUnitStride)
1403                {
1404                    T_numtype* restrict data = const_cast<T_numtype*>
1405                        (iter.data());
1406
1407                    int ubound = (nj-bj);
1408                    for (int j=0; j < ubound; ++j)
1409                        T_update::update(*data++, expr.fastRead(j));
1410                }
1411#ifdef BZ_ARRAY_EXPR_USE_COMMON_STRIDE
1412                else if (useCommonStride)
1413                {
1414                    const diffType ubound = (nj-bj) * commonStride;
1415                    T_numtype* restrict data = const_cast<T_numtype*>
1416                        (iter.data());
1417
1418                    for (diffType j=0; j < ubound; j += commonStride)
1419                        T_update::update(data[j], expr.fastRead(j));
1420                }
1421#endif
1422                else {
1423                    for (int j=bj; j < nj; ++j)
1424                    {
1425                        // Loop through current row elements
1426                        T_update::update(*const_cast<T_numtype*>(iter.data()), 
1427                                         *expr);
1428                        iter.advance();
1429                        expr.advance();
1430                    }
1431                }
1432
1433                // Move back to the beginning of the tile row, then
1434                // move to the next row
1435                iter.pop(1);
1436                iter.loadStride(majorRank);
1437                iter.advance(1);
1438
1439                expr.pop(1);
1440                expr.loadStride(majorRank);
1441                expr.advance(1);
1442            }
1443        }
1444    }
1445}
1446#endif // BZ_ARRAY_2D_STENCIL_TILING
1447#endif // BZ_ARRAY_2D_NEW_STENCIL_TILING
1448
1449BZ_NAMESPACE_END
1450
1451#endif // BZ_GLOBEVAL_CC
1452