/*
 * kmp_collapse.cpp -- loop collapse feature
 */

//===----------------------------------------------------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//

#include "kmp.h"
#include "kmp_error.h"
#include "kmp_i18n.h"
#include "kmp_itt.h"
#include "kmp_stats.h"
#include "kmp_str.h"
#include "kmp_collapse.h"

#if OMPT_SUPPORT
#include "ompt-specific.h"
#endif

// OMPTODO: different style of comments (see kmp_sched)
// OMPTODO: OMPT/OMPD

// avoid inadvertently using a library-based abs
template <typename T> T __kmp_abs(const T val) {
  return (val < 0) ? -val : val;
}
kmp_uint32 __kmp_abs(const kmp_uint32 val) { return val; }
kmp_uint64 __kmp_abs(const kmp_uint64 val) { return val; }

//----------------------------------------------------------------------------
// Common functions for working with rectangular and non-rectangular loops
//----------------------------------------------------------------------------

template <typename T> int __kmp_sign(T val) {
  return (T(0) < val) - (val < T(0));
}

//----------Loop canonicalization---------------------------------------------

// For a loop nest of any shape:
// convert != to < or >;
// switch from using < or > to <= or >=.
// The "bounds" array has to be allocated per thread.
// All other internal functions will work only with canonicalized loops.
template <typename T>
void kmp_canonicalize_one_loop_XX(
    ident_t *loc,
    /*in/out*/ bounds_infoXX_template<T> *bounds) {

  if (__kmp_env_consistency_check) {
    if (bounds->step == 0) {
      __kmp_error_construct(kmp_i18n_msg_CnsLoopIncrZeroProhibited, ct_pdo,
                            loc);
    }
  }

  if (bounds->comparison == comparison_t::comp_not_eq) {
    // We can convert this to < or >, depending on the sign of the step:
    if (bounds->step > 0) {
      bounds->comparison = comparison_t::comp_less;
    } else {
      bounds->comparison = comparison_t::comp_greater;
    }
  }

  if (bounds->comparison == comparison_t::comp_less) {
    // Note: ub0 can be unsigned. It should be OK to hit overflow here,
    // because ub0 + ub1*j should still be positive (otherwise the loop was
    // not well formed):
    bounds->ub0 -= 1;
    bounds->comparison = comparison_t::comp_less_or_eq;
  } else if (bounds->comparison == comparison_t::comp_greater) {
    bounds->ub0 += 1;
    bounds->comparison = comparison_t::comp_greater_or_eq;
  }
}
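
// Illustrative example (not part of the runtime): the C loop
//   for (int i = 0; i != 10; i += 2)
// arrives here with comparison_t::comp_not_eq, lb0 = 0, ub0 = 10, step = 2.
// Since step > 0 the != is first turned into <, and then ub0 is decremented
// so that the canonical form is "i <= 9". The iteration set {0, 2, 4, 6, 8}
// is unchanged by canonicalization.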

// Canonicalize the loop nest. original_bounds_nest is an array of length n.
void kmp_canonicalize_loop_nest(ident_t *loc,
                                /*in/out*/ bounds_info_t *original_bounds_nest,
                                kmp_index_t n) {

  for (kmp_index_t ind = 0; ind < n; ++ind) {
    auto bounds = &(original_bounds_nest[ind]);

    switch (bounds->loop_type) {
    case loop_type_t::loop_type_int32:
      kmp_canonicalize_one_loop_XX<kmp_int32>(
          loc,
          /*in/out*/ (bounds_infoXX_template<kmp_int32> *)(bounds));
      break;
    case loop_type_t::loop_type_uint32:
      kmp_canonicalize_one_loop_XX<kmp_uint32>(
          loc,
          /*in/out*/ (bounds_infoXX_template<kmp_uint32> *)(bounds));
      break;
    case loop_type_t::loop_type_int64:
      kmp_canonicalize_one_loop_XX<kmp_int64>(
          loc,
          /*in/out*/ (bounds_infoXX_template<kmp_int64> *)(bounds));
      break;
    case loop_type_t::loop_type_uint64:
      kmp_canonicalize_one_loop_XX<kmp_uint64>(
          loc,
          /*in/out*/ (bounds_infoXX_template<kmp_uint64> *)(bounds));
      break;
    default:
      KMP_ASSERT(false);
    }
  }
}

//----------Calculating trip count on one level-------------------------------

// Calculate trip count on this loop level.
// We do this either for a rectangular loop nest,
// or after an adjustment bringing the loops to a parallelepiped shape.
// This number should not depend on the value of the outer IV
// even if the formula has lb1 and ub1.
// Note: for non-rectangular loops don't use the span for this, it's too big.

template <typename T>
kmp_loop_nest_iv_t kmp_calculate_trip_count_XX(
    /*in/out*/ bounds_infoXX_template<T> *bounds) {

  if (bounds->comparison == comparison_t::comp_less_or_eq) {
    if (bounds->ub0 < bounds->lb0) {
      // Note: after this we don't need to calculate inner loops,
      // but that should be an edge case:
      bounds->trip_count = 0;
    } else {
      // ub - lb may exceed the signed type range; we need to cast to
      // kmp_loop_nest_iv_t anyway
      bounds->trip_count =
          static_cast<kmp_loop_nest_iv_t>(bounds->ub0 - bounds->lb0) /
              __kmp_abs(bounds->step) +
          1;
    }
  } else if (bounds->comparison == comparison_t::comp_greater_or_eq) {
    if (bounds->lb0 < bounds->ub0) {
      // Note: after this we don't need to calculate inner loops,
      // but that should be an edge case:
      bounds->trip_count = 0;
    } else {
      // lb - ub may exceed the signed type range; we need to cast to
      // kmp_loop_nest_iv_t anyway
      bounds->trip_count =
          static_cast<kmp_loop_nest_iv_t>(bounds->lb0 - bounds->ub0) /
              __kmp_abs(bounds->step) +
          1;
    }
  } else {
    KMP_ASSERT(false);
  }
  return bounds->trip_count;
}
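
// Worked example (illustrative): for the canonical loop "i <= 9" with
// lb0 = 0, ub0 = 9, step = 2 the formula gives
//   trip_count = (9 - 0) / 2 + 1 = 5,
// matching the iteration set {0, 2, 4, 6, 8}. The cast to the unsigned
// kmp_loop_nest_iv_t makes ub0 - lb0 safe even when the difference does not
// fit in the signed loop type.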

// Calculate trip count on this loop level.
kmp_loop_nest_iv_t kmp_calculate_trip_count(/*in/out*/ bounds_info_t *bounds) {

  kmp_loop_nest_iv_t trip_count = 0;

  switch (bounds->loop_type) {
  case loop_type_t::loop_type_int32:
    trip_count = kmp_calculate_trip_count_XX<kmp_int32>(
        /*in/out*/ (bounds_infoXX_template<kmp_int32> *)(bounds));
    break;
  case loop_type_t::loop_type_uint32:
    trip_count = kmp_calculate_trip_count_XX<kmp_uint32>(
        /*in/out*/ (bounds_infoXX_template<kmp_uint32> *)(bounds));
    break;
  case loop_type_t::loop_type_int64:
    trip_count = kmp_calculate_trip_count_XX<kmp_int64>(
        /*in/out*/ (bounds_infoXX_template<kmp_int64> *)(bounds));
    break;
  case loop_type_t::loop_type_uint64:
    trip_count = kmp_calculate_trip_count_XX<kmp_uint64>(
        /*in/out*/ (bounds_infoXX_template<kmp_uint64> *)(bounds));
    break;
  default:
    KMP_ASSERT(false);
  }

  return trip_count;
}

//----------Trim original iv according to its type----------------------------

// Trim the original iv according to its type.
// Return a kmp_uint64 value which can be used in all internal calculations
// and can be statically cast back to the original type in user code.
kmp_uint64 kmp_fix_iv(loop_type_t loop_iv_type, kmp_uint64 original_iv) {
  kmp_uint64 res = 0;

  switch (loop_iv_type) {
  case loop_type_t::loop_type_int8:
    res = static_cast<kmp_uint64>(static_cast<kmp_int8>(original_iv));
    break;
  case loop_type_t::loop_type_uint8:
    res = static_cast<kmp_uint64>(static_cast<kmp_uint8>(original_iv));
    break;
  case loop_type_t::loop_type_int16:
    res = static_cast<kmp_uint64>(static_cast<kmp_int16>(original_iv));
    break;
  case loop_type_t::loop_type_uint16:
    res = static_cast<kmp_uint64>(static_cast<kmp_uint16>(original_iv));
    break;
  case loop_type_t::loop_type_int32:
    res = static_cast<kmp_uint64>(static_cast<kmp_int32>(original_iv));
    break;
  case loop_type_t::loop_type_uint32:
    res = static_cast<kmp_uint64>(static_cast<kmp_uint32>(original_iv));
    break;
  case loop_type_t::loop_type_int64:
    res = static_cast<kmp_uint64>(static_cast<kmp_int64>(original_iv));
    break;
  case loop_type_t::loop_type_uint64:
    res = static_cast<kmp_uint64>(original_iv);
    break;
  default:
    KMP_ASSERT(false);
  }

  return res;
}
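
// Example (illustrative): for loop_type_int8 an internal value that wrapped
// past the 8-bit range, e.g. temp == 0x17F, is truncated to kmp_int8
// (0x7F == 127) and then sign-extended back to kmp_uint64, so the stored IV
// is exactly what an 8-bit counter would have held, and user code can
// recover it with static_cast<kmp_int8>. For unsigned types the truncation
// simply drops the high bits.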

//----------Compare two IVs (remember they have a type)-----------------------

bool kmp_ivs_eq(loop_type_t loop_iv_type, kmp_uint64 original_iv1,
                kmp_uint64 original_iv2) {
  bool res = false;

  switch (loop_iv_type) {
  case loop_type_t::loop_type_int8:
    res = static_cast<kmp_int8>(original_iv1) ==
          static_cast<kmp_int8>(original_iv2);
    break;
  case loop_type_t::loop_type_uint8:
    res = static_cast<kmp_uint8>(original_iv1) ==
          static_cast<kmp_uint8>(original_iv2);
    break;
  case loop_type_t::loop_type_int16:
    res = static_cast<kmp_int16>(original_iv1) ==
          static_cast<kmp_int16>(original_iv2);
    break;
  case loop_type_t::loop_type_uint16:
    res = static_cast<kmp_uint16>(original_iv1) ==
          static_cast<kmp_uint16>(original_iv2);
    break;
  case loop_type_t::loop_type_int32:
    res = static_cast<kmp_int32>(original_iv1) ==
          static_cast<kmp_int32>(original_iv2);
    break;
  case loop_type_t::loop_type_uint32:
    res = static_cast<kmp_uint32>(original_iv1) ==
          static_cast<kmp_uint32>(original_iv2);
    break;
  case loop_type_t::loop_type_int64:
    res = static_cast<kmp_int64>(original_iv1) ==
          static_cast<kmp_int64>(original_iv2);
    break;
  case loop_type_t::loop_type_uint64:
    res = static_cast<kmp_uint64>(original_iv1) ==
          static_cast<kmp_uint64>(original_iv2);
    break;
  default:
    KMP_ASSERT(false);
  }

  return res;
}

//----------Calculate original iv on one level--------------------------------

// Return true if the point fits into the upper bounds on this level,
// false otherwise.
template <typename T>
bool kmp_iv_is_in_upper_bound_XX(const bounds_infoXX_template<T> *bounds,
                                 const kmp_point_t original_ivs,
                                 kmp_index_t ind) {

  T iv = static_cast<T>(original_ivs[ind]);
  T outer_iv = static_cast<T>(original_ivs[bounds->outer_iv]);

  if (((bounds->comparison == comparison_t::comp_less_or_eq) &&
       (iv > (bounds->ub0 + bounds->ub1 * outer_iv))) ||
      ((bounds->comparison == comparison_t::comp_greater_or_eq) &&
       (iv < (bounds->ub0 + bounds->ub1 * outer_iv)))) {
    // The calculated point is outside of the loop upper boundary:
    return false;
  }

  return true;
}

// Calculate one iv corresponding to iteration on the level ind.
// Return true if it fits into the lower-upper bounds on this level
// (if not, we need to re-calculate).
template <typename T>
bool kmp_calc_one_iv_XX(const bounds_infoXX_template<T> *bounds,
                        /*in/out*/ kmp_point_t original_ivs,
                        const kmp_iterations_t iterations, kmp_index_t ind,
                        bool start_with_lower_bound, bool checkBounds) {

  kmp_uint64 temp = 0;
  T outer_iv = static_cast<T>(original_ivs[bounds->outer_iv]);

  if (start_with_lower_bound) {
    // We moved to the next iteration on one of the outer loops, so we should
    // start with the lower bound here:
    temp = bounds->lb0 + bounds->lb1 * outer_iv;
  } else {
    auto iteration = iterations[ind];
    temp = bounds->lb0 + bounds->lb1 * outer_iv + iteration * bounds->step;
  }

  // Now trim the original iv according to its type:
  original_ivs[ind] = kmp_fix_iv(bounds->loop_iv_type, temp);

  if (checkBounds) {
    return kmp_iv_is_in_upper_bound_XX(bounds, original_ivs, ind);
  } else {
    return true;
  }
}
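
// Illustrative sketch (hypothetical loop shape, not from the runtime): for a
// non-rectangular level such as
//   for (j = i; j <= 2 * i + 3; ++j)   // lb0 = 0, lb1 = 1, step = 1
// with outer_iv i == 4 and iteration == 2 the computation above yields
//   temp = lb0 + lb1 * outer_iv + iteration * step = 0 + 4 + 2 = 6,
// which is then trimmed to the IV type and, if requested, checked against
// the upper bound ub0 + ub1 * outer_iv = 3 + 2 * 4 = 11.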

bool kmp_calc_one_iv(const bounds_info_t *bounds,
                     /*in/out*/ kmp_point_t original_ivs,
                     const kmp_iterations_t iterations, kmp_index_t ind,
                     bool start_with_lower_bound, bool checkBounds) {

  switch (bounds->loop_type) {
  case loop_type_t::loop_type_int32:
    return kmp_calc_one_iv_XX<kmp_int32>(
        (bounds_infoXX_template<kmp_int32> *)(bounds),
        /*in/out*/ original_ivs, iterations, ind, start_with_lower_bound,
        checkBounds);
    break;
  case loop_type_t::loop_type_uint32:
    return kmp_calc_one_iv_XX<kmp_uint32>(
        (bounds_infoXX_template<kmp_uint32> *)(bounds),
        /*in/out*/ original_ivs, iterations, ind, start_with_lower_bound,
        checkBounds);
    break;
  case loop_type_t::loop_type_int64:
    return kmp_calc_one_iv_XX<kmp_int64>(
        (bounds_infoXX_template<kmp_int64> *)(bounds),
        /*in/out*/ original_ivs, iterations, ind, start_with_lower_bound,
        checkBounds);
    break;
  case loop_type_t::loop_type_uint64:
    return kmp_calc_one_iv_XX<kmp_uint64>(
        (bounds_infoXX_template<kmp_uint64> *)(bounds),
        /*in/out*/ original_ivs, iterations, ind, start_with_lower_bound,
        checkBounds);
    break;
  default:
    KMP_ASSERT(false);
    return false;
  }
}

//----------Calculate original iv on one level for rectangular loop nest------

// Calculate one iv corresponding to iteration on the level ind.
// For a rectangular nest the value always fits into the bounds,
// so nothing needs to be returned.
template <typename T>
void kmp_calc_one_iv_rectang_XX(const bounds_infoXX_template<T> *bounds,
                                /*in/out*/ kmp_uint64 *original_ivs,
                                const kmp_iterations_t iterations,
                                kmp_index_t ind) {

  auto iteration = iterations[ind];

  kmp_uint64 temp =
      bounds->lb0 +
      bounds->lb1 * static_cast<T>(original_ivs[bounds->outer_iv]) +
      iteration * bounds->step;

  // Now trim the original iv according to its type:
  original_ivs[ind] = kmp_fix_iv(bounds->loop_iv_type, temp);
}

void kmp_calc_one_iv_rectang(const bounds_info_t *bounds,
                             /*in/out*/ kmp_uint64 *original_ivs,
                             const kmp_iterations_t iterations,
                             kmp_index_t ind) {

  switch (bounds->loop_type) {
  case loop_type_t::loop_type_int32:
    kmp_calc_one_iv_rectang_XX<kmp_int32>(
        (bounds_infoXX_template<kmp_int32> *)(bounds),
        /*in/out*/ original_ivs, iterations, ind);
    break;
  case loop_type_t::loop_type_uint32:
    kmp_calc_one_iv_rectang_XX<kmp_uint32>(
        (bounds_infoXX_template<kmp_uint32> *)(bounds),
        /*in/out*/ original_ivs, iterations, ind);
    break;
  case loop_type_t::loop_type_int64:
    kmp_calc_one_iv_rectang_XX<kmp_int64>(
        (bounds_infoXX_template<kmp_int64> *)(bounds),
        /*in/out*/ original_ivs, iterations, ind);
    break;
  case loop_type_t::loop_type_uint64:
    kmp_calc_one_iv_rectang_XX<kmp_uint64>(
        (bounds_infoXX_template<kmp_uint64> *)(bounds),
        /*in/out*/ original_ivs, iterations, ind);
    break;
  default:
    KMP_ASSERT(false);
  }
}

//----------------------------------------------------------------------------
// Rectangular loop nest
//----------------------------------------------------------------------------

//----------Canonicalize loop nest and calculate trip count-------------------

// Canonicalize the loop nest and calculate the overall trip count.
// "bounds_nest" has to be allocated per thread.
// The API will modify the original bounds_nest array to bring it to a
// canonical form (only <= and >=, no !=, <, >). If the original loop nest was
// already in a canonical form there will be no changes to the bounds in the
// bounds_nest array (only trip counts will be calculated).
// Returns the trip count of the overall space.
extern "C" kmp_loop_nest_iv_t
__kmpc_process_loop_nest_rectang(ident_t *loc, kmp_int32 gtid,
                                 /*in/out*/ bounds_info_t *original_bounds_nest,
                                 kmp_index_t n) {

  kmp_canonicalize_loop_nest(loc, /*in/out*/ original_bounds_nest, n);

  kmp_loop_nest_iv_t total = 1;

  for (kmp_index_t ind = 0; ind < n; ++ind) {
    auto bounds = &(original_bounds_nest[ind]);

    kmp_loop_nest_iv_t trip_count = kmp_calculate_trip_count(/*in/out*/ bounds);
    total *= trip_count;
  }

  return total;
}
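
// Example (illustrative): for the rectangular nest
//   for (i = 0; i < 4; ++i)
//     for (j = 0; j < 5; ++j)
// the per-level trip counts are 4 and 5, so this routine returns
// 4 * 5 = 20: the size of the collapsed iteration space, addressed by a
// single new IV in [0, 19].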

//----------Calculate old induction variables---------------------------------

// Calculate old induction variables corresponding to the overall new_iv.
// Note: original IVs are returned as if they had kmp_uint64 type and
// will have to be converted to the original type in user code.
// Note: trip counts should already be calculated by
// __kmpc_process_loop_nest_rectang.
// OMPTODO: special-case 2 and 3 nested loops: either make a different
// interface without the array or possibly template this over n.
extern "C" void
__kmpc_calc_original_ivs_rectang(ident_t *loc, kmp_loop_nest_iv_t new_iv,
                                 const bounds_info_t *original_bounds_nest,
                                 /*out*/ kmp_uint64 *original_ivs,
                                 kmp_index_t n) {

  kmp_iterations_t iterations =
      (kmp_iterations_t)__kmp_allocate(sizeof(kmp_loop_nest_iv_t) * n);

  // First, calculate the corresponding iteration in every original loop:
  for (kmp_index_t ind = n; ind > 0;) {
    --ind;
    auto bounds = &(original_bounds_nest[ind]);

    // should be optimized to OPDIVREM:
    auto temp = new_iv / bounds->trip_count;
    auto iteration = new_iv % bounds->trip_count;
    new_iv = temp;

    iterations[ind] = iteration;
  }
  KMP_ASSERT(new_iv == 0);

  for (kmp_index_t ind = 0; ind < n; ++ind) {
    auto bounds = &(original_bounds_nest[ind]);

    kmp_calc_one_iv_rectang(bounds, /*in/out*/ original_ivs, iterations, ind);
  }
  __kmp_free(iterations);
}
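
// Worked example (illustrative): with trip counts {4, 5} as above and
// new_iv = 13 the backwards div/mod pass computes
//   iterations[1] = 13 % 5 = 3, new_iv = 13 / 5 = 2,
//   iterations[0] = 2 % 4 = 2,  new_iv = 2 / 4 = 0,
// i.e. the mixed-radix digits (2, 3), which kmp_calc_one_iv_rectang then
// maps to original IVs via lb0 + iteration * step on each level.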

//----------------------------------------------------------------------------
// Non-rectangular loop nest
//----------------------------------------------------------------------------

//----------Calculate maximum possible span of iv values on one level---------

// Calculate the span for the IV on this loop level for the "<=" case.
// Note: it's for <= on this loop nest level, so the lower bound should be the
// smallest value and the upper bound should be the biggest value. If the loop
// won't execute, 'smallest' may be bigger than 'biggest', but we'd better not
// switch them around.
template <typename T>
void kmp_calc_span_lessoreq_XX(
    /* in/out*/ bounds_info_internalXX_template<T> *bounds,
    /* in/out*/ bounds_info_internal_t *bounds_nest) {

  typedef typename traits_t<T>::unsigned_t UT;
  // typedef typename traits_t<T>::signed_t ST;

  // typedef typename big_span_t span_t;
  typedef T span_t;

  auto &bbounds = bounds->b;

  if ((bbounds.lb1 != 0) || (bbounds.ub1 != 0)) {
    // This dimension depends on one of the previous ones; it can't be the
    // outermost one.
    bounds_info_internalXX_template<T> *previous =
        reinterpret_cast<bounds_info_internalXX_template<T> *>(
            &(bounds_nest[bbounds.outer_iv]));

    // OMPTODO: assert that T is compatible with the loop variable type on
    // the 'previous' loop

    {
      span_t bound_candidate1 =
          bbounds.lb0 + bbounds.lb1 * previous->span_smallest;
      span_t bound_candidate2 =
          bbounds.lb0 + bbounds.lb1 * previous->span_biggest;
      if (bound_candidate1 < bound_candidate2) {
        bounds->span_smallest = bound_candidate1;
      } else {
        bounds->span_smallest = bound_candidate2;
      }
    }

    {
      // We can't adjust the upper bound with respect to the step, because
      // the lower bound might be off after adjustments.

      span_t bound_candidate1 =
          bbounds.ub0 + bbounds.ub1 * previous->span_smallest;
      span_t bound_candidate2 =
          bbounds.ub0 + bbounds.ub1 * previous->span_biggest;
      if (bound_candidate1 < bound_candidate2) {
        bounds->span_biggest = bound_candidate2;
      } else {
        bounds->span_biggest = bound_candidate1;
      }
    }
  } else {
    // Rectangular:
    bounds->span_smallest = bbounds.lb0;
    bounds->span_biggest = bbounds.ub0;
  }
  if (!bounds->loop_bounds_adjusted) {
    // Here it's safe to reduce the space to a multiple of the step.
    // OMPTODO: check that the formula is correct.
    // Also check if it would be safe to do this if we didn't adjust the left
    // side.
    bounds->span_biggest -=
        (static_cast<UT>(bbounds.ub0 - bbounds.lb0)) % bbounds.step; // abs?
  }
}
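
// Example (illustrative): for the parallelogram nest
//   for (i = 0; i <= 9; ++i)
//     for (j = i; j <= i + 3; ++j)   // lb0 = 0, ub0 = 3, lb1 = ub1 = 1
// the inner level depends on i, so with previous->span_smallest = 0 and
// previous->span_biggest = 9 the candidates give span_smallest = 0 + 0 = 0
// and span_biggest = 3 + 9 = 12: every value j can ever take over the whole
// nest lies in [0, 12].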

// Calculate the span for the IV on this loop level for the ">=" case.
template <typename T>
void kmp_calc_span_greateroreq_XX(
    /* in/out*/ bounds_info_internalXX_template<T> *bounds,
    /* in/out*/ bounds_info_internal_t *bounds_nest) {

  typedef typename traits_t<T>::unsigned_t UT;
  // typedef typename traits_t<T>::signed_t ST;

  // typedef typename big_span_t span_t;
  typedef T span_t;

  auto &bbounds = bounds->b;

  if ((bbounds.lb1 != 0) || (bbounds.ub1 != 0)) {
    // This dimension depends on one of the previous ones; it can't be the
    // outermost one.
    bounds_info_internalXX_template<T> *previous =
        reinterpret_cast<bounds_info_internalXX_template<T> *>(
            &(bounds_nest[bbounds.outer_iv]));

    // OMPTODO: assert that T is compatible with the loop variable type on
    // the 'previous' loop

    {
      span_t bound_candidate1 =
          bbounds.lb0 + bbounds.lb1 * previous->span_smallest;
      span_t bound_candidate2 =
          bbounds.lb0 + bbounds.lb1 * previous->span_biggest;
      if (bound_candidate1 >= bound_candidate2) {
        bounds->span_smallest = bound_candidate1;
      } else {
        bounds->span_smallest = bound_candidate2;
      }
    }

    {
      // We can't adjust the upper bound with respect to the step, because
      // the lower bound might be off after adjustments.

      span_t bound_candidate1 =
          bbounds.ub0 + bbounds.ub1 * previous->span_smallest;
      span_t bound_candidate2 =
          bbounds.ub0 + bbounds.ub1 * previous->span_biggest;
      if (bound_candidate1 >= bound_candidate2) {
        bounds->span_biggest = bound_candidate2;
      } else {
        bounds->span_biggest = bound_candidate1;
      }
    }

  } else {
    // Rectangular:
    bounds->span_biggest = bbounds.lb0;
    bounds->span_smallest = bbounds.ub0;
  }
  if (!bounds->loop_bounds_adjusted) {
    // Here it's safe to reduce the space to a multiple of the step.
    // OMPTODO: check that the formula is correct.
    // Also check if it would be safe to do this if we didn't adjust the left
    // side.
    bounds->span_biggest -=
        (static_cast<UT>(bbounds.ub0 - bbounds.lb0)) % bbounds.step; // abs?
  }
}

// Calculate the maximum possible span for the IV on this loop level.
template <typename T>
void kmp_calc_span_XX(
    /* in/out*/ bounds_info_internalXX_template<T> *bounds,
    /* in/out*/ bounds_info_internal_t *bounds_nest) {

  if (bounds->b.comparison == comparison_t::comp_less_or_eq) {
    kmp_calc_span_lessoreq_XX(/* in/out*/ bounds, /* in/out*/ bounds_nest);
  } else {
    KMP_ASSERT(bounds->b.comparison == comparison_t::comp_greater_or_eq);
    kmp_calc_span_greateroreq_XX(/* in/out*/ bounds, /* in/out*/ bounds_nest);
  }
}

//----------All initial processing of the loop nest---------------------------

// Calculate new bounds for this loop level.
// To be able to work with the nest we need to get it to a parallelepiped
// shape. We need to stay in the original range of values, so that there will
// be no overflow; for that we'll adjust both upper and lower bounds as needed.
template <typename T>
void kmp_calc_new_bounds_XX(
    /* in/out*/ bounds_info_internalXX_template<T> *bounds,
    /* in/out*/ bounds_info_internal_t *bounds_nest) {

  auto &bbounds = bounds->b;

  if (bbounds.lb1 == bbounds.ub1) {
    // Already parallel, no need to adjust:
    bounds->loop_bounds_adjusted = false;
  } else {
    bounds->loop_bounds_adjusted = true;

    T old_lb1 = bbounds.lb1;
    T old_ub1 = bbounds.ub1;

    if (__kmp_sign(old_lb1) != __kmp_sign(old_ub1)) {
      // With this shape we can adjust to a rectangle:
      bbounds.lb1 = 0;
      bbounds.ub1 = 0;
    } else {
      // Get upper and lower bounds to be parallel
      // with values in the old range.
      // Note: abs didn't work here.
      if (((old_lb1 < 0) && (old_lb1 < old_ub1)) ||
          ((old_lb1 > 0) && (old_lb1 > old_ub1))) {
        bbounds.lb1 = old_ub1;
      } else {
        bbounds.ub1 = old_lb1;
      }
    }

    // Now we need to adjust lb0, ub0, otherwise in some cases the space will
    // shrink. The idea here is that for this IV we are now getting the same
    // span irrespective of the previous IV value.
    bounds_info_internalXX_template<T> *previous =
        reinterpret_cast<bounds_info_internalXX_template<T> *>(
            &bounds_nest[bbounds.outer_iv]);

    if (bbounds.comparison == comparison_t::comp_less_or_eq) {
      if (old_lb1 < bbounds.lb1) {
        KMP_ASSERT(old_lb1 < 0);
        // The length is good at the biggest outer_iv value,
        // we can use it to find where to move the lower bound:

        T sub = (bbounds.lb1 - old_lb1) * previous->span_biggest;
        bbounds.lb0 -= sub; // OMPTODO: what if it'll go out of unsigned space?
                            // e.g. it was 0?? (same below)
      } else if (old_lb1 > bbounds.lb1) {
        // We still need to move the lower bound:
        T add = (old_lb1 - bbounds.lb1) * previous->span_smallest;
        bbounds.lb0 += add;
      }

      if (old_ub1 > bbounds.ub1) {
        KMP_ASSERT(old_ub1 > 0);
        // The length is good at the biggest outer_iv value,
        // we can use it to find where to move the upper bound:

        T add = (old_ub1 - bbounds.ub1) * previous->span_biggest;
        bbounds.ub0 += add;
      } else if (old_ub1 < bbounds.ub1) {
        // We still need to move the upper bound:
        T sub = (bbounds.ub1 - old_ub1) * previous->span_smallest;
        bbounds.ub0 -= sub;
      }
    } else {
      KMP_ASSERT(bbounds.comparison == comparison_t::comp_greater_or_eq);
      if (old_lb1 < bbounds.lb1) {
        KMP_ASSERT(old_lb1 < 0);
        T sub = (bbounds.lb1 - old_lb1) * previous->span_smallest;
        bbounds.lb0 -= sub;
      } else if (old_lb1 > bbounds.lb1) {
        T add = (old_lb1 - bbounds.lb1) * previous->span_biggest;
        bbounds.lb0 += add;
      }

      if (old_ub1 > bbounds.ub1) {
        KMP_ASSERT(old_ub1 > 0);
        T add = (old_ub1 - bbounds.ub1) * previous->span_smallest;
        bbounds.ub0 += add;
      } else if (old_ub1 < bbounds.ub1) {
        T sub = (bbounds.ub1 - old_ub1) * previous->span_biggest;
        bbounds.ub0 -= sub;
      }
    }
  }
}
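
// Example (illustrative): the triangular nest
//   for (i = 0; i <= 9; ++i)
//     for (j = 0; j <= i; ++j)   // lb1 = 0, ub1 = 1
// has __kmp_sign(lb1) != __kmp_sign(ub1), so the inner level is adjusted to
// a rectangle (lb1 = ub1 = 0), and since old_ub1 > ub1 the upper bound moves
// out by (old_ub1 - ub1) * previous->span_biggest = 9, giving j in [0, 9].
// The expanded space has 10 * 10 = 100 points, of which 55 belong to the
// original triangle; chunk ends computed in the expanded space are pulled
// back into the original space later.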

// Do all processing for one canonicalized loop in the nest
// (assuming that the outer loops have already been processed):
template <typename T>
kmp_loop_nest_iv_t kmp_process_one_loop_XX(
    /* in/out*/ bounds_info_internalXX_template<T> *bounds,
    /*in/out*/ bounds_info_internal_t *bounds_nest) {

  kmp_calc_new_bounds_XX(/* in/out*/ bounds, /* in/out*/ bounds_nest);
  kmp_calc_span_XX(/* in/out*/ bounds, /* in/out*/ bounds_nest);
  return kmp_calculate_trip_count_XX(/*in/out*/ &(bounds->b));
}

// Non-rectangular loop nest, canonicalized to use <= or >=.
// Process the loop nest to get a parallelepiped shape,
// calculate the biggest spans for the IVs on all levels and calculate the
// overall trip count. "bounds_nest" has to be allocated per thread.
// Returns the overall trip count (for the adjusted space).
kmp_loop_nest_iv_t kmp_process_loop_nest(
    /*in/out*/ bounds_info_internal_t *bounds_nest, kmp_index_t n) {

  kmp_loop_nest_iv_t total = 1;

  for (kmp_index_t ind = 0; ind < n; ++ind) {
    auto bounds = &(bounds_nest[ind]);
    kmp_loop_nest_iv_t trip_count = 0;

    switch (bounds->b.loop_type) {
    case loop_type_t::loop_type_int32:
      trip_count = kmp_process_one_loop_XX<kmp_int32>(
          /*in/out*/ (bounds_info_internalXX_template<kmp_int32> *)(bounds),
          /*in/out*/ bounds_nest);
      break;
    case loop_type_t::loop_type_uint32:
      trip_count = kmp_process_one_loop_XX<kmp_uint32>(
          /*in/out*/ (bounds_info_internalXX_template<kmp_uint32> *)(bounds),
          /*in/out*/ bounds_nest);
      break;
    case loop_type_t::loop_type_int64:
      trip_count = kmp_process_one_loop_XX<kmp_int64>(
          /*in/out*/ (bounds_info_internalXX_template<kmp_int64> *)(bounds),
          /*in/out*/ bounds_nest);
      break;
    case loop_type_t::loop_type_uint64:
      trip_count = kmp_process_one_loop_XX<kmp_uint64>(
          /*in/out*/ (bounds_info_internalXX_template<kmp_uint64> *)(bounds),
          /*in/out*/ bounds_nest);
      break;
    default:
      KMP_ASSERT(false);
    }
    total *= trip_count;
  }

  return total;
}

//----------Calculate iterations (in the original or updated space)-----------

// Calculate the number of iterations in the original or updated space
// resulting in original_ivs[ind] (only on this level, non-negative, not
// counting the initial iteration).
template <typename T>
kmp_loop_nest_iv_t
kmp_calc_number_of_iterations_XX(const bounds_infoXX_template<T> *bounds,
                                 const kmp_point_t original_ivs,
                                 kmp_index_t ind) {

  kmp_loop_nest_iv_t iterations = 0;

  if (bounds->comparison == comparison_t::comp_less_or_eq) {
    iterations =
        (static_cast<T>(original_ivs[ind]) - bounds->lb0 -
         bounds->lb1 * static_cast<T>(original_ivs[bounds->outer_iv])) /
        __kmp_abs(bounds->step);
  } else {
    KMP_DEBUG_ASSERT(bounds->comparison == comparison_t::comp_greater_or_eq);
    iterations = (bounds->lb0 +
                  bounds->lb1 * static_cast<T>(original_ivs[bounds->outer_iv]) -
                  static_cast<T>(original_ivs[ind])) /
                 __kmp_abs(bounds->step);
  }

  return iterations;
}
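
// Worked example (illustrative): this inverts the forward mapping
//   iv = lb0 + lb1 * outer_iv + iteration * step.
// For lb0 = 0, lb1 = 1, step = 1, outer_iv = 4 and original_ivs[ind] = 6 it
// returns (6 - 0 - 1 * 4) / 1 = 2, the same iteration number that
// kmp_calc_one_iv_XX above would have used to produce the value 6.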

// Calculate the number of iterations in the original or updated space
// resulting in original_ivs[ind] (only on this level, non-negative).
kmp_loop_nest_iv_t kmp_calc_number_of_iterations(const bounds_info_t *bounds,
                                                 const kmp_point_t original_ivs,
                                                 kmp_index_t ind) {

  switch (bounds->loop_type) {
  case loop_type_t::loop_type_int32:
    return kmp_calc_number_of_iterations_XX<kmp_int32>(
        (bounds_infoXX_template<kmp_int32> *)(bounds), original_ivs, ind);
    break;
  case loop_type_t::loop_type_uint32:
    return kmp_calc_number_of_iterations_XX<kmp_uint32>(
        (bounds_infoXX_template<kmp_uint32> *)(bounds), original_ivs, ind);
    break;
  case loop_type_t::loop_type_int64:
    return kmp_calc_number_of_iterations_XX<kmp_int64>(
        (bounds_infoXX_template<kmp_int64> *)(bounds), original_ivs, ind);
    break;
  case loop_type_t::loop_type_uint64:
    return kmp_calc_number_of_iterations_XX<kmp_uint64>(
        (bounds_infoXX_template<kmp_uint64> *)(bounds), original_ivs, ind);
    break;
  default:
    KMP_ASSERT(false);
    return 0;
  }
}

//----------Calculate new iv corresponding to original ivs--------------------

// We got a point in the original loop nest.
// Take the updated bounds and calculate what new_iv will correspond to this
// point. When we are getting original IVs from new_iv, we have to adjust them
// to fit into the original loop bounds. Getting new_iv for the adjusted
// original IVs will help with making more chunks non-empty.
kmp_loop_nest_iv_t
kmp_calc_new_iv_from_original_ivs(const bounds_info_internal_t *bounds_nest,
                                  const kmp_point_t original_ivs,
                                  kmp_index_t n) {

  kmp_loop_nest_iv_t new_iv = 0;

  for (kmp_index_t ind = 0; ind < n; ++ind) {
    auto bounds = &(bounds_nest[ind].b);

    new_iv = new_iv * bounds->trip_count +
             kmp_calc_number_of_iterations(bounds, original_ivs, ind);
  }

  return new_iv;
}
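
// Example (illustrative): with per-level trip counts {4, 5} and per-level
// iterations (2, 3) the Horner-style loop above computes
//   new_iv = (0 * 4 + 2) * 5 + 3 = 13,
// the exact inverse of the div/mod decomposition used in
// __kmpc_calc_original_ivs_rectang.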

//----------Calculate original ivs for provided iterations--------------------

// Calculate original IVs for the provided iterations, assuming the iterations
// were calculated in the original space.
// The loop nest is in canonical form (with <= / >=).
bool kmp_calc_original_ivs_from_iterations(
    const bounds_info_t *original_bounds_nest, kmp_index_t n,
    /*in/out*/ kmp_point_t original_ivs,
    /*in/out*/ kmp_iterations_t iterations, kmp_index_t ind) {

  kmp_index_t lengthened_ind = n;

  for (; ind < n;) {
    auto bounds = &(original_bounds_nest[ind]);
    bool good = kmp_calc_one_iv(bounds, /*in/out*/ original_ivs, iterations,
                                ind, (lengthened_ind < ind), true);

    if (!good) {
      // The calculated iv value is too big (or too small for >=):
      if (ind == 0) {
        // The space is empty:
        return false;
      } else {
        // Go to the next iteration on the outer loop:
        --ind;
        ++iterations[ind];
        lengthened_ind = ind;
        for (kmp_index_t i = ind + 1; i < n; ++i) {
          iterations[i] = 0;
        }
        continue;
      }
    }
    ++ind;
  }

  return true;
}
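
// Example (illustrative): in the triangle "i in [0, 9], j <= i", starting
// from iterations (0, 3) the inner IV computes to j = 3, which exceeds the
// upper bound i = 0, so the code advances the outer level to iterations
// (1, 0), zeroes the inner level, and retries; the first valid point found
// this way is (i, j) = (1, 0). Only when even the outermost level runs out
// of iterations is the space declared empty.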

//----------Calculate original ivs for the beginning of the loop nest---------

// Calculate IVs for the beginning of the loop nest.
// Note: simply taking the lower bounds of all loops may not work, because
// on some iterations of the outer loops the inner loops can be empty.
// The loop nest is in canonical form (with <= / >=).
bool kmp_calc_original_ivs_for_start(const bounds_info_t *original_bounds_nest,
                                     kmp_index_t n,
                                     /*out*/ kmp_point_t original_ivs) {

  // Iterations in the original space, multiplied by step:
  kmp_iterations_t iterations =
      (kmp_iterations_t)__kmp_allocate(sizeof(kmp_loop_nest_iv_t) * n);

  for (kmp_index_t ind = n; ind > 0;) {
    --ind;
    iterations[ind] = 0;
  }

  // Now calculate the point:
  bool b = kmp_calc_original_ivs_from_iterations(original_bounds_nest, n,
                                                 /*in/out*/ original_ivs,
                                                 /*in/out*/ iterations, 0);
  __kmp_free(iterations);
  return b;
}

//----------Calculate next point in the original loop space-------------------

// From the current set of original IVs calculate the next point.
// Return false if there is no next point in the loop bounds.
bool kmp_calc_next_original_ivs(const bounds_info_t *original_bounds_nest,
                                kmp_index_t n, const kmp_point_t original_ivs,
                                /*out*/ kmp_point_t next_original_ivs) {
  // Iterations in the original space, multiplied by step (so can be negative):
  kmp_iterations_t iterations =
      (kmp_iterations_t)__kmp_allocate(sizeof(kmp_loop_nest_iv_t) * n);

  // First, calculate the corresponding iteration in every original loop:
  for (kmp_index_t ind = 0; ind < n; ++ind) {
    auto bounds = &(original_bounds_nest[ind]);
    iterations[ind] = kmp_calc_number_of_iterations(bounds, original_ivs, ind);
  }

  for (kmp_index_t ind = 0; ind < n; ++ind) {
    next_original_ivs[ind] = original_ivs[ind];
  }

  // Next, add one step to the iterations on the innermost level, and see if
  // we need to move up the nest:
  kmp_index_t ind = n - 1;
  ++iterations[ind];

  bool b = kmp_calc_original_ivs_from_iterations(
      original_bounds_nest, n, /*in/out*/ next_original_ivs, iterations, ind);

  __kmp_free(iterations);
  return b;
}

//----------Calculate chunk end in the original loop space--------------------

// For one level calculate the old induction variable corresponding to the
// overall new_iv for the chunk end.
// Return true if it fits into the upper bound on this level
// (if not, we need to re-calculate).
template <typename T>
bool kmp_calc_one_iv_for_chunk_end_XX(
    const bounds_infoXX_template<T> *bounds,
    const bounds_infoXX_template<T> *updated_bounds,
    /*in/out*/ kmp_point_t original_ivs, const kmp_iterations_t iterations,
    kmp_index_t ind, bool start_with_lower_bound, bool compare_with_start,
    const kmp_point_t original_ivs_start) {

  // typedef std::conditional<std::is_signed<T>::value, kmp_int64, kmp_uint64>
  // big_span_t;

  // OMPTODO: is it good enough, or do we need ST or do we need big_span_t?
  T temp = 0;

  T outer_iv = static_cast<T>(original_ivs[bounds->outer_iv]);

  if (start_with_lower_bound) {
    // We moved to the next iteration on one of the outer loops, so we may as
    // well use the lower bound here:
    temp = bounds->lb0 + bounds->lb1 * outer_iv;
  } else {
    // Start in the expanded space, but:
    // - we need to hit the original space lower bound, so we need to account
    //   for that
    // - we have to go into the original space, even if that means adding more
    //   iterations than was planned
    // - we have to go past (or be equal to) the previous point (which is the
    //   chunk starting point)

    auto iteration = iterations[ind];

    auto step = bounds->step;

    // In case of >= it's negative:
    auto accountForStep =
        ((bounds->lb0 + bounds->lb1 * outer_iv) -
         (updated_bounds->lb0 + updated_bounds->lb1 * outer_iv)) %
        step;

    temp = updated_bounds->lb0 + updated_bounds->lb1 * outer_iv +
           accountForStep + iteration * step;

    if (((bounds->comparison == comparison_t::comp_less_or_eq) &&
         (temp < (bounds->lb0 + bounds->lb1 * outer_iv))) ||
        ((bounds->comparison == comparison_t::comp_greater_or_eq) &&
         (temp > (bounds->lb0 + bounds->lb1 * outer_iv)))) {
      // Too small (or too big), didn't reach the original lower bound. Use a
      // heuristic:
      temp = bounds->lb0 + bounds->lb1 * outer_iv + iteration / 2 * step;
    }

    if (compare_with_start) {

      T start = static_cast<T>(original_ivs_start[ind]);

      temp = kmp_fix_iv(bounds->loop_iv_type, temp);

      // On all previous levels the start of the chunk is the same as the end,
      // so we need to be really careful here:
      if (((bounds->comparison == comparison_t::comp_less_or_eq) &&
           (temp < start)) ||
          ((bounds->comparison == comparison_t::comp_greater_or_eq) &&
           (temp > start))) {
        // The end of the chunk can't be smaller (for >= bigger) than its
        // start. Use a heuristic:
        temp = start + iteration / 4 * step;
      }
    }
  }

  original_ivs[ind] = temp = kmp_fix_iv(bounds->loop_iv_type, temp);

  if (((bounds->comparison == comparison_t::comp_less_or_eq) &&
       (temp > (bounds->ub0 + bounds->ub1 * outer_iv))) ||
      ((bounds->comparison == comparison_t::comp_greater_or_eq) &&
       (temp < (bounds->ub0 + bounds->ub1 * outer_iv)))) {
    // Too big (or too small for >=).
    return false;
  }

  return true;
}

// For one level calculate the old induction variable corresponding to the
// overall new_iv for the chunk end.
bool kmp_calc_one_iv_for_chunk_end(const bounds_info_t *bounds,
                                   const bounds_info_t *updated_bounds,
                                   /*in/out*/ kmp_point_t original_ivs,
                                   const kmp_iterations_t iterations,
                                   kmp_index_t ind, bool start_with_lower_bound,
                                   bool compare_with_start,
                                   const kmp_point_t original_ivs_start) {

  switch (bounds->loop_type) {
  case loop_type_t::loop_type_int32:
    return kmp_calc_one_iv_for_chunk_end_XX<kmp_int32>(
        (bounds_infoXX_template<kmp_int32> *)(bounds),
        (bounds_infoXX_template<kmp_int32> *)(updated_bounds),
        /*in/out*/
        original_ivs, iterations, ind, start_with_lower_bound,
        compare_with_start, original_ivs_start);
    break;
  case loop_type_t::loop_type_uint32:
    return kmp_calc_one_iv_for_chunk_end_XX<kmp_uint32>(
        (bounds_infoXX_template<kmp_uint32> *)(bounds),
        (bounds_infoXX_template<kmp_uint32> *)(updated_bounds),
        /*in/out*/
        original_ivs, iterations, ind, start_with_lower_bound,
        compare_with_start, original_ivs_start);
    break;
  case loop_type_t::loop_type_int64:
    return kmp_calc_one_iv_for_chunk_end_XX<kmp_int64>(
        (bounds_infoXX_template<kmp_int64> *)(bounds),
        (bounds_infoXX_template<kmp_int64> *)(updated_bounds),
        /*in/out*/
        original_ivs, iterations, ind, start_with_lower_bound,
        compare_with_start, original_ivs_start);
    break;
  case loop_type_t::loop_type_uint64:
    return kmp_calc_one_iv_for_chunk_end_XX<kmp_uint64>(
        (bounds_infoXX_template<kmp_uint64> *)(bounds),
        (bounds_infoXX_template<kmp_uint64> *)(updated_bounds),
        /*in/out*/
        original_ivs, iterations, ind, start_with_lower_bound,
        compare_with_start, original_ivs_start);
    break;
  default:
    KMP_ASSERT(false);
    return false;
  }
}

// Calculate old induction variables corresponding to the overall new_iv for
// the chunk end. If due to the space extension we are getting old IVs outside
// of the boundaries, bring them into the boundaries. This needs to be done in
// the runtime, especially on the lower bounds side. When getting the result we
// need to make sure that the new chunk starts right after the old chunk, not
// overlapping with it (this is done elsewhere), and that the end of the chunk
// is further than the beginning of the chunk. We don't need an exact ending
// point here, just something more-or-less close to the desired chunk length;
// bigger is fine (smaller would be fine too, but we risk going into an
// infinite loop, so we do smaller only at the very end of the space).
// Result: false if we could not find the ending point in the original loop
// space. In this case the caller can use the original upper bounds as the end
// of the chunk. The chunk won't be empty, because it'll have at least the
// starting point, which is by construction in the original space.
bool kmp_calc_original_ivs_for_chunk_end(
    const bounds_info_t *original_bounds_nest, kmp_index_t n,
    const bounds_info_internal_t *updated_bounds_nest,
    const kmp_point_t original_ivs_start, kmp_loop_nest_iv_t new_iv,
    /*out*/ kmp_point_t original_ivs) {

  // Iterations in the expanded space:
  kmp_iterations_t iterations =
      (kmp_iterations_t)__kmp_allocate(sizeof(kmp_loop_nest_iv_t) * n);

  // First, calculate the corresponding iteration in every modified loop:
  for (kmp_index_t ind = n; ind > 0;) {
    --ind;
    auto &updated_bounds = updated_bounds_nest[ind];

    // should be optimized to OPDIVREM:
    auto new_ind = new_iv / updated_bounds.b.trip_count;
    auto iteration = new_iv % updated_bounds.b.trip_count;

    new_iv = new_ind;
    iterations[ind] = iteration;
  }
  KMP_DEBUG_ASSERT(new_iv == 0);

  kmp_index_t lengthened_ind = n;
  kmp_index_t equal_ind = -1;

  // Next, calculate the point, but in the original loop nest:
  for (kmp_index_t ind = 0; ind < n;) {
    auto bounds = &(original_bounds_nest[ind]);
    auto updated_bounds = &(updated_bounds_nest[ind].b);

    bool good = kmp_calc_one_iv_for_chunk_end(
        bounds, updated_bounds,
        /*in/out*/ original_ivs, iterations, ind, (lengthened_ind < ind),
        (equal_ind >= ind - 1), original_ivs_start);

    if (!good) {
      // Too big (or too small for >=).
      if (ind == 0) {
        // Need to reduce to the end.
        __kmp_free(iterations);
        return false;
      } else {
        // Go to the next iteration on the outer loop:
        --ind;
        ++(iterations[ind]);
        lengthened_ind = ind;
        if (equal_ind >= lengthened_ind) {
          // We've changed the number of iterations here,
          // so it can't be the same anymore:
          equal_ind = lengthened_ind - 1;
        }
        for (kmp_index_t i = ind + 1; i < n; ++i) {
          iterations[i] = 0;
        }
        continue;
      }
    }

    if ((equal_ind == ind - 1) &&
        (kmp_ivs_eq(bounds->loop_iv_type, original_ivs[ind],
                    original_ivs_start[ind]))) {
      equal_ind = ind;
    } else if ((equal_ind > ind - 1) &&
               !(kmp_ivs_eq(bounds->loop_iv_type, original_ivs[ind],
                            original_ivs_start[ind]))) {
      equal_ind = ind - 1;
    }
    ++ind;
  }

  __kmp_free(iterations);
  return true;
}

//----------Calculate upper bounds for the last chunk-------------------------

// Calculate one upper bound for the end.
template <typename T>
void kmp_calc_one_iv_end_XX(const bounds_infoXX_template<T> *bounds,
                            /*in/out*/ kmp_point_t original_ivs,
                            kmp_index_t ind) {

  T temp = bounds->ub0 +
           bounds->ub1 * static_cast<T>(original_ivs[bounds->outer_iv]);

  original_ivs[ind] = kmp_fix_iv(bounds->loop_iv_type, temp);
}

void kmp_calc_one_iv_end(const bounds_info_t *bounds,
                         /*in/out*/ kmp_point_t original_ivs, kmp_index_t ind) {

  switch (bounds->loop_type) {
  default:
    KMP_ASSERT(false);
    break;
  case loop_type_t::loop_type_int32:
    kmp_calc_one_iv_end_XX<kmp_int32>(
        (bounds_infoXX_template<kmp_int32> *)(bounds),
        /*in/out*/ original_ivs, ind);
    break;
  case loop_type_t::loop_type_uint32:
    kmp_calc_one_iv_end_XX<kmp_uint32>(
        (bounds_infoXX_template<kmp_uint32> *)(bounds),
        /*in/out*/ original_ivs, ind);
    break;
  case loop_type_t::loop_type_int64:
    kmp_calc_one_iv_end_XX<kmp_int64>(
        (bounds_infoXX_template<kmp_int64> *)(bounds),
        /*in/out*/ original_ivs, ind);
    break;
  case loop_type_t::loop_type_uint64:
    kmp_calc_one_iv_end_XX<kmp_uint64>(
        (bounds_infoXX_template<kmp_uint64> *)(bounds),
        /*in/out*/ original_ivs, ind);
    break;
  }
}

// Calculate the upper bounds for the last loop iteration. Just use the
// original upper bounds (adjusted when canonicalized to use <= / >=). There is
// no need to check that this point is in the original space (it's likely not).
void kmp_calc_original_ivs_for_end(
    const bounds_info_t *const original_bounds_nest, kmp_index_t n,
    /*out*/ kmp_point_t original_ivs) {
  for (kmp_index_t ind = 0; ind < n; ++ind) {
    auto bounds = &(original_bounds_nest[ind]);
    kmp_calc_one_iv_end(bounds, /*in/out*/ original_ivs, ind);
  }
}

//----------Init API for non-rectangular loops--------------------------------

// Init API for collapsed loops (static, no chunks defined).
// "bounds_nest" has to be allocated per thread.
// The API will modify the original bounds_nest array to bring it to a
// canonical form (only <= and >=, no !=, <, >). If the original loop nest was
// already in a canonical form there will be no changes to the bounds in the
// bounds_nest array (only trip counts will be calculated). Internally the API
// will expand the space to a parallelogram/parallelepiped, calculate the
// total, calculate the bounds for the chunks in terms of the new IV,
// re-calculate them in terms of the old IVs (especially important on the left
// side, to hit the lower bounds and not step over), and pick the correct chunk
// for this thread (so it will calculate chunks up to the needed one). It could
// be optimized to calculate just this chunk, potentially a bit less well
// distributed among threads. It is designed to make sure that threads will
// receive predictable chunks, deterministically (so that the next nest of
// loops with similar characteristics will get exactly the same chunks on the
// same threads).
// Current contract: chunk_bounds_nest has only lb0 and ub0;
// lb1 and ub1 are set to 0 and can be ignored. (This may change in the
// future.)
extern "C" kmp_int32
__kmpc_for_collapsed_init(ident_t *loc, kmp_int32 gtid,
                          /*in/out*/ bounds_info_t *original_bounds_nest,
                          /*out*/ bounds_info_t *chunk_bounds_nest,
                          kmp_index_t n, /*out*/ kmp_int32 *plastiter) {

  KMP_DEBUG_ASSERT(plastiter && original_bounds_nest);
  KE_TRACE(10, ("__kmpc_for_collapsed_init called (%d)\n", gtid));

  if (__kmp_env_consistency_check) {
    __kmp_push_workshare(gtid, ct_pdo, loc);
  }

  kmp_canonicalize_loop_nest(loc, /*in/out*/ original_bounds_nest, n);

  bounds_info_internal_t *updated_bounds_nest =
      (bounds_info_internal_t *)__kmp_allocate(sizeof(bounds_info_internal_t) *
                                               n);

  for (kmp_index_t i = 0; i < n; ++i) {
    updated_bounds_nest[i].b = original_bounds_nest[i];
  }

  kmp_loop_nest_iv_t total =
      kmp_process_loop_nest(/*in/out*/ updated_bounds_nest, n);

  if (plastiter != NULL) {
    *plastiter = FALSE;
  }

  if (total == 0) {
    // Loop won't execute:
    __kmp_free(updated_bounds_nest);
    return FALSE;
  }

  // OMPTODO: DISTRIBUTE is not supported yet
  __kmp_assert_valid_gtid(gtid);
  kmp_uint32 tid = __kmp_tid_from_gtid(gtid);

  kmp_info_t *th = __kmp_threads[gtid];
  kmp_team_t *team = th->th.th_team;
  kmp_uint32 nth = team->t.t_nproc; // Number of threads

  KMP_DEBUG_ASSERT(tid < nth);

  kmp_point_t original_ivs_start =
      (kmp_point_t)__kmp_allocate(sizeof(kmp_uint64) * n);
  kmp_point_t original_ivs_end =
      (kmp_point_t)__kmp_allocate(sizeof(kmp_uint64) * n);
  kmp_point_t original_ivs_next_start =
      (kmp_point_t)__kmp_allocate(sizeof(kmp_uint64) * n);

  if (!kmp_calc_original_ivs_for_start(original_bounds_nest, n,
                                       /*out*/ original_ivs_start)) {
    // Loop won't execute:
    __kmp_free(updated_bounds_nest);
    __kmp_free(original_ivs_start);
    __kmp_free(original_ivs_end);
    __kmp_free(original_ivs_next_start);
    return FALSE;
  }

  // Not doing this optimization for one thread:
  // (1) more to test
  // (2) without it we keep the current contract: chunk_bounds_nest has only
  //     lb0 and ub0; lb1 and ub1 are set to 0 and can be ignored.
  // if (nth == 1) {
  //   // One thread:
  //   // Copy all info from original_bounds_nest, it'll be good enough.

  //   for (kmp_index_t i = 0; i < n; ++i) {
  //     chunk_bounds_nest[i] = original_bounds_nest[i];
  //   }

  //   if (plastiter != NULL) {
  //     *plastiter = TRUE;
  //   }
  //   __kmp_free(updated_bounds_nest);
  //   __kmp_free(original_ivs_start);
  //   __kmp_free(original_ivs_end);
  //   __kmp_free(original_ivs_next_start);
  //   return TRUE;
  // }

  kmp_loop_nest_iv_t new_iv = kmp_calc_new_iv_from_original_ivs(
      updated_bounds_nest, original_ivs_start, n);

  bool last_iter = false;

  for (; nth > 0;) {
    // We could calculate the chunk size once, but this is to compensate that
    // the original space is not a parallelepiped and some threads can be left
    // without work:
    KMP_DEBUG_ASSERT(total >= new_iv);

    kmp_loop_nest_iv_t total_left = total - new_iv;
    kmp_loop_nest_iv_t chunk_size = total_left / nth;
    kmp_loop_nest_iv_t remainder = total_left % nth;

    kmp_loop_nest_iv_t curr_chunk_size = chunk_size;

    if (remainder > 0) {
      ++curr_chunk_size;
      --remainder;
    }

#if defined(KMP_DEBUG)
    kmp_loop_nest_iv_t new_iv_for_start = new_iv;
#endif

    if (curr_chunk_size > 1) {
      new_iv += curr_chunk_size - 1;
    }

    if ((nth == 1) || (new_iv >= total - 1)) {
      // Do this one till the end - just in case we miscalculated
      // and either too much is left to process or new_iv is a bit too big:
      kmp_calc_original_ivs_for_end(original_bounds_nest, n,
                                    /*out*/ original_ivs_end);

      last_iter = true;
    } else {
      // Note: here we make sure it's past (or equal to) the previous point.
      if (!kmp_calc_original_ivs_for_chunk_end(original_bounds_nest, n,
                                               updated_bounds_nest,
                                               original_ivs_start, new_iv,
                                               /*out*/ original_ivs_end)) {
        // We could not find the ending point, use the original upper bounds:
        kmp_calc_original_ivs_for_end(original_bounds_nest, n,
                                      /*out*/ original_ivs_end);

        last_iter = true;
      }
    }

#if defined(KMP_DEBUG)
    auto new_iv_for_end = kmp_calc_new_iv_from_original_ivs(
        updated_bounds_nest, original_ivs_end, n);
    KMP_DEBUG_ASSERT(new_iv_for_end >= new_iv_for_start);
#endif

    if (last_iter && (tid != 0)) {
      // We are done, this was the last chunk, but no chunk for the current
      // thread was found:
      __kmp_free(updated_bounds_nest);
      __kmp_free(original_ivs_start);
      __kmp_free(original_ivs_end);
      __kmp_free(original_ivs_next_start);
      return FALSE;
    }

    if (tid == 0) {
      // We found the chunk for this thread, now we need to check if it's the
      // last chunk or not:

      if (last_iter ||
          !kmp_calc_next_original_ivs(original_bounds_nest, n, original_ivs_end,
                                      /*out*/ original_ivs_next_start)) {
        // No more loop iterations left to process,
        // which means that the currently found chunk is the last chunk:
        if (plastiter != NULL) {
          *plastiter = TRUE;
        }
      }

      // Fill in chunk bounds:
      for (kmp_index_t i = 0; i < n; ++i) {
        chunk_bounds_nest[i] =
            original_bounds_nest[i]; // To fill in types, etc. - optional
        chunk_bounds_nest[i].lb0_u64 = original_ivs_start[i];
        chunk_bounds_nest[i].lb1_u64 = 0;

        chunk_bounds_nest[i].ub0_u64 = original_ivs_end[i];
        chunk_bounds_nest[i].ub1_u64 = 0;
      }

      __kmp_free(updated_bounds_nest);
      __kmp_free(original_ivs_start);
      __kmp_free(original_ivs_end);
      __kmp_free(original_ivs_next_start);
      return TRUE;
    }

    --tid;
    --nth;

    bool next_chunk = kmp_calc_next_original_ivs(
        original_bounds_nest, n, original_ivs_end, /*out*/ original_ivs_start);
    if (!next_chunk) {
      // No more loop iterations to process;
      // the previous chunk was the last chunk.
      break;
    }

    // original_ivs_start is next to the previous chunk's original_ivs_end;
    // we need to start the new chunk here, so chunks will be one after another
    // without any gap or overlap:
    new_iv = kmp_calc_new_iv_from_original_ivs(updated_bounds_nest,
                                               original_ivs_start, n);
  }

  __kmp_free(updated_bounds_nest);
  __kmp_free(original_ivs_start);
  __kmp_free(original_ivs_end);
  __kmp_free(original_ivs_next_start);
  return FALSE;
}
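
// Illustrative usage sketch (hypothetical compiler-generated caller; the
// exact codegen is up to the compiler): for a collapse(2) worksharing loop
// the caller fills one bounds_info_t per level, asks for this thread's
// chunk, and then iterates only that chunk:
//
//   bounds_info_t nest[2];  // filled in per level: loop_type, lb0, ub0,
//                           // step, comparison, outer_iv, lb1/ub1
//   bounds_info_t chunk[2];
//   kmp_int32 plast = 0;
//   if (__kmpc_for_collapsed_init(loc, gtid, nest, chunk, 2, &plast)) {
//     // chunk[i].lb0_u64 holds the chunk's starting point and
//     // chunk[i].ub0_u64 its ending point on level i (lb1/ub1 are 0);
//     // iterate lexicographically from the start point to the end point,
//     // inclusive, executing the original loop body for each point.
//   }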