Diffstat (limited to 'gcc48-libgomp-20160715.patch')
-rw-r--r-- | gcc48-libgomp-20160715.patch | 10653
1 file changed, 10653 insertions, 0 deletions
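For context before the raw diff: this patch backports OpenMP 4.5 runtime entry points into the GCC 4.8 libgomp -- doacross (ordered(n) / depend(sink|source)) loop support, GOMP_target_ext with its host fallback, the nonmonotonic schedule aliases, and the places/affinity API, among others. Two illustrative user-level sketches follow; they are not part of the patch, and they assume a compiler recent enough to emit the OpenMP 4.5 constructs, since the patch updates only the runtime library.

First, a minimal doacross loop; the cross-iteration ordering shown here is what the new GOMP_doacross_post / GOMP_doacross_wait entry points implement:

#include <stdio.h>
#define N 64

int
main (void)
{
  static int a[N];
  int i;
  a[0] = 1;
  #pragma omp parallel for ordered(1)
  for (i = 1; i < N; i++)
    {
      #pragma omp ordered depend(sink: i - 1)  /* wait until iteration i-1 has posted */
      a[i] = a[i - 1] + 1;                     /* genuine cross-iteration dependence */
      #pragma omp ordered depend(source)       /* post completion of iteration i */
    }
  printf ("%d\n", a[N - 1]);                   /* prints 64 */
  return 0;
}

Second, a probe of the new places API; note that the posix fallback in the diff stubs omp_get_place_num_procs out to return 0, so this prints nothing unless the Linux affinity support is in use and places (e.g. via OMP_PLACES) are configured:

#include <stdio.h>
#include <omp.h>

int
main (void)
{
  int p, nplaces = omp_get_num_places ();
  for (p = 0; p < nplaces; p++)
    {
      int ids[256], nprocs = omp_get_place_num_procs (p);
      if (nprocs > 0 && nprocs <= 256)
        {
          omp_get_place_proc_ids (p, ids);     /* fills ids[0..nprocs-1] */
          printf ("place %d: %d procs, first id %d\n", p, nprocs, ids[0]);
        }
    }
  return 0;
}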
diff --git a/gcc48-libgomp-20160715.patch b/gcc48-libgomp-20160715.patch new file mode 100644 index 0000000..9b6a61e --- /dev/null +++ b/gcc48-libgomp-20160715.patch @@ -0,0 +1,10653 @@ +--- libgomp/config/linux/wait.h.jj 2013-01-31 20:29:10.091548989 +0100 ++++ libgomp/config/linux/wait.h 2016-07-13 16:57:18.902355979 +0200 +@@ -34,13 +34,13 @@ + + #define FUTEX_WAIT 0 + #define FUTEX_WAKE 1 +-#define FUTEX_PRIVATE_FLAG 128L ++#define FUTEX_PRIVATE_FLAG 128 + + #ifdef HAVE_ATTRIBUTE_VISIBILITY + # pragma GCC visibility push(hidden) + #endif + +-extern long int gomp_futex_wait, gomp_futex_wake; ++extern int gomp_futex_wait, gomp_futex_wake; + + #include <futex.h> + +@@ -48,7 +48,9 @@ static inline int do_spin (int *addr, in + { + unsigned long long i, count = gomp_spin_count_var; + +- if (__builtin_expect (gomp_managed_threads > gomp_available_cpus, 0)) ++ if (__builtin_expect (__atomic_load_n (&gomp_managed_threads, ++ MEMMODEL_RELAXED) ++ > gomp_available_cpus, 0)) + count = gomp_throttled_spin_count_var; + for (i = 0; i < count; i++) + if (__builtin_expect (__atomic_load_n (addr, MEMMODEL_RELAXED) != val, 0)) +--- libgomp/config/linux/affinity.c.jj 2014-05-15 10:56:37.499502573 +0200 ++++ libgomp/config/linux/affinity.c 2016-07-13 16:57:18.902355979 +0200 +@@ -352,6 +352,45 @@ gomp_affinity_print_place (void *p) + fprintf (stderr, ":%lu", len); + } + ++int ++omp_get_place_num_procs (int place_num) ++{ ++ if (place_num < 0 || place_num >= gomp_places_list_len) ++ return 0; ++ ++ cpu_set_t *cpusetp = (cpu_set_t *) gomp_places_list[place_num]; ++ return gomp_cpuset_popcount (gomp_cpuset_size, cpusetp); ++} ++ ++void ++omp_get_place_proc_ids (int place_num, int *ids) ++{ ++ if (place_num < 0 || place_num >= gomp_places_list_len) ++ return; ++ ++ cpu_set_t *cpusetp = (cpu_set_t *) gomp_places_list[place_num]; ++ unsigned long i, max = 8 * gomp_cpuset_size; ++ for (i = 0; i < max; i++) ++ if (CPU_ISSET_S (i, gomp_cpuset_size, cpusetp)) ++ *ids++ = i; ++} ++ ++void ++gomp_get_place_proc_ids_8 (int place_num, int64_t *ids) ++{ ++ if (place_num < 0 || place_num >= gomp_places_list_len) ++ return; ++ ++ cpu_set_t *cpusetp = (cpu_set_t *) gomp_places_list[place_num]; ++ unsigned long i, max = 8 * gomp_cpuset_size; ++ for (i = 0; i < max; i++) ++ if (CPU_ISSET_S (i, gomp_cpuset_size, cpusetp)) ++ *ids++ = i; ++} ++ ++ialias(omp_get_place_num_procs) ++ialias(omp_get_place_proc_ids) ++ + #else + + #include "../posix/affinity.c" +--- libgomp/config/linux/mutex.c.jj 2013-01-21 16:00:38.220917670 +0100 ++++ libgomp/config/linux/mutex.c 2016-07-13 16:57:18.870356375 +0200 +@@ -28,8 +28,8 @@ + + #include "wait.h" + +-long int gomp_futex_wake = FUTEX_WAKE | FUTEX_PRIVATE_FLAG; +-long int gomp_futex_wait = FUTEX_WAIT | FUTEX_PRIVATE_FLAG; ++int gomp_futex_wake = FUTEX_WAKE | FUTEX_PRIVATE_FLAG; ++int gomp_futex_wait = FUTEX_WAIT | FUTEX_PRIVATE_FLAG; + + void + gomp_mutex_lock_slow (gomp_mutex_t *mutex, int oldval) +--- libgomp/config/posix/affinity.c.jj 2014-05-15 10:56:37.987498844 +0200 ++++ libgomp/config/posix/affinity.c 2016-07-15 12:08:28.410015743 +0200 +@@ -113,3 +113,27 @@ gomp_affinity_print_place (void *p) + { + (void) p; + } ++ ++int ++omp_get_place_num_procs (int place_num) ++{ ++ (void) place_num; ++ return 0; ++} ++ ++void ++omp_get_place_proc_ids (int place_num, int *ids) ++{ ++ (void) place_num; ++ (void) ids; ++} ++ ++void ++gomp_get_place_proc_ids_8 (int place_num, int64_t *ids) ++{ ++ (void) place_num; ++ (void) ids; ++} ++ ++ialias(omp_get_place_num_procs) 
++ialias(omp_get_place_proc_ids) +--- libgomp/loop_ull.c.jj 2013-01-21 16:00:46.477871806 +0100 ++++ libgomp/loop_ull.c 2016-07-13 16:57:18.918355780 +0200 +@@ -174,15 +174,15 @@ GOMP_loop_ull_runtime_start (bool up, go + { + case GFS_STATIC: + return gomp_loop_ull_static_start (up, start, end, incr, +- icv->run_sched_modifier, ++ icv->run_sched_chunk_size, + istart, iend); + case GFS_DYNAMIC: + return gomp_loop_ull_dynamic_start (up, start, end, incr, +- icv->run_sched_modifier, ++ icv->run_sched_chunk_size, + istart, iend); + case GFS_GUIDED: + return gomp_loop_ull_guided_start (up, start, end, incr, +- icv->run_sched_modifier, ++ icv->run_sched_chunk_size, + istart, iend); + case GFS_AUTO: + /* For now map to schedule(static), later on we could play with feedback +@@ -278,15 +278,15 @@ GOMP_loop_ull_ordered_runtime_start (boo + { + case GFS_STATIC: + return gomp_loop_ull_ordered_static_start (up, start, end, incr, +- icv->run_sched_modifier, ++ icv->run_sched_chunk_size, + istart, iend); + case GFS_DYNAMIC: + return gomp_loop_ull_ordered_dynamic_start (up, start, end, incr, +- icv->run_sched_modifier, ++ icv->run_sched_chunk_size, + istart, iend); + case GFS_GUIDED: + return gomp_loop_ull_ordered_guided_start (up, start, end, incr, +- icv->run_sched_modifier, ++ icv->run_sched_chunk_size, + istart, iend); + case GFS_AUTO: + /* For now map to schedule(static), later on we could play with feedback +@@ -298,6 +298,114 @@ GOMP_loop_ull_ordered_runtime_start (boo + } + } + ++/* The *_doacross_*_start routines are similar. The only difference is that ++ this work-share construct is initialized to expect an ORDERED(N) - DOACROSS ++ section, and the worksharing loop iterates always from 0 to COUNTS[0] - 1 ++ and other COUNTS array elements tell the library number of iterations ++ in the ordered inner loops. 
*/ ++ ++static bool ++gomp_loop_ull_doacross_static_start (unsigned ncounts, gomp_ull *counts, ++ gomp_ull chunk_size, gomp_ull *istart, ++ gomp_ull *iend) ++{ ++ struct gomp_thread *thr = gomp_thread (); ++ ++ thr->ts.static_trip = 0; ++ if (gomp_work_share_start (false)) ++ { ++ gomp_loop_ull_init (thr->ts.work_share, true, 0, counts[0], 1, ++ GFS_STATIC, chunk_size); ++ gomp_doacross_ull_init (ncounts, counts, chunk_size); ++ gomp_work_share_init_done (); ++ } ++ ++ return !gomp_iter_ull_static_next (istart, iend); ++} ++ ++static bool ++gomp_loop_ull_doacross_dynamic_start (unsigned ncounts, gomp_ull *counts, ++ gomp_ull chunk_size, gomp_ull *istart, ++ gomp_ull *iend) ++{ ++ struct gomp_thread *thr = gomp_thread (); ++ bool ret; ++ ++ if (gomp_work_share_start (false)) ++ { ++ gomp_loop_ull_init (thr->ts.work_share, true, 0, counts[0], 1, ++ GFS_DYNAMIC, chunk_size); ++ gomp_doacross_ull_init (ncounts, counts, chunk_size); ++ gomp_work_share_init_done (); ++ } ++ ++#if defined HAVE_SYNC_BUILTINS && defined __LP64__ ++ ret = gomp_iter_ull_dynamic_next (istart, iend); ++#else ++ gomp_mutex_lock (&thr->ts.work_share->lock); ++ ret = gomp_iter_ull_dynamic_next_locked (istart, iend); ++ gomp_mutex_unlock (&thr->ts.work_share->lock); ++#endif ++ ++ return ret; ++} ++ ++static bool ++gomp_loop_ull_doacross_guided_start (unsigned ncounts, gomp_ull *counts, ++ gomp_ull chunk_size, gomp_ull *istart, ++ gomp_ull *iend) ++{ ++ struct gomp_thread *thr = gomp_thread (); ++ bool ret; ++ ++ if (gomp_work_share_start (false)) ++ { ++ gomp_loop_ull_init (thr->ts.work_share, true, 0, counts[0], 1, ++ GFS_GUIDED, chunk_size); ++ gomp_doacross_ull_init (ncounts, counts, chunk_size); ++ gomp_work_share_init_done (); ++ } ++ ++#if defined HAVE_SYNC_BUILTINS && defined __LP64__ ++ ret = gomp_iter_ull_guided_next (istart, iend); ++#else ++ gomp_mutex_lock (&thr->ts.work_share->lock); ++ ret = gomp_iter_ull_guided_next_locked (istart, iend); ++ gomp_mutex_unlock (&thr->ts.work_share->lock); ++#endif ++ ++ return ret; ++} ++ ++bool ++GOMP_loop_ull_doacross_runtime_start (unsigned ncounts, gomp_ull *counts, ++ gomp_ull *istart, gomp_ull *iend) ++{ ++ struct gomp_task_icv *icv = gomp_icv (false); ++ switch (icv->run_sched_var) ++ { ++ case GFS_STATIC: ++ return gomp_loop_ull_doacross_static_start (ncounts, counts, ++ icv->run_sched_chunk_size, ++ istart, iend); ++ case GFS_DYNAMIC: ++ return gomp_loop_ull_doacross_dynamic_start (ncounts, counts, ++ icv->run_sched_chunk_size, ++ istart, iend); ++ case GFS_GUIDED: ++ return gomp_loop_ull_doacross_guided_start (ncounts, counts, ++ icv->run_sched_chunk_size, ++ istart, iend); ++ case GFS_AUTO: ++ /* For now map to schedule(static), later on we could play with feedback ++ driven choice. */ ++ return gomp_loop_ull_doacross_static_start (ncounts, counts, ++ 0, istart, iend); ++ default: ++ abort (); ++ } ++} ++ + /* The *_next routines are called when the thread completes processing of + the iteration block currently assigned to it. 
If the work-share + construct is bound directly to a parallel construct, then the iteration +@@ -457,6 +565,10 @@ extern __typeof(gomp_loop_ull_dynamic_st + __attribute__((alias ("gomp_loop_ull_dynamic_start"))); + extern __typeof(gomp_loop_ull_guided_start) GOMP_loop_ull_guided_start + __attribute__((alias ("gomp_loop_ull_guided_start"))); ++extern __typeof(gomp_loop_ull_dynamic_start) GOMP_loop_ull_nonmonotonic_dynamic_start ++ __attribute__((alias ("gomp_loop_ull_dynamic_start"))); ++extern __typeof(gomp_loop_ull_guided_start) GOMP_loop_ull_nonmonotonic_guided_start ++ __attribute__((alias ("gomp_loop_ull_guided_start"))); + + extern __typeof(gomp_loop_ull_ordered_static_start) GOMP_loop_ull_ordered_static_start + __attribute__((alias ("gomp_loop_ull_ordered_static_start"))); +@@ -465,12 +577,23 @@ extern __typeof(gomp_loop_ull_ordered_dy + extern __typeof(gomp_loop_ull_ordered_guided_start) GOMP_loop_ull_ordered_guided_start + __attribute__((alias ("gomp_loop_ull_ordered_guided_start"))); + ++extern __typeof(gomp_loop_ull_doacross_static_start) GOMP_loop_ull_doacross_static_start ++ __attribute__((alias ("gomp_loop_ull_doacross_static_start"))); ++extern __typeof(gomp_loop_ull_doacross_dynamic_start) GOMP_loop_ull_doacross_dynamic_start ++ __attribute__((alias ("gomp_loop_ull_doacross_dynamic_start"))); ++extern __typeof(gomp_loop_ull_doacross_guided_start) GOMP_loop_ull_doacross_guided_start ++ __attribute__((alias ("gomp_loop_ull_doacross_guided_start"))); ++ + extern __typeof(gomp_loop_ull_static_next) GOMP_loop_ull_static_next + __attribute__((alias ("gomp_loop_ull_static_next"))); + extern __typeof(gomp_loop_ull_dynamic_next) GOMP_loop_ull_dynamic_next + __attribute__((alias ("gomp_loop_ull_dynamic_next"))); + extern __typeof(gomp_loop_ull_guided_next) GOMP_loop_ull_guided_next + __attribute__((alias ("gomp_loop_ull_guided_next"))); ++extern __typeof(gomp_loop_ull_dynamic_next) GOMP_loop_ull_nonmonotonic_dynamic_next ++ __attribute__((alias ("gomp_loop_ull_dynamic_next"))); ++extern __typeof(gomp_loop_ull_guided_next) GOMP_loop_ull_nonmonotonic_guided_next ++ __attribute__((alias ("gomp_loop_ull_guided_next"))); + + extern __typeof(gomp_loop_ull_ordered_static_next) GOMP_loop_ull_ordered_static_next + __attribute__((alias ("gomp_loop_ull_ordered_static_next"))); +@@ -507,6 +630,25 @@ GOMP_loop_ull_guided_start (bool up, gom + } + + bool ++GOMP_loop_ull_nonmonotonic_dynamic_start (bool up, gomp_ull start, ++ gomp_ull end, gomp_ull incr, ++ gomp_ull chunk_size, ++ gomp_ull *istart, gomp_ull *iend) ++{ ++ return gomp_loop_ull_dynamic_start (up, start, end, incr, chunk_size, istart, ++ iend); ++} ++ ++bool ++GOMP_loop_ull_nonmonotonic_guided_start (bool up, gomp_ull start, gomp_ull end, ++ gomp_ull incr, gomp_ull chunk_size, ++ gomp_ull *istart, gomp_ull *iend) ++{ ++ return gomp_loop_ull_guided_start (up, start, end, incr, chunk_size, istart, ++ iend); ++} ++ ++bool + GOMP_loop_ull_ordered_static_start (bool up, gomp_ull start, gomp_ull end, + gomp_ull incr, gomp_ull chunk_size, + gomp_ull *istart, gomp_ull *iend) +@@ -534,6 +676,33 @@ GOMP_loop_ull_ordered_guided_start (bool + } + + bool ++GOMP_loop_ull_doacross_static_start (unsigned ncounts, gomp_ull *counts, ++ gomp_ull chunk_size, gomp_ull *istart, ++ gomp_ull *iend) ++{ ++ return gomp_loop_ull_doacross_static_start (ncounts, counts, chunk_size, ++ istart, iend); ++} ++ ++bool ++GOMP_loop_ull_doacross_dynamic_start (unsigned ncounts, gomp_ull *counts, ++ gomp_ull chunk_size, gomp_ull *istart, ++ gomp_ull *iend) ++{ ++ return 
gomp_loop_ull_doacross_dynamic_start (ncounts, counts, chunk_size, ++ istart, iend); ++} ++ ++bool ++GOMP_loop_ull_doacross_guided_start (unsigned ncounts, gomp_ull *counts, ++ gomp_ull chunk_size, gomp_ull *istart, ++ gomp_ull *iend) ++{ ++ return gomp_loop_ull_doacross_guided_start (ncounts, counts, chunk_size, ++ istart, iend); ++} ++ ++bool + GOMP_loop_ull_static_next (gomp_ull *istart, gomp_ull *iend) + { + return gomp_loop_ull_static_next (istart, iend); +@@ -550,6 +719,18 @@ GOMP_loop_ull_guided_next (gomp_ull *ist + { + return gomp_loop_ull_guided_next (istart, iend); + } ++ ++bool ++GOMP_loop_ull_nonmonotonic_dynamic_next (gomp_ull *istart, gomp_ull *iend) ++{ ++ return gomp_loop_ull_dynamic_next (istart, iend); ++} ++ ++bool ++GOMP_loop_ull_nonmonotonic_guided_next (gomp_ull *istart, gomp_ull *iend) ++{ ++ return gomp_loop_ull_guided_next (istart, iend); ++} + + bool + GOMP_loop_ull_ordered_static_next (gomp_ull *istart, gomp_ull *iend) +--- libgomp/team.c.jj 2014-05-15 10:56:32.092524669 +0200 ++++ libgomp/team.c 2016-07-13 17:58:01.907291111 +0200 +@@ -133,6 +133,25 @@ gomp_thread_start (void *xdata) + return NULL; + } + ++static inline struct gomp_team * ++get_last_team (unsigned nthreads) ++{ ++ struct gomp_thread *thr = gomp_thread (); ++ if (thr->ts.team == NULL) ++ { ++ struct gomp_thread_pool *pool = thr->thread_pool; ++ if (pool != NULL) ++ { ++ struct gomp_team *last_team = pool->last_team; ++ if (last_team != NULL && last_team->nthreads == nthreads) ++ { ++ pool->last_team = NULL; ++ return last_team; ++ } ++ } ++ } ++ return NULL; ++} + + /* Create a new team data structure. */ + +@@ -140,18 +159,27 @@ struct gomp_team * + gomp_new_team (unsigned nthreads) + { + struct gomp_team *team; +- size_t size; + int i; + +- size = sizeof (*team) + nthreads * (sizeof (team->ordered_release[0]) +- + sizeof (team->implicit_task[0])); +- team = gomp_malloc (size); ++ team = get_last_team (nthreads); ++ if (team == NULL) ++ { ++ size_t extra = sizeof (team->ordered_release[0]) ++ + sizeof (team->implicit_task[0]); ++ team = gomp_malloc (sizeof (*team) + nthreads * extra); ++ ++#ifndef HAVE_SYNC_BUILTINS ++ gomp_mutex_init (&team->work_share_list_free_lock); ++#endif ++ gomp_barrier_init (&team->barrier, nthreads); ++ gomp_mutex_init (&team->task_lock); ++ ++ team->nthreads = nthreads; ++ } + + team->work_share_chunk = 8; + #ifdef HAVE_SYNC_BUILTINS + team->single_count = 0; +-#else +- gomp_mutex_init (&team->work_share_list_free_lock); + #endif + team->work_shares_to_free = &team->work_shares[0]; + gomp_init_work_share (&team->work_shares[0], false, nthreads); +@@ -162,15 +190,11 @@ gomp_new_team (unsigned nthreads) + team->work_shares[i].next_free = &team->work_shares[i + 1]; + team->work_shares[i].next_free = NULL; + +- team->nthreads = nthreads; +- gomp_barrier_init (&team->barrier, nthreads); +- + gomp_sem_init (&team->master_release, 0); + team->ordered_release = (void *) &team->implicit_task[nthreads]; + team->ordered_release[0] = &team->master_release; + +- gomp_mutex_init (&team->task_lock); +- team->task_queue = NULL; ++ priority_queue_init (&team->task_queue); + team->task_count = 0; + team->task_queued_count = 0; + team->task_running_count = 0; +@@ -186,8 +210,12 @@ gomp_new_team (unsigned nthreads) + static void + free_team (struct gomp_team *team) + { ++#ifndef HAVE_SYNC_BUILTINS ++ gomp_mutex_destroy (&team->work_share_list_free_lock); ++#endif + gomp_barrier_destroy (&team->barrier); + gomp_mutex_destroy (&team->task_lock); ++ priority_queue_free (&team->task_queue); 
+ free (team); + } + +@@ -258,6 +286,8 @@ gomp_free_thread (void *arg __attribute_ + free (pool); + thr->thread_pool = NULL; + } ++ if (thr->ts.level == 0 && __builtin_expect (thr->ts.team != NULL, 0)) ++ gomp_team_end (); + if (thr->task != NULL) + { + struct gomp_task *task = thr->task; +@@ -287,7 +317,7 @@ gomp_team_start (void (*fn) (void *), vo + struct gomp_thread **affinity_thr = NULL; + + thr = gomp_thread (); +- nested = thr->ts.team != NULL; ++ nested = thr->ts.level; + if (__builtin_expect (thr->thread_pool == NULL, 0)) + { + thr->thread_pool = gomp_new_thread_pool (); +@@ -894,9 +924,6 @@ gomp_team_end (void) + while (ws != NULL); + } + gomp_sem_destroy (&team->master_release); +-#ifndef HAVE_SYNC_BUILTINS +- gomp_mutex_destroy (&team->work_share_list_free_lock); +-#endif + + if (__builtin_expect (thr->ts.team != NULL, 0) + || __builtin_expect (team->nthreads == 1, 0)) +--- libgomp/target.c.jj 2014-05-15 10:56:38.313498020 +0200 ++++ libgomp/target.c 2016-07-15 16:58:29.249328861 +0200 +@@ -22,14 +22,22 @@ + see the files COPYING3 and COPYING.RUNTIME respectively. If not, see + <http://www.gnu.org/licenses/>. */ + +-/* This file handles the maintainence of threads in response to team +- creation and termination. */ ++/* This file contains the support of offloading. */ + ++#include "config.h" + #include "libgomp.h" ++#include "oacc-plugin.h" ++#include "oacc-int.h" ++#include "gomp-constants.h" + #include <limits.h> + #include <stdbool.h> + #include <stdlib.h> ++#ifdef HAVE_INTTYPES_H ++# include <inttypes.h> /* For PRIu64. */ ++#endif + #include <string.h> ++#include <assert.h> ++#include <errno.h> + + attribute_hidden int + gomp_get_num_devices (void) +@@ -37,22 +45,87 @@ gomp_get_num_devices (void) + return 0; + } + +-/* Called when encountering a target directive. If DEVICE +- is -1, it means use device-var ICV. If it is -2 (or any other value +- larger than last available hw device, use host fallback. +- FN is address of host code, OPENMP_TARGET contains value of the +- __OPENMP_TARGET__ symbol in the shared library or binary that invokes +- GOMP_target. HOSTADDRS, SIZES and KINDS are arrays +- with MAPNUM entries, with addresses of the host objects, +- sizes of the host objects (resp. for pointer kind pointer bias +- and assumed sizeof (void *) size) and kinds. */ ++/* This function should be called from every offload image while loading. ++ It gets the descriptor of the host func and var tables HOST_TABLE, TYPE of ++ the target, and TARGET_DATA needed by target plugin. */ + + void +-GOMP_target (int device, void (*fn) (void *), const void *openmp_target, +- size_t mapnum, void **hostaddrs, size_t *sizes, +- unsigned char *kinds) ++GOMP_offload_register_ver (unsigned version, const void *host_table, ++ int target_type, const void *target_data) ++{ ++ (void) version; ++ (void) host_table; ++ (void) target_type; ++ (void) target_data; ++} ++ ++void ++GOMP_offload_register (const void *host_table, int target_type, ++ const void *target_data) ++{ ++ (void) host_table; ++ (void) target_type; ++ (void) target_data; ++} ++ ++/* This function should be called from every offload image while unloading. ++ It gets the descriptor of the host func and var tables HOST_TABLE, TYPE of ++ the target, and TARGET_DATA needed by target plugin. 
*/ ++ ++void ++GOMP_offload_unregister_ver (unsigned version, const void *host_table, ++ int target_type, const void *target_data) ++{ ++ (void) version; ++ (void) host_table; ++ (void) target_type; ++ (void) target_data; ++} ++ ++void ++GOMP_offload_unregister (const void *host_table, int target_type, ++ const void *target_data) ++{ ++ (void) host_table; ++ (void) target_type; ++ (void) target_data; ++} ++ ++/* This function initializes the target device, specified by DEVICEP. DEVICEP ++ must be locked on entry, and remains locked on return. */ ++ ++attribute_hidden void ++gomp_init_device (struct gomp_device_descr *devicep) ++{ ++ devicep->state = GOMP_DEVICE_INITIALIZED; ++} ++ ++attribute_hidden void ++gomp_unload_device (struct gomp_device_descr *devicep) ++{ ++} ++ ++/* Free address mapping tables. MM must be locked on entry, and remains locked ++ on return. */ ++ ++attribute_hidden void ++gomp_free_memmap (struct splay_tree_s *mem_map) ++{ ++ while (mem_map->root) ++ { ++ struct target_mem_desc *tgt = mem_map->root->key.tgt; ++ ++ splay_tree_remove (mem_map, &mem_map->root->key); ++ free (tgt->array); ++ free (tgt); ++ } ++} ++ ++/* Host fallback for GOMP_target{,_ext} routines. */ ++ ++static void ++gomp_target_fallback (void (*fn) (void *), void **hostaddrs) + { +- /* Host fallback. */ + struct gomp_thread old_thr, *thr = gomp_thread (); + old_thr = *thr; + memset (thr, '\0', sizeof (*thr)); +@@ -66,10 +139,167 @@ GOMP_target (int device, void (*fn) (voi + *thr = old_thr; + } + ++/* Calculate alignment and size requirements of a private copy of data shared ++ as GOMP_MAP_FIRSTPRIVATE and store them to TGT_ALIGN and TGT_SIZE. */ ++ ++static inline void ++calculate_firstprivate_requirements (size_t mapnum, size_t *sizes, ++ unsigned short *kinds, size_t *tgt_align, ++ size_t *tgt_size) ++{ ++ size_t i; ++ for (i = 0; i < mapnum; i++) ++ if ((kinds[i] & 0xff) == GOMP_MAP_FIRSTPRIVATE) ++ { ++ size_t align = (size_t) 1 << (kinds[i] >> 8); ++ if (*tgt_align < align) ++ *tgt_align = align; ++ *tgt_size = (*tgt_size + align - 1) & ~(align - 1); ++ *tgt_size += sizes[i]; ++ } ++} ++ ++/* Copy data shared as GOMP_MAP_FIRSTPRIVATE to DST. */ ++ ++static inline void ++copy_firstprivate_data (char *tgt, size_t mapnum, void **hostaddrs, ++ size_t *sizes, unsigned short *kinds, size_t tgt_align, ++ size_t tgt_size) ++{ ++ uintptr_t al = (uintptr_t) tgt & (tgt_align - 1); ++ if (al) ++ tgt += tgt_align - al; ++ tgt_size = 0; ++ size_t i; ++ for (i = 0; i < mapnum; i++) ++ if ((kinds[i] & 0xff) == GOMP_MAP_FIRSTPRIVATE) ++ { ++ size_t align = (size_t) 1 << (kinds[i] >> 8); ++ tgt_size = (tgt_size + align - 1) & ~(align - 1); ++ memcpy (tgt + tgt_size, hostaddrs[i], sizes[i]); ++ hostaddrs[i] = tgt + tgt_size; ++ tgt_size = tgt_size + sizes[i]; ++ } ++} ++ ++/* Called when encountering a target directive. If DEVICE ++ is GOMP_DEVICE_ICV, it means use device-var ICV. If it is ++ GOMP_DEVICE_HOST_FALLBACK (or any value ++ larger than last available hw device), use host fallback. ++ FN is address of host code, UNUSED is part of the current ABI, but ++ we're not actually using it. HOSTADDRS, SIZES and KINDS are arrays ++ with MAPNUM entries, with addresses of the host objects, ++ sizes of the host objects (resp. for pointer kind pointer bias ++ and assumed sizeof (void *) size) and kinds. 
*/ ++ ++void ++GOMP_target (int device, void (*fn) (void *), const void *unused, ++ size_t mapnum, void **hostaddrs, size_t *sizes, ++ unsigned char *kinds) ++{ ++ return gomp_target_fallback (fn, hostaddrs); ++} ++ ++/* Like GOMP_target, but KINDS is 16-bit, UNUSED is no longer present, ++ and several arguments have been added: ++ FLAGS is a bitmask, see GOMP_TARGET_FLAG_* in gomp-constants.h. ++ DEPEND is array of dependencies, see GOMP_task for details. ++ ++ ARGS is a pointer to an array consisting of a variable number of both ++ device-independent and device-specific arguments, which can take one two ++ elements where the first specifies for which device it is intended, the type ++ and optionally also the value. If the value is not present in the first ++ one, the whole second element the actual value. The last element of the ++ array is a single NULL. Among the device independent can be for example ++ NUM_TEAMS and THREAD_LIMIT. ++ ++ NUM_TEAMS is positive if GOMP_teams will be called in the body with ++ that value, or 1 if teams construct is not present, or 0, if ++ teams construct does not have num_teams clause and so the choice is ++ implementation defined, and -1 if it can't be determined on the host ++ what value will GOMP_teams have on the device. ++ THREAD_LIMIT similarly is positive if GOMP_teams will be called in the ++ body with that value, or 0, if teams construct does not have thread_limit ++ clause or the teams construct is not present, or -1 if it can't be ++ determined on the host what value will GOMP_teams have on the device. */ ++ ++void ++GOMP_target_ext (int device, void (*fn) (void *), size_t mapnum, ++ void **hostaddrs, size_t *sizes, unsigned short *kinds, ++ unsigned int flags, void **depend, void **args) ++{ ++ size_t tgt_align = 0, tgt_size = 0; ++ bool fpc_done = false; ++ ++ if (flags & GOMP_TARGET_FLAG_NOWAIT) ++ { ++ struct gomp_thread *thr = gomp_thread (); ++ if (thr->ts.team ++ && !thr->task->final_task) ++ { ++ gomp_create_target_task (NULL, fn, mapnum, hostaddrs, ++ sizes, kinds, flags, depend, args, ++ GOMP_TARGET_TASK_BEFORE_MAP); ++ return; ++ } ++ } ++ ++ /* If there are depend clauses, but nowait is not present ++ (or we are in a final task), block the parent task until the ++ dependencies are resolved and then just continue with the rest ++ of the function as if it is a merged task. */ ++ if (depend != NULL) ++ { ++ struct gomp_thread *thr = gomp_thread (); ++ if (thr->task && thr->task->depend_hash) ++ { ++ /* If we might need to wait, copy firstprivate now. */ ++ calculate_firstprivate_requirements (mapnum, sizes, kinds, ++ &tgt_align, &tgt_size); ++ if (tgt_align) ++ { ++ char *tgt = gomp_alloca (tgt_size + tgt_align - 1); ++ copy_firstprivate_data (tgt, mapnum, hostaddrs, sizes, kinds, ++ tgt_align, tgt_size); ++ } ++ fpc_done = true; ++ gomp_task_maybe_wait_for_dependencies (depend); ++ } ++ } ++ ++ if (!fpc_done) ++ { ++ calculate_firstprivate_requirements (mapnum, sizes, kinds, ++ &tgt_align, &tgt_size); ++ if (tgt_align) ++ { ++ char *tgt = gomp_alloca (tgt_size + tgt_align - 1); ++ copy_firstprivate_data (tgt, mapnum, hostaddrs, sizes, kinds, ++ tgt_align, tgt_size); ++ } ++ } ++ gomp_target_fallback (fn, hostaddrs); ++} ++ ++/* Host fallback for GOMP_target_data{,_ext} routines. 
*/ ++ ++static void ++gomp_target_data_fallback (void) ++{ ++} ++ + void +-GOMP_target_data (int device, const void *openmp_target, size_t mapnum, ++GOMP_target_data (int device, const void *unused, size_t mapnum, + void **hostaddrs, size_t *sizes, unsigned char *kinds) + { ++ return gomp_target_data_fallback (); ++} ++ ++void ++GOMP_target_data_ext (int device, size_t mapnum, void **hostaddrs, ++ size_t *sizes, unsigned short *kinds) ++{ ++ return gomp_target_data_fallback (); + } + + void +@@ -78,12 +308,112 @@ GOMP_target_end_data (void) + } + + void +-GOMP_target_update (int device, const void *openmp_target, size_t mapnum, ++GOMP_target_update (int device, const void *unused, size_t mapnum, + void **hostaddrs, size_t *sizes, unsigned char *kinds) + { + } + + void ++GOMP_target_update_ext (int device, size_t mapnum, void **hostaddrs, ++ size_t *sizes, unsigned short *kinds, ++ unsigned int flags, void **depend) ++{ ++ /* If there are depend clauses, but nowait is not present, ++ block the parent task until the dependencies are resolved ++ and then just continue with the rest of the function as if it ++ is a merged task. Until we are able to schedule task during ++ variable mapping or unmapping, ignore nowait if depend clauses ++ are not present. */ ++ if (depend != NULL) ++ { ++ struct gomp_thread *thr = gomp_thread (); ++ if (thr->task && thr->task->depend_hash) ++ { ++ if ((flags & GOMP_TARGET_FLAG_NOWAIT) ++ && thr->ts.team ++ && !thr->task->final_task) ++ { ++ if (gomp_create_target_task (NULL, (void (*) (void *)) NULL, ++ mapnum, hostaddrs, sizes, kinds, ++ flags | GOMP_TARGET_FLAG_UPDATE, ++ depend, NULL, GOMP_TARGET_TASK_DATA)) ++ return; ++ } ++ else ++ { ++ struct gomp_team *team = thr->ts.team; ++ /* If parallel or taskgroup has been cancelled, don't start new ++ tasks. */ ++ if (team ++ && (gomp_team_barrier_cancelled (&team->barrier) ++ || (thr->task->taskgroup ++ && thr->task->taskgroup->cancelled))) ++ return; ++ ++ gomp_task_maybe_wait_for_dependencies (depend); ++ } ++ } ++ } ++} ++ ++void ++GOMP_target_enter_exit_data (int device, size_t mapnum, void **hostaddrs, ++ size_t *sizes, unsigned short *kinds, ++ unsigned int flags, void **depend) ++{ ++ /* If there are depend clauses, but nowait is not present, ++ block the parent task until the dependencies are resolved ++ and then just continue with the rest of the function as if it ++ is a merged task. Until we are able to schedule task during ++ variable mapping or unmapping, ignore nowait if depend clauses ++ are not present. */ ++ if (depend != NULL) ++ { ++ struct gomp_thread *thr = gomp_thread (); ++ if (thr->task && thr->task->depend_hash) ++ { ++ if ((flags & GOMP_TARGET_FLAG_NOWAIT) ++ && thr->ts.team ++ && !thr->task->final_task) ++ { ++ if (gomp_create_target_task (NULL, (void (*) (void *)) NULL, ++ mapnum, hostaddrs, sizes, kinds, ++ flags, depend, NULL, ++ GOMP_TARGET_TASK_DATA)) ++ return; ++ } ++ else ++ { ++ struct gomp_team *team = thr->ts.team; ++ /* If parallel or taskgroup has been cancelled, don't start new ++ tasks. 
*/ ++ if (team ++ && (gomp_team_barrier_cancelled (&team->barrier) ++ || (thr->task->taskgroup ++ && thr->task->taskgroup->cancelled))) ++ return; ++ ++ gomp_task_maybe_wait_for_dependencies (depend); ++ } ++ } ++ } ++} ++ ++bool ++gomp_target_task_fn (void *data) ++{ ++ struct gomp_target_task *ttask = (struct gomp_target_task *) data; ++ ++ if (ttask->fn != NULL) ++ { ++ ttask->state = GOMP_TARGET_TASK_FALLBACK; ++ gomp_target_fallback (ttask->fn, ttask->hostaddrs); ++ return false; ++ } ++ return false; ++} ++ ++void + GOMP_teams (unsigned int num_teams, unsigned int thread_limit) + { + if (thread_limit) +@@ -94,3 +424,153 @@ GOMP_teams (unsigned int num_teams, unsi + } + (void) num_teams; + } ++ ++void * ++omp_target_alloc (size_t size, int device_num) ++{ ++ if (device_num == GOMP_DEVICE_HOST_FALLBACK) ++ return malloc (size); ++ ++ return NULL; ++} ++ ++void ++omp_target_free (void *device_ptr, int device_num) ++{ ++ if (device_ptr == NULL) ++ return; ++ ++ if (device_num == GOMP_DEVICE_HOST_FALLBACK) ++ { ++ free (device_ptr); ++ return; ++ } ++} ++ ++int ++omp_target_is_present (void *ptr, int device_num) ++{ ++ if (ptr == NULL) ++ return 1; ++ ++ if (device_num == GOMP_DEVICE_HOST_FALLBACK) ++ return 1; ++ ++ return 0; ++} ++ ++int ++omp_target_memcpy (void *dst, void *src, size_t length, size_t dst_offset, ++ size_t src_offset, int dst_device_num, int src_device_num) ++{ ++ if (dst_device_num != GOMP_DEVICE_HOST_FALLBACK) ++ return EINVAL; ++ if (src_device_num != GOMP_DEVICE_HOST_FALLBACK) ++ return EINVAL; ++ memcpy ((char *) dst + dst_offset, (char *) src + src_offset, length); ++ return 0; ++} ++ ++#define HALF_SIZE_T (((size_t) 1) << (8 * sizeof (size_t) / 2)) ++ ++#define __builtin_mul_overflow(x, y, z) \ ++ ({ bool retval = false; \ ++ size_t xval = (x); \ ++ size_t yval = (y); \ ++ size_t zval = xval * yval; \ ++ if (__builtin_expect ((xval | yval) >= HALF_SIZE_T, 0)) \ ++ { \ ++ if (xval && zval / xval != yval) \ ++ retval = true; \ ++ } \ ++ *(z) = zval; \ ++ retval; }) ++ ++static int ++omp_target_memcpy_rect_worker (void *dst, void *src, size_t element_size, ++ int num_dims, const size_t *volume, ++ const size_t *dst_offsets, ++ const size_t *src_offsets, ++ const size_t *dst_dimensions, ++ const size_t *src_dimensions) ++{ ++ size_t dst_slice = element_size; ++ size_t src_slice = element_size; ++ size_t j, dst_off, src_off, length; ++ int i, ret; ++ ++ ++ if (num_dims == 1) ++ { ++ if (__builtin_mul_overflow (element_size, volume[0], &length) ++ || __builtin_mul_overflow (element_size, dst_offsets[0], &dst_off) ++ || __builtin_mul_overflow (element_size, src_offsets[0], &src_off)) ++ return EINVAL; ++ memcpy ((char *) dst + dst_off, (char *) src + src_off, length); ++ ret = 1; ++ return ret ? 0 : EINVAL; ++ } ++ ++ /* FIXME: it would be nice to have some plugin function to handle ++ num_dims == 2 and num_dims == 3 more efficiently. Larger ones can ++ be handled in the generic recursion below, and for host-host it ++ should be used even for any num_dims >= 2. 
*/ ++ ++ for (i = 1; i < num_dims; i++) ++ if (__builtin_mul_overflow (dst_slice, dst_dimensions[i], &dst_slice) ++ || __builtin_mul_overflow (src_slice, src_dimensions[i], &src_slice)) ++ return EINVAL; ++ if (__builtin_mul_overflow (dst_slice, dst_offsets[0], &dst_off) ++ || __builtin_mul_overflow (src_slice, src_offsets[0], &src_off)) ++ return EINVAL; ++ for (j = 0; j < volume[0]; j++) ++ { ++ ret = omp_target_memcpy_rect_worker ((char *) dst + dst_off, ++ (char *) src + src_off, ++ element_size, num_dims - 1, ++ volume + 1, dst_offsets + 1, ++ src_offsets + 1, dst_dimensions + 1, ++ src_dimensions + 1); ++ if (ret) ++ return ret; ++ dst_off += dst_slice; ++ src_off += src_slice; ++ } ++ return 0; ++} ++ ++int ++omp_target_memcpy_rect (void *dst, void *src, size_t element_size, ++ int num_dims, const size_t *volume, ++ const size_t *dst_offsets, ++ const size_t *src_offsets, ++ const size_t *dst_dimensions, ++ const size_t *src_dimensions, ++ int dst_device_num, int src_device_num) ++{ ++ if (!dst && !src) ++ return INT_MAX; ++ ++ if (dst_device_num != GOMP_DEVICE_HOST_FALLBACK) ++ return EINVAL; ++ if (src_device_num != GOMP_DEVICE_HOST_FALLBACK) ++ return EINVAL; ++ ++ int ret = omp_target_memcpy_rect_worker (dst, src, element_size, num_dims, ++ volume, dst_offsets, src_offsets, ++ dst_dimensions, src_dimensions); ++ return ret; ++} ++ ++int ++omp_target_associate_ptr (void *host_ptr, void *device_ptr, size_t size, ++ size_t device_offset, int device_num) ++{ ++ return EINVAL; ++} ++ ++int ++omp_target_disassociate_ptr (void *ptr, int device_num) ++{ ++ return EINVAL; ++} +--- libgomp/fortran.c.jj 2014-05-15 10:56:31.593531223 +0200 ++++ libgomp/fortran.c 2016-07-13 16:57:04.432535397 +0200 +@@ -67,12 +67,20 @@ ialias_redirect (omp_get_active_level) + ialias_redirect (omp_in_final) + ialias_redirect (omp_get_cancellation) + ialias_redirect (omp_get_proc_bind) ++ialias_redirect (omp_get_num_places) ++ialias_redirect (omp_get_place_num_procs) ++ialias_redirect (omp_get_place_proc_ids) ++ialias_redirect (omp_get_place_num) ++ialias_redirect (omp_get_partition_num_places) ++ialias_redirect (omp_get_partition_place_nums) + ialias_redirect (omp_set_default_device) + ialias_redirect (omp_get_default_device) + ialias_redirect (omp_get_num_devices) + ialias_redirect (omp_get_num_teams) + ialias_redirect (omp_get_team_num) + ialias_redirect (omp_is_initial_device) ++ialias_redirect (omp_get_initial_device) ++ialias_redirect (omp_get_max_task_priority) + #endif + + #ifndef LIBGOMP_GNU_SYMBOL_VERSIONING +@@ -342,35 +350,35 @@ omp_get_wtime_ (void) + } + + void +-omp_set_schedule_ (const int32_t *kind, const int32_t *modifier) ++omp_set_schedule_ (const int32_t *kind, const int32_t *chunk_size) + { +- omp_set_schedule (*kind, *modifier); ++ omp_set_schedule (*kind, *chunk_size); + } + + void +-omp_set_schedule_8_ (const int32_t *kind, const int64_t *modifier) ++omp_set_schedule_8_ (const int32_t *kind, const int64_t *chunk_size) + { +- omp_set_schedule (*kind, TO_INT (*modifier)); ++ omp_set_schedule (*kind, TO_INT (*chunk_size)); + } + + void +-omp_get_schedule_ (int32_t *kind, int32_t *modifier) ++omp_get_schedule_ (int32_t *kind, int32_t *chunk_size) + { + omp_sched_t k; +- int m; +- omp_get_schedule (&k, &m); ++ int cs; ++ omp_get_schedule (&k, &cs); + *kind = k; +- *modifier = m; ++ *chunk_size = cs; + } + + void +-omp_get_schedule_8_ (int32_t *kind, int64_t *modifier) ++omp_get_schedule_8_ (int32_t *kind, int64_t *chunk_size) + { + omp_sched_t k; +- int m; +- omp_get_schedule (&k, &m); ++ 
int cs; ++ omp_get_schedule (&k, &cs); + *kind = k; +- *modifier = m; ++ *chunk_size = cs; + } + + int32_t +@@ -451,6 +459,69 @@ omp_get_proc_bind_ (void) + return omp_get_proc_bind (); + } + ++int32_t ++omp_get_num_places_ (void) ++{ ++ return omp_get_num_places (); ++} ++ ++int32_t ++omp_get_place_num_procs_ (const int32_t *place_num) ++{ ++ return omp_get_place_num_procs (*place_num); ++} ++ ++int32_t ++omp_get_place_num_procs_8_ (const int64_t *place_num) ++{ ++ return omp_get_place_num_procs (TO_INT (*place_num)); ++} ++ ++void ++omp_get_place_proc_ids_ (const int32_t *place_num, int32_t *ids) ++{ ++ omp_get_place_proc_ids (*place_num, (int *) ids); ++} ++ ++void ++omp_get_place_proc_ids_8_ (const int64_t *place_num, int64_t *ids) ++{ ++ gomp_get_place_proc_ids_8 (TO_INT (*place_num), ids); ++} ++ ++int32_t ++omp_get_place_num_ (void) ++{ ++ return omp_get_place_num (); ++} ++ ++int32_t ++omp_get_partition_num_places_ (void) ++{ ++ return omp_get_partition_num_places (); ++} ++ ++void ++omp_get_partition_place_nums_ (int32_t *place_nums) ++{ ++ omp_get_partition_place_nums ((int *) place_nums); ++} ++ ++void ++omp_get_partition_place_nums_8_ (int64_t *place_nums) ++{ ++ if (gomp_places_list == NULL) ++ return; ++ ++ struct gomp_thread *thr = gomp_thread (); ++ if (thr->place == 0) ++ gomp_init_affinity (); ++ ++ unsigned int i; ++ for (i = 0; i < thr->ts.place_partition_len; i++) ++ *place_nums++ = (int64_t) thr->ts.place_partition_off + i; ++} ++ + void + omp_set_default_device_ (const int32_t *device_num) + { +@@ -492,3 +563,15 @@ omp_is_initial_device_ (void) + { + return omp_is_initial_device (); + } ++ ++int32_t ++omp_get_initial_device_ (void) ++{ ++ return omp_get_initial_device (); ++} ++ ++int32_t ++omp_get_max_task_priority_ (void) ++{ ++ return omp_get_max_task_priority (); ++} +--- libgomp/libgomp.map.jj 2014-05-15 10:56:31.927533549 +0200 ++++ libgomp/libgomp.map 2016-07-13 16:57:04.434535373 +0200 +@@ -134,6 +134,36 @@ OMP_4.0 { + omp_is_initial_device_; + } OMP_3.1; + ++OMP_4.5 { ++ global: ++ omp_get_max_task_priority; ++ omp_get_max_task_priority_; ++ omp_get_num_places; ++ omp_get_num_places_; ++ omp_get_place_num_procs; ++ omp_get_place_num_procs_; ++ omp_get_place_num_procs_8_; ++ omp_get_place_proc_ids; ++ omp_get_place_proc_ids_; ++ omp_get_place_proc_ids_8_; ++ omp_get_place_num; ++ omp_get_place_num_; ++ omp_get_partition_num_places; ++ omp_get_partition_num_places_; ++ omp_get_partition_place_nums; ++ omp_get_partition_place_nums_; ++ omp_get_partition_place_nums_8_; ++ omp_get_initial_device; ++ omp_get_initial_device_; ++ omp_target_alloc; ++ omp_target_free; ++ omp_target_is_present; ++ omp_target_memcpy; ++ omp_target_memcpy_rect; ++ omp_target_associate_ptr; ++ omp_target_disassociate_ptr; ++} OMP_4.0; ++ + GOMP_1.0 { + global: + GOMP_atomic_end; +@@ -227,3 +257,158 @@ GOMP_4.0 { + GOMP_target_update; + GOMP_teams; + } GOMP_3.0; ++ ++GOMP_4.0.1 { ++ global: ++ GOMP_offload_register; ++ GOMP_offload_unregister; ++} GOMP_4.0; ++ ++GOMP_4.5 { ++ global: ++ GOMP_target_ext; ++ GOMP_target_data_ext; ++ GOMP_target_update_ext; ++ GOMP_target_enter_exit_data; ++ GOMP_taskloop; ++ GOMP_taskloop_ull; ++ GOMP_offload_register_ver; ++ GOMP_offload_unregister_ver; ++ GOMP_loop_doacross_dynamic_start; ++ GOMP_loop_doacross_guided_start; ++ GOMP_loop_doacross_runtime_start; ++ GOMP_loop_doacross_static_start; ++ GOMP_doacross_post; ++ GOMP_doacross_wait; ++ GOMP_loop_ull_doacross_dynamic_start; ++ GOMP_loop_ull_doacross_guided_start; ++ 
GOMP_loop_ull_doacross_runtime_start; ++ GOMP_loop_ull_doacross_static_start; ++ GOMP_doacross_ull_post; ++ GOMP_doacross_ull_wait; ++ GOMP_loop_nonmonotonic_dynamic_next; ++ GOMP_loop_nonmonotonic_dynamic_start; ++ GOMP_loop_nonmonotonic_guided_next; ++ GOMP_loop_nonmonotonic_guided_start; ++ GOMP_loop_ull_nonmonotonic_dynamic_next; ++ GOMP_loop_ull_nonmonotonic_dynamic_start; ++ GOMP_loop_ull_nonmonotonic_guided_next; ++ GOMP_loop_ull_nonmonotonic_guided_start; ++ GOMP_parallel_loop_nonmonotonic_dynamic; ++ GOMP_parallel_loop_nonmonotonic_guided; ++} GOMP_4.0.1; ++ ++OACC_2.0 { ++ global: ++ acc_get_num_devices; ++ acc_get_num_devices_h_; ++ acc_set_device_type; ++ acc_set_device_type_h_; ++ acc_get_device_type; ++ acc_get_device_type_h_; ++ acc_set_device_num; ++ acc_set_device_num_h_; ++ acc_get_device_num; ++ acc_get_device_num_h_; ++ acc_async_test; ++ acc_async_test_h_; ++ acc_async_test_all; ++ acc_async_test_all_h_; ++ acc_wait; ++ acc_wait_h_; ++ acc_wait_async; ++ acc_wait_async_h_; ++ acc_wait_all; ++ acc_wait_all_h_; ++ acc_wait_all_async; ++ acc_wait_all_async_h_; ++ acc_init; ++ acc_init_h_; ++ acc_shutdown; ++ acc_shutdown_h_; ++ acc_on_device; ++ acc_on_device_h_; ++ acc_malloc; ++ acc_free; ++ acc_copyin; ++ acc_copyin_32_h_; ++ acc_copyin_64_h_; ++ acc_copyin_array_h_; ++ acc_present_or_copyin; ++ acc_present_or_copyin_32_h_; ++ acc_present_or_copyin_64_h_; ++ acc_present_or_copyin_array_h_; ++ acc_create; ++ acc_create_32_h_; ++ acc_create_64_h_; ++ acc_create_array_h_; ++ acc_present_or_create; ++ acc_present_or_create_32_h_; ++ acc_present_or_create_64_h_; ++ acc_present_or_create_array_h_; ++ acc_copyout; ++ acc_copyout_32_h_; ++ acc_copyout_64_h_; ++ acc_copyout_array_h_; ++ acc_delete; ++ acc_delete_32_h_; ++ acc_delete_64_h_; ++ acc_delete_array_h_; ++ acc_update_device; ++ acc_update_device_32_h_; ++ acc_update_device_64_h_; ++ acc_update_device_array_h_; ++ acc_update_self; ++ acc_update_self_32_h_; ++ acc_update_self_64_h_; ++ acc_update_self_array_h_; ++ acc_map_data; ++ acc_unmap_data; ++ acc_deviceptr; ++ acc_hostptr; ++ acc_is_present; ++ acc_is_present_32_h_; ++ acc_is_present_64_h_; ++ acc_is_present_array_h_; ++ acc_memcpy_to_device; ++ acc_memcpy_from_device; ++ acc_get_current_cuda_device; ++ acc_get_current_cuda_context; ++ acc_get_cuda_stream; ++ acc_set_cuda_stream; ++}; ++ ++GOACC_2.0 { ++ global: ++ GOACC_data_end; ++ GOACC_data_start; ++ GOACC_enter_exit_data; ++ GOACC_parallel; ++ GOACC_update; ++ GOACC_wait; ++ GOACC_get_thread_num; ++ GOACC_get_num_threads; ++}; ++ ++GOACC_2.0.1 { ++ global: ++ GOACC_declare; ++ GOACC_parallel_keyed; ++} GOACC_2.0; ++ ++GOMP_PLUGIN_1.0 { ++ global: ++ GOMP_PLUGIN_malloc; ++ GOMP_PLUGIN_malloc_cleared; ++ GOMP_PLUGIN_realloc; ++ GOMP_PLUGIN_debug; ++ GOMP_PLUGIN_error; ++ GOMP_PLUGIN_fatal; ++ GOMP_PLUGIN_async_unmap_vars; ++ GOMP_PLUGIN_acc_thread; ++}; ++ ++GOMP_PLUGIN_1.1 { ++ global: ++ GOMP_PLUGIN_target_task_completion; ++} GOMP_PLUGIN_1.0; +--- libgomp/ordered.c.jj 2013-01-21 16:00:46.137873657 +0100 ++++ libgomp/ordered.c 2016-07-13 16:57:18.918355780 +0200 +@@ -25,6 +25,9 @@ + /* This file handles the ORDERED construct. */ + + #include "libgomp.h" ++#include <stdarg.h> ++#include <string.h> ++#include "doacross.h" + + + /* This function is called when first allocating an iteration block. That +@@ -249,3 +252,533 @@ void + GOMP_ordered_end (void) + { + } ++ ++/* DOACROSS initialization. 
*/ ++ ++#define MAX_COLLAPSED_BITS (__SIZEOF_LONG__ * __CHAR_BIT__) ++ ++void ++gomp_doacross_init (unsigned ncounts, long *counts, long chunk_size) ++{ ++ struct gomp_thread *thr = gomp_thread (); ++ struct gomp_team *team = thr->ts.team; ++ struct gomp_work_share *ws = thr->ts.work_share; ++ unsigned int i, bits[MAX_COLLAPSED_BITS], num_bits = 0; ++ unsigned long ent, num_ents, elt_sz, shift_sz; ++ struct gomp_doacross_work_share *doacross; ++ ++ if (team == NULL || team->nthreads == 1) ++ return; ++ ++ for (i = 0; i < ncounts; i++) ++ { ++ /* If any count is 0, GOMP_doacross_{post,wait} can't be called. */ ++ if (counts[i] == 0) ++ return; ++ ++ if (num_bits <= MAX_COLLAPSED_BITS) ++ { ++ unsigned int this_bits; ++ if (counts[i] == 1) ++ this_bits = 1; ++ else ++ this_bits = __SIZEOF_LONG__ * __CHAR_BIT__ ++ - __builtin_clzl (counts[i] - 1); ++ if (num_bits + this_bits <= MAX_COLLAPSED_BITS) ++ { ++ bits[i] = this_bits; ++ num_bits += this_bits; ++ } ++ else ++ num_bits = MAX_COLLAPSED_BITS + 1; ++ } ++ } ++ ++ if (ws->sched == GFS_STATIC) ++ num_ents = team->nthreads; ++ else if (ws->sched == GFS_GUIDED) ++ num_ents = counts[0]; ++ else ++ num_ents = (counts[0] - 1) / chunk_size + 1; ++ if (num_bits <= MAX_COLLAPSED_BITS) ++ { ++ elt_sz = sizeof (unsigned long); ++ shift_sz = ncounts * sizeof (unsigned int); ++ } ++ else ++ { ++ elt_sz = sizeof (unsigned long) * ncounts; ++ shift_sz = 0; ++ } ++ elt_sz = (elt_sz + 63) & ~63UL; ++ ++ doacross = gomp_malloc (sizeof (*doacross) + 63 + num_ents * elt_sz ++ + shift_sz); ++ doacross->chunk_size = chunk_size; ++ doacross->elt_sz = elt_sz; ++ doacross->ncounts = ncounts; ++ doacross->flattened = false; ++ doacross->array = (unsigned char *) ++ ((((uintptr_t) (doacross + 1)) + 63 + shift_sz) ++ & ~(uintptr_t) 63); ++ if (num_bits <= MAX_COLLAPSED_BITS) ++ { ++ unsigned int shift_count = 0; ++ doacross->flattened = true; ++ for (i = ncounts; i > 0; i--) ++ { ++ doacross->shift_counts[i - 1] = shift_count; ++ shift_count += bits[i - 1]; ++ } ++ for (ent = 0; ent < num_ents; ent++) ++ *(unsigned long *) (doacross->array + ent * elt_sz) = 0; ++ } ++ else ++ for (ent = 0; ent < num_ents; ent++) ++ memset (doacross->array + ent * elt_sz, '\0', ++ sizeof (unsigned long) * ncounts); ++ if (ws->sched == GFS_STATIC && chunk_size == 0) ++ { ++ unsigned long q = counts[0] / num_ents; ++ unsigned long t = counts[0] % num_ents; ++ doacross->boundary = t * (q + 1); ++ doacross->q = q; ++ doacross->t = t; ++ } ++ ws->doacross = doacross; ++} ++ ++/* DOACROSS POST operation. 
*/ ++ ++void ++GOMP_doacross_post (long *counts) ++{ ++ struct gomp_thread *thr = gomp_thread (); ++ struct gomp_work_share *ws = thr->ts.work_share; ++ struct gomp_doacross_work_share *doacross = ws->doacross; ++ unsigned long ent; ++ unsigned int i; ++ ++ if (__builtin_expect (doacross == NULL, 0)) ++ { ++ __sync_synchronize (); ++ return; ++ } ++ ++ if (__builtin_expect (ws->sched == GFS_STATIC, 1)) ++ ent = thr->ts.team_id; ++ else if (ws->sched == GFS_GUIDED) ++ ent = counts[0]; ++ else ++ ent = counts[0] / doacross->chunk_size; ++ unsigned long *array = (unsigned long *) (doacross->array ++ + ent * doacross->elt_sz); ++ ++ if (__builtin_expect (doacross->flattened, 1)) ++ { ++ unsigned long flattened ++ = (unsigned long) counts[0] << doacross->shift_counts[0]; ++ ++ for (i = 1; i < doacross->ncounts; i++) ++ flattened |= (unsigned long) counts[i] ++ << doacross->shift_counts[i]; ++ flattened++; ++ if (flattened == __atomic_load_n (array, MEMMODEL_ACQUIRE)) ++ __atomic_thread_fence (MEMMODEL_RELEASE); ++ else ++ __atomic_store_n (array, flattened, MEMMODEL_RELEASE); ++ return; ++ } ++ ++ __atomic_thread_fence (MEMMODEL_ACQUIRE); ++ for (i = doacross->ncounts; i-- > 0; ) ++ { ++ if (counts[i] + 1UL != __atomic_load_n (&array[i], MEMMODEL_RELAXED)) ++ __atomic_store_n (&array[i], counts[i] + 1UL, MEMMODEL_RELEASE); ++ } ++} ++ ++/* DOACROSS WAIT operation. */ ++ ++void ++GOMP_doacross_wait (long first, ...) ++{ ++ struct gomp_thread *thr = gomp_thread (); ++ struct gomp_work_share *ws = thr->ts.work_share; ++ struct gomp_doacross_work_share *doacross = ws->doacross; ++ va_list ap; ++ unsigned long ent; ++ unsigned int i; ++ ++ if (__builtin_expect (doacross == NULL, 0)) ++ { ++ __sync_synchronize (); ++ return; ++ } ++ ++ if (__builtin_expect (ws->sched == GFS_STATIC, 1)) ++ { ++ if (ws->chunk_size == 0) ++ { ++ if (first < doacross->boundary) ++ ent = first / (doacross->q + 1); ++ else ++ ent = (first - doacross->boundary) / doacross->q ++ + doacross->t; ++ } ++ else ++ ent = first / ws->chunk_size % thr->ts.team->nthreads; ++ } ++ else if (ws->sched == GFS_GUIDED) ++ ent = first; ++ else ++ ent = first / doacross->chunk_size; ++ unsigned long *array = (unsigned long *) (doacross->array ++ + ent * doacross->elt_sz); ++ ++ if (__builtin_expect (doacross->flattened, 1)) ++ { ++ unsigned long flattened ++ = (unsigned long) first << doacross->shift_counts[0]; ++ unsigned long cur; ++ ++ va_start (ap, first); ++ for (i = 1; i < doacross->ncounts; i++) ++ flattened |= (unsigned long) va_arg (ap, long) ++ << doacross->shift_counts[i]; ++ cur = __atomic_load_n (array, MEMMODEL_ACQUIRE); ++ if (flattened < cur) ++ { ++ __atomic_thread_fence (MEMMODEL_RELEASE); ++ va_end (ap); ++ return; ++ } ++ doacross_spin (array, flattened, cur); ++ __atomic_thread_fence (MEMMODEL_RELEASE); ++ va_end (ap); ++ return; ++ } ++ ++ do ++ { ++ va_start (ap, first); ++ for (i = 0; i < doacross->ncounts; i++) ++ { ++ unsigned long thisv ++ = (unsigned long) (i ? 
va_arg (ap, long) : first) + 1; ++ unsigned long cur = __atomic_load_n (&array[i], MEMMODEL_RELAXED); ++ if (thisv < cur) ++ { ++ i = doacross->ncounts; ++ break; ++ } ++ if (thisv > cur) ++ break; ++ } ++ va_end (ap); ++ if (i == doacross->ncounts) ++ break; ++ cpu_relax (); ++ } ++ while (1); ++ __sync_synchronize (); ++} ++ ++typedef unsigned long long gomp_ull; ++ ++void ++gomp_doacross_ull_init (unsigned ncounts, gomp_ull *counts, gomp_ull chunk_size) ++{ ++ struct gomp_thread *thr = gomp_thread (); ++ struct gomp_team *team = thr->ts.team; ++ struct gomp_work_share *ws = thr->ts.work_share; ++ unsigned int i, bits[MAX_COLLAPSED_BITS], num_bits = 0; ++ unsigned long ent, num_ents, elt_sz, shift_sz; ++ struct gomp_doacross_work_share *doacross; ++ ++ if (team == NULL || team->nthreads == 1) ++ return; ++ ++ for (i = 0; i < ncounts; i++) ++ { ++ /* If any count is 0, GOMP_doacross_{post,wait} can't be called. */ ++ if (counts[i] == 0) ++ return; ++ ++ if (num_bits <= MAX_COLLAPSED_BITS) ++ { ++ unsigned int this_bits; ++ if (counts[i] == 1) ++ this_bits = 1; ++ else ++ this_bits = __SIZEOF_LONG_LONG__ * __CHAR_BIT__ ++ - __builtin_clzll (counts[i] - 1); ++ if (num_bits + this_bits <= MAX_COLLAPSED_BITS) ++ { ++ bits[i] = this_bits; ++ num_bits += this_bits; ++ } ++ else ++ num_bits = MAX_COLLAPSED_BITS + 1; ++ } ++ } ++ ++ if (ws->sched == GFS_STATIC) ++ num_ents = team->nthreads; ++ else if (ws->sched == GFS_GUIDED) ++ num_ents = counts[0]; ++ else ++ num_ents = (counts[0] - 1) / chunk_size + 1; ++ if (num_bits <= MAX_COLLAPSED_BITS) ++ { ++ elt_sz = sizeof (unsigned long); ++ shift_sz = ncounts * sizeof (unsigned int); ++ } ++ else ++ { ++ if (sizeof (gomp_ull) == sizeof (unsigned long)) ++ elt_sz = sizeof (gomp_ull) * ncounts; ++ else if (sizeof (gomp_ull) == 2 * sizeof (unsigned long)) ++ elt_sz = sizeof (unsigned long) * 2 * ncounts; ++ else ++ abort (); ++ shift_sz = 0; ++ } ++ elt_sz = (elt_sz + 63) & ~63UL; ++ ++ doacross = gomp_malloc (sizeof (*doacross) + 63 + num_ents * elt_sz ++ + shift_sz); ++ doacross->chunk_size_ull = chunk_size; ++ doacross->elt_sz = elt_sz; ++ doacross->ncounts = ncounts; ++ doacross->flattened = false; ++ doacross->boundary = 0; ++ doacross->array = (unsigned char *) ++ ((((uintptr_t) (doacross + 1)) + 63 + shift_sz) ++ & ~(uintptr_t) 63); ++ if (num_bits <= MAX_COLLAPSED_BITS) ++ { ++ unsigned int shift_count = 0; ++ doacross->flattened = true; ++ for (i = ncounts; i > 0; i--) ++ { ++ doacross->shift_counts[i - 1] = shift_count; ++ shift_count += bits[i - 1]; ++ } ++ for (ent = 0; ent < num_ents; ent++) ++ *(unsigned long *) (doacross->array + ent * elt_sz) = 0; ++ } ++ else ++ for (ent = 0; ent < num_ents; ent++) ++ memset (doacross->array + ent * elt_sz, '\0', ++ sizeof (unsigned long) * ncounts); ++ if (ws->sched == GFS_STATIC && chunk_size == 0) ++ { ++ gomp_ull q = counts[0] / num_ents; ++ gomp_ull t = counts[0] % num_ents; ++ doacross->boundary_ull = t * (q + 1); ++ doacross->q_ull = q; ++ doacross->t = t; ++ } ++ ws->doacross = doacross; ++} ++ ++/* DOACROSS POST operation. 
*/ ++ ++void ++GOMP_doacross_ull_post (gomp_ull *counts) ++{ ++ struct gomp_thread *thr = gomp_thread (); ++ struct gomp_work_share *ws = thr->ts.work_share; ++ struct gomp_doacross_work_share *doacross = ws->doacross; ++ unsigned long ent; ++ unsigned int i; ++ ++ if (__builtin_expect (doacross == NULL, 0)) ++ { ++ __sync_synchronize (); ++ return; ++ } ++ ++ if (__builtin_expect (ws->sched == GFS_STATIC, 1)) ++ ent = thr->ts.team_id; ++ else if (ws->sched == GFS_GUIDED) ++ ent = counts[0]; ++ else ++ ent = counts[0] / doacross->chunk_size_ull; ++ ++ if (__builtin_expect (doacross->flattened, 1)) ++ { ++ unsigned long *array = (unsigned long *) (doacross->array ++ + ent * doacross->elt_sz); ++ gomp_ull flattened ++ = counts[0] << doacross->shift_counts[0]; ++ ++ for (i = 1; i < doacross->ncounts; i++) ++ flattened |= counts[i] << doacross->shift_counts[i]; ++ flattened++; ++ if (flattened == __atomic_load_n (array, MEMMODEL_ACQUIRE)) ++ __atomic_thread_fence (MEMMODEL_RELEASE); ++ else ++ __atomic_store_n (array, flattened, MEMMODEL_RELEASE); ++ return; ++ } ++ ++ __atomic_thread_fence (MEMMODEL_ACQUIRE); ++ if (sizeof (gomp_ull) == sizeof (unsigned long)) ++ { ++ gomp_ull *array = (gomp_ull *) (doacross->array ++ + ent * doacross->elt_sz); ++ ++ for (i = doacross->ncounts; i-- > 0; ) ++ { ++ if (counts[i] + 1UL != __atomic_load_n (&array[i], MEMMODEL_RELAXED)) ++ __atomic_store_n (&array[i], counts[i] + 1UL, MEMMODEL_RELEASE); ++ } ++ } ++ else ++ { ++ unsigned long *array = (unsigned long *) (doacross->array ++ + ent * doacross->elt_sz); ++ ++ for (i = doacross->ncounts; i-- > 0; ) ++ { ++ gomp_ull cull = counts[i] + 1UL; ++ unsigned long c = (unsigned long) cull; ++ if (c != __atomic_load_n (&array[2 * i + 1], MEMMODEL_RELAXED)) ++ __atomic_store_n (&array[2 * i + 1], c, MEMMODEL_RELEASE); ++ c = cull >> (__SIZEOF_LONG_LONG__ * __CHAR_BIT__ / 2); ++ if (c != __atomic_load_n (&array[2 * i], MEMMODEL_RELAXED)) ++ __atomic_store_n (&array[2 * i], c, MEMMODEL_RELEASE); ++ } ++ } ++} ++ ++/* DOACROSS WAIT operation. */ ++ ++void ++GOMP_doacross_ull_wait (gomp_ull first, ...) 
++{ ++ struct gomp_thread *thr = gomp_thread (); ++ struct gomp_work_share *ws = thr->ts.work_share; ++ struct gomp_doacross_work_share *doacross = ws->doacross; ++ va_list ap; ++ unsigned long ent; ++ unsigned int i; ++ ++ if (__builtin_expect (doacross == NULL, 0)) ++ { ++ __sync_synchronize (); ++ return; ++ } ++ ++ if (__builtin_expect (ws->sched == GFS_STATIC, 1)) ++ { ++ if (ws->chunk_size_ull == 0) ++ { ++ if (first < doacross->boundary_ull) ++ ent = first / (doacross->q_ull + 1); ++ else ++ ent = (first - doacross->boundary_ull) / doacross->q_ull ++ + doacross->t; ++ } ++ else ++ ent = first / ws->chunk_size_ull % thr->ts.team->nthreads; ++ } ++ else if (ws->sched == GFS_GUIDED) ++ ent = first; ++ else ++ ent = first / doacross->chunk_size_ull; ++ ++ if (__builtin_expect (doacross->flattened, 1)) ++ { ++ unsigned long *array = (unsigned long *) (doacross->array ++ + ent * doacross->elt_sz); ++ gomp_ull flattened = first << doacross->shift_counts[0]; ++ unsigned long cur; ++ ++ va_start (ap, first); ++ for (i = 1; i < doacross->ncounts; i++) ++ flattened |= va_arg (ap, gomp_ull) ++ << doacross->shift_counts[i]; ++ cur = __atomic_load_n (array, MEMMODEL_ACQUIRE); ++ if (flattened < cur) ++ { ++ __atomic_thread_fence (MEMMODEL_RELEASE); ++ va_end (ap); ++ return; ++ } ++ doacross_spin (array, flattened, cur); ++ __atomic_thread_fence (MEMMODEL_RELEASE); ++ va_end (ap); ++ return; ++ } ++ ++ if (sizeof (gomp_ull) == sizeof (unsigned long)) ++ { ++ gomp_ull *array = (gomp_ull *) (doacross->array ++ + ent * doacross->elt_sz); ++ do ++ { ++ va_start (ap, first); ++ for (i = 0; i < doacross->ncounts; i++) ++ { ++ gomp_ull thisv ++ = (i ? va_arg (ap, gomp_ull) : first) + 1; ++ gomp_ull cur = __atomic_load_n (&array[i], MEMMODEL_RELAXED); ++ if (thisv < cur) ++ { ++ i = doacross->ncounts; ++ break; ++ } ++ if (thisv > cur) ++ break; ++ } ++ va_end (ap); ++ if (i == doacross->ncounts) ++ break; ++ cpu_relax (); ++ } ++ while (1); ++ } ++ else ++ { ++ unsigned long *array = (unsigned long *) (doacross->array ++ + ent * doacross->elt_sz); ++ do ++ { ++ va_start (ap, first); ++ for (i = 0; i < doacross->ncounts; i++) ++ { ++ gomp_ull thisv ++ = (i ? va_arg (ap, gomp_ull) : first) + 1; ++ unsigned long t ++ = thisv >> (__SIZEOF_LONG_LONG__ * __CHAR_BIT__ / 2); ++ unsigned long cur ++ = __atomic_load_n (&array[2 * i], MEMMODEL_RELAXED); ++ if (t < cur) ++ { ++ i = doacross->ncounts; ++ break; ++ } ++ if (t > cur) ++ break; ++ t = thisv; ++ cur = __atomic_load_n (&array[2 * i + 1], MEMMODEL_RELAXED); ++ if (t < cur) ++ { ++ i = doacross->ncounts; ++ break; ++ } ++ if (t > cur) ++ break; ++ } ++ va_end (ap); ++ if (i == doacross->ncounts) ++ break; ++ cpu_relax (); ++ } ++ while (1); ++ } ++ __sync_synchronize (); ++} +--- libgomp/loop.c.jj 2014-05-15 10:56:36.487505570 +0200 ++++ libgomp/loop.c 2016-07-13 16:57:13.488423109 +0200 +@@ -110,6 +110,11 @@ gomp_loop_static_start (long start, long + return !gomp_iter_static_next (istart, iend); + } + ++/* The current dynamic implementation is always monotonic. The ++ entrypoints without nonmonotonic in them have to be always monotonic, ++ but the nonmonotonic ones could be changed to use work-stealing for ++ improved scalability. 
*/ ++ + static bool + gomp_loop_dynamic_start (long start, long end, long incr, long chunk_size, + long *istart, long *iend) +@@ -135,6 +140,9 @@ gomp_loop_dynamic_start (long start, lon + return ret; + } + ++/* Similarly as for dynamic, though the question is how can the chunk sizes ++ be decreased without a central locking or atomics. */ ++ + static bool + gomp_loop_guided_start (long start, long end, long incr, long chunk_size, + long *istart, long *iend) +@@ -168,13 +176,16 @@ GOMP_loop_runtime_start (long start, lon + switch (icv->run_sched_var) + { + case GFS_STATIC: +- return gomp_loop_static_start (start, end, incr, icv->run_sched_modifier, ++ return gomp_loop_static_start (start, end, incr, ++ icv->run_sched_chunk_size, + istart, iend); + case GFS_DYNAMIC: +- return gomp_loop_dynamic_start (start, end, incr, icv->run_sched_modifier, ++ return gomp_loop_dynamic_start (start, end, incr, ++ icv->run_sched_chunk_size, + istart, iend); + case GFS_GUIDED: +- return gomp_loop_guided_start (start, end, incr, icv->run_sched_modifier, ++ return gomp_loop_guided_start (start, end, incr, ++ icv->run_sched_chunk_size, + istart, iend); + case GFS_AUTO: + /* For now map to schedule(static), later on we could play with feedback +@@ -265,15 +276,15 @@ GOMP_loop_ordered_runtime_start (long st + { + case GFS_STATIC: + return gomp_loop_ordered_static_start (start, end, incr, +- icv->run_sched_modifier, ++ icv->run_sched_chunk_size, + istart, iend); + case GFS_DYNAMIC: + return gomp_loop_ordered_dynamic_start (start, end, incr, +- icv->run_sched_modifier, ++ icv->run_sched_chunk_size, + istart, iend); + case GFS_GUIDED: + return gomp_loop_ordered_guided_start (start, end, incr, +- icv->run_sched_modifier, ++ icv->run_sched_chunk_size, + istart, iend); + case GFS_AUTO: + /* For now map to schedule(static), later on we could play with feedback +@@ -285,6 +296,111 @@ GOMP_loop_ordered_runtime_start (long st + } + } + ++/* The *_doacross_*_start routines are similar. The only difference is that ++ this work-share construct is initialized to expect an ORDERED(N) - DOACROSS ++ section, and the worksharing loop iterates always from 0 to COUNTS[0] - 1 ++ and other COUNTS array elements tell the library number of iterations ++ in the ordered inner loops. 
*/ ++ ++static bool ++gomp_loop_doacross_static_start (unsigned ncounts, long *counts, ++ long chunk_size, long *istart, long *iend) ++{ ++ struct gomp_thread *thr = gomp_thread (); ++ ++ thr->ts.static_trip = 0; ++ if (gomp_work_share_start (false)) ++ { ++ gomp_loop_init (thr->ts.work_share, 0, counts[0], 1, ++ GFS_STATIC, chunk_size); ++ gomp_doacross_init (ncounts, counts, chunk_size); ++ gomp_work_share_init_done (); ++ } ++ ++ return !gomp_iter_static_next (istart, iend); ++} ++ ++static bool ++gomp_loop_doacross_dynamic_start (unsigned ncounts, long *counts, ++ long chunk_size, long *istart, long *iend) ++{ ++ struct gomp_thread *thr = gomp_thread (); ++ bool ret; ++ ++ if (gomp_work_share_start (false)) ++ { ++ gomp_loop_init (thr->ts.work_share, 0, counts[0], 1, ++ GFS_DYNAMIC, chunk_size); ++ gomp_doacross_init (ncounts, counts, chunk_size); ++ gomp_work_share_init_done (); ++ } ++ ++#ifdef HAVE_SYNC_BUILTINS ++ ret = gomp_iter_dynamic_next (istart, iend); ++#else ++ gomp_mutex_lock (&thr->ts.work_share->lock); ++ ret = gomp_iter_dynamic_next_locked (istart, iend); ++ gomp_mutex_unlock (&thr->ts.work_share->lock); ++#endif ++ ++ return ret; ++} ++ ++static bool ++gomp_loop_doacross_guided_start (unsigned ncounts, long *counts, ++ long chunk_size, long *istart, long *iend) ++{ ++ struct gomp_thread *thr = gomp_thread (); ++ bool ret; ++ ++ if (gomp_work_share_start (false)) ++ { ++ gomp_loop_init (thr->ts.work_share, 0, counts[0], 1, ++ GFS_GUIDED, chunk_size); ++ gomp_doacross_init (ncounts, counts, chunk_size); ++ gomp_work_share_init_done (); ++ } ++ ++#ifdef HAVE_SYNC_BUILTINS ++ ret = gomp_iter_guided_next (istart, iend); ++#else ++ gomp_mutex_lock (&thr->ts.work_share->lock); ++ ret = gomp_iter_guided_next_locked (istart, iend); ++ gomp_mutex_unlock (&thr->ts.work_share->lock); ++#endif ++ ++ return ret; ++} ++ ++bool ++GOMP_loop_doacross_runtime_start (unsigned ncounts, long *counts, ++ long *istart, long *iend) ++{ ++ struct gomp_task_icv *icv = gomp_icv (false); ++ switch (icv->run_sched_var) ++ { ++ case GFS_STATIC: ++ return gomp_loop_doacross_static_start (ncounts, counts, ++ icv->run_sched_chunk_size, ++ istart, iend); ++ case GFS_DYNAMIC: ++ return gomp_loop_doacross_dynamic_start (ncounts, counts, ++ icv->run_sched_chunk_size, ++ istart, iend); ++ case GFS_GUIDED: ++ return gomp_loop_doacross_guided_start (ncounts, counts, ++ icv->run_sched_chunk_size, ++ istart, iend); ++ case GFS_AUTO: ++ /* For now map to schedule(static), later on we could play with feedback ++ driven choice. */ ++ return gomp_loop_doacross_static_start (ncounts, counts, ++ 0, istart, iend); ++ default: ++ abort (); ++ } ++} ++ + /* The *_next routines are called when the thread completes processing of + the iteration block currently assigned to it. 
If the work-share + construct is bound directly to a parallel construct, then the iteration +@@ -483,7 +599,7 @@ GOMP_parallel_loop_runtime_start (void ( + { + struct gomp_task_icv *icv = gomp_icv (false); + gomp_parallel_loop_start (fn, data, num_threads, start, end, incr, +- icv->run_sched_var, icv->run_sched_modifier, 0); ++ icv->run_sched_var, icv->run_sched_chunk_size, 0); + } + + ialias_redirect (GOMP_parallel_end) +@@ -521,6 +637,37 @@ GOMP_parallel_loop_guided (void (*fn) (v + GOMP_parallel_end (); + } + ++#ifdef HAVE_ATTRIBUTE_ALIAS ++extern __typeof(GOMP_parallel_loop_dynamic) GOMP_parallel_loop_nonmonotonic_dynamic ++ __attribute__((alias ("GOMP_parallel_loop_dynamic"))); ++extern __typeof(GOMP_parallel_loop_guided) GOMP_parallel_loop_nonmonotonic_guided ++ __attribute__((alias ("GOMP_parallel_loop_guided"))); ++#else ++void ++GOMP_parallel_loop_nonmonotonic_dynamic (void (*fn) (void *), void *data, ++ unsigned num_threads, long start, ++ long end, long incr, long chunk_size, ++ unsigned flags) ++{ ++ gomp_parallel_loop_start (fn, data, num_threads, start, end, incr, ++ GFS_DYNAMIC, chunk_size, flags); ++ fn (data); ++ GOMP_parallel_end (); ++} ++ ++void ++GOMP_parallel_loop_nonmonotonic_guided (void (*fn) (void *), void *data, ++ unsigned num_threads, long start, ++ long end, long incr, long chunk_size, ++ unsigned flags) ++{ ++ gomp_parallel_loop_start (fn, data, num_threads, start, end, incr, ++ GFS_GUIDED, chunk_size, flags); ++ fn (data); ++ GOMP_parallel_end (); ++} ++#endif ++ + void + GOMP_parallel_loop_runtime (void (*fn) (void *), void *data, + unsigned num_threads, long start, long end, +@@ -528,7 +675,7 @@ GOMP_parallel_loop_runtime (void (*fn) ( + { + struct gomp_task_icv *icv = gomp_icv (false); + gomp_parallel_loop_start (fn, data, num_threads, start, end, incr, +- icv->run_sched_var, icv->run_sched_modifier, ++ icv->run_sched_var, icv->run_sched_chunk_size, + flags); + fn (data); + GOMP_parallel_end (); +@@ -569,6 +716,10 @@ extern __typeof(gomp_loop_dynamic_start) + __attribute__((alias ("gomp_loop_dynamic_start"))); + extern __typeof(gomp_loop_guided_start) GOMP_loop_guided_start + __attribute__((alias ("gomp_loop_guided_start"))); ++extern __typeof(gomp_loop_dynamic_start) GOMP_loop_nonmonotonic_dynamic_start ++ __attribute__((alias ("gomp_loop_dynamic_start"))); ++extern __typeof(gomp_loop_guided_start) GOMP_loop_nonmonotonic_guided_start ++ __attribute__((alias ("gomp_loop_guided_start"))); + + extern __typeof(gomp_loop_ordered_static_start) GOMP_loop_ordered_static_start + __attribute__((alias ("gomp_loop_ordered_static_start"))); +@@ -577,12 +728,23 @@ extern __typeof(gomp_loop_ordered_dynami + extern __typeof(gomp_loop_ordered_guided_start) GOMP_loop_ordered_guided_start + __attribute__((alias ("gomp_loop_ordered_guided_start"))); + ++extern __typeof(gomp_loop_doacross_static_start) GOMP_loop_doacross_static_start ++ __attribute__((alias ("gomp_loop_doacross_static_start"))); ++extern __typeof(gomp_loop_doacross_dynamic_start) GOMP_loop_doacross_dynamic_start ++ __attribute__((alias ("gomp_loop_doacross_dynamic_start"))); ++extern __typeof(gomp_loop_doacross_guided_start) GOMP_loop_doacross_guided_start ++ __attribute__((alias ("gomp_loop_doacross_guided_start"))); ++ + extern __typeof(gomp_loop_static_next) GOMP_loop_static_next + __attribute__((alias ("gomp_loop_static_next"))); + extern __typeof(gomp_loop_dynamic_next) GOMP_loop_dynamic_next + __attribute__((alias ("gomp_loop_dynamic_next"))); + extern __typeof(gomp_loop_guided_next) 
GOMP_loop_guided_next + __attribute__((alias ("gomp_loop_guided_next"))); ++extern __typeof(gomp_loop_dynamic_next) GOMP_loop_nonmonotonic_dynamic_next ++ __attribute__((alias ("gomp_loop_dynamic_next"))); ++extern __typeof(gomp_loop_guided_next) GOMP_loop_nonmonotonic_guided_next ++ __attribute__((alias ("gomp_loop_guided_next"))); + + extern __typeof(gomp_loop_ordered_static_next) GOMP_loop_ordered_static_next + __attribute__((alias ("gomp_loop_ordered_static_next"))); +@@ -613,6 +775,21 @@ GOMP_loop_guided_start (long start, long + } + + bool ++GOMP_loop_nonmonotonic_dynamic_start (long start, long end, long incr, ++ long chunk_size, long *istart, ++ long *iend) ++{ ++ return gomp_loop_dynamic_start (start, end, incr, chunk_size, istart, iend); ++} ++ ++bool ++GOMP_loop_nonmonotonic_guided_start (long start, long end, long incr, ++ long chunk_size, long *istart, long *iend) ++{ ++ return gomp_loop_guided_start (start, end, incr, chunk_size, istart, iend); ++} ++ ++bool + GOMP_loop_ordered_static_start (long start, long end, long incr, + long chunk_size, long *istart, long *iend) + { +@@ -637,6 +814,30 @@ GOMP_loop_ordered_guided_start (long sta + } + + bool ++GOMP_loop_doacross_static_start (unsigned ncounts, long *counts, ++ long chunk_size, long *istart, long *iend) ++{ ++ return gomp_loop_doacross_static_start (ncounts, counts, chunk_size, ++ istart, iend); ++} ++ ++bool ++GOMP_loop_doacross_dynamic_start (unsigned ncounts, long *counts, ++ long chunk_size, long *istart, long *iend) ++{ ++ return gomp_loop_doacross_dynamic_start (ncounts, counts, chunk_size, ++ istart, iend); ++} ++ ++bool ++GOMP_loop_doacross_guided_start (unsigned ncounts, long *counts, ++ long chunk_size, long *istart, long *iend) ++{ ++ return gomp_loop_doacross_guided_start (ncounts, counts, chunk_size, ++ istart, iend); ++} ++ ++bool + GOMP_loop_static_next (long *istart, long *iend) + { + return gomp_loop_static_next (istart, iend); +@@ -653,6 +854,18 @@ GOMP_loop_guided_next (long *istart, lon + { + return gomp_loop_guided_next (istart, iend); + } ++ ++bool ++GOMP_loop_nonmonotonic_dynamic_next (long *istart, long *iend) ++{ ++ return gomp_loop_dynamic_next (istart, iend); ++} ++ ++bool ++GOMP_loop_nonmonotonic_guided_next (long *istart, long *iend) ++{ ++ return gomp_loop_guided_next (istart, iend); ++} + + bool + GOMP_loop_ordered_static_next (long *istart, long *iend) +--- libgomp/error.c.jj 2013-01-21 16:00:31.834953566 +0100 ++++ libgomp/error.c 2016-07-13 16:57:04.437535335 +0200 +@@ -35,7 +35,26 @@ + #include <stdlib.h> + + +-static void ++#undef gomp_vdebug ++void ++gomp_vdebug (int kind __attribute__ ((unused)), const char *msg, va_list list) ++{ ++ if (gomp_debug_var) ++ vfprintf (stderr, msg, list); ++} ++ ++#undef gomp_debug ++void ++gomp_debug (int kind, const char *msg, ...) ++{ ++ va_list list; ++ ++ va_start (list, msg); ++ gomp_vdebug (kind, msg, list); ++ va_end (list); ++} ++ ++void + gomp_verror (const char *fmt, va_list list) + { + fputs ("\nlibgomp: ", stderr); +@@ -54,13 +73,18 @@ gomp_error (const char *fmt, ...) + } + + void ++gomp_vfatal (const char *fmt, va_list list) ++{ ++ gomp_verror (fmt, list); ++ exit (EXIT_FAILURE); ++} ++ ++void + gomp_fatal (const char *fmt, ...) 
+ { + va_list list; + + va_start (list, fmt); +- gomp_verror (fmt, list); ++ gomp_vfatal (fmt, list); + va_end (list); +- +- exit (EXIT_FAILURE); + } +--- libgomp/Makefile.am.jj 2014-05-15 11:12:10.000000000 +0200 ++++ libgomp/Makefile.am 2016-07-14 16:10:51.968202878 +0200 +@@ -60,7 +60,13 @@ libgomp_la_LINK = $(LINK) $(libgomp_la_L + libgomp_la_SOURCES = alloc.c barrier.c critical.c env.c error.c iter.c \ + iter_ull.c loop.c loop_ull.c ordered.c parallel.c sections.c single.c \ + task.c team.c work.c lock.c mutex.c proc.c sem.c bar.c ptrlock.c \ +- time.c fortran.c affinity.c target.c ++ time.c fortran.c affinity.c target.c splay-tree.c libgomp-plugin.c \ ++ oacc-parallel.c oacc-host.c oacc-init.c oacc-mem.c oacc-async.c \ ++ oacc-plugin.c oacc-cuda.c priority_queue.c ++ ++if USE_FORTRAN ++libgomp_la_SOURCES += openacc.f90 ++endif + + nodist_noinst_HEADERS = libgomp_f.h + nodist_libsubinclude_HEADERS = omp.h +--- libgomp/Makefile.in.jj 2014-05-15 11:12:10.000000000 +0200 ++++ libgomp/Makefile.in 2016-07-14 16:11:10.981954087 +0200 +@@ -36,6 +36,7 @@ POST_UNINSTALL = : + build_triplet = @build@ + host_triplet = @host@ + target_triplet = @target@ ++@USE_FORTRAN_TRUE@am__append_1 = openacc.f90 + subdir = . + DIST_COMMON = ChangeLog $(srcdir)/Makefile.in $(srcdir)/Makefile.am \ + $(top_srcdir)/configure $(am__configure_deps) \ +@@ -92,11 +93,15 @@ am__installdirs = "$(DESTDIR)$(toolexecl + "$(DESTDIR)$(toolexeclibdir)" + LTLIBRARIES = $(toolexeclib_LTLIBRARIES) + libgomp_la_LIBADD = ++@USE_FORTRAN_TRUE@am__objects_1 = openacc.lo + am_libgomp_la_OBJECTS = alloc.lo barrier.lo critical.lo env.lo \ + error.lo iter.lo iter_ull.lo loop.lo loop_ull.lo ordered.lo \ + parallel.lo sections.lo single.lo task.lo team.lo work.lo \ + lock.lo mutex.lo proc.lo sem.lo bar.lo ptrlock.lo time.lo \ +- fortran.lo affinity.lo target.lo ++ fortran.lo affinity.lo target.lo splay-tree.lo \ ++ libgomp-plugin.lo oacc-parallel.lo oacc-host.lo oacc-init.lo \ ++ oacc-mem.lo oacc-async.lo oacc-plugin.lo oacc-cuda.lo \ ++ priority_queue.lo $(am__objects_1) + libgomp_la_OBJECTS = $(am_libgomp_la_OBJECTS) + DEFAULT_INCLUDES = -I.@am__isrc@ + depcomp = $(SHELL) $(top_srcdir)/../depcomp +@@ -108,6 +113,13 @@ LTCOMPILE = $(LIBTOOL) --tag=CC $(AM_LIB + --mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) \ + $(AM_CPPFLAGS) $(CPPFLAGS) $(AM_CFLAGS) $(CFLAGS) + CCLD = $(CC) ++FCCOMPILE = $(FC) $(AM_FCFLAGS) $(FCFLAGS) ++LTFCCOMPILE = $(LIBTOOL) --tag=FC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) \ ++ --mode=compile $(FC) $(AM_FCFLAGS) $(FCFLAGS) ++FCLD = $(FC) ++FCLINK = $(LIBTOOL) --tag=FC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) \ ++ --mode=link $(FCLD) $(AM_FCFLAGS) $(FCFLAGS) $(AM_LDFLAGS) \ ++ $(LDFLAGS) -o $@ + SOURCES = $(libgomp_la_SOURCES) + MULTISRCTOP = + MULTIBUILDTOP = +@@ -315,10 +327,12 @@ libgomp_la_LDFLAGS = $(libgomp_version_i + libgomp_la_DEPENDENCIES = $(libgomp_version_dep) + libgomp_la_LINK = $(LINK) $(libgomp_la_LDFLAGS) + libgomp_la_SOURCES = alloc.c barrier.c critical.c env.c error.c iter.c \ +- iter_ull.c loop.c loop_ull.c ordered.c parallel.c sections.c single.c \ +- task.c team.c work.c lock.c mutex.c proc.c sem.c bar.c ptrlock.c \ +- time.c fortran.c affinity.c target.c +- ++ iter_ull.c loop.c loop_ull.c ordered.c parallel.c sections.c \ ++ single.c task.c team.c work.c lock.c mutex.c proc.c sem.c \ ++ bar.c ptrlock.c time.c fortran.c affinity.c target.c \ ++ splay-tree.c libgomp-plugin.c oacc-parallel.c oacc-host.c \ ++ oacc-init.c oacc-mem.c oacc-async.c oacc-plugin.c oacc-cuda.c \ ++ priority_queue.c 
$(am__append_1) + nodist_noinst_HEADERS = libgomp_f.h + nodist_libsubinclude_HEADERS = omp.h + @USE_FORTRAN_TRUE@nodist_finclude_HEADERS = omp_lib.h omp_lib.f90 omp_lib.mod omp_lib_kinds.mod +@@ -351,7 +365,7 @@ all: config.h + $(MAKE) $(AM_MAKEFLAGS) all-recursive + + .SUFFIXES: +-.SUFFIXES: .c .dvi .lo .o .obj .ps ++.SUFFIXES: .c .dvi .f90 .lo .o .obj .ps + am--refresh: + @: + $(srcdir)/Makefile.in: @MAINTAINER_MODE_TRUE@ $(srcdir)/Makefile.am $(am__configure_deps) +@@ -463,17 +477,27 @@ distclean-compile: + @AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/fortran.Plo@am__quote@ + @AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/iter.Plo@am__quote@ + @AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/iter_ull.Plo@am__quote@ ++@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/libgomp-plugin.Plo@am__quote@ + @AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/lock.Plo@am__quote@ + @AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/loop.Plo@am__quote@ + @AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/loop_ull.Plo@am__quote@ + @AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/mutex.Plo@am__quote@ ++@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/oacc-async.Plo@am__quote@ ++@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/oacc-cuda.Plo@am__quote@ ++@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/oacc-host.Plo@am__quote@ ++@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/oacc-init.Plo@am__quote@ ++@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/oacc-mem.Plo@am__quote@ ++@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/oacc-parallel.Plo@am__quote@ ++@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/oacc-plugin.Plo@am__quote@ + @AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/ordered.Plo@am__quote@ + @AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/parallel.Plo@am__quote@ ++@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/priority_queue.Plo@am__quote@ + @AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/proc.Plo@am__quote@ + @AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/ptrlock.Plo@am__quote@ + @AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/sections.Plo@am__quote@ + @AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/sem.Plo@am__quote@ + @AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/single.Plo@am__quote@ ++@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/splay-tree.Plo@am__quote@ + @AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/target.Plo@am__quote@ + @AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/task.Plo@am__quote@ + @AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/team.Plo@am__quote@ +@@ -501,6 +525,15 @@ distclean-compile: + @AMDEP_TRUE@@am__fastdepCC_FALSE@ DEPDIR=$(DEPDIR) $(CCDEPMODE) $(depcomp) @AMDEPBACKSLASH@ + @am__fastdepCC_FALSE@ $(LTCOMPILE) -c -o $@ $< + ++.f90.o: ++ $(FCCOMPILE) -c -o $@ $< ++ ++.f90.obj: ++ $(FCCOMPILE) -c -o $@ `$(CYGPATH_W) '$<'` ++ ++.f90.lo: ++ $(LTFCCOMPILE) -c -o $@ $< ++ + mostlyclean-libtool: + -rm -f *.lo + +--- libgomp/task.c.jj 2014-08-06 16:25:16.575091658 +0200 ++++ libgomp/task.c 2016-07-13 17:47:58.722758497 +0200 +@@ -28,6 +28,7 @@ + #include "libgomp.h" + #include <stdlib.h> + #include <string.h> ++#include "gomp-constants.h" + + typedef struct gomp_task_depend_entry *hash_entry_type; + +@@ -63,6 +64,14 @@ void + gomp_init_task (struct gomp_task *task, struct gomp_task *parent_task, + struct gomp_task_icv *prev_icv) + { ++ /* It would seem that using memset here would be a win, but it turns ++ out that partially filling gomp_task allows us to keep the ++ overhead of task creation low. 
In the nqueens-1.c test, for a ++ sufficiently large N, we drop the overhead from 5-6% to 1%. ++ ++ Note, the nqueens-1.c test in serial mode is a good test to ++ benchmark the overhead of creating tasks as there are millions of ++ tiny tasks created that all run undeferred. */ + task->parent = parent_task; + task->icv = *prev_icv; + task->kind = GOMP_TASK_IMPLICIT; +@@ -71,7 +80,7 @@ gomp_init_task (struct gomp_task *task, + task->final_task = false; + task->copy_ctors_done = false; + task->parent_depends_on = false; +- task->children = NULL; ++ priority_queue_init (&task->children_queue); + task->taskgroup = NULL; + task->dependers = NULL; + task->depend_hash = NULL; +@@ -90,30 +99,194 @@ gomp_end_task (void) + thr->task = task->parent; + } + ++/* Clear the parent field of every task in LIST. */ ++ + static inline void +-gomp_clear_parent (struct gomp_task *children) ++gomp_clear_parent_in_list (struct priority_list *list) + { +- struct gomp_task *task = children; +- +- if (task) ++ struct priority_node *p = list->tasks; ++ if (p) + do + { +- task->parent = NULL; +- task = task->next_child; ++ priority_node_to_task (PQ_CHILDREN, p)->parent = NULL; ++ p = p->next; + } +- while (task != children); ++ while (p != list->tasks); ++} ++ ++/* Splay tree version of gomp_clear_parent_in_list. ++ ++ Clear the parent field of every task in NODE within SP, and free ++ the node when done. */ ++ ++static void ++gomp_clear_parent_in_tree (prio_splay_tree sp, prio_splay_tree_node node) ++{ ++ if (!node) ++ return; ++ prio_splay_tree_node left = node->left, right = node->right; ++ gomp_clear_parent_in_list (&node->key.l); ++#if _LIBGOMP_CHECKING_ ++ memset (node, 0xaf, sizeof (*node)); ++#endif ++ /* No need to remove the node from the tree. We're nuking ++ everything, so just free the nodes and our caller can clear the ++ entire splay tree. */ ++ free (node); ++ gomp_clear_parent_in_tree (sp, left); ++ gomp_clear_parent_in_tree (sp, right); ++} ++ ++/* Clear the parent field of every task in Q and remove every task ++ from Q. */ ++ ++static inline void ++gomp_clear_parent (struct priority_queue *q) ++{ ++ if (priority_queue_multi_p (q)) ++ { ++ gomp_clear_parent_in_tree (&q->t, q->t.root); ++ /* All the nodes have been cleared in gomp_clear_parent_in_tree. ++ No need to remove anything. We can just nuke everything. */ ++ q->t.root = NULL; ++ } ++ else ++ gomp_clear_parent_in_list (&q->l); + } + +-static void gomp_task_maybe_wait_for_dependencies (void **depend); ++/* Helper function for GOMP_task and gomp_create_target_task. ++ ++ For a TASK with in/out dependencies, fill in the various dependency ++ queues. PARENT is the parent of said task. DEPEND is as in ++ GOMP_task. */ ++ ++static void ++gomp_task_handle_depend (struct gomp_task *task, struct gomp_task *parent, ++ void **depend) ++{ ++ size_t ndepend = (uintptr_t) depend[0]; ++ size_t nout = (uintptr_t) depend[1]; ++ size_t i; ++ hash_entry_type ent; ++ ++ task->depend_count = ndepend; ++ task->num_dependees = 0; ++ if (parent->depend_hash == NULL) ++ parent->depend_hash = htab_create (2 * ndepend > 12 ? 
2 * ndepend : 12); ++ for (i = 0; i < ndepend; i++) ++ { ++ task->depend[i].addr = depend[2 + i]; ++ task->depend[i].next = NULL; ++ task->depend[i].prev = NULL; ++ task->depend[i].task = task; ++ task->depend[i].is_in = i >= nout; ++ task->depend[i].redundant = false; ++ task->depend[i].redundant_out = false; ++ ++ hash_entry_type *slot = htab_find_slot (&parent->depend_hash, ++ &task->depend[i], INSERT); ++ hash_entry_type out = NULL, last = NULL; ++ if (*slot) ++ { ++ /* If multiple depends on the same task are the same, all but the ++ first one are redundant. As inout/out come first, if any of them ++ is inout/out, it will win, which is the right semantics. */ ++ if ((*slot)->task == task) ++ { ++ task->depend[i].redundant = true; ++ continue; ++ } ++ for (ent = *slot; ent; ent = ent->next) ++ { ++ if (ent->redundant_out) ++ break; ++ ++ last = ent; ++ ++ /* depend(in:...) doesn't depend on earlier depend(in:...). */ ++ if (i >= nout && ent->is_in) ++ continue; ++ ++ if (!ent->is_in) ++ out = ent; ++ ++ struct gomp_task *tsk = ent->task; ++ if (tsk->dependers == NULL) ++ { ++ tsk->dependers ++ = gomp_malloc (sizeof (struct gomp_dependers_vec) ++ + 6 * sizeof (struct gomp_task *)); ++ tsk->dependers->n_elem = 1; ++ tsk->dependers->allocated = 6; ++ tsk->dependers->elem[0] = task; ++ task->num_dependees++; ++ continue; ++ } ++ /* We already have some other dependency on tsk from earlier ++ depend clause. */ ++ else if (tsk->dependers->n_elem ++ && (tsk->dependers->elem[tsk->dependers->n_elem - 1] ++ == task)) ++ continue; ++ else if (tsk->dependers->n_elem == tsk->dependers->allocated) ++ { ++ tsk->dependers->allocated ++ = tsk->dependers->allocated * 2 + 2; ++ tsk->dependers ++ = gomp_realloc (tsk->dependers, ++ sizeof (struct gomp_dependers_vec) ++ + (tsk->dependers->allocated ++ * sizeof (struct gomp_task *))); ++ } ++ tsk->dependers->elem[tsk->dependers->n_elem++] = task; ++ task->num_dependees++; ++ } ++ task->depend[i].next = *slot; ++ (*slot)->prev = &task->depend[i]; ++ } ++ *slot = &task->depend[i]; ++ ++ /* There is no need to store more than one depend({,in}out:) task per ++ address in the hash table chain for the purpose of creation of ++ deferred tasks, because each out depends on all earlier outs, thus it ++ is enough to record just the last depend({,in}out:). For depend(in:), ++ we need to keep all of the previous ones not terminated yet, because ++ a later depend({,in}out:) might need to depend on all of them. So, if ++ the new task's clause is depend({,in}out:), we know there is at most ++ one other depend({,in}out:) clause in the list (out). For ++ non-deferred tasks we want to see all outs, so they are moved to the ++ end of the chain, after first redundant_out entry all following ++ entries should be redundant_out. */ ++ if (!task->depend[i].is_in && out) ++ { ++ if (out != last) ++ { ++ out->next->prev = out->prev; ++ out->prev->next = out->next; ++ out->next = last->next; ++ out->prev = last; ++ last->next = out; ++ if (out->next) ++ out->next->prev = out; ++ } ++ out->redundant_out = true; ++ } ++ } ++} + + /* Called when encountering an explicit task directive. If IF_CLAUSE is + false, then we must not delay in executing the task. If UNTIED is true, +- then the task may be executed by any member of the team. */ ++ then the task may be executed by any member of the team. ++ ++ DEPEND is an array containing: ++ depend[0]: number of depend elements. ++ depend[1]: number of depend elements of type "out". ++ depend[2..N+1]: address of [1..N]th depend element. 
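++
++   For example (an illustrative encoding consistent with the layout
++   above, not something spelled out elsewhere in the patch), a task
++   with depend(out: x) depend(in: y, z) would arrive as
++
++     depend[0] = (void *) 3   - three depend elements in total
++     depend[1] = (void *) 1   - one of them is out/inout
++     depend[2] = &x, depend[3] = &y, depend[4] = &z
++
++   i.e. out/inout addresses come first, which is what the
++   is_in = i >= nout test in gomp_task_handle_depend relies on.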
*/ + + void + GOMP_task (void (*fn) (void *), void *data, void (*cpyfn) (void *, void *), + long arg_size, long arg_align, bool if_clause, unsigned flags, +- void **depend) ++ void **depend, int priority) + { + struct gomp_thread *thr = gomp_thread (); + struct gomp_team *team = thr->ts.team; +@@ -125,8 +298,7 @@ GOMP_task (void (*fn) (void *), void *da + might be running on different thread than FN. */ + if (cpyfn) + if_clause = false; +- if (flags & 1) +- flags &= ~1; ++ flags &= ~GOMP_TASK_FLAG_UNTIED; + #endif + + /* If parallel or taskgroup has been cancelled, don't start new tasks. */ +@@ -135,6 +307,11 @@ GOMP_task (void (*fn) (void *), void *da + || (thr->task->taskgroup && thr->task->taskgroup->cancelled))) + return; + ++ if ((flags & GOMP_TASK_FLAG_PRIORITY) == 0) ++ priority = 0; ++ else if (priority > gomp_max_task_priority_var) ++ priority = gomp_max_task_priority_var; ++ + if (!if_clause || team == NULL + || (thr->task && thr->task->final_task) + || team->task_count > 64 * team->nthreads) +@@ -147,12 +324,15 @@ GOMP_task (void (*fn) (void *), void *da + depend clauses for non-deferred tasks other than this, because + the parent task is suspended until the child task finishes and thus + it can't start further child tasks. */ +- if ((flags & 8) && thr->task && thr->task->depend_hash) ++ if ((flags & GOMP_TASK_FLAG_DEPEND) ++ && thr->task && thr->task->depend_hash) + gomp_task_maybe_wait_for_dependencies (depend); + + gomp_init_task (&task, thr->task, gomp_icv (false)); +- task.kind = GOMP_TASK_IFFALSE; +- task.final_task = (thr->task && thr->task->final_task) || (flags & 2); ++ task.kind = GOMP_TASK_UNDEFERRED; ++ task.final_task = (thr->task && thr->task->final_task) ++ || (flags & GOMP_TASK_FLAG_FINAL); ++ task.priority = priority; + if (thr->task) + { + task.in_tied_task = thr->task->in_tied_task; +@@ -178,10 +358,10 @@ GOMP_task (void (*fn) (void *), void *da + child thread, but seeing a stale non-NULL value is not a + problem. Once past the task_lock acquisition, this thread + will see the real value of task.children. */ +- if (task.children != NULL) ++ if (!priority_queue_empty_p (&task.children_queue, MEMMODEL_RELAXED)) + { + gomp_mutex_lock (&team->task_lock); +- gomp_clear_parent (task.children); ++ gomp_clear_parent (&task.children_queue); + gomp_mutex_unlock (&team->task_lock); + } + gomp_end_task (); +@@ -195,7 +375,7 @@ GOMP_task (void (*fn) (void *), void *da + bool do_wake; + size_t depend_size = 0; + +- if (flags & 8) ++ if (flags & GOMP_TASK_FLAG_DEPEND) + depend_size = ((uintptr_t) depend[0] + * sizeof (struct gomp_task_depend_entry)); + task = gomp_malloc (sizeof (*task) + depend_size +@@ -203,7 +383,8 @@ GOMP_task (void (*fn) (void *), void *da + arg = (char *) (((uintptr_t) (task + 1) + depend_size + arg_align - 1) + & ~(uintptr_t) (arg_align - 1)); + gomp_init_task (task, parent, gomp_icv (false)); +- task->kind = GOMP_TASK_IFFALSE; ++ task->priority = priority; ++ task->kind = GOMP_TASK_UNDEFERRED; + task->in_tied_task = parent->in_tied_task; + task->taskgroup = taskgroup; + thr->task = task; +@@ -218,7 +399,7 @@ GOMP_task (void (*fn) (void *), void *da + task->kind = GOMP_TASK_WAITING; + task->fn = fn; + task->fn_data = arg; +- task->final_task = (flags & 2) >> 1; ++ task->final_task = (flags & GOMP_TASK_FLAG_FINAL) >> 1; + gomp_mutex_lock (&team->task_lock); + /* If parallel or taskgroup has been cancelled, don't start new + tasks. 
*/ +@@ -235,171 +416,39 @@ GOMP_task (void (*fn) (void *), void *da + taskgroup->num_children++; + if (depend_size) + { +- size_t ndepend = (uintptr_t) depend[0]; +- size_t nout = (uintptr_t) depend[1]; +- size_t i; +- hash_entry_type ent; +- +- task->depend_count = ndepend; +- task->num_dependees = 0; +- if (parent->depend_hash == NULL) +- parent->depend_hash +- = htab_create (2 * ndepend > 12 ? 2 * ndepend : 12); +- for (i = 0; i < ndepend; i++) +- { +- task->depend[i].addr = depend[2 + i]; +- task->depend[i].next = NULL; +- task->depend[i].prev = NULL; +- task->depend[i].task = task; +- task->depend[i].is_in = i >= nout; +- task->depend[i].redundant = false; +- task->depend[i].redundant_out = false; +- +- hash_entry_type *slot +- = htab_find_slot (&parent->depend_hash, &task->depend[i], +- INSERT); +- hash_entry_type out = NULL, last = NULL; +- if (*slot) +- { +- /* If multiple depends on the same task are the +- same, all but the first one are redundant. +- As inout/out come first, if any of them is +- inout/out, it will win, which is the right +- semantics. */ +- if ((*slot)->task == task) +- { +- task->depend[i].redundant = true; +- continue; +- } +- for (ent = *slot; ent; ent = ent->next) +- { +- if (ent->redundant_out) +- break; +- +- last = ent; +- +- /* depend(in:...) doesn't depend on earlier +- depend(in:...). */ +- if (i >= nout && ent->is_in) +- continue; +- +- if (!ent->is_in) +- out = ent; +- +- struct gomp_task *tsk = ent->task; +- if (tsk->dependers == NULL) +- { +- tsk->dependers +- = gomp_malloc (sizeof (struct gomp_dependers_vec) +- + 6 * sizeof (struct gomp_task *)); +- tsk->dependers->n_elem = 1; +- tsk->dependers->allocated = 6; +- tsk->dependers->elem[0] = task; +- task->num_dependees++; +- continue; +- } +- /* We already have some other dependency on tsk +- from earlier depend clause. */ +- else if (tsk->dependers->n_elem +- && (tsk->dependers->elem[tsk->dependers->n_elem +- - 1] +- == task)) +- continue; +- else if (tsk->dependers->n_elem +- == tsk->dependers->allocated) +- { +- tsk->dependers->allocated +- = tsk->dependers->allocated * 2 + 2; +- tsk->dependers +- = gomp_realloc (tsk->dependers, +- sizeof (struct gomp_dependers_vec) +- + (tsk->dependers->allocated +- * sizeof (struct gomp_task *))); +- } +- tsk->dependers->elem[tsk->dependers->n_elem++] = task; +- task->num_dependees++; +- } +- task->depend[i].next = *slot; +- (*slot)->prev = &task->depend[i]; +- } +- *slot = &task->depend[i]; +- +- /* There is no need to store more than one depend({,in}out:) +- task per address in the hash table chain for the purpose +- of creation of deferred tasks, because each out +- depends on all earlier outs, thus it is enough to record +- just the last depend({,in}out:). For depend(in:), we need +- to keep all of the previous ones not terminated yet, because +- a later depend({,in}out:) might need to depend on all of +- them. So, if the new task's clause is depend({,in}out:), +- we know there is at most one other depend({,in}out:) clause +- in the list (out). For non-deferred tasks we want to see +- all outs, so they are moved to the end of the chain, +- after first redundant_out entry all following entries +- should be redundant_out. 
*/ +- if (!task->depend[i].is_in && out) +- { +- if (out != last) +- { +- out->next->prev = out->prev; +- out->prev->next = out->next; +- out->next = last->next; +- out->prev = last; +- last->next = out; +- if (out->next) +- out->next->prev = out; +- } +- out->redundant_out = true; +- } +- } ++ gomp_task_handle_depend (task, parent, depend); + if (task->num_dependees) + { ++ /* Tasks that depend on other tasks are not put into the ++ various waiting queues, so we are done for now. Said ++ tasks are instead put into the queues via ++ gomp_task_run_post_handle_dependers() after their ++ dependencies have been satisfied. After which, they ++ can be picked up by the various scheduling ++ points. */ + gomp_mutex_unlock (&team->task_lock); + return; + } + } +- if (parent->children) +- { +- task->next_child = parent->children; +- task->prev_child = parent->children->prev_child; +- task->next_child->prev_child = task; +- task->prev_child->next_child = task; +- } +- else +- { +- task->next_child = task; +- task->prev_child = task; +- } +- parent->children = task; ++ ++ priority_queue_insert (PQ_CHILDREN, &parent->children_queue, ++ task, priority, ++ PRIORITY_INSERT_BEGIN, ++ /*adjust_parent_depends_on=*/false, ++ task->parent_depends_on); + if (taskgroup) +- { +- if (taskgroup->children) +- { +- task->next_taskgroup = taskgroup->children; +- task->prev_taskgroup = taskgroup->children->prev_taskgroup; +- task->next_taskgroup->prev_taskgroup = task; +- task->prev_taskgroup->next_taskgroup = task; +- } +- else +- { +- task->next_taskgroup = task; +- task->prev_taskgroup = task; +- } +- taskgroup->children = task; +- } +- if (team->task_queue) +- { +- task->next_queue = team->task_queue; +- task->prev_queue = team->task_queue->prev_queue; +- task->next_queue->prev_queue = task; +- task->prev_queue->next_queue = task; +- } +- else +- { +- task->next_queue = task; +- task->prev_queue = task; +- team->task_queue = task; +- } ++ priority_queue_insert (PQ_TASKGROUP, &taskgroup->taskgroup_queue, ++ task, priority, ++ PRIORITY_INSERT_BEGIN, ++ /*adjust_parent_depends_on=*/false, ++ task->parent_depends_on); ++ ++ priority_queue_insert (PQ_TEAM, &team->task_queue, ++ task, priority, ++ PRIORITY_INSERT_END, ++ /*adjust_parent_depends_on=*/false, ++ task->parent_depends_on); ++ + ++team->task_count; + ++team->task_queued_count; + gomp_team_barrier_set_task_pending (&team->barrier); +@@ -411,36 +460,529 @@ GOMP_task (void (*fn) (void *), void *da + } + } + +-static inline bool +-gomp_task_run_pre (struct gomp_task *child_task, struct gomp_task *parent, +- struct gomp_taskgroup *taskgroup, struct gomp_team *team) ++ialias (GOMP_taskgroup_start) ++ialias (GOMP_taskgroup_end) ++ ++#define TYPE long ++#define UTYPE unsigned long ++#define TYPE_is_long 1 ++#include "taskloop.c" ++#undef TYPE ++#undef UTYPE ++#undef TYPE_is_long ++ ++#define TYPE unsigned long long ++#define UTYPE TYPE ++#define GOMP_taskloop GOMP_taskloop_ull ++#include "taskloop.c" ++#undef TYPE ++#undef UTYPE ++#undef GOMP_taskloop ++ ++static void inline ++priority_queue_move_task_first (enum priority_queue_type type, ++ struct priority_queue *head, ++ struct gomp_task *task) + { ++#if _LIBGOMP_CHECKING_ ++ if (!priority_queue_task_in_queue_p (type, head, task)) ++ gomp_fatal ("Attempt to move first missing task %p", task); ++#endif ++ struct priority_list *list; ++ if (priority_queue_multi_p (head)) ++ { ++ list = priority_queue_lookup_priority (head, task->priority); ++#if _LIBGOMP_CHECKING_ ++ if (!list) ++ gomp_fatal ("Unable to find priority 
%d", task->priority); ++#endif ++ } ++ else ++ list = &head->l; ++ priority_list_remove (list, task_to_priority_node (type, task), 0); ++ priority_list_insert (type, list, task, task->priority, ++ PRIORITY_INSERT_BEGIN, type == PQ_CHILDREN, ++ task->parent_depends_on); ++} ++ ++/* Actual body of GOMP_PLUGIN_target_task_completion that is executed ++ with team->task_lock held, or is executed in the thread that called ++ gomp_target_task_fn if GOMP_PLUGIN_target_task_completion has been ++ run before it acquires team->task_lock. */ ++ ++static void ++gomp_target_task_completion (struct gomp_team *team, struct gomp_task *task) ++{ ++ struct gomp_task *parent = task->parent; + if (parent) ++ priority_queue_move_task_first (PQ_CHILDREN, &parent->children_queue, ++ task); ++ ++ struct gomp_taskgroup *taskgroup = task->taskgroup; ++ if (taskgroup) ++ priority_queue_move_task_first (PQ_TASKGROUP, &taskgroup->taskgroup_queue, ++ task); ++ ++ priority_queue_insert (PQ_TEAM, &team->task_queue, task, task->priority, ++ PRIORITY_INSERT_BEGIN, false, ++ task->parent_depends_on); ++ task->kind = GOMP_TASK_WAITING; ++ if (parent && parent->taskwait) + { +- if (parent->children == child_task) +- parent->children = child_task->next_child; +- if (__builtin_expect (child_task->parent_depends_on, 0) +- && parent->taskwait->last_parent_depends_on == child_task) +- { +- if (child_task->prev_child->kind == GOMP_TASK_WAITING +- && child_task->prev_child->parent_depends_on) +- parent->taskwait->last_parent_depends_on = child_task->prev_child; +- else +- parent->taskwait->last_parent_depends_on = NULL; ++ if (parent->taskwait->in_taskwait) ++ { ++ /* One more task has had its dependencies met. ++ Inform any waiters. */ ++ parent->taskwait->in_taskwait = false; ++ gomp_sem_post (&parent->taskwait->taskwait_sem); + } ++ else if (parent->taskwait->in_depend_wait) ++ { ++ /* One more task has had its dependencies met. ++ Inform any waiters. */ ++ parent->taskwait->in_depend_wait = false; ++ gomp_sem_post (&parent->taskwait->taskwait_sem); ++ } ++ } ++ if (taskgroup && taskgroup->in_taskgroup_wait) ++ { ++ /* One more task has had its dependencies met. ++ Inform any waiters. */ ++ taskgroup->in_taskgroup_wait = false; ++ gomp_sem_post (&taskgroup->taskgroup_sem); + } +- if (taskgroup && taskgroup->children == child_task) +- taskgroup->children = child_task->next_taskgroup; +- child_task->prev_queue->next_queue = child_task->next_queue; +- child_task->next_queue->prev_queue = child_task->prev_queue; +- if (team->task_queue == child_task) ++ ++ ++team->task_queued_count; ++ gomp_team_barrier_set_task_pending (&team->barrier); ++ /* I'm afraid this can't be done after releasing team->task_lock, ++ as gomp_target_task_completion is run from unrelated thread and ++ therefore in between gomp_mutex_unlock and gomp_team_barrier_wake ++ the team could be gone already. */ ++ if (team->nthreads > team->task_running_count) ++ gomp_team_barrier_wake (&team->barrier, 1); ++} ++ ++/* Signal that a target task TTASK has completed the asynchronously ++ running phase and should be requeued as a task to handle the ++ variable unmapping. 
*/ ++ ++void ++GOMP_PLUGIN_target_task_completion (void *data) ++{ ++ struct gomp_target_task *ttask = (struct gomp_target_task *) data; ++ struct gomp_task *task = ttask->task; ++ struct gomp_team *team = ttask->team; ++ ++ gomp_mutex_lock (&team->task_lock); ++ if (ttask->state == GOMP_TARGET_TASK_READY_TO_RUN) + { +- if (child_task->next_queue != child_task) +- team->task_queue = child_task->next_queue; ++ ttask->state = GOMP_TARGET_TASK_FINISHED; ++ gomp_mutex_unlock (&team->task_lock); ++ return; ++ } ++ ttask->state = GOMP_TARGET_TASK_FINISHED; ++ gomp_target_task_completion (team, task); ++ gomp_mutex_unlock (&team->task_lock); ++} ++ ++static void gomp_task_run_post_handle_depend_hash (struct gomp_task *); ++ ++/* Called for nowait target tasks. */ ++ ++bool ++gomp_create_target_task (struct gomp_device_descr *devicep, ++ void (*fn) (void *), size_t mapnum, void **hostaddrs, ++ size_t *sizes, unsigned short *kinds, ++ unsigned int flags, void **depend, void **args, ++ enum gomp_target_task_state state) ++{ ++ struct gomp_thread *thr = gomp_thread (); ++ struct gomp_team *team = thr->ts.team; ++ ++ /* If parallel or taskgroup has been cancelled, don't start new tasks. */ ++ if (team ++ && (gomp_team_barrier_cancelled (&team->barrier) ++ || (thr->task->taskgroup && thr->task->taskgroup->cancelled))) ++ return true; ++ ++ struct gomp_target_task *ttask; ++ struct gomp_task *task; ++ struct gomp_task *parent = thr->task; ++ struct gomp_taskgroup *taskgroup = parent->taskgroup; ++ bool do_wake; ++ size_t depend_size = 0; ++ uintptr_t depend_cnt = 0; ++ size_t tgt_align = 0, tgt_size = 0; ++ ++ if (depend != NULL) ++ { ++ depend_cnt = (uintptr_t) depend[0]; ++ depend_size = depend_cnt * sizeof (struct gomp_task_depend_entry); ++ } ++ if (fn) ++ { ++ /* GOMP_MAP_FIRSTPRIVATE need to be copied first, as they are ++ firstprivate on the target task. 
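++
++         (Their requested alignment is encoded in the upper bits of
++         the map kind and recovered below as
++         (size_t) 1 << (kinds[i] >> 8).)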
*/ ++ size_t i; ++ for (i = 0; i < mapnum; i++) ++ if ((kinds[i] & 0xff) == GOMP_MAP_FIRSTPRIVATE) ++ { ++ size_t align = (size_t) 1 << (kinds[i] >> 8); ++ if (tgt_align < align) ++ tgt_align = align; ++ tgt_size = (tgt_size + align - 1) & ~(align - 1); ++ tgt_size += sizes[i]; ++ } ++ if (tgt_align) ++ tgt_size += tgt_align - 1; + else +- team->task_queue = NULL; ++ tgt_size = 0; + } ++ ++ task = gomp_malloc (sizeof (*task) + depend_size ++ + sizeof (*ttask) ++ + mapnum * (sizeof (void *) + sizeof (size_t) ++ + sizeof (unsigned short)) ++ + tgt_size); ++ gomp_init_task (task, parent, gomp_icv (false)); ++ task->priority = 0; ++ task->kind = GOMP_TASK_WAITING; ++ task->in_tied_task = parent->in_tied_task; ++ task->taskgroup = taskgroup; ++ ttask = (struct gomp_target_task *) &task->depend[depend_cnt]; ++ ttask->devicep = devicep; ++ ttask->fn = fn; ++ ttask->mapnum = mapnum; ++ ttask->args = args; ++ memcpy (ttask->hostaddrs, hostaddrs, mapnum * sizeof (void *)); ++ ttask->sizes = (size_t *) &ttask->hostaddrs[mapnum]; ++ memcpy (ttask->sizes, sizes, mapnum * sizeof (size_t)); ++ ttask->kinds = (unsigned short *) &ttask->sizes[mapnum]; ++ memcpy (ttask->kinds, kinds, mapnum * sizeof (unsigned short)); ++ if (tgt_align) ++ { ++ char *tgt = (char *) &ttask->kinds[mapnum]; ++ size_t i; ++ uintptr_t al = (uintptr_t) tgt & (tgt_align - 1); ++ if (al) ++ tgt += tgt_align - al; ++ tgt_size = 0; ++ for (i = 0; i < mapnum; i++) ++ if ((kinds[i] & 0xff) == GOMP_MAP_FIRSTPRIVATE) ++ { ++ size_t align = (size_t) 1 << (kinds[i] >> 8); ++ tgt_size = (tgt_size + align - 1) & ~(align - 1); ++ memcpy (tgt + tgt_size, hostaddrs[i], sizes[i]); ++ ttask->hostaddrs[i] = tgt + tgt_size; ++ tgt_size = tgt_size + sizes[i]; ++ } ++ } ++ ttask->flags = flags; ++ ttask->state = state; ++ ttask->task = task; ++ ttask->team = team; ++ task->fn = NULL; ++ task->fn_data = ttask; ++ task->final_task = 0; ++ gomp_mutex_lock (&team->task_lock); ++ /* If parallel or taskgroup has been cancelled, don't start new tasks. */ ++ if (__builtin_expect (gomp_team_barrier_cancelled (&team->barrier) ++ || (taskgroup && taskgroup->cancelled), 0)) ++ { ++ gomp_mutex_unlock (&team->task_lock); ++ gomp_finish_task (task); ++ free (task); ++ return true; ++ } ++ if (depend_size) ++ { ++ gomp_task_handle_depend (task, parent, depend); ++ if (task->num_dependees) ++ { ++ if (taskgroup) ++ taskgroup->num_children++; ++ gomp_mutex_unlock (&team->task_lock); ++ return true; ++ } ++ } ++ if (state == GOMP_TARGET_TASK_DATA) ++ { ++ gomp_task_run_post_handle_depend_hash (task); ++ gomp_mutex_unlock (&team->task_lock); ++ gomp_finish_task (task); ++ free (task); ++ return false; ++ } ++ if (taskgroup) ++ taskgroup->num_children++; ++ /* For async offloading, if we don't need to wait for dependencies, ++ run the gomp_target_task_fn right away, essentially schedule the ++ mapping part of the task in the current thread. 
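++
++     (The task is then parked as GOMP_TASK_ASYNC_RUNNING and only goes
++     back on the team queue via gomp_target_task_completion once the
++     device side finishes.)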
*/ ++ if (devicep != NULL ++ && (devicep->capabilities & GOMP_OFFLOAD_CAP_OPENMP_400)) ++ { ++ priority_queue_insert (PQ_CHILDREN, &parent->children_queue, task, 0, ++ PRIORITY_INSERT_END, ++ /*adjust_parent_depends_on=*/false, ++ task->parent_depends_on); ++ if (taskgroup) ++ priority_queue_insert (PQ_TASKGROUP, &taskgroup->taskgroup_queue, ++ task, 0, PRIORITY_INSERT_END, ++ /*adjust_parent_depends_on=*/false, ++ task->parent_depends_on); ++ task->pnode[PQ_TEAM].next = NULL; ++ task->pnode[PQ_TEAM].prev = NULL; ++ task->kind = GOMP_TASK_TIED; ++ ++team->task_count; ++ gomp_mutex_unlock (&team->task_lock); ++ ++ thr->task = task; ++ gomp_target_task_fn (task->fn_data); ++ thr->task = parent; ++ ++ gomp_mutex_lock (&team->task_lock); ++ task->kind = GOMP_TASK_ASYNC_RUNNING; ++ /* If GOMP_PLUGIN_target_task_completion has run already ++ in between gomp_target_task_fn and the mutex lock, ++ perform the requeuing here. */ ++ if (ttask->state == GOMP_TARGET_TASK_FINISHED) ++ gomp_target_task_completion (team, task); ++ else ++ ttask->state = GOMP_TARGET_TASK_RUNNING; ++ gomp_mutex_unlock (&team->task_lock); ++ return true; ++ } ++ priority_queue_insert (PQ_CHILDREN, &parent->children_queue, task, 0, ++ PRIORITY_INSERT_BEGIN, ++ /*adjust_parent_depends_on=*/false, ++ task->parent_depends_on); ++ if (taskgroup) ++ priority_queue_insert (PQ_TASKGROUP, &taskgroup->taskgroup_queue, task, 0, ++ PRIORITY_INSERT_BEGIN, ++ /*adjust_parent_depends_on=*/false, ++ task->parent_depends_on); ++ priority_queue_insert (PQ_TEAM, &team->task_queue, task, 0, ++ PRIORITY_INSERT_END, ++ /*adjust_parent_depends_on=*/false, ++ task->parent_depends_on); ++ ++team->task_count; ++ ++team->task_queued_count; ++ gomp_team_barrier_set_task_pending (&team->barrier); ++ do_wake = team->task_running_count + !parent->in_tied_task ++ < team->nthreads; ++ gomp_mutex_unlock (&team->task_lock); ++ if (do_wake) ++ gomp_team_barrier_wake (&team->barrier, 1); ++ return true; ++} ++ ++/* Given a parent_depends_on task in LIST, move it to the front of its ++ priority so it is run as soon as possible. ++ ++ Care is taken to update the list's LAST_PARENT_DEPENDS_ON field. ++ ++ We rearrange the queue such that all parent_depends_on tasks are ++ first, and last_parent_depends_on points to the last such task we ++ rearranged. For example, given the following tasks in a queue ++ where PD[123] are the parent_depends_on tasks: ++ ++ task->children ++ | ++ V ++ C1 -> C2 -> C3 -> PD1 -> PD2 -> PD3 -> C4 ++ ++ We rearrange such that: ++ ++ task->children ++ | +--- last_parent_depends_on ++ | | ++ V V ++ PD1 -> PD2 -> PD3 -> C1 -> C2 -> C3 -> C4. */ ++ ++static void inline ++priority_list_upgrade_task (struct priority_list *list, ++ struct priority_node *node) ++{ ++ struct priority_node *last_parent_depends_on ++ = list->last_parent_depends_on; ++ if (last_parent_depends_on) ++ { ++ node->prev->next = node->next; ++ node->next->prev = node->prev; ++ node->prev = last_parent_depends_on; ++ node->next = last_parent_depends_on->next; ++ node->prev->next = node; ++ node->next->prev = node; ++ } ++ else if (node != list->tasks) ++ { ++ node->prev->next = node->next; ++ node->next->prev = node->prev; ++ node->prev = list->tasks->prev; ++ node->next = list->tasks; ++ list->tasks = node; ++ node->prev->next = node; ++ node->next->prev = node; ++ } ++ list->last_parent_depends_on = node; ++} ++ ++/* Given a parent_depends_on TASK in its parent's children_queue, move ++ it to the front of its priority so it is run as soon as possible. 
++ ++ PARENT is passed as an optimization. ++ ++ (This function could be defined in priority_queue.c, but we want it ++ inlined, and putting it in priority_queue.h is not an option, given ++ that gomp_task has not been properly defined at that point). */ ++ ++static void inline ++priority_queue_upgrade_task (struct gomp_task *task, ++ struct gomp_task *parent) ++{ ++ struct priority_queue *head = &parent->children_queue; ++ struct priority_node *node = &task->pnode[PQ_CHILDREN]; ++#if _LIBGOMP_CHECKING_ ++ if (!task->parent_depends_on) ++ gomp_fatal ("priority_queue_upgrade_task: task must be a " ++ "parent_depends_on task"); ++ if (!priority_queue_task_in_queue_p (PQ_CHILDREN, head, task)) ++ gomp_fatal ("priority_queue_upgrade_task: cannot find task=%p", task); ++#endif ++ if (priority_queue_multi_p (head)) ++ { ++ struct priority_list *list ++ = priority_queue_lookup_priority (head, task->priority); ++ priority_list_upgrade_task (list, node); ++ } ++ else ++ priority_list_upgrade_task (&head->l, node); ++} ++ ++/* Given a CHILD_TASK in LIST that is about to be executed, move it out of ++ the way in LIST so that other tasks can be considered for ++ execution. LIST contains tasks of type TYPE. ++ ++ Care is taken to update the queue's LAST_PARENT_DEPENDS_ON field ++ if applicable. */ ++ ++static void inline ++priority_list_downgrade_task (enum priority_queue_type type, ++ struct priority_list *list, ++ struct gomp_task *child_task) ++{ ++ struct priority_node *node = task_to_priority_node (type, child_task); ++ if (list->tasks == node) ++ list->tasks = node->next; ++ else if (node->next != list->tasks) ++ { ++ /* The task in NODE is about to become TIED and TIED tasks ++ cannot come before WAITING tasks. If we're about to ++ leave the queue in such an indeterminate state, rewire ++ things appropriately. However, a TIED task at the end is ++ perfectly fine. */ ++ struct gomp_task *next_task = priority_node_to_task (type, node->next); ++ if (next_task->kind == GOMP_TASK_WAITING) ++ { ++ /* Remove from list. */ ++ node->prev->next = node->next; ++ node->next->prev = node->prev; ++ /* Rewire at the end. */ ++ node->next = list->tasks; ++ node->prev = list->tasks->prev; ++ list->tasks->prev->next = node; ++ list->tasks->prev = node; ++ } ++ } ++ ++ /* If the current task is the last_parent_depends_on for its ++ priority, adjust last_parent_depends_on appropriately. */ ++ if (__builtin_expect (child_task->parent_depends_on, 0) ++ && list->last_parent_depends_on == node) ++ { ++ struct gomp_task *prev_child = priority_node_to_task (type, node->prev); ++ if (node->prev != node ++ && prev_child->kind == GOMP_TASK_WAITING ++ && prev_child->parent_depends_on) ++ list->last_parent_depends_on = node->prev; ++ else ++ { ++ /* There are no more parent_depends_on entries waiting ++ to run, clear the list. */ ++ list->last_parent_depends_on = NULL; ++ } ++ } ++} ++ ++/* Given a TASK in HEAD that is about to be executed, move it out of ++ the way so that other tasks can be considered for execution. HEAD ++ contains tasks of type TYPE. ++ ++ Care is taken to update the queue's LAST_PARENT_DEPENDS_ON field ++ if applicable. ++ ++ (This function could be defined in priority_queue.c, but we want it ++ inlined, and putting it in priority_queue.h is not an option, given ++ that gomp_task has not been properly defined at that point). 
*/ ++ ++static void inline ++priority_queue_downgrade_task (enum priority_queue_type type, ++ struct priority_queue *head, ++ struct gomp_task *task) ++{ ++#if _LIBGOMP_CHECKING_ ++ if (!priority_queue_task_in_queue_p (type, head, task)) ++ gomp_fatal ("Attempt to downgrade missing task %p", task); ++#endif ++ if (priority_queue_multi_p (head)) ++ { ++ struct priority_list *list ++ = priority_queue_lookup_priority (head, task->priority); ++ priority_list_downgrade_task (type, list, task); ++ } ++ else ++ priority_list_downgrade_task (type, &head->l, task); ++} ++ ++/* Setup CHILD_TASK to execute. This is done by setting the task to ++ TIED, and updating all relevant queues so that CHILD_TASK is no ++ longer chosen for scheduling. Also, remove CHILD_TASK from the ++ overall team task queue entirely. ++ ++ Return TRUE if task or its containing taskgroup has been ++ cancelled. */ ++ ++static inline bool ++gomp_task_run_pre (struct gomp_task *child_task, struct gomp_task *parent, ++ struct gomp_team *team) ++{ ++#if _LIBGOMP_CHECKING_ ++ if (child_task->parent) ++ priority_queue_verify (PQ_CHILDREN, ++ &child_task->parent->children_queue, true); ++ if (child_task->taskgroup) ++ priority_queue_verify (PQ_TASKGROUP, ++ &child_task->taskgroup->taskgroup_queue, false); ++ priority_queue_verify (PQ_TEAM, &team->task_queue, false); ++#endif ++ ++ /* Task is about to go tied, move it out of the way. */ ++ if (parent) ++ priority_queue_downgrade_task (PQ_CHILDREN, &parent->children_queue, ++ child_task); ++ ++ /* Task is about to go tied, move it out of the way. */ ++ struct gomp_taskgroup *taskgroup = child_task->taskgroup; ++ if (taskgroup) ++ priority_queue_downgrade_task (PQ_TASKGROUP, &taskgroup->taskgroup_queue, ++ child_task); ++ ++ priority_queue_remove (PQ_TEAM, &team->task_queue, child_task, ++ MEMMODEL_RELAXED); ++ child_task->pnode[PQ_TEAM].next = NULL; ++ child_task->pnode[PQ_TEAM].prev = NULL; + child_task->kind = GOMP_TASK_TIED; ++ + if (--team->task_queued_count == 0) + gomp_team_barrier_clear_task_pending (&team->barrier); + if ((gomp_team_barrier_cancelled (&team->barrier) +@@ -478,6 +1020,14 @@ gomp_task_run_post_handle_depend_hash (s + } + } + ++/* After a CHILD_TASK has been run, adjust the dependency queue for ++ each task that depends on CHILD_TASK, to record the fact that there ++ is one less dependency to worry about. If a task that depended on ++ CHILD_TASK now has no dependencies, place it in the various queues ++ so it gets scheduled to run. ++ ++ TEAM is the team to which CHILD_TASK belongs to. */ ++ + static size_t + gomp_task_run_post_handle_dependers (struct gomp_task *child_task, + struct gomp_team *team) +@@ -487,91 +1037,60 @@ gomp_task_run_post_handle_dependers (str + for (i = 0; i < count; i++) + { + struct gomp_task *task = child_task->dependers->elem[i]; ++ ++ /* CHILD_TASK satisfies a dependency for TASK. Keep track of ++ TASK's remaining dependencies. Once TASK has no other ++ depenencies, put it into the various queues so it will get ++ scheduled for execution. */ + if (--task->num_dependees != 0) + continue; + + struct gomp_taskgroup *taskgroup = task->taskgroup; + if (parent) + { +- if (parent->children) +- { +- /* If parent is in gomp_task_maybe_wait_for_dependencies +- and it doesn't need to wait for this task, put it after +- all ready to run tasks it needs to wait for. 
*/ +- if (parent->taskwait && parent->taskwait->last_parent_depends_on +- && !task->parent_depends_on) +- { +- struct gomp_task *last_parent_depends_on +- = parent->taskwait->last_parent_depends_on; +- task->next_child = last_parent_depends_on->next_child; +- task->prev_child = last_parent_depends_on; +- } +- else +- { +- task->next_child = parent->children; +- task->prev_child = parent->children->prev_child; +- parent->children = task; +- } +- task->next_child->prev_child = task; +- task->prev_child->next_child = task; +- } +- else +- { +- task->next_child = task; +- task->prev_child = task; +- parent->children = task; +- } ++ priority_queue_insert (PQ_CHILDREN, &parent->children_queue, ++ task, task->priority, ++ PRIORITY_INSERT_BEGIN, ++ /*adjust_parent_depends_on=*/true, ++ task->parent_depends_on); + if (parent->taskwait) + { + if (parent->taskwait->in_taskwait) + { ++ /* One more task has had its dependencies met. ++ Inform any waiters. */ + parent->taskwait->in_taskwait = false; + gomp_sem_post (&parent->taskwait->taskwait_sem); + } + else if (parent->taskwait->in_depend_wait) + { ++ /* One more task has had its dependencies met. ++ Inform any waiters. */ + parent->taskwait->in_depend_wait = false; + gomp_sem_post (&parent->taskwait->taskwait_sem); + } +- if (parent->taskwait->last_parent_depends_on == NULL +- && task->parent_depends_on) +- parent->taskwait->last_parent_depends_on = task; + } + } + if (taskgroup) + { +- if (taskgroup->children) +- { +- task->next_taskgroup = taskgroup->children; +- task->prev_taskgroup = taskgroup->children->prev_taskgroup; +- task->next_taskgroup->prev_taskgroup = task; +- task->prev_taskgroup->next_taskgroup = task; +- } +- else +- { +- task->next_taskgroup = task; +- task->prev_taskgroup = task; +- } +- taskgroup->children = task; ++ priority_queue_insert (PQ_TASKGROUP, &taskgroup->taskgroup_queue, ++ task, task->priority, ++ PRIORITY_INSERT_BEGIN, ++ /*adjust_parent_depends_on=*/false, ++ task->parent_depends_on); + if (taskgroup->in_taskgroup_wait) + { ++ /* One more task has had its dependencies met. ++ Inform any waiters. */ + taskgroup->in_taskgroup_wait = false; + gomp_sem_post (&taskgroup->taskgroup_sem); + } + } +- if (team->task_queue) +- { +- task->next_queue = team->task_queue; +- task->prev_queue = team->task_queue->prev_queue; +- task->next_queue->prev_queue = task; +- task->prev_queue->next_queue = task; +- } +- else +- { +- task->next_queue = task; +- task->prev_queue = task; +- team->task_queue = task; +- } ++ priority_queue_insert (PQ_TEAM, &team->task_queue, ++ task, task->priority, ++ PRIORITY_INSERT_END, ++ /*adjust_parent_depends_on=*/false, ++ task->parent_depends_on); + ++team->task_count; + ++team->task_queued_count; + ++ret; +@@ -601,12 +1120,18 @@ gomp_task_run_post_handle_depend (struct + return gomp_task_run_post_handle_dependers (child_task, team); + } + ++/* Remove CHILD_TASK from its parent. */ ++ + static inline void + gomp_task_run_post_remove_parent (struct gomp_task *child_task) + { + struct gomp_task *parent = child_task->parent; + if (parent == NULL) + return; ++ ++ /* If this was the last task the parent was depending on, ++ synchronize with gomp_task_maybe_wait_for_dependencies so it can ++ clean up and return. 
*/ + if (__builtin_expect (child_task->parent_depends_on, 0) + && --parent->taskwait->n_depend == 0 + && parent->taskwait->in_depend_wait) +@@ -614,36 +1139,31 @@ gomp_task_run_post_remove_parent (struct + parent->taskwait->in_depend_wait = false; + gomp_sem_post (&parent->taskwait->taskwait_sem); + } +- child_task->prev_child->next_child = child_task->next_child; +- child_task->next_child->prev_child = child_task->prev_child; +- if (parent->children != child_task) +- return; +- if (child_task->next_child != child_task) +- parent->children = child_task->next_child; +- else ++ ++ if (priority_queue_remove (PQ_CHILDREN, &parent->children_queue, ++ child_task, MEMMODEL_RELEASE) ++ && parent->taskwait && parent->taskwait->in_taskwait) + { +- /* We access task->children in GOMP_taskwait +- outside of the task lock mutex region, so +- need a release barrier here to ensure memory +- written by child_task->fn above is flushed +- before the NULL is written. */ +- __atomic_store_n (&parent->children, NULL, MEMMODEL_RELEASE); +- if (parent->taskwait && parent->taskwait->in_taskwait) +- { +- parent->taskwait->in_taskwait = false; +- gomp_sem_post (&parent->taskwait->taskwait_sem); +- } ++ parent->taskwait->in_taskwait = false; ++ gomp_sem_post (&parent->taskwait->taskwait_sem); + } ++ child_task->pnode[PQ_CHILDREN].next = NULL; ++ child_task->pnode[PQ_CHILDREN].prev = NULL; + } + ++/* Remove CHILD_TASK from its taskgroup. */ ++ + static inline void + gomp_task_run_post_remove_taskgroup (struct gomp_task *child_task) + { + struct gomp_taskgroup *taskgroup = child_task->taskgroup; + if (taskgroup == NULL) + return; +- child_task->prev_taskgroup->next_taskgroup = child_task->next_taskgroup; +- child_task->next_taskgroup->prev_taskgroup = child_task->prev_taskgroup; ++ bool empty = priority_queue_remove (PQ_TASKGROUP, ++ &taskgroup->taskgroup_queue, ++ child_task, MEMMODEL_RELAXED); ++ child_task->pnode[PQ_TASKGROUP].next = NULL; ++ child_task->pnode[PQ_TASKGROUP].prev = NULL; + if (taskgroup->num_children > 1) + --taskgroup->num_children; + else +@@ -655,18 +1175,10 @@ gomp_task_run_post_remove_taskgroup (str + before the NULL is written. 
*/ + __atomic_store_n (&taskgroup->num_children, 0, MEMMODEL_RELEASE); + } +- if (taskgroup->children != child_task) +- return; +- if (child_task->next_taskgroup != child_task) +- taskgroup->children = child_task->next_taskgroup; +- else ++ if (empty && taskgroup->in_taskgroup_wait) + { +- taskgroup->children = NULL; +- if (taskgroup->in_taskgroup_wait) +- { +- taskgroup->in_taskgroup_wait = false; +- gomp_sem_post (&taskgroup->taskgroup_sem); +- } ++ taskgroup->in_taskgroup_wait = false; ++ gomp_sem_post (&taskgroup->taskgroup_sem); + } + } + +@@ -696,11 +1208,15 @@ gomp_barrier_handle_tasks (gomp_barrier_ + while (1) + { + bool cancelled = false; +- if (team->task_queue != NULL) ++ if (!priority_queue_empty_p (&team->task_queue, MEMMODEL_RELAXED)) + { +- child_task = team->task_queue; ++ bool ignored; ++ child_task ++ = priority_queue_next_task (PQ_TEAM, &team->task_queue, ++ PQ_IGNORED, NULL, ++ &ignored); + cancelled = gomp_task_run_pre (child_task, child_task->parent, +- child_task->taskgroup, team); ++ team); + if (__builtin_expect (cancelled, 0)) + { + if (to_free) +@@ -729,7 +1245,29 @@ gomp_barrier_handle_tasks (gomp_barrier_ + if (child_task) + { + thr->task = child_task; +- child_task->fn (child_task->fn_data); ++ if (__builtin_expect (child_task->fn == NULL, 0)) ++ { ++ if (gomp_target_task_fn (child_task->fn_data)) ++ { ++ thr->task = task; ++ gomp_mutex_lock (&team->task_lock); ++ child_task->kind = GOMP_TASK_ASYNC_RUNNING; ++ team->task_running_count--; ++ struct gomp_target_task *ttask ++ = (struct gomp_target_task *) child_task->fn_data; ++ /* If GOMP_PLUGIN_target_task_completion has run already ++ in between gomp_target_task_fn and the mutex lock, ++ perform the requeuing here. */ ++ if (ttask->state == GOMP_TARGET_TASK_FINISHED) ++ gomp_target_task_completion (team, child_task); ++ else ++ ttask->state = GOMP_TARGET_TASK_RUNNING; ++ child_task = NULL; ++ continue; ++ } ++ } ++ else ++ child_task->fn (child_task->fn_data); + thr->task = task; + } + else +@@ -741,7 +1279,7 @@ gomp_barrier_handle_tasks (gomp_barrier_ + size_t new_tasks + = gomp_task_run_post_handle_depend (child_task, team); + gomp_task_run_post_remove_parent (child_task); +- gomp_clear_parent (child_task->children); ++ gomp_clear_parent (&child_task->children_queue); + gomp_task_run_post_remove_taskgroup (child_task); + to_free = child_task; + child_task = NULL; +@@ -765,7 +1303,9 @@ gomp_barrier_handle_tasks (gomp_barrier_ + } + } + +-/* Called when encountering a taskwait directive. */ ++/* Called when encountering a taskwait directive. ++ ++ Wait for all children of the current task. */ + + void + GOMP_taskwait (void) +@@ -785,15 +1325,16 @@ GOMP_taskwait (void) + child thread task work function are seen before we exit from + GOMP_taskwait. 
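+
+ Concretely (a sketch): a worker thread runs child_task->fn and then
+ removes the child via priority_queue_remove (..., MEMMODEL_RELEASE)
+ in gomp_task_run_post_remove_parent, so the MEMMODEL_ACQUIRE load
+ below can only see an empty queue after fn's writes are visible.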
*/ + if (task == NULL +- || __atomic_load_n (&task->children, MEMMODEL_ACQUIRE) == NULL) ++ || priority_queue_empty_p (&task->children_queue, MEMMODEL_ACQUIRE)) + return; + + memset (&taskwait, 0, sizeof (taskwait)); ++ bool child_q = false; + gomp_mutex_lock (&team->task_lock); + while (1) + { + bool cancelled = false; +- if (task->children == NULL) ++ if (priority_queue_empty_p (&task->children_queue, MEMMODEL_RELAXED)) + { + bool destroy_taskwait = task->taskwait != NULL; + task->taskwait = NULL; +@@ -807,12 +1348,14 @@ GOMP_taskwait (void) + gomp_sem_destroy (&taskwait.taskwait_sem); + return; + } +- if (task->children->kind == GOMP_TASK_WAITING) ++ struct gomp_task *next_task ++ = priority_queue_next_task (PQ_CHILDREN, &task->children_queue, ++ PQ_TEAM, &team->task_queue, &child_q); ++ if (next_task->kind == GOMP_TASK_WAITING) + { +- child_task = task->children; ++ child_task = next_task; + cancelled +- = gomp_task_run_pre (child_task, task, child_task->taskgroup, +- team); ++ = gomp_task_run_pre (child_task, task, team); + if (__builtin_expect (cancelled, 0)) + { + if (to_free) +@@ -826,8 +1369,10 @@ GOMP_taskwait (void) + } + else + { +- /* All tasks we are waiting for are already running +- in other threads. Wait for them. */ ++ /* All tasks we are waiting for are either running in other ++ threads, or they are tasks that have not had their ++ dependencies met (so they're not even in the queue). Wait ++ for them. */ + if (task->taskwait == NULL) + { + taskwait.in_depend_wait = false; +@@ -851,7 +1396,28 @@ GOMP_taskwait (void) + if (child_task) + { + thr->task = child_task; +- child_task->fn (child_task->fn_data); ++ if (__builtin_expect (child_task->fn == NULL, 0)) ++ { ++ if (gomp_target_task_fn (child_task->fn_data)) ++ { ++ thr->task = task; ++ gomp_mutex_lock (&team->task_lock); ++ child_task->kind = GOMP_TASK_ASYNC_RUNNING; ++ struct gomp_target_task *ttask ++ = (struct gomp_target_task *) child_task->fn_data; ++ /* If GOMP_PLUGIN_target_task_completion has run already ++ in between gomp_target_task_fn and the mutex lock, ++ perform the requeuing here. */ ++ if (ttask->state == GOMP_TARGET_TASK_FINISHED) ++ gomp_target_task_completion (team, child_task); ++ else ++ ttask->state = GOMP_TARGET_TASK_RUNNING; ++ child_task = NULL; ++ continue; ++ } ++ } ++ else ++ child_task->fn (child_task->fn_data); + thr->task = task; + } + else +@@ -862,17 +1428,19 @@ GOMP_taskwait (void) + finish_cancelled:; + size_t new_tasks + = gomp_task_run_post_handle_depend (child_task, team); +- child_task->prev_child->next_child = child_task->next_child; +- child_task->next_child->prev_child = child_task->prev_child; +- if (task->children == child_task) +- { +- if (child_task->next_child != child_task) +- task->children = child_task->next_child; +- else +- task->children = NULL; ++ ++ if (child_q) ++ { ++ priority_queue_remove (PQ_CHILDREN, &task->children_queue, ++ child_task, MEMMODEL_RELAXED); ++ child_task->pnode[PQ_CHILDREN].next = NULL; ++ child_task->pnode[PQ_CHILDREN].prev = NULL; + } +- gomp_clear_parent (child_task->children); ++ ++ gomp_clear_parent (&child_task->children_queue); ++ + gomp_task_run_post_remove_taskgroup (child_task); ++ + to_free = child_task; + child_task = NULL; + team->task_count--; +@@ -887,10 +1455,20 @@ GOMP_taskwait (void) + } + } + +-/* This is like GOMP_taskwait, but we only wait for tasks that the +- upcoming task depends on. */ ++/* An undeferred task is about to run. Wait for all tasks that this ++ undeferred task depends on. 
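++
++ DEPEND uses the same encoding as in GOMP_task. As an illustration
++ (hypothetical variables x and y), for depend(out: x) depend(in: y)
++ the compiler passes { (void *) 2, (void *) 1, &x, &y }: depend[0]
++ is the number of entries, depend[1] the number of out/inout
++ entries, and the out/inout addresses precede the in addresses.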
+
+-static void
++ This is done by first putting all known ready dependencies
++ (dependencies that have their own dependencies met) at the top of
++ the scheduling queues. Then we iterate through these imminently
++ ready tasks (and possibly other high priority tasks), and run them.
++ If we run out of ready dependencies to execute, we either wait for
++ the remaining dependencies to finish, or wait for them to get
++ scheduled so we can run them.
++
++ DEPEND is as in GOMP_task. */
++
++void
+ gomp_task_maybe_wait_for_dependencies (void **depend)
+ {
+ struct gomp_thread *thr = gomp_thread ();
+@@ -898,7 +1476,6 @@ gomp_task_maybe_wait_for_dependencies (v
+ struct gomp_team *team = thr->ts.team;
+ struct gomp_task_depend_entry elem, *ent = NULL;
+ struct gomp_taskwait taskwait;
+- struct gomp_task *last_parent_depends_on = NULL;
+ size_t ndepend = (uintptr_t) depend[0];
+ size_t nout = (uintptr_t) depend[1];
+ size_t i;
+@@ -922,32 +1499,11 @@ gomp_task_maybe_wait_for_dependencies (v
+ {
+ tsk->parent_depends_on = true;
+ ++num_awaited;
++ /* If dependency TSK itself has no dependencies and is
++ ready to run, move it up front so that we run it as
++ soon as possible. */
+ if (tsk->num_dependees == 0 && tsk->kind == GOMP_TASK_WAITING)
+- {
+- /* If a task we need to wait for is not already
+- running and is ready to be scheduled, move it
+- to front, so that we run it as soon as possible. */
+- if (last_parent_depends_on)
+- {
+- tsk->prev_child->next_child = tsk->next_child;
+- tsk->next_child->prev_child = tsk->prev_child;
+- tsk->prev_child = last_parent_depends_on;
+- tsk->next_child = last_parent_depends_on->next_child;
+- tsk->prev_child->next_child = tsk;
+- tsk->next_child->prev_child = tsk;
+- }
+- else if (tsk != task->children)
+- {
+- tsk->prev_child->next_child = tsk->next_child;
+- tsk->next_child->prev_child = tsk->prev_child;
+- tsk->prev_child = task->children;
+- tsk->next_child = task->children->next_child;
+- task->children = tsk;
+- tsk->prev_child->next_child = tsk;
+- tsk->next_child->prev_child = tsk;
+- }
+- last_parent_depends_on = tsk;
+- }
++ priority_queue_upgrade_task (tsk, task);
+ }
+ }
+ }
+@@ -959,7 +1515,6 @@ gomp_task_maybe_wait_for_dependencies (v
+
+ memset (&taskwait, 0, sizeof (taskwait));
+ taskwait.n_depend = num_awaited;
+- taskwait.last_parent_depends_on = last_parent_depends_on;
+ gomp_sem_init (&taskwait.taskwait_sem, 0);
+ task->taskwait = &taskwait;
+
+@@ -978,12 +1533,30 @@ gomp_task_maybe_wait_for_dependencies (v
+ gomp_sem_destroy (&taskwait.taskwait_sem);
+ return;
+ }
+- if (task->children->kind == GOMP_TASK_WAITING)
++
++ /* Theoretically when we have multiple priorities, we should
++ choose between the highest priority item in
++ task->children_queue and team->task_queue here, so we should
++ use priority_queue_next_task(). However, since we are
++ running an undeferred task, perhaps that makes all tasks it
++ depends on undeferred, thus a priority of INF? This would
++ make it unnecessary to take anything into account here,
++ but the dependencies.
++
++ On the other hand, if we want to use priority_queue_next_task(),
++ care should be taken to only use priority_queue_remove()
++ below if the task was actually removed from the children
++ queue.
*/ ++ bool ignored; ++ struct gomp_task *next_task ++ = priority_queue_next_task (PQ_CHILDREN, &task->children_queue, ++ PQ_IGNORED, NULL, &ignored); ++ ++ if (next_task->kind == GOMP_TASK_WAITING) + { +- child_task = task->children; ++ child_task = next_task; + cancelled +- = gomp_task_run_pre (child_task, task, child_task->taskgroup, +- team); ++ = gomp_task_run_pre (child_task, task, team); + if (__builtin_expect (cancelled, 0)) + { + if (to_free) +@@ -996,8 +1569,10 @@ gomp_task_maybe_wait_for_dependencies (v + } + } + else +- /* All tasks we are waiting for are already running +- in other threads. Wait for them. */ ++ /* All tasks we are waiting for are either running in other ++ threads, or they are tasks that have not had their ++ dependencies met (so they're not even in the queue). Wait ++ for them. */ + taskwait.in_depend_wait = true; + gomp_mutex_unlock (&team->task_lock); + if (do_wake) +@@ -1014,7 +1589,28 @@ gomp_task_maybe_wait_for_dependencies (v + if (child_task) + { + thr->task = child_task; +- child_task->fn (child_task->fn_data); ++ if (__builtin_expect (child_task->fn == NULL, 0)) ++ { ++ if (gomp_target_task_fn (child_task->fn_data)) ++ { ++ thr->task = task; ++ gomp_mutex_lock (&team->task_lock); ++ child_task->kind = GOMP_TASK_ASYNC_RUNNING; ++ struct gomp_target_task *ttask ++ = (struct gomp_target_task *) child_task->fn_data; ++ /* If GOMP_PLUGIN_target_task_completion has run already ++ in between gomp_target_task_fn and the mutex lock, ++ perform the requeuing here. */ ++ if (ttask->state == GOMP_TARGET_TASK_FINISHED) ++ gomp_target_task_completion (team, child_task); ++ else ++ ttask->state = GOMP_TARGET_TASK_RUNNING; ++ child_task = NULL; ++ continue; ++ } ++ } ++ else ++ child_task->fn (child_task->fn_data); + thr->task = task; + } + else +@@ -1027,16 +1623,13 @@ gomp_task_maybe_wait_for_dependencies (v + = gomp_task_run_post_handle_depend (child_task, team); + if (child_task->parent_depends_on) + --taskwait.n_depend; +- child_task->prev_child->next_child = child_task->next_child; +- child_task->next_child->prev_child = child_task->prev_child; +- if (task->children == child_task) +- { +- if (child_task->next_child != child_task) +- task->children = child_task->next_child; +- else +- task->children = NULL; +- } +- gomp_clear_parent (child_task->children); ++ ++ priority_queue_remove (PQ_CHILDREN, &task->children_queue, ++ child_task, MEMMODEL_RELAXED); ++ child_task->pnode[PQ_CHILDREN].next = NULL; ++ child_task->pnode[PQ_CHILDREN].prev = NULL; ++ ++ gomp_clear_parent (&child_task->children_queue); + gomp_task_run_post_remove_taskgroup (child_task); + to_free = child_task; + child_task = NULL; +@@ -1069,14 +1662,14 @@ GOMP_taskgroup_start (void) + struct gomp_taskgroup *taskgroup; + + /* If team is NULL, all tasks are executed as +- GOMP_TASK_IFFALSE tasks and thus all children tasks of ++ GOMP_TASK_UNDEFERRED tasks and thus all children tasks of + taskgroup and their descendant tasks will be finished + by the time GOMP_taskgroup_end is called. 
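+
+ For example (a sketch, with a hypothetical function f), in a
+ sequential region:
+
+ #pragma omp taskgroup
+ {
+ #pragma omp task
+ f (); /* runs at once, as GOMP_TASK_UNDEFERRED */
+ }
+
+ leaves GOMP_taskgroup_end with nothing to wait for.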
*/ + if (team == NULL) + return; + taskgroup = gomp_malloc (sizeof (struct gomp_taskgroup)); + taskgroup->prev = task->taskgroup; +- taskgroup->children = NULL; ++ priority_queue_init (&taskgroup->taskgroup_queue); + taskgroup->in_taskgroup_wait = false; + taskgroup->cancelled = false; + taskgroup->num_children = 0; +@@ -1098,6 +1691,17 @@ GOMP_taskgroup_end (void) + if (team == NULL) + return; + taskgroup = task->taskgroup; ++ if (__builtin_expect (taskgroup == NULL, 0) ++ && thr->ts.level == 0) ++ { ++ /* This can happen if GOMP_taskgroup_start is called when ++ thr->ts.team == NULL, but inside of the taskgroup there ++ is #pragma omp target nowait that creates an implicit ++ team with a single thread. In this case, we want to wait ++ for all outstanding tasks in this team. */ ++ gomp_team_barrier_wait (&team->barrier); ++ return; ++ } + + /* The acquire barrier on load of taskgroup->num_children here + synchronizes with the write of 0 in gomp_task_run_post_remove_taskgroup. +@@ -1108,19 +1712,25 @@ GOMP_taskgroup_end (void) + if (__atomic_load_n (&taskgroup->num_children, MEMMODEL_ACQUIRE) == 0) + goto finish; + ++ bool unused; + gomp_mutex_lock (&team->task_lock); + while (1) + { + bool cancelled = false; +- if (taskgroup->children == NULL) ++ if (priority_queue_empty_p (&taskgroup->taskgroup_queue, ++ MEMMODEL_RELAXED)) + { + if (taskgroup->num_children) + { +- if (task->children == NULL) ++ if (priority_queue_empty_p (&task->children_queue, ++ MEMMODEL_RELAXED)) + goto do_wait; +- child_task = task->children; +- } +- else ++ child_task ++ = priority_queue_next_task (PQ_CHILDREN, &task->children_queue, ++ PQ_TEAM, &team->task_queue, ++ &unused); ++ } ++ else + { + gomp_mutex_unlock (&team->task_lock); + if (to_free) +@@ -1132,12 +1742,13 @@ GOMP_taskgroup_end (void) + } + } + else +- child_task = taskgroup->children; ++ child_task ++ = priority_queue_next_task (PQ_TASKGROUP, &taskgroup->taskgroup_queue, ++ PQ_TEAM, &team->task_queue, &unused); + if (child_task->kind == GOMP_TASK_WAITING) + { + cancelled +- = gomp_task_run_pre (child_task, child_task->parent, taskgroup, +- team); ++ = gomp_task_run_pre (child_task, child_task->parent, team); + if (__builtin_expect (cancelled, 0)) + { + if (to_free) +@@ -1153,8 +1764,10 @@ GOMP_taskgroup_end (void) + { + child_task = NULL; + do_wait: +- /* All tasks we are waiting for are already running +- in other threads. Wait for them. */ ++ /* All tasks we are waiting for are either running in other ++ threads, or they are tasks that have not had their ++ dependencies met (so they're not even in the queue). Wait ++ for them. */ + taskgroup->in_taskgroup_wait = true; + } + gomp_mutex_unlock (&team->task_lock); +@@ -1172,7 +1785,28 @@ GOMP_taskgroup_end (void) + if (child_task) + { + thr->task = child_task; +- child_task->fn (child_task->fn_data); ++ if (__builtin_expect (child_task->fn == NULL, 0)) ++ { ++ if (gomp_target_task_fn (child_task->fn_data)) ++ { ++ thr->task = task; ++ gomp_mutex_lock (&team->task_lock); ++ child_task->kind = GOMP_TASK_ASYNC_RUNNING; ++ struct gomp_target_task *ttask ++ = (struct gomp_target_task *) child_task->fn_data; ++ /* If GOMP_PLUGIN_target_task_completion has run already ++ in between gomp_target_task_fn and the mutex lock, ++ perform the requeuing here. 
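++
++ Two interleavings are possible (a sketch):
++ 1) the completion callback won the race: ttask->state is
++ already GOMP_TARGET_TASK_FINISHED and we requeue here;
++ 2) we won the race: the task is marked
++ GOMP_TARGET_TASK_RUNNING and the callback performs the
++ requeuing once the asynchronous run completes.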
*/ ++ if (ttask->state == GOMP_TARGET_TASK_FINISHED) ++ gomp_target_task_completion (team, child_task); ++ else ++ ttask->state = GOMP_TARGET_TASK_RUNNING; ++ child_task = NULL; ++ continue; ++ } ++ } ++ else ++ child_task->fn (child_task->fn_data); + thr->task = task; + } + else +@@ -1184,7 +1818,7 @@ GOMP_taskgroup_end (void) + size_t new_tasks + = gomp_task_run_post_handle_depend (child_task, team); + gomp_task_run_post_remove_parent (child_task); +- gomp_clear_parent (child_task->children); ++ gomp_clear_parent (&child_task->children_queue); + gomp_task_run_post_remove_taskgroup (child_task); + to_free = child_task; + child_task = NULL; +--- libgomp/libgomp_g.h.jj 2014-05-15 10:56:31.429532978 +0200 ++++ libgomp/libgomp_g.h 2016-07-13 16:57:04.422535521 +0200 +@@ -29,6 +29,7 @@ + #define LIBGOMP_G_H 1 + + #include <stdbool.h> ++#include <stddef.h> + + /* barrier.c */ + +@@ -50,6 +51,10 @@ extern bool GOMP_loop_static_start (long + extern bool GOMP_loop_dynamic_start (long, long, long, long, long *, long *); + extern bool GOMP_loop_guided_start (long, long, long, long, long *, long *); + extern bool GOMP_loop_runtime_start (long, long, long, long *, long *); ++extern bool GOMP_loop_nonmonotonic_dynamic_start (long, long, long, long, ++ long *, long *); ++extern bool GOMP_loop_nonmonotonic_guided_start (long, long, long, long, ++ long *, long *); + + extern bool GOMP_loop_ordered_static_start (long, long, long, long, + long *, long *); +@@ -63,12 +68,23 @@ extern bool GOMP_loop_static_next (long + extern bool GOMP_loop_dynamic_next (long *, long *); + extern bool GOMP_loop_guided_next (long *, long *); + extern bool GOMP_loop_runtime_next (long *, long *); ++extern bool GOMP_loop_nonmonotonic_dynamic_next (long *, long *); ++extern bool GOMP_loop_nonmonotonic_guided_next (long *, long *); + + extern bool GOMP_loop_ordered_static_next (long *, long *); + extern bool GOMP_loop_ordered_dynamic_next (long *, long *); + extern bool GOMP_loop_ordered_guided_next (long *, long *); + extern bool GOMP_loop_ordered_runtime_next (long *, long *); + ++extern bool GOMP_loop_doacross_static_start (unsigned, long *, long, long *, ++ long *); ++extern bool GOMP_loop_doacross_dynamic_start (unsigned, long *, long, long *, ++ long *); ++extern bool GOMP_loop_doacross_guided_start (unsigned, long *, long, long *, ++ long *); ++extern bool GOMP_loop_doacross_runtime_start (unsigned, long *, long *, ++ long *); ++ + extern void GOMP_parallel_loop_static_start (void (*)(void *), void *, + unsigned, long, long, long, long); + extern void GOMP_parallel_loop_dynamic_start (void (*)(void *), void *, +@@ -89,6 +105,12 @@ extern void GOMP_parallel_loop_guided (v + extern void GOMP_parallel_loop_runtime (void (*)(void *), void *, + unsigned, long, long, long, + unsigned); ++extern void GOMP_parallel_loop_nonmonotonic_dynamic (void (*)(void *), void *, ++ unsigned, long, long, ++ long, long, unsigned); ++extern void GOMP_parallel_loop_nonmonotonic_guided (void (*)(void *), void *, ++ unsigned, long, long, ++ long, long, unsigned); + + extern void GOMP_loop_end (void); + extern void GOMP_loop_end_nowait (void); +@@ -119,6 +141,18 @@ extern bool GOMP_loop_ull_runtime_start + unsigned long long, + unsigned long long *, + unsigned long long *); ++extern bool GOMP_loop_ull_nonmonotonic_dynamic_start (bool, unsigned long long, ++ unsigned long long, ++ unsigned long long, ++ unsigned long long, ++ unsigned long long *, ++ unsigned long long *); ++extern bool GOMP_loop_ull_nonmonotonic_guided_start (bool, unsigned long 
long, ++ unsigned long long, ++ unsigned long long, ++ unsigned long long, ++ unsigned long long *, ++ unsigned long long *); + + extern bool GOMP_loop_ull_ordered_static_start (bool, unsigned long long, + unsigned long long, +@@ -152,6 +186,10 @@ extern bool GOMP_loop_ull_guided_next (u + unsigned long long *); + extern bool GOMP_loop_ull_runtime_next (unsigned long long *, + unsigned long long *); ++extern bool GOMP_loop_ull_nonmonotonic_dynamic_next (unsigned long long *, ++ unsigned long long *); ++extern bool GOMP_loop_ull_nonmonotonic_guided_next (unsigned long long *, ++ unsigned long long *); + + extern bool GOMP_loop_ull_ordered_static_next (unsigned long long *, + unsigned long long *); +@@ -162,10 +200,34 @@ extern bool GOMP_loop_ull_ordered_guided + extern bool GOMP_loop_ull_ordered_runtime_next (unsigned long long *, + unsigned long long *); + ++extern bool GOMP_loop_ull_doacross_static_start (unsigned, ++ unsigned long long *, ++ unsigned long long, ++ unsigned long long *, ++ unsigned long long *); ++extern bool GOMP_loop_ull_doacross_dynamic_start (unsigned, ++ unsigned long long *, ++ unsigned long long, ++ unsigned long long *, ++ unsigned long long *); ++extern bool GOMP_loop_ull_doacross_guided_start (unsigned, ++ unsigned long long *, ++ unsigned long long, ++ unsigned long long *, ++ unsigned long long *); ++extern bool GOMP_loop_ull_doacross_runtime_start (unsigned, ++ unsigned long long *, ++ unsigned long long *, ++ unsigned long long *); ++ + /* ordered.c */ + + extern void GOMP_ordered_start (void); + extern void GOMP_ordered_end (void); ++extern void GOMP_doacross_post (long *); ++extern void GOMP_doacross_wait (long, ...); ++extern void GOMP_doacross_ull_post (unsigned long long *); ++extern void GOMP_doacross_ull_wait (unsigned long long, ...); + + /* parallel.c */ + +@@ -178,7 +240,15 @@ extern bool GOMP_cancellation_point (int + /* task.c */ + + extern void GOMP_task (void (*) (void *), void *, void (*) (void *, void *), +- long, long, bool, unsigned, void **); ++ long, long, bool, unsigned, void **, int); ++extern void GOMP_taskloop (void (*) (void *), void *, ++ void (*) (void *, void *), long, long, unsigned, ++ unsigned long, int, long, long, long); ++extern void GOMP_taskloop_ull (void (*) (void *), void *, ++ void (*) (void *, void *), long, long, ++ unsigned, unsigned long, int, ++ unsigned long long, unsigned long long, ++ unsigned long long); + extern void GOMP_taskwait (void); + extern void GOMP_taskyield (void); + extern void GOMP_taskgroup_start (void); +@@ -206,11 +276,38 @@ extern void GOMP_single_copy_end (void * + + extern void GOMP_target (int, void (*) (void *), const void *, + size_t, void **, size_t *, unsigned char *); ++extern void GOMP_target_ext (int, void (*) (void *), size_t, void **, size_t *, ++ unsigned short *, unsigned int, void **, void **); + extern void GOMP_target_data (int, const void *, + size_t, void **, size_t *, unsigned char *); ++extern void GOMP_target_data_ext (int, size_t, void **, size_t *, ++ unsigned short *); + extern void GOMP_target_end_data (void); + extern void GOMP_target_update (int, const void *, + size_t, void **, size_t *, unsigned char *); ++extern void GOMP_target_update_ext (int, size_t, void **, size_t *, ++ unsigned short *, unsigned int, void **); ++extern void GOMP_target_enter_exit_data (int, size_t, void **, size_t *, ++ unsigned short *, unsigned int, ++ void **); + extern void GOMP_teams (unsigned int, unsigned int); + ++/* oacc-parallel.c */ ++ ++extern void GOACC_parallel_keyed (int, void 
(*) (void *), size_t, ++ void **, size_t *, unsigned short *, ...); ++extern void GOACC_parallel (int, void (*) (void *), size_t, void **, size_t *, ++ unsigned short *, int, int, int, int, int, ...); ++extern void GOACC_data_start (int, size_t, void **, size_t *, ++ unsigned short *); ++extern void GOACC_data_end (void); ++extern void GOACC_enter_exit_data (int, size_t, void **, ++ size_t *, unsigned short *, int, int, ...); ++extern void GOACC_update (int, size_t, void **, size_t *, ++ unsigned short *, int, int, ...); ++extern void GOACC_wait (int, int, ...); ++extern int GOACC_get_num_threads (void); ++extern int GOACC_get_thread_num (void); ++extern void GOACC_declare (int, size_t, void **, size_t *, unsigned short *); ++ + #endif /* LIBGOMP_G_H */ +--- libgomp/libgomp.h.jj 2014-08-01 15:59:49.145188127 +0200 ++++ libgomp/libgomp.h 2016-07-14 17:40:24.038243456 +0200 +@@ -34,12 +34,35 @@ + #ifndef LIBGOMP_H + #define LIBGOMP_H 1 + ++#ifndef _LIBGOMP_CHECKING_ ++/* Define to 1 to perform internal sanity checks. */ ++#define _LIBGOMP_CHECKING_ 0 ++#endif ++ + #include "config.h" + #include "gstdint.h" ++#include "libgomp-plugin.h" + + #include <pthread.h> + #include <stdbool.h> + #include <stdlib.h> ++#include <stdarg.h> ++ ++/* Needed for memset in priority_queue.c. */ ++#if _LIBGOMP_CHECKING_ ++# ifdef STRING_WITH_STRINGS ++# include <string.h> ++# include <strings.h> ++# else ++# ifdef HAVE_STRING_H ++# include <string.h> ++# else ++# ifdef HAVE_STRINGS_H ++# include <strings.h> ++# endif ++# endif ++# endif ++#endif + + #ifdef HAVE_ATTRIBUTE_VISIBILITY + # pragma GCC visibility push(hidden) +@@ -56,6 +79,44 @@ enum memmodel + MEMMODEL_SEQ_CST = 5 + }; + ++/* alloc.c */ ++ ++extern void *gomp_malloc (size_t) __attribute__((malloc)); ++extern void *gomp_malloc_cleared (size_t) __attribute__((malloc)); ++extern void *gomp_realloc (void *, size_t); ++ ++/* Avoid conflicting prototypes of alloca() in system headers by using ++ GCC's builtin alloca(). */ ++#define gomp_alloca(x) __builtin_alloca(x) ++ ++/* error.c */ ++ ++extern void gomp_vdebug (int, const char *, va_list); ++extern void gomp_debug (int, const char *, ...) ++ __attribute__ ((format (printf, 2, 3))); ++#define gomp_vdebug(KIND, FMT, VALIST) \ ++ do { \ ++ if (__builtin_expect (gomp_debug_var, 0)) \ ++ (gomp_vdebug) ((KIND), (FMT), (VALIST)); \ ++ } while (0) ++#define gomp_debug(KIND, ...) \ ++ do { \ ++ if (__builtin_expect (gomp_debug_var, 0)) \ ++ (gomp_debug) ((KIND), __VA_ARGS__); \ ++ } while (0) ++extern void gomp_verror (const char *, va_list); ++extern void gomp_error (const char *, ...) ++ __attribute__ ((format (printf, 1, 2))); ++extern void gomp_vfatal (const char *, va_list) ++ __attribute__ ((noreturn)); ++extern void gomp_fatal (const char *, ...) ++ __attribute__ ((noreturn, format (printf, 1, 2))); ++ ++struct gomp_task; ++struct gomp_taskgroup; ++struct htab; ++ ++#include "priority_queue.h" + #include "sem.h" + #include "mutex.h" + #include "bar.h" +@@ -74,6 +135,44 @@ enum gomp_schedule_type + GFS_AUTO + }; + ++struct gomp_doacross_work_share ++{ ++ union { ++ /* chunk_size copy, as ws->chunk_size is multiplied by incr for ++ GFS_DYNAMIC. */ ++ long chunk_size; ++ /* Likewise, but for ull implementation. */ ++ unsigned long long chunk_size_ull; ++ /* For schedule(static,0) this is the number ++ of iterations assigned to the last thread, i.e. number of ++ iterations / number of threads. */ ++ long q; ++ /* Likewise, but for ull implementation. 
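++
++ Worked example (illustrative numbers): 10 iterations on 4
++ threads give q = 10 / 4 = 2, t = 10 % 4 = 2 (see the T and
++ BOUNDARY fields below) and boundary = t * (q + 1) = 6; threads
++ 0 and 1 run 3 iterations each, threads 2 and 3 run 2 each.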
*/ ++ unsigned long long q_ull; ++ }; ++ /* Size of each array entry (padded to cache line size). */ ++ unsigned long elt_sz; ++ /* Number of dimensions in sink vectors. */ ++ unsigned int ncounts; ++ /* True if the iterations can be flattened. */ ++ bool flattened; ++ /* Actual array (of elt_sz sized units), aligned to cache line size. ++ This is indexed by team_id for GFS_STATIC and outermost iteration ++ / chunk_size for other schedules. */ ++ unsigned char *array; ++ /* These two are only used for schedule(static,0). */ ++ /* This one is number of iterations % number of threads. */ ++ long t; ++ union { ++ /* And this one is cached t * (q + 1). */ ++ long boundary; ++ /* Likewise, but for the ull implementation. */ ++ unsigned long long boundary_ull; ++ }; ++ /* Array of shift counts for each dimension if they can be flattened. */ ++ unsigned int shift_counts[]; ++}; ++ + struct gomp_work_share + { + /* This member records the SCHEDULE clause to be used for this construct. +@@ -105,13 +204,18 @@ struct gomp_work_share + }; + }; + +- /* This is a circular queue that details which threads will be allowed +- into the ordered region and in which order. When a thread allocates +- iterations on which it is going to work, it also registers itself at +- the end of the array. When a thread reaches the ordered region, it +- checks to see if it is the one at the head of the queue. If not, it +- blocks on its RELEASE semaphore. */ +- unsigned *ordered_team_ids; ++ union { ++ /* This is a circular queue that details which threads will be allowed ++ into the ordered region and in which order. When a thread allocates ++ iterations on which it is going to work, it also registers itself at ++ the end of the array. When a thread reaches the ordered region, it ++ checks to see if it is the one at the head of the queue. If not, it ++ blocks on its RELEASE semaphore. */ ++ unsigned *ordered_team_ids; ++ ++ /* This is a pointer to DOACROSS work share data. */ ++ struct gomp_doacross_work_share *doacross; ++ }; + + /* This is the number of threads that have registered themselves in + the circular queue ordered_team_ids. */ +@@ -230,7 +334,7 @@ struct gomp_task_icv + { + unsigned long nthreads_var; + enum gomp_schedule_type run_sched_var; +- int run_sched_modifier; ++ int run_sched_chunk_size; + int default_device_var; + unsigned int thread_limit_var; + bool dyn_var; +@@ -246,6 +350,7 @@ extern gomp_mutex_t gomp_managed_threads + #endif + extern unsigned long gomp_max_active_levels_var; + extern bool gomp_cancel_var; ++extern int gomp_max_task_priority_var; + extern unsigned long long gomp_spin_count_var, gomp_throttled_spin_count_var; + extern unsigned long gomp_available_cpus, gomp_managed_threads; + extern unsigned long *gomp_nthreads_var_list, gomp_nthreads_var_list_len; +@@ -253,25 +358,36 @@ extern char *gomp_bind_var_list; + extern unsigned long gomp_bind_var_list_len; + extern void **gomp_places_list; + extern unsigned long gomp_places_list_len; ++extern int gomp_debug_var; ++extern int goacc_device_num; ++extern char *goacc_device_type; + + enum gomp_task_kind + { ++ /* Implicit task. */ + GOMP_TASK_IMPLICIT, +- GOMP_TASK_IFFALSE, ++ /* Undeferred task. */ ++ GOMP_TASK_UNDEFERRED, ++ /* Task created by GOMP_task and waiting to be run. */ + GOMP_TASK_WAITING, +- GOMP_TASK_TIED ++ /* Task currently executing or scheduled and about to execute. */ ++ GOMP_TASK_TIED, ++ /* Used for target tasks that have vars mapped and async run started, ++ but not yet completed. 
Once that completes, they will be readded ++ into the queues as GOMP_TASK_WAITING in order to perform the var ++ unmapping. */ ++ GOMP_TASK_ASYNC_RUNNING + }; + +-struct gomp_task; +-struct gomp_taskgroup; +-struct htab; +- + struct gomp_task_depend_entry + { ++ /* Address of dependency. */ + void *addr; + struct gomp_task_depend_entry *next; + struct gomp_task_depend_entry *prev; ++ /* Task that provides the dependency in ADDR. */ + struct gomp_task *task; ++ /* Depend entry is of type "IN". */ + bool is_in; + bool redundant; + bool redundant_out; +@@ -290,8 +406,8 @@ struct gomp_taskwait + { + bool in_taskwait; + bool in_depend_wait; ++ /* Number of tasks we are waiting for. */ + size_t n_depend; +- struct gomp_task *last_parent_depends_on; + gomp_sem_t taskwait_sem; + }; + +@@ -299,20 +415,31 @@ struct gomp_taskwait + + struct gomp_task + { ++ /* Parent of this task. */ + struct gomp_task *parent; +- struct gomp_task *children; +- struct gomp_task *next_child; +- struct gomp_task *prev_child; +- struct gomp_task *next_queue; +- struct gomp_task *prev_queue; +- struct gomp_task *next_taskgroup; +- struct gomp_task *prev_taskgroup; ++ /* Children of this task. */ ++ struct priority_queue children_queue; ++ /* Taskgroup this task belongs in. */ + struct gomp_taskgroup *taskgroup; ++ /* Tasks that depend on this task. */ + struct gomp_dependers_vec *dependers; + struct htab *depend_hash; + struct gomp_taskwait *taskwait; ++ /* Number of items in DEPEND. */ + size_t depend_count; ++ /* Number of tasks this task depends on. Once this counter reaches ++ 0, we have no unsatisfied dependencies, and this task can be put ++ into the various queues to be scheduled. */ + size_t num_dependees; ++ ++ /* Priority of this task. */ ++ int priority; ++ /* The priority node for this task in each of the different queues. ++ We put this here to avoid allocating space for each priority ++ node. Then we play offsetof() games to convert between pnode[] ++ entries and the gomp_task in which they reside. */ ++ struct priority_node pnode[3]; ++ + struct gomp_task_icv icv; + void (*fn) (void *); + void *fn_data; +@@ -320,20 +447,58 @@ struct gomp_task + bool in_tied_task; + bool final_task; + bool copy_ctors_done; ++ /* Set for undeferred tasks with unsatisfied dependencies which ++ block further execution of their parent until the dependencies ++ are satisfied. */ + bool parent_depends_on; ++ /* Dependencies provided and/or needed for this task. DEPEND_COUNT ++ is the number of items available. */ + struct gomp_task_depend_entry depend[]; + }; + ++/* This structure describes a single #pragma omp taskgroup. */ ++ + struct gomp_taskgroup + { + struct gomp_taskgroup *prev; +- struct gomp_task *children; ++ /* Queue of tasks that belong in this taskgroup. */ ++ struct priority_queue taskgroup_queue; + bool in_taskgroup_wait; + bool cancelled; + gomp_sem_t taskgroup_sem; + size_t num_children; + }; + ++/* Various state of OpenMP async offloading tasks. */ ++enum gomp_target_task_state ++{ ++ GOMP_TARGET_TASK_DATA, ++ GOMP_TARGET_TASK_BEFORE_MAP, ++ GOMP_TARGET_TASK_FALLBACK, ++ GOMP_TARGET_TASK_READY_TO_RUN, ++ GOMP_TARGET_TASK_RUNNING, ++ GOMP_TARGET_TASK_FINISHED ++}; ++ ++/* This structure describes a target task. 
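++
++ HOSTADDRS being a flexible array member, a target task is
++ allocated in a single block, roughly (a sketch; the real
++ allocation also reserves room for copies of SIZES and KINDS):
++
++ ttask = gomp_malloc (sizeof (struct gomp_target_task)
++ + mapnum * sizeof (void *));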
*/ ++ ++struct gomp_target_task ++{ ++ struct gomp_device_descr *devicep; ++ void (*fn) (void *); ++ size_t mapnum; ++ size_t *sizes; ++ unsigned short *kinds; ++ unsigned int flags; ++ enum gomp_target_task_state state; ++ struct target_mem_desc *tgt; ++ struct gomp_task *task; ++ struct gomp_team *team; ++ /* Device-specific target arguments. */ ++ void **args; ++ void *hostaddrs[]; ++}; ++ + /* This structure describes a "team" of threads. These are the threads + that are spawned by a PARALLEL constructs, as well as the work sharing + constructs that the team encounters. */ +@@ -396,7 +561,8 @@ struct gomp_team + struct gomp_work_share work_shares[8]; + + gomp_mutex_t task_lock; +- struct gomp_task *task_queue; ++ /* Scheduled tasks. */ ++ struct priority_queue task_queue; + /* Number of all GOMP_TASK_{WAITING,TIED} tasks in the team. */ + unsigned int task_count; + /* Number of GOMP_TASK_WAITING tasks currently waiting to be scheduled. */ +@@ -451,6 +617,9 @@ struct gomp_thread_pool + struct gomp_thread **threads; + unsigned threads_size; + unsigned threads_used; ++ /* The last team is used for non-nested teams to delay their destruction to ++ make sure all the threads in the team move on to the pool's barrier before ++ the team's barrier is destroyed. */ + struct gomp_team *last_team; + /* Number of threads running in this contention group. */ + unsigned long threads_busy; +@@ -519,23 +688,7 @@ extern bool gomp_affinity_same_place (vo + extern bool gomp_affinity_finalize_place_list (bool); + extern bool gomp_affinity_init_level (int, unsigned long, bool); + extern void gomp_affinity_print_place (void *); +- +-/* alloc.c */ +- +-extern void *gomp_malloc (size_t) __attribute__((malloc)); +-extern void *gomp_malloc_cleared (size_t) __attribute__((malloc)); +-extern void *gomp_realloc (void *, size_t); +- +-/* Avoid conflicting prototypes of alloca() in system headers by using +- GCC's builtin alloca(). */ +-#define gomp_alloca(x) __builtin_alloca(x) +- +-/* error.c */ +- +-extern void gomp_error (const char *, ...) +- __attribute__((format (printf, 1, 2))); +-extern void gomp_fatal (const char *, ...) +- __attribute__((noreturn, format (printf, 1, 2))); ++extern void gomp_get_place_proc_ids_8 (int, int64_t *); + + /* iter.c */ + +@@ -572,6 +725,9 @@ extern void gomp_ordered_next (void); + extern void gomp_ordered_static_init (void); + extern void gomp_ordered_static_next (void); + extern void gomp_ordered_sync (void); ++extern void gomp_doacross_init (unsigned, long *, long); ++extern void gomp_doacross_ull_init (unsigned, unsigned long long *, ++ unsigned long long); + + /* parallel.c */ + +@@ -588,6 +744,12 @@ extern void gomp_init_task (struct gomp_ + struct gomp_task_icv *); + extern void gomp_end_task (void); + extern void gomp_barrier_handle_tasks (gomp_barrier_state_t); ++extern void gomp_task_maybe_wait_for_dependencies (void **); ++extern bool gomp_create_target_task (struct gomp_device_descr *, ++ void (*) (void *), size_t, void **, ++ size_t *, unsigned short *, unsigned int, ++ void **, void **, ++ enum gomp_target_task_state); + + static void inline + gomp_finish_task (struct gomp_task *task) +@@ -606,7 +768,213 @@ extern void gomp_free_thread (void *); + + /* target.c */ + ++extern void gomp_init_targets_once (void); + extern int gomp_get_num_devices (void); ++extern bool gomp_target_task_fn (void *); ++ ++/* Splay tree definitions. 
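++
++ Keys are host address ranges, and splay_compare below treats any
++ two overlapping [host_start, host_end) ranges as equal, so e.g.
++ (illustrative addresses) looking up [0x1008, 0x1009) finds an
++ existing mapping of [0x1000, 0x1010).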
*/
++typedef struct splay_tree_node_s *splay_tree_node;
++typedef struct splay_tree_s *splay_tree;
++typedef struct splay_tree_key_s *splay_tree_key;
++
++struct target_var_desc {
++ /* Splay key. */
++ splay_tree_key key;
++ /* True if data should be copied from device to host at the end. */
++ bool copy_from;
++ /* True if data should always be copied from device to host at the end. */
++ bool always_copy_from;
++ /* Relative offset against key host_start. */
++ uintptr_t offset;
++ /* Actual length. */
++ uintptr_t length;
++};
++
++struct target_mem_desc {
++ /* Reference count. */
++ uintptr_t refcount;
++ /* All the splay nodes allocated together. */
++ splay_tree_node array;
++ /* Start of the target region. */
++ uintptr_t tgt_start;
++ /* End of the target region. */
++ uintptr_t tgt_end;
++ /* Handle to free. */
++ void *to_free;
++ /* Previous target_mem_desc. */
++ struct target_mem_desc *prev;
++ /* Number of items in following list. */
++ size_t list_count;
++
++ /* Corresponding target device descriptor. */
++ struct gomp_device_descr *device_descr;
++
++ /* List of target items to remove (or decrease refcount)
++ at the end of region. */
++ struct target_var_desc list[];
++};
++
++/* Special value for refcount - infinity. */
++#define REFCOUNT_INFINITY (~(uintptr_t) 0)
++/* Special value for refcount - tgt_offset contains target address of the
++ artificial pointer to "omp declare target link" object. */
++#define REFCOUNT_LINK (~(uintptr_t) 1)
++
++struct splay_tree_key_s {
++ /* Address of the host object. */
++ uintptr_t host_start;
++ /* Address immediately after the host object. */
++ uintptr_t host_end;
++ /* Descriptor of the target memory. */
++ struct target_mem_desc *tgt;
++ /* Offset from tgt->tgt_start to the start of the target object. */
++ uintptr_t tgt_offset;
++ /* Reference count. */
++ uintptr_t refcount;
++ /* Pointer to the original mapping of "omp declare target link" object. */
++ splay_tree_key link_key;
++};
++
++/* The comparison function. */
++
++static inline int
++splay_compare (splay_tree_key x, splay_tree_key y)
++{
++ if (x->host_start == x->host_end
++ && y->host_start == y->host_end)
++ return 0;
++ if (x->host_end <= y->host_start)
++ return -1;
++ if (x->host_start >= y->host_end)
++ return 1;
++ return 0;
++}
++
++#include "splay-tree.h"
++
++typedef struct acc_dispatch_t
++{
++ /* This is a linked list of data mapped using the
++ acc_map_data/acc_unmap_data or "acc enter data"/"acc exit data" pragmas.
++ Unlike mapped_data in the goacc_thread struct, unmapping can
++ happen out-of-order with respect to mapping. */
++ /* This is guarded by the lock in the "outer" struct gomp_device_descr. */
++ struct target_mem_desc *data_environ;
++
++ /* Execute. */
++ void (*exec_func) (void (*) (void *), size_t, void **, void **, int,
++ unsigned *, void *);
++
++ /* Async cleanup callback registration. */
++ void (*register_async_cleanup_func) (void *, int);
++
++ /* Asynchronous routines. */
++ int (*async_test_func) (int);
++ int (*async_test_all_func) (void);
++ void (*async_wait_func) (int);
++ void (*async_wait_async_func) (int, int);
++ void (*async_wait_all_func) (void);
++ void (*async_wait_all_async_func) (int);
++ void (*async_set_async_func) (int);
++
++ /* Create/destroy TLS data. */
++ void *(*create_thread_data_func) (int);
++ void (*destroy_thread_data_func) (void *);
++
++ /* NVIDIA target specific routines.
*/
++ struct {
++ void *(*get_current_device_func) (void);
++ void *(*get_current_context_func) (void);
++ void *(*get_stream_func) (int);
++ int (*set_stream_func) (int, void *);
++ } cuda;
++} acc_dispatch_t;
++
++/* Various state of the accelerator device. */
++enum gomp_device_state
++{
++ GOMP_DEVICE_UNINITIALIZED,
++ GOMP_DEVICE_INITIALIZED,
++ GOMP_DEVICE_FINALIZED
++};
++
++/* This structure describes an accelerator device.
++ It contains the name of the corresponding libgomp plugin, function handlers
++ for interaction with the device, the ID number of the device, and
++ information about mapped memory. */
++struct gomp_device_descr
++{
++ /* Immutable data, which is only set during initialization, and which is not
++ guarded by the lock. */
++
++ /* The name of the device. */
++ const char *name;
++
++ /* Capabilities of device (supports OpenACC, OpenMP). */
++ unsigned int capabilities;
++
++ /* This is the ID number of the device among devices of the same type. */
++ int target_id;
++
++ /* This is the TYPE of device. */
++ enum offload_target_type type;
++
++ /* Function handlers. */
++ const char *(*get_name_func) (void);
++ unsigned int (*get_caps_func) (void);
++ int (*get_type_func) (void);
++ int (*get_num_devices_func) (void);
++ bool (*init_device_func) (int);
++ bool (*fini_device_func) (int);
++ unsigned (*version_func) (void);
++ int (*load_image_func) (int, unsigned, const void *, struct addr_pair **);
++ bool (*unload_image_func) (int, unsigned, const void *);
++ void *(*alloc_func) (int, size_t);
++ bool (*free_func) (int, void *);
++ bool (*dev2host_func) (int, void *, const void *, size_t);
++ bool (*host2dev_func) (int, void *, const void *, size_t);
++ bool (*dev2dev_func) (int, void *, const void *, size_t);
++ bool (*can_run_func) (void *);
++ void (*run_func) (int, void *, void *, void **);
++ void (*async_run_func) (int, void *, void *, void **, void *);
++
++ /* Splay tree containing information about mapped memory regions. */
++ struct splay_tree_s mem_map;
++
++ /* Mutex for the mutable data. */
++ gomp_mutex_t lock;
++
++ /* Current state of the device. OpenACC allows moving from the INITIALIZED
++ state back to the UNINITIALIZED state. OpenMP only allows moving from
++ the INITIALIZED to the FINALIZED state (at program shutdown). */
++ enum gomp_device_state state;
++
++ /* OpenACC-specific data and functions. */
++ /* This is mutable because of its mutable data_environ and target_data
++ members. */
++ acc_dispatch_t openacc;
++};
++
++/* Kind of the pragma, for which gomp_map_vars () is called.
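++
++ E.g. (a sketch of the correspondence) GOMP_MAP_VARS_TARGET for
++ #pragma omp target, GOMP_MAP_VARS_DATA for #pragma omp target
++ data, GOMP_MAP_VARS_ENTER_DATA for #pragma omp target enter data
++ and GOMP_MAP_VARS_OPENACC for OpenACC data constructs.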
*/
++enum gomp_map_vars_kind
++{
++ GOMP_MAP_VARS_OPENACC,
++ GOMP_MAP_VARS_TARGET,
++ GOMP_MAP_VARS_DATA,
++ GOMP_MAP_VARS_ENTER_DATA
++};
++
++extern void gomp_acc_insert_pointer (size_t, void **, size_t *, void *);
++extern void gomp_acc_remove_pointer (void *, bool, int, int);
++
++extern struct target_mem_desc *gomp_map_vars (struct gomp_device_descr *,
++ size_t, void **, void **,
++ size_t *, void *, bool,
++ enum gomp_map_vars_kind);
++extern void gomp_unmap_vars (struct target_mem_desc *, bool);
++extern void gomp_init_device (struct gomp_device_descr *);
++extern void gomp_free_memmap (struct splay_tree_s *);
++extern void gomp_unload_device (struct gomp_device_descr *);
+
+ /* work.c */
+
+@@ -646,8 +1014,28 @@ typedef enum omp_proc_bind_t
+ omp_proc_bind_spread = 4
+ } omp_proc_bind_t;
+
++typedef enum omp_lock_hint_t
++{
++ omp_lock_hint_none = 0,
++ omp_lock_hint_uncontended = 1,
++ omp_lock_hint_contended = 2,
++ omp_lock_hint_nonspeculative = 4,
++ omp_lock_hint_speculative = 8,
++} omp_lock_hint_t;
++
++extern void omp_init_lock_with_hint (omp_lock_t *, omp_lock_hint_t)
++ __GOMP_NOTHROW;
++extern void omp_init_nest_lock_with_hint (omp_nest_lock_t *, omp_lock_hint_t)
++ __GOMP_NOTHROW;
++
+ extern int omp_get_cancellation (void) __GOMP_NOTHROW;
+ extern omp_proc_bind_t omp_get_proc_bind (void) __GOMP_NOTHROW;
++extern int omp_get_num_places (void) __GOMP_NOTHROW;
++extern int omp_get_place_num_procs (int) __GOMP_NOTHROW;
++extern void omp_get_place_proc_ids (int, int *) __GOMP_NOTHROW;
++extern int omp_get_place_num (void) __GOMP_NOTHROW;
++extern int omp_get_partition_num_places (void) __GOMP_NOTHROW;
++extern void omp_get_partition_place_nums (int *) __GOMP_NOTHROW;
+
+ extern void omp_set_default_device (int) __GOMP_NOTHROW;
+ extern int omp_get_default_device (void) __GOMP_NOTHROW;
+@@ -656,6 +1044,24 @@ extern int omp_get_num_teams (void) __GO
+ extern int omp_get_team_num (void) __GOMP_NOTHROW;
+
+ extern int omp_is_initial_device (void) __GOMP_NOTHROW;
++extern int omp_get_initial_device (void) __GOMP_NOTHROW;
++extern int omp_get_max_task_priority (void) __GOMP_NOTHROW;
++
++extern void *omp_target_alloc (__SIZE_TYPE__, int) __GOMP_NOTHROW;
++extern void omp_target_free (void *, int) __GOMP_NOTHROW;
++extern int omp_target_is_present (void *, int) __GOMP_NOTHROW;
++extern int omp_target_memcpy (void *, void *, __SIZE_TYPE__, __SIZE_TYPE__,
++ __SIZE_TYPE__, int, int) __GOMP_NOTHROW;
++extern int omp_target_memcpy_rect (void *, void *, __SIZE_TYPE__, int,
++ const __SIZE_TYPE__ *,
++ const __SIZE_TYPE__ *,
++ const __SIZE_TYPE__ *,
++ const __SIZE_TYPE__ *,
++ const __SIZE_TYPE__ *, int, int)
++ __GOMP_NOTHROW;
++extern int omp_target_associate_ptr (void *, void *, __SIZE_TYPE__,
++ __SIZE_TYPE__, int) __GOMP_NOTHROW;
++extern int omp_target_disassociate_ptr (void *, int) __GOMP_NOTHROW;
+
+ #if !defined (HAVE_ATTRIBUTE_VISIBILITY) \
+ || !defined (HAVE_ATTRIBUTE_ALIAS) \
+@@ -728,4 +1134,34 @@ extern int gomp_test_nest_lock_25 (omp_n
+ # define ialias_call(fn) fn
+ #endif
+
++/* Helper function for priority_node_to_task() and
++ task_to_priority_node().
++
++ Return the offset from a task to its priority_node entry. The
++ priority_node entry has a type of TYPE. */
++
++static inline size_t
++priority_queue_offset (enum priority_queue_type type)
++{
++ return offsetof (struct gomp_task, pnode[(int) type]);
++}
++
++/* Return the task associated with a priority NODE of type TYPE.
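++
++ E.g. (a sketch) for a task T queued on its parent's children
++ queue, priority_node_to_task (PQ_CHILDREN, &T->pnode[PQ_CHILDREN])
++ yields T again, because pnode[] lives at a constant offset inside
++ struct gomp_task.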
*/ ++ ++static inline struct gomp_task * ++priority_node_to_task (enum priority_queue_type type, ++ struct priority_node *node) ++{ ++ return (struct gomp_task *) ((char *) node - priority_queue_offset (type)); ++} ++ ++/* Return the priority node of type TYPE for a given TASK. */ ++ ++static inline struct priority_node * ++task_to_priority_node (enum priority_queue_type type, ++ struct gomp_task *task) ++{ ++ return (struct priority_node *) ((char *) task ++ + priority_queue_offset (type)); ++} + #endif /* LIBGOMP_H */ +--- libgomp/env.c.jj 2014-05-15 10:56:32.420522486 +0200 ++++ libgomp/env.c 2016-07-13 16:57:04.437535335 +0200 +@@ -27,6 +27,8 @@ + + #include "libgomp.h" + #include "libgomp_f.h" ++#include "oacc-int.h" ++#include "gomp-constants.h" + #include <ctype.h> + #include <stdlib.h> + #include <stdio.h> +@@ -56,7 +58,7 @@ struct gomp_task_icv gomp_global_icv = { + .nthreads_var = 1, + .thread_limit_var = UINT_MAX, + .run_sched_var = GFS_DYNAMIC, +- .run_sched_modifier = 1, ++ .run_sched_chunk_size = 1, + .default_device_var = 0, + .dyn_var = false, + .nest_var = false, +@@ -66,6 +68,7 @@ struct gomp_task_icv gomp_global_icv = { + + unsigned long gomp_max_active_levels_var = INT_MAX; + bool gomp_cancel_var = false; ++int gomp_max_task_priority_var = 0; + #ifndef HAVE_SYNC_BUILTINS + gomp_mutex_t gomp_managed_threads_lock; + #endif +@@ -76,6 +79,9 @@ char *gomp_bind_var_list; + unsigned long gomp_bind_var_list_len; + void **gomp_places_list; + unsigned long gomp_places_list_len; ++int gomp_debug_var; ++char *goacc_device_type; ++int goacc_device_num; + + /* Parse the OMP_SCHEDULE environment variable. */ + +@@ -118,7 +124,7 @@ parse_schedule (void) + ++env; + if (*env == '\0') + { +- gomp_global_icv.run_sched_modifier ++ gomp_global_icv.run_sched_chunk_size + = gomp_global_icv.run_sched_var != GFS_STATIC; + return; + } +@@ -144,7 +150,7 @@ parse_schedule (void) + + if (value == 0 && gomp_global_icv.run_sched_var != GFS_STATIC) + value = 1; +- gomp_global_icv.run_sched_modifier = value; ++ gomp_global_icv.run_sched_chunk_size = value; + return; + + unknown: +@@ -1011,6 +1017,16 @@ parse_affinity (bool ignore) + return false; + } + ++static void ++parse_acc_device_type (void) ++{ ++ const char *env = getenv ("ACC_DEVICE_TYPE"); ++ ++ if (env && *env != '\0') ++ goacc_device_type = strdup (env); ++ else ++ goacc_device_type = NULL; ++} + + static void + handle_omp_display_env (unsigned long stacksize, int wait_policy) +@@ -1054,7 +1070,7 @@ handle_omp_display_env (unsigned long st + + fputs ("\nOPENMP DISPLAY ENVIRONMENT BEGIN\n", stderr); + +- fputs (" _OPENMP = '201307'\n", stderr); ++ fputs (" _OPENMP = '201511'\n", stderr); + fprintf (stderr, " OMP_DYNAMIC = '%s'\n", + gomp_global_icv.dyn_var ? "TRUE" : "FALSE"); + fprintf (stderr, " OMP_NESTED = '%s'\n", +@@ -1142,6 +1158,8 @@ handle_omp_display_env (unsigned long st + gomp_cancel_var ? 
"TRUE" : "FALSE"); + fprintf (stderr, " OMP_DEFAULT_DEVICE = '%d'\n", + gomp_global_icv.default_device_var); ++ fprintf (stderr, " OMP_MAX_TASK_PRIORITY = '%d'\n", ++ gomp_max_task_priority_var); + + if (verbose) + { +@@ -1174,6 +1192,7 @@ initialize_env (void) + parse_boolean ("OMP_NESTED", &gomp_global_icv.nest_var); + parse_boolean ("OMP_CANCELLATION", &gomp_cancel_var); + parse_int ("OMP_DEFAULT_DEVICE", &gomp_global_icv.default_device_var, true); ++ parse_int ("OMP_MAX_TASK_PRIORITY", &gomp_max_task_priority_var, true); + parse_unsigned_long ("OMP_MAX_ACTIVE_LEVELS", &gomp_max_active_levels_var, + true); + if (parse_unsigned_long ("OMP_THREAD_LIMIT", &thread_limit_var, false)) +@@ -1181,6 +1200,7 @@ initialize_env (void) + gomp_global_icv.thread_limit_var + = thread_limit_var > INT_MAX ? UINT_MAX : thread_limit_var; + } ++ parse_int ("GOMP_DEBUG", &gomp_debug_var, true); + #ifndef HAVE_SYNC_BUILTINS + gomp_mutex_init (&gomp_managed_threads_lock); + #endif +@@ -1271,6 +1291,15 @@ initialize_env (void) + } + + handle_omp_display_env (stacksize, wait_policy); ++ ++ /* OpenACC. */ ++ ++ if (!parse_int ("ACC_DEVICE_NUM", &goacc_device_num, true)) ++ goacc_device_num = 0; ++ ++ parse_acc_device_type (); ++ ++ goacc_runtime_initialize (); + } + + +@@ -1312,21 +1341,21 @@ omp_get_nested (void) + } + + void +-omp_set_schedule (omp_sched_t kind, int modifier) ++omp_set_schedule (omp_sched_t kind, int chunk_size) + { + struct gomp_task_icv *icv = gomp_icv (true); + switch (kind) + { + case omp_sched_static: +- if (modifier < 1) +- modifier = 0; +- icv->run_sched_modifier = modifier; ++ if (chunk_size < 1) ++ chunk_size = 0; ++ icv->run_sched_chunk_size = chunk_size; + break; + case omp_sched_dynamic: + case omp_sched_guided: +- if (modifier < 1) +- modifier = 1; +- icv->run_sched_modifier = modifier; ++ if (chunk_size < 1) ++ chunk_size = 1; ++ icv->run_sched_chunk_size = chunk_size; + break; + case omp_sched_auto: + break; +@@ -1337,11 +1366,11 @@ omp_set_schedule (omp_sched_t kind, int + } + + void +-omp_get_schedule (omp_sched_t *kind, int *modifier) ++omp_get_schedule (omp_sched_t *kind, int *chunk_size) + { + struct gomp_task_icv *icv = gomp_icv (false); + *kind = icv->run_sched_var; +- *modifier = icv->run_sched_modifier; ++ *chunk_size = icv->run_sched_chunk_size; + } + + int +@@ -1377,6 +1406,12 @@ omp_get_cancellation (void) + return gomp_cancel_var; + } + ++int ++omp_get_max_task_priority (void) ++{ ++ return gomp_max_task_priority_var; ++} ++ + omp_proc_bind_t + omp_get_proc_bind (void) + { +@@ -1425,6 +1460,59 @@ omp_is_initial_device (void) + return 1; + } + ++int ++omp_get_initial_device (void) ++{ ++ return GOMP_DEVICE_HOST_FALLBACK; ++} ++ ++int ++omp_get_num_places (void) ++{ ++ return gomp_places_list_len; ++} ++ ++int ++omp_get_place_num (void) ++{ ++ if (gomp_places_list == NULL) ++ return -1; ++ ++ struct gomp_thread *thr = gomp_thread (); ++ if (thr->place == 0) ++ gomp_init_affinity (); ++ ++ return (int) thr->place - 1; ++} ++ ++int ++omp_get_partition_num_places (void) ++{ ++ if (gomp_places_list == NULL) ++ return 0; ++ ++ struct gomp_thread *thr = gomp_thread (); ++ if (thr->place == 0) ++ gomp_init_affinity (); ++ ++ return thr->ts.place_partition_len; ++} ++ ++void ++omp_get_partition_place_nums (int *place_nums) ++{ ++ if (gomp_places_list == NULL) ++ return; ++ ++ struct gomp_thread *thr = gomp_thread (); ++ if (thr->place == 0) ++ gomp_init_affinity (); ++ ++ unsigned int i; ++ for (i = 0; i < thr->ts.place_partition_len; i++) ++ *place_nums++ = 
thr->ts.place_partition_off + i; ++} ++ + ialias (omp_set_dynamic) + ialias (omp_set_nested) + ialias (omp_set_num_threads) +@@ -1444,3 +1532,9 @@ ialias (omp_get_num_devices) + ialias (omp_get_num_teams) + ialias (omp_get_team_num) + ialias (omp_is_initial_device) ++ialias (omp_get_initial_device) ++ialias (omp_get_max_task_priority) ++ialias (omp_get_num_places) ++ialias (omp_get_place_num) ++ialias (omp_get_partition_num_places) ++ialias (omp_get_partition_place_nums) +--- libgomp/openacc.h.jj 2016-07-13 16:57:04.432535397 +0200 ++++ libgomp/openacc.h 2016-07-13 16:57:04.432535397 +0200 +@@ -0,0 +1,131 @@ ++/* OpenACC Runtime Library User-facing Declarations ++ ++ Copyright (C) 2013-2016 Free Software Foundation, Inc. ++ ++ Contributed by Mentor Embedded. ++ ++ This file is part of the GNU Offloading and Multi Processing Library ++ (libgomp). ++ ++ Libgomp is free software; you can redistribute it and/or modify it ++ under the terms of the GNU General Public License as published by ++ the Free Software Foundation; either version 3, or (at your option) ++ any later version. ++ ++ Libgomp is distributed in the hope that it will be useful, but WITHOUT ANY ++ WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS ++ FOR A PARTICULAR PURPOSE. See the GNU General Public License for ++ more details. ++ ++ Under Section 7 of GPL version 3, you are granted additional ++ permissions described in the GCC Runtime Library Exception, version ++ 3.1, as published by the Free Software Foundation. ++ ++ You should have received a copy of the GNU General Public License and ++ a copy of the GCC Runtime Library Exception along with this program; ++ see the files COPYING3 and COPYING.RUNTIME respectively. If not, see ++ <http://www.gnu.org/licenses/>. */ ++ ++#ifndef _OPENACC_H ++#define _OPENACC_H 1 ++ ++/* The OpenACC standard is silent on whether or not including <openacc.h> ++ might or must not include other header files. We chose to include ++ some. */ ++#include <stddef.h> ++ ++#ifdef __cplusplus ++extern "C" { ++#endif ++ ++#if __cplusplus >= 201103 ++# define __GOACC_NOTHROW noexcept ++#elif __cplusplus ++# define __GOACC_NOTHROW throw () ++#else /* Not C++ */ ++# define __GOACC_NOTHROW __attribute__ ((__nothrow__)) ++#endif ++ ++/* Types */ ++typedef enum acc_device_t { ++ /* Keep in sync with include/gomp-constants.h. */ ++ acc_device_none = 0, ++ acc_device_default = 1, ++ acc_device_host = 2, ++ /* acc_device_host_nonshm = 3 removed. */ ++ acc_device_not_host = 4, ++ acc_device_nvidia = 5, ++ _ACC_device_hwm, ++ /* Ensure enumeration is layout compatible with int. */ ++ _ACC_highest = __INT_MAX__, ++ _ACC_neg = -1 ++} acc_device_t; ++ ++typedef enum acc_async_t { ++ /* Keep in sync with include/gomp-constants.h. 
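++
++ acc_async_noval corresponds to an async clause without an
++ argument (e.g. "#pragma acc parallel async"), while
++ acc_async_sync requests synchronous execution.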
*/ ++ acc_async_noval = -1, ++ acc_async_sync = -2 ++} acc_async_t; ++ ++int acc_get_num_devices (acc_device_t) __GOACC_NOTHROW; ++void acc_set_device_type (acc_device_t) __GOACC_NOTHROW; ++acc_device_t acc_get_device_type (void) __GOACC_NOTHROW; ++void acc_set_device_num (int, acc_device_t) __GOACC_NOTHROW; ++int acc_get_device_num (acc_device_t) __GOACC_NOTHROW; ++int acc_async_test (int) __GOACC_NOTHROW; ++int acc_async_test_all (void) __GOACC_NOTHROW; ++void acc_wait (int) __GOACC_NOTHROW; ++void acc_wait_async (int, int) __GOACC_NOTHROW; ++void acc_wait_all (void) __GOACC_NOTHROW; ++void acc_wait_all_async (int) __GOACC_NOTHROW; ++void acc_init (acc_device_t) __GOACC_NOTHROW; ++void acc_shutdown (acc_device_t) __GOACC_NOTHROW; ++#ifdef __cplusplus ++int acc_on_device (int __arg) __GOACC_NOTHROW; ++#else ++int acc_on_device (acc_device_t __arg) __GOACC_NOTHROW; ++#endif ++void *acc_malloc (size_t) __GOACC_NOTHROW; ++void acc_free (void *) __GOACC_NOTHROW; ++/* Some of these would be more correct with const qualifiers, but ++ the standard specifies otherwise. */ ++void *acc_copyin (void *, size_t) __GOACC_NOTHROW; ++void *acc_present_or_copyin (void *, size_t) __GOACC_NOTHROW; ++void *acc_create (void *, size_t) __GOACC_NOTHROW; ++void *acc_present_or_create (void *, size_t) __GOACC_NOTHROW; ++void acc_copyout (void *, size_t) __GOACC_NOTHROW; ++void acc_delete (void *, size_t) __GOACC_NOTHROW; ++void acc_update_device (void *, size_t) __GOACC_NOTHROW; ++void acc_update_self (void *, size_t) __GOACC_NOTHROW; ++void acc_map_data (void *, void *, size_t) __GOACC_NOTHROW; ++void acc_unmap_data (void *) __GOACC_NOTHROW; ++void *acc_deviceptr (void *) __GOACC_NOTHROW; ++void *acc_hostptr (void *) __GOACC_NOTHROW; ++int acc_is_present (void *, size_t) __GOACC_NOTHROW; ++void acc_memcpy_to_device (void *, void *, size_t) __GOACC_NOTHROW; ++void acc_memcpy_from_device (void *, void *, size_t) __GOACC_NOTHROW; ++ ++/* Old names. OpenACC does not specify whether these can or must ++ not be macros, inlines or aliases for the new names. */ ++#define acc_pcreate acc_present_or_create ++#define acc_pcopyin acc_present_or_copyin ++ ++/* CUDA-specific routines. */ ++void *acc_get_current_cuda_device (void) __GOACC_NOTHROW; ++void *acc_get_current_cuda_context (void) __GOACC_NOTHROW; ++void *acc_get_cuda_stream (int) __GOACC_NOTHROW; ++int acc_set_cuda_stream (int, void *) __GOACC_NOTHROW; ++ ++#ifdef __cplusplus ++} ++ ++/* Forwarding function with correctly typed arg. */ ++ ++#pragma acc routine seq ++inline int acc_on_device (acc_device_t __arg) __GOACC_NOTHROW ++{ ++ return acc_on_device ((int) __arg); ++} ++#endif ++ ++#endif /* _OPENACC_H */ +--- libgomp/config/linux/doacross.h.jj 2016-07-13 16:57:18.902355979 +0200 ++++ libgomp/config/linux/doacross.h 2016-07-13 16:57:18.902355979 +0200 +@@ -0,0 +1,57 @@ ++/* Copyright (C) 2015-2016 Free Software Foundation, Inc. ++ Contributed by Jakub Jelinek <jakub@redhat.com>. ++ ++ This file is part of the GNU Offloading and Multi Processing Library ++ (libgomp). ++ ++ Libgomp is free software; you can redistribute it and/or modify it ++ under the terms of the GNU General Public License as published by ++ the Free Software Foundation; either version 3, or (at your option) ++ any later version. ++ ++ Libgomp is distributed in the hope that it will be useful, but WITHOUT ANY ++ WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS ++ FOR A PARTICULAR PURPOSE. See the GNU General Public License for ++ more details. 
++ ++ Under Section 7 of GPL version 3, you are granted additional ++ permissions described in the GCC Runtime Library Exception, version ++ 3.1, as published by the Free Software Foundation. ++ ++ You should have received a copy of the GNU General Public License and ++ a copy of the GCC Runtime Library Exception along with this program; ++ see the files COPYING3 and COPYING.RUNTIME respectively. If not, see ++ <http://www.gnu.org/licenses/>. */ ++ ++/* This is a Linux specific implementation of doacross spinning. */ ++ ++#ifndef GOMP_DOACROSS_H ++#define GOMP_DOACROSS_H 1 ++ ++#include "libgomp.h" ++#include <errno.h> ++#include "wait.h" ++ ++#ifdef HAVE_ATTRIBUTE_VISIBILITY ++# pragma GCC visibility push(hidden) ++#endif ++ ++static inline void doacross_spin (unsigned long *addr, unsigned long expected, ++ unsigned long cur) ++{ ++ /* FIXME: back off depending on how large expected - cur is. */ ++ do ++ { ++ cpu_relax (); ++ cur = __atomic_load_n (addr, MEMMODEL_RELAXED); ++ if (expected < cur) ++ return; ++ } ++ while (1); ++} ++ ++#ifdef HAVE_ATTRIBUTE_VISIBILITY ++# pragma GCC visibility pop ++#endif ++ ++#endif /* GOMP_DOACROSS_H */ +--- libgomp/config/posix/doacross.h.jj 2016-07-13 16:57:18.903355966 +0200 ++++ libgomp/config/posix/doacross.h 2016-07-13 16:57:18.903355966 +0200 +@@ -0,0 +1,62 @@ ++/* Copyright (C) 2015-2016 Free Software Foundation, Inc. ++ Contributed by Jakub Jelinek <jakub@redhat.com>. ++ ++ This file is part of the GNU Offloading and Multi Processing Library ++ (libgomp). ++ ++ Libgomp is free software; you can redistribute it and/or modify it ++ under the terms of the GNU General Public License as published by ++ the Free Software Foundation; either version 3, or (at your option) ++ any later version. ++ ++ Libgomp is distributed in the hope that it will be useful, but WITHOUT ANY ++ WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS ++ FOR A PARTICULAR PURPOSE. See the GNU General Public License for ++ more details. ++ ++ Under Section 7 of GPL version 3, you are granted additional ++ permissions described in the GCC Runtime Library Exception, version ++ 3.1, as published by the Free Software Foundation. ++ ++ You should have received a copy of the GNU General Public License and ++ a copy of the GCC Runtime Library Exception along with this program; ++ see the files COPYING3 and COPYING.RUNTIME respectively. If not, see ++ <http://www.gnu.org/licenses/>. */ ++ ++/* This is a generic implementation of doacross spinning. */ ++ ++#ifndef GOMP_DOACROSS_H ++#define GOMP_DOACROSS_H 1 ++ ++#include "libgomp.h" ++#include <errno.h> ++ ++#ifdef HAVE_ATTRIBUTE_VISIBILITY ++# pragma GCC visibility push(hidden) ++#endif ++ ++static inline void ++cpu_relax (void) ++{ ++ __asm volatile ("" : : : "memory"); ++} ++ ++static inline void doacross_spin (unsigned long *addr, unsigned long expected, ++ unsigned long cur) ++{ ++ /* FIXME: back off depending on how large expected - cur is. */ ++ do ++ { ++ cpu_relax (); ++ cur = __atomic_load_n (addr, MEMMODEL_RELAXED); ++ if (expected < cur) ++ return; ++ } ++ while (1); ++} ++ ++#ifdef HAVE_ATTRIBUTE_VISIBILITY ++# pragma GCC visibility pop ++#endif ++ ++#endif /* GOMP_DOACROSS_H */ +--- libgomp/splay-tree.c.jj 2016-07-13 16:57:18.919355768 +0200 ++++ libgomp/splay-tree.c 2016-07-13 16:57:18.919355768 +0200 +@@ -0,0 +1,238 @@ ++/* A splay-tree datatype. ++ Copyright (C) 1998-2016 Free Software Foundation, Inc. ++ Contributed by Mark Mitchell (mark@markmitchell.com). 
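Before the splay-tree code that begins here, a note on the two doacross.h variants just above: both implementations of doacross_spin poll ADDR with relaxed atomic loads, a cpu_relax in between, and return once the posted count passes EXPECTED. A self-contained sketch of that spin pattern, with hypothetical names and outside libgomp (compile with -pthread):

#include <pthread.h>
#include <stdio.h>

#define MEMMODEL_RELAXED __ATOMIC_RELAXED  /* stand-in for libgomp's enum  */

static inline void
cpu_relax (void)
{
  __asm volatile ("" : : : "memory");
}

static unsigned long posted;  /* analogous to a doacross post counter  */

/* Same shape as doacross_spin: return once *ADDR exceeds EXPECTED.  */
static void
spin_past (unsigned long *addr, unsigned long expected)
{
  while (__atomic_load_n (addr, MEMMODEL_RELAXED) <= expected)
    cpu_relax ();
}

static void *
producer (void *arg)
{
  for (unsigned long i = 1; i <= 100; i++)
    __atomic_store_n (&posted, i, __ATOMIC_RELEASE);
  return arg;
}

int
main (void)
{
  pthread_t t;
  pthread_create (&t, NULL, producer, NULL);
  spin_past (&posted, 50);  /* block until iteration 50 is posted  */
  printf ("past iteration 50\n");
  pthread_join (t, NULL);
  return 0;
}

The FIXME in both headers points at the obvious refinement: when EXPECTED is far beyond the current value, back off (or, on Linux, sleep on the futex) instead of burning cycles.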
++
++   This file is part of the GNU Offloading and Multi Processing Library
++   (libgomp).
++
++   Libgomp is free software; you can redistribute it and/or modify it
++   under the terms of the GNU General Public License as published by
++   the Free Software Foundation; either version 3, or (at your option)
++   any later version.
++
++   Libgomp is distributed in the hope that it will be useful, but WITHOUT ANY
++   WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
++   FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
++   more details.
++
++   Under Section 7 of GPL version 3, you are granted additional
++   permissions described in the GCC Runtime Library Exception, version
++   3.1, as published by the Free Software Foundation.
++
++   You should have received a copy of the GNU General Public License and
++   a copy of the GCC Runtime Library Exception along with this program;
++   see the files COPYING3 and COPYING.RUNTIME respectively.  If not, see
++   <http://www.gnu.org/licenses/>.  */
++
++/* The splay tree code copied from include/splay-tree.h and adjusted,
++   so that all the data lives directly in splay_tree_node_s structure
++   and no extra allocations are needed.  */
++
++/* For an easily readable description of splay-trees, see:
++
++     Lewis, Harry R. and Denenberg, Larry.  Data Structures and Their
++     Algorithms.  Harper-Collins, Inc.  1991.
++
++   The major feature of splay trees is that all basic tree operations
++   are amortized O(log n) time for a tree with n nodes.  */
++
++#include "libgomp.h"
++
++/* Rotate the edge joining the left child N with its parent P.  PP is the
++   grandparent's pointer to P.  */
++
++static inline void
++rotate_left (splay_tree_node *pp, splay_tree_node p, splay_tree_node n)
++{
++  splay_tree_node tmp;
++  tmp = n->right;
++  n->right = p;
++  p->left = tmp;
++  *pp = n;
++}
++
++/* Rotate the edge joining the right child N with its parent P.  PP is the
++   grandparent's pointer to P.  */
++
++static inline void
++rotate_right (splay_tree_node *pp, splay_tree_node p, splay_tree_node n)
++{
++  splay_tree_node tmp;
++  tmp = n->left;
++  n->left = p;
++  p->right = tmp;
++  *pp = n;
++}
++
++/* Bottom up splay of KEY.  */
++
++static void
++splay_tree_splay (splay_tree sp, splay_tree_key key)
++{
++  if (sp->root == NULL)
++    return;
++
++  do {
++    int cmp1, cmp2;
++    splay_tree_node n, c;
++
++    n = sp->root;
++    cmp1 = splay_compare (key, &n->key);
++
++    /* Found.  */
++    if (cmp1 == 0)
++      return;
++
++    /* Left or right?  If no child, then we're done.  */
++    if (cmp1 < 0)
++      c = n->left;
++    else
++      c = n->right;
++    if (!c)
++      return;
++
++    /* Next one left or right?  If found or no child, we're done
++       after one rotation.  */
++    cmp2 = splay_compare (key, &c->key);
++    if (cmp2 == 0
++        || (cmp2 < 0 && !c->left)
++        || (cmp2 > 0 && !c->right))
++      {
++        if (cmp1 < 0)
++          rotate_left (&sp->root, n, c);
++        else
++          rotate_right (&sp->root, n, c);
++        return;
++      }
++
++    /* Now we have the four cases of double-rotation.  */
++    if (cmp1 < 0 && cmp2 < 0)
++      {
++        rotate_left (&n->left, c, c->left);
++        rotate_left (&sp->root, n, n->left);
++      }
++    else if (cmp1 > 0 && cmp2 > 0)
++      {
++        rotate_right (&n->right, c, c->right);
++        rotate_right (&sp->root, n, n->right);
++      }
++    else if (cmp1 < 0 && cmp2 > 0)
++      {
++        rotate_right (&n->left, c, c->right);
++        rotate_left (&sp->root, n, n->left);
++      }
++    else if (cmp1 > 0 && cmp2 < 0)
++      {
++        rotate_left (&n->right, c, c->left);
++        rotate_right (&sp->root, n, n->right);
++      }
++  } while (1);
++}
++
++/* Insert a new NODE into SP.
The NODE shouldn't exist in the tree. */ ++ ++attribute_hidden void ++splay_tree_insert (splay_tree sp, splay_tree_node node) ++{ ++ int comparison = 0; ++ ++ splay_tree_splay (sp, &node->key); ++ ++ if (sp->root) ++ comparison = splay_compare (&sp->root->key, &node->key); ++ ++ if (sp->root && comparison == 0) ++ gomp_fatal ("Duplicate node"); ++ else ++ { ++ /* Insert it at the root. */ ++ if (sp->root == NULL) ++ node->left = node->right = NULL; ++ else if (comparison < 0) ++ { ++ node->left = sp->root; ++ node->right = node->left->right; ++ node->left->right = NULL; ++ } ++ else ++ { ++ node->right = sp->root; ++ node->left = node->right->left; ++ node->right->left = NULL; ++ } ++ ++ sp->root = node; ++ } ++} ++ ++/* Remove node with KEY from SP. It is not an error if it did not exist. */ ++ ++attribute_hidden void ++splay_tree_remove (splay_tree sp, splay_tree_key key) ++{ ++ splay_tree_splay (sp, key); ++ ++ if (sp->root && splay_compare (&sp->root->key, key) == 0) ++ { ++ splay_tree_node left, right; ++ ++ left = sp->root->left; ++ right = sp->root->right; ++ ++ /* One of the children is now the root. Doesn't matter much ++ which, so long as we preserve the properties of the tree. */ ++ if (left) ++ { ++ sp->root = left; ++ ++ /* If there was a right child as well, hang it off the ++ right-most leaf of the left child. */ ++ if (right) ++ { ++ while (left->right) ++ left = left->right; ++ left->right = right; ++ } ++ } ++ else ++ sp->root = right; ++ } ++} ++ ++/* Lookup KEY in SP, returning NODE if present, and NULL ++ otherwise. */ ++ ++attribute_hidden splay_tree_key ++splay_tree_lookup (splay_tree sp, splay_tree_key key) ++{ ++ splay_tree_splay (sp, key); ++ ++ if (sp->root && splay_compare (&sp->root->key, key) == 0) ++ return &sp->root->key; ++ else ++ return NULL; ++} ++ ++/* Helper function for splay_tree_foreach. ++ ++ Run FUNC on every node in KEY. */ ++ ++static void ++splay_tree_foreach_internal (splay_tree_node node, splay_tree_callback func, ++ void *data) ++{ ++ if (!node) ++ return; ++ func (&node->key, data); ++ splay_tree_foreach_internal (node->left, func, data); ++ /* Yeah, whatever. GCC can fix my tail recursion. */ ++ splay_tree_foreach_internal (node->right, func, data); ++} ++ ++/* Run FUNC on each of the nodes in SP. */ ++ ++attribute_hidden void ++splay_tree_foreach (splay_tree sp, splay_tree_callback func, void *data) ++{ ++ splay_tree_foreach_internal (sp->root, func, data); ++} +--- libgomp/libgomp-plugin.c.jj 2016-07-13 16:57:04.435535360 +0200 ++++ libgomp/libgomp-plugin.c 2016-07-13 16:57:04.435535360 +0200 +@@ -0,0 +1,80 @@ ++/* Copyright (C) 2014-2016 Free Software Foundation, Inc. ++ ++ Contributed by Mentor Embedded. ++ ++ This file is part of the GNU Offloading and Multi Processing Library ++ (libgomp). ++ ++ Libgomp is free software; you can redistribute it and/or modify it ++ under the terms of the GNU General Public License as published by ++ the Free Software Foundation; either version 3, or (at your option) ++ any later version. ++ ++ Libgomp is distributed in the hope that it will be useful, but WITHOUT ANY ++ WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS ++ FOR A PARTICULAR PURPOSE. See the GNU General Public License for ++ more details. ++ ++ Under Section 7 of GPL version 3, you are granted additional ++ permissions described in the GCC Runtime Library Exception, version ++ 3.1, as published by the Free Software Foundation. 
++ ++ You should have received a copy of the GNU General Public License and ++ a copy of the GCC Runtime Library Exception along with this program; ++ see the files COPYING3 and COPYING.RUNTIME respectively. If not, see ++ <http://www.gnu.org/licenses/>. */ ++ ++/* Exported (non-hidden) functions exposing libgomp interface for plugins. */ ++ ++#include <stdlib.h> ++ ++#include "libgomp.h" ++#include "libgomp-plugin.h" ++ ++void * ++GOMP_PLUGIN_malloc (size_t size) ++{ ++ return gomp_malloc (size); ++} ++ ++void * ++GOMP_PLUGIN_malloc_cleared (size_t size) ++{ ++ return gomp_malloc_cleared (size); ++} ++ ++void * ++GOMP_PLUGIN_realloc (void *ptr, size_t size) ++{ ++ return gomp_realloc (ptr, size); ++} ++ ++void ++GOMP_PLUGIN_debug (int kind, const char *msg, ...) ++{ ++ va_list ap; ++ ++ va_start (ap, msg); ++ gomp_vdebug (kind, msg, ap); ++ va_end (ap); ++} ++ ++void ++GOMP_PLUGIN_error (const char *msg, ...) ++{ ++ va_list ap; ++ ++ va_start (ap, msg); ++ gomp_verror (msg, ap); ++ va_end (ap); ++} ++ ++void ++GOMP_PLUGIN_fatal (const char *msg, ...) ++{ ++ va_list ap; ++ ++ va_start (ap, msg); ++ gomp_vfatal (msg, ap); ++ va_end (ap); ++} +--- libgomp/libgomp-plugin.h.jj 2016-07-13 16:57:04.438535323 +0200 ++++ libgomp/libgomp-plugin.h 2016-07-13 16:57:04.438535323 +0200 +@@ -0,0 +1,80 @@ ++/* Copyright (C) 2014-2016 Free Software Foundation, Inc. ++ ++ Contributed by Mentor Embedded. ++ ++ This file is part of the GNU Offloading and Multi Processing Library ++ (libgomp). ++ ++ Libgomp is free software; you can redistribute it and/or modify it ++ under the terms of the GNU General Public License as published by ++ the Free Software Foundation; either version 3, or (at your option) ++ any later version. ++ ++ Libgomp is distributed in the hope that it will be useful, but WITHOUT ANY ++ WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS ++ FOR A PARTICULAR PURPOSE. See the GNU General Public License for ++ more details. ++ ++ Under Section 7 of GPL version 3, you are granted additional ++ permissions described in the GCC Runtime Library Exception, version ++ 3.1, as published by the Free Software Foundation. ++ ++ You should have received a copy of the GNU General Public License and ++ a copy of the GCC Runtime Library Exception along with this program; ++ see the files COPYING3 and COPYING.RUNTIME respectively. If not, see ++ <http://www.gnu.org/licenses/>. */ ++ ++/* An interface to various libgomp-internal functions for use by plugins. */ ++ ++#ifndef LIBGOMP_PLUGIN_H ++#define LIBGOMP_PLUGIN_H 1 ++ ++#include <stddef.h> ++#include <stdint.h> ++ ++#ifdef __cplusplus ++extern "C" { ++#endif ++ ++/* Capabilities of offloading devices. */ ++#define GOMP_OFFLOAD_CAP_SHARED_MEM (1 << 0) ++#define GOMP_OFFLOAD_CAP_NATIVE_EXEC (1 << 1) ++#define GOMP_OFFLOAD_CAP_OPENMP_400 (1 << 2) ++#define GOMP_OFFLOAD_CAP_OPENACC_200 (1 << 3) ++ ++/* Type of offload target device. Keep in sync with include/gomp-constants.h. */ ++enum offload_target_type ++{ ++ OFFLOAD_TARGET_TYPE_HOST = 2, ++ /* OFFLOAD_TARGET_TYPE_HOST_NONSHM = 3 removed. */ ++ OFFLOAD_TARGET_TYPE_NVIDIA_PTX = 5, ++ OFFLOAD_TARGET_TYPE_INTEL_MIC = 6, ++ OFFLOAD_TARGET_TYPE_HSA = 7 ++}; ++ ++/* Auxiliary struct, used for transferring pairs of addresses from plugin ++ to libgomp. */ ++struct addr_pair ++{ ++ uintptr_t start; ++ uintptr_t end; ++}; ++ ++/* Miscellaneous functions. 
*/ ++extern void *GOMP_PLUGIN_malloc (size_t) __attribute__ ((malloc)); ++extern void *GOMP_PLUGIN_malloc_cleared (size_t) __attribute__ ((malloc)); ++extern void *GOMP_PLUGIN_realloc (void *, size_t); ++void GOMP_PLUGIN_target_task_completion (void *); ++ ++extern void GOMP_PLUGIN_debug (int, const char *, ...) ++ __attribute__ ((format (printf, 2, 3))); ++extern void GOMP_PLUGIN_error (const char *, ...) ++ __attribute__ ((format (printf, 1, 2))); ++extern void GOMP_PLUGIN_fatal (const char *, ...) ++ __attribute__ ((noreturn, format (printf, 1, 2))); ++ ++#ifdef __cplusplus ++} ++#endif ++ ++#endif +--- libgomp/oacc-async.c.jj 2016-07-13 16:57:13.488423109 +0200 ++++ libgomp/oacc-async.c 2016-07-13 16:57:13.488423109 +0200 +@@ -0,0 +1,107 @@ ++/* OpenACC Runtime Library Definitions. ++ ++ Copyright (C) 2013-2016 Free Software Foundation, Inc. ++ ++ Contributed by Mentor Embedded. ++ ++ This file is part of the GNU Offloading and Multi Processing Library ++ (libgomp). ++ ++ Libgomp is free software; you can redistribute it and/or modify it ++ under the terms of the GNU General Public License as published by ++ the Free Software Foundation; either version 3, or (at your option) ++ any later version. ++ ++ Libgomp is distributed in the hope that it will be useful, but WITHOUT ANY ++ WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS ++ FOR A PARTICULAR PURPOSE. See the GNU General Public License for ++ more details. ++ ++ Under Section 7 of GPL version 3, you are granted additional ++ permissions described in the GCC Runtime Library Exception, version ++ 3.1, as published by the Free Software Foundation. ++ ++ You should have received a copy of the GNU General Public License and ++ a copy of the GCC Runtime Library Exception along with this program; ++ see the files COPYING3 and COPYING.RUNTIME respectively. If not, see ++ <http://www.gnu.org/licenses/>. 
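An aside on libgomp-plugin.h above: it is one half of a contract. An offloading plugin is a shared object that calls the exported GOMP_PLUGIN_* helpers and in turn exports GOMP_OFFLOAD_* hooks that libgomp looks up with dlsym when the plugin is loaded. A hypothetical fragment of such a plugin, showing only the identification hooks ("demo" is an invented name; a real plugin must export the full hook set):

#include <stdlib.h>
#include "libgomp-plugin.h"

const char *
GOMP_OFFLOAD_get_name (void)
{
  return "demo";
}

unsigned int
GOMP_OFFLOAD_get_caps (void)
{
  /* Advertise OpenACC 2.0 support only.  */
  return GOMP_OFFLOAD_CAP_OPENACC_200;
}

int
GOMP_OFFLOAD_get_type (void)
{
  return OFFLOAD_TARGET_TYPE_NVIDIA_PTX;
}

int
GOMP_OFFLOAD_get_num_devices (void)
{
  return 1;
}

/* Fail through the host runtime rather than with a bare abort, so the
   user sees a libgomp-style diagnostic.  */
static void *
alloc_or_die (size_t size)
{
  void *p = malloc (size);
  if (p == NULL)
    GOMP_PLUGIN_fatal ("demo plugin: allocating %zu bytes failed", size);
  return p;
}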
*/ ++ ++#include <assert.h> ++#include "openacc.h" ++#include "libgomp.h" ++#include "oacc-int.h" ++ ++int ++acc_async_test (int async) ++{ ++ if (async < acc_async_sync) ++ gomp_fatal ("invalid async argument: %d", async); ++ ++ struct goacc_thread *thr = goacc_thread (); ++ ++ if (!thr || !thr->dev) ++ gomp_fatal ("no device active"); ++ ++ return thr->dev->openacc.async_test_func (async); ++} ++ ++int ++acc_async_test_all (void) ++{ ++ struct goacc_thread *thr = goacc_thread (); ++ ++ if (!thr || !thr->dev) ++ gomp_fatal ("no device active"); ++ ++ return thr->dev->openacc.async_test_all_func (); ++} ++ ++void ++acc_wait (int async) ++{ ++ if (async < acc_async_sync) ++ gomp_fatal ("invalid async argument: %d", async); ++ ++ struct goacc_thread *thr = goacc_thread (); ++ ++ if (!thr || !thr->dev) ++ gomp_fatal ("no device active"); ++ ++ thr->dev->openacc.async_wait_func (async); ++} ++ ++void ++acc_wait_async (int async1, int async2) ++{ ++ struct goacc_thread *thr = goacc_thread (); ++ ++ if (!thr || !thr->dev) ++ gomp_fatal ("no device active"); ++ ++ thr->dev->openacc.async_wait_async_func (async1, async2); ++} ++ ++void ++acc_wait_all (void) ++{ ++ struct goacc_thread *thr = goacc_thread (); ++ ++ if (!thr || !thr->dev) ++ gomp_fatal ("no device active"); ++ ++ thr->dev->openacc.async_wait_all_func (); ++} ++ ++void ++acc_wait_all_async (int async) ++{ ++ if (async < acc_async_sync) ++ gomp_fatal ("invalid async argument: %d", async); ++ ++ struct goacc_thread *thr = goacc_thread (); ++ ++ if (!thr || !thr->dev) ++ gomp_fatal ("no device active"); ++ ++ thr->dev->openacc.async_wait_all_async_func (async); ++} +--- libgomp/splay-tree.h.jj 2016-07-13 16:57:18.934355582 +0200 ++++ libgomp/splay-tree.h 2016-07-13 16:57:18.934355582 +0200 +@@ -0,0 +1,130 @@ ++/* A splay-tree datatype. ++ Copyright (C) 1998-2016 Free Software Foundation, Inc. ++ Contributed by Mark Mitchell (mark@markmitchell.com). ++ ++ This file is part of the GNU Offloading and Multi Processing Library ++ (libgomp). ++ ++ Libgomp is free software; you can redistribute it and/or modify it ++ under the terms of the GNU General Public License as published by ++ the Free Software Foundation; either version 3, or (at your option) ++ any later version. ++ ++ Libgomp is distributed in the hope that it will be useful, but WITHOUT ANY ++ WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS ++ FOR A PARTICULAR PURPOSE. See the GNU General Public License for ++ more details. ++ ++ Under Section 7 of GPL version 3, you are granted additional ++ permissions described in the GCC Runtime Library Exception, version ++ 3.1, as published by the Free Software Foundation. ++ ++ You should have received a copy of the GNU General Public License and ++ a copy of the GCC Runtime Library Exception along with this program; ++ see the files COPYING3 and COPYING.RUNTIME respectively. If not, see ++ <http://www.gnu.org/licenses/>. */ ++ ++/* The splay tree code copied from include/splay-tree.h and adjusted, ++ so that all the data lives directly in splay_tree_node_s structure ++ and no extra allocations are needed. ++ ++ Files including this header should before including it add: ++typedef struct splay_tree_node_s *splay_tree_node; ++typedef struct splay_tree_s *splay_tree; ++typedef struct splay_tree_key_s *splay_tree_key; ++ define splay_tree_key_s structure, and define ++ splay_compare inline function. 
++
++   Alternatively, they can define splay_tree_prefix macro before
++   including this header and then all the above types, the
++   splay_compare function and the splay_tree_{lookup,insert,remove}
++   functions will be prefixed by that prefix.  If splay_tree_prefix
++   macro is defined, this header must be included twice: once where
++   you need the header file definitions, and once where you need the
++   .c implementation routines.  In the latter case, you must also
++   define the macro splay_tree_c.  See the include of splay-tree.h in
++   priority_queue.[hc] for an example.  */
++
++/* For an easily readable description of splay-trees, see:
++
++     Lewis, Harry R. and Denenberg, Larry.  Data Structures and Their
++     Algorithms.  Harper-Collins, Inc.  1991.
++
++   The major feature of splay trees is that all basic tree operations
++   are amortized O(log n) time for a tree with n nodes.  */
++
++#ifdef splay_tree_prefix
++# define splay_tree_name_1(prefix, name) prefix ## _ ## name
++# define splay_tree_name(prefix, name) splay_tree_name_1 (prefix, name)
++# define splay_tree_node_s \
++  splay_tree_name (splay_tree_prefix, splay_tree_node_s)
++# define splay_tree_s \
++  splay_tree_name (splay_tree_prefix, splay_tree_s)
++# define splay_tree_key_s \
++  splay_tree_name (splay_tree_prefix, splay_tree_key_s)
++# define splay_tree_node \
++  splay_tree_name (splay_tree_prefix, splay_tree_node)
++# define splay_tree \
++  splay_tree_name (splay_tree_prefix, splay_tree)
++# define splay_tree_key \
++  splay_tree_name (splay_tree_prefix, splay_tree_key)
++# define splay_compare \
++  splay_tree_name (splay_tree_prefix, splay_compare)
++# define splay_tree_lookup \
++  splay_tree_name (splay_tree_prefix, splay_tree_lookup)
++# define splay_tree_insert \
++  splay_tree_name (splay_tree_prefix, splay_tree_insert)
++# define splay_tree_remove \
++  splay_tree_name (splay_tree_prefix, splay_tree_remove)
++# define splay_tree_foreach \
++  splay_tree_name (splay_tree_prefix, splay_tree_foreach)
++# define splay_tree_callback \
++  splay_tree_name (splay_tree_prefix, splay_tree_callback)
++#endif
++
++#ifndef splay_tree_c
++/* Header file definitions and prototypes.  */
++
++/* The nodes in the splay tree.  */
++struct splay_tree_node_s {
++  struct splay_tree_key_s key;
++  /* The left and right children, respectively.  */
++  splay_tree_node left;
++  splay_tree_node right;
++};
++
++/* The splay tree.
*/ ++struct splay_tree_s { ++ splay_tree_node root; ++}; ++ ++typedef void (*splay_tree_callback) (splay_tree_key, void *); ++ ++extern splay_tree_key splay_tree_lookup (splay_tree, splay_tree_key); ++extern void splay_tree_insert (splay_tree, splay_tree_node); ++extern void splay_tree_remove (splay_tree, splay_tree_key); ++extern void splay_tree_foreach (splay_tree, splay_tree_callback, void *); ++#else /* splay_tree_c */ ++# ifdef splay_tree_prefix ++# include "splay-tree.c" ++# undef splay_tree_name_1 ++# undef splay_tree_name ++# undef splay_tree_node_s ++# undef splay_tree_s ++# undef splay_tree_key_s ++# undef splay_tree_node ++# undef splay_tree ++# undef splay_tree_key ++# undef splay_compare ++# undef splay_tree_lookup ++# undef splay_tree_insert ++# undef splay_tree_remove ++# undef splay_tree_foreach ++# undef splay_tree_callback ++# undef splay_tree_c ++# endif ++#endif /* #ifndef splay_tree_c */ ++ ++#ifdef splay_tree_prefix ++# undef splay_tree_prefix ++#endif +--- libgomp/oacc-plugin.c.jj 2016-07-13 16:57:13.481423196 +0200 ++++ libgomp/oacc-plugin.c 2016-07-14 15:40:21.653151873 +0200 +@@ -0,0 +1,44 @@ ++/* Copyright (C) 2014-2016 Free Software Foundation, Inc. ++ ++ Contributed by Mentor Embedded. ++ ++ This file is part of the GNU Offloading and Multi Processing Library ++ (libgomp). ++ ++ Libgomp is free software; you can redistribute it and/or modify it ++ under the terms of the GNU General Public License as published by ++ the Free Software Foundation; either version 3, or (at your option) ++ any later version. ++ ++ Libgomp is distributed in the hope that it will be useful, but WITHOUT ANY ++ WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS ++ FOR A PARTICULAR PURPOSE. See the GNU General Public License for ++ more details. ++ ++ Under Section 7 of GPL version 3, you are granted additional ++ permissions described in the GCC Runtime Library Exception, version ++ 3.1, as published by the Free Software Foundation. ++ ++ You should have received a copy of the GNU General Public License and ++ a copy of the GCC Runtime Library Exception along with this program; ++ see the files COPYING3 and COPYING.RUNTIME respectively. If not, see ++ <http://www.gnu.org/licenses/>. */ ++ ++/* Initialize and register OpenACC dispatch table from libgomp plugin. */ ++ ++#include "libgomp.h" ++#include "oacc-plugin.h" ++#include "oacc-int.h" ++ ++void ++GOMP_PLUGIN_async_unmap_vars (void *ptr, int async) ++{ ++} ++ ++/* Return the target-specific part of the TLS data for the current thread. */ ++ ++void * ++GOMP_PLUGIN_acc_thread (void) ++{ ++ return NULL; ++} +--- libgomp/oacc-init.c.jj 2016-07-13 16:57:04.423535509 +0200 ++++ libgomp/oacc-init.c 2016-07-14 19:06:41.679575688 +0200 +@@ -0,0 +1,640 @@ ++/* OpenACC Runtime initialization routines ++ ++ Copyright (C) 2013-2016 Free Software Foundation, Inc. ++ ++ Contributed by Mentor Embedded. ++ ++ This file is part of the GNU Offloading and Multi Processing Library ++ (libgomp). ++ ++ Libgomp is free software; you can redistribute it and/or modify it ++ under the terms of the GNU General Public License as published by ++ the Free Software Foundation; either version 3, or (at your option) ++ any later version. ++ ++ Libgomp is distributed in the hope that it will be useful, but WITHOUT ANY ++ WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS ++ FOR A PARTICULAR PURPOSE. See the GNU General Public License for ++ more details. 
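With both halves of the splay tree now in place, the prefix protocol that splay-tree.h describes looks like this in use. A hypothetical instantiation (the "demo" names are invented; compare the real include of splay-tree.h from priority_queue.[hc] mentioned above; this only builds inside libgomp, since splay-tree.c pulls in libgomp.h):

/* Supply the types and the comparison first, as the header requires.  */
typedef struct demo_splay_tree_node_s *demo_splay_tree_node;
typedef struct demo_splay_tree_s *demo_splay_tree;
typedef struct demo_splay_tree_key_s *demo_splay_tree_key;

struct demo_splay_tree_key_s { unsigned long start, end; };

static inline int
demo_splay_compare (demo_splay_tree_key x, demo_splay_tree_key y)
{
  if (x->start < y->start)
    return -1;
  if (x->start > y->start)
    return 1;
  return 0;
}

/* First include: header-file definitions and prototypes.  */
#define splay_tree_prefix demo
#include "splay-tree.h"

/* Second include, in exactly one .c file: the implementation, which
   pulls in splay-tree.c under the same prefix.  */
#define splay_tree_prefix demo
#define splay_tree_c
#include "splay-tree.h"

After this, demo_splay_tree_insert, demo_splay_tree_lookup, demo_splay_tree_remove and demo_splay_tree_foreach operate on struct demo_splay_tree_s, with all key data living directly in the node and no extra allocations.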
++ ++ Under Section 7 of GPL version 3, you are granted additional ++ permissions described in the GCC Runtime Library Exception, version ++ 3.1, as published by the Free Software Foundation. ++ ++ You should have received a copy of the GNU General Public License and ++ a copy of the GCC Runtime Library Exception along with this program; ++ see the files COPYING3 and COPYING.RUNTIME respectively. If not, see ++ <http://www.gnu.org/licenses/>. */ ++ ++#include "libgomp.h" ++#include "oacc-int.h" ++#include "openacc.h" ++#include <assert.h> ++#include <stdlib.h> ++#include <strings.h> ++#include <stdbool.h> ++#include <string.h> ++ ++/* This lock is used to protect access to cached_base_dev, dispatchers and ++ the (abstract) initialisation state of attached offloading devices. */ ++ ++static gomp_mutex_t acc_device_lock; ++ ++/* A cached version of the dispatcher for the global "current" accelerator type, ++ e.g. used as the default when creating new host threads. This is the ++ device-type equivalent of goacc_device_num (which specifies which device to ++ use out of potentially several of the same type). If there are several ++ devices of a given type, this points at the first one. */ ++ ++static struct gomp_device_descr *cached_base_dev = NULL; ++ ++#if defined HAVE_TLS || defined USE_EMUTLS ++__thread struct goacc_thread *goacc_tls_data; ++#else ++pthread_key_t goacc_tls_key; ++#endif ++static pthread_key_t goacc_cleanup_key; ++ ++static struct goacc_thread *goacc_threads; ++static gomp_mutex_t goacc_thread_lock; ++ ++/* An array of dispatchers for device types, indexed by the type. This array ++ only references "base" devices, and other instances of the same type are ++ found by simply indexing from each such device (which are stored linearly, ++ grouped by device in target.c:devices). */ ++static struct gomp_device_descr *dispatchers[_ACC_device_hwm] = { 0 }; ++ ++attribute_hidden void ++goacc_register (struct gomp_device_descr *disp) ++{ ++ /* Only register the 0th device here. */ ++ if (disp->target_id != 0) ++ return; ++ ++ gomp_mutex_lock (&acc_device_lock); ++ ++ assert (acc_device_type (disp->type) != acc_device_none ++ && acc_device_type (disp->type) != acc_device_default ++ && acc_device_type (disp->type) != acc_device_not_host); ++ assert (!dispatchers[disp->type]); ++ dispatchers[disp->type] = disp; ++ ++ gomp_mutex_unlock (&acc_device_lock); ++} ++ ++static const char * ++name_of_acc_device_t (enum acc_device_t type) ++{ ++ switch (type) ++ { ++ case acc_device_none: return "none"; ++ case acc_device_default: return "default"; ++ case acc_device_host: return "host"; ++ case acc_device_not_host: return "not_host"; ++ case acc_device_nvidia: return "nvidia"; ++ default: gomp_fatal ("unknown device type %u", (unsigned) type); ++ } ++} ++ ++/* ACC_DEVICE_LOCK must be held before calling this function. If FAIL_IS_ERROR ++ is true, this function raises an error if there are no devices of type D, ++ otherwise it returns NULL in that case. */ ++ ++static struct gomp_device_descr * ++resolve_device (acc_device_t d, bool fail_is_error) ++{ ++ acc_device_t d_arg = d; ++ ++ switch (d) ++ { ++ case acc_device_default: ++ { ++ if (goacc_device_type) ++ { ++ /* Lookup the named device. 
*/ ++ if (!strcasecmp (goacc_device_type, "host")) ++ { ++ d = acc_device_host; ++ goto found; ++ } ++ ++ if (fail_is_error) ++ { ++ gomp_mutex_unlock (&acc_device_lock); ++ gomp_fatal ("device type %s not supported", goacc_device_type); ++ } ++ else ++ return NULL; ++ } ++ ++ /* No default device specified, so start scanning for any non-host ++ device that is available. */ ++ d = acc_device_not_host; ++ } ++ /* FALLTHROUGH */ ++ ++ case acc_device_not_host: ++ if (d_arg == acc_device_default) ++ { ++ d = acc_device_host; ++ goto found; ++ } ++ if (fail_is_error) ++ { ++ gomp_mutex_unlock (&acc_device_lock); ++ gomp_fatal ("no device found"); ++ } ++ else ++ return NULL; ++ break; ++ ++ case acc_device_host: ++ break; ++ ++ default: ++ if (d > _ACC_device_hwm) ++ { ++ if (fail_is_error) ++ goto unsupported_device; ++ else ++ return NULL; ++ } ++ break; ++ } ++ found: ++ ++ assert (d != acc_device_none ++ && d != acc_device_default ++ && d != acc_device_not_host); ++ ++ if (dispatchers[d] == NULL && fail_is_error) ++ { ++ unsupported_device: ++ gomp_mutex_unlock (&acc_device_lock); ++ gomp_fatal ("device type %s not supported", name_of_acc_device_t (d)); ++ } ++ ++ return dispatchers[d]; ++} ++ ++/* Emit a suitable error if no device of a particular type is available, or ++ the given device number is out-of-range. */ ++static void ++acc_dev_num_out_of_range (acc_device_t d, int ord, int ndevs) ++{ ++ if (ndevs == 0) ++ gomp_fatal ("no devices of type %s available", name_of_acc_device_t (d)); ++ else ++ gomp_fatal ("device %u out of range", ord); ++} ++ ++/* This is called when plugins have been initialized, and serves to call ++ (indirectly) the target's device_init hook. Calling multiple times without ++ an intervening acc_shutdown_1 call is an error. ACC_DEVICE_LOCK must be ++ held before calling this function. */ ++ ++static struct gomp_device_descr * ++acc_init_1 (acc_device_t d) ++{ ++ struct gomp_device_descr *base_dev, *acc_dev; ++ int ndevs; ++ ++ base_dev = resolve_device (d, true); ++ ++ ndevs = base_dev->get_num_devices_func (); ++ ++ if (ndevs <= 0 || goacc_device_num >= ndevs) ++ acc_dev_num_out_of_range (d, goacc_device_num, ndevs); ++ ++ acc_dev = &base_dev[goacc_device_num]; ++ ++ gomp_mutex_lock (&acc_dev->lock); ++ if (acc_dev->state == GOMP_DEVICE_INITIALIZED) ++ { ++ gomp_mutex_unlock (&acc_dev->lock); ++ gomp_fatal ("device already active"); ++ } ++ ++ gomp_init_device (acc_dev); ++ gomp_mutex_unlock (&acc_dev->lock); ++ ++ return base_dev; ++} ++ ++/* ACC_DEVICE_LOCK must be held before calling this function. */ ++ ++static void ++acc_shutdown_1 (acc_device_t d) ++{ ++ struct gomp_device_descr *base_dev; ++ struct goacc_thread *walk; ++ int ndevs, i; ++ bool devices_active = false; ++ ++ /* Get the base device for this device type. */ ++ base_dev = resolve_device (d, true); ++ ++ ndevs = base_dev->get_num_devices_func (); ++ ++ gomp_mutex_lock (&goacc_thread_lock); ++ ++ /* Free target-specific TLS data and close all devices. */ ++ for (walk = goacc_threads; walk != NULL; walk = walk->next) ++ { ++ if (walk->target_tls) ++ base_dev->openacc.destroy_thread_data_func (walk->target_tls); ++ ++ walk->target_tls = NULL; ++ ++ /* Similarly, if this happens then user code has done something weird. 
*/
++      if (walk->saved_bound_dev)
++        {
++          gomp_mutex_unlock (&goacc_thread_lock);
++          gomp_fatal ("shutdown during host fallback");
++        }
++
++      if (walk->dev)
++        {
++          gomp_mutex_lock (&walk->dev->lock);
++          gomp_free_memmap (&walk->dev->mem_map);
++          gomp_mutex_unlock (&walk->dev->lock);
++
++          walk->dev = NULL;
++          walk->base_dev = NULL;
++        }
++    }
++
++  gomp_mutex_unlock (&goacc_thread_lock);
++
++  /* Close all the devices of this type that have been opened.  */
++  bool ret = true;
++  for (i = 0; i < ndevs; i++)
++    {
++      struct gomp_device_descr *acc_dev = &base_dev[i];
++      gomp_mutex_lock (&acc_dev->lock);
++      if (acc_dev->state == GOMP_DEVICE_INITIALIZED)
++        {
++          devices_active = true;
++          ret &= acc_dev->fini_device_func (acc_dev->target_id);
++          acc_dev->state = GOMP_DEVICE_UNINITIALIZED;
++        }
++      gomp_mutex_unlock (&acc_dev->lock);
++    }
++
++  if (!ret)
++    gomp_fatal ("device finalization failed");
++
++  if (!devices_active)
++    gomp_fatal ("no device initialized");
++}
++
++static struct goacc_thread *
++goacc_new_thread (void)
++{
++  struct goacc_thread *thr = gomp_malloc (sizeof (struct goacc_thread));
++
++#if defined HAVE_TLS || defined USE_EMUTLS
++  goacc_tls_data = thr;
++#else
++  pthread_setspecific (goacc_tls_key, thr);
++#endif
++
++  pthread_setspecific (goacc_cleanup_key, thr);
++
++  gomp_mutex_lock (&goacc_thread_lock);
++  thr->next = goacc_threads;
++  goacc_threads = thr;
++  gomp_mutex_unlock (&goacc_thread_lock);
++
++  return thr;
++}
++
++static void
++goacc_destroy_thread (void *data)
++{
++  struct goacc_thread *thr = data, *walk, *prev;
++
++  gomp_mutex_lock (&goacc_thread_lock);
++
++  if (thr)
++    {
++      struct gomp_device_descr *acc_dev = thr->dev;
++
++      if (acc_dev && thr->target_tls)
++        {
++          acc_dev->openacc.destroy_thread_data_func (thr->target_tls);
++          thr->target_tls = NULL;
++        }
++
++      assert (!thr->mapped_data);
++
++      /* Remove from thread list.  */
++      for (prev = NULL, walk = goacc_threads; walk;
++           prev = walk, walk = walk->next)
++        if (walk == thr)
++          {
++            if (prev == NULL)
++              goacc_threads = walk->next;
++            else
++              prev->next = walk->next;
++
++            free (thr);
++
++            break;
++          }
++
++      assert (walk);
++    }
++
++  gomp_mutex_unlock (&goacc_thread_lock);
++}
++
++/* Use the ORD'th device instance for the current host thread (or -1 for the
++   current global default).  The device (and the runtime) must be initialised
++   before calling this function.  */
++
++void
++goacc_attach_host_thread_to_device (int ord)
++{
++  struct goacc_thread *thr = goacc_thread ();
++  struct gomp_device_descr *acc_dev = NULL, *base_dev = NULL;
++  int num_devices;
++
++  if (thr && thr->dev && (thr->dev->target_id == ord || ord < 0))
++    return;
++
++  if (ord < 0)
++    ord = goacc_device_num;
++
++  /* Decide which type of device to use.  If the current thread has a device
++     type already (e.g. set by acc_set_device_type), use that, else use the
++     global default.
*/ ++ if (thr && thr->base_dev) ++ base_dev = thr->base_dev; ++ else ++ { ++ assert (cached_base_dev); ++ base_dev = cached_base_dev; ++ } ++ ++ num_devices = base_dev->get_num_devices_func (); ++ if (num_devices <= 0 || ord >= num_devices) ++ acc_dev_num_out_of_range (acc_device_type (base_dev->type), ord, ++ num_devices); ++ ++ if (!thr) ++ thr = goacc_new_thread (); ++ ++ thr->base_dev = base_dev; ++ thr->dev = acc_dev = &base_dev[ord]; ++ thr->saved_bound_dev = NULL; ++ ++ thr->target_tls ++ = acc_dev->openacc.create_thread_data_func (ord); ++ ++ acc_dev->openacc.async_set_async_func (acc_async_sync); ++} ++ ++/* OpenACC 2.0a (3.2.12, 3.2.13) doesn't specify whether the serialization of ++ init/shutdown is per-process or per-thread. We choose per-process. */ ++ ++void ++acc_init (acc_device_t d) ++{ ++ gomp_mutex_lock (&acc_device_lock); ++ ++ cached_base_dev = acc_init_1 (d); ++ ++ gomp_mutex_unlock (&acc_device_lock); ++ ++ goacc_attach_host_thread_to_device (-1); ++} ++ ++ialias (acc_init) ++ ++void ++acc_shutdown (acc_device_t d) ++{ ++ gomp_mutex_lock (&acc_device_lock); ++ ++ acc_shutdown_1 (d); ++ ++ gomp_mutex_unlock (&acc_device_lock); ++} ++ ++ialias (acc_shutdown) ++ ++int ++acc_get_num_devices (acc_device_t d) ++{ ++ int n = 0; ++ struct gomp_device_descr *acc_dev; ++ ++ if (d == acc_device_none) ++ return 0; ++ ++ gomp_mutex_lock (&acc_device_lock); ++ acc_dev = resolve_device (d, false); ++ gomp_mutex_unlock (&acc_device_lock); ++ ++ if (!acc_dev) ++ return 0; ++ ++ n = acc_dev->get_num_devices_func (); ++ if (n < 0) ++ n = 0; ++ ++ return n; ++} ++ ++ialias (acc_get_num_devices) ++ ++/* Set the device type for the current thread only (using the current global ++ default device number), initialising that device if necessary. Also set the ++ default device type for new threads to D. */ ++ ++void ++acc_set_device_type (acc_device_t d) ++{ ++ struct gomp_device_descr *base_dev, *acc_dev; ++ struct goacc_thread *thr = goacc_thread (); ++ ++ gomp_mutex_lock (&acc_device_lock); ++ ++ cached_base_dev = base_dev = resolve_device (d, true); ++ acc_dev = &base_dev[goacc_device_num]; ++ ++ gomp_mutex_lock (&acc_dev->lock); ++ if (acc_dev->state == GOMP_DEVICE_UNINITIALIZED) ++ gomp_init_device (acc_dev); ++ gomp_mutex_unlock (&acc_dev->lock); ++ ++ gomp_mutex_unlock (&acc_device_lock); ++ ++ /* We're changing device type: invalidate the current thread's dev and ++ base_dev pointers. 
*/ ++ if (thr && thr->base_dev != base_dev) ++ { ++ thr->base_dev = thr->dev = NULL; ++ } ++ ++ goacc_attach_host_thread_to_device (-1); ++} ++ ++ialias (acc_set_device_type) ++ ++acc_device_t ++acc_get_device_type (void) ++{ ++ acc_device_t res = acc_device_none; ++ struct gomp_device_descr *dev; ++ struct goacc_thread *thr = goacc_thread (); ++ ++ if (thr && thr->base_dev) ++ res = acc_device_type (thr->base_dev->type); ++ else ++ { ++ gomp_mutex_lock (&acc_device_lock); ++ dev = resolve_device (acc_device_default, true); ++ gomp_mutex_unlock (&acc_device_lock); ++ res = acc_device_type (dev->type); ++ } ++ ++ assert (res != acc_device_default ++ && res != acc_device_not_host); ++ ++ return res; ++} ++ ++ialias (acc_get_device_type) ++ ++int ++acc_get_device_num (acc_device_t d) ++{ ++ const struct gomp_device_descr *dev; ++ struct goacc_thread *thr = goacc_thread (); ++ ++ if (d >= _ACC_device_hwm) ++ gomp_fatal ("unknown device type %u", (unsigned) d); ++ ++ gomp_mutex_lock (&acc_device_lock); ++ dev = resolve_device (d, true); ++ gomp_mutex_unlock (&acc_device_lock); ++ ++ if (thr && thr->base_dev == dev && thr->dev) ++ return thr->dev->target_id; ++ ++ return goacc_device_num; ++} ++ ++ialias (acc_get_device_num) ++ ++void ++acc_set_device_num (int ord, acc_device_t d) ++{ ++ struct gomp_device_descr *base_dev, *acc_dev; ++ int num_devices; ++ ++ if (ord < 0) ++ ord = goacc_device_num; ++ ++ if ((int) d == 0) ++ /* Set whatever device is being used by the current host thread to use ++ device instance ORD. It's unclear if this is supposed to affect other ++ host threads too (OpenACC 2.0 (3.2.4) acc_set_device_num). */ ++ goacc_attach_host_thread_to_device (ord); ++ else ++ { ++ gomp_mutex_lock (&acc_device_lock); ++ ++ cached_base_dev = base_dev = resolve_device (d, true); ++ ++ num_devices = base_dev->get_num_devices_func (); ++ ++ if (num_devices <= 0 || ord >= num_devices) ++ acc_dev_num_out_of_range (d, ord, num_devices); ++ ++ acc_dev = &base_dev[ord]; ++ ++ gomp_mutex_lock (&acc_dev->lock); ++ if (acc_dev->state == GOMP_DEVICE_UNINITIALIZED) ++ gomp_init_device (acc_dev); ++ gomp_mutex_unlock (&acc_dev->lock); ++ ++ gomp_mutex_unlock (&acc_device_lock); ++ ++ goacc_attach_host_thread_to_device (ord); ++ } ++ ++ goacc_device_num = ord; ++} ++ ++ialias (acc_set_device_num) ++ ++int ++acc_on_device (acc_device_t dev) ++{ ++ return dev == acc_device_host || dev == acc_device_none; ++} ++ ++ialias (acc_on_device) ++ ++attribute_hidden void ++goacc_runtime_initialize (void) ++{ ++ gomp_mutex_init (&acc_device_lock); ++ ++#if !(defined HAVE_TLS || defined USE_EMUTLS) ++ pthread_key_create (&goacc_tls_key, NULL); ++#endif ++ ++ pthread_key_create (&goacc_cleanup_key, goacc_destroy_thread); ++ ++ cached_base_dev = NULL; ++ ++ goacc_threads = NULL; ++ gomp_mutex_init (&goacc_thread_lock); ++ ++ /* Initialize and register the 'host' device type. */ ++ goacc_host_init (); ++} ++ ++/* Compiler helper functions */ ++ ++attribute_hidden void ++goacc_save_and_set_bind (acc_device_t d) ++{ ++ struct goacc_thread *thr = goacc_thread (); ++ ++ assert (!thr->saved_bound_dev); ++ ++ thr->saved_bound_dev = thr->dev; ++ thr->dev = dispatchers[d]; ++} ++ ++attribute_hidden void ++goacc_restore_bind (void) ++{ ++ struct goacc_thread *thr = goacc_thread (); ++ ++ thr->dev = thr->saved_bound_dev; ++ thr->saved_bound_dev = NULL; ++} ++ ++/* This is called from any OpenACC support function that may need to implicitly ++ initialize the libgomp runtime, either globally or from a new host thread. 
++   On exit "goacc_thread" will return a valid & populated thread block.  */
++
++attribute_hidden void
++goacc_lazy_initialize (void)
++{
++  struct goacc_thread *thr = goacc_thread ();
++
++  if (thr && thr->dev)
++    return;
++
++  if (!cached_base_dev)
++    acc_init (acc_device_default);
++  else
++    goacc_attach_host_thread_to_device (-1);
++}
+--- libgomp/oacc-int.h.jj	2016-07-13 16:57:04.400535794 +0200
++++ libgomp/oacc-int.h	2016-07-13 16:57:04.400535794 +0200
+@@ -0,0 +1,106 @@
++/* OpenACC Runtime - internal declarations
++
++   Copyright (C) 2013-2016 Free Software Foundation, Inc.
++
++   Contributed by Mentor Embedded.
++
++   This file is part of the GNU Offloading and Multi Processing Library
++   (libgomp).
++
++   Libgomp is free software; you can redistribute it and/or modify it
++   under the terms of the GNU General Public License as published by
++   the Free Software Foundation; either version 3, or (at your option)
++   any later version.
++
++   Libgomp is distributed in the hope that it will be useful, but WITHOUT ANY
++   WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
++   FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
++   more details.
++
++   Under Section 7 of GPL version 3, you are granted additional
++   permissions described in the GCC Runtime Library Exception, version
++   3.1, as published by the Free Software Foundation.
++
++   You should have received a copy of the GNU General Public License and
++   a copy of the GCC Runtime Library Exception along with this program;
++   see the files COPYING3 and COPYING.RUNTIME respectively.  If not, see
++   <http://www.gnu.org/licenses/>.  */
++
++/* This file contains data types and function declarations that are not
++   part of the official OpenACC user interface.  There are declarations
++   in here that are part of the GNU OpenACC ABI, in that the compiler is
++   required to know about them and use them.
++
++   The convention is that the all caps prefix "GOACC" is used to group
++   items that are part of the external ABI, and the lower case prefix
++   "goacc" is used to group items that are completely private to the
++   library.  */
++
++#ifndef OACC_INT_H
++#define OACC_INT_H 1
++
++#include "openacc.h"
++#include "config.h"
++#include <stddef.h>
++#include <stdbool.h>
++#include <stdarg.h>
++
++#ifdef HAVE_ATTRIBUTE_VISIBILITY
++# pragma GCC visibility push(hidden)
++#endif
++
++static inline enum acc_device_t
++acc_device_type (enum offload_target_type type)
++{
++  return (enum acc_device_t) type;
++}
++
++struct goacc_thread
++{
++  /* The base device for the current thread.  */
++  struct gomp_device_descr *base_dev;
++
++  /* The device for the current thread.  */
++  struct gomp_device_descr *dev;
++
++  struct gomp_device_descr *saved_bound_dev;
++
++  /* This is a linked list of data mapped by the "acc data" pragma, following
++     strictly push/pop semantics according to lexical scope.  */
++  struct target_mem_desc *mapped_data;
++
++  /* These structures form a list: this is the next thread in that list.  */
++  struct goacc_thread *next;
++
++  /* Target-specific data (used by plugin).
*/ ++ void *target_tls; ++}; ++ ++#if defined HAVE_TLS || defined USE_EMUTLS ++extern __thread struct goacc_thread *goacc_tls_data; ++static inline struct goacc_thread * ++goacc_thread (void) ++{ ++ return goacc_tls_data; ++} ++#else ++extern pthread_key_t goacc_tls_key; ++static inline struct goacc_thread * ++goacc_thread (void) ++{ ++ return pthread_getspecific (goacc_tls_key); ++} ++#endif ++ ++void goacc_register (struct gomp_device_descr *) __GOACC_NOTHROW; ++void goacc_attach_host_thread_to_device (int); ++void goacc_runtime_initialize (void); ++void goacc_save_and_set_bind (acc_device_t); ++void goacc_restore_bind (void); ++void goacc_lazy_initialize (void); ++void goacc_host_init (void); ++ ++#ifdef HAVE_ATTRIBUTE_VISIBILITY ++# pragma GCC visibility pop ++#endif ++ ++#endif +--- libgomp/oacc-host.c.jj 2016-07-13 16:57:13.489423096 +0200 ++++ libgomp/oacc-host.c 2016-07-13 16:57:13.489423096 +0200 +@@ -0,0 +1,266 @@ ++/* OpenACC Runtime Library: acc_device_host. ++ ++ Copyright (C) 2013-2016 Free Software Foundation, Inc. ++ ++ Contributed by Mentor Embedded. ++ ++ This file is part of the GNU Offloading and Multi Processing Library ++ (libgomp). ++ ++ Libgomp is free software; you can redistribute it and/or modify it ++ under the terms of the GNU General Public License as published by ++ the Free Software Foundation; either version 3, or (at your option) ++ any later version. ++ ++ Libgomp is distributed in the hope that it will be useful, but WITHOUT ANY ++ WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS ++ FOR A PARTICULAR PURPOSE. See the GNU General Public License for ++ more details. ++ ++ Under Section 7 of GPL version 3, you are granted additional ++ permissions described in the GCC Runtime Library Exception, version ++ 3.1, as published by the Free Software Foundation. ++ ++ You should have received a copy of the GNU General Public License and ++ a copy of the GCC Runtime Library Exception along with this program; ++ see the files COPYING3 and COPYING.RUNTIME respectively. If not, see ++ <http://www.gnu.org/licenses/>. 
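One detail of oacc-int.h above worth calling out: goacc_thread compiles to a direct __thread access when the toolchain supports TLS (HAVE_TLS or USE_EMUTLS) and falls back to pthread_getspecific otherwise. The same dual-path idiom in free-standing form (names hypothetical; with neither macro defined this builds the pthread path):

#include <pthread.h>
#include <stdio.h>
#include <stdlib.h>

#if defined HAVE_TLS || defined USE_EMUTLS
static __thread int state;      /* fast path: compiler-managed TLS  */

static int *
get_state (void)
{
  return &state;
}
#else
static pthread_key_t state_key; /* portable fallback  */
static pthread_once_t state_once = PTHREAD_ONCE_INIT;

static void
make_key (void)
{
  pthread_key_create (&state_key, free);
}

static int *
get_state (void)
{
  int *p;
  pthread_once (&state_once, make_key);
  p = pthread_getspecific (state_key);
  if (p == NULL)
    {
      p = calloc (1, sizeof *p);
      pthread_setspecific (state_key, p);
    }
  return p;
}
#endif

int
main (void)
{
  *get_state () = 42;
  printf ("%d\n", *get_state ());
  return 0;
}

libgomp itself allocates the per-thread block explicitly in goacc_new_thread and registers goacc_destroy_thread on a separate pthread key, so cleanup runs even when the fast TLS path is in use.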
*/ ++ ++#include "libgomp.h" ++#include "oacc-int.h" ++#include "gomp-constants.h" ++ ++#include <stdbool.h> ++#include <stddef.h> ++#include <stdint.h> ++ ++static struct gomp_device_descr host_dispatch; ++ ++static const char * ++host_get_name (void) ++{ ++ return host_dispatch.name; ++} ++ ++static unsigned int ++host_get_caps (void) ++{ ++ return host_dispatch.capabilities; ++} ++ ++static int ++host_get_type (void) ++{ ++ return host_dispatch.type; ++} ++ ++static int ++host_get_num_devices (void) ++{ ++ return 1; ++} ++ ++static bool ++host_init_device (int n __attribute__ ((unused))) ++{ ++ return true; ++} ++ ++static bool ++host_fini_device (int n __attribute__ ((unused))) ++{ ++ return true; ++} ++ ++static unsigned ++host_version (void) ++{ ++ return GOMP_VERSION; ++} ++ ++static int ++host_load_image (int n __attribute__ ((unused)), ++ unsigned v __attribute__ ((unused)), ++ const void *t __attribute__ ((unused)), ++ struct addr_pair **r __attribute__ ((unused))) ++{ ++ return 0; ++} ++ ++static bool ++host_unload_image (int n __attribute__ ((unused)), ++ unsigned v __attribute__ ((unused)), ++ const void *t __attribute__ ((unused))) ++{ ++ return true; ++} ++ ++static void * ++host_alloc (int n __attribute__ ((unused)), size_t s) ++{ ++ return gomp_malloc (s); ++} ++ ++static bool ++host_free (int n __attribute__ ((unused)), void *p) ++{ ++ free (p); ++ return true; ++} ++ ++static bool ++host_dev2host (int n __attribute__ ((unused)), ++ void *h __attribute__ ((unused)), ++ const void *d __attribute__ ((unused)), ++ size_t s __attribute__ ((unused))) ++{ ++ return true; ++} ++ ++static bool ++host_host2dev (int n __attribute__ ((unused)), ++ void *d __attribute__ ((unused)), ++ const void *h __attribute__ ((unused)), ++ size_t s __attribute__ ((unused))) ++{ ++ return true; ++} ++ ++static void ++host_run (int n __attribute__ ((unused)), void *fn_ptr, void *vars, ++ void **args __attribute__((unused))) ++{ ++ void (*fn)(void *) = (void (*)(void *)) fn_ptr; ++ ++ fn (vars); ++} ++ ++static void ++host_openacc_exec (void (*fn) (void *), ++ size_t mapnum __attribute__ ((unused)), ++ void **hostaddrs, ++ void **devaddrs __attribute__ ((unused)), ++ int async __attribute__ ((unused)), ++ unsigned *dims __attribute ((unused)), ++ void *targ_mem_desc __attribute__ ((unused))) ++{ ++ fn (hostaddrs); ++} ++ ++static void ++host_openacc_register_async_cleanup (void *targ_mem_desc __attribute__ ((unused)), ++ int async __attribute__ ((unused))) ++{ ++} ++ ++static int ++host_openacc_async_test (int async __attribute__ ((unused))) ++{ ++ return 1; ++} ++ ++static int ++host_openacc_async_test_all (void) ++{ ++ return 1; ++} ++ ++static void ++host_openacc_async_wait (int async __attribute__ ((unused))) ++{ ++} ++ ++static void ++host_openacc_async_wait_async (int async1 __attribute__ ((unused)), ++ int async2 __attribute__ ((unused))) ++{ ++} ++ ++static void ++host_openacc_async_wait_all (void) ++{ ++} ++ ++static void ++host_openacc_async_wait_all_async (int async __attribute__ ((unused))) ++{ ++} ++ ++static void ++host_openacc_async_set_async (int async __attribute__ ((unused))) ++{ ++} ++ ++static void * ++host_openacc_create_thread_data (int ord __attribute__ ((unused))) ++{ ++ return NULL; ++} ++ ++static void ++host_openacc_destroy_thread_data (void *tls_data __attribute__ ((unused))) ++{ ++} ++ ++static struct gomp_device_descr host_dispatch = ++ { ++ .name = "host", ++ .capabilities = (GOMP_OFFLOAD_CAP_SHARED_MEM ++ | GOMP_OFFLOAD_CAP_NATIVE_EXEC ++ | 
GOMP_OFFLOAD_CAP_OPENACC_200),
++    .target_id = 0,
++    .type = OFFLOAD_TARGET_TYPE_HOST,
++
++    .get_name_func = host_get_name,
++    .get_caps_func = host_get_caps,
++    .get_type_func = host_get_type,
++    .get_num_devices_func = host_get_num_devices,
++    .init_device_func = host_init_device,
++    .fini_device_func = host_fini_device,
++    .version_func = host_version,
++    .load_image_func = host_load_image,
++    .unload_image_func = host_unload_image,
++    .alloc_func = host_alloc,
++    .free_func = host_free,
++    .dev2host_func = host_dev2host,
++    .host2dev_func = host_host2dev,
++    .run_func = host_run,
++
++    .mem_map = { NULL },
++    /* .lock initialized in goacc_host_init.  */
++    .state = GOMP_DEVICE_UNINITIALIZED,
++
++    .openacc = {
++      .data_environ = NULL,
++
++      .exec_func = host_openacc_exec,
++
++      .register_async_cleanup_func = host_openacc_register_async_cleanup,
++
++      .async_test_func = host_openacc_async_test,
++      .async_test_all_func = host_openacc_async_test_all,
++      .async_wait_func = host_openacc_async_wait,
++      .async_wait_async_func = host_openacc_async_wait_async,
++      .async_wait_all_func = host_openacc_async_wait_all,
++      .async_wait_all_async_func = host_openacc_async_wait_all_async,
++      .async_set_async_func = host_openacc_async_set_async,
++
++      .create_thread_data_func = host_openacc_create_thread_data,
++      .destroy_thread_data_func = host_openacc_destroy_thread_data,
++
++      .cuda = {
++        .get_current_device_func = NULL,
++        .get_current_context_func = NULL,
++        .get_stream_func = NULL,
++        .set_stream_func = NULL,
++      }
++    }
++  };
++
++/* Initialize and register this device type.  */
++void
++goacc_host_init (void)
++{
++  gomp_mutex_init (&host_dispatch.lock);
++  goacc_register (&host_dispatch);
++}
+--- libgomp/oacc-parallel.c.jj	2016-07-13 16:57:04.399535807 +0200
++++ libgomp/oacc-parallel.c	2016-07-14 18:53:06.694996381 +0200
+@@ -0,0 +1,241 @@
++/* Copyright (C) 2013-2016 Free Software Foundation, Inc.
++
++   Contributed by Mentor Embedded.
++
++   This file is part of the GNU Offloading and Multi Processing Library
++   (libgomp).
++
++   Libgomp is free software; you can redistribute it and/or modify it
++   under the terms of the GNU General Public License as published by
++   the Free Software Foundation; either version 3, or (at your option)
++   any later version.
++
++   Libgomp is distributed in the hope that it will be useful, but WITHOUT ANY
++   WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
++   FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
++   more details.
++
++   Under Section 7 of GPL version 3, you are granted additional
++   permissions described in the GCC Runtime Library Exception, version
++   3.1, as published by the Free Software Foundation.
++
++   You should have received a copy of the GNU General Public License and
++   a copy of the GCC Runtime Library Exception along with this program;
++   see the files COPYING3 and COPYING.RUNTIME respectively.  If not, see
++   <http://www.gnu.org/licenses/>.  */
++
++/* This file handles OpenACC constructs.  */
++
++#include "openacc.h"
++#include "libgomp.h"
++#include "libgomp_g.h"
++#include "gomp-constants.h"
++#include "oacc-int.h"
++#ifdef HAVE_INTTYPES_H
++# include <inttypes.h>  /* For PRIu64.  */
++#endif
++#include <string.h>
++#include <stdarg.h>
++#include <assert.h>
++
++static void goacc_wait (int async, int num_waits, va_list *ap);
++
++
++/* Launch a possibly offloaded function on DEVICE.  FN is the host fn
++   address.  MAPNUM, HOSTADDRS, SIZES & KINDS describe the memory
++   blocks to be copied to/from the device.  Variadic arguments are
++   keyed optional parameters terminated with a zero.  */
++
++void
++GOACC_parallel_keyed (int device, void (*fn) (void *),
++                      size_t mapnum, void **hostaddrs, size_t *sizes,
++                      unsigned short *kinds, ...)
++{
++  bool host_fallback = device == GOMP_DEVICE_HOST_FALLBACK;
++  struct goacc_thread *thr;
++  struct gomp_device_descr *acc_dev;
++
++#ifdef HAVE_INTTYPES_H
++  gomp_debug (0, "%s: mapnum=%"PRIu64", hostaddrs=%p, sizes=%p, kinds=%p\n",
++              __FUNCTION__, (uint64_t) mapnum, hostaddrs, sizes, kinds);
++#else
++  gomp_debug (0, "%s: mapnum=%lu, hostaddrs=%p, sizes=%p, kinds=%p\n",
++              __FUNCTION__, (unsigned long) mapnum, hostaddrs, sizes, kinds);
++#endif
++  goacc_lazy_initialize ();
++
++  thr = goacc_thread ();
++  acc_dev = thr->dev;
++
++  /* Host fallback if "if" clause is false or if the current device is set to
++     the host.  */
++  if (host_fallback)
++    {
++      goacc_save_and_set_bind (acc_device_host);
++      fn (hostaddrs);
++      goacc_restore_bind ();
++      return;
++    }
++  else if (acc_device_type (acc_dev->type) == acc_device_host)
++    {
++      fn (hostaddrs);
++      return;
++    }
++
++  /* acc_device_host is the only supported device type.  */
++}
++
++/* Legacy entry point, only provide host execution.  */
++
++void
++GOACC_parallel (int device, void (*fn) (void *),
++                size_t mapnum, void **hostaddrs, size_t *sizes,
++                unsigned short *kinds,
++                int num_gangs, int num_workers, int vector_length,
++                int async, int num_waits, ...)
++{
++  goacc_save_and_set_bind (acc_device_host);
++  fn (hostaddrs);
++  goacc_restore_bind ();
++}
++
++void
++GOACC_data_start (int device, size_t mapnum,
++                  void **hostaddrs, size_t *sizes, unsigned short *kinds)
++{
++  goacc_lazy_initialize ();
++}
++
++void
++GOACC_data_end (void)
++{
++  gomp_debug (0, "  %s: restore mappings\n", __FUNCTION__);
++  gomp_debug (0, "  %s: mappings restored\n", __FUNCTION__);
++}
++
++void
++GOACC_enter_exit_data (int device, size_t mapnum,
++                       void **hostaddrs, size_t *sizes, unsigned short *kinds,
++                       int async, int num_waits, ...)
++{
++  goacc_lazy_initialize ();
++}
++
++static void
++goacc_wait (int async, int num_waits, va_list *ap)
++{
++  struct goacc_thread *thr = goacc_thread ();
++  struct gomp_device_descr *acc_dev = thr->dev;
++
++  while (num_waits--)
++    {
++      int qid = va_arg (*ap, int);
++
++      if (acc_async_test (qid))
++        continue;
++
++      if (async == acc_async_sync)
++        acc_wait (qid);
++      else if (qid == async)
++        ;  /* If we're waiting on the same asynchronous queue as we're
++              launching on, the queue itself will order work as
++              required, so there's no need to wait explicitly.  */
++      else
++        acc_dev->openacc.async_wait_async_func (qid, async);
++    }
++}
++
++void
++GOACC_update (int device, size_t mapnum,
++              void **hostaddrs, size_t *sizes, unsigned short *kinds,
++              int async, int num_waits, ...)
++{
++  goacc_lazy_initialize ();
++}
++
++void
++GOACC_wait (int async, int num_waits, ...)
++{ ++ if (num_waits) ++ { ++ va_list ap; ++ ++ va_start (ap, num_waits); ++ goacc_wait (async, num_waits, &ap); ++ va_end (ap); ++ } ++ else if (async == acc_async_sync) ++ acc_wait_all (); ++ else if (async == acc_async_noval) ++ goacc_thread ()->dev->openacc.async_wait_all_async_func (acc_async_noval); ++} ++ ++int ++GOACC_get_num_threads (void) ++{ ++ return 1; ++} ++ ++int ++GOACC_get_thread_num (void) ++{ ++ return 0; ++} ++ ++void ++GOACC_declare (int device, size_t mapnum, ++ void **hostaddrs, size_t *sizes, unsigned short *kinds) ++{ ++ int i; ++ ++ for (i = 0; i < mapnum; i++) ++ { ++ unsigned char kind = kinds[i] & 0xff; ++ ++ if (kind == GOMP_MAP_POINTER || kind == GOMP_MAP_TO_PSET) ++ continue; ++ ++ switch (kind) ++ { ++ case GOMP_MAP_FORCE_ALLOC: ++ case GOMP_MAP_FORCE_FROM: ++ case GOMP_MAP_FORCE_TO: ++ case GOMP_MAP_POINTER: ++ case GOMP_MAP_DELETE: ++ GOACC_enter_exit_data (device, 1, &hostaddrs[i], &sizes[i], ++ &kinds[i], 0, 0); ++ break; ++ ++ case GOMP_MAP_FORCE_DEVICEPTR: ++ break; ++ ++ case GOMP_MAP_ALLOC: ++ if (!acc_is_present (hostaddrs[i], sizes[i])) ++ GOACC_enter_exit_data (device, 1, &hostaddrs[i], &sizes[i], ++ &kinds[i], 0, 0); ++ break; ++ ++ case GOMP_MAP_TO: ++ GOACC_enter_exit_data (device, 1, &hostaddrs[i], &sizes[i], ++ &kinds[i], 0, 0); ++ ++ break; ++ ++ case GOMP_MAP_FROM: ++ kinds[i] = GOMP_MAP_FORCE_FROM; ++ GOACC_enter_exit_data (device, 1, &hostaddrs[i], &sizes[i], ++ &kinds[i], 0, 0); ++ break; ++ ++ case GOMP_MAP_FORCE_PRESENT: ++ if (!acc_is_present (hostaddrs[i], sizes[i])) ++ gomp_fatal ("[%p,%ld] is not mapped", hostaddrs[i], ++ (unsigned long) sizes[i]); ++ break; ++ ++ default: ++ assert (0); ++ break; ++ } ++ } ++} +--- libgomp/oacc-cuda.c.jj 2016-07-13 16:57:04.432535397 +0200 ++++ libgomp/oacc-cuda.c 2016-07-13 16:57:04.432535397 +0200 +@@ -0,0 +1,86 @@ ++/* OpenACC Runtime Library: CUDA support glue. ++ ++ Copyright (C) 2014-2016 Free Software Foundation, Inc. ++ ++ Contributed by Mentor Embedded. ++ ++ This file is part of the GNU Offloading and Multi Processing Library ++ (libgomp). ++ ++ Libgomp is free software; you can redistribute it and/or modify it ++ under the terms of the GNU General Public License as published by ++ the Free Software Foundation; either version 3, or (at your option) ++ any later version. ++ ++ Libgomp is distributed in the hope that it will be useful, but WITHOUT ANY ++ WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS ++ FOR A PARTICULAR PURPOSE. See the GNU General Public License for ++ more details. ++ ++ Under Section 7 of GPL version 3, you are granted additional ++ permissions described in the GCC Runtime Library Exception, version ++ 3.1, as published by the Free Software Foundation. ++ ++ You should have received a copy of the GNU General Public License and ++ a copy of the GCC Runtime Library Exception along with this program; ++ see the files COPYING3 and COPYING.RUNTIME respectively. If not, see ++ <http://www.gnu.org/licenses/>. 
*/ ++ ++#include "openacc.h" ++#include "config.h" ++#include "libgomp.h" ++#include "oacc-int.h" ++ ++void * ++acc_get_current_cuda_device (void) ++{ ++ struct goacc_thread *thr = goacc_thread (); ++ ++ if (thr && thr->dev && thr->dev->openacc.cuda.get_current_device_func) ++ return thr->dev->openacc.cuda.get_current_device_func (); ++ ++ return NULL; ++} ++ ++void * ++acc_get_current_cuda_context (void) ++{ ++ struct goacc_thread *thr = goacc_thread (); ++ ++ if (thr && thr->dev && thr->dev->openacc.cuda.get_current_context_func) ++ return thr->dev->openacc.cuda.get_current_context_func (); ++ ++ return NULL; ++} ++ ++void * ++acc_get_cuda_stream (int async) ++{ ++ struct goacc_thread *thr = goacc_thread (); ++ ++ if (async < 0) ++ return NULL; ++ ++ if (thr && thr->dev && thr->dev->openacc.cuda.get_stream_func) ++ return thr->dev->openacc.cuda.get_stream_func (async); ++ ++ return NULL; ++} ++ ++int ++acc_set_cuda_stream (int async, void *stream) ++{ ++ struct goacc_thread *thr; ++ ++ if (async < 0 || stream == NULL) ++ return 0; ++ ++ goacc_lazy_initialize (); ++ ++ thr = goacc_thread (); ++ ++ if (thr && thr->dev && thr->dev->openacc.cuda.set_stream_func) ++ return thr->dev->openacc.cuda.set_stream_func (async, stream); ++ ++ return -1; ++} +--- libgomp/openacc_lib.h.jj 2016-07-13 16:57:13.486423134 +0200 ++++ libgomp/openacc_lib.h 2016-07-13 16:57:13.486423134 +0200 +@@ -0,0 +1,382 @@ ++! OpenACC Runtime Library Definitions. -*- mode: fortran -*- ++ ++! Copyright (C) 2014-2016 Free Software Foundation, Inc. ++ ++! Contributed by Tobias Burnus <burnus@net-b.de> ++! and Mentor Embedded. ++ ++! This file is part of the GNU Offloading and Multi Processing Library ++! (libgomp). ++ ++! Libgomp is free software; you can redistribute it and/or modify it ++! under the terms of the GNU General Public License as published by ++! the Free Software Foundation; either version 3, or (at your option) ++! any later version. ++ ++! Libgomp is distributed in the hope that it will be useful, but WITHOUT ANY ++! WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS ++! FOR A PARTICULAR PURPOSE. See the GNU General Public License for ++! more details. ++ ++! Under Section 7 of GPL version 3, you are granted additional ++! permissions described in the GCC Runtime Library Exception, version ++! 3.1, as published by the Free Software Foundation. ++ ++! You should have received a copy of the GNU General Public License and ++! a copy of the GCC Runtime Library Exception along with this program; ++! see the files COPYING3 and COPYING.RUNTIME respectively. If not, see ++! <http://www.gnu.org/licenses/>. ++ ++! NOTE: Due to the use of dimension (..), the code only works when compiled ++! with -std=f2008ts/gnu/legacy but not with other standard settings. ++! Alternatively, the user can use the module version, which permits ++! compilation with -std=f95. ++ ++ integer, parameter :: acc_device_kind = 4 ++ ++! Keep in sync with include/gomp-constants.h. ++ integer (acc_device_kind), parameter :: acc_device_none = 0 ++ integer (acc_device_kind), parameter :: acc_device_default = 1 ++ integer (acc_device_kind), parameter :: acc_device_host = 2 ++! integer (acc_device_kind), parameter :: acc_device_host_nonshm = 3 ++! removed. ++ integer (acc_device_kind), parameter :: acc_device_not_host = 4 ++ integer (acc_device_kind), parameter :: acc_device_nvidia = 5 ++ ++ integer, parameter :: acc_handle_kind = 4 ++ ++! Keep in sync with include/gomp-constants.h. 
++ integer (acc_handle_kind), parameter :: acc_async_noval = -1 ++ integer (acc_handle_kind), parameter :: acc_async_sync = -2 ++ ++ integer, parameter :: openacc_version = 201306 ++ ++ interface acc_get_num_devices ++ function acc_get_num_devices_h (d) ++ import acc_device_kind ++ integer acc_get_num_devices_h ++ integer (acc_device_kind) d ++ end function ++ end interface ++ ++ interface acc_set_device_type ++ subroutine acc_set_device_type_h (d) ++ import acc_device_kind ++ integer (acc_device_kind) d ++ end subroutine ++ end interface ++ ++ interface acc_get_device_type ++ function acc_get_device_type_h () ++ import acc_device_kind ++ integer (acc_device_kind) acc_get_device_type_h ++ end function ++ end interface ++ ++ interface acc_set_device_num ++ subroutine acc_set_device_num_h (n, d) ++ import acc_device_kind ++ integer n ++ integer (acc_device_kind) d ++ end subroutine ++ end interface ++ ++ interface acc_get_device_num ++ function acc_get_device_num_h (d) ++ import acc_device_kind ++ integer acc_get_device_num_h ++ integer (acc_device_kind) d ++ end function ++ end interface ++ ++ interface acc_async_test ++ function acc_async_test_h (a) ++ logical acc_async_test_h ++ integer a ++ end function ++ end interface ++ ++ interface acc_async_test_all ++ function acc_async_test_all_h () ++ logical acc_async_test_all_h ++ end function ++ end interface ++ ++ interface acc_wait ++ subroutine acc_wait_h (a) ++ integer a ++ end subroutine ++ end interface ++ ++ interface acc_wait_async ++ subroutine acc_wait_async_h (a1, a2) ++ integer a1, a2 ++ end subroutine ++ end interface ++ ++ interface acc_wait_all ++ subroutine acc_wait_all_h () ++ end subroutine ++ end interface ++ ++ interface acc_wait_all_async ++ subroutine acc_wait_all_async_h (a) ++ integer a ++ end subroutine ++ end interface ++ ++ interface acc_init ++ subroutine acc_init_h (devicetype) ++ import acc_device_kind ++ integer (acc_device_kind) devicetype ++ end subroutine ++ end interface ++ ++ interface acc_shutdown ++ subroutine acc_shutdown_h (devicetype) ++ import acc_device_kind ++ integer (acc_device_kind) devicetype ++ end subroutine ++ end interface ++ ++ interface acc_on_device ++ function acc_on_device_h (devicetype) ++ import acc_device_kind ++ logical acc_on_device_h ++ integer (acc_device_kind) devicetype ++ end function ++ end interface ++ ++ ! acc_malloc: Only available in C/C++ ++ ! 
acc_free: Only available in C/C++ ++ ++ interface acc_copyin ++ subroutine acc_copyin_32_h (a, len) ++ use iso_c_binding, only: c_int32_t ++ !GCC$ ATTRIBUTES NO_ARG_CHECK :: a ++ type (*), dimension (*) :: a ++ integer (c_int32_t) len ++ end subroutine ++ ++ subroutine acc_copyin_64_h (a, len) ++ use iso_c_binding, only: c_int64_t ++ !GCC$ ATTRIBUTES NO_ARG_CHECK :: a ++ type (*), dimension (*) :: a ++ integer (c_int64_t) len ++ end subroutine ++ ++ subroutine acc_copyin_array_h (a) ++ type (*), dimension (..), contiguous :: a ++ end subroutine ++ end interface ++ ++ interface acc_present_or_copyin ++ subroutine acc_present_or_copyin_32_h (a, len) ++ use iso_c_binding, only: c_int32_t ++ !GCC$ ATTRIBUTES NO_ARG_CHECK :: a ++ type (*), dimension (*) :: a ++ integer (c_int32_t) len ++ end subroutine ++ ++ subroutine acc_present_or_copyin_64_h (a, len) ++ use iso_c_binding, only: c_int64_t ++ !GCC$ ATTRIBUTES NO_ARG_CHECK :: a ++ type (*), dimension (*) :: a ++ integer (c_int64_t) len ++ end subroutine ++ ++ subroutine acc_present_or_copyin_array_h (a) ++ type (*), dimension (..), contiguous :: a ++ end subroutine ++ end interface ++ ++ interface acc_pcopyin ++ subroutine acc_pcopyin_32_h (a, len) ++ use iso_c_binding, only: c_int32_t ++ !GCC$ ATTRIBUTES NO_ARG_CHECK :: a ++ type (*), dimension (*) :: a ++ integer (c_int32_t) len ++ end subroutine ++ ++ subroutine acc_pcopyin_64_h (a, len) ++ use iso_c_binding, only: c_int64_t ++ !GCC$ ATTRIBUTES NO_ARG_CHECK :: a ++ type (*), dimension (*) :: a ++ integer (c_int64_t) len ++ end subroutine ++ ++ subroutine acc_pcopyin_array_h (a) ++ type (*), dimension (..), contiguous :: a ++ end subroutine ++ end interface ++ ++ interface acc_create ++ subroutine acc_create_32_h (a, len) ++ use iso_c_binding, only: c_int32_t ++ !GCC$ ATTRIBUTES NO_ARG_CHECK :: a ++ type (*), dimension (*) :: a ++ integer (c_int32_t) len ++ end subroutine ++ ++ subroutine acc_create_64_h (a, len) ++ use iso_c_binding, only: c_int64_t ++ !GCC$ ATTRIBUTES NO_ARG_CHECK :: a ++ type (*), dimension (*) :: a ++ integer (c_int64_t) len ++ end subroutine ++ ++ subroutine acc_create_array_h (a) ++ type (*), dimension (..), contiguous :: a ++ end subroutine ++ end interface ++ ++ interface acc_present_or_create ++ subroutine acc_present_or_create_32_h (a, len) ++ use iso_c_binding, only: c_int32_t ++ !GCC$ ATTRIBUTES NO_ARG_CHECK :: a ++ type (*), dimension (*) :: a ++ integer (c_int32_t) len ++ end subroutine ++ ++ subroutine acc_present_or_create_64_h (a, len) ++ use iso_c_binding, only: c_int64_t ++ !GCC$ ATTRIBUTES NO_ARG_CHECK :: a ++ type (*), dimension (*) :: a ++ integer (c_int64_t) len ++ end subroutine ++ ++ subroutine acc_present_or_create_array_h (a) ++ type (*), dimension (..), contiguous :: a ++ end subroutine ++ end interface ++ ++ interface acc_pcreate ++ subroutine acc_pcreate_32_h (a, len) ++ use iso_c_binding, only: c_int32_t ++ !GCC$ ATTRIBUTES NO_ARG_CHECK :: a ++ type (*), dimension (*) :: a ++ integer (c_int32_t) len ++ end subroutine ++ ++ subroutine acc_pcreate_64_h (a, len) ++ use iso_c_binding, only: c_int64_t ++ !GCC$ ATTRIBUTES NO_ARG_CHECK :: a ++ type (*), dimension (*) :: a ++ integer (c_int64_t) len ++ end subroutine ++ ++ subroutine acc_pcreate_array_h (a) ++ type (*), dimension (..), contiguous :: a ++ end subroutine ++ end interface ++ ++ interface acc_copyout ++ subroutine acc_copyout_32_h (a, len) ++ use iso_c_binding, only: c_int32_t ++ !GCC$ ATTRIBUTES NO_ARG_CHECK :: a ++ type (*), dimension (*) :: a ++ integer (c_int32_t) len ++ end subroutine 
++
++        subroutine acc_copyout_64_h (a, len)
++          use iso_c_binding, only: c_int64_t
++          !GCC$ ATTRIBUTES NO_ARG_CHECK :: a
++          type (*), dimension (*) :: a
++          integer (c_int64_t) len
++        end subroutine
++
++        subroutine acc_copyout_array_h (a)
++          type (*), dimension (..), contiguous :: a
++        end subroutine
++      end interface
++
++      interface acc_delete
++        subroutine acc_delete_32_h (a, len)
++          use iso_c_binding, only: c_int32_t
++          !GCC$ ATTRIBUTES NO_ARG_CHECK :: a
++          type (*), dimension (*) :: a
++          integer (c_int32_t) len
++        end subroutine
++
++        subroutine acc_delete_64_h (a, len)
++          use iso_c_binding, only: c_int64_t
++          !GCC$ ATTRIBUTES NO_ARG_CHECK :: a
++          type (*), dimension (*) :: a
++          integer (c_int64_t) len
++        end subroutine
++
++        subroutine acc_delete_array_h (a)
++          type (*), dimension (..), contiguous :: a
++        end subroutine
++      end interface
++
++      interface acc_update_device
++        subroutine acc_update_device_32_h (a, len)
++          use iso_c_binding, only: c_int32_t
++          !GCC$ ATTRIBUTES NO_ARG_CHECK :: a
++          type (*), dimension (*) :: a
++          integer (c_int32_t) len
++        end subroutine
++
++        subroutine acc_update_device_64_h (a, len)
++          use iso_c_binding, only: c_int64_t
++          !GCC$ ATTRIBUTES NO_ARG_CHECK :: a
++          type (*), dimension (*) :: a
++          integer (c_int64_t) len
++        end subroutine
++
++        subroutine acc_update_device_array_h (a)
++          type (*), dimension (..), contiguous :: a
++        end subroutine
++      end interface
++
++      interface acc_update_self
++        subroutine acc_update_self_32_h (a, len)
++          use iso_c_binding, only: c_int32_t
++          !GCC$ ATTRIBUTES NO_ARG_CHECK :: a
++          type (*), dimension (*) :: a
++          integer (c_int32_t) len
++        end subroutine
++
++        subroutine acc_update_self_64_h (a, len)
++          use iso_c_binding, only: c_int64_t
++          !GCC$ ATTRIBUTES NO_ARG_CHECK :: a
++          type (*), dimension (*) :: a
++          integer (c_int64_t) len
++        end subroutine
++
++        subroutine acc_update_self_array_h (a)
++          type (*), dimension (..), contiguous :: a
++        end subroutine
++      end interface
++
++      ! acc_map_data: Only available in C/C++
++      ! acc_unmap_data: Only available in C/C++
++      ! acc_deviceptr: Only available in C/C++
++      ! acc_hostptr: Only available in C/C++
++
++      interface acc_is_present
++        function acc_is_present_32_h (a, len)
++          use iso_c_binding, only: c_int32_t
++          logical acc_is_present_32_h
++          !GCC$ ATTRIBUTES NO_ARG_CHECK :: a
++          type (*), dimension (*) :: a
++          integer (c_int32_t) len
++        end function
++
++        function acc_is_present_64_h (a, len)
++          use iso_c_binding, only: c_int64_t
++          logical acc_is_present_64_h
++          !GCC$ ATTRIBUTES NO_ARG_CHECK :: a
++          type (*), dimension (*) :: a
++          integer (c_int64_t) len
++        end function
++
++        function acc_is_present_array_h (a)
++          logical acc_is_present_array_h
++          type (*), dimension (..), contiguous :: a
++        end function
++      end interface
++
++      ! acc_memcpy_to_device: Only available in C/C++
++      ! acc_memcpy_from_device: Only available in C/C++
+--- libgomp/gomp-constants.h.jj 2016-07-14 16:02:47.212545826 +0200
++++ libgomp/gomp-constants.h 2016-05-26 21:04:40.000000000 +0200
+@@ -0,0 +1,259 @@
++/* Communication between GCC and libgomp.
++
++   Copyright (C) 2014-2015 Free Software Foundation, Inc.
++
++   Contributed by Mentor Embedded.
++
++   This file is part of the GNU Offloading and Multi Processing Library
++   (libgomp).
++
++   Libgomp is free software; you can redistribute it and/or modify it
++   under the terms of the GNU General Public License as published by
++   the Free Software Foundation; either version 3, or (at your option)
++   any later version.
++
++   Libgomp is distributed in the hope that it will be useful, but WITHOUT ANY
++   WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
++   FOR A PARTICULAR PURPOSE. See the GNU General Public License for
++   more details.
++
++   Under Section 7 of GPL version 3, you are granted additional
++   permissions described in the GCC Runtime Library Exception, version
++   3.1, as published by the Free Software Foundation.
++
++   You should have received a copy of the GNU General Public License and
++   a copy of the GCC Runtime Library Exception along with this program;
++   see the files COPYING3 and COPYING.RUNTIME respectively. If not, see
++   <http://www.gnu.org/licenses/>. */
++
++#ifndef GOMP_CONSTANTS_H
++#define GOMP_CONSTANTS_H 1
++
++/* Memory mapping types. */
++
++/* One byte. */
++#define GOMP_MAP_LAST (1 << 8)
++
++#define GOMP_MAP_FLAG_TO (1 << 0)
++#define GOMP_MAP_FLAG_FROM (1 << 1)
++/* Special map kinds, enumerated starting here. */
++#define GOMP_MAP_FLAG_SPECIAL_0 (1 << 2)
++#define GOMP_MAP_FLAG_SPECIAL_1 (1 << 3)
++#define GOMP_MAP_FLAG_SPECIAL_2 (1 << 4)
++#define GOMP_MAP_FLAG_SPECIAL (GOMP_MAP_FLAG_SPECIAL_1 \
++                               | GOMP_MAP_FLAG_SPECIAL_0)
++/* Flag to force a specific behavior (or else, trigger a run-time error). */
++#define GOMP_MAP_FLAG_FORCE (1 << 7)
++
++enum gomp_map_kind
++  {
++    /* If not already present, allocate. */
++    GOMP_MAP_ALLOC = 0,
++    /* ..., and copy to device. */
++    GOMP_MAP_TO = (GOMP_MAP_ALLOC | GOMP_MAP_FLAG_TO),
++    /* ..., and copy from device. */
++    GOMP_MAP_FROM = (GOMP_MAP_ALLOC | GOMP_MAP_FLAG_FROM),
++    /* ..., and copy to and from device. */
++    GOMP_MAP_TOFROM = (GOMP_MAP_TO | GOMP_MAP_FROM),
++    /* The following kind is an internal only map kind, used for pointer based
++       array sections. OMP_CLAUSE_SIZE for these is not the pointer size,
++       which is implicitly POINTER_SIZE_UNITS, but the bias. */
++    GOMP_MAP_POINTER = (GOMP_MAP_FLAG_SPECIAL_0 | 0),
++    /* Also internal, behaves like GOMP_MAP_TO, but additionally any
++       GOMP_MAP_POINTER records consecutive after it which have addresses
++       falling into that range will not be ignored if GOMP_MAP_TO_PSET wasn't
++       mapped already. */
++    GOMP_MAP_TO_PSET = (GOMP_MAP_FLAG_SPECIAL_0 | 1),
++    /* Must already be present. */
++    GOMP_MAP_FORCE_PRESENT = (GOMP_MAP_FLAG_SPECIAL_0 | 2),
++    /* Deallocate a mapping, without copying from device. */
++    GOMP_MAP_DELETE = (GOMP_MAP_FLAG_SPECIAL_0 | 3),
++    /* Is a device pointer. OMP_CLAUSE_SIZE for these is unused; is implicitly
++       POINTER_SIZE_UNITS. */
++    GOMP_MAP_FORCE_DEVICEPTR = (GOMP_MAP_FLAG_SPECIAL_1 | 0),
++    /* OpenACC device_resident. */
++    GOMP_MAP_DEVICE_RESIDENT = (GOMP_MAP_FLAG_SPECIAL_1 | 1),
++    /* OpenACC link. */
++    GOMP_MAP_LINK = (GOMP_MAP_FLAG_SPECIAL_1 | 2),
++    /* Do not map, copy bits for firstprivate instead. */
++    GOMP_MAP_FIRSTPRIVATE = (GOMP_MAP_FLAG_SPECIAL | 0),
++    /* Similarly, but store the value in the pointer rather than
++       pointed by the pointer. */
++    GOMP_MAP_FIRSTPRIVATE_INT = (GOMP_MAP_FLAG_SPECIAL | 1),
++    /* Pointer translate host address into device address and copy that
++       back to host. */
++    GOMP_MAP_USE_DEVICE_PTR = (GOMP_MAP_FLAG_SPECIAL | 2),
++    /* Allocate a zero length array section. Prefer next non-zero length
++       mapping over previous non-zero length mapping over zero length mapping
++       at the address. If not already mapped, do nothing (and pointer translate
++       to NULL). */
++    GOMP_MAP_ZERO_LEN_ARRAY_SECTION = (GOMP_MAP_FLAG_SPECIAL | 3),
++    /* Allocate. */
++    GOMP_MAP_FORCE_ALLOC = (GOMP_MAP_FLAG_FORCE | GOMP_MAP_ALLOC),
++    /* ..., and copy to device. */
++    GOMP_MAP_FORCE_TO = (GOMP_MAP_FLAG_FORCE | GOMP_MAP_TO),
++    /* ..., and copy from device. */
++    GOMP_MAP_FORCE_FROM = (GOMP_MAP_FLAG_FORCE | GOMP_MAP_FROM),
++    /* ..., and copy to and from device. */
++    GOMP_MAP_FORCE_TOFROM = (GOMP_MAP_FLAG_FORCE | GOMP_MAP_TOFROM),
++    /* If not already present, allocate. And unconditionally copy to
++       device. */
++    GOMP_MAP_ALWAYS_TO = (GOMP_MAP_FLAG_SPECIAL_2 | GOMP_MAP_TO),
++    /* If not already present, allocate. And unconditionally copy from
++       device. */
++    GOMP_MAP_ALWAYS_FROM = (GOMP_MAP_FLAG_SPECIAL_2
++                            | GOMP_MAP_FROM),
++    /* If not already present, allocate. And unconditionally copy to and from
++       device. */
++    GOMP_MAP_ALWAYS_TOFROM = (GOMP_MAP_FLAG_SPECIAL_2
++                              | GOMP_MAP_TOFROM),
++    /* Map a sparse struct; the address is the base of the structure,
++       alignment is its required alignment, and size is the number of
++       adjacent entries that belong to the struct. The adjacent entries
++       should be sorted by increasing address, so it is easy to determine
++       lowest needed address (address of the first adjacent entry) and
++       highest needed address (address of the last adjacent entry plus
++       its size). */
++    GOMP_MAP_STRUCT = (GOMP_MAP_FLAG_SPECIAL_2
++                       | GOMP_MAP_FLAG_SPECIAL | 0),
++    /* On a location of a pointer/reference that is assumed to be already
++       mapped earlier, store the translated address of the preceding mapping.
++       No refcount is bumped by this, and the store is done unconditionally. */
++    GOMP_MAP_ALWAYS_POINTER = (GOMP_MAP_FLAG_SPECIAL_2
++                               | GOMP_MAP_FLAG_SPECIAL | 1),
++    /* Forced deallocation of zero length array section. */
++    GOMP_MAP_DELETE_ZERO_LEN_ARRAY_SECTION
++      = (GOMP_MAP_FLAG_SPECIAL_2
++         | GOMP_MAP_FLAG_SPECIAL | 3),
++    /* Decrement usage count and deallocate if zero. */
++    GOMP_MAP_RELEASE = (GOMP_MAP_FLAG_SPECIAL_2
++                        | GOMP_MAP_DELETE),
++
++    /* Internal to GCC, not used in libgomp. */
++    /* Do not map, but pointer assign a pointer instead. */
++    GOMP_MAP_FIRSTPRIVATE_POINTER = (GOMP_MAP_LAST | 1),
++    /* Do not map, but pointer assign a reference instead. */
++    GOMP_MAP_FIRSTPRIVATE_REFERENCE = (GOMP_MAP_LAST | 2)
++  };
++
++#define GOMP_MAP_COPY_TO_P(X) \
++  (!((X) & GOMP_MAP_FLAG_SPECIAL) \
++   && ((X) & GOMP_MAP_FLAG_TO))
++
++#define GOMP_MAP_COPY_FROM_P(X) \
++  (!((X) & GOMP_MAP_FLAG_SPECIAL) \
++   && ((X) & GOMP_MAP_FLAG_FROM))
++
++#define GOMP_MAP_POINTER_P(X) \
++  ((X) == GOMP_MAP_POINTER)
++
++#define GOMP_MAP_ALWAYS_TO_P(X) \
++  (((X) == GOMP_MAP_ALWAYS_TO) || ((X) == GOMP_MAP_ALWAYS_TOFROM))
++
++#define GOMP_MAP_ALWAYS_FROM_P(X) \
++  (((X) == GOMP_MAP_ALWAYS_FROM) || ((X) == GOMP_MAP_ALWAYS_TOFROM))
++
++#define GOMP_MAP_ALWAYS_P(X) \
++  (GOMP_MAP_ALWAYS_TO_P (X) || ((X) == GOMP_MAP_ALWAYS_FROM))
++
++
++/* Asynchronous behavior. Keep in sync with
++   libgomp/{openacc.h,openacc.f90,openacc_lib.h}:acc_async_t. */
++
++#define GOMP_ASYNC_NOVAL -1
++#define GOMP_ASYNC_SYNC -2
++
++
++/* Device codes. Keep in sync with
++   libgomp/{openacc.h,openacc.f90,openacc_lib.h}:acc_device_t as well as
++   libgomp/libgomp-plugin.h. */
++#define GOMP_DEVICE_NONE 0
++#define GOMP_DEVICE_DEFAULT 1
++#define GOMP_DEVICE_HOST 2
++/* #define GOMP_DEVICE_HOST_NONSHM 3 removed. */
++#define GOMP_DEVICE_NOT_HOST 4
++#define GOMP_DEVICE_NVIDIA_PTX 5
++#define GOMP_DEVICE_INTEL_MIC 6
++#define GOMP_DEVICE_HSA 7
++
++#define GOMP_DEVICE_ICV -1
++#define GOMP_DEVICE_HOST_FALLBACK -2
++
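The flag-based map-kind encoding defined earlier in this header is easier to trust once the bit arithmetic is spelled out. The following is a minimal standalone sketch, not part of the patch: it assumes gomp-constants.h is on the include path, and the file name check-map-kinds.c is hypothetical. It verifies that the composite kinds decompose into the documented flag bits and that the COPY_*_P predicates mask out the special kinds.

/* check-map-kinds.c (hypothetical): sanity-check the map-kind bit layout.
   Build with e.g. "gcc -I. check-map-kinds.c", assuming gomp-constants.h
   from this patch is in the current directory. */
#include <assert.h>
#include "gomp-constants.h"

int
main (void)
{
  /* GOMP_MAP_TOFROM is plain GOMP_MAP_ALLOC plus both copy flags. */
  assert (GOMP_MAP_TOFROM == (GOMP_MAP_FLAG_TO | GOMP_MAP_FLAG_FROM));

  /* GOMP_MAP_FLAG_FORCE only adds bit 7 on top of the base kind. */
  assert ((GOMP_MAP_FORCE_TO & ~GOMP_MAP_FLAG_FORCE) == GOMP_MAP_TO);

  /* The COPY_*_P predicates accept plain and forced kinds, but reject the
     special kinds: GOMP_MAP_TO_PSET carries GOMP_MAP_FLAG_TO, yet fails
     the predicate because one of the special bits is set. */
  assert (GOMP_MAP_COPY_TO_P (GOMP_MAP_TO));
  assert (GOMP_MAP_COPY_TO_P (GOMP_MAP_FORCE_TO));
  assert (!GOMP_MAP_COPY_TO_P (GOMP_MAP_TO_PSET));
  return 0;
}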
++/* GOMP_task/GOMP_taskloop* flags argument. */
++#define GOMP_TASK_FLAG_UNTIED (1 << 0)
++#define GOMP_TASK_FLAG_FINAL (1 << 1)
++#define GOMP_TASK_FLAG_MERGEABLE (1 << 2)
++#define GOMP_TASK_FLAG_DEPEND (1 << 3)
++#define GOMP_TASK_FLAG_PRIORITY (1 << 4)
++#define GOMP_TASK_FLAG_UP (1 << 8)
++#define GOMP_TASK_FLAG_GRAINSIZE (1 << 9)
++#define GOMP_TASK_FLAG_IF (1 << 10)
++#define GOMP_TASK_FLAG_NOGROUP (1 << 11)
++
++/* GOMP_target{_ext,update_ext,enter_exit_data} flags argument. */
++#define GOMP_TARGET_FLAG_NOWAIT (1 << 0)
++#define GOMP_TARGET_FLAG_EXIT_DATA (1 << 1)
++/* Internal to libgomp. */
++#define GOMP_TARGET_FLAG_UPDATE (1U << 31)
++
++/* Versions of libgomp and device-specific plugins. GOMP_VERSION
++   should be incremented whenever an ABI-incompatible change is introduced
++   to the plugin interface defined in libgomp/libgomp.h. */
++#define GOMP_VERSION 1
++#define GOMP_VERSION_NVIDIA_PTX 1
++#define GOMP_VERSION_INTEL_MIC 0
++#define GOMP_VERSION_HSA 0
++
++#define GOMP_VERSION_PACK(LIB, DEV) (((LIB) << 16) | (DEV))
++#define GOMP_VERSION_LIB(PACK) (((PACK) >> 16) & 0xffff)
++#define GOMP_VERSION_DEV(PACK) ((PACK) & 0xffff)
++
++#define GOMP_DIM_GANG 0
++#define GOMP_DIM_WORKER 1
++#define GOMP_DIM_VECTOR 2
++#define GOMP_DIM_MAX 3
++#define GOMP_DIM_MASK(X) (1u << (X))
++
++/* Variadic launch arguments. End of list is marked by a zero. */
++#define GOMP_LAUNCH_DIM 1 /* Launch dimensions, op = mask */
++#define GOMP_LAUNCH_ASYNC 2 /* Async, op = cst val if not MAX */
++#define GOMP_LAUNCH_WAIT 3 /* Waits, op = num waits. */
++#define GOMP_LAUNCH_CODE_SHIFT 28
++#define GOMP_LAUNCH_DEVICE_SHIFT 16
++#define GOMP_LAUNCH_OP_SHIFT 0
++#define GOMP_LAUNCH_PACK(CODE,DEVICE,OP) \
++  (((CODE) << GOMP_LAUNCH_CODE_SHIFT) \
++   | ((DEVICE) << GOMP_LAUNCH_DEVICE_SHIFT) \
++   | ((OP) << GOMP_LAUNCH_OP_SHIFT))
++#define GOMP_LAUNCH_CODE(X) (((X) >> GOMP_LAUNCH_CODE_SHIFT) & 0xf)
++#define GOMP_LAUNCH_DEVICE(X) (((X) >> GOMP_LAUNCH_DEVICE_SHIFT) & 0xfff)
++#define GOMP_LAUNCH_OP(X) (((X) >> GOMP_LAUNCH_OP_SHIFT) & 0xffff)
++#define GOMP_LAUNCH_OP_MAX 0xffff
++
++/* Bitmask to apply in order to find out the intended device of a target
++   argument. */
++#define GOMP_TARGET_ARG_DEVICE_MASK ((1 << 7) - 1)
++/* The target argument is significant for all devices. */
++#define GOMP_TARGET_ARG_DEVICE_ALL 0
++
++/* Flag set when the value is passed in the subsequent element of the
++   argument array. */
++#define GOMP_TARGET_ARG_SUBSEQUENT_PARAM (1 << 7)
++
++/* Bitmask to apply to a target argument to find out the value identifier. */
++#define GOMP_TARGET_ARG_ID_MASK (((1 << 8) - 1) << 8)
++/* Target argument index of NUM_TEAMS. */
++#define GOMP_TARGET_ARG_NUM_TEAMS (1 << 8)
++/* Target argument index of THREAD_LIMIT. */
++#define GOMP_TARGET_ARG_THREAD_LIMIT (2 << 8)
++
++/* If the value is directly embedded in the target argument, it should be
++   at most 16-bit and shifted by this many bits. */
++#define GOMP_TARGET_ARG_VALUE_SHIFT 16
++
++/* HSA specific data structures. */
++
++/* Identifiers of device-specific target arguments. */
++#define GOMP_TARGET_ARG_HSA_KERNEL_ATTRIBUTES (1 << 8)
++
++#endif
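The GOMP_LAUNCH_* macros above pack a code, a device number and an operand into a single int; these are the keyed variadic arguments that GOACC_parallel_keyed consumes until it reaches the zero terminator. A small sketch, not part of the patch and assuming gomp-constants.h is on the include path, shows that the accessors recover exactly the packed fields (the chosen device and queue numbers are arbitrary examples):

/* Round-trip the launch-tag and version pack/unpack macros. */
#include <assert.h>
#include "gomp-constants.h"

int
main (void)
{
  /* An async-launch slot for device 3, async queue 7, as it would appear
     among GOACC_parallel_keyed's variadic arguments. */
  int tag = GOMP_LAUNCH_PACK (GOMP_LAUNCH_ASYNC, 3, 7);

  assert (GOMP_LAUNCH_CODE (tag) == GOMP_LAUNCH_ASYNC);
  assert (GOMP_LAUNCH_DEVICE (tag) == 3);
  assert (GOMP_LAUNCH_OP (tag) == 7);

  /* The same pack/unpack idiom is used for plugin version checks. */
  int ver = GOMP_VERSION_PACK (GOMP_VERSION, GOMP_VERSION_NVIDIA_PTX);
  assert (GOMP_VERSION_LIB (ver) == GOMP_VERSION);
  assert (GOMP_VERSION_DEV (ver) == GOMP_VERSION_NVIDIA_PTX);
  return 0;
}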
+--- libgomp/oacc-mem.c.jj 2016-07-13 16:57:04.433535385 +0200
++++ libgomp/oacc-mem.c 2016-07-14 15:39:44.644631308 +0200
+@@ -0,0 +1,204 @@
++/* OpenACC Runtime initialization routines
++
++   Copyright (C) 2013-2016 Free Software Foundation, Inc.
++
++   Contributed by Mentor Embedded.
++
++   This file is part of the GNU Offloading and Multi Processing Library
++   (libgomp).
++
++   Libgomp is free software; you can redistribute it and/or modify it
++   under the terms of the GNU General Public License as published by
++   the Free Software Foundation; either version 3, or (at your option)
++   any later version.
++
++   Libgomp is distributed in the hope that it will be useful, but WITHOUT ANY
++   WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
++   FOR A PARTICULAR PURPOSE. See the GNU General Public License for
++   more details.
++
++   Under Section 7 of GPL version 3, you are granted additional
++   permissions described in the GCC Runtime Library Exception, version
++   3.1, as published by the Free Software Foundation.
++
++   You should have received a copy of the GNU General Public License and
++   a copy of the GCC Runtime Library Exception along with this program;
++   see the files COPYING3 and COPYING.RUNTIME respectively. If not, see
++   <http://www.gnu.org/licenses/>. */
++
++#include "openacc.h"
++#include "config.h"
++#include "libgomp.h"
++#include "gomp-constants.h"
++#include "oacc-int.h"
++#include <stdint.h>
++#include <string.h>
++#include <assert.h>
++
++/* OpenACC is silent on how memory exhaustion is indicated. We return
++   NULL. */
++
++void *
++acc_malloc (size_t s)
++{
++  if (!s)
++    return NULL;
++
++  goacc_lazy_initialize ();
++  return malloc (s);
++}
++
++/* OpenACC 2.0a (3.2.16) doesn't specify what to do in the event
++   the device address is mapped. We choose to check if it is mapped,
++   and if it is, to unmap it. */
++void
++acc_free (void *d)
++{
++  return free (d);
++}
++
++void
++acc_memcpy_to_device (void *d, void *h, size_t s)
++{
++  memmove (d, h, s);
++}
++
++void
++acc_memcpy_from_device (void *h, void *d, size_t s)
++{
++  memmove (h, d, s);
++}
++
++/* Return the device pointer that corresponds to host data H. Or NULL
++   if no mapping. */
++
++void *
++acc_deviceptr (void *h)
++{
++  goacc_lazy_initialize ();
++  return h;
++}
++
++/* Return the host pointer that corresponds to device data D. Or NULL
++   if no mapping. */
++
++void *
++acc_hostptr (void *d)
++{
++  goacc_lazy_initialize ();
++  return d;
++}
++
++/* Return 1 if host data [H,+S] is present on the device.
*/ ++ ++int ++acc_is_present (void *h, size_t s) ++{ ++ if (!s || !h) ++ return 0; ++ ++ goacc_lazy_initialize (); ++ return h != NULL; ++} ++ ++/* Create a mapping for host [H,+S] -> device [D,+S] */ ++ ++void ++acc_map_data (void *h, void *d, size_t s) ++{ ++ goacc_lazy_initialize (); ++ ++ if (d != h) ++ gomp_fatal ("cannot map data on shared-memory system"); ++} ++ ++void ++acc_unmap_data (void *h) ++{ ++} ++ ++#define FLAG_PRESENT (1 << 0) ++#define FLAG_CREATE (1 << 1) ++#define FLAG_COPY (1 << 2) ++ ++static void * ++present_create_copy (unsigned f, void *h, size_t s) ++{ ++ if (!h || !s) ++ gomp_fatal ("[%p,+%d] is a bad range", (void *)h, (int)s); ++ ++ goacc_lazy_initialize (); ++ return h; ++} ++ ++void * ++acc_create (void *h, size_t s) ++{ ++ return present_create_copy (FLAG_CREATE, h, s); ++} ++ ++void * ++acc_copyin (void *h, size_t s) ++{ ++ return present_create_copy (FLAG_CREATE | FLAG_COPY, h, s); ++} ++ ++void * ++acc_present_or_create (void *h, size_t s) ++{ ++ return present_create_copy (FLAG_PRESENT | FLAG_CREATE, h, s); ++} ++ ++void * ++acc_present_or_copyin (void *h, size_t s) ++{ ++ return present_create_copy (FLAG_PRESENT | FLAG_CREATE | FLAG_COPY, h, s); ++} ++ ++#define FLAG_COPYOUT (1 << 0) ++ ++static void ++delete_copyout (unsigned f, void *h, size_t s, const char *libfnname) ++{ ++} ++ ++void ++acc_delete (void *h , size_t s) ++{ ++ delete_copyout (0, h, s, __FUNCTION__); ++} ++ ++void ++acc_copyout (void *h, size_t s) ++{ ++ delete_copyout (FLAG_COPYOUT, h, s, __FUNCTION__); ++} ++ ++static void ++update_dev_host (int is_dev, void *h, size_t s) ++{ ++ goacc_lazy_initialize (); ++} ++ ++void ++acc_update_device (void *h, size_t s) ++{ ++ update_dev_host (1, h, s); ++} ++ ++void ++acc_update_self (void *h, size_t s) ++{ ++ update_dev_host (0, h, s); ++} ++ ++void ++gomp_acc_insert_pointer (size_t mapnum, void **hostaddrs, size_t *sizes, ++ void *kinds) ++{ ++} ++ ++void ++gomp_acc_remove_pointer (void *h, bool force_copyfrom, int async, int mapnum) ++{ ++} +--- libgomp/oacc-plugin.h.jj 2016-07-13 16:57:13.487423121 +0200 ++++ libgomp/oacc-plugin.h 2016-07-13 16:57:13.487423121 +0200 +@@ -0,0 +1,33 @@ ++/* Copyright (C) 2014-2016 Free Software Foundation, Inc. ++ ++ Contributed by Mentor Embedded. ++ ++ This file is part of the GNU Offloading and Multi Processing Library ++ (libgomp). ++ ++ Libgomp is free software; you can redistribute it and/or modify it ++ under the terms of the GNU General Public License as published by ++ the Free Software Foundation; either version 3, or (at your option) ++ any later version. ++ ++ Libgomp is distributed in the hope that it will be useful, but WITHOUT ANY ++ WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS ++ FOR A PARTICULAR PURPOSE. See the GNU General Public License for ++ more details. ++ ++ Under Section 7 of GPL version 3, you are granted additional ++ permissions described in the GCC Runtime Library Exception, version ++ 3.1, as published by the Free Software Foundation. ++ ++ You should have received a copy of the GNU General Public License and ++ a copy of the GCC Runtime Library Exception along with this program; ++ see the files COPYING3 and COPYING.RUNTIME respectively. If not, see ++ <http://www.gnu.org/licenses/>. 
*/ ++ ++#ifndef OACC_PLUGIN_H ++#define OACC_PLUGIN_H 1 ++ ++extern void GOMP_PLUGIN_async_unmap_vars (void *, int); ++extern void *GOMP_PLUGIN_acc_thread (void); ++ ++#endif +--- libgomp/taskloop.c.jj 2016-07-13 16:57:18.935355570 +0200 ++++ libgomp/taskloop.c 2016-07-13 16:57:18.935355570 +0200 +@@ -0,0 +1,340 @@ ++/* Copyright (C) 2015-2016 Free Software Foundation, Inc. ++ Contributed by Jakub Jelinek <jakub@redhat.com>. ++ ++ This file is part of the GNU Offloading and Multi Processing Library ++ (libgomp). ++ ++ Libgomp is free software; you can redistribute it and/or modify it ++ under the terms of the GNU General Public License as published by ++ the Free Software Foundation; either version 3, or (at your option) ++ any later version. ++ ++ Libgomp is distributed in the hope that it will be useful, but WITHOUT ANY ++ WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS ++ FOR A PARTICULAR PURPOSE. See the GNU General Public License for ++ more details. ++ ++ Under Section 7 of GPL version 3, you are granted additional ++ permissions described in the GCC Runtime Library Exception, version ++ 3.1, as published by the Free Software Foundation. ++ ++ You should have received a copy of the GNU General Public License and ++ a copy of the GCC Runtime Library Exception along with this program; ++ see the files COPYING3 and COPYING.RUNTIME respectively. If not, see ++ <http://www.gnu.org/licenses/>. */ ++ ++/* This file handles the taskloop construct. It is included twice, once ++ for the long and once for unsigned long long variant. */ ++ ++/* Called when encountering an explicit task directive. If IF_CLAUSE is ++ false, then we must not delay in executing the task. If UNTIED is true, ++ then the task may be executed by any member of the team. */ ++ ++void ++GOMP_taskloop (void (*fn) (void *), void *data, void (*cpyfn) (void *, void *), ++ long arg_size, long arg_align, unsigned flags, ++ unsigned long num_tasks, int priority, ++ TYPE start, TYPE end, TYPE step) ++{ ++ struct gomp_thread *thr = gomp_thread (); ++ struct gomp_team *team = thr->ts.team; ++ ++#ifdef HAVE_BROKEN_POSIX_SEMAPHORES ++ /* If pthread_mutex_* is used for omp_*lock*, then each task must be ++ tied to one thread all the time. This means UNTIED tasks must be ++ tied and if CPYFN is non-NULL IF(0) must be forced, as CPYFN ++ might be running on different thread than FN. */ ++ if (cpyfn) ++ flags &= ~GOMP_TASK_FLAG_IF; ++ flags &= ~GOMP_TASK_FLAG_UNTIED; ++#endif ++ ++ /* If parallel or taskgroup has been cancelled, don't start new tasks. 
*/ ++ if (team && gomp_team_barrier_cancelled (&team->barrier)) ++ return; ++ ++#ifdef TYPE_is_long ++ TYPE s = step; ++ if (step > 0) ++ { ++ if (start >= end) ++ return; ++ s--; ++ } ++ else ++ { ++ if (start <= end) ++ return; ++ s++; ++ } ++ UTYPE n = (end - start + s) / step; ++#else ++ UTYPE n; ++ if (flags & GOMP_TASK_FLAG_UP) ++ { ++ if (start >= end) ++ return; ++ n = (end - start + step - 1) / step; ++ } ++ else ++ { ++ if (start <= end) ++ return; ++ n = (start - end - step - 1) / -step; ++ } ++#endif ++ ++ TYPE task_step = step; ++ unsigned long nfirst = n; ++ if (flags & GOMP_TASK_FLAG_GRAINSIZE) ++ { ++ unsigned long grainsize = num_tasks; ++#ifdef TYPE_is_long ++ num_tasks = n / grainsize; ++#else ++ UTYPE ndiv = n / grainsize; ++ num_tasks = ndiv; ++ if (num_tasks != ndiv) ++ num_tasks = ~0UL; ++#endif ++ if (num_tasks <= 1) ++ { ++ num_tasks = 1; ++ task_step = end - start; ++ } ++ else if (num_tasks >= grainsize ++#ifndef TYPE_is_long ++ && num_tasks != ~0UL ++#endif ++ ) ++ { ++ UTYPE mul = num_tasks * grainsize; ++ task_step = (TYPE) grainsize * step; ++ if (mul != n) ++ { ++ task_step += step; ++ nfirst = n - mul - 1; ++ } ++ } ++ else ++ { ++ UTYPE div = n / num_tasks; ++ UTYPE mod = n % num_tasks; ++ task_step = (TYPE) div * step; ++ if (mod) ++ { ++ task_step += step; ++ nfirst = mod - 1; ++ } ++ } ++ } ++ else ++ { ++ if (num_tasks == 0) ++ num_tasks = team ? team->nthreads : 1; ++ if (num_tasks >= n) ++ num_tasks = n; ++ else ++ { ++ UTYPE div = n / num_tasks; ++ UTYPE mod = n % num_tasks; ++ task_step = (TYPE) div * step; ++ if (mod) ++ { ++ task_step += step; ++ nfirst = mod - 1; ++ } ++ } ++ } ++ ++ if (flags & GOMP_TASK_FLAG_NOGROUP) ++ { ++ if (thr->task && thr->task->taskgroup && thr->task->taskgroup->cancelled) ++ return; ++ } ++ else ++ ialias_call (GOMP_taskgroup_start) (); ++ ++ if (priority > gomp_max_task_priority_var) ++ priority = gomp_max_task_priority_var; ++ ++ if ((flags & GOMP_TASK_FLAG_IF) == 0 || team == NULL ++ || (thr->task && thr->task->final_task) ++ || team->task_count + num_tasks > 64 * team->nthreads) ++ { ++ unsigned long i; ++ if (__builtin_expect (cpyfn != NULL, 0)) ++ { ++ struct gomp_task task[num_tasks]; ++ struct gomp_task *parent = thr->task; ++ arg_size = (arg_size + arg_align - 1) & ~(arg_align - 1); ++ char buf[num_tasks * arg_size + arg_align - 1]; ++ char *arg = (char *) (((uintptr_t) buf + arg_align - 1) ++ & ~(uintptr_t) (arg_align - 1)); ++ char *orig_arg = arg; ++ for (i = 0; i < num_tasks; i++) ++ { ++ gomp_init_task (&task[i], parent, gomp_icv (false)); ++ task[i].priority = priority; ++ task[i].kind = GOMP_TASK_UNDEFERRED; ++ task[i].final_task = (thr->task && thr->task->final_task) ++ || (flags & GOMP_TASK_FLAG_FINAL); ++ if (thr->task) ++ { ++ task[i].in_tied_task = thr->task->in_tied_task; ++ task[i].taskgroup = thr->task->taskgroup; ++ } ++ thr->task = &task[i]; ++ cpyfn (arg, data); ++ arg += arg_size; ++ } ++ arg = orig_arg; ++ for (i = 0; i < num_tasks; i++) ++ { ++ thr->task = &task[i]; ++ ((TYPE *)arg)[0] = start; ++ start += task_step; ++ ((TYPE *)arg)[1] = start; ++ if (i == nfirst) ++ task_step -= step; ++ fn (arg); ++ arg += arg_size; ++ if (!priority_queue_empty_p (&task[i].children_queue, ++ MEMMODEL_RELAXED)) ++ { ++ gomp_mutex_lock (&team->task_lock); ++ gomp_clear_parent (&task[i].children_queue); ++ gomp_mutex_unlock (&team->task_lock); ++ } ++ gomp_end_task (); ++ } ++ } ++ else ++ for (i = 0; i < num_tasks; i++) ++ { ++ struct gomp_task task; ++ ++ gomp_init_task (&task, thr->task, gomp_icv 
(false)); ++ task.priority = priority; ++ task.kind = GOMP_TASK_UNDEFERRED; ++ task.final_task = (thr->task && thr->task->final_task) ++ || (flags & GOMP_TASK_FLAG_FINAL); ++ if (thr->task) ++ { ++ task.in_tied_task = thr->task->in_tied_task; ++ task.taskgroup = thr->task->taskgroup; ++ } ++ thr->task = &task; ++ ((TYPE *)data)[0] = start; ++ start += task_step; ++ ((TYPE *)data)[1] = start; ++ if (i == nfirst) ++ task_step -= step; ++ fn (data); ++ if (!priority_queue_empty_p (&task.children_queue, ++ MEMMODEL_RELAXED)) ++ { ++ gomp_mutex_lock (&team->task_lock); ++ gomp_clear_parent (&task.children_queue); ++ gomp_mutex_unlock (&team->task_lock); ++ } ++ gomp_end_task (); ++ } ++ } ++ else ++ { ++ struct gomp_task *tasks[num_tasks]; ++ struct gomp_task *parent = thr->task; ++ struct gomp_taskgroup *taskgroup = parent->taskgroup; ++ char *arg; ++ int do_wake; ++ unsigned long i; ++ ++ for (i = 0; i < num_tasks; i++) ++ { ++ struct gomp_task *task ++ = gomp_malloc (sizeof (*task) + arg_size + arg_align - 1); ++ tasks[i] = task; ++ arg = (char *) (((uintptr_t) (task + 1) + arg_align - 1) ++ & ~(uintptr_t) (arg_align - 1)); ++ gomp_init_task (task, parent, gomp_icv (false)); ++ task->priority = priority; ++ task->kind = GOMP_TASK_UNDEFERRED; ++ task->in_tied_task = parent->in_tied_task; ++ task->taskgroup = taskgroup; ++ thr->task = task; ++ if (cpyfn) ++ { ++ cpyfn (arg, data); ++ task->copy_ctors_done = true; ++ } ++ else ++ memcpy (arg, data, arg_size); ++ ((TYPE *)arg)[0] = start; ++ start += task_step; ++ ((TYPE *)arg)[1] = start; ++ if (i == nfirst) ++ task_step -= step; ++ thr->task = parent; ++ task->kind = GOMP_TASK_WAITING; ++ task->fn = fn; ++ task->fn_data = arg; ++ task->final_task = (flags & GOMP_TASK_FLAG_FINAL) >> 1; ++ } ++ gomp_mutex_lock (&team->task_lock); ++ /* If parallel or taskgroup has been cancelled, don't start new ++ tasks. 
*/ ++ if (__builtin_expect ((gomp_team_barrier_cancelled (&team->barrier) ++ || (taskgroup && taskgroup->cancelled)) ++ && cpyfn == NULL, 0)) ++ { ++ gomp_mutex_unlock (&team->task_lock); ++ for (i = 0; i < num_tasks; i++) ++ { ++ gomp_finish_task (tasks[i]); ++ free (tasks[i]); ++ } ++ if ((flags & GOMP_TASK_FLAG_NOGROUP) == 0) ++ ialias_call (GOMP_taskgroup_end) (); ++ return; ++ } ++ if (taskgroup) ++ taskgroup->num_children += num_tasks; ++ for (i = 0; i < num_tasks; i++) ++ { ++ struct gomp_task *task = tasks[i]; ++ priority_queue_insert (PQ_CHILDREN, &parent->children_queue, ++ task, priority, ++ PRIORITY_INSERT_BEGIN, ++ /*last_parent_depends_on=*/false, ++ task->parent_depends_on); ++ if (taskgroup) ++ priority_queue_insert (PQ_TASKGROUP, &taskgroup->taskgroup_queue, ++ task, priority, PRIORITY_INSERT_BEGIN, ++ /*last_parent_depends_on=*/false, ++ task->parent_depends_on); ++ priority_queue_insert (PQ_TEAM, &team->task_queue, task, priority, ++ PRIORITY_INSERT_END, ++ /*last_parent_depends_on=*/false, ++ task->parent_depends_on); ++ ++team->task_count; ++ ++team->task_queued_count; ++ } ++ gomp_team_barrier_set_task_pending (&team->barrier); ++ if (team->task_running_count + !parent->in_tied_task ++ < team->nthreads) ++ { ++ do_wake = team->nthreads - team->task_running_count ++ - !parent->in_tied_task; ++ if ((unsigned long) do_wake > num_tasks) ++ do_wake = num_tasks; ++ } ++ else ++ do_wake = 0; ++ gomp_mutex_unlock (&team->task_lock); ++ if (do_wake) ++ gomp_team_barrier_wake (&team->barrier, do_wake); ++ } ++ if ((flags & GOMP_TASK_FLAG_NOGROUP) == 0) ++ ialias_call (GOMP_taskgroup_end) (); ++} +--- libgomp/priority_queue.h.jj 2016-07-13 16:57:04.438535323 +0200 ++++ libgomp/priority_queue.h 2016-07-13 16:57:04.438535323 +0200 +@@ -0,0 +1,485 @@ ++/* Copyright (C) 2015-2016 Free Software Foundation, Inc. ++ Contributed by Aldy Hernandez <aldyh@redhat.com>. ++ ++ This file is part of the GNU Offloading and Multi Processing Library ++ (libgomp). ++ ++ Libgomp is free software; you can redistribute it and/or modify it ++ under the terms of the GNU General Public License as published by ++ the Free Software Foundation; either version 3, or (at your option) ++ any later version. ++ ++ Libgomp is distributed in the hope that it will be useful, but WITHOUT ANY ++ WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS ++ FOR A PARTICULAR PURPOSE. See the GNU General Public License for ++ more details. ++ ++ Under Section 7 of GPL version 3, you are granted additional ++ permissions described in the GCC Runtime Library Exception, version ++ 3.1, as published by the Free Software Foundation. ++ ++ You should have received a copy of the GNU General Public License and ++ a copy of the GCC Runtime Library Exception along with this program; ++ see the files COPYING3 and COPYING.RUNTIME respectively. If not, see ++ <http://www.gnu.org/licenses/>. */ ++ ++/* Header file for a priority queue of GOMP tasks. */ ++ ++/* ?? Perhaps all the priority_tree_* functions are complex and rare ++ enough to go out-of-line and be moved to priority_queue.c. ?? */ ++ ++#ifndef _PRIORITY_QUEUE_H_ ++#define _PRIORITY_QUEUE_H_ ++ ++/* One task. */ ++ ++struct priority_node ++{ ++ /* Next and previous chains in a circular doubly linked list for ++ tasks within this task's priority. */ ++ struct priority_node *next, *prev; ++}; ++ ++/* All tasks within the same priority. */ ++ ++struct priority_list ++{ ++ /* Priority of the tasks in this set. */ ++ int priority; ++ ++ /* Tasks. 
*/
++  struct priority_node *tasks;
++
++  /* This points to the last of the higher priority WAITING tasks.
++     Remember that for the children queue, we have:
++
++       parent_depends_on WAITING tasks.
++       !parent_depends_on WAITING tasks.
++       TIED tasks.
++
++     This is a pointer to the last of the parent_depends_on WAITING
++     tasks which are, essentially, higher priority items within their
++     priority. */
++  struct priority_node *last_parent_depends_on;
++};
++
++/* Another splay tree instantiation, for priority_list's. */
++typedef struct prio_splay_tree_node_s *prio_splay_tree_node;
++typedef struct prio_splay_tree_s *prio_splay_tree;
++typedef struct prio_splay_tree_key_s *prio_splay_tree_key;
++struct prio_splay_tree_key_s {
++  /* This structure must only contain a priority_list, as we cast
++     prio_splay_tree_key to priority_list throughout. */
++  struct priority_list l;
++};
++#define splay_tree_prefix prio
++#include "splay-tree.h"
++
++/* The entry point into a priority queue of tasks.
++
++   There are two alternate implementations with which to store tasks:
++   as a balanced tree of sorts, or as a simple list of tasks. If
++   there are only priority-0 items (ROOT is NULL), we use the simple
++   list, otherwise (ROOT is non-NULL) we use the tree. */
++
++struct priority_queue
++{
++  /* If t.root != NULL, this is a splay tree of priority_lists to hold
++     all tasks. This is only used if multiple priorities are in play,
++     otherwise we use the priority_list `l' below to hold all
++     (priority-0) tasks. */
++  struct prio_splay_tree_s t;
++
++  /* If T above is NULL, only priority-0 items exist, so keep them
++     in a simple list. */
++  struct priority_list l;
++};
++
++enum priority_insert_type {
++  /* Insert at the beginning of a priority list. */
++  PRIORITY_INSERT_BEGIN,
++  /* Insert at the end of a priority list. */
++  PRIORITY_INSERT_END
++};
++
++/* Used to determine in which queue a given priority node belongs.
++   See pnode field of gomp_task. */
++
++enum priority_queue_type
++{
++  PQ_TEAM,       /* Node belongs in gomp_team's task_queue. */
++  PQ_CHILDREN,   /* Node belongs in parent's children_queue. */
++  PQ_TASKGROUP,  /* Node belongs in taskgroup->taskgroup_queue. */
++  PQ_IGNORED = 999
++};
++
++/* Priority queue implementation prototypes. */
++
++extern bool priority_queue_task_in_queue_p (enum priority_queue_type,
++                                            struct priority_queue *,
++                                            struct gomp_task *);
++extern void priority_queue_dump (enum priority_queue_type,
++                                 struct priority_queue *);
++extern void priority_queue_verify (enum priority_queue_type,
++                                   struct priority_queue *, bool);
++extern void priority_tree_remove (enum priority_queue_type,
++                                  struct priority_queue *,
++                                  struct priority_node *);
++extern struct gomp_task *priority_tree_next_task (enum priority_queue_type,
++                                                  struct priority_queue *,
++                                                  enum priority_queue_type,
++                                                  struct priority_queue *,
++                                                  bool *);
++
++/* Return TRUE if there is more than one priority in HEAD. This is
++   used throughout to choose between the fast path (priority 0 only
++   items) and a world with multiple priorities. */
++
++static inline bool
++priority_queue_multi_p (struct priority_queue *head)
++{
++  return __builtin_expect (head->t.root != NULL, 0);
++}
++
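The two-tier scheme above, a plain circular list while only priority-0 tasks exist and a splay tree of per-priority lists otherwise, can be illustrated outside libgomp. The sketch below is standalone and not libgomp code; all toy_* names are invented, a bare counter stands in for the priority-0 list, and a sorted linked list of (priority, count) buckets stands in for the splay tree. It mimics the same escalation rule: stay on the cheap representation until the first non-zero priority arrives, then migrate the priority-0 items, as priority_tree_insert does further down.

#include <assert.h>
#include <stdlib.h>

struct toy_bucket { int priority; int count; struct toy_bucket *next; };
struct toy_queue { struct toy_bucket *tree; int prio0; };

static struct toy_bucket *
toy_bucket_new (int priority, int count, struct toy_bucket *next)
{
  struct toy_bucket *b = malloc (sizeof *b);
  b->priority = priority;
  b->count = count;
  b->next = next;
  return b;
}

static void
toy_insert (struct toy_queue *q, int priority)
{
  if (q->tree == NULL)
    {
      if (priority == 0)
        {
          q->prio0++;   /* Fast path: no allocation, no tree. */
          return;
        }
      if (q->prio0)
        {
          /* First non-zero priority: migrate the priority-0 items. */
          q->tree = toy_bucket_new (0, q->prio0, NULL);
          q->prio0 = 0;
        }
    }
  /* Find or create the bucket for PRIORITY, kept sorted descending. */
  struct toy_bucket **p = &q->tree;
  while (*p && (*p)->priority > priority)
    p = &(*p)->next;
  if (*p && (*p)->priority == priority)
    (*p)->count++;
  else
    *p = toy_bucket_new (priority, 1, *p);
}

int
main (void)
{
  struct toy_queue q = { NULL, 0 };
  toy_insert (&q, 0);
  toy_insert (&q, 0);
  assert (q.tree == NULL && q.prio0 == 2);  /* Still on the fast path. */
  toy_insert (&q, 5);                       /* Escalates to the tree. */
  assert (q.tree && q.tree->priority == 5);
  assert (q.tree->next && q.tree->next->priority == 0
          && q.tree->next->count == 2);
  return 0;
}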
++/* Initialize a priority queue. */
++
++static inline void
++priority_queue_init (struct priority_queue *head)
++{
++  head->t.root = NULL;
++  /* To save a few microseconds, we don't initialize head->l.priority
++     to 0 here. It is implied that priority will be 0 if head->t.root
++     == NULL.
++
++     priority_tree_insert() will fix this when we encounter multiple
++     priorities. */
++  head->l.tasks = NULL;
++  head->l.last_parent_depends_on = NULL;
++}
++
++static inline void
++priority_queue_free (struct priority_queue *head)
++{
++  /* There's nothing to do, as tasks were freed as they were removed
++     in priority_queue_remove. */
++}
++
++/* Forward declarations. */
++static inline size_t priority_queue_offset (enum priority_queue_type);
++static inline struct gomp_task *priority_node_to_task
++                                (enum priority_queue_type,
++                                 struct priority_node *);
++static inline struct priority_node *task_to_priority_node
++                                    (enum priority_queue_type,
++                                     struct gomp_task *);
++
++/* Return TRUE if priority queue HEAD is empty.
++
++   MODEL is MEMMODEL_ACQUIRE if we should use an acquire atomic to
++   read from the root of the queue, otherwise MEMMODEL_RELAXED if we
++   should use a plain load. */
++
++static inline _Bool
++priority_queue_empty_p (struct priority_queue *head, enum memmodel model)
++{
++  /* Note: The acquire barriers on the loads here synchronize with
++     the write of a NULL in gomp_task_run_post_remove_parent. It is
++     not necessary that we synchronize with other non-NULL writes at
++     this point, but we must ensure that all writes to memory by a
++     child thread task work function are seen before we exit from
++     GOMP_taskwait. */
++  if (priority_queue_multi_p (head))
++    {
++      if (model == MEMMODEL_ACQUIRE)
++        return __atomic_load_n (&head->t.root, MEMMODEL_ACQUIRE) == NULL;
++      return head->t.root == NULL;
++    }
++  if (model == MEMMODEL_ACQUIRE)
++    return __atomic_load_n (&head->l.tasks, MEMMODEL_ACQUIRE) == NULL;
++  return head->l.tasks == NULL;
++}
++
++/* Look for a given PRIORITY in HEAD. Return it if found, otherwise
++   return NULL. This only applies to the tree variant in HEAD. There
++   is no point in searching for priorities in HEAD->L. */
++
++static inline struct priority_list *
++priority_queue_lookup_priority (struct priority_queue *head, int priority)
++{
++  if (head->t.root == NULL)
++    return NULL;
++  struct prio_splay_tree_key_s k;
++  k.l.priority = priority;
++  return (struct priority_list *)
++    prio_splay_tree_lookup (&head->t, &k);
++}
++
++/* Insert TASK, with PRIORITY, into the priority list LIST.
++   LIST contains items of type TYPE.
++
++   If POS is PRIORITY_INSERT_BEGIN, the new task is inserted at the
++   top of its respective priority. If POS is PRIORITY_INSERT_END, the
++   task is inserted at the end of its priority.
++
++   If ADJUST_PARENT_DEPENDS_ON is TRUE, LIST is a children queue, and
++   we must keep track of higher and lower priority WAITING tasks by
++   keeping the queue's last_parent_depends_on field accurate. This
++   only applies to the children queue, and the caller must ensure LIST
++   is a children queue in this case.
++
++   If ADJUST_PARENT_DEPENDS_ON is TRUE, TASK_IS_PARENT_DEPENDS_ON is
++   set to the task's parent_depends_on field. If
++   ADJUST_PARENT_DEPENDS_ON is FALSE, this field is irrelevant.
++
++   The new priority_node is linked into LIST in place.
*/ ++ ++static inline void ++priority_list_insert (enum priority_queue_type type, ++ struct priority_list *list, ++ struct gomp_task *task, ++ int priority, ++ enum priority_insert_type pos, ++ bool adjust_parent_depends_on, ++ bool task_is_parent_depends_on) ++{ ++ struct priority_node *node = task_to_priority_node (type, task); ++ if (list->tasks) ++ { ++ /* If we are keeping track of higher/lower priority items, ++ but this is a lower priority WAITING task ++ (parent_depends_on != NULL), put it after all ready to ++ run tasks. See the comment in ++ priority_queue_upgrade_task for a visual on how tasks ++ should be organized. */ ++ if (adjust_parent_depends_on ++ && pos == PRIORITY_INSERT_BEGIN ++ && list->last_parent_depends_on ++ && !task_is_parent_depends_on) ++ { ++ struct priority_node *last_parent_depends_on ++ = list->last_parent_depends_on; ++ node->next = last_parent_depends_on->next; ++ node->prev = last_parent_depends_on; ++ } ++ /* Otherwise, put it at the top/bottom of the queue. */ ++ else ++ { ++ node->next = list->tasks; ++ node->prev = list->tasks->prev; ++ if (pos == PRIORITY_INSERT_BEGIN) ++ list->tasks = node; ++ } ++ node->next->prev = node; ++ node->prev->next = node; ++ } ++ else ++ { ++ node->next = node; ++ node->prev = node; ++ list->tasks = node; ++ } ++ if (adjust_parent_depends_on ++ && list->last_parent_depends_on == NULL ++ && task_is_parent_depends_on) ++ list->last_parent_depends_on = node; ++} ++ ++/* Tree version of priority_list_insert. */ ++ ++static inline void ++priority_tree_insert (enum priority_queue_type type, ++ struct priority_queue *head, ++ struct gomp_task *task, ++ int priority, ++ enum priority_insert_type pos, ++ bool adjust_parent_depends_on, ++ bool task_is_parent_depends_on) ++{ ++ if (__builtin_expect (head->t.root == NULL, 0)) ++ { ++ /* The first time around, transfer any priority 0 items to the ++ tree. */ ++ if (head->l.tasks != NULL) ++ { ++ prio_splay_tree_node k = gomp_malloc (sizeof (*k)); ++ k->left = NULL; ++ k->right = NULL; ++ k->key.l.priority = 0; ++ k->key.l.tasks = head->l.tasks; ++ k->key.l.last_parent_depends_on = head->l.last_parent_depends_on; ++ prio_splay_tree_insert (&head->t, k); ++ head->l.tasks = NULL; ++ } ++ } ++ struct priority_list *list ++ = priority_queue_lookup_priority (head, priority); ++ if (!list) ++ { ++ prio_splay_tree_node k = gomp_malloc (sizeof (*k)); ++ k->left = NULL; ++ k->right = NULL; ++ k->key.l.priority = priority; ++ k->key.l.tasks = NULL; ++ k->key.l.last_parent_depends_on = NULL; ++ prio_splay_tree_insert (&head->t, k); ++ list = &k->key.l; ++ } ++ priority_list_insert (type, list, task, priority, pos, ++ adjust_parent_depends_on, ++ task_is_parent_depends_on); ++} ++ ++/* Generic version of priority_*_insert. 
*/ ++ ++static inline void ++priority_queue_insert (enum priority_queue_type type, ++ struct priority_queue *head, ++ struct gomp_task *task, ++ int priority, ++ enum priority_insert_type pos, ++ bool adjust_parent_depends_on, ++ bool task_is_parent_depends_on) ++{ ++#if _LIBGOMP_CHECKING_ ++ if (priority_queue_task_in_queue_p (type, head, task)) ++ gomp_fatal ("Attempt to insert existing task %p", task); ++#endif ++ if (priority_queue_multi_p (head) || __builtin_expect (priority > 0, 0)) ++ priority_tree_insert (type, head, task, priority, pos, ++ adjust_parent_depends_on, ++ task_is_parent_depends_on); ++ else ++ priority_list_insert (type, &head->l, task, priority, pos, ++ adjust_parent_depends_on, ++ task_is_parent_depends_on); ++} ++ ++/* If multiple priorities are in play, return the highest priority ++ task from within Q1 and Q2, while giving preference to tasks from ++ Q1. If the returned task is chosen from Q1, *Q1_CHOSEN_P is set to ++ TRUE, otherwise it is set to FALSE. ++ ++ If multiple priorities are not in play (only 0 priorities are ++ available), the next task is chosen exclusively from Q1. ++ ++ As a special case, Q2 can be NULL, in which case, we just choose ++ the highest priority WAITING task in Q1. This is an optimization ++ to speed up looking through only one queue. ++ ++ We assume Q1 has at least one item. */ ++ ++static inline struct gomp_task * ++priority_queue_next_task (enum priority_queue_type t1, ++ struct priority_queue *q1, ++ enum priority_queue_type t2, ++ struct priority_queue *q2, ++ bool *q1_chosen_p) ++{ ++#if _LIBGOMP_CHECKING_ ++ if (priority_queue_empty_p (q1, MEMMODEL_RELAXED)) ++ gomp_fatal ("priority_queue_next_task: Q1 is empty"); ++#endif ++ if (priority_queue_multi_p (q1)) ++ { ++ struct gomp_task *t ++ = priority_tree_next_task (t1, q1, t2, q2, q1_chosen_p); ++ /* If T is NULL, there are no WAITING tasks in Q1. In which ++ case, return any old (non-waiting) task which will cause the ++ caller to do the right thing when checking T->KIND == ++ GOMP_TASK_WAITING. */ ++ if (!t) ++ { ++#if _LIBGOMP_CHECKING_ ++ if (*q1_chosen_p == false) ++ gomp_fatal ("priority_queue_next_task inconsistency"); ++#endif ++ return priority_node_to_task (t1, q1->t.root->key.l.tasks); ++ } ++ return t; ++ } ++ else ++ { ++ *q1_chosen_p = true; ++ return priority_node_to_task (t1, q1->l.tasks); ++ } ++} ++ ++/* Remove NODE from LIST. ++ ++ If we are removing the one and only item in the list, and MODEL is ++ MEMMODEL_RELEASE, use an atomic release to clear the list. ++ ++ If the list becomes empty after the remove, return TRUE. */ ++ ++static inline bool ++priority_list_remove (struct priority_list *list, ++ struct priority_node *node, ++ enum memmodel model) ++{ ++ bool empty = false; ++ node->prev->next = node->next; ++ node->next->prev = node->prev; ++ if (list->tasks == node) ++ { ++ if (node->next != node) ++ list->tasks = node->next; ++ else ++ { ++ /* We access task->children in GOMP_taskwait outside of ++ the task lock mutex region, so need a release barrier ++ here to ensure memory written by child_task->fn above ++ is flushed before the NULL is written. */ ++ if (model == MEMMODEL_RELEASE) ++ __atomic_store_n (&list->tasks, NULL, MEMMODEL_RELEASE); ++ else ++ list->tasks = NULL; ++ empty = true; ++ goto remove_out; ++ } ++ } ++remove_out: ++#if _LIBGOMP_CHECKING_ ++ memset (node, 0xaf, sizeof (*node)); ++#endif ++ return empty; ++} ++ ++/* This is the generic version of priority_list_remove. ++ ++ Remove NODE from priority queue HEAD. 
HEAD contains tasks of type TYPE. ++ ++ If we are removing the one and only item in the priority queue and ++ MODEL is MEMMODEL_RELEASE, use an atomic release to clear the queue. ++ ++ If the queue becomes empty after the remove, return TRUE. */ ++ ++static inline bool ++priority_queue_remove (enum priority_queue_type type, ++ struct priority_queue *head, ++ struct gomp_task *task, ++ enum memmodel model) ++{ ++#if _LIBGOMP_CHECKING_ ++ if (!priority_queue_task_in_queue_p (type, head, task)) ++ gomp_fatal ("Attempt to remove missing task %p", task); ++#endif ++ if (priority_queue_multi_p (head)) ++ { ++ priority_tree_remove (type, head, task_to_priority_node (type, task)); ++ if (head->t.root == NULL) ++ { ++ if (model == MEMMODEL_RELEASE) ++ /* Errr, we store NULL twice, the alternative would be to ++ use an atomic release directly in the splay tree ++ routines. Worth it? */ ++ __atomic_store_n (&head->t.root, NULL, MEMMODEL_RELEASE); ++ return true; ++ } ++ return false; ++ } ++ else ++ return priority_list_remove (&head->l, ++ task_to_priority_node (type, task), model); ++} ++ ++#endif /* _PRIORITY_QUEUE_H_ */ +--- libgomp/priority_queue.c.jj 2016-07-13 16:57:04.435535360 +0200 ++++ libgomp/priority_queue.c 2016-07-13 16:57:04.435535360 +0200 +@@ -0,0 +1,300 @@ ++/* Copyright (C) 2015-2016 Free Software Foundation, Inc. ++ Contributed by Aldy Hernandez <aldyh@redhat.com>. ++ ++ This file is part of the GNU Offloading and Multi Processing Library ++ (libgomp). ++ ++ Libgomp is free software; you can redistribute it and/or modify it ++ under the terms of the GNU General Public License as published by ++ the Free Software Foundation; either version 3, or (at your option) ++ any later version. ++ ++ Libgomp is distributed in the hope that it will be useful, but WITHOUT ANY ++ WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS ++ FOR A PARTICULAR PURPOSE. See the GNU General Public License for ++ more details. ++ ++ Under Section 7 of GPL version 3, you are granted additional ++ permissions described in the GCC Runtime Library Exception, version ++ 3.1, as published by the Free Software Foundation. ++ ++ You should have received a copy of the GNU General Public License and ++ a copy of the GCC Runtime Library Exception along with this program; ++ see the files COPYING3 and COPYING.RUNTIME respectively. If not, see ++ <http://www.gnu.org/licenses/>. */ ++ ++/* Priority queue implementation of GOMP tasks. */ ++ ++#include "libgomp.h" ++ ++#if _LIBGOMP_CHECKING_ ++#include <stdio.h> ++ ++/* Sanity check to verify whether a TASK is in LIST. Return TRUE if ++ found, FALSE otherwise. ++ ++ TYPE is the type of priority queue this task resides in. */ ++ ++static inline bool ++priority_queue_task_in_list_p (enum priority_queue_type type, ++ struct priority_list *list, ++ struct gomp_task *task) ++{ ++ struct priority_node *p = list->tasks; ++ do ++ { ++ if (priority_node_to_task (type, p) == task) ++ return true; ++ p = p->next; ++ } ++ while (p != list->tasks); ++ return false; ++} ++ ++/* Tree version of priority_queue_task_in_list_p. */ ++ ++static inline bool ++priority_queue_task_in_tree_p (enum priority_queue_type type, ++ struct priority_queue *head, ++ struct gomp_task *task) ++{ ++ struct priority_list *list ++ = priority_queue_lookup_priority (head, task->priority); ++ if (!list) ++ return false; ++ return priority_queue_task_in_list_p (type, list, task); ++} ++ ++/* Generic version of priority_queue_task_in_list_p that works for ++ trees or lists. 
*/ ++ ++bool ++priority_queue_task_in_queue_p (enum priority_queue_type type, ++ struct priority_queue *head, ++ struct gomp_task *task) ++{ ++ if (priority_queue_empty_p (head, MEMMODEL_RELAXED)) ++ return false; ++ if (priority_queue_multi_p (head)) ++ return priority_queue_task_in_tree_p (type, head, task); ++ else ++ return priority_queue_task_in_list_p (type, &head->l, task); ++} ++ ++/* Sanity check LIST to make sure the tasks therein are in the right ++ order. LIST is a priority list of type TYPE. ++ ++ The expected order is that GOMP_TASK_WAITING tasks come before ++ GOMP_TASK_TIED/GOMP_TASK_ASYNC_RUNNING ones. ++ ++ If CHECK_DEPS is TRUE, we also check that parent_depends_on WAITING ++ tasks come before !parent_depends_on WAITING tasks. This is only ++ applicable to the children queue, and the caller is expected to ++ ensure that we are verifying the children queue. */ ++ ++static void ++priority_list_verify (enum priority_queue_type type, ++ struct priority_list *list, bool check_deps) ++{ ++ bool seen_tied = false; ++ bool seen_plain_waiting = false; ++ struct priority_node *p = list->tasks; ++ while (1) ++ { ++ struct gomp_task *t = priority_node_to_task (type, p); ++ if (seen_tied && t->kind == GOMP_TASK_WAITING) ++ gomp_fatal ("priority_queue_verify: WAITING task after TIED"); ++ if (t->kind >= GOMP_TASK_TIED) ++ seen_tied = true; ++ else if (check_deps && t->kind == GOMP_TASK_WAITING) ++ { ++ if (t->parent_depends_on) ++ { ++ if (seen_plain_waiting) ++ gomp_fatal ("priority_queue_verify: " ++ "parent_depends_on after !parent_depends_on"); ++ } ++ else ++ seen_plain_waiting = true; ++ } ++ p = p->next; ++ if (p == list->tasks) ++ break; ++ } ++} ++ ++/* Callback type for priority_tree_verify_callback. */ ++struct cbtype ++{ ++ enum priority_queue_type type; ++ bool check_deps; ++}; ++ ++/* Verify every task in NODE. ++ ++ Callback for splay_tree_foreach. */ ++ ++static void ++priority_tree_verify_callback (prio_splay_tree_key key, void *data) ++{ ++ struct cbtype *cb = (struct cbtype *) data; ++ priority_list_verify (cb->type, &key->l, cb->check_deps); ++} ++ ++/* Generic version of priority_list_verify. ++ ++ Sanity check HEAD to make sure the tasks therein are in the right ++ order. The priority_queue holds tasks of type TYPE. ++ ++ If CHECK_DEPS is TRUE, we also check that parent_depends_on WAITING ++ tasks come before !parent_depends_on WAITING tasks. This is only ++ applicable to the children queue, and the caller is expected to ++ ensure that we are verifying the children queue. */ ++ ++void ++priority_queue_verify (enum priority_queue_type type, ++ struct priority_queue *head, bool check_deps) ++{ ++ if (priority_queue_empty_p (head, MEMMODEL_RELAXED)) ++ return; ++ if (priority_queue_multi_p (head)) ++ { ++ struct cbtype cb = { type, check_deps }; ++ prio_splay_tree_foreach (&head->t, ++ priority_tree_verify_callback, &cb); ++ } ++ else ++ priority_list_verify (type, &head->l, check_deps); ++} ++#endif /* _LIBGOMP_CHECKING_ */ ++ ++/* Remove NODE from priority queue HEAD, wherever it may be inside the ++ tree. HEAD contains tasks of type TYPE. */ ++ ++void ++priority_tree_remove (enum priority_queue_type type, ++ struct priority_queue *head, ++ struct priority_node *node) ++{ ++ /* ?? The only reason this function is not inlined is because we ++ need to find the priority within gomp_task (which has not been ++ completely defined in the header file). 
If the lack of inlining ++ is a concern, we could pass the priority number as a ++ parameter, or we could move this to libgomp.h. */ ++ int priority = priority_node_to_task (type, node)->priority; ++ ++ /* ?? We could avoid this lookup by keeping a pointer to the key in ++ the priority_node. */ ++ struct priority_list *list ++ = priority_queue_lookup_priority (head, priority); ++#if _LIBGOMP_CHECKING_ ++ if (!list) ++ gomp_fatal ("Unable to find priority %d", priority); ++#endif ++ /* If NODE was the last in its priority, clean up the priority. */ ++ if (priority_list_remove (list, node, MEMMODEL_RELAXED)) ++ { ++ prio_splay_tree_remove (&head->t, (prio_splay_tree_key) list); ++ list->tasks = NULL; ++#if _LIBGOMP_CHECKING_ ++ memset (list, 0xaf, sizeof (*list)); ++#endif ++ free (list); ++ } ++} ++ ++/* Return the highest priority WAITING task in a splay tree NODE. If ++ there are no WAITING tasks available, return NULL. ++ ++ NODE is a priority list containing tasks of type TYPE. ++ ++ The right most node in a tree contains the highest priority. ++ Recurse down to find such a node. If the task at that max node is ++ not WAITING, bubble back up and look at the remaining tasks ++ in-order. */ ++ ++static struct gomp_task * ++priority_tree_next_task_1 (enum priority_queue_type type, ++ prio_splay_tree_node node) ++{ ++ again: ++ if (!node) ++ return NULL; ++ struct gomp_task *ret = priority_tree_next_task_1 (type, node->right); ++ if (ret) ++ return ret; ++ ret = priority_node_to_task (type, node->key.l.tasks); ++ if (ret->kind == GOMP_TASK_WAITING) ++ return ret; ++ node = node->left; ++ goto again; ++} ++ ++/* Return the highest priority WAITING task from within Q1 and Q2, ++ while giving preference to tasks from Q1. Q1 is a queue containing ++ items of type TYPE1. Q2 is a queue containing items of type TYPE2. ++ ++ Since we are mostly interested in Q1, if there are no WAITING tasks ++ in Q1, we don't bother checking Q2, and just return NULL. ++ ++ As a special case, Q2 can be NULL, in which case, we just choose ++ the highest priority WAITING task in Q1. This is an optimization ++ to speed up looking through only one queue. ++ ++ If the returned task is chosen from Q1, *Q1_CHOSEN_P is set to ++ TRUE, otherwise it is set to FALSE. */ ++ ++struct gomp_task * ++priority_tree_next_task (enum priority_queue_type type1, ++ struct priority_queue *q1, ++ enum priority_queue_type type2, ++ struct priority_queue *q2, ++ bool *q1_chosen_p) ++{ ++ struct gomp_task *t1 = priority_tree_next_task_1 (type1, q1->t.root); ++ if (!t1 ++ /* Special optimization when only searching through one queue. */ ++ || !q2) ++ { ++ *q1_chosen_p = true; ++ return t1; ++ } ++ struct gomp_task *t2 = priority_tree_next_task_1 (type2, q2->t.root); ++ if (!t2 || t1->priority > t2->priority) ++ { ++ *q1_chosen_p = true; ++ return t1; ++ } ++ if (t2->priority > t1->priority) ++ { ++ *q1_chosen_p = false; ++ return t2; ++ } ++ /* If we get here, the priorities are the same, so we must look at ++ parent_depends_on to make our decision. */ ++#if _LIBGOMP_CHECKING_ ++ if (t1 != t2) ++ gomp_fatal ("priority_tree_next_task: t1 != t2"); ++#endif ++ if (t2->parent_depends_on && !t1->parent_depends_on) ++ { ++ *q1_chosen_p = false; ++ return t2; ++ } ++ *q1_chosen_p = true; ++ return t1; ++} ++ ++/* Priority splay trees comparison function. */ ++static inline int ++prio_splay_compare (prio_splay_tree_key x, prio_splay_tree_key y) ++{ ++ if (x->l.priority == y->l.priority) ++ return 0; ++ return x->l.priority < y->l.priority ? 
-1 : 1; ++} ++ ++/* Define another splay tree instantiation, for priority_list's. */ ++#define splay_tree_prefix prio ++#define splay_tree_c ++#include "splay-tree.h" +--- libgomp/openacc.f90.jj 2016-07-13 16:57:04.434535373 +0200 ++++ libgomp/openacc.f90 2016-07-14 19:01:54.901230875 +0200 +@@ -0,0 +1,911 @@ ++! OpenACC Runtime Library Definitions. ++ ++! Copyright (C) 2014-2016 Free Software Foundation, Inc. ++ ++! Contributed by Tobias Burnus <burnus@net-b.de> ++! and Mentor Embedded. ++ ++! This file is part of the GNU Offloading and Multi Processing Library ++! (libgomp). ++ ++! Libgomp is free software; you can redistribute it and/or modify it ++! under the terms of the GNU General Public License as published by ++! the Free Software Foundation; either version 3, or (at your option) ++! any later version. ++ ++! Libgomp is distributed in the hope that it will be useful, but WITHOUT ANY ++! WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS ++! FOR A PARTICULAR PURPOSE. See the GNU General Public License for ++! more details. ++ ++! Under Section 7 of GPL version 3, you are granted additional ++! permissions described in the GCC Runtime Library Exception, version ++! 3.1, as published by the Free Software Foundation. ++ ++! You should have received a copy of the GNU General Public License and ++! a copy of the GCC Runtime Library Exception along with this program; ++! see the files COPYING3 and COPYING.RUNTIME respectively. If not, see ++! <http://www.gnu.org/licenses/>. ++ ++module openacc_kinds ++ use iso_fortran_env, only: int32 ++ implicit none ++ ++ private :: int32 ++ public :: acc_device_kind ++ ++ integer, parameter :: acc_device_kind = int32 ++ ++ public :: acc_device_none, acc_device_default, acc_device_host ++ public :: acc_device_not_host, acc_device_nvidia ++ ++ ! Keep in sync with include/gomp-constants.h. ++ integer (acc_device_kind), parameter :: acc_device_none = 0 ++ integer (acc_device_kind), parameter :: acc_device_default = 1 ++ integer (acc_device_kind), parameter :: acc_device_host = 2 ++ ! integer (acc_device_kind), parameter :: acc_device_host_nonshm = 3 removed. ++ integer (acc_device_kind), parameter :: acc_device_not_host = 4 ++ integer (acc_device_kind), parameter :: acc_device_nvidia = 5 ++ ++ public :: acc_handle_kind ++ ++ integer, parameter :: acc_handle_kind = int32 ++ ++ public :: acc_async_noval, acc_async_sync ++ ++ ! Keep in sync with include/gomp-constants.h. 
++ integer (acc_handle_kind), parameter :: acc_async_noval = -1 ++ integer (acc_handle_kind), parameter :: acc_async_sync = -2 ++ ++end module ++ ++module openacc_internal ++ use openacc_kinds ++ implicit none ++ ++ interface ++ function acc_get_num_devices_h (d) ++ import ++ integer acc_get_num_devices_h ++ integer (acc_device_kind) d ++ end function ++ ++ subroutine acc_set_device_type_h (d) ++ import ++ integer (acc_device_kind) d ++ end subroutine ++ ++ function acc_get_device_type_h () ++ import ++ integer (acc_device_kind) acc_get_device_type_h ++ end function ++ ++ subroutine acc_set_device_num_h (n, d) ++ import ++ integer n ++ integer (acc_device_kind) d ++ end subroutine ++ ++ function acc_get_device_num_h (d) ++ import ++ integer acc_get_device_num_h ++ integer (acc_device_kind) d ++ end function ++ ++ function acc_async_test_h (a) ++ logical acc_async_test_h ++ integer a ++ end function ++ ++ function acc_async_test_all_h () ++ logical acc_async_test_all_h ++ end function ++ ++ subroutine acc_wait_h (a) ++ integer a ++ end subroutine ++ ++ subroutine acc_wait_async_h (a1, a2) ++ integer a1, a2 ++ end subroutine ++ ++ subroutine acc_wait_all_h () ++ end subroutine ++ ++ subroutine acc_wait_all_async_h (a) ++ integer a ++ end subroutine ++ ++ subroutine acc_init_h (d) ++ import ++ integer (acc_device_kind) d ++ end subroutine ++ ++ subroutine acc_shutdown_h (d) ++ import ++ integer (acc_device_kind) d ++ end subroutine ++ ++ function acc_on_device_h (d) ++ import ++ integer (acc_device_kind) d ++ logical acc_on_device_h ++ end function ++ ++ subroutine acc_copyin_32_h (a, len) ++ use iso_c_binding, only: c_int32_t ++ type (*), dimension (*) :: a ++ integer (c_int32_t) len ++ end subroutine ++ ++ subroutine acc_copyin_64_h (a, len) ++ use iso_c_binding, only: c_int64_t ++ type (*), dimension (*) :: a ++ integer (c_int64_t) len ++ end subroutine ++ ++ subroutine acc_copyin_array_h (a) ++ type (*), dimension (..), contiguous :: a ++ end subroutine ++ ++ subroutine acc_present_or_copyin_32_h (a, len) ++ use iso_c_binding, only: c_int32_t ++ type (*), dimension (*) :: a ++ integer (c_int32_t) len ++ end subroutine ++ ++ subroutine acc_present_or_copyin_64_h (a, len) ++ use iso_c_binding, only: c_int64_t ++ type (*), dimension (*) :: a ++ integer (c_int64_t) len ++ end subroutine ++ ++ subroutine acc_present_or_copyin_array_h (a) ++ type (*), dimension (..), contiguous :: a ++ end subroutine ++ ++ subroutine acc_create_32_h (a, len) ++ use iso_c_binding, only: c_int32_t ++ type (*), dimension (*) :: a ++ integer (c_int32_t) len ++ end subroutine ++ ++ subroutine acc_create_64_h (a, len) ++ use iso_c_binding, only: c_int64_t ++ type (*), dimension (*) :: a ++ integer (c_int64_t) len ++ end subroutine ++ ++ subroutine acc_create_array_h (a) ++ type (*), dimension (..), contiguous :: a ++ end subroutine ++ ++ subroutine acc_present_or_create_32_h (a, len) ++ use iso_c_binding, only: c_int32_t ++ type (*), dimension (*) :: a ++ integer (c_int32_t) len ++ end subroutine ++ ++ subroutine acc_present_or_create_64_h (a, len) ++ use iso_c_binding, only: c_int64_t ++ type (*), dimension (*) :: a ++ integer (c_int64_t) len ++ end subroutine ++ ++ subroutine acc_present_or_create_array_h (a) ++ type (*), dimension (..), contiguous :: a ++ end subroutine ++ ++ subroutine acc_copyout_32_h (a, len) ++ use iso_c_binding, only: c_int32_t ++ type (*), dimension (*) :: a ++ integer (c_int32_t) len ++ end subroutine ++ ++ subroutine acc_copyout_64_h (a, len) ++ use iso_c_binding, only: c_int64_t ++ type 
(*), dimension (*) :: a ++ integer (c_int64_t) len ++ end subroutine ++ ++ subroutine acc_copyout_array_h (a) ++ type (*), dimension (..), contiguous :: a ++ end subroutine ++ ++ subroutine acc_delete_32_h (a, len) ++ use iso_c_binding, only: c_int32_t ++ type (*), dimension (*) :: a ++ integer (c_int32_t) len ++ end subroutine ++ ++ subroutine acc_delete_64_h (a, len) ++ use iso_c_binding, only: c_int64_t ++ type (*), dimension (*) :: a ++ integer (c_int64_t) len ++ end subroutine ++ ++ subroutine acc_delete_array_h (a) ++ type (*), dimension (..), contiguous :: a ++ end subroutine ++ ++ subroutine acc_update_device_32_h (a, len) ++ use iso_c_binding, only: c_int32_t ++ type (*), dimension (*) :: a ++ integer (c_int32_t) len ++ end subroutine ++ ++ subroutine acc_update_device_64_h (a, len) ++ use iso_c_binding, only: c_int64_t ++ type (*), dimension (*) :: a ++ integer (c_int64_t) len ++ end subroutine ++ ++ subroutine acc_update_device_array_h (a) ++ type (*), dimension (..), contiguous :: a ++ end subroutine ++ ++ subroutine acc_update_self_32_h (a, len) ++ use iso_c_binding, only: c_int32_t ++ type (*), dimension (*) :: a ++ integer (c_int32_t) len ++ end subroutine ++ ++ subroutine acc_update_self_64_h (a, len) ++ use iso_c_binding, only: c_int64_t ++ type (*), dimension (*) :: a ++ integer (c_int64_t) len ++ end subroutine ++ ++ subroutine acc_update_self_array_h (a) ++ type (*), dimension (..), contiguous :: a ++ end subroutine ++ ++ function acc_is_present_32_h (a, len) ++ use iso_c_binding, only: c_int32_t ++ logical acc_is_present_32_h ++ type (*), dimension (*) :: a ++ integer (c_int32_t) len ++ end function ++ ++ function acc_is_present_64_h (a, len) ++ use iso_c_binding, only: c_int64_t ++ logical acc_is_present_64_h ++ type (*), dimension (*) :: a ++ integer (c_int64_t) len ++ end function ++ ++ function acc_is_present_array_h (a) ++ logical acc_is_present_array_h ++ type (*), dimension (..), contiguous :: a ++ end function ++ end interface ++ ++ interface ++ function acc_get_num_devices_l (d) & ++ bind (C, name = "acc_get_num_devices") ++ use iso_c_binding, only: c_int ++ integer (c_int) :: acc_get_num_devices_l ++ integer (c_int), value :: d ++ end function ++ ++ subroutine acc_set_device_type_l (d) & ++ bind (C, name = "acc_set_device_type") ++ use iso_c_binding, only: c_int ++ integer (c_int), value :: d ++ end subroutine ++ ++ function acc_get_device_type_l () & ++ bind (C, name = "acc_get_device_type") ++ use iso_c_binding, only: c_int ++ integer (c_int) :: acc_get_device_type_l ++ end function ++ ++ subroutine acc_set_device_num_l (n, d) & ++ bind (C, name = "acc_set_device_num") ++ use iso_c_binding, only: c_int ++ integer (c_int), value :: n, d ++ end subroutine ++ ++ function acc_get_device_num_l (d) & ++ bind (C, name = "acc_get_device_num") ++ use iso_c_binding, only: c_int ++ integer (c_int) :: acc_get_device_num_l ++ integer (c_int), value :: d ++ end function ++ ++ function acc_async_test_l (a) & ++ bind (C, name = "acc_async_test") ++ use iso_c_binding, only: c_int ++ integer (c_int) :: acc_async_test_l ++ integer (c_int), value :: a ++ end function ++ ++ function acc_async_test_all_l () & ++ bind (C, name = "acc_async_test_all") ++ use iso_c_binding, only: c_int ++ integer (c_int) :: acc_async_test_all_l ++ end function ++ ++ subroutine acc_wait_l (a) & ++ bind (C, name = "acc_wait") ++ use iso_c_binding, only: c_int ++ integer (c_int), value :: a ++ end subroutine ++ ++ subroutine acc_wait_async_l (a1, a2) & ++ bind (C, name = "acc_wait_async") ++ use 
iso_c_binding, only: c_int ++ integer (c_int), value :: a1, a2 ++ end subroutine ++ ++ subroutine acc_wait_all_l () & ++ bind (C, name = "acc_wait_all") ++ use iso_c_binding, only: c_int ++ end subroutine ++ ++ subroutine acc_wait_all_async_l (a) & ++ bind (C, name = "acc_wait_all_async") ++ use iso_c_binding, only: c_int ++ integer (c_int), value :: a ++ end subroutine ++ ++ subroutine acc_init_l (d) & ++ bind (C, name = "acc_init") ++ use iso_c_binding, only: c_int ++ integer (c_int), value :: d ++ end subroutine ++ ++ subroutine acc_shutdown_l (d) & ++ bind (C, name = "acc_shutdown") ++ use iso_c_binding, only: c_int ++ integer (c_int), value :: d ++ end subroutine ++ ++ function acc_on_device_l (d) & ++ bind (C, name = "acc_on_device") ++ use iso_c_binding, only: c_int ++ integer (c_int) :: acc_on_device_l ++ integer (c_int), value :: d ++ end function ++ ++ subroutine acc_copyin_l (a, len) & ++ bind (C, name = "acc_copyin") ++ use iso_c_binding, only: c_size_t ++ type (*), dimension (*) :: a ++ integer (c_size_t), value :: len ++ end subroutine ++ ++ subroutine acc_present_or_copyin_l (a, len) & ++ bind (C, name = "acc_present_or_copyin") ++ use iso_c_binding, only: c_size_t ++ type (*), dimension (*) :: a ++ integer (c_size_t), value :: len ++ end subroutine ++ ++ subroutine acc_create_l (a, len) & ++ bind (C, name = "acc_create") ++ use iso_c_binding, only: c_size_t ++ type (*), dimension (*) :: a ++ integer (c_size_t), value :: len ++ end subroutine ++ ++ subroutine acc_present_or_create_l (a, len) & ++ bind (C, name = "acc_present_or_create") ++ use iso_c_binding, only: c_size_t ++ type (*), dimension (*) :: a ++ integer (c_size_t), value :: len ++ end subroutine ++ ++ subroutine acc_copyout_l (a, len) & ++ bind (C, name = "acc_copyout") ++ use iso_c_binding, only: c_size_t ++ type (*), dimension (*) :: a ++ integer (c_size_t), value :: len ++ end subroutine ++ ++ subroutine acc_delete_l (a, len) & ++ bind (C, name = "acc_delete") ++ use iso_c_binding, only: c_size_t ++ type (*), dimension (*) :: a ++ integer (c_size_t), value :: len ++ end subroutine ++ ++ subroutine acc_update_device_l (a, len) & ++ bind (C, name = "acc_update_device") ++ use iso_c_binding, only: c_size_t ++ type (*), dimension (*) :: a ++ integer (c_size_t), value :: len ++ end subroutine ++ ++ subroutine acc_update_self_l (a, len) & ++ bind (C, name = "acc_update_self") ++ use iso_c_binding, only: c_size_t ++ type (*), dimension (*) :: a ++ integer (c_size_t), value :: len ++ end subroutine ++ ++ function acc_is_present_l (a, len) & ++ bind (C, name = "acc_is_present") ++ use iso_c_binding, only: c_int32_t, c_size_t ++ integer (c_int32_t) :: acc_is_present_l ++ type (*), dimension (*) :: a ++ integer (c_size_t), value :: len ++ end function ++ end interface ++end module ++ ++module openacc ++ use openacc_kinds ++ use openacc_internal ++ implicit none ++ ++ public :: openacc_version ++ ++ public :: acc_get_num_devices, acc_set_device_type, acc_get_device_type ++ public :: acc_set_device_num, acc_get_device_num, acc_async_test ++ public :: acc_async_test_all, acc_wait, acc_wait_async, acc_wait_all ++ public :: acc_wait_all_async, acc_init, acc_shutdown, acc_on_device ++ public :: acc_copyin, acc_present_or_copyin, acc_pcopyin, acc_create ++ public :: acc_present_or_create, acc_pcreate, acc_copyout, acc_delete ++ public :: acc_update_device, acc_update_self, acc_is_present ++ ++ integer, parameter :: openacc_version = 201306 ++ ++ interface acc_get_num_devices ++ procedure :: acc_get_num_devices_h ++ end 
interface ++ ++ interface acc_set_device_type ++ procedure :: acc_set_device_type_h ++ end interface ++ ++ interface acc_get_device_type ++ procedure :: acc_get_device_type_h ++ end interface ++ ++ interface acc_set_device_num ++ procedure :: acc_set_device_num_h ++ end interface ++ ++ interface acc_get_device_num ++ procedure :: acc_get_device_num_h ++ end interface ++ ++ interface acc_async_test ++ procedure :: acc_async_test_h ++ end interface ++ ++ interface acc_async_test_all ++ procedure :: acc_async_test_all_h ++ end interface ++ ++ interface acc_wait ++ procedure :: acc_wait_h ++ end interface ++ ++ interface acc_wait_async ++ procedure :: acc_wait_async_h ++ end interface ++ ++ interface acc_wait_all ++ procedure :: acc_wait_all_h ++ end interface ++ ++ interface acc_wait_all_async ++ procedure :: acc_wait_all_async_h ++ end interface ++ ++ interface acc_init ++ procedure :: acc_init_h ++ end interface ++ ++ interface acc_shutdown ++ procedure :: acc_shutdown_h ++ end interface ++ ++ interface acc_on_device ++ procedure :: acc_on_device_h ++ end interface ++ ++ ! acc_malloc: Only available in C/C++ ++ ! acc_free: Only available in C/C++ ++ ++ ! As vendor extension, the following code supports both 32bit and 64bit ++ ! arguments for "size"; the OpenACC standard only permits default-kind ++ ! integers, which are of kind 4 (i.e. 32 bits). ++ ! Additionally, the two-argument version also takes arrays as argument. ++ ! and the one argument version also scalars. Note that the code assumes ++ ! that the arrays are contiguous. ++ ++ interface acc_copyin ++ procedure :: acc_copyin_32_h ++ procedure :: acc_copyin_64_h ++ procedure :: acc_copyin_array_h ++ end interface ++ ++ interface acc_present_or_copyin ++ procedure :: acc_present_or_copyin_32_h ++ procedure :: acc_present_or_copyin_64_h ++ procedure :: acc_present_or_copyin_array_h ++ end interface ++ ++ interface acc_pcopyin ++ procedure :: acc_present_or_copyin_32_h ++ procedure :: acc_present_or_copyin_64_h ++ procedure :: acc_present_or_copyin_array_h ++ end interface ++ ++ interface acc_create ++ procedure :: acc_create_32_h ++ procedure :: acc_create_64_h ++ procedure :: acc_create_array_h ++ end interface ++ ++ interface acc_present_or_create ++ procedure :: acc_present_or_create_32_h ++ procedure :: acc_present_or_create_64_h ++ procedure :: acc_present_or_create_array_h ++ end interface ++ ++ interface acc_pcreate ++ procedure :: acc_present_or_create_32_h ++ procedure :: acc_present_or_create_64_h ++ procedure :: acc_present_or_create_array_h ++ end interface ++ ++ interface acc_copyout ++ procedure :: acc_copyout_32_h ++ procedure :: acc_copyout_64_h ++ procedure :: acc_copyout_array_h ++ end interface ++ ++ interface acc_delete ++ procedure :: acc_delete_32_h ++ procedure :: acc_delete_64_h ++ procedure :: acc_delete_array_h ++ end interface ++ ++ interface acc_update_device ++ procedure :: acc_update_device_32_h ++ procedure :: acc_update_device_64_h ++ procedure :: acc_update_device_array_h ++ end interface ++ ++ interface acc_update_self ++ procedure :: acc_update_self_32_h ++ procedure :: acc_update_self_64_h ++ procedure :: acc_update_self_array_h ++ end interface ++ ++ ! acc_map_data: Only available in C/C++ ++ ! acc_unmap_data: Only available in C/C++ ++ ! acc_deviceptr: Only available in C/C++ ++ ! acc_hostptr: Only available in C/C++ ++ ++ interface acc_is_present ++ procedure :: acc_is_present_32_h ++ procedure :: acc_is_present_64_h ++ procedure :: acc_is_present_array_h ++ end interface ++ ++ ! 
acc_memcpy_to_device: Only available in C/C++ ++ ! acc_memcpy_from_device: Only available in C/C++ ++ ++end module ++ ++function acc_get_num_devices_h (d) ++ use openacc_internal, only: acc_get_num_devices_l ++ use openacc_kinds ++ integer acc_get_num_devices_h ++ integer (acc_device_kind) d ++ acc_get_num_devices_h = acc_get_num_devices_l (d) ++end function ++ ++subroutine acc_set_device_type_h (d) ++ use openacc_internal, only: acc_set_device_type_l ++ use openacc_kinds ++ integer (acc_device_kind) d ++ call acc_set_device_type_l (d) ++end subroutine ++ ++function acc_get_device_type_h () ++ use openacc_internal, only: acc_get_device_type_l ++ use openacc_kinds ++ integer (acc_device_kind) acc_get_device_type_h ++ acc_get_device_type_h = acc_get_device_type_l () ++end function ++ ++subroutine acc_set_device_num_h (n, d) ++ use openacc_internal, only: acc_set_device_num_l ++ use openacc_kinds ++ integer n ++ integer (acc_device_kind) d ++ call acc_set_device_num_l (n, d) ++end subroutine ++ ++function acc_get_device_num_h (d) ++ use openacc_internal, only: acc_get_device_num_l ++ use openacc_kinds ++ integer acc_get_device_num_h ++ integer (acc_device_kind) d ++ acc_get_device_num_h = acc_get_device_num_l (d) ++end function ++ ++function acc_async_test_h (a) ++ use openacc_internal, only: acc_async_test_l ++ logical acc_async_test_h ++ integer a ++ if (acc_async_test_l (a) .eq. 1) then ++ acc_async_test_h = .TRUE. ++ else ++ acc_async_test_h = .FALSE. ++ end if ++end function ++ ++function acc_async_test_all_h () ++ use openacc_internal, only: acc_async_test_all_l ++ logical acc_async_test_all_h ++ if (acc_async_test_all_l () .eq. 1) then ++ acc_async_test_all_h = .TRUE. ++ else ++ acc_async_test_all_h = .FALSE. ++ end if ++end function ++ ++subroutine acc_wait_h (a) ++ use openacc_internal, only: acc_wait_l ++ integer a ++ call acc_wait_l (a) ++end subroutine ++ ++subroutine acc_wait_async_h (a1, a2) ++ use openacc_internal, only: acc_wait_async_l ++ integer a1, a2 ++ call acc_wait_async_l (a1, a2) ++end subroutine ++ ++subroutine acc_wait_all_h () ++ use openacc_internal, only: acc_wait_all_l ++ call acc_wait_all_l () ++end subroutine ++ ++subroutine acc_wait_all_async_h (a) ++ use openacc_internal, only: acc_wait_all_async_l ++ integer a ++ call acc_wait_all_async_l (a) ++end subroutine ++ ++subroutine acc_init_h (d) ++ use openacc_internal, only: acc_init_l ++ use openacc_kinds ++ integer (acc_device_kind) d ++ call acc_init_l (d) ++end subroutine ++ ++subroutine acc_shutdown_h (d) ++ use openacc_internal, only: acc_shutdown_l ++ use openacc_kinds ++ integer (acc_device_kind) d ++ call acc_shutdown_l (d) ++end subroutine ++ ++function acc_on_device_h (d) ++ use openacc_internal, only: acc_on_device_l ++ use openacc_kinds ++ integer (acc_device_kind) d ++ logical acc_on_device_h ++ if (acc_on_device_l (d) .eq. 1) then ++ acc_on_device_h = .TRUE. ++ else ++ acc_on_device_h = .FALSE. 
++ end if ++end function ++ ++subroutine acc_copyin_32_h (a, len) ++ use iso_c_binding, only: c_int32_t, c_size_t ++ use openacc_internal, only: acc_copyin_l ++ type (*), dimension (*) :: a ++ integer (c_int32_t) len ++ call acc_copyin_l (a, int (len, kind = c_size_t)) ++end subroutine ++ ++subroutine acc_copyin_64_h (a, len) ++ use iso_c_binding, only: c_int64_t, c_size_t ++ use openacc_internal, only: acc_copyin_l ++ type (*), dimension (*) :: a ++ integer (c_int64_t) len ++ call acc_copyin_l (a, int (len, kind = c_size_t)) ++end subroutine ++ ++subroutine acc_copyin_array_h (a) ++ use openacc_internal, only: acc_copyin_l ++ type (*), dimension (..), contiguous :: a ++ call acc_copyin_l (a, sizeof (a)) ++end subroutine ++ ++subroutine acc_present_or_copyin_32_h (a, len) ++ use iso_c_binding, only: c_int32_t, c_size_t ++ use openacc_internal, only: acc_present_or_copyin_l ++ type (*), dimension (*) :: a ++ integer (c_int32_t) len ++ call acc_present_or_copyin_l (a, int (len, kind = c_size_t)) ++end subroutine ++ ++subroutine acc_present_or_copyin_64_h (a, len) ++ use iso_c_binding, only: c_int64_t, c_size_t ++ use openacc_internal, only: acc_present_or_copyin_l ++ type (*), dimension (*) :: a ++ integer (c_int64_t) len ++ call acc_present_or_copyin_l (a, int (len, kind = c_size_t)) ++end subroutine ++ ++subroutine acc_present_or_copyin_array_h (a) ++ use openacc_internal, only: acc_present_or_copyin_l ++ type (*), dimension (..), contiguous :: a ++ call acc_present_or_copyin_l (a, sizeof (a)) ++end subroutine ++ ++subroutine acc_create_32_h (a, len) ++ use iso_c_binding, only: c_int32_t, c_size_t ++ use openacc_internal, only: acc_create_l ++ type (*), dimension (*) :: a ++ integer (c_int32_t) len ++ call acc_create_l (a, int (len, kind = c_size_t)) ++end subroutine ++ ++subroutine acc_create_64_h (a, len) ++ use iso_c_binding, only: c_int64_t, c_size_t ++ use openacc_internal, only: acc_create_l ++ type (*), dimension (*) :: a ++ integer (c_int64_t) len ++ call acc_create_l (a, int (len, kind = c_size_t)) ++end subroutine ++ ++subroutine acc_create_array_h (a) ++ use openacc_internal, only: acc_create_l ++ type (*), dimension (..), contiguous :: a ++ call acc_create_l (a, sizeof (a)) ++end subroutine ++ ++subroutine acc_present_or_create_32_h (a, len) ++ use iso_c_binding, only: c_int32_t, c_size_t ++ use openacc_internal, only: acc_present_or_create_l ++ type (*), dimension (*) :: a ++ integer (c_int32_t) len ++ call acc_present_or_create_l (a, int (len, kind = c_size_t)) ++end subroutine ++ ++subroutine acc_present_or_create_64_h (a, len) ++ use iso_c_binding, only: c_int64_t, c_size_t ++ use openacc_internal, only: acc_present_or_create_l ++ type (*), dimension (*) :: a ++ integer (c_int64_t) len ++ call acc_present_or_create_l (a, int (len, kind = c_size_t)) ++end subroutine ++ ++subroutine acc_present_or_create_array_h (a) ++ use openacc_internal, only: acc_present_or_create_l ++ type (*), dimension (..), contiguous :: a ++ call acc_present_or_create_l (a, sizeof (a)) ++end subroutine ++ ++subroutine acc_copyout_32_h (a, len) ++ use iso_c_binding, only: c_int32_t, c_size_t ++ use openacc_internal, only: acc_copyout_l ++ type (*), dimension (*) :: a ++ integer (c_int32_t) len ++ call acc_copyout_l (a, int (len, kind = c_size_t)) ++end subroutine ++ ++subroutine acc_copyout_64_h (a, len) ++ use iso_c_binding, only: c_int64_t, c_size_t ++ use openacc_internal, only: acc_copyout_l ++ type (*), dimension (*) :: a ++ integer (c_int64_t) len ++ call acc_copyout_l (a, int (len, kind = 
c_size_t))
++end subroutine
++
++subroutine acc_copyout_array_h (a)
++  use openacc_internal, only: acc_copyout_l
++  type (*), dimension (..), contiguous :: a
++  call acc_copyout_l (a, sizeof (a))
++end subroutine
++
++subroutine acc_delete_32_h (a, len)
++  use iso_c_binding, only: c_int32_t, c_size_t
++  use openacc_internal, only: acc_delete_l
++  type (*), dimension (*) :: a
++  integer (c_int32_t) len
++  call acc_delete_l (a, int (len, kind = c_size_t))
++end subroutine
++
++subroutine acc_delete_64_h (a, len)
++  use iso_c_binding, only: c_int64_t, c_size_t
++  use openacc_internal, only: acc_delete_l
++  type (*), dimension (*) :: a
++  integer (c_int64_t) len
++  call acc_delete_l (a, int (len, kind = c_size_t))
++end subroutine
++
++subroutine acc_delete_array_h (a)
++  use openacc_internal, only: acc_delete_l
++  type (*), dimension (..), contiguous :: a
++  call acc_delete_l (a, sizeof (a))
++end subroutine
++
++subroutine acc_update_device_32_h (a, len)
++  use iso_c_binding, only: c_int32_t, c_size_t
++  use openacc_internal, only: acc_update_device_l
++  type (*), dimension (*) :: a
++  integer (c_int32_t) len
++  call acc_update_device_l (a, int (len, kind = c_size_t))
++end subroutine
++
++subroutine acc_update_device_64_h (a, len)
++  use iso_c_binding, only: c_int64_t, c_size_t
++  use openacc_internal, only: acc_update_device_l
++  type (*), dimension (*) :: a
++  integer (c_int64_t) len
++  call acc_update_device_l (a, int (len, kind = c_size_t))
++end subroutine
++
++subroutine acc_update_device_array_h (a)
++  use openacc_internal, only: acc_update_device_l
++  type (*), dimension (..), contiguous :: a
++  call acc_update_device_l (a, sizeof (a))
++end subroutine
++
++subroutine acc_update_self_32_h (a, len)
++  use iso_c_binding, only: c_int32_t, c_size_t
++  use openacc_internal, only: acc_update_self_l
++  type (*), dimension (*) :: a
++  integer (c_int32_t) len
++  call acc_update_self_l (a, int (len, kind = c_size_t))
++end subroutine
++
++subroutine acc_update_self_64_h (a, len)
++  use iso_c_binding, only: c_int64_t, c_size_t
++  use openacc_internal, only: acc_update_self_l
++  type (*), dimension (*) :: a
++  integer (c_int64_t) len
++  call acc_update_self_l (a, int (len, kind = c_size_t))
++end subroutine
++
++subroutine acc_update_self_array_h (a)
++  use openacc_internal, only: acc_update_self_l
++  type (*), dimension (..), contiguous :: a
++  call acc_update_self_l (a, sizeof (a))
++end subroutine
++
++function acc_is_present_32_h (a, len)
++  use iso_c_binding, only: c_int32_t, c_size_t
++  use openacc_internal, only: acc_is_present_l
++  logical acc_is_present_32_h
++  type (*), dimension (*) :: a
++  integer (c_int32_t) len
++  if (acc_is_present_l (a, int (len, kind = c_size_t)) .eq. 1) then
++    acc_is_present_32_h = .TRUE.
++  else
++    acc_is_present_32_h = .FALSE.
++  end if
++end function
++
++function acc_is_present_64_h (a, len)
++  use iso_c_binding, only: c_int64_t, c_size_t
++  use openacc_internal, only: acc_is_present_l
++  logical acc_is_present_64_h
++  type (*), dimension (*) :: a
++  integer (c_int64_t) len
++  if (acc_is_present_l (a, int (len, kind = c_size_t)) .eq. 1) then
++    acc_is_present_64_h = .TRUE.
++  else
++    acc_is_present_64_h = .FALSE.
++  end if
++end function
++
++function acc_is_present_array_h (a)
++  use openacc_internal, only: acc_is_present_l
++  logical acc_is_present_array_h
++  type (*), dimension (..), contiguous :: a
++  acc_is_present_array_h = acc_is_present_l (a, sizeof (a)) == 1
++end function
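The openacc.f90 wrappers above all follow one pattern: a generic interface (acc_copyin, acc_copyout, acc_is_present, ...) dispatches on argument count and integer kind to a thin *_h helper, which widens the byte count to c_size_t (or derives it via sizeof for the assumed-rank array overloads) and forwards to the bind(C) *_l binding of the C entry point. A minimal caller-side sketch of these interfaces follows; it assumes an OpenACC-enabled gfortran, this openacc module, and a target where default integers are 32-bit (so that a default-kind length resolves to the _32_h overload). The program name and data are illustrative, not part of the patch:

    program acc_demo
      use openacc
      implicit none
      integer, parameter :: n = 1024
      real :: a(n)
      a = 1.0
      ! One-argument array form: resolves to acc_copyin_array_h, which
      ! computes the transfer length itself via sizeof (a).
      call acc_copyin (a)
      if (acc_is_present (a)) print *, 'a is mapped on the device'
      ! Two-argument form: a default-kind (assumed 32-bit) byte count
      ! resolves to acc_copyout_32_h and is widened to c_size_t there.
      call acc_copyout (a, n * storage_size (a(1)) / 8)
    end program

Per the vendor-extension comment in the module, the _64_h overloads exist alongside these so that byte counts computed in integer(8) arithmetic can be passed without a narrowing conversion at the call site.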