Diffstat (limited to 'gcc48-libgomp-20160715.patch')
-rw-r--r-- gcc48-libgomp-20160715.patch | 10653
1 file changed, 10653 insertions, 0 deletions
diff --git a/gcc48-libgomp-20160715.patch b/gcc48-libgomp-20160715.patch
new file mode 100644
index 0000000..9b6a61e
--- /dev/null
+++ b/gcc48-libgomp-20160715.patch
@@ -0,0 +1,10653 @@
+--- libgomp/config/linux/wait.h.jj 2013-01-31 20:29:10.091548989 +0100
++++ libgomp/config/linux/wait.h 2016-07-13 16:57:18.902355979 +0200
+@@ -34,13 +34,13 @@
+
+ #define FUTEX_WAIT 0
+ #define FUTEX_WAKE 1
+-#define FUTEX_PRIVATE_FLAG 128L
++#define FUTEX_PRIVATE_FLAG 128
+
+ #ifdef HAVE_ATTRIBUTE_VISIBILITY
+ # pragma GCC visibility push(hidden)
+ #endif
+
+-extern long int gomp_futex_wait, gomp_futex_wake;
++extern int gomp_futex_wait, gomp_futex_wake;
+
+ #include <futex.h>
+
+@@ -48,7 +48,9 @@ static inline int do_spin (int *addr, in
+ {
+ unsigned long long i, count = gomp_spin_count_var;
+
+- if (__builtin_expect (gomp_managed_threads > gomp_available_cpus, 0))
++ if (__builtin_expect (__atomic_load_n (&gomp_managed_threads,
++ MEMMODEL_RELAXED)
++ > gomp_available_cpus, 0))
+ count = gomp_throttled_spin_count_var;
+ for (i = 0; i < count; i++)
+ if (__builtin_expect (__atomic_load_n (addr, MEMMODEL_RELAXED) != val, 0))
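The do_spin change above replaces a plain read of gomp_managed_threads with a relaxed atomic load, so the oversubscription check in the busy-wait loop is a race-free access. A minimal self-contained sketch of the same spin-then-sleep pattern, using C11 atomics and a made-up name (spin_sketch), not the libgomp internals:

#include <stdatomic.h>

/* Spin until *addr != val or the iteration budget runs out.
   Relaxed loads suffice: the value is only a hint, and the caller
   falls back to a futex wait when the budget is exhausted.  */
static int
spin_sketch (atomic_int *addr, int val, unsigned long long count)
{
  for (unsigned long long i = 0; i < count; i++)
    if (atomic_load_explicit (addr, memory_order_relaxed) != val)
      return 0;   /* value changed, no kernel wait needed */
  return 1;       /* budget exhausted, caller should sleep */
}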
+--- libgomp/config/linux/affinity.c.jj 2014-05-15 10:56:37.499502573 +0200
++++ libgomp/config/linux/affinity.c 2016-07-13 16:57:18.902355979 +0200
+@@ -352,6 +352,45 @@ gomp_affinity_print_place (void *p)
+ fprintf (stderr, ":%lu", len);
+ }
+
++int
++omp_get_place_num_procs (int place_num)
++{
++ if (place_num < 0 || place_num >= gomp_places_list_len)
++ return 0;
++
++ cpu_set_t *cpusetp = (cpu_set_t *) gomp_places_list[place_num];
++ return gomp_cpuset_popcount (gomp_cpuset_size, cpusetp);
++}
++
++void
++omp_get_place_proc_ids (int place_num, int *ids)
++{
++ if (place_num < 0 || place_num >= gomp_places_list_len)
++ return;
++
++ cpu_set_t *cpusetp = (cpu_set_t *) gomp_places_list[place_num];
++ unsigned long i, max = 8 * gomp_cpuset_size;
++ for (i = 0; i < max; i++)
++ if (CPU_ISSET_S (i, gomp_cpuset_size, cpusetp))
++ *ids++ = i;
++}
++
++void
++gomp_get_place_proc_ids_8 (int place_num, int64_t *ids)
++{
++ if (place_num < 0 || place_num >= gomp_places_list_len)
++ return;
++
++ cpu_set_t *cpusetp = (cpu_set_t *) gomp_places_list[place_num];
++ unsigned long i, max = 8 * gomp_cpuset_size;
++ for (i = 0; i < max; i++)
++ if (CPU_ISSET_S (i, gomp_cpuset_size, cpusetp))
++ *ids++ = i;
++}
++
++ialias(omp_get_place_num_procs)
++ialias(omp_get_place_proc_ids)
++
+ #else
+
+ #include "../posix/affinity.c"
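The functions added above implement the OpenMP 4.5 places API for the Linux affinity backend. Together with omp_get_num_places (exported elsewhere in this patch), they can be exercised from user code; a small sketch, assuming the program is built with -fopenmp and run with a non-empty place list such as OMP_PLACES=cores:

#include <omp.h>
#include <stdio.h>
#include <stdlib.h>

int
main (void)
{
  int nplaces = omp_get_num_places ();
  for (int p = 0; p < nplaces; p++)
    {
      int nprocs = omp_get_place_num_procs (p);
      int *ids = malloc (nprocs * sizeof (int));
      omp_get_place_proc_ids (p, ids);
      printf ("place %d:", p);
      for (int i = 0; i < nprocs; i++)
        printf (" %d", ids[i]);
      printf ("\n");
      free (ids);
    }
  return 0;
}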
+--- libgomp/config/linux/mutex.c.jj 2013-01-21 16:00:38.220917670 +0100
++++ libgomp/config/linux/mutex.c 2016-07-13 16:57:18.870356375 +0200
+@@ -28,8 +28,8 @@
+
+ #include "wait.h"
+
+-long int gomp_futex_wake = FUTEX_WAKE | FUTEX_PRIVATE_FLAG;
+-long int gomp_futex_wait = FUTEX_WAIT | FUTEX_PRIVATE_FLAG;
++int gomp_futex_wake = FUTEX_WAKE | FUTEX_PRIVATE_FLAG;
++int gomp_futex_wait = FUTEX_WAIT | FUTEX_PRIVATE_FLAG;
+
+ void
+ gomp_mutex_lock_slow (gomp_mutex_t *mutex, int oldval)
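The long-to-int change above (and in wait.h) matters because a futex word is always 32 bits: the kernel compares exactly four bytes at the supplied address, so both the word and the operation codes belong in an int. A sketch of the underlying call, assuming Linux and raw syscall(2); futex_wait_sketch is a made-up name, not a libgomp function:

#include <linux/futex.h>
#include <sys/syscall.h>
#include <unistd.h>

/* Block until *uaddr differs from val or a wake-up arrives.  The
   kernel reads the futex word as a 32-bit int, which is why the
   word and the gomp_futex_* operation codes must be int, not long.  */
static long
futex_wait_sketch (int *uaddr, int val)
{
  return syscall (SYS_futex, uaddr, FUTEX_WAIT | FUTEX_PRIVATE_FLAG,
                  val, (void *) 0, (void *) 0, 0);
}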
+--- libgomp/config/posix/affinity.c.jj 2014-05-15 10:56:37.987498844 +0200
++++ libgomp/config/posix/affinity.c 2016-07-15 12:08:28.410015743 +0200
+@@ -113,3 +113,27 @@ gomp_affinity_print_place (void *p)
+ {
+ (void) p;
+ }
++
++int
++omp_get_place_num_procs (int place_num)
++{
++ (void) place_num;
++ return 0;
++}
++
++void
++omp_get_place_proc_ids (int place_num, int *ids)
++{
++ (void) place_num;
++ (void) ids;
++}
++
++void
++gomp_get_place_proc_ids_8 (int place_num, int64_t *ids)
++{
++ (void) place_num;
++ (void) ids;
++}
++
++ialias(omp_get_place_num_procs)
++ialias(omp_get_place_proc_ids)
+--- libgomp/loop_ull.c.jj 2013-01-21 16:00:46.477871806 +0100
++++ libgomp/loop_ull.c 2016-07-13 16:57:18.918355780 +0200
+@@ -174,15 +174,15 @@ GOMP_loop_ull_runtime_start (bool up, go
+ {
+ case GFS_STATIC:
+ return gomp_loop_ull_static_start (up, start, end, incr,
+- icv->run_sched_modifier,
++ icv->run_sched_chunk_size,
+ istart, iend);
+ case GFS_DYNAMIC:
+ return gomp_loop_ull_dynamic_start (up, start, end, incr,
+- icv->run_sched_modifier,
++ icv->run_sched_chunk_size,
+ istart, iend);
+ case GFS_GUIDED:
+ return gomp_loop_ull_guided_start (up, start, end, incr,
+- icv->run_sched_modifier,
++ icv->run_sched_chunk_size,
+ istart, iend);
+ case GFS_AUTO:
+ /* For now map to schedule(static), later on we could play with feedback
+@@ -278,15 +278,15 @@ GOMP_loop_ull_ordered_runtime_start (boo
+ {
+ case GFS_STATIC:
+ return gomp_loop_ull_ordered_static_start (up, start, end, incr,
+- icv->run_sched_modifier,
++ icv->run_sched_chunk_size,
+ istart, iend);
+ case GFS_DYNAMIC:
+ return gomp_loop_ull_ordered_dynamic_start (up, start, end, incr,
+- icv->run_sched_modifier,
++ icv->run_sched_chunk_size,
+ istart, iend);
+ case GFS_GUIDED:
+ return gomp_loop_ull_ordered_guided_start (up, start, end, incr,
+- icv->run_sched_modifier,
++ icv->run_sched_chunk_size,
+ istart, iend);
+ case GFS_AUTO:
+ /* For now map to schedule(static), later on we could play with feedback
+@@ -298,6 +298,114 @@ GOMP_loop_ull_ordered_runtime_start (boo
+ }
+ }
+
++/* The *_doacross_*_start routines are similar. The only difference is that
++ this work-share construct is initialized to expect an ORDERED(N) - DOACROSS
++ section, and the worksharing loop always iterates from 0 to COUNTS[0] - 1,
++ while the other COUNTS array elements tell the library the number of
++ iterations in the ordered inner loops. */
++
++static bool
++gomp_loop_ull_doacross_static_start (unsigned ncounts, gomp_ull *counts,
++ gomp_ull chunk_size, gomp_ull *istart,
++ gomp_ull *iend)
++{
++ struct gomp_thread *thr = gomp_thread ();
++
++ thr->ts.static_trip = 0;
++ if (gomp_work_share_start (false))
++ {
++ gomp_loop_ull_init (thr->ts.work_share, true, 0, counts[0], 1,
++ GFS_STATIC, chunk_size);
++ gomp_doacross_ull_init (ncounts, counts, chunk_size);
++ gomp_work_share_init_done ();
++ }
++
++ return !gomp_iter_ull_static_next (istart, iend);
++}
++
++static bool
++gomp_loop_ull_doacross_dynamic_start (unsigned ncounts, gomp_ull *counts,
++ gomp_ull chunk_size, gomp_ull *istart,
++ gomp_ull *iend)
++{
++ struct gomp_thread *thr = gomp_thread ();
++ bool ret;
++
++ if (gomp_work_share_start (false))
++ {
++ gomp_loop_ull_init (thr->ts.work_share, true, 0, counts[0], 1,
++ GFS_DYNAMIC, chunk_size);
++ gomp_doacross_ull_init (ncounts, counts, chunk_size);
++ gomp_work_share_init_done ();
++ }
++
++#if defined HAVE_SYNC_BUILTINS && defined __LP64__
++ ret = gomp_iter_ull_dynamic_next (istart, iend);
++#else
++ gomp_mutex_lock (&thr->ts.work_share->lock);
++ ret = gomp_iter_ull_dynamic_next_locked (istart, iend);
++ gomp_mutex_unlock (&thr->ts.work_share->lock);
++#endif
++
++ return ret;
++}
++
++static bool
++gomp_loop_ull_doacross_guided_start (unsigned ncounts, gomp_ull *counts,
++ gomp_ull chunk_size, gomp_ull *istart,
++ gomp_ull *iend)
++{
++ struct gomp_thread *thr = gomp_thread ();
++ bool ret;
++
++ if (gomp_work_share_start (false))
++ {
++ gomp_loop_ull_init (thr->ts.work_share, true, 0, counts[0], 1,
++ GFS_GUIDED, chunk_size);
++ gomp_doacross_ull_init (ncounts, counts, chunk_size);
++ gomp_work_share_init_done ();
++ }
++
++#if defined HAVE_SYNC_BUILTINS && defined __LP64__
++ ret = gomp_iter_ull_guided_next (istart, iend);
++#else
++ gomp_mutex_lock (&thr->ts.work_share->lock);
++ ret = gomp_iter_ull_guided_next_locked (istart, iend);
++ gomp_mutex_unlock (&thr->ts.work_share->lock);
++#endif
++
++ return ret;
++}
++
++bool
++GOMP_loop_ull_doacross_runtime_start (unsigned ncounts, gomp_ull *counts,
++ gomp_ull *istart, gomp_ull *iend)
++{
++ struct gomp_task_icv *icv = gomp_icv (false);
++ switch (icv->run_sched_var)
++ {
++ case GFS_STATIC:
++ return gomp_loop_ull_doacross_static_start (ncounts, counts,
++ icv->run_sched_chunk_size,
++ istart, iend);
++ case GFS_DYNAMIC:
++ return gomp_loop_ull_doacross_dynamic_start (ncounts, counts,
++ icv->run_sched_chunk_size,
++ istart, iend);
++ case GFS_GUIDED:
++ return gomp_loop_ull_doacross_guided_start (ncounts, counts,
++ icv->run_sched_chunk_size,
++ istart, iend);
++ case GFS_AUTO:
++ /* For now map to schedule(static), later on we could play with feedback
++ driven choice. */
++ return gomp_loop_ull_doacross_static_start (ncounts, counts,
++ 0, istart, iend);
++ default:
++ abort ();
++ }
++}
++
+ /* The *_next routines are called when the thread completes processing of
+ the iteration block currently assigned to it. If the work-share
+ construct is bound directly to a parallel construct, then the iteration
+@@ -457,6 +565,10 @@ extern __typeof(gomp_loop_ull_dynamic_st
+ __attribute__((alias ("gomp_loop_ull_dynamic_start")));
+ extern __typeof(gomp_loop_ull_guided_start) GOMP_loop_ull_guided_start
+ __attribute__((alias ("gomp_loop_ull_guided_start")));
++extern __typeof(gomp_loop_ull_dynamic_start) GOMP_loop_ull_nonmonotonic_dynamic_start
++ __attribute__((alias ("gomp_loop_ull_dynamic_start")));
++extern __typeof(gomp_loop_ull_guided_start) GOMP_loop_ull_nonmonotonic_guided_start
++ __attribute__((alias ("gomp_loop_ull_guided_start")));
+
+ extern __typeof(gomp_loop_ull_ordered_static_start) GOMP_loop_ull_ordered_static_start
+ __attribute__((alias ("gomp_loop_ull_ordered_static_start")));
+@@ -465,12 +577,23 @@ extern __typeof(gomp_loop_ull_ordered_dy
+ extern __typeof(gomp_loop_ull_ordered_guided_start) GOMP_loop_ull_ordered_guided_start
+ __attribute__((alias ("gomp_loop_ull_ordered_guided_start")));
+
++extern __typeof(gomp_loop_ull_doacross_static_start) GOMP_loop_ull_doacross_static_start
++ __attribute__((alias ("gomp_loop_ull_doacross_static_start")));
++extern __typeof(gomp_loop_ull_doacross_dynamic_start) GOMP_loop_ull_doacross_dynamic_start
++ __attribute__((alias ("gomp_loop_ull_doacross_dynamic_start")));
++extern __typeof(gomp_loop_ull_doacross_guided_start) GOMP_loop_ull_doacross_guided_start
++ __attribute__((alias ("gomp_loop_ull_doacross_guided_start")));
++
+ extern __typeof(gomp_loop_ull_static_next) GOMP_loop_ull_static_next
+ __attribute__((alias ("gomp_loop_ull_static_next")));
+ extern __typeof(gomp_loop_ull_dynamic_next) GOMP_loop_ull_dynamic_next
+ __attribute__((alias ("gomp_loop_ull_dynamic_next")));
+ extern __typeof(gomp_loop_ull_guided_next) GOMP_loop_ull_guided_next
+ __attribute__((alias ("gomp_loop_ull_guided_next")));
++extern __typeof(gomp_loop_ull_dynamic_next) GOMP_loop_ull_nonmonotonic_dynamic_next
++ __attribute__((alias ("gomp_loop_ull_dynamic_next")));
++extern __typeof(gomp_loop_ull_guided_next) GOMP_loop_ull_nonmonotonic_guided_next
++ __attribute__((alias ("gomp_loop_ull_guided_next")));
+
+ extern __typeof(gomp_loop_ull_ordered_static_next) GOMP_loop_ull_ordered_static_next
+ __attribute__((alias ("gomp_loop_ull_ordered_static_next")));
+@@ -507,6 +630,25 @@ GOMP_loop_ull_guided_start (bool up, gom
+ }
+
+ bool
++GOMP_loop_ull_nonmonotonic_dynamic_start (bool up, gomp_ull start,
++ gomp_ull end, gomp_ull incr,
++ gomp_ull chunk_size,
++ gomp_ull *istart, gomp_ull *iend)
++{
++ return gomp_loop_ull_dynamic_start (up, start, end, incr, chunk_size, istart,
++ iend);
++}
++
++bool
++GOMP_loop_ull_nonmonotonic_guided_start (bool up, gomp_ull start, gomp_ull end,
++ gomp_ull incr, gomp_ull chunk_size,
++ gomp_ull *istart, gomp_ull *iend)
++{
++ return gomp_loop_ull_guided_start (up, start, end, incr, chunk_size, istart,
++ iend);
++}
++
++bool
+ GOMP_loop_ull_ordered_static_start (bool up, gomp_ull start, gomp_ull end,
+ gomp_ull incr, gomp_ull chunk_size,
+ gomp_ull *istart, gomp_ull *iend)
+@@ -534,6 +676,33 @@ GOMP_loop_ull_ordered_guided_start (bool
+ }
+
+ bool
++GOMP_loop_ull_doacross_static_start (unsigned ncounts, gomp_ull *counts,
++ gomp_ull chunk_size, gomp_ull *istart,
++ gomp_ull *iend)
++{
++ return gomp_loop_ull_doacross_static_start (ncounts, counts, chunk_size,
++ istart, iend);
++}
++
++bool
++GOMP_loop_ull_doacross_dynamic_start (unsigned ncounts, gomp_ull *counts,
++ gomp_ull chunk_size, gomp_ull *istart,
++ gomp_ull *iend)
++{
++ return gomp_loop_ull_doacross_dynamic_start (ncounts, counts, chunk_size,
++ istart, iend);
++}
++
++bool
++GOMP_loop_ull_doacross_guided_start (unsigned ncounts, gomp_ull *counts,
++ gomp_ull chunk_size, gomp_ull *istart,
++ gomp_ull *iend)
++{
++ return gomp_loop_ull_doacross_guided_start (ncounts, counts, chunk_size,
++ istart, iend);
++}
++
++bool
+ GOMP_loop_ull_static_next (gomp_ull *istart, gomp_ull *iend)
+ {
+ return gomp_loop_ull_static_next (istart, iend);
+@@ -550,6 +719,18 @@ GOMP_loop_ull_guided_next (gomp_ull *ist
+ {
+ return gomp_loop_ull_guided_next (istart, iend);
+ }
++
++bool
++GOMP_loop_ull_nonmonotonic_dynamic_next (gomp_ull *istart, gomp_ull *iend)
++{
++ return gomp_loop_ull_dynamic_next (istart, iend);
++}
++
++bool
++GOMP_loop_ull_nonmonotonic_guided_next (gomp_ull *istart, gomp_ull *iend)
++{
++ return gomp_loop_ull_guided_next (istart, iend);
++}
+
+ bool
+ GOMP_loop_ull_ordered_static_next (gomp_ull *istart, gomp_ull *iend)
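The doacross entry points above implement cross-iteration dependences for unsigned-long-long loops; the matching GOMP_doacross_ull_post and GOMP_doacross_ull_wait appear later in this patch (ordered.c). A sketch of the OpenMP 4.5 source construct that lowers to these calls; wavefront is a made-up example, not part of the patch:

#include <omp.h>

#define N 64
#define M 64

/* Iteration (i, j) may only run after (i-1, j) has passed its
   ordered region: a wavefront dependence expressed with the
   OpenMP 4.5 ordered(n)/depend(sink)/depend(source) clauses.  */
void
wavefront (double a[N][M])
{
#pragma omp parallel for ordered(2)
  for (int i = 1; i < N; i++)
    for (int j = 0; j < M; j++)
      {
#pragma omp ordered depend(sink: i - 1, j)
        a[i][j] += a[i - 1][j];
#pragma omp ordered depend(source)
      }
}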
+--- libgomp/team.c.jj 2014-05-15 10:56:32.092524669 +0200
++++ libgomp/team.c 2016-07-13 17:58:01.907291111 +0200
+@@ -133,6 +133,25 @@ gomp_thread_start (void *xdata)
+ return NULL;
+ }
+
++static inline struct gomp_team *
++get_last_team (unsigned nthreads)
++{
++ struct gomp_thread *thr = gomp_thread ();
++ if (thr->ts.team == NULL)
++ {
++ struct gomp_thread_pool *pool = thr->thread_pool;
++ if (pool != NULL)
++ {
++ struct gomp_team *last_team = pool->last_team;
++ if (last_team != NULL && last_team->nthreads == nthreads)
++ {
++ pool->last_team = NULL;
++ return last_team;
++ }
++ }
++ }
++ return NULL;
++}
+
+ /* Create a new team data structure. */
+
+@@ -140,18 +159,27 @@ struct gomp_team *
+ gomp_new_team (unsigned nthreads)
+ {
+ struct gomp_team *team;
+- size_t size;
+ int i;
+
+- size = sizeof (*team) + nthreads * (sizeof (team->ordered_release[0])
+- + sizeof (team->implicit_task[0]));
+- team = gomp_malloc (size);
++ team = get_last_team (nthreads);
++ if (team == NULL)
++ {
++ size_t extra = sizeof (team->ordered_release[0])
++ + sizeof (team->implicit_task[0]);
++ team = gomp_malloc (sizeof (*team) + nthreads * extra);
++
++#ifndef HAVE_SYNC_BUILTINS
++ gomp_mutex_init (&team->work_share_list_free_lock);
++#endif
++ gomp_barrier_init (&team->barrier, nthreads);
++ gomp_mutex_init (&team->task_lock);
++
++ team->nthreads = nthreads;
++ }
+
+ team->work_share_chunk = 8;
+ #ifdef HAVE_SYNC_BUILTINS
+ team->single_count = 0;
+-#else
+- gomp_mutex_init (&team->work_share_list_free_lock);
+ #endif
+ team->work_shares_to_free = &team->work_shares[0];
+ gomp_init_work_share (&team->work_shares[0], false, nthreads);
+@@ -162,15 +190,11 @@ gomp_new_team (unsigned nthreads)
+ team->work_shares[i].next_free = &team->work_shares[i + 1];
+ team->work_shares[i].next_free = NULL;
+
+- team->nthreads = nthreads;
+- gomp_barrier_init (&team->barrier, nthreads);
+-
+ gomp_sem_init (&team->master_release, 0);
+ team->ordered_release = (void *) &team->implicit_task[nthreads];
+ team->ordered_release[0] = &team->master_release;
+
+- gomp_mutex_init (&team->task_lock);
+- team->task_queue = NULL;
++ priority_queue_init (&team->task_queue);
+ team->task_count = 0;
+ team->task_queued_count = 0;
+ team->task_running_count = 0;
+@@ -186,8 +210,12 @@ gomp_new_team (unsigned nthreads)
+ static void
+ free_team (struct gomp_team *team)
+ {
++#ifndef HAVE_SYNC_BUILTINS
++ gomp_mutex_destroy (&team->work_share_list_free_lock);
++#endif
+ gomp_barrier_destroy (&team->barrier);
+ gomp_mutex_destroy (&team->task_lock);
++ priority_queue_free (&team->task_queue);
+ free (team);
+ }
+
+@@ -258,6 +286,8 @@ gomp_free_thread (void *arg __attribute_
+ free (pool);
+ thr->thread_pool = NULL;
+ }
++ if (thr->ts.level == 0 && __builtin_expect (thr->ts.team != NULL, 0))
++ gomp_team_end ();
+ if (thr->task != NULL)
+ {
+ struct gomp_task *task = thr->task;
+@@ -287,7 +317,7 @@ gomp_team_start (void (*fn) (void *), vo
+ struct gomp_thread **affinity_thr = NULL;
+
+ thr = gomp_thread ();
+- nested = thr->ts.team != NULL;
++ nested = thr->ts.level;
+ if (__builtin_expect (thr->thread_pool == NULL, 0))
+ {
+ thr->thread_pool = gomp_new_thread_pool ();
+@@ -894,9 +924,6 @@ gomp_team_end (void)
+ while (ws != NULL);
+ }
+ gomp_sem_destroy (&team->master_release);
+-#ifndef HAVE_SYNC_BUILTINS
+- gomp_mutex_destroy (&team->work_share_list_free_lock);
+-#endif
+
+ if (__builtin_expect (thr->ts.team != NULL, 0)
+ || __builtin_expect (team->nthreads == 1, 0))
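The get_last_team path above, together with the initialization moved under the team == NULL branch, means a non-nested program that repeatedly opens parallel regions with the same thread count recycles one team object instead of paying gomp_malloc plus barrier and mutex setup per region. A sketch of the pattern that benefits; iterate is a made-up example:

#include <omp.h>

double
iterate (const double *x, int n, int steps)
{
  double sum = 0.0;
  for (int s = 0; s < steps; s++)
    {
      /* Same nthreads each time, so after this patch the team is
         taken from pool->last_team rather than reallocated.  */
#pragma omp parallel for reduction(+:sum)
      for (int i = 0; i < n; i++)
        sum += x[i] * x[i];
    }
  return sum;
}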
+--- libgomp/target.c.jj 2014-05-15 10:56:38.313498020 +0200
++++ libgomp/target.c 2016-07-15 16:58:29.249328861 +0200
+@@ -22,14 +22,22 @@
+ see the files COPYING3 and COPYING.RUNTIME respectively. If not, see
+ <http://www.gnu.org/licenses/>. */
+
+-/* This file handles the maintainence of threads in response to team
+- creation and termination. */
++/* This file contains the support of offloading. */
+
++#include "config.h"
+ #include "libgomp.h"
++#include "oacc-plugin.h"
++#include "oacc-int.h"
++#include "gomp-constants.h"
+ #include <limits.h>
+ #include <stdbool.h>
+ #include <stdlib.h>
++#ifdef HAVE_INTTYPES_H
++# include <inttypes.h> /* For PRIu64. */
++#endif
+ #include <string.h>
++#include <assert.h>
++#include <errno.h>
+
+ attribute_hidden int
+ gomp_get_num_devices (void)
+@@ -37,22 +45,87 @@ gomp_get_num_devices (void)
+ return 0;
+ }
+
+-/* Called when encountering a target directive. If DEVICE
+- is -1, it means use device-var ICV. If it is -2 (or any other value
+- larger than last available hw device, use host fallback.
+- FN is address of host code, OPENMP_TARGET contains value of the
+- __OPENMP_TARGET__ symbol in the shared library or binary that invokes
+- GOMP_target. HOSTADDRS, SIZES and KINDS are arrays
+- with MAPNUM entries, with addresses of the host objects,
+- sizes of the host objects (resp. for pointer kind pointer bias
+- and assumed sizeof (void *) size) and kinds. */
++/* This function should be called from every offload image while loading.
++ It gets the descriptor of the host func and var tables HOST_TABLE, TYPE of
++ the target, and TARGET_DATA needed by target plugin. */
+
+ void
+-GOMP_target (int device, void (*fn) (void *), const void *openmp_target,
+- size_t mapnum, void **hostaddrs, size_t *sizes,
+- unsigned char *kinds)
++GOMP_offload_register_ver (unsigned version, const void *host_table,
++ int target_type, const void *target_data)
++{
++ (void) version;
++ (void) host_table;
++ (void) target_type;
++ (void) target_data;
++}
++
++void
++GOMP_offload_register (const void *host_table, int target_type,
++ const void *target_data)
++{
++ (void) host_table;
++ (void) target_type;
++ (void) target_data;
++}
++
++/* This function should be called from every offload image while unloading.
++ It gets the descriptor of the host func and var tables HOST_TABLE, TYPE of
++ the target, and TARGET_DATA needed by target plugin. */
++
++void
++GOMP_offload_unregister_ver (unsigned version, const void *host_table,
++ int target_type, const void *target_data)
++{
++ (void) version;
++ (void) host_table;
++ (void) target_type;
++ (void) target_data;
++}
++
++void
++GOMP_offload_unregister (const void *host_table, int target_type,
++ const void *target_data)
++{
++ (void) host_table;
++ (void) target_type;
++ (void) target_data;
++}
++
++/* This function initializes the target device, specified by DEVICEP. DEVICEP
++ must be locked on entry, and remains locked on return. */
++
++attribute_hidden void
++gomp_init_device (struct gomp_device_descr *devicep)
++{
++ devicep->state = GOMP_DEVICE_INITIALIZED;
++}
++
++attribute_hidden void
++gomp_unload_device (struct gomp_device_descr *devicep)
++{
++}
++
++/* Free address mapping tables. MM must be locked on entry, and remains locked
++ on return. */
++
++attribute_hidden void
++gomp_free_memmap (struct splay_tree_s *mem_map)
++{
++ while (mem_map->root)
++ {
++ struct target_mem_desc *tgt = mem_map->root->key.tgt;
++
++ splay_tree_remove (mem_map, &mem_map->root->key);
++ free (tgt->array);
++ free (tgt);
++ }
++}
++
++/* Host fallback for GOMP_target{,_ext} routines. */
++
++static void
++gomp_target_fallback (void (*fn) (void *), void **hostaddrs)
+ {
+- /* Host fallback. */
+ struct gomp_thread old_thr, *thr = gomp_thread ();
+ old_thr = *thr;
+ memset (thr, '\0', sizeof (*thr));
+@@ -66,10 +139,167 @@ GOMP_target (int device, void (*fn) (voi
+ *thr = old_thr;
+ }
+
++/* Calculate alignment and size requirements of a private copy of data shared
++ as GOMP_MAP_FIRSTPRIVATE and store them to TGT_ALIGN and TGT_SIZE. */
++
++static inline void
++calculate_firstprivate_requirements (size_t mapnum, size_t *sizes,
++ unsigned short *kinds, size_t *tgt_align,
++ size_t *tgt_size)
++{
++ size_t i;
++ for (i = 0; i < mapnum; i++)
++ if ((kinds[i] & 0xff) == GOMP_MAP_FIRSTPRIVATE)
++ {
++ size_t align = (size_t) 1 << (kinds[i] >> 8);
++ if (*tgt_align < align)
++ *tgt_align = align;
++ *tgt_size = (*tgt_size + align - 1) & ~(align - 1);
++ *tgt_size += sizes[i];
++ }
++}
++
++/* Copy data shared as GOMP_MAP_FIRSTPRIVATE to DST. */
++
++static inline void
++copy_firstprivate_data (char *tgt, size_t mapnum, void **hostaddrs,
++ size_t *sizes, unsigned short *kinds, size_t tgt_align,
++ size_t tgt_size)
++{
++ uintptr_t al = (uintptr_t) tgt & (tgt_align - 1);
++ if (al)
++ tgt += tgt_align - al;
++ tgt_size = 0;
++ size_t i;
++ for (i = 0; i < mapnum; i++)
++ if ((kinds[i] & 0xff) == GOMP_MAP_FIRSTPRIVATE)
++ {
++ size_t align = (size_t) 1 << (kinds[i] >> 8);
++ tgt_size = (tgt_size + align - 1) & ~(align - 1);
++ memcpy (tgt + tgt_size, hostaddrs[i], sizes[i]);
++ hostaddrs[i] = tgt + tgt_size;
++ tgt_size = tgt_size + sizes[i];
++ }
++}
++
++/* Called when encountering a target directive. If DEVICE
++ is GOMP_DEVICE_ICV, it means use device-var ICV. If it is
++ GOMP_DEVICE_HOST_FALLBACK (or any value
++ larger than last available hw device), use host fallback.
++ FN is address of host code, UNUSED is part of the current ABI, but
++ we're not actually using it. HOSTADDRS, SIZES and KINDS are arrays
++ with MAPNUM entries, with addresses of the host objects,
++ sizes of the host objects (resp. for pointer kind pointer bias
++ and assumed sizeof (void *) size) and kinds. */
++
++void
++GOMP_target (int device, void (*fn) (void *), const void *unused,
++ size_t mapnum, void **hostaddrs, size_t *sizes,
++ unsigned char *kinds)
++{
++ return gomp_target_fallback (fn, hostaddrs);
++}
++
++/* Like GOMP_target, but KINDS is 16-bit, UNUSED is no longer present,
++ and several arguments have been added:
++ FLAGS is a bitmask, see GOMP_TARGET_FLAG_* in gomp-constants.h.
++ DEPEND is array of dependencies, see GOMP_task for details.
++
++ ARGS is a pointer to an array consisting of a variable number of both
++ device-independent and device-specific arguments, which can take one or two
++ elements where the first specifies for which device it is intended, the type
++ and optionally also the value. If the value is not present in the first
++ one, the whole second element is the actual value. The last element of the
++ array is a single NULL. Among the device-independent ones are for example
++ NUM_TEAMS and THREAD_LIMIT.
++
++ NUM_TEAMS is positive if GOMP_teams will be called in the body with
++ that value, or 1 if teams construct is not present, or 0, if
++ teams construct does not have num_teams clause and so the choice is
++ implementation defined, and -1 if it can't be determined on the host
++ what value will GOMP_teams have on the device.
++ THREAD_LIMIT similarly is positive if GOMP_teams will be called in the
++ body with that value, or 0, if teams construct does not have thread_limit
++ clause or the teams construct is not present, or -1 if it can't be
++ determined on the host what value will GOMP_teams have on the device. */
++
++void
++GOMP_target_ext (int device, void (*fn) (void *), size_t mapnum,
++ void **hostaddrs, size_t *sizes, unsigned short *kinds,
++ unsigned int flags, void **depend, void **args)
++{
++ size_t tgt_align = 0, tgt_size = 0;
++ bool fpc_done = false;
++
++ if (flags & GOMP_TARGET_FLAG_NOWAIT)
++ {
++ struct gomp_thread *thr = gomp_thread ();
++ if (thr->ts.team
++ && !thr->task->final_task)
++ {
++ gomp_create_target_task (NULL, fn, mapnum, hostaddrs,
++ sizes, kinds, flags, depend, args,
++ GOMP_TARGET_TASK_BEFORE_MAP);
++ return;
++ }
++ }
++
++ /* If there are depend clauses, but nowait is not present
++ (or we are in a final task), block the parent task until the
++ dependencies are resolved and then just continue with the rest
++ of the function as if it is a merged task. */
++ if (depend != NULL)
++ {
++ struct gomp_thread *thr = gomp_thread ();
++ if (thr->task && thr->task->depend_hash)
++ {
++ /* If we might need to wait, copy firstprivate now. */
++ calculate_firstprivate_requirements (mapnum, sizes, kinds,
++ &tgt_align, &tgt_size);
++ if (tgt_align)
++ {
++ char *tgt = gomp_alloca (tgt_size + tgt_align - 1);
++ copy_firstprivate_data (tgt, mapnum, hostaddrs, sizes, kinds,
++ tgt_align, tgt_size);
++ }
++ fpc_done = true;
++ gomp_task_maybe_wait_for_dependencies (depend);
++ }
++ }
++
++ if (!fpc_done)
++ {
++ calculate_firstprivate_requirements (mapnum, sizes, kinds,
++ &tgt_align, &tgt_size);
++ if (tgt_align)
++ {
++ char *tgt = gomp_alloca (tgt_size + tgt_align - 1);
++ copy_firstprivate_data (tgt, mapnum, hostaddrs, sizes, kinds,
++ tgt_align, tgt_size);
++ }
++ }
++ gomp_target_fallback (fn, hostaddrs);
++}
++
++/* Host fallback for GOMP_target_data{,_ext} routines. */
++
++static void
++gomp_target_data_fallback (void)
++{
++}
++
+ void
+-GOMP_target_data (int device, const void *openmp_target, size_t mapnum,
++GOMP_target_data (int device, const void *unused, size_t mapnum,
+ void **hostaddrs, size_t *sizes, unsigned char *kinds)
+ {
++ return gomp_target_data_fallback ();
++}
++
++void
++GOMP_target_data_ext (int device, size_t mapnum, void **hostaddrs,
++ size_t *sizes, unsigned short *kinds)
++{
++ return gomp_target_data_fallback ();
+ }
+
+ void
+@@ -78,12 +308,112 @@ GOMP_target_end_data (void)
+ }
+
+ void
+-GOMP_target_update (int device, const void *openmp_target, size_t mapnum,
++GOMP_target_update (int device, const void *unused, size_t mapnum,
+ void **hostaddrs, size_t *sizes, unsigned char *kinds)
+ {
+ }
+
+ void
++GOMP_target_update_ext (int device, size_t mapnum, void **hostaddrs,
++ size_t *sizes, unsigned short *kinds,
++ unsigned int flags, void **depend)
++{
++ /* If there are depend clauses, but nowait is not present,
++ block the parent task until the dependencies are resolved
++ and then just continue with the rest of the function as if it
++ is a merged task. Until we are able to schedule tasks during
++ variable mapping or unmapping, ignore nowait if depend clauses
++ are not present. */
++ if (depend != NULL)
++ {
++ struct gomp_thread *thr = gomp_thread ();
++ if (thr->task && thr->task->depend_hash)
++ {
++ if ((flags & GOMP_TARGET_FLAG_NOWAIT)
++ && thr->ts.team
++ && !thr->task->final_task)
++ {
++ if (gomp_create_target_task (NULL, (void (*) (void *)) NULL,
++ mapnum, hostaddrs, sizes, kinds,
++ flags | GOMP_TARGET_FLAG_UPDATE,
++ depend, NULL, GOMP_TARGET_TASK_DATA))
++ return;
++ }
++ else
++ {
++ struct gomp_team *team = thr->ts.team;
++ /* If parallel or taskgroup has been cancelled, don't start new
++ tasks. */
++ if (team
++ && (gomp_team_barrier_cancelled (&team->barrier)
++ || (thr->task->taskgroup
++ && thr->task->taskgroup->cancelled)))
++ return;
++
++ gomp_task_maybe_wait_for_dependencies (depend);
++ }
++ }
++ }
++}
++
++void
++GOMP_target_enter_exit_data (int device, size_t mapnum, void **hostaddrs,
++ size_t *sizes, unsigned short *kinds,
++ unsigned int flags, void **depend)
++{
++ /* If there are depend clauses, but nowait is not present,
++ block the parent task until the dependencies are resolved
++ and then just continue with the rest of the function as if it
++ is a merged task. Until we are able to schedule tasks during
++ variable mapping or unmapping, ignore nowait if depend clauses
++ are not present. */
++ if (depend != NULL)
++ {
++ struct gomp_thread *thr = gomp_thread ();
++ if (thr->task && thr->task->depend_hash)
++ {
++ if ((flags & GOMP_TARGET_FLAG_NOWAIT)
++ && thr->ts.team
++ && !thr->task->final_task)
++ {
++ if (gomp_create_target_task (NULL, (void (*) (void *)) NULL,
++ mapnum, hostaddrs, sizes, kinds,
++ flags, depend, NULL,
++ GOMP_TARGET_TASK_DATA))
++ return;
++ }
++ else
++ {
++ struct gomp_team *team = thr->ts.team;
++ /* If parallel or taskgroup has been cancelled, don't start new
++ tasks. */
++ if (team
++ && (gomp_team_barrier_cancelled (&team->barrier)
++ || (thr->task->taskgroup
++ && thr->task->taskgroup->cancelled)))
++ return;
++
++ gomp_task_maybe_wait_for_dependencies (depend);
++ }
++ }
++ }
++}
++
++bool
++gomp_target_task_fn (void *data)
++{
++ struct gomp_target_task *ttask = (struct gomp_target_task *) data;
++
++ if (ttask->fn != NULL)
++ {
++ ttask->state = GOMP_TARGET_TASK_FALLBACK;
++ gomp_target_fallback (ttask->fn, ttask->hostaddrs);
++ return false;
++ }
++ return false;
++}
++
++void
+ GOMP_teams (unsigned int num_teams, unsigned int thread_limit)
+ {
+ if (thread_limit)
+@@ -94,3 +424,153 @@ GOMP_teams (unsigned int num_teams, unsi
+ }
+ (void) num_teams;
+ }
++
++void *
++omp_target_alloc (size_t size, int device_num)
++{
++ if (device_num == GOMP_DEVICE_HOST_FALLBACK)
++ return malloc (size);
++
++ return NULL;
++}
++
++void
++omp_target_free (void *device_ptr, int device_num)
++{
++ if (device_ptr == NULL)
++ return;
++
++ if (device_num == GOMP_DEVICE_HOST_FALLBACK)
++ {
++ free (device_ptr);
++ return;
++ }
++}
++
++int
++omp_target_is_present (void *ptr, int device_num)
++{
++ if (ptr == NULL)
++ return 1;
++
++ if (device_num == GOMP_DEVICE_HOST_FALLBACK)
++ return 1;
++
++ return 0;
++}
++
++int
++omp_target_memcpy (void *dst, void *src, size_t length, size_t dst_offset,
++ size_t src_offset, int dst_device_num, int src_device_num)
++{
++ if (dst_device_num != GOMP_DEVICE_HOST_FALLBACK)
++ return EINVAL;
++ if (src_device_num != GOMP_DEVICE_HOST_FALLBACK)
++ return EINVAL;
++ memcpy ((char *) dst + dst_offset, (char *) src + src_offset, length);
++ return 0;
++}
++
++#define HALF_SIZE_T (((size_t) 1) << (8 * sizeof (size_t) / 2))
++
++#define __builtin_mul_overflow(x, y, z) \
++ ({ bool retval = false; \
++ size_t xval = (x); \
++ size_t yval = (y); \
++ size_t zval = xval * yval; \
++ if (__builtin_expect ((xval | yval) >= HALF_SIZE_T, 0)) \
++ { \
++ if (xval && zval / xval != yval) \
++ retval = true; \
++ } \
++ *(z) = zval; \
++ retval; })
++
++static int
++omp_target_memcpy_rect_worker (void *dst, void *src, size_t element_size,
++ int num_dims, const size_t *volume,
++ const size_t *dst_offsets,
++ const size_t *src_offsets,
++ const size_t *dst_dimensions,
++ const size_t *src_dimensions)
++{
++ size_t dst_slice = element_size;
++ size_t src_slice = element_size;
++ size_t j, dst_off, src_off, length;
++ int i, ret;
++
++
++ if (num_dims == 1)
++ {
++ if (__builtin_mul_overflow (element_size, volume[0], &length)
++ || __builtin_mul_overflow (element_size, dst_offsets[0], &dst_off)
++ || __builtin_mul_overflow (element_size, src_offsets[0], &src_off))
++ return EINVAL;
++ memcpy ((char *) dst + dst_off, (char *) src + src_off, length);
++ ret = 1;
++ return ret ? 0 : EINVAL;
++ }
++
++ /* FIXME: it would be nice to have some plugin function to handle
++ num_dims == 2 and num_dims == 3 more efficiently. Larger ones can
++ be handled in the generic recursion below, and for host-host it
++ should be used even for any num_dims >= 2. */
++
++ for (i = 1; i < num_dims; i++)
++ if (__builtin_mul_overflow (dst_slice, dst_dimensions[i], &dst_slice)
++ || __builtin_mul_overflow (src_slice, src_dimensions[i], &src_slice))
++ return EINVAL;
++ if (__builtin_mul_overflow (dst_slice, dst_offsets[0], &dst_off)
++ || __builtin_mul_overflow (src_slice, src_offsets[0], &src_off))
++ return EINVAL;
++ for (j = 0; j < volume[0]; j++)
++ {
++ ret = omp_target_memcpy_rect_worker ((char *) dst + dst_off,
++ (char *) src + src_off,
++ element_size, num_dims - 1,
++ volume + 1, dst_offsets + 1,
++ src_offsets + 1, dst_dimensions + 1,
++ src_dimensions + 1);
++ if (ret)
++ return ret;
++ dst_off += dst_slice;
++ src_off += src_slice;
++ }
++ return 0;
++}
++
++int
++omp_target_memcpy_rect (void *dst, void *src, size_t element_size,
++ int num_dims, const size_t *volume,
++ const size_t *dst_offsets,
++ const size_t *src_offsets,
++ const size_t *dst_dimensions,
++ const size_t *src_dimensions,
++ int dst_device_num, int src_device_num)
++{
++ if (!dst && !src)
++ return INT_MAX;
++
++ if (dst_device_num != GOMP_DEVICE_HOST_FALLBACK)
++ return EINVAL;
++ if (src_device_num != GOMP_DEVICE_HOST_FALLBACK)
++ return EINVAL;
++
++ int ret = omp_target_memcpy_rect_worker (dst, src, element_size, num_dims,
++ volume, dst_offsets, src_offsets,
++ dst_dimensions, src_dimensions);
++ return ret;
++}
++
++int
++omp_target_associate_ptr (void *host_ptr, void *device_ptr, size_t size,
++ size_t device_offset, int device_num)
++{
++ return EINVAL;
++}
++
++int
++omp_target_disassociate_ptr (void *ptr, int device_num)
++{
++ return EINVAL;
++}
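In this stub (host-only) libgomp the only device number the omp_target_* routines accept is the host fallback, which omp_get_initial_device returns, so the rectangle copy reduces to the recursive host memcpy above. A small sketch copying a 2x3 sub-block of a 4x8 matrix:

#include <omp.h>
#include <stdio.h>

int
main (void)
{
  int host = omp_get_initial_device ();
  double src[4][8], dst[4][8] = { { 0 } };
  for (int i = 0; i < 4; i++)
    for (int j = 0; j < 8; j++)
      src[i][j] = i * 8 + j;

  size_t volume[2] = { 2, 3 };   /* copy 2 rows x 3 columns */
  size_t offs[2]   = { 1, 2 };   /* starting at element (1, 2) */
  size_t dims[2]   = { 4, 8 };   /* full matrix dimensions */
  int err = omp_target_memcpy_rect (dst, src, sizeof (double), 2,
                                    volume, offs, offs, dims, dims,
                                    host, host);
  printf ("err=%d dst[1][2]=%g\n", err, dst[1][2]);   /* err=0, 10 */
  return 0;
}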
+--- libgomp/fortran.c.jj 2014-05-15 10:56:31.593531223 +0200
++++ libgomp/fortran.c 2016-07-13 16:57:04.432535397 +0200
+@@ -67,12 +67,20 @@ ialias_redirect (omp_get_active_level)
+ ialias_redirect (omp_in_final)
+ ialias_redirect (omp_get_cancellation)
+ ialias_redirect (omp_get_proc_bind)
++ialias_redirect (omp_get_num_places)
++ialias_redirect (omp_get_place_num_procs)
++ialias_redirect (omp_get_place_proc_ids)
++ialias_redirect (omp_get_place_num)
++ialias_redirect (omp_get_partition_num_places)
++ialias_redirect (omp_get_partition_place_nums)
+ ialias_redirect (omp_set_default_device)
+ ialias_redirect (omp_get_default_device)
+ ialias_redirect (omp_get_num_devices)
+ ialias_redirect (omp_get_num_teams)
+ ialias_redirect (omp_get_team_num)
+ ialias_redirect (omp_is_initial_device)
++ialias_redirect (omp_get_initial_device)
++ialias_redirect (omp_get_max_task_priority)
+ #endif
+
+ #ifndef LIBGOMP_GNU_SYMBOL_VERSIONING
+@@ -342,35 +350,35 @@ omp_get_wtime_ (void)
+ }
+
+ void
+-omp_set_schedule_ (const int32_t *kind, const int32_t *modifier)
++omp_set_schedule_ (const int32_t *kind, const int32_t *chunk_size)
+ {
+- omp_set_schedule (*kind, *modifier);
++ omp_set_schedule (*kind, *chunk_size);
+ }
+
+ void
+-omp_set_schedule_8_ (const int32_t *kind, const int64_t *modifier)
++omp_set_schedule_8_ (const int32_t *kind, const int64_t *chunk_size)
+ {
+- omp_set_schedule (*kind, TO_INT (*modifier));
++ omp_set_schedule (*kind, TO_INT (*chunk_size));
+ }
+
+ void
+-omp_get_schedule_ (int32_t *kind, int32_t *modifier)
++omp_get_schedule_ (int32_t *kind, int32_t *chunk_size)
+ {
+ omp_sched_t k;
+- int m;
+- omp_get_schedule (&k, &m);
++ int cs;
++ omp_get_schedule (&k, &cs);
+ *kind = k;
+- *modifier = m;
++ *chunk_size = cs;
+ }
+
+ void
+-omp_get_schedule_8_ (int32_t *kind, int64_t *modifier)
++omp_get_schedule_8_ (int32_t *kind, int64_t *chunk_size)
+ {
+ omp_sched_t k;
+- int m;
+- omp_get_schedule (&k, &m);
++ int cs;
++ omp_get_schedule (&k, &cs);
+ *kind = k;
+- *modifier = m;
++ *chunk_size = cs;
+ }
+
+ int32_t
+@@ -451,6 +459,69 @@ omp_get_proc_bind_ (void)
+ return omp_get_proc_bind ();
+ }
+
++int32_t
++omp_get_num_places_ (void)
++{
++ return omp_get_num_places ();
++}
++
++int32_t
++omp_get_place_num_procs_ (const int32_t *place_num)
++{
++ return omp_get_place_num_procs (*place_num);
++}
++
++int32_t
++omp_get_place_num_procs_8_ (const int64_t *place_num)
++{
++ return omp_get_place_num_procs (TO_INT (*place_num));
++}
++
++void
++omp_get_place_proc_ids_ (const int32_t *place_num, int32_t *ids)
++{
++ omp_get_place_proc_ids (*place_num, (int *) ids);
++}
++
++void
++omp_get_place_proc_ids_8_ (const int64_t *place_num, int64_t *ids)
++{
++ gomp_get_place_proc_ids_8 (TO_INT (*place_num), ids);
++}
++
++int32_t
++omp_get_place_num_ (void)
++{
++ return omp_get_place_num ();
++}
++
++int32_t
++omp_get_partition_num_places_ (void)
++{
++ return omp_get_partition_num_places ();
++}
++
++void
++omp_get_partition_place_nums_ (int32_t *place_nums)
++{
++ omp_get_partition_place_nums ((int *) place_nums);
++}
++
++void
++omp_get_partition_place_nums_8_ (int64_t *place_nums)
++{
++ if (gomp_places_list == NULL)
++ return;
++
++ struct gomp_thread *thr = gomp_thread ();
++ if (thr->place == 0)
++ gomp_init_affinity ();
++
++ unsigned int i;
++ for (i = 0; i < thr->ts.place_partition_len; i++)
++ *place_nums++ = (int64_t) thr->ts.place_partition_off + i;
++}
++
+ void
+ omp_set_default_device_ (const int32_t *device_num)
+ {
+@@ -492,3 +563,15 @@ omp_is_initial_device_ (void)
+ {
+ return omp_is_initial_device ();
+ }
++
++int32_t
++omp_get_initial_device_ (void)
++{
++ return omp_get_initial_device ();
++}
++
++int32_t
++omp_get_max_task_priority_ (void)
++{
++ return omp_get_max_task_priority ();
++}
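The renames above (modifier to chunk_size) track OpenMP 4.5 terminology; the behavior of the Fortran wrappers is unchanged. The C-level pair they forward to works as in this sketch:

#include <omp.h>
#include <stdio.h>

int
main (void)
{
  omp_sched_t kind;
  int chunk;

  omp_set_schedule (omp_sched_dynamic, 4);   /* chunk_size = 4 */
  omp_get_schedule (&kind, &chunk);
  printf ("kind=%d chunk_size=%d\n", (int) kind, chunk);
  return 0;
}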
+--- libgomp/libgomp.map.jj 2014-05-15 10:56:31.927533549 +0200
++++ libgomp/libgomp.map 2016-07-13 16:57:04.434535373 +0200
+@@ -134,6 +134,36 @@ OMP_4.0 {
+ omp_is_initial_device_;
+ } OMP_3.1;
+
++OMP_4.5 {
++ global:
++ omp_get_max_task_priority;
++ omp_get_max_task_priority_;
++ omp_get_num_places;
++ omp_get_num_places_;
++ omp_get_place_num_procs;
++ omp_get_place_num_procs_;
++ omp_get_place_num_procs_8_;
++ omp_get_place_proc_ids;
++ omp_get_place_proc_ids_;
++ omp_get_place_proc_ids_8_;
++ omp_get_place_num;
++ omp_get_place_num_;
++ omp_get_partition_num_places;
++ omp_get_partition_num_places_;
++ omp_get_partition_place_nums;
++ omp_get_partition_place_nums_;
++ omp_get_partition_place_nums_8_;
++ omp_get_initial_device;
++ omp_get_initial_device_;
++ omp_target_alloc;
++ omp_target_free;
++ omp_target_is_present;
++ omp_target_memcpy;
++ omp_target_memcpy_rect;
++ omp_target_associate_ptr;
++ omp_target_disassociate_ptr;
++} OMP_4.0;
++
+ GOMP_1.0 {
+ global:
+ GOMP_atomic_end;
+@@ -227,3 +257,158 @@ GOMP_4.0 {
+ GOMP_target_update;
+ GOMP_teams;
+ } GOMP_3.0;
++
++GOMP_4.0.1 {
++ global:
++ GOMP_offload_register;
++ GOMP_offload_unregister;
++} GOMP_4.0;
++
++GOMP_4.5 {
++ global:
++ GOMP_target_ext;
++ GOMP_target_data_ext;
++ GOMP_target_update_ext;
++ GOMP_target_enter_exit_data;
++ GOMP_taskloop;
++ GOMP_taskloop_ull;
++ GOMP_offload_register_ver;
++ GOMP_offload_unregister_ver;
++ GOMP_loop_doacross_dynamic_start;
++ GOMP_loop_doacross_guided_start;
++ GOMP_loop_doacross_runtime_start;
++ GOMP_loop_doacross_static_start;
++ GOMP_doacross_post;
++ GOMP_doacross_wait;
++ GOMP_loop_ull_doacross_dynamic_start;
++ GOMP_loop_ull_doacross_guided_start;
++ GOMP_loop_ull_doacross_runtime_start;
++ GOMP_loop_ull_doacross_static_start;
++ GOMP_doacross_ull_post;
++ GOMP_doacross_ull_wait;
++ GOMP_loop_nonmonotonic_dynamic_next;
++ GOMP_loop_nonmonotonic_dynamic_start;
++ GOMP_loop_nonmonotonic_guided_next;
++ GOMP_loop_nonmonotonic_guided_start;
++ GOMP_loop_ull_nonmonotonic_dynamic_next;
++ GOMP_loop_ull_nonmonotonic_dynamic_start;
++ GOMP_loop_ull_nonmonotonic_guided_next;
++ GOMP_loop_ull_nonmonotonic_guided_start;
++ GOMP_parallel_loop_nonmonotonic_dynamic;
++ GOMP_parallel_loop_nonmonotonic_guided;
++} GOMP_4.0.1;
++
++OACC_2.0 {
++ global:
++ acc_get_num_devices;
++ acc_get_num_devices_h_;
++ acc_set_device_type;
++ acc_set_device_type_h_;
++ acc_get_device_type;
++ acc_get_device_type_h_;
++ acc_set_device_num;
++ acc_set_device_num_h_;
++ acc_get_device_num;
++ acc_get_device_num_h_;
++ acc_async_test;
++ acc_async_test_h_;
++ acc_async_test_all;
++ acc_async_test_all_h_;
++ acc_wait;
++ acc_wait_h_;
++ acc_wait_async;
++ acc_wait_async_h_;
++ acc_wait_all;
++ acc_wait_all_h_;
++ acc_wait_all_async;
++ acc_wait_all_async_h_;
++ acc_init;
++ acc_init_h_;
++ acc_shutdown;
++ acc_shutdown_h_;
++ acc_on_device;
++ acc_on_device_h_;
++ acc_malloc;
++ acc_free;
++ acc_copyin;
++ acc_copyin_32_h_;
++ acc_copyin_64_h_;
++ acc_copyin_array_h_;
++ acc_present_or_copyin;
++ acc_present_or_copyin_32_h_;
++ acc_present_or_copyin_64_h_;
++ acc_present_or_copyin_array_h_;
++ acc_create;
++ acc_create_32_h_;
++ acc_create_64_h_;
++ acc_create_array_h_;
++ acc_present_or_create;
++ acc_present_or_create_32_h_;
++ acc_present_or_create_64_h_;
++ acc_present_or_create_array_h_;
++ acc_copyout;
++ acc_copyout_32_h_;
++ acc_copyout_64_h_;
++ acc_copyout_array_h_;
++ acc_delete;
++ acc_delete_32_h_;
++ acc_delete_64_h_;
++ acc_delete_array_h_;
++ acc_update_device;
++ acc_update_device_32_h_;
++ acc_update_device_64_h_;
++ acc_update_device_array_h_;
++ acc_update_self;
++ acc_update_self_32_h_;
++ acc_update_self_64_h_;
++ acc_update_self_array_h_;
++ acc_map_data;
++ acc_unmap_data;
++ acc_deviceptr;
++ acc_hostptr;
++ acc_is_present;
++ acc_is_present_32_h_;
++ acc_is_present_64_h_;
++ acc_is_present_array_h_;
++ acc_memcpy_to_device;
++ acc_memcpy_from_device;
++ acc_get_current_cuda_device;
++ acc_get_current_cuda_context;
++ acc_get_cuda_stream;
++ acc_set_cuda_stream;
++};
++
++GOACC_2.0 {
++ global:
++ GOACC_data_end;
++ GOACC_data_start;
++ GOACC_enter_exit_data;
++ GOACC_parallel;
++ GOACC_update;
++ GOACC_wait;
++ GOACC_get_thread_num;
++ GOACC_get_num_threads;
++};
++
++GOACC_2.0.1 {
++ global:
++ GOACC_declare;
++ GOACC_parallel_keyed;
++} GOACC_2.0;
++
++GOMP_PLUGIN_1.0 {
++ global:
++ GOMP_PLUGIN_malloc;
++ GOMP_PLUGIN_malloc_cleared;
++ GOMP_PLUGIN_realloc;
++ GOMP_PLUGIN_debug;
++ GOMP_PLUGIN_error;
++ GOMP_PLUGIN_fatal;
++ GOMP_PLUGIN_async_unmap_vars;
++ GOMP_PLUGIN_acc_thread;
++};
++
++GOMP_PLUGIN_1.1 {
++ global:
++ GOMP_PLUGIN_target_task_completion;
++} GOMP_PLUGIN_1.0;
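Whether an installed libgomp.so.1 actually carries these new version nodes can be checked at run time with dlvsym, a GNU extension; a sketch (link with -ldl):

#define _GNU_SOURCE
#include <dlfcn.h>
#include <stdio.h>

int
main (void)
{
  void *h = dlopen ("libgomp.so.1", RTLD_LAZY);
  if (h == NULL)
    return 1;
  /* Resolves only if the library was linked with the OMP_4.5 node
     from the map file above.  */
  void *sym = dlvsym (h, "omp_get_max_task_priority", "OMP_4.5");
  printf ("OMP_4.5 %s\n", sym ? "present" : "absent");
  dlclose (h);
  return 0;
}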
+--- libgomp/ordered.c.jj 2013-01-21 16:00:46.137873657 +0100
++++ libgomp/ordered.c 2016-07-13 16:57:18.918355780 +0200
+@@ -25,6 +25,9 @@
+ /* This file handles the ORDERED construct. */
+
+ #include "libgomp.h"
++#include <stdarg.h>
++#include <string.h>
++#include "doacross.h"
+
+
+ /* This function is called when first allocating an iteration block. That
+@@ -249,3 +252,533 @@ void
+ GOMP_ordered_end (void)
+ {
+ }
++
++/* DOACROSS initialization. */
++
++#define MAX_COLLAPSED_BITS (__SIZEOF_LONG__ * __CHAR_BIT__)
++
++void
++gomp_doacross_init (unsigned ncounts, long *counts, long chunk_size)
++{
++ struct gomp_thread *thr = gomp_thread ();
++ struct gomp_team *team = thr->ts.team;
++ struct gomp_work_share *ws = thr->ts.work_share;
++ unsigned int i, bits[MAX_COLLAPSED_BITS], num_bits = 0;
++ unsigned long ent, num_ents, elt_sz, shift_sz;
++ struct gomp_doacross_work_share *doacross;
++
++ if (team == NULL || team->nthreads == 1)
++ return;
++
++ for (i = 0; i < ncounts; i++)
++ {
++ /* If any count is 0, GOMP_doacross_{post,wait} can't be called. */
++ if (counts[i] == 0)
++ return;
++
++ if (num_bits <= MAX_COLLAPSED_BITS)
++ {
++ unsigned int this_bits;
++ if (counts[i] == 1)
++ this_bits = 1;
++ else
++ this_bits = __SIZEOF_LONG__ * __CHAR_BIT__
++ - __builtin_clzl (counts[i] - 1);
++ if (num_bits + this_bits <= MAX_COLLAPSED_BITS)
++ {
++ bits[i] = this_bits;
++ num_bits += this_bits;
++ }
++ else
++ num_bits = MAX_COLLAPSED_BITS + 1;
++ }
++ }
++
++ if (ws->sched == GFS_STATIC)
++ num_ents = team->nthreads;
++ else if (ws->sched == GFS_GUIDED)
++ num_ents = counts[0];
++ else
++ num_ents = (counts[0] - 1) / chunk_size + 1;
++ if (num_bits <= MAX_COLLAPSED_BITS)
++ {
++ elt_sz = sizeof (unsigned long);
++ shift_sz = ncounts * sizeof (unsigned int);
++ }
++ else
++ {
++ elt_sz = sizeof (unsigned long) * ncounts;
++ shift_sz = 0;
++ }
++ elt_sz = (elt_sz + 63) & ~63UL;
++
++ doacross = gomp_malloc (sizeof (*doacross) + 63 + num_ents * elt_sz
++ + shift_sz);
++ doacross->chunk_size = chunk_size;
++ doacross->elt_sz = elt_sz;
++ doacross->ncounts = ncounts;
++ doacross->flattened = false;
++ doacross->array = (unsigned char *)
++ ((((uintptr_t) (doacross + 1)) + 63 + shift_sz)
++ & ~(uintptr_t) 63);
++ if (num_bits <= MAX_COLLAPSED_BITS)
++ {
++ unsigned int shift_count = 0;
++ doacross->flattened = true;
++ for (i = ncounts; i > 0; i--)
++ {
++ doacross->shift_counts[i - 1] = shift_count;
++ shift_count += bits[i - 1];
++ }
++ for (ent = 0; ent < num_ents; ent++)
++ *(unsigned long *) (doacross->array + ent * elt_sz) = 0;
++ }
++ else
++ for (ent = 0; ent < num_ents; ent++)
++ memset (doacross->array + ent * elt_sz, '\0',
++ sizeof (unsigned long) * ncounts);
++ if (ws->sched == GFS_STATIC && chunk_size == 0)
++ {
++ unsigned long q = counts[0] / num_ents;
++ unsigned long t = counts[0] % num_ents;
++ doacross->boundary = t * (q + 1);
++ doacross->q = q;
++ doacross->t = t;
++ }
++ ws->doacross = doacross;
++}
++
++/* DOACROSS POST operation. */
++
++void
++GOMP_doacross_post (long *counts)
++{
++ struct gomp_thread *thr = gomp_thread ();
++ struct gomp_work_share *ws = thr->ts.work_share;
++ struct gomp_doacross_work_share *doacross = ws->doacross;
++ unsigned long ent;
++ unsigned int i;
++
++ if (__builtin_expect (doacross == NULL, 0))
++ {
++ __sync_synchronize ();
++ return;
++ }
++
++ if (__builtin_expect (ws->sched == GFS_STATIC, 1))
++ ent = thr->ts.team_id;
++ else if (ws->sched == GFS_GUIDED)
++ ent = counts[0];
++ else
++ ent = counts[0] / doacross->chunk_size;
++ unsigned long *array = (unsigned long *) (doacross->array
++ + ent * doacross->elt_sz);
++
++ if (__builtin_expect (doacross->flattened, 1))
++ {
++ unsigned long flattened
++ = (unsigned long) counts[0] << doacross->shift_counts[0];
++
++ for (i = 1; i < doacross->ncounts; i++)
++ flattened |= (unsigned long) counts[i]
++ << doacross->shift_counts[i];
++ flattened++;
++ if (flattened == __atomic_load_n (array, MEMMODEL_ACQUIRE))
++ __atomic_thread_fence (MEMMODEL_RELEASE);
++ else
++ __atomic_store_n (array, flattened, MEMMODEL_RELEASE);
++ return;
++ }
++
++ __atomic_thread_fence (MEMMODEL_ACQUIRE);
++ for (i = doacross->ncounts; i-- > 0; )
++ {
++ if (counts[i] + 1UL != __atomic_load_n (&array[i], MEMMODEL_RELAXED))
++ __atomic_store_n (&array[i], counts[i] + 1UL, MEMMODEL_RELEASE);
++ }
++}
++
++/* DOACROSS WAIT operation. */
++
++void
++GOMP_doacross_wait (long first, ...)
++{
++ struct gomp_thread *thr = gomp_thread ();
++ struct gomp_work_share *ws = thr->ts.work_share;
++ struct gomp_doacross_work_share *doacross = ws->doacross;
++ va_list ap;
++ unsigned long ent;
++ unsigned int i;
++
++ if (__builtin_expect (doacross == NULL, 0))
++ {
++ __sync_synchronize ();
++ return;
++ }
++
++ if (__builtin_expect (ws->sched == GFS_STATIC, 1))
++ {
++ if (ws->chunk_size == 0)
++ {
++ if (first < doacross->boundary)
++ ent = first / (doacross->q + 1);
++ else
++ ent = (first - doacross->boundary) / doacross->q
++ + doacross->t;
++ }
++ else
++ ent = first / ws->chunk_size % thr->ts.team->nthreads;
++ }
++ else if (ws->sched == GFS_GUIDED)
++ ent = first;
++ else
++ ent = first / doacross->chunk_size;
++ unsigned long *array = (unsigned long *) (doacross->array
++ + ent * doacross->elt_sz);
++
++ if (__builtin_expect (doacross->flattened, 1))
++ {
++ unsigned long flattened
++ = (unsigned long) first << doacross->shift_counts[0];
++ unsigned long cur;
++
++ va_start (ap, first);
++ for (i = 1; i < doacross->ncounts; i++)
++ flattened |= (unsigned long) va_arg (ap, long)
++ << doacross->shift_counts[i];
++ cur = __atomic_load_n (array, MEMMODEL_ACQUIRE);
++ if (flattened < cur)
++ {
++ __atomic_thread_fence (MEMMODEL_RELEASE);
++ va_end (ap);
++ return;
++ }
++ doacross_spin (array, flattened, cur);
++ __atomic_thread_fence (MEMMODEL_RELEASE);
++ va_end (ap);
++ return;
++ }
++
++ do
++ {
++ va_start (ap, first);
++ for (i = 0; i < doacross->ncounts; i++)
++ {
++ unsigned long thisv
++ = (unsigned long) (i ? va_arg (ap, long) : first) + 1;
++ unsigned long cur = __atomic_load_n (&array[i], MEMMODEL_RELAXED);
++ if (thisv < cur)
++ {
++ i = doacross->ncounts;
++ break;
++ }
++ if (thisv > cur)
++ break;
++ }
++ va_end (ap);
++ if (i == doacross->ncounts)
++ break;
++ cpu_relax ();
++ }
++ while (1);
++ __sync_synchronize ();
++}
++
++typedef unsigned long long gomp_ull;
++
++void
++gomp_doacross_ull_init (unsigned ncounts, gomp_ull *counts, gomp_ull chunk_size)
++{
++ struct gomp_thread *thr = gomp_thread ();
++ struct gomp_team *team = thr->ts.team;
++ struct gomp_work_share *ws = thr->ts.work_share;
++ unsigned int i, bits[MAX_COLLAPSED_BITS], num_bits = 0;
++ unsigned long ent, num_ents, elt_sz, shift_sz;
++ struct gomp_doacross_work_share *doacross;
++
++ if (team == NULL || team->nthreads == 1)
++ return;
++
++ for (i = 0; i < ncounts; i++)
++ {
++ /* If any count is 0, GOMP_doacross_{post,wait} can't be called. */
++ if (counts[i] == 0)
++ return;
++
++ if (num_bits <= MAX_COLLAPSED_BITS)
++ {
++ unsigned int this_bits;
++ if (counts[i] == 1)
++ this_bits = 1;
++ else
++ this_bits = __SIZEOF_LONG_LONG__ * __CHAR_BIT__
++ - __builtin_clzll (counts[i] - 1);
++ if (num_bits + this_bits <= MAX_COLLAPSED_BITS)
++ {
++ bits[i] = this_bits;
++ num_bits += this_bits;
++ }
++ else
++ num_bits = MAX_COLLAPSED_BITS + 1;
++ }
++ }
++
++ if (ws->sched == GFS_STATIC)
++ num_ents = team->nthreads;
++ else if (ws->sched == GFS_GUIDED)
++ num_ents = counts[0];
++ else
++ num_ents = (counts[0] - 1) / chunk_size + 1;
++ if (num_bits <= MAX_COLLAPSED_BITS)
++ {
++ elt_sz = sizeof (unsigned long);
++ shift_sz = ncounts * sizeof (unsigned int);
++ }
++ else
++ {
++ if (sizeof (gomp_ull) == sizeof (unsigned long))
++ elt_sz = sizeof (gomp_ull) * ncounts;
++ else if (sizeof (gomp_ull) == 2 * sizeof (unsigned long))
++ elt_sz = sizeof (unsigned long) * 2 * ncounts;
++ else
++ abort ();
++ shift_sz = 0;
++ }
++ elt_sz = (elt_sz + 63) & ~63UL;
++
++ doacross = gomp_malloc (sizeof (*doacross) + 63 + num_ents * elt_sz
++ + shift_sz);
++ doacross->chunk_size_ull = chunk_size;
++ doacross->elt_sz = elt_sz;
++ doacross->ncounts = ncounts;
++ doacross->flattened = false;
++ doacross->boundary = 0;
++ doacross->array = (unsigned char *)
++ ((((uintptr_t) (doacross + 1)) + 63 + shift_sz)
++ & ~(uintptr_t) 63);
++ if (num_bits <= MAX_COLLAPSED_BITS)
++ {
++ unsigned int shift_count = 0;
++ doacross->flattened = true;
++ for (i = ncounts; i > 0; i--)
++ {
++ doacross->shift_counts[i - 1] = shift_count;
++ shift_count += bits[i - 1];
++ }
++ for (ent = 0; ent < num_ents; ent++)
++ *(unsigned long *) (doacross->array + ent * elt_sz) = 0;
++ }
++ else
++ for (ent = 0; ent < num_ents; ent++)
++ memset (doacross->array + ent * elt_sz, '\0',
++ sizeof (unsigned long) * ncounts);
++ if (ws->sched == GFS_STATIC && chunk_size == 0)
++ {
++ gomp_ull q = counts[0] / num_ents;
++ gomp_ull t = counts[0] % num_ents;
++ doacross->boundary_ull = t * (q + 1);
++ doacross->q_ull = q;
++ doacross->t = t;
++ }
++ ws->doacross = doacross;
++}
++
++/* DOACROSS POST operation. */
++
++void
++GOMP_doacross_ull_post (gomp_ull *counts)
++{
++ struct gomp_thread *thr = gomp_thread ();
++ struct gomp_work_share *ws = thr->ts.work_share;
++ struct gomp_doacross_work_share *doacross = ws->doacross;
++ unsigned long ent;
++ unsigned int i;
++
++ if (__builtin_expect (doacross == NULL, 0))
++ {
++ __sync_synchronize ();
++ return;
++ }
++
++ if (__builtin_expect (ws->sched == GFS_STATIC, 1))
++ ent = thr->ts.team_id;
++ else if (ws->sched == GFS_GUIDED)
++ ent = counts[0];
++ else
++ ent = counts[0] / doacross->chunk_size_ull;
++
++ if (__builtin_expect (doacross->flattened, 1))
++ {
++ unsigned long *array = (unsigned long *) (doacross->array
++ + ent * doacross->elt_sz);
++ gomp_ull flattened
++ = counts[0] << doacross->shift_counts[0];
++
++ for (i = 1; i < doacross->ncounts; i++)
++ flattened |= counts[i] << doacross->shift_counts[i];
++ flattened++;
++ if (flattened == __atomic_load_n (array, MEMMODEL_ACQUIRE))
++ __atomic_thread_fence (MEMMODEL_RELEASE);
++ else
++ __atomic_store_n (array, flattened, MEMMODEL_RELEASE);
++ return;
++ }
++
++ __atomic_thread_fence (MEMMODEL_ACQUIRE);
++ if (sizeof (gomp_ull) == sizeof (unsigned long))
++ {
++ gomp_ull *array = (gomp_ull *) (doacross->array
++ + ent * doacross->elt_sz);
++
++ for (i = doacross->ncounts; i-- > 0; )
++ {
++ if (counts[i] + 1UL != __atomic_load_n (&array[i], MEMMODEL_RELAXED))
++ __atomic_store_n (&array[i], counts[i] + 1UL, MEMMODEL_RELEASE);
++ }
++ }
++ else
++ {
++ unsigned long *array = (unsigned long *) (doacross->array
++ + ent * doacross->elt_sz);
++
++ for (i = doacross->ncounts; i-- > 0; )
++ {
++ gomp_ull cull = counts[i] + 1UL;
++ unsigned long c = (unsigned long) cull;
++ if (c != __atomic_load_n (&array[2 * i + 1], MEMMODEL_RELAXED))
++ __atomic_store_n (&array[2 * i + 1], c, MEMMODEL_RELEASE);
++ c = cull >> (__SIZEOF_LONG_LONG__ * __CHAR_BIT__ / 2);
++ if (c != __atomic_load_n (&array[2 * i], MEMMODEL_RELAXED))
++ __atomic_store_n (&array[2 * i], c, MEMMODEL_RELEASE);
++ }
++ }
++}
++
++/* DOACROSS WAIT operation. */
++
++void
++GOMP_doacross_ull_wait (gomp_ull first, ...)
++{
++ struct gomp_thread *thr = gomp_thread ();
++ struct gomp_work_share *ws = thr->ts.work_share;
++ struct gomp_doacross_work_share *doacross = ws->doacross;
++ va_list ap;
++ unsigned long ent;
++ unsigned int i;
++
++ if (__builtin_expect (doacross == NULL, 0))
++ {
++ __sync_synchronize ();
++ return;
++ }
++
++ if (__builtin_expect (ws->sched == GFS_STATIC, 1))
++ {
++ if (ws->chunk_size_ull == 0)
++ {
++ if (first < doacross->boundary_ull)
++ ent = first / (doacross->q_ull + 1);
++ else
++ ent = (first - doacross->boundary_ull) / doacross->q_ull
++ + doacross->t;
++ }
++ else
++ ent = first / ws->chunk_size_ull % thr->ts.team->nthreads;
++ }
++ else if (ws->sched == GFS_GUIDED)
++ ent = first;
++ else
++ ent = first / doacross->chunk_size_ull;
++
++ if (__builtin_expect (doacross->flattened, 1))
++ {
++ unsigned long *array = (unsigned long *) (doacross->array
++ + ent * doacross->elt_sz);
++ gomp_ull flattened = first << doacross->shift_counts[0];
++ unsigned long cur;
++
++ va_start (ap, first);
++ for (i = 1; i < doacross->ncounts; i++)
++ flattened |= va_arg (ap, gomp_ull)
++ << doacross->shift_counts[i];
++ cur = __atomic_load_n (array, MEMMODEL_ACQUIRE);
++ if (flattened < cur)
++ {
++ __atomic_thread_fence (MEMMODEL_RELEASE);
++ va_end (ap);
++ return;
++ }
++ doacross_spin (array, flattened, cur);
++ __atomic_thread_fence (MEMMODEL_RELEASE);
++ va_end (ap);
++ return;
++ }
++
++ if (sizeof (gomp_ull) == sizeof (unsigned long))
++ {
++ gomp_ull *array = (gomp_ull *) (doacross->array
++ + ent * doacross->elt_sz);
++ do
++ {
++ va_start (ap, first);
++ for (i = 0; i < doacross->ncounts; i++)
++ {
++ gomp_ull thisv
++ = (i ? va_arg (ap, gomp_ull) : first) + 1;
++ gomp_ull cur = __atomic_load_n (&array[i], MEMMODEL_RELAXED);
++ if (thisv < cur)
++ {
++ i = doacross->ncounts;
++ break;
++ }
++ if (thisv > cur)
++ break;
++ }
++ va_end (ap);
++ if (i == doacross->ncounts)
++ break;
++ cpu_relax ();
++ }
++ while (1);
++ }
++ else
++ {
++ unsigned long *array = (unsigned long *) (doacross->array
++ + ent * doacross->elt_sz);
++ do
++ {
++ va_start (ap, first);
++ for (i = 0; i < doacross->ncounts; i++)
++ {
++ gomp_ull thisv
++ = (i ? va_arg (ap, gomp_ull) : first) + 1;
++ unsigned long t
++ = thisv >> (__SIZEOF_LONG_LONG__ * __CHAR_BIT__ / 2);
++ unsigned long cur
++ = __atomic_load_n (&array[2 * i], MEMMODEL_RELAXED);
++ if (t < cur)
++ {
++ i = doacross->ncounts;
++ break;
++ }
++ if (t > cur)
++ break;
++ t = thisv;
++ cur = __atomic_load_n (&array[2 * i + 1], MEMMODEL_RELAXED);
++ if (t < cur)
++ {
++ i = doacross->ncounts;
++ break;
++ }
++ if (t > cur)
++ break;
++ }
++ va_end (ap);
++ if (i == doacross->ncounts)
++ break;
++ cpu_relax ();
++ }
++ while (1);
++ }
++ __sync_synchronize ();
++}
+--- libgomp/loop.c.jj 2014-05-15 10:56:36.487505570 +0200
++++ libgomp/loop.c 2016-07-13 16:57:13.488423109 +0200
+@@ -110,6 +110,11 @@ gomp_loop_static_start (long start, long
+ return !gomp_iter_static_next (istart, iend);
+ }
+
++/* The current dynamic implementation is always monotonic. The
++ entrypoints without nonmonotonic in them have to be always monotonic,
++ but the nonmonotonic ones could be changed to use work-stealing for
++ improved scalability. */
++
+ static bool
+ gomp_loop_dynamic_start (long start, long end, long incr, long chunk_size,
+ long *istart, long *iend)
+@@ -135,6 +140,9 @@ gomp_loop_dynamic_start (long start, lon
+ return ret;
+ }
+
++/* Similarly as for dynamic, though the question is how can the chunk sizes
++ be decreased without a central locking or atomics. */
++
+ static bool
+ gomp_loop_guided_start (long start, long end, long incr, long chunk_size,
+ long *istart, long *iend)
+@@ -168,13 +176,16 @@ GOMP_loop_runtime_start (long start, lon
+ switch (icv->run_sched_var)
+ {
+ case GFS_STATIC:
+- return gomp_loop_static_start (start, end, incr, icv->run_sched_modifier,
++ return gomp_loop_static_start (start, end, incr,
++ icv->run_sched_chunk_size,
+ istart, iend);
+ case GFS_DYNAMIC:
+- return gomp_loop_dynamic_start (start, end, incr, icv->run_sched_modifier,
++ return gomp_loop_dynamic_start (start, end, incr,
++ icv->run_sched_chunk_size,
+ istart, iend);
+ case GFS_GUIDED:
+- return gomp_loop_guided_start (start, end, incr, icv->run_sched_modifier,
++ return gomp_loop_guided_start (start, end, incr,
++ icv->run_sched_chunk_size,
+ istart, iend);
+ case GFS_AUTO:
+ /* For now map to schedule(static), later on we could play with feedback
+@@ -265,15 +276,15 @@ GOMP_loop_ordered_runtime_start (long st
+ {
+ case GFS_STATIC:
+ return gomp_loop_ordered_static_start (start, end, incr,
+- icv->run_sched_modifier,
++ icv->run_sched_chunk_size,
+ istart, iend);
+ case GFS_DYNAMIC:
+ return gomp_loop_ordered_dynamic_start (start, end, incr,
+- icv->run_sched_modifier,
++ icv->run_sched_chunk_size,
+ istart, iend);
+ case GFS_GUIDED:
+ return gomp_loop_ordered_guided_start (start, end, incr,
+- icv->run_sched_modifier,
++ icv->run_sched_chunk_size,
+ istart, iend);
+ case GFS_AUTO:
+ /* For now map to schedule(static), later on we could play with feedback
+@@ -285,6 +296,111 @@ GOMP_loop_ordered_runtime_start (long st
+ }
+ }
+
++/* The *_doacross_*_start routines are similar. The only difference is that
++ this work-share construct is initialized to expect an ORDERED(N) - DOACROSS
++ section, and the worksharing loop always iterates from 0 to COUNTS[0] - 1,
++ while the other COUNTS array elements tell the library the number of
++ iterations in the ordered inner loops. */
++
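To make the COUNTS contract concrete: for a loop nest annotated with ordered(2), the compiler passes ncounts = 2, counts[0] holding the outer trip count that the work-share actually distributes and counts[1] the inner trip count used only for doacross bookkeeping. A hypothetical hand-written call, standing in for what the compiler emits:

#include <stdbool.h>

/* Entry point added by this patch; normally called from compiler output.  */
extern bool GOMP_loop_doacross_static_start (unsigned, long *, long,
					     long *, long *);

void
run_outer_chunk (void)
{
  long counts[2] = { 100, 50 };	/* 100 x 50 ordered(2) iteration space */
  long istart, iend;
  if (GOMP_loop_doacross_static_start (2, counts, 0, &istart, &iend))
    {
      /* This thread owns outer iterations [istart, iend); cross-iteration
	 dependences inside are enforced via GOMP_doacross_post/_wait.  */
    }
}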
++static bool
++gomp_loop_doacross_static_start (unsigned ncounts, long *counts,
++ long chunk_size, long *istart, long *iend)
++{
++ struct gomp_thread *thr = gomp_thread ();
++
++ thr->ts.static_trip = 0;
++ if (gomp_work_share_start (false))
++ {
++ gomp_loop_init (thr->ts.work_share, 0, counts[0], 1,
++ GFS_STATIC, chunk_size);
++ gomp_doacross_init (ncounts, counts, chunk_size);
++ gomp_work_share_init_done ();
++ }
++
++ return !gomp_iter_static_next (istart, iend);
++}
++
++static bool
++gomp_loop_doacross_dynamic_start (unsigned ncounts, long *counts,
++ long chunk_size, long *istart, long *iend)
++{
++ struct gomp_thread *thr = gomp_thread ();
++ bool ret;
++
++ if (gomp_work_share_start (false))
++ {
++ gomp_loop_init (thr->ts.work_share, 0, counts[0], 1,
++ GFS_DYNAMIC, chunk_size);
++ gomp_doacross_init (ncounts, counts, chunk_size);
++ gomp_work_share_init_done ();
++ }
++
++#ifdef HAVE_SYNC_BUILTINS
++ ret = gomp_iter_dynamic_next (istart, iend);
++#else
++ gomp_mutex_lock (&thr->ts.work_share->lock);
++ ret = gomp_iter_dynamic_next_locked (istart, iend);
++ gomp_mutex_unlock (&thr->ts.work_share->lock);
++#endif
++
++ return ret;
++}
++
++static bool
++gomp_loop_doacross_guided_start (unsigned ncounts, long *counts,
++ long chunk_size, long *istart, long *iend)
++{
++ struct gomp_thread *thr = gomp_thread ();
++ bool ret;
++
++ if (gomp_work_share_start (false))
++ {
++ gomp_loop_init (thr->ts.work_share, 0, counts[0], 1,
++ GFS_GUIDED, chunk_size);
++ gomp_doacross_init (ncounts, counts, chunk_size);
++ gomp_work_share_init_done ();
++ }
++
++#ifdef HAVE_SYNC_BUILTINS
++ ret = gomp_iter_guided_next (istart, iend);
++#else
++ gomp_mutex_lock (&thr->ts.work_share->lock);
++ ret = gomp_iter_guided_next_locked (istart, iend);
++ gomp_mutex_unlock (&thr->ts.work_share->lock);
++#endif
++
++ return ret;
++}
++
++bool
++GOMP_loop_doacross_runtime_start (unsigned ncounts, long *counts,
++ long *istart, long *iend)
++{
++ struct gomp_task_icv *icv = gomp_icv (false);
++ switch (icv->run_sched_var)
++ {
++ case GFS_STATIC:
++ return gomp_loop_doacross_static_start (ncounts, counts,
++ icv->run_sched_chunk_size,
++ istart, iend);
++ case GFS_DYNAMIC:
++ return gomp_loop_doacross_dynamic_start (ncounts, counts,
++ icv->run_sched_chunk_size,
++ istart, iend);
++ case GFS_GUIDED:
++ return gomp_loop_doacross_guided_start (ncounts, counts,
++ icv->run_sched_chunk_size,
++ istart, iend);
++ case GFS_AUTO:
++ /* For now map to schedule(static), later on we could play with feedback
++ driven choice. */
++ return gomp_loop_doacross_static_start (ncounts, counts,
++ 0, istart, iend);
++ default:
++ abort ();
++ }
++}
++
+ /* The *_next routines are called when the thread completes processing of
+ the iteration block currently assigned to it. If the work-share
+ construct is bound directly to a parallel construct, then the iteration
+@@ -483,7 +599,7 @@ GOMP_parallel_loop_runtime_start (void (
+ {
+ struct gomp_task_icv *icv = gomp_icv (false);
+ gomp_parallel_loop_start (fn, data, num_threads, start, end, incr,
+- icv->run_sched_var, icv->run_sched_modifier, 0);
++ icv->run_sched_var, icv->run_sched_chunk_size, 0);
+ }
+
+ ialias_redirect (GOMP_parallel_end)
+@@ -521,6 +637,37 @@ GOMP_parallel_loop_guided (void (*fn) (v
+ GOMP_parallel_end ();
+ }
+
++#ifdef HAVE_ATTRIBUTE_ALIAS
++extern __typeof(GOMP_parallel_loop_dynamic) GOMP_parallel_loop_nonmonotonic_dynamic
++ __attribute__((alias ("GOMP_parallel_loop_dynamic")));
++extern __typeof(GOMP_parallel_loop_guided) GOMP_parallel_loop_nonmonotonic_guided
++ __attribute__((alias ("GOMP_parallel_loop_guided")));
++#else
++void
++GOMP_parallel_loop_nonmonotonic_dynamic (void (*fn) (void *), void *data,
++ unsigned num_threads, long start,
++ long end, long incr, long chunk_size,
++ unsigned flags)
++{
++ gomp_parallel_loop_start (fn, data, num_threads, start, end, incr,
++ GFS_DYNAMIC, chunk_size, flags);
++ fn (data);
++ GOMP_parallel_end ();
++}
++
++void
++GOMP_parallel_loop_nonmonotonic_guided (void (*fn) (void *), void *data,
++ unsigned num_threads, long start,
++ long end, long incr, long chunk_size,
++ unsigned flags)
++{
++ gomp_parallel_loop_start (fn, data, num_threads, start, end, incr,
++ GFS_GUIDED, chunk_size, flags);
++ fn (data);
++ GOMP_parallel_end ();
++}
++#endif
++
+ void
+ GOMP_parallel_loop_runtime (void (*fn) (void *), void *data,
+ unsigned num_threads, long start, long end,
+@@ -528,7 +675,7 @@ GOMP_parallel_loop_runtime (void (*fn) (
+ {
+ struct gomp_task_icv *icv = gomp_icv (false);
+ gomp_parallel_loop_start (fn, data, num_threads, start, end, incr,
+- icv->run_sched_var, icv->run_sched_modifier,
++ icv->run_sched_var, icv->run_sched_chunk_size,
+ flags);
+ fn (data);
+ GOMP_parallel_end ();
+@@ -569,6 +716,10 @@ extern __typeof(gomp_loop_dynamic_start)
+ __attribute__((alias ("gomp_loop_dynamic_start")));
+ extern __typeof(gomp_loop_guided_start) GOMP_loop_guided_start
+ __attribute__((alias ("gomp_loop_guided_start")));
++extern __typeof(gomp_loop_dynamic_start) GOMP_loop_nonmonotonic_dynamic_start
++ __attribute__((alias ("gomp_loop_dynamic_start")));
++extern __typeof(gomp_loop_guided_start) GOMP_loop_nonmonotonic_guided_start
++ __attribute__((alias ("gomp_loop_guided_start")));
+
+ extern __typeof(gomp_loop_ordered_static_start) GOMP_loop_ordered_static_start
+ __attribute__((alias ("gomp_loop_ordered_static_start")));
+@@ -577,12 +728,23 @@ extern __typeof(gomp_loop_ordered_dynami
+ extern __typeof(gomp_loop_ordered_guided_start) GOMP_loop_ordered_guided_start
+ __attribute__((alias ("gomp_loop_ordered_guided_start")));
+
++extern __typeof(gomp_loop_doacross_static_start) GOMP_loop_doacross_static_start
++ __attribute__((alias ("gomp_loop_doacross_static_start")));
++extern __typeof(gomp_loop_doacross_dynamic_start) GOMP_loop_doacross_dynamic_start
++ __attribute__((alias ("gomp_loop_doacross_dynamic_start")));
++extern __typeof(gomp_loop_doacross_guided_start) GOMP_loop_doacross_guided_start
++ __attribute__((alias ("gomp_loop_doacross_guided_start")));
++
+ extern __typeof(gomp_loop_static_next) GOMP_loop_static_next
+ __attribute__((alias ("gomp_loop_static_next")));
+ extern __typeof(gomp_loop_dynamic_next) GOMP_loop_dynamic_next
+ __attribute__((alias ("gomp_loop_dynamic_next")));
+ extern __typeof(gomp_loop_guided_next) GOMP_loop_guided_next
+ __attribute__((alias ("gomp_loop_guided_next")));
++extern __typeof(gomp_loop_dynamic_next) GOMP_loop_nonmonotonic_dynamic_next
++ __attribute__((alias ("gomp_loop_dynamic_next")));
++extern __typeof(gomp_loop_guided_next) GOMP_loop_nonmonotonic_guided_next
++ __attribute__((alias ("gomp_loop_guided_next")));
+
+ extern __typeof(gomp_loop_ordered_static_next) GOMP_loop_ordered_static_next
+ __attribute__((alias ("gomp_loop_ordered_static_next")));
+@@ -613,6 +775,21 @@ GOMP_loop_guided_start (long start, long
+ }
+
+ bool
++GOMP_loop_nonmonotonic_dynamic_start (long start, long end, long incr,
++ long chunk_size, long *istart,
++ long *iend)
++{
++ return gomp_loop_dynamic_start (start, end, incr, chunk_size, istart, iend);
++}
++
++bool
++GOMP_loop_nonmonotonic_guided_start (long start, long end, long incr,
++ long chunk_size, long *istart, long *iend)
++{
++ return gomp_loop_guided_start (start, end, incr, chunk_size, istart, iend);
++}
++
++bool
+ GOMP_loop_ordered_static_start (long start, long end, long incr,
+ long chunk_size, long *istart, long *iend)
+ {
+@@ -637,6 +814,30 @@ GOMP_loop_ordered_guided_start (long sta
+ }
+
+ bool
++GOMP_loop_doacross_static_start (unsigned ncounts, long *counts,
++ long chunk_size, long *istart, long *iend)
++{
++ return gomp_loop_doacross_static_start (ncounts, counts, chunk_size,
++ istart, iend);
++}
++
++bool
++GOMP_loop_doacross_dynamic_start (unsigned ncounts, long *counts,
++ long chunk_size, long *istart, long *iend)
++{
++ return gomp_loop_doacross_dynamic_start (ncounts, counts, chunk_size,
++ istart, iend);
++}
++
++bool
++GOMP_loop_doacross_guided_start (unsigned ncounts, long *counts,
++ long chunk_size, long *istart, long *iend)
++{
++ return gomp_loop_doacross_guided_start (ncounts, counts, chunk_size,
++ istart, iend);
++}
++
++bool
+ GOMP_loop_static_next (long *istart, long *iend)
+ {
+ return gomp_loop_static_next (istart, iend);
+@@ -653,6 +854,18 @@ GOMP_loop_guided_next (long *istart, lon
+ {
+ return gomp_loop_guided_next (istart, iend);
+ }
++
++bool
++GOMP_loop_nonmonotonic_dynamic_next (long *istart, long *iend)
++{
++ return gomp_loop_dynamic_next (istart, iend);
++}
++
++bool
++GOMP_loop_nonmonotonic_guided_next (long *istart, long *iend)
++{
++ return gomp_loop_guided_next (istart, iend);
++}
+
+ bool
+ GOMP_loop_ordered_static_next (long *istart, long *iend)
+--- libgomp/error.c.jj 2013-01-21 16:00:31.834953566 +0100
++++ libgomp/error.c 2016-07-13 16:57:04.437535335 +0200
+@@ -35,7 +35,26 @@
+ #include <stdlib.h>
+
+
+-static void
++#undef gomp_vdebug
++void
++gomp_vdebug (int kind __attribute__ ((unused)), const char *msg, va_list list)
++{
++ if (gomp_debug_var)
++ vfprintf (stderr, msg, list);
++}
++
++#undef gomp_debug
++void
++gomp_debug (int kind, const char *msg, ...)
++{
++ va_list list;
++
++ va_start (list, msg);
++ gomp_vdebug (kind, msg, list);
++ va_end (list);
++}
++
++void
+ gomp_verror (const char *fmt, va_list list)
+ {
+ fputs ("\nlibgomp: ", stderr);
+@@ -54,13 +73,18 @@ gomp_error (const char *fmt, ...)
+ }
+
+ void
++gomp_vfatal (const char *fmt, va_list list)
++{
++ gomp_verror (fmt, list);
++ exit (EXIT_FAILURE);
++}
++
++void
+ gomp_fatal (const char *fmt, ...)
+ {
+ va_list list;
+
+ va_start (list, fmt);
+- gomp_verror (fmt, list);
++ gomp_vfatal (fmt, list);
+ va_end (list);
+-
+- exit (EXIT_FAILURE);
+ }
+--- libgomp/Makefile.am.jj 2014-05-15 11:12:10.000000000 +0200
++++ libgomp/Makefile.am 2016-07-14 16:10:51.968202878 +0200
+@@ -60,7 +60,13 @@ libgomp_la_LINK = $(LINK) $(libgomp_la_L
+ libgomp_la_SOURCES = alloc.c barrier.c critical.c env.c error.c iter.c \
+ iter_ull.c loop.c loop_ull.c ordered.c parallel.c sections.c single.c \
+ task.c team.c work.c lock.c mutex.c proc.c sem.c bar.c ptrlock.c \
+- time.c fortran.c affinity.c target.c
++ time.c fortran.c affinity.c target.c splay-tree.c libgomp-plugin.c \
++ oacc-parallel.c oacc-host.c oacc-init.c oacc-mem.c oacc-async.c \
++ oacc-plugin.c oacc-cuda.c priority_queue.c
++
++if USE_FORTRAN
++libgomp_la_SOURCES += openacc.f90
++endif
+
+ nodist_noinst_HEADERS = libgomp_f.h
+ nodist_libsubinclude_HEADERS = omp.h
+--- libgomp/Makefile.in.jj 2014-05-15 11:12:10.000000000 +0200
++++ libgomp/Makefile.in 2016-07-14 16:11:10.981954087 +0200
+@@ -36,6 +36,7 @@ POST_UNINSTALL = :
+ build_triplet = @build@
+ host_triplet = @host@
+ target_triplet = @target@
++@USE_FORTRAN_TRUE@am__append_1 = openacc.f90
+ subdir = .
+ DIST_COMMON = ChangeLog $(srcdir)/Makefile.in $(srcdir)/Makefile.am \
+ $(top_srcdir)/configure $(am__configure_deps) \
+@@ -92,11 +93,15 @@ am__installdirs = "$(DESTDIR)$(toolexecl
+ "$(DESTDIR)$(toolexeclibdir)"
+ LTLIBRARIES = $(toolexeclib_LTLIBRARIES)
+ libgomp_la_LIBADD =
++@USE_FORTRAN_TRUE@am__objects_1 = openacc.lo
+ am_libgomp_la_OBJECTS = alloc.lo barrier.lo critical.lo env.lo \
+ error.lo iter.lo iter_ull.lo loop.lo loop_ull.lo ordered.lo \
+ parallel.lo sections.lo single.lo task.lo team.lo work.lo \
+ lock.lo mutex.lo proc.lo sem.lo bar.lo ptrlock.lo time.lo \
+- fortran.lo affinity.lo target.lo
++ fortran.lo affinity.lo target.lo splay-tree.lo \
++ libgomp-plugin.lo oacc-parallel.lo oacc-host.lo oacc-init.lo \
++ oacc-mem.lo oacc-async.lo oacc-plugin.lo oacc-cuda.lo \
++ priority_queue.lo $(am__objects_1)
+ libgomp_la_OBJECTS = $(am_libgomp_la_OBJECTS)
+ DEFAULT_INCLUDES = -I.@am__isrc@
+ depcomp = $(SHELL) $(top_srcdir)/../depcomp
+@@ -108,6 +113,13 @@ LTCOMPILE = $(LIBTOOL) --tag=CC $(AM_LIB
+ --mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) \
+ $(AM_CPPFLAGS) $(CPPFLAGS) $(AM_CFLAGS) $(CFLAGS)
+ CCLD = $(CC)
++FCCOMPILE = $(FC) $(AM_FCFLAGS) $(FCFLAGS)
++LTFCCOMPILE = $(LIBTOOL) --tag=FC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) \
++ --mode=compile $(FC) $(AM_FCFLAGS) $(FCFLAGS)
++FCLD = $(FC)
++FCLINK = $(LIBTOOL) --tag=FC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) \
++ --mode=link $(FCLD) $(AM_FCFLAGS) $(FCFLAGS) $(AM_LDFLAGS) \
++ $(LDFLAGS) -o $@
+ SOURCES = $(libgomp_la_SOURCES)
+ MULTISRCTOP =
+ MULTIBUILDTOP =
+@@ -315,10 +327,12 @@ libgomp_la_LDFLAGS = $(libgomp_version_i
+ libgomp_la_DEPENDENCIES = $(libgomp_version_dep)
+ libgomp_la_LINK = $(LINK) $(libgomp_la_LDFLAGS)
+ libgomp_la_SOURCES = alloc.c barrier.c critical.c env.c error.c iter.c \
+- iter_ull.c loop.c loop_ull.c ordered.c parallel.c sections.c single.c \
+- task.c team.c work.c lock.c mutex.c proc.c sem.c bar.c ptrlock.c \
+- time.c fortran.c affinity.c target.c
+-
++ iter_ull.c loop.c loop_ull.c ordered.c parallel.c sections.c \
++ single.c task.c team.c work.c lock.c mutex.c proc.c sem.c \
++ bar.c ptrlock.c time.c fortran.c affinity.c target.c \
++ splay-tree.c libgomp-plugin.c oacc-parallel.c oacc-host.c \
++ oacc-init.c oacc-mem.c oacc-async.c oacc-plugin.c oacc-cuda.c \
++ priority_queue.c $(am__append_1)
+ nodist_noinst_HEADERS = libgomp_f.h
+ nodist_libsubinclude_HEADERS = omp.h
+ @USE_FORTRAN_TRUE@nodist_finclude_HEADERS = omp_lib.h omp_lib.f90 omp_lib.mod omp_lib_kinds.mod
+@@ -351,7 +365,7 @@ all: config.h
+ $(MAKE) $(AM_MAKEFLAGS) all-recursive
+
+ .SUFFIXES:
+-.SUFFIXES: .c .dvi .lo .o .obj .ps
++.SUFFIXES: .c .dvi .f90 .lo .o .obj .ps
+ am--refresh:
+ @:
+ $(srcdir)/Makefile.in: @MAINTAINER_MODE_TRUE@ $(srcdir)/Makefile.am $(am__configure_deps)
+@@ -463,17 +477,27 @@ distclean-compile:
+ @AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/fortran.Plo@am__quote@
+ @AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/iter.Plo@am__quote@
+ @AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/iter_ull.Plo@am__quote@
++@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/libgomp-plugin.Plo@am__quote@
+ @AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/lock.Plo@am__quote@
+ @AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/loop.Plo@am__quote@
+ @AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/loop_ull.Plo@am__quote@
+ @AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/mutex.Plo@am__quote@
++@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/oacc-async.Plo@am__quote@
++@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/oacc-cuda.Plo@am__quote@
++@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/oacc-host.Plo@am__quote@
++@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/oacc-init.Plo@am__quote@
++@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/oacc-mem.Plo@am__quote@
++@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/oacc-parallel.Plo@am__quote@
++@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/oacc-plugin.Plo@am__quote@
+ @AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/ordered.Plo@am__quote@
+ @AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/parallel.Plo@am__quote@
++@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/priority_queue.Plo@am__quote@
+ @AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/proc.Plo@am__quote@
+ @AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/ptrlock.Plo@am__quote@
+ @AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/sections.Plo@am__quote@
+ @AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/sem.Plo@am__quote@
+ @AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/single.Plo@am__quote@
++@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/splay-tree.Plo@am__quote@
+ @AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/target.Plo@am__quote@
+ @AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/task.Plo@am__quote@
+ @AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/team.Plo@am__quote@
+@@ -501,6 +525,15 @@ distclean-compile:
+ @AMDEP_TRUE@@am__fastdepCC_FALSE@ DEPDIR=$(DEPDIR) $(CCDEPMODE) $(depcomp) @AMDEPBACKSLASH@
+ @am__fastdepCC_FALSE@ $(LTCOMPILE) -c -o $@ $<
+
++.f90.o:
++ $(FCCOMPILE) -c -o $@ $<
++
++.f90.obj:
++ $(FCCOMPILE) -c -o $@ `$(CYGPATH_W) '$<'`
++
++.f90.lo:
++ $(LTFCCOMPILE) -c -o $@ $<
++
+ mostlyclean-libtool:
+ -rm -f *.lo
+
+--- libgomp/task.c.jj 2014-08-06 16:25:16.575091658 +0200
++++ libgomp/task.c 2016-07-13 17:47:58.722758497 +0200
+@@ -28,6 +28,7 @@
+ #include "libgomp.h"
+ #include <stdlib.h>
+ #include <string.h>
++#include "gomp-constants.h"
+
+ typedef struct gomp_task_depend_entry *hash_entry_type;
+
+@@ -63,6 +64,14 @@ void
+ gomp_init_task (struct gomp_task *task, struct gomp_task *parent_task,
+ struct gomp_task_icv *prev_icv)
+ {
++ /* It would seem that using memset here would be a win, but it turns
++ out that partially filling gomp_task allows us to keep the
++ overhead of task creation low. In the nqueens-1.c test, for a
++ sufficiently large N, we drop the overhead from 5-6% to 1%.
++
++ Note, the nqueens-1.c test in serial mode is a good test to
++ benchmark the overhead of creating tasks as there are millions of
++ tiny tasks created that all run undeferred. */
+ task->parent = parent_task;
+ task->icv = *prev_icv;
+ task->kind = GOMP_TASK_IMPLICIT;
+@@ -71,7 +80,7 @@ gomp_init_task (struct gomp_task *task,
+ task->final_task = false;
+ task->copy_ctors_done = false;
+ task->parent_depends_on = false;
+- task->children = NULL;
++ priority_queue_init (&task->children_queue);
+ task->taskgroup = NULL;
+ task->dependers = NULL;
+ task->depend_hash = NULL;
+@@ -90,30 +99,194 @@ gomp_end_task (void)
+ thr->task = task->parent;
+ }
+
++/* Clear the parent field of every task in LIST. */
++
+ static inline void
+-gomp_clear_parent (struct gomp_task *children)
++gomp_clear_parent_in_list (struct priority_list *list)
+ {
+- struct gomp_task *task = children;
+-
+- if (task)
++ struct priority_node *p = list->tasks;
++ if (p)
+ do
+ {
+- task->parent = NULL;
+- task = task->next_child;
++ priority_node_to_task (PQ_CHILDREN, p)->parent = NULL;
++ p = p->next;
+ }
+- while (task != children);
++ while (p != list->tasks);
++}
++
++/* Splay tree version of gomp_clear_parent_in_list.
++
++ Clear the parent field of every task in NODE within SP, and free
++ the node when done. */
++
++static void
++gomp_clear_parent_in_tree (prio_splay_tree sp, prio_splay_tree_node node)
++{
++ if (!node)
++ return;
++ prio_splay_tree_node left = node->left, right = node->right;
++ gomp_clear_parent_in_list (&node->key.l);
++#if _LIBGOMP_CHECKING_
++ memset (node, 0xaf, sizeof (*node));
++#endif
++ /* No need to remove the node from the tree. We're nuking
++ everything, so just free the nodes and our caller can clear the
++ entire splay tree. */
++ free (node);
++ gomp_clear_parent_in_tree (sp, left);
++ gomp_clear_parent_in_tree (sp, right);
++}
++
++/* Clear the parent field of every task in Q and remove every task
++ from Q. */
++
++static inline void
++gomp_clear_parent (struct priority_queue *q)
++{
++ if (priority_queue_multi_p (q))
++ {
++ gomp_clear_parent_in_tree (&q->t, q->t.root);
++ /* All the nodes have been cleared in gomp_clear_parent_in_tree.
++ No need to remove anything. We can just nuke everything. */
++ q->t.root = NULL;
++ }
++ else
++ gomp_clear_parent_in_list (&q->l);
+ }
+
+-static void gomp_task_maybe_wait_for_dependencies (void **depend);
++/* Helper function for GOMP_task and gomp_create_target_task.
++
++ For a TASK with in/out dependencies, fill in the various dependency
++ queues. PARENT is the parent of said task. DEPEND is as in
++ GOMP_task. */
++
++static void
++gomp_task_handle_depend (struct gomp_task *task, struct gomp_task *parent,
++ void **depend)
++{
++ size_t ndepend = (uintptr_t) depend[0];
++ size_t nout = (uintptr_t) depend[1];
++ size_t i;
++ hash_entry_type ent;
++
++ task->depend_count = ndepend;
++ task->num_dependees = 0;
++ if (parent->depend_hash == NULL)
++ parent->depend_hash = htab_create (2 * ndepend > 12 ? 2 * ndepend : 12);
++ for (i = 0; i < ndepend; i++)
++ {
++ task->depend[i].addr = depend[2 + i];
++ task->depend[i].next = NULL;
++ task->depend[i].prev = NULL;
++ task->depend[i].task = task;
++ task->depend[i].is_in = i >= nout;
++ task->depend[i].redundant = false;
++ task->depend[i].redundant_out = false;
++
++ hash_entry_type *slot = htab_find_slot (&parent->depend_hash,
++ &task->depend[i], INSERT);
++ hash_entry_type out = NULL, last = NULL;
++ if (*slot)
++ {
++ /* If multiple depend clauses for the same task reference the same
++ address, all but the first one are redundant. As inout/out come
++ first, if any of them is inout/out, it will win, which is the
++ right semantics. */
++ if ((*slot)->task == task)
++ {
++ task->depend[i].redundant = true;
++ continue;
++ }
++ for (ent = *slot; ent; ent = ent->next)
++ {
++ if (ent->redundant_out)
++ break;
++
++ last = ent;
++
++ /* depend(in:...) doesn't depend on earlier depend(in:...). */
++ if (i >= nout && ent->is_in)
++ continue;
++
++ if (!ent->is_in)
++ out = ent;
++
++ struct gomp_task *tsk = ent->task;
++ if (tsk->dependers == NULL)
++ {
++ tsk->dependers
++ = gomp_malloc (sizeof (struct gomp_dependers_vec)
++ + 6 * sizeof (struct gomp_task *));
++ tsk->dependers->n_elem = 1;
++ tsk->dependers->allocated = 6;
++ tsk->dependers->elem[0] = task;
++ task->num_dependees++;
++ continue;
++ }
++ /* We already have some other dependency on tsk from earlier
++ depend clause. */
++ else if (tsk->dependers->n_elem
++ && (tsk->dependers->elem[tsk->dependers->n_elem - 1]
++ == task))
++ continue;
++ else if (tsk->dependers->n_elem == tsk->dependers->allocated)
++ {
++ tsk->dependers->allocated
++ = tsk->dependers->allocated * 2 + 2;
++ tsk->dependers
++ = gomp_realloc (tsk->dependers,
++ sizeof (struct gomp_dependers_vec)
++ + (tsk->dependers->allocated
++ * sizeof (struct gomp_task *)));
++ }
++ tsk->dependers->elem[tsk->dependers->n_elem++] = task;
++ task->num_dependees++;
++ }
++ task->depend[i].next = *slot;
++ (*slot)->prev = &task->depend[i];
++ }
++ *slot = &task->depend[i];
++
++ /* There is no need to store more than one depend({,in}out:) task per
++ address in the hash table chain for the purpose of creation of
++ deferred tasks, because each out depends on all earlier outs, thus it
++ is enough to record just the last depend({,in}out:). For depend(in:),
++ we need to keep all of the previous ones not terminated yet, because
++ a later depend({,in}out:) might need to depend on all of them. So, if
++ the new task's clause is depend({,in}out:), we know there is at most
++ one other depend({,in}out:) clause in the list (out). For
++ non-deferred tasks we want to see all outs, so they are moved to the
++ end of the chain, after first redundant_out entry all following
++ entries should be redundant_out. */
++ if (!task->depend[i].is_in && out)
++ {
++ if (out != last)
++ {
++ out->next->prev = out->prev;
++ out->prev->next = out->next;
++ out->next = last->next;
++ out->prev = last;
++ last->next = out;
++ if (out->next)
++ out->next->prev = out;
++ }
++ out->redundant_out = true;
++ }
++ }
++}
+
+ /* Called when encountering an explicit task directive. If IF_CLAUSE is
+ false, then we must not delay in executing the task. If UNTIED is true,
+- then the task may be executed by any member of the team. */
++ then the task may be executed by any member of the team.
++
++ DEPEND is an array containing:
++ depend[0]: number of depend elements.
++ depend[1]: number of depend elements of type "out".
++ depend[2..N+1]: address of [1..N]th depend element. */
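For example, a hand-written illustration of the layout described above for depend(out: x) depend(in: y, z); the variables and helper are invented for the sketch:

#include <stdint.h>

int x, y, z;

void
build_depend_vector (void **out)
{
  /* Out/inout entries precede in entries, matching depend[1] below.  */
  out[0] = (void *) (uintptr_t) 3;	/* total depend elements */
  out[1] = (void *) (uintptr_t) 1;	/* out/inout elements */
  out[2] = &x;				/* the out dependence */
  out[3] = &y;				/* first in dependence */
  out[4] = &z;				/* second in dependence */
}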
+
+ void
+ GOMP_task (void (*fn) (void *), void *data, void (*cpyfn) (void *, void *),
+ long arg_size, long arg_align, bool if_clause, unsigned flags,
+- void **depend)
++ void **depend, int priority)
+ {
+ struct gomp_thread *thr = gomp_thread ();
+ struct gomp_team *team = thr->ts.team;
+@@ -125,8 +298,7 @@ GOMP_task (void (*fn) (void *), void *da
+ might be running on different thread than FN. */
+ if (cpyfn)
+ if_clause = false;
+- if (flags & 1)
+- flags &= ~1;
++ flags &= ~GOMP_TASK_FLAG_UNTIED;
+ #endif
+
+ /* If parallel or taskgroup has been cancelled, don't start new tasks. */
+@@ -135,6 +307,11 @@ GOMP_task (void (*fn) (void *), void *da
+ || (thr->task->taskgroup && thr->task->taskgroup->cancelled)))
+ return;
+
++ if ((flags & GOMP_TASK_FLAG_PRIORITY) == 0)
++ priority = 0;
++ else if (priority > gomp_max_task_priority_var)
++ priority = gomp_max_task_priority_var;
++
+ if (!if_clause || team == NULL
+ || (thr->task && thr->task->final_task)
+ || team->task_count > 64 * team->nthreads)
+@@ -147,12 +324,15 @@ GOMP_task (void (*fn) (void *), void *da
+ depend clauses for non-deferred tasks other than this, because
+ the parent task is suspended until the child task finishes and thus
+ it can't start further child tasks. */
+- if ((flags & 8) && thr->task && thr->task->depend_hash)
++ if ((flags & GOMP_TASK_FLAG_DEPEND)
++ && thr->task && thr->task->depend_hash)
+ gomp_task_maybe_wait_for_dependencies (depend);
+
+ gomp_init_task (&task, thr->task, gomp_icv (false));
+- task.kind = GOMP_TASK_IFFALSE;
+- task.final_task = (thr->task && thr->task->final_task) || (flags & 2);
++ task.kind = GOMP_TASK_UNDEFERRED;
++ task.final_task = (thr->task && thr->task->final_task)
++ || (flags & GOMP_TASK_FLAG_FINAL);
++ task.priority = priority;
+ if (thr->task)
+ {
+ task.in_tied_task = thr->task->in_tied_task;
+@@ -178,10 +358,10 @@ GOMP_task (void (*fn) (void *), void *da
+ child thread, but seeing a stale non-NULL value is not a
+ problem. Once past the task_lock acquisition, this thread
+ will see the real value of task.children. */
+- if (task.children != NULL)
++ if (!priority_queue_empty_p (&task.children_queue, MEMMODEL_RELAXED))
+ {
+ gomp_mutex_lock (&team->task_lock);
+- gomp_clear_parent (task.children);
++ gomp_clear_parent (&task.children_queue);
+ gomp_mutex_unlock (&team->task_lock);
+ }
+ gomp_end_task ();
+@@ -195,7 +375,7 @@ GOMP_task (void (*fn) (void *), void *da
+ bool do_wake;
+ size_t depend_size = 0;
+
+- if (flags & 8)
++ if (flags & GOMP_TASK_FLAG_DEPEND)
+ depend_size = ((uintptr_t) depend[0]
+ * sizeof (struct gomp_task_depend_entry));
+ task = gomp_malloc (sizeof (*task) + depend_size
+@@ -203,7 +383,8 @@ GOMP_task (void (*fn) (void *), void *da
+ arg = (char *) (((uintptr_t) (task + 1) + depend_size + arg_align - 1)
+ & ~(uintptr_t) (arg_align - 1));
+ gomp_init_task (task, parent, gomp_icv (false));
+- task->kind = GOMP_TASK_IFFALSE;
++ task->priority = priority;
++ task->kind = GOMP_TASK_UNDEFERRED;
+ task->in_tied_task = parent->in_tied_task;
+ task->taskgroup = taskgroup;
+ thr->task = task;
+@@ -218,7 +399,7 @@ GOMP_task (void (*fn) (void *), void *da
+ task->kind = GOMP_TASK_WAITING;
+ task->fn = fn;
+ task->fn_data = arg;
+- task->final_task = (flags & 2) >> 1;
++ task->final_task = (flags & GOMP_TASK_FLAG_FINAL) >> 1;
+ gomp_mutex_lock (&team->task_lock);
+ /* If parallel or taskgroup has been cancelled, don't start new
+ tasks. */
+@@ -235,171 +416,39 @@ GOMP_task (void (*fn) (void *), void *da
+ taskgroup->num_children++;
+ if (depend_size)
+ {
+- size_t ndepend = (uintptr_t) depend[0];
+- size_t nout = (uintptr_t) depend[1];
+- size_t i;
+- hash_entry_type ent;
+-
+- task->depend_count = ndepend;
+- task->num_dependees = 0;
+- if (parent->depend_hash == NULL)
+- parent->depend_hash
+- = htab_create (2 * ndepend > 12 ? 2 * ndepend : 12);
+- for (i = 0; i < ndepend; i++)
+- {
+- task->depend[i].addr = depend[2 + i];
+- task->depend[i].next = NULL;
+- task->depend[i].prev = NULL;
+- task->depend[i].task = task;
+- task->depend[i].is_in = i >= nout;
+- task->depend[i].redundant = false;
+- task->depend[i].redundant_out = false;
+-
+- hash_entry_type *slot
+- = htab_find_slot (&parent->depend_hash, &task->depend[i],
+- INSERT);
+- hash_entry_type out = NULL, last = NULL;
+- if (*slot)
+- {
+- /* If multiple depends on the same task are the
+- same, all but the first one are redundant.
+- As inout/out come first, if any of them is
+- inout/out, it will win, which is the right
+- semantics. */
+- if ((*slot)->task == task)
+- {
+- task->depend[i].redundant = true;
+- continue;
+- }
+- for (ent = *slot; ent; ent = ent->next)
+- {
+- if (ent->redundant_out)
+- break;
+-
+- last = ent;
+-
+- /* depend(in:...) doesn't depend on earlier
+- depend(in:...). */
+- if (i >= nout && ent->is_in)
+- continue;
+-
+- if (!ent->is_in)
+- out = ent;
+-
+- struct gomp_task *tsk = ent->task;
+- if (tsk->dependers == NULL)
+- {
+- tsk->dependers
+- = gomp_malloc (sizeof (struct gomp_dependers_vec)
+- + 6 * sizeof (struct gomp_task *));
+- tsk->dependers->n_elem = 1;
+- tsk->dependers->allocated = 6;
+- tsk->dependers->elem[0] = task;
+- task->num_dependees++;
+- continue;
+- }
+- /* We already have some other dependency on tsk
+- from earlier depend clause. */
+- else if (tsk->dependers->n_elem
+- && (tsk->dependers->elem[tsk->dependers->n_elem
+- - 1]
+- == task))
+- continue;
+- else if (tsk->dependers->n_elem
+- == tsk->dependers->allocated)
+- {
+- tsk->dependers->allocated
+- = tsk->dependers->allocated * 2 + 2;
+- tsk->dependers
+- = gomp_realloc (tsk->dependers,
+- sizeof (struct gomp_dependers_vec)
+- + (tsk->dependers->allocated
+- * sizeof (struct gomp_task *)));
+- }
+- tsk->dependers->elem[tsk->dependers->n_elem++] = task;
+- task->num_dependees++;
+- }
+- task->depend[i].next = *slot;
+- (*slot)->prev = &task->depend[i];
+- }
+- *slot = &task->depend[i];
+-
+- /* There is no need to store more than one depend({,in}out:)
+- task per address in the hash table chain for the purpose
+- of creation of deferred tasks, because each out
+- depends on all earlier outs, thus it is enough to record
+- just the last depend({,in}out:). For depend(in:), we need
+- to keep all of the previous ones not terminated yet, because
+- a later depend({,in}out:) might need to depend on all of
+- them. So, if the new task's clause is depend({,in}out:),
+- we know there is at most one other depend({,in}out:) clause
+- in the list (out). For non-deferred tasks we want to see
+- all outs, so they are moved to the end of the chain,
+- after first redundant_out entry all following entries
+- should be redundant_out. */
+- if (!task->depend[i].is_in && out)
+- {
+- if (out != last)
+- {
+- out->next->prev = out->prev;
+- out->prev->next = out->next;
+- out->next = last->next;
+- out->prev = last;
+- last->next = out;
+- if (out->next)
+- out->next->prev = out;
+- }
+- out->redundant_out = true;
+- }
+- }
++ gomp_task_handle_depend (task, parent, depend);
+ if (task->num_dependees)
+ {
++ /* Tasks that depend on other tasks are not put into the
++ various waiting queues, so we are done for now. Said
++ tasks are instead put into the queues via
++ gomp_task_run_post_handle_dependers() after their
++ dependencies have been satisfied, at which point they
++ can be picked up by the various scheduling points. */
+ gomp_mutex_unlock (&team->task_lock);
+ return;
+ }
+ }
+- if (parent->children)
+- {
+- task->next_child = parent->children;
+- task->prev_child = parent->children->prev_child;
+- task->next_child->prev_child = task;
+- task->prev_child->next_child = task;
+- }
+- else
+- {
+- task->next_child = task;
+- task->prev_child = task;
+- }
+- parent->children = task;
++
++ priority_queue_insert (PQ_CHILDREN, &parent->children_queue,
++ task, priority,
++ PRIORITY_INSERT_BEGIN,
++ /*adjust_parent_depends_on=*/false,
++ task->parent_depends_on);
+ if (taskgroup)
+- {
+- if (taskgroup->children)
+- {
+- task->next_taskgroup = taskgroup->children;
+- task->prev_taskgroup = taskgroup->children->prev_taskgroup;
+- task->next_taskgroup->prev_taskgroup = task;
+- task->prev_taskgroup->next_taskgroup = task;
+- }
+- else
+- {
+- task->next_taskgroup = task;
+- task->prev_taskgroup = task;
+- }
+- taskgroup->children = task;
+- }
+- if (team->task_queue)
+- {
+- task->next_queue = team->task_queue;
+- task->prev_queue = team->task_queue->prev_queue;
+- task->next_queue->prev_queue = task;
+- task->prev_queue->next_queue = task;
+- }
+- else
+- {
+- task->next_queue = task;
+- task->prev_queue = task;
+- team->task_queue = task;
+- }
++ priority_queue_insert (PQ_TASKGROUP, &taskgroup->taskgroup_queue,
++ task, priority,
++ PRIORITY_INSERT_BEGIN,
++ /*adjust_parent_depends_on=*/false,
++ task->parent_depends_on);
++
++ priority_queue_insert (PQ_TEAM, &team->task_queue,
++ task, priority,
++ PRIORITY_INSERT_END,
++ /*adjust_parent_depends_on=*/false,
++ task->parent_depends_on);
++
+ ++team->task_count;
+ ++team->task_queued_count;
+ gomp_team_barrier_set_task_pending (&team->barrier);
+@@ -411,36 +460,529 @@ GOMP_task (void (*fn) (void *), void *da
+ }
+ }
+
+-static inline bool
+-gomp_task_run_pre (struct gomp_task *child_task, struct gomp_task *parent,
+- struct gomp_taskgroup *taskgroup, struct gomp_team *team)
++ialias (GOMP_taskgroup_start)
++ialias (GOMP_taskgroup_end)
++
++#define TYPE long
++#define UTYPE unsigned long
++#define TYPE_is_long 1
++#include "taskloop.c"
++#undef TYPE
++#undef UTYPE
++#undef TYPE_is_long
++
++#define TYPE unsigned long long
++#define UTYPE TYPE
++#define GOMP_taskloop GOMP_taskloop_ull
++#include "taskloop.c"
++#undef TYPE
++#undef UTYPE
++#undef GOMP_taskloop
++
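The pair of taskloop.c inclusions above is C's textual-template idiom: one implementation file is compiled twice with TYPE/UTYPE rebound, yielding GOMP_taskloop and GOMP_taskloop_ull from a single body. A self-contained sketch of the same pattern; sum.inc and the sum_* names are invented for illustration:

/* sum.inc -- generic body, textually included once per TYPE binding.  */
TYPE
SUM_NAME (const TYPE *a, int n)
{
  TYPE s = 0;
  for (int i = 0; i < n; i++)
    s += a[i];
  return s;
}

/* sum.c -- two instantiations, the way task.c instantiates taskloop.c.  */
#define TYPE long
#define SUM_NAME sum_long
#include "sum.inc"
#undef TYPE
#undef SUM_NAME

#define TYPE double
#define SUM_NAME sum_double
#include "sum.inc"
#undef TYPE
#undef SUM_NAME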
++static void inline
++priority_queue_move_task_first (enum priority_queue_type type,
++ struct priority_queue *head,
++ struct gomp_task *task)
+ {
++#if _LIBGOMP_CHECKING_
++ if (!priority_queue_task_in_queue_p (type, head, task))
++ gomp_fatal ("Attempt to move first missing task %p", task);
++#endif
++ struct priority_list *list;
++ if (priority_queue_multi_p (head))
++ {
++ list = priority_queue_lookup_priority (head, task->priority);
++#if _LIBGOMP_CHECKING_
++ if (!list)
++ gomp_fatal ("Unable to find priority %d", task->priority);
++#endif
++ }
++ else
++ list = &head->l;
++ priority_list_remove (list, task_to_priority_node (type, task), 0);
++ priority_list_insert (type, list, task, task->priority,
++ PRIORITY_INSERT_BEGIN, type == PQ_CHILDREN,
++ task->parent_depends_on);
++}
++
++/* Actual body of GOMP_PLUGIN_target_task_completion that is executed
++ with team->task_lock held, or is executed in the thread that called
++ gomp_target_task_fn if GOMP_PLUGIN_target_task_completion has been
++ run before it acquires team->task_lock. */
++
++static void
++gomp_target_task_completion (struct gomp_team *team, struct gomp_task *task)
++{
++ struct gomp_task *parent = task->parent;
+ if (parent)
++ priority_queue_move_task_first (PQ_CHILDREN, &parent->children_queue,
++ task);
++
++ struct gomp_taskgroup *taskgroup = task->taskgroup;
++ if (taskgroup)
++ priority_queue_move_task_first (PQ_TASKGROUP, &taskgroup->taskgroup_queue,
++ task);
++
++ priority_queue_insert (PQ_TEAM, &team->task_queue, task, task->priority,
++ PRIORITY_INSERT_BEGIN, false,
++ task->parent_depends_on);
++ task->kind = GOMP_TASK_WAITING;
++ if (parent && parent->taskwait)
+ {
+- if (parent->children == child_task)
+- parent->children = child_task->next_child;
+- if (__builtin_expect (child_task->parent_depends_on, 0)
+- && parent->taskwait->last_parent_depends_on == child_task)
+- {
+- if (child_task->prev_child->kind == GOMP_TASK_WAITING
+- && child_task->prev_child->parent_depends_on)
+- parent->taskwait->last_parent_depends_on = child_task->prev_child;
+- else
+- parent->taskwait->last_parent_depends_on = NULL;
++ if (parent->taskwait->in_taskwait)
++ {
++ /* One more task has had its dependencies met.
++ Inform any waiters. */
++ parent->taskwait->in_taskwait = false;
++ gomp_sem_post (&parent->taskwait->taskwait_sem);
+ }
++ else if (parent->taskwait->in_depend_wait)
++ {
++ /* One more task has had its dependencies met.
++ Inform any waiters. */
++ parent->taskwait->in_depend_wait = false;
++ gomp_sem_post (&parent->taskwait->taskwait_sem);
++ }
++ }
++ if (taskgroup && taskgroup->in_taskgroup_wait)
++ {
++ /* One more task has had its dependencies met.
++ Inform any waiters. */
++ taskgroup->in_taskgroup_wait = false;
++ gomp_sem_post (&taskgroup->taskgroup_sem);
+ }
+- if (taskgroup && taskgroup->children == child_task)
+- taskgroup->children = child_task->next_taskgroup;
+- child_task->prev_queue->next_queue = child_task->next_queue;
+- child_task->next_queue->prev_queue = child_task->prev_queue;
+- if (team->task_queue == child_task)
++
++ ++team->task_queued_count;
++ gomp_team_barrier_set_task_pending (&team->barrier);
++ /* I'm afraid this can't be done after releasing team->task_lock,
++ as gomp_target_task_completion is run from an unrelated thread and
++ therefore in between gomp_mutex_unlock and gomp_team_barrier_wake
++ the team could be gone already. */
++ if (team->nthreads > team->task_running_count)
++ gomp_team_barrier_wake (&team->barrier, 1);
++}
++
++/* Signal that a target task TTASK has completed the asynchronously
++ running phase and should be requeued as a task to handle the
++ variable unmapping. */
++
++void
++GOMP_PLUGIN_target_task_completion (void *data)
++{
++ struct gomp_target_task *ttask = (struct gomp_target_task *) data;
++ struct gomp_task *task = ttask->task;
++ struct gomp_team *team = ttask->team;
++
++ gomp_mutex_lock (&team->task_lock);
++ if (ttask->state == GOMP_TARGET_TASK_READY_TO_RUN)
+ {
+- if (child_task->next_queue != child_task)
+- team->task_queue = child_task->next_queue;
++ ttask->state = GOMP_TARGET_TASK_FINISHED;
++ gomp_mutex_unlock (&team->task_lock);
++ return;
++ }
++ ttask->state = GOMP_TARGET_TASK_FINISHED;
++ gomp_target_task_completion (team, task);
++ gomp_mutex_unlock (&team->task_lock);
++}
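From the plugin's side the contract is a single call: when the asynchronous device work finishes, hand the gomp_target_task pointer back to libgomp, which flips ttask->state and requeues the task for the unmapping phase. A hypothetical plugin-side completion callback; the callback name and how it is registered with the device runtime are assumptions:

extern void GOMP_PLUGIN_target_task_completion (void *);

/* 'data' is the struct gomp_target_task * that libgomp passed to the
   plugin when the async region was launched.  */
static void
my_plugin_stream_callback (void *data)
{
  GOMP_PLUGIN_target_task_completion (data);
}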
++
++static void gomp_task_run_post_handle_depend_hash (struct gomp_task *);
++
++/* Called for nowait target tasks. */
++
++bool
++gomp_create_target_task (struct gomp_device_descr *devicep,
++ void (*fn) (void *), size_t mapnum, void **hostaddrs,
++ size_t *sizes, unsigned short *kinds,
++ unsigned int flags, void **depend, void **args,
++ enum gomp_target_task_state state)
++{
++ struct gomp_thread *thr = gomp_thread ();
++ struct gomp_team *team = thr->ts.team;
++
++ /* If parallel or taskgroup has been cancelled, don't start new tasks. */
++ if (team
++ && (gomp_team_barrier_cancelled (&team->barrier)
++ || (thr->task->taskgroup && thr->task->taskgroup->cancelled)))
++ return true;
++
++ struct gomp_target_task *ttask;
++ struct gomp_task *task;
++ struct gomp_task *parent = thr->task;
++ struct gomp_taskgroup *taskgroup = parent->taskgroup;
++ bool do_wake;
++ size_t depend_size = 0;
++ uintptr_t depend_cnt = 0;
++ size_t tgt_align = 0, tgt_size = 0;
++
++ if (depend != NULL)
++ {
++ depend_cnt = (uintptr_t) depend[0];
++ depend_size = depend_cnt * sizeof (struct gomp_task_depend_entry);
++ }
++ if (fn)
++ {
++ /* GOMP_MAP_FIRSTPRIVATE need to be copied first, as they are
++ firstprivate on the target task. */
++ size_t i;
++ for (i = 0; i < mapnum; i++)
++ if ((kinds[i] & 0xff) == GOMP_MAP_FIRSTPRIVATE)
++ {
++ size_t align = (size_t) 1 << (kinds[i] >> 8);
++ if (tgt_align < align)
++ tgt_align = align;
++ tgt_size = (tgt_size + align - 1) & ~(align - 1);
++ tgt_size += sizes[i];
++ }
++ if (tgt_align)
++ tgt_size += tgt_align - 1;
+ else
+- team->task_queue = NULL;
++ tgt_size = 0;
+ }
++
++ task = gomp_malloc (sizeof (*task) + depend_size
++ + sizeof (*ttask)
++ + mapnum * (sizeof (void *) + sizeof (size_t)
++ + sizeof (unsigned short))
++ + tgt_size);
++ gomp_init_task (task, parent, gomp_icv (false));
++ task->priority = 0;
++ task->kind = GOMP_TASK_WAITING;
++ task->in_tied_task = parent->in_tied_task;
++ task->taskgroup = taskgroup;
++ ttask = (struct gomp_target_task *) &task->depend[depend_cnt];
++ ttask->devicep = devicep;
++ ttask->fn = fn;
++ ttask->mapnum = mapnum;
++ ttask->args = args;
++ memcpy (ttask->hostaddrs, hostaddrs, mapnum * sizeof (void *));
++ ttask->sizes = (size_t *) &ttask->hostaddrs[mapnum];
++ memcpy (ttask->sizes, sizes, mapnum * sizeof (size_t));
++ ttask->kinds = (unsigned short *) &ttask->sizes[mapnum];
++ memcpy (ttask->kinds, kinds, mapnum * sizeof (unsigned short));
++ if (tgt_align)
++ {
++ char *tgt = (char *) &ttask->kinds[mapnum];
++ size_t i;
++ uintptr_t al = (uintptr_t) tgt & (tgt_align - 1);
++ if (al)
++ tgt += tgt_align - al;
++ tgt_size = 0;
++ for (i = 0; i < mapnum; i++)
++ if ((kinds[i] & 0xff) == GOMP_MAP_FIRSTPRIVATE)
++ {
++ size_t align = (size_t) 1 << (kinds[i] >> 8);
++ tgt_size = (tgt_size + align - 1) & ~(align - 1);
++ memcpy (tgt + tgt_size, hostaddrs[i], sizes[i]);
++ ttask->hostaddrs[i] = tgt + tgt_size;
++ tgt_size = tgt_size + sizes[i];
++ }
++ }
++ ttask->flags = flags;
++ ttask->state = state;
++ ttask->task = task;
++ ttask->team = team;
++ task->fn = NULL;
++ task->fn_data = ttask;
++ task->final_task = 0;
++ gomp_mutex_lock (&team->task_lock);
++ /* If parallel or taskgroup has been cancelled, don't start new tasks. */
++ if (__builtin_expect (gomp_team_barrier_cancelled (&team->barrier)
++ || (taskgroup && taskgroup->cancelled), 0))
++ {
++ gomp_mutex_unlock (&team->task_lock);
++ gomp_finish_task (task);
++ free (task);
++ return true;
++ }
++ if (depend_size)
++ {
++ gomp_task_handle_depend (task, parent, depend);
++ if (task->num_dependees)
++ {
++ if (taskgroup)
++ taskgroup->num_children++;
++ gomp_mutex_unlock (&team->task_lock);
++ return true;
++ }
++ }
++ if (state == GOMP_TARGET_TASK_DATA)
++ {
++ gomp_task_run_post_handle_depend_hash (task);
++ gomp_mutex_unlock (&team->task_lock);
++ gomp_finish_task (task);
++ free (task);
++ return false;
++ }
++ if (taskgroup)
++ taskgroup->num_children++;
++ /* For async offloading, if we don't need to wait for dependencies,
++ run the gomp_target_task_fn right away, essentially schedule the
++ mapping part of the task in the current thread. */
++ if (devicep != NULL
++ && (devicep->capabilities & GOMP_OFFLOAD_CAP_OPENMP_400))
++ {
++ priority_queue_insert (PQ_CHILDREN, &parent->children_queue, task, 0,
++ PRIORITY_INSERT_END,
++ /*adjust_parent_depends_on=*/false,
++ task->parent_depends_on);
++ if (taskgroup)
++ priority_queue_insert (PQ_TASKGROUP, &taskgroup->taskgroup_queue,
++ task, 0, PRIORITY_INSERT_END,
++ /*adjust_parent_depends_on=*/false,
++ task->parent_depends_on);
++ task->pnode[PQ_TEAM].next = NULL;
++ task->pnode[PQ_TEAM].prev = NULL;
++ task->kind = GOMP_TASK_TIED;
++ ++team->task_count;
++ gomp_mutex_unlock (&team->task_lock);
++
++ thr->task = task;
++ gomp_target_task_fn (task->fn_data);
++ thr->task = parent;
++
++ gomp_mutex_lock (&team->task_lock);
++ task->kind = GOMP_TASK_ASYNC_RUNNING;
++ /* If GOMP_PLUGIN_target_task_completion has run already
++ in between gomp_target_task_fn and the mutex lock,
++ perform the requeuing here. */
++ if (ttask->state == GOMP_TARGET_TASK_FINISHED)
++ gomp_target_task_completion (team, task);
++ else
++ ttask->state = GOMP_TARGET_TASK_RUNNING;
++ gomp_mutex_unlock (&team->task_lock);
++ return true;
++ }
++ priority_queue_insert (PQ_CHILDREN, &parent->children_queue, task, 0,
++ PRIORITY_INSERT_BEGIN,
++ /*adjust_parent_depends_on=*/false,
++ task->parent_depends_on);
++ if (taskgroup)
++ priority_queue_insert (PQ_TASKGROUP, &taskgroup->taskgroup_queue, task, 0,
++ PRIORITY_INSERT_BEGIN,
++ /*adjust_parent_depends_on=*/false,
++ task->parent_depends_on);
++ priority_queue_insert (PQ_TEAM, &team->task_queue, task, 0,
++ PRIORITY_INSERT_END,
++ /*adjust_parent_depends_on=*/false,
++ task->parent_depends_on);
++ ++team->task_count;
++ ++team->task_queued_count;
++ gomp_team_barrier_set_task_pending (&team->barrier);
++ do_wake = team->task_running_count + !parent->in_tied_task
++ < team->nthreads;
++ gomp_mutex_unlock (&team->task_lock);
++ if (do_wake)
++ gomp_team_barrier_wake (&team->barrier, 1);
++ return true;
++}
++
++/* Given a parent_depends_on task in LIST, move it to the front of its
++ priority so it is run as soon as possible.
++
++ Care is taken to update the list's LAST_PARENT_DEPENDS_ON field.
++
++ We rearrange the queue such that all parent_depends_on tasks are
++ first, and last_parent_depends_on points to the last such task we
++ rearranged. For example, given the following tasks in a queue
++ where PD[123] are the parent_depends_on tasks:
++
++ task->children
++ |
++ V
++ C1 -> C2 -> C3 -> PD1 -> PD2 -> PD3 -> C4
++
++ We rearrange such that:
++
++ task->children
++ | +--- last_parent_depends_on
++ | |
++ V V
++ PD1 -> PD2 -> PD3 -> C1 -> C2 -> C3 -> C4. */
++
++static void inline
++priority_list_upgrade_task (struct priority_list *list,
++ struct priority_node *node)
++{
++ struct priority_node *last_parent_depends_on
++ = list->last_parent_depends_on;
++ if (last_parent_depends_on)
++ {
++ node->prev->next = node->next;
++ node->next->prev = node->prev;
++ node->prev = last_parent_depends_on;
++ node->next = last_parent_depends_on->next;
++ node->prev->next = node;
++ node->next->prev = node;
++ }
++ else if (node != list->tasks)
++ {
++ node->prev->next = node->next;
++ node->next->prev = node->prev;
++ node->prev = list->tasks->prev;
++ node->next = list->tasks;
++ list->tasks = node;
++ node->prev->next = node;
++ node->next->prev = node;
++ }
++ list->last_parent_depends_on = node;
++}
++
++/* Given a parent_depends_on TASK in its parent's children_queue, move
++ it to the front of its priority so it is run as soon as possible.
++
++ PARENT is passed as an optimization.
++
++ (This function could be defined in priority_queue.c, but we want it
++ inlined, and putting it in priority_queue.h is not an option, given
++ that gomp_task has not been properly defined at that point). */
++
++static void inline
++priority_queue_upgrade_task (struct gomp_task *task,
++ struct gomp_task *parent)
++{
++ struct priority_queue *head = &parent->children_queue;
++ struct priority_node *node = &task->pnode[PQ_CHILDREN];
++#if _LIBGOMP_CHECKING_
++ if (!task->parent_depends_on)
++ gomp_fatal ("priority_queue_upgrade_task: task must be a "
++ "parent_depends_on task");
++ if (!priority_queue_task_in_queue_p (PQ_CHILDREN, head, task))
++ gomp_fatal ("priority_queue_upgrade_task: cannot find task=%p", task);
++#endif
++ if (priority_queue_multi_p (head))
++ {
++ struct priority_list *list
++ = priority_queue_lookup_priority (head, task->priority);
++ priority_list_upgrade_task (list, node);
++ }
++ else
++ priority_list_upgrade_task (&head->l, node);
++}
++
++/* Given a CHILD_TASK in LIST that is about to be executed, move it out of
++ the way in LIST so that other tasks can be considered for
++ execution. LIST contains tasks of type TYPE.
++
++ Care is taken to update the queue's LAST_PARENT_DEPENDS_ON field
++ if applicable. */
++
++static void inline
++priority_list_downgrade_task (enum priority_queue_type type,
++ struct priority_list *list,
++ struct gomp_task *child_task)
++{
++ struct priority_node *node = task_to_priority_node (type, child_task);
++ if (list->tasks == node)
++ list->tasks = node->next;
++ else if (node->next != list->tasks)
++ {
++ /* The task in NODE is about to become TIED and TIED tasks
++ cannot come before WAITING tasks. If we're about to
++ leave the queue in such an indeterminate state, rewire
++ things appropriately. However, a TIED task at the end is
++ perfectly fine. */
++ struct gomp_task *next_task = priority_node_to_task (type, node->next);
++ if (next_task->kind == GOMP_TASK_WAITING)
++ {
++ /* Remove from list. */
++ node->prev->next = node->next;
++ node->next->prev = node->prev;
++ /* Rewire at the end. */
++ node->next = list->tasks;
++ node->prev = list->tasks->prev;
++ list->tasks->prev->next = node;
++ list->tasks->prev = node;
++ }
++ }
++
++ /* If the current task is the last_parent_depends_on for its
++ priority, adjust last_parent_depends_on appropriately. */
++ if (__builtin_expect (child_task->parent_depends_on, 0)
++ && list->last_parent_depends_on == node)
++ {
++ struct gomp_task *prev_child = priority_node_to_task (type, node->prev);
++ if (node->prev != node
++ && prev_child->kind == GOMP_TASK_WAITING
++ && prev_child->parent_depends_on)
++ list->last_parent_depends_on = node->prev;
++ else
++ {
++ /* There are no more parent_depends_on entries waiting
++ to run, clear the list. */
++ list->last_parent_depends_on = NULL;
++ }
++ }
++}
++
++/* Given a TASK in HEAD that is about to be executed, move it out of
++ the way so that other tasks can be considered for execution. HEAD
++ contains tasks of type TYPE.
++
++ Care is taken to update the queue's LAST_PARENT_DEPENDS_ON field
++ if applicable.
++
++ (This function could be defined in priority_queue.c, but we want it
++ inlined, and putting it in priority_queue.h is not an option, given
++ that gomp_task has not been properly defined at that point). */
++
++static void inline
++priority_queue_downgrade_task (enum priority_queue_type type,
++ struct priority_queue *head,
++ struct gomp_task *task)
++{
++#if _LIBGOMP_CHECKING_
++ if (!priority_queue_task_in_queue_p (type, head, task))
++ gomp_fatal ("Attempt to downgrade missing task %p", task);
++#endif
++ if (priority_queue_multi_p (head))
++ {
++ struct priority_list *list
++ = priority_queue_lookup_priority (head, task->priority);
++ priority_list_downgrade_task (type, list, task);
++ }
++ else
++ priority_list_downgrade_task (type, &head->l, task);
++}
++
++/* Set up CHILD_TASK to execute. This is done by setting the task to
++ TIED, and updating all relevant queues so that CHILD_TASK is no
++ longer chosen for scheduling. Also, remove CHILD_TASK from the
++ overall team task queue entirely.
++
++ Return TRUE if task or its containing taskgroup has been
++ cancelled. */
++
++static inline bool
++gomp_task_run_pre (struct gomp_task *child_task, struct gomp_task *parent,
++ struct gomp_team *team)
++{
++#if _LIBGOMP_CHECKING_
++ if (child_task->parent)
++ priority_queue_verify (PQ_CHILDREN,
++ &child_task->parent->children_queue, true);
++ if (child_task->taskgroup)
++ priority_queue_verify (PQ_TASKGROUP,
++ &child_task->taskgroup->taskgroup_queue, false);
++ priority_queue_verify (PQ_TEAM, &team->task_queue, false);
++#endif
++
++ /* Task is about to go tied, move it out of the way. */
++ if (parent)
++ priority_queue_downgrade_task (PQ_CHILDREN, &parent->children_queue,
++ child_task);
++
++ /* Task is about to go tied, move it out of the way. */
++ struct gomp_taskgroup *taskgroup = child_task->taskgroup;
++ if (taskgroup)
++ priority_queue_downgrade_task (PQ_TASKGROUP, &taskgroup->taskgroup_queue,
++ child_task);
++
++ priority_queue_remove (PQ_TEAM, &team->task_queue, child_task,
++ MEMMODEL_RELAXED);
++ child_task->pnode[PQ_TEAM].next = NULL;
++ child_task->pnode[PQ_TEAM].prev = NULL;
+ child_task->kind = GOMP_TASK_TIED;
++
+ if (--team->task_queued_count == 0)
+ gomp_team_barrier_clear_task_pending (&team->barrier);
+ if ((gomp_team_barrier_cancelled (&team->barrier)
+@@ -478,6 +1020,14 @@ gomp_task_run_post_handle_depend_hash (s
+ }
+ }
+
++/* After a CHILD_TASK has been run, adjust the dependency queue for
++ each task that depends on CHILD_TASK, to record the fact that there
++ is one less dependency to worry about. If a task that depended on
++ CHILD_TASK now has no dependencies, place it in the various queues
++ so it gets scheduled to run.
++
++ TEAM is the team to which CHILD_TASK belongs. */
++
+ static size_t
+ gomp_task_run_post_handle_dependers (struct gomp_task *child_task,
+ struct gomp_team *team)
+@@ -487,91 +1037,60 @@ gomp_task_run_post_handle_dependers (str
+ for (i = 0; i < count; i++)
+ {
+ struct gomp_task *task = child_task->dependers->elem[i];
++
++ /* CHILD_TASK satisfies a dependency for TASK. Keep track of
++ TASK's remaining dependencies. Once TASK has no other
++ dependencies, put it into the various queues so it will get
++ scheduled for execution. */
+ if (--task->num_dependees != 0)
+ continue;
+
+ struct gomp_taskgroup *taskgroup = task->taskgroup;
+ if (parent)
+ {
+- if (parent->children)
+- {
+- /* If parent is in gomp_task_maybe_wait_for_dependencies
+- and it doesn't need to wait for this task, put it after
+- all ready to run tasks it needs to wait for. */
+- if (parent->taskwait && parent->taskwait->last_parent_depends_on
+- && !task->parent_depends_on)
+- {
+- struct gomp_task *last_parent_depends_on
+- = parent->taskwait->last_parent_depends_on;
+- task->next_child = last_parent_depends_on->next_child;
+- task->prev_child = last_parent_depends_on;
+- }
+- else
+- {
+- task->next_child = parent->children;
+- task->prev_child = parent->children->prev_child;
+- parent->children = task;
+- }
+- task->next_child->prev_child = task;
+- task->prev_child->next_child = task;
+- }
+- else
+- {
+- task->next_child = task;
+- task->prev_child = task;
+- parent->children = task;
+- }
++ priority_queue_insert (PQ_CHILDREN, &parent->children_queue,
++ task, task->priority,
++ PRIORITY_INSERT_BEGIN,
++ /*adjust_parent_depends_on=*/true,
++ task->parent_depends_on);
+ if (parent->taskwait)
+ {
+ if (parent->taskwait->in_taskwait)
+ {
++ /* One more task has had its dependencies met.
++ Inform any waiters. */
+ parent->taskwait->in_taskwait = false;
+ gomp_sem_post (&parent->taskwait->taskwait_sem);
+ }
+ else if (parent->taskwait->in_depend_wait)
+ {
++ /* One more task has had its dependencies met.
++ Inform any waiters. */
+ parent->taskwait->in_depend_wait = false;
+ gomp_sem_post (&parent->taskwait->taskwait_sem);
+ }
+- if (parent->taskwait->last_parent_depends_on == NULL
+- && task->parent_depends_on)
+- parent->taskwait->last_parent_depends_on = task;
+ }
+ }
+ if (taskgroup)
+ {
+- if (taskgroup->children)
+- {
+- task->next_taskgroup = taskgroup->children;
+- task->prev_taskgroup = taskgroup->children->prev_taskgroup;
+- task->next_taskgroup->prev_taskgroup = task;
+- task->prev_taskgroup->next_taskgroup = task;
+- }
+- else
+- {
+- task->next_taskgroup = task;
+- task->prev_taskgroup = task;
+- }
+- taskgroup->children = task;
++ priority_queue_insert (PQ_TASKGROUP, &taskgroup->taskgroup_queue,
++ task, task->priority,
++ PRIORITY_INSERT_BEGIN,
++ /*adjust_parent_depends_on=*/false,
++ task->parent_depends_on);
+ if (taskgroup->in_taskgroup_wait)
+ {
++ /* One more task has had its dependencies met.
++ Inform any waiters. */
+ taskgroup->in_taskgroup_wait = false;
+ gomp_sem_post (&taskgroup->taskgroup_sem);
+ }
+ }
+- if (team->task_queue)
+- {
+- task->next_queue = team->task_queue;
+- task->prev_queue = team->task_queue->prev_queue;
+- task->next_queue->prev_queue = task;
+- task->prev_queue->next_queue = task;
+- }
+- else
+- {
+- task->next_queue = task;
+- task->prev_queue = task;
+- team->task_queue = task;
+- }
++ priority_queue_insert (PQ_TEAM, &team->task_queue,
++ task, task->priority,
++ PRIORITY_INSERT_END,
++ /*adjust_parent_depends_on=*/false,
++ task->parent_depends_on);
+ ++team->task_count;
+ ++team->task_queued_count;
+ ++ret;
+@@ -601,12 +1120,18 @@ gomp_task_run_post_handle_depend (struct
+ return gomp_task_run_post_handle_dependers (child_task, team);
+ }
+
++/* Remove CHILD_TASK from its parent. */
++
+ static inline void
+ gomp_task_run_post_remove_parent (struct gomp_task *child_task)
+ {
+ struct gomp_task *parent = child_task->parent;
+ if (parent == NULL)
+ return;
++
++ /* If this was the last task the parent was depending on,
++ synchronize with gomp_task_maybe_wait_for_dependencies so it can
++ clean up and return. */
+ if (__builtin_expect (child_task->parent_depends_on, 0)
+ && --parent->taskwait->n_depend == 0
+ && parent->taskwait->in_depend_wait)
+@@ -614,36 +1139,31 @@ gomp_task_run_post_remove_parent (struct
+ parent->taskwait->in_depend_wait = false;
+ gomp_sem_post (&parent->taskwait->taskwait_sem);
+ }
+- child_task->prev_child->next_child = child_task->next_child;
+- child_task->next_child->prev_child = child_task->prev_child;
+- if (parent->children != child_task)
+- return;
+- if (child_task->next_child != child_task)
+- parent->children = child_task->next_child;
+- else
++
++ if (priority_queue_remove (PQ_CHILDREN, &parent->children_queue,
++ child_task, MEMMODEL_RELEASE)
++ && parent->taskwait && parent->taskwait->in_taskwait)
+ {
+- /* We access task->children in GOMP_taskwait
+- outside of the task lock mutex region, so
+- need a release barrier here to ensure memory
+- written by child_task->fn above is flushed
+- before the NULL is written. */
+- __atomic_store_n (&parent->children, NULL, MEMMODEL_RELEASE);
+- if (parent->taskwait && parent->taskwait->in_taskwait)
+- {
+- parent->taskwait->in_taskwait = false;
+- gomp_sem_post (&parent->taskwait->taskwait_sem);
+- }
++ parent->taskwait->in_taskwait = false;
++ gomp_sem_post (&parent->taskwait->taskwait_sem);
+ }
++ child_task->pnode[PQ_CHILDREN].next = NULL;
++ child_task->pnode[PQ_CHILDREN].prev = NULL;
+ }
+
++/* Remove CHILD_TASK from its taskgroup. */
++
+ static inline void
+ gomp_task_run_post_remove_taskgroup (struct gomp_task *child_task)
+ {
+ struct gomp_taskgroup *taskgroup = child_task->taskgroup;
+ if (taskgroup == NULL)
+ return;
+- child_task->prev_taskgroup->next_taskgroup = child_task->next_taskgroup;
+- child_task->next_taskgroup->prev_taskgroup = child_task->prev_taskgroup;
++ bool empty = priority_queue_remove (PQ_TASKGROUP,
++ &taskgroup->taskgroup_queue,
++ child_task, MEMMODEL_RELAXED);
++ child_task->pnode[PQ_TASKGROUP].next = NULL;
++ child_task->pnode[PQ_TASKGROUP].prev = NULL;
+ if (taskgroup->num_children > 1)
+ --taskgroup->num_children;
+ else
+@@ -655,18 +1175,10 @@ gomp_task_run_post_remove_taskgroup (str
+ before the NULL is written. */
+ __atomic_store_n (&taskgroup->num_children, 0, MEMMODEL_RELEASE);
+ }
+- if (taskgroup->children != child_task)
+- return;
+- if (child_task->next_taskgroup != child_task)
+- taskgroup->children = child_task->next_taskgroup;
+- else
++ if (empty && taskgroup->in_taskgroup_wait)
+ {
+- taskgroup->children = NULL;
+- if (taskgroup->in_taskgroup_wait)
+- {
+- taskgroup->in_taskgroup_wait = false;
+- gomp_sem_post (&taskgroup->taskgroup_sem);
+- }
++ taskgroup->in_taskgroup_wait = false;
++ gomp_sem_post (&taskgroup->taskgroup_sem);
+ }
+ }
+
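The MEMMODEL_RELEASE store of num_children above pairs with the
MEMMODEL_ACQUIRE load in GOMP_taskgroup_end, which inspects the counter
without holding task_lock. A self-contained illustration of that
release/acquire pairing using GCC's __atomic builtins (the variables
are hypothetical stand-ins, not libgomp's own):

    #include <stdio.h>

    static int task_result;          /* stands in for child_task->fn's writes */
    static unsigned int num_children = 1;

    static void
    worker_finishes_child (void)
    {
      task_result = 42;
      /* Release: publish the writes above before the counter reads 0.  */
      __atomic_store_n (&num_children, 0, __ATOMIC_RELEASE);
    }

    static int
    waiter_checks (void)
    {
      /* Acquire: observing 0 guarantees task_result is visible too.  */
      if (__atomic_load_n (&num_children, __ATOMIC_ACQUIRE) == 0)
        return task_result;
      return -1;
    }

    int
    main (void)
    {
      worker_finishes_child ();
      printf ("%d\n", waiter_checks ());   /* prints 42 */
      return 0;
    }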
+@@ -696,11 +1208,15 @@ gomp_barrier_handle_tasks (gomp_barrier_
+ while (1)
+ {
+ bool cancelled = false;
+- if (team->task_queue != NULL)
++ if (!priority_queue_empty_p (&team->task_queue, MEMMODEL_RELAXED))
+ {
+- child_task = team->task_queue;
++ bool ignored;
++ child_task
++ = priority_queue_next_task (PQ_TEAM, &team->task_queue,
++ PQ_IGNORED, NULL,
++ &ignored);
+ cancelled = gomp_task_run_pre (child_task, child_task->parent,
+- child_task->taskgroup, team);
++ team);
+ if (__builtin_expect (cancelled, 0))
+ {
+ if (to_free)
+@@ -729,7 +1245,29 @@ gomp_barrier_handle_tasks (gomp_barrier_
+ if (child_task)
+ {
+ thr->task = child_task;
+- child_task->fn (child_task->fn_data);
++ if (__builtin_expect (child_task->fn == NULL, 0))
++ {
++ if (gomp_target_task_fn (child_task->fn_data))
++ {
++ thr->task = task;
++ gomp_mutex_lock (&team->task_lock);
++ child_task->kind = GOMP_TASK_ASYNC_RUNNING;
++ team->task_running_count--;
++ struct gomp_target_task *ttask
++ = (struct gomp_target_task *) child_task->fn_data;
++ /* If GOMP_PLUGIN_target_task_completion has run already
++ in between gomp_target_task_fn and the mutex lock,
++ perform the requeuing here. */
++ if (ttask->state == GOMP_TARGET_TASK_FINISHED)
++ gomp_target_task_completion (team, child_task);
++ else
++ ttask->state = GOMP_TARGET_TASK_RUNNING;
++ child_task = NULL;
++ continue;
++ }
++ }
++ else
++ child_task->fn (child_task->fn_data);
+ thr->task = task;
+ }
+ else
+@@ -741,7 +1279,7 @@ gomp_barrier_handle_tasks (gomp_barrier_
+ size_t new_tasks
+ = gomp_task_run_post_handle_depend (child_task, team);
+ gomp_task_run_post_remove_parent (child_task);
+- gomp_clear_parent (child_task->children);
++ gomp_clear_parent (&child_task->children_queue);
+ gomp_task_run_post_remove_taskgroup (child_task);
+ to_free = child_task;
+ child_task = NULL;
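priority_queue_next_task, used above with PQ_TEAM only and below (in
GOMP_taskwait and GOMP_taskgroup_end) with a children queue plus the
team queue, arbitrates between two queue heads and reports which queue
won. A sketch of that arbitration over simplified stand-in types (the
real queues also track per-task priorities internally):

    #include <stddef.h>
    #include <stdio.h>

    struct task { int priority; };
    struct queue { struct task *head; };   /* best task first */

    /* Return the better of the two heads; *IN_FIRST mirrors the
       child_q flag handed back to GOMP_taskwait below.  */
    static struct task *
    next_task (struct queue *first, struct queue *second, int *in_first)
    {
      struct task *a = first ? first->head : NULL;
      struct task *b = second ? second->head : NULL;
      *in_first = 0;
      if (a == NULL)
        return b;
      if (b == NULL || a->priority >= b->priority)
        {
          *in_first = 1;
          return a;
        }
      return b;
    }

    int
    main (void)
    {
      struct task ta = { 1 }, tb = { 5 };
      struct queue qa = { &ta }, qb = { &tb };
      int in_first;
      struct task *t = next_task (&qa, &qb, &in_first);
      printf ("picked priority %d (from first queue: %d)\n",
              t->priority, in_first);   /* picked priority 5 (...: 0) */
      return 0;
    }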
+@@ -765,7 +1303,9 @@ gomp_barrier_handle_tasks (gomp_barrier_
+ }
+ }
+
+-/* Called when encountering a taskwait directive. */
++/* Called when encountering a taskwait directive.
++
++ Wait for all children of the current task. */
+
+ void
+ GOMP_taskwait (void)
+@@ -785,15 +1325,16 @@ GOMP_taskwait (void)
+ child thread task work function are seen before we exit from
+ GOMP_taskwait. */
+ if (task == NULL
+- || __atomic_load_n (&task->children, MEMMODEL_ACQUIRE) == NULL)
++ || priority_queue_empty_p (&task->children_queue, MEMMODEL_ACQUIRE))
+ return;
+
+ memset (&taskwait, 0, sizeof (taskwait));
++ bool child_q = false;
+ gomp_mutex_lock (&team->task_lock);
+ while (1)
+ {
+ bool cancelled = false;
+- if (task->children == NULL)
++ if (priority_queue_empty_p (&task->children_queue, MEMMODEL_RELAXED))
+ {
+ bool destroy_taskwait = task->taskwait != NULL;
+ task->taskwait = NULL;
+@@ -807,12 +1348,14 @@ GOMP_taskwait (void)
+ gomp_sem_destroy (&taskwait.taskwait_sem);
+ return;
+ }
+- if (task->children->kind == GOMP_TASK_WAITING)
++ struct gomp_task *next_task
++ = priority_queue_next_task (PQ_CHILDREN, &task->children_queue,
++ PQ_TEAM, &team->task_queue, &child_q);
++ if (next_task->kind == GOMP_TASK_WAITING)
+ {
+- child_task = task->children;
++ child_task = next_task;
+ cancelled
+- = gomp_task_run_pre (child_task, task, child_task->taskgroup,
+- team);
++ = gomp_task_run_pre (child_task, task, team);
+ if (__builtin_expect (cancelled, 0))
+ {
+ if (to_free)
+@@ -826,8 +1369,10 @@ GOMP_taskwait (void)
+ }
+ else
+ {
+- /* All tasks we are waiting for are already running
+- in other threads. Wait for them. */
++ /* All tasks we are waiting for are either running in other
++ threads, or they are tasks that have not had their
++ dependencies met (so they're not even in the queue). Wait
++ for them. */
+ if (task->taskwait == NULL)
+ {
+ taskwait.in_depend_wait = false;
+@@ -851,7 +1396,28 @@ GOMP_taskwait (void)
+ if (child_task)
+ {
+ thr->task = child_task;
+- child_task->fn (child_task->fn_data);
++ if (__builtin_expect (child_task->fn == NULL, 0))
++ {
++ if (gomp_target_task_fn (child_task->fn_data))
++ {
++ thr->task = task;
++ gomp_mutex_lock (&team->task_lock);
++ child_task->kind = GOMP_TASK_ASYNC_RUNNING;
++ struct gomp_target_task *ttask
++ = (struct gomp_target_task *) child_task->fn_data;
++ /* If GOMP_PLUGIN_target_task_completion has run already
++ in between gomp_target_task_fn and the mutex lock,
++ perform the requeuing here. */
++ if (ttask->state == GOMP_TARGET_TASK_FINISHED)
++ gomp_target_task_completion (team, child_task);
++ else
++ ttask->state = GOMP_TARGET_TASK_RUNNING;
++ child_task = NULL;
++ continue;
++ }
++ }
++ else
++ child_task->fn (child_task->fn_data);
+ thr->task = task;
+ }
+ else
+@@ -862,17 +1428,19 @@ GOMP_taskwait (void)
+ finish_cancelled:;
+ size_t new_tasks
+ = gomp_task_run_post_handle_depend (child_task, team);
+- child_task->prev_child->next_child = child_task->next_child;
+- child_task->next_child->prev_child = child_task->prev_child;
+- if (task->children == child_task)
+- {
+- if (child_task->next_child != child_task)
+- task->children = child_task->next_child;
+- else
+- task->children = NULL;
++
++ if (child_q)
++ {
++ priority_queue_remove (PQ_CHILDREN, &task->children_queue,
++ child_task, MEMMODEL_RELAXED);
++ child_task->pnode[PQ_CHILDREN].next = NULL;
++ child_task->pnode[PQ_CHILDREN].prev = NULL;
+ }
+- gomp_clear_parent (child_task->children);
++
++ gomp_clear_parent (&child_task->children_queue);
++
+ gomp_task_run_post_remove_taskgroup (child_task);
++
+ to_free = child_task;
+ child_task = NULL;
+ team->task_count--;
+@@ -887,10 +1455,20 @@ GOMP_taskwait (void)
+ }
+ }
+
+-/* This is like GOMP_taskwait, but we only wait for tasks that the
+- upcoming task depends on. */
++/* An undeferred task is about to run. Wait for all tasks that this
++ undeferred task depends on.
+
+-static void
++ This is done by first putting all known ready dependencies
++ (dependencies that have their own dependencies met) at the top of
++ the scheduling queues. Then we iterate through these imminently
++ ready tasks (and possibly other high priority tasks), and run them.
++ If we run out of ready dependencies to execute, we either wait for
++   the remaining dependencies to finish, or wait for them to get
++ scheduled so we can run them.
++
++ DEPEND is as in GOMP_task. */
++
++void
+ gomp_task_maybe_wait_for_dependencies (void **depend)
+ {
+ struct gomp_thread *thr = gomp_thread ();
+@@ -898,7 +1476,6 @@ gomp_task_maybe_wait_for_dependencies (v
+ struct gomp_team *team = thr->ts.team;
+ struct gomp_task_depend_entry elem, *ent = NULL;
+ struct gomp_taskwait taskwait;
+- struct gomp_task *last_parent_depends_on = NULL;
+ size_t ndepend = (uintptr_t) depend[0];
+ size_t nout = (uintptr_t) depend[1];
+ size_t i;
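The two loads above pin down the DEPEND vector's layout: slot 0 holds
the total number of dependence addresses, slot 1 the number of
out/inout entries, and the addresses follow with out/inout listed
first. A hand-built example of what the compiler would pass for one out
and two in dependences (constructed here purely for illustration):

    #include <stdint.h>

    int x, y, z;

    /* Roughly the DEPEND argument for
         #pragma omp task depend(out: x) depend(in: y, z)  */
    void
    build_depend_vector (void *depend[5])
    {
      depend[0] = (void *) (uintptr_t) 3;   /* ndepend */
      depend[1] = (void *) (uintptr_t) 1;   /* nout: out/inout first */
      depend[2] = &x;                       /* out: x */
      depend[3] = &y;                       /* in: y */
      depend[4] = &z;                       /* in: z */
    }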
+@@ -922,32 +1499,11 @@ gomp_task_maybe_wait_for_dependencies (v
+ {
+ tsk->parent_depends_on = true;
+ ++num_awaited;
++		      /* If dependency TSK itself has no dependencies and is
++ ready to run, move it up front so that we run it as
++ soon as possible. */
+ if (tsk->num_dependees == 0 && tsk->kind == GOMP_TASK_WAITING)
+- {
+- /* If a task we need to wait for is not already
+- running and is ready to be scheduled, move it
+- to front, so that we run it as soon as possible. */
+- if (last_parent_depends_on)
+- {
+- tsk->prev_child->next_child = tsk->next_child;
+- tsk->next_child->prev_child = tsk->prev_child;
+- tsk->prev_child = last_parent_depends_on;
+- tsk->next_child = last_parent_depends_on->next_child;
+- tsk->prev_child->next_child = tsk;
+- tsk->next_child->prev_child = tsk;
+- }
+- else if (tsk != task->children)
+- {
+- tsk->prev_child->next_child = tsk->next_child;
+- tsk->next_child->prev_child = tsk->prev_child;
+- tsk->prev_child = task->children;
+- tsk->next_child = task->children->next_child;
+- task->children = tsk;
+- tsk->prev_child->next_child = tsk;
+- tsk->next_child->prev_child = tsk;
+- }
+- last_parent_depends_on = tsk;
+- }
++ priority_queue_upgrade_task (tsk, task);
+ }
+ }
+ }
+@@ -959,7 +1515,6 @@ gomp_task_maybe_wait_for_dependencies (v
+
+ memset (&taskwait, 0, sizeof (taskwait));
+ taskwait.n_depend = num_awaited;
+- taskwait.last_parent_depends_on = last_parent_depends_on;
+ gomp_sem_init (&taskwait.taskwait_sem, 0);
+ task->taskwait = &taskwait;
+
+@@ -978,12 +1533,30 @@ gomp_task_maybe_wait_for_dependencies (v
+ gomp_sem_destroy (&taskwait.taskwait_sem);
+ return;
+ }
+- if (task->children->kind == GOMP_TASK_WAITING)
++
++ /* Theoretically when we have multiple priorities, we should
++	     choose between the highest priority item in
++ task->children_queue and team->task_queue here, so we should
++ use priority_queue_next_task(). However, since we are
++ running an undeferred task, perhaps that makes all tasks it
++ depends on undeferred, thus a priority of INF? This would
++ make it unnecessary to take anything into account here,
++	     except the dependencies.
++
++ On the other hand, if we want to use priority_queue_next_task(),
++ care should be taken to only use priority_queue_remove()
++ below if the task was actually removed from the children
++ queue. */
++ bool ignored;
++ struct gomp_task *next_task
++ = priority_queue_next_task (PQ_CHILDREN, &task->children_queue,
++ PQ_IGNORED, NULL, &ignored);
++
++ if (next_task->kind == GOMP_TASK_WAITING)
+ {
+- child_task = task->children;
++ child_task = next_task;
+ cancelled
+- = gomp_task_run_pre (child_task, task, child_task->taskgroup,
+- team);
++ = gomp_task_run_pre (child_task, task, team);
+ if (__builtin_expect (cancelled, 0))
+ {
+ if (to_free)
+@@ -996,8 +1569,10 @@ gomp_task_maybe_wait_for_dependencies (v
+ }
+ }
+ else
+- /* All tasks we are waiting for are already running
+- in other threads. Wait for them. */
++ /* All tasks we are waiting for are either running in other
++ threads, or they are tasks that have not had their
++ dependencies met (so they're not even in the queue). Wait
++ for them. */
+ taskwait.in_depend_wait = true;
+ gomp_mutex_unlock (&team->task_lock);
+ if (do_wake)
+@@ -1014,7 +1589,28 @@ gomp_task_maybe_wait_for_dependencies (v
+ if (child_task)
+ {
+ thr->task = child_task;
+- child_task->fn (child_task->fn_data);
++ if (__builtin_expect (child_task->fn == NULL, 0))
++ {
++ if (gomp_target_task_fn (child_task->fn_data))
++ {
++ thr->task = task;
++ gomp_mutex_lock (&team->task_lock);
++ child_task->kind = GOMP_TASK_ASYNC_RUNNING;
++ struct gomp_target_task *ttask
++ = (struct gomp_target_task *) child_task->fn_data;
++ /* If GOMP_PLUGIN_target_task_completion has run already
++ in between gomp_target_task_fn and the mutex lock,
++ perform the requeuing here. */
++ if (ttask->state == GOMP_TARGET_TASK_FINISHED)
++ gomp_target_task_completion (team, child_task);
++ else
++ ttask->state = GOMP_TARGET_TASK_RUNNING;
++ child_task = NULL;
++ continue;
++ }
++ }
++ else
++ child_task->fn (child_task->fn_data);
+ thr->task = task;
+ }
+ else
+@@ -1027,16 +1623,13 @@ gomp_task_maybe_wait_for_dependencies (v
+ = gomp_task_run_post_handle_depend (child_task, team);
+ if (child_task->parent_depends_on)
+ --taskwait.n_depend;
+- child_task->prev_child->next_child = child_task->next_child;
+- child_task->next_child->prev_child = child_task->prev_child;
+- if (task->children == child_task)
+- {
+- if (child_task->next_child != child_task)
+- task->children = child_task->next_child;
+- else
+- task->children = NULL;
+- }
+- gomp_clear_parent (child_task->children);
++
++ priority_queue_remove (PQ_CHILDREN, &task->children_queue,
++ child_task, MEMMODEL_RELAXED);
++ child_task->pnode[PQ_CHILDREN].next = NULL;
++ child_task->pnode[PQ_CHILDREN].prev = NULL;
++
++ gomp_clear_parent (&child_task->children_queue);
+ gomp_task_run_post_remove_taskgroup (child_task);
+ to_free = child_task;
+ child_task = NULL;
+@@ -1069,14 +1662,14 @@ GOMP_taskgroup_start (void)
+ struct gomp_taskgroup *taskgroup;
+
+ /* If team is NULL, all tasks are executed as
+- GOMP_TASK_IFFALSE tasks and thus all children tasks of
++ GOMP_TASK_UNDEFERRED tasks and thus all children tasks of
+ taskgroup and their descendant tasks will be finished
+ by the time GOMP_taskgroup_end is called. */
+ if (team == NULL)
+ return;
+ taskgroup = gomp_malloc (sizeof (struct gomp_taskgroup));
+ taskgroup->prev = task->taskgroup;
+- taskgroup->children = NULL;
++ priority_queue_init (&taskgroup->taskgroup_queue);
+ taskgroup->in_taskgroup_wait = false;
+ taskgroup->cancelled = false;
+ taskgroup->num_children = 0;
+@@ -1098,6 +1691,17 @@ GOMP_taskgroup_end (void)
+ if (team == NULL)
+ return;
+ taskgroup = task->taskgroup;
++ if (__builtin_expect (taskgroup == NULL, 0)
++ && thr->ts.level == 0)
++ {
++ /* This can happen if GOMP_taskgroup_start is called when
++ thr->ts.team == NULL, but inside of the taskgroup there
++ is #pragma omp target nowait that creates an implicit
++ team with a single thread. In this case, we want to wait
++ for all outstanding tasks in this team. */
++ gomp_team_barrier_wait (&team->barrier);
++ return;
++ }
+
+ /* The acquire barrier on load of taskgroup->num_children here
+ synchronizes with the write of 0 in gomp_task_run_post_remove_taskgroup.
+@@ -1108,19 +1712,25 @@ GOMP_taskgroup_end (void)
+ if (__atomic_load_n (&taskgroup->num_children, MEMMODEL_ACQUIRE) == 0)
+ goto finish;
+
++ bool unused;
+ gomp_mutex_lock (&team->task_lock);
+ while (1)
+ {
+ bool cancelled = false;
+- if (taskgroup->children == NULL)
++ if (priority_queue_empty_p (&taskgroup->taskgroup_queue,
++ MEMMODEL_RELAXED))
+ {
+ if (taskgroup->num_children)
+ {
+- if (task->children == NULL)
++ if (priority_queue_empty_p (&task->children_queue,
++ MEMMODEL_RELAXED))
+ goto do_wait;
+- child_task = task->children;
+- }
+- else
++ child_task
++ = priority_queue_next_task (PQ_CHILDREN, &task->children_queue,
++ PQ_TEAM, &team->task_queue,
++ &unused);
++ }
++ else
+ {
+ gomp_mutex_unlock (&team->task_lock);
+ if (to_free)
+@@ -1132,12 +1742,13 @@ GOMP_taskgroup_end (void)
+ }
+ }
+ else
+- child_task = taskgroup->children;
++ child_task
++ = priority_queue_next_task (PQ_TASKGROUP, &taskgroup->taskgroup_queue,
++ PQ_TEAM, &team->task_queue, &unused);
+ if (child_task->kind == GOMP_TASK_WAITING)
+ {
+ cancelled
+- = gomp_task_run_pre (child_task, child_task->parent, taskgroup,
+- team);
++ = gomp_task_run_pre (child_task, child_task->parent, team);
+ if (__builtin_expect (cancelled, 0))
+ {
+ if (to_free)
+@@ -1153,8 +1764,10 @@ GOMP_taskgroup_end (void)
+ {
+ child_task = NULL;
+ do_wait:
+- /* All tasks we are waiting for are already running
+- in other threads. Wait for them. */
++ /* All tasks we are waiting for are either running in other
++ threads, or they are tasks that have not had their
++ dependencies met (so they're not even in the queue). Wait
++ for them. */
+ taskgroup->in_taskgroup_wait = true;
+ }
+ gomp_mutex_unlock (&team->task_lock);
+@@ -1172,7 +1785,28 @@ GOMP_taskgroup_end (void)
+ if (child_task)
+ {
+ thr->task = child_task;
+- child_task->fn (child_task->fn_data);
++ if (__builtin_expect (child_task->fn == NULL, 0))
++ {
++ if (gomp_target_task_fn (child_task->fn_data))
++ {
++ thr->task = task;
++ gomp_mutex_lock (&team->task_lock);
++ child_task->kind = GOMP_TASK_ASYNC_RUNNING;
++ struct gomp_target_task *ttask
++ = (struct gomp_target_task *) child_task->fn_data;
++ /* If GOMP_PLUGIN_target_task_completion has run already
++ in between gomp_target_task_fn and the mutex lock,
++ perform the requeuing here. */
++ if (ttask->state == GOMP_TARGET_TASK_FINISHED)
++ gomp_target_task_completion (team, child_task);
++ else
++ ttask->state = GOMP_TARGET_TASK_RUNNING;
++ child_task = NULL;
++ continue;
++ }
++ }
++ else
++ child_task->fn (child_task->fn_data);
+ thr->task = task;
+ }
+ else
+@@ -1184,7 +1818,7 @@ GOMP_taskgroup_end (void)
+ size_t new_tasks
+ = gomp_task_run_post_handle_depend (child_task, team);
+ gomp_task_run_post_remove_parent (child_task);
+- gomp_clear_parent (child_task->children);
++ gomp_clear_parent (&child_task->children_queue);
+ gomp_task_run_post_remove_taskgroup (child_task);
+ to_free = child_task;
+ child_task = NULL;
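The fn == NULL branch added in all four task-running loops above has
the same shape: start the asynchronous part, re-take task_lock, then
re-check whether GOMP_PLUGIN_target_task_completion already fired in
the window. A self-contained miniature of that race, modeled with
pthreads stand-ins rather than libgomp's own types:

    #include <pthread.h>
    #include <stdio.h>

    enum state { READY, RUNNING, FINISHED };
    static enum state st = READY;
    static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;

    /* Completion callback: may fire any time after the async op starts.  */
    static void *
    completion (void *arg)
    {
      pthread_mutex_lock (&lock);
      if (st == RUNNING)
        puts ("completion won the race: it requeues the task");
      else
        st = FINISHED;   /* starter not re-locked yet; let it requeue */
      pthread_mutex_unlock (&lock);
      return arg;
    }

    int
    main (void)
    {
      pthread_t tid;
      pthread_create (&tid, NULL, completion, NULL); /* async part begins */
      pthread_mutex_lock (&lock);
      if (st == FINISHED)
        puts ("starter saw FINISHED: it requeues the task itself");
      else
        st = RUNNING;    /* completion will requeue when it fires */
      pthread_mutex_unlock (&lock);
      pthread_join (tid, NULL);
      return 0;
    }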
+--- libgomp/libgomp_g.h.jj 2014-05-15 10:56:31.429532978 +0200
++++ libgomp/libgomp_g.h 2016-07-13 16:57:04.422535521 +0200
+@@ -29,6 +29,7 @@
+ #define LIBGOMP_G_H 1
+
+ #include <stdbool.h>
++#include <stddef.h>
+
+ /* barrier.c */
+
+@@ -50,6 +51,10 @@ extern bool GOMP_loop_static_start (long
+ extern bool GOMP_loop_dynamic_start (long, long, long, long, long *, long *);
+ extern bool GOMP_loop_guided_start (long, long, long, long, long *, long *);
+ extern bool GOMP_loop_runtime_start (long, long, long, long *, long *);
++extern bool GOMP_loop_nonmonotonic_dynamic_start (long, long, long, long,
++ long *, long *);
++extern bool GOMP_loop_nonmonotonic_guided_start (long, long, long, long,
++ long *, long *);
+
+ extern bool GOMP_loop_ordered_static_start (long, long, long, long,
+ long *, long *);
+@@ -63,12 +68,23 @@ extern bool GOMP_loop_static_next (long
+ extern bool GOMP_loop_dynamic_next (long *, long *);
+ extern bool GOMP_loop_guided_next (long *, long *);
+ extern bool GOMP_loop_runtime_next (long *, long *);
++extern bool GOMP_loop_nonmonotonic_dynamic_next (long *, long *);
++extern bool GOMP_loop_nonmonotonic_guided_next (long *, long *);
+
+ extern bool GOMP_loop_ordered_static_next (long *, long *);
+ extern bool GOMP_loop_ordered_dynamic_next (long *, long *);
+ extern bool GOMP_loop_ordered_guided_next (long *, long *);
+ extern bool GOMP_loop_ordered_runtime_next (long *, long *);
+
++extern bool GOMP_loop_doacross_static_start (unsigned, long *, long, long *,
++ long *);
++extern bool GOMP_loop_doacross_dynamic_start (unsigned, long *, long, long *,
++ long *);
++extern bool GOMP_loop_doacross_guided_start (unsigned, long *, long, long *,
++ long *);
++extern bool GOMP_loop_doacross_runtime_start (unsigned, long *, long *,
++ long *);
++
+ extern void GOMP_parallel_loop_static_start (void (*)(void *), void *,
+ unsigned, long, long, long, long);
+ extern void GOMP_parallel_loop_dynamic_start (void (*)(void *), void *,
+@@ -89,6 +105,12 @@ extern void GOMP_parallel_loop_guided (v
+ extern void GOMP_parallel_loop_runtime (void (*)(void *), void *,
+ unsigned, long, long, long,
+ unsigned);
++extern void GOMP_parallel_loop_nonmonotonic_dynamic (void (*)(void *), void *,
++ unsigned, long, long,
++ long, long, unsigned);
++extern void GOMP_parallel_loop_nonmonotonic_guided (void (*)(void *), void *,
++ unsigned, long, long,
++ long, long, unsigned);
+
+ extern void GOMP_loop_end (void);
+ extern void GOMP_loop_end_nowait (void);
+@@ -119,6 +141,18 @@ extern bool GOMP_loop_ull_runtime_start
+ unsigned long long,
+ unsigned long long *,
+ unsigned long long *);
++extern bool GOMP_loop_ull_nonmonotonic_dynamic_start (bool, unsigned long long,
++ unsigned long long,
++ unsigned long long,
++ unsigned long long,
++ unsigned long long *,
++ unsigned long long *);
++extern bool GOMP_loop_ull_nonmonotonic_guided_start (bool, unsigned long long,
++ unsigned long long,
++ unsigned long long,
++ unsigned long long,
++ unsigned long long *,
++ unsigned long long *);
+
+ extern bool GOMP_loop_ull_ordered_static_start (bool, unsigned long long,
+ unsigned long long,
+@@ -152,6 +186,10 @@ extern bool GOMP_loop_ull_guided_next (u
+ unsigned long long *);
+ extern bool GOMP_loop_ull_runtime_next (unsigned long long *,
+ unsigned long long *);
++extern bool GOMP_loop_ull_nonmonotonic_dynamic_next (unsigned long long *,
++ unsigned long long *);
++extern bool GOMP_loop_ull_nonmonotonic_guided_next (unsigned long long *,
++ unsigned long long *);
+
+ extern bool GOMP_loop_ull_ordered_static_next (unsigned long long *,
+ unsigned long long *);
+@@ -162,10 +200,34 @@ extern bool GOMP_loop_ull_ordered_guided
+ extern bool GOMP_loop_ull_ordered_runtime_next (unsigned long long *,
+ unsigned long long *);
+
++extern bool GOMP_loop_ull_doacross_static_start (unsigned,
++ unsigned long long *,
++ unsigned long long,
++ unsigned long long *,
++ unsigned long long *);
++extern bool GOMP_loop_ull_doacross_dynamic_start (unsigned,
++ unsigned long long *,
++ unsigned long long,
++ unsigned long long *,
++ unsigned long long *);
++extern bool GOMP_loop_ull_doacross_guided_start (unsigned,
++ unsigned long long *,
++ unsigned long long,
++ unsigned long long *,
++ unsigned long long *);
++extern bool GOMP_loop_ull_doacross_runtime_start (unsigned,
++ unsigned long long *,
++ unsigned long long *,
++ unsigned long long *);
++
+ /* ordered.c */
+
+ extern void GOMP_ordered_start (void);
+ extern void GOMP_ordered_end (void);
++extern void GOMP_doacross_post (long *);
++extern void GOMP_doacross_wait (long, ...);
++extern void GOMP_doacross_ull_post (unsigned long long *);
++extern void GOMP_doacross_ull_wait (unsigned long long, ...);
+
+ /* parallel.c */
+
+@@ -178,7 +240,15 @@ extern bool GOMP_cancellation_point (int
+ /* task.c */
+
+ extern void GOMP_task (void (*) (void *), void *, void (*) (void *, void *),
+- long, long, bool, unsigned, void **);
++ long, long, bool, unsigned, void **, int);
++extern void GOMP_taskloop (void (*) (void *), void *,
++ void (*) (void *, void *), long, long, unsigned,
++ unsigned long, int, long, long, long);
++extern void GOMP_taskloop_ull (void (*) (void *), void *,
++ void (*) (void *, void *), long, long,
++ unsigned, unsigned long, int,
++ unsigned long long, unsigned long long,
++ unsigned long long);
+ extern void GOMP_taskwait (void);
+ extern void GOMP_taskyield (void);
+ extern void GOMP_taskgroup_start (void);
+@@ -206,11 +276,38 @@ extern void GOMP_single_copy_end (void *
+
+ extern void GOMP_target (int, void (*) (void *), const void *,
+ size_t, void **, size_t *, unsigned char *);
++extern void GOMP_target_ext (int, void (*) (void *), size_t, void **, size_t *,
++ unsigned short *, unsigned int, void **, void **);
+ extern void GOMP_target_data (int, const void *,
+ size_t, void **, size_t *, unsigned char *);
++extern void GOMP_target_data_ext (int, size_t, void **, size_t *,
++ unsigned short *);
+ extern void GOMP_target_end_data (void);
+ extern void GOMP_target_update (int, const void *,
+ size_t, void **, size_t *, unsigned char *);
++extern void GOMP_target_update_ext (int, size_t, void **, size_t *,
++ unsigned short *, unsigned int, void **);
++extern void GOMP_target_enter_exit_data (int, size_t, void **, size_t *,
++ unsigned short *, unsigned int,
++ void **);
+ extern void GOMP_teams (unsigned int, unsigned int);
+
++/* oacc-parallel.c */
++
++extern void GOACC_parallel_keyed (int, void (*) (void *), size_t,
++ void **, size_t *, unsigned short *, ...);
++extern void GOACC_parallel (int, void (*) (void *), size_t, void **, size_t *,
++ unsigned short *, int, int, int, int, int, ...);
++extern void GOACC_data_start (int, size_t, void **, size_t *,
++ unsigned short *);
++extern void GOACC_data_end (void);
++extern void GOACC_enter_exit_data (int, size_t, void **,
++ size_t *, unsigned short *, int, int, ...);
++extern void GOACC_update (int, size_t, void **, size_t *,
++ unsigned short *, int, int, ...);
++extern void GOACC_wait (int, int, ...);
++extern int GOACC_get_num_threads (void);
++extern int GOACC_get_thread_num (void);
++extern void GOACC_declare (int, size_t, void **, size_t *, unsigned short *);
++
+ #endif /* LIBGOMP_G_H */
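The GOMP_doacross_post/GOMP_doacross_wait entry points declared above
back OpenMP 4.5 ordered(depend) loops; the compiler emits the calls
during lowering. A minimal source-level example of the construct they
implement (the lowering itself is compiler-internal):

    #include <stdio.h>

    int
    main (void)
    {
      int a[100];
      a[0] = 0;
    #pragma omp parallel for ordered(1)
      for (int i = 1; i < 100; i++)
        {
    #pragma omp ordered depend(sink: i - 1)  /* lowers to GOMP_doacross_wait */
          a[i] = a[i - 1] + 1;
    #pragma omp ordered depend(source)       /* lowers to GOMP_doacross_post */
        }
      printf ("%d\n", a[99]);   /* prints 99 */
      return 0;
    }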
+--- libgomp/libgomp.h.jj 2014-08-01 15:59:49.145188127 +0200
++++ libgomp/libgomp.h 2016-07-14 17:40:24.038243456 +0200
+@@ -34,12 +34,35 @@
+ #ifndef LIBGOMP_H
+ #define LIBGOMP_H 1
+
++#ifndef _LIBGOMP_CHECKING_
++/* Define to 1 to perform internal sanity checks. */
++#define _LIBGOMP_CHECKING_ 0
++#endif
++
+ #include "config.h"
+ #include "gstdint.h"
++#include "libgomp-plugin.h"
+
+ #include <pthread.h>
+ #include <stdbool.h>
+ #include <stdlib.h>
++#include <stdarg.h>
++
++/* Needed for memset in priority_queue.c. */
++#if _LIBGOMP_CHECKING_
++# ifdef STRING_WITH_STRINGS
++# include <string.h>
++# include <strings.h>
++# else
++# ifdef HAVE_STRING_H
++# include <string.h>
++# else
++# ifdef HAVE_STRINGS_H
++# include <strings.h>
++# endif
++# endif
++# endif
++#endif
+
+ #ifdef HAVE_ATTRIBUTE_VISIBILITY
+ # pragma GCC visibility push(hidden)
+@@ -56,6 +79,44 @@ enum memmodel
+ MEMMODEL_SEQ_CST = 5
+ };
+
++/* alloc.c */
++
++extern void *gomp_malloc (size_t) __attribute__((malloc));
++extern void *gomp_malloc_cleared (size_t) __attribute__((malloc));
++extern void *gomp_realloc (void *, size_t);
++
++/* Avoid conflicting prototypes of alloca() in system headers by using
++ GCC's builtin alloca(). */
++#define gomp_alloca(x) __builtin_alloca(x)
++
++/* error.c */
++
++extern void gomp_vdebug (int, const char *, va_list);
++extern void gomp_debug (int, const char *, ...)
++ __attribute__ ((format (printf, 2, 3)));
++#define gomp_vdebug(KIND, FMT, VALIST) \
++ do { \
++ if (__builtin_expect (gomp_debug_var, 0)) \
++ (gomp_vdebug) ((KIND), (FMT), (VALIST)); \
++ } while (0)
++#define gomp_debug(KIND, ...) \
++ do { \
++ if (__builtin_expect (gomp_debug_var, 0)) \
++ (gomp_debug) ((KIND), __VA_ARGS__); \
++ } while (0)
++extern void gomp_verror (const char *, va_list);
++extern void gomp_error (const char *, ...)
++ __attribute__ ((format (printf, 1, 2)));
++extern void gomp_vfatal (const char *, va_list)
++ __attribute__ ((noreturn));
++extern void gomp_fatal (const char *, ...)
++ __attribute__ ((noreturn, format (printf, 1, 2)));
++
++struct gomp_task;
++struct gomp_taskgroup;
++struct htab;
++
++#include "priority_queue.h"
+ #include "sem.h"
+ #include "mutex.h"
+ #include "bar.h"
+@@ -74,6 +135,44 @@ enum gomp_schedule_type
+ GFS_AUTO
+ };
+
++struct gomp_doacross_work_share
++{
++ union {
++ /* chunk_size copy, as ws->chunk_size is multiplied by incr for
++ GFS_DYNAMIC. */
++ long chunk_size;
++ /* Likewise, but for ull implementation. */
++ unsigned long long chunk_size_ull;
++ /* For schedule(static,0) this is the number
++ of iterations assigned to the last thread, i.e. number of
++ iterations / number of threads. */
++ long q;
++ /* Likewise, but for ull implementation. */
++ unsigned long long q_ull;
++ };
++ /* Size of each array entry (padded to cache line size). */
++ unsigned long elt_sz;
++ /* Number of dimensions in sink vectors. */
++ unsigned int ncounts;
++ /* True if the iterations can be flattened. */
++ bool flattened;
++ /* Actual array (of elt_sz sized units), aligned to cache line size.
++ This is indexed by team_id for GFS_STATIC and outermost iteration
++ / chunk_size for other schedules. */
++ unsigned char *array;
++ /* These two are only used for schedule(static,0). */
++ /* This one is number of iterations % number of threads. */
++ long t;
++ union {
++ /* And this one is cached t * (q + 1). */
++ long boundary;
++ /* Likewise, but for the ull implementation. */
++ unsigned long long boundary_ull;
++ };
++ /* Array of shift counts for each dimension if they can be flattened. */
++ unsigned int shift_counts[];
++};
++
+ struct gomp_work_share
+ {
+ /* This member records the SCHEDULE clause to be used for this construct.
+@@ -105,13 +204,18 @@ struct gomp_work_share
+ };
+ };
+
+- /* This is a circular queue that details which threads will be allowed
+- into the ordered region and in which order. When a thread allocates
+- iterations on which it is going to work, it also registers itself at
+- the end of the array. When a thread reaches the ordered region, it
+- checks to see if it is the one at the head of the queue. If not, it
+- blocks on its RELEASE semaphore. */
+- unsigned *ordered_team_ids;
++ union {
++ /* This is a circular queue that details which threads will be allowed
++ into the ordered region and in which order. When a thread allocates
++ iterations on which it is going to work, it also registers itself at
++ the end of the array. When a thread reaches the ordered region, it
++ checks to see if it is the one at the head of the queue. If not, it
++ blocks on its RELEASE semaphore. */
++ unsigned *ordered_team_ids;
++
++ /* This is a pointer to DOACROSS work share data. */
++ struct gomp_doacross_work_share *doacross;
++ };
+
+ /* This is the number of threads that have registered themselves in
+ the circular queue ordered_team_ids. */
+@@ -230,7 +334,7 @@ struct gomp_task_icv
+ {
+ unsigned long nthreads_var;
+ enum gomp_schedule_type run_sched_var;
+- int run_sched_modifier;
++ int run_sched_chunk_size;
+ int default_device_var;
+ unsigned int thread_limit_var;
+ bool dyn_var;
+@@ -246,6 +350,7 @@ extern gomp_mutex_t gomp_managed_threads
+ #endif
+ extern unsigned long gomp_max_active_levels_var;
+ extern bool gomp_cancel_var;
++extern int gomp_max_task_priority_var;
+ extern unsigned long long gomp_spin_count_var, gomp_throttled_spin_count_var;
+ extern unsigned long gomp_available_cpus, gomp_managed_threads;
+ extern unsigned long *gomp_nthreads_var_list, gomp_nthreads_var_list_len;
+@@ -253,25 +358,36 @@ extern char *gomp_bind_var_list;
+ extern unsigned long gomp_bind_var_list_len;
+ extern void **gomp_places_list;
+ extern unsigned long gomp_places_list_len;
++extern int gomp_debug_var;
++extern int goacc_device_num;
++extern char *goacc_device_type;
+
+ enum gomp_task_kind
+ {
++ /* Implicit task. */
+ GOMP_TASK_IMPLICIT,
+- GOMP_TASK_IFFALSE,
++ /* Undeferred task. */
++ GOMP_TASK_UNDEFERRED,
++ /* Task created by GOMP_task and waiting to be run. */
+ GOMP_TASK_WAITING,
+- GOMP_TASK_TIED
++ /* Task currently executing or scheduled and about to execute. */
++ GOMP_TASK_TIED,
++ /* Used for target tasks that have vars mapped and async run started,
++     but not yet completed.  Once that completes, they will be re-added
++ into the queues as GOMP_TASK_WAITING in order to perform the var
++ unmapping. */
++ GOMP_TASK_ASYNC_RUNNING
+ };
+
+-struct gomp_task;
+-struct gomp_taskgroup;
+-struct htab;
+-
+ struct gomp_task_depend_entry
+ {
++ /* Address of dependency. */
+ void *addr;
+ struct gomp_task_depend_entry *next;
+ struct gomp_task_depend_entry *prev;
++ /* Task that provides the dependency in ADDR. */
+ struct gomp_task *task;
++ /* Depend entry is of type "IN". */
+ bool is_in;
+ bool redundant;
+ bool redundant_out;
+@@ -290,8 +406,8 @@ struct gomp_taskwait
+ {
+ bool in_taskwait;
+ bool in_depend_wait;
++ /* Number of tasks we are waiting for. */
+ size_t n_depend;
+- struct gomp_task *last_parent_depends_on;
+ gomp_sem_t taskwait_sem;
+ };
+
+@@ -299,20 +415,31 @@ struct gomp_taskwait
+
+ struct gomp_task
+ {
++ /* Parent of this task. */
+ struct gomp_task *parent;
+- struct gomp_task *children;
+- struct gomp_task *next_child;
+- struct gomp_task *prev_child;
+- struct gomp_task *next_queue;
+- struct gomp_task *prev_queue;
+- struct gomp_task *next_taskgroup;
+- struct gomp_task *prev_taskgroup;
++ /* Children of this task. */
++ struct priority_queue children_queue;
++ /* Taskgroup this task belongs in. */
+ struct gomp_taskgroup *taskgroup;
++ /* Tasks that depend on this task. */
+ struct gomp_dependers_vec *dependers;
+ struct htab *depend_hash;
+ struct gomp_taskwait *taskwait;
++ /* Number of items in DEPEND. */
+ size_t depend_count;
++ /* Number of tasks this task depends on. Once this counter reaches
++ 0, we have no unsatisfied dependencies, and this task can be put
++ into the various queues to be scheduled. */
+ size_t num_dependees;
++
++ /* Priority of this task. */
++ int priority;
++ /* The priority node for this task in each of the different queues.
++ We put this here to avoid allocating space for each priority
++ node. Then we play offsetof() games to convert between pnode[]
++ entries and the gomp_task in which they reside. */
++ struct priority_node pnode[3];
++
+ struct gomp_task_icv icv;
+ void (*fn) (void *);
+ void *fn_data;
+@@ -320,20 +447,58 @@ struct gomp_task
+ bool in_tied_task;
+ bool final_task;
+ bool copy_ctors_done;
++ /* Set for undeferred tasks with unsatisfied dependencies which
++ block further execution of their parent until the dependencies
++ are satisfied. */
+ bool parent_depends_on;
++ /* Dependencies provided and/or needed for this task. DEPEND_COUNT
++ is the number of items available. */
+ struct gomp_task_depend_entry depend[];
+ };
+
++/* This structure describes a single #pragma omp taskgroup. */
++
+ struct gomp_taskgroup
+ {
+ struct gomp_taskgroup *prev;
+- struct gomp_task *children;
++ /* Queue of tasks that belong in this taskgroup. */
++ struct priority_queue taskgroup_queue;
+ bool in_taskgroup_wait;
+ bool cancelled;
+ gomp_sem_t taskgroup_sem;
+ size_t num_children;
+ };
+
++/* Various states of OpenMP async offloading tasks. */
++enum gomp_target_task_state
++{
++ GOMP_TARGET_TASK_DATA,
++ GOMP_TARGET_TASK_BEFORE_MAP,
++ GOMP_TARGET_TASK_FALLBACK,
++ GOMP_TARGET_TASK_READY_TO_RUN,
++ GOMP_TARGET_TASK_RUNNING,
++ GOMP_TARGET_TASK_FINISHED
++};
++
++/* This structure describes a target task. */
++
++struct gomp_target_task
++{
++ struct gomp_device_descr *devicep;
++ void (*fn) (void *);
++ size_t mapnum;
++ size_t *sizes;
++ unsigned short *kinds;
++ unsigned int flags;
++ enum gomp_target_task_state state;
++ struct target_mem_desc *tgt;
++ struct gomp_task *task;
++ struct gomp_team *team;
++ /* Device-specific target arguments. */
++ void **args;
++ void *hostaddrs[];
++};
++
+ /* This structure describes a "team" of threads. These are the threads
+ that are spawned by a PARALLEL constructs, as well as the work sharing
+ constructs that the team encounters. */
+@@ -396,7 +561,8 @@ struct gomp_team
+ struct gomp_work_share work_shares[8];
+
+ gomp_mutex_t task_lock;
+- struct gomp_task *task_queue;
++ /* Scheduled tasks. */
++ struct priority_queue task_queue;
+ /* Number of all GOMP_TASK_{WAITING,TIED} tasks in the team. */
+ unsigned int task_count;
+ /* Number of GOMP_TASK_WAITING tasks currently waiting to be scheduled. */
+@@ -451,6 +617,9 @@ struct gomp_thread_pool
+ struct gomp_thread **threads;
+ unsigned threads_size;
+ unsigned threads_used;
++ /* The last team is used for non-nested teams to delay their destruction to
++ make sure all the threads in the team move on to the pool's barrier before
++ the team's barrier is destroyed. */
+ struct gomp_team *last_team;
+ /* Number of threads running in this contention group. */
+ unsigned long threads_busy;
+@@ -519,23 +688,7 @@ extern bool gomp_affinity_same_place (vo
+ extern bool gomp_affinity_finalize_place_list (bool);
+ extern bool gomp_affinity_init_level (int, unsigned long, bool);
+ extern void gomp_affinity_print_place (void *);
+-
+-/* alloc.c */
+-
+-extern void *gomp_malloc (size_t) __attribute__((malloc));
+-extern void *gomp_malloc_cleared (size_t) __attribute__((malloc));
+-extern void *gomp_realloc (void *, size_t);
+-
+-/* Avoid conflicting prototypes of alloca() in system headers by using
+- GCC's builtin alloca(). */
+-#define gomp_alloca(x) __builtin_alloca(x)
+-
+-/* error.c */
+-
+-extern void gomp_error (const char *, ...)
+- __attribute__((format (printf, 1, 2)));
+-extern void gomp_fatal (const char *, ...)
+- __attribute__((noreturn, format (printf, 1, 2)));
++extern void gomp_get_place_proc_ids_8 (int, int64_t *);
+
+ /* iter.c */
+
+@@ -572,6 +725,9 @@ extern void gomp_ordered_next (void);
+ extern void gomp_ordered_static_init (void);
+ extern void gomp_ordered_static_next (void);
+ extern void gomp_ordered_sync (void);
++extern void gomp_doacross_init (unsigned, long *, long);
++extern void gomp_doacross_ull_init (unsigned, unsigned long long *,
++ unsigned long long);
+
+ /* parallel.c */
+
+@@ -588,6 +744,12 @@ extern void gomp_init_task (struct gomp_
+ struct gomp_task_icv *);
+ extern void gomp_end_task (void);
+ extern void gomp_barrier_handle_tasks (gomp_barrier_state_t);
++extern void gomp_task_maybe_wait_for_dependencies (void **);
++extern bool gomp_create_target_task (struct gomp_device_descr *,
++ void (*) (void *), size_t, void **,
++ size_t *, unsigned short *, unsigned int,
++ void **, void **,
++ enum gomp_target_task_state);
+
+ static void inline
+ gomp_finish_task (struct gomp_task *task)
+@@ -606,7 +768,213 @@ extern void gomp_free_thread (void *);
+
+ /* target.c */
+
++extern void gomp_init_targets_once (void);
+ extern int gomp_get_num_devices (void);
++extern bool gomp_target_task_fn (void *);
++
++/* Splay tree definitions. */
++typedef struct splay_tree_node_s *splay_tree_node;
++typedef struct splay_tree_s *splay_tree;
++typedef struct splay_tree_key_s *splay_tree_key;
++
++struct target_var_desc {
++ /* Splay key. */
++ splay_tree_key key;
++ /* True if data should be copied from device to host at the end. */
++ bool copy_from;
++ /* True if data always should be copied from device to host at the end. */
++ bool always_copy_from;
++ /* Relative offset against key host_start. */
++ uintptr_t offset;
++ /* Actual length. */
++ uintptr_t length;
++};
++
++struct target_mem_desc {
++ /* Reference count. */
++ uintptr_t refcount;
++ /* All the splay nodes allocated together. */
++ splay_tree_node array;
++ /* Start of the target region. */
++ uintptr_t tgt_start;
++ /* End of the targer region. */
++  /* End of the target region. */
++ /* Handle to free. */
++ void *to_free;
++ /* Previous target_mem_desc. */
++ struct target_mem_desc *prev;
++ /* Number of items in following list. */
++ size_t list_count;
++
++ /* Corresponding target device descriptor. */
++ struct gomp_device_descr *device_descr;
++
++ /* List of target items to remove (or decrease refcount)
++ at the end of region. */
++ struct target_var_desc list[];
++};
++
++/* Special value for refcount - infinity. */
++#define REFCOUNT_INFINITY (~(uintptr_t) 0)
++/* Special value for refcount - tgt_offset contains target address of the
++ artificial pointer to "omp declare target link" object. */
++#define REFCOUNT_LINK (~(uintptr_t) 1)
++
++struct splay_tree_key_s {
++ /* Address of the host object. */
++ uintptr_t host_start;
++ /* Address immediately after the host object. */
++ uintptr_t host_end;
++ /* Descriptor of the target memory. */
++ struct target_mem_desc *tgt;
++ /* Offset from tgt->tgt_start to the start of the target object. */
++ uintptr_t tgt_offset;
++ /* Reference count. */
++ uintptr_t refcount;
++ /* Pointer to the original mapping of "omp declare target link" object. */
++ splay_tree_key link_key;
++};
++
++/* The comparison function. */
++
++static inline int
++splay_compare (splay_tree_key x, splay_tree_key y)
++{
++ if (x->host_start == x->host_end
++ && y->host_start == y->host_end)
++ return 0;
++ if (x->host_end <= y->host_start)
++ return -1;
++ if (x->host_start >= y->host_end)
++ return 1;
++ return 0;
++}
++
++#include "splay-tree.h"
++
++typedef struct acc_dispatch_t
++{
++ /* This is a linked list of data mapped using the
++ acc_map_data/acc_unmap_data or "acc enter data"/"acc exit data" pragmas.
++ Unlike mapped_data in the goacc_thread struct, unmapping can
++ happen out-of-order with respect to mapping. */
++ /* This is guarded by the lock in the "outer" struct gomp_device_descr. */
++ struct target_mem_desc *data_environ;
++
++ /* Execute. */
++ void (*exec_func) (void (*) (void *), size_t, void **, void **, int,
++ unsigned *, void *);
++
++ /* Async cleanup callback registration. */
++ void (*register_async_cleanup_func) (void *, int);
++
++ /* Asynchronous routines. */
++ int (*async_test_func) (int);
++ int (*async_test_all_func) (void);
++ void (*async_wait_func) (int);
++ void (*async_wait_async_func) (int, int);
++ void (*async_wait_all_func) (void);
++ void (*async_wait_all_async_func) (int);
++ void (*async_set_async_func) (int);
++
++ /* Create/destroy TLS data. */
++ void *(*create_thread_data_func) (int);
++ void (*destroy_thread_data_func) (void *);
++
++ /* NVIDIA target specific routines. */
++ struct {
++ void *(*get_current_device_func) (void);
++ void *(*get_current_context_func) (void);
++ void *(*get_stream_func) (int);
++ int (*set_stream_func) (int, void *);
++ } cuda;
++} acc_dispatch_t;
++
++/* Various states of the accelerator device. */
++enum gomp_device_state
++{
++ GOMP_DEVICE_UNINITIALIZED,
++ GOMP_DEVICE_INITIALIZED,
++ GOMP_DEVICE_FINALIZED
++};
++
++/* This structure describes an accelerator device.
++   It contains the name of the corresponding libgomp plugin, function
++   handlers for interaction with the device, the ID number of the device,
++   and information about mapped memory. */
++struct gomp_device_descr
++{
++ /* Immutable data, which is only set during initialization, and which is not
++ guarded by the lock. */
++
++ /* The name of the device. */
++ const char *name;
++
++ /* Capabilities of device (supports OpenACC, OpenMP). */
++ unsigned int capabilities;
++
++ /* This is the ID number of device among devices of the same type. */
++ int target_id;
++
++ /* This is the TYPE of device. */
++ enum offload_target_type type;
++
++ /* Function handlers. */
++ const char *(*get_name_func) (void);
++ unsigned int (*get_caps_func) (void);
++ int (*get_type_func) (void);
++ int (*get_num_devices_func) (void);
++ bool (*init_device_func) (int);
++ bool (*fini_device_func) (int);
++ unsigned (*version_func) (void);
++ int (*load_image_func) (int, unsigned, const void *, struct addr_pair **);
++ bool (*unload_image_func) (int, unsigned, const void *);
++ void *(*alloc_func) (int, size_t);
++ bool (*free_func) (int, void *);
++ bool (*dev2host_func) (int, void *, const void *, size_t);
++ bool (*host2dev_func) (int, void *, const void *, size_t);
++ bool (*dev2dev_func) (int, void *, const void *, size_t);
++ bool (*can_run_func) (void *);
++ void (*run_func) (int, void *, void *, void **);
++ void (*async_run_func) (int, void *, void *, void **, void *);
++
++ /* Splay tree containing information about mapped memory regions. */
++ struct splay_tree_s mem_map;
++
++ /* Mutex for the mutable data. */
++ gomp_mutex_t lock;
++
++  /* Current state of the device.  OpenACC allows moving from the
++     INITIALIZED state back to the UNINITIALIZED state.  OpenMP only
++     allows moving from INITIALIZED to FINALIZED (at program shutdown). */
++ enum gomp_device_state state;
++
++ /* OpenACC-specific data and functions. */
++ /* This is mutable because of its mutable data_environ and target_data
++ members. */
++ acc_dispatch_t openacc;
++};
++
++/* Kind of the pragma for which gomp_map_vars () is called. */
++enum gomp_map_vars_kind
++{
++ GOMP_MAP_VARS_OPENACC,
++ GOMP_MAP_VARS_TARGET,
++ GOMP_MAP_VARS_DATA,
++ GOMP_MAP_VARS_ENTER_DATA
++};
++
++extern void gomp_acc_insert_pointer (size_t, void **, size_t *, void *);
++extern void gomp_acc_remove_pointer (void *, bool, int, int);
++
++extern struct target_mem_desc *gomp_map_vars (struct gomp_device_descr *,
++ size_t, void **, void **,
++ size_t *, void *, bool,
++ enum gomp_map_vars_kind);
++extern void gomp_unmap_vars (struct target_mem_desc *, bool);
++extern void gomp_init_device (struct gomp_device_descr *);
++extern void gomp_free_memmap (struct splay_tree_s *);
++extern void gomp_unload_device (struct gomp_device_descr *);
+
+ /* work.c */
+
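splay_compare above gives the tree its interval semantics: overlapping
ranges, and any empty range probing into a mapped range, compare equal,
which is how a zero-length address probe finds the mapping that
contains it. A stand-alone restatement over a simplified key type:

    #include <stdio.h>

    struct key { unsigned long host_start, host_end; };

    static int
    cmp (struct key x, struct key y)
    {
      if (x.host_start == x.host_end && y.host_start == y.host_end)
        return 0;
      if (x.host_end <= y.host_start)
        return -1;
      if (x.host_start >= y.host_end)
        return 1;
      return 0;   /* the ranges overlap: "equal" for lookup purposes */
    }

    int
    main (void)
    {
      struct key map = { 0x1000, 0x2000 };
      struct key probe = { 0x1800, 0x1800 };   /* zero-length probe */
      printf ("%d\n", cmp (probe, map));       /* 0: probe hits the mapping */
      return 0;
    }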
+@@ -646,8 +1014,28 @@ typedef enum omp_proc_bind_t
+ omp_proc_bind_spread = 4
+ } omp_proc_bind_t;
+
++typedef enum omp_lock_hint_t
++{
++ omp_lock_hint_none = 0,
++ omp_lock_hint_uncontended = 1,
++ omp_lock_hint_contended = 2,
++ omp_lock_hint_nonspeculative = 4,
++ omp_lock_hint_speculative = 8,
++} omp_lock_hint_t;
++
++extern void omp_init_lock_with_hint (omp_lock_t *, omp_lock_hint_t)
++ __GOMP_NOTHROW;
++extern void omp_init_nest_lock_with_hint (omp_lock_t *, omp_lock_hint_t)
++ __GOMP_NOTHROW;
++
+ extern int omp_get_cancellation (void) __GOMP_NOTHROW;
+ extern omp_proc_bind_t omp_get_proc_bind (void) __GOMP_NOTHROW;
++extern int omp_get_num_places (void) __GOMP_NOTHROW;
++extern int omp_get_place_num_procs (int) __GOMP_NOTHROW;
++extern void omp_get_place_proc_ids (int, int *) __GOMP_NOTHROW;
++extern int omp_get_place_num (void) __GOMP_NOTHROW;
++extern int omp_get_partition_num_places (void) __GOMP_NOTHROW;
++extern void omp_get_partition_place_nums (int *) __GOMP_NOTHROW;
+
+ extern void omp_set_default_device (int) __GOMP_NOTHROW;
+ extern int omp_get_default_device (void) __GOMP_NOTHROW;
+@@ -656,6 +1044,24 @@ extern int omp_get_num_teams (void) __GO
+ extern int omp_get_team_num (void) __GOMP_NOTHROW;
+
+ extern int omp_is_initial_device (void) __GOMP_NOTHROW;
++extern int omp_get_initial_device (void) __GOMP_NOTHROW;
++extern int omp_get_max_task_priority (void) __GOMP_NOTHROW;
++
++extern void *omp_target_alloc (__SIZE_TYPE__, int) __GOMP_NOTHROW;
++extern void omp_target_free (void *, int) __GOMP_NOTHROW;
++extern int omp_target_is_present (void *, int) __GOMP_NOTHROW;
++extern int omp_target_memcpy (void *, void *, __SIZE_TYPE__, __SIZE_TYPE__,
++ __SIZE_TYPE__, int, int) __GOMP_NOTHROW;
++extern int omp_target_memcpy_rect (void *, void *, __SIZE_TYPE__, int,
++ const __SIZE_TYPE__ *,
++ const __SIZE_TYPE__ *,
++ const __SIZE_TYPE__ *,
++ const __SIZE_TYPE__ *,
++ const __SIZE_TYPE__ *, int, int)
++ __GOMP_NOTHROW;
++extern int omp_target_associate_ptr (void *, void *, __SIZE_TYPE__,
++ __SIZE_TYPE__, int) __GOMP_NOTHROW;
++extern int omp_target_disassociate_ptr (void *, int) __GOMP_NOTHROW;
+
+ #if !defined (HAVE_ATTRIBUTE_VISIBILITY) \
+ || !defined (HAVE_ATTRIBUTE_ALIAS) \
+@@ -728,4 +1134,34 @@ extern int gomp_test_nest_lock_25 (omp_n
+ # define ialias_call(fn) fn
+ #endif
+
++/* Helper function for priority_node_to_task() and
++ task_to_priority_node().
++
++ Return the offset from a task to its priority_node entry. The
++   priority_node entry has type TYPE. */
++
++static inline size_t
++priority_queue_offset (enum priority_queue_type type)
++{
++ return offsetof (struct gomp_task, pnode[(int) type]);
++}
++
++/* Return the task associated with a priority NODE of type TYPE. */
++
++static inline struct gomp_task *
++priority_node_to_task (enum priority_queue_type type,
++ struct priority_node *node)
++{
++ return (struct gomp_task *) ((char *) node - priority_queue_offset (type));
++}
++
++/* Return the priority node of type TYPE for a given TASK. */
++
++static inline struct priority_node *
++task_to_priority_node (enum priority_queue_type type,
++ struct gomp_task *task)
++{
++ return (struct priority_node *) ((char *) task
++ + priority_queue_offset (type));
++}
+ #endif /* LIBGOMP_H */
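A stand-alone demonstration of the offsetof() round trip that
priority_node_to_task and task_to_priority_node perform above, with
simplified stand-in types:

    #include <stddef.h>
    #include <stdio.h>

    struct node { struct node *next, *prev; };
    struct task { int priority; struct node pnode[3]; };

    /* Recover the enclosing task from the embedded queue node,
       container_of style.  */
    #define NODE_TO_TASK(type, n) \
      ((struct task *) ((char *) (n) - offsetof (struct task, pnode[type])))

    int
    main (void)
    {
      struct task t = { .priority = 7 };
      struct node *n = &t.pnode[2];   /* e.g. the PQ_TEAM node */
      printf ("%d\n", NODE_TO_TASK (2, n)->priority);   /* prints 7 */
      return 0;
    }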
+--- libgomp/env.c.jj 2014-05-15 10:56:32.420522486 +0200
++++ libgomp/env.c 2016-07-13 16:57:04.437535335 +0200
+@@ -27,6 +27,8 @@
+
+ #include "libgomp.h"
+ #include "libgomp_f.h"
++#include "oacc-int.h"
++#include "gomp-constants.h"
+ #include <ctype.h>
+ #include <stdlib.h>
+ #include <stdio.h>
+@@ -56,7 +58,7 @@ struct gomp_task_icv gomp_global_icv = {
+ .nthreads_var = 1,
+ .thread_limit_var = UINT_MAX,
+ .run_sched_var = GFS_DYNAMIC,
+- .run_sched_modifier = 1,
++ .run_sched_chunk_size = 1,
+ .default_device_var = 0,
+ .dyn_var = false,
+ .nest_var = false,
+@@ -66,6 +68,7 @@ struct gomp_task_icv gomp_global_icv = {
+
+ unsigned long gomp_max_active_levels_var = INT_MAX;
+ bool gomp_cancel_var = false;
++int gomp_max_task_priority_var = 0;
+ #ifndef HAVE_SYNC_BUILTINS
+ gomp_mutex_t gomp_managed_threads_lock;
+ #endif
+@@ -76,6 +79,9 @@ char *gomp_bind_var_list;
+ unsigned long gomp_bind_var_list_len;
+ void **gomp_places_list;
+ unsigned long gomp_places_list_len;
++int gomp_debug_var;
++char *goacc_device_type;
++int goacc_device_num;
+
+ /* Parse the OMP_SCHEDULE environment variable. */
+
+@@ -118,7 +124,7 @@ parse_schedule (void)
+ ++env;
+ if (*env == '\0')
+ {
+- gomp_global_icv.run_sched_modifier
++ gomp_global_icv.run_sched_chunk_size
+ = gomp_global_icv.run_sched_var != GFS_STATIC;
+ return;
+ }
+@@ -144,7 +150,7 @@ parse_schedule (void)
+
+ if (value == 0 && gomp_global_icv.run_sched_var != GFS_STATIC)
+ value = 1;
+- gomp_global_icv.run_sched_modifier = value;
++ gomp_global_icv.run_sched_chunk_size = value;
+ return;
+
+ unknown:
+@@ -1011,6 +1017,16 @@ parse_affinity (bool ignore)
+ return false;
+ }
+
++static void
++parse_acc_device_type (void)
++{
++ const char *env = getenv ("ACC_DEVICE_TYPE");
++
++ if (env && *env != '\0')
++ goacc_device_type = strdup (env);
++ else
++ goacc_device_type = NULL;
++}
+
+ static void
+ handle_omp_display_env (unsigned long stacksize, int wait_policy)
+@@ -1054,7 +1070,7 @@ handle_omp_display_env (unsigned long st
+
+ fputs ("\nOPENMP DISPLAY ENVIRONMENT BEGIN\n", stderr);
+
+- fputs (" _OPENMP = '201307'\n", stderr);
++ fputs (" _OPENMP = '201511'\n", stderr);
+ fprintf (stderr, " OMP_DYNAMIC = '%s'\n",
+ gomp_global_icv.dyn_var ? "TRUE" : "FALSE");
+ fprintf (stderr, " OMP_NESTED = '%s'\n",
+@@ -1142,6 +1158,8 @@ handle_omp_display_env (unsigned long st
+ gomp_cancel_var ? "TRUE" : "FALSE");
+ fprintf (stderr, " OMP_DEFAULT_DEVICE = '%d'\n",
+ gomp_global_icv.default_device_var);
++ fprintf (stderr, " OMP_MAX_TASK_PRIORITY = '%d'\n",
++ gomp_max_task_priority_var);
+
+ if (verbose)
+ {
+@@ -1174,6 +1192,7 @@ initialize_env (void)
+ parse_boolean ("OMP_NESTED", &gomp_global_icv.nest_var);
+ parse_boolean ("OMP_CANCELLATION", &gomp_cancel_var);
+ parse_int ("OMP_DEFAULT_DEVICE", &gomp_global_icv.default_device_var, true);
++ parse_int ("OMP_MAX_TASK_PRIORITY", &gomp_max_task_priority_var, true);
+ parse_unsigned_long ("OMP_MAX_ACTIVE_LEVELS", &gomp_max_active_levels_var,
+ true);
+ if (parse_unsigned_long ("OMP_THREAD_LIMIT", &thread_limit_var, false))
+@@ -1181,6 +1200,7 @@ initialize_env (void)
+ gomp_global_icv.thread_limit_var
+ = thread_limit_var > INT_MAX ? UINT_MAX : thread_limit_var;
+ }
++ parse_int ("GOMP_DEBUG", &gomp_debug_var, true);
+ #ifndef HAVE_SYNC_BUILTINS
+ gomp_mutex_init (&gomp_managed_threads_lock);
+ #endif
+@@ -1271,6 +1291,15 @@ initialize_env (void)
+ }
+
+ handle_omp_display_env (stacksize, wait_policy);
++
++ /* OpenACC. */
++
++ if (!parse_int ("ACC_DEVICE_NUM", &goacc_device_num, true))
++ goacc_device_num = 0;
++
++ parse_acc_device_type ();
++
++ goacc_runtime_initialize ();
+ }
+
+
+@@ -1312,21 +1341,21 @@ omp_get_nested (void)
+ }
+
+ void
+-omp_set_schedule (omp_sched_t kind, int modifier)
++omp_set_schedule (omp_sched_t kind, int chunk_size)
+ {
+ struct gomp_task_icv *icv = gomp_icv (true);
+ switch (kind)
+ {
+ case omp_sched_static:
+- if (modifier < 1)
+- modifier = 0;
+- icv->run_sched_modifier = modifier;
++ if (chunk_size < 1)
++ chunk_size = 0;
++ icv->run_sched_chunk_size = chunk_size;
+ break;
+ case omp_sched_dynamic:
+ case omp_sched_guided:
+- if (modifier < 1)
+- modifier = 1;
+- icv->run_sched_modifier = modifier;
++ if (chunk_size < 1)
++ chunk_size = 1;
++ icv->run_sched_chunk_size = chunk_size;
+ break;
+ case omp_sched_auto:
+ break;
+@@ -1337,11 +1366,11 @@ omp_set_schedule (omp_sched_t kind, int
+ }
+
+ void
+-omp_get_schedule (omp_sched_t *kind, int *modifier)
++omp_get_schedule (omp_sched_t *kind, int *chunk_size)
+ {
+ struct gomp_task_icv *icv = gomp_icv (false);
+ *kind = icv->run_sched_var;
+- *modifier = icv->run_sched_modifier;
++ *chunk_size = icv->run_sched_chunk_size;
+ }
+
+ int
+@@ -1377,6 +1406,12 @@ omp_get_cancellation (void)
+ return gomp_cancel_var;
+ }
+
++int
++omp_get_max_task_priority (void)
++{
++ return gomp_max_task_priority_var;
++}
++
+ omp_proc_bind_t
+ omp_get_proc_bind (void)
+ {
+@@ -1425,6 +1460,59 @@ omp_is_initial_device (void)
+ return 1;
+ }
+
++int
++omp_get_initial_device (void)
++{
++ return GOMP_DEVICE_HOST_FALLBACK;
++}
++
++int
++omp_get_num_places (void)
++{
++ return gomp_places_list_len;
++}
++
++int
++omp_get_place_num (void)
++{
++ if (gomp_places_list == NULL)
++ return -1;
++
++ struct gomp_thread *thr = gomp_thread ();
++ if (thr->place == 0)
++ gomp_init_affinity ();
++
++ return (int) thr->place - 1;
++}
++
++int
++omp_get_partition_num_places (void)
++{
++ if (gomp_places_list == NULL)
++ return 0;
++
++ struct gomp_thread *thr = gomp_thread ();
++ if (thr->place == 0)
++ gomp_init_affinity ();
++
++ return thr->ts.place_partition_len;
++}
++
++void
++omp_get_partition_place_nums (int *place_nums)
++{
++ if (gomp_places_list == NULL)
++ return;
++
++ struct gomp_thread *thr = gomp_thread ();
++ if (thr->place == 0)
++ gomp_init_affinity ();
++
++ unsigned int i;
++ for (i = 0; i < thr->ts.place_partition_len; i++)
++ *place_nums++ = thr->ts.place_partition_off + i;
++}
++
+ ialias (omp_set_dynamic)
+ ialias (omp_set_nested)
+ ialias (omp_set_num_threads)
+@@ -1444,3 +1532,9 @@ ialias (omp_get_num_devices)
+ ialias (omp_get_num_teams)
+ ialias (omp_get_team_num)
+ ialias (omp_is_initial_device)
++ialias (omp_get_initial_device)
++ialias (omp_get_max_task_priority)
++ialias (omp_get_num_places)
++ialias (omp_get_place_num)
++ialias (omp_get_partition_num_places)
++ialias (omp_get_partition_place_nums)
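The run_sched_modifier to run_sched_chunk_size rename above tracks
OpenMP 4.5 terminology; the user-facing behavior of
omp_set_schedule/omp_get_schedule is unchanged. For example:

    #include <omp.h>
    #include <stdio.h>

    int
    main (void)
    {
      omp_sched_t kind;
      int chunk;
      omp_set_schedule (omp_sched_dynamic, 4);   /* 4-iteration chunks */
      omp_get_schedule (&kind, &chunk);
      printf ("kind=%d chunk=%d\n", (int) kind, chunk);  /* kind=2 chunk=4 */
      return 0;
    }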
+--- libgomp/openacc.h.jj 2016-07-13 16:57:04.432535397 +0200
++++ libgomp/openacc.h 2016-07-13 16:57:04.432535397 +0200
+@@ -0,0 +1,131 @@
++/* OpenACC Runtime Library User-facing Declarations
++
++ Copyright (C) 2013-2016 Free Software Foundation, Inc.
++
++ Contributed by Mentor Embedded.
++
++ This file is part of the GNU Offloading and Multi Processing Library
++ (libgomp).
++
++ Libgomp is free software; you can redistribute it and/or modify it
++ under the terms of the GNU General Public License as published by
++ the Free Software Foundation; either version 3, or (at your option)
++ any later version.
++
++ Libgomp is distributed in the hope that it will be useful, but WITHOUT ANY
++ WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
++ FOR A PARTICULAR PURPOSE. See the GNU General Public License for
++ more details.
++
++ Under Section 7 of GPL version 3, you are granted additional
++ permissions described in the GCC Runtime Library Exception, version
++ 3.1, as published by the Free Software Foundation.
++
++ You should have received a copy of the GNU General Public License and
++ a copy of the GCC Runtime Library Exception along with this program;
++ see the files COPYING3 and COPYING.RUNTIME respectively. If not, see
++ <http://www.gnu.org/licenses/>. */
++
++#ifndef _OPENACC_H
++#define _OPENACC_H 1
++
++/* The OpenACC standard is silent on whether including <openacc.h>
++   may, or must not, include other header files.  We chose to include
++   some.  */
++#include <stddef.h>
++
++#ifdef __cplusplus
++extern "C" {
++#endif
++
++#if __cplusplus >= 201103
++# define __GOACC_NOTHROW noexcept
++#elif __cplusplus
++# define __GOACC_NOTHROW throw ()
++#else /* Not C++ */
++# define __GOACC_NOTHROW __attribute__ ((__nothrow__))
++#endif
++
++/* Types */
++typedef enum acc_device_t {
++ /* Keep in sync with include/gomp-constants.h. */
++ acc_device_none = 0,
++ acc_device_default = 1,
++ acc_device_host = 2,
++ /* acc_device_host_nonshm = 3 removed. */
++ acc_device_not_host = 4,
++ acc_device_nvidia = 5,
++ _ACC_device_hwm,
++ /* Ensure enumeration is layout compatible with int. */
++ _ACC_highest = __INT_MAX__,
++ _ACC_neg = -1
++} acc_device_t;
++
++typedef enum acc_async_t {
++ /* Keep in sync with include/gomp-constants.h. */
++ acc_async_noval = -1,
++ acc_async_sync = -2
++} acc_async_t;
++
++int acc_get_num_devices (acc_device_t) __GOACC_NOTHROW;
++void acc_set_device_type (acc_device_t) __GOACC_NOTHROW;
++acc_device_t acc_get_device_type (void) __GOACC_NOTHROW;
++void acc_set_device_num (int, acc_device_t) __GOACC_NOTHROW;
++int acc_get_device_num (acc_device_t) __GOACC_NOTHROW;
++int acc_async_test (int) __GOACC_NOTHROW;
++int acc_async_test_all (void) __GOACC_NOTHROW;
++void acc_wait (int) __GOACC_NOTHROW;
++void acc_wait_async (int, int) __GOACC_NOTHROW;
++void acc_wait_all (void) __GOACC_NOTHROW;
++void acc_wait_all_async (int) __GOACC_NOTHROW;
++void acc_init (acc_device_t) __GOACC_NOTHROW;
++void acc_shutdown (acc_device_t) __GOACC_NOTHROW;
++#ifdef __cplusplus
++int acc_on_device (int __arg) __GOACC_NOTHROW;
++#else
++int acc_on_device (acc_device_t __arg) __GOACC_NOTHROW;
++#endif
++void *acc_malloc (size_t) __GOACC_NOTHROW;
++void acc_free (void *) __GOACC_NOTHROW;
++/* Some of these would be more correct with const qualifiers, but
++ the standard specifies otherwise. */
++void *acc_copyin (void *, size_t) __GOACC_NOTHROW;
++void *acc_present_or_copyin (void *, size_t) __GOACC_NOTHROW;
++void *acc_create (void *, size_t) __GOACC_NOTHROW;
++void *acc_present_or_create (void *, size_t) __GOACC_NOTHROW;
++void acc_copyout (void *, size_t) __GOACC_NOTHROW;
++void acc_delete (void *, size_t) __GOACC_NOTHROW;
++void acc_update_device (void *, size_t) __GOACC_NOTHROW;
++void acc_update_self (void *, size_t) __GOACC_NOTHROW;
++void acc_map_data (void *, void *, size_t) __GOACC_NOTHROW;
++void acc_unmap_data (void *) __GOACC_NOTHROW;
++void *acc_deviceptr (void *) __GOACC_NOTHROW;
++void *acc_hostptr (void *) __GOACC_NOTHROW;
++int acc_is_present (void *, size_t) __GOACC_NOTHROW;
++void acc_memcpy_to_device (void *, void *, size_t) __GOACC_NOTHROW;
++void acc_memcpy_from_device (void *, void *, size_t) __GOACC_NOTHROW;
++
++/* Old names. OpenACC does not specify whether these can or must
++ not be macros, inlines or aliases for the new names. */
++#define acc_pcreate acc_present_or_create
++#define acc_pcopyin acc_present_or_copyin
++
++/* CUDA-specific routines. */
++void *acc_get_current_cuda_device (void) __GOACC_NOTHROW;
++void *acc_get_current_cuda_context (void) __GOACC_NOTHROW;
++void *acc_get_cuda_stream (int) __GOACC_NOTHROW;
++int acc_set_cuda_stream (int, void *) __GOACC_NOTHROW;
++
++#ifdef __cplusplus
++}
++
++/* Forwarding function with correctly typed arg. */
++
++#pragma acc routine seq
++inline int acc_on_device (acc_device_t __arg) __GOACC_NOTHROW
++{
++ return acc_on_device ((int) __arg);
++}
++#endif
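++
++/* Illustrative (hypothetical) use: both C and C++ callers can write
++
++     if (acc_on_device (acc_device_host))
++       ...;
++
++   in C++, the inline overload above forwards to the int-typed
++   extern "C" declaration.  */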
++
++#endif /* _OPENACC_H */
+--- libgomp/config/linux/doacross.h.jj 2016-07-13 16:57:18.902355979 +0200
++++ libgomp/config/linux/doacross.h 2016-07-13 16:57:18.902355979 +0200
+@@ -0,0 +1,57 @@
++/* Copyright (C) 2015-2016 Free Software Foundation, Inc.
++ Contributed by Jakub Jelinek <jakub@redhat.com>.
++
++ This file is part of the GNU Offloading and Multi Processing Library
++ (libgomp).
++
++ Libgomp is free software; you can redistribute it and/or modify it
++ under the terms of the GNU General Public License as published by
++ the Free Software Foundation; either version 3, or (at your option)
++ any later version.
++
++ Libgomp is distributed in the hope that it will be useful, but WITHOUT ANY
++ WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
++ FOR A PARTICULAR PURPOSE. See the GNU General Public License for
++ more details.
++
++ Under Section 7 of GPL version 3, you are granted additional
++ permissions described in the GCC Runtime Library Exception, version
++ 3.1, as published by the Free Software Foundation.
++
++ You should have received a copy of the GNU General Public License and
++ a copy of the GCC Runtime Library Exception along with this program;
++ see the files COPYING3 and COPYING.RUNTIME respectively. If not, see
++ <http://www.gnu.org/licenses/>. */
++
++/* This is a Linux specific implementation of doacross spinning. */
++
++#ifndef GOMP_DOACROSS_H
++#define GOMP_DOACROSS_H 1
++
++#include "libgomp.h"
++#include <errno.h>
++#include "wait.h"
++
++#ifdef HAVE_ATTRIBUTE_VISIBILITY
++# pragma GCC visibility push(hidden)
++#endif
++
++static inline void doacross_spin (unsigned long *addr, unsigned long expected,
++ unsigned long cur)
++{
++ /* FIXME: back off depending on how large expected - cur is. */
++ do
++ {
++ cpu_relax ();
++ cur = __atomic_load_n (addr, MEMMODEL_RELAXED);
++ if (expected < cur)
++ return;
++ }
++ while (1);
++}
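++
++/* Illustrative (hypothetical) use; COUNTER and EXPECTED are placeholder
++   names for a doacross ordering counter and the awaited iteration:
++
++     unsigned long cur = __atomic_load_n (&counter, MEMMODEL_RELAXED);
++     if (cur <= expected)
++       doacross_spin (&counter, expected, cur);
++
++   On return, the value at &counter is known to exceed EXPECTED.  */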
++
++#ifdef HAVE_ATTRIBUTE_VISIBILITY
++# pragma GCC visibility pop
++#endif
++
++#endif /* GOMP_DOACROSS_H */
+--- libgomp/config/posix/doacross.h.jj 2016-07-13 16:57:18.903355966 +0200
++++ libgomp/config/posix/doacross.h 2016-07-13 16:57:18.903355966 +0200
+@@ -0,0 +1,62 @@
++/* Copyright (C) 2015-2016 Free Software Foundation, Inc.
++ Contributed by Jakub Jelinek <jakub@redhat.com>.
++
++ This file is part of the GNU Offloading and Multi Processing Library
++ (libgomp).
++
++ Libgomp is free software; you can redistribute it and/or modify it
++ under the terms of the GNU General Public License as published by
++ the Free Software Foundation; either version 3, or (at your option)
++ any later version.
++
++ Libgomp is distributed in the hope that it will be useful, but WITHOUT ANY
++ WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
++ FOR A PARTICULAR PURPOSE. See the GNU General Public License for
++ more details.
++
++ Under Section 7 of GPL version 3, you are granted additional
++ permissions described in the GCC Runtime Library Exception, version
++ 3.1, as published by the Free Software Foundation.
++
++ You should have received a copy of the GNU General Public License and
++ a copy of the GCC Runtime Library Exception along with this program;
++ see the files COPYING3 and COPYING.RUNTIME respectively. If not, see
++ <http://www.gnu.org/licenses/>. */
++
++/* This is a generic implementation of doacross spinning. */
++
++#ifndef GOMP_DOACROSS_H
++#define GOMP_DOACROSS_H 1
++
++#include "libgomp.h"
++#include <errno.h>
++
++#ifdef HAVE_ATTRIBUTE_VISIBILITY
++# pragma GCC visibility push(hidden)
++#endif
++
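++/* Generic fallback: a pure compiler barrier.  Configurations with an
++   architecture-specific wait.h are assumed to provide a cpu_relax that
++   also emits a pause/yield hint where the architecture has one.  */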
++static inline void
++cpu_relax (void)
++{
++ __asm volatile ("" : : : "memory");
++}
++
++static inline void doacross_spin (unsigned long *addr, unsigned long expected,
++ unsigned long cur)
++{
++ /* FIXME: back off depending on how large expected - cur is. */
++ do
++ {
++ cpu_relax ();
++ cur = __atomic_load_n (addr, MEMMODEL_RELAXED);
++ if (expected < cur)
++ return;
++ }
++ while (1);
++}
++
++#ifdef HAVE_ATTRIBUTE_VISIBILITY
++# pragma GCC visibility pop
++#endif
++
++#endif /* GOMP_DOACROSS_H */
+--- libgomp/splay-tree.c.jj 2016-07-13 16:57:18.919355768 +0200
++++ libgomp/splay-tree.c 2016-07-13 16:57:18.919355768 +0200
+@@ -0,0 +1,238 @@
++/* A splay-tree datatype.
++ Copyright (C) 1998-2016 Free Software Foundation, Inc.
++ Contributed by Mark Mitchell (mark@markmitchell.com).
++
++ This file is part of the GNU Offloading and Multi Processing Library
++ (libgomp).
++
++ Libgomp is free software; you can redistribute it and/or modify it
++ under the terms of the GNU General Public License as published by
++ the Free Software Foundation; either version 3, or (at your option)
++ any later version.
++
++ Libgomp is distributed in the hope that it will be useful, but WITHOUT ANY
++ WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
++ FOR A PARTICULAR PURPOSE. See the GNU General Public License for
++ more details.
++
++ Under Section 7 of GPL version 3, you are granted additional
++ permissions described in the GCC Runtime Library Exception, version
++ 3.1, as published by the Free Software Foundation.
++
++ You should have received a copy of the GNU General Public License and
++ a copy of the GCC Runtime Library Exception along with this program;
++ see the files COPYING3 and COPYING.RUNTIME respectively. If not, see
++ <http://www.gnu.org/licenses/>. */
++
++/* The splay tree code was copied from include/splay-tree.h and adjusted
++   so that all the data lives directly in the splay_tree_node_s structure
++   and no extra allocations are needed.  */
++
++/* For an easily readable description of splay-trees, see:
++
++ Lewis, Harry R. and Denenberg, Larry. Data Structures and Their
++ Algorithms. Harper-Collins, Inc. 1991.
++
++ The major feature of splay trees is that all basic tree operations
++ are amortized O(log n) time for a tree with n nodes. */
++
++#include "libgomp.h"
++
++/* Rotate the edge joining the left child N with its parent P.  PP is the
++   grandparent's pointer to P.  */
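++/* Illustration of rotate_left (N is P's left child; a, b, c are
++   subtrees):
++
++        P              N
++       / \            / \
++      N   c   ==>    a   P
++     / \                / \
++    a   b              b   c  */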
++
++static inline void
++rotate_left (splay_tree_node *pp, splay_tree_node p, splay_tree_node n)
++{
++ splay_tree_node tmp;
++ tmp = n->right;
++ n->right = p;
++ p->left = tmp;
++ *pp = n;
++}
++
++/* Rotate the edge joining the right child N with its parent P.  PP is the
++   grandparent's pointer to P.  */
++
++static inline void
++rotate_right (splay_tree_node *pp, splay_tree_node p, splay_tree_node n)
++{
++ splay_tree_node tmp;
++ tmp = n->left;
++ n->left = p;
++ p->right = tmp;
++ *pp = n;
++}
++
++/* Bottom-up splay of KEY.  */
++
++static void
++splay_tree_splay (splay_tree sp, splay_tree_key key)
++{
++ if (sp->root == NULL)
++ return;
++
++ do {
++ int cmp1, cmp2;
++ splay_tree_node n, c;
++
++ n = sp->root;
++ cmp1 = splay_compare (key, &n->key);
++
++ /* Found. */
++ if (cmp1 == 0)
++ return;
++
++ /* Left or right? If no child, then we're done. */
++ if (cmp1 < 0)
++ c = n->left;
++ else
++ c = n->right;
++ if (!c)
++ return;
++
++ /* Next one left or right? If found or no child, we're done
++ after one rotation. */
++ cmp2 = splay_compare (key, &c->key);
++ if (cmp2 == 0
++ || (cmp2 < 0 && !c->left)
++ || (cmp2 > 0 && !c->right))
++ {
++ if (cmp1 < 0)
++ rotate_left (&sp->root, n, c);
++ else
++ rotate_right (&sp->root, n, c);
++ return;
++ }
++
++ /* Now we have the four cases of double-rotation. */
++ if (cmp1 < 0 && cmp2 < 0)
++ {
++ rotate_left (&n->left, c, c->left);
++ rotate_left (&sp->root, n, n->left);
++ }
++ else if (cmp1 > 0 && cmp2 > 0)
++ {
++ rotate_right (&n->right, c, c->right);
++ rotate_right (&sp->root, n, n->right);
++ }
++ else if (cmp1 < 0 && cmp2 > 0)
++ {
++ rotate_right (&n->left, c, c->right);
++ rotate_left (&sp->root, n, n->left);
++ }
++ else if (cmp1 > 0 && cmp2 < 0)
++ {
++ rotate_left (&n->right, c, c->left);
++ rotate_right (&sp->root, n, n->right);
++ }
++ } while (1);
++}
++
++/* Insert a new NODE into SP.  A node with an equal key must not already
++   exist in the tree.  */
++
++attribute_hidden void
++splay_tree_insert (splay_tree sp, splay_tree_node node)
++{
++ int comparison = 0;
++
++ splay_tree_splay (sp, &node->key);
++
++ if (sp->root)
++ comparison = splay_compare (&sp->root->key, &node->key);
++
++ if (sp->root && comparison == 0)
++ gomp_fatal ("Duplicate node");
++ else
++ {
++ /* Insert it at the root. */
++ if (sp->root == NULL)
++ node->left = node->right = NULL;
++ else if (comparison < 0)
++ {
++ node->left = sp->root;
++ node->right = node->left->right;
++ node->left->right = NULL;
++ }
++ else
++ {
++ node->right = sp->root;
++ node->left = node->right->left;
++ node->right->left = NULL;
++ }
++
++ sp->root = node;
++ }
++}
++
++/* Remove node with KEY from SP.  It is not an error if no such node
++   exists.  */
++
++attribute_hidden void
++splay_tree_remove (splay_tree sp, splay_tree_key key)
++{
++ splay_tree_splay (sp, key);
++
++ if (sp->root && splay_compare (&sp->root->key, key) == 0)
++ {
++ splay_tree_node left, right;
++
++ left = sp->root->left;
++ right = sp->root->right;
++
++ /* One of the children is now the root. Doesn't matter much
++ which, so long as we preserve the properties of the tree. */
++ if (left)
++ {
++ sp->root = left;
++
++ /* If there was a right child as well, hang it off the
++ right-most leaf of the left child. */
++ if (right)
++ {
++ while (left->right)
++ left = left->right;
++ left->right = right;
++ }
++ }
++ else
++ sp->root = right;
++ }
++}
++
++/* Look up KEY in SP, returning the matching key if present, and NULL
++   otherwise.  */
++
++attribute_hidden splay_tree_key
++splay_tree_lookup (splay_tree sp, splay_tree_key key)
++{
++ splay_tree_splay (sp, key);
++
++ if (sp->root && splay_compare (&sp->root->key, key) == 0)
++ return &sp->root->key;
++ else
++ return NULL;
++}
++
++/* Helper function for splay_tree_foreach.
++
++   Run FUNC on every node in the subtree rooted at NODE.  */
++
++static void
++splay_tree_foreach_internal (splay_tree_node node, splay_tree_callback func,
++ void *data)
++{
++ if (!node)
++ return;
++ func (&node->key, data);
++ splay_tree_foreach_internal (node->left, func, data);
++  /* GCC can turn this tail recursion into a loop.  */
++ splay_tree_foreach_internal (node->right, func, data);
++}
++
++/* Run FUNC on each of the nodes in SP. */
++
++attribute_hidden void
++splay_tree_foreach (splay_tree sp, splay_tree_callback func, void *data)
++{
++ splay_tree_foreach_internal (sp->root, func, data);
++}
+--- libgomp/libgomp-plugin.c.jj 2016-07-13 16:57:04.435535360 +0200
++++ libgomp/libgomp-plugin.c 2016-07-13 16:57:04.435535360 +0200
+@@ -0,0 +1,80 @@
++/* Copyright (C) 2014-2016 Free Software Foundation, Inc.
++
++ Contributed by Mentor Embedded.
++
++ This file is part of the GNU Offloading and Multi Processing Library
++ (libgomp).
++
++ Libgomp is free software; you can redistribute it and/or modify it
++ under the terms of the GNU General Public License as published by
++ the Free Software Foundation; either version 3, or (at your option)
++ any later version.
++
++ Libgomp is distributed in the hope that it will be useful, but WITHOUT ANY
++ WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
++ FOR A PARTICULAR PURPOSE. See the GNU General Public License for
++ more details.
++
++ Under Section 7 of GPL version 3, you are granted additional
++ permissions described in the GCC Runtime Library Exception, version
++ 3.1, as published by the Free Software Foundation.
++
++ You should have received a copy of the GNU General Public License and
++ a copy of the GCC Runtime Library Exception along with this program;
++ see the files COPYING3 and COPYING.RUNTIME respectively. If not, see
++ <http://www.gnu.org/licenses/>. */
++
++/* Exported (non-hidden) functions exposing libgomp interface for plugins. */
++
++#include <stdlib.h>
++
++#include "libgomp.h"
++#include "libgomp-plugin.h"
++
++void *
++GOMP_PLUGIN_malloc (size_t size)
++{
++ return gomp_malloc (size);
++}
++
++void *
++GOMP_PLUGIN_malloc_cleared (size_t size)
++{
++ return gomp_malloc_cleared (size);
++}
++
++void *
++GOMP_PLUGIN_realloc (void *ptr, size_t size)
++{
++ return gomp_realloc (ptr, size);
++}
++
++void
++GOMP_PLUGIN_debug (int kind, const char *msg, ...)
++{
++ va_list ap;
++
++ va_start (ap, msg);
++ gomp_vdebug (kind, msg, ap);
++ va_end (ap);
++}
++
++void
++GOMP_PLUGIN_error (const char *msg, ...)
++{
++ va_list ap;
++
++ va_start (ap, msg);
++ gomp_verror (msg, ap);
++ va_end (ap);
++}
++
++void
++GOMP_PLUGIN_fatal (const char *msg, ...)
++{
++ va_list ap;
++
++ va_start (ap, msg);
++ gomp_vfatal (msg, ap);
++ va_end (ap);
++}
+--- libgomp/libgomp-plugin.h.jj 2016-07-13 16:57:04.438535323 +0200
++++ libgomp/libgomp-plugin.h 2016-07-13 16:57:04.438535323 +0200
+@@ -0,0 +1,80 @@
++/* Copyright (C) 2014-2016 Free Software Foundation, Inc.
++
++ Contributed by Mentor Embedded.
++
++ This file is part of the GNU Offloading and Multi Processing Library
++ (libgomp).
++
++ Libgomp is free software; you can redistribute it and/or modify it
++ under the terms of the GNU General Public License as published by
++ the Free Software Foundation; either version 3, or (at your option)
++ any later version.
++
++ Libgomp is distributed in the hope that it will be useful, but WITHOUT ANY
++ WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
++ FOR A PARTICULAR PURPOSE. See the GNU General Public License for
++ more details.
++
++ Under Section 7 of GPL version 3, you are granted additional
++ permissions described in the GCC Runtime Library Exception, version
++ 3.1, as published by the Free Software Foundation.
++
++ You should have received a copy of the GNU General Public License and
++ a copy of the GCC Runtime Library Exception along with this program;
++ see the files COPYING3 and COPYING.RUNTIME respectively. If not, see
++ <http://www.gnu.org/licenses/>. */
++
++/* An interface to various libgomp-internal functions for use by plugins. */
++
++#ifndef LIBGOMP_PLUGIN_H
++#define LIBGOMP_PLUGIN_H 1
++
++#include <stddef.h>
++#include <stdint.h>
++
++#ifdef __cplusplus
++extern "C" {
++#endif
++
++/* Capabilities of offloading devices. */
++#define GOMP_OFFLOAD_CAP_SHARED_MEM (1 << 0)
++#define GOMP_OFFLOAD_CAP_NATIVE_EXEC (1 << 1)
++#define GOMP_OFFLOAD_CAP_OPENMP_400 (1 << 2)
++#define GOMP_OFFLOAD_CAP_OPENACC_200 (1 << 3)
++
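++/* A device type advertises a bitmask of the above capabilities; e.g. the
++   host device in oacc-host.c below sets GOMP_OFFLOAD_CAP_SHARED_MEM
++   | GOMP_OFFLOAD_CAP_NATIVE_EXEC | GOMP_OFFLOAD_CAP_OPENACC_200.  */
++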
++/* Type of offload target device. Keep in sync with include/gomp-constants.h. */
++enum offload_target_type
++{
++ OFFLOAD_TARGET_TYPE_HOST = 2,
++ /* OFFLOAD_TARGET_TYPE_HOST_NONSHM = 3 removed. */
++ OFFLOAD_TARGET_TYPE_NVIDIA_PTX = 5,
++ OFFLOAD_TARGET_TYPE_INTEL_MIC = 6,
++ OFFLOAD_TARGET_TYPE_HSA = 7
++};
++
++/* Auxiliary struct, used for transferring pairs of addresses from plugin
++ to libgomp. */
++struct addr_pair
++{
++ uintptr_t start;
++ uintptr_t end;
++};
++
++/* Miscellaneous functions. */
++extern void *GOMP_PLUGIN_malloc (size_t) __attribute__ ((malloc));
++extern void *GOMP_PLUGIN_malloc_cleared (size_t) __attribute__ ((malloc));
++extern void *GOMP_PLUGIN_realloc (void *, size_t);
++extern void GOMP_PLUGIN_target_task_completion (void *);
++
++extern void GOMP_PLUGIN_debug (int, const char *, ...)
++ __attribute__ ((format (printf, 2, 3)));
++extern void GOMP_PLUGIN_error (const char *, ...)
++ __attribute__ ((format (printf, 1, 2)));
++extern void GOMP_PLUGIN_fatal (const char *, ...)
++ __attribute__ ((noreturn, format (printf, 1, 2)));
++
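++/* Illustrative (hypothetical) plugin use of the allocators above:
++
++     struct addr_pair *tbl
++       = GOMP_PLUGIN_malloc (n * sizeof (struct addr_pair));
++
++   GOMP_PLUGIN_malloc reports failure via libgomp's fatal-error path, so
++   the return value needs no NULL check.  */
++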
++#ifdef __cplusplus
++}
++#endif
++
++#endif
+--- libgomp/oacc-async.c.jj 2016-07-13 16:57:13.488423109 +0200
++++ libgomp/oacc-async.c 2016-07-13 16:57:13.488423109 +0200
+@@ -0,0 +1,107 @@
++/* OpenACC Runtime Library Definitions.
++
++ Copyright (C) 2013-2016 Free Software Foundation, Inc.
++
++ Contributed by Mentor Embedded.
++
++ This file is part of the GNU Offloading and Multi Processing Library
++ (libgomp).
++
++ Libgomp is free software; you can redistribute it and/or modify it
++ under the terms of the GNU General Public License as published by
++ the Free Software Foundation; either version 3, or (at your option)
++ any later version.
++
++ Libgomp is distributed in the hope that it will be useful, but WITHOUT ANY
++ WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
++ FOR A PARTICULAR PURPOSE. See the GNU General Public License for
++ more details.
++
++ Under Section 7 of GPL version 3, you are granted additional
++ permissions described in the GCC Runtime Library Exception, version
++ 3.1, as published by the Free Software Foundation.
++
++ You should have received a copy of the GNU General Public License and
++ a copy of the GCC Runtime Library Exception along with this program;
++ see the files COPYING3 and COPYING.RUNTIME respectively. If not, see
++ <http://www.gnu.org/licenses/>. */
++
++#include <assert.h>
++#include "openacc.h"
++#include "libgomp.h"
++#include "oacc-int.h"
++
++int
++acc_async_test (int async)
++{
++ if (async < acc_async_sync)
++ gomp_fatal ("invalid async argument: %d", async);
++
++ struct goacc_thread *thr = goacc_thread ();
++
++ if (!thr || !thr->dev)
++ gomp_fatal ("no device active");
++
++ return thr->dev->openacc.async_test_func (async);
++}
++
++int
++acc_async_test_all (void)
++{
++ struct goacc_thread *thr = goacc_thread ();
++
++ if (!thr || !thr->dev)
++ gomp_fatal ("no device active");
++
++ return thr->dev->openacc.async_test_all_func ();
++}
++
++void
++acc_wait (int async)
++{
++ if (async < acc_async_sync)
++ gomp_fatal ("invalid async argument: %d", async);
++
++ struct goacc_thread *thr = goacc_thread ();
++
++ if (!thr || !thr->dev)
++ gomp_fatal ("no device active");
++
++ thr->dev->openacc.async_wait_func (async);
++}
++
++void
++acc_wait_async (int async1, int async2)
++{
++ struct goacc_thread *thr = goacc_thread ();
++
++ if (!thr || !thr->dev)
++ gomp_fatal ("no device active");
++
++ thr->dev->openacc.async_wait_async_func (async1, async2);
++}
++
++void
++acc_wait_all (void)
++{
++ struct goacc_thread *thr = goacc_thread ();
++
++ if (!thr || !thr->dev)
++ gomp_fatal ("no device active");
++
++ thr->dev->openacc.async_wait_all_func ();
++}
++
++void
++acc_wait_all_async (int async)
++{
++ if (async < acc_async_sync)
++ gomp_fatal ("invalid async argument: %d", async);
++
++ struct goacc_thread *thr = goacc_thread ();
++
++ if (!thr || !thr->dev)
++ gomp_fatal ("no device active");
++
++ thr->dev->openacc.async_wait_all_async_func (async);
++}
+--- libgomp/splay-tree.h.jj 2016-07-13 16:57:18.934355582 +0200
++++ libgomp/splay-tree.h 2016-07-13 16:57:18.934355582 +0200
+@@ -0,0 +1,130 @@
++/* A splay-tree datatype.
++ Copyright (C) 1998-2016 Free Software Foundation, Inc.
++ Contributed by Mark Mitchell (mark@markmitchell.com).
++
++ This file is part of the GNU Offloading and Multi Processing Library
++ (libgomp).
++
++ Libgomp is free software; you can redistribute it and/or modify it
++ under the terms of the GNU General Public License as published by
++ the Free Software Foundation; either version 3, or (at your option)
++ any later version.
++
++ Libgomp is distributed in the hope that it will be useful, but WITHOUT ANY
++ WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
++ FOR A PARTICULAR PURPOSE. See the GNU General Public License for
++ more details.
++
++ Under Section 7 of GPL version 3, you are granted additional
++ permissions described in the GCC Runtime Library Exception, version
++ 3.1, as published by the Free Software Foundation.
++
++ You should have received a copy of the GNU General Public License and
++ a copy of the GCC Runtime Library Exception along with this program;
++ see the files COPYING3 and COPYING.RUNTIME respectively. If not, see
++ <http://www.gnu.org/licenses/>. */
++
++/* The splay tree code was copied from include/splay-tree.h and adjusted
++   so that all the data lives directly in the splay_tree_node_s structure
++   and no extra allocations are needed.
++
++   Files including this header should, before including it, add:
++typedef struct splay_tree_node_s *splay_tree_node;
++typedef struct splay_tree_s *splay_tree;
++typedef struct splay_tree_key_s *splay_tree_key;
++   define the splay_tree_key_s structure, and define the
++   splay_compare inline function.
++
++   Alternatively, they can define the splay_tree_prefix macro before
++   including this header, and then all the above types, the
++   splay_compare function and the splay_tree_{lookup,insert,remove,
++   foreach} functions will be prefixed by that prefix.  If the
++   splay_tree_prefix macro is defined, this header must be included
++   twice: once where you need the header file definitions, and once
++   where you need the .c implementation routines.  In the latter case,
++   you must also define the macro splay_tree_c.  See the include of
++   splay-tree.h in priority_queue.[hc] for an example.  */
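++
++/* Illustrative (hypothetical) prefixed instantiation for a prefix "foo":
++
++     #define splay_tree_prefix foo
++     #include "splay-tree.h"      (header definitions: foo_splay_tree etc.)
++
++   and, where the implementation routines are needed:
++
++     #define splay_tree_prefix foo
++     #define splay_tree_c
++     #include "splay-tree.h"      (implementation)
++*/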
++
++/* For an easily readable description of splay-trees, see:
++
++ Lewis, Harry R. and Denenberg, Larry. Data Structures and Their
++ Algorithms. Harper-Collins, Inc. 1991.
++
++ The major feature of splay trees is that all basic tree operations
++ are amortized O(log n) time for a tree with n nodes. */
++
++#ifdef splay_tree_prefix
++# define splay_tree_name_1(prefix, name) prefix ## _ ## name
++# define splay_tree_name(prefix, name) splay_tree_name_1 (prefix, name)
++# define splay_tree_node_s \
++ splay_tree_name (splay_tree_prefix, splay_tree_node_s)
++# define splay_tree_s \
++ splay_tree_name (splay_tree_prefix, splay_tree_s)
++# define splay_tree_key_s \
++ splay_tree_name (splay_tree_prefix, splay_tree_key_s)
++# define splay_tree_node \
++ splay_tree_name (splay_tree_prefix, splay_tree_node)
++# define splay_tree \
++ splay_tree_name (splay_tree_prefix, splay_tree)
++# define splay_tree_key \
++ splay_tree_name (splay_tree_prefix, splay_tree_key)
++# define splay_compare \
++ splay_tree_name (splay_tree_prefix, splay_compare)
++# define splay_tree_lookup \
++ splay_tree_name (splay_tree_prefix, splay_tree_lookup)
++# define splay_tree_insert \
++ splay_tree_name (splay_tree_prefix, splay_tree_insert)
++# define splay_tree_remove \
++ splay_tree_name (splay_tree_prefix, splay_tree_remove)
++# define splay_tree_foreach \
++ splay_tree_name (splay_tree_prefix, splay_tree_foreach)
++# define splay_tree_callback \
++ splay_tree_name (splay_tree_prefix, splay_tree_callback)
++#endif
++
++#ifndef splay_tree_c
++/* Header file definitions and prototypes. */
++
++/* The nodes in the splay tree. */
++struct splay_tree_node_s {
++ struct splay_tree_key_s key;
++ /* The left and right children, respectively. */
++ splay_tree_node left;
++ splay_tree_node right;
++};
++
++/* The splay tree. */
++struct splay_tree_s {
++ splay_tree_node root;
++};
++
++typedef void (*splay_tree_callback) (splay_tree_key, void *);
++
++extern splay_tree_key splay_tree_lookup (splay_tree, splay_tree_key);
++extern void splay_tree_insert (splay_tree, splay_tree_node);
++extern void splay_tree_remove (splay_tree, splay_tree_key);
++extern void splay_tree_foreach (splay_tree, splay_tree_callback, void *);
++#else /* splay_tree_c */
++# ifdef splay_tree_prefix
++# include "splay-tree.c"
++# undef splay_tree_name_1
++# undef splay_tree_name
++# undef splay_tree_node_s
++# undef splay_tree_s
++# undef splay_tree_key_s
++# undef splay_tree_node
++# undef splay_tree
++# undef splay_tree_key
++# undef splay_compare
++# undef splay_tree_lookup
++# undef splay_tree_insert
++# undef splay_tree_remove
++# undef splay_tree_foreach
++# undef splay_tree_callback
++# undef splay_tree_c
++# endif
++#endif /* #ifndef splay_tree_c */
++
++#ifdef splay_tree_prefix
++# undef splay_tree_prefix
++#endif
+--- libgomp/oacc-plugin.c.jj 2016-07-13 16:57:13.481423196 +0200
++++ libgomp/oacc-plugin.c 2016-07-14 15:40:21.653151873 +0200
+@@ -0,0 +1,44 @@
++/* Copyright (C) 2014-2016 Free Software Foundation, Inc.
++
++ Contributed by Mentor Embedded.
++
++ This file is part of the GNU Offloading and Multi Processing Library
++ (libgomp).
++
++ Libgomp is free software; you can redistribute it and/or modify it
++ under the terms of the GNU General Public License as published by
++ the Free Software Foundation; either version 3, or (at your option)
++ any later version.
++
++ Libgomp is distributed in the hope that it will be useful, but WITHOUT ANY
++ WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
++ FOR A PARTICULAR PURPOSE. See the GNU General Public License for
++ more details.
++
++ Under Section 7 of GPL version 3, you are granted additional
++ permissions described in the GCC Runtime Library Exception, version
++ 3.1, as published by the Free Software Foundation.
++
++ You should have received a copy of the GNU General Public License and
++ a copy of the GCC Runtime Library Exception along with this program;
++ see the files COPYING3 and COPYING.RUNTIME respectively. If not, see
++ <http://www.gnu.org/licenses/>. */
++
++/* Initialize and register OpenACC dispatch table from libgomp plugin. */
++
++#include "libgomp.h"
++#include "oacc-plugin.h"
++#include "oacc-int.h"
++
++/* Stub: this host-only configuration has no asynchronous mappings to
++   unmap.  */
++
++void
++GOMP_PLUGIN_async_unmap_vars (void *ptr, int async)
++{
++  (void) ptr;
++  (void) async;
++}
++
++/* Return the target-specific part of the TLS data for the current thread. */
++
++void *
++GOMP_PLUGIN_acc_thread (void)
++{
++ return NULL;
++}
+--- libgomp/oacc-init.c.jj 2016-07-13 16:57:04.423535509 +0200
++++ libgomp/oacc-init.c 2016-07-14 19:06:41.679575688 +0200
+@@ -0,0 +1,640 @@
++/* OpenACC Runtime initialization routines
++
++ Copyright (C) 2013-2016 Free Software Foundation, Inc.
++
++ Contributed by Mentor Embedded.
++
++ This file is part of the GNU Offloading and Multi Processing Library
++ (libgomp).
++
++ Libgomp is free software; you can redistribute it and/or modify it
++ under the terms of the GNU General Public License as published by
++ the Free Software Foundation; either version 3, or (at your option)
++ any later version.
++
++ Libgomp is distributed in the hope that it will be useful, but WITHOUT ANY
++ WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
++ FOR A PARTICULAR PURPOSE. See the GNU General Public License for
++ more details.
++
++ Under Section 7 of GPL version 3, you are granted additional
++ permissions described in the GCC Runtime Library Exception, version
++ 3.1, as published by the Free Software Foundation.
++
++ You should have received a copy of the GNU General Public License and
++ a copy of the GCC Runtime Library Exception along with this program;
++ see the files COPYING3 and COPYING.RUNTIME respectively. If not, see
++ <http://www.gnu.org/licenses/>. */
++
++#include "libgomp.h"
++#include "oacc-int.h"
++#include "openacc.h"
++#include <assert.h>
++#include <stdlib.h>
++#include <strings.h>
++#include <stdbool.h>
++#include <string.h>
++
++/* This lock is used to protect access to cached_base_dev, dispatchers and
++ the (abstract) initialisation state of attached offloading devices. */
++
++static gomp_mutex_t acc_device_lock;
++
++/* A cached version of the dispatcher for the global "current" accelerator type,
++ e.g. used as the default when creating new host threads. This is the
++ device-type equivalent of goacc_device_num (which specifies which device to
++ use out of potentially several of the same type). If there are several
++ devices of a given type, this points at the first one. */
++
++static struct gomp_device_descr *cached_base_dev = NULL;
++
++#if defined HAVE_TLS || defined USE_EMUTLS
++__thread struct goacc_thread *goacc_tls_data;
++#else
++pthread_key_t goacc_tls_key;
++#endif
++static pthread_key_t goacc_cleanup_key;
++
++static struct goacc_thread *goacc_threads;
++static gomp_mutex_t goacc_thread_lock;
++
++/* An array of dispatchers for device types, indexed by the type. This array
++ only references "base" devices, and other instances of the same type are
++ found by simply indexing from each such device (which are stored linearly,
++ grouped by device in target.c:devices). */
++static struct gomp_device_descr *dispatchers[_ACC_device_hwm] = { 0 };
++
++attribute_hidden void
++goacc_register (struct gomp_device_descr *disp)
++{
++ /* Only register the 0th device here. */
++ if (disp->target_id != 0)
++ return;
++
++ gomp_mutex_lock (&acc_device_lock);
++
++ assert (acc_device_type (disp->type) != acc_device_none
++ && acc_device_type (disp->type) != acc_device_default
++ && acc_device_type (disp->type) != acc_device_not_host);
++ assert (!dispatchers[disp->type]);
++ dispatchers[disp->type] = disp;
++
++ gomp_mutex_unlock (&acc_device_lock);
++}
++
++static const char *
++name_of_acc_device_t (enum acc_device_t type)
++{
++ switch (type)
++ {
++ case acc_device_none: return "none";
++ case acc_device_default: return "default";
++ case acc_device_host: return "host";
++ case acc_device_not_host: return "not_host";
++ case acc_device_nvidia: return "nvidia";
++ default: gomp_fatal ("unknown device type %u", (unsigned) type);
++ }
++}
++
++/* ACC_DEVICE_LOCK must be held before calling this function. If FAIL_IS_ERROR
++ is true, this function raises an error if there are no devices of type D,
++ otherwise it returns NULL in that case. */
++
++static struct gomp_device_descr *
++resolve_device (acc_device_t d, bool fail_is_error)
++{
++ acc_device_t d_arg = d;
++
++ switch (d)
++ {
++ case acc_device_default:
++ {
++ if (goacc_device_type)
++ {
++	    /* Look up the named device.  */
++ if (!strcasecmp (goacc_device_type, "host"))
++ {
++ d = acc_device_host;
++ goto found;
++ }
++
++ if (fail_is_error)
++ {
++ gomp_mutex_unlock (&acc_device_lock);
++ gomp_fatal ("device type %s not supported", goacc_device_type);
++ }
++ else
++ return NULL;
++ }
++
++ /* No default device specified, so start scanning for any non-host
++ device that is available. */
++ d = acc_device_not_host;
++ }
++ /* FALLTHROUGH */
++
++ case acc_device_not_host:
++ if (d_arg == acc_device_default)
++ {
++ d = acc_device_host;
++ goto found;
++ }
++ if (fail_is_error)
++ {
++ gomp_mutex_unlock (&acc_device_lock);
++ gomp_fatal ("no device found");
++ }
++ else
++ return NULL;
++ break;
++
++ case acc_device_host:
++ break;
++
++ default:
++      if (d >= _ACC_device_hwm)
++ {
++ if (fail_is_error)
++ goto unsupported_device;
++ else
++ return NULL;
++ }
++ break;
++ }
++ found:
++
++ assert (d != acc_device_none
++ && d != acc_device_default
++ && d != acc_device_not_host);
++
++ if (dispatchers[d] == NULL && fail_is_error)
++ {
++ unsupported_device:
++ gomp_mutex_unlock (&acc_device_lock);
++ gomp_fatal ("device type %s not supported", name_of_acc_device_t (d));
++ }
++
++ return dispatchers[d];
++}
++
++/* Emit a suitable error if no device of a particular type is available, or
++ the given device number is out-of-range. */
++static void
++acc_dev_num_out_of_range (acc_device_t d, int ord, int ndevs)
++{
++ if (ndevs == 0)
++ gomp_fatal ("no devices of type %s available", name_of_acc_device_t (d));
++ else
++ gomp_fatal ("device %u out of range", ord);
++}
++
++/* This is called when plugins have been initialized, and serves to call
++ (indirectly) the target's device_init hook. Calling multiple times without
++ an intervening acc_shutdown_1 call is an error. ACC_DEVICE_LOCK must be
++ held before calling this function. */
++
++static struct gomp_device_descr *
++acc_init_1 (acc_device_t d)
++{
++ struct gomp_device_descr *base_dev, *acc_dev;
++ int ndevs;
++
++ base_dev = resolve_device (d, true);
++
++ ndevs = base_dev->get_num_devices_func ();
++
++ if (ndevs <= 0 || goacc_device_num >= ndevs)
++ acc_dev_num_out_of_range (d, goacc_device_num, ndevs);
++
++ acc_dev = &base_dev[goacc_device_num];
++
++ gomp_mutex_lock (&acc_dev->lock);
++ if (acc_dev->state == GOMP_DEVICE_INITIALIZED)
++ {
++ gomp_mutex_unlock (&acc_dev->lock);
++ gomp_fatal ("device already active");
++ }
++
++ gomp_init_device (acc_dev);
++ gomp_mutex_unlock (&acc_dev->lock);
++
++ return base_dev;
++}
++
++/* ACC_DEVICE_LOCK must be held before calling this function. */
++
++static void
++acc_shutdown_1 (acc_device_t d)
++{
++ struct gomp_device_descr *base_dev;
++ struct goacc_thread *walk;
++ int ndevs, i;
++ bool devices_active = false;
++
++ /* Get the base device for this device type. */
++ base_dev = resolve_device (d, true);
++
++ ndevs = base_dev->get_num_devices_func ();
++
++ gomp_mutex_lock (&goacc_thread_lock);
++
++ /* Free target-specific TLS data and close all devices. */
++ for (walk = goacc_threads; walk != NULL; walk = walk->next)
++ {
++ if (walk->target_tls)
++ base_dev->openacc.destroy_thread_data_func (walk->target_tls);
++
++ walk->target_tls = NULL;
++
++      /* If this happens, user code has done something odd: it has shut
++	 down while inside a host-fallback region.  */
++ if (walk->saved_bound_dev)
++ {
++ gomp_mutex_unlock (&goacc_thread_lock);
++ gomp_fatal ("shutdown during host fallback");
++ }
++
++ if (walk->dev)
++ {
++ gomp_mutex_lock (&walk->dev->lock);
++ gomp_free_memmap (&walk->dev->mem_map);
++ gomp_mutex_unlock (&walk->dev->lock);
++
++ walk->dev = NULL;
++ walk->base_dev = NULL;
++ }
++ }
++
++ gomp_mutex_unlock (&goacc_thread_lock);
++
++ /* Close all the devices of this type that have been opened. */
++ bool ret = true;
++ for (i = 0; i < ndevs; i++)
++ {
++ struct gomp_device_descr *acc_dev = &base_dev[i];
++ gomp_mutex_lock (&acc_dev->lock);
++ if (acc_dev->state == GOMP_DEVICE_INITIALIZED)
++ {
++ devices_active = true;
++ ret &= acc_dev->fini_device_func (acc_dev->target_id);
++ acc_dev->state = GOMP_DEVICE_UNINITIALIZED;
++ }
++ gomp_mutex_unlock (&acc_dev->lock);
++ }
++
++ if (!ret)
++ gomp_fatal ("device finalization failed");
++
++ if (!devices_active)
++ gomp_fatal ("no device initialized");
++}
++
++static struct goacc_thread *
++goacc_new_thread (void)
++{
++  struct goacc_thread *thr = gomp_malloc (sizeof (struct goacc_thread));
++
++#if defined HAVE_TLS || defined USE_EMUTLS
++ goacc_tls_data = thr;
++#else
++ pthread_setspecific (goacc_tls_key, thr);
++#endif
++
++ pthread_setspecific (goacc_cleanup_key, thr);
++
++ gomp_mutex_lock (&goacc_thread_lock);
++ thr->next = goacc_threads;
++ goacc_threads = thr;
++ gomp_mutex_unlock (&goacc_thread_lock);
++
++ return thr;
++}
++
++static void
++goacc_destroy_thread (void *data)
++{
++ struct goacc_thread *thr = data, *walk, *prev;
++
++ gomp_mutex_lock (&goacc_thread_lock);
++
++ if (thr)
++ {
++ struct gomp_device_descr *acc_dev = thr->dev;
++
++ if (acc_dev && thr->target_tls)
++ {
++ acc_dev->openacc.destroy_thread_data_func (thr->target_tls);
++ thr->target_tls = NULL;
++ }
++
++ assert (!thr->mapped_data);
++
++ /* Remove from thread list. */
++ for (prev = NULL, walk = goacc_threads; walk;
++ prev = walk, walk = walk->next)
++ if (walk == thr)
++ {
++ if (prev == NULL)
++ goacc_threads = walk->next;
++ else
++ prev->next = walk->next;
++
++ free (thr);
++
++ break;
++ }
++
++ assert (walk);
++ }
++
++ gomp_mutex_unlock (&goacc_thread_lock);
++}
++
++/* Use the ORD'th device instance for the current host thread (or -1 for the
++ current global default). The device (and the runtime) must be initialised
++ before calling this function. */
++
++void
++goacc_attach_host_thread_to_device (int ord)
++{
++ struct goacc_thread *thr = goacc_thread ();
++ struct gomp_device_descr *acc_dev = NULL, *base_dev = NULL;
++ int num_devices;
++
++ if (thr && thr->dev && (thr->dev->target_id == ord || ord < 0))
++ return;
++
++ if (ord < 0)
++ ord = goacc_device_num;
++
++ /* Decide which type of device to use. If the current thread has a device
++ type already (e.g. set by acc_set_device_type), use that, else use the
++ global default. */
++ if (thr && thr->base_dev)
++ base_dev = thr->base_dev;
++ else
++ {
++ assert (cached_base_dev);
++ base_dev = cached_base_dev;
++ }
++
++ num_devices = base_dev->get_num_devices_func ();
++ if (num_devices <= 0 || ord >= num_devices)
++ acc_dev_num_out_of_range (acc_device_type (base_dev->type), ord,
++ num_devices);
++
++ if (!thr)
++ thr = goacc_new_thread ();
++
++ thr->base_dev = base_dev;
++ thr->dev = acc_dev = &base_dev[ord];
++ thr->saved_bound_dev = NULL;
++
++ thr->target_tls
++ = acc_dev->openacc.create_thread_data_func (ord);
++
++ acc_dev->openacc.async_set_async_func (acc_async_sync);
++}
++
++/* OpenACC 2.0a (3.2.12, 3.2.13) doesn't specify whether the serialization of
++ init/shutdown is per-process or per-thread. We choose per-process. */
++
++void
++acc_init (acc_device_t d)
++{
++ gomp_mutex_lock (&acc_device_lock);
++
++ cached_base_dev = acc_init_1 (d);
++
++ gomp_mutex_unlock (&acc_device_lock);
++
++ goacc_attach_host_thread_to_device (-1);
++}
++
++ialias (acc_init)
++
++void
++acc_shutdown (acc_device_t d)
++{
++ gomp_mutex_lock (&acc_device_lock);
++
++ acc_shutdown_1 (d);
++
++ gomp_mutex_unlock (&acc_device_lock);
++}
++
++ialias (acc_shutdown)
++
++int
++acc_get_num_devices (acc_device_t d)
++{
++ int n = 0;
++ struct gomp_device_descr *acc_dev;
++
++ if (d == acc_device_none)
++ return 0;
++
++ gomp_mutex_lock (&acc_device_lock);
++ acc_dev = resolve_device (d, false);
++ gomp_mutex_unlock (&acc_device_lock);
++
++ if (!acc_dev)
++ return 0;
++
++ n = acc_dev->get_num_devices_func ();
++ if (n < 0)
++ n = 0;
++
++ return n;
++}
++
++ialias (acc_get_num_devices)
++
++/* Set the device type for the current thread only (using the current global
++ default device number), initialising that device if necessary. Also set the
++ default device type for new threads to D. */
++
++void
++acc_set_device_type (acc_device_t d)
++{
++ struct gomp_device_descr *base_dev, *acc_dev;
++ struct goacc_thread *thr = goacc_thread ();
++
++ gomp_mutex_lock (&acc_device_lock);
++
++ cached_base_dev = base_dev = resolve_device (d, true);
++ acc_dev = &base_dev[goacc_device_num];
++
++ gomp_mutex_lock (&acc_dev->lock);
++ if (acc_dev->state == GOMP_DEVICE_UNINITIALIZED)
++ gomp_init_device (acc_dev);
++ gomp_mutex_unlock (&acc_dev->lock);
++
++ gomp_mutex_unlock (&acc_device_lock);
++
++ /* We're changing device type: invalidate the current thread's dev and
++ base_dev pointers. */
++ if (thr && thr->base_dev != base_dev)
++ {
++ thr->base_dev = thr->dev = NULL;
++ }
++
++ goacc_attach_host_thread_to_device (-1);
++}
++
++ialias (acc_set_device_type)
++
++acc_device_t
++acc_get_device_type (void)
++{
++ acc_device_t res = acc_device_none;
++ struct gomp_device_descr *dev;
++ struct goacc_thread *thr = goacc_thread ();
++
++ if (thr && thr->base_dev)
++ res = acc_device_type (thr->base_dev->type);
++ else
++ {
++ gomp_mutex_lock (&acc_device_lock);
++ dev = resolve_device (acc_device_default, true);
++ gomp_mutex_unlock (&acc_device_lock);
++ res = acc_device_type (dev->type);
++ }
++
++ assert (res != acc_device_default
++ && res != acc_device_not_host);
++
++ return res;
++}
++
++ialias (acc_get_device_type)
++
++int
++acc_get_device_num (acc_device_t d)
++{
++ const struct gomp_device_descr *dev;
++ struct goacc_thread *thr = goacc_thread ();
++
++ if (d >= _ACC_device_hwm)
++ gomp_fatal ("unknown device type %u", (unsigned) d);
++
++ gomp_mutex_lock (&acc_device_lock);
++ dev = resolve_device (d, true);
++ gomp_mutex_unlock (&acc_device_lock);
++
++ if (thr && thr->base_dev == dev && thr->dev)
++ return thr->dev->target_id;
++
++ return goacc_device_num;
++}
++
++ialias (acc_get_device_num)
++
++void
++acc_set_device_num (int ord, acc_device_t d)
++{
++ struct gomp_device_descr *base_dev, *acc_dev;
++ int num_devices;
++
++ if (ord < 0)
++ ord = goacc_device_num;
++
++ if ((int) d == 0)
++ /* Set whatever device is being used by the current host thread to use
++ device instance ORD. It's unclear if this is supposed to affect other
++ host threads too (OpenACC 2.0 (3.2.4) acc_set_device_num). */
++ goacc_attach_host_thread_to_device (ord);
++ else
++ {
++ gomp_mutex_lock (&acc_device_lock);
++
++ cached_base_dev = base_dev = resolve_device (d, true);
++
++ num_devices = base_dev->get_num_devices_func ();
++
++ if (num_devices <= 0 || ord >= num_devices)
++ acc_dev_num_out_of_range (d, ord, num_devices);
++
++ acc_dev = &base_dev[ord];
++
++ gomp_mutex_lock (&acc_dev->lock);
++ if (acc_dev->state == GOMP_DEVICE_UNINITIALIZED)
++ gomp_init_device (acc_dev);
++ gomp_mutex_unlock (&acc_dev->lock);
++
++ gomp_mutex_unlock (&acc_device_lock);
++
++ goacc_attach_host_thread_to_device (ord);
++ }
++
++ goacc_device_num = ord;
++}
++
++ialias (acc_set_device_num)
++
++int
++acc_on_device (acc_device_t dev)
++{
++ return dev == acc_device_host || dev == acc_device_none;
++}
++
++ialias (acc_on_device)
++
++attribute_hidden void
++goacc_runtime_initialize (void)
++{
++ gomp_mutex_init (&acc_device_lock);
++
++#if !(defined HAVE_TLS || defined USE_EMUTLS)
++ pthread_key_create (&goacc_tls_key, NULL);
++#endif
++
++ pthread_key_create (&goacc_cleanup_key, goacc_destroy_thread);
++
++ cached_base_dev = NULL;
++
++ goacc_threads = NULL;
++ gomp_mutex_init (&goacc_thread_lock);
++
++ /* Initialize and register the 'host' device type. */
++ goacc_host_init ();
++}
++
++/* Compiler helper functions */
++
++attribute_hidden void
++goacc_save_and_set_bind (acc_device_t d)
++{
++ struct goacc_thread *thr = goacc_thread ();
++
++ assert (!thr->saved_bound_dev);
++
++ thr->saved_bound_dev = thr->dev;
++ thr->dev = dispatchers[d];
++}
++
++attribute_hidden void
++goacc_restore_bind (void)
++{
++ struct goacc_thread *thr = goacc_thread ();
++
++ thr->dev = thr->saved_bound_dev;
++ thr->saved_bound_dev = NULL;
++}
++
++/* This is called from any OpenACC support function that may need to implicitly
++ initialize the libgomp runtime, either globally or from a new host thread.
++ On exit "goacc_thread" will return a valid & populated thread block. */
++
++attribute_hidden void
++goacc_lazy_initialize (void)
++{
++ struct goacc_thread *thr = goacc_thread ();
++
++ if (thr && thr->dev)
++ return;
++
++ if (!cached_base_dev)
++ acc_init (acc_device_default);
++ else
++ goacc_attach_host_thread_to_device (-1);
++}
+--- libgomp/oacc-int.h.jj 2016-07-13 16:57:04.400535794 +0200
++++ libgomp/oacc-int.h 2016-07-13 16:57:04.400535794 +0200
+@@ -0,0 +1,106 @@
++/* OpenACC Runtime - internal declarations
++
++ Copyright (C) 2013-2016 Free Software Foundation, Inc.
++
++ Contributed by Mentor Embedded.
++
++ This file is part of the GNU Offloading and Multi Processing Library
++ (libgomp).
++
++ Libgomp is free software; you can redistribute it and/or modify it
++ under the terms of the GNU General Public License as published by
++ the Free Software Foundation; either version 3, or (at your option)
++ any later version.
++
++ Libgomp is distributed in the hope that it will be useful, but WITHOUT ANY
++ WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
++ FOR A PARTICULAR PURPOSE. See the GNU General Public License for
++ more details.
++
++ Under Section 7 of GPL version 3, you are granted additional
++ permissions described in the GCC Runtime Library Exception, version
++ 3.1, as published by the Free Software Foundation.
++
++ You should have received a copy of the GNU General Public License and
++ a copy of the GCC Runtime Library Exception along with this program;
++ see the files COPYING3 and COPYING.RUNTIME respectively. If not, see
++ <http://www.gnu.org/licenses/>. */
++
++/* This file contains data types and function declarations that are not
++ part of the official OpenACC user interface. There are declarations
++ in here that are part of the GNU OpenACC ABI, in that the compiler is
++ required to know about them and use them.
++
++   The convention is that the all-caps prefix "GOACC" is used to group
++   items that are part of the external ABI, and the lower-case prefix
++   "goacc" is used to group items that are completely private to the
++   library.  */
++
++#ifndef OACC_INT_H
++#define OACC_INT_H 1
++
++#include "openacc.h"
++#include "config.h"
++#include <stddef.h>
++#include <stdbool.h>
++#include <stdarg.h>
++
++#ifdef HAVE_ATTRIBUTE_VISIBILITY
++# pragma GCC visibility push(hidden)
++#endif
++
++static inline enum acc_device_t
++acc_device_type (enum offload_target_type type)
++{
++ return (enum acc_device_t) type;
++}
++
++struct goacc_thread
++{
++ /* The base device for the current thread. */
++ struct gomp_device_descr *base_dev;
++
++ /* The device for the current thread. */
++ struct gomp_device_descr *dev;
++
++ struct gomp_device_descr *saved_bound_dev;
++
++ /* This is a linked list of data mapped by the "acc data" pragma, following
++ strictly push/pop semantics according to lexical scope. */
++ struct target_mem_desc *mapped_data;
++
++ /* These structures form a list: this is the next thread in that list. */
++ struct goacc_thread *next;
++
++ /* Target-specific data (used by plugin). */
++ void *target_tls;
++};
++
++#if defined HAVE_TLS || defined USE_EMUTLS
++extern __thread struct goacc_thread *goacc_tls_data;
++static inline struct goacc_thread *
++goacc_thread (void)
++{
++ return goacc_tls_data;
++}
++#else
++extern pthread_key_t goacc_tls_key;
++static inline struct goacc_thread *
++goacc_thread (void)
++{
++ return pthread_getspecific (goacc_tls_key);
++}
++#endif
++
++void goacc_register (struct gomp_device_descr *) __GOACC_NOTHROW;
++void goacc_attach_host_thread_to_device (int);
++void goacc_runtime_initialize (void);
++void goacc_save_and_set_bind (acc_device_t);
++void goacc_restore_bind (void);
++void goacc_lazy_initialize (void);
++void goacc_host_init (void);
++
++#ifdef HAVE_ATTRIBUTE_VISIBILITY
++# pragma GCC visibility pop
++#endif
++
++#endif
+--- libgomp/oacc-host.c.jj 2016-07-13 16:57:13.489423096 +0200
++++ libgomp/oacc-host.c 2016-07-13 16:57:13.489423096 +0200
+@@ -0,0 +1,266 @@
++/* OpenACC Runtime Library: acc_device_host.
++
++ Copyright (C) 2013-2016 Free Software Foundation, Inc.
++
++ Contributed by Mentor Embedded.
++
++ This file is part of the GNU Offloading and Multi Processing Library
++ (libgomp).
++
++ Libgomp is free software; you can redistribute it and/or modify it
++ under the terms of the GNU General Public License as published by
++ the Free Software Foundation; either version 3, or (at your option)
++ any later version.
++
++ Libgomp is distributed in the hope that it will be useful, but WITHOUT ANY
++ WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
++ FOR A PARTICULAR PURPOSE. See the GNU General Public License for
++ more details.
++
++ Under Section 7 of GPL version 3, you are granted additional
++ permissions described in the GCC Runtime Library Exception, version
++ 3.1, as published by the Free Software Foundation.
++
++ You should have received a copy of the GNU General Public License and
++ a copy of the GCC Runtime Library Exception along with this program;
++ see the files COPYING3 and COPYING.RUNTIME respectively. If not, see
++ <http://www.gnu.org/licenses/>. */
++
++#include "libgomp.h"
++#include "oacc-int.h"
++#include "gomp-constants.h"
++
++#include <stdbool.h>
++#include <stddef.h>
++#include <stdint.h>
++
++static struct gomp_device_descr host_dispatch;
++
++static const char *
++host_get_name (void)
++{
++ return host_dispatch.name;
++}
++
++static unsigned int
++host_get_caps (void)
++{
++ return host_dispatch.capabilities;
++}
++
++static int
++host_get_type (void)
++{
++ return host_dispatch.type;
++}
++
++static int
++host_get_num_devices (void)
++{
++ return 1;
++}
++
++static bool
++host_init_device (int n __attribute__ ((unused)))
++{
++ return true;
++}
++
++static bool
++host_fini_device (int n __attribute__ ((unused)))
++{
++ return true;
++}
++
++static unsigned
++host_version (void)
++{
++ return GOMP_VERSION;
++}
++
++static int
++host_load_image (int n __attribute__ ((unused)),
++ unsigned v __attribute__ ((unused)),
++ const void *t __attribute__ ((unused)),
++ struct addr_pair **r __attribute__ ((unused)))
++{
++ return 0;
++}
++
++static bool
++host_unload_image (int n __attribute__ ((unused)),
++ unsigned v __attribute__ ((unused)),
++ const void *t __attribute__ ((unused)))
++{
++ return true;
++}
++
++static void *
++host_alloc (int n __attribute__ ((unused)), size_t s)
++{
++ return gomp_malloc (s);
++}
++
++static bool
++host_free (int n __attribute__ ((unused)), void *p)
++{
++ free (p);
++ return true;
++}
++
++static bool
++host_dev2host (int n __attribute__ ((unused)),
++ void *h __attribute__ ((unused)),
++ const void *d __attribute__ ((unused)),
++ size_t s __attribute__ ((unused)))
++{
++ return true;
++}
++
++static bool
++host_host2dev (int n __attribute__ ((unused)),
++ void *d __attribute__ ((unused)),
++ const void *h __attribute__ ((unused)),
++ size_t s __attribute__ ((unused)))
++{
++ return true;
++}
++
++static void
++host_run (int n __attribute__ ((unused)), void *fn_ptr, void *vars,
++ void **args __attribute__((unused)))
++{
++ void (*fn)(void *) = (void (*)(void *)) fn_ptr;
++
++ fn (vars);
++}
++
++static void
++host_openacc_exec (void (*fn) (void *),
++ size_t mapnum __attribute__ ((unused)),
++ void **hostaddrs,
++ void **devaddrs __attribute__ ((unused)),
++ int async __attribute__ ((unused)),
++		   unsigned *dims __attribute__ ((unused)),
++ void *targ_mem_desc __attribute__ ((unused)))
++{
++ fn (hostaddrs);
++}
++
++static void
++host_openacc_register_async_cleanup (void *targ_mem_desc __attribute__ ((unused)),
++ int async __attribute__ ((unused)))
++{
++}
++
++static int
++host_openacc_async_test (int async __attribute__ ((unused)))
++{
++ return 1;
++}
++
++static int
++host_openacc_async_test_all (void)
++{
++ return 1;
++}
++
++static void
++host_openacc_async_wait (int async __attribute__ ((unused)))
++{
++}
++
++static void
++host_openacc_async_wait_async (int async1 __attribute__ ((unused)),
++ int async2 __attribute__ ((unused)))
++{
++}
++
++static void
++host_openacc_async_wait_all (void)
++{
++}
++
++static void
++host_openacc_async_wait_all_async (int async __attribute__ ((unused)))
++{
++}
++
++static void
++host_openacc_async_set_async (int async __attribute__ ((unused)))
++{
++}
++
++static void *
++host_openacc_create_thread_data (int ord __attribute__ ((unused)))
++{
++ return NULL;
++}
++
++static void
++host_openacc_destroy_thread_data (void *tls_data __attribute__ ((unused)))
++{
++}
++
++static struct gomp_device_descr host_dispatch =
++ {
++ .name = "host",
++ .capabilities = (GOMP_OFFLOAD_CAP_SHARED_MEM
++ | GOMP_OFFLOAD_CAP_NATIVE_EXEC
++ | GOMP_OFFLOAD_CAP_OPENACC_200),
++ .target_id = 0,
++ .type = OFFLOAD_TARGET_TYPE_HOST,
++
++ .get_name_func = host_get_name,
++ .get_caps_func = host_get_caps,
++ .get_type_func = host_get_type,
++ .get_num_devices_func = host_get_num_devices,
++ .init_device_func = host_init_device,
++ .fini_device_func = host_fini_device,
++ .version_func = host_version,
++ .load_image_func = host_load_image,
++ .unload_image_func = host_unload_image,
++ .alloc_func = host_alloc,
++ .free_func = host_free,
++ .dev2host_func = host_dev2host,
++ .host2dev_func = host_host2dev,
++ .run_func = host_run,
++
++ .mem_map = { NULL },
++    /* .lock initialized in goacc_host_init.  */
++ .state = GOMP_DEVICE_UNINITIALIZED,
++
++ .openacc = {
++ .data_environ = NULL,
++
++ .exec_func = host_openacc_exec,
++
++ .register_async_cleanup_func = host_openacc_register_async_cleanup,
++
++ .async_test_func = host_openacc_async_test,
++ .async_test_all_func = host_openacc_async_test_all,
++ .async_wait_func = host_openacc_async_wait,
++ .async_wait_async_func = host_openacc_async_wait_async,
++ .async_wait_all_func = host_openacc_async_wait_all,
++ .async_wait_all_async_func = host_openacc_async_wait_all_async,
++ .async_set_async_func = host_openacc_async_set_async,
++
++ .create_thread_data_func = host_openacc_create_thread_data,
++ .destroy_thread_data_func = host_openacc_destroy_thread_data,
++
++ .cuda = {
++ .get_current_device_func = NULL,
++ .get_current_context_func = NULL,
++ .get_stream_func = NULL,
++ .set_stream_func = NULL,
++ }
++ }
++ };
++
++/* Initialize and register this device type. */
++void
++goacc_host_init (void)
++{
++ gomp_mutex_init (&host_dispatch.lock);
++ goacc_register (&host_dispatch);
++}
+--- libgomp/oacc-parallel.c.jj 2016-07-13 16:57:04.399535807 +0200
++++ libgomp/oacc-parallel.c 2016-07-14 18:53:06.694996381 +0200
+@@ -0,0 +1,241 @@
++/* Copyright (C) 2013-2016 Free Software Foundation, Inc.
++
++ Contributed by Mentor Embedded.
++
++ This file is part of the GNU Offloading and Multi Processing Library
++ (libgomp).
++
++ Libgomp is free software; you can redistribute it and/or modify it
++ under the terms of the GNU General Public License as published by
++ the Free Software Foundation; either version 3, or (at your option)
++ any later version.
++
++ Libgomp is distributed in the hope that it will be useful, but WITHOUT ANY
++ WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
++ FOR A PARTICULAR PURPOSE. See the GNU General Public License for
++ more details.
++
++ Under Section 7 of GPL version 3, you are granted additional
++ permissions described in the GCC Runtime Library Exception, version
++ 3.1, as published by the Free Software Foundation.
++
++ You should have received a copy of the GNU General Public License and
++ a copy of the GCC Runtime Library Exception along with this program;
++ see the files COPYING3 and COPYING.RUNTIME respectively. If not, see
++ <http://www.gnu.org/licenses/>. */
++
++/* This file handles OpenACC constructs. */
++
++#include "openacc.h"
++#include "libgomp.h"
++#include "libgomp_g.h"
++#include "gomp-constants.h"
++#include "oacc-int.h"
++#ifdef HAVE_INTTYPES_H
++# include <inttypes.h> /* For PRIu64. */
++#endif
++#include <string.h>
++#include <stdarg.h>
++#include <assert.h>
++
++static void goacc_wait (int async, int num_waits, va_list *ap);
++
++
++/* Launch a possibly offloaded function on DEVICE.  FN is the host
++   function address.  MAPNUM, HOSTADDRS, SIZES & KINDS describe the memory
++   blocks to be copied to/from the device.  Variadic arguments are
++   keyed optional parameters terminated with a zero.  */
++
++void
++GOACC_parallel_keyed (int device, void (*fn) (void *),
++ size_t mapnum, void **hostaddrs, size_t *sizes,
++ unsigned short *kinds, ...)
++{
++ bool host_fallback = device == GOMP_DEVICE_HOST_FALLBACK;
++ struct goacc_thread *thr;
++ struct gomp_device_descr *acc_dev;
++
++#ifdef HAVE_INTTYPES_H
++ gomp_debug (0, "%s: mapnum=%"PRIu64", hostaddrs=%p, size=%p, kinds=%p\n",
++ __FUNCTION__, (uint64_t) mapnum, hostaddrs, sizes, kinds);
++#else
++ gomp_debug (0, "%s: mapnum=%lu, hostaddrs=%p, sizes=%p, kinds=%p\n",
++ __FUNCTION__, (unsigned long) mapnum, hostaddrs, sizes, kinds);
++#endif
++ goacc_lazy_initialize ();
++
++ thr = goacc_thread ();
++ acc_dev = thr->dev;
++
++ /* Host fallback if "if" clause is false or if the current device is set to
++ the host. */
++ if (host_fallback)
++ {
++ goacc_save_and_set_bind (acc_device_host);
++ fn (hostaddrs);
++ goacc_restore_bind ();
++ return;
++ }
++ else if (acc_device_type (acc_dev->type) == acc_device_host)
++ {
++ fn (hostaddrs);
++ return;
++ }
++
++ /* acc_device_host is the only supported device type. */
++}
++
++/* Legacy entry point, only provide host execution. */
++
++void
++GOACC_parallel (int device, void (*fn) (void *),
++ size_t mapnum, void **hostaddrs, size_t *sizes,
++ unsigned short *kinds,
++ int num_gangs, int num_workers, int vector_length,
++ int async, int num_waits, ...)
++{
++ goacc_save_and_set_bind (acc_device_host);
++ fn (hostaddrs);
++ goacc_restore_bind ();
++}
++
++void
++GOACC_data_start (int device, size_t mapnum,
++ void **hostaddrs, size_t *sizes, unsigned short *kinds)
++{
++ goacc_lazy_initialize ();
++}
++
++void
++GOACC_data_end (void)
++{
++ gomp_debug (0, " %s: restore mappings\n", __FUNCTION__);
++ gomp_debug (0, " %s: mappings restored\n", __FUNCTION__);
++}
++
++void
++GOACC_enter_exit_data (int device, size_t mapnum,
++ void **hostaddrs, size_t *sizes, unsigned short *kinds,
++ int async, int num_waits, ...)
++{
++ goacc_lazy_initialize ();
++}
++
++static void
++goacc_wait (int async, int num_waits, va_list *ap)
++{
++ struct goacc_thread *thr = goacc_thread ();
++ struct gomp_device_descr *acc_dev = thr->dev;
++
++ while (num_waits--)
++ {
++ int qid = va_arg (*ap, int);
++
++ if (acc_async_test (qid))
++ continue;
++
++ if (async == acc_async_sync)
++ acc_wait (qid);
++ else if (qid == async)
++ ;/* If we're waiting on the same asynchronous queue as we're
++ launching on, the queue itself will order work as
++ required, so there's no need to wait explicitly. */
++ else
++ acc_dev->openacc.async_wait_async_func (qid, async);
++ }
++}
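++
++/* For instance, a construct compiled as "async (2) wait (1)" reaches
++   goacc_wait with ASYNC == 2 and a single wait argument QID == 1:
++   queue 1 is skipped if already drained, waited on synchronously if
++   ASYNC is acc_async_sync, left alone if it equals ASYNC, or chained
++   onto queue 2 through async_wait_async_func otherwise.  */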
++
++void
++GOACC_update (int device, size_t mapnum,
++ void **hostaddrs, size_t *sizes, unsigned short *kinds,
++ int async, int num_waits, ...)
++{
++ goacc_lazy_initialize ();
++}
++
++void
++GOACC_wait (int async, int num_waits, ...)
++{
++ if (num_waits)
++ {
++ va_list ap;
++
++ va_start (ap, num_waits);
++ goacc_wait (async, num_waits, &ap);
++ va_end (ap);
++ }
++ else if (async == acc_async_sync)
++ acc_wait_all ();
++ else if (async == acc_async_noval)
++ goacc_thread ()->dev->openacc.async_wait_all_async_func (acc_async_noval);
++}
++
++int
++GOACC_get_num_threads (void)
++{
++ return 1;
++}
++
++int
++GOACC_get_thread_num (void)
++{
++ return 0;
++}
++
++void
++GOACC_declare (int device, size_t mapnum,
++ void **hostaddrs, size_t *sizes, unsigned short *kinds)
++{
++ int i;
++
++ for (i = 0; i < mapnum; i++)
++ {
++ unsigned char kind = kinds[i] & 0xff;
++
++ if (kind == GOMP_MAP_POINTER || kind == GOMP_MAP_TO_PSET)
++ continue;
++
++ switch (kind)
++ {
++ case GOMP_MAP_FORCE_ALLOC:
++ case GOMP_MAP_FORCE_FROM:
++ case GOMP_MAP_FORCE_TO:
++ case GOMP_MAP_POINTER:
++ case GOMP_MAP_DELETE:
++ GOACC_enter_exit_data (device, 1, &hostaddrs[i], &sizes[i],
++ &kinds[i], 0, 0);
++ break;
++
++ case GOMP_MAP_FORCE_DEVICEPTR:
++ break;
++
++ case GOMP_MAP_ALLOC:
++ if (!acc_is_present (hostaddrs[i], sizes[i]))
++ GOACC_enter_exit_data (device, 1, &hostaddrs[i], &sizes[i],
++ &kinds[i], 0, 0);
++ break;
++
++ case GOMP_MAP_TO:
++ GOACC_enter_exit_data (device, 1, &hostaddrs[i], &sizes[i],
++ &kinds[i], 0, 0);
++
++ break;
++
++ case GOMP_MAP_FROM:
++ kinds[i] = GOMP_MAP_FORCE_FROM;
++ GOACC_enter_exit_data (device, 1, &hostaddrs[i], &sizes[i],
++ &kinds[i], 0, 0);
++ break;
++
++ case GOMP_MAP_FORCE_PRESENT:
++ if (!acc_is_present (hostaddrs[i], sizes[i]))
++ gomp_fatal ("[%p,%ld] is not mapped", hostaddrs[i],
++ (unsigned long) sizes[i]);
++ break;
++
++ default:
++ assert (0);
++ break;
++ }
++ }
++}
+--- libgomp/oacc-cuda.c.jj 2016-07-13 16:57:04.432535397 +0200
++++ libgomp/oacc-cuda.c 2016-07-13 16:57:04.432535397 +0200
+@@ -0,0 +1,86 @@
++/* OpenACC Runtime Library: CUDA support glue.
++
++ Copyright (C) 2014-2016 Free Software Foundation, Inc.
++
++ Contributed by Mentor Embedded.
++
++ This file is part of the GNU Offloading and Multi Processing Library
++ (libgomp).
++
++ Libgomp is free software; you can redistribute it and/or modify it
++ under the terms of the GNU General Public License as published by
++ the Free Software Foundation; either version 3, or (at your option)
++ any later version.
++
++ Libgomp is distributed in the hope that it will be useful, but WITHOUT ANY
++ WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
++ FOR A PARTICULAR PURPOSE. See the GNU General Public License for
++ more details.
++
++ Under Section 7 of GPL version 3, you are granted additional
++ permissions described in the GCC Runtime Library Exception, version
++ 3.1, as published by the Free Software Foundation.
++
++ You should have received a copy of the GNU General Public License and
++ a copy of the GCC Runtime Library Exception along with this program;
++ see the files COPYING3 and COPYING.RUNTIME respectively. If not, see
++ <http://www.gnu.org/licenses/>. */
++
++#include "openacc.h"
++#include "config.h"
++#include "libgomp.h"
++#include "oacc-int.h"
++
++void *
++acc_get_current_cuda_device (void)
++{
++ struct goacc_thread *thr = goacc_thread ();
++
++ if (thr && thr->dev && thr->dev->openacc.cuda.get_current_device_func)
++ return thr->dev->openacc.cuda.get_current_device_func ();
++
++ return NULL;
++}
++
++void *
++acc_get_current_cuda_context (void)
++{
++ struct goacc_thread *thr = goacc_thread ();
++
++ if (thr && thr->dev && thr->dev->openacc.cuda.get_current_context_func)
++ return thr->dev->openacc.cuda.get_current_context_func ();
++
++ return NULL;
++}
++
++void *
++acc_get_cuda_stream (int async)
++{
++ struct goacc_thread *thr = goacc_thread ();
++
++ if (async < 0)
++ return NULL;
++
++ if (thr && thr->dev && thr->dev->openacc.cuda.get_stream_func)
++ return thr->dev->openacc.cuda.get_stream_func (async);
++
++ return NULL;
++}
++
++int
++acc_set_cuda_stream (int async, void *stream)
++{
++ struct goacc_thread *thr;
++
++ if (async < 0 || stream == NULL)
++ return 0;
++
++ goacc_lazy_initialize ();
++
++ thr = goacc_thread ();
++
++ if (thr && thr->dev && thr->dev->openacc.cuda.set_stream_func)
++ return thr->dev->openacc.cuda.set_stream_func (async, stream);
++
++ return -1;
++}
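++
++/* Note that with only the host device registered, all four cuda.*_func
++   hooks stay NULL (see the host_dispatch table in oacc-host.c), so the
++   four entry points above degrade to returning NULL or -1; a real CUDA
++   plugin would supply these hooks.  */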
+--- libgomp/openacc_lib.h.jj 2016-07-13 16:57:13.486423134 +0200
++++ libgomp/openacc_lib.h 2016-07-13 16:57:13.486423134 +0200
+@@ -0,0 +1,382 @@
++! OpenACC Runtime Library Definitions. -*- mode: fortran -*-
++
++! Copyright (C) 2014-2016 Free Software Foundation, Inc.
++
++! Contributed by Tobias Burnus <burnus@net-b.de>
++! and Mentor Embedded.
++
++! This file is part of the GNU Offloading and Multi Processing Library
++! (libgomp).
++
++! Libgomp is free software; you can redistribute it and/or modify it
++! under the terms of the GNU General Public License as published by
++! the Free Software Foundation; either version 3, or (at your option)
++! any later version.
++
++! Libgomp is distributed in the hope that it will be useful, but WITHOUT ANY
++! WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
++! FOR A PARTICULAR PURPOSE. See the GNU General Public License for
++! more details.
++
++! Under Section 7 of GPL version 3, you are granted additional
++! permissions described in the GCC Runtime Library Exception, version
++! 3.1, as published by the Free Software Foundation.
++
++! You should have received a copy of the GNU General Public License and
++! a copy of the GCC Runtime Library Exception along with this program;
++! see the files COPYING3 and COPYING.RUNTIME respectively. If not, see
++! <http://www.gnu.org/licenses/>.
++
++! NOTE: Due to the use of dimension (..), the code only works when compiled
++! with -std=f2008ts/gnu/legacy but not with other standard settings.
++! Alternatively, the user can use the module version, which permits
++! compilation with -std=f95.
++
++ integer, parameter :: acc_device_kind = 4
++
++! Keep in sync with include/gomp-constants.h.
++ integer (acc_device_kind), parameter :: acc_device_none = 0
++ integer (acc_device_kind), parameter :: acc_device_default = 1
++ integer (acc_device_kind), parameter :: acc_device_host = 2
++! integer (acc_device_kind), parameter :: acc_device_host_nonshm = 3
++! removed.
++ integer (acc_device_kind), parameter :: acc_device_not_host = 4
++ integer (acc_device_kind), parameter :: acc_device_nvidia = 5
++
++ integer, parameter :: acc_handle_kind = 4
++
++! Keep in sync with include/gomp-constants.h.
++ integer (acc_handle_kind), parameter :: acc_async_noval = -1
++ integer (acc_handle_kind), parameter :: acc_async_sync = -2
++
++ integer, parameter :: openacc_version = 201306
++
++ interface acc_get_num_devices
++ function acc_get_num_devices_h (d)
++ import acc_device_kind
++ integer acc_get_num_devices_h
++ integer (acc_device_kind) d
++ end function
++ end interface
++
++ interface acc_set_device_type
++ subroutine acc_set_device_type_h (d)
++ import acc_device_kind
++ integer (acc_device_kind) d
++ end subroutine
++ end interface
++
++ interface acc_get_device_type
++ function acc_get_device_type_h ()
++ import acc_device_kind
++ integer (acc_device_kind) acc_get_device_type_h
++ end function
++ end interface
++
++ interface acc_set_device_num
++ subroutine acc_set_device_num_h (n, d)
++ import acc_device_kind
++ integer n
++ integer (acc_device_kind) d
++ end subroutine
++ end interface
++
++ interface acc_get_device_num
++ function acc_get_device_num_h (d)
++ import acc_device_kind
++ integer acc_get_device_num_h
++ integer (acc_device_kind) d
++ end function
++ end interface
++
++ interface acc_async_test
++ function acc_async_test_h (a)
++ logical acc_async_test_h
++ integer a
++ end function
++ end interface
++
++ interface acc_async_test_all
++ function acc_async_test_all_h ()
++ logical acc_async_test_all_h
++ end function
++ end interface
++
++ interface acc_wait
++ subroutine acc_wait_h (a)
++ integer a
++ end subroutine
++ end interface
++
++ interface acc_wait_async
++ subroutine acc_wait_async_h (a1, a2)
++ integer a1, a2
++ end subroutine
++ end interface
++
++ interface acc_wait_all
++ subroutine acc_wait_all_h ()
++ end subroutine
++ end interface
++
++ interface acc_wait_all_async
++ subroutine acc_wait_all_async_h (a)
++ integer a
++ end subroutine
++ end interface
++
++ interface acc_init
++ subroutine acc_init_h (devicetype)
++ import acc_device_kind
++ integer (acc_device_kind) devicetype
++ end subroutine
++ end interface
++
++ interface acc_shutdown
++ subroutine acc_shutdown_h (devicetype)
++ import acc_device_kind
++ integer (acc_device_kind) devicetype
++ end subroutine
++ end interface
++
++ interface acc_on_device
++ function acc_on_device_h (devicetype)
++ import acc_device_kind
++ logical acc_on_device_h
++ integer (acc_device_kind) devicetype
++ end function
++ end interface
++
++ ! acc_malloc: Only available in C/C++
++ ! acc_free: Only available in C/C++
++
++ interface acc_copyin
++ subroutine acc_copyin_32_h (a, len)
++ use iso_c_binding, only: c_int32_t
++ !GCC$ ATTRIBUTES NO_ARG_CHECK :: a
++ type (*), dimension (*) :: a
++ integer (c_int32_t) len
++ end subroutine
++
++ subroutine acc_copyin_64_h (a, len)
++ use iso_c_binding, only: c_int64_t
++ !GCC$ ATTRIBUTES NO_ARG_CHECK :: a
++ type (*), dimension (*) :: a
++ integer (c_int64_t) len
++ end subroutine
++
++ subroutine acc_copyin_array_h (a)
++ type (*), dimension (..), contiguous :: a
++ end subroutine
++ end interface
++
++ interface acc_present_or_copyin
++ subroutine acc_present_or_copyin_32_h (a, len)
++ use iso_c_binding, only: c_int32_t
++ !GCC$ ATTRIBUTES NO_ARG_CHECK :: a
++ type (*), dimension (*) :: a
++ integer (c_int32_t) len
++ end subroutine
++
++ subroutine acc_present_or_copyin_64_h (a, len)
++ use iso_c_binding, only: c_int64_t
++ !GCC$ ATTRIBUTES NO_ARG_CHECK :: a
++ type (*), dimension (*) :: a
++ integer (c_int64_t) len
++ end subroutine
++
++ subroutine acc_present_or_copyin_array_h (a)
++ type (*), dimension (..), contiguous :: a
++ end subroutine
++ end interface
++
++ interface acc_pcopyin
++ subroutine acc_pcopyin_32_h (a, len)
++ use iso_c_binding, only: c_int32_t
++ !GCC$ ATTRIBUTES NO_ARG_CHECK :: a
++ type (*), dimension (*) :: a
++ integer (c_int32_t) len
++ end subroutine
++
++ subroutine acc_pcopyin_64_h (a, len)
++ use iso_c_binding, only: c_int64_t
++ !GCC$ ATTRIBUTES NO_ARG_CHECK :: a
++ type (*), dimension (*) :: a
++ integer (c_int64_t) len
++ end subroutine
++
++ subroutine acc_pcopyin_array_h (a)
++ type (*), dimension (..), contiguous :: a
++ end subroutine
++ end interface
++
++ interface acc_create
++ subroutine acc_create_32_h (a, len)
++ use iso_c_binding, only: c_int32_t
++ !GCC$ ATTRIBUTES NO_ARG_CHECK :: a
++ type (*), dimension (*) :: a
++ integer (c_int32_t) len
++ end subroutine
++
++ subroutine acc_create_64_h (a, len)
++ use iso_c_binding, only: c_int64_t
++ !GCC$ ATTRIBUTES NO_ARG_CHECK :: a
++ type (*), dimension (*) :: a
++ integer (c_int64_t) len
++ end subroutine
++
++ subroutine acc_create_array_h (a)
++ type (*), dimension (..), contiguous :: a
++ end subroutine
++ end interface
++
++ interface acc_present_or_create
++ subroutine acc_present_or_create_32_h (a, len)
++ use iso_c_binding, only: c_int32_t
++ !GCC$ ATTRIBUTES NO_ARG_CHECK :: a
++ type (*), dimension (*) :: a
++ integer (c_int32_t) len
++ end subroutine
++
++ subroutine acc_present_or_create_64_h (a, len)
++ use iso_c_binding, only: c_int64_t
++ !GCC$ ATTRIBUTES NO_ARG_CHECK :: a
++ type (*), dimension (*) :: a
++ integer (c_int64_t) len
++ end subroutine
++
++ subroutine acc_present_or_create_array_h (a)
++ type (*), dimension (..), contiguous :: a
++ end subroutine
++ end interface
++
++ interface acc_pcreate
++ subroutine acc_pcreate_32_h (a, len)
++ use iso_c_binding, only: c_int32_t
++ !GCC$ ATTRIBUTES NO_ARG_CHECK :: a
++ type (*), dimension (*) :: a
++ integer (c_int32_t) len
++ end subroutine
++
++ subroutine acc_pcreate_64_h (a, len)
++ use iso_c_binding, only: c_int64_t
++ !GCC$ ATTRIBUTES NO_ARG_CHECK :: a
++ type (*), dimension (*) :: a
++ integer (c_int64_t) len
++ end subroutine
++
++ subroutine acc_pcreate_array_h (a)
++ type (*), dimension (..), contiguous :: a
++ end subroutine
++ end interface
++
++ interface acc_copyout
++ subroutine acc_copyout_32_h (a, len)
++ use iso_c_binding, only: c_int32_t
++ !GCC$ ATTRIBUTES NO_ARG_CHECK :: a
++ type (*), dimension (*) :: a
++ integer (c_int32_t) len
++ end subroutine
++
++ subroutine acc_copyout_64_h (a, len)
++ use iso_c_binding, only: c_int64_t
++ !GCC$ ATTRIBUTES NO_ARG_CHECK :: a
++ type (*), dimension (*) :: a
++ integer (c_int64_t) len
++ end subroutine
++
++ subroutine acc_copyout_array_h (a)
++ type (*), dimension (..), contiguous :: a
++ end subroutine
++ end interface
++
++ interface acc_delete
++ subroutine acc_delete_32_h (a, len)
++ use iso_c_binding, only: c_int32_t
++ !GCC$ ATTRIBUTES NO_ARG_CHECK :: a
++ type (*), dimension (*) :: a
++ integer (c_int32_t) len
++ end subroutine
++
++ subroutine acc_delete_64_h (a, len)
++ use iso_c_binding, only: c_int64_t
++ !GCC$ ATTRIBUTES NO_ARG_CHECK :: a
++ type (*), dimension (*) :: a
++ integer (c_int64_t) len
++ end subroutine
++
++ subroutine acc_delete_array_h (a)
++ type (*), dimension (..), contiguous :: a
++ end subroutine
++ end interface
++
++ interface acc_update_device
++ subroutine acc_update_device_32_h (a, len)
++ use iso_c_binding, only: c_int32_t
++ !GCC$ ATTRIBUTES NO_ARG_CHECK :: a
++ type (*), dimension (*) :: a
++ integer (c_int32_t) len
++ end subroutine
++
++ subroutine acc_update_device_64_h (a, len)
++ use iso_c_binding, only: c_int64_t
++ !GCC$ ATTRIBUTES NO_ARG_CHECK :: a
++ type (*), dimension (*) :: a
++ integer (c_int64_t) len
++ end subroutine
++
++ subroutine acc_update_device_array_h (a)
++ type (*), dimension (..), contiguous :: a
++ end subroutine
++ end interface
++
++ interface acc_update_self
++ subroutine acc_update_self_32_h (a, len)
++ use iso_c_binding, only: c_int32_t
++ !GCC$ ATTRIBUTES NO_ARG_CHECK :: a
++ type (*), dimension (*) :: a
++ integer (c_int32_t) len
++ end subroutine
++
++ subroutine acc_update_self_64_h (a, len)
++ use iso_c_binding, only: c_int64_t
++ !GCC$ ATTRIBUTES NO_ARG_CHECK :: a
++ type (*), dimension (*) :: a
++ integer (c_int64_t) len
++ end subroutine
++
++ subroutine acc_update_self_array_h (a)
++ type (*), dimension (..), contiguous :: a
++ end subroutine
++ end interface
++
++ ! acc_map_data: Only available in C/C++
++ ! acc_unmap_data: Only available in C/C++
++ ! acc_deviceptr: Only available in C/C++
++ ! acc_hostptr: Only available in C/C++
++
++ interface acc_is_present
++ function acc_is_present_32_h (a, len)
++ use iso_c_binding, only: c_int32_t
++ logical acc_is_present_32_h
++ !GCC$ ATTRIBUTES NO_ARG_CHECK :: a
++ type (*), dimension (*) :: a
++ integer (c_int32_t) len
++ end function
++
++ function acc_is_present_64_h (a, len)
++ use iso_c_binding, only: c_int64_t
++ logical acc_is_present_64_h
++ !GCC$ ATTRIBUTES NO_ARG_CHECK :: a
++ type (*), dimension (*) :: a
++ integer (c_int64_t) len
++ end function
++
++ function acc_is_present_array_h (a)
++ logical acc_is_present_array_h
++ type (*), dimension (..), contiguous :: a
++ end function
++ end interface
++
++ ! acc_memcpy_to_device: Only available in C/C++
++ ! acc_memcpy_from_device: Only available in C/C++
+--- libgomp/gomp-constants.h.jj 2016-07-14 16:02:47.212545826 +0200
++++ libgomp/gomp-constants.h 2016-05-26 21:04:40.000000000 +0200
+@@ -0,0 +1,259 @@
++/* Communication between GCC and libgomp.
++
++ Copyright (C) 2014-2015 Free Software Foundation, Inc.
++
++ Contributed by Mentor Embedded.
++
++ This file is part of the GNU Offloading and Multi Processing Library
++ (libgomp).
++
++ Libgomp is free software; you can redistribute it and/or modify it
++ under the terms of the GNU General Public License as published by
++ the Free Software Foundation; either version 3, or (at your option)
++ any later version.
++
++ Libgomp is distributed in the hope that it will be useful, but WITHOUT ANY
++ WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
++ FOR A PARTICULAR PURPOSE. See the GNU General Public License for
++ more details.
++
++ Under Section 7 of GPL version 3, you are granted additional
++ permissions described in the GCC Runtime Library Exception, version
++ 3.1, as published by the Free Software Foundation.
++
++ You should have received a copy of the GNU General Public License and
++ a copy of the GCC Runtime Library Exception along with this program;
++ see the files COPYING3 and COPYING.RUNTIME respectively. If not, see
++ <http://www.gnu.org/licenses/>. */
++
++#ifndef GOMP_CONSTANTS_H
++#define GOMP_CONSTANTS_H 1
++
++/* Memory mapping types. */
++
++/* One byte. */
++#define GOMP_MAP_LAST (1 << 8)
++
++#define GOMP_MAP_FLAG_TO (1 << 0)
++#define GOMP_MAP_FLAG_FROM (1 << 1)
++/* Special map kinds, enumerated starting here. */
++#define GOMP_MAP_FLAG_SPECIAL_0 (1 << 2)
++#define GOMP_MAP_FLAG_SPECIAL_1 (1 << 3)
++#define GOMP_MAP_FLAG_SPECIAL_2 (1 << 4)
++#define GOMP_MAP_FLAG_SPECIAL (GOMP_MAP_FLAG_SPECIAL_1 \
++ | GOMP_MAP_FLAG_SPECIAL_0)
++/* Flag to force a specific behavior (or else, trigger a run-time error). */
++#define GOMP_MAP_FLAG_FORCE (1 << 7)
++
++enum gomp_map_kind
++ {
++ /* If not already present, allocate. */
++ GOMP_MAP_ALLOC = 0,
++ /* ..., and copy to device. */
++ GOMP_MAP_TO = (GOMP_MAP_ALLOC | GOMP_MAP_FLAG_TO),
++ /* ..., and copy from device. */
++ GOMP_MAP_FROM = (GOMP_MAP_ALLOC | GOMP_MAP_FLAG_FROM),
++ /* ..., and copy to and from device. */
++ GOMP_MAP_TOFROM = (GOMP_MAP_TO | GOMP_MAP_FROM),
++ /* The following kind is an internal only map kind, used for pointer based
++ array sections. OMP_CLAUSE_SIZE for these is not the pointer size,
++ which is implicitly POINTER_SIZE_UNITS, but the bias. */
++ GOMP_MAP_POINTER = (GOMP_MAP_FLAG_SPECIAL_0 | 0),
++    /* Also internal, behaves like GOMP_MAP_TO, but additionally any
++       GOMP_MAP_POINTER records immediately following it whose addresses
++       fall into that range will not be ignored if GOMP_MAP_TO_PSET wasn't
++       mapped already. */
++ GOMP_MAP_TO_PSET = (GOMP_MAP_FLAG_SPECIAL_0 | 1),
++ /* Must already be present. */
++ GOMP_MAP_FORCE_PRESENT = (GOMP_MAP_FLAG_SPECIAL_0 | 2),
++ /* Deallocate a mapping, without copying from device. */
++ GOMP_MAP_DELETE = (GOMP_MAP_FLAG_SPECIAL_0 | 3),
++ /* Is a device pointer. OMP_CLAUSE_SIZE for these is unused; is implicitly
++ POINTER_SIZE_UNITS. */
++ GOMP_MAP_FORCE_DEVICEPTR = (GOMP_MAP_FLAG_SPECIAL_1 | 0),
++    /* OpenACC device_resident. */
++    GOMP_MAP_DEVICE_RESIDENT = (GOMP_MAP_FLAG_SPECIAL_1 | 1),
++    /* OpenACC link. */
++    GOMP_MAP_LINK = (GOMP_MAP_FLAG_SPECIAL_1 | 2),
++    /* Do not map, copy bits for firstprivate instead. */
++    GOMP_MAP_FIRSTPRIVATE = (GOMP_MAP_FLAG_SPECIAL | 0),
++    /* Similarly, but store the value in the pointer itself rather than
++       in the memory pointed to by the pointer. */
++ GOMP_MAP_FIRSTPRIVATE_INT = (GOMP_MAP_FLAG_SPECIAL | 1),
++    /* Translate the pointer's host address into its device address and
++       copy that back to the host. */
++ GOMP_MAP_USE_DEVICE_PTR = (GOMP_MAP_FLAG_SPECIAL | 2),
++ /* Allocate a zero length array section. Prefer next non-zero length
++ mapping over previous non-zero length mapping over zero length mapping
++ at the address. If not already mapped, do nothing (and pointer translate
++ to NULL). */
++ GOMP_MAP_ZERO_LEN_ARRAY_SECTION = (GOMP_MAP_FLAG_SPECIAL | 3),
++ /* Allocate. */
++ GOMP_MAP_FORCE_ALLOC = (GOMP_MAP_FLAG_FORCE | GOMP_MAP_ALLOC),
++ /* ..., and copy to device. */
++ GOMP_MAP_FORCE_TO = (GOMP_MAP_FLAG_FORCE | GOMP_MAP_TO),
++ /* ..., and copy from device. */
++ GOMP_MAP_FORCE_FROM = (GOMP_MAP_FLAG_FORCE | GOMP_MAP_FROM),
++ /* ..., and copy to and from device. */
++ GOMP_MAP_FORCE_TOFROM = (GOMP_MAP_FLAG_FORCE | GOMP_MAP_TOFROM),
++ /* If not already present, allocate. And unconditionally copy to
++ device. */
++ GOMP_MAP_ALWAYS_TO = (GOMP_MAP_FLAG_SPECIAL_2 | GOMP_MAP_TO),
++ /* If not already present, allocate. And unconditionally copy from
++ device. */
++ GOMP_MAP_ALWAYS_FROM = (GOMP_MAP_FLAG_SPECIAL_2
++ | GOMP_MAP_FROM),
++ /* If not already present, allocate. And unconditionally copy to and from
++ device. */
++ GOMP_MAP_ALWAYS_TOFROM = (GOMP_MAP_FLAG_SPECIAL_2
++ | GOMP_MAP_TOFROM),
++ /* Map a sparse struct; the address is the base of the structure, alignment
++       its required alignment, and size is the number of adjacent entries
++ that belong to the struct. The adjacent entries should be sorted by
++ increasing address, so it is easy to determine lowest needed address
++ (address of the first adjacent entry) and highest needed address
++ (address of the last adjacent entry plus its size). */
++ GOMP_MAP_STRUCT = (GOMP_MAP_FLAG_SPECIAL_2
++ | GOMP_MAP_FLAG_SPECIAL | 0),
++ /* On a location of a pointer/reference that is assumed to be already mapped
++       earlier, store the translated address of the preceding mapping.
++ No refcount is bumped by this, and the store is done unconditionally. */
++ GOMP_MAP_ALWAYS_POINTER = (GOMP_MAP_FLAG_SPECIAL_2
++ | GOMP_MAP_FLAG_SPECIAL | 1),
++ /* Forced deallocation of zero length array section. */
++ GOMP_MAP_DELETE_ZERO_LEN_ARRAY_SECTION
++ = (GOMP_MAP_FLAG_SPECIAL_2
++ | GOMP_MAP_FLAG_SPECIAL | 3),
++ /* Decrement usage count and deallocate if zero. */
++ GOMP_MAP_RELEASE = (GOMP_MAP_FLAG_SPECIAL_2
++ | GOMP_MAP_DELETE),
++
++ /* Internal to GCC, not used in libgomp. */
++ /* Do not map, but pointer assign a pointer instead. */
++ GOMP_MAP_FIRSTPRIVATE_POINTER = (GOMP_MAP_LAST | 1),
++ /* Do not map, but pointer assign a reference instead. */
++ GOMP_MAP_FIRSTPRIVATE_REFERENCE = (GOMP_MAP_LAST | 2)
++ };
++
++#define GOMP_MAP_COPY_TO_P(X) \
++ (!((X) & GOMP_MAP_FLAG_SPECIAL) \
++ && ((X) & GOMP_MAP_FLAG_TO))
++
++#define GOMP_MAP_COPY_FROM_P(X) \
++ (!((X) & GOMP_MAP_FLAG_SPECIAL) \
++ && ((X) & GOMP_MAP_FLAG_FROM))
++
++#define GOMP_MAP_POINTER_P(X) \
++ ((X) == GOMP_MAP_POINTER)
++
++#define GOMP_MAP_ALWAYS_TO_P(X) \
++ (((X) == GOMP_MAP_ALWAYS_TO) || ((X) == GOMP_MAP_ALWAYS_TOFROM))
++
++#define GOMP_MAP_ALWAYS_FROM_P(X) \
++ (((X) == GOMP_MAP_ALWAYS_FROM) || ((X) == GOMP_MAP_ALWAYS_TOFROM))
++
++#define GOMP_MAP_ALWAYS_P(X) \
++ (GOMP_MAP_ALWAYS_TO_P (X) || ((X) == GOMP_MAP_ALWAYS_FROM))
++
++
++/* Asynchronous behavior. Keep in sync with
++ libgomp/{openacc.h,openacc.f90,openacc_lib.h}:acc_async_t. */
++
++#define GOMP_ASYNC_NOVAL -1
++#define GOMP_ASYNC_SYNC -2
++
++
++/* Device codes. Keep in sync with
++ libgomp/{openacc.h,openacc.f90,openacc_lib.h}:acc_device_t as well as
++ libgomp/libgomp-plugin.h. */
++#define GOMP_DEVICE_NONE 0
++#define GOMP_DEVICE_DEFAULT 1
++#define GOMP_DEVICE_HOST 2
++/* #define GOMP_DEVICE_HOST_NONSHM 3 removed. */
++#define GOMP_DEVICE_NOT_HOST 4
++#define GOMP_DEVICE_NVIDIA_PTX 5
++#define GOMP_DEVICE_INTEL_MIC 6
++#define GOMP_DEVICE_HSA 7
++
++#define GOMP_DEVICE_ICV -1
++#define GOMP_DEVICE_HOST_FALLBACK -2
++
++/* GOMP_task/GOMP_taskloop* flags argument. */
++#define GOMP_TASK_FLAG_UNTIED (1 << 0)
++#define GOMP_TASK_FLAG_FINAL (1 << 1)
++#define GOMP_TASK_FLAG_MERGEABLE (1 << 2)
++#define GOMP_TASK_FLAG_DEPEND (1 << 3)
++#define GOMP_TASK_FLAG_PRIORITY (1 << 4)
++#define GOMP_TASK_FLAG_UP (1 << 8)
++#define GOMP_TASK_FLAG_GRAINSIZE (1 << 9)
++#define GOMP_TASK_FLAG_IF (1 << 10)
++#define GOMP_TASK_FLAG_NOGROUP (1 << 11)
++
++/* GOMP_target{_ext,update_ext,enter_exit_data} flags argument. */
++#define GOMP_TARGET_FLAG_NOWAIT (1 << 0)
++#define GOMP_TARGET_FLAG_EXIT_DATA (1 << 1)
++/* Internal to libgomp. */
++#define GOMP_TARGET_FLAG_UPDATE (1U << 31)
++
++/* Versions of libgomp and device-specific plugins. GOMP_VERSION
++ should be incremented whenever an ABI-incompatible change is introduced
++ to the plugin interface defined in libgomp/libgomp.h. */
++#define GOMP_VERSION 1
++#define GOMP_VERSION_NVIDIA_PTX 1
++#define GOMP_VERSION_INTEL_MIC 0
++#define GOMP_VERSION_HSA 0
++
++#define GOMP_VERSION_PACK(LIB, DEV) (((LIB) << 16) | (DEV))
++#define GOMP_VERSION_LIB(PACK) (((PACK) >> 16) & 0xffff)
++#define GOMP_VERSION_DEV(PACK) ((PACK) & 0xffff)
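++
++/* For example, GOMP_VERSION_PACK (GOMP_VERSION, GOMP_VERSION_NVIDIA_PTX)
++   currently packs to 0x10001, from which GOMP_VERSION_LIB recovers the
++   library version 1 and GOMP_VERSION_DEV the device version 1.  */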
++
++#define GOMP_DIM_GANG 0
++#define GOMP_DIM_WORKER 1
++#define GOMP_DIM_VECTOR 2
++#define GOMP_DIM_MAX 3
++#define GOMP_DIM_MASK(X) (1u << (X))
++
++/* Variadic launch arguments. End of list is marked by a zero. */
++#define GOMP_LAUNCH_DIM 1 /* Launch dimensions, op = mask */
++#define GOMP_LAUNCH_ASYNC 2 /* Async, op = cst val if not MAX */
++#define GOMP_LAUNCH_WAIT 3 /* Waits, op = num waits. */
++#define GOMP_LAUNCH_CODE_SHIFT 28
++#define GOMP_LAUNCH_DEVICE_SHIFT 16
++#define GOMP_LAUNCH_OP_SHIFT 0
++#define GOMP_LAUNCH_PACK(CODE,DEVICE,OP) \
++ (((CODE) << GOMP_LAUNCH_CODE_SHIFT) \
++ | ((DEVICE) << GOMP_LAUNCH_DEVICE_SHIFT) \
++ | ((OP) << GOMP_LAUNCH_OP_SHIFT))
++#define GOMP_LAUNCH_CODE(X) (((X) >> GOMP_LAUNCH_CODE_SHIFT) & 0xf)
++#define GOMP_LAUNCH_DEVICE(X) (((X) >> GOMP_LAUNCH_DEVICE_SHIFT) & 0xfff)
++#define GOMP_LAUNCH_OP(X) (((X) >> GOMP_LAUNCH_OP_SHIFT) & 0xffff)
++#define GOMP_LAUNCH_OP_MAX 0xffff
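++
++/* For example, GOMP_LAUNCH_PACK (GOMP_LAUNCH_ASYNC, 0, 5) is
++   (2 << 28) | (0 << 16) | 5, i.e. 0x20000005, from which
++   GOMP_LAUNCH_CODE extracts 2 and GOMP_LAUNCH_OP extracts 5 again.  */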
++
++/* Bitmask to apply in order to find out the intended device of a target
++ argument. */
++#define GOMP_TARGET_ARG_DEVICE_MASK ((1 << 7) - 1)
++/* The target argument is significant for all devices. */
++#define GOMP_TARGET_ARG_DEVICE_ALL 0
++
++/* Flag set when the value is passed in the subsequent element of the
++   device-specific argument values. */
++#define GOMP_TARGET_ARG_SUBSEQUENT_PARAM (1 << 7)
++
++/* Bitmask to apply to a target argument to find out the value identifier. */
++#define GOMP_TARGET_ARG_ID_MASK (((1 << 8) - 1) << 8)
++/* Target argument index of NUM_TEAMS. */
++#define GOMP_TARGET_ARG_NUM_TEAMS (1 << 8)
++/* Target argument index of THREAD_LIMIT. */
++#define GOMP_TARGET_ARG_THREAD_LIMIT (2 << 8)
++
++/* If the value is directly embedded in the target argument, it must fit
++   into 16 bits at most and is shifted by this many bits. */
++#define GOMP_TARGET_ARG_VALUE_SHIFT 16
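++
++/* For example, a NUM_TEAMS value of 4 that applies to all devices is
++   encoded as GOMP_TARGET_ARG_DEVICE_ALL | GOMP_TARGET_ARG_NUM_TEAMS
++   | (4 << GOMP_TARGET_ARG_VALUE_SHIFT), i.e. 0x40100.  */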
++
++/* HSA specific data structures. */
++
++/* Identifiers of device-specific target arguments. */
++#define GOMP_TARGET_ARG_HSA_KERNEL_ATTRIBUTES (1 << 8)
++
++#endif
+--- libgomp/oacc-mem.c.jj 2016-07-13 16:57:04.433535385 +0200
++++ libgomp/oacc-mem.c 2016-07-14 15:39:44.644631308 +0200
+@@ -0,0 +1,204 @@
++/* OpenACC Runtime initialization routines
++
++ Copyright (C) 2013-2016 Free Software Foundation, Inc.
++
++ Contributed by Mentor Embedded.
++
++ This file is part of the GNU Offloading and Multi Processing Library
++ (libgomp).
++
++ Libgomp is free software; you can redistribute it and/or modify it
++ under the terms of the GNU General Public License as published by
++ the Free Software Foundation; either version 3, or (at your option)
++ any later version.
++
++ Libgomp is distributed in the hope that it will be useful, but WITHOUT ANY
++ WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
++ FOR A PARTICULAR PURPOSE. See the GNU General Public License for
++ more details.
++
++ Under Section 7 of GPL version 3, you are granted additional
++ permissions described in the GCC Runtime Library Exception, version
++ 3.1, as published by the Free Software Foundation.
++
++ You should have received a copy of the GNU General Public License and
++ a copy of the GCC Runtime Library Exception along with this program;
++ see the files COPYING3 and COPYING.RUNTIME respectively. If not, see
++ <http://www.gnu.org/licenses/>. */
++
++#include "openacc.h"
++#include "config.h"
++#include "libgomp.h"
++#include "gomp-constants.h"
++#include "oacc-int.h"
++#include <stdint.h>
++#include <string.h>
++#include <assert.h>
++
++/* OpenACC is silent on how memory exhaustion is indicated. We return
++ NULL. */
++
++void *
++acc_malloc (size_t s)
++{
++ if (!s)
++ return NULL;
++
++ goacc_lazy_initialize ();
++ return malloc (s);
++}
++
++/* OpenACC 2.0a (3.2.16) doesn't specify what to do in the event
++   the device address is mapped. We choose to check whether it is
++   mapped, and if it is, to unmap it. */
++void
++acc_free (void *d)
++{
++  free (d);
++}
++
++void
++acc_memcpy_to_device (void *d, void *h, size_t s)
++{
++ memmove (d, h, s);
++}
++
++void
++acc_memcpy_from_device (void *h, void *d, size_t s)
++{
++ memmove (h, d, s);
++}
++
++/* Return the device pointer that corresponds to host data H. Or NULL
++ if no mapping. */
++
++void *
++acc_deviceptr (void *h)
++{
++ goacc_lazy_initialize ();
++ return h;
++}
++
++/* Return the host pointer that corresponds to device data D. Or NULL
++ if no mapping. */
++
++void *
++acc_hostptr (void *d)
++{
++ goacc_lazy_initialize ();
++ return d;
++}
++
++/* Return 1 if host data [H,+S] is present on the device. */
++
++int
++acc_is_present (void *h, size_t s)
++{
++ if (!s || !h)
++ return 0;
++
++ goacc_lazy_initialize ();
++ return h != NULL;
++}
++
++/* Create a mapping for host [H,+S] -> device [D,+S] */
++
++void
++acc_map_data (void *h, void *d, size_t s)
++{
++ goacc_lazy_initialize ();
++
++ if (d != h)
++ gomp_fatal ("cannot map data on shared-memory system");
++}
++
++void
++acc_unmap_data (void *h)
++{
++}
++
++#define FLAG_PRESENT (1 << 0)
++#define FLAG_CREATE (1 << 1)
++#define FLAG_COPY (1 << 2)
++
++static void *
++present_create_copy (unsigned f, void *h, size_t s)
++{
++ if (!h || !s)
++ gomp_fatal ("[%p,+%d] is a bad range", (void *)h, (int)s);
++
++ goacc_lazy_initialize ();
++ return h;
++}
++
++void *
++acc_create (void *h, size_t s)
++{
++ return present_create_copy (FLAG_CREATE, h, s);
++}
++
++void *
++acc_copyin (void *h, size_t s)
++{
++ return present_create_copy (FLAG_CREATE | FLAG_COPY, h, s);
++}
++
++void *
++acc_present_or_create (void *h, size_t s)
++{
++ return present_create_copy (FLAG_PRESENT | FLAG_CREATE, h, s);
++}
++
++void *
++acc_present_or_copyin (void *h, size_t s)
++{
++ return present_create_copy (FLAG_PRESENT | FLAG_CREATE | FLAG_COPY, h, s);
++}
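++
++/* On this shared-memory host implementation the data API above is the
++   identity mapping: for example, acc_copyin (h, s) merely validates the
++   range and returns H itself, and acc_is_present (h, s) is true for any
++   non-NULL H with non-zero S.  */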
++
++#define FLAG_COPYOUT (1 << 0)
++
++static void
++delete_copyout (unsigned f, void *h, size_t s, const char *libfnname)
++{
++}
++
++void
++acc_delete (void *h, size_t s)
++{
++ delete_copyout (0, h, s, __FUNCTION__);
++}
++
++void
++acc_copyout (void *h, size_t s)
++{
++ delete_copyout (FLAG_COPYOUT, h, s, __FUNCTION__);
++}
++
++static void
++update_dev_host (int is_dev, void *h, size_t s)
++{
++ goacc_lazy_initialize ();
++}
++
++void
++acc_update_device (void *h, size_t s)
++{
++ update_dev_host (1, h, s);
++}
++
++void
++acc_update_self (void *h, size_t s)
++{
++ update_dev_host (0, h, s);
++}
++
++void
++gomp_acc_insert_pointer (size_t mapnum, void **hostaddrs, size_t *sizes,
++ void *kinds)
++{
++}
++
++void
++gomp_acc_remove_pointer (void *h, bool force_copyfrom, int async, int mapnum)
++{
++}
+--- libgomp/oacc-plugin.h.jj 2016-07-13 16:57:13.487423121 +0200
++++ libgomp/oacc-plugin.h 2016-07-13 16:57:13.487423121 +0200
+@@ -0,0 +1,33 @@
++/* Copyright (C) 2014-2016 Free Software Foundation, Inc.
++
++ Contributed by Mentor Embedded.
++
++ This file is part of the GNU Offloading and Multi Processing Library
++ (libgomp).
++
++ Libgomp is free software; you can redistribute it and/or modify it
++ under the terms of the GNU General Public License as published by
++ the Free Software Foundation; either version 3, or (at your option)
++ any later version.
++
++ Libgomp is distributed in the hope that it will be useful, but WITHOUT ANY
++ WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
++ FOR A PARTICULAR PURPOSE. See the GNU General Public License for
++ more details.
++
++ Under Section 7 of GPL version 3, you are granted additional
++ permissions described in the GCC Runtime Library Exception, version
++ 3.1, as published by the Free Software Foundation.
++
++ You should have received a copy of the GNU General Public License and
++ a copy of the GCC Runtime Library Exception along with this program;
++ see the files COPYING3 and COPYING.RUNTIME respectively. If not, see
++ <http://www.gnu.org/licenses/>. */
++
++#ifndef OACC_PLUGIN_H
++#define OACC_PLUGIN_H 1
++
++extern void GOMP_PLUGIN_async_unmap_vars (void *, int);
++extern void *GOMP_PLUGIN_acc_thread (void);
++
++#endif
+--- libgomp/taskloop.c.jj 2016-07-13 16:57:18.935355570 +0200
++++ libgomp/taskloop.c 2016-07-13 16:57:18.935355570 +0200
+@@ -0,0 +1,340 @@
++/* Copyright (C) 2015-2016 Free Software Foundation, Inc.
++ Contributed by Jakub Jelinek <jakub@redhat.com>.
++
++ This file is part of the GNU Offloading and Multi Processing Library
++ (libgomp).
++
++ Libgomp is free software; you can redistribute it and/or modify it
++ under the terms of the GNU General Public License as published by
++ the Free Software Foundation; either version 3, or (at your option)
++ any later version.
++
++ Libgomp is distributed in the hope that it will be useful, but WITHOUT ANY
++ WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
++ FOR A PARTICULAR PURPOSE. See the GNU General Public License for
++ more details.
++
++ Under Section 7 of GPL version 3, you are granted additional
++ permissions described in the GCC Runtime Library Exception, version
++ 3.1, as published by the Free Software Foundation.
++
++ You should have received a copy of the GNU General Public License and
++ a copy of the GCC Runtime Library Exception along with this program;
++ see the files COPYING3 and COPYING.RUNTIME respectively. If not, see
++ <http://www.gnu.org/licenses/>. */
++
++/* This file handles the taskloop construct. It is included twice, once
++ for the long and once for unsigned long long variant. */
++
++/* Called when encountering a taskloop construct. If IF_CLAUSE is
++   false, then we must not delay in executing the tasks. If UNTIED is
++   true, then the tasks may be executed by any member of the team. */
++
++void
++GOMP_taskloop (void (*fn) (void *), void *data, void (*cpyfn) (void *, void *),
++ long arg_size, long arg_align, unsigned flags,
++ unsigned long num_tasks, int priority,
++ TYPE start, TYPE end, TYPE step)
++{
++ struct gomp_thread *thr = gomp_thread ();
++ struct gomp_team *team = thr->ts.team;
++
++#ifdef HAVE_BROKEN_POSIX_SEMAPHORES
++ /* If pthread_mutex_* is used for omp_*lock*, then each task must be
++ tied to one thread all the time. This means UNTIED tasks must be
++ tied and if CPYFN is non-NULL IF(0) must be forced, as CPYFN
++ might be running on different thread than FN. */
++ if (cpyfn)
++ flags &= ~GOMP_TASK_FLAG_IF;
++ flags &= ~GOMP_TASK_FLAG_UNTIED;
++#endif
++
++ /* If parallel or taskgroup has been cancelled, don't start new tasks. */
++ if (team && gomp_team_barrier_cancelled (&team->barrier))
++ return;
++
++#ifdef TYPE_is_long
++ TYPE s = step;
++ if (step > 0)
++ {
++ if (start >= end)
++ return;
++ s--;
++ }
++ else
++ {
++ if (start <= end)
++ return;
++ s++;
++ }
++ UTYPE n = (end - start + s) / step;
++#else
++ UTYPE n;
++ if (flags & GOMP_TASK_FLAG_UP)
++ {
++ if (start >= end)
++ return;
++ n = (end - start + step - 1) / step;
++ }
++ else
++ {
++ if (start <= end)
++ return;
++ n = (start - end - step - 1) / -step;
++ }
++#endif
++
++ TYPE task_step = step;
++ unsigned long nfirst = n;
++ if (flags & GOMP_TASK_FLAG_GRAINSIZE)
++ {
++ unsigned long grainsize = num_tasks;
++#ifdef TYPE_is_long
++ num_tasks = n / grainsize;
++#else
++ UTYPE ndiv = n / grainsize;
++ num_tasks = ndiv;
++ if (num_tasks != ndiv)
++ num_tasks = ~0UL;
++#endif
++ if (num_tasks <= 1)
++ {
++ num_tasks = 1;
++ task_step = end - start;
++ }
++ else if (num_tasks >= grainsize
++#ifndef TYPE_is_long
++ && num_tasks != ~0UL
++#endif
++ )
++ {
++ UTYPE mul = num_tasks * grainsize;
++ task_step = (TYPE) grainsize * step;
++ if (mul != n)
++ {
++ task_step += step;
++ nfirst = n - mul - 1;
++ }
++ }
++ else
++ {
++ UTYPE div = n / num_tasks;
++ UTYPE mod = n % num_tasks;
++ task_step = (TYPE) div * step;
++ if (mod)
++ {
++ task_step += step;
++ nfirst = mod - 1;
++ }
++ }
++ }
++ else
++ {
++ if (num_tasks == 0)
++ num_tasks = team ? team->nthreads : 1;
++ if (num_tasks >= n)
++ num_tasks = n;
++ else
++ {
++ UTYPE div = n / num_tasks;
++ UTYPE mod = n % num_tasks;
++ task_step = (TYPE) div * step;
++ if (mod)
++ {
++ task_step += step;
++ nfirst = mod - 1;
++ }
++ }
++ }
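++
++  /* For example, without GOMP_TASK_FLAG_GRAINSIZE, n == 10 iterations
++     and num_tasks == 3 give div == 3 and mod == 1, so task_step becomes
++     4 * step and nfirst == 0: task 0 covers 4 iterations, and once
++     i == nfirst in the loops below, task_step drops back to 3 * step
++     for the remaining two tasks (4 + 3 + 3 == 10).  */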
++
++ if (flags & GOMP_TASK_FLAG_NOGROUP)
++ {
++ if (thr->task && thr->task->taskgroup && thr->task->taskgroup->cancelled)
++ return;
++ }
++ else
++ ialias_call (GOMP_taskgroup_start) ();
++
++ if (priority > gomp_max_task_priority_var)
++ priority = gomp_max_task_priority_var;
++
++ if ((flags & GOMP_TASK_FLAG_IF) == 0 || team == NULL
++ || (thr->task && thr->task->final_task)
++ || team->task_count + num_tasks > 64 * team->nthreads)
++ {
++ unsigned long i;
++ if (__builtin_expect (cpyfn != NULL, 0))
++ {
++ struct gomp_task task[num_tasks];
++ struct gomp_task *parent = thr->task;
++ arg_size = (arg_size + arg_align - 1) & ~(arg_align - 1);
++ char buf[num_tasks * arg_size + arg_align - 1];
++ char *arg = (char *) (((uintptr_t) buf + arg_align - 1)
++ & ~(uintptr_t) (arg_align - 1));
++ char *orig_arg = arg;
++ for (i = 0; i < num_tasks; i++)
++ {
++ gomp_init_task (&task[i], parent, gomp_icv (false));
++ task[i].priority = priority;
++ task[i].kind = GOMP_TASK_UNDEFERRED;
++ task[i].final_task = (thr->task && thr->task->final_task)
++ || (flags & GOMP_TASK_FLAG_FINAL);
++ if (thr->task)
++ {
++ task[i].in_tied_task = thr->task->in_tied_task;
++ task[i].taskgroup = thr->task->taskgroup;
++ }
++ thr->task = &task[i];
++ cpyfn (arg, data);
++ arg += arg_size;
++ }
++ arg = orig_arg;
++ for (i = 0; i < num_tasks; i++)
++ {
++ thr->task = &task[i];
++ ((TYPE *)arg)[0] = start;
++ start += task_step;
++ ((TYPE *)arg)[1] = start;
++ if (i == nfirst)
++ task_step -= step;
++ fn (arg);
++ arg += arg_size;
++ if (!priority_queue_empty_p (&task[i].children_queue,
++ MEMMODEL_RELAXED))
++ {
++ gomp_mutex_lock (&team->task_lock);
++ gomp_clear_parent (&task[i].children_queue);
++ gomp_mutex_unlock (&team->task_lock);
++ }
++ gomp_end_task ();
++ }
++ }
++ else
++ for (i = 0; i < num_tasks; i++)
++ {
++ struct gomp_task task;
++
++ gomp_init_task (&task, thr->task, gomp_icv (false));
++ task.priority = priority;
++ task.kind = GOMP_TASK_UNDEFERRED;
++ task.final_task = (thr->task && thr->task->final_task)
++ || (flags & GOMP_TASK_FLAG_FINAL);
++ if (thr->task)
++ {
++ task.in_tied_task = thr->task->in_tied_task;
++ task.taskgroup = thr->task->taskgroup;
++ }
++ thr->task = &task;
++ ((TYPE *)data)[0] = start;
++ start += task_step;
++ ((TYPE *)data)[1] = start;
++ if (i == nfirst)
++ task_step -= step;
++ fn (data);
++ if (!priority_queue_empty_p (&task.children_queue,
++ MEMMODEL_RELAXED))
++ {
++ gomp_mutex_lock (&team->task_lock);
++ gomp_clear_parent (&task.children_queue);
++ gomp_mutex_unlock (&team->task_lock);
++ }
++ gomp_end_task ();
++ }
++ }
++ else
++ {
++ struct gomp_task *tasks[num_tasks];
++ struct gomp_task *parent = thr->task;
++ struct gomp_taskgroup *taskgroup = parent->taskgroup;
++ char *arg;
++ int do_wake;
++ unsigned long i;
++
++ for (i = 0; i < num_tasks; i++)
++ {
++ struct gomp_task *task
++ = gomp_malloc (sizeof (*task) + arg_size + arg_align - 1);
++ tasks[i] = task;
++ arg = (char *) (((uintptr_t) (task + 1) + arg_align - 1)
++ & ~(uintptr_t) (arg_align - 1));
++ gomp_init_task (task, parent, gomp_icv (false));
++ task->priority = priority;
++ task->kind = GOMP_TASK_UNDEFERRED;
++ task->in_tied_task = parent->in_tied_task;
++ task->taskgroup = taskgroup;
++ thr->task = task;
++ if (cpyfn)
++ {
++ cpyfn (arg, data);
++ task->copy_ctors_done = true;
++ }
++ else
++ memcpy (arg, data, arg_size);
++ ((TYPE *)arg)[0] = start;
++ start += task_step;
++ ((TYPE *)arg)[1] = start;
++ if (i == nfirst)
++ task_step -= step;
++ thr->task = parent;
++ task->kind = GOMP_TASK_WAITING;
++ task->fn = fn;
++ task->fn_data = arg;
++ task->final_task = (flags & GOMP_TASK_FLAG_FINAL) >> 1;
++ }
++ gomp_mutex_lock (&team->task_lock);
++ /* If parallel or taskgroup has been cancelled, don't start new
++ tasks. */
++ if (__builtin_expect ((gomp_team_barrier_cancelled (&team->barrier)
++ || (taskgroup && taskgroup->cancelled))
++ && cpyfn == NULL, 0))
++ {
++ gomp_mutex_unlock (&team->task_lock);
++ for (i = 0; i < num_tasks; i++)
++ {
++ gomp_finish_task (tasks[i]);
++ free (tasks[i]);
++ }
++ if ((flags & GOMP_TASK_FLAG_NOGROUP) == 0)
++ ialias_call (GOMP_taskgroup_end) ();
++ return;
++ }
++ if (taskgroup)
++ taskgroup->num_children += num_tasks;
++ for (i = 0; i < num_tasks; i++)
++ {
++ struct gomp_task *task = tasks[i];
++ priority_queue_insert (PQ_CHILDREN, &parent->children_queue,
++ task, priority,
++ PRIORITY_INSERT_BEGIN,
++ /*last_parent_depends_on=*/false,
++ task->parent_depends_on);
++ if (taskgroup)
++ priority_queue_insert (PQ_TASKGROUP, &taskgroup->taskgroup_queue,
++ task, priority, PRIORITY_INSERT_BEGIN,
++ /*last_parent_depends_on=*/false,
++ task->parent_depends_on);
++ priority_queue_insert (PQ_TEAM, &team->task_queue, task, priority,
++ PRIORITY_INSERT_END,
++ /*last_parent_depends_on=*/false,
++ task->parent_depends_on);
++ ++team->task_count;
++ ++team->task_queued_count;
++ }
++ gomp_team_barrier_set_task_pending (&team->barrier);
++ if (team->task_running_count + !parent->in_tied_task
++ < team->nthreads)
++ {
++ do_wake = team->nthreads - team->task_running_count
++ - !parent->in_tied_task;
++ if ((unsigned long) do_wake > num_tasks)
++ do_wake = num_tasks;
++ }
++ else
++ do_wake = 0;
++ gomp_mutex_unlock (&team->task_lock);
++ if (do_wake)
++ gomp_team_barrier_wake (&team->barrier, do_wake);
++ }
++ if ((flags & GOMP_TASK_FLAG_NOGROUP) == 0)
++ ialias_call (GOMP_taskgroup_end) ();
++}
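++
++/* A sketch of the contract with the compiler-generated outlined
++   function (the names FN and LOOP_BODY below are hypothetical): the
++   first two TYPE slots of the argument block carry the half-open
++   iteration range stored above, so for the long variant FN is shaped
++   roughly like
++
++     void fn (void *data)
++     {
++       long start = ((long *) data)[0];
++       long end = ((long *) data)[1];
++       for (long i = start; i < end; i += step)  // i > end counting down
++         loop_body (i);
++     }
++
++   with STEP and LOOP_BODY taken from the original taskloop construct.  */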
+--- libgomp/priority_queue.h.jj 2016-07-13 16:57:04.438535323 +0200
++++ libgomp/priority_queue.h 2016-07-13 16:57:04.438535323 +0200
+@@ -0,0 +1,485 @@
++/* Copyright (C) 2015-2016 Free Software Foundation, Inc.
++ Contributed by Aldy Hernandez <aldyh@redhat.com>.
++
++ This file is part of the GNU Offloading and Multi Processing Library
++ (libgomp).
++
++ Libgomp is free software; you can redistribute it and/or modify it
++ under the terms of the GNU General Public License as published by
++ the Free Software Foundation; either version 3, or (at your option)
++ any later version.
++
++ Libgomp is distributed in the hope that it will be useful, but WITHOUT ANY
++ WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
++ FOR A PARTICULAR PURPOSE. See the GNU General Public License for
++ more details.
++
++ Under Section 7 of GPL version 3, you are granted additional
++ permissions described in the GCC Runtime Library Exception, version
++ 3.1, as published by the Free Software Foundation.
++
++ You should have received a copy of the GNU General Public License and
++ a copy of the GCC Runtime Library Exception along with this program;
++ see the files COPYING3 and COPYING.RUNTIME respectively. If not, see
++ <http://www.gnu.org/licenses/>. */
++
++/* Header file for a priority queue of GOMP tasks. */
++
++/* ?? Perhaps all the priority_tree_* functions are complex and rare
++ enough to go out-of-line and be moved to priority_queue.c. ?? */
++
++#ifndef _PRIORITY_QUEUE_H_
++#define _PRIORITY_QUEUE_H_
++
++/* One task. */
++
++struct priority_node
++{
++ /* Next and previous chains in a circular doubly linked list for
++ tasks within this task's priority. */
++ struct priority_node *next, *prev;
++};
++
++/* All tasks within the same priority. */
++
++struct priority_list
++{
++ /* Priority of the tasks in this set. */
++ int priority;
++
++ /* Tasks. */
++ struct priority_node *tasks;
++
++ /* This points to the last of the higher priority WAITING tasks.
++ Remember that for the children queue, we have:
++
++ parent_depends_on WAITING tasks.
++ !parent_depends_on WAITING tasks.
++ TIED tasks.
++
++ This is a pointer to the last of the parent_depends_on WAITING
++     tasks, which are essentially higher priority items within their
++ priority. */
++ struct priority_node *last_parent_depends_on;
++};
++
++/* Another splay tree instantiation, for priority_list's. */
++typedef struct prio_splay_tree_node_s *prio_splay_tree_node;
++typedef struct prio_splay_tree_s *prio_splay_tree;
++typedef struct prio_splay_tree_key_s *prio_splay_tree_key;
++struct prio_splay_tree_key_s {
++  /* This structure must only contain a priority_list, as we cast
++ prio_splay_tree_key to priority_list throughout. */
++ struct priority_list l;
++};
++#define splay_tree_prefix prio
++#include "splay-tree.h"
++
++/* The entry point into a priority queue of tasks.
++
++ There are two alternate implementations with which to store tasks:
++ as a balanced tree of sorts, or as a simple list of tasks. If
++ there are only priority-0 items (ROOT is NULL), we use the simple
++ list, otherwise (ROOT is non-NULL) we use the tree. */
++
++struct priority_queue
++{
++ /* If t.root != NULL, this is a splay tree of priority_lists to hold
++ all tasks. This is only used if multiple priorities are in play,
++ otherwise we use the priority_list `l' below to hold all
++ (priority-0) tasks. */
++ struct prio_splay_tree_s t;
++
++ /* If T above is NULL, only priority-0 items exist, so keep them
++ in a simple list. */
++ struct priority_list l;
++};
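++
++/* For example, while only priority-0 tasks exist, t.root stays NULL and
++   tasks are chained directly on the list L; the first insertion of a
++   task with priority > 0 migrates L into the splay tree as its
++   priority-0 node (see priority_tree_insert below).  */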
++
++enum priority_insert_type {
++ /* Insert at the beginning of a priority list. */
++ PRIORITY_INSERT_BEGIN,
++ /* Insert at the end of a priority list. */
++ PRIORITY_INSERT_END
++};
++
++/* Used to determine which queue a given priority node belongs in.
++ See pnode field of gomp_task. */
++
++enum priority_queue_type
++{
++ PQ_TEAM, /* Node belongs in gomp_team's task_queue. */
++ PQ_CHILDREN, /* Node belongs in parent's children_queue. */
++ PQ_TASKGROUP, /* Node belongs in taskgroup->taskgroup_queue. */
++ PQ_IGNORED = 999
++};
++
++/* Priority queue implementation prototypes. */
++
++extern bool priority_queue_task_in_queue_p (enum priority_queue_type,
++ struct priority_queue *,
++ struct gomp_task *);
++extern void priority_queue_dump (enum priority_queue_type,
++ struct priority_queue *);
++extern void priority_queue_verify (enum priority_queue_type,
++ struct priority_queue *, bool);
++extern void priority_tree_remove (enum priority_queue_type,
++ struct priority_queue *,
++ struct priority_node *);
++extern struct gomp_task *priority_tree_next_task (enum priority_queue_type,
++ struct priority_queue *,
++ enum priority_queue_type,
++ struct priority_queue *,
++ bool *);
++
++/* Return TRUE if there is more than one priority in HEAD. This is
++   used throughout to choose between the fast path (priority 0 only
++ items) and a world with multiple priorities. */
++
++static inline bool
++priority_queue_multi_p (struct priority_queue *head)
++{
++ return __builtin_expect (head->t.root != NULL, 0);
++}
++
++/* Initialize a priority queue. */
++
++static inline void
++priority_queue_init (struct priority_queue *head)
++{
++ head->t.root = NULL;
++ /* To save a few microseconds, we don't initialize head->l.priority
++ to 0 here. It is implied that priority will be 0 if head->t.root
++ == NULL.
++
++ priority_tree_insert() will fix this when we encounter multiple
++ priorities. */
++ head->l.tasks = NULL;
++ head->l.last_parent_depends_on = NULL;
++}
++
++static inline void
++priority_queue_free (struct priority_queue *head)
++{
++ /* There's nothing to do, as tasks were freed as they were removed
++ in priority_queue_remove. */
++}
++
++/* Forward declarations. */
++static inline size_t priority_queue_offset (enum priority_queue_type);
++static inline struct gomp_task *priority_node_to_task
++ (enum priority_queue_type,
++ struct priority_node *);
++static inline struct priority_node *task_to_priority_node
++ (enum priority_queue_type,
++ struct gomp_task *);
++
++/* Return TRUE if priority queue HEAD is empty.
++
++   MODEL is MEMMODEL_ACQUIRE if we should use an acquire atomic to
++ read from the root of the queue, otherwise MEMMODEL_RELAXED if we
++ should use a plain load. */
++
++static inline _Bool
++priority_queue_empty_p (struct priority_queue *head, enum memmodel model)
++{
++ /* Note: The acquire barriers on the loads here synchronize with
++ the write of a NULL in gomp_task_run_post_remove_parent. It is
++ not necessary that we synchronize with other non-NULL writes at
++ this point, but we must ensure that all writes to memory by a
++ child thread task work function are seen before we exit from
++ GOMP_taskwait. */
++ if (priority_queue_multi_p (head))
++ {
++ if (model == MEMMODEL_ACQUIRE)
++ return __atomic_load_n (&head->t.root, MEMMODEL_ACQUIRE) == NULL;
++ return head->t.root == NULL;
++ }
++ if (model == MEMMODEL_ACQUIRE)
++ return __atomic_load_n (&head->l.tasks, MEMMODEL_ACQUIRE) == NULL;
++ return head->l.tasks == NULL;
++}
++
++/* Look for a given PRIORITY in HEAD. Return it if found, otherwise
++ return NULL. This only applies to the tree variant in HEAD. There
++ is no point in searching for priorities in HEAD->L. */
++
++static inline struct priority_list *
++priority_queue_lookup_priority (struct priority_queue *head, int priority)
++{
++ if (head->t.root == NULL)
++ return NULL;
++ struct prio_splay_tree_key_s k;
++ k.l.priority = priority;
++ return (struct priority_list *)
++ prio_splay_tree_lookup (&head->t, &k);
++}
++
++/* Insert task in DATA, with PRIORITY, in the priority list in LIST.
++ LIST contains items of type TYPE.
++
++ If POS is PRIORITY_INSERT_BEGIN, the new task is inserted at the
++ top of its respective priority. If POS is PRIORITY_INSERT_END, the
++ task is inserted at the end of its priority.
++
++ If ADJUST_PARENT_DEPENDS_ON is TRUE, LIST is a children queue, and
++ we must keep track of higher and lower priority WAITING tasks by
++ keeping the queue's last_parent_depends_on field accurate. This
++ only applies to the children queue, and the caller must ensure LIST
++ is a children queue in this case.
++
++ If ADJUST_PARENT_DEPENDS_ON is TRUE, TASK_IS_PARENT_DEPENDS_ON is
++ set to the task's parent_depends_on field. If
++   ADJUST_PARENT_DEPENDS_ON is FALSE, this field is irrelevant. */
++
++static inline void
++priority_list_insert (enum priority_queue_type type,
++ struct priority_list *list,
++ struct gomp_task *task,
++ int priority,
++ enum priority_insert_type pos,
++ bool adjust_parent_depends_on,
++ bool task_is_parent_depends_on)
++{
++ struct priority_node *node = task_to_priority_node (type, task);
++ if (list->tasks)
++ {
++ /* If we are keeping track of higher/lower priority items,
++ but this is a lower priority WAITING task
++ (parent_depends_on != NULL), put it after all ready to
++ run tasks. See the comment in
++ priority_queue_upgrade_task for a visual on how tasks
++ should be organized. */
++ if (adjust_parent_depends_on
++ && pos == PRIORITY_INSERT_BEGIN
++ && list->last_parent_depends_on
++ && !task_is_parent_depends_on)
++ {
++ struct priority_node *last_parent_depends_on
++ = list->last_parent_depends_on;
++ node->next = last_parent_depends_on->next;
++ node->prev = last_parent_depends_on;
++ }
++ /* Otherwise, put it at the top/bottom of the queue. */
++ else
++ {
++ node->next = list->tasks;
++ node->prev = list->tasks->prev;
++ if (pos == PRIORITY_INSERT_BEGIN)
++ list->tasks = node;
++ }
++ node->next->prev = node;
++ node->prev->next = node;
++ }
++ else
++ {
++ node->next = node;
++ node->prev = node;
++ list->tasks = node;
++ }
++ if (adjust_parent_depends_on
++ && list->last_parent_depends_on == NULL
++ && task_is_parent_depends_on)
++ list->last_parent_depends_on = node;
++}
++
++/* Tree version of priority_list_insert. */
++
++static inline void
++priority_tree_insert (enum priority_queue_type type,
++ struct priority_queue *head,
++ struct gomp_task *task,
++ int priority,
++ enum priority_insert_type pos,
++ bool adjust_parent_depends_on,
++ bool task_is_parent_depends_on)
++{
++ if (__builtin_expect (head->t.root == NULL, 0))
++ {
++ /* The first time around, transfer any priority 0 items to the
++ tree. */
++ if (head->l.tasks != NULL)
++ {
++ prio_splay_tree_node k = gomp_malloc (sizeof (*k));
++ k->left = NULL;
++ k->right = NULL;
++ k->key.l.priority = 0;
++ k->key.l.tasks = head->l.tasks;
++ k->key.l.last_parent_depends_on = head->l.last_parent_depends_on;
++ prio_splay_tree_insert (&head->t, k);
++ head->l.tasks = NULL;
++ }
++ }
++ struct priority_list *list
++ = priority_queue_lookup_priority (head, priority);
++ if (!list)
++ {
++ prio_splay_tree_node k = gomp_malloc (sizeof (*k));
++ k->left = NULL;
++ k->right = NULL;
++ k->key.l.priority = priority;
++ k->key.l.tasks = NULL;
++ k->key.l.last_parent_depends_on = NULL;
++ prio_splay_tree_insert (&head->t, k);
++ list = &k->key.l;
++ }
++ priority_list_insert (type, list, task, priority, pos,
++ adjust_parent_depends_on,
++ task_is_parent_depends_on);
++}
++
++/* Generic version of priority_*_insert. */
++
++static inline void
++priority_queue_insert (enum priority_queue_type type,
++ struct priority_queue *head,
++ struct gomp_task *task,
++ int priority,
++ enum priority_insert_type pos,
++ bool adjust_parent_depends_on,
++ bool task_is_parent_depends_on)
++{
++#if _LIBGOMP_CHECKING_
++ if (priority_queue_task_in_queue_p (type, head, task))
++ gomp_fatal ("Attempt to insert existing task %p", task);
++#endif
++ if (priority_queue_multi_p (head) || __builtin_expect (priority > 0, 0))
++ priority_tree_insert (type, head, task, priority, pos,
++ adjust_parent_depends_on,
++ task_is_parent_depends_on);
++ else
++ priority_list_insert (type, &head->l, task, priority, pos,
++ adjust_parent_depends_on,
++ task_is_parent_depends_on);
++}
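++
++/* For example, taskloop.c queues each newly created task on its
++   parent's children queue with
++
++     priority_queue_insert (PQ_CHILDREN, &parent->children_queue,
++                            task, priority, PRIORITY_INSERT_BEGIN,
++                            false, task->parent_depends_on);
++
++   and likewise on the taskgroup and team queues.  */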
++
++/* If multiple priorities are in play, return the highest priority
++ task from within Q1 and Q2, while giving preference to tasks from
++ Q1. If the returned task is chosen from Q1, *Q1_CHOSEN_P is set to
++ TRUE, otherwise it is set to FALSE.
++
++ If multiple priorities are not in play (only 0 priorities are
++ available), the next task is chosen exclusively from Q1.
++
++ As a special case, Q2 can be NULL, in which case, we just choose
++ the highest priority WAITING task in Q1. This is an optimization
++ to speed up looking through only one queue.
++
++ We assume Q1 has at least one item. */
++
++static inline struct gomp_task *
++priority_queue_next_task (enum priority_queue_type t1,
++ struct priority_queue *q1,
++ enum priority_queue_type t2,
++ struct priority_queue *q2,
++ bool *q1_chosen_p)
++{
++#if _LIBGOMP_CHECKING_
++ if (priority_queue_empty_p (q1, MEMMODEL_RELAXED))
++ gomp_fatal ("priority_queue_next_task: Q1 is empty");
++#endif
++ if (priority_queue_multi_p (q1))
++ {
++ struct gomp_task *t
++ = priority_tree_next_task (t1, q1, t2, q2, q1_chosen_p);
++ /* If T is NULL, there are no WAITING tasks in Q1. In that
++ case, return any old (non-waiting) task, which will cause the
++ caller to do the right thing when checking T->KIND ==
++ GOMP_TASK_WAITING. */
++ if (!t)
++ {
++#if _LIBGOMP_CHECKING_
++ if (*q1_chosen_p == false)
++ gomp_fatal ("priority_queue_next_task inconsistency");
++#endif
++ return priority_node_to_task (t1, q1->t.root->key.l.tasks);
++ }
++ return t;
++ }
++ else
++ {
++ *q1_chosen_p = true;
++ return priority_node_to_task (t1, q1->l.tasks);
++ }
++}
++
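++/* The preference rules above, reduced to a decision table (a minimal
++   sketch; T1/T2 stand for the best WAITING tasks found in Q1/Q2,
++   either may be NULL, and the parent_depends_on tie-break is left
++   out):
++
++     #include <stdbool.h>
++     #include <stddef.h>
++
++     struct task { int priority; };
++
++     // Prefer T1; T2 only wins with a strictly higher priority.
++     static struct task *
++     pick (struct task *t1, struct task *t2, bool *q1_chosen_p)
++     {
++       if (t1 == NULL)  // no WAITING task in Q1: do not bother with Q2
++         {
++           *q1_chosen_p = true;
++           return NULL;
++         }
++       if (t2 == NULL || t1->priority >= t2->priority)
++         {
++           *q1_chosen_p = true;
++           return t1;
++         }
++       *q1_chosen_p = false;
++       return t2;
++     }  */
++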
++/* Remove NODE from LIST.
++
++ If we are removing the one and only item in the list, and MODEL is
++ MEMMODEL_RELEASE, use an atomic release to clear the list.
++
++ If the list becomes empty after the remove, return TRUE. */
++
++static inline bool
++priority_list_remove (struct priority_list *list,
++ struct priority_node *node,
++ enum memmodel model)
++{
++ bool empty = false;
++ node->prev->next = node->next;
++ node->next->prev = node->prev;
++ if (list->tasks == node)
++ {
++ if (node->next != node)
++ list->tasks = node->next;
++ else
++ {
++ /* We access task->children in GOMP_taskwait outside of
++ the task lock mutex region, so we need a release barrier
++ here to ensure memory written by child_task->fn above
++ is flushed before the NULL is written. */
++ if (model == MEMMODEL_RELEASE)
++ __atomic_store_n (&list->tasks, NULL, MEMMODEL_RELEASE);
++ else
++ list->tasks = NULL;
++ empty = true;
++ goto remove_out;
++ }
++ }
++remove_out:
++#if _LIBGOMP_CHECKING_
++ memset (node, 0xaf, sizeof (*node));
++#endif
++ return empty;
++}
++
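++/* A note on the MEMMODEL_RELEASE path above: queue emptiness is
++   sometimes observed without holding the task lock, so clearing the
++   last entry must also publish the child task's prior writes.  The
++   pairing in isolation (a minimal sketch using the GCC __atomic
++   builtins, whose memory-order constants the MEMMODEL_* values
++   mirror):
++
++     // Producer: write results, then clear the slot with release
++     // semantics so those writes are visible before the NULL is.
++     static void
++     publish_done (void **slot)
++     {
++       __atomic_store_n (slot, (void *) 0, __ATOMIC_RELEASE);
++     }
++
++     // Consumer: an acquire load that observes NULL is guaranteed
++     // to also observe everything written before the release store.
++     static void *
++     observe (void **slot)
++     {
++       return __atomic_load_n (slot, __ATOMIC_ACQUIRE);
++     }  */
++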
++/* This is the generic version of priority_list_remove.
++
++ Remove NODE from priority queue HEAD. HEAD contains tasks of type TYPE.
++
++ If we are removing the one and only item in the priority queue and
++ MODEL is MEMMODEL_RELEASE, use an atomic release to clear the queue.
++
++ If the queue becomes empty after the remove, return TRUE. */
++
++static inline bool
++priority_queue_remove (enum priority_queue_type type,
++ struct priority_queue *head,
++ struct gomp_task *task,
++ enum memmodel model)
++{
++#if _LIBGOMP_CHECKING_
++ if (!priority_queue_task_in_queue_p (type, head, task))
++ gomp_fatal ("Attempt to remove missing task %p", task);
++#endif
++ if (priority_queue_multi_p (head))
++ {
++ priority_tree_remove (type, head, task_to_priority_node (type, task));
++ if (head->t.root == NULL)
++ {
++ if (model == MEMMODEL_RELEASE)
++ /* Note that we store NULL twice; the alternative would be to
++ use an atomic release directly in the splay tree
++ routines. Worth it? */
++ __atomic_store_n (&head->t.root, NULL, MEMMODEL_RELEASE);
++ return true;
++ }
++ return false;
++ }
++ else
++ return priority_list_remove (&head->l,
++ task_to_priority_node (type, task), model);
++}
++
++#endif /* _PRIORITY_QUEUE_H_ */
+--- libgomp/priority_queue.c.jj 2016-07-13 16:57:04.435535360 +0200
++++ libgomp/priority_queue.c 2016-07-13 16:57:04.435535360 +0200
+@@ -0,0 +1,300 @@
++/* Copyright (C) 2015-2016 Free Software Foundation, Inc.
++ Contributed by Aldy Hernandez <aldyh@redhat.com>.
++
++ This file is part of the GNU Offloading and Multi Processing Library
++ (libgomp).
++
++ Libgomp is free software; you can redistribute it and/or modify it
++ under the terms of the GNU General Public License as published by
++ the Free Software Foundation; either version 3, or (at your option)
++ any later version.
++
++ Libgomp is distributed in the hope that it will be useful, but WITHOUT ANY
++ WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
++ FOR A PARTICULAR PURPOSE. See the GNU General Public License for
++ more details.
++
++ Under Section 7 of GPL version 3, you are granted additional
++ permissions described in the GCC Runtime Library Exception, version
++ 3.1, as published by the Free Software Foundation.
++
++ You should have received a copy of the GNU General Public License and
++ a copy of the GCC Runtime Library Exception along with this program;
++ see the files COPYING3 and COPYING.RUNTIME respectively. If not, see
++ <http://www.gnu.org/licenses/>. */
++
++/* Priority queue implementation of GOMP tasks. */
++
++#include "libgomp.h"
++
++#if _LIBGOMP_CHECKING_
++#include <stdio.h>
++
++/* Sanity check to verify whether a TASK is in LIST. Return TRUE if
++ found, FALSE otherwise.
++
++ TYPE is the type of priority queue this task resides in. */
++
++static inline bool
++priority_queue_task_in_list_p (enum priority_queue_type type,
++ struct priority_list *list,
++ struct gomp_task *task)
++{
++ struct priority_node *p = list->tasks;
++ do
++ {
++ if (priority_node_to_task (type, p) == task)
++ return true;
++ p = p->next;
++ }
++ while (p != list->tasks);
++ return false;
++}
++
++/* Tree version of priority_queue_task_in_list_p. */
++
++static inline bool
++priority_queue_task_in_tree_p (enum priority_queue_type type,
++ struct priority_queue *head,
++ struct gomp_task *task)
++{
++ struct priority_list *list
++ = priority_queue_lookup_priority (head, task->priority);
++ if (!list)
++ return false;
++ return priority_queue_task_in_list_p (type, list, task);
++}
++
++/* Generic version of priority_queue_task_in_list_p that works for
++ trees or lists. */
++
++bool
++priority_queue_task_in_queue_p (enum priority_queue_type type,
++ struct priority_queue *head,
++ struct gomp_task *task)
++{
++ if (priority_queue_empty_p (head, MEMMODEL_RELAXED))
++ return false;
++ if (priority_queue_multi_p (head))
++ return priority_queue_task_in_tree_p (type, head, task);
++ else
++ return priority_queue_task_in_list_p (type, &head->l, task);
++}
++
++/* Sanity check LIST to make sure the tasks therein are in the right
++ order. LIST is a priority list of type TYPE.
++
++ The expected order is that GOMP_TASK_WAITING tasks come before
++ GOMP_TASK_TIED/GOMP_TASK_ASYNC_RUNNING ones.
++
++ If CHECK_DEPS is TRUE, we also check that parent_depends_on WAITING
++ tasks come before !parent_depends_on WAITING tasks. This is only
++ applicable to the children queue, and the caller is expected to
++ ensure that we are verifying the children queue. */
++
++static void
++priority_list_verify (enum priority_queue_type type,
++ struct priority_list *list, bool check_deps)
++{
++ bool seen_tied = false;
++ bool seen_plain_waiting = false;
++ struct priority_node *p = list->tasks;
++ while (1)
++ {
++ struct gomp_task *t = priority_node_to_task (type, p);
++ if (seen_tied && t->kind == GOMP_TASK_WAITING)
++ gomp_fatal ("priority_queue_verify: WAITING task after TIED");
++ if (t->kind >= GOMP_TASK_TIED)
++ seen_tied = true;
++ else if (check_deps && t->kind == GOMP_TASK_WAITING)
++ {
++ if (t->parent_depends_on)
++ {
++ if (seen_plain_waiting)
++ gomp_fatal ("priority_queue_verify: "
++ "parent_depends_on after !parent_depends_on");
++ }
++ else
++ seen_plain_waiting = true;
++ }
++ p = p->next;
++ if (p == list->tasks)
++ break;
++ }
++}
++
++/* Callback type for priority_tree_verify_callback. */
++struct cbtype
++{
++ enum priority_queue_type type;
++ bool check_deps;
++};
++
++/* Verify every task in NODE.
++
++ Callback for prio_splay_tree_foreach. */
++
++static void
++priority_tree_verify_callback (prio_splay_tree_key key, void *data)
++{
++ struct cbtype *cb = (struct cbtype *) data;
++ priority_list_verify (cb->type, &key->l, cb->check_deps);
++}
++
++/* Generic version of priority_list_verify.
++
++ Sanity check HEAD to make sure the tasks therein are in the right
++ order. The priority_queue holds tasks of type TYPE.
++
++ If CHECK_DEPS is TRUE, we also check that parent_depends_on WAITING
++ tasks come before !parent_depends_on WAITING tasks. This is only
++ applicable to the children queue, and the caller is expected to
++ ensure that we are verifying the children queue. */
++
++void
++priority_queue_verify (enum priority_queue_type type,
++ struct priority_queue *head, bool check_deps)
++{
++ if (priority_queue_empty_p (head, MEMMODEL_RELAXED))
++ return;
++ if (priority_queue_multi_p (head))
++ {
++ struct cbtype cb = { type, check_deps };
++ prio_splay_tree_foreach (&head->t,
++ priority_tree_verify_callback, &cb);
++ }
++ else
++ priority_list_verify (type, &head->l, check_deps);
++}
++#endif /* _LIBGOMP_CHECKING_ */
++
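++/* Stated directly, the invariant checked above partitions each
++   priority bucket's circular list into three consecutive runs:
++
++     [WAITING, parent_depends_on] [WAITING] [TIED / ASYNC_RUNNING]
++
++   so scanners may stop at the first non-WAITING task, and
++   last_parent_depends_on marks the end of the first run.  */
++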
++/* Remove NODE from priority queue HEAD, wherever it may be inside the
++ tree. HEAD contains tasks of type TYPE. */
++
++void
++priority_tree_remove (enum priority_queue_type type,
++ struct priority_queue *head,
++ struct priority_node *node)
++{
++ /* ?? The only reason this function is not inlined is that we
++ need to find the priority within gomp_task (which has not been
++ completely defined in the header file). If the lack of inlining
++ is a concern, we could pass the priority number as a
++ parameter, or we could move this to libgomp.h. */
++ int priority = priority_node_to_task (type, node)->priority;
++
++ /* ?? We could avoid this lookup by keeping a pointer to the key in
++ the priority_node. */
++ struct priority_list *list
++ = priority_queue_lookup_priority (head, priority);
++#if _LIBGOMP_CHECKING_
++ if (!list)
++ gomp_fatal ("Unable to find priority %d", priority);
++#endif
++ /* If NODE was the last task at its priority, remove and free that priority's list. */
++ if (priority_list_remove (list, node, MEMMODEL_RELAXED))
++ {
++ prio_splay_tree_remove (&head->t, (prio_splay_tree_key) list);
++ list->tasks = NULL;
++#if _LIBGOMP_CHECKING_
++ memset (list, 0xaf, sizeof (*list));
++#endif
++ free (list);
++ }
++}
++
++/* Return the highest priority WAITING task in a splay tree NODE. If
++ there are no WAITING tasks available, return NULL.
++
++ NODE is a priority list containing tasks of type TYPE.
++
++ The rightmost node in a tree contains the highest priority.
++ Recurse down to find such a node. If the task at that max node is
++ not WAITING, bubble back up and look at the remaining tasks
++ in-order. */
++
++static struct gomp_task *
++priority_tree_next_task_1 (enum priority_queue_type type,
++ prio_splay_tree_node node)
++{
++ again:
++ if (!node)
++ return NULL;
++ struct gomp_task *ret = priority_tree_next_task_1 (type, node->right);
++ if (ret)
++ return ret;
++ ret = priority_node_to_task (type, node->key.l.tasks);
++ if (ret->kind == GOMP_TASK_WAITING)
++ return ret;
++ node = node->left;
++ goto again;
++}
++
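++/* The recursion above is a pruned reverse in-order walk: visit the
++   rightmost (highest-priority) bucket first, test only that bucket's
++   head task (WAITING tasks sort before others within a bucket), and
++   stop at the first hit.  The same shape on a plain binary search
++   tree (a minimal sketch with a hypothetical predicate):
++
++     struct bst { struct bst *left, *right; int key; };
++
++     // Return the largest key satisfying PRED, or -1 if none.
++     static int
++     max_matching (struct bst *n, int (*pred) (int))
++     {
++       while (n)
++         {
++           int r = max_matching (n->right, pred);
++           if (r >= 0)
++             return r;
++           if (pred (n->key))
++             return n->key;
++           n = n->left;  // iterate rather than tail-recurse
++         }
++       return -1;
++     }  */
++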
++/* Return the highest priority WAITING task from within Q1 and Q2,
++ while giving preference to tasks from Q1. Q1 is a queue containing
++ items of type TYPE1. Q2 is a queue containing items of type TYPE2.
++
++ Since we are mostly interested in Q1, if there are no WAITING tasks
++ in Q1, we don't bother checking Q2, and just return NULL.
++
++ As a special case, Q2 can be NULL, in which case we just choose
++ the highest priority WAITING task in Q1. This is an optimization
++ to speed up looking through only one queue.
++
++ If the returned task is chosen from Q1, *Q1_CHOSEN_P is set to
++ TRUE, otherwise it is set to FALSE. */
++
++struct gomp_task *
++priority_tree_next_task (enum priority_queue_type type1,
++ struct priority_queue *q1,
++ enum priority_queue_type type2,
++ struct priority_queue *q2,
++ bool *q1_chosen_p)
++{
++ struct gomp_task *t1 = priority_tree_next_task_1 (type1, q1->t.root);
++ if (!t1
++ /* Special optimization when only searching through one queue. */
++ || !q2)
++ {
++ *q1_chosen_p = true;
++ return t1;
++ }
++ struct gomp_task *t2 = priority_tree_next_task_1 (type2, q2->t.root);
++ if (!t2 || t1->priority > t2->priority)
++ {
++ *q1_chosen_p = true;
++ return t1;
++ }
++ if (t2->priority > t1->priority)
++ {
++ *q1_chosen_p = false;
++ return t2;
++ }
++ /* If we get here, the priorities are the same, so we must look at
++ parent_depends_on to make our decision. */
++#if _LIBGOMP_CHECKING_
++ if (t1 != t2)
++ gomp_fatal ("priority_tree_next_task: t1 != t2");
++#endif
++ if (t2->parent_depends_on && !t1->parent_depends_on)
++ {
++ *q1_chosen_p = false;
++ return t2;
++ }
++ *q1_chosen_p = true;
++ return t1;
++}
++
++/* Comparison function for priority splay trees. */
++static inline int
++prio_splay_compare (prio_splay_tree_key x, prio_splay_tree_key y)
++{
++ if (x->l.priority == y->l.priority)
++ return 0;
++ return x->l.priority < y->l.priority ? -1 : 1;
++}
++
++/* Define another splay tree instantiation, for priority_list structures. */
++#define splay_tree_prefix prio
++#define splay_tree_c
++#include "splay-tree.h"
+--- libgomp/openacc.f90.jj 2016-07-13 16:57:04.434535373 +0200
++++ libgomp/openacc.f90 2016-07-14 19:01:54.901230875 +0200
+@@ -0,0 +1,911 @@
++! OpenACC Runtime Library Definitions.
++
++! Copyright (C) 2014-2016 Free Software Foundation, Inc.
++
++! Contributed by Tobias Burnus <burnus@net-b.de>
++! and Mentor Embedded.
++
++! This file is part of the GNU Offloading and Multi Processing Library
++! (libgomp).
++
++! Libgomp is free software; you can redistribute it and/or modify it
++! under the terms of the GNU General Public License as published by
++! the Free Software Foundation; either version 3, or (at your option)
++! any later version.
++
++! Libgomp is distributed in the hope that it will be useful, but WITHOUT ANY
++! WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
++! FOR A PARTICULAR PURPOSE. See the GNU General Public License for
++! more details.
++
++! Under Section 7 of GPL version 3, you are granted additional
++! permissions described in the GCC Runtime Library Exception, version
++! 3.1, as published by the Free Software Foundation.
++
++! You should have received a copy of the GNU General Public License and
++! a copy of the GCC Runtime Library Exception along with this program;
++! see the files COPYING3 and COPYING.RUNTIME respectively. If not, see
++! <http://www.gnu.org/licenses/>.
++
++module openacc_kinds
++ use iso_fortran_env, only: int32
++ implicit none
++
++ private :: int32
++ public :: acc_device_kind
++
++ integer, parameter :: acc_device_kind = int32
++
++ public :: acc_device_none, acc_device_default, acc_device_host
++ public :: acc_device_not_host, acc_device_nvidia
++
++ ! Keep in sync with include/gomp-constants.h.
++ integer (acc_device_kind), parameter :: acc_device_none = 0
++ integer (acc_device_kind), parameter :: acc_device_default = 1
++ integer (acc_device_kind), parameter :: acc_device_host = 2
++ ! Removed: integer (acc_device_kind), parameter :: acc_device_host_nonshm = 3
++ integer (acc_device_kind), parameter :: acc_device_not_host = 4
++ integer (acc_device_kind), parameter :: acc_device_nvidia = 5
++
++ public :: acc_handle_kind
++
++ integer, parameter :: acc_handle_kind = int32
++
++ public :: acc_async_noval, acc_async_sync
++
++ ! Keep in sync with include/gomp-constants.h.
++ integer (acc_handle_kind), parameter :: acc_async_noval = -1
++ integer (acc_handle_kind), parameter :: acc_async_sync = -2
++
++end module
++
++module openacc_internal
++ use openacc_kinds
++ implicit none
++
++ interface
++ function acc_get_num_devices_h (d)
++ import
++ integer acc_get_num_devices_h
++ integer (acc_device_kind) d
++ end function
++
++ subroutine acc_set_device_type_h (d)
++ import
++ integer (acc_device_kind) d
++ end subroutine
++
++ function acc_get_device_type_h ()
++ import
++ integer (acc_device_kind) acc_get_device_type_h
++ end function
++
++ subroutine acc_set_device_num_h (n, d)
++ import
++ integer n
++ integer (acc_device_kind) d
++ end subroutine
++
++ function acc_get_device_num_h (d)
++ import
++ integer acc_get_device_num_h
++ integer (acc_device_kind) d
++ end function
++
++ function acc_async_test_h (a)
++ logical acc_async_test_h
++ integer a
++ end function
++
++ function acc_async_test_all_h ()
++ logical acc_async_test_all_h
++ end function
++
++ subroutine acc_wait_h (a)
++ integer a
++ end subroutine
++
++ subroutine acc_wait_async_h (a1, a2)
++ integer a1, a2
++ end subroutine
++
++ subroutine acc_wait_all_h ()
++ end subroutine
++
++ subroutine acc_wait_all_async_h (a)
++ integer a
++ end subroutine
++
++ subroutine acc_init_h (d)
++ import
++ integer (acc_device_kind) d
++ end subroutine
++
++ subroutine acc_shutdown_h (d)
++ import
++ integer (acc_device_kind) d
++ end subroutine
++
++ function acc_on_device_h (d)
++ import
++ integer (acc_device_kind) d
++ logical acc_on_device_h
++ end function
++
++ subroutine acc_copyin_32_h (a, len)
++ use iso_c_binding, only: c_int32_t
++ type (*), dimension (*) :: a
++ integer (c_int32_t) len
++ end subroutine
++
++ subroutine acc_copyin_64_h (a, len)
++ use iso_c_binding, only: c_int64_t
++ type (*), dimension (*) :: a
++ integer (c_int64_t) len
++ end subroutine
++
++ subroutine acc_copyin_array_h (a)
++ type (*), dimension (..), contiguous :: a
++ end subroutine
++
++ subroutine acc_present_or_copyin_32_h (a, len)
++ use iso_c_binding, only: c_int32_t
++ type (*), dimension (*) :: a
++ integer (c_int32_t) len
++ end subroutine
++
++ subroutine acc_present_or_copyin_64_h (a, len)
++ use iso_c_binding, only: c_int64_t
++ type (*), dimension (*) :: a
++ integer (c_int64_t) len
++ end subroutine
++
++ subroutine acc_present_or_copyin_array_h (a)
++ type (*), dimension (..), contiguous :: a
++ end subroutine
++
++ subroutine acc_create_32_h (a, len)
++ use iso_c_binding, only: c_int32_t
++ type (*), dimension (*) :: a
++ integer (c_int32_t) len
++ end subroutine
++
++ subroutine acc_create_64_h (a, len)
++ use iso_c_binding, only: c_int64_t
++ type (*), dimension (*) :: a
++ integer (c_int64_t) len
++ end subroutine
++
++ subroutine acc_create_array_h (a)
++ type (*), dimension (..), contiguous :: a
++ end subroutine
++
++ subroutine acc_present_or_create_32_h (a, len)
++ use iso_c_binding, only: c_int32_t
++ type (*), dimension (*) :: a
++ integer (c_int32_t) len
++ end subroutine
++
++ subroutine acc_present_or_create_64_h (a, len)
++ use iso_c_binding, only: c_int64_t
++ type (*), dimension (*) :: a
++ integer (c_int64_t) len
++ end subroutine
++
++ subroutine acc_present_or_create_array_h (a)
++ type (*), dimension (..), contiguous :: a
++ end subroutine
++
++ subroutine acc_copyout_32_h (a, len)
++ use iso_c_binding, only: c_int32_t
++ type (*), dimension (*) :: a
++ integer (c_int32_t) len
++ end subroutine
++
++ subroutine acc_copyout_64_h (a, len)
++ use iso_c_binding, only: c_int64_t
++ type (*), dimension (*) :: a
++ integer (c_int64_t) len
++ end subroutine
++
++ subroutine acc_copyout_array_h (a)
++ type (*), dimension (..), contiguous :: a
++ end subroutine
++
++ subroutine acc_delete_32_h (a, len)
++ use iso_c_binding, only: c_int32_t
++ type (*), dimension (*) :: a
++ integer (c_int32_t) len
++ end subroutine
++
++ subroutine acc_delete_64_h (a, len)
++ use iso_c_binding, only: c_int64_t
++ type (*), dimension (*) :: a
++ integer (c_int64_t) len
++ end subroutine
++
++ subroutine acc_delete_array_h (a)
++ type (*), dimension (..), contiguous :: a
++ end subroutine
++
++ subroutine acc_update_device_32_h (a, len)
++ use iso_c_binding, only: c_int32_t
++ type (*), dimension (*) :: a
++ integer (c_int32_t) len
++ end subroutine
++
++ subroutine acc_update_device_64_h (a, len)
++ use iso_c_binding, only: c_int64_t
++ type (*), dimension (*) :: a
++ integer (c_int64_t) len
++ end subroutine
++
++ subroutine acc_update_device_array_h (a)
++ type (*), dimension (..), contiguous :: a
++ end subroutine
++
++ subroutine acc_update_self_32_h (a, len)
++ use iso_c_binding, only: c_int32_t
++ type (*), dimension (*) :: a
++ integer (c_int32_t) len
++ end subroutine
++
++ subroutine acc_update_self_64_h (a, len)
++ use iso_c_binding, only: c_int64_t
++ type (*), dimension (*) :: a
++ integer (c_int64_t) len
++ end subroutine
++
++ subroutine acc_update_self_array_h (a)
++ type (*), dimension (..), contiguous :: a
++ end subroutine
++
++ function acc_is_present_32_h (a, len)
++ use iso_c_binding, only: c_int32_t
++ logical acc_is_present_32_h
++ type (*), dimension (*) :: a
++ integer (c_int32_t) len
++ end function
++
++ function acc_is_present_64_h (a, len)
++ use iso_c_binding, only: c_int64_t
++ logical acc_is_present_64_h
++ type (*), dimension (*) :: a
++ integer (c_int64_t) len
++ end function
++
++ function acc_is_present_array_h (a)
++ logical acc_is_present_array_h
++ type (*), dimension (..), contiguous :: a
++ end function
++ end interface
++
++ interface
++ function acc_get_num_devices_l (d) &
++ bind (C, name = "acc_get_num_devices")
++ use iso_c_binding, only: c_int
++ integer (c_int) :: acc_get_num_devices_l
++ integer (c_int), value :: d
++ end function
++
++ subroutine acc_set_device_type_l (d) &
++ bind (C, name = "acc_set_device_type")
++ use iso_c_binding, only: c_int
++ integer (c_int), value :: d
++ end subroutine
++
++ function acc_get_device_type_l () &
++ bind (C, name = "acc_get_device_type")
++ use iso_c_binding, only: c_int
++ integer (c_int) :: acc_get_device_type_l
++ end function
++
++ subroutine acc_set_device_num_l (n, d) &
++ bind (C, name = "acc_set_device_num")
++ use iso_c_binding, only: c_int
++ integer (c_int), value :: n, d
++ end subroutine
++
++ function acc_get_device_num_l (d) &
++ bind (C, name = "acc_get_device_num")
++ use iso_c_binding, only: c_int
++ integer (c_int) :: acc_get_device_num_l
++ integer (c_int), value :: d
++ end function
++
++ function acc_async_test_l (a) &
++ bind (C, name = "acc_async_test")
++ use iso_c_binding, only: c_int
++ integer (c_int) :: acc_async_test_l
++ integer (c_int), value :: a
++ end function
++
++ function acc_async_test_all_l () &
++ bind (C, name = "acc_async_test_all")
++ use iso_c_binding, only: c_int
++ integer (c_int) :: acc_async_test_all_l
++ end function
++
++ subroutine acc_wait_l (a) &
++ bind (C, name = "acc_wait")
++ use iso_c_binding, only: c_int
++ integer (c_int), value :: a
++ end subroutine
++
++ subroutine acc_wait_async_l (a1, a2) &
++ bind (C, name = "acc_wait_async")
++ use iso_c_binding, only: c_int
++ integer (c_int), value :: a1, a2
++ end subroutine
++
++ subroutine acc_wait_all_l () &
++ bind (C, name = "acc_wait_all")
++ use iso_c_binding, only: c_int
++ end subroutine
++
++ subroutine acc_wait_all_async_l (a) &
++ bind (C, name = "acc_wait_all_async")
++ use iso_c_binding, only: c_int
++ integer (c_int), value :: a
++ end subroutine
++
++ subroutine acc_init_l (d) &
++ bind (C, name = "acc_init")
++ use iso_c_binding, only: c_int
++ integer (c_int), value :: d
++ end subroutine
++
++ subroutine acc_shutdown_l (d) &
++ bind (C, name = "acc_shutdown")
++ use iso_c_binding, only: c_int
++ integer (c_int), value :: d
++ end subroutine
++
++ function acc_on_device_l (d) &
++ bind (C, name = "acc_on_device")
++ use iso_c_binding, only: c_int
++ integer (c_int) :: acc_on_device_l
++ integer (c_int), value :: d
++ end function
++
++ subroutine acc_copyin_l (a, len) &
++ bind (C, name = "acc_copyin")
++ use iso_c_binding, only: c_size_t
++ type (*), dimension (*) :: a
++ integer (c_size_t), value :: len
++ end subroutine
++
++ subroutine acc_present_or_copyin_l (a, len) &
++ bind (C, name = "acc_present_or_copyin")
++ use iso_c_binding, only: c_size_t
++ type (*), dimension (*) :: a
++ integer (c_size_t), value :: len
++ end subroutine
++
++ subroutine acc_create_l (a, len) &
++ bind (C, name = "acc_create")
++ use iso_c_binding, only: c_size_t
++ type (*), dimension (*) :: a
++ integer (c_size_t), value :: len
++ end subroutine
++
++ subroutine acc_present_or_create_l (a, len) &
++ bind (C, name = "acc_present_or_create")
++ use iso_c_binding, only: c_size_t
++ type (*), dimension (*) :: a
++ integer (c_size_t), value :: len
++ end subroutine
++
++ subroutine acc_copyout_l (a, len) &
++ bind (C, name = "acc_copyout")
++ use iso_c_binding, only: c_size_t
++ type (*), dimension (*) :: a
++ integer (c_size_t), value :: len
++ end subroutine
++
++ subroutine acc_delete_l (a, len) &
++ bind (C, name = "acc_delete")
++ use iso_c_binding, only: c_size_t
++ type (*), dimension (*) :: a
++ integer (c_size_t), value :: len
++ end subroutine
++
++ subroutine acc_update_device_l (a, len) &
++ bind (C, name = "acc_update_device")
++ use iso_c_binding, only: c_size_t
++ type (*), dimension (*) :: a
++ integer (c_size_t), value :: len
++ end subroutine
++
++ subroutine acc_update_self_l (a, len) &
++ bind (C, name = "acc_update_self")
++ use iso_c_binding, only: c_size_t
++ type (*), dimension (*) :: a
++ integer (c_size_t), value :: len
++ end subroutine
++
++ function acc_is_present_l (a, len) &
++ bind (C, name = "acc_is_present")
++ use iso_c_binding, only: c_int32_t, c_size_t
++ integer (c_int32_t) :: acc_is_present_l
++ type (*), dimension (*) :: a
++ integer (c_size_t), value :: len
++ end function
++ end interface
++end module
++
++module openacc
++ use openacc_kinds
++ use openacc_internal
++ implicit none
++
++ public :: openacc_version
++
++ public :: acc_get_num_devices, acc_set_device_type, acc_get_device_type
++ public :: acc_set_device_num, acc_get_device_num, acc_async_test
++ public :: acc_async_test_all, acc_wait, acc_wait_async, acc_wait_all
++ public :: acc_wait_all_async, acc_init, acc_shutdown, acc_on_device
++ public :: acc_copyin, acc_present_or_copyin, acc_pcopyin, acc_create
++ public :: acc_present_or_create, acc_pcreate, acc_copyout, acc_delete
++ public :: acc_update_device, acc_update_self, acc_is_present
++
++ integer, parameter :: openacc_version = 201306
++
++ interface acc_get_num_devices
++ procedure :: acc_get_num_devices_h
++ end interface
++
++ interface acc_set_device_type
++ procedure :: acc_set_device_type_h
++ end interface
++
++ interface acc_get_device_type
++ procedure :: acc_get_device_type_h
++ end interface
++
++ interface acc_set_device_num
++ procedure :: acc_set_device_num_h
++ end interface
++
++ interface acc_get_device_num
++ procedure :: acc_get_device_num_h
++ end interface
++
++ interface acc_async_test
++ procedure :: acc_async_test_h
++ end interface
++
++ interface acc_async_test_all
++ procedure :: acc_async_test_all_h
++ end interface
++
++ interface acc_wait
++ procedure :: acc_wait_h
++ end interface
++
++ interface acc_wait_async
++ procedure :: acc_wait_async_h
++ end interface
++
++ interface acc_wait_all
++ procedure :: acc_wait_all_h
++ end interface
++
++ interface acc_wait_all_async
++ procedure :: acc_wait_all_async_h
++ end interface
++
++ interface acc_init
++ procedure :: acc_init_h
++ end interface
++
++ interface acc_shutdown
++ procedure :: acc_shutdown_h
++ end interface
++
++ interface acc_on_device
++ procedure :: acc_on_device_h
++ end interface
++
++ ! acc_malloc: Only available in C/C++
++ ! acc_free: Only available in C/C++
++
++ ! As a vendor extension, the following code supports both 32-bit and
++ ! 64-bit arguments for "size"; the OpenACC standard only permits
++ ! default-kind integers, which are of kind 4 (i.e. 32 bits).
++ ! Additionally, the two-argument versions also take arrays as
++ ! arguments, and the one-argument versions also accept scalars. Note
++ ! that the code assumes that the arrays are contiguous.
++
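++ ! For illustration only (hypothetical variables; this assumes the
++ ! default integer kind is 4, as on typical targets), the generic
++ ! interfaces below resolve as:
++ !
++ !   real :: buf(1024)
++ !   call acc_copyin (buf, 4096)    ! kind-4 size -> acc_copyin_32_h
++ !   call acc_copyin (buf, 4096_8)  ! kind-8 size -> acc_copyin_64_h
++ !   call acc_copyin (buf)          ! whole array -> acc_copyin_array_h
++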
++ interface acc_copyin
++ procedure :: acc_copyin_32_h
++ procedure :: acc_copyin_64_h
++ procedure :: acc_copyin_array_h
++ end interface
++
++ interface acc_present_or_copyin
++ procedure :: acc_present_or_copyin_32_h
++ procedure :: acc_present_or_copyin_64_h
++ procedure :: acc_present_or_copyin_array_h
++ end interface
++
++ interface acc_pcopyin
++ procedure :: acc_present_or_copyin_32_h
++ procedure :: acc_present_or_copyin_64_h
++ procedure :: acc_present_or_copyin_array_h
++ end interface
++
++ interface acc_create
++ procedure :: acc_create_32_h
++ procedure :: acc_create_64_h
++ procedure :: acc_create_array_h
++ end interface
++
++ interface acc_present_or_create
++ procedure :: acc_present_or_create_32_h
++ procedure :: acc_present_or_create_64_h
++ procedure :: acc_present_or_create_array_h
++ end interface
++
++ interface acc_pcreate
++ procedure :: acc_present_or_create_32_h
++ procedure :: acc_present_or_create_64_h
++ procedure :: acc_present_or_create_array_h
++ end interface
++
++ interface acc_copyout
++ procedure :: acc_copyout_32_h
++ procedure :: acc_copyout_64_h
++ procedure :: acc_copyout_array_h
++ end interface
++
++ interface acc_delete
++ procedure :: acc_delete_32_h
++ procedure :: acc_delete_64_h
++ procedure :: acc_delete_array_h
++ end interface
++
++ interface acc_update_device
++ procedure :: acc_update_device_32_h
++ procedure :: acc_update_device_64_h
++ procedure :: acc_update_device_array_h
++ end interface
++
++ interface acc_update_self
++ procedure :: acc_update_self_32_h
++ procedure :: acc_update_self_64_h
++ procedure :: acc_update_self_array_h
++ end interface
++
++ ! acc_map_data: Only available in C/C++
++ ! acc_unmap_data: Only available in C/C++
++ ! acc_deviceptr: Only available in C/C++
++ ! acc_hostptr: Only available in C/C++
++
++ interface acc_is_present
++ procedure :: acc_is_present_32_h
++ procedure :: acc_is_present_64_h
++ procedure :: acc_is_present_array_h
++ end interface
++
++ ! acc_memcpy_to_device: Only available in C/C++
++ ! acc_memcpy_from_device: Only available in C/C++
++
++end module
++
++function acc_get_num_devices_h (d)
++ use openacc_internal, only: acc_get_num_devices_l
++ use openacc_kinds
++ integer acc_get_num_devices_h
++ integer (acc_device_kind) d
++ acc_get_num_devices_h = acc_get_num_devices_l (d)
++end function
++
++subroutine acc_set_device_type_h (d)
++ use openacc_internal, only: acc_set_device_type_l
++ use openacc_kinds
++ integer (acc_device_kind) d
++ call acc_set_device_type_l (d)
++end subroutine
++
++function acc_get_device_type_h ()
++ use openacc_internal, only: acc_get_device_type_l
++ use openacc_kinds
++ integer (acc_device_kind) acc_get_device_type_h
++ acc_get_device_type_h = acc_get_device_type_l ()
++end function
++
++subroutine acc_set_device_num_h (n, d)
++ use openacc_internal, only: acc_set_device_num_l
++ use openacc_kinds
++ integer n
++ integer (acc_device_kind) d
++ call acc_set_device_num_l (n, d)
++end subroutine
++
++function acc_get_device_num_h (d)
++ use openacc_internal, only: acc_get_device_num_l
++ use openacc_kinds
++ integer acc_get_device_num_h
++ integer (acc_device_kind) d
++ acc_get_device_num_h = acc_get_device_num_l (d)
++end function
++
++function acc_async_test_h (a)
++ use openacc_internal, only: acc_async_test_l
++ logical acc_async_test_h
++ integer a
++ if (acc_async_test_l (a) .eq. 1) then
++ acc_async_test_h = .TRUE.
++ else
++ acc_async_test_h = .FALSE.
++ end if
++end function
++
++function acc_async_test_all_h ()
++ use openacc_internal, only: acc_async_test_all_l
++ logical acc_async_test_all_h
++ if (acc_async_test_all_l () .eq. 1) then
++ acc_async_test_all_h = .TRUE.
++ else
++ acc_async_test_all_h = .FALSE.
++ end if
++end function
++
++subroutine acc_wait_h (a)
++ use openacc_internal, only: acc_wait_l
++ integer a
++ call acc_wait_l (a)
++end subroutine
++
++subroutine acc_wait_async_h (a1, a2)
++ use openacc_internal, only: acc_wait_async_l
++ integer a1, a2
++ call acc_wait_async_l (a1, a2)
++end subroutine
++
++subroutine acc_wait_all_h ()
++ use openacc_internal, only: acc_wait_all_l
++ call acc_wait_all_l ()
++end subroutine
++
++subroutine acc_wait_all_async_h (a)
++ use openacc_internal, only: acc_wait_all_async_l
++ integer a
++ call acc_wait_all_async_l (a)
++end subroutine
++
++subroutine acc_init_h (d)
++ use openacc_internal, only: acc_init_l
++ use openacc_kinds
++ integer (acc_device_kind) d
++ call acc_init_l (d)
++end subroutine
++
++subroutine acc_shutdown_h (d)
++ use openacc_internal, only: acc_shutdown_l
++ use openacc_kinds
++ integer (acc_device_kind) d
++ call acc_shutdown_l (d)
++end subroutine
++
++function acc_on_device_h (d)
++ use openacc_internal, only: acc_on_device_l
++ use openacc_kinds
++ integer (acc_device_kind) d
++ logical acc_on_device_h
++ if (acc_on_device_l (d) .eq. 1) then
++ acc_on_device_h = .TRUE.
++ else
++ acc_on_device_h = .FALSE.
++ end if
++end function
++
++subroutine acc_copyin_32_h (a, len)
++ use iso_c_binding, only: c_int32_t, c_size_t
++ use openacc_internal, only: acc_copyin_l
++ type (*), dimension (*) :: a
++ integer (c_int32_t) len
++ call acc_copyin_l (a, int (len, kind = c_size_t))
++end subroutine
++
++subroutine acc_copyin_64_h (a, len)
++ use iso_c_binding, only: c_int64_t, c_size_t
++ use openacc_internal, only: acc_copyin_l
++ type (*), dimension (*) :: a
++ integer (c_int64_t) len
++ call acc_copyin_l (a, int (len, kind = c_size_t))
++end subroutine
++
++subroutine acc_copyin_array_h (a)
++ use openacc_internal, only: acc_copyin_l
++ type (*), dimension (..), contiguous :: a
++ call acc_copyin_l (a, sizeof (a))
++end subroutine
++
++subroutine acc_present_or_copyin_32_h (a, len)
++ use iso_c_binding, only: c_int32_t, c_size_t
++ use openacc_internal, only: acc_present_or_copyin_l
++ type (*), dimension (*) :: a
++ integer (c_int32_t) len
++ call acc_present_or_copyin_l (a, int (len, kind = c_size_t))
++end subroutine
++
++subroutine acc_present_or_copyin_64_h (a, len)
++ use iso_c_binding, only: c_int64_t, c_size_t
++ use openacc_internal, only: acc_present_or_copyin_l
++ type (*), dimension (*) :: a
++ integer (c_int64_t) len
++ call acc_present_or_copyin_l (a, int (len, kind = c_size_t))
++end subroutine
++
++subroutine acc_present_or_copyin_array_h (a)
++ use openacc_internal, only: acc_present_or_copyin_l
++ type (*), dimension (..), contiguous :: a
++ call acc_present_or_copyin_l (a, sizeof (a))
++end subroutine
++
++subroutine acc_create_32_h (a, len)
++ use iso_c_binding, only: c_int32_t, c_size_t
++ use openacc_internal, only: acc_create_l
++ type (*), dimension (*) :: a
++ integer (c_int32_t) len
++ call acc_create_l (a, int (len, kind = c_size_t))
++end subroutine
++
++subroutine acc_create_64_h (a, len)
++ use iso_c_binding, only: c_int64_t, c_size_t
++ use openacc_internal, only: acc_create_l
++ type (*), dimension (*) :: a
++ integer (c_int64_t) len
++ call acc_create_l (a, int (len, kind = c_size_t))
++end subroutine
++
++subroutine acc_create_array_h (a)
++ use openacc_internal, only: acc_create_l
++ type (*), dimension (..), contiguous :: a
++ call acc_create_l (a, sizeof (a))
++end subroutine
++
++subroutine acc_present_or_create_32_h (a, len)
++ use iso_c_binding, only: c_int32_t, c_size_t
++ use openacc_internal, only: acc_present_or_create_l
++ type (*), dimension (*) :: a
++ integer (c_int32_t) len
++ call acc_present_or_create_l (a, int (len, kind = c_size_t))
++end subroutine
++
++subroutine acc_present_or_create_64_h (a, len)
++ use iso_c_binding, only: c_int64_t, c_size_t
++ use openacc_internal, only: acc_present_or_create_l
++ type (*), dimension (*) :: a
++ integer (c_int64_t) len
++ call acc_present_or_create_l (a, int (len, kind = c_size_t))
++end subroutine
++
++subroutine acc_present_or_create_array_h (a)
++ use openacc_internal, only: acc_present_or_create_l
++ type (*), dimension (..), contiguous :: a
++ call acc_present_or_create_l (a, sizeof (a))
++end subroutine
++
++subroutine acc_copyout_32_h (a, len)
++ use iso_c_binding, only: c_int32_t, c_size_t
++ use openacc_internal, only: acc_copyout_l
++ type (*), dimension (*) :: a
++ integer (c_int32_t) len
++ call acc_copyout_l (a, int (len, kind = c_size_t))
++end subroutine
++
++subroutine acc_copyout_64_h (a, len)
++ use iso_c_binding, only: c_int64_t, c_size_t
++ use openacc_internal, only: acc_copyout_l
++ type (*), dimension (*) :: a
++ integer (c_int64_t) len
++ call acc_copyout_l (a, int (len, kind = c_size_t))
++end subroutine
++
++subroutine acc_copyout_array_h (a)
++ use openacc_internal, only: acc_copyout_l
++ type (*), dimension (..), contiguous :: a
++ call acc_copyout_l (a, sizeof (a))
++end subroutine
++
++subroutine acc_delete_32_h (a, len)
++ use iso_c_binding, only: c_int32_t, c_size_t
++ use openacc_internal, only: acc_delete_l
++ type (*), dimension (*) :: a
++ integer (c_int32_t) len
++ call acc_delete_l (a, int (len, kind = c_size_t))
++end subroutine
++
++subroutine acc_delete_64_h (a, len)
++ use iso_c_binding, only: c_int64_t, c_size_t
++ use openacc_internal, only: acc_delete_l
++ type (*), dimension (*) :: a
++ integer (c_int64_t) len
++ call acc_delete_l (a, int (len, kind = c_size_t))
++end subroutine
++
++subroutine acc_delete_array_h (a)
++ use openacc_internal, only: acc_delete_l
++ type (*), dimension (..), contiguous :: a
++ call acc_delete_l (a, sizeof (a))
++end subroutine
++
++subroutine acc_update_device_32_h (a, len)
++ use iso_c_binding, only: c_int32_t, c_size_t
++ use openacc_internal, only: acc_update_device_l
++ type (*), dimension (*) :: a
++ integer (c_int32_t) len
++ call acc_update_device_l (a, int (len, kind = c_size_t))
++end subroutine
++
++subroutine acc_update_device_64_h (a, len)
++ use iso_c_binding, only: c_int64_t, c_size_t
++ use openacc_internal, only: acc_update_device_l
++ type (*), dimension (*) :: a
++ integer (c_int64_t) len
++ call acc_update_device_l (a, int (len, kind = c_size_t))
++end subroutine
++
++subroutine acc_update_device_array_h (a)
++ use openacc_internal, only: acc_update_device_l
++ type (*), dimension (..), contiguous :: a
++ call acc_update_device_l (a, sizeof (a))
++end subroutine
++
++subroutine acc_update_self_32_h (a, len)
++ use iso_c_binding, only: c_int32_t, c_size_t
++ use openacc_internal, only: acc_update_self_l
++ type (*), dimension (*) :: a
++ integer (c_int32_t) len
++ call acc_update_self_l (a, int (len, kind = c_size_t))
++end subroutine
++
++subroutine acc_update_self_64_h (a, len)
++ use iso_c_binding, only: c_int64_t, c_size_t
++ use openacc_internal, only: acc_update_self_l
++ type (*), dimension (*) :: a
++ integer (c_int64_t) len
++ call acc_update_self_l (a, int (len, kind = c_size_t))
++end subroutine
++
++subroutine acc_update_self_array_h (a)
++ use openacc_internal, only: acc_update_self_l
++ type (*), dimension (..), contiguous :: a
++ call acc_update_self_l (a, sizeof (a))
++end subroutine
++
++function acc_is_present_32_h (a, len)
++ use iso_c_binding, only: c_int32_t, c_size_t
++ use openacc_internal, only: acc_is_present_l
++ logical acc_is_present_32_h
++ type (*), dimension (*) :: a
++ integer (c_int32_t) len
++ if (acc_is_present_l (a, int (len, kind = c_size_t)) .eq. 1) then
++ acc_is_present_32_h = .TRUE.
++ else
++ acc_is_present_32_h = .FALSE.
++ end if
++end function
++
++function acc_is_present_64_h (a, len)
++ use iso_c_binding, only: c_int64_t, c_size_t
++ use openacc_internal, only: acc_is_present_l
++ logical acc_is_present_64_h
++ type (*), dimension (*) :: a
++ integer (c_int64_t) len
++ if (acc_is_present_l (a, int (len, kind = c_size_t)) .eq. 1) then
++ acc_is_present_64_h = .TRUE.
++ else
++ acc_is_present_64_h = .FALSE.
++ end if
++end function
++
++function acc_is_present_array_h (a)
++ use openacc_internal, only: acc_is_present_l
++ logical acc_is_present_array_h
++ type (*), dimension (..), contiguous :: a
++ acc_is_present_array_h = acc_is_present_l (a, sizeof (a)) == 1
++end function