Add tests to proposed new builtin async_copy functions with a bug fix. (#725)

* Add tests to proposed new builtin async_copy functions with a bug fix.

* Revert "Add tests to proposed new builtin async_copy functions with a bug fix."

This reverts commit 7d0f16d014d228c327daf27464b27e02267f9aef.

* Add tests to proposed new builtin async_copy functions.

* Added is_extension_available to check if an extension is available.

* Added is extension available for test_async_copy_fence.

* fix build issues on windows.

* include algorithms.h for async copy 2D/3D.

* adding algorithms header.

* Fix numLines - 1 in maxTotalPlanesIn/Out.

* fix formatting violations.

* fixed formatting issue.
diff --git a/test_conformance/basic/CMakeLists.txt b/test_conformance/basic/CMakeLists.txt
index 2717824..c5c4b5f 100644
--- a/test_conformance/basic/CMakeLists.txt
+++ b/test_conformance/basic/CMakeLists.txt
@@ -37,6 +37,9 @@
     test_work_item_functions.cpp
     test_astype.cpp
     test_async_copy.cpp
+    test_async_copy2D.cpp
+    test_async_copy3D.cpp
+    test_async_copy_fence.cpp	
     test_sizeof.cpp
     test_vector_creation.cpp
     test_vector_swizzle.cpp
diff --git a/test_conformance/basic/main.cpp b/test_conformance/basic/main.cpp
index d1a35fa..86c3cec 100644
--- a/test_conformance/basic/main.cpp
+++ b/test_conformance/basic/main.cpp
@@ -113,14 +113,24 @@
     ADD_TEST(async_copy_local_to_global),
     ADD_TEST(async_strided_copy_global_to_local),
     ADD_TEST(async_strided_copy_local_to_global),
+    ADD_TEST(async_copy_global_to_local2D),
+    ADD_TEST(async_copy_local_to_global2D),
+    ADD_TEST(async_copy_global_to_local3D),
+    ADD_TEST(async_copy_local_to_global3D),
+    ADD_TEST(async_work_group_copy_fence_import_after_export_aliased_local),
+    ADD_TEST(async_work_group_copy_fence_import_after_export_aliased_global),
+    ADD_TEST(
+        async_work_group_copy_fence_import_after_export_aliased_global_and_local),
+    ADD_TEST(async_work_group_copy_fence_export_after_import_aliased_local),
+    ADD_TEST(async_work_group_copy_fence_export_after_import_aliased_global),
+    ADD_TEST(
+        async_work_group_copy_fence_export_after_import_aliased_global_and_local),
     ADD_TEST(prefetch),
-
     ADD_TEST(kernel_call_kernel_function),
     ADD_TEST(host_numeric_constants),
     ADD_TEST(kernel_numeric_constants),
     ADD_TEST(kernel_limit_constants),
     ADD_TEST(kernel_preprocessor_macros),
-
     ADD_TEST(parameter_types),
     ADD_TEST(vector_creation),
     ADD_TEST(vector_swizzle),
diff --git a/test_conformance/basic/procs.h b/test_conformance/basic/procs.h
index bdb7d6a..4a01a8c 100644
--- a/test_conformance/basic/procs.h
+++ b/test_conformance/basic/procs.h
@@ -115,6 +115,42 @@
 extern int      test_async_copy_local_to_global(cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements);
 extern int      test_async_strided_copy_global_to_local(cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements);
 extern int      test_async_strided_copy_local_to_global(cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements);
+extern int test_async_copy_global_to_local2D(cl_device_id deviceID,
+                                             cl_context context,
+                                             cl_command_queue queue,
+                                             int num_elements);
+extern int test_async_copy_local_to_global2D(cl_device_id deviceID,
+                                             cl_context context,
+                                             cl_command_queue queue,
+                                             int num_elements);
+extern int test_async_copy_global_to_local3D(cl_device_id deviceID,
+                                             cl_context context,
+                                             cl_command_queue queue,
+                                             int num_elements);
+extern int test_async_copy_local_to_global3D(cl_device_id deviceID,
+                                             cl_context context,
+                                             cl_command_queue queue,
+                                             int num_elements);
+extern int test_async_work_group_copy_fence_import_after_export_aliased_local(
+    cl_device_id deviceID, cl_context context, cl_command_queue queue,
+    int num_elements);
+extern int test_async_work_group_copy_fence_import_after_export_aliased_global(
+    cl_device_id deviceID, cl_context context, cl_command_queue queue,
+    int num_elements);
+extern int
+test_async_work_group_copy_fence_import_after_export_aliased_global_and_local(
+    cl_device_id deviceID, cl_context context, cl_command_queue queue,
+    int num_elements);
+extern int test_async_work_group_copy_fence_export_after_import_aliased_local(
+    cl_device_id deviceID, cl_context context, cl_command_queue queue,
+    int num_elements);
+extern int test_async_work_group_copy_fence_export_after_import_aliased_global(
+    cl_device_id deviceID, cl_context context, cl_command_queue queue,
+    int num_elements);
+extern int
+test_async_work_group_copy_fence_export_after_import_aliased_global_and_local(
+    cl_device_id deviceID, cl_context context, cl_command_queue queue,
+    int num_elements);
 extern int      test_prefetch(cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements);
 
 extern int      test_host_numeric_constants(cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements);
diff --git a/test_conformance/basic/test_async_copy2D.cpp b/test_conformance/basic/test_async_copy2D.cpp
new file mode 100644
index 0000000..2b53449
--- /dev/null
+++ b/test_conformance/basic/test_async_copy2D.cpp
@@ -0,0 +1,449 @@
+//
+// Copyright (c) 2017 The Khronos Group Inc.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//    http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+//
+#include "../../test_common/harness/compat.h"
+
+#include <algorithm>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <sys/stat.h>
+#include <sys/types.h>
+
+#include "../../test_common/harness/conversions.h"
+#include "procs.h"
+
+static const char *async_global_to_local_kernel2D =
+    "#pragma OPENCL EXTENSION cl_khr_extended_async_copies : enable\n"
+    "%s\n" // optional pragma string
+    "__kernel void test_fn( const __global %s *src, __global %s *dst, __local "
+    "%s *localBuffer, int numElementsPerLine, int lineCopiesPerWorkgroup, int "
+    "lineCopiesPerWorkItem, int srcStride, int dstStride )\n"
+    "{\n"
+    " int i, j;\n"
+    // Zero the local storage first
+    " for(i=0; i<lineCopiesPerWorkItem; i++)\n"
+    "   for(j=0; j<numElementsPerLine; j++)\n"
+    "     localBuffer[ (get_local_id( 0 "
+    ")*lineCopiesPerWorkItem+i)*(numElementsPerLine + dstStride)+j ] = "
+    "(%s)(%s)0;\n"
+    // Do this to verify all kernels are done zeroing the local buffer before we
+    // try the copy
+    "    barrier( CLK_LOCAL_MEM_FENCE );\n"
+    "    event_t event;\n"
+    "    event = async_work_group_copy_2D2D( (__local %s*)localBuffer, "
+    "(__global const "
+    "%s*)(src+lineCopiesPerWorkgroup*get_group_id(0)*(numElementsPerLine + "
+    "srcStride)), (size_t)numElementsPerLine, (size_t)lineCopiesPerWorkgroup, "
+    "srcStride, dstStride, 0 );\n"
+    // Wait for the copy to complete, then verify by manually copying to the
+    // dest
+    "     wait_group_events( 1, &event );\n"
+    " for(i=0; i<lineCopiesPerWorkItem; i++)\n"
+    "   for(j=0; j<numElementsPerLine; j++)\n"
+    "     dst[ (get_global_id( 0 "
+    ")*lineCopiesPerWorkItem+i)*(numElementsPerLine + dstStride)+j ] = "
+    "localBuffer[ (get_local_id( 0 "
+    ")*lineCopiesPerWorkItem+i)*(numElementsPerLine + dstStride)+j ];\n"
+    "}\n";
+
+static const char *async_local_to_global_kernel2D =
+    "#pragma OPENCL EXTENSION cl_khr_extended_async_copies : enable\n"
+    "%s\n" // optional pragma string
+    "__kernel void test_fn( const __global %s *src, __global %s *dst, __local "
+    "%s *localBuffer, int numElementsPerLine, int lineCopiesPerWorkgroup, int "
+    "lineCopiesPerWorkItem, int srcStride, int dstStride )\n"
+    "{\n"
+    " int i, j;\n"
+    // Zero the local storage first
+    " for(i=0; i<lineCopiesPerWorkItem; i++)\n"
+    "   for(j=0; j<numElementsPerLine; j++)\n"
+    "     localBuffer[ (get_local_id( 0 "
+    ")*lineCopiesPerWorkItem+i)*(numElementsPerLine + srcStride)+j ] = "
+    "(%s)(%s)0;\n"
+    // Do this to verify all kernels are done zeroing the local buffer before we
+    // try the copy
+    "    barrier( CLK_LOCAL_MEM_FENCE );\n"
+    " for(i=0; i<lineCopiesPerWorkItem; i++)\n"
+    "   for(j=0; j<numElementsPerLine; j++)\n"
+    "     localBuffer[ (get_local_id( 0 "
+    ")*lineCopiesPerWorkItem+i)*(numElementsPerLine + srcStride)+j ] = src[ "
+    "(get_global_id( 0 )*lineCopiesPerWorkItem+i)*(numElementsPerLine + "
+    "srcStride)+j ];\n"
+    // Do this to verify all kernels are done copying to the local buffer before
+    // we try the copy
+    "    barrier( CLK_LOCAL_MEM_FENCE );\n"
+    "    event_t event;\n"
+    "    event = async_work_group_copy_2D2D((__global "
+    "%s*)(dst+lineCopiesPerWorkgroup*get_group_id(0)*(numElementsPerLine + "
+    "dstStride)), (__local const %s*)localBuffer, (size_t)numElementsPerLine, "
+    "(size_t)lineCopiesPerWorkgroup, srcStride, dstStride, 0 );\n"
+    "    wait_group_events( 1, &event );\n"
+    "}\n";
+
+int test_copy2D(cl_device_id deviceID, cl_context context,
+                cl_command_queue queue, const char *kernelCode,
+                ExplicitType vecType, int vecSize, int srcStride, int dstStride,
+                bool localIsDst)
+{
+    int error;
+    clProgramWrapper program;
+    clKernelWrapper kernel;
+    clMemWrapper streams[2];
+    size_t threads[1], localThreads[1];
+    void *inBuffer, *outBuffer, *outBufferCopy;
+    MTdata d;
+    char vecNameString[64];
+    vecNameString[0] = 0;
+    if (vecSize == 1)
+        sprintf(vecNameString, "%s", get_explicit_type_name(vecType));
+    else
+        sprintf(vecNameString, "%s%d", get_explicit_type_name(vecType),
+                vecSize);
+
+    size_t elementSize = get_explicit_type_size(vecType) * vecSize;
+    log_info("Testing %s with srcStride = %d, dstStride = %d\n", vecNameString,
+             srcStride, dstStride);
+
+    if (!is_extension_available(deviceID, "cl_khr_extended_async_copies"))
+    {
+        log_info(
+            "Device does not support extended async copies. Skipping test.\n");
+        return 0;
+    }
+
+    cl_long max_local_mem_size;
+    error =
+        clGetDeviceInfo(deviceID, CL_DEVICE_LOCAL_MEM_SIZE,
+                        sizeof(max_local_mem_size), &max_local_mem_size, NULL);
+    test_error(error, "clGetDeviceInfo for CL_DEVICE_LOCAL_MEM_SIZE failed.");
+
+    cl_long max_global_mem_size;
+    error = clGetDeviceInfo(deviceID, CL_DEVICE_GLOBAL_MEM_SIZE,
+                            sizeof(max_global_mem_size), &max_global_mem_size,
+                            NULL);
+    test_error(error, "clGetDeviceInfo for CL_DEVICE_GLOBAL_MEM_SIZE failed.");
+
+    cl_long max_alloc_size;
+    error = clGetDeviceInfo(deviceID, CL_DEVICE_MAX_MEM_ALLOC_SIZE,
+                            sizeof(max_alloc_size), &max_alloc_size, NULL);
+    test_error(error,
+               "clGetDeviceInfo for CL_DEVICE_MAX_MEM_ALLOC_SIZE failed.");
+
+    if (max_alloc_size > max_global_mem_size / 2)
+        max_alloc_size = max_global_mem_size / 2;
+
+    unsigned int num_of_compute_devices;
+    error = clGetDeviceInfo(deviceID, CL_DEVICE_MAX_COMPUTE_UNITS,
+                            sizeof(num_of_compute_devices),
+                            &num_of_compute_devices, NULL);
+    test_error(error,
+               "clGetDeviceInfo for CL_DEVICE_MAX_COMPUTE_UNITS failed.");
+
+    char programSource[4096];
+    programSource[0] = 0;
+    char *programPtr;
+
+    sprintf(programSource, kernelCode,
+            vecType == kDouble ? "#pragma OPENCL EXTENSION cl_khr_fp64 : enable"
+                               : "",
+            vecNameString, vecNameString, vecNameString, vecNameString,
+            get_explicit_type_name(vecType), vecNameString, vecNameString);
+    // log_info("program: %s\n", programSource);
+    programPtr = programSource;
+
+    error = create_single_kernel_helper(context, &program, &kernel, 1,
+                                        (const char **)&programPtr, "test_fn");
+    test_error(error, "Unable to create testing kernel");
+
+    size_t max_workgroup_size;
+    error = clGetKernelWorkGroupInfo(
+        kernel, deviceID, CL_KERNEL_WORK_GROUP_SIZE, sizeof(max_workgroup_size),
+        &max_workgroup_size, NULL);
+    test_error(
+        error,
+        "clGetKernelWorkGroupInfo failed for CL_KERNEL_WORK_GROUP_SIZE.");
+
+    size_t max_local_workgroup_size[3];
+    error = clGetDeviceInfo(deviceID, CL_DEVICE_MAX_WORK_ITEM_SIZES,
+                            sizeof(max_local_workgroup_size),
+                            max_local_workgroup_size, NULL);
+    test_error(error,
+               "clGetDeviceInfo failed for CL_DEVICE_MAX_WORK_ITEM_SIZES");
+
+    // Pick the minimum of the device and the kernel
+    if (max_workgroup_size > max_local_workgroup_size[0])
+        max_workgroup_size = max_local_workgroup_size[0];
+
+    size_t numElementsPerLine = 10;
+    size_t lineCopiesPerWorkItem = 13;
+    elementSize =
+        get_explicit_type_size(vecType) * ((vecSize == 3) ? 4 : vecSize);
+    size_t localStorageSpacePerWorkitem = lineCopiesPerWorkItem * elementSize
+        * (numElementsPerLine + (localIsDst ? dstStride : srcStride));
+    size_t maxLocalWorkgroupSize =
+        (((int)max_local_mem_size / 2) / localStorageSpacePerWorkitem);
+
+    // Calculation can return 0 on embedded devices due to 1KB local mem limit
+    if (maxLocalWorkgroupSize == 0)
+    {
+        maxLocalWorkgroupSize = 1;
+    }
+
+    size_t localWorkgroupSize = maxLocalWorkgroupSize;
+    if (maxLocalWorkgroupSize > max_workgroup_size)
+        localWorkgroupSize = max_workgroup_size;
+
+    size_t maxTotalLinesIn = (max_alloc_size / elementSize + srcStride)
+        / (numElementsPerLine + srcStride);
+    size_t maxTotalLinesOut = (max_alloc_size / elementSize + dstStride)
+        / (numElementsPerLine + dstStride);
+    size_t maxTotalLines = (std::min)(maxTotalLinesIn, maxTotalLinesOut);
+    size_t maxLocalWorkgroups =
+        maxTotalLines / (localWorkgroupSize * lineCopiesPerWorkItem);
+
+    size_t localBufferSize = localWorkgroupSize * localStorageSpacePerWorkitem
+        - (localIsDst ? dstStride : srcStride);
+    size_t numberOfLocalWorkgroups = (std::min)(1111, (int)maxLocalWorkgroups);
+    size_t totalLines =
+        numberOfLocalWorkgroups * localWorkgroupSize * lineCopiesPerWorkItem;
+    size_t inBufferSize = elementSize
+        * (totalLines * numElementsPerLine + (totalLines - 1) * srcStride);
+    size_t outBufferSize = elementSize
+        * (totalLines * numElementsPerLine + (totalLines - 1) * dstStride);
+    size_t globalWorkgroupSize = numberOfLocalWorkgroups * localWorkgroupSize;
+
+    inBuffer = (void *)malloc(inBufferSize);
+    outBuffer = (void *)malloc(outBufferSize);
+    outBufferCopy = (void *)malloc(outBufferSize);
+
+    cl_int lineCopiesPerWorkItemInt, numElementsPerLineInt,
+        lineCopiesPerWorkgroup;
+    lineCopiesPerWorkItemInt = (int)lineCopiesPerWorkItem;
+    numElementsPerLineInt = (int)numElementsPerLine;
+    lineCopiesPerWorkgroup = (int)(lineCopiesPerWorkItem * localWorkgroupSize);
+
+    log_info(
+        "Global: %d, local %d, local buffer %db, global in buffer %db, "
+        "global out buffer %db, each work group will copy %d lines and each "
+        "work item item will copy %d lines.\n",
+        (int)globalWorkgroupSize, (int)localWorkgroupSize, (int)localBufferSize,
+        (int)inBufferSize, (int)outBufferSize, lineCopiesPerWorkgroup,
+        lineCopiesPerWorkItemInt);
+
+    threads[0] = globalWorkgroupSize;
+    localThreads[0] = localWorkgroupSize;
+
+    d = init_genrand(gRandomSeed);
+    generate_random_data(
+        vecType, inBufferSize / get_explicit_type_size(vecType), d, inBuffer);
+    generate_random_data(
+        vecType, outBufferSize / get_explicit_type_size(vecType), d, outBuffer);
+    free_mtdata(d);
+    d = NULL;
+    memcpy(outBufferCopy, outBuffer, outBufferSize);
+
+    streams[0] = clCreateBuffer(context, CL_MEM_COPY_HOST_PTR, inBufferSize,
+                                inBuffer, &error);
+    test_error(error, "Unable to create input buffer");
+    streams[1] = clCreateBuffer(context, CL_MEM_COPY_HOST_PTR, outBufferSize,
+                                outBuffer, &error);
+    test_error(error, "Unable to create output buffer");
+
+    error = clSetKernelArg(kernel, 0, sizeof(streams[0]), &streams[0]);
+    test_error(error, "Unable to set kernel argument");
+    error = clSetKernelArg(kernel, 1, sizeof(streams[1]), &streams[1]);
+    test_error(error, "Unable to set kernel argument");
+    error = clSetKernelArg(kernel, 2, localBufferSize, NULL);
+    test_error(error, "Unable to set kernel argument");
+    error = clSetKernelArg(kernel, 3, sizeof(numElementsPerLineInt),
+                           &numElementsPerLineInt);
+    test_error(error, "Unable to set kernel argument");
+    error = clSetKernelArg(kernel, 4, sizeof(lineCopiesPerWorkgroup),
+                           &lineCopiesPerWorkgroup);
+    test_error(error, "Unable to set kernel argument");
+    error = clSetKernelArg(kernel, 5, sizeof(lineCopiesPerWorkItemInt),
+                           &lineCopiesPerWorkItemInt);
+    test_error(error, "Unable to set kernel argument");
+    error = clSetKernelArg(kernel, 6, sizeof(srcStride), &srcStride);
+    test_error(error, "Unable to set kernel argument");
+    error = clSetKernelArg(kernel, 7, sizeof(dstStride), &dstStride);
+    test_error(error, "Unable to set kernel argument");
+
+    // Enqueue
+    error = clEnqueueNDRangeKernel(queue, kernel, 1, NULL, threads,
+                                   localThreads, 0, NULL, NULL);
+    test_error(error, "Unable to queue kernel");
+
+    // Read
+    error = clEnqueueReadBuffer(queue, streams[1], CL_TRUE, 0, outBufferSize,
+                                outBuffer, 0, NULL, NULL);
+    test_error(error, "Unable to read results");
+
+    // Verify
+    int failuresPrinted = 0;
+    // Verify
+    size_t typeSize = get_explicit_type_size(vecType) * vecSize;
+    for (int i = 0;
+         i < (int)globalWorkgroupSize * lineCopiesPerWorkItem * elementSize;
+         i += elementSize)
+    {
+        for (int j = 0; j < (int)numElementsPerLine * elementSize;
+             j += elementSize)
+        {
+            int inIdx = i * (numElementsPerLine + srcStride) + j;
+            int outIdx = i * (numElementsPerLine + dstStride) + j;
+            if (memcmp(((char *)inBuffer) + inIdx, ((char *)outBuffer) + outIdx,
+                       typeSize)
+                != 0)
+            {
+                unsigned char *inchar = (unsigned char *)inBuffer + inIdx;
+                unsigned char *outchar = (unsigned char *)outBuffer + outIdx;
+                char values[4096];
+                values[0] = 0;
+
+                if (failuresPrinted == 0)
+                {
+                    // Print first failure message
+                    log_error("ERROR: Results of copy did not validate!\n");
+                }
+                sprintf(values + strlen(values), "%d -> [", inIdx);
+                for (int k = 0; k < (int)elementSize; k++)
+                    sprintf(values + strlen(values), "%2x ", inchar[k]);
+                sprintf(values + strlen(values), "] != [");
+                for (int k = 0; k < (int)elementSize; k++)
+                    sprintf(values + strlen(values), "%2x ", outchar[k]);
+                sprintf(values + strlen(values), "]");
+                log_error("%s\n", values);
+                failuresPrinted++;
+            }
+
+            if (failuresPrinted > 5)
+            {
+                log_error("Not printing further failures...\n");
+                return -1;
+            }
+        }
+        if (i < (int)(globalWorkgroupSize * lineCopiesPerWorkItem - 1)
+                * elementSize)
+        {
+            int outIdx = i * (numElementsPerLine + dstStride)
+                + numElementsPerLine * elementSize;
+            if (memcmp(((char *)outBuffer) + outIdx,
+                       ((char *)outBufferCopy) + outIdx,
+                       dstStride * elementSize)
+                != 0)
+            {
+                if (failuresPrinted == 0)
+                {
+                    // Print first failure message
+                    log_error("ERROR: Results of copy did not validate!\n");
+                }
+                log_error(
+                    "2D copy corrupted data in output buffer in the stride "
+                    "offset of line %d\n",
+                    i);
+                failuresPrinted++;
+            }
+            if (failuresPrinted > 5)
+            {
+                log_error("Not printing further failures...\n");
+                return -1;
+            }
+        }
+    }
+
+    free(inBuffer);
+    free(outBuffer);
+    free(outBufferCopy);
+
+    return failuresPrinted ? -1 : 0;
+}
+
+int test_copy2D_all_types(cl_device_id deviceID, cl_context context,
+                          cl_command_queue queue, const char *kernelCode,
+                          bool localIsDst)
+{
+    ExplicitType vecType[] = {
+        kChar,  kUChar, kShort,  kUShort,          kInt, kUInt, kLong,
+        kULong, kFloat, kDouble, kNumExplicitTypes
+    };
+    unsigned int vecSizes[] = { 1, 2, 3, 4, 8, 16, 0 };
+    unsigned int smallTypesStrideSizes[] = { 0, 10, 100 };
+    unsigned int size, typeIndex, srcStride, dstStride;
+
+    int errors = 0;
+
+    for (typeIndex = 0; vecType[typeIndex] != kNumExplicitTypes; typeIndex++)
+    {
+        if (vecType[typeIndex] == kDouble
+            && !is_extension_available(deviceID, "cl_khr_fp64"))
+            continue;
+
+        if ((vecType[typeIndex] == kLong || vecType[typeIndex] == kULong)
+            && !gHasLong)
+            continue;
+
+        for (size = 0; vecSizes[size] != 0; size++)
+        {
+            if (get_explicit_type_size(vecType[typeIndex]) * vecSizes[size]
+                <= 2) // small type
+            {
+                for (srcStride = 0; srcStride < sizeof(smallTypesStrideSizes)
+                         / sizeof(smallTypesStrideSizes[0]);
+                     srcStride++)
+                {
+                    for (dstStride = 0;
+                         dstStride < sizeof(smallTypesStrideSizes)
+                             / sizeof(smallTypesStrideSizes[0]);
+                         dstStride++)
+                    {
+                        if (test_copy2D(deviceID, context, queue, kernelCode,
+                                        vecType[typeIndex], vecSizes[size],
+                                        smallTypesStrideSizes[srcStride],
+                                        smallTypesStrideSizes[dstStride],
+                                        localIsDst))
+                        {
+                            errors++;
+                        }
+                    }
+                }
+            }
+            // not a small type, check only zero stride
+            else if (test_copy2D(deviceID, context, queue, kernelCode,
+                                 vecType[typeIndex], vecSizes[size], 0, 0,
+                                 localIsDst))
+            {
+                errors++;
+            }
+        }
+    }
+    if (errors) return -1;
+    return 0;
+}
+
+int test_async_copy_global_to_local2D(cl_device_id deviceID, cl_context context,
+                                      cl_command_queue queue, int num_elements)
+{
+    return test_copy2D_all_types(deviceID, context, queue,
+                                 async_global_to_local_kernel2D, true);
+}
+
+int test_async_copy_local_to_global2D(cl_device_id deviceID, cl_context context,
+                                      cl_command_queue queue, int num_elements)
+{
+    return test_copy2D_all_types(deviceID, context, queue,
+                                 async_local_to_global_kernel2D, false);
+}
diff --git a/test_conformance/basic/test_async_copy3D.cpp b/test_conformance/basic/test_async_copy3D.cpp
new file mode 100644
index 0000000..af10191
--- /dev/null
+++ b/test_conformance/basic/test_async_copy3D.cpp
@@ -0,0 +1,546 @@
+//
+// Copyright (c) 2017 The Khronos Group Inc.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//    http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+//
+#include "../../test_common/harness/compat.h"
+
+#include <algorithm>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <sys/stat.h>
+#include <sys/types.h>
+
+#include "../../test_common/harness/conversions.h"
+#include "procs.h"
+
+static const char *async_global_to_local_kernel3D =
+    "#pragma OPENCL EXTENSION cl_khr_extended_async_copies : enable\n"
+    "%s\n" // optional pragma string
+    "__kernel void test_fn( const __global %s *src, __global %s *dst, __local "
+    "%s *localBuffer, int numElementsPerLine, int numLines, int "
+    "planesCopiesPerWorkgroup, int planesCopiesPerWorkItem, int srcLineStride, "
+    "int dstLineStride, int srcPlaneStride, int dstPlaneStride )\n"
+    "{\n"
+    " int i, j, k;\n"
+    // Zero the local storage first
+    " for(i=0; i<planesCopiesPerWorkItem; i++)\n"
+    "   for(j=0; j<numLines; j++)\n"
+    "     for(k=0; k<numElementsPerLine; k++)\n"
+    "       localBuffer[ (get_local_id( 0 "
+    ")*planesCopiesPerWorkItem+i)*(numLines*numElementsPerLine + "
+    "numLines*dstLineStride + dstPlaneStride) + j*(numElementsPerLine + "
+    "dstLineStride) + k ] = (%s)(%s)0;\n"
+    // Do this to verify all kernels are done zeroing the local buffer before we
+    // try the copy
+    "    barrier( CLK_LOCAL_MEM_FENCE );\n"
+    "    event_t event;\n"
+    "    event = async_work_group_copy_3D3D( (__local %s*)localBuffer, "
+    "(__global const "
+    "%s*)(src+planesCopiesPerWorkgroup*get_group_id(0)*(numLines*"
+    "numElementsPerLine + numLines*srcLineStride + srcPlaneStride)), "
+    "(size_t)numElementsPerLine, (size_t)numLines, srcLineStride, "
+    "dstLineStride, planesCopiesPerWorkgroup, srcPlaneStride, dstPlaneStride, "
+    "0 );\n"
+    // Wait for the copy to complete, then verify by manually copying to the
+    // dest
+    " wait_group_events( 1, &event );\n"
+    " for(i=0; i<planesCopiesPerWorkItem; i++)\n"
+    "   for(j=0; j<numLines; j++)\n"
+    "     for(k=0; k<numElementsPerLine; k++)\n"
+    "       dst[ (get_global_id( 0 "
+    ")*planesCopiesPerWorkItem+i)*(numLines*numElementsPerLine + "
+    "numLines*dstLineStride + dstPlaneStride) + j*(numElementsPerLine + "
+    "dstLineStride) + k ] = localBuffer[ (get_local_id( 0 "
+    ")*planesCopiesPerWorkItem+i)*(numLines*numElementsPerLine + "
+    "numLines*dstLineStride + dstPlaneStride) + j*(numElementsPerLine + "
+    "dstLineStride) + k ];\n"
+    "}\n";
+
+static const char *async_local_to_global_kernel3D =
+    "#pragma OPENCL EXTENSION cl_khr_extended_async_copies : enable\n"
+    "%s\n" // optional pragma string
+    "__kernel void test_fn( const __global %s *src, __global %s *dst, __local "
+    "%s *localBuffer, int numElementsPerLine, int numLines, int "
+    "planesCopiesPerWorkgroup, int planesCopiesPerWorkItem, int srcLineStride, "
+    "int dstLineStride, int srcPlaneStride, int dstPlaneStride )\n"
+    "{\n"
+    " int i, j, k;\n"
+    // Zero the local storage first
+    " for(i=0; i<planesCopiesPerWorkItem; i++)\n"
+    "   for(j=0; j<numLines; j++)\n"
+    "     for(k=0; k<numElementsPerLine; k++)\n"
+    "       localBuffer[ (get_local_id( 0 "
+    ")*planesCopiesPerWorkItem+i)*(numLines*numElementsPerLine + "
+    "numLines*srcLineStride + srcPlaneStride) + j*(numElementsPerLine + "
+    "srcLineStride) + k ] = (%s)(%s)0;\n"
+    // Do this to verify all kernels are done zeroing the local buffer before we
+    // try the copy
+    "    barrier( CLK_LOCAL_MEM_FENCE );\n"
+    " for(i=0; i<planesCopiesPerWorkItem; i++)\n"
+    "   for(j=0; j<numLines; j++)\n"
+    "     for(k=0; k<numElementsPerLine; k++)\n"
+    "       localBuffer[ (get_local_id( 0 "
+    ")*planesCopiesPerWorkItem+i)*(numLines*numElementsPerLine + "
+    "numLines*srcLineStride + srcPlaneStride) + j*(numElementsPerLine + "
+    "srcLineStride) + k ] = src[ (get_global_id( 0 "
+    ")*planesCopiesPerWorkItem+i)*(numLines*numElementsPerLine + "
+    "numLines*srcLineStride + srcPlaneStride) + j*(numElementsPerLine + "
+    "srcLineStride) + k ];\n"
+    // Do this to verify all kernels are done copying to the local buffer before
+    // we try the copy
+    "    barrier( CLK_LOCAL_MEM_FENCE );\n"
+    "    event_t event;\n"
+    "    event = async_work_group_copy_3D3D((__global "
+    "%s*)(dst+planesCopiesPerWorkgroup*get_group_id(0)*(numLines*"
+    "numElementsPerLine + numLines*dstLineStride + dstPlaneStride)), (__local "
+    "const %s*)localBuffer, (size_t)numElementsPerLine, (size_t)numLines, "
+    "srcLineStride, dstLineStride, planesCopiesPerWorkgroup, srcPlaneStride, "
+    "dstPlaneStride, 0 );\n"
+    "    wait_group_events( 1, &event );\n"
+    "}\n";
+
+int test_copy3D(cl_device_id deviceID, cl_context context,
+                cl_command_queue queue, const char *kernelCode,
+                ExplicitType vecType, int vecSize, int srcLineStride,
+                int dstLineStride, int srcPlaneStride, int dstPlaneStride,
+                bool localIsDst)
+{
+    int error;
+    clProgramWrapper program;
+    clKernelWrapper kernel;
+    clMemWrapper streams[2];
+    size_t threads[1], localThreads[1];
+    void *inBuffer, *outBuffer, *outBufferCopy;
+    MTdata d;
+    char vecNameString[64];
+    vecNameString[0] = 0;
+    if (vecSize == 1)
+        sprintf(vecNameString, "%s", get_explicit_type_name(vecType));
+    else
+        sprintf(vecNameString, "%s%d", get_explicit_type_name(vecType),
+                vecSize);
+
+    size_t elementSize = get_explicit_type_size(vecType) * vecSize;
+    log_info("Testing %s with srcLineStride = %d, dstLineStride = %d, "
+             "srcPlaneStride = %d, dstPlaneStride = %d\n",
+             vecNameString, srcLineStride, dstLineStride, srcPlaneStride,
+             dstPlaneStride);
+
+    if (!is_extension_available(deviceID, "cl_khr_extended_async_copies"))
+    {
+        log_info(
+            "Device does not support extended async copies. Skipping test.\n");
+        return 0;
+    }
+
+    cl_long max_local_mem_size;
+    error =
+        clGetDeviceInfo(deviceID, CL_DEVICE_LOCAL_MEM_SIZE,
+                        sizeof(max_local_mem_size), &max_local_mem_size, NULL);
+    test_error(error, "clGetDeviceInfo for CL_DEVICE_LOCAL_MEM_SIZE failed.");
+
+    cl_long max_global_mem_size;
+    error = clGetDeviceInfo(deviceID, CL_DEVICE_GLOBAL_MEM_SIZE,
+                            sizeof(max_global_mem_size), &max_global_mem_size,
+                            NULL);
+    test_error(error, "clGetDeviceInfo for CL_DEVICE_GLOBAL_MEM_SIZE failed.");
+
+    cl_long max_alloc_size;
+    error = clGetDeviceInfo(deviceID, CL_DEVICE_MAX_MEM_ALLOC_SIZE,
+                            sizeof(max_alloc_size), &max_alloc_size, NULL);
+    test_error(error,
+               "clGetDeviceInfo for CL_DEVICE_MAX_MEM_ALLOC_SIZE failed.");
+
+    if (max_alloc_size > max_global_mem_size / 2)
+        max_alloc_size = max_global_mem_size / 2;
+
+    unsigned int num_of_compute_devices;
+    error = clGetDeviceInfo(deviceID, CL_DEVICE_MAX_COMPUTE_UNITS,
+                            sizeof(num_of_compute_devices),
+                            &num_of_compute_devices, NULL);
+    test_error(error,
+               "clGetDeviceInfo for CL_DEVICE_MAX_COMPUTE_UNITS failed.");
+
+    char programSource[4096];
+    programSource[0] = 0;
+    char *programPtr;
+
+    sprintf(programSource, kernelCode,
+            vecType == kDouble ? "#pragma OPENCL EXTENSION cl_khr_fp64 : enable"
+                               : "",
+            vecNameString, vecNameString, vecNameString, vecNameString,
+            get_explicit_type_name(vecType), vecNameString, vecNameString);
+    // log_info("program: %s\n", programSource);
+    programPtr = programSource;
+
+    error = create_single_kernel_helper(context, &program, &kernel, 1,
+                                        (const char **)&programPtr, "test_fn");
+    test_error(error, "Unable to create testing kernel");
+
+    size_t max_workgroup_size;
+    error = clGetKernelWorkGroupInfo(
+        kernel, deviceID, CL_KERNEL_WORK_GROUP_SIZE, sizeof(max_workgroup_size),
+        &max_workgroup_size, NULL);
+    test_error(
+        error,
+        "clGetKernelWorkGroupInfo failed for CL_KERNEL_WORK_GROUP_SIZE.");
+
+    size_t max_local_workgroup_size[3];
+    error = clGetDeviceInfo(deviceID, CL_DEVICE_MAX_WORK_ITEM_SIZES,
+                            sizeof(max_local_workgroup_size),
+                            max_local_workgroup_size, NULL);
+    test_error(error,
+               "clGetDeviceInfo failed for CL_DEVICE_MAX_WORK_ITEM_SIZES");
+
+    // Pick the minimum of the device and the kernel
+    if (max_workgroup_size > max_local_workgroup_size[0])
+        max_workgroup_size = max_local_workgroup_size[0];
+
+    size_t numElementsPerLine = 10;
+    size_t numLines = 13;
+    size_t planesCopiesPerWorkItem = 2;
+    elementSize =
+        get_explicit_type_size(vecType) * ((vecSize == 3) ? 4 : vecSize);
+    size_t localStorageSpacePerWorkitem = elementSize
+        * (planesCopiesPerWorkItem
+           * (numLines * numElementsPerLine
+              + numLines * (localIsDst ? dstLineStride : srcLineStride)
+              + (localIsDst ? dstPlaneStride : srcPlaneStride)));
+    size_t maxLocalWorkgroupSize =
+        (((int)max_local_mem_size / 2) / localStorageSpacePerWorkitem);
+
+    // Calculation can return 0 on embedded devices due to 1KB local mem limit
+    if (maxLocalWorkgroupSize == 0)
+    {
+        maxLocalWorkgroupSize = 1;
+    }
+
+    size_t localWorkgroupSize = maxLocalWorkgroupSize;
+    if (maxLocalWorkgroupSize > max_workgroup_size)
+        localWorkgroupSize = max_workgroup_size;
+
+    size_t maxTotalPlanesIn = ((max_alloc_size / elementSize) + srcPlaneStride)
+        / ((numLines * numElementsPerLine + numLines * srcLineStride)
+           + srcPlaneStride);
+    size_t maxTotalPlanesOut = ((max_alloc_size / elementSize) + dstPlaneStride)
+        / ((numLines * numElementsPerLine + numLines * dstLineStride)
+           + dstPlaneStride);
+    size_t maxTotalPlanes = (std::min)(maxTotalPlanesIn, maxTotalPlanesOut);
+    size_t maxLocalWorkgroups =
+        maxTotalPlanes / (localWorkgroupSize * planesCopiesPerWorkItem);
+
+    size_t localBufferSize = localWorkgroupSize * localStorageSpacePerWorkitem
+        - (localIsDst ? dstPlaneStride : srcPlaneStride);
+    size_t numberOfLocalWorkgroups = (std::min)(1111, (int)maxLocalWorkgroups);
+    size_t totalPlanes =
+        numberOfLocalWorkgroups * localWorkgroupSize * planesCopiesPerWorkItem;
+    size_t inBufferSize = elementSize
+        * (totalPlanes
+               * (numLines * numElementsPerLine + numLines * srcLineStride)
+           + (totalPlanes - 1) * srcPlaneStride);
+    size_t outBufferSize = elementSize
+        * (totalPlanes
+               * (numLines * numElementsPerLine + numLines * dstLineStride)
+           + (totalPlanes - 1) * dstPlaneStride);
+    size_t globalWorkgroupSize = numberOfLocalWorkgroups * localWorkgroupSize;
+
+    inBuffer = (void *)malloc(inBufferSize);
+    outBuffer = (void *)malloc(outBufferSize);
+    outBufferCopy = (void *)malloc(outBufferSize);
+
+    cl_int planesCopiesPerWorkItemInt, numElementsPerLineInt, numLinesInt,
+        planesCopiesPerWorkgroup;
+    planesCopiesPerWorkItemInt = (int)planesCopiesPerWorkItem;
+    numElementsPerLineInt = (int)numElementsPerLine;
+    numLinesInt = (int)numLines;
+    planesCopiesPerWorkgroup =
+        (int)(planesCopiesPerWorkItem * localWorkgroupSize);
+
+    log_info("Global: %d, local %d, local buffer %db, global in buffer %db, "
+             "global out buffer %db, each work group will copy %d planes and "
+             "each work item item will copy %d planes.\n",
+             (int)globalWorkgroupSize, (int)localWorkgroupSize,
+             (int)localBufferSize, (int)inBufferSize, (int)outBufferSize,
+             planesCopiesPerWorkgroup, planesCopiesPerWorkItemInt);
+
+    threads[0] = globalWorkgroupSize;
+    localThreads[0] = localWorkgroupSize;
+
+    d = init_genrand(gRandomSeed);
+    generate_random_data(
+        vecType, inBufferSize / get_explicit_type_size(vecType), d, inBuffer);
+    generate_random_data(
+        vecType, outBufferSize / get_explicit_type_size(vecType), d, outBuffer);
+    free_mtdata(d);
+    d = NULL;
+    memcpy(outBufferCopy, outBuffer, outBufferSize);
+
+    streams[0] = clCreateBuffer(context, CL_MEM_COPY_HOST_PTR, inBufferSize,
+                                inBuffer, &error);
+    test_error(error, "Unable to create input buffer");
+    streams[1] = clCreateBuffer(context, CL_MEM_COPY_HOST_PTR, outBufferSize,
+                                outBuffer, &error);
+    test_error(error, "Unable to create output buffer");
+
+    error = clSetKernelArg(kernel, 0, sizeof(streams[0]), &streams[0]);
+    test_error(error, "Unable to set kernel argument");
+    error = clSetKernelArg(kernel, 1, sizeof(streams[1]), &streams[1]);
+    test_error(error, "Unable to set kernel argument");
+    error = clSetKernelArg(kernel, 2, localBufferSize, NULL);
+    test_error(error, "Unable to set kernel argument");
+    error = clSetKernelArg(kernel, 3, sizeof(numElementsPerLineInt),
+                           &numElementsPerLineInt);
+    test_error(error, "Unable to set kernel argument");
+    error = clSetKernelArg(kernel, 4, sizeof(numLinesInt), &numLinesInt);
+    test_error(error, "Unable to set kernel argument");
+    error = clSetKernelArg(kernel, 5, sizeof(planesCopiesPerWorkgroup),
+                           &planesCopiesPerWorkgroup);
+    test_error(error, "Unable to set kernel argument");
+    error = clSetKernelArg(kernel, 6, sizeof(planesCopiesPerWorkItemInt),
+                           &planesCopiesPerWorkItemInt);
+    test_error(error, "Unable to set kernel argument");
+    error = clSetKernelArg(kernel, 7, sizeof(srcLineStride), &srcLineStride);
+    test_error(error, "Unable to set kernel argument");
+    error = clSetKernelArg(kernel, 8, sizeof(dstLineStride), &dstLineStride);
+    test_error(error, "Unable to set kernel argument");
+    error = clSetKernelArg(kernel, 9, sizeof(srcPlaneStride), &srcPlaneStride);
+    test_error(error, "Unable to set kernel argument");
+    error = clSetKernelArg(kernel, 10, sizeof(dstPlaneStride), &dstPlaneStride);
+    test_error(error, "Unable to set kernel argument");
+
+    // Enqueue
+    error = clEnqueueNDRangeKernel(queue, kernel, 1, NULL, threads,
+                                   localThreads, 0, NULL, NULL);
+    test_error(error, "Unable to queue kernel");
+
+    // Read
+    error = clEnqueueReadBuffer(queue, streams[1], CL_TRUE, 0, outBufferSize,
+                                outBuffer, 0, NULL, NULL);
+    test_error(error, "Unable to read results");
+
+    // Verify
+    int failuresPrinted = 0;
+    // Verify
+    size_t typeSize = get_explicit_type_size(vecType) * vecSize;
+    for (int i = 0;
+         i < (int)globalWorkgroupSize * planesCopiesPerWorkItem * elementSize;
+         i += elementSize)
+    {
+        for (int j = 0; j < (int)numLines * elementSize; j += elementSize)
+        {
+            for (int k = 0; k < (int)numElementsPerLine * elementSize;
+                 k += elementSize)
+            {
+                int inIdx = i
+                        * (numLines * numElementsPerLine
+                           + numLines * srcLineStride + srcPlaneStride)
+                    + j * (numElementsPerLine + srcLineStride) + k;
+                int outIdx = i
+                        * (numLines * numElementsPerLine
+                           + numLines * dstLineStride + dstPlaneStride)
+                    + j * (numElementsPerLine + dstLineStride) + k;
+                if (memcmp(((char *)inBuffer) + inIdx,
+                           ((char *)outBuffer) + outIdx, typeSize)
+                    != 0)
+                {
+                    unsigned char *inchar = (unsigned char *)inBuffer + inIdx;
+                    unsigned char *outchar =
+                        (unsigned char *)outBuffer + outIdx;
+                    char values[4096];
+                    values[0] = 0;
+
+                    if (failuresPrinted == 0)
+                    {
+                        // Print first failure message
+                        log_error("ERROR: Results of copy did not validate!");
+                    }
+                    sprintf(values + strlen(values), "%d -> [", inIdx);
+                    for (int l = 0; l < (int)elementSize; l++)
+                        sprintf(values + strlen(values), "%2x ", inchar[l]);
+                    sprintf(values + strlen(values), "] != [");
+                    for (int l = 0; l < (int)elementSize; l++)
+                        sprintf(values + strlen(values), "%2x ", outchar[l]);
+                    sprintf(values + strlen(values), "]");
+                    log_error("%s\n", values);
+                    failuresPrinted++;
+                }
+
+                if (failuresPrinted > 5)
+                {
+                    log_error("Not printing further failures...\n");
+                    return -1;
+                }
+            }
+            if (j < (int)numLines * elementSize)
+            {
+                int outIdx = i
+                        * (numLines * numElementsPerLine
+                           + numLines * dstLineStride + dstPlaneStride)
+                    + j * (numElementsPerLine + dstLineStride)
+                    + numElementsPerLine * elementSize;
+                if (memcmp(((char *)outBuffer) + outIdx,
+                           ((char *)outBufferCopy) + outIdx,
+                           dstLineStride * elementSize)
+                    != 0)
+                {
+                    if (failuresPrinted == 0)
+                    {
+                        // Print first failure message
+                        log_error("ERROR: Results of copy did not validate!\n");
+                    }
+                    log_error(
+                        "3D copy corrupted data in output buffer in the line "
+                        "stride offset of plane %d line %d\n",
+                        i, j);
+                    failuresPrinted++;
+                }
+                if (failuresPrinted > 5)
+                {
+                    log_error("Not printing further failures...\n");
+                    return -1;
+                }
+            }
+        }
+        if (i < (int)(globalWorkgroupSize * planesCopiesPerWorkItem - 1)
+                * elementSize)
+        {
+            int outIdx = i
+                    * (numLines * numElementsPerLine + numLines * dstLineStride
+                       + dstPlaneStride)
+                + (numLines * elementSize) * (numElementsPerLine)
+                + (numLines * elementSize) * (dstLineStride);
+            if (memcmp(((char *)outBuffer) + outIdx,
+                       ((char *)outBufferCopy) + outIdx,
+                       dstPlaneStride * elementSize)
+                != 0)
+            {
+                if (failuresPrinted == 0)
+                {
+                    // Print first failure message
+                    log_error("ERROR: Results of copy did not validate!\n");
+                }
+                log_error("3D copy corrupted data in output buffer in the "
+                          "plane stride "
+                          "offset of plane %d\n",
+                          i);
+                failuresPrinted++;
+            }
+            if (failuresPrinted > 5)
+            {
+                log_error("Not printing further failures...\n");
+                return -1;
+            }
+        }
+    }
+
+    free(inBuffer);
+    free(outBuffer);
+    free(outBufferCopy);
+
+    return failuresPrinted ? -1 : 0;
+}
+
+int test_copy3D_all_types(cl_device_id deviceID, cl_context context,
+                          cl_command_queue queue, const char *kernelCode,
+                          bool localIsDst)
+{
+    ExplicitType vecType[] = {
+        kChar,  kUChar, kShort,  kUShort,          kInt, kUInt, kLong,
+        kULong, kFloat, kDouble, kNumExplicitTypes
+    };
+    unsigned int vecSizes[] = { 1, 2, 3, 4, 8, 16, 0 };
+    unsigned int smallTypesStrideSizes[] = { 0, 10, 100 };
+    unsigned int size, typeIndex, srcLineStride, dstLineStride, srcPlaneStride,
+        dstPlaneStride;
+
+    int errors = 0;
+
+    for (typeIndex = 0; vecType[typeIndex] != kNumExplicitTypes; typeIndex++)
+    {
+        if (vecType[typeIndex] == kDouble
+            && !is_extension_available(deviceID, "cl_khr_fp64"))
+            continue;
+
+        if ((vecType[typeIndex] == kLong || vecType[typeIndex] == kULong)
+            && !gHasLong)
+            continue;
+
+        for (size = 0; vecSizes[size] != 0; size++)
+        {
+            if (get_explicit_type_size(vecType[typeIndex]) * vecSizes[size]
+                <= 2) // small type
+            {
+                for (srcLineStride = 0;
+                     srcLineStride < sizeof(smallTypesStrideSizes)
+                         / sizeof(smallTypesStrideSizes[0]);
+                     srcLineStride++)
+                {
+                    for (dstLineStride = 0;
+                         dstLineStride < sizeof(smallTypesStrideSizes)
+                             / sizeof(smallTypesStrideSizes[0]);
+                         dstLineStride++)
+                    {
+                        for (srcPlaneStride = 0;
+                             srcPlaneStride < sizeof(smallTypesStrideSizes)
+                                 / sizeof(smallTypesStrideSizes[0]);
+                             srcPlaneStride++)
+                        {
+                            for (dstPlaneStride = 0;
+                                 dstPlaneStride < sizeof(smallTypesStrideSizes)
+                                     / sizeof(smallTypesStrideSizes[0]);
+                                 dstPlaneStride++)
+                            {
+                                if (test_copy3D(
+                                        deviceID, context, queue, kernelCode,
+                                        vecType[typeIndex], vecSizes[size],
+                                        smallTypesStrideSizes[srcLineStride],
+                                        smallTypesStrideSizes[dstLineStride],
+                                        smallTypesStrideSizes[srcPlaneStride],
+                                        smallTypesStrideSizes[dstPlaneStride],
+                                        localIsDst))
+                                {
+                                    errors++;
+                                }
+                            }
+                        }
+                    }
+                }
+            }
+            // not a small type, check only zero stride
+            else if (test_copy3D(deviceID, context, queue, kernelCode,
+                                 vecType[typeIndex], vecSizes[size], 0, 0, 0, 0,
+                                 localIsDst))
+            {
+                errors++;
+            }
+        }
+    }
+    if (errors) return -1;
+    return 0;
+}
+
+int test_async_copy_global_to_local3D(cl_device_id deviceID, cl_context context,
+                                      cl_command_queue queue, int num_elements)
+{
+    return test_copy3D_all_types(deviceID, context, queue,
+                                 async_global_to_local_kernel3D, true);
+}
+
+int test_async_copy_local_to_global3D(cl_device_id deviceID, cl_context context,
+                                      cl_command_queue queue, int num_elements)
+{
+    return test_copy3D_all_types(deviceID, context, queue,
+                                 async_local_to_global_kernel3D, false);
+}
diff --git a/test_conformance/basic/test_async_copy_fence.cpp b/test_conformance/basic/test_async_copy_fence.cpp
new file mode 100644
index 0000000..74f6e40
--- /dev/null
+++ b/test_conformance/basic/test_async_copy_fence.cpp
@@ -0,0 +1,812 @@
+//
+// Copyright (c) 2017 The Khronos Group Inc.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//    http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+//
+#include "../../test_common/harness/compat.h"
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <sys/stat.h>
+#include <sys/types.h>
+
+#include "../../test_common/harness/conversions.h"
+#include "procs.h"
+
+static const char *import_after_export_aliased_local_kernel =
+    "#pragma OPENCL EXTENSION cl_khr_async_work_group_copy_fence : enable\n"
+    "%s\n" // optional pragma string
+    "__kernel void test_fn( const __global %s *exportSrc, __global %s "
+    "*exportDst,\n"
+    "                       const __global %s *importSrc, __global %s "
+    "*importDst,\n"
+    "                       __local %s *localBuffer, /* there isn't another "
+    "__local %s local buffer since export src and import dst are aliased*/\n"
+    "                       int exportSrcLocalSize, int "
+    "exportCopiesPerWorkItem,\n"
+    "                       int importSrcLocalSize, int "
+    "importCopiesPerWorkItem )\n"
+    "{\n"
+    "    int i;\n"
+    "    int localImportOffset = exportSrcLocalSize - importSrcLocalSize;\n"
+    // Zero the local storage first
+    "    for(i=0; i<exportCopiesPerWorkItem; i++) {\n"
+    "        localBuffer[ get_local_id( 0 )*exportCopiesPerWorkItem+i ] = "
+    "(%s)(%s)0;\n"
+    "    }\n"
+    "    // no need to set another local buffer values to (%s)(%s)0 since "
+    "export src and import dst are aliased (use the same buffer)\n"
+    // Do this to verify all kernels are done zeroing the local buffer before we
+    // try the export and import
+    "    barrier( CLK_LOCAL_MEM_FENCE );\n"
+    "    for(i=0; i<exportCopiesPerWorkItem; i++) {\n"
+    "        localBuffer[ get_local_id( 0 )*exportCopiesPerWorkItem+i ] = "
+    "exportSrc[ get_global_id( 0 )*exportCopiesPerWorkItem+i ];\n"
+    "    }\n"
+    // Do this to verify all kernels are done copying to the local buffer before
+    // we try the export and import
+    "    barrier( CLK_LOCAL_MEM_FENCE );\n"
+    "    event_t events;\n"
+    "    events = async_work_group_copy((__global "
+    "%s*)(exportDst+exportSrcLocalSize*get_group_id(0)), (__local const "
+    "%s*)localBuffer, (size_t)exportSrcLocalSize, 0 );\n"
+    "    async_work_group_copy_fence( CLK_LOCAL_MEM_FENCE );\n"
+    "    events = async_work_group_copy( (__local "
+    "%s*)(localBuffer+localImportOffset), (__global const "
+    "%s*)(importSrc+importSrcLocalSize*get_group_id(0)), "
+    "(size_t)importSrcLocalSize, events );\n"
+    // Wait for the export and import to complete, then verify by manually
+    // copying to the dest
+    "    wait_group_events( 2, &events );\n"
+    "    for(i=0; i<importCopiesPerWorkItem; i++) {\n"
+    "        importDst[ get_global_id( 0 )*importCopiesPerWorkItem+i ] = "
+    "(localBuffer+localImportOffset)[ get_local_id( 0 "
+    ")*importCopiesPerWorkItem+i ];\n"
+    "    }\n"
+    "}\n";
+
+static const char *import_after_export_aliased_global_kernel =
+    "#pragma OPENCL EXTENSION cl_khr_async_work_group_copy_fence : enable\n"
+    "%s\n" // optional pragma string
+    "__kernel void test_fn( const __global %s *exportSrc, __global %s "
+    "*exportDstImportSrc,\n"
+    "                       __global %s *importDst, /* there isn't a dedicated "
+    "__global %s buffer for import src since export dst and import src are "
+    "aliased*/\n"
+    "                       __local %s *exportLocalBuffer, __local %s "
+    "*importLocalBuffer,\n"
+    "                       int exportSrcLocalSize, int "
+    "exportCopiesPerWorkItem,\n"
+    "                       int importSrcLocalSize, int "
+    "importCopiesPerWorkItem )\n"
+    "{\n"
+    "    int i;\n"
+    // Zero the local storage first
+    "    for(i=0; i<exportCopiesPerWorkItem; i++) {\n"
+    "        exportLocalBuffer[ get_local_id( 0 )*exportCopiesPerWorkItem+i ] "
+    "= (%s)(%s)0;\n"
+    "    }\n"
+    "    for(i=0; i<importCopiesPerWorkItem; i++) {\n"
+    "        importLocalBuffer[ get_local_id( 0 )*importCopiesPerWorkItem+i ] "
+    "= (%s)(%s)0;\n"
+    "    }\n"
+    // Do this to verify all kernels are done zeroing the local buffer before we
+    // try the export and import
+    "    barrier( CLK_LOCAL_MEM_FENCE );\n"
+    "    for(i=0; i<exportCopiesPerWorkItem; i++) {\n"
+    "        exportLocalBuffer[ get_local_id( 0 )*exportCopiesPerWorkItem+i ] "
+    "= exportSrc[ get_global_id( 0 )*exportCopiesPerWorkItem+i ];\n"
+    "    }\n"
+    // Do this to verify all kernels are done copying to the local buffer before
+    // we try the export and import
+    "    barrier( CLK_LOCAL_MEM_FENCE );\n"
+    "    event_t events;\n"
+    "    events = async_work_group_copy((__global "
+    "%s*)(exportDstImportSrc+exportSrcLocalSize*get_group_id(0)), (__local "
+    "const %s*)exportLocalBuffer, (size_t)exportSrcLocalSize, 0 );\n"
+    "    async_work_group_copy_fence( CLK_GLOBAL_MEM_FENCE );\n"
+    "    events = async_work_group_copy( (__local %s*)importLocalBuffer, "
+    "(__global const "
+    "%s*)(exportDstImportSrc+exportSrcLocalSize*get_group_id(0) + "
+    "(exportSrcLocalSize - importSrcLocalSize)), (size_t)importSrcLocalSize, "
+    "events );\n"
+    // Wait for the export and import to complete, then verify by manually
+    // copying to the dest
+    "    wait_group_events( 2, &events );\n"
+    "    for(i=0; i<importCopiesPerWorkItem; i++) {\n"
+    "        importDst[ get_global_id( 0 )*importCopiesPerWorkItem+i ] = "
+    "importLocalBuffer[ get_local_id( 0 )*importCopiesPerWorkItem+i ];\n"
+    "    }\n"
+    "}\n";
+
+static const char *import_after_export_aliased_global_and_local_kernel =
+    "#pragma OPENCL EXTENSION cl_khr_async_work_group_copy_fence : enable\n"
+    "%s\n" // optional pragma string
+    "__kernel void test_fn( const __global %s *exportSrc, __global %s "
+    "*exportDstImportSrc,\n"
+    "                       __global %s *importDst, /* there isn't a dedicated "
+    "__global %s buffer for import src since export dst and import src are "
+    "aliased*/\n"
+    "                       __local %s *localBuffer, /* there isn't another "
+    "__local %s local buffer since export src and import dst are aliased*/\n"
+    "                       int exportSrcLocalSize, int "
+    "exportCopiesPerWorkItem,\n"
+    "                       int importSrcLocalSize, int "
+    "importCopiesPerWorkItem )\n"
+    "{\n"
+    "    int i;\n"
+    "    int localImportOffset = exportSrcLocalSize - importSrcLocalSize;\n"
+    // Zero the local storage first
+    "    for(i=0; i<exportCopiesPerWorkItem; i++) {\n"
+    "        localBuffer[ get_local_id( 0 )*exportCopiesPerWorkItem+i ] = "
+    "(%s)(%s)0;\n"
+    "    }\n"
+    "    // no need to set another local buffer values to (%s)(%s)0 since "
+    "export src and import dst are aliased (use the same buffer)\n"
+    // Do this to verify all kernels are done zeroing the local buffer before we
+    // try the export and import
+    "    barrier( CLK_LOCAL_MEM_FENCE );\n"
+    "    for(i=0; i<exportCopiesPerWorkItem; i++) {\n"
+    "        localBuffer[ get_local_id( 0 )*exportCopiesPerWorkItem+i ] = "
+    "exportSrc[ get_global_id( 0 )*exportCopiesPerWorkItem+i ];\n"
+    "    }\n"
+    // Do this to verify all kernels are done copying to the local buffer before
+    // we try the export and import
+    "    barrier( CLK_LOCAL_MEM_FENCE );\n"
+    "    event_t events;\n"
+    "    events = async_work_group_copy((__global "
+    "%s*)(exportDstImportSrc+exportSrcLocalSize*get_group_id(0)), (__local "
+    "const %s*)localBuffer, (size_t)exportSrcLocalSize, 0 );\n"
+    "    async_work_group_copy_fence( CLK_GLOBAL_MEM_FENCE | "
+    "CLK_LOCAL_MEM_FENCE );\n"
+    "    events = async_work_group_copy( (__local "
+    "%s*)(localBuffer+localImportOffset), (__global const "
+    "%s*)(exportDstImportSrc+exportSrcLocalSize*get_group_id(0) + "
+    "(exportSrcLocalSize - importSrcLocalSize)), (size_t)importSrcLocalSize, "
+    "events );\n"
+    // Wait for the export and import to complete, then verify by manually
+    // copying to the dest
+    "    wait_group_events( 2, &events );\n"
+    "    for(i=0; i<importCopiesPerWorkItem; i++) {\n"
+    "        importDst[ get_global_id( 0 )*importCopiesPerWorkItem+i ] = "
+    "(localBuffer+localImportOffset)[ get_local_id( 0 "
+    ")*importCopiesPerWorkItem+i ];\n"
+    "    }\n"
+    "}\n";
+
+static const char *export_after_import_aliased_local_kernel =
+    "#pragma OPENCL EXTENSION cl_khr_async_work_group_copy_fence : enable\n"
+    "%s\n" // optional pragma string
+    "__kernel void test_fn( const __global %s *importSrc, __global %s "
+    "*importDst,\n"
+    "                       const __global %s *exportDst, /* there isn't a "
+    "dedicated __global %s buffer for export src since the local memory is "
+    "aliased, so the export src is taken from it */\n"
+    "                       __local %s *localBuffer, /* there isn't another "
+    "__local %s local buffer since import dst and export src are aliased*/\n"
+    "                       int importSrcLocalSize, int "
+    "importCopiesPerWorkItem,\n"
+    "                       int exportSrcLocalSize, int "
+    "exportCopiesPerWorkItem )\n"
+    "{\n"
+    "    int i;\n"
+    // Zero the local storage first
+    "    for(i=0; i<importCopiesPerWorkItem; i++) {\n"
+    "        localBuffer[ get_local_id( 0 )*importCopiesPerWorkItem+i ] = "
+    "(%s)(%s)0;\n"
+    "    }\n"
+    "    // no need to set another local buffer values to (%s)(%s)0 since "
+    "import dst and export src are aliased (use the same buffer)\n"
+    // Do this to verify all kernels are done zeroing the local buffer before we
+    // try the import and export
+    "    barrier( CLK_LOCAL_MEM_FENCE );\n"
+    "    event_t events;\n"
+    "    events = async_work_group_copy( (__local %s*)localBuffer, (__global "
+    "const %s*)(importSrc+importSrcLocalSize*get_group_id(0)), "
+    "(size_t)importSrcLocalSize, events );\n"
+    "    async_work_group_copy_fence( CLK_LOCAL_MEM_FENCE );\n"
+    "    events = async_work_group_copy((__global "
+    "%s*)(exportDst+exportSrcLocalSize*get_group_id(0)), (__local const "
+    "%s*)(localBuffer + (importSrcLocalSize - exportSrcLocalSize)), "
+    "(size_t)exportSrcLocalSize, 0 );\n"
+    // Wait for the import and export to complete, then verify by manually
+    // copying to the dest
+    "    wait_group_events( 2, &events );\n"
+    "    for(i=0; i<importCopiesPerWorkItem; i++) {\n"
+    "        importDst[ get_global_id( 0 )*importCopiesPerWorkItem+i ] = "
+    "localBuffer[ get_local_id( 0 )*importCopiesPerWorkItem+i ];\n"
+    "    }\n"
+    "}\n";
+
+static const char *export_after_import_aliased_global_kernel =
+    "#pragma OPENCL EXTENSION cl_khr_async_work_group_copy_fence : enable\n"
+    "%s\n" // optional pragma string
+    "__kernel void test_fn( const __global %s *importSrcExportDst, __global %s "
+    "*importDst,\n"
+    "                       const __global %s *exportSrc,\n"
+    "                       /* there isn't a dedicated __global %s buffer for "
+    "export dst since import src and export dst are aliased */\n"
+    "                       __local %s *importLocalBuffer, __local %s "
+    "*exportLocalBuffer,\n"
+    "                       int importSrcLocalSize, int "
+    "importCopiesPerWorkItem,\n"
+    "                       int exportSrcLocalSize, int "
+    "exportCopiesPerWorkItem )\n"
+    "{\n"
+    "    int i;\n"
+    // Zero the local storage first
+    "    for(i=0; i<importCopiesPerWorkItem; i++) {\n"
+    "        importLocalBuffer[ get_local_id( 0 )*importCopiesPerWorkItem+i ] "
+    "= (%s)(%s)0;\n"
+    "    }\n"
+    "    for(i=0; i<exportCopiesPerWorkItem; i++) {\n"
+    "        exportLocalBuffer[ get_local_id( 0 )*exportCopiesPerWorkItem+i ] "
+    "= (%s)(%s)0;\n"
+    "    }\n"
+    // Do this to verify all kernels are done zeroing the local buffer before we
+    // try the import and export
+    "    barrier( CLK_LOCAL_MEM_FENCE );\n"
+    "    for(i=0; i<exportCopiesPerWorkItem; i++) {\n"
+    "        exportLocalBuffer[ get_local_id( 0 )*exportCopiesPerWorkItem+i ] "
+    "= exportSrc[ get_global_id( 0 )*exportCopiesPerWorkItem+i ];\n"
+    "    }\n"
+    // Do this to verify all kernels are done copying to the local buffer before
+    // we try the import and export
+    "    barrier( CLK_LOCAL_MEM_FENCE );\n"
+    "    event_t events;\n"
+    "    events = async_work_group_copy( (__local %s*)importLocalBuffer, "
+    "(__global const "
+    "%s*)(importSrcExportDst+importSrcLocalSize*get_group_id(0)), "
+    "(size_t)importSrcLocalSize, 0 );\n"
+    "    async_work_group_copy_fence( CLK_GLOBAL_MEM_FENCE );\n"
+    "    events = async_work_group_copy((__global "
+    "%s*)(importSrcExportDst+importSrcLocalSize*get_group_id(0) + "
+    "(importSrcLocalSize - exportSrcLocalSize)), (__local const "
+    "%s*)exportLocalBuffer, (size_t)exportSrcLocalSize, events );\n"
+    // Wait for the import and export to complete, then verify by manually
+    // copying to the dest
+    "    wait_group_events( 2, &events );\n"
+    "    for(i=0; i<importCopiesPerWorkItem; i++) {\n"
+    "        importDst[ get_global_id( 0 )*importCopiesPerWorkItem+i ] = "
+    "importLocalBuffer[ get_local_id( 0 )*importCopiesPerWorkItem+i ];\n"
+    "    }\n"
+    "}\n";
+
+static const char *export_after_import_aliased_global_and_local_kernel =
+    "#pragma OPENCL EXTENSION cl_khr_async_work_group_copy_fence : enable\n"
+    "%s\n" // optional pragma string
+    "__kernel void test_fn( const __global %s *importSrcExportDst, __global %s "
+    "*importDst,\n"
+    "                       /* there isn't a dedicated __global %s buffer for "
+    "export src since the local memory is aliased, so the export src is taken "
+    "from it */\n"
+    "                       /* there isn't a dedicated __global %s buffer for "
+    "export dst since import src and export dst are aliased */\n"
+    "                       __local %s *localBuffer, /* there isn't another "
+    "__local %s local buffer since import dst and export src are aliased*/\n"
+    "                       int importSrcLocalSize, int "
+    "importCopiesPerWorkItem,\n"
+    "                       int exportSrcLocalSize, int "
+    "exportCopiesPerWorkItem )\n"
+    "{\n"
+    "    int i;\n"
+    // Zero the local storage first
+    "    for(i=0; i<importCopiesPerWorkItem; i++) {\n"
+    "        localBuffer[ get_local_id( 0 )*importCopiesPerWorkItem+i ] = "
+    "(%s)(%s)0;\n"
+    "    }\n"
+    "    // no need to set another local buffer values to (%s)(%s)0 since "
+    "import dst and export src are aliased (use the same buffer)\n"
+    // Do this to verify all kernels are done zeroing the local buffer before we
+    // try the import and export
+    "    barrier( CLK_LOCAL_MEM_FENCE );\n"
+    "    event_t events;\n"
+    "    events = async_work_group_copy( (__local %s*)localBuffer, (__global "
+    "const %s*)(importSrcExportDst+importSrcLocalSize*get_group_id(0)), "
+    "(size_t)importSrcLocalSize, 0 );\n"
+    "    async_work_group_copy_fence( CLK_GLOBAL_MEM_FENCE | "
+    "CLK_LOCAL_MEM_FENCE );\n"
+    "    events = async_work_group_copy((__global "
+    "%s*)(importSrcExportDst+importSrcLocalSize*get_group_id(0) + "
+    "(importSrcLocalSize - exportSrcLocalSize)), (__local const "
+    "%s*)(localBuffer + (importSrcLocalSize - exportSrcLocalSize)), "
+    "(size_t)exportSrcLocalSize, events );\n"
+    // Wait for the import and export to complete, then verify by manually
+    // copying to the dest
+    "    wait_group_events( 2, &events );\n"
+    "    for(i=0; i<importCopiesPerWorkItem; i++) {\n"
+    "        importDst[ get_global_id( 0 )*importCopiesPerWorkItem+i ] = "
+    "localBuffer[ get_local_id( 0 )*importCopiesPerWorkItem+i ];\n"
+    "    }\n"
+    "}\n";
+
+int test_copy_fence(cl_device_id deviceID, cl_context context,
+                    cl_command_queue queue, const char *kernelCode,
+                    ExplicitType vecType, int vecSize, bool export_after_import,
+                    bool aliased_local_mem, bool aliased_global_mem)
+{
+    int error;
+    clProgramWrapper program;
+    clKernelWrapper kernel;
+    clMemWrapper streams[4];
+    size_t threads[1], localThreads[1];
+    void *transaction1InBuffer, *transaction1OutBuffer, *transaction2InBuffer,
+        *transaction2OutBuffer;
+    MTdata d;
+    bool transaction1DstIsTransaction2Src =
+        (aliased_global_mem && !export_after_import)
+        || (aliased_local_mem && export_after_import);
+    bool transaction1SrcIsTransaction2Dst =
+        aliased_global_mem && export_after_import;
+    char vecNameString[64];
+    vecNameString[0] = 0;
+    if (vecSize == 1)
+        sprintf(vecNameString, "%s", get_explicit_type_name(vecType));
+    else
+        sprintf(vecNameString, "%s%d", get_explicit_type_name(vecType),
+                vecSize);
+
+    size_t elementSize = get_explicit_type_size(vecType) * vecSize;
+    log_info("Testing %s\n", vecNameString);
+
+    if (!is_extension_available(deviceID, "cl_khr_async_work_group_copy_fence"))
+    {
+        log_info(
+            "Device does not support extended async copies fence. Skipping "
+            "test.\n");
+        return 0;
+    }
+
+    cl_long max_local_mem_size;
+    error =
+        clGetDeviceInfo(deviceID, CL_DEVICE_LOCAL_MEM_SIZE,
+                        sizeof(max_local_mem_size), &max_local_mem_size, NULL);
+    test_error(error, "clGetDeviceInfo for CL_DEVICE_LOCAL_MEM_SIZE failed.");
+
+    unsigned int num_of_compute_devices;
+    error = clGetDeviceInfo(deviceID, CL_DEVICE_MAX_COMPUTE_UNITS,
+                            sizeof(num_of_compute_devices),
+                            &num_of_compute_devices, NULL);
+    test_error(error,
+               "clGetDeviceInfo for CL_DEVICE_MAX_COMPUTE_UNITS failed.");
+
+    char programSource[4096];
+    programSource[0] = 0;
+    char *programPtr;
+
+    sprintf(programSource, kernelCode,
+            vecType == kDouble ? "#pragma OPENCL EXTENSION cl_khr_fp64 : enable"
+                               : "",
+            vecNameString, vecNameString, vecNameString, vecNameString,
+            vecNameString, vecNameString, vecNameString,
+            get_explicit_type_name(vecType), vecNameString,
+            get_explicit_type_name(vecType), vecNameString, vecNameString,
+            vecNameString, vecNameString);
+    // log_info("program: %s\n", programSource);
+    programPtr = programSource;
+
+    error = create_single_kernel_helper(context, &program, &kernel, 1,
+                                        (const char **)&programPtr, "test_fn");
+    test_error(error, "Unable to create testing kernel");
+
+    size_t max_workgroup_size;
+    error = clGetKernelWorkGroupInfo(
+        kernel, deviceID, CL_KERNEL_WORK_GROUP_SIZE, sizeof(max_workgroup_size),
+        &max_workgroup_size, NULL);
+    test_error(
+        error,
+        "clGetKernelWorkGroupInfo failed for CL_KERNEL_WORK_GROUP_SIZE.");
+
+    size_t max_local_workgroup_size[3];
+    error = clGetDeviceInfo(deviceID, CL_DEVICE_MAX_WORK_ITEM_SIZES,
+                            sizeof(max_local_workgroup_size),
+                            max_local_workgroup_size, NULL);
+    test_error(error,
+               "clGetDeviceInfo failed for CL_DEVICE_MAX_WORK_ITEM_SIZES");
+
+    // Pick the minimum of the device and the kernel
+    if (max_workgroup_size > max_local_workgroup_size[0])
+        max_workgroup_size = max_local_workgroup_size[0];
+
+    size_t transaction1NumberOfCopiesPerWorkitem = 13;
+    size_t transaction2NumberOfCopiesPerWorkitem = 2;
+    elementSize =
+        get_explicit_type_size(vecType) * ((vecSize == 3) ? 4 : vecSize);
+    size_t localStorageSpacePerWorkitem =
+        transaction1NumberOfCopiesPerWorkitem * elementSize
+        + (aliased_local_mem
+               ? 0
+               : transaction2NumberOfCopiesPerWorkitem * elementSize);
+    size_t maxLocalWorkgroupSize =
+        (((int)max_local_mem_size / 2) / localStorageSpacePerWorkitem);
+
+    // Calculation can return 0 on embedded devices due to 1KB local mem limit
+    if (maxLocalWorkgroupSize == 0)
+    {
+        maxLocalWorkgroupSize = 1;
+    }
+
+    size_t localWorkgroupSize = maxLocalWorkgroupSize;
+    if (maxLocalWorkgroupSize > max_workgroup_size)
+        localWorkgroupSize = max_workgroup_size;
+
+    size_t transaction1LocalBufferSize = localWorkgroupSize * elementSize
+        * transaction1NumberOfCopiesPerWorkitem;
+    size_t transaction2LocalBufferSize = localWorkgroupSize * elementSize
+        * transaction2NumberOfCopiesPerWorkitem; // irrelevant if
+                                                 // aliased_local_mem
+    size_t numberOfLocalWorkgroups = 1111;
+    size_t transaction1GlobalBufferSize =
+        numberOfLocalWorkgroups * transaction1LocalBufferSize;
+    size_t transaction2GlobalBufferSize =
+        numberOfLocalWorkgroups * transaction2LocalBufferSize;
+    size_t globalWorkgroupSize = numberOfLocalWorkgroups * localWorkgroupSize;
+
+    transaction1InBuffer = (void *)malloc(transaction1GlobalBufferSize);
+    transaction1OutBuffer = (void *)malloc(transaction1GlobalBufferSize);
+    transaction2InBuffer = (void *)malloc(transaction2GlobalBufferSize);
+    transaction2OutBuffer = (void *)malloc(transaction2GlobalBufferSize);
+    memset(transaction1OutBuffer, 0, transaction1GlobalBufferSize);
+    memset(transaction2OutBuffer, 0, transaction2GlobalBufferSize);
+
+    cl_int transaction1CopiesPerWorkitemInt, transaction1CopiesPerWorkgroup,
+        transaction2CopiesPerWorkitemInt, transaction2CopiesPerWorkgroup;
+    transaction1CopiesPerWorkitemInt =
+        (int)transaction1NumberOfCopiesPerWorkitem;
+    transaction1CopiesPerWorkgroup =
+        (int)(transaction1NumberOfCopiesPerWorkitem * localWorkgroupSize);
+    transaction2CopiesPerWorkitemInt =
+        (int)transaction2NumberOfCopiesPerWorkitem;
+    transaction2CopiesPerWorkgroup =
+        (int)(transaction2NumberOfCopiesPerWorkitem * localWorkgroupSize);
+
+    log_info(
+        "Global: %d, local %d. 1st Transaction: local buffer %db, global "
+        "buffer %db, each work group will copy %d elements and each work "
+        "item item will copy %d elements. 2nd Transaction: local buffer "
+        "%db, global buffer %db, each work group will copy %d elements and "
+        "each work item will copy %d elements\n",
+        (int)globalWorkgroupSize, (int)localWorkgroupSize,
+        (int)transaction1LocalBufferSize, (int)transaction1GlobalBufferSize,
+        transaction1CopiesPerWorkgroup, transaction1CopiesPerWorkitemInt,
+        (int)transaction2LocalBufferSize, (int)transaction2GlobalBufferSize,
+        transaction2CopiesPerWorkgroup, transaction2CopiesPerWorkitemInt);
+
+    threads[0] = globalWorkgroupSize;
+    localThreads[0] = localWorkgroupSize;
+
+    d = init_genrand(gRandomSeed);
+    generate_random_data(
+        vecType, transaction1GlobalBufferSize / get_explicit_type_size(vecType),
+        d, transaction1InBuffer);
+    if (!transaction1DstIsTransaction2Src)
+    {
+        generate_random_data(vecType,
+                             transaction2GlobalBufferSize
+                                 / get_explicit_type_size(vecType),
+                             d, transaction2InBuffer);
+    }
+    free_mtdata(d);
+    d = NULL;
+
+    streams[0] = clCreateBuffer(context, CL_MEM_COPY_HOST_PTR,
+                                transaction1GlobalBufferSize,
+                                transaction1InBuffer, &error);
+    test_error(error, "Unable to create input buffer");
+    streams[1] = clCreateBuffer(context, CL_MEM_COPY_HOST_PTR,
+                                transaction1GlobalBufferSize,
+                                transaction1OutBuffer, &error);
+    test_error(error, "Unable to create output buffer");
+    if (!transaction1DstIsTransaction2Src)
+    {
+        streams[2] = clCreateBuffer(context, CL_MEM_COPY_HOST_PTR,
+                                    transaction2GlobalBufferSize,
+                                    transaction2InBuffer, &error);
+        test_error(error, "Unable to create input buffer");
+    }
+    if (!transaction1SrcIsTransaction2Dst)
+    {
+        streams[3] = clCreateBuffer(context, CL_MEM_COPY_HOST_PTR,
+                                    transaction2GlobalBufferSize,
+                                    transaction2OutBuffer, &error);
+        test_error(error, "Unable to create output buffer");
+    }
+
+    cl_uint argIndex = 0;
+    error = clSetKernelArg(kernel, argIndex, sizeof(streams[0]), &streams[0]);
+    test_error(error, "Unable to set kernel argument");
+    ++argIndex;
+    error = clSetKernelArg(kernel, argIndex, sizeof(streams[1]), &streams[1]);
+    test_error(error, "Unable to set kernel argument");
+    ++argIndex;
+    if (!transaction1DstIsTransaction2Src)
+    {
+        error =
+            clSetKernelArg(kernel, argIndex, sizeof(streams[2]), &streams[2]);
+        test_error(error, "Unable to set kernel argument");
+        ++argIndex;
+    }
+    if (!transaction1SrcIsTransaction2Dst)
+    {
+        error =
+            clSetKernelArg(kernel, argIndex, sizeof(streams[3]), &streams[3]);
+        test_error(error, "Unable to set kernel argument");
+        ++argIndex;
+    }
+    error = clSetKernelArg(kernel, argIndex, transaction1LocalBufferSize, NULL);
+    test_error(error, "Unable to set kernel argument");
+    ++argIndex;
+    if (!aliased_local_mem)
+    {
+        error =
+            clSetKernelArg(kernel, argIndex, transaction2LocalBufferSize, NULL);
+        test_error(error, "Unable to set kernel argument");
+        ++argIndex;
+    }
+    error =
+        clSetKernelArg(kernel, argIndex, sizeof(transaction1CopiesPerWorkgroup),
+                       &transaction1CopiesPerWorkgroup);
+    test_error(error, "Unable to set kernel argument");
+    ++argIndex;
+    error = clSetKernelArg(kernel, argIndex,
+                           sizeof(transaction1CopiesPerWorkitemInt),
+                           &transaction1CopiesPerWorkitemInt);
+    test_error(error, "Unable to set kernel argument");
+    ++argIndex;
+    error =
+        clSetKernelArg(kernel, argIndex, sizeof(transaction2CopiesPerWorkgroup),
+                       &transaction2CopiesPerWorkgroup);
+    test_error(error, "Unable to set kernel argument");
+    ++argIndex;
+    error = clSetKernelArg(kernel, argIndex,
+                           sizeof(transaction2CopiesPerWorkitemInt),
+                           &transaction2CopiesPerWorkitemInt);
+    test_error(error, "Unable to set kernel argument");
+
+    // Enqueue
+    error = clEnqueueNDRangeKernel(queue, kernel, 1, NULL, threads,
+                                   localThreads, 0, NULL, NULL);
+    test_error(error, "Unable to queue kernel");
+
+    // Read
+    error = clEnqueueReadBuffer(queue, streams[1], CL_TRUE, 0,
+                                transaction1GlobalBufferSize,
+                                transaction1OutBuffer, 0, NULL, NULL);
+    test_error(error, "Unable to read results");
+    if (transaction1DstIsTransaction2Src)
+    {
+        for (size_t idx = 0; idx < numberOfLocalWorkgroups; idx++)
+        {
+            memcpy(
+                (void *)((unsigned char *)transaction2InBuffer
+                         + idx * transaction2CopiesPerWorkgroup * elementSize),
+                (const void *)((unsigned char *)transaction1OutBuffer
+                               + (idx * transaction1CopiesPerWorkgroup
+                                  + (transaction1CopiesPerWorkgroup
+                                     - transaction2CopiesPerWorkgroup))
+                                   * elementSize),
+                (size_t)transaction2CopiesPerWorkgroup * elementSize);
+        }
+    }
+    if (transaction1SrcIsTransaction2Dst)
+    {
+        void *transaction1SrcBuffer =
+            (void *)malloc(transaction1GlobalBufferSize);
+        error = clEnqueueReadBuffer(queue, streams[0], CL_TRUE, 0,
+                                    transaction1GlobalBufferSize,
+                                    transaction1SrcBuffer, 0, NULL, NULL);
+        test_error(error, "Unable to read results");
+        for (size_t idx = 0; idx < numberOfLocalWorkgroups; idx++)
+        {
+            memcpy(
+                (void *)((unsigned char *)transaction2OutBuffer
+                         + idx * transaction2CopiesPerWorkgroup * elementSize),
+                (const void *)((unsigned char *)transaction1SrcBuffer
+                               + (idx * transaction1CopiesPerWorkgroup
+                                  + (transaction1CopiesPerWorkgroup
+                                     - transaction2CopiesPerWorkgroup))
+                                   * elementSize),
+                (size_t)transaction2CopiesPerWorkgroup * elementSize);
+        }
+        free(transaction1SrcBuffer);
+    }
+    else
+    {
+        error = clEnqueueReadBuffer(queue, streams[3], CL_TRUE, 0,
+                                    transaction2GlobalBufferSize,
+                                    transaction2OutBuffer, 0, NULL, NULL);
+        test_error(error, "Unable to read results");
+    }
+
+    // Verify
+    int failuresPrinted = 0;
+    if (memcmp(transaction1InBuffer, transaction1OutBuffer,
+               transaction1GlobalBufferSize)
+        != 0)
+    {
+        size_t typeSize = get_explicit_type_size(vecType) * vecSize;
+        unsigned char *inchar = (unsigned char *)transaction1InBuffer;
+        unsigned char *outchar = (unsigned char *)transaction1OutBuffer;
+        for (int i = 0; i < (int)transaction1GlobalBufferSize;
+             i += (int)elementSize)
+        {
+            if (memcmp(((char *)inchar) + i, ((char *)outchar) + i, typeSize)
+                != 0)
+            {
+                char values[4096];
+                values[0] = 0;
+                if (failuresPrinted == 0)
+                {
+                    // Print first failure message
+                    log_error("ERROR: Results of 1st transaction did not "
+                              "validate!\n");
+                }
+                sprintf(values + strlen(values), "%d -> [", i);
+                for (int j = 0; j < (int)elementSize; j++)
+                    sprintf(values + strlen(values), "%2x ", inchar[i + j]);
+                sprintf(values + strlen(values), "] != [");
+                for (int j = 0; j < (int)elementSize; j++)
+                    sprintf(values + strlen(values), "%2x ", outchar[i + j]);
+                sprintf(values + strlen(values), "]");
+                log_error("%s\n", values);
+                failuresPrinted++;
+            }
+
+            if (failuresPrinted > 5)
+            {
+                log_error("Not printing further failures...\n");
+                break;
+            }
+        }
+    }
+    if (memcmp(transaction2InBuffer, transaction2OutBuffer,
+               transaction2GlobalBufferSize)
+        != 0)
+    {
+        size_t typeSize = get_explicit_type_size(vecType) * vecSize;
+        unsigned char *inchar = (unsigned char *)transaction2InBuffer;
+        unsigned char *outchar = (unsigned char *)transaction2OutBuffer;
+        for (int i = 0; i < (int)transaction2GlobalBufferSize;
+             i += (int)elementSize)
+        {
+            if (memcmp(((char *)inchar) + i, ((char *)outchar) + i, typeSize)
+                != 0)
+            {
+                char values[4096];
+                values[0] = 0;
+                if (failuresPrinted == 0)
+                {
+                    // Print first failure message
+                    log_error("ERROR: Results of 2nd transaction did not "
+                              "validate!\n");
+                }
+                sprintf(values + strlen(values), "%d -> [", i);
+                for (int j = 0; j < (int)elementSize; j++)
+                    sprintf(values + strlen(values), "%2x ", inchar[i + j]);
+                sprintf(values + strlen(values), "] != [");
+                for (int j = 0; j < (int)elementSize; j++)
+                    sprintf(values + strlen(values), "%2x ", outchar[i + j]);
+                sprintf(values + strlen(values), "]");
+                log_error("%s\n", values);
+                failuresPrinted++;
+            }
+
+            if (failuresPrinted > 5)
+            {
+                log_error("Not printing further failures...\n");
+                break;
+            }
+        }
+    }
+
+    free(transaction1InBuffer);
+    free(transaction1OutBuffer);
+    free(transaction2InBuffer);
+    free(transaction2OutBuffer);
+
+    return failuresPrinted ? -1 : 0;
+}
+
+int test_copy_fence_all_types(cl_device_id deviceID, cl_context context,
+                              cl_command_queue queue, const char *kernelCode,
+                              bool export_after_import, bool aliased_local_mem,
+                              bool aliased_global_mem)
+{
+    ExplicitType vecType[] = {
+        kChar,  kUChar, kShort,  kUShort,          kInt, kUInt, kLong,
+        kULong, kFloat, kDouble, kNumExplicitTypes
+    };
+    unsigned int vecSizes[] = { 1, 2, 3, 4, 8, 16, 0 };
+    unsigned int size, typeIndex;
+
+    int errors = 0;
+
+    for (typeIndex = 0; vecType[typeIndex] != kNumExplicitTypes; typeIndex++)
+    {
+        if (vecType[typeIndex] == kDouble
+            && !is_extension_available(deviceID, "cl_khr_fp64"))
+            continue;
+
+        if ((vecType[typeIndex] == kLong || vecType[typeIndex] == kULong)
+            && !gHasLong)
+            continue;
+
+        for (size = 0; vecSizes[size] != 0; size++)
+        {
+            if (test_copy_fence(deviceID, context, queue, kernelCode,
+                                vecType[typeIndex], vecSizes[size],
+                                export_after_import, aliased_local_mem,
+                                aliased_global_mem))
+            {
+                errors++;
+            }
+        }
+    }
+    if (errors) return -1;
+    return 0;
+}
+
+int test_async_work_group_copy_fence_import_after_export_aliased_local(
+    cl_device_id deviceID, cl_context context, cl_command_queue queue,
+    int num_elements)
+{
+    return test_copy_fence_all_types(deviceID, context, queue,
+                                     import_after_export_aliased_local_kernel,
+                                     false, true, false);
+}
+
+int test_async_work_group_copy_fence_import_after_export_aliased_global(
+    cl_device_id deviceID, cl_context context, cl_command_queue queue,
+    int num_elements)
+{
+    return test_copy_fence_all_types(deviceID, context, queue,
+                                     import_after_export_aliased_global_kernel,
+                                     false, false, true);
+}
+
+int test_async_work_group_copy_fence_import_after_export_aliased_global_and_local(
+    cl_device_id deviceID, cl_context context, cl_command_queue queue,
+    int num_elements)
+{
+    return test_copy_fence_all_types(
+        deviceID, context, queue,
+        import_after_export_aliased_global_and_local_kernel, false, true, true);
+}
+
+int test_async_work_group_copy_fence_export_after_import_aliased_local(
+    cl_device_id deviceID, cl_context context, cl_command_queue queue,
+    int num_elements)
+{
+    return test_copy_fence_all_types(deviceID, context, queue,
+                                     export_after_import_aliased_local_kernel,
+                                     true, true, false);
+}
+
+int test_async_work_group_copy_fence_export_after_import_aliased_global(
+    cl_device_id deviceID, cl_context context, cl_command_queue queue,
+    int num_elements)
+{
+    return test_copy_fence_all_types(deviceID, context, queue,
+                                     export_after_import_aliased_global_kernel,
+                                     true, false, true);
+}
+
+int test_async_work_group_copy_fence_export_after_import_aliased_global_and_local(
+    cl_device_id deviceID, cl_context context, cl_command_queue queue,
+    int num_elements)
+{
+    return test_copy_fence_all_types(
+        deviceID, context, queue,
+        export_after_import_aliased_global_and_local_kernel, true, true, true);
+}