Release candidate for v1_4_1 (aws#417)

* Releae candidate for v1_4_1 Release candidate for RC_v1_4_1. Porting changes to public with new branch. * updates to 3rd party example for v1.4.1 * delta updatres V1.4.1 * Document update
wjblanke · Aug 15, 2018 · 8847d31 · 8847d31
1 parent 2fdf23f
commit 8847d31
Show file tree

Hide file tree

Showing 113 changed files with 7,809 additions and 3,891 deletions.
diff --git a/FAQs.md b/FAQs.md
@@ -382,14 +382,14 @@ FPGA Direct is FPGA to FPGA low latency high throughput peer communication throu
 
 **Q: What is FPGA Link and how fast is it?**
 
-FPGA Link is based on 4 x 100Gbps links on each FPGA card. The FPGA Link is organized as a ring, with 2 x 100Gbps links to each adjacent card. This enables each FPGA card to send/receive data from an adjacent card at 200Gbps speeds. Details on the FPGA Link interface will be provided in the Shell Interface specification when available.
+FPGA Link is based on 4 x 100Gbps links on each FPGA card. The FPGA Link is organized as a ring, with 2 x 100Gbps links to each adjacent card. This enables each FPGA card to send/receive data from an adjacent card at 200Gbps speeds. This is a unsupported feature planned for future release. Details on the FPGA Link interface will be provided in the Shell Interface specification when available.
 
 
 **Q: What protocol is used for FPGA link?**
 
 The FPGA link is a generic raw streaming interface, no transport protocol is provided for it by AWS. It is expected that developers would take advantage of standard PCIe protocol, Ethernet protocol, or Xilinx's (reliable) Aurora protocol layer for this interface.
 
-Details on the Shell Interface to the FPGA Link IP blocks are provided in the [Shell Interface specification](./hdk/docs/AWS_Shell_Interface_Specification.md) when available.
+This is a unsupported feature planned for future release. Details on the Shell Interface to the FPGA Link IP blocks are provided in the [Shell Interface specification](./hdk/docs/AWS_Shell_Interface_Specification.md) when available.
 
 
 **Q: What clock speed does the FPGA utilize?**

diff --git a/Jenkinsfile b/Jenkinsfile
@@ -60,7 +60,7 @@ def dcp_recipe_scenarios = [
     'A1-B2-C0-TIMING',
     'A1-B2-C0-CONGESTION',
     ]
-def fdf_test_names = ['cl_dram_dma[A0-B0-C0-DEFAULT]', 'cl_hello_world[A0-B0-C0-DEFAULT]', 'cl_hello_world_vhdl',
+def fdf_test_names = ['cl_dram_dma[A1-B0-C0-DEFAULT]', 'cl_hello_world[A0-B0-C0-DEFAULT]', 'cl_hello_world_vhdl',
     'cl_uram_example[2]', 'cl_uram_example[3]', 'cl_uram_example[4]']
 
 boolean debug_dcp_gen = params.get('debug_dcp_gen')
@@ -853,12 +853,6 @@ if (test_helloworld_sdaccel_example_fdf || test_all_sdaccel_examples_fdf) {
                                 }
                             }
 
-                            boolean test_sw_emu_supported = true
-
-                            if(test_key =~ '_Debug') {
-                                test_sw_emu_supported = false
-                            }
-
                             // dsa = [ 4DDR: 4ddr ]
                             for ( def dsa in entrySet(dsa_map_for_test) ) {
 
@@ -880,6 +874,33 @@ if (test_helloworld_sdaccel_example_fdf || test_all_sdaccel_examples_fdf) {
                                 String create_afi_report_file  = "sdaccel_create_afi_${e.key}_${dsa.value}_${xilinx_version}.xml"
                                 String run_example_report_file = "sdaccel_run_${e.key}_${dsa.value}_${xilinx_version}.xml"
 
+                                String description_file = "${example_path}/description.json"
+                                def description_json = ["targets":["hw","hw_emu","sw_emu"]]
+
+                                try {
+                                    description_json = readJSON file: description_file
+                                }
+                                catch (exc) {
+                                    echo "Could not read the file: ${description_file}"
+                                    throw exc
+                                }
+
+                                boolean test_sw_emu_supported = true
+
+                                if(description_json["targets"]) {
+                                    if(description_json["targets"].contains("sw_emu")) {
+                                        test_sw_emu_supported = true
+                                        echo "Description file ${description_file} has target sw_emu"
+                                    }
+                                    else {
+                                        test_sw_emu_supported = false
+                                        echo "Description file ${description_file} does not have target sw_emu"
+                                    }
+                                }
+                                else {
+                                    echo "Description json did not have a 'target' key"
+                                }
+
                                 sdaccel_build_stages[build_name] = {
                                     if(test_sw_emu_supported) {
                                         stage(sw_emu_stage_name) {

diff --git a/RELEASE_NOTES.md b/RELEASE_NOTES.md
@@ -26,6 +26,20 @@
          *    1 DDR controller implemented in the SH (always available)
          *    3 DDR controllers implemented in the CL (configurable number of implemented controllers allowed)
 
+## Release 1.4.1 (See [ERRATA](./ERRATA.md) for unsupported features)
+* Simulation performance Improvements
+  * DDR Behavioural Model- Hardware simulations use an AXI memory model to run 4X faster by skipping DDR initialization. Please refer to this [README](./hdk/cl/examples/cl_dram_dma/verif/README.md) on how to use this feature in your simulation.
+  * DDR Backdoor Loading- Hardware simulation time is reduced by pre-loading data directly into memory models.  Please refer to this [README](./hdk/cl/examples/cl_dram_dma/verif/README.md#ddr-backdoor-loading) for example tests that demonstrate this feature.
+* Fixed Issues
+  * XOCL Driver update to address synchronization issues.
+  * Fixed XOCL driver issues when using ubuntu distribution for Linux OS.
+  * Improved Performance for [cl_dram_dma Public AFI](./hdk/cl/examples/cl_dram_dma/README.md#metadata).
+  * SDAccel 3rd party examples updated to use Shell V1.4 DSA.
+  * Fixed AFI Manifest generation in IPI flow.
+  * HLX button fixed in IPI
+  * [FPGA Library update](./sdk/userspace/README.md)
+
+
 ## Release 1.4.0 (See [ERRATA](./ERRATA.md) for unsupported features)
 * [New Shell Stable: v04261818](./hdk/common/shell_stable).  Starting with release v1.4.0, the AWS FPGA shell stable has been updated and only supports Xilinx 2017.4 SDx/Vivado.  All previous versions of tools and shells are not supported with this developer kit shell release. 
   * [Shell Release Notes](./hdk/docs/AWS_Shell_RELEASE_NOTES.md) 

diff --git a/SDAccel/Makefile b/SDAccel/Makefile
@@ -34,6 +34,8 @@ else
 $(info XILINX_SDX = $(XILINX_SDX))
 endif
 
+OS=$(shell lsb_release -si)
+$(info OS is $(OS))
 
 MODULE :=
 ifeq ($(RELEASE_VER),2017.4)
@@ -42,7 +44,12 @@ ifeq ($(RELEASE_VER),2017.4)
   XRT_HAL_LIB = libxrt-aws.so
   EXE = awssak2
   MODULE = xocl
+ifeq ($(OS),Ubuntu)
+  GLIBCPP_PATH = lib/lnx64.o/Ubuntu
+else
   GLIBCPP_PATH = lib/lnx64.o/Default
+endif # OS check
+
 else
   $(error Environment variable RELEASE_VER not recognized: $(RELEASE_VER))
 endif

diff --git a/SDAccel/examples/3rd_party/README.md b/SDAccel/examples/3rd_party/README.md
diff --git a/SDAccel/examples/3rd_party/common/inc/AOCLUtils/opencl.h b/SDAccel/examples/3rd_party/common/inc/AOCLUtils/opencl.h
@@ -36,7 +36,7 @@ extern void cleanup();
 
 namespace aocl_utils {
 
-static const char *const VERSION_STR = "xilinx_aws-vu9p-f1_4ddr-xpr-2pr_4_0";
+static const char *const VERSION_STR = "xilinx_aws-vu9p-f1-04261818_dynamic_5_0";
 
 // Host allocation functions
 void *alignedMalloc(size_t size);

diff --git a/SDAccel/examples/3rd_party/fft1d/Makefile b/SDAccel/examples/3rd_party/fft1d/Makefile
@@ -15,21 +15,21 @@
 COMMON_REPO := $(SDACCEL_DIR)/examples/xilinx
 
 include $(COMMON_REPO)/utility/boards.mk
-include $(COMMON_REPO)/libs/xcl/xcl.mk
+include $(COMMON_REPO)/libs/xcl2/xcl2.mk
 include $(COMMON_REPO)/libs/opencl/opencl.mk
 include $(COMMON_REPO)/libs/oclHelper/oclHelper.mk
 
 # Host Application
-main_SRCS=$(wildcard host/src/*.cpp ../common/src/AOCLUtils/*.cpp) $(xcl_SRCS) $(oclHelper_SRCS)
-main_HDRS=$(xcl_HDRS)
-main_CXXFLAGS=$(xcl_CXXFLAGS) $(opencl_CXXFLAGS)  -Ihost/inc -I../common/inc/ $(oclHelper_CXXFLAGS)
+main_SRCS=$(wildcard host/src/*.cpp ../common/src/AOCLUtils/*.cpp) $(xcl2_SRCS) $(oclHelper_SRCS)
+main_HDRS=$(xcl2_HDRS)
+main_CXXFLAGS=$(xcl2_CXXFLAGS) $(opencl_CXXFLAGS)  -Ihost/inc -I../common/inc/ $(oclHelper_CXXFLAGS)
 main_LDFLAGS=$(opencl_LDFLAGS) -lrt
 
 EXES=main
 
 # Kernel                                                                                                                                                                                              
 fft1d_SRCS=./device/fft1d.cl
-fft1d_CLFLAGS= -optimizequick
+#fft1d_CLFLAGS= -optimizequick
 #Specifying Fifo depth for Dataflow                                                                                                                                                                  
 ##fft1d_CLFLAGS+=--xp "param:compiler.xclDataflowFifoDepth=32"
 
@@ -46,11 +46,7 @@ check_XCLBINS=fft1d
 
 CHECKS=check
 
-ifeq ($(DEBUG),1)
 CXXFLAGS += -g
-else
-CXXFLAGS += -O2
-endif
 
 #CXX := g++
 

diff --git a/SDAccel/examples/3rd_party/fft1d/fft1d_fft1d.cl.diff b/SDAccel/examples/3rd_party/fft1d/fft1d_fft1d.cl.diff
@@ -1,9 +1,18 @@
---- third_party/fft1d/device/fft1d.cl      2017-05-09 22:47:43.000000000 +0000
-+++ sdaccel/fft1d/device/fft1d.cl 2017-09-12 19:21:02.120000000 +0000
-@@ -49 +49 @@
+--- device/fft1d.cl	2018-02-12 17:54:56.000000000 +0000
++++ device/fft1d.cl	2018-07-23 20:57:12.414000000 +0000
+@@ -46,7 +46,7 @@
+ // Include source code for an engine that produces 8 points each step
+ #include "fft_8.cl" 
+
 -#pragma OPENCL EXTENSION cl_intel_channels : enable
 +//#pragma OPENCL EXTENSION cl_intel_channels : enable
-@@ -64 +64,9 @@
+
+ #include "../host/inc/fft_config.h"
+
+@@ -66,11 +66,19 @@
+ #define CONT_FACTOR            (1 << LOG_CONT_FACTOR)
+
+ // Need some depth to our channels to accomodate their bursty filling.
 -channel float2 chanin[8] __attribute__((depth(CONT_FACTOR*8)));
 +
 +pipe float2 chanin0 __attribute__((xcl_reqd_pipe_depth(CONT_FACTOR*8)));
@@ -14,52 +23,88 @@
 +pipe float2 chanin5 __attribute__((xcl_reqd_pipe_depth(CONT_FACTOR*8)));
 +pipe float2 chanin6 __attribute__((xcl_reqd_pipe_depth(CONT_FACTOR*8)));
 +pipe float2 chanin7 __attribute__((xcl_reqd_pipe_depth(CONT_FACTOR*8)));
-@@ -68 +76 @@
--  #pragma unroll
-+  __attribute__((opencl_unroll_hint()))
-@@ -137,2 +145,2 @@
+
+ uint bit_reversed(uint x, uint bits) {
+   uint y = 0;
+-  #pragma unroll 
++ __attribute__((opencl_unroll_hint()))
+   for (uint i = 0; i < bits; i++) {
+     y <<= 1;
+     y |= x & 1;
+@@ -139,15 +147,15 @@
+ }
+
+ // group dimension (N/(8*CONT_FACTOR), num_iterations)
 -__attribute__((reqd_work_group_size(CONT_FACTOR * POINTS, 1, 1)))
 -kernel void fetch (global float2 * restrict src) {
 +kernel __attribute__((reqd_work_group_size(CONT_FACTOR * POINTS, 1, 1)))
 +void fetch (global float2 * restrict src) {
-@@ -145 +153 @@
+
+   const int N = (1 << LOGN);
+   // Each thread will fetch POINTS points. Need POINTS times to pass to FFT.
+   const int BUF_SIZE = 1 << (LOG_CONT_FACTOR + LOGPOINTS + LOGPOINTS);
+
+   // Local memory for CONT_FACTOR * POINTS points
 -  local float2 buf[BUF_SIZE];
-+  local float2 buf[BUF_SIZE] __attribute__((xcl_array_partition(block,8,1)));
-@@ -156,2 +164,2 @@
++  __local float2 buf[BUF_SIZE] __attribute__((xcl_array_partition(block,8,1)));
+
+   uint iteration = get_global_id(1);
+   uint group_per_iter = get_global_id(0);
+@@ -158,17 +166,17 @@
+   uint lid = get_local_id(0);
+   uint local_addr = lid << LOGPOINTS;
+
 -  #pragma unroll
 -  for (uint k = 0; k < POINTS; k++) {
 +  __attribute__((opencl_unroll_hint()))
 +  for (uint k = 0; k < POINTS; k+=2) {
-@@ -160 +167,0 @@
--
-@@ -163,4 +170,4 @@
+     buf[local_addr + k] = src[global_addr + k];
+   }
+
+   barrier (CLK_LOCAL_MEM_FENCE);
+
 -  #pragma unroll
 -  for (uint k = 0; k < POINTS; k++) {
 -    uint buf_addr = bit_reversed(k,3) * CONT_FACTOR * POINTS + lid;
 -    write_channel_intel (chanin[k], buf[buf_addr]);
 +  uint buf_addr[8];
 +  __attribute__((opencl_unroll_hint()))
-+  for(uint k=0;k<8;k++) {
++  for (uint k = 0; k < 8; k++) {
 +    buf_addr[k] = bit_reversed(k,3) * CONT_FACTOR * POINTS + lid;
-@@ -167,0 +175,12 @@
+   }
+ }
+
+@@ -181,9 +189,24 @@
+  * 'count' represents the number of 4k sets to process
+  * 'inverse' toggles between the direct and the inverse transform
+  */
 +  // bit_reversed reverses the bit locations of the value given.
 +  // The second parameter is the width of the number (in bits) to reverse.
 +  // Only the non-symmetric numbers are changed.  E.g. 001,011,100,110 -> 100,110,100,110
-+    write_pipe (chanin0, &buf[buf_addr[0]]);
-+    write_pipe (chanin1, &buf[buf_addr[1]]);
-+    write_pipe (chanin2, &buf[buf_addr[2]]);
-+    write_pipe (chanin3, &buf[buf_addr[3]]);
-+    write_pipe (chanin4, &buf[buf_addr[4]]);
-+    write_pipe (chanin5, &buf[buf_addr[5]]);
-+    write_pipe (chanin6, &buf[buf_addr[6]]);
-+    write_pipe (chanin7, &buf[buf_addr[7]]);
 +
-@@ -180,2 +199,2 @@
++
++
++    write_pipe(chanin0, &buf[buf_addr[0]]);
++    write_pipe(chanin1, &buf[buf_addr[1]]);
++    write_pipe(chanin2, &buf[buf_addr[2]]);
++    write_pipe(chanin3, &buf[buf_addr[3]]);
++    write_pipe(chanin4, &buf[buf_addr[4]]);
++    write_pipe(chanin5, &buf[buf_addr[5]]);
++    write_pipe(chanin6, &buf[buf_addr[6]]);
++    write_pipe(chanin7, &buf[buf_addr[7]]);
++
+
 -__attribute((task))
 -kernel void fft1d(global float2 * restrict dest,
 +kernel __attribute((reqd_work_group_size(1, 1, 1))) //task))
 +void fft1d(global float2 * restrict dest,
-@@ -218,8 +237,9 @@
+                   int count, int inverse) {
+
+   const int N = (1 << LOGN);
+@@ -220,14 +243,14 @@
+     float2x8 data;
+     // Perform memory transfers only when reading data in range
+     if (i < count * (N / 8)) {
 -      data.i0 = read_channel_intel(chanin[0]);
 -      data.i1 = read_channel_intel(chanin[1]);
 -      data.i2 = read_channel_intel(chanin[2]);
@@ -68,7 +113,6 @@
 -      data.i5 = read_channel_intel(chanin[5]);
 -      data.i6 = read_channel_intel(chanin[6]);
 -      data.i7 = read_channel_intel(chanin[7]);
-+
 +      read_pipe(chanin0,&data.i0);
 +      read_pipe(chanin1,&data.i1);
 +      read_pipe(chanin2,&data.i2);
@@ -77,3 +121,6 @@
 +      read_pipe(chanin5,&data.i5);
 +      read_pipe(chanin6,&data.i6);
 +      read_pipe(chanin7,&data.i7);
+     } else {
+       data.i0 = data.i1 = data.i2 = data.i3 = 
+                 data.i4 = data.i5 = data.i6 = data.i7 = 0;