From bffc0aa209d9abad396e09806f475c44c28c3e7b Mon Sep 17 00:00:00 2001
From: mmrahorovic <mmrahorovic@hotmail.com>
Date: Mon, 26 Sep 2022 19:09:10 +0100
Subject: [PATCH 01/43] Fix to ensure pipeline pragma and functional behavior
 are the same across target boards when innermost loop has loopcount of 1

---
 slidingwindow.h | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)
diff --git a/slidingwindow.h b/slidingwindow.h
index 88634be..7ed7a1f 100644
--- a/slidingwindow.h
+++ b/slidingwindow.h
@@ -1186,17 +1186,19 @@ void ConvolutionInputGenerator_kernel1(
 static_assert(IFMChannels % SIMD == 0, "");
 constexpr unsigned COUNTER_WIDTH = clog2(Stride-1) + 1;
 constexpr unsigned COUNTER_RESET = Stride - 2;
+constexpr unsigned MULTIPLYING_FACTOR = IFMChannels/SIMD;
 	for (unsigned int im=0; im<numReps; im++) {
+#pragma HLS performance target_ti=IFMDim*IFMDim*IFMChannels/SIMD
 		ap_int<COUNTER_WIDTH> counter_y = -1;
 		for (unsigned int y = 0; y < IFMDim; y++) {
 			const bool keep_y = counter_y < 0;
 			counter_y = keep_y ? ap_int<COUNTER_WIDTH>(COUNTER_RESET) : ap_int<COUNTER_WIDTH>(counter_y - 1);
 			ap_int<COUNTER_WIDTH> counter_x = -1;
 			for (unsigned int x = 0; x < IFMDim; x++) {
+#pragma HLS pipeline style=flp II=IFMChannels/SIMD
 				const bool keep_x = counter_x < 0;
 				counter_x = keep_x ? ap_int<COUNTER_WIDTH>(COUNTER_RESET) : ap_int<COUNTER_WIDTH>(counter_x - 1);
-				for (unsigned int count_simd = 0; count_simd < IFMChannels/SIMD; count_simd++) {
-#pragma HLS pipeline style=flp II=1
+				for (unsigned int count_simd = 0; count_simd < MULTIPLYING_FACTOR; count_simd++) {
 					ap_uint<SIMD*Input_precision> inElem = in.read();
 					if (keep_y && keep_x) {
 						out.write(inElem);

From b9001f49d0afe9e30984b573a0e43c5a3330a40d Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Thomas=20B=2E=20Preu=C3=9Fer?= <thomas.preusser@xilinx.com>
Date: Thu, 6 Oct 2022 07:17:12 +0100
Subject: [PATCH 02/43] Untangle max norm implementation.

---
 normalize.hpp      | 26 +++++++++++---------------
 tb/max_norm_tb.cpp |  2 +-
 2 files changed, 12 insertions(+), 16 deletions(-)

diff --git a/normalize.hpp b/normalize.hpp
index 6320489..109b720 100644
--- a/normalize.hpp
+++ b/normalize.hpp
@@ -85,7 +85,7 @@ void normalize(
  * Quantized maximum normalization over input vectors of length FM_SIZE
  * into the numeric range of the output type `ap_uint<WO>`:
  *
- *	x_i -> round( (2^WO-1) * x_i / max{x_j | j=0:FM_SIZE} )
+ *	x_i -> round( NORMAX * x_i / max{x_j | j=0:FM_SIZE} )
  */
 template<
 	unsigned  FM_SIZE,		// Vector length
@@ -104,6 +104,7 @@ void max_norm(
 	hls::stream<ap_uint<WI>>  buffer;
 #pragma HLS stream variable=buffer depth=FM_SIZE
 
+	// Buffer input and scan it for the maximum
 	ap_uint<WI>  max = 1;	// Prevent division by zero
 	for(unsigned  i = 0; i < FM_SIZE; i++) {
 #pragma HLS pipeline II=1 style=flp
@@ -111,20 +112,15 @@ void max_norm(
 		max = std::max(max, x);
 		buffer.write(x);
 	}
-	normalize<FM_SIZE, 1>(
-		buffer, dst,
-		[max]() -> ap_uint<WI+WO+1> {
-#pragma HLS inline
-// @todo Force a LUT implementation for low precisions (8 bits and fewer) instead of true division.
-			ap_uint<WI+WO+2> const  d = ap_uint<WI+WO+2>((MAX, ap_uint<WI+2>(0))) / max;
-			return  d(WI+WO+1, 1) + d[0];
-		},
-		[](ap_uint<WI+WO+1> const &scale, ap_uint<WI> const &x) -> ap_uint<WO> {
-#pragma HLS inline
-			ap_uint<WO+WI+1> const  p = scale*x;
-			return  p(WO+WI, WI+1) + p[WI];
-		}
-	);
+
+	// Replay buffer normalizing all values
+	for(unsigned  i = 0; i < FM_SIZE; i++) {
+#pragma HLS pipeline II=1 style=flp
+		ap_uint<WO+WI>   const  a = MAX * buffer.read();
+		ap_uint<WO+WI+1> const  b = (a, ap_uint<1>(0));	// div with one fractional binary digit for rounding
+		ap_uint<WO+1>    const  q = b / max;
+		dst.write(q(WO, 1) + q[0]);
+	}
 
 } // max_norm()
 
diff --git a/tb/max_norm_tb.cpp b/tb/max_norm_tb.cpp
index 92cde8c..de3a738 100644
--- a/tb/max_norm_tb.cpp
+++ b/tb/max_norm_tb.cpp
@@ -73,7 +73,7 @@ int main() {
 				for(unsigned  j = 0; j < 2; j++) {
 					TO const     y   = dst[j].read();
 					float const  ref = ref_scale[j] * x;
-					bool  const  ok  = std::abs(y-ref) < 0.6f;
+					bool  const  ok  = std::abs(y-ref) <= 0.5f;
 					if(!ok)  mismatches++;
 					std::cout <<'\t' << std::setw(3) << y << " / " << std::setw(7) << ref << '\t' << (ok? '.' : 'X');
 				}

From d6dbc275cf8f966559edff95a8a12d78c6a6977d Mon Sep 17 00:00:00 2001
From: johnnoel <johnnoel@xilinx.com>
Date: Mon, 14 Nov 2022 11:33:20 +0000
Subject: [PATCH 03/43] Add node label to Jenkinsfile

---
 Jenkinsfile | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/Jenkinsfile b/Jenkinsfile
index 07dc88b..e4fa984 100644
--- a/Jenkinsfile
+++ b/Jenkinsfile
@@ -30,7 +30,7 @@
  *
  ******************************************************************************/
 
-node {
+node('finn-build || built-in') {
     def app
     stage('Clone repository') {
         /* Let's make sure we have the repository cloned to our workspace */

From a77a7462286bc09e6e30e4c0185a0699d5704213 Mon Sep 17 00:00:00 2001
From: Felix Jentzsch <felix.jentzsch@upb.de>
Date: Fri, 25 Nov 2022 19:21:16 +0100
Subject: [PATCH 04/43] Rework independent SIMD support

---
 vvau.hpp | 19 ++++++++++++-------
 1 file changed, 12 insertions(+), 7 deletions(-)

diff --git a/vvau.hpp b/vvau.hpp
index d9604ac..ead49f0 100644
--- a/vvau.hpp
+++ b/vvau.hpp
@@ -58,7 +58,7 @@
  * 
  * \tparam Channels   Number of channels
  * \tparam Kernel_2   Kernel * Kernel dimension (Kernel ^ 2 if square)
- * \tparam SIMD       Number of input columns computed in parallel, must be set to 1
+ * \tparam SIMD       Number of input columns computed in parallel
  * \tparam PE         Number of output rows computed in parallel
  * \tparam MMV        Number of output pixels computed in parallel
  * \tparam TSrcI      DataType of the input activation (as used in the MAC)
@@ -89,7 +89,6 @@ void Vector_Vector_Activate_Batch(hls::stream<TI> &in,
 				  int const  reps,
 				  R const &r) {
 
-  static_assert(SIMD == 1, "SIMD parallelism not yet supported.");
 
   // how many different rows each neuron will compute
   // alternatively: number of vertical matrix chunks
@@ -97,8 +96,7 @@ void Vector_Vector_Activate_Batch(hls::stream<TI> &in,
 
   // how many synapse groups each row is split into
   // alternatively: number of horizontal matrix chunks
-  // always equal to # kernel pixels since no SIMD
-  unsigned const  SF = Kernel_2;
+  unsigned const  SF = Kernel_2 / SIMD;
   decltype(activation.init(0,0))  accu[MMV][PE];
 #pragma HLS ARRAY_PARTITION variable=accu complete dim=0
 
@@ -107,7 +105,7 @@ void Vector_Vector_Activate_Batch(hls::stream<TI> &in,
   unsigned  tile = 0; // invariant: tile = nf*SF + sf
   // everything merged into a common iteration space (one "big" loop instead
   // of smaller nested loops) to get the pipelinening the way we want
-  unsigned const TOTAL_FOLD = NF * SF ;//* Channels/SIMD;
+  unsigned const TOTAL_FOLD = NF * SF ;
   for(unsigned  i = 0; i < reps * TOTAL_FOLD; i++) {
 #pragma HLS pipeline style=flp II=1
     TI  inElem;
@@ -127,9 +125,16 @@ void Vector_Vector_Activate_Batch(hls::stream<TI> &in,
     for(unsigned  pe = 0; pe < PE; pe++) {
 #pragma HLS UNROLL
       auto const  wgt = TWeightI()(w[pe]);
+
+      ap_uint<SIMD*TSrcI::width> act_simd= 0;
+      for(unsigned int simd = 0; simd < SIMD; simd++) {
+        act_simd(TSrcI::width*(simd+1)-1, TSrcI::width*simd) = (Slice<ap_uint<PE*TSrcI::width>>()(inElem)(SIMD - simd - 1,0))
+                                                               (TSrcI::width*(pe+1)-1, TSrcI::width*pe);
+      }
+
       for (unsigned mmv = 0; mmv < MMV; mmv++){
-        auto const  act = TSrcI()(inElem, mmv);
-		accu[mmv][pe] += mul(wgt[0], act(pe,mmv), r);
+        auto const  act = TSrcI()(act_simd, mmv);
+        accu[mmv][pe] = mac<SIMD>(accu[mmv][pe], wgt, act, r, mmv);
       }
     }
 

From 2b478b2033b2176553365f4017bd02aca2e8bd5b Mon Sep 17 00:00:00 2001
From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com>
Date: Thu, 8 Dec 2022 05:22:02 +0000
Subject: [PATCH 05/43] Bump certifi from 2019.3.9 to 2022.12.7

Bumps [certifi](https://github.com/certifi/python-certifi) from 2019.3.9 to 2022.12.7.
- [Release notes](https://github.com/certifi/python-certifi/releases)
- [Commits](https://github.com/certifi/python-certifi/compare/2019.03.09...2022.12.07)

---
updated-dependencies:
- dependency-name: certifi
  dependency-type: direct:production
...

Signed-off-by: dependabot[bot] <support@github.com>
---
 docs/requirements.txt | 2 +-
 requirements.txt      | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/docs/requirements.txt b/docs/requirements.txt
index e2da33d..53d7a92 100644
--- a/docs/requirements.txt
+++ b/docs/requirements.txt
@@ -1,7 +1,7 @@
 alabaster==0.7.12
 Babel==2.9.1
 breathe==4.13.0.post0
-certifi==2019.3.9
+certifi==2022.12.7
 chardet==3.0.4
 commonmark==0.9.0
 docutils==0.14
diff --git a/requirements.txt b/requirements.txt
index 250cf16..8c9e03d 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -30,7 +30,7 @@
 alabaster==0.7.12
 Babel==2.9.1
 breathe==4.13.0.post0
-certifi==2019.3.9
+certifi==2022.12.7
 chardet==3.0.4
 commonmark==0.9.0
 docutils==0.14

From 743bb597bf545a232409bfd89d395dde993504bf Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Thomas=20B=2E=20Preu=C3=9Fer?= <thomas.preusser@xilinx.com>
Date: Wed, 14 Dec 2022 16:27:07 +0000
Subject: [PATCH 06/43] Consolidating into a single requirements.txt.

---
 docs/requirements.txt | 29 +++++++++++++++++++++
 requirements.txt      | 59 -------------------------------------------
 2 files changed, 29 insertions(+), 59 deletions(-)
 delete mode 100644 requirements.txt

diff --git a/docs/requirements.txt b/docs/requirements.txt
index 53d7a92..685b213 100644
--- a/docs/requirements.txt
+++ b/docs/requirements.txt
@@ -1,3 +1,32 @@
+#   Copyright (c) 2019, Xilinx, Inc.
+#   All rights reserved.
+#
+#   Redistribution and use in source and binary forms, with or without
+#   modification, are permitted provided that the following conditions are met:
+#
+#   1.  Redistributions of source code must retain the above copyright notice,
+#       this list of conditions and the following disclaimer.
+#
+#   2.  Redistributions in binary form must reproduce the above copyright
+#       notice, this list of conditions and the following disclaimer in the
+#       documentation and/or other materials provided with the distribution.
+#
+#   3.  Neither the name of the copyright holder nor the names of its
+#       contributors may be used to endorse or promote products derived from
+#       this software without specific prior written permission.
+#
+#   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+#   AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO,
+#   THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+#   PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR
+#   CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+#   EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+#   PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
+#   OR BUSINESS INTERRUPTION). HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
+#   WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR
+#   OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF
+#   ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
 alabaster==0.7.12
 Babel==2.9.1
 breathe==4.13.0.post0
diff --git a/requirements.txt b/requirements.txt
deleted file mode 100644
index 8c9e03d..0000000
--- a/requirements.txt
+++ /dev/null
@@ -1,59 +0,0 @@
-#   Copyright (c) 2019, Xilinx, Inc.
-#   All rights reserved.
-# 
-#   Redistribution and use in source and binary forms, with or without 
-#   modification, are permitted provided that the following conditions are met:
-#
-#   1.  Redistributions of source code must retain the above copyright notice, 
-#       this list of conditions and the following disclaimer.
-#
-#   2.  Redistributions in binary form must reproduce the above copyright 
-#       notice, this list of conditions and the following disclaimer in the 
-#       documentation and/or other materials provided with the distribution.
-#
-#   3.  Neither the name of the copyright holder nor the names of its 
-#       contributors may be used to endorse or promote products derived from 
-#       this software without specific prior written permission.
-#
-#   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
-#   AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, 
-#   THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR 
-#   PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR 
-#   CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, 
-#   EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, 
-#   PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
-#   OR BUSINESS INTERRUPTION). HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, 
-#   WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR 
-#   OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF 
-#   ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-
-alabaster==0.7.12
-Babel==2.9.1
-breathe==4.13.0.post0
-certifi==2022.12.7
-chardet==3.0.4
-commonmark==0.9.0
-docutils==0.14
-future
-idna==2.8
-imagesize==1.1.0
-Jinja2==2.11.3
-MarkupSafe==1.1.1
-packaging==19.0
-Pygments==2.7.4
-pyparsing==2.4.0
-pytz==2019.1
-recommonmark==0.5.0
-requests==2.25.1
-six==1.12.0
-snowballstemmer==1.2.1
-Sphinx==2.0.1
-sphinx-rtd-theme==0.4.3
-sphinxcontrib-applehelp==1.0.1
-sphinxcontrib-devhelp==1.0.1
-sphinxcontrib-htmlhelp==1.0.2
-sphinxcontrib-jsmath==1.0.1
-sphinxcontrib-qthelp==1.0.2
-sphinxcontrib-serializinghtml==1.1.3
-urllib3==1.26.5
-numpy

From 5faa7508820eec40259f788a2d6b04081c39a95f Mon Sep 17 00:00:00 2001
From: Felix Jentzsch <felix.jentzsch@upb.de>
Date: Wed, 4 Jan 2023 16:36:35 +0100
Subject: [PATCH 07/43] Set conv_dws testbench to use new SIMD parallelism

---
 tb/conv_dws_tb.cpp         |  2 +-
 tb/conv_dws_top.cpp        |  4 +-
 tb/data/config-conv-dws.h  |  2 +-
 tb/data/memdata-conv-dws.h | 90 ++++----------------------------------
 4 files changed, 14 insertions(+), 84 deletions(-)

diff --git a/tb/conv_dws_tb.cpp b/tb/conv_dws_tb.cpp
index 19a3d92..7ad2f24 100644
--- a/tb/conv_dws_tb.cpp
+++ b/tb/conv_dws_tb.cpp
@@ -104,7 +104,7 @@ int main()
 	for (unsigned int oy = 0; oy < TY; oy++) {
 		for (unsigned int ox = 0; ox <TX; ox++) {
 			for(int pe=0;pe <PE1;pe++){
-				ap_int<4> quantized_weight = (ap_int<4>) PARAM::weights.weights(kx*KERNEL_DIM + ky)[out_chan_count][0];
+				ap_int<4> quantized_weight = (ap_int<4>) PARAM::weights.weights(0)[out_chan_count][TX-1 - (kx*KERNEL_DIM + ky)];
 				W1[out_chan_count][kx][ky] = quantized_weight;
 				kx++;
 				if (kx==KERNEL_DIM){
diff --git a/tb/conv_dws_top.cpp b/tb/conv_dws_top.cpp
index b56765e..a362b8d 100644
--- a/tb/conv_dws_top.cpp
+++ b/tb/conv_dws_top.cpp
@@ -58,9 +58,11 @@ void Testbench_conv_dws(stream<ap_uint<FM_Channels1*INPUT_PRECISION> > & in, str
 	hls::stream<ap_uint<FM_Channels1*ap_uint<INPUT_PRECISION>::width> > resized_stream("resized_stream");
 	hls::stream<ap_uint<PE1*ap_uint<INPUT_PRECISION>::width> > resized_stream_pe("resized_stream_pe");
 	hls::stream<ap_uint<PE1*ap_uint<INPUT_PRECISION>::width> > swg_out("swg_out");
+	hls::stream<ap_uint<PE1*SIMD1*ap_uint<INPUT_PRECISION>::width> > resized_stream_simd("resized_stream_simd");
 	SameResize_Batch<IFMDim1, KERNEL_DIM, STRIDE, FM_Channels1, ap_uint<INPUT_PRECISION> >(in, resized_stream, numReps);
 	StreamingDataWidthConverter_Batch<FM_Channels1*INPUT_PRECISION, PE1*INPUT_PRECISION, (IFMDim1+2)*(IFMDim1+2)>(resized_stream, resized_stream_pe, numReps);
 	ConvolutionInputGenerator_dws<KERNEL_DIM, FM_Channels1, ap_uint<INPUT_PRECISION>::width, IFMDim1+2, OFMDim1, PE1,1>(resized_stream_pe, swg_out, numReps, ap_resource_dflt());
-	Vector_Vector_Activate_Batch<FM_Channels1, KERNEL_DIM*KERNEL_DIM, SIMD1, PE1, MMV1, Slice<ap_uint<INPUT_PRECISION> >, Slice<ap_int<16> >, Identity>(swg_out, out, PARAM::weights, PassThroughActivation<ap_int<16>>(), numReps*OFMDim1*OFMDim1, ap_resource_dsp());
+	StreamingDataWidthConverter_Batch<PE1*INPUT_PRECISION, PE1*SIMD1*INPUT_PRECISION, SIMD1*OFMDim1*OFMDim1>(swg_out, resized_stream_simd, numReps);
+	Vector_Vector_Activate_Batch<FM_Channels1, KERNEL_DIM*KERNEL_DIM, SIMD1, PE1, MMV1, Slice<ap_uint<INPUT_PRECISION> >, Slice<ap_int<16> >, Identity>(resized_stream_simd, out, PARAM::weights, PassThroughActivation<ap_int<16>>(), numReps*OFMDim1*OFMDim1, ap_resource_dsp());
 
 }
diff --git a/tb/data/config-conv-dws.h b/tb/data/config-conv-dws.h
index e063afd..da882f7 100644
--- a/tb/data/config-conv-dws.h
+++ b/tb/data/config-conv-dws.h
@@ -1,5 +1,5 @@
 #define KERNEL_DIM 3 
-#define SIMD1 1
+#define SIMD1 9
 #define PE1 8 
 #define MMV1 1 
 #define WIDTH 4 
diff --git a/tb/data/memdata-conv-dws.h b/tb/data/memdata-conv-dws.h
index d266aa1..d8bc97c 100644
--- a/tb/data/memdata-conv-dws.h
+++ b/tb/data/memdata-conv-dws.h
@@ -1,88 +1,16 @@
 #ifndef PARAMS_HPP
 #define PARAMS_HPP
 namespace PARAM{ 
-static FixedPointWeights<1,ap_int<4>,8,9> weights= {
+static FixedPointWeights<9,ap_int<4>,8,1> weights= {
 {
-{ 
-"0x7",
-"0x1",
-"0x0",
-"0x4",
-"0x7",
-"0x7",
-"0x5",
-"0x2",
-"0x8"} 
-,{ 
-"0x2",
-"0x6",
-"0x0",
-"0x7",
-"0x0",
-"0x7",
-"0x1",
-"0x3",
-"0x6"} 
-,{ 
-"0x2",
-"0x0",
-"0x5",
-"0x5",
-"0x2",
-"0x5",
-"0x5",
-"0x5",
-"0x5"} 
-,{ 
-"0x1",
-"0x5",
-"0x6",
-"0x3",
-"0x1",
-"0x7",
-"0x7",
-"0x1",
-"0x5"} 
-,{ 
-"0x8",
-"0x3",
-"0x5",
-"0x6",
-"0x2",
-"0x3",
-"0x6",
-"0x6",
-"0x7"} 
-,{ 
-"0x6",
-"0x4",
-"0x3",
-"0x5",
-"0x2",
-"0x5",
-"0x5",
-"0x2",
-"0x0"} 
-,{ 
-"0x6",
-"0x2",
-"0x4",
-"0x4",
-"0x5",
-"0x4",
-"0x6",
-"0x8",
-"0x4"} 
-,{ 
-"0x2",
-"0x5",
-"0x4",
-"0x1",
-"0x7",
-"0x5",
-"0x1",
-"0x5",
-"0x8"} 
+{"0x710477528"},
+{"0x260707136"},
+{"0x205525555"},
+{"0x156317715"},
+{"0x835623667"},
+{"0x643525520"},
+{"0x624454684"},
+{"0x254175158"} 
 }
 };
  } 

From 8968711cc79d79606d0f10476a6796ba28a9b3e0 Mon Sep 17 00:00:00 2001
From: Felix Jentzsch <felix.jentzsch@upb.de>
Date: Wed, 18 Jan 2023 15:05:50 +0100
Subject: [PATCH 08/43] [VVAU] SIMD support for streamed weights

---
 vvau.hpp | 19 ++++++++++++-------
 1 file changed, 12 insertions(+), 7 deletions(-)

diff --git a/vvau.hpp b/vvau.hpp
index ead49f0..f29d359 100644
--- a/vvau.hpp
+++ b/vvau.hpp
@@ -172,7 +172,7 @@ void Vector_Vector_Activate_Batch(hls::stream<TI> &in,
  * 
  * \tparam Channels   Number of channels
  * \tparam Kernel_2   Kernel * Kernel dimension (Kernel ^ 2 if square)
- * \tparam SIMD       Number of input columns computed in parallel, must be set to 1
+ * \tparam SIMD       Number of input columns computed in parallel
  * \tparam PE         Number of output rows computed in parallel
  * \tparam MMV        Number of output pixels computed in parallel
  * \tparam TSrcI      DataType of the input activation (as used in the MAC)
@@ -204,7 +204,6 @@ void Vector_Vector_Activate_Stream_Batch(
 	int const  reps,
 	R const &r
 ) {
-	static_assert(SIMD == 1, "SIMD parallelism not yet supported.");
 
 	// how many different rows each neuron will compute
 	// alternatively: number of vertical matrix chunks
@@ -212,8 +211,7 @@ void Vector_Vector_Activate_Stream_Batch(
 
 	// how many synapse groups each row is split into
 	// alternatively: number of horizontal matrix chunks
-	// always equal to # kernel pixels since no SIMD
-	constexpr unsigned  SF = Kernel_2;
+	constexpr unsigned  SF = Kernel_2 / SIMD;
 	decltype(activation.init(0,0))  accu[MMV][PE];
 #pragma HLS ARRAY_PARTITION variable=accu complete dim=0
 
@@ -223,7 +221,7 @@ void Vector_Vector_Activate_Stream_Batch(
 	unsigned  tile = 0; // invariant: tile = nf*SF + sf
 	// everything merged into a common iteration space (one "big" loop instead
 	// of smaller nested loops) to get the pipelinening the way we want
-	constexpr unsigned  TOTAL_FOLD = NF * SF ;//* Channels/SIMD;
+	constexpr unsigned  TOTAL_FOLD = NF * SF ;
 	for(unsigned  i = 0; i < reps * TOTAL_FOLD; i++) {
 #pragma HLS pipeline style=flp II=1
 		TI  inElem;
@@ -250,9 +248,16 @@ void Vector_Vector_Activate_Stream_Batch(
 		for(unsigned  pe = 0; pe < PE; pe++) {
 #pragma HLS UNROLL
 			auto const  wgt = TWeightI()(w[pe]);
+
+			ap_uint<SIMD*TSrcI::width> act_simd= 0;
+			for(unsigned int simd = 0; simd < SIMD; simd++) {
+				act_simd(TSrcI::width*(simd+1)-1, TSrcI::width*simd) = (Slice<ap_uint<PE*TSrcI::width>>()(inElem)(SIMD - simd - 1,0))
+																	   (TSrcI::width*(pe+1)-1, TSrcI::width*pe);
+			}
+
 			for(unsigned mmv = 0; mmv < MMV; mmv++) {
-				auto const  act = TSrcI()(inElem, mmv);
-				accu[mmv][pe] += mul(wgt[0], act(pe,mmv), r);
+				auto const  act = TSrcI()(act_simd, mmv);
+        		accu[mmv][pe] = mac<SIMD>(accu[mmv][pe], wgt, act, r, mmv);
 			}
 		}
 

From 3cdbe83cb6d49dc3e93d8cfd88497a13af36fed1 Mon Sep 17 00:00:00 2001
From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com>
Date: Thu, 2 Feb 2023 11:02:13 +0000
Subject: [PATCH 09/43] Bump future from 0.17.1 to 0.18.3 in /docs

Bumps [future](https://github.com/PythonCharmers/python-future) from 0.17.1 to 0.18.3.
- [Release notes](https://github.com/PythonCharmers/python-future/releases)
- [Changelog](https://github.com/PythonCharmers/python-future/blob/master/docs/changelog.rst)
- [Commits](https://github.com/PythonCharmers/python-future/compare/v0.17.1...v0.18.3)

---
updated-dependencies:
- dependency-name: future
  dependency-type: direct:production
...

Signed-off-by: dependabot[bot] <support@github.com>
---
 docs/requirements.txt | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/docs/requirements.txt b/docs/requirements.txt
index 53d7a92..829edd8 100644
--- a/docs/requirements.txt
+++ b/docs/requirements.txt
@@ -5,7 +5,7 @@ certifi==2022.12.7
 chardet==3.0.4
 commonmark==0.9.0
 docutils==0.14
-future==0.17.1
+future==0.18.3
 idna==2.8
 imagesize==1.1.0
 Jinja2==2.11.3

From eb4a00425d0acc0e32dcf836554d9a9d72db47d9 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Thomas=20B=2E=20Preu=C3=9Fer?= <thomas.preusser@xilinx.com>
Date: Fri, 24 Feb 2023 14:49:10 +0000
Subject: [PATCH 10/43] Use constexpr for compile-time parameters.

---
 vvau.hpp | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/vvau.hpp b/vvau.hpp
index f29d359..5173c82 100644
--- a/vvau.hpp
+++ b/vvau.hpp
@@ -92,11 +92,11 @@ void Vector_Vector_Activate_Batch(hls::stream<TI> &in,
 
   // how many different rows each neuron will compute
   // alternatively: number of vertical matrix chunks
-  unsigned const  NF = Channels / PE;
+  constexpr unsigned  NF = Channels / PE;
 
   // how many synapse groups each row is split into
   // alternatively: number of horizontal matrix chunks
-  unsigned const  SF = Kernel_2 / SIMD;
+  constexpr unsigned  SF = Kernel_2 / SIMD;
   decltype(activation.init(0,0))  accu[MMV][PE];
 #pragma HLS ARRAY_PARTITION variable=accu complete dim=0
 
@@ -105,7 +105,7 @@ void Vector_Vector_Activate_Batch(hls::stream<TI> &in,
   unsigned  tile = 0; // invariant: tile = nf*SF + sf
   // everything merged into a common iteration space (one "big" loop instead
   // of smaller nested loops) to get the pipelinening the way we want
-  unsigned const TOTAL_FOLD = NF * SF ;
+  constexpr unsigned  TOTAL_FOLD = NF * SF ;
   for(unsigned  i = 0; i < reps * TOTAL_FOLD; i++) {
 #pragma HLS pipeline style=flp II=1
     TI  inElem;

From 1fe77e43f5d5f68cac075bfbb959d5d668af049b Mon Sep 17 00:00:00 2001
From: icolbert <Ian.Colbert@amd.com>
Date: Fri, 17 Mar 2023 08:31:24 -0700
Subject: [PATCH 11/43] Adding FMPadding_Pixel_Nonsquare to streamtools

---
 streamtools.h | 49 +++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 49 insertions(+)

diff --git a/streamtools.h b/streamtools.h
index 64cf7ad..ff9c1a9 100644
--- a/streamtools.h
+++ b/streamtools.h
@@ -417,6 +417,55 @@ void FMPadding_Batch(
 	}
 }
 
+/**
+ * \brief Feature map pixel padding - Pads each pixel in the input feature
+ *        map with zeros. Used as a pre-processing step for the transposed
+ *        convolution operation. Expects data in NHWC format, where N=1.
+ *
+ * \tparam OutputDim_x Padded width of the output feature map
+ * \tparam OutputDim_y Padded height of the output feature map
+ * \tparam Stride_x    Stride for each pixel along the width dimension
+ * \tparam Stride_y    Stride for each pixel along the height dimension
+ * \tparam NumChannels Number of channels of the input feature map;
+ *                     must be a multiple of SIMD (enforced by static_assert)
+ * \tparam SIMD        Input parallelism: In_t elements per stream word
+ * \tparam In_t        Input datatype
+ *
+ * @param src          Input stream
+ * @param dst          Output stream
+ */
+template<
+	unsigned OutputDim_x,
+	unsigned OutputDim_y,
+	unsigned Stride_x,
+	unsigned Stride_y,
+	unsigned NumChannels,
+	unsigned SIMD,
+	typename In_t
+>
+void FMPadding_Pixel_Nonsquare(
+	hls::stream<ap_uint<SIMD*In_t::width>> &src,
+	hls::stream<ap_uint<SIMD*In_t::width>> &dst
+) {
+	static_assert(NumChannels % SIMD == 0, "SIMD must divide channel count.");
+	constexpr unsigned  Folding = NumChannels/SIMD;	// stream words per pixel
+
+	int unsigned  ytrig = 0;	// row phase: zero on rows that carry input data
+	for(int unsigned  y = 0; y < OutputDim_y; y++) {
+		int unsigned  xtrig = 0;	// column phase, restarted at the beginning of every row
+		for(int unsigned  x = 0; x < OutputDim_x; x++) {
+			for(int unsigned  sf = 0; sf < Folding; sf++) {
+#pragma HLS pipeline II=1 style=flp
+				ap_uint<SIMD*In_t::width>  value = 0;	// default output word is zero padding
+				if((ytrig == 0) && (xtrig == 0))  value = src.read();	// input is consumed only where both phases are zero
+				dst.write(value);	// exactly one output word per iteration
+			}
+			if(++xtrig == Stride_x)  xtrig = 0;	// wrap the column phase every Stride_x pixels
+		}
+		if(++ytrig == Stride_y)  ytrig = 0;	// wrap the row phase every Stride_y rows
+	}
+}
+
 /**
  * \brief   Stream Data Width Converter - Converts the width of the input stream in the output stream
  *

From 574e5680dc739e4eca5d13626a4457b8fa781e42 Mon Sep 17 00:00:00 2001
From: icolbert <Ian.Colbert@amd.com>
Date: Fri, 17 Mar 2023 08:31:41 -0700
Subject: [PATCH 12/43] Create fmpp_config.h

---
 tb/data/fmpp_config.h | 10 ++++++++++
 1 file changed, 10 insertions(+)
 create mode 100644 tb/data/fmpp_config.h

diff --git a/tb/data/fmpp_config.h b/tb/data/fmpp_config.h
new file mode 100644
index 0000000..340b9bc
--- /dev/null
+++ b/tb/data/fmpp_config.h
@@ -0,0 +1,10 @@
+constexpr unsigned  SIMD1 = 1;	// parallelism factor; the testbench streams are SIMD1*INPUT_WIDTH bits wide
+constexpr unsigned  INPUT_WIDTH = 8;	// presumably the bit width of one element - confirm against the top-level instantiation
+constexpr unsigned  INPUT_DIM_X = 30;	// input feature map extent along x
+constexpr unsigned  INPUT_DIM_Y = 40;	// input feature map extent along y
+constexpr unsigned  CHANNELS = 3;	// channels per pixel
+constexpr unsigned  XSTRIDE = 5;	// every XSTRIDE-th output column carries input data
+constexpr unsigned  YSTRIDE = 3;	// every YSTRIDE-th output row carries input data
+
+constexpr unsigned  OUTPUT_DIM_X = INPUT_DIM_X + (INPUT_DIM_X - 1) * (XSTRIDE - 1);	// input columns plus XSTRIDE-1 padding columns between neighbours
+constexpr unsigned  OUTPUT_DIM_Y = INPUT_DIM_Y + (INPUT_DIM_Y - 1) * (YSTRIDE - 1);	// input rows plus YSTRIDE-1 padding rows between neighbours
\ No newline at end of file

From e2413b9db6db810024bd31e41b13ba963f08ae8e Mon Sep 17 00:00:00 2001
From: icolbert <Ian.Colbert@amd.com>
Date: Fri, 17 Mar 2023 08:31:45 -0700
Subject: [PATCH 13/43] Create fm_pixel_padding_tb.cpp

---
 tb/fm_pixel_padding_tb.cpp | 105 +++++++++++++++++++++++++++++++++++++
 1 file changed, 105 insertions(+)
 create mode 100644 tb/fm_pixel_padding_tb.cpp

diff --git a/tb/fm_pixel_padding_tb.cpp b/tb/fm_pixel_padding_tb.cpp
new file mode 100644
index 0000000..9ef5cdb
--- /dev/null
+++ b/tb/fm_pixel_padding_tb.cpp
@@ -0,0 +1,105 @@
+/******************************************************************************
+ *  Copyright (c) 2023, Advanced Micro Devices, Inc.
+ *  All rights reserved.
+ *
+ *  Redistribution and use in source and binary forms, with or without
+ *  modification, are permitted provided that the following conditions are met:
+ *
+ *  1.  Redistributions of source code must retain the above copyright notice,
+ *     this list of conditions and the following disclaimer.
+ *
+ *  2.  Redistributions in binary form must reproduce the above copyright
+ *      notice, this list of conditions and the following disclaimer in the
+ *      documentation and/or other materials provided with the distribution.
+ *
+ *  3.  Neither the name of the copyright holder nor the names of its
+ *      contributors may be used to endorse or promote products derived from
+ *      this software without specific prior written permission.
+ *
+ *  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ *  AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO,
+ *  THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+ *  PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR
+ *  CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ *  EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ *  PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
+ *  OR BUSINESS INTERRUPTION). HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
+ *  WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR
+ *  OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF
+ *  ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ ******************************************************************************/
+#include <ap_int.h>
+#include <hls_stream.h>
+
+#include "bnn-library.h"
+#include "data/fmpp_config.h"
+
+#include <iostream>
+#include <random>
+
+
+void test_fm_pixel_padding(	// top-level function under test; only declared in this file
+	hls::stream<ap_uint<SIMD1*INPUT_WIDTH>> &src,	// stimulus stream filled by main()
+	hls::stream<ap_uint<SIMD1*INPUT_WIDTH>> &dst	// result stream checked by main()
+);
+
+int main() {	// Testbench: random stimulus -> function under test -> compare against a zero-padded reference
+	std::cout << "Starting testbench for fm_pixel_padding" << std::endl;
+
+	using T = ap_uint<SIMD1*INPUT_WIDTH>;	// one stream word
+	hls::stream<T> input_stream("input_stream");
+	hls::stream<T> output_stream("output_stream");
+	T  expected[OUTPUT_DIM_Y][OUTPUT_DIM_X][CHANNELS];	// reference output, indexed [row][column][channel]
+
+	{ // Feed random input sequence and record the expected padded output
+		std::random_device rd;
+		std::uniform_int_distribution<int> dist(0, (1<<(SIMD1*INPUT_WIDTH))-1);	// any value of one stream word; assumes SIMD1*INPUT_WIDTH < 31 so the int shift cannot overflow
+		unsigned  input_counter = 0;	// number of words written to input_stream
+
+		for(unsigned  y = 0; y < OUTPUT_DIM_Y; y++) {
+			for(unsigned  x = 0; x < OUTPUT_DIM_X; x++) {
+				for(unsigned  c = 0; c < CHANNELS; c++) {	// NOTE(review): one word per channel - presumably matches the DUT word count only for SIMD1 == 1; confirm
+					T  val = 0;	// positions off the stride grid are expected to be zero
+					if(((y % YSTRIDE) == 0) && ((x % XSTRIDE) == 0)) {	// only positions on the stride grid carry input data
+						val = dist(rd);
+						input_stream.write(val);
+						input_counter++;
+					}
+					expected[y][x][c] = val;
+				}
+			}
+		}
+		if(input_counter != (INPUT_DIM_X * INPUT_DIM_Y * CHANNELS)) {	// sanity check on the amount of generated stimulus
+			std::cout << "Input stream not fully populated." << std::endl;
+			return 1;
+		}
+	}
+	std::cout << "Finished writing to input stream" << std::endl;
+
+	// Run top-level function on the complete stimulus
+	test_fm_pixel_padding(input_stream, output_stream);
+	std::cout << "Finished writing to output stream" << std::endl;
+
+	// Verify correctness: outputs must arrive in [y][x][c] order and match exactly
+	for(unsigned  y = 0; y < OUTPUT_DIM_Y; y++) {
+		for(unsigned  x = 0; x < OUTPUT_DIM_X; x++) {
+			for(unsigned  c = 0; c < CHANNELS; c++) {
+				if(output_stream.empty()) {	// fewer words produced than expected
+					std::cerr << "Missing outputs." << std::endl;
+					return  1;
+				}
+
+				T const  val = output_stream.read();
+				if(expected[y][x][c] != val) {	// the first mismatch aborts the test
+					std::cerr << "Output mismatch." << std::endl;
+					return  1;
+				}
+			}
+		}
+	}
+	if(!output_stream.empty()) {	// surplus words after the last expected one are an error as well
+		std::cerr << "Output stream not empty." << std::endl;
+		return 1;
+	}
+}	// reaching the end of main returns 0, i.e. the test passed

From 44724fe7613e5fb421ffc49147e95abe924d6026 Mon Sep 17 00:00:00 2001
From: icolbert <Ian.Colbert@amd.com>
Date: Fri, 17 Mar 2023 08:31:48 -0700
Subject: [PATCH 14/43] Create fm_pixel_padding_top.cpp

---
 tb/fm_pixel_padding_top.cpp | 55 +++++++++++++++++++++++++++++++++++++
 1 file changed, 55 insertions(+)
 create mode 100644 tb/fm_pixel_padding_top.cpp

diff --git a/tb/fm_pixel_padding_top.cpp b/tb/fm_pixel_padding_top.cpp
new file mode 100644
index 0000000..7d5792b
--- /dev/null
+++ b/tb/fm_pixel_padding_top.cpp
@@ -0,0 +1,55 @@
+/******************************************************************************
+ *  Copyright (c) 2023, Advanced Micro Devices, Inc.
+ *  All rights reserved.
+ *
+ *  Redistribution and use in source and binary forms, with or without
+ *  modification, are permitted provided that the following conditions are met:
+ *
+ *  1.  Redistributions of source code must retain the above copyright notice,
+ *     this list of conditions and the following disclaimer.
+ *
+ *  2.  Redistributions in binary form must reproduce the above copyright
+ *      notice, this list of conditions and the following disclaimer in the
+ *      documentation and/or other materials provided with the distribution.
+ *
+ *  3.  Neither the name of the copyright holder nor the names of its
+ *      contributors may be used to endorse or promote products derived from
+ *      this software without specific prior written permission.
+ *
+ *  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ *  AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO,
+ *  THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+ *  PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR
+ *  CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ *  EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ *  PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
+ *  OR BUSINESS INTERRUPTION). HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
+ *  WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR
+ *  OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF
+ *  ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ ******************************************************************************/
+#include <hls_stream.h>
+using namespace hls;
+#include "ap_int.h"
+#include "bnn-library.h"
+#include "data/fmpp_config.h"
+
+void test_fm_pixel_padding(
+	stream<ap_uint<SIMD1*INPUT_WIDTH>> &src,
+	stream<ap_uint<SIMD1*INPUT_WIDTH>> &dst
+) {
+#pragma HLS interface AXIS port=src
+#pragma HLS interface AXIS port=dst
+
+#pragma HLS interface ap_ctrl_none port=return
+	FMPadding_Pixel_Nonsquare<
+		OUTPUT_DIM_X,
+		OUTPUT_DIM_Y,
+		XSTRIDE,
+		YSTRIDE,
+		CHANNELS,
+		SIMD1,
+		ap_uint<INPUT_WIDTH>
+	>(src, dst);
+}

From 3464a1b78d95b896a568da50098cdf27ab665fb3 Mon Sep 17 00:00:00 2001
From: icolbert <Ian.Colbert@amd.com>
Date: Fri, 17 Mar 2023 08:31:54 -0700
Subject: [PATCH 15/43] Create test_fm_pixel_padding.tcl

---
 tb/test_fm_pixel_padding.tcl | 42 ++++++++++++++++++++++++++++++++++++
 1 file changed, 42 insertions(+)
 create mode 100644 tb/test_fm_pixel_padding.tcl

diff --git a/tb/test_fm_pixel_padding.tcl b/tb/test_fm_pixel_padding.tcl
new file mode 100644
index 0000000..23af38d
--- /dev/null
+++ b/tb/test_fm_pixel_padding.tcl
@@ -0,0 +1,42 @@
+##############################################################################
+ #  Copyright (c) 2023, Advanced Micro Devices, Inc.
+ #  All rights reserved.
+ #
+ #  Redistribution and use in source and binary forms, with or without
+ #  modification, are permitted provided that the following conditions are met:
+ #
+ #  1.  Redistributions of source code must retain the above copyright notice,
+ #     this list of conditions and the following disclaimer.
+ #
+ #  2.  Redistributions in binary form must reproduce the above copyright
+ #      notice, this list of conditions and the following disclaimer in the
+ #      documentation and/or other materials provided with the distribution.
+ #
+ #  3.  Neither the name of the copyright holder nor the names of its
+ #      contributors may be used to endorse or promote products derived from
+ #      this software without specific prior written permission.
+ #
+ #  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ #  AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO,
+ #  THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+ #  PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR
+ #  CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ #  EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ #  PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
+ #  OR BUSINESS INTERRUPTION). HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
+ #  WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR
+ #  OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF
+ #  ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ #
+###############################################################################
+open_project hls-syn-fmpp
+add_files fm_pixel_padding_top.cpp -cflags "-std=c++14 -I$::env(FINN_HLS_ROOT) -I$::env(FINN_HLS_ROOT)/tb" 
+add_files -tb fm_pixel_padding_tb.cpp -cflags "-std=c++14 -I$::env(FINN_HLS_ROOT) -I$::env(FINN_HLS_ROOT)/tb" 
+set_top test_fm_pixel_padding
+open_solution "sol1"
+set_part {xck26-sfvc784-2LVI-i}
+create_clock -period 5 -name default
+csim_design
+csynth_design
+# cosim_design  ;# disabled: this script only runs csim_design and csynth_design
+exit

From 8e986eacb642da407bd5b4ec57f23b8f69bb919b Mon Sep 17 00:00:00 2001
From: icolbert <Ian.Colbert@amd.com>
Date: Sat, 1 Apr 2023 17:12:03 -0700
Subject: [PATCH 16/43] Update fm_pixel_padding_tb.cpp

---
 tb/fm_pixel_padding_tb.cpp | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/tb/fm_pixel_padding_tb.cpp b/tb/fm_pixel_padding_tb.cpp
index 9ef5cdb..c166d92 100644
--- a/tb/fm_pixel_padding_tb.cpp
+++ b/tb/fm_pixel_padding_tb.cpp
@@ -32,8 +32,8 @@
 #include <ap_int.h>
 #include <hls_stream.h>
 
-#include "bnn-library.h"
-#include "data/fmpp_config.h"
+#include "../bnn-library.h"
+#include "data/config_fmpp.h"
 
 #include <iostream>
 #include <random>
@@ -102,4 +102,5 @@ int main() {
 		std::cerr << "Output stream not empty." << std::endl;
 		return 1;
 	}
+	std::cout << "Successfully passed csim testbench." << std::endl;
 }

From 843aeb5aa19de3e16eb98f11d46f966eee9343ea Mon Sep 17 00:00:00 2001
From: icolbert <Ian.Colbert@amd.com>
Date: Sat, 1 Apr 2023 17:12:21 -0700
Subject: [PATCH 17/43] Updating fm pixel padding top-level function

---
 tb/data/config_fmpp.h       | 41 +++++++++++++++++++++++++++++++++++++
 tb/data/fmpp_config.h       | 10 ---------
 tb/fm_pixel_padding_top.cpp |  9 +++-----
 3 files changed, 44 insertions(+), 16 deletions(-)
 create mode 100644 tb/data/config_fmpp.h
 delete mode 100644 tb/data/fmpp_config.h

diff --git a/tb/data/config_fmpp.h b/tb/data/config_fmpp.h
new file mode 100644
index 0000000..7e19e75
--- /dev/null
+++ b/tb/data/config_fmpp.h
@@ -0,0 +1,41 @@
+/******************************************************************************
+ *  Copyright (c) 2023, Advanced Micro Devices, Inc.
+ *  All rights reserved.
+ *
+ *  Redistribution and use in source and binary forms, with or without
+ *  modification, are permitted provided that the following conditions are met:
+ *
+ *  1.  Redistributions of source code must retain the above copyright notice,
+ *     this list of conditions and the following disclaimer.
+ *
+ *  2.  Redistributions in binary form must reproduce the above copyright
+ *      notice, this list of conditions and the following disclaimer in the
+ *      documentation and/or other materials provided with the distribution.
+ *
+ *  3.  Neither the name of the copyright holder nor the names of its
+ *      contributors may be used to endorse or promote products derived from
+ *      this software without specific prior written permission.
+ *
+ *  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ *  AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO,
+ *  THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+ *  PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR
+ *  CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ *  EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ *  PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
+ *  OR BUSINESS INTERRUPTION). HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
+ *  WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR
+ *  OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF
+ *  ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ ******************************************************************************/
+
+constexpr unsigned  SIMD1 = 1; // multiplies INPUT_WIDTH to size the stream words: ap_uint<SIMD1*INPUT_WIDTH>
+constexpr unsigned  INPUT_WIDTH = 8; // bits per element (element type is ap_uint<INPUT_WIDTH>)
+constexpr unsigned  INPUT_DIM_X = 30; // input feature-map size along x
+constexpr unsigned  INPUT_DIM_Y = 40; // input feature-map size along y
+constexpr unsigned  CHANNELS = 3; // values per pixel
+constexpr unsigned  XSTRIDE = 5; // testbench expects input data only at output columns with x % XSTRIDE == 0, zero elsewhere
+constexpr unsigned  YSTRIDE = 3; // testbench expects input data only at output rows with y % YSTRIDE == 0, zero elsewhere
+
+constexpr unsigned  OUTPUT_DIM_X = INPUT_DIM_X + (INPUT_DIM_X - 1) * (XSTRIDE - 1); // equals (INPUT_DIM_X - 1) * XSTRIDE + 1
+constexpr unsigned  OUTPUT_DIM_Y = INPUT_DIM_Y + (INPUT_DIM_Y - 1) * (YSTRIDE - 1); // equals (INPUT_DIM_Y - 1) * YSTRIDE + 1
\ No newline at end of file
diff --git a/tb/data/fmpp_config.h b/tb/data/fmpp_config.h
deleted file mode 100644
index 340b9bc..0000000
--- a/tb/data/fmpp_config.h
+++ /dev/null
@@ -1,10 +0,0 @@
-constexpr unsigned  SIMD1 = 1;
-constexpr unsigned  INPUT_WIDTH = 8;
-constexpr unsigned  INPUT_DIM_X = 30;
-constexpr unsigned  INPUT_DIM_Y = 40;
-constexpr unsigned  CHANNELS = 3;
-constexpr unsigned  XSTRIDE = 5;
-constexpr unsigned  YSTRIDE = 3;
-
-constexpr unsigned  OUTPUT_DIM_X = INPUT_DIM_X + (INPUT_DIM_X - 1) * (XSTRIDE - 1);
-constexpr unsigned  OUTPUT_DIM_Y = INPUT_DIM_Y + (INPUT_DIM_Y - 1) * (YSTRIDE - 1);
\ No newline at end of file
diff --git a/tb/fm_pixel_padding_top.cpp b/tb/fm_pixel_padding_top.cpp
index 7d5792b..863ee6e 100644
--- a/tb/fm_pixel_padding_top.cpp
+++ b/tb/fm_pixel_padding_top.cpp
@@ -32,17 +32,14 @@
 #include <hls_stream.h>
 using namespace hls;
 #include "ap_int.h"
-#include "bnn-library.h"
-#include "data/fmpp_config.h"
+#include "../bnn-library.h"
+#include "data/config_fmpp.h"
 
 void test_fm_pixel_padding(
 	stream<ap_uint<SIMD1*INPUT_WIDTH>> &src,
 	stream<ap_uint<SIMD1*INPUT_WIDTH>> &dst
 ) {
-#pragma HLS interface AXIS port=src
-#pragma HLS interface AXIS port=dst
-
-#pragma HLS interface ap_ctrl_none port=return
+#pragma HLS DATAFLOW
 	FMPadding_Pixel_Nonsquare<
 		OUTPUT_DIM_X,
 		OUTPUT_DIM_Y,

From bc897f1a92f56a7bae6e9c053d9ee86c54d22d77 Mon Sep 17 00:00:00 2001
From: icolbert <Ian.Colbert@amd.com>
Date: Sat, 1 Apr 2023 21:14:56 -0700
Subject: [PATCH 18/43] Adding deconv2d weight generator

---
 tb/data/gen_weights_deconv2d.py | 104 ++++++++++++++++++++++++++++++++
 1 file changed, 104 insertions(+)
 create mode 100644 tb/data/gen_weights_deconv2d.py

diff --git a/tb/data/gen_weights_deconv2d.py b/tb/data/gen_weights_deconv2d.py
new file mode 100644
index 0000000..0a78433
--- /dev/null
+++ b/tb/data/gen_weights_deconv2d.py
@@ -0,0 +1,104 @@
+#   Copyright (c) 2023, Advanced Micro Devices, Inc.
+#   All rights reserved.
+# 
+#   Redistribution and use in source and binary forms, with or without 
+#   modification, are permitted provided that the following conditions are met:
+#
+#   1.  Redistributions of source code must retain the above copyright notice, 
+#       this list of conditions and the following disclaimer.
+#
+#   2.  Redistributions in binary form must reproduce the above copyright 
+#       notice, this list of conditions and the following disclaimer in the 
+#       documentation and/or other materials provided with the distribution.
+#
+#   3.  Neither the name of the copyright holder nor the names of its 
+#       contributors may be used to endorse or promote products derived from 
+#       this software without specific prior written permission.
+#
+#   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+#   AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, 
+#   THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR 
+#   PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR 
+#   CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, 
+#   EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, 
+#   PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
+#   OR BUSINESS INTERRUPTION). HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, 
+#   WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR 
+#   OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF 
+#   ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+import os
+import sys
+import random 
+import subprocess
+import numpy as np
+
+outFileWeights = open("memdata_deconv2d.h" , "wt")
+outFileConfig = open("config_deconv2d.h" , "wt")
+
+num_images = 1 # num images
+in_channels = 3 # input channels
+out_channels = 5 # output channels
+in_x = in_y = 4 # height/width of inp (assuming square)
+padding = 1 # padding (assuming square)
+stride_x = stride_y = 2 # stride (assuming square)
+kernel_x = kernel_y = 4 # kernel size (assuming square)
+out_x = stride_x * (in_x - 1) - (2 * padding) + kernel_x
+out_y = stride_y * (in_y - 1) - (2 * padding) + kernel_y
+
+assert out_x % in_x == 0, "Need even upsampling factor."
+assert out_y % in_y == 0, "Need even upsampling factor."
+
+i_precision = 4
+o_precision = 16
+simd = 2
+pe = 2
+w_precision = 4
+# mmv = 1 # todo - figure out what this is
+
+conv_stride = 1 # assuming square
+conv_padding = kernel_x - padding - 1 # assuming square
+
+tile = in_channels * kernel_x * kernel_y * out_channels // (simd * pe)
+
+assert in_y == in_x, "Testing square inputs."
+assert out_y == out_x, "Testing square outputs."
+assert kernel_x == kernel_y, "Testing square kernels."
+assert stride_x == stride_y, "Testing square strides."
+outFileConfig.write("constexpr unsigned  DeconvIFDim = %d;\n" % in_x)
+outFileConfig.write("constexpr unsigned  DeconvIFMCh = %d;\n" % in_channels)
+outFileConfig.write("constexpr unsigned  DeconvOFDim = %d;\n" % out_x)
+outFileConfig.write("constexpr unsigned  DeconvOFMCh = %d;\n" % out_channels)
+outFileConfig.write("constexpr unsigned  DeconvKernel = %d;\n" % kernel_x)
+outFileConfig.write("constexpr unsigned  DeconvStride = %d;\n" % stride_x)
+outFileConfig.write("constexpr unsigned  DeconvPadding = %d;\n" % padding)
+outFileConfig.write("constexpr unsigned  IPrecision = %d;\n" % i_precision)
+outFileConfig.write("constexpr unsigned  OPrecision = %d;\n" % o_precision)
+outFileConfig.write("constexpr unsigned  WPrecision = %d;\n" % w_precision)
+outFileConfig.close()
+
+
+outFileWeights.write("#ifndef PARAMS_HPP\n")
+outFileWeights.write("#define PARAMS_HPP\n")
+
+outFileWeights.write("namespace PARAM{ \n")
+if (w_precision == 1):
+	outFileWeights.write("static BinaryWeights<%d,%d,%d> weights= {\n{\n" %(simd, pe, tile))
+else:
+	outFileWeights.write(
+		"static FixedPointWeights<%d,ap_int<%d>,%d,%d> weights= {\n{\n" % (simd, w_precision, pe, tile)
+	)
+for p in range(pe):  # one brace-enclosed row of packed words per PE
+	outFileWeights.write("{ \n")
+	for t in range(tile):  # `tile` packed words per row
+		width = simd*w_precision  # bits in one packed word: simd weights of w_precision bits each
+		val = random.randint(0, (1<<width)-1)  # parenthesized: '-' binds tighter than '<<', so '1<<width-1' stopped at 1<<(width-1)
+		outFileWeights.write("%s" % hex(val))
+		if t!=tile-1:
+			outFileWeights.write(",\n")
+	outFileWeights.write("} \n")
+	if p!=pe-1:
+		outFileWeights.write(",")
+outFileWeights.write("}\n};\n } \n")
+outFileWeights.write("#endif \n")
+outFileWeights.close()

From 683fcc47310bb5da95f1fa14083fc934bfc7afbf Mon Sep 17 00:00:00 2001
From: icolbert <Ian.Colbert@amd.com>
Date: Sat, 1 Apr 2023 21:15:18 -0700
Subject: [PATCH 19/43] Create deconv.hpp

---
 tb/deconv.hpp | 70 +++++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 70 insertions(+)
 create mode 100644 tb/deconv.hpp

diff --git a/tb/deconv.hpp b/tb/deconv.hpp
new file mode 100644
index 0000000..9d96219
--- /dev/null
+++ b/tb/deconv.hpp
@@ -0,0 +1,70 @@
+/******************************************************************************
+ *  Copyright (c) 2023, Advanced Micro Devices, Inc.
+ *  All rights reserved.
+ *
+ *  Redistribution and use in source and binary forms, with or without
+ *  modification, are permitted provided that the following conditions are met:
+ *
+ *  1.  Redistributions of source code must retain the above copyright notice,
+ *     this list of conditions and the following disclaimer.
+ *
+ *  2.  Redistributions in binary form must reproduce the above copyright
+ *      notice, this list of conditions and the following disclaimer in the
+ *      documentation and/or other materials provided with the distribution.
+ *
+ *  3.  Neither the name of the copyright holder nor the names of its
+ *      contributors may be used to endorse or promote products derived from
+ *      this software without specific prior written permission.
+ *
+ *  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ *  AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO,
+ *  THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+ *  PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR
+ *  CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ *  EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ *  PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
+ *  OR BUSINESS INTERRUPTION). HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
+ *  WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR
+ *  OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF
+ *  ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ ******************************************************************************/
+
+template< // Software reference model of a square transposed 2-D convolution (scatter form)
+    unsigned IFMDim,   // input feature-map height and width
+    unsigned IFMCh,    // number of input channels
+    unsigned OFMDim,   // output feature-map height and width
+    unsigned OFMCh,    // number of output channels
+    unsigned Kernel,   // kernel height and width
+    unsigned Stride,   // distance between the output positions of neighbouring input pixels
+    unsigned Padding,  // offset subtracted from every output coordinate
+    typename TI,       // input element type
+    typename TO,       // output (accumulator) element type
+    typename TW        // weight element type
+>
+void deconv2d(
+    TI const image[IFMDim][IFMDim][IFMCh],            // indexed [row][column][input channel]
+    TW const weights [IFMCh][OFMCh][Kernel][Kernel],  // indexed [input channel][output channel][kernel row][kernel column]
+    TO outputs [OFMDim][OFMDim][OFMCh]                // only ever updated with +=, never cleared here: caller must zero it first
+) {
+    for (unsigned oc=0; oc < OFMCh; oc++) {
+        for (unsigned ic=0; ic < IFMCh; ic++) {
+            for (unsigned kh=0; kh < Kernel; kh++) {
+                for (unsigned kw=0; kw < Kernel; kw++) {
+                    TW  w = weights[ic][oc][kh][kw];
+                    for (unsigned ih=0; ih < IFMDim; ih++) {
+                        for (unsigned iw=0; iw < IFMDim; iw++) {
+                            TI  x = image[ih][iw][ic];
+                            int const  oh = static_cast<int>((Stride * ih) + kh) - static_cast<int>(Padding); // signed: may be negative near the border
+                            int const  ow = static_cast<int>((Stride * iw) + kw) - static_cast<int>(Padding); // signed: may be negative near the border
+                            if ((oh >= 0) && (oh < static_cast<int>(OFMDim)) && (ow >= 0) && (ow < static_cast<int>(OFMDim))) {
+                                TO  y = x * w;
+                                outputs[oh][ow][oc] += y; // contributions falling outside the output window are dropped by the check above
+                            }
+                        }
+                    }
+                }
+            }
+        }
+    }
+}
\ No newline at end of file

From 5106b46fb08162e672e68bc93b7c9038b20b03f1 Mon Sep 17 00:00:00 2001
From: icolbert <Ian.Colbert@amd.com>
Date: Sat, 1 Apr 2023 21:15:39 -0700
Subject: [PATCH 20/43] Adding top and testbench

---
 tb/deconv_tb.cpp  | 142 ++++++++++++++++++++++++++++++++++++++++++++++
 tb/deconv_top.cpp |  80 ++++++++++++++++++++++++++
 2 files changed, 222 insertions(+)
 create mode 100644 tb/deconv_tb.cpp
 create mode 100644 tb/deconv_top.cpp

diff --git a/tb/deconv_tb.cpp b/tb/deconv_tb.cpp
new file mode 100644
index 0000000..538b150
--- /dev/null
+++ b/tb/deconv_tb.cpp
@@ -0,0 +1,142 @@
+/******************************************************************************
+ *  Copyright (c) 2023, Advanced Micro Devices, Inc.
+ *  All rights reserved.
+ *
+ *  Redistribution and use in source and binary forms, with or without
+ *  modification, are permitted provided that the following conditions are met:
+ *
+ *  1.  Redistributions of source code must retain the above copyright notice,
+ *     this list of conditions and the following disclaimer.
+ *
+ *  2.  Redistributions in binary form must reproduce the above copyright
+ *      notice, this list of conditions and the following disclaimer in the
+ *      documentation and/or other materials provided with the distribution.
+ *
+ *  3.  Neither the name of the copyright holder nor the names of its
+ *      contributors may be used to endorse or promote products derived from
+ *      this software without specific prior written permission.
+ *
+ *  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ *  AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO,
+ *  THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+ *  PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR
+ *  CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ *  EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ *  PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
+ *  OR BUSINESS INTERRUPTION). HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
+ *  WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR
+ *  OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF
+ *  ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ ******************************************************************************/
+#include <ap_int.h>
+#include <hls_stream.h>
+
+#include "../bnn-library.h"
+#include "deconv.hpp"
+#include "data/config_deconv2d.h"
+
+#include <iostream>
+#include <random>
+
+
+void test_deconv2d(
+	hls::stream<ap_uint<DeconvIFMCh*IPrecision> > & src,
+	hls::stream<ap_uint<DeconvOFMCh*OPrecision> > & dst
+);
+
+int main() {
+	std::cout << "Starting testbench for deconvolution" << std::endl;
+
+	ap_uint<IPrecision>  inp_image[DeconvIFDim][DeconvIFDim][DeconvIFMCh];
+	ap_uint<OPrecision>  out_image[DeconvOFDim][DeconvOFDim][DeconvOFMCh];
+	hls::stream<ap_uint<DeconvIFMCh*IPrecision> > input_stream("input_stream");
+	hls::stream<ap_uint<DeconvOFMCh*OPrecision> > output_stream("output_stream");
+	ap_uint<WPrecision>  weights[DeconvIFMCh][DeconvOFMCh][DeconvKernel][DeconvKernel];
+	
+	{ // Feed random input sequence
+		std::random_device rd;
+		std::uniform_int_distribution<int> dist(0, (1<<(DeconvIFMCh*IPrecision))-1);
+		unsigned  input_counter = 0;
+
+		for(unsigned  y = 0; y < DeconvIFDim; y++) {
+			for(unsigned  x = 0; x < DeconvIFDim; x++) {
+				for(unsigned  c = 0; c < DeconvIFMCh; c++) {
+					ap_uint<IPrecision>  val = dist(rd);
+					input_stream.write(val);
+					inp_image[y][x][c] = val;
+					input_counter++;
+				}
+			}
+		}
+		if(input_counter != (DeconvIFDim * DeconvIFDim * DeconvIFMCh)) {
+			std::cout << "Input stream not fully populated." << std::endl;
+			return 1;
+		}
+	}
+	std::cout << "Finished writing to input stream" << std::endl;
+
+	// TODO - create weights
+	// { // Feed random weight sequence
+	// 	std::random_device rd;
+	// 	std::uniform_int_distribution<int> dist(0, (1<<(WEIGHT_PRECISION))-1);
+	// 	unsigned  input_counter = 0;
+
+	// 	for(unsigned  y = 0; y < DECONV_INPUT_DIM_Y; y++) {
+	// 		for(unsigned  x = 0; x < DECONV_INPUT_DIM_X; x++) {
+	// 			for(unsigned  c = 0; c < INPUT_CHANNELS; c++) {
+	// 				TI  val = dist(rd);
+	// 				input_stream.write(val);
+	// 				image[y][x][c] = val;
+	// 				input_counter++;
+	// 			}
+	// 		}
+	// 	}
+	// 	if(input_counter != (DECONV_INPUT_DIM_X * DECONV_INPUT_DIM_Y * INPUT_CHANNELS)) {
+	// 		std::cout << "Input stream not fully populated." << std::endl;
+	// 		return 1;
+	// 	}
+	// }
+	std::cout << "Finished writing to input stream" << std::endl;
+
+
+	// TODO - calculate expected outputs from deconvolution
+	std::cout << "Calculating expected output" << std::endl;
+	deconv2d<
+		DeconvIFDim,
+		DeconvIFMCh,
+		DeconvOFDim,
+		DeconvOFMCh,
+		DeconvKernel,
+		DeconvStride,
+		DeconvPadding,
+		ap_uint<IPrecision>,
+		ap_uint<OPrecision>,
+		ap_uint<WPrecision>
+	>(inp_image, weights, out_image);
+
+	// Run top-level function
+	test_deconv2d(input_stream, output_stream);
+	std::cout << "Finished writing to output stream" << std::endl;
+
+	// Verify correctness
+	// for(unsigned  y = 0; y < OUTPUT_DIM_Y; y++) {
+	// 	for(unsigned  x = 0; x < OUTPUT_DIM_X; x++) {
+	// 		for(unsigned  c = 0; c < CHANNELS; c++) {
+	// 			if(output_stream.empty()) {
+	// 				std::cerr << "Missing outputs." << std::endl;
+	// 				return  1;
+	// 			}
+
+	// 			T const  val = output_stream.read();
+	// 			if(expected[y][x][c] != val) {
+	// 				std::cerr << "Output mismatch." << std::endl;
+	// 				return  1;
+	// 			}
+	// 		}
+	// 	}
+	// }
+	// if(!output_stream.empty()) {
+	// 	std::cerr << "Output stream not empty." << std::endl;
+	// 	return 1;
+	// }
+}
diff --git a/tb/deconv_top.cpp b/tb/deconv_top.cpp
new file mode 100644
index 0000000..906ea4d
--- /dev/null
+++ b/tb/deconv_top.cpp
@@ -0,0 +1,80 @@
+/******************************************************************************
+ *  Copyright (c) 2023, Advanced Micro Devices, Inc.
+ *  All rights reserved.
+ *
+ *  Redistribution and use in source and binary forms, with or without
+ *  modification, are permitted provided that the following conditions are met:
+ *
+ *  1.  Redistributions of source code must retain the above copyright notice,
+ *     this list of conditions and the following disclaimer.
+ *
+ *  2.  Redistributions in binary form must reproduce the above copyright
+ *      notice, this list of conditions and the following disclaimer in the
+ *      documentation and/or other materials provided with the distribution.
+ *
+ *  3.  Neither the name of the copyright holder nor the names of its
+ *      contributors may be used to endorse or promote products derived from
+ *      this software without specific prior written permission.
+ *
+ *  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ *  AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO,
+ *  THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+ *  PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR
+ *  CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ *  EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ *  PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
+ *  OR BUSINESS INTERRUPTION). HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
+ *  WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR
+ *  OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF
+ *  ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ ******************************************************************************/
+#include <hls_stream.h>
+using namespace hls;
+#include "ap_int.h"
+
+#include "../bnn-library.h"
+#include "../weights.hpp"
+// #include "activations.hpp"
+// #include "interpret.hpp"
+// #include "mvau.hpp"
+// #include "conv.hpp"
+#include "data/memdata_deconv2d.h"
+#include "data/config_deconv2d.h"
+
+void test_deconv2d( // HLS top-level function; currently a stub, the processing chain below is commented out
+	stream<ap_uint<DeconvIFMCh*IPrecision> > &src,
+	stream<ap_uint<DeconvOFMCh*OPrecision> > &dst
+){
+#pragma HLS DATAFLOW
+
+	ap_uint<DeconvIFMCh*IPrecision>  val = 0;
+	val = src.read(); // placeholder: consumes a single input word; nothing is written to dst in this stub
+
+	// stream<ap_uint<IFM_Channels1*INPUT_PRECISION> > conv_input("input_stream");
+	// FMPadding_Pixel_Nonsquare<
+	// 	OUTPUT_DIM_X, // dimension expected by conv
+	// 	OUTPUT_DIM_Y, // dimension expected by conv
+	// 	XSTRIDE, // stride along pixel padding
+	// 	YSTRIDE, // stride along pixel padding
+	// 	CHANNELS, // num channels expected by conv (input channels)
+	// 	SIMD1, // packing along the channel dim
+	// 	ap_uint<INPUT_WIDTH> // data type of values
+	// >(src, conv_input);
+
+	// // TODO - replace weights and pass-through activation
+	// // TODO - figure out why pass-through activation is 16 bits
+	// // TODO - figure out why we are using DSP
+	// ConvLayer_Batch<
+	// 	KERNEL_DIM,
+	// 	IFM_Channels1,
+	// 	IFMDim1,
+	// 	OFM_Channels1,
+	// 	OFMDim1,
+	// 	SIMD1,
+	// 	PE1,
+	// 	Slice<ap_uint<INPUT_PRECISION> >,
+	// 	Slice<ap_uint<ACTIVATION_PRECISION> >,
+	// 	Identity
+	// >(conv_inputs, dst, PARAM::weights, PassThroughActivation<ap_uint<16>>(), 1, ap_resource_dsp());
+}

From 22e52ee3a8dc35e198de7cfc298caacf4824d83d Mon Sep 17 00:00:00 2001
From: icolbert <Ian.Colbert@amd.com>
Date: Sun, 2 Apr 2023 19:41:02 -0700
Subject: [PATCH 21/43] Update gen_weights_deconv2d.py

---
 tb/data/gen_weights_deconv2d.py | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/tb/data/gen_weights_deconv2d.py b/tb/data/gen_weights_deconv2d.py
index 0a78433..926a00f 100644
--- a/tb/data/gen_weights_deconv2d.py
+++ b/tb/data/gen_weights_deconv2d.py
@@ -38,7 +38,7 @@
 
 num_images = 1 # num images
 in_channels = 3 # input channels
-out_channels = 5 # output channels
+out_channels = 4 # output channels
 in_x = in_y = 4 # height/width of inp (assuming square)
 padding = 1 # padding (assuming square)
 stride_x = stride_y = 2 # stride (assuming square)
@@ -51,9 +51,9 @@
 
 i_precision = 4
 o_precision = 16
+w_precision = 4
 simd = 2
 pe = 2
-w_precision = 4
 # mmv = 1 # todo - figure out what this is
 
 conv_stride = 1 # assuming square
@@ -75,6 +75,8 @@
 outFileConfig.write("constexpr unsigned  IPrecision = %d;\n" % i_precision)
 outFileConfig.write("constexpr unsigned  OPrecision = %d;\n" % o_precision)
 outFileConfig.write("constexpr unsigned  WPrecision = %d;\n" % w_precision)
+outFileConfig.write("constexpr unsigned  ConvSIMD1 = %d;\n" % simd)
+outFileConfig.write("constexpr unsigned  ConvPE1 = %d;\n" % pe)
 outFileConfig.close()
 
 

From 6c5e296b4eceab710a3692bb602e2edc0a829ff9 Mon Sep 17 00:00:00 2001
From: icolbert <Ian.Colbert@amd.com>
Date: Sun, 2 Apr 2023 19:41:33 -0700
Subject: [PATCH 22/43] Loading deconv weights from memdata with transposition

---
 tb/deconv_tb.cpp | 68 +++++++++++++++++++++++++++++++-----------------
 1 file changed, 44 insertions(+), 24 deletions(-)

diff --git a/tb/deconv_tb.cpp b/tb/deconv_tb.cpp
index 538b150..189595b 100644
--- a/tb/deconv_tb.cpp
+++ b/tb/deconv_tb.cpp
@@ -34,6 +34,7 @@
 #include "../bnn-library.h"
 #include "deconv.hpp"
 #include "data/config_deconv2d.h"
+#include "data/memdata_deconv2d.h"
 
 #include <iostream>
 #include <random>
@@ -51,7 +52,6 @@ int main() {
 	ap_uint<OPrecision>  out_image[DeconvOFDim][DeconvOFDim][DeconvOFMCh];
 	hls::stream<ap_uint<DeconvIFMCh*IPrecision> > input_stream("input_stream");
 	hls::stream<ap_uint<DeconvOFMCh*OPrecision> > output_stream("output_stream");
-	ap_uint<WPrecision>  weights[DeconvIFMCh][DeconvOFMCh][DeconvKernel][DeconvKernel];
 	
 	{ // Feed random input sequence
 		std::random_device rd;
@@ -60,12 +60,15 @@ int main() {
 
 		for(unsigned  y = 0; y < DeconvIFDim; y++) {
 			for(unsigned  x = 0; x < DeconvIFDim; x++) {
+				ap_uint<DeconvIFMCh * IPrecision> input_channel = 0; // packed word holding all channels of this pixel
 				for(unsigned  c = 0; c < DeconvIFMCh; c++) {
 					ap_uint<IPrecision>  val = dist(rd);
-					input_stream.write(val);
 					inp_image[y][x][c] = val;
+					input_channel = input_channel >> IPrecision; // shift earlier channels down ...
+					input_channel(DeconvIFMCh * IPrecision - 1, (DeconvIFMCh - 1) * IPrecision) = val; // ... and insert the newest on top, so channel 0 ends in the LSBs
 					input_counter++;
 				}
+				input_stream.write(input_channel); // one packed word per pixel (was writing the running counter instead of the pixel data)
 			}
 		}
 		if(input_counter != (DeconvIFDim * DeconvIFDim * DeconvIFMCh)) {
@@ -75,29 +78,46 @@ int main() {
 	}
 	std::cout << "Finished writing to input stream" << std::endl;
 
-	// TODO - create weights
-	// { // Feed random weight sequence
-	// 	std::random_device rd;
-	// 	std::uniform_int_distribution<int> dist(0, (1<<(WEIGHT_PRECISION))-1);
-	// 	unsigned  input_counter = 0;
-
-	// 	for(unsigned  y = 0; y < DECONV_INPUT_DIM_Y; y++) {
-	// 		for(unsigned  x = 0; x < DECONV_INPUT_DIM_X; x++) {
-	// 			for(unsigned  c = 0; c < INPUT_CHANNELS; c++) {
-	// 				TI  val = dist(rd);
-	// 				input_stream.write(val);
-	// 				image[y][x][c] = val;
-	// 				input_counter++;
-	// 			}
-	// 		}
-	// 	}
-	// 	if(input_counter != (DECONV_INPUT_DIM_X * DECONV_INPUT_DIM_Y * INPUT_CHANNELS)) {
-	// 		std::cout << "Input stream not fully populated." << std::endl;
-	// 		return 1;
-	// 	}
-	// }
-	std::cout << "Finished writing to input stream" << std::endl;
+	// Create weights
+	static ap_uint<WPrecision>  weights[DeconvIFMCh][DeconvOFMCh][DeconvKernel][DeconvKernel];
+	{
+		unsigned  oc = 0; // output channel counter
+		unsigned  ic = 0; // input channel counter
+		unsigned  kx = 0; // kernel_x counter
+		unsigned  ky = 0; // kernel_y counter
+		constexpr int  xTile = (DeconvIFMCh * DeconvKernel * DeconvKernel) / ConvSIMD1;
+		constexpr int  yTile = DeconvOFMCh / ConvPE1;
+		for (unsigned  oy = 0; oy < yTile; oy++) {
+			for (unsigned ox = 0; ox < xTile; ox++) {
+				for (unsigned pe = 0; pe < ConvPE1; pe++) {
+					for (unsigned simd = 0; simd < ConvSIMD1; simd++) {
+						// need to transpose the weights since weights are for conv2d
+						unsigned  dkx = DeconvKernel - kx - 1;
+						unsigned  dky = DeconvKernel - ky - 1;
+						weights[ic][oc][dkx][dky] = PARAM::weights.weights(oy*xTile + ox)[pe][simd];
+						ic++;
+						if (ic == DeconvIFMCh){
+							ic=0;
+							kx++;
+							if (kx == DeconvKernel){
+								kx=0;
+								ky++;
+								if (ky == DeconvKernel){
+									ky=0;
+									oc++;
+									if (oc == DeconvOFDim){
+										oc=0;
+									}
+								}
+							}
+						}
+					}
+				}
 
+			}
+		}
+	}
+	std::cout << "Finished writing the weights" << std::endl;
 
 	// TODO - calculate expected outputs from deconvolution
 	std::cout << "Calculating expected output" << std::endl;

From d81f950a1158f9baaf3166b97273713627edd12d Mon Sep 17 00:00:00 2001
From: icolbert <Ian.Colbert@amd.com>
Date: Sun, 2 Apr 2023 21:28:13 -0700
Subject: [PATCH 23/43] Adding feature map padding hyperparams

---
 tb/data/gen_weights_deconv2d.py | 8 +++++++-
 1 file changed, 7 insertions(+), 1 deletion(-)

diff --git a/tb/data/gen_weights_deconv2d.py b/tb/data/gen_weights_deconv2d.py
index 926a00f..b390721 100644
--- a/tb/data/gen_weights_deconv2d.py
+++ b/tb/data/gen_weights_deconv2d.py
@@ -52,7 +52,7 @@
 i_precision = 4
 o_precision = 16
 w_precision = 4
-simd = 2
+simd = in_channels # fully unrolling in channels
 pe = 2
 # mmv = 1 # todo - figure out what this is
 
@@ -77,6 +77,12 @@
 outFileConfig.write("constexpr unsigned  WPrecision = %d;\n" % w_precision)
 outFileConfig.write("constexpr unsigned  ConvSIMD1 = %d;\n" % simd)
 outFileConfig.write("constexpr unsigned  ConvPE1 = %d;\n" % pe)
+
+fm_out_x = in_x + (in_x - 1) * (stride_x - 1)
+fm_pad_x = out_x // in_x
+outFileConfig.write("constexpr unsigned  FMPadODim = %d;\n" % fm_out_x)
+outFileConfig.write("constexpr unsigned  FMPadStride = %d;\n" % fm_pad_x)
+
 outFileConfig.close()
 
 

From 661fcd058c9633f8a3fe3fa4b983d4672a2281e2 Mon Sep 17 00:00:00 2001
From: icolbert <Ian.Colbert@amd.com>
Date: Sun, 2 Apr 2023 21:28:34 -0700
Subject: [PATCH 24/43] Adding fmpadding and convlayer batch

---
 tb/deconv_top.cpp | 59 ++++++++++++++++++++---------------------------
 1 file changed, 25 insertions(+), 34 deletions(-)

diff --git a/tb/deconv_top.cpp b/tb/deconv_top.cpp
index 906ea4d..4087c40 100644
--- a/tb/deconv_top.cpp
+++ b/tb/deconv_top.cpp
@@ -35,46 +35,37 @@ using namespace hls;
 
 #include "../bnn-library.h"
 #include "../weights.hpp"
-// #include "activations.hpp"
+#include "../activations.hpp"
 // #include "interpret.hpp"
 // #include "mvau.hpp"
-// #include "conv.hpp"
 #include "data/memdata_deconv2d.h"
 #include "data/config_deconv2d.h"
 
 void test_deconv2d(
-	stream<ap_uint<DeconvIFMCh*IPrecision> > &src,
-	stream<ap_uint<DeconvOFMCh*OPrecision> > &dst
+	stream<ap_uint<DeconvIFMCh*IPrecision> > & src,
+	stream<ap_uint<DeconvOFMCh*OPrecision> > & dst
 ){
 #pragma HLS DATAFLOW
-
-	ap_uint<DeconvIFMCh*IPrecision>  val = 0;
-	val = src.read();
-
-	// stream<ap_uint<IFM_Channels1*INPUT_PRECISION> > conv_input("input_stream");
-	// FMPadding_Pixel_Nonsquare<
-	// 	OUTPUT_DIM_X, // dimension expected by conv
-	// 	OUTPUT_DIM_Y, // dimension expected by conv
-	// 	XSTRIDE, // stride along pixel padding
-	// 	YSTRIDE, // stride along pixel padding
-	// 	CHANNELS, // num channels expected by conv (input channels)
-	// 	SIMD1, // packing along the channel dim
-	// 	ap_uint<INPUT_WIDTH> // data type of values
-	// >(src, conv_input);
-
-	// // TODO - replace weights and pass-through activation
-	// // TODO - figure out why pass-through activation is 16 bits
-	// // TODO - figure out why we are using DSP
-	// ConvLayer_Batch<
-	// 	KERNEL_DIM,
-	// 	IFM_Channels1,
-	// 	IFMDim1,
-	// 	OFM_Channels1,
-	// 	OFMDim1,
-	// 	SIMD1,
-	// 	PE1,
-	// 	Slice<ap_uint<INPUT_PRECISION> >,
-	// 	Slice<ap_uint<ACTIVATION_PRECISION> >,
-	// 	Identity
-	// >(conv_inputs, dst, PARAM::weights, PassThroughActivation<ap_uint<16>>(), 1, ap_resource_dsp());
+	stream<ap_uint<DeconvIFMCh*IPrecision> > conv_input("input_stream");
+	FMPadding_Pixel_Nonsquare<
+		FMPadODim, // dimension expected by conv
+		FMPadODim, // dimension expected by conv
+		FMPadStride, // stride along pixel padding
+		FMPadStride, // stride along pixel padding
+		DeconvIFMCh, // num channels expected by conv (input channels)
+		ConvSIMD1, // packing along the channel dim
+		ap_uint<IPrecision> // data type of values
+	>(src, conv_input);
+	ConvLayer_Batch<
+		DeconvKernel, // conv and deconv have same kernel size
+		DeconvIFMCh,
+		FMPadODim, // output of fm padding is input of conv
+		DeconvOFMCh,
+		DeconvOFDim,
+		ConvSIMD1,
+		ConvPE1,
+		Slice<ap_uint<IPrecision> >,
+		Slice<ap_uint<OPrecision> >,
+		Identity
+	>(conv_input, dst, PARAM::weights, PassThroughActivation<ap_uint<16>>(), 1, ap_resource_dsp());
 }

From 83647389c296cae2ed9c424626b5a3149198b01a Mon Sep 17 00:00:00 2001
From: icolbert <Ian.Colbert@amd.com>
Date: Sun, 2 Apr 2023 21:36:33 -0700
Subject: [PATCH 25/43] Adding checks for functional verification

---
 tb/deconv_tb.cpp  | 41 +++++++++++++++++++++--------------------
 tb/deconv_top.cpp |  6 +++---
 2 files changed, 24 insertions(+), 23 deletions(-)

diff --git a/tb/deconv_tb.cpp b/tb/deconv_tb.cpp
index 189595b..22e1c75 100644
--- a/tb/deconv_tb.cpp
+++ b/tb/deconv_tb.cpp
@@ -94,7 +94,7 @@ int main() {
 						// need to transpose the weights since weights are for conv2d
 						unsigned  dkx = DeconvKernel - kx - 1;
 						unsigned  dky = DeconvKernel - ky - 1;
-						weights[ic][oc][dkx][dky] = PARAM::weights.weights(oy*xTile + ox)[pe][simd];
+						weights[ic][oc][kx][ky] = PARAM::weights.weights(oy*xTile + ox)[pe][simd];
 						ic++;
 						if (ic == DeconvIFMCh){
 							ic=0;
@@ -139,24 +139,25 @@ int main() {
 	std::cout << "Finished writing to output stream" << std::endl;
 
 	// Verify correctness
-	// for(unsigned  y = 0; y < OUTPUT_DIM_Y; y++) {
-	// 	for(unsigned  x = 0; x < OUTPUT_DIM_X; x++) {
-	// 		for(unsigned  c = 0; c < CHANNELS; c++) {
-	// 			if(output_stream.empty()) {
-	// 				std::cerr << "Missing outputs." << std::endl;
-	// 				return  1;
-	// 			}
+	for(unsigned  y = 0; y < DeconvOFDim; y++) {
+		for(unsigned  x = 0; x < DeconvOFDim; x++) {
+			for(unsigned  c = 0; c < DeconvOFMCh; c++) {
+				if(output_stream.empty()) {
+					std::cerr << "Missing outputs." << std::endl;
+					return  1;
+				}
 
-	// 			T const  val = output_stream.read();
-	// 			if(expected[y][x][c] != val) {
-	// 				std::cerr << "Output mismatch." << std::endl;
-	// 				return  1;
-	// 			}
-	// 		}
-	// 	}
-	// }
-	// if(!output_stream.empty()) {
-	// 	std::cerr << "Output stream not empty." << std::endl;
-	// 	return 1;
-	// }
+				ap_uint<OPrecision> const  val = output_stream.read();
+				if(out_image[y][x][c] != val) {
+					std::cerr << "Output mismatch." << std::endl;
+					return  1;
+				}
+			}
+		}
+	}
+	std::cout << "Outputs successfully aligns." << std::endl;
+	if(!output_stream.empty()) {
+		std::cerr << "Output stream not empty." << std::endl;
+		return 1;
+	}
 }
diff --git a/tb/deconv_top.cpp b/tb/deconv_top.cpp
index 4087c40..cfd7675 100644
--- a/tb/deconv_top.cpp
+++ b/tb/deconv_top.cpp
@@ -36,8 +36,8 @@ using namespace hls;
 #include "../bnn-library.h"
 #include "../weights.hpp"
 #include "../activations.hpp"
-// #include "interpret.hpp"
-// #include "mvau.hpp"
+#include "../interpret.hpp"
+#include "../mvau.hpp"
 #include "data/memdata_deconv2d.h"
 #include "data/config_deconv2d.h"
 
@@ -67,5 +67,5 @@ void test_deconv2d(
 		Slice<ap_uint<IPrecision> >,
 		Slice<ap_uint<OPrecision> >,
 		Identity
-	>(conv_input, dst, PARAM::weights, PassThroughActivation<ap_uint<16>>(), 1, ap_resource_dsp());
+	>(conv_input, dst, PARAM::weights, PassThroughActivation<ap_uint<16> >(), 1, ap_resource_dsp());
 }

From 1a5be9ddcbd8cc7e364de24e3d09f1e4864341b4 Mon Sep 17 00:00:00 2001
From: icolbert <Ian.Colbert@amd.com>
Date: Sun, 2 Apr 2023 21:37:00 -0700
Subject: [PATCH 26/43] Adding generated config/memdata

---
 tb/data/config_deconv2d.h  | 14 +++++++
 tb/data/memdata_deconv2d.h | 75 ++++++++++++++++++++++++++++++++++++++
 2 files changed, 89 insertions(+)
 create mode 100644 tb/data/config_deconv2d.h
 create mode 100644 tb/data/memdata_deconv2d.h

diff --git a/tb/data/config_deconv2d.h b/tb/data/config_deconv2d.h
new file mode 100644
index 0000000..541893c
--- /dev/null
+++ b/tb/data/config_deconv2d.h
@@ -0,0 +1,14 @@
+constexpr unsigned  DeconvIFDim = 4;
+constexpr unsigned  DeconvIFMCh = 3;
+constexpr unsigned  DeconvOFDim = 8;
+constexpr unsigned  DeconvOFMCh = 4;
+constexpr unsigned  DeconvKernel = 4;
+constexpr unsigned  DeconvStride = 2;
+constexpr unsigned  DeconvPadding = 1;
+constexpr unsigned  IPrecision = 4;
+constexpr unsigned  OPrecision = 16;
+constexpr unsigned  WPrecision = 4;
+constexpr unsigned  ConvSIMD1 = 3;
+constexpr unsigned  ConvPE1 = 2;
+constexpr unsigned  FMPadODim = 7;
+constexpr unsigned  FMPadStride = 2;
diff --git a/tb/data/memdata_deconv2d.h b/tb/data/memdata_deconv2d.h
new file mode 100644
index 0000000..05b5e08
--- /dev/null
+++ b/tb/data/memdata_deconv2d.h
@@ -0,0 +1,75 @@
+#ifndef PARAMS_HPP
+#define PARAMS_HPP
+namespace PARAM{ 
+static FixedPointWeights<3,ap_int<4>,2,32> weights= {
+{
+{ 
+0xff,
+0x39c,
+0x11a,
+0x382,
+0x674,
+0x1a6,
+0x4f0,
+0x1c0,
+0x21b,
+0x753,
+0x6a1,
+0x502,
+0x78b,
+0x1da,
+0xd0,
+0x13a,
+0x97,
+0x2f3,
+0x7f5,
+0x105,
+0x17e,
+0x72f,
+0x5bc,
+0x6b,
+0x56a,
+0x24a,
+0x5c4,
+0x441,
+0x730,
+0x22e,
+0x136,
+0x6bc} 
+,{ 
+0x12b,
+0x4b2,
+0x50c,
+0x670,
+0x514,
+0x48a,
+0x286,
+0x744,
+0x4ad,
+0x3eb,
+0x1df,
+0x5ce,
+0x458,
+0x609,
+0x2c8,
+0x51d,
+0xc3,
+0x6b2,
+0x684,
+0x43a,
+0x394,
+0x56,
+0x302,
+0x591,
+0x3c7,
+0x574,
+0x55c,
+0x166,
+0x5be,
+0x63a,
+0xcf,
+0x62c} 
+}
+};
+ } 
+#endif 

From d9ffa3b980010f1f5ed9d870ada4d250335a6f8b Mon Sep 17 00:00:00 2001
From: icolbert <Ian.Colbert@amd.com>
Date: Thu, 6 Apr 2023 17:06:12 -0700
Subject: [PATCH 27/43] Adding square version of fm pixel padding

---
 streamtools.h | 31 +++++++++++++++++++++++++++++--
 1 file changed, 29 insertions(+), 2 deletions(-)

diff --git a/streamtools.h b/streamtools.h
index ff9c1a9..bb6118c 100644
--- a/streamtools.h
+++ b/streamtools.h
@@ -427,8 +427,7 @@ void FMPadding_Batch(
  * \tparam Stride_x    Stride for each pixel along the width dimension 
  * \tparam Stride_y    Stride for each pixel along the height dimension
  * \tparam NumChannels Number of channels of the input feature map
- * \tparam	NumChannels		Number of channels of the input feature map
- * \tparam	SIMD			Input parallelism 
+ * \tparam SIMD		   Input parallelism 
  * \tparam In_t		   Input datatype
  *
  * @param src          Input stream
@@ -466,6 +465,34 @@ void FMPadding_Pixel_Nonsquare(
 	}
 }
 
+/**
+ * \brief Feature map pixel padding - Pads each pixel in the input feature
+ *        map with zeros. Used as a pre-processing step for the transposed
+ * 		  convolution operation. Expects data in NHWC format, where N=1.
+ *
+ * \tparam OutputDim   Padded width of the output feature map
+ * \tparam Stride      Stride for each pixel along the width dimension
+ * \tparam NumChannels Number of channels of the input feature map
+ * \tparam SIMD		   Input parallelism 
+ * \tparam In_t		   Input datatype
+ *
+ * @param src          Input stream
+ * @param dst 		   Output stream
+ */
+template<
+	unsigned OutputDim,
+	unsigned Stride,
+	unsigned NumChannels,
+	unsigned SIMD,
+	typename In_t
+>
+void FMPadding_Pixel(
+	hls::stream<ap_uint<SIMD*In_t::width>> &src,
+	hls::stream<ap_uint<SIMD*In_t::width>> &dst
+) {
+	FMPadding_Pixel_Nonsquare<OutputDim, OutputDim, Stride, Stride, NumChannels, SIMD, In_t>(src, dst);
+}
+
 /**
  * \brief   Stream Data Width Converter - Converts the width of the input stream in the output stream
  *

From 61ab264a96506edb7241ae996b58f73c430c1238 Mon Sep 17 00:00:00 2001
From: icolbert <Ian.Colbert@amd.com>
Date: Thu, 6 Apr 2023 17:06:50 -0700
Subject: [PATCH 28/43] Updating gen_weights for deconv2d

---
 tb/data/config_deconv2d.h       |  36 ++++++---
 tb/data/gen_weights_deconv2d.py |  84 +++++++++++----------
 tb/data/memdata_deconv2d.h      | 130 ++++++++++++++++----------------
 3 files changed, 133 insertions(+), 117 deletions(-)

diff --git a/tb/data/config_deconv2d.h b/tb/data/config_deconv2d.h
index 541893c..bc5f43a 100644
--- a/tb/data/config_deconv2d.h
+++ b/tb/data/config_deconv2d.h
@@ -1,14 +1,26 @@
-constexpr unsigned  DeconvIFDim = 4;
-constexpr unsigned  DeconvIFMCh = 3;
-constexpr unsigned  DeconvOFDim = 8;
-constexpr unsigned  DeconvOFMCh = 4;
-constexpr unsigned  DeconvKernel = 4;
-constexpr unsigned  DeconvStride = 2;
-constexpr unsigned  DeconvPadding = 1;
-constexpr unsigned  IPrecision = 4;
-constexpr unsigned  OPrecision = 16;
-constexpr unsigned  WPrecision = 4;
+constexpr unsigned  IFDim1 = 4;
+constexpr unsigned  IFMCh1 = 3;
+constexpr unsigned  OFDim1 = 7;
+constexpr unsigned  OFMCh1 = 4;
+constexpr unsigned  Kernel1 = 4;
+constexpr unsigned  Stride1 = 3;
+constexpr unsigned  Padding1 = 3;
+
+constexpr unsigned  FMPadODim1 = 10;
+constexpr unsigned  FMPadStride1 = 3;
+constexpr unsigned  FMPadSIMD1 = 3;
+
+constexpr unsigned  ConvKernel1 = 4;
+constexpr unsigned  ConvIFMCh1 = 3;
+constexpr unsigned  ConvIFMDim1 = 10;
+constexpr unsigned  ConvOFMCh1 = 4;
+constexpr unsigned  ConvOFMDim1 = 7;
+constexpr unsigned  ConvPadding1 = 0;
+constexpr unsigned  ConvStride1 = 1;
 constexpr unsigned  ConvSIMD1 = 3;
 constexpr unsigned  ConvPE1 = 2;
-constexpr unsigned  FMPadODim = 7;
-constexpr unsigned  FMPadStride = 2;
+
+constexpr unsigned  IPrecision = 6;
+constexpr unsigned  OPrecision = 16;
+constexpr unsigned  WPrecision = 5;
+
diff --git a/tb/data/gen_weights_deconv2d.py b/tb/data/gen_weights_deconv2d.py
index b390721..730829e 100644
--- a/tb/data/gen_weights_deconv2d.py
+++ b/tb/data/gen_weights_deconv2d.py
@@ -27,62 +27,66 @@
 #   OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF 
 #   ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
-import os
-import sys
-import random 
-import subprocess
-import numpy as np
+import random
 
 outFileWeights = open("memdata_deconv2d.h" , "wt")
 outFileConfig = open("config_deconv2d.h" , "wt")
 
 num_images = 1 # num images
-in_channels = 3 # input channels
+in_channels = 2 # input channels
 out_channels = 4 # output channels
-in_x = in_y = 4 # height/width of inp (assuming square)
-padding = 1 # padding (assuming square)
-stride_x = stride_y = 2 # stride (assuming square)
-kernel_x = kernel_y = 4 # kernel size (assuming square)
-out_x = stride_x * (in_x - 1) - (2 * padding) + kernel_x
-out_y = stride_y * (in_y - 1) - (2 * padding) + kernel_y
+in_dim = 4 # assuming square inputs
+stride = 3 # assuming square stride
+kernel_size = 4 # assuming square kernels
+padding = 3 # assuming square padidng
+out_dim = stride * (in_dim - 1) + kernel_size - (2 * padding)
 
-assert out_x % in_x == 0, "Need even upsampling factor."
-assert out_y % in_y == 0, "Need even upsampling factor."
-
-i_precision = 4
+i_precision = 6
 o_precision = 16
-w_precision = 4
+w_precision = 5
 simd = in_channels # fully unrolling in channels
 pe = 2
-# mmv = 1 # todo - figure out what this is
 
-conv_stride = 1 # assuming square
-conv_padding = kernel_x - padding - 1 # assuming square
+# deconvolution hyperparameters
+outFileConfig.write("constexpr unsigned  IFDim1 = %d;\n" % in_dim)
+outFileConfig.write("constexpr unsigned  IFMCh1 = %d;\n" % in_channels)
+outFileConfig.write("constexpr unsigned  OFDim1 = %d;\n" % out_dim)
+outFileConfig.write("constexpr unsigned  OFMCh1 = %d;\n" % out_channels)
+outFileConfig.write("constexpr unsigned  Kernel1 = %d;\n" % kernel_size)
+outFileConfig.write("constexpr unsigned  Stride1 = %d;\n" % stride)
+outFileConfig.write("constexpr unsigned  Padding1 = %d;\n" % padding)
+outFileConfig.write("\n")
 
-tile = in_channels * kernel_x * kernel_y * out_channels // (simd * pe)
+# feature map pixel padding hyperparameters
+fm_out_x = in_dim + (in_dim - 1) * (stride - 1)
+fm_pad_x = stride
+outFileConfig.write("constexpr unsigned  FMPadODim1 = %d;\n" % fm_out_x)
+outFileConfig.write("constexpr unsigned  FMPadStride1 = %d;\n" % fm_pad_x)
+outFileConfig.write("constexpr unsigned  FMPadSIMD1 = %d;\n" % simd)
+outFileConfig.write("\n")
 
-assert in_y == in_x, "Testing square inputs."
-assert out_y == out_x, "Testing square outputs."
-assert kernel_x == kernel_y, "Testing square kernels."
-assert stride_x == stride_y, "Testing square strides."
-outFileConfig.write("constexpr unsigned  DeconvIFDim = %d;\n" % in_x)
-outFileConfig.write("constexpr unsigned  DeconvIFMCh = %d;\n" % in_channels)
-outFileConfig.write("constexpr unsigned  DeconvOFDim = %d;\n" % out_x)
-outFileConfig.write("constexpr unsigned  DeconvOFMCh = %d;\n" % out_channels)
-outFileConfig.write("constexpr unsigned  DeconvKernel = %d;\n" % kernel_x)
-outFileConfig.write("constexpr unsigned  DeconvStride = %d;\n" % stride_x)
-outFileConfig.write("constexpr unsigned  DeconvPadding = %d;\n" % padding)
-outFileConfig.write("constexpr unsigned  IPrecision = %d;\n" % i_precision)
-outFileConfig.write("constexpr unsigned  OPrecision = %d;\n" % o_precision)
-outFileConfig.write("constexpr unsigned  WPrecision = %d;\n" % w_precision)
+# convolution hyperparameters
+conv_stride = 1 # assuming square
+conv_padding = kernel_size - padding - 1 # assuming square
+assert conv_padding == 0, "not testing additional padding"
+tile = in_channels * kernel_size * kernel_size * out_channels // (simd * pe)
+outFileConfig.write("constexpr unsigned  ConvKernel1 = %d;\n" % kernel_size)
+outFileConfig.write("constexpr unsigned  ConvIFMCh1 = %d;\n" % in_channels)
+# input of direct convolution is the output of the pixel padding
+outFileConfig.write("constexpr unsigned  ConvIFMDim1 = %d;\n" % fm_out_x)
+outFileConfig.write("constexpr unsigned  ConvOFMCh1 = %d;\n" % out_channels)
+outFileConfig.write("constexpr unsigned  ConvOFMDim1 = %d;\n" % out_dim)
+outFileConfig.write("constexpr unsigned  ConvPadding1 = %d;\n" % conv_padding)
+outFileConfig.write("constexpr unsigned  ConvStride1 = %d;\n" % conv_stride)
 outFileConfig.write("constexpr unsigned  ConvSIMD1 = %d;\n" % simd)
 outFileConfig.write("constexpr unsigned  ConvPE1 = %d;\n" % pe)
+outFileConfig.write("\n")
 
-fm_out_x = in_x + (in_x - 1) * (stride_x - 1)
-fm_pad_x = out_x // in_x
-outFileConfig.write("constexpr unsigned  FMPadODim = %d;\n" % fm_out_x)
-outFileConfig.write("constexpr unsigned  FMPadStride = %d;\n" % fm_pad_x)
-
+# general hyperparameters
+outFileConfig.write("constexpr unsigned  IPrecision = %d;\n" % i_precision)
+outFileConfig.write("constexpr unsigned  OPrecision = %d;\n" % o_precision)
+outFileConfig.write("constexpr unsigned  WPrecision = %d;\n" % w_precision)
+outFileConfig.write("\n")
 outFileConfig.close()
 
 
diff --git a/tb/data/memdata_deconv2d.h b/tb/data/memdata_deconv2d.h
index 05b5e08..4bc924f 100644
--- a/tb/data/memdata_deconv2d.h
+++ b/tb/data/memdata_deconv2d.h
@@ -1,74 +1,74 @@
 #ifndef PARAMS_HPP
 #define PARAMS_HPP
 namespace PARAM{ 
-static FixedPointWeights<3,ap_int<4>,2,32> weights= {
+static FixedPointWeights<3,ap_int<5>,2,32> weights= {
 {
 { 
-0xff,
-0x39c,
-0x11a,
-0x382,
-0x674,
-0x1a6,
-0x4f0,
-0x1c0,
-0x21b,
-0x753,
-0x6a1,
-0x502,
-0x78b,
-0x1da,
-0xd0,
-0x13a,
-0x97,
-0x2f3,
-0x7f5,
-0x105,
-0x17e,
-0x72f,
-0x5bc,
-0x6b,
-0x56a,
-0x24a,
-0x5c4,
-0x441,
-0x730,
-0x22e,
-0x136,
-0x6bc} 
+0x217d,
+0x124b,
+0x2a22,
+0x23d1,
+0x1093,
+0x3d16,
+0x342b,
+0x12bb,
+0x23e,
+0xaf,
+0x3c7c,
+0x2b58,
+0x1e5f,
+0x1428,
+0xe57,
+0x159d,
+0x3162,
+0x4e3,
+0x3548,
+0x29ae,
+0x1d64,
+0x13f4,
+0x3a79,
+0x3aa0,
+0xcd5,
+0x1a07,
+0x12e2,
+0x495,
+0x3aa8,
+0x3b63,
+0x1897,
+0x2e16} 
 ,{ 
-0x12b,
-0x4b2,
-0x50c,
-0x670,
-0x514,
-0x48a,
-0x286,
-0x744,
-0x4ad,
-0x3eb,
-0x1df,
-0x5ce,
-0x458,
-0x609,
-0x2c8,
-0x51d,
-0xc3,
-0x6b2,
-0x684,
-0x43a,
-0x394,
-0x56,
-0x302,
-0x591,
-0x3c7,
-0x574,
-0x55c,
-0x166,
-0x5be,
-0x63a,
-0xcf,
-0x62c} 
+0xa15,
+0x36d3,
+0x1ee4,
+0x336c,
+0x2052,
+0x2a86,
+0x2332,
+0x2066,
+0x2c71,
+0x1da4,
+0x3662,
+0x154b,
+0x2d60,
+0x29be,
+0x2f2a,
+0x3e45,
+0x110e,
+0x291c,
+0x21f,
+0x2ef7,
+0x26c2,
+0x3b3b,
+0x2063,
+0x2ad9,
+0x88f,
+0x82f,
+0x908,
+0x39a3,
+0x377b,
+0x3b77,
+0x57d,
+0x2a1e} 
 }
 };
  } 

From 5a979a2056edec7fa6d50ae74486f34e2b866f06 Mon Sep 17 00:00:00 2001
From: icolbert <Ian.Colbert@amd.com>
Date: Thu, 6 Apr 2023 17:09:30 -0700
Subject: [PATCH 29/43] Updating deconv2d testbench

---
 tb/deconv_tb.cpp  | 102 ++++++++++++++++++++++++++--------------------
 tb/deconv_top.cpp |  34 ++++++++--------
 2 files changed, 75 insertions(+), 61 deletions(-)

diff --git a/tb/deconv_tb.cpp b/tb/deconv_tb.cpp
index 22e1c75..d193714 100644
--- a/tb/deconv_tb.cpp
+++ b/tb/deconv_tb.cpp
@@ -41,37 +41,37 @@
 
 
 void test_deconv2d(
-	hls::stream<ap_uint<DeconvIFMCh*IPrecision> > & src,
-	hls::stream<ap_uint<DeconvOFMCh*OPrecision> > & dst
+	hls::stream<ap_uint<IFMCh1*IPrecision> > & src,
+	hls::stream<ap_uint<OFMCh1*OPrecision> > & dst
 );
 
 int main() {
 	std::cout << "Starting testbench for deconvolution" << std::endl;
 
-	ap_uint<IPrecision>  inp_image[DeconvIFDim][DeconvIFDim][DeconvIFMCh];
-	ap_uint<OPrecision>  out_image[DeconvOFDim][DeconvOFDim][DeconvOFMCh];
-	hls::stream<ap_uint<DeconvIFMCh*IPrecision> > input_stream("input_stream");
-	hls::stream<ap_uint<DeconvOFMCh*OPrecision> > output_stream("output_stream");
+	ap_uint<IPrecision>  inp_image[IFDim1][IFDim1][IFMCh1];
+	ap_uint<OPrecision>  ref_image[OFDim1][OFDim1][OFMCh1];
+	hls::stream<ap_uint<IFMCh1*IPrecision> > input_stream("input_stream");
+	hls::stream<ap_uint<OFMCh1*OPrecision> > output_stream("output_stream");
 	
 	{ // Feed random input sequence
 		std::random_device rd;
-		std::uniform_int_distribution<int> dist(0, (1<<(DeconvIFMCh*IPrecision))-1);
+		std::uniform_int_distribution<int> dist(0, (1<<(IFMCh1*IPrecision))-1);
 		unsigned  input_counter = 0;
 
-		for(unsigned  y = 0; y < DeconvIFDim; y++) {
-			for(unsigned  x = 0; x < DeconvIFDim; x++) {
-				ap_uint<DeconvIFMCh * IPrecision> input_channel = 0;
-				for(unsigned  c = 0; c < DeconvIFMCh; c++) {
+		for(unsigned  y = 0; y < IFDim1; y++) {
+			for(unsigned  x = 0; x < IFDim1; x++) {
+				ap_uint<IFMCh1 * IPrecision> input_channel = 0;
+				for(unsigned  c = 0; c < IFMCh1; c++) {
 					ap_uint<IPrecision>  val = dist(rd);
 					inp_image[y][x][c] = val;
 					input_channel = input_channel >> IPrecision;
-					input_channel(DeconvIFMCh * IPrecision - 1, (DeconvIFMCh - 1) * IPrecision) = val;
+					input_channel(IFMCh1 * IPrecision - 1, (IFMCh1 - 1) * IPrecision) = val;
 					input_counter++;
 				}
-				input_stream.write(input_counter);
+				input_stream.write(input_channel);
 			}
 		}
-		if(input_counter != (DeconvIFDim * DeconvIFDim * DeconvIFMCh)) {
+		if(input_counter != (IFDim1 * IFDim1 * IFMCh1)) {
 			std::cout << "Input stream not fully populated." << std::endl;
 			return 1;
 		}
@@ -79,33 +79,33 @@ int main() {
 	std::cout << "Finished writing to input stream" << std::endl;
 
 	// Create weights
-	static ap_uint<WPrecision>  weights[DeconvIFMCh][DeconvOFMCh][DeconvKernel][DeconvKernel];
+	static ap_uint<WPrecision>  weights[IFMCh1][OFMCh1][Kernel1][Kernel1];
 	{
 		unsigned  oc = 0; // output channel counter
 		unsigned  ic = 0; // input channel counter
 		unsigned  kx = 0; // kernel_x counter
 		unsigned  ky = 0; // kernel_y counter
-		constexpr int  xTile = (DeconvIFMCh * DeconvKernel * DeconvKernel) / ConvSIMD1;
-		constexpr int  yTile = DeconvOFMCh / ConvPE1;
+		constexpr int  xTile = (IFDim1 * Kernel1 * Kernel1) / ConvSIMD1;
+		constexpr int  yTile = OFDim1 / ConvPE1;
 		for (unsigned  oy = 0; oy < yTile; oy++) {
 			for (unsigned ox = 0; ox < xTile; ox++) {
 				for (unsigned pe = 0; pe < ConvPE1; pe++) {
 					for (unsigned simd = 0; simd < ConvSIMD1; simd++) {
 						// need to transpose the weights since weights are for conv2d
-						unsigned  dkx = DeconvKernel - kx - 1;
-						unsigned  dky = DeconvKernel - ky - 1;
-						weights[ic][oc][kx][ky] = PARAM::weights.weights(oy*xTile + ox)[pe][simd];
+						unsigned  dkx = Kernel1 - kx - 1;
+						unsigned  dky = Kernel1 - ky - 1;
+						weights[ic][oc][dkx][dky] = PARAM::weights.weights(oy*xTile + ox)[pe][simd];
 						ic++;
-						if (ic == DeconvIFMCh){
+						if (ic == IFMCh1){
 							ic=0;
 							kx++;
-							if (kx == DeconvKernel){
+							if (kx == Kernel1){
 								kx=0;
 								ky++;
-								if (ky == DeconvKernel){
+								if (ky == Kernel1){
 									ky=0;
 									oc++;
-									if (oc == DeconvOFDim){
+									if (oc == OFMCh1){
 										oc=0;
 									}
 								}
@@ -122,42 +122,54 @@ int main() {
 	// TODO - calculate expected outputs from deconvolution
 	std::cout << "Calculating expected output" << std::endl;
 	deconv2d<
-		DeconvIFDim,
-		DeconvIFMCh,
-		DeconvOFDim,
-		DeconvOFMCh,
-		DeconvKernel,
-		DeconvStride,
-		DeconvPadding,
+		IFDim1,
+		IFMCh1,
+		OFDim1,
+		OFMCh1,
+		Kernel1,
+		Stride1,
+		Padding1,
 		ap_uint<IPrecision>,
 		ap_uint<OPrecision>,
 		ap_uint<WPrecision>
-	>(inp_image, weights, out_image);
+	>(inp_image, weights, ref_image);
 
 	// Run top-level function
 	test_deconv2d(input_stream, output_stream);
 	std::cout << "Finished writing to output stream" << std::endl;
 
-	// Verify correctness
-	for(unsigned  y = 0; y < DeconvOFDim; y++) {
-		for(unsigned  x = 0; x < DeconvOFDim; x++) {
-			for(unsigned  c = 0; c < DeconvOFMCh; c++) {
+	{// Verify correctness
+		ap_uint<OPrecision>  val, exp;
+		unsigned int  num_errors = 0;
+		for(unsigned  y = 0; y < OFDim1; y++) {
+			for(unsigned  x = 0; x < OFDim1; x++) {
+
 				if(output_stream.empty()) {
 					std::cerr << "Missing outputs." << std::endl;
 					return  1;
 				}
+				ap_uint<OFMCh1 * OPrecision>  out = output_stream.read();
 
-				ap_uint<OPrecision> const  val = output_stream.read();
-				if(out_image[y][x][c] != val) {
-					std::cerr << "Output mismatch." << std::endl;
-					return  1;
+				for(unsigned  c = 0; c < OFMCh1; c++) {
+					exp = ref_image[y][x][c];
+					val(OPrecision - 1, 0) = out((c + 1)*OPrecision - 1, c * OPrecision);
+					if(exp != val) {
+						std::cout << "Error: Expected["<<y<<"]["<<x<<"]["<<c<<"]="<<exp<<", got "<<val<< std::endl;
+						num_errors++;
+					}
 				}
 			}
 		}
-	}
-	std::cout << "Outputs successfully aligns." << std::endl;
-	if(!output_stream.empty()) {
-		std::cerr << "Output stream not empty." << std::endl;
-		return 1;
+		if(!output_stream.empty()) {
+			std::cerr << "Output stream not empty." << std::endl;
+			return 1;
+		}
+		else if(num_errors == 0) {
+			std::cout << "Outputs successfully aligns." << std::endl;
+		}
+		else {
+			std::cerr << "Error: " << num_errors << " total errors." << std::endl;
+			return 1;
+		}
 	}
 }
diff --git a/tb/deconv_top.cpp b/tb/deconv_top.cpp
index cfd7675..75dc100 100644
--- a/tb/deconv_top.cpp
+++ b/tb/deconv_top.cpp
@@ -41,31 +41,33 @@ using namespace hls;
 #include "data/memdata_deconv2d.h"
 #include "data/config_deconv2d.h"
 
+constexpr unsigned  numReps = 1;
+
 void test_deconv2d(
-	stream<ap_uint<DeconvIFMCh*IPrecision> > & src,
-	stream<ap_uint<DeconvOFMCh*OPrecision> > & dst
+	stream<ap_uint<IFMCh1*IPrecision> > & src,
+	stream<ap_uint<OFMCh1*OPrecision> > & dst
 ){
 #pragma HLS DATAFLOW
-	stream<ap_uint<DeconvIFMCh*IPrecision> > conv_input("input_stream");
-	FMPadding_Pixel_Nonsquare<
-		FMPadODim, // dimension expected by conv
-		FMPadODim, // dimension expected by conv
-		FMPadStride, // stride along pixel padding
-		FMPadStride, // stride along pixel padding
-		DeconvIFMCh, // num channels expected by conv (input channels)
-		ConvSIMD1, // packing along the channel dim
+	stream<ap_uint<IFMCh1*IPrecision> > conv_input("input_stream");
+	FMPadding_Pixel<
+		FMPadODim1, // dimension expected by direct conv
+		FMPadStride1, // stride along pixel padding
+		IFMCh1, // num channels expected by conv (input channels)
+		IFMCh1, // packing along the channel dim
 		ap_uint<IPrecision> // data type of values
 	>(src, conv_input);
+	// Note - would need to insert padding layer is padding is not 0
+	static_assert(ConvPadding1 == 0, "Not testing non-zero padding.");
 	ConvLayer_Batch<
-		DeconvKernel, // conv and deconv have same kernel size
-		DeconvIFMCh,
-		FMPadODim, // output of fm padding is input of conv
-		DeconvOFMCh,
-		DeconvOFDim,
+		ConvKernel1,
+		ConvIFMCh1,
+		ConvIFMDim1,
+		ConvOFMCh1,
+		ConvOFMDim1,
 		ConvSIMD1,
 		ConvPE1,
 		Slice<ap_uint<IPrecision> >,
 		Slice<ap_uint<OPrecision> >,
 		Identity
-	>(conv_input, dst, PARAM::weights, PassThroughActivation<ap_uint<16> >(), 1, ap_resource_dsp());
+	>(conv_input, dst, PARAM::weights, PassThroughActivation<ap_uint<16> >(), numReps, ap_resource_dsp());
 }

From e96d7b13000cd6fdc07fc4095b73db3a18a05af0 Mon Sep 17 00:00:00 2001
From: icolbert <Ian.Colbert@amd.com>
Date: Thu, 6 Apr 2023 18:37:43 -0700
Subject: [PATCH 30/43] Focusing on single-channel test cases

---
 tb/data/config_deconv2d.h       | 14 +++---
 tb/data/gen_weights_deconv2d.py |  6 +--
 tb/data/memdata_deconv2d.h      | 83 +++++++--------------------------
 3 files changed, 27 insertions(+), 76 deletions(-)

diff --git a/tb/data/config_deconv2d.h b/tb/data/config_deconv2d.h
index bc5f43a..aac99d8 100644
--- a/tb/data/config_deconv2d.h
+++ b/tb/data/config_deconv2d.h
@@ -1,24 +1,24 @@
 constexpr unsigned  IFDim1 = 4;
-constexpr unsigned  IFMCh1 = 3;
+constexpr unsigned  IFMCh1 = 1;
 constexpr unsigned  OFDim1 = 7;
-constexpr unsigned  OFMCh1 = 4;
+constexpr unsigned  OFMCh1 = 1;
 constexpr unsigned  Kernel1 = 4;
 constexpr unsigned  Stride1 = 3;
 constexpr unsigned  Padding1 = 3;
 
 constexpr unsigned  FMPadODim1 = 10;
 constexpr unsigned  FMPadStride1 = 3;
-constexpr unsigned  FMPadSIMD1 = 3;
+constexpr unsigned  FMPadSIMD1 = 1;
 
 constexpr unsigned  ConvKernel1 = 4;
-constexpr unsigned  ConvIFMCh1 = 3;
+constexpr unsigned  ConvIFMCh1 = 1;
 constexpr unsigned  ConvIFMDim1 = 10;
-constexpr unsigned  ConvOFMCh1 = 4;
+constexpr unsigned  ConvOFMCh1 = 1;
 constexpr unsigned  ConvOFMDim1 = 7;
 constexpr unsigned  ConvPadding1 = 0;
 constexpr unsigned  ConvStride1 = 1;
-constexpr unsigned  ConvSIMD1 = 3;
-constexpr unsigned  ConvPE1 = 2;
+constexpr unsigned  ConvSIMD1 = 1;
+constexpr unsigned  ConvPE1 = 1;
 
 constexpr unsigned  IPrecision = 6;
 constexpr unsigned  OPrecision = 16;
diff --git a/tb/data/gen_weights_deconv2d.py b/tb/data/gen_weights_deconv2d.py
index 730829e..99797d3 100644
--- a/tb/data/gen_weights_deconv2d.py
+++ b/tb/data/gen_weights_deconv2d.py
@@ -33,8 +33,8 @@
 outFileConfig = open("config_deconv2d.h" , "wt")
 
 num_images = 1 # num images
-in_channels = 2 # input channels
-out_channels = 4 # output channels
+in_channels = 1 # input channels
+out_channels = 1 # output channels
 in_dim = 4 # assuming square inputs
 stride = 3 # assuming square stride
 kernel_size = 4 # assuming square kernels
@@ -45,7 +45,7 @@
 o_precision = 16
 w_precision = 5
 simd = in_channels # fully unrolling in channels
-pe = 2
+pe = 1
 
 # deconvolution hyperparameters
 outFileConfig.write("constexpr unsigned  IFDim1 = %d;\n" % in_dim)
diff --git a/tb/data/memdata_deconv2d.h b/tb/data/memdata_deconv2d.h
index 4bc924f..1c86793 100644
--- a/tb/data/memdata_deconv2d.h
+++ b/tb/data/memdata_deconv2d.h
@@ -1,74 +1,25 @@
 #ifndef PARAMS_HPP
 #define PARAMS_HPP
 namespace PARAM{ 
-static FixedPointWeights<3,ap_int<5>,2,32> weights= {
+static FixedPointWeights<1,ap_int<5>,1,16> weights= {
 {
 { 
-0x217d,
-0x124b,
-0x2a22,
-0x23d1,
-0x1093,
-0x3d16,
-0x342b,
-0x12bb,
-0x23e,
-0xaf,
-0x3c7c,
-0x2b58,
-0x1e5f,
-0x1428,
-0xe57,
-0x159d,
-0x3162,
-0x4e3,
-0x3548,
-0x29ae,
-0x1d64,
-0x13f4,
-0x3a79,
-0x3aa0,
-0xcd5,
-0x1a07,
-0x12e2,
-0x495,
-0x3aa8,
-0x3b63,
-0x1897,
-0x2e16} 
-,{ 
-0xa15,
-0x36d3,
-0x1ee4,
-0x336c,
-0x2052,
-0x2a86,
-0x2332,
-0x2066,
-0x2c71,
-0x1da4,
-0x3662,
-0x154b,
-0x2d60,
-0x29be,
-0x2f2a,
-0x3e45,
-0x110e,
-0x291c,
-0x21f,
-0x2ef7,
-0x26c2,
-0x3b3b,
-0x2063,
-0x2ad9,
-0x88f,
-0x82f,
-0x908,
-0x39a3,
-0x377b,
-0x3b77,
-0x57d,
-0x2a1e} 
+0x2,
+0xe,
+0x7,
+0x10,
+0x9,
+0x0,
+0x8,
+0x6,
+0x6,
+0x0,
+0x7,
+0x3,
+0xc,
+0xe,
+0x7,
+0x8} 
 }
 };
  } 

From c1f5130f0c58785be6ffc0865c0427ac25d345a9 Mon Sep 17 00:00:00 2001
From: icolbert <Ian.Colbert@amd.com>
Date: Thu, 6 Apr 2023 18:37:54 -0700
Subject: [PATCH 31/43] Fixing indexing errors

---
 tb/deconv_tb.cpp | 19 ++++++++++++++-----
 1 file changed, 14 insertions(+), 5 deletions(-)

diff --git a/tb/deconv_tb.cpp b/tb/deconv_tb.cpp
index d193714..ca9a2d1 100644
--- a/tb/deconv_tb.cpp
+++ b/tb/deconv_tb.cpp
@@ -85,8 +85,8 @@ int main() {
 		unsigned  ic = 0; // input channel counter
 		unsigned  kx = 0; // kernel_x counter
 		unsigned  ky = 0; // kernel_y counter
-		constexpr int  xTile = (IFDim1 * Kernel1 * Kernel1) / ConvSIMD1;
-		constexpr int  yTile = OFDim1 / ConvPE1;
+		constexpr int  xTile = (IFMCh1 * Kernel1 * Kernel1) / ConvSIMD1;
+		constexpr int  yTile = OFMCh1 / ConvPE1;
 		for (unsigned  oy = 0; oy < yTile; oy++) {
 			for (unsigned ox = 0; ox < xTile; ox++) {
 				for (unsigned pe = 0; pe < ConvPE1; pe++) {
@@ -94,7 +94,7 @@ int main() {
 						// need to transpose the weights since weights are for conv2d
 						unsigned  dkx = Kernel1 - kx - 1;
 						unsigned  dky = Kernel1 - ky - 1;
-						weights[ic][oc][dkx][dky] = PARAM::weights.weights(oy*xTile + ox)[pe][simd];
+						weights[ic][oc][dky][dkx] = PARAM::weights.weights(oy*xTile + ox)[pe][simd];
 						ic++;
 						if (ic == IFMCh1){
 							ic=0;
@@ -119,7 +119,16 @@ int main() {
 	}
 	std::cout << "Finished writing the weights" << std::endl;
 
-	// TODO - calculate expected outputs from deconvolution
+	// initialize the output buffer to 0
+	for (unsigned y = 0; y < OFDim1; y++) {
+		for (unsigned x = 0; x < OFDim1; x++) {
+			for (unsigned c = 0; c < OFMCh1; c++) {
+				ref_image[y][x][c] = 0;
+			}
+		}
+	}
+
+	// calculate expected outputs from deconvolution
 	std::cout << "Calculating expected output" << std::endl;
 	deconv2d<
 		IFDim1,
@@ -154,7 +163,7 @@ int main() {
 					exp = ref_image[y][x][c];
 					val(OPrecision - 1, 0) = out((c + 1)*OPrecision - 1, c * OPrecision);
 					if(exp != val) {
-						std::cout << "Error: Expected["<<y<<"]["<<x<<"]["<<c<<"]="<<exp<<", got "<<val<< std::endl;
+						std::cout << "Error: Expected["<<y<<"]["<<x<<"]["<<c<<"]="<<exp<<", got "<<out<< std::endl;
 						num_errors++;
 					}
 				}

From b666bf86f2960d430d524da52cc003bd0415e6e2 Mon Sep 17 00:00:00 2001
From: icolbert <Ian.Colbert@amd.com>
Date: Thu, 6 Apr 2023 18:38:01 -0700
Subject: [PATCH 32/43] Fixing typo

---
 tb/deconv_top.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tb/deconv_top.cpp b/tb/deconv_top.cpp
index 75dc100..6bd622b 100644
--- a/tb/deconv_top.cpp
+++ b/tb/deconv_top.cpp
@@ -56,7 +56,7 @@ void test_deconv2d(
 		IFMCh1, // packing along the channel dim
 		ap_uint<IPrecision> // data type of values
 	>(src, conv_input);
-	// Note - would need to insert padding layer is padding is not 0
+	// Note - would need to insert padding layer if padding is not 0
 	static_assert(ConvPadding1 == 0, "Not testing non-zero padding.");
 	ConvLayer_Batch<
 		ConvKernel1,

From a68da3f8f325740328772f6f202eedee4cc72699 Mon Sep 17 00:00:00 2001
From: icolbert <Ian.Colbert@amd.com>
Date: Thu, 6 Apr 2023 18:58:34 -0700
Subject: [PATCH 33/43] Minor changes

---
 tb/data/memdata_deconv2d.h | 28 ++++++++++++++--------------
 tb/deconv_tb.cpp           |  2 +-
 2 files changed, 15 insertions(+), 15 deletions(-)

diff --git a/tb/data/memdata_deconv2d.h b/tb/data/memdata_deconv2d.h
index 1c86793..619bb44 100644
--- a/tb/data/memdata_deconv2d.h
+++ b/tb/data/memdata_deconv2d.h
@@ -4,22 +4,22 @@ namespace PARAM{
 static FixedPointWeights<1,ap_int<5>,1,16> weights= {
 {
 { 
-0x2,
-0xe,
-0x7,
-0x10,
-0x9,
-0x0,
+0xb,
 0x8,
+0xa,
+0x9,
 0x6,
-0x6,
-0x0,
-0x7,
-0x3,
-0xc,
-0xe,
-0x7,
-0x8} 
+0x9,
+0x4,
+0x4,
+0xf,
+0x9,
+0x5,
+0xb,
+0x4,
+0x8,
+0x2,
+0xe} 
 }
 };
  } 
diff --git a/tb/deconv_tb.cpp b/tb/deconv_tb.cpp
index ca9a2d1..e6db31b 100644
--- a/tb/deconv_tb.cpp
+++ b/tb/deconv_tb.cpp
@@ -60,7 +60,7 @@ int main() {
 
 		for(unsigned  y = 0; y < IFDim1; y++) {
 			for(unsigned  x = 0; x < IFDim1; x++) {
-				ap_uint<IFMCh1 * IPrecision> input_channel = 0;
+				ap_uint<IFMCh1*IPrecision>  input_channel = 0;
 				for(unsigned  c = 0; c < IFMCh1; c++) {
 					ap_uint<IPrecision>  val = dist(rd);
 					inp_image[y][x][c] = val;

From f3ee9a2ed0d5e0af1074f7f65487f9864eebc323 Mon Sep 17 00:00:00 2001
From: icolbert <Ian.Colbert@amd.com>
Date: Thu, 6 Apr 2023 19:01:54 -0700
Subject: [PATCH 34/43] Create test_deconv2d.tcl

---
 tb/test_deconv2d.tcl | 42 ++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 42 insertions(+)
 create mode 100644 tb/test_deconv2d.tcl

diff --git a/tb/test_deconv2d.tcl b/tb/test_deconv2d.tcl
new file mode 100644
index 0000000..7a14d0b
--- /dev/null
+++ b/tb/test_deconv2d.tcl
@@ -0,0 +1,42 @@
+##############################################################################
+ #  Copyright (c) 2023, Advanced Micro Devices, Inc.
+ #  All rights reserved.
+ #
+ #  Redistribution and use in source and binary forms, with or without
+ #  modification, are permitted provided that the following conditions are met:
+ #
+ #  1.  Redistributions of source code must retain the above copyright notice,
+ #     this list of conditions and the following disclaimer.
+ #
+ #  2.  Redistributions in binary form must reproduce the above copyright
+ #      notice, this list of conditions and the following disclaimer in the
+ #      documentation and/or other materials provided with the distribution.
+ #
+ #  3.  Neither the name of the copyright holder nor the names of its
+ #      contributors may be used to endorse or promote products derived from
+ #      this software without specific prior written permission.
+ #
+ #  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ #  AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO,
+ #  THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+ #  PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR
+ #  CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ #  EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ #  PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
+ #  OR BUSINESS INTERRUPTION). HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
+ #  WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR
+ #  OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF
+ #  ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ #
+###############################################################################
+open_project hls-syn-deconv2d
+add_files deconv_top.cpp -cflags "-std=c++14 -I$::env(FINN_HLS_ROOT) -I$::env(FINN_HLS_ROOT)/tb" 
+add_files -tb deconv_tb.cpp -cflags "-std=c++14 -I$::env(FINN_HLS_ROOT) -I$::env(FINN_HLS_ROOT)/tb" 
+set_top test_deconv2d
+open_solution "sol1"
+set_part {xck26-sfvc784-2LVI-i}
+create_clock -period 10 -name default
+csim_design
+csynth_design
+# cosim_design
+exit

From ac3d9e1195a4124389e2b4259817135863cfd8f1 Mon Sep 17 00:00:00 2001
From: icolbert <Ian.Colbert@amd.com>
Date: Thu, 6 Apr 2023 19:02:02 -0700
Subject: [PATCH 35/43] Relaxing timing target

---
 tb/test_fm_pixel_padding.tcl | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tb/test_fm_pixel_padding.tcl b/tb/test_fm_pixel_padding.tcl
index 23af38d..8351632 100644
--- a/tb/test_fm_pixel_padding.tcl
+++ b/tb/test_fm_pixel_padding.tcl
@@ -35,7 +35,7 @@ add_files -tb fm_pixel_padding_tb.cpp -cflags "-std=c++14 -I$::env(FINN_HLS_ROOT
 set_top test_fm_pixel_padding
 open_solution "sol1"
 set_part {xck26-sfvc784-2LVI-i}
-create_clock -period 5 -name default
+create_clock -period 10 -name default
 csim_design
 csynth_design
 # cosim_design

From 22ab78fb250573dbd04539d97176240942ecac76 Mon Sep 17 00:00:00 2001
From: icolbert <Ian.Colbert@amd.com>
Date: Thu, 6 Apr 2023 19:28:51 -0700
Subject: [PATCH 36/43] Updating paths in files

---
 tb/deconv_tb.cpp            |  2 +-
 tb/deconv_top.cpp           | 10 +++++-----
 tb/fm_pixel_padding_tb.cpp  |  2 +-
 tb/fm_pixel_padding_top.cpp |  2 +-
 4 files changed, 8 insertions(+), 8 deletions(-)

diff --git a/tb/deconv_tb.cpp b/tb/deconv_tb.cpp
index e6db31b..a49c822 100644
--- a/tb/deconv_tb.cpp
+++ b/tb/deconv_tb.cpp
@@ -31,7 +31,7 @@
 #include <ap_int.h>
 #include <hls_stream.h>
 
-#include "../bnn-library.h"
+#include "bnn-library.h"
 #include "deconv.hpp"
 #include "data/config_deconv2d.h"
 #include "data/memdata_deconv2d.h"
diff --git a/tb/deconv_top.cpp b/tb/deconv_top.cpp
index 6bd622b..94cda30 100644
--- a/tb/deconv_top.cpp
+++ b/tb/deconv_top.cpp
@@ -33,11 +33,11 @@
 using namespace hls;
 #include "ap_int.h"
 
-#include "../bnn-library.h"
-#include "../weights.hpp"
-#include "../activations.hpp"
-#include "../interpret.hpp"
-#include "../mvau.hpp"
+#include "bnn-library.h"
+#include "weights.hpp"
+#include "activations.hpp"
+#include "interpret.hpp"
+#include "mvau.hpp"
 #include "data/memdata_deconv2d.h"
 #include "data/config_deconv2d.h"
 
diff --git a/tb/fm_pixel_padding_tb.cpp b/tb/fm_pixel_padding_tb.cpp
index c166d92..cacc5d9 100644
--- a/tb/fm_pixel_padding_tb.cpp
+++ b/tb/fm_pixel_padding_tb.cpp
@@ -32,7 +32,7 @@
 #include <ap_int.h>
 #include <hls_stream.h>
 
-#include "../bnn-library.h"
+#include "bnn-library.h"
 #include "data/config_fmpp.h"
 
 #include <iostream>
diff --git a/tb/fm_pixel_padding_top.cpp b/tb/fm_pixel_padding_top.cpp
index 863ee6e..b12ab21 100644
--- a/tb/fm_pixel_padding_top.cpp
+++ b/tb/fm_pixel_padding_top.cpp
@@ -32,7 +32,7 @@
 #include <hls_stream.h>
 using namespace hls;
 #include "ap_int.h"
-#include "../bnn-library.h"
+#include "bnn-library.h"
 #include "data/config_fmpp.h"
 
 void test_fm_pixel_padding(

From 9377c62243bcb6f4e0afd4849c9caa8c9b4830e2 Mon Sep 17 00:00:00 2001
From: icolbert <Ian.Colbert@amd.com>
Date: Fri, 14 Apr 2023 07:38:23 -0700
Subject: [PATCH 37/43] Adding new tcl scripts to Jenkinsfile

---
 Jenkinsfile | 7 +++++++
 1 file changed, 7 insertions(+)

diff --git a/Jenkinsfile b/Jenkinsfile
index 54a64df..503a51f 100644
--- a/Jenkinsfile
+++ b/Jenkinsfile
@@ -142,6 +142,13 @@ node('finn-build || built-in') {
             stage('TMR CHECK') {
                 sh("source ${env.HLS_ENV_SRC}; cd tb; vitis_hls -f test_tmrc_stmr.tcl")
             }
+        }, thirteenthBranch: {
+            stage('FM_PIX_PAD') {
+                sh("source ${env.HLS_ENV_SRC}; cd tb; vitis_hls -f test_fm_pixel_padding.tcl")
+            }
+            stage('DECONV_2D') {
+                sh("source ${env.HLS_ENV_SRC}; cd tb; vitis_hls -f test_deconv2d.tcl")
+            }
         }
     }
 }

From 401b357a2762a2e0857c828f64efadecbd29a371 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Thomas=20B=2E=20Preu=C3=9Fer?= <thomas.preusser@xilinx.com>
Date: Fri, 14 Apr 2023 18:03:38 +0100
Subject: [PATCH 38/43] Adopt a deterministic PRNG to make cosim use identical
 reference.

---
 tb/deconv_tb.cpp             |  4 ++--
 tb/fm_pixel_padding_tb.cpp   | 21 ++++++++++++++-------
 tb/test_deconv2d.tcl         |  2 +-
 tb/test_fm_pixel_padding.tcl |  2 +-
 4 files changed, 18 insertions(+), 11 deletions(-)

diff --git a/tb/deconv_tb.cpp b/tb/deconv_tb.cpp
index a49c822..c039c18 100644
--- a/tb/deconv_tb.cpp
+++ b/tb/deconv_tb.cpp
@@ -54,10 +54,10 @@ int main() {
 	hls::stream<ap_uint<OFMCh1*OPrecision> > output_stream("output_stream");
 	
 	{ // Feed random input sequence
-		std::random_device rd;
+		std::minstd_rand  rd;
 		std::uniform_int_distribution<int> dist(0, (1<<(IFMCh1*IPrecision))-1);
-		unsigned  input_counter = 0;
 
+		unsigned  input_counter = 0;
 		for(unsigned  y = 0; y < IFDim1; y++) {
 			for(unsigned  x = 0; x < IFDim1; x++) {
 				ap_uint<IFMCh1*IPrecision>  input_channel = 0;
diff --git a/tb/fm_pixel_padding_tb.cpp b/tb/fm_pixel_padding_tb.cpp
index cacc5d9..de532fd 100644
--- a/tb/fm_pixel_padding_tb.cpp
+++ b/tb/fm_pixel_padding_tb.cpp
@@ -44,6 +44,7 @@ void test_fm_pixel_padding(
 	hls::stream<ap_uint<SIMD1*INPUT_WIDTH>> &dst
 );
 
+
 int main() {
 	std::cout << "Starting testbench for fm_pixel_padding" << std::endl;
 
@@ -53,10 +54,10 @@ int main() {
 	T  expected[OUTPUT_DIM_Y][OUTPUT_DIM_X][CHANNELS];
 
 	{ // Feed random input sequence
-		std::random_device rd;
+		std::minstd_rand  rd;
 		std::uniform_int_distribution<int> dist(0, (1<<(SIMD1*INPUT_WIDTH))-1);
-		unsigned  input_counter = 0;
 
+		unsigned  input_counter = 0;
 		for(unsigned  y = 0; y < OUTPUT_DIM_Y; y++) {
 			for(unsigned  x = 0; x < OUTPUT_DIM_X; x++) {
 				for(unsigned  c = 0; c < CHANNELS; c++) {
@@ -82,25 +83,31 @@ int main() {
 	std::cout << "Finished writing to output stream" << std::endl;
 
 	// Verify correctness
+	int  ret = 0;
 	for(unsigned  y = 0; y < OUTPUT_DIM_Y; y++) {
 		for(unsigned  x = 0; x < OUTPUT_DIM_X; x++) {
 			for(unsigned  c = 0; c < CHANNELS; c++) {
 				if(output_stream.empty()) {
 					std::cerr << "Missing outputs." << std::endl;
-					return  1;
+					goto  err;
 				}
 
 				T const  val = output_stream.read();
 				if(expected[y][x][c] != val) {
-					std::cerr << "Output mismatch." << std::endl;
-					return  1;
+					std::cerr
+						<< "Output mismatch [" << y << ':' << x << ':' << c << "]: "
+						<< val << " instead of " << expected[y][x][c]
+						<< std::endl;
+					ret = 1;
 				}
 			}
 		}
 	}
 	if(!output_stream.empty()) {
 		std::cerr << "Output stream not empty." << std::endl;
-		return 1;
+err:
+		ret = 1;
 	}
-	std::cout << "Successfully passed csim testbench." << std::endl;
+	if(ret == 0)  std::cout << "Successfully passed csim testbench." << std::endl;
+	return  ret;
 }
diff --git a/tb/test_deconv2d.tcl b/tb/test_deconv2d.tcl
index 7a14d0b..8637734 100644
--- a/tb/test_deconv2d.tcl
+++ b/tb/test_deconv2d.tcl
@@ -38,5 +38,5 @@ set_part {xck26-sfvc784-2LVI-i}
 create_clock -period 10 -name default
 csim_design
 csynth_design
-# cosim_design
+cosim_design
 exit
diff --git a/tb/test_fm_pixel_padding.tcl b/tb/test_fm_pixel_padding.tcl
index 8351632..57d08f4 100644
--- a/tb/test_fm_pixel_padding.tcl
+++ b/tb/test_fm_pixel_padding.tcl
@@ -38,5 +38,5 @@ set_part {xck26-sfvc784-2LVI-i}
 create_clock -period 10 -name default
 csim_design
 csynth_design
-# cosim_design
+cosim_design
 exit

From a9d5e086b9bc200c3c2319c74d6ccd39cfb79adf Mon Sep 17 00:00:00 2001
From: icolbert <Ian.Colbert@amd.com>
Date: Fri, 14 Apr 2023 07:52:38 -0700
Subject: [PATCH 39/43] Adding ifdef guards to tb/deconv.hpp

---
 tb/deconv.hpp | 7 ++++++-
 1 file changed, 6 insertions(+), 1 deletion(-)

diff --git a/tb/deconv.hpp b/tb/deconv.hpp
index 9d96219..5aab5a9 100644
--- a/tb/deconv.hpp
+++ b/tb/deconv.hpp
@@ -30,6 +30,9 @@
  *
  ******************************************************************************/
 
+#ifndef DECONV_TB_H
+#define DECONV_TB_H
+
 template<
     unsigned IFMDim,
     unsigned IFMCh,
@@ -67,4 +70,6 @@ void deconv2d(
             }
         }
     }
-}
\ No newline at end of file
+}
+
+#endif

From ac8d6934da3695cecd164e52827332f99148d04f Mon Sep 17 00:00:00 2001
From: icolbert <Ian.Colbert@amd.com>
Date: Fri, 14 Apr 2023 07:57:57 -0700
Subject: [PATCH 40/43] Adding if/def guards to tb/data/config_deconv2d.h

---
 tb/data/config_deconv2d.h | 36 ++++++++++++++++++++++++++++++++++++
 1 file changed, 36 insertions(+)

diff --git a/tb/data/config_deconv2d.h b/tb/data/config_deconv2d.h
index aac99d8..e0ee070 100644
--- a/tb/data/config_deconv2d.h
+++ b/tb/data/config_deconv2d.h
@@ -1,3 +1,38 @@
+/******************************************************************************
+ *  Copyright (c) 2023, Advanced Micro Devices, Inc.
+ *  All rights reserved.
+ *
+ *  Redistribution and use in source and binary forms, with or without
+ *  modification, are permitted provided that the following conditions are met:
+ *
+ *  1.  Redistributions of source code must retain the above copyright notice,
+ *     this list of conditions and the following disclaimer.
+ *
+ *  2.  Redistributions in binary form must reproduce the above copyright
+ *      notice, this list of conditions and the following disclaimer in the
+ *      documentation and/or other materials provided with the distribution.
+ *
+ *  3.  Neither the name of the copyright holder nor the names of its
+ *      contributors may be used to endorse or promote products derived from
+ *      this software without specific prior written permission.
+ *
+ *  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ *  AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO,
+ *  THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+ *  PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR
+ *  CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ *  EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ *  PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
+ *  OR BUSINESS INTERRUPTION). HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
+ *  WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR
+ *  OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF
+ *  ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ ******************************************************************************/
+
+#ifndef DECONV_CONF_H
+#define DECONV_CONF_H
+
 constexpr unsigned  IFDim1 = 4;
 constexpr unsigned  IFMCh1 = 1;
 constexpr unsigned  OFDim1 = 7;
@@ -24,3 +59,4 @@ constexpr unsigned  IPrecision = 6;
 constexpr unsigned  OPrecision = 16;
 constexpr unsigned  WPrecision = 5;
 
+#endif

From c18872caa6daa299dbebd0b1516ea80bc58acad7 Mon Sep 17 00:00:00 2001
From: icolbert <Ian.Colbert@amd.com>
Date: Fri, 14 Apr 2023 10:24:59 -0700
Subject: [PATCH 41/43] Adding if/def guards for fmpp config

---
 tb/data/config_fmpp.h | 7 ++++++-
 1 file changed, 6 insertions(+), 1 deletion(-)

diff --git a/tb/data/config_fmpp.h b/tb/data/config_fmpp.h
index 7e19e75..c3c0f61 100644
--- a/tb/data/config_fmpp.h
+++ b/tb/data/config_fmpp.h
@@ -29,6 +29,9 @@
  *  ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  ******************************************************************************/
 
+#ifndef FMPP_CONFIG_H
+#define FMPP_CONFIG_H
+
 constexpr unsigned  SIMD1 = 1;
 constexpr unsigned  INPUT_WIDTH = 8;
 constexpr unsigned  INPUT_DIM_X = 30;
@@ -38,4 +41,6 @@ constexpr unsigned  XSTRIDE = 5;
 constexpr unsigned  YSTRIDE = 3;
 
 constexpr unsigned  OUTPUT_DIM_X = INPUT_DIM_X + (INPUT_DIM_X - 1) * (XSTRIDE - 1);
-constexpr unsigned  OUTPUT_DIM_Y = INPUT_DIM_Y + (INPUT_DIM_Y - 1) * (YSTRIDE - 1);
\ No newline at end of file
+constexpr unsigned  OUTPUT_DIM_Y = INPUT_DIM_Y + (INPUT_DIM_Y - 1) * (YSTRIDE - 1);
+
+#endif

From 1e4564cd745ba25242c6866d2b604eb215db471b Mon Sep 17 00:00:00 2001
From: icolbert <Ian.Colbert@amd.com>
Date: Fri, 14 Apr 2023 10:25:21 -0700
Subject: [PATCH 42/43] Removing ConvPadding from generate weights scripts

---
 tb/data/gen_weights_deconv2d.py | 7 ++++++-
 1 file changed, 6 insertions(+), 1 deletion(-)

diff --git a/tb/data/gen_weights_deconv2d.py b/tb/data/gen_weights_deconv2d.py
index 99797d3..dab3bd2 100644
--- a/tb/data/gen_weights_deconv2d.py
+++ b/tb/data/gen_weights_deconv2d.py
@@ -47,6 +47,9 @@
 simd = in_channels # fully unrolling in channels
 pe = 1
 
+outFileConfig.write("#ifndef DECONV_CONFIG_H\n")
+outFileConfig.write("#define DECONV_CONFIG_H\n")
+
 # deconvolution hyperparameters
 outFileConfig.write("constexpr unsigned  IFDim1 = %d;\n" % in_dim)
 outFileConfig.write("constexpr unsigned  IFMCh1 = %d;\n" % in_channels)
@@ -76,7 +79,8 @@
 outFileConfig.write("constexpr unsigned  ConvIFMDim1 = %d;\n" % fm_out_x)
 outFileConfig.write("constexpr unsigned  ConvOFMCh1 = %d;\n" % out_channels)
 outFileConfig.write("constexpr unsigned  ConvOFMDim1 = %d;\n" % out_dim)
-outFileConfig.write("constexpr unsigned  ConvPadding1 = %d;\n" % conv_padding)
+# not testing an additional padding node here
+# outFileConfig.write("constexpr unsigned  ConvPadding1 = %d;\n" % conv_padding)
 outFileConfig.write("constexpr unsigned  ConvStride1 = %d;\n" % conv_stride)
 outFileConfig.write("constexpr unsigned  ConvSIMD1 = %d;\n" % simd)
 outFileConfig.write("constexpr unsigned  ConvPE1 = %d;\n" % pe)
@@ -86,6 +90,7 @@
 outFileConfig.write("constexpr unsigned  IPrecision = %d;\n" % i_precision)
 outFileConfig.write("constexpr unsigned  OPrecision = %d;\n" % o_precision)
 outFileConfig.write("constexpr unsigned  WPrecision = %d;\n" % w_precision)
+outFileConfig.write("#endif\n")
 outFileConfig.write("\n")
 outFileConfig.close()
 

From 6479551fe509ad2acf98d95b9bf519f1a754df5f Mon Sep 17 00:00:00 2001
From: icolbert <Ian.Colbert@amd.com>
Date: Fri, 14 Apr 2023 10:27:50 -0700
Subject: [PATCH 43/43] Removing redundant ConvPadding from testbench

---
 tb/data/config_deconv2d.h | 1 -
 tb/deconv_top.cpp         | 4 ++--
 2 files changed, 2 insertions(+), 3 deletions(-)

diff --git a/tb/data/config_deconv2d.h b/tb/data/config_deconv2d.h
index e0ee070..0f38cca 100644
--- a/tb/data/config_deconv2d.h
+++ b/tb/data/config_deconv2d.h
@@ -50,7 +50,6 @@ constexpr unsigned  ConvIFMCh1 = 1;
 constexpr unsigned  ConvIFMDim1 = 10;
 constexpr unsigned  ConvOFMCh1 = 1;
 constexpr unsigned  ConvOFMDim1 = 7;
-constexpr unsigned  ConvPadding1 = 0;
 constexpr unsigned  ConvStride1 = 1;
 constexpr unsigned  ConvSIMD1 = 1;
 constexpr unsigned  ConvPE1 = 1;
diff --git a/tb/deconv_top.cpp b/tb/deconv_top.cpp
index 94cda30..222464d 100644
--- a/tb/deconv_top.cpp
+++ b/tb/deconv_top.cpp
@@ -56,8 +56,8 @@ void test_deconv2d(
 		IFMCh1, // packing along the channel dim
 		ap_uint<IPrecision> // data type of values
 	>(src, conv_input);
-	// Note - would need to insert padding layer if padding is not 0
-	static_assert(ConvPadding1 == 0, "Not testing non-zero padding.");
+	// Note - a padding layer would need to be inserted if padding were not 0;
+	// this testbench top-level function only covers the zero-padding case
 	ConvLayer_Batch<
 		ConvKernel1,
 		ConvIFMCh1,