I am currently developing a project on the Eclypse Z7 and am integrating several HLS components. I would appreciate any insights or suggestions you could provide regarding my implementation.
Project Overview
I am using Vitis HLS 2023.1 for synthesizing the FPGA components. The project includes data streaming operations where input data is processed and sent to different output channels, as described in the HLS code below.
HLS Code
Here's the HLS code for handling data streams:
#include "main.h"
// Forwards one step's worth of words from the input stream to the internal
// stream, tagging the final word of the final step with last = 1.
//
// indatastream    - stream the words are read from (one read per loop pass).
// internal_stream - stream the re-tagged words are written to.
//
// The loop index advances by 2 per word, so N_INPUT/2 words are moved per
// call. A function-static counter remembers which step of the episode this
// call belongs to and wraps back to 0 after N_STEPS calls.
void read_in(
data_stream_type& indatastream,
internal_stream_data_type& internal_stream
) {
    data_stream_t word_in;
    internal_stream_data_t word_out;
    static int step_index = 0; // position of this call within the episode

    // Loop-invariant: is this the last step of the episode?
    const bool final_step = (step_index == N_STEPS-1);

    PROCESSING_LOOP: for (int sample = 0; sample < N_INPUT; sample += 2) {
        word_in = indatastream.read();
        ap_uint<32> payload = word_in.data;
        word_out.data = payload;
        // last is raised only on the very last word of the very last step.
        if (final_step && sample == N_INPUT-2) {
            word_out.last = 1;
        } else {
            word_out.last = 0;
        }
        internal_stream.write(word_out);
    }

    // Advance to the next step, wrapping once all steps have been seen.
    step_index++;
    if (step_index >= N_STEPS) {
        step_index = 0;
    }
}
// Drains one step's worth of words from the internal stream and re-emits them
// on the to_ps stream, copying the data and last fields across.
//
// internal_stream - stream the buffered words are read from.
// to_ps           - stream the words are written to.
//
// NOTE(review): only .data and .last of the outgoing word are assigned here;
// any other members data_stream_t may have are left as default-constructed.
// Confirm against main.h that this is what the downstream consumer expects.
void stream_to_ps(
internal_stream_data_type& internal_stream,
data_stream_type& to_ps) {
    data_stream_t ps_word;
    internal_stream_data_t buffered;

    // Same trip count as the producer: the index steps by 2 up to N_INPUT.
    TO_PS_LOOP: for (int sample = 0; sample < N_INPUT; sample += 2) {
        buffered = internal_stream.read();
        ps_word.last = buffered.last;
        ps_word.data = buffered.data;
        to_ps.write(ps_word);
    }
}
// Writes a single 32-bit word to the output stream. Bits 31..16 carry
// 30000 divided by a per-call counter, bits 15..0 carry the constant 123.
//
// outdatastream - stream the packed word is written to.
//
// The counter starts at 1, is incremented after every call and wraps back to
// 1 once it reaches N_STEPS+1.
//
// NOTE(review): only .data of the outgoing word is assigned; any other
// members of data_stream_t are left as default-constructed.
void stream_out(
data_stream_type& outdatastream) {
    static int divisor = 1; // per-call counter, cycles 1..N_STEPS

    // The cast applies to the literal first, then the division is performed.
    ap_uint<16> upper_half = ap_uint<16>(30000) / divisor;
    ap_uint<16> lower_half = ap_uint<16>(123);

    // Place upper_half in bits 31..16, then merge lower_half into bits 15..0.
    ap_uint<32> packed = static_cast<ap_uint<32>>(upper_half) << 16;
    packed = packed | static_cast<ap_uint<16>>(lower_half);

    data_stream_t out_word;
    out_word.data = packed;
    outdatastream.write(out_word);

    divisor++;
    if (divisor >= N_STEPS+1) {
        divisor = 1;
    }
}
// Runs one episode: N_STEPS passes of read_in -> stream_to_ps plus one
// stream_out word per pass, followed by a single all-zero word on
// outdatastream.
//
// indatastream  - stream handed to read_in() on every pass.
// outdatastream - written by stream_out() on every pass and once more after
//                 the loop.
// to_ps         - stream handed to stream_to_ps() on every pass.
void episode_processing(
data_stream_type& indatastream,
data_stream_type& outdatastream,
data_stream_type& to_ps) {
// Each pass declares its own internal_stream, which read_in writes to and
// stream_to_ps reads from; stream_out does not touch it.
READING_LOOP: for (int i = 0; i < N_STEPS; i++) {
#pragma HLS DATAFLOW
internal_stream_data_type internal_stream;
#pragma HLS STREAM variable=internal_stream depth=2004
read_in(indatastream, internal_stream);
stream_to_ps(internal_stream, to_ps);
stream_out(outdatastream);
}
// Both 16-bit halves are zero, so the trailing word written below is 0.
ap_uint<16> result_16_1 = (ap_uint<16>) 0;
ap_uint<16> result_16_2 = (ap_uint<16>) 0;
// Pack result_16_1 into bits 31..16 and result_16_2 into bits 15..0.
ap_uint<32> combined = (static_cast<ap_uint<32>>(result_16_1) << 16) | static_cast<ap_uint<16>>(result_16_2);
data_stream_t data_out_struct;
// Only .data is assigned before the write; other members of data_out_struct
// keep whatever state default construction gives them.
data_out_struct.data = combined;
outdatastream.write(data_out_struct);
}
// Top-level function: waits for a sample at or above the threshold on the
// input stream, then runs one episode and returns.
//
// indatastream   - AXI4-Stream input; each 32-bit word carries two 16-bit
//                  samples (bits 31..16 and bits 15..0).
// outdatastream  - AXI4-Stream output, passed through to episode_processing().
// to_ps          - AXI4-Stream output, passed through to episode_processing().
// th             - trigger threshold, exposed on the AXI4-Lite interface.
// sinitdoneadc, sinitdonedac, sinitdonerelay
//                - ap_none inputs; all three must be non-zero, otherwise the
//                  function returns without reading the input stream.
//
// Always returns 1.
ap_uint<1> level_1_HLS_stream_simplified(
data_stream_type& indatastream,
data_stream_type& outdatastream,
data_stream_type& to_ps,
out_data_t th,
ap_uint<1> sinitdoneadc,
ap_uint<1> sinitdonedac,
ap_uint<1> sinitdonerelay) {
#pragma HLS INTERFACE axis port=indatastream
#pragma HLS INTERFACE axis port=outdatastream
#pragma HLS INTERFACE axis port=to_ps
#pragma HLS INTERFACE s_axilite port=th
#pragma HLS INTERFACE s_axilite port=return
#pragma HLS INTERFACE ap_none port=sinitdoneadc
#pragma HLS INTERFACE ap_none port=sinitdonedac
#pragma HLS INTERFACE ap_none port=sinitdonerelay
    data_stream_t incoming;
    bool triggered = false;

    // Sampled once up front; nothing is read unless all three flags are set.
    const bool peripherals_ready = sinitdoneadc && sinitdonedac && sinitdonerelay;

    while (peripherals_ready && !triggered) {
        // Consume one word and split it into its two signed 16-bit samples.
        incoming = indatastream.read();
        ap_uint<32> packed = incoming.data;
        ap_int<16> upper_sample = packed.range(31, 16);
        ap_int<16> lower_sample = packed.range(15, 0);

        // Either sample reaching the threshold starts the episode.
        triggered = (upper_sample >= th) || (lower_sample >= th);
        if (triggered) {
            episode_processing(indatastream, outdatastream, to_ps);
        }
    }

    return (ap_uint<1>) 1;
}
This is my Vivado block diagram:
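And this is the bare-metal application code from the Vitis IDE, which runs on the processor and drives the DMA and the HLS core: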
#include <stdio.h>
#include "xil_printf.h"
#include "xaxidma.h"
#include "xparameters.h"
#include "xlevel_1_hls_stream_simplified.h"
/* DMA engine instance shared by InitializeDMA() and main(). */
XAxiDma AxiDma;
/* Device IDs taken from xparameters.h. */
const int AXI_DMA_ID = XPAR_AXI_DMA_0_DEVICE_ID;
const int DEV_ID = XPAR_LEVEL_1_HLS_STREAM_S_0_DEVICE_ID;
#define N_STEPS 2
#define MAX_VAL 32767 // Which corresponds to 5V
#define TH 1000
#define N_INPUT 2000
#define N_RUNS 20
typedef u32 out_data_t;
/*
 * Receive buffer for the device-to-memory DMA transfer: one u32 word per pair
 * of input samples, for N_STEPS steps.
 *
 * The buffer is flushed and invalidated in the data cache around the DMA
 * transfer, so it is aligned to a cache-line boundary; otherwise cache
 * maintenance on its first/last line would also touch unrelated neighbouring
 * variables. 64 bytes is a multiple of the line size on both 32-byte and
 * 64-byte-line cores.
 */
u32 recv_buffer[N_STEPS*N_INPUT/2] __attribute__((aligned(64)));
/*
 * Looks up and initializes the AXI DMA in simple (non scatter-gather) mode,
 * resets it and disables all of its interrupts so it can be used by polling.
 *
 * Returns XST_SUCCESS on success, XST_FAILURE if the configuration is
 * missing, initialization fails, the core was built in SG mode, or the reset
 * does not complete.
 */
int InitializeDMA() {
    /* Upper bound on reset-done polls so a stuck core cannot hang us here. */
    enum { RESET_POLL_LIMIT = 10000 };

    XAxiDma_Config *CfgPtr;
    int Status;
    int PollsLeft;

    CfgPtr = XAxiDma_LookupConfig(AXI_DMA_ID);
    if (!CfgPtr) {
        xil_printf("No config found for DMA %d\r\n", AXI_DMA_ID);
        return XST_FAILURE;
    }

    Status = XAxiDma_CfgInitialize(&AxiDma, CfgPtr);
    if (Status != XST_SUCCESS) {
        xil_printf("DMA Initialization failed %d\r\n", Status);
        return XST_FAILURE;
    }

    /* This application only uses simple transfers. */
    if (XAxiDma_HasSg(&AxiDma)) {
        xil_printf("Device configured as SG mode \r\n");
        return XST_FAILURE;
    }

    /*
     * XAxiDma_Reset() only requests the reset; it does not wait for it.
     * Poll until the engine reports completion so that the first transfer
     * is not programmed while the core is still resetting.
     */
    XAxiDma_Reset(&AxiDma);
    PollsLeft = RESET_POLL_LIMIT;
    while (!XAxiDma_ResetIsDone(&AxiDma)) {
        PollsLeft--;
        if (PollsLeft == 0) {
            xil_printf("DMA reset timed out\r\n");
            return XST_FAILURE;
        }
    }

    /* Polling mode: mask every interrupt source in both directions. */
    XAxiDma_IntrDisable(&AxiDma, XAXIDMA_IRQ_ALL_MASK, XAXIDMA_DEVICE_TO_DMA);
    XAxiDma_IntrDisable(&AxiDma, XAXIDMA_IRQ_ALL_MASK, XAXIDMA_DMA_TO_DEVICE);

    return XST_SUCCESS;
}
/*
 * Configures the HLS core and the DMA, arms one device-to-memory transfer
 * into recv_buffer, starts the HLS core, waits for both to finish and prints
 * a sample of the received words.
 *
 * Returns 0 on success, XST_FAILURE if any setup step fails.
 */
int main() {
    /* Only every PRINT_STRIDE-th received word is printed. */
    enum { PRINT_STRIDE = 1000 };

    XLevel_1_hls_stream_simplified Instance;
    XLevel_1_hls_stream_simplified_Config *ConfigPtr;
    int status;
    u32 th;
    u32 *DataBufferPtr = (u32 *)recv_buffer;

    xil_printf("Let's start!\n\r");

    /* Do not continue with an unconfigured core: every later call needs it. */
    ConfigPtr = XLevel_1_hls_stream_simplified_LookupConfig(DEV_ID);
    if (!ConfigPtr) {
        xil_printf("No config found for HLS core %d\r\n", DEV_ID);
        return XST_FAILURE;
    }
    status = XLevel_1_hls_stream_simplified_CfgInitialize(&Instance, ConfigPtr);
    if (status != XST_SUCCESS) {
        xil_printf("HLS core initialization failed %d\r\n", status);
        return XST_FAILURE;
    }
    xil_printf("Successfully configured HLS core.\n\r");

    status = InitializeDMA();
    if (status != XST_SUCCESS) {
        xil_printf("Data DMA In Initialization Failed\r\n");
        return XST_FAILURE;
    }

    /* The core is polled, so its interrupt output stays disabled. */
    XLevel_1_hls_stream_simplified_InterruptGlobalDisable(&Instance);

    /* Program the trigger threshold and read it back as a sanity check. */
    XLevel_1_hls_stream_simplified_Set_th(&Instance, TH);
    th = XLevel_1_hls_stream_simplified_Get_th(&Instance);
    xil_printf("TH set to: %lu\n\r", th);

    /* Transfer exactly one buffer's worth: N_STEPS * N_INPUT / 2 words. */
    u32 bytes_to_read = (u32)sizeof(recv_buffer);
    xil_printf("Bytes to read: %d\n\r", bytes_to_read);

    /*
     * Cache maintenance covers the buffer itself and nothing beyond it, so
     * memory that follows recv_buffer is not flushed or invalidated.
     */
    Xil_DCacheFlushRange((UINTPTR)DataBufferPtr, bytes_to_read);

    /* Arm the receive channel first, then start the core that feeds it. */
    status = XAxiDma_SimpleTransfer(&AxiDma, (UINTPTR)DataBufferPtr, bytes_to_read, XAXIDMA_DEVICE_TO_DMA);
    if (status != XST_SUCCESS) {
        xil_printf("Failed to start DMA transfer\r\n");
        xil_printf("Status %d", status);
        return XST_FAILURE;
    }

    XLevel_1_hls_stream_simplified_Start(&Instance);
    xil_printf("Ready to go\n\r");

    while (XAxiDma_Busy(&AxiDma, XAXIDMA_DEVICE_TO_DMA));
    xil_printf("DMA is done. \n\r");

    while (!XLevel_1_hls_stream_simplified_IsDone(&Instance));
    xil_printf("HLS is done. \n\r");

    /* Drop stale cache lines so the CPU reads what the DMA wrote. */
    Xil_DCacheInvalidateRange((UINTPTR)DataBufferPtr, bytes_to_read);

    /* Print the sampled words as a comma-separated list. */
    xil_printf("Received data: ");
    for (int i = 0; i < N_STEPS * N_INPUT / 2; i += PRINT_STRIDE) {
        if (i > 0) {
            xil_printf(", ");
        }
        xil_printf("%d", DataBufferPtr[i]);
    }
    xil_printf("\n\r");

    return 0;
}
The HLS code functions correctly as confirmed by C/RTL simulations. The DMA is configured with a Width of Buffer Length Register set to 26 bits, and the FIFO depth is set at 8192.
When executing the XLevel_1_hls_stream_simplified_Start(&Instance); command, the HLS core initiates correctly. This is evident from the DAC output, which reaches approximately 5V for 10 microseconds, then drops to 2.5V for another 10 microseconds before falling to zero. However, the execution gets consistently stuck at while (XAxiDma_Busy(&AxiDma, XAXIDMA_DEVICE_TO_DMA));, indicating that the DMA remains busy.
I've verified with an Integrated Logic Analyzer (ILA) that the tlast signal transitions to 1 as expected. Despite this, the issue persists, leaving me searching for further troubleshooting steps or insights.
This is the waveform I get from the ILA connected to the to_ps AXIS interface of the HLS core:
start:
and end:
Thank you in advance for any assistance you can provide!
Question
Julii
Hello Digilent Community,
I am currently developing a project on the Eclypse Z7 and am integrating several HLS components. I would appreciate any insights or suggestions you could provide regarding my implementation.
Project Overview
I am using Vitis HLS 2023.1 for synthesizing the FPGA components. The project includes data streaming operations where input data is processed and sent to different output channels, as described in the HLS code below.
HLS Code
Here's the HLS code for handling data streams:
#include "main.h"

// Forwards N_INPUT/2 words from the input stream to the internal stream and
// raises last on the final word of the final step (tracked by a static counter).
void read_in(
    data_stream_type& indatastream,
    internal_stream_data_type& internal_stream
) {
    data_stream_t data_in_struct;
    internal_stream_data_t internal_struct;
    static int call_count = 0; // Static variable to track the number of times the function is called
    PROCESSING_LOOP: for (int ii = 0; ii < N_INPUT; ii+=2) {
        // Read data from the input stream
        data_in_struct = indatastream.read();
        ap_uint<32> data = data_in_struct.data;
        internal_struct.data = data;
        internal_struct.last = (ii == N_INPUT-2 && call_count == N_STEPS-1) ? 1 : 0;
        internal_stream.write(internal_struct);
    }
    call_count++; // Increment call count after processing each call
    if (call_count >= N_STEPS) call_count = 0; // Reset after processing all steps
}

// Copies N_INPUT/2 words (data and last) from the internal stream to to_ps.
void stream_to_ps(
    internal_stream_data_type& internal_stream,
    data_stream_type& to_ps) {
    data_stream_t to_ps_struct;
    internal_stream_data_t internal_struct;
    TO_PS_LOOP: for (int ii = 0; ii < N_INPUT; ii+=2) {
        internal_struct = internal_stream.read();
        to_ps_struct.data = internal_struct.data;
        to_ps_struct.last = internal_struct.last;
        to_ps.write(to_ps_struct);
    }
}

// Writes one word to the output stream: 30000/call_count in bits 31..16 and
// 123 in bits 15..0. call_count cycles from 1 to N_STEPS.
void stream_out(
    data_stream_type& outdatastream) {
    data_stream_t data_out_struct;
    static int call_count = 1; // Static variable to track the number of times the function is called
    ap_uint<16> result_16_1 = (ap_uint<16>) 30000/call_count;
    ap_uint<16> result_16_2 = (ap_uint<16>) 123;
    // Pack result_16_1 (upper half) and result_16_2 (lower half) into a single ap_uint<32> variable
    ap_uint<32> combined = (static_cast<ap_uint<32>>(result_16_1) << 16) | static_cast<ap_uint<16>>(result_16_2);
    data_out_struct.data = combined;
    outdatastream.write(data_out_struct);
    call_count++;
    if (call_count >= N_STEPS+1) call_count = 1; // Reset after processing all steps
}

// Runs N_STEPS passes of read_in -> stream_to_ps plus stream_out, then writes
// one all-zero word to the output stream.
void episode_processing(
    data_stream_type& indatastream,
    data_stream_type& outdatastream,
    data_stream_type& to_ps) {
    READING_LOOP: for (int i = 0; i < N_STEPS; i++) {
#pragma HLS DATAFLOW
        internal_stream_data_type internal_stream;
#pragma HLS STREAM variable=internal_stream depth=2004
        read_in(indatastream, internal_stream);
        stream_to_ps(internal_stream, to_ps);
        stream_out(outdatastream);
    }
    ap_uint<16> result_16_1 = (ap_uint<16>) 0;
    ap_uint<16> result_16_2 = (ap_uint<16>) 0;
    // Pack result_16_1 (upper half) and result_16_2 (lower half) into a single ap_uint<32> variable
    ap_uint<32> combined = (static_cast<ap_uint<32>>(result_16_1) << 16) | static_cast<ap_uint<16>>(result_16_2);
    data_stream_t data_out_struct;
    data_out_struct.data = combined;
    outdatastream.write(data_out_struct);
}

// Top-level function: once all three init-done inputs are set, reads the input
// stream until either 16-bit sample of a word is >= th, then runs one episode.
// Always returns 1.
ap_uint<1> level_1_HLS_stream_simplified(
    data_stream_type& indatastream,
    data_stream_type& outdatastream,
    data_stream_type& to_ps,
    out_data_t th,
    ap_uint<1> sinitdoneadc,
    ap_uint<1> sinitdonedac,
    ap_uint<1> sinitdonerelay) {
#pragma HLS INTERFACE axis port=indatastream
#pragma HLS INTERFACE axis port=outdatastream
#pragma HLS INTERFACE axis port=to_ps
#pragma HLS INTERFACE s_axilite port=th
#pragma HLS INTERFACE s_axilite port=return
#pragma HLS INTERFACE ap_none port=sinitdoneadc
#pragma HLS INTERFACE ap_none port=sinitdonedac
#pragma HLS INTERFACE ap_none port=sinitdonerelay
    bool done = false;
    data_stream_t data_in_struct;
    if (sinitdoneadc && sinitdonedac && sinitdonerelay) {
        while(!done) {
            // Read data from the input stream
            data_in_struct = indatastream.read();
            // Unpack the data
            ap_uint<32> data = data_in_struct.data;
            ap_int<16> sample1 = data.range(31, 16); // First sample
            ap_int<16> sample2 = data.range(15, 0); // Second sample
            if ((sample1 >= th) || (sample2 >= th)) {
                episode_processing(indatastream, outdatastream, to_ps);
                done = true;
            }
        }
    }
    return (ap_uint<1>) 1;
}
and the following is the main.h:
This is my Vivado block diagram:
And this is the IDE code:
The HLS code functions correctly as confirmed by C/RTL simulations. The DMA is configured with a Width of Buffer Length Register set to 26 bits, and the FIFO depth is set at 8192.
When executing the XLevel_1_hls_stream_simplified_Start(&Instance); command, the HLS core initiates correctly. This is evident from the DAC output, which reaches approximately 5V for 10 microseconds, then drops to 2.5V for another 10 microseconds before falling to zero. However, the execution gets consistently stuck at while (XAxiDma_Busy(&AxiDma, XAXIDMA_DEVICE_TO_DMA));, indicating that the DMA remains busy.
I've verified with an Integrated Logic Analyzer (ILA) that the tlast signal transitions to 1 as expected. Despite this, the issue persists, leaving me searching for further troubleshooting steps or insights.
This is the waveform I get from the ILA connected to the to_ps AXIS interface of the HLS core:
start:
and end:
Thank you in advance for any assistance you can provide!
Link to comment
Share on other sites
2 answers to this question
Recommended Posts
Create an account or sign in to comment
You need to be a member in order to leave a comment
Create an account
Sign up for a new account in our community. It's easy!
Register a new account | Sign in
Already have an account? Sign in here.
Sign In Now