Julii
-
Posts
4 -
Joined
-
Last visited
Content Type
Profiles
Forums
Events
Gallery
Posts posted by Julii
-
-
Hello Digilent Community,
I am currently developing a project on the Eclypse Z7 and am integrating several HLS components. I would appreciate any insights or suggestions you could provide regarding my implementation.
Project Overview
I am using Vitis HLS 2023.1 for synthesizing the FPGA components. The project includes data streaming operations where input data is processed and sent to different output channels, as described in the HLS code below.
HLS Code
Here's the HLS code for handling data streams:
#include "main.h"

/**
 * Build one fully specified AXI4-Stream beat.
 *
 * Every side-channel field of data_stream_t that this design drives is set
 * explicitly. Previously only `data` (and sometimes `last`) was assigned, so
 * `keep`/`strb` left the core with undefined values. A stream consumer that
 * honours TKEEP (the AXI DMA S2MM channel does) needs all byte lanes marked
 * valid, otherwise the received byte count does not match what software
 * programmed and the transfer may never complete.
 *
 * @param payload  32-bit word to place on TDATA.
 * @param is_last  Value to drive on TLAST for this beat.
 * @return A beat with data, keep, strb and last all initialized.
 */
static data_stream_t make_beat(ap_uint<32> payload, ap_uint<1> is_last) {
#pragma HLS INLINE
    data_stream_t beat;
    beat.data = payload;
    beat.keep = -1;  // all byte lanes carry valid data
    beat.strb = -1;  // all bytes are data bytes (not position bytes)
    beat.last = is_last;
    return beat;
}

/**
 * Copy N_INPUT/2 words from the input stream into the internal stream.
 *
 * The function keeps a static call counter so that `last` is raised only on
 * the final word of the N_STEPS-th consecutive call, i.e. once per episode.
 *
 * @param indatastream     Source AXI4-Stream (one 32-bit word per read).
 * @param internal_stream  Destination stream carrying data plus a last flag.
 */
void read_in(
    data_stream_type& indatastream,
    internal_stream_data_type& internal_stream)
{
    // Number of completed calls within the current episode (0 .. N_STEPS-1).
    static int call_count = 0;

    PROCESSING_LOOP:
    for (int ii = 0; ii < N_INPUT; ii += 2) {
        // Read data from the input stream
        const data_stream_t data_in_struct = indatastream.read();

        internal_stream_data_t internal_struct;
        internal_struct.data = data_in_struct.data;
        // Mark only the very last word of the very last step.
        internal_struct.last =
            (ii == N_INPUT - 2 && call_count == N_STEPS - 1) ? 1 : 0;
        internal_stream.write(internal_struct);
    }

    call_count++;
    if (call_count >= N_STEPS) {
        call_count = 0;  // Reset after processing all steps
    }
}

/**
 * Forward N_INPUT/2 words from the internal stream to the `to_ps` stream,
 * propagating the `last` flag computed by read_in().
 *
 * @param internal_stream  Source stream (data + last).
 * @param to_ps            Destination AXI4-Stream.
 */
void stream_to_ps(
    internal_stream_data_type& internal_stream,
    data_stream_type& to_ps)
{
    TO_PS_LOOP:
    for (int ii = 0; ii < N_INPUT; ii += 2) {
        const internal_stream_data_t internal_struct = internal_stream.read();
        to_ps.write(make_beat(internal_struct.data, internal_struct.last));
    }
}

/**
 * Emit one 32-bit word on the output stream.
 *
 * The upper 16 bits hold 30000 divided by a static call counter that cycles
 * through 1 .. N_STEPS; the lower 16 bits hold the constant 123.
 *
 * @param outdatastream  Destination AXI4-Stream.
 */
void stream_out(data_stream_type& outdatastream)
{
    // Starts at 1 so the division below never divides by zero.
    static int call_count = 1;

    const ap_uint<16> result_16_1 = static_cast<ap_uint<16>>(30000) / call_count;
    const ap_uint<16> result_16_2 = static_cast<ap_uint<16>>(123);

    // Pack the two 16-bit values into a single 32-bit word.
    const ap_uint<32> combined =
        (static_cast<ap_uint<32>>(result_16_1) << 16) |
        static_cast<ap_uint<16>>(result_16_2);

    // NOTE(review): TLAST is driven low here; confirm the downstream
    // consumer of outdatastream does not expect packet boundaries.
    outdatastream.write(make_beat(combined, 0));

    call_count++;
    if (call_count >= N_STEPS + 1) {
        call_count = 1;  // Reset after processing all steps
    }
}

/**
 * Run one episode: N_STEPS dataflow iterations followed by a final zero word
 * on the output stream.
 *
 * Each iteration concurrently copies N_INPUT/2 input words to `to_ps`
 * (through an internal FIFO) and writes one word to `outdatastream`.
 *
 * @param indatastream   Source AXI4-Stream.
 * @param outdatastream  Destination AXI4-Stream for the generated words.
 * @param to_ps          Destination AXI4-Stream for the captured samples.
 */
void episode_processing(
    data_stream_type& indatastream,
    data_stream_type& outdatastream,
    data_stream_type& to_ps)
{
    READING_LOOP:
    for (int i = 0; i < N_STEPS; i++) {
#pragma HLS DATAFLOW
        internal_stream_data_type internal_stream;
#pragma HLS STREAM variable=internal_stream depth=2004
        read_in(indatastream, internal_stream);
        stream_to_ps(internal_stream, to_ps);
        stream_out(outdatastream);
    }

    // After the last step, send an all-zero word on the output stream.
    const ap_uint<16> result_16_1 = static_cast<ap_uint<16>>(0);
    const ap_uint<16> result_16_2 = static_cast<ap_uint<16>>(0);
    const ap_uint<32> combined =
        (static_cast<ap_uint<32>>(result_16_1) << 16) |
        static_cast<ap_uint<16>>(result_16_2);
    outdatastream.write(make_beat(combined, 0));
}

/**
 * Top-level HLS function.
 *
 * If all three init-done inputs are high when the core starts, it consumes
 * input words until either 16-bit sample in a word is >= `th`, then runs
 * episode_processing() once. If any init-done input is low, it returns
 * immediately without touching the streams.
 *
 * @param indatastream    Input AXI4-Stream; each word packs two signed
 *                        16-bit samples (bits 31:16 and 15:0).
 * @param outdatastream   Output AXI4-Stream for generated words.
 * @param to_ps           Output AXI4-Stream for captured samples.
 * @param th              Trigger threshold (AXI4-Lite register).
 * @param sinitdoneadc    Init-done flag (ap_none input).
 * @param sinitdonedac    Init-done flag (ap_none input).
 * @param sinitdonerelay  Init-done flag (ap_none input).
 * @return Always 1.
 */
ap_uint<1> level_1_HLS_stream_simplified(
    data_stream_type& indatastream,
    data_stream_type& outdatastream,
    data_stream_type& to_ps,
    out_data_t th,
    ap_uint<1> sinitdoneadc,
    ap_uint<1> sinitdonedac,
    ap_uint<1> sinitdonerelay)
{
#pragma HLS INTERFACE axis port=indatastream
#pragma HLS INTERFACE axis port=outdatastream
#pragma HLS INTERFACE axis port=to_ps
#pragma HLS INTERFACE s_axilite port=th
#pragma HLS INTERFACE s_axilite port=return
#pragma HLS INTERFACE ap_none port=sinitdoneadc
#pragma HLS INTERFACE ap_none port=sinitdonedac
#pragma HLS INTERFACE ap_none port=sinitdonerelay

    if (sinitdoneadc && sinitdonedac && sinitdonerelay) {
        bool done = false;
        while (!done) {
            // Read data from the input stream
            const data_stream_t data_in_struct = indatastream.read();

            // Unpack the two signed 16-bit samples
            const ap_uint<32> data = data_in_struct.data;
            const ap_int<16> sample1 = data.range(31, 16);  // First sample
            const ap_int<16> sample2 = data.range(15, 0);   // Second sample

            if ((sample1 >= th) || (sample2 >= th)) {
                episode_processing(indatastream, outdatastream, to_ps);
                done = true;
            }
        }
    }

    return static_cast<ap_uint<1>>(1);
}
and the following is the main.h:
#ifndef MLP_TOP_H #define MLP_TOP_H #include <hls_stream.h> #include <ap_int.h> #include <ap_axi_sdata.h> #include "hls_half.h" typedef ap_int<16> out_data_t; typedef ap_int<8> in_data_t; typedef ap_axiu<32, 0, 0, 0> data_stream_t; typedef hls::stream<data_stream_t> data_stream_type; typedef struct { ap_uint<32> data; ap_uint<1> last; } internal_stream_data_t; typedef hls::stream<internal_stream_data_t> internal_stream_data_type; #define N_INPUT 2000 #define N_STEPS 2 ap_uint<1> level_1_HLS_stream_simplified( data_stream_type& indatastream, data_stream_type& outdatastream, data_stream_type& to_ps, out_data_t th, ap_uint<1> sinitdoneadc, ap_uint<1> sinitdonedac, ap_uint<1> sinitdonerelay); #endif // MLP_TOP_H
This is my Vivado block diagram:
And this is the IDE code:#include <stdio.h> #include "xil_printf.h" #include "xaxidma.h" #include "xparameters.h" #include "xlevel_1_hls_stream_simplified.h" XAxiDma AxiDma; const int AXI_DMA_ID = XPAR_AXI_DMA_0_DEVICE_ID; const int DEV_ID = XPAR_LEVEL_1_HLS_STREAM_S_0_DEVICE_ID; #define N_STEPS 2 #define MAX_VAL 32767 // Which corresponds to 5V #define TH 1000 #define N_INPUT 2000 #define N_RUNS 20 typedef u32 out_data_t; u32 recv_buffer[N_STEPS*N_INPUT/2]; int InitializeDMA() { XAxiDma_Config *CfgPtr; int Status; CfgPtr = XAxiDma_LookupConfig(AXI_DMA_ID); if (!CfgPtr) { xil_printf("No config found for DMA %d\r\n", AXI_DMA_ID); return XST_FAILURE; } Status = XAxiDma_CfgInitialize(&AxiDma, CfgPtr); if (Status != XST_SUCCESS) { xil_printf("DMA Initialization failed %d\r\n", Status); return XST_FAILURE; } if (XAxiDma_HasSg(&AxiDma)) { xil_printf("Device configured as SG mode \r\n"); return XST_FAILURE; } XAxiDma_Reset(&AxiDma); XAxiDma_IntrDisable(&AxiDma, XAXIDMA_IRQ_ALL_MASK, XAXIDMA_DEVICE_TO_DMA); XAxiDma_IntrDisable(&AxiDma, XAXIDMA_IRQ_ALL_MASK, XAXIDMA_DMA_TO_DEVICE); return XST_SUCCESS; } int main() { XLevel_1_hls_stream_simplified Instance; XLevel_1_hls_stream_simplified_Config *ConfigPtr; int status; u32 th; u32 *DataBufferPtr = (u32 *)recv_buffer; xil_printf("Let's start!\n\r"); ConfigPtr = XLevel_1_hls_stream_simplified_LookupConfig(DEV_ID); status = XLevel_1_hls_stream_simplified_CfgInitialize(&Instance, ConfigPtr); if (status == XST_SUCCESS) { xil_printf("Successfully configured HLS core.\n\r"); } status = InitializeDMA(); if (status != XST_SUCCESS) { xil_printf("Data DMA In Initialization Failed\r\n"); return XST_FAILURE; } XLevel_1_hls_stream_simplified_InterruptGlobalDisable(&Instance); XLevel_1_hls_stream_simplified_Set_th(&Instance, TH); th = XLevel_1_hls_stream_simplified_Get_th(&Instance); xil_printf("TH set to: %lu\n\r", th); u32 bytes_to_read = N_STEPS * N_INPUT * sizeof(u32) / 2; xil_printf("Bytes to read: %d\n\r", bytes_to_read); 
Xil_DCacheFlushRange((UINTPTR)DataBufferPtr, bytes_to_read + 16); status = XAxiDma_SimpleTransfer(&AxiDma, (UINTPTR)DataBufferPtr, bytes_to_read, XAXIDMA_DEVICE_TO_DMA); if (status != XST_SUCCESS) { xil_printf("Failed to start DMA transfer\r\n"); xil_printf("Status %d", status); return XST_FAILURE; } XLevel_1_hls_stream_simplified_Start(&Instance); xil_printf("Ready to go\n\r"); while (XAxiDma_Busy(&AxiDma, XAXIDMA_DEVICE_TO_DMA)); xil_printf("DMA is done. \n\r"); while (!XLevel_1_hls_stream_simplified_IsDone(&Instance)); xil_printf("HLS is done. \n\r"); Xil_DCacheInvalidateRange((UINTPTR)DataBufferPtr, bytes_to_read + 16); xil_printf("Received data: "); for (int i = 0; i < N_STEPS * N_INPUT / 2; i++) { if (i > 0) xil_printf(", "); if (i % 1000 == 0) xil_printf("%d", DataBufferPtr[i]); } xil_printf("\n\r"); return 0; }
The HLS code functions correctly as confirmed by C/RTL simulations. The DMA is configured with a Width of Buffer Length Register set to 26 bits, and the FIFO depth is set at 8192.
When executing the XLevel_1_hls_stream_simplified_Start(&Instance); command, the HLS core initiates correctly. This is evident from the DAC output, which reaches approximately 5V for 10 microseconds, then drops to 2.5V for another 10 microseconds before falling to zero. However, the execution gets consistently stuck at while (XAxiDma_Busy(&AxiDma, XAXIDMA_DEVICE_TO_DMA));, indicating that the DMA remains busy.
I've verified with an Integrated Logic Analyzer (ILA) that the tlast signal transitions to 1 as expected. Despite this, the issue persists, leaving me searching for further troubleshooting steps or insights.
This is the waveform I get from the ILA connected to the to_ps AXIS interface of the HLS core:
start:
Thank you in advance for any assistance you can provide!
-
Hey everyone,
I'm currently working on a project that involves an Analog-to-Digital Converter (ADC) and a High-Level Synthesis (HLS) core in an FPGA environment. My goal is to streamline the data processing pipeline by having the ADC stream its data directly to the HLS core, bypassing the usual step of writing the data to memory through the S2MM (Stream to Memory-Mapped) interface.
From what I understand, using the S2MM interface to first write ADC data to memory before it's processed by an HLS core is a common approach. However, for my application, this adds unnecessary latency and complexity, especially since the data doesn't need to be stored but rather processed in real-time by the HLS core.
Does anyone have experience or insights on configuring the ADC to stream data directly to an HLS core?
-
On 4/27/2022 at 6:36 PM, RyanW said:
Thank you. I took the advice and wrote my own drivers for this kind of thing. Perhaps I just didn't understand how to use the Xilinx provided ones, but direct transfer mode is fairly simple when you lay it out like that. I know I had read the programming sequence in the docs, but I figured it was just handled in the simple transfer function which wouldn't allow consecutive transfers as it checks if the DMA has been started before already. I had thought that the DMA would de-assert back to a halted state, but seems this is not the case.
Thank you everyone for helping me clear this up. I would like to select everyone as best answer, but I can't, so I'll just go with the last one in the progression.
I'm currently facing a similar challenge, but attempting to establish a DMA transfer from PL to PS. Would you be willing to share the code snippets or more detailed steps you followed to solve your issue? Any additional insights on managing the DMA operations effectively would be greatly appreciated!
Thanks in advance for your assistance!
HLS Core Stalling During DMA Operation on Eclypse Z7
in FPGA
Posted
Hi @artvvb
Thanks for your reply.
Indeed, you've got the core concept correct. The HLS IP continuously reads data from the ZmodScopeController until a signal on one of the two channels surpasses a predefined threshold. Once this condition is met, it triggers the episode_processing function to execute N_STEPS times. In each iteration, the HLS IP sends a distinct pulse to the ZmodAWGController and forwards the next 2000 samples from the input to the Processing System (PS) via DMA. The use of the internal stream was just a desperate attempt (😁) to make it work, same for the external FIFO. Do they make sense?
The repetition of episode_processing N_STEPS times is only related to this simplified version I'm using for debugging. In reality, the IP conducts specific data processing on the inputs received from the ADC. It then determines the appropriate outputs to send to the DAC based on this analysis. Each of these operations must occur sequentially, directly after the preceding one, upon exceeding the threshold, hence the multiple iterations.
I'm not sure I understood everything here. I've implemented this function to try to check the DMA register; is this what you meant?
I also introduced a GPIO IP to check the control signals of the ADC and DAC. This is the full code:
And this is what I see as output:
Let's start! Control signals Before initializing the HLS ip. Control Signals State: sInitDoneDAC: 1 sConfigError: 0 sRstBusy: 0 sInitDoneADC: 1 sConfigError: 0 sInitDoneRelay: 1 sDataOverflow: 1 Checking DMA Status... DMA Status Register: 0x1 DMA is halted. DMA is active. -----------------------------------. Successfully configured HLS core. TH set to: 500 Control signals Before Starting the HLS ip. Control Signals State: sInitDoneDAC: 1 sConfigError: 0 sRstBusy: 0 sInitDoneADC: 1 sConfigError: 0 sInitDoneRelay: 1 sDataOverflow: 1 Checking DMA Status... DMA Status Register: 0x1 DMA is halted. DMA is active. -----------------------------------. Bytes to read: 16 Ready to go Control signals After Starting the HLS ip. Control Signals State: sInitDoneDAC: 1 sConfigError: 0 sRstBusy: 0 sInitDoneADC: 1 sConfigError: 0 sInitDoneRelay: 1 sDataOverflow: 1 Checking DMA Status... DMA Status Register: 0x1 DMA is halted. DMA is active. -----------------------------------. HLS is done. Control signals After the HLS ip is done. Control Signals State: sInitDoneDAC: 1 sConfigError: 0 sRstBusy: 0 sInitDoneADC: 1 sConfigError: 0 sInitDoneRelay: 1 sDataOverflow: 1 Checking DMA Status... DMA Status Register: 0x0 DMA is running. DMA is active. -----------------------------------.
The ADC is indeed overflowing, and the DMA status register initially reports 'halted' and then stays 'running' after I started the HLS IP. However, I'm not sure whether this is the expected behavior.
Any insights or further recommendations you could provide would be greatly appreciated!