File dsp.hpp




typedef struct rx_slice rx_slice


inline void throw_on_cuda_error(cudaError_t code, const char *file, int line)
std::vector<cudaDeviceProp> get_gpu_properties()

Gets the properties of each GPU in the system.


The gpu properties.

void print_gpu_properties(std::vector<cudaDeviceProp> gpu_properties)

Prints the properties of each cudaDeviceProp in the vector.

More info on properties and calculations here:


gpu_properties[in] A vector of cudaDeviceProp structs.

void postprocess(DSPCore *dp)
struct rx_slice
#include <dsp.hpp>

Public Functions

inline rx_slice(double rx_freq, uint32_t slice_id, uint32_t num_ranges, uint32_t beam_count, float first_range, float range_sep, uint32_t tau_spacing)

Public Members

double rx_freq
uint32_t slice_id
uint32_t num_ranges
uint32_t beam_count
float first_range
float range_sep
uint32_t tau_spacing
std::vector<std::string> antenna_names
std::vector<lag> lags
struct lag
#include <dsp.hpp>

Public Functions

inline lag(uint32_t pulse_1, uint32_t pulse_2, uint32_t lag_num)

Public Members

uint32_t pulse_1
uint32_t pulse_2
uint32_t lag_num
class DSPCore
#include <dsp.hpp>

Contains the core DSP work done on the GPU.

Public Functions

void cuda_postprocessing_callback(uint32_t total_antennas, std::vector<std::string> antenna_strings, uint32_t num_samples_rf, std::vector<uint32_t> samples_per_antenna, std::vector<uint32_t> total_output_samples)

Add the postprocessing callback to the stream.

This function allocates the host space needed for filter stage data and then copies the data from the GPU into the allocated space. Certain DSPCore members needed for postprocessing are assigned, such as the rx freqs, the number of rf samples, the total antennas, and the vector of samples per antenna (one element for each stage).

void initial_memcpy_callback()

Adds the callback to the CUDA stream to acknowledge the RF samples have been copied.

explicit DSPCore(zmq::context_t &context, SignalProcessingOptions &options, uint32_t sq_num, double rx_rate, double output_sample_rate, std::vector<std::vector<float>> filter_taps, std::vector<cuComplex> beam_phases, double driver_initialization_time, double sequence_start_time, std::vector<uint32_t> dm_rates, std::vector<rx_slice> slice_info)

Initializes the parameters needed in order to do asynchronous DSP processing.

The constructor creates a new CUDA stream and initializes the timing events. It then opens the shared memory with the received RF samples for a pulse sequence.

  • context – ZMQ’s application context from which to create sockets.

  • sig_options – The signal processing options.

  • sequence_num[in] The pulse sequence number that will be acknowledged.

  • rx_rate[in] The USRP sampling rate.

  • output_sample_rate[in] The final decimated output sample rate.

  • filter_taps[in] The filter taps for each stage.

  • beam_phases[in] The beam phases.

  • driver_initialization_time[in] The driver initialization time.

  • sequence_start_time[in] The sequence start time.

  • dm_rates[in] The decimation rates.

  • slice_info[in] The slice info given as a vector of rx_slice structs.


Frees all associated pointers, events, and streams. Removes and deletes shared memory.

void allocate_and_copy_frequencies(void *freqs, uint32_t num_freqs)

Allocates device memory for the filtering frequencies and then copies them to device.

  • freqs – A pointer to the filtering freqs.

  • num_freqs[in] The number of freqs.

void allocate_and_copy_rf_samples(uint32_t total_antennas, uint32_t num_samples_needed, int64_t extra_samples, uint32_t offset_to_first_pulse, double time_zero, double start_time, uint64_t ringbuffer_size, std::vector<cuComplex*> &ringbuffer_ptrs_start)

Allocates device memory for the RF samples and then copies them to device.

Samples are stored in a shared memory ringbuffer. This function calculates where to index into the ringbuffer for samples and copies them to the GPU. This function will also copy the samples to a shared memory section that data write, or another process, can access in order to work with the raw RF samples.

  • total_antennas[in] The total number of antennas.

  • num_samples_needed[in] The number of samples needed from each antenna ringbuffer.

  • extra_samples[in] The number of extra samples needed for filter propagation.

  • offset_to_first_pulse[in] Offset from sequence start to center of first pulse.

  • time_zero[in] The time the driver began collecting samples. seconds since epoch.

  • start_time[in] The start time of the pulse sequence. seconds since epoch.

  • ringbuffer_size[in] The ringbuffer size in number of samples.

  • ringbuffer_ptrs_start – A vector of pointers to the start of each antenna ringbuffer.

void allocate_and_copy_bandpass_filters(void *taps, uint32_t total_taps)

Allocate and copy bandpass filters for all rx freqs to gpu.

  • taps – A pointer to the filter taps.

  • total_taps[in] The total amount of filter taps.

std::vector<cuComplex*> get_filter_outputs_h()

Gets the vector of host side filter outputs.


The filter outputs host vector.

cuComplex *get_last_filter_output_d()

Gets the device pointer to the last filter output.


The device pointer to the last filter output.

std::vector<cuComplex*> get_lowpass_filters_d()
cuComplex *get_last_lowpass_filter_d()

Gets the last pointer stored in the lowpass filters vector.


The last lowpass filter pointer inserted into the vector.

std::vector<uint32_t> get_samples_per_antenna()

Gets the samples per antenna vector. Vector contains an element for each stage.


The samples per antenna vector.

std::vector<uint32_t> get_dm_rates()

Gets the vector of decimation rates.


The dm rates.

cuComplex *get_bp_filters_p()

Gets the bandpass filters device pointer.


The bandpass filter pointer.

void allocate_and_copy_lowpass_filter(void *taps, uint32_t total_taps)

Allocate and copy a lowpass filter to the gpu.

  • taps – A pointer to the filter taps.

  • total_taps[in] The total amount of filter taps.

void allocate_output(uint32_t num_output_samples)

Allocate a filter output on the GPU.


num_output_samples[in] The number of output samples.

std::vector<std::vector<float>> get_filter_taps()

Gets the vector containing vectors of filter taps for each stage.


The filter taps vectors for each stage.

uint32_t get_num_antennas()

Gets the number of antennas.


The number of antennas.

std::vector<std::string> get_antenna_names()

Gets the names of the antennas.


The names of all antennas.

float get_total_timing()

Gets the total GPU process timing in milliseconds.


The total process timing.

float get_decimate_timing()

Gets the total decimation timing in milliseconds.


The decimation timing.

void allocate_and_copy_host(uint32_t num_output_samples, cuComplex *output_d)

Allocate a host pointer for decimation stage output and then copy data.

  • num_output_samples[in] The number of output samples needed.

  • output_d – The device pointer from which to copy.

void clear_device_and_destroy()
cuComplex *get_rf_samples_p()

Gets the device pointer to the RF samples.


The RF samples device pointer.

std::vector<cuComplex> get_rf_samples_h()

Gets the host side vector of RF samples.


The host side vector of RF samples.

double *get_frequencies_p()

Gets the device pointer to the receive frequencies.


The frequencies device pointer.

uint32_t get_num_rf_samples()

Gets the number of rf samples.


The number of rf samples.

uint32_t get_sequence_num()

Gets the sequence number.


The sequence number.

double get_rx_rate()

Gets the rx sample rate.


The rx sampling rate (samples per second).

double get_output_sample_rate()

Gets the output sample rate.


The output decimated and filtered rate (samples per second).

double get_driver_initialization_time()

Gets the driver initialization timestamp.


The driver initialization timestamp.

double get_sequence_start_time()

Gets the sequence start timestamp.


The sequence start timestamp.

std::vector<rx_slice> get_slice_info()

Gets the vector of slice information, rx_slice structs.


The vector of rx_slice structs with slice information.

cudaStream_t get_cuda_stream()

Gets the CUDA stream this DSPCore’s work is associated with.


The CUDA stream.

std::vector<cuComplex> get_beam_phases()

Gets the vector of beam phases.


The beam phases.

std::string get_shared_memory_name()

Gets the name of the shared memory section.


The shared memory name string.

void start_decimate_timing()

Starts the timing before the GPU kernels execute.

void stop_timing()

Stops the timers that the constructor starts.

void send_ack()

Sends the acknowledgment to the radar control that the RF samples have been transferred.

RF samples of one pulse sequence can be transferred asynchronously while samples of another are being processed. This means that it is possible to start running a new pulse sequence in the driver as soon as the samples are copied. The asynchronous nature means the only timing constraint is the time needed to run the GPU kernels for decimation.

void send_timing()

Sends the GPU kernel timing to the radar control.

The timing here is used as a rate limiter, so that the GPU doesn’t become backlogged with data. If the GPU is overburdened, this will result in fewer averages, but the system won’t crash.

void send_processed_data(processeddata::ProcessedData &pd)

Sends a processed data packet to data write.


pd – A processeddata protobuf object.

Public Members

SignalProcessingOptions sig_options
Filtering *dsp_filters

Private Functions

void allocate_and_copy_rf_from_device(uint32_t num_rf_samples)

Private Members

cudaStream_t stream

CUDA stream the work will be associated with.

uint32_t sequence_num

Sequence number used to identify and acknowledge a pulse sequence.

double rx_rate

Rx sampling rate for the data being processed.

double output_sample_rate

Output sampling rate of the filtered, decimated, processed data.

std::vector<zmq::socket_t> zmq_sockets

The unique sockets for communicating between processes.

float total_process_timing_ms

Stores the total GPU process timing once all the work is done.

float decimate_kernel_timing_ms

Stores the decimation timing.

double *freqs_d

Pointer to the device rx frequencies.

cuComplex *rf_samples_d

Pointer to the RF samples on device.

cuComplex *bp_filters_d

Pointer to the first stage bandpass filters on device.

std::vector<cuComplex*> lp_filters_d

Vector of device side lowpass filter pointers.

std::vector<cuComplex*> filter_outputs_d

Vector of device side filter output pointers.

std::vector<cuComplex*> filter_outputs_h

Vector of host side filter output pointers.

std::vector<uint32_t> samples_per_antenna

Vector of the samples per antenna at each stage of decimation.

std::vector<uint32_t> dm_rates

Vector of decimation rates at each stage.

std::vector<std::vector<float>> filter_taps

Vector that holds the vectors of filter taps at each stage.

cudaEvent_t initial_start

CUDA event to timestamp when the GPU processing begins.

cudaEvent_t kernel_start

CUDA event to timestamp when the kernels begin executing.

cudaEvent_t stop

CUDA event to timestamp when the GPU processing stops.

cudaEvent_t mem_transfer_end

CUDA event to timestamp the transfer of RF samples to the GPU.

float mem_time_ms

Stores the memory transfer timing.

std::vector<cuComplex*> ringbuffers

A vector of pointers to the start of ringbuffers.

std::vector<cuComplex> rf_samples_h

A host side vector for the rf samples.

uint32_t num_antennas

The number of total antennas.

std::vector<std::string> antenna_names

The name of each antenna.

uint32_t num_rf_samples

The number of rf samples per antenna.

std::vector<cuComplex> beam_phases

A set of beam angle phases for each beam direction.

SharedMemoryHandler shm

A handler for a shared memory section.

double driver_initialization_time

Timestamp of when the driver began sampling. Seconds since epoch.

double sequence_start_time

Timestamp of when the sequence began. Seconds since epoch.

std::vector<rx_slice> slice_info

Slice information given from rx_slice structs.