Hi,
I already tried that too. Let me share a code snippet where I see some of this strange behavior. In the code below, the eight separate memcpy_read_dma calls are significantly faster than wrapping everything in one single DMA call — and by "significantly faster" I mean 1400 ms per execution cycle of my algorithm versus 1600 ms.
/* NOTE(review): truncated snippet — the "for (int y ...)" loop opened below is
 * never closed here ("and so on ..."), so this fragment does not compile as-is.
 *
 * Prefetch the first 8 rows into the lifting cache, one DMA transfer per row.
 * Rows 0..6 are queued asynchronously with EARLY_DONE; row 7 uses the blocking
 * variant so all queued descriptors have drained before the lifting pass runs.
 * assumes c_off(row, col, width) yields an alt_16* into the lifting cache and
 * that &(synth->pixel) advances per transfer on the source side — TODO confirm */
memcpy_read_dma_async(c_off(0,0,synth->width), &(synth->pixel),synth->width*sizeof(alt_16), DESCRIPTOR_CONTROL_EARLY_DONE_ENABLE_MASK);
memcpy_read_dma_async(c_off(1,0,synth->width), &(synth->pixel),synth->width*sizeof(alt_16), DESCRIPTOR_CONTROL_EARLY_DONE_ENABLE_MASK);
memcpy_read_dma_async(c_off(2,0,synth->width), &(synth->pixel),synth->width*sizeof(alt_16), DESCRIPTOR_CONTROL_EARLY_DONE_ENABLE_MASK);
memcpy_read_dma_async(c_off(3,0,synth->width), &(synth->pixel),synth->width*sizeof(alt_16), DESCRIPTOR_CONTROL_EARLY_DONE_ENABLE_MASK);
memcpy_read_dma_async(c_off(4,0,synth->width), &(synth->pixel),synth->width*sizeof(alt_16), DESCRIPTOR_CONTROL_EARLY_DONE_ENABLE_MASK);
memcpy_read_dma_async(c_off(5,0,synth->width), &(synth->pixel),synth->width*sizeof(alt_16), DESCRIPTOR_CONTROL_EARLY_DONE_ENABLE_MASK);
memcpy_read_dma_async(c_off(6,0,synth->width), &(synth->pixel),synth->width*sizeof(alt_16), DESCRIPTOR_CONTROL_EARLY_DONE_ENABLE_MASK);
/* Blocking transfer: uses the transfer-complete IRQ to wait until done. */
memcpy_read_dma(c_off(7,0,synth->width), &(synth->pixel),synth->width*sizeof(alt_16));
/* Vertical lifting across the freshly fetched rows — presumably an inverse
 * wavelet (synthesis) lifting step; verify against the full algorithm. */
for(int n = 0; n < synth->width; n++)
{
// lift 2's
/*c_off(0,n,synth->width) -= HWLIFT2(*c_off(1,n,synth->width), *c_off(1,n,synth->width));
*c_off(2,n,synth->width) -= HWLIFT2(*c_off(1,n,synth->width), *c_off(3,n,synth->width));
*c_off(4,n,synth->width) -= HWLIFT2(*c_off(3,n,synth->width), *c_off(5,n,synth->width));*/
/* Row 1 appears twice in the first update — presumably symmetric boundary
 * extension at the top edge; TODO confirm. */
*c_off(0,n,synth->width) -= ((2 + *c_off(1,n,synth->width) + *c_off(1,n,synth->width)) >> 2);
*c_off(2,n,synth->width) -= ((2 + *c_off(1,n,synth->width) + *c_off(3,n,synth->width)) >> 2);
*c_off(4,n,synth->width) -= ((2 + *c_off(3,n,synth->width) + *c_off(5,n,synth->width)) >> 2);
// lift 3's
*c_off(1,n,synth->width) += ((8 - *c_off(0,n,synth->width) + 9*(*c_off(0,n,synth->width))
+ 9*(*c_off(2,n,synth->width)) - (*c_off(4,n,synth->width))) >> 4);
}
/* Stream the remaining rows two at a time into the lifting cache, which is
 * used as a ring buffer of MEM_LIFTCACHE_DEPTH rows (hence the modulo).
 * The "-4" guard stops fetching 4 iterations before the end — presumably
 * because the tail rows are already resident; TODO confirm. */
for (int y = 0; y < synth->height/2; y++)
{
if(y < (synth->height/2-4))
{
memcpy_read_dma(c_off((8 + 2*y)%MEM_LIFTCACHE_DEPTH,0,synth->width), &(synth->pixel),synth->width*sizeof(alt_16));
memcpy_read_dma(c_off((9 + 2*y)%MEM_LIFTCACHE_DEPTH,0,synth->width), &(synth->pixel),synth->width*sizeof(alt_16));
}
// and so on ...
My memcpy_read_dma is just a simple wrapper around writing a DMA descriptor:
/* Completion ISR for the mSGDMA read dispatcher: counts one completed
 * IRQ-flagged descriptor, then acknowledges the interrupt so it can
 * re-fire for the next descriptor.
 * NOTE(review): read_isr_fired is shared with the foreground spin in
 * memcpy_read_dma_compl(); its definition is not visible in this snippet,
 * but it must be declared volatile (or _Atomic) or the compiler may hoist
 * the foreground read out of the spin loop — TODO confirm. */
static void sgdma_read_complete_isr(void * context)
{
read_isr_fired++;
clear_irq(MSGDMA_DISPATCHER_READ_CSR_BASE);
}
/*
 * Queue one memory-to-memory read transfer on the mSGDMA dispatcher
 * without waiting for it to finish.
 *
 * dest / src / size describe the copy; control_bits selects the descriptor
 * control flags (e.g. early-done or transfer-complete IRQ). Blocks only
 * while the dispatcher's descriptor FIFO is full.
 */
void memcpy_read_dma_async(void* dest, void* src, alt_u32 size, unsigned long control_bits)
{
    sgdma_standard_descriptor desc;

    /* Wait for a free slot in the dispatcher's descriptor buffer. */
    while ((RD_CSR_STATUS(MSGDMA_DISPATCHER_READ_CSR_BASE) & CSR_DESCRIPTOR_BUFFER_FULL_MASK) != 0) {
        /* spin */
    }

    construct_standard_mm_to_mm_descriptor(&desc, (alt_u32 *)src, (alt_u32 *)dest,
                                           size, control_bits);
    write_standard_descriptor(MSGDMA_DISPATCHER_READ_CSR_BASE,
                              MSGDMA_DISPATCHER_READ_DESCRIPTOR_SLAVE_BASE,
                              &desc);
}
/*
 * Block until at least one IRQ-flagged read descriptor has completed.
 *
 * Fix: consume exactly one completion (decrement) instead of resetting the
 * counter to zero. The old `read_isr_fired = 0` raced with
 * sgdma_read_complete_isr(): a completion arriving between the spin exiting
 * and the reset was silently discarded, so a subsequent wait could deadlock.
 * With a single outstanding IRQ descriptor (the current usage) the observable
 * behavior is unchanged.
 *
 * NOTE(review): read_isr_fired is written from interrupt context; its
 * definition (not visible in this snippet) must be volatile or the compiler
 * may hoist the read out of the spin loop — TODO confirm.
 */
void memcpy_read_dma_compl()
{
    while (read_isr_fired == 0) {
        /* spin until the read-dispatcher ISR signals completion */
    }
    read_isr_fired--; /* consume exactly one completion event */
}
/*
 * Blocking memory-to-memory DMA copy: queue one descriptor with the
 * transfer-complete IRQ enabled, then wait for the completion ISR.
 */
void memcpy_read_dma(void* dest, void* src, alt_u32 size)
{
    const unsigned long ctrl = DESCRIPTOR_CONTROL_TRANSFER_COMPLETE_IRQ_MASK;

    memcpy_read_dma_async(dest, src, size, ctrl);
    memcpy_read_dma_compl(); /* spin until the ISR reports completion */
}