I am using an STM32F103C8T6 running at 48 MHz with ADC DMA, and I find that using a for loop to copy arrays is much slower than writing the assignments out as separate lines of code.
For example:
void HAL_ADC_ConvCpltCallback(ADC_HandleTypeDef* hadc) {
    int i = 0;
    HAL_GPIO_WritePin(GPIOC, GPIO_PIN_14, 1u);
    for (i = 0; i < 11; i++) {
        adc_read[i] = adc_buffer[i];
    }
    HAL_GPIO_WritePin(GPIOC, GPIO_PIN_14, 0u);
}
vs:
void HAL_ADC_ConvCpltCallback(ADC_HandleTypeDef* hadc) {
    HAL_GPIO_WritePin(GPIOC, GPIO_PIN_14, 1u);
    adc_read[0] = adc_buffer[0];
    adc_read[1] = adc_buffer[1];
    adc_read[2] = adc_buffer[2];
    adc_read[3] = adc_buffer[3];
    adc_read[4] = adc_buffer[4];
    adc_read[5] = adc_buffer[5];
    adc_read[6] = adc_buffer[6];
    adc_read[7] = adc_buffer[7];
    adc_read[8] = adc_buffer[8];
    adc_read[9] = adc_buffer[9];
    adc_read[10] = adc_buffer[10];
    HAL_GPIO_WritePin(GPIOC, GPIO_PIN_14, 0u);
}
The for loop takes 7.9 μs while the direct assignments take 3.58 μs, as seen on an oscilloscope. The GPIO state change takes 1.26 μs, so effectively the for loop takes 6.64 μs while the direct assignments take 2.32 μs to execute (roughly a third of the time).
Is there any way to speed up the for loop? For my final program I'd like to add the values to a variable which is then averaged to give me a filtered output.
-----------------EDIT-1-------------------------
Personal note - Also if the question doesn't make it clear, I am more of a hardware guy than a software guy. I expected the for loop to be about 10, 20 or 30% slower, never thought it could be 3x slower. I may never use for again :D.
All data is 16-bit (int16_t).
The assembly code is below.
08000246: bl 0x8002c96 <HAL_GPIO_TogglePin>
0800024a: movs r3, #0
0800024c: str r3, [r7, #12]
61 for (i = 0; i < 11; i++) // starts here
0800024e: movs r3, #0
08000250: str r3, [r7, #12]
08000252: b.n 0x800026a <HAL_ADC_ConvCpltCallback+54>
64 adc_read[i]=adc_buffer[i];
08000254: ldr r3, [pc, #52] ; (0x800028c <HAL_ADC_ConvCpltCallback+88>)
08000256: ldr r2, [r7, #12]
08000258: lsls r2, r2, #1
0800025a: ldrh r1, [r2, r3]
0800025c: ldr r3, [pc, #48] ; (0x8000290 <HAL_ADC_ConvCpltCallback+92>)
0800025e: ldr r2, [r7, #12]
08000260: lsls r2, r2, #1
08000262: strh r1, [r2, r3]
61 for (i = 0; i < 11; i++)
08000264: ldr r3, [r7, #12]
08000266: adds r3, #1
08000268: str r3, [r7, #12]
0800026a: ldr r3, [r7, #12]
0800026c: cmp r3, #10
0800026e: ble.n 0x8000254 <HAL_ADC_ConvCpltCallback+32> // ends here
97 HAL_GPIO_WritePin(GPIOC, GPIO_PIN_14,0u);
And the disassembly of the unrolled version:
54 HAL_GPIO_TogglePin(GPIOC, GPIO_PIN_14);
0800023c: movs r3, #128 ; 0x80
0800023e: lsls r3, r3, #7
08000240: ldr r2, [pc, #116] ; (0x80002b8 <HAL_ADC_ConvCpltCallback+132>)
08000242: movs r1, r3
08000244: movs r0, r2
08000246: bl 0x8002cc6 <HAL_GPIO_TogglePin>
70 adc_read[0]=adc_buffer[0];
0800024a: ldr r3, [pc, #112] ; (0x80002bc <HAL_ADC_ConvCpltCallback+136>)
0800024c: ldrh r2, [r3, #0]
0800024e: ldr r3, [pc, #112] ; (0x80002c0 <HAL_ADC_ConvCpltCallback+140>)
08000250: strh r2, [r3, #0]
71 adc_read[1]=adc_buffer[1];
08000252: ldr r3, [pc, #104] ; (0x80002bc <HAL_ADC_ConvCpltCallback+136>)
08000254: ldrh r2, [r3, #2]
08000256: ldr r3, [pc, #104] ; (0x80002c0 <HAL_ADC_ConvCpltCallback+140>)
08000258: strh r2, [r3, #2]
72 adc_read[2]=adc_buffer[2];
0800025a: ldr r3, [pc, #96] ; (0x80002bc <HAL_ADC_ConvCpltCallback+136>)
0800025c: ldrh r2, [r3, #4]
0800025e: ldr r3, [pc, #96] ; (0x80002c0 <HAL_ADC_ConvCpltCallback+140>)
08000260: strh r2, [r3, #4]
73 adc_read[3]=adc_buffer[3];
08000262: ldr r3, [pc, #88] ; (0x80002bc <HAL_ADC_ConvCpltCallback+136>)
08000264: ldrh r2, [r3, #6]
08000266: ldr r3, [pc, #88] ; (0x80002c0 <HAL_ADC_ConvCpltCallback+140>)
08000268: strh r2, [r3, #6]
74 adc_read[4]=adc_buffer[4];
0800026a: ldr r3, [pc, #80] ; (0x80002bc <HAL_ADC_ConvCpltCallback+136>)
0800026c: ldrh r2, [r3, #8]
0800026e: ldr r3, [pc, #80] ; (0x80002c0 <HAL_ADC_ConvCpltCallback+140>)
08000270: strh r2, [r3, #8]
75 adc_read[5]=adc_buffer[5];
08000272: ldr r3, [pc, #72] ; (0x80002bc <HAL_ADC_ConvCpltCallback+136>)
08000274: ldrh r2, [r3, #10]
08000276: ldr r3, [pc, #72] ; (0x80002c0 <HAL_ADC_ConvCpltCallback+140>)
08000278: strh r2, [r3, #10]
76 adc_read[6]=adc_buffer[6];
0800027a: ldr r3, [pc, #64] ; (0x80002bc <HAL_ADC_ConvCpltCallback+136>)
0800027c: ldrh r2, [r3, #12]
0800027e: ldr r3, [pc, #64] ; (0x80002c0 <HAL_ADC_ConvCpltCallback+140>)
08000280: strh r2, [r3, #12]
77 adc_read[7]=adc_buffer[7];
08000282: ldr r3, [pc, #56] ; (0x80002bc <HAL_ADC_ConvCpltCallback+136>)
08000284: ldrh r2, [r3, #14]
08000286: ldr r3, [pc, #56] ; (0x80002c0 <HAL_ADC_ConvCpltCallback+140>)
08000288: strh r2, [r3, #14]
78 adc_read[8]=adc_buffer[8];
0800028a: ldr r3, [pc, #48] ; (0x80002bc <HAL_ADC_ConvCpltCallback+136>)
0800028c: ldrh r2, [r3, #16]
0800028e: ldr r3, [pc, #48] ; (0x80002c0 <HAL_ADC_ConvCpltCallback+140>)
08000290: strh r2, [r3, #16]
79 adc_read[9]=adc_buffer[9];
08000292: ldr r3, [pc, #40] ; (0x80002bc <HAL_ADC_ConvCpltCallback+136>)
08000294: ldrh r2, [r3, #18]
08000296: ldr r3, [pc, #40] ; (0x80002c0 <HAL_ADC_ConvCpltCallback+140>)
08000298: strh r2, [r3, #18]
80 adc_read[10]=adc_buffer[10];
0800029a: ldr r3, [pc, #32] ; (0x80002bc <HAL_ADC_ConvCpltCallback+136>)
0800029c: ldrh r2, [r3, #20]
0800029e: ldr r3, [pc, #32] ; (0x80002c0 <HAL_ADC_ConvCpltCallback+140>)
080002a0: strh r2, [r3, #20]
97 HAL_GPIO_WritePin(GPIOC, GPIO_PIN_14,0u);
6 Answers
Speaking about performance without mentioning how you compile the code is kind of pointless. It shouldn't be a mind-blowing realization that optimized code runs faster than code with optimization disabled...
That being said, there is only so much the optimizer can do when hardware registers or DMA buffers are involved. All of these are (or ought to be) volatile, and that effectively blocks many optimizations. It can, for example, block function inlining, which is a common one.
In your specific example, loop unrolling would be one relevant optimization. But compilers usually follow some scheme for when to do this, like up to 8 objects or similar (likely configurable through some option), because loop unrolling is a speed-over-program-size optimization.
We may note that your code doesn't use memcpy, which is often recommended to get the fastest possible code. Modern compilers replace memcpy with whatever is fastest for the given data and target. However, using memcpy on volatile-qualified data isn't well-defined, since that would involve poorly-defined behavior when "casting away" volatile. In theory, that could lead to all manner of unexpected bugs. So based on that, a manual loop might not be as naive as it seems.
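For illustration only, this is the kind of memcpy call being discussed, using the buffer names from the question; the casts are what discards the volatile qualifier, which is exactly the poorly-defined part:

#include <string.h>
#include <stdint.h>

volatile int16_t adc_buffer[11];   /* DMA target */
volatile int16_t adc_read[11];

void copy_with_memcpy(void)
{
    /* The casts discard volatile, so the compiler is then free to
     * reorder or merge the individual accesses inside memcpy. */
    memcpy((void *)adc_read, (const void *)adc_buffer, sizeof adc_read);
}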
However, hard-copying data in bare-metal embedded applications is often best avoided and replaced with pointer swaps. In a situation like a buffer getting filled from an ISR, a hard copy is almost certainly the wrong thing to do; you should just swap pointers to buffers ("double buffering"), so that the ISR and the background program never work on the same one.
With DMA, you could re-route the DMA to use another buffer while you work on the present one. Getting DMA working is often a headache though, so most would just take the slight performance hit and hard-copy the data.
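A minimal sketch of the pointer-swap idea; the names, buffer size and flag handling here are made up for illustration, not taken from the question:

#include <stdint.h>
#include <stdbool.h>

#define N_SAMPLES 11

static int16_t buf_a[N_SAMPLES], buf_b[N_SAMPLES];

/* The ISR fills fill_buf; the main loop reads ready_buf.
 * The pointers themselves are volatile because both contexts use them. */
static int16_t * volatile fill_buf  = buf_a;
static int16_t * volatile ready_buf = buf_b;
static volatile bool data_ready = false;

void conversion_complete_isr(void)      /* called when fill_buf is full */
{
    /* Swap the two pointers instead of copying N_SAMPLES values. */
    int16_t *tmp = ready_buf;
    ready_buf = fill_buf;
    fill_buf  = tmp;
    data_ready = true;
}

void process_samples(void)              /* called from the main loop */
{
    if (data_ready) {
        data_ready = false;
        /* read ready_buf[0..N_SAMPLES-1] here; the ISR keeps filling
         * the other buffer in the meantime */
    }
}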
- Might be worth mentioning half-complete interrupts as a way to work on half the buffer while the other is updating. This should spread out the load a bit with little complexity added. (William, Mar 6, 2024)
- @William Yeah, can be done, and that's usually how DMA works too. But swapping pointers to buffers is super easy and with no added complexity, so if one can afford some 20-100 bytes or whatever for an additional buffer, that's what I would strongly recommend. (Lundin, Mar 7, 2024)
Is there any way to speed up the for loop? For my final program I'd like to add the values to a variable which is then averaged to give me a filtered output.
When in doubt, always check the compiler optimization settings and the generated assembly code.
I think the default is to optimize for size, to generate the smallest code. On simple microcontrollers this is often also faster, because smaller code means fewer instructions, which execute faster than more instructions. An exception is loops.
In this case the for() loop will be compiled into the usual: do the useful code inside, increment the counter, test for end of loop, jump back to the beginning.
When you manually unroll the loop, you keep only the useful stuff and get rid of the for() loop management, so it's faster.
If you use -O2 it may decide to unroll the loop itself.
That said, you do not need to copy your ADC data at all. I think the ADC DMA supports double buffering on this micro, so you can use that instead of copying. If you want to keep the old samples and avoid overwriting them with new ADC reads, then instead of copying them out of the DMA buffer, simply change the address of the DMA buffer so the new samples are written somewhere else.
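A rough, untested sketch of that idea with the STM32 HAL, assuming normal (not circular) DMA mode and two alternating buffers; whether the DMA can be restarted directly from this callback depends on your HAL version and setup:

#include <stdint.h>
#include "stm32f1xx_hal.h"               /* assumed HAL header */

#define N_SAMPLES 11

static uint16_t buf_a[N_SAMPLES], buf_b[N_SAMPLES];
static uint16_t *active = buf_a;         /* buffer the DMA writes into */

void HAL_ADC_ConvCpltCallback(ADC_HandleTypeDef *hadc)
{
    uint16_t *full = active;             /* just-filled buffer */
    active = (active == buf_a) ? buf_b : buf_a;

    /* Point the next round of conversions at the other buffer. */
    HAL_ADC_Stop_DMA(hadc);
    HAL_ADC_Start_DMA(hadc, (uint32_t *)active, N_SAMPLES);

    /* full[0..N_SAMPLES-1] now holds stable samples to process. */
    (void)full;
}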
If the goal is to average samples for more accuracy... STM32G0 supports that in hardware, but STM32F1 does not. So if you want maximum speed you'll just have to write
sum = adc_buffer[0] + adc_buffer[1] + ...
Note that if you have many samples you don't need to unroll the whole loop. You can sum, say, 8 values and wrap that in a for() loop; that way the loop overhead is paid once per 8 values instead of once per value.
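A sketch of that partial unrolling, assuming a hypothetical sample count that is a multiple of 8:

#include <stdint.h>

#define N_SAMPLES 64                     /* assumed: multiple of 8 */

volatile uint16_t adc_buffer[N_SAMPLES];

uint32_t sum_samples(void)
{
    uint32_t sum = 0;

    /* Loop overhead (increment, compare, branch) is paid once per
     * 8 samples instead of once per sample. */
    for (int i = 0; i < N_SAMPLES; i += 8) {
        sum += adc_buffer[i]     + adc_buffer[i + 1]
             + adc_buffer[i + 2] + adc_buffer[i + 3]
             + adc_buffer[i + 4] + adc_buffer[i + 5]
             + adc_buffer[i + 6] + adc_buffer[i + 7];
    }
    return sum;                          /* divide by N_SAMPLES to average */
}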
The question, like many other such questions, lacks detail. To talk about short code benchmarks in a way that won't waste your own time first, and our time second, you should make a self-contained example on Godbolt.
Your code looks like every other non-optimized output would look. The code is meant to look very similar to the C source so that it'll be easy to debug. Optimization often makes debugging harder - but that's mostly because debuggers are somewhat braindead in their understanding of the code relative to the compiler proper, not because there's anything wrong with optimization.
Here is the code I've tried:
#include <stdint.h>
typedef struct {} ADC_HandleTypeDef;
void HAL_GPIO_WritePin(int, ...);
enum {GPIOC, GPIO_PIN_14};
volatile int16_t adc_read[12];
volatile int16_t adc_buffer[12];
void HAL_ADC_ConvCpltCallback(ADC_HandleTypeDef* hadc) {
    HAL_GPIO_WritePin(GPIOC, GPIO_PIN_14, 1u);
    for (int i = 0; i < 11; i++) {
        adc_read[i] = adc_buffer[i];
    }
    HAL_GPIO_WritePin(GPIOC, GPIO_PIN_14, 0u);
}
Below is what ARM gcc 11.2.1 (none) produced with -O3 optimization level. As you can see, the loop is already unrolled, and each 16-bit value is transferred using an individual 16-bit read and write:
HAL_ADC_ConvCpltCallback:
mov r2, #1
push {r4, lr}
mov r1, r2
mov r0, #0
bl HAL_GPIO_WritePin
ldr r3, .L4
ldrh r0, [r3]
lsl r0, r0, #16
asr r0, r0, #16
strh r0, [r3, #24] @ movhi
ldrh r0, [r3, #2]
lsl r0, r0, #16
asr r0, r0, #16
strh r0, [r3, #26] @ movhi
ldrh r0, [r3, #4]
lsl r0, r0, #16
asr r0, r0, #16
strh r0, [r3, #28] @ movhi
ldrh r0, [r3, #6]
lsl r0, r0, #16
asr r0, r0, #16
strh r0, [r3, #30] @ movhi
ldrh r0, [r3, #8]
lsl r0, r0, #16
asr r0, r0, #16
strh r0, [r3, #32] @ movhi
ldrh ip, [r3, #10]
mov r2, #0
lsl ip, ip, #16
asr ip, ip, #16
strh ip, [r3, #34] @ movhi
ldrh ip, [r3, #12]
lsl ip, ip, #16
asr ip, ip, #16
strh ip, [r3, #36] @ movhi
ldrh ip, [r3, #14]
lsl ip, ip, #16
asr ip, ip, #16
strh ip, [r3, #38] @ movhi
ldrh ip, [r3, #16]
lsl ip, ip, #16
asr ip, ip, #16
strh ip, [r3, #40] @ movhi
ldrh ip, [r3, #18]
lsl ip, ip, #16
asr ip, ip, #16
strh ip, [r3, #42] @ movhi
ldrh ip, [r3, #20]
lsl ip, ip, #16
asr ip, ip, #16
mov r1, #1
mov r0, r2
strh ip, [r3, #44] @ movhi
bl HAL_GPIO_WritePin
pop {r4, lr}
bx lr
To copy the data more efficiently, the buffers can be unions, so that copying happens 32 bits at a time, and the data is accessed 16 bits at a time:
#include <stdint.h>
typedef struct {} ADC_HandleTypeDef;
void HAL_GPIO_WritePin(int, ...);
enum {GPIOC, GPIO_PIN_14};
volatile union {
    int32_t raw[6];
    int16_t values[12];
} adc_read, adc_buffer;
void HAL_ADC_ConvCpltCallback(ADC_HandleTypeDef* hadc) {
    HAL_GPIO_WritePin(GPIOC, GPIO_PIN_14, 1u);
    for (int i = 0; i < 6; i++) {
        adc_read.raw[i] = adc_buffer.raw[i];
    }
    HAL_GPIO_WritePin(GPIOC, GPIO_PIN_14, 0u);
}
Now the output is even faster:
HAL_ADC_ConvCpltCallback:
mov r2, #1
push {r4, lr}
mov r1, r2
mov r0, #0
bl HAL_GPIO_WritePin
mov r2, #0
ldr r3, .L4
ldr r0, [r3]
str r0, [r3, #24]
ldr r0, [r3, #4]
str r0, [r3, #28]
ldr r0, [r3, #8]
str r0, [r3, #32]
ldr r0, [r3, #12]
str r0, [r3, #36]
ldr r0, [r3, #16]
str r0, [r3, #40]
ldr r0, [r3, #20]
mov r1, #1
str r0, [r3, #44]
mov r0, r2
bl HAL_GPIO_WritePin
pop {r4, lr}
bx lr
The manually unrolled loop produces identical assembly output, as it should on any decent modern compiler:
void HAL_ADC_ConvCpltCallback(ADC_HandleTypeDef* hadc) {
    HAL_GPIO_WritePin(GPIOC, GPIO_PIN_14, 1u);
    adc_read.raw[0] = adc_buffer.raw[0];
    adc_read.raw[1] = adc_buffer.raw[1];
    adc_read.raw[2] = adc_buffer.raw[2];
    adc_read.raw[3] = adc_buffer.raw[3];
    adc_read.raw[4] = adc_buffer.raw[4];
    adc_read.raw[5] = adc_buffer.raw[5];
    HAL_GPIO_WritePin(GPIOC, GPIO_PIN_14, 0u);
}
But the whole thing is rather moot since you can use DMA for the transfer, and that will be the fastest in all cases.
There are two ways to do it with DMA:
1. Keep the interrupt handler, and just transfer the ADC data using DMA:

       void callback() {
           set_gpio_pin(1u);
           start_dma();
           while (!dma_complete());
           set_gpio_pin(0u);
       }

   The DMA transfer should be set up in the main thread, before any callbacks. start_dma() and dma_complete() should be very small inline functions that just manipulate the DMA control/status bits.
2. Let DMA do the whole job. You'd set up three chained transfers:
   - The first transfer sets the GPIO to 1.
   - The second transfer transfers the ADC data.
   - The third transfer sets the GPIO to 0.
As you have discovered, test-and-branch is slow. You do not say which compiler you are using. If it is ARM GCC, then optimization level 3 (-O3) should automatically unroll loops smaller than 16 cycles. If it is not ARM GCC (e.g. Keil, MSVC), then there will be a directive of the form "#pragma unroll" or "__attribute__((...))" that can be placed just above the loop to instruct the compiler what you would like it to do.
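For example, GCC 8 and later accept an unroll pragma directly above the loop (Keil / Arm Compiler spells it "#pragma unroll"; check your toolchain's manual). A sketch using the buffer names from the question:

#include <stdint.h>

#define N_SAMPLES 11

extern volatile int16_t adc_buffer[N_SAMPLES];   /* as in the question */
extern volatile int16_t adc_read[N_SAMPLES];

void copy_samples_unrolled(void)
{
    /* Hint to GCC that this loop should be fully unrolled. */
    #pragma GCC unroll 11
    for (int i = 0; i < N_SAMPLES; i++) {
        adc_read[i] = adc_buffer[i];
    }
}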
When using gcc, even if one doesn't trust the optimizer, using the register storage class can improve performance enormously.
The following code, for example:
unsigned arr1[8], arr2[8];
void test(void)
{
    register unsigned *p1 = arr1;
    register unsigned *p2 = arr2;
    register unsigned *pe = p1 + 8;
    do
    {
        *p1 += *p2;
        p1++;
        p2++;
    } while (p1 < pe);
}
generates the following loop at optimization level 0:
.L2:
ldr r2, [r4]
ldr r3, [r5]
add r3, r2, r3
str r3, [r4]
add r4, r4, #4
add r5, r5, #4
cmp r4, r6
bcc .L2
Performance could be improved if the compiler could use ldmia and stmia to eliminate the need for the "add x, x, #4" instructions, but it's still massively better than without the register qualifier.
- That's the thing though: register doesn't actually do anything at all at -O2 and -O3, IIRC. So it's nice for -O0, but otherwise useless. And register is effectively not a thing in C++ anyway, so you'd have to resort to implementation-specific keywords. At that point it's IMHO simplest just not to mess with -O0; there's no reason to use it, especially not with C code that compiles super fast anyway. (Kuba hasn't forgotten Monica, Mar 8, 2024)
- @Kubahasn'tforgottenMonica: At higher optimization settings, gcc requires memory barriers to ensure reliably correct semantics for many constructs. Commercial compiler developers are wise enough to recognize that a volatile-qualified write might disturb the value of "ordinary" objects, but the authors of gcc (and clang) would view code that usefully refrains from register-caching objects across volatile writes as "less efficient" than useless code that would use stale values in the absence of a compiler-specific memory barrier. (supercat, Mar 8, 2024)
- Set up pointers to the start of each array.
- Count the for loop index variable down to zero.
- In the loop, copy the values through the pointers and post-increment them both.

This reduces the per-loop operations. But a sensible compiler optimisation setting would likely produce similar code from your original indexed ([i]) version!
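A sketch of that approach, using the buffer names from the question; whether it actually beats the indexed version depends on the compiler and optimization level:

#include <stdint.h>

#define N_SAMPLES 11

extern volatile int16_t adc_buffer[N_SAMPLES];   /* as in the question */
extern volatile int16_t adc_read[N_SAMPLES];

void copy_samples_ptr(void)
{
    volatile int16_t *src = adc_buffer;
    volatile int16_t *dst = adc_read;

    /* Counting down to zero lets the decrement set the flags for the
     * branch, avoiding a separate compare against an end value. */
    for (int i = N_SAMPLES; i != 0; i--) {
        *dst++ = *src++;
    }
}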