I am using an STM32F103C8T6 running at 48 MHz with ADC DMA, and I find that using a for loop to copy arrays is much slower than writing the assignments out as separate lines of code.
For example:
void HAL_ADC_ConvCpltCallback(ADC_HandleTypeDef* hadc) {
    int i = 0;
    HAL_GPIO_WritePin(GPIOC, GPIO_PIN_14, 1u);
    for (i = 0; i < 11; i++) {
        adc_read[i] = adc_buffer[i];
    }
    HAL_GPIO_WritePin(GPIOC, GPIO_PIN_14, 0u);
}
vs:
void HAL_ADC_ConvCpltCallback(ADC_HandleTypeDef* hadc) {
    HAL_GPIO_WritePin(GPIOC, GPIO_PIN_14, 1u);
    adc_read[0] = adc_buffer[0];
    adc_read[1] = adc_buffer[1];
    adc_read[2] = adc_buffer[2];
    adc_read[3] = adc_buffer[3];
    adc_read[4] = adc_buffer[4];
    adc_read[5] = adc_buffer[5];
    adc_read[6] = adc_buffer[6];
    adc_read[7] = adc_buffer[7];
    adc_read[8] = adc_buffer[8];
    adc_read[9] = adc_buffer[9];
    adc_read[10] = adc_buffer[10];
    HAL_GPIO_WritePin(GPIOC, GPIO_PIN_14, 0u);
}
The for loop takes 7.9 μs while the direct assignments take 3.58 μs, as seen on an oscilloscope. The GPIO state change takes 1.26 μs, so effectively the for loop takes 6.64 μs while the direct assignments take 2.32 μs to execute (roughly a third of the time).
Is there any way to speed up the for loop? For my final program I'd like to add the values to a variable which is then averaged to give me a filtered output.
-----------------EDIT-1-------------------------
Personal note - Also if the question doesn't make it clear, I am more of a hardware guy than a software guy. I expected the for loop to be about 10, 20 or 30% slower, never thought it could be 3x slower. I may never use for again :D.
All data is 16-bit (int16_t).
The assembly code is below.
08000246: bl 0x8002c96 <HAL_GPIO_TogglePin>
0800024a: movs r3, #0
0800024c: str r3, [r7, #12]
61 for (i = 0; i < 11; i++) // starts here
0800024e: movs r3, #0
08000250: str r3, [r7, #12]
08000252: b.n 0x800026a <HAL_ADC_ConvCpltCallback+54>
64 adc_read[i]=adc_buffer[i];
08000254: ldr r3, [pc, #52] ; (0x800028c <HAL_ADC_ConvCpltCallback+88>)
08000256: ldr r2, [r7, #12]
08000258: lsls r2, r2, #1
0800025a: ldrh r1, [r2, r3]
0800025c: ldr r3, [pc, #48] ; (0x8000290 <HAL_ADC_ConvCpltCallback+92>)
0800025e: ldr r2, [r7, #12]
08000260: lsls r2, r2, #1
08000262: strh r1, [r2, r3]
61 for (i = 0; i < 11; i++)
08000264: ldr r3, [r7, #12]
08000266: adds r3, #1
08000268: str r3, [r7, #12]
0800026a: ldr r3, [r7, #12]
0800026c: cmp r3, #10
0800026e: ble.n 0x8000254 <HAL_ADC_ConvCpltCallback+32> // ends here
97 HAL_GPIO_WritePin(GPIOC, GPIO_PIN_14,0u);
And the disassembly of the unrolled version:
54 HAL_GPIO_TogglePin(GPIOC, GPIO_PIN_14);
0800023c: movs r3, #128 ; 0x80
0800023e: lsls r3, r3, #7
08000240: ldr r2, [pc, #116] ; (0x80002b8 <HAL_ADC_ConvCpltCallback+132>)
08000242: movs r1, r3
08000244: movs r0, r2
08000246: bl 0x8002cc6 <HAL_GPIO_TogglePin>
70 adc_read[0]=adc_buffer[0];
0800024a: ldr r3, [pc, #112] ; (0x80002bc <HAL_ADC_ConvCpltCallback+136>)
0800024c: ldrh r2, [r3, #0]
0800024e: ldr r3, [pc, #112] ; (0x80002c0 <HAL_ADC_ConvCpltCallback+140>)
08000250: strh r2, [r3, #0]
71 adc_read[1]=adc_buffer[1];
08000252: ldr r3, [pc, #104] ; (0x80002bc <HAL_ADC_ConvCpltCallback+136>)
08000254: ldrh r2, [r3, #2]
08000256: ldr r3, [pc, #104] ; (0x80002c0 <HAL_ADC_ConvCpltCallback+140>)
08000258: strh r2, [r3, #2]
72 adc_read[2]=adc_buffer[2];
0800025a: ldr r3, [pc, #96] ; (0x80002bc <HAL_ADC_ConvCpltCallback+136>)
0800025c: ldrh r2, [r3, #4]
0800025e: ldr r3, [pc, #96] ; (0x80002c0 <HAL_ADC_ConvCpltCallback+140>)
08000260: strh r2, [r3, #4]
73 adc_read[3]=adc_buffer[3];
08000262: ldr r3, [pc, #88] ; (0x80002bc <HAL_ADC_ConvCpltCallback+136>)
08000264: ldrh r2, [r3, #6]
08000266: ldr r3, [pc, #88] ; (0x80002c0 <HAL_ADC_ConvCpltCallback+140>)
08000268: strh r2, [r3, #6]
74 adc_read[4]=adc_buffer[4];
0800026a: ldr r3, [pc, #80] ; (0x80002bc <HAL_ADC_ConvCpltCallback+136>)
0800026c: ldrh r2, [r3, #8]
0800026e: ldr r3, [pc, #80] ; (0x80002c0 <HAL_ADC_ConvCpltCallback+140>)
08000270: strh r2, [r3, #8]
75 adc_read[5]=adc_buffer[5];
08000272: ldr r3, [pc, #72] ; (0x80002bc <HAL_ADC_ConvCpltCallback+136>)
08000274: ldrh r2, [r3, #10]
08000276: ldr r3, [pc, #72] ; (0x80002c0 <HAL_ADC_ConvCpltCallback+140>)
08000278: strh r2, [r3, #10]
76 adc_read[6]=adc_buffer[6];
0800027a: ldr r3, [pc, #64] ; (0x80002bc <HAL_ADC_ConvCpltCallback+136>)
0800027c: ldrh r2, [r3, #12]
0800027e: ldr r3, [pc, #64] ; (0x80002c0 <HAL_ADC_ConvCpltCallback+140>)
08000280: strh r2, [r3, #12]
77 adc_read[7]=adc_buffer[7];
08000282: ldr r3, [pc, #56] ; (0x80002bc <HAL_ADC_ConvCpltCallback+136>)
08000284: ldrh r2, [r3, #14]
08000286: ldr r3, [pc, #56] ; (0x80002c0 <HAL_ADC_ConvCpltCallback+140>)
08000288: strh r2, [r3, #14]
78 adc_read[8]=adc_buffer[8];
0800028a: ldr r3, [pc, #48] ; (0x80002bc <HAL_ADC_ConvCpltCallback+136>)
0800028c: ldrh r2, [r3, #16]
0800028e: ldr r3, [pc, #48] ; (0x80002c0 <HAL_ADC_ConvCpltCallback+140>)
08000290: strh r2, [r3, #16]
79 adc_read[9]=adc_buffer[9];
08000292: ldr r3, [pc, #40] ; (0x80002bc <HAL_ADC_ConvCpltCallback+136>)
08000294: ldrh r2, [r3, #18]
08000296: ldr r3, [pc, #40] ; (0x80002c0 <HAL_ADC_ConvCpltCallback+140>)
08000298: strh r2, [r3, #18]
80 adc_read[10]=adc_buffer[10];
0800029a: ldr r3, [pc, #32] ; (0x80002bc <HAL_ADC_ConvCpltCallback+136>)
0800029c: ldrh r2, [r3, #20]
0800029e: ldr r3, [pc, #32] ; (0x80002c0 <HAL_ADC_ConvCpltCallback+140>)
080002a0: strh r2, [r3, #20]
97 HAL_GPIO_WritePin(GPIOC, GPIO_PIN_14,0u);
6 Answers
Speaking about performance without mentioning how you compile the code is kind of pointless. It shouldn't be a mind-blowing realization that optimized code runs faster than code with optimization disabled...
That being said, there is only so much the optimizer can do when hardware registers or DMA buffers are involved. All of these are (or ought to be) volatile, and that effectively blocks many optimizations. It can, for example, block function inlining, which is a common one.
In your specific example, loop unrolling would be one relevant optimization. But compilers usually follow some scheme for when to do this, like up to 8 objects or similar (likely configurable through some option), because loop unrolling is a speed-over-program-size optimization.
We may note that your code doesn't use memcpy, which is often recommended to get the fastest possible code. Modern compilers replace memcpy with whatever is fastest for the given data and target. However, using memcpy on volatile-qualified data isn't well-defined, since that would involve poorly-defined behavior when "casting away" volatile. In theory, that could lead to all manner of unexpected bugs. So based on that, a manual loop might not be as naive as it seems.
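For illustration only, this is the kind of memcpy call being discussed, using the buffer names from the question; the casts are what discards the volatile qualifier, which is exactly the poorly-defined part:

#include <string.h>
#include <stdint.h>

volatile int16_t adc_buffer[11];   /* DMA target */
volatile int16_t adc_read[11];

void copy_with_memcpy(void)
{
    /* The casts discard volatile, so the compiler is then free to
     * reorder or merge the individual accesses inside memcpy. */
    memcpy((void *)adc_read, (const void *)adc_buffer, sizeof adc_read);
}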
However, hard-copying data in bare-metal embedded applications is often best avoided and replaced with pointer swaps. In a situation like a buffer getting filled from an ISR, a hard copy is almost certainly the wrong thing to do; you should just swap pointers to buffers ("double buffering"), so that the ISR and the background program never work on the same one.
With DMA, you could re-route the DMA to use another buffer while you work on the present one. Getting DMA working is often a headache though, so most would just take the slight performance hit and hard-copy the data.
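A minimal sketch of the pointer-swap idea; the names, buffer size and flag handling here are made up for illustration, not taken from the question:

#include <stdint.h>
#include <stdbool.h>

#define N_SAMPLES 11

static int16_t buf_a[N_SAMPLES], buf_b[N_SAMPLES];

/* The ISR fills fill_buf; the main loop reads ready_buf.
 * The pointers themselves are volatile because both contexts use them. */
static int16_t * volatile fill_buf  = buf_a;
static int16_t * volatile ready_buf = buf_b;
static volatile bool data_ready = false;

void conversion_complete_isr(void)      /* called when fill_buf is full */
{
    /* Swap the two pointers instead of copying N_SAMPLES values. */
    int16_t *tmp = ready_buf;
    ready_buf = fill_buf;
    fill_buf  = tmp;
    data_ready = true;
}

void process_samples(void)              /* called from the main loop */
{
    if (data_ready) {
        data_ready = false;
        /* read ready_buf[0..N_SAMPLES-1] here; the ISR keeps filling
         * the other buffer in the meantime */
    }
}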
- Might be worth mentioning half-complete interrupts as a way to work on half the buffer while the other is updating. This should spread out the load a bit with little complexity added. (William, Mar 6, 2024)
- @William Yeah, can be done, and that's usually how DMA works too. But swapping pointers to buffers is super easy and with no added complexity, so if one can afford some 20-100 bytes or whatever for an additional buffer, that's what I would strongly recommend. (Lundin, Mar 7, 2024)
Is there any way to speed up the for loop? For my final program I'd like to add the values to a variable which is then averaged to give me a filtered output.
When in doubt, always check the compiler optimization settings and the generated assembly code.
I think the default is to optimize for size, to generate the smallest code. On simple microcontrollers this is often also faster, because smaller code means fewer instructions, which execute faster than more instructions. An exception is loops.
In this case the for() loop will be compiled into the usual: do the useful code inside, increment the counter, test for end of loop, jump back to the beginning.
When you manually unroll the loop, you keep only the useful stuff and get rid of the for() loop management, so it's faster.
If you use -O2 it may decide to unroll the loop itself.
That said, you do not need to copy your ADC data at all. I think the ADC DMA supports double buffering on this micro, so you can use that instead of copying. If you want to keep the old samples and avoid overwriting them with new ADC reads, then instead of copying them out of the DMA buffer, simply change the address of the DMA buffer so the new samples are written somewhere else.
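A rough, untested sketch of that idea with the STM32 HAL, assuming normal (not circular) DMA mode and two alternating buffers; whether the DMA can be restarted directly from this callback depends on your HAL version and setup:

#include <stdint.h>
#include "stm32f1xx_hal.h"               /* assumed HAL header */

#define N_SAMPLES 11

static uint16_t buf_a[N_SAMPLES], buf_b[N_SAMPLES];
static uint16_t *active = buf_a;         /* buffer the DMA writes into */

void HAL_ADC_ConvCpltCallback(ADC_HandleTypeDef *hadc)
{
    uint16_t *full = active;             /* just-filled buffer */
    active = (active == buf_a) ? buf_b : buf_a;

    /* Point the next round of conversions at the other buffer. */
    HAL_ADC_Stop_DMA(hadc);
    HAL_ADC_Start_DMA(hadc, (uint32_t *)active, N_SAMPLES);

    /* full[0..N_SAMPLES-1] now holds stable samples to process. */
    (void)full;
}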
If the goal is to average samples for more accuracy... STM32G0 supports that in hardware, but STM32F1 does not. So if you want maximum speed you'll just have to write
sum = adc_buffer[0] + adc_buffer[1] + ...
Note that if you have many samples you don't need to unroll the whole loop. You can sum, say, 8 values and wrap that in a for() loop; that way the loop overhead is paid once per 8 values instead of once per value.
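A sketch of that partial unrolling, assuming a hypothetical sample count that is a multiple of 8:

#include <stdint.h>

#define N_SAMPLES 64                     /* assumed: multiple of 8 */

volatile uint16_t adc_buffer[N_SAMPLES];

uint32_t sum_samples(void)
{
    uint32_t sum = 0;

    /* Loop overhead (increment, compare, branch) is paid once per
     * 8 samples instead of once per sample. */
    for (int i = 0; i < N_SAMPLES; i += 8) {
        sum += adc_buffer[i]     + adc_buffer[i + 1]
             + adc_buffer[i + 2] + adc_buffer[i + 3]
             + adc_buffer[i + 4] + adc_buffer[i + 5]
             + adc_buffer[i + 6] + adc_buffer[i + 7];
    }
    return sum;                          /* divide by N_SAMPLES to average */
}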
The question, like many other such questions, lacks detail. To talk about short code benchmarks in a way that won't waste your own time first, and our time second, you should make a self-contained example on Godbolt.
Your code looks like every other non-optimized output would look. The code is meant to look very similar to the C source so that it'll be easy to debug. Optimization often makes debugging harder - but that's mostly because debuggers are somewhat braindead in their understanding of the code relative to the compiler proper, not because there's anything wrong with optimization.
Here is the code I've tried:
#include <stdint.h>
typedef struct {} ADC_HandleTypeDef;
void HAL_GPIO_WritePin(int, ...);
enum {GPIOC, GPIO_PIN_14};
volatile int16_t adc_read[12];
volatile int16_t adc_buffer[12];
void HAL_ADC_ConvCpltCallback(ADC_HandleTypeDef* hadc) {
    HAL_GPIO_WritePin(GPIOC, GPIO_PIN_14, 1u);
    for (int i = 0; i < 11; i++) {
        adc_read[i] = adc_buffer[i];
    }
    HAL_GPIO_WritePin(GPIOC, GPIO_PIN_14, 0u);
}
Below is what ARM gcc 11.2.1 (none) produced with -O3 optimization level. As you can see, the loop is already unrolled, and each 16-bit value is transferred using an individual 16-bit read and write:
HAL_ADC_ConvCpltCallback:
mov r2, #1
push {r4, lr}
mov r1, r2
mov r0, #0
bl HAL_GPIO_WritePin
ldr r3, .L4
ldrh r0, [r3]
lsl r0, r0, #16
asr r0, r0, #16
strh r0, [r3, #24] @ movhi
ldrh r0, [r3, #2]
lsl r0, r0, #16
asr r0, r0, #16
strh r0, [r3, #26] @ movhi
ldrh r0, [r3, #4]
lsl r0, r0, #16
asr r0, r0, #16
strh r0, [r3, #28] @ movhi
ldrh r0, [r3, #6]
lsl r0, r0, #16
asr r0, r0, #16
strh r0, [r3, #30] @ movhi
ldrh r0, [r3, #8]
lsl r0, r0, #16
asr r0, r0, #16
strh r0, [r3, #32] @ movhi
ldrh ip, [r3, #10]
mov r2, #0
lsl ip, ip, #16
asr ip, ip, #16
strh ip, [r3, #34] @ movhi
ldrh ip, [r3, #12]
lsl ip, ip, #16
asr ip, ip, #16
strh ip, [r3, #36] @ movhi
ldrh ip, [r3, #14]
lsl ip, ip, #16
asr ip, ip, #16
strh ip, [r3, #38] @ movhi
ldrh ip, [r3, #16]
lsl ip, ip, #16
asr ip, ip, #16
strh ip, [r3, #40] @ movhi
ldrh ip, [r3, #18]
lsl ip, ip, #16
asr ip, ip, #16
strh ip, [r3, #42] @ movhi
ldrh ip, [r3, #20]
lsl ip, ip, #16
asr ip, ip, #16
mov r1, #1
mov r0, r2
strh ip, [r3, #44] @ movhi
bl HAL_GPIO_WritePin
pop {r4, lr}
bx lr
To copy the data more efficiently, the buffers can be unions, so that copying happens 32 bits at a time, and the data is accessed 16 bits at a time:
#include <stdint.h>
typedef struct {} ADC_HandleTypeDef;
void HAL_GPIO_WritePin(int, ...);
enum {GPIOC, GPIO_PIN_14};
volatile union {
    int32_t raw[6];
    int16_t values[12];
} adc_read, adc_buffer;
void HAL_ADC_ConvCpltCallback(ADC_HandleTypeDef* hadc) {
    HAL_GPIO_WritePin(GPIOC, GPIO_PIN_14, 1u);
    for (int i = 0; i < 6; i++) {
        adc_read.raw[i] = adc_buffer.raw[i];
    }
    HAL_GPIO_WritePin(GPIOC, GPIO_PIN_14, 0u);
}
Now the output is even faster:
HAL_ADC_ConvCpltCallback:
mov r2, #1
push {r4, lr}
mov r1, r2
mov r0, #0
bl HAL_GPIO_WritePin
mov r2, #0
ldr r3, .L4
ldr r0, [r3]
str r0, [r3, #24]
ldr r0, [r3, #4]
str r0, [r3, #28]
ldr r0, [r3, #8]
str r0, [r3, #32]
ldr r0, [r3, #12]
str r0, [r3, #36]
ldr r0, [r3, #16]
str r0, [r3, #40]
ldr r0, [r3, #20]
mov r1, #1
str r0, [r3, #44]
mov r0, r2
bl HAL_GPIO_WritePin
pop {r4, lr}
bx lr
The manually unrolled loop produces identical assembly output, as it should on any decent modern compiler:
void HAL_ADC_ConvCpltCallback(ADC_HandleTypeDef* hadc) {
    HAL_GPIO_WritePin(GPIOC, GPIO_PIN_14, 1u);
    adc_read.raw[0] = adc_buffer.raw[0];
    adc_read.raw[1] = adc_buffer.raw[1];
    adc_read.raw[2] = adc_buffer.raw[2];
    adc_read.raw[3] = adc_buffer.raw[3];
    adc_read.raw[4] = adc_buffer.raw[4];
    adc_read.raw[5] = adc_buffer.raw[5];
    HAL_GPIO_WritePin(GPIOC, GPIO_PIN_14, 0u);
}
But the whole thing is rather moot since you can use DMA for the transfer, and that will be the fastest in all cases.
There are two ways to do it with DMA:
1. Keep the interrupt handler, and just transfer the ADC data using DMA:

       void callback() {
           set_gpio_pin(1u);
           start_dma();
           while (!dma_complete());
           set_gpio_pin(0u);
       }

   The DMA transfer should be set up in the main thread, before any callbacks. start_dma() and dma_complete() should be very small inline functions that just manipulate the DMA control/status bits.
2. Let DMA do the whole job. You'd set up three chained transfers:
   - The first transfer sets the GPIO to 1.
   - The second transfer transfers the ADC data.
   - The third transfer sets the GPIO to 0.
As you have discovered, test-and-branch is slow. You do not say which compiler you are using. If it is ARM GCC, then optimization level 3 (-O3) should automatically unroll loops smaller than 16 cycles. If it is not ARM GCC (e.g. Keil, MSVC), then there will be a directive of the form "#pragma unroll" or "__attribute__((...))" that can be placed just above the loop to instruct the compiler what you would like it to do.
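For example, GCC 8 and later accept an unroll pragma directly above the loop (Keil / Arm Compiler spells it "#pragma unroll"; check your toolchain's manual). A sketch using the buffer names from the question:

#include <stdint.h>

#define N_SAMPLES 11

extern volatile int16_t adc_buffer[N_SAMPLES];   /* as in the question */
extern volatile int16_t adc_read[N_SAMPLES];

void copy_samples_unrolled(void)
{
    /* Hint to GCC that this loop should be fully unrolled. */
    #pragma GCC unroll 11
    for (int i = 0; i < N_SAMPLES; i++) {
        adc_read[i] = adc_buffer[i];
    }
}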
When using gcc, even if one doesn't trust the optimizer, using the register storage class can improve performance enormously.
The following code, for example:
unsigned arr1[8], arr2[8];
void test(void)
{
    register unsigned *p1 = arr1;
    register unsigned *p2 = arr2;
    register unsigned *pe = p1 + 8;
    do
    {
        *p1 += *p2;
        p1++;
        p2++;
    } while (p1 < pe);
}
generates the following loop at optimization level 0:
.L2:
ldr r2, [r4]
ldr r3, [r5]
add r3, r2, r3
str r3, [r4]
add r4, r4, #4
add r5, r5, #4
cmp r4, r6
bcc .L2
Performance could be improved if the compiler could use ldmia and stmia to eliminate the need for the "add x, x, #4" instructions, but it's still massively better than without the register qualifier.
- That's the thing though: register doesn't actually do anything at all at -O2 and -O3, IIRC. So it's nice for -O0, but otherwise useless. And register is effectively not a thing in C++ anyway, so you'd have to resort to implementation-specific keywords. At that point it's IMHO simplest just not to mess with -O0; there's no reason to use it, especially not with C code that compiles super fast anyway. (Kuba hasn't forgotten Monica, Mar 8, 2024)
- @Kubahasn'tforgottenMonica: At higher optimization settings, gcc requires memory barriers to ensure reliably correct semantics for many constructs. Commercial compiler developers are wise enough to recognize that a volatile-qualified write might disturb the value of "ordinary" objects, but the authors of gcc (and clang) would view code that usefully refrains from register-caching objects across volatile writes as "less efficient" than useless code that would use stale values in the absence of a compiler-specific memory barrier. (supercat, Mar 8, 2024)
- Set up pointers to the start of each array.
- Count the for loop index variable down to zero.
- In the loop, copy the values through the pointers and post-increment them both.

This reduces the per-loop operations. But a sensible compiler optimisation setting would likely produce similar code from your original indexed ([i]) version!
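A sketch of that approach, using the buffer names from the question; whether it actually beats the indexed version depends on the compiler and optimization level:

#include <stdint.h>

#define N_SAMPLES 11

extern volatile int16_t adc_buffer[N_SAMPLES];   /* as in the question */
extern volatile int16_t adc_read[N_SAMPLES];

void copy_samples_ptr(void)
{
    volatile int16_t *src = adc_buffer;
    volatile int16_t *dst = adc_read;

    /* Counting down to zero lets the decrement set the flags for the
     * branch, avoiding a separate compare against an end value. */
    for (int i = N_SAMPLES; i != 0; i--) {
        *dst++ = *src++;
    }
}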