I have tried implementing a fast, lock-free single-producer single-consumer (SPSC) queue in C++ to practice concurrency and low-level design concepts. I also benchmarked the code on an x86_64 2.3GHz Intel 4-core CPU. Please review the benchmark code (inspired by moodycamel) and the queue implementation for correctness:
- lfqueue.h
#include <atomic>
#include <cstddef>
#include <unistd.h>
#include <cassert>
#include <limits>
#define CACHE_LINE_SIZE 64
template<typename T>
class LFQueue{
public:
LFQueue(std::size_t size) : write_id(0), read_id(0), capacity_(size) {
buf = new T[capacity_];
}
~LFQueue() { delete[] buf;}
template<typename... Args>
bool write(Args... args){
auto curr_read = read_id.load(std::memory_order_acquire);
auto curr_write = write_id.load(std::memory_order_relaxed);
if(__builtin_expect(curr_write + 1 == curr_read, 0)){
return false;
}
new (&buf[curr_write]) T(std::forward<Args>(args)...);
curr_write++;
if(__builtin_expect(curr_write == capacity_, 0)){
curr_write = 0;
}
write_id.store(curr_write, std::memory_order_release);
return true;
}
bool read(T &val){
auto curr_write = write_id.load(std::memory_order_acquire);
auto curr_read = read_id.load(std::memory_order_relaxed);
if(__builtin_expect(curr_read == curr_write, 0))
return false;
val = std::move(buf[curr_read]);
curr_read++;
if(__builtin_expect(curr_read == capacity_, 0)){
curr_read = 0;
}
read_id.store(curr_read, std::memory_order_release);
return true;
}
T* frontPtr() {
auto curr_read = read_id.load(std::memory_order_acquire);
auto curr_write = write_id.load(std::memory_order_acquire);
if(curr_read == curr_write)
return nullptr;
return buf + curr_read;
}
void popFront() {
auto curr_read = read_id.load(std::memory_order_acquire);
auto curr_write = write_id.load(std::memory_order_acquire);
if(curr_read != curr_write){
curr_read++;
if(curr_read == capacity_)
curr_read = 0;
read_id.store(curr_read, std::memory_order_release);
}
}
bool isEmpty() const {
return write_id.load(std::memory_order_acquire) == read_id.load(std::memory_order_acquire);
}
bool isFull() const {
return write_id.load(std::memory_order_acquire) + 1 == read_id.load(std::memory_order_acquire);
}
size_t size() const {
auto curr_read = read_id.load(std::memory_order_acquire);
auto curr_write = write_id.load(std::memory_order_acquire);
if(curr_write >= curr_read){
return curr_write - curr_read;
}
return curr_write + capacity_ - curr_read;
}
size_t capacity() const { return capacity_;}
private:
alignas(CACHE_LINE_SIZE) std::atomic_size_t write_id;
alignas(CACHE_LINE_SIZE) std::atomic_size_t read_id;
alignas(CACHE_LINE_SIZE) const std::size_t capacity_;
alignas(CACHE_LINE_SIZE) T* buf;
__always_inline bool is_capacity(size_t id){
return (id & capacity_) && !(id & (capacity_ - 1));
}
};
- benchmark_helpers.h
#include <array>
#include <cstddef>
#include <cstdint>
#include <pthread.h>
#include <random>
#include <stdexcept>
#include <x86intrin.h>
constexpr int QUEUE_SIZE = 1000000;
constexpr int LIMIT = QUEUE_SIZE;
constexpr size_t FREQ = 2300000000;
static std::array<bool, LIMIT> rnd_vals1{}, rnd_vals2{};
#define MAIN_THREAD_CPU 0
#define TEST_THREAD_CPU 1
#define PRODUCER_THREAD_CPU 2
#define CONSUMER_THREAD_CPU 3
namespace detail {
void pin_thread_to_cpu(int cpu_id) {
cpu_set_t cpuset;
CPU_ZERO(&cpuset);
CPU_SET(cpu_id, &cpuset);
int rc = pthread_setaffinity_np(pthread_self(), sizeof(cpu_set_t), &cpuset);
if (rc != 0) {
throw std::runtime_error("Error setting thread affinity");
}
}
inline uint64_t rdtsc(){
return __rdtsc();
}
void init_rand_array(std::array<bool, LIMIT> &rnd_vals){
std::random_device rd;
std::mt19937 gen(rd());
std::uniform_int_distribution<int> dist(0, 1);
for(size_t i = 0; i < LIMIT; i++){
rnd_vals[i] = dist(gen);
}
}
}
- benchmark_functions.h
#include "benchmark_helpers.h"
#include <iostream>
#include <thread>
template<typename QueueType, typename T>
void sanity_checks(){
std::cout << "Sanity checks:\n";
QueueType queue(LIMIT + 1);
std::thread producer([&]{
for(int i = 0; i < LIMIT; i++){
queue.write(i);
}
});
std::thread consumer([&]{
T val;
for(int i = 0; i < LIMIT; i++){
while(!queue.read(val));
assert(val == i && "Dequeued element doesn't match");
}
});
producer.join();
consumer.join();
std::cout << "OK\n";
}
template<typename QueueType, typename T>
static void raw_add() {
QueueType queue(QUEUE_SIZE + 1);
size_t total_ops = 0;
detail::pin_thread_to_cpu(TEST_THREAD_CPU); // OK for x86_64 where TSC are synchronized across all cores. Not portable.
auto start = detail::rdtsc();
for (int i = 0; i < LIMIT; i++) {
if(!queue.write(i)){
break;
}
total_ops++;
}
auto cycles = detail::rdtsc() - start;
double avg_time = (static_cast<double>(cycles)) * 1e9 / (FREQ * total_ops);
std::cout << "raw_add\t" << avg_time << "ns, total_ops: " << total_ops << "\n";
}
template<typename QueueType, typename T>
static void raw_remove() {
detail::pin_thread_to_cpu(TEST_THREAD_CPU);
QueueType queue(QUEUE_SIZE + 1);
size_t total_ops = 0;
for(int i = 0; i < QUEUE_SIZE; i++){
queue.write(i);
}
auto start = detail::rdtsc();
for (int i = 0; i < LIMIT; i++) {
T val;
if(!queue.read(val)){
break;
}
total_ops++;
}
auto cycles = detail::rdtsc() - start;
double avg_time = (static_cast<double>(cycles)) * 1e9 / (FREQ * total_ops);
std::cout << "raw_remove\t" << avg_time << "ns, total_ops: " << total_ops << "\n";
}
template<typename QueueType, typename T>
static void single_threaded() {
detail::pin_thread_to_cpu(TEST_THREAD_CPU);
QueueType queue(QUEUE_SIZE + 1);
size_t total_ops = 0;
for(int i = 0; i < QUEUE_SIZE; i++){
queue.write(i);
}
auto start = detail::rdtsc();
for (int i = 0; i < LIMIT; i++) {
if(rnd_vals1[i]){
queue.write(i);
}
else{
T val;
queue.read(val);
}
}
auto cycles = detail::rdtsc() - start;
total_ops = LIMIT;
double avg_time = (static_cast<double>(cycles)) * 1e9 / (FREQ * total_ops);
std::cout << "single_threaded\t" << avg_time << "ns, total_ops: " << total_ops << "\n";
}
template<typename QueueType, typename T>
static void mostly_add() {
detail::pin_thread_to_cpu(TEST_THREAD_CPU);
QueueType queue(QUEUE_SIZE + 1);
std::atomic_size_t total_reads = 0, total_writes = 0;
auto start = detail::rdtsc();
std::thread producer([&]{
detail::pin_thread_to_cpu(PRODUCER_THREAD_CPU);
for(int i = 0; i < LIMIT; i++){
queue.write(i);
total_writes++;
}
});
std::thread consumer([&]{
detail::pin_thread_to_cpu(CONSUMER_THREAD_CPU);
T val;
for(int i = 0; i < LIMIT/10; i++){
if(rnd_vals1[i]){
queue.read(val);
total_reads++;
}
}
});
producer.join();
consumer.join();
auto cycles = detail::rdtsc() - start;
auto total_ops = total_reads + total_writes;
double avg_time = (static_cast<double>(cycles)) * 1e9 / (FREQ * total_ops);
std::cout << "mostly_add\t" << avg_time << "ns, total_ops: " << total_ops << "\n";
}
template<typename QueueType, typename T>
static void mostly_remove() {
detail::pin_thread_to_cpu(TEST_THREAD_CPU);
std::random_device rd;
std::mt19937 gen(rd());
std::uniform_int_distribution<int> dist(0, 3);
QueueType queue(QUEUE_SIZE + 1);
std::atomic_size_t total_reads = 0, total_writes = 0;
auto start = detail::rdtsc();
std::thread producer([&]{
detail::pin_thread_to_cpu(PRODUCER_THREAD_CPU);
for(int i = 0; i < LIMIT/10; i++){
if(rnd_vals1[i]){
queue.write(i);
total_writes++;
}
}
});
std::thread consumer([&]{
detail::pin_thread_to_cpu(CONSUMER_THREAD_CPU);
T val;
for(int i = 0; i < LIMIT; i++){
queue.read(val);
}
});
total_reads = LIMIT;
producer.join();
consumer.join();
auto cycles = detail::rdtsc() - start;
auto total_ops = total_reads + total_writes;
double avg_time = (static_cast<double>(cycles)) * 1e9 / (FREQ * total_ops);
std::cout << "mostly_remove\t" << avg_time << "ns, total_ops: " << total_ops << "\n";
}
template<typename QueueType, typename T>
static void heavy_concurrent() {
detail::pin_thread_to_cpu(TEST_THREAD_CPU);
QueueType queue(QUEUE_SIZE + 1);
std::atomic_size_t total_reads = LIMIT, total_writes = LIMIT;
auto start = detail::rdtsc();
std::thread producer([&]{
detail::pin_thread_to_cpu(PRODUCER_THREAD_CPU);
for(int i = 0; i < LIMIT; i++){
queue.write(i);
}
});
std::thread consumer([&]{
detail::pin_thread_to_cpu(CONSUMER_THREAD_CPU);
T val;
for(int i = 0; i < LIMIT; i++){
queue.read(val);
}
});
producer.join();
consumer.join();
auto cycles = detail::rdtsc() - start;
auto total_ops = total_reads + total_writes;
double avg_time = (static_cast<double>(cycles)) * 1e9 / (FREQ * total_ops);
std::cout << "heavy_concurrent\t" << avg_time << "ns, total_ops: " << total_ops << "\n";
}
template<typename QueueType, typename T>
static void random_concurrent() {
detail::pin_thread_to_cpu(TEST_THREAD_CPU);
QueueType queue(QUEUE_SIZE + 1);
std::atomic_size_t total_reads = 0, total_writes = 0;
auto start = detail::rdtsc();
std::thread producer([&]{
detail::pin_thread_to_cpu(PRODUCER_THREAD_CPU);
for(int i = 0; i < LIMIT; i++){
if(rnd_vals1[i]){
queue.write(i);
total_writes++;
}
}
});
std::thread consumer([&]{
detail::pin_thread_to_cpu(CONSUMER_THREAD_CPU);
T val;
for(int i = 0; i < LIMIT; i++){
if(rnd_vals2[i]){
queue.read(val);
total_reads++;
}
}
});
producer.join();
consumer.join();
auto cycles = detail::rdtsc() - start;
auto total_ops = total_reads + total_writes;
double avg_time = (static_cast<double>(cycles)) * 1e9 / (FREQ * total_ops);
std::cout << "random_concurrent\t" << avg_time << "ns, total_ops: " << total_ops << "\n";
}
template<typename QueueType, typename T = int>
void run_all_benchmarks(std::string queue_name = "Queue"){
detail::pin_thread_to_cpu(MAIN_THREAD_CPU);
std::cout << "\nBenchmarking " << queue_name << "\n";
sanity_checks<QueueType, T>();
std::thread th(raw_add<QueueType, T>);
th.join();
th = std::thread(raw_remove<QueueType, T>);
th.join();
th = std::thread(single_threaded<QueueType, T>);
th.join();
th = std::thread(mostly_add<QueueType, T>);
th.join();
th = std::thread(mostly_remove<QueueType, T>);
th.join();
th = std::thread(heavy_concurrent<QueueType, T>);
th.join();
th = std::thread(random_concurrent<QueueType, T>);
th.join();
}
And the driver code:
- spsc_raw_benchmark.cpp
#include <folly/ProducerConsumerQueue.h>
#include "../src/lfqueue.h"
#include "benchmark_functions.h"
struct ComplexStruct{
bool first;
int second;
double last;
ComplexStruct(int val) : second(val) {}
ComplexStruct() = default;
} __attribute__((packed));
int main(){
detail::init_rand_array(rnd_vals1);
detail::init_rand_array(rnd_vals2);
// using DATA_TYPE = int;
using DATA_TYPE = ComplexStruct;
run_all_benchmarks<folly::ProducerConsumerQueue<DATA_TYPE>, DATA_TYPE>("Folly");
run_all_benchmarks<LFQueue<DATA_TYPE>, DATA_TYPE>("LFQueue");
return 0;
}
This is a sample benchmark result for the non-cache-aligned ComplexStruct:
Benchmarking Folly
Sanity checks:
OK
raw_add 10.6289ns, total_ops: 1000000
raw_remove 2.46765ns, total_ops: 1000000
single_threaded 6.36481ns, total_ops: 1000000
mostly_add 13.5758ns, total_ops: 1050105
mostly_remove 3.49217ns, total_ops: 1050105
heavy_concurrent 3.57867ns, total_ops: 2000000
random_concurrent 28.8257ns, total_ops: 1000665
Benchmarking LFQueue
Sanity checks:
OK
raw_add 3.5452ns, total_ops: 1000000
raw_remove 2.74462ns, total_ops: 1000000
single_threaded 5.90889ns, total_ops: 1000000
mostly_add 14.2668ns, total_ops: 1050105
mostly_remove 2.31141ns, total_ops: 1050105
heavy_concurrent 3.46162ns, total_ops: 2000000
random_concurrent 38.9555ns, total_ops: 1000665
And this is one sample result for int:
Benchmarking Folly
Sanity checks:
OK
raw_add 4.81129ns, total_ops: 1000000
raw_remove 2.78987ns, total_ops: 1000000
single_threaded 6.13505ns, total_ops: 1000000
mostly_add 12.5706ns, total_ops: 1049680
mostly_remove 2.69507ns, total_ops: 1049680
heavy_concurrent 2.90395ns, total_ops: 2000000
random_concurrent 36.6664ns, total_ops: 1000798
Benchmarking LFQueue
Sanity checks:
OK
raw_add 2.55449ns, total_ops: 1000000
raw_remove 2.63848ns, total_ops: 1000000
single_threaded 5.68764ns, total_ops: 1000000
mostly_add 13.2418ns, total_ops: 1049680
mostly_remove 2.70407ns, total_ops: 1049680
heavy_concurrent 2.86895ns, total_ops: 2000000
random_concurrent 52.6381ns, total_ops: 1000798
If the implementations are correct, I have some questions about the benchmark results:
- I notice in multiple runs that the random_concurrent benchmark results vary from 35-50ns and are usually slower than the same run for folly. Why is that, even though the raw benchmarks are consistently faster?
- I had tried to improve the cache hit rate for writes by explicitly default-constructing the buf array in LFQueue using buf(new T[size]()). But that doesn't improve the write performance or reduce the cache misses (I analysed using cachegrind and it still shows 62500 hits on the new (&buf[curr_write]) line inside the write function). Why is that, even though the entire buf (4MB for the int case) can fit inside the cache (L3 = 48MB)?
2 Answers
Generic remarks
Use a namespace, rather than dumping all the code in the global namespace.
Use a constant, rather than a macro: this is not C.
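For example:
inline constexpr std::size_t cache_line_size = 64;   // instead of #define CACHE_LINE_SIZE 64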
You only posted benchmarks, no tests. For any "container" class, I recommend testing with heap allocations: they are not forgiving, and help figure out issues in resource safety... and memory leaks.
Memory Safety: queue
Use a std::unique_ptr<T[]>, rather than a raw pointer, and you'll get the destructor for free.
Delete the copy/move constructors/assignment operators; they cannot be written safely.
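For example (a minimal sketch, keeping the current class name):
LFQueue(LFQueue const&) = delete;
LFQueue& operator=(LFQueue const&) = delete;
LFQueue(LFQueue&&) = delete;
LFQueue& operator=(LFQueue&&) = delete;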
Memory Safety: elements
There is a discrepancy in your usage of buf:
- You default construct the elements.
- But write treats the memory as raw, never destroying the existing elements before overwriting them.
You need to choose between:
- Default construct, then move-assign to it.
- Raw memory, then construct atop it.
And you cannot mix the two.
I heartily recommend the latter; you can use the following helper class as the queue element:
template <typename T>
struct Raw {
alignas(T) std::uint8_t raw[sizeof(T)];
};
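In use, a slot's lifetime is then managed explicitly, roughly like this (a sketch, assuming buf becomes an array of Raw<T>):
// Producer: construct the element in place on the raw bytes.
new (buf[write_index].raw) T(std::forward<Args>(args)...);

// Consumer: move the element out, then destroy it explicitly.
T* element = reinterpret_cast<T*>(buf[read_index].raw);
val = std::move(*element);
element->~T();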
Simplicity: indexes
Wrapping the indexes manually is possible, but it's actually relatively complicated to do correctly -- at least if you want to use the full capacity; it looks like you only ever use capacity - 1 elements.
Instead, use std::uint64_t (and atomic) for the read & write indexes, and just always increment them; then use a power-of-2 capacity and mask them on use.
You can check whether the capacity is a power-of-2 with:
auto is_power_of_2(std::uint64_t i) -> bool {
return i > 0 && (i & (i - 1)) == 0;
}
And once you have a power-of-2 capacity, you can mask by & (capacity - 1) to achieve % capacity cheaply.
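With indexes that only ever increase, the empty and full checks also become straightforward, and all capacity slots are usable (a sketch with hypothetical local names):
bool is_empty = write_index == read_index;
bool is_full  = write_index - read_index == capacity;   // a full lap ahead
auto slot     = write_index & (capacity - 1);           // same as write_index % capacity for a power of 2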
Correctness: atomics
You use the correct memory ordering semantics in read & write.
On the other hand, you tend to use acquire too much:
- frontPtr is useless, so I'll skip it.
- popFront is consumer-side, it can read the read index in relaxed mode.
- isEmpty, isFull and size do not access the items, they can read the indexes in relaxed mode.
Using acquire instead of relaxed is not wrong, and won't cost any performance on x64... but may on other architectures.
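For instance, popFront could be written as (keeping everything else from the posted code, only the first load relaxed):
void popFront() {
    auto curr_read = read_id.load(std::memory_order_relaxed);   // consumer is the only writer of read_id
    auto curr_write = write_id.load(std::memory_order_acquire);
    if (curr_read != curr_write) {
        curr_read++;
        if (curr_read == capacity_)
            curr_read = 0;
        read_id.store(curr_read, std::memory_order_release);
    }
}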
Correctness: nodiscard
While [[nodiscard]] is C++17, your compiler of choice may already have a similar attribute in C++11. Consider using it on the read and write methods so the user is reminded that they may fail, if they forget.
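For example:
// C++17 spelling; GCC/Clang also accept __attribute__((warn_unused_result)) pre-C++17.
template <typename... Args>
[[nodiscard]] bool write(Args... args);
[[nodiscard]] bool read(T& val);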
Performance: perfect forwarding
The write method is not using perfect forwarding correctly; there's a missing && in the arguments to avoid a copy.
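That is, the parameter pack should be taken by forwarding reference (just the signature, sketched):
template <typename... Args>
bool write(Args&&... args);   // Args&&..., so lvalues bind by reference and rvalues can be moved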
Performance: cache line size
There's a difference between constructive & destructive hardware interference.
Specifically, modern Intel CPUs tend to fetch cache lines by pair, so that for your purpose (no false-sharing, ie no destructive interference) you'll want 128 bytes, not just 64 bytes.
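Concretely, that could look like this (a sketch; the 128 reflects the paired-line fetching mentioned above):
constexpr std::size_t false_sharing_range = 128;   // destructive interference: keep independently written data this far apart

alignas(false_sharing_range) std::atomic<std::size_t> write_id;
alignas(false_sharing_range) std::atomic<std::size_t> read_id;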
Performance: memoization
Re-reading the read index in the consumer, and re-reading the write index in the producer, leads to doubling the contention on your queue.
As general advice with consumer/producer queues, I recommend splitting the data into 4 parts:
- Common: the common part that never changes, capacity & buf here.
- Shared: the common part that changes, the poorly named write_id and read_id here.
- Consumer: a part only accessed by the consumer.
- Producer: a part only accessed by the producer.
In your case, this would mean:
namespace lfqueue::detail {
template <typename T>
struct alignas(CACHE_LINE_SIZE) Common {
std::uint64_t capacity;
std::unique_ptr<Raw<T>[]> buf;
};
struct alignas(CACHE_LINE_SIZE) Shared {
alignas(CACHE_LINE_SIZE) std::atomic_uint64_t read_index{0};
alignas(CACHE_LINE_SIZE) std::atomic_uint64_t write_index{0};
};
struct alignas(CACHE_LINE_SIZE) Consumer {
std::uint64_t cached_read_index = 0;
};
struct alignas(CACHE_LINE_SIZE) Producer {
std::uint64_t cached_write_index = 0;
};
}
namespace lfqueue {
template <typename T>
class LFQueue {
private:
detail::Common<T> const common;
detail::Shared shared;
detail::Consumer consumer;
detail::Producer producer;
};
}
And as an example:
template <typename... Args>
bool try_write(Args&&... args) {
    auto read = shared.read_index.load(std::memory_order_acquire);
    auto write = producer.cached_write_index;
    if (write - read == common.capacity) {
        // Full: the producer is a whole lap ahead of the consumer.
        return false;
    }
    auto index = write & (common.capacity - 1);
    auto slot = &common.buf[index].raw;
    new (slot) T(std::forward<Args>(args)...);
    producer.cached_write_index = write + 1;
    shared.write_index.store(write + 1, std::memory_order_release);
    return true;
}
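The read side follows the same pattern (roughly sketched, under the same member layout):
bool try_read(T& value) {
    auto write = shared.write_index.load(std::memory_order_acquire);
    auto read = consumer.cached_read_index;               // consumer-owned copy, no atomic load needed
    if (read == write) {
        return false;                                      // empty
    }
    auto index = read & (common.capacity - 1);
    auto* slot = reinterpret_cast<T*>(common.buf[index].raw);
    value = std::move(*slot);
    slot->~T();                                            // raw storage: destroy the consumed element
    consumer.cached_read_index = read + 1;
    shared.read_index.store(read + 1, std::memory_order_release);
    return true;
}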
Style: initialization
Prefer initializing data members at their declaration sites, when the initializer is constant, or when they are built-ins (and therefore uninitialized by default).
Style: naming
Do not overly abbreviate: read_id is meaningless -- it's not an idea -- prefer read_index instead.
Prefer meaningful names:
- buf is very generic, consider slots or elements instead.
- read and write would be better named try_read and try_write, to highlight their fallible nature.
- LFQueue says a lot about the implementation (LF) but nothing about how to use it. The fact that there can be a single consumer & a single producer is VERY important for correct usage, it should be reflected in the name.
And don't overdo it:
- frontPtr returns a pointer. Everybody can see that, no need for Ptr in the name.
Style: documentation
There's no documentation at all:
- No warning that this is a SPSC queue, and the implications of that.
- No explanation as to what that bool return type means for read and write, and in which condition it flips.
- No explanation of who can call frontPtr safely, when only the consumer can.
You don't need the full Doxygen crap, but a single sentence can do wonders:
/// Returns whether the write succeeded, or not. It fails on full queues.
template <typename... Args>
bool try_write(...);
And don't forget to document safety invariants:
/// Returns the pointer to the first readable element, if any.
///
/// # Safety:
///
/// - Invalidated by any subsequent call to `try_read` or `pop_front`.
/// - Invalidated by the destruction of `this`.
T* front();
- Toby Speight (Feb 10 at 9:47): Currently, the copy and move operations are deleted because we have a user-supplied destructor; once we have a unique pointer, then the copy operations will be deleted. That said, it's good to also explicitly delete them, to make it clearer to users that they are not part of the interface.
- Matthieu M. (Feb 10 at 10:11): @TobySpeight: Right, depending on implementation details some of the special members may not be implicitly defined, but I still prefer making it explicit, as any implicit definition would be wrong.
- wwite (Feb 10 at 17:46): 1. Why can I not move-construct an object (new (&buf[curr_write]) T(std::forward<Args>(args)...)) in the location of a default-constructed (or moved-from) object without explicitly calling the destructor? 2. Regarding the cache line size point, do you mean I should separate objects by 128 bytes on Intel CPUs to avoid false sharing? I see std::hardware_destructive_interference_size = 64 on the x86_64 gcc 13.3 compiler.
- wwite (Feb 10 at 17:46): 3. In the memoization try_write section, do we not want to CAS producer.cached_write_index with Shared.write_index in case write == read? I do get the point, though, of separating shared and common variables; is it related to this paper?
- Matthieu M. (Feb 10 at 18:21): @TobySpeight: You are correct, and so could deep copies really: the reader could make a deep copy themselves if, instead of front, they had a span method giving them all the elements guaranteed to be initialized (well, it'd take two spans). The problem is that it's hard to make safe, so conservatively it's easier to disable them to avoid the user shooting themselves in the foot. An alternative I've sometimes used is to split the "Consumer" & "Producer" parts out of the queue, and use a spin lock to create a Consumer or Producer. Enqueuing/dequeuing are still lockless, but it's overall safer.
This looks odd:
LFQueue(std::size_t size) : write_id(0), read_id(0), capacity_(size) { buf = new T[capacity_]; }
Why is buf not initialised in the initializer list, like the other members?
More importantly, this default-constructs capacity_ objects. That might just be inefficient, but when T is not default-constructible, it makes the class unusable.
We should really consider allocating uninitialised storage for the elements, and constructing them in place as required. That's what standard containers do.
Also, instead of using raw new, consider changing to std::unique_ptr<T[]> const buf to manage the storage. Then there would be no need for a user-defined destructor.
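Roughly (a sketch; note that std::make_unique<T[]> still value-initializes the elements, so it doesn't address the raw-storage point above):
std::unique_ptr<T[]> const buf;

LFQueue(std::size_t size)
    : write_id(0), read_id(0), capacity_(size), buf(std::make_unique<T[]>(size)) {}
// No user-defined destructor needed: unique_ptr releases the array.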
Or just delegate managing the storage to a std::deque and let this class have the Single Responsibility of managing the concurrent access. I'd be surprised if that even costs anything in performance when compiled with -Ofast.
The write() and read() functions are hard to use. Instead of blocking until the operation is possible, they just return false, meaning that the user has to manage retrying (likely leading to spinning at one end or the other instead of polite waiting).
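The posted sanity_checks already shows what that retrying looks like on the consumer side:
T val;
while (!queue.read(val))
    ;   // busy-wait until the producer catches up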
Prefer to use standard collection names for functions such as emplace_back() and empty(), and provide the normal member type aliases such as value_type. That makes it easier for readers to comprehend, and for users to switch implementations as necessary.
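For instance (declarations only; the names are just a sketch):
using value_type = T;
using size_type = std::size_t;

bool empty() const;                       // rather than isEmpty()
size_type size() const;
template <typename... Args>
bool emplace_back(Args&&... args);        // rather than write()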
I haven't looked at the benchmarking code, because there's no point measuring performance until the interface is finalised.
- Matthieu M. (Feb 10 at 8:32): I really recommend a std::unique_ptr<T[]> over a std::deque, there's no need to pay for the double indirection of a deque.
- Matthieu M. (Feb 10 at 8:36): Also, read and write have the expected interface for the domain... though I'd prefix them with try_ to represent their fallible nature.
- Toby Speight (Feb 10 at 8:37): Are you sure that a deque has more indirection, @Matthieu? The two approaches amount to roughly the same: one pointer to follow between our class and the underlying storage. I missed a trick, though, by not recommending std::queue as an intermediate wrapper for the deque.
- Matthieu M. (Feb 10 at 9:23): std::deque is required to guarantee memory stability of its elements; in general it's therefore implemented as a double-ended vector of pointers to fixed-size chunks, so that on resize the chunks do not move in memory. This leads to a double indirection.
- Toby Speight (Feb 10 at 18:09): @wwite, nearly right - we need to point to T[]. Not needing the destructor is just a side-effect, rather than the main point, which is that the smart pointer ensures the Right Thing happens during deletion and moving, and disables copying.