Optimizing even more a C++ matrix class with expression templates using variadic templates

Question

I looked at Bowie Owens' cppcon 2019 talk (the slides can be found here) where he shows how to implement a tridiagonal matrix classes using expression templates but without resorting to the CRTP. (It in fact really looks like a vector class but it is not the point.)

I came across this talk because I was trying to move from a simple container matrix class in the code I have to something better allowing for performant matrix/matrix or matrix/vector operations, and wanted to use expression templates that I already encountered in an automatic adjoint differentiation production code I am helping to maintain.

Thanks to questions

https://stackoverflow.com/questions/78278563/expression-templates-cppcon-2019

https://stackoverflow.com/questions/78289192/matrix-class-with-expression-templates-%c3%a0-la-cppcon-2019 (closed but still meaningful)

and https://stackoverflow.com/questions/78298890/naive-matrix-addition-vs-expression-template-matrix-addition-performs-better-al

I finally converged to the following implementation :

#ifndef VARIADIC_H
#define VARIADIC_H
#include <vector>
namespace Matrix
{
 namespace Variadic
 {
 //matrix forward declaration
 class base_matrix;
 //is_matrix_v
 template <class T>
 struct is_matrix
 {
 // matrix (forward declared will be our base matrix class, everything derived from it will be a matrix)
 static constexpr bool value = std::derived_from<T, base_matrix>;
 };
 template <class T>
 struct is_matrix<std::vector<T>>
 {
 // std::vector<T> is a column or row matrix according to its position in a multiplication
 static constexpr bool value = true;
 };
 template <class T>
 constexpr bool is_matrix_v = is_matrix<std::remove_cvref_t<T>>::value;
 struct expression {}; // you always need a base for traits right ?
 template <class T>
 concept is_matrix_or_expression =
 is_matrix_v<std::remove_cvref_t<T>>
 ||
 std::is_base_of_v<expression, std::remove_cvref_t<T>>;
 template <class A, class B>
 constexpr bool is_matrix_binary_op_ok =
 is_matrix_or_expression<A>
 ||
 is_matrix_or_expression<B>;
 //subscripts()
 template <class operand>
 auto subscripts(operand const& v, size_t const i, size_t const j) {
 if constexpr (is_matrix_or_expression<operand>) {
 return v(i, j);
 }
 else {
 return v;
 }
 }
 template <class operand>
 auto rows(operand const& m) {
 if constexpr (is_matrix_or_expression<operand>) {
 return m.rows();
 }
 else {
 return 1;
 }
 };
 template <class operand>
 auto cols(operand const& m) {
 if constexpr (is_matrix_or_expression<operand>) {
 return m.cols();
 }
 else {
 return 1;
 }
 };
 template <class callable, class... operands>
 class matrix_expression : public expression
 {
 std::tuple<operands const &...> args_;
 callable f_;
 public:
 matrix_expression(callable f, operands const&... args) :
 args_(args...),
 f_(f)
 {}
 auto& operator()(const size_t i, const size_t j)
 {
 auto const call_at_index_couple =
 [this, i, j](operands const&... a)
 {
 return f_(subscripts(a, i, j)...);
 };
 return std::apply(call_at_index_couple, args_);
 }
 auto operator()(const size_t i, const size_t j) const
 {
 auto const call_at_index_couple =
 [this, i, j](operands const&... a)
 {
 return f_(subscripts(a, i, j)...);
 };
 return std::apply(call_at_index_couple, args_);
 }
 size_t rows() const
 {
 auto const call_rows =
 [](operands const&... a)
 {
 return std::max(Matrix::Variadic::rows(a)...);
 };
 return std::apply(call_rows, args_);
 }
 size_t cols() const
 {
 auto const call_cols =
 [](operands const&... a)
 {
 return std::max(Matrix::Variadic::cols(a)...);
 };
 return std::apply(call_cols, args_);
 }
 void get_row(size_t row_num, double* zeroed_output) const
 {
 const auto columns = cols();
 auto incremental = [this, row_num, zeroed_output, columns](auto& rhs)
 {
 for (int col_num = 0; col_num != columns; ++col_num)
 {
 zeroed_output[col_num] = f_(zeroed_output[col_num], subscripts(rhs, row_num, col_num));
 }
 };
 std::apply(
 [&incremental](auto&& ...rhs)
 {
 (incremental(rhs), ...);
 },
 args_
 );
 }
 };
 template <class LHS, class RHS, class = std::enable_if_t<is_matrix_binary_op_ok<LHS, RHS>>>
 auto operator+(LHS const& lhs, RHS const& rhs)
 {
 return matrix_expression
 {
 //[](auto a, auto b)
 //{
 // return a + b;
 //}
 std::plus<>{},
 lhs,
 rhs
 };
 }
 template <class LHS, class RHS, class = std::enable_if_t<is_matrix_binary_op_ok<LHS, RHS>>>
 auto operator-(LHS const& lhs, RHS const& rhs)
 {
 return matrix_expression
 {
 //[](auto a, auto b)
 //{
 // return a - b;
 //}
 std::minus<>{},
 lhs,
 rhs
 };
 }
 class base_matrix : public expression
 {
 public:
 virtual ~base_matrix() = default;
 // So we can call matrix(i,j)
 virtual double& operator()(const size_t i, const size_t j) = 0;
 virtual double operator()(const size_t i, const size_t j) const = 0;
 virtual size_t rows() const = 0;
 virtual size_t cols() const = 0;
 };
 class dense_matrix : public base_matrix
 {
 size_t rows_;
 size_t columns_;
 std::vector<double> underlying_vector_;
 template <class src_type>
 void init(const src_type& src)
 {
 for (size_t row = 0; row != src.rows(); ++row)
 {
 src.get_row(row, &underlying_vector_[row * src.cols()]);
 }
 }
 public:
 dense_matrix() :
 rows_(0),
 columns_(0)
 {}
 dense_matrix(const size_t rows, const size_t cols) :
 rows_(rows),
 columns_(cols),
 underlying_vector_(rows* cols)
 {}
 dense_matrix(const size_t rows, const size_t cols, double* ptr) :
 rows_(rows),
 columns_(cols),
 underlying_vector_(ptr, ptr + (rows * cols))
 {}
 dense_matrix(std::vector<double>& vector, bool isColumn = true)
 {
 if (isColumn)
 {
 rows_ = vector.size();
 columns_ = 1;
 }
 else
 {
 rows_ = 1;
 columns_ = vector.size();
 }
 underlying_vector_ = std::move(vector);
 }
 template <class src_type>
 dense_matrix(const src_type& src) :
 rows_(src.rows()),
 columns_(src.cols()),
 underlying_vector_(src.rows()* src.cols())
 {
 init(src);
 }
 template <class src_type>
 dense_matrix& operator=(src_type const& src)
 {
 init(src);
 return *this; // this line was missing in the slides and in the talk
 }
 // Access
 size_t rows() const override { return rows_; }
 size_t cols() const override { return columns_; }
 double& operator()(const size_t i, const size_t j) override
 {
 return underlying_vector_[i * columns_ + j];
 }
 double operator()(const size_t i, const size_t j) const override
 {
 return underlying_vector_[i * columns_ + j];
 }
 };
 }
}
#endif//VARIADIC_H

thanks to SO and I would like to optimize it (performance wise) even more, as it is for now a bit slower than a naive implementation -- see my third SO question), while at the same time making it better from a pure design point of view.

I was advised without further details to "rely on repeated looped += instead of one + loop", which I don't understand, but I am open to advice. (One comment in my third SO question implied that it was possible to make the code as performant as the naive implementation.)

Here is some client code if needed:

#include <iostream>
#include <chrono>
#include <random>
#include "variadic.h"
#define SIMPLE_OPERATION(matrix1, matrix2, matrix3) matrix1 + matrix2 + matrix3
int main()
{
 constexpr size_t nb_ops = 250;
 constexpr size_t square_matrix_sizes[] = { 1000 };
 for (const auto square_matrix_size : square_matrix_sizes)
 {
 // Setting data
 //constexpr size_t square_matrix_size = 800;
 typedef std::mt19937 MyRNG;
 uint32_t seed_val = 1729;
 MyRNG rng;
 rng.seed(seed_val);
 std::normal_distribution<double> normal_dist(0.0, 1.0);
 Matrix::Variadic::dense_matrix m1(square_matrix_size, square_matrix_size);
 for (size_t i = 0; i != square_matrix_size; ++i)
 {
 for (size_t j = 0; j != square_matrix_size; ++j)
 {
 m1(i, j) = normal_dist(rng);
 }
 }
 Matrix::Variadic::dense_matrix m2(square_matrix_size, square_matrix_size);
 for (size_t i = 0; i != square_matrix_size; ++i)
 {
 for (size_t j = 0; j != square_matrix_size; ++j)
 {
 m2(i, j) = normal_dist(rng);
 }
 }
 Matrix::Variadic::dense_matrix m3(square_matrix_size, square_matrix_size);
 for (size_t i = 0; i != square_matrix_size; ++i)
 {
 for (size_t j = 0; j != square_matrix_size; ++j)
 {
 m3(i, j) = normal_dist(rng);
 }
 }
 auto start = std::chrono::steady_clock::now();
 for (size_t i = 0; i != nb_ops; ++i)
 {
 Matrix::Variadic::dense_matrix m4 = SIMPLE_OPERATION(m1, m2, m3);
 }
 auto end = std::chrono::steady_clock::now();
 auto elapsed_seconds = end - start;
 std::cout << std::fixed << elapsed_seconds / nb_ops << "\t";
 }
}

Stack Exchange Network

Optimizing even more a C++ matrix class with expression templates using variadic templates

0

Know someone who can answer? Share a link to this question via email, Twitter, or Facebook.

Your Answer

Sign up or log in

Post as a guest

Post as a guest

Hot Network Questions

Optimizing even more a C++ matrix class with expression templates using variadic templates

0

Know someone who can answer? Share a link to this question via email, Twitter, or Facebook.

Your Answer

Sign up or log in

Post as a guest

Post as a guest

Related

Hot Network Questions