Multithreaded Monte Carlo pi approximation with own pseudorandom number generator

Question 1

I made a Monte Carlo pi approximation program, that makes use of multithreading and a pseudorandom number generator I wrote (the one from big_wheel.hpp, which I tested using TestU01).

Code:

monte_carlo_pi.cpp:

#include <iostream>
#include <iomanip>
#include <cstdlib>
#include <cstdint>
#include <cmath>
#include <thread>
#include <mutex>
#include <random>
#include <vector>
#include <chrono>
#include "big_wheel.hpp"
typedef struct
{
 double x, y, z;
} point_t;
static const double PI = 3.1415926535897932384626433832795;
static const std::uint32_t prng_max_value = big_wheel_engine<>::max();
double approximate_pi(std::uint32_t pc,std::uint32_t tp)
{
 return 4.0*(double(pc)/double(tp));
}
double pi_error(double x)
{
 return 100.0*((x-PI)/PI);
}
std::mutex mtx;
// Points within circle
std::uint32_t pcnt = 0;
template <class RandEng>
void calculate_pi(std::uint32_t nppt,RandEng rgen)
{
 std::lock_guard<std::mutex> guard(mtx);
 point_t rpt;
 for (std::uint32_t i=0;i<nppt;i++)
 {
 rpt.x = double(rgen())/static_cast<double>(prng_max_value);
 rpt.y = double(rgen())/static_cast<double>(prng_max_value);
 rpt.z = (rpt.x*rpt.x)+(rpt.y*rpt.y);
 if (rpt.z <= 1.0) pcnt++;
 }
}
int main(int argc,char **argv)
{
 // Check number of cores
 std::size_t num_cores = std::thread::hardware_concurrency() ? std::thread::hardware_concurrency() : 1;
 // Thread set
 std::vector<std::thread> thread_set;
 // Generate seeds
 std::random_device rd;
 std::uint32_t seed[16] = {0x0};
 for (int i=0;i<16;i++)
 seed[i] = rd();
 // Initialize PRNG
 big_wheel_engine<> rnd(
 seed[ 0],seed[ 1],seed[ 2],seed[ 3],
 seed[ 4],seed[ 5],seed[ 6],seed[ 7],
 seed[ 8],seed[ 9],seed[10],seed[11],
 seed[12],seed[13],seed[14],seed[15]
 );
 std::uint32_t num_points = 3221225472;
 std::uint32_t num_points_per_thread = num_points/num_cores;
 for (std::size_t i=0;i<num_cores;i++)
 thread_set.push_back(
 std::thread(
 calculate_pi<big_wheel_engine<>>,
 num_points_per_thread,
 rnd
 )
 );
 // Calculated Pi value
 double gen_pi = 0;
 // Calculation error
 double pi_err = 0;
 // Duration
 double dur = 0;
 // Print info
 std::cout << "Seeds:\n";
 for (int iy=0;iy<4;iy++)
 {
 std::cout << " ";
 for (int ix=0;ix<4;ix++)
 std::cout << std::setw(11) << std::setfill(' ') << seed[4*iy+ix] << " ";
 std::cout << "\n";
 }
 std::cout << "Number of points: " << num_points << "\n";
 std::cout << "Number of cores: " << num_cores << "\n";
 std::cout << " ==> Number of points per thread: " << num_points_per_thread << "\n\n";
 std::cout << "Generating random points...\n";
 std::chrono::high_resolution_clock::time_point tpa = std::chrono::high_resolution_clock::now();
 for (std::size_t i=0;i<num_cores;i++)
 thread_set[i].join();
 std::chrono::high_resolution_clock::time_point tpb = std::chrono::high_resolution_clock::now();
 std::chrono::duration<double> tm = std::chrono::duration_cast<std::chrono::duration<double>>(tpb-tpa);
 dur = tm.count();
 // Approximate pi and calculate error
 gen_pi = approximate_pi(pcnt,num_points);
 pi_err = pi_error(gen_pi);
 std::cout << "\nResults:\n";
 std::cout << " Number of points within circle: " << pcnt << "\n";
 std::cout << " Pi approximation: " << std::setprecision(32) << gen_pi << "\n";
 std::cout << " Error: " << std::setprecision(10) << pi_err << "%\n";
 std::cout << "Time elapsed: " << std::setprecision(10) << dur << " seconds\n";
}

big_wheel.hpp:

#ifndef BIG_WHEEL_HPP
#define BIG_WHEEL_HPP
#include <iostream>
#include <cstdlib>
#include <cstdint>
#include <climits>
#include <limits>
template <
 std::uint32_t m = 0x3655accc,
 std::size_t rs1 = 11,std::size_t rs2 = 20,std::size_t rs3 = 14,std::size_t rs4 = 18,
 std::size_t rm1 = 16,std::size_t rm2 = 16,
 std::size_t rf = 7
>
class big_wheel_engine
{
 public:
 typedef std::uint32_t result_type;
 static constexpr std::size_t state_size = 4096;
 static constexpr std::uint32_t mix_mask = m;
 static constexpr std::size_t rot_sel1 = rs1;
 static constexpr std::size_t rot_sel2 = rs2;
 static constexpr std::size_t rot_sel3 = rs3;
 static constexpr std::size_t rot_sel4 = rs4;
 static constexpr std::size_t rot_mix1 = rm1;
 static constexpr std::size_t rot_mix2 = rm2;
 static constexpr std::size_t rot_final = rf;
 big_wheel_engine(
 const result_type& s01 = 0x0,const result_type& s02 = 0x0,const result_type& s03 = 0x0,const result_type& s04 = 0x0,
 const result_type& s05 = 0x0,const result_type& s06 = 0x0,const result_type& s07 = 0x0,const result_type& s08 = 0x0,
 const result_type& s09 = 0x0,const result_type& s10 = 0x0,const result_type& s11 = 0x0,const result_type& s12 = 0x0,
 const result_type& s13 = 0x0,const result_type& s14 = 0x0,const result_type& s15 = 0x0,const result_type& s16 = 0x0)
 {
 // Inverse mask
 m_i = ~m;
 seed(s01,s02,s03,s04,
 s05,s06,s07,s08,
 s09,s10,s11,s12,
 s13,s14,s15,s16);
 }
 big_wheel_engine(const big_wheel_engine<m,rs1,rs2,rs3,rs4,rm1,rm2,rf>& x)
 {
 for (std::size_t i=0;i<state_size;i++)
 state[i] = x.state[i];
 counter = x.counter;
 }
 big_wheel_engine<m,rs1,rs2,rs3,rs4,rm1,rm2,rf>& operator=(const big_wheel_engine<m,rs1,rs2,rs3,rs4,rm1,rm2,rf>& x)
 {
 for (std::size_t i=0;i<state_size;i++)
 state[i] = x.state[i];
 counter = x.counter;
 return *this;
 }
 void seed(
 const result_type& s01 = 0x0,const result_type& s02 = 0x0,const result_type& s03 = 0x0,const result_type& s04 = 0x0,
 const result_type& s05 = 0x0,const result_type& s06 = 0x0,const result_type& s07 = 0x0,const result_type& s08 = 0x0,
 const result_type& s09 = 0x0,const result_type& s10 = 0x0,const result_type& s11 = 0x0,const result_type& s12 = 0x0,
 const result_type& s13 = 0x0,const result_type& s14 = 0x0,const result_type& s15 = 0x0,const result_type& s16 = 0x0)
 {
 counter = 0;
 p1 = (counter+ 0)%4096; p2 = (counter+ 8)%4096;
 p3 = (counter+ 18)%4096; p4 = (counter+ 30)%4096;
 p5 = (counter+ 44)%4096; p6 = (counter+ 60)%4096;
 p7 = (counter+ 78)%4096; p8 = (counter+ 98)%4096;
 p9 = (counter+120)%4096;
 // Based on decimal part of sqrt(61/19)
 result_type off1[16] = {
 0xcab305ab,0x22132737,0xbda768dd,0x2438d3ab,
 0x00f4ca5a,0xd4681d61,0xc6420bc7,0x2828bdba,
 0x64806a32,0x2fa2391f,0xbc61cbca,0xcc6eb7a0,
 0xfa01a2cd,0xb766ce25,0x5499507a,0x19c839a0
 };
 // Based on decimal part of sqrt(73/29)
 result_type off2[16] = {
 0x962a374b,0x0fc75efe,0x86dfdeff,0xddb14a03,
 0xf61361c6,0xc908fde9,0x27df3b7d,0x04ae7492,
 0x5ee38142,0xfb21ee51,0x56fae74d,0x4f688f90,
 0x7889f34c,0x44e4aa33,0x547e3778,0xb6201d7b
 };
 // Based on decimal part of sqrt(79/31)
 result_type off3[16] = {
 0x98ab7f56,0x38ca9f10,0xa2f3d406,0x322c4113,
 0xb8d42dd6,0xe6490633,0x4c5bdcf7,0x454c3122,
 0x7b22e315,0x1a205e44,0xc6d1b31b,0x06eb7dea,
 0x875f1cf1,0xbe9bec2a,0x0350aa26,0x8b3d5aed
 };
 result_type sset[16] = {
 s01,s02,s03,s04,
 s05,s06,s07,s08,
 s09,s10,s11,s12,
 s13,s14,s15,s16
 };
 result_type st_a[16] = {0x0};
 result_type st_b[256] = {0x0};
 result_type st_c[4096] = {0x0};
 result_type rn_1[4096] = {0x0};
 result_type rn_2[4096] = {0x0};
 result_type rn_3[4096] = {0x0};
 result_type rn_4[4096] = {0x0};
 // Expansion
 for (std::size_t i=0;i<16;i++)
 {
 st_a[i] = sset[i]+off1[i];
 st_a[i] = smix(st_a[i],sbox_i1);
 }
 for (std::size_t i=0;i<16;i++)
 {
 for (std::size_t j=0;j<16;j++)
 {
 st_b[16*i+j] = st_a[i]+off2[j];
 st_b[16*i+j] = smix(st_b[16*i+j],sbox_i2);
 }
 }
 for (std::size_t i=0;i<256;i++)
 {
 for (std::size_t j=0;j<16;j++)
 {
 st_c[16*i+j] = st_b[i]+off3[j];
 st_c[16*i+j] = smix(st_c[16*i+j],sbox_i3);
 }
 }
 // Mixing
 for (std::size_t i=0;i<4096;i++)
 {
 rn_1[i] = (st_c[(i+ 0)%4096] << 13) | (st_c[(i+ 4)%4096] >> 19);
 rn_1[i] = smix(rn_1[i],sbox_i4);
 }
 for (std::size_t i=0;i<4096;i++)
 {
 rn_2[i] = (rn_1[(i+10)%4096] << 11) | (rn_1[(i+18)%4096] >> 21);
 rn_2[i] = smix(rn_2[i],sbox_i4);
 }
 for (std::size_t i=0;i<4096;i++)
 {
 rn_3[i] = (rn_2[(i+28)%4096] << 17) | (rn_2[(i+40)%4096] >> 15);
 rn_3[i] = smix(rn_3[i],sbox_i4);
 }
 for (std::size_t i=0;i<4096;i++)
 {
 rn_4[i] = (rn_3[(i+54)%4096] << 10) | (rn_3[(i+70)%4096] >> 22);
 rn_4[i] = smix(rn_4[i],sbox_i4);
 }
 for (std::size_t i=0;i<4096;i++)
 {
 state[i] = (rn_4[(i+88)%4096] << 16) | (rn_4[(i+108)%4096] >> 16);
 state[i] = smix(state[i],sbox_i5);
 }
 }
 // Get next random number
 result_type operator()()
 {
 // Elements from state to mix
 p1 = (counter+ 0)%4096; p2 = (counter+ 8)%4096;
 p3 = (counter+ 18)%4096; p4 = (counter+ 30)%4096;
 p5 = (counter+ 44)%4096; p6 = (counter+ 60)%4096;
 p7 = (counter+ 78)%4096; p8 = (counter+ 98)%4096;
 p9 = (counter+120)%4096;
 // Start mixing
 tmp_11 = (state[p1] << rs1) | (state[p2] >> (32-rs1));
 tmp_12 = (state[p3] << rs2) | (state[p4] >> (32-rs2));
 tmp_13 = (state[p5] << rs3) | (state[p6] >> (32-rs3));
 tmp_14 = (state[p7] << rs4) | (state[p8] >> (32-rs4));
 tmp_21 = ((tmp_11 << rm1) | (tmp_12 >> (32-rm1)));
 tmp_22 = ((tmp_13 << rm2) | (tmp_14 >> (32-rm2)));
 tmp_3 = (tmp_21 & m) | (tmp_22 & m_i);
 tmp_4 = rot_r(state[p9] ^ tmp_3,rf);
 state[p9] = smix(tmp_4,sbox_r);
 // For jumping to next elements
 counter++;
 return state[p9];
 }
 // Discard z elements
 void discard(unsigned long long z)
 {
 for (unsigned long long i=0;i<z;i++)
 (*this)();
 }
 // Minimum and maximum
 static constexpr result_type min()
 {
 return std::numeric_limits<result_type>::min();
 }
 static constexpr result_type max()
 {
 return std::numeric_limits<result_type>::max();
 }
 // Comparision
 friend bool operator==(
 const big_wheel_engine<m,rs1,rs2,rs3,rs4,rm1,rm2,rf>& lhs,
 const big_wheel_engine<m,rs1,rs2,rs3,rs4,rm1,rm2,rf>& rhs)
 {
 if (lhs.counter != rhs.counter)
 return false;
 for (std::size_t i=0;i<state_size;i++)
 {
 if (lhs.state[i] != rhs.state[i])
 return false;
 else
 continue;
 }
 return true;
 }
 friend bool operator!=(
 const big_wheel_engine<m,rs1,rs2,rs3,rs4,rm1,rm2,rf>& lhs,
 const big_wheel_engine<m,rs1,rs2,rs3,rs4,rm1,rm2,rf>& rhs)
 {
 return !(lhs == rhs);
 }
 // Stream ops
 template <class CharT,class Traits>
 friend std::basic_ostream<CharT,Traits>& operator<<(
 std::basic_ostream<CharT,Traits>& os,
 const big_wheel_engine<m,rs1,rs2,rs3,rs4,rm1,rm2,rf>& bw)
 {
 os << bw.counter;
 for (std::size_t i=0;i<bw.state_size;i++)
 os << ' ' << bw.state[i];
 return os;
 }
 template <class CharT,class Traits>
 friend std::basic_istream<CharT,Traits>& operator>>(
 std::basic_istream<CharT,Traits>& is,
 const big_wheel_engine<m,rs1,rs2,rs3,rs4,rm1,rm2,rf>& bw)
 {
 is >> bw.counter >> std::ws;
 for (std::size_t i=0;i<bw.state_size;i++)
 is >> bw.state[i] >> std::ws;
 return is;
 }
 private:
 // State
 result_type state[4096];
 // Reverse mask
 result_type m_i;
 // Temporal variables
 result_type tmp_11, tmp_12, tmp_13, tmp_14;
 result_type tmp_21, tmp_22;
 result_type tmp_3;
 result_type tmp_4;
 // For picking elements from state
 std::size_t counter; // Counter
 std::size_t p1, p2, p3, p4, p5, p6, p7, p8, p9; // Positions
 // S-boxes:
 // - sbox_i{1,2,3,4,5}: For initialization
 // -- i{1,2,3}: Seed expansion
 // -- i{4,5}: Mixing
 // - sbox_r: For generating the next random number
 static std::uint32_t sbox_i1[256];
 static std::uint32_t sbox_i2[256];
 static std::uint32_t sbox_i3[256];
 static std::uint32_t sbox_i4[256];
 static std::uint32_t sbox_i5[256];
 static std::uint32_t sbox_r[256];
 // Left/right bit rotation
 template <typename D> D rot_l(D x,unsigned r) { return ((x << r) | (x >> (CHAR_BIT*sizeof(D)-r))); }
 template <typename D> D rot_r(D x,unsigned r) { return ((x >> r) | (x << (CHAR_BIT*sizeof(D)-r))); }
 // Mix x with an S-box sb
 result_type smix(result_type x,std::uint32_t *sb)
 {
 return result_type(
 (sb[(x >> 24)&0xff] << 24)|
 (sb[(x >> 16)&0xff] << 16)|
 (sb[(x >> 8)&0xff] << 8)|
 (sb[(x >> 0)&0xff] << 0)
 );
 }
};
// The S-boxes (randomly generated)
template <
 std::uint32_t m,
 std::size_t rs1,std::size_t rs2,std::size_t rs3,std::size_t rs4,
 std::size_t rm1,std::size_t rm2,
 std::size_t rf
>
std::uint32_t big_wheel_engine<m,rs1,rs2,rs3,rs4,rm1,rm2,rf>::sbox_i1[256] = {
 0x36,0x09,0x2d,0x7f,0x28,0x8a,0x51,0x81,0x2b,0x69,0x90,0x58,0xaf,0x21,0xda,0x71,
 0x8d,0x4f,0xb9,0xa4,0x91,0xbc,0xde,0x3f,0xad,0x6e,0xbd,0x5a,0xbb,0x92,0x1a,0x5d,
 0x49,0x6a,0x94,0x0a,0xaa,0x30,0xf3,0x73,0x37,0xfa,0xcf,0x7e,0x76,0xdf,0xb1,0x56,
 0xd5,0x78,0x12,0x08,0xe3,0xf1,0x07,0x9d,0x98,0xc5,0xe6,0xcd,0x34,0xf8,0xfd,0xd0,
 0x2f,0x84,0x66,0x4b,0x3c,0x4e,0xc6,0x63,0x99,0x57,0x60,0x54,0x97,0x9f,0x39,0xf0,
 0x59,0xa1,0xcb,0x18,0xed,0x0b,0xd1,0xdb,0x44,0x1e,0x0f,0x72,0x2c,0x5b,0x27,0xc9,
 0x55,0x1c,0x41,0xfc,0x11,0x50,0x4d,0x9b,0x88,0xea,0xd6,0xc8,0x45,0xa2,0xcc,0x8c,
 0xa6,0x61,0xb5,0x01,0xb4,0x6d,0x93,0xc3,0xab,0xdd,0x9e,0x6c,0x38,0xe7,0x47,0xe9,
 0x70,0x31,0xe8,0x03,0x52,0x75,0xe0,0xb6,0x53,0x40,0x46,0x0e,0x3a,0x5e,0x79,0x5c,
 0xc2,0x7c,0x7a,0x6f,0xe1,0xd4,0xb8,0x85,0xee,0x22,0xb0,0x80,0x42,0xf6,0xec,0x24,
 0xe2,0x9a,0x14,0x8f,0xac,0x15,0x10,0x82,0xa3,0x20,0x3b,0xf2,0x68,0x4c,0xd3,0x9c,
 0xca,0x48,0xfe,0x00,0x04,0xc0,0x0c,0xba,0xb7,0xc7,0x3e,0xff,0xd8,0xfb,0x6b,0x65,
 0x26,0x17,0x8b,0xf9,0x02,0xb2,0xe4,0xbf,0x8e,0xc1,0x62,0xd7,0x1d,0x77,0x2e,0x96,
 0x25,0x33,0x5f,0xef,0x35,0x43,0xf5,0xf7,0xa0,0x64,0x05,0xc4,0x1b,0xd2,0x0d,0x16,
 0xdc,0x86,0x7d,0xeb,0x87,0xa5,0x83,0xf4,0x23,0xd9,0x4a,0x95,0x1f,0x7b,0x74,0xa9,
 0x2a,0x13,0x06,0x29,0xae,0xce,0x89,0xa7,0x3d,0x19,0x67,0xa8,0x32,0xb3,0xbe,0xe5
};
template <
 std::uint32_t m,
 std::size_t rs1,std::size_t rs2,std::size_t rs3,std::size_t rs4,
 std::size_t rm1,std::size_t rm2,
 std::size_t rf
>
std::uint32_t big_wheel_engine<m,rs1,rs2,rs3,rs4,rm1,rm2,rf>::sbox_i2[256] = {
 0x2d,0xa9,0x38,0x42,0x24,0xd1,0x55,0x8a,0x3c,0x22,0x84,0x7a,0xd6,0xbc,0x2a,0xb7,
 0x99,0xd7,0xe6,0x2f,0x9e,0x1f,0x1e,0x0a,0xd0,0x97,0x45,0xe9,0x90,0x8f,0xe7,0x35,
 0x40,0x6e,0x9a,0xeb,0x28,0x75,0x03,0x5a,0x29,0x34,0x6b,0x17,0xaa,0x4a,0x6d,0x00,
 0x36,0xd8,0x71,0x9b,0xf1,0x25,0x5b,0x46,0x12,0x69,0x9d,0xb9,0x66,0xf7,0x74,0x95,
 0x61,0x48,0x1a,0x23,0xcb,0x5c,0xcd,0x70,0x18,0xb3,0x76,0x0e,0x37,0xc8,0x3a,0x92,
 0x59,0xdd,0x26,0xaf,0xb0,0x8e,0x44,0xce,0xe3,0xcf,0x4f,0x0d,0x4c,0x19,0x49,0xca,
 0x07,0xc7,0xd2,0x72,0x7b,0x3e,0xc5,0xc1,0xc2,0x91,0x86,0x4b,0x57,0xe0,0x02,0xb6,
 0xf4,0x7e,0x39,0xd9,0x20,0x65,0xac,0x87,0xb8,0x33,0xe5,0xb2,0xa1,0x96,0x64,0xa6,
 0x2b,0x01,0x85,0x50,0xed,0x27,0xff,0x62,0x30,0x80,0xc9,0xa4,0x31,0xde,0x21,0x7c,
 0xdc,0xf6,0xae,0x52,0x8c,0x51,0x32,0xbf,0x0c,0xa8,0xb4,0xe2,0xe8,0x94,0x14,0x43,
 0xd3,0xdf,0x58,0x77,0x15,0x0b,0xad,0xab,0x7f,0xbd,0xbb,0xf5,0xf9,0x1c,0x5d,0x04,
 0xdb,0x54,0xa3,0x47,0x2e,0xfd,0x67,0x53,0x60,0xee,0x6a,0xa2,0xd4,0xfe,0x8b,0x82,
 0x4e,0xc6,0xc4,0x98,0xb5,0xbe,0x06,0x3d,0x13,0xf8,0xe1,0x3b,0x81,0xa7,0x79,0x8d,
 0xf2,0xb1,0xd5,0x2c,0xc3,0x1d,0x1b,0x73,0x9f,0x6f,0xba,0xcc,0xfc,0xea,0x09,0xa0,
 0xec,0x10,0xc0,0x4d,0x11,0x0f,0x88,0x08,0x5f,0x56,0xf0,0x5e,0x63,0xfa,0x05,0x78,
 0x3f,0x7d,0x41,0xfb,0x68,0xa5,0x89,0xf3,0xef,0x83,0x6c,0x16,0xe4,0xda,0x9c,0x93
};
template <
 std::uint32_t m,
 std::size_t rs1,std::size_t rs2,std::size_t rs3,std::size_t rs4,
 std::size_t rm1,std::size_t rm2,
 std::size_t rf
>
std::uint32_t big_wheel_engine<m,rs1,rs2,rs3,rs4,rm1,rm2,rf>::sbox_i3[256] = {
 0x06,0x38,0x28,0x6a,0x51,0xb1,0x0a,0x55,0xf9,0xa3,0x73,0x11,0xed,0x3a,0x43,0x83,
 0x54,0x74,0x89,0xe0,0x85,0x10,0xcc,0x6f,0x64,0x2b,0x71,0xc1,0xa4,0x5b,0x42,0x92,
 0xc8,0x99,0x4a,0x8f,0xda,0xbc,0xb7,0x5c,0x81,0x16,0x52,0x50,0x82,0x61,0x3f,0xf5,
 0x1b,0x12,0x90,0x98,0x02,0x05,0x8d,0xa6,0x5a,0xbf,0xe9,0xa7,0x24,0x56,0xfd,0x40,
 0x33,0xe6,0x17,0x72,0xf8,0x18,0xd1,0xac,0xa2,0xfe,0x0f,0x88,0x39,0x09,0x6c,0x0e,
 0x67,0x7c,0x84,0xeb,0x8a,0xba,0x63,0x03,0x75,0xcd,0x93,0x97,0xab,0x6d,0x76,0xdc,
 0xad,0x53,0xb9,0x8e,0xd2,0x4b,0xf2,0xa5,0x4f,0x77,0x7b,0xd7,0x91,0x9f,0x20,0x70,
 0xcb,0xc3,0xb4,0x9c,0xa9,0xd8,0x9d,0xb5,0xca,0x7a,0x1a,0xc4,0x0d,0x19,0xc6,0x30,
 0xf0,0x5d,0x65,0x79,0x7e,0x2c,0x31,0x13,0x9e,0xbd,0x44,0xbe,0x57,0xb2,0xc5,0x80,
 0xe3,0x27,0xd0,0xec,0x48,0x4e,0xae,0x86,0x8c,0xf4,0x3b,0xfb,0xb3,0xa8,0x69,0xb8,
 0x1f,0x0b,0xcf,0x4c,0x45,0xaa,0xdf,0xf7,0xe8,0x68,0xef,0x58,0x34,0x7d,0xbb,0xf6,
 0x2e,0x22,0xdb,0x35,0xd6,0x6b,0x1d,0x5e,0xc0,0xa1,0x3d,0xfa,0xb0,0xff,0x60,0x2a,
 0x41,0x5f,0xc7,0xd5,0x4d,0x1e,0x2f,0x01,0xea,0x49,0xdd,0x26,0xb6,0x78,0xde,0x25,
 0x36,0xf1,0x0c,0xe1,0x04,0x32,0x14,0x3e,0x87,0x6e,0x96,0xe5,0xce,0x95,0xe2,0xaf,
 0xf3,0xa0,0x46,0x37,0x15,0x1c,0x00,0x2d,0x7f,0xe4,0x62,0xee,0x3c,0x08,0x47,0x9b,
 0x66,0xfc,0x21,0xd3,0x94,0x59,0x8b,0x07,0xd4,0x29,0xc9,0x23,0xc2,0x9a,0xe7,0xd9
};
template <
 std::uint32_t m,
 std::size_t rs1,std::size_t rs2,std::size_t rs3,std::size_t rs4,
 std::size_t rm1,std::size_t rm2,
 std::size_t rf
>
std::uint32_t big_wheel_engine<m,rs1,rs2,rs3,rs4,rm1,rm2,rf>::sbox_i4[256] = {
 0x1a,0x6b,0xfd,0xd0,0x0d,0x11,0x50,0x9e,0x1f,0xd3,0x95,0xca,0xbc,0x0c,0x80,0x29,
 0x47,0xa4,0x3f,0x3c,0x2c,0x76,0x3a,0x91,0x60,0x8d,0x39,0x9b,0x7e,0x6d,0x1b,0x12,
 0x7f,0xf3,0xab,0xb6,0x63,0x7b,0x53,0x31,0xd6,0x4d,0xbb,0xcb,0x56,0xda,0x15,0xbe,
 0x67,0xfb,0xa9,0xdb,0x69,0x97,0x42,0xf1,0x5c,0xc5,0xb3,0x34,0xa3,0x6f,0x51,0x4f,
 0xc2,0x70,0xf8,0x52,0x92,0x81,0xd1,0x0f,0xba,0x3d,0x4c,0xb0,0x48,0xff,0xd2,0x7d,
 0xaf,0x1c,0x65,0xe8,0x4b,0x5f,0x94,0xcc,0x84,0x09,0x64,0x2e,0xb1,0x77,0x96,0xea,
 0x79,0x24,0xb8,0x0e,0x90,0x33,0xf9,0xb2,0x57,0x28,0xe9,0x04,0x03,0xa1,0x17,0xe7,
 0x82,0xaa,0x19,0x83,0x43,0x9a,0xfe,0xfc,0xc6,0xef,0x1e,0xa6,0x5b,0xbd,0xdd,0x08,
 0xb9,0x16,0x9d,0xf2,0x20,0x78,0x06,0xae,0xf0,0xc3,0x59,0x45,0xb7,0xcd,0xa8,0x05,
 0xeb,0x26,0x02,0xf6,0x6e,0x8a,0x14,0x98,0x0b,0x25,0xdc,0x7c,0xc1,0x6a,0x86,0x71,
 0xed,0x68,0xec,0x85,0x35,0x87,0x18,0x93,0x46,0x23,0xd8,0x6c,0xbf,0x61,0x4a,0xac,
 0xcf,0xe3,0x1d,0xb5,0x5e,0xa0,0x9c,0xf7,0x55,0x73,0x2a,0xad,0x00,0xc8,0x75,0xf5,
 0xb4,0x8f,0xc7,0x36,0xe5,0x41,0xde,0xee,0xa7,0x9f,0xce,0xd5,0x21,0x8e,0xa5,0x32,
 0x2b,0x54,0x99,0x7a,0xe0,0x58,0xfa,0x30,0x5d,0x44,0x8b,0x8c,0x4e,0x72,0x66,0xc4,
 0x40,0x2f,0xe6,0x5a,0x2d,0x37,0x01,0x27,0xa2,0xe1,0x22,0x10,0xd7,0x3e,0x89,0x74,
 0xc0,0x38,0x88,0xd9,0x62,0x49,0xdf,0x3b,0xd4,0x0a,0x13,0xf4,0xe4,0x07,0xe2,0xc9
};
template <
 std::uint32_t m,
 std::size_t rs1,std::size_t rs2,std::size_t rs3,std::size_t rs4,
 std::size_t rm1,std::size_t rm2,
 std::size_t rf
>
std::uint32_t big_wheel_engine<m,rs1,rs2,rs3,rs4,rm1,rm2,rf>::sbox_i5[256] = {
 0x28,0x21,0xde,0xd7,0x8f,0x86,0xba,0xb3,0xeb,0xe2,0x7c,0x75,0x2d,0x24,0x18,0x11,
 0x49,0x40,0x46,0x4f,0x17,0x1e,0x22,0x2b,0x73,0x7a,0xe4,0xed,0xb5,0xbc,0x80,0x89,
 0xd1,0xd8,0x4b,0x42,0x1a,0x13,0x2f,0x26,0x7e,0x77,0xe9,0xe0,0xb8,0xb1,0x8d,0x84,
 0xdc,0xd5,0xd3,0xda,0x82,0x8b,0xb7,0xbe,0xe6,0xef,0x71,0x78,0x20,0x29,0x15,0x1c,
 0x44,0x4d,0x98,0x91,0xc9,0xc0,0xfc,0xf5,0xad,0xa4,0x3a,0x33,0x6b,0x62,0x5e,0x57,
 0x0f,0x06,0x00,0x09,0x51,0x58,0x64,0x6d,0x35,0x3c,0xa2,0xab,0xf3,0xfa,0xc6,0xcf,
 0x97,0x9e,0x0d,0x04,0x5c,0x55,0x69,0x60,0x38,0x31,0xaf,0xa6,0xfe,0xf7,0xcb,0xc2,
 0x9a,0x93,0x95,0x9c,0xc4,0xcd,0xf1,0xf8,0xa0,0xa9,0x37,0x3e,0x66,0x6f,0x53,0x5a,
 0x02,0x0b,0xf4,0xfd,0xa5,0xac,0x90,0x99,0xc1,0xc8,0x56,0x5f,0x07,0x0e,0x32,0x3b,
 0x63,0x6a,0x6c,0x65,0x3d,0x34,0x08,0x01,0x59,0x50,0xce,0xc7,0x9f,0x96,0xaa,0xa3,
 0xfb,0xf2,0x61,0x68,0x30,0x39,0x05,0x0c,0x54,0x5d,0xc3,0xca,0x92,0x9b,0xa7,0xae,
 0xf6,0xff,0xf9,0xf0,0xa8,0xa1,0x9d,0x94,0xcc,0xc5,0x5b,0x52,0x0a,0x03,0x3f,0x36,
 0x6e,0x67,0xb2,0xbb,0xe3,0xea,0xd6,0xdf,0x87,0x8e,0x10,0x19,0x41,0x48,0x74,0x7d,
 0x25,0x2c,0x2a,0x23,0x7b,0x72,0x4e,0x47,0x1f,0x16,0x88,0x81,0xd9,0xd0,0xec,0xe5,
 0xbd,0xb4,0x27,0x2e,0x76,0x7f,0x43,0x4a,0x12,0x1b,0x85,0x8c,0xd4,0xdd,0xe1,0xe8,
 0xb0,0xb9,0xbf,0xb6,0xee,0xe7,0xdb,0xd2,0x8a,0x83,0x1d,0x14,0x4c,0x45,0x79,0x70
};
template <
 std::uint32_t m,
 std::size_t rs1,std::size_t rs2,std::size_t rs3,std::size_t rs4,
 std::size_t rm1,std::size_t rm2,
 std::size_t rf
>
std::uint32_t big_wheel_engine<m,rs1,rs2,rs3,rs4,rm1,rm2,rf>::sbox_r[256] = {
 0xe9,0xe0,0xdc,0xd5,0x8d,0x84,0x1a,0x13,0x4b,0x42,0x7e,0x77,0x2f,0x26,0x20,0x29,
 0x71,0x78,0x44,0x4d,0x15,0x1c,0x82,0x8b,0xd3,0xda,0xe6,0xef,0xb7,0xbe,0x6b,0x62,
 0x3a,0x33,0x0f,0x06,0x5e,0x57,0xc9,0xc0,0x98,0x91,0xad,0xa4,0xfc,0xf5,0xf3,0xfa,
 0xa2,0xab,0x97,0x9e,0xc6,0xcf,0x51,0x58,0x00,0x09,0x35,0x3c,0x64,0x6d,0xfe,0xf7,
 0xaf,0xa6,0x9a,0x93,0xcb,0xc2,0x5c,0x55,0x0d,0x04,0x38,0x31,0x69,0x60,0x66,0x6f,
 0x37,0x3e,0x02,0x0b,0x53,0x5a,0xc4,0xcd,0x95,0x9c,0xa0,0xa9,0xf1,0xf8,0x07,0x0e,
 0x56,0x5f,0x63,0x6a,0x32,0x3b,0xa5,0xac,0xf4,0xfd,0xc1,0xc8,0x90,0x99,0x9f,0x96,
 0xce,0xc7,0xfb,0xf2,0xaa,0xa3,0x3d,0x34,0x6c,0x65,0x59,0x50,0x08,0x01,0x92,0x9b,
 0xc3,0xca,0xf6,0xff,0xa7,0xae,0x30,0x39,0x61,0x68,0x54,0x5d,0x05,0x0c,0x0a,0x03,
 0x5b,0x52,0x6e,0x67,0x3f,0x36,0xa8,0xa1,0xf9,0xf0,0xcc,0xc5,0x9d,0x94,0x41,0x48,
 0x10,0x19,0x25,0x2c,0x74,0x7d,0xe3,0xea,0xb2,0xbb,0x87,0x8e,0xd6,0xdf,0xd9,0xd0,
 0x88,0x81,0xbd,0xb4,0xec,0xe5,0x7b,0x72,0x2a,0x23,0x1f,0x16,0x4e,0x47,0xd4,0xdd,
 0x85,0x8c,0xb0,0xb9,0xe1,0xe8,0x76,0x7f,0x27,0x2e,0x12,0x1b,0x43,0x4a,0x4c,0x45,
 0x1d,0x14,0x28,0x21,0x79,0x70,0xee,0xe7,0xbf,0xb6,0x8a,0x83,0xdb,0xd2,0x2d,0x24,
 0x7c,0x75,0x49,0x40,0x18,0x11,0x8f,0x86,0xde,0xd7,0xeb,0xe2,0xba,0xb3,0xb5,0xbc,
 0xe4,0xed,0xd1,0xd8,0x80,0x89,0x17,0x1e,0x46,0x4f,0x73,0x7a,0x22,0x2b,0xb8,0xb1
};
#endif

So, I have some specific questions about it:

How can I improve the performance of the program?
How can the use of multithreading be improved?

Question 2

Parallelism:
There are two major problems with your code

Your code doesn't run in parallel at all! Locking the mutex at the beginning of calculate_pi means you are always only run one instance of calculate_pi at the same time.
You are passing a copy of the same random number engine to each thread. As this is only a pseudo rando number generator, all threads will operate on the same sequence of numbers (you are doing the same work multiple times).

Solution:

The only thing you share among the codes is pcnt just define it as an atomic (std::atomic<std::uint32_t>). Even Better: Don't share anything: Let each task use it's own hit counter and sum them up at the end (easiest way to do this would be to use std::async).
Create and seed a separate random number engine for each thread

On my 4 Core machine with VS2015U3, this reduced the execution time from 120 to 20 Seconds and reduced the error roughly by a factor of two.

Small style tips:

Don't define variables before you use them. e.g.:
- gen_pi should be defined (as a constant) just when you need to store the result of approximate_pi.
- p1..9 and tmp_x should be (constant) function local variables
Const correctness: A lot of variables are only initialized and never changed afterwards - make them const
range based for: You could use more range based for loops (e.g. when joining the threads, creating the seed, or some loops that go over the state array of the random number engine.

Question 3

The idea of it was using the same PRNG for all the threads, so, is it a good idea to pass the generator by reference and put the lock guard before calling it? I mean, std::thread(calculate_pi<big_wheel_engine<>>,num_points_per_thread,std::ref(rnd)) for initialization and std::uint32_t rndx; { std::lock_guard<std::mutex> guard(mtx); rndx = rgen(); } ... for calling.

Question 4

@alvaroceballos1: The idea of it was using the same PRNG for all the threads. That is most likely a very, very bad Idea. If you want to parallelize your work efficiently YOU MUST NOT SHARE MUTABLE STATE. I haven't put your code through a profiler, but most likely, generating the random number is a costly operation. Serializing that work and adding the overhead of locking/unlocking a contested mutex will likely leave you with no better performance than running your program on a single core. But feel free to try (and report the results).

Question 5

Now I used std::async with separated random number engines as you suggested, and it works fine. Thanks!

MikeMB MikeMB 6035 silver badges13 bronze badges · Accepted Answer · 2016-12-25 00:51:03Z

Parallelism:
There are two major problems with your code

Your code doesn't run in parallel at all! Locking the mutex at the beginning of calculate_pi means you are always only run one instance of calculate_pi at the same time.
You are passing a copy of the same random number engine to each thread. As this is only a pseudo rando number generator, all threads will operate on the same sequence of numbers (you are doing the same work multiple times).

Solution:

The only thing you share among the codes is pcnt just define it as an atomic (std::atomic<std::uint32_t>). Even Better: Don't share anything: Let each task use it's own hit counter and sum them up at the end (easiest way to do this would be to use std::async).
Create and seed a separate random number engine for each thread

On my 4 Core machine with VS2015U3, this reduced the execution time from 120 to 20 Seconds and reduced the error roughly by a factor of two.

Small style tips:

Don't define variables before you use them. e.g.:
- gen_pi should be defined (as a constant) just when you need to store the result of approximate_pi.
- p1..9 and tmp_x should be (constant) function local variables
Const correctness: A lot of variables are only initialized and never changed afterwards - make them const
range based for: You could use more range based for loops (e.g. when joining the threads, creating the seed, or some loops that go over the state array of the random number engine.

The idea of it was using the same PRNG for all the threads, so, is it a good idea to pass the generator by reference and put the lock guard before calling it? I mean, std::thread(calculate_pi<big_wheel_engine<>>,num_points_per_thread,std::ref(rnd)) for initialization and std::uint32_t rndx; { std::lock_guard<std::mutex> guard(mtx); rndx = rgen(); } ... for calling.
@alvaroceballos1: The idea of it was using the same PRNG for all the threads. That is most likely a very, very bad Idea. If you want to parallelize your work efficiently YOU MUST NOT SHARE MUTABLE STATE. I haven't put your code through a profiler, but most likely, generating the random number is a costly operation. Serializing that work and adding the overhead of locking/unlocking a contested mutex will likely leave you with no better performance than running your program on a single core. But feel free to try (and report the results).
Now I used std::async with separated random number engines as you suggested, and it works fine. Thanks!

Stack Exchange Network

Multithreaded Monte Carlo pi approximation with own pseudorandom number generator

Code:

1 Answer 1

Your Answer

Sign up or log in

Post as a guest

Post as a guest

Linked

Hot Network Questions

Multithreaded Monte Carlo pi approximation with own pseudorandom number generator

Code:

1 Answer 1

Your Answer

Sign up or log in

Post as a guest

Post as a guest

Linked

Related

Hot Network Questions