vectorize pi

In this exercise you will vectorize the computation of pi using the arctangent integral: pi = integral from 0 to 1 of 4/(1+x^2) dx

  1. start from the code used in the OpenMP course, or use the code below

     #include <algorithm>
     #include <cstring>
     #include <iostream>
     #include <limits>
     #include <type_traits>
     #include <x86intrin.h>
     /// Approximates pi by midpoint-rule integration of 4/(1+x^2) over [0,1].
     ///
     /// @tparam Float     floating-point type used for the step and the running sum.
     /// @param num_steps  number of integration intervals; must be > 0.
     /// @return step * sum, converted to float (the return type is float for
     ///         every Float).
     template<typename Float>
     float pi(int num_steps) {
       // Optimizer hint: non-positive step counts are declared impossible
       // (calling with num_steps <= 0 is undefined behaviour).
       if (num_steps <= 0) __builtin_unreachable();
       const Float step = Float(1.0) / static_cast<Float>(num_steps);
       Float sum = 0;
       // num_steps = 4*(num_steps/4);  // make sure is a multiple of 4
       for (int i = 0; i < num_steps; ++i) {
         // Midpoint of the i-th interval.
         const auto x = (Float(i) + Float(0.5)) * step;
         sum += Float(4.0) / (Float(1.0) + x * x);
       }
       return step * sum;
     }
     #include <chrono>
     /// Times a single call to pi<T>(num_steps) and prints the result.
     ///
     /// The elapsed time is printed as the raw tick count of
     /// std::chrono::high_resolution_clock::duration; no unit conversion is done.
     ///
     /// @tparam T          type forwarded to pi as its template argument.
     /// @param num_steps   number of integration steps passed to pi.
     template<typename T>
     void go(int num_steps) {
       using clock = std::chrono::high_resolution_clock;
       const auto start = clock::now();
       const auto res = pi<T>(num_steps);
       const auto total_time = clock::now() - start;
       std::cout << "pi = " << res << " in " << total_time.count() << std::endl;
     }
     int main ()
       auto total_time = std::chrono::high_resolution_clock::duration{};
       constexpr int num_steps = 1000000;
       return 0;
  2. compile with c++ -O2 pi.cpp -fopt-info-vec -march=native
  3. compile with -Ofast (try also -funroll-loops) (why does -O3 not vectorize?)
  4. try to vectorize it yourself using native vectors

     // Compiler vector extension: a 16-byte vector of float, i.e. 4 lanes.
     typedef float __attribute__( ( vector_size( 16 ) ) ) float32x4_t;
     // All-zero vector constant (one 0.f per lane).
     constexpr float32x4_t zero4= {0.f,0.f,0.f,0.f};
     float pi<float32x4_t>(int num_steps) {
        // fill in