In this exercise you will vectorize the computation of pi, obtained by numerically integrating 4/(1+x^2) over [0,1] (the arctangent integral).

Start from the code used in the OpenMP course, from http://goo.gl/zI3WqS, or from the code below.

 #include <limits>
 #include <algorithm>
 #include <type_traits>
 #include <cstring>
 #include <x86intrin.h>
    
 // Midpoint-rule approximation of pi as the integral of 4/(1+x^2) over [0,1].
 //
 // Float     : arithmetic type used for the step width and the running sum.
 // num_steps : number of sub-intervals; must be > 0. Non-positive values are
 //             declared unreachable to the optimizer, so passing one is
 //             undefined behaviour.
 // Returns the estimate converted to float, whatever Float is.
 template<typename Float>
 float pi(int num_steps) {
   if (!(num_steps > 0)) __builtin_unreachable();
   const Float width = Float(1.0) / Float(num_steps);
   Float acc = 0;
   // Hint for manual vectorization: rounding num_steps down to a multiple of 4
   // (num_steps = 4*(num_steps/4)) would remove the need for a remainder loop.
   for (int k = 0; k < num_steps; ++k) {
     // Evaluate the integrand at the midpoint of sub-interval k.
     const auto mid = (Float(k) + Float(0.5)) * width;
     acc += Float(4.0) / (Float(1.0) + mid * mid);
   }
   return width * acc;
 }
    
 #include<iostream>
 #include <chrono>
    
 // Runs pi<T>(num_steps) once and prints the result together with the elapsed
 // time, reported as the raw tick count of std::chrono::high_resolution_clock.
 template<typename T>
 void go(int num_steps) {
   using hr_clock = std::chrono::high_resolution_clock;
   const auto t0 = hr_clock::now();
   const auto estimate = pi<T>(num_steps);
   const auto elapsed = hr_clock::now() - t0;
   std::cout << "pi = " << estimate << " in " << elapsed.count() << std::endl;
 }
    
 // Entry point: runs the timed pi approximation once in single precision and
 // once in double precision, using the same number of integration steps.
 int main ()
 {
   constexpr int num_steps = 1000000;

   go<float>(num_steps);
   go<double>(num_steps);

   return 0;
 }

compile with c++ -O2 pi.cpp -fopt-info-vec -march=native
Then compile with -Ofast (also try -funroll-loops). Why does -O3 alone not vectorize the loop?

Try to vectorize it yourself using the compiler's native vector types:

 // Four packed floats (16 bytes), declared with the GCC/Clang vector extension.
 typedef float float32x4_t __attribute__((vector_size(16)));
 // All-zero vector, handy as the initial value of a vector accumulator.
 constexpr float32x4_t zero4 = {0.f, 0.f, 0.f, 0.f};

 // Hand-vectorized specialization: midpoint-rule approximation of pi that
 // processes four sub-intervals per iteration using float32x4_t lanes.
 //
 // num_steps : number of sub-intervals; must be > 0 (same contract as the
 //             primary template). It does not need to be a multiple of 4:
 //             the leftover 0-3 steps are handled by a scalar tail loop.
 // Returns the estimate as a float. Because the sum is accumulated in four
 // independent partial sums, the rounding differs slightly from pi<float>.
 template<>
 float pi<float32x4_t>(int num_steps) {
   if (num_steps <= 0) __builtin_unreachable();
   const float step = 1.0f / float(num_steps);
   const float32x4_t step4 = {step, step, step, step};
   const float32x4_t one4 = {1.f, 1.f, 1.f, 1.f};
   const float32x4_t four4 = {4.f, 4.f, 4.f, 4.f};
   // Midpoint offsets of the four consecutive sub-intervals handled per pass.
   const float32x4_t mid4 = {0.5f, 1.5f, 2.5f, 3.5f};

   float32x4_t sum4 = zero4;
   // Largest multiple of 4 not exceeding num_steps.
   const int vec_end = 4 * (num_steps / 4);
   for (int i = 0; i < vec_end; i += 4) {
     const float base = float(i);
     const float32x4_t base4 = {base, base, base, base};
     const float32x4_t x = (base4 + mid4) * step4;
     sum4 += four4 / (one4 + x * x);
   }

   // Horizontal reduction of the four partial sums.
   float sum = (sum4[0] + sum4[1]) + (sum4[2] + sum4[3]);

   // Scalar tail for the remaining num_steps % 4 sub-intervals.
   for (int i = vec_end; i < num_steps; ++i) {
     const float x = (float(i) + 0.5f) * step;
     sum += 4.0f / (1.0f + x * x);
   }
   return step * sum;
 }

vectorize pi