In this exercise you will vectorize the computation of pi using the arctangent integral (pi is the integral of 4/(1+x*x) for x from 0 to 1).
- Start from the code used in the OpenMP course, from http://goo.gl/zI3WqS, or from the code below:
#include <limits>
#include <algorithm>
#include <type_traits>
#include <cstring>
#include <x86intrin.h>
#include <iostream>
#include <chrono>

/// Approximates pi by midpoint-rule integration of 4/(1+x^2) over [0,1].
///
/// @tparam Float     Floating-point type used for the step, the abscissa and
///                   the running sum.
/// @param  num_steps Number of integration intervals. Must be > 0: a
///                   non-positive value is declared unreachable, so passing
///                   one is undefined behavior.
/// @return step * sum converted to float (the return type is float for
///         every Float, including double).
template<typename Float>
float pi(int num_steps) {
  // Tell the compiler that num_steps <= 0 never happens.
  if (num_steps <= 0) __builtin_unreachable();

  const Float step = Float(1.0) / static_cast<Float>(num_steps);
  Float sum = 0;
  // num_steps = 4*(num_steps/4); // make sure is a multiple of 4
  for (int i = 0; i < num_steps; ++i) {
    // Midpoint of the i-th interval.
    const auto x = (Float(i) + Float(0.5)) * step;
    sum += Float(4.0) / (Float(1.0) + x * x);
  }
  return static_cast<float>(step * sum);
}

/// Runs pi<T>(num_steps) once and prints the result together with the
/// elapsed time.
///
/// The time is printed as the raw tick count of
/// std::chrono::high_resolution_clock's duration type.
template<typename T>
void go(int num_steps) {
  const auto start = std::chrono::high_resolution_clock::now();
  const auto res = pi<T>(num_steps);
  const auto total_time = std::chrono::high_resolution_clock::now() - start;
  std::cout << "pi = " << res << " in " << total_time.count() << '\n';
}

/// Times the scalar computation once with float and once with double.
int main() {
  constexpr int num_steps = 1000000;
  go<float>(num_steps);
  go<double>(num_steps);
  return 0;
}
- compile with c++ -O2 pi.cpp -fopt-info-vec -march=native
- Compile with -Ofast (also try -funroll-loops). Why does -O3 not vectorize the loop?
- Try to vectorize it yourself using native vector types:
// Native compiler vector type: 16 bytes, i.e. four packed floats.
typedef float __attribute__( ( vector_size( 16 ) ) ) float32x4_t;

// Vector with all four lanes set to zero (presumably intended as the initial
// value of a vector accumulator -- adjust as needed).
constexpr float32x4_t zero4 = {0.f, 0.f, 0.f, 0.f};

/// Explicit specialization of pi for the native 4-float vector type.
///
/// Exercise skeleton: the body is left for the reader to implement.
///
/// @param num_steps Number of integration intervals.
/// @return Currently a placeholder value; replace with the computed result.
template<>
float pi<float32x4_t>(int num_steps) {
  // fill in
  return 0.f;  // placeholder so the skeleton does not fall off the end of a non-void function
}