/* HPCtest.cpp Implements the Edinburgh HPC Masters test (C) 2008 Niall Douglas Created: 7th Mar 2008 */ /* NOTE: All documentation is in doxygen format This is an admission test, so I must admit to showing off: Q1: Generically implemented via runtime and via C++ metaprogramming. You'll need GCC to run the metaprogramming as MSVC isn't up to it yet. Q2: Generically implemented via runtime. Q3: Uses binary search to find N(min), generically implemented via runtime and via C++ metaprogramming. I thought something more challenging might be in order, so I have additionally implemented generic C++ and multithreaded SSE2 versions of a Bailey-Borwein-Plouffe (BBP) implementation PI finder. Admittedly, it doesn't come close to beating http://gmplib.org/pi-with-gmp.html, but it is intended as a demonstration of my parallel programming abilities. I am particularly proud of getting the modular exponentiation function (pow16modSSE2) to calculate two values in parallel efficiently. All questions reuse an identical C++ metaprogramming test framework to demonstrate generic programming skills. A nanosecond timer is used for timing and the program will run on POSIX-compatible and Windows. The code was tested on 32 bit Windows XP, Linux and Apple Mac OS X using GCC v4.1.3 and MSVC8. */ /* I use C-style printf() instead of C++ iostreams as the latter can throw exceptions when you really don't want them to. While less 'pure', printf() tends to default as more mission-critical safe. */ #include #include #include #include #include //! The proper value of PI static const double PI_VAL=3.1415926535897932384626433832795; //! A metaprogramming compatible abs() #define myabs(v) (((v)<0) ? -(v) : (v)) // For hires timing function #ifdef WIN32 #define WIN32_LEAN_AND_MEAN #include #else #include #endif #ifdef _MSC_VER typedef unsigned __int64 ulonglong; #else typedef unsigned long long ulonglong; #endif //! Really accurate time function static ulonglong GetNsCount() { #ifdef WIN32 static LARGE_INTEGER ticksPerSec; static double scalefactor; // It's faster to use the FPU than a 128 bit MulDiv implementation // though this would change if SSE2 became more prevalent if(!scalefactor) { if(QueryPerformanceFrequency(&ticksPerSec)) scalefactor=ticksPerSec.QuadPart/1000000000.0; else scalefactor=1; } LARGE_INTEGER val; if(!QueryPerformanceCounter(&val)) return (ulonglong) GetTickCount() * 1000000; return (ulonglong) (val.QuadPart/scalefactor); #else // It's only as good as POSIX can make it, which is pretty good on most // x86 based platforms which have a RTDSC struct timeval tv; if(-1==gettimeofday((::timeval *) &tv, 0)) return 0; return ((ulonglong) tv.tv_sec*1000000000LL)+tv.tv_usec*1000; #endif } //! Return value template struct retvalue { typedef value VALUE; static const int ITER=iter; value v; //!< The value value e; //!< The error diff int iterations; //!< The iteration count retvalue (const value &_v, const value &_e, int _iterations=iter) : v(_v), e(_e), iterations(_iterations) { } }; //! Contains the runtime PI calculation routines namespace PICalcR { static int totaliterations; /*! Computes an approximation to PI by \em iter iterations. Uses a template parameter for \em iter to give compiler maximum possible scope for loop unrolling & optimisation */ template retvalue q1() throw() { value v=0; for(int i=1; i<=N; i++, totaliterations++) { value t=(i-0.5)/N; v+=1/(1+t*t); } v*=(4.0/N); return retvalue(v, myabs(v-PI_VAL)); } /*! Computes an approximation to PI to \em prec accuracy. This is an extremely inefficient implementation. Complexity = ~O(n^2) */ template retvalue q2() throw() { static const value prec=pow(10.0, _prec); retvalue ret(0, prec+1); for(int n=1; ret.e>prec; n++, ret.iterations++) { ret.v=0; for(int i=1; i<=n; i++, totaliterations++) { value t=(i-0.5)/n; ret.v+=1/(1+t*t); } ret.v*=(4.0/n); ret.e=myabs(ret.v-PI_VAL); } return ret; } /*! Computes an approximation to PI to \em prec accuracy. Performing a regression on the error values yields y=0.083*pow(x, -2) That means x=pow(0.083, 1/2)/pow(y, 1/2) == sqrt(0.083)/sqrt(y) This is NOT exact and for higher iterations it becomes inaccurate. I deliberately chopped off the regression so it'll always come under the iterations required. Complexity = O(n) */ template retvalue q3_1() throw() { static const value prec=pow(10.0, _prec); retvalue ret(0, prec+1, (int)(0.2880972058177587/sqrt(prec))-1); for(int n=ret.iterations+1; ret.e>prec; n++, ret.iterations++) { ret.v=0; for(int i=1; i<=n; i++, totaliterations++) { value t=(i-0.5)/n; ret.v+=1/(1+t*t); } ret.v*=(4.0/n); ret.e=myabs(ret.v-PI_VAL); } return ret; } /*! Computes an approximation to PI to \em prec accuracy. I figured you'd probably think I was cheating last time so here's a heuristically optimising solution. It simply doubles or halves the iteration increment until it zeros in on the correct value a bit like a quick sort or binary search. I have no idea if that's optimal though (it's an impractical algorithm for calculating PI anyway). Complexity = O(log n) */ template retvalue q3_2() throw() { bool growing=true; int inc=1; static const value prec=pow(10.0, _prec); retvalue ret(0, prec+1); for(int n=ret.iterations=1; ; n+=inc) { ret.v=0; for(int i=1; i<=n; i++, totaliterations++) { value t=(i-0.5)/n; ret.v+=1/(1+t*t); } ret.v*=(4.0/n); ret.e=ret.v-PI_VAL; if(growing) { if(ret.e>prec) { inc<<=1; ret.iterations=n; } else { inc>>=1; n=ret.iterations; growing=false; } } else { if(ret.e>prec) ret.iterations=n; else { if(!(inc>>=1)) { ret.iterations=n; break; } n=ret.iterations; } } } return ret; } } // namespace //! Contains the static PI calculation routines namespace PICalcS { namespace Impl { template struct calc : public calc { private: static const value t=(i-0.5)/N; public: static const value v=calc::v+1/(1+t*t); }; template struct calc { static const value v=0; }; } /*! Computes an approximation to PI by \em iter iterations via C++ metaprogramming (computes by programming the compiler rather than using the compiler to program) */ template retvalue q1() throw() { static const value v=Impl::calc::v*(4.0/N); static const value e=myabs(v-PI_VAL); return retvalue(v, e); } } // namespace // I put the PI finder into a separate file #include "GenPI.cpp" //! The generic test runner template static void RunTest(retvaluetype (*q)()) throw() { PICalcR::totaliterations=0; ulonglong start=GetNsCount(); retvaluetype ret=q(); ulonglong end=GetNsCount(); printf("%d iterations (precision %s) ran in %luns (totaliter=%d),\n v=%.16f err=%.16f\n\n", ret.iterations, typeid(typename retvaluetype::VALUE).name(), (unsigned long)(end-start), PICalcR::totaliterations, ret.v, ret.e); } static void printf2(FILE *h, const char *fmt, ...) throw() { va_list args; va_start(args, fmt); vfprintf(h, fmt, args); vfprintf(stdout, fmt, args); } static void DoPI(void (*q)(std::vector &buffer, int digits), const char *filename, const int digits) throw() { FILE *oh=fopen(filename, "wt"); printf2(oh, "%d digits of pi in base 16 ...", digits); std::vector buffer; ulonglong start=GetNsCount(); q(buffer, digits); ulonglong end=GetNsCount(); printf2(oh, " calculated in %f seconds\n", (end-start)/1000000000.0); fprintf(oh, "\npi ~= 3.\n"); for(unsigned n=0; n(&PICalcR::q1); RunTest<>(&PICalcR::q1); RunTest<>(&PICalcR::q1); RunTest<>(&PICalcR::q1); RunTest<>(&PICalcR::q1); RunTest<>(&PICalcR::q1); #ifndef _MSC_VER /* I'm afraid current versions of MSVC can't handle static const metaprogramming yet */ printf("\nQuestion 1 (metaprogram):\n"); RunTest<>(&PICalcS::q1); RunTest<>(&PICalcS::q1); RunTest<>(&PICalcS::q1); RunTest<>(&PICalcS::q1); RunTest<>(&PICalcS::q1); //RunTest<>(&PICalcS::q1); // Exceeds GCC's nested template limit #endif printf("\nQuestion 2:\n"); RunTest<>(&PICalcR::q2); printf("\nQuestion 3 (cheating solution):\n"); RunTest<>(&PICalcR::q3_1); printf("\nQuestion 3 (heuristic solution):\n"); RunTest<>(&PICalcR::q3_2); #if 0 // Quick test to see that both match RunTest<>(&PICalcR::q2 ); RunTest<>(&PICalcR::q3_1); RunTest<>(&PICalcR::q3_2); RunTest<>(&PICalcR::q2 ); RunTest<>(&PICalcR::q3_1); RunTest<>(&PICalcR::q3_2); RunTest<>(&PICalcR::q2 ); RunTest<>(&PICalcR::q3_1); RunTest<>(&PICalcR::q3_2); #endif static const int digits=50000; printf("\n\nPortable C++ implementation:\n"); DoPI(PICalcG::calc, "pi.txt", digits); #ifdef HAVE_SSE2 printf("\n\nOpenMP SSE2 implementation:\n"); DoPI(PICalcG::calcSSE2, "piSSE2.txt", digits); #endif #ifdef _MSC_VER getchar(); #endif return 0; }