/* HPCtest.cpp
Implements the Edinburgh HPC Masters test
(C) 2008 Niall Douglas
Created: 7th Mar 2008
*/

/* NOTE: All documentation is in doxygen format

This is an admission test, so I must admit to showing off:

Q1: Generically implemented via runtime and via C++ metaprogramming.
    You'll need GCC to run the metaprogramming as MSVC isn't up to it yet.

Q2: Generically implemented via runtime.

Q3: Uses binary search to find N(min), generically implemented via runtime
and via C++ metaprogramming.

I thought something more challenging might be in order, so I have
additionally implemented generic C++ and multithreaded SSE2 versions
of a Bailey-Borwein-Plouffe (BBP) implementation PI finder. Admittedly,
it doesn't come close to beating http://gmplib.org/pi-with-gmp.html, but
it is intended as a demonstration of my parallel programming abilities.
I am particularly proud of getting the modular exponentiation function
(pow16modSSE2) to calculate two values in parallel efficiently.

All questions reuse an identical C++ metaprogramming test framework
to demonstrate generic programming skills. A nanosecond timer is used
for timing and the program will run on POSIX-compatible and Windows.
The code was tested on 32 bit Windows XP, Linux and Apple Mac OS X using
GCC v4.1.3 and MSVC8.
*/



/* I use C-style printf() instead of C++ iostreams as the latter
can throw exceptions when you really don't want them to. While less
'pure', printf() tends to default as more mission-critical safe. */
#include <stdio.h>
#include <math.h>
#include <stdarg.h>
#include <assert.h>
#include <typeinfo>

//! The proper value of PI
static const double PI_VAL=3.1415926535897932384626433832795;
//! A metaprogramming compatible abs()
#define myabs(v) (((v)<0) ? -(v) : (v))

// For hires timing function
#ifdef WIN32
#define WIN32_LEAN_AND_MEAN
#include <windows.h>
#else
#include <sys/time.h>
#endif

#ifdef _MSC_VER
typedef unsigned __int64 ulonglong;
#else
typedef unsigned long long ulonglong;
#endif
//! Really accurate time function
static ulonglong GetNsCount()
{
#ifdef WIN32
   static LARGE_INTEGER ticksPerSec;
   static double scalefactor;
   // It's faster to use the FPU than a 128 bit MulDiv implementation
   // though this would change if SSE2 became more prevalent
   if(!scalefactor)
   {
      if(QueryPerformanceFrequency(&ticksPerSec))
         scalefactor=ticksPerSec.QuadPart/1000000000.0;
      else
         scalefactor=1;
   }
   LARGE_INTEGER val;
   if(!QueryPerformanceCounter(&val))
      return (ulonglong) GetTickCount() * 1000000;
   return (ulonglong) (val.QuadPart/scalefactor);
#else
   // It's only as good as POSIX can make it, which is pretty good on most
  // x86 based platforms which have a RTDSC
   struct timeval tv;
   if(-1==gettimeofday((::timeval *) &tv, 0)) return 0;
   return ((ulonglong) tv.tv_sec*1000000000LL)+tv.tv_usec*1000;
#endif
}


//! Return value
template<class value, int iter=0> struct retvalue
{
   typedef value VALUE;
   static const int ITER=iter;

   value v;        //!< The value
   value e;        //!< The error diff
   int iterations; //!< The iteration count
    retvalue (const value &_v, const value &_e, int _iterations=iter) :
          v(_v), e(_e), iterations(_iterations) { }
};



//! Contains the runtime PI calculation routines
namespace PICalcR {

   static int totaliterations;

   /*! Computes an approximation to PI by \em iter iterations.
   Uses a template parameter for \em iter to give compiler maximum
   possible scope for loop unrolling & optimisation */
   template<class value, int N> retvalue<value, N> q1() throw()
   {
      value v=0;
      for(int i=1; i<=N; i++, totaliterations++)
      {
         value t=(i-0.5)/N;
         v+=1/(1+t*t);
      }
      v*=(4.0/N);
      return retvalue<value, N>(v, myabs(v-PI_VAL));
   }

   /*! Computes an approximation to PI to \em prec accuracy.
   This is an extremely inefficient implementation.
   Complexity = ~O(n^2) */
   template<class value, int _prec> retvalue<value> q2() throw()
   {
      static const value prec=pow(10.0, _prec);
      retvalue<value> ret(0, prec+1);
      for(int n=1; ret.e>prec; n++, ret.iterations++)
      {
         ret.v=0;
         for(int i=1; i<=n; i++, totaliterations++)
         {
            value t=(i-0.5)/n;
            ret.v+=1/(1+t*t);
         }
         ret.v*=(4.0/n);
         ret.e=myabs(ret.v-PI_VAL);
      }
      return ret;
   }

   /*! Computes an approximation to PI to \em prec accuracy.
   Performing a regression on the error values yields y=0.083*pow(x, -2)
   That means x=pow(0.083, 1/2)/pow(y, 1/2) == sqrt(0.083)/sqrt(y)
   This is NOT exact and for higher iterations it becomes inaccurate.
   I deliberately chopped off the regression so it'll always come
   under the iterations required.
   Complexity = O(n)
   */
   template<class value, int _prec> retvalue<value> q3_1() throw()
   {
      static const value prec=pow(10.0, _prec);
      retvalue<value> ret(0, prec+1, (int)(0.2880972058177587/sqrt(prec))-1);
      for(int n=ret.iterations+1; ret.e>prec; n++, ret.iterations++)
      {
         ret.v=0;
         for(int i=1; i<=n; i++, totaliterations++)
         {
            value t=(i-0.5)/n;
            ret.v+=1/(1+t*t);
         }
         ret.v*=(4.0/n);
         ret.e=myabs(ret.v-PI_VAL);
      }
      return ret;
   }


   /*! Computes an approximation to PI to \em prec accuracy.
   I figured you'd probably think I was cheating last time
   so here's a heuristically optimising solution. It simply
   doubles or halves the iteration increment until it zeros
   in on the correct value a bit like a quick sort or binary
   search. I have no idea if that's optimal though (it's an
   impractical algorithm for calculating PI anyway).
   Complexity = O(log n)
   */
   template<class value, int _prec> retvalue<value> q3_2() throw()
   {
      bool growing=true;
      int inc=1;
      static const value prec=pow(10.0, _prec);
      retvalue<value> ret(0, prec+1);
      for(int n=ret.iterations=1; ; n+=inc)
      {
         ret.v=0;
         for(int i=1; i<=n; i++, totaliterations++)
         {
            value t=(i-0.5)/n;
            ret.v+=1/(1+t*t);
         }
         ret.v*=(4.0/n);
         ret.e=ret.v-PI_VAL;
         if(growing)
         {
            if(ret.e>prec)
            {
               inc<<=1;
               ret.iterations=n;
            }
            else
            {
               inc>>=1;
               n=ret.iterations;
               growing=false;
            }
         }
         else
         {
            if(ret.e>prec)
               ret.iterations=n;
            else
            {
               if(!(inc>>=1)) { ret.iterations=n; break; }
               n=ret.iterations;
            }
         }
      }
      return ret;
   }


} // namespace


//! Contains the static PI calculation routines
namespace PICalcS {

   namespace Impl {
      template<class value, int i, int N> struct calc : public calc<value, i-1, N>
      {
         private:
            static const value t=(i-0.5)/N;
         public:
            static const value v=calc<value, i-1, N>::v+1/(1+t*t);
      };
      template<class value, int N> struct calc<value, 0, N>
      {
         static const value v=0;
      };
   }

   /*! Computes an approximation to PI by \em iter iterations
   via C++ metaprogramming (computes by programming the compiler
   rather than using the compiler to program) */
   template<class value, int N> retvalue<value, N> q1() throw()
   {
      static const value v=Impl::calc<value, N, N>::v*(4.0/N);
      static const value e=myabs(v-PI_VAL);
      return retvalue<value, N>(v, e);
   }

} // namespace

// I put the PI finder into a separate file
#include "GenPI.cpp"

//! The generic test runner
template<class retvaluetype> static void RunTest(retvaluetype (*q)()) throw()
{
   PICalcR::totaliterations=0;
   ulonglong start=GetNsCount();
   retvaluetype ret=q();
   ulonglong end=GetNsCount();
   printf("%d iterations (precision %s) ran in %luns (totaliter=%d),\n     v=%.16f err=%.16f\n\n",
         ret.iterations, typeid(typename retvaluetype::VALUE).name(),
         (unsigned long)(end-start), PICalcR::totaliterations, ret.v, ret.e);
}

static void printf2(FILE *h, const char *fmt, ...) throw()
{
    va_list args;
    va_start(args, fmt);
    vfprintf(h, fmt, args);
    vfprintf(stdout, fmt, args);
}

static void DoPI(void (*q)(std::vector<unsigned char> &buffer, int digits), const char *filename, const int digits) throw()
{
   FILE *oh=fopen(filename, "wt");
   printf2(oh, "%d digits of pi in base 16 ...", digits);
   std::vector<unsigned char> buffer;
   ulonglong start=GetNsCount();
   q(buffer, digits);
   ulonglong end=GetNsCount();
   printf2(oh, " calculated in %f seconds\n", (end-start)/1000000000.0);
   fprintf(oh, "\npi ~= 3.\n");
   for(unsigned n=0; n<buffer.size();)
   {
       fprintf(oh, "%02x", buffer[n]);
       n++;
       if(!(n % 5)) fprintf(oh, " ");
       if(!(n % 25)) fprintf(oh, ": %u\n", n*2);
       if(!(n % 250)) fprintf(oh, "\n");
   }
   fprintf(oh, "\n");
   fclose(oh);
}

int main(int argc, char *argv[])
{   //! The precision required
   typedef double prec;

   printf("Niall's generic PI calculator\n"
         "-=-=-=-=-=-=-=-=-=-=-=-=-=-=-\n"
         "Full proper value of PI is\n     v=%.16lf\n", PI_VAL);

   printf("\nQuestion 1 (runtime):\n");
   RunTest<>(&PICalcR::q1<prec, 1>);
   RunTest<>(&PICalcR::q1<prec, 2>);
   RunTest<>(&PICalcR::q1<prec, 10>);
   RunTest<>(&PICalcR::q1<prec, 50>);
   RunTest<>(&PICalcR::q1<prec, 100>);
   RunTest<>(&PICalcR::q1<prec, 500>);

#ifndef _MSC_VER
   /* I'm afraid current versions of MSVC can't handle
   static const <non integer> metaprogramming yet */

   printf("\nQuestion 1 (metaprogram):\n");
   RunTest<>(&PICalcS::q1<prec, 1>);
   RunTest<>(&PICalcS::q1<prec, 2>);
   RunTest<>(&PICalcS::q1<prec, 10>);
   RunTest<>(&PICalcS::q1<prec, 50>);
   RunTest<>(&PICalcS::q1<prec, 100>);
   //RunTest<>(&PICalcS::q1<prec, 500>);        // Exceeds GCC's nested template limit
#endif

   printf("\nQuestion 2:\n");
   RunTest<>(&PICalcR::q2<prec, -6>);

   printf("\nQuestion 3 (cheating solution):\n");
   RunTest<>(&PICalcR::q3_1<prec, -6>);

   printf("\nQuestion 3 (heuristic solution):\n");
   RunTest<>(&PICalcR::q3_2<prec, -6>);

#if 0
   // Quick test to see that both match
   RunTest<>(&PICalcR::q2  <prec, -3>);
   RunTest<>(&PICalcR::q3_1<prec, -3>);
   RunTest<>(&PICalcR::q3_2<prec, -3>);
   RunTest<>(&PICalcR::q2  <prec, -5>);
   RunTest<>(&PICalcR::q3_1<prec, -5>);
   RunTest<>(&PICalcR::q3_2<prec, -5>);
   RunTest<>(&PICalcR::q2  <prec, -10>);
   RunTest<>(&PICalcR::q3_1<prec, -10>);
   RunTest<>(&PICalcR::q3_2<prec, -10>);
#endif

   static const int digits=50000;
   printf("\n\nPortable C++ implementation:\n");
   DoPI(PICalcG::calc,      "pi.txt",  digits);
#ifdef HAVE_SSE2
   printf("\n\nOpenMP SSE2 implementation:\n");
   DoPI(PICalcG::calcSSE2,  "piSSE2.txt",  digits);
#endif

#ifdef _MSC_VER
   getchar();
#endif
   return 0;
}


