#include <stdio.h>
#include <stdlib.h>
#include "../FW/labImage.h"

#ifndef _TSC
#define _TSC
  /* Time-stamp counter sample, split into two unsigned words. */
  typedef struct {
    unsigned int high;   /* upper word (getTSC stores EDX here) */
    unsigned int low;    /* lower word (getTSC stores EAX here) */
  } TSC;
#endif

#if !defined(_getTSC) && !defined(getTSC)
/*
 * getTSC(t): read the CPU time-stamp counter into t.high / t.low.
 *
 * t must be an lvalue with unsigned int members `high` and `low`.
 * cpuid is executed first so that earlier instructions have retired
 * before rdtsc samples the counter.
 *
 * All four registers written by cpuid/rdtsc are declared as asm operands,
 * so the compiler saves whatever it needs itself; no manual push/pop
 * (which would shift esp-relative memory operands) is required.  The
 * "memory" clobber keeps the compiler from moving loads/stores of the
 * measured code across the time stamp.
 *
 * x86 / GNU C only.
 */
#define getTSC(t)                                                          \
  do {                                                                     \
    unsigned int getTSC_eax_ = 0, getTSC_ebx_, getTSC_ecx_, getTSC_edx_;   \
    __asm__ volatile("cpuid \n\t"                                          \
                     "rdtsc \n\t"                                          \
                     : "+a" (getTSC_eax_), "=b" (getTSC_ebx_),             \
                       "=c" (getTSC_ecx_), "=d" (getTSC_edx_)              \
                     :                                                     \
                     : "memory");                                          \
    (void) getTSC_ebx_;                                                    \
    (void) getTSC_ecx_;                                                    \
    (t).high = getTSC_edx_;                                                \
    (t).low  = getTSC_eax_;                                                \
  } while (0)
#endif

//  ImgAbsDiff: D = | S1 - S2 |
//
//  Writes the byte-wise absolute difference of Src1 and Src2 into Dest for
//  `length` bytes.  Whole 8-byte groups are processed with MMX on x86; the
//  remaining (length % 8) bytes -- and the whole buffer on other targets --
//  are processed by a plain C loop.  A length <= 0 writes nothing.
//
//  Dest may be the same buffer as Src1 or Src2 (in-place operation).
//  Always returns 1.
int ImgAbsDiff(unsigned char *Src1,
               unsigned char *Src2,
               unsigned char *Dest, int length)
{
    int done = 0;                        // number of bytes already written

    if (length <= 0)
        return 1;

#if defined(__GNUC__) && (defined(__i386__) || defined(__x86_64__))
    if (length >= 8)
      {
        // Working copies: the asm advances them, so they are in/out operands.
        unsigned char *s1 = Src1;
        unsigned char *s2 = Src2;
        unsigned char *d  = Dest;
        int blocks = length >> 3;        // MMX handles 8 bytes per iteration

        __asm__ volatile
          (
            ".p2align 4              \n\t"   // 16 byte alignment of the loop entry
            "1:                      \n\t"   // numeric local label: safe if inlined
            "movq    (%0), %%mm1     \n\t"   // load 8 bytes from Src1 into mm1
            "movq    (%1), %%mm2     \n\t"   // load 8 bytes from Src2 into mm2
            "psubusb (%1), %%mm1     \n\t"   // mm1=Src1-Src2 (unsigned saturation)
            "psubusb (%0), %%mm2     \n\t"   // mm2=Src2-Src1 (unsigned saturation)
            "por     %%mm2, %%mm1    \n\t"   // one side saturated to 0, so OR = |S1-S2|
            "movq    %%mm1, (%2)     \n\t"   // store result in Dest
            "add     $8, %0          \n\t"   // advance Src1, Src2 and Dest
            "add     $8, %1          \n\t"
            "add     $8, %2          \n\t"
            "dec     %3              \n\t"   // decrease loop counter
            "jnz     1b              \n\t"   // loop while blocks remain
            "emms                    \n\t"   // exit MMX state
            : "+r" (s1), "+r" (s2), "+r" (d), "+r" (blocks)
            :
            // MMX registers alias the x87 stack and emms empties it, so both
            // register sets are declared clobbered along with memory and flags.
            : "memory", "cc", "mm1", "mm2",
              "st", "st(1)", "st(2)", "st(3)",
              "st(4)", "st(5)", "st(6)", "st(7)"
          );

        done = length & ~7;
      }
#endif

    // Tail bytes that do not fill an 8-byte group (or everything, without MMX).
    for (; done < length; done++)
      {
        unsigned char a = Src1[done];
        unsigned char b = Src2[done];
        Dest[done] = (unsigned char) (a > b ? a - b : b - a);
      }

    return 1;
}


//Measurements
// CPS: divisor TSCdiff_to_sec() uses to turn counter ticks into seconds;
// PrintResults() prints it as MHz (CPS/1000000).
// NOTE(review): presumably the clock rate of the measured machine; the
// value is hard-coded to 400 MHz -- confirm for the target CPU.
#define CPS 400000000
// GETTSC_OVERHEAD: tick count TSCdiff() subtracts from every difference
// (0 means no correction is applied).
#define GETTSC_OVERHEAD 0x00

// TSCdiff: elapsed ticks between two samples, t2 - t1, reduced by
// GETTSC_OVERHEAD.
//
// The two words of each sample are joined into one 64-bit value so the
// borrow from the low into the high word is handled by ordinary unsigned
// arithmetic (the word-wise version lost one tick on every borrow and
// mis-handled equal low words).  If the difference is smaller than the
// overhead the result is clamped to 0 rather than wrapping around.
//
// Assumes high/low each hold a 32-bit quantity, as rdtsc produces.
TSC TSCdiff(TSC t1, TSC t2)
{
  const unsigned long long overhead = GETTSC_OVERHEAD;
  unsigned long long start = ((unsigned long long) t1.high << 32) | t1.low;
  unsigned long long stop  = ((unsigned long long) t2.high << 32) | t2.low;
  unsigned long long ticks = stop - start;   // wraps correctly modulo 2^64
  TSC ret_val;

  ticks = (ticks > overhead) ? ticks - overhead : 0;

  ret_val.high = (unsigned int) (ticks >> 32);
  ret_val.low  = (unsigned int) (ticks & 0xffffffffULL);

  return ret_val;
}

// TSCdiff_to_sec: convert a tick difference to seconds at CPS ticks/second.
// The high word is weighted by (float)0xffffffff/CPS, the low word by 1/CPS.
float TSCdiff_to_sec(TSC diff)
{
  return diff.high * ((float) 0xffffffff / CPS)
       + (float) diff.low / CPS;
}

// PrintResults: print the assumed CPU clock, the raw tick counts and the
// elapsed seconds of the MMX and the normal run, and the ratio normal/MMX.
//
// mmx, nor: tick differences of the two runs (e.g. from TSCdiff()).
// The factor is printed unguarded; a zero MMX time yields inf/nan.
void PrintResults(TSC mmx, TSC nor)
{
  float mmxt = TSCdiff_to_sec(mmx);
  float nort = TSCdiff_to_sec(nor);

  printf("CPU:    %i MHz\n", CPS/1000000);
  // TSC words are unsigned int: %u, not %i, or values above INT_MAX
  // would be shown as negative numbers.
  printf("MMX:    %u %u time: %f sec\n", mmx.high, mmx.low, mmxt);
  printf("Normal: %u %u time: %f sec\n", nor.high, nor.low, nort);
  printf("Factor: %f\n\n", nort/mmxt);
}


// detect_mmx: query CPUID for MMX support.
//
// Returns 0x00800000 (the MMX feature bit, EDX bit 23 of CPUID leaf 1)
// when MMX is available and 0 otherwise, so the result can be used
// directly as a truth value.  On non-x86 targets it returns 0.
//
// NOTE(review): assumes the CPUID instruction itself exists; that is not
// checked here.
int detect_mmx( void )

{
#if defined(__GNUC__) && (defined(__i386__) || defined(__x86_64__))
  unsigned int eax = 0x00000001;       // request feature flags (leaf 1)
  unsigned int ebx, ecx, edx;

  // cpuid overwrites all four registers; listing them as operands lets the
  // compiler preserve whatever it keeps there.
  __asm__( "cpuid"
           : "+a" (eax), "=b" (ebx), "=c" (ecx), "=d" (edx)
         );
  (void) ebx;
  (void) ecx;

  return (int) (edx & 0x00800000u);    // check MMX bit (bit 23)
#else
  return 0;
#endif
}






