/*
Single pass code by Prodip Hore; FUZZ-IEEE, 2007;
g++ singlePass.cpp memAllocation.cpp -O3 -o singlePass
Usage: singlePass iris.bin(binary form) percent(how much % data to be loaded each time) 
Example: singlePass iris.bin 10
Please Note: The algorithm assumes data is reasonably scrambled. If not, scramble it by using the
binScramblefloat.cpp program. This program will take any data set (float binary format) and output
a scrambled data set.  
*/



#include <stdio.h>
#include <math.h>
#include <string.h>
#include <stdlib.h>
#include <ctype.h>
#include <ctime>
#include <sys/resource.h>
#include <sys/time.h>

#include "memheader.h"


/** change here **/
/*
Compile-time configuration of the experiment. Edit the values between
"change here" and "end" to match the data set being clustered.
*/
typedef float DATA_TYPE;	//element type of the binary input file read by load_files; for MRI generally short int 

#define INIT_CENTER 0		//0: init() picks random examples as starting centroids; otherwise init() reads them from initialized.dat
#define MAXDATA 150		//total number of examples main() processes per experiment
#define S 4			//dimension (number of features per example)
#define CLUSTER 3		//number of clusters
#define EXP 32			//number of experiments (repetitions of the whole clustering run in main)

/*
Terminating criteria; keep peps and seps the same.
peps is used while the first chunk is clustered and seps for every later
chunk; main() copies the active one into eps, which spfcm() compares with
the centroid change. Two variables are kept so that a future modification
could make the stopping criterion change with time; in this program they
should be the same.
m is the exponent applied to the memberships (pow (U, m)) and appears as
2/(m-1) in the membership update.
*/

float peps = 0.001, seps = 0.001, eps, m = 1.2; //eps thresholds bound the change of the centroids between iterations

/***end**/

#define DATA_BYTE sizeof(DATA_TYPE)
DATA_TYPE dbuff[S];		//staging buffer holding a single record read from the file
float **X;			//data of the chunk currently in memory, indexed X[feature][example]
float **U;			//membership (U) matrix of the current chunk, indexed U[cluster][example]

//V: current centroids; V1: centroids freshly computed by update_centroids; LV: positions of the weighted points (copied from V by updateWeights)
float V[CLUSTER][S],V1[CLUSTER][S], LV[CLUSTER][S];
//W: weights of the c weighted points; LU[i][k]: membership of weighted point k in cluster i; LW: scratch used by updateWeights to build the new weights
float W[CLUSTER], LU[CLUSTER][CLUSTER], LW[CLUSTER];

/* Forward declarations of the functions defined below in this file. */
void load_files (FILE * ptr, int no, int dim);	//read `no` records of `dim` values into X
void updateU (int no, int dim);			//recompute memberships U (and LU after the first chunk)
double ED (int k, float v2[]);			//distance between example k of X and vector v2
double NORM (float v1[], float v2[]);		//distance between two S-dimensional vectors
double update_centroids (int no, int dim);	//recompute V; returns the change of the centroids
void init (int no, int dim);			//initial centroids, weights and LU
void get_random_example (float vec[], int no);	//copy a randomly chosen example of X into vec
void spfcm (int no, int dim);			//cluster the chunk currently in memory
void compute_JM (int no, int dim, FILE * gdp, float per);	//quality value computed by re-reading the data
void writeV ();					//append the final centroids to a file
void updateWeights (int no, int dim);		//fold the finished chunk into the weighted points
double timing_of (struct rusage start, struct rusage stop);	//CPU seconds between two rusage samples
void initU (int no);				//zero the U matrix


int sp, loop;			//sp: 0 while the first chunk is processed, 1 afterwards (set in main); loop: index of the current experiment
float percent;			//percent of data to be loaded in each pass
char fname[180], name[180];	//fname: buffer main uses for the info-file name; name: scratch buffer for the other output file names
FILE *info;			//debugging/information log opened by main

/*
Program entry point.

Usage: singlePass data.bin percent

argv[1] is the binary data file (MAXDATA records of S values of DATA_TYPE),
argv[2] is the percentage of the data loaded per pass. The whole clustering
run is repeated EXP times; in every run the file is read chunk by chunk,
each chunk is clustered together with the weighted points that summarise
the previous chunks, and afterwards the centroids and the J_m value are
appended to the result files.

Returns EXIT_FAILURE on bad arguments or when the data/info file cannot be
opened, 0 otherwise.
*/
int
main (int argc, char *argv[])
{
  int i;
  FILE *gdp, *note;
  int dim = S, tot, size, parts;
  char *endp;
  struct rusage start_time, end_time;	//to calculate time

  if (argc < 3)
    {
      fprintf (stderr, "Usage: %s data.bin percent\n",
	       argc > 0 ? argv[0] : "singlePass");
      return EXIT_FAILURE;
    }

  //percent of data to be loaded; a non-positive value would give a chunk
  //size of 0 and a division by zero when the number of chunks is computed
  percent = (float) strtod (argv[2], &endp);
  if (endp == argv[2] || !(percent > 0))
    {
      fprintf (stderr, "percent must be a positive number, got \"%s\"\n",
	       argv[2]);
      return EXIT_FAILURE;
    }

  srand (100);			//you may assign random seed here
  srand48 (200);		//random seed here

  //file for storing debugging information
  snprintf (fname, sizeof fname, "%dinfo.dat", (int) percent);
  info = fopen (fname, "w");
  if (info == NULL)
    {
      perror (fname);
      return EXIT_FAILURE;
    }
  fprintf (info, "peps seps %f %f", peps, seps);

  for (loop = 0; loop < EXP; loop++)
    {				//total number of chunk and its size computed here
      size = (int) ceil ((percent / 100.00) * MAXDATA);
      parts = (int) ceil ((float) MAXDATA / (float) size);
      printf ("\nLoop %d Broken data size %d parts %d\n", loop + 1, size,
	      parts);

      getrusage (RUSAGE_SELF, &start_time);
      gdp = fopen (argv[1], "rb");	//data file opened
      if (gdp == NULL)
	{
	  perror (argv[1]);
	  fclose (info);
	  return EXIT_FAILURE;
	}
      tot = 0;			//to find how much data loaded
      sp = 0;			//to detect first pass
      eps = peps;
      for (i = 0; i < parts; i++)
	{
	  if (i == 1)		//one may change stopping criteria with time;but kept same here
	    {
	      sp = 1;
	      eps = seps;
	    }
	  tot = tot + size;
	  //checking how much data left and getting correct chunk size for the last data access
	  if (tot > MAXDATA)
	    {
	      tot = tot - size;
	      size = MAXDATA - tot;
	      tot = tot + size;
	    }
	  printf ("\nsize=%d\n", size);

	  allocate2Dfloat (&X, dim, size);	//allocate memory to load data
	  allocate2Dfloat (&U, CLUSTER, size);	//allocate U matrix
	  initU (size);		//initialize U matrix
	  printf ("\nMemory Allocation Done..\n");

	  load_files (gdp, size, dim);	//fills the global X
	  if (i == 0)
	    {
	      init (size, dim);	//initialize for the first pass only
	    }
	  fprintf (info, "\ndata chunk=%d\n", i + 1);
	  spfcm (size, dim);	//cluster data
	  updateWeights (size, dim);	//update centroid weights

	  free2Dfloat (&X, dim, size);	//free memory
	  free2Dfloat (&U, CLUSTER, size);

	  printf ("\nClustered %d examples of %d: Chunk= %d\n", tot, MAXDATA,
		  i + 1);
	}
      getrusage (RUSAGE_SELF, &end_time);

      //the timing file is only a report; a failure to open it must not
      //abort the experiment
      snprintf (name, sizeof name, "%dtime.dat", (int) percent);
      note = fopen (name, "a");
      if (note == NULL)
	perror (name);
      else
	{
	  fprintf (note, "%f\n", timing_of (start_time, end_time));
	  fclose (note);
	}

      //write Centroid matrix and compute Jm value
      writeV ();
      rewind (gdp);		// rewind data file pointer before computing Jm value
      compute_JM (MAXDATA, dim, gdp, percent);

      fclose (gdp);
    }
  fclose (info);
  return 0;
}

/*
Folds the chunk that has just been clustered into the weighted points.

For every cluster the new weight is the sum of
  - W[q] * LU[c][q] over the existing weighted points q, and
  - U[c][n] over the `no` singleton examples of the current chunk.
Afterwards the weighted points are moved onto the current centroids
(LV = V) and their membership matrix LU is reset to the identity for the
next chunk. `dim` is not used.
*/
void
updateWeights (int no, int dim)
{
  int c, q, n, d;

  if (0)			//debugging: show LU before it is consumed
    {
      for (c = 0; c < CLUSTER; c++)
	{
	  for (q = 0; q < CLUSTER; q++)
	    printf ("%f ", LU[c][q]);
	  printf ("\n");
	}
    }

  //build all new weights in LW first; W must stay untouched until every
  //cluster has been summed because each sum reads all of W
  for (c = 0; c < CLUSTER; c++)
    {
      float mass = 0;

      for (q = 0; q < CLUSTER; q++)	//contribution of the weighted points
	mass = mass + W[q] * LU[c][q];
      for (n = 0; n < no; n++)	//contribution of the singleton points
	mass = mass + U[c][n];

      LW[c] = mass;
    }

  for (c = 0; c < CLUSTER; c++)
    {
      W[c] = LW[c];		//publish the new weight

      for (d = 0; d < S; d++)	//weighted point now sits on the centroid
	LV[c][d] = V[c][d];

      for (q = 0; q < CLUSTER; q++)	//LU back to the identity matrix
	LU[c][q] = (c == q) ? 1 : 0;
    }

  if (0)			//debugging: show the weights and their total
    {
      double total = 0;

      for (c = 0; c < CLUSTER; c++)
	printf ("\nW=%f\n", W[c]);

      for (c = 0; c < CLUSTER; c++)
	total = total + W[c];
      printf ("\ntot weights=%f\n", total);
    }
}

//Clears the first `no` columns of every row of the U matrix, so that no
//garbage values are summed up in the first iteration.
void
initU (int no)
{
  int cluster, example;

  for (cluster = 0; cluster < CLUSTER; cluster++)
    {
      for (example = 0; example < no; example++)
	{
	  U[cluster][example] = 0;
	}
    }
}

//Appends the final centroids V to "<percent>V.dat", one centroid per line
//with its S coordinates separated by blanks. If the file cannot be opened
//the problem is reported on stderr and nothing is written.
void
writeV ()
{
  int i, j;
  FILE *ptr;

  snprintf (name, sizeof name, "%dV.dat", (int) percent);
  ptr = fopen (name, "a");
  if (ptr == NULL)
    {
      perror (name);		//writing through a NULL stream would crash
      return;
    }

  for (i = 0; i < CLUSTER; i++)
    {
      for (j = 0; j < S; j++)
	fprintf (ptr, "%f ", V[i][j]);
      fprintf (ptr, "\n");
    }
  fclose (ptr);
}


//Clusters the chunk currently held in X: memberships are computed once
//for the starting centroids, then centroids and memberships are updated
//alternately until the centroid change returned by update_centroids is no
//larger than eps. The number of iterations is written to the info file.
void
spfcm (int no, int dim)
{
  int rounds;
  double shift = 100;		//start value for the centroid change

  printf ("\nepsilon=%f m=%f\n", eps, m);

  updateU (no, dim);
  for (rounds = 0; shift > eps;)
    {
      shift = update_centroids (no, dim);
      updateU (no, dim);
      rounds++;

      printf ("\niteration =%d %.15f\n", rounds, shift);
    }

  fprintf (info, "\niteration required %d\n", rounds);
}

//This fucntion loads partial data 

void
load_files (FILE * ptr, int no, int dim)
{
  int i, j, size;

  printf ("\nno %d dim %d\n", no, dim);


  for (i = 0; i < no; i++)
    {
      fread (dbuff, DATA_BYTE, dim, ptr);

      for (j = 0; j < dim; j++)
	X[j][i] = dbuff[j];	//data assigned to X matrix
    }

  if (0)			//for debugging
    {
      for (i = 0; i < no; i++)
	{
	  for (j = 0; j < dim; j++)
	    printf ("%f ", X[j][i]);
	  printf ("\n");
	}
    }
  printf ("\nLoaded %d examples into memory\n", no);
}



//This fucntion updates centroids in each iteration of clustering

double
update_centroids (int no, int dim)
{
  int i, k, x;
  double numerator[S], denominator;
  double U_ikm;
  double error=0; 
  /* For each cluster */
  for (i = 0; i < CLUSTER; i++)
    {

      /* Zero out numerator and denominator options */
      denominator = 0;
      for (x = 0; x < S; x++)
	numerator[x] = 0;


      /* Calculate numerator and denominator together */
      for (k = 0; k < no; k++)
	{

	  U_ikm = pow (U[i][k], m);
	  denominator += U_ikm;


	  for (x = 0; x < S; x++)
	    numerator[x] += U_ikm * X[x][k];
	}

      if (sp)			//calculation for weighted points done here after the first pass
	{
	  for (k = 0; k < CLUSTER; k++)
	    {

	      U_ikm = pow (LU[i][k], m);
	      denominator += U_ikm * W[k];


	      for (x = 0; x < S; x++)
		numerator[x] += U_ikm * LV[k][x] * W[k];
	    }



	}

      /* Calculate V */

      for (x = 0; x < S; x++)
	{
 V1[i][x] = numerator[x] / denominator;

	}

    }				/* endfor: C clusters */

  if (0)
    {
      int j;

      for (i = 0; i < CLUSTER; i++)
	{
	  for (j = 0; j < dim; j++)
	    printf ("%f ", V1[i][j]);
	  printf ("\n");
	}

    }

for (i=0; i < CLUSTER; i++) 
 for (x=0; x < S; x++) 
  error=error+(V1[i][x]-V[i][x])*(V1[i][x]-V[i][x]);

error=error/float(CLUSTER*S);
error=sqrt(error);
for (i=0; i < CLUSTER; i++) 
 for (x=0; x < S; x++) 
V[i][x]=V1[i][x];

return error;




}



//U matrix update function

void 
updateU (int no, int dim)
{

  int i, j, k;
  double sum, D[CLUSTER];
  double sqdiff = 0;
  float U1;
  int pos;

  for (k = 0; k < no; k++)
    {

      pos = -1;
      for (i = 0; i < CLUSTER; i++)
	{
	  for (j = 0; j < S; j++)
	    if (X[j][k] != V[i][j])
	      break;

	  if (j == S)
	    pos = i;
	}
      if (pos != -1)
	{
	  for (i = 0; i < CLUSTER; i++)
	    if (i == pos)
	      U[i][k] = 1.0;
	    else
	      U[i][k] = 0.0;

	}
      else
	{
	  for (i = 0; i < CLUSTER; i++)
	    D[i] = ED (k, V[i]);	//kth example of X

	  for (i = 0; i < CLUSTER; i++)
	    {
	      sum = 0;

	      for (j = 0; j < CLUSTER; j++)
		{
		  if (i == j)
		    sum = sum + 1;
		  else
		    sum = sum + pow (D[i] / D[j], (2.0 / (m - 1)));

		}


	      U1 = 1.0 / sum;

	  

	      U[i][k] = U1;
	    }
	}
    }

  if (sp)			//weighted points updated here after the first pass
    {
      for (k = 0; k < CLUSTER; k++)
	{

	  pos = -1;
	  for (i = 0; i < CLUSTER; i++)
	    {
	      for (j = 0; j < S; j++)
		if (LV[k][j] != V[i][j])
		  break;

	      if (j == S)
		pos = i;
	    }
	  if (pos != -1)
	    {
	      for (i = 0; i < CLUSTER; i++)
		if (i == pos)
		  LU[i][k] = 1.0;
		else
		  LU[i][k] = 0.0;

	    }
	  else
	    {
	      for (i = 0; i < CLUSTER; i++)
		D[i] = NORM (LV[k], V[i]);

	      for (i = 0; i < CLUSTER; i++)
		{
		  sum = 0;

		  for (j = 0; j < CLUSTER; j++)
		    {
		      if (i == j)
			sum = sum + 1;
		      else
			sum = sum + pow (D[i] / D[j], (2.0 / (m - 1)));

		    }


		  U1 = 1.0 / sum;



		  LU[i][k] = U1;
		}
	    }

	}
      if (0)
	{
	  int j;

	  for (i = 0; i < CLUSTER; i++)
	    {
	      for (j = 0; j < CLUSTER; j++)
		printf ("%f ", LU[i][j]);
	      printf ("\n");
	    }
	}

    }



}


//distance metric: Euclidean distance between example `k` of the global
//data matrix X (column k) and the S-dimensional vector `v2`

double
ED (int k, float v2[])
{
  double acc = 0;
  int d;

  for (d = 0; d < S; d++)
    {
      const float gap = X[d][k] - v2[d];

      acc += gap * gap;
    }

  return sqrt (acc);
}

//distance metric: Euclidean distance between the two S-dimensional
//vectors `v1` and `v2`
double
NORM (float v1[], float v2[])
{
  double acc = 0;
  int d;

  for (d = 0; d < S; d++)
    {
      const float gap = v1[d] - v2[d];

      acc += gap * gap;
    }

  return sqrt (acc);
}


//Function for initialization of clustering; called for the first chunk.
//
//Starting centroids V are either random examples of the chunk in X
//(INIT_CENTER == 0, `no` = number of examples in X) or the block of
//CLUSTER rows belonging to the current experiment `loop` in the text file
//initialized.dat, which must hold EXP * CLUSTER rows of S values.
//The weights W are set to 0 and LU to the identity, and the chosen
//centroids are appended to "<percent>int.dat". `dim` is not used.
//
//A missing or too short initialized.dat leaves no usable centroids, so it
//is reported on stderr and the program terminates. Failing to open the
//output file is only reported; clustering continues.
void
init (int no, int dim)
{
  int i, j;
  FILE *ptr;

  if (INIT_CENTER == 0)		//get random example from the data set
    {
      for (i = 0; i < CLUSTER; i++)
	get_random_example (V[i], no);
    }
  else				//if initialized center already present 
    {
      float initbuffer[EXP * CLUSTER][S];
      int loc;

      loc = loop * CLUSTER;
      ptr = fopen ("initialized.dat", "r");
      if (ptr == NULL)
	{
	  fprintf (stderr, "\ninitialized.dat not found\n");
	  exit (EXIT_FAILURE);
	}
      for (i = 0; i < CLUSTER * EXP; i++)
	for (j = 0; j < S; j++)
	  if (fscanf (ptr, "%f", &initbuffer[i][j]) != 1)
	    {
	      fprintf (stderr,
		       "\ninitialized.dat: cannot read value %d of row %d\n",
		       j + 1, i + 1);
	      fclose (ptr);
	      exit (EXIT_FAILURE);
	    }
      fclose (ptr);

      for (i = 0; i < CLUSTER; i++)
	for (j = 0; j < S; j++)
	  V[i][j] = initbuffer[loc + i][j];
    }

//initialization of weighted points and its U matrix before algorithm starts

  for (i = 0; i < CLUSTER; i++)
    W[i] = 0;

  for (i = 0; i < CLUSTER; i++)
    for (j = 0; j < CLUSTER; j++)
      if (i == j)
	LU[i][j] = 1;
      else
	LU[i][j] = 0;


//initialized center saved
  snprintf (name, sizeof name, "%dint.dat", (int) percent);
  ptr = fopen (name, "a");
  if (ptr == NULL)
    {
      perror (name);		//only a record of the start centroids; keep going
      return;
    }
  for (i = 0; i < CLUSTER; i++)
    {
      for (j = 0; j < S; j++)
	fprintf (ptr, "%f ", V[i][j]);
      fprintf (ptr, "\n");
    }
  fclose (ptr);

}

//Copies one example of the chunk in X, chosen with lrand48 among the
//first `no` examples, into `vec` (S values) and prints its 1-based number.
void
get_random_example (float vec[], int no)
{
  const int pick = lrand48 () % no;
  int d;

  printf ("\nexample no %d selected\n", pick + 1);

  for (d = 0; d < S; d++)
    {
      vec[d] = X[d][pick];
    }
}

/*
Computes Jm by incrementally loading data. This is not a part of the
clustering algorithm, but is required to compute quality.

no   - total number of examples to read from gdp
dim  - number of values per example
gdp  - data file, positioned at the first example
per  - percentage of `no` loaded per chunk

For every example the distances d_j to the centroids V are combined as
(sum over j of d_j^(2/(1-m)))^(1-m); zero distances are skipped. The sum
over all examples is printed and appended to "<per>J2.dat".

The last chunk is clamped against `no`, the same count the chunk size was
derived from, so the function also works when `no` differs from MAXDATA.
A chunk size of 0 (non-positive `per` or `no`) is rejected because it
would cause a division by zero.
*/


void
compute_JM (int no, int dim, FILE * gdp, float per)
{
  int i, j, p;
  int size, parts, tot = 0;
  FILE *ptr;
  double temp, tempj, J2 = 0;

//number of chunks and its size computed to load data incrementally
  size = (int) ceil ((per / 100.00) * no);
  if (size <= 0)
    {
      fprintf (stderr, "\ncompute_JM: nothing to load (no=%d, per=%f)\n",
	       no, per);
      return;
    }
  parts = (int) ceil ((float) no / (float) size);

  snprintf (name, sizeof name, "%dJ2.dat", (int) per);
  ptr = fopen (name, "a");
  if (ptr == NULL)
    perror (name);		//the value is still computed and printed below

  printf ("\nComputing J_m Loop %d Broken data size %d parts %d\n", loop + 1,
	  size, parts);
  for (p = 0; p < parts; p++)
    {
      tot = tot + size;
      if (tot > no)		//last chunk: only what is left
	{
	  tot = tot - size;
	  size = no - tot;
	  tot = tot + size;
	}
      allocate2Dfloat (&X, dim, size);
      load_files (gdp, size, dim);

//compute jm (basically RM formula)
      for (i = 0; i < size; i++)
	{
	  tempj = 0;
	  for (j = 0; j < CLUSTER; j++)
	    {
	      temp = ED (i, V[j]);
	      if (temp)
		tempj = tempj + pow (temp, 2.0 / (1.0 - m));
	    }
	  if (tempj)
	    J2 = J2 + pow (tempj, 1.0 - m);
	}
//end compute jm

      free2Dfloat (&X, dim, size);
      printf ("\nparts =%d Computed J_m of %d examples\n", p + 1, tot);
    }

  printf ("\n The J2 value is %f \n", J2);
  if (ptr != NULL)
    {
      fprintf (ptr, "%f\n", J2);
      fclose (ptr);
    }

}



/* Seconds from `from` to `to`; a negative microsecond difference is
   resolved by borrowing one second. */
static double
timeval_span_seconds (struct timeval from, struct timeval to)
{
  long micros = to.tv_usec - from.tv_usec;

  if (micros < 0)
    {
      to.tv_sec--;
      micros = 1000000L + to.tv_usec - from.tv_usec;
    }

  return (to.tv_sec - from.tv_sec) + 0.000001 * micros;
}

/* Get time, in seconds: user CPU time plus system CPU time that passed
   between the two rusage samples `start` and `stop`. */
double
timing_of (struct rusage start, struct rusage stop)
{
  const double user_seconds =
    timeval_span_seconds (start.ru_utime, stop.ru_utime);
  const double system_seconds =
    timeval_span_seconds (start.ru_stime, stop.ru_stime);

  return user_seconds + system_seconds;
}
