/**
   \file convergence.cpp

   This is the file that will contain the top-level functions used for
   judging convergence of a Markov Chain of graph
   realizations. Convergence will be judged by calculating the
   auto-correlation of all the edges in the chain of graphs. The
   particular convergence metric used, and the reference for the
   metric is found in the comments on top of the particular function
   implementing that convergence metric.

   Jaideep Ray
   7/26/2011.
*/

#include <algorithm>
#include <cassert>
#include <cstddef>
#include <fstream>
#include <iostream>
#include <vector>
#include "convergence.h"

extern "C"
{
#include "mcconv.h"  
}

/*
  These are functions that are helper functions for the functions
  mentioned in convergence.h. These functions should never be used by
  the caller and are reserved for use by functions in
  convergence.cpp.

  These will never be mentioned in convergence.h since they are private.
*/

/*
   Edge-membership query: reports whether the graph gr holds an edge
   that leaves fromNode and arrives at toNode.

   The node ids supplied by the caller start at 1, whereas gr->ptr and
   gr->ind are indexed from 0, so both ids are shifted down by one
   before use. gr->ptr[k] and gr->ptr[k+1] bracket the slots of
   gr->ind that list the (zero-based) end points of node k's edges.

   No range checking is performed on fromNode or toNode.
*/
bool doesItExist_( struct graph *gr, int fromNode, int toNode )
{
  // Zero-based equivalents of the caller's 1-based node ids
  const int source = fromNode - 1 ;
  const int target = toNode - 1 ;

  // Half-open slot range [slotBegin, slotEnd) of gr->ind owned by source
  const int slotEnd   = gr->ptr[source + 1] ;
  const int slotBegin = gr->ptr[source] ;

  // Walk source's edges; stop at the first one that lands on target
  for (int slot = slotBegin; slot < slotEnd; ++slot)
  {
    const int endPoint = gr->ind[slot] ;
    if (endPoint == target) return (true) ;
  }

  // Ran out of edges without reaching target
  return (false) ;
} // End of function

/**
   This function takes in a graphDB, containing a time-series of
   numGraph graphs, stored in Ali's compressed format and extracts a
   time-history of a specified edge.

   Graphs are addressed by a zero-based index and BOTH ends of the
   requested range are inclusive, i.e. graphs startGraph, startGraph+1,
   ..., endGraph are interrogated (this is how checkConvergence() in
   this file calls it: 0 ... numGraphs-1).

   @param db          reference to a graph Database
   @param fromNode    An edge is specified as emanating from a fromNode ...
   @param toNode      ... and terminating at a toNode (both numbered from 1)
   @param startGraph  We don't have to create a time-history of edge
                      [fromNode, toNode] starting from the first
                      graph. We can start at an arbitrary point in the
                      graph Markov chain. startGraph >= 0
   @param endGraph    Index of the last graph of the Markov chain to be
                      interrogated (inclusive).
                      startGraph <= endGraph <= numGraphs - 1.
   @param edgeTS      Receives one entry per interrogated graph: 1 if
                      the edge [fromNode, toNode] exists in that graph,
                      0 otherwise. Entries are APPENDED; anything
                      already in the vector is left in place, so pass
                      an empty vector to get just the time-history.
                      On an error return the vector is not modified.
   @return            0 if all OK, -ve number if something screws up
                      -1 : endGraph is not a valid graph index
                           (endGraph >= number of graphs in DB)
                      -2 : startGraph > endGraph
                      -3 : startGraph < 0
*/
int extractEdgeTimeHistory( graphDB &db, int fromNode, int toNode, int startGraph,
                            int endGraph, std::vector<int> &edgeTS )
{
  // endGraph is an inclusive, zero-based index, so the largest legal
  // value is numGraphs-1. (Using '>' here would let endGraph ==
  // numGraphs through and ask the DB for a graph one past the end.)
  if ( endGraph >= db.getNumGraphsInDB() ) return (-1) ;

  // startGraph has to be smaller than / equal to endGraph.
  if ( startGraph > endGraph ) return (-2) ;

  // A negative starting index can never address a graph.
  if ( startGraph < 0 ) return (-3) ;

  // Make room for the new entries on top of whatever is already
  // stored; reserve() takes a total capacity, not an increment.
  const std::vector<int>::size_type numNew =
    static_cast<std::vector<int>::size_type>( endGraph - startGraph + 1 ) ;
  edgeTS.reserve( edgeTS.size() + numNew ) ;

  // Loop over the requested graphs in the DB and record edge presence
  for (int i = startGraph; i <= endGraph ; ++i)
  {
    const bool present = doesItExist_( db.extractGraph( i ), fromNode, toNode ) ;
    edgeTS.push_back( present ? 1 : 0 ) ;
  }

  return (0) ;
}
/* =================== END OF PRIVATE FUNCTIONS =============================*/

/**
   Given a sequence of graphs stored in a DB, check out each of the
   edges in the graph.  Identify Kthin, the thinning factor that turns
   the binary time-series into a first-order Markov chain. Then, find
   the thinning factor, Kmind, that turns the time-series into
   independent draws from a binary distribution. Since (Kthin, Kmind)
   will be different for each edge in the MC of graphs, take the
   worst-case scenario i.e., maximum values of each, over all edge
   time-series.

   Every node pair (fromNode, toNode) with 1 <= fromNode <= toNode <=
   numNodes is examined once; pairs whose edge never appears in any
   graph of the chain are skipped.

   @param db        the graph database with graphs in it
   @param numNodes  number of nodes in the graph
   @param numEdges  number of edges in the graph (accepted for interface
                    compatibility; not consulted by this function)
   @param r         I want a chain long enough that my edge means will be
                    estimated within +/- this tolerance. Usually, 0.01
   @param s         The chain will be long enough that the edge means will
                    be estimated within +/- r with s confidence.
                    Usually, s = 0.95
   @param maxNprec  (output) largest nprec reported by gibbmain over all
                    edges examined, i.e. the number of graphs we must have
                    so that we can estimate a 1st-order Markov chain or
                    independent draws correctly. 0 if no edge was examined.
   @param maxKthin  (output) largest kthin over all edges examined: the
                    factor by which we have to thin the chain to get an
                    uncorrelated chain. 0 if no edge was examined.
   @param maxKmind  (output) largest kmind over all edges examined: the
                    factor by which we have to thin to get an independent
                    chain. 0 if no edge was examined.
   @param unique_edge_instance (output) in the chain of graphs in the DB,
                    how many distinct edges appeared at least once?
   @return true if maxNprec <= number of graphs in the DB (the chain is
                long enough for the kthin / kmind estimates to be
                trusted), false otherwise. Note that a chain in which
                no edge ever appears (including an empty DB) yields true.
*/
bool checkConvergence(graphDB &db, int numNodes, int numEdges, double r, double s,
                      int &maxNprec, int &maxKthin, int &maxKmind, int &unique_edge_instance)
{
  (void) numEdges ; // part of the published interface, but not needed here

  /*
    Set up the data structures to call gibbmain. The work arrays live
    in std::vectors so that they are released on every exit path,
    including an exception thrown while building an edge time-series.
  */
  const int chainLen = db.getNumGraphsInDB() ; // length of edge time-series
  const std::size_t bufLen = (chainLen > 0) ? static_cast<std::size_t>(chainLen) : 0 ;

  std::vector<double> dwrk( bufLen ) ;      // double work array for gibbmain
  std::vector<int>    iwrk( 2 * bufLen ) ;  // int work array for gibbmain
  std::vector<double> chain( bufLen ) ;     // edge time-series, as doubles
  const double epsilon = 1.0e-3 ;

  int nmin  = 0 ; /* number of independent iterates needed */
  int kthin = 0 ; /* thinning factor that'll give me a 1st order Markov chain */
  int kmind = 0 ; /* thinning factor to get me an independence chain */
  int nburn = 0 ; /* burnin time */
  int nprec = 0 ; /* iterates needed to achieve desired accuracy */
  int r15   = 0 ; /* extra gibbmain output; not used here */

  /* Initialize these to zero */
  maxNprec = 0;
  maxKthin = 0;
  maxKmind = 0;
  unique_edge_instance = 0 ;

  /*
    Loop over all pairs of nodes and extract the time-history of each
    of those edges. Check whether that particular edge ever
    existed. Nodes are numbered from 1 not zero.
  */
  std::vector<int> edgesTS ; // reused for every edge to avoid re-allocating
  for (int fromNode = 1; fromNode <= numNodes; fromNode++)
    for ( int toNode = fromNode ; toNode <= numNodes; toNode++)
    {
      /* Start a new edge time-series (the extractor appends) */
      edgesTS.clear() ;

      /* A non-zero status means no history could be built (e.g. the
         DB is empty); there is nothing to analyse for this edge. */
      const int status =
        extractEdgeTimeHistory( db, fromNode, toNode, 0, chainLen-1, edgesTS ) ;
      if ( status != 0 ) continue ;
      assert( edgesTS.size() == bufLen ) ;

      /* If the edge never existed, pointless to do anything */
      const long ones = static_cast<long>( std::count(edgesTS.begin(), edgesTS.end(), 1) ) ;
      if ( ones == 0 ) continue ;

      // I found at least one '1' in the edge time-series
#ifdef JR_VERBOSE
      std::cout << " Doing edge [" << fromNode << ", " << toNode << "] " ;
      std::cout << " ... found " << ones << " instances " << std::endl ;
#endif
      unique_edge_instance++ ;

      //#define JR_OUT
#ifdef JR_OUT
      // Write out a time-series for use in ex01
      if (unique_edge_instance == 50) // an arbitrary edge
      {
        std::ofstream fout("timeSeries.dat") ;
        fout << edgesTS.size() << std::endl ; // Write size first
        for (std::size_t i = 0; i < edgesTS.size(); i++)
          fout << edgesTS[i] << std::endl ;
        fout.close();
      }
#endif
      /*
         Check this edge time-series for the thinning ratios that
         will make it a 1st-order Markov (kthin) and independent
         draws from a binary distribution (kmind). We do this by
         thinning the Markov chain by 'k', fitting a first-order
         Markov and independent models [Bishop, Y.M., Fienberg,
         S.E., Holland, P.W.: Discrete multivariate analysis: Theory
         and practice. Springer-Verlag, New York, NY (2007)]

         However, the biggest fear is that we may not have a long
         enough time-series to calculate kmind and kthin. First,
         make sure that we have a long enough chain. We use the
         time-series, thin it down to the point that it resembles a
         first-order Markov process and ensure that the time-series
         mean, from the raw time-series (Zbar) and from the thinned
         version (q) lie within a tolerance r i.e q-r < Zbar < q+r
         with confidence s (95%). The length of the original
         time-series, which when thinned by kthin, will satisfy the
         above condition, is nprec.

         Make sure that the length of the graph chain in the
         database is greater than the worst case nprec. If not, we
         don't have a long enough Markov chain.
      */
      std::copy( edgesTS.begin(), edgesTS.end(), chain.begin() ) ;

      /* ones != 0 guarantees chainLen > 0, so the buffers are non-empty
         and taking the address of element 0 is valid.
         NOTE(review): the third argument (-1) is passed exactly as
         before; its meaning is defined by gibbmain in mcconv.h. */
      gibbmain(&chain[0], chainLen, -1, r, s, epsilon, &dwrk[0], &iwrk[0],
               &nmin, &kthin, &nburn, &nprec, &kmind, &r15) ;

      /* Track the worst performing edge */
      maxNprec = std::max(maxNprec, nprec) ;
      maxKthin = std::max(maxKthin, kthin) ;
      maxKmind = std::max(maxKmind, kmind) ;
    } // end of loop over edges

  std::cout << " checkConvergence() Max values of [nprec, kthin, kmind] = [" 
            << maxNprec << ", " << maxKthin << ", " << maxKmind << "] " << std::endl ;
  std::cout << " checkConvergence() Chain length = " <<  db.getNumGraphsInDB() << std::endl ;
  std::cout << " checkConvergence() No. of unique_edge_instances = " 
            << unique_edge_instance << std::endl ;

  /* 
     If the chain is not long enough that i can estimate kmind and
     kthin properly, the chain is too short for any use.
  */
  const bool longEnough = !(maxNprec > chainLen) ;
  return (longEnough) ;
}

