// Lemur OLAP library (c) 2003 National Research Council of Canada by Daniel Lemire, and Owen Kaser
 /**
 *  This program is free software; you can
 *  redistribute it and/or modify it under the terms of the GNU General Public
 *  License as published by the Free Software Foundation (version 2). This
 *  program is distributed in the hope that it will be useful, but WITHOUT ANY
 *  WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
 *  FOR A PARTICULAR PURPOSE. See the GNU General Public License for more
 *  details. You should have received a copy of the GNU General Public License
 *  along with this program; if not, write to the Free Software Foundation,
 *  Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
 */
#ifndef CUBESTATISTICS_H
#define CUBESTATISTICS_H 

#include "normalizationscommon.h"

// Shorthand aliases that keep nested vector templates out of the signatures
// below: row_type is one row of floats, float_table_type is a table of rows
// (indexed as table[row][column]).
typedef vector<float> row_type;
typedef  vector<row_type > float_table_type;

template <class DT, class LDT>
class CubeStatistics {
  public:

    /*
     * This will compute the independence product and sum.
     *
     * If H_{i} are the histograms, then that reconstructed data
     * cube is given by
     *
     * (d/2 + 1) *  card * Product_{i=0,...,d-1} H_{i}/card
     *
     * where card is the number of allocated cells in the data cube.
     *
     * The result is an estimate of how far off the frequency sort algorithm
     * can be from the optimal solution. It is an upper bound on the
     * error, so frequency sort can do much better in some cases.
     *
     * The first return value is the bound on the lack of optimality of
     * frequency sort whereas the second one is the independence sum
     * which is a 0 to 1 value measuring independence (1 = independent,
     * 0 = dependent) of the attribute values.
     * 
     */
    static pair<float,float> independenceInfo(DataCube<DT,LDT>& DC) {
      // Returns (bound, independence sum) where
      //   second = sum over allocated cells of Product_dim histogram/card,
      //   first  = card * SparseCost(DC) * (1 - second).
      // Building the histograms walks the whole cube, so this next line
      // can be expensive.
      const vector< vector<uint> > histograms = attributeHistograms(DC);
      assert(histograms.size() > 0);
      // card = number of allocated (non-zero) cells. Every allocated cell is
      // counted exactly once in each histogram, so summing the first suffices.
      uint64 card = 0;
      for(uint value = 0; value < histograms[0].size(); ++value) card += histograms[0][value];
      const vector<int> shape = DC.getShape();
      const int d = shape.size();
      // Walk every cell of the cube, from the origin up to the shape.
      vector<int> Bounds(shape);
      vector<int> Start(Bounds.size(),0);
      vector<int> indices(Start);
      // Accumulate in double: with a float accumulator the rounding of each
      // histogram/card ratio can push the total slightly above 1 on dense
      // cubes, and (float) card loses exactness beyond 2^24 cells.
      const double cardinality = (double) card;
      double sum = 0.0;
      do {
        if( DC.get(indices) != 0 ) {
          double product = 1.0;
          for(int dim = 0; dim < d; ++dim) product *= histograms[dim][indices[dim]] / cardinality;
          sum += product;
        }
      } while(MathUtil::increment( indices, Start, Bounds));
      // The exact value lies in [0,1]; tolerate rounding noise instead of
      // aborting on it, then clamp so callers always see a value in [0,1].
      assert(sum >= 0.0);
      assert(sum <= 1.0 + 1e-6);
      if(sum > 1.0) sum = 1.0;
      const float independence = (float) sum;
      //			cout << " independence sum = " << independence << " (1.0 = independent, 0.0 = dependent) " <<endl;
      return pair<float,float>( card * (HOLAPUtil<DT,LDT>::SparseCost(DC) )  * (1.0f - independence), independence);
    }

    /*
     * 
     * This will compute the attribute-wise frequency histograms of
     * a data cube all at once. It is used to compute the independence product.
     *  
     */
    static vector< vector<uint> > attributeHistograms(DataCube<DT,LDT>& DC) {
      const vector<int> extents = DC.getShape();
      const int dimensionCount = extents.size();
      // One zero-filled counter array per dimension, sized to that dimension.
      vector< vector<uint> > counts(dimensionCount);
      for(int axis = 0; axis < dimensionCount ; ++axis) {
        counts[axis] = vector<uint>(extents[axis],0);
      }
      // Visit every cell between the origin and the cube's shape.
      vector<int> upper(extents);
      vector<int> lower(upper.size(),0);
      vector<int> cell(lower);
      do {
        // A non-zero cell contributes one count to its coordinate in
        // each dimension's histogram.
        if( DC.get(cell) != 0 ) {
          for(int axis = 0; axis < dimensionCount; ++axis) {
            vector<uint>& histogram = counts[axis];
            ++histogram[cell[axis]];
          }
        }
      } while(MathUtil::increment( cell, lower, upper));
      return counts;
    }

    
    // Compute the joint distribution of a data cube along
    // two variables.
    static float_table_type jointDistribution(DataCube<DT,LDT>& DC,
        const int dimension1, const int dimension2) {
      // Returns a shape[dimension1] x shape[dimension2] table whose entry
      // [value1][value2] is the fraction of non-zero cells having those two
      // coordinates. When the cube has no non-zero cell the table is all zeros.
      const vector<int> shape = DC.getShape();
      assert(dimension1 >= 0);
      assert(dimension2 >= 0);
      assert(dimension1 != dimension2);
      assert((uint) dimension1 < shape.size());
      assert((uint) dimension2 < shape.size());
      const int range1 = shape[dimension1];
      const int range2 = shape[dimension2];
      float_table_type answer (range1,	row_type(range2,0.0f));
      // The iteration vectors are allocated once and reset for every slice
      // instead of being rebuilt range1*range2 times.
      const vector<int> origin(shape.size(),0);
      vector<int> Bounds(shape);
      vector<int> Start(origin);
      vector<int> indices(origin);
      // Count with integers: incrementing a float stops changing its value
      // once it reaches 2^24, which would silently cap large counts.
      unsigned long long Total = 0;
      for(int value1 = 0 ; value1 < range1; ++value1) {
        for(int value2 = 0; value2 < range2; ++value2) {
          // Restrict the walk to the slice where the two chosen dimensions
          // are pinned to (value1, value2); all other dimensions span fully.
          Start = origin;
          Start[dimension1] = value1;
          Start[dimension2] = value2;
          Bounds = shape;
          Bounds[dimension1] = value1+1;
          Bounds[dimension2] = value2+1;
          indices = Start;
          unsigned long long cellCount = 0;
          do {
            if(DC.get(indices) != 0) ++cellCount;
          } while(MathUtil::increment( indices, Start, Bounds));
          answer[value1][value2] = (float) cellCount;
          Total += cellCount;
        }
      }
      // Nothing allocated: leave the all-zero table rather than divide by 0.
      if(Total == 0)
        return answer;
      const float normalizer = (float) Total;
      for(int value1 = 0 ; value1 < range1; ++value1)
        for(int value2 = 0; value2 < range2; ++value2)
          answer[value1][value2] /= normalizer;
      return answer;
    }
     
    // Get the separable joint distribution (the product of the two
    // marginal distributions) that should match as closely as possible
    // the given joint distribution.
    static float_table_type separableJointDistribution(
        const float_table_type& input) {
      // An empty table has no first row to measure; reading input[0] would
      // be undefined behaviour, so return an empty table right away.
      if(input.empty()) return float_table_type();
      const int range1 = input.size();
      const int range2 = input[0].size();
      // Bind the marginals by reference: they are only read here.
      const pair<vector<float> , vector<float> > dist = distributions(input);
      const vector<float>& freq1 = dist.first;
      const vector<float>& freq2 = dist.second;
      // Entry [value1][value2] is the product of the two marginal values.
      float_table_type answer (range1,row_type(range2,0.0f));
      for(int value1 = 0 ; value1 < range1; ++value1)
        for(int value2 = 0; value2 < range2; ++value2)
          answer[value1][value2] = freq1[value1] * freq2[value2];
      return answer;
    }
    
    // Computes the two marginals of a table: first[i] is the sum of row i,
    // second[j] is the sum of column j. The column count is taken from the
    // first row, so the table is expected to be rectangular.
    // An empty table yields two empty vectors.
    static pair<vector<float> , vector<float> > distributions(
      const float_table_type& input) {
      const int range1 = input.size();
      // Only look at the first row once we know there is one; reading
      // input[0] on an empty table is undefined behaviour.
      const int range2 = (range1 == 0) ? 0 : static_cast<int>(input[0].size());
      vector<float> freq1(range1,0.0f);
      vector<float> freq2(range2,0.0f) ;
      for(int value1 = 0 ; value1 < range1; ++value1) {
        const row_type& row = input[value1];
        for(int value2 = 0; value2 < range2; ++value2) {
          freq1[value1] += row[value2] ;
          freq2[value2] += row[value2] ;
        }
      }
      return pair<vector<float>, vector<float> > (freq1, freq2);
    }

    // Weighted mean of the positions 0..size-1, i.e. the sum over k of
    // k * distribution[k]. (Presumably the weights sum to one; this is
    // not checked here.)
    static float average (const vector<float>& distribution) {
      float weighted_total = 0.0f;
      uint position = 0;
      for(vector<float>::const_iterator weight = distribution.begin();
          weight != distribution.end(); ++weight, ++position) {
        weighted_total += position * (*weight);
      }
      return weighted_total;
    }

    // Weighted variance of the positions 0..size-1 around
    // average(distribution): the sum over k of
    // (k - average)^2 * distribution[k].
    static float variance(const vector<float>& distribution) {
      const float average_value = average(distribution);
      // Square by multiplication rather than pow(), and accumulate in double
      // so rounding does not build up term after term; the result is
      // narrowed to float once at the end.
      double accumulated = 0.0;
      for(uint k = 0; k < distribution.size(); ++k) {
        const double deviation = k - average_value;
        accumulated += deviation * deviation * distribution[k];
      }
      return (float) accumulated;
    }

    // Square root of variance(distribution).
    static float standardDeviation(const vector<float>& distribution) {
      const float spread = variance(distribution);
      return sqrt(spread);
    }

    // compute the correlation of two variables given the
    // joint distribution
    static float correlation(
        const float_table_type& input) {		 
      pair<vector<float> , vector<float> > dist = distributions(input);
      vector<float> freq1 = dist.first;
      vector<float> freq2 = dist.second; 
      const int range1 = freq1.size();
      const int range2 = freq2.size(); 
      const float average1 = average(freq1);
      const float stdev1 = standardDeviation(freq1);
      const float average2 = average(freq2);
      const float stdev2 = standardDeviation(freq2); 
      if((stdev1 == 0 ) || (stdev2 == 0)) return 0.0f;
      float covariance = 0.0f;
      for(int value1 = 0 ; value1 < range1; ++value1) 
       for(int value2 = 0; value2 < range2; ++value2) 
         covariance += (value1 - average1) * (value2 - average2) 
           * input[value1][value2];
      return covariance / (stdev1 * stdev2);
    }			
    
    // Adds up every entry of the table, row by row. The column count is
    // taken from the first row, so the table is expected to be rectangular.
    // An empty table sums to 0.
    static float sum(const float_table_type& input) {
      float total = 0.0;
      // Reading input[0] on an empty table is undefined behaviour.
      if(input.empty()) return total;
      const int range1 = input.size();
      const int range2 = input[0].size();
      for(int value1 = 0 ; value1 < range1; ++value1)
        for(int value2 = 0; value2 < range2; ++value2)
          total += input[value1][value2];
      return total;
    }

    // Adds up the entries of one row, front to back.
    static float sum(const row_type & input) {
      float total = 0.0;
      for(row_type::const_iterator entry = input.begin(); entry != input.end(); ++entry) {
        total += *entry;
      }
      return total;
    }

    // Returns f1 when f1 > f2, otherwise f2.
    static inline float max(float f1, float f2) {
      return (f1 > f2) ? f1 : f2;
    }
    // Returns f1 when f1 < f2, otherwise f2.
    static inline float min(float f1, float f2) {
      return (f1 < f2) ? f1 : f2;
    }
 
    
    // Largest entry of the table, or 0 when the table has no rows or its
    // first row has no columns. The column count is taken from the first
    // row, so the table is expected to be rectangular.
    static float max(const float_table_type& input)  {
      // Test for emptiness before touching input[0]; measuring the first
      // row of an empty table is undefined behaviour.
      if( input.empty() || input[0].empty() ) return 0.0f;
      const int range1 = input.size();
      const int range2 = input[0].size();
      float current_max = input[0][0];
      for(int value1 = 0 ; value1 < range1; ++value1) {
        for(int value2 = 0; value2 < range2; ++value2) {
          // Same rule as max(float,float): keep the candidate only when it
          // is strictly greater.
          const float candidate = input[value1][value2];
          if(candidate > current_max) current_max = candidate;
        }
      }
      return current_max ;
    }
    // Smallest entry of the table, or 0 when the table has no rows or its
    // first row has no columns. The column count is taken from the first
    // row, so the table is expected to be rectangular.
    static float min(const float_table_type& input)  {
      // Test for emptiness before touching input[0]; measuring the first
      // row of an empty table is undefined behaviour.
      if( input.empty() || input[0].empty() ) return 0.0f;
      const int range1 = input.size();
      const int range2 = input[0].size();
      float current_min = input[0][0];
      for(int value1 = 0 ; value1 < range1; ++value1) {
        for(int value2 = 0; value2 < range2; ++value2) {
          // Same rule as min(float,float): keep the candidate only when it
          // is strictly smaller.
          const float candidate = input[value1][value2];
          if(candidate < current_min) current_min = candidate;
        }
      }
      return current_min ;
    }
};

#endif
