// Lemur OLAP library (c) 2003 National Research Council of Canada by Daniel Lemire, and Owen Kaser
 /**
 *  This program is free software; you can
 *  redistribute it and/or modify it under the terms of the GNU General Public
 *  License as published by the Free Software Foundation (version 2). This
 *  program is distributed in the hope that it will be useful, but WITHOUT ANY
 *  WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
 *  FOR A PARTICULAR PURPOSE. See the GNU General Public License for more
 *  details. You should have received a copy of the GNU General Public License
 *  along with this program; if not, write to the Free Software Foundation,
 *  Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
 */
#ifndef NORMALIZATION_H
#define NORMALIZATION_H

#include "normalizationscommon.h"

#include "normalutil.h"
#include "holaputil.h"
#include "graphs.h"


/*
 *
 * We have Normalization as the base class.
 *
 * Key: _DT = DataType, LongDataType = _LDT and ComparatorType = _CT 
 */


//
// Most naive normalization, serves as base class
//

// Forward declaration: operator* and pow() below name ComposedNormalization
// as their return type before the class template itself is defined.
template <class _DT, class _LDT>
class ComposedNormalization;


// This is the IDENTITY normalization
template <class _DT, class _LDT>
class Normalization {
public:
    Normalization() {}

    // Polymorphic copy.  Both clone() and getTextName() are meant to be
    // overridden by every derived normalization.
    virtual Normalization* clone() const {
        return new Normalization(*this);
    }

    // Virtual so that derived objects may be destroyed through a base pointer
    // (see Stroustrup 3rd ed, 15.6.2).
    virtual ~Normalization() {}

    // Human-readable label for this normalization.
    virtual const string getTextName(void) const {
        return "default normalization";
    }

    // Derived classes override this.  The base version does no work at all:
    // it hands back the normalization it was given, untouched.
    virtual norm_type computeNormalFrom(DataCube<_DT,_LDT>& DC,
            const norm_type& initial) {
        return initial;
    }

    // Convenience entry point: builds the identity normalization for the
    // cube's shape and feeds it to computeNormalFrom().
    virtual norm_type computeNormal(DataCube<_DT,_LDT>& DC) {
        vector<int> cubeShape = DC.getShape();
        norm_type startingPoint = PermutationUtil::identity(cubeShape);
        return computeNormalFrom(DC, startingPoint);
    }
};

//   Allow functional composition of normalizers.  If f and g are Normalizations,
//   then f * g is the Normalization that first runs g, then runs f on the result
//
//   No need to have this inside a class. DL
//   ...but conceptually, isn't it something that this object can do? -OK
//
// Functional composition: the result first applies n2, then applies n1 to
// n2's output.  Both operands are cloned by ComposedNormalization's
// constructor, so the result does not refer to the arguments afterwards.
template <class _DT, class _LDT>
ComposedNormalization<_DT,_LDT> operator*(const Normalization<_DT,_LDT>& n1,
        const Normalization<_DT,_LDT>& n2) {
    ComposedNormalization<_DT,_LDT> composite(n1, n2);
    return composite;
}

// Repeated self-composition of a normalization.
//   power == 0 : identity composed with identity
//   power == 1 : identity composed with n1
//   power >= 2 : n1 composed with itself `power` times
template <class _DT, class _LDT>
ComposedNormalization<_DT,_LDT> pow( const Normalization<_DT,_LDT> & n1, const uint power) {
    typedef ComposedNormalization<_DT,_LDT> Composite;

    if (power < 2) {
        Normalization<_DT,_LDT> identityNorm;
        if (power == 0)
            return Composite(identityNorm, identityNorm);
        return Composite(identityNorm, n1);
    }

    // Two applications to start with; each further pass appends one more.
    Composite result(n1, n1);
    uint applied = 2;
    while (applied < power) {
        result = result * n1;
        ++applied;
    }
    return result;
}

 
// Composition of two normalizations: computeNormalFrom() runs nn2 first and
// then feeds its result to nn1.
//
// The object owns private clones of both operands (held through base-class
// pointers so the dynamic types are preserved), which keeps them alive for as
// long as the composition exists.  Copying is therefore deep.
template <class _DT, class _LDT>
class ComposedNormalization: public Normalization<_DT,_LDT> {
public:
    // Clones n1 and n2; the caller's objects are not referenced afterwards.
    ComposedNormalization(const Normalization<_DT,_LDT> &n1, const Normalization<_DT,_LDT> &n2)
        : Normalization<_DT,_LDT>(), nn1(n1.clone()), nn2(n2.clone()) {}

    // Deep copy: a shallow copy would lead to double deletion.
    ComposedNormalization(const ComposedNormalization &cn)
        : Normalization<_DT,_LDT>(cn), nn1(cn.nn1->clone()), nn2(cn.nn2->clone()) {}

    // Deep-copy assignment.  Returns a reference (returning by value, as this
    // used to, made every assignment build and destroy an extra deep-cloned
    // temporary).  The replacement clones are created BEFORE the current ones
    // are released, so *this is left untouched if clone() throws.
    ComposedNormalization& operator=(const ComposedNormalization<_DT,_LDT> & CN) {
        if (&CN != this) {
            ComposedNormalization replacement(CN);   // may throw; *this still intact
            Normalization<_DT,_LDT> *old1 = nn1;
            Normalization<_DT,_LDT> *old2 = nn2;
            nn1 = replacement.nn1;
            nn2 = replacement.nn2;
            // hand the old clones to the temporary; its destructor frees them
            replacement.nn1 = old1;
            replacement.nn2 = old2;
        }
        return *this;
    }

    virtual ComposedNormalization *clone() const { return new ComposedNormalization(*this);}
    virtual ~ComposedNormalization() { delete nn1; delete nn2;}

    // Describes the composition in execution order (nn2 first, then nn1).
    virtual const string getTextName() const {
        return "first (" + nn2->getTextName() + ") then (" + nn1->getTextName() + ")";
    }

    // Applies nn2 to `initial`, then nn1 to nn2's result.
    virtual norm_type computeNormalFrom( DataCube<_DT,_LDT>& DC, const norm_type & initial){
        return nn1->computeNormalFrom( DC, nn2->computeNormalFrom(DC, initial));
    }
private:
    // A composition without operands is meaningless; kept private and fatal.
    // Pointers are nulled so they are never left indeterminate.
    ComposedNormalization() : Normalization<_DT,_LDT>(), nn1(0), nn2(0) {
        cerr << "composedNorm nullary constructor does not make sense"; exit(1);
    }
    Normalization<_DT,_LDT> *nn1, *nn2;   // owned clones
};



/*********************************************************************/
/*  just ask an external source of data (at construction time)
 * and thereafter return the normalization read at construction
 * Does a few sanity checks on the data and the given datacube
 */

// Normalization that is read from an external file rather than computed.
// The file is opened lazily, on the first call to computeNormalFrom(); the
// permutations read are cached in theNormalization and returned on every call.
template <class _DT, class _LDT>
class NormalizationReader : public Normalization<_DT,_LDT> {
public:
    // fileName: path of the file holding one permutation per line.
    // Takes `const char*` so that string literals (including the default
    // argument) bind legally; binding a literal to `char*` is ill-formed
    // since C++11.  Callers passing a `char*` continue to work.
    // A null pointer is replaced by a placeholder name.
    NormalizationReader(const char *fileName = "") :
        haveRead(false),
        fName(fileName ? fileName : "** no filename for NormalizationReader**"),
        theNormalization()
    {}
    virtual NormalizationReader* clone() const { return new NormalizationReader(*this);}
    virtual const string getTextName(void) const { return "External normalization read";}
    virtual ~NormalizationReader() {}
    // Reads the file on first use, checks it against DC's shape, and returns
    // the stored normalization.  `initial` is not used.
    virtual norm_type computeNormalFrom( DataCube<_DT,_LDT>& DC,
            const norm_type & initial);
private:
    // Parses permutations (one per line) from `is` into theNormalization.
    void readNorm(istream& is);
    bool haveRead;               // true once the file has been parsed
    string fName;                // file to read from
    norm_type theNormalization;  // cached result of the read
};


// Parses one permutation per input line (whitespace-separated integers) and
// appends each to theNormalization.  Reading stops at end of stream or at the
// first empty line.  A line that is not a valid permutation is fatal.
template <class _DT, class _LDT>
void NormalizationReader<_DT,_LDT>::
readNorm( istream& is) {
    for (string row; getline(is, row) && !row.empty(); ) {
        stringstream tokens(row);
        vector<int> perm;

        // collect every integer that can be extracted from this line
        for (int entry; tokens >> entry; )
            perm.push_back(entry);

        // sanity check before accepting the line
        if (!PermutationUtil::isPermutation(perm)) {
            cerr << " problem in external normalization: not a permutation ";
            exit(1);
        }
        theNormalization.push_back(perm);
    }
}



// Returns the normalization stored in the external file (parsed on the first
// call only).  `initial` is ignored.
//
// Checks performed against DC's shape:
//   - the number of dimensions must match, otherwise the program exits;
//   - a dimension with MORE values in the file than in the cube is fatal;
//   - a dimension with FEWER values is padded so that the extra values map to
//     themselves, with warnings on cerr.  The padding is kept in the cache.
template <class _DT, class _LDT>
norm_type  NormalizationReader<_DT,_LDT>::
computeNormalFrom(DataCube<_DT,_LDT>& DC, const norm_type &  initial) {

    // lazy, one-time load
    if (!haveRead) {
        ifstream source(fName.c_str());
        if (!source) {
            cerr << "opening " << fName <<" failed. Aborting" << endl;
            exit(1);
        }
        readNorm(source);
        haveRead = true;
    }

    vector<int> shape = DC.getShape();
    if (shape.size() != theNormalization.size()) {
        cerr << "normalization read is for " << theNormalization.size()
             << " dimensions. Need "  << shape.size() << endl;
        exit(1);
    }

    // The new rounding scheme can introduce extra "padding" attribute values;
    // those are normalized to themselves, with a warning.
    for (uint k = 0; k < shape.size(); ++k) {
        vector<int> &perm = theNormalization[k];
        if ((uint) shape[k] == perm.size())
            continue;                       // sizes agree, nothing to do

        cerr << "[Warning] normalization read has " << perm.size()
             << " values for dimension " << k << ".  Need " << shape[k] << endl;

        if (!((uint) shape[k] > perm.size()))
            exit(1);                        // file has too many values

        cerr << "[Warning] ...assuming due to dimension-size round-up. Continuing." << endl;
        for (int j = int(perm.size()); j < int(shape[k]); ++j)
            perm.push_back(j);
    }

    // nothing seems wrong
    return theNormalization;
}

/*****************************************************************/

/* This is, well, randomized... multiple calls to computeNormalFrom
 * should be expected to return different results...
 */


// Normalization drawn at random for the cube's shape.  Successive calls to
// computeNormalFrom() should be expected to give different answers.
template <class _DT, class _LDT>
class RandomNormalization : public Normalization<_DT,_LDT> {
public:
    RandomNormalization() {}

    virtual RandomNormalization* clone() const {
        return new RandomNormalization(*this);
    }

    virtual const string getTextName(void) const {
        return "Random";
    }

    virtual ~RandomNormalization() {}

    // `initial` is ignored; only the shape of DC is consulted.
    virtual norm_type computeNormalFrom(DataCube<_DT,_LDT>& DC,
            const norm_type & initial) {
        return PermutationUtil::randomPermutation(DC.getShape());
    }
};



/***********************************************************/


// Orders the values of every dimension by how many non-zero cells their slice
// contains.  The ordering itself is decided by the comparator type _CT, which
// compares (frequency, value-index) pairs.
template <class _DT, class _LDT, class _CT = LessComparator<pair<_LDT,int> > >
class FrequencySort : public Normalization<_DT,_LDT> {
public:
    FrequencySort() : mComparator() {}

    virtual FrequencySort* clone() const {
        return new FrequencySort(*this);
    }

    virtual ~FrequencySort() {}

    virtual const string getTextName(void) const {
        return "Frequency Sorting ("+ _CT::getDescription() +")";
    }

    // Builds one permutation per dimension; `initial` is ignored.
    virtual norm_type computeNormalFrom(DataCube<_DT,_LDT>& DC,
        const norm_type & initial);

    // (frequency, value-index) pairs for `dimension`, sorted with mComparator.
    virtual deque<pair<_LDT,int> > sortedFrequencyHistogram(DataCube<_DT,_LDT>& DC, const uint dimension);

protected:
    // Per-value count of non-zero cells along `dimension` (the costly step).
    virtual vector<_LDT> frequencyHistogram(DataCube<_DT,_LDT>& DC, const uint dimension);

    _CT mComparator;

    // Hook deciding whether the cell at `idx` counts toward the histogram of
    // dimension `DimToIgnore`.  The default accepts every cell; the original
    // author observed no noticeable slowdown from this trivial virtual call
    // under gcc.
    virtual bool includeInHistogram( const vector<int>& idx,  const uint DimToIgnore) {
        return true;
    }
};

//
// Implementations follow...
///////////////////////////////

// Pairs every value of `dimension` with its frequency and returns the pairs
// sorted by mComparator.  Each pair is (frequency, value-index).
template <class _DT, class _LDT, class _CT>
deque<pair<_LDT,int> > FrequencySort<_DT,_LDT,_CT>::sortedFrequencyHistogram(DataCube<_DT,_LDT>& DC,
    const uint dimension) {
    typedef pair<_LDT,int> FreqAndValue;

    // the histogram is the expensive part; the rest is cheap
    const vector<_LDT> counts = frequencyHistogram(DC, dimension);

    deque<FreqAndValue> tagged;
    for (uint valueIdx = 0; valueIdx < counts.size(); ++valueIdx)
        tagged.push_back(FreqAndValue(counts[valueIdx], valueIdx));

    sort(tagged.begin(), tagged.end(), mComparator);
    return tagged;
}


// For each value v of `dimension`, counts the cells of the slice
// {index[dimension] == v} that are non-zero AND accepted by
// includeInHistogram().  This walks the whole cube once, so it can be
// relatively expensive.
template <class _DT, class _LDT,  class _CT>
vector<_LDT> FrequencySort<_DT,_LDT,_CT>::
frequencyHistogram(DataCube<_DT,_LDT>& DC, const uint dimension) {
    const vector<int> shape = DC.getShape();
    const int range = shape[dimension];
    vector<_LDT> answer(range,0);

    int value = 0;
    while (value < range) {
        // Box covering exactly the slice for `value`: full extent everywhere
        // except along `dimension`, where it is [value, value+1).
        vector<int> lowCorner(shape.size(),0);
        vector<int> highCorner(shape);
        lowCorner[dimension] = value;
        highCorner[dimension] = value+1;

        vector<int> cursor(lowCorner);
        do {
            if (DC.get(cursor) != 0 && includeInHistogram(cursor, dimension))
                ++answer[value];
        } while (MathUtil::increment( cursor, lowCorner, highCorner));

        ++value;
    }
    return answer;
}


// Builds, for every dimension, the permutation listing the dimension's values
// in the order given by sortedFrequencyHistogram().  `initial` is ignored.
template <class _DT, class _LDT,  class _CT>
norm_type FrequencySort<_DT,_LDT,_CT>::
computeNormalFrom( DataCube<_DT,_LDT>& DC,const norm_type & initial) {
    const vector<int> shape = DC.getShape();
    norm_type answer;

    for (uint dim = 0; dim < shape.size(); ++dim) {
        const deque<pair<_LDT,int> > ranked = sortedFrequencyHistogram(DC,dim);

        // position in sorted order -> original value index
        vector<int> normalization(shape[dim],0);
        for (uint slot = 0; slot < ranked.size(); ++slot)
            normalization[slot] = ranked[slot].second;

        answer.push_back(normalization);
    }
    return answer;
}

/******************************************************************/
//
// This is OFK's most effective one-chunk normalization.  The implementation
//   could be much faster...
//
//   This is now the "hacked by Daniel version", still slow.
//
//   I tried lowering the number of iterations and it did not worsen things!!!
//

// Iterated variant of FrequencySort: frequencies are counted only over the
// slices currently marked dense in mIsSliceDense, and that marking is updated
// after every pass (see computeNormalFrom).
template <class _DT, class _LDT, class _CT = LessComparator<pair<_LDT,int> > >
class GreedyIterSort : public FrequencySort<_DT,_LDT,_CT> {
public:
    GreedyIterSort() : FrequencySort<_DT,_LDT,_CT>(), mIsSliceDense() {}

    virtual GreedyIterSort* clone() const {
        return new GreedyIterSort(*this);
    }

    virtual ~GreedyIterSort() {}

    virtual const string getTextName(void) const {
        return "Greedy Iteration ("+ _CT::getDescription() + ")";
    }

    virtual norm_type computeNormalFrom(DataCube<_DT,_LDT>& DC,const norm_type & initial);

protected:
    // upper bound on the number of refinement passes
    enum {MAX_ITERS = 20};

    // Accepts a cell only if, in every dimension other than DimToIgnore, its
    // coordinate lies in a slice currently marked dense.
    virtual bool includeInHistogram( const vector<int>& idx, const uint DimToIgnore);

    // mIsSliceDense[dim][value]: is the slice for `value` of `dim` kept?
    vector<vector<bool> > mIsSliceDense;
};

// A cell counts toward the histogram of DimToIgnore only when each of its
// other coordinates falls in a slice currently flagged dense.
template <class _DT, class _LDT,  class _CT>
bool GreedyIterSort<_DT,_LDT,_CT>::
includeInHistogram( const vector<int>& idx, const uint DimToIgnore) {
    assert(idx.size() == mIsSliceDense.size());
    uint dim = 0;
    while (dim < idx.size()) {
        const bool mustCheck = (dim != DimToIgnore);
        if (mustCheck && !mIsSliceDense[dim][idx[dim]])
            return false;   // one sparse coordinate is enough to reject
        ++dim;
    }
    return true;   // include in the computation of histogram
}

// Greedy iterated frequency sort.  `initial` is ignored.
//
// All slices start out marked dense.  Each pass, for every dimension:
//   1. count non-zero cells per value, restricted (via includeInHistogram) to
//      cells whose other coordinates lie in dense slices;
//   2. sort the (frequency, value) pairs with the class comparator to get the
//      dimension's permutation;
//   3. re-mark a value's slice dense iff its frequency exceeds
//      (volume of the dense sub-cube slice) / HOLAPUtil::SparseCost(DC);
//      the top-ranked value is always kept so the dense volume never hits 0.
// Passes repeat until no dense/sparse flag changes or MAX_ITERS is reached;
// in the latter case a warning is printed and the last answer is returned.
//
// Members inherited from the dependent base FrequencySort<_DT,_LDT,_CT>
// (frequencyHistogram, mComparator) are reached through `this->`: unqualified
// names are not looked up in dependent base classes, so conforming compilers
// reject the unqualified form.
template <class _DT, class _LDT,  class _CT>
norm_type GreedyIterSort<_DT,_LDT,_CT>::
computeNormalFrom( DataCube<_DT,_LDT>& DC, const norm_type &  initial) {
    int iterations = 0;
    bool converged = true;
    vector<int> shape = DC.getShape();

    // initially, all values are considered dense enough
    mIsSliceDense = vector<vector<bool> >(shape.size());
    for (uint dim = 0; dim < shape.size(); ++dim )
        mIsSliceDense[dim] = vector<bool>(shape[dim],true);
    assert(mIsSliceDense.size() == shape.size());

    norm_type answer;   // wasteful recalc of answer is least of our worries.
    do {
        converged = true;
        answer = norm_type();
        for(uint dim = 0; dim < shape.size(); ++dim ) {
            // expensive part follows
            vector<_LDT> freq = this->frequencyHistogram(DC, dim);

            // everything else should be cheap
            deque<pair<_LDT,int> > freqindexpairs;
            for(uint index = 0; index < freq.size(); ++index)
                freqindexpairs.push_back(pair<_LDT,int>(freq[index], index));
            sort(freqindexpairs.begin(), freqindexpairs.end(), this->mComparator);

            // volume of a (d-1)-dimensional slice of the dense sub-cube,
            // i.e. the product of dense-value counts over all other dims
            double subCubeSliceVol = 1.0;
            for (uint OtherDimension = 0; OtherDimension < (uint) shape.size(); ++OtherDimension) {
                if(OtherDimension == dim) continue;
                int numDenseValsThisDim = 0;
                for (int value = 0; value < shape[OtherDimension]; ++value)
                    if (mIsSliceDense[OtherDimension][value]) numDenseValsThisDim++;
                subCubeSliceVol *= numDenseValsThisDim;
            }

            // any slice with > threshold occupied cells is okay.
            // no advantage taken of the fact they're sorted by occupied cells
            double threshold = subCubeSliceVol / (double)HOLAPUtil<_DT,_LDT>::SparseCost(DC);

            // Always keep the best value (avoids a 0-volume dense region).
            // An empty dimension has no best value; -1 matches no index and
            // avoids dereferencing max_element's end() iterator.
            LessComparator<pair<_LDT,int> > lss;
            int bestv = -1;
            if (!freqindexpairs.empty())
                bestv = max_element(freqindexpairs.begin(), freqindexpairs.end(), lss)->second;

            // keep the values whose slices are dense enough
            for (int value = 0; value < shape[dim]; ++value) {
                bool shouldKeep = (freq[value] > threshold || value == bestv) ;
                if (mIsSliceDense[dim][value] != shouldKeep) {
                    converged = false;   // a flag flipped: need another pass
                    mIsSliceDense[dim][value] = shouldKeep;
                }
            }

            // position in sorted order -> original value index
            vector<int> normalization(shape[dim],0);
            int index = 0;
            for(typename deque<pair<_LDT,int> >::iterator i = freqindexpairs.begin(); i != freqindexpairs.end(); ++i, ++index) {
                normalization[index] = i->second;
            }
            answer.push_back(normalization);
        }
        ++iterations;
    }
    while (!converged && iterations < MAX_ITERS );

    if (!converged) {
        cout << endl << "[Warning] Failed to converge after " << iterations << " iterations" << endl;
        cout << "[Warning] I gave up. Attempting to continue with imperfect solution. " << endl;
    }

    return answer;
}



/********************************************************************************/

// This is Reversed OFK's most effective one-chunk normalization hacked by Daniel.
// Basically, here we sort over the sparse regions instead of sorting over
// the dense regions.
// Here, the dense regions are "erased". One would think that sorting over
// the sparse regions only would be a waste of time... In practice we observed
// that ReversedGreedyIterSort performs equally well.
// I think this is because most of the gain is done in the first few iterations.
/*
template <class _DT, class _LDT, class _CT>
class ReversedGreedyIterSort : public GreedyIterSort<_DT,_LDT,_CT> {
public:
    virtual ~ReversedGreedyIterSort() {}
    virtual ReversedGreedyIterSort* clone() const { return new ReversedGreedyIterSort(*this);}
    virtual const string getTextName(void) const { return "Reversed Greedy Iteration ("+ _CT::getDescription() +")";}
protected:
    virtual bool includeInHistogram( const vector<int>& idx,  const uint DimToIgnore);
};

template <class _DT, class _LDT,  class _CT>
bool ReversedGreedyIterSort<_DT,_LDT,_CT>::
includeInHistogram( const vector<int>& idx, const uint DimToIgnore) {
    assert(idx.size() == mIsSliceDense.size());
    for(uint dim = 0; dim < idx.size() ;++ dim) {
        if(dim == DimToIgnore) continue;
        if(!mIsSliceDense[dim][idx[dim]]) return true;
    }
    return false;// include in the computation of histogram
    // to return true, we must have that it is not dense!!!
}


*/

/*******************************************************************/

/*
 * A lazy version of FrequencySort. Only sort one dimension (the largest).
 * I go with the assumption that it will be nearly as good.
 */

#if 0
// NOTE: this code sits inside an `#if 0` region and is not compiled.
//
// Lazy variant of FrequencySort: only the largest dimension is frequency
// sorted; every other dimension keeps the permutation given in `initial`.
template <class _DT, class _LDT, class _CT>
class LazyFrequencySort : public FrequencySort<_DT,_LDT,_CT> {
public:
    virtual ~LazyFrequencySort() {}
    virtual const string getTextName(void) const { return "Lazy Frequency Sort ("+ _CT::getDescription() +")";}
    virtual LazyFrequencySort* clone() const{ return new LazyFrequencySort(*this);}
    // `initial` is taken by const reference so that this overrides the base
    // class virtual instead of hiding it with a by-value overload.
    virtual norm_type computeNormalFrom(DataCube<_DT,_LDT>& DC, const norm_type & initial);
};


// Starts from a copy of `initial` and replaces only the entry of the largest
// dimension (first one wins on ties) with its frequency-sorted permutation.
// `initial` must therefore already hold one permutation per dimension.
template <class _DT, class _LDT,  class _CT>
norm_type LazyFrequencySort<_DT,_LDT,_CT>::
computeNormalFrom( DataCube<_DT,_LDT>& DC, const norm_type & initial) {
    norm_type answer(initial);  //Normalization<_DT,_LDT>::computeNormal(DC);
    vector<int> shape = DC.getShape();
    if (shape.empty()) return answer;   // no dimension to sort

    uint largest = 0;
    for(uint dim = 1; dim < shape.size(); ++dim ) if(shape[largest] < shape[dim]) largest = dim;

    // Histogram + sort, shared with the base class (expensive part).  Reached
    // through `this->` because the base class is a dependent type.
    const deque<pair<_LDT,int> > freqindexpairs = this->sortedFrequencyHistogram(DC, largest);

    // Sized up front: writing through operator[] into an empty vector, as the
    // previous version did, is undefined behaviour.
    vector<int> normalization(freqindexpairs.size(), 0);
    int index = 0;
    for(typename deque<pair<_LDT,int> >::const_iterator i = freqindexpairs.begin(); i != freqindexpairs.end(); ++i, ++index) {
        normalization[index] = i->second;
    }
    answer[largest] = normalization;
    return answer;
}

#endif

/**************************************************************************/

/* The ostracisation algorithm.  As originally envisioned, it performs poorly.
 * The reason is pretty obvious: suppose you have a whole bunch of
 * cities with a mix of punks and skinheads.  From the '80s, we know they
 * don't get along at all...  In every city, punks are in the majority.  
 * So the ostracised folks are all skinheads.  Reassigning skinheads between
 * cities does not really help; we really should have a few skinhead-only
 * cities.  But the political system does not account for such wholesale
 * reorganization (apartheid revolution? :) )
 *
 * The target chunk shapes are needed.  Logically, this is an attribute of
 * the DataCube and should be stored/retrieved from there.   Yet I
 * want to use RAMCube for speed, and this is not inherently cubed.
 *  And they need not be regular or even consistent in
 * a dimension. 
 */

// The ostracise/assign normalization.  Works on a target chunk shape, which is
// stored at construction time (Ms) or passed explicitly to the three-argument
// computeNormalFrom().
template <class _DT, class _LDT>
class OstraciseAssign : public Normalization<_DT,_LDT> {
public:
    OstraciseAssign(vector<int> chunkShape) : Ms(chunkShape) {}

    // Only prints a complaint; the object is left with an empty chunk shape.
    OstraciseAssign() {
        cerr << "OstraciseAssign needs chunkShape vector";
    }

    virtual OstraciseAssign* clone() const {
        return new OstraciseAssign(*this);
    }

    virtual ~OstraciseAssign() {}

    // Name followed by the chunk extents, e.g. " chunks of 4 x 4 x 2".
    virtual const string getTextName(void) const {
        ostringstream ost;
        ost << "Ostracise/Assign";
        if (Ms.empty())
            ost << " zero dimensional chunks? ";
        else
            ost << " chunks of " << Ms[0];

        for (uint i = 1; i < Ms.size(); ++i)
            ost << " x " << Ms[i];
        return ost.str();
    }

    // Standard interface: forwards to the three-argument form using the
    // chunk shape stored in this object.
    virtual norm_type computeNormalFrom(DataCube<_DT,_LDT>& DC,
            const norm_type & initial) {
        return computeNormalFrom(DC, Ms, initial);
    }

    // Same computation with an explicitly supplied chunk shape.
    virtual norm_type computeNormalFrom(DataCube<_DT,_LDT>& DC,
            vector<int> chunkShape,
            const norm_type & initial);

protected:
    // interfaces of these are admittedly horribly wide.

    // Fills `answer` with one density value per chunk, under `theNorm`.
    void setUpChunkDensities(DataCube<_DT,_LDT>& DC, const norm_type &theNorm,
                             const vector<int>& chunkShape,  vector<double> &answer);

    // Appends to `chunkDensity` the entries of `allChunkDensities` selected
    // for index `range` along dimension `dim`.
    void densitiesOfIntersectingChunks(DataCube<_DT,_LDT>&DC, const vector<int>& chunkShape, uint dim,
                                       const vector<double> &allChunkDensities, int range,
                                       vector<double> &chunkDensity);

private:
    // a few class constants
    vector<int> Ms;            // chunk shape given at construction
    enum { MAX_ITERS = 5};     // cap on refinement passes
};


// Computes, for every chunk reported by HOLAPUtil::computeChunkDensityInfo,
// the ratio (points in chunk) / (chunk size) and stores it in `answer`
// (which is resized to the number of chunks).
template <class _DT, class _LDT>
void OstraciseAssign<_DT,_LDT>::
setUpChunkDensities(DataCube<_DT,_LDT>& DC,
                    const norm_type &theNorm,
                    const vector<int>& chunkShape,
                    vector<double> &answer) {
    vector<int> shape = DC.getShape();

    // sanity check: a chunk may not be larger than the cube in any dimension
    for (uint dim = 0; dim < shape.size(); ++dim)
        assert(shape[dim] >= chunkShape[dim]);

    vector<int> chunkSize;
    vector<int> numPointsChunk;
    HOLAPUtil<_DT,_LDT>::computeChunkDensityInfo(DC,chunkShape,theNorm,chunkSize,numPointsChunk);

    answer.resize(chunkSize.size());
    for (uint chunk = 0; chunk < chunkSize.size(); ++chunk) {
        const double occupied = double(numPointsChunk[chunk]);
        answer[chunk] = occupied / chunkSize[chunk];
    }

    // debugging aid, permanently disabled
    if (false) {
        cout << "Computed all chunk densities as ";
        for (uint chunk = 0; chunk < answer.size(); ++chunk)
            cout << answer[chunk] << " ";
        cout << endl;
    }
}


// Selects from `allChunkDensities` (one density per chunk, indexed by the
// linear chunk index that HOLAPUtil::getChunkIndex produces) the entries that
// belong to index `range` along dimension `dim`, and APPENDS them to
// `chunkDensity` (the caller is expected to pass it in empty).
//
//   DC                - cube; only its shape (and chunk count) is consulted
//   chunkShape        - extent of a chunk in every dimension.  Passing a shape
//                       that is 1 along `dim` makes each "chunk" an individual
//                       slice, and `range` then selects a single attribute value
//   dim               - dimension along which `range` is measured
//   allChunkDensities - densities of all chunks for this chunkShape
//   range             - which chunk position along `dim` to extract
//   chunkDensity      - output, appended to
//
// Entries are fetched with vector::at(), so an index outside
// allChunkDensities throws std::out_of_range.
template <class _DT, class _LDT>
void  OstraciseAssign<_DT,_LDT>::
densitiesOfIntersectingChunks(DataCube<_DT,_LDT>&DC, const vector<int>& chunkShape, uint dim,
                              const vector<double> &allChunkDensities, int range,
                              vector<double>& chunkDensity
                             )
{
    vector<int> shape = DC.getShape();

    //  just give a chunkShape that is "x 1" in dimension of interest, to get individual
    //  slices as "chunks"

    // figure out the "stride" of slice/chunk info: distance separating consecutive chunks
    // cell at index 0 is always in 0th slice/chunk

    // following Richard, we can crunch higher-dimensional aggregations into 3D
    // by separating dims  0..dim-1, dim, dim+1..d-1
    // the product of the number of chunks along dims 0..dim-1 is held in "chunkStride"
    // the product of the number of chunks along dims 0..dim is held in "chunkGap"
    // NOTE(review): both rely on getChunkIndex numbering chunks with dimension 0
    // varying fastest -- confirm against HOLAPUtil.

    vector<int> tempInd(shape.size(),0);
    tempInd[dim]=chunkShape[dim];  // first cell past chunk 0 along dim, i.e. start of the next chunk
    uint chunkStride = HOLAPUtil<_DT,_LDT>::getChunkIndex(tempInd, shape, chunkShape);
    uint chunkGap;

    if (dim == shape.size() -1) {
        // last dimension: there is no dim+1 to step along, use the total chunk count
        chunkGap = HOLAPUtil<_DT,_LDT>::computeNumberOfChunks(DC,chunkShape);
    }
    else {
        // index of the chunk one step along dim+1 (all other coordinates 0)
        tempInd[dim]=0;
        tempInd[dim+1] = chunkShape[dim+1];  // actually, unchanged?
        chunkGap = HOLAPUtil<_DT,_LDT>::getChunkIndex(tempInd, shape, chunkShape);
        tempInd[dim+1] = 0;  // want this later
    }

    // go along, picking up only those chunks whose indices agree with "range"
    // note: "range" indicates WHICH range, ie, dim=0, range=0 is all the
    // chunks whose chunk index is of the form (0xxxxx).

    // debugging aid, permanently disabled
    if (false)
        cout << "For dimension " << dim <<  " chunkStride =" <<
        chunkStride << " chunkGap =" << chunkGap<< endl;

    // number of chunk positions along dim (the last one may be partial)
    int numRanges = int(ceil(shape[dim]/double(chunkShape[dim])));


    // build density vector of all chunks containing this attribute
    tempInd[dim] = range*chunkShape[dim];  // remaining coordinates 0
    int startChunk = HOLAPUtil<_DT,_LDT>::getChunkIndex(tempInd, shape, chunkShape);

    // this loop DOES look weird...
    // `i` counts how many entries have been emitted (advancing by chunkStride per
    // outer pass) until size/numRanges have been produced; `ch` is the source
    // index, jumping by chunkGap between runs of chunkStride consecutive chunks.
    for (uint i=0, ch=startChunk; i < allChunkDensities.size()/numRanges; ch += chunkGap, i += chunkStride ) {
        for (uint j=0; j < chunkStride; j++) {
            chunkDensity.push_back(allChunkDensities.at(ch+j));  // storing at index i+j
            //chunkDensity[i+j] = (chunkDensity[i+j] > 1.0/ NormalUtil<_DT,_LDT>::holapSparseCost(shape.size()) );
        }
    }
    // debugging aid, permanently disabled
    if (false) {
        cout << "Computed chunk densities for range " << range << ": ";
        for (uint i=0; i < chunkDensity.size(); ++i)
            cout << chunkDensity[i] << " ";
        cout << endl;
    }

}

// Ostracise/assign refinement of `initial` for the given chunk shape.
//
// Starting from a copy of `initial`, each pass visits every dimension whose
// chunk extent is not 1 and:
//   1. computes the density of every chunk and of every individual slice
//      (a "chunk" of extent 1 along the dimension) under the current norm;
//   2. in each chunk range along the dimension, "ostracises" the attribute
//      value whose slice-density vector is farthest (L2) from the density
//      vector of the chunks of its range;
//   3. removes the ostracised slice's contribution from the range's density
//      vector, and registers both vectors in a GeometricBipartiteGraph
//      (ostracised slices on the RIGHT, depleted ranges on the LEFT);
//   4. asks the graph for a min-cost assignment and moves each ostracised
//      value into the hole the assignment gives it.
// Passes repeat until no dimension's permutation changes or MAX_ITERS passes
// have run; in the latter case a warning is printed and the last norm is used.
//
// Side effects: prints "iteration <k>" to cout at the start of every pass.
// Returns the refined normalization.
template <class _DT, class _LDT>
norm_type OstraciseAssign<_DT,_LDT>::
computeNormalFrom( DataCube<_DT,_LDT>& DC, vector<int> chunkShape, const norm_type & initial) {
    int iter_ctr = 0;
    bool converged;

    vector<int> shape = DC.getShape();
    for(uint dim = 0; dim < shape.size() ; ++dim) assert(shape[dim] >= chunkShape[dim]);// sanity check
    norm_type theNorm(initial);

    for (converged = false; !converged  && iter_ctr < MAX_ITERS; ++iter_ctr) {
        converged = true;  // cleared below as soon as any dimension's map changes

        cout << "iteration " << iter_ctr << endl;

        // I feel this code can be substantially simpler, but haven't found how yet. -ofk

        for (uint dim = 0; dim < shape.size(); ++dim) {

    // with one value per chunk there is nothing to ostracise (see assert below)
    if (chunkShape[dim] == 1) continue;

            vector<double> chunkDens;  // changes each iteration, since it depends on theNorm
            setUpChunkDensities(DC,theNorm,chunkShape,chunkDens);

            // densities of individual slices: same chunking, but extent 1 along dim
            vector<int> sliceShape(chunkShape);  // same except...
            sliceShape[dim] = 1;
            vector<double> sliceDens;
            setUpChunkDensities(DC,theNorm,sliceShape,sliceDens);


            // number of chunk ranges along dim; one graph node per side per range
            int numRanges = int(ceil(shape[dim]/double(chunkShape[dim])));
            GeometricBipartiteGraph g(numRanges, numRanges);

            // NOTE(review): holePos is never read or written after this line.
            vector<int> holePos(numRanges);               // offset in range
            // attribute value (absolute position along dim) ostracised from each range
            vector<int> ostracisedPosition(numRanges);

            // for all attribute ranges
            int attrVal = 0;  // runs over all values of dim, across the ranges
            for (int range=0; range < numRanges; ++range) {
                vector<double> chunkDensity; // calculated next (by appending from empty)
                densitiesOfIntersectingChunks(DC, chunkShape, dim, chunkDens, range, chunkDensity);

                // in each range, choose which value (slice) to ostracise
                // right now, chunkDensity vector takes into account the contribution of all slices, including
                // the one to be ostracised.  Later, we modify to null out the ostracised one.

                ostracisedPosition[range] = -1;  // flag, not yet begun deciding
                vector<double> worstDensity; // (chunkDensity.size());
                double worstDistance = 0.0;

                for ( int posnInChunk = 0;
                        posnInChunk < chunkShape[dim] && attrVal < shape[dim];  // last chunk irregular
                        ++posnInChunk, ++attrVal) {

                    vector<double> sliceDensity; // calculated next
                    densitiesOfIntersectingChunks(DC, sliceShape, dim, sliceDens, attrVal, sliceDensity);

                    // keep the slice farthest from its range; the first slice always
                    // becomes the initial candidate (flag still -1)
                    double distanceFromPeers = MathUtil::L2_dist(sliceDensity,chunkDensity);
                    if (ostracisedPosition[range] == -1  || distanceFromPeers > worstDistance) {
                        worstDistance = distanceFromPeers;
                        worstDensity = sliceDensity;  // copy assignment on big vector
                        ostracisedPosition[range]= attrVal;
                    }
                }
                // now know who to ostracise
                g.setCoordinates(g.RIGHT, range, worstDensity);
                // debugging aid, permanently disabled
                if (false) {
                    cout << "setCoords on right, for range " << range << " at worstDensity ";
                    for (uint i=0; i < worstDensity.size(); ++i)
                        cout << worstDensity[i] << " ";
                    cout << endl;
                }

                // null out the ostracised one.  Slightly wrong because it does not count for fewer slices/chunk
                // at the boundaries, in case the number of values is not a multiple of the chunk size.

                // this makes no sense for chunks with m=1.
                assert(chunkShape[dim] > 1);

                // mean density of the remaining chunkShape[dim]-1 slices
                for (uint i = 0; i < chunkDensity.size(); ++i)
                    chunkDensity[i] = (chunkDensity[i] * chunkShape[dim] - worstDensity[i])/(chunkShape[dim]-1);

                g.setCoordinates(g.LEFT, range, chunkDensity);
                // debugging aid, permanently disabled
                if (false) {
                    cout << "setCoords on LEFT, for range " << range << " at chunkDensity ";
                    for (uint i=0; i < chunkDensity.size(); ++i)
                        cout << chunkDensity[i] << " ";
                    cout << endl;
                }

            }
            // for every hole, which ostracised value should fill it?
            vector<int> reassignment = g.geometricMinCostAssignment();

            // calculate new normalization...  [should carefully check to see whether it's bkwds]

            // not sure if in-place update can work, so don't
            vector<int> newNorm(theNorm[dim]);  // copy
            // the hole left in `range` receives the value ostracised from range reassignment[range]
            for (int range=0; range < numRanges; ++range)
                newNorm[ ostracisedPosition[range]] = theNorm[dim][ostracisedPosition[reassignment[range]]];

            assert(PermutationUtil::isPermutation(newNorm));
            if (theNorm[dim] != newNorm) converged = false;  // componentwise identity

            // the domains of the maps are the same, so I will do an in-place overwrite
            // (not sure about whether the copy-assign operator would invoke destructor)
            vector<int> &mapToUpdate = theNorm[dim];
            for (int i=0; i < shape[dim]; ++i)
                mapToUpdate[i] = newNorm[i];
        }
    }
    if (!converged) cout << "[Warning]: ostraciseAssign did not coverge, using last value" << endl;
    return theNorm;
}


// utility.

// Comparator over slice indices, ordering them by their L2 distance to one
// fixed reference slice.  operator()(v1, v2) answers "is v1 strictly farther
// from the reference than v2?", so a std::priority_queue built on it exposes
// the NEAREST slice at top().  When neither slice is strictly nearer, the
// indices themselves decide (v1 < v2).
//
// Only references are stored: the table of densities handed to the
// constructor must stay alive for as long as the comparator is in use.
class ByDistanceTo {
public:
    ByDistanceTo( const vector<vector<double> > & allSliceDensities, int referenceSlice)
        : refSliceVals( allSliceDensities[referenceSlice] ),
          asd( allSliceDensities)
    {}

    bool operator() (int v1, int v2) const {
        const double distToFirst  = MathUtil::L2_dist(refSliceVals, asd[v1]);
        const double distToSecond = MathUtil::L2_dist(refSliceVals, asd[v2]);

        const bool firstIsNearer  = distToFirst  < distToSecond;
        const bool secondIsNearer = distToSecond < distToFirst;

        // a strict winner exists: v1 "sorts lower" exactly when v2 is the nearer one
        if (firstIsNearer || secondIsNearer)
            return secondIsNearer;

        // no strict winner: fall back on the slice indices
        return v1 < v2;
    }

private:
    const vector<double> &refSliceVals;      // density vector of the reference slice
    const vector<vector<double> > & asd;     // density vectors of all slices, indexed by slice
};


/********************************************************************/

/* It is bad OOD to say that "IteratedSliceCluster is a kind of
 *  OstraciseAssign", just because they both need "densitiesOfIntersectingChunks".
 *  But, to keep code short, that's what we will pretend.  -OFK
 */


// Normalization that repeatedly regroups the slices of each dimension into
// chunks of similar slices (see the 3-argument computeNormalFrom).  It derives
// from OstraciseAssign for code reuse and keeps its own copy of the
// chunk shape for reporting purposes.
template <class _DT, class _LDT>
class IteratedSliceCluster : public OstraciseAssign<_DT,_LDT> {
public:
    // The chunk shape is forwarded to the base class and also remembered in Ms.
    IteratedSliceCluster(const vector<int> chunkShape)
        : OstraciseAssign<_DT,_LDT>(chunkShape),
          Ms(chunkShape)
    {}

    // Polymorphic copy; caller receives a heap-allocated duplicate.
    virtual IteratedSliceCluster* clone() const {
        return new IteratedSliceCluster(*this);
    }

    virtual ~IteratedSliceCluster() {}

    // Human-readable name: "IteratedSliceCluster chunks of A x B x ...",
    // or a placeholder remark when the stored chunk shape is empty.
    virtual const string getTextName(void) const {
        ostringstream label;
        label << "IteratedSliceCluster";

        if (Ms.size() == 0) {
            label << " zero dimensional chunks? ";
            return label.str();
        }

        label << " chunks of " << Ms[0];
        for (uint k = 1; k < Ms.size(); ++k)
            label << " x " << Ms[k];
        return label.str();
    }

    // keep the base class's 2-argument overload visible alongside ours
    using OstraciseAssign<_DT,_LDT>::computeNormalFrom;  // 2-arg form should be okay here

    // Defined out of line, below the class.
    virtual norm_type computeNormalFrom(DataCube<_DT,_LDT>& DC,
            vector<int> chunkShape,
            const norm_type & initial);

private:
    // a few class constants  (don't use base class's)
    vector<int> Ms;            // chunk shape as given to the constructor
    enum { MAX_ITERS = 5};     // upper bound on passes made by computeNormalFrom

};


/*
 * Greedy, iterated slice clustering.
 *
 * Starting from `initial`, makes at most MAX_ITERS passes; a pass visits every
 * dimension of DC and, for that dimension:
 *   1. obtains one density vector per slice (attribute value), using a chunk
 *      shape whose extent along the current dimension is 1;
 *   2. greedily forms ceil(shape[dim]/chunkShape[dim]) chunks: the seed of a
 *      chunk is the unassigned slice whose chunkShape[dim]-1 nearest unassigned
 *      neighbours (L2 distance between density vectors) have the smallest total
 *      distance; the seed and those neighbours become the chunk;
 *   3. rebuilds the dimension's map so that members of chunk c occupy positions
 *      c*chunkShape[dim], c*chunkShape[dim]+1, ... (composed with the old map).
 * Iteration stops early after a pass in which no dimension's map changed.
 *
 * DC          the data cube being normalized
 * chunkShape  chunk extent per dimension; needs an entry >= 1 for every
 *             dimension of DC
 * initial     normalization to start from
 * returns     the normalization reached after the last pass (a warning is
 *             printed if MAX_ITERS was hit without convergence)
 *
 * Progress ("iteration k") is written to cout on every pass.
 *
 * NOTE(review): setUpChunkDensities / densitiesOfIntersectingChunks are called
 * unqualified, exactly as before.  If they are members of the dependent base
 * OstraciseAssign<_DT,_LDT> (their declarations are outside this view), strict
 * two-phase lookup would want `this->` in front of them -- confirm.
 */
template <class _DT, class _LDT>
norm_type IteratedSliceCluster<_DT,_LDT>::
computeNormalFrom(DataCube<_DT,_LDT>& DC, vector<int> chunkShape, const norm_type & initial) {
    int iter_ctr = 0;
    bool converged;

    vector<int> shape = DC.getShape();
    // chunkShape is indexed by every dimension of the cube below
    assert(chunkShape.size() >= shape.size());

    norm_type theNorm(initial);

    for (converged = false; !converged  && iter_ctr < MAX_ITERS; ++iter_ctr) {
        converged = true;   // until some dimension's map changes in this pass

        cout << "iteration " << iter_ctr << endl;

        for (uint dim = 0; dim < shape.size(); ++dim) {
            // used as a divisor and as a chunk size: must be positive
            assert(chunkShape[dim] > 0);

            // same chunking as requested, except one slice thick along dim
            vector<int> sliceShape(chunkShape);  sliceShape[dim] = 1;

            // presumably filled with per-chunk densities for sliceShape
            // (helper is defined outside this view)
            vector<double> sliceDens;
            setUpChunkDensities(DC,theNorm,sliceShape,sliceDens);

            // calculate density vectors for _every_ slice, make an array of them
            vector<vector<double> > allSliceDensities(shape[dim]);
            for (int attrVal = 0; attrVal < shape[dim]; attrVal++)
                densitiesOfIntersectingChunks(DC, sliceShape, dim, sliceDens, attrVal, allSliceDensities[attrVal]);

            // elegant multiscale algorithm, not implemented and assumes M a power of 2:
            //    pair up  slices by minimum distance.  coalesce each into a superslice.
            //    pair up  superslices by minimum distance (suitably defined).  repeat logM times
            //
            // inelegant algorithm actually used (pretty much brute force):
            //    start each chunk with the free slice whose M-1 closest free
            //    nbrs are cheapest in total, then add those M-1 nbrs.
            //    (Seeding with simply "the next free slice" didn't work too well.)
            //
            // An experiment that pulled the seed's density vector halfway towards an
            // "ideal" 0/1 vector was tried and dropped: it did not seem to make
            // much difference and made all-zero vectors too easy to get.

            vector<int> chunkAssignment(shape[dim],-1);  // -1 denotes "not assigned yet"

            int numRanges = int(ceil(shape[dim]/double(chunkShape[dim])));  // put as a utility somewhere?

            for (int chunk=0; chunk < numRanges; ++chunk) {

                // --- choose the seed: the free slice with the cheapest neighbourhood ---
                int whoHasBestNbrs = -1;   // -1: no free slice examined yet
                double BestNbrhoodCost = numeric_limits<double>::max();

                for (int candidate=0; candidate < shape[dim]; ++candidate) {
                    if (chunkAssignment[candidate] != -1)
                        continue;   // already belongs to an earlier chunk

                    // all other free slices, nearest-to-candidate first
                    priority_queue<int, vector<int>, ByDistanceTo> pq(ByDistanceTo(allSliceDensities,candidate));
                    for (int i=0; i < shape[dim]; ++i) {
                        if (chunkAssignment[i] == -1 && i != candidate)
                            pq.push(i);
                    }

                    // total distance to the would-be peers (emptiness check for last chunk)
                    double thisNbrhoodCost = 0;
                    for (int i=1; i < chunkShape[dim] && !pq.empty(); ++i) {
                        thisNbrhoodCost += MathUtil::L2_dist(allSliceDensities[candidate],allSliceDensities[pq.top()]);
                        pq.pop();
                    }

                    // The first free slice is always accepted, so the seed can never be
                    // a slice that is already assigned, even if every cost fails the
                    // '<' test (e.g. NaN or infinite distances).
                    if (whoHasBestNbrs == -1 || thisNbrhoodCost < BestNbrhoodCost) {
                        BestNbrhoodCost = thisNbrhoodCost;
                        whoHasBestNbrs = candidate;
                    }
                }

                // chunk < numRanges guarantees at least one slice is still free
                assert(whoHasBestNbrs != -1);
                const int seed = whoHasBestNbrs;
                chunkAssignment[seed] = chunk;

                // --- give the seed its peers ---
                // sorta silly to use a priority queue when I will only get a few values from it.
                priority_queue<int, vector<int>, ByDistanceTo> pq(ByDistanceTo(allSliceDensities,seed));

                for (int i=0; i < shape[dim]; ++i)
                    if (chunkAssignment[i] == -1)
                        pq.push(i);

                // get out the seed's peer slices for this chunk (emptiness check for last chunk)
                for (int i=1; i < chunkShape[dim] && !pq.empty(); ++i) {
                    chunkAssignment.at(pq.top() ) = chunk;
                    pq.pop();
                }
            }

            // now, every value knows its chunk.  So generate appropriate normalization.

            vector<int> chunkCtr(numRanges,0);   // how many members of each chunk are placed so far
            vector<int> newNorm(theNorm[dim]);   // every entry is overwritten below

            for (int attrVal=0; attrVal < shape[dim]; ++attrVal) {
                assert(chunkAssignment[attrVal] != -1);
                int mapFrom = chunkAssignment[attrVal]*chunkShape[dim] + chunkCtr[chunkAssignment[attrVal]];

                // mapFrom ==> attrVal ==> [old norm] ==> storageAttrval

                // compose normalisations
                newNorm[mapFrom] = theNorm[dim][attrVal];

                chunkCtr[chunkAssignment[attrVal]]++;
            }

            assert(PermutationUtil::isPermutation(newNorm));
            if (theNorm[dim] != newNorm) {
                converged = false;  // componentwise identity
                // same domain, so overwrite element by element in place
                vector<int> &mapToUpdate = theNorm[dim];
                for (int i=0; i < shape[dim]; ++i)
                    mapToUpdate[i] = newNorm[i];
                assert(PermutationUtil::isPermutation(theNorm[dim]));
            }
            NormalUtil<_DT,_LDT>::printSmall2d(DC,theNorm,chunkShape);  // no-op for non 2d (per original note)
        }
    }
    if (!converged)
      cout << "[Warning:] " << getTextName() << " did not converge, using last value" << endl;
    return theNorm;
}
#endif

