/* -*- mode: c++ -*-
*/
/*
GIFT, a flexible content based image retrieval system.
Copyright (C) 1998, 1999, 2000 CUI, University of Geneva
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with this program; if not, write to the Free Software
Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
*/
// -*- mode: c++ -*-
// Forward declaration: this header only uses CXMLElement by const reference
// (as the constructor argument of CAcIFFileSystem), so the full definition
// is not required here.
class CXMLElement;
/**
*
*
* This class manages the access to the inverted file as well
* as its generation
*
*
*
* modification history:
*
* WM 1099 changed documentation format
* completed documentation
* HM 090399 created the documentation
* WM 1098 created the file
*
*
*
* compiler defines used:
*
*
*/
#ifndef _CACIFFILESYSTEM
#define _CACIFFILESYSTEM
#include <string>
#include "TID.h"
#include "CSelfDestroyPointer.h"
#include "CArraySelfDestroyPointer.h"
#include "CDocumentFrequencyList.h"
//#include "CCollectionFrequencyList.h"
#include "CADIHash.h"
#include "CAcURL2FTS.h"
#include "CAcInvertedFile.h"
#include <iostream.h>
#include <fstream.h>
#include <map>
#include <vector>
#include <hash_map>
#include <functional>
#include <algorithm>
#include "CMagic.h"
/** Feature IDs are represented by the same type as all other IDs (TID).
The separate name only documents that a parameter is meant to be a feature ID. */
typedef TID TFeatureID ;
/**
An accessor to an inverted file, derived from CAcInvertedFile.
According to the original author, the file access is currently done
"by hand", which is not really efficient; a move to memory mapped
files was planned.
The class declares both the query-time lookups (feature -> documents,
document -> features, per-feature and per-document statistics) and the
functions for generating and checking the inverted file.
*/
class CAcIFFileSystem:public CAcInvertedFile{
protected:
/** Helper accessor held as a member instead of as a second base class.
Original rationale: this class should have just one parent (single
inheritance), and virtual base classes were avoided because they
would prevent downcasting.
*/
CSelfDestroyPointer<CAcURL2FTS> mURL2FTS;
/** The maximum feature ID arising in this file */
TID mMaximumFeatureID;
/** A buffer, used if the inverted file is to be
held in RAM */
CArraySelfDestroyPointer<char> mInvertedFileBuffer;
/** The inverted file.
Declared mutable, so it can be modified from const member functions. */
mutable CSelfDestroyPointer<istream> mInvertedFile;
/** Feature -> Offset in inverted file.
Declared mutable, so it can be modified from const member functions. */
mutable ifstream mOffsetFile;
/** File of feature descriptions */
ifstream mFeatureDescriptionFile;
/** Name of the inverted file */
string mInvertedFileName;
/** Name of the offset file */
string mOffsetFileName;
/** Name of the file with the feature descriptions */
string mFeatureDescriptionFileName;
/** Type of the map from feature ID to the offset for this feature */
typedef hash_map<TID,unsigned int> CIDToOffset;//new hash
/** Map from feature ID to the offset for this feature */
CIDToOffset mIDToOffset;
/** Map from feature ID to the collection frequency.
Declared mutable, so it can be modified from const member functions. */
mutable hash_map<TID,double> mFeatureToCollectionFrequency;//new hash
/**@name for fast access...*/
//@{
/** Map from the feature ID to the feature description */
hash_map<TID,unsigned int> mFeatureDescription;//new hash_
/** Additional information about the document like, e.g.,
the Euclidean length of the feature list.
*/
CADIHash mDocumentInformation;
//@}
/** Add a pair of (FeatureID, Offset) to the open offset file
(helper function for inverted file construction).
@param inFeatureID the feature ID of the pair
@param inPosition the offset of the pair
@param inOpenOffsetFile the already opened offset file to write to
*/
void writeOffsetFileElement(TID inFeatureID,
int inPosition,
ostream& inOpenOffsetFile);
/** Loads a *.fts file and returns the feature list.
NOTE(review): ownership of the returned pointer is not visible
from this header -- confirm against the implementation.
@param inFileName name of the *.fts file to load
*/
CDocumentFrequencyList* getFeatureFile(string inFileName)const;
public:
/** For testing if the inverted file is correctly constructed */
bool operator()()const;
/** The REAL constructor.
This opens an existing inverted file, and then
inits this structure. After that it is fully
usable.
As a parameter it takes an XMLElement which contains
a "collection" element and its content.
If the attribute vi-generate-inverted-file is true,
then a new inverted file will be generated using
the parameters given in inCollectionElement. You will
NOT be able to use *this afterwards.
@param inCollectionElement the "collection" element and its content
*/
CAcIFFileSystem(const CXMLElement& inCollectionElement);
/** Called by constructors.
NOTE(review): the meaning of the unnamed bool argument and of the
return value is not visible from this header -- confirm against
the implementation.
*/
bool init(bool);
/** Destructor */
~CAcIFFileSystem();
/** Translate a DocumentID to a URL (for output) */
string IDToURL(TID inID)const;
/**@name The proper inverted file access*/
//@{
/** List of documents containing the feature.
NOTE(review): ownership of the returned pointer is not visible
from this header; the same holds for the two functions below. */
CDocumentFrequencyList* FeatureToList(TFeatureID)const;
/** List of features contained by the document with URL inURL */
CDocumentFrequencyList* URLToFeatureList(string inURL)const;
/** List of features contained by the document with ID inDID */
CDocumentFrequencyList* DIDToFeatureList(TID inDID)const;
//@}
/**@name Accessing information about features*/
//@{
/** Collection frequency for a given feature */
double FeatureToCollectionFrequency(TFeatureID)const;
/** What kind of feature is the feature with ID inFeatureID? */
unsigned int getFeatureDescription(TID inFeatureID)const;
//@}
/**@name Accessing additional document information*/
//@{
/** Returns the maximum document frequency for one document ID */
double DIDToMaxDocumentFrequency(TID)const;
/** Returns the document-frequency square sum for a given document ID */
double DIDToDFSquareSum(TID)const;
/** Returns a per-document sum for a given document ID.
NOTE(review): judging by the name, presumably the sum of squared
(document frequency * log inverse collection frequency) terms --
confirm against the implementation. */
double DIDToSquareDFLogICFSum(TID)const;
//@}
/**@name Inverted File Generation and Consistency Checking*/
//@{
/** Generating an inverted file, if there is none.
Fast but stupid in-memory method. This method is
very fast if all of the inverted file (and a bit more)
can be kept in memory at runtime. If this is not the
case, extensive swapping is the result, virtually halting
the inverted file creation.
*/
bool generateInvertedFile();
/** Generating an inverted file, if there is none.
Employs the two-way-merge method described
in "Managing Gigabytes", chapter 5.2, Sort-based
inversion (page 181).
*/
bool newGenerateInvertedFile();
/** Check the consistency of the inverted file system accessed
by this accessor. */
bool checkConsistency();
/** Is the document with inDocumentID contained in the
document frequency list of the feature inFeatureID, and
is the associated document frequency the same?
@param inFeatureID the feature whose document frequency list is searched
@param inDocumentID the document to look for in that list
@param inDocumentFrequency the document frequency expected for that document
*/
bool findWithinStream(TID inFeatureID,
TID inDocumentID,
double inDocumentFrequency)const;
//@}
/**@name Accessing documents and accessor elements*/
//@{
/**
*
* Translate a URL to its document ID.
* NOTE(review): the bool of the returned pair presumably tells
* whether the URL was found -- confirm against the implementation.
*
*/
virtual pair<bool,TID> URLToID(const string& inURL)const;
/** List of the IDs of all documents present in the inverted file.
The result is delivered through the list passed by reference. */
void getAllIDs(list<TID>&)const;
/** List of triplets (ID,imageURL,thumbnailURL) of all
the documents present in the inverted file.
The result is delivered through the list passed by reference. */
void getAllAccessorElements(list<CAccessorElement>&)const;
/** Get a given number of random document IDs.
The first (unnamed) argument is the list which will contain the
result, the second (unnamed) argument is the desired size of
that list.
*/
void getRandomIDs(list<TID>&,
list<TID>::size_type)const;
/** For drawing random sets. Why is this part of a CAccessorImplementation?
The way the accessor is organised might influence the way
random sets can be drawn. At present everything happens in
RAM, but we do not want to be fixed on that.
@param outResult the list which will contain the result
@param inSize the desired size of outResult
*/
void getRandomAccessorElements(list<CAccessorElement>& outResult,
list<CAccessorElement>::size_type inSize)const;
/** The number of images in this accessor */
int size()const;
//@}
/** This is interesting for browsing */
TID getMaximumFeatureID()const;
/** Getting a list of all features contained in this.
This function is necessary, because in the present
system only about 50 percent of the features are
really used.
A feature is considered used if it arises in mIDToOffset.
NOTE(review): ownership of the returned list is not visible
from this header -- confirm against the implementation.
*/
list<TID>* getAllFeatureIDs()const;
/**
*
* Translate a DocumentID to an accessor element.
* NOTE(review): the bool of the returned pair presumably tells
* whether the ID was found -- confirm against the implementation.
*
*/
virtual pair<bool,CAccessorElement> IDToAccessorElement(TID inID)const;
/** Is this well constructed? */
operator bool()const;
};
#endif
// Documentation generated by muellerw@pc7170 on Son Okt 8 16:04:40 CEST 2000