#include <CAcIFFileSystem.h>

Public Member Functions | |
| bool | operator() () const |
| CAcIFFileSystem (const CXMLElement &inCollectionElement) | |
| bool | init (bool) |
| ~CAcIFFileSystem () | |
| string | IDToURL (TID inID) const |
| virtual pair< bool, TID > | URLToID (const string &inURL) const |
| void | getAllIDs (list< TID > &) const |
| void | getAllAccessorElements (list< CAccessorElement > &) const |
| void | getRandomIDs (list< TID > &, list< TID >::size_type) const |
| void | getRandomAccessorElements (list< CAccessorElement > &outResult, list< CAccessorElement >::size_type inSize) const |
| int | size () const |
| TID | getMaximumFeatureID () const |
| list< TID > * | getAllFeatureIDs () const |
| virtual pair< bool, CAccessorElement > | IDToAccessorElement (TID inID) const |
| operator bool () const | |
The proper inverted file access | |
| CDocumentFrequencyList * | FeatureToList (TFeatureID) const |
| CDocumentFrequencyList * | URLToFeatureList (string inURL) const |
| CDocumentFrequencyList * | DIDToFeatureList (TID inDID) const |
Accessing information about features | |
| double | FeatureToCollectionFrequency (TFeatureID) const |
| unsigned int | getFeatureDescription (TID inFeatureID) const |
Accessing additional document information | |
| double | DIDToMaxDocumentFrequency (TID) const |
| double | DIDToDFSquareSum (TID) const |
| returns the Document frequency squaresum for a given document ID | |
| double | DIDToSquareDFLogICFSum (TID) const |
| bool | generateInvertedFile () |
| bool | newGenerateInvertedFile () |
| bool | checkConsistency () |
| bool | findWithinStream (TID inFeatureID, TID inDocumentID, double inDocumentFrequency) const |
| needed for checking the consistency | |
Protected Types | |
| typedef HASH_MAP< TID, streampos > | CIDToOffset |
Protected Member Functions | |
| void | writeOffsetFileElement (TID inFeatureID, streampos inPosition, ostream &inOpenOffsetFile) |
| CDocumentFrequencyList * | getFeatureFile (string inFileName) const |
Protected Attributes | |
| CMutex | mMutex |
| CSelfDestroyPointer< CAcURL2FTS > | mURL2FTS |
| TID | mMaximumFeatureID |
| string | mInvertedFileBuffer |
| string | mTemporaryIndexingFileBase |
| CSelfDestroyPointer< istream > | mInvertedFile |
| ifstream | mOffsetFile |
| ifstream | mFeatureDescriptionFile |
| string | mInvertedFileName |
| string | mOffsetFileName |
| string | mFeatureDescriptionFileName |
| CIDToOffset | mIDToOffset |
| HASH_MAP< TID, double > | mFeatureToCollectionFrequency |
for fast access... | |
| HASH_MAP< TID, unsigned int > | mFeatureDescription |
| CADIHash | mDocumentInformation |
For a long time we wanted to move to memory mapped files (like SWISH++) but currently I think this is not the best idea.
Definition at line 93 of file CAcIFFileSystem.h.
typedef HASH_MAP<TID,streampos> CAcIFFileSystem::CIDToOffset [protected] |
map from feature id to the offset for this feature
Reimplemented from CAcInvertedFile.
Definition at line 138 of file CAcIFFileSystem.h.
| CAcIFFileSystem::CAcIFFileSystem | ( | const CXMLElement & | inCollectionElement | ) |
This opens an existing inverted file and then initialises this structure. After that it is fully usable.
As a parameter it takes an XMLElement which contains a "collection" element and its content.
If the attribute cui-generate-inverted-file is true, then a new inverted file will be generated using the parameters given in inCollectionElement. You will NOT be able to use *this afterwards.
Like every accessor, this accessor takes a <collection> MRML element as input.
Definition at line 769 of file CAcIFFileSystem.cc.
References CXMLElement::boolReadAttribute(), mrml_const::cui_base_dir, mrml_const::cui_feature_file_location, mrml_const::cui_generate_inverted_file, mrml_const::cui_in_memory, mrml_const::cui_inverted_file_location, mrml_const::cui_offset_file_location, init(), CMutex::lock(), mFeatureDescriptionFileName, mInvertedFileName, mOffsetFileName, mURL2FTS, CAccessorImplementation::mURLToID, newGenerateInvertedFile(), CXMLElement::stringReadAttribute(), and CMutex::unlock().
// Member-initializer list and body of
// CAcIFFileSystem::CAcIFFileSystem(const CXMLElement& inCollectionElement).
//
// Initializers: mURL2FTS is built from the collection element; the offset,
// inverted-file and feature-description file names are each the cui_base_dir
// attribute concatenated with the matching location attribute;
// mTemporaryIndexingFileBase is cui_base_dir alone.
//
// Body (runs between gMutex->lock() and gMutex->unlock()):
//  - if cui_generate_inverted_file is present and true, newGenerateInvertedFile()
//    is called and nothing else is done;
//  - otherwise the four attributes cui_base_dir, cui_feature_file_location,
//    cui_offset_file_location and cui_inverted_file_location must all be
//    present, and init() is called with true only when cui_in_memory is both
//    present and true. On success the file names are printed and, unless
//    _NO_PRINT_INIT is defined, the contents of mURLToID are dumped to cout.
//
// NOTE(review): the presence check tests cui_feature_file_location while the
// initializer list reads cui_feature_description_location -- confirm that this
// difference is intended.
00769 : 00770 mURL2FTS(new CAcURL2FTS(inCollectionElement)), 00771 mOffsetFileName(inCollectionElement.stringReadAttribute(mrml_const::cui_base_dir).second 00772 +inCollectionElement.stringReadAttribute(mrml_const::cui_offset_file_location).second), 00773 mInvertedFileName(inCollectionElement.stringReadAttribute(mrml_const::cui_base_dir).second 00774 +inCollectionElement.stringReadAttribute(mrml_const::cui_inverted_file_location).second), 00775 mFeatureDescriptionFileName(inCollectionElement.stringReadAttribute(mrml_const::cui_base_dir).second 00776 +inCollectionElement.stringReadAttribute(mrml_const::cui_feature_description_location).second), 00777 mTemporaryIndexingFileBase(inCollectionElement.stringReadAttribute(mrml_const::cui_base_dir).second), 00778 mInvertedFile(0), 00779 #ifdef V295 00780 mInvertedFileBuffer(0), 00781 #else 00782 mInvertedFileBuffer(""), 00783 #endif 00784 mMaximumFeatureID(0){ 00785 gMutex->lock(); 00786 00787 if(inCollectionElement.stringReadAttribute(mrml_const::cui_generate_inverted_file).first 00788 && inCollectionElement.boolReadAttribute(mrml_const::cui_generate_inverted_file).second){ 00789 newGenerateInvertedFile(); 00790 }else{ 00791 00792 00793 bool lSuccessfulStart(inCollectionElement.stringReadAttribute(mrml_const::cui_base_dir).first 00794 && inCollectionElement.stringReadAttribute(mrml_const::cui_feature_file_location).first 00795 && inCollectionElement.stringReadAttribute(mrml_const::cui_offset_file_location).first 00796 && inCollectionElement.stringReadAttribute(mrml_const::cui_inverted_file_location).first); 00797 00798 if(lSuccessfulStart 00799 && init(inCollectionElement.boolReadAttribute(mrml_const::cui_in_memory).first 00800 && inCollectionElement.boolReadAttribute(mrml_const::cui_in_memory).second)){ 00801 cout << "CInvertedFile succuessfully initialised. 
Parameters:" 00802 << mInvertedFileName << endl 00803 << mOffsetFileName << endl 00804 << mURL2FTS->getURLToFeatureFileName() << endl 00805 << mFeatureDescriptionFileName << endl 00806 << endl; 00807 00808 #ifndef _NO_PRINT_INIT 00809 { 00810 cout << "DIAGNOSE" << flush << endl; 00811 cout << "The current size of mURLToID is " << mURLToID.size() << endl; 00812 cout << "All elements:" 00813 << endl; 00814 int lCount=1; 00815 for(string_TID_map::const_iterator i=mURLToID.begin(); 00816 i!=mURLToID.end(); 00817 i++){ 00818 cout << "," 00819 << lCount++ 00820 << flush; 00821 } 00822 lCount=1; 00823 for(string_TID_map::const_iterator i=mURLToID.begin(); 00824 i!=mURLToID.end(); 00825 i++){ 00826 cout << lCount++ 00827 << ":" 00828 << flush 00829 << i->first 00830 << "->" 00831 << i->second 00832 << flush 00833 << endl 00834 << flush; 00835 } 00836 } 00837 #endif 00838 00839 00840 }; 00841 #ifndef _NO_PRINT_INIT 00842 checkNPrint(); 00843 cout << "Constructor left" 00844 << endl; 00845 #endif 00846 } 00847 gMutex->unlock(); 00848 }
| CAcIFFileSystem::~CAcIFFileSystem | ( | ) |
Destructor
Definition at line 1748 of file CAcIFFileSystem.cc.
References CMutex::lock(), and CMutex::unlock().
01748 { 01749 cout << "CAcIFFileSystem::~CAcIFFileSystem() called " 01750 << endl 01751 << flush; 01752 gMutex->lock(); 01753 gMutex->unlock(); 01754 };
| void CAcIFFileSystem::writeOffsetFileElement | ( | TID | inFeatureID, | |
| streampos | inPosition, | |||
| ostream & | inOpenOffsetFile | |||
| ) | [protected] |
Adds a (FeatureID, Offset) pair to the open offset file (helper function for inverted file construction).
used for reading the offset file
Definition at line 415 of file CAcIFFileSystem.cc.
References CMutex::lock(), SUIntStreampos::mStreampos, SUIntStreampos::mUInt1, and CMutex::unlock().
Referenced by generateInvertedFile(), and newGenerateInvertedFile().
00417 { 00418 gMutex->lock(); 00420 SUIntStreampos lWriter; 00421 00422 lWriter.mUInt1=inFeatureID; 00423 lWriter.mStreampos=inPosition; 00424 #ifndef _NO_PRINT_OFFSET_CHECK 00425 lWriter.mUInt2=int(inPosition); 00426 #endif 00427 00428 // inOpenOffsetFile.write((char*)&inFeatureID, 00429 // sizeof(TID)); 00430 // inOpenOffsetFile.write((char*)&inPosition, 00431 // sizeof(streampos)); 00432 inOpenOffsetFile.write((char*)&lWriter, 00433 sizeof(lWriter)); 00434 std::cout << "[inFeatureID:" << sizeof(TID) << "/" << inFeatureID 00435 << ";inPosition:" << sizeof(streampos) << "/" << inPosition << "==" << int(inPosition) 00436 << "]" << sizeof(lWriter) << endl; 00437 gMutex->unlock(); 00438 00439 };
| CDocumentFrequencyList * CAcIFFileSystem::getFeatureFile | ( | string | inFileName | ) | const [protected] |
Loads a *.fts file and returns the feature list.
Reimplemented from CAcInvertedFile.
Definition at line 1592 of file CAcIFFileSystem.cc.
References CMutex::lock(), CDocumentFrequencyList::readBinary(), and CMutex::unlock().
Referenced by DIDToFeatureList(), and URLToFeatureList().
01592 { 01593 gMutex->lock(); 01594 CDocumentFrequencyList* lRetVal(0); 01595 /* if the filename has a size bigger than one, meaning it is defined */ 01596 if(inFileName.size()) 01597 { 01598 /* a file with the given file name */ 01599 ifstream lFile(inFileName.c_str()); 01600 unsigned int lNumberOfFeatures(0); 01601 if(lFile){ 01602 lFile.read((char*)&lNumberOfFeatures, 01603 sizeof(lNumberOfFeatures)); 01604 01605 if(lFile && (lRetVal=new CDocumentFrequencyList(lNumberOfFeatures))){ 01606 01607 lRetVal->readBinary(lFile); 01608 01609 } 01610 } 01611 } /* end of if the URL was proper */ 01612 gMutex->unlock(); 01613 return lRetVal; 01614 }
| bool CAcIFFileSystem::operator() | ( | ) | const [virtual] |
for testing if the inverted file is correctly constructed
Implements CAcInvertedFile.
Definition at line 130 of file CAcIFFileSystem.cc.
References CMutex::lock(), mInvertedFile, mOffsetFile, mURL2FTS, and CMutex::unlock().
00130 { 00131 gMutex->lock(); 00132 bool lReturnValue(mURL2FTS 00133 && mInvertedFile 00134 && *mInvertedFile 00135 && mOffsetFile 00136 && mURL2FTS->operator bool()); 00137 gMutex->unlock(); 00138 return lReturnValue; 00139 };
called by constructors
Reimplemented from CAcInvertedFile.
Definition at line 862 of file CAcIFFileSystem.cc.
References CIFListStart::getCollectionFrequency(), CIFListStart::getFeatureID(), CMutex::lock(), mDocumentInformation, mFeatureDescription, mFeatureDescriptionFile, mFeatureDescriptionFileName, mFeatureToCollectionFrequency, mIDToOffset, mInvertedFile, mInvertedFileBuffer, mInvertedFileName, mMaximumFeatureID, mOffsetFile, mOffsetFileName, mURL2FTS, and CMutex::unlock().
Referenced by CAcIFFileSystem(), and checkConsistency().
00863 { 00864 gMutex->lock(); 00865 try{ 00866 mMaximumFeatureID=0; 00867 cout << "Opening _" 00868 << mInvertedFileName 00869 << "_"; 00870 if(inMemory){ 00871 ifstream lInvertedFile(mInvertedFileName.c_str()); 00872 if(lInvertedFile){ 00873 00874 cout << endl 00875 << "(TRYING TO READ THE WHOLE FILE INTO MEMORY" 00876 << endl; 00877 00878 lInvertedFile.seekg(0,ios::end); 00879 size_t lFileSize=lInvertedFile.tellg(); 00880 00881 #ifdef V295 00882 mInvertedFileBuffer=new char[lFileSize]; 00883 #else 00884 mInvertedFileBuffer="x"; 00885 mInvertedFileBuffer.resize(lFileSize); 00886 #endif 00887 lInvertedFile.seekg(0,ios::beg); 00888 #ifdef V295 00889 lInvertedFile.read(mInvertedFileBuffer, 00890 lFileSize); 00891 #else 00892 // this is a kludge but speed does not matter here 00893 for(int i=0;i<lFileSize && lInvertedFile;i++){ 00894 char lChar; 00895 lInvertedFile.get(lChar); 00896 mInvertedFileBuffer[i]=lChar; 00897 } 00898 #endif 00899 00900 mInvertedFile=new istringstream(mInvertedFileBuffer); 00901 // lFileSize does not need to be given 00902 cout << "DONE)" 00903 << endl; 00904 }else{ 00905 mInvertedFile=0; 00906 } 00907 } 00908 00909 mInvertedFile=new ifstream(mInvertedFileName.c_str()); 00910 if(!(*mInvertedFile)){ 00911 cout << " ...FAILED:" << strerror(errno) << endl; 00912 }else{ 00913 cout << " ...success. " << endl; 00914 } 00915 00916 cout << "Opening _" 00917 << mOffsetFileName 00918 << "_"; 00919 mOffsetFile.close(); 00920 mOffsetFile.clear(); 00921 mOffsetFile.open(mOffsetFileName.c_str()); 00922 if(!mOffsetFile){ 00923 cout << " FAILED!" << strerror(errno) << endl; 00924 }else{ 00925 cout << " ...success. " << endl; 00926 } 00927 00928 00929 cout << "Opening _" 00930 << mFeatureDescriptionFileName 00931 << "_"; 00932 mFeatureDescriptionFile.close(); 00933 mFeatureDescriptionFile.clear(); 00934 mFeatureDescriptionFile.open(mFeatureDescriptionFileName.c_str()); 00935 if(!mFeatureDescriptionFile){ 00936 cout << " ...FAILED!" 
<< strerror(errno) << endl; 00937 }else{ 00938 cout << " ...success. " << endl; 00939 } 00940 00941 00942 00943 bool lRetVal=(mURL2FTS->operator bool() 00944 //was the superclass well constructed? 00945 && 00946 mFeatureDescriptionFile 00947 && 00948 mOffsetFile 00949 && 00950 *mInvertedFile); 00951 00952 assert(mURL2FTS->size()); 00953 00954 cout << endl 00955 << "Current success status" 00956 << lRetVal 00957 << endl; 00958 00959 { 00960 list<CAccessorElement> lAllAccessorElements; 00961 00962 mURL2FTS->getAllAccessorElements(lAllAccessorElements); 00963 00964 //for each element in the database 00965 for(list<CAccessorElement>::const_iterator i=lAllAccessorElements.begin(); 00966 i!=lAllAccessorElements.end(); 00967 i++){ 00968 TID lID =i->getID(); 00969 string lURL=i->getURL(); 00970 00971 pair<bool,string> lFeatureFileName=mURL2FTS->URLToFFN(lURL); 00972 00973 assert(lFeatureFileName.first); 00974 00975 mDocumentInformation 00976 .insert(make_pair(lID, 00977 CAdditionalDocumentInformation(lFeatureFileName.second))); 00978 00979 lRetVal = 00980 lRetVal && mDocumentInformation[lID].input(); 00981 00982 } 00983 } 00984 00985 cout << "URLFile " 00986 << mURL2FTS->getURLToFeatureFileName() 00987 << " processed. Current success status" 00988 << lRetVal 00989 << endl; 00990 00991 /* erase the offset file */ 00992 mIDToOffset.erase(mIDToOffset.begin(), 00993 mIDToOffset.end()); 00994 00995 mOffsetFile.seekg(0); 00996 00997 00998 /* while there is no end of file for the offsets */ 00999 while(mOffsetFile){ 01000 SUIntStreampos lVal; 01001 01002 /* read a value into the offset file */ 01003 mOffsetFile.read((char*)&lVal,sizeof(lVal)); 01004 01005 01006 if(mOffsetFile){ 01007 01008 01009 // actually useful code here! 
;-I 01010 unsigned int lFeatureID=lVal.mUInt1; 01011 streampos lOffset(lVal.mStreampos);//FIXME streampos 01012 mMaximumFeatureID=(mMaximumFeatureID < lFeatureID)?lFeatureID:mMaximumFeatureID; 01013 mIDToOffset[lFeatureID]=lOffset; 01014 01015 #ifndef _NO_PRINT_OFFSET_CHECK 01016 // hexdump of the characters read 01017 for(int i=0;i<sizeof(lVal);i++){ 01018 cout << hex << (unsigned int)(((unsigned char*)&lVal)[i]) << "/" << dec << flush; 01019 } 01020 01021 01022 // value different from check value? 01023 if((unsigned int)(lVal.mStreampos) != lVal.mUInt2){ 01024 cout << "Size:" << sizeof(lVal) << ":" << lVal.mUInt1 << ":" << lVal.mStreampos << "!=" << lVal.mUInt2 << "!!!" << long(mOffsetFile.tellg()) << endl; 01025 } 01026 assert((unsigned int)(lVal.mStreampos) == lVal.mUInt2); 01027 01028 // the actual values 01029 cout << "[read:" << sizeof(lVal) << ":inFeatureID:" << flush 01030 << hex 01031 << lFeatureID << "==" << dec << lFeatureID 01032 << ";inPosition:" 01033 << dec 01034 << lOffset 01035 << "/" 01036 << lVal.mStreampos 01037 << "]" << long(mOffsetFile.tellg()) << endl; 01038 #endif 01039 //Reading the offsetfile 01040 { //move to the right position 01041 mInvertedFile->seekg(lOffset); 01042 01043 //read the list start chunk 01044 //(by constructing an instance of the list start cunk) 01045 CIFListStart lListStart(*mInvertedFile); 01046 01047 //checking it 01048 if(lListStart.getFeatureID()!=lFeatureID){ 01049 cout << "[ERROR" << flush 01050 << hex 01051 << lFeatureID 01052 << "," 01053 << hex 01054 << lListStart.getFeatureID() 01055 << dec 01056 << "]" << flush; 01057 assert(0); 01058 } 01059 #ifndef _NO_PRINT_OFFSET_CHECK 01060 else{ 01061 cout << "-" << flush; 01062 } 01063 #endif 01064 01065 //And setting up the translation table from feature 01066 //ID to collection frequency 01067 mFeatureToCollectionFrequency[lListStart.getFeatureID()]= 01068 lListStart.getCollectionFrequency(); 01069 01070 /* 
assert(mFeatureToCollectionFrequency[lListStart.getFeatureID()]);*/ 01071 01072 } 01073 #ifndef _NO_CHECK_OFFSET_FILE 01074 #endif 01075 } // if the file is valid --> ID is valid! 01076 } 01077 01078 cout << "OffsetFile " 01079 << mOffsetFileName 01080 << " processed. Current success status: " 01081 << lRetVal 01082 << endl; 01083 01084 { 01085 /* as long as there is no end of file */ 01086 while(mFeatureDescriptionFile){ 01087 TID lFeatureID; 01088 unsigned int lType; 01089 01090 01091 mFeatureDescriptionFile >> lFeatureID 01092 >> lType; 01093 01094 mFeatureDescription[lFeatureID]=lType; 01095 } /* end of while */ 01096 01097 } 01098 01099 cout << "FeatureDescriptionFile " 01100 << mFeatureDescriptionFileName 01101 << " processed: " 01102 << mFeatureDescription.size() 01103 << " elements in hash." 01104 << endl 01105 << "Initialisation successful? Returning " 01106 << lRetVal 01107 << endl; 01108 01109 #ifndef _NO_PRINT_INIT 01110 checkNPrint(); 01111 #endif 01112 gMutex->unlock(); 01113 return lRetVal; 01114 }catch(...){ 01115 cerr << "caught here " << endl; 01116 } 01117 01118 };
| string CAcIFFileSystem::IDToURL | ( | TID | inID | ) | const [virtual] |
Translate a DocumentID to a URL (for output)
Implements CAcInvertedFile.
Definition at line 1682 of file CAcIFFileSystem.cc.
References mrml_const::error, CMutex::lock(), mURL2FTS, and CMutex::unlock().
01683 { 01684 gMutex->lock(); 01685 01686 pair<bool,CAccessorElement> lElement=mURL2FTS->IDToAccessorElement(inID); 01687 01688 if(lElement.first){ 01689 gMutex->unlock(); 01690 return lElement.second.getURL(); 01691 } else { 01692 cerr << "Error in Conversion from ID " 01693 << inID 01694 << " to URL." 01695 << endl; 01696 gMutex->unlock(); 01697 return mrml_const::error; 01698 } 01699 }
| CDocumentFrequencyList * CAcIFFileSystem::FeatureToList | ( | TFeatureID | inFeatureID | ) | const [virtual] |
List of documents containing the feature
Implements CAcInvertedFile.
Definition at line 1252 of file CAcIFFileSystem.cc.
References CIFListStart::getFeatureID(), CIFListStart::getNumberOfElements(), CMutex::lock(), mIDToOffset, mInvertedFile, CDocumentFrequencyList::readBinary(), and CMutex::unlock().
01253 { 01254 gMutex->lock(); 01255 CDocumentFrequencyList* lRetVal=0; 01256 01257 01258 { 01259 mInvertedFile->clear(); 01260 01261 01262 //Find the list of URL-IDs for the feature 01263 if(mIDToOffset.find(inFeatureID)!=mIDToOffset.end()){ 01264 mInvertedFile->seekg((*mIDToOffset.find(inFeatureID)).second); 01265 assert(*mInvertedFile); 01266 /* if the inverted file has been able to be opened */ 01267 if(*mInvertedFile){ 01268 01269 //read the beginning chunk of the list; 01270 CIFListStart lListStart(*mInvertedFile); 01271 01272 // 01273 lRetVal=new CDocumentFrequencyList(lListStart.getNumberOfElements()); 01274 01275 if(lListStart.getFeatureID()!=inFeatureID){ 01276 cerr << "Feature " 01277 << hex 01278 << inFeatureID 01279 << " not found."; 01280 gMutex->unlock(); 01281 return 0; 01282 } 01283 01284 #ifndef _NO_DIDPRINT 01285 cout << endl; 01286 #endif 01287 01288 01289 lRetVal->readBinary(*mInvertedFile); 01290 01291 01292 #ifndef _NO_DIDPRINT 01293 cout << endl; 01294 #endif 01295 } 01296 }else{ 01297 cerr << "II:Feature " 01298 << hex 01299 << inFeatureID 01300 << " not found." 01301 << mIDToOffset.size() 01302 << endl; 01303 gMutex->unlock(); 01304 return 0; 01305 } 01306 } 01307 gMutex->unlock(); 01308 return lRetVal; 01309 };
| CDocumentFrequencyList * CAcIFFileSystem::URLToFeatureList | ( | string | inURL | ) | const [virtual] |
List of features contained by a document
Implements CAcInvertedFile.
Definition at line 1511 of file CAcIFFileSystem.cc.
References DIDToFeatureList(), getFeatureFile(), CMutex::lock(), mURL2FTS, my_assert, CMutex::unlock(), and URLToID().
Referenced by checkConsistency().
01512 { 01513 gMutex->lock(); 01514 01515 #ifdef PRINT_ADI 01516 cout <<inURL 01517 << "(ADI:" ; 01518 #endif 01519 01520 pair<bool,TID> lID=URLToID(inURL); 01521 01522 if(!lID.first){//i.e. the URL is not part of the collection 01523 01524 CDocumentFrequencyList* lReturnValue(0); 01525 #ifdef IGNORE_UNKNOWN_URLS 01526 #else 01527 pid_t lPID= getpid(); 01528 01529 char lFeatureFileName[30]; 01530 char lThumbnailName[30]; 01531 01532 int lTime = int(time(0)); 01533 int lRandomId = random();// in order to make sure that no two images have the same ID 01534 01535 sprintf(lFeatureFileName,"/tmp/testFTS-%d-%d-%d.fts",int(lPID),lTime,lRandomId); 01536 sprintf(lThumbnailName,"/tmp/testThumbnail-%d-%d-%d.jpg",int(lPID),lTime,lRandomId); 01537 01538 //FIXME this is a potential security leak. 01539 system(string(string(__PERL_LOCATION__)+ " " + string(__EXECBINDIR__)+ "/gift-url-to-fts.pl "+ inURL +" "+ lFeatureFileName + " " + lThumbnailName).c_str()); 01540 01541 // lThumbnailName exists, but is probably less accessible than the initial URL 01542 mURL2FTS->addImage(inURL,inURL,lFeatureFileName); 01543 01544 lReturnValue=this->getFeatureFile(lFeatureFileName); 01545 01546 #endif 01547 if(!lReturnValue){ 01548 lReturnValue=new CDocumentFrequencyList(); 01549 } 01550 gMutex->unlock(); 01551 return lReturnValue; 01552 // { 01553 // cout << endl << "this= " << this << endl; 01554 // checkNPrint(); 01555 // cout << "could not find ID for URL " 01556 // << inURL << endl; 01557 // cout << "The current size of mURLToID is " << mURLToID.size() << endl; 01558 // cout << "All elements:" 01559 // << endl; 01560 // int lCount=1; 01561 // for(string_TID_map::const_iterator i=mURLToID.begin(); 01562 // i!=mURLToID.end(); 01563 // i++){ 01564 // cout << "," 01565 // << lCount++ 01566 // << flush; 01567 // } 01568 // lCount=1; 01569 // for(string_TID_map::const_iterator i=mURLToID.begin(); 01570 // i!=mURLToID.end(); 01571 // i++){ 01572 // cout << lCount++ 01573 // << ":" 01574 // << flush 
01575 // << i->first 01576 // << "->" 01577 // << i->second 01578 // << flush 01579 // << endl 01580 // << flush; 01581 // } 01582 // } 01583 } 01584 my_assert(lID.first,inURL.c_str()); 01585 01586 gMutex->unlock(); 01587 return DIDToFeatureList(lID.second); 01588 };
| CDocumentFrequencyList * CAcIFFileSystem::DIDToFeatureList | ( | TID | inDID | ) | const [virtual] |
List of features contained by a document with ID inDID
Implements CAcInvertedFile.
Definition at line 1628 of file CAcIFFileSystem.cc.
References getFeatureFile(), CMutex::lock(), mDocumentInformation, mURL2FTS, CADIHash::output(), and CMutex::unlock().
Referenced by URLToFeatureList().
01628 { 01629 gMutex->lock(); 01630 01631 #ifdef PRINT_ADI 01632 cout <<inURL 01633 << "(ADI:" ; 01634 01635 //FIXME this is temporal. The following two lines should be put back in 01636 CADIHash::const_iterator iADI=mDocumentInformation.find(inDID); 01637 assert(iADI!=mDocumentInformation.end()); 01638 //FIXME I am not quite clear if, when and where the ADI is generated. 01639 //FIXME fix on demand. 01640 #endif 01641 01642 #ifdef PRINT_ADI 01643 iADI->second.output(cout); 01644 01645 cout << ":ADI)" 01646 << endl; 01647 #endif 01648 01649 01650 CDocumentFrequencyList* lRetVal=0; 01651 01652 /* tests if URL is found */ 01653 pair<bool,string> lFeatureFileName=mURL2FTS->IDToFFN(inDID); 01654 if(lFeatureFileName.first){ 01655 01656 lRetVal=this->getFeatureFile(lFeatureFileName.second); 01657 01658 01659 } /* end of the if statement */ 01660 01661 if(!lRetVal){ 01662 lRetVal=new CDocumentFrequencyList(0); 01663 } 01664 gMutex->unlock(); 01665 return lRetVal; 01666 };
| double CAcIFFileSystem::FeatureToCollectionFrequency | ( | TFeatureID | inFeatureID | ) | const [virtual] |
Collection frequency for a given feature
Implements CAcInvertedFile.
Definition at line 1134 of file CAcIFFileSystem.cc.
References CMutex::lock(), mFeatureToCollectionFrequency, and CMutex::unlock().
01134 { 01135 gMutex->lock(); 01136 01137 if((mFeatureToCollectionFrequency.find(inFeatureID))!= 01138 mFeatureToCollectionFrequency.end()) 01139 { 01140 assert(0<(*(mFeatureToCollectionFrequency.find(inFeatureID))).second); 01141 01142 double lReturnValue((*(mFeatureToCollectionFrequency.find(inFeatureID))).second); 01143 gMutex->unlock(); 01144 return lReturnValue; 01145 } 01146 else 01147 { 01148 gMutex->unlock(); 01149 return 1; 01150 } 01151 }
| unsigned int CAcIFFileSystem::getFeatureDescription | ( | TID | inFeatureID | ) | const [virtual] |
What kind of feature is the feature with ID inFeatureID?
debugging code
/debugging code
Implements CAcInvertedFile.
Definition at line 1713 of file CAcIFFileSystem.cc.
References CMutex::lock(), mFeatureDescription, and CMutex::unlock().
01713 { 01714 gMutex->lock(); 01715 01716 if(mFeatureDescription.find(inID)!=mFeatureDescription.end()){ 01717 unsigned int lReturnValue((*mFeatureDescription.find(inID)).second); 01718 gMutex->unlock(); 01719 return lReturnValue; 01720 }else{ 01721 cout << "[UnknownFeature: " 01722 << inID 01723 << "]" 01724 << flush; 01725 01727 cout << mFeatureDescription.size() << flush; 01730 gMutex->unlock(); 01731 return 0; 01732 } 01733 }
| double CAcIFFileSystem::DIDToMaxDocumentFrequency | ( | TID | inID | ) | const [virtual] |
returns the maximum document frequency for one document ID
Implements CAcInvertedFile.
Definition at line 1166 of file CAcIFFileSystem.cc.
References CMutex::lock(), mDocumentInformation, and CMutex::unlock().
01166 { 01167 gMutex->lock(); 01168 01169 if(mDocumentInformation.find(inID)!=mDocumentInformation.end()) 01170 { 01171 double lReturnValue((*mDocumentInformation.find(inID)).second.getMaximumDF()); 01172 gMutex->unlock(); 01173 return lReturnValue; 01174 } 01175 else 01176 { 01177 assert(1==0); 01178 gMutex->unlock(); 01179 return 1; 01180 } 01181 };
| double CAcIFFileSystem::DIDToDFSquareSum | ( | TID | inID | ) | const [virtual] |
Returns the document-frequency square sum for a given document ID.
Implements CAcInvertedFile.
Definition at line 1197 of file CAcIFFileSystem.cc.
References CMutex::lock(), mDocumentInformation, and CMutex::unlock().
01197 { 01198 gMutex->lock(); 01199 if(mDocumentInformation.find(inID)!=mDocumentInformation.end()){ 01200 double lReturnValue(mDocumentInformation.find(inID)->second.getDFSquareSum()); 01201 01202 gMutex->unlock(); 01203 return lReturnValue; 01204 }else{ 01205 assert(1+1==0); 01206 gMutex->unlock(); 01207 return 1; 01208 } 01209 }
| double CAcIFFileSystem::DIDToSquareDFLogICFSum | ( | TID | inID | ) | const [virtual] |
Returns the value of getSquareDFLogICFSum() from the additional document information stored for the given document ID.
Implements CAcInvertedFile.
Definition at line 1224 of file CAcIFFileSystem.cc.
References CMutex::lock(), mDocumentInformation, and CMutex::unlock().
01224 { 01225 if(mDocumentInformation.find(inID)!=mDocumentInformation.end()){ 01226 gMutex->lock(); 01227 double lReturnValue((*mDocumentInformation.find(inID)) 01228 .second.getSquareDFLogICFSum()); 01229 gMutex->unlock(); 01230 return lReturnValue; 01231 }else{ 01232 assert(1+1+1==0); 01233 gMutex->unlock(); 01234 return 1; 01235 } 01236 };
| bool CAcIFFileSystem::generateInvertedFile | ( | ) | [virtual] |
Generating an inverted File, if there is none. Fast but stupid in-memory method. This method is very fast, if all the inverted file (and a bit more) can be kept in memory at runtime. If this is not the case, extensive swapping is the result, virtually halting the inverted file creation.
Meaning additional document information
Implements CAcInvertedFile.
Definition at line 153 of file CAcIFFileSystem.cc.
References checkConsistency(), CDocumentFrequencyElement::getDocumentFrequency(), CDocumentFrequencyElement::getID(), CMutex::lock(), mFeatureDescriptionFile, mFeatureDescriptionFileName, mInvertedFileName, mOffsetFileName, mURL2FTS, CADIHash::output(), CMutex::unlock(), and writeOffsetFileElement().
00153 { 00154 gMutex->lock(); 00155 // open the feature description file 00156 // 00157 cout << "I want to use/generate the files:" 00158 << mInvertedFileName << endl 00159 << mOffsetFileName << endl 00160 << mURL2FTS->getURLToFeatureFileName() << endl 00161 << mFeatureDescriptionFileName << endl; 00162 00163 ofstream lNewInvertedFile(mInvertedFileName.c_str()); 00164 ofstream lNewOffsetFile(mOffsetFileName.c_str()); 00165 00166 /* if one of the files does not open correctly */ 00167 if(!(mURL2FTS->operator bool() 00168 && 00169 mFeatureDescriptionFile 00170 && 00171 lNewInvertedFile 00172 && 00173 lNewOffsetFile)){ 00174 cerr << "I could not open the necessary files for" 00175 << "generating an inverted file"; 00176 gMutex->unlock(); 00177 return false; 00178 } 00179 cout << "files successfully opened" 00180 << flush 00181 << endl; 00182 00183 00184 bool lError=false; 00185 //Local: A hash of Inverted File Chunks by the feature ID 00186 map<TID,CInvertedFileChunk> lInvertedFileHash; 00187 00188 00190 CADIHash lADI; 00191 00192 list<CAccessorElement> lAllAccessorElements; 00193 00194 mURL2FTS->getAllAccessorElements(lAllAccessorElements); 00195 00196 /* process this for all the images in the file URL to Feature file name */ 00197 for(list<CAccessorElement>::const_iterator i=lAllAccessorElements.begin(); 00198 i!=lAllAccessorElements.end(); 00199 i++){ 00200 00201 //This variable is used in the for loop to translate positions in the map into IDs 00202 int lDocumentID=i->getID(); 00203 00204 00205 cout << endl 00206 << "Processing File: " 00207 << lDocumentID 00208 << flush; 00209 cout << " " 00210 << *i 00211 << "..." 
00212 << flush 00213 << endl; 00214 cout << "h" 00215 << flush 00216 << endl; 00217 00218 /* use the second part of the map as the 00219 file name for the next feature file */ 00220 ifstream lFeatureFile(i->getFeatureFileName().c_str()); 00221 lADI[lDocumentID]=CAdditionalDocumentInformation(i->getFeatureFileName().c_str()); 00222 00223 unsigned int lNumberOfFeatures=0; 00224 /* reads the number of features out of the file */ 00225 lFeatureFile.read((char*)&lNumberOfFeatures, 00226 sizeof(lNumberOfFeatures)); 00227 00228 double lMaxDocumentFrequency=0; 00229 double lDocumentFrequencySquareSum=0; 00230 00231 //Read features for one image 00232 if(lFeatureFile) 00233 { 00234 /* for each feature in the file */ 00235 for(unsigned int j=0; 00236 j<lNumberOfFeatures && lFeatureFile; 00237 j++) 00238 { 00239 /* does this already read in the element from the file ? yes*/ 00240 struct{ 00241 TID first; 00242 float second; 00243 } lInElement; 00244 00245 lFeatureFile.read((char*)&lInElement, 00246 sizeof(lInElement)); 00247 00248 00249 CDocumentFrequencyElement lElement(lInElement.first, 00250 lInElement.second); 00251 00252 // cout << "RSIZE" << sizeof(lInElement) 00253 // << " " << lElement.getID() 00254 // << " " << lInElement.first 00255 // << " " << lElement.getDocumentFrequency() 00256 // << "== 0x" << hex << lInElement.second 00257 // << endl; 00258 00259 //if this assertion fails, there has been a misunderstanding 00260 //about the data format of the feature file 00261 //(padding: where and how?) WM 00262 assert(lElement.getDocumentFrequency()>1/1000000000); 00263 assert(lElement.getDocumentFrequency()<=1); 00264 00265 //Adjust values which depend on the document Frequency 00266 //in the Additional Document Information 00267 lADI[lDocumentID]. 00268 adjustDF(lElement.getDocumentFrequency()); 00269 00270 /* this adds a feature into the inverted file list of features */ 00271 lInvertedFileHash[lElement.getID()]. 
00272 addElement(lDocumentID, 00273 lElement.getDocumentFrequency()); 00274 } 00275 } 00276 00277 cout << "...finished" 00278 << flush 00279 << endl; 00280 00281 00282 //one or more errors... 00283 if(!lFeatureFile){ 00284 lError=true; 00285 cout << "Error reading file " 00286 << i->getFeatureFileName() 00287 << "!" 00288 << endl; 00289 } 00290 } 00291 00292 // For each Inverted file chunk: this means for each feature 00293 // including the documents containing this feature 00294 // Write it and its offset 00295 for(map<TID,CInvertedFileChunk>::const_iterator i=lInvertedFileHash.begin(); 00296 i!=lInvertedFileHash.end() && lNewInvertedFile; 00297 i++) 00298 { 00299 00300 assert((*i).second.size()); 00301 00302 //find out the ID of the feature 00303 TID lFeatureID=(*i).first; 00304 //writing the offset 00305 { 00306 // this is the position of the next feature in the inverted file 00307 streampos lPos(lNewInvertedFile.tellp()); 00308 00309 /* updating the offset file */ 00310 00311 writeOffsetFileElement(lFeatureID, 00312 lPos, 00313 lNewOffsetFile); 00314 00315 00316 cout << endl 00317 << "Writing Chunk for Feature ID " 00318 << hex 00319 << lFeatureID 00320 << ". The Offset is 0x" 00321 << hex 00322 << lPos 00323 << dec 00324 << "=" 00325 << lPos 00326 << endl; 00327 } 00328 // Writing the next piece of the inverted file 00329 { 00330 /* this writes the actual data for the feature, 00331 meaning the list of all the documents containg 00332 this feature */ 00333 bool lSuccessfullyWritten(((*i).second).writeBinary(lNewInvertedFile, 00334 lFeatureID, 00335 mURL2FTS->size())); 00336 if(!lSuccessfullyWritten) <