+ A few improvements in the indexer code

This commit is contained in:
kelson42 2009-12-20 14:10:58 +00:00
parent bc65a459d9
commit 86597c856d
2 changed files with 50 additions and 37 deletions

View File

@ -3,7 +3,7 @@
namespace kiwix { namespace kiwix {
/* Count word */ /* Count word */
unsigned int countWords(const string &text) { unsigned int Indexer::countWords(const string &text) {
unsigned int numWords = 1; unsigned int numWords = 1;
for(int i=0; i<text.size();) { for(int i=0; i<text.size();) {
while(i<text.size() && text[i] != ' ') { while(i<text.size() && text[i] != ' ') {
@ -25,33 +25,26 @@ namespace kiwix {
/* Open the ZIM file */ /* Open the ZIM file */
this->zimFileHandler = new zim::File(zimFilePath); this->zimFileHandler = new zim::File(zimFilePath);
if (this->zimFileHandler != NULL) {
this->firstArticleOffset = this->zimFileHandler->getNamespaceBeginOffset('A');
this->lastArticleOffset = this->zimFileHandler->getNamespaceEndOffset('A');
this->currentArticleOffset = this->firstArticleOffset;
} else {
throw("Unable to open " + zimFilePath);
}
/* Open the Xapian directory */ /* Open the Xapian directory */
this->writableDatabase = Xapian::WritableDatabase(xapianDirectoryPath, this->writableDatabase = new Xapian::WritableDatabase(xapianDirectoryPath,
Xapian::DB_CREATE_OR_OVERWRITE); Xapian::DB_CREATE_OR_OVERWRITE);
/* Prepare the indexation */
this->prepareIndexing();
} }
/* Destructor */ /* Destructor */
Indexer::~Indexer() { Indexer::~Indexer() {
this->stopIndexing();
/* delete the zimFileHandler */
if (this->zimFileHandler != NULL) {
delete this->zimFileHandler;
}
/* delete the Xapian writableDatabase */
this->writableDatabase.~WritableDatabase();
} }
/* Start indexing */ /* Start indexing */
void Indexer::startIndexing() { void Indexer::prepareIndexing() {
/* Define a few values */
this->firstArticleOffset = this->zimFileHandler->getNamespaceBeginOffset('A');
this->lastArticleOffset = this->zimFileHandler->getNamespaceEndOffset('A');
this->currentArticleOffset = this->firstArticleOffset;
/* Compute few things */ /* Compute few things */
this->articleCount = this->zimFileHandler->getNamespaceCount('A'); this->articleCount = this->zimFileHandler->getNamespaceCount('A');
@ -63,10 +56,15 @@ namespace kiwix {
float thresholdOffset = this->currentArticleOffset + this->stepSize; float thresholdOffset = this->currentArticleOffset + this->stepSize;
size_t found; size_t found;
/* Check if we can start */
if (this->zimFileHandler == NULL || this->writableDatabase == NULL) {
return false;
}
while(this->currentArticleOffset < thresholdOffset && while(this->currentArticleOffset < thresholdOffset &&
this->currentArticleOffset < this->lastArticleOffset) { this->currentArticleOffset < this->lastArticleOffset) {
/* get next non redirect article */ /* Get next non redirect article */
do { do {
currentArticle = this->zimFileHandler->getArticle(this->currentArticleOffset); currentArticle = this->zimFileHandler->getArticle(this->currentArticleOffset);
} while (this->currentArticleOffset++ && } while (this->currentArticleOffset++ &&
@ -74,17 +72,20 @@ namespace kiwix {
this->currentArticleOffset != this->lastArticleOffset); this->currentArticleOffset != this->lastArticleOffset);
if (!currentArticle.isRedirect()) { if (!currentArticle.isRedirect()) {
/* Index the content */ /* Index the content */
this->htmlParser.reset(); this->htmlParser.reset();
string content (currentArticle.getData().data(), currentArticle.getData().size()); string content (currentArticle.getData().data(), currentArticle.getData().size());
/* The parser generates a lot of exceptions which should be avoided */
try { try {
this->htmlParser.parse_html(content, "UTF-8", true); this->htmlParser.parse_html(content, "UTF-8", true);
} catch (...) { } catch (...) {
} }
/* if content does not have the noindex meta tag */ /* If content does not have the noindex meta tag */
found=this->htmlParser.dump.find("NOINDEX"); /* Seems that the parser generates an exception in such case */
found = this->htmlParser.dump.find("NOINDEX");
if (found == string::npos) { if (found == string::npos) {
@ -119,13 +120,13 @@ namespace kiwix {
} }
/* add to the database */ /* add to the database */
this->writableDatabase.add_document(document); this->writableDatabase->add_document(document);
} }
} }
} }
/* Write Xapian DB to the disk */ /* Write Xapian DB to the disk */
this->writableDatabase.flush(); this->writableDatabase->flush();
/* increment the offset and set returned value */ /* increment the offset and set returned value */
if (this->currentArticleOffset < this->lastArticleOffset) { if (this->currentArticleOffset < this->lastArticleOffset) {
@ -139,8 +140,17 @@ namespace kiwix {
/* Stop indexing. TODO: using it crashes the software under Windows. Have to do it in indexNextPercent() */ /* Stop indexing. TODO: using it crashes the software under Windows. Have to do it in indexNextPercent() */
void Indexer::stopIndexing() { void Indexer::stopIndexing() {
this->currentArticleOffset = this->firstArticleOffset; /* Delete the zimFileHandler */
this->writableDatabase.~WritableDatabase(); if (this->zimFileHandler != NULL) {
delete this->zimFileHandler;
this->zimFileHandler = NULL;
}
/* Delete the Xapian writableDatabase */
if (this->writableDatabase != NULL) {
delete this->writableDatabase;
this->writableDatabase = NULL;
}
} }
} }

View File

@ -18,23 +18,26 @@ namespace kiwix {
Indexer(const string &zimFilePath, const string &xapianDirectoryPath); Indexer(const string &zimFilePath, const string &xapianDirectoryPath);
~Indexer(); ~Indexer();
void startIndexing();
bool indexNextPercent(); bool indexNextPercent();
void stopIndexing();
protected: protected:
void prepareIndexing();
void stopIndexing();
unsigned int countWords(const string &text);
unsigned int articleCount;
float stepSize;
zim::File* zimFileHandler; zim::File* zimFileHandler;
zim::size_type firstArticleOffset; zim::size_type firstArticleOffset;
zim::size_type lastArticleOffset; zim::size_type lastArticleOffset;
zim::size_type currentArticleOffset; zim::size_type currentArticleOffset;
zim::Article currentArticle; zim::Article currentArticle;
unsigned int articleCount; Xapian::WritableDatabase *writableDatabase;
float stepSize;
Xapian::WritableDatabase writableDatabase;
Xapian::Stem stemmer; Xapian::Stem stemmer;
Xapian::TermGenerator indexer; Xapian::TermGenerator indexer;
MyHtmlParser htmlParser; MyHtmlParser htmlParser;
}; };