mirror of https://github.com/kiwix/libkiwix.git
+ a few improvements in the indexer code
This commit is contained in:
parent
bc65a459d9
commit
86597c856d
|
@ -3,7 +3,7 @@
|
||||||
namespace kiwix {
|
namespace kiwix {
|
||||||
|
|
||||||
/* Count words */
|
/* Count words */
|
||||||
unsigned int countWords(const string &text) {
|
unsigned int Indexer::countWords(const string &text) {
|
||||||
unsigned int numWords = 1;
|
unsigned int numWords = 1;
|
||||||
for(int i=0; i<text.size();) {
|
for(int i=0; i<text.size();) {
|
||||||
while(i<text.size() && text[i] != ' ') {
|
while(i<text.size() && text[i] != ' ') {
|
||||||
|
@ -25,33 +25,26 @@ namespace kiwix {
|
||||||
/* Open the ZIM file */
|
/* Open the ZIM file */
|
||||||
this->zimFileHandler = new zim::File(zimFilePath);
|
this->zimFileHandler = new zim::File(zimFilePath);
|
||||||
|
|
||||||
if (this->zimFileHandler != NULL) {
|
|
||||||
this->firstArticleOffset = this->zimFileHandler->getNamespaceBeginOffset('A');
|
|
||||||
this->lastArticleOffset = this->zimFileHandler->getNamespaceEndOffset('A');
|
|
||||||
this->currentArticleOffset = this->firstArticleOffset;
|
|
||||||
} else {
|
|
||||||
throw("Unable to open " + zimFilePath);
|
|
||||||
}
|
|
||||||
|
|
||||||
/* Open the Xapian directory */
|
/* Open the Xapian directory */
|
||||||
this->writableDatabase = Xapian::WritableDatabase(xapianDirectoryPath,
|
this->writableDatabase = new Xapian::WritableDatabase(xapianDirectoryPath,
|
||||||
Xapian::DB_CREATE_OR_OVERWRITE);
|
Xapian::DB_CREATE_OR_OVERWRITE);
|
||||||
|
|
||||||
|
/* Prepare the indexation */
|
||||||
|
this->prepareIndexing();
|
||||||
}
|
}
|
||||||
|
|
||||||
/* Destructor */
|
/* Destructor */
|
||||||
Indexer::~Indexer() {
|
Indexer::~Indexer() {
|
||||||
|
this->stopIndexing();
|
||||||
/* delete the zimFileHandler */
|
|
||||||
if (this->zimFileHandler != NULL) {
|
|
||||||
delete this->zimFileHandler;
|
|
||||||
}
|
|
||||||
|
|
||||||
/* delete the Xapian writableDatabase */
|
|
||||||
this->writableDatabase.~WritableDatabase();
|
|
||||||
}
|
}
|
||||||
|
|
||||||
/* Start indexing */
|
/* Start indexing */
|
||||||
void Indexer::startIndexing() {
|
void Indexer::prepareIndexing() {
|
||||||
|
|
||||||
|
/* Define a few values */
|
||||||
|
this->firstArticleOffset = this->zimFileHandler->getNamespaceBeginOffset('A');
|
||||||
|
this->lastArticleOffset = this->zimFileHandler->getNamespaceEndOffset('A');
|
||||||
|
this->currentArticleOffset = this->firstArticleOffset;
|
||||||
|
|
||||||
/* Compute a few things */
|
/* Compute a few things */
|
||||||
this->articleCount = this->zimFileHandler->getNamespaceCount('A');
|
this->articleCount = this->zimFileHandler->getNamespaceCount('A');
|
||||||
|
@ -63,10 +56,15 @@ namespace kiwix {
|
||||||
float thresholdOffset = this->currentArticleOffset + this->stepSize;
|
float thresholdOffset = this->currentArticleOffset + this->stepSize;
|
||||||
size_t found;
|
size_t found;
|
||||||
|
|
||||||
|
/* Check if we can start */
|
||||||
|
if (this->zimFileHandler == NULL || this->writableDatabase == NULL) {
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
|
||||||
while(this->currentArticleOffset < thresholdOffset &&
|
while(this->currentArticleOffset < thresholdOffset &&
|
||||||
this->currentArticleOffset < this->lastArticleOffset) {
|
this->currentArticleOffset < this->lastArticleOffset) {
|
||||||
|
|
||||||
/* get next non redirect article */
|
/* Get next non redirect article */
|
||||||
do {
|
do {
|
||||||
currentArticle = this->zimFileHandler->getArticle(this->currentArticleOffset);
|
currentArticle = this->zimFileHandler->getArticle(this->currentArticleOffset);
|
||||||
} while (this->currentArticleOffset++ &&
|
} while (this->currentArticleOffset++ &&
|
||||||
|
@ -74,17 +72,20 @@ namespace kiwix {
|
||||||
this->currentArticleOffset != this->lastArticleOffset);
|
this->currentArticleOffset != this->lastArticleOffset);
|
||||||
|
|
||||||
if (!currentArticle.isRedirect()) {
|
if (!currentArticle.isRedirect()) {
|
||||||
|
|
||||||
/* Index the content */
|
/* Index the content */
|
||||||
this->htmlParser.reset();
|
this->htmlParser.reset();
|
||||||
string content (currentArticle.getData().data(), currentArticle.getData().size());
|
string content (currentArticle.getData().data(), currentArticle.getData().size());
|
||||||
|
|
||||||
|
/* The parser generates a lot of exceptions which should be avoided */
|
||||||
try {
|
try {
|
||||||
this->htmlParser.parse_html(content, "UTF-8", true);
|
this->htmlParser.parse_html(content, "UTF-8", true);
|
||||||
} catch (...) {
|
} catch (...) {
|
||||||
}
|
}
|
||||||
|
|
||||||
/* if content does not have the noindex meta tag */
|
/* If content does not have the noindex meta tag */
|
||||||
found=this->htmlParser.dump.find("NOINDEX");
|
/* It seems that the parser generates an exception in such a case */
|
||||||
|
found = this->htmlParser.dump.find("NOINDEX");
|
||||||
|
|
||||||
if (found == string::npos) {
|
if (found == string::npos) {
|
||||||
|
|
||||||
|
@ -119,13 +120,13 @@ namespace kiwix {
|
||||||
}
|
}
|
||||||
|
|
||||||
/* add to the database */
|
/* add to the database */
|
||||||
this->writableDatabase.add_document(document);
|
this->writableDatabase->add_document(document);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
/* Write Xapian DB to the disk */
|
/* Write Xapian DB to the disk */
|
||||||
this->writableDatabase.flush();
|
this->writableDatabase->flush();
|
||||||
|
|
||||||
/* increment the offset and set returned value */
|
/* increment the offset and set returned value */
|
||||||
if (this->currentArticleOffset < this->lastArticleOffset) {
|
if (this->currentArticleOffset < this->lastArticleOffset) {
|
||||||
|
@ -139,8 +140,17 @@ namespace kiwix {
|
||||||
|
|
||||||
/* Stop indexing. TODO: using it crashes the software under Windows. Have to do it in indexNextPercent() */
|
/* Stop indexing. TODO: using it crashes the software under Windows. Have to do it in indexNextPercent() */
|
||||||
void Indexer::stopIndexing() {
|
void Indexer::stopIndexing() {
|
||||||
this->currentArticleOffset = this->firstArticleOffset;
|
/* Delete the zimFileHandler */
|
||||||
this->writableDatabase.~WritableDatabase();
|
if (this->zimFileHandler != NULL) {
|
||||||
|
delete this->zimFileHandler;
|
||||||
|
this->zimFileHandler = NULL;
|
||||||
|
}
|
||||||
|
|
||||||
|
/* Delete the Xapian writableDatabase */
|
||||||
|
if (this->writableDatabase != NULL) {
|
||||||
|
delete this->writableDatabase;
|
||||||
|
this->writableDatabase = NULL;
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
|
@ -18,23 +18,26 @@ namespace kiwix {
|
||||||
Indexer(const string &zimFilePath, const string &xapianDirectoryPath);
|
Indexer(const string &zimFilePath, const string &xapianDirectoryPath);
|
||||||
~Indexer();
|
~Indexer();
|
||||||
|
|
||||||
void startIndexing();
|
|
||||||
bool indexNextPercent();
|
bool indexNextPercent();
|
||||||
void stopIndexing();
|
|
||||||
|
|
||||||
protected:
|
protected:
|
||||||
|
void prepareIndexing();
|
||||||
|
void stopIndexing();
|
||||||
|
unsigned int countWords(const string &text);
|
||||||
|
|
||||||
|
unsigned int articleCount;
|
||||||
|
float stepSize;
|
||||||
|
|
||||||
zim::File* zimFileHandler;
|
zim::File* zimFileHandler;
|
||||||
zim::size_type firstArticleOffset;
|
zim::size_type firstArticleOffset;
|
||||||
zim::size_type lastArticleOffset;
|
zim::size_type lastArticleOffset;
|
||||||
zim::size_type currentArticleOffset;
|
zim::size_type currentArticleOffset;
|
||||||
zim::Article currentArticle;
|
zim::Article currentArticle;
|
||||||
|
|
||||||
unsigned int articleCount;
|
Xapian::WritableDatabase *writableDatabase;
|
||||||
float stepSize;
|
|
||||||
|
|
||||||
Xapian::WritableDatabase writableDatabase;
|
|
||||||
Xapian::Stem stemmer;
|
Xapian::Stem stemmer;
|
||||||
Xapian::TermGenerator indexer;
|
Xapian::TermGenerator indexer;
|
||||||
|
|
||||||
MyHtmlParser htmlParser;
|
MyHtmlParser htmlParser;
|
||||||
};
|
};
|
||||||
|
|
||||||
|
|
Loading…
Reference in New Issue