+ better factorization of the indexer code

This commit is contained in:
kelson42 2010-10-30 21:26:14 +00:00
parent ef2423b1a7
commit c003035a5e
4 changed files with 125 additions and 117 deletions

View File

@ -16,13 +16,25 @@ namespace kiwix {
} }
/* Constructor */ /* Constructor */
Indexer::Indexer(const string &zimFilePath, const string &xapianDirectoryPath) Indexer::Indexer(const string &zimFilePath)
: zimFileHandler(NULL), : zimFileHandler(NULL),
articleCount(0), articleCount(0),
stepSize(0) { stepSize(0) {
/* Open the ZIM file */ /* Open the ZIM file */
this->zimFileHandler = new zim::File(zimFilePath); this->zimFileHandler = new zim::File(zimFilePath);
/* Define a few values */
this->firstArticleOffset = this->zimFileHandler->getNamespaceBeginOffset('A');
this->lastArticleOffset = this->zimFileHandler->getNamespaceEndOffset('A');
this->currentArticleOffset = this->firstArticleOffset;
/* Compute few things */
this->articleCount = this->zimFileHandler->getNamespaceCount('A');
this->stepSize = (float)this->articleCount / (float)100;
/* Read the stopwords file */
//this->readStopWordsFile("/home/kelson/kiwix/moulinkiwix/stopwords/fr");
} }
/* Read the file containing the stopwords */ /* Read the file containing the stopwords */
@ -39,4 +51,73 @@ namespace kiwix {
std::cout << "Read " << this->stopWords.size() << " lines.\n"; std::cout << "Read " << this->stopWords.size() << " lines.\n";
return true; return true;
} }
/*
 * Index the next slice (roughly one percent) of the articles.
 *
 * Starting at currentArticleOffset, articles are fetched from the ZIM file
 * until the offset has advanced by stepSize or lastArticleOffset is reached.
 * Redirects and pages whose parsed dump contains "NOINDEX" are skipped; every
 * other article is handed to indexNextArticle(). indexNextPercentPre() and
 * indexNextPercentPost() are called around the slice.
 *
 * verbose: if true, print the URL of each indexed article on stdout.
 *
 * Returns false if no ZIM file is opened, or once lastArticleOffset has been
 * reached (in which case stopIndexing() has been called); returns true if
 * there are still articles left for a further call.
 */
bool Indexer::indexNextPercent(const bool &verbose) {
  /* Check if we can start */
  if (this->zimFileHandler == NULL) {
    return false;
  }

  /* Computed in double: with a float, adding a small step to a large offset
     could round back to the offset itself */
  const double thresholdOffset =
    static_cast<double>(this->currentArticleOffset) + this->stepSize;

  /* Guarantees that each call consumes at least one article, so that a
     caller looping on the returned value always terminates */
  bool madeProgress = false;

  this->indexNextPercentPre();

  while (this->currentArticleOffset < this->lastArticleOffset &&
         (!madeProgress || this->currentArticleOffset < thresholdOffset)) {
    zim::Article currentArticle;
    madeProgress = true;

    /* Get next non redirect article. The offset is always advanced past the
       article which was just fetched, so on exit it designates the first
       article which has not been looked at yet. */
    do {
      currentArticle = this->zimFileHandler->getArticle(this->currentArticleOffset);
      this->currentArticleOffset++;
    } while (currentArticle.isRedirect() &&
             this->currentArticleOffset < this->lastArticleOffset);

    /* Only redirects were left before the end of the range */
    if (currentArticle.isRedirect()) {
      continue;
    }

    /* Parse the content */
    this->htmlParser.reset();
    string content (currentArticle.getData().data(), currentArticle.getData().size());

    /* The parser generates a lot of exceptions; they are deliberately
       ignored and whatever was parsed so far is used */
    try {
      this->htmlParser.parse_html(content, "UTF-8", true);
    } catch (...) {
    }

    /* Index only if the content does not have the noindex meta tag */
    /* Seems that the parser generates an exception in such case */
    const size_t found = this->htmlParser.dump.find("NOINDEX");
    if (found != string::npos) {
      continue;
    }

    string url = currentArticle.getLongUrl();

    /* Debug output */
    if (verbose) {
      std::cout << "Indexing " << url << "..." << std::endl;
    }

    this->indexNextArticle(url, this->htmlParser.title,
                           removeAccents(this->htmlParser.title),
                           removeAccents(this->htmlParser.keywords),
                           removeAccents(this->htmlParser.dump));
  }

  this->indexNextPercentPost();

  /* Set the returned value. The offset must NOT be incremented here: it
     already designates the next article to handle, incrementing it again
     would skip one article at each call. */
  if (this->currentArticleOffset < this->lastArticleOffset) {
    return true;
  }

  this->stopIndexing();
  return false;
}
} }

View File

@ -20,11 +20,14 @@ namespace kiwix {
class Indexer { class Indexer {
public: public:
Indexer(const string &zimFilePath, const string &xapianDirectoryPath); Indexer(const string &zimFilePath);
virtual bool indexNextPercent(const bool &verbose = false) = 0; bool indexNextPercent(const bool &verbose = false);
protected: protected:
virtual void prepareIndexing() = 0; virtual void indexNextPercentPre() = 0;
virtual void indexNextArticle(string &url, string &title, string &unaccentedTitle,
string &keywords, string &content) = 0;
virtual void indexNextPercentPost() = 0;
virtual void stopIndexing() = 0; virtual void stopIndexing() = 0;
/* ZIM file handling */ /* ZIM file handling */

View File

@ -4,7 +4,7 @@ namespace kiwix {
/* Constructor */ /* Constructor */
XapianIndexer::XapianIndexer(const string &zimFilePath, const string &xapianDirectoryPath) : XapianIndexer::XapianIndexer(const string &zimFilePath, const string &xapianDirectoryPath) :
Indexer(zimFilePath, xapianDirectoryPath) { Indexer(zimFilePath) {
/* Open the Xapian directory */ /* Open the Xapian directory */
this->writableDatabase = new Xapian::WritableDatabase(xapianDirectoryPath, this->writableDatabase = new Xapian::WritableDatabase(xapianDirectoryPath,
@ -16,9 +16,7 @@ namespace kiwix {
indexer.set_stemmer(stemmer); indexer.set_stemmer(stemmer);
*/ */
/* Read the stopwords file */ /* Stop words
/*
this->readStopWordsFile("/home/kelson/kiwix/moulinkiwix/stopwords/fr");
std::vector<std::string>::const_iterator stopWordsIterator = this->stopWords.begin(); std::vector<std::string>::const_iterator stopWordsIterator = this->stopWords.begin();
this->stopper.add("ceci"); this->stopper.add("ceci");
while (stopWordsIterator != this->stopWords.end()) { while (stopWordsIterator != this->stopWords.end()) {
@ -27,118 +25,43 @@ namespace kiwix {
} }
indexer.set_stopper(&(this->stopper)); indexer.set_stopper(&(this->stopper));
*/ */
/* Prepare the indexation */
this->prepareIndexing();
} }
/* Destructor */ void XapianIndexer::indexNextPercentPre() {
XapianIndexer::~XapianIndexer() {
this->stopIndexing();
}
/* Start indexing */
void XapianIndexer::prepareIndexing() {
/* Define a few values */
this->firstArticleOffset = this->zimFileHandler->getNamespaceBeginOffset('A');
this->lastArticleOffset = this->zimFileHandler->getNamespaceEndOffset('A');
this->currentArticleOffset = this->firstArticleOffset;
/* Compute few things */
this->articleCount = this->zimFileHandler->getNamespaceCount('A');
this->stepSize = (float)this->articleCount / (float)100;
}
/* Index next percent */
bool XapianIndexer::indexNextPercent(const bool &verbose) {
float thresholdOffset = this->currentArticleOffset + this->stepSize;
size_t found;
/* Check if we can start */
if (this->zimFileHandler == NULL || this->writableDatabase == NULL) {
return false;
}
/* Begin the Xapian transation */
this->writableDatabase->begin_transaction(true); this->writableDatabase->begin_transaction(true);
while(this->currentArticleOffset < thresholdOffset &&
this->currentArticleOffset < this->lastArticleOffset) {
zim::Article currentArticle;
Xapian::Document currentDocument;
/* Get next non redirect article */
do {
currentArticle = this->zimFileHandler->getArticle(this->currentArticleOffset);
} while (this->currentArticleOffset++ &&
currentArticle.isRedirect() &&
this->currentArticleOffset != this->lastArticleOffset);
if (!currentArticle.isRedirect()) {
/* Index the content */
this->htmlParser.reset();
string content (currentArticle.getData().data(), currentArticle.getData().size());
/* The parser generate a lot of exceptions which should be avoided */
try {
this->htmlParser.parse_html(content, "UTF-8", true);
} catch (...) {
} }
/* If content does not have the noindex meta tag */ void XapianIndexer::indexNextArticle(string &url, string &title, string &unaccentedTitle,
/* Seems that the parser generates an exception in such case */ string &keywords, string &content) {
found = this->htmlParser.dump.find("NOINDEX");
if (found == string::npos) {
/* Put the data in the document */ /* Put the data in the document */
currentDocument.clear_values(); currentDocument.clear_values();
currentDocument.add_value(0, this->htmlParser.title); currentDocument.add_value(0, title);
currentDocument.set_data(currentArticle.getLongUrl().c_str()); currentDocument.set_data(url);
indexer.set_document(currentDocument); indexer.set_document(currentDocument);
/* Debug output */
if (verbose) {
std::cout << "Indexing " << currentArticle.getLongUrl() << "..." << std::endl;
}
/* Index the title */ /* Index the title */
if (!this->htmlParser.title.empty()) { if (!unaccentedTitle.empty()) {
indexer.index_text_without_positions(removeAccents(this->htmlParser.title), indexer.index_text_without_positions(unaccentedTitle, 5);
((this->htmlParser.dump.size() / 100) + 1) /
countWords(this->htmlParser.title) );
} }
/* Index the keywords */ /* Index the keywords */
if (!this->htmlParser.keywords.empty()) { if (!keywords.empty()) {
indexer.index_text_without_positions(removeAccents(this->htmlParser.keywords), 3); indexer.index_text_without_positions(keywords, 3);
} }
/* Index the content */ /* Index the content */
if (!this->htmlParser.dump.empty()) { if (!content.empty()) {
indexer.index_text_without_positions(removeAccents(this->htmlParser.dump)); indexer.index_text_without_positions(content);
} }
/* add to the database */ /* add to the database */
this->writableDatabase->add_document(currentDocument); this->writableDatabase->add_document(currentDocument);
} }
}
}
void XapianIndexer::indexNextPercentPost() {
/* Flush and close Xapian transaction*/ /* Flush and close Xapian transaction*/
this->writableDatabase->commit_transaction(); this->writableDatabase->commit_transaction();
/* increment the offset and set returned value */
if (this->currentArticleOffset < this->lastArticleOffset) {
this->currentArticleOffset++;
return true;
} else {
this->stopIndexing();
return false;
}
} }
/* Stop indexing. TODO: using it crashes the software under Windows. Have to do it in indexNextPercent() */ /* Stop indexing. TODO: using it crashes the software under Windows. Have to do it in indexNextPercent() */

View File

@ -22,18 +22,19 @@ namespace kiwix {
public: public:
XapianIndexer(const string &zimFilePath, const string &xapianDirectoryPath); XapianIndexer(const string &zimFilePath, const string &xapianDirectoryPath);
~XapianIndexer();
bool indexNextPercent(const bool &verbose = false);
protected: protected:
void prepareIndexing(); void indexNextPercentPre();
void indexNextArticle(string &url, string &title, string &unaccentedTitle,
string &keywords, string &content);
void indexNextPercentPost();
void stopIndexing(); void stopIndexing();
Xapian::WritableDatabase *writableDatabase; Xapian::WritableDatabase *writableDatabase;
Xapian::Stem stemmer; Xapian::Stem stemmer;
Xapian::SimpleStopper stopper; Xapian::SimpleStopper stopper;
Xapian::TermGenerator indexer; Xapian::TermGenerator indexer;
Xapian::Document currentDocument;
}; };
} }