From c003035a5e750eac89de96b1e2a4363e8779f8a7 Mon Sep 17 00:00:00 2001 From: kelson42 Date: Sat, 30 Oct 2010 21:26:14 +0000 Subject: [PATCH] + better factori. of the indexer code --- src/common/kiwix/indexer.cpp | 85 +++++++++++++++++- src/common/kiwix/indexer.h | 9 +- src/common/kiwix/xapianIndexer.cpp | 139 +++++++---------------------- src/common/kiwix/xapianIndexer.h | 9 +- 4 files changed, 125 insertions(+), 117 deletions(-) diff --git a/src/common/kiwix/indexer.cpp b/src/common/kiwix/indexer.cpp index 66be71a8b..fad04d97c 100644 --- a/src/common/kiwix/indexer.cpp +++ b/src/common/kiwix/indexer.cpp @@ -16,15 +16,27 @@ namespace kiwix { } /* Constructor */ - Indexer::Indexer(const string &zimFilePath, const string &xapianDirectoryPath) + Indexer::Indexer(const string &zimFilePath) : zimFileHandler(NULL), articleCount(0), stepSize(0) { /* Open the ZIM file */ this->zimFileHandler = new zim::File(zimFilePath); - } + /* Define a few values */ + this->firstArticleOffset = this->zimFileHandler->getNamespaceBeginOffset('A'); + this->lastArticleOffset = this->zimFileHandler->getNamespaceEndOffset('A'); + this->currentArticleOffset = this->firstArticleOffset; + + /* Compute few things */ + this->articleCount = this->zimFileHandler->getNamespaceCount('A'); + this->stepSize = (float)this->articleCount / (float)100; + + /* Read the stopwords file */ + //this->readStopWordsFile("/home/kelson/kiwix/moulinkiwix/stopwords/fr"); + } + /* Read the file containing the stopwords */ bool Indexer::readStopWordsFile(const string path) { std::string stopWord; @@ -39,4 +51,73 @@ namespace kiwix { std::cout << "Read " << this->stopWords.size() << " lines.\n"; return true; } + + /* Index next percent */ + bool Indexer::indexNextPercent(const bool &verbose) { + float thresholdOffset = this->currentArticleOffset + this->stepSize; + size_t found; + + /* Check if we can start */ + if (this->zimFileHandler == NULL) { + return false; + } + + this->indexNextPercentPre(); + + while(this->currentArticleOffset < thresholdOffset && + this->currentArticleOffset < this->lastArticleOffset) { + + zim::Article currentArticle; + + /* Get next non redirect article */ + do { + currentArticle = this->zimFileHandler->getArticle(this->currentArticleOffset); + } while (this->currentArticleOffset++ && + currentArticle.isRedirect() && + this->currentArticleOffset != this->lastArticleOffset); + + if (!currentArticle.isRedirect()) { + + /* Index the content */ + this->htmlParser.reset(); + string content (currentArticle.getData().data(), currentArticle.getData().size()); + + /* The parser generate a lot of exceptions which should be avoided */ + try { + this->htmlParser.parse_html(content, "UTF-8", true); + } catch (...) { + } + + /* If content does not have the noindex meta tag */ + /* Seems that the parser generates an exception in such case */ + found = this->htmlParser.dump.find("NOINDEX"); + + if (found == string::npos) { + string url = currentArticle.getLongUrl(); + + /* Debug output */ + if (verbose) { + std::cout << "Indexing " << url << "..." << std::endl; + } + + this->indexNextArticle(url, this->htmlParser.title, + removeAccents(this->htmlParser.title), + removeAccents(this->htmlParser.keywords), + removeAccents(this->htmlParser.dump)); + } + } + } + + this->indexNextPercentPost(); + + /* increment the offset and set returned value */ + if (this->currentArticleOffset < this->lastArticleOffset) { + this->currentArticleOffset++; + return true; + } else { + this->stopIndexing(); + return false; + } + } + } diff --git a/src/common/kiwix/indexer.h b/src/common/kiwix/indexer.h index 02e796e5c..6955aa030 100644 --- a/src/common/kiwix/indexer.h +++ b/src/common/kiwix/indexer.h @@ -20,11 +20,14 @@ namespace kiwix { class Indexer { public: - Indexer(const string &zimFilePath, const string &xapianDirectoryPath); - virtual bool indexNextPercent(const bool &verbose = false) = 0; + Indexer(const string &zimFilePath); + bool indexNextPercent(const bool &verbose = false); protected: - virtual void prepareIndexing() = 0; + virtual void indexNextPercentPre() = 0; + virtual void indexNextArticle(string &url, string &title, string &unaccentedTitle, + string &keywords, string &content) = 0; + virtual void indexNextPercentPost() = 0; virtual void stopIndexing() = 0; /* ZIM file handling */ diff --git a/src/common/kiwix/xapianIndexer.cpp b/src/common/kiwix/xapianIndexer.cpp index 9eb9aef3e..8c182842d 100644 --- a/src/common/kiwix/xapianIndexer.cpp +++ b/src/common/kiwix/xapianIndexer.cpp @@ -4,7 +4,7 @@ namespace kiwix { /* Constructor */ XapianIndexer::XapianIndexer(const string &zimFilePath, const string &xapianDirectoryPath) : - Indexer(zimFilePath, xapianDirectoryPath) { + Indexer(zimFilePath) { /* Open the Xapian directory */ this->writableDatabase = new Xapian::WritableDatabase(xapianDirectoryPath, @@ -16,9 +16,7 @@ namespace kiwix { indexer.set_stemmer(stemmer); */ - /* Read the stopwords file */ - /* - this->readStopWordsFile("/home/kelson/kiwix/moulinkiwix/stopwords/fr"); + /* Stop words std::vector::const_iterator stopWordsIterator = this->stopWords.begin(); this->stopper.add("ceci"); while (stopWordsIterator != this->stopWords.end()) { @@ -27,118 +25,43 @@ namespace kiwix { } indexer.set_stopper(&(this->stopper)); */ - - /* Prepare the indexation */ - this->prepareIndexing(); } - /* Destructor */ - XapianIndexer::~XapianIndexer() { - this->stopIndexing(); - } - - /* Start indexing */ - void XapianIndexer::prepareIndexing() { - - /* Define a few values */ - this->firstArticleOffset = this->zimFileHandler->getNamespaceBeginOffset('A'); - this->lastArticleOffset = this->zimFileHandler->getNamespaceEndOffset('A'); - this->currentArticleOffset = this->firstArticleOffset; - - /* Compute few things */ - this->articleCount = this->zimFileHandler->getNamespaceCount('A'); - this->stepSize = (float)this->articleCount / (float)100; - } - - /* Index next percent */ - bool XapianIndexer::indexNextPercent(const bool &verbose) { - float thresholdOffset = this->currentArticleOffset + this->stepSize; - size_t found; - - /* Check if we can start */ - if (this->zimFileHandler == NULL || this->writableDatabase == NULL) { - return false; - } - - /* Begin the Xapian transation */ + void XapianIndexer::indexNextPercentPre() { this->writableDatabase->begin_transaction(true); + } + + void XapianIndexer::indexNextArticle(string &url, string &title, string &unaccentedTitle, + string &keywords, string &content) { + + /* Put the data in the document */ + currentDocument.clear_values(); + currentDocument.add_value(0, title); + currentDocument.set_data(url); + indexer.set_document(currentDocument); - while(this->currentArticleOffset < thresholdOffset && - this->currentArticleOffset < this->lastArticleOffset) { - - zim::Article currentArticle; - Xapian::Document currentDocument; - - /* Get next non redirect article */ - do { - currentArticle = this->zimFileHandler->getArticle(this->currentArticleOffset); - } while (this->currentArticleOffset++ && - currentArticle.isRedirect() && - this->currentArticleOffset != this->lastArticleOffset); - - if (!currentArticle.isRedirect()) { - - /* Index the content */ - this->htmlParser.reset(); - string content (currentArticle.getData().data(), currentArticle.getData().size()); - - /* The parser generate a lot of exceptions which should be avoided */ - try { - this->htmlParser.parse_html(content, "UTF-8", true); - } catch (...) { - } - - /* If content does not have the noindex meta tag */ - /* Seems that the parser generates an exception in such case */ - found = this->htmlParser.dump.find("NOINDEX"); - - if (found == string::npos) { - - /* Put the data in the document */ - currentDocument.clear_values(); - currentDocument.add_value(0, this->htmlParser.title); - currentDocument.set_data(currentArticle.getLongUrl().c_str()); - indexer.set_document(currentDocument); - - /* Debug output */ - if (verbose) { - std::cout << "Indexing " << currentArticle.getLongUrl() << "..." << std::endl; - } - - /* Index the title */ - if (!this->htmlParser.title.empty()) { - indexer.index_text_without_positions(removeAccents(this->htmlParser.title), - ((this->htmlParser.dump.size() / 100) + 1) / - countWords(this->htmlParser.title) ); - } - - /* Index the keywords */ - if (!this->htmlParser.keywords.empty()) { - indexer.index_text_without_positions(removeAccents(this->htmlParser.keywords), 3); - } - - /* Index the content */ - if (!this->htmlParser.dump.empty()) { - indexer.index_text_without_positions(removeAccents(this->htmlParser.dump)); - } - - /* add to the database */ - this->writableDatabase->add_document(currentDocument); - } - } + /* Index the title */ + if (!unaccentedTitle.empty()) { + indexer.index_text_without_positions(unaccentedTitle, 5); } + /* Index the keywords */ + if (!keywords.empty()) { + indexer.index_text_without_positions(keywords, 3); + } + + /* Index the content */ + if (!content.empty()) { + indexer.index_text_without_positions(content); + } + + /* add to the database */ + this->writableDatabase->add_document(currentDocument); + } + + void XapianIndexer::indexNextPercentPost() { /* Flush and close Xapian transaction*/ this->writableDatabase->commit_transaction(); - - /* increment the offset and set returned value */ - if (this->currentArticleOffset < this->lastArticleOffset) { - this->currentArticleOffset++; - return true; - } else { - this->stopIndexing(); - return false; - } } /* Stop indexing. TODO: using it crashs the soft under windows. Have to do it in indexNextPercent() */ diff --git a/src/common/kiwix/xapianIndexer.h b/src/common/kiwix/xapianIndexer.h index fd8fc1dc2..bb4aafd34 100644 --- a/src/common/kiwix/xapianIndexer.h +++ b/src/common/kiwix/xapianIndexer.h @@ -22,18 +22,19 @@ namespace kiwix { public: XapianIndexer(const string &zimFilePath, const string &xapianDirectoryPath); - ~XapianIndexer(); - - bool indexNextPercent(const bool &verbose = false); protected: - void prepareIndexing(); + void indexNextPercentPre(); + void indexNextArticle(string &url, string &title, string &unaccentedTitle, + string &keywords, string &content); + void indexNextPercentPost(); void stopIndexing(); Xapian::WritableDatabase *writableDatabase; Xapian::Stem stemmer; Xapian::SimpleStopper stopper; Xapian::TermGenerator indexer; + Xapian::Document currentDocument; }; }