diff --git a/src/common/kiwix/indexer.cpp b/src/common/kiwix/indexer.cpp index 05850abe3..66be71a8b 100644 --- a/src/common/kiwix/indexer.cpp +++ b/src/common/kiwix/indexer.cpp @@ -20,158 +20,9 @@ namespace kiwix { : zimFileHandler(NULL), articleCount(0), stepSize(0) { - + /* Open the ZIM file */ this->zimFileHandler = new zim::File(zimFilePath); - - /* Open the Xapian directory */ - this->writableDatabase = new Xapian::WritableDatabase(xapianDirectoryPath, - Xapian::DB_CREATE_OR_OVERWRITE); - - /* Stemming */ - /* - stemmer = Xapian::Stem("french"); - indexer.set_stemmer(stemmer); - */ - - /* Read the stopwords file */ - /* - this->readStopWordsFile("/home/kelson/kiwix/moulinkiwix/stopwords/fr"); - std::vector::const_iterator stopWordsIterator = this->stopWords.begin(); - this->stopper.add("ceci"); - while (stopWordsIterator != this->stopWords.end()) { - this->stopper.add(*stopWordsIterator); - stopWordsIterator++; - } - indexer.set_stopper(&(this->stopper)); - */ - - /* Prepare the indexation */ - this->prepareIndexing(); - } - - /* Destructor */ - Indexer::~Indexer() { - this->stopIndexing(); - } - - /* Start indexing */ - void Indexer::prepareIndexing() { - - /* Define a few values */ - this->firstArticleOffset = this->zimFileHandler->getNamespaceBeginOffset('A'); - this->lastArticleOffset = this->zimFileHandler->getNamespaceEndOffset('A'); - this->currentArticleOffset = this->firstArticleOffset; - - /* Compute few things */ - this->articleCount = this->zimFileHandler->getNamespaceCount('A'); - this->stepSize = (float)this->articleCount / (float)100; - } - - /* Index next percent */ - bool Indexer::indexNextPercent(const bool &verbose) { - float thresholdOffset = this->currentArticleOffset + this->stepSize; - size_t found; - - /* Check if we can start */ - if (this->zimFileHandler == NULL || this->writableDatabase == NULL) { - return false; - } - - /* Begin the Xapian transation */ - this->writableDatabase->begin_transaction(true); - - 
while(this->currentArticleOffset < thresholdOffset && - this->currentArticleOffset < this->lastArticleOffset) { - - zim::Article currentArticle; - Xapian::Document currentDocument; - - /* Get next non redirect article */ - do { - currentArticle = this->zimFileHandler->getArticle(this->currentArticleOffset); - } while (this->currentArticleOffset++ && - currentArticle.isRedirect() && - this->currentArticleOffset != this->lastArticleOffset); - - if (!currentArticle.isRedirect()) { - - /* Index the content */ - this->htmlParser.reset(); - string content (currentArticle.getData().data(), currentArticle.getData().size()); - - /* The parser generate a lot of exceptions which should be avoided */ - try { - this->htmlParser.parse_html(content, "UTF-8", true); - } catch (...) { - } - - /* If content does not have the noindex meta tag */ - /* Seems that the parser generates an exception in such case */ - found = this->htmlParser.dump.find("NOINDEX"); - - if (found == string::npos) { - - /* Put the data in the document */ - currentDocument.clear_values(); - currentDocument.add_value(0, this->htmlParser.title); - currentDocument.set_data(currentArticle.getLongUrl().c_str()); - indexer.set_document(currentDocument); - - /* Debug output */ - if (verbose) { - std::cout << "Indexing " << currentArticle.getLongUrl() << "..." 
<< std::endl; - } - - /* Index the title */ - if (!this->htmlParser.title.empty()) { - indexer.index_text_without_positions(removeAccents(this->htmlParser.title), - ((this->htmlParser.dump.size() / 100) + 1) / - countWords(this->htmlParser.title) ); - } - - /* Index the keywords */ - if (!this->htmlParser.keywords.empty()) { - indexer.index_text_without_positions(removeAccents(this->htmlParser.keywords), 3); - } - - /* Index the content */ - if (!this->htmlParser.dump.empty()) { - indexer.index_text_without_positions(removeAccents(this->htmlParser.dump)); - } - - /* add to the database */ - this->writableDatabase->add_document(currentDocument); - } - } - } - - /* Flush and close Xapian transaction*/ - this->writableDatabase->commit_transaction(); - - /* increment the offset and set returned value */ - if (this->currentArticleOffset < this->lastArticleOffset) { - this->currentArticleOffset++; - return true; - } else { - this->stopIndexing(); - return false; - } - } - - /* Stop indexing. TODO: using it crashs the soft under windows. 
Have to do it in indexNextPercent() */ - void Indexer::stopIndexing() { - /* Delete the zimFileHandler */ - if (this->zimFileHandler != NULL) { - delete this->zimFileHandler; - this->zimFileHandler = NULL; - } - - /* Delete the Xapian writableDatabase */ - if (this->writableDatabase != NULL) { - delete this->writableDatabase; - this->writableDatabase = NULL; - } } /* Read the file containing the stopwords */ diff --git a/src/common/kiwix/indexer.h b/src/common/kiwix/indexer.h index a73073ec4..02e796e5c 100644 --- a/src/common/kiwix/indexer.h +++ b/src/common/kiwix/indexer.h @@ -21,34 +21,30 @@ namespace kiwix { public: Indexer(const string &zimFilePath, const string &xapianDirectoryPath); - ~Indexer(); - - bool indexNextPercent(const bool &verbose = false); + virtual bool indexNextPercent(const bool &verbose = false) = 0; protected: - void prepareIndexing(); - void stopIndexing(); - unsigned int countWords(const string &text); - - bool readStopWordsFile(const string path); - - unsigned int articleCount; - float stepSize; - + virtual void prepareIndexing() = 0; + virtual void stopIndexing() = 0; + + /* ZIM file handling */ zim::File* zimFileHandler; zim::size_type firstArticleOffset; zim::size_type lastArticleOffset; zim::size_type currentArticleOffset; - Xapian::WritableDatabase *writableDatabase; - Xapian::Stem stemmer; - Xapian::SimpleStopper stopper; - Xapian::TermGenerator indexer; - - std::vector stopWords; + /* HTML parsing */ MyHtmlParser htmlParser; - }; + unsigned int countWords(const string &text); + /* Stopwords */ + bool readStopWordsFile(const string path); + std::vector stopWords; + + /* Others */ + unsigned int articleCount; + float stepSize; + }; } #endif diff --git a/src/common/kiwix/xapianIndexer.cpp b/src/common/kiwix/xapianIndexer.cpp new file mode 100644 index 000000000..9eb9aef3e --- /dev/null +++ b/src/common/kiwix/xapianIndexer.cpp @@ -0,0 +1,158 @@ +#include "xapianIndexer.h" + +namespace kiwix { + + /* Constructor */ + 
XapianIndexer::XapianIndexer(const string &zimFilePath, const string &xapianDirectoryPath) : + Indexer(zimFilePath, xapianDirectoryPath) { + + /* Open (create or overwrite) the Xapian database directory. NOTE(review): if this throws, the zim::File allocated by the Indexer constructor appears to be leaked - confirm */ + this->writableDatabase = new Xapian::WritableDatabase(xapianDirectoryPath, + Xapian::DB_CREATE_OR_OVERWRITE); + + /* Stemming */ + /* + stemmer = Xapian::Stem("french"); + indexer.set_stemmer(stemmer); + */ + + /* Read the stopwords file */ + /* + this->readStopWordsFile("/home/kelson/kiwix/moulinkiwix/stopwords/fr"); + std::vector::const_iterator stopWordsIterator = this->stopWords.begin(); + this->stopper.add("ceci"); + while (stopWordsIterator != this->stopWords.end()) { + this->stopper.add(*stopWordsIterator); + stopWordsIterator++; + } + indexer.set_stopper(&(this->stopper)); + */ + + /* Prepare the indexing (article offsets and step size) */ + this->prepareIndexing(); + } + + /* Destructor: releases the ZIM and Xapian handlers through stopIndexing() */ + XapianIndexer::~XapianIndexer() { + this->stopIndexing(); + } + + /* Prepare indexing: record the offset range of namespace 'A' and compute the step size */ + void XapianIndexer::prepareIndexing() { + + /* Define a few values */ + this->firstArticleOffset = this->zimFileHandler->getNamespaceBeginOffset('A'); + this->lastArticleOffset = this->zimFileHandler->getNamespaceEndOffset('A'); + this->currentArticleOffset = this->firstArticleOffset; + + /* One step is a hundredth of the article count */ + this->articleCount = this->zimFileHandler->getNamespaceCount('A'); + this->stepSize = (float)this->articleCount / (float)100; + } + + /* Index the next step (about one percent) of the articles. Returns true while articles remain, false when a handler is missing or once the last offset is reached (the handlers are then released). If verbose is true, each indexed URL is printed on stdout. */ + bool XapianIndexer::indexNextPercent(const bool &verbose) { + float thresholdOffset = this->currentArticleOffset + this->stepSize; + size_t found; + const zim::size_type startOffset = this->currentArticleOffset; + /* Check if we can start */ + if (this->zimFileHandler == NULL || this->writableDatabase == NULL) { + return false; + } + + /* Begin the Xapian transaction */ + this->writableDatabase->begin_transaction(true); + + while(this->currentArticleOffset < thresholdOffset && + this->currentArticleOffset < this->lastArticleOffset) { + + zim::Article currentArticle; + Xapian::Document currentDocument; + + /* Get next non redirect 
article (the post-increment in the loop condition advances the offset on every pass) */ + do { + currentArticle = this->zimFileHandler->getArticle(this->currentArticleOffset); + } while (this->currentArticleOffset++ && + currentArticle.isRedirect() && + this->currentArticleOffset != this->lastArticleOffset); + + if (!currentArticle.isRedirect()) { + + /* Parse the HTML content */ + this->htmlParser.reset(); + string content (currentArticle.getData().data(), currentArticle.getData().size()); + + /* The parser throws a lot of exceptions; they are deliberately ignored here */ + try { + this->htmlParser.parse_html(content, "UTF-8", true); + } catch (...) { + } + + /* If content does not have the noindex meta tag */ + /* Seems that the parser generates an exception in such case */ + found = this->htmlParser.dump.find("NOINDEX"); + + if (found == string::npos) { + + /* Put the data in the document */ + currentDocument.clear_values(); + currentDocument.add_value(0, this->htmlParser.title); + currentDocument.set_data(currentArticle.getLongUrl().c_str()); + indexer.set_document(currentDocument); + + /* Debug output */ + if (verbose) { + std::cout << "Indexing " << currentArticle.getLongUrl() << "..." 
<< std::endl; + } + + /* Index the title, weighted by content size over title word count. NOTE(review): divides by zero if countWords() can return 0 for a non-empty title - confirm */ + if (!this->htmlParser.title.empty()) { + indexer.index_text_without_positions(removeAccents(this->htmlParser.title), + ((this->htmlParser.dump.size() / 100) + 1) / + countWords(this->htmlParser.title) ); + } + + /* Index the keywords */ + if (!this->htmlParser.keywords.empty()) { + indexer.index_text_without_positions(removeAccents(this->htmlParser.keywords), 3); + } + + /* Index the content */ + if (!this->htmlParser.dump.empty()) { + indexer.index_text_without_positions(removeAccents(this->htmlParser.dump)); + } + + /* Add the document to the database */ + this->writableDatabase->add_document(currentDocument); + } + } + } + + /* Commit the Xapian transaction */ + this->writableDatabase->commit_transaction(); + + /* The loop above already left the offset on the next article to fetch: only bump it when this call made no progress, so that no article is skipped and the caller cannot spin forever */ + if (this->currentArticleOffset < this->lastArticleOffset) { + if (this->currentArticleOffset == startOffset) { this->currentArticleOffset++; } + return true; + } else { + this->stopIndexing(); + return false; + } + } + + /* Stop indexing. TODO: using it crashes the software under Windows. 
Have to do it in indexNextPercent() */ + void XapianIndexer::stopIndexing() { + /* Delete the zimFileHandler */ + if (this->zimFileHandler != NULL) { + delete this->zimFileHandler; + this->zimFileHandler = NULL; + } + + /* Delete the Xapian writableDatabase */ + if (this->writableDatabase != NULL) { + delete this->writableDatabase; + this->writableDatabase = NULL; + } + } +} diff --git a/src/common/kiwix/xapianIndexer.h b/src/common/kiwix/xapianIndexer.h new file mode 100644 index 000000000..fd8fc1dc2 --- /dev/null +++ b/src/common/kiwix/xapianIndexer.h @@ -0,0 +1,41 @@ +#ifndef KIWIX_XAPIAN_INDEXER_H +#define KIWIX_XAPIAN_INDEXER_H + +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include "xapian/myhtmlparse.h" +#include "indexer.h" + +using namespace std; + +namespace kiwix { + /* Indexer implementation that writes the parsed articles into a Xapian writable database */ + class XapianIndexer : public Indexer { + + public: + XapianIndexer(const string &zimFilePath, const string &xapianDirectoryPath); + ~XapianIndexer(); + + bool indexNextPercent(const bool &verbose = false); + + protected: + void prepareIndexing(); + void stopIndexing(); + + Xapian::WritableDatabase *writableDatabase; + Xapian::Stem stemmer; + Xapian::SimpleStopper stopper; + Xapian::TermGenerator indexer; + }; + +} + +#endif