diff --git a/src/common/kiwix/indexer.cpp b/src/common/kiwix/indexer.cpp index 5d2e0d6c3..e114c20f3 100644 --- a/src/common/kiwix/indexer.cpp +++ b/src/common/kiwix/indexer.cpp @@ -35,20 +35,9 @@ namespace kiwix { } /* Constructor */ - Indexer::Indexer(const string &zimFilePath) - : zimFileHandler(NULL), - articleCount(0), - stepSize(0), - keywordsBoostFactor(3) { - - this->initialize(); - this->setZimFilePath(zimFilePath); + Indexer::Indexer() : /* counters start at 0 so getProgression()/getArticleCount() never return an indeterminate value before start() and the extractor thread assign them */ + articleCount(0), progression(0), keywordsBoostFactor(3) { - /* Read the stopwords file */ - //this->readStopWordsFile("/home/kelson/kiwix/moulinkiwix/stopwords/fr"); - } - - void Indexer::initialize() { /* Initialize mutex */ pthread_mutex_init(&threadIdsMutex, NULL); pthread_mutex_init(&toParseQueueMutex, NULL); @@ -57,44 +46,34 @@ namespace kiwix { pthread_mutex_init(&articleParserRunningMutex, NULL); pthread_mutex_init(&articleIndexerRunningMutex, NULL); pthread_mutex_init(&articleCountMutex, NULL); + pthread_mutex_init(&zimPathMutex, NULL); + pthread_mutex_init(&indexPathMutex, NULL); pthread_mutex_init(&progressionMutex, NULL); - /* Article count & Progression */ - this->setProgression(0); - } - - bool Indexer::setZimFilePath(const string &zimFilePath) { - /* Open the ZIM file */ - this->zimFileHandler = new zim::File(zimFilePath); - - /* Define a few values */ - this->firstArticleOffset = this->zimFileHandler->getNamespaceBeginOffset('A'); - this->lastArticleOffset = this->zimFileHandler->getNamespaceEndOffset('A'); - this->currentArticleOffset = this->firstArticleOffset; - - /* Compute few things */ - kiwix::Reader reader(zimFilePath); - this->setArticleCount(reader.getArticleCount()); - //this->articleCount = this->zimFileHandler->getNamespaceCount('A'); - this->stepSize = (float)this->articleCount / (float)100; + /* Read the stopwords file */ + //this->readStopWordsFile("/home/kelson/kiwix/moulinkiwix/stopwords/fr"); } /* Article extractor methods */ void *Indexer::extractArticles(void *ptr) { kiwix::Indexer *self = (kiwix::Indexer *)ptr; 
self->articleExtractorRunning(true); - unsigned int startOffset = self->zimFileHandler->getNamespaceBeginOffset('A'); - unsigned int endOffset = self->zimFileHandler->getNamespaceEndOffset('A'); + + /* Get the number of articles to index */ + kiwix::Reader reader(self->getZimPath()); + self->setArticleCount(reader.getArticleCount()); /* Goes trough all articles */ - unsigned int currentOffset = startOffset; + zim::File *zimHandler = reader.getZimFileHandler(); + unsigned int currentOffset = zimHandler->getNamespaceBeginOffset('A'); + unsigned int lastOffset = zimHandler->getNamespaceEndOffset('A'); zim::Article currentArticle; - while (currentOffset <= endOffset) { + while (currentOffset < lastOffset) { /* lastOffset is the namespace END offset (one past the last 'A' entry in libzim), so it is excluded */ /* Redirects are not indexed */ do { - currentArticle = self->zimFileHandler->getArticle(currentOffset++); - } while (currentArticle.isRedirect() && currentOffset != endOffset); + currentArticle = zimHandler->getArticle(currentOffset++); + } while (currentArticle.isRedirect() && currentOffset != lastOffset); /* Add articles to the queue */ indexerToken token; @@ -176,7 +155,8 @@ namespace kiwix { self->pushToIndexQueue(token); /* Test if the thread should be cancelled */ - pthread_testcancel(); } + pthread_testcancel(); + } } self->articleParserRunning(false); @@ -201,29 +181,36 @@ namespace kiwix { void *Indexer::indexArticles(void *ptr) { kiwix::Indexer *self = (kiwix::Indexer *)ptr; self->articleIndexerRunning(true); + indexerToken token; - unsigned int stepSize = ((self->getArticleCount() / 100) < 1 ? 1 : (self->getArticleCount() / 100)); unsigned indexedArticleCount = 0; + unsigned int stepSize = ((self->getArticleCount() / 100) < 1 ? 
1 : (self->getArticleCount() / 100)); + self->indexingPrelude(self->getIndexPath()); while (self->popFromToIndexQueue(token)) { - self->indexNextArticle(token.url, - token.accentedTitle, - token.title, - token.keywords, - token.content, - token.snippet, - token.size, - token.wordCount - ); - + self->index(token.url, + token.accentedTitle, + token.title, + token.keywords, + token.content, + token.snippet, + token.size, + token.wordCount + ); + if (++indexedArticleCount % stepSize == 0) { self->setProgression(self->getProgression() + 1); } + + if (indexedArticleCount % 10000 == 0) { + self->flush(); + } + /* Test if the thread should be cancelled */ + pthread_testcancel(); } - self->setProgression(100); - self->indexNextPercentPost(); - + self->indexingPostlude(); + self->setProgression(100); /* report completion explicitly: the per-step increments above do not necessarily end at exactly 100 */ self->articleIndexerRunning(false); pthread_exit(NULL); return NULL; @@ -306,8 +293,34 @@ namespace kiwix { return true; } - /* Article Count & Progression */ - void Indexer::setArticleCount(unsigned int articleCount) { + /* ZIM & Index methods: every accessor below copies the path while holding its dedicated mutex */ + void Indexer::setZimPath(const string path) { + pthread_mutex_lock(&zimPathMutex); + this->zimPath = path; + pthread_mutex_unlock(&zimPathMutex); + } + + string Indexer::getZimPath() { + pthread_mutex_lock(&zimPathMutex); + string retVal = this->zimPath; + pthread_mutex_unlock(&zimPathMutex); + return retVal; + } + + void Indexer::setIndexPath(const string path) { + pthread_mutex_lock(&indexPathMutex); + this->indexPath = path; + pthread_mutex_unlock(&indexPathMutex); + } + + string Indexer::getIndexPath() { + pthread_mutex_lock(&indexPathMutex); + string retVal = this->indexPath; + pthread_mutex_unlock(&indexPathMutex); + return retVal; + } + + void Indexer::setArticleCount(const unsigned int articleCount) { pthread_mutex_lock(&articleCountMutex); this->articleCount = articleCount; pthread_mutex_unlock(&articleCountMutex); @@ -320,7 +333,7 @@ namespace kiwix { return retVal; } - void Indexer::setProgression(unsigned int progression) { + void 
Indexer::setProgression(const unsigned int progression) { pthread_mutex_lock(&progressionMutex); this->progression = progression; pthread_mutex_unlock(&progressionMutex); @@ -333,8 +346,12 @@ namespace kiwix { return retVal; } - bool Indexer::start() { - this->indexNextPercentPre(); + /* Manage */ + bool Indexer::start(const string &zimPath, const string &indexPath) { /* stores both paths, resets the progression, then launches the detached worker threads */ + this->setProgression(0); + this->setZimPath(zimPath); + this->setIndexPath(indexPath); + pthread_mutex_lock(&threadIdsMutex); pthread_create(&(this->articleExtractor), NULL, Indexer::extractArticles, (void*)this); pthread_detach(this->articleExtractor); @@ -343,15 +360,7 @@ namespace kiwix { pthread_create(&(this->articleIndexer), NULL, Indexer::indexArticles, (void*)this); pthread_detach(this->articleIndexer); pthread_mutex_unlock(&threadIdsMutex); - return true; - } - bool Indexer::stop() { - pthread_mutex_lock(&threadIdsMutex); - pthread_cancel(this->articleExtractor); - pthread_cancel(this->articleParser); - pthread_cancel(this->articleIndexer); - pthread_mutex_unlock(&threadIdsMutex); return true; } @@ -359,12 +368,27 @@ namespace kiwix { return this->isArticleExtractorRunning() || this->isArticleIndexerRunning() || this->isArticleParserRunning(); } - void Indexer::setCurrentArticleOffset(unsigned int offset) { - this->currentArticleOffset = offset; - } + bool Indexer::stop() { /* cancels every worker thread that is still flagged as running; always returns true */ + if (this->isRunning()) { + bool isArticleExtractorRunning = this->isArticleExtractorRunning(); + bool isArticleIndexerRunning = this->isArticleIndexerRunning(); + bool isArticleParserRunning = this->isArticleParserRunning(); + pthread_mutex_lock(&threadIdsMutex); + if (isArticleExtractorRunning) + pthread_cancel(this->articleExtractor); + if (isArticleParserRunning) /* each flag cancels its own thread (parser and indexer were swapped) */ + pthread_cancel(this->articleParser); + if (isArticleIndexerRunning) + pthread_cancel(this->articleIndexer); + + pthread_mutex_unlock(&threadIdsMutex); + this->articleExtractorRunning(false); /* a cancelled thread never clears its own flag, so clear all three or isRunning() stays true */ + this->articleParserRunning(false); + this->articleIndexerRunning(false); + } - unsigned int Indexer::getCurrentArticleOffset() { - 
return this->currentArticleOffset; + return true; } /* Read the file containing the stopwords */ @@ -382,102 +406,4 @@ namespace kiwix { return true; } - /* Index next percent */ - bool Indexer::indexNextPercent(const bool &verbose) { - float thresholdOffset = this->currentArticleOffset + this->stepSize; - size_t found; - - /* Check if we can start */ - if (this->zimFileHandler == NULL) { - return false; - } - - this->indexNextPercentPre(); - - while(this->currentArticleOffset < thresholdOffset && - this->currentArticleOffset <= this->lastArticleOffset) { - - zim::Article currentArticle; - - /* Get next non redirect article */ - do { - currentArticle = this->zimFileHandler->getArticle(this->currentArticleOffset); - } while (this->currentArticleOffset++ && - currentArticle.isRedirect() && - this->currentArticleOffset != this->lastArticleOffset); - - if (!currentArticle.isRedirect()) { - - /* Index the content */ - this->htmlParser.reset(); - string content (currentArticle.getData().data(), currentArticle.getData().size()); - - /* The parser generate a lot of exceptions which should be avoided */ - try { - this->htmlParser.parse_html(content, "UTF-8", true); - } catch (...) { - } - - /* If content does not have the noindex meta tag */ - /* Seems that the parser generates an exception in such case */ - found = this->htmlParser.dump.find("NOINDEX"); - - if (found == string::npos) { - string url = currentArticle.getLongUrl(); - - /* Debug output */ - if (verbose) { - std::cout << "Indexing " << url << "..." 
<< std::endl; - } - - /* Get the title */ - string accentedTitle = this->htmlParser.title; - if (accentedTitle.empty()) { - accentedTitle = currentArticle.getTitle(); - } - - /* count words */ - stringstream countWordStringStream; - countWordStringStream << countWords(this->htmlParser.dump); - const std::string wordCountString = countWordStringStream.str(); - - /* snippet */ - std::string snippet = std::string(this->htmlParser.dump, 0, 300); - std::string::size_type last = snippet.find_last_of('.'); - if (last == snippet.npos) - last = snippet.find_last_of(' '); - if (last != snippet.npos) - snippet = snippet.substr(0, last); - - /* size */ - stringstream sizeStringStream; - sizeStringStream << content.size() / 1024; - const std::string size = sizeStringStream.str(); - - this->indexNextArticle(url, - accentedTitle, - removeAccents(this->htmlParser.title), - removeAccents(this->htmlParser.keywords), - removeAccents(this->htmlParser.dump), - snippet, - size, - wordCountString - ); - - } - } - } - - this->indexNextPercentPost(); - - /* increment the offset and set returned value */ - if (this->currentArticleOffset <= this->lastArticleOffset) { - return true; - } else { - // commented as it never returns on OSX. 
- //this->stopIndexing(); - return false; - } - } - } diff --git a/src/common/kiwix/indexer.h b/src/common/kiwix/indexer.h index 1d151e7db..f9c8fbb9e 100644 --- a/src/common/kiwix/indexer.h +++ b/src/common/kiwix/indexer.h @@ -54,17 +54,15 @@ namespace kiwix { class Indexer { public: - Indexer(const string &zimFilePath); - bool indexNextPercent(const bool &verbose = false); - bool setZimFilePath(const string &zimFilePath); - bool start(); + Indexer(); + virtual ~Indexer() {} /* polymorphic base (pure virtuals below): lets a derived indexer be destroyed through an Indexer pointer */ + bool start(const string &zimPath, const string &indexPath); bool stop(); bool isRunning(); unsigned int getProgression(); private: pthread_mutex_t threadIdsMutex; - void initialize(); /* Article extraction */ pthread_t articleExtractor; @@ -107,46 +105,47 @@ namespace kiwix { /* Article Count & Progression */ unsigned int articleCount; pthread_mutex_t articleCountMutex; - void setArticleCount(unsigned int articleCount); + void setArticleCount(const unsigned int articleCount); unsigned int getArticleCount(); + + /* Progression */ unsigned int progression; pthread_mutex_t progressionMutex; - void setProgression(unsigned int progression); + void setProgression(const unsigned int progression); + /* getProgression() is public */ + + /* ZIM path */ + pthread_mutex_t zimPathMutex; + string zimPath; + void setZimPath(const string path); + string getZimPath(); + + /* Index path */ + pthread_mutex_t indexPathMutex; + string indexPath; + void setIndexPath(const string path); + string getIndexPath(); protected: - virtual void indexNextPercentPre() = 0; - virtual void indexNextArticle(const string &url, - const string &title, - const string &unaccentedTitle, - const string &keywords, - const string &content, - const string &snippet, - const string &size, - const string &wordCount) = 0; - virtual void indexNextPercentPost() = 0; - virtual void stopIndexing() = 0; - - /* Article offset */ - void setCurrentArticleOffset(unsigned int offset); - unsigned int getCurrentArticleOffset(); - - /* ZIM file handling */ - zim::File* 
zimFileHandler; - zim::size_type firstArticleOffset; - zim::size_type lastArticleOffset; - zim::size_type currentArticleOffset; + virtual void indexingPrelude(const string &indexPath) = 0; + virtual void index(const string &url, + const string &title, + const string &unaccentedTitle, + const string &keywords, + const string &content, + const string &snippet, + const string &size, + const string &wordCount) = 0; + virtual void flush() = 0; + virtual void indexingPostlude() = 0; - /* HTML parsing */ - MyHtmlParser htmlParser; + /* Others */ unsigned int countWords(const string &text); /* Stopwords */ bool readStopWordsFile(const string path); std::vector<std::string> stopWords; - /* Others */ - float stepSize; - /* Boost factor */ unsigned int keywordsBoostFactor; inline unsigned int getTitleBoostFactor(const unsigned int contentLength) { diff --git a/src/common/kiwix/reader.cpp b/src/common/kiwix/reader.cpp index 80c8c725e..2d92a1c7f 100644 --- a/src/common/kiwix/reader.cpp +++ b/src/common/kiwix/reader.cpp @@ -62,6 +62,10 @@ namespace kiwix { delete this->zimFileHandler; } } + + zim::File* Reader::getZimFileHandler() { /* non-owning access: the Reader still deletes this handler (see the context just above), so the pointer must not outlive the Reader */ + return this->zimFileHandler; + } /* Reset the cursor for GetNextArticle() */ void Reader::reset() { diff --git a/src/common/kiwix/reader.h b/src/common/kiwix/reader.h index b2e984e55..9d1f1f8fc 100644 --- a/src/common/kiwix/reader.h +++ b/src/common/kiwix/reader.h @@ -63,6 +63,7 @@ namespace kiwix { bool canCheckIntegrity(); bool isCorrupted(); bool parseUrl(const string &urlStr, char *ns, string &titleStr); + zim::File* getZimFileHandler(); protected: zim::File* zimFileHandler; diff --git a/src/common/kiwix/xapianIndexer.cpp b/src/common/kiwix/xapianIndexer.cpp index 19e8da1ca..e5700e1cd 100644 --- a/src/common/kiwix/xapianIndexer.cpp +++ b/src/common/kiwix/xapianIndexer.cpp @@ -22,20 +22,15 @@ namespace kiwix { /* Constructor */ - XapianIndexer::XapianIndexer(const string &zimFilePath, const string &xapianDirectoryPath) : - Indexer(zimFilePath) { - - /* Open 
the Xapian directory */ - this->writableDatabase = new Xapian::WritableDatabase(xapianDirectoryPath, - Xapian::DB_CREATE_OR_OVERWRITE); - + XapianIndexer::XapianIndexer() { /* Stemming */ /* stemmer = Xapian::Stem("french"); indexer.set_stemmer(stemmer); */ - /* Stop words + /* Stop words */ + /* std::vector<std::string>::const_iterator stopWordsIterator = this->stopWords.begin(); this->stopper.add("ceci"); while (stopWordsIterator != this->stopWords.end()) { @@ -46,19 +41,20 @@ namespace kiwix { */ } - void XapianIndexer::indexNextPercentPre() { - this->writableDatabase->begin_transaction(true); + void XapianIndexer::indexingPrelude(const string &indexPath) { /* opens the database at indexPath with DB_CREATE_OR_OVERWRITE, then begins the first transaction */ + this->writableDatabase = Xapian::WritableDatabase(indexPath, Xapian::DB_CREATE_OR_OVERWRITE); + this->writableDatabase.begin_transaction(true); } - void XapianIndexer::indexNextArticle(const string &url, - const string &title, - const string &unaccentedTitle, - const string &keywords, - const string &content, - const string &snippet, - const string &size, - const string &wordCount) { - + void XapianIndexer::index(const string &url, + const string &title, + const string &unaccentedTitle, + const string &keywords, + const string &content, + const string &snippet, + const string &size, + const string &wordCount) { + /* Put the data in the document */ Xapian::Document currentDocument; currentDocument.clear_values(); @@ -85,26 +81,17 @@ namespace kiwix { } /* add to the database */ - this->writableDatabase->add_document(currentDocument); - } - - void XapianIndexer::indexNextPercentPost() { - /* Flush and close Xapian transaction*/ - this->writableDatabase->commit_transaction(); + this->writableDatabase.add_document(currentDocument); } - /* Stop indexing. TODO: using it crashs the soft under windows. 
Have to do it in indexNextPercent() */ - void XapianIndexer::stopIndexing() { - /* Delete the zimFileHandler */ - if (this->zimFileHandler != NULL) { - delete this->zimFileHandler; - this->zimFileHandler = NULL; - } - - /* Delete the Xapian writableDatabase */ - if (this->writableDatabase != NULL) { - delete this->writableDatabase; - this->writableDatabase = NULL; - } + void XapianIndexer::flush() { /* commits the open transaction and immediately opens the next one */ + this->writableDatabase.commit_transaction(); + this->writableDatabase.begin_transaction(true); + } + + void XapianIndexer::indexingPostlude() { /* flush() leaves a fresh transaction open; it is committed here before the final commit() */ + this->flush(); + this->writableDatabase.commit_transaction(); + this->writableDatabase.commit(); } } diff --git a/src/common/kiwix/xapianIndexer.h b/src/common/kiwix/xapianIndexer.h index 899492c7a..635fc80de 100644 --- a/src/common/kiwix/xapianIndexer.h +++ b/src/common/kiwix/xapianIndexer.h @@ -30,22 +30,23 @@ namespace kiwix { class XapianIndexer : public Indexer { public: - XapianIndexer(const string &zimFilePath, const string &xapianDirectoryPath); + XapianIndexer(); + ~XapianIndexer() {} /* defined inline: no hunk of xapianIndexer.cpp provides a definition, and the members release themselves */ protected: - void indexNextPercentPre(); - void indexNextArticle(const string &url, - const string &title, - const string &unaccentedTitle, - const string &keywords, - const string &content, - const string &snippet, - const string &size, - const string &wordCount); - void indexNextPercentPost(); - void stopIndexing(); + void indexingPrelude(const string &indexPath); + void index(const string &url, + const string &title, + const string &unaccentedTitle, + const string &keywords, + const string &content, + const string &snippet, + const string &size, + const string &wordCount); + void flush(); + void indexingPostlude(); - Xapian::WritableDatabase *writableDatabase; + Xapian::WritableDatabase writableDatabase; Xapian::Stem stemmer; Xapian::SimpleStopper stopper; Xapian::TermGenerator indexer;