mirror of https://github.com/kiwix/libkiwix.git
+ imp. of the new indexer
This commit is contained in:
parent
9ead81ca0c
commit
735d9afd3a
|
@ -41,6 +41,14 @@ namespace kiwix {
|
|||
stepSize(0),
|
||||
keywordsBoostFactor(3) {
|
||||
|
||||
this->initialize();
|
||||
this->setZimFilePath(zimFilePath);
|
||||
|
||||
/* Read the stopwords file */
|
||||
//this->readStopWordsFile("/home/kelson/kiwix/moulinkiwix/stopwords/fr");
|
||||
}
|
||||
|
||||
void Indexer::initialize() {
|
||||
/* Initialize the mutexes */
|
||||
pthread_mutex_init(&threadIdsMutex, NULL);
|
||||
pthread_mutex_init(&toParseQueueMutex, NULL);
|
||||
|
@ -48,11 +56,11 @@ namespace kiwix {
|
|||
pthread_mutex_init(&articleExtractorRunningMutex, NULL);
|
||||
pthread_mutex_init(&articleParserRunningMutex, NULL);
|
||||
pthread_mutex_init(&articleIndexerRunningMutex, NULL);
|
||||
pthread_mutex_init(&articleCountMutex, NULL);
|
||||
pthread_mutex_init(&progressionMutex, NULL);
|
||||
|
||||
this->setZimFilePath(zimFilePath);
|
||||
|
||||
/* Read the stopwords file */
|
||||
//this->readStopWordsFile("/home/kelson/kiwix/moulinkiwix/stopwords/fr");
|
||||
/* Article count & Progression */
|
||||
this->setProgression(0);
|
||||
}
|
||||
|
||||
bool Indexer::setZimFilePath(const string &zimFilePath) {
|
||||
|
@ -65,7 +73,9 @@ namespace kiwix {
|
|||
this->currentArticleOffset = this->firstArticleOffset;
|
||||
|
||||
/* Compute a few things */
|
||||
this->articleCount = this->zimFileHandler->getNamespaceCount('A');
|
||||
kiwix::Reader reader(zimFilePath);
|
||||
this->setArticleCount(reader.getArticleCount());
|
||||
//this->articleCount = this->zimFileHandler->getNamespaceCount('A');
|
||||
this->stepSize = (float)this->articleCount / (float)100;
|
||||
}
|
||||
|
||||
|
@ -118,6 +128,7 @@ namespace kiwix {
|
|||
/* Article parser methods */
|
||||
void *Indexer::parseArticles(void *ptr) {
|
||||
kiwix::Indexer *self = (kiwix::Indexer *)ptr;
|
||||
self->articleParserRunning(true);
|
||||
size_t found;
|
||||
indexerToken token;
|
||||
|
||||
|
@ -189,7 +200,10 @@ namespace kiwix {
|
|||
/* Article indexer methods */
|
||||
void *Indexer::indexArticles(void *ptr) {
|
||||
kiwix::Indexer *self = (kiwix::Indexer *)ptr;
|
||||
self->articleIndexerRunning(true);
|
||||
indexerToken token;
|
||||
unsigned int stepSize = ((self->getArticleCount() / 100) < 1 ? 1 : (self->getArticleCount() / 100));
|
||||
unsigned indexedArticleCount = 0;
|
||||
|
||||
while (self->popFromToIndexQueue(token)) {
|
||||
self->indexNextArticle(token.url,
|
||||
|
@ -201,8 +215,13 @@ namespace kiwix {
|
|||
token.size,
|
||||
token.wordCount
|
||||
);
|
||||
|
||||
if (++indexedArticleCount % stepSize == 0) {
|
||||
self->setProgression(self->getProgression() + 1);
|
||||
}
|
||||
}
|
||||
|
||||
self->setProgression(100);
|
||||
self->indexNextPercentPost();
|
||||
|
||||
self->articleIndexerRunning(false);
|
||||
|
@ -287,6 +306,33 @@ namespace kiwix {
|
|||
return true;
|
||||
}
|
||||
|
||||
/* Article Count & Progression */
|
||||
/* Store a new article count. The write to the articleCount member is
   performed while articleCountMutex is held. */
void Indexer::setArticleCount(unsigned int articleCount) {
  pthread_mutex_t *const guard = &this->articleCountMutex;

  pthread_mutex_lock(guard);
  this->articleCount = articleCount;
  pthread_mutex_unlock(guard);
}
|
||||
|
||||
unsigned int Indexer::getArticleCount() {
|
||||
pthread_mutex_lock(&articleCountMutex);
|
||||
unsigned int retVal = this->articleCount;
|
||||
pthread_mutex_unlock(&articleCountMutex);
|
||||
return retVal;
|
||||
}
|
||||
|
||||
/* Store a new progression value. The write to the progression member is
   performed while progressionMutex is held. */
void Indexer::setProgression(unsigned int progression) {
  pthread_mutex_t *const guard = &this->progressionMutex;

  pthread_mutex_lock(guard);
  this->progression = progression;
  pthread_mutex_unlock(guard);
}
|
||||
|
||||
unsigned int Indexer::getProgression() {
|
||||
pthread_mutex_lock(&progressionMutex);
|
||||
unsigned int retVal = this->progression;
|
||||
pthread_mutex_unlock(&progressionMutex);
|
||||
return retVal;
|
||||
}
|
||||
|
||||
bool Indexer::start() {
|
||||
this->indexNextPercentPre();
|
||||
pthread_mutex_lock(&threadIdsMutex);
|
||||
|
@ -301,7 +347,11 @@ namespace kiwix {
|
|||
}
|
||||
|
||||
/* Request cancellation of the three worker threads (extractor, parser,
   indexer), in that order, while holding threadIdsMutex. The result of
   each pthread_cancel() call is ignored and the method always returns
   true. */
bool Indexer::stop() {
  pthread_mutex_lock(&threadIdsMutex);

  /* Copy the thread ids under the lock and cancel them in the same order
     as before: extractor, parser, indexer. */
  const pthread_t workers[] = {
    this->articleExtractor,
    this->articleParser,
    this->articleIndexer
  };
  const unsigned int workerCount = sizeof(workers) / sizeof(workers[0]);
  for (unsigned int i = 0; i < workerCount; ++i) {
    pthread_cancel(workers[i]);
  }

  pthread_mutex_unlock(&threadIdsMutex);
  return true;
}
|
||||
|
||||
|
@ -317,11 +367,6 @@ namespace kiwix {
|
|||
return this->currentArticleOffset;
|
||||
}
|
||||
|
||||
/* Stub: always returns 0; it does not read any member.
   NOTE(review): another Indexer::getProgression() that reads the
   progression member under progressionMutex is defined earlier in this
   listing. This stub is presumably the pre-commit version that the diff
   removes - confirm before keeping both definitions. */
unsigned int Indexer::getProgression() {
|
||||
unsigned int progression = 0;
|
||||
return progression;
|
||||
}
|
||||
|
||||
/* Read the file containing the stopwords */
|
||||
bool Indexer::readStopWordsFile(const string path) {
|
||||
std::string stopWord;
|
||||
|
|
|
@ -33,6 +33,7 @@
|
|||
#include <zim/file.h>
|
||||
#include <zim/article.h>
|
||||
#include <zim/fileiterator.h>
|
||||
#include "reader.h"
|
||||
#include "xapian/myhtmlparse.h"
|
||||
|
||||
using namespace std;
|
||||
|
@ -63,6 +64,7 @@ namespace kiwix {
|
|||
|
||||
private:
|
||||
pthread_mutex_t threadIdsMutex;
|
||||
void initialize();
|
||||
|
||||
/* Article extraction */
|
||||
pthread_t articleExtractor;
|
||||
|
@ -102,6 +104,15 @@ namespace kiwix {
|
|||
bool popFromToIndexQueue(indexerToken &token);
|
||||
bool isToIndexQueueEmpty();
|
||||
|
||||
/* Article Count & Progression */
|
||||
/* Article count; read and written by getArticleCount()/setArticleCount()
   while articleCountMutex is held. */
unsigned int articleCount;
|
||||
/* Mutex locked around every access to articleCount made by its accessors. */
pthread_mutex_t articleCountMutex;
|
||||
/* Store a new article count under articleCountMutex. */
void setArticleCount(unsigned int articleCount);
|
||||
/* Return a copy of the article count taken under articleCountMutex. */
unsigned int getArticleCount();
|
||||
/* Progression value; the indexer thread sets it to 100 once its queue is
   drained, so it is presumably a percentage - TODO confirm. */
unsigned int progression;
|
||||
/* Mutex locked around every access to progression made by its accessors. */
pthread_mutex_t progressionMutex;
|
||||
/* Store a new progression value under progressionMutex. */
void setProgression(unsigned int progression);
|
||||
|
||||
protected:
|
||||
virtual void indexNextPercentPre() = 0;
|
||||
virtual void indexNextArticle(const string &url,
|
||||
|
@ -134,7 +145,6 @@ namespace kiwix {
|
|||
std::vector<std::string> stopWords;
|
||||
|
||||
/* Others */
|
||||
unsigned int articleCount;
|
||||
float stepSize;
|
||||
|
||||
/* Boost factor */
|
||||
|
|
Loading…
Reference in New Issue