mirror of https://github.com/kiwix/libkiwix.git
+ imp. of the new indexer
This commit is contained in:
parent
9ead81ca0c
commit
735d9afd3a
|
@ -41,6 +41,14 @@ namespace kiwix {
|
||||||
stepSize(0),
|
stepSize(0),
|
||||||
keywordsBoostFactor(3) {
|
keywordsBoostFactor(3) {
|
||||||
|
|
||||||
|
this->initialize();
|
||||||
|
this->setZimFilePath(zimFilePath);
|
||||||
|
|
||||||
|
/* Read the stopwords file */
|
||||||
|
//this->readStopWordsFile("/home/kelson/kiwix/moulinkiwix/stopwords/fr");
|
||||||
|
}
|
||||||
|
|
||||||
|
void Indexer::initialize() {
|
||||||
/* Initialize mutex */
|
/* Initialize mutex */
|
||||||
pthread_mutex_init(&threadIdsMutex, NULL);
|
pthread_mutex_init(&threadIdsMutex, NULL);
|
||||||
pthread_mutex_init(&toParseQueueMutex, NULL);
|
pthread_mutex_init(&toParseQueueMutex, NULL);
|
||||||
|
@ -48,11 +56,11 @@ namespace kiwix {
|
||||||
pthread_mutex_init(&articleExtractorRunningMutex, NULL);
|
pthread_mutex_init(&articleExtractorRunningMutex, NULL);
|
||||||
pthread_mutex_init(&articleParserRunningMutex, NULL);
|
pthread_mutex_init(&articleParserRunningMutex, NULL);
|
||||||
pthread_mutex_init(&articleIndexerRunningMutex, NULL);
|
pthread_mutex_init(&articleIndexerRunningMutex, NULL);
|
||||||
|
pthread_mutex_init(&articleCountMutex, NULL);
|
||||||
|
pthread_mutex_init(&progressionMutex, NULL);
|
||||||
|
|
||||||
this->setZimFilePath(zimFilePath);
|
/* Article count & Progression */
|
||||||
|
this->setProgression(0);
|
||||||
/* Read the stopwords file */
|
|
||||||
//this->readStopWordsFile("/home/kelson/kiwix/moulinkiwix/stopwords/fr");
|
|
||||||
}
|
}
|
||||||
|
|
||||||
bool Indexer::setZimFilePath(const string &zimFilePath) {
|
bool Indexer::setZimFilePath(const string &zimFilePath) {
|
||||||
|
@ -65,7 +73,9 @@ namespace kiwix {
|
||||||
this->currentArticleOffset = this->firstArticleOffset;
|
this->currentArticleOffset = this->firstArticleOffset;
|
||||||
|
|
||||||
/* Compute few things */
|
/* Compute few things */
|
||||||
this->articleCount = this->zimFileHandler->getNamespaceCount('A');
|
kiwix::Reader reader(zimFilePath);
|
||||||
|
this->setArticleCount(reader.getArticleCount());
|
||||||
|
//this->articleCount = this->zimFileHandler->getNamespaceCount('A');
|
||||||
this->stepSize = (float)this->articleCount / (float)100;
|
this->stepSize = (float)this->articleCount / (float)100;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -118,6 +128,7 @@ namespace kiwix {
|
||||||
/* Article parser methods */
|
/* Article parser methods */
|
||||||
void *Indexer::parseArticles(void *ptr) {
|
void *Indexer::parseArticles(void *ptr) {
|
||||||
kiwix::Indexer *self = (kiwix::Indexer *)ptr;
|
kiwix::Indexer *self = (kiwix::Indexer *)ptr;
|
||||||
|
self->articleParserRunning(true);
|
||||||
size_t found;
|
size_t found;
|
||||||
indexerToken token;
|
indexerToken token;
|
||||||
|
|
||||||
|
@ -189,7 +200,10 @@ namespace kiwix {
|
||||||
/* Article indexer methods */
|
/* Article indexer methods */
|
||||||
void *Indexer::indexArticles(void *ptr) {
|
void *Indexer::indexArticles(void *ptr) {
|
||||||
kiwix::Indexer *self = (kiwix::Indexer *)ptr;
|
kiwix::Indexer *self = (kiwix::Indexer *)ptr;
|
||||||
|
self->articleIndexerRunning(true);
|
||||||
indexerToken token;
|
indexerToken token;
|
||||||
|
unsigned int stepSize = ((self->getArticleCount() / 100) < 1 ? 1 : (self->getArticleCount() / 100));
|
||||||
|
unsigned indexedArticleCount = 0;
|
||||||
|
|
||||||
while (self->popFromToIndexQueue(token)) {
|
while (self->popFromToIndexQueue(token)) {
|
||||||
self->indexNextArticle(token.url,
|
self->indexNextArticle(token.url,
|
||||||
|
@ -201,8 +215,13 @@ namespace kiwix {
|
||||||
token.size,
|
token.size,
|
||||||
token.wordCount
|
token.wordCount
|
||||||
);
|
);
|
||||||
|
|
||||||
|
if (++indexedArticleCount % stepSize == 0) {
|
||||||
|
self->setProgression(self->getProgression() + 1);
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
self->setProgression(100);
|
||||||
self->indexNextPercentPost();
|
self->indexNextPercentPost();
|
||||||
|
|
||||||
self->articleIndexerRunning(false);
|
self->articleIndexerRunning(false);
|
||||||
|
@ -287,6 +306,33 @@ namespace kiwix {
|
||||||
return true;
|
return true;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/* Article Count & Progression */
|
||||||
|
/*
 * Store a new value in the articleCount member.
 * The assignment is made while articleCountMutex is held.
 */
void Indexer::setArticleCount(unsigned int articleCount) {
  pthread_mutex_t *guard = &articleCountMutex;

  pthread_mutex_lock(guard);
  this->articleCount = articleCount;
  pthread_mutex_unlock(guard);
}
|
||||||
|
|
||||||
|
unsigned int Indexer::getArticleCount() {
|
||||||
|
pthread_mutex_lock(&articleCountMutex);
|
||||||
|
unsigned int retVal = this->articleCount;
|
||||||
|
pthread_mutex_unlock(&articleCountMutex);
|
||||||
|
return retVal;
|
||||||
|
}
|
||||||
|
|
||||||
|
/*
 * Store a new value in the progression member.
 * The assignment is made while progressionMutex is held.
 */
void Indexer::setProgression(unsigned int progression) {
  pthread_mutex_t *guard = &progressionMutex;

  pthread_mutex_lock(guard);
  this->progression = progression;
  pthread_mutex_unlock(guard);
}
|
||||||
|
|
||||||
|
unsigned int Indexer::getProgression() {
|
||||||
|
pthread_mutex_lock(&progressionMutex);
|
||||||
|
unsigned int retVal = this->progression;
|
||||||
|
pthread_mutex_unlock(&progressionMutex);
|
||||||
|
return retVal;
|
||||||
|
}
|
||||||
|
|
||||||
bool Indexer::start() {
|
bool Indexer::start() {
|
||||||
this->indexNextPercentPre();
|
this->indexNextPercentPre();
|
||||||
pthread_mutex_lock(&threadIdsMutex);
|
pthread_mutex_lock(&threadIdsMutex);
|
||||||
|
@ -301,7 +347,11 @@ namespace kiwix {
|
||||||
}
|
}
|
||||||
|
|
||||||
/*
 * Request cancellation of the worker threads and report success.
 *
 * Two variants are interleaved in the lines below: one only calls
 * pthread_cancel() on articleExtractor; the other takes threadIdsMutex,
 * calls pthread_cancel() on articleExtractor, articleParser and
 * articleIndexer, then releases the mutex.
 *
 * The return values of pthread_cancel() are not inspected and no
 * pthread_join() is issued here; the function returns true unconditionally.
 */
bool Indexer::stop() {
|
bool Indexer::stop() {
|
||||||
|
pthread_mutex_lock(&threadIdsMutex);
|
||||||
pthread_cancel(this->articleExtractor);
|
pthread_cancel(this->articleExtractor);
|
||||||
|
pthread_cancel(this->articleParser);
|
||||||
|
pthread_cancel(this->articleIndexer);
|
||||||
|
pthread_mutex_unlock(&threadIdsMutex);
|
||||||
return true;
|
return true;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -317,11 +367,6 @@ namespace kiwix {
|
||||||
return this->currentArticleOffset;
|
return this->currentArticleOffset;
|
||||||
}
|
}
|
||||||
|
|
||||||
/*
 * Placeholder progression getter: a local initialised to 0 is returned, so
 * the reported value is always 0 and no member or mutex is consulted.
 * NOTE(review): this looks like the variant superseded by the
 * mutex-protected getProgression() defined earlier in this text - confirm.
 */
unsigned int Indexer::getProgression() {
|
|
||||||
unsigned int progression = 0;
|
|
||||||
return progression;
|
|
||||||
}
|
|
||||||
|
|
||||||
/* Read the file containing the stopwords */
|
/* Read the file containing the stopwords */
|
||||||
bool Indexer::readStopWordsFile(const string path) {
|
bool Indexer::readStopWordsFile(const string path) {
|
||||||
std::string stopWord;
|
std::string stopWord;
|
||||||
|
|
|
@ -33,6 +33,7 @@
|
||||||
#include <zim/file.h>
|
#include <zim/file.h>
|
||||||
#include <zim/article.h>
|
#include <zim/article.h>
|
||||||
#include <zim/fileiterator.h>
|
#include <zim/fileiterator.h>
|
||||||
|
#include "reader.h"
|
||||||
#include "xapian/myhtmlparse.h"
|
#include "xapian/myhtmlparse.h"
|
||||||
|
|
||||||
using namespace std;
|
using namespace std;
|
||||||
|
@ -63,6 +64,7 @@ namespace kiwix {
|
||||||
|
|
||||||
private:
|
private:
|
||||||
pthread_mutex_t threadIdsMutex;
|
pthread_mutex_t threadIdsMutex;
|
||||||
|
void initialize();
|
||||||
|
|
||||||
/* Article extraction */
|
/* Article extraction */
|
||||||
pthread_t articleExtractor;
|
pthread_t articleExtractor;
|
||||||
|
@ -102,6 +104,15 @@ namespace kiwix {
|
||||||
bool popFromToIndexQueue(indexerToken &token);
|
bool popFromToIndexQueue(indexerToken &token);
|
||||||
bool isToIndexQueueEmpty();
|
bool isToIndexQueueEmpty();
|
||||||
|
|
||||||
|
/* Article Count & Progression */
|
||||||
|
unsigned int articleCount;
|
||||||
|
pthread_mutex_t articleCountMutex;
|
||||||
|
void setArticleCount(unsigned int articleCount);
|
||||||
|
unsigned int getArticleCount();
|
||||||
|
unsigned int progression;
|
||||||
|
pthread_mutex_t progressionMutex;
|
||||||
|
void setProgression(unsigned int progression);
|
||||||
|
|
||||||
protected:
|
protected:
|
||||||
virtual void indexNextPercentPre() = 0;
|
virtual void indexNextPercentPre() = 0;
|
||||||
virtual void indexNextArticle(const string &url,
|
virtual void indexNextArticle(const string &url,
|
||||||
|
@ -134,7 +145,6 @@ namespace kiwix {
|
||||||
std::vector<std::string> stopWords;
|
std::vector<std::string> stopWords;
|
||||||
|
|
||||||
/* Others */
|
/* Others */
|
||||||
unsigned int articleCount;
|
|
||||||
float stepSize;
|
float stepSize;
|
||||||
|
|
||||||
/* Boost factor */
|
/* Boost factor */
|
||||||
|
|
Loading…
Reference in New Issue