mirror of https://github.com/kiwix/libkiwix.git
Remove the indexer functionality from kiwix-lib.
This is not used anymore.
This commit is contained in:
parent
2906202056
commit
e28dbe7c7e
|
@ -1,169 +0,0 @@
|
||||||
/*
|
|
||||||
* Copyright 2014 Emmanuel Engelhart <kelson@kiwix.org>
|
|
||||||
*
|
|
||||||
* This program is free software; you can redistribute it and/or modify
|
|
||||||
* it under the terms of the GNU General Public License as published by
|
|
||||||
* the Free Software Foundation; either version 3 of the License, or
|
|
||||||
* any later version.
|
|
||||||
*
|
|
||||||
* This program is distributed in the hope that it will be useful,
|
|
||||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
|
||||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
|
||||||
* GNU General Public License for more details.
|
|
||||||
*
|
|
||||||
* You should have received a copy of the GNU General Public License
|
|
||||||
* along with this program; if not, write to the Free Software
|
|
||||||
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston,
|
|
||||||
* MA 02110-1301, USA.
|
|
||||||
*/
|
|
||||||
|
|
||||||
#ifndef KIWIX_INDEXER_H
|
|
||||||
#define KIWIX_INDEXER_H
|
|
||||||
|
|
||||||
#include <string>
|
|
||||||
#include <vector>
|
|
||||||
#include <stack>
|
|
||||||
#include <queue>
|
|
||||||
#include <fstream>
|
|
||||||
#include <iostream>
|
|
||||||
#include <sstream>
|
|
||||||
|
|
||||||
#include <pthread.h>
|
|
||||||
#include "common/stringTools.h"
|
|
||||||
#include "common/otherTools.h"
|
|
||||||
#include <zim/file.h>
|
|
||||||
#include <zim/article.h>
|
|
||||||
#include <zim/fileiterator.h>
|
|
||||||
#include "reader.h"
|
|
||||||
|
|
||||||
using namespace std;
|
|
||||||
|
|
||||||
namespace kiwix {
|
|
||||||
|
|
||||||
/*
 * Work item passed along the indexing pipeline: filled by the article
 * extractor, completed by the parser and finally consumed by the index
 * writer. Every field is carried as text.
 */
struct indexerToken {
  std::string url;            /* long URL of the article */
  std::string accentedTitle;  /* title as parsed from the HTML, or the ZIM title when the HTML has none */
  std::string title;          /* ZIM title first, replaced by the accent-stripped title after parsing */
  std::string keywords;       /* accent-stripped keywords reported by the HTML parser */
  std::string content;        /* raw article data first, accent-stripped text dump after parsing */
  std::string snippet;        /* leading excerpt of the text dump */
  std::string size;           /* raw content size divided by 1024, formatted as text */
  std::string wordCount;      /* word count of the text dump, formatted as text */
};
|
|
||||||
|
|
||||||
class Indexer {
|
|
||||||
|
|
||||||
typedef void (* ProgressCallback)(const unsigned int processedArticleCount, const unsigned int totalArticleCount);
|
|
||||||
|
|
||||||
public:
|
|
||||||
Indexer();
|
|
||||||
virtual ~Indexer();
|
|
||||||
|
|
||||||
bool start(const string zimPath, const string indexPath, ProgressCallback callback = NULL);
|
|
||||||
bool stop();
|
|
||||||
bool isRunning();
|
|
||||||
unsigned int getProgression();
|
|
||||||
void setVerboseFlag(const bool value);
|
|
||||||
|
|
||||||
protected:
|
|
||||||
virtual void indexingPrelude(const string indexPath) = 0;
|
|
||||||
virtual void index(const string &url,
|
|
||||||
const string &title,
|
|
||||||
const string &unaccentedTitle,
|
|
||||||
const string &keywords,
|
|
||||||
const string &content,
|
|
||||||
const string &snippet,
|
|
||||||
const string &size,
|
|
||||||
const string &wordCount) = 0;
|
|
||||||
virtual void flush() = 0;
|
|
||||||
virtual void indexingPostlude(const string indexPath) = 0;
|
|
||||||
|
|
||||||
/* Others */
|
|
||||||
unsigned int countWords(const string &text);
|
|
||||||
|
|
||||||
/* Boost factor */
|
|
||||||
unsigned int keywordsBoostFactor;
|
|
||||||
inline unsigned int getTitleBoostFactor(const unsigned int contentLength) {
|
|
||||||
return contentLength / 500 + 1;
|
|
||||||
}
|
|
||||||
|
|
||||||
/* Verbose */
|
|
||||||
pthread_mutex_t verboseMutex;
|
|
||||||
bool getVerboseFlag();
|
|
||||||
bool verboseFlag;
|
|
||||||
|
|
||||||
private:
|
|
||||||
ProgressCallback progressCallback;
|
|
||||||
pthread_mutex_t threadIdsMutex;
|
|
||||||
|
|
||||||
/* Article extraction */
|
|
||||||
pthread_t articleExtractor;
|
|
||||||
pthread_mutex_t articleExtractorRunningMutex;
|
|
||||||
static void *extractArticles(void *ptr);
|
|
||||||
bool articleExtractorRunningFlag;
|
|
||||||
bool isArticleExtractorRunning();
|
|
||||||
void articleExtractorRunning(bool value);
|
|
||||||
|
|
||||||
/* Article parsing */
|
|
||||||
pthread_t articleParser;
|
|
||||||
pthread_mutex_t articleParserRunningMutex;
|
|
||||||
static void *parseArticles(void *ptr);
|
|
||||||
bool articleParserRunningFlag;
|
|
||||||
bool isArticleParserRunning();
|
|
||||||
void articleParserRunning(bool value);
|
|
||||||
|
|
||||||
/* Index writting */
|
|
||||||
pthread_t articleIndexer;
|
|
||||||
pthread_mutex_t articleIndexerRunningMutex;
|
|
||||||
static void *indexArticles(void *ptr);
|
|
||||||
bool articleIndexerRunningFlag;
|
|
||||||
bool isArticleIndexerRunning();
|
|
||||||
void articleIndexerRunning(bool value);
|
|
||||||
|
|
||||||
/* To parse queue */
|
|
||||||
std::queue<indexerToken> toParseQueue;
|
|
||||||
pthread_mutex_t toParseQueueMutex;
|
|
||||||
void pushToParseQueue(indexerToken &token);
|
|
||||||
bool popFromToParseQueue(indexerToken &token);
|
|
||||||
bool isToParseQueueEmpty();
|
|
||||||
|
|
||||||
/* To index queue */
|
|
||||||
std::queue<indexerToken> toIndexQueue;
|
|
||||||
pthread_mutex_t toIndexQueueMutex;
|
|
||||||
void pushToIndexQueue(indexerToken &token);
|
|
||||||
bool popFromToIndexQueue(indexerToken &token);
|
|
||||||
bool isToIndexQueueEmpty();
|
|
||||||
|
|
||||||
/* Article Count & Progression */
|
|
||||||
unsigned int articleCount;
|
|
||||||
pthread_mutex_t articleCountMutex;
|
|
||||||
void setArticleCount(const unsigned int articleCount);
|
|
||||||
unsigned int getArticleCount();
|
|
||||||
|
|
||||||
/* Progression */
|
|
||||||
unsigned int progression;
|
|
||||||
pthread_mutex_t progressionMutex;
|
|
||||||
void setProgression(const unsigned int progression);
|
|
||||||
/* getProgression() is public */
|
|
||||||
|
|
||||||
/* ZIM path */
|
|
||||||
pthread_mutex_t zimPathMutex;
|
|
||||||
string zimPath;
|
|
||||||
void setZimPath(const string path);
|
|
||||||
string getZimPath();
|
|
||||||
|
|
||||||
/* Index path */
|
|
||||||
pthread_mutex_t indexPathMutex;
|
|
||||||
string indexPath;
|
|
||||||
void setIndexPath(const string path);
|
|
||||||
string getIndexPath();
|
|
||||||
|
|
||||||
/* ZIM id */
|
|
||||||
pthread_mutex_t zimIdMutex;
|
|
||||||
string zimId;
|
|
||||||
void setZimId(const string id);
|
|
||||||
string getZimId();
|
|
||||||
};
|
|
||||||
}
|
|
||||||
|
|
||||||
#endif
|
|
|
@ -5,12 +5,8 @@ headers = [
|
||||||
'searcher.h'
|
'searcher.h'
|
||||||
]
|
]
|
||||||
|
|
||||||
if not get_option('android')
|
|
||||||
headers += ['indexer.h']
|
|
||||||
endif
|
|
||||||
|
|
||||||
if xapian_dep.found()
|
if xapian_dep.found()
|
||||||
headers += ['xapianIndexer.h', 'xapianSearcher.h']
|
headers += ['xapianSearcher.h']
|
||||||
endif
|
endif
|
||||||
|
|
||||||
install_headers(headers, subdir:'kiwix')
|
install_headers(headers, subdir:'kiwix')
|
||||||
|
|
|
@ -1,56 +0,0 @@
|
||||||
/*
|
|
||||||
* Copyright 2011 Emmanuel Engelhart <kelson@kiwix.org>
|
|
||||||
*
|
|
||||||
* This program is free software; you can redistribute it and/or modify
|
|
||||||
* it under the terms of the GNU General Public License as published by
|
|
||||||
* the Free Software Foundation; either version 3 of the License, or
|
|
||||||
* any later version.
|
|
||||||
*
|
|
||||||
* This program is distributed in the hope that it will be useful,
|
|
||||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
|
||||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
|
||||||
* GNU General Public License for more details.
|
|
||||||
*
|
|
||||||
* You should have received a copy of the GNU General Public License
|
|
||||||
* along with this program; if not, write to the Free Software
|
|
||||||
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston,
|
|
||||||
* MA 02110-1301, USA.
|
|
||||||
*/
|
|
||||||
|
|
||||||
#ifndef KIWIX_XAPIAN_INDEXER_H
|
|
||||||
#define KIWIX_XAPIAN_INDEXER_H
|
|
||||||
|
|
||||||
#include <xapian.h>
|
|
||||||
#include "indexer.h"
|
|
||||||
|
|
||||||
using namespace std;
|
|
||||||
|
|
||||||
namespace kiwix {
|
|
||||||
|
|
||||||
class XapianIndexer : public Indexer {
|
|
||||||
|
|
||||||
public:
|
|
||||||
XapianIndexer();
|
|
||||||
|
|
||||||
protected:
|
|
||||||
void indexingPrelude(const string indexPath);
|
|
||||||
void index(const string &url,
|
|
||||||
const string &title,
|
|
||||||
const string &unaccentedTitle,
|
|
||||||
const string &keywords,
|
|
||||||
const string &content,
|
|
||||||
const string &snippet,
|
|
||||||
const string &size,
|
|
||||||
const string &wordCount);
|
|
||||||
void flush();
|
|
||||||
void indexingPostlude(const string indexPath);
|
|
||||||
|
|
||||||
Xapian::WritableDatabase writableDatabase;
|
|
||||||
Xapian::Stem stemmer;
|
|
||||||
Xapian::SimpleStopper stopper;
|
|
||||||
Xapian::TermGenerator indexer;
|
|
||||||
};
|
|
||||||
|
|
||||||
}
|
|
||||||
|
|
||||||
#endif
|
|
513
src/indexer.cpp
513
src/indexer.cpp
|
@ -1,513 +0,0 @@
|
||||||
/*
|
|
||||||
* Copyright 2011-2014 Emmanuel Engelhart <kelson@kiwix.org>
|
|
||||||
*
|
|
||||||
* This program is free software; you can redistribute it and/or modify
|
|
||||||
* it under the terms of the GNU General Public License as published by
|
|
||||||
* the Free Software Foundation; either version 3 of the License, or
|
|
||||||
* any later version.
|
|
||||||
*
|
|
||||||
* This program is distributed in the hope that it will be useful,
|
|
||||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
|
||||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
|
||||||
* GNU General Public License for more details.
|
|
||||||
*
|
|
||||||
* You should have received a copy of the GNU General Public License
|
|
||||||
* along with this program; if not, write to the Free Software
|
|
||||||
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston,
|
|
||||||
* MA 02110-1301, USA.
|
|
||||||
*/
|
|
||||||
|
|
||||||
#include "indexer.h"
|
|
||||||
#include "xapian/myhtmlparse.h"
|
|
||||||
#include "kiwixlib-resources.h"
|
|
||||||
|
|
||||||
namespace kiwix {
|
|
||||||
|
|
||||||
/* Count word */
|
|
||||||
unsigned int Indexer::countWords(const string &text) {
|
|
||||||
unsigned int numWords = 1;
|
|
||||||
unsigned int length = text.size();
|
|
||||||
|
|
||||||
for(unsigned int i=0; i<length;) {
|
|
||||||
while(i<length && text[i] != ' ') {
|
|
||||||
i++;
|
|
||||||
}
|
|
||||||
numWords++;
|
|
||||||
i++;
|
|
||||||
}
|
|
||||||
|
|
||||||
return numWords;
|
|
||||||
}
|
|
||||||
|
|
||||||
/* Constructor */
|
|
||||||
/*
 * Constructor: gives every scalar member a defined value and creates the
 * mutexes guarding the shared state. progressCallback must start as NULL
 * because the extractor thread tests it even when start() was called
 * without a callback.
 */
Indexer::Indexer() :
  keywordsBoostFactor(3),
  verboseFlag(false),
  progressCallback(NULL),
  articleExtractorRunningFlag(false),
  articleParserRunningFlag(false),
  articleIndexerRunningFlag(false),
  articleCount(0),
  progression(0) {

  /* Initialize mutex */
  pthread_mutex_init(&threadIdsMutex, NULL);
  pthread_mutex_init(&toParseQueueMutex, NULL);
  pthread_mutex_init(&toIndexQueueMutex, NULL);
  pthread_mutex_init(&articleExtractorRunningMutex, NULL);
  pthread_mutex_init(&articleParserRunningMutex, NULL);
  pthread_mutex_init(&articleIndexerRunningMutex, NULL);
  pthread_mutex_init(&articleCountMutex, NULL);
  pthread_mutex_init(&zimPathMutex, NULL);
  pthread_mutex_init(&zimIdMutex, NULL);
  pthread_mutex_init(&indexPathMutex, NULL);
  pthread_mutex_init(&progressionMutex, NULL);
  pthread_mutex_init(&verboseMutex, NULL);
}
|
|
||||||
|
|
||||||
/* Destructor */
|
|
||||||
/*
 * Destructor: releases the mutexes created by the constructor.
 * NOTE(review): the worker threads are detached and keep using these
 * mutexes while they run; callers are presumably expected to make sure
 * isRunning() is false before destroying the object - confirm.
 */
Indexer::~Indexer() {
  pthread_mutex_destroy(&threadIdsMutex);
  pthread_mutex_destroy(&toParseQueueMutex);
  pthread_mutex_destroy(&toIndexQueueMutex);
  pthread_mutex_destroy(&articleExtractorRunningMutex);
  pthread_mutex_destroy(&articleParserRunningMutex);
  pthread_mutex_destroy(&articleIndexerRunningMutex);
  pthread_mutex_destroy(&articleCountMutex);
  pthread_mutex_destroy(&zimPathMutex);
  pthread_mutex_destroy(&zimIdMutex);
  pthread_mutex_destroy(&indexPathMutex);
  pthread_mutex_destroy(&progressionMutex);
  pthread_mutex_destroy(&verboseMutex);
}
|
|
||||||
|
|
||||||
|
|
||||||
#pragma mark - Extractor
|
|
||||||
|
|
||||||
/* Article extractor methods */
|
|
||||||
void *Indexer::extractArticles(void *ptr) {
|
|
||||||
pthread_setcanceltype(PTHREAD_CANCEL_DEFERRED, NULL);
|
|
||||||
kiwix::Indexer *self = (kiwix::Indexer *)ptr;
|
|
||||||
|
|
||||||
/* Get the number of article to index and the ZIM id */
|
|
||||||
kiwix::Reader reader(self->getZimPath());
|
|
||||||
unsigned int articleCount = reader.getArticleCount();
|
|
||||||
self->setArticleCount(articleCount);
|
|
||||||
string zimId = reader.getId();
|
|
||||||
self->setZimId(zimId);
|
|
||||||
|
|
||||||
/* Progression */
|
|
||||||
unsigned int readArticleCount = 0;
|
|
||||||
unsigned int currentProgression = 0;
|
|
||||||
self->setProgression(currentProgression);
|
|
||||||
unsigned int newProgress;
|
|
||||||
|
|
||||||
/* StopWords */
|
|
||||||
// self->readStopWords(reader.getLanguage());
|
|
||||||
|
|
||||||
/* Goes trough all articles */
|
|
||||||
zim::File *zimHandler = reader.getZimFileHandler();
|
|
||||||
unsigned int currentOffset = zimHandler->getNamespaceBeginOffset('A');
|
|
||||||
unsigned int lastOffset = zimHandler->getNamespaceEndOffset('A');
|
|
||||||
zim::Article currentArticle;
|
|
||||||
|
|
||||||
while (currentOffset < lastOffset) {
|
|
||||||
currentArticle = zimHandler->getArticle(currentOffset);
|
|
||||||
|
|
||||||
if (!currentArticle.isRedirect()) {
|
|
||||||
/* Add articles to the queue */
|
|
||||||
indexerToken token;
|
|
||||||
token.title = currentArticle.getTitle();
|
|
||||||
token.url = currentArticle.getLongUrl();
|
|
||||||
token.content = string(currentArticle.getData().data(), currentArticle.getData().size());
|
|
||||||
self->pushToParseQueue(token);
|
|
||||||
readArticleCount += 1;
|
|
||||||
|
|
||||||
/* Update progress */
|
|
||||||
if (self->progressCallback) {
|
|
||||||
self->progressCallback(readArticleCount, articleCount);
|
|
||||||
}
|
|
||||||
newProgress = (unsigned int)((float)readArticleCount / (float)articleCount * 100);
|
|
||||||
if (newProgress != currentProgression) {
|
|
||||||
self->setProgression(newProgress);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
currentOffset += 1;
|
|
||||||
|
|
||||||
/* Test if the thread should be cancelled */
|
|
||||||
pthread_testcancel();
|
|
||||||
}
|
|
||||||
|
|
||||||
self->articleExtractorRunning(false);
|
|
||||||
pthread_exit(NULL);
|
|
||||||
return NULL;
|
|
||||||
}
|
|
||||||
|
|
||||||
void Indexer::articleExtractorRunning(bool value) {
|
|
||||||
pthread_mutex_lock(&articleExtractorRunningMutex);
|
|
||||||
this->articleExtractorRunningFlag = value;
|
|
||||||
pthread_mutex_unlock(&articleExtractorRunningMutex);
|
|
||||||
}
|
|
||||||
|
|
||||||
bool Indexer::isArticleExtractorRunning() {
|
|
||||||
pthread_mutex_lock(&articleExtractorRunningMutex);
|
|
||||||
bool retVal = this->articleExtractorRunningFlag;
|
|
||||||
pthread_mutex_unlock(&articleExtractorRunningMutex);
|
|
||||||
return retVal;
|
|
||||||
}
|
|
||||||
|
|
||||||
#pragma mark - Parser
|
|
||||||
|
|
||||||
/* Article parser methods */
|
|
||||||
void *Indexer::parseArticles(void *ptr) {
|
|
||||||
pthread_setcanceltype(PTHREAD_CANCEL_DEFERRED, NULL);
|
|
||||||
kiwix::Indexer *self = (kiwix::Indexer *)ptr;
|
|
||||||
size_t found;
|
|
||||||
indexerToken token;
|
|
||||||
|
|
||||||
while (self->popFromToParseQueue(token)) {
|
|
||||||
MyHtmlParser htmlParser;
|
|
||||||
|
|
||||||
/* The parser generate a lot of exceptions which should be avoided */
|
|
||||||
try {
|
|
||||||
htmlParser.parse_html(token.content, "UTF-8", true);
|
|
||||||
} catch (...) {
|
|
||||||
}
|
|
||||||
|
|
||||||
/* If content does not have the noindex meta tag */
|
|
||||||
/* Seems that the parser generates an exception in such case */
|
|
||||||
found = htmlParser.dump.find("NOINDEX");
|
|
||||||
|
|
||||||
if (found == string::npos) {
|
|
||||||
/* Get the accented title */
|
|
||||||
token.accentedTitle = (htmlParser.title.empty() ? token.title : htmlParser.title);
|
|
||||||
|
|
||||||
/* count words */
|
|
||||||
stringstream countWordStringStream;
|
|
||||||
countWordStringStream << self->countWords(htmlParser.dump);
|
|
||||||
token.wordCount = countWordStringStream.str();
|
|
||||||
|
|
||||||
/* snippet */
|
|
||||||
std::string snippet = std::string(htmlParser.dump, 0, 300);
|
|
||||||
std::string::size_type last = snippet.find_last_of('.');
|
|
||||||
if (last == snippet.npos)
|
|
||||||
last = snippet.find_last_of(' ');
|
|
||||||
if (last != snippet.npos)
|
|
||||||
snippet = snippet.substr(0, last);
|
|
||||||
token.snippet = snippet;
|
|
||||||
|
|
||||||
/* size */
|
|
||||||
stringstream sizeStringStream;
|
|
||||||
sizeStringStream << token.content.size() / 1024;
|
|
||||||
token.size = sizeStringStream.str();
|
|
||||||
|
|
||||||
/* Remove accent */
|
|
||||||
token.title = kiwix::removeAccents(token.accentedTitle);
|
|
||||||
token.keywords = kiwix::removeAccents(htmlParser.keywords);
|
|
||||||
token.content = kiwix::removeAccents(htmlParser.dump);
|
|
||||||
self->pushToIndexQueue(token);
|
|
||||||
}
|
|
||||||
|
|
||||||
/* Test if the thread should be cancelled */
|
|
||||||
pthread_testcancel();
|
|
||||||
}
|
|
||||||
|
|
||||||
self->articleParserRunning(false);
|
|
||||||
pthread_exit(NULL);
|
|
||||||
return NULL;
|
|
||||||
}
|
|
||||||
|
|
||||||
void Indexer::articleParserRunning(bool value) {
|
|
||||||
pthread_mutex_lock(&articleParserRunningMutex);
|
|
||||||
this->articleParserRunningFlag = value;
|
|
||||||
pthread_mutex_unlock(&articleParserRunningMutex);
|
|
||||||
}
|
|
||||||
|
|
||||||
bool Indexer::isArticleParserRunning() {
|
|
||||||
pthread_mutex_lock(&articleParserRunningMutex);
|
|
||||||
bool retVal = this->articleParserRunningFlag;
|
|
||||||
pthread_mutex_unlock(&articleParserRunningMutex);
|
|
||||||
return retVal;
|
|
||||||
}
|
|
||||||
|
|
||||||
#pragma mark - Indexer
|
|
||||||
|
|
||||||
/* Article indexer methods */
|
|
||||||
void *Indexer::indexArticles(void *ptr) {
|
|
||||||
pthread_setcanceltype(PTHREAD_CANCEL_DEFERRED, NULL);
|
|
||||||
kiwix::Indexer *self = (kiwix::Indexer *)ptr;
|
|
||||||
unsigned int indexedArticleCount = 0;
|
|
||||||
indexerToken token;
|
|
||||||
|
|
||||||
self->indexingPrelude(self->getIndexPath());
|
|
||||||
|
|
||||||
while (self->popFromToIndexQueue(token)) {
|
|
||||||
self->index(token.url,
|
|
||||||
token.accentedTitle,
|
|
||||||
token.title,
|
|
||||||
token.keywords,
|
|
||||||
token.content,
|
|
||||||
token.snippet,
|
|
||||||
token.size,
|
|
||||||
token.wordCount
|
|
||||||
);
|
|
||||||
|
|
||||||
indexedArticleCount += 1;
|
|
||||||
|
|
||||||
/* Make a hard-disk flush every 10.000 articles */
|
|
||||||
if (indexedArticleCount % 5000 == 0) {
|
|
||||||
self->flush();
|
|
||||||
}
|
|
||||||
|
|
||||||
/* Test if the thread should be cancelled */
|
|
||||||
pthread_testcancel();
|
|
||||||
}
|
|
||||||
self->indexingPostlude(self->getIndexPath());
|
|
||||||
|
|
||||||
/* Write content id file */
|
|
||||||
string path = appendToDirectory(self->getIndexPath(), "content.id");
|
|
||||||
writeTextFile(path, self->getZimId());
|
|
||||||
|
|
||||||
self->setProgression(100);
|
|
||||||
kiwix::sleep(100);
|
|
||||||
|
|
||||||
self->articleIndexerRunning(false);
|
|
||||||
pthread_exit(NULL);
|
|
||||||
return NULL;
|
|
||||||
}
|
|
||||||
|
|
||||||
void Indexer::articleIndexerRunning(bool value) {
|
|
||||||
pthread_mutex_lock(&articleIndexerRunningMutex);
|
|
||||||
this->articleIndexerRunningFlag = value;
|
|
||||||
pthread_mutex_unlock(&articleIndexerRunningMutex);
|
|
||||||
}
|
|
||||||
|
|
||||||
bool Indexer::isArticleIndexerRunning() {
|
|
||||||
pthread_mutex_lock(&articleIndexerRunningMutex);
|
|
||||||
bool retVal = this->articleIndexerRunningFlag;
|
|
||||||
pthread_mutex_unlock(&articleIndexerRunningMutex);
|
|
||||||
return retVal;
|
|
||||||
}
|
|
||||||
|
|
||||||
#pragma mark - Parse Queue
|
|
||||||
|
|
||||||
/* ToParseQueue methods */
|
|
||||||
bool Indexer::isToParseQueueEmpty() {
|
|
||||||
pthread_mutex_lock(&toParseQueueMutex);
|
|
||||||
bool retVal = this->toParseQueue.empty();
|
|
||||||
pthread_mutex_unlock(&toParseQueueMutex);
|
|
||||||
return retVal;
|
|
||||||
}
|
|
||||||
|
|
||||||
/*
 * Appends a copy of `token` to the to-parse queue, then throttles the
 * producer according to the queue length.
 */
void Indexer::pushToParseQueue(indexerToken &token) {
  pthread_mutex_lock(&toParseQueueMutex);
  this->toParseQueue.push(token);
  /* Sample the length while still holding the lock: the consumer thread
     pops from the same queue concurrently */
  const size_t queuedCount = this->toParseQueue.size();
  pthread_mutex_unlock(&toParseQueueMutex);

  /* Evaluates to 0 while fewer than 2000 tokens are queued, then grows by
     1000 for every further 2000 tokens */
  kiwix::sleep(int(queuedCount / 200) / 10 * 1000);
}
|
|
||||||
|
|
||||||
/*
 * Moves the oldest queued token into `token`.
 * Polls while the queue is empty and the extractor is still flagged as
 * running. Returns false once the queue is empty and the extractor has
 * stopped, true when a token was delivered.
 */
bool Indexer::popFromToParseQueue(indexerToken &token) {
  /* Wait for the producer */
  while (this->isToParseQueueEmpty() && this->isArticleExtractorRunning()) {
    kiwix::sleep(500);

    const bool verbose = this->getVerboseFlag();
    if (verbose) {
      std::cout << "Waiting... ToParseQueue is empty for now..." << std::endl;
    }

    pthread_testcancel();
  }

  /* Nothing left and nothing more to come */
  if (this->isToParseQueueEmpty()) {
    return false;
  }

  pthread_mutex_lock(&toParseQueueMutex);
  token = this->toParseQueue.front();
  this->toParseQueue.pop();
  pthread_mutex_unlock(&toParseQueueMutex);

  return true;
}
|
|
||||||
|
|
||||||
#pragma mark - Index Queue
|
|
||||||
|
|
||||||
/* ToIndexQueue methods */
|
|
||||||
bool Indexer::isToIndexQueueEmpty() {
|
|
||||||
pthread_mutex_lock(&toIndexQueueMutex);
|
|
||||||
bool retVal = this->toIndexQueue.empty();
|
|
||||||
pthread_mutex_unlock(&toIndexQueueMutex);
|
|
||||||
return retVal;
|
|
||||||
}
|
|
||||||
|
|
||||||
/*
 * Appends a copy of `token` to the to-index queue, then throttles the
 * producer according to the queue length.
 */
void Indexer::pushToIndexQueue(indexerToken &token) {
  pthread_mutex_lock(&toIndexQueueMutex);
  this->toIndexQueue.push(token);
  /* Sample the length while still holding the lock: the consumer thread
     pops from the same queue concurrently */
  const size_t queuedCount = this->toIndexQueue.size();
  pthread_mutex_unlock(&toIndexQueueMutex);

  /* Evaluates to 0 while fewer than 2000 tokens are queued, then grows by
     1000 for every further 2000 tokens */
  kiwix::sleep(int(queuedCount / 200) / 10 * 1000);
}
|
|
||||||
|
|
||||||
/*
 * Moves the oldest queued token into `token`.
 * Polls while the queue is empty and the parser is still flagged as
 * running. Returns false once the queue is empty and the parser has
 * stopped, true when a token was delivered.
 */
bool Indexer::popFromToIndexQueue(indexerToken &token) {
  /* Wait for the producer */
  while (this->isToIndexQueueEmpty() && this->isArticleParserRunning()) {
    kiwix::sleep(500);

    const bool verbose = this->getVerboseFlag();
    if (verbose) {
      std::cout << "Waiting... ToIndexQueue is empty for now..." << std::endl;
    }

    pthread_testcancel();
  }

  /* Nothing left and nothing more to come */
  if (this->isToIndexQueueEmpty()) {
    return false;
  }

  pthread_mutex_lock(&toIndexQueueMutex);
  token = this->toIndexQueue.front();
  this->toIndexQueue.pop();
  pthread_mutex_unlock(&toIndexQueueMutex);

  return true;
}
|
|
||||||
|
|
||||||
#pragma mark - Properties Getter & Setter
|
|
||||||
|
|
||||||
/* ZIM & Index methods */
|
|
||||||
void Indexer::setZimPath(const string path) {
|
|
||||||
pthread_mutex_lock(&zimPathMutex);
|
|
||||||
this->zimPath = path;
|
|
||||||
pthread_mutex_unlock(&zimPathMutex);
|
|
||||||
}
|
|
||||||
|
|
||||||
/* Returns a copy of the ZIM file path, taken under the ZIM path mutex. */
string Indexer::getZimPath() {
  pthread_mutex_t *const guard = &zimPathMutex;

  pthread_mutex_lock(guard);
  string snapshot = zimPath;
  pthread_mutex_unlock(guard);

  return snapshot;
}
|
|
||||||
|
|
||||||
void Indexer::setIndexPath(const string path) {
|
|
||||||
pthread_mutex_lock(&indexPathMutex);
|
|
||||||
this->indexPath = path;
|
|
||||||
pthread_mutex_unlock(&indexPathMutex);
|
|
||||||
}
|
|
||||||
|
|
||||||
/* Returns a copy of the index path, taken under the index path mutex. */
string Indexer::getIndexPath() {
  pthread_mutex_t *const guard = &indexPathMutex;

  pthread_mutex_lock(guard);
  string snapshot = indexPath;
  pthread_mutex_unlock(guard);

  return snapshot;
}
|
|
||||||
|
|
||||||
/* Records the total number of articles, under the article count mutex. */
void Indexer::setArticleCount(const unsigned int articleCount) {
  pthread_mutex_t *const guard = &articleCountMutex;

  pthread_mutex_lock(guard);
  /* The parameter shadows the member, hence the explicit this-> */
  this->articleCount = articleCount;
  pthread_mutex_unlock(guard);
}
|
|
||||||
|
|
||||||
unsigned int Indexer::getArticleCount() {
|
|
||||||
pthread_mutex_lock(&articleCountMutex);
|
|
||||||
unsigned int retVal = this->articleCount;
|
|
||||||
pthread_mutex_unlock(&articleCountMutex);
|
|
||||||
return retVal;
|
|
||||||
}
|
|
||||||
|
|
||||||
/* Records the progression value, under the progression mutex. */
void Indexer::setProgression(const unsigned int progression) {
  pthread_mutex_t *const guard = &progressionMutex;

  pthread_mutex_lock(guard);
  /* The parameter shadows the member, hence the explicit this-> */
  this->progression = progression;
  pthread_mutex_unlock(guard);
}
|
|
||||||
|
|
||||||
unsigned int Indexer::getProgression() {
|
|
||||||
pthread_mutex_lock(&progressionMutex);
|
|
||||||
unsigned int retVal = this->progression;
|
|
||||||
pthread_mutex_unlock(&progressionMutex);
|
|
||||||
return retVal;
|
|
||||||
}
|
|
||||||
|
|
||||||
void Indexer::setZimId(const string id) {
|
|
||||||
pthread_mutex_lock(&zimIdMutex);
|
|
||||||
this->zimId = id;
|
|
||||||
pthread_mutex_unlock(&zimIdMutex);
|
|
||||||
}
|
|
||||||
|
|
||||||
/* Returns a copy of the ZIM id, taken under the ZIM id mutex. */
string Indexer::getZimId() {
  pthread_mutex_t *const guard = &zimIdMutex;

  pthread_mutex_lock(guard);
  string snapshot = zimId;
  pthread_mutex_unlock(guard);

  return snapshot;
}
|
|
||||||
|
|
||||||
#pragma mark - Status Management
|
|
||||||
|
|
||||||
/* Manage */
|
|
||||||
bool Indexer::start(const string zimPath, const string indexPath, ProgressCallback callback) {
|
|
||||||
if (this->getVerboseFlag()) {
|
|
||||||
std::cout << "Indexing of '" << zimPath << "' starting..." <<std::endl;
|
|
||||||
}
|
|
||||||
|
|
||||||
if (callback) {
|
|
||||||
this->progressCallback = callback;
|
|
||||||
}
|
|
||||||
|
|
||||||
this->setArticleCount(0);
|
|
||||||
this->setProgression(0);
|
|
||||||
this->setZimPath(zimPath);
|
|
||||||
this->setIndexPath(indexPath);
|
|
||||||
|
|
||||||
pthread_mutex_lock(&threadIdsMutex);
|
|
||||||
this->articleExtractorRunning(true);
|
|
||||||
pthread_create(&(this->articleExtractor), NULL, Indexer::extractArticles, (void*)this);
|
|
||||||
pthread_detach(this->articleExtractor);
|
|
||||||
|
|
||||||
while(this->isArticleExtractorRunning() && this->getArticleCount() == 0) {
|
|
||||||
kiwix::sleep(100);
|
|
||||||
}
|
|
||||||
|
|
||||||
this->articleParserRunning(true);
|
|
||||||
pthread_create(&(this->articleParser), NULL, Indexer::parseArticles, (void*)this);
|
|
||||||
pthread_detach(this->articleParser);
|
|
||||||
|
|
||||||
this->articleIndexerRunning(true);
|
|
||||||
pthread_create(&(this->articleIndexer), NULL, Indexer::indexArticles, (void*)this);
|
|
||||||
pthread_detach(this->articleIndexer);
|
|
||||||
pthread_mutex_unlock(&threadIdsMutex);
|
|
||||||
|
|
||||||
return true;
|
|
||||||
}
|
|
||||||
|
|
||||||
bool Indexer::isRunning() {
|
|
||||||
if (this->getVerboseFlag()) {
|
|
||||||
std::cout << "isArticleExtractor running: " << (this->isArticleExtractorRunning() ? "yes" : "no") << std::endl;
|
|
||||||
std::cout << "isArticleParser running: " << (this->isArticleParserRunning() ? "yes" : "no") << std::endl;
|
|
||||||
std::cout << "isArticleIndexer running: " << (this->isArticleIndexerRunning() ? "yes" : "no") << std::endl;
|
|
||||||
}
|
|
||||||
|
|
||||||
return this->isArticleExtractorRunning() || this->isArticleIndexerRunning() || this->isArticleParserRunning();
|
|
||||||
}
|
|
||||||
|
|
||||||
bool Indexer::stop() {
|
|
||||||
if (this->isRunning()) {
|
|
||||||
bool isArticleExtractorRunning = this->isArticleExtractorRunning();
|
|
||||||
bool isArticleIndexerRunning = this->isArticleIndexerRunning();
|
|
||||||
bool isArticleParserRunning = this->isArticleParserRunning();
|
|
||||||
|
|
||||||
pthread_mutex_lock(&threadIdsMutex);
|
|
||||||
|
|
||||||
if (isArticleIndexerRunning) {
|
|
||||||
pthread_cancel(this->articleIndexer);
|
|
||||||
this->articleIndexerRunning(false);
|
|
||||||
}
|
|
||||||
if (isArticleParserRunning) {
|
|
||||||
pthread_cancel(this->articleParser);
|
|
||||||
this->articleParserRunning(false);
|
|
||||||
}
|
|
||||||
if (isArticleExtractorRunning) {
|
|
||||||
pthread_cancel(this->articleExtractor);
|
|
||||||
this->articleExtractorRunning(false);
|
|
||||||
}
|
|
||||||
|
|
||||||
pthread_mutex_unlock(&threadIdsMutex);
|
|
||||||
}
|
|
||||||
|
|
||||||
return true;
|
|
||||||
}
|
|
||||||
|
|
||||||
#pragma mark - verbose
|
|
||||||
|
|
||||||
/* Manage the verboseFlag */
|
|
||||||
void Indexer::setVerboseFlag(const bool value) {
|
|
||||||
pthread_mutex_lock(&verboseMutex);
|
|
||||||
this->verboseFlag = value;
|
|
||||||
pthread_mutex_unlock(&verboseMutex);
|
|
||||||
}
|
|
||||||
|
|
||||||
bool Indexer::getVerboseFlag() {
|
|
||||||
bool value;
|
|
||||||
pthread_mutex_lock(&verboseMutex);
|
|
||||||
value = this->verboseFlag;
|
|
||||||
pthread_mutex_unlock(&verboseMutex);
|
|
||||||
return value;
|
|
||||||
}
|
|
||||||
|
|
||||||
}
|
|
|
@ -16,14 +16,9 @@ kiwix_sources += lib_resources
|
||||||
|
|
||||||
if xapian_dep.found()
|
if xapian_dep.found()
|
||||||
kiwix_sources += ['xapianSearcher.cpp']
|
kiwix_sources += ['xapianSearcher.cpp']
|
||||||
if not get_option('android')
|
|
||||||
kiwix_sources += ['xapianIndexer.cpp']
|
|
||||||
endif
|
|
||||||
endif
|
endif
|
||||||
|
|
||||||
if not get_option('android')
|
if get_option('android')
|
||||||
kiwix_sources += ['indexer.cpp']
|
|
||||||
else
|
|
||||||
subdir('android')
|
subdir('android')
|
||||||
endif
|
endif
|
||||||
|
|
||||||
|
|
|
@ -1,111 +0,0 @@
|
||||||
/*
|
|
||||||
* Copyright 2011 Emmanuel Engelhart <kelson@kiwix.org>
|
|
||||||
*
|
|
||||||
* This program is free software; you can redistribute it and/or modify
|
|
||||||
* it under the terms of the GNU General Public License as published by
|
|
||||||
* the Free Software Foundation; either version 3 of the License, or
|
|
||||||
* any later version.
|
|
||||||
*
|
|
||||||
* This program is distributed in the hope that it will be useful,
|
|
||||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
|
||||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
|
||||||
* GNU General Public License for more details.
|
|
||||||
*
|
|
||||||
* You should have received a copy of the GNU General Public License
|
|
||||||
* along with this program; if not, write to the Free Software
|
|
||||||
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston,
|
|
||||||
* MA 02110-1301, USA.
|
|
||||||
*/
|
|
||||||
|
|
||||||
#include "xapianIndexer.h"
|
|
||||||
|
|
||||||
namespace kiwix {
|
|
||||||
|
|
||||||
/* Constructor */
|
|
||||||
/*
 * Constructor: every member is default constructed. The stemmer set-up
 * below is kept disabled, as in the original code.
 */
XapianIndexer::XapianIndexer() {
  /* Disabled:
     stemmer(Xapian::Stem("french")) {
     this->indexer.set_stemmer(this->stemmer);
  */
}
|
|
||||||
|
|
||||||
/*
 * Opens (creating or overwriting) the temporary glass database at
 * `indexPath` + ".tmp" and starts the first transaction.
 *
 * The former stop-word insertion was removed: it read a `stopWords` member
 * that neither Indexer nor XapianIndexer declares, and the code loading
 * stop words is commented out in the extractor, so the list could never
 * hold an entry.
 */
void XapianIndexer::indexingPrelude(const string indexPath) {
  this->writableDatabase = Xapian::WritableDatabase(indexPath+".tmp", Xapian::DB_CREATE_OR_OVERWRITE | Xapian::DB_BACKEND_GLASS);
  this->writableDatabase.begin_transaction(true);
}
|
|
||||||
|
|
||||||
void XapianIndexer::index(const string &url,
|
|
||||||
const string &title,
|
|
||||||
const string &unaccentedTitle,
|
|
||||||
const string &keywords,
|
|
||||||
const string &content,
|
|
||||||
const string &snippet,
|
|
||||||
const string &size,
|
|
||||||
const string &wordCount) {
|
|
||||||
|
|
||||||
/* Put the data in the document */
|
|
||||||
Xapian::Document currentDocument;
|
|
||||||
currentDocument.clear_values();
|
|
||||||
currentDocument.add_value(0, title);
|
|
||||||
currentDocument.add_value(1, snippet);
|
|
||||||
currentDocument.add_value(2, size);
|
|
||||||
currentDocument.add_value(3, wordCount);
|
|
||||||
currentDocument.set_data(url);
|
|
||||||
indexer.set_document(currentDocument);
|
|
||||||
|
|
||||||
/* Index the title */
|
|
||||||
if (!unaccentedTitle.empty()) {
|
|
||||||
this->indexer.index_text_without_positions(unaccentedTitle, this->getTitleBoostFactor(content.size()));
|
|
||||||
}
|
|
||||||
|
|
||||||
/* Index the keywords */
|
|
||||||
if (!keywords.empty()) {
|
|
||||||
this->indexer.index_text_without_positions(keywords, keywordsBoostFactor);
|
|
||||||
}
|
|
||||||
|
|
||||||
/* Index the content */
|
|
||||||
if (!content.empty()) {
|
|
||||||
this->indexer.index_text_without_positions(content);
|
|
||||||
}
|
|
||||||
|
|
||||||
/* add to the database */
|
|
||||||
this->writableDatabase.add_document(currentDocument);
|
|
||||||
}
|
|
||||||
|
|
||||||
void XapianIndexer::flush() {
|
|
||||||
this->writableDatabase.commit_transaction();
|
|
||||||
this->writableDatabase.begin_transaction(true);
|
|
||||||
}
|
|
||||||
|
|
||||||
void XapianIndexer::indexingPostlude(const string indexPath) {
|
|
||||||
this->flush();
|
|
||||||
this->writableDatabase.commit_transaction();
|
|
||||||
#ifdef _WIN32
|
|
||||||
this->writableDatabase.close();
|
|
||||||
#endif
|
|
||||||
|
|
||||||
/* Compacting the index */
|
|
||||||
Xapian::Compactor compactor;
|
|
||||||
try {
|
|
||||||
Xapian::Database src;
|
|
||||||
src.add_database(Xapian::Database(indexPath+".tmp"));
|
|
||||||
src.compact(indexPath, Xapian::Compactor::FULL | Xapian::DBCOMPACT_SINGLE_FILE, 0, compactor);
|
|
||||||
} catch (const Xapian::Error &error) {
|
|
||||||
cerr << indexPath << ": " << error.get_description() << endl;
|
|
||||||
exit(1);
|
|
||||||
} catch (const char * msg) {
|
|
||||||
cerr << indexPath << ": " << msg << endl;
|
|
||||||
exit(1);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
Loading…
Reference in New Issue