mirror of https://github.com/kiwix/libkiwix.git
529 lines
15 KiB
C++
Executable File
529 lines
15 KiB
C++
Executable File
/*
|
|
* Copyright 2011-2014 Emmanuel Engelhart <kelson@kiwix.org>
|
|
*
|
|
* This program is free software; you can redistribute it and/or modify
|
|
* it under the terms of the GNU General Public License as published by
|
|
* the Free Software Foundation; either version 3 of the License, or
|
|
* any later version.
|
|
*
|
|
* This program is distributed in the hope that it will be useful,
|
|
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
|
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
|
* GNU General Public License for more details.
|
|
*
|
|
* You should have received a copy of the GNU General Public License
|
|
* along with this program; if not, write to the Free Software
|
|
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston,
|
|
* MA 02110-1301, USA.
|
|
*/
|
|
|
|
#include "indexer.h"
|
|
#include "xapian/myhtmlparse.h"
|
|
#include "kiwixlib-resources.h"
|
|
|
|
namespace kiwix {
|
|
|
|
/* Count word */
|
|
unsigned int Indexer::countWords(const string &text) {
|
|
unsigned int numWords = 1;
|
|
unsigned int length = text.size();
|
|
|
|
for(unsigned int i=0; i<length;) {
|
|
while(i<length && text[i] != ' ') {
|
|
i++;
|
|
}
|
|
numWords++;
|
|
i++;
|
|
}
|
|
|
|
return numWords;
|
|
}
|
|
|
|
/* Constructor */
|
|
Indexer::Indexer() :
|
|
keywordsBoostFactor(3),
|
|
verboseFlag(false) {
|
|
|
|
/* Initialize mutex */
|
|
pthread_mutex_init(&threadIdsMutex, NULL);
|
|
pthread_mutex_init(&toParseQueueMutex, NULL);
|
|
pthread_mutex_init(&toIndexQueueMutex, NULL);
|
|
pthread_mutex_init(&articleExtractorRunningMutex, NULL);
|
|
pthread_mutex_init(&articleParserRunningMutex, NULL);
|
|
pthread_mutex_init(&articleIndexerRunningMutex, NULL);
|
|
pthread_mutex_init(&articleCountMutex, NULL);
|
|
pthread_mutex_init(&zimPathMutex, NULL);
|
|
pthread_mutex_init(&zimIdMutex, NULL);
|
|
pthread_mutex_init(&indexPathMutex, NULL);
|
|
pthread_mutex_init(&progressionMutex, NULL);
|
|
pthread_mutex_init(&verboseMutex, NULL);
|
|
}
|
|
|
|
/* Destructor */
|
|
Indexer::~Indexer() {
|
|
}
|
|
|
|
/* Read the stopwords */
|
|
void Indexer::readStopWords(const string languageCode) {
|
|
std::string stopWord;
|
|
std::istringstream file(getResource("stopwords/" + languageCode));
|
|
|
|
this->stopWords.clear();
|
|
|
|
while (getline(file, stopWord, '\n')) {
|
|
this->stopWords.push_back(stopWord);
|
|
}
|
|
|
|
if (this->verboseFlag) {
|
|
std::cout << "Read stop words, lang code:" << languageCode << ", count:" << this->stopWords.size() << std::endl;
|
|
}
|
|
}
|
|
|
|
#pragma mark - Extractor
|
|
|
|
/* Article extractor methods */
|
|
void *Indexer::extractArticles(void *ptr) {
|
|
pthread_setcanceltype(PTHREAD_CANCEL_DEFERRED, NULL);
|
|
kiwix::Indexer *self = (kiwix::Indexer *)ptr;
|
|
|
|
/* Get the number of article to index and the ZIM id */
|
|
kiwix::Reader reader(self->getZimPath());
|
|
unsigned int articleCount = reader.getArticleCount();
|
|
self->setArticleCount(articleCount);
|
|
string zimId = reader.getId();
|
|
self->setZimId(zimId);
|
|
|
|
/* Progression */
|
|
unsigned int readArticleCount = 0;
|
|
unsigned int currentProgression = 0;
|
|
self->setProgression(currentProgression);
|
|
unsigned int newProgress;
|
|
|
|
/* StopWords */
|
|
self->readStopWords(reader.getLanguage());
|
|
|
|
/* Goes trough all articles */
|
|
zim::File *zimHandler = reader.getZimFileHandler();
|
|
unsigned int currentOffset = zimHandler->getNamespaceBeginOffset('A');
|
|
unsigned int lastOffset = zimHandler->getNamespaceEndOffset('A');
|
|
zim::Article currentArticle;
|
|
|
|
while (currentOffset < lastOffset) {
|
|
currentArticle = zimHandler->getArticle(currentOffset);
|
|
|
|
if (!currentArticle.isRedirect()) {
|
|
/* Add articles to the queue */
|
|
indexerToken token;
|
|
token.title = currentArticle.getTitle();
|
|
token.url = currentArticle.getLongUrl();
|
|
token.content = string(currentArticle.getData().data(), currentArticle.getData().size());
|
|
self->pushToParseQueue(token);
|
|
readArticleCount += 1;
|
|
|
|
/* Update progress */
|
|
if (self->progressCallback) {
|
|
self->progressCallback(readArticleCount, articleCount);
|
|
}
|
|
newProgress = (unsigned int)((float)readArticleCount / (float)articleCount * 100);
|
|
if (newProgress != currentProgression) {
|
|
self->setProgression(newProgress);
|
|
}
|
|
}
|
|
|
|
currentOffset += 1;
|
|
|
|
/* Test if the thread should be cancelled */
|
|
pthread_testcancel();
|
|
}
|
|
|
|
self->articleExtractorRunning(false);
|
|
pthread_exit(NULL);
|
|
return NULL;
|
|
}
|
|
|
|
void Indexer::articleExtractorRunning(bool value) {
|
|
pthread_mutex_lock(&articleExtractorRunningMutex);
|
|
this->articleExtractorRunningFlag = value;
|
|
pthread_mutex_unlock(&articleExtractorRunningMutex);
|
|
}
|
|
|
|
bool Indexer::isArticleExtractorRunning() {
|
|
pthread_mutex_lock(&articleExtractorRunningMutex);
|
|
bool retVal = this->articleExtractorRunningFlag;
|
|
pthread_mutex_unlock(&articleExtractorRunningMutex);
|
|
return retVal;
|
|
}
|
|
|
|
#pragma mark - Parser
|
|
|
|
/* Article parser methods */
|
|
void *Indexer::parseArticles(void *ptr) {
|
|
pthread_setcanceltype(PTHREAD_CANCEL_DEFERRED, NULL);
|
|
kiwix::Indexer *self = (kiwix::Indexer *)ptr;
|
|
size_t found;
|
|
indexerToken token;
|
|
|
|
while (self->popFromToParseQueue(token)) {
|
|
MyHtmlParser htmlParser;
|
|
|
|
/* The parser generate a lot of exceptions which should be avoided */
|
|
try {
|
|
htmlParser.parse_html(token.content, "UTF-8", true);
|
|
} catch (...) {
|
|
}
|
|
|
|
/* If content does not have the noindex meta tag */
|
|
/* Seems that the parser generates an exception in such case */
|
|
found = htmlParser.dump.find("NOINDEX");
|
|
|
|
if (found == string::npos) {
|
|
/* Get the accented title */
|
|
token.accentedTitle = (htmlParser.title.empty() ? token.title : htmlParser.title);
|
|
|
|
/* count words */
|
|
stringstream countWordStringStream;
|
|
countWordStringStream << self->countWords(htmlParser.dump);
|
|
token.wordCount = countWordStringStream.str();
|
|
|
|
/* snippet */
|
|
std::string snippet = std::string(htmlParser.dump, 0, 300);
|
|
std::string::size_type last = snippet.find_last_of('.');
|
|
if (last == snippet.npos)
|
|
last = snippet.find_last_of(' ');
|
|
if (last != snippet.npos)
|
|
snippet = snippet.substr(0, last);
|
|
token.snippet = snippet;
|
|
|
|
/* size */
|
|
stringstream sizeStringStream;
|
|
sizeStringStream << token.content.size() / 1024;
|
|
token.size = sizeStringStream.str();
|
|
|
|
/* Remove accent */
|
|
token.title = kiwix::removeAccents(token.accentedTitle);
|
|
token.keywords = kiwix::removeAccents(htmlParser.keywords);
|
|
token.content = kiwix::removeAccents(htmlParser.dump);
|
|
self->pushToIndexQueue(token);
|
|
}
|
|
|
|
/* Test if the thread should be cancelled */
|
|
pthread_testcancel();
|
|
}
|
|
|
|
self->articleParserRunning(false);
|
|
pthread_exit(NULL);
|
|
return NULL;
|
|
}
|
|
|
|
void Indexer::articleParserRunning(bool value) {
|
|
pthread_mutex_lock(&articleParserRunningMutex);
|
|
this->articleParserRunningFlag = value;
|
|
pthread_mutex_unlock(&articleParserRunningMutex);
|
|
}
|
|
|
|
bool Indexer::isArticleParserRunning() {
|
|
pthread_mutex_lock(&articleParserRunningMutex);
|
|
bool retVal = this->articleParserRunningFlag;
|
|
pthread_mutex_unlock(&articleParserRunningMutex);
|
|
return retVal;
|
|
}
|
|
|
|
#pragma mark - Indexer
|
|
|
|
/* Article indexer methods */
|
|
void *Indexer::indexArticles(void *ptr) {
|
|
pthread_setcanceltype(PTHREAD_CANCEL_DEFERRED, NULL);
|
|
kiwix::Indexer *self = (kiwix::Indexer *)ptr;
|
|
unsigned int indexedArticleCount = 0;
|
|
indexerToken token;
|
|
|
|
self->indexingPrelude(self->getIndexPath());
|
|
|
|
while (self->popFromToIndexQueue(token)) {
|
|
self->index(token.url,
|
|
token.accentedTitle,
|
|
token.title,
|
|
token.keywords,
|
|
token.content,
|
|
token.snippet,
|
|
token.size,
|
|
token.wordCount
|
|
);
|
|
|
|
indexedArticleCount += 1;
|
|
|
|
/* Make a hard-disk flush every 10.000 articles */
|
|
if (indexedArticleCount % 5000 == 0) {
|
|
self->flush();
|
|
}
|
|
|
|
/* Test if the thread should be cancelled */
|
|
pthread_testcancel();
|
|
}
|
|
self->indexingPostlude(self->getIndexPath());
|
|
|
|
/* Write content id file */
|
|
string path = appendToDirectory(self->getIndexPath(), "content.id");
|
|
writeTextFile(path, self->getZimId());
|
|
|
|
self->setProgression(100);
|
|
kiwix::sleep(100);
|
|
|
|
self->articleIndexerRunning(false);
|
|
pthread_exit(NULL);
|
|
return NULL;
|
|
}
|
|
|
|
void Indexer::articleIndexerRunning(bool value) {
|
|
pthread_mutex_lock(&articleIndexerRunningMutex);
|
|
this->articleIndexerRunningFlag = value;
|
|
pthread_mutex_unlock(&articleIndexerRunningMutex);
|
|
}
|
|
|
|
bool Indexer::isArticleIndexerRunning() {
|
|
pthread_mutex_lock(&articleIndexerRunningMutex);
|
|
bool retVal = this->articleIndexerRunningFlag;
|
|
pthread_mutex_unlock(&articleIndexerRunningMutex);
|
|
return retVal;
|
|
}
|
|
|
|
#pragma mark - Parse Queue
|
|
|
|
/* ToParseQueue methods */
|
|
bool Indexer::isToParseQueueEmpty() {
|
|
pthread_mutex_lock(&toParseQueueMutex);
|
|
bool retVal = this->toParseQueue.empty();
|
|
pthread_mutex_unlock(&toParseQueueMutex);
|
|
return retVal;
|
|
}
|
|
|
|
void Indexer::pushToParseQueue(indexerToken &token) {
|
|
pthread_mutex_lock(&toParseQueueMutex);
|
|
this->toParseQueue.push(token);
|
|
pthread_mutex_unlock(&toParseQueueMutex);
|
|
kiwix::sleep(int(this->toParseQueue.size() / 200) / 10 * 1000);
|
|
}
|
|
|
|
bool Indexer::popFromToParseQueue(indexerToken &token) {
|
|
while (this->isToParseQueueEmpty() && this->isArticleExtractorRunning()) {
|
|
kiwix::sleep(500);
|
|
if (this->getVerboseFlag()) {
|
|
std::cout << "Waiting... ToParseQueue is empty for now..." << std::endl;
|
|
}
|
|
|
|
pthread_testcancel();
|
|
}
|
|
|
|
if (!this->isToParseQueueEmpty()) {
|
|
pthread_mutex_lock(&toParseQueueMutex);
|
|
token = this->toParseQueue.front();
|
|
this->toParseQueue.pop();
|
|
pthread_mutex_unlock(&toParseQueueMutex);
|
|
} else {
|
|
return false;
|
|
}
|
|
|
|
return true;
|
|
}
|
|
|
|
#pragma mark - Index Queue
|
|
|
|
/* ToIndexQueue methods */
|
|
bool Indexer::isToIndexQueueEmpty() {
|
|
pthread_mutex_lock(&toIndexQueueMutex);
|
|
bool retVal = this->toIndexQueue.empty();
|
|
pthread_mutex_unlock(&toIndexQueueMutex);
|
|
return retVal;
|
|
}
|
|
|
|
void Indexer::pushToIndexQueue(indexerToken &token) {
|
|
pthread_mutex_lock(&toIndexQueueMutex);
|
|
this->toIndexQueue.push(token);
|
|
pthread_mutex_unlock(&toIndexQueueMutex);
|
|
kiwix::sleep(int(this->toIndexQueue.size() / 200) / 10 * 1000);
|
|
}
|
|
|
|
bool Indexer::popFromToIndexQueue(indexerToken &token) {
|
|
while (this->isToIndexQueueEmpty() && this->isArticleParserRunning()) {
|
|
kiwix::sleep(500);
|
|
if (this->getVerboseFlag()) {
|
|
std::cout << "Waiting... ToIndexQueue is empty for now..." << std::endl;
|
|
}
|
|
|
|
pthread_testcancel();
|
|
}
|
|
|
|
if (!this->isToIndexQueueEmpty()) {
|
|
pthread_mutex_lock(&toIndexQueueMutex);
|
|
token = this->toIndexQueue.front();
|
|
this->toIndexQueue.pop();
|
|
pthread_mutex_unlock(&toIndexQueueMutex);
|
|
} else {
|
|
return false;
|
|
}
|
|
|
|
return true;
|
|
}
|
|
|
|
#pragma mark - Properties Getter & Setter
|
|
|
|
/* ZIM & Index methods */
|
|
void Indexer::setZimPath(const string path) {
|
|
pthread_mutex_lock(&zimPathMutex);
|
|
this->zimPath = path;
|
|
pthread_mutex_unlock(&zimPathMutex);
|
|
}
|
|
|
|
string Indexer::getZimPath() {
|
|
pthread_mutex_lock(&zimPathMutex);
|
|
string retVal = this->zimPath;
|
|
pthread_mutex_unlock(&zimPathMutex);
|
|
return retVal;
|
|
}
|
|
|
|
void Indexer::setIndexPath(const string path) {
|
|
pthread_mutex_lock(&indexPathMutex);
|
|
this->indexPath = path;
|
|
pthread_mutex_unlock(&indexPathMutex);
|
|
}
|
|
|
|
string Indexer::getIndexPath() {
|
|
pthread_mutex_lock(&indexPathMutex);
|
|
string retVal = this->indexPath;
|
|
pthread_mutex_unlock(&indexPathMutex);
|
|
return retVal;
|
|
}
|
|
|
|
void Indexer::setArticleCount(const unsigned int articleCount) {
|
|
pthread_mutex_lock(&articleCountMutex);
|
|
this->articleCount = articleCount;
|
|
pthread_mutex_unlock(&articleCountMutex);
|
|
}
|
|
|
|
unsigned int Indexer::getArticleCount() {
|
|
pthread_mutex_lock(&articleCountMutex);
|
|
unsigned int retVal = this->articleCount;
|
|
pthread_mutex_unlock(&articleCountMutex);
|
|
return retVal;
|
|
}
|
|
|
|
void Indexer::setProgression(const unsigned int progression) {
|
|
pthread_mutex_lock(&progressionMutex);
|
|
this->progression = progression;
|
|
pthread_mutex_unlock(&progressionMutex);
|
|
}
|
|
|
|
unsigned int Indexer::getProgression() {
|
|
pthread_mutex_lock(&progressionMutex);
|
|
unsigned int retVal = this->progression;
|
|
pthread_mutex_unlock(&progressionMutex);
|
|
return retVal;
|
|
}
|
|
|
|
void Indexer::setZimId(const string id) {
|
|
pthread_mutex_lock(&zimIdMutex);
|
|
this->zimId = id;
|
|
pthread_mutex_unlock(&zimIdMutex);
|
|
}
|
|
|
|
string Indexer::getZimId() {
|
|
pthread_mutex_lock(&zimIdMutex);
|
|
string retVal = this->zimId;
|
|
pthread_mutex_unlock(&zimIdMutex);
|
|
return retVal;
|
|
}
|
|
|
|
#pragma mark - Status Management
|
|
|
|
/* Manage */
|
|
bool Indexer::start(const string zimPath, const string indexPath, ProgressCallback callback) {
|
|
if (this->getVerboseFlag()) {
|
|
std::cout << "Indexing of '" << zimPath << "' starting..." <<std::endl;
|
|
}
|
|
|
|
if (callback) {
|
|
this->progressCallback = callback;
|
|
}
|
|
|
|
this->setArticleCount(0);
|
|
this->setProgression(0);
|
|
this->setZimPath(zimPath);
|
|
this->setIndexPath(indexPath);
|
|
|
|
pthread_mutex_lock(&threadIdsMutex);
|
|
this->articleExtractorRunning(true);
|
|
pthread_create(&(this->articleExtractor), NULL, Indexer::extractArticles, (void*)this);
|
|
pthread_detach(this->articleExtractor);
|
|
|
|
while(this->isArticleExtractorRunning() && this->getArticleCount() == 0) {
|
|
kiwix::sleep(100);
|
|
}
|
|
|
|
this->articleParserRunning(true);
|
|
pthread_create(&(this->articleParser), NULL, Indexer::parseArticles, (void*)this);
|
|
pthread_detach(this->articleParser);
|
|
|
|
this->articleIndexerRunning(true);
|
|
pthread_create(&(this->articleIndexer), NULL, Indexer::indexArticles, (void*)this);
|
|
pthread_detach(this->articleIndexer);
|
|
pthread_mutex_unlock(&threadIdsMutex);
|
|
|
|
return true;
|
|
}
|
|
|
|
bool Indexer::isRunning() {
|
|
if (this->getVerboseFlag()) {
|
|
std::cout << "isArticleExtractor running: " << (this->isArticleExtractorRunning() ? "yes" : "no") << std::endl;
|
|
std::cout << "isArticleParser running: " << (this->isArticleParserRunning() ? "yes" : "no") << std::endl;
|
|
std::cout << "isArticleIndexer running: " << (this->isArticleIndexerRunning() ? "yes" : "no") << std::endl;
|
|
}
|
|
|
|
return this->isArticleExtractorRunning() || this->isArticleIndexerRunning() || this->isArticleParserRunning();
|
|
}
|
|
|
|
bool Indexer::stop() {
|
|
if (this->isRunning()) {
|
|
bool isArticleExtractorRunning = this->isArticleExtractorRunning();
|
|
bool isArticleIndexerRunning = this->isArticleIndexerRunning();
|
|
bool isArticleParserRunning = this->isArticleParserRunning();
|
|
|
|
pthread_mutex_lock(&threadIdsMutex);
|
|
|
|
if (isArticleIndexerRunning) {
|
|
pthread_cancel(this->articleIndexer);
|
|
this->articleIndexerRunning(false);
|
|
}
|
|
if (isArticleParserRunning) {
|
|
pthread_cancel(this->articleParser);
|
|
this->articleParserRunning(false);
|
|
}
|
|
if (isArticleExtractorRunning) {
|
|
pthread_cancel(this->articleExtractor);
|
|
this->articleExtractorRunning(false);
|
|
}
|
|
|
|
pthread_mutex_unlock(&threadIdsMutex);
|
|
}
|
|
|
|
return true;
|
|
}
|
|
|
|
#pragma mark - verbose
|
|
|
|
/* Manage the verboseFlag */
|
|
void Indexer::setVerboseFlag(const bool value) {
|
|
pthread_mutex_lock(&verboseMutex);
|
|
this->verboseFlag = value;
|
|
pthread_mutex_unlock(&verboseMutex);
|
|
}
|
|
|
|
bool Indexer::getVerboseFlag() {
|
|
bool value;
|
|
pthread_mutex_lock(&verboseMutex);
|
|
value = this->verboseFlag;
|
|
pthread_mutex_unlock(&verboseMutex);
|
|
return value;
|
|
}
|
|
|
|
}
|