+ new indexer code

This commit is contained in:
kelson42 2012-04-06 21:48:43 +00:00
parent 735d9afd3a
commit 62daa9ffe5
6 changed files with 169 additions and 252 deletions

View File

@ -35,20 +35,9 @@ namespace kiwix {
} }
/* Constructor */ /* Constructor */
Indexer::Indexer(const string &zimFilePath) Indexer::Indexer() :
: zimFileHandler(NULL), keywordsBoostFactor(3) {
articleCount(0),
stepSize(0),
keywordsBoostFactor(3) {
this->initialize();
this->setZimFilePath(zimFilePath);
/* Read the stopwords file */
//this->readStopWordsFile("/home/kelson/kiwix/moulinkiwix/stopwords/fr");
}
void Indexer::initialize() {
/* Initialize mutex */ /* Initialize mutex */
pthread_mutex_init(&threadIdsMutex, NULL); pthread_mutex_init(&threadIdsMutex, NULL);
pthread_mutex_init(&toParseQueueMutex, NULL); pthread_mutex_init(&toParseQueueMutex, NULL);
@ -57,44 +46,34 @@ namespace kiwix {
pthread_mutex_init(&articleParserRunningMutex, NULL); pthread_mutex_init(&articleParserRunningMutex, NULL);
pthread_mutex_init(&articleIndexerRunningMutex, NULL); pthread_mutex_init(&articleIndexerRunningMutex, NULL);
pthread_mutex_init(&articleCountMutex, NULL); pthread_mutex_init(&articleCountMutex, NULL);
pthread_mutex_init(&zimPathMutex, NULL);
pthread_mutex_init(&indexPathMutex, NULL);
pthread_mutex_init(&progressionMutex, NULL); pthread_mutex_init(&progressionMutex, NULL);
/* Article count & Progression */ /* Read the stopwords file */
this->setProgression(0); //this->readStopWordsFile("/home/kelson/kiwix/moulinkiwix/stopwords/fr");
}
bool Indexer::setZimFilePath(const string &zimFilePath) {
/* Open the ZIM file */
this->zimFileHandler = new zim::File(zimFilePath);
/* Define a few values */
this->firstArticleOffset = this->zimFileHandler->getNamespaceBeginOffset('A');
this->lastArticleOffset = this->zimFileHandler->getNamespaceEndOffset('A');
this->currentArticleOffset = this->firstArticleOffset;
/* Compute few things */
kiwix::Reader reader(zimFilePath);
this->setArticleCount(reader.getArticleCount());
//this->articleCount = this->zimFileHandler->getNamespaceCount('A');
this->stepSize = (float)this->articleCount / (float)100;
} }
/* Article extractor methods */ /* Article extractor methods */
void *Indexer::extractArticles(void *ptr) { void *Indexer::extractArticles(void *ptr) {
kiwix::Indexer *self = (kiwix::Indexer *)ptr; kiwix::Indexer *self = (kiwix::Indexer *)ptr;
self->articleExtractorRunning(true); self->articleExtractorRunning(true);
unsigned int startOffset = self->zimFileHandler->getNamespaceBeginOffset('A');
unsigned int endOffset = self->zimFileHandler->getNamespaceEndOffset('A'); /* Get the number of article to index */
kiwix::Reader reader(self->getZimPath());
self->setArticleCount(reader.getArticleCount());
/* Goes trough all articles */ /* Goes trough all articles */
unsigned int currentOffset = startOffset; zim::File *zimHandler = reader.getZimFileHandler();
unsigned int currentOffset = zimHandler->getNamespaceBeginOffset('A');;
unsigned int lastOffset = zimHandler->getNamespaceEndOffset('A');;
zim::Article currentArticle; zim::Article currentArticle;
while (currentOffset <= endOffset) { while (currentOffset <= lastOffset) {
/* Redirects are not indexed */ /* Redirects are not indexed */
do { do {
currentArticle = self->zimFileHandler->getArticle(currentOffset++); currentArticle = zimHandler->getArticle(currentOffset++);
} while (currentArticle.isRedirect() && currentOffset != endOffset); } while (currentArticle.isRedirect() && currentOffset != lastOffset);
/* Add articles to the queue */ /* Add articles to the queue */
indexerToken token; indexerToken token;
@ -176,7 +155,8 @@ namespace kiwix {
self->pushToIndexQueue(token); self->pushToIndexQueue(token);
/* Test if the thread should be cancelled */ /* Test if the thread should be cancelled */
pthread_testcancel(); } pthread_testcancel();
}
} }
self->articleParserRunning(false); self->articleParserRunning(false);
@ -201,29 +181,36 @@ namespace kiwix {
void *Indexer::indexArticles(void *ptr) { void *Indexer::indexArticles(void *ptr) {
kiwix::Indexer *self = (kiwix::Indexer *)ptr; kiwix::Indexer *self = (kiwix::Indexer *)ptr;
self->articleIndexerRunning(true); self->articleIndexerRunning(true);
indexerToken token; indexerToken token;
unsigned int stepSize = ((self->getArticleCount() / 100) < 1 ? 1 : (self->getArticleCount() / 100));
unsigned indexedArticleCount = 0; unsigned indexedArticleCount = 0;
unsigned int stepSize = ((self->getArticleCount() / 100) < 1 ? 1 : (self->getArticleCount() / 100));
self->indexingPrelude(self->getIndexPath());
while (self->popFromToIndexQueue(token)) { while (self->popFromToIndexQueue(token)) {
self->indexNextArticle(token.url, self->index(token.url,
token.accentedTitle, token.accentedTitle,
token.title, token.title,
token.keywords, token.keywords,
token.content, token.content,
token.snippet, token.snippet,
token.size, token.size,
token.wordCount token.wordCount
); );
if (++indexedArticleCount % stepSize == 0) { if (++indexedArticleCount % stepSize == 0) {
self->setProgression(self->getProgression() + 1); self->setProgression(self->getProgression() + 1);
} }
if (indexedArticleCount % 10000 == 0) {
self->flush();
}
/* Test if the thread should be cancelled */
pthread_testcancel();
} }
self->setProgression(100); self->setProgression(100);
self->indexNextPercentPost(); self->indexingPostlude();
self->articleIndexerRunning(false); self->articleIndexerRunning(false);
pthread_exit(NULL); pthread_exit(NULL);
return NULL; return NULL;
@ -306,8 +293,34 @@ namespace kiwix {
return true; return true;
} }
/* Article Count & Progression */ /* ZIM & Index methods */
void Indexer::setArticleCount(unsigned int articleCount) { void Indexer::setZimPath(const string path) {
pthread_mutex_lock(&zimPathMutex);
this->zimPath = path;
pthread_mutex_unlock(&zimPathMutex);
}
/* Return a copy of the ZIM file path; the copy is taken while holding zimPathMutex */
string Indexer::getZimPath() {
  pthread_mutex_lock(&zimPathMutex);
  string currentPath(this->zimPath);
  pthread_mutex_unlock(&zimPathMutex);
  return currentPath;
}
void Indexer::setIndexPath(const string path) {
pthread_mutex_lock(&indexPathMutex);
this->indexPath = path;
pthread_mutex_unlock(&indexPathMutex);
}
/* Return a copy of the index path; the copy is taken while holding indexPathMutex */
string Indexer::getIndexPath() {
  pthread_mutex_lock(&indexPathMutex);
  string currentPath(this->indexPath);
  pthread_mutex_unlock(&indexPathMutex);
  return currentPath;
}
void Indexer::setArticleCount(const unsigned int articleCount) {
pthread_mutex_lock(&articleCountMutex); pthread_mutex_lock(&articleCountMutex);
this->articleCount = articleCount; this->articleCount = articleCount;
pthread_mutex_unlock(&articleCountMutex); pthread_mutex_unlock(&articleCountMutex);
@ -320,7 +333,7 @@ namespace kiwix {
return retVal; return retVal;
} }
void Indexer::setProgression(unsigned int progression) { void Indexer::setProgression(const unsigned int progression) {
pthread_mutex_lock(&progressionMutex); pthread_mutex_lock(&progressionMutex);
this->progression = progression; this->progression = progression;
pthread_mutex_unlock(&progressionMutex); pthread_mutex_unlock(&progressionMutex);
@ -333,8 +346,12 @@ namespace kiwix {
return retVal; return retVal;
} }
bool Indexer::start() { /* Manage */
this->indexNextPercentPre(); bool Indexer::start(const string &zimPath, const string &indexPath) {
this->setProgression(0);
this->setZimPath(zimPath);
this->setIndexPath(indexPath);
pthread_mutex_lock(&threadIdsMutex); pthread_mutex_lock(&threadIdsMutex);
pthread_create(&(this->articleExtractor), NULL, Indexer::extractArticles, (void*)this); pthread_create(&(this->articleExtractor), NULL, Indexer::extractArticles, (void*)this);
pthread_detach(this->articleExtractor); pthread_detach(this->articleExtractor);
@ -343,15 +360,7 @@ namespace kiwix {
pthread_create(&(this->articleIndexer), NULL, Indexer::indexArticles, (void*)this); pthread_create(&(this->articleIndexer), NULL, Indexer::indexArticles, (void*)this);
pthread_detach(this->articleIndexer); pthread_detach(this->articleIndexer);
pthread_mutex_unlock(&threadIdsMutex); pthread_mutex_unlock(&threadIdsMutex);
return true;
}
bool Indexer::stop() {
pthread_mutex_lock(&threadIdsMutex);
pthread_cancel(this->articleExtractor);
pthread_cancel(this->articleParser);
pthread_cancel(this->articleIndexer);
pthread_mutex_unlock(&threadIdsMutex);
return true; return true;
} }
@ -359,12 +368,27 @@ namespace kiwix {
return this->isArticleExtractorRunning() || this->isArticleIndexerRunning() || this->isArticleParserRunning(); return this->isArticleExtractorRunning() || this->isArticleIndexerRunning() || this->isArticleParserRunning();
} }
void Indexer::setCurrentArticleOffset(unsigned int offset) { bool Indexer::stop() {
this->currentArticleOffset = offset; if (this->isRunning()) {
} bool isArticleExtractorRunning = this->isArticleExtractorRunning();
bool isArticleIndexerRunning = this->isArticleIndexerRunning();
bool isArticleParserRunning = this->isArticleParserRunning();
unsigned int Indexer::getCurrentArticleOffset() { pthread_mutex_lock(&threadIdsMutex);
return this->currentArticleOffset;
if (isArticleExtractorRunning)
pthread_cancel(this->articleExtractor);
if (isArticleIndexerRunning)
pthread_cancel(this->articleParser);
if (isArticleParserRunning)
pthread_cancel(this->articleIndexer);
pthread_mutex_unlock(&threadIdsMutex);
this->articleIndexerRunning(false);
}
return true;
} }
/* Read the file containing the stopwords */ /* Read the file containing the stopwords */
@ -382,102 +406,4 @@ namespace kiwix {
return true; return true;
} }
/*
 * Index next percent
 *
 * Processes one step of the ZIM file: starting at currentArticleOffset,
 * articles are read until the offset reaches currentArticleOffset + stepSize
 * or passes lastArticleOffset. Every non-redirect article whose parsed text
 * does not contain "NOINDEX" is handed to indexNextArticle(), bracketed by
 * indexNextPercentPre() / indexNextPercentPost().
 *
 * verbose: when true, print "Indexing <url>..." to stdout for each
 *          article that gets indexed.
 *
 * Returns false if no ZIM file handler is set or once currentArticleOffset
 * has passed lastArticleOffset; true while articles remain.
 */
bool Indexer::indexNextPercent(const bool &verbose) {
/* End (exclusive) of this step, computed from the offset at entry */
float thresholdOffset = this->currentArticleOffset + this->stepSize;
size_t found;
/* Check if we can start */
if (this->zimFileHandler == NULL) {
return false;
}
this->indexNextPercentPre();
while(this->currentArticleOffset < thresholdOffset &&
this->currentArticleOffset <= this->lastArticleOffset) {
zim::Article currentArticle;
/* Get next non redirect article */
/* The post-increment inside the condition advances the offset on every
   pass; as it yields the previous value, the loop also ends when that
   previous offset was 0. */
do {
currentArticle = this->zimFileHandler->getArticle(this->currentArticleOffset);
} while (this->currentArticleOffset++ &&
currentArticle.isRedirect() &&
this->currentArticleOffset != this->lastArticleOffset);
/* The loop above can still end on a redirect, hence this check */
if (!currentArticle.isRedirect()) {
/* Index the content */
this->htmlParser.reset();
string content (currentArticle.getData().data(), currentArticle.getData().size());
/* The parser generate a lot of exceptions which should be avoided */
/* Any exception is swallowed; whatever the parser collected so far is used */
try {
this->htmlParser.parse_html(content, "UTF-8", true);
} catch (...) {
}
/* If content does not have the noindex meta tag */
/* Seems that the parser generates an exception in such case */
found = this->htmlParser.dump.find("NOINDEX");
if (found == string::npos) {
string url = currentArticle.getLongUrl();
/* Debug output */
if (verbose) {
std::cout << "Indexing " << url << "..." << std::endl;
}
/* Get the title */
/* Prefer the title found by the HTML parser, fall back to the article title */
string accentedTitle = this->htmlParser.title;
if (accentedTitle.empty()) {
accentedTitle = currentArticle.getTitle();
}
/* count words */
stringstream countWordStringStream;
countWordStringStream << countWords(this->htmlParser.dump);
const std::string wordCountString = countWordStringStream.str();
/* snippet */
/* First 300 characters of the parsed text, cut back to the last '.'
   or, if there is none, to the last ' ' */
std::string snippet = std::string(this->htmlParser.dump, 0, 300);
std::string::size_type last = snippet.find_last_of('.');
if (last == snippet.npos)
last = snippet.find_last_of(' ');
if (last != snippet.npos)
snippet = snippet.substr(0, last);
/* size */
/* Raw content size divided by 1024 (integer division), as a string */
stringstream sizeStringStream;
sizeStringStream << content.size() / 1024;
const std::string size = sizeStringStream.str();
/* Title, keywords and content are passed with accents removed; the
   accented title and the snippet are passed unchanged */
this->indexNextArticle(url,
accentedTitle,
removeAccents(this->htmlParser.title),
removeAccents(this->htmlParser.keywords),
removeAccents(this->htmlParser.dump),
snippet,
size,
wordCountString
);
}
}
}
this->indexNextPercentPost();
/* increment the offset and set returned value */
if (this->currentArticleOffset <= this->lastArticleOffset) {
return true;
} else {
// commented as it never returns on OSX.
//this->stopIndexing();
return false;
}
}
} }

View File

@ -54,17 +54,14 @@ namespace kiwix {
class Indexer { class Indexer {
public: public:
Indexer(const string &zimFilePath); Indexer();
bool indexNextPercent(const bool &verbose = false); bool start(const string &zimPath, const string &indexPath);
bool setZimFilePath(const string &zimFilePath);
bool start();
bool stop(); bool stop();
bool isRunning(); bool isRunning();
unsigned int getProgression(); unsigned int getProgression();
private: private:
pthread_mutex_t threadIdsMutex; pthread_mutex_t threadIdsMutex;
void initialize();
/* Article extraction */ /* Article extraction */
pthread_t articleExtractor; pthread_t articleExtractor;
@ -107,46 +104,47 @@ namespace kiwix {
/* Article Count & Progression */ /* Article Count & Progression */
unsigned int articleCount; unsigned int articleCount;
pthread_mutex_t articleCountMutex; pthread_mutex_t articleCountMutex;
void setArticleCount(unsigned int articleCount); void setArticleCount(const unsigned int articleCount);
unsigned int getArticleCount(); unsigned int getArticleCount();
/* Progression */
unsigned int progression; unsigned int progression;
pthread_mutex_t progressionMutex; pthread_mutex_t progressionMutex;
void setProgression(unsigned int progression); void setProgression(const unsigned int progression);
/* getProgression() is public */
/* ZIM path */
pthread_mutex_t zimPathMutex;
string zimPath;
void setZimPath(const string path);
string getZimPath();
/* Index path */
pthread_mutex_t indexPathMutex;
string indexPath;
void setIndexPath(const string path);
string getIndexPath();
protected: protected:
virtual void indexNextPercentPre() = 0; virtual void indexingPrelude(const string &indexPath) = 0;
virtual void indexNextArticle(const string &url, virtual void index(const string &url,
const string &title, const string &title,
const string &unaccentedTitle, const string &unaccentedTitle,
const string &keywords, const string &keywords,
const string &content, const string &content,
const string &snippet, const string &snippet,
const string &size, const string &size,
const string &wordCount) = 0; const string &wordCount) = 0;
virtual void indexNextPercentPost() = 0; virtual void flush() = 0;
virtual void stopIndexing() = 0; virtual void indexingPostlude() = 0;
/* Article offset */ /* Others */
void setCurrentArticleOffset(unsigned int offset);
unsigned int getCurrentArticleOffset();
/* ZIM file handling */
zim::File* zimFileHandler;
zim::size_type firstArticleOffset;
zim::size_type lastArticleOffset;
zim::size_type currentArticleOffset;
/* HTML parsing */
MyHtmlParser htmlParser;
unsigned int countWords(const string &text); unsigned int countWords(const string &text);
/* Stopwords */ /* Stopwords */
bool readStopWordsFile(const string path); bool readStopWordsFile(const string path);
std::vector<std::string> stopWords; std::vector<std::string> stopWords;
/* Others */
float stepSize;
/* Boost factor */ /* Boost factor */
unsigned int keywordsBoostFactor; unsigned int keywordsBoostFactor;
inline unsigned int getTitleBoostFactor(const unsigned int contentLength) { inline unsigned int getTitleBoostFactor(const unsigned int contentLength) {

View File

@ -63,6 +63,10 @@ namespace kiwix {
} }
} }
/* Give access to the zim::File pointer stored in this reader */
zim::File* Reader::getZimFileHandler() {
  zim::File *handler = zimFileHandler;
  return handler;
}
/* Reset the cursor for GetNextArticle() */ /* Reset the cursor for GetNextArticle() */
void Reader::reset() { void Reader::reset() {
this->currentArticleOffset = this->firstArticleOffset; this->currentArticleOffset = this->firstArticleOffset;

View File

@ -63,6 +63,7 @@ namespace kiwix {
bool canCheckIntegrity(); bool canCheckIntegrity();
bool isCorrupted(); bool isCorrupted();
bool parseUrl(const string &urlStr, char *ns, string &titleStr); bool parseUrl(const string &urlStr, char *ns, string &titleStr);
zim::File* getZimFileHandler();
protected: protected:
zim::File* zimFileHandler; zim::File* zimFileHandler;

View File

@ -22,20 +22,15 @@
namespace kiwix { namespace kiwix {
/* Constructor */ /* Constructor */
XapianIndexer::XapianIndexer(const string &zimFilePath, const string &xapianDirectoryPath) : XapianIndexer::XapianIndexer() {
Indexer(zimFilePath) {
/* Open the Xapian directory */
this->writableDatabase = new Xapian::WritableDatabase(xapianDirectoryPath,
Xapian::DB_CREATE_OR_OVERWRITE);
/* Stemming */ /* Stemming */
/* /*
stemmer = Xapian::Stem("french"); stemmer = Xapian::Stem("french");
indexer.set_stemmer(stemmer); indexer.set_stemmer(stemmer);
*/ */
/* Stop words /* Stop words */
/*
std::vector<std::string>::const_iterator stopWordsIterator = this->stopWords.begin(); std::vector<std::string>::const_iterator stopWordsIterator = this->stopWords.begin();
this->stopper.add("ceci"); this->stopper.add("ceci");
while (stopWordsIterator != this->stopWords.end()) { while (stopWordsIterator != this->stopWords.end()) {
@ -46,18 +41,19 @@ namespace kiwix {
*/ */
} }
void XapianIndexer::indexNextPercentPre() { void XapianIndexer::indexingPrelude(const string &indexPath) {
this->writableDatabase->begin_transaction(true); this->writableDatabase = Xapian::WritableDatabase(indexPath, Xapian::DB_CREATE_OR_OVERWRITE);
this->writableDatabase.begin_transaction(true);
} }
void XapianIndexer::indexNextArticle(const string &url, void XapianIndexer::index(const string &url,
const string &title, const string &title,
const string &unaccentedTitle, const string &unaccentedTitle,
const string &keywords, const string &keywords,
const string &content, const string &content,
const string &snippet, const string &snippet,
const string &size, const string &size,
const string &wordCount) { const string &wordCount) {
/* Put the data in the document */ /* Put the data in the document */
Xapian::Document currentDocument; Xapian::Document currentDocument;
@ -85,26 +81,17 @@ namespace kiwix {
} }
/* add to the database */ /* add to the database */
this->writableDatabase->add_document(currentDocument); this->writableDatabase.add_document(currentDocument);
} }
void XapianIndexer::indexNextPercentPost() { void XapianIndexer::flush() {
/* Flush and close Xapian transaction*/ this->writableDatabase.commit_transaction();
this->writableDatabase->commit_transaction(); this->writableDatabase.begin_transaction(true);
} }
/* Stop indexing. TODO: using it crashs the soft under windows. Have to do it in indexNextPercent() */ void XapianIndexer::indexingPostlude() {
void XapianIndexer::stopIndexing() { this->flush();
/* Delete the zimFileHandler */ this->writableDatabase.commit_transaction();
if (this->zimFileHandler != NULL) { this->writableDatabase.commit();
delete this->zimFileHandler;
this->zimFileHandler = NULL;
}
/* Delete the Xapian writableDatabase */
if (this->writableDatabase != NULL) {
delete this->writableDatabase;
this->writableDatabase = NULL;
}
} }
} }

View File

@ -30,22 +30,23 @@ namespace kiwix {
class XapianIndexer : public Indexer { class XapianIndexer : public Indexer {
public: public:
XapianIndexer(const string &zimFilePath, const string &xapianDirectoryPath); XapianIndexer();
~XapianIndexer();
protected: protected:
void indexNextPercentPre(); void indexingPrelude(const string &indexPath);
void indexNextArticle(const string &url, void index(const string &url,
const string &title, const string &title,
const string &unaccentedTitle, const string &unaccentedTitle,
const string &keywords, const string &keywords,
const string &content, const string &content,
const string &snippet, const string &snippet,
const string &size, const string &size,
const string &wordCount); const string &wordCount);
void indexNextPercentPost(); void flush();
void stopIndexing(); void indexingPostlude();
Xapian::WritableDatabase *writableDatabase; Xapian::WritableDatabase writableDatabase;
Xapian::Stem stemmer; Xapian::Stem stemmer;
Xapian::SimpleStopper stopper; Xapian::SimpleStopper stopper;
Xapian::TermGenerator indexer; Xapian::TermGenerator indexer;