libkiwix/src/common/kiwix/indexer.cpp

192 lines
5.5 KiB
C++

#include "indexer.h"
namespace kiwix {
/* Count word */
unsigned int Indexer::countWords(const string &text) {
unsigned int numWords = 1;
for(unsigned int i=0; i<text.size();) {
while(i<text.size() && text[i] != ' ') {
i++;
}
numWords++;
i++;
}
return numWords;
}
/* Constructor */
Indexer::Indexer(const string &zimFilePath, const string &xapianDirectoryPath)
: zimFileHandler(NULL),
articleCount(0),
stepSize(0) {
/* Open the ZIM file */
this->zimFileHandler = new zim::File(zimFilePath);
/* Open the Xapian directory */
this->writableDatabase = new Xapian::WritableDatabase(xapianDirectoryPath,
Xapian::DB_CREATE_OR_OVERWRITE);
/* Stemming */
/*
stemmer = Xapian::Stem("french");
indexer.set_stemmer(stemmer);
*/
/* Read the stopwords file */
/*
this->readStopWordsFile("/home/kelson/kiwix/moulinkiwix/stopwords/fr");
std::vector<std::string>::const_iterator stopWordsIterator = this->stopWords.begin();
this->stopper.add("ceci");
while (stopWordsIterator != this->stopWords.end()) {
this->stopper.add(*stopWordsIterator);
stopWordsIterator++;
}
indexer.set_stopper(&(this->stopper));
*/
/* Prepare the indexation */
this->prepareIndexing();
}
/* Destructor */
Indexer::~Indexer() {
this->stopIndexing();
}
/* Start indexing */
void Indexer::prepareIndexing() {
/* Define a few values */
this->firstArticleOffset = this->zimFileHandler->getNamespaceBeginOffset('A');
this->lastArticleOffset = this->zimFileHandler->getNamespaceEndOffset('A');
this->currentArticleOffset = this->firstArticleOffset;
/* Compute few things */
this->articleCount = this->zimFileHandler->getNamespaceCount('A');
this->stepSize = (float)this->articleCount / (float)100;
}
/* Index next percent */
bool Indexer::indexNextPercent(const bool &verbose) {
float thresholdOffset = this->currentArticleOffset + this->stepSize;
size_t found;
/* Check if we can start */
if (this->zimFileHandler == NULL || this->writableDatabase == NULL) {
return false;
}
/* Begin the Xapian transation */
this->writableDatabase->begin_transaction(true);
while(this->currentArticleOffset < thresholdOffset &&
this->currentArticleOffset < this->lastArticleOffset) {
zim::Article currentArticle;
Xapian::Document currentDocument;
/* Get next non redirect article */
do {
currentArticle = this->zimFileHandler->getArticle(this->currentArticleOffset);
} while (this->currentArticleOffset++ &&
currentArticle.isRedirect() &&
this->currentArticleOffset != this->lastArticleOffset);
if (!currentArticle.isRedirect()) {
/* Index the content */
this->htmlParser.reset();
string content (currentArticle.getData().data(), currentArticle.getData().size());
/* The parser generate a lot of exceptions which should be avoided */
try {
this->htmlParser.parse_html(content, "UTF-8", true);
} catch (...) {
}
/* If content does not have the noindex meta tag */
/* Seems that the parser generates an exception in such case */
found = this->htmlParser.dump.find("NOINDEX");
if (found == string::npos) {
/* Put the data in the document */
currentDocument.clear_values();
currentDocument.add_value(0, this->htmlParser.title);
currentDocument.set_data(currentArticle.getLongUrl().c_str());
indexer.set_document(currentDocument);
/* Debug output */
if (verbose) {
std::cout << "Indexing " << currentArticle.getLongUrl() << "..." << std::endl;
}
/* Index the title */
if (!this->htmlParser.title.empty()) {
indexer.index_text_without_positions(removeAccents(this->htmlParser.title),
((this->htmlParser.dump.size() / 100) + 1) /
countWords(this->htmlParser.title) );
}
/* Index the keywords */
if (!this->htmlParser.keywords.empty()) {
indexer.index_text_without_positions(removeAccents(this->htmlParser.keywords), 3);
}
/* Index the content */
if (!this->htmlParser.dump.empty()) {
indexer.index_text_without_positions(removeAccents(this->htmlParser.dump));
}
/* add to the database */
this->writableDatabase->add_document(currentDocument);
}
}
}
/* Flush and close Xapian transaction*/
this->writableDatabase->commit_transaction();
/* increment the offset and set returned value */
if (this->currentArticleOffset < this->lastArticleOffset) {
this->currentArticleOffset++;
return true;
} else {
this->stopIndexing();
return false;
}
}
/* Stop indexing. TODO: using it crashs the soft under windows. Have to do it in indexNextPercent() */
void Indexer::stopIndexing() {
/* Delete the zimFileHandler */
if (this->zimFileHandler != NULL) {
delete this->zimFileHandler;
this->zimFileHandler = NULL;
}
/* Delete the Xapian writableDatabase */
if (this->writableDatabase != NULL) {
delete this->writableDatabase;
this->writableDatabase = NULL;
}
}
/* Read the file containing the stopwords */
bool Indexer::readStopWordsFile(const string path) {
std::string stopWord;
std::ifstream file(path.c_str(), std::ios_base::in);
this->stopWords.clear();
while (getline(file, stopWord, '\n')) {
this->stopWords.push_back(stopWord);
}
std::cout << "Read " << this->stopWords.size() << " lines.\n";
return true;
}
}