libkiwix/src/common/kiwix/indexer.cpp

198 lines
5.4 KiB
C++

/*
* Copyright 2011 Emmanuel Engelhart <kelson@kiwix.org>
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 3 of the License, or
* any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston,
* MA 02110-1301, USA.
*/
#include "indexer.h"
namespace kiwix {
/* Count word */
unsigned int Indexer::countWords(const string &text) {
unsigned int numWords = 1;
for(unsigned int i=0; i<text.size();) {
while(i<text.size() && text[i] != ' ') {
i++;
}
numWords++;
i++;
}
return numWords;
}
/*
 * Constructor: give the counters their default values, then attach the
 * indexer to the ZIM file located at zimFilePath.
 */
Indexer::Indexer(const string &zimFilePath)
  : zimFileHandler(NULL),
    articleCount(0),
    stepSize(0),
    keywordsBoostFactor(3) {
  setZimFilePath(zimFilePath);

  /* Reading the stopwords file is currently disabled */
  //this->readStopWordsFile("/home/kelson/kiwix/moulinkiwix/stopwords/fr");
}
bool Indexer::setZimFilePath(const string &zimfilePath) {
/* Open the ZIM file */
this->zimFileHandler = new zim::File(zimFilePath);
/* Define a few values */
this->firstArticleOffset = this->zimFileHandler->getNamespaceBeginOffset('A');
this->lastArticleOffset = this->zimFileHandler->getNamespaceEndOffset('A');
this->currentArticleOffset = this->firstArticleOffset;
/* Compute few things */
this->articleCount = this->zimFileHandler->getNamespaceCount('A');
this->stepSize = (float)this->articleCount / (float)100;
}
/*
 * Placeholder: not implemented yet.
 *
 * Does nothing and returns false. The explicit return replaces an empty
 * body that flowed off the end of a non-void function, which is undefined
 * behaviour.
 */
bool Indexer::start() {
  /* TODO: implement */
  return false;
}
/*
 * Placeholder: not implemented yet.
 *
 * Does nothing and returns false. The explicit return replaces an empty
 * body that flowed off the end of a non-void function, which is undefined
 * behaviour.
 */
bool Indexer::stop() {
  /* TODO: implement */
  return false;
}
/*
 * Placeholder: not implemented yet.
 *
 * Always returns 0. The explicit return replaces an empty body that flowed
 * off the end of a non-void function, which is undefined behaviour.
 */
unsigned Indexer::isRunning() {
  /* TODO: implement */
  return 0;
}
void Indexer::setCurrentArticleOffset(unsigned int offset) {
this->currentArticleOffset = offset;
}
unsigned int Indexer::getCurrentArticleOffset() {
return this->currentArticleOffset;
}
/*
 * Placeholder: not implemented yet.
 *
 * Always returns 0. The explicit return replaces an empty body that flowed
 * off the end of a non-void function, which is undefined behaviour.
 */
unsigned int Indexer::getProgression() {
  /* TODO: implement */
  return 0;
}
/*
 * Read the file containing the stopwords, one stopword per line.
 *
 * path: location of the stopwords file.
 *
 * The current stopword list is always emptied first. Returns false if the
 * file cannot be opened (the list is then left empty); otherwise every
 * line is appended to the list, the line count is printed on the standard
 * output and true is returned.
 */
bool Indexer::readStopWordsFile(const string path) {
  std::ifstream file(path.c_str(), std::ios_base::in);

  /* The previous list is dropped even when the new file is unusable */
  this->stopWords.clear();

  /* Report an unreadable file instead of pretending zero lines were read */
  if (!file.is_open()) {
    return false;
  }

  std::string stopWord;
  while (std::getline(file, stopWord)) {
    this->stopWords.push_back(stopWord);
  }

  std::cout << "Read " << this->stopWords.size() << " lines.\n";
  return true;
}
/* Index next percent */
bool Indexer::indexNextPercent(const bool &verbose) {
float thresholdOffset = this->currentArticleOffset + this->stepSize;
size_t found;
/* Check if we can start */
if (this->zimFileHandler == NULL) {
return false;
}
this->indexNextPercentPre();
while(this->currentArticleOffset < thresholdOffset &&
this->currentArticleOffset <= this->lastArticleOffset) {
zim::Article currentArticle;
/* Get next non redirect article */
do {
currentArticle = this->zimFileHandler->getArticle(this->currentArticleOffset);
} while (this->currentArticleOffset++ &&
currentArticle.isRedirect() &&
this->currentArticleOffset != this->lastArticleOffset);
if (!currentArticle.isRedirect()) {
/* Index the content */
this->htmlParser.reset();
string content (currentArticle.getData().data(), currentArticle.getData().size());
/* The parser generate a lot of exceptions which should be avoided */
try {
this->htmlParser.parse_html(content, "UTF-8", true);
} catch (...) {
}
/* If content does not have the noindex meta tag */
/* Seems that the parser generates an exception in such case */
found = this->htmlParser.dump.find("NOINDEX");
if (found == string::npos) {
string url = currentArticle.getLongUrl();
/* Debug output */
if (verbose) {
std::cout << "Indexing " << url << "..." << std::endl;
}
/* Get the title */
string accentedTitle = this->htmlParser.title;
if (accentedTitle.empty()) {
accentedTitle = currentArticle.getTitle();
}
/* count words */
stringstream countWordStringStream;
countWordStringStream << countWords(this->htmlParser.dump);
const std::string wordCountString = countWordStringStream.str();
/* snippet */
std::string snippet = std::string(this->htmlParser.dump, 0, 300);
std::string::size_type last = snippet.find_last_of('.');
if (last == snippet.npos)
last = snippet.find_last_of(' ');
if (last != snippet.npos)
snippet = snippet.substr(0, last);
/* size */
stringstream sizeStringStream;
sizeStringStream << content.size() / 1024;
const std::string size = sizeStringStream.str();
this->indexNextArticle(url,
accentedTitle,
removeAccents(this->htmlParser.title),
removeAccents(this->htmlParser.keywords),
removeAccents(this->htmlParser.dump),
snippet,
size,
wordCountString
);
}
}
}
this->indexNextPercentPost();
/* increment the offset and set returned value */
if (this->currentArticleOffset <= this->lastArticleOffset) {
return true;
} else {
// commented as it never returns on OSX.
//this->stopIndexing();
return false;
}
}
}