diff --git a/src/common/kiwix/indexer.cpp b/src/common/kiwix/indexer.cpp
index 9e4268a7d..b1c477148 100644
--- a/src/common/kiwix/indexer.cpp
+++ b/src/common/kiwix/indexer.cpp
@@ -18,7 +18,6 @@ namespace kiwix {
   /* Constructor */
   Indexer::Indexer(const string &zimFilePath, const string &xapianDirectoryPath) :
     zimFileHandler(NULL),
-    stemmer(Xapian::Stem("english")),
     articleCount(0),
     stepSize(0) {
 
@@ -28,7 +27,25 @@ namespace kiwix {
     /* Open the Xapian directory */
     this->writableDatabase = new Xapian::WritableDatabase(xapianDirectoryPath,
                                                           Xapian::DB_CREATE_OR_OVERWRITE);
-
+
+    /* Stemming */
+    /*
+    stemmer = Xapian::Stem("french");
+    indexer.set_stemmer(stemmer);
+    */
+
+    /* Read the stopwords file */
+    /*
+    this->readStopWordsFile("/home/kelson/kiwix/moulinkiwix/stopwords/fr");
+    std::vector<std::string>::const_iterator stopWordsIterator = this->stopWords.begin();
+    this->stopper.add("ceci");
+    while (stopWordsIterator != this->stopWords.end()) {
+      this->stopper.add(*stopWordsIterator);
+      stopWordsIterator++;
+    }
+    indexer.set_stopper(&(this->stopper));
+    */
+
     /* Prepare the indexation */
     this->prepareIndexing();
   }
@@ -89,10 +106,6 @@ namespace kiwix {
 
       if (found == string::npos) {
 
-        /* Set the stemmer */
-        /* TODO, autodetect the language */
-        //indexer.set_stemmer(stemmer);
-
         /* Put the data in the document */
         Xapian::Document document;
         document.add_value(0, this->htmlParser.title);
@@ -152,5 +165,27 @@ namespace kiwix {
       this->writableDatabase = NULL;
     }
   }
+
+  /* Read the file containing the stopwords, one per line, into this->stopWords.
+     Returns true if the file could be opened, false otherwise. */
+  bool Indexer::readStopWordsFile(const string path) {
+    std::string stopWord;
+    std::ifstream file(path.c_str(), std::ios_base::in);
+
+    this->stopWords.clear();
+
+    /* Report an unreadable file to the caller; the list stays empty */
+    if (!file.is_open()) {
+      return false;
+    }
+
+    while (getline(file, stopWord, '\n')) {
+      this->stopWords.push_back(stopWord);
+    }
+
+    std::cout << "Read " << this->stopWords.size() << " lines.\n";
+
+    return true;
+  }
 }
 
diff --git a/src/common/kiwix/indexer.h b/src/common/kiwix/indexer.h
index 901c14eb9..6f9ab5636 100644
--- a/src/common/kiwix/indexer.h
+++ b/src/common/kiwix/indexer.h
@@ -1,6 +1,11 @@
 #ifndef KIWIX_INDEXER_H
 #define KIWIX_INDEXER_H
 
+#include
+#include
+#include
+#include
+
 #include
 #include
 #include
@@ -11,7 +16,7 @@ using namespace std;
 
 namespace kiwix {
 
-  
+
   class Indexer {
 
   public:
@@ -25,6 +30,8 @@ namespace kiwix {
     void stopIndexing();
     unsigned int countWords(const string &text);
 
+    bool readStopWordsFile(const string path);
+
     unsigned int articleCount;
     float stepSize;
 
@@ -36,8 +43,10 @@ namespace kiwix {
 
     Xapian::WritableDatabase *writableDatabase;
     Xapian::Stem stemmer;
+    Xapian::SimpleStopper stopper;
     Xapian::TermGenerator indexer;
-    
+
+    std::vector<std::string> stopWords;
     MyHtmlParser htmlParser;
   };
 