From b8d950c1a073f8fcd629ebea8f22d618127c91a3 Mon Sep 17 00:00:00 2001 From: Matthieu Gautier Date: Tue, 28 Mar 2017 09:59:39 +0200 Subject: [PATCH] Use the stop words stored in the database to configure the queryparser. To search the Xapian database properly, we need to use the same stop words as the ones used during indexing. --- include/indexer.h | 4 ---- include/xapianSearcher.h | 2 ++ src/indexer.cpp | 15 --------------- src/xapianSearcher.cpp | 11 +++++++++++ 4 files changed, 13 insertions(+), 19 deletions(-) mode change 100755 => 100644 include/indexer.h diff --git a/include/indexer.h b/include/indexer.h old mode 100755 new mode 100644 index a4f160638..9435819b6 --- a/include/indexer.h +++ b/include/indexer.h @@ -78,10 +78,6 @@ namespace kiwix { virtual void flush() = 0; virtual void indexingPostlude(const string indexPath) = 0; - /* Stop words */ - std::vector stopWords; - void readStopWords(const string languageCode); - /* Others */ unsigned int countWords(const string &text); diff --git a/include/xapianSearcher.h b/include/xapianSearcher.h index df5df276a..e11c03e68 100644 --- a/include/xapianSearcher.h +++ b/include/xapianSearcher.h @@ -75,8 +75,10 @@ namespace kiwix { Reader* reader; Xapian::Database readableDatabase; std::string language; + std::string stopwords; Xapian::QueryParser queryParser; Xapian::Stem stemmer; + Xapian::SimpleStopper stopper; Xapian::MSet results; Xapian::MSetIterator current_result; std::map valuesmap; diff --git a/src/indexer.cpp b/src/indexer.cpp index f265aa720..7e230bc9a 100755 --- a/src/indexer.cpp +++ b/src/indexer.cpp @@ -62,22 +62,7 @@ namespace kiwix { /* Destructor */ Indexer::~Indexer() { } - - /* Read the stopwords */ - void Indexer::readStopWords(const string languageCode) { - std::string stopWord; - std::istringstream file(getResource("stopwords/" + languageCode)); - - this->stopWords.clear(); - - while (getline(file, stopWord, '\n')) { - this->stopWords.push_back(stopWord); - } - if (this->verboseFlag) { 
- std::cout << "Read stop words, lang code:" << languageCode << ", count:" << this->stopWords.size() << std::endl; - } - } #pragma mark - Extractor diff --git a/src/xapianSearcher.cpp b/src/xapianSearcher.cpp index f636336d0..7e0fab28f 100644 --- a/src/xapianSearcher.cpp +++ b/src/xapianSearcher.cpp @@ -69,6 +69,7 @@ std::map read_valuesmap(const std::string &s) { } this->valuesmap = read_valuesmap(this->readableDatabase.get_metadata("valuesmap")); this->language = this->readableDatabase.get_metadata("language"); + this->stopwords = this->readableDatabase.get_metadata("stopwords"); setup_queryParser(); } @@ -95,6 +96,16 @@ std::map read_valuesmap(const std::string &s) { std::cout << "No steemming for language '" << languageLocale.getLanguage() << "'" << std::endl; } } + + if ( ! stopwords.empty() ) + { + std::string stopWord; + std::istringstream file(this->stopwords); + while (std::getline(file, stopWord, '\n')) { + this->stopper.add(stopWord); + } + queryParser.set_stopper(&(this->stopper)); + } } /* Search strings in the database */