From 998db0eb2b446da46fb58951fdacf07544a37142 Mon Sep 17 00:00:00 2001 From: Matthieu Gautier Date: Mon, 27 Mar 2017 18:33:42 +0200 Subject: [PATCH 1/2] Use the language stored in the database to configure the queryparser. To properly search in the xapian database, we need a stemmer that uses the same language as the one used during indexing. --- include/xapianSearcher.h | 3 +++ src/xapianSearcher.cpp | 28 +++++++++++++++++++++++++--- 2 files changed, 28 insertions(+), 3 deletions(-) diff --git a/include/xapianSearcher.h b/include/xapianSearcher.h index 8b27eb229..df5df276a 100644 --- a/include/xapianSearcher.h +++ b/include/xapianSearcher.h @@ -70,9 +70,12 @@ namespace kiwix { protected: void closeIndex(); void openIndex(const string &xapianDirectoryPath); + void setup_queryParser(); Reader* reader; Xapian::Database readableDatabase; + std::string language; + Xapian::QueryParser queryParser; Xapian::Stem stemmer; Xapian::MSet results; Xapian::MSetIterator current_result; diff --git a/src/xapianSearcher.cpp b/src/xapianSearcher.cpp index 22fd06e17..f636336d0 100644 --- a/src/xapianSearcher.cpp +++ b/src/xapianSearcher.cpp @@ -25,6 +25,7 @@ #include #include #include +#include #include @@ -46,8 +47,8 @@ std::map read_valuesmap(const std::string &s) { /* Constructor */ XapianSearcher::XapianSearcher(const string &xapianDirectoryPath, Reader* reader) : Searcher(), - reader(reader), - stemmer(Xapian::Stem("english")) { + reader(reader) + { this->openIndex(xapianDirectoryPath); } @@ -67,18 +68,39 @@ std::map read_valuesmap(const std::string &s) { this->readableDatabase = Xapian::Database(directoryPath); } this->valuesmap = read_valuesmap(this->readableDatabase.get_metadata("valuesmap")); + this->language = this->readableDatabase.get_metadata("language"); + setup_queryParser(); } /* Close Xapian writable database */ void XapianSearcher::closeIndex() { return; } + + void XapianSearcher::setup_queryParser() + { + queryParser.set_database(readableDatabase); + if ( ! 
language.empty() ) + { + /* Build ICU Locale object to retrieve ISO-639 language code (from + ISO-639-3) */ + icu::Locale languageLocale(language.c_str()); + + /* Configuring language-based stemming */ + try { + stemmer = Xapian::Stem(languageLocale.getLanguage()); + queryParser.set_stemmer(stemmer); + queryParser.set_stemming_strategy(Xapian::QueryParser::STEM_ALL); + } catch (...) { + std::cout << "No steemming for language '" << languageLocale.getLanguage() << "'" << std::endl; + } + } + } /* Search strings in the database */ void XapianSearcher::searchInIndex(string &search, const unsigned int resultStart, const unsigned int resultEnd, const bool verbose) { /* Create the query */ - Xapian::QueryParser queryParser; Xapian::Query query = queryParser.parse_query(search); /* Create the enquire object */ From b8d950c1a073f8fcd629ebea8f22d618127c91a3 Mon Sep 17 00:00:00 2001 From: Matthieu Gautier Date: Tue, 28 Mar 2017 09:59:39 +0200 Subject: [PATCH 2/2] Use the stop words stored in the database to configure the queryparser. To properly search in the xapian database, we need to use the same stop words as the ones used during indexing. 
--- include/indexer.h | 4 ---- include/xapianSearcher.h | 2 ++ src/indexer.cpp | 15 --------------- src/xapianSearcher.cpp | 11 +++++++++++ 4 files changed, 13 insertions(+), 19 deletions(-) mode change 100755 => 100644 include/indexer.h diff --git a/include/indexer.h b/include/indexer.h old mode 100755 new mode 100644 index a4f160638..9435819b6 --- a/include/indexer.h +++ b/include/indexer.h @@ -78,10 +78,6 @@ namespace kiwix { virtual void flush() = 0; virtual void indexingPostlude(const string indexPath) = 0; - /* Stop words */ - std::vector stopWords; - void readStopWords(const string languageCode); - /* Others */ unsigned int countWords(const string &text); diff --git a/include/xapianSearcher.h b/include/xapianSearcher.h index df5df276a..e11c03e68 100644 --- a/include/xapianSearcher.h +++ b/include/xapianSearcher.h @@ -75,8 +75,10 @@ namespace kiwix { Reader* reader; Xapian::Database readableDatabase; std::string language; + std::string stopwords; Xapian::QueryParser queryParser; Xapian::Stem stemmer; + Xapian::SimpleStopper stopper; Xapian::MSet results; Xapian::MSetIterator current_result; std::map valuesmap; diff --git a/src/indexer.cpp b/src/indexer.cpp index f265aa720..7e230bc9a 100755 --- a/src/indexer.cpp +++ b/src/indexer.cpp @@ -62,22 +62,7 @@ namespace kiwix { /* Destructor */ Indexer::~Indexer() { } - - /* Read the stopwords */ - void Indexer::readStopWords(const string languageCode) { - std::string stopWord; - std::istringstream file(getResource("stopwords/" + languageCode)); - - this->stopWords.clear(); - - while (getline(file, stopWord, '\n')) { - this->stopWords.push_back(stopWord); - } - if (this->verboseFlag) { - std::cout << "Read stop words, lang code:" << languageCode << ", count:" << this->stopWords.size() << std::endl; - } - } #pragma mark - Extractor diff --git a/src/xapianSearcher.cpp b/src/xapianSearcher.cpp index f636336d0..7e0fab28f 100644 --- a/src/xapianSearcher.cpp +++ b/src/xapianSearcher.cpp @@ -69,6 +69,7 @@ std::map 
read_valuesmap(const std::string &s) { } this->valuesmap = read_valuesmap(this->readableDatabase.get_metadata("valuesmap")); this->language = this->readableDatabase.get_metadata("language"); + this->stopwords = this->readableDatabase.get_metadata("stopwords"); setup_queryParser(); } @@ -95,6 +96,16 @@ std::map read_valuesmap(const std::string &s) { std::cout << "No steemming for language '" << languageLocale.getLanguage() << "'" << std::endl; } } + + if ( ! stopwords.empty() ) + { + std::string stopWord; + std::istringstream file(this->stopwords); + while (std::getline(file, stopWord, '\n')) { + this->stopper.add(stopWord); + } + queryParser.set_stopper(&(this->stopper)); + } } /* Search strings in the database */