Merge pull request #35 from kiwix/stem_stop

Use the stemming and stop-word information stored in the database, when present.
This commit is contained in:
Matthieu Gautier 2017-04-04 17:07:48 +02:00 committed by GitHub
commit 9771506985
4 changed files with 41 additions and 22 deletions

4
include/indexer.h Executable file → Normal file
View File

@ -78,10 +78,6 @@ namespace kiwix {
/* Pure virtual hooks; their implementations are not part of this excerpt. */
virtual void flush() = 0;
virtual void indexingPostlude(const string indexPath) = 0;
/* Stop words: list filled by readStopWords() with one entry per line of
   the "stopwords/<languageCode>" resource. */
std::vector<std::string> stopWords;
void readStopWords(const string languageCode);
/* Others */
/* NOTE(review): presumably returns the number of words found in text;
   confirm against the implementation, which is not shown here. */
unsigned int countWords(const string &text);

View File

@ -70,10 +70,15 @@ namespace kiwix {
protected:
void closeIndex();
/* Opens the database, then loads the "valuesmap", "language" and
   "stopwords" metadata and calls setup_queryParser(). */
void openIndex(const string &xapianDirectoryPath);
/* Configures queryParser (database, stemmer, stopper) from the members
   below. */
void setup_queryParser();
/* Reader pointer handed to the constructor. */
Reader* reader;
/* Database handle assigned in openIndex(). */
Xapian::Database readableDatabase;
/* Value of the "language" database metadata; empty when absent. */
std::string language;
/* Value of the "stopwords" database metadata: newline-separated words,
   empty when absent. */
std::string stopwords;
/* Parser set up once by setup_queryParser() and used to parse search
   strings. */
Xapian::QueryParser queryParser;
/* Stemmer built from the language code derived from `language`. */
Xapian::Stem stemmer;
/* Stopper filled from `stopwords`; queryParser is given a pointer to it,
   so it must stay a member. */
Xapian::SimpleStopper stopper;
/* NOTE(review): presumably the result set of the last search and the
   iterator over it; their use is not visible in this excerpt. */
Xapian::MSet results;
Xapian::MSetIterator current_result;
/* Parsed from the "valuesmap" database metadata by read_valuesmap(). */
std::map<std::string, int> valuesmap;

View File

@ -63,21 +63,6 @@ namespace kiwix {
/* Destructor: the body is empty, no explicit cleanup is performed here. */
Indexer::~Indexer() {
}
/* Load the stop word list for a language.
 *
 * Fetches the text returned by getResource("stopwords/" + languageCode),
 * replaces the content of this->stopWords with one entry per line of that
 * text and, when verboseFlag is set, reports how many entries were read.
 */
void Indexer::readStopWords(const string languageCode) {
  std::istringstream resourceStream(getResource("stopwords/" + languageCode));

  /* Drop whatever a previous call may have loaded. */
  this->stopWords.clear();

  /* One stop word per '\n'-terminated line. */
  for (std::string line; std::getline(resourceStream, line, '\n');) {
    this->stopWords.push_back(line);
  }

  if (!this->verboseFlag) {
    return;
  }

  std::cout << "Read stop words, lang code:" << languageCode << ", count:" << this->stopWords.size() << std::endl;
}
#pragma mark - Extractor

View File

@ -25,6 +25,7 @@
#include <zim/error.h>
#include <sys/types.h>
#include <unistd.h>
#include <unicode/locid.h>
#include <vector>
@ -46,8 +47,8 @@ std::map<std::string, int> read_valuesmap(const std::string &s) {
/* Constructor: keeps the given reader pointer and opens the Xapian index
   located at xapianDirectoryPath. */
XapianSearcher::XapianSearcher(const string &xapianDirectoryPath, Reader* reader)
: Searcher(),
reader(reader),
stemmer(Xapian::Stem("english")) {
reader(reader)
{
this->openIndex(xapianDirectoryPath);
}
@ -67,6 +68,9 @@ std::map<std::string, int> read_valuesmap(const std::string &s) {
this->readableDatabase = Xapian::Database(directoryPath);
}
this->valuesmap = read_valuesmap(this->readableDatabase.get_metadata("valuesmap"));
this->language = this->readableDatabase.get_metadata("language");
this->stopwords = this->readableDatabase.get_metadata("stopwords");
setup_queryParser();
}
/* Close Xapian writable database */
@ -74,11 +78,40 @@ std::map<std::string, int> read_valuesmap(const std::string &s) {
return;
}
/* Configure the member query parser from the state loaded by openIndex().
 *
 * - Binds the parser to readableDatabase.
 * - If `language` is set, derives a language code through ICU and tries to
 *   enable stemming (STEM_ALL) for it; when no stemmer can be built the
 *   parser is left without stemming and a message is printed.
 * - If `stopwords` is set, registers every non-empty line as a stop word and
 *   installs the stopper on the parser.
 */
void XapianSearcher::setup_queryParser()
{
    queryParser.set_database(readableDatabase);

    if ( ! language.empty() )
    {
        /* Build an ICU Locale object to retrieve the ISO-639 language code
           (from ISO-639-3) */
        icu::Locale languageLocale(language.c_str());

        /* Configure language-based stemming. Building the stemmer is expected
           to throw when Xapian does not know the language; in that case we
           simply keep going without stemming. */
        try {
            stemmer = Xapian::Stem(languageLocale.getLanguage());
            queryParser.set_stemmer(stemmer);
            queryParser.set_stemming_strategy(Xapian::QueryParser::STEM_ALL);
        } catch (...) {
            std::cout << "No steemming for language '" << languageLocale.getLanguage() << "'" << std::endl;
        }
    }

    if ( ! stopwords.empty() )
    {
        std::string stopWord;
        std::istringstream file(this->stopwords);
        while (std::getline(file, stopWord, '\n')) {
            /* Tolerate CRLF data: a trailing '\r' would otherwise become part
               of the stop word and prevent it from ever matching. */
            if (!stopWord.empty() && stopWord[stopWord.size() - 1] == '\r') {
                stopWord.erase(stopWord.size() - 1);
            }
            /* Blank lines carry no stop word. */
            if (stopWord.empty()) {
                continue;
            }
            this->stopper.add(stopWord);
        }
        /* The parser only keeps a pointer, hence the member stopper. */
        queryParser.set_stopper(&(this->stopper));
    }
}
/* Search strings in the database */
void XapianSearcher::searchInIndex(string &search, const unsigned int resultStart,
const unsigned int resultEnd, const bool verbose) {
/* Create the query */
Xapian::QueryParser queryParser;
Xapian::Query query = queryParser.parse_query(search);
/* Create the enquire object */