mirror of https://github.com/kiwix/libkiwix.git
Merge pull request #35 from kiwix/stem_stop
Let's use stem and stop words information (if) present in the database.
This commit is contained in:
commit
9771506985
|
@ -78,10 +78,6 @@ namespace kiwix {
|
|||
virtual void flush() = 0;
|
||||
virtual void indexingPostlude(const string indexPath) = 0;
|
||||
|
||||
/* Stop words */
|
||||
std::vector<std::string> stopWords;
|
||||
void readStopWords(const string languageCode);
|
||||
|
||||
/* Others */
|
||||
unsigned int countWords(const string &text);
|
||||
|
||||
|
|
|
@ -70,10 +70,15 @@ namespace kiwix {
|
|||
protected:
|
||||
void closeIndex();
|
||||
void openIndex(const string &xapianDirectoryPath);
|
||||
void setup_queryParser();
|
||||
|
||||
Reader* reader;
|
||||
Xapian::Database readableDatabase;
|
||||
std::string language;
|
||||
std::string stopwords;
|
||||
Xapian::QueryParser queryParser;
|
||||
Xapian::Stem stemmer;
|
||||
Xapian::SimpleStopper stopper;
|
||||
Xapian::MSet results;
|
||||
Xapian::MSetIterator current_result;
|
||||
std::map<std::string, int> valuesmap;
|
||||
|
|
|
@ -63,21 +63,6 @@ namespace kiwix {
|
|||
Indexer::~Indexer() {
|
||||
}
|
||||
|
||||
/* Read the stopwords */
|
||||
void Indexer::readStopWords(const string languageCode) {
|
||||
std::string stopWord;
|
||||
std::istringstream file(getResource("stopwords/" + languageCode));
|
||||
|
||||
this->stopWords.clear();
|
||||
|
||||
while (getline(file, stopWord, '\n')) {
|
||||
this->stopWords.push_back(stopWord);
|
||||
}
|
||||
|
||||
if (this->verboseFlag) {
|
||||
std::cout << "Read stop words, lang code:" << languageCode << ", count:" << this->stopWords.size() << std::endl;
|
||||
}
|
||||
}
|
||||
|
||||
#pragma mark - Extractor
|
||||
|
||||
|
|
|
@ -25,6 +25,7 @@
|
|||
#include <zim/error.h>
|
||||
#include <sys/types.h>
|
||||
#include <unistd.h>
|
||||
#include <unicode/locid.h>
|
||||
|
||||
#include <vector>
|
||||
|
||||
|
@ -46,8 +47,8 @@ std::map<std::string, int> read_valuesmap(const std::string &s) {
|
|||
/* Constructor */
|
||||
XapianSearcher::XapianSearcher(const string &xapianDirectoryPath, Reader* reader)
|
||||
: Searcher(),
|
||||
reader(reader),
|
||||
stemmer(Xapian::Stem("english")) {
|
||||
reader(reader)
|
||||
{
|
||||
this->openIndex(xapianDirectoryPath);
|
||||
}
|
||||
|
||||
|
@ -67,6 +68,9 @@ std::map<std::string, int> read_valuesmap(const std::string &s) {
|
|||
this->readableDatabase = Xapian::Database(directoryPath);
|
||||
}
|
||||
this->valuesmap = read_valuesmap(this->readableDatabase.get_metadata("valuesmap"));
|
||||
this->language = this->readableDatabase.get_metadata("language");
|
||||
this->stopwords = this->readableDatabase.get_metadata("stopwords");
|
||||
setup_queryParser();
|
||||
}
|
||||
|
||||
/* Close Xapian writable database */
|
||||
|
@ -74,11 +78,40 @@ std::map<std::string, int> read_valuesmap(const std::string &s) {
|
|||
return;
|
||||
}
|
||||
|
||||
void XapianSearcher::setup_queryParser()
|
||||
{
|
||||
queryParser.set_database(readableDatabase);
|
||||
if ( ! language.empty() )
|
||||
{
|
||||
/* Build ICU Local object to retrieve ISO-639 language code (from
|
||||
ISO-639-3) */
|
||||
icu::Locale languageLocale(language.c_str());
|
||||
|
||||
/* Configuring language base steemming */
|
||||
try {
|
||||
stemmer = Xapian::Stem(languageLocale.getLanguage());
|
||||
queryParser.set_stemmer(stemmer);
|
||||
queryParser.set_stemming_strategy(Xapian::QueryParser::STEM_ALL);
|
||||
} catch (...) {
|
||||
std::cout << "No steemming for language '" << languageLocale.getLanguage() << "'" << std::endl;
|
||||
}
|
||||
}
|
||||
|
||||
if ( ! stopwords.empty() )
|
||||
{
|
||||
std::string stopWord;
|
||||
std::istringstream file(this->stopwords);
|
||||
while (std::getline(file, stopWord, '\n')) {
|
||||
this->stopper.add(stopWord);
|
||||
}
|
||||
queryParser.set_stopper(&(this->stopper));
|
||||
}
|
||||
}
|
||||
|
||||
/* Search strings in the database */
|
||||
void XapianSearcher::searchInIndex(string &search, const unsigned int resultStart,
|
||||
const unsigned int resultEnd, const bool verbose) {
|
||||
/* Create the query */
|
||||
Xapian::QueryParser queryParser;
|
||||
Xapian::Query query = queryParser.parse_query(search);
|
||||
|
||||
/* Create the enquire object */
|
||||
|
|
Loading…
Reference in New Issue