Language code conversion via ICU

The language code is converted from ISO 639-3 to ISO 639 (which is
understood by Xapian) via ICU. The previous approach via an explicit
map had its advantages, since Xapian has more than one stemmer
implementation for some languages (selectable via Xapian-specific
identifiers). This commit relies on the defaults associated with the
ISO 639 language codes.
This commit is contained in:
Veloman Yunkan 2021-03-10 16:38:08 +04:00 committed by Matthieu Gautier
parent 09233bf4f3
commit e214efecd4
1 changed file with 5 additions and 9 deletions

View File

@ -30,6 +30,7 @@
#include <pugixml.hpp> #include <pugixml.hpp>
#include <algorithm> #include <algorithm>
#include <set> #include <set>
#include <unicode/locid.h>
namespace kiwix namespace kiwix
{ {
@ -37,13 +38,8 @@ namespace kiwix
namespace namespace
{ {
const std::map<std::string, std::string> iso639_3ToXapian { std::string iso639_3ToXapian(const std::string& lang) {
{"deu", "german" }, return icu::Locale(lang.c_str()).getLanguage();
{"eng", "english" },
{"fra", "french" },
{"hye", "armenian"},
{"rus", "russian" },
{"spa", "spanish" },
}; };
std::string normalizeText(const std::string& text, const std::string& language) std::string normalizeText(const std::string& text, const std::string& language)
@ -260,7 +256,7 @@ void Library::updateBookDB(const Book& book)
Xapian::TermGenerator indexer; Xapian::TermGenerator indexer;
const std::string lang = book.getLanguage(); const std::string lang = book.getLanguage();
try { try {
stemmer = Xapian::Stem(iso639_3ToXapian.at(lang)); stemmer = Xapian::Stem(iso639_3ToXapian(lang));
indexer.set_stemmer(stemmer); indexer.set_stemmer(stemmer);
indexer.set_stemming_strategy(Xapian::TermGenerator::STEM_SOME); indexer.set_stemming_strategy(Xapian::TermGenerator::STEM_SOME);
} catch (...) {} } catch (...) {}
@ -301,7 +297,7 @@ Library::BookIdCollection Library::getBooksByTitleOrDescription(const Filter& fi
: 0; : 0;
// Language assumed for the query is not known for sure so stemming // Language assumed for the query is not known for sure so stemming
// is not applied // is not applied
//queryParser.set_stemmer(Xapian::Stem(iso639_3ToXapian.at(???))); //queryParser.set_stemmer(Xapian::Stem(iso639_3ToXapian(???)));
//queryParser.set_stemming_strategy(Xapian::QueryParser::STEM_SOME); //queryParser.set_stemming_strategy(Xapian::QueryParser::STEM_SOME);
const auto flags = Xapian::QueryParser::FLAG_PHRASE const auto flags = Xapian::QueryParser::FLAG_PHRASE
| Xapian::QueryParser::FLAG_BOOLEAN | Xapian::QueryParser::FLAG_BOOLEAN