Initial version of Xapian-based catalog search

This commit is contained in:
Veloman Yunkan 2021-03-07 01:11:04 +04:00 committed by Matthieu Gautier
parent a17fc0ef2d
commit a599fb3892
4 changed files with 104 additions and 8 deletions

View File

@ -28,6 +28,7 @@
#include "book.h" #include "book.h"
#include "bookmark.h" #include "bookmark.h"
#include "common.h" #include "common.h"
#include <xapian.h>
#define KIWIX_LIBRARY_VERSION "20110515" #define KIWIX_LIBRARY_VERSION "20110515"
@ -106,6 +107,7 @@ class Filter {
Filter& name(std::string name); Filter& name(std::string name);
bool hasQuery() const; bool hasQuery() const;
const std::string& getQuery() const { return _query; }
bool accept(const Book& book) const; bool accept(const Book& book) const;
bool acceptByQueryOnly(const Book& book) const; bool acceptByQueryOnly(const Book& book) const;
@ -121,6 +123,7 @@ class Library
std::map<std::string, kiwix::Book> m_books; std::map<std::string, kiwix::Book> m_books;
std::map<std::string, std::shared_ptr<Reader>> m_readers; std::map<std::string, std::shared_ptr<Reader>> m_readers;
std::vector<kiwix::Bookmark> m_bookmarks; std::vector<kiwix::Bookmark> m_bookmarks;
Xapian::WritableDatabase m_bookDB;
public: public:
typedef std::vector<std::string> BookIdCollection; typedef std::vector<std::string> BookIdCollection;
@ -295,6 +298,7 @@ class Library
private: // functions private: // functions
BookIdCollection getBooksByTitleOrDescription(const Filter& filter); BookIdCollection getBooksByTitleOrDescription(const Filter& filter);
void updateBookDB(const Book& book);
}; };
} }

View File

@ -34,10 +34,31 @@
namespace kiwix namespace kiwix
{ {
namespace
{

// Maps ISO 639-3 language codes to the language names understood by
// Xapian::Stem. A language that is missing from this table has no
// associated stemmer.
const std::map<std::string, std::string> iso639_3ToXapian {
  {"deu", "german"  },
  {"eng", "english" },
  {"fra", "french"  },
  {"hye", "armenian"},
  {"rus", "russian" },
  {"spa", "spanish" },
};

// Returns `text` with accents removed (delegates to removeAccents()).
//
// The second parameter is the language of the text. It is currently not
// consulted, so it is left unnamed to make that explicit and to avoid an
// unused-parameter warning; callers keep passing it as before.
std::string normalizeText(const std::string& text, const std::string& /*language*/)
{
  return removeAccents(text);
}

} // unnamed namespace
/* Constructor */ /* Constructor */
Library::Library() Library::Library()
: m_bookDB("", Xapian::DB_BACKEND_INMEMORY)
{ {
} }
/* Destructor */ /* Destructor */
Library::~Library() Library::~Library()
{ {
@ -47,6 +68,7 @@ Library::~Library()
bool Library::addBook(const Book& book) bool Library::addBook(const Book& book)
{ {
/* Try to find it */ /* Try to find it */
updateBookDB(book);
try { try {
auto& oldbook = m_books.at(book.getId()); auto& oldbook = m_books.at(book.getId());
oldbook.update(book); oldbook.update(book);
@ -232,14 +254,64 @@ Library::BookIdCollection Library::filter(const std::string& search)
} }
void Library::updateBookDB(const Book& book)
{
Xapian::Stem stemmer;
Xapian::TermGenerator indexer;
const std::string lang = book.getLanguage();
try {
stemmer = Xapian::Stem(iso639_3ToXapian.at(lang));
indexer.set_stemmer(stemmer);
indexer.set_stemming_strategy(Xapian::TermGenerator::STEM_SOME);
} catch (...) {}
Xapian::Document doc;
indexer.set_document(doc);
const std::string title = normalizeText(book.getTitle(), lang);
const std::string desc = normalizeText(book.getDescription(), lang);
doc.add_value(0, title);
doc.add_value(1, desc);
doc.set_data(book.getId());
indexer.index_text(title, 1, "S");
indexer.index_text(desc, 1, "XD");
// Index fields without prefixes for general search
indexer.index_text(title);
indexer.increase_termpos();
indexer.index_text(desc);
const std::string idterm = "Q" + book.getId();
doc.add_boolean_term(idterm);
m_bookDB.replace_document(idterm, doc);
}
Library::BookIdCollection Library::getBooksByTitleOrDescription(const Filter& filter) Library::BookIdCollection Library::getBooksByTitleOrDescription(const Filter& filter)
{ {
if ( !filter.hasQuery() )
return getBooksIds();
BookIdCollection bookIds; BookIdCollection bookIds;
for(auto& pair:m_books) { Xapian::QueryParser queryParser;
if(filter.acceptByQueryOnly(pair.second)) { queryParser.set_default_op(Xapian::Query::OP_AND);
bookIds.push_back(pair.first); queryParser.add_prefix("title", "S");
} queryParser.add_prefix("description", "XD");
// Language assumed for the query is not known for sure so stemming
// is not applied
//queryParser.set_stemmer(Xapian::Stem(iso639_3ToXapian.at(???)));
//queryParser.set_stemming_strategy(Xapian::QueryParser::STEM_SOME);
const auto flags = Xapian::QueryParser::FLAG_PHRASE
| Xapian::QueryParser::FLAG_BOOLEAN
| Xapian::QueryParser::FLAG_LOVEHATE
| Xapian::QueryParser::FLAG_WILDCARD;
const auto query = queryParser.parse_query(filter.getQuery(), flags);
Xapian::Enquire enquire(m_bookDB);
enquire.set_query(query);
const auto results = enquire.get_mset(0, m_books.size());
for ( auto it = results.begin(); it != results.end(); ++it ) {
bookIds.push_back(it.get_document().get_data());
} }
return bookIds; return bookIds;
} }

View File

@ -263,7 +263,7 @@ TEST_F(LibraryTest, filterCheck)
bookIds = lib.filter(kiwix::Filter().query("folklore")); bookIds = lib.filter(kiwix::Filter().query("folklore"));
EXPECT_EQ(bookIds.size(), 1U); EXPECT_EQ(bookIds.size(), 1U);
bookIds = lib.filter(kiwix::Filter().query("Wiki")); bookIds = lib.filter(kiwix::Filter().query("Wiki*"));
EXPECT_EQ(bookIds.size(), 4U); EXPECT_EQ(bookIds.size(), 4U);
bookIds = lib.filter(kiwix::Filter().query("Wiki").creator("Wiki")); bookIds = lib.filter(kiwix::Filter().query("Wiki").creator("Wiki"));

View File

@ -733,7 +733,26 @@ TEST_F(LibraryServerTest, catalog_searchdescription_xml)
); );
} }
TEST_F(LibraryServerTest, catalog_search_by_text) TEST_F(LibraryServerTest, catalog_search_by_phrase)
{
const auto r = zfs1_->GET("/catalog/search?q=\"ray%20charles\"");
EXPECT_EQ(r->status, 200);
EXPECT_EQ(maskVariableOPDSFeedData(r->body),
OPDS_FEED_TAG
" <id>12345678-90ab-cdef-1234-567890abcdef</id>\n"
" <title>Search result for \"ray charles\"</title>\n"
" <updated>YYYY-MM-DDThh:mm:ssZ</updated>\n"
" <totalResults>2</totalResults>\n"
" <startIndex>0</startIndex>\n"
" <itemsPerPage>2</itemsPerPage>\n"
CATALOG_LINK_TAGS
RAY_CHARLES_CATALOG_ENTRY
UNCATEGORIZED_RAY_CHARLES_CATALOG_ENTRY
"</feed>\n"
);
}
TEST_F(LibraryServerTest, catalog_search_by_words)
{ {
const auto r = zfs1_->GET("/catalog/search?q=ray%20charles"); const auto r = zfs1_->GET("/catalog/search?q=ray%20charles");
EXPECT_EQ(r->status, 200); EXPECT_EQ(r->status, 200);
@ -742,12 +761,13 @@ TEST_F(LibraryServerTest, catalog_search_by_text)
" <id>12345678-90ab-cdef-1234-567890abcdef</id>\n" " <id>12345678-90ab-cdef-1234-567890abcdef</id>\n"
" <title>Search result for ray charles</title>\n" " <title>Search result for ray charles</title>\n"
" <updated>YYYY-MM-DDThh:mm:ssZ</updated>\n" " <updated>YYYY-MM-DDThh:mm:ssZ</updated>\n"
" <totalResults>2</totalResults>\n" " <totalResults>3</totalResults>\n"
" <startIndex>0</startIndex>\n" " <startIndex>0</startIndex>\n"
" <itemsPerPage>2</itemsPerPage>\n" " <itemsPerPage>3</itemsPerPage>\n"
CATALOG_LINK_TAGS CATALOG_LINK_TAGS
RAY_CHARLES_CATALOG_ENTRY RAY_CHARLES_CATALOG_ENTRY
UNCATEGORIZED_RAY_CHARLES_CATALOG_ENTRY UNCATEGORIZED_RAY_CHARLES_CATALOG_ENTRY
CHARLES_RAY_CATALOG_ENTRY
"</feed>\n" "</feed>\n"
); );
} }