mirror of https://github.com/kiwix/libkiwix.git
Initial version of Xapian-based catalog search
This commit is contained in:
parent
a17fc0ef2d
commit
a599fb3892
|
@ -28,6 +28,7 @@
|
||||||
#include "book.h"
|
#include "book.h"
|
||||||
#include "bookmark.h"
|
#include "bookmark.h"
|
||||||
#include "common.h"
|
#include "common.h"
|
||||||
|
#include <xapian.h>
|
||||||
|
|
||||||
#define KIWIX_LIBRARY_VERSION "20110515"
|
#define KIWIX_LIBRARY_VERSION "20110515"
|
||||||
|
|
||||||
|
@ -106,6 +107,7 @@ class Filter {
|
||||||
Filter& name(std::string name);
|
Filter& name(std::string name);
|
||||||
|
|
||||||
bool hasQuery() const;
|
bool hasQuery() const;
|
||||||
|
const std::string& getQuery() const { return _query; }
|
||||||
|
|
||||||
bool accept(const Book& book) const;
|
bool accept(const Book& book) const;
|
||||||
bool acceptByQueryOnly(const Book& book) const;
|
bool acceptByQueryOnly(const Book& book) const;
|
||||||
|
@ -121,6 +123,7 @@ class Library
|
||||||
std::map<std::string, kiwix::Book> m_books;
|
std::map<std::string, kiwix::Book> m_books;
|
||||||
std::map<std::string, std::shared_ptr<Reader>> m_readers;
|
std::map<std::string, std::shared_ptr<Reader>> m_readers;
|
||||||
std::vector<kiwix::Bookmark> m_bookmarks;
|
std::vector<kiwix::Bookmark> m_bookmarks;
|
||||||
|
Xapian::WritableDatabase m_bookDB;
|
||||||
|
|
||||||
public:
|
public:
|
||||||
typedef std::vector<std::string> BookIdCollection;
|
typedef std::vector<std::string> BookIdCollection;
|
||||||
|
@ -295,6 +298,7 @@ class Library
|
||||||
|
|
||||||
private: // functions
|
private: // functions
|
||||||
BookIdCollection getBooksByTitleOrDescription(const Filter& filter);
|
BookIdCollection getBooksByTitleOrDescription(const Filter& filter);
|
||||||
|
void updateBookDB(const Book& book);
|
||||||
};
|
};
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
|
@ -34,10 +34,31 @@
|
||||||
namespace kiwix
|
namespace kiwix
|
||||||
{
|
{
|
||||||
|
|
||||||
|
// File-local helpers used when indexing books into the Xapian database.
namespace
|
||||||
|
{
|
||||||
|
|
||||||
|
// Maps a book's language code (ISO 639-3, going by the variable name) to the
// language name handed to Xapian::Stem in Library::updateBookDB().
// Languages missing from this table are indexed without stemming there.
const std::map<std::string, std::string> iso639_3ToXapian {
|
||||||
|
{"deu", "german" },
|
||||||
|
{"eng", "english" },
|
||||||
|
{"fra", "french" },
|
||||||
|
{"hye", "armenian"},
|
||||||
|
{"rus", "russian" },
|
||||||
|
{"spa", "spanish" },
|
||||||
|
};
|
||||||
|
|
||||||
|
// Returns `text` passed through removeAccents().
// NOTE(review): `language` is currently unused; presumably reserved for
// language-specific normalization later - confirm.
std::string normalizeText(const std::string& text, const std::string& language)
|
||||||
|
{
|
||||||
|
return removeAccents(text);
|
||||||
|
}
|
||||||
|
|
||||||
|
} // unnamed namespace
|
||||||
|
|
||||||
/* Constructor */
|
/* Constructor */
|
||||||
Library::Library()
|
Library::Library()
|
||||||
|
// The book index is opened with an empty path and Xapian's in-memory backend flag.
: m_bookDB("", Xapian::DB_BACKEND_INMEMORY)
|
||||||
{
|
{
|
||||||
}
|
}
|
||||||
|
|
||||||
/* Destructor */
|
/* Destructor */
|
||||||
Library::~Library()
|
Library::~Library()
|
||||||
{
|
{
|
||||||
|
@ -47,6 +68,7 @@ Library::~Library()
|
||||||
bool Library::addBook(const Book& book)
|
bool Library::addBook(const Book& book)
|
||||||
{
|
{
|
||||||
/* Try to find it */
|
/* Try to find it */
|
||||||
|
updateBookDB(book);
|
||||||
try {
|
try {
|
||||||
auto& oldbook = m_books.at(book.getId());
|
auto& oldbook = m_books.at(book.getId());
|
||||||
oldbook.update(book);
|
oldbook.update(book);
|
||||||
|
@ -232,14 +254,64 @@ Library::BookIdCollection Library::filter(const std::string& search)
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
// Adds or replaces the Xapian document describing `book` in m_bookDB.
// The document stores the accent-stripped title and description as values
// 0 and 1 and the book id as its data. Title and description are indexed
// both under field prefixes ("S", "XD") and without a prefix.
void Library::updateBookDB(const Book& book)
|
||||||
|
{
|
||||||
|
Xapian::Stem stemmer;
|
||||||
|
Xapian::TermGenerator indexer;
|
||||||
|
const std::string lang = book.getLanguage();
|
||||||
|
// Best effort: enable stemming only when the language is in iso639_3ToXapian.
// map::at() throws for an unknown language; any exception thrown in this
// block is swallowed and the indexer is then used without a stemmer.
try {
|
||||||
|
stemmer = Xapian::Stem(iso639_3ToXapian.at(lang));
|
||||||
|
indexer.set_stemmer(stemmer);
|
||||||
|
indexer.set_stemming_strategy(Xapian::TermGenerator::STEM_SOME);
|
||||||
|
} catch (...) {}
|
||||||
|
Xapian::Document doc;
|
||||||
|
indexer.set_document(doc);
|
||||||
|
|
||||||
|
const std::string title = normalizeText(book.getTitle(), lang);
|
||||||
|
const std::string desc = normalizeText(book.getDescription(), lang);
|
||||||
|
// Value slot 0 holds the normalized title, slot 1 the normalized description.
doc.add_value(0, title);
|
||||||
|
doc.add_value(1, desc);
|
||||||
|
// The document data is the book id; getBooksByTitleOrDescription() reads it back.
doc.set_data(book.getId());
|
||||||
|
|
||||||
|
// Prefixed terms; the same prefixes are registered with the query parser
// as "title" ("S") and "description" ("XD").
indexer.index_text(title, 1, "S");
|
||||||
|
indexer.index_text(desc, 1, "XD");
|
||||||
|
|
||||||
|
// Index fields without prefixes for general search
|
||||||
|
indexer.index_text(title);
|
||||||
|
// NOTE(review): presumably leaves a term-position gap so that a phrase
// cannot match across the title/description boundary - confirm against
// the Xapian TermGenerator documentation.
indexer.increase_termpos();
|
||||||
|
indexer.index_text(desc);
|
||||||
|
|
||||||
|
// "Q" + book id is added as a boolean term and used as the key for
// replace_document().
const std::string idterm = "Q" + book.getId();
|
||||||
|
doc.add_boolean_term(idterm);
|
||||||
|
m_bookDB.replace_document(idterm, doc);
|
||||||
|
}
|
||||||
|
|
||||||
Library::BookIdCollection Library::getBooksByTitleOrDescription(const Filter& filter)
|
Library::BookIdCollection Library::getBooksByTitleOrDescription(const Filter& filter)
|
||||||
{
|
{
|
||||||
|
// Without a text query there is nothing to search for: return getBooksIds() as is.
if ( !filter.hasQuery() )
|
||||||
|
return getBooksIds();
|
||||||
|
|
||||||
BookIdCollection bookIds;
|
BookIdCollection bookIds;
|
||||||
for(auto& pair:m_books) {
|
// Parse the filter's query text with Xapian; OP_AND is the default operator
// for combining terms.
Xapian::QueryParser queryParser;
|
||||||
if(filter.acceptByQueryOnly(pair.second)) {
|
queryParser.set_default_op(Xapian::Query::OP_AND);
|
||||||
bookIds.push_back(pair.first);
|
// Query field names "title" and "description" map to the term prefixes
// ("S", "XD") under which updateBookDB() indexes those fields.
queryParser.add_prefix("title", "S");
|
||||||
}
|
queryParser.add_prefix("description", "XD");
|
||||||
|
// Language assumed for the query is not known for sure so stemming
|
||||||
|
// is not applied
|
||||||
|
//queryParser.set_stemmer(Xapian::Stem(iso639_3ToXapian.at(???)));
|
||||||
|
//queryParser.set_stemming_strategy(Xapian::QueryParser::STEM_SOME);
|
||||||
|
// Parser features enabled: phrases, boolean operators, +/- (love/hate)
// terms and wildcards.
const auto flags = Xapian::QueryParser::FLAG_PHRASE
|
||||||
|
| Xapian::QueryParser::FLAG_BOOLEAN
|
||||||
|
| Xapian::QueryParser::FLAG_LOVEHATE
|
||||||
|
| Xapian::QueryParser::FLAG_WILDCARD;
|
||||||
|
const auto query = queryParser.parse_query(filter.getQuery(), flags);
|
||||||
|
Xapian::Enquire enquire(m_bookDB);
|
||||||
|
enquire.set_query(query);
|
||||||
|
// Ask for matches starting at offset 0, at most m_books.size() of them.
const auto results = enquire.get_mset(0, m_books.size());
|
||||||
|
for ( auto it = results.begin(); it != results.end(); ++it ) {
|
||||||
|
// Each document's data is the book id stored by updateBookDB().
bookIds.push_back(it.get_document().get_data());
|
||||||
}
|
}
|
||||||
|
|
||||||
return bookIds;
|
return bookIds;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
|
@ -263,7 +263,7 @@ TEST_F(LibraryTest, filterCheck)
|
||||||
bookIds = lib.filter(kiwix::Filter().query("folklore"));
|
bookIds = lib.filter(kiwix::Filter().query("folklore"));
|
||||||
EXPECT_EQ(bookIds.size(), 1U);
|
EXPECT_EQ(bookIds.size(), 1U);
|
||||||
|
|
||||||
bookIds = lib.filter(kiwix::Filter().query("Wiki"));
|
bookIds = lib.filter(kiwix::Filter().query("Wiki*"));
|
||||||
EXPECT_EQ(bookIds.size(), 4U);
|
EXPECT_EQ(bookIds.size(), 4U);
|
||||||
|
|
||||||
bookIds = lib.filter(kiwix::Filter().query("Wiki").creator("Wiki"));
|
bookIds = lib.filter(kiwix::Filter().query("Wiki").creator("Wiki"));
|
||||||
|
|
|
@ -733,7 +733,26 @@ TEST_F(LibraryServerTest, catalog_searchdescription_xml)
|
||||||
);
|
);
|
||||||
}
|
}
|
||||||
|
|
||||||
TEST_F(LibraryServerTest, catalog_search_by_text)
|
// Searches the catalog with a double-quoted query and expects exactly two
// entries in the feed. The unquoted variant of this search
// (catalog_search_by_words) additionally expects CHARLES_RAY_CATALOG_ENTRY.
TEST_F(LibraryServerTest, catalog_search_by_phrase)
|
||||||
|
{
|
||||||
|
// The double quotes are part of the q= parameter value.
const auto r = zfs1_->GET("/catalog/search?q=\"ray%20charles\"");
|
||||||
|
EXPECT_EQ(r->status, 200);
|
||||||
|
EXPECT_EQ(maskVariableOPDSFeedData(r->body),
|
||||||
|
OPDS_FEED_TAG
|
||||||
|
" <id>12345678-90ab-cdef-1234-567890abcdef</id>\n"
|
||||||
|
" <title>Search result for \"ray charles\"</title>\n"
|
||||||
|
" <updated>YYYY-MM-DDThh:mm:ssZ</updated>\n"
|
||||||
|
" <totalResults>2</totalResults>\n"
|
||||||
|
" <startIndex>0</startIndex>\n"
|
||||||
|
" <itemsPerPage>2</itemsPerPage>\n"
|
||||||
|
CATALOG_LINK_TAGS
|
||||||
|
RAY_CHARLES_CATALOG_ENTRY
|
||||||
|
UNCATEGORIZED_RAY_CHARLES_CATALOG_ENTRY
|
||||||
|
"</feed>\n"
|
||||||
|
);
|
||||||
|
}
|
||||||
|
|
||||||
|
TEST_F(LibraryServerTest, catalog_search_by_words)
|
||||||
{
|
{
|
||||||
const auto r = zfs1_->GET("/catalog/search?q=ray%20charles");
|
const auto r = zfs1_->GET("/catalog/search?q=ray%20charles");
|
||||||
EXPECT_EQ(r->status, 200);
|
EXPECT_EQ(r->status, 200);
|
||||||
|
@ -742,12 +761,13 @@ TEST_F(LibraryServerTest, catalog_search_by_text)
|
||||||
" <id>12345678-90ab-cdef-1234-567890abcdef</id>\n"
|
" <id>12345678-90ab-cdef-1234-567890abcdef</id>\n"
|
||||||
" <title>Search result for ray charles</title>\n"
|
" <title>Search result for ray charles</title>\n"
|
||||||
" <updated>YYYY-MM-DDThh:mm:ssZ</updated>\n"
|
" <updated>YYYY-MM-DDThh:mm:ssZ</updated>\n"
|
||||||
" <totalResults>2</totalResults>\n"
|
" <totalResults>3</totalResults>\n"
|
||||||
" <startIndex>0</startIndex>\n"
|
" <startIndex>0</startIndex>\n"
|
||||||
" <itemsPerPage>2</itemsPerPage>\n"
|
" <itemsPerPage>3</itemsPerPage>\n"
|
||||||
CATALOG_LINK_TAGS
|
CATALOG_LINK_TAGS
|
||||||
RAY_CHARLES_CATALOG_ENTRY
|
RAY_CHARLES_CATALOG_ENTRY
|
||||||
UNCATEGORIZED_RAY_CHARLES_CATALOG_ENTRY
|
UNCATEGORIZED_RAY_CHARLES_CATALOG_ENTRY
|
||||||
|
CHARLES_RAY_CATALOG_ENTRY
|
||||||
"</feed>\n"
|
"</feed>\n"
|
||||||
);
|
);
|
||||||
}
|
}
|
||||||
|
|
Loading…
Reference in New Issue