Merge pull request #460 from kiwix/xapian_based_catalog_search

This commit is contained in:
Matthieu Gautier 2021-03-17 14:45:56 +01:00 committed by GitHub
commit baed447dd3
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
5 changed files with 225 additions and 33 deletions

View File

@ -28,6 +28,7 @@
#include "book.h" #include "book.h"
#include "bookmark.h" #include "bookmark.h"
#include "common.h" #include "common.h"
#include <xapian.h>
#define KIWIX_LIBRARY_VERSION "20110515" #define KIWIX_LIBRARY_VERSION "20110515"
@ -58,6 +59,7 @@ class Filter {
std::string _creator; std::string _creator;
size_t _maxSize; size_t _maxSize;
std::string _query; std::string _query;
bool _queryIsPartial;
std::string _name; std::string _name;
public: public:
@ -102,10 +104,16 @@ class Filter {
Filter& publisher(std::string publisher); Filter& publisher(std::string publisher);
Filter& creator(std::string creator); Filter& creator(std::string creator);
Filter& maxSize(size_t size); Filter& maxSize(size_t size);
Filter& query(std::string query); Filter& query(std::string query, bool partial=true);
Filter& name(std::string name); Filter& name(std::string name);
bool hasQuery() const;
const std::string& getQuery() const { return _query; }
bool queryIsPartial() const { return _queryIsPartial; }
bool accept(const Book& book) const; bool accept(const Book& book) const;
bool acceptByQueryOnly(const Book& book) const;
bool acceptByNonQueryCriteria(const Book& book) const;
}; };
@ -117,6 +125,10 @@ class Library
std::map<std::string, kiwix::Book> m_books; std::map<std::string, kiwix::Book> m_books;
std::map<std::string, std::shared_ptr<Reader>> m_readers; std::map<std::string, std::shared_ptr<Reader>> m_readers;
std::vector<kiwix::Bookmark> m_bookmarks; std::vector<kiwix::Bookmark> m_bookmarks;
Xapian::WritableDatabase m_bookDB;
public:
typedef std::vector<std::string> BookIdCollection;
public: public:
Library(); Library();
@ -220,7 +232,7 @@ class Library
* *
* @return A list of book ids. * @return A list of book ids.
*/ */
std::vector<std::string> getBooksIds(); BookIdCollection getBooksIds();
/** /**
* Filter the library and generate a new one with the keep elements. * Filter the library and generate a new one with the keep elements.
@ -230,7 +242,7 @@ class Library
* @param search List only books with search in the title or description. * @param search List only books with search in the title or description.
* @return The list of bookIds corresponding to the query. * @return The list of bookIds corresponding to the query.
*/ */
DEPRECATED std::vector<std::string> filter(const std::string& search); DEPRECATED BookIdCollection filter(const std::string& search);
/** /**
@ -239,7 +251,7 @@ class Library
* @param filter The filter to use. * @param filter The filter to use.
* @return The list of bookIds corresponding to the filter. * @return The list of bookIds corresponding to the filter.
*/ */
std::vector<std::string> filter(const Filter& filter); BookIdCollection filter(const Filter& filter);
/** /**
@ -249,7 +261,7 @@ class Library
* @param comparator how to sort the books * @param comparator how to sort the books
* @return The sorted list of books * @return The sorted list of books
*/ */
void sort(std::vector<std::string>& bookIds, supportedListSortBy sortBy, bool ascending); void sort(BookIdCollection& bookIds, supportedListSortBy sortBy, bool ascending);
/** /**
* List books in the library. * List books in the library.
@ -273,7 +285,7 @@ class Library
* Set to 0 to cancel this filter. * Set to 0 to cancel this filter.
* @return The list of bookIds corresponding to the query. * @return The list of bookIds corresponding to the query.
*/ */
DEPRECATED std::vector<std::string> listBooksIds( DEPRECATED BookIdCollection listBooksIds(
int supportedListMode = ALL, int supportedListMode = ALL,
supportedListSortBy sortBy = UNSORTED, supportedListSortBy sortBy = UNSORTED,
const std::string& search = "", const std::string& search = "",
@ -285,7 +297,12 @@ class Library
friend class OPDSDumper; friend class OPDSDumper;
friend class libXMLDumper; friend class libXMLDumper;
private: // functions
BookIdCollection getBooksByTitleOrDescription(const Filter& filter);
void updateBookDB(const Book& book);
}; };
} }
#endif #endif

View File

@ -34,6 +34,7 @@ pugixml_dep = dependency('pugixml', static:static_deps)
libcurl_dep = dependency('libcurl', static:static_deps) libcurl_dep = dependency('libcurl', static:static_deps)
microhttpd_dep = dependency('libmicrohttpd', static:static_deps) microhttpd_dep = dependency('libmicrohttpd', static:static_deps)
zlib_dep = dependency('zlib', static:static_deps) zlib_dep = dependency('zlib', static:static_deps)
xapian_dep = dependency('xapian-core', static:static_deps)
if compiler.has_header('mustache.hpp') if compiler.has_header('mustache.hpp')
extra_include = [] extra_include = []
@ -55,7 +56,7 @@ if target_machine.system() == 'windows' and static_deps
extra_cflags += '-DCURL_STATICLIB' extra_cflags += '-DCURL_STATICLIB'
endif endif
all_deps = [thread_dep, libicu_dep, libzim_dep, pugixml_dep, libcurl_dep, microhttpd_dep, zlib_dep] all_deps = [thread_dep, libicu_dep, libzim_dep, pugixml_dep, libcurl_dep, microhttpd_dep, zlib_dep, xapian_dep]
inc = include_directories('include', extra_include) inc = include_directories('include', extra_include)
@ -74,7 +75,7 @@ subdir('static')
subdir('src') subdir('src')
subdir('test') subdir('test')
pkg_requires = ['libzim', 'icu-i18n', 'pugixml', 'libcurl', 'libmicrohttpd'] pkg_requires = ['libzim', 'icu-i18n', 'pugixml', 'libcurl', 'libmicrohttpd', 'xapian-core']
pkg_conf = configuration_data() pkg_conf = configuration_data()
pkg_conf.set('prefix', get_option('prefix')) pkg_conf.set('prefix', get_option('prefix'))

View File

@ -30,14 +30,31 @@
#include <pugixml.hpp> #include <pugixml.hpp>
#include <algorithm> #include <algorithm>
#include <set> #include <set>
#include <unicode/locid.h>
namespace kiwix namespace kiwix
{ {
namespace
{
std::string iso639_3ToXapian(const std::string& lang) {
return icu::Locale(lang.c_str()).getLanguage();
};
std::string normalizeText(const std::string& text, const std::string& language)
{
return removeAccents(text);
}
} // unnamed namespace
/* Constructor */ /* Constructor */
Library::Library() Library::Library()
: m_bookDB("", Xapian::DB_BACKEND_INMEMORY)
{ {
} }
/* Destructor */ /* Destructor */
Library::~Library() Library::~Library()
{ {
@ -47,6 +64,7 @@ Library::~Library()
bool Library::addBook(const Book& book) bool Library::addBook(const Book& book)
{ {
/* Try to find it */ /* Try to find it */
updateBookDB(book);
try { try {
auto& oldbook = m_books.at(book.getId()); auto& oldbook = m_books.at(book.getId());
oldbook.update(book); oldbook.update(book);
@ -211,9 +229,9 @@ const std::vector<kiwix::Bookmark> Library::getBookmarks(bool onlyValidBookmarks
return validBookmarks; return validBookmarks;
} }
std::vector<std::string> Library::getBooksIds() Library::BookIdCollection Library::getBooksIds()
{ {
std::vector<std::string> bookIds; BookIdCollection bookIds;
for (auto& pair: m_books) { for (auto& pair: m_books) {
bookIds.push_back(pair.first); bookIds.push_back(pair.first);
@ -222,7 +240,7 @@ std::vector<std::string> Library::getBooksIds()
return bookIds; return bookIds;
} }
std::vector<std::string> Library::filter(const std::string& search) Library::BookIdCollection Library::filter(const std::string& search)
{ {
if (search.empty()) { if (search.empty()) {
return getBooksIds(); return getBooksIds();
@ -232,16 +250,80 @@ std::vector<std::string> Library::filter(const std::string& search)
} }
std::vector<std::string> Library::filter(const Filter& filter) void Library::updateBookDB(const Book& book)
{ {
std::vector<std::string> bookIds; Xapian::Stem stemmer;
for(auto& pair:m_books) { Xapian::TermGenerator indexer;
auto book = pair.second; const std::string lang = book.getLanguage();
if(filter.accept(book)) { try {
bookIds.push_back(pair.first); stemmer = Xapian::Stem(iso639_3ToXapian(lang));
indexer.set_stemmer(stemmer);
indexer.set_stemming_strategy(Xapian::TermGenerator::STEM_SOME);
} catch (...) {}
Xapian::Document doc;
indexer.set_document(doc);
const std::string title = normalizeText(book.getTitle(), lang);
const std::string desc = normalizeText(book.getDescription(), lang);
doc.add_value(0, title);
doc.add_value(1, desc);
doc.set_data(book.getId());
indexer.index_text(title, 1, "S");
indexer.index_text(desc, 1, "XD");
// Index fields without prefixes for general search
indexer.index_text(title);
indexer.increase_termpos();
indexer.index_text(desc);
const std::string idterm = "Q" + book.getId();
doc.add_boolean_term(idterm);
m_bookDB.replace_document(idterm, doc);
}
// Returns the ids of the books whose indexed title/description match the
// free-text query carried by `filter`.
//
//  - If the filter has no query, or its query string is empty, the ids of
//    all the books of the library are returned (an empty string parses to
//    an empty Xapian query, which would otherwise match no document).
//  - The query may use the "title:" and "description:" prefixes, phrases,
//    boolean operators, +/- terms and wildcards. Terms are combined with
//    AND by default. The last word is treated as partial when
//    filter.queryIsPartial() is true.
//  - Xapian::QueryParser::parse_query() signals a malformed query by
//    throwing Xapian::QueryParserError; the exception is not caught here.
Library::BookIdCollection Library::getBooksByTitleOrDescription(const Filter& filter)
{
  if ( !filter.hasQuery() || filter.getQuery().empty() )
    return getBooksIds();

  Xapian::QueryParser queryParser;
  queryParser.set_default_op(Xapian::Query::OP_AND);
  queryParser.add_prefix("title", "S");
  queryParser.add_prefix("description", "XD");
  const auto partialQueryFlag = filter.queryIsPartial()
                              ? Xapian::QueryParser::FLAG_PARTIAL
                              : 0;
  // Language assumed for the query is not known for sure so stemming
  // is not applied
  //queryParser.set_stemmer(Xapian::Stem(iso639_3ToXapian(???)));
  //queryParser.set_stemming_strategy(Xapian::QueryParser::STEM_SOME);
  const auto flags = Xapian::QueryParser::FLAG_PHRASE
                   | Xapian::QueryParser::FLAG_BOOLEAN
                   | Xapian::QueryParser::FLAG_LOVEHATE
                   | Xapian::QueryParser::FLAG_WILDCARD
                   | partialQueryFlag;

  // Titles and descriptions are passed through normalizeText() before being
  // indexed, so the query text has to receive the same treatment; otherwise
  // a query containing accented characters could never match. The language
  // of the query is unknown, hence the empty language argument.
  const std::string queryText = normalizeText(filter.getQuery(), std::string());
  const auto query = queryParser.parse_query(queryText, flags);

  Xapian::Enquire enquire(m_bookDB);
  enquire.set_query(query);
  const auto results = enquire.get_mset(0, m_books.size());

  BookIdCollection bookIds;
  bookIds.reserve(results.size());
  for ( auto it = results.begin(); it != results.end(); ++it ) {
    // The book id was stored as the document data at indexing time.
    bookIds.push_back(it.get_document().get_data());
  }
  return bookIds;
}
Library::BookIdCollection Library::filter(const Filter& filter)
{
BookIdCollection result;
for(auto id : getBooksByTitleOrDescription(filter)) {
if(filter.acceptByNonQueryCriteria(m_books[id])) {
result.push_back(id);
} }
} }
return bookIds; return result;
} }
template<supportedListSortBy SORT> template<supportedListSortBy SORT>
@ -303,7 +385,7 @@ std::string Comparator<PUBLISHER>::get_key(const std::string& id)
return lib->getBookById(id).getPublisher(); return lib->getBookById(id).getPublisher();
} }
void Library::sort(std::vector<std::string>& bookIds, supportedListSortBy sort, bool ascending) void Library::sort(BookIdCollection& bookIds, supportedListSortBy sort, bool ascending)
{ {
switch(sort) { switch(sort) {
case TITLE: case TITLE:
@ -327,7 +409,7 @@ void Library::sort(std::vector<std::string>& bookIds, supportedListSortBy sort,
} }
std::vector<std::string> Library::listBooksIds( Library::BookIdCollection Library::listBooksIds(
int mode, int mode,
supportedListSortBy sortBy, supportedListSortBy sortBy,
const std::string& search, const std::string& search,
@ -479,9 +561,10 @@ Filter& Filter::maxSize(size_t maxSize)
return *this; return *this;
} }
Filter& Filter::query(std::string query) Filter& Filter::query(std::string query, bool partial)
{ {
_query = query; _query = query;
_queryIsPartial = partial;
activeFilters |= QUERY; activeFilters |= QUERY;
return *this; return *this;
} }
@ -495,7 +578,17 @@ Filter& Filter::name(std::string name)
#define ACTIVE(X) (activeFilters & (X)) #define ACTIVE(X) (activeFilters & (X))
#define FILTER(TAG, TEST) if (ACTIVE(TAG) && !(TEST)) { return false; } #define FILTER(TAG, TEST) if (ACTIVE(TAG) && !(TEST)) { return false; }
// Tells whether a free-text query has been set on this filter, i.e. whether
// the QUERY bit is raised in activeFilters.
bool Filter::hasQuery() const
{
  const bool queryBitIsSet = ACTIVE(QUERY);
  return queryBitIsSet;
}
bool Filter::accept(const Book& book) const bool Filter::accept(const Book& book) const
{
return acceptByNonQueryCriteria(book) && acceptByQueryOnly(book);
}
bool Filter::acceptByNonQueryCriteria(const Book& book) const
{ {
auto local = !book.getPath().empty(); auto local = !book.getPath().empty();
FILTER(_LOCAL, local) FILTER(_LOCAL, local)
@ -538,6 +631,11 @@ bool Filter::accept(const Book& book) const
} }
} }
} }
return true;
}
bool Filter::acceptByQueryOnly(const Book& book) const
{
if ( ACTIVE(QUERY) if ( ACTIVE(QUERY)
&& !(matchRegex(book.getTitle(), "\\Q" + _query + "\\E") && !(matchRegex(book.getTitle(), "\\Q" + _query + "\\E")
|| matchRegex(book.getDescription(), "\\Q" + _query + "\\E"))) || matchRegex(book.getDescription(), "\\Q" + _query + "\\E")))

View File

@ -19,24 +19,24 @@
id="raycharles_uncategorized" id="raycharles_uncategorized"
path="./zimfile.zim" path="./zimfile.zim"
url="https://github.com/kiwix/kiwix-lib/raw/master/test/data/zimfile.zim" url="https://github.com/kiwix/kiwix-lib/raw/master/test/data/zimfile.zim"
title="Ray Charles" title="Ray (uncategorized) Charles"
description="Wikipedia articles about Ray Charles" description="No category is assigned to this library entry."
language="eng" language="eng"
creator="Wikipedia" creator="Wikipedia"
publisher="Kiwix" publisher="Kiwix"
date="2020-03-31" date="2020-03-31"
name="wikipedia_en_ray_charles" name="wikipedia_en_ray_charles"
tags="unittest;wikipedia;_pictures:no;_videos:no;_details:no;_ftindex:yes" tags="unittest;wikipedia;_pictures:no;_videos:no;_details:no"
articleCount="284" articleCount="284"
mediaCount="2" mediaCount="2"
size="556" size="123"
></book> ></book>
<book <book
id="charlesray" id="charlesray"
path="./zimfile.zim" path="./zimfile.zim"
url="https://github.com/kiwix/kiwix-lib/raw/master/test/data/zimfile.zim" url="https://github.com/kiwix/kiwix-lib/raw/master/test/data/zimfile.zim"
title="Charles, Ray" title="Charles, Ray"
description="Wikipedia articles about Charles, Ray" description="Wikipedia articles about Ray Charles"
language="eng" language="eng"
creator="Wikipedia" creator="Wikipedia"
publisher="Kiwix" publisher="Kiwix"

View File

@ -630,7 +630,7 @@ std::string maskVariableOPDSFeedData(std::string s)
" <entry>\n" \ " <entry>\n" \
" <id>urn:uuid:charlesray</id>\n" \ " <id>urn:uuid:charlesray</id>\n" \
" <title>Charles, Ray</title>\n" \ " <title>Charles, Ray</title>\n" \
" <summary>Wikipedia articles about Charles, Ray</summary>\n" \ " <summary>Wikipedia articles about Ray Charles</summary>\n" \
" <language>eng</language>\n" \ " <language>eng</language>\n" \
" <updated>2020-03-31T00:00::00Z</updated>\n" \ " <updated>2020-03-31T00:00::00Z</updated>\n" \
" <name>wikipedia_en_ray_charles</name>\n" \ " <name>wikipedia_en_ray_charles</name>\n" \
@ -677,14 +677,14 @@ std::string maskVariableOPDSFeedData(std::string s)
#define UNCATEGORIZED_RAY_CHARLES_CATALOG_ENTRY \ #define UNCATEGORIZED_RAY_CHARLES_CATALOG_ENTRY \
" <entry>\n" \ " <entry>\n" \
" <id>urn:uuid:raycharles_uncategorized</id>\n" \ " <id>urn:uuid:raycharles_uncategorized</id>\n" \
" <title>Ray Charles</title>\n" \ " <title>Ray (uncategorized) Charles</title>\n" \
" <summary>Wikipedia articles about Ray Charles</summary>\n" \ " <summary>No category is assigned to this library entry.</summary>\n" \
" <language>eng</language>\n" \ " <language>eng</language>\n" \
" <updated>2020-03-31T00:00::00Z</updated>\n" \ " <updated>2020-03-31T00:00::00Z</updated>\n" \
" <name>wikipedia_en_ray_charles</name>\n" \ " <name>wikipedia_en_ray_charles</name>\n" \
" <flavour></flavour>\n" \ " <flavour></flavour>\n" \
" <category></category>\n" \ " <category></category>\n" \
" <tags>unittest;wikipedia;_pictures:no;_videos:no;_details:no;_ftindex:yes</tags>\n" \ " <tags>unittest;wikipedia;_pictures:no;_videos:no;_details:no</tags>\n" \
" <articleCount>284</articleCount>\n" \ " <articleCount>284</articleCount>\n" \
" <mediaCount>2</mediaCount>\n" \ " <mediaCount>2</mediaCount>\n" \
" <icon>/meta?name=favicon&amp;content=zimfile</icon>\n" \ " <icon>/meta?name=favicon&amp;content=zimfile</icon>\n" \
@ -695,7 +695,7 @@ std::string maskVariableOPDSFeedData(std::string s)
" <publisher>\n" \ " <publisher>\n" \
" <name>Kiwix</name>\n" \ " <name>Kiwix</name>\n" \
" </publisher>\n" \ " </publisher>\n" \
" <link rel=\"http://opds-spec.org/acquisition/open-access\" type=\"application/x-zim\" href=\"https://github.com/kiwix/kiwix-lib/raw/master/test/data/zimfile.zim\" length=\"569344\" />\n" \ " <link rel=\"http://opds-spec.org/acquisition/open-access\" type=\"application/x-zim\" href=\"https://github.com/kiwix/kiwix-lib/raw/master/test/data/zimfile.zim\" length=\"125952\" />\n" \
" </entry>\n" " </entry>\n"
TEST_F(LibraryServerTest, catalog_root_xml) TEST_F(LibraryServerTest, catalog_root_xml)
@ -733,7 +733,26 @@ TEST_F(LibraryServerTest, catalog_searchdescription_xml)
); );
} }
TEST_F(LibraryServerTest, catalog_search_by_text) TEST_F(LibraryServerTest, catalog_search_by_phrase)
{
const auto r = zfs1_->GET("/catalog/search?q=\"ray%20charles\"");
EXPECT_EQ(r->status, 200);
EXPECT_EQ(maskVariableOPDSFeedData(r->body),
OPDS_FEED_TAG
" <id>12345678-90ab-cdef-1234-567890abcdef</id>\n"
" <title>Search result for \"ray charles\"</title>\n"
" <updated>YYYY-MM-DDThh:mm:ssZ</updated>\n"
" <totalResults>2</totalResults>\n"
" <startIndex>0</startIndex>\n"
" <itemsPerPage>2</itemsPerPage>\n"
CATALOG_LINK_TAGS
RAY_CHARLES_CATALOG_ENTRY
CHARLES_RAY_CATALOG_ENTRY
"</feed>\n"
);
}
TEST_F(LibraryServerTest, catalog_search_by_words)
{ {
const auto r = zfs1_->GET("/catalog/search?q=ray%20charles"); const auto r = zfs1_->GET("/catalog/search?q=ray%20charles");
EXPECT_EQ(r->status, 200); EXPECT_EQ(r->status, 200);
@ -742,12 +761,69 @@ TEST_F(LibraryServerTest, catalog_search_by_text)
" <id>12345678-90ab-cdef-1234-567890abcdef</id>\n" " <id>12345678-90ab-cdef-1234-567890abcdef</id>\n"
" <title>Search result for ray charles</title>\n" " <title>Search result for ray charles</title>\n"
" <updated>YYYY-MM-DDThh:mm:ssZ</updated>\n" " <updated>YYYY-MM-DDThh:mm:ssZ</updated>\n"
" <totalResults>3</totalResults>\n"
" <startIndex>0</startIndex>\n"
" <itemsPerPage>3</itemsPerPage>\n"
CATALOG_LINK_TAGS
RAY_CHARLES_CATALOG_ENTRY
CHARLES_RAY_CATALOG_ENTRY
UNCATEGORIZED_RAY_CHARLES_CATALOG_ENTRY
"</feed>\n"
);
}
// Exercises the "description:" and "title:" field prefixes of the
// /catalog/search endpoint.
TEST_F(LibraryServerTest, catalog_prefix_search)
{
  // Two description-prefixed words: two catalog entries are expected.
  {
    const auto response = zfs1_->GET("/catalog/search?q=description:ray%20description:charles");
    const std::string expectedFeed =
      OPDS_FEED_TAG
      " <id>12345678-90ab-cdef-1234-567890abcdef</id>\n"
      " <title>Search result for description:ray description:charles</title>\n"
      " <updated>YYYY-MM-DDThh:mm:ssZ</updated>\n"
      " <totalResults>2</totalResults>\n"
      " <startIndex>0</startIndex>\n"
      " <itemsPerPage>2</itemsPerPage>\n"
      CATALOG_LINK_TAGS
      RAY_CHARLES_CATALOG_ENTRY
      CHARLES_RAY_CATALOG_ENTRY
      "</feed>\n";

    EXPECT_EQ(response->status, 200);
    EXPECT_EQ(maskVariableOPDSFeedData(response->body), expectedFeed);
  }

  // A title-prefixed phrase: a single catalog entry is expected.
  {
    const auto response = zfs1_->GET("/catalog/search?q=title:\"ray%20charles\"");
    const std::string expectedFeed =
      OPDS_FEED_TAG
      " <id>12345678-90ab-cdef-1234-567890abcdef</id>\n"
      " <title>Search result for title:\"ray charles\"</title>\n"
      " <updated>YYYY-MM-DDThh:mm:ssZ</updated>\n"
      " <totalResults>1</totalResults>\n"
      " <startIndex>0</startIndex>\n"
      " <itemsPerPage>1</itemsPerPage>\n"
      CATALOG_LINK_TAGS
      RAY_CHARLES_CATALOG_ENTRY
      "</feed>\n";

    EXPECT_EQ(response->status, 200);
    EXPECT_EQ(maskVariableOPDSFeedData(response->body), expectedFeed);
  }
}
TEST_F(LibraryServerTest, catalog_search_with_word_exclusion)
{
const auto r = zfs1_->GET("/catalog/search?q=ray%20-uncategorized");
EXPECT_EQ(r->status, 200);
EXPECT_EQ(maskVariableOPDSFeedData(r->body),
OPDS_FEED_TAG
" <id>12345678-90ab-cdef-1234-567890abcdef</id>\n"
" <title>Search result for ray -uncategorized</title>\n"
" <updated>YYYY-MM-DDThh:mm:ssZ</updated>\n"
" <totalResults>2</totalResults>\n" " <totalResults>2</totalResults>\n"
" <startIndex>0</startIndex>\n" " <startIndex>0</startIndex>\n"
" <itemsPerPage>2</itemsPerPage>\n" " <itemsPerPage>2</itemsPerPage>\n"
CATALOG_LINK_TAGS CATALOG_LINK_TAGS
RAY_CHARLES_CATALOG_ENTRY RAY_CHARLES_CATALOG_ENTRY
UNCATEGORIZED_RAY_CHARLES_CATALOG_ENTRY CHARLES_RAY_CATALOG_ENTRY
"</feed>\n" "</feed>\n"
); );
} }