Merge pull request #460 from kiwix/xapian_based_catalog_search

This commit is contained in:
Matthieu Gautier 2021-03-17 14:45:56 +01:00 committed by GitHub
commit baed447dd3
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
5 changed files with 225 additions and 33 deletions

View File

@ -28,6 +28,7 @@
#include "book.h"
#include "bookmark.h"
#include "common.h"
#include <xapian.h>
#define KIWIX_LIBRARY_VERSION "20110515"
@ -58,6 +59,7 @@ class Filter {
std::string _creator;
size_t _maxSize;
std::string _query;
bool _queryIsPartial;
std::string _name;
public:
@ -102,10 +104,16 @@ class Filter {
Filter& publisher(std::string publisher);
Filter& creator(std::string creator);
Filter& maxSize(size_t size);
Filter& query(std::string query);
Filter& query(std::string query, bool partial=true);
Filter& name(std::string name);
bool hasQuery() const;
const std::string& getQuery() const { return _query; }
bool queryIsPartial() const { return _queryIsPartial; }
bool accept(const Book& book) const;
bool acceptByQueryOnly(const Book& book) const;
bool acceptByNonQueryCriteria(const Book& book) const;
};
@ -117,6 +125,10 @@ class Library
std::map<std::string, kiwix::Book> m_books;
std::map<std::string, std::shared_ptr<Reader>> m_readers;
std::vector<kiwix::Bookmark> m_bookmarks;
Xapian::WritableDatabase m_bookDB;
public:
typedef std::vector<std::string> BookIdCollection;
public:
Library();
@ -220,7 +232,7 @@ class Library
*
* @return A list of book ids.
*/
std::vector<std::string> getBooksIds();
BookIdCollection getBooksIds();
/**
* Filter the library and generate a new one with the kept elements.
@ -230,7 +242,7 @@ class Library
* @param search List only books with search in the title or description.
* @return The list of bookIds corresponding to the query.
*/
DEPRECATED std::vector<std::string> filter(const std::string& search);
DEPRECATED BookIdCollection filter(const std::string& search);
/**
@ -239,7 +251,7 @@ class Library
* @param filter The filter to use.
* @return The list of bookIds corresponding to the filter.
*/
std::vector<std::string> filter(const Filter& filter);
BookIdCollection filter(const Filter& filter);
/**
@ -249,7 +261,7 @@ class Library
* @param comparator how to sort the books
* @return The sorted list of books
*/
void sort(std::vector<std::string>& bookIds, supportedListSortBy sortBy, bool ascending);
void sort(BookIdCollection& bookIds, supportedListSortBy sortBy, bool ascending);
/**
* List books in the library.
@ -273,7 +285,7 @@ class Library
* Set to 0 to cancel this filter.
* @return The list of bookIds corresponding to the query.
*/
DEPRECATED std::vector<std::string> listBooksIds(
DEPRECATED BookIdCollection listBooksIds(
int supportedListMode = ALL,
supportedListSortBy sortBy = UNSORTED,
const std::string& search = "",
@ -285,7 +297,12 @@ class Library
friend class OPDSDumper;
friend class libXMLDumper;
private: // functions
BookIdCollection getBooksByTitleOrDescription(const Filter& filter);
void updateBookDB(const Book& book);
};
}
#endif

View File

@ -34,6 +34,7 @@ pugixml_dep = dependency('pugixml', static:static_deps)
libcurl_dep = dependency('libcurl', static:static_deps)
microhttpd_dep = dependency('libmicrohttpd', static:static_deps)
zlib_dep = dependency('zlib', static:static_deps)
xapian_dep = dependency('xapian-core', static:static_deps)
if compiler.has_header('mustache.hpp')
extra_include = []
@ -55,7 +56,7 @@ if target_machine.system() == 'windows' and static_deps
extra_cflags += '-DCURL_STATICLIB'
endif
all_deps = [thread_dep, libicu_dep, libzim_dep, pugixml_dep, libcurl_dep, microhttpd_dep, zlib_dep]
all_deps = [thread_dep, libicu_dep, libzim_dep, pugixml_dep, libcurl_dep, microhttpd_dep, zlib_dep, xapian_dep]
inc = include_directories('include', extra_include)
@ -74,7 +75,7 @@ subdir('static')
subdir('src')
subdir('test')
pkg_requires = ['libzim', 'icu-i18n', 'pugixml', 'libcurl', 'libmicrohttpd']
pkg_requires = ['libzim', 'icu-i18n', 'pugixml', 'libcurl', 'libmicrohttpd', 'xapian-core']
pkg_conf = configuration_data()
pkg_conf.set('prefix', get_option('prefix'))

View File

@ -30,14 +30,31 @@
#include <pugixml.hpp>
#include <algorithm>
#include <set>
#include <unicode/locid.h>
namespace kiwix
{
namespace
{
std::string iso639_3ToXapian(const std::string& lang) {
return icu::Locale(lang.c_str()).getLanguage();
};
// Normalizes text before it is indexed by delegating to removeAccents().
// The language argument is accepted but currently not consulted.
std::string normalizeText(const std::string& text, const std::string& /*language*/)
{
  return removeAccents(text);
}
} // unnamed namespace
/* Constructor: creates the book search database (m_bookDB) with an empty
 * path and the Xapian in-memory backend flag (DB_BACKEND_INMEMORY). */
Library::Library()
: m_bookDB("", Xapian::DB_BACKEND_INMEMORY)
{
}
/* Destructor */
Library::~Library()
{
@ -47,6 +64,7 @@ Library::~Library()
bool Library::addBook(const Book& book)
{
/* Try to find it */
updateBookDB(book);
try {
auto& oldbook = m_books.at(book.getId());
oldbook.update(book);
@ -211,9 +229,9 @@ const std::vector<kiwix::Bookmark> Library::getBookmarks(bool onlyValidBookmarks
return validBookmarks;
}
std::vector<std::string> Library::getBooksIds()
Library::BookIdCollection Library::getBooksIds()
{
std::vector<std::string> bookIds;
BookIdCollection bookIds;
for (auto& pair: m_books) {
bookIds.push_back(pair.first);
@ -222,7 +240,7 @@ std::vector<std::string> Library::getBooksIds()
return bookIds;
}
std::vector<std::string> Library::filter(const std::string& search)
Library::BookIdCollection Library::filter(const std::string& search)
{
if (search.empty()) {
return getBooksIds();
@ -232,16 +250,80 @@ std::vector<std::string> Library::filter(const std::string& search)
}
std::vector<std::string> Library::filter(const Filter& filter)
void Library::updateBookDB(const Book& book)
{
std::vector<std::string> bookIds;
for(auto& pair:m_books) {
auto book = pair.second;
if(filter.accept(book)) {
bookIds.push_back(pair.first);
Xapian::Stem stemmer;
Xapian::TermGenerator indexer;
const std::string lang = book.getLanguage();
try {
stemmer = Xapian::Stem(iso639_3ToXapian(lang));
indexer.set_stemmer(stemmer);
indexer.set_stemming_strategy(Xapian::TermGenerator::STEM_SOME);
} catch (...) {}
Xapian::Document doc;
indexer.set_document(doc);
const std::string title = normalizeText(book.getTitle(), lang);
const std::string desc = normalizeText(book.getDescription(), lang);
doc.add_value(0, title);
doc.add_value(1, desc);
doc.set_data(book.getId());
indexer.index_text(title, 1, "S");
indexer.index_text(desc, 1, "XD");
// Index fields without prefixes for general search
indexer.index_text(title);
indexer.increase_termpos();
indexer.index_text(desc);
const std::string idterm = "Q" + book.getId();
doc.add_boolean_term(idterm);
m_bookDB.replace_document(idterm, doc);
}
/**
 * Select books through the text query of a filter.
 *
 * When the filter carries no query, the ids of all books are returned.
 * Otherwise the query is parsed (terms are AND-ed by default; the "title:"
 * and "description:" prefixes map to the "S" and "XD" term prefixes) and run
 * against the book database. At most m_books.size() matches are requested.
 *
 * @param filter The filter providing the query and its "partial" setting.
 * @return The ids stored as document data of the matching documents.
 */
Library::BookIdCollection Library::getBooksByTitleOrDescription(const Filter& filter)
{
  if ( !filter.hasQuery() ) {
    return getBooksIds();
  }

  Xapian::QueryParser parser;
  parser.set_default_op(Xapian::Query::OP_AND);
  parser.add_prefix("title", "S");
  parser.add_prefix("description", "XD");

  // Language assumed for the query is not known for sure so stemming
  // is not applied
  //parser.set_stemmer(Xapian::Stem(iso639_3ToXapian(???)));
  //parser.set_stemming_strategy(Xapian::QueryParser::STEM_SOME);

  unsigned parserFlags = Xapian::QueryParser::FLAG_PHRASE
                       | Xapian::QueryParser::FLAG_BOOLEAN
                       | Xapian::QueryParser::FLAG_LOVEHATE
                       | Xapian::QueryParser::FLAG_WILDCARD;
  if ( filter.queryIsPartial() ) {
    parserFlags |= Xapian::QueryParser::FLAG_PARTIAL;
  }

  const Xapian::Query parsedQuery = parser.parse_query(filter.getQuery(), parserFlags);

  Xapian::Enquire enquire(m_bookDB);
  enquire.set_query(parsedQuery);
  const Xapian::MSet matches = enquire.get_mset(0, m_books.size());

  BookIdCollection foundIds;
  for ( Xapian::MSetIterator match = matches.begin(); match != matches.end(); ++match ) {
    // The document data holds the id of the indexed book.
    foundIds.push_back(match.get_document().get_data());
  }
  return foundIds;
}
/**
 * Filter the library.
 *
 * The text query of the filter (if any) is resolved first through
 * getBooksByTitleOrDescription(); the remaining criteria are then checked
 * on each candidate book.
 *
 * @param filter The filter to use.
 * @return The list of bookIds corresponding to the filter.
 */
Library::BookIdCollection Library::filter(const Filter& filter)
{
  BookIdCollection result;
  for (const auto& id : getBooksByTitleOrDescription(filter)) {
    // Look the book up with find(): operator[] would silently insert an
    // empty Book into m_books for an id that is not (or no longer) there.
    const auto bookIt = m_books.find(id);
    if (bookIt != m_books.end()
        && filter.acceptByNonQueryCriteria(bookIt->second)) {
      result.push_back(id);
    }
  }
  return result;
}
template<supportedListSortBy SORT>
@ -303,7 +385,7 @@ std::string Comparator<PUBLISHER>::get_key(const std::string& id)
return lib->getBookById(id).getPublisher();
}
void Library::sort(std::vector<std::string>& bookIds, supportedListSortBy sort, bool ascending)
void Library::sort(BookIdCollection& bookIds, supportedListSortBy sort, bool ascending)
{
switch(sort) {
case TITLE:
@ -327,7 +409,7 @@ void Library::sort(std::vector<std::string>& bookIds, supportedListSortBy sort,
}
std::vector<std::string> Library::listBooksIds(
Library::BookIdCollection Library::listBooksIds(
int mode,
supportedListSortBy sortBy,
const std::string& search,
@ -479,9 +561,10 @@ Filter& Filter::maxSize(size_t maxSize)
return *this;
}
Filter& Filter::query(std::string query)
/* Sets the text query of the filter and activates the QUERY criterion.
 * `partial` is stored as-is and is what queryIsPartial() reports.
 * Returns the filter itself so that setters can be chained. */
Filter& Filter::query(std::string query, bool partial)
{
  activeFilters |= QUERY;
  _queryIsPartial = partial;
  _query = query;
  return *this;
}
@ -495,7 +578,17 @@ Filter& Filter::name(std::string name)
#define ACTIVE(X) (activeFilters & (X))
#define FILTER(TAG, TEST) if (ACTIVE(TAG) && !(TEST)) { return false; }
/* Tells whether the QUERY criterion is active on this filter. */
bool Filter::hasQuery() const
{
  const bool queryActive = ACTIVE(QUERY);
  return queryActive;
}
/* A book is accepted when it passes the non-query criteria and then the
 * query check; the query check is not evaluated if the first one fails. */
bool Filter::accept(const Book& book) const
{
  if (!acceptByNonQueryCriteria(book)) {
    return false;
  }
  return acceptByQueryOnly(book);
}
bool Filter::acceptByNonQueryCriteria(const Book& book) const
{
auto local = !book.getPath().empty();
FILTER(_LOCAL, local)
@ -538,6 +631,11 @@ bool Filter::accept(const Book& book) const
}
}
}
return true;
}
bool Filter::acceptByQueryOnly(const Book& book) const
{
if ( ACTIVE(QUERY)
&& !(matchRegex(book.getTitle(), "\\Q" + _query + "\\E")
|| matchRegex(book.getDescription(), "\\Q" + _query + "\\E")))

View File

@ -19,24 +19,24 @@
id="raycharles_uncategorized"
path="./zimfile.zim"
url="https://github.com/kiwix/kiwix-lib/raw/master/test/data/zimfile.zim"
title="Ray Charles"
description="Wikipedia articles about Ray Charles"
title="Ray (uncategorized) Charles"
description="No category is assigned to this library entry."
language="eng"
creator="Wikipedia"
publisher="Kiwix"
date="2020-03-31"
name="wikipedia_en_ray_charles"
tags="unittest;wikipedia;_pictures:no;_videos:no;_details:no;_ftindex:yes"
tags="unittest;wikipedia;_pictures:no;_videos:no;_details:no"
articleCount="284"
mediaCount="2"
size="556"
size="123"
></book>
<book
id="charlesray"
path="./zimfile.zim"
url="https://github.com/kiwix/kiwix-lib/raw/master/test/data/zimfile.zim"
title="Charles, Ray"
description="Wikipedia articles about Charles, Ray"
description="Wikipedia articles about Ray Charles"
language="eng"
creator="Wikipedia"
publisher="Kiwix"

View File

@ -630,7 +630,7 @@ std::string maskVariableOPDSFeedData(std::string s)
" <entry>\n" \
" <id>urn:uuid:charlesray</id>\n" \
" <title>Charles, Ray</title>\n" \
" <summary>Wikipedia articles about Charles, Ray</summary>\n" \
" <summary>Wikipedia articles about Ray Charles</summary>\n" \
" <language>eng</language>\n" \
" <updated>2020-03-31T00:00::00Z</updated>\n" \
" <name>wikipedia_en_ray_charles</name>\n" \
@ -677,14 +677,14 @@ std::string maskVariableOPDSFeedData(std::string s)
#define UNCATEGORIZED_RAY_CHARLES_CATALOG_ENTRY \
" <entry>\n" \
" <id>urn:uuid:raycharles_uncategorized</id>\n" \
" <title>Ray Charles</title>\n" \
" <summary>Wikipedia articles about Ray Charles</summary>\n" \
" <title>Ray (uncategorized) Charles</title>\n" \
" <summary>No category is assigned to this library entry.</summary>\n" \
" <language>eng</language>\n" \
" <updated>2020-03-31T00:00::00Z</updated>\n" \
" <name>wikipedia_en_ray_charles</name>\n" \
" <flavour></flavour>\n" \
" <category></category>\n" \
" <tags>unittest;wikipedia;_pictures:no;_videos:no;_details:no;_ftindex:yes</tags>\n" \
" <tags>unittest;wikipedia;_pictures:no;_videos:no;_details:no</tags>\n" \
" <articleCount>284</articleCount>\n" \
" <mediaCount>2</mediaCount>\n" \
" <icon>/meta?name=favicon&amp;content=zimfile</icon>\n" \
@ -695,7 +695,7 @@ std::string maskVariableOPDSFeedData(std::string s)
" <publisher>\n" \
" <name>Kiwix</name>\n" \
" </publisher>\n" \
" <link rel=\"http://opds-spec.org/acquisition/open-access\" type=\"application/x-zim\" href=\"https://github.com/kiwix/kiwix-lib/raw/master/test/data/zimfile.zim\" length=\"569344\" />\n" \
" <link rel=\"http://opds-spec.org/acquisition/open-access\" type=\"application/x-zim\" href=\"https://github.com/kiwix/kiwix-lib/raw/master/test/data/zimfile.zim\" length=\"125952\" />\n" \
" </entry>\n"
TEST_F(LibraryServerTest, catalog_root_xml)
@ -733,7 +733,26 @@ TEST_F(LibraryServerTest, catalog_searchdescription_xml)
);
}
TEST_F(LibraryServerTest, catalog_search_by_text)
// A quoted query is searched as a phrase: the expected feed holds the two
// entries listed below.
TEST_F(LibraryServerTest, catalog_search_by_phrase)
{
  const std::string expectedFeed =
    OPDS_FEED_TAG
    " <id>12345678-90ab-cdef-1234-567890abcdef</id>\n"
    " <title>Search result for \"ray charles\"</title>\n"
    " <updated>YYYY-MM-DDThh:mm:ssZ</updated>\n"
    " <totalResults>2</totalResults>\n"
    " <startIndex>0</startIndex>\n"
    " <itemsPerPage>2</itemsPerPage>\n"
    CATALOG_LINK_TAGS
    RAY_CHARLES_CATALOG_ENTRY
    CHARLES_RAY_CATALOG_ENTRY
    "</feed>\n";

  const auto response = zfs1_->GET("/catalog/search?q=\"ray%20charles\"");
  EXPECT_EQ(response->status, 200);
  EXPECT_EQ(maskVariableOPDSFeedData(response->body), expectedFeed);
}
TEST_F(LibraryServerTest, catalog_search_by_words)
{
const auto r = zfs1_->GET("/catalog/search?q=ray%20charles");
EXPECT_EQ(r->status, 200);
@ -742,12 +761,69 @@ TEST_F(LibraryServerTest, catalog_search_by_text)
" <id>12345678-90ab-cdef-1234-567890abcdef</id>\n"
" <title>Search result for ray charles</title>\n"
" <updated>YYYY-MM-DDThh:mm:ssZ</updated>\n"
" <totalResults>3</totalResults>\n"
" <startIndex>0</startIndex>\n"
" <itemsPerPage>3</itemsPerPage>\n"
CATALOG_LINK_TAGS
RAY_CHARLES_CATALOG_ENTRY
CHARLES_RAY_CATALOG_ENTRY
UNCATEGORIZED_RAY_CHARLES_CATALOG_ENTRY
"</feed>\n"
);
}
// Checks the "description:" and "title:" query prefixes of the catalog search.
TEST_F(LibraryServerTest, catalog_prefix_search)
{
  // Both words are required in the description.
  {
    const std::string expectedFeed =
      OPDS_FEED_TAG
      " <id>12345678-90ab-cdef-1234-567890abcdef</id>\n"
      " <title>Search result for description:ray description:charles</title>\n"
      " <updated>YYYY-MM-DDThh:mm:ssZ</updated>\n"
      " <totalResults>2</totalResults>\n"
      " <startIndex>0</startIndex>\n"
      " <itemsPerPage>2</itemsPerPage>\n"
      CATALOG_LINK_TAGS
      RAY_CHARLES_CATALOG_ENTRY
      CHARLES_RAY_CATALOG_ENTRY
      "</feed>\n";

    const auto response = zfs1_->GET("/catalog/search?q=description:ray%20description:charles");
    EXPECT_EQ(response->status, 200);
    EXPECT_EQ(maskVariableOPDSFeedData(response->body), expectedFeed);
  }

  // Phrase search restricted to the title.
  {
    const std::string expectedFeed =
      OPDS_FEED_TAG
      " <id>12345678-90ab-cdef-1234-567890abcdef</id>\n"
      " <title>Search result for title:\"ray charles\"</title>\n"
      " <updated>YYYY-MM-DDThh:mm:ssZ</updated>\n"
      " <totalResults>1</totalResults>\n"
      " <startIndex>0</startIndex>\n"
      " <itemsPerPage>1</itemsPerPage>\n"
      CATALOG_LINK_TAGS
      RAY_CHARLES_CATALOG_ENTRY
      "</feed>\n";

    const auto response = zfs1_->GET("/catalog/search?q=title:\"ray%20charles\"");
    EXPECT_EQ(response->status, 200);
    EXPECT_EQ(maskVariableOPDSFeedData(response->body), expectedFeed);
  }
}
// A word prefixed with '-' must be excluded from the results: the entry whose
// title contains "uncategorized" is not expected in the feed, which leaves the
// two entries announced by totalResults.
TEST_F(LibraryServerTest, catalog_search_with_word_exclusion)
{
  const auto r = zfs1_->GET("/catalog/search?q=ray%20-uncategorized");
  EXPECT_EQ(r->status, 200);
  EXPECT_EQ(maskVariableOPDSFeedData(r->body),
    OPDS_FEED_TAG
    " <id>12345678-90ab-cdef-1234-567890abcdef</id>\n"
    " <title>Search result for ray -uncategorized</title>\n"
    " <updated>YYYY-MM-DDThh:mm:ssZ</updated>\n"
    " <totalResults>2</totalResults>\n"
    " <startIndex>0</startIndex>\n"
    " <itemsPerPage>2</itemsPerPage>\n"
    CATALOG_LINK_TAGS
    RAY_CHARLES_CATALOG_ENTRY
    CHARLES_RAY_CATALOG_ENTRY
    "</feed>\n"
  );
}