diff --git a/include/library.h b/include/library.h index 86dfafaee..97e5f236e 100644 --- a/include/library.h +++ b/include/library.h @@ -28,6 +28,7 @@ #include "book.h" #include "bookmark.h" #include "common.h" +#include #define KIWIX_LIBRARY_VERSION "20110515" @@ -58,6 +59,7 @@ class Filter { std::string _creator; size_t _maxSize; std::string _query; + bool _queryIsPartial; std::string _name; public: @@ -102,10 +104,16 @@ class Filter { Filter& publisher(std::string publisher); Filter& creator(std::string creator); Filter& maxSize(size_t size); - Filter& query(std::string query); + Filter& query(std::string query, bool partial=true); Filter& name(std::string name); + bool hasQuery() const; + const std::string& getQuery() const { return _query; } + bool queryIsPartial() const { return _queryIsPartial; } + bool accept(const Book& book) const; + bool acceptByQueryOnly(const Book& book) const; + bool acceptByNonQueryCriteria(const Book& book) const; }; @@ -117,6 +125,10 @@ class Library std::map m_books; std::map> m_readers; std::vector m_bookmarks; + Xapian::WritableDatabase m_bookDB; + + public: + typedef std::vector BookIdCollection; public: Library(); @@ -220,7 +232,7 @@ class Library * * @return A list of book ids. */ - std::vector getBooksIds(); + BookIdCollection getBooksIds(); /** * Filter the library and generate a new one with the keep elements. @@ -230,7 +242,7 @@ class Library * @param search List only books with search in the title or description. * @return The list of bookIds corresponding to the query. */ - DEPRECATED std::vector filter(const std::string& search); + DEPRECATED BookIdCollection filter(const std::string& search); /** @@ -239,7 +251,7 @@ class Library * @param filter The filter to use. * @return The list of bookIds corresponding to the filter. */ - std::vector filter(const Filter& filter); + BookIdCollection filter(const Filter& filter); /** @@ -249,7 +261,7 @@ class Library * @param comparator how to sort the books * @return The sorted list of books */ - void sort(std::vector& bookIds, supportedListSortBy sortBy, bool ascending); + void sort(BookIdCollection& bookIds, supportedListSortBy sortBy, bool ascending); /** * List books in the library. @@ -273,7 +285,7 @@ class Library * Set to 0 to cancel this filter. * @return The list of bookIds corresponding to the query. */ - DEPRECATED std::vector listBooksIds( + DEPRECATED BookIdCollection listBooksIds( int supportedListMode = ALL, supportedListSortBy sortBy = UNSORTED, const std::string& search = "", @@ -285,7 +297,12 @@ class Library friend class OPDSDumper; friend class libXMLDumper; + +private: // functions + BookIdCollection getBooksByTitleOrDescription(const Filter& filter); + void updateBookDB(const Book& book); }; + } #endif diff --git a/meson.build b/meson.build index 48e5a7d3b..7af35e225 100644 --- a/meson.build +++ b/meson.build @@ -34,6 +34,7 @@ pugixml_dep = dependency('pugixml', static:static_deps) libcurl_dep = dependency('libcurl', static:static_deps) microhttpd_dep = dependency('libmicrohttpd', static:static_deps) zlib_dep = dependency('zlib', static:static_deps) +xapian_dep = dependency('xapian-core', static:static_deps) if compiler.has_header('mustache.hpp') extra_include = [] @@ -55,7 +56,7 @@ if target_machine.system() == 'windows' and static_deps extra_cflags += '-DCURL_STATICLIB' endif -all_deps = [thread_dep, libicu_dep, libzim_dep, pugixml_dep, libcurl_dep, microhttpd_dep, zlib_dep] +all_deps = [thread_dep, libicu_dep, libzim_dep, pugixml_dep, libcurl_dep, microhttpd_dep, zlib_dep, xapian_dep] inc = include_directories('include', extra_include) @@ -74,7 +75,7 @@ subdir('static') subdir('src') subdir('test') -pkg_requires = ['libzim', 'icu-i18n', 'pugixml', 'libcurl', 'libmicrohttpd'] +pkg_requires = ['libzim', 'icu-i18n', 'pugixml', 'libcurl', 'libmicrohttpd', 'xapian-core'] pkg_conf = configuration_data() pkg_conf.set('prefix', get_option('prefix')) diff --git a/src/library.cpp b/src/library.cpp index c9855bd94..cdfc9a246 100644 --- a/src/library.cpp +++ b/src/library.cpp @@ -30,14 +30,31 @@ #include #include #include +#include namespace kiwix { +namespace +{ + +std::string iso639_3ToXapian(const std::string& lang) { + return icu::Locale(lang.c_str()).getLanguage(); +}; + +std::string normalizeText(const std::string& text, const std::string& language) +{ + return removeAccents(text); +} + +} // unnamed namespace + /* Constructor */ Library::Library() + : m_bookDB("", Xapian::DB_BACKEND_INMEMORY) { } + /* Destructor */ Library::~Library() { @@ -47,6 +64,7 @@ Library::~Library() bool Library::addBook(const Book& book) { /* Try to find it */ + updateBookDB(book); try { auto& oldbook = m_books.at(book.getId()); oldbook.update(book); @@ -211,9 +229,9 @@ const std::vector Library::getBookmarks(bool onlyValidBookmarks return validBookmarks; } -std::vector Library::getBooksIds() +Library::BookIdCollection Library::getBooksIds() { - std::vector bookIds; + BookIdCollection bookIds; for (auto& pair: m_books) { bookIds.push_back(pair.first); @@ -222,7 +240,7 @@ std::vector Library::getBooksIds() return bookIds; } -std::vector Library::filter(const std::string& search) +Library::BookIdCollection Library::filter(const std::string& search) { if (search.empty()) { return getBooksIds(); @@ -232,16 +250,80 @@ std::vector Library::filter(const std::string& search) } -std::vector Library::filter(const Filter& filter) +void Library::updateBookDB(const Book& book) { - std::vector bookIds; - for(auto& pair:m_books) { - auto book = pair.second; - if(filter.accept(book)) { - bookIds.push_back(pair.first); + Xapian::Stem stemmer; + Xapian::TermGenerator indexer; + const std::string lang = book.getLanguage(); + try { + stemmer = Xapian::Stem(iso639_3ToXapian(lang)); + indexer.set_stemmer(stemmer); + indexer.set_stemming_strategy(Xapian::TermGenerator::STEM_SOME); + } catch (...) {} + Xapian::Document doc; + indexer.set_document(doc); + + const std::string title = normalizeText(book.getTitle(), lang); + const std::string desc = normalizeText(book.getDescription(), lang); + doc.add_value(0, title); + doc.add_value(1, desc); + doc.set_data(book.getId()); + + indexer.index_text(title, 1, "S"); + indexer.index_text(desc, 1, "XD"); + + // Index fields without prefixes for general search + indexer.index_text(title); + indexer.increase_termpos(); + indexer.index_text(desc); + + const std::string idterm = "Q" + book.getId(); + doc.add_boolean_term(idterm); + m_bookDB.replace_document(idterm, doc); +} + +Library::BookIdCollection Library::getBooksByTitleOrDescription(const Filter& filter) +{ + if ( !filter.hasQuery() ) + return getBooksIds(); + + BookIdCollection bookIds; + Xapian::QueryParser queryParser; + queryParser.set_default_op(Xapian::Query::OP_AND); + queryParser.add_prefix("title", "S"); + queryParser.add_prefix("description", "XD"); + const auto partialQueryFlag = filter.queryIsPartial() + ? Xapian::QueryParser::FLAG_PARTIAL + : 0; + // Language assumed for the query is not known for sure so stemming + // is not applied + //queryParser.set_stemmer(Xapian::Stem(iso639_3ToXapian(???))); + //queryParser.set_stemming_strategy(Xapian::QueryParser::STEM_SOME); + const auto flags = Xapian::QueryParser::FLAG_PHRASE + | Xapian::QueryParser::FLAG_BOOLEAN + | Xapian::QueryParser::FLAG_LOVEHATE + | Xapian::QueryParser::FLAG_WILDCARD + | partialQueryFlag; + const auto query = queryParser.parse_query(filter.getQuery(), flags); + Xapian::Enquire enquire(m_bookDB); + enquire.set_query(query); + const auto results = enquire.get_mset(0, m_books.size()); + for ( auto it = results.begin(); it != results.end(); ++it ) { + bookIds.push_back(it.get_document().get_data()); + } + + return bookIds; +} + +Library::BookIdCollection Library::filter(const Filter& filter) +{ + BookIdCollection result; + for(auto id : getBooksByTitleOrDescription(filter)) { + if(filter.acceptByNonQueryCriteria(m_books[id])) { + result.push_back(id); } } - return bookIds; + return result; } template @@ -303,7 +385,7 @@ std::string Comparator::get_key(const std::string& id) return lib->getBookById(id).getPublisher(); } -void Library::sort(std::vector& bookIds, supportedListSortBy sort, bool ascending) +void Library::sort(BookIdCollection& bookIds, supportedListSortBy sort, bool ascending) { switch(sort) { case TITLE: @@ -327,7 +409,7 @@ void Library::sort(std::vector& bookIds, supportedListSortBy sort, } -std::vector Library::listBooksIds( +Library::BookIdCollection Library::listBooksIds( int mode, supportedListSortBy sortBy, const std::string& search, @@ -479,9 +561,10 @@ Filter& Filter::maxSize(size_t maxSize) return *this; } -Filter& Filter::query(std::string query) +Filter& Filter::query(std::string query, bool partial) { _query = query; + _queryIsPartial = partial; activeFilters |= QUERY; return *this; } @@ -495,7 +578,17 @@ Filter& Filter::name(std::string name) #define ACTIVE(X) (activeFilters & (X)) #define FILTER(TAG, TEST) if (ACTIVE(TAG) && !(TEST)) { return false; } +bool Filter::hasQuery() const +{ + return ACTIVE(QUERY); +} + bool Filter::accept(const Book& book) const +{ + return acceptByNonQueryCriteria(book) && acceptByQueryOnly(book); +} + +bool Filter::acceptByNonQueryCriteria(const Book& book) const { auto local = !book.getPath().empty(); FILTER(_LOCAL, local) @@ -538,6 +631,11 @@ bool Filter::accept(const Book& book) const } } } + return true; +} + +bool Filter::acceptByQueryOnly(const Book& book) const +{ if ( ACTIVE(QUERY) && !(matchRegex(book.getTitle(), "\\Q" + _query + "\\E") || matchRegex(book.getDescription(), "\\Q" + _query + "\\E"))) diff --git a/test/data/library.xml b/test/data/library.xml index 130f3aa48..1ab2193b5 100644 --- a/test/data/library.xml +++ b/test/data/library.xml @@ -19,24 +19,24 @@ id="raycharles_uncategorized" path="./zimfile.zim" url="https://github.com/kiwix/kiwix-lib/raw/master/test/data/zimfile.zim" - title="Ray Charles" - description="Wikipedia articles about Ray Charles" + title="Ray (uncategorized) Charles" + description="No category is assigned to this library entry." language="eng" creator="Wikipedia" publisher="Kiwix" date="2020-03-31" name="wikipedia_en_ray_charles" - tags="unittest;wikipedia;_pictures:no;_videos:no;_details:no;_ftindex:yes" + tags="unittest;wikipedia;_pictures:no;_videos:no;_details:no" articleCount="284" mediaCount="2" - size="556" + size="123" > \n" \ " urn:uuid:charlesray\n" \ " Charles, Ray\n" \ - " Wikipedia articles about Charles, Ray\n" \ + " Wikipedia articles about Ray Charles\n" \ " eng\n" \ " 2020-03-31T00:00::00Z\n" \ " wikipedia_en_ray_charles\n" \ @@ -677,14 +677,14 @@ std::string maskVariableOPDSFeedData(std::string s) #define UNCATEGORIZED_RAY_CHARLES_CATALOG_ENTRY \ " \n" \ " urn:uuid:raycharles_uncategorized\n" \ - " Ray Charles\n" \ - " Wikipedia articles about Ray Charles\n" \ + " Ray (uncategorized) Charles\n" \ + " No category is assigned to this library entry.\n" \ " eng\n" \ " 2020-03-31T00:00::00Z\n" \ " wikipedia_en_ray_charles\n" \ " \n" \ " \n" \ - " unittest;wikipedia;_pictures:no;_videos:no;_details:no;_ftindex:yes\n" \ + " unittest;wikipedia;_pictures:no;_videos:no;_details:no\n" \ " 284\n" \ " 2\n" \ " /meta?name=favicon&content=zimfile\n" \ @@ -695,7 +695,7 @@ std::string maskVariableOPDSFeedData(std::string s) " \n" \ " Kiwix\n" \ " \n" \ - " \n" \ + " \n" \ " \n" TEST_F(LibraryServerTest, catalog_root_xml) @@ -733,7 +733,26 @@ TEST_F(LibraryServerTest, catalog_searchdescription_xml) ); } -TEST_F(LibraryServerTest, catalog_search_by_text) +TEST_F(LibraryServerTest, catalog_search_by_phrase) +{ + const auto r = zfs1_->GET("/catalog/search?q=\"ray%20charles\""); + EXPECT_EQ(r->status, 200); + EXPECT_EQ(maskVariableOPDSFeedData(r->body), + OPDS_FEED_TAG + " 12345678-90ab-cdef-1234-567890abcdef\n" + " Search result for \"ray charles\"\n" + " YYYY-MM-DDThh:mm:ssZ\n" + " 2\n" + " 0\n" + " 2\n" + CATALOG_LINK_TAGS + RAY_CHARLES_CATALOG_ENTRY + CHARLES_RAY_CATALOG_ENTRY + "\n" + ); +} + +TEST_F(LibraryServerTest, catalog_search_by_words) { const auto r = zfs1_->GET("/catalog/search?q=ray%20charles"); EXPECT_EQ(r->status, 200); @@ -742,12 +761,69 @@ TEST_F(LibraryServerTest, catalog_search_by_text) " 12345678-90ab-cdef-1234-567890abcdef\n" " Search result for ray charles\n" " YYYY-MM-DDThh:mm:ssZ\n" + " 3\n" + " 0\n" + " 3\n" + CATALOG_LINK_TAGS + RAY_CHARLES_CATALOG_ENTRY + CHARLES_RAY_CATALOG_ENTRY + UNCATEGORIZED_RAY_CHARLES_CATALOG_ENTRY + "\n" + ); +} + +TEST_F(LibraryServerTest, catalog_prefix_search) +{ + { + const auto r = zfs1_->GET("/catalog/search?q=description:ray%20description:charles"); + EXPECT_EQ(r->status, 200); + EXPECT_EQ(maskVariableOPDSFeedData(r->body), + OPDS_FEED_TAG + " 12345678-90ab-cdef-1234-567890abcdef\n" + " Search result for description:ray description:charles\n" + " YYYY-MM-DDThh:mm:ssZ\n" + " 2\n" + " 0\n" + " 2\n" + CATALOG_LINK_TAGS + RAY_CHARLES_CATALOG_ENTRY + CHARLES_RAY_CATALOG_ENTRY + "\n" + ); + } + { + const auto r = zfs1_->GET("/catalog/search?q=title:\"ray%20charles\""); + EXPECT_EQ(r->status, 200); + EXPECT_EQ(maskVariableOPDSFeedData(r->body), + OPDS_FEED_TAG + " 12345678-90ab-cdef-1234-567890abcdef\n" + " Search result for title:\"ray charles\"\n" + " YYYY-MM-DDThh:mm:ssZ\n" + " 1\n" + " 0\n" + " 1\n" + CATALOG_LINK_TAGS + RAY_CHARLES_CATALOG_ENTRY + "\n" + ); + } +} + +TEST_F(LibraryServerTest, catalog_search_with_word_exclusion) +{ + const auto r = zfs1_->GET("/catalog/search?q=ray%20-uncategorized"); + EXPECT_EQ(r->status, 200); + EXPECT_EQ(maskVariableOPDSFeedData(r->body), + OPDS_FEED_TAG + " 12345678-90ab-cdef-1234-567890abcdef\n" + " Search result for ray -uncategorized\n" + " YYYY-MM-DDThh:mm:ssZ\n" " 2\n" " 0\n" " 2\n" CATALOG_LINK_TAGS RAY_CHARLES_CATALOG_ENTRY - UNCATEGORIZED_RAY_CHARLES_CATALOG_ENTRY + CHARLES_RAY_CATALOG_ENTRY "\n" ); }