From a20f9e2ce18df53b6c062c4a1436b7089b1a558f Mon Sep 17 00:00:00 2001 From: Veloman Yunkan Date: Sat, 6 Mar 2021 20:03:41 +0400 Subject: [PATCH 01/10] Library::filter() works in two stages 1. Get the subset of books matching the q (title/description) parameter of the search 2. Filter out books not matching the other parameters of the search. Stage 1. currently works in the old way, but will be replaced by Xapian based search in subsequent commits. --- include/library.h | 4 ++++ src/library.cpp | 27 ++++++++++++++++++++++++--- 2 files changed, 28 insertions(+), 3 deletions(-) diff --git a/include/library.h b/include/library.h index 86dfafaee..b56dab896 100644 --- a/include/library.h +++ b/include/library.h @@ -105,7 +105,11 @@ class Filter { Filter& query(std::string query); Filter& name(std::string name); + bool hasQuery() const; + bool accept(const Book& book) const; + bool acceptByQueryOnly(const Book& book) const; + bool acceptByNonQueryCriteria(const Book& book) const; }; diff --git a/src/library.cpp b/src/library.cpp index c9855bd94..b16c4b9d6 100644 --- a/src/library.cpp +++ b/src/library.cpp @@ -236,12 +236,18 @@ std::vector Library::filter(const Filter& filter) { std::vector bookIds; for(auto& pair:m_books) { - auto book = pair.second; - if(filter.accept(book)) { + if(filter.acceptByQueryOnly(pair.second)) { bookIds.push_back(pair.first); } } - return bookIds; + + std::vector result; + for(auto id : bookIds) { + if(filter.acceptByNonQueryCriteria(m_books[id])) { + result.push_back(id); + } + } + return result; } template @@ -495,7 +501,17 @@ Filter& Filter::name(std::string name) #define ACTIVE(X) (activeFilters & (X)) #define FILTER(TAG, TEST) if (ACTIVE(TAG) && !(TEST)) { return false; } +bool Filter::hasQuery() const +{ + return ACTIVE(QUERY); +} + bool Filter::accept(const Book& book) const +{ + return acceptByNonQueryCriteria(book) && acceptByQueryOnly(book); +} + +bool Filter::acceptByNonQueryCriteria(const Book& book) const { auto local = 
!book.getPath().empty(); FILTER(_LOCAL, local) @@ -538,6 +554,11 @@ bool Filter::accept(const Book& book) const } } } + return true; +} + +bool Filter::acceptByQueryOnly(const Book& book) const +{ if ( ACTIVE(QUERY) && !(matchRegex(book.getTitle(), "\\Q" + _query + "\\E") || matchRegex(book.getDescription(), "\\Q" + _query + "\\E"))) From db06b2c7cac1c5849d29a8aaab2d3cebb696a139 Mon Sep 17 00:00:00 2001 From: Veloman Yunkan Date: Sat, 6 Mar 2021 20:13:43 +0400 Subject: [PATCH 02/10] Library::BookIdCollection typedef --- include/library.h | 13 ++++++++----- src/library.cpp | 16 ++++++++-------- 2 files changed, 16 insertions(+), 13 deletions(-) diff --git a/include/library.h b/include/library.h index b56dab896..4a859086d 100644 --- a/include/library.h +++ b/include/library.h @@ -122,6 +122,9 @@ class Library std::map> m_readers; std::vector m_bookmarks; + public: + typedef std::vector BookIdCollection; + public: Library(); ~Library(); @@ -224,7 +227,7 @@ class Library * * @return A list of book ids. */ - std::vector getBooksIds(); + BookIdCollection getBooksIds(); /** * Filter the library and generate a new one with the keep elements. @@ -234,7 +237,7 @@ class Library * @param search List only books with search in the title or description. * @return The list of bookIds corresponding to the query. */ - DEPRECATED std::vector filter(const std::string& search); + DEPRECATED BookIdCollection filter(const std::string& search); /** @@ -243,7 +246,7 @@ class Library * @param filter The filter to use. * @return The list of bookIds corresponding to the filter. */ - std::vector filter(const Filter& filter); + BookIdCollection filter(const Filter& filter); /** @@ -253,7 +256,7 @@ class Library * @param comparator how to sort the books * @return The sorted list of books */ - void sort(std::vector& bookIds, supportedListSortBy sortBy, bool ascending); + void sort(BookIdCollection& bookIds, supportedListSortBy sortBy, bool ascending); /** * List books in the library. 
@@ -277,7 +280,7 @@ class Library * Set to 0 to cancel this filter. * @return The list of bookIds corresponding to the query. */ - DEPRECATED std::vector listBooksIds( + DEPRECATED BookIdCollection listBooksIds( int supportedListMode = ALL, supportedListSortBy sortBy = UNSORTED, const std::string& search = "", diff --git a/src/library.cpp b/src/library.cpp index b16c4b9d6..920d93eda 100644 --- a/src/library.cpp +++ b/src/library.cpp @@ -211,9 +211,9 @@ const std::vector Library::getBookmarks(bool onlyValidBookmarks return validBookmarks; } -std::vector Library::getBooksIds() +Library::BookIdCollection Library::getBooksIds() { - std::vector bookIds; + BookIdCollection bookIds; for (auto& pair: m_books) { bookIds.push_back(pair.first); @@ -222,7 +222,7 @@ std::vector Library::getBooksIds() return bookIds; } -std::vector Library::filter(const std::string& search) +Library::BookIdCollection Library::filter(const std::string& search) { if (search.empty()) { return getBooksIds(); @@ -232,16 +232,16 @@ std::vector Library::filter(const std::string& search) } -std::vector Library::filter(const Filter& filter) +Library::BookIdCollection Library::filter(const Filter& filter) { - std::vector bookIds; + BookIdCollection bookIds; for(auto& pair:m_books) { if(filter.acceptByQueryOnly(pair.second)) { bookIds.push_back(pair.first); } } - std::vector result; + BookIdCollection result; for(auto id : bookIds) { if(filter.acceptByNonQueryCriteria(m_books[id])) { result.push_back(id); @@ -309,7 +309,7 @@ std::string Comparator::get_key(const std::string& id) return lib->getBookById(id).getPublisher(); } -void Library::sort(std::vector& bookIds, supportedListSortBy sort, bool ascending) +void Library::sort(BookIdCollection& bookIds, supportedListSortBy sort, bool ascending) { switch(sort) { case TITLE: @@ -333,7 +333,7 @@ void Library::sort(std::vector& bookIds, supportedListSortBy sort, } -std::vector Library::listBooksIds( +Library::BookIdCollection Library::listBooksIds( int mode, 
supportedListSortBy sortBy, const std::string& search, From a17fc0ef2dcc81f843c79ddfdb4dc71d0f9281b3 Mon Sep 17 00:00:00 2001 From: Veloman Yunkan Date: Sat, 6 Mar 2021 20:19:43 +0400 Subject: [PATCH 03/10] Library::getBooksByTitleOrDescription() --- include/library.h | 4 ++++ src/library.cpp | 8 ++++++-- 2 files changed, 10 insertions(+), 2 deletions(-) diff --git a/include/library.h b/include/library.h index 4a859086d..c09a77551 100644 --- a/include/library.h +++ b/include/library.h @@ -292,7 +292,11 @@ class Library friend class OPDSDumper; friend class libXMLDumper; + +private: // functions + BookIdCollection getBooksByTitleOrDescription(const Filter& filter); }; + } #endif diff --git a/src/library.cpp b/src/library.cpp index 920d93eda..7d428f570 100644 --- a/src/library.cpp +++ b/src/library.cpp @@ -232,7 +232,7 @@ Library::BookIdCollection Library::filter(const std::string& search) } -Library::BookIdCollection Library::filter(const Filter& filter) +Library::BookIdCollection Library::getBooksByTitleOrDescription(const Filter& filter) { BookIdCollection bookIds; for(auto& pair:m_books) { @@ -240,9 +240,13 @@ Library::BookIdCollection Library::filter(const Filter& filter) bookIds.push_back(pair.first); } } + return bookIds; +} +Library::BookIdCollection Library::filter(const Filter& filter) +{ BookIdCollection result; - for(auto id : bookIds) { + for(auto id : getBooksByTitleOrDescription(filter)) { if(filter.acceptByNonQueryCriteria(m_books[id])) { result.push_back(id); } From a599fb38929e33944b5c7a62ddc1f5553e247380 Mon Sep 17 00:00:00 2001 From: Veloman Yunkan Date: Sun, 7 Mar 2021 01:11:04 +0400 Subject: [PATCH 04/10] Initial version of Xapian-based catalog search --- include/library.h | 4 +++ src/library.cpp | 80 ++++++++++++++++++++++++++++++++++++++++++++--- test/library.cpp | 2 +- test/server.cpp | 26 +++++++++++++-- 4 files changed, 104 insertions(+), 8 deletions(-) diff --git a/include/library.h b/include/library.h index c09a77551..fb2d8aad4 100644 --- 
a/include/library.h +++ b/include/library.h @@ -28,6 +28,7 @@ #include "book.h" #include "bookmark.h" #include "common.h" +#include #define KIWIX_LIBRARY_VERSION "20110515" @@ -106,6 +107,7 @@ class Filter { Filter& name(std::string name); bool hasQuery() const; + const std::string& getQuery() const { return _query; } bool accept(const Book& book) const; bool acceptByQueryOnly(const Book& book) const; @@ -121,6 +123,7 @@ class Library std::map m_books; std::map> m_readers; std::vector m_bookmarks; + Xapian::WritableDatabase m_bookDB; public: typedef std::vector BookIdCollection; @@ -295,6 +298,7 @@ class Library private: // functions BookIdCollection getBooksByTitleOrDescription(const Filter& filter); + void updateBookDB(const Book& book); }; } diff --git a/src/library.cpp b/src/library.cpp index 7d428f570..7cbfd08a1 100644 --- a/src/library.cpp +++ b/src/library.cpp @@ -34,10 +34,31 @@ namespace kiwix { +namespace +{ + +const std::map iso639_3ToXapian { + {"deu", "german" }, + {"eng", "english" }, + {"fra", "french" }, + {"hye", "armenian"}, + {"rus", "russian" }, + {"spa", "spanish" }, +}; + +std::string normalizeText(const std::string& text, const std::string& language) +{ + return removeAccents(text); +} + +} // unnamed namespace + /* Constructor */ Library::Library() + : m_bookDB("", Xapian::DB_BACKEND_INMEMORY) { } + /* Destructor */ Library::~Library() { @@ -47,6 +68,7 @@ Library::~Library() bool Library::addBook(const Book& book) { /* Try to find it */ + updateBookDB(book); try { auto& oldbook = m_books.at(book.getId()); oldbook.update(book); @@ -232,14 +254,64 @@ Library::BookIdCollection Library::filter(const std::string& search) } +void Library::updateBookDB(const Book& book) +{ + Xapian::Stem stemmer; + Xapian::TermGenerator indexer; + const std::string lang = book.getLanguage(); + try { + stemmer = Xapian::Stem(iso639_3ToXapian.at(lang)); + indexer.set_stemmer(stemmer); + indexer.set_stemming_strategy(Xapian::TermGenerator::STEM_SOME); + } catch (...) 
{} + Xapian::Document doc; + indexer.set_document(doc); + + const std::string title = normalizeText(book.getTitle(), lang); + const std::string desc = normalizeText(book.getDescription(), lang); + doc.add_value(0, title); + doc.add_value(1, desc); + doc.set_data(book.getId()); + + indexer.index_text(title, 1, "S"); + indexer.index_text(desc, 1, "XD"); + + // Index fields without prefixes for general search + indexer.index_text(title); + indexer.increase_termpos(); + indexer.index_text(desc); + + const std::string idterm = "Q" + book.getId(); + doc.add_boolean_term(idterm); + m_bookDB.replace_document(idterm, doc); +} + Library::BookIdCollection Library::getBooksByTitleOrDescription(const Filter& filter) { + if ( !filter.hasQuery() ) + return getBooksIds(); + BookIdCollection bookIds; - for(auto& pair:m_books) { - if(filter.acceptByQueryOnly(pair.second)) { - bookIds.push_back(pair.first); - } + Xapian::QueryParser queryParser; + queryParser.set_default_op(Xapian::Query::OP_AND); + queryParser.add_prefix("title", "S"); + queryParser.add_prefix("description", "XD"); + // Language assumed for the query is not known for sure so stemming + // is not applied + //queryParser.set_stemmer(Xapian::Stem(iso639_3ToXapian.at(???))); + //queryParser.set_stemming_strategy(Xapian::QueryParser::STEM_SOME); + const auto flags = Xapian::QueryParser::FLAG_PHRASE + | Xapian::QueryParser::FLAG_BOOLEAN + | Xapian::QueryParser::FLAG_LOVEHATE + | Xapian::QueryParser::FLAG_WILDCARD; + const auto query = queryParser.parse_query(filter.getQuery(), flags); + Xapian::Enquire enquire(m_bookDB); + enquire.set_query(query); + const auto results = enquire.get_mset(0, m_books.size()); + for ( auto it = results.begin(); it != results.end(); ++it ) { + bookIds.push_back(it.get_document().get_data()); } + return bookIds; } diff --git a/test/library.cpp b/test/library.cpp index 7f34e008b..0501124df 100644 --- a/test/library.cpp +++ b/test/library.cpp @@ -263,7 +263,7 @@ TEST_F(LibraryTest, filterCheck) 
bookIds = lib.filter(kiwix::Filter().query("folklore")); EXPECT_EQ(bookIds.size(), 1U); - bookIds = lib.filter(kiwix::Filter().query("Wiki")); + bookIds = lib.filter(kiwix::Filter().query("Wiki*")); EXPECT_EQ(bookIds.size(), 4U); bookIds = lib.filter(kiwix::Filter().query("Wiki").creator("Wiki")); diff --git a/test/server.cpp b/test/server.cpp index d4bd59de1..5fd583f26 100644 --- a/test/server.cpp +++ b/test/server.cpp @@ -733,7 +733,26 @@ TEST_F(LibraryServerTest, catalog_searchdescription_xml) ); } -TEST_F(LibraryServerTest, catalog_search_by_text) +TEST_F(LibraryServerTest, catalog_search_by_phrase) +{ + const auto r = zfs1_->GET("/catalog/search?q=\"ray%20charles\""); + EXPECT_EQ(r->status, 200); + EXPECT_EQ(maskVariableOPDSFeedData(r->body), + OPDS_FEED_TAG + " 12345678-90ab-cdef-1234-567890abcdef\n" + " Search result for \"ray charles\"\n" + " YYYY-MM-DDThh:mm:ssZ\n" + " 2\n" + " 0\n" + " 2\n" + CATALOG_LINK_TAGS + RAY_CHARLES_CATALOG_ENTRY + UNCATEGORIZED_RAY_CHARLES_CATALOG_ENTRY + "\n" + ); +} + +TEST_F(LibraryServerTest, catalog_search_by_words) { const auto r = zfs1_->GET("/catalog/search?q=ray%20charles"); EXPECT_EQ(r->status, 200); @@ -742,12 +761,13 @@ TEST_F(LibraryServerTest, catalog_search_by_text) " 12345678-90ab-cdef-1234-567890abcdef\n" " Search result for ray charles\n" " YYYY-MM-DDThh:mm:ssZ\n" - " 2\n" + " 3\n" " 0\n" - " 2\n" + " 3\n" CATALOG_LINK_TAGS RAY_CHARLES_CATALOG_ENTRY UNCATEGORIZED_RAY_CHARLES_CATALOG_ENTRY + CHARLES_RAY_CATALOG_ENTRY "\n" ); } From 9e887cadf12f88bfb89b1db47460d1d7207fb019 Mon Sep 17 00:00:00 2001 From: Veloman Yunkan Date: Sun, 7 Mar 2021 20:14:12 +0400 Subject: [PATCH 05/10] Added some diversity to test/data/library.xml --- test/data/library.xml | 10 +++++----- test/server.cpp | 14 +++++++------- 2 files changed, 12 insertions(+), 12 deletions(-) diff --git a/test/data/library.xml b/test/data/library.xml index 130f3aa48..1ab2193b5 100644 --- a/test/data/library.xml +++ b/test/data/library.xml @@ -19,24 +19,24 @@ 
id="raycharles_uncategorized" path="./zimfile.zim" url="https://github.com/kiwix/kiwix-lib/raw/master/test/data/zimfile.zim" - title="Ray Charles" - description="Wikipedia articles about Ray Charles" + title="Ray (uncategorized) Charles" + description="No category is assigned to this library entry." language="eng" creator="Wikipedia" publisher="Kiwix" date="2020-03-31" name="wikipedia_en_ray_charles" - tags="unittest;wikipedia;_pictures:no;_videos:no;_details:no;_ftindex:yes" + tags="unittest;wikipedia;_pictures:no;_videos:no;_details:no" articleCount="284" mediaCount="2" - size="556" + size="123" > \n" \ " urn:uuid:charlesray\n" \ " Charles, Ray\n" \ - " Wikipedia articles about Charles, Ray\n" \ + " Wikipedia articles about Ray Charles\n" \ " eng\n" \ " 2020-03-31T00:00::00Z\n" \ " wikipedia_en_ray_charles\n" \ @@ -677,14 +677,14 @@ std::string maskVariableOPDSFeedData(std::string s) #define UNCATEGORIZED_RAY_CHARLES_CATALOG_ENTRY \ " \n" \ " urn:uuid:raycharles_uncategorized\n" \ - " Ray Charles\n" \ - " Wikipedia articles about Ray Charles\n" \ + " Ray (uncategorized) Charles\n" \ + " No category is assigned to this library entry.\n" \ " eng\n" \ " 2020-03-31T00:00::00Z\n" \ " wikipedia_en_ray_charles\n" \ " \n" \ " \n" \ - " unittest;wikipedia;_pictures:no;_videos:no;_details:no;_ftindex:yes\n" \ + " unittest;wikipedia;_pictures:no;_videos:no;_details:no\n" \ " 284\n" \ " 2\n" \ " /meta?name=favicon&content=zimfile\n" \ @@ -695,7 +695,7 @@ std::string maskVariableOPDSFeedData(std::string s) " \n" \ " Kiwix\n" \ " \n" \ - " \n" \ + " \n" \ " \n" TEST_F(LibraryServerTest, catalog_root_xml) @@ -747,7 +747,7 @@ TEST_F(LibraryServerTest, catalog_search_by_phrase) " 2\n" CATALOG_LINK_TAGS RAY_CHARLES_CATALOG_ENTRY - UNCATEGORIZED_RAY_CHARLES_CATALOG_ENTRY + CHARLES_RAY_CATALOG_ENTRY "\n" ); } @@ -766,8 +766,8 @@ TEST_F(LibraryServerTest, catalog_search_by_words) " 3\n" CATALOG_LINK_TAGS RAY_CHARLES_CATALOG_ENTRY - UNCATEGORIZED_RAY_CHARLES_CATALOG_ENTRY 
CHARLES_RAY_CATALOG_ENTRY + UNCATEGORIZED_RAY_CHARLES_CATALOG_ENTRY "\n" ); } From 6b600a18eb31d27ef03c9f2fd56417dcd9e0021a Mon Sep 17 00:00:00 2001 From: Veloman Yunkan Date: Sun, 7 Mar 2021 20:30:03 +0400 Subject: [PATCH 06/10] LibraryServerTest.catalog_prefix_search --- test/server.cpp | 37 +++++++++++++++++++++++++++++++++++++ 1 file changed, 37 insertions(+) diff --git a/test/server.cpp b/test/server.cpp index 6cf5bebac..6d1028bd1 100644 --- a/test/server.cpp +++ b/test/server.cpp @@ -772,6 +772,43 @@ TEST_F(LibraryServerTest, catalog_search_by_words) ); } +TEST_F(LibraryServerTest, catalog_prefix_search) +{ + { + const auto r = zfs1_->GET("/catalog/search?q=description:ray%20description:charles"); + EXPECT_EQ(r->status, 200); + EXPECT_EQ(maskVariableOPDSFeedData(r->body), + OPDS_FEED_TAG + " 12345678-90ab-cdef-1234-567890abcdef\n" + " Search result for description:ray description:charles\n" + " YYYY-MM-DDThh:mm:ssZ\n" + " 2\n" + " 0\n" + " 2\n" + CATALOG_LINK_TAGS + RAY_CHARLES_CATALOG_ENTRY + CHARLES_RAY_CATALOG_ENTRY + "\n" + ); + } + { + const auto r = zfs1_->GET("/catalog/search?q=title:\"ray%20charles\""); + EXPECT_EQ(r->status, 200); + EXPECT_EQ(maskVariableOPDSFeedData(r->body), + OPDS_FEED_TAG + " 12345678-90ab-cdef-1234-567890abcdef\n" + " Search result for title:\"ray charles\"\n" + " YYYY-MM-DDThh:mm:ssZ\n" + " 1\n" + " 0\n" + " 1\n" + CATALOG_LINK_TAGS + RAY_CHARLES_CATALOG_ENTRY + "\n" + ); + } +} + TEST_F(LibraryServerTest, catalog_search_by_tag) { const auto r = zfs1_->GET("/catalog/search?tag=_category:jazz"); From 47c67a420210d56f5957e8e01c768382b7b9550c Mon Sep 17 00:00:00 2001 From: Veloman Yunkan Date: Sun, 7 Mar 2021 20:35:24 +0400 Subject: [PATCH 07/10] LibraryServerTest.catalog_search_with_word_exclusion --- test/server.cpp | 19 +++++++++++++++++++ 1 file changed, 19 insertions(+) diff --git a/test/server.cpp b/test/server.cpp index 6d1028bd1..e6df02474 100644 --- a/test/server.cpp +++ b/test/server.cpp @@ -809,6 +809,25 @@ 
TEST_F(LibraryServerTest, catalog_prefix_search) } } +TEST_F(LibraryServerTest, catalog_search_with_word_exclusion) +{ + const auto r = zfs1_->GET("/catalog/search?q=ray%20-uncategorized"); + EXPECT_EQ(r->status, 200); + EXPECT_EQ(maskVariableOPDSFeedData(r->body), + OPDS_FEED_TAG + " 12345678-90ab-cdef-1234-567890abcdef\n" + " Search result for ray -uncategorized\n" + " YYYY-MM-DDThh:mm:ssZ\n" + " 2\n" + " 0\n" + " 2\n" + CATALOG_LINK_TAGS + RAY_CHARLES_CATALOG_ENTRY + CHARLES_RAY_CATALOG_ENTRY + "\n" + ); +} + TEST_F(LibraryServerTest, catalog_search_by_tag) { const auto r = zfs1_->GET("/catalog/search?tag=_category:jazz"); From 09233bf4f3c9a3fc771004f988d177c015b87b2f Mon Sep 17 00:00:00 2001 From: Veloman Yunkan Date: Wed, 10 Mar 2021 16:18:43 +0400 Subject: [PATCH 08/10] Support for partial queries in catalog search The search text in the catalog query is interpreted as partial by default, but partial query mode can be disabled in C++. The latter possibility is not exposed via the /catalog/search kiwix-serve endpoint, though. 
--- include/library.h | 4 +++- src/library.cpp | 9 +++++++-- test/library.cpp | 2 +- 3 files changed, 11 insertions(+), 4 deletions(-) diff --git a/include/library.h b/include/library.h index fb2d8aad4..97e5f236e 100644 --- a/include/library.h +++ b/include/library.h @@ -59,6 +59,7 @@ class Filter { std::string _creator; size_t _maxSize; std::string _query; + bool _queryIsPartial; std::string _name; public: @@ -103,11 +104,12 @@ class Filter { Filter& publisher(std::string publisher); Filter& creator(std::string creator); Filter& maxSize(size_t size); - Filter& query(std::string query); + Filter& query(std::string query, bool partial=true); Filter& name(std::string name); bool hasQuery() const; const std::string& getQuery() const { return _query; } + bool queryIsPartial() const { return _queryIsPartial; } bool accept(const Book& book) const; bool acceptByQueryOnly(const Book& book) const; diff --git a/src/library.cpp b/src/library.cpp index 7cbfd08a1..52ca72670 100644 --- a/src/library.cpp +++ b/src/library.cpp @@ -296,6 +296,9 @@ Library::BookIdCollection Library::getBooksByTitleOrDescription(const Filter& fi queryParser.set_default_op(Xapian::Query::OP_AND); queryParser.add_prefix("title", "S"); queryParser.add_prefix("description", "XD"); + const auto partialQueryFlag = filter.queryIsPartial() + ? 
Xapian::QueryParser::FLAG_PARTIAL + : 0; // Language assumed for the query is not known for sure so stemming // is not applied //queryParser.set_stemmer(Xapian::Stem(iso639_3ToXapian.at(???))); @@ -303,7 +306,8 @@ Library::BookIdCollection Library::getBooksByTitleOrDescription(const Filter& fi const auto flags = Xapian::QueryParser::FLAG_PHRASE | Xapian::QueryParser::FLAG_BOOLEAN | Xapian::QueryParser::FLAG_LOVEHATE - | Xapian::QueryParser::FLAG_WILDCARD; + | Xapian::QueryParser::FLAG_WILDCARD + | partialQueryFlag; const auto query = queryParser.parse_query(filter.getQuery(), flags); Xapian::Enquire enquire(m_bookDB); enquire.set_query(query); @@ -561,9 +565,10 @@ Filter& Filter::maxSize(size_t maxSize) return *this; } -Filter& Filter::query(std::string query) +Filter& Filter::query(std::string query, bool partial) { _query = query; + _queryIsPartial = partial; activeFilters |= QUERY; return *this; } diff --git a/test/library.cpp b/test/library.cpp index 0501124df..7f34e008b 100644 --- a/test/library.cpp +++ b/test/library.cpp @@ -263,7 +263,7 @@ TEST_F(LibraryTest, filterCheck) bookIds = lib.filter(kiwix::Filter().query("folklore")); EXPECT_EQ(bookIds.size(), 1U); - bookIds = lib.filter(kiwix::Filter().query("Wiki*")); + bookIds = lib.filter(kiwix::Filter().query("Wiki")); EXPECT_EQ(bookIds.size(), 4U); bookIds = lib.filter(kiwix::Filter().query("Wiki").creator("Wiki")); From e214efecd434cff696020d7dacff56119222e5f1 Mon Sep 17 00:00:00 2001 From: Veloman Yunkan Date: Wed, 10 Mar 2021 16:38:08 +0400 Subject: [PATCH 09/10] Language code conversion via ICU Language code is converted from ISO 639-3 to ISO 639 (which is understood by Xapian) via ICU. The previous approach via an explicit map had its advantages since Xapian has more than one stemmer implementation for some languages (selectable via Xapian-specific identifiers). This commit relies on the defaults associated with the ISO 639 language codes. 
--- src/library.cpp | 14 +++++--------- 1 file changed, 5 insertions(+), 9 deletions(-) diff --git a/src/library.cpp b/src/library.cpp index 52ca72670..cdfc9a246 100644 --- a/src/library.cpp +++ b/src/library.cpp @@ -30,6 +30,7 @@ #include #include #include +#include namespace kiwix { @@ -37,13 +38,8 @@ namespace kiwix namespace { -const std::map iso639_3ToXapian { - {"deu", "german" }, - {"eng", "english" }, - {"fra", "french" }, - {"hye", "armenian"}, - {"rus", "russian" }, - {"spa", "spanish" }, +std::string iso639_3ToXapian(const std::string& lang) { + return icu::Locale(lang.c_str()).getLanguage(); }; std::string normalizeText(const std::string& text, const std::string& language) @@ -260,7 +256,7 @@ void Library::updateBookDB(const Book& book) Xapian::TermGenerator indexer; const std::string lang = book.getLanguage(); try { - stemmer = Xapian::Stem(iso639_3ToXapian.at(lang)); + stemmer = Xapian::Stem(iso639_3ToXapian(lang)); indexer.set_stemmer(stemmer); indexer.set_stemming_strategy(Xapian::TermGenerator::STEM_SOME); } catch (...) 
{} @@ -301,7 +297,7 @@ Library::BookIdCollection Library::getBooksByTitleOrDescription(const Filter& fi : 0; // Language assumed for the query is not known for sure so stemming // is not applied - //queryParser.set_stemmer(Xapian::Stem(iso639_3ToXapian.at(???))); + //queryParser.set_stemmer(Xapian::Stem(iso639_3ToXapian(???))); //queryParser.set_stemming_strategy(Xapian::QueryParser::STEM_SOME); const auto flags = Xapian::QueryParser::FLAG_PHRASE | Xapian::QueryParser::FLAG_BOOLEAN From 20b487da8d67cca0e7f4847c73841f1603dae996 Mon Sep 17 00:00:00 2001 From: Veloman Yunkan Date: Wed, 10 Mar 2021 17:48:34 +0400 Subject: [PATCH 10/10] Added Xapian as direct dependency --- meson.build | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/meson.build b/meson.build index 48e5a7d3b..7af35e225 100644 --- a/meson.build +++ b/meson.build @@ -34,6 +34,7 @@ pugixml_dep = dependency('pugixml', static:static_deps) libcurl_dep = dependency('libcurl', static:static_deps) microhttpd_dep = dependency('libmicrohttpd', static:static_deps) zlib_dep = dependency('zlib', static:static_deps) +xapian_dep = dependency('xapian-core', static:static_deps) if compiler.has_header('mustache.hpp') extra_include = [] @@ -55,7 +56,7 @@ if target_machine.system() == 'windows' and static_deps extra_cflags += '-DCURL_STATICLIB' endif -all_deps = [thread_dep, libicu_dep, libzim_dep, pugixml_dep, libcurl_dep, microhttpd_dep, zlib_dep] +all_deps = [thread_dep, libicu_dep, libzim_dep, pugixml_dep, libcurl_dep, microhttpd_dep, zlib_dep, xapian_dep] inc = include_directories('include', extra_include) @@ -74,7 +75,7 @@ subdir('static') subdir('src') subdir('test') -pkg_requires = ['libzim', 'icu-i18n', 'pugixml', 'libcurl', 'libmicrohttpd'] +pkg_requires = ['libzim', 'icu-i18n', 'pugixml', 'libcurl', 'libmicrohttpd', 'xapian-core'] pkg_conf = configuration_data() pkg_conf.set('prefix', get_option('prefix'))