From d3d5abe14d1dcf5f6323bb84d9ba19c1ee3460ba Mon Sep 17 00:00:00 2001 From: Veloman Yunkan Date: Fri, 16 Apr 2021 12:27:14 +0400 Subject: [PATCH] Handling of non-words in publisher query This change fixes the failure of the LibraryTest.filterByPublisher unit-test broken by the previous commit. The previous approach used in `publisherQuery()` for building a phrase query enforcing the specified prefix for all terms fails if both of the following conditions hold: 1. the input phrase contains a non-word term that Xapian's query parser doesn't like (e.g. a standalone ampersand character, 1/2, a#1, etc.); 2. the input phrase contains at least three terms that Xapian's query parser has no issue with. Using the `quest` tool (coming with xapian-tools under Ubuntu) the issue can be demonstrated as follows: ``` $ quest -o phrase -d some_xapian_db "Energy & security" Parsed Query: Query((energy@1 PHRASE 11 Zsecur@2)) Exactly 0 matches MSet: $ quest -o phrase -d some_xapian_db "Energy & security act" UnimplementedError: OP_NEAR and OP_PHRASE only currently support leaf subqueries $ quest -o phrase -d some_xapian_db 'Energy 1/2 security act' UnimplementedError: OP_NEAR and OP_PHRASE only currently support leaf subqueries $ quest -o phrase -d some_xapian_db "Energy a#1 security act" UnimplementedError: OP_NEAR and OP_PHRASE only currently support leaf subqueries ``` The problem comes from parsing the query with the default operation set to `OP_PHRASE` (exemplified by the `-o phrase` option in the above invocations of `quest`). A workaround is to parse the phrase with a default operation of `OP_OR` and then combine all the terms with `OP_PHRASE`. Besides, stemming should be disabled in order to target an exact phrase match (save for the non-word terms, if any, that are ignored by the query parser). 
--- src/library.cpp | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/src/library.cpp b/src/library.cpp index cff560fe2..72a947068 100644 --- a/src/library.cpp +++ b/src/library.cpp @@ -364,9 +364,11 @@ Xapian::Query langQuery(const std::string& lang) Xapian::Query publisherQuery(const std::string& publisher) { Xapian::QueryParser queryParser; - queryParser.set_default_op(Xapian::Query::OP_PHRASE); + queryParser.set_default_op(Xapian::Query::OP_OR); + queryParser.set_stemming_strategy(Xapian::QueryParser::STEM_NONE); const auto flags = 0; - return queryParser.parse_query(normalizeText(publisher), flags, "XP"); + const auto q = queryParser.parse_query(normalizeText(publisher), flags, "XP"); + return Xapian::Query(Xapian::Query::OP_PHRASE, q.get_terms_begin(), q.get_terms_end(), q.get_length()); } Xapian::Query buildXapianQuery(const Filter& filter)