Merge pull request #838 from kiwix/language_handling_during_search

This commit is contained in:
Matthieu Gautier 2022-11-01 18:28:02 +01:00 committed by GitHub
commit a52138e5ba
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
13 changed files with 201 additions and 94 deletions

View File

@ -66,7 +66,7 @@ bool Book::update(const kiwix::Book& other)
void Book::update(const zim::Archive& archive) {
m_path = archive.getFilename();
m_pathValid = true;
m_id = getArchiveId(archive);
m_id = std::string(archive.getUuid());
m_title = getArchiveTitle(archive);
m_description = getMetaDescription(archive);
m_language = getMetaLanguage(archive);

View File

@ -77,7 +77,6 @@ extern "C" {
#include "request_context.h"
#include "response.h"
#define MAX_SEARCH_LEN 140
#define DEFAULT_CACHE_SIZE 2
namespace kiwix {
@ -212,6 +211,16 @@ void checkBookNumber(const Library::BookIdSet& bookIds, size_t limit) {
}
}
typedef std::set<std::string> Languages;
Languages getLanguages(const Library& lib, const Library::BookIdSet& bookIds) {
Languages langs;
for ( const auto& b : bookIds ) {
langs.insert(lib.getBookById(b).getLanguage());
}
return langs;
}
struct CustomizedResourceData
{
std::string mimeType;
@ -307,6 +316,10 @@ SearchInfo InternalServer::getSearchInfo(const RequestContext& request) const
{
auto bookIds = selectBooks(request);
checkBookNumber(bookIds.second, m_multizimSearchLimit);
if ( getLanguages(*mp_library, bookIds.second).size() != 1 ) {
throw Error(nonParameterizedMessage("confusion-of-tongues"));
}
auto pattern = request.get_optional_param<std::string>("pattern", "");
GeoQuery geoQuery;
@ -813,6 +826,32 @@ std::unique_ptr<Response> InternalServer::handle_search(const RequestContext& re
}
try {
return handle_search_request(request);
} catch (const Error& e) {
return HTTP400Response(*this, request)
+ invalidUrlMsg
+ e.message();
}
}
namespace
{
unsigned getSearchPageSize(const RequestContext& r)
{
const auto DEFAULT_PAGE_LEN = 25u;
const auto MAX_PAGE_LEN = 140u;
const auto pageLength = r.get_optional_param("pageLength", DEFAULT_PAGE_LEN);
return pageLength == 0
? DEFAULT_PAGE_LEN
: min(MAX_PAGE_LEN, pageLength);
}
} // unnamed namespace
std::unique_ptr<Response> InternalServer::handle_search_request(const RequestContext& request)
{
auto searchInfo = getSearchInfo(request);
auto bookIds = searchInfo.getBookIds();
@ -849,22 +888,8 @@ std::unique_ptr<Response> InternalServer::handle_search(const RequestContext& re
return response;
}
auto start = 1;
try {
start = request.get_argument<unsigned int>("start");
} catch (const std::exception&) {}
start = max(1, start);
auto pageLength = 25;
try {
pageLength = request.get_argument<unsigned int>("pageLength");
} catch (const std::exception&) {}
if (pageLength > MAX_SEARCH_LEN) {
pageLength = MAX_SEARCH_LEN;
}
if (pageLength == 0) {
pageLength = 25;
}
const auto start = max(1u, request.get_optional_param("start", 1u));
const auto pageLength = getSearchPageSize(request);
/* Get the results */
SearchRenderer renderer(search->getResults(start-1, pageLength), mp_nameMapper, mp_library, start,
@ -888,11 +913,6 @@ std::unique_ptr<Response> InternalServer::handle_search(const RequestContext& re
}
*/
return std::move(response);
} catch (const Error& e) {
return HTTP400Response(*this, request)
+ invalidUrlMsg
+ e.message();
}
}
std::unique_ptr<Response> InternalServer::handle_random(const RequestContext& request)

View File

@ -134,6 +134,7 @@ class InternalServer {
std::unique_ptr<Response> handle_catalog_v2_languages(const RequestContext& request);
std::unique_ptr<Response> handle_catalog_v2_illustration(const RequestContext& request);
std::unique_ptr<Response> handle_search(const RequestContext& request);
std::unique_ptr<Response> handle_search_request(const RequestContext& request);
std::unique_ptr<Response> handle_suggest(const RequestContext& request);
std::unique_ptr<Response> handle_random(const RequestContext& request);
std::unique_ptr<Response> handle_catch(const RequestContext& request);

View File

@ -107,6 +107,14 @@ MHD_Result RequestContext::fill_argument(void *__this, enum MHD_ValueKind kind,
{
RequestContext *_this = static_cast<RequestContext*>(__this);
_this->arguments[key].push_back(value == nullptr ? "" : value);
if ( ! _this->queryString.empty() ) {
_this->queryString += "&";
}
_this->queryString += key;
if ( value ) {
_this->queryString += "=";
_this->queryString += value;
}
return MHD_YES;
}

View File

@ -92,9 +92,7 @@ class RequestContext {
std::string get_url_part(int part) const;
std::string get_full_url() const;
std::string get_query(bool mustEncode = false) const {
return get_query([](const std::string& key) {return true;}, mustEncode);
}
std::string get_query() const { return queryString; }
template<class F>
std::string get_query(F filter, bool mustEncode) const {
@ -132,6 +130,7 @@ class RequestContext {
ByteRange byteRange_;
std::map<std::string, std::string> headers;
std::map<std::string, std::vector<std::string>> arguments;
std::string queryString;
private: // functions
static MHD_Result fill_header(void *, enum MHD_ValueKind, const char*, const char*);

View File

@ -93,10 +93,6 @@ std::string getMetaFlavour(const zim::Archive& archive) {
return getMetadata(archive, "Flavour");
}
std::string getArchiveId(const zim::Archive& archive) {
return (std::string) archive.getUuid();
}
bool getArchiveFavicon(const zim::Archive& archive, unsigned size,
std::string& content, std::string& mimeType){
try {

View File

@ -40,7 +40,6 @@ namespace kiwix
std::string getMetaCreator(const zim::Archive& archive);
std::string getMetaPublisher(const zim::Archive& archive);
std::string getMetaFlavour(const zim::Archive& archive);
std::string getArchiveId(const zim::Archive& archive);
bool getArchiveFavicon(const zim::Archive& archive, unsigned size,
std::string& content, std::string& mimeType);

View File

@ -27,4 +27,5 @@
, "home-button-text": "Go to the main page of '{{BOOK_TITLE}}'"
, "random-page-button-text": "Go to a randomly selected page"
, "searchbox-tooltip": "Search '{{BOOK_TITLE}}'"
, "confusion-of-tongues": "Two or more books in different languages would participate in search, which may lead to confusing results."
}

View File

@ -0,0 +1,4 @@
<library version="20110515">
<book id="5dc0b3af-5df2-0925-f0ca-d2bf75e78af6" path="example.zim" title="Wikibooks" description="testZim" language="eng" creator="test" publisher="test" tags="_ftindex:yes;_ftindex:yes;_pictures:yes;_videos:yes;_details:yes" date="2021-04-17" mediaCount="22" size="253" />
<book id="6f1d19d0-633f-087b-fb55-7ac324ff9baf" path="zimfile.zim" title="Ray Charles" description="Wikipedia articles about Ray Charles" language="eng" creator="Wikipedia" publisher="Kiwix" name="wikipedia_en_ray_charles" flavour="_mini" tags="wikipedia;_category:wikipedia;_pictures:no;_videos:no;_details:no;_ftindex:yes" date="2020-03-31" articleCount="129" mediaCount="45" size="555" />
</library>

View File

@ -359,7 +359,7 @@ TEST_F(LibraryServerTest, catalog_search_results_pagination)
EXPECT_EQ(maskVariableOPDSFeedData(r->body),
OPDS_FEED_TAG
" <id>12345678-90ab-cdef-1234-567890abcdef</id>\n"
" <title>Filtered zims (count=1&amp;start=1)</title>\n"
" <title>Filtered zims (start=1&amp;count=1)</title>\n"
" <updated>YYYY-MM-DDThh:mm:ssZ</updated>\n"
" <totalResults>3</totalResults>\n"
" <startIndex>1</startIndex>\n"
@ -375,7 +375,7 @@ TEST_F(LibraryServerTest, catalog_search_results_pagination)
EXPECT_EQ(maskVariableOPDSFeedData(r->body),
OPDS_FEED_TAG
" <id>12345678-90ab-cdef-1234-567890abcdef</id>\n"
" <title>Filtered zims (count=10&amp;start=100)</title>\n"
" <title>Filtered zims (start=100&amp;count=10)</title>\n"
" <updated>YYYY-MM-DDThh:mm:ssZ</updated>\n"
" <totalResults>3</totalResults>\n"
" <startIndex>100</startIndex>\n"
@ -638,8 +638,8 @@ TEST_F(LibraryServerTest, catalog_v2_entries_filtered_by_range)
const auto r = zfs1_->GET("/ROOT/catalog/v2/entries?start=1&count=1");
EXPECT_EQ(r->status, 200);
EXPECT_EQ(maskVariableOPDSFeedData(r->body),
CATALOG_V2_ENTRIES_PREAMBLE("?count=1&start=1")
" <title>Filtered Entries (count=1&amp;start=1)</title>\n"
CATALOG_V2_ENTRIES_PREAMBLE("?start=1&count=1")
" <title>Filtered Entries (start=1&amp;count=1)</title>\n"
" <updated>YYYY-MM-DDThh:mm:ssZ</updated>\n"
" <totalResults>3</totalResults>\n"
" <startIndex>1</startIndex>\n"

View File

@ -37,6 +37,7 @@ if gtest_dep.found() and not meson.is_cross_build()
'corner_cases.zim',
'poor.zim',
'library.xml',
'lib_for_server_search_test.xml',
'customized_resources.txt',
'helloworld.txt',
'welcome.html',

View File

@ -839,7 +839,7 @@ TEST_F(ServerTest, Http400HtmlError)
expected_body==R"(
<h1>Invalid request</h1>
<p>
The requested URL "/ROOT/search?books.filter.lang=eng&pattern=" is not a valid request.
The requested URL "/ROOT/search?books.filter.lang=eng&pattern" is not a valid request.
</p>
<p>
No query provided.
@ -896,21 +896,21 @@ TEST_F(ServerTest, HttpXmlError)
/* HTTP status code */ 400,
/* expected response XML */ R"(
<error>Invalid request</error>
<detail>The requested URL "/ROOT/search?content=zimfile&format=xml" is not a valid request.</detail>
<detail>The requested URL "/ROOT/search?format=xml&content=zimfile" is not a valid request.</detail>
<detail>No query provided.</detail>
)" },
{ /* url */ "/ROOT/search?format=xml&content=non-existing-book&pattern=asdfqwerty",
/* HTTP status code */ 400,
/* expected response XML */ R"(
<error>Invalid request</error>
<detail>The requested URL "/ROOT/search?content=non-existing-book&format=xml&pattern=asdfqwerty" is not a valid request.</detail>
<detail>The requested URL "/ROOT/search?format=xml&content=non-existing-book&pattern=asdfqwerty" is not a valid request.</detail>
<detail>No such book: non-existing-book</detail>
)" },
{ /* url */ "/ROOT/search?format=xml&content=non-existing-book&pattern=a\"<script foo>",
/* HTTP status code */ 400,
/* expected response XML */ R"(
<error>Invalid request</error>
<detail>The requested URL "/ROOT/search?content=non-existing-book&format=xml&pattern=a"&lt;script foo&gt;" is not a valid request.</detail>
<detail>The requested URL "/ROOT/search?format=xml&content=non-existing-book&pattern=a"&lt;script foo&gt;" is not a valid request.</detail>
<detail>No such book: non-existing-book</detail>
)" },
// There is a flaw in our way to handle query string, we cannot differenciate
@ -919,7 +919,7 @@ TEST_F(ServerTest, HttpXmlError)
/* HTTP status code */ 400,
/* expected response XML */ R"(
<error>Invalid request</error>
<detail>The requested URL "/ROOT/search?books.filter.lang=eng&format=xml&pattern=" is not a valid request.</detail>
<detail>The requested URL "/ROOT/search?format=xml&books.filter.lang=eng&pattern" is not a valid request.</detail>
<detail>No query provided.</detail>
)" },
{ /* url */ "/ROOT/search?format=xml&pattern=foo",

View File

@ -6,6 +6,7 @@
#define SERVER_PORT 8101
#include "server_testing_tools.h"
std::string makeSearchResultsHtml(const std::string& pattern,
const std::string& header,
const std::string& results,
@ -555,7 +556,7 @@ const std::vector<SearchResult> LARGE_SEARCH_RESULTS = {
//
// In order to be able to share the same expected output data
// LARGE_SEARCH_RESULTS between multiple build platforms and test-points
// of the ServerTest.searchResults test-case
// of the ServerSearchTest.searchResults test-case
//
// 1. Snippets are excluded from the plain-text comparison of actual and
// expected HTML strings. This is done with the help of the
@ -916,7 +917,7 @@ struct TestData
}
};
TEST_F(ServerTest, searchResults)
TEST(ServerSearchTest, searchResults)
{
const TestData testData[] = {
{
@ -1340,14 +1341,12 @@ TEST_F(ServerTest, searchResults)
/* pagination */ {}
},
// Only RayCharles is in English.
// [TODO] We should extend our test data to have another zim file in english returning results.
{
/* query */ "pattern=travel"
"&books.filter.lang=eng",
/* start */ 0,
/* resultsPerPage */ 10,
/* totalResultCount */ 1,
/* totalResultCount */ 2,
/* firstResultIndex */ 1,
/* results */ {
SEARCH_RESULT(
@ -1357,6 +1356,14 @@ TEST_F(ServerTest, searchResults)
/*bookTitle*/ "Ray Charles",
/*wordCount*/ "204"
),
SEARCH_RESULT(
/*link*/ "/ROOT/content/example/Wikibooks.html",
/*title*/ "Wikibooks",
/*snippet*/ R"SNIPPET(...<b>Travel</b> guide Wikidata Knowledge database Commons Media repository Meta Coordination MediaWiki MediaWiki software Phabricator MediaWiki bug tracker Wikimedia Labs MediaWiki development The Wikimedia Foundation is a non-profit organization that depends on your voluntarism and donations to operate. If you find Wikibooks or other projects hosted by the Wikimedia Foundation useful, please volunteer or make a donation. Your donations primarily helps to purchase server equipment, launch new projects......)SNIPPET",
/*bookTitle*/ "Wikibooks",
/*wordCount*/ "538"
)
},
/* pagination */ {}
},
@ -1451,15 +1458,86 @@ TEST_F(ServerTest, searchResults)
},
};
ZimFileServer zfs(SERVER_PORT, ZimFileServer::DEFAULT_OPTIONS,
"./test/lib_for_server_search_test.xml");
for ( const auto& t : testData ) {
const std::string htmlSearchUrl = t.url();
const auto htmlRes = zfs1_->GET(htmlSearchUrl.c_str());
const auto htmlRes = zfs.GET(htmlSearchUrl.c_str());
EXPECT_EQ(htmlRes->status, 200);
t.checkHtml(htmlRes->body);
const std::string xmlSearchUrl = t.xmlSearchUrl();
const auto xmlRes = zfs1_->GET(xmlSearchUrl.c_str());
const auto xmlRes = zfs.GET(xmlSearchUrl.c_str());
EXPECT_EQ(xmlRes->status, 200);
t.checkXml(xmlRes->body);
}
}
std::string expectedConfusionOfTonguesErrorHtml(std::string url)
{
return R"(<!DOCTYPE html>
<html xmlns="http://www.w3.org/1999/xhtml">
<head>
<meta content="text/html;charset=UTF-8" http-equiv="content-type" />
<title>Invalid request</title>
</head>
<body>
<h1>Invalid request</h1>
<p>
The requested URL ")" + url + R"(" is not a valid request.
</p>
<p>
Two or more books in different languages would participate in search, which may lead to confusing results.
</p>
</body>
</html>
)";
}
std::string expectedConfusionOfTonguesErrorXml(std::string url)
{
return R"(<?xml version="1.0" encoding="UTF-8">
<error>Invalid request</error>
<detail>The requested URL ")" + url + R"(" is not a valid request.</detail>
<detail>Two or more books in different languages would participate in search, which may lead to confusing results.</detail>
)";
}
TEST(ServerSearchTest, searchInMultilanguageBookSetIsDenied)
{
const std::string testQueries[] = {
"pattern=towerofbabel",
"pattern=babylon&books.filter.maxsize=1000000",
"pattern=baby&books.id=" RAYCHARLESZIMID "&books.id=" EXAMPLEZIMID,
};
// The default limit on the number of books in a multi-zim search is 3
const ZimFileServer::FilePathCollection ZIMFILES{
"./test/zimfile.zim", // eng
"./test/example.zim", // en
"./test/corner_cases.zim" // =en
};
ZimFileServer zfs(SERVER_PORT, ZimFileServer::DEFAULT_OPTIONS, ZIMFILES);
for ( const auto& q : testQueries ) {
{
// HTML mode
const std::string url = "/ROOT/search?" + q;
const auto r = zfs.GET(url.c_str());
const TestContext ctx{ {"url", url} };
EXPECT_EQ(r->status, 400) << ctx;
EXPECT_EQ(r->body, expectedConfusionOfTonguesErrorHtml(url)) << ctx;
}
{
// XML mode
const std::string url = "/ROOT/search?" + q + "&format=xml";
const auto r = zfs.GET(url.c_str());
const TestContext ctx{ {"url", url} };
EXPECT_EQ(r->status, 400) << ctx;
EXPECT_EQ(r->body, expectedConfusionOfTonguesErrorXml(url)) << ctx;
}
}
}