mirror of https://github.com/kiwix/libkiwix.git
Merge pull request #838 from kiwix/language_handling_during_search
This commit is contained in:
commit
a52138e5ba
|
@ -66,7 +66,7 @@ bool Book::update(const kiwix::Book& other)
|
|||
void Book::update(const zim::Archive& archive) {
|
||||
m_path = archive.getFilename();
|
||||
m_pathValid = true;
|
||||
m_id = getArchiveId(archive);
|
||||
m_id = std::string(archive.getUuid());
|
||||
m_title = getArchiveTitle(archive);
|
||||
m_description = getMetaDescription(archive);
|
||||
m_language = getMetaLanguage(archive);
|
||||
|
|
|
@ -77,7 +77,6 @@ extern "C" {
|
|||
#include "request_context.h"
|
||||
#include "response.h"
|
||||
|
||||
#define MAX_SEARCH_LEN 140
|
||||
#define DEFAULT_CACHE_SIZE 2
|
||||
|
||||
namespace kiwix {
|
||||
|
@ -212,6 +211,16 @@ void checkBookNumber(const Library::BookIdSet& bookIds, size_t limit) {
|
|||
}
|
||||
}
|
||||
|
||||
typedef std::set<std::string> Languages;
|
||||
|
||||
Languages getLanguages(const Library& lib, const Library::BookIdSet& bookIds) {
|
||||
Languages langs;
|
||||
for ( const auto& b : bookIds ) {
|
||||
langs.insert(lib.getBookById(b).getLanguage());
|
||||
}
|
||||
return langs;
|
||||
}
|
||||
|
||||
struct CustomizedResourceData
|
||||
{
|
||||
std::string mimeType;
|
||||
|
@ -307,6 +316,10 @@ SearchInfo InternalServer::getSearchInfo(const RequestContext& request) const
|
|||
{
|
||||
auto bookIds = selectBooks(request);
|
||||
checkBookNumber(bookIds.second, m_multizimSearchLimit);
|
||||
if ( getLanguages(*mp_library, bookIds.second).size() != 1 ) {
|
||||
throw Error(nonParameterizedMessage("confusion-of-tongues"));
|
||||
}
|
||||
|
||||
auto pattern = request.get_optional_param<std::string>("pattern", "");
|
||||
GeoQuery geoQuery;
|
||||
|
||||
|
@ -813,6 +826,32 @@ std::unique_ptr<Response> InternalServer::handle_search(const RequestContext& re
|
|||
}
|
||||
|
||||
try {
|
||||
return handle_search_request(request);
|
||||
} catch (const Error& e) {
|
||||
return HTTP400Response(*this, request)
|
||||
+ invalidUrlMsg
|
||||
+ e.message();
|
||||
}
|
||||
}
|
||||
|
||||
namespace
|
||||
{
|
||||
|
||||
unsigned getSearchPageSize(const RequestContext& r)
|
||||
{
|
||||
const auto DEFAULT_PAGE_LEN = 25u;
|
||||
const auto MAX_PAGE_LEN = 140u;
|
||||
|
||||
const auto pageLength = r.get_optional_param("pageLength", DEFAULT_PAGE_LEN);
|
||||
return pageLength == 0
|
||||
? DEFAULT_PAGE_LEN
|
||||
: min(MAX_PAGE_LEN, pageLength);
|
||||
}
|
||||
|
||||
} // unnamed namespace
|
||||
|
||||
std::unique_ptr<Response> InternalServer::handle_search_request(const RequestContext& request)
|
||||
{
|
||||
auto searchInfo = getSearchInfo(request);
|
||||
auto bookIds = searchInfo.getBookIds();
|
||||
|
||||
|
@ -849,22 +888,8 @@ std::unique_ptr<Response> InternalServer::handle_search(const RequestContext& re
|
|||
return response;
|
||||
}
|
||||
|
||||
auto start = 1;
|
||||
try {
|
||||
start = request.get_argument<unsigned int>("start");
|
||||
} catch (const std::exception&) {}
|
||||
start = max(1, start);
|
||||
|
||||
auto pageLength = 25;
|
||||
try {
|
||||
pageLength = request.get_argument<unsigned int>("pageLength");
|
||||
} catch (const std::exception&) {}
|
||||
if (pageLength > MAX_SEARCH_LEN) {
|
||||
pageLength = MAX_SEARCH_LEN;
|
||||
}
|
||||
if (pageLength == 0) {
|
||||
pageLength = 25;
|
||||
}
|
||||
const auto start = max(1u, request.get_optional_param("start", 1u));
|
||||
const auto pageLength = getSearchPageSize(request);
|
||||
|
||||
/* Get the results */
|
||||
SearchRenderer renderer(search->getResults(start-1, pageLength), mp_nameMapper, mp_library, start,
|
||||
|
@ -888,11 +913,6 @@ std::unique_ptr<Response> InternalServer::handle_search(const RequestContext& re
|
|||
}
|
||||
*/
|
||||
return std::move(response);
|
||||
} catch (const Error& e) {
|
||||
return HTTP400Response(*this, request)
|
||||
+ invalidUrlMsg
|
||||
+ e.message();
|
||||
}
|
||||
}
|
||||
|
||||
std::unique_ptr<Response> InternalServer::handle_random(const RequestContext& request)
|
||||
|
|
|
@ -134,6 +134,7 @@ class InternalServer {
|
|||
std::unique_ptr<Response> handle_catalog_v2_languages(const RequestContext& request);
|
||||
std::unique_ptr<Response> handle_catalog_v2_illustration(const RequestContext& request);
|
||||
std::unique_ptr<Response> handle_search(const RequestContext& request);
|
||||
std::unique_ptr<Response> handle_search_request(const RequestContext& request);
|
||||
std::unique_ptr<Response> handle_suggest(const RequestContext& request);
|
||||
std::unique_ptr<Response> handle_random(const RequestContext& request);
|
||||
std::unique_ptr<Response> handle_catch(const RequestContext& request);
|
||||
|
|
|
@ -107,6 +107,14 @@ MHD_Result RequestContext::fill_argument(void *__this, enum MHD_ValueKind kind,
|
|||
{
|
||||
RequestContext *_this = static_cast<RequestContext*>(__this);
|
||||
_this->arguments[key].push_back(value == nullptr ? "" : value);
|
||||
if ( ! _this->queryString.empty() ) {
|
||||
_this->queryString += "&";
|
||||
}
|
||||
_this->queryString += key;
|
||||
if ( value ) {
|
||||
_this->queryString += "=";
|
||||
_this->queryString += value;
|
||||
}
|
||||
return MHD_YES;
|
||||
}
|
||||
|
||||
|
|
|
@ -92,9 +92,7 @@ class RequestContext {
|
|||
std::string get_url_part(int part) const;
|
||||
std::string get_full_url() const;
|
||||
|
||||
std::string get_query(bool mustEncode = false) const {
|
||||
return get_query([](const std::string& key) {return true;}, mustEncode);
|
||||
}
|
||||
std::string get_query() const { return queryString; }
|
||||
|
||||
template<class F>
|
||||
std::string get_query(F filter, bool mustEncode) const {
|
||||
|
@ -132,6 +130,7 @@ class RequestContext {
|
|||
ByteRange byteRange_;
|
||||
std::map<std::string, std::string> headers;
|
||||
std::map<std::string, std::vector<std::string>> arguments;
|
||||
std::string queryString;
|
||||
|
||||
private: // functions
|
||||
static MHD_Result fill_header(void *, enum MHD_ValueKind, const char*, const char*);
|
||||
|
|
|
@ -93,10 +93,6 @@ std::string getMetaFlavour(const zim::Archive& archive) {
|
|||
return getMetadata(archive, "Flavour");
|
||||
}
|
||||
|
||||
std::string getArchiveId(const zim::Archive& archive) {
|
||||
return (std::string) archive.getUuid();
|
||||
}
|
||||
|
||||
bool getArchiveFavicon(const zim::Archive& archive, unsigned size,
|
||||
std::string& content, std::string& mimeType){
|
||||
try {
|
||||
|
|
|
@ -40,7 +40,6 @@ namespace kiwix
|
|||
std::string getMetaCreator(const zim::Archive& archive);
|
||||
std::string getMetaPublisher(const zim::Archive& archive);
|
||||
std::string getMetaFlavour(const zim::Archive& archive);
|
||||
std::string getArchiveId(const zim::Archive& archive);
|
||||
|
||||
bool getArchiveFavicon(const zim::Archive& archive, unsigned size,
|
||||
std::string& content, std::string& mimeType);
|
||||
|
|
|
@ -27,4 +27,5 @@
|
|||
, "home-button-text": "Go to the main page of '{{BOOK_TITLE}}'"
|
||||
, "random-page-button-text": "Go to a randomly selected page"
|
||||
, "searchbox-tooltip": "Search '{{BOOK_TITLE}}'"
|
||||
, "confusion-of-tongues": "Two or more books in different languages would participate in search, which may lead to confusing results."
|
||||
}
|
||||
|
|
|
@ -0,0 +1,4 @@
|
|||
<library version="20110515">
|
||||
<book id="5dc0b3af-5df2-0925-f0ca-d2bf75e78af6" path="example.zim" title="Wikibooks" description="testZim" language="eng" creator="test" publisher="test" tags="_ftindex:yes;_ftindex:yes;_pictures:yes;_videos:yes;_details:yes" date="2021-04-17" mediaCount="22" size="253" />
|
||||
<book id="6f1d19d0-633f-087b-fb55-7ac324ff9baf" path="zimfile.zim" title="Ray Charles" description="Wikipedia articles about Ray Charles" language="eng" creator="Wikipedia" publisher="Kiwix" name="wikipedia_en_ray_charles" flavour="_mini" tags="wikipedia;_category:wikipedia;_pictures:no;_videos:no;_details:no;_ftindex:yes" date="2020-03-31" articleCount="129" mediaCount="45" size="555" />
|
||||
</library>
|
|
@ -359,7 +359,7 @@ TEST_F(LibraryServerTest, catalog_search_results_pagination)
|
|||
EXPECT_EQ(maskVariableOPDSFeedData(r->body),
|
||||
OPDS_FEED_TAG
|
||||
" <id>12345678-90ab-cdef-1234-567890abcdef</id>\n"
|
||||
" <title>Filtered zims (count=1&start=1)</title>\n"
|
||||
" <title>Filtered zims (start=1&count=1)</title>\n"
|
||||
" <updated>YYYY-MM-DDThh:mm:ssZ</updated>\n"
|
||||
" <totalResults>3</totalResults>\n"
|
||||
" <startIndex>1</startIndex>\n"
|
||||
|
@ -375,7 +375,7 @@ TEST_F(LibraryServerTest, catalog_search_results_pagination)
|
|||
EXPECT_EQ(maskVariableOPDSFeedData(r->body),
|
||||
OPDS_FEED_TAG
|
||||
" <id>12345678-90ab-cdef-1234-567890abcdef</id>\n"
|
||||
" <title>Filtered zims (count=10&start=100)</title>\n"
|
||||
" <title>Filtered zims (start=100&count=10)</title>\n"
|
||||
" <updated>YYYY-MM-DDThh:mm:ssZ</updated>\n"
|
||||
" <totalResults>3</totalResults>\n"
|
||||
" <startIndex>100</startIndex>\n"
|
||||
|
@ -638,8 +638,8 @@ TEST_F(LibraryServerTest, catalog_v2_entries_filtered_by_range)
|
|||
const auto r = zfs1_->GET("/ROOT/catalog/v2/entries?start=1&count=1");
|
||||
EXPECT_EQ(r->status, 200);
|
||||
EXPECT_EQ(maskVariableOPDSFeedData(r->body),
|
||||
CATALOG_V2_ENTRIES_PREAMBLE("?count=1&start=1")
|
||||
" <title>Filtered Entries (count=1&start=1)</title>\n"
|
||||
CATALOG_V2_ENTRIES_PREAMBLE("?start=1&count=1")
|
||||
" <title>Filtered Entries (start=1&count=1)</title>\n"
|
||||
" <updated>YYYY-MM-DDThh:mm:ssZ</updated>\n"
|
||||
" <totalResults>3</totalResults>\n"
|
||||
" <startIndex>1</startIndex>\n"
|
||||
|
|
|
@ -37,6 +37,7 @@ if gtest_dep.found() and not meson.is_cross_build()
|
|||
'corner_cases.zim',
|
||||
'poor.zim',
|
||||
'library.xml',
|
||||
'lib_for_server_search_test.xml',
|
||||
'customized_resources.txt',
|
||||
'helloworld.txt',
|
||||
'welcome.html',
|
||||
|
|
|
@ -839,7 +839,7 @@ TEST_F(ServerTest, Http400HtmlError)
|
|||
expected_body==R"(
|
||||
<h1>Invalid request</h1>
|
||||
<p>
|
||||
The requested URL "/ROOT/search?books.filter.lang=eng&pattern=" is not a valid request.
|
||||
The requested URL "/ROOT/search?books.filter.lang=eng&pattern" is not a valid request.
|
||||
</p>
|
||||
<p>
|
||||
No query provided.
|
||||
|
@ -896,21 +896,21 @@ TEST_F(ServerTest, HttpXmlError)
|
|||
/* HTTP status code */ 400,
|
||||
/* expected response XML */ R"(
|
||||
<error>Invalid request</error>
|
||||
<detail>The requested URL "/ROOT/search?content=zimfile&format=xml" is not a valid request.</detail>
|
||||
<detail>The requested URL "/ROOT/search?format=xml&content=zimfile" is not a valid request.</detail>
|
||||
<detail>No query provided.</detail>
|
||||
)" },
|
||||
{ /* url */ "/ROOT/search?format=xml&content=non-existing-book&pattern=asdfqwerty",
|
||||
/* HTTP status code */ 400,
|
||||
/* expected response XML */ R"(
|
||||
<error>Invalid request</error>
|
||||
<detail>The requested URL "/ROOT/search?content=non-existing-book&format=xml&pattern=asdfqwerty" is not a valid request.</detail>
|
||||
<detail>The requested URL "/ROOT/search?format=xml&content=non-existing-book&pattern=asdfqwerty" is not a valid request.</detail>
|
||||
<detail>No such book: non-existing-book</detail>
|
||||
)" },
|
||||
{ /* url */ "/ROOT/search?format=xml&content=non-existing-book&pattern=a\"<script foo>",
|
||||
/* HTTP status code */ 400,
|
||||
/* expected response XML */ R"(
|
||||
<error>Invalid request</error>
|
||||
<detail>The requested URL "/ROOT/search?content=non-existing-book&format=xml&pattern=a"<script foo>" is not a valid request.</detail>
|
||||
<detail>The requested URL "/ROOT/search?format=xml&content=non-existing-book&pattern=a"<script foo>" is not a valid request.</detail>
|
||||
<detail>No such book: non-existing-book</detail>
|
||||
)" },
|
||||
// There is a flaw in our way to handle query string, we cannot differenciate
|
||||
|
@ -919,7 +919,7 @@ TEST_F(ServerTest, HttpXmlError)
|
|||
/* HTTP status code */ 400,
|
||||
/* expected response XML */ R"(
|
||||
<error>Invalid request</error>
|
||||
<detail>The requested URL "/ROOT/search?books.filter.lang=eng&format=xml&pattern=" is not a valid request.</detail>
|
||||
<detail>The requested URL "/ROOT/search?format=xml&books.filter.lang=eng&pattern" is not a valid request.</detail>
|
||||
<detail>No query provided.</detail>
|
||||
)" },
|
||||
{ /* url */ "/ROOT/search?format=xml&pattern=foo",
|
||||
|
|
|
@ -6,6 +6,7 @@
|
|||
#define SERVER_PORT 8101
|
||||
#include "server_testing_tools.h"
|
||||
|
||||
|
||||
std::string makeSearchResultsHtml(const std::string& pattern,
|
||||
const std::string& header,
|
||||
const std::string& results,
|
||||
|
@ -555,7 +556,7 @@ const std::vector<SearchResult> LARGE_SEARCH_RESULTS = {
|
|||
//
|
||||
// In order to be able to share the same expected output data
|
||||
// LARGE_SEARCH_RESULTS between multiple build platforms and test-points
|
||||
// of the ServerTest.searchResults test-case
|
||||
// of the ServerSearchTest.searchResults test-case
|
||||
//
|
||||
// 1. Snippets are excluded from the plain-text comparison of actual and
|
||||
// expected HTML strings. This is done with the help of the
|
||||
|
@ -916,7 +917,7 @@ struct TestData
|
|||
}
|
||||
};
|
||||
|
||||
TEST_F(ServerTest, searchResults)
|
||||
TEST(ServerSearchTest, searchResults)
|
||||
{
|
||||
const TestData testData[] = {
|
||||
{
|
||||
|
@ -1340,14 +1341,12 @@ TEST_F(ServerTest, searchResults)
|
|||
/* pagination */ {}
|
||||
},
|
||||
|
||||
// Only RayCharles is in English.
|
||||
// [TODO] We should extend our test data to have another zim file in english returning results.
|
||||
{
|
||||
/* query */ "pattern=travel"
|
||||
"&books.filter.lang=eng",
|
||||
/* start */ 0,
|
||||
/* resultsPerPage */ 10,
|
||||
/* totalResultCount */ 1,
|
||||
/* totalResultCount */ 2,
|
||||
/* firstResultIndex */ 1,
|
||||
/* results */ {
|
||||
SEARCH_RESULT(
|
||||
|
@ -1357,6 +1356,14 @@ TEST_F(ServerTest, searchResults)
|
|||
/*bookTitle*/ "Ray Charles",
|
||||
/*wordCount*/ "204"
|
||||
),
|
||||
|
||||
SEARCH_RESULT(
|
||||
/*link*/ "/ROOT/content/example/Wikibooks.html",
|
||||
/*title*/ "Wikibooks",
|
||||
/*snippet*/ R"SNIPPET(...<b>Travel</b> guide Wikidata Knowledge database Commons Media repository Meta Coordination MediaWiki MediaWiki software Phabricator MediaWiki bug tracker Wikimedia Labs MediaWiki development The Wikimedia Foundation is a non-profit organization that depends on your voluntarism and donations to operate. If you find Wikibooks or other projects hosted by the Wikimedia Foundation useful, please volunteer or make a donation. Your donations primarily helps to purchase server equipment, launch new projects......)SNIPPET",
|
||||
/*bookTitle*/ "Wikibooks",
|
||||
/*wordCount*/ "538"
|
||||
)
|
||||
},
|
||||
/* pagination */ {}
|
||||
},
|
||||
|
@ -1451,15 +1458,86 @@ TEST_F(ServerTest, searchResults)
|
|||
},
|
||||
};
|
||||
|
||||
ZimFileServer zfs(SERVER_PORT, ZimFileServer::DEFAULT_OPTIONS,
|
||||
"./test/lib_for_server_search_test.xml");
|
||||
|
||||
for ( const auto& t : testData ) {
|
||||
const std::string htmlSearchUrl = t.url();
|
||||
const auto htmlRes = zfs1_->GET(htmlSearchUrl.c_str());
|
||||
const auto htmlRes = zfs.GET(htmlSearchUrl.c_str());
|
||||
EXPECT_EQ(htmlRes->status, 200);
|
||||
t.checkHtml(htmlRes->body);
|
||||
|
||||
const std::string xmlSearchUrl = t.xmlSearchUrl();
|
||||
const auto xmlRes = zfs1_->GET(xmlSearchUrl.c_str());
|
||||
const auto xmlRes = zfs.GET(xmlSearchUrl.c_str());
|
||||
EXPECT_EQ(xmlRes->status, 200);
|
||||
t.checkXml(xmlRes->body);
|
||||
}
|
||||
}
|
||||
|
||||
std::string expectedConfusionOfTonguesErrorHtml(std::string url)
|
||||
{
|
||||
return R"(<!DOCTYPE html>
|
||||
<html xmlns="http://www.w3.org/1999/xhtml">
|
||||
<head>
|
||||
<meta content="text/html;charset=UTF-8" http-equiv="content-type" />
|
||||
<title>Invalid request</title>
|
||||
|
||||
</head>
|
||||
<body>
|
||||
<h1>Invalid request</h1>
|
||||
<p>
|
||||
The requested URL ")" + url + R"(" is not a valid request.
|
||||
</p>
|
||||
<p>
|
||||
Two or more books in different languages would participate in search, which may lead to confusing results.
|
||||
</p>
|
||||
</body>
|
||||
</html>
|
||||
)";
|
||||
}
|
||||
|
||||
std::string expectedConfusionOfTonguesErrorXml(std::string url)
|
||||
{
|
||||
return R"(<?xml version="1.0" encoding="UTF-8">
|
||||
<error>Invalid request</error>
|
||||
<detail>The requested URL ")" + url + R"(" is not a valid request.</detail>
|
||||
<detail>Two or more books in different languages would participate in search, which may lead to confusing results.</detail>
|
||||
)";
|
||||
}
|
||||
|
||||
TEST(ServerSearchTest, searchInMultilanguageBookSetIsDenied)
|
||||
{
|
||||
const std::string testQueries[] = {
|
||||
"pattern=towerofbabel",
|
||||
"pattern=babylon&books.filter.maxsize=1000000",
|
||||
"pattern=baby&books.id=" RAYCHARLESZIMID "&books.id=" EXAMPLEZIMID,
|
||||
};
|
||||
|
||||
// The default limit on the number of books in a multi-zim search is 3
|
||||
const ZimFileServer::FilePathCollection ZIMFILES{
|
||||
"./test/zimfile.zim", // eng
|
||||
"./test/example.zim", // en
|
||||
"./test/corner_cases.zim" // =en
|
||||
};
|
||||
|
||||
ZimFileServer zfs(SERVER_PORT, ZimFileServer::DEFAULT_OPTIONS, ZIMFILES);
|
||||
for ( const auto& q : testQueries ) {
|
||||
{
|
||||
// HTML mode
|
||||
const std::string url = "/ROOT/search?" + q;
|
||||
const auto r = zfs.GET(url.c_str());
|
||||
const TestContext ctx{ {"url", url} };
|
||||
EXPECT_EQ(r->status, 400) << ctx;
|
||||
EXPECT_EQ(r->body, expectedConfusionOfTonguesErrorHtml(url)) << ctx;
|
||||
}
|
||||
|
||||
{
|
||||
// XML mode
|
||||
const std::string url = "/ROOT/search?" + q + "&format=xml";
|
||||
const auto r = zfs.GET(url.c_str());
|
||||
const TestContext ctx{ {"url", url} };
|
||||
EXPECT_EQ(r->status, 400) << ctx;
|
||||
EXPECT_EQ(r->body, expectedConfusionOfTonguesErrorXml(url)) << ctx;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
|
Loading…
Reference in New Issue