Merge pull request #724 from kiwix/search_improvement

This commit is contained in:
Matthieu Gautier 2022-03-29 14:42:24 +02:00 committed by GitHub
commit 95d4dd63ac
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
8 changed files with 325 additions and 118 deletions

View File

@ -109,6 +109,54 @@ unsigned int getCacheLength(const char* name, unsigned int defaultVal) {
} }
} // unnamed namespace } // unnamed namespace
SearchInfo::SearchInfo(const std::string& pattern)
: pattern(pattern),
geoQuery()
{}
SearchInfo::SearchInfo(const std::string& pattern, GeoQuery geoQuery)
: pattern(pattern),
geoQuery(geoQuery)
{}
SearchInfo::SearchInfo(const RequestContext& request)
: pattern(request.get_optional_param<std::string>("pattern", "")),
geoQuery(),
bookName(request.get_optional_param<std::string>("content", ""))
{
/* Retrive geo search */
try {
auto latitude = request.get_argument<float>("latitude");
auto longitude = request.get_argument<float>("longitude");
auto distance = request.get_argument<float>("distance");
geoQuery = GeoQuery(latitude, longitude, distance);
} catch(const std::out_of_range&) {}
catch(const std::invalid_argument&) {}
if (!geoQuery && pattern.empty()) {
throw std::invalid_argument("No query provided.");
}
}
zim::Query SearchInfo::getZimQuery(bool verbose) const {
zim::Query query;
if (verbose) {
std::cout << "Performing query '" << pattern<< "'";
}
query.setQuery(pattern);
if (geoQuery) {
if (verbose) {
std::cout << " with geo query '" << geoQuery.distance << "&(" << geoQuery.latitude << ";" << geoQuery.longitude << ")'";
}
query.setGeorange(geoQuery.latitude, geoQuery.longitude, geoQuery.distance);
}
if (verbose) {
std::cout << std::endl;
}
return query;
}
static IdNameMapper defaultNameMapper; static IdNameMapper defaultNameMapper;
static MHD_Result staticHandlerCallback(void* cls, static MHD_Result staticHandlerCallback(void* cls,
@ -499,111 +547,86 @@ std::unique_ptr<Response> InternalServer::handle_search(const RequestContext& re
printf("** running handle_search\n"); printf("** running handle_search\n");
} }
std::string patternString;
try { try {
patternString = request.get_argument("pattern"); auto searchInfo = SearchInfo(request);
} catch (const std::out_of_range&) {}
/* Retrive geo search */ std::string bookId;
bool has_geo_query = false; std::shared_ptr<zim::Archive> archive;
float latitude = 0; if (!searchInfo.bookName.empty()) {
float longitude = 0; try {
float distance = 0; bookId = mp_nameMapper->getIdForName(searchInfo.bookName);
try { archive = mp_library->getArchiveById(bookId);
latitude = request.get_argument<float>("latitude"); } catch (const std::out_of_range&) {
longitude = request.get_argument<float>("longitude"); throw std::invalid_argument("The requested book doesn't exist.");
distance = request.get_argument<float>("distance");
has_geo_query = true;
} catch(const std::out_of_range&) {}
catch(const std::invalid_argument&) {}
std::string bookName, bookId;
std::shared_ptr<zim::Archive> archive;
try {
bookName = request.get_argument("content");
bookId = mp_nameMapper->getIdForName(bookName);
archive = mp_library->getArchiveById(bookId);
} catch (const std::out_of_range&) {}
/* Make the search */
if ( (!archive && !bookName.empty())
|| (patternString.empty() && ! has_geo_query) ) {
auto data = get_default_data();
data.set("pattern", encodeDiples(patternString));
data.set("root", m_root);
auto response = ContentResponse::build(*this, RESOURCE::templates::no_search_result_html, data, "text/html; charset=utf-8");
response->set_code(MHD_HTTP_NOT_FOUND);
return withTaskbarInfo(bookName, archive.get(), std::move(response));
}
std::shared_ptr<zim::Searcher> searcher;
if (archive) {
searcher = searcherCache.getOrPut(bookId, [=](){ return std::make_shared<zim::Searcher>(*archive);});
} else {
for (auto& bookId: mp_library->filter(kiwix::Filter().local(true).valid(true))) {
auto currentArchive = mp_library->getArchiveById(bookId);
if (currentArchive) {
if (! searcher) {
searcher = std::make_shared<zim::Searcher>(*currentArchive);
} else {
searcher->addArchive(*currentArchive);
}
} }
} }
}
auto start = 0;
try {
start = request.get_argument<unsigned int>("start");
} catch (const std::exception&) {}
auto pageLength = 25;
try {
pageLength = request.get_argument<unsigned int>("pageLength");
} catch (const std::exception&) {}
if (pageLength > MAX_SEARCH_LEN) {
pageLength = MAX_SEARCH_LEN;
}
if (pageLength == 0) {
pageLength = 25;
}
/* Get the results */
std::string queryString;
try {
zim::Query query;
if (patternString.empty()) {
// Execute geo-search
if (m_verbose.load()) {
cout << "Performing geo query `" << distance << "&(" << latitude << ";" << longitude << ")'" << endl;
}
query.setQuery("");
queryString = "GEO:" + to_string(latitude) + to_string(longitude) + to_string(distance);
query.setGeorange(latitude, longitude, distance);
} else {
// Execute Ft search
if (m_verbose.load()) {
cout << "Performing query `" << patternString << "'" << endl;
}
queryString = "FT:" + removeAccents(patternString);
query.setQuery(queryString);
}
queryString = bookId + queryString;
/* Make the search */
// Try to get a search from the searchInfo, else build it
std::shared_ptr<zim::Search> search; std::shared_ptr<zim::Search> search;
search = searchCache.getOrPut(queryString, [=](){ return make_shared<zim::Search>(searcher->search(query));}); try {
search = searchCache.getOrPut(searchInfo,
[=](){
std::shared_ptr<zim::Searcher> searcher;
if (archive) {
searcher = searcherCache.getOrPut(bookId, [=](){ return std::make_shared<zim::Searcher>(*archive);});
} else {
for (auto& bookId: mp_library->filter(kiwix::Filter().local(true).valid(true))) {
auto currentArchive = mp_library->getArchiveById(bookId);
if (currentArchive) {
if (! searcher) {
searcher = std::make_shared<zim::Searcher>(*currentArchive);
} else {
searcher->addArchive(*currentArchive);
}
}
}
}
return make_shared<zim::Search>(searcher->search(searchInfo.getZimQuery(m_verbose.load())));
}
);
} catch(std::runtime_error& e) {
// Searcher->search will throw a runtime error if there is no valid xapian database to do the search.
// (in case of zim file not containing a index)
auto data = get_default_data();
data.set("pattern", encodeDiples(searchInfo.pattern));
auto response = ContentResponse::build(*this, RESOURCE::templates::no_search_result_html, data, "text/html; charset=utf-8");
response->set_code(MHD_HTTP_NOT_FOUND);
return withTaskbarInfo(searchInfo.bookName, archive.get(), std::move(response));
}
auto start = 0;
try {
start = request.get_argument<unsigned int>("start");
} catch (const std::exception&) {}
auto pageLength = 25;
try {
pageLength = request.get_argument<unsigned int>("pageLength");
} catch (const std::exception&) {}
if (pageLength > MAX_SEARCH_LEN) {
pageLength = MAX_SEARCH_LEN;
}
if (pageLength == 0) {
pageLength = 25;
}
/* Get the results */
SearchRenderer renderer(search->getResults(start, pageLength), mp_nameMapper, mp_library, start, SearchRenderer renderer(search->getResults(start, pageLength), mp_nameMapper, mp_library, start,
search->getEstimatedMatches()); search->getEstimatedMatches());
renderer.setSearchPattern(patternString); renderer.setSearchPattern(searchInfo.pattern);
renderer.setSearchContent(bookName); renderer.setSearchContent(searchInfo.bookName);
renderer.setProtocolPrefix(m_root + "/"); renderer.setProtocolPrefix(m_root + "/");
renderer.setSearchProtocolPrefix(m_root + "/search?"); renderer.setSearchProtocolPrefix(m_root + "/search?");
renderer.setPageLength(pageLength); renderer.setPageLength(pageLength);
auto response = ContentResponse::build(*this, renderer.getHtml(), "text/html; charset=utf-8"); auto response = ContentResponse::build(*this, renderer.getHtml(), "text/html; charset=utf-8");
return withTaskbarInfo(bookName, archive.get(), std::move(response)); return withTaskbarInfo(searchInfo.bookName, archive.get(), std::move(response));
} catch (const std::invalid_argument& e) {
return HTTP400HtmlResponse(*this, request)
+ invalidUrlMsg
+ std::string(e.what());
} catch (const std::exception& e) { } catch (const std::exception& e) {
std::cerr << e.what() << std::endl; std::cerr << e.what() << std::endl;
return Response::build_500(*this, e.what()); return Response::build_500(*this, e.what());

View File

@ -43,9 +43,53 @@ extern "C" {
namespace kiwix { namespace kiwix {
struct GeoQuery {
GeoQuery()
: GeoQuery(0, 0, -1)
{}
GeoQuery(float latitude, float longitude, float distance)
: latitude(latitude), longitude(longitude), distance(distance)
{}
float latitude;
float longitude;
float distance;
explicit operator bool() const {
return distance >= 0;
}
friend bool operator<(const GeoQuery& l, const GeoQuery& r)
{
return std::tie(l.latitude, l.longitude, l.distance)
< std::tie(r.latitude, r.longitude, r.distance); // keep the same order
}
};
class SearchInfo {
public:
SearchInfo(const std::string& pattern);
SearchInfo(const std::string& pattern, GeoQuery geoQuery);
SearchInfo(const RequestContext& request);
zim::Query getZimQuery(bool verbose) const;
friend bool operator<(const SearchInfo& l, const SearchInfo& r)
{
return std::tie(l.bookName, l.pattern, l.geoQuery)
< std::tie(r.bookName, r.pattern, r.geoQuery); // keep the same order
}
public: //data
std::string pattern;
GeoQuery geoQuery;
std::string bookName;
};
typedef kainjow::mustache::data MustacheData; typedef kainjow::mustache::data MustacheData;
typedef ConcurrentCache<string, std::shared_ptr<zim::Searcher>> SearcherCache; typedef ConcurrentCache<string, std::shared_ptr<zim::Searcher>> SearcherCache;
typedef ConcurrentCache<string, std::shared_ptr<zim::Search>> SearchCache; typedef ConcurrentCache<SearchInfo, std::shared_ptr<zim::Search>> SearchCache;
typedef ConcurrentCache<string, std::shared_ptr<zim::SuggestionSearcher>> SuggestionSearcherCache; typedef ConcurrentCache<string, std::shared_ptr<zim::SuggestionSearcher>> SuggestionSearcherCache;
class Entry; class Entry;

View File

@ -110,6 +110,7 @@ std::unique_ptr<ContentResponse> Response::build_404(const InternalServer& serve
} }
extern const UrlNotFoundMsg urlNotFoundMsg; extern const UrlNotFoundMsg urlNotFoundMsg;
extern const InvalidUrlMsg invalidUrlMsg;
std::unique_ptr<ContentResponse> ContentResponseBlueprint::generateResponseObject() const std::unique_ptr<ContentResponse> ContentResponseBlueprint::generateResponseObject() const
{ {
@ -145,6 +146,36 @@ HTTP404HtmlResponse& HTTP404HtmlResponse::operator+(const std::string& msg)
return *this; return *this;
} }
HTTP400HtmlResponse::HTTP400HtmlResponse(const InternalServer& server,
const RequestContext& request)
: ContentResponseBlueprint(&server,
&request,
MHD_HTTP_BAD_REQUEST,
"text/html",
RESOURCE::templates::_400_html)
{
kainjow::mustache::list emptyList;
this->m_data = kainjow::mustache::object{{"details", emptyList}};
}
HTTP400HtmlResponse& HTTP400HtmlResponse::operator+(InvalidUrlMsg /*unused*/)
{
std::string requestUrl = m_request.get_full_url();
const auto query = m_request.get_query();
if (!query.empty()) {
requestUrl += "?" + encodeDiples(query);
}
kainjow::mustache::mustache msgTmpl(R"(The requested URL "{{{url}}}" is not a valid request.)");
return *this + msgTmpl.render({"url", requestUrl});
}
HTTP400HtmlResponse& HTTP400HtmlResponse::operator+(const std::string& msg)
{
m_data["details"].push_back({"p", msg});
return *this;
}
ContentResponseBlueprint& ContentResponseBlueprint::operator+(const TaskbarInfo& taskbarInfo) ContentResponseBlueprint& ContentResponseBlueprint::operator+(const TaskbarInfo& taskbarInfo)
{ {
this->m_taskbarInfo.reset(new TaskbarInfo(taskbarInfo)); this->m_taskbarInfo.reset(new TaskbarInfo(taskbarInfo));

View File

@ -207,6 +207,20 @@ struct HTTP404HtmlResponse : ContentResponseBlueprint
HTTP404HtmlResponse& operator+(const std::string& errorDetails); HTTP404HtmlResponse& operator+(const std::string& errorDetails);
}; };
class InvalidUrlMsg {};
extern const InvalidUrlMsg invalidUrlMsg;
struct HTTP400HtmlResponse : ContentResponseBlueprint
{
HTTP400HtmlResponse(const InternalServer& server,
const RequestContext& request);
using ContentResponseBlueprint::operator+;
HTTP400HtmlResponse& operator+(InvalidUrlMsg /*unused*/);
HTTP400HtmlResponse& operator+(const std::string& errorDetails);
};
class ItemResponse : public Response { class ItemResponse : public Response {
public: public:
ItemResponse(bool verbose, const zim::Item& item, const std::string& mimetype, const ByteRange& byterange); ItemResponse(bool verbose, const zim::Item& item, const std::string& mimetype, const ByteRange& byterange);

View File

@ -35,6 +35,7 @@ skin/block_external.js
skin/search_results.css skin/search_results.css
templates/search_result.html templates/search_result.html
templates/no_search_result.html templates/no_search_result.html
templates/400.html
templates/404.html templates/404.html
templates/500.html templates/500.html
templates/index.html templates/index.html

15
static/templates/400.html Normal file
View File

@ -0,0 +1,15 @@
<!DOCTYPE html>
<html xmlns="http://www.w3.org/1999/xhtml">
<head>
<meta content="text/html;charset=UTF-8" http-equiv="content-type" />
<title>Invalid request</title>
</head>
<body>
<h1>Invalid request</h1>
{{#details}}
<p>
{{{p}}}
</p>
{{/details}}
</body>
</html>

View File

@ -107,11 +107,11 @@
</b> of <b> </b> of <b>
{{count}} {{count}}
</b> for <b> </b> for <b>
{{searchPattern}} "{{{searchPattern}}}"
</b> </b>
{{/hasResults}} {{/hasResults}}
{{^hasResults}} {{^hasResults}}
No results were found for <b>{{searchPattern}}</b> No results were found for <b>"{{{searchPattern}}}"</b>
{{/hasResults}} {{/hasResults}}
</div> </div>

View File

@ -287,6 +287,22 @@ TEST_F(ServerTest, UncompressibleContentIsNotCompressed)
} }
} }
const char* urls400[] = {
"/ROOT/search",
"/ROOT/search?content=zimfile",
"/ROOT/search?content=non-existing-book&pattern=asdfqwerty",
"/ROOT/search?content=non-existing-book&pattern=asd<qwerty",
"/ROOT/search?pattern"
};
TEST_F(ServerTest, 400)
{
for (const char* url: urls400 ) {
EXPECT_EQ(400, zfs1_->GET(url)->status) << "url: " << url;
}
}
const char* urls404[] = { const char* urls404[] = {
"/", "/",
"/zimfile", "/zimfile",
@ -302,8 +318,6 @@ const char* urls404[] = {
"/ROOT/meta?content=non-existent-book&name=title", "/ROOT/meta?content=non-existent-book&name=title",
"/ROOT/random", "/ROOT/random",
"/ROOT/random?content=non-existent-book", "/ROOT/random?content=non-existent-book",
"/ROOT/search",
"/ROOT/search?content=non-existing-book&pattern=asdfqwerty",
"/ROOT/suggest", "/ROOT/suggest",
"/ROOT/suggest?content=non-existent-book&term=abcd", "/ROOT/suggest?content=non-existent-book&term=abcd",
"/ROOT/catch/external", "/ROOT/catch/external",
@ -319,8 +333,9 @@ const char* urls404[] = {
TEST_F(ServerTest, 404) TEST_F(ServerTest, 404)
{ {
for ( const char* url : urls404 ) for ( const char* url : urls404 ) {
EXPECT_EQ(404, zfs1_->GET(url)->status) << "url: " << url; EXPECT_EQ(404, zfs1_->GET(url)->status) << "url: " << url;
}
} }
namespace TestingOfHtmlResponses namespace TestingOfHtmlResponses
@ -388,13 +403,14 @@ public:
: ExpectedResponseData(erd) : ExpectedResponseData(erd)
, url(url) , url(url)
{} {}
virtual ~TestContentIn404HtmlResponse() = default;
const std::string url; const std::string url;
std::string expectedResponse() const; std::string expectedResponse() const;
private: private:
std::string pageTitle() const; virtual std::string pageTitle() const;
std::string pageCssLink() const; std::string pageCssLink() const;
std::string hiddenBookNameInput() const; std::string hiddenBookNameInput() const;
std::string searchPatternInput() const; std::string searchPatternInput() const;
@ -521,6 +537,25 @@ std::string TestContentIn404HtmlResponse::taskbarLinks() const
+ R"("><button>&#x1F3B2;</button></a>)"; + R"("><button>&#x1F3B2;</button></a>)";
} }
class TestContentIn400HtmlResponse : public TestContentIn404HtmlResponse
{
public:
TestContentIn400HtmlResponse(const std::string& url,
const ExpectedResponseData& erd)
: TestContentIn404HtmlResponse(url, erd)
{}
private:
std::string pageTitle() const;
};
std::string TestContentIn400HtmlResponse::pageTitle() const {
return expectedPageTitle.empty()
? "Invalid request"
: expectedPageTitle;
}
} // namespace TestingOfHtmlResponses } // namespace TestingOfHtmlResponses
TEST_F(ServerTest, 404WithBodyTesting) TEST_F(ServerTest, 404WithBodyTesting)
@ -650,28 +685,70 @@ TEST_F(ServerTest, 404WithBodyTesting)
Cannot find content entry invalid-article Cannot find content entry invalid-article
</p> </p>
)" }, )" },
};
{ /* url */ "/ROOT/search?content=zimfile", for ( const auto& t : testData ) {
expected_page_title=="Fulltext search unavailable" && const TestContext ctx{ {"url", t.url} };
expected_css_url=="/ROOT/skin/search_results.css" && const auto r = zfs1_->GET(t.url.c_str());
book_name=="zimfile" && EXPECT_EQ(r->status, 404) << ctx;
book_title=="Ray Charles" && EXPECT_EQ(r->body, t.expectedResponse()) << ctx;
expected_body==R"( }
<div class="header">Not found</div> }
TEST_F(ServerTest, 400WithBodyTesting)
{
using namespace TestingOfHtmlResponses;
const std::vector<TestContentIn400HtmlResponse> testData{
{ /* url */ "/ROOT/search",
expected_body== R"(
<h1>Invalid request</h1>
<p> <p>
There is no article with the title <b> ""</b> The requested URL "/ROOT/search" is not a valid request.
and the fulltext search engine is not available for this content. </p>
<p>
No query provided.
</p> </p>
)" }, )" },
{ /* url */ "/ROOT/search?content=zimfile",
{ /* url */ "/ROOT/search?content=non-existent-book&pattern=asdfqwerty",
expected_page_title=="Fulltext search unavailable" &&
expected_css_url=="/ROOT/skin/search_results.css" &&
expected_body==R"( expected_body==R"(
<div class="header">Not found</div> <h1>Invalid request</h1>
<p> <p>
There is no article with the title <b> "asdfqwerty"</b> The requested URL "/ROOT/search?content=zimfile" is not a valid request.
and the fulltext search engine is not available for this content. </p>
<p>
No query provided.
</p>
)" },
{ /* url */ "/ROOT/search?content=non-existing-book&pattern=asdfqwerty",
expected_body==R"(
<h1>Invalid request</h1>
<p>
The requested URL "/ROOT/search?content=non-existing-book&pattern=asdfqwerty" is not a valid request.
</p>
<p>
The requested book doesn't exist.
</p>
)" },
{ /* url */ "/ROOT/search?content=non-existing-book&pattern=a\"<script foo>",
expected_body==R"(
<h1>Invalid request</h1>
<p>
The requested URL "/ROOT/search?content=non-existing-book&pattern=a"&lt;script foo&gt;" is not a valid request.
</p>
<p>
The requested book doesn't exist.
</p>
)" },
// There is a flaw in our way to handle query string, we cannot differenciate
// between `pattern` and `pattern=`
{ /* url */ "/ROOT/search?pattern",
expected_body==R"(
<h1>Invalid request</h1>
<p>
The requested URL "/ROOT/search?pattern=" is not a valid request.
</p>
<p>
No query provided.
</p> </p>
)" }, )" },
}; };
@ -679,7 +756,7 @@ TEST_F(ServerTest, 404WithBodyTesting)
for ( const auto& t : testData ) { for ( const auto& t : testData ) {
const TestContext ctx{ {"url", t.url} }; const TestContext ctx{ {"url", t.url} };
const auto r = zfs1_->GET(t.url.c_str()); const auto r = zfs1_->GET(t.url.c_str());
EXPECT_EQ(r->status, 404) << ctx; EXPECT_EQ(r->status, 400) << ctx;
EXPECT_EQ(r->body, t.expectedResponse()) << ctx; EXPECT_EQ(r->body, t.expectedResponse()) << ctx;
} }
} }
@ -728,14 +805,16 @@ TEST_F(ServerTest, RawEntry)
TEST_F(ServerTest, HeadMethodIsSupported) TEST_F(ServerTest, HeadMethodIsSupported)
{ {
for ( const Resource& res : all200Resources() ) for ( const Resource& res : all200Resources() ) {
EXPECT_EQ(200, zfs1_->HEAD(res.url)->status) << res; EXPECT_EQ(200, zfs1_->HEAD(res.url)->status) << res;
}
} }
TEST_F(ServerTest, TheResponseToHeadRequestHasNoBody) TEST_F(ServerTest, TheResponseToHeadRequestHasNoBody)
{ {
for ( const Resource& res : all200Resources() ) for ( const Resource& res : all200Resources() ) {
EXPECT_TRUE(zfs1_->HEAD(res.url)->body.empty()) << res; EXPECT_TRUE(zfs1_->HEAD(res.url)->body.empty()) << res;
}
} }
TEST_F(ServerTest, HeadersAreTheSameInResponsesToHeadAndGetRequests) TEST_F(ServerTest, HeadersAreTheSameInResponsesToHeadAndGetRequests)