mirror of https://github.com/kiwix/libkiwix.git
Validation of snippets in search results
The "expected" snippets in the test data must be a union of all possible snippets produced at runtime for a given (document, search terms) pair on all platforms of interest: - Overlapping snippets must be properly merged - Non-overlapping snippets can be joined with a " ... " in between.
parent 4521249452
commit eeca88573b
test/server.cpp (115 changed lines)
@@ -1523,14 +1523,73 @@ R"SEARCHRESULT(
 // expected HTML strings. This is done with the help of the
 // function maskSnippetsInSearchResults()
 //
-// 2. Snippets must be checked separately using a somewhat lenient comparison
-// method (TODO)
+// 2. Snippets are checked separately. If a plain-text comparison fails
+// then a weaker comparison is attempted. Currently it works by testing
+// that the actual snippet is a substring of the "expected" snippet
+// (the "..." omitted-text markers on the snippet boundaries are taken
+// into account). The implementation of that approach is via the
+// isSubSnippet() function.
+//
+// Therefore the "expected" snippets in the test data must be a union of
+// all possible snippets produced at runtime for a given (document, search
+// terms) pair on all platforms of interest:
+//
+// - Overlapping snippets must be properly merged
+//
+// - Non-overlapping snippets can be joined with a " ... " in between.
+//
+
 std::string maskSnippetsInSearchResults(std::string s)
 {
   return replace(s, "<cite>.+</cite>", "<cite>SNIPPET TEXT WAS MASKED</cite>");
 }
+
+bool isValidSnippet(const std::string& s)
+{
+  return s.size() >= 250
+      && s.find("<b>") != std::string::npos
+      && s.find("</b>") != std::string::npos;
+}
+
+size_t leadingDotCount(const std::string& s)
+{
+  return s.find_first_not_of(".");
+}
+
+size_t trailingDotCount(const std::string& s)
+{
+  return s.size() - 1 - s.find_last_not_of(".");
+}
+
+bool isSubSnippet(std::string subSnippet, const std::string& superSnippet)
+{
+  const auto leadingDotCountInSubSnippet  = leadingDotCount(subSnippet);
+  const auto trailingDotCountInSubSnippet = trailingDotCount(subSnippet);
+  const bool subSnippetIsHeadless = leadingDotCountInSubSnippet >= 3;
+  const bool subSnippetIsTailless = trailingDotCountInSubSnippet >= 3;
+  if ( subSnippetIsHeadless )
+  {
+    subSnippet = subSnippet.substr(leadingDotCountInSubSnippet);
+  }
+
+  if ( subSnippetIsTailless )
+  {
+    subSnippet = subSnippet.substr(0, subSnippet.size() - trailingDotCountInSubSnippet);
+  }
+
+  const auto pos = superSnippet.find(subSnippet);
+  if ( pos == std::string::npos )
+    return false;
+
+  if ( subSnippetIsHeadless == (pos == 0) )
+    return false;
+
+  if ( subSnippetIsTailless == (pos + subSnippet.size() == superSnippet.size()) )
+    return false;
+
+  return true;
+}
+
 TEST_F(TaskbarlessServerTest, searchResults)
 {
   struct TestData
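To make the behaviour of the helpers above concrete, here are two invented inputs (not from the test data): isSubSnippet("...b c...", "a b c d") holds, because after stripping the boundary dots the core "b c" occurs strictly inside the expected snippet; isSubSnippet("...a b...", "a b c d") does not hold, because the leading "..." promises omitted text before "a b", yet the match starts at position 0 of the expected snippet. A snippet with no boundary dots is accepted only when it coincides with the expected snippet from its first to its last character.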
@@ -1652,6 +1711,58 @@ TEST_F(TaskbarlessServerTest, searchResults)
   {
     EXPECT_EQ(maskSnippetsInSearchResults(html), expectedHtml())
           << testContext();
+
+    checkSnippets(extractSearchResultSnippets(html));
+  }
+
+  typedef std::vector<std::string> Snippets;
+
+  static Snippets extractSearchResultSnippets(const std::string& html)
+  {
+    Snippets snippets;
+    const std::regex snippetRegex("<cite>(.*)</cite>");
+    std::sregex_iterator snippetIt(html.begin(), html.end(), snippetRegex);
+    const std::sregex_iterator end;
+    for ( ; snippetIt != end; ++snippetIt)
+    {
+      const std::smatch snippetMatch = *snippetIt;
+      snippets.push_back(snippetMatch[1].str());
+    }
+    return snippets;
+  }
+
+  void checkSnippets(const Snippets& snippets) const
+  {
+    ASSERT_EQ(snippets.size(), results.size());
+    for ( size_t i = 0; i < results.size(); ++i )
+    {
+      const auto& r = results[i];
+      const auto expectedSnippet = extractSearchResultSnippets(r);
+      ASSERT_EQ(1u, expectedSnippet.size())
+        << "Multiple snippets in test data:"
+        << "\n" << r;
+
+      if ( snippets[i] != expectedSnippet[0] ) {
+        std::cout << "Trying a weaker check for a mismatching snippet...\n";
+        checkMismatchingSnippet(snippets[i], expectedSnippet[0]);
+      }
+    }
+  }
+
+  void checkMismatchingSnippet(std::string actual, std::string expected) const
+  {
+    TestContext testContext{
+      { "url", url() },
+      { "actual snippet", actual },
+      { "expected snippet", expected }
+    };
+
+    ASSERT_TRUE(isValidSnippet(actual)) << testContext;
+    ASSERT_TRUE(isValidSnippet(expected)) << testContext;
+
+    if ( !isSubSnippet(actual, expected) ) {
+      EXPECT_EQ(actual, expected) << testContext;
+    }
   }
 };
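For readers unfamiliar with std::sregex_iterator, the following standalone sketch (not part of the commit; the HTML fragment is made up) shows the same extraction idea used by extractSearchResultSnippets(): every <cite>...</cite> element of the results page yields one snippet string.

// Minimal standalone sketch of the snippet-extraction step (illustrative only).
#include <iostream>
#include <regex>
#include <string>
#include <vector>

int main()
{
  // Invented search-results fragment, one result per line.
  const std::string html =
      "<li><cite>...found <b>apple</b> pie...</cite></li>\n"
      "<li><cite>...a second <b>result</b> snippet...</cite></li>\n";

  std::vector<std::string> snippets;
  const std::regex snippetRegex("<cite>(.*)</cite>");
  for ( std::sregex_iterator it(html.begin(), html.end(), snippetRegex), end;
        it != end; ++it )
  {
    snippets.push_back((*it)[1].str());   // capture group 1: the snippet text
  }

  for ( const auto& s : snippets )
    std::cout << s << "\n";
  // Expected output:
  //   ...found <b>apple</b> pie...
  //   ...a second <b>result</b> snippet...
  return 0;
}

One caveat on the pattern: the greedy "(.*)" works here because each <cite> element sits on its own line; if several <cite> elements ever shared a line, a non-greedy "(.*?)" would be the safer choice.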