Merge pull request #66 from kiwix/multisearch

Multisearch
This commit is contained in:
Matthieu Gautier 2017-07-18 16:07:46 +02:00 committed by GitHub
commit 473b62c9b8
6 changed files with 144 additions and 37 deletions

View File

@ -46,17 +46,21 @@ class Result
virtual std::string get_title() = 0; virtual std::string get_title() = 0;
virtual int get_score() = 0; virtual int get_score() = 0;
virtual std::string get_snippet() = 0; virtual std::string get_snippet() = 0;
virtual std::string get_content() = 0;
virtual int get_wordCount() = 0; virtual int get_wordCount() = 0;
virtual int get_size() = 0; virtual int get_size() = 0;
virtual int get_readerIndex() = 0;
}; };
struct SearcherInternal; struct SearcherInternal;
class Searcher class Searcher
{ {
public: public:
Searcher();
Searcher(const string& xapianDirectoryPath, Reader* reader); Searcher(const string& xapianDirectoryPath, Reader* reader);
~Searcher(); ~Searcher();
void add_reader(Reader* reader, const std::string& humanReaderName);
void search(std::string& search, void search(std::string& search,
unsigned int resultStart, unsigned int resultStart,
unsigned int resultEnd, unsigned int resultEnd,
@ -82,7 +86,8 @@ class Searcher
const unsigned int resultEnd, const unsigned int resultEnd,
const bool verbose = false); const bool verbose = false);
Reader* reader; std::vector<Reader*> readers;
std::vector<std::string> humanReaderNames;
SearcherInternal* internal; SearcherInternal* internal;
std::string searchPattern; std::string searchPattern;
std::string protocolPrefix; std::string protocolPrefix;

View File

@ -43,8 +43,10 @@ class XapianResult : public Result
virtual std::string get_title(); virtual std::string get_title();
virtual int get_score(); virtual int get_score();
virtual std::string get_snippet(); virtual std::string get_snippet();
virtual std::string get_content();
virtual int get_wordCount(); virtual int get_wordCount();
virtual int get_size(); virtual int get_size();
virtual int get_readerIndex() { return 0; };
private: private:
XapianSearcher* searcher; XapianSearcher* searcher;

View File

@ -486,12 +486,19 @@ JNIEXPORT jboolean JNICALL Java_org_kiwix_kiwixlib_JNIKiwix_loadFulltextIndex(
std::string cPath = jni2c(path, env); std::string cPath = jni2c(path, env);
pthread_mutex_lock(&searcherLock); pthread_mutex_lock(&searcherLock);
searcher = NULL;
try { try {
if (searcher != NULL) { if (searcher != NULL) {
delete searcher; delete searcher;
} }
searcher = new kiwix::Searcher(cPath, reader); if (!reader || !reader->hasFulltextIndex()) {
// Use old API (no embedded full text index).
searcher = new kiwix::Searcher(cPath, reader);
} else {
// Use the new API. The human-readable name is irrelevant here because
// it is not used on Android.
searcher = new kiwix::Searcher();
searcher->add_reader(reader, "");
}
} catch (...) { } catch (...) {
searcher = NULL; searcher = NULL;
retVal = JNI_FALSE; retVal = JNI_FALSE;

View File

@ -45,8 +45,10 @@ class _Result : public Result
virtual std::string get_title(); virtual std::string get_title();
virtual int get_score(); virtual int get_score();
virtual std::string get_snippet(); virtual std::string get_snippet();
virtual std::string get_content();
virtual int get_wordCount(); virtual int get_wordCount();
virtual int get_size(); virtual int get_size();
virtual int get_readerIndex();
private: private:
Searcher* searcher; Searcher* searcher;
@ -72,8 +74,7 @@ struct SearcherInternal {
/* Constructor */ /* Constructor */
Searcher::Searcher(const string& xapianDirectoryPath, Reader* reader) Searcher::Searcher(const string& xapianDirectoryPath, Reader* reader)
: reader(reader), : internal(new SearcherInternal()),
internal(new SearcherInternal()),
searchPattern(""), searchPattern(""),
protocolPrefix("zim://"), protocolPrefix("zim://"),
searchProtocolPrefix("search://?"), searchProtocolPrefix("search://?"),
@ -89,11 +90,32 @@ Searcher::Searcher(const string& xapianDirectoryPath, Reader* reader)
} }
} }
/* Default constructor: creates a Searcher without attaching any reader.
 * Readers are registered afterwards through add_reader().
 * The search pattern starts empty and every result counter starts at 0. */
Searcher::Searcher()
: internal(new SearcherInternal()),
searchPattern(""),
protocolPrefix("zim://"),
searchProtocolPrefix("search://?"),
resultCountPerPage(0),
estimatedResultCount(0),
resultStart(0),
resultEnd(0)
{
/* NOTE(review): presumably the default template used to render the
 * result page - confirm against getHtml(). */
template_ct2 = RESOURCE::results_ct2;
loadICUExternalTables();
}
/* Destructor */ /* Destructor */
Searcher::~Searcher() Searcher::~Searcher()
{ {
delete internal; delete internal;
} }
/* Register one more reader to search in.
 * The reader pointer and its human readable name are appended to two
 * parallel vectors, so both entries share the same index. */
void Searcher::add_reader(Reader* reader, const std::string& humanReadableName)
{
  readers.push_back(reader);
  humanReaderNames.push_back(humanReadableName);
}
/* Search strings in the database */ /* Search strings in the database */
void Searcher::search(std::string& search, void Searcher::search(std::string& search,
unsigned int resultStart, unsigned int resultStart,
@ -133,8 +155,15 @@ void Searcher::search(std::string& search,
this->estimatedResultCount this->estimatedResultCount
= internal->_xapianSearcher->results.get_matches_estimated(); = internal->_xapianSearcher->results.get_matches_estimated();
} else { } else {
internal->_search = this->reader->getZimFileHandler()->search( std::vector<const zim::File*> zims;
unaccentedSearch, resultStart, resultEnd); for (auto current = this->readers.begin(); current != this->readers.end();
current++) {
zims.push_back((*current)->getZimFileHandler());
}
zim::Search* search = new zim::Search(zims);
search->set_query(unaccentedSearch);
search->set_range(resultStart, resultEnd);
internal->_search = search;
internal->current_iterator = internal->_search->begin(); internal->current_iterator = internal->_search->begin();
this->estimatedResultCount = internal->_search->get_matches_estimated(); this->estimatedResultCount = internal->_search->get_matches_estimated();
} }
@ -190,8 +219,16 @@ void Searcher::suggestions(std::string& search, const bool verbose)
* We do not support that. */ * We do not support that. */
this->estimatedResultCount = 0; this->estimatedResultCount = 0;
} else { } else {
internal->_search = this->reader->getZimFileHandler()->suggestions( std::vector<const zim::File*> zims;
unaccentedSearch, resultStart, resultEnd); for (auto current = this->readers.begin(); current != this->readers.end();
current++) {
zims.push_back((*current)->getZimFileHandler());
}
zim::Search* search = new zim::Search(zims);
search->set_query(unaccentedSearch);
search->set_range(resultStart, resultEnd);
search->set_suggestion_mode(true);
internal->_search = search;
internal->current_iterator = internal->_search->begin(); internal->current_iterator = internal->_search->begin();
this->estimatedResultCount = internal->_search->get_matches_estimated(); this->estimatedResultCount = internal->_search->get_matches_estimated();
} }
@ -241,6 +278,13 @@ std::string _Result::get_snippet()
{ {
return iterator.get_snippet(); return iterator.get_snippet();
} }
/* Return the data of the entry the iterator points to, or an empty
 * string when that entry does not report good(). */
std::string _Result::get_content()
{
  if (!iterator->good()) {
    return "";
  }
  return iterator->getData();
}
int _Result::get_size() int _Result::get_size()
{ {
return iterator.get_size(); return iterator.get_size();
@ -249,6 +293,10 @@ int _Result::get_wordCount()
{ {
return iterator.get_wordCount(); return iterator.get_wordCount();
} }
/* Return the file index reported by the search iterator for this result.
 * NOTE(review): presumably the position of the matching zim file in the
 * list given to zim::Search, i.e. the index of the reader in
 * Searcher::readers - confirm against libzim. */
int _Result::get_readerIndex()
{
return iterator.get_fileIndex();
}
#ifdef ENABLE_CTPP2 #ifdef ENABLE_CTPP2
string Searcher::getHtml() string Searcher::getHtml()
@ -266,6 +314,7 @@ string Searcher::getHtml()
result["title"] = p_result->get_title(); result["title"] = p_result->get_title();
result["url"] = p_result->get_url(); result["url"] = p_result->get_url();
result["snippet"] = p_result->get_snippet(); result["snippet"] = p_result->get_snippet();
result["contentId"] = humanReaderNames[p_result->get_readerIndex()];
if (p_result->get_size() >= 0) { if (p_result->get_size() >= 0) {
result["size"] = kiwix::beautifyInteger(p_result->get_size()); result["size"] = kiwix::beautifyInteger(p_result->get_size());

View File

@ -177,11 +177,10 @@ std::string XapianResult::get_snippet()
We parse it and use the html dump to avoid remove html tags in the We parse it and use the html dump to avoid remove html tags in the
content and be able to nicely cut the text at random place. */ content and be able to nicely cut the text at random place. */
MyHtmlParser htmlParser; MyHtmlParser htmlParser;
std::string content; std::string content = get_content();
unsigned int contentLength; if (content.empty()) {
std::string contentType; return content;
searcher->reader->getContentByUrl( }
get_url(), content, contentLength, contentType);
try { try {
htmlParser.parse_html(content, "UTF-8", true); htmlParser.parse_html(content, "UTF-8", true);
} catch (...) { } catch (...) {
@ -189,6 +188,19 @@ std::string XapianResult::get_snippet()
return searcher->results.snippet(htmlParser.dump, 500); return searcher->results.snippet(htmlParser.dump, 500);
} }
/* Fetch, through the searcher's reader, the content stored at this
 * result's url. An empty string is returned when the searcher has no
 * reader. */
std::string XapianResult::get_content()
{
  std::string body;
  if (searcher->reader) {
    unsigned int length;
    std::string type;
    searcher->reader->getContentByUrl(get_url(), body, length, type);
  }
  return body;
}
int XapianResult::get_size() int XapianResult::get_size()
{ {
if (searcher->valuesmap.empty()) { if (searcher->valuesmap.empty()) {

View File

@ -92,36 +92,68 @@
</style> </style>
<title>Search: <TMPL_var searchPattern></title> <title>Search: <TMPL_var searchPattern></title>
</head> </head>
<body bgcolor="white"> <body bgcolor="white">
<div class="header"> <div class="header">
<TMPL_if results>Results <b><TMPL_var resultStart>-<TMPL_var resultEnd></b> of <b><TMPL_var count></b> for <b><TMPL_var searchPattern></b><TMPL_else>No result were found for <b><TMPL_var searchPattern></b></TMPL_if> <TMPL_if results>
Results
<b>
<TMPL_var resultStart>-<TMPL_var resultEnd>
</b> of <b>
<TMPL_var count>
</b> for <b>
<TMPL_var searchPattern>
</b>
<TMPL_else>
No result were found for <b><TMPL_var searchPattern></b>
</TMPL_if>
</div> </div>
<div class="results"> <div class="results">
<ul> <ul>
<TMPL_foreach results as result> <TMPL_foreach results as result>
<li><a href="<TMPL_var protocolPrefix><TMPL_var contentId>/<TMPL_var result.url>"><TMPL_var result.title></a> <li>
<cite><TMPL_if result.snippet><TMPL_var result.snippet>...</TMPL_if></cite> <a href="<TMPL_var protocolPrefix><TMPL_var result.contentId>/<TMPL_var result.url>">
<TMPL_if wordCount><div class="informations"><TMPL_var wordCount> words</div></TMPL_if> <TMPL_var result.title>
</li> </a>
<cite>
<TMPL_if result.snippet>
<TMPL_var result.snippet>...
</TMPL_if>
</cite>
<TMPL_if wordCount>
<div class="informations"><TMPL_var wordCount> words</div>
</TMPL_if>
</li>
</TMPL_foreach> </TMPL_foreach>
</ul> </ul>
</div> </div>
<div class="footer"> <div class="footer">
<ul> <ul>
<TMPL_if (resultLastPageStart>0)> <TMPL_if (resultLastPageStart>0)>
<li><a href="<TMPL_var searchProtocolPrefix>content=<TMPL_var contentId>&pattern=<TMPL_var searchPatternEncoded>&start=0&end=<TMPL_var resultRange>">◀</a></li> <li>
</TMPL_if> <a href="<TMPL_var searchProtocolPrefix>pattern=<TMPL_var searchPatternEncoded><TMPL_if contentId>&content=<TMPL_var contentId></TMPL_if>&start=0&end=<TMPL_var resultRange>">
<TMPL_foreach pages as page>
<li><a <TMPL_if page.selected>class="selected"</TMPL_if> href="<TMPL_var searchProtocolPrefix>content=<TMPL_var contentId>&pattern=<TMPL_var searchPatternEncoded>&start=<TMPL_var page.start>&end=<TMPL_var page.end>"><TMPL_var page.label></a></li> </a>
</TMPL_foreach> </li>
<TMPL_if (resultLastPageStart>0)> </TMPL_if>
<li><a href="<TMPL_var searchProtocolPrefix>content=<TMPL_var contentId>&pattern=<TMPL_var searchPatternEncoded>&start=<TMPL_var resultLastPageStart>&end=<TMPL_var (resultLastPageStart+resultRange)>">▶</a></li> <TMPL_foreach pages as page>
</TMPL_if> <li>
</ul> <a <TMPL_if page.selected>class="selected"</TMPL_if>
</div> href="<TMPL_var searchProtocolPrefix>pattern=<TMPL_var searchPatternEncoded><TMPL_if contentId>&content=<TMPL_var contentId></TMPL_if>&start=<TMPL_var page.start>&end=<TMPL_var page.end>">
<TMPL_var page.label>
</body> </a>
</li>
</TMPL_foreach>
<TMPL_if (resultLastPageStart>0)>
<li>
<a href="<TMPL_var searchProtocolPrefix>pattern=<TMPL_var searchPatternEncoded><TMPL_if contentId>&content=<TMPL_var contentId></TMPL_if>&start=<TMPL_var resultLastPageStart>&end=<TMPL_var (resultLastPageStart+resultRange)>">
</a>
</li>
</TMPL_if>
</ul>
</div>
</body>
</html> </html>