mirror of https://github.com/kiwix/libkiwix.git
commit
62d26c27ff
|
@ -35,14 +35,16 @@
|
|||
|
||||
using namespace std;
|
||||
|
||||
struct Result
|
||||
class Result
|
||||
{
|
||||
string url;
|
||||
string title;
|
||||
int score;
|
||||
string snippet;
|
||||
int wordCount;
|
||||
int size;
|
||||
public:
|
||||
virtual ~Result() {};
|
||||
virtual std::string get_url() = 0;
|
||||
virtual std::string get_title() = 0;
|
||||
virtual int get_score() = 0;
|
||||
virtual std::string get_snippet() = 0;
|
||||
virtual int get_wordCount() = 0;
|
||||
virtual int get_size() = 0;
|
||||
};
|
||||
|
||||
namespace kiwix {
|
||||
|
@ -55,7 +57,8 @@ namespace kiwix {
|
|||
|
||||
void search(std::string &search, unsigned int resultStart,
|
||||
unsigned int resultEnd, const bool verbose=false);
|
||||
bool getNextResult(string &url, string &title, unsigned int &score);
|
||||
virtual Result* getNextResult() = 0;
|
||||
virtual void restart_search() = 0;
|
||||
unsigned int getEstimatedResultCount();
|
||||
bool setProtocolPrefix(const std::string prefix);
|
||||
bool setSearchProtocolPrefix(const std::string prefix);
|
||||
|
@ -72,8 +75,6 @@ namespace kiwix {
|
|||
virtual void searchInIndex(string &search, const unsigned int resultStart,
|
||||
const unsigned int resultEnd, const bool verbose=false) = 0;
|
||||
|
||||
std::vector<Result> results;
|
||||
std::vector<Result>::iterator resultOffset;
|
||||
std::string searchPattern;
|
||||
std::string protocolPrefix;
|
||||
std::string searchProtocolPrefix;
|
||||
|
|
|
@ -22,11 +22,35 @@
|
|||
|
||||
#include <xapian.h>
|
||||
#include "searcher.h"
|
||||
#include "reader.h"
|
||||
|
||||
#include <map>
|
||||
#include <string>
|
||||
|
||||
using namespace std;
|
||||
|
||||
namespace kiwix {
|
||||
|
||||
class XapianSearcher;
|
||||
|
||||
class XapianResult : public Result {
|
||||
public:
|
||||
XapianResult(XapianSearcher* searcher, Xapian::MSetIterator& iterator);
|
||||
virtual ~XapianResult() {};
|
||||
|
||||
virtual std::string get_url();
|
||||
virtual std::string get_title();
|
||||
virtual int get_score();
|
||||
virtual std::string get_snippet();
|
||||
virtual int get_wordCount();
|
||||
virtual int get_size();
|
||||
|
||||
private:
|
||||
XapianSearcher* searcher;
|
||||
Xapian::MSetIterator iterator;
|
||||
Xapian::Document document;
|
||||
};
|
||||
|
||||
class NoXapianIndexInZim: public exception {
|
||||
virtual const char* what() const throw() {
|
||||
return "There is no fulltext index in the zim file";
|
||||
|
@ -34,19 +58,25 @@ namespace kiwix {
|
|||
};
|
||||
|
||||
class XapianSearcher : public Searcher {
|
||||
|
||||
friend class XapianResult;
|
||||
public:
|
||||
XapianSearcher(const string &xapianDirectoryPath);
|
||||
XapianSearcher(const string &xapianDirectoryPath, Reader* reader);
|
||||
virtual ~XapianSearcher() {};
|
||||
void searchInIndex(string &search, const unsigned int resultStart, const unsigned int resultEnd,
|
||||
const bool verbose=false);
|
||||
virtual Result* getNextResult();
|
||||
void restart_search();
|
||||
|
||||
protected:
|
||||
void closeIndex();
|
||||
void openIndex(const string &xapianDirectoryPath);
|
||||
|
||||
Reader* reader;
|
||||
Xapian::Database readableDatabase;
|
||||
Xapian::Stem stemmer;
|
||||
Xapian::MSet results;
|
||||
Xapian::MSetIterator current_result;
|
||||
std::map<std::string, int> valuesmap;
|
||||
};
|
||||
|
||||
}
|
||||
|
|
|
@ -445,7 +445,7 @@ JNIEXPORT jboolean JNICALL Java_org_kiwix_kiwixlib_JNIKiwix_loadFulltextIndex(JN
|
|||
searcher = NULL;
|
||||
try {
|
||||
if (searcher != NULL) delete searcher;
|
||||
searcher = new kiwix::XapianSearcher(cPath);
|
||||
searcher = new kiwix::XapianSearcher(cPath, NULL);
|
||||
} catch (...) {
|
||||
searcher = NULL;
|
||||
retVal = JNI_FALSE;
|
||||
|
@ -460,19 +460,18 @@ JNIEXPORT jstring JNICALL Java_org_kiwix_kiwixlib_JNIKiwix_indexedQuery
|
|||
(JNIEnv *env, jclass obj, jstring query, jint count) {
|
||||
std::string cQuery = jni2c(query, env);
|
||||
unsigned int cCount = jni2c(count);
|
||||
std::string url;
|
||||
std::string title;
|
||||
kiwix::Result *p_result;
|
||||
std::string result;
|
||||
unsigned int score;
|
||||
|
||||
pthread_mutex_lock(&searcherLock);
|
||||
try {
|
||||
if (searcher != NULL) {
|
||||
searcher->search(cQuery, 0, count);
|
||||
while (searcher->getNextResult(url, title, score) &&
|
||||
!title.empty() &&
|
||||
!url.empty()) {
|
||||
result += title + "\n";
|
||||
while ( (p_result = searcher->getNextResult()) &&
|
||||
!(p_result->get_title().empty()) &&
|
||||
!(p_result->get_url().empty())) {
|
||||
result += p_result->get_title() + "\n";
|
||||
delete p_result;
|
||||
}
|
||||
}
|
||||
} catch (...) {
|
||||
|
|
|
@ -81,7 +81,6 @@ namespace kiwix {
|
|||
this->resultEnd = resultEnd;
|
||||
string unaccentedSearch = removeAccents(search);
|
||||
searchInIndex(unaccentedSearch, resultStart, resultEnd, verbose);
|
||||
this->resultOffset = this->results.begin();
|
||||
}
|
||||
|
||||
return;
|
||||
|
@ -89,8 +88,6 @@ namespace kiwix {
|
|||
|
||||
/* Reset the results */
|
||||
void Searcher::reset() {
|
||||
this->results.clear();
|
||||
this->resultOffset = this->results.begin();
|
||||
this->estimatedResultCount = 0;
|
||||
this->searchPattern = "";
|
||||
return;
|
||||
|
@ -101,30 +98,6 @@ namespace kiwix {
|
|||
return this->estimatedResultCount;
|
||||
}
|
||||
|
||||
/* Get next result */
|
||||
bool Searcher::getNextResult(string &url, string &title, unsigned int &score) {
|
||||
bool retVal = false;
|
||||
|
||||
if (this->resultOffset != this->results.end()) {
|
||||
|
||||
/* url */
|
||||
url = this->resultOffset->url;
|
||||
|
||||
/* title */
|
||||
title = this->resultOffset->title;
|
||||
|
||||
/* score */
|
||||
score = this->resultOffset->score;
|
||||
|
||||
/* increment the cursor for the next call */
|
||||
this->resultOffset++;
|
||||
|
||||
retVal = true;
|
||||
}
|
||||
|
||||
return retVal;
|
||||
}
|
||||
|
||||
bool Searcher::setProtocolPrefix(const std::string prefix) {
|
||||
this->protocolPrefix = prefix;
|
||||
return true;
|
||||
|
@ -149,23 +122,24 @@ namespace kiwix {
|
|||
CDT oData;
|
||||
CDT resultsCDT(CDT::ARRAY_VAL);
|
||||
|
||||
this->resultOffset = this->results.begin();
|
||||
while (this->resultOffset != this->results.end()) {
|
||||
this->restart_search();
|
||||
Result * p_result = NULL;
|
||||
while ( (p_result = this->getNextResult()) ) {
|
||||
CDT result;
|
||||
result["title"] = this->resultOffset->title;
|
||||
result["url"] = this->resultOffset->url;
|
||||
result["snippet"] = this->resultOffset->snippet;
|
||||
result["title"] = p_result->get_title();
|
||||
result["url"] = p_result->get_url();
|
||||
result["snippet"] = p_result->get_snippet();
|
||||
|
||||
if (this->resultOffset->size >= 0)
|
||||
result["size"] = kiwix::beautifyInteger(this->resultOffset->size);
|
||||
if (p_result->get_size() >= 0)
|
||||
result["size"] = kiwix::beautifyInteger(p_result->get_size());
|
||||
|
||||
if (this->resultOffset->wordCount >= 0)
|
||||
result["wordCount"] = kiwix::beautifyInteger(this->resultOffset->wordCount);
|
||||
if (p_result->get_wordCount() >= 0)
|
||||
result["wordCount"] = kiwix::beautifyInteger(p_result->get_wordCount());
|
||||
|
||||
resultsCDT.PushBack(result);
|
||||
this->resultOffset++;
|
||||
delete p_result;
|
||||
}
|
||||
this->resultOffset = this->results.begin();
|
||||
this->restart_search();
|
||||
oData["results"] = resultsCDT;
|
||||
|
||||
// pages
|
||||
|
|
|
@ -18,6 +18,7 @@
|
|||
*/
|
||||
|
||||
#include "xapianSearcher.h"
|
||||
#include "xapian/myhtmlparse.h"
|
||||
#include <zim/zim.h>
|
||||
#include <zim/file.h>
|
||||
#include <zim/article.h>
|
||||
|
@ -25,11 +26,25 @@
|
|||
#include <sys/types.h>
|
||||
#include <unistd.h>
|
||||
|
||||
#include <vector>
|
||||
|
||||
namespace kiwix {
|
||||
|
||||
std::map<std::string, int> read_valuesmap(const std::string &s) {
|
||||
std::map<std::string, int> result;
|
||||
std::vector<std::string> elems = split(s, ";");
|
||||
for( auto elem: elems)
|
||||
{
|
||||
std::vector<std::string> tmp_elems = split(elem, ":");
|
||||
result.insert( std::pair<std::string, int>(tmp_elems[0], atoi(tmp_elems[1].c_str())) );
|
||||
}
|
||||
return result;
|
||||
}
|
||||
|
||||
/* Constructor */
|
||||
XapianSearcher::XapianSearcher(const string &xapianDirectoryPath)
|
||||
XapianSearcher::XapianSearcher(const string &xapianDirectoryPath, Reader* reader)
|
||||
: Searcher(),
|
||||
reader(reader),
|
||||
stemmer(Xapian::Stem("english")) {
|
||||
this->openIndex(xapianDirectoryPath);
|
||||
}
|
||||
|
@ -49,6 +64,7 @@ namespace kiwix {
|
|||
} catch (...) {
|
||||
this->readableDatabase = Xapian::Database(directoryPath);
|
||||
}
|
||||
this->valuesmap = read_valuesmap(this->readableDatabase.get_metadata("valuesmap"));
|
||||
}
|
||||
|
||||
/* Close Xapian writable database */
|
||||
|
@ -68,32 +84,111 @@ namespace kiwix {
|
|||
enquire.set_query(query);
|
||||
|
||||
/* Get the results */
|
||||
Xapian::MSet matches = enquire.get_mset(resultStart, resultEnd - resultStart);
|
||||
|
||||
Xapian::MSetIterator i;
|
||||
for (i = matches.begin(); i != matches.end(); ++i) {
|
||||
Xapian::Document doc = i.get_document();
|
||||
|
||||
Result result;
|
||||
result.url = doc.get_data();
|
||||
result.title = doc.get_value(0);
|
||||
result.snippet = doc.get_value(1);
|
||||
result.size = (doc.get_value(2).empty() == true ? -1 : atoi(doc.get_value(2).c_str()));
|
||||
result.wordCount = (doc.get_value(3).empty() == true ? -1 : atoi(doc.get_value(3).c_str()));
|
||||
result.score = i.get_percent();
|
||||
|
||||
this->results.push_back(result);
|
||||
|
||||
if (verbose) {
|
||||
std::cout << "Document ID " << *i << " \t";
|
||||
std::cout << i.get_percent() << "% ";
|
||||
std::cout << "\t[" << doc.get_data() << "] - " << doc.get_value(0) << std::endl;
|
||||
}
|
||||
}
|
||||
this->results = enquire.get_mset(resultStart, resultEnd - resultStart);
|
||||
this->current_result = this->results.begin();
|
||||
|
||||
/* Update the global resultCount value*/
|
||||
this->estimatedResultCount = matches.get_matches_estimated();
|
||||
this->estimatedResultCount = this->results.get_matches_estimated();
|
||||
}
|
||||
|
||||
return;
|
||||
/* Get next result */
|
||||
Result* XapianSearcher::getNextResult() {
|
||||
if (this->current_result != this->results.end()) {
|
||||
XapianResult* result = new XapianResult(this, this->current_result);
|
||||
this->current_result++;
|
||||
return result;
|
||||
}
|
||||
return NULL;
|
||||
}
|
||||
|
||||
void XapianSearcher::restart_search() {
|
||||
this->current_result = this->results.begin();
|
||||
}
|
||||
|
||||
XapianResult::XapianResult(XapianSearcher* searcher, Xapian::MSetIterator& iterator):
|
||||
searcher(searcher),
|
||||
iterator(iterator),
|
||||
document(iterator.get_document())
|
||||
{
|
||||
}
|
||||
|
||||
std::string XapianResult::get_url() {
|
||||
return document.get_data();
|
||||
}
|
||||
|
||||
std::string XapianResult::get_title() {
|
||||
if ( searcher->valuesmap.empty() )
|
||||
{
|
||||
/* This is the old legacy version. Guess and try */
|
||||
return document.get_value(0);
|
||||
}
|
||||
else if ( searcher->valuesmap.find("title") != searcher->valuesmap.end() )
|
||||
{
|
||||
return document.get_value(searcher->valuesmap["title"]);
|
||||
}
|
||||
return "";
|
||||
}
|
||||
|
||||
int XapianResult::get_score() {
|
||||
return iterator.get_percent();
|
||||
}
|
||||
|
||||
std::string XapianResult::get_snippet() {
|
||||
if ( searcher->valuesmap.empty() )
|
||||
{
|
||||
/* This is the old legacy version. Guess and try */
|
||||
std::string stored_snippet = document.get_value(1);
|
||||
if ( ! stored_snippet.empty() )
|
||||
return stored_snippet;
|
||||
/* Let's continue here, and see if we can genenate one */
|
||||
}
|
||||
else if ( searcher->valuesmap.find("snippet") != searcher->valuesmap.end() )
|
||||
{
|
||||
return document.get_value(searcher->valuesmap["snippet"]);
|
||||
}
|
||||
/* No reader, no snippet */
|
||||
if ( ! searcher->reader )
|
||||
return "";
|
||||
/* Get the content of the article to generate a snippet.
|
||||
We parse it and use the html dump to avoid remove html tags in the
|
||||
content and be able to nicely cut the text at random place. */
|
||||
MyHtmlParser htmlParser;
|
||||
std::string content;
|
||||
unsigned int contentLength;
|
||||
std::string contentType;
|
||||
searcher->reader->getContentByUrl(get_url(), content, contentLength, contentType);
|
||||
try {
|
||||
htmlParser.parse_html(content, "UTF-8", true);
|
||||
} catch (...) {}
|
||||
return searcher->results.snippet(htmlParser.dump, 500);
|
||||
}
|
||||
|
||||
int XapianResult::get_size() {
|
||||
if ( searcher->valuesmap.empty() )
|
||||
{
|
||||
/* This is the old legacy version. Guess and try */
|
||||
return document.get_value(2).empty() == true ? -1 : atoi(document.get_value(2).c_str());
|
||||
}
|
||||
else if ( searcher->valuesmap.find("size") != searcher->valuesmap.end() )
|
||||
{
|
||||
return atoi(document.get_value(searcher->valuesmap["size"]).c_str());
|
||||
}
|
||||
/* The size is never used. Do we really want to get the content and
|
||||
calculate the size ? */
|
||||
return -1;
|
||||
}
|
||||
|
||||
int XapianResult::get_wordCount() {
|
||||
if ( searcher->valuesmap.empty() )
|
||||
{
|
||||
/* This is the old legacy version. Guess and try */
|
||||
return document.get_value(3).empty() == true ? -1 : atoi(document.get_value(3).c_str());
|
||||
}
|
||||
else if ( searcher->valuesmap.find("wordcount") != searcher->valuesmap.end() )
|
||||
{
|
||||
return atoi(document.get_value(searcher->valuesmap["wordcount"]).c_str());
|
||||
}
|
||||
return -1;
|
||||
}
|
||||
|
||||
} // Kiwix namespace
|
||||
|
|
Loading…
Reference in New Issue