From 83d27255cf7bbed1b3461d6a8067dc0f521c3e1c Mon Sep 17 00:00:00 2001 From: Matthieu Gautier Date: Tue, 21 Mar 2017 16:20:17 +0100 Subject: [PATCH 1/3] Do not create all the results at once. Be a bit lazy. We don't need to generate a vector of result when we do a search. We better to just keep the handle to the current MSetIterator and generate the wanted values when needed. --- include/searcher.h | 21 +++++------ include/xapianSearcher.h | 21 +++++++++++ src/android/kiwix.cpp | 13 ++++--- src/searcher.cpp | 50 +++++++-------------------- src/xapianSearcher.cpp | 75 ++++++++++++++++++++++++++-------------- 5 files changed, 99 insertions(+), 81 deletions(-) diff --git a/include/searcher.h b/include/searcher.h index 262f79aff..f8999501a 100644 --- a/include/searcher.h +++ b/include/searcher.h @@ -35,14 +35,16 @@ using namespace std; -struct Result +class Result { - string url; - string title; - int score; - string snippet; - int wordCount; - int size; + public: + virtual ~Result() {}; + virtual std::string get_url() = 0; + virtual std::string get_title() = 0; + virtual int get_score() = 0; + virtual std::string get_snippet() = 0; + virtual int get_wordCount() = 0; + virtual int get_size() = 0; }; namespace kiwix { @@ -55,7 +57,8 @@ namespace kiwix { void search(std::string &search, unsigned int resultStart, unsigned int resultEnd, const bool verbose=false); - bool getNextResult(string &url, string &title, unsigned int &score); + virtual Result* getNextResult() = 0; + virtual void restart_search() = 0; unsigned int getEstimatedResultCount(); bool setProtocolPrefix(const std::string prefix); bool setSearchProtocolPrefix(const std::string prefix); @@ -72,8 +75,6 @@ namespace kiwix { virtual void searchInIndex(string &search, const unsigned int resultStart, const unsigned int resultEnd, const bool verbose=false) = 0; - std::vector results; - std::vector::iterator resultOffset; std::string searchPattern; std::string protocolPrefix; std::string searchProtocolPrefix; diff --git a/include/xapianSearcher.h b/include/xapianSearcher.h index 4404f2545..8e43c9ff6 100644 --- a/include/xapianSearcher.h +++ b/include/xapianSearcher.h @@ -27,6 +27,23 @@ using namespace std; namespace kiwix { + class XapianResult : public Result { + public: + XapianResult(Xapian::MSetIterator& iterator); + virtual ~XapianResult() {}; + + virtual std::string get_url(); + virtual std::string get_title(); + virtual int get_score(); + virtual std::string get_snippet(); + virtual int get_wordCount(); + virtual int get_size(); + + private: + Xapian::MSetIterator iterator; + Xapian::Document document; + }; + class NoXapianIndexInZim: public exception { virtual const char* what() const throw() { return "There is no fulltext index in the zim file"; @@ -40,6 +57,8 @@ namespace kiwix { virtual ~XapianSearcher() {}; void searchInIndex(string &search, const unsigned int resultStart, const unsigned int resultEnd, const bool verbose=false); + virtual Result* getNextResult(); + void restart_search(); protected: void closeIndex(); @@ -47,6 +66,8 @@ namespace kiwix { Xapian::Database readableDatabase; Xapian::Stem stemmer; + Xapian::MSet results; + Xapian::MSetIterator current_result; }; } diff --git a/src/android/kiwix.cpp b/src/android/kiwix.cpp index dee61f79e..4d573e700 100644 --- a/src/android/kiwix.cpp +++ b/src/android/kiwix.cpp @@ -460,19 +460,18 @@ JNIEXPORT jstring JNICALL Java_org_kiwix_kiwixlib_JNIKiwix_indexedQuery (JNIEnv *env, jclass obj, jstring query, jint count) { std::string cQuery = jni2c(query, env); unsigned int cCount = jni2c(count); - std::string url; - std::string title; + kiwix::Result *p_result; std::string result; - unsigned int score; pthread_mutex_lock(&searcherLock); try { if (searcher != NULL) { searcher->search(cQuery, 0, count); - while (searcher->getNextResult(url, title, score) && - !title.empty() && - !url.empty()) { - result += title + "\n"; + while ( (p_result = searcher->getNextResult()) && + !(p_result->get_title().empty()) && + !(p_result->get_url().empty())) { + result += p_result->get_title() + "\n"; + delete p_result; } } } catch (...) { diff --git a/src/searcher.cpp b/src/searcher.cpp index 22a8adbcb..f458c6129 100644 --- a/src/searcher.cpp +++ b/src/searcher.cpp @@ -81,7 +81,6 @@ namespace kiwix { this->resultEnd = resultEnd; string unaccentedSearch = removeAccents(search); searchInIndex(unaccentedSearch, resultStart, resultEnd, verbose); - this->resultOffset = this->results.begin(); } return; @@ -89,8 +88,6 @@ namespace kiwix { /* Reset the results */ void Searcher::reset() { - this->results.clear(); - this->resultOffset = this->results.begin(); this->estimatedResultCount = 0; this->searchPattern = ""; return; @@ -101,30 +98,6 @@ namespace kiwix { return this->estimatedResultCount; } - /* Get next result */ - bool Searcher::getNextResult(string &url, string &title, unsigned int &score) { - bool retVal = false; - - if (this->resultOffset != this->results.end()) { - - /* url */ - url = this->resultOffset->url; - - /* title */ - title = this->resultOffset->title; - - /* score */ - score = this->resultOffset->score; - - /* increment the cursor for the next call */ - this->resultOffset++; - - retVal = true; - } - - return retVal; - } - bool Searcher::setProtocolPrefix(const std::string prefix) { this->protocolPrefix = prefix; return true; @@ -149,23 +122,24 @@ namespace kiwix { CDT oData; CDT resultsCDT(CDT::ARRAY_VAL); - this->resultOffset = this->results.begin(); - while (this->resultOffset != this->results.end()) { + this->restart_search(); + Result * p_result = NULL; + while ( (p_result = this->getNextResult()) ) { CDT result; - result["title"] = this->resultOffset->title; - result["url"] = this->resultOffset->url; - result["snippet"] = this->resultOffset->snippet; + result["title"] = p_result->get_title(); + result["url"] = p_result->get_url(); + result["snippet"] = p_result->get_snippet(); - if (this->resultOffset->size >= 0) - result["size"] = kiwix::beautifyInteger(this->resultOffset->size); + if (p_result->get_size() >= 0) + result["size"] = kiwix::beautifyInteger(p_result->get_size()); - if (this->resultOffset->wordCount >= 0) - result["wordCount"] = kiwix::beautifyInteger(this->resultOffset->wordCount); + if (p_result->get_wordCount() >= 0) + result["wordCount"] = kiwix::beautifyInteger(p_result->get_wordCount()); resultsCDT.PushBack(result); - this->resultOffset++; + delete p_result; } - this->resultOffset = this->results.begin(); + this->restart_search(); oData["results"] = resultsCDT; // pages diff --git a/src/xapianSearcher.cpp b/src/xapianSearcher.cpp index d17b9207d..ec0038617 100644 --- a/src/xapianSearcher.cpp +++ b/src/xapianSearcher.cpp @@ -68,32 +68,55 @@ namespace kiwix { enquire.set_query(query); /* Get the results */ - Xapian::MSet matches = enquire.get_mset(resultStart, resultEnd - resultStart); - - Xapian::MSetIterator i; - for (i = matches.begin(); i != matches.end(); ++i) { - Xapian::Document doc = i.get_document(); - - Result result; - result.url = doc.get_data(); - result.title = doc.get_value(0); - result.snippet = doc.get_value(1); - result.size = (doc.get_value(2).empty() == true ? -1 : atoi(doc.get_value(2).c_str())); - result.wordCount = (doc.get_value(3).empty() == true ? -1 : atoi(doc.get_value(3).c_str())); - result.score = i.get_percent(); - - this->results.push_back(result); - - if (verbose) { - std::cout << "Document ID " << *i << " \t"; - std::cout << i.get_percent() << "% "; - std::cout << "\t[" << doc.get_data() << "] - " << doc.get_value(0) << std::endl; - } - } + this->results = enquire.get_mset(resultStart, resultEnd - resultStart); + this->current_result = this->results.begin(); /* Update the global resultCount value*/ - this->estimatedResultCount = matches.get_matches_estimated(); - - return; + this->estimatedResultCount = this->results.get_matches_estimated(); } -} + + /* Get next result */ + Result* XapianSearcher::getNextResult() { + if (this->current_result != this->results.end()) { + XapianResult* result = new XapianResult(this->current_result); + this->current_result++; + return result; + } + return NULL; + } + + void XapianSearcher::restart_search() { + this->current_result = this->results.begin(); + } + + XapianResult::XapianResult(Xapian::MSetIterator& iterator): + iterator(iterator), + document(iterator.get_document()) + { + } + + std::string XapianResult::get_url() { + return document.get_data(); + } + + std::string XapianResult::get_title() { + return document.get_value(0); + } + + int XapianResult::get_score() { + return iterator.get_percent(); + } + + std::string XapianResult::get_snippet() { + return document.get_value(1); + } + + int XapianResult::get_size() { + return document.get_value(2).empty() == true ? -1 : atoi(document.get_value(2).c_str()); + } + + int XapianResult::get_wordCount() { + return document.get_value(3).empty() == true ? -1 : atoi(document.get_value(3).c_str()); + } + +} // Kiwix namespace From 9be2abedf3b7877a5a6a23eb189c7e054c11dafe Mon Sep 17 00:00:00 2001 From: Matthieu Gautier Date: Tue, 21 Mar 2017 16:26:03 +0100 Subject: [PATCH 2/3] Check if a valuemaps metadata is available in the database and use it. This way, we do not make assumption of where the values are stored. --- include/xapianSearcher.h | 10 +++++-- src/xapianSearcher.cpp | 65 ++++++++++++++++++++++++++++++++++++---- 2 files changed, 67 insertions(+), 8 deletions(-) diff --git a/include/xapianSearcher.h b/include/xapianSearcher.h index 8e43c9ff6..b44cb2e58 100644 --- a/include/xapianSearcher.h +++ b/include/xapianSearcher.h @@ -22,14 +22,18 @@ #include #include "searcher.h" +#include +#include using namespace std; namespace kiwix { + class XapianSearcher; + class XapianResult : public Result { public: - XapianResult(Xapian::MSetIterator& iterator); + XapianResult(XapianSearcher* searcher, Xapian::MSetIterator& iterator); virtual ~XapianResult() {}; virtual std::string get_url(); @@ -40,6 +44,7 @@ namespace kiwix { virtual int get_size(); private: + XapianSearcher* searcher; Xapian::MSetIterator iterator; Xapian::Document document; }; @@ -51,7 +56,7 @@ namespace kiwix { }; class XapianSearcher : public Searcher { - + friend class XapianResult; public: XapianSearcher(const string &xapianDirectoryPath); virtual ~XapianSearcher() {}; @@ -68,6 +73,7 @@ namespace kiwix { Xapian::Stem stemmer; Xapian::MSet results; Xapian::MSetIterator current_result; + std::map valuesmap; }; } diff --git a/src/xapianSearcher.cpp b/src/xapianSearcher.cpp index ec0038617..3f051f273 100644 --- a/src/xapianSearcher.cpp +++ b/src/xapianSearcher.cpp @@ -25,8 +25,21 @@ #include #include +#include + namespace kiwix { +std::map read_valuesmap(const std::string &s) { + std::map result; + std::vector elems = split(s, ";"); + for( auto elem: elems) + { + std::vector tmp_elems = split(elem, ":"); + result.insert( std::pair(tmp_elems[0], atoi(tmp_elems[1].c_str())) ); + } + return result; +} + /* Constructor */ XapianSearcher::XapianSearcher(const string &xapianDirectoryPath) : Searcher(), @@ -49,6 +62,7 @@ namespace kiwix { } catch (...) { this->readableDatabase = Xapian::Database(directoryPath); } + this->valuesmap = read_valuesmap(this->readableDatabase.get_metadata("valuesmap")); } /* Close Xapian writable database */ @@ -78,7 +92,7 @@ namespace kiwix { /* Get next result */ Result* XapianSearcher::getNextResult() { if (this->current_result != this->results.end()) { - XapianResult* result = new XapianResult(this->current_result); + XapianResult* result = new XapianResult(this, this->current_result); this->current_result++; return result; } @@ -89,7 +103,8 @@ namespace kiwix { this->current_result = this->results.begin(); } - XapianResult::XapianResult(Xapian::MSetIterator& iterator): + XapianResult::XapianResult(XapianSearcher* searcher, Xapian::MSetIterator& iterator): + searcher(searcher), iterator(iterator), document(iterator.get_document()) { @@ -100,7 +115,16 @@ namespace kiwix { } std::string XapianResult::get_title() { - return document.get_value(0); + if ( searcher->valuesmap.empty() ) + { + /* This is the old legacy version. Guess and try */ + return document.get_value(0); + } + else if ( searcher->valuesmap.find("title") != searcher->valuesmap.end() ) + { + return document.get_value(searcher->valuesmap["title"]); + } + return ""; } int XapianResult::get_score() { @@ -108,15 +132,44 @@ namespace kiwix { } std::string XapianResult::get_snippet() { - return document.get_value(1); + if ( searcher->valuesmap.empty() ) + { + /* This is the old legacy version. Guess and try */ + return document.get_value(1); + } + else if ( searcher->valuesmap.find("snippet") != searcher->valuesmap.end() ) + { + return document.get_value(searcher->valuesmap["snippet"]); + } + return ""; } int XapianResult::get_size() { - return document.get_value(2).empty() == true ? -1 : atoi(document.get_value(2).c_str()); + if ( searcher->valuesmap.empty() ) + { + /* This is the old legacy version. Guess and try */ + return document.get_value(2).empty() == true ? -1 : atoi(document.get_value(2).c_str()); + } + else if ( searcher->valuesmap.find("size") != searcher->valuesmap.end() ) + { + return atoi(document.get_value(searcher->valuesmap["size"]).c_str()); + } + /* The size is never used. Do we really want to get the content and + calculate the size ? */ + return -1; } int XapianResult::get_wordCount() { - return document.get_value(3).empty() == true ? -1 : atoi(document.get_value(3).c_str()); + if ( searcher->valuesmap.empty() ) + { + /* This is the old legacy version. Guess and try */ + return document.get_value(3).empty() == true ? -1 : atoi(document.get_value(3).c_str()); + } + else if ( searcher->valuesmap.find("wordcount") != searcher->valuesmap.end() ) + { + return atoi(document.get_value(searcher->valuesmap["wordcount"]).c_str()); + } + return -1; } } // Kiwix namespace From 074c1bcffa7942f924b88a2ae63719181a199da7 Mon Sep 17 00:00:00 2001 From: Matthieu Gautier Date: Tue, 21 Mar 2017 16:28:03 +0100 Subject: [PATCH 3/3] Try to generate the snippet if it is not present in the database. We generate the snippet from the content of the article in the zim so we need to have a access to the reader. --- include/xapianSearcher.h | 5 ++++- src/android/kiwix.cpp | 2 +- src/xapianSearcher.cpp | 29 ++++++++++++++++++++++++----- 3 files changed, 29 insertions(+), 7 deletions(-) diff --git a/include/xapianSearcher.h b/include/xapianSearcher.h index b44cb2e58..8b27eb229 100644 --- a/include/xapianSearcher.h +++ b/include/xapianSearcher.h @@ -22,6 +22,8 @@ #include #include "searcher.h" +#include "reader.h" + #include #include @@ -58,7 +60,7 @@ namespace kiwix { class XapianSearcher : public Searcher { friend class XapianResult; public: - XapianSearcher(const string &xapianDirectoryPath); + XapianSearcher(const string &xapianDirectoryPath, Reader* reader); virtual ~XapianSearcher() {}; void searchInIndex(string &search, const unsigned int resultStart, const unsigned int resultEnd, const bool verbose=false); @@ -69,6 +71,7 @@ namespace kiwix { void closeIndex(); void openIndex(const string &xapianDirectoryPath); + Reader* reader; Xapian::Database readableDatabase; Xapian::Stem stemmer; Xapian::MSet results; diff --git a/src/android/kiwix.cpp b/src/android/kiwix.cpp index 4d573e700..88076a9ff 100644 --- a/src/android/kiwix.cpp +++ b/src/android/kiwix.cpp @@ -445,7 +445,7 @@ JNIEXPORT jboolean JNICALL Java_org_kiwix_kiwixlib_JNIKiwix_loadFulltextIndex(JN searcher = NULL; try { if (searcher != NULL) delete searcher; - searcher = new kiwix::XapianSearcher(cPath); + searcher = new kiwix::XapianSearcher(cPath, NULL); } catch (...) { searcher = NULL; retVal = JNI_FALSE; diff --git a/src/xapianSearcher.cpp b/src/xapianSearcher.cpp index 3f051f273..4de9d4834 100644 --- a/src/xapianSearcher.cpp +++ b/src/xapianSearcher.cpp @@ -18,6 +18,7 @@ */ #include "xapianSearcher.h" +#include "xapian/myhtmlparse.h" #include #include #include @@ -41,8 +42,9 @@ std::map read_valuesmap(const std::string &s) { } /* Constructor */ - XapianSearcher::XapianSearcher(const string &xapianDirectoryPath) + XapianSearcher::XapianSearcher(const string &xapianDirectoryPath, Reader* reader) : Searcher(), + reader(reader), stemmer(Xapian::Stem("english")) { this->openIndex(xapianDirectoryPath); } @@ -134,14 +136,31 @@ std::map read_valuesmap(const std::string &s) { std::string XapianResult::get_snippet() { if ( searcher->valuesmap.empty() ) { - /* This is the old legacy version. Guess and try */ - return document.get_value(1); + /* This is the old legacy version. Guess and try */ + std::string stored_snippet = document.get_value(1); + if ( ! stored_snippet.empty() ) + return stored_snippet; + /* Let's continue here, and see if we can genenate one */ } else if ( searcher->valuesmap.find("snippet") != searcher->valuesmap.end() ) { - return document.get_value(searcher->valuesmap["snippet"]); + return document.get_value(searcher->valuesmap["snippet"]); } - return ""; + /* No reader, no snippet */ + if ( ! searcher->reader ) + return ""; + /* Get the content of the article to generate a snippet. + We parse it and use the html dump to avoid remove html tags in the + content and be able to nicely cut the text at random place. */ + MyHtmlParser htmlParser; + std::string content; + unsigned int contentLength; + std::string contentType; + searcher->reader->getContentByUrl(get_url(), content, contentLength, contentType); + try { + htmlParser.parse_html(content, "UTF-8", true); + } catch (...) {} + return searcher->results.snippet(htmlParser.dump, 500); } int XapianResult::get_size() {