diff --git a/include/searcher.h b/include/searcher.h index 262f79aff..f8999501a 100644 --- a/include/searcher.h +++ b/include/searcher.h @@ -35,14 +35,16 @@ using namespace std; -struct Result +class Result { - string url; - string title; - int score; - string snippet; - int wordCount; - int size; + public: + virtual ~Result() {}; + virtual std::string get_url() = 0; + virtual std::string get_title() = 0; + virtual int get_score() = 0; + virtual std::string get_snippet() = 0; + virtual int get_wordCount() = 0; + virtual int get_size() = 0; }; namespace kiwix { @@ -55,7 +57,8 @@ namespace kiwix { void search(std::string &search, unsigned int resultStart, unsigned int resultEnd, const bool verbose=false); - bool getNextResult(string &url, string &title, unsigned int &score); + virtual Result* getNextResult() = 0; + virtual void restart_search() = 0; unsigned int getEstimatedResultCount(); bool setProtocolPrefix(const std::string prefix); bool setSearchProtocolPrefix(const std::string prefix); @@ -72,8 +75,6 @@ namespace kiwix { virtual void searchInIndex(string &search, const unsigned int resultStart, const unsigned int resultEnd, const bool verbose=false) = 0; - std::vector results; - std::vector::iterator resultOffset; std::string searchPattern; std::string protocolPrefix; std::string searchProtocolPrefix; diff --git a/include/xapianSearcher.h b/include/xapianSearcher.h index 4404f2545..8b27eb229 100644 --- a/include/xapianSearcher.h +++ b/include/xapianSearcher.h @@ -22,11 +22,35 @@ #include #include "searcher.h" +#include "reader.h" + +#include +#include using namespace std; namespace kiwix { + class XapianSearcher; + + class XapianResult : public Result { + public: + XapianResult(XapianSearcher* searcher, Xapian::MSetIterator& iterator); + virtual ~XapianResult() {}; + + virtual std::string get_url(); + virtual std::string get_title(); + virtual int get_score(); + virtual std::string get_snippet(); + virtual int get_wordCount(); + virtual int get_size(); + + private: + XapianSearcher* searcher; + Xapian::MSetIterator iterator; + Xapian::Document document; + }; + class NoXapianIndexInZim: public exception { virtual const char* what() const throw() { return "There is no fulltext index in the zim file"; @@ -34,19 +58,25 @@ namespace kiwix { }; class XapianSearcher : public Searcher { - + friend class XapianResult; public: - XapianSearcher(const string &xapianDirectoryPath); + XapianSearcher(const string &xapianDirectoryPath, Reader* reader); virtual ~XapianSearcher() {}; void searchInIndex(string &search, const unsigned int resultStart, const unsigned int resultEnd, const bool verbose=false); + virtual Result* getNextResult(); + void restart_search(); protected: void closeIndex(); void openIndex(const string &xapianDirectoryPath); + Reader* reader; Xapian::Database readableDatabase; Xapian::Stem stemmer; + Xapian::MSet results; + Xapian::MSetIterator current_result; + std::map valuesmap; }; } diff --git a/src/android/kiwix.cpp b/src/android/kiwix.cpp index dee61f79e..88076a9ff 100644 --- a/src/android/kiwix.cpp +++ b/src/android/kiwix.cpp @@ -445,7 +445,7 @@ JNIEXPORT jboolean JNICALL Java_org_kiwix_kiwixlib_JNIKiwix_loadFulltextIndex(JN searcher = NULL; try { if (searcher != NULL) delete searcher; - searcher = new kiwix::XapianSearcher(cPath); + searcher = new kiwix::XapianSearcher(cPath, NULL); } catch (...) { searcher = NULL; retVal = JNI_FALSE; @@ -460,19 +460,18 @@ JNIEXPORT jstring JNICALL Java_org_kiwix_kiwixlib_JNIKiwix_indexedQuery (JNIEnv *env, jclass obj, jstring query, jint count) { std::string cQuery = jni2c(query, env); unsigned int cCount = jni2c(count); - std::string url; - std::string title; + kiwix::Result *p_result; std::string result; - unsigned int score; pthread_mutex_lock(&searcherLock); try { if (searcher != NULL) { searcher->search(cQuery, 0, count); - while (searcher->getNextResult(url, title, score) && - !title.empty() && - !url.empty()) { - result += title + "\n"; + while ( (p_result = searcher->getNextResult()) && + !(p_result->get_title().empty()) && + !(p_result->get_url().empty())) { + result += p_result->get_title() + "\n"; + delete p_result; } } } catch (...) { diff --git a/src/searcher.cpp b/src/searcher.cpp index 22a8adbcb..f458c6129 100644 --- a/src/searcher.cpp +++ b/src/searcher.cpp @@ -81,7 +81,6 @@ namespace kiwix { this->resultEnd = resultEnd; string unaccentedSearch = removeAccents(search); searchInIndex(unaccentedSearch, resultStart, resultEnd, verbose); - this->resultOffset = this->results.begin(); } return; @@ -89,8 +88,6 @@ namespace kiwix { /* Reset the results */ void Searcher::reset() { - this->results.clear(); - this->resultOffset = this->results.begin(); this->estimatedResultCount = 0; this->searchPattern = ""; return; @@ -101,30 +98,6 @@ namespace kiwix { return this->estimatedResultCount; } - /* Get next result */ - bool Searcher::getNextResult(string &url, string &title, unsigned int &score) { - bool retVal = false; - - if (this->resultOffset != this->results.end()) { - - /* url */ - url = this->resultOffset->url; - - /* title */ - title = this->resultOffset->title; - - /* score */ - score = this->resultOffset->score; - - /* increment the cursor for the next call */ - this->resultOffset++; - - retVal = true; - } - - return retVal; - } - bool Searcher::setProtocolPrefix(const std::string prefix) { this->protocolPrefix = prefix; return true; @@ -149,23 +122,24 @@ namespace kiwix { CDT oData; CDT resultsCDT(CDT::ARRAY_VAL); - this->resultOffset = this->results.begin(); - while (this->resultOffset != this->results.end()) { + this->restart_search(); + Result * p_result = NULL; + while ( (p_result = this->getNextResult()) ) { CDT result; - result["title"] = this->resultOffset->title; - result["url"] = this->resultOffset->url; - result["snippet"] = this->resultOffset->snippet; + result["title"] = p_result->get_title(); + result["url"] = p_result->get_url(); + result["snippet"] = p_result->get_snippet(); - if (this->resultOffset->size >= 0) - result["size"] = kiwix::beautifyInteger(this->resultOffset->size); + if (p_result->get_size() >= 0) + result["size"] = kiwix::beautifyInteger(p_result->get_size()); - if (this->resultOffset->wordCount >= 0) - result["wordCount"] = kiwix::beautifyInteger(this->resultOffset->wordCount); + if (p_result->get_wordCount() >= 0) + result["wordCount"] = kiwix::beautifyInteger(p_result->get_wordCount()); resultsCDT.PushBack(result); - this->resultOffset++; + delete p_result; } - this->resultOffset = this->results.begin(); + this->restart_search(); oData["results"] = resultsCDT; // pages diff --git a/src/xapianSearcher.cpp b/src/xapianSearcher.cpp index d17b9207d..4de9d4834 100644 --- a/src/xapianSearcher.cpp +++ b/src/xapianSearcher.cpp @@ -18,6 +18,7 @@ */ #include "xapianSearcher.h" +#include "xapian/myhtmlparse.h" #include #include #include @@ -25,11 +26,25 @@ #include #include +#include + namespace kiwix { +std::map read_valuesmap(const std::string &s) { + std::map result; + std::vector elems = split(s, ";"); + for( auto elem: elems) + { + std::vector tmp_elems = split(elem, ":"); + result.insert( std::pair(tmp_elems[0], atoi(tmp_elems[1].c_str())) ); + } + return result; +} + /* Constructor */ - XapianSearcher::XapianSearcher(const string &xapianDirectoryPath) + XapianSearcher::XapianSearcher(const string &xapianDirectoryPath, Reader* reader) : Searcher(), + reader(reader), stemmer(Xapian::Stem("english")) { this->openIndex(xapianDirectoryPath); } @@ -49,6 +64,7 @@ namespace kiwix { } catch (...) { this->readableDatabase = Xapian::Database(directoryPath); } + this->valuesmap = read_valuesmap(this->readableDatabase.get_metadata("valuesmap")); } /* Close Xapian writable database */ @@ -68,32 +84,111 @@ namespace kiwix { enquire.set_query(query); /* Get the results */ - Xapian::MSet matches = enquire.get_mset(resultStart, resultEnd - resultStart); - - Xapian::MSetIterator i; - for (i = matches.begin(); i != matches.end(); ++i) { - Xapian::Document doc = i.get_document(); - - Result result; - result.url = doc.get_data(); - result.title = doc.get_value(0); - result.snippet = doc.get_value(1); - result.size = (doc.get_value(2).empty() == true ? -1 : atoi(doc.get_value(2).c_str())); - result.wordCount = (doc.get_value(3).empty() == true ? -1 : atoi(doc.get_value(3).c_str())); - result.score = i.get_percent(); - - this->results.push_back(result); - - if (verbose) { - std::cout << "Document ID " << *i << " \t"; - std::cout << i.get_percent() << "% "; - std::cout << "\t[" << doc.get_data() << "] - " << doc.get_value(0) << std::endl; - } - } + this->results = enquire.get_mset(resultStart, resultEnd - resultStart); + this->current_result = this->results.begin(); /* Update the global resultCount value*/ - this->estimatedResultCount = matches.get_matches_estimated(); - - return; + this->estimatedResultCount = this->results.get_matches_estimated(); } -} + + /* Get next result */ + Result* XapianSearcher::getNextResult() { + if (this->current_result != this->results.end()) { + XapianResult* result = new XapianResult(this, this->current_result); + this->current_result++; + return result; + } + return NULL; + } + + void XapianSearcher::restart_search() { + this->current_result = this->results.begin(); + } + + XapianResult::XapianResult(XapianSearcher* searcher, Xapian::MSetIterator& iterator): + searcher(searcher), + iterator(iterator), + document(iterator.get_document()) + { + } + + std::string XapianResult::get_url() { + return document.get_data(); + } + + std::string XapianResult::get_title() { + if ( searcher->valuesmap.empty() ) + { + /* This is the old legacy version. Guess and try */ + return document.get_value(0); + } + else if ( searcher->valuesmap.find("title") != searcher->valuesmap.end() ) + { + return document.get_value(searcher->valuesmap["title"]); + } + return ""; + } + + int XapianResult::get_score() { + return iterator.get_percent(); + } + + std::string XapianResult::get_snippet() { + if ( searcher->valuesmap.empty() ) + { + /* This is the old legacy version. Guess and try */ + std::string stored_snippet = document.get_value(1); + if ( ! stored_snippet.empty() ) + return stored_snippet; + /* Let's continue here, and see if we can genenate one */ + } + else if ( searcher->valuesmap.find("snippet") != searcher->valuesmap.end() ) + { + return document.get_value(searcher->valuesmap["snippet"]); + } + /* No reader, no snippet */ + if ( ! searcher->reader ) + return ""; + /* Get the content of the article to generate a snippet. + We parse it and use the html dump to avoid remove html tags in the + content and be able to nicely cut the text at random place. */ + MyHtmlParser htmlParser; + std::string content; + unsigned int contentLength; + std::string contentType; + searcher->reader->getContentByUrl(get_url(), content, contentLength, contentType); + try { + htmlParser.parse_html(content, "UTF-8", true); + } catch (...) {} + return searcher->results.snippet(htmlParser.dump, 500); + } + + int XapianResult::get_size() { + if ( searcher->valuesmap.empty() ) + { + /* This is the old legacy version. Guess and try */ + return document.get_value(2).empty() == true ? -1 : atoi(document.get_value(2).c_str()); + } + else if ( searcher->valuesmap.find("size") != searcher->valuesmap.end() ) + { + return atoi(document.get_value(searcher->valuesmap["size"]).c_str()); + } + /* The size is never used. Do we really want to get the content and + calculate the size ? */ + return -1; + } + + int XapianResult::get_wordCount() { + if ( searcher->valuesmap.empty() ) + { + /* This is the old legacy version. Guess and try */ + return document.get_value(3).empty() == true ? -1 : atoi(document.get_value(3).c_str()); + } + else if ( searcher->valuesmap.find("wordcount") != searcher->valuesmap.end() ) + { + return atoi(document.get_value(searcher->valuesmap["wordcount"]).c_str()); + } + return -1; + } + +} // Kiwix namespace