Merge pull request #42 from kiwix/search_in_libzim

Search in libzim
This commit is contained in:
Matthieu Gautier 2017-04-11 13:26:23 +02:00 committed by GitHub
commit 4485cc8d0f
9 changed files with 276 additions and 231 deletions

View File

@ -5,10 +5,6 @@ headers = [
'searcher.h' 'searcher.h'
] ]
if xapian_dep.found()
headers += ['xapianSearcher.h']
endif
install_headers(headers, subdir:'kiwix') install_headers(headers, subdir:'kiwix')
install_headers( install_headers(

View File

@ -43,45 +43,45 @@ namespace kiwix {
~Reader(); ~Reader();
void reset(); void reset();
unsigned int getArticleCount(); unsigned int getArticleCount() const;
unsigned int getMediaCount(); unsigned int getMediaCount() const;
unsigned int getGlobalCount(); unsigned int getGlobalCount() const;
string getZimFilePath(); string getZimFilePath() const;
string getId(); string getId() const;
string getRandomPageUrl(); string getRandomPageUrl() const;
string getFirstPageUrl(); string getFirstPageUrl() const;
string getMainPageUrl(); string getMainPageUrl() const;
bool getMetatag(const string &url, string &content); bool getMetatag(const string &url, string &content) const;
string getTitle(); string getTitle() const;
string getDescription(); string getDescription() const;
string getLanguage(); string getLanguage() const;
string getName(); string getName() const;
string getTags(); string getTags() const;
string getDate(); string getDate() const;
string getCreator(); string getCreator() const;
string getPublisher(); string getPublisher() const;
string getOrigId(); string getOrigId() const;
bool getFavicon(string &content, string &mimeType); bool getFavicon(string &content, string &mimeType) const;
bool getPageUrlFromTitle(const string &title, string &url); bool getPageUrlFromTitle(const string &title, string &url) const;
bool getMimeTypeByUrl(const string &url, string &mimeType); bool getMimeTypeByUrl(const string &url, string &mimeType) const;
bool getContentByUrl(const string &url, string &content, unsigned int &contentLength, string &contentType); bool getContentByUrl(const string &url, string &content, unsigned int &contentLength, string &contentType) const;
bool getContentByEncodedUrl(const string &url, string &content, unsigned int &contentLength, string &contentType, string &baseUrl); bool getContentByEncodedUrl(const string &url, string &content, unsigned int &contentLength, string &contentType, string &baseUrl) const;
bool getContentByEncodedUrl(const string &url, string &content, unsigned int &contentLength, string &contentType); bool getContentByEncodedUrl(const string &url, string &content, unsigned int &contentLength, string &contentType) const;
bool getContentByDecodedUrl(const string &url, string &content, unsigned int &contentLength, string &contentType, string &baseUrl); bool getContentByDecodedUrl(const string &url, string &content, unsigned int &contentLength, string &contentType, string &baseUrl) const;
bool getContentByDecodedUrl(const string &url, string &content, unsigned int &contentLength, string &contentType); bool getContentByDecodedUrl(const string &url, string &content, unsigned int &contentLength, string &contentType) const;
bool searchSuggestions(const string &prefix, unsigned int suggestionsCount, const bool reset = true); bool searchSuggestions(const string &prefix, unsigned int suggestionsCount, const bool reset = true);
bool searchSuggestionsSmart(const string &prefix, unsigned int suggestionsCount); bool searchSuggestionsSmart(const string &prefix, unsigned int suggestionsCount);
bool urlExists(const string &url); bool urlExists(const string &url) const;
bool hasFulltextIndex(); bool hasFulltextIndex() const;
std::vector<std::string> getTitleVariants(const std::string &title); std::vector<std::string> getTitleVariants(const std::string &title) const;
bool getNextSuggestion(string &title); bool getNextSuggestion(string &title);
bool getNextSuggestion(string &title, string &url); bool getNextSuggestion(string &title, string &url);
bool canCheckIntegrity(); bool canCheckIntegrity() const;
bool isCorrupted(); bool isCorrupted() const;
bool parseUrl(const string &url, char *ns, string &title); bool parseUrl(const string &url, char *ns, string &title) const;
unsigned int getFileSize(); unsigned int getFileSize() const;
zim::File* getZimFileHandler(); zim::File* getZimFileHandler() const;
bool getArticleObjectByDecodedUrl(const string &url, zim::Article &article); bool getArticleObjectByDecodedUrl(const string &url, zim::Article &article) const;
protected: protected:
zim::File* zimFileHandler; zim::File* zimFileHandler;
@ -96,7 +96,7 @@ namespace kiwix {
std::vector< std::vector<std::string> >::iterator suggestionsOffset; std::vector< std::vector<std::string> >::iterator suggestionsOffset;
private: private:
std::map<std::string, unsigned int> parseCounterMetadata(); std::map<const std::string, unsigned int> parseCounterMetadata() const;
}; };
} }

View File

@ -35,30 +35,31 @@
using namespace std; using namespace std;
class Result
{
public:
virtual ~Result() {};
virtual std::string get_url() = 0;
virtual std::string get_title() = 0;
virtual int get_score() = 0;
virtual std::string get_snippet() = 0;
virtual int get_wordCount() = 0;
virtual int get_size() = 0;
};
namespace kiwix { namespace kiwix {
class Reader;
class Result {
public:
virtual ~Result() {};
virtual std::string get_url() = 0;
virtual std::string get_title() = 0;
virtual int get_score() = 0;
virtual std::string get_snippet() = 0;
virtual int get_wordCount() = 0;
virtual int get_size() = 0;
};
struct SearcherInternal;
class Searcher { class Searcher {
public: public:
Searcher(); Searcher(Reader* reader);
virtual ~Searcher(); ~Searcher();
void search(std::string &search, unsigned int resultStart, void search(std::string &search, unsigned int resultStart,
unsigned int resultEnd, const bool verbose=false); unsigned int resultEnd, const bool verbose=false);
virtual Result* getNextResult() = 0; Result* getNextResult();
virtual void restart_search() = 0; void restart_search();
unsigned int getEstimatedResultCount(); unsigned int getEstimatedResultCount();
bool setProtocolPrefix(const std::string prefix); bool setProtocolPrefix(const std::string prefix);
bool setSearchProtocolPrefix(const std::string prefix); bool setSearchProtocolPrefix(const std::string prefix);
@ -71,10 +72,12 @@ namespace kiwix {
protected: protected:
std::string beautifyInteger(const unsigned int number); std::string beautifyInteger(const unsigned int number);
virtual void closeIndex() = 0; void closeIndex() ;
virtual void searchInIndex(string &search, const unsigned int resultStart, void searchInIndex(string &search, const unsigned int resultStart,
const unsigned int resultEnd, const bool verbose=false) = 0; const unsigned int resultEnd, const bool verbose=false);
Reader* reader;
SearcherInternal* internal;
std::string searchPattern; std::string searchPattern;
std::string protocolPrefix; std::string protocolPrefix;
std::string searchProtocolPrefix; std::string searchProtocolPrefix;

View File

@ -1,6 +1,7 @@
project('kiwixlib', 'cpp', project('kiwixlib', 'cpp',
version : '0.1.0', version : '0.1.0',
license : 'GPL') license : 'GPL',
default_options : ['c_std=c11', 'cpp_std=c++11'])
compiler = meson.get_compiler('cpp') compiler = meson.get_compiler('cpp')
find_library_in_compiler = meson.version().version_compare('>=0.31.0') find_library_in_compiler = meson.version().version_compare('>=0.31.0')
@ -61,9 +62,7 @@ else
endif endif
endif endif
xapian_dep = dependency('xapian-core', required:false) all_deps = [thread_dep, libicu_dep, libzim_dep, pugixml_dep]
all_deps = [thread_dep, libicu_dep, libzim_dep, xapian_dep, pugixml_dep]
if has_ctpp2_dep if has_ctpp2_dep
all_deps += [ctpp2_dep] all_deps += [ctpp2_dep]
endif endif
@ -80,9 +79,6 @@ subdir('static')
subdir('src') subdir('src')
pkg_requires = ['libzim', 'icu-i18n', 'pugixml'] pkg_requires = ['libzim', 'icu-i18n', 'pugixml']
if xapian_dep.found()
pkg_requires += ['xapian-core']
endif
extra_libs = [] extra_libs = []
extra_cflags = '' extra_cflags = ''

View File

@ -9,7 +9,7 @@
#include "unicode/putil.h" #include "unicode/putil.h"
#include "reader.h" #include "reader.h"
#include "xapianSearcher.h" #include "searcher.h"
#include "common/base64.h" #include "common/base64.h"
#include <android/log.h> #include <android/log.h>
@ -23,7 +23,7 @@
/* global variables */ /* global variables */
kiwix::Reader *reader = NULL; kiwix::Reader *reader = NULL;
kiwix::XapianSearcher *searcher = NULL; kiwix::Searcher *searcher = NULL;
static pthread_mutex_t readerLock = PTHREAD_MUTEX_INITIALIZER; static pthread_mutex_t readerLock = PTHREAD_MUTEX_INITIALIZER;
static pthread_mutex_t searcherLock = PTHREAD_MUTEX_INITIALIZER; static pthread_mutex_t searcherLock = PTHREAD_MUTEX_INITIALIZER;
@ -445,7 +445,7 @@ JNIEXPORT jboolean JNICALL Java_org_kiwix_kiwixlib_JNIKiwix_loadFulltextIndex(JN
searcher = NULL; searcher = NULL;
try { try {
if (searcher != NULL) delete searcher; if (searcher != NULL) delete searcher;
searcher = new kiwix::XapianSearcher(cPath, NULL); searcher = new kiwix::Searcher(reader);
} catch (...) { } catch (...) {
searcher = NULL; searcher = NULL;
retVal = JNI_FALSE; retVal = JNI_FALSE;
@ -460,7 +460,7 @@ JNIEXPORT jstring JNICALL Java_org_kiwix_kiwixlib_JNIKiwix_indexedQuery
(JNIEnv *env, jclass obj, jstring query, jint count) { (JNIEnv *env, jclass obj, jstring query, jint count) {
std::string cQuery = jni2c(query, env); std::string cQuery = jni2c(query, env);
unsigned int cCount = jni2c(count); unsigned int cCount = jni2c(count);
Result *p_result; kiwix::Result *p_result;
std::string result; std::string result;
pthread_mutex_lock(&searcherLock); pthread_mutex_lock(&searcherLock);

View File

@ -85,7 +85,7 @@ std::map<std::string, std::string> kiwix::getNetworkInterfaces() {
/* some systems have ifr_addr.sa_len and adjust the length that /* some systems have ifr_addr.sa_len and adjust the length that
* way, but not mine. weird */ * way, but not mine. weird */
#ifndef linux #ifndef __linux__
len=IFNAMSIZ + ifreq->ifr_addr.sa_len; len=IFNAMSIZ + ifreq->ifr_addr.sa_len;
#else #else
len=sizeof *ifreq; len=sizeof *ifreq;

View File

@ -8,16 +8,10 @@ kiwix_sources = [
'common/regexTools.cpp', 'common/regexTools.cpp',
'common/stringTools.cpp', 'common/stringTools.cpp',
'common/networkTools.cpp', 'common/networkTools.cpp',
'common/otherTools.cpp', 'common/otherTools.cpp'
'xapian/htmlparse.cc',
'xapian/myhtmlparse.cc'
] ]
kiwix_sources += lib_resources kiwix_sources += lib_resources
if xapian_dep.found()
kiwix_sources += ['xapianSearcher.cpp']
endif
if get_option('android') if get_option('android')
subdir('android') subdir('android')
endif endif

View File

@ -87,7 +87,7 @@ namespace kiwix {
} }
} }
zim::File* Reader::getZimFileHandler() { zim::File* Reader::getZimFileHandler() const {
return this->zimFileHandler; return this->zimFileHandler;
} }
@ -96,14 +96,14 @@ namespace kiwix {
this->currentArticleOffset = this->firstArticleOffset; this->currentArticleOffset = this->firstArticleOffset;
} }
std::map<std::string, unsigned int> Reader::parseCounterMetadata() { std::map<const std::string, unsigned int> Reader::parseCounterMetadata() const {
std::map<std::string, unsigned int> counters; std::map<const std::string, unsigned int> counters;
string content, mimeType, item, counterString; string mimeType, item, counterString;
unsigned int contentLength, counter; unsigned int counter;
string counterUrl = "/M/Counter";
this->getContentByUrl(counterUrl, content, contentLength, mimeType); zim::Article article = this->zimFileHandler->getArticle('M',"Counter");
stringstream ssContent(content);
stringstream ssContent(article.getData());
while(getline(ssContent, item, ';')) { while(getline(ssContent, item, ';')) {
stringstream ssItem(item); stringstream ssItem(item);
@ -119,8 +119,8 @@ namespace kiwix {
} }
/* Get the count of articles which can be indexed/displayed */ /* Get the count of articles which can be indexed/displayed */
unsigned int Reader::getArticleCount() { unsigned int Reader::getArticleCount() const {
std::map<std::string, unsigned int> counterMap = this->parseCounterMetadata(); std::map<const std::string, unsigned int> counterMap = this->parseCounterMetadata();
unsigned int counter = 0; unsigned int counter = 0;
if (counterMap.empty()) { if (counterMap.empty()) {
@ -135,8 +135,8 @@ namespace kiwix {
} }
/* Get the count of medias content in the ZIM file */ /* Get the count of medias content in the ZIM file */
unsigned int Reader::getMediaCount() { unsigned int Reader::getMediaCount() const {
std::map<std::string, unsigned int> counterMap = this->parseCounterMetadata(); std::map<const std::string, unsigned int> counterMap = this->parseCounterMetadata();
unsigned int counter = 0; unsigned int counter = 0;
if (counterMap.empty()) if (counterMap.empty())
@ -161,43 +161,38 @@ namespace kiwix {
} }
/* Get the total of all items of a ZIM file, redirects included */ /* Get the total of all items of a ZIM file, redirects included */
unsigned int Reader::getGlobalCount() { unsigned int Reader::getGlobalCount() const {
return this->zimFileHandler->getCountArticles(); return this->zimFileHandler->getCountArticles();
} }
/* Return the UID of the ZIM file */ /* Return the UID of the ZIM file */
string Reader::getId() { string Reader::getId() const {
std::ostringstream s; std::ostringstream s;
s << this->zimFileHandler->getFileheader().getUuid(); s << this->zimFileHandler->getFileheader().getUuid();
return s.str(); return s.str();
} }
/* Return a page url from a title */ /* Return a page url from a title */
bool Reader::getPageUrlFromTitle(const string &title, string &url) { bool Reader::getPageUrlFromTitle(const string &title, string &url) const {
/* Extract the content from the zim file */ /* Extract the content from the zim file */
std::pair<bool, zim::File::const_iterator> resultPair = zimFileHandler->findxByTitle('A', title); zim::Article article = this->zimFileHandler->getArticleByTitle('A', title);
/* Test if the article was found */ if ( ! article.good() )
if (resultPair.first == true) { {
return false;
/* Get the article */
zim::Article article = *resultPair.second;
/* If redirect */
unsigned int loopCounter = 0;
while (article.isRedirect() && loopCounter++<42) {
article = article.getRedirectArticle();
}
url = article.getLongUrl();
return true;
} }
return false; unsigned int loopCounter = 0;
while (article.isRedirect() && loopCounter++<42) {
article = article.getRedirectArticle();
}
url = article.getLongUrl();
return true;
} }
/* Return the URL of a random page */ /* Return the URL of a random page */
string Reader::getRandomPageUrl() { string Reader::getRandomPageUrl() const {
zim::Article article; zim::Article article;
zim::size_type idx; zim::size_type idx;
std::string mainPageUrl = this->getMainPageUrl(); std::string mainPageUrl = this->getMainPageUrl();
@ -208,11 +203,11 @@ namespace kiwix {
article = zimFileHandler->getArticle(idx); article = zimFileHandler->getArticle(idx);
} while (article.getLongUrl() == mainPageUrl); } while (article.getLongUrl() == mainPageUrl);
return article.getLongUrl().c_str(); return article.getLongUrl();
} }
/* Return the welcome page URL */ /* Return the welcome page URL */
string Reader::getMainPageUrl() { string Reader::getMainPageUrl() const {
string url = ""; string url = "";
if (this->zimFileHandler->getFileheader().hasMainPage()) { if (this->zimFileHandler->getFileheader().hasMainPage()) {
@ -229,7 +224,7 @@ namespace kiwix {
return url; return url;
} }
bool Reader::getFavicon(string &content, string &mimeType) { bool Reader::getFavicon(string &content, string &mimeType) const {
unsigned int contentLength = 0; unsigned int contentLength = 0;
this->getContentByUrl( "/-/favicon.png", content, this->getContentByUrl( "/-/favicon.png", content,
@ -254,12 +249,12 @@ namespace kiwix {
return content.empty() ? false : true; return content.empty() ? false : true;
} }
string Reader::getZimFilePath() { string Reader::getZimFilePath() const {
return this->zimFilePath; return this->zimFilePath;
} }
/* Return a metatag value */ /* Return a metatag value */
bool Reader::getMetatag(const string &name, string &value) { bool Reader::getMetatag(const string &name, string &value) const {
unsigned int contentLength = 0; unsigned int contentLength = 0;
string contentType = ""; string contentType = "";
@ -267,7 +262,7 @@ namespace kiwix {
contentLength, contentType); contentLength, contentType);
} }
string Reader::getTitle() { string Reader::getTitle() const {
string value; string value;
this->getMetatag("Title", value); this->getMetatag("Title", value);
if (value.empty()) { if (value.empty()) {
@ -279,19 +274,19 @@ namespace kiwix {
return value; return value;
} }
string Reader::getName() { string Reader::getName() const {
string value; string value;
this->getMetatag("Name", value); this->getMetatag("Name", value);
return value; return value;
} }
string Reader::getTags() { string Reader::getTags() const {
string value; string value;
this->getMetatag("Tags", value); this->getMetatag("Tags", value);
return value; return value;
} }
string Reader::getDescription() { string Reader::getDescription() const{
string value; string value;
this->getMetatag("Description", value); this->getMetatag("Description", value);
@ -303,31 +298,31 @@ namespace kiwix {
return value; return value;
} }
string Reader::getLanguage() { string Reader::getLanguage() const {
string value; string value;
this->getMetatag("Language", value); this->getMetatag("Language", value);
return value; return value;
} }
string Reader::getDate() { string Reader::getDate() const {
string value; string value;
this->getMetatag("Date", value); this->getMetatag("Date", value);
return value; return value;
} }
string Reader::getCreator() { string Reader::getCreator() const {
string value; string value;
this->getMetatag("Creator", value); this->getMetatag("Creator", value);
return value; return value;
} }
string Reader::getPublisher() { string Reader::getPublisher() const {
string value; string value;
this->getMetatag("Publisher", value); this->getMetatag("Publisher", value);
return value; return value;
} }
string Reader::getOrigId() { string Reader::getOrigId() const {
string value; string value;
this->getMetatag("startfileuid", value); this->getMetatag("startfileuid", value);
if(value.empty()) if(value.empty())
@ -355,17 +350,13 @@ namespace kiwix {
} }
/* Return the first page URL */ /* Return the first page URL */
string Reader::getFirstPageUrl() { string Reader::getFirstPageUrl() const {
string url;
zim::size_type firstPageOffset = zimFileHandler->getNamespaceBeginOffset('A'); zim::size_type firstPageOffset = zimFileHandler->getNamespaceBeginOffset('A');
zim::Article article = zimFileHandler->getArticle(firstPageOffset); zim::Article article = zimFileHandler->getArticle(firstPageOffset);
url = article.getLongUrl(); return article.getLongUrl();
return url;
} }
bool Reader::parseUrl(const string &url, char *ns, string &title) { bool Reader::parseUrl(const string &url, char *ns, string &title) const {
/* Offset to visit the url */ /* Offset to visit the url */
unsigned int urlLength = url.size(); unsigned int urlLength = url.size();
unsigned int offset = 0; unsigned int offset = 0;
@ -395,130 +386,113 @@ namespace kiwix {
} }
/* Return article by url */ /* Return article by url */
bool Reader::getArticleObjectByDecodedUrl(const string &url, zim::Article &article) { bool Reader::getArticleObjectByDecodedUrl(const string &url, zim::Article &article) const {
bool retVal = false; if (this->zimFileHandler == NULL) {
return false;
if (this->zimFileHandler != NULL) {
/* Parse the url */
char ns = 0;
string titleStr;
this->parseUrl(url, &ns, titleStr);
/* Main page */
if (titleStr.empty() && ns == 0) {
this->parseUrl(this->getMainPageUrl(), &ns, titleStr);
}
/* Extract the content from the zim file */
std::pair<bool, zim::File::const_iterator> resultPair = zimFileHandler->findx(ns, titleStr);
/* Test if the article was found */
if (resultPair.first == true) {
article = zimFileHandler->getArticle(resultPair.second.getIndex());
retVal = true;
}
} }
return retVal; /* Parse the url */
char ns = 0;
string urlStr;
this->parseUrl(url, &ns, urlStr);
/* Main page */
if (urlStr.empty() && ns == 0) {
this->parseUrl(this->getMainPageUrl(), &ns, urlStr);
}
/* Extract the content from the zim file */
article = zimFileHandler->getArticle(ns, urlStr);
return article.good();
} }
/* Return the mimeType without the content */ /* Return the mimeType without the content */
bool Reader::getMimeTypeByUrl(const string &url, string &mimeType) { bool Reader::getMimeTypeByUrl(const string &url, string &mimeType) const {
bool retVal = false; if (this->zimFileHandler == NULL) {
return false;
if (this->zimFileHandler != NULL) {
zim::Article article;
if (this->getArticleObjectByDecodedUrl(url, article)) {
try {
mimeType = string(article.getMimeType().data(), article.getMimeType().size());
} catch (exception &e) {
cerr << "Unable to get the mimetype for "<< url << ":" << e.what() << endl;
mimeType = "application/octet-stream";
}
retVal = true;
} else {
mimeType = "";
}
} }
return retVal; zim::Article article;
if (this->getArticleObjectByDecodedUrl(url, article)) {
try {
mimeType = article.getMimeType();
} catch (exception &e) {
cerr << "Unable to get the mimetype for " << url << ":" << e.what() << endl;
mimeType = "application/octet-stream";
}
return true;
} else {
mimeType = "";
return false;
}
} }
/* Get a content from a zim file */ /* Get a content from a zim file */
bool Reader::getContentByUrl(const string &url, string &content, unsigned int &contentLength, string &contentType) { bool Reader::getContentByUrl(const string &url, string &content, unsigned int &contentLength, string &contentType) const {
return this->getContentByEncodedUrl(url, content, contentLength, contentType); return this->getContentByEncodedUrl(url, content, contentLength, contentType);
} }
bool Reader::getContentByEncodedUrl(const string &url, string &content, unsigned int &contentLength, string &contentType, string &baseUrl) { bool Reader::getContentByEncodedUrl(const string &url, string &content, unsigned int &contentLength, string &contentType, string &baseUrl) const {
return this->getContentByDecodedUrl(kiwix::urlDecode(url), content, contentLength, contentType, baseUrl); return this->getContentByDecodedUrl(kiwix::urlDecode(url), content, contentLength, contentType, baseUrl);
} }
bool Reader::getContentByEncodedUrl(const string &url, string &content, unsigned int &contentLength, string &contentType) { bool Reader::getContentByEncodedUrl(const string &url, string &content, unsigned int &contentLength, string &contentType) const {
std::string stubRedirectUrl; std::string stubRedirectUrl;
return this->getContentByEncodedUrl(kiwix::urlDecode(url), content, contentLength, contentType, stubRedirectUrl); return this->getContentByEncodedUrl(kiwix::urlDecode(url), content, contentLength, contentType, stubRedirectUrl);
} }
bool Reader::getContentByDecodedUrl(const string &url, string &content, unsigned int &contentLength, string &contentType) { bool Reader::getContentByDecodedUrl(const string &url, string &content, unsigned int &contentLength, string &contentType) const {
std::string stubRedirectUrl; std::string stubRedirectUrl;
return this->getContentByDecodedUrl(kiwix::urlDecode(url), content, contentLength, contentType, stubRedirectUrl); return this->getContentByDecodedUrl(kiwix::urlDecode(url), content, contentLength, contentType, stubRedirectUrl);
} }
bool Reader::getContentByDecodedUrl(const string &url, string &content, unsigned int &contentLength, string &contentType, string &baseUrl) { bool Reader::getContentByDecodedUrl(const string &url, string &content, unsigned int &contentLength, string &contentType, string &baseUrl) const {
bool retVal = false;
content=""; content="";
contentType=""; contentType="";
contentLength = 0; contentLength = 0;
if (this->zimFileHandler != NULL) {
zim::Article article; zim::Article article;
if (this->getArticleObjectByDecodedUrl(url, article)) { if ( ! this->getArticleObjectByDecodedUrl(url, article)) {
return false;
/* If redirect */
unsigned int loopCounter = 0;
while (article.isRedirect() && loopCounter++<42) {
article = article.getRedirectArticle();
}
if (loopCounter < 42) {
/* Compute base url (might be different from the url if redirects */
baseUrl = "/" + std::string(1, article.getNamespace()) + "/" + article.getUrl();
/* Get the content mime-type */
try {
contentType = string(article.getMimeType().data(), article.getMimeType().size());
} catch (exception &e) {
cerr << "Unable to get the mimetype for "<< baseUrl<< ":" << e.what() << endl;
contentType = "application/octet-stream";
}
/* Get the data */
content = string(article.getData().data(), article.getArticleSize());
}
/* Try to set a stub HTML header/footer if necesssary */
if (contentType.find("text/html") != string::npos &&
content.find("<body") == std::string::npos &&
content.find("<BODY") == std::string::npos) {
content = "<html><head><title>" + article.getTitle() + "</title><meta http-equiv=\"Content-Type\" content=\"text/html; charset=utf-8\" /></head><body>" + content + "</body></html>";
}
/* Get the data length */
contentLength = article.getArticleSize();
/* Set return value */
retVal = true;
}
} }
return retVal; /* If redirect */
unsigned int loopCounter = 0;
while (article.isRedirect() && loopCounter++<42) {
article = article.getRedirectArticle();
}
if (loopCounter < 42) {
/* Compute base url (might be different from the url if redirects */
baseUrl = "/" + std::string(1, article.getNamespace()) + "/" + article.getUrl();
/* Get the content mime-type */
try {
contentType = string(article.getMimeType().data(), article.getMimeType().size());
} catch (exception &e) {
cerr << "Unable to get the mimetype for "<< baseUrl<< ":" << e.what() << endl;
contentType = "application/octet-stream";
}
/* Get the data */
content = string(article.getData().data(), article.getArticleSize());
}
/* Try to set a stub HTML header/footer if necesssary */
if (contentType.find("text/html") != string::npos &&
content.find("<body") == std::string::npos &&
content.find("<BODY") == std::string::npos) {
content = "<html><head><title>" + article.getTitle() + "</title><meta http-equiv=\"Content-Type\" content=\"text/html; charset=utf-8\" /></head><body>" + content + "</body></html>";
}
/* Get the data length */
contentLength = article.getArticleSize();
return true;
} }
/* Check if an article exists */ /* Check if an article exists */
bool Reader::urlExists(const string &url) { bool Reader::urlExists(const string &url) const {
char ns = 0; char ns = 0;
string titleStr; string titleStr;
this->parseUrl(url, &ns, titleStr); this->parseUrl(url, &ns, titleStr);
@ -528,7 +502,7 @@ namespace kiwix {
} }
/* Does the ZIM file have a fulltext index */ /* Does the ZIM file have a fulltext index */
bool Reader::hasFulltextIndex() { bool Reader::hasFulltextIndex() const {
return this->urlExists("/Z/fulltextIndex/xapian"); return this->urlExists("/Z/fulltextIndex/xapian");
} }
@ -604,7 +578,7 @@ namespace kiwix {
return retVal; return retVal;
} }
std::vector<std::string> Reader::getTitleVariants(const std::string &title) { std::vector<std::string> Reader::getTitleVariants(const std::string &title) const {
std::vector<std::string> variants; std::vector<std::string> variants;
variants.push_back(title); variants.push_back(title);
variants.push_back(kiwix::ucFirst(title)); variants.push_back(kiwix::ucFirst(title));
@ -660,12 +634,12 @@ namespace kiwix {
} }
/* Check if the file has a checksum */ /* Check if the file has a checksum */
bool Reader::canCheckIntegrity() { bool Reader::canCheckIntegrity() const {
return this->zimFileHandler->getChecksum() != ""; return this->zimFileHandler->getChecksum() != "";
} }
/* Return true if corrupted, false otherwise */ /* Return true if corrupted, false otherwise */
bool Reader::isCorrupted() { bool Reader::isCorrupted() const {
try { try {
if (this->zimFileHandler->verify() == true) if (this->zimFileHandler->verify() == true)
return false; return false;
@ -678,7 +652,7 @@ namespace kiwix {
} }
/* Return the file size, also works for split files */ /* Return the file size, also works for split files */
unsigned int Reader::getFileSize() { unsigned int Reader::getFileSize() const {
zim::File *file = this->getZimFileHandler(); zim::File *file = this->getZimFileHandler();
zim::offset_type size = 0; zim::offset_type size = 0;

View File

@ -18,8 +18,11 @@
*/ */
#include "searcher.h" #include "searcher.h"
#include "reader.h"
#include "kiwixlib-resources.h" #include "kiwixlib-resources.h"
#include <zim/search.h>
#ifdef ENABLE_CTPP2 #ifdef ENABLE_CTPP2
#include <ctpp2/CDT.hpp> #include <ctpp2/CDT.hpp>
#include <ctpp2/CTPP2FileLogger.hpp> #include <ctpp2/CTPP2FileLogger.hpp>
@ -32,8 +35,39 @@ using namespace CTPP;
namespace kiwix { namespace kiwix {
/* Concrete Result backed by a libzim search iterator. Every accessor
 * forwards to the iterator copy stored in the object. */
class _Result : public Result {
    /* Searcher that created this result. */
    Searcher* searcher;
    /* Private copy of the iterator, taken at construction time. */
    zim::Search::iterator iterator;

  public:
    _Result(Searcher* searcher, zim::Search::iterator& iterator);
    virtual ~_Result() {}

    virtual std::string get_url();
    virtual std::string get_title();
    virtual int get_score();
    virtual std::string get_snippet();
    virtual int get_wordCount();
    virtual int get_size();
};
/* Private state of a Searcher: the libzim search in progress and the
 * position of the next result to hand out. */
struct SearcherInternal {
    /* Search object owned by this struct; NULL until one is assigned. */
    const zim::Search *_search;
    /* Cursor over the results of _search. */
    zim::Search::iterator current_iterator;

    SearcherInternal() : _search(NULL) {}

    /* Releases the owned search (deleting NULL is a no-op). */
    ~SearcherInternal() {
        delete _search;
    }

    /* _search is owned through a raw pointer, so a copy would make two
     * objects delete the same search. Forbid copying altogether. */
    SearcherInternal(const SearcherInternal&) = delete;
    SearcherInternal& operator=(const SearcherInternal&) = delete;
};
/* Constructor */ /* Constructor */
Searcher::Searcher() : Searcher::Searcher(Reader* reader) :
reader(reader),
internal(new SearcherInternal()),
searchPattern(""), searchPattern(""),
protocolPrefix("zim://"), protocolPrefix("zim://"),
searchProtocolPrefix("search://?"), searchProtocolPrefix("search://?"),
@ -47,7 +81,9 @@ namespace kiwix {
} }
/* Destructor */ /* Destructor */
Searcher::~Searcher() {} Searcher::~Searcher() {
delete internal;
}
/* Search strings in the database */ /* Search strings in the database */
void Searcher::search(std::string &search, unsigned int resultStart, void Searcher::search(std::string &search, unsigned int resultStart,
@ -80,12 +116,28 @@ namespace kiwix {
this->resultStart = resultStart; this->resultStart = resultStart;
this->resultEnd = resultEnd; this->resultEnd = resultEnd;
string unaccentedSearch = removeAccents(search); string unaccentedSearch = removeAccents(search);
searchInIndex(unaccentedSearch, resultStart, resultEnd, verbose); internal->_search = this->reader->getZimFileHandler()->search(unaccentedSearch, resultStart, resultEnd);
internal->current_iterator = internal->_search->begin();
this->estimatedResultCount = internal->_search->get_matches_estimated();
} }
return; return;
} }
/* Rewind the result cursor to the first result of the current search.
 * Does nothing when no search object exists yet. */
void Searcher::restart_search() {
    /* _search is NULL until a search has been assigned to it;
     * dereferencing it in that state would crash. */
    if (internal->_search == NULL) {
        return;
    }
    internal->current_iterator = internal->_search->begin();
}
/* Return the next result of the current search and advance the cursor.
 *
 * The result is allocated with new and is not retained here, so the caller
 * is responsible for deleting it. Returns NULL when the results are
 * exhausted or when no search object exists yet. */
Result* Searcher::getNextResult() {
    /* Check _search first: it is NULL until a search has been assigned,
     * and calling end() through it would crash. */
    if (internal->_search != NULL &&
        internal->current_iterator != internal->_search->end()) {
        Result* result = new _Result(this, internal->current_iterator);
        internal->current_iterator++;
        return result;
    }
    return NULL;
}
/* Reset the results */ /* Reset the results */
void Searcher::reset() { void Searcher::reset() {
this->estimatedResultCount = 0; this->estimatedResultCount = 0;
@ -112,6 +164,36 @@ namespace kiwix {
this->contentHumanReadableId = contentHumanReadableId; this->contentHumanReadableId = contentHumanReadableId;
} }
/* Store the owning searcher and keep a private copy of the iterator. */
_Result::_Result(Searcher* searcher, zim::Search::iterator& iterator)
    : searcher(searcher), iterator(iterator) {}
std::string _Result::get_url() {
return iterator.get_url();
}
std::string _Result::get_title() {
return iterator.get_title();
}
int _Result::get_score() {
return iterator.get_score();
}
std::string _Result::get_snippet() {
return iterator.get_snippet();
}
int _Result::get_size() {
return iterator.get_size();
}
int _Result::get_wordCount() {
return iterator.get_wordCount();
}
#ifdef ENABLE_CTPP2 #ifdef ENABLE_CTPP2
string Searcher::getHtml() { string Searcher::getHtml() {