From 1f3fcd85a0973fdfa0aa74ea0d31ebf13a218bba Mon Sep 17 00:00:00 2001 From: Matthieu Gautier Date: Thu, 15 Mar 2018 15:27:13 +0100 Subject: [PATCH 1/4] Allow us to declare method to be deprecated. --- include/common.h | 24 ++++++++++++++++++++++++ include/meson.build | 1 + 2 files changed, 25 insertions(+) create mode 100644 include/common.h diff --git a/include/common.h b/include/common.h new file mode 100644 index 000000000..5df556e9d --- /dev/null +++ b/include/common.h @@ -0,0 +1,24 @@ + +#ifndef _KIWIX_COMMON_H_ +#define _KIWIX_COMMON_H_ + +#include + +#ifdef __GNUC__ +#define DEPRECATED __attribute__((deprecated)) +#elif defined(_MSC_VER) +#define DEPRECATED __declspec(deprecated) +#else +#pragma message("WARNING: You need to implement DEPRECATED for this compiler") +#define DEPRECATED +#endif + + +namespace kiwix { + +typedef zim::size_type size_type; +typedef zim::offset_type offset_type; + +} + +#endif //_KIWIX_COMMON_H_ diff --git a/include/meson.build b/include/meson.build index a3d6b1ea1..2e376c617 100644 --- a/include/meson.build +++ b/include/meson.build @@ -1,4 +1,5 @@ headers = [ + 'common.h', 'library.h', 'manager.h', 'opds_dumper.h', From 135028c16a835df640ca835a99255ce8e53d781c Mon Sep 17 00:00:00 2001 From: Matthieu Gautier Date: Thu, 15 Mar 2018 15:35:59 +0100 Subject: [PATCH 2/4] Introduce better API to manipulate entries in a zim file. MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The previous API suffered from several problems: - It was difficult to handle articles redirecting to other articles. - It was not possible to get a small piece of information (e.g. the title) without getting the whole content. The new API introduces the new class `Entry` that acts as a proxy to an article in the zim file. Methods of `Reader` now return an `Entry` and the user has to call `Entry`'s methods to get useful information. No redirection is followed implicitly. 
If an entry is not found, an exception is raised instead of returning an invalid `Entry`. The common pattern to get the content of an entry becomes: ``` std::string content; try { auto entry = reader.getEntryFromPath(path); entry = entry.getFinalEntry(); content = entry.getContent(); } catch (NoEntry& e) { ... } ``` Older methods are kept (with the same behavior) but are marked as deprecated. --- include/entry.h | 191 ++++++++++++++++++++ include/meson.build | 1 + include/reader.h | 154 +++++++++++----- src/android/kiwixreader.cpp | 80 +++------ src/entry.cpp | 138 +++++++++++++++ src/meson.build | 1 + src/reader.cpp | 345 ++++++++++++++++++++++-------------- src/xapianSearcher.cpp | 9 +- 8 files changed, 684 insertions(+), 235 deletions(-) create mode 100644 include/entry.h create mode 100644 src/entry.cpp diff --git a/include/entry.h b/include/entry.h new file mode 100644 index 000000000..af9067e29 --- /dev/null +++ b/include/entry.h @@ -0,0 +1,191 @@ +/* + * Copyright 2018 Matthieu Gautier + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 3 of the License, or + * any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, + * MA 02110-1301, USA. + */ + +#ifndef KIWIX_ENTRY_H +#define KIWIX_ENTRY_H + +#include +#include +#include +#include +#include "common.h" + +using namespace std; + +namespace kiwix +{ + + +class NoEntry : public std::exception {}; + +/** + * An entry represents an article in a zim file. 
+ */ +class Entry +{ + public: + /** + * Default constructor. + * + * Construct an invalid entry. + */ + Entry() = default; + + /** + * Construct an entry making reference to an zim article. + * + * @param article + */ + Entry(zim::Article article); + virtual ~Entry() = default; + + /** + * Get the path of the entry. + * + * The path is the "key" of an entry. + * + * @return the path of the entry. + */ + std::string getPath() const; + + /** + * Get the title of the entry. + * + * @return the title of the entry. + */ + std::string getTitle() const; + + /** + * Get the content of the entry. + * + * The string is a copy of the content. + * If you don't want to do a copy, use get_blob. + * + * @return the content of the entry. + */ + std::string getContent() const; + + /** + * Get the blob of the entry. + * + * A blob make reference to the content without copying it. + * + * @param offset The starting offset of the blob. + * @return the blob of the entry. + */ + zim::Blob getBlob(offset_type offset = 0) const; + + /** + * Get the blob of the entry. + * + * A blob make reference to the content without copying it. + * + * @param offset The starting offset of the blob. + * @param size The size of the blob. + * @return the blob of the entry. + */ + zim::Blob getBlob(offset_type offset, size_type size) const; + + /** + * Get the info for direct access to the content of the entry. + * + * Some entry (ie binary ones) have their content plain stored + * in the zim file. Knowing the offset where the content is stored + * an user can directly read the content in the zim file bypassing the + * kiwix-lib/libzim. + * + * @return A pair specifying where to read the content. + * The string is the real file to read (may be different that .zim + * file if zim is cut). + * The offset is the offset to read in the file. + * Return <"",0> if is not possible to read directly. + */ + std::pair getDirectAccessInfo() const; + + /** + * Get the size of the entry. 
+ * + * @return the size of the entry. + */ + size_type getSize() const; + + /** + * Get the mime_type of the entry. + * + * @return the mime_type of the entry. + */ + std::string getMimetype() const; + + + /** + * Get if the entry is a redirect entry. + * + * @return True if the entry is a redirect. + */ + bool isRedirect() const; + + /** + * Get if the entry is a link target entry. + * + * @return True if the entry is a link target. + */ + bool isLinkTarget() const; + + /** + * Get if the entry is a deleted entry. + * + * @return True if the entry is a deleted entry. + */ + bool isDeleted() const; + + /** + * Get the entry pointed by this entry. + * + * @return the entry pointed. + * @throw NoEntry if the entry is not a redirected entry. + */ + Entry getRedirectEntry() const; + + /** + * Get the final entry pointed by this entry. + * + * Follow the redirection until a "not redirecting" entry is found. + * If the entry is not a redirected entry, return the entry itself. + * + * @return the final entry. + */ + Entry getFinalEntry() const; + + /** + * Convert the entry to a boolean value. + * + * @return True if the entry is valid. 
+ */ + explicit operator bool() const { return good(); } + + private: + zim::Article article; + mutable zim::Article final_article; + + bool good() const { return article.good(); } +}; + +} + +#endif // KIWIX_ENTRY_H diff --git a/include/meson.build b/include/meson.build index 2e376c617..1aca8b9ab 100644 --- a/include/meson.build +++ b/include/meson.build @@ -5,6 +5,7 @@ headers = [ 'opds_dumper.h', 'downloader.h', 'reader.h', + 'entry.h', 'searcher.h' ] diff --git a/include/reader.h b/include/reader.h index d522c6923..52f8c0ec2 100644 --- a/include/reader.h +++ b/include/reader.h @@ -29,6 +29,8 @@ #include #include #include +#include "common.h" +#include "entry.h" #include "common/pathTools.h" #include "common/stringTools.h" @@ -38,7 +40,7 @@ namespace kiwix { /** - * The Reader class is the class who allow to get an article content from a zim + * The Reader class is the class who allow to get an entry content from a zim * file. */ class Reader @@ -57,11 +59,11 @@ class Reader ~Reader(); /** - * Get the number of "displayable" articles in the zim file. + * Get the number of "displayable" entries in the zim file. * * @return If the zim file has a /M/Counter metadata, return the number of - * articles with the 'text/html' MIMEtype specified in the metadata. - * Else return the number of articles in the 'A' namespace. + * entries with the 'text/html' MIMEtype specified in the metadata. + * Else return the number of entries in the 'A' namespace. */ unsigned int getArticleCount() const; @@ -69,16 +71,16 @@ class Reader * Get the number of media in the zim file. * * @return If the zim file has a /M/Counter metadata, return the number of - * articles with the 'image/jpeg', 'image/gif' and 'image/png' in + * entries with the 'image/jpeg', 'image/gif' and 'image/png' in * the metadata. - * Else return the number of articles in the 'I' namespace. + * Else return the number of entries in the 'I' namespace. 
*/ unsigned int getMediaCount() const; /** - * Get the number of all articles in the zim file. + * Get the number of all entries in the zim file. * - * @return Return the number of all the articles, whatever their MIMEtype or + * @return Return the number of all the entries, whatever their MIMEtype or * their namespace. */ unsigned int getGlobalCount() const; @@ -100,25 +102,54 @@ class Reader /** * Get the url of a random page. * - * @return Url of a random page. The page is picked from all articles in + * Deprecated : Use `getRandomPage` instead. + * + * @return Url of a random page. The page is picked from all entries in * the 'A' namespace. * The main page is excluded from the potential results. */ - string getRandomPageUrl() const; + DEPRECATED string getRandomPageUrl() const; + + /** + * Get a random page. + * + * @return A random Entry. The entry is picked from all entries in + * the 'A' namespace. + * The main entry is excluded from the potential results. + */ + Entry getRandomPage() const; /** * Get the url of the first page. * - * @return Url of the first article in the 'A' namespace. + * Deprecated : Use `getFirstPage` instead. + * + * @return Url of the first entry in the 'A' namespace. */ - string getFirstPageUrl() const; + DEPRECATED string getFirstPageUrl() const; + + /** + * Get the entry of the first page. + * + * @return The first entry in the 'A' namespace. + */ + Entry getFirstPage() const; /** * Get the url of the main page. * + * Deprecated : Use `getMainPage` instead. + * * @return Url of the main page as specified in the zim file. */ - string getMainPageUrl() const; + DEPRECATED string getMainPageUrl() const; + + /** + * Get the entry of the main page. + * + * @return Entry of the main page as specified in the zim file. + */ + Entry getMainPage() const; /** * Get the content of a metadata. @@ -207,6 +238,35 @@ class Reader */ bool getFavicon(string& content, string& mimeType) const; + /** + * Get an entry associated to an path. 
+ * + * @param path The path of the entry. + * @return The entry. + * @throw NoEntry If no entry corresponds to the path. + */ + Entry getEntryFromPath(const std::string& path) const; + + /** + * Get an entry associated to an url encoded path. + * + * Equivalent to `getEntryFromPath(urlDecode(path));` + * + * @param path The url encoded path. + * @return The entry. + * @throw NoEntry If no entry corresponds to the path. + */ + Entry getEntryFromEncodedPath(const std::string& path) const; + + /** + * Get an entry associated to a title. + * + * @param title The title. + * @return The entry. + * @throw NoEntry If no entry corresponds to the title. + */ + Entry getEntryFromTitle(const std::string& title) const; + /** * Get the url of a page specified by a title. * @@ -214,34 +274,34 @@ class Reader * @param[out] url the url of the page. * @return True if the page can be found. */ - bool getPageUrlFromTitle(const string& title, string& url) const; + DEPRECATED bool getPageUrlFromTitle(const string& title, string& url) const; /** - * Get the mimetype of a article specified by a url. + * Get the mimetype of an entry specified by a url. * - * @param[in] url the url of the article. - * @param[out] mimetype the mimeType of the article. + * @param[in] url the url of the entry. + * @param[out] mimetype the mimeType of the entry. * @return True if the mimeType has been found. */ - bool getMimeTypeByUrl(const string& url, string& mimeType) const; + DEPRECATED bool getMimeTypeByUrl(const string& url, string& mimeType) const; /** - * Get the content of an article specifed by a url. + * Get the content of an entry specified by a url. * * Alias to `getContentByEncodedUrl` */ - bool getContentByUrl(const string& url, + DEPRECATED bool getContentByUrl(const string& url, string& content, string& title, unsigned int& contentLength, string& contentType) const; /** - * Get the content of an article specified by a url encoded url. + * Get the content of an entry specified by a url encoded url. 
* * Equivalent to getContentByDecodedUrl(urlDecode(url), ...). */ - bool getContentByEncodedUrl(const string& url, + DEPRECATED bool getContentByEncodedUrl(const string& url, string& content, string& title, unsigned int& contentLength, @@ -249,48 +309,48 @@ class Reader string& baseUrl) const; /** - * Get the content of an article specified by an url encoded url. + * Get the content of an entry specified by an url encoded url. * * Equivalent to getContentByEncodedUrl but without baseUrl. */ - bool getContentByEncodedUrl(const string& url, + DEPRECATED bool getContentByEncodedUrl(const string& url, string& content, string& title, unsigned int& contentLength, string& contentType) const; /** - * Get the content of an article specified by a url. + * Get the content of an entry specified by a url. * - * @param[in] url The url of the article. - * @param[out] content The content of the article. - * @param[out] title the title of the article. - * @param[out] contentLength The size of the article (size of content). - * @param[out] contentType The mimeType of the article. - * @param[out] baseUrl Return the true url of the article. - * If the specified article is a redirection, contains - * the url of the targeted article. - * @return True if the article has been found. + * @param[in] url The url of the entry. + * @param[out] content The content of the entry. + * @param[out] title the title of the entry. + * @param[out] contentLength The size of the entry (size of content). + * @param[out] contentType The mimeType of the entry. + * @param[out] baseUrl Return the true url of the entry. + * If the specified entry is a redirection, contains + * the url of the targeted entry. + * @return True if the entry has been found. 
*/ - bool getContentByDecodedUrl(const string& url, + DEPRECATED bool getContentByDecodedUrl(const string& url, string& content, string& title, unsigned int& contentLength, string& contentType, string& baseUrl) const; /** - * Get the content of an article specified by a url. + * Get the content of an entry specified by a url. * * Equivalent to getContentByDecodedUrl but withou the baseUrl. */ - bool getContentByDecodedUrl(const string& url, + DEPRECATED bool getContentByDecodedUrl(const string& url, string& content, string& title, unsigned int& contentLength, string& contentType) const; /** - * Search for articles with title starting with prefix (case sensitive). + * Search for entries with title starting with prefix (case sensitive). * * Suggestions are stored in an internal vector and can be retrieved using * `getNextSuggestion` method. @@ -308,7 +368,7 @@ class Reader const bool reset = true); /** - * Search for articles for the given prefix. + * Search for entries for the given prefix. * * If the zim file has a internal fulltext index, the suggestions will be * searched using it. @@ -328,10 +388,20 @@ class Reader /** * Check if the url exists in the zim file. * + * Deprecated : Use `pathExists` instead. + * * @param url the url to check. * @return True if the url exits in the zim file. */ - bool urlExists(const string& url) const; + DEPRECATED bool urlExists(const string& url) const; + + /** + * Check if the path exists in the zim file. + * + * @param path the path to check. + * @return True if the path exists in the zim file. + */ + bool pathExists(const string& path) const; /** * Check if the zim file has a embedded fulltext index. @@ -388,7 +458,7 @@ class Reader * @param[out] title The url (url). * @return True */ - bool parseUrl(const string& url, char* ns, string& title) const; + DEPRECATED bool parseUrl(const string& url, char* ns, string& title) const; /** * Return the total size of the zim file. 
@@ -413,7 +483,7 @@ class Reader * @param[out] article The libzim article object. * @return True if the url is good (article.good()). */ - bool getArticleObjectByDecodedUrl(const string& url, + DEPRECATED bool getArticleObjectByDecodedUrl(const string& url, zim::Article& article) const; protected: diff --git a/src/android/kiwixreader.cpp b/src/android/kiwixreader.cpp index ba3e562f4..769f25286 100644 --- a/src/android/kiwixreader.cpp +++ b/src/android/kiwixreader.cpp @@ -60,7 +60,7 @@ Java_org_kiwix_kiwixlib_JNIKiwixReader_getMainPage(JNIEnv* env, jobject obj) jstring url; try { - std::string cUrl = READER->getMainPageUrl(); + std::string cUrl = READER->getMainPage().getPath(); url = c2jni(cUrl, env); } catch (...) { std::cerr << "Unable to get ZIM main page" << std::endl; @@ -196,8 +196,8 @@ JNIEXPORT jstring JNICALL Java_org_kiwix_kiwixlib_JNIKiwixReader_getMimeType( std::string cUrl = jni2c(url, env); try { - std::string cMimeType; - READER->getMimeTypeByUrl(cUrl, cMimeType); + auto entry = READER->getEntryFromEncodedPath(cUrl); + auto cMimeType = entry.getMimetype(); mimeType = c2jni(cMimeType, env); } catch (...) 
{ std::cerr << "Unable to get mime-type for url " << cUrl << std::endl; @@ -216,20 +216,20 @@ JNIEXPORT jbyteArray JNICALL Java_org_kiwix_kiwixlib_JNIKiwixReader_getContent( /* Retrieve the content */ std::string cUrl = jni2c(url, env); - std::string cData; - std::string cTitle; - std::string cMimeType; unsigned int cSize = 0; try { - if (READER->getContentByUrl(cUrl, cData, cTitle, cSize, cMimeType)) { - data = env->NewByteArray(cSize); - env->SetByteArrayRegion( - data, 0, cSize, reinterpret_cast(cData.c_str())); - setStringObjValue(cMimeType, mimeTypeObj, env); - setStringObjValue(cTitle, titleObj, env); - setIntObjValue(cSize, sizeObj, env); - } + auto entry = READER->getEntryFromEncodedPath(cUrl); + entry = entry.getFinalEntry(); + cSize = entry.getSize(); + setIntObjValue(cSize, sizeObj, env); + + data = env->NewByteArray(cSize); + env->SetByteArrayRegion( + data, 0, cSize, reinterpret_cast(entry.getBlob().data())); + + setStringObjValue(entry.getMimetype(), mimeTypeObj, env); + setStringObjValue(entry.getTitle(), titleObj, env); } catch (...) { std::cerr << "Unable to get content for url " << cUrl << std::endl; } @@ -249,22 +249,13 @@ JNIEXPORT jbyteArray JNICALL Java_org_kiwix_kiwixlib_JNIKiwixReader_getContentPa unsigned int cOffset = jni2c(offset); unsigned int cLen = jni2c(len); try { - zim::Article article; - READER->getArticleObjectByDecodedUrl(kiwix::urlDecode(cUrl), article); - if (! 
article.good()) { - return data; - } - int loopCounter = 0; - while (article.isRedirect() && ++loopCounter < 42) { - article = article.getRedirectArticle(); - } - if (loopCounter == 42) { - return data; - } + auto entry = READER->getEntryFromEncodedPath(cUrl); + entry = entry.getFinalEntry(); + if (cLen == 0) { - setIntObjValue(article.getArticleSize(), sizeObj, env); - } else if (cOffset+cLen > article.getArticleSize()) { - auto blob = article.getData(cOffset, cLen); + setIntObjValue(entry.getSize(), sizeObj, env); + } else if (cOffset+cLen <= entry.getSize()) { + auto blob = entry.getBlob(cOffset, cLen); data = env->NewByteArray(cLen); env->SetByteArrayRegion( data, 0, cLen, reinterpret_cast(blob.data())); @@ -288,20 +279,9 @@ Java_org_kiwix_kiwixlib_JNIKiwixReader_getDirectAccessInformation( std::string cUrl = jni2c(url, env); try { - zim::Article article; - READER->getArticleObjectByDecodedUrl(kiwix::urlDecode(cUrl), article); - if (! article.good()) { - return pair; - } - int loopCounter = 0; - while (article.isRedirect() && ++loopCounter < 42) { - article = article.getRedirectArticle(); - } - if (loopCounter == 42) { - return pair; - } - - auto part_info = article.getDirectAccessInformation(); + auto entry = READER->getEntryFromEncodedPath(cUrl); + entry = entry.getFinalEntry(); + auto part_info = entry.getDirectAccessInfo(); setPairObjValue(part_info.first, part_info.second, pair, env); } catch (...) 
{ std::cerr << "Unable to locate direct access information for url " << cUrl @@ -359,20 +339,18 @@ Java_org_kiwix_kiwixlib_JNIKiwixReader_getPageUrlFromTitle(JNIEnv* env, jstring title, jobject urlObj) { - jboolean retVal = JNI_FALSE; std::string cTitle = jni2c(title, env); - std::string cUrl; try { - if (READER->getPageUrlFromTitle(cTitle, cUrl)) { - setStringObjValue(cUrl, urlObj, env); - retVal = JNI_TRUE; - } + auto entry = READER->getEntryFromTitle(cTitle); + entry = entry.getFinalEntry(); + setStringObjValue(entry.getPath(), urlObj, env); + return JNI_TRUE; } catch (...) { std::cerr << "Unable to get URL for title " << cTitle << std::endl; } - return retVal; + return JNI_FALSE; } JNIEXPORT jstring JNICALL Java_org_kiwix_kiwixlib_JNIKiwixReader_getTitle( @@ -410,7 +388,7 @@ JNIEXPORT jboolean JNICALL Java_org_kiwix_kiwixlib_JNIKiwixReader_getRandomPage( std::string cUrl; try { - std::string cUrl = READER->getRandomPageUrl(); + std::string cUrl = READER->getRandomPage().getPath(); setStringObjValue(cUrl, urlObj, env); retVal = JNI_TRUE; } catch (...) { diff --git a/src/entry.cpp b/src/entry.cpp new file mode 100644 index 000000000..6436aa181 --- /dev/null +++ b/src/entry.cpp @@ -0,0 +1,138 @@ +/* + * Copyright 2011 Emmanuel Engelhart + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 3 of the License, or + * any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, + * MA 02110-1301, USA. 
+ */ + +#include "reader.h" +#include + +#include + +namespace kiwix +{ + +Entry::Entry(zim::Article article) + : article(article) +{ +} + +#define RETURN_IF_INVALID(WHAT) if(!good()) { return (WHAT); } + +std::string Entry::getPath() const +{ + RETURN_IF_INVALID(""); + return article.getLongUrl(); +} + +std::string Entry::getTitle() const +{ + RETURN_IF_INVALID(""); + return article.getTitle(); +} + +std::string Entry::getContent() const +{ + RETURN_IF_INVALID(""); + return article.getData(); +} + +zim::Blob Entry::getBlob(offset_type offset) const +{ + RETURN_IF_INVALID(zim::Blob()); + return article.getData(offset); +} + +zim::Blob Entry::getBlob(offset_type offset, size_type size) const +{ + RETURN_IF_INVALID(zim::Blob()); + return article.getData(offset, size); +} + +std::pair Entry::getDirectAccessInfo() const +{ + RETURN_IF_INVALID(std::make_pair("", 0)); + return article.getDirectAccessInformation(); +} + +size_type Entry::getSize() const +{ + RETURN_IF_INVALID(0); + return article.getArticleSize(); +} + +std::string Entry::getMimetype() const +{ + RETURN_IF_INVALID(""); + try { + return article.getMimeType(); + } catch (exception& e) { + return "application/octet-stream"; + } +} + +bool Entry::isRedirect() const +{ + RETURN_IF_INVALID(false); + return article.isRedirect(); +} + +bool Entry::isLinkTarget() const +{ + RETURN_IF_INVALID(false); + return article.isLinktarget(); +} + +bool Entry::isDeleted() const +{ + RETURN_IF_INVALID(false); + return article.isDeleted(); +} + +Entry Entry::getRedirectEntry() const +{ + RETURN_IF_INVALID(Entry()); + if ( !article.isRedirect() ) { + throw NoEntry(); + } + + auto targeted_article = article.getRedirectArticle(); + if ( !targeted_article.good()) { + throw NoEntry(); + } + return targeted_article; +} + +Entry Entry::getFinalEntry() const +{ + RETURN_IF_INVALID(Entry()); + if (final_article.good()) { + return final_article; + } + + int loopCounter = 42; + final_article = article; + while (final_article.isRedirect() 
&& loopCounter--) { + final_article = final_article.getRedirectArticle(); + if ( !final_article.good()) { + throw NoEntry(); + } + } + + return final_article; +} + +} diff --git a/src/meson.build b/src/meson.build index d3df0f4ad..f2be42e29 100644 --- a/src/meson.build +++ b/src/meson.build @@ -4,6 +4,7 @@ kiwix_sources = [ 'opds_dumper.cpp', 'downloader.cpp', 'reader.cpp', + 'entry.cpp', 'searcher.cpp', 'common/base64.cpp', 'common/pathTools.cpp', diff --git a/src/reader.cpp b/src/reader.cpp index 684b0ab84..0103366be 100644 --- a/src/reader.cpp +++ b/src/reader.cpp @@ -190,79 +190,88 @@ string Reader::getId() const /* Return a page url from a title */ bool Reader::getPageUrlFromTitle(const string& title, string& url) const { - /* Extract the content from the zim file */ - zim::Article article = this->zimFileHandler->getArticleByTitle('A', title); - - if (!article.good()) { + try { + auto entry = getEntryFromTitle(title); + entry = entry.getFinalEntry(); + url = entry.getPath(); + return true; + } catch (NoEntry& e) { return false; } - - unsigned int loopCounter = 0; - while (article.isRedirect() && loopCounter++ < 42) { - article = article.getRedirectArticle(); - } - - url = article.getLongUrl(); - return true; } /* Return an URL from a title */ string Reader::getRandomPageUrl() const { + return getRandomPage().getPath(); +} + +Entry Reader::getRandomPage() const +{ + if (!this->zimFileHandler) { + throw NoEntry(); + } + zim::Article article; - zim::size_type idx; - std::string mainPageUrl = this->getMainPageUrl(); + std::string mainPagePath = this->getMainPage().getPath(); + int watchdog = 42; do { - idx = this->firstArticleOffset + auto idx = this->firstArticleOffset + (zim::size_type)((double)rand() / ((double)RAND_MAX + 1) * this->nsACount); article = zimFileHandler->getArticle(idx); - } while (article.getLongUrl() == mainPageUrl); + if (!watchdog--) { + throw NoEntry(); + } + } while (!article.good() || article.getLongUrl() == mainPagePath); - return 
article.getLongUrl(); + return article; } /* Return the welcome page URL */ string Reader::getMainPageUrl() const { - string url = ""; + return getMainPage().getPath(); +} - if (this->zimFileHandler->getFileheader().hasMainPage()) { - zim::Article article = zimFileHandler->getArticle( - this->zimFileHandler->getFileheader().getMainPage()); - url = article.getLongUrl(); - - if (url.empty()) { - url = getFirstPageUrl(); - } - } else { - url = getFirstPageUrl(); +Entry Reader::getMainPage() const +{ + if (!this->zimFileHandler) { + throw NoEntry(); } - return url; + string url = ""; + + zim::Article article; + if (this->zimFileHandler->getFileheader().hasMainPage()) + { + article = zimFileHandler->getArticle( + this->zimFileHandler->getFileheader().getMainPage()); + } + + if (!article.good()) + { + return getFirstPage(); + } + + return article; } bool Reader::getFavicon(string& content, string& mimeType) const { - unsigned int contentLength = 0; - string title; + static const char* const paths[] = {"-/favicon.png", "I/favicon.png", "I/favicon", "-/favicon"}; - this->getContentByUrl("/-/favicon.png", content, title, contentLength, mimeType); - - if (content.empty()) { - this->getContentByUrl("/I/favicon.png", content, title, contentLength, mimeType); - - if (content.empty()) { - this->getContentByUrl("/I/favicon", content, title, contentLength, mimeType); - - if (content.empty()) { - this->getContentByUrl("/-/favicon", content, title, contentLength, mimeType); - } - } + for (auto &path: paths) { + try { + auto entry = getEntryFromPath(path); + content = entry.getContent(); + mimeType = entry.getMimetype(); + return true; + } catch(NoEntry& e) {}; } - return content.empty() ? 
false : true; + return false; } string Reader::getZimFilePath() const @@ -272,11 +281,13 @@ string Reader::getZimFilePath() const /* Return a metatag value */ bool Reader::getMetatag(const string& name, string& value) const { - unsigned int contentLength = 0; - string contentType = ""; - string title; - - return this->getContentByUrl("/M/" + name, value, title, contentLength, contentType); + try { + auto entry = getEntryFromPath("M/"+name); + value = entry.getContent(); + return true; + } catch(NoEntry& e) { + return false; + } } string Reader::getTitle() const @@ -375,12 +386,26 @@ string Reader::getOrigId() const /* Return the first page URL */ string Reader::getFirstPageUrl() const { - zim::size_type firstPageOffset = zimFileHandler->getNamespaceBeginOffset('A'); - zim::Article article = zimFileHandler->getArticle(firstPageOffset); - return article.getLongUrl(); + return getFirstPage().getPath(); } -bool Reader::parseUrl(const string& url, char* ns, string& title) const +Entry Reader::getFirstPage() const +{ + if (!this->zimFileHandler) { + throw NoEntry(); + } + + auto firstPageOffset = zimFileHandler->getNamespaceBeginOffset('A'); + auto article = zimFileHandler->getArticle(firstPageOffset); + + if (! 
article.good()) { + throw NoEntry(); + } + + return article; +} + +bool _parseUrl(const string& url, char* ns, string& title) { /* Offset to visit the url */ unsigned int urlLength = url.size(); @@ -414,6 +439,52 @@ bool Reader::parseUrl(const string& url, char* ns, string& title) const return true; } +bool Reader::parseUrl(const string& url, char* ns, string& title) const +{ + return _parseUrl(url, ns, title); +} + +Entry Reader::getEntryFromPath(const std::string& path) const +{ + char ns = 0; + std::string short_url; + + if (!this->zimFileHandler) { + throw NoEntry(); + } + _parseUrl(path, &ns, short_url); + + if (short_url.empty() && ns == 0) { + return getMainPage(); + } + + auto article = zimFileHandler->getArticle(ns, short_url); + if (!article.good()) { + throw NoEntry(); + } + + return article; +} + +Entry Reader::getEntryFromEncodedPath(const std::string& path) const +{ + return getEntryFromPath(urlDecode(path)); +} + +Entry Reader::getEntryFromTitle(const std::string& title) const +{ + if (!this->zimFileHandler) { + throw NoEntry(); + } + + auto article = this->zimFileHandler->getArticleByTitle('A', title); + if (!article.good()) { + throw NoEntry(); + } + + return article; +} + /* Return article by url */ bool Reader::getArticleObjectByDecodedUrl(const string& url, zim::Article& article) const @@ -425,11 +496,11 @@ bool Reader::getArticleObjectByDecodedUrl(const string& url, /* Parse the url */ char ns = 0; string urlStr; - this->parseUrl(url, &ns, urlStr); + _parseUrl(url, &ns, urlStr); /* Main page */ if (urlStr.empty() && ns == 0) { - this->parseUrl(this->getMainPageUrl(), &ns, urlStr); + _parseUrl(this->getMainPage().getPath(), &ns, urlStr); } /* Extract the content from the zim file */ @@ -440,26 +511,53 @@ bool Reader::getArticleObjectByDecodedUrl(const string& url, /* Return the mimeType without the content */ bool Reader::getMimeTypeByUrl(const string& url, string& mimeType) const { - if (this->zimFileHandler == NULL) { - return false; - } - - 
zim::Article article; - if (this->getArticleObjectByDecodedUrl(url, article)) { - try { - mimeType = article.getMimeType(); - } catch (exception& e) { - cerr << "Unable to get the mimetype for " << url << ":" << e.what() - << endl; - mimeType = "application/octet-stream"; - } + try { + auto entry = getEntryFromPath(url); + mimeType = entry.getMimetype(); return true; - } else { + } catch (NoEntry& e) { mimeType = ""; return false; } } +bool get_content_by_decoded_url(const Reader& reader, + const string& url, + string& content, + string& title, + unsigned int& contentLength, + string& contentType, + string& baseUrl) +{ + content = ""; + contentType = ""; + contentLength = 0; + + try { + auto entry = reader.getEntryFromPath(url); + entry = entry.getFinalEntry(); + baseUrl = entry.getPath(); + contentType = entry.getMimetype(); + content = entry.getContent(); + contentLength = entry.getSize(); + title = entry.getTitle(); + + /* Try to set a stub HTML header/footer if necesssary */ + if (contentType.find("text/html") != string::npos + && content.find("" + + content + ""; + } + return true; + } catch (NoEntry& e) { + return false; + } +} + + /* Get a content from a zim file */ bool Reader::getContentByUrl(const string& url, string& content, @@ -467,7 +565,14 @@ bool Reader::getContentByUrl(const string& url, unsigned int& contentLength, string& contentType) const { - return this->getContentByEncodedUrl(url, content, title, contentLength, contentType); + std::string stubRedirectUrl; + return get_content_by_decoded_url(*this, + kiwix::urlDecode(url), + content, + title, + contentLength, + contentType, + stubRedirectUrl); } bool Reader::getContentByEncodedUrl(const string& url, @@ -477,8 +582,13 @@ bool Reader::getContentByEncodedUrl(const string& url, string& contentType, string& baseUrl) const { - return this->getContentByDecodedUrl( - kiwix::urlDecode(url), content, title, contentLength, contentType, baseUrl); + return get_content_by_decoded_url(*this, + 
kiwix::urlDecode(url), + content, + title, + contentLength, + contentType, + baseUrl); } bool Reader::getContentByEncodedUrl(const string& url, @@ -488,12 +598,13 @@ bool Reader::getContentByEncodedUrl(const string& url, string& contentType) const { std::string stubRedirectUrl; - return this->getContentByEncodedUrl(kiwix::urlDecode(url), - content, - title, - contentLength, - contentType, - stubRedirectUrl); + return get_content_by_decoded_url(*this, + kiwix::urlDecode(url), + content, + title, + contentLength, + contentType, + stubRedirectUrl); } bool Reader::getContentByDecodedUrl(const string& url, @@ -503,12 +614,13 @@ bool Reader::getContentByDecodedUrl(const string& url, string& contentType) const { std::string stubRedirectUrl; - return this->getContentByDecodedUrl(kiwix::urlDecode(url), - content, - title, - contentLength, - contentType, - stubRedirectUrl); + return get_content_by_decoded_url(*this, + url, + content, + title, + contentLength, + contentType, + stubRedirectUrl); } bool Reader::getContentByDecodedUrl(const string& url, @@ -518,63 +630,26 @@ bool Reader::getContentByDecodedUrl(const string& url, string& contentType, string& baseUrl) const { - content = ""; - contentType = ""; - contentLength = 0; - - zim::Article article; - if (!this->getArticleObjectByDecodedUrl(url, article)) { - return false; - } - - /* If redirect */ - unsigned int loopCounter = 0; - while (article.isRedirect() && loopCounter++ < 42) { - article = article.getRedirectArticle(); - } - - if (loopCounter < 42) { - /* Compute base url (might be different from the url if redirects */ - baseUrl - = "/" + std::string(1, article.getNamespace()) + "/" + article.getUrl(); - - /* Get the content mime-type */ - try { - contentType - = string(article.getMimeType().data(), article.getMimeType().size()); - } catch (exception& e) { - cerr << "Unable to get the mimetype for " << baseUrl << ":" << e.what() - << endl; - contentType = "application/octet-stream"; - } - - /* Get the data */ - 
content = string(article.getData().data(), article.getArticleSize()); - title = article.getTitle(); - } - - /* Try to set a stub HTML header/footer if necesssary */ - if (contentType.find("text/html") != string::npos - && content.find("" + - content + ""; - } - - /* Get the data length */ - contentLength = article.getArticleSize(); - - return true; + return get_content_by_decoded_url(*this, + url, + content, + title, + contentLength, + contentType, + baseUrl); } /* Check if an article exists */ bool Reader::urlExists(const string& url) const +{ + return pathExists(url); +} + +bool Reader::pathExists(const string& path) const { char ns = 0; string titleStr; - this->parseUrl(url, &ns, titleStr); + _parseUrl(path, &ns, titleStr); titleStr = "/" + titleStr; zim::File::const_iterator findItr = zimFileHandler->find(ns, titleStr); return findItr != zimFileHandler->end() && findItr->getUrl() == titleStr; @@ -583,7 +658,7 @@ bool Reader::urlExists(const string& url) const /* Does the ZIM file has a fulltext index */ bool Reader::hasFulltextIndex() const { - return ( this->urlExists("/Z/fulltextIndex/xapian") + return ( this->pathExists("/Z/fulltextIndex/xapian") && !zimFileHandler->is_multiPart() ); } diff --git a/src/xapianSearcher.cpp b/src/xapianSearcher.cpp index a281bcb2c..e3102b49d 100644 --- a/src/xapianSearcher.cpp +++ b/src/xapianSearcher.cpp @@ -193,13 +193,8 @@ std::string XapianResult::get_content() if (!searcher->reader) { return ""; } - std::string content; - std::string title; - unsigned int contentLength; - std::string contentType; - searcher->reader->getContentByUrl( - get_url(), content, title, contentLength, contentType); - return content; + auto entry = searcher->reader->getEntryFromEncodedPath(get_url()); + return entry.getContent(); } int XapianResult::get_size() From 1dd828e79cb790b9d3b9d70fb400410a3332abc5 Mon Sep 17 00:00:00 2001 From: Matthieu Gautier Date: Wed, 18 Apr 2018 15:39:08 +0200 Subject: [PATCH 3/4] Fix pathExists and check for correct 
path for xapian index. The correct path for the xapian database should be "X/fulltext/xapian", not "Z//fulltextIndex/xapian". So let's check for the right path and fall back to the wrong one (which is still used in old zims). The double '/' in the path is a bug in zimwriterfs and is specific to the xapian database. We must handle this correctly in `hasFulltextIndex` and not (buggily) in `pathExists`. (Hopefully, it seems that pathExists was used only by hasFulltextIndex) --- src/reader.cpp | 15 ++++++++++++--- 1 file changed, 12 insertions(+), 3 deletions(-) diff --git a/src/reader.cpp b/src/reader.cpp index 0103366be..ccbede73c 100644 --- a/src/reader.cpp +++ b/src/reader.cpp @@ -647,10 +647,14 @@ bool Reader::urlExists(const string& url) const bool Reader::pathExists(const string& path) const { + if (!zimFileHandler) + { + return false; + } + char ns = 0; string titleStr; _parseUrl(path, &ns, titleStr); - titleStr = "/" + titleStr; zim::File::const_iterator findItr = zimFileHandler->find(ns, titleStr); return findItr != zimFileHandler->end() && findItr->getUrl() == titleStr; } @@ -658,8 +662,13 @@ bool Reader::pathExists(const string& path) const /* Does the ZIM file has a fulltext index */ bool Reader::hasFulltextIndex() const { - return ( this->pathExists("/Z/fulltextIndex/xapian") - && !zimFileHandler->is_multiPart() ); + if (!zimFileHandler || zimFileHandler->is_multiPart() ) + { + return false; + } + + return ( pathExists("Z//fulltextIndex/xapian") + || pathExists("X/fulltext/xapian")); } /* Search titles by prefix */ From 68665693c509ab777266767c0ecec7d104ada219 Mon Sep 17 00:00:00 2001 From: Chris Li Date: Wed, 18 Apr 2018 10:33:35 -0400 Subject: [PATCH 4/4] fixed some typos in the docs string --- include/entry.h | 2 +- include/reader.h | 6 +++--- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/include/entry.h b/include/entry.h index af9067e29..c47d33912 100644 --- a/include/entry.h +++ b/include/entry.h @@ -50,7 +50,7 @@ class Entry /** * Construct an entry
making reference to an zim article. * - * @param article + * @param article a zim::Article object */ Entry(zim::Article article); virtual ~Entry() = default; diff --git a/include/reader.h b/include/reader.h index 52f8c0ec2..4301ea35c 100644 --- a/include/reader.h +++ b/include/reader.h @@ -280,7 +280,7 @@ class Reader * Get the mimetype of a entry specified by a url. * * @param[in] url the url of the entry. - * @param[out] mimetype the mimeType of the entry. + * @param[out] mimeType the mimeType of the entry. * @return True if the mimeType has been found. */ DEPRECATED bool getMimeTypeByUrl(const string& url, string& mimeType) const; @@ -356,7 +356,7 @@ class Reader * `getNextSuggestion` method. * * @param prefix The prefix to search. - * @param suggestionCount How many suggestions to search for. + * @param suggestionsCount How many suggestions to search for. * @param reset If true, remove previous suggestions in the internal vector. * If false, add suggestions to the internal vector * (until internal vector size is suggestionCount (or no more @@ -380,7 +380,7 @@ class Reader * The internal vector will be reset. * * @param prefix The prefix to search for. - * @param suggestionCount How many suggestions to search for. + * @param suggestionsCount How many suggestions to search for. */ bool searchSuggestionsSmart(const string& prefix, unsigned int suggestionsCount);