Introduce better API to manipulate entries in a zim file.

The previous API suffered from several problems: - It was difficult to handle articles redirecting to other articles. - It was not possible to get some information (e.g. the title) without getting the whole content. The new API introduces the new class `Entry`, which acts as a proxy to an article in the zim file. Methods of `Reader` now return an `Entry`, and the user has to call `Entry`'s methods to get useful information. Redirections are never followed implicitly; the user has to resolve them explicitly. If an entry is not found, an exception is raised instead of returning an invalid `Entry`. The common pattern to get the content of an entry becomes: ``` std::string content; try { auto entry = reader.getEntryFromPath(path); entry = entry.getFinalEntry(); content = entry.getContent(); } catch (NoEntry& e) { ... } ``` Older methods are kept (with the same behavior) but are marked as deprecated.
2025-06-26 10:11:30 +00:00 · 2018-03-15 15:35:59 +01:00
parent 1f3fcd85a0
commit 135028c16a
8 changed files with 684 additions and 235 deletions
--- a/include/entry.h
+++ b/include/entry.h
@ -0,0 +1,191 @@
+/*
+ * Copyright 2018 Matthieu Gautier <mgautier@kymeria.fr>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU  General Public License as published by
+ * the Free Software Foundation; either version 3 of the License, or
+ * any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston,
+ * MA 02110-1301, USA.
+ */
+
+#ifndef KIWIX_ENTRY_H
+#define KIWIX_ENTRY_H
+
+#include <stdio.h>
+#include <zim/article.h>
+#include <exception>
+#include <string>
+#include "common.h"
+
+using namespace std;
+
+namespace kiwix
+{
+
+/** Exception thrown when a requested entry does not exist. */
+class NoEntry : public std::exception {};
+
+/**
+ * An Entry represents an entry stored in a zim file.
+ */
+class Entry
+{
+  public:
+    /**
+     * Default constructor.
+     *
+     * Construct an invalid entry.
+     */
+    Entry() = default;
+
+    /**
+     * Construct an entry referring to a zim article.
+     *
+     * @param article The zim article this entry refers to.
+     */
+    Entry(zim::Article article);
+    virtual ~Entry() = default;
+
+    /**
+     * Get the path of the entry.
+     *
+     * The path is the "key" of an entry.
+     *
+     * @return the path of the entry.
+     */
+    std::string getPath() const;
+    
+    /**
+     * Get the title of the entry.
+     *
+     * @return the title of the entry.
+     */
+    std::string getTitle() const;
+        
+    /**
+     * Get the content of the entry.
+     *
+     * The returned string is a copy of the content.
+     * If you want to avoid the copy, use getBlob() instead.
+     *
+     * @return the content of the entry.
+     */
+    std::string getContent() const;
+        
+    /**
+     * Get the blob of the entry.
+     *
+     * A blob references the content without copying it.
+     *
+     * @param offset The starting offset of the blob (defaults to 0).
+     * @return the blob of the entry.
+     */
+    zim::Blob   getBlob(offset_type offset = 0) const;
+        
+    /**
+     * Get a blob covering part of the entry.
+     *
+     * A blob references the content without copying it.
+     *
+     * @param offset The starting offset of the blob.
+     * @param size The size of the blob.
+     * @return the blob of the entry.
+     */
+    zim::Blob   getBlob(offset_type offset, size_type size) const;
+        
+    /**
+     * Get the information needed to access the content of the entry directly.
+     *
+     * Some entries (i.e. binary ones) have their content stored plain
+     * in the zim file. Knowing the offset where the content is stored,
+     * a user can read the content directly from the file, bypassing
+     * kiwix-lib/libzim.
+     *
+     * @return A pair specifying where to read the content.
+     *         The string is the real file to read (may differ from the .zim
+     *         file if the zim is cut).
+     *         The offset is the position of the content in that file.
+     *         Return <"",0> if it is not possible to read directly.
+     */
+    std::pair<std::string, offset_type> getDirectAccessInfo() const;
+        
+    /**
+     * Get the size of the entry.
+     *
+     * @return the size of the entry.
+     */
+    size_type   getSize() const;
+
+    /**
+     * Get the mimetype of the entry.
+     *
+     * @return the mimetype of the entry.
+     */
+    std::string getMimetype() const;
+    
+    
+    /**
+     * Tell whether the entry is a redirect entry.
+     *
+     * @return True if the entry is a redirect.
+     */
+    bool isRedirect() const;
+
+    /**
+     * Tell whether the entry is a link target entry.
+     *
+     * @return True if the entry is a link target.
+     */
+    bool isLinkTarget() const;
+
+    /**
+     * Tell whether the entry is a deleted entry.
+     *
+     * @return True if the entry is a deleted entry.
+     */
+    bool isDeleted() const;
+
+    /**
+     * Get the entry pointed to by this entry.
+     *
+     * @return the entry pointed to.
+     * @throw NoEntry if the entry is not a redirect entry.
+     */
+    Entry getRedirectEntry() const;
+
+    /**
+     * Get the final entry pointed to by this entry.
+     *
+     * Follow the redirections until a non-redirecting entry is found.
+     * If the entry is not a redirect entry, return the entry itself.
+     *
+     * @return the final entry.
+     */
+    Entry getFinalEntry() const;
+
+    /**
+     * Convert the entry to a boolean value.
+     *
+     * @return True if the entry is valid (its underlying article is good).
+     */
+    explicit operator bool() const { return good(); }
+
+  private:
+    zim::Article article;  // The zim article this entry refers to.
+    mutable zim::Article final_article;  // NOTE(review): presumably caches the target of getFinalEntry() — confirm in the implementation.
+
+    bool good() const { return article.good(); }  // Valid iff the underlying article is good.
+};
+
+}
+
+#endif // KIWIX_ENTRY_H
--- a/include/meson.build
+++ b/include/meson.build
@ -5,6 +5,7 @@ headers = [
  'opds_dumper.h',
  'downloader.h',
  'reader.h',
+  'entry.h',
  'searcher.h'
 ]

--- a/include/reader.h
+++ b/include/reader.h
@ -29,6 +29,8 @@
 #include <map>
 #include <sstream>
 #include <string>
+#include "common.h"
+#include "entry.h"
 #include "common/pathTools.h"
 #include "common/stringTools.h"

@ -38,7 +40,7 @@ namespace kiwix
 {

 /**
- * The Reader class is the class who allow to get an article content from a zim
+ * The Reader class is the class that allows getting an entry's content from a zim
 * file.
 */
 class Reader
@ -57,11 +59,11 @@ class Reader
  ~Reader();

  /**
-   * Get the number of "displayable" articles in the zim file.
+   * Get the number of "displayable" entries in the zim file.
   *
   * @return If the zim file has a /M/Counter metadata, return the number of
-   *         articles with the 'text/html' MIMEtype specified in the metadata.
-   *         Else return the number of articles in the 'A' namespace.
+   *         entries with the 'text/html' MIMEtype specified in the metadata.
+   *         Else return the number of entries in the 'A' namespace.
   */
  unsigned int getArticleCount() const;

@ -69,16 +71,16 @@ class Reader
   * Get the number of media in the zim file.
   *
   * @return If the zim file has a /M/Counter metadata, return the number of
-   *         articles with the 'image/jpeg', 'image/gif' and 'image/png' in
+   *         entries with the 'image/jpeg', 'image/gif' and 'image/png' in
   *         the metadata.
-   *         Else return the number of articles in the 'I' namespace.
+   *         Else return the number of entries in the 'I' namespace.
   */
  unsigned int getMediaCount() const;

  /**
-   * Get the number of all articles in the zim file.
+   * Get the number of all entries in the zim file.
   *
-   * @return Return the number of all the articles, whatever their MIMEtype or
+   * @return Return the number of all the entries, whatever their MIMEtype or
   *         their namespace.
   */
  unsigned int getGlobalCount() const;
@ -100,25 +102,54 @@ class Reader
  /**
   * Get the url of a random page.
   *
-   * @return Url of a random page. The page is picked from all articles in
+   * Deprecated : Use `getRandomPage` instead.
+   *
+   * @return Url of a random page. The page is picked from all entries in
   *         the 'A' namespace.
   *         The main page is excluded from the potential results.
   */
-  string getRandomPageUrl() const;
+  DEPRECATED string getRandomPageUrl() const;
+
+  /**
+   * Get a random page.
+   *
+   * @return A random Entry. The entry is picked from all entries in
+   *         the 'A' namespace.
+   *         The main entry is excluded from the potential results.
+   */
+  Entry getRandomPage() const;

  /**
   * Get the url of the first page.
   *
-   * @return Url of the first article in the 'A' namespace.
+   * Deprecated : Use `getFirstPage` instead.
+   *
+   * @return Url of the first entry in the 'A' namespace.
   */
-  string getFirstPageUrl() const;
+  DEPRECATED string getFirstPageUrl() const;
+
+  /**
+   * Get the entry of the first page.
+   *
+   * @return The first entry in the 'A' namespace.
+   */
+  Entry getFirstPage() const;

  /**
   * Get the url of the main page.
   *
+   * Deprecated : Use `getMainPage` instead.
+   *
   * @return Url of the main page as specified in the zim file.
   */
-  string getMainPageUrl() const;
+  DEPRECATED string getMainPageUrl() const;
+
+  /**
+   * Get the entry of the main page.
+   *
+   * @return Entry of the main page as specified in the zim file.
+   */
+  Entry getMainPage() const;

  /**
   * Get the content of a metadata.
@ -207,6 +238,35 @@ class Reader
   */
  bool getFavicon(string& content, string& mimeType) const;

+  /**
+   * Get the entry associated with a path.
+   *
+   * @param path The path of the entry.
+   * @return The entry.
+   * @throw NoEntry If no entry corresponds to the path.
+   */
+  Entry getEntryFromPath(const std::string& path) const;
+
+  /**
+   * Get an entry associated to an url encoded path.
+   *
+   * Equivalent to `getEntryFromPath(urlDecode(path));`
+   *
+   * @param path The url encoded path.
+   * @return The entry.
+   * @throw NoEntry If no entry corresponds to the path.
+   */
+  Entry getEntryFromEncodedPath(const std::string& path) const;
+
+  /**
+   * Get the entry associated with a title.
+   *
+   * @param title The title.
+   * @return The entry
+   * @throw NoEntry If no entry corresponds to the title.
+   */
+  Entry getEntryFromTitle(const std::string& title) const;
+
  /**
   * Get the url of a page specified by a title.
   *
@ -214,34 +274,34 @@ class Reader
   * @param[out] url the url of the page.
   * @return True if the page can be found.
   */
-  bool getPageUrlFromTitle(const string& title, string& url) const;
+  DEPRECATED bool getPageUrlFromTitle(const string& title, string& url) const;

  /**
-   * Get the mimetype of a article specified by a url.
+   * Get the mimetype of an entry specified by a url.
   *
-   * @param[in] url the url of the article.
-   * @param[out] mimetype the mimeType of the article.
+   * @param[in] url the url of the entry.
+   * @param[out] mimetype the mimeType of the entry.
   * @return True if the mimeType has been found.
   */
-  bool getMimeTypeByUrl(const string& url, string& mimeType) const;
+  DEPRECATED bool getMimeTypeByUrl(const string& url, string& mimeType) const;

  /**
-   * Get the content of an article specifed by a url.
+   * Get the content of an entry specified by a url.
   *
   * Alias to `getContentByEncodedUrl`
   */
-  bool getContentByUrl(const string& url,
+  DEPRECATED bool getContentByUrl(const string& url,
                       string& content,
                       string& title,
                       unsigned int& contentLength,
                       string& contentType) const;

  /**
-   * Get the content of an article specified by a url encoded url.
+   * Get the content of an entry specified by a url encoded url.
   *
   * Equivalent to getContentByDecodedUrl(urlDecode(url), ...).
   */
-  bool getContentByEncodedUrl(const string& url,
+  DEPRECATED bool getContentByEncodedUrl(const string& url,
                              string& content,
                              string& title,
                              unsigned int& contentLength,
@ -249,48 +309,48 @@ class Reader
                              string& baseUrl) const;

  /**
-   * Get the content of an article specified by an url encoded url.
+   * Get the content of an entry specified by an url encoded url.
   *
   * Equivalent to getContentByEncodedUrl but without baseUrl.
   */
-  bool getContentByEncodedUrl(const string& url,
+  DEPRECATED bool getContentByEncodedUrl(const string& url,
                              string& content,
                              string& title,
                              unsigned int& contentLength,
                              string& contentType) const;

  /**
-   * Get the content of an article specified by a url.
+   * Get the content of an entry specified by a url.
   *
-   * @param[in] url The url of the article.
-   * @param[out] content The content of the article.
-   * @param[out] title the title of the article.
-   * @param[out] contentLength The size of the article (size of content).
-   * @param[out] contentType The mimeType of the article.
-   * @param[out] baseUrl Return the true url of the article.
-   *                     If the specified article is a redirection, contains
-   *                     the url of the targeted article.
-   * @return True if the article has been found.
+   * @param[in] url The url of the entry.
+   * @param[out] content The content of the entry.
+   * @param[out] title the title of the entry.
+   * @param[out] contentLength The size of the entry (size of content).
+   * @param[out] contentType The mimeType of the entry.
+   * @param[out] baseUrl Return the true url of the entry.
+   *                     If the specified entry is a redirection, contains
+   *                     the url of the targeted entry.
+   * @return True if the entry has been found.
   */
-  bool getContentByDecodedUrl(const string& url,
+  DEPRECATED bool getContentByDecodedUrl(const string& url,
                              string& content,
                              string& title,
                              unsigned int& contentLength,
                              string& contentType,
                              string& baseUrl) const;
  /**
-   * Get the content of an article specified by a url.
+   * Get the content of an entry specified by a url.
   *
   * Equivalent to getContentByDecodedUrl but withou the baseUrl.
   */
-  bool getContentByDecodedUrl(const string& url,
+  DEPRECATED bool getContentByDecodedUrl(const string& url,
                              string& content,
                              string& title,
                              unsigned int& contentLength,
                              string& contentType) const;

  /**
-   * Search for articles with title starting with prefix (case sensitive).
+   * Search for entries with title starting with prefix (case sensitive).
   *
   * Suggestions are stored in an internal vector and can be retrieved using
   * `getNextSuggestion` method.
@ -308,7 +368,7 @@ class Reader
                         const bool reset = true);

  /**
-   * Search for articles for the given prefix.
+   * Search for entries for the given prefix.
   *
   * If the zim file has a internal fulltext index, the suggestions will be
   * searched using it.
@ -328,10 +388,20 @@ class Reader
  /**
   * Check if the url exists in the zim file.
   *
+   * Deprecated : Use `pathExists` instead.
+   *
   * @param url the url to check.
   * @return True if the url exits in the zim file.
   */
-  bool urlExists(const string& url) const;
+  DEPRECATED bool urlExists(const string& url) const;
+
+  /**
+   * Check if the path exists in the zim file.
+   *
+   * @param path the path to check.
+   * @return True if the path exists in the zim file.
+   */
+  bool pathExists(const string& path) const;

  /**
   * Check if the zim file has a embedded fulltext index.
@ -388,7 +458,7 @@ class Reader
   * @param[out] title The url (url).
   * @return True
   */
-  bool parseUrl(const string& url, char* ns, string& title) const;
+  DEPRECATED bool parseUrl(const string& url, char* ns, string& title) const;

  /**
   * Return the total size of the zim file.
@ -413,7 +483,7 @@ class Reader
   * @param[out] article The libzim article object.
   * @return True if the url is good (article.good()).
   */
-  bool getArticleObjectByDecodedUrl(const string& url,
+  DEPRECATED bool getArticleObjectByDecodedUrl(const string& url,
                                    zim::Article& article) const;

 protected: