Merge pull request #190 from kiwix/no_external_index

Remove support for external index.
2019-01-08 16:13:54 +01:00 · 2019-01-08 16:13:54 +01:00 · c73ac9f2cd
parent cb98f11ddc 5159d985c6
commit c73ac9f2cd
16 changed files with 31 additions and 1553 deletions
--- a/README.md
+++ b/README.md
@ -35,8 +35,6 @@ libraries need to be available:
 (package libpugixml-dev on Ubuntu)
 * ctpp2 ........................................ http://ctpp.havoc.ru/
 (package libctpp2-dev on Ubuntu)
 * Xapian ......................................... https://xapian.org/
 (package libxapian-dev on Ubuntu)
 * libaria2 .................................. https://aria2.github.io/
 (no package on Ubuntu)
--- a/include/book.h
+++ b/include/book.h
@ -28,7 +28,6 @@ class xml_node;
 namespace kiwix
 {
 enum supportedIndexType { UNKNOWN, XAPIAN };
 class OPDSDumper;
 class Reader;
@ -52,8 +51,6 @@ class Book
  const std::string& getId() const { return m_id; }
  const std::string& getPath() const { return m_path; }
  bool isPathValid() const { return m_pathValid; }
  const std::string& getIndexPath() const { return m_indexPath; }
  const supportedIndexType& getIndexType() const { return m_indexType; }
  const std::string& getTitle() const { return m_title; }
  const std::string& getDescription() const { return m_description; }
  const std::string& getLanguage() const { return m_language; }
@ -76,8 +73,6 @@ class Book
  void setId(const std::string& id) { m_id = id; }
  void setPath(const std::string& path);
  void setPathValid(bool valid) { m_pathValid = valid; }
  void setIndexPath(const std::string& indexPath);
  void setIndexType(supportedIndexType indexType) { m_indexType = indexType;}
  void setTitle(const std::string& title) { m_title = title; }
  void setDescription(const std::string& description) { m_description = description; }
  void setLanguage(const std::string& language) { m_language = language; }
@ -100,8 +95,6 @@ class Book
  std::string m_downloadId;
  std::string m_path;
  bool m_pathValid;
  std::string m_indexPath;
  supportedIndexType m_indexType;
  std::string m_title;
  std::string m_description;
  std::string m_language;
--- a/include/meson.build
+++ b/include/meson.build
@ -12,10 +12,6 @@ headers = [
  'searcher.h'
 ]
 if xapian_dep.found()
  headers += ['xapianSearcher.h']
 endif
 install_headers(headers, subdir:'kiwix')
 install_headers(
--- a/include/searcher.h
+++ b/include/searcher.h
@ -57,24 +57,7 @@ struct SearcherInternal;
 * The Searcher class is responsible for performing different kinds of search
 * using the fulltext index.
 *
- * Historically, there are two kind of fulltext index :
+ *  Searcher may (if compiled with ctpp2) be used to
 *  - The legacy one, is the external fulltext index. A directory stored outside
 *    of the zim file.
 *  - The new one, a embedded fulltext index in the zim file.
 *
 * The legacy external fulltext index has to be considered an obsolete format
 * with fewer functionalities:
 *  - No multi zim search ;
 *  - No geo_search ;
 *  - No suggestions search ;
 *
 * To reflect this, there are two Search creation "APIs":
 *  - One for the external fulltext index, using the constructor taking a
 *    xapianDirectoryPath) ;
 *  - One for the embedded fulltext index, using a "empty" constructor and the
 *  `add_reader` method".
 *
 *  On top of that, the Searcher may (if compiled with ctpp2) be used to
 *  generate an HTML page for the search result. This uses a template that needs
 *  a humanReaderName. This feature is only used by kiwix-serve and should be
 *  moved outside of Searcher (and given a better API). If you don't use the html
@ -92,18 +75,6 @@ class Searcher
   */
  Searcher(const string& humanReadableName = "");
  /**
   * The constructor for legacy external fulltext index.
   *
   * @param xapianDirectoryPath The path to the external index directory.
   * @param reader The reader associated to the external index.
   *               It will be used to retrieve the article content or generate
   *               the snippet.
   * @param humanReadableName The humanReadableName for the zim.
   */
  Searcher(const string& xapianDirectoryPath,
           Reader* reader,
           const string& humanReadableName);
  ~Searcher();
  /**
--- a/include/xapianSearcher.h
+++ b/include/xapianSearcher.h
@ -1,98 +0,0 @@
 /*
 * Copyright 2011 Emmanuel Engelhart <kelson@kiwix.org>
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU  General Public License as published by
 * the Free Software Foundation; either version 3 of the License, or
 * any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston,
 * MA 02110-1301, USA.
 */
 #ifndef KIWIX_XAPIAN_SEARCHER_H
 #define KIWIX_XAPIAN_SEARCHER_H
 #include <xapian.h>
 #include "reader.h"
 #include "searcher.h"
 #include <map>
 #include <string>
 using namespace std;
 namespace kiwix
 {
 class XapianSearcher;
 /**
  * A single search hit, implemented on top of a Xapian match-set iterator.
  *
  * Implements the accessors of the Result interface. The accessor bodies are
  * defined out of line and are not visible here.
  */
 class XapianResult : public Result
 {
 public:
  /**
   * @param searcher The XapianSearcher that produced this hit (stored as a raw
   *                 pointer; presumably not owned -- confirm in the .cpp).
   * @param iterator Match-set iterator identifying the hit; a copy is kept.
   */
  XapianResult(XapianSearcher* searcher, Xapian::MSetIterator& iterator);
  virtual ~XapianResult(){};
  virtual std::string get_url();
  virtual std::string get_title();
  virtual int get_score();
  virtual std::string get_snippet();
  virtual std::string get_content();
  virtual int get_wordCount();
  virtual int get_size();
  // Always reports reader index 0.
  virtual int get_readerIndex() { return 0; };
 private:
  XapianSearcher* searcher;
  Xapian::MSetIterator iterator;
  Xapian::Document document;
 };
 /**
  * Exception type whose message states that the zim file has no fulltext
  * index.
  *
  * Note: what() is declared without an access specifier, so it is private in
  * this class; it remains callable through a reference to the base exception.
  */
 class NoXapianIndexInZim : public exception
 {
  virtual const char* what() const throw()
  {
    return "There is no fulltext index in the zim file";
  }
 };
 /**
  * Searcher working on a Xapian index located in a directory.
  *
  * Only the declaration is visible here; the notes below describe the
  * interface and hedge anything that depends on the out-of-line definitions.
  */
 class XapianSearcher
 {
  // XapianResult reads this class's protected state.
  friend class XapianResult;
 public:
  /**
   * @param xapianDirectoryPath Path of the index directory.
   * @param reader              Reader associated with the index (raw pointer;
   *                            presumably not owned -- confirm in the .cpp).
   */
  XapianSearcher(const string& xapianDirectoryPath, Reader* reader);
  virtual ~XapianSearcher(){};
  /**
   * Run a query against the index.
   *
   * @param search      The query text (taken by non-const reference).
   * @param resultStart First result offset requested.
   * @param resultEnd   End result offset requested.
   * @param verbose     Verbosity flag; defaults to false.
   */
  void searchInIndex(string& search,
                     const unsigned int resultStart,
                     const unsigned int resultEnd,
                     const bool verbose = false);
  // Returns a Result pointer for the next hit.
  // NOTE(review): ownership and end-of-results value are not visible here.
  virtual Result* getNextResult();
  void restart_search();
  // Match set of the last query; public, read directly by Searcher.
  Xapian::MSet results;
 protected:
  void closeIndex();
  void openIndex(const string& xapianDirectoryPath);
  void setup_queryParser();
  Reader* reader;
  Xapian::Database readableDatabase;
  std::string language;
  std::string stopwords;
  Xapian::QueryParser queryParser;
  Xapian::Stem stemmer;
  Xapian::SimpleStopper stopper;
  Xapian::MSetIterator current_result;
  std::map<std::string, int> valuesmap;
 };
 }
 #endif
--- a/meson.build
+++ b/meson.build
@ -84,9 +84,7 @@ else
  endif
 endif
-xapian_dep = dependency('xapian-core', required:false, static:static_deps)
+all_deps = [thread_dep, libicu_dep, libzim_dep, pugixml_dep, libcurl_dep]
 all_deps = [thread_dep, libicu_dep, libzim_dep, xapian_dep, pugixml_dep, libcurl_dep]
 if has_ctpp2_dep
  all_deps += [ctpp2_dep]
 endif
@ -110,9 +108,6 @@ subdir('src')
 subdir('test')
 pkg_requires = ['libzim', 'icu-i18n', 'pugixml', 'libcurl']
 if xapian_dep.found()
    pkg_requires += ['xapian-core']
 endif
 if has_ctpp2_dep
  extra_libs += ctpp2_link_args
--- a/src/book.cpp
+++ b/src/book.cpp
@ -60,11 +60,6 @@ bool Book::update(const kiwix::Book& other)
    m_name = other.m_name;
  }
  if (m_indexPath.empty()) {
    m_indexPath = other.m_indexPath;
    m_indexType = other.m_indexType;
  }
  if (m_faviconMimeType.empty()) {
    m_favicon = other.m_favicon;
    m_faviconMimeType = other.m_faviconMimeType;
@ -101,14 +96,6 @@ void Book::updateFromXml(const pugi::xml_node& node, const std::string& baseDir)
    path = computeAbsolutePath(baseDir, path);
  }
  m_path = path;
  path = ATTR("indexPath");
  if (!path.empty()) {
    if (isRelativePath(path)) {
      path = computeAbsolutePath(baseDir, path);
    }
    m_indexPath = path;
    m_indexType = XAPIAN;
  }
  m_title = ATTR("title");
  m_name = ATTR("name");
  m_tags = ATTR("tags");
@ -194,13 +181,6 @@ void Book::setPath(const std::string& path)
   : path;
 }
 void Book::setIndexPath(const std::string& indexPath)
 {
   // A relative path is resolved against getCurrentDirectory(); any other
   // path is stored unchanged.
   if (isRelativePath(indexPath)) {
     m_indexPath = computeAbsolutePath(getCurrentDirectory(), indexPath);
   } else {
     m_indexPath = indexPath;
   }
 }
 const std::string& Book::getFavicon() const {
  if (m_favicon.empty() && !m_faviconUrl.empty()) {
    try {
--- a/src/libxml_dumper.cpp
+++ b/src/libxml_dumper.cpp
@ -50,11 +50,6 @@ void LibXMLDumper::handleBook(Book book, pugi::xml_node root_node) {
    ADD_ATTRIBUTE(entry_node, "path", computeRelativePath(baseDir, book.getPath()));
  }
  if (!book.getIndexPath().empty()) {
    ADD_ATTRIBUTE(entry_node, "indexPath", computeRelativePath(baseDir, book.getIndexPath()));
    entry_node.append_attribute("indexType") = "xapian";
  }
  if (book.getOrigId().empty()) {
    ADD_ATTR_NOT_EMPTY(entry_node, "title", book.getTitle());
    ADD_ATTR_NOT_EMPTY(entry_node, "name", book.getName());
--- a/src/meson.build
+++ b/src/meson.build
@ -17,8 +17,6 @@ kiwix_sources = [
  'common/stringTools.cpp',
  'common/networkTools.cpp',
  'common/otherTools.cpp',
  'xapian/htmlparse.cc',
  'xapian/myhtmlparse.cc'
 ]
 kiwix_sources += lib_resources
@ -28,10 +26,6 @@ else
  kiwix_sources += 'subprocess_unix.cpp'
 endif
 if xapian_dep.found()
  kiwix_sources += ['xapianSearcher.cpp']
 endif
 if get_option('android')
  subdir('android')
  install_dir = 'kiwix-lib/jniLibs/' + meson.get_cross_property('android_abi')
--- a/src/searcher.cpp
+++ b/src/searcher.cpp
@ -22,7 +22,6 @@
 #include "searcher.h"
 #include "reader.h"
 #include "xapianSearcher.h"
 #include <zim/search.h>
@ -61,42 +60,18 @@ class _Result : public Result
 struct SearcherInternal {
  const zim::Search* _search;
  XapianSearcher* _xapianSearcher;
  zim::Search::iterator current_iterator;
-  SearcherInternal() : _search(NULL), _xapianSearcher(NULL) {}
+  SearcherInternal() : _search(NULL) {}
  ~SearcherInternal()
  {
    if (_search != NULL) {
      delete _search;
    }
    if (_xapianSearcher != NULL) {
      delete _xapianSearcher;
    }
  }
 };
 /* Constructor */
 Searcher::Searcher(const string& xapianDirectoryPath,
                    Reader* reader,
                    const string& humanReadableName)
     : internal(new SearcherInternal()),
       searchPattern(""),
       protocolPrefix("zim://"),
       searchProtocolPrefix("search://?"),
       resultCountPerPage(0),
       estimatedResultCount(0),
       resultStart(0),
       resultEnd(0),
       contentHumanReadableId(humanReadableName)
 {
   loadICUExternalTables();
   // The external index is opened only when the reader cannot provide an
   // embedded fulltext index (no reader at all, or a reader without one).
   const bool readerHasEmbeddedIndex
       = (reader != NULL) && reader->hasFulltextIndex();
   if (!readerHasEmbeddedIndex) {
     internal->_xapianSearcher = new XapianSearcher(xapianDirectoryPath, reader);
   }
   humanReaderNames.push_back(humanReadableName);
 }
 Searcher::Searcher(const std::string& humanReadableName)
    : internal(new SearcherInternal()),
      searchPattern(""),
@ -160,12 +135,6 @@ void Searcher::search(std::string& search,
    this->resultStart = resultStart;
    this->resultEnd = resultEnd;
    string unaccentedSearch = removeAccents(search);
    if (internal->_xapianSearcher) {
      internal->_xapianSearcher->searchInIndex(
          unaccentedSearch, resultStart, resultEnd, verbose);
      this->estimatedResultCount
          = internal->_xapianSearcher->results.get_matches_estimated();
    } else {
    std::vector<const zim::File*> zims;
    for (auto current = this->readers.begin(); current != this->readers.end();
         current++) {
@ -180,7 +149,6 @@ void Searcher::search(std::string& search,
    internal->current_iterator = internal->_search->begin();
    this->estimatedResultCount = internal->_search->get_matches_estimated();
  }
  }
  return;
 }
@ -209,10 +177,6 @@ void Searcher::geo_search(float latitude, float longitude, float distance,
    return;
  }
  if (internal->_xapianSearcher) {
    return;
  }
  /* Avoid big researches */
  this->resultCountPerPage = resultEnd - resultStart;
  if (this->resultCountPerPage > MAX_SEARCH_LEN) {
@ -244,18 +208,14 @@ void Searcher::geo_search(float latitude, float longitude, float distance,
 void Searcher::restart_search()
 {
-  if (internal->_xapianSearcher) {
+  if (internal->_search) {
    internal->_xapianSearcher->restart_search();
  } else if (internal->_search) {
    internal->current_iterator = internal->_search->begin();
  }
 }
 Result* Searcher::getNextResult()
 {
-  if (internal->_xapianSearcher) {
+  if (internal->_search &&
    return internal->_xapianSearcher->getNextResult();
  } else if (internal->_search &&
             internal->current_iterator != internal->_search->end()) {
    Result* result = new _Result(internal->current_iterator);
    internal->current_iterator++;
@ -272,24 +232,19 @@ void Searcher::reset()
  return;
 }
-void Searcher::suggestions(std::string& search, const bool verbose)
+void Searcher::suggestions(std::string& searchPattern, const bool verbose)
 {
  this->reset();
  if (verbose == true) {
-    cout << "Performing suggestion query `" << search << "`" << endl;
+    cout << "Performing suggestion query `" << searchPattern << "`" << endl;
  }
-  this->searchPattern = search;
+  this->searchPattern = searchPattern;
  this->resultStart = 0;
  this->resultEnd = 10;
-  string unaccentedSearch = removeAccents(search);
+  string unaccentedSearch = removeAccents(searchPattern);
  if (internal->_xapianSearcher) {
    /* [TODO] Suggestion on a external database ?
     * We do not support that. */
    this->estimatedResultCount = 0;
  } else {
  std::vector<const zim::File*> zims;
  for (auto current = this->readers.begin(); current != this->readers.end();
       current++) {
@ -303,7 +258,6 @@ void Searcher::suggestions(std::string& search, const bool verbose)
  internal->current_iterator = internal->_search->begin();
  this->estimatedResultCount = internal->_search->get_matches_estimated();
 }
 }
 /* Return the result count estimation */
 unsigned int Searcher::getEstimatedResultCount()
--- a/src/xapian/htmlparse.cc
+++ b/src/xapian/htmlparse.cc
@ -1,373 +0,0 @@
 /* htmlparse.cc: simple HTML parser for omega indexer
 *
 * Copyright 1999,2000,2001 BrightStation PLC
 * Copyright 2001 Ananova Ltd
 * Copyright 2002,2006,2007,2008 Olly Betts
 *
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of the GNU General Public License as
 * published by the Free Software Foundation; either version 2 of the
 * License, or (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301
 * USA
 */
 // #include <config.h>
 #include "htmlparse.h"
 #include <xapian.h>
 // #include "utf8convert.h"
 #include <algorithm>
 #include <ctype.h>
 #include <cstring>
 #include <stdio.h>
 #include <stdlib.h>
 using namespace std;
 // Lower-case every char of `str` in place with tolower().
 // Each char goes through unsigned char first, because tolower() must not be
 // handed a negative value.
 // NOTE(review): myhtmlparse.cc carries an identical inline definition of this
 // function; keep the two bodies token-for-token identical.
 inline void
 lowercase_string(string &str)
 {
    for (string::iterator i = str.begin(); i != str.end(); ++i) {
 	*i = tolower(static_cast<unsigned char>(*i));
    }
 }
 map<string, unsigned int> HtmlParser::named_ents;
 // Predicate for find_if(): true when `c` is not a decimal digit.
 inline static bool
 p_notdigit(char c)
 {
     const unsigned char uc = static_cast<unsigned char>(c);
     return isdigit(uc) == 0;
 }
 // Predicate for find_if(): true when `c` is not a hexadecimal digit.
 inline static bool
 p_notxdigit(char c)
 {
     const unsigned char uc = static_cast<unsigned char>(c);
     return isxdigit(uc) == 0;
 }
 // Predicate for find_if(): true when `c` is neither a letter nor a digit.
 inline static bool
 p_notalnum(char c)
 {
     const unsigned char uc = static_cast<unsigned char>(c);
     return isalnum(uc) == 0;
 }
 // Predicate for find_if(): true when `c` is not whitespace per isspace().
 inline static bool
 p_notwhitespace(char c)
 {
     const unsigned char uc = static_cast<unsigned char>(c);
     return isspace(uc) == 0;
 }
 // Predicate for find_if(): true when `c` cannot be part of a tag name.
 // Tag names consist of alphanumerics plus '.', '-' and ':' (the colon is
 // accepted for XML namespaces).
 inline static bool
 p_nottag(char c)
 {
     if (c == '.' || c == '-' || c == ':') return false;
     return isalnum(static_cast<unsigned char>(c)) == 0;
 }
 // Predicate for find_if(): true when `c` is whitespace or '>'.
 inline static bool
 p_whitespacegt(char c)
 {
     if (c == '>') return true;
     return isspace(static_cast<unsigned char>(c)) != 0;
 }
 // Predicate for find_if(): true when `c` is whitespace, '=' or '>'.
 inline static bool
 p_whitespaceeqgt(char c)
 {
     if (c == '=' || c == '>') return true;
     return isspace(static_cast<unsigned char>(c)) != 0;
 }
 bool
 HtmlParser::get_parameter(const string & param, string & value)
 {
    map<string, string>::const_iterator i = parameters.find(param);
    if (i == parameters.end()) return false;
    value = i->second;
    return true;
 }
 // Populate the shared named-entity table the first time a parser is built.
 // The table comes from namedentities.h and is terminated by a NULL name.
 HtmlParser::HtmlParser()
 {
     static const struct ent { const char *n; unsigned int v; } ents[] = {
 #include "namedentities.h"
 	{ NULL, 0 }
     };
     // Already filled by an earlier instance: nothing to do.
     if (!named_ents.empty()) return;
     for (const struct ent *e = ents; e->n; ++e) {
 	named_ents[string(e->n)] = e->v;
     }
 }
 // Replace HTML character references in `s`, in place.
 //
 // Three forms are recognised after '&': "#x"/"#X" + hex digits, "#" + decimal
 // digits, and an alphanumeric name looked up in named_ents.  A trailing ';'
 // is consumed when present but is not required.  References that resolve to
 // 0 (unknown name, no digits, or an explicit zero) are left in the text as
 // they are.  Values below 0x80 become a single char; anything else is
 // encoded through Xapian::Unicode::nonascii_to_utf8().
 void
 HtmlParser::decode_entities(string &s)
 {
    // We need a const_iterator version of s.end() - otherwise the
    // find() and find_if() templates don't work...
    string::const_iterator amp = s.begin(), s_end = s.end();
    while ((amp = find(amp, s_end, '&')) != s_end) {
 	unsigned int val = 0;
 	string::const_iterator end, p = amp + 1;
 	if (p != s_end && *p == '#') {
 	    p++;
 	    if (p != s_end && (*p == 'x' || *p == 'X')) {
 		// hex
 		p++;
 		end = find_if(p, s_end, p_notxdigit);
 		sscanf(s.substr(p - s.begin(), end - p).c_str(), "%x", &val);
 	    } else {
 		// number
 		end = find_if(p, s_end, p_notdigit);
 		val = atoi(s.substr(p - s.begin(), end - p).c_str());
 	    }
 	} else {
 	    // Named entity: val stays 0 when the name is not in the table.
 	    end = find_if(p, s_end, p_notalnum);
 	    string code = s.substr(p - s.begin(), end - p);
 	    map<string, unsigned int>::const_iterator i;
 	    i = named_ents.find(code);
 	    if (i != named_ents.end()) val = i->second;
 	}
 	// The terminating ';' belongs to the reference, so replace it too.
 	if (end < s_end && *end == ';') end++;
 	if (val) {
 	    // Remember the offset: replace() invalidates amp/end/s_end.
 	    string::size_type amp_pos = amp - s.begin();
 	    if (val < 0x80) {
 		s.replace(amp_pos, end - amp, 1u, char(val));
 	    } else {
 		// Convert unicode value val to UTF-8.
 		char seq[4];
 		unsigned len = Xapian::Unicode::nonascii_to_utf8(val, seq);
 		s.replace(amp_pos, end - amp, seq, len);
 	    }
 	    s_end = s.end();
 	    // We've modified the string, so the iterators are no longer
 	    // valid...
 	    // Resume one char past the start of the replacement, so a decoded
 	    // '&' is not decoded a second time.
 	    amp = s.begin() + amp_pos + 1;
 	} else {
 	    // Not a known reference: keep the text and continue after it.
 	    amp = end;
 	}
    }
 }
 // Tokenise the HTML document `body` and report it through the virtual
 // callbacks:
 //   - process_text() for each run of text between tags, after entity
 //     decoding;
 //   - opening_tag() / closing_tag() with the lower-cased tag name.  While
 //     opening_tag() runs, the tag's attributes are available through
 //     get_parameter(); they are cleared right after the callback returns.
 // Comments, SGML declarations and <? ... ?> sections are skipped.  An XML
 // declaration at the very start of the document may set `charset`.
 // Inside <script> only closing tags, comments and <? are recognised, so that
 // expressions such as "a<b" are not taken for tags.
 //
 // The function itself throws nothing, but exceptions thrown by the
 // callbacks propagate to the caller.
 //
 // Fixes relative to the previous version:
 //   - the XML declaration's encoding value is used only when its closing
 //     quote was found (the old test looked at the wrong variable, so an
 //     unterminated value put the rest of the declaration into `charset`);
 //   - a '<' that is the last character of the document, and an attribute '='
 //     followed only by whitespace up to the end of the document, no longer
 //     dereference the end iterator.
 void
 HtmlParser::parse_html(const string &body)
 {
     in_script = false;
     parameters.clear();
     string::const_iterator start = body.begin();
     while (true) {
         // Skip through until we find an HTML tag, a comment, or the end of
         // document.  Ignore isolated occurrences of `<' which don't start
         // a tag or comment.
         string::const_iterator p = start;
         while (true) {
             p = find(p, body.end(), '<');
             if (p == body.end()) break;
             if (p + 1 == body.end()) {
                 // A `<' with nothing after it cannot start a tag: treat it
                 // as trailing text instead of peeking past the end.
                 p = body.end();
                 break;
             }
             unsigned char ch = *(p + 1);
             // Tag, closing tag, or comment (or SGML declaration).
             if ((!in_script && isalpha(ch)) || ch == '/' || ch == '!') break;
             if (ch == '?') {
                 // PHP code or XML declaration.
                 // XML declaration is only valid at the start of the first line.
                 // FIXME: need to deal with BOMs...
                 if (p != body.begin() || body.size() < 20) break;
                 // XML declaration looks something like this:
                 // <?xml version="1.0" encoding="UTF-8"?>
                 if (p[2] != 'x' || p[3] != 'm' || p[4] != 'l') break;
                 if (strchr(" \t\r\n", p[5]) == NULL) break;
                 string::const_iterator decl_end = find(p + 6, body.end(), '?');
                 if (decl_end == body.end()) break;
                 // Default charset for XML is UTF-8.
                 charset = "UTF-8";
                 string decl(p + 6, decl_end);
                 size_t enc = decl.find("encoding");
                 if (enc == string::npos) break;
                 enc = decl.find_first_not_of(" \t\r\n", enc + 8);
                 if (enc == string::npos || enc == decl.size()) break;
                 if (decl[enc] != '=') break;
                 enc = decl.find_first_not_of(" \t\r\n", enc + 1);
                 if (enc == string::npos || enc == decl.size()) break;
                 if (decl[enc] != '"' && decl[enc] != '\'') break;
                 char quote = decl[enc++];
                 size_t enc_end = decl.find(quote, enc);
                 // Only trust the value when its closing quote exists;
                 // otherwise keep the UTF-8 default set above.
                 if (enc_end != string::npos)
                     charset = decl.substr(enc, enc_end - enc);
                 break;
             }
             p++;
         }
         // Process text up to start of tag.
         if (p > start) {
             string text = body.substr(start - body.begin(), p - start);
             // convert_to_utf8(text, charset);
             decode_entities(text);
             process_text(text);
         }
         if (p == body.end()) break;
         start = p + 1;
         if (start == body.end()) break;
         if (*start == '!') {
             if (++start == body.end()) break;
             if (++start == body.end()) break;
             // comment or SGML declaration
             if (*(start - 1) == '-' && *start == '-') {
                 ++start;
                 string::const_iterator close = find(start, body.end(), '>');
                 // An unterminated comment swallows rest of document
                 // (like Netscape, but unlike MSIE IIRC)
                 if (close == body.end()) break;
                 p = close;
                 // look for -->
                 while (p != body.end() && (*(p - 1) != '-' || *(p - 2) != '-'))
                     p = find(p + 1, body.end(), '>');
                 if (p != body.end()) {
                     // Check for htdig's "ignore this bit" comments.
                     if (p - start == 15 && string(start, p - 2) == "htdig_noindex") {
                         string::size_type i;
                         i = body.find("<!--/htdig_noindex-->", p + 1 - body.begin());
                         if (i == string::npos) break;
                         // 21 == length of "<!--/htdig_noindex-->"
                         start = body.begin() + i + 21;
                         continue;
                     }
                     // If we found --> skip to there.
                     start = p;
                 } else {
                     // Otherwise skip to the first > we found (as Netscape does).
                     start = close;
                 }
             } else {
                 // just an SGML declaration, perhaps giving the DTD - ignore it
                 start = find(start - 1, body.end(), '>');
                 if (start == body.end()) break;
             }
             ++start;
         } else if (*start == '?') {
             if (++start == body.end()) break;
             // PHP - swallow until ?> or EOF
             start = find(start + 1, body.end(), '>');
             // look for ?>
             while (start != body.end() && *(start - 1) != '?')
                 start = find(start + 1, body.end(), '>');
             // unterminated PHP swallows rest of document (rather arbitrarily
             // but it avoids polluting the database when things go wrong)
             if (start != body.end()) ++start;
         } else {
             // opening or closing tag
             int closing = 0;
             if (*start == '/') {
                 closing = 1;
                 start = find_if(start + 1, body.end(), p_notwhitespace);
             }
             p = start;
             start = find_if(start, body.end(), p_nottag);
             string tag = body.substr(p - body.begin(), start - p);
             // convert tagname to lowercase
             lowercase_string(tag);
             if (closing) {
                 closing_tag(tag);
                 if (in_script && tag == "script") in_script = false;
                 /* ignore any bogus parameters on closing tags */
                 p = find(start, body.end(), '>');
                 if (p == body.end()) break;
                 start = p + 1;
             } else {
                 // FIXME: parse parameters lazily.
                 while (start < body.end() && *start != '>') {
                     string name, value;
                     p = find_if(start, body.end(), p_whitespaceeqgt);
                     name.assign(body, start - body.begin(), p - start);
                     p = find_if(p, body.end(), p_notwhitespace);
                     start = p;
                     if (start != body.end() && *start == '=') {
                         start = find_if(start + 1, body.end(), p_notwhitespace);
                         p = body.end();
                         // The document may end right after `=': only look at
                         // the next character when there is one.
                         int quote = (start != body.end()) ? *start : 0;
                         if (quote == '"' || quote == '\'') {
                             start++;
                             p = find(start, body.end(), quote);
                         }
                         if (p == body.end()) {
                             // unquoted or no closing quote
                             p = find_if(start, body.end(), p_whitespacegt);
                         }
                         value.assign(body, start - body.begin(), p - start);
                         start = find_if(p, body.end(), p_notwhitespace);
                         if (!name.empty()) {
                             // convert parameter name to lowercase
                             lowercase_string(name);
                             // in case of multiple entries, use the first
                             // (as Netscape does)
                             parameters.insert(make_pair(name, value));
                         }
                     }
                 }
 #if 0
                 cout << "<" << tag;
                 map<string, string>::const_iterator x;
                 for (x = parameters.begin(); x != parameters.end(); x++) {
                     cout << " " << x->first << "=\"" << x->second << "\"";
                 }
                 cout << ">\n";
 #endif
                 opening_tag(tag);
                 parameters.clear();
                 // In <script> tags we ignore opening tags to avoid problems
                 // with "a<b".
                 if (tag == "script") in_script = true;
                 if (start != body.end() && *start == '>') ++start;
             }
         }
     }
 }
--- a/src/xapian/htmlparse.h
+++ b/src/xapian/htmlparse.h
@ -1,49 +0,0 @@
 /* htmlparse.h: simple HTML parser for omega indexer
 *
 * Copyright 1999,2000,2001 BrightStation PLC
 * Copyright 2002,2006,2008 Olly Betts
 *
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of the GNU General Public License as
 * published by the Free Software Foundation; either version 2 of the
 * License, or (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301
 * USA
 */
 #ifndef OMEGA_INCLUDED_HTMLPARSE_H
 #define OMEGA_INCLUDED_HTMLPARSE_H
 #include <string>
 #include <map>
 using std::string;
 using std::map;
 // Minimal event-driven HTML tokeniser.  Subclasses override process_text(),
 // opening_tag() and closing_tag() (no-ops here) and call parse_html().
 class HtmlParser {
 	// Attributes of the tag currently being reported, keyed by name.
 	map<string, string> parameters;
    protected:
 	// Decodes character references in `s` in place.
 	void decode_entities(string &s);
 	bool in_script;
 	string charset;
 	// Entity name -> code point table shared by all instances.
 	static map<string, unsigned int> named_ents;
 	// Fetches attribute `param` into `value`; returns whether it was present.
 	bool get_parameter(const string & param, string & value);
    public:
 	virtual void process_text(const string &/*text*/) { }
 	virtual void opening_tag(const string &/*tag*/) { }
 	virtual void closing_tag(const string &/*tag*/) { }
 	virtual void parse_html(const string &text);
 	HtmlParser();
 	virtual ~HtmlParser() { }
 };
 #endif // OMEGA_INCLUDED_HTMLPARSE_H
--- a/src/xapian/myhtmlparse.cc
+++ b/src/xapian/myhtmlparse.cc
@ -1,302 +0,0 @@
 /* myhtmlparse.cc: subclass of HtmlParser for extracting text.
 *
 * Copyright 1999,2000,2001 BrightStation PLC
 * Copyright 2002,2003,2004,2006,2007,2008 Olly Betts
 *
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of the GNU General Public License as
 * published by the Free Software Foundation; either version 2 of the
 * License, or (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301
 * USA
 */
 // #include <config.h>
 #include "myhtmlparse.h"
 // #include "utf8convert.h"
 #include <ctype.h>
 #include <string.h>
 // Lower-case every char of `str` in place with tolower().
 // Each char goes through unsigned char first, because tolower() must not be
 // handed a negative value.
 // NOTE(review): htmlparse.cc carries an identical inline definition of this
 // function; keep the two bodies token-for-token identical.
 inline void
 lowercase_string(string &str)
 {
    for (string::iterator i = str.begin(); i != str.end(); ++i) {
 	*i = tolower(static_cast<unsigned char>(*i));
    }
 }
 // Parse `text` after recording the charset to assume and whether that
 // charset was itself taken from a <meta> tag.  The flag is read by
 // opening_tag() so that a charset learnt from <meta> does not trigger yet
 // another reparse request.
 void
 MyHtmlParser::parse_html(const string &text, const string &charset_,
 			 bool charset_from_meta_)
 {
     charset_from_meta = charset_from_meta_;
     charset = charset_;
     HtmlParser::parse_html(text);
 }
 void
 MyHtmlParser::process_text(const string &text)
 {
    if (!text.empty() && !in_script_tag && !in_style_tag) {
 	string::size_type b = text.find_first_not_of(WHITESPACE);
 	if (b) pending_space = true;
 	while (b != string::npos) {
 	    if (pending_space && !dump.empty()) dump += ' ';
 	    string::size_type e = text.find_first_of(WHITESPACE, b);
 	    pending_space = (e != string::npos);
 	    if (!pending_space) {
 		dump.append(text.data() + b, text.size() - b);
 		return;
 	    }
 	    dump.append(text.data() + b, e - b);
 	    b = text.find_first_not_of(WHITESPACE, e + 1);
 	}
    }
 }
 // Callback for an opening tag (`tag` is already lower-cased).
 //
 // Most recognised tags only set pending_space, so that the next text run is
 // separated from the previous one.  Special cases:
 //   - <body> discards everything collected in `dump` so far;
 //   - <style> / <script> switch text collection off;
 //   - <meta> feeds `sample` (description), `keywords`, the robots policy and
 //     charset detection.
 //
 // Throws:
 //   - `true` (a bool) when a robots meta tag contains "none" or "noindex";
 //     indexing_allowed is set to false first;
 //   - a string holding the new charset name when a meta tag announces a
 //     charset that differs from the current one and the current one did not
 //     itself come from a meta tag.
 // NOTE(review): the callers that catch these are outside this file.
 void
 MyHtmlParser::opening_tag(const string &tag)
 {
    if (tag.empty()) return;
    // Dispatch on the first letter to keep the string comparisons few.
    switch (tag[0]) {
 	case 'a':
 	    if (tag == "address") pending_space = true;
 	    break;
 	case 'b':
 	    if (tag == "body") {
 		dump.resize(0);
 		break;
 	    }
 	    if (tag == "blockquote" || tag == "br") pending_space = true;
 	    break;
 	case 'c':
 	    if (tag == "center") pending_space = true;
 	    break;
 	case 'd':
 	    if (tag == "dd" || tag == "dir" || tag == "div" || tag == "dl" ||
 		tag == "dt") pending_space = true;
 	    break;
 	case 'e':
 	    if (tag == "embed") pending_space = true;
 	    break;
 	case 'f':
 	    if (tag == "fieldset" || tag == "form") pending_space = true;
 	    break;
 	case 'h':
 	    // hr, and h1, ..., h6
 	    if (tag.length() == 2 && strchr("r123456", tag[1]))
 		pending_space = true;
 	    break;
 	case 'i':
 	    if (tag == "iframe" || tag == "img" || tag == "isindex" ||
 		tag == "input") pending_space = true;
 	    break;
 	case 'k':
 	    if (tag == "keygen") pending_space = true;
 	    break;
 	case 'l':
 	    if (tag == "legend" || tag == "li" || tag == "listing")
 		pending_space = true;
 	    break;
 	case 'm':
 	    if (tag == "meta") {
 		string content;
 		if (get_parameter("content", content)) {
 		    string name;
 		    if (get_parameter("name", name)) {
 			// <meta name="..." content="...">
 			lowercase_string(name);
 			if (name == "description") {
 			    // Only the first description is kept.
 			    if (sample.empty()) {
 				swap(sample, content);
 				// convert_to_utf8(sample, charset);
 				decode_entities(sample);
 			    }
 			} else if (name == "keywords") {
 			    // Keywords accumulate, separated by one space.
 			    if (!keywords.empty()) keywords += ' ';
 			    // convert_to_utf8(content, charset);
 			    decode_entities(content);
 			    keywords += content;
 			} else if (name == "robots") {
 			    decode_entities(content);
 			    lowercase_string(content);
 			    if (content.find("none") != string::npos ||
 				content.find("noindex") != string::npos) {
 				indexing_allowed = false;
 				throw true;
 			    }
 			}
 			break;
 		    }
 		    // If the current charset came from a meta tag, don't
 		    // force reparsing again!
 		    if (charset_from_meta) break;
 		    string hdr;
 		    if (get_parameter("http-equiv", hdr)) {
 			lowercase_string(hdr);
 			if (hdr == "content-type") {
 			    // Extract the value following "charset=".
 			    lowercase_string(content);
 			    size_t start = content.find("charset=");
 			    if (start == string::npos) break;
 			    start += 8;
 			    if (start == content.size()) break;
 			    size_t end = start;
 			    if (content[start] != '"') {
 				// Unquoted: stop at the first control, non-ASCII
 				// or separator character.
 				while (end < content.size()) {
 				    unsigned char ch = content[end];
 				    if (ch <= 32 || ch >= 127 ||
 					strchr(";()<>@,:\\\"/[]?={}", ch))
 					break;
 				    ++end;
 				}
 			    } else {
 				// Quoted: read up to the closing quote, dropping
 				// backslashes that escape the following char.
 				++start;
 				++end;
 				while (end < content.size()) {
 				    unsigned char ch = content[end];
 				    if (ch == '"') break;
 				    if (ch == '\\') content.erase(end, 1);
 				    ++end;
 				}
 			    }
 			    string newcharset(content, start, end - start);
 			    if (charset != newcharset) {
 				throw newcharset;
 			    }
 			}
 		    }
 		    break;
 		}
 		if (charset_from_meta) break;
 		string newcharset;
 		if (get_parameter("charset", newcharset)) {
 		    // HTML5 added: <meta charset="...">
 		    lowercase_string(newcharset);
 		    if (charset != newcharset) {
 			throw newcharset;
 		    }
 		}
 		break;
 	    }
 	    if (tag == "marquee" || tag == "menu" || tag == "multicol")
 		pending_space = true;
 	    break;
 	case 'o':
 	    if (tag == "ol" || tag == "option") pending_space = true;
 	    break;
 	case 'p':
 	    if (tag == "p" || tag == "pre" || tag == "plaintext")
 		pending_space = true;
 	    break;
 	case 'q':
 	    if (tag == "q") pending_space = true;
 	    break;
 	case 's':
 	    if (tag == "style") {
 		in_style_tag = true;
 		break;
 	    }
 	    if (tag == "script") {
 		in_script_tag = true;
 		break;
 	    }
 	    if (tag == "select") pending_space = true;
 	    break;
 	case 't':
 	    if (tag == "table" || tag == "td" || tag == "textarea" ||
 		tag == "th") pending_space = true;
 	    break;
 	case 'u':
 	    if (tag == "ul") pending_space = true;
 	    break;
 	case 'x':
 	    if (tag == "xmp") pending_space = true;
 	    break;
    }
 }
 // Callback for a closing tag (`tag` is already lower-cased).
 //
 // Most recognised tags only set pending_space.  Special cases:
 //   - </body> throws `true` (a bool), which ends the parse;
 //   - </style> / </script> switch text collection back on;
 //   - </title> moves the text collected so far from `dump` into `title`,
 //     unless a title has already been recorded.
 // NOTE(review): the caller that catches the thrown bool is outside this file.
 void
 MyHtmlParser::closing_tag(const string &tag)
 {
    if (tag.empty()) return;
    // Dispatch on the first letter to keep the string comparisons few.
    switch (tag[0]) {
 	case 'a':
 	    if (tag == "address") pending_space = true;
 	    break;
 	case 'b':
 	    if (tag == "body") {
 		throw true;
 	    }
 	    if (tag == "blockquote" || tag == "br") pending_space = true;
 	    break;
 	case 'c':
 	    if (tag == "center") pending_space = true;
 	    break;
 	case 'd':
 	    if (tag == "dd" || tag == "dir" || tag == "div" || tag == "dl" ||
 		tag == "dt") pending_space = true;
 	    break;
 	case 'f':
 	    if (tag == "fieldset" || tag == "form") pending_space = true;
 	    break;
 	case 'h':
 	    // hr, and h1, ..., h6
 	    if (tag.length() == 2 && strchr("r123456", tag[1]))
 		pending_space = true;
 	    break;
 	case 'i':
 	    if (tag == "iframe") pending_space = true;
 	    break;
 	case 'l':
 	    if (tag == "legend" || tag == "li" || tag == "listing")
 		pending_space = true;
 	    break;
 	case 'm':
 	    if (tag == "marquee" || tag == "menu") pending_space = true;
 	    break;
 	case 'o':
 	    if (tag == "ol" || tag == "option") pending_space = true;
 	    break;
 	case 'p':
 	    if (tag == "p" || tag == "pre") pending_space = true;
 	    break;
 	case 'q':
 	    if (tag == "q") pending_space = true;
 	    break;
 	case 's':
 	    if (tag == "style") {
 		in_style_tag = false;
 		break;
 	    }
 	    if (tag == "script") {
 		in_script_tag = false;
 		break;
 	    }
 	    if (tag == "select") pending_space = true;
 	    break;
 	case 't':
 	    if (tag == "title") {
 		// swap() also leaves `dump` empty for the text that follows.
 		if (title.empty()) swap(title, dump);
 		break;
 	    }
 	    if (tag == "table" || tag == "td" || tag == "textarea" ||
 		tag == "th") pending_space = true;
 	    break;
 	case 'u':
 	    if (tag == "ul") pending_space = true;
 	    break;
 	case 'x':
 	    if (tag == "xmp") pending_space = true;
 	    break;
    }
 }
--- a/src/xapian/myhtmlparse.h
+++ b/src/xapian/myhtmlparse.h
@ -1,66 +0,0 @@
 /* myhtmlparse.h: subclass of HtmlParser for extracting text
 *
 * Copyright 1999,2000,2001 BrightStation PLC
 * Copyright 2002,2003,2004,2006,2008 Olly Betts
 *
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of the GNU General Public License as
 * published by the Free Software Foundation; either version 2 of the
 * License, or (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301
 * USA
 */
 #ifndef OMEGA_INCLUDED_MYHTMLPARSE_H
 #define OMEGA_INCLUDED_MYHTMLPARSE_H
 #include "htmlparse.h"
 // FIXME: Should we include \xa0 which is non-breaking space in iso-8859-1, but
 // not in all charsets and perhaps spans of all \xa0 should become a single
 // \xa0?
 #define WHITESPACE " \t\n\r"
 /* HtmlParser subclass that collects the text of an HTML document.
  *
  * All state is public: callers run parse_html() and then read the string
  * members directly. reset() puts the object back into the state produced by
  * the constructor so that one instance can be reused.
  */
 class MyHtmlParser : public HtmlParser {
    public:
 	// Cleared by closing_tag() on </script>.
 	bool in_script_tag;
 	// Cleared by closing_tag() on </style>.
 	bool in_style_tag;
 	// Set by closing_tag() for tags after which a space must be emitted.
 	bool pending_space;
 	// Starts out true. NOTE(review): presumably turned off by a robots
 	// meta tag -- confirm in opening_tag().
 	bool indexing_allowed;
 	// Starts out false. NOTE(review): presumably records that the charset
 	// came from a meta tag -- confirm in parse_html().
 	bool charset_from_meta;
 	// title: receives the content of `dump` on the first </title>.
 	// dump: the extracted text.
 	// sample, keywords: NOTE(review): presumably filled from meta tags
 	// -- confirm in opening_tag().
 	string title, sample, keywords, dump;
 	void process_text(const string &text);
 	void opening_tag(const string &tag);
 	void closing_tag(const string &tag);
         // Keep the base class overloads of parse_html() visible next to the
         // one declared below.
         using HtmlParser::parse_html;
 	void parse_html(const string &text, const string &charset_,
 			bool charset_from_meta_);
 	MyHtmlParser()
 	    : in_script_tag(false), in_style_tag(false), pending_space(false),
 	      indexing_allowed(true), charset_from_meta(false) { }
 	// Restore every member declared in this class to its initial value.
 	void reset() {
 	    in_script_tag = in_style_tag = false;
 	    pending_space = false;
 	    charset_from_meta = false;
 	    indexing_allowed = true;
 	    title.clear();
 	    sample.clear();
 	    keywords.clear();
 	    dump.clear();
 	}
 };
 #endif // OMEGA_INCLUDED_MYHTMLPARSE_H
--- a/src/xapian/namedentities.h
+++ b/src/xapian/namedentities.h
@ -1,279 +0,0 @@
 /* namedentities.h: named HTML entities.
 *
 * Copyright (C) 2006,2007 Olly Betts
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301 USA
 */
 #ifndef OMEGA_INCLUDED_NAMEDENTITIES_H
 #define OMEGA_INCLUDED_NAMEDENTITIES_H
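 // Names and values from: "Character entity references in HTML 4"
 // http://www.w3.org/TR/html4/sgml/entities.html
 //
 // NOTE(review): this file holds only brace-initializer entries of the form
 // { name, code point }, so it is presumably meant to be #included inside an
 // array initializer rather than compiled on its own -- confirm against the
 // including source file.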
 { "quot", 34 },
 { "amp", 38 },
 { "apos", 39 }, // Not in HTML 4 list but used in OpenOffice XML.
 { "lt", 60 },
 { "gt", 62 },
 { "nbsp", 160 },
 { "iexcl", 161 },
 { "cent", 162 },
 { "pound", 163 },
 { "curren", 164 },
 { "yen", 165 },
 { "brvbar", 166 },
 { "sect", 167 },
 { "uml", 168 },
 { "copy", 169 },
 { "ordf", 170 },
 { "laquo", 171 },
 { "not", 172 },
 { "shy", 173 },
 { "reg", 174 },
 { "macr", 175 },
 { "deg", 176 },
 { "plusmn", 177 },
 { "sup2", 178 },
 { "sup3", 179 },
 { "acute", 180 },
 { "micro", 181 },
 { "para", 182 },
 { "middot", 183 },
 { "cedil", 184 },
 { "sup1", 185 },
 { "ordm", 186 },
 { "raquo", 187 },
 { "frac14", 188 },
 { "frac12", 189 },
 { "frac34", 190 },
 { "iquest", 191 },
 { "Agrave", 192 },
 { "Aacute", 193 },
 { "Acirc", 194 },
 { "Atilde", 195 },
 { "Auml", 196 },
 { "Aring", 197 },
 { "AElig", 198 },
 { "Ccedil", 199 },
 { "Egrave", 200 },
 { "Eacute", 201 },
 { "Ecirc", 202 },
 { "Euml", 203 },
 { "Igrave", 204 },
 { "Iacute", 205 },
 { "Icirc", 206 },
 { "Iuml", 207 },
 { "ETH", 208 },
 { "Ntilde", 209 },
 { "Ograve", 210 },
 { "Oacute", 211 },
 { "Ocirc", 212 },
 { "Otilde", 213 },
 { "Ouml", 214 },
 { "times", 215 },
 { "Oslash", 216 },
 { "Ugrave", 217 },
 { "Uacute", 218 },
 { "Ucirc", 219 },
 { "Uuml", 220 },
 { "Yacute", 221 },
 { "THORN", 222 },
 { "szlig", 223 },
 { "agrave", 224 },
 { "aacute", 225 },
 { "acirc", 226 },
 { "atilde", 227 },
 { "auml", 228 },
 { "aring", 229 },
 { "aelig", 230 },
 { "ccedil", 231 },
 { "egrave", 232 },
 { "eacute", 233 },
 { "ecirc", 234 },
 { "euml", 235 },
 { "igrave", 236 },
 { "iacute", 237 },
 { "icirc", 238 },
 { "iuml", 239 },
 { "eth", 240 },
 { "ntilde", 241 },
 { "ograve", 242 },
 { "oacute", 243 },
 { "ocirc", 244 },
 { "otilde", 245 },
 { "ouml", 246 },
 { "divide", 247 },
 { "oslash", 248 },
 { "ugrave", 249 },
 { "uacute", 250 },
 { "ucirc", 251 },
 { "uuml", 252 },
 { "yacute", 253 },
 { "thorn", 254 },
 { "yuml", 255 },
 { "OElig", 338 },
 { "oelig", 339 },
 { "Scaron", 352 },
 { "scaron", 353 },
 { "Yuml", 376 },
 { "fnof", 402 },
 { "circ", 710 },
 { "tilde", 732 },
 { "Alpha", 913 },
 { "Beta", 914 },
 { "Gamma", 915 },
 { "Delta", 916 },
 { "Epsilon", 917 },
 { "Zeta", 918 },
 { "Eta", 919 },
 { "Theta", 920 },
 { "Iota", 921 },
 { "Kappa", 922 },
 { "Lambda", 923 },
 { "Mu", 924 },
 { "Nu", 925 },
 { "Xi", 926 },
 { "Omicron", 927 },
 { "Pi", 928 },
 { "Rho", 929 },
 { "Sigma", 931 },
 { "Tau", 932 },
 { "Upsilon", 933 },
 { "Phi", 934 },
 { "Chi", 935 },
 { "Psi", 936 },
 { "Omega", 937 },
 { "alpha", 945 },
 { "beta", 946 },
 { "gamma", 947 },
 { "delta", 948 },
 { "epsilon", 949 },
 { "zeta", 950 },
 { "eta", 951 },
 { "theta", 952 },
 { "iota", 953 },
 { "kappa", 954 },
 { "lambda", 955 },
 { "mu", 956 },
 { "nu", 957 },
 { "xi", 958 },
 { "omicron", 959 },
 { "pi", 960 },
 { "rho", 961 },
 { "sigmaf", 962 },
 { "sigma", 963 },
 { "tau", 964 },
 { "upsilon", 965 },
 { "phi", 966 },
 { "chi", 967 },
 { "psi", 968 },
 { "omega", 969 },
 { "thetasym", 977 },
 { "upsih", 978 },
 { "piv", 982 },
 { "ensp", 8194 },
 { "emsp", 8195 },
 { "thinsp", 8201 },
 { "zwnj", 8204 },
 { "zwj", 8205 },
 { "lrm", 8206 },
 { "rlm", 8207 },
 { "ndash", 8211 },
 { "mdash", 8212 },
 { "lsquo", 8216 },
 { "rsquo", 8217 },
 { "sbquo", 8218 },
 { "ldquo", 8220 },
 { "rdquo", 8221 },
 { "bdquo", 8222 },
 { "dagger", 8224 },
 { "Dagger", 8225 },
 { "bull", 8226 },
 { "hellip", 8230 },
 { "permil", 8240 },
 { "prime", 8242 },
 { "Prime", 8243 },
 { "lsaquo", 8249 },
 { "rsaquo", 8250 },
 { "oline", 8254 },
 { "frasl", 8260 },
 { "euro", 8364 },
 { "image", 8465 },
 { "weierp", 8472 },
 { "real", 8476 },
 { "trade", 8482 },
 { "alefsym", 8501 },
 { "larr", 8592 },
 { "uarr", 8593 },
 { "rarr", 8594 },
 { "darr", 8595 },
 { "harr", 8596 },
 { "crarr", 8629 },
 { "lArr", 8656 },
 { "uArr", 8657 },
 { "rArr", 8658 },
 { "dArr", 8659 },
 { "hArr", 8660 },
 { "forall", 8704 },
 { "part", 8706 },
 { "exist", 8707 },
 { "empty", 8709 },
 { "nabla", 8711 },
 { "isin", 8712 },
 { "notin", 8713 },
 { "ni", 8715 },
 { "prod", 8719 },
 { "sum", 8721 },
 { "minus", 8722 },
 { "lowast", 8727 },
 { "radic", 8730 },
 { "prop", 8733 },
 { "infin", 8734 },
 { "ang", 8736 },
 { "and", 8743 },
 { "or", 8744 },
 { "cap", 8745 },
 { "cup", 8746 },
 { "int", 8747 },
 { "there4", 8756 },
 { "sim", 8764 },
 { "cong", 8773 },
 { "asymp", 8776 },
 { "ne", 8800 },
 { "equiv", 8801 },
 { "le", 8804 },
 { "ge", 8805 },
 { "sub", 8834 },
 { "sup", 8835 },
 { "nsub", 8836 },
 { "sube", 8838 },
 { "supe", 8839 },
 { "oplus", 8853 },
 { "otimes", 8855 },
 { "perp", 8869 },
 { "sdot", 8901 },
 { "lceil", 8968 },
 { "rceil", 8969 },
 { "lfloor", 8970 },
 { "rfloor", 8971 },
 { "lang", 9001 },
 { "rang", 9002 },
 { "loz", 9674 },
 { "spades", 9824 },
 { "clubs", 9827 },
 { "hearts", 9829 },
 { "diams", 9830 },
 #endif // OMEGA_INCLUDED_NAMEDENTITIES_H
--- a/src/xapianSearcher.cpp
+++ b/src/xapianSearcher.cpp
@ -1,231 +0,0 @@
 /*
 * Copyright 2011 Emmanuel Engelhart <kelson@kiwix.org>
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU  General Public License as published by
 * the Free Software Foundation; either version 3 of the License, or
 * any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston,
 * MA 02110-1301, USA.
 */
 #include "xapianSearcher.h"
 #include <sys/types.h>
 #include <unicode/locid.h>
 #ifndef _WIN32
 #  include <unistd.h>
 #endif
 #include <zim/article.h>
 #include <zim/error.h>
 #include <zim/file.h>
 #include <zim/zim.h>
 #include "xapian/myhtmlparse.h"
 #include <vector>
 namespace kiwix
 {
 /* Parse the "valuesmap" metadata string of an index.
  *
  * The string is cut into entries on ';' and every entry into fields on ':'.
  * The first field is the value name, the second one is converted with atoi()
  * and used as the slot number (for instance "title:0;snippet:1").
  *
  * Entries with fewer than two fields (no ':' or nothing after it) are
  * skipped instead of being read out of bounds. When a name appears several
  * times the first occurrence wins.
  *
  * Returns the name -> slot map; it is empty when `s` holds no valid entry.
  */
 std::map<std::string, int> read_valuesmap(const std::string& s)
 {
   std::map<std::string, int> result;
   const std::vector<std::string> entries = split(s, ";");
   for (const std::string& entry : entries) {
     const std::vector<std::string> fields = split(entry, ":");
     if (fields.size() < 2) {
       /* Malformed entry: there is no slot number to read. */
       continue;
     }
     result.insert(
         std::pair<std::string, int>(fields[0], atoi(fields[1].c_str())));
   }
   return result;
 }
 /* Build a searcher on top of the Xapian database found in
    xapianDirectoryPath. The reader pointer is stored as given; it may be
    null, in which case results cannot return content or generate snippets. */
 XapianSearcher::XapianSearcher(const string& xapianDirectoryPath,
                                Reader* reader)
     : reader(reader)
 {
   openIndex(xapianDirectoryPath);
 }
 /* Open Xapian readable database */
 void XapianSearcher::openIndex(const string& directoryPath)
 {
  this->readableDatabase = Xapian::Database(directoryPath);
  this->valuesmap
      = read_valuesmap(this->readableDatabase.get_metadata("valuesmap"));
  this->language = this->readableDatabase.get_metadata("language");
  this->stopwords = this->readableDatabase.get_metadata("stopwords");
  setup_queryParser();
 }
 /* Counterpart of openIndex(). Intentionally does nothing: this function
    performs no explicit close on the database. */
 void XapianSearcher::closeIndex()
 {
 }
 void XapianSearcher::setup_queryParser()
 {
  queryParser.set_database(readableDatabase);
  if (!language.empty()) {
    /* Build ICU Local object to retrieve ISO-639 language code (from
       ISO-639-3) */
    icu::Locale languageLocale(language.c_str());
    /* Configuring language base steemming */
    try {
      stemmer = Xapian::Stem(languageLocale.getLanguage());
      queryParser.set_stemmer(stemmer);
      queryParser.set_stemming_strategy(Xapian::QueryParser::STEM_ALL);
    } catch (...) {
      std::cout << "No steemming for language '" << languageLocale.getLanguage()
                << "'" << std::endl;
    }
  }
  if (!stopwords.empty()) {
    std::string stopWord;
    std::istringstream file(this->stopwords);
    while (std::getline(file, stopWord, '\n')) {
      this->stopper.add(stopWord);
    }
    queryParser.set_stopper(&(this->stopper));
  }
 }
 /* Run a search and keep the matches ranked in [resultStart, resultEnd).
  *
  * search      : query string, handed to the query parser.
  * resultStart : rank of the first match to retrieve.
  * resultEnd   : rank just past the last match to retrieve. A value that is
  *               not greater than resultStart yields an empty result set.
  * verbose     : not used by this implementation.
  *
  * The matches are stored in `results`, and `current_result` is placed on
  * the first of them, ready for getNextResult().
  */
 void XapianSearcher::searchInIndex(string& search,
                                    const unsigned int resultStart,
                                    const unsigned int resultEnd,
                                    const bool verbose)
 {
   /* Both bounds are unsigned: subtracting a larger start from a smaller end
      would wrap around to a huge item count, so clamp such a range to 0. */
   const unsigned int maxItems
       = resultEnd > resultStart ? resultEnd - resultStart : 0;
   /* Create the query */
   Xapian::Query query = queryParser.parse_query(search);
   /* Create the enquire object */
   Xapian::Enquire enquire(readableDatabase);
   enquire.set_query(query);
   /* Get the results */
   results = enquire.get_mset(resultStart, maxItems);
   current_result = results.begin();
 }
 /* Return the result under the cursor and advance the cursor, or NULL once
    every result has been handed out. The returned object is allocated with
    new and is not retained by this function. */
 Result* XapianSearcher::getNextResult()
 {
   if (current_result == results.end()) {
     return NULL;
   }
   XapianResult* next = new XapianResult(this, current_result);
   ++current_result;
   return next;
 }
 void XapianSearcher::restart_search()
 {
  this->current_result = this->results.begin();
 }
 /* Wrap the match pointed to by `iterator`. The iterator is copied and the
    matching document is fetched right away. */
 XapianResult::XapianResult(XapianSearcher* searcher,
                            Xapian::MSetIterator& iterator)
     : searcher(searcher),
       iterator(iterator),
       document(iterator.get_document())
 {
 }
 /* The url of a result is the data stored in its Xapian document. */
 std::string XapianResult::get_url()
 {
   std::string url = document.get_data();
   return url;
 }
 /* Return the title stored with the document, or "" when the index has a
    valuesmap that does not list a "title" slot. */
 std::string XapianResult::get_title()
 {
   auto& slots = searcher->valuesmap;
   if (slots.empty()) {
     /* This is the old legacy version. Guess and try */
     return document.get_value(0);
   }
   auto slot = slots.find("title");
   if (slot == slots.end()) {
     return "";
   }
   return document.get_value(slot->second);
 }
 /* The score of a result is the percentage reported by the match iterator. */
 int XapianResult::get_score()
 {
   const int percent = iterator.get_percent();
   return percent;
 }
 /* Return a snippet for this result.
  *
  * A snippet stored in the index is preferred (slot 1 for legacy indexes
  * without a valuesmap, the "snippet" slot otherwise). When none is stored
  * and a reader is available, a snippet is generated from the article
  * content. Returns "" when there is no reader or no content.
  */
 std::string XapianResult::get_snippet()
 {
   auto& slots = searcher->valuesmap;
   if (slots.empty()) {
     /* This is the old legacy version. Guess and try */
     std::string stored_snippet = document.get_value(1);
     if (!stored_snippet.empty()) {
       return stored_snippet;
     }
     /* Nothing stored: fall through and try to generate one */
   } else {
     auto slot = slots.find("snippet");
     if (slot != slots.end()) {
       return document.get_value(slot->second);
     }
   }
   /* No reader, no snippet */
   if (!searcher->reader) {
     return "";
   }
   std::string content = get_content();
   if (content.empty()) {
     return content;
   }
   /* Parse the article and build the snippet from the text dump, so that no
      html tags end up in it and the text can be cut at any place. */
   MyHtmlParser htmlParser;
   try {
     htmlParser.parse_html(content, "UTF-8", true);
   } catch (...) {
     /* Deliberately ignored: whatever was dumped before the exception is
        still used below. NOTE(review): the parser throws on </body>, which
        looks like its normal way to stop -- confirm. */
   }
   return searcher->results.snippet(htmlParser.dump, 500);
 }
 /* Return the content of the entry this result points to, looked up through
    the searcher's reader by url. Returns "" when the searcher has no
    reader. */
 std::string XapianResult::get_content()
 {
   if (!searcher->reader) {
     return "";
   }
   return searcher->reader->getEntryFromEncodedPath(get_url()).getContent();
 }
 /* Return the size stored with the document, or -1 when the index does not
    provide one. */
 int XapianResult::get_size()
 {
   auto& slots = searcher->valuesmap;
   if (slots.empty()) {
     /* This is the old legacy version. Guess and try */
     const std::string legacySize = document.get_value(2);
     return legacySize.empty() ? -1 : atoi(legacySize.c_str());
   }
   auto slot = slots.find("size");
   if (slot != slots.end()) {
     return atoi(document.get_value(slot->second).c_str());
   }
   /* The size is never used. Do we really want to get the content and
      calculate the size ? */
   return -1;
 }
 /* Return the word count stored with the document, or -1 when the index
    does not provide one. */
 int XapianResult::get_wordCount()
 {
   auto& slots = searcher->valuesmap;
   if (slots.empty()) {
     /* This is the old legacy version. Guess and try */
     const std::string legacyCount = document.get_value(3);
     return legacyCount.empty() ? -1 : atoi(legacyCount.c_str());
   }
   auto slot = slots.find("wordcount");
   if (slot != slots.end()) {
     return atoi(document.get_value(slot->second).c_str());
   }
   return -1;
 }
 }  // Kiwix namespace