From 5159d985c608074e1b6c34809bd286c4d829b0f4 Mon Sep 17 00:00:00 2001 From: Matthieu Gautier Date: Mon, 7 Jan 2019 16:47:08 +0100 Subject: [PATCH] Remove support for external index. This feature has been considered obsolete for a while. In fact, it has not been supported since June 2018, as we were compiling xapian without the chert backend support. Assume that we don't support it and remove it from the code. See kiwix/kiwix-tools#245 This is an API break. library.xml files will still work but the indexPath and indexType will be dropped silently from the file. --- README.md | 2 - include/book.h | 7 - include/meson.build | 4 - include/searcher.h | 31 +-- include/xapianSearcher.h | 98 ---------- meson.build | 7 +- src/book.cpp | 20 -- src/libxml_dumper.cpp | 5 - src/meson.build | 6 - src/searcher.cpp | 104 +++-------- src/xapian/htmlparse.cc | 373 ------------------------------------- src/xapian/htmlparse.h | 49 ----- src/xapian/myhtmlparse.cc | 302 ------------------------------ src/xapian/myhtmlparse.h | 66 ------- src/xapian/namedentities.h | 279 --------------------------- src/xapianSearcher.cpp | 231 ----------------------- 16 files changed, 31 insertions(+), 1553 deletions(-) delete mode 100644 include/xapianSearcher.h delete mode 100644 src/xapian/htmlparse.cc delete mode 100644 src/xapian/htmlparse.h delete mode 100644 src/xapian/myhtmlparse.cc delete mode 100644 src/xapian/myhtmlparse.h delete mode 100644 src/xapian/namedentities.h delete mode 100644 src/xapianSearcher.cpp diff --git a/README.md b/README.md index 0cdec90e8..cb4723782 100644 --- a/README.md +++ b/README.md @@ -35,8 +35,6 @@ libraries need to be available: (package libpugixml-dev on Ubuntu) * ctpp2 ........................................ http://ctpp.havoc.ru/ (package libctpp2-dev on Ubuntu) -* Xapian ......................................... https://xapian.org/ -(package libxapian-dev on Ubuntu) * libaria2 .................................. 
https://aria2.github.io/ (no package on Ubuntu) diff --git a/include/book.h b/include/book.h index b48fdde90..836bbbe7e 100644 --- a/include/book.h +++ b/include/book.h @@ -28,7 +28,6 @@ class xml_node; namespace kiwix { -enum supportedIndexType { UNKNOWN, XAPIAN }; class OPDSDumper; class Reader; @@ -52,8 +51,6 @@ class Book const std::string& getId() const { return m_id; } const std::string& getPath() const { return m_path; } bool isPathValid() const { return m_pathValid; } - const std::string& getIndexPath() const { return m_indexPath; } - const supportedIndexType& getIndexType() const { return m_indexType; } const std::string& getTitle() const { return m_title; } const std::string& getDescription() const { return m_description; } const std::string& getLanguage() const { return m_language; } @@ -76,8 +73,6 @@ class Book void setId(const std::string& id) { m_id = id; } void setPath(const std::string& path); void setPathValid(bool valid) { m_pathValid = valid; } - void setIndexPath(const std::string& indexPath); - void setIndexType(supportedIndexType indexType) { m_indexType = indexType;} void setTitle(const std::string& title) { m_title = title; } void setDescription(const std::string& description) { m_description = description; } void setLanguage(const std::string& language) { m_language = language; } @@ -100,8 +95,6 @@ class Book std::string m_downloadId; std::string m_path; bool m_pathValid; - std::string m_indexPath; - supportedIndexType m_indexType; std::string m_title; std::string m_description; std::string m_language; diff --git a/include/meson.build b/include/meson.build index a813dcf9c..a278adf91 100644 --- a/include/meson.build +++ b/include/meson.build @@ -12,10 +12,6 @@ headers = [ 'searcher.h' ] -if xapian_dep.found() - headers += ['xapianSearcher.h'] -endif - install_headers(headers, subdir:'kiwix') install_headers( diff --git a/include/searcher.h b/include/searcher.h index 52b65bb2e..aa3eee966 100644 --- a/include/searcher.h +++ 
b/include/searcher.h @@ -57,24 +57,7 @@ struct SearcherInternal; * The Searcher class is reponsible to do different kind of search using the * fulltext index. * - * Historically, there are two kind of fulltext index : - * - The legacy one, is the external fulltext index. A directory stored outside - * of the zim file. - * - The new one, a embedded fulltext index in the zim file. - * - * Legacy external fulltext index has to be considered as obsolet format with - * less functionnalities: - * - No multi zim search ; - * - No geo_search ; - * - No suggestions search ; - * - * To reflect this, there is two Search creation "API": - * - One for the external fulltext index, using the constructor taking a - * xapianDirectoryPath) ; - * - One for the embedded fulltext index, using a "empty" constructor and the - * `add_reader` method". - * - * On top of that, the Searcher may (if compiled with ctpp2) be used to + * Searcher may (if compiled with ctpp2) be used to * generate a html page for the search result. This use a template that need a * humanReaderName. This feature is only used by kiwix-serve and this should be * move outside of Searcher (and with a better API). If you don't use the html @@ -92,18 +75,6 @@ class Searcher */ Searcher(const string& humanReadableName = ""); - /** - * The constructor for legacy external fulltext index. - * - * @param xapianDirectoryPath The path to the external index directory. - * @param reader The reader associated to the external index. - * It will be used retrive the article content or generate - * the snippet. - * @param humanReadableName The humanReadableName for the zim. 
- */ - Searcher(const string& xapianDirectoryPath, - Reader* reader, - const string& humanReadableName); ~Searcher(); /** diff --git a/include/xapianSearcher.h b/include/xapianSearcher.h deleted file mode 100644 index 8c0cb3a71..000000000 --- a/include/xapianSearcher.h +++ /dev/null @@ -1,98 +0,0 @@ -/* - * Copyright 2011 Emmanuel Engelhart - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation; either version 3 of the License, or - * any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, - * MA 02110-1301, USA. 
- */ - -#ifndef KIWIX_XAPIAN_SEARCHER_H -#define KIWIX_XAPIAN_SEARCHER_H - -#include -#include "reader.h" -#include "searcher.h" - -#include -#include - -using namespace std; - -namespace kiwix -{ -class XapianSearcher; - -class XapianResult : public Result -{ - public: - XapianResult(XapianSearcher* searcher, Xapian::MSetIterator& iterator); - virtual ~XapianResult(){}; - - virtual std::string get_url(); - virtual std::string get_title(); - virtual int get_score(); - virtual std::string get_snippet(); - virtual std::string get_content(); - virtual int get_wordCount(); - virtual int get_size(); - virtual int get_readerIndex() { return 0; }; - - private: - XapianSearcher* searcher; - Xapian::MSetIterator iterator; - Xapian::Document document; -}; - -class NoXapianIndexInZim : public exception -{ - virtual const char* what() const throw() - { - return "There is no fulltext index in the zim file"; - } -}; - -class XapianSearcher -{ - friend class XapianResult; - - public: - XapianSearcher(const string& xapianDirectoryPath, Reader* reader); - virtual ~XapianSearcher(){}; - void searchInIndex(string& search, - const unsigned int resultStart, - const unsigned int resultEnd, - const bool verbose = false); - virtual Result* getNextResult(); - void restart_search(); - - Xapian::MSet results; - - protected: - void closeIndex(); - void openIndex(const string& xapianDirectoryPath); - void setup_queryParser(); - - Reader* reader; - Xapian::Database readableDatabase; - std::string language; - std::string stopwords; - Xapian::QueryParser queryParser; - Xapian::Stem stemmer; - Xapian::SimpleStopper stopper; - Xapian::MSetIterator current_result; - std::map valuesmap; -}; -} - -#endif diff --git a/meson.build b/meson.build index d313623c8..0a3e5e955 100644 --- a/meson.build +++ b/meson.build @@ -84,9 +84,7 @@ else endif endif -xapian_dep = dependency('xapian-core', required:false, static:static_deps) - -all_deps = [thread_dep, libicu_dep, libzim_dep, xapian_dep, pugixml_dep, 
libcurl_dep] +all_deps = [thread_dep, libicu_dep, libzim_dep, pugixml_dep, libcurl_dep] if has_ctpp2_dep all_deps += [ctpp2_dep] endif @@ -110,9 +108,6 @@ subdir('src') subdir('test') pkg_requires = ['libzim', 'icu-i18n', 'pugixml', 'libcurl'] -if xapian_dep.found() - pkg_requires += ['xapian-core'] -endif if has_ctpp2_dep extra_libs += ctpp2_link_args diff --git a/src/book.cpp b/src/book.cpp index 02489ac0d..0eafff34c 100644 --- a/src/book.cpp +++ b/src/book.cpp @@ -60,11 +60,6 @@ bool Book::update(const kiwix::Book& other) m_name = other.m_name; } - if (m_indexPath.empty()) { - m_indexPath = other.m_indexPath; - m_indexType = other.m_indexType; - } - if (m_faviconMimeType.empty()) { m_favicon = other.m_favicon; m_faviconMimeType = other.m_faviconMimeType; @@ -101,14 +96,6 @@ void Book::updateFromXml(const pugi::xml_node& node, const std::string& baseDir) path = computeAbsolutePath(baseDir, path); } m_path = path; - path = ATTR("indexPath"); - if (!path.empty()) { - if (isRelativePath(path)) { - path = computeAbsolutePath(baseDir, path); - } - m_indexPath = path; - m_indexType = XAPIAN; - } m_title = ATTR("title"); m_name = ATTR("name"); m_tags = ATTR("tags"); @@ -194,13 +181,6 @@ void Book::setPath(const std::string& path) : path; } -void Book::setIndexPath(const std::string& indexPath) -{ - m_indexPath = isRelativePath(indexPath) - ? 
computeAbsolutePath(getCurrentDirectory(), indexPath) - : indexPath; -} - const std::string& Book::getFavicon() const { if (m_favicon.empty() && !m_faviconUrl.empty()) { try { diff --git a/src/libxml_dumper.cpp b/src/libxml_dumper.cpp index 96e580de2..84092decd 100644 --- a/src/libxml_dumper.cpp +++ b/src/libxml_dumper.cpp @@ -50,11 +50,6 @@ void LibXMLDumper::handleBook(Book book, pugi::xml_node root_node) { ADD_ATTRIBUTE(entry_node, "path", computeRelativePath(baseDir, book.getPath())); } - if (!book.getIndexPath().empty()) { - ADD_ATTRIBUTE(entry_node, "indexPath", computeRelativePath(baseDir, book.getIndexPath())); - entry_node.append_attribute("indexType") = "xapian"; - } - if (book.getOrigId().empty()) { ADD_ATTR_NOT_EMPTY(entry_node, "title", book.getTitle()); ADD_ATTR_NOT_EMPTY(entry_node, "name", book.getName()); diff --git a/src/meson.build b/src/meson.build index 46f585b1c..e51a9c33e 100644 --- a/src/meson.build +++ b/src/meson.build @@ -17,8 +17,6 @@ kiwix_sources = [ 'common/stringTools.cpp', 'common/networkTools.cpp', 'common/otherTools.cpp', - 'xapian/htmlparse.cc', - 'xapian/myhtmlparse.cc' ] kiwix_sources += lib_resources @@ -28,10 +26,6 @@ else kiwix_sources += 'subprocess_unix.cpp' endif -if xapian_dep.found() - kiwix_sources += ['xapianSearcher.cpp'] -endif - if get_option('android') subdir('android') install_dir = 'kiwix-lib/jniLibs/' + meson.get_cross_property('android_abi') diff --git a/src/searcher.cpp b/src/searcher.cpp index e60205b77..dbd7dd939 100644 --- a/src/searcher.cpp +++ b/src/searcher.cpp @@ -22,7 +22,6 @@ #include "searcher.h" #include "reader.h" -#include "xapianSearcher.h" #include @@ -61,42 +60,18 @@ class _Result : public Result struct SearcherInternal { const zim::Search* _search; - XapianSearcher* _xapianSearcher; zim::Search::iterator current_iterator; - SearcherInternal() : _search(NULL), _xapianSearcher(NULL) {} + SearcherInternal() : _search(NULL) {} ~SearcherInternal() { if (_search != NULL) { delete _search; } - if 
(_xapianSearcher != NULL) { - delete _xapianSearcher; - } } }; /* Constructor */ -Searcher::Searcher(const string& xapianDirectoryPath, - Reader* reader, - const string& humanReadableName) - : internal(new SearcherInternal()), - searchPattern(""), - protocolPrefix("zim://"), - searchProtocolPrefix("search://?"), - resultCountPerPage(0), - estimatedResultCount(0), - resultStart(0), - resultEnd(0), - contentHumanReadableId(humanReadableName) -{ - loadICUExternalTables(); - if (!reader || !reader->hasFulltextIndex()) { - internal->_xapianSearcher = new XapianSearcher(xapianDirectoryPath, reader); - } - this->humanReaderNames.push_back(humanReadableName); -} - Searcher::Searcher(const std::string& humanReadableName) : internal(new SearcherInternal()), searchPattern(""), @@ -160,26 +135,19 @@ void Searcher::search(std::string& search, this->resultStart = resultStart; this->resultEnd = resultEnd; string unaccentedSearch = removeAccents(search); - if (internal->_xapianSearcher) { - internal->_xapianSearcher->searchInIndex( - unaccentedSearch, resultStart, resultEnd, verbose); - this->estimatedResultCount - = internal->_xapianSearcher->results.get_matches_estimated(); - } else { - std::vector zims; - for (auto current = this->readers.begin(); current != this->readers.end(); - current++) { - if ( (*current)->hasFulltextIndex() ) { - zims.push_back((*current)->getZimFileHandler()); - } + std::vector zims; + for (auto current = this->readers.begin(); current != this->readers.end(); + current++) { + if ( (*current)->hasFulltextIndex() ) { + zims.push_back((*current)->getZimFileHandler()); } - zim::Search* search = new zim::Search(zims); - search->set_query(unaccentedSearch); - search->set_range(resultStart, resultEnd); - internal->_search = search; - internal->current_iterator = internal->_search->begin(); - this->estimatedResultCount = internal->_search->get_matches_estimated(); } + zim::Search* search = new zim::Search(zims); + search->set_query(unaccentedSearch); + 
search->set_range(resultStart, resultEnd); + internal->_search = search; + internal->current_iterator = internal->_search->begin(); + this->estimatedResultCount = internal->_search->get_matches_estimated(); } return; @@ -209,10 +177,6 @@ void Searcher::geo_search(float latitude, float longitude, float distance, return; } - if (internal->_xapianSearcher) { - return; - } - /* Avoid big researches */ this->resultCountPerPage = resultEnd - resultStart; if (this->resultCountPerPage > MAX_SEARCH_LEN) { @@ -244,18 +208,14 @@ void Searcher::geo_search(float latitude, float longitude, float distance, void Searcher::restart_search() { - if (internal->_xapianSearcher) { - internal->_xapianSearcher->restart_search(); - } else if (internal->_search) { + if (internal->_search) { internal->current_iterator = internal->_search->begin(); } } Result* Searcher::getNextResult() { - if (internal->_xapianSearcher) { - return internal->_xapianSearcher->getNextResult(); - } else if (internal->_search && + if (internal->_search && internal->current_iterator != internal->_search->end()) { Result* result = new _Result(internal->current_iterator); internal->current_iterator++; @@ -272,37 +232,31 @@ void Searcher::reset() return; } -void Searcher::suggestions(std::string& search, const bool verbose) +void Searcher::suggestions(std::string& searchPattern, const bool verbose) { this->reset(); if (verbose == true) { - cout << "Performing suggestion query `" << search << "`" << endl; + cout << "Performing suggestion query `" << searchPattern << "`" << endl; } - this->searchPattern = search; + this->searchPattern = searchPattern; this->resultStart = 0; this->resultEnd = 10; - string unaccentedSearch = removeAccents(search); + string unaccentedSearch = removeAccents(searchPattern); - if (internal->_xapianSearcher) { - /* [TODO] Suggestion on a external database ? - * We do not support that. 
*/ - this->estimatedResultCount = 0; - } else { - std::vector zims; - for (auto current = this->readers.begin(); current != this->readers.end(); - current++) { - zims.push_back((*current)->getZimFileHandler()); - } - zim::Search* search = new zim::Search(zims); - search->set_query(unaccentedSearch); - search->set_range(resultStart, resultEnd); - search->set_suggestion_mode(true); - internal->_search = search; - internal->current_iterator = internal->_search->begin(); - this->estimatedResultCount = internal->_search->get_matches_estimated(); + std::vector zims; + for (auto current = this->readers.begin(); current != this->readers.end(); + current++) { + zims.push_back((*current)->getZimFileHandler()); } + zim::Search* search = new zim::Search(zims); + search->set_query(unaccentedSearch); + search->set_range(resultStart, resultEnd); + search->set_suggestion_mode(true); + internal->_search = search; + internal->current_iterator = internal->_search->begin(); + this->estimatedResultCount = internal->_search->get_matches_estimated(); } /* Return the result count estimation */ diff --git a/src/xapian/htmlparse.cc b/src/xapian/htmlparse.cc deleted file mode 100644 index 483b03fe2..000000000 --- a/src/xapian/htmlparse.cc +++ /dev/null @@ -1,373 +0,0 @@ -/* htmlparse.cc: simple HTML parser for omega indexer - * - * Copyright 1999,2000,2001 BrightStation PLC - * Copyright 2001 Ananova Ltd - * Copyright 2002,2006,2007,2008 Olly Betts - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License as - * published by the Free Software Foundation; either version 2 of the - * License, or (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. 
- * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 - * USA - */ - -// #include - -#include "htmlparse.h" - -#include - -// #include "utf8convert.h" - -#include - -#include -#include -#include -#include - -using namespace std; - -inline void -lowercase_string(string &str) -{ - for (string::iterator i = str.begin(); i != str.end(); ++i) { - *i = tolower(static_cast(*i)); - } -} - -map HtmlParser::named_ents; - -inline static bool -p_notdigit(char c) -{ - return !isdigit(static_cast(c)); -} - -inline static bool -p_notxdigit(char c) -{ - return !isxdigit(static_cast(c)); -} - -inline static bool -p_notalnum(char c) -{ - return !isalnum(static_cast(c)); -} - -inline static bool -p_notwhitespace(char c) -{ - return !isspace(static_cast(c)); -} - -inline static bool -p_nottag(char c) -{ - return !isalnum(static_cast(c)) && - c != '.' && c != '-' && c != ':'; // ':' for XML namespaces. -} - -inline static bool -p_whitespacegt(char c) -{ - return isspace(static_cast(c)) || c == '>'; -} - -inline static bool -p_whitespaceeqgt(char c) -{ - return isspace(static_cast(c)) || c == '=' || c == '>'; -} - -bool -HtmlParser::get_parameter(const string & param, string & value) -{ - map::const_iterator i = parameters.find(param); - if (i == parameters.end()) return false; - value = i->second; - return true; -} - -HtmlParser::HtmlParser() -{ - static const struct ent { const char *n; unsigned int v; } ents[] = { -#include "namedentities.h" - { NULL, 0 } - }; - if (named_ents.empty()) { - const struct ent *i = ents; - while (i->n) { - named_ents[string(i->n)] = i->v; - ++i; - } - } -} - -void -HtmlParser::decode_entities(string &s) -{ - // We need a const_iterator version of s.end() - otherwise the - // find() and find_if() templates don't work... 
- string::const_iterator amp = s.begin(), s_end = s.end(); - while ((amp = find(amp, s_end, '&')) != s_end) { - unsigned int val = 0; - string::const_iterator end, p = amp + 1; - if (p != s_end && *p == '#') { - p++; - if (p != s_end && (*p == 'x' || *p == 'X')) { - // hex - p++; - end = find_if(p, s_end, p_notxdigit); - sscanf(s.substr(p - s.begin(), end - p).c_str(), "%x", &val); - } else { - // number - end = find_if(p, s_end, p_notdigit); - val = atoi(s.substr(p - s.begin(), end - p).c_str()); - } - } else { - end = find_if(p, s_end, p_notalnum); - string code = s.substr(p - s.begin(), end - p); - map::const_iterator i; - i = named_ents.find(code); - if (i != named_ents.end()) val = i->second; - } - if (end < s_end && *end == ';') end++; - if (val) { - string::size_type amp_pos = amp - s.begin(); - if (val < 0x80) { - s.replace(amp_pos, end - amp, 1u, char(val)); - } else { - // Convert unicode value val to UTF-8. - char seq[4]; - unsigned len = Xapian::Unicode::nonascii_to_utf8(val, seq); - s.replace(amp_pos, end - amp, seq, len); - } - s_end = s.end(); - // We've modified the string, so the iterators are no longer - // valid... - amp = s.begin() + amp_pos + 1; - } else { - amp = end; - } - } -} - -void -HtmlParser::parse_html(const string &body) -{ - in_script = false; - - parameters.clear(); - string::const_iterator start = body.begin(); - - while (true) { - // Skip through until we find an HTML tag, a comment, or the end of - // document. Ignore isolated occurrences of `<' which don't start - // a tag or comment. - string::const_iterator p = start; - while (true) { - p = find(p, body.end(), '<'); - if (p == body.end()) break; - unsigned char ch = *(p + 1); - - // Tag, closing tag, or comment (or SGML declaration). - if ((!in_script && isalpha(ch)) || ch == '/' || ch == '!') break; - - if (ch == '?') { - // PHP code or XML declaration. - // XML declaration is only valid at the start of the first line. - // FIXME: need to deal with BOMs... 
- if (p != body.begin() || body.size() < 20) break; - - // XML declaration looks something like this: - // - if (p[2] != 'x' || p[3] != 'm' || p[4] != 'l') break; - if (strchr(" \t\r\n", p[5]) == NULL) break; - - string::const_iterator decl_end = find(p + 6, body.end(), '?'); - if (decl_end == body.end()) break; - - // Default charset for XML is UTF-8. - charset = "UTF-8"; - - string decl(p + 6, decl_end); - size_t enc = decl.find("encoding"); - if (enc == string::npos) break; - - enc = decl.find_first_not_of(" \t\r\n", enc + 8); - if (enc == string::npos || enc == decl.size()) break; - - if (decl[enc] != '=') break; - - enc = decl.find_first_not_of(" \t\r\n", enc + 1); - if (enc == string::npos || enc == decl.size()) break; - - if (decl[enc] != '"' && decl[enc] != '\'') break; - - char quote = decl[enc++]; - size_t enc_end = decl.find(quote, enc); - - if (enc != string::npos) - charset = decl.substr(enc, enc_end - enc); - - break; - } - p++; - } - - // Process text up to start of tag. - if (p > start) { - string text = body.substr(start - body.begin(), p - start); - // convert_to_utf8(text, charset); - decode_entities(text); - process_text(text); - } - - if (p == body.end()) break; - - start = p + 1; - - if (start == body.end()) break; - - if (*start == '!') { - if (++start == body.end()) break; - if (++start == body.end()) break; - // comment or SGML declaration - if (*(start - 1) == '-' && *start == '-') { - ++start; - string::const_iterator close = find(start, body.end(), '>'); - // An unterminated comment swallows rest of document - // (like Netscape, but unlike MSIE IIRC) - if (close == body.end()) break; - - p = close; - // look for --> - while (p != body.end() && (*(p - 1) != '-' || *(p - 2) != '-')) - p = find(p + 1, body.end(), '>'); - - if (p != body.end()) { - // Check for htdig's "ignore this bit" comments. 
- if (p - start == 15 && string(start, p - 2) == "htdig_noindex") { - string::size_type i; - i = body.find("", p + 1 - body.begin()); - if (i == string::npos) break; - start = body.begin() + i + 21; - continue; - } - // If we found --> skip to there. - start = p; - } else { - // Otherwise skip to the first > we found (as Netscape does). - start = close; - } - } else { - // just an SGML declaration, perhaps giving the DTD - ignore it - start = find(start - 1, body.end(), '>'); - if (start == body.end()) break; - } - ++start; - } else if (*start == '?') { - if (++start == body.end()) break; - // PHP - swallow until ?> or EOF - start = find(start + 1, body.end(), '>'); - - // look for ?> - while (start != body.end() && *(start - 1) != '?') - start = find(start + 1, body.end(), '>'); - - // unterminated PHP swallows rest of document (rather arbitrarily - // but it avoids polluting the database when things go wrong) - if (start != body.end()) ++start; - } else { - // opening or closing tag - int closing = 0; - - if (*start == '/') { - closing = 1; - start = find_if(start + 1, body.end(), p_notwhitespace); - } - - p = start; - start = find_if(start, body.end(), p_nottag); - string tag = body.substr(p - body.begin(), start - p); - // convert tagname to lowercase - lowercase_string(tag); - - if (closing) { - closing_tag(tag); - if (in_script && tag == "script") in_script = false; - - /* ignore any bogus parameters on closing tags */ - p = find(start, body.end(), '>'); - if (p == body.end()) break; - start = p + 1; - } else { - // FIXME: parse parameters lazily. 
- while (start < body.end() && *start != '>') { - string name, value; - - p = find_if(start, body.end(), p_whitespaceeqgt); - - name.assign(body, start - body.begin(), p - start); - - p = find_if(p, body.end(), p_notwhitespace); - - start = p; - if (start != body.end() && *start == '=') { - start = find_if(start + 1, body.end(), p_notwhitespace); - - p = body.end(); - - int quote = *start; - if (quote == '"' || quote == '\'') { - start++; - p = find(start, body.end(), quote); - } - - if (p == body.end()) { - // unquoted or no closing quote - p = find_if(start, body.end(), p_whitespacegt); - } - value.assign(body, start - body.begin(), p - start); - start = find_if(p, body.end(), p_notwhitespace); - - if (!name.empty()) { - // convert parameter name to lowercase - lowercase_string(name); - // in case of multiple entries, use the first - // (as Netscape does) - parameters.insert(make_pair(name, value)); - } - } - } -#if 0 - cout << "<" << tag; - map::const_iterator x; - for (x = parameters.begin(); x != parameters.end(); x++) { - cout << " " << x->first << "=\"" << x->second << "\""; - } - cout << ">\n"; -#endif - opening_tag(tag); - parameters.clear(); - - // In