mirror of
https://github.com/kiwix/libkiwix.git
synced 2025-06-26 10:11:30 +00:00
Remove support for external index.
This feature is considered obsolete for a while. In fact, it was already not supported since June 2018 as we were compiling xapian without the chert backend support. Assume that we don't support it and remove it from the code. See kiwix/kiwix-tools#245 This is a API break. library.xml files will still work but the indexPath and indexType will be dropped silently from the file.
This commit is contained in:
20
src/book.cpp
20
src/book.cpp
@ -60,11 +60,6 @@ bool Book::update(const kiwix::Book& other)
|
||||
m_name = other.m_name;
|
||||
}
|
||||
|
||||
if (m_indexPath.empty()) {
|
||||
m_indexPath = other.m_indexPath;
|
||||
m_indexType = other.m_indexType;
|
||||
}
|
||||
|
||||
if (m_faviconMimeType.empty()) {
|
||||
m_favicon = other.m_favicon;
|
||||
m_faviconMimeType = other.m_faviconMimeType;
|
||||
@ -101,14 +96,6 @@ void Book::updateFromXml(const pugi::xml_node& node, const std::string& baseDir)
|
||||
path = computeAbsolutePath(baseDir, path);
|
||||
}
|
||||
m_path = path;
|
||||
path = ATTR("indexPath");
|
||||
if (!path.empty()) {
|
||||
if (isRelativePath(path)) {
|
||||
path = computeAbsolutePath(baseDir, path);
|
||||
}
|
||||
m_indexPath = path;
|
||||
m_indexType = XAPIAN;
|
||||
}
|
||||
m_title = ATTR("title");
|
||||
m_name = ATTR("name");
|
||||
m_tags = ATTR("tags");
|
||||
@ -194,13 +181,6 @@ void Book::setPath(const std::string& path)
|
||||
: path;
|
||||
}
|
||||
|
||||
void Book::setIndexPath(const std::string& indexPath)
|
||||
{
|
||||
m_indexPath = isRelativePath(indexPath)
|
||||
? computeAbsolutePath(getCurrentDirectory(), indexPath)
|
||||
: indexPath;
|
||||
}
|
||||
|
||||
const std::string& Book::getFavicon() const {
|
||||
if (m_favicon.empty() && !m_faviconUrl.empty()) {
|
||||
try {
|
||||
|
@ -50,11 +50,6 @@ void LibXMLDumper::handleBook(Book book, pugi::xml_node root_node) {
|
||||
ADD_ATTRIBUTE(entry_node, "path", computeRelativePath(baseDir, book.getPath()));
|
||||
}
|
||||
|
||||
if (!book.getIndexPath().empty()) {
|
||||
ADD_ATTRIBUTE(entry_node, "indexPath", computeRelativePath(baseDir, book.getIndexPath()));
|
||||
entry_node.append_attribute("indexType") = "xapian";
|
||||
}
|
||||
|
||||
if (book.getOrigId().empty()) {
|
||||
ADD_ATTR_NOT_EMPTY(entry_node, "title", book.getTitle());
|
||||
ADD_ATTR_NOT_EMPTY(entry_node, "name", book.getName());
|
||||
|
@ -17,8 +17,6 @@ kiwix_sources = [
|
||||
'common/stringTools.cpp',
|
||||
'common/networkTools.cpp',
|
||||
'common/otherTools.cpp',
|
||||
'xapian/htmlparse.cc',
|
||||
'xapian/myhtmlparse.cc'
|
||||
]
|
||||
kiwix_sources += lib_resources
|
||||
|
||||
@ -28,10 +26,6 @@ else
|
||||
kiwix_sources += 'subprocess_unix.cpp'
|
||||
endif
|
||||
|
||||
if xapian_dep.found()
|
||||
kiwix_sources += ['xapianSearcher.cpp']
|
||||
endif
|
||||
|
||||
if get_option('android')
|
||||
subdir('android')
|
||||
install_dir = 'kiwix-lib/jniLibs/' + meson.get_cross_property('android_abi')
|
||||
|
104
src/searcher.cpp
104
src/searcher.cpp
@ -22,7 +22,6 @@
|
||||
|
||||
#include "searcher.h"
|
||||
#include "reader.h"
|
||||
#include "xapianSearcher.h"
|
||||
|
||||
#include <zim/search.h>
|
||||
|
||||
@ -61,42 +60,18 @@ class _Result : public Result
|
||||
|
||||
struct SearcherInternal {
|
||||
const zim::Search* _search;
|
||||
XapianSearcher* _xapianSearcher;
|
||||
zim::Search::iterator current_iterator;
|
||||
|
||||
SearcherInternal() : _search(NULL), _xapianSearcher(NULL) {}
|
||||
SearcherInternal() : _search(NULL) {}
|
||||
~SearcherInternal()
|
||||
{
|
||||
if (_search != NULL) {
|
||||
delete _search;
|
||||
}
|
||||
if (_xapianSearcher != NULL) {
|
||||
delete _xapianSearcher;
|
||||
}
|
||||
}
|
||||
};
|
||||
|
||||
/* Constructor */
|
||||
Searcher::Searcher(const string& xapianDirectoryPath,
|
||||
Reader* reader,
|
||||
const string& humanReadableName)
|
||||
: internal(new SearcherInternal()),
|
||||
searchPattern(""),
|
||||
protocolPrefix("zim://"),
|
||||
searchProtocolPrefix("search://?"),
|
||||
resultCountPerPage(0),
|
||||
estimatedResultCount(0),
|
||||
resultStart(0),
|
||||
resultEnd(0),
|
||||
contentHumanReadableId(humanReadableName)
|
||||
{
|
||||
loadICUExternalTables();
|
||||
if (!reader || !reader->hasFulltextIndex()) {
|
||||
internal->_xapianSearcher = new XapianSearcher(xapianDirectoryPath, reader);
|
||||
}
|
||||
this->humanReaderNames.push_back(humanReadableName);
|
||||
}
|
||||
|
||||
Searcher::Searcher(const std::string& humanReadableName)
|
||||
: internal(new SearcherInternal()),
|
||||
searchPattern(""),
|
||||
@ -160,26 +135,19 @@ void Searcher::search(std::string& search,
|
||||
this->resultStart = resultStart;
|
||||
this->resultEnd = resultEnd;
|
||||
string unaccentedSearch = removeAccents(search);
|
||||
if (internal->_xapianSearcher) {
|
||||
internal->_xapianSearcher->searchInIndex(
|
||||
unaccentedSearch, resultStart, resultEnd, verbose);
|
||||
this->estimatedResultCount
|
||||
= internal->_xapianSearcher->results.get_matches_estimated();
|
||||
} else {
|
||||
std::vector<const zim::File*> zims;
|
||||
for (auto current = this->readers.begin(); current != this->readers.end();
|
||||
current++) {
|
||||
if ( (*current)->hasFulltextIndex() ) {
|
||||
zims.push_back((*current)->getZimFileHandler());
|
||||
}
|
||||
std::vector<const zim::File*> zims;
|
||||
for (auto current = this->readers.begin(); current != this->readers.end();
|
||||
current++) {
|
||||
if ( (*current)->hasFulltextIndex() ) {
|
||||
zims.push_back((*current)->getZimFileHandler());
|
||||
}
|
||||
zim::Search* search = new zim::Search(zims);
|
||||
search->set_query(unaccentedSearch);
|
||||
search->set_range(resultStart, resultEnd);
|
||||
internal->_search = search;
|
||||
internal->current_iterator = internal->_search->begin();
|
||||
this->estimatedResultCount = internal->_search->get_matches_estimated();
|
||||
}
|
||||
zim::Search* search = new zim::Search(zims);
|
||||
search->set_query(unaccentedSearch);
|
||||
search->set_range(resultStart, resultEnd);
|
||||
internal->_search = search;
|
||||
internal->current_iterator = internal->_search->begin();
|
||||
this->estimatedResultCount = internal->_search->get_matches_estimated();
|
||||
}
|
||||
|
||||
return;
|
||||
@ -209,10 +177,6 @@ void Searcher::geo_search(float latitude, float longitude, float distance,
|
||||
return;
|
||||
}
|
||||
|
||||
if (internal->_xapianSearcher) {
|
||||
return;
|
||||
}
|
||||
|
||||
/* Avoid big researches */
|
||||
this->resultCountPerPage = resultEnd - resultStart;
|
||||
if (this->resultCountPerPage > MAX_SEARCH_LEN) {
|
||||
@ -244,18 +208,14 @@ void Searcher::geo_search(float latitude, float longitude, float distance,
|
||||
|
||||
void Searcher::restart_search()
|
||||
{
|
||||
if (internal->_xapianSearcher) {
|
||||
internal->_xapianSearcher->restart_search();
|
||||
} else if (internal->_search) {
|
||||
if (internal->_search) {
|
||||
internal->current_iterator = internal->_search->begin();
|
||||
}
|
||||
}
|
||||
|
||||
Result* Searcher::getNextResult()
|
||||
{
|
||||
if (internal->_xapianSearcher) {
|
||||
return internal->_xapianSearcher->getNextResult();
|
||||
} else if (internal->_search &&
|
||||
if (internal->_search &&
|
||||
internal->current_iterator != internal->_search->end()) {
|
||||
Result* result = new _Result(internal->current_iterator);
|
||||
internal->current_iterator++;
|
||||
@ -272,37 +232,31 @@ void Searcher::reset()
|
||||
return;
|
||||
}
|
||||
|
||||
void Searcher::suggestions(std::string& search, const bool verbose)
|
||||
void Searcher::suggestions(std::string& searchPattern, const bool verbose)
|
||||
{
|
||||
this->reset();
|
||||
|
||||
if (verbose == true) {
|
||||
cout << "Performing suggestion query `" << search << "`" << endl;
|
||||
cout << "Performing suggestion query `" << searchPattern << "`" << endl;
|
||||
}
|
||||
|
||||
this->searchPattern = search;
|
||||
this->searchPattern = searchPattern;
|
||||
this->resultStart = 0;
|
||||
this->resultEnd = 10;
|
||||
string unaccentedSearch = removeAccents(search);
|
||||
string unaccentedSearch = removeAccents(searchPattern);
|
||||
|
||||
if (internal->_xapianSearcher) {
|
||||
/* [TODO] Suggestion on a external database ?
|
||||
* We do not support that. */
|
||||
this->estimatedResultCount = 0;
|
||||
} else {
|
||||
std::vector<const zim::File*> zims;
|
||||
for (auto current = this->readers.begin(); current != this->readers.end();
|
||||
current++) {
|
||||
zims.push_back((*current)->getZimFileHandler());
|
||||
}
|
||||
zim::Search* search = new zim::Search(zims);
|
||||
search->set_query(unaccentedSearch);
|
||||
search->set_range(resultStart, resultEnd);
|
||||
search->set_suggestion_mode(true);
|
||||
internal->_search = search;
|
||||
internal->current_iterator = internal->_search->begin();
|
||||
this->estimatedResultCount = internal->_search->get_matches_estimated();
|
||||
std::vector<const zim::File*> zims;
|
||||
for (auto current = this->readers.begin(); current != this->readers.end();
|
||||
current++) {
|
||||
zims.push_back((*current)->getZimFileHandler());
|
||||
}
|
||||
zim::Search* search = new zim::Search(zims);
|
||||
search->set_query(unaccentedSearch);
|
||||
search->set_range(resultStart, resultEnd);
|
||||
search->set_suggestion_mode(true);
|
||||
internal->_search = search;
|
||||
internal->current_iterator = internal->_search->begin();
|
||||
this->estimatedResultCount = internal->_search->get_matches_estimated();
|
||||
}
|
||||
|
||||
/* Return the result count estimation */
|
||||
|
@ -1,373 +0,0 @@
|
||||
/* htmlparse.cc: simple HTML parser for omega indexer
|
||||
*
|
||||
* Copyright 1999,2000,2001 BrightStation PLC
|
||||
* Copyright 2001 Ananova Ltd
|
||||
* Copyright 2002,2006,2007,2008 Olly Betts
|
||||
*
|
||||
* This program is free software; you can redistribute it and/or
|
||||
* modify it under the terms of the GNU General Public License as
|
||||
* published by the Free Software Foundation; either version 2 of the
|
||||
* License, or (at your option) any later version.
|
||||
*
|
||||
* This program is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
* GNU General Public License for more details.
|
||||
*
|
||||
* You should have received a copy of the GNU General Public License
|
||||
* along with this program; if not, write to the Free Software
|
||||
* Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301
|
||||
* USA
|
||||
*/
|
||||
|
||||
// #include <config.h>
|
||||
|
||||
#include "htmlparse.h"
|
||||
|
||||
#include <xapian.h>
|
||||
|
||||
// #include "utf8convert.h"
|
||||
|
||||
#include <algorithm>
|
||||
|
||||
#include <ctype.h>
|
||||
#include <cstring>
|
||||
#include <stdio.h>
|
||||
#include <stdlib.h>
|
||||
|
||||
using namespace std;
|
||||
|
||||
inline void
|
||||
lowercase_string(string &str)
|
||||
{
|
||||
for (string::iterator i = str.begin(); i != str.end(); ++i) {
|
||||
*i = tolower(static_cast<unsigned char>(*i));
|
||||
}
|
||||
}
|
||||
|
||||
map<string, unsigned int> HtmlParser::named_ents;
|
||||
|
||||
inline static bool
|
||||
p_notdigit(char c)
|
||||
{
|
||||
return !isdigit(static_cast<unsigned char>(c));
|
||||
}
|
||||
|
||||
inline static bool
|
||||
p_notxdigit(char c)
|
||||
{
|
||||
return !isxdigit(static_cast<unsigned char>(c));
|
||||
}
|
||||
|
||||
inline static bool
|
||||
p_notalnum(char c)
|
||||
{
|
||||
return !isalnum(static_cast<unsigned char>(c));
|
||||
}
|
||||
|
||||
inline static bool
|
||||
p_notwhitespace(char c)
|
||||
{
|
||||
return !isspace(static_cast<unsigned char>(c));
|
||||
}
|
||||
|
||||
inline static bool
|
||||
p_nottag(char c)
|
||||
{
|
||||
return !isalnum(static_cast<unsigned char>(c)) &&
|
||||
c != '.' && c != '-' && c != ':'; // ':' for XML namespaces.
|
||||
}
|
||||
|
||||
inline static bool
|
||||
p_whitespacegt(char c)
|
||||
{
|
||||
return isspace(static_cast<unsigned char>(c)) || c == '>';
|
||||
}
|
||||
|
||||
inline static bool
|
||||
p_whitespaceeqgt(char c)
|
||||
{
|
||||
return isspace(static_cast<unsigned char>(c)) || c == '=' || c == '>';
|
||||
}
|
||||
|
||||
bool
|
||||
HtmlParser::get_parameter(const string & param, string & value)
|
||||
{
|
||||
map<string, string>::const_iterator i = parameters.find(param);
|
||||
if (i == parameters.end()) return false;
|
||||
value = i->second;
|
||||
return true;
|
||||
}
|
||||
|
||||
HtmlParser::HtmlParser()
|
||||
{
|
||||
static const struct ent { const char *n; unsigned int v; } ents[] = {
|
||||
#include "namedentities.h"
|
||||
{ NULL, 0 }
|
||||
};
|
||||
if (named_ents.empty()) {
|
||||
const struct ent *i = ents;
|
||||
while (i->n) {
|
||||
named_ents[string(i->n)] = i->v;
|
||||
++i;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
void
|
||||
HtmlParser::decode_entities(string &s)
|
||||
{
|
||||
// We need a const_iterator version of s.end() - otherwise the
|
||||
// find() and find_if() templates don't work...
|
||||
string::const_iterator amp = s.begin(), s_end = s.end();
|
||||
while ((amp = find(amp, s_end, '&')) != s_end) {
|
||||
unsigned int val = 0;
|
||||
string::const_iterator end, p = amp + 1;
|
||||
if (p != s_end && *p == '#') {
|
||||
p++;
|
||||
if (p != s_end && (*p == 'x' || *p == 'X')) {
|
||||
// hex
|
||||
p++;
|
||||
end = find_if(p, s_end, p_notxdigit);
|
||||
sscanf(s.substr(p - s.begin(), end - p).c_str(), "%x", &val);
|
||||
} else {
|
||||
// number
|
||||
end = find_if(p, s_end, p_notdigit);
|
||||
val = atoi(s.substr(p - s.begin(), end - p).c_str());
|
||||
}
|
||||
} else {
|
||||
end = find_if(p, s_end, p_notalnum);
|
||||
string code = s.substr(p - s.begin(), end - p);
|
||||
map<string, unsigned int>::const_iterator i;
|
||||
i = named_ents.find(code);
|
||||
if (i != named_ents.end()) val = i->second;
|
||||
}
|
||||
if (end < s_end && *end == ';') end++;
|
||||
if (val) {
|
||||
string::size_type amp_pos = amp - s.begin();
|
||||
if (val < 0x80) {
|
||||
s.replace(amp_pos, end - amp, 1u, char(val));
|
||||
} else {
|
||||
// Convert unicode value val to UTF-8.
|
||||
char seq[4];
|
||||
unsigned len = Xapian::Unicode::nonascii_to_utf8(val, seq);
|
||||
s.replace(amp_pos, end - amp, seq, len);
|
||||
}
|
||||
s_end = s.end();
|
||||
// We've modified the string, so the iterators are no longer
|
||||
// valid...
|
||||
amp = s.begin() + amp_pos + 1;
|
||||
} else {
|
||||
amp = end;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
void
|
||||
HtmlParser::parse_html(const string &body)
|
||||
{
|
||||
in_script = false;
|
||||
|
||||
parameters.clear();
|
||||
string::const_iterator start = body.begin();
|
||||
|
||||
while (true) {
|
||||
// Skip through until we find an HTML tag, a comment, or the end of
|
||||
// document. Ignore isolated occurrences of `<' which don't start
|
||||
// a tag or comment.
|
||||
string::const_iterator p = start;
|
||||
while (true) {
|
||||
p = find(p, body.end(), '<');
|
||||
if (p == body.end()) break;
|
||||
unsigned char ch = *(p + 1);
|
||||
|
||||
// Tag, closing tag, or comment (or SGML declaration).
|
||||
if ((!in_script && isalpha(ch)) || ch == '/' || ch == '!') break;
|
||||
|
||||
if (ch == '?') {
|
||||
// PHP code or XML declaration.
|
||||
// XML declaration is only valid at the start of the first line.
|
||||
// FIXME: need to deal with BOMs...
|
||||
if (p != body.begin() || body.size() < 20) break;
|
||||
|
||||
// XML declaration looks something like this:
|
||||
// <?xml version="1.0" encoding="UTF-8"?>
|
||||
if (p[2] != 'x' || p[3] != 'm' || p[4] != 'l') break;
|
||||
if (strchr(" \t\r\n", p[5]) == NULL) break;
|
||||
|
||||
string::const_iterator decl_end = find(p + 6, body.end(), '?');
|
||||
if (decl_end == body.end()) break;
|
||||
|
||||
// Default charset for XML is UTF-8.
|
||||
charset = "UTF-8";
|
||||
|
||||
string decl(p + 6, decl_end);
|
||||
size_t enc = decl.find("encoding");
|
||||
if (enc == string::npos) break;
|
||||
|
||||
enc = decl.find_first_not_of(" \t\r\n", enc + 8);
|
||||
if (enc == string::npos || enc == decl.size()) break;
|
||||
|
||||
if (decl[enc] != '=') break;
|
||||
|
||||
enc = decl.find_first_not_of(" \t\r\n", enc + 1);
|
||||
if (enc == string::npos || enc == decl.size()) break;
|
||||
|
||||
if (decl[enc] != '"' && decl[enc] != '\'') break;
|
||||
|
||||
char quote = decl[enc++];
|
||||
size_t enc_end = decl.find(quote, enc);
|
||||
|
||||
if (enc != string::npos)
|
||||
charset = decl.substr(enc, enc_end - enc);
|
||||
|
||||
break;
|
||||
}
|
||||
p++;
|
||||
}
|
||||
|
||||
// Process text up to start of tag.
|
||||
if (p > start) {
|
||||
string text = body.substr(start - body.begin(), p - start);
|
||||
// convert_to_utf8(text, charset);
|
||||
decode_entities(text);
|
||||
process_text(text);
|
||||
}
|
||||
|
||||
if (p == body.end()) break;
|
||||
|
||||
start = p + 1;
|
||||
|
||||
if (start == body.end()) break;
|
||||
|
||||
if (*start == '!') {
|
||||
if (++start == body.end()) break;
|
||||
if (++start == body.end()) break;
|
||||
// comment or SGML declaration
|
||||
if (*(start - 1) == '-' && *start == '-') {
|
||||
++start;
|
||||
string::const_iterator close = find(start, body.end(), '>');
|
||||
// An unterminated comment swallows rest of document
|
||||
// (like Netscape, but unlike MSIE IIRC)
|
||||
if (close == body.end()) break;
|
||||
|
||||
p = close;
|
||||
// look for -->
|
||||
while (p != body.end() && (*(p - 1) != '-' || *(p - 2) != '-'))
|
||||
p = find(p + 1, body.end(), '>');
|
||||
|
||||
if (p != body.end()) {
|
||||
// Check for htdig's "ignore this bit" comments.
|
||||
if (p - start == 15 && string(start, p - 2) == "htdig_noindex") {
|
||||
string::size_type i;
|
||||
i = body.find("<!--/htdig_noindex-->", p + 1 - body.begin());
|
||||
if (i == string::npos) break;
|
||||
start = body.begin() + i + 21;
|
||||
continue;
|
||||
}
|
||||
// If we found --> skip to there.
|
||||
start = p;
|
||||
} else {
|
||||
// Otherwise skip to the first > we found (as Netscape does).
|
||||
start = close;
|
||||
}
|
||||
} else {
|
||||
// just an SGML declaration, perhaps giving the DTD - ignore it
|
||||
start = find(start - 1, body.end(), '>');
|
||||
if (start == body.end()) break;
|
||||
}
|
||||
++start;
|
||||
} else if (*start == '?') {
|
||||
if (++start == body.end()) break;
|
||||
// PHP - swallow until ?> or EOF
|
||||
start = find(start + 1, body.end(), '>');
|
||||
|
||||
// look for ?>
|
||||
while (start != body.end() && *(start - 1) != '?')
|
||||
start = find(start + 1, body.end(), '>');
|
||||
|
||||
// unterminated PHP swallows rest of document (rather arbitrarily
|
||||
// but it avoids polluting the database when things go wrong)
|
||||
if (start != body.end()) ++start;
|
||||
} else {
|
||||
// opening or closing tag
|
||||
int closing = 0;
|
||||
|
||||
if (*start == '/') {
|
||||
closing = 1;
|
||||
start = find_if(start + 1, body.end(), p_notwhitespace);
|
||||
}
|
||||
|
||||
p = start;
|
||||
start = find_if(start, body.end(), p_nottag);
|
||||
string tag = body.substr(p - body.begin(), start - p);
|
||||
// convert tagname to lowercase
|
||||
lowercase_string(tag);
|
||||
|
||||
if (closing) {
|
||||
closing_tag(tag);
|
||||
if (in_script && tag == "script") in_script = false;
|
||||
|
||||
/* ignore any bogus parameters on closing tags */
|
||||
p = find(start, body.end(), '>');
|
||||
if (p == body.end()) break;
|
||||
start = p + 1;
|
||||
} else {
|
||||
// FIXME: parse parameters lazily.
|
||||
while (start < body.end() && *start != '>') {
|
||||
string name, value;
|
||||
|
||||
p = find_if(start, body.end(), p_whitespaceeqgt);
|
||||
|
||||
name.assign(body, start - body.begin(), p - start);
|
||||
|
||||
p = find_if(p, body.end(), p_notwhitespace);
|
||||
|
||||
start = p;
|
||||
if (start != body.end() && *start == '=') {
|
||||
start = find_if(start + 1, body.end(), p_notwhitespace);
|
||||
|
||||
p = body.end();
|
||||
|
||||
int quote = *start;
|
||||
if (quote == '"' || quote == '\'') {
|
||||
start++;
|
||||
p = find(start, body.end(), quote);
|
||||
}
|
||||
|
||||
if (p == body.end()) {
|
||||
// unquoted or no closing quote
|
||||
p = find_if(start, body.end(), p_whitespacegt);
|
||||
}
|
||||
value.assign(body, start - body.begin(), p - start);
|
||||
start = find_if(p, body.end(), p_notwhitespace);
|
||||
|
||||
if (!name.empty()) {
|
||||
// convert parameter name to lowercase
|
||||
lowercase_string(name);
|
||||
// in case of multiple entries, use the first
|
||||
// (as Netscape does)
|
||||
parameters.insert(make_pair(name, value));
|
||||
}
|
||||
}
|
||||
}
|
||||
#if 0
|
||||
cout << "<" << tag;
|
||||
map<string, string>::const_iterator x;
|
||||
for (x = parameters.begin(); x != parameters.end(); x++) {
|
||||
cout << " " << x->first << "=\"" << x->second << "\"";
|
||||
}
|
||||
cout << ">\n";
|
||||
#endif
|
||||
opening_tag(tag);
|
||||
parameters.clear();
|
||||
|
||||
// In <script> tags we ignore opening tags to avoid problems
|
||||
// with "a<b".
|
||||
if (tag == "script") in_script = true;
|
||||
|
||||
if (start != body.end() && *start == '>') ++start;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
@ -1,49 +0,0 @@
|
||||
/* htmlparse.h: simple HTML parser for omega indexer
|
||||
*
|
||||
* Copyright 1999,2000,2001 BrightStation PLC
|
||||
* Copyright 2002,2006,2008 Olly Betts
|
||||
*
|
||||
* This program is free software; you can redistribute it and/or
|
||||
* modify it under the terms of the GNU General Public License as
|
||||
* published by the Free Software Foundation; either version 2 of the
|
||||
* License, or (at your option) any later version.
|
||||
*
|
||||
* This program is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
* GNU General Public License for more details.
|
||||
*
|
||||
* You should have received a copy of the GNU General Public License
|
||||
* along with this program; if not, write to the Free Software
|
||||
* Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301
|
||||
* USA
|
||||
*/
|
||||
|
||||
#ifndef OMEGA_INCLUDED_HTMLPARSE_H
|
||||
#define OMEGA_INCLUDED_HTMLPARSE_H
|
||||
|
||||
#include <string>
|
||||
#include <map>
|
||||
|
||||
using std::string;
|
||||
using std::map;
|
||||
|
||||
class HtmlParser {
|
||||
map<string, string> parameters;
|
||||
protected:
|
||||
void decode_entities(string &s);
|
||||
bool in_script;
|
||||
string charset;
|
||||
static map<string, unsigned int> named_ents;
|
||||
|
||||
bool get_parameter(const string & param, string & value);
|
||||
public:
|
||||
virtual void process_text(const string &/*text*/) { }
|
||||
virtual void opening_tag(const string &/*tag*/) { }
|
||||
virtual void closing_tag(const string &/*tag*/) { }
|
||||
virtual void parse_html(const string &text);
|
||||
HtmlParser();
|
||||
virtual ~HtmlParser() { }
|
||||
};
|
||||
|
||||
#endif // OMEGA_INCLUDED_HTMLPARSE_H
|
@ -1,302 +0,0 @@
|
||||
/* myhtmlparse.cc: subclass of HtmlParser for extracting text.
|
||||
*
|
||||
* Copyright 1999,2000,2001 BrightStation PLC
|
||||
* Copyright 2002,2003,2004,2006,2007,2008 Olly Betts
|
||||
*
|
||||
* This program is free software; you can redistribute it and/or
|
||||
* modify it under the terms of the GNU General Public License as
|
||||
* published by the Free Software Foundation; either version 2 of the
|
||||
* License, or (at your option) any later version.
|
||||
*
|
||||
* This program is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
* GNU General Public License for more details.
|
||||
*
|
||||
* You should have received a copy of the GNU General Public License
|
||||
* along with this program; if not, write to the Free Software
|
||||
* Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301
|
||||
* USA
|
||||
*/
|
||||
|
||||
// #include <config.h>
|
||||
|
||||
#include "myhtmlparse.h"
|
||||
|
||||
// #include "utf8convert.h"
|
||||
|
||||
#include <ctype.h>
|
||||
#include <string.h>
|
||||
|
||||
inline void
|
||||
lowercase_string(string &str)
|
||||
{
|
||||
for (string::iterator i = str.begin(); i != str.end(); ++i) {
|
||||
*i = tolower(static_cast<unsigned char>(*i));
|
||||
}
|
||||
}
|
||||
|
||||
void
|
||||
MyHtmlParser::parse_html(const string &text, const string &charset_,
|
||||
bool charset_from_meta_)
|
||||
{
|
||||
charset = charset_;
|
||||
charset_from_meta = charset_from_meta_;
|
||||
HtmlParser::parse_html(text);
|
||||
}
|
||||
|
||||
void
|
||||
MyHtmlParser::process_text(const string &text)
|
||||
{
|
||||
if (!text.empty() && !in_script_tag && !in_style_tag) {
|
||||
string::size_type b = text.find_first_not_of(WHITESPACE);
|
||||
if (b) pending_space = true;
|
||||
while (b != string::npos) {
|
||||
if (pending_space && !dump.empty()) dump += ' ';
|
||||
string::size_type e = text.find_first_of(WHITESPACE, b);
|
||||
pending_space = (e != string::npos);
|
||||
if (!pending_space) {
|
||||
dump.append(text.data() + b, text.size() - b);
|
||||
return;
|
||||
}
|
||||
dump.append(text.data() + b, e - b);
|
||||
b = text.find_first_not_of(WHITESPACE, e + 1);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
void
|
||||
MyHtmlParser::opening_tag(const string &tag)
|
||||
{
|
||||
if (tag.empty()) return;
|
||||
switch (tag[0]) {
|
||||
case 'a':
|
||||
if (tag == "address") pending_space = true;
|
||||
break;
|
||||
case 'b':
|
||||
if (tag == "body") {
|
||||
dump.resize(0);
|
||||
break;
|
||||
}
|
||||
if (tag == "blockquote" || tag == "br") pending_space = true;
|
||||
break;
|
||||
case 'c':
|
||||
if (tag == "center") pending_space = true;
|
||||
break;
|
||||
case 'd':
|
||||
if (tag == "dd" || tag == "dir" || tag == "div" || tag == "dl" ||
|
||||
tag == "dt") pending_space = true;
|
||||
break;
|
||||
case 'e':
|
||||
if (tag == "embed") pending_space = true;
|
||||
break;
|
||||
case 'f':
|
||||
if (tag == "fieldset" || tag == "form") pending_space = true;
|
||||
break;
|
||||
case 'h':
|
||||
// hr, and h1, ..., h6
|
||||
if (tag.length() == 2 && strchr("r123456", tag[1]))
|
||||
pending_space = true;
|
||||
break;
|
||||
case 'i':
|
||||
if (tag == "iframe" || tag == "img" || tag == "isindex" ||
|
||||
tag == "input") pending_space = true;
|
||||
break;
|
||||
case 'k':
|
||||
if (tag == "keygen") pending_space = true;
|
||||
break;
|
||||
case 'l':
|
||||
if (tag == "legend" || tag == "li" || tag == "listing")
|
||||
pending_space = true;
|
||||
break;
|
||||
case 'm':
|
||||
if (tag == "meta") {
|
||||
string content;
|
||||
if (get_parameter("content", content)) {
|
||||
string name;
|
||||
if (get_parameter("name", name)) {
|
||||
lowercase_string(name);
|
||||
if (name == "description") {
|
||||
if (sample.empty()) {
|
||||
swap(sample, content);
|
||||
// convert_to_utf8(sample, charset);
|
||||
decode_entities(sample);
|
||||
}
|
||||
} else if (name == "keywords") {
|
||||
if (!keywords.empty()) keywords += ' ';
|
||||
// convert_to_utf8(content, charset);
|
||||
decode_entities(content);
|
||||
keywords += content;
|
||||
} else if (name == "robots") {
|
||||
decode_entities(content);
|
||||
lowercase_string(content);
|
||||
if (content.find("none") != string::npos ||
|
||||
content.find("noindex") != string::npos) {
|
||||
indexing_allowed = false;
|
||||
throw true;
|
||||
}
|
||||
}
|
||||
break;
|
||||
}
|
||||
// If the current charset came from a meta tag, don't
|
||||
// force reparsing again!
|
||||
if (charset_from_meta) break;
|
||||
string hdr;
|
||||
if (get_parameter("http-equiv", hdr)) {
|
||||
lowercase_string(hdr);
|
||||
if (hdr == "content-type") {
|
||||
lowercase_string(content);
|
||||
size_t start = content.find("charset=");
|
||||
if (start == string::npos) break;
|
||||
start += 8;
|
||||
if (start == content.size()) break;
|
||||
size_t end = start;
|
||||
if (content[start] != '"') {
|
||||
while (end < content.size()) {
|
||||
unsigned char ch = content[end];
|
||||
if (ch <= 32 || ch >= 127 ||
|
||||
strchr(";()<>@,:\\\"/[]?={}", ch))
|
||||
break;
|
||||
++end;
|
||||
}
|
||||
} else {
|
||||
++start;
|
||||
++end;
|
||||
while (end < content.size()) {
|
||||
unsigned char ch = content[end];
|
||||
if (ch == '"') break;
|
||||
if (ch == '\\') content.erase(end, 1);
|
||||
++end;
|
||||
}
|
||||
}
|
||||
string newcharset(content, start, end - start);
|
||||
if (charset != newcharset) {
|
||||
throw newcharset;
|
||||
}
|
||||
}
|
||||
}
|
||||
break;
|
||||
}
|
||||
if (charset_from_meta) break;
|
||||
string newcharset;
|
||||
if (get_parameter("charset", newcharset)) {
|
||||
// HTML5 added: <meta charset="...">
|
||||
lowercase_string(newcharset);
|
||||
if (charset != newcharset) {
|
||||
throw newcharset;
|
||||
}
|
||||
}
|
||||
break;
|
||||
}
|
||||
if (tag == "marquee" || tag == "menu" || tag == "multicol")
|
||||
pending_space = true;
|
||||
break;
|
||||
case 'o':
|
||||
if (tag == "ol" || tag == "option") pending_space = true;
|
||||
break;
|
||||
case 'p':
|
||||
if (tag == "p" || tag == "pre" || tag == "plaintext")
|
||||
pending_space = true;
|
||||
break;
|
||||
case 'q':
|
||||
if (tag == "q") pending_space = true;
|
||||
break;
|
||||
case 's':
|
||||
if (tag == "style") {
|
||||
in_style_tag = true;
|
||||
break;
|
||||
}
|
||||
if (tag == "script") {
|
||||
in_script_tag = true;
|
||||
break;
|
||||
}
|
||||
if (tag == "select") pending_space = true;
|
||||
break;
|
||||
case 't':
|
||||
if (tag == "table" || tag == "td" || tag == "textarea" ||
|
||||
tag == "th") pending_space = true;
|
||||
break;
|
||||
case 'u':
|
||||
if (tag == "ul") pending_space = true;
|
||||
break;
|
||||
case 'x':
|
||||
if (tag == "xmp") pending_space = true;
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
void
|
||||
MyHtmlParser::closing_tag(const string &tag)
|
||||
{
|
||||
if (tag.empty()) return;
|
||||
switch (tag[0]) {
|
||||
case 'a':
|
||||
if (tag == "address") pending_space = true;
|
||||
break;
|
||||
case 'b':
|
||||
if (tag == "body") {
|
||||
throw true;
|
||||
}
|
||||
if (tag == "blockquote" || tag == "br") pending_space = true;
|
||||
break;
|
||||
case 'c':
|
||||
if (tag == "center") pending_space = true;
|
||||
break;
|
||||
case 'd':
|
||||
if (tag == "dd" || tag == "dir" || tag == "div" || tag == "dl" ||
|
||||
tag == "dt") pending_space = true;
|
||||
break;
|
||||
case 'f':
|
||||
if (tag == "fieldset" || tag == "form") pending_space = true;
|
||||
break;
|
||||
case 'h':
|
||||
// hr, and h1, ..., h6
|
||||
if (tag.length() == 2 && strchr("r123456", tag[1]))
|
||||
pending_space = true;
|
||||
break;
|
||||
case 'i':
|
||||
if (tag == "iframe") pending_space = true;
|
||||
break;
|
||||
case 'l':
|
||||
if (tag == "legend" || tag == "li" || tag == "listing")
|
||||
pending_space = true;
|
||||
break;
|
||||
case 'm':
|
||||
if (tag == "marquee" || tag == "menu") pending_space = true;
|
||||
break;
|
||||
case 'o':
|
||||
if (tag == "ol" || tag == "option") pending_space = true;
|
||||
break;
|
||||
case 'p':
|
||||
if (tag == "p" || tag == "pre") pending_space = true;
|
||||
break;
|
||||
case 'q':
|
||||
if (tag == "q") pending_space = true;
|
||||
break;
|
||||
case 's':
|
||||
if (tag == "style") {
|
||||
in_style_tag = false;
|
||||
break;
|
||||
}
|
||||
if (tag == "script") {
|
||||
in_script_tag = false;
|
||||
break;
|
||||
}
|
||||
if (tag == "select") pending_space = true;
|
||||
break;
|
||||
case 't':
|
||||
if (tag == "title") {
|
||||
if (title.empty()) swap(title, dump);
|
||||
break;
|
||||
}
|
||||
if (tag == "table" || tag == "td" || tag == "textarea" ||
|
||||
tag == "th") pending_space = true;
|
||||
break;
|
||||
case 'u':
|
||||
if (tag == "ul") pending_space = true;
|
||||
break;
|
||||
case 'x':
|
||||
if (tag == "xmp") pending_space = true;
|
||||
break;
|
||||
}
|
||||
}
|
@ -1,66 +0,0 @@
|
||||
/* myhtmlparse.h: subclass of HtmlParser for extracting text
|
||||
*
|
||||
* Copyright 1999,2000,2001 BrightStation PLC
|
||||
* Copyright 2002,2003,2004,2006,2008 Olly Betts
|
||||
*
|
||||
* This program is free software; you can redistribute it and/or
|
||||
* modify it under the terms of the GNU General Public License as
|
||||
* published by the Free Software Foundation; either version 2 of the
|
||||
* License, or (at your option) any later version.
|
||||
*
|
||||
* This program is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
* GNU General Public License for more details.
|
||||
*
|
||||
* You should have received a copy of the GNU General Public License
|
||||
* along with this program; if not, write to the Free Software
|
||||
* Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301
|
||||
* USA
|
||||
*/
|
||||
|
||||
#ifndef OMEGA_INCLUDED_MYHTMLPARSE_H
|
||||
#define OMEGA_INCLUDED_MYHTMLPARSE_H
|
||||
|
||||
#include "htmlparse.h"
|
||||
|
||||
// FIXME: Should we include \xa0 which is non-breaking space in iso-8859-1, but
|
||||
// not in all charsets and perhaps spans of all \xa0 should become a single
|
||||
// \xa0?
|
||||
#define WHITESPACE " \t\n\r"
|
||||
|
||||
class MyHtmlParser : public HtmlParser {
|
||||
public:
|
||||
bool in_script_tag;
|
||||
bool in_style_tag;
|
||||
bool pending_space;
|
||||
bool indexing_allowed;
|
||||
bool charset_from_meta;
|
||||
string title, sample, keywords, dump;
|
||||
void process_text(const string &text);
|
||||
void opening_tag(const string &tag);
|
||||
void closing_tag(const string &tag);
|
||||
using HtmlParser::parse_html;
|
||||
void parse_html(const string &text, const string &charset_,
|
||||
bool charset_from_meta_);
|
||||
MyHtmlParser() :
|
||||
in_script_tag(false),
|
||||
in_style_tag(false),
|
||||
pending_space(false),
|
||||
indexing_allowed(true),
|
||||
charset_from_meta(false) { }
|
||||
|
||||
void reset() {
|
||||
in_script_tag = false;
|
||||
in_style_tag = false;
|
||||
pending_space = false;
|
||||
indexing_allowed = true;
|
||||
charset_from_meta = false;
|
||||
title.resize(0);
|
||||
sample.resize(0);
|
||||
keywords.resize(0);
|
||||
dump.resize(0);
|
||||
}
|
||||
};
|
||||
|
||||
#endif // OMEGA_INCLUDED_MYHTMLPARSE_H
|
@ -1,279 +0,0 @@
|
||||
/* namedentities.h: named HTML entities.
|
||||
*
|
||||
* Copyright (C) 2006,2007 Olly Betts
|
||||
*
|
||||
* This program is free software; you can redistribute it and/or modify
|
||||
* it under the terms of the GNU General Public License as published by
|
||||
* the Free Software Foundation; either version 2 of the License, or
|
||||
* (at your option) any later version.
|
||||
*
|
||||
* This program is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
* GNU General Public License for more details.
|
||||
*
|
||||
* You should have received a copy of the GNU General Public License
|
||||
* along with this program; if not, write to the Free Software
|
||||
* Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
|
||||
*/
|
||||
|
||||
#ifndef OMEGA_INCLUDED_NAMEDENTITIES_H
|
||||
#define OMEGA_INCLUDED_NAMEDENTITIES_H
|
||||
|
||||
// Names and values from: "Character entity references in HTML 4"
|
||||
// http://www.w3.org/TR/html4/sgml/entities.html
|
||||
{ "quot", 34 },
|
||||
{ "amp", 38 },
|
||||
{ "apos", 39 }, // Not in HTML 4 list but used in OpenOffice XML.
|
||||
{ "lt", 60 },
|
||||
{ "gt", 62 },
|
||||
{ "nbsp", 160 },
|
||||
{ "iexcl", 161 },
|
||||
{ "cent", 162 },
|
||||
{ "pound", 163 },
|
||||
{ "curren", 164 },
|
||||
{ "yen", 165 },
|
||||
{ "brvbar", 166 },
|
||||
{ "sect", 167 },
|
||||
{ "uml", 168 },
|
||||
{ "copy", 169 },
|
||||
{ "ordf", 170 },
|
||||
{ "laquo", 171 },
|
||||
{ "not", 172 },
|
||||
{ "shy", 173 },
|
||||
{ "reg", 174 },
|
||||
{ "macr", 175 },
|
||||
{ "deg", 176 },
|
||||
{ "plusmn", 177 },
|
||||
{ "sup2", 178 },
|
||||
{ "sup3", 179 },
|
||||
{ "acute", 180 },
|
||||
{ "micro", 181 },
|
||||
{ "para", 182 },
|
||||
{ "middot", 183 },
|
||||
{ "cedil", 184 },
|
||||
{ "sup1", 185 },
|
||||
{ "ordm", 186 },
|
||||
{ "raquo", 187 },
|
||||
{ "frac14", 188 },
|
||||
{ "frac12", 189 },
|
||||
{ "frac34", 190 },
|
||||
{ "iquest", 191 },
|
||||
{ "Agrave", 192 },
|
||||
{ "Aacute", 193 },
|
||||
{ "Acirc", 194 },
|
||||
{ "Atilde", 195 },
|
||||
{ "Auml", 196 },
|
||||
{ "Aring", 197 },
|
||||
{ "AElig", 198 },
|
||||
{ "Ccedil", 199 },
|
||||
{ "Egrave", 200 },
|
||||
{ "Eacute", 201 },
|
||||
{ "Ecirc", 202 },
|
||||
{ "Euml", 203 },
|
||||
{ "Igrave", 204 },
|
||||
{ "Iacute", 205 },
|
||||
{ "Icirc", 206 },
|
||||
{ "Iuml", 207 },
|
||||
{ "ETH", 208 },
|
||||
{ "Ntilde", 209 },
|
||||
{ "Ograve", 210 },
|
||||
{ "Oacute", 211 },
|
||||
{ "Ocirc", 212 },
|
||||
{ "Otilde", 213 },
|
||||
{ "Ouml", 214 },
|
||||
{ "times", 215 },
|
||||
{ "Oslash", 216 },
|
||||
{ "Ugrave", 217 },
|
||||
{ "Uacute", 218 },
|
||||
{ "Ucirc", 219 },
|
||||
{ "Uuml", 220 },
|
||||
{ "Yacute", 221 },
|
||||
{ "THORN", 222 },
|
||||
{ "szlig", 223 },
|
||||
{ "agrave", 224 },
|
||||
{ "aacute", 225 },
|
||||
{ "acirc", 226 },
|
||||
{ "atilde", 227 },
|
||||
{ "auml", 228 },
|
||||
{ "aring", 229 },
|
||||
{ "aelig", 230 },
|
||||
{ "ccedil", 231 },
|
||||
{ "egrave", 232 },
|
||||
{ "eacute", 233 },
|
||||
{ "ecirc", 234 },
|
||||
{ "euml", 235 },
|
||||
{ "igrave", 236 },
|
||||
{ "iacute", 237 },
|
||||
{ "icirc", 238 },
|
||||
{ "iuml", 239 },
|
||||
{ "eth", 240 },
|
||||
{ "ntilde", 241 },
|
||||
{ "ograve", 242 },
|
||||
{ "oacute", 243 },
|
||||
{ "ocirc", 244 },
|
||||
{ "otilde", 245 },
|
||||
{ "ouml", 246 },
|
||||
{ "divide", 247 },
|
||||
{ "oslash", 248 },
|
||||
{ "ugrave", 249 },
|
||||
{ "uacute", 250 },
|
||||
{ "ucirc", 251 },
|
||||
{ "uuml", 252 },
|
||||
{ "yacute", 253 },
|
||||
{ "thorn", 254 },
|
||||
{ "yuml", 255 },
|
||||
{ "OElig", 338 },
|
||||
{ "oelig", 339 },
|
||||
{ "Scaron", 352 },
|
||||
{ "scaron", 353 },
|
||||
{ "Yuml", 376 },
|
||||
{ "fnof", 402 },
|
||||
{ "circ", 710 },
|
||||
{ "tilde", 732 },
|
||||
{ "Alpha", 913 },
|
||||
{ "Beta", 914 },
|
||||
{ "Gamma", 915 },
|
||||
{ "Delta", 916 },
|
||||
{ "Epsilon", 917 },
|
||||
{ "Zeta", 918 },
|
||||
{ "Eta", 919 },
|
||||
{ "Theta", 920 },
|
||||
{ "Iota", 921 },
|
||||
{ "Kappa", 922 },
|
||||
{ "Lambda", 923 },
|
||||
{ "Mu", 924 },
|
||||
{ "Nu", 925 },
|
||||
{ "Xi", 926 },
|
||||
{ "Omicron", 927 },
|
||||
{ "Pi", 928 },
|
||||
{ "Rho", 929 },
|
||||
{ "Sigma", 931 },
|
||||
{ "Tau", 932 },
|
||||
{ "Upsilon", 933 },
|
||||
{ "Phi", 934 },
|
||||
{ "Chi", 935 },
|
||||
{ "Psi", 936 },
|
||||
{ "Omega", 937 },
|
||||
{ "alpha", 945 },
|
||||
{ "beta", 946 },
|
||||
{ "gamma", 947 },
|
||||
{ "delta", 948 },
|
||||
{ "epsilon", 949 },
|
||||
{ "zeta", 950 },
|
||||
{ "eta", 951 },
|
||||
{ "theta", 952 },
|
||||
{ "iota", 953 },
|
||||
{ "kappa", 954 },
|
||||
{ "lambda", 955 },
|
||||
{ "mu", 956 },
|
||||
{ "nu", 957 },
|
||||
{ "xi", 958 },
|
||||
{ "omicron", 959 },
|
||||
{ "pi", 960 },
|
||||
{ "rho", 961 },
|
||||
{ "sigmaf", 962 },
|
||||
{ "sigma", 963 },
|
||||
{ "tau", 964 },
|
||||
{ "upsilon", 965 },
|
||||
{ "phi", 966 },
|
||||
{ "chi", 967 },
|
||||
{ "psi", 968 },
|
||||
{ "omega", 969 },
|
||||
{ "thetasym", 977 },
|
||||
{ "upsih", 978 },
|
||||
{ "piv", 982 },
|
||||
{ "ensp", 8194 },
|
||||
{ "emsp", 8195 },
|
||||
{ "thinsp", 8201 },
|
||||
{ "zwnj", 8204 },
|
||||
{ "zwj", 8205 },
|
||||
{ "lrm", 8206 },
|
||||
{ "rlm", 8207 },
|
||||
{ "ndash", 8211 },
|
||||
{ "mdash", 8212 },
|
||||
{ "lsquo", 8216 },
|
||||
{ "rsquo", 8217 },
|
||||
{ "sbquo", 8218 },
|
||||
{ "ldquo", 8220 },
|
||||
{ "rdquo", 8221 },
|
||||
{ "bdquo", 8222 },
|
||||
{ "dagger", 8224 },
|
||||
{ "Dagger", 8225 },
|
||||
{ "bull", 8226 },
|
||||
{ "hellip", 8230 },
|
||||
{ "permil", 8240 },
|
||||
{ "prime", 8242 },
|
||||
{ "Prime", 8243 },
|
||||
{ "lsaquo", 8249 },
|
||||
{ "rsaquo", 8250 },
|
||||
{ "oline", 8254 },
|
||||
{ "frasl", 8260 },
|
||||
{ "euro", 8364 },
|
||||
{ "image", 8465 },
|
||||
{ "weierp", 8472 },
|
||||
{ "real", 8476 },
|
||||
{ "trade", 8482 },
|
||||
{ "alefsym", 8501 },
|
||||
{ "larr", 8592 },
|
||||
{ "uarr", 8593 },
|
||||
{ "rarr", 8594 },
|
||||
{ "darr", 8595 },
|
||||
{ "harr", 8596 },
|
||||
{ "crarr", 8629 },
|
||||
{ "lArr", 8656 },
|
||||
{ "uArr", 8657 },
|
||||
{ "rArr", 8658 },
|
||||
{ "dArr", 8659 },
|
||||
{ "hArr", 8660 },
|
||||
{ "forall", 8704 },
|
||||
{ "part", 8706 },
|
||||
{ "exist", 8707 },
|
||||
{ "empty", 8709 },
|
||||
{ "nabla", 8711 },
|
||||
{ "isin", 8712 },
|
||||
{ "notin", 8713 },
|
||||
{ "ni", 8715 },
|
||||
{ "prod", 8719 },
|
||||
{ "sum", 8721 },
|
||||
{ "minus", 8722 },
|
||||
{ "lowast", 8727 },
|
||||
{ "radic", 8730 },
|
||||
{ "prop", 8733 },
|
||||
{ "infin", 8734 },
|
||||
{ "ang", 8736 },
|
||||
{ "and", 8743 },
|
||||
{ "or", 8744 },
|
||||
{ "cap", 8745 },
|
||||
{ "cup", 8746 },
|
||||
{ "int", 8747 },
|
||||
{ "there4", 8756 },
|
||||
{ "sim", 8764 },
|
||||
{ "cong", 8773 },
|
||||
{ "asymp", 8776 },
|
||||
{ "ne", 8800 },
|
||||
{ "equiv", 8801 },
|
||||
{ "le", 8804 },
|
||||
{ "ge", 8805 },
|
||||
{ "sub", 8834 },
|
||||
{ "sup", 8835 },
|
||||
{ "nsub", 8836 },
|
||||
{ "sube", 8838 },
|
||||
{ "supe", 8839 },
|
||||
{ "oplus", 8853 },
|
||||
{ "otimes", 8855 },
|
||||
{ "perp", 8869 },
|
||||
{ "sdot", 8901 },
|
||||
{ "lceil", 8968 },
|
||||
{ "rceil", 8969 },
|
||||
{ "lfloor", 8970 },
|
||||
{ "rfloor", 8971 },
|
||||
{ "lang", 9001 },
|
||||
{ "rang", 9002 },
|
||||
{ "loz", 9674 },
|
||||
{ "spades", 9824 },
|
||||
{ "clubs", 9827 },
|
||||
{ "hearts", 9829 },
|
||||
{ "diams", 9830 },
|
||||
|
||||
#endif // OMEGA_INCLUDED_NAMEDENTITIES_H
|
@ -1,231 +0,0 @@
|
||||
/*
|
||||
* Copyright 2011 Emmanuel Engelhart <kelson@kiwix.org>
|
||||
*
|
||||
* This program is free software; you can redistribute it and/or modify
|
||||
* it under the terms of the GNU General Public License as published by
|
||||
* the Free Software Foundation; either version 3 of the License, or
|
||||
* any later version.
|
||||
*
|
||||
* This program is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
* GNU General Public License for more details.
|
||||
*
|
||||
* You should have received a copy of the GNU General Public License
|
||||
* along with this program; if not, write to the Free Software
|
||||
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston,
|
||||
* MA 02110-1301, USA.
|
||||
*/
|
||||
|
||||
#include "xapianSearcher.h"
|
||||
#include <sys/types.h>
|
||||
#include <unicode/locid.h>
|
||||
#ifndef _WIN32
|
||||
# include <unistd.h>
|
||||
#endif
|
||||
#include <zim/article.h>
|
||||
#include <zim/error.h>
|
||||
#include <zim/file.h>
|
||||
#include <zim/zim.h>
|
||||
#include "xapian/myhtmlparse.h"
|
||||
|
||||
#include <vector>
|
||||
|
||||
namespace kiwix
|
||||
{
|
||||
std::map<std::string, int> read_valuesmap(const std::string& s)
|
||||
{
|
||||
std::map<std::string, int> result;
|
||||
std::vector<std::string> elems = split(s, ";");
|
||||
for (std::vector<std::string>::iterator elem = elems.begin();
|
||||
elem != elems.end();
|
||||
elem++) {
|
||||
std::vector<std::string> tmp_elems = split(*elem, ":");
|
||||
result.insert(
|
||||
std::pair<std::string, int>(tmp_elems[0], atoi(tmp_elems[1].c_str())));
|
||||
}
|
||||
return result;
|
||||
}
|
||||
|
||||
/* Constructor */
|
||||
XapianSearcher::XapianSearcher(const string& xapianDirectoryPath,
|
||||
Reader* reader)
|
||||
: reader(reader)
|
||||
{
|
||||
this->openIndex(xapianDirectoryPath);
|
||||
}
|
||||
|
||||
/* Open Xapian readable database */
|
||||
void XapianSearcher::openIndex(const string& directoryPath)
|
||||
{
|
||||
this->readableDatabase = Xapian::Database(directoryPath);
|
||||
this->valuesmap
|
||||
= read_valuesmap(this->readableDatabase.get_metadata("valuesmap"));
|
||||
this->language = this->readableDatabase.get_metadata("language");
|
||||
this->stopwords = this->readableDatabase.get_metadata("stopwords");
|
||||
setup_queryParser();
|
||||
}
|
||||
|
||||
/* Close Xapian writable database */
|
||||
void XapianSearcher::closeIndex()
|
||||
{
|
||||
return;
|
||||
}
|
||||
void XapianSearcher::setup_queryParser()
|
||||
{
|
||||
queryParser.set_database(readableDatabase);
|
||||
if (!language.empty()) {
|
||||
/* Build ICU Local object to retrieve ISO-639 language code (from
|
||||
ISO-639-3) */
|
||||
icu::Locale languageLocale(language.c_str());
|
||||
|
||||
/* Configuring language base steemming */
|
||||
try {
|
||||
stemmer = Xapian::Stem(languageLocale.getLanguage());
|
||||
queryParser.set_stemmer(stemmer);
|
||||
queryParser.set_stemming_strategy(Xapian::QueryParser::STEM_ALL);
|
||||
} catch (...) {
|
||||
std::cout << "No steemming for language '" << languageLocale.getLanguage()
|
||||
<< "'" << std::endl;
|
||||
}
|
||||
}
|
||||
|
||||
if (!stopwords.empty()) {
|
||||
std::string stopWord;
|
||||
std::istringstream file(this->stopwords);
|
||||
while (std::getline(file, stopWord, '\n')) {
|
||||
this->stopper.add(stopWord);
|
||||
}
|
||||
queryParser.set_stopper(&(this->stopper));
|
||||
}
|
||||
}
|
||||
|
||||
/* Search strings in the database */
|
||||
void XapianSearcher::searchInIndex(string& search,
|
||||
const unsigned int resultStart,
|
||||
const unsigned int resultEnd,
|
||||
const bool verbose)
|
||||
{
|
||||
/* Create the query */
|
||||
Xapian::Query query = queryParser.parse_query(search);
|
||||
|
||||
/* Create the enquire object */
|
||||
Xapian::Enquire enquire(this->readableDatabase);
|
||||
enquire.set_query(query);
|
||||
|
||||
/* Get the results */
|
||||
this->results = enquire.get_mset(resultStart, resultEnd - resultStart);
|
||||
this->current_result = this->results.begin();
|
||||
}
|
||||
|
||||
/* Get next result */
|
||||
Result* XapianSearcher::getNextResult()
|
||||
{
|
||||
if (this->current_result != this->results.end()) {
|
||||
XapianResult* result = new XapianResult(this, this->current_result);
|
||||
this->current_result++;
|
||||
return result;
|
||||
}
|
||||
return NULL;
|
||||
}
|
||||
|
||||
void XapianSearcher::restart_search()
|
||||
{
|
||||
this->current_result = this->results.begin();
|
||||
}
|
||||
|
||||
XapianResult::XapianResult(XapianSearcher* searcher,
|
||||
Xapian::MSetIterator& iterator)
|
||||
: searcher(searcher), iterator(iterator), document(iterator.get_document())
|
||||
{
|
||||
}
|
||||
|
||||
std::string XapianResult::get_url()
|
||||
{
|
||||
return document.get_data();
|
||||
}
|
||||
std::string XapianResult::get_title()
|
||||
{
|
||||
if (searcher->valuesmap.empty()) {
|
||||
/* This is the old legacy version. Guess and try */
|
||||
return document.get_value(0);
|
||||
} else if (searcher->valuesmap.find("title") != searcher->valuesmap.end()) {
|
||||
return document.get_value(searcher->valuesmap["title"]);
|
||||
}
|
||||
return "";
|
||||
}
|
||||
|
||||
int XapianResult::get_score()
|
||||
{
|
||||
return iterator.get_percent();
|
||||
}
|
||||
std::string XapianResult::get_snippet()
|
||||
{
|
||||
if (searcher->valuesmap.empty()) {
|
||||
/* This is the old legacy version. Guess and try */
|
||||
std::string stored_snippet = document.get_value(1);
|
||||
if (!stored_snippet.empty()) {
|
||||
return stored_snippet;
|
||||
}
|
||||
/* Let's continue here, and see if we can genenate one */
|
||||
} else if (searcher->valuesmap.find("snippet") != searcher->valuesmap.end()) {
|
||||
return document.get_value(searcher->valuesmap["snippet"]);
|
||||
}
|
||||
/* No reader, no snippet */
|
||||
if (!searcher->reader) {
|
||||
return "";
|
||||
}
|
||||
/* Get the content of the article to generate a snippet.
|
||||
We parse it and use the html dump to avoid remove html tags in the
|
||||
content and be able to nicely cut the text at random place. */
|
||||
MyHtmlParser htmlParser;
|
||||
std::string content = get_content();
|
||||
if (content.empty()) {
|
||||
return content;
|
||||
}
|
||||
try {
|
||||
htmlParser.parse_html(content, "UTF-8", true);
|
||||
} catch (...) {
|
||||
}
|
||||
return searcher->results.snippet(htmlParser.dump, 500);
|
||||
}
|
||||
|
||||
std::string XapianResult::get_content()
|
||||
{
|
||||
if (!searcher->reader) {
|
||||
return "";
|
||||
}
|
||||
auto entry = searcher->reader->getEntryFromEncodedPath(get_url());
|
||||
return entry.getContent();
|
||||
}
|
||||
|
||||
int XapianResult::get_size()
|
||||
{
|
||||
if (searcher->valuesmap.empty()) {
|
||||
/* This is the old legacy version. Guess and try */
|
||||
return document.get_value(2).empty() == true
|
||||
? -1
|
||||
: atoi(document.get_value(2).c_str());
|
||||
} else if (searcher->valuesmap.find("size") != searcher->valuesmap.end()) {
|
||||
return atoi(document.get_value(searcher->valuesmap["size"]).c_str());
|
||||
}
|
||||
/* The size is never used. Do we really want to get the content and
|
||||
calculate the size ? */
|
||||
return -1;
|
||||
}
|
||||
|
||||
int XapianResult::get_wordCount()
|
||||
{
|
||||
if (searcher->valuesmap.empty()) {
|
||||
/* This is the old legacy version. Guess and try */
|
||||
return document.get_value(3).empty() == true
|
||||
? -1
|
||||
: atoi(document.get_value(3).c_str());
|
||||
} else if (searcher->valuesmap.find("wordcount")
|
||||
!= searcher->valuesmap.end()) {
|
||||
return atoi(document.get_value(searcher->valuesmap["wordcount"]).c_str());
|
||||
}
|
||||
return -1;
|
||||
}
|
||||
|
||||
} // Kiwix namespace
|
Reference in New Issue
Block a user