mirror of https://github.com/kiwix/libkiwix.git
Merge pull request #190 from kiwix/no_external_index
Remove support for external index.
This commit is contained in:
commit
c73ac9f2cd
|
@ -35,8 +35,6 @@ libraries need to be available:
|
||||||
(package libpugixml-dev on Ubuntu)
|
(package libpugixml-dev on Ubuntu)
|
||||||
* ctpp2 ........................................ http://ctpp.havoc.ru/
|
* ctpp2 ........................................ http://ctpp.havoc.ru/
|
||||||
(package libctpp2-dev on Ubuntu)
|
(package libctpp2-dev on Ubuntu)
|
||||||
* Xapian ......................................... https://xapian.org/
|
|
||||||
(package libxapian-dev on Ubuntu)
|
|
||||||
* libaria2 .................................. https://aria2.github.io/
|
* libaria2 .................................. https://aria2.github.io/
|
||||||
(no package on Ubuntu)
|
(no package on Ubuntu)
|
||||||
|
|
||||||
|
|
|
@ -28,7 +28,6 @@ class xml_node;
|
||||||
|
|
||||||
namespace kiwix
|
namespace kiwix
|
||||||
{
|
{
|
||||||
enum supportedIndexType { UNKNOWN, XAPIAN };
|
|
||||||
|
|
||||||
class OPDSDumper;
|
class OPDSDumper;
|
||||||
class Reader;
|
class Reader;
|
||||||
|
@ -52,8 +51,6 @@ class Book
|
||||||
const std::string& getId() const { return m_id; }
|
const std::string& getId() const { return m_id; }
|
||||||
const std::string& getPath() const { return m_path; }
|
const std::string& getPath() const { return m_path; }
|
||||||
bool isPathValid() const { return m_pathValid; }
|
bool isPathValid() const { return m_pathValid; }
|
||||||
const std::string& getIndexPath() const { return m_indexPath; }
|
|
||||||
const supportedIndexType& getIndexType() const { return m_indexType; }
|
|
||||||
const std::string& getTitle() const { return m_title; }
|
const std::string& getTitle() const { return m_title; }
|
||||||
const std::string& getDescription() const { return m_description; }
|
const std::string& getDescription() const { return m_description; }
|
||||||
const std::string& getLanguage() const { return m_language; }
|
const std::string& getLanguage() const { return m_language; }
|
||||||
|
@ -76,8 +73,6 @@ class Book
|
||||||
void setId(const std::string& id) { m_id = id; }
|
void setId(const std::string& id) { m_id = id; }
|
||||||
void setPath(const std::string& path);
|
void setPath(const std::string& path);
|
||||||
void setPathValid(bool valid) { m_pathValid = valid; }
|
void setPathValid(bool valid) { m_pathValid = valid; }
|
||||||
void setIndexPath(const std::string& indexPath);
|
|
||||||
void setIndexType(supportedIndexType indexType) { m_indexType = indexType;}
|
|
||||||
void setTitle(const std::string& title) { m_title = title; }
|
void setTitle(const std::string& title) { m_title = title; }
|
||||||
void setDescription(const std::string& description) { m_description = description; }
|
void setDescription(const std::string& description) { m_description = description; }
|
||||||
void setLanguage(const std::string& language) { m_language = language; }
|
void setLanguage(const std::string& language) { m_language = language; }
|
||||||
|
@ -100,8 +95,6 @@ class Book
|
||||||
std::string m_downloadId;
|
std::string m_downloadId;
|
||||||
std::string m_path;
|
std::string m_path;
|
||||||
bool m_pathValid;
|
bool m_pathValid;
|
||||||
std::string m_indexPath;
|
|
||||||
supportedIndexType m_indexType;
|
|
||||||
std::string m_title;
|
std::string m_title;
|
||||||
std::string m_description;
|
std::string m_description;
|
||||||
std::string m_language;
|
std::string m_language;
|
||||||
|
|
|
@ -12,10 +12,6 @@ headers = [
|
||||||
'searcher.h'
|
'searcher.h'
|
||||||
]
|
]
|
||||||
|
|
||||||
if xapian_dep.found()
|
|
||||||
headers += ['xapianSearcher.h']
|
|
||||||
endif
|
|
||||||
|
|
||||||
install_headers(headers, subdir:'kiwix')
|
install_headers(headers, subdir:'kiwix')
|
||||||
|
|
||||||
install_headers(
|
install_headers(
|
||||||
|
|
|
@ -57,24 +57,7 @@ struct SearcherInternal;
|
||||||
* The Searcher class is reponsible to do different kind of search using the
|
* The Searcher class is reponsible to do different kind of search using the
|
||||||
* fulltext index.
|
* fulltext index.
|
||||||
*
|
*
|
||||||
* Historically, there are two kind of fulltext index :
|
* Searcher may (if compiled with ctpp2) be used to
|
||||||
* - The legacy one, is the external fulltext index. A directory stored outside
|
|
||||||
* of the zim file.
|
|
||||||
* - The new one, a embedded fulltext index in the zim file.
|
|
||||||
*
|
|
||||||
* Legacy external fulltext index has to be considered as obsolet format with
|
|
||||||
* less functionnalities:
|
|
||||||
* - No multi zim search ;
|
|
||||||
* - No geo_search ;
|
|
||||||
* - No suggestions search ;
|
|
||||||
*
|
|
||||||
* To reflect this, there is two Search creation "API":
|
|
||||||
* - One for the external fulltext index, using the constructor taking a
|
|
||||||
* xapianDirectoryPath) ;
|
|
||||||
* - One for the embedded fulltext index, using a "empty" constructor and the
|
|
||||||
* `add_reader` method".
|
|
||||||
*
|
|
||||||
* On top of that, the Searcher may (if compiled with ctpp2) be used to
|
|
||||||
* generate a html page for the search result. This use a template that need a
|
* generate a html page for the search result. This use a template that need a
|
||||||
* humanReaderName. This feature is only used by kiwix-serve and this should be
|
* humanReaderName. This feature is only used by kiwix-serve and this should be
|
||||||
* move outside of Searcher (and with a better API). If you don't use the html
|
* move outside of Searcher (and with a better API). If you don't use the html
|
||||||
|
@ -92,18 +75,6 @@ class Searcher
|
||||||
*/
|
*/
|
||||||
Searcher(const string& humanReadableName = "");
|
Searcher(const string& humanReadableName = "");
|
||||||
|
|
||||||
/**
|
|
||||||
* The constructor for legacy external fulltext index.
|
|
||||||
*
|
|
||||||
* @param xapianDirectoryPath The path to the external index directory.
|
|
||||||
* @param reader The reader associated to the external index.
|
|
||||||
* It will be used retrive the article content or generate
|
|
||||||
* the snippet.
|
|
||||||
* @param humanReadableName The humanReadableName for the zim.
|
|
||||||
*/
|
|
||||||
Searcher(const string& xapianDirectoryPath,
|
|
||||||
Reader* reader,
|
|
||||||
const string& humanReadableName);
|
|
||||||
~Searcher();
|
~Searcher();
|
||||||
|
|
||||||
/**
|
/**
|
||||||
|
|
|
@ -1,98 +0,0 @@
|
||||||
/*
|
|
||||||
* Copyright 2011 Emmanuel Engelhart <kelson@kiwix.org>
|
|
||||||
*
|
|
||||||
* This program is free software; you can redistribute it and/or modify
|
|
||||||
* it under the terms of the GNU General Public License as published by
|
|
||||||
* the Free Software Foundation; either version 3 of the License, or
|
|
||||||
* any later version.
|
|
||||||
*
|
|
||||||
* This program is distributed in the hope that it will be useful,
|
|
||||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
|
||||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
|
||||||
* GNU General Public License for more details.
|
|
||||||
*
|
|
||||||
* You should have received a copy of the GNU General Public License
|
|
||||||
* along with this program; if not, write to the Free Software
|
|
||||||
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston,
|
|
||||||
* MA 02110-1301, USA.
|
|
||||||
*/
|
|
||||||
|
|
||||||
#ifndef KIWIX_XAPIAN_SEARCHER_H
|
|
||||||
#define KIWIX_XAPIAN_SEARCHER_H
|
|
||||||
|
|
||||||
#include <xapian.h>
|
|
||||||
#include "reader.h"
|
|
||||||
#include "searcher.h"
|
|
||||||
|
|
||||||
#include <map>
|
|
||||||
#include <string>
|
|
||||||
|
|
||||||
using namespace std;
|
|
||||||
|
|
||||||
namespace kiwix
|
|
||||||
{
|
|
||||||
class XapianSearcher;
|
|
||||||
|
|
||||||
class XapianResult : public Result
|
|
||||||
{
|
|
||||||
public:
|
|
||||||
XapianResult(XapianSearcher* searcher, Xapian::MSetIterator& iterator);
|
|
||||||
virtual ~XapianResult(){};
|
|
||||||
|
|
||||||
virtual std::string get_url();
|
|
||||||
virtual std::string get_title();
|
|
||||||
virtual int get_score();
|
|
||||||
virtual std::string get_snippet();
|
|
||||||
virtual std::string get_content();
|
|
||||||
virtual int get_wordCount();
|
|
||||||
virtual int get_size();
|
|
||||||
virtual int get_readerIndex() { return 0; };
|
|
||||||
|
|
||||||
private:
|
|
||||||
XapianSearcher* searcher;
|
|
||||||
Xapian::MSetIterator iterator;
|
|
||||||
Xapian::Document document;
|
|
||||||
};
|
|
||||||
|
|
||||||
class NoXapianIndexInZim : public exception
|
|
||||||
{
|
|
||||||
virtual const char* what() const throw()
|
|
||||||
{
|
|
||||||
return "There is no fulltext index in the zim file";
|
|
||||||
}
|
|
||||||
};
|
|
||||||
|
|
||||||
class XapianSearcher
|
|
||||||
{
|
|
||||||
friend class XapianResult;
|
|
||||||
|
|
||||||
public:
|
|
||||||
XapianSearcher(const string& xapianDirectoryPath, Reader* reader);
|
|
||||||
virtual ~XapianSearcher(){};
|
|
||||||
void searchInIndex(string& search,
|
|
||||||
const unsigned int resultStart,
|
|
||||||
const unsigned int resultEnd,
|
|
||||||
const bool verbose = false);
|
|
||||||
virtual Result* getNextResult();
|
|
||||||
void restart_search();
|
|
||||||
|
|
||||||
Xapian::MSet results;
|
|
||||||
|
|
||||||
protected:
|
|
||||||
void closeIndex();
|
|
||||||
void openIndex(const string& xapianDirectoryPath);
|
|
||||||
void setup_queryParser();
|
|
||||||
|
|
||||||
Reader* reader;
|
|
||||||
Xapian::Database readableDatabase;
|
|
||||||
std::string language;
|
|
||||||
std::string stopwords;
|
|
||||||
Xapian::QueryParser queryParser;
|
|
||||||
Xapian::Stem stemmer;
|
|
||||||
Xapian::SimpleStopper stopper;
|
|
||||||
Xapian::MSetIterator current_result;
|
|
||||||
std::map<std::string, int> valuesmap;
|
|
||||||
};
|
|
||||||
}
|
|
||||||
|
|
||||||
#endif
|
|
|
@ -84,9 +84,7 @@ else
|
||||||
endif
|
endif
|
||||||
endif
|
endif
|
||||||
|
|
||||||
xapian_dep = dependency('xapian-core', required:false, static:static_deps)
|
all_deps = [thread_dep, libicu_dep, libzim_dep, pugixml_dep, libcurl_dep]
|
||||||
|
|
||||||
all_deps = [thread_dep, libicu_dep, libzim_dep, xapian_dep, pugixml_dep, libcurl_dep]
|
|
||||||
if has_ctpp2_dep
|
if has_ctpp2_dep
|
||||||
all_deps += [ctpp2_dep]
|
all_deps += [ctpp2_dep]
|
||||||
endif
|
endif
|
||||||
|
@ -110,9 +108,6 @@ subdir('src')
|
||||||
subdir('test')
|
subdir('test')
|
||||||
|
|
||||||
pkg_requires = ['libzim', 'icu-i18n', 'pugixml', 'libcurl']
|
pkg_requires = ['libzim', 'icu-i18n', 'pugixml', 'libcurl']
|
||||||
if xapian_dep.found()
|
|
||||||
pkg_requires += ['xapian-core']
|
|
||||||
endif
|
|
||||||
|
|
||||||
if has_ctpp2_dep
|
if has_ctpp2_dep
|
||||||
extra_libs += ctpp2_link_args
|
extra_libs += ctpp2_link_args
|
||||||
|
|
20
src/book.cpp
20
src/book.cpp
|
@ -60,11 +60,6 @@ bool Book::update(const kiwix::Book& other)
|
||||||
m_name = other.m_name;
|
m_name = other.m_name;
|
||||||
}
|
}
|
||||||
|
|
||||||
if (m_indexPath.empty()) {
|
|
||||||
m_indexPath = other.m_indexPath;
|
|
||||||
m_indexType = other.m_indexType;
|
|
||||||
}
|
|
||||||
|
|
||||||
if (m_faviconMimeType.empty()) {
|
if (m_faviconMimeType.empty()) {
|
||||||
m_favicon = other.m_favicon;
|
m_favicon = other.m_favicon;
|
||||||
m_faviconMimeType = other.m_faviconMimeType;
|
m_faviconMimeType = other.m_faviconMimeType;
|
||||||
|
@ -101,14 +96,6 @@ void Book::updateFromXml(const pugi::xml_node& node, const std::string& baseDir)
|
||||||
path = computeAbsolutePath(baseDir, path);
|
path = computeAbsolutePath(baseDir, path);
|
||||||
}
|
}
|
||||||
m_path = path;
|
m_path = path;
|
||||||
path = ATTR("indexPath");
|
|
||||||
if (!path.empty()) {
|
|
||||||
if (isRelativePath(path)) {
|
|
||||||
path = computeAbsolutePath(baseDir, path);
|
|
||||||
}
|
|
||||||
m_indexPath = path;
|
|
||||||
m_indexType = XAPIAN;
|
|
||||||
}
|
|
||||||
m_title = ATTR("title");
|
m_title = ATTR("title");
|
||||||
m_name = ATTR("name");
|
m_name = ATTR("name");
|
||||||
m_tags = ATTR("tags");
|
m_tags = ATTR("tags");
|
||||||
|
@ -194,13 +181,6 @@ void Book::setPath(const std::string& path)
|
||||||
: path;
|
: path;
|
||||||
}
|
}
|
||||||
|
|
||||||
void Book::setIndexPath(const std::string& indexPath)
|
|
||||||
{
|
|
||||||
m_indexPath = isRelativePath(indexPath)
|
|
||||||
? computeAbsolutePath(getCurrentDirectory(), indexPath)
|
|
||||||
: indexPath;
|
|
||||||
}
|
|
||||||
|
|
||||||
const std::string& Book::getFavicon() const {
|
const std::string& Book::getFavicon() const {
|
||||||
if (m_favicon.empty() && !m_faviconUrl.empty()) {
|
if (m_favicon.empty() && !m_faviconUrl.empty()) {
|
||||||
try {
|
try {
|
||||||
|
|
|
@ -50,11 +50,6 @@ void LibXMLDumper::handleBook(Book book, pugi::xml_node root_node) {
|
||||||
ADD_ATTRIBUTE(entry_node, "path", computeRelativePath(baseDir, book.getPath()));
|
ADD_ATTRIBUTE(entry_node, "path", computeRelativePath(baseDir, book.getPath()));
|
||||||
}
|
}
|
||||||
|
|
||||||
if (!book.getIndexPath().empty()) {
|
|
||||||
ADD_ATTRIBUTE(entry_node, "indexPath", computeRelativePath(baseDir, book.getIndexPath()));
|
|
||||||
entry_node.append_attribute("indexType") = "xapian";
|
|
||||||
}
|
|
||||||
|
|
||||||
if (book.getOrigId().empty()) {
|
if (book.getOrigId().empty()) {
|
||||||
ADD_ATTR_NOT_EMPTY(entry_node, "title", book.getTitle());
|
ADD_ATTR_NOT_EMPTY(entry_node, "title", book.getTitle());
|
||||||
ADD_ATTR_NOT_EMPTY(entry_node, "name", book.getName());
|
ADD_ATTR_NOT_EMPTY(entry_node, "name", book.getName());
|
||||||
|
|
|
@ -17,8 +17,6 @@ kiwix_sources = [
|
||||||
'common/stringTools.cpp',
|
'common/stringTools.cpp',
|
||||||
'common/networkTools.cpp',
|
'common/networkTools.cpp',
|
||||||
'common/otherTools.cpp',
|
'common/otherTools.cpp',
|
||||||
'xapian/htmlparse.cc',
|
|
||||||
'xapian/myhtmlparse.cc'
|
|
||||||
]
|
]
|
||||||
kiwix_sources += lib_resources
|
kiwix_sources += lib_resources
|
||||||
|
|
||||||
|
@ -28,10 +26,6 @@ else
|
||||||
kiwix_sources += 'subprocess_unix.cpp'
|
kiwix_sources += 'subprocess_unix.cpp'
|
||||||
endif
|
endif
|
||||||
|
|
||||||
if xapian_dep.found()
|
|
||||||
kiwix_sources += ['xapianSearcher.cpp']
|
|
||||||
endif
|
|
||||||
|
|
||||||
if get_option('android')
|
if get_option('android')
|
||||||
subdir('android')
|
subdir('android')
|
||||||
install_dir = 'kiwix-lib/jniLibs/' + meson.get_cross_property('android_abi')
|
install_dir = 'kiwix-lib/jniLibs/' + meson.get_cross_property('android_abi')
|
||||||
|
|
|
@ -22,7 +22,6 @@
|
||||||
|
|
||||||
#include "searcher.h"
|
#include "searcher.h"
|
||||||
#include "reader.h"
|
#include "reader.h"
|
||||||
#include "xapianSearcher.h"
|
|
||||||
|
|
||||||
#include <zim/search.h>
|
#include <zim/search.h>
|
||||||
|
|
||||||
|
@ -61,42 +60,18 @@ class _Result : public Result
|
||||||
|
|
||||||
struct SearcherInternal {
|
struct SearcherInternal {
|
||||||
const zim::Search* _search;
|
const zim::Search* _search;
|
||||||
XapianSearcher* _xapianSearcher;
|
|
||||||
zim::Search::iterator current_iterator;
|
zim::Search::iterator current_iterator;
|
||||||
|
|
||||||
SearcherInternal() : _search(NULL), _xapianSearcher(NULL) {}
|
SearcherInternal() : _search(NULL) {}
|
||||||
~SearcherInternal()
|
~SearcherInternal()
|
||||||
{
|
{
|
||||||
if (_search != NULL) {
|
if (_search != NULL) {
|
||||||
delete _search;
|
delete _search;
|
||||||
}
|
}
|
||||||
if (_xapianSearcher != NULL) {
|
|
||||||
delete _xapianSearcher;
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
};
|
};
|
||||||
|
|
||||||
/* Constructor */
|
/* Constructor */
|
||||||
Searcher::Searcher(const string& xapianDirectoryPath,
|
|
||||||
Reader* reader,
|
|
||||||
const string& humanReadableName)
|
|
||||||
: internal(new SearcherInternal()),
|
|
||||||
searchPattern(""),
|
|
||||||
protocolPrefix("zim://"),
|
|
||||||
searchProtocolPrefix("search://?"),
|
|
||||||
resultCountPerPage(0),
|
|
||||||
estimatedResultCount(0),
|
|
||||||
resultStart(0),
|
|
||||||
resultEnd(0),
|
|
||||||
contentHumanReadableId(humanReadableName)
|
|
||||||
{
|
|
||||||
loadICUExternalTables();
|
|
||||||
if (!reader || !reader->hasFulltextIndex()) {
|
|
||||||
internal->_xapianSearcher = new XapianSearcher(xapianDirectoryPath, reader);
|
|
||||||
}
|
|
||||||
this->humanReaderNames.push_back(humanReadableName);
|
|
||||||
}
|
|
||||||
|
|
||||||
Searcher::Searcher(const std::string& humanReadableName)
|
Searcher::Searcher(const std::string& humanReadableName)
|
||||||
: internal(new SearcherInternal()),
|
: internal(new SearcherInternal()),
|
||||||
searchPattern(""),
|
searchPattern(""),
|
||||||
|
@ -160,12 +135,6 @@ void Searcher::search(std::string& search,
|
||||||
this->resultStart = resultStart;
|
this->resultStart = resultStart;
|
||||||
this->resultEnd = resultEnd;
|
this->resultEnd = resultEnd;
|
||||||
string unaccentedSearch = removeAccents(search);
|
string unaccentedSearch = removeAccents(search);
|
||||||
if (internal->_xapianSearcher) {
|
|
||||||
internal->_xapianSearcher->searchInIndex(
|
|
||||||
unaccentedSearch, resultStart, resultEnd, verbose);
|
|
||||||
this->estimatedResultCount
|
|
||||||
= internal->_xapianSearcher->results.get_matches_estimated();
|
|
||||||
} else {
|
|
||||||
std::vector<const zim::File*> zims;
|
std::vector<const zim::File*> zims;
|
||||||
for (auto current = this->readers.begin(); current != this->readers.end();
|
for (auto current = this->readers.begin(); current != this->readers.end();
|
||||||
current++) {
|
current++) {
|
||||||
|
@ -180,7 +149,6 @@ void Searcher::search(std::string& search,
|
||||||
internal->current_iterator = internal->_search->begin();
|
internal->current_iterator = internal->_search->begin();
|
||||||
this->estimatedResultCount = internal->_search->get_matches_estimated();
|
this->estimatedResultCount = internal->_search->get_matches_estimated();
|
||||||
}
|
}
|
||||||
}
|
|
||||||
|
|
||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
|
@ -209,10 +177,6 @@ void Searcher::geo_search(float latitude, float longitude, float distance,
|
||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
|
|
||||||
if (internal->_xapianSearcher) {
|
|
||||||
return;
|
|
||||||
}
|
|
||||||
|
|
||||||
/* Avoid big researches */
|
/* Avoid big researches */
|
||||||
this->resultCountPerPage = resultEnd - resultStart;
|
this->resultCountPerPage = resultEnd - resultStart;
|
||||||
if (this->resultCountPerPage > MAX_SEARCH_LEN) {
|
if (this->resultCountPerPage > MAX_SEARCH_LEN) {
|
||||||
|
@ -244,18 +208,14 @@ void Searcher::geo_search(float latitude, float longitude, float distance,
|
||||||
|
|
||||||
void Searcher::restart_search()
|
void Searcher::restart_search()
|
||||||
{
|
{
|
||||||
if (internal->_xapianSearcher) {
|
if (internal->_search) {
|
||||||
internal->_xapianSearcher->restart_search();
|
|
||||||
} else if (internal->_search) {
|
|
||||||
internal->current_iterator = internal->_search->begin();
|
internal->current_iterator = internal->_search->begin();
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
Result* Searcher::getNextResult()
|
Result* Searcher::getNextResult()
|
||||||
{
|
{
|
||||||
if (internal->_xapianSearcher) {
|
if (internal->_search &&
|
||||||
return internal->_xapianSearcher->getNextResult();
|
|
||||||
} else if (internal->_search &&
|
|
||||||
internal->current_iterator != internal->_search->end()) {
|
internal->current_iterator != internal->_search->end()) {
|
||||||
Result* result = new _Result(internal->current_iterator);
|
Result* result = new _Result(internal->current_iterator);
|
||||||
internal->current_iterator++;
|
internal->current_iterator++;
|
||||||
|
@ -272,24 +232,19 @@ void Searcher::reset()
|
||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
|
|
||||||
void Searcher::suggestions(std::string& search, const bool verbose)
|
void Searcher::suggestions(std::string& searchPattern, const bool verbose)
|
||||||
{
|
{
|
||||||
this->reset();
|
this->reset();
|
||||||
|
|
||||||
if (verbose == true) {
|
if (verbose == true) {
|
||||||
cout << "Performing suggestion query `" << search << "`" << endl;
|
cout << "Performing suggestion query `" << searchPattern << "`" << endl;
|
||||||
}
|
}
|
||||||
|
|
||||||
this->searchPattern = search;
|
this->searchPattern = searchPattern;
|
||||||
this->resultStart = 0;
|
this->resultStart = 0;
|
||||||
this->resultEnd = 10;
|
this->resultEnd = 10;
|
||||||
string unaccentedSearch = removeAccents(search);
|
string unaccentedSearch = removeAccents(searchPattern);
|
||||||
|
|
||||||
if (internal->_xapianSearcher) {
|
|
||||||
/* [TODO] Suggestion on a external database ?
|
|
||||||
* We do not support that. */
|
|
||||||
this->estimatedResultCount = 0;
|
|
||||||
} else {
|
|
||||||
std::vector<const zim::File*> zims;
|
std::vector<const zim::File*> zims;
|
||||||
for (auto current = this->readers.begin(); current != this->readers.end();
|
for (auto current = this->readers.begin(); current != this->readers.end();
|
||||||
current++) {
|
current++) {
|
||||||
|
@ -303,7 +258,6 @@ void Searcher::suggestions(std::string& search, const bool verbose)
|
||||||
internal->current_iterator = internal->_search->begin();
|
internal->current_iterator = internal->_search->begin();
|
||||||
this->estimatedResultCount = internal->_search->get_matches_estimated();
|
this->estimatedResultCount = internal->_search->get_matches_estimated();
|
||||||
}
|
}
|
||||||
}
|
|
||||||
|
|
||||||
/* Return the result count estimation */
|
/* Return the result count estimation */
|
||||||
unsigned int Searcher::getEstimatedResultCount()
|
unsigned int Searcher::getEstimatedResultCount()
|
||||||
|
|
|
@ -1,373 +0,0 @@
|
||||||
/* htmlparse.cc: simple HTML parser for omega indexer
|
|
||||||
*
|
|
||||||
* Copyright 1999,2000,2001 BrightStation PLC
|
|
||||||
* Copyright 2001 Ananova Ltd
|
|
||||||
* Copyright 2002,2006,2007,2008 Olly Betts
|
|
||||||
*
|
|
||||||
* This program is free software; you can redistribute it and/or
|
|
||||||
* modify it under the terms of the GNU General Public License as
|
|
||||||
* published by the Free Software Foundation; either version 2 of the
|
|
||||||
* License, or (at your option) any later version.
|
|
||||||
*
|
|
||||||
* This program is distributed in the hope that it will be useful,
|
|
||||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
|
||||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
|
||||||
* GNU General Public License for more details.
|
|
||||||
*
|
|
||||||
* You should have received a copy of the GNU General Public License
|
|
||||||
* along with this program; if not, write to the Free Software
|
|
||||||
* Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301
|
|
||||||
* USA
|
|
||||||
*/
|
|
||||||
|
|
||||||
// #include <config.h>
|
|
||||||
|
|
||||||
#include "htmlparse.h"
|
|
||||||
|
|
||||||
#include <xapian.h>
|
|
||||||
|
|
||||||
// #include "utf8convert.h"
|
|
||||||
|
|
||||||
#include <algorithm>
|
|
||||||
|
|
||||||
#include <ctype.h>
|
|
||||||
#include <cstring>
|
|
||||||
#include <stdio.h>
|
|
||||||
#include <stdlib.h>
|
|
||||||
|
|
||||||
using namespace std;
|
|
||||||
|
|
||||||
inline void
|
|
||||||
lowercase_string(string &str)
|
|
||||||
{
|
|
||||||
for (string::iterator i = str.begin(); i != str.end(); ++i) {
|
|
||||||
*i = tolower(static_cast<unsigned char>(*i));
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
map<string, unsigned int> HtmlParser::named_ents;
|
|
||||||
|
|
||||||
inline static bool
|
|
||||||
p_notdigit(char c)
|
|
||||||
{
|
|
||||||
return !isdigit(static_cast<unsigned char>(c));
|
|
||||||
}
|
|
||||||
|
|
||||||
inline static bool
|
|
||||||
p_notxdigit(char c)
|
|
||||||
{
|
|
||||||
return !isxdigit(static_cast<unsigned char>(c));
|
|
||||||
}
|
|
||||||
|
|
||||||
inline static bool
|
|
||||||
p_notalnum(char c)
|
|
||||||
{
|
|
||||||
return !isalnum(static_cast<unsigned char>(c));
|
|
||||||
}
|
|
||||||
|
|
||||||
inline static bool
|
|
||||||
p_notwhitespace(char c)
|
|
||||||
{
|
|
||||||
return !isspace(static_cast<unsigned char>(c));
|
|
||||||
}
|
|
||||||
|
|
||||||
inline static bool
|
|
||||||
p_nottag(char c)
|
|
||||||
{
|
|
||||||
return !isalnum(static_cast<unsigned char>(c)) &&
|
|
||||||
c != '.' && c != '-' && c != ':'; // ':' for XML namespaces.
|
|
||||||
}
|
|
||||||
|
|
||||||
inline static bool
|
|
||||||
p_whitespacegt(char c)
|
|
||||||
{
|
|
||||||
return isspace(static_cast<unsigned char>(c)) || c == '>';
|
|
||||||
}
|
|
||||||
|
|
||||||
inline static bool
|
|
||||||
p_whitespaceeqgt(char c)
|
|
||||||
{
|
|
||||||
return isspace(static_cast<unsigned char>(c)) || c == '=' || c == '>';
|
|
||||||
}
|
|
||||||
|
|
||||||
bool
|
|
||||||
HtmlParser::get_parameter(const string & param, string & value)
|
|
||||||
{
|
|
||||||
map<string, string>::const_iterator i = parameters.find(param);
|
|
||||||
if (i == parameters.end()) return false;
|
|
||||||
value = i->second;
|
|
||||||
return true;
|
|
||||||
}
|
|
||||||
|
|
||||||
HtmlParser::HtmlParser()
|
|
||||||
{
|
|
||||||
static const struct ent { const char *n; unsigned int v; } ents[] = {
|
|
||||||
#include "namedentities.h"
|
|
||||||
{ NULL, 0 }
|
|
||||||
};
|
|
||||||
if (named_ents.empty()) {
|
|
||||||
const struct ent *i = ents;
|
|
||||||
while (i->n) {
|
|
||||||
named_ents[string(i->n)] = i->v;
|
|
||||||
++i;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
void
|
|
||||||
HtmlParser::decode_entities(string &s)
|
|
||||||
{
|
|
||||||
// We need a const_iterator version of s.end() - otherwise the
|
|
||||||
// find() and find_if() templates don't work...
|
|
||||||
string::const_iterator amp = s.begin(), s_end = s.end();
|
|
||||||
while ((amp = find(amp, s_end, '&')) != s_end) {
|
|
||||||
unsigned int val = 0;
|
|
||||||
string::const_iterator end, p = amp + 1;
|
|
||||||
if (p != s_end && *p == '#') {
|
|
||||||
p++;
|
|
||||||
if (p != s_end && (*p == 'x' || *p == 'X')) {
|
|
||||||
// hex
|
|
||||||
p++;
|
|
||||||
end = find_if(p, s_end, p_notxdigit);
|
|
||||||
sscanf(s.substr(p - s.begin(), end - p).c_str(), "%x", &val);
|
|
||||||
} else {
|
|
||||||
// number
|
|
||||||
end = find_if(p, s_end, p_notdigit);
|
|
||||||
val = atoi(s.substr(p - s.begin(), end - p).c_str());
|
|
||||||
}
|
|
||||||
} else {
|
|
||||||
end = find_if(p, s_end, p_notalnum);
|
|
||||||
string code = s.substr(p - s.begin(), end - p);
|
|
||||||
map<string, unsigned int>::const_iterator i;
|
|
||||||
i = named_ents.find(code);
|
|
||||||
if (i != named_ents.end()) val = i->second;
|
|
||||||
}
|
|
||||||
if (end < s_end && *end == ';') end++;
|
|
||||||
if (val) {
|
|
||||||
string::size_type amp_pos = amp - s.begin();
|
|
||||||
if (val < 0x80) {
|
|
||||||
s.replace(amp_pos, end - amp, 1u, char(val));
|
|
||||||
} else {
|
|
||||||
// Convert unicode value val to UTF-8.
|
|
||||||
char seq[4];
|
|
||||||
unsigned len = Xapian::Unicode::nonascii_to_utf8(val, seq);
|
|
||||||
s.replace(amp_pos, end - amp, seq, len);
|
|
||||||
}
|
|
||||||
s_end = s.end();
|
|
||||||
// We've modified the string, so the iterators are no longer
|
|
||||||
// valid...
|
|
||||||
amp = s.begin() + amp_pos + 1;
|
|
||||||
} else {
|
|
||||||
amp = end;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
void
|
|
||||||
HtmlParser::parse_html(const string &body)
|
|
||||||
{
|
|
||||||
in_script = false;
|
|
||||||
|
|
||||||
parameters.clear();
|
|
||||||
string::const_iterator start = body.begin();
|
|
||||||
|
|
||||||
while (true) {
|
|
||||||
// Skip through until we find an HTML tag, a comment, or the end of
|
|
||||||
// document. Ignore isolated occurrences of `<' which don't start
|
|
||||||
// a tag or comment.
|
|
||||||
string::const_iterator p = start;
|
|
||||||
while (true) {
|
|
||||||
p = find(p, body.end(), '<');
|
|
||||||
if (p == body.end()) break;
|
|
||||||
unsigned char ch = *(p + 1);
|
|
||||||
|
|
||||||
// Tag, closing tag, or comment (or SGML declaration).
|
|
||||||
if ((!in_script && isalpha(ch)) || ch == '/' || ch == '!') break;
|
|
||||||
|
|
||||||
if (ch == '?') {
|
|
||||||
// PHP code or XML declaration.
|
|
||||||
// XML declaration is only valid at the start of the first line.
|
|
||||||
// FIXME: need to deal with BOMs...
|
|
||||||
if (p != body.begin() || body.size() < 20) break;
|
|
||||||
|
|
||||||
// XML declaration looks something like this:
|
|
||||||
// <?xml version="1.0" encoding="UTF-8"?>
|
|
||||||
if (p[2] != 'x' || p[3] != 'm' || p[4] != 'l') break;
|
|
||||||
if (strchr(" \t\r\n", p[5]) == NULL) break;
|
|
||||||
|
|
||||||
string::const_iterator decl_end = find(p + 6, body.end(), '?');
|
|
||||||
if (decl_end == body.end()) break;
|
|
||||||
|
|
||||||
// Default charset for XML is UTF-8.
|
|
||||||
charset = "UTF-8";
|
|
||||||
|
|
||||||
string decl(p + 6, decl_end);
|
|
||||||
size_t enc = decl.find("encoding");
|
|
||||||
if (enc == string::npos) break;
|
|
||||||
|
|
||||||
enc = decl.find_first_not_of(" \t\r\n", enc + 8);
|
|
||||||
if (enc == string::npos || enc == decl.size()) break;
|
|
||||||
|
|
||||||
if (decl[enc] != '=') break;
|
|
||||||
|
|
||||||
enc = decl.find_first_not_of(" \t\r\n", enc + 1);
|
|
||||||
if (enc == string::npos || enc == decl.size()) break;
|
|
||||||
|
|
||||||
if (decl[enc] != '"' && decl[enc] != '\'') break;
|
|
||||||
|
|
||||||
char quote = decl[enc++];
|
|
||||||
size_t enc_end = decl.find(quote, enc);
|
|
||||||
|
|
||||||
if (enc != string::npos)
|
|
||||||
charset = decl.substr(enc, enc_end - enc);
|
|
||||||
|
|
||||||
break;
|
|
||||||
}
|
|
||||||
p++;
|
|
||||||
}
|
|
||||||
|
|
||||||
// Process text up to start of tag.
|
|
||||||
if (p > start) {
|
|
||||||
string text = body.substr(start - body.begin(), p - start);
|
|
||||||
// convert_to_utf8(text, charset);
|
|
||||||
decode_entities(text);
|
|
||||||
process_text(text);
|
|
||||||
}
|
|
||||||
|
|
||||||
if (p == body.end()) break;
|
|
||||||
|
|
||||||
start = p + 1;
|
|
||||||
|
|
||||||
if (start == body.end()) break;
|
|
||||||
|
|
||||||
if (*start == '!') {
|
|
||||||
if (++start == body.end()) break;
|
|
||||||
if (++start == body.end()) break;
|
|
||||||
// comment or SGML declaration
|
|
||||||
if (*(start - 1) == '-' && *start == '-') {
|
|
||||||
++start;
|
|
||||||
string::const_iterator close = find(start, body.end(), '>');
|
|
||||||
// An unterminated comment swallows rest of document
|
|
||||||
// (like Netscape, but unlike MSIE IIRC)
|
|
||||||
if (close == body.end()) break;
|
|
||||||
|
|
||||||
p = close;
|
|
||||||
// look for -->
|
|
||||||
while (p != body.end() && (*(p - 1) != '-' || *(p - 2) != '-'))
|
|
||||||
p = find(p + 1, body.end(), '>');
|
|
||||||
|
|
||||||
if (p != body.end()) {
|
|
||||||
// Check for htdig's "ignore this bit" comments.
|
|
||||||
if (p - start == 15 && string(start, p - 2) == "htdig_noindex") {
|
|
||||||
string::size_type i;
|
|
||||||
i = body.find("<!--/htdig_noindex-->", p + 1 - body.begin());
|
|
||||||
if (i == string::npos) break;
|
|
||||||
start = body.begin() + i + 21;
|
|
||||||
continue;
|
|
||||||
}
|
|
||||||
// If we found --> skip to there.
|
|
||||||
start = p;
|
|
||||||
} else {
|
|
||||||
// Otherwise skip to the first > we found (as Netscape does).
|
|
||||||
start = close;
|
|
||||||
}
|
|
||||||
} else {
|
|
||||||
// just an SGML declaration, perhaps giving the DTD - ignore it
|
|
||||||
start = find(start - 1, body.end(), '>');
|
|
||||||
if (start == body.end()) break;
|
|
||||||
}
|
|
||||||
++start;
|
|
||||||
} else if (*start == '?') {
|
|
||||||
if (++start == body.end()) break;
|
|
||||||
// PHP - swallow until ?> or EOF
|
|
||||||
start = find(start + 1, body.end(), '>');
|
|
||||||
|
|
||||||
// look for ?>
|
|
||||||
while (start != body.end() && *(start - 1) != '?')
|
|
||||||
start = find(start + 1, body.end(), '>');
|
|
||||||
|
|
||||||
// unterminated PHP swallows rest of document (rather arbitrarily
|
|
||||||
// but it avoids polluting the database when things go wrong)
|
|
||||||
if (start != body.end()) ++start;
|
|
||||||
} else {
|
|
||||||
// opening or closing tag
|
|
||||||
int closing = 0;
|
|
||||||
|
|
||||||
if (*start == '/') {
|
|
||||||
closing = 1;
|
|
||||||
start = find_if(start + 1, body.end(), p_notwhitespace);
|
|
||||||
}
|
|
||||||
|
|
||||||
p = start;
|
|
||||||
start = find_if(start, body.end(), p_nottag);
|
|
||||||
string tag = body.substr(p - body.begin(), start - p);
|
|
||||||
// convert tagname to lowercase
|
|
||||||
lowercase_string(tag);
|
|
||||||
|
|
||||||
if (closing) {
|
|
||||||
closing_tag(tag);
|
|
||||||
if (in_script && tag == "script") in_script = false;
|
|
||||||
|
|
||||||
/* ignore any bogus parameters on closing tags */
|
|
||||||
p = find(start, body.end(), '>');
|
|
||||||
if (p == body.end()) break;
|
|
||||||
start = p + 1;
|
|
||||||
} else {
|
|
||||||
// FIXME: parse parameters lazily.
|
|
||||||
while (start < body.end() && *start != '>') {
|
|
||||||
string name, value;
|
|
||||||
|
|
||||||
p = find_if(start, body.end(), p_whitespaceeqgt);
|
|
||||||
|
|
||||||
name.assign(body, start - body.begin(), p - start);
|
|
||||||
|
|
||||||
p = find_if(p, body.end(), p_notwhitespace);
|
|
||||||
|
|
||||||
start = p;
|
|
||||||
if (start != body.end() && *start == '=') {
|
|
||||||
start = find_if(start + 1, body.end(), p_notwhitespace);
|
|
||||||
|
|
||||||
p = body.end();
|
|
||||||
|
|
||||||
int quote = *start;
|
|
||||||
if (quote == '"' || quote == '\'') {
|
|
||||||
start++;
|
|
||||||
p = find(start, body.end(), quote);
|
|
||||||
}
|
|
||||||
|
|
||||||
if (p == body.end()) {
|
|
||||||
// unquoted or no closing quote
|
|
||||||
p = find_if(start, body.end(), p_whitespacegt);
|
|
||||||
}
|
|
||||||
value.assign(body, start - body.begin(), p - start);
|
|
||||||
start = find_if(p, body.end(), p_notwhitespace);
|
|
||||||
|
|
||||||
if (!name.empty()) {
|
|
||||||
// convert parameter name to lowercase
|
|
||||||
lowercase_string(name);
|
|
||||||
// in case of multiple entries, use the first
|
|
||||||
// (as Netscape does)
|
|
||||||
parameters.insert(make_pair(name, value));
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
#if 0
|
|
||||||
cout << "<" << tag;
|
|
||||||
map<string, string>::const_iterator x;
|
|
||||||
for (x = parameters.begin(); x != parameters.end(); x++) {
|
|
||||||
cout << " " << x->first << "=\"" << x->second << "\"";
|
|
||||||
}
|
|
||||||
cout << ">\n";
|
|
||||||
#endif
|
|
||||||
opening_tag(tag);
|
|
||||||
parameters.clear();
|
|
||||||
|
|
||||||
// In <script> tags we ignore opening tags to avoid problems
|
|
||||||
// with "a<b".
|
|
||||||
if (tag == "script") in_script = true;
|
|
||||||
|
|
||||||
if (start != body.end() && *start == '>') ++start;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
|
@ -1,49 +0,0 @@
|
||||||
/* htmlparse.h: simple HTML parser for omega indexer
|
|
||||||
*
|
|
||||||
* Copyright 1999,2000,2001 BrightStation PLC
|
|
||||||
* Copyright 2002,2006,2008 Olly Betts
|
|
||||||
*
|
|
||||||
* This program is free software; you can redistribute it and/or
|
|
||||||
* modify it under the terms of the GNU General Public License as
|
|
||||||
* published by the Free Software Foundation; either version 2 of the
|
|
||||||
* License, or (at your option) any later version.
|
|
||||||
*
|
|
||||||
* This program is distributed in the hope that it will be useful,
|
|
||||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
|
||||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
|
||||||
* GNU General Public License for more details.
|
|
||||||
*
|
|
||||||
* You should have received a copy of the GNU General Public License
|
|
||||||
* along with this program; if not, write to the Free Software
|
|
||||||
* Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301
|
|
||||||
* USA
|
|
||||||
*/
|
|
||||||
|
|
||||||
#ifndef OMEGA_INCLUDED_HTMLPARSE_H
|
|
||||||
#define OMEGA_INCLUDED_HTMLPARSE_H
|
|
||||||
|
|
||||||
#include <string>
|
|
||||||
#include <map>
|
|
||||||
|
|
||||||
using std::string;
|
|
||||||
using std::map;
|
|
||||||
|
|
||||||
class HtmlParser {
|
|
||||||
map<string, string> parameters;
|
|
||||||
protected:
|
|
||||||
void decode_entities(string &s);
|
|
||||||
bool in_script;
|
|
||||||
string charset;
|
|
||||||
static map<string, unsigned int> named_ents;
|
|
||||||
|
|
||||||
bool get_parameter(const string & param, string & value);
|
|
||||||
public:
|
|
||||||
virtual void process_text(const string &/*text*/) { }
|
|
||||||
virtual void opening_tag(const string &/*tag*/) { }
|
|
||||||
virtual void closing_tag(const string &/*tag*/) { }
|
|
||||||
virtual void parse_html(const string &text);
|
|
||||||
HtmlParser();
|
|
||||||
virtual ~HtmlParser() { }
|
|
||||||
};
|
|
||||||
|
|
||||||
#endif // OMEGA_INCLUDED_HTMLPARSE_H
|
|
|
@ -1,302 +0,0 @@
|
||||||
/* myhtmlparse.cc: subclass of HtmlParser for extracting text.
|
|
||||||
*
|
|
||||||
* Copyright 1999,2000,2001 BrightStation PLC
|
|
||||||
* Copyright 2002,2003,2004,2006,2007,2008 Olly Betts
|
|
||||||
*
|
|
||||||
* This program is free software; you can redistribute it and/or
|
|
||||||
* modify it under the terms of the GNU General Public License as
|
|
||||||
* published by the Free Software Foundation; either version 2 of the
|
|
||||||
* License, or (at your option) any later version.
|
|
||||||
*
|
|
||||||
* This program is distributed in the hope that it will be useful,
|
|
||||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
|
||||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
|
||||||
* GNU General Public License for more details.
|
|
||||||
*
|
|
||||||
* You should have received a copy of the GNU General Public License
|
|
||||||
* along with this program; if not, write to the Free Software
|
|
||||||
* Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301
|
|
||||||
* USA
|
|
||||||
*/
|
|
||||||
|
|
||||||
// #include <config.h>
|
|
||||||
|
|
||||||
#include "myhtmlparse.h"
|
|
||||||
|
|
||||||
// #include "utf8convert.h"
|
|
||||||
|
|
||||||
#include <ctype.h>
|
|
||||||
#include <string.h>
|
|
||||||
|
|
||||||
/// Lower-cases @p str in place, one byte at a time.
inline void
lowercase_string(std::string &str)
{
    for (char &ch : str) {
        // Go through unsigned char: tolower() is undefined for negative values
        // other than EOF, which plain char can hold for bytes >= 0x80.
        ch = tolower(static_cast<unsigned char>(ch));
    }
}
|
|
||||||
|
|
||||||
void
|
|
||||||
MyHtmlParser::parse_html(const string &text, const string &charset_,
|
|
||||||
bool charset_from_meta_)
|
|
||||||
{
|
|
||||||
charset = charset_;
|
|
||||||
charset_from_meta = charset_from_meta_;
|
|
||||||
HtmlParser::parse_html(text);
|
|
||||||
}
|
|
||||||
|
|
||||||
void
|
|
||||||
MyHtmlParser::process_text(const string &text)
|
|
||||||
{
|
|
||||||
if (!text.empty() && !in_script_tag && !in_style_tag) {
|
|
||||||
string::size_type b = text.find_first_not_of(WHITESPACE);
|
|
||||||
if (b) pending_space = true;
|
|
||||||
while (b != string::npos) {
|
|
||||||
if (pending_space && !dump.empty()) dump += ' ';
|
|
||||||
string::size_type e = text.find_first_of(WHITESPACE, b);
|
|
||||||
pending_space = (e != string::npos);
|
|
||||||
if (!pending_space) {
|
|
||||||
dump.append(text.data() + b, text.size() - b);
|
|
||||||
return;
|
|
||||||
}
|
|
||||||
dump.append(text.data() + b, e - b);
|
|
||||||
b = text.find_first_not_of(WHITESPACE, e + 1);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
void
|
|
||||||
MyHtmlParser::opening_tag(const string &tag)
|
|
||||||
{
|
|
||||||
if (tag.empty()) return;
|
|
||||||
switch (tag[0]) {
|
|
||||||
case 'a':
|
|
||||||
if (tag == "address") pending_space = true;
|
|
||||||
break;
|
|
||||||
case 'b':
|
|
||||||
if (tag == "body") {
|
|
||||||
dump.resize(0);
|
|
||||||
break;
|
|
||||||
}
|
|
||||||
if (tag == "blockquote" || tag == "br") pending_space = true;
|
|
||||||
break;
|
|
||||||
case 'c':
|
|
||||||
if (tag == "center") pending_space = true;
|
|
||||||
break;
|
|
||||||
case 'd':
|
|
||||||
if (tag == "dd" || tag == "dir" || tag == "div" || tag == "dl" ||
|
|
||||||
tag == "dt") pending_space = true;
|
|
||||||
break;
|
|
||||||
case 'e':
|
|
||||||
if (tag == "embed") pending_space = true;
|
|
||||||
break;
|
|
||||||
case 'f':
|
|
||||||
if (tag == "fieldset" || tag == "form") pending_space = true;
|
|
||||||
break;
|
|
||||||
case 'h':
|
|
||||||
// hr, and h1, ..., h6
|
|
||||||
if (tag.length() == 2 && strchr("r123456", tag[1]))
|
|
||||||
pending_space = true;
|
|
||||||
break;
|
|
||||||
case 'i':
|
|
||||||
if (tag == "iframe" || tag == "img" || tag == "isindex" ||
|
|
||||||
tag == "input") pending_space = true;
|
|
||||||
break;
|
|
||||||
case 'k':
|
|
||||||
if (tag == "keygen") pending_space = true;
|
|
||||||
break;
|
|
||||||
case 'l':
|
|
||||||
if (tag == "legend" || tag == "li" || tag == "listing")
|
|
||||||
pending_space = true;
|
|
||||||
break;
|
|
||||||
case 'm':
|
|
||||||
if (tag == "meta") {
|
|
||||||
string content;
|
|
||||||
if (get_parameter("content", content)) {
|
|
||||||
string name;
|
|
||||||
if (get_parameter("name", name)) {
|
|
||||||
lowercase_string(name);
|
|
||||||
if (name == "description") {
|
|
||||||
if (sample.empty()) {
|
|
||||||
swap(sample, content);
|
|
||||||
// convert_to_utf8(sample, charset);
|
|
||||||
decode_entities(sample);
|
|
||||||
}
|
|
||||||
} else if (name == "keywords") {
|
|
||||||
if (!keywords.empty()) keywords += ' ';
|
|
||||||
// convert_to_utf8(content, charset);
|
|
||||||
decode_entities(content);
|
|
||||||
keywords += content;
|
|
||||||
} else if (name == "robots") {
|
|
||||||
decode_entities(content);
|
|
||||||
lowercase_string(content);
|
|
||||||
if (content.find("none") != string::npos ||
|
|
||||||
content.find("noindex") != string::npos) {
|
|
||||||
indexing_allowed = false;
|
|
||||||
throw true;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
break;
|
|
||||||
}
|
|
||||||
// If the current charset came from a meta tag, don't
|
|
||||||
// force reparsing again!
|
|
||||||
if (charset_from_meta) break;
|
|
||||||
string hdr;
|
|
||||||
if (get_parameter("http-equiv", hdr)) {
|
|
||||||
lowercase_string(hdr);
|
|
||||||
if (hdr == "content-type") {
|
|
||||||
lowercase_string(content);
|
|
||||||
size_t start = content.find("charset=");
|
|
||||||
if (start == string::npos) break;
|
|
||||||
start += 8;
|
|
||||||
if (start == content.size()) break;
|
|
||||||
size_t end = start;
|
|
||||||
if (content[start] != '"') {
|
|
||||||
while (end < content.size()) {
|
|
||||||
unsigned char ch = content[end];
|
|
||||||
if (ch <= 32 || ch >= 127 ||
|
|
||||||
strchr(";()<>@,:\\\"/[]?={}", ch))
|
|
||||||
break;
|
|
||||||
++end;
|
|
||||||
}
|
|
||||||
} else {
|
|
||||||
++start;
|
|
||||||
++end;
|
|
||||||
while (end < content.size()) {
|
|
||||||
unsigned char ch = content[end];
|
|
||||||
if (ch == '"') break;
|
|
||||||
if (ch == '\\') content.erase(end, 1);
|
|
||||||
++end;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
string newcharset(content, start, end - start);
|
|
||||||
if (charset != newcharset) {
|
|
||||||
throw newcharset;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
break;
|
|
||||||
}
|
|
||||||
if (charset_from_meta) break;
|
|
||||||
string newcharset;
|
|
||||||
if (get_parameter("charset", newcharset)) {
|
|
||||||
// HTML5 added: <meta charset="...">
|
|
||||||
lowercase_string(newcharset);
|
|
||||||
if (charset != newcharset) {
|
|
||||||
throw newcharset;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
break;
|
|
||||||
}
|
|
||||||
if (tag == "marquee" || tag == "menu" || tag == "multicol")
|
|
||||||
pending_space = true;
|
|
||||||
break;
|
|
||||||
case 'o':
|
|
||||||
if (tag == "ol" || tag == "option") pending_space = true;
|
|
||||||
break;
|
|
||||||
case 'p':
|
|
||||||
if (tag == "p" || tag == "pre" || tag == "plaintext")
|
|
||||||
pending_space = true;
|
|
||||||
break;
|
|
||||||
case 'q':
|
|
||||||
if (tag == "q") pending_space = true;
|
|
||||||
break;
|
|
||||||
case 's':
|
|
||||||
if (tag == "style") {
|
|
||||||
in_style_tag = true;
|
|
||||||
break;
|
|
||||||
}
|
|
||||||
if (tag == "script") {
|
|
||||||
in_script_tag = true;
|
|
||||||
break;
|
|
||||||
}
|
|
||||||
if (tag == "select") pending_space = true;
|
|
||||||
break;
|
|
||||||
case 't':
|
|
||||||
if (tag == "table" || tag == "td" || tag == "textarea" ||
|
|
||||||
tag == "th") pending_space = true;
|
|
||||||
break;
|
|
||||||
case 'u':
|
|
||||||
if (tag == "ul") pending_space = true;
|
|
||||||
break;
|
|
||||||
case 'x':
|
|
||||||
if (tag == "xmp") pending_space = true;
|
|
||||||
break;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
void
|
|
||||||
MyHtmlParser::closing_tag(const string &tag)
|
|
||||||
{
|
|
||||||
if (tag.empty()) return;
|
|
||||||
switch (tag[0]) {
|
|
||||||
case 'a':
|
|
||||||
if (tag == "address") pending_space = true;
|
|
||||||
break;
|
|
||||||
case 'b':
|
|
||||||
if (tag == "body") {
|
|
||||||
throw true;
|
|
||||||
}
|
|
||||||
if (tag == "blockquote" || tag == "br") pending_space = true;
|
|
||||||
break;
|
|
||||||
case 'c':
|
|
||||||
if (tag == "center") pending_space = true;
|
|
||||||
break;
|
|
||||||
case 'd':
|
|
||||||
if (tag == "dd" || tag == "dir" || tag == "div" || tag == "dl" ||
|
|
||||||
tag == "dt") pending_space = true;
|
|
||||||
break;
|
|
||||||
case 'f':
|
|
||||||
if (tag == "fieldset" || tag == "form") pending_space = true;
|
|
||||||
break;
|
|
||||||
case 'h':
|
|
||||||
// hr, and h1, ..., h6
|
|
||||||
if (tag.length() == 2 && strchr("r123456", tag[1]))
|
|
||||||
pending_space = true;
|
|
||||||
break;
|
|
||||||
case 'i':
|
|
||||||
if (tag == "iframe") pending_space = true;
|
|
||||||
break;
|
|
||||||
case 'l':
|
|
||||||
if (tag == "legend" || tag == "li" || tag == "listing")
|
|
||||||
pending_space = true;
|
|
||||||
break;
|
|
||||||
case 'm':
|
|
||||||
if (tag == "marquee" || tag == "menu") pending_space = true;
|
|
||||||
break;
|
|
||||||
case 'o':
|
|
||||||
if (tag == "ol" || tag == "option") pending_space = true;
|
|
||||||
break;
|
|
||||||
case 'p':
|
|
||||||
if (tag == "p" || tag == "pre") pending_space = true;
|
|
||||||
break;
|
|
||||||
case 'q':
|
|
||||||
if (tag == "q") pending_space = true;
|
|
||||||
break;
|
|
||||||
case 's':
|
|
||||||
if (tag == "style") {
|
|
||||||
in_style_tag = false;
|
|
||||||
break;
|
|
||||||
}
|
|
||||||
if (tag == "script") {
|
|
||||||
in_script_tag = false;
|
|
||||||
break;
|
|
||||||
}
|
|
||||||
if (tag == "select") pending_space = true;
|
|
||||||
break;
|
|
||||||
case 't':
|
|
||||||
if (tag == "title") {
|
|
||||||
if (title.empty()) swap(title, dump);
|
|
||||||
break;
|
|
||||||
}
|
|
||||||
if (tag == "table" || tag == "td" || tag == "textarea" ||
|
|
||||||
tag == "th") pending_space = true;
|
|
||||||
break;
|
|
||||||
case 'u':
|
|
||||||
if (tag == "ul") pending_space = true;
|
|
||||||
break;
|
|
||||||
case 'x':
|
|
||||||
if (tag == "xmp") pending_space = true;
|
|
||||||
break;
|
|
||||||
}
|
|
||||||
}
|
|
|
@ -1,66 +0,0 @@
|
||||||
/* myhtmlparse.h: subclass of HtmlParser for extracting text
|
|
||||||
*
|
|
||||||
* Copyright 1999,2000,2001 BrightStation PLC
|
|
||||||
* Copyright 2002,2003,2004,2006,2008 Olly Betts
|
|
||||||
*
|
|
||||||
* This program is free software; you can redistribute it and/or
|
|
||||||
* modify it under the terms of the GNU General Public License as
|
|
||||||
* published by the Free Software Foundation; either version 2 of the
|
|
||||||
* License, or (at your option) any later version.
|
|
||||||
*
|
|
||||||
* This program is distributed in the hope that it will be useful,
|
|
||||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
|
||||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
|
||||||
* GNU General Public License for more details.
|
|
||||||
*
|
|
||||||
* You should have received a copy of the GNU General Public License
|
|
||||||
* along with this program; if not, write to the Free Software
|
|
||||||
* Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301
|
|
||||||
* USA
|
|
||||||
*/
|
|
||||||
|
|
||||||
#ifndef OMEGA_INCLUDED_MYHTMLPARSE_H
|
|
||||||
#define OMEGA_INCLUDED_MYHTMLPARSE_H
|
|
||||||
|
|
||||||
#include "htmlparse.h"
|
|
||||||
|
|
||||||
// FIXME: Should we include \xa0 which is non-breaking space in iso-8859-1, but
|
|
||||||
// not in all charsets and perhaps spans of all \xa0 should become a single
|
|
||||||
// \xa0?
|
|
||||||
#define WHITESPACE " \t\n\r"
|
|
||||||
|
|
||||||
class MyHtmlParser : public HtmlParser {
|
|
||||||
public:
|
|
||||||
bool in_script_tag;
|
|
||||||
bool in_style_tag;
|
|
||||||
bool pending_space;
|
|
||||||
bool indexing_allowed;
|
|
||||||
bool charset_from_meta;
|
|
||||||
string title, sample, keywords, dump;
|
|
||||||
void process_text(const string &text);
|
|
||||||
void opening_tag(const string &tag);
|
|
||||||
void closing_tag(const string &tag);
|
|
||||||
using HtmlParser::parse_html;
|
|
||||||
void parse_html(const string &text, const string &charset_,
|
|
||||||
bool charset_from_meta_);
|
|
||||||
MyHtmlParser() :
|
|
||||||
in_script_tag(false),
|
|
||||||
in_style_tag(false),
|
|
||||||
pending_space(false),
|
|
||||||
indexing_allowed(true),
|
|
||||||
charset_from_meta(false) { }
|
|
||||||
|
|
||||||
void reset() {
|
|
||||||
in_script_tag = false;
|
|
||||||
in_style_tag = false;
|
|
||||||
pending_space = false;
|
|
||||||
indexing_allowed = true;
|
|
||||||
charset_from_meta = false;
|
|
||||||
title.resize(0);
|
|
||||||
sample.resize(0);
|
|
||||||
keywords.resize(0);
|
|
||||||
dump.resize(0);
|
|
||||||
}
|
|
||||||
};
|
|
||||||
|
|
||||||
#endif // OMEGA_INCLUDED_MYHTMLPARSE_H
|
|
|
@ -1,279 +0,0 @@
|
||||||
/* namedentities.h: named HTML entities.
|
|
||||||
*
|
|
||||||
* Copyright (C) 2006,2007 Olly Betts
|
|
||||||
*
|
|
||||||
* This program is free software; you can redistribute it and/or modify
|
|
||||||
* it under the terms of the GNU General Public License as published by
|
|
||||||
* the Free Software Foundation; either version 2 of the License, or
|
|
||||||
* (at your option) any later version.
|
|
||||||
*
|
|
||||||
* This program is distributed in the hope that it will be useful,
|
|
||||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
|
||||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
|
||||||
* GNU General Public License for more details.
|
|
||||||
*
|
|
||||||
* You should have received a copy of the GNU General Public License
|
|
||||||
* along with this program; if not, write to the Free Software
|
|
||||||
* Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
|
|
||||||
*/
|
|
||||||
|
|
||||||
#ifndef OMEGA_INCLUDED_NAMEDENTITIES_H
|
|
||||||
#define OMEGA_INCLUDED_NAMEDENTITIES_H
|
|
||||||
|
|
||||||
// Names and values from: "Character entity references in HTML 4"
|
|
||||||
// http://www.w3.org/TR/html4/sgml/entities.html
|
|
||||||
{ "quot", 34 },
|
|
||||||
{ "amp", 38 },
|
|
||||||
{ "apos", 39 }, // Not in HTML 4 list but used in OpenOffice XML.
|
|
||||||
{ "lt", 60 },
|
|
||||||
{ "gt", 62 },
|
|
||||||
{ "nbsp", 160 },
|
|
||||||
{ "iexcl", 161 },
|
|
||||||
{ "cent", 162 },
|
|
||||||
{ "pound", 163 },
|
|
||||||
{ "curren", 164 },
|
|
||||||
{ "yen", 165 },
|
|
||||||
{ "brvbar", 166 },
|
|
||||||
{ "sect", 167 },
|
|
||||||
{ "uml", 168 },
|
|
||||||
{ "copy", 169 },
|
|
||||||
{ "ordf", 170 },
|
|
||||||
{ "laquo", 171 },
|
|
||||||
{ "not", 172 },
|
|
||||||
{ "shy", 173 },
|
|
||||||
{ "reg", 174 },
|
|
||||||
{ "macr", 175 },
|
|
||||||
{ "deg", 176 },
|
|
||||||
{ "plusmn", 177 },
|
|
||||||
{ "sup2", 178 },
|
|
||||||
{ "sup3", 179 },
|
|
||||||
{ "acute", 180 },
|
|
||||||
{ "micro", 181 },
|
|
||||||
{ "para", 182 },
|
|
||||||
{ "middot", 183 },
|
|
||||||
{ "cedil", 184 },
|
|
||||||
{ "sup1", 185 },
|
|
||||||
{ "ordm", 186 },
|
|
||||||
{ "raquo", 187 },
|
|
||||||
{ "frac14", 188 },
|
|
||||||
{ "frac12", 189 },
|
|
||||||
{ "frac34", 190 },
|
|
||||||
{ "iquest", 191 },
|
|
||||||
{ "Agrave", 192 },
|
|
||||||
{ "Aacute", 193 },
|
|
||||||
{ "Acirc", 194 },
|
|
||||||
{ "Atilde", 195 },
|
|
||||||
{ "Auml", 196 },
|
|
||||||
{ "Aring", 197 },
|
|
||||||
{ "AElig", 198 },
|
|
||||||
{ "Ccedil", 199 },
|
|
||||||
{ "Egrave", 200 },
|
|
||||||
{ "Eacute", 201 },
|
|
||||||
{ "Ecirc", 202 },
|
|
||||||
{ "Euml", 203 },
|
|
||||||
{ "Igrave", 204 },
|
|
||||||
{ "Iacute", 205 },
|
|
||||||
{ "Icirc", 206 },
|
|
||||||
{ "Iuml", 207 },
|
|
||||||
{ "ETH", 208 },
|
|
||||||
{ "Ntilde", 209 },
|
|
||||||
{ "Ograve", 210 },
|
|
||||||
{ "Oacute", 211 },
|
|
||||||
{ "Ocirc", 212 },
|
|
||||||
{ "Otilde", 213 },
|
|
||||||
{ "Ouml", 214 },
|
|
||||||
{ "times", 215 },
|
|
||||||
{ "Oslash", 216 },
|
|
||||||
{ "Ugrave", 217 },
|
|
||||||
{ "Uacute", 218 },
|
|
||||||
{ "Ucirc", 219 },
|
|
||||||
{ "Uuml", 220 },
|
|
||||||
{ "Yacute", 221 },
|
|
||||||
{ "THORN", 222 },
|
|
||||||
{ "szlig", 223 },
|
|
||||||
{ "agrave", 224 },
|
|
||||||
{ "aacute", 225 },
|
|
||||||
{ "acirc", 226 },
|
|
||||||
{ "atilde", 227 },
|
|
||||||
{ "auml", 228 },
|
|
||||||
{ "aring", 229 },
|
|
||||||
{ "aelig", 230 },
|
|
||||||
{ "ccedil", 231 },
|
|
||||||
{ "egrave", 232 },
|
|
||||||
{ "eacute", 233 },
|
|
||||||
{ "ecirc", 234 },
|
|
||||||
{ "euml", 235 },
|
|
||||||
{ "igrave", 236 },
|
|
||||||
{ "iacute", 237 },
|
|
||||||
{ "icirc", 238 },
|
|
||||||
{ "iuml", 239 },
|
|
||||||
{ "eth", 240 },
|
|
||||||
{ "ntilde", 241 },
|
|
||||||
{ "ograve", 242 },
|
|
||||||
{ "oacute", 243 },
|
|
||||||
{ "ocirc", 244 },
|
|
||||||
{ "otilde", 245 },
|
|
||||||
{ "ouml", 246 },
|
|
||||||
{ "divide", 247 },
|
|
||||||
{ "oslash", 248 },
|
|
||||||
{ "ugrave", 249 },
|
|
||||||
{ "uacute", 250 },
|
|
||||||
{ "ucirc", 251 },
|
|
||||||
{ "uuml", 252 },
|
|
||||||
{ "yacute", 253 },
|
|
||||||
{ "thorn", 254 },
|
|
||||||
{ "yuml", 255 },
|
|
||||||
{ "OElig", 338 },
|
|
||||||
{ "oelig", 339 },
|
|
||||||
{ "Scaron", 352 },
|
|
||||||
{ "scaron", 353 },
|
|
||||||
{ "Yuml", 376 },
|
|
||||||
{ "fnof", 402 },
|
|
||||||
{ "circ", 710 },
|
|
||||||
{ "tilde", 732 },
|
|
||||||
{ "Alpha", 913 },
|
|
||||||
{ "Beta", 914 },
|
|
||||||
{ "Gamma", 915 },
|
|
||||||
{ "Delta", 916 },
|
|
||||||
{ "Epsilon", 917 },
|
|
||||||
{ "Zeta", 918 },
|
|
||||||
{ "Eta", 919 },
|
|
||||||
{ "Theta", 920 },
|
|
||||||
{ "Iota", 921 },
|
|
||||||
{ "Kappa", 922 },
|
|
||||||
{ "Lambda", 923 },
|
|
||||||
{ "Mu", 924 },
|
|
||||||
{ "Nu", 925 },
|
|
||||||
{ "Xi", 926 },
|
|
||||||
{ "Omicron", 927 },
|
|
||||||
{ "Pi", 928 },
|
|
||||||
{ "Rho", 929 },
|
|
||||||
{ "Sigma", 931 },
|
|
||||||
{ "Tau", 932 },
|
|
||||||
{ "Upsilon", 933 },
|
|
||||||
{ "Phi", 934 },
|
|
||||||
{ "Chi", 935 },
|
|
||||||
{ "Psi", 936 },
|
|
||||||
{ "Omega", 937 },
|
|
||||||
{ "alpha", 945 },
|
|
||||||
{ "beta", 946 },
|
|
||||||
{ "gamma", 947 },
|
|
||||||
{ "delta", 948 },
|
|
||||||
{ "epsilon", 949 },
|
|
||||||
{ "zeta", 950 },
|
|
||||||
{ "eta", 951 },
|
|
||||||
{ "theta", 952 },
|
|
||||||
{ "iota", 953 },
|
|
||||||
{ "kappa", 954 },
|
|
||||||
{ "lambda", 955 },
|
|
||||||
{ "mu", 956 },
|
|
||||||
{ "nu", 957 },
|
|
||||||
{ "xi", 958 },
|
|
||||||
{ "omicron", 959 },
|
|
||||||
{ "pi", 960 },
|
|
||||||
{ "rho", 961 },
|
|
||||||
{ "sigmaf", 962 },
|
|
||||||
{ "sigma", 963 },
|
|
||||||
{ "tau", 964 },
|
|
||||||
{ "upsilon", 965 },
|
|
||||||
{ "phi", 966 },
|
|
||||||
{ "chi", 967 },
|
|
||||||
{ "psi", 968 },
|
|
||||||
{ "omega", 969 },
|
|
||||||
{ "thetasym", 977 },
|
|
||||||
{ "upsih", 978 },
|
|
||||||
{ "piv", 982 },
|
|
||||||
{ "ensp", 8194 },
|
|
||||||
{ "emsp", 8195 },
|
|
||||||
{ "thinsp", 8201 },
|
|
||||||
{ "zwnj", 8204 },
|
|
||||||
{ "zwj", 8205 },
|
|
||||||
{ "lrm", 8206 },
|
|
||||||
{ "rlm", 8207 },
|
|
||||||
{ "ndash", 8211 },
|
|
||||||
{ "mdash", 8212 },
|
|
||||||
{ "lsquo", 8216 },
|
|
||||||
{ "rsquo", 8217 },
|
|
||||||
{ "sbquo", 8218 },
|
|
||||||
{ "ldquo", 8220 },
|
|
||||||
{ "rdquo", 8221 },
|
|
||||||
{ "bdquo", 8222 },
|
|
||||||
{ "dagger", 8224 },
|
|
||||||
{ "Dagger", 8225 },
|
|
||||||
{ "bull", 8226 },
|
|
||||||
{ "hellip", 8230 },
|
|
||||||
{ "permil", 8240 },
|
|
||||||
{ "prime", 8242 },
|
|
||||||
{ "Prime", 8243 },
|
|
||||||
{ "lsaquo", 8249 },
|
|
||||||
{ "rsaquo", 8250 },
|
|
||||||
{ "oline", 8254 },
|
|
||||||
{ "frasl", 8260 },
|
|
||||||
{ "euro", 8364 },
|
|
||||||
{ "image", 8465 },
|
|
||||||
{ "weierp", 8472 },
|
|
||||||
{ "real", 8476 },
|
|
||||||
{ "trade", 8482 },
|
|
||||||
{ "alefsym", 8501 },
|
|
||||||
{ "larr", 8592 },
|
|
||||||
{ "uarr", 8593 },
|
|
||||||
{ "rarr", 8594 },
|
|
||||||
{ "darr", 8595 },
|
|
||||||
{ "harr", 8596 },
|
|
||||||
{ "crarr", 8629 },
|
|
||||||
{ "lArr", 8656 },
|
|
||||||
{ "uArr", 8657 },
|
|
||||||
{ "rArr", 8658 },
|
|
||||||
{ "dArr", 8659 },
|
|
||||||
{ "hArr", 8660 },
|
|
||||||
{ "forall", 8704 },
|
|
||||||
{ "part", 8706 },
|
|
||||||
{ "exist", 8707 },
|
|
||||||
{ "empty", 8709 },
|
|
||||||
{ "nabla", 8711 },
|
|
||||||
{ "isin", 8712 },
|
|
||||||
{ "notin", 8713 },
|
|
||||||
{ "ni", 8715 },
|
|
||||||
{ "prod", 8719 },
|
|
||||||
{ "sum", 8721 },
|
|
||||||
{ "minus", 8722 },
|
|
||||||
{ "lowast", 8727 },
|
|
||||||
{ "radic", 8730 },
|
|
||||||
{ "prop", 8733 },
|
|
||||||
{ "infin", 8734 },
|
|
||||||
{ "ang", 8736 },
|
|
||||||
{ "and", 8743 },
|
|
||||||
{ "or", 8744 },
|
|
||||||
{ "cap", 8745 },
|
|
||||||
{ "cup", 8746 },
|
|
||||||
{ "int", 8747 },
|
|
||||||
{ "there4", 8756 },
|
|
||||||
{ "sim", 8764 },
|
|
||||||
{ "cong", 8773 },
|
|
||||||
{ "asymp", 8776 },
|
|
||||||
{ "ne", 8800 },
|
|
||||||
{ "equiv", 8801 },
|
|
||||||
{ "le", 8804 },
|
|
||||||
{ "ge", 8805 },
|
|
||||||
{ "sub", 8834 },
|
|
||||||
{ "sup", 8835 },
|
|
||||||
{ "nsub", 8836 },
|
|
||||||
{ "sube", 8838 },
|
|
||||||
{ "supe", 8839 },
|
|
||||||
{ "oplus", 8853 },
|
|
||||||
{ "otimes", 8855 },
|
|
||||||
{ "perp", 8869 },
|
|
||||||
{ "sdot", 8901 },
|
|
||||||
{ "lceil", 8968 },
|
|
||||||
{ "rceil", 8969 },
|
|
||||||
{ "lfloor", 8970 },
|
|
||||||
{ "rfloor", 8971 },
|
|
||||||
{ "lang", 9001 },
|
|
||||||
{ "rang", 9002 },
|
|
||||||
{ "loz", 9674 },
|
|
||||||
{ "spades", 9824 },
|
|
||||||
{ "clubs", 9827 },
|
|
||||||
{ "hearts", 9829 },
|
|
||||||
{ "diams", 9830 },
|
|
||||||
|
|
||||||
#endif // OMEGA_INCLUDED_NAMEDENTITIES_H
|
|
|
@ -1,231 +0,0 @@
|
||||||
/*
|
|
||||||
* Copyright 2011 Emmanuel Engelhart <kelson@kiwix.org>
|
|
||||||
*
|
|
||||||
* This program is free software; you can redistribute it and/or modify
|
|
||||||
* it under the terms of the GNU General Public License as published by
|
|
||||||
* the Free Software Foundation; either version 3 of the License, or
|
|
||||||
* any later version.
|
|
||||||
*
|
|
||||||
* This program is distributed in the hope that it will be useful,
|
|
||||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
|
||||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
|
||||||
* GNU General Public License for more details.
|
|
||||||
*
|
|
||||||
* You should have received a copy of the GNU General Public License
|
|
||||||
* along with this program; if not, write to the Free Software
|
|
||||||
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston,
|
|
||||||
* MA 02110-1301, USA.
|
|
||||||
*/
|
|
||||||
|
|
||||||
#include "xapianSearcher.h"
|
|
||||||
#include <sys/types.h>
|
|
||||||
#include <unicode/locid.h>
|
|
||||||
#ifndef _WIN32
|
|
||||||
# include <unistd.h>
|
|
||||||
#endif
|
|
||||||
#include <zim/article.h>
|
|
||||||
#include <zim/error.h>
|
|
||||||
#include <zim/file.h>
|
|
||||||
#include <zim/zim.h>
|
|
||||||
#include "xapian/myhtmlparse.h"
|
|
||||||
|
|
||||||
#include <vector>
|
|
||||||
|
|
||||||
namespace kiwix
|
|
||||||
{
|
|
||||||
std::map<std::string, int> read_valuesmap(const std::string& s)
|
|
||||||
{
|
|
||||||
std::map<std::string, int> result;
|
|
||||||
std::vector<std::string> elems = split(s, ";");
|
|
||||||
for (std::vector<std::string>::iterator elem = elems.begin();
|
|
||||||
elem != elems.end();
|
|
||||||
elem++) {
|
|
||||||
std::vector<std::string> tmp_elems = split(*elem, ":");
|
|
||||||
result.insert(
|
|
||||||
std::pair<std::string, int>(tmp_elems[0], atoi(tmp_elems[1].c_str())));
|
|
||||||
}
|
|
||||||
return result;
|
|
||||||
}
|
|
||||||
|
|
||||||
/* Constructor */
|
|
||||||
/**
 * Builds a searcher over the Xapian index stored at @p xapianDirectoryPath.
 *
 * @param xapianDirectoryPath  location of the index to open.
 * @param reader               reader used to fetch article content for
 *                             results; may be null, in which case results
 *                             return empty content.
 */
XapianSearcher::XapianSearcher(const string& xapianDirectoryPath,
                               Reader* reader)
    : reader(reader)
{
  // The index is opened eagerly, so any failure surfaces at construction.
  openIndex(xapianDirectoryPath);
}
|
|
||||||
|
|
||||||
/* Open Xapian readable database */
|
|
||||||
void XapianSearcher::openIndex(const string& directoryPath)
|
|
||||||
{
|
|
||||||
this->readableDatabase = Xapian::Database(directoryPath);
|
|
||||||
this->valuesmap
|
|
||||||
= read_valuesmap(this->readableDatabase.get_metadata("valuesmap"));
|
|
||||||
this->language = this->readableDatabase.get_metadata("language");
|
|
||||||
this->stopwords = this->readableDatabase.get_metadata("stopwords");
|
|
||||||
setup_queryParser();
|
|
||||||
}
|
|
||||||
|
|
||||||
/* Close the Xapian index (currently a no-op) */
|
|
||||||
void XapianSearcher::closeIndex()
{
  // Intentionally empty: this function performs no work.
}
|
|
||||||
void XapianSearcher::setup_queryParser()
|
|
||||||
{
|
|
||||||
queryParser.set_database(readableDatabase);
|
|
||||||
if (!language.empty()) {
|
|
||||||
/* Build ICU Local object to retrieve ISO-639 language code (from
|
|
||||||
ISO-639-3) */
|
|
||||||
icu::Locale languageLocale(language.c_str());
|
|
||||||
|
|
||||||
/* Configuring language base steemming */
|
|
||||||
try {
|
|
||||||
stemmer = Xapian::Stem(languageLocale.getLanguage());
|
|
||||||
queryParser.set_stemmer(stemmer);
|
|
||||||
queryParser.set_stemming_strategy(Xapian::QueryParser::STEM_ALL);
|
|
||||||
} catch (...) {
|
|
||||||
std::cout << "No steemming for language '" << languageLocale.getLanguage()
|
|
||||||
<< "'" << std::endl;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
if (!stopwords.empty()) {
|
|
||||||
std::string stopWord;
|
|
||||||
std::istringstream file(this->stopwords);
|
|
||||||
while (std::getline(file, stopWord, '\n')) {
|
|
||||||
this->stopper.add(stopWord);
|
|
||||||
}
|
|
||||||
queryParser.set_stopper(&(this->stopper));
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
/* Search strings in the database */
|
|
||||||
/**
 * Runs @p search against the index and stores the matches so they can be
 * walked with getNextResult().
 *
 * @param search       query string, parsed by the configured query parser.
 * @param resultStart  index of the first match to retrieve.
 * @param resultEnd    index one past the last match to retrieve.  A value
 *                     not greater than @p resultStart yields an empty result
 *                     set.
 * @param verbose      unused by this implementation.
 */
void XapianSearcher::searchInIndex(string& search,
                                   const unsigned int resultStart,
                                   const unsigned int resultEnd,
                                   const bool verbose)
{
  /* Create the query */
  Xapian::Query query = queryParser.parse_query(search);

  /* Create the enquire object */
  Xapian::Enquire enquire(this->readableDatabase);
  enquire.set_query(query);

  /* Both bounds are unsigned: subtracting blindly would wrap around to a
     huge count when resultEnd < resultStart, so clamp to zero instead. */
  const unsigned int maxItems
      = resultEnd > resultStart ? resultEnd - resultStart : 0;

  /* Get the results */
  this->results = enquire.get_mset(resultStart, maxItems);
  this->current_result = this->results.begin();
}
|
|
||||||
|
|
||||||
/* Get next result */
|
|
||||||
/**
 * Returns the next match of the last search, or a null pointer once all
 * matches have been handed out.  The returned object is allocated with
 * `new`.
 */
Result* XapianSearcher::getNextResult()
{
  if (current_result == results.end()) {
    return nullptr;
  }

  // The result captures the document at the current position, so build it
  // before stepping the cursor forward.
  Result* next = new XapianResult(this, current_result);
  ++current_result;
  return next;
}
|
|
||||||
|
|
||||||
void XapianSearcher::restart_search()
|
|
||||||
{
|
|
||||||
this->current_result = this->results.begin();
|
|
||||||
}
|
|
||||||
|
|
||||||
/**
 * Wraps the match that @p iterator currently points at.
 *
 * The matched document is fetched immediately through the iterator.
 *
 * @param searcher  searcher that produced the match.
 * @param iterator  position of the match in the searcher's result set.
 */
XapianResult::XapianResult(XapianSearcher* searcher,
                           Xapian::MSetIterator& iterator)
    : searcher(searcher),
      iterator(iterator),
      document(iterator.get_document())
{
}
|
|
||||||
|
|
||||||
std::string XapianResult::get_url()
|
|
||||||
{
|
|
||||||
return document.get_data();
|
|
||||||
}
|
|
||||||
std::string XapianResult::get_title()
|
|
||||||
{
|
|
||||||
if (searcher->valuesmap.empty()) {
|
|
||||||
/* This is the old legacy version. Guess and try */
|
|
||||||
return document.get_value(0);
|
|
||||||
} else if (searcher->valuesmap.find("title") != searcher->valuesmap.end()) {
|
|
||||||
return document.get_value(searcher->valuesmap["title"]);
|
|
||||||
}
|
|
||||||
return "";
|
|
||||||
}
|
|
||||||
|
|
||||||
int XapianResult::get_score()
|
|
||||||
{
|
|
||||||
return iterator.get_percent();
|
|
||||||
}
|
|
||||||
std::string XapianResult::get_snippet()
|
|
||||||
{
|
|
||||||
if (searcher->valuesmap.empty()) {
|
|
||||||
/* This is the old legacy version. Guess and try */
|
|
||||||
std::string stored_snippet = document.get_value(1);
|
|
||||||
if (!stored_snippet.empty()) {
|
|
||||||
return stored_snippet;
|
|
||||||
}
|
|
||||||
/* Let's continue here, and see if we can genenate one */
|
|
||||||
} else if (searcher->valuesmap.find("snippet") != searcher->valuesmap.end()) {
|
|
||||||
return document.get_value(searcher->valuesmap["snippet"]);
|
|
||||||
}
|
|
||||||
/* No reader, no snippet */
|
|
||||||
if (!searcher->reader) {
|
|
||||||
return "";
|
|
||||||
}
|
|
||||||
/* Get the content of the article to generate a snippet.
|
|
||||||
We parse it and use the html dump to avoid remove html tags in the
|
|
||||||
content and be able to nicely cut the text at random place. */
|
|
||||||
MyHtmlParser htmlParser;
|
|
||||||
std::string content = get_content();
|
|
||||||
if (content.empty()) {
|
|
||||||
return content;
|
|
||||||
}
|
|
||||||
try {
|
|
||||||
htmlParser.parse_html(content, "UTF-8", true);
|
|
||||||
} catch (...) {
|
|
||||||
}
|
|
||||||
return searcher->results.snippet(htmlParser.dump, 500);
|
|
||||||
}
|
|
||||||
|
|
||||||
std::string XapianResult::get_content()
|
|
||||||
{
|
|
||||||
if (!searcher->reader) {
|
|
||||||
return "";
|
|
||||||
}
|
|
||||||
auto entry = searcher->reader->getEntryFromEncodedPath(get_url());
|
|
||||||
return entry.getContent();
|
|
||||||
}
|
|
||||||
|
|
||||||
int XapianResult::get_size()
|
|
||||||
{
|
|
||||||
if (searcher->valuesmap.empty()) {
|
|
||||||
/* This is the old legacy version. Guess and try */
|
|
||||||
return document.get_value(2).empty() == true
|
|
||||||
? -1
|
|
||||||
: atoi(document.get_value(2).c_str());
|
|
||||||
} else if (searcher->valuesmap.find("size") != searcher->valuesmap.end()) {
|
|
||||||
return atoi(document.get_value(searcher->valuesmap["size"]).c_str());
|
|
||||||
}
|
|
||||||
/* The size is never used. Do we really want to get the content and
|
|
||||||
calculate the size ? */
|
|
||||||
return -1;
|
|
||||||
}
|
|
||||||
|
|
||||||
int XapianResult::get_wordCount()
|
|
||||||
{
|
|
||||||
if (searcher->valuesmap.empty()) {
|
|
||||||
/* This is the old legacy version. Guess and try */
|
|
||||||
return document.get_value(3).empty() == true
|
|
||||||
? -1
|
|
||||||
: atoi(document.get_value(3).c_str());
|
|
||||||
} else if (searcher->valuesmap.find("wordcount")
|
|
||||||
!= searcher->valuesmap.end()) {
|
|
||||||
return atoi(document.get_value(searcher->valuesmap["wordcount"]).c_str());
|
|
||||||
}
|
|
||||||
return -1;
|
|
||||||
}
|
|
||||||
|
|
||||||
} // Kiwix namespace
|
|
Loading…
Reference in New Issue